33 



   31 


    6 






   34 



































    4 






    4 











    2 



























    2 
    2 








































































    1 





    1 





























    1 








    1 





    1 


    1 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
// SPDX-License-Identifier: GPL-2.0
/*
 * Fast batching percpu counters.
 */

#include <linux/percpu_counter.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/debugobjects.h>

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Every initialised counter is linked on this list so that the CPU-dead
 * callback can fold a departed CPU's delta back into each counter.
 */
static LIST_HEAD(percpu_counters);
/* Serialises additions to, removals from and walks of percpu_counters. */
static DEFINE_SPINLOCK(percpu_counters_lock);
#endif

#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER

/* Forward declaration: the fixup callback below needs the descriptor. */
static const struct debug_obj_descr percpu_counter_debug_descr;

/*
 * debugobjects fixup invoked when a tracked percpu_counter is being freed.
 *
 * Only an object that is still active can be repaired: it is destroyed and
 * its debug-object tracking is dropped, and true is returned to report that
 * a fixup took place.  Any other state is left alone and false is returned.
 */
static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
        struct percpu_counter *counter = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        percpu_counter_destroy(counter);
        debug_object_free(counter, &percpu_counter_debug_descr);
        return true;
}

/*
 * debugobjects descriptor for percpu counters: names the object type and
 * installs the fixup that runs when an object is freed.
 */
static const struct debug_obj_descr percpu_counter_debug_descr = {
        .name                = "percpu_counter",
        .fixup_free        = percpu_counter_fixup_free,
};

/*
 * Register @fbc with debugobjects and mark it active.  Called once a counter
 * has been fully initialised.
 */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{
        debug_object_init(fbc, &percpu_counter_debug_descr);
        debug_object_activate(fbc, &percpu_counter_debug_descr);
}

/*
 * Mark @fbc inactive and drop its debugobjects tracking.  Called when a
 * counter is being destroyed.
 */
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{
        debug_object_deactivate(fbc, &percpu_counter_debug_descr);
        debug_object_free(fbc, &percpu_counter_debug_descr);
}

#else        /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
/*
 * Empty stand-ins used when CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER is not set,
 * so callers need no #ifdefs of their own.
 */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{ }
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{ }
#endif        /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */

/*
 * Force the counter to an exact value.
 *
 * Under the counter lock (interrupts disabled) every possible CPU's local
 * delta is zeroed and the central count is set to @amount, so that the
 * central count alone carries the new value.
 */
void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
        unsigned long irqflags;
        int cpu;

        raw_spin_lock_irqsave(&fbc->lock, irqflags);
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(fbc->counters, cpu) = 0;
        fbc->count = amount;
        raw_spin_unlock_irqrestore(&fbc->lock, irqflags);
}
EXPORT_SYMBOL(percpu_counter_set);

/*
 * Add @amount to the counter, spilling into the central count once the
 * local delta would reach @batch in magnitude.
 *
 * Interrupts are disabled around the whole operation.  Taken separately the
 * two paths would already be irq safe:
 * - the slow path is protected by an irq-safe spinlock;
 * - this_cpu_add() is irq-safe by definition.
 * However, choosing between the fast and the slow path and then performing
 * the update has to be atomic as well.  Otherwise process context could
 * inspect the current value, pick the fast path, and then be interrupted
 * before this_cpu_add(); if that interrupt updates this_cpu(*fbc->counters),
 * the delayed this_cpu_add() could push the local value beyond "batch" or
 * even make it overflow.
 */
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        unsigned long irqflags;
        s64 pending;

        local_irq_save(irqflags);
        pending = __this_cpu_read(*fbc->counters) + amount;
        if (abs(pending) < batch) {
                /* Fast path: the local delta stays below the batch size. */
                this_cpu_add(*fbc->counters, amount);
        } else {
                /*
                 * Slow path: move the whole local delta plus @amount into
                 * the central count and clear the local delta.
                 */
                raw_spin_lock(&fbc->lock);
                fbc->count += pending;
                __this_cpu_sub(*fbc->counters, pending - amount);
                raw_spin_unlock(&fbc->lock);
        }
        local_irq_restore(irqflags);
}
EXPORT_SYMBOL(percpu_counter_add_batch);

/*
 * For a percpu_counter with a big batch, the deviation of its count can be
 * large.  Sometimes that deviation has to be reduced, for instance when the
 * counter's batch is decreased at runtime to obtain better accuracy.  This
 * is achieved by running this sync function on each CPU: it folds the
 * calling CPU's local delta into the central count.
 */
void percpu_counter_sync(struct percpu_counter *fbc)
{
        unsigned long irqflags;
        s64 local;

        raw_spin_lock_irqsave(&fbc->lock, irqflags);
        local = __this_cpu_read(*fbc->counters);
        __this_cpu_sub(*fbc->counters, local);
        fbc->count += local;
        raw_spin_unlock_irqrestore(&fbc->lock, irqflags);
}
EXPORT_SYMBOL(percpu_counter_sync);

/*
 * Add up all the per-cpu counts, return the result.  This is a more accurate
 * but much slower version of percpu_counter_read_positive().
 *
 * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums
 * from CPUs that are in the process of being taken offline. Dying cpus have
 * been removed from the online mask, but may not have had the hotplug dead
 * notifier called to fold the percpu count back into the global counter sum.
 * By including dying CPUs in the iteration mask, we avoid this race condition
 * so __percpu_counter_sum() just does the right thing when CPUs are being taken
 * offline.
 */
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
        s64 ret;
        int cpu;
        unsigned long flags;

        raw_spin_lock_irqsave(&fbc->lock, flags);
        ret = fbc->count;
        for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
                s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
                ret += *pcount;
        }
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
        return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);

/*
 * Initialise an array of @nr_counters counters, each starting at @amount.
 *
 * All counters share a single per-cpu allocation; counter i uses the slice at
 * offset i * counter_size.  @gfp is passed to the per-cpu allocator and @key
 * is the lockdep class assigned to every counter lock.
 *
 * Returns 0 on success.  On allocation failure returns -ENOMEM with
 * fbc[0].counters set to NULL.
 */
int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
                               gfp_t gfp, u32 nr_counters,
                               struct lock_class_key *key)
{
        unsigned long flags __maybe_unused;
        size_t counter_size;
        s32 __percpu *counters;
        u32 i;

        /* One suitably aligned s32 slot per counter within the allocation. */
        counter_size = ALIGN(sizeof(*counters), __alignof__(*counters));
        counters = __alloc_percpu_gfp(nr_counters * counter_size,
                                      __alignof__(*counters), gfp);
        if (!counters) {
                /* percpu_counter_destroy_many() treats this as "nothing to do". */
                fbc[0].counters = NULL;
                return -ENOMEM;
        }

        for (i = 0; i < nr_counters; i++) {
                raw_spin_lock_init(&fbc[i].lock);
                lockdep_set_class(&fbc[i].lock, key);
#ifdef CONFIG_HOTPLUG_CPU
                INIT_LIST_HEAD(&fbc[i].list);
#endif
                fbc[i].count = amount;
                /* Byte offset into the shared per-cpu allocation. */
                fbc[i].counters = (void *)counters + (i * counter_size);

                debug_percpu_counter_activate(&fbc[i]);
        }

#ifdef CONFIG_HOTPLUG_CPU
        /* Make the counters visible to the CPU-dead callback. */
        spin_lock_irqsave(&percpu_counters_lock, flags);
        for (i = 0; i < nr_counters; i++)
                list_add(&fbc[i].list, &percpu_counters);
        spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif
        return 0;
}
EXPORT_SYMBOL(__percpu_counter_init_many);

/*
 * Tear down an array of @nr_counters counters that share one per-cpu
 * allocation.
 *
 * A NULL @fbc triggers a one-time warning.  If the first counter has no
 * per-cpu storage the call is a no-op.  Otherwise debug tracking is dropped,
 * the counters are unlinked from the hotplug list, the shared allocation
 * (owned by fbc[0]) is freed and every counters pointer is cleared.
 */
void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters)
{
        unsigned long flags __maybe_unused;
        u32 idx;

        if (WARN_ON_ONCE(!fbc))
                return;

        if (fbc->counters == NULL)
                return;

        for (idx = 0; idx < nr_counters; idx++)
                debug_percpu_counter_deactivate(fbc + idx);

#ifdef CONFIG_HOTPLUG_CPU
        spin_lock_irqsave(&percpu_counters_lock, flags);
        for (idx = 0; idx < nr_counters; idx++)
                list_del(&fbc[idx].list);
        spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif

        /* The first counter points at the start of the shared allocation. */
        free_percpu(fbc->counters);

        for (idx = 0; idx < nr_counters; idx++)
                fbc[idx].counters = NULL;
}
EXPORT_SYMBOL(percpu_counter_destroy_many);

/*
 * Global batch size.  Starts at 32 and is recomputed by
 * compute_batch_value() as max(32, 2 * number of online CPUs).
 */
int percpu_counter_batch __read_mostly = 32;
EXPORT_SYMBOL(percpu_counter_batch);

/*
 * Recompute percpu_counter_batch from the current number of online CPUs:
 * twice that number, but never less than 32.  @cpu is not used.  Always
 * returns 0.
 */
static int compute_batch_value(unsigned int cpu)
{
        int online = num_online_cpus();

        percpu_counter_batch = max(32, online * 2);
        return 0;
}

/*
 * CPU hotplug "dead" callback.
 *
 * With CONFIG_HOTPLUG_CPU the batch size is recomputed and, for every
 * registered counter, the delta left behind on @cpu is folded into the
 * central count and then cleared.  Without it the function does nothing.
 * Always returns 0.
 */
static int percpu_counter_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
        struct percpu_counter *counter;

        compute_batch_value(cpu);

        spin_lock_irq(&percpu_counters_lock);
        list_for_each_entry(counter, &percpu_counters, list) {
                s32 *dead_delta;

                raw_spin_lock(&counter->lock);
                dead_delta = per_cpu_ptr(counter->counters, cpu);
                counter->count += *dead_delta;
                *dead_delta = 0;
                raw_spin_unlock(&counter->lock);
        }
        spin_unlock_irq(&percpu_counters_lock);
#endif
        return 0;
}

/*
 * Compare counter against given value.
 * Return 1 if greater, 0 if equal and -1 if less
 *
 * The approximate count from percpu_counter_read() is used when it differs
 * from @rhs by more than the maximum possible error, batch * number of
 * online CPUs; otherwise the precise sum is computed.
 */
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
        s64        count;

        count = percpu_counter_read(fbc);
        /*
         * Check to see if rough count will be sufficient for comparison.
         * The error bound is computed in 64 bits: evaluated in 32-bit
         * arithmetic, batch * num_online_cpus() can wrap for large batch
         * values and yield a bound that is far too small, which would let
         * the rough count decide comparisons it cannot decide reliably.
         */
        if (abs(count - rhs) > ((s64)batch * num_online_cpus())) {
                if (count > rhs)
                        return 1;
                else
                        return -1;
        }
        /* Need to use precise count */
        count = percpu_counter_sum(fbc);
        if (count > rhs)
                return 1;
        else if (count < rhs)
                return -1;
        else
                return 0;
}
EXPORT_SYMBOL(__percpu_counter_compare);

/*
 * Compare counter, and add amount if total is: less than or equal to limit if
 * amount is positive, or greater than or equal to limit if amount is negative.
 * Return true if amount is added, or false if total would be beyond the limit.
 *
 * Negative limit is allowed, but unusual.
 * When negative amounts (subs) are given to percpu_counter_limited_add(),
 * the limit would most naturally be 0 - but other limits are also allowed.
 *
 * Overflow beyond S64_MAX is not allowed for: counter, limit and amount
 * are all assumed to be sane (far from S64_MIN and S64_MAX).
 *
 * An @amount of zero always succeeds without touching the counter.
 */
bool __percpu_counter_limited_add(struct percpu_counter *fbc,
                                  s64 limit, s64 amount, s32 batch)
{
        s64 count;
        s64 unknown;
        unsigned long flags;
        bool good = false;

        if (amount == 0)
                return true;

        local_irq_save(flags);
        /*
         * Upper bound on what the per-cpu deltas may hide.  Widen before
         * multiplying: a 32-bit batch * num_online_cpus() can wrap for
         * large batches and understate the uncertainty.
         */
        unknown = (s64)batch * num_online_cpus();
        count = __this_cpu_read(*fbc->counters);

        /* Skip taking the lock when safe */
        if (abs(count + amount) <= batch &&
            ((amount > 0 && fbc->count + unknown <= limit) ||
             (amount < 0 && fbc->count - unknown >= limit))) {
                this_cpu_add(*fbc->counters, amount);
                local_irq_restore(flags);
                return true;
        }

        raw_spin_lock(&fbc->lock);
        count = fbc->count + amount;

        /* Skip percpu_counter_sum() when safe */
        if (amount > 0) {
                /* Even the most favourable hidden deltas exceed the limit. */
                if (count - unknown > limit)
                        goto out;
                /* Even the least favourable hidden deltas stay within it. */
                if (count + unknown <= limit)
                        good = true;
        } else {
                if (count + unknown < limit)
                        goto out;
                if (count - unknown >= limit)
                        good = true;
        }

        if (!good) {
                s32 *pcount;
                int cpu;

                /* Inconclusive: fall back to the exact sum. */
                for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
                        pcount = per_cpu_ptr(fbc->counters, cpu);
                        count += *pcount;
                }
                if (amount > 0) {
                        if (count > limit)
                                goto out;
                } else {
                        if (count < limit)
                                goto out;
                }
                good = true;
        }

        /* Fold this CPU's delta together with @amount into the central count. */
        count = __this_cpu_read(*fbc->counters);
        fbc->count += count + amount;
        __this_cpu_sub(*fbc->counters, count);
out:
        raw_spin_unlock(&fbc->lock);
        local_irq_restore(flags);
        return good;
}

/*
 * Boot-time setup: register the CPU hotplug callbacks used by this file.
 *
 * compute_batch_value() is installed as the online callback of a dynamic
 * state, and percpu_counter_cpu_dead() as the teardown callback of
 * CPUHP_PERCPU_CNT_DEAD.  Registration failures only trigger a warning;
 * the function always returns 0.
 */
static int __init percpu_counter_startup(void)
{
        int ret;

        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online",
                                compute_batch_value, NULL);
        WARN_ON(ret < 0);
        ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD,
                                        "lib/percpu_cnt:dead", NULL,
                                        percpu_counter_cpu_dead);
        WARN_ON(ret < 0);
        return 0;
}
module_init(percpu_counter_startup);













































































































































































































































































































































    1 





    1 











































































































































































































































































































































    1 




    1 







    1 



























    1 








    1 











    1 



    1 













    1 


    1 






    1 







    1 

    1 

    1 










































    1 




    1 



    1 

















































































































































    1 




















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright (C) 2011 Novell Inc.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"

/* One cached directory entry of a merged overlay directory. */
struct ovl_cache_entry {
        /* Length of @name, not counting the terminating NUL. */
        unsigned int len;
        /* d_type reported by the real directory. */
        unsigned int type;
        /* Inode number reported by the real directory. */
        u64 real_ino;
        /* Inode number to report; 0 when it is to be recalculated later. */
        u64 ino;
        /* Linkage on the cache's entry list. */
        struct list_head l_node;
        /* Linkage in the name-sorted rbtree used for duplicate detection. */
        struct rb_node node;
        /* Next DT_CHR entry still to be checked for being a whiteout. */
        struct ovl_cache_entry *next_maybe_whiteout;
        /* Entry was read from the upper layer. */
        bool is_upper;
        /* Entry turned out to be a whiteout. */
        bool is_whiteout;
        /* Regular file in an xwhiteouts dir: xattr check is deferred. */
        bool check_xwhiteout;
        /* NUL-terminated entry name. */
        char name[];
};

/* Cached, merged listing of one overlay directory. */
struct ovl_dir_cache {
        /* Number of users; the cache is freed when this drops to zero. */
        long refcount;
        /* Inode version the cache was built for; a mismatch means stale. */
        u64 version;
        /* List of struct ovl_cache_entry, linked through l_node. */
        struct list_head entries;
        /* Root of the rbtree indexing the entries by name. */
        struct rb_root root;
};

/* State shared between a directory read and its dir_context actor. */
struct ovl_readdir_data {
        /* Embedded context; the actor recovers this struct via container_of. */
        struct dir_context ctx;
        /* Overlay dentry being listed; NULL when not doing ovl_iter(). */
        struct dentry *dentry;
        /* Currently reading the lowest layer. */
        bool is_lowest;
        /* rbtree used to detect names already seen in higher layers. */
        struct rb_root *root;
        /* List that new entries are appended to. */
        struct list_head *list;
        /* Marker in @list before which lowest-layer entries are inserted. */
        struct list_head middle;
        /* Head of the chain of DT_CHR entries that may be whiteouts. */
        struct ovl_cache_entry *first_maybe_whiteout;
        /* Entries seen during the current iterate_dir() call. */
        int count;
        /* Error recorded by the actor, 0 if none. */
        int err;
        /* Currently reading the upper layer. */
        bool is_upper;
        /* NOTE(review): not referenced in the visible code; presumably set by a d_type probe - confirm. */
        bool d_type_supported;
        /* Layer and directory both have xwhiteouts. */
        bool in_xwhiteouts_dir;
};

/* Per-open-file state of an overlay directory (file->private_data). */
struct ovl_dir_file {
        /* Last known result of ovl_dir_is_real() for the directory. */
        bool is_real;
        /* NOTE(review): presumably whether realfile is on the upper layer - confirm. */
        bool is_upper;
        /* Referenced merged-directory cache, or NULL. */
        struct ovl_dir_cache *cache;
        /* Current position within cache->entries, or NULL. */
        struct list_head *cursor;
        /* NOTE(review): presumably the opened real directory - confirm against users. */
        struct file *realfile;
        /* NOTE(review): presumably the upper directory opened after copy-up - confirm. */
        struct file *upperfile;
};

/* Map an rbtree node back to the cache entry that embeds it. */
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
        return rb_entry(n, struct ovl_cache_entry, node);
}

/*
 * Walk the rbtree looking for @name (of length @len).
 *
 * On entry *@link points at the root link.  On return *@link is either the
 * link holding the matching node (true is returned) or the empty link where
 * a new node for @name belongs (false is returned); *@parent is the last
 * node visited and is left untouched when the tree is empty.
 */
static bool ovl_cache_entry_find_link(const char *name, int len,
                                      struct rb_node ***link,
                                      struct rb_node **parent)
{
        struct rb_node **slot = *link;
        bool found = false;

        while (*slot) {
                struct ovl_cache_entry *entry = ovl_cache_entry_from_node(*slot);
                int cmp = strncmp(name, entry->name, len);

                *parent = *slot;
                if (cmp > 0) {
                        slot = &entry->node.rb_right;
                } else if (cmp < 0 || len < entry->len) {
                        /* A proper prefix sorts before the longer name. */
                        slot = &entry->node.rb_left;
                } else {
                        found = true;
                        break;
                }
        }
        *link = slot;

        return found;
}

/*
 * Look up @name (of length @len) in the rbtree rooted at @root.
 * Returns the matching cache entry, or NULL if there is none.
 */
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
                                                    const char *name, int len)
{
        struct rb_node *cur = root->rb_node;

        while (cur) {
                struct ovl_cache_entry *entry = ovl_cache_entry_from_node(cur);
                int cmp = strncmp(name, entry->name, len);

                if (cmp > 0) {
                        cur = entry->node.rb_right;
                        continue;
                }
                /* Equal so far but @name is shorter: it sorts first. */
                if (cmp < 0 || len < entry->len) {
                        cur = entry->node.rb_left;
                        continue;
                }
                return entry;
        }

        return NULL;
}

/*
 * Decide whether the d_ino of entry @p has to be recalculated instead of
 * taken from the real directory.  Returns true if so.
 */
static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd,
                           struct ovl_cache_entry *p)
{
        bool is_dot;

        /* Not doing ovl_iter(): nobody cares */
        if (!rdd->dentry)
                return false;

        /* Remapping lower inode numbers: always recalc */
        if (ovl_xino_bits(OVL_FS(rdd->dentry->d_sb)))
                return true;

        /* The parent entry is always recalculated */
        if (!strcmp(p->name, ".."))
                return true;

        /* For a lower dir the native d_ino will do */
        if (!rdd->is_upper)
                return false;

        /*
         * In an upper dir, recalc for '.' and, if the dir is impure
         * (contains copied up entries), for every entry.
         */
        is_dot = p->len == 1 && p->name[0] == '.';

        return is_dot || ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry));
}

/*
 * Allocate and fill a cache entry for @name (of length @len).
 *
 * The entry records @d_type and @ino.  Its reported inode number is zeroed
 * when it must be recalculated later.  DT_CHR entries are pushed onto the
 * maybe-whiteout chain in @rdd.  Returns NULL on allocation failure.
 */
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
                                                   const char *name, int len,
                                                   u64 ino, unsigned int d_type)
{
        struct ovl_cache_entry *entry;

        /* Room for the struct plus the name and its terminating NUL. */
        entry = kmalloc(offsetof(struct ovl_cache_entry, name[len + 1]),
                        GFP_KERNEL);
        if (!entry)
                return NULL;

        entry->len = len;
        memcpy(entry->name, name, len);
        entry->name[len] = '\0';
        entry->type = d_type;
        entry->real_ino = ino;
        entry->is_upper = rdd->is_upper;
        entry->is_whiteout = false;
        /* The check for overlay.whiteout is deferred to ovl_iterate() */
        entry->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG;
        /* Setting d_ino for an upper entry is deferred to ovl_iterate() */
        entry->ino = ovl_calc_d_ino(rdd, entry) ? 0 : ino;

        if (d_type == DT_CHR) {
                entry->next_maybe_whiteout = rdd->first_maybe_whiteout;
                rdd->first_maybe_whiteout = entry;
        }
        return entry;
}

/*
 * Add @name to the rbtree and to the entry list unless it is already in the
 * tree.
 *
 * Returns true when the name is already known or was added.  Returns false
 * after recording -ENOMEM in rdd->err when allocation fails.
 */
static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
                                  const char *name, int len, u64 ino,
                                  unsigned int d_type)
{
        struct rb_node **slot = &rdd->root->rb_node;
        struct rb_node *parent = NULL;
        struct ovl_cache_entry *entry;

        /* Name already present: nothing to add. */
        if (ovl_cache_entry_find_link(name, len, &slot, &parent))
                return true;

        entry = ovl_cache_entry_new(rdd, name, len, ino, d_type);
        if (!entry) {
                rdd->err = -ENOMEM;
                return false;
        }

        rb_link_node(&entry->node, parent, slot);
        rb_insert_color(&entry->node, rdd->root);
        list_add_tail(&entry->l_node, rdd->list);

        return true;
}

/*
 * Actor helper for the lowest layer.
 *
 * A name already present in the rbtree has its entry moved in front of the
 * "middle" marker.  Any other name gets a new entry queued there, without
 * being added to the rbtree.  Returns true as long as no error is recorded
 * in rdd->err.
 */
static bool ovl_fill_lowest(struct ovl_readdir_data *rdd,
                           const char *name, int namelen,
                           loff_t offset, u64 ino, unsigned int d_type)
{
        struct ovl_cache_entry *entry;

        entry = ovl_cache_entry_find(rdd->root, name, namelen);
        if (entry) {
                list_move_tail(&entry->l_node, &rdd->middle);
                return rdd->err == 0;
        }

        entry = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
        if (entry)
                list_add_tail(&entry->l_node, &rdd->middle);
        else
                rdd->err = -ENOMEM;

        return rdd->err == 0;
}

/* Free every cache entry on @list and leave @list empty and reusable. */
void ovl_cache_free(struct list_head *list)
{
        struct ovl_cache_entry *entry, *next;

        list_for_each_entry_safe(entry, next, list, l_node)
                kfree(entry);

        INIT_LIST_HEAD(list);
}

/* Free the directory cache attached to @inode, if there is one. */
void ovl_dir_cache_free(struct inode *inode)
{
        struct ovl_dir_cache *cache = ovl_dir_cache(inode);

        if (!cache)
                return;

        ovl_cache_free(&cache->entries);
        kfree(cache);
}

/*
 * Drop the reference that @od holds on its cache.
 *
 * When the last reference goes away the cache is detached from @inode (if
 * it is still the inode's current cache) and freed.  od->cache itself is not
 * cleared here.
 */
static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode)
{
        struct ovl_dir_cache *cache = od->cache;

        WARN_ON(cache->refcount <= 0);
        if (--cache->refcount)
                return;

        if (ovl_dir_cache(inode) == cache)
                ovl_set_dir_cache(inode, NULL);

        ovl_cache_free(&cache->entries);
        kfree(cache);
}

/*
 * dir_context actor used while building a merged directory listing.
 *
 * Counts the entry, then hands it to the lowest-layer helper or to the
 * rbtree insertion helper, depending on which layer is being read.
 */
static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
                          int namelen, loff_t offset, u64 ino,
                          unsigned int d_type)
{
        struct ovl_readdir_data *rdd;

        rdd = container_of(ctx, struct ovl_readdir_data, ctx);
        rdd->count++;

        if (rdd->is_lowest)
                return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);

        return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
}

static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
{
        int err;
        struct ovl_cache_entry *p;
        struct dentry *dentry, *dir = path->dentry;
        const struct cred *old_cred;

        old_cred = ovl_override_creds(rdd->dentry->d_sb);

        err = down_write_killable(&dir->d_inode->i_rwsem);
        if (!err) {
                while (rdd->first_maybe_whiteout) {
                        p = rdd->first_maybe_whiteout;
                        rdd->first_maybe_whiteout = p->next_maybe_whiteout;
                        dentry = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len);
                        if (!IS_ERR(dentry)) {
                                p->is_whiteout = ovl_is_whiteout(dentry);
                                dput(dentry);
                        }
                }
                inode_unlock(dir->d_inode);
        }
        revert_creds(old_cred);

        return err;
}

/*
 * Read the whole real directory at @realpath through the actor in @rdd.
 *
 * iterate_dir() is repeated until a pass reports no entries or an error
 * shows up, either from iterate_dir() itself or recorded by the actor.
 * Afterwards, if an overlay dentry is set, queued whiteout candidates are
 * resolved.  Returns 0 or a negative error.
 */
static inline int ovl_dir_read(const struct path *realpath,
                               struct ovl_readdir_data *rdd)
{
        struct file *realfile;
        int err;

        realfile = ovl_path_open(realpath, O_RDONLY | O_LARGEFILE);
        if (IS_ERR(realfile))
                return PTR_ERR(realfile);

        rdd->first_maybe_whiteout = NULL;
        rdd->ctx.pos = 0;
        for (;;) {
                rdd->count = 0;
                rdd->err = 0;
                err = iterate_dir(realfile, &rdd->ctx);
                if (err >= 0)
                        err = rdd->err;
                /* Stop on error or once a pass produced nothing. */
                if (err || !rdd->count)
                        break;
        }

        if (!err && rdd->first_maybe_whiteout && rdd->dentry)
                err = ovl_check_whiteouts(realpath, rdd);

        fput(realfile);

        return err;
}

/*
 * Bring the per-file directory state back in line with the inode.
 *
 * A cache built for a different inode version is released and the cursor is
 * forgotten.  The is_real flag is then refreshed; it may only ever change
 * from true to false.
 */
static void ovl_dir_reset(struct file *file)
{
        struct ovl_dir_file *od = file->private_data;
        struct inode *inode = file_inode(file);
        struct ovl_dir_cache *cache = od->cache;
        bool is_real;

        if (cache && ovl_inode_version_get(inode) != cache->version) {
                ovl_cache_put(od, inode);
                od->cache = NULL;
                od->cursor = NULL;
        }

        is_real = ovl_dir_is_real(inode);
        if (od->is_real == is_real)
                return;

        /* is_real can only become false when dir is copied up */
        if (WARN_ON(is_real))
                return;

        od->is_real = false;
}

/*
 * Build the merged listing of @dentry into @list, indexed by @root.
 *
 * The layers of @dentry are visited in the order given by ovl_path_next().
 * All but the last are read with duplicate detection through the rbtree;
 * the last (lowest) one has its entries spliced in at the front of the
 * list.  Returns 0 or the first error encountered.
 */
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
        struct rb_root *root)
{
        struct ovl_readdir_data rdd = {
                .ctx.actor = ovl_fill_merge,
                .dentry = dentry,
                .list = list,
                .root = root,
                .is_lowest = false,
        };
        const struct ovl_layer *layer;
        struct path realpath;
        int idx, next;
        int err;

        for (idx = 0; idx != -1; idx = next) {
                next = ovl_path_next(idx, dentry, &realpath, &layer);
                rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry;
                rdd.in_xwhiteouts_dir = layer->has_xwhiteouts &&
                                        ovl_dentry_has_xwhiteouts(dentry);

                if (next == -1) {
                        /*
                         * Lowest layer: its entries go before the upper
                         * ones, which keeps offsets reasonably constant
                         */
                        list_add(&rdd.middle, rdd.list);
                        rdd.is_lowest = true;
                        err = ovl_dir_read(&realpath, &rdd);
                        list_del(&rdd.middle);
                } else {
                        err = ovl_dir_read(&realpath, &rdd);
                        if (err)
                                break;
                }
        }
        return err;
}

/*
 * Point od->cursor at the entry with index @pos in the cached entry list.
 * If the list holds fewer than @pos entries the cursor ends up on the list
 * head itself.
 */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
        struct list_head *head = &od->cache->entries;
        struct list_head *node = head->next;
        loff_t skipped;

        for (skipped = 0; node != head && skipped < pos; skipped++)
                node = node->next;

        /* Cursor is safe since the cache is stable */
        od->cursor = node;
}

/*
 * Get the merged-directory cache for @dentry.
 *
 * If the inode already carries a cache whose version matches the inode
 * version, take another reference on it and return it.  Otherwise detach
 * whatever cache pointer the inode had, build a new cache (refcount 1) from
 * all layers and attach it to the inode.
 *
 * Returns the cache, or an ERR_PTR() on allocation or read failure.
 */
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        struct ovl_dir_cache *cache = ovl_dir_cache(inode);
        int err;

        if (cache && ovl_inode_version_get(inode) == cache->version) {
                WARN_ON(!cache->refcount);
                cache->refcount++;
                return cache;
        }
        ovl_set_dir_cache(inode, NULL);

        cache = kzalloc(sizeof(*cache), GFP_KERNEL);
        if (!cache)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&cache->entries);
        cache->root = RB_ROOT;
        cache->refcount = 1;

        err = ovl_dir_read_merged(dentry, &cache->entries, &cache->root);
        if (err) {
                /* Drop whatever was read before the failure */
                ovl_cache_free(&cache->entries);
                kfree(cache);
                return ERR_PTR(err);
        }

        cache->version = ovl_inode_version_get(inode);
        ovl_set_dir_cache(inode, cache);

        return cache;
}

/*
 * Map inode number to lower fs unique range.
 *
 * The top @xinobits bits of @ino must be clear so that @fsid can be stored
 * there.  If they are not, @ino is returned unchanged and, when @warn is
 * set, a rate-limited warning naming the entry is printed.
 */
static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid,
                               const char *name, int namelen, bool warn)
{
        const unsigned int shift = 64 - xinobits;
        u64 fsid_bits;

        if (unlikely(ino >> shift)) {
                /* No room for the fsid: leave the number untouched */
                if (warn)
                        pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n",
                                            namelen, name, ino, xinobits);
                return ino;
        }

        /*
         * The lowest xinobit is reserved for mapping the non-persistent inode
         * numbers range, but this range is only exposed via st_ino, not here.
         */
        fsid_bits = ((u64)fsid) << (shift + 1);

        return ino | fsid_bits;
}

/*
 * Set d_ino for upper entries if needed. Non-upper entries should always report
 * the uppermost real inode ino and should not call this function.
 *
 * When not all layers are on same fs, report real ino also for upper.
 *
 * When all layers are on the same fs, and upper has a reference to
 * copy up origin, call vfs_getattr() on the overlay entry to make
 * sure that d_ino will be consistent with st_ino from stat(2).
 *
 * Also checks the overlay.whiteout xattr by doing a full lookup which will return
 * negative in this case.
 *
 * @path:       overlay path of the directory that contains the entry
 * @p:          cache entry to update; p->ino is always written before
 *              returning and p->is_whiteout is set if the lookup fails or
 *              yields a negative dentry
 * @update_ino: when false, only the lookup (stale/whiteout check) is done
 *              and p->ino is left at p->real_ino
 *
 * Returns 0 on success, or the error from lookup_one()/vfs_getattr().
 */
static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, bool update_ino)

{
        struct dentry *dir = path->dentry;
        struct ovl_fs *ofs = OVL_FS(dir->d_sb);
        struct dentry *this = NULL;
        enum ovl_path_type type;
        u64 ino = p->real_ino;
        int xinobits = ovl_xino_bits(ofs);
        int err = 0;

        /* Nothing to adjust and nothing to check: report the real ino */
        if (!ovl_same_dev(ofs) && !p->check_xwhiteout)
                goto out;

        /* "." and ".." are resolved directly, without a lookup */
        if (p->name[0] == '.') {
                if (p->len == 1) {
                        this = dget(dir);
                        goto get;
                }
                if (p->len == 2 && p->name[1] == '.') {
                        /* we shall not be moved */
                        this = dget(dir->d_parent);
                        goto get;
                }
        }
        /* This checks also for xwhiteouts */
        this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len);
        if (IS_ERR_OR_NULL(this) || !this->d_inode) {
                /* Mark a stale entry */
                p->is_whiteout = true;
                if (IS_ERR(this)) {
                        err = PTR_ERR(this);
                        /* Do not hand the error pointer to dput() below */
                        this = NULL;
                        goto fail;
                }
                goto out;
        }

get:
        if (!ovl_same_dev(ofs) || !update_ino)
                goto out;

        type = ovl_path_type(this);
        if (OVL_TYPE_ORIGIN(type)) {
                struct kstat stat;
                struct path statpath = *path;

                /* stat the overlay entry itself, on the same mount as @path */
                statpath.dentry = this;
                err = vfs_getattr(&statpath, &stat, STATX_INO, 0);
                if (err)
                        goto fail;

                /*
                 * Directory inode is always on overlay st_dev.
                 * Non-dir with ovl_same_dev() could be on pseudo st_dev in case
                 * of xino bits overflow.
                 */
                WARN_ON_ONCE(S_ISDIR(stat.mode) &&
                             dir->d_sb->s_dev != stat.dev);
                ino = stat.ino;
        } else if (xinobits && !OVL_TYPE_UPPER(type)) {
                ino = ovl_remap_lower_ino(ino, xinobits,
                                          ovl_layer_lower(this)->fsid,
                                          p->name, p->len,
                                          ovl_xino_warn(ofs));
        }

out:
        /* Common exit: 'this' may still be NULL here */
        p->ino = ino;
        dput(this);
        return err;

fail:
        /* Log the failure, then share the common exit so the ref is dropped */
        pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n",
                            p->name, err);
        goto out;
}

/*
 * dir_context actor that appends every entry it is given to rdd->list,
 * without any merging or duplicate detection.
 *
 * Returns true to continue iterating; on allocation failure stores -ENOMEM
 * in rdd->err and returns false.
 */
static bool ovl_fill_plain(struct dir_context *ctx, const char *name,
                          int namelen, loff_t offset, u64 ino,
                          unsigned int d_type)
{
        struct ovl_readdir_data *rdd =
                container_of(ctx, struct ovl_readdir_data, ctx);
        struct ovl_cache_entry *entry;

        rdd->count++;

        entry = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
        if (entry) {
                list_add_tail(&entry->l_node, rdd->list);
                return true;
        }

        rdd->err = -ENOMEM;
        return false;
}

/*
 * Build the "impure" cache for the upper directory of @path.
 *
 * All upper entries are read into @list.  Each one except "." and ".." is
 * passed through ovl_cache_update(); entries whose resulting ino equals the
 * real ino are dropped again, the others are also linked into the name tree
 * @root.
 *
 * Returns 0 on success, the error from ovl_dir_read()/ovl_cache_update(), or
 * -EIO if ovl_cache_entry_find_link() reports a non-zero result.  On error
 * the entries still on @list are left for the caller to free.
 */
static int ovl_dir_read_impure(const struct path *path,  struct list_head *list,
                               struct rb_root *root)
{
        struct ovl_readdir_data rdd = {
                .ctx.actor = ovl_fill_plain,
                .list = list,
                .root = root,
        };
        struct ovl_cache_entry *p, *n;
        struct path realpath;
        int err;

        INIT_LIST_HEAD(list);
        *root = RB_ROOT;
        ovl_path_upper(path->dentry, &realpath);

        err = ovl_dir_read(&realpath, &rdd);
        if (err)
                return err;

        list_for_each_entry_safe(p, n, list, l_node) {
                struct rb_node **link = &root->rb_node;
                struct rb_node *parent = NULL;
                bool dot_entry = !strcmp(p->name, ".") ||
                                 !strcmp(p->name, "..");

                if (!dot_entry) {
                        err = ovl_cache_update(path, p, true);
                        if (err)
                                return err;
                }

                /* Nothing to translate for this name: forget it */
                if (p->ino == p->real_ino) {
                        list_del(&p->l_node);
                        kfree(p);
                        continue;
                }

                if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len,
                                                      &link, &parent)))
                        return -EIO;

                rb_link_node(&p->node, parent, link);
                rb_insert_color(&p->node, root);
        }

        return 0;
}

/*
 * Get the impure-directory cache for @path.
 *
 * A cache already attached to the inode is reused when its version still
 * matches; otherwise it is freed and rebuilt from the upper directory.
 *
 * Returns the cache, NULL when the rebuilt cache turned out to be empty (in
 * which case the impure marking is cleared), or an ERR_PTR() on failure.
 */
static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
{
        struct dentry *dentry = path->dentry;
        struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
        struct inode *inode = d_inode(dentry);
        struct ovl_dir_cache *cache = ovl_dir_cache(inode);
        int err;

        if (cache && ovl_inode_version_get(inode) == cache->version)
                return cache;

        /* Impure cache is not refcounted, free it here */
        ovl_dir_cache_free(inode);
        ovl_set_dir_cache(inode, NULL);

        cache = kzalloc(sizeof(*cache), GFP_KERNEL);
        if (!cache)
                return ERR_PTR(-ENOMEM);

        err = ovl_dir_read_impure(path, &cache->entries, &cache->root);
        if (err) {
                ovl_cache_free(&cache->entries);
                kfree(cache);
                return ERR_PTR(err);
        }

        if (!list_empty(&cache->entries)) {
                cache->version = ovl_inode_version_get(inode);
                ovl_set_dir_cache(inode, cache);
                return cache;
        }

        /*
         * A good opportunity to get rid of an unneeded "impure" flag.
         * Removing the "impure" xattr is best effort.
         */
        if (!ovl_want_write(dentry)) {
                ovl_removexattr(ofs, ovl_dentry_upper(dentry),
                                OVL_XATTR_IMPURE);
                ovl_drop_write(dentry);
        }
        ovl_clear_flag(OVL_IMPURE, inode);
        kfree(cache);

        return NULL;
}

/*
 * State for iterating a real directory while translating d_ino values.
 * ovl_fill_real() recovers this structure from the embedded @ctx.
 */
struct ovl_readdir_translate {
        /* Caller's context; every entry is forwarded to orig_ctx->actor */
        struct dir_context *orig_ctx;
        /* Optional impure dir cache; entries found by name supply the ino */
        struct ovl_dir_cache *cache;
        /* Context handed to iterate_dir() on the real file */
        struct dir_context ctx;
        /* If non-zero, the ino reported for ".." */
        u64 parent_ino;
        /* Arguments passed on to ovl_remap_lower_ino() */
        int fsid;
        int xinobits;
        bool xinowarn;
};

/*
 * dir_context actor used while iterating a real directory: rewrite the
 * inode number of the entry if needed, then pass it on to the original
 * actor.
 *
 * The rewrite rules are tried in order: a known parent ino for "..", a hit
 * in the impure cache (when a cache is present), or the xino remapping.
 */
static bool ovl_fill_real(struct dir_context *ctx, const char *name,
                           int namelen, loff_t offset, u64 ino,
                           unsigned int d_type)
{
        struct ovl_readdir_translate *rdt =
                container_of(ctx, struct ovl_readdir_translate, ctx);
        struct dir_context *orig_ctx = rdt->orig_ctx;

        if (rdt->parent_ino && !strcmp(name, "..")) {
                ino = rdt->parent_ino;
        } else if (rdt->cache) {
                struct ovl_cache_entry *hit =
                        ovl_cache_entry_find(&rdt->cache->root, name, namelen);

                /* Names missing from the cache keep their real ino */
                if (hit)
                        ino = hit->ino;
        } else if (rdt->xinobits) {
                ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid,
                                          name, namelen, rdt->xinowarn);
        }

        return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
}

static bool ovl_is_impure_dir(struct file *file)
{
        struct ovl_dir_file *od = file->private_data;
        struct inode *dir = file_inode(file);

        /*
         * Only upper dir can be impure, but if we are in the middle of
         * iterating a lower real dir, dir could be copied up and marked
         * impure. We only want the impure cache if we started iterating
         * a real upper dir to begin with.
         */
        return od->is_upper && ovl_test_flag(OVL_IMPURE, dir);

}

static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
{
        int err;
        struct ovl_dir_file *od = file->private_data;
        struct dentry *dir = file->f_path.dentry;
        struct ovl_fs *ofs = OVL_FS(dir->d_sb);
        const struct ovl_layer *lower_layer = ovl_layer_lower(dir);
        struct ovl_readdir_translate rdt = {
                .ctx.actor = ovl_fill_real,
                .orig_ctx = ctx,
                .xinobits = ovl_xino_bits(ofs),
                .xinowarn = ovl_xino_warn(ofs),
        };

        if (rdt.xinobits && lower_layer)
                rdt.fsid = lower_layer->fsid;

        if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) {
                struct kstat stat;
                struct path statpath = file->f_path;

                statpath.dentry = dir->d_parent;
                err = vfs_getattr(&statpath, &stat, STATX_INO, 0);
                if (err)
                        return err;

                WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev);
                rdt.parent_ino = stat.ino;
        }

        if (ovl_is_impure_dir(file)) {
                rdt.cache = ovl_cache_get_impure(&file->f_path);
                if (IS_ERR(rdt.cache))
                        return PTR_ERR(rdt.cache);
        }

        err = iterate_dir(od->realfile, &rdt.ctx);
        ctx->pos = rdt.ctx.pos;

        return err;
}


/*
 * Directory iteration entry point for overlay directories.
 *
 * Runs with the overlay's override credentials.  A "real" directory is
 * iterated through its real file, either directly or via
 * ovl_iterate_real() when d_ino values need translating.  Any other
 * directory is served from the merged-entry cache, which is built on first
 * use and walked with od->cursor.
 *
 * Returns 0 on success or a negative error.
 */
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
        struct ovl_dir_file *od = file->private_data;
        struct dentry *dentry = file->f_path.dentry;
        struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
        struct ovl_cache_entry *p;
        const struct cred *old_cred;
        int err;

        old_cred = ovl_override_creds(dentry->d_sb);
        /* Starting from offset 0: re-evaluate the per-open state first */
        if (!ctx->pos)
                ovl_dir_reset(file);

        if (od->is_real) {
                /*
                 * If parent is merge, then need to adjust d_ino for '..', if
                 * dir is impure then need to adjust d_ino for copied up
                 * entries.
                 */
                if (ovl_xino_bits(ofs) ||
                    (ovl_same_fs(ofs) &&
                     (ovl_is_impure_dir(file) ||
                      OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) {
                        err = ovl_iterate_real(file, ctx);
                } else {
                        err = iterate_dir(od->realfile, ctx);
                }
                goto out;
        }

        if (!od->cache) {
                struct ovl_dir_cache *cache;

                cache = ovl_cache_get(dentry);
                /* err is only meaningful if cache is an error pointer */
                err = PTR_ERR(cache);
                if (IS_ERR(cache))
                        goto out;

                od->cache = cache;
                ovl_seek_cursor(od, ctx->pos);
        }

        /* Emit entries until the cursor wraps back to the list head */
        while (od->cursor != &od->cache->entries) {
                p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
                if (!p->is_whiteout) {
                        /* Resolve d_ino and/or the xwhiteout check on demand */
                        if (!p->ino || p->check_xwhiteout) {
                                err = ovl_cache_update(&file->f_path, p, !p->ino);
                                if (err)
                                        goto out;
                        }
                }
                /* ovl_cache_update() sets is_whiteout on stale entry */
                if (!p->is_whiteout) {
                        /* Stop without advancing so this entry is retried */
                        if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
                                break;
                }
                od->cursor = p->l_node.next;
                ctx->pos++;
        }
        err = 0;
out:
        revert_creds(old_cred);
        return err;
}

/*
 * llseek for overlay directories, serialized by the inode lock.
 *
 * A real directory forwards the seek to its real file and mirrors the
 * resulting position.  Otherwise only SEEK_SET and SEEK_CUR with a
 * non-negative result are accepted; the offset is an index into the cached
 * entry list, and the cursor is repositioned if a cache exists.
 *
 * Returns the new offset, the result of vfs_llseek(), or -EINVAL.
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
        loff_t res;
        struct ovl_dir_file *od = file->private_data;

        inode_lock(file_inode(file));
        /* Currently at offset 0: re-evaluate the per-open state first */
        if (!file->f_pos)
                ovl_dir_reset(file);

        if (od->is_real) {
                res = vfs_llseek(od->realfile, offset, origin);
                file->f_pos = od->realfile->f_pos;
        } else {
                /* Default result for every rejected request below */
                res = -EINVAL;

                switch (origin) {
                case SEEK_CUR:
                        offset += file->f_pos;
                        break;
                case SEEK_SET:
                        break;
                default:
                        goto out_unlock;
                }
                if (offset < 0)
                        goto out_unlock;

                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        /* Without a cache there is no cursor to move yet */
                        if (od->cache)
                                ovl_seek_cursor(od, offset);
                }
                res = offset;
        }
out_unlock:
        inode_unlock(file_inode(file));

        return res;
}

/*
 * Open @realpath read-only with the overlay's override credentials,
 * carrying over O_LARGEFILE from @file.
 *
 * Returns whatever ovl_path_open() returns.
 */
static struct file *ovl_dir_open_realfile(const struct file *file,
                                          const struct path *realpath)
{
        const struct cred *saved_cred;
        struct file *realfile;

        saved_cred = ovl_override_creds(file_inode(file)->i_sb);
        realfile = ovl_path_open(realpath,
                                 O_RDONLY | (file->f_flags & O_LARGEFILE));
        revert_creds(saved_cred);

        return realfile;
}

/*
 * Like ovl_real_fdget(), returns upperfile if dir was copied up since open.
 * Unlike ovl_real_fdget(), this caches upperfile in file->private_data.
 *
 * If the directory has no upper, returns NULL when @want_upper is set and
 * the file opened at open time otherwise.  May return an ERR_PTR() if
 * opening the upper directory fails.
 *
 * TODO: use same abstract type for file->private_data of dir and file so
 * upperfile could also be cached for files as well.
 */
struct file *ovl_dir_real_file(const struct file *file, bool want_upper)
{

        struct ovl_dir_file *od = file->private_data;
        struct dentry *dentry = file->f_path.dentry;
        struct file *old, *realfile = od->realfile;

        if (!OVL_TYPE_UPPER(ovl_path_type(dentry)))
                return want_upper ? NULL : realfile;

        /*
         * Need to check if we started out being a lower dir, but got copied up
         */
        if (!od->is_upper) {
                realfile = READ_ONCE(od->upperfile);
                if (!realfile) {
                        struct path upperpath;

                        ovl_path_upper(dentry, &upperpath);
                        realfile = ovl_dir_open_realfile(file, &upperpath);
                        if (IS_ERR(realfile))
                                return realfile;

                        /*
                         * Install our file only if nobody else did in the
                         * meantime; otherwise drop ours and use theirs.
                         */
                        old = cmpxchg_release(&od->upperfile, NULL, realfile);
                        if (old) {
                                fput(realfile);
                                realfile = old;
                        }
                }
        }

        return realfile;
}

/*
 * fsync for overlay directories: sync the upper real directory, if any.
 *
 * A result <= 0 from ovl_sync_status() is returned as is.  Returns 0 when
 * there is no upper file, a negative error if it could not be obtained, and
 * otherwise the result of vfs_fsync_range().
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
                         int datasync)
{
        struct file *upperfile;
        int status;

        status = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
        if (status <= 0)
                return status;

        upperfile = ovl_dir_real_file(file, true);
        if (IS_ERR(upperfile))
                return PTR_ERR(upperfile);

        /* Nothing to sync for lower */
        if (!upperfile)
                return 0;

        return vfs_fsync_range(upperfile, start, end, datasync);
}

/*
 * Release an overlay directory file: drop the cache reference (under the
 * inode lock), put the real file(s) and free the per-open state.
 *
 * Always returns 0.
 */
static int ovl_dir_release(struct inode *inode, struct file *file)
{
        struct ovl_dir_file *od = file->private_data;
        struct file *upper;

        if (od->cache) {
                inode_lock(inode);
                ovl_cache_put(od, inode);
                inode_unlock(inode);
        }

        fput(od->realfile);

        /* Only set if an upper file was opened after the original open */
        upper = od->upperfile;
        if (upper)
                fput(upper);

        kfree(od);

        return 0;
}

/*
 * Open an overlay directory: allocate the per-open state, open the real
 * directory and record whether it is real and whether it is upper.
 *
 * Returns 0 on success, -ENOMEM, or the error from opening the real file.
 */
static int ovl_dir_open(struct inode *inode, struct file *file)
{
        struct ovl_dir_file *od = kzalloc(sizeof(*od), GFP_KERNEL);
        enum ovl_path_type type;
        struct file *realfile;
        struct path realpath;

        if (!od)
                return -ENOMEM;

        type = ovl_path_real(file->f_path.dentry, &realpath);
        realfile = ovl_dir_open_realfile(file, &realpath);
        if (IS_ERR(realfile))
                goto err_free;

        od->realfile = realfile;
        od->is_real = ovl_dir_is_real(inode);
        od->is_upper = OVL_TYPE_UPPER(type);
        file->private_data = od;

        return 0;

err_free:
        kfree(od);
        return PTR_ERR(realfile);
}

/*
 * NOTE(review): WRAP_DIR_ITER() is defined outside this file; presumably it
 * generates the shared_ovl_iterate() wrapper referenced below - confirm.
 */
WRAP_DIR_ITER(ovl_iterate) // FIXME!
/* File operations for overlay directories */
const struct file_operations ovl_dir_operations = {
        .read                = generic_read_dir,
        .open                = ovl_dir_open,
        .iterate_shared        = shared_ovl_iterate,
        .llseek                = ovl_dir_llseek,
        .fsync                = ovl_dir_fsync,
        .release        = ovl_dir_release,
};

/*
 * Check whether the merged directory @dentry is empty.
 *
 * All layers are read into @list.  The list is then pruned so that, on a
 * 0 return, it holds only whiteouts that live in the upper directory;
 * lower whiteouts and the "." / ".." entries are freed.
 *
 * Returns 0 if empty, -ENOTEMPTY at the first real entry (the rest of
 * @list is then left unpruned), or the error from reading the directory.
 */
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
        struct rb_root root = RB_ROOT;
        struct ovl_cache_entry *p, *n;
        const struct cred *old_cred;
        int err;

        old_cred = ovl_override_creds(dentry->d_sb);
        err = ovl_dir_read_merged(dentry, list, &root);
        revert_creds(old_cred);
        if (err)
                return err;

        list_for_each_entry_safe(p, n, list, l_node) {
                if (p->is_whiteout) {
                        /*
                         * Select whiteouts in upperdir, they should
                         * be cleared when deleting this directory.
                         */
                        if (p->is_upper)
                                continue;
                } else if (!(p->name[0] == '.' &&
                             (p->len == 1 ||
                              (p->len == 2 && p->name[1] == '.')))) {
                        /* A real entry other than "." or ".." */
                        err = -ENOTEMPTY;
                        break;
                }

                list_del(&p->l_node);
                kfree(p);
        }

        return err;
}

/*
 * Remove from the upper directory @upper every whiteout named on @list.
 *
 * Entries that are not upper whiteouts trigger a warning and are skipped.
 * Lookup failures are logged and skipped.  Runs under the inode lock of
 * @upper.
 */
void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
                           struct list_head *list)
{
        struct ovl_cache_entry *p;
        struct dentry *victim;

        inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
        list_for_each_entry(p, list, l_node) {
                if (WARN_ON(!p->is_whiteout || !p->is_upper))
                        continue;

                victim = ovl_lookup_upper(ofs, p->name, upper, p->len);
                if (IS_ERR(victim)) {
                        pr_err("lookup '%s/%.*s' failed (%i)\n",
                               upper->d_name.name, p->len, p->name,
                               (int) PTR_ERR(victim));
                        continue;
                }

                /* A negative dentry means there is nothing left to remove */
                if (victim->d_inode)
                        ovl_cleanup(ofs, upper->d_inode, victim);
                dput(victim);
        }
        inode_unlock(upper->d_inode);
}

/*
 * dir_context actor that records in rdd->d_type_supported whether any entry
 * other than "." and ".." carries a d_type different from DT_UNKNOWN.
 *
 * Always returns true so that iteration continues.
 */
static bool ovl_check_d_type(struct dir_context *ctx, const char *name,
                          int namelen, loff_t offset, u64 ino,
                          unsigned int d_type)
{
        struct ovl_readdir_data *rdd =
                container_of(ctx, struct ovl_readdir_data, ctx);
        /* Even if d_type is not supported, DT_DIR is returned for . and .. */
        bool dot_entry = !strncmp(name, ".", namelen) ||
                         !strncmp(name, "..", namelen);

        if (!dot_entry && d_type != DT_UNKNOWN)
                rdd->d_type_supported = true;

        return true;
}

/*
 * Probe the directory at @realpath for d_type support.
 *
 * Returns 1 if d_type is supported, 0 not supported/unknown. Negative values
 * if error is encountered.
 */
int ovl_check_d_type_supported(const struct path *realpath)
{
        struct ovl_readdir_data rdd = {
                .ctx.actor = ovl_check_d_type,
                .d_type_supported = false,
        };
        int ret = ovl_dir_read(realpath, &rdd);

        return ret ? ret : rdd.d_type_supported;
}

#define OVL_INCOMPATDIR_NAME "incompat"

/*
 * Clean up the contents of the directory at @path.
 *
 * The directory is read into a private list first; then, under the
 * directory's inode lock, every entry except "." and ".." is looked up and
 * passed to ovl_workdir_cleanup() with the same @level.
 *
 * Returns 0 on success, -EINVAL if an incompat feature entry is found, or
 * the first error from reading or cleaning up.
 */
static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path,
                                       int level)
{
        int err;
        struct inode *dir = path->dentry->d_inode;
        LIST_HEAD(list);
        struct ovl_cache_entry *p;
        struct ovl_readdir_data rdd = {
                .ctx.actor = ovl_fill_plain,
                .list = &list,
        };
        bool incompat = false;

        /*
         * The "work/incompat" directory is treated specially - if it is not
         * empty, instead of printing a generic error and mounting read-only,
         * we will error about incompat features and fail the mount.
         *
         * When called from ovl_indexdir_cleanup(), path->dentry->d_name.name
         * starts with '#'.
         */
        if (level == 2 &&
            !strcmp(path->dentry->d_name.name, OVL_INCOMPATDIR_NAME))
                incompat = true;

        err = ovl_dir_read(path, &rdd);
        if (err)
                goto out;

        inode_lock_nested(dir, I_MUTEX_PARENT);
        list_for_each_entry(p, &list, l_node) {
                struct dentry *dentry;

                if (p->name[0] == '.') {
                        if (p->len == 1)
                                continue;
                        if (p->len == 2 && p->name[1] == '.')
                                continue;
                } else if (incompat) {
                        /* Only names not starting with '.' reach this branch */
                        pr_err("overlay with incompat feature '%s' cannot be mounted\n",
                                p->name);
                        err = -EINVAL;
                        break;
                }
                dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len);
                /* Lookup failures are skipped, not treated as fatal */
                if (IS_ERR(dentry))
                        continue;
                if (dentry->d_inode)
                        err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level);
                dput(dentry);
                if (err)
                        break;
        }
        inode_unlock(dir);
out:
        /* Frees the entries collected by ovl_fill_plain() */
        ovl_cache_free(&list);
        return err;
}

/*
 * Remove @dentry from the directory inode @dir.
 *
 * Non-directories, and anything at @level greater than 1, go straight to
 * ovl_cleanup().  For a directory, a plain rmdir is tried first; if that
 * fails the lock on @dir is dropped while the contents are cleaned up one
 * level deeper, then retaken before the final ovl_cleanup().
 *
 * Returns 0 on success or a negative error.
 */
int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
                        struct vfsmount *mnt, struct dentry *dentry, int level)
{
        struct path path = { .mnt = mnt, .dentry = dentry };
        int err;

        if (level > 1 || !d_is_dir(dentry))
                return ovl_cleanup(ofs, dir, dentry);

        err = ovl_do_rmdir(ofs, dir, dentry);
        if (!err)
                return 0;

        inode_unlock(dir);
        err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1);
        inode_lock_nested(dir, I_MUTEX_PARENT);
        if (err)
                return err;

        return ovl_cleanup(ofs, dir, dentry);
}

/*
 * Walk the index directory (ofs->workdir) and clean it up.
 *
 * Every entry except "." and ".." is looked up and then either kept
 * (verified), removed (leftover '#' entries, stale or orphan entries) or
 * replaced by a whiteout (orphan entries with nfs_export enabled).
 *
 * Returns 0 on success or the first error encountered; any error is also
 * logged.
 */
int ovl_indexdir_cleanup(struct ovl_fs *ofs)
{
        int err;
        struct dentry *indexdir = ofs->workdir;
        struct dentry *index = NULL;
        struct inode *dir = indexdir->d_inode;
        struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir };
        LIST_HEAD(list);
        struct ovl_cache_entry *p;
        struct ovl_readdir_data rdd = {
                .ctx.actor = ovl_fill_plain,
                .list = &list,
        };

        err = ovl_dir_read(&path, &rdd);
        if (err)
                goto out;

        inode_lock_nested(dir, I_MUTEX_PARENT);
        list_for_each_entry(p, &list, l_node) {
                if (p->name[0] == '.') {
                        if (p->len == 1)
                                continue;
                        if (p->len == 2 && p->name[1] == '.')
                                continue;
                }
                index = ovl_lookup_upper(ofs, p->name, indexdir, p->len);
                if (IS_ERR(index)) {
                        err = PTR_ERR(index);
                        /* Keep the dput() after the loop from seeing an error pointer */
                        index = NULL;
                        break;
                }
                /* Cleanup leftover from index create/cleanup attempt */
                if (index->d_name.name[0] == '#') {
                        err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1);
                        if (err)
                                break;
                        goto next;
                }
                err = ovl_verify_index(ofs, index);
                if (!err) {
                        goto next;
                } else if (err == -ESTALE) {
                        /* Cleanup stale index entries */
                        err = ovl_cleanup(ofs, dir, index);
                } else if (err != -ENOENT) {
                        /*
                         * Abort mount to avoid corrupting the index if
                         * an incompatible index entry was found or on out
                         * of memory.
                         */
                        break;
                } else if (ofs->config.nfs_export) {
                        /*
                         * Whiteout orphan index to block future open by
                         * handle after overlay nlink dropped to zero.
                         */
                        err = ovl_cleanup_and_whiteout(ofs, dir, index);
                } else {
                        /* Cleanup orphan index entries */
                        err = ovl_cleanup(ofs, dir, index);
                }

                if (err)
                        break;

next:
                dput(index);
                index = NULL;
        }
        /* Drops the reference still held when the loop was left via break */
        dput(index);
        inode_unlock(dir);
out:
        ovl_cache_free(&list);
        if (err)
                pr_err("failed index dir cleanup (%i)\n", err);
        return err;
}























































































    1 














    1 

    1 
    1 



























































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
/*
 * linux/fs/nls/nls_base.c
 *
 * Native language support--charsets and unicode translations.
 * By Gordon Chaffee 1996, 1997
 *
 * Unicode based case conversion 1999 by Wolfram Pienkoss
 *
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/nls.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/kmod.h>
#include <linux/spinlock.h>
#include <asm/byteorder.h>

/* Built-in table; 'tables' initially points at it */
static struct nls_table default_table;
/* NOTE(review): presumably the head of the registered-table list - confirm */
static struct nls_table *tables = &default_table;
/* NOTE(review): presumably protects 'tables'; no user is visible here - confirm */
static DEFINE_SPINLOCK(nls_lock);

/*
 * Sample implementation from Unicode home page.
 * http://www.stonehand.com/unicode/standard/fss-utf.html
 *
 * One row per UTF-8 sequence length, shortest first; a row with cmask == 0
 * terminates the table.
 */
struct utf8_table {
        int     cmask;  /* lead byte matches when (byte & cmask) == cval */
        int     cval;   /* marker bits of the lead byte */
        int     shift;  /* right shift giving the value bits of the lead byte */
        long    lmask;  /* largest value representable with this length */
        long    lval;   /* smallest value allowed at this length when decoding */
};

static const struct utf8_table utf8_table[] =
{
    {0x80,  0x00,   0*6,    0x7F,           0,         /* 1 byte sequence */},
    {0xE0,  0xC0,   1*6,    0x7FF,          0x80,      /* 2 byte sequence */},
    {0xF0,  0xE0,   2*6,    0xFFFF,         0x800,     /* 3 byte sequence */},
    {0xF8,  0xF0,   3*6,    0x1FFFFF,       0x10000,   /* 4 byte sequence */},
    {0xFC,  0xF8,   4*6,    0x3FFFFFF,      0x200000,  /* 5 byte sequence */},
    {0xFE,  0xFC,   5*6,    0x7FFFFFFF,     0x4000000, /* 6 byte sequence */},
    {0,                                                       /* end of table    */}
};

/* Largest code point accepted by the converters below. */
#define UNICODE_MAX        0x0010ffff
/* Code points at or above this value are written as UTF-16 surrogate pairs. */
#define PLANE_SIZE        0x00010000

/* (x & SURROGATE_MASK) == SURROGATE_PAIR holds for 0xd800..0xdfff. */
#define SURROGATE_MASK        0xfffff800
#define SURROGATE_PAIR        0x0000d800
/* Bit that is set in the second (low) surrogate of a pair, clear in the first. */
#define SURROGATE_LOW        0x00000400
/* The 10 payload bits carried by each surrogate. */
#define SURROGATE_BITS        0x000003ff

/*
 * utf8_to_utf32 - decode one UTF-8 sequence into a code point
 * @s:     input bytes
 * @inlen: number of bytes available at @s
 * @pu:    receives the decoded code point on success
 *
 * Returns the number of input bytes consumed, or -1 when the input is
 * empty, truncated, has a bad continuation byte, is an overlong encoding,
 * decodes to a surrogate, or decodes to a value above UNICODE_MAX.
 * *pu is only written on success.
 */
int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu)
{
        unsigned long l;
        int c0, c, nc;
        const struct utf8_table *t;

        /* No input: do not read *s and do not report a consumed byte. */
        if (inlen <= 0)
                return -1;

        nc = 0;
        c0 = *s;
        l = c0;
        for (t = utf8_table; t->cmask; t++) {
                nc++;
                /* Does the lead byte announce an nc-byte sequence? */
                if ((c0 & t->cmask) == t->cval) {
                        l &= t->lmask;
                        /* Reject overlong forms, out-of-range values, surrogates. */
                        if (l < t->lval || l > UNICODE_MAX ||
                                        (l & SURROGATE_MASK) == SURROGATE_PAIR)
                                return -1;
                        *pu = (unicode_t) l;
                        return nc;
                }
                /* Another byte is needed; fail if the input ends here. */
                if (inlen <= nc)
                        return -1;
                s++;
                /* A continuation byte is 10xxxxxx; after ^0x80 the top two bits must be 0. */
                c = (*s ^ 0x80) & 0xFF;
                if (c & 0xC0)
                        return -1;
                l = (l << 6) | c;
        }
        return -1;
}
EXPORT_SYMBOL(utf8_to_utf32);

/*
 * utf32_to_utf8 - encode one code point as UTF-8
 * @u:      code point to encode
 * @s:      output buffer; when NULL nothing is written and 0 is returned
 * @maxout: number of bytes available at @s
 *
 * Returns the number of bytes written, or -1 when @u is above UNICODE_MAX,
 * is a surrogate, or does not fit in @maxout bytes.
 */
int utf32_to_utf8(unicode_t u, u8 *s, int maxout)
{
        unsigned long l;
        int c, nc;
        const struct utf8_table *t;

        if (!s)
                return 0;

        l = u;
        if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR)
                return -1;

        nc = 0;
        /*
         * Row k of the table describes a (k + 1)-byte encoding.  maxout is
         * decremented once per row tried, so reaching a row means that many
         * bytes are available.  Test "> 0" rather than non-zero so that a
         * negative maxout means "no room" instead of being taken as space.
         */
        for (t = utf8_table; t->cmask && maxout > 0; t++, maxout--) {
                nc++;
                if (l <= t->lmask) {
                        /* Lead byte: length marker plus the topmost payload bits. */
                        c = t->shift;
                        *s = (u8) (t->cval | (l >> c));
                        /* Continuation bytes: 10xxxxxx, six bits each. */
                        while (c > 0) {
                                c -= 6;
                                s++;
                                *s = (u8) (0x80 | ((l >> c) & 0x3F));
                        }
                        return nc;
                }
        }
        return -1;
}
EXPORT_SYMBOL(utf32_to_utf8);

/*
 * Store the 16-bit unit @c at @s in the byte order selected by @endian.
 * Any value other than UTF16_LITTLE_ENDIAN or UTF16_BIG_ENDIAN stores @c
 * without a byte-order conversion.
 */
static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian)
{
        if (endian == UTF16_LITTLE_ENDIAN)
                *s = __cpu_to_le16(c);
        else if (endian == UTF16_BIG_ENDIAN)
                *s = __cpu_to_be16(c);
        else
                *s = (wchar_t) c;
}

/*
 * utf8s_to_utf16s - convert a UTF-8 string to UTF-16
 * @s:      UTF-8 input
 * @inlen:  number of input bytes
 * @endian: byte order of the UTF-16 output
 * @pwcs:   output buffer
 * @maxout: capacity of @pwcs in 16-bit units
 *
 * Conversion stops at the end of the input, at a NUL input byte, when the
 * output is full, or when a surrogate pair no longer fits.  Returns the
 * number of 16-bit units written, or -EINVAL on a malformed sequence.
 */
int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian,
                wchar_t *pwcs, int maxout)
{
        u16 *out = pwcs;
        int nbytes;
        unicode_t cp;

        while (inlen > 0 && maxout > 0 && *s) {
                /* Plain ASCII: one byte in, one unit out. */
                if (!(*s & 0x80)) {
                        put_utf16(out++, *s++, endian);
                        inlen--;
                        maxout--;
                        continue;
                }

                nbytes = utf8_to_utf32(s, inlen, &cp);
                if (nbytes < 0)
                        return -EINVAL;
                s += nbytes;
                inlen -= nbytes;

                /* Below PLANE_SIZE the code point is a single unit. */
                if (cp < PLANE_SIZE) {
                        put_utf16(out++, cp, endian);
                        maxout--;
                        continue;
                }

                /* Otherwise two units are needed; stop if they do not fit. */
                if (maxout < 2)
                        break;
                cp -= PLANE_SIZE;
                put_utf16(out++,
                          SURROGATE_PAIR | ((cp >> 10) & SURROGATE_BITS),
                          endian);
                put_utf16(out++,
                          SURROGATE_PAIR | SURROGATE_LOW |
                                        (cp & SURROGATE_BITS),
                          endian);
                maxout -= 2;
        }
        return out - pwcs;
}
EXPORT_SYMBOL(utf8s_to_utf16s);

/*
 * Interpret the 16-bit unit @c according to @endian and return it in CPU
 * byte order.  Any value other than UTF16_LITTLE_ENDIAN or
 * UTF16_BIG_ENDIAN returns @c unchanged.
 */
static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
{
        if (endian == UTF16_LITTLE_ENDIAN)
                return __le16_to_cpu(c);
        if (endian == UTF16_BIG_ENDIAN)
                return __be16_to_cpu(c);
        return c;
}

/*
 * utf16s_to_utf8s - convert a UTF-16 string to UTF-8
 * @pwcs:   UTF-16 input
 * @inlen:  number of 16-bit input units
 * @endian: byte order of the input
 * @s:      output buffer
 * @maxout: capacity of @s in bytes
 *
 * Conversion stops at the end of the input, at a zero unit, when the
 * output is full, or when a leading surrogate is the last input unit.
 * Unpaired surrogates and code points that utf32_to_utf8() refuses are
 * skipped.  Returns the number of bytes written.
 */
int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian,
                u8 *s, int maxout)
{
        u8 *out = s;
        int nbytes;
        unsigned long cp, trail;

        while (inlen > 0 && maxout > 0) {
                cp = get_utf16(*pwcs, endian);
                if (!cp)
                        break;
                pwcs++;
                inlen--;

                /* ASCII is copied straight through. */
                if (cp <= 0x7f) {
                        *out++ = (u8) cp;
                        maxout--;
                        continue;
                }

                if ((cp & SURROGATE_MASK) == SURROGATE_PAIR) {
                        /* A trailing surrogate with no leader: drop it. */
                        if (cp & SURROGATE_LOW)
                                continue;
                        if (inlen <= 0)
                                break;
                        trail = get_utf16(*pwcs, endian);
                        /*
                         * The leader is dropped unless a trailing surrogate
                         * follows; the following unit is then handled by
                         * the next iteration.
                         */
                        if ((trail & SURROGATE_MASK) != SURROGATE_PAIR ||
                                        !(trail & SURROGATE_LOW))
                                continue;
                        cp = PLANE_SIZE + ((cp & SURROGATE_BITS) << 10)
                                        + (trail & SURROGATE_BITS);
                        pwcs++;
                        inlen--;
                }

                nbytes = utf32_to_utf8(cp, out, maxout);
                /* On -1 the character is ignored and conversion goes on. */
                if (nbytes != -1) {
                        out += nbytes;
                        maxout -= nbytes;
                }
        }
        return out - s;
}
EXPORT_SYMBOL(utf16s_to_utf8s);

/*
 * Add @nls to the head of the list of known tables and record @owner as
 * its owning module.
 *
 * Returns 0 on success, or -EBUSY when @nls->next is already set or @nls
 * is found on the list.
 */
int __register_nls(struct nls_table *nls, struct module *owner)
{
        struct nls_table **link;

        if (nls->next)
                return -EBUSY;

        nls->owner = owner;
        spin_lock(&nls_lock);
        for (link = &tables; *link; link = &(*link)->next) {
                if (*link == nls) {
                        spin_unlock(&nls_lock);
                        return -EBUSY;
                }
        }
        nls->next = tables;
        tables = nls;
        spin_unlock(&nls_lock);
        return 0;
}
EXPORT_SYMBOL(__register_nls);

/*
 * Unlink @nls from the list of known tables.
 *
 * Returns 0 when the table was found and removed, -EINVAL otherwise.
 */
int unregister_nls(struct nls_table * nls)
{
        struct nls_table **link;
        int ret = -EINVAL;

        spin_lock(&nls_lock);
        for (link = &tables; *link; link = &(*link)->next) {
                if (*link != nls)
                        continue;
                /* Point the predecessor's link past the removed table. */
                *link = nls->next;
                ret = 0;
                break;
        }
        spin_unlock(&nls_lock);
        return ret;
}

/*
 * Find the registered table whose charset name or alias equals @charset
 * and take a reference on its owning module.
 *
 * Returns the table, or NULL when there is no match or the module
 * reference could not be taken.
 */
static struct nls_table *find_nls(const char *charset)
{
        struct nls_table *match = NULL;
        struct nls_table *cur;

        spin_lock(&nls_lock);
        for (cur = tables; cur; cur = cur->next) {
                if (strcmp(cur->charset, charset) == 0 ||
                    (cur->alias && strcmp(cur->alias, charset) == 0)) {
                        match = cur;
                        break;
                }
        }
        if (match && !try_module_get(match->owner))
                match = NULL;
        spin_unlock(&nls_lock);
        return match;
}

/*
 * Look up the NLS table for @charset with find_nls().
 *
 * The lookup is wrapped in try_then_request_module() with the module name
 * pattern "nls_%s" filled in from @charset.
 * NOTE(review): try_then_request_module() is defined outside this file;
 * presumably it asks for the module to be loaded and retries the lookup
 * when the first attempt finds nothing - confirm against its definition.
 *
 * A table returned by find_nls() carries a module reference; drop it with
 * unload_nls().
 */
struct nls_table *load_nls(const char *charset)
{
        return try_then_request_module(find_nls(charset), "nls_%s", charset);
}

/*
 * Drop the module reference held on @nls's owner.  A NULL @nls is
 * accepted and ignored.
 */
void unload_nls(struct nls_table *nls)
{
        if (!nls)
                return;
        module_put(nls->owner);
}

/*
 * Byte -> Unicode map of the built-in default table: an identity mapping,
 * entry i holds the value i.  char2uni() treats a 0x0000 entry as "no
 * mapping", so byte 0x00 is reported as invalid.
 */
static const wchar_t charset2uni[256] = {
        /* 0x00*/
        0x0000, 0x0001, 0x0002, 0x0003,
        0x0004, 0x0005, 0x0006, 0x0007,
        0x0008, 0x0009, 0x000a, 0x000b,
        0x000c, 0x000d, 0x000e, 0x000f,
        /* 0x10*/
        0x0010, 0x0011, 0x0012, 0x0013,
        0x0014, 0x0015, 0x0016, 0x0017,
        0x0018, 0x0019, 0x001a, 0x001b,
        0x001c, 0x001d, 0x001e, 0x001f,
        /* 0x20*/
        0x0020, 0x0021, 0x0022, 0x0023,
        0x0024, 0x0025, 0x0026, 0x0027,
        0x0028, 0x0029, 0x002a, 0x002b,
        0x002c, 0x002d, 0x002e, 0x002f,
        /* 0x30*/
        0x0030, 0x0031, 0x0032, 0x0033,
        0x0034, 0x0035, 0x0036, 0x0037,
        0x0038, 0x0039, 0x003a, 0x003b,
        0x003c, 0x003d, 0x003e, 0x003f,
        /* 0x40*/
        0x0040, 0x0041, 0x0042, 0x0043,
        0x0044, 0x0045, 0x0046, 0x0047,
        0x0048, 0x0049, 0x004a, 0x004b,
        0x004c, 0x004d, 0x004e, 0x004f,
        /* 0x50*/
        0x0050, 0x0051, 0x0052, 0x0053,
        0x0054, 0x0055, 0x0056, 0x0057,
        0x0058, 0x0059, 0x005a, 0x005b,
        0x005c, 0x005d, 0x005e, 0x005f,
        /* 0x60*/
        0x0060, 0x0061, 0x0062, 0x0063,
        0x0064, 0x0065, 0x0066, 0x0067,
        0x0068, 0x0069, 0x006a, 0x006b,
        0x006c, 0x006d, 0x006e, 0x006f,
        /* 0x70*/
        0x0070, 0x0071, 0x0072, 0x0073,
        0x0074, 0x0075, 0x0076, 0x0077,
        0x0078, 0x0079, 0x007a, 0x007b,
        0x007c, 0x007d, 0x007e, 0x007f,
        /* 0x80*/
        0x0080, 0x0081, 0x0082, 0x0083,
        0x0084, 0x0085, 0x0086, 0x0087,
        0x0088, 0x0089, 0x008a, 0x008b,
        0x008c, 0x008d, 0x008e, 0x008f,
        /* 0x90*/
        0x0090, 0x0091, 0x0092, 0x0093,
        0x0094, 0x0095, 0x0096, 0x0097,
        0x0098, 0x0099, 0x009a, 0x009b,
        0x009c, 0x009d, 0x009e, 0x009f,
        /* 0xa0*/
        0x00a0, 0x00a1, 0x00a2, 0x00a3,
        0x00a4, 0x00a5, 0x00a6, 0x00a7,
        0x00a8, 0x00a9, 0x00aa, 0x00ab,
        0x00ac, 0x00ad, 0x00ae, 0x00af,
        /* 0xb0*/
        0x00b0, 0x00b1, 0x00b2, 0x00b3,
        0x00b4, 0x00b5, 0x00b6, 0x00b7,
        0x00b8, 0x00b9, 0x00ba, 0x00bb,
        0x00bc, 0x00bd, 0x00be, 0x00bf,
        /* 0xc0*/
        0x00c0, 0x00c1, 0x00c2, 0x00c3,
        0x00c4, 0x00c5, 0x00c6, 0x00c7,
        0x00c8, 0x00c9, 0x00ca, 0x00cb,
        0x00cc, 0x00cd, 0x00ce, 0x00cf,
        /* 0xd0*/
        0x00d0, 0x00d1, 0x00d2, 0x00d3,
        0x00d4, 0x00d5, 0x00d6, 0x00d7,
        0x00d8, 0x00d9, 0x00da, 0x00db,
        0x00dc, 0x00dd, 0x00de, 0x00df,
        /* 0xe0*/
        0x00e0, 0x00e1, 0x00e2, 0x00e3,
        0x00e4, 0x00e5, 0x00e6, 0x00e7,
        0x00e8, 0x00e9, 0x00ea, 0x00eb,
        0x00ec, 0x00ed, 0x00ee, 0x00ef,
        /* 0xf0*/
        0x00f0, 0x00f1, 0x00f2, 0x00f3,
        0x00f4, 0x00f5, 0x00f6, 0x00f7,
        0x00f8, 0x00f9, 0x00fa, 0x00fb,
        0x00fc, 0x00fd, 0x00fe, 0x00ff,
};

/*
 * Unicode -> byte map for code units 0x0000..0x00ff (high byte 0): an
 * identity mapping.  uni2char() treats a zero entry as "no mapping", so
 * U+0000 is reported as invalid.
 */
static const unsigned char page00[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
        0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
};

/*
 * Per-page lookup used by uni2char(), indexed by the high byte of the
 * 16-bit value.  Only page 0 is given; the remaining entries are
 * implicitly NULL, which uni2char() reports as -EINVAL.
 */
static const unsigned char *const page_uni2charset[256] = {
        page00
};

/*
 * Lower-casing map of the default table: bytes 0x41-0x5a map to
 * 0x61-0x7a, every other byte maps to itself.
 */
static const unsigned char charset2lower[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
        0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
        0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
};

/*
 * Upper-casing map of the default table: bytes 0x61-0x7a map to
 * 0x41-0x5a, every other byte maps to itself.
 */
static const unsigned char charset2upper[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
        0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
        0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
};


/*
 * Translate the 16-bit value @uni to one byte of the default charset.
 *
 * Returns 1 and stores the byte in out[0] on success, -ENAMETOOLONG when
 * @boundlen leaves no room, or -EINVAL when @uni has no mapping.
 */
static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
{
        const unsigned char *page;
        unsigned char lo = uni & 0x00ff;
        unsigned char hi = (uni & 0xff00) >> 8;
        unsigned char mapped;

        if (boundlen <= 0)
                return -ENAMETOOLONG;

        /* The high byte selects the page, the low byte the entry in it. */
        page = page_uni2charset[hi];
        if (!page)
                return -EINVAL;

        /* A zero entry marks a value without a mapping. */
        mapped = page[lo];
        if (!mapped)
                return -EINVAL;

        out[0] = mapped;
        return 1;
}

/*
 * Translate the first byte of @rawstring to its 16-bit value through
 * charset2uni[].  @boundlen is not consulted.
 *
 * Always stores the looked-up value in *uni.  Returns 1 on success or
 * -EINVAL when the table entry is 0x0000.
 */
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
{
        wchar_t mapped = charset2uni[*rawstring];

        *uni = mapped;
        return mapped == 0x0000 ? -EINVAL : 1;
}

/*
 * Built-in table named "default", wired to the identity and case maps
 * defined above.  It is the initial entry of the table list and the
 * fallback returned by load_nls_default().
 */
static struct nls_table default_table = {
        .charset        = "default",
        .uni2char        = uni2char,
        .char2uni        = char2uni,
        .charset2lower        = charset2lower,
        .charset2upper        = charset2upper,
};

/* Returns a simple default translation table */
struct nls_table *load_nls_default(void)
{
        struct nls_table *default_nls;
        
        default_nls = load_nls(CONFIG_NLS_DEFAULT);
        if (default_nls != NULL)
                return default_nls;
        else
                return &default_table;
}

EXPORT_SYMBOL(unregister_nls);
EXPORT_SYMBOL(unload_nls);
EXPORT_SYMBOL(load_nls);
EXPORT_SYMBOL(load_nls_default);

MODULE_LICENSE("Dual BSD/GPL");


























































































































































    1 

































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * bvec iterator
 *
 * Copyright (C) 2001 Ming Lei <ming.lei@canonical.com>
 */
#ifndef __LINUX_BVEC_H
#define __LINUX_BVEC_H

#include <linux/highmem.h>
#include <linux/bug.h>
#include <linux/errno.h>
#include <linux/limits.h>
#include <linux/minmax.h>
#include <linux/types.h>

struct page;

/**
 * struct bio_vec - a contiguous range of physical memory addresses
 * @bv_page:   First page associated with the address range.
 * @bv_len:    Number of bytes in the address range.
 * @bv_offset: Start of the address range relative to the start of @bv_page.
 *
 * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len:
 *
 *   nth_page(@bv_page, n) == @bv_page + n
 *
 * This holds because page_is_mergeable() checks the above property.
 */
struct bio_vec {
        /* first page of the range */
        struct page        *bv_page;
        /* length of the range in bytes */
        unsigned int        bv_len;
        /* byte offset of the range from the start of bv_page */
        unsigned int        bv_offset;
};

/**
 * bvec_set_page - initialize a bvec based off a struct page
 * @bv:                bvec to initialize
 * @page:        page the bvec should point to
 * @len:        length of the bvec
 * @offset:        offset into the page
 */
static inline void bvec_set_page(struct bio_vec *bv, struct page *page,
                unsigned int len, unsigned int offset)
{
        bv->bv_page = page;
        bv->bv_len = len;
        bv->bv_offset = offset;
}

/**
 * bvec_set_folio - fill in a bvec from a folio
 * @bv:                bvec to fill in
 * @folio:        folio whose embedded page becomes the bvec's first page
 * @len:        number of bytes the bvec covers
 * @offset:        byte offset of the data within @folio
 */
static inline void bvec_set_folio(struct bio_vec *bv, struct folio *folio,
                unsigned int len, unsigned int offset)
{
        struct page *first = &folio->page;

        bvec_set_page(bv, first, len, offset);
}

/**
 * bvec_set_virt - fill in a bvec from a virtual address
 * @bv:                bvec to fill in
 * @vaddr:        virtual address the bvec should describe
 * @len:        number of bytes the bvec covers
 *
 * The page and in-page offset are derived from @vaddr with virt_to_page()
 * and offset_in_page().
 */
static inline void bvec_set_virt(struct bio_vec *bv, void *vaddr,
                unsigned int len)
{
        struct page *page = virt_to_page(vaddr);
        unsigned int offset = offset_in_page(vaddr);

        bvec_set_page(bv, page, len, offset);
}

/*
 * Cursor over an array of struct bio_vec.  bi_idx selects the current
 * entry, bi_bvec_done is the position inside it and bi_size is what is
 * left to walk; see bvec_iter_advance() and bvec_iter_advance_single().
 * The layout is packed and 4-byte aligned.
 */
struct bvec_iter {
        sector_t                bi_sector;        /* device address in 512 byte
                                                   sectors */
        unsigned int                bi_size;        /* residual I/O count */

        unsigned int                bi_idx;                /* current index into bvl_vec */

        unsigned int            bi_bvec_done;        /* number of bytes completed in
                                                   current bvec */
} __packed __aligned(4);

/*
 * State for walking bvecs one page-sized piece at a time; set up by
 * bvec_init_iter_all() and stepped by bvec_advance().
 */
struct bvec_iter_all {
        /* the piece produced by the latest bvec_advance() call */
        struct bio_vec        bv;
        /* incremented by bvec_advance() each time a source bvec is used up */
        int                idx;
        /* bytes of the current source bvec already handed out */
        unsigned        done;
};

/*
 * various member access, note that bio_data should of course not be used
 * on highmem page vectors
 *
 * __bvec_iter_bvec() yields a pointer to the entry of @bvec selected by
 * @iter's bi_idx.
 */
#define __bvec_iter_bvec(bvec, iter)        (&(bvec)[(iter).bi_idx])

/*
 * multi-page (mp_bvec) helpers
 *
 * These describe what is left of the current bvec entry as a whole: the
 * offset may exceed PAGE_SIZE and the length may span several pages.
 */

/* First page of the current entry. */
#define mp_bvec_iter_page(bvec, iter)                                \
        (__bvec_iter_bvec((bvec), (iter))->bv_page)

/* Bytes left in the current entry, capped by the iterator's bi_size. */
#define mp_bvec_iter_len(bvec, iter)                                \
        min((iter).bi_size,                                        \
            __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)

/* Offset of the current position, measured from the entry's first page. */
#define mp_bvec_iter_offset(bvec, iter)                                \
        (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)

/* Number of whole pages between the first page and the current position. */
#define mp_bvec_iter_page_idx(bvec, iter)                        \
        (mp_bvec_iter_offset((bvec), (iter)) / PAGE_SIZE)

/* Temporary struct bio_vec built from the three values above. */
#define mp_bvec_iter_bvec(bvec, iter)                                \
((struct bio_vec) {                                                \
        .bv_page        = mp_bvec_iter_page((bvec), (iter)),        \
        .bv_len                = mp_bvec_iter_len((bvec), (iter)),        \
        .bv_offset        = mp_bvec_iter_offset((bvec), (iter)),        \
})

/*
 * For building single-page bvec in flight
 *
 * These narrow the multi-page view to the single page that holds the
 * current position.
 */

/* Offset of the current position within its own page. */
 #define bvec_iter_offset(bvec, iter)                                \
        (mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE)

/* Remaining length, additionally capped at the end of the current page. */
#define bvec_iter_len(bvec, iter)                                \
        min_t(unsigned, mp_bvec_iter_len((bvec), (iter)),                \
              PAGE_SIZE - bvec_iter_offset((bvec), (iter)))

/* The page that holds the current position. */
#define bvec_iter_page(bvec, iter)                                \
        (mp_bvec_iter_page((bvec), (iter)) +                        \
         mp_bvec_iter_page_idx((bvec), (iter)))

/* Temporary single-page struct bio_vec built from the three values above. */
#define bvec_iter_bvec(bvec, iter)                                \
((struct bio_vec) {                                                \
        .bv_page        = bvec_iter_page((bvec), (iter)),        \
        .bv_len                = bvec_iter_len((bvec), (iter)),        \
        .bv_offset        = bvec_iter_offset((bvec), (iter)),        \
})

/*
 * Move @iter forward by @bytes over the bvec array @bv; the move may cross
 * any number of entries.
 *
 * Returns true on success.  When @bytes exceeds the iterator's remaining
 * size, a one-time warning is issued, bi_size is set to 0 and false is
 * returned.
 */
static inline bool bvec_iter_advance(const struct bio_vec *bv,
                struct bvec_iter *iter, unsigned bytes)
{
        unsigned int idx;

        if (WARN_ONCE(bytes > iter->bi_size,
                     "Attempted to advance past end of bvec iter\n")) {
                iter->bi_size = 0;
                return false;
        }

        iter->bi_size -= bytes;
        /* Count from the start of the current entry. */
        bytes += iter->bi_bvec_done;

        /* Step over every entry that is consumed completely. */
        for (idx = iter->bi_idx; bytes && bytes >= bv[idx].bv_len; idx++)
                bytes -= bv[idx].bv_len;

        iter->bi_idx = idx;
        iter->bi_bvec_done = bytes;
        return true;
}

/*
 * Cheaper variant of bvec_iter_advance() for a step that stays inside the
 * current entry: the caller guarantees that @bytes does not reach past the
 * end of bv[iter->bi_idx].  When the step ends exactly at the end of that
 * entry, the iterator moves on to the start of the next one.
 */
static inline void bvec_iter_advance_single(const struct bio_vec *bv,
                                struct bvec_iter *iter, unsigned int bytes)
{
        unsigned int consumed = iter->bi_bvec_done + bytes;

        if (consumed == bv[iter->bi_idx].bv_len) {
                iter->bi_idx++;
                consumed = 0;
        }
        iter->bi_size -= bytes;
        iter->bi_bvec_done = consumed;
}

/*
 * Loop over the array @bio_vec in single-page pieces, starting from the
 * iterator value @start.  On every pass @bvl holds the current piece (a
 * struct bio_vec, not a pointer) and @iter the position it was taken from.
 * The loop ends when @iter's bi_size reaches zero.
 */
#define for_each_bvec(bvl, bio_vec, iter, start)                        \
        for (iter = (start);                                                \
             (iter).bi_size &&                                                \
                ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1);        \
             bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len))

/*
 * for iterating one bio from start to end
 *
 * A struct bvec_iter value positioned at the first entry with nothing
 * consumed, and with bi_size set to UINT_MAX.
 */
#define BVEC_ITER_ALL_INIT (struct bvec_iter)                                \
{                                                                        \
        .bi_sector        = 0,                                                \
        .bi_size        = UINT_MAX,                                        \
        .bi_idx                = 0,                                                \
        .bi_bvec_done        = 0,                                                \
}

/*
 * Reset @iter_all to the start (index 0, nothing consumed) and return a
 * pointer to its embedded bvec.
 */
static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all)
{
        struct bio_vec *seg = &iter_all->bv;

        iter_all->idx = 0;
        iter_all->done = 0;

        return seg;
}

/*
 * Load the next single-page piece of @bvec into @iter_all->bv.
 *
 * The first call for a given @bvec (done == 0) starts at the page and
 * in-page offset that bv_offset points to; later calls step to the start
 * of the following page.  Once all of @bvec has been handed out, idx is
 * incremented and done is reset to 0.
 */
static inline void bvec_advance(const struct bio_vec *bvec,
                                struct bvec_iter_all *iter_all)
{
        struct bio_vec *seg = &iter_all->bv;

        if (!iter_all->done) {
                seg->bv_page = bvec->bv_page + (bvec->bv_offset >> PAGE_SHIFT);
                seg->bv_offset = bvec->bv_offset & ~PAGE_MASK;
        } else {
                seg->bv_page++;
                seg->bv_offset = 0;
        }

        /* Never run past the end of the page or the end of @bvec. */
        seg->bv_len = min_t(unsigned int, PAGE_SIZE - seg->bv_offset,
                            bvec->bv_len - iter_all->done);
        iter_all->done += seg->bv_len;

        if (iter_all->done == bvec->bv_len) {
                iter_all->done = 0;
                iter_all->idx++;
        }
}

/**
 * bvec_kmap_local - map a bvec into the kernel virtual address space
 * @bvec: bvec to map
 *
 * Only valid for single-page bvecs.  Returns the address of the bvec's
 * data, i.e. the mapping of its page plus its offset; pass that address to
 * kunmap_local() to undo the mapping.
 */
static inline void *bvec_kmap_local(struct bio_vec *bvec)
{
        void *base = kmap_local_page(bvec->bv_page);

        return base + bvec->bv_offset;
}

/**
 * memcpy_from_bvec - copy data from a bvec
 * @to: destination buffer; receives @bvec->bv_len bytes
 * @bvec: bvec to copy from
 *
 * Must be called on single-page bvecs only.
 */
static inline void memcpy_from_bvec(char *to, struct bio_vec *bvec)
{
        memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, bvec->bv_len);
}

/**
 * memcpy_to_bvec - copy data to a bvec
 * @bvec: bvec to copy to
 * @from: source buffer; @bvec->bv_len bytes are read from it
 *
 * Must be called on single-page bvecs only.
 */
static inline void memcpy_to_bvec(struct bio_vec *bvec, const char *from)
{
        memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, bvec->bv_len);
}

/**
 * memzero_bvec - zero all data in a bvec
 * @bvec: bvec to zero
 *
 * Zeroes @bvec->bv_len bytes starting at @bvec->bv_offset within
 * @bvec->bv_page.
 *
 * Must be called on single-page bvecs only.
 */
static inline void memzero_bvec(struct bio_vec *bvec)
{
        memzero_page(bvec->bv_page, bvec->bv_offset, bvec->bv_len);
}

/**
 * bvec_virt - return the virtual address for a bvec
 * @bvec: bvec to return the virtual address for
 *
 * The caller must make sure @bvec->bv_page is not a highmem page; a
 * one-time warning is issued if it is.
 */
static inline void *bvec_virt(struct bio_vec *bvec)
{
        struct page *page = bvec->bv_page;

        WARN_ON_ONCE(PageHighMem(page));
        return page_address(page) + bvec->bv_offset;
}

#endif /* __LINUX_BVEC_H */









































































































































































































































































































































    1 
    1 
















    1 


















































    1 




    1 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#ifndef BTRFS_DELAYED_REF_H
#define BTRFS_DELAYED_REF_H

#include <linux/types.h>
#include <linux/refcount.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <uapi/linux/btrfs_tree.h>

struct btrfs_trans_handle;
struct btrfs_fs_info;

/*
 * Possible values of struct btrfs_delayed_ref_node->action; struct btrfs_ref
 * carries the same enum in its own 'action' member.
 *
 * Numbering starts at 1, so 0 does not name any action.
 */
enum btrfs_delayed_ref_action {
        /* Add one backref to the tree */
        BTRFS_ADD_DELAYED_REF = 1,
        /* Delete one backref from the tree */
        BTRFS_DROP_DELAYED_REF,
        /* Record a full extent allocation */
        BTRFS_ADD_DELAYED_EXTENT,
        /* Not changing ref count on head ref */
        BTRFS_UPDATE_DELAYED_HEAD,
} __packed;

/*
 * Data-extent specific part of a reference.
 *
 * Embedded (in a union with struct btrfs_tree_ref) in both
 * struct btrfs_delayed_ref_node and struct btrfs_ref.
 */
struct btrfs_data_ref {
        /* For EXTENT_DATA_REF */

        /* Inode which refers to this data extent */
        u64 objectid;

        /*
         * file_offset - extent_offset
         *
         * file_offset is the key.offset of the EXTENT_DATA key.
         * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data.
         */
        u64 offset;
};

/*
 * Tree-block (metadata) specific part of a reference.
 *
 * Embedded (in a union with struct btrfs_data_ref) in both
 * struct btrfs_delayed_ref_node and struct btrfs_ref.
 */
struct btrfs_tree_ref {
        /*
         * Level of this tree block.
         *
         * Shared for skinny (TREE_BLOCK_REF) and normal tree ref.
         */
        int level;

        /* For non-skinny metadata, no special member needed */
};

/*
 * One queued reference modification for an extent.
 *
 * The trailing union holds either tree_ref or data_ref; the accessors
 * btrfs_delayed_ref_owner() and btrfs_delayed_ref_offset() pick data_ref when
 * 'type' is BTRFS_EXTENT_DATA_REF_KEY or BTRFS_SHARED_DATA_REF_KEY and treat
 * every other type as a tree ref.
 */
struct btrfs_delayed_ref_node {
        /*
         * rbtree linkage.
         * NOTE(review): presumably links into btrfs_delayed_ref_head::ref_tree
         * - confirm against the insertion code.
         */
        struct rb_node ref_node;
        /*
         * If action is BTRFS_ADD_DELAYED_REF, also link this node to
         * ref_head->ref_add_list, then we do not need to iterate the
         * whole ref_head->ref_tree to find BTRFS_ADD_DELAYED_REF nodes.
         */
        struct list_head add_list;

        /* the starting bytenr of the extent */
        u64 bytenr;

        /* the size of the extent */
        u64 num_bytes;

        /* seq number to keep track of insertion order */
        u64 seq;

        /* The ref_root for this ref */
        u64 ref_root;

        /*
         * The parent for this ref, if this isn't set the ref_root is the
         * reference owner.
         */
        u64 parent;

        /* ref count on this data structure */
        refcount_t refs;

        /*
         * how many refs is this entry adding or deleting.  For
         * head refs, this may be a negative number because it is keeping
         * track of the total mods done to the reference count.
         * For individual refs, this will always be a positive number
         *
         * It may be more than one, since it is possible for a single
         * parent to have more than one ref on an extent
         */
        int ref_mod;

        /* One of enum btrfs_delayed_ref_action, stored in 8 bits. */
        unsigned int action:8;
        /*
         * Reference key type, stored in 8 bits; compared against the
         * BTRFS_*_REF_KEY constants by the accessors in this header.
         */
        unsigned int type:8;

        /* Type specific payload, selected by 'type' (see struct comment). */
        union {
                struct btrfs_tree_ref tree_ref;
                struct btrfs_data_ref data_ref;
        };
};

/*
 * A pending update attached to a delayed ref head (see
 * btrfs_delayed_ref_head::extent_op).
 *
 * Instances are allocated from and returned to
 * btrfs_delayed_extent_op_cachep by btrfs_alloc_delayed_extent_op() and
 * btrfs_free_delayed_extent_op().
 *
 * NOTE(review): presumably update_key / update_flags select whether 'key' /
 * 'flags_to_set' are applied to the extent item - confirm against the code
 * that runs extent ops.
 */
struct btrfs_delayed_extent_op {
        struct btrfs_disk_key key;
        u8 level;
        bool update_key;
        bool update_flags;
        u64 flags_to_set;
};

/*
 * the head refs are used to hold a lock on a given extent, which allows us
 * to make sure that only one process is running the delayed refs
 * at a time for a single extent.  They also store the sum of all the
 * reference count modifications we've queued up.
 *
 * A head is reference counted through 'refs'; btrfs_put_delayed_ref_head()
 * drops one reference and returns the head to btrfs_delayed_ref_head_cachep
 * when the count reaches zero.
 */
struct btrfs_delayed_ref_head {
        u64 bytenr;
        u64 num_bytes;
        /*
         * For insertion into struct btrfs_delayed_ref_root::href_root.
         * Keep it in the same cache line as 'bytenr' for more efficient
         * searches in the rbtree.
         */
        struct rb_node href_node;
        /*
         * the mutex is held while running the refs, and it is also
         * held when checking the sum of reference modifications.
         * Released through btrfs_delayed_ref_unlock().
         */
        struct mutex mutex;

        /* Reference count on this head, see btrfs_put_delayed_ref_head(). */
        refcount_t refs;

        /* Protects 'ref_tree' and 'ref_add_list'. */
        spinlock_t lock;
        struct rb_root_cached ref_tree;
        /* Accumulates the BTRFS_ADD_DELAYED_REF nodes of this head. */
        struct list_head ref_add_list;

        struct btrfs_delayed_extent_op *extent_op;

        /*
         * This is used to track the final ref_mod from all the refs associated
         * with this head ref, this is not adjusted as delayed refs are run,
         * this is meant to track if we need to do the csum accounting or not.
         */
        int total_ref_mod;

        /*
         * This is the current outstanding mod references for this bytenr.  This
         * is used with lookup_extent_info to get an accurate reference count
         * for a bytenr, so it is adjusted as delayed refs are run so that any
         * on disk reference count + ref_mod is accurate.
         */
        int ref_mod;

        /*
         * The root that triggered the allocation when must_insert_reserved is
         * set to true.
         */
        u64 owning_root;

        /*
         * Track reserved bytes when setting must_insert_reserved.  On success
         * or cleanup, we will need to free the reservation.
         */
        u64 reserved_bytes;

        /*
         * when a new extent is allocated, it is just reserved in memory
         * The actual extent isn't inserted into the extent allocation tree
         * until the delayed ref is processed.  must_insert_reserved is
         * used to flag a delayed ref so the accounting can be updated
         * when a full insert is done.
         *
         * It is possible the extent will be freed before it is ever
         * inserted into the extent allocation tree.  In this case
         * we need to update the in ram accounting to properly reflect
         * the free has happened.
         */
        bool must_insert_reserved;

        /*
         * is_data / is_system select the block group type reported by
         * btrfs_ref_head_to_space_flags(): data wins over system, and a head
         * with neither flag set is treated as metadata.
         */
        bool is_data;
        bool is_system;
        bool processing;
};

/*
 * Delayed ref state flags.
 *
 * NOTE(review): presumably these are bit numbers for
 * btrfs_delayed_ref_root::flags - confirm against the set_bit()/test_bit()
 * users.
 */
enum btrfs_delayed_ref_flags {
        /* Indicate that we are flushing delayed refs for the commit */
        BTRFS_DELAYED_REFS_FLUSHING,
};

/*
 * Container for all delayed ref heads, keyed by an rbtree of
 * struct btrfs_delayed_ref_head (linked through their href_node member).
 */
struct btrfs_delayed_ref_root {
        /* head ref rbtree */
        struct rb_root_cached href_root;

        /* dirty extent records */
        struct rb_root dirty_extent_root;

        /* this spin lock protects the rbtree and the entries inside */
        spinlock_t lock;

        /*
         * How many delayed ref updates we've queued, used by the
         * throttling code.
         */
        atomic_t num_entries;

        /* total number of head nodes in tree */
        unsigned long num_heads;

        /* total number of head nodes ready for processing */
        unsigned long num_heads_ready;

        u64 pending_csums;

        /*
         * NOTE(review): presumably a bitmask indexed by
         * enum btrfs_delayed_ref_flags - confirm against users.
         */
        unsigned long flags;

        u64 run_delayed_start;

        /*
         * To make qgroup to skip given root.
         * This is for snapshot, as btrfs_qgroup_inherit() will manually
         * modify counters for snapshot and its source, so we should skip
         * the snapshot in new_root/old_roots or it will get calculated twice
         */
        u64 qgroup_to_skip;
};

/*
 * Kind of reference described by a struct btrfs_ref (its 'type' member).
 *
 * The btrfs_ref_type() helper asserts that the type is either BTRFS_REF_DATA
 * or BTRFS_REF_METADATA; BTRFS_REF_NOT_SET is the zero value.
 */
enum btrfs_ref_type {
        BTRFS_REF_NOT_SET,
        BTRFS_REF_DATA,
        BTRFS_REF_METADATA,
        BTRFS_REF_LAST,
} __packed;

/*
 * Generic description of a reference modification, passed to
 * btrfs_add_delayed_tree_ref() / btrfs_add_delayed_data_ref().
 *
 * 'type' says which member of the trailing union is meaningful; the union is
 * filled in by btrfs_init_tree_ref() / btrfs_init_data_ref().
 */
struct btrfs_ref {
        enum btrfs_ref_type type;
        enum btrfs_delayed_ref_action action;

        /*
         * Whether this extent should go through qgroup record.
         *
         * Normally false, but for certain cases like delayed subtree scan,
         * setting this flag can hugely reduce qgroup overhead.
         *
         * NOTE(review): given the field name, true presumably means the
         * qgroup record is skipped - confirm against the qgroup code.
         */
        bool skip_qgroup;

#ifdef CONFIG_BTRFS_FS_REF_VERIFY
        /* Through which root is this modification. */
        u64 real_root;
#endif
        u64 bytenr;
        u64 num_bytes;
        u64 owning_root;

        /*
         * The root that owns the reference for this reference, this will be set
         * or ->parent will be set, depending on what type of reference this is.
         */
        u64 ref_root;

        /*
         * Bytenr of the parent tree block.  A non-zero value makes
         * btrfs_ref_type() report one of the SHARED_* key types.
         */
        u64 parent;
        union {
                struct btrfs_data_ref data_ref;
                struct btrfs_tree_ref tree_ref;
        };
};

extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
extern struct kmem_cache *btrfs_delayed_ref_node_cachep;
extern struct kmem_cache *btrfs_delayed_extent_op_cachep;

int __init btrfs_delayed_ref_init(void);
void __cold btrfs_delayed_ref_exit(void);

/*
 * Compute the metadata reservation for @num_delayed_refs delayed refs.
 *
 * The base amount is what btrfs_calc_insert_metadata_size() reports.  It is
 * doubled when the FREE_SPACE_TREE mount option is set, because that tree has
 * to be modified as well.  The mount option (not the compat_ro bit) is tested
 * since the free space tree may be in the middle of being enabled for the
 * first time, before the compat_ro option is set.
 */
static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_info,
                                               int num_delayed_refs)
{
        const u64 base = btrfs_calc_insert_metadata_size(fs_info, num_delayed_refs);

        if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE))
                return base;

        /* Extra room for the free space tree updates. */
        return base * 2;
}

/*
 * Compute the metadata reservation for deleting @num_csum_items csum items.
 *
 * Only the csum tree is touched: deleting csum items creates no new
 * nodes/leaves and does not change the free space tree, so the plain
 * btrfs_calc_metadata_size() amount is sufficient.
 */
static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info,
                                                    int num_csum_items)
{
        const u64 csum_bytes = btrfs_calc_metadata_size(fs_info, num_csum_items);

        return csum_bytes;
}

void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
                         bool skip_qgroup);
void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
                         u64 mod_root, bool skip_qgroup);

/*
 * Allocate a delayed extent op from btrfs_delayed_extent_op_cachep using
 * GFP_NOFS.  Returns whatever kmem_cache_alloc() returns; the object is
 * released with btrfs_free_delayed_extent_op().
 */
static inline struct btrfs_delayed_extent_op *
btrfs_alloc_delayed_extent_op(void)
{
        struct btrfs_delayed_extent_op *new_op;

        new_op = kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
        return new_op;
}

/*
 * Return @op to btrfs_delayed_extent_op_cachep.  A NULL @op is accepted and
 * ignored.
 */
static inline void
btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
{
        if (!op)
                return;

        kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
}

void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref);

/*
 * Map a delayed ref head to the block group type flag of its extent.
 *
 * is_data takes precedence over is_system; a head with neither flag set is
 * reported as BTRFS_BLOCK_GROUP_METADATA.
 */
static inline u64 btrfs_ref_head_to_space_flags(
                                struct btrfs_delayed_ref_head *head_ref)
{
        if (!head_ref->is_data && !head_ref->is_system)
                return BTRFS_BLOCK_GROUP_METADATA;

        if (head_ref->is_data)
                return BTRFS_BLOCK_GROUP_DATA;

        return BTRFS_BLOCK_GROUP_SYSTEM;
}

/*
 * Drop one reference on @head.  When the count reaches zero the head is
 * returned to btrfs_delayed_ref_head_cachep.
 */
static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head)
{
        /* Other references remain, nothing to free yet. */
        if (!refcount_dec_and_test(&head->refs))
                return;

        kmem_cache_free(btrfs_delayed_ref_head_cachep, head);
}

int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
                               struct btrfs_ref *generic_ref,
                               struct btrfs_delayed_extent_op *extent_op);
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
                               struct btrfs_ref *generic_ref,
                               u64 reserved);
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
                                struct btrfs_delayed_extent_op *extent_op);
void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
                              struct btrfs_delayed_ref_root *delayed_refs,
                              struct btrfs_delayed_ref_head *head);

struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
                            u64 bytenr);
int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
                           struct btrfs_delayed_ref_head *head);
/* Release the per-head mutex of @head (btrfs_delayed_ref_head::mutex). */
static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
{
        struct mutex *head_mutex = &head->mutex;

        mutex_unlock(head_mutex);
}
void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
                           struct btrfs_delayed_ref_head *head);

struct btrfs_delayed_ref_head *btrfs_select_ref_head(
                struct btrfs_delayed_ref_root *delayed_refs);

int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);

void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums);
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
                                  enum btrfs_reserve_flush_enum flush);
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
                                       u64 num_bytes);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);

/*
 * Return the "owner" of a delayed ref node: the inode objectid for the two
 * data ref key types, and the tree block level for every other type.
 */
static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
{
        switch (node->type) {
        case BTRFS_EXTENT_DATA_REF_KEY:
        case BTRFS_SHARED_DATA_REF_KEY:
                return node->data_ref.objectid;
        default:
                return node->tree_ref.level;
        }
}

/*
 * Return the data ref offset of a delayed ref node for the two data ref key
 * types; every other type has no offset and yields 0.
 */
static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
{
        switch (node->type) {
        case BTRFS_EXTENT_DATA_REF_KEY:
        case BTRFS_SHARED_DATA_REF_KEY:
                return node->data_ref.offset;
        default:
                return 0;
        }
}

/*
 * Translate a generic ref into its on-disk reference key type.
 *
 *   type      parent set                   parent not set
 *   DATA      BTRFS_SHARED_DATA_REF_KEY    BTRFS_EXTENT_DATA_REF_KEY
 *   other     BTRFS_SHARED_BLOCK_REF_KEY   BTRFS_TREE_BLOCK_REF_KEY
 *
 * The type is asserted to be BTRFS_REF_DATA or BTRFS_REF_METADATA, so "other"
 * means metadata whenever the assertion is active.
 */
static inline u8 btrfs_ref_type(struct btrfs_ref *ref)
{
        ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA);

        if (ref->type == BTRFS_REF_DATA)
                return ref->parent ? BTRFS_SHARED_DATA_REF_KEY :
                                     BTRFS_EXTENT_DATA_REF_KEY;

        return ref->parent ? BTRFS_SHARED_BLOCK_REF_KEY :
                             BTRFS_TREE_BLOCK_REF_KEY;
}

#endif
































































































































































































































































































































































































    9 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kmem

#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KMEM_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

/*
 * kmem_cache_alloc: records an allocation made from slab cache @s.
 *
 * The requested and allocated sizes are not arguments; they are read from the
 * cache itself (s->object_size and s->size).  'accounted' is computed when the
 * event is recorded: it can only be true when CONFIG_MEMCG_KMEM is enabled and
 * either @gfp_flags contains __GFP_ACCOUNT or the cache has SLAB_ACCOUNT set.
 */
TRACE_EVENT(kmem_cache_alloc,

        TP_PROTO(unsigned long call_site,
                 const void *ptr,
                 struct kmem_cache *s,
                 gfp_t gfp_flags,
                 int node),

        TP_ARGS(call_site, ptr, s, gfp_flags, node),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __field(        size_t,                bytes_req        )
                __field(        size_t,                bytes_alloc        )
                __field(        unsigned long,        gfp_flags        )
                __field(        int,                node                )
                __field(        bool,                accounted        )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __entry->bytes_req        = s->object_size;
                __entry->bytes_alloc        = s->size;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
                __entry->node                = node;
                __entry->accounted        = IS_ENABLED(CONFIG_MEMCG_KMEM) ?
                                          ((gfp_flags & __GFP_ACCOUNT) ||
                                          (s->flags & SLAB_ACCOUNT)) : false;
        ),

        TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
                (void *)__entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
                __entry->bytes_alloc,
                show_gfp_flags(__entry->gfp_flags),
                __entry->node,
                __entry->accounted ? "true" : "false")
);

/*
 * kmalloc: records an allocation with caller supplied requested and allocated
 * sizes.
 *
 * Unlike kmem_cache_alloc, the entry has no 'accounted' field; the value shown
 * in the output is derived at print time from the saved gfp_flags
 * (__GFP_ACCOUNT) and CONFIG_MEMCG_KMEM.
 */
TRACE_EVENT(kmalloc,

        TP_PROTO(unsigned long call_site,
                 const void *ptr,
                 size_t bytes_req,
                 size_t bytes_alloc,
                 gfp_t gfp_flags,
                 int node),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __field(        size_t,                bytes_req        )
                __field(        size_t,                bytes_alloc        )
                __field(        unsigned long,        gfp_flags        )
                __field(        int,                node                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __entry->bytes_req        = bytes_req;
                __entry->bytes_alloc        = bytes_alloc;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
                __entry->node                = node;
        ),

        TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
                (void *)__entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
                __entry->bytes_alloc,
                show_gfp_flags(__entry->gfp_flags),
                __entry->node,
                (IS_ENABLED(CONFIG_MEMCG_KMEM) &&
                 (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false")
);

/* kfree: records the call site and the pointer being freed. */
TRACE_EVENT(kfree,

        TP_PROTO(unsigned long call_site, const void *ptr),

        TP_ARGS(call_site, ptr),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
        ),

        TP_printk("call_site=%pS ptr=%p",
                  (void *)__entry->call_site, __entry->ptr)
);

/*
 * kmem_cache_free: records the call site, the pointer being freed and the
 * name of the slab cache.  The name is stored in the entry as a string
 * (declared from s->name), not as a pointer to the cache.
 */
TRACE_EVENT(kmem_cache_free,

        TP_PROTO(unsigned long call_site, const void *ptr, const struct kmem_cache *s),

        TP_ARGS(call_site, ptr, s),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __string(        name,                s->name                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __assign_str(name);
        ),

        TP_printk("call_site=%pS ptr=%p name=%s",
                  (void *)__entry->call_site, __entry->ptr, __get_str(name))
);

/*
 * mm_page_free: records the pfn and order of a page being freed.
 *
 * Only the pfn is stored; the page pointer shown in the output is rebuilt at
 * print time with pfn_to_page().  @page is passed to page_to_pfn() without a
 * NULL check.
 */
TRACE_EVENT(mm_page_free,

        TP_PROTO(struct page *page, unsigned int order),

        TP_ARGS(page, order),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
        ),

        TP_fast_assign(
                __entry->pfn                = page_to_pfn(page);
                __entry->order                = order;
        ),

        TP_printk("page=%p pfn=0x%lx order=%d",
                        pfn_to_page(__entry->pfn),
                        __entry->pfn,
                        __entry->order)
);

/*
 * mm_page_free_batched: records the pfn of a page freed through the batched
 * path.  No order is stored; the output always reports "order=0" because it
 * is part of the format string.
 */
TRACE_EVENT(mm_page_free_batched,

        TP_PROTO(struct page *page),

        TP_ARGS(page),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
        ),

        TP_fast_assign(
                __entry->pfn                = page_to_pfn(page);
        ),

        TP_printk("page=%p pfn=0x%lx order=0",
                        pfn_to_page(__entry->pfn),
                        __entry->pfn)
);

/*
 * mm_page_alloc: records the result of a page allocation.
 *
 * A NULL @page is stored as pfn == -1UL.  At print time that sentinel is
 * turned into a NULL page pointer and pfn 0 instead of being passed to
 * pfn_to_page().
 */
TRACE_EVENT(mm_page_alloc,

        TP_PROTO(struct page *page, unsigned int order,
                        gfp_t gfp_flags, int migratetype),

        TP_ARGS(page, order, gfp_flags, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        unsigned long,        gfp_flags        )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order,
                __entry->migratetype,
                show_gfp_flags(__entry->gfp_flags))
);

/*
 * mm_page: event class shared by page events that carry a page, its order,
 * its migratetype and a percpu_refill value.
 *
 * A NULL @page is stored as pfn == -1UL and printed as a NULL page pointer
 * with pfn 0.
 */
DECLARE_EVENT_CLASS(mm_page,

        TP_PROTO(struct page *page, unsigned int order, int migratetype,
                 int percpu_refill),

        TP_ARGS(page, order, migratetype, percpu_refill),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        int,                migratetype        )
                __field(        int,                percpu_refill        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->migratetype        = migratetype;
                __entry->percpu_refill        = percpu_refill;
        ),

        TP_printk("page=%p pfn=0x%lx order=%u migratetype=%d percpu_refill=%d",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order,
                __entry->migratetype,
                __entry->percpu_refill)
);

/*
 * mm_page_alloc_zone_locked: an instance of the mm_page event class; it
 * reuses that class's entry layout, assignment and print format.
 */
DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked,

        TP_PROTO(struct page *page, unsigned int order, int migratetype,
                 int percpu_refill),

        TP_ARGS(page, order, migratetype, percpu_refill)
);

/*
 * mm_page_pcpu_drain: records the pfn, order and migratetype of a page.
 *
 * A NULL @page is stored as pfn == -1UL.  The print side must not feed that
 * sentinel to pfn_to_page() or show it as a real pfn, so, like mm_page_alloc
 * and the mm_page event class, it reports a NULL page pointer and pfn 0 in
 * that case.  The format string itself is unchanged.
 */
TRACE_EVENT(mm_page_pcpu_drain,

        TP_PROTO(struct page *page, unsigned int order, int migratetype),

        TP_ARGS(page, order, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order, __entry->migratetype)
);

/*
 * mm_page_alloc_extfrag: records an allocation that fell back to a different
 * order/migratetype than requested.
 *
 * 'change_ownership' is computed when the event is recorded: it is 1 when
 * @alloc_migratetype equals get_pageblock_migratetype(@page).  The
 * 'pageblock_order' and 'fragmenting' (fallback_order < pageblock_order)
 * values in the output are not stored; they are evaluated at print time.
 *
 * @page is passed to page_to_pfn() and get_pageblock_migratetype() without a
 * NULL check.
 */
TRACE_EVENT(mm_page_alloc_extfrag,

        TP_PROTO(struct page *page,
                int alloc_order, int fallback_order,
                int alloc_migratetype, int fallback_migratetype),

        TP_ARGS(page,
                alloc_order, fallback_order,
                alloc_migratetype, fallback_migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                        )
                __field(        int,                alloc_order                )
                __field(        int,                fallback_order                )
                __field(        int,                alloc_migratetype        )
                __field(        int,                fallback_migratetype        )
                __field(        int,                change_ownership        )
        ),

        TP_fast_assign(
                __entry->pfn                        = page_to_pfn(page);
                __entry->alloc_order                = alloc_order;
                __entry->fallback_order                = fallback_order;
                __entry->alloc_migratetype        = alloc_migratetype;
                __entry->fallback_migratetype        = fallback_migratetype;
                __entry->change_ownership        = (alloc_migratetype ==
                                        get_pageblock_migratetype(page));
        ),

        TP_printk("page=%p pfn=0x%lx alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
                pfn_to_page(__entry->pfn),
                __entry->pfn,
                __entry->alloc_order,
                __entry->fallback_order,
                pageblock_order,
                __entry->alloc_migratetype,
                __entry->fallback_migratetype,
                __entry->fallback_order < pageblock_order,
                __entry->change_ownership)
);

/*
 * mm_alloc_contig_migrate_range_info: records a [start, end] range together
 * with migrated/reclaimed/mapped counters and the migratetype.
 *
 * All arguments are stored verbatim.  Note that the output prints migratetype
 * right after start/end, i.e. not in argument order.
 */
TRACE_EVENT(mm_alloc_contig_migrate_range_info,

        TP_PROTO(unsigned long start,
                 unsigned long end,
                 unsigned long nr_migrated,
                 unsigned long nr_reclaimed,
                 unsigned long nr_mapped,
                 int migratetype),

        TP_ARGS(start, end, nr_migrated, nr_reclaimed, nr_mapped, migratetype),

        TP_STRUCT__entry(
                __field(unsigned long, start)
                __field(unsigned long, end)
                __field(unsigned long, nr_migrated)
                __field(unsigned long, nr_reclaimed)
                __field(unsigned long, nr_mapped)
                __field(int, migratetype)
        ),

        TP_fast_assign(
                __entry->start = start;
                __entry->end = end;
                __entry->nr_migrated = nr_migrated;
                __entry->nr_reclaimed = nr_reclaimed;
                __entry->nr_mapped = nr_mapped;
                __entry->migratetype = migratetype;
        ),

        TP_printk("start=0x%lx end=0x%lx migratetype=%d nr_migrated=%lu nr_reclaimed=%lu nr_mapped=%lu",
                  __entry->start,
                  __entry->end,
                  __entry->migratetype,
                  __entry->nr_migrated,
                  __entry->nr_reclaimed,
                  __entry->nr_mapped)
);

/*
 * Required for uniquely and securely identifying mm in rss_stat tracepoint.
 *
 * Returns the low 32 bits of the value produced by ptr_to_hashval(), or 0 when
 * ptr_to_hashval() reports an error.  The __PTR_TO_HASHVAL guard keeps the
 * helper from being defined more than once.
 */
#ifndef __PTR_TO_HASHVAL
static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr)
{
        unsigned long hash;

        if (ptr_to_hashval(ptr, &hash))
                return 0;

        /* The hashed value is only 32-bit */
        return (unsigned int)hash;
}
#define __PTR_TO_HASHVAL
#endif

/*
 * List of the MM_* counters, written in terms of EM()/EMe() so the same list
 * can be expanded in two different ways below.  EMe() marks the last entry.
 */
#define TRACE_MM_PAGES                \
        EM(MM_FILEPAGES)        \
        EM(MM_ANONPAGES)        \
        EM(MM_SWAPENTS)                \
        EMe(MM_SHMEMPAGES)

#undef EM
#undef EMe

/* First expansion: emit one TRACE_DEFINE_ENUM() per list entry. */
#define EM(a)        TRACE_DEFINE_ENUM(a);
#define EMe(a)        TRACE_DEFINE_ENUM(a);

TRACE_MM_PAGES

#undef EM
#undef EMe

/*
 * Second definition: each entry becomes a { value, "name" } pair so that
 * TRACE_MM_PAGES can be used as the table argument of __print_symbolic().
 * Only the last entry (EMe) omits the trailing comma.
 */
#define EM(a)        { a, #a },
#define EMe(a)        { a, #a }

/*
 * rss_stat: records the current value of one RSS counter of @mm.
 *
 *  - mm_id:  hash of the mm pointer from mm_ptr_to_hash(), not the pointer.
 *  - curr:   1 when @mm is current->mm, otherwise 0.
 *  - member: index into mm->rss_stat[], printed symbolically through the
 *            TRACE_MM_PAGES table.
 *  - size:   percpu_counter_sum_positive() of that counter shifted left by
 *            PAGE_SHIFT, printed with a "B" suffix.
 */
TRACE_EVENT(rss_stat,

        TP_PROTO(struct mm_struct *mm,
                int member),

        TP_ARGS(mm, member),

        TP_STRUCT__entry(
                __field(unsigned int, mm_id)
                __field(unsigned int, curr)
                __field(int, member)
                __field(long, size)
        ),

        TP_fast_assign(
                __entry->mm_id = mm_ptr_to_hash(mm);
                __entry->curr = !!(current->mm == mm);
                __entry->member = member;
                __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
                                                            << PAGE_SHIFT);
        ),

        TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
                __entry->mm_id,
                __entry->curr,
                __print_symbolic(__entry->member, TRACE_MM_PAGES),
                __entry->size)
        );
#endif /* _TRACE_KMEM_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



















































































































    1 











































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Internal procfs definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/binfmts.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>

struct ctl_table_header;
struct mempolicy;

/*
 * This is not completely implemented yet. The idea is to
 * create an in-memory tree (like the actual /proc filesystem
 * tree) of these proc_dir_entries, so that we can dynamically
 * add new files to /proc.
 *
 * parent/subdir are used for the directory structure (every /proc file has a
 * parent, but "subdir" is empty for all non-directory entries).
 * subdir_node is used to build the rb tree "subdir" of the parent.
 */
struct proc_dir_entry {
        /*
         * number of callers into module in progress;
         * negative -> it's going away RSN
         */
        atomic_t in_use;
        /* reference count; pde_get() takes a reference, pde_put() drops one */
        refcount_t refcnt;
        struct list_head pde_openers;        /* who did ->open, but not ->release */
        /* protects ->pde_openers and all struct pde_opener instances */
        spinlock_t pde_unload_lock;
        struct completion *pde_unload_completion;
        /* inode operations; is_empty_pde() treats a directory without any as empty */
        const struct inode_operations *proc_iops;
        /* the two members share storage, so only one of them is meaningful per entry */
        union {
                const struct proc_ops *proc_ops;
                const struct file_operations *proc_dir_ops;
        };
        /* dentry operations; pde_force_lookup() points this at proc_net_dentry_ops */
        const struct dentry_operations *proc_dops;
        /* the two members share storage, so only one of them is meaningful per entry */
        union {
                const struct seq_operations *seq_ops;
                int (*single_show)(struct seq_file *, void *);
        };
        proc_write_t write;
        void *data;
        unsigned int state_size;
        unsigned int low_ino;
        nlink_t nlink;
        kuid_t uid;
        kgid_t gid;
        loff_t size;
        /* directory structure: every entry has a parent */
        struct proc_dir_entry *parent;
        /* rb tree of children; empty for all non-directory entries */
        struct rb_root subdir;
        /* this entry's node in the parent's "subdir" rb tree */
        struct rb_node subdir_node;
        char *name;
        /* file type and permission bits; tested with S_ISDIR() in is_empty_pde() */
        umode_t mode;
        /* flag bits, e.g. PROC_ENTRY_PERMANENT (see pde_is_permanent()) */
        u8 flags;
        u8 namelen;
        /*
         * Flexible array member at the end of the entry.
         * NOTE(review): presumably in-object storage for short names, sized
         * by SIZEOF_PDE_INLINE_NAME -- confirm against the allocation code.
         */
        char inline_name[];
} __randomize_layout;

/*
 * SIZEOF_PDE picks the first of 128, 192, 256 or 512 that is strictly
 * greater than sizeof(struct proc_dir_entry).  It evaluates to 0 when the
 * structure is 512 bytes or larger.
 *
 * SIZEOF_PDE_INLINE_NAME is the number of bytes between the end of the
 * structure proper and SIZEOF_PDE.
 */
#define SIZEOF_PDE        (                                \
        sizeof(struct proc_dir_entry) < 128 ? 128 :        \
        sizeof(struct proc_dir_entry) < 192 ? 192 :        \
        sizeof(struct proc_dir_entry) < 256 ? 256 :        \
        sizeof(struct proc_dir_entry) < 512 ? 512 :        \
        0)
#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry))

/* Report whether the PROC_ENTRY_PERMANENT bit is set in @pde's flags. */
static inline bool pde_is_permanent(const struct proc_dir_entry *pde)
{
        return (pde->flags & PROC_ENTRY_PERMANENT) != 0;
}

/* Set the PROC_ENTRY_PERMANENT bit in @pde's flags; other bits are kept. */
static inline void pde_make_permanent(struct proc_dir_entry *pde)
{
        pde->flags = pde->flags | PROC_ENTRY_PERMANENT;
}

extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);

/*
 * Per-inode operation slot stored in struct proc_inode::op.  The members
 * share storage, so exactly one of them is meaningful for a given inode.
 */
union proc_op {
        /* callback taking a dentry and a path to fill in */
        int (*proc_get_link)(struct dentry *, struct path *);
        /* callback writing into a seq_file for a given namespace/pid/task */
        int (*proc_show)(struct seq_file *m,
                struct pid_namespace *ns, struct pid *pid,
                struct task_struct *task);
        /* NOTE(review): presumably an LSM identifier -- confirm with users */
        int lsmid;
};

/*
 * procfs-private inode data.  The VFS inode is embedded as ->vfs_inode;
 * PROC_I() maps a struct inode pointer back to its containing proc_inode.
 */
struct proc_inode {
        struct pid *pid;                /* returned by proc_pid() */
        unsigned int fd;
        union proc_op op;
        struct proc_dir_entry *pde;        /* returned by PDE() */
        struct ctl_table_header *sysctl;
        struct ctl_table *sysctl_entry;
        struct hlist_node sibling_inodes;
        const struct proc_ns_operations *ns_ops;
        struct inode vfs_inode;                /* embedded VFS inode, see PROC_I() */
} __randomize_layout;

/*
 * General functions
 */
/* Map a VFS inode back to the proc_inode that embeds it as ->vfs_inode. */
static inline struct proc_inode *PROC_I(const struct inode *inode)
{
        struct proc_inode *pi;

        pi = container_of(inode, struct proc_inode, vfs_inode);
        return pi;
}

static inline struct proc_dir_entry *PDE(const struct inode *inode)
{
        return PROC_I(inode)->pde;
}

static inline struct pid *proc_pid(const struct inode *inode)
{
        return PROC_I(inode)->pid;
}

/*
 * Look up the task for @inode: fetch the inode's struct pid and pass it to
 * get_pid_task() with PIDTYPE_PID.  Returns whatever get_pid_task() returns.
 */
static inline struct task_struct *get_proc_task(const struct inode *inode)
{
        struct pid *pid = proc_pid(inode);

        return get_pid_task(pid, PIDTYPE_PID);
}

void task_dump_owner(struct task_struct *task, umode_t mode,
                     kuid_t *ruid, kgid_t *rgid);

unsigned name_to_int(const struct qstr *qstr);
/*
 * Offset of the first process in the /proc root directory.
 */
#define FIRST_PROCESS_ENTRY 256

/* Worst case buffer size needed for holding an integer. */
#define PROC_NUMBUF 13

/*
 * array.c
 */
extern const struct file_operations proc_tid_children_operations;

extern void proc_task_name(struct seq_file *m, struct task_struct *p,
                           bool escape);
extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
                         struct pid *, struct task_struct *);
extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
                          struct pid *, struct task_struct *);
extern int proc_pid_status(struct seq_file *, struct pid_namespace *,
                           struct pid *, struct task_struct *);
extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
                          struct pid *, struct task_struct *);

/*
 * base.c
 */
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(struct mnt_idmap *, const struct path *,
                       struct kstat *, u32, unsigned int);
extern int proc_setattr(struct mnt_idmap *, struct dentry *,
                        struct iattr *);
extern void proc_pid_evict_inode(struct proc_inode *);
extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);

/* Lookups */
typedef struct dentry *instantiate_t(struct dentry *,
                                     struct task_struct *, const void *);
bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int,
                           instantiate_t, struct task_struct *, const void *);

/*
 * generic.c
 */
struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
                struct proc_dir_entry **parent, void *data);
struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
                struct proc_dir_entry *dp);
extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *);
extern int proc_readdir(struct file *, struct dir_context *);
int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *);

static inline void pde_get(struct proc_dir_entry *pde)
{
        refcount_inc(&pde->refcnt);
}
extern void pde_put(struct proc_dir_entry *);

/*
 * An entry counts as "empty" when it is a directory (per S_ISDIR on its
 * mode) that has no inode operations attached.
 */
static inline bool is_empty_pde(const struct proc_dir_entry *pde)
{
        if (!S_ISDIR(pde->mode))
                return false;

        return !pde->proc_iops;
}
extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *);

/*
 * inode.c
 */
/*
 * One opener of a proc entry.  Instances are linked on
 * proc_dir_entry::pde_openers (files that did ->open but not ->release)
 * and are protected by proc_dir_entry::pde_unload_lock.
 */
struct pde_opener {
        struct list_head lh;                /* link in proc_dir_entry::pde_openers */
        struct file *file;                /* the opened file */
        bool closing;                        /* NOTE(review): presumably set while ->release runs -- confirm */
        struct completion *c;                /* NOTE(review): presumably waited on during entry removal -- confirm */
} __randomize_layout;
extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
extern const struct super_operations proc_sops;

void proc_init_kmemcache(void);
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern void proc_entry_rundown(struct proc_dir_entry *);

/*
 * proc_namespaces.c
 */
extern const struct inode_operations proc_ns_dir_inode_operations;
extern const struct file_operations proc_ns_dir_operations;

/*
 * proc_net.c
 */
extern const struct file_operations proc_net_operations;
extern const struct inode_operations proc_net_inode_operations;

/* Without CONFIG_NET the init hook is a no-op stub that returns 0. */
#ifdef CONFIG_NET
extern int proc_net_init(void);
#else
static inline int proc_net_init(void) { return 0; }
#endif

/*
 * proc_self.c
 */
extern int proc_setup_self(struct super_block *);

/*
 * proc_thread_self.c
 */
extern int proc_setup_thread_self(struct super_block *);
extern void proc_thread_self_init(void);

/*
 * proc_sysctl.c
 */
#ifdef CONFIG_PROC_SYSCTL
extern int proc_sys_init(void);
extern void proc_sys_evict_inode(struct inode *inode,
                                 struct ctl_table_header *head);
#else
/*
 * Stubs for builds without CONFIG_PROC_SYSCTL.
 *
 * proc_sys_init() returns int here as well, so both configurations expose
 * the same prototype and a caller that checks the result compiles either
 * way.  The stub has nothing to set up and always returns 0.
 */
static inline int proc_sys_init(void) { return 0; }
static inline void proc_sys_evict_inode(struct  inode *inode,
                                        struct ctl_table_header *head) { }
#endif

/*
 * proc_tty.c
 */
/* Without CONFIG_TTY the init hook is an empty stub. */
#ifdef CONFIG_TTY
extern void proc_tty_init(void);
#else
static inline void proc_tty_init(void) {}
#endif

/*
 * root.c
 */
extern struct proc_dir_entry proc_root;

extern void proc_self_init(void);

/*
 * task_[no]mmu.c
 */
struct mem_size_stats;
/*
 * State bundle for the task_[no]mmu.c code.
 * NOTE(review): presumably one instance per open maps-style file --
 * confirm against the users in task_mmu.c / task_nommu.c.
 */
struct proc_maps_private {
        struct inode *inode;
        struct task_struct *task;
        struct mm_struct *mm;
        struct vma_iterator iter;
#ifdef CONFIG_NUMA
        /* only present on CONFIG_NUMA builds */
        struct mempolicy *task_mempolicy;
#endif
} __randomize_layout;

struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);

extern const struct file_operations proc_pid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;

extern unsigned long task_vsize(struct mm_struct *);
extern unsigned long task_statm(struct mm_struct *,
                                unsigned long *, unsigned long *,
                                unsigned long *, unsigned long *);
extern void task_mem(struct seq_file *, struct mm_struct *);

extern const struct dentry_operations proc_net_dentry_ops;
/*
 * Install proc_net_dentry_ops as the dentry operations of @pde.
 */
static inline void pde_force_lookup(struct proc_dir_entry *pde)
{
        /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
        const struct dentry_operations *net_dops = &proc_net_dentry_ops;

        pde->proc_dops = net_dops;
}








































































































































































































































    6 






    3 






    2 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
// SPDX-License-Identifier: GPL-2.0
/*
 *  fs/ext4/mballoc.h
 *
 *  Written by: Alex Tomas <alex@clusterfs.com>
 *
 */
#ifndef _EXT4_MBALLOC_H
#define _EXT4_MBALLOC_H

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/proc_fs.h>
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/blkdev.h>
#include <linux/mutex.h>
#include "ext4_jbd2.h"
#include "ext4.h"

/*
 * mb_debug() dynamic printk msgs could be used to debug mballoc code.
 */
/*
 * mb_debug(sb, fmt, ...):
 *  - with CONFIG_EXT4_DEBUG: a pr_debug() whose message is prefixed with the
 *    current task's comm and pid, the superblock's s_id, and the source
 *    file, line and function of the call site;
 *  - otherwise: expands to no_printk(fmt, ...), and @sb is not evaluated.
 *
 * @sb is used unparenthesized in the expansion, so pass a plain pointer
 * expression.
 */
#ifdef CONFIG_EXT4_DEBUG
#define mb_debug(sb, fmt, ...)                                                \
        pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt,                \
                current->comm, task_pid_nr(current), sb->s_id,                \
               __FILE__, __LINE__, __func__, ##__VA_ARGS__)
#else
#define mb_debug(sb, fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

#define EXT4_MB_HISTORY_ALLOC                1        /* allocation */
#define EXT4_MB_HISTORY_PREALLOC        2        /* preallocated blocks used */

/*
 * How long mballoc can look for a best extent (in found extents)
 */
#define MB_DEFAULT_MAX_TO_SCAN                200

/*
 * How long mballoc must look for a best extent
 */
#define MB_DEFAULT_MIN_TO_SCAN                10

/*
 * with 's_mb_stats' allocator will collect stats that will be
 * shown at umount. The collecting costs though!
 */
#define MB_DEFAULT_STATS                0

/*
 * Files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
 * by the stream allocator, whose purpose is to pack requests
 * as close to each other as possible to produce smooth I/O traffic.
 * We use locality group prealloc space for stream requests.
 * This can be tuned via /proc/fs/ext4/<partition>/stream_req.
 */
#define MB_DEFAULT_STREAM_THRESHOLD        16        /* 64K */

/*
 * for which requests use 2^N search using buddies
 */
#define MB_DEFAULT_ORDER2_REQS                2

/*
 * default group prealloc size 512 blocks
 */
#define MB_DEFAULT_GROUP_PREALLOC        512

/*
 * Number of groups to search linearly before performing group scanning
 * optimization.
 */
#define MB_DEFAULT_LINEAR_LIMIT                4

/*
 * Minimum number of groups that should be present in the file system to perform
 * group scanning optimizations.
 */
#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD        16

/*
 * The maximum order up to which CR_BEST_AVAIL_LEN can trim a particular
 * allocation request. For example, if we have an order 7 request and a max
 * trim order of 3, we can trim this request down to order 4.
 */
#define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER        3

/*
 * Number of valid buddy orders
 */
#define MB_NUM_ORDERS(sb)                ((sb)->s_blocksize_bits + 2)

/*
 * Record of one freed extent: the group it belongs to, its start cluster
 * and count, and the transaction that freed it.  Each record is linked
 * both on a list and in an rb tree (see the field comments).
 */
struct ext4_free_data {
        /* links the free block information from sb_info */
        struct list_head                efd_list;

        /* links the free block information from group_info */
        struct rb_node                        efd_node;

        /* group to which the freed extent belongs */
        ext4_group_t                        efd_group;

        /* the freed extent: first cluster and length */
        ext4_grpblk_t                        efd_start_cluster;
        ext4_grpblk_t                        efd_count;

        /* transaction which freed this extent */
        tid_t                                efd_tid;
};

/*
 * One preallocated chunk (PA).  pa_type says whether it is an inode PA or
 * a group PA (MB_INODE_PA / MB_GROUP_PA); that choice also determines
 * which member of the pa_node and pa_node_lock unions is in use.
 */
struct ext4_prealloc_space {
        union {
                struct rb_node        inode_node;                /* for inode PA rbtree */
                struct list_head        lg_list;        /* for lg PAs */
        } pa_node;
        struct list_head        pa_group_list;
        /* the two members share storage; only one is in use at a time */
        union {
                struct list_head pa_tmp_list;
                struct rcu_head        pa_rcu;
        } u;
        spinlock_t                pa_lock;
        atomic_t                pa_count;
        unsigned                pa_deleted;
        ext4_fsblk_t                pa_pstart;        /* phys. block */
        ext4_lblk_t                pa_lstart;        /* log. block */
        ext4_grpblk_t                pa_len;                /* len of preallocated chunk */
        ext4_grpblk_t                pa_free;        /* how many blocks are free */
        unsigned short                pa_type;        /* pa type. inode or group */
        union {
                rwlock_t                *inode_lock;        /* locks the rbtree holding this PA */
                spinlock_t                *lg_lock;        /* locks the lg list holding this PA */
        } pa_node_lock;
        struct inode                *pa_inode;        /* used to get the inode during group discard */
};

enum {
        MB_INODE_PA = 0,
        MB_GROUP_PA = 1
};

/*
 * Description of an extent: a logical block, plus a (group, start, length)
 * triple where start and length are expressed in cluster units.
 */
struct ext4_free_extent {
        ext4_lblk_t fe_logical;
        ext4_grpblk_t fe_start;        /* In cluster units */
        ext4_group_t fe_group;
        ext4_grpblk_t fe_len;        /* In cluster units */
};

/*
 * Locality group:
 *   we try to group all related changes together
 *   so that writeback can flush/allocate them together as well
 *   Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
 *   (512). We store prealloc space into the hash based on the pa_free blocks
 *   order value, i.e. fls(pa_free) - 1.
 */
/* Number of buckets in ext4_locality_group::lg_prealloc_list. */
#define PREALLOC_TB_SIZE 10
struct ext4_locality_group {
        /* for allocator */
        /* to serialize allocates */
        struct mutex                lg_mutex;
        /* list of preallocations, one list head per bucket */
        struct list_head        lg_prealloc_list[PREALLOC_TB_SIZE];
        /* NOTE(review): presumably protects lg_prealloc_list -- confirm */
        spinlock_t                lg_prealloc_lock;
};

/*
 * Working state of one allocation request: the inode and superblock it is
 * for, the request in its original, goal and best-found forms, and
 * bookkeeping fields.
 */
struct ext4_allocation_context {
        struct inode *ac_inode;
        struct super_block *ac_sb;

        /* original request */
        struct ext4_free_extent ac_o_ex;

        /* goal request (normalized ac_o_ex) */
        struct ext4_free_extent ac_g_ex;

        /* the best found extent */
        struct ext4_free_extent ac_b_ex;

        /* copy of the best found extent taken before preallocation efforts */
        struct ext4_free_extent ac_f_ex;

        /*
         * goal len can change in CR_BEST_AVAIL_LEN, so save the original len.
         * This is used while adjusting the PA window and for accounting.
         */
        ext4_grpblk_t        ac_orig_goal_len;

        __u32 ac_flags;                /* allocation hints */
        __u32 ac_groups_linear_remaining;
        __u16 ac_groups_scanned;
        __u16 ac_found;
        /* one counter per allocation criterion (EXT4_MB_NUM_CRS of them) */
        __u16 ac_cX_found[EXT4_MB_NUM_CRS];
        __u16 ac_tail;
        __u16 ac_buddy;
        /* NOTE(review): presumably one of the AC_STATUS_* values -- confirm */
        __u8 ac_status;
        __u8 ac_criteria;
        __u8 ac_2order;                /* if request is to allocate 2^N blocks and
                                 * N > 0, the field stores N, otherwise 0 */
        __u8 ac_op;                /* operation, for history only */
        struct folio *ac_bitmap_folio;
        struct folio *ac_buddy_folio;
        struct ext4_prealloc_space *ac_pa;
        struct ext4_locality_group *ac_lg;
};

#define AC_STATUS_CONTINUE        1
#define AC_STATUS_FOUND                2
#define AC_STATUS_BREAK                3

/*
 * Handle bundling a group's buddy and bitmap data: a folio and a data
 * pointer for each, plus the group number, its ext4_group_info and the
 * superblock.
 * NOTE(review): presumably bd_buddy / bd_bitmap point into the
 * corresponding folios -- confirm against the loader in mballoc.c.
 */
struct ext4_buddy {
        struct folio *bd_buddy_folio;
        void *bd_buddy;
        struct folio *bd_bitmap_folio;
        void *bd_bitmap;
        struct ext4_group_info *bd_info;
        struct super_block *bd_sb;
        __u16 bd_blkbits;
        ext4_group_t bd_group;
};

/*
 * Translate the (fe_group, fe_start) pair of @fex into a block number:
 * the first block of the group plus fe_start shifted left by the
 * superblock's s_cluster_bits (fe_start is in cluster units).
 */
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
                                        struct ext4_free_extent *fex)
{
        ext4_fsblk_t group_base = ext4_group_first_block_no(sb, fex->fe_group);

        return group_base + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
}

static inline loff_t extent_logical_end(struct ext4_sb_info *sbi,
                                        struct ext4_free_extent *fex)
{
        /* Use loff_t to avoid end exceeding ext4_lblk_t max. */
        return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len);
}

static inline loff_t pa_logical_end(struct ext4_sb_info *sbi,
                                    struct ext4_prealloc_space *pa)
{
        /* Use loff_t to avoid end exceeding ext4_lblk_t max. */
        return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len);
}

/*
 * Callback type taken by ext4_mballoc_query_range() as @formatter.  It
 * receives a superblock, a group number, a start/len pair and the opaque
 * @priv pointer.
 * NOTE(review): presumably called once per range found, with a non-zero
 * return stopping the walk -- confirm in mballoc.c.
 */
typedef int (*ext4_mballoc_query_range_fn)(
        struct super_block                *sb,
        ext4_group_t                        agno,
        ext4_grpblk_t                        start,
        ext4_grpblk_t                        len,
        void                                *priv);

/*
 * Query group @agno of @sb over [@start, @end] and hand results to
 * @formatter together with @priv.
 * NOTE(review): whether @end is inclusive is not visible from this header --
 * confirm against the definition.
 */
int
ext4_mballoc_query_range(
        struct super_block                *sb,
        ext4_group_t                        agno,
        ext4_grpblk_t                        start,
        ext4_grpblk_t                        end,
        ext4_mballoc_query_range_fn        formatter,
        void                                *priv);

#endif































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vsyscall

#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define __VSYSCALL_TRACE_H

#include <linux/tracepoint.h>

/*
 * Tracepoint "emulate_vsyscall" in the "vsyscall" trace system.  It records
 * one int, @nr, and prints it as "nr = %d".
 * NOTE(review): @nr is presumably the number of the vsyscall being
 * emulated -- confirm at the call site.
 */
TRACE_EVENT(emulate_vsyscall,

            TP_PROTO(int nr),

            TP_ARGS(nr),

            TP_STRUCT__entry(__field(int, nr)),

            TP_fast_assign(
                           __entry->nr = nr;
                           ),

            TP_printk("nr = %d", __entry->nr)
);

#endif

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/
#define TRACE_INCLUDE_FILE vsyscall_trace
#include <trace/define_trace.h>








































































































































































































































































































































































































































































































































































































































































































































































































































































    2 


    2 



























































































































































    2 

    2 







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Helpers for formatting and printing strings
 *
 * Copyright 31 August 2008 James Bottomley
 * Copyright (C) 2013, Intel Corporation
 */
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/export.h>
#include <linux/ctype.h>
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/limits.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/string_helpers.h>
#include <kunit/test.h>
#include <kunit/test-bug.h>

/**
 * string_get_size - get the size in the specified units
 * @size:        The size to be converted in blocks
 * @blk_size:        Size of the block (use 1 for size in bytes)
 * @units:        Units to use (powers of 1000 or 1024), optionally OR-ed with
 *                STRING_UNITS_NO_SPACE (omit the space before the unit) and/or
 *                STRING_UNITS_NO_BYTES (omit the trailing "B")
 * @buf:        buffer to format to
 * @len:        length of buffer
 *
 * This function returns a string formatted to 3 significant figures
 * giving the size in the required units.  @buf should have room for
 * at least 9 bytes and will always be zero terminated.
 *
 * A @blk_size of 0 is treated like a @size of 0; both print as "0".
 *
 * Return value: number of characters of output that would have been written
 * (which may be greater than len, if output was truncated).
 */
int string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
                    char *buf, int len)
{
        /* strip the NO_SPACE/NO_BYTES flag bits, leaving the base selector */
        enum string_size_units units_base = units & STRING_UNITS_MASK;
        static const char *const units_10[] = {
                "", "k", "M", "G", "T", "P", "E", "Z", "Y",
        };
        static const char *const units_2[] = {
                "", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi",
        };
        static const char *const *const units_str[] = {
                [STRING_UNITS_10] = units_10,
                [STRING_UNITS_2] = units_2,
        };
        static const unsigned int divisor[] = {
                [STRING_UNITS_10] = 1000,
                [STRING_UNITS_2] = 1024,
        };
        /* rounding increment indexed by j, the number of decimals printed */
        static const unsigned int rounding[] = { 500, 50, 5 };
        /* i: index of the unit prefix; j: number of fractional digits */
        int i = 0, j;
        u32 remainder = 0, sf_cap;
        /* holds the fractional part, at most ".ddd" plus the terminator */
        char tmp[8];
        const char *unit;

        tmp[0] = '\0';

        if (blk_size == 0)
                size = 0;
        if (size == 0)
                goto out;

        /* This is Napier's algorithm.  Reduce the original block size to
         *
         * coefficient * divisor[units_base]^i
         *
         * we do the reduction so both coefficients are just under 32 bits so
         * that multiplying them together won't overflow 64 bits and we keep
         * as much precision as possible in the numbers.
         *
         * Note: it's safe to throw away the remainders here because all the
         * precision is in the coefficients.
         */
        while (blk_size >> 32) {
                do_div(blk_size, divisor[units_base]);
                i++;
        }

        while (size >> 32) {
                do_div(size, divisor[units_base]);
                i++;
        }

        /* now perform the actual multiplication keeping i as the sum of the
         * two logarithms */
        size *= blk_size;

        /* and logarithmically reduce it until it's just under the divisor;
         * only the remainder of the last division is kept */
        while (size >= divisor[units_base]) {
                remainder = do_div(size, divisor[units_base]);
                i++;
        }

        /* work out in j how many digits of precision we need from the
         * remainder: 2 for a one-digit size, 1 for two digits, 0 otherwise */
        sf_cap = size;
        for (j = 0; sf_cap*10 < 1000; j++)
                sf_cap *= 10;

        if (units_base == STRING_UNITS_2) {
                /* express the remainder as a decimal.  It's currently the
                 * numerator of a fraction whose denominator is
                 * divisor[units_base], which is 1 << 10 for STRING_UNITS_2 */
                remainder *= 1000;
                remainder >>= 10;
        }

        /* add a 5 to the digit below what will be printed to ensure
         * an arithmetical round up and carry it through to size.
         *
         * NOTE(review): size is not re-checked against the divisor after
         * the carry, so e.g. 999.5 in decimal units ends up printed as
         * "1000" with the unit unchanged. */
        remainder += rounding[j];
        if (remainder >= 1000) {
                remainder -= 1000;
                size += 1;
        }

        if (j) {
                /* format three decimals, then cut the string after j of them */
                snprintf(tmp, sizeof(tmp), ".%03u", remainder);
                tmp[j+1] = '\0';
        }

 out:
        /* both prefix tables have the same length; past their end the
         * magnitude has no name */
        if (i >= ARRAY_SIZE(units_2))
                unit = "UNK";
        else
                unit = units_str[units_base][i];

        return snprintf(buf, len, "%u%s%s%s%s", (u32)size, tmp,
                        (units & STRING_UNITS_NO_SPACE) ? "" : " ",
                        unit,
                        (units & STRING_UNITS_NO_BYTES) ? "" : "B");
}
EXPORT_SYMBOL(string_get_size);

/**
 * parse_int_array_user - Split string into a sequence of integers
 * @from:        The user space buffer to read from
 * @count:        The maximum number of bytes to read
 * @array:        Returned pointer to sequence of integers
 *
 * The user buffer is copied into a NUL-terminated kernel buffer and parsed
 * with get_options() twice: once to count the integers and once to store
 * them.  On success @array points to a newly allocated sequence whose
 * first element holds the number of integers that follow.
 *
 * Caller takes responsibility for freeing @array when it is no longer
 * needed.  @array is left untouched on failure.
 *
 * Return: 0 on success, the error from memdup_user_nul() if the copy fails,
 * -ENOENT if no integers were found, or -ENOMEM if the allocation fails.
 */
int parse_int_array_user(const char __user *from, size_t count, int **array)
{
        char *kbuf = memdup_user_nul(from, count);
        int *values;
        int nvalues;
        int err;

        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        /* counting pass: with a zero-sized array only the count is stored */
        get_options(kbuf, 0, &nvalues);

        if (nvalues) {
                /* one extra slot for the leading count element */
                values = kcalloc(nvalues + 1, sizeof(*values), GFP_KERNEL);
                if (values) {
                        get_options(kbuf, nvalues + 1, values);
                        *array = values;
                        err = 0;
                } else {
                        err = -ENOMEM;
                }
        } else {
                err = -ENOENT;
        }

        kfree(kbuf);
        return err;
}
EXPORT_SYMBOL(parse_int_array_user);

/*
 * Decode a white-space escape letter. When the character at **src is one of
 * 'n', 'r', 't', 'v' or 'f', the matching control character is stored at
 * **dst, both cursors advance by one and true is returned. For any other
 * character nothing is modified and false is returned.
 */
static bool unescape_space(char **src, char **dst)
{
        char decoded;

        switch (**src) {
        case 'f':
                decoded = '\f';
                break;
        case 'n':
                decoded = '\n';
                break;
        case 'r':
                decoded = '\r';
                break;
        case 't':
                decoded = '\t';
                break;
        case 'v':
                decoded = '\v';
                break;
        default:
                return false;
        }

        **dst = decoded;
        ++*dst;
        ++*src;
        return true;
}

/*
 * Decode an octal escape of one to three digits starting at **src. On
 * success the resulting byte is stored at **dst, *dst advances by one and
 * *src advances past the digits that were consumed. Returns false, leaving
 * both cursors alone, when **src is not an octal digit.
 */
static bool unescape_octal(char **src, char **dst)
{
        char *in = *src;
        size_t used = 0;
        u8 value;

        if (!isodigit(in[0]))
                return false;

        value = in[used++] & 7;
        /* value < 32 keeps the next shifted result within eight bits */
        while (value < 32 && isodigit(in[used]) && used < 3) {
                value = (value << 3) + (in[used] & 7);
                used++;
        }

        **dst = value;
        *dst += 1;
        *src = in + used;
        return true;
}

/*
 * Decode a hexadecimal escape: an 'x' followed by one or two hex digits.
 * On success the byte is stored at **dst, *dst advances by one and *src
 * advances past the 'x' and the digits used. Returns false, leaving both
 * cursors alone, when the 'x' or the first digit is missing.
 */
static bool unescape_hex(char **src, char **dst)
{
        char *in = *src;
        int hi, lo;
        u8 value;

        if (in[0] != 'x')
                return false;

        hi = hex_to_bin(in[1]);
        if (hi < 0)
                return false;
        value = hi;
        in += 2;

        /* A second digit is optional. */
        lo = hex_to_bin(*in);
        if (lo >= 0) {
                value = (value << 4) | lo;
                in++;
        }

        **dst = value;
        *dst += 1;
        *src = in;
        return true;
}

/*
 * Decode one of the special escapes. A double quote or backslash at **src
 * is copied as itself, 'a' becomes '\a' and 'e' becomes '\e'. On a match the
 * result is stored at **dst, both cursors advance by one and true is
 * returned; otherwise nothing is modified and false is returned.
 */
static bool unescape_special(char **src, char **dst)
{
        char decoded;

        switch (**src) {
        case '\\':
                decoded = '\\';
                break;
        case '\"':
                decoded = '\"';
                break;
        case 'e':
                decoded = '\e';
                break;
        case 'a':
                decoded = '\a';
                break;
        default:
                return false;
        }

        **dst = decoded;
        ++*dst;
        ++*src;
        return true;
}

/**
 * string_unescape - unquote characters in the given string
 * @src:        source buffer (escaped)
 * @dst:        destination buffer (unescaped)
 * @size:        size of the destination buffer (0 means no limit)
 * @flags:        combination of the flags.
 *
 * Description:
 * The function unquotes characters in the given string.
 *
 * Because the size of the output will be the same as or less than the size of
 * the input, the transformation may be performed in place.
 *
 * Caller must provide valid source and destination pointers. Be aware that
 * the destination buffer will always be NUL-terminated. The source string
 * must be NUL-terminated as well.  The supported flags are::
 *
 *        UNESCAPE_SPACE:
 *                '\f' - form feed
 *                '\n' - new line
 *                '\r' - carriage return
 *                '\t' - horizontal tab
 *                '\v' - vertical tab
 *        UNESCAPE_OCTAL:
 *                '\NNN' - byte with octal value NNN (1 to 3 digits)
 *        UNESCAPE_HEX:
 *                '\xHH' - byte with hexadecimal value HH (1 to 2 digits)
 *        UNESCAPE_SPECIAL:
 *                '\"' - double quote
 *                '\\' - backslash
 *                '\a' - alert (BEL)
 *                '\e' - escape
 *        UNESCAPE_ANY:
 *                all previous together
 *
 * Return:
 * The amount of the characters processed to the destination buffer excluding
 * trailing '\0' is returned.
 */
int string_unescape(char *src, char *dst, size_t size, unsigned int flags)
{
        char *out = dst;

        /*
         * @size is decremented before each input position is handled. A
         * @size of 0 wraps around on that first decrement, which yields the
         * "no limit" behaviour.
         */
        for (;;) {
                if (!*src || !--size)
                        break;

                /* Plain byte, trailing backslash, or no room for an escape. */
                if (src[0] != '\\' || src[1] == '\0' || size <= 1) {
                        *out++ = *src++;
                        continue;
                }

                /* Step over the backslash and account for it in @size. */
                src++;
                size--;

                if ((flags & UNESCAPE_SPACE) && unescape_space(&src, &out))
                        continue;

                if ((flags & UNESCAPE_OCTAL) && unescape_octal(&src, &out))
                        continue;

                if ((flags & UNESCAPE_HEX) && unescape_hex(&src, &out))
                        continue;

                if ((flags & UNESCAPE_SPECIAL) && unescape_special(&src, &out))
                        continue;

                /* No decoder accepted it: keep the backslash and the byte. */
                *out++ = '\\';
                *out++ = *src++;
        }
        *out = '\0';

        return out - dst;
}
EXPORT_SYMBOL(string_unescape);

/*
 * Emit @c unchanged. The byte is stored only while the cursor is below
 * @end, but *dst always advances by one so the caller can tell how much
 * output was needed. Always returns true.
 */
static bool escape_passthrough(unsigned char c, char **dst, char *end)
{
        char *slot = (*dst)++;

        if (slot < end)
                *slot = c;
        return true;
}

/*
 * Escape a white-space control character ('\n', '\r', '\t', '\v', '\f') as
 * a backslash followed by its letter. Bytes are stored only below @end,
 * while *dst always advances by two. Returns false, leaving *dst alone, for
 * any other character.
 */
static bool escape_space(unsigned char c, char **dst, char *end)
{
        char pair[2] = { '\\', 0 };
        char *out = *dst;
        int i;

        if (c == '\n')
                pair[1] = 'n';
        else if (c == '\r')
                pair[1] = 'r';
        else if (c == '\t')
                pair[1] = 't';
        else if (c == '\v')
                pair[1] = 'v';
        else if (c == '\f')
                pair[1] = 'f';
        else
                return false;

        for (i = 0; i < 2; i++, out++) {
                if (out < end)
                        *out = pair[i];
        }

        *dst = out;
        return true;
}

/*
 * Escape a backslash, double quote, '\a' or '\e' as a backslash followed by
 * one character. Bytes are stored only below @end, while *dst always
 * advances by two. Returns false, leaving *dst alone, for any other
 * character.
 */
static bool escape_special(unsigned char c, char **dst, char *end)
{
        char *out = *dst;
        unsigned char letter;

        switch (c) {
        case '"':
        case '\\':
                /* these two are escaped as themselves */
                letter = c;
                break;
        case '\e':
                letter = 'e';
                break;
        case '\a':
                letter = 'a';
                break;
        default:
                return false;
        }

        if (out < end)
                out[0] = '\\';
        if (out + 1 < end)
                out[1] = letter;

        *dst = out + 2;
        return true;
}

/*
 * Escape a zero byte as the two characters backslash and '0'. Bytes are
 * stored only below @end, while *dst always advances by two. Returns false,
 * leaving *dst alone, when @c is non-zero.
 */
static bool escape_null(unsigned char c, char **dst, char *end)
{
        static const char seq[] = { '\\', '0' };
        char *out = *dst;
        size_t i;

        if (c != 0)
                return false;

        for (i = 0; i < sizeof(seq); i++) {
                if (out + i < end)
                        out[i] = seq[i];
        }

        *dst = out + sizeof(seq);
        return true;
}

/*
 * Escape @c as a backslash followed by exactly three octal digits. Bytes
 * are stored only below @end, while *dst always advances by four. Always
 * returns true.
 */
static bool escape_octal(unsigned char c, char **dst, char *end)
{
        char *out = *dst;
        int shift;

        if (out < end)
                *out = '\\';
        out++;

        /* three digits, most significant first */
        for (shift = 6; shift >= 0; shift -= 3) {
                if (out < end)
                        *out = '0' + ((c >> shift) & 0x07);
                out++;
        }

        *dst = out;
        return true;
}

/*
 * Escape @c as backslash, 'x' and two characters produced by hex_asc_hi()
 * and hex_asc_lo(). Bytes are stored only below @end, while *dst always
 * advances by four. Always returns true.
 */
static bool escape_hex(unsigned char c, char **dst, char *end)
{
        char *out = *dst;

        if (out < end)
                out[0] = '\\';
        if (out + 1 < end)
                out[1] = 'x';
        if (out + 2 < end)
                out[2] = hex_asc_hi(c);
        if (out + 3 < end)
                out[3] = hex_asc_lo(c);

        *dst = out + 4;
        return true;
}

/**
 * string_escape_mem - quote characters in the given memory buffer
 * @src:        source buffer (unescaped)
 * @isz:        source buffer size
 * @dst:        destination buffer (escaped)
 * @osz:        destination buffer size
 * @flags:        combination of the flags
 * @only:        NUL-terminated string containing characters used to limit
 *                the selected escape class. If characters are included in @only
 *                that would not normally be escaped by the classes selected
 *                in @flags, they will be copied to @dst unescaped.
 *
 * Description:
 * The process of escaping byte buffer includes several parts. They are applied
 * in the following sequence.
 *
 *        1. The character is not matched to the one from @only string and thus
 *           must go as-is to the output.
 *        2. The character is matched to the printable and ASCII classes, if asked,
 *           and in case of match it passes through to the output.
 *        3. The character is matched to the printable or ASCII class, if asked,
 *           and in case of match it passes through to the output.
 *        4. The character is checked if it falls into the class given by @flags.
 *           %ESCAPE_OCTAL and %ESCAPE_HEX are going last since they cover any
 *           character. Note that they actually can't go together, otherwise
 *           %ESCAPE_HEX will be ignored.
 *
 * Caller must provide valid source and destination pointers. Be aware that
 * the destination buffer will not be NUL-terminated, thus the caller has to
 * append the terminator if needed. The supported flags are::
 *
 *        %ESCAPE_SPACE: (special white space, not space itself)
 *                '\f' - form feed
 *                '\n' - new line
 *                '\r' - carriage return
 *                '\t' - horizontal tab
 *                '\v' - vertical tab
 *        %ESCAPE_SPECIAL:
 *                '\"' - double quote
 *                '\\' - backslash
 *                '\a' - alert (BEL)
 *                '\e' - escape
 *        %ESCAPE_NULL:
 *                '\0' - null
 *        %ESCAPE_OCTAL:
 *                '\NNN' - byte with octal value NNN (3 digits)
 *        %ESCAPE_ANY:
 *                all previous together
 *        %ESCAPE_NP:
 *                escape only non-printable characters, checked by isprint()
 *        %ESCAPE_ANY_NP:
 *                all previous together
 *        %ESCAPE_HEX:
 *                '\xHH' - byte with hexadecimal value HH (2 digits)
 *        %ESCAPE_NA:
 *                escape only non-ascii characters, checked by isascii()
 *        %ESCAPE_NAP:
 *                escape only non-printable or non-ascii characters
 *        %ESCAPE_APPEND:
 *                append characters from @only to be escaped by the given classes
 *
 * %ESCAPE_APPEND would help to pass additional characters to the escaped, when
 * one of %ESCAPE_NP, %ESCAPE_NA, or %ESCAPE_NAP is provided.
 *
 * One notable caveat, the %ESCAPE_NAP, %ESCAPE_NP and %ESCAPE_NA have the
 * higher priority than the rest of the flags (%ESCAPE_NAP is the highest).
 * It doesn't make much sense to use either of them without %ESCAPE_OCTAL
 * or %ESCAPE_HEX, because they cover most of the other character classes.
 * %ESCAPE_NAP can utilize %ESCAPE_SPACE or %ESCAPE_SPECIAL in addition to
 * the above.
 *
 * Return:
 * The total size of the escaped output that would be generated for
 * the given input and flags. To check whether the output was
 * truncated, compare the return value to osz. There is room left in
 * dst for a '\0' terminator if and only if ret < osz.
 */
int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
                      unsigned int flags, const char *only)
{
        char *p = dst;
        char *end = p + osz;
        bool is_dict = only && *only;
        bool is_append = flags & ESCAPE_APPEND;

        while (isz--) {
                unsigned char c = *src++;
                bool in_dict = is_dict && strchr(only, c);
                /* with %ESCAPE_APPEND, characters from @only skip the class tests */
                bool appended = is_append && in_dict;
                bool pass;

                /*
                 * A character is copied through unchanged when:
                 *        - @only is supplied without %ESCAPE_APPEND and does
                 *          not contain the character, or
                 *        - it is printable ASCII and %ESCAPE_NAP is set, or
                 *        - it is printable and %ESCAPE_NP is set, or
                 *        - it is ASCII and %ESCAPE_NA is set, or
                 *        - none of the escape classes selected by @flags
                 *          accepts it.
                 * The three class tests are skipped for characters that
                 * %ESCAPE_APPEND pulled in from @only.
                 */
                pass = is_dict && !is_append && !in_dict;
                if (!pass && !appended) {
                        pass = ((flags & ESCAPE_NAP) && isascii(c) && isprint(c)) ||
                               ((flags & ESCAPE_NP) && isprint(c)) ||
                               ((flags & ESCAPE_NA) && isascii(c));
                }

                if (!pass) {
                        if ((flags & ESCAPE_SPACE) && escape_space(c, &p, end))
                                continue;

                        if ((flags & ESCAPE_SPECIAL) && escape_special(c, &p, end))
                                continue;

                        if ((flags & ESCAPE_NULL) && escape_null(c, &p, end))
                                continue;

                        /* ESCAPE_OCTAL and ESCAPE_HEX always go last */
                        if ((flags & ESCAPE_OCTAL) && escape_octal(c, &p, end))
                                continue;

                        if ((flags & ESCAPE_HEX) && escape_hex(c, &p, end))
                                continue;
                }

                escape_passthrough(c, &p, end);
        }

        return p - dst;
}
EXPORT_SYMBOL(string_escape_mem);

/*
 * Duplicate @src into a newly allocated, NUL-terminated string in which the
 * characters listed in the local escape set (white-space controls, '\a',
 * '\e', backslash and double quote) are escaped with %ESCAPE_HEX, so the
 * result can be logged inside quotes. Returns NULL when @src is NULL or the
 * allocation fails.
 */
char *kstrdup_quotable(const char *src, gfp_t gfp)
{
        const char to_escape[] = "\f\n\r\t\v\a\e\\\"";
        const int mode = ESCAPE_HEX;
        size_t in_len, out_len, written;
        char *quoted;

        if (src == NULL)
                return NULL;

        in_len = strlen(src);

        /* Dry run with no output buffer: only the needed length is used. */
        out_len = string_escape_mem(src, in_len, NULL, 0, mode, to_escape);

        quoted = kmalloc(out_len + 1, gfp);
        if (quoted == NULL)
                return NULL;

        written = string_escape_mem(src, in_len, quoted, out_len, mode,
                                    to_escape);
        WARN_ON(written != out_len);
        quoted[out_len] = '\0';

        return quoted;
}
EXPORT_SYMBOL_GPL(kstrdup_quotable);

/*
 * Build an allocated, NUL-terminated copy of the command line of @task.
 * Trailing NUL bytes are dropped, the NUL bytes between arguments become
 * spaces, and the result is passed through kstrdup_quotable(). Returns NULL
 * if the scratch buffer cannot be allocated; otherwise returns whatever
 * kstrdup_quotable() returns.
 */
char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp)
{
        char *scratch, *result;
        int len, last, i;

        scratch = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!scratch)
                return NULL;

        len = get_cmdline(task, scratch, PAGE_SIZE - 1);
        scratch[len] = '\0';

        /* Walk back over trailing NULs; @last stops at the final non-NUL byte. */
        for (last = len - 1; last >= 0 && scratch[last] == '\0'; last--)
                ;

        /* NULs that remain separate arguments; turn them into spaces. */
        for (i = 0; i <= last; i++) {
                if (!scratch[i])
                        scratch[i] = ' ';
        }

        /* Escape anything that would not be safe to print. */
        result = kstrdup_quotable(scratch, gfp);
        kfree(scratch);

        return result;
}
EXPORT_SYMBOL_GPL(kstrdup_quotable_cmdline);

/*
 * Build an allocated, NUL-terminated copy of the pathname of @file with
 * special characters escaped, so it can be logged safely. On the error
 * paths a placeholder beginning with "<" is duplicated instead:
 * "<unknown>" for a NULL @file, "<no_memory>" when the scratch buffer
 * cannot be allocated and "<too_long>" when file_path() reports an error.
 */
char *kstrdup_quotable_file(struct file *file, gfp_t gfp)
{
        char *scratch, *path, *result;

        if (!file)
                return kstrdup("<unknown>", gfp);

        /* 11 extra bytes leave room for an appended ' (deleted)' */
        scratch = kmalloc(PATH_MAX + 11, GFP_KERNEL);
        if (!scratch)
                return kstrdup("<no_memory>", gfp);

        path = file_path(file, scratch, PATH_MAX + 11);
        result = IS_ERR(path) ? kstrdup("<too_long>", gfp)
                              : kstrdup_quotable(path, gfp);

        kfree(scratch);
        return result;
}
EXPORT_SYMBOL_GPL(kstrdup_quotable_file);

/*
 * Duplicate @src and replace every @old character in the copy with @new.
 * Returns the copy, or NULL if kstrdup() fails.
 */
char *kstrdup_and_replace(const char *src, char old, char new, gfp_t gfp)
{
        char *copy = kstrdup(src, gfp);

        return copy ? strreplace(copy, old, new) : NULL;
}
EXPORT_SYMBOL_GPL(kstrdup_and_replace);

/**
 * kasprintf_strarray - allocate and fill array of sequential strings
 * @gfp: flags for the slab allocator
 * @prefix: prefix to be used
 * @n: amount of lines to be allocated and filled
 *
 * Allocates an array with room for @n + 1 pointers and fills the first @n
 * of them with strings formatted as "%s-%zu" from @prefix and the element
 * index. If any string cannot be allocated, everything allocated so far is
 * released again. The caller is responsible for freeing the result with
 * kfree_strarray() after use.
 *
 * Returns array of strings or NULL when memory can't be allocated.
 */
char **kasprintf_strarray(gfp_t gfp, const char *prefix, size_t n)
{
        char **names = kcalloc(n + 1, sizeof(char *), gfp);
        size_t filled = 0;

        if (!names)
                return NULL;

        while (filled < n) {
                char *name = kasprintf(gfp, "%s-%zu", prefix, filled);

                if (!name) {
                        /* Undo the strings created before this one. */
                        kfree_strarray(names, filled);
                        return NULL;
                }
                names[filled++] = name;
        }

        return names;
}
EXPORT_SYMBOL_GPL(kasprintf_strarray);

/**
 * kfree_strarray - free a number of dynamically allocated strings contained
 *                  in an array and the array itself
 *
 * @array: Dynamically allocated array of strings to free.
 * @n: Number of strings (starting from the beginning of the array) to free.
 *
 * Passing a non-NULL @array and @n == 0 as well as NULL @array are valid
 * use-cases. If @array is NULL, the function does nothing.
 */
void kfree_strarray(char **array, size_t n)
{
        /* Same type as @n, so the index can reach every possible count. */
        size_t i;

        if (!array)
                return;

        for (i = 0; i < n; i++)
                kfree(array[i]);
        kfree(array);
}
EXPORT_SYMBOL_GPL(kfree_strarray);

/*
 * Bookkeeping record for a managed string array: devm_kasprintf_strarray()
 * fills it in and devm_kfree_strarray() passes both fields to
 * kfree_strarray().
 */
struct strarray {
        char **array;        /* the array of strings */
        size_t n;        /* number of strings in @array */
};

/*
 * Release callback registered by devm_kasprintf_strarray(): @res is the
 * struct strarray record, whose strings and array are freed. @dev is unused.
 */
static void devm_kfree_strarray(struct device *dev, void *res)
{
        struct strarray *record = res;
        char **strings = record->array;
        size_t count = record->n;

        kfree_strarray(strings, count);
}

/*
 * Variant of kasprintf_strarray() whose result is registered with @dev via
 * devres_add(), using devm_kfree_strarray() as the release callback.
 * Allocations use GFP_KERNEL. Returns the array of strings, or
 * ERR_PTR(-ENOMEM) if either allocation fails.
 */
char **devm_kasprintf_strarray(struct device *dev, const char *prefix, size_t n)
{
        struct strarray *record;
        char **strings;

        record = devres_alloc(devm_kfree_strarray, sizeof(*record), GFP_KERNEL);
        if (!record)
                return ERR_PTR(-ENOMEM);

        strings = kasprintf_strarray(GFP_KERNEL, prefix, n);
        record->array = strings;
        if (!strings) {
                devres_free(record);
                return ERR_PTR(-ENOMEM);
        }

        record->n = n;
        devres_add(dev, record);

        return strings;
}
EXPORT_SYMBOL_GPL(devm_kasprintf_strarray);

/**
 * skip_spaces - Skips over leading whitespace in @str.
 * @str: The string to be scanned.
 *
 * The string itself is not modified.
 *
 * Returns a pointer to the first non-whitespace character in @str.
 */
char *skip_spaces(const char *str)
{
        const char *cur;

        for (cur = str; isspace(*cur); cur++)
                ;

        return (char *)cur;
}
EXPORT_SYMBOL(skip_spaces);

/**
 * strim - Removes leading and trailing whitespace from @s.
 * @s: The string to be stripped.
 *
 * Note that the first trailing whitespace is replaced with a %NUL-terminator
 * in the given string @s. An empty string is returned as is, without being
 * written to.
 *
 * Return: pointer to the first non-whitespace character in @s.
 */
char *strim(char *s)
{
        size_t size;

        size = strlen(s);
        if (!size)
                return s;

        /*
         * Trim by length instead of walking a pointer backwards: for a
         * string made only of whitespace such a pointer would have to step
         * to s - 1, which lies outside the object.
         */
        while (size && isspace(s[size - 1]))
                size--;
        s[size] = '\0';

        return skip_spaces(s);
}
EXPORT_SYMBOL(strim);

/**
 * sysfs_streq - return true if strings are equal, modulo trailing newline
 * @s1: one string
 * @s2: another string
 *
 * Two strings compare equal when they match byte for byte, or when they
 * match up to the point where one of them ends and the other has exactly
 * one newline left before its own end. This suits sysfs input, which
 * generally ends in a newline but is compared against values that do not.
 */
bool sysfs_streq(const char *s1, const char *s2)
{
        const char *rest1, *rest2;
        size_t common = 0;

        /* Length of the shared prefix. */
        while (s1[common] && s1[common] == s2[common])
                common++;

        rest1 = s1 + common;
        rest2 = s2 + common;

        /* Both strings ended at the same place. */
        if (*rest1 == *rest2)
                return true;

        /* One string ended: the other may only have a lone newline left. */
        if (*rest1 == '\0')
                return rest2[0] == '\n' && rest2[1] == '\0';
        if (*rest2 == '\0')
                return rest1[0] == '\n' && rest1[1] == '\0';

        return false;
}
EXPORT_SYMBOL(sysfs_streq);

/**
 * match_string - matches given string in an array
 * @array:        array of strings
 * @n:                number of strings in the array or -1 for NULL terminated arrays
 * @string:        string to match with
 *
 * The search walks @array from the start and stops at whichever comes
 * first: a match, the @n-th element, or a NULL element. Entries are compared
 * with strcmp().
 *
 * Passing -1 for @n is the historical way to search a NULL terminated
 * array, but the function draws no distinction between the two ways of
 * ending the search.
 *
 * Return:
 * index of a @string in the @array if matches, or %-EINVAL otherwise.
 */
int match_string(const char * const *array, size_t n, const char *string)
{
        int pos = 0;

        while (pos < n && array[pos]) {
                if (strcmp(array[pos], string) == 0)
                        return pos;
                pos++;
        }

        return -EINVAL;
}
EXPORT_SYMBOL(match_string);

/**
 * __sysfs_match_string - matches given string in an array
 * @array: array of strings
 * @n: number of strings in the array or -1 for NULL terminated arrays
 * @str: string to match with
 *
 * Returns index of @str in the @array or -EINVAL, just like match_string().
 * Uses sysfs_streq instead of strcmp for matching.
 *
 * The search walks @array from the start and stops at whichever comes
 * first: a match, the @n-th element, or a NULL element.
 *
 * Passing -1 for @n is the historical way to search a NULL terminated
 * array, but the function draws no distinction between the two ways of
 * ending the search.
 */
int __sysfs_match_string(const char * const *array, size_t n, const char *str)
{
        int pos;

        for (pos = 0; pos < n && array[pos]; pos++) {
                if (sysfs_streq(array[pos], str))
                        return pos;
        }

        return -EINVAL;
}
EXPORT_SYMBOL(__sysfs_match_string);

/**
 * strreplace - Replace all occurrences of character in string.
 * @str: The string to operate on.
 * @old: The character being replaced.
 * @new: The character @old is replaced with.
 *
 * Replaces each @old character with @new, in place, in the given string
 * @str.
 *
 * Return: pointer to the string @str itself.
 */
char *strreplace(char *str, char old, char new)
{
        char *cur = str;

        while (*cur) {
                if (*cur == old)
                        *cur = new;
                cur++;
        }

        return str;
}
EXPORT_SYMBOL(strreplace);

/**
 * memcpy_and_pad - Copy one buffer to another with padding
 * @dest: Where to copy to
 * @dest_len: The destination buffer size
 * @src: Where to copy from
 * @count: The number of bytes to copy
 * @pad: Character to use for padding if space is left in destination.
 *
 * At most @dest_len bytes are copied. When @dest_len is larger than @count,
 * the remainder of @dest is filled with @pad.
 */
void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
                    int pad)
{
        const size_t copy_len = dest_len > count ? count : dest_len;

        memcpy(dest, src, copy_len);

        if (dest_len > count)
                memset((char *)dest + count, pad, dest_len - count);
}
EXPORT_SYMBOL(memcpy_and_pad);

#ifdef CONFIG_FORTIFY_SOURCE
/*
 * These are placeholders for fortify compile-time warnings. The definitions
 * have empty bodies and ignore their arguments.
 */
void __read_overflow2_field(size_t avail, size_t wanted) { }
EXPORT_SYMBOL(__read_overflow2_field);
/* Placeholder with an empty body; @avail and @wanted are ignored. */
void __write_overflow_field(size_t avail, size_t wanted) { }
EXPORT_SYMBOL(__write_overflow_field);

/*
 * Function names as strings, looked up by __fortify_report(). The helper
 * macro below turns one @func into a designated initializer that stores the
 * stringified name at index MAKE_FORTIFY_FUNC(func); it is handed to
 * EACH_FORTIFY_FUNC(), which is defined elsewhere, and undefined again
 * right after use.
 */
static const char * const fortify_func_name[] = {
#define MAKE_FORTIFY_FUNC_NAME(func)        [MAKE_FORTIFY_FUNC(func)] = #func
        EACH_FORTIFY_FUNC(MAKE_FORTIFY_FUNC_NAME)
#undef  MAKE_FORTIFY_FUNC_NAME
};

/*
 * Emit a warning describing a detected buffer overflow. @reason is split
 * with FORTIFY_REASON_FUNC() and FORTIFY_REASON_DIR() into a function id
 * and a direction; the id is clamped to FORTIFY_FUNC_UNKNOWN before it
 * indexes the name table. @size is reported as the access size and @avail
 * as the buffer size.
 */
void __fortify_report(const u8 reason, const size_t avail, const size_t size)
{
        const bool is_write = FORTIFY_REASON_DIR(reason);
        const u8 func_id = FORTIFY_REASON_FUNC(reason);
        const char *func_name;
        const char *direction;

        func_name = fortify_func_name[umin(func_id, FORTIFY_FUNC_UNKNOWN)];
        direction = str_read_write(!is_write);

        WARN(1, "%s: detected buffer overflow: %zu byte %s of buffer size %zu\n",
             func_name, size, direction, avail);
}
EXPORT_SYMBOL(__fortify_report);

/*
 * Report the overflow through __fortify_report() with the same arguments,
 * then invoke BUG().
 */
void __fortify_panic(const u8 reason, const size_t avail, const size_t size)
{
        __fortify_report(reason, avail, size);
        BUG();
}
EXPORT_SYMBOL(__fortify_panic);
#endif /* CONFIG_FORTIFY_SOURCE */









































    1 









    1 
    1 







    2 










    1 







































    1 











    1 










    1 


    1 






    1 












    1 























    1 



















    1 



    1 

    1 







































    1 









































    1 























    1 















    1 










    1 

    1 











    1 




    1 

    1 





































    2 

































































    2 

















    2 












    2 











    1 











    1 




    2 




    2 





    1 




    1 










    2 













    2 











    1 














    1 

    1 




    1 














    1 


















    3 













    3 





    2 










    1 



    1 





    1 
    1 























    2 













    1 


    1 


    1 

    1 












    3 














    3 





    4 



















    4 

    4 


    4 








    4 


    4 









    3 












    1 




    3 



    3 







    1 





    1 




    1 












    1 






    1 




    1 
    1 




    1 














    2 















   10 











    2 





    4 
    3 
    2 






    8 












    8 







    8 

    6 



    3 
    4 

    6 


    6 

    6 
    1 






    7 






    4 







    1 



















    6 



    2 

    2 
    2 




































































    8 






    3 









    3 




























































































    7 


    2 
    5 




    6 

































    3 

    5 













    8 










    7 






























    4 

    2 
    1 



















    7 

























































    2 

    2 



    2 





    6 















    4 


    5 








    5 




    1 



    1 


    4 




    4 





















































    1 














    4 








    4 


    1 
    3 






    2 














    2 





    1 











    2 













































    4 
    4 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/open.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/string.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/module.h>
#include <linux/tty.h>
#include <linux/namei.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/rcupdate.h>
#include <linux/audit.h>
#include <linux/falloc.h>
#include <linux/fs_struct.h>
#include <linux/dnotify.h>
#include <linux/compat.h>
#include <linux/mnt_idmapping.h>
#include <linux/filelock.h>

#include "internal.h"

/*
 * Set the size of @dentry's inode to @length through notify_change().
 *
 * @idmap:      idmap used for the privilege check and the attribute change
 * @dentry:     dentry whose inode is resized
 * @length:     new size; negative values are rejected with -EINVAL
 * @time_attrs: extra ATTR_* bits OR-ed into ia_valid next to ATTR_SIZE
 * @filp:       optional open file; when non-NULL it is passed via ATTR_FILE
 *
 * Returns 0 on success or a negative errno.
 */
int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
                loff_t length, unsigned int time_attrs, struct file *filp)
{
        struct iattr attrs;
        struct inode *inode;
        int kill;
        int err;

        /* i_size is a signed type, so a negative length is never valid. */
        if (length < 0)
                return -EINVAL;

        attrs.ia_size = length;
        attrs.ia_valid = ATTR_SIZE | time_attrs;
        if (filp) {
                attrs.ia_file = filp;
                attrs.ia_valid |= ATTR_FILE;
        }

        /* Truncation also strips suid, sgid and file capabilities. */
        kill = dentry_needs_remove_privs(idmap, dentry);
        if (kill < 0)
                return kill;
        if (kill != 0)
                attrs.ia_valid |= kill | ATTR_FORCE;

        inode = dentry->d_inode;
        inode_lock(inode);
        /* Delegations and leases were already broken by the caller. */
        err = notify_change(idmap, dentry, &attrs, NULL);
        inode_unlock(inode);

        return err;
}

/*
 * Truncate the regular file at @path to @length bytes.
 *
 * Checks, in order: file type, mount write access, MAY_WRITE permission,
 * the append-only flag, inode write access, outstanding leases and the
 * security_path_truncate() hook, then hands over to do_truncate().
 *
 * Returns 0 on success or a negative errno.
 */
long vfs_truncate(const struct path *path, loff_t length)
{
        struct inode *inode = path->dentry->d_inode;
        struct mnt_idmap *idmap;
        long err;

        /* Directories get -EISDIR, every other non-regular file -EINVAL. */
        if (S_ISDIR(inode->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;

        err = mnt_want_write(path->mnt);
        if (err)
                return err;

        idmap = mnt_idmap(path->mnt);
        err = inode_permission(idmap, inode, MAY_WRITE);
        if (err)
                goto drop_mnt_write;

        if (IS_APPEND(inode)) {
                err = -EPERM;
                goto drop_mnt_write;
        }

        err = get_write_access(inode);
        if (err)
                goto drop_mnt_write;

        /*
         * No leases may remain.  Holding write access on the inode keeps
         * a lease-granting setlease() from racing with the truncate.
         */
        err = break_lease(inode, O_WRONLY);
        if (err)
                goto drop_write_access;

        err = security_path_truncate(path);
        if (err)
                goto drop_write_access;

        err = do_truncate(idmap, path->dentry, length, 0, NULL);

drop_write_access:
        put_write_access(inode);
drop_mnt_write:
        mnt_drop_write(path->mnt);
        return err;
}
EXPORT_SYMBOL_GPL(vfs_truncate);

/*
 * Path-based truncate: resolve @pathname relative to the current working
 * directory (following symlinks) and truncate the result to @length.
 *
 * Returns 0 on success or a negative errno; a negative @length yields
 * -EINVAL before any lookup is attempted.
 */
long do_sys_truncate(const char __user *pathname, loff_t length)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        struct path path;
        int error;

        /* loff_t is signed, so negative lengths have to be refused here. */
        if (length < 0)
                return -EINVAL;

        for (;;) {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (!error) {
                        error = vfs_truncate(&path, length);
                        path_put(&path);
                }

                /* Redo the lookup with LOOKUP_REVAL when retry_estale() asks. */
                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
{
        /* Native truncate(): widen the long length to loff_t and delegate. */
        loff_t len = length;

        return do_sys_truncate(path, len);
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
{
        /* Compat truncate(): the signed compat_off_t is widened to loff_t. */
        loff_t len = length;

        return do_sys_truncate(path, len);
}
#endif

/*
 * Truncate an already opened file to @length bytes.
 *
 * @file:   open file; must refer to a regular file opened for writing
 * @length: new size
 * @small:  non-zero if the caller cannot handle sizes above MAX_NON_LFS;
 *          ignored when the file was opened with O_LARGEFILE
 *
 * Returns 0 on success, -EINVAL for an unsuitable file or an over-large
 * length, -EPERM for an append-only inode, or the error reported by the
 * security hook / do_truncate().
 */
long do_ftruncate(struct file *file, loff_t length, int small)
{
        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = dentry->d_inode;
        int err;

        /* A file opened with O_LARGEFILE is not bound by the small limit. */
        if (file->f_flags & O_LARGEFILE)
                small = 0;

        if (!S_ISREG(inode->i_mode))
                return -EINVAL;
        if (!(file->f_mode & FMODE_WRITE))
                return -EINVAL;

        /* Without large file support the size may not exceed MAX_NON_LFS. */
        if (small && length > MAX_NON_LFS)
                return -EINVAL;

        /* The append-only test is made on file_inode(), not on d_inode. */
        if (IS_APPEND(file_inode(file)))
                return -EPERM;

        sb_start_write(inode->i_sb);
        err = security_file_truncate(file);
        if (!err)
                err = do_truncate(file_mnt_idmap(file), dentry, length,
                                  ATTR_MTIME | ATTR_CTIME, file);
        sb_end_write(inode->i_sb);

        return err;
}

/*
 * Descriptor-based truncate: look up @fd and truncate the file behind it.
 *
 * Returns -EINVAL for a negative @length, -EBADF if @fd is not an open
 * descriptor, otherwise the result of do_ftruncate().
 */
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
        struct fd f;
        int ret = -EBADF;

        if (length < 0)
                return -EINVAL;

        f = fdget(fd);
        if (f.file) {
                ret = do_ftruncate(f.file, length, small);
                fdput(f);
        }

        return ret;
}

SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
{
        /* Legacy entry point: ask for the MAX_NON_LFS size limit check. */
        const int small = 1;

        return do_sys_ftruncate(fd, length, small);
}

#ifdef CONFIG_COMPAT
/*
 * Compat ftruncate(): truncate the file behind @fd to @length bytes, with
 * the non-LFS size limit requested (small == 1).
 *
 * The length is declared as an unsigned compat_ulong_t, which would be
 * zero-extended when widened to loff_t: a negative 32-bit off_t would turn
 * into a size between 2GiB and 4GiB and slip past the "length < 0" check
 * in do_sys_ftruncate() (and past the MAX_NON_LFS check for O_LARGEFILE
 * files).  Reinterpret it as the signed compat_off_t first, matching the
 * compat truncate() entry point, so negative lengths yield -EINVAL.
 */
COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
{
        return do_sys_ftruncate(fd, (compat_off_t)length, 1);
}
#endif

/* LFS versions of truncate are only needed on 32 bit machines */
#if BITS_PER_LONG == 32
SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
{
        /* 64-bit length variant of truncate(); same path-based worker. */
        long ret = do_sys_truncate(path, length);

        return ret;
}

SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
{
        /* The 64-bit length variant is not subject to the non-LFS limit. */
        const int small = 0;

        return do_sys_ftruncate(fd, length, small);
}
#endif /* BITS_PER_LONG == 32 */

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64)
COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname,
                       compat_arg_u64_dual(length))
{
        /* Combine the dual-argument length into one value before the call. */
        loff_t len = compat_arg_u64_glue(length);

        return ksys_truncate(pathname, len);
}
#endif

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64)
COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
                       compat_arg_u64_dual(length))
{
        /* Combine the dual-argument length into one value before the call. */
        loff_t len = compat_arg_u64_glue(length);

        return ksys_ftruncate(fd, len);
}
#endif

/*
 * Validate a fallocate request and pass it to the file's ->fallocate().
 *
 * @file:   open file the range belongs to
 * @mode:   FALLOC_FL_* flags
 * @offset: start of the range; must not be negative
 * @len:    length of the range; must be positive
 *
 * The checks run in a fixed order, so the first failing one decides the
 * errno that is returned.  On success a modify notification is generated.
 *
 * Returns 0 on success or a negative errno.
 */
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        loff_t end;
        long err;

        if (offset < 0 || len <= 0)
                return -EINVAL;

        /* Flags outside the supported set are refused. */
        if (mode & ~FALLOC_FL_SUPPORTED_MASK)
                return -EOPNOTSUPP;

        /* Hole punching and range zeroing cannot be combined. */
        if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
            (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
                return -EOPNOTSUPP;

        /* Hole punching requires FALLOC_FL_KEEP_SIZE. */
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                if (!(mode & FALLOC_FL_KEEP_SIZE))
                        return -EOPNOTSUPP;
        }

        /* Collapse range tolerates no other flag. */
        if (mode & FALLOC_FL_COLLAPSE_RANGE) {
                if (mode & ~FALLOC_FL_COLLAPSE_RANGE)
                        return -EINVAL;
        }

        /* Insert range tolerates no other flag. */
        if (mode & FALLOC_FL_INSERT_RANGE) {
                if (mode & ~FALLOC_FL_INSERT_RANGE)
                        return -EINVAL;
        }

        /* Unshare range may only be paired with FALLOC_FL_KEEP_SIZE. */
        if (mode & FALLOC_FL_UNSHARE_RANGE) {
                if (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))
                        return -EINVAL;
        }

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;

        /* Append-only files accept nothing beyond plain preallocation. */
        if (IS_APPEND(inode) && (mode & ~FALLOC_FL_KEEP_SIZE))
                return -EPERM;

        if (IS_IMMUTABLE(inode))
                return -EPERM;

        /* An active swapfile must not be touched by any fallocate mode. */
        if (IS_SWAPFILE(inode))
                return -ETXTBSY;

        /*
         * Security policy may have changed since the file was opened, so
         * the write permission is checked again.
         */
        err = security_file_permission(file, MAY_WRITE);
        if (err)
                return err;

        err = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len);
        if (err)
                return err;

        if (S_ISFIFO(inode->i_mode))
                return -ESPIPE;

        if (S_ISDIR(inode->i_mode))
                return -EISDIR;

        if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
                return -ENODEV;

        /* Reject ranges past s_maxbytes, including a wrap through zero. */
        end = offset + len;
        if (end > inode->i_sb->s_maxbytes || end < 0)
                return -EFBIG;

        if (!file->f_op->fallocate)
                return -EOPNOTSUPP;

        file_start_write(file);
        err = file->f_op->fallocate(file, mode, offset, len);

        /*
         * Generate inotify and fanotify events.
         *
         * For simplicity an event is created for every successful call,
         * which includes calls that leave the file size untouched, e.g.
         * with FALLOC_FL_KEEP_SIZE.
         */
        if (err == 0)
                fsnotify_modify(file);

        file_end_write(file);
        return err;
}
EXPORT_SYMBOL_GPL(vfs_fallocate);

/*
 * Descriptor-based front end for vfs_fallocate().
 *
 * Returns -EBADF if @fd is not an open descriptor, otherwise whatever
 * vfs_fallocate() returns.
 */
int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
{
        struct fd f = fdget(fd);
        int ret;

        if (!f.file)
                return -EBADF;

        ret = vfs_fallocate(f.file, mode, offset, len);
        fdput(f);
        return ret;
}

SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
{
        /* Thin syscall wrapper; all the work happens in ksys_fallocate(). */
        int ret = ksys_fallocate(fd, mode, offset, len);

        return ret;
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE)
COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset),
                       compat_arg_u64_dual(len))
{
        /* Combine each dual-argument value into one loff_t before the call. */
        loff_t start = compat_arg_u64_glue(offset);
        loff_t length = compat_arg_u64_glue(len);

        return ksys_fallocate(fd, mode, start, length);
}
#endif

/*
 * access() needs to use the real uid/gid, not the effective uid/gid.
 * We do this by temporarily clearing all FS-related capabilities and
 * switching the fsuid/fsgid around to the real ones.
 *
 * Creating new credentials is expensive, so we try to skip doing it,
 * which we can if the result would match what we already got.
 */
static bool access_need_override_creds(int flags)
{
        const struct cred *cred;
        kuid_t root_uid;

        /* AT_EACCESS: the check uses the current credentials unchanged. */
        if (flags & AT_EACCESS)
                return false;

        cred = current_cred();

        /* An fsuid/fsgid that differs from uid/gid has to be switched. */
        if (!uid_eq(cred->fsuid, cred->uid))
                return true;
        if (!gid_eq(cred->fsgid, cred->gid))
                return true;

        /* With SECURE_NO_SETUID_FIXUP set the capabilities stay untouched. */
        if (issecure(SECURE_NO_SETUID_FIXUP))
                return false;

        /*
         * Mirror the capability fixup done by access_override_creds():
         * root ends up with effective == permitted, anybody else with an
         * empty effective set.  Only a difference needs new credentials.
         */
        root_uid = make_kuid(cred->user_ns, 0);
        if (uid_eq(cred->uid, root_uid))
                return !cap_isidentical(cred->cap_effective,
                                        cred->cap_permitted);

        return !cap_isclear(cred->cap_effective);
}

/*
 * Install temporary credentials whose fsuid/fsgid are the real uid/gid and
 * whose effective capabilities are fixed up accordingly.
 *
 * Returns the previously installed credentials (to be handed back to
 * revert_creds()), or NULL if no new credentials could be prepared.
 */
static const struct cred *access_override_creds(void)
{
        struct cred *override_cred = prepare_creds();
        const struct cred *old_cred;

        if (!override_cred)
                return NULL;

        /*
         * XXX access_need_override_creds() tries to predict whether any of
         * the changes below would have an effect.  Keep both routines in
         * sync when modifying either of them.
         */

        override_cred->fsuid = override_cred->uid;
        override_cred->fsgid = override_cred->gid;

        if (!issecure(SECURE_NO_SETUID_FIXUP)) {
                kuid_t root_uid = make_kuid(override_cred->user_ns, 0);

                /* Root keeps its permitted set, everybody else gets none. */
                if (uid_eq(override_cred->uid, root_uid)) {
                        override_cred->cap_effective =
                                override_cred->cap_permitted;
                } else {
                        cap_clear(override_cred->cap_effective);
                }
        }

        /*
         * These credentials are only ever used task-synchronously, so
         * they do not need RCU freeing unless somebody takes a separate
         * reference to them.
         *
         * NOTE! That holds _only_ because they are installed purely via
         * override_creds() as the subjective cred; other threads look at
         * ->real_cred, never at the subjective cred.
         *
         * Should somebody copy them after all (get_current_cred()), the
         * non_rcu field gets cleared, since that user may rely on RCU
         * freeing.  Ordinary thread-synchronous accesses stay non-racy
         * and avoid the RCU freeing.
         */
        override_cred->non_rcu = 1;

        old_cred = override_creds(override_cred);

        /* override_creds() took its own reference; drop ours. */
        put_cred(override_cred);

        return old_cred;
}

/*
 * Common implementation behind the access(), faccessat() and faccessat2()
 * system calls.
 *
 * @dfd:      directory file descriptor handed to user_path_at()
 * @filename: userspace pathname to check
 * @mode:     requested access bits; any bit outside S_IRWXO gives -EINVAL
 * @flags:    AT_EACCESS, AT_SYMLINK_NOFOLLOW and/or AT_EMPTY_PATH; any
 *            other bit gives -EINVAL
 *
 * Returns 0 if the access is allowed.  Otherwise a negative errno:
 * -ENOMEM if the override credentials could not be set up, -EACCES for an
 * exec check of a regular file on a noexec path, -EROFS for a write check
 * on a read-only mount, or the error from the lookup / permission check.
 */
static long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
{
        struct path path;
        struct inode *inode;
        int res;
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        const struct cred *old_cred = NULL;

        if (mode & ~S_IRWXO)        /* where's F_OK, X_OK, W_OK, R_OK? */
                return -EINVAL;

        if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        if (flags & AT_SYMLINK_NOFOLLOW)
                lookup_flags &= ~LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        /*
         * Switch credentials once, before the retry loop; they stay in
         * place across retries and are reverted at "out" below.
         */
        if (access_need_override_creds(flags)) {
                old_cred = access_override_creds();
                if (!old_cred)
                        return -ENOMEM;
        }

retry:
        res = user_path_at(dfd, filename, lookup_flags, &path);
        if (res)
                goto out;

        inode = d_backing_inode(path.dentry);

        if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
                /*
                 * MAY_EXEC on regular files is denied if the fs is mounted
                 * with the "noexec" flag.
                 */
                res = -EACCES;
                if (path_noexec(&path))
                        goto out_path_release;
        }

        res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS);
        /* SuS v2 requires we report a read only fs too */
        /* Skip that check on error, for non-write requests and special files. */
        if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
                goto out_path_release;
        /*
         * This is a rare case where using __mnt_is_readonly()
         * is OK without a mnt_want/drop_write() pair.  Since
         * no actual write to the fs is performed here, we do
         * not need to telegraph to that to anyone.
         *
         * By doing this, we accept that this access is
         * inherently racy and know that the fs may change
         * state before we even see this result.
         */
        if (__mnt_is_readonly(path.mnt))
                res = -EROFS;

out_path_release:
        path_put(&path);
        /* Repeat the lookup with LOOKUP_REVAL when retry_estale() asks for it. */
        if (retry_estale(res, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out:
        /* old_cred is only non-NULL if the credentials were overridden above. */
        if (old_cred)
                revert_creds(old_cred);

        return res;
}

SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
{
        /* This variant takes no flags argument, so none are passed on. */
        const int no_flags = 0;

        return do_faccessat(dfd, filename, mode, no_flags);
}

SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode,
                int, flags)
{
        /* Same as faccessat(), but the caller-supplied flags are honoured. */
        long ret = do_faccessat(dfd, filename, mode, flags);

        return ret;
}

SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
{
        /* access() is faccessat() relative to the cwd with no flags. */
        const int no_flags = 0;

        return do_faccessat(AT_FDCWD, filename, mode, no_flags);
}

/*
 * chdir(): make the directory named by @filename the working directory of
 * the current task.  Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE1(chdir, const char __user *, filename)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        struct path path;
        int error;

        for (;;) {
                error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
                if (error)
                        return error;

                error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
                if (!error)
                        set_fs_pwd(current->fs, &path);
                path_put(&path);

                /* Redo the lookup with LOOKUP_REVAL when retry_estale() asks. */
                if (!retry_estale(error, lookup_flags))
                        return error;
                lookup_flags |= LOOKUP_REVAL;
        }
}

/*
 * fchdir(): make the directory behind @fd the working directory of the
 * current task.
 *
 * Returns 0 on success, -EBADF for a bad descriptor, -ENOTDIR if
 * d_can_lookup() rejects the dentry, or the error from the permission check.
 */
SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
        struct fd f = fdget_raw(fd);
        int error;

        if (!f.file)
                return -EBADF;

        if (!d_can_lookup(f.file->f_path.dentry)) {
                error = -ENOTDIR;
        } else {
                error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
                if (!error)
                        set_fs_pwd(current->fs, &f.file->f_path);
        }

        fdput(f);
        return error;
}

/*
 * chroot(): make the directory named by @filename the root directory of
 * the current task.
 *
 * Requires search permission on the directory, CAP_SYS_CHROOT in the
 * current user namespace (-EPERM otherwise) and approval by the
 * security_path_chroot() hook.  Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE1(chroot, const char __user *, filename)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        struct path path;
        int error;

        for (;;) {
                error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
                if (error)
                        return error;

                error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
                if (!error && !ns_capable(current_user_ns(), CAP_SYS_CHROOT))
                        error = -EPERM;
                if (!error)
                        error = security_path_chroot(&path);
                if (!error)
                        set_fs_root(current->fs, &path);
                path_put(&path);

                /* Redo the lookup with LOOKUP_REVAL when retry_estale() asks. */
                if (!retry_estale(error, lookup_flags))
                        return error;
                lookup_flags |= LOOKUP_REVAL;
        }
}

/*
 * Change the permission bits of the inode behind @path.
 *
 * @path: object whose mode is changed
 * @mode: new mode; only its S_IALLUGO bits are applied, the remaining bits
 *        of the current i_mode are kept
 *
 * Takes write access on the mount for the whole operation.  With the inode
 * locked, the security_path_chmod() hook is consulted and notify_change()
 * is called with ATTR_MODE | ATTR_CTIME.
 *
 * Returns 0 on success or a negative errno.
 */
int chmod_common(const struct path *path, umode_t mode)
{
        struct inode *inode = path->dentry->d_inode;
        struct inode *delegated_inode = NULL;
        struct iattr newattrs;
        int error;

        error = mnt_want_write(path->mnt);
        if (error)
                return error;
retry_deleg:
        inode_lock(inode);
        error = security_path_chmod(path, mode);
        if (error)
                goto out_unlock;
        newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
        newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
        error = notify_change(mnt_idmap(path->mnt), path->dentry,
                              &newattrs, &delegated_inode);
out_unlock:
        inode_unlock(inode);
        /*
         * notify_change() handed back a delegated inode: wait on it with
         * the inode lock dropped and, if the wait succeeded, start over.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        mnt_drop_write(path->mnt);
        return error;
}

/*
 * Change the mode of an open file: record it for audit, then apply the
 * change to the file's path.  Returns 0 or a negative errno.
 */
int vfs_fchmod(struct file *file, umode_t mode)
{
        const struct path *path = &file->f_path;

        audit_file(file);
        return chmod_common(path, mode);
}

/*
 * fchmod(): change the mode of the file behind @fd.  Returns -EBADF if @fd
 * is not an open descriptor, otherwise the result of vfs_fchmod().
 */
SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
{
        struct fd f = fdget(fd);
        int err;

        if (!f.file)
                return -EBADF;

        err = vfs_fchmod(f.file, mode);
        fdput(f);
        return err;
}

/*
 * Common implementation of chmod(), fchmodat() and fchmodat2().
 *
 * @flags may contain AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH; any other bit
 * is rejected with -EINVAL.  Returns 0 on success or a negative errno.
 */
static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
                       unsigned int flags)
{
        unsigned int lookup_flags = 0;
        struct path path;
        int error;

        if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)))
                return -EINVAL;

        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        for (;;) {
                error = user_path_at(dfd, filename, lookup_flags, &path);
                if (error)
                        return error;

                error = chmod_common(&path, mode);
                path_put(&path);

                /* Redo the lookup with LOOKUP_REVAL when retry_estale() asks. */
                if (!retry_estale(error, lookup_flags))
                        return error;
                lookup_flags |= LOOKUP_REVAL;
        }
}

SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename,
                umode_t, mode, unsigned int, flags)
{
        /* Flag-taking variant: the caller's flags are passed straight on. */
        int ret = do_fchmodat(dfd, filename, mode, flags);

        return ret;
}

SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
                umode_t, mode)
{
        /* This variant takes no flags argument, so none are passed on. */
        const unsigned int no_flags = 0;

        return do_fchmodat(dfd, filename, mode, no_flags);
}

SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
{
        /* chmod() is fchmodat() relative to the cwd with no flags. */
        const unsigned int no_flags = 0;

        return do_fchmodat(AT_FDCWD, filename, mode, no_flags);
}

/*
 * Record @kuid as the new owner in @attr.
 *
 * If @kuid is valid, ia_vfsuid is initialised from it and ATTR_UID is
 * added to ia_valid; an invalid @kuid leaves @attr untouched.
 *
 * Return: true if @kuid is valid, false if not.
 */
static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
{
        bool valid = uid_valid(kuid);

        if (valid) {
                attr->ia_valid |= ATTR_UID;
                attr->ia_vfsuid = VFSUIDT_INIT(kuid);
        }
        return valid;
}

/*
 * Record @kgid as the new group in @attr.
 *
 * If @kgid is valid, ia_vfsgid is initialised from it and ATTR_GID is
 * added to ia_valid; an invalid @kgid leaves @attr untouched.
 *
 * Return: true if @kgid is valid, false if not.
 */
static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
{
        bool valid = gid_valid(kgid);

        if (valid) {
                attr->ia_valid |= ATTR_GID;
                attr->ia_vfsgid = VFSGIDT_INIT(kgid);
        }
        return valid;
}

/*
 * Change the owner and/or group of the inode behind @path.
 *
 * @path:  object whose ownership is changed
 * @user:  new owner as seen by the caller, or (uid_t)-1 to leave it alone
 * @group: new group as seen by the caller, or (gid_t)-1 to leave it alone
 *
 * The ids are mapped through the caller's user namespace; an id that was
 * requested but does not map to a valid kuid/kgid yields -EINVAL.  With the
 * inode locked, the security_path_chown() hook is consulted and
 * notify_change() applies the change.
 *
 * This function does not take write access on the mount itself; both
 * callers visible in this file (do_fchownat() and vfs_fchown()) do so
 * before calling it.
 *
 * Returns 0 on success or a negative errno.
 */
int chown_common(const struct path *path, uid_t user, gid_t group)
{
        struct mnt_idmap *idmap;
        struct user_namespace *fs_userns;
        struct inode *inode = path->dentry->d_inode;
        struct inode *delegated_inode = NULL;
        int error;
        struct iattr newattrs;
        kuid_t uid;
        kgid_t gid;

        uid = make_kuid(current_user_ns(), user);
        gid = make_kgid(current_user_ns(), group);

        idmap = mnt_idmap(path->mnt);
        fs_userns = i_user_ns(inode);

retry_deleg:
        /* Rebuilt from scratch on every pass, including delegation retries. */
        newattrs.ia_vfsuid = INVALID_VFSUID;
        newattrs.ia_vfsgid = INVALID_VFSGID;
        newattrs.ia_valid =  ATTR_CTIME;
        if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid))
                return -EINVAL;
        if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
                return -EINVAL;
        inode_lock(inode);
        /* For anything but a directory, also request the privilege-dropping bits. */
        if (!S_ISDIR(inode->i_mode))
                newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
                                     setattr_should_drop_sgid(idmap, inode);
        /* Continue to send actual fs values, not the mount values. */
        error = security_path_chown(
                path,
                from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid),
                from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid));
        if (!error)
                error = notify_change(idmap, path->dentry, &newattrs,
                                      &delegated_inode);
        inode_unlock(inode);
        /*
         * notify_change() handed back a delegated inode: wait on it with
         * the inode lock dropped and, if the wait succeeded, start over.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        return error;
}

/*
 * Common implementation of chown(), lchown() and fchownat().
 *
 * @flag may contain AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH; any other bit
 * is rejected with -EINVAL.  Write access on the mount is held around the
 * call to chown_common().  Returns 0 on success or a negative errno.
 */
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
                int flag)
{
        struct path path;
        int lookup_flags = 0;
        int error;

        if (flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        if (!(flag & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (flag & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        for (;;) {
                error = user_path_at(dfd, filename, lookup_flags, &path);
                if (error)
                        return error;

                error = mnt_want_write(path.mnt);
                if (!error) {
                        error = chown_common(&path, user, group);
                        mnt_drop_write(path.mnt);
                }
                path_put(&path);

                /* Redo the lookup with LOOKUP_REVAL when retry_estale() asks. */
                if (!retry_estale(error, lookup_flags))
                        return error;
                lookup_flags |= LOOKUP_REVAL;
        }
}

SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
                gid_t, group, int, flag)
{
        /* Thin syscall wrapper; all the work happens in do_fchownat(). */
        int ret = do_fchownat(dfd, filename, user, group, flag);

        return ret;
}

SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
{
        /* chown() is fchownat() relative to the cwd with no flags. */
        const int no_flags = 0;

        return do_fchownat(AT_FDCWD, filename, user, group, no_flags);
}

SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
{
        /* lchown() operates on a trailing symlink itself: do not follow it. */
        const int flag = AT_SYMLINK_NOFOLLOW;

        return do_fchownat(AT_FDCWD, filename, user, group, flag);
}

/*
 * Change the ownership of an open file.  Write access on the file's mount
 * is held around the change, and the file is recorded for audit.
 *
 * Returns 0 on success or a negative errno.
 */
int vfs_fchown(struct file *file, uid_t user, gid_t group)
{
        int error = mnt_want_write_file(file);

        if (!error) {
                audit_file(file);
                error = chown_common(&file->f_path, user, group);
                mnt_drop_write_file(file);
        }
        return error;
}

/*
 * Descriptor-based front end for vfs_fchown().
 *
 * Returns -EBADF if @fd is not an open descriptor, otherwise whatever
 * vfs_fchown() returns.
 */
int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
{
        struct fd f = fdget(fd);
        int ret;

        if (!f.file)
                return -EBADF;

        ret = vfs_fchown(f.file, user, group);
        fdput(f);
        return ret;
}

SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
{
        /* Thin syscall wrapper; all the work happens in ksys_fchown(). */
        int ret = ksys_fchown(fd, user, group);

        return ret;
}

/*
 * Take every write reference an open-for-write file needs: on the inode,
 * on the mount of f_path and, for FMODE_BACKING files, on the mount of the
 * backing file's user path.
 *
 * On failure everything taken so far is released again.
 * Returns 0 on success or a negative errno.
 */
static inline int file_get_write_access(struct file *f)
{
        int error;

        error = get_write_access(f->f_inode);
        if (unlikely(error))
                return error;

        error = mnt_get_write_access(f->f_path.mnt);
        if (unlikely(error)) {
                put_write_access(f->f_inode);
                return error;
        }

        /* Only FMODE_BACKING files need the third reference. */
        if (likely(!(f->f_mode & FMODE_BACKING)))
                return 0;

        error = mnt_get_write_access(backing_file_user_path(f)->mnt);
        if (likely(!error))
                return 0;

        /* Unwind in reverse order of acquisition. */
        mnt_put_write_access(f->f_path.mnt);
        put_write_access(f->f_inode);
        return error;
}

/*
 * do_dentry_open - common tail of every open path in this file
 * @f:    file whose f_path (and f_flags/f_mode) have already been filled in
 * @open: open callback to use, or NULL to use f->f_op->open
 *
 * Takes a reference on f->f_path, binds the inode and mapping to @f,
 * acquires read/write accounting, installs the inode's file operations,
 * runs the security and lease checks and finally calls the open callback.
 *
 * Returns 0 on success or a negative errno. Failures before FMODE_OPENED
 * is set are unwound here (path reference dropped, f_path/f_inode cleared).
 * The -EINVAL return for an unsupported O_DIRECT happens after FMODE_OPENED
 * is set and does no local unwinding; the callers visible in this file
 * fput() the file on error.
 */
static int do_dentry_open(struct file *f,
                          int (*open)(struct inode *, struct file *))
{
        /* Operations table with no methods, used for O_PATH files. */
        static const struct file_operations empty_fops = {};
        struct inode *inode = f->f_path.dentry->d_inode;
        int error;

        path_get(&f->f_path);
        f->f_inode = inode;
        f->f_mapping = inode->i_mapping;
        f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
        f->f_sb_err = file_sample_sb_err(f);

        /* O_PATH files are complete at this point: no access counts, no ->open(). */
        if (unlikely(f->f_flags & O_PATH)) {
                f->f_mode = FMODE_PATH | FMODE_OPENED;
                f->f_op = &empty_fops;
                return 0;
        }

        /*
         * Read-only opens bump the inode read count; writable opens of
         * non-special files take write access and are marked FMODE_WRITER.
         */
        if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
                i_readcount_inc(inode);
        } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
                error = file_get_write_access(f);
                if (unlikely(error))
                        goto cleanup_file;
                f->f_mode |= FMODE_WRITER;
        }

        /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
                f->f_mode |= FMODE_ATOMIC_POS;

        f->f_op = fops_get(inode->i_fop);
        if (WARN_ON(!f->f_op)) {
                error = -ENODEV;
                goto cleanup_all;
        }

        error = security_file_open(f);
        if (error)
                goto cleanup_all;

        error = break_lease(file_inode(f), f->f_flags);
        if (error)
                goto cleanup_all;

        /* normally all 3 are set; ->open() can clear them if needed */
        f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
        if (!open)
                open = f->f_op->open;
        if (open) {
                error = open(inode, f);
                if (error)
                        goto cleanup_all;
        }
        f->f_mode |= FMODE_OPENED;
        /* Derive the capability bits from the methods the fops provide. */
        if ((f->f_mode & FMODE_READ) &&
             likely(f->f_op->read || f->f_op->read_iter))
                f->f_mode |= FMODE_CAN_READ;
        if ((f->f_mode & FMODE_WRITE) &&
             likely(f->f_op->write || f->f_op->write_iter))
                f->f_mode |= FMODE_CAN_WRITE;
        if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek)
                f->f_mode &= ~FMODE_LSEEK;
        if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
                f->f_mode |= FMODE_CAN_ODIRECT;

        /* These flags only matter while opening; do not keep them in f_flags. */
        f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
        f->f_iocb_flags = iocb_flags(f);

        file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

        /* FMODE_OPENED is already set here, so no local cleanup is done. */
        if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
                return -EINVAL;

        /*
         * XXX: Huge page cache doesn't support writing yet. Drop all page
         * cache for this file before processing writes.
         */
        if (f->f_mode & FMODE_WRITE) {
                /*
                 * Paired with smp_mb() in collapse_file() to ensure nr_thps
                 * is up to date and the update to i_writecount by
                 * get_write_access() is visible. Ensures subsequent insertion
                 * of THPs into the page cache will fail.
                 */
                smp_mb();
                if (filemap_nr_thps(inode->i_mapping)) {
                        struct address_space *mapping = inode->i_mapping;

                        filemap_invalidate_lock(inode->i_mapping);
                        /*
                         * unmap_mapping_range() only needs to be called once
                         * here, because private pages do not need to be
                         * unmapped from the mapping (e.g. the data segment of
                         * dynamic shared libraries).
                         */
                        unmap_mapping_range(mapping, 0, 0, 0);
                        truncate_inode_pages(mapping, 0);
                        filemap_invalidate_unlock(inode->i_mapping);
                }
        }

        /*
         * Once we return a file with FMODE_OPENED, __fput() will call
         * fsnotify_close(), so we need fsnotify_open() here for symmetry.
         */
        fsnotify_open(f);
        return 0;

cleanup_all:
        /* A positive value from the checks/->open() above is a bug; map it to -EINVAL. */
        if (WARN_ON_ONCE(error > 0))
                error = -EINVAL;
        fops_put(f->f_op);
        put_file_access(f);
cleanup_file:
        /* Undo path_get() and leave @f with no path or inode attached. */
        path_put(&f->f_path);
        f->f_path.mnt = NULL;
        f->f_path.dentry = NULL;
        f->f_inode = NULL;
        return error;
}

/**
 * finish_open - finish opening a file
 * @file: file pointer
 * @dentry: pointer to dentry
 * @open: open callback
 *
 * Helper for i_op->atomic_open() implementations: attaches @dentry to
 * @file and completes the open.
 *
 * When @open is NULL the filesystem's standard f_op->open() callback is
 * used in its place.
 *
 * NB: the dentry reference is _not_ consumed.  If, for example, the dentry
 * is the return value of d_splice_alias(), the caller still has to dput()
 * it after finish_open().
 *
 * It is a bug to call this on a file that already has FMODE_OPENED set.
 *
 * Returns zero on success or -errno if the open failed.
 */
int finish_open(struct file *file, struct dentry *dentry,
                int (*open)(struct inode *, struct file *))
{
        /* A file can only be opened once. */
        BUG_ON(file->f_mode & FMODE_OPENED);

        file->f_path.dentry = dentry;

        return do_dentry_open(file, open);
}
EXPORT_SYMBOL(finish_open);

/**
 * finish_no_open - finish ->atomic_open() without opening the file
 *
 * @file: file pointer
 * @dentry: dentry or NULL (as returned from ->lookup())
 *
 * This can be used to set the result of a successful lookup in ->atomic_open().
 *
 * NB: unlike finish_open() this function does consume the dentry reference and
 * the caller need not dput() it.
 *
 * Returns "0" which must be the return value of ->atomic_open() after having
 * called this function.
 */
int finish_no_open(struct file *file, struct dentry *dentry)
{
        file->f_path.dentry = dentry;
        return 0;
}
EXPORT_SYMBOL(finish_no_open);

/*
 * Render the path of @filp into @buf (of size @buflen) via d_path() and
 * return d_path()'s result.
 */
char *file_path(struct file *filp, char *buf, int buflen)
{
        const struct path *fpath = &filp->f_path;

        return d_path(fpath, buf, buflen);
}
EXPORT_SYMBOL(file_path);

/**
 * vfs_open - open the file at the given path
 * @path: path to open
 * @file: newly allocated file with f_flag initialized
 *
 * Copies @path into @file and completes the open with the filesystem's
 * default ->open() callback. Returns the result of do_dentry_open().
 */
int vfs_open(const struct path *path, struct file *file)
{
        int ret;

        file->f_path = *path;
        ret = do_dentry_open(file, NULL);
        return ret;
}

/*
 * Allocate a new file for @flags/@cred and open it on @path.
 *
 * Returns the opened file, or an ERR_PTR() if either the allocation or
 * the open failed. A NULL @path->mnt is a caller bug.
 */
struct file *dentry_open(const struct path *path, int flags,
                         const struct cred *cred)
{
        struct file *file;
        int err;

        /* We must always pass in a valid mount pointer. */
        BUG_ON(!path->mnt);

        file = alloc_empty_file(flags, cred);
        if (IS_ERR(file))
                return file;

        err = vfs_open(path, file);
        if (!err)
                return file;

        fput(file);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(dentry_open);

/**
 * dentry_create - Create and open a file
 * @path: path to create
 * @flags: O_ flags
 * @mode: mode bits for new file
 * @cred: credentials to use
 *
 * The caller must hold the parent directory's lock and must have placed
 * a prepared negative dentry for the new file in @path->dentry.
 *
 * @path->mnt must be the vfsmount of the filesystem on which the file is
 * to be created; the parent directory and the negative dentry have to
 * live on that same filesystem instance.
 *
 * Returns a "struct file *" on success and an ERR_PTR otherwise.
 */
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
                           const struct cred *cred)
{
        struct file *file = alloc_empty_file(flags, cred);
        int err;

        if (IS_ERR(file))
                return file;

        /* Create the file in the parent of the negative dentry ... */
        err = vfs_create(mnt_idmap(path->mnt),
                         d_inode(path->dentry->d_parent),
                         path->dentry, mode, true);
        if (err)
                goto fail;

        /* ... and open it only if the creation succeeded. */
        err = vfs_open(path, file);
        if (err)
                goto fail;

        return file;

fail:
        fput(file);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(dentry_create);

/**
 * kernel_file_open - open a file for kernel internal use
 * @path:        path of the file to open
 * @flags:        open flags
 * @cred:        credentials for open
 *
 * Opens a file on behalf of an in-kernel consumer. Such a file is not
 * accounted against nr_files and must never be installed into the file
 * descriptor table.
 *
 * Return: Opened file on success, an error pointer on failure.
 */
struct file *kernel_file_open(const struct path *path, int flags,
                                const struct cred *cred)
{
        struct file *file = alloc_empty_file_noaccount(flags, cred);
        int err;

        if (IS_ERR(file))
                return file;

        file->f_path = *path;
        err = do_dentry_open(file, NULL);
        if (!err)
                return file;

        fput(file);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(kernel_file_open);

/*
 * WILL_CREATE() is non-zero when @flags request creation of a file, i.e.
 * O_CREAT or __O_TMPFILE is set. The argument is parenthesized so that
 * callers may pass an arbitrary expression (e.g. "a | b") without running
 * into operator-precedence surprises.
 *
 * O_PATH_FLAGS is the set of flags build_open_how()/build_open_flags()
 * allow together with O_PATH.
 */
#define WILL_CREATE(flags)        ((flags) & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS                (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)

/*
 * Build a struct open_how from legacy open(2)-style @flags and @mode.
 *
 * Bits outside VALID_OPEN_FLAGS and S_IALLUGO are dropped. When O_PATH is
 * set only the O_PATH_FLAGS subset is kept, and the mode is forced to 0
 * unless the resulting flags request file creation. All other fields of
 * the returned structure are zero.
 */
inline struct open_how build_open_how(int flags, umode_t mode)
{
        u64 open_flags = flags & VALID_OPEN_FLAGS;
        u64 create_mode = mode & S_IALLUGO;

        /* O_PATH beats everything else. */
        if (open_flags & O_PATH)
                open_flags &= O_PATH_FLAGS;

        /* Modes should only be set for create-like flags. */
        if (!WILL_CREATE(open_flags))
                create_mode = 0;

        return (struct open_how) {
                .flags = open_flags,
                .mode = create_mode,
        };
}

/*
 * build_open_flags - validate a struct open_how and translate it into the
 * internal struct open_flags used by the path lookup code.
 * @how: open request (flags, mode, resolve)
 * @op:  result; fills in mode, open_flag, acc_mode, intent and lookup_flags
 *
 * Returns 0 on success, -EINVAL for invalid flag/mode/resolve combinations
 * and -EAGAIN when RESOLVE_CACHED is combined with a create, truncate or
 * tmpfile open.
 *
 * Note that @op is filled in incrementally: some fields (e.g. op->mode,
 * op->open_flag) may already have been written when an error is returned,
 * so @op must only be used after a 0 return.
 */
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
        u64 flags = how->flags;
        u64 strip = __FMODE_NONOTIFY | O_CLOEXEC;
        int lookup_flags = 0;
        /* Computed from the unstripped flags; adjusted further below. */
        int acc_mode = ACC_MODE(flags);

        BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
                         "struct open_flags doesn't yet handle flags > 32 bits");

        /*
         * Strip flags that either shouldn't be set by userspace like
         * FMODE_NONOTIFY or that aren't relevant in determining struct
         * open_flags like O_CLOEXEC.
         */
        flags &= ~strip;

        /*
         * Older syscalls implicitly clear all of the invalid flags or argument
         * values before calling build_open_flags(), but openat2(2) checks all
         * of its arguments.
         */
        if (flags & ~VALID_OPEN_FLAGS)
                return -EINVAL;
        if (how->resolve & ~VALID_RESOLVE_FLAGS)
                return -EINVAL;

        /* Scoping flags are mutually exclusive. */
        if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
                return -EINVAL;

        /*
         * Deal with the mode: it may only carry S_IALLUGO bits when the
         * open can create a file, and must be zero otherwise.
         */
        if (WILL_CREATE(flags)) {
                if (how->mode & ~S_IALLUGO)
                        return -EINVAL;
                op->mode = how->mode | S_IFREG;
        } else {
                if (how->mode != 0)
                        return -EINVAL;
                op->mode = 0;
        }

        /*
         * Block bugs where O_DIRECTORY | O_CREAT created regular files.
         * Note, that blocking O_DIRECTORY | O_CREAT here also protects
         * O_TMPFILE below which requires O_DIRECTORY being raised.
         */
        if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT))
                return -EINVAL;

        /* Now handle the creative implementation of O_TMPFILE. */
        if (flags & __O_TMPFILE) {
                /*
                 * In order to ensure programs get explicit errors when trying
                 * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY
                 * is raised alongside __O_TMPFILE.
                 */
                if (!(flags & O_DIRECTORY))
                        return -EINVAL;
                /* A tmpfile open must request write access. */
                if (!(acc_mode & MAY_WRITE))
                        return -EINVAL;
        }
        if (flags & O_PATH) {
                /* O_PATH only permits certain other flags to be set. */
                if (flags & ~O_PATH_FLAGS)
                        return -EINVAL;
                acc_mode = 0;
        }

        /*
         * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
         * check for O_DSYNC if they need any syncing at all, we enforce that
         * it is always set instead of having to deal with possibly weird
         * behaviour for malicious applications setting only __O_SYNC.
         */
        if (flags & __O_SYNC)
                flags |= O_DSYNC;

        /*
         * Stored before the O_EXCL handling below, so the O_NOFOLLOW added
         * there only affects lookup_flags, not op->open_flag.
         */
        op->open_flag = flags;

        /* O_TRUNC implies we need access checks for write permissions */
        if (flags & O_TRUNC)
                acc_mode |= MAY_WRITE;

        /* Allow the LSM permission hook to distinguish append
           access from general write access. */
        if (flags & O_APPEND)
                acc_mode |= MAY_APPEND;

        op->acc_mode = acc_mode;

        op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;

        if (flags & O_CREAT) {
                op->intent |= LOOKUP_CREATE;
                if (flags & O_EXCL) {
                        op->intent |= LOOKUP_EXCL;
                        /* O_CREAT | O_EXCL never follows a trailing symlink. */
                        flags |= O_NOFOLLOW;
                }
        }

        if (flags & O_DIRECTORY)
                lookup_flags |= LOOKUP_DIRECTORY;
        if (!(flags & O_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;

        /* Map each RESOLVE_* bit to its LOOKUP_* counterpart. */
        if (how->resolve & RESOLVE_NO_XDEV)
                lookup_flags |= LOOKUP_NO_XDEV;
        if (how->resolve & RESOLVE_NO_MAGICLINKS)
                lookup_flags |= LOOKUP_NO_MAGICLINKS;
        if (how->resolve & RESOLVE_NO_SYMLINKS)
                lookup_flags |= LOOKUP_NO_SYMLINKS;
        if (how->resolve & RESOLVE_BENEATH)
                lookup_flags |= LOOKUP_BENEATH;
        if (how->resolve & RESOLVE_IN_ROOT)
                lookup_flags |= LOOKUP_IN_ROOT;
        if (how->resolve & RESOLVE_CACHED) {
                /* Don't bother even trying for create/truncate/tmpfile open */
                if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
                        return -EAGAIN;
                lookup_flags |= LOOKUP_CACHED;
        }

        op->lookup_flags = lookup_flags;
        return 0;
}

/**
 * file_open_name - open file and return file pointer
 *
 * @name:        struct filename containing path to open
 * @flags:        open flags as per the open(2) second argument
 * @mode:        mode for the new file if O_CREAT is set, else ignored
 *
 * This is the helper to open a file from kernelspace if you really
 * have to.  In general you should not do this, so please move along,
 * nothing to see here.
 *
 * Returns the opened file, or an ERR_PTR() when the flags are rejected
 * by build_open_flags() or the open itself fails.
 */
struct file *file_open_name(struct filename *name, int flags, umode_t mode)
{
        struct open_how how = build_open_how(flags, mode);
        struct open_flags op;
        int err;

        err = build_open_flags(&how, &op);
        if (err)
                return ERR_PTR(err);

        return do_filp_open(AT_FDCWD, name, &op);
}

/**
 * filp_open - open file and return file pointer
 *
 * @filename:        path to open
 * @flags:        open flags as per the open(2) second argument
 * @mode:        mode for the new file if O_CREAT is set, else ignored
 *
 * This is the helper to open a file from kernelspace if you really
 * have to.  In general you should not do this, so please move along,
 * nothing to see here.
 *
 * Returns the opened file, or an ERR_PTR() if getname_kernel() or the
 * open failed.
 */
struct file *filp_open(const char *filename, int flags, umode_t mode)
{
        struct filename *name = getname_kernel(filename);
        struct file *file;

        if (IS_ERR(name))
                return ERR_CAST(name);

        file = file_open_name(name, flags, mode);
        putname(name);
        return file;
}
EXPORT_SYMBOL(filp_open);

/*
 * Open @filename with @flags/@mode through do_file_open_root(), passing
 * @root as the root of the lookup.
 *
 * Returns the opened file or an ERR_PTR(); invalid flag combinations are
 * rejected by build_open_flags() before any lookup is attempted.
 */
struct file *file_open_root(const struct path *root,
                            const char *filename, int flags, umode_t mode)
{
        struct open_how how = build_open_how(flags, mode);
        struct open_flags op;
        int err;

        err = build_open_flags(&how, &op);
        if (err)
                return ERR_PTR(err);

        return do_file_open_root(root, filename, &op);
}
EXPORT_SYMBOL(file_open_root);

/*
 * Common implementation behind open(2), openat(2), openat2(2) and creat(2).
 *
 * Validates @how, copies in the user path, reserves a file descriptor,
 * opens the file and installs it. Returns the new descriptor or a
 * negative errno; a reserved descriptor is released if the open fails.
 */
static long do_sys_openat2(int dfd, const char __user *filename,
                           struct open_how *how)
{
        struct open_flags op;
        struct filename *name;
        struct file *file;
        int fd;
        int err;

        err = build_open_flags(how, &op);
        if (err)
                return err;

        name = getname(filename);
        if (IS_ERR(name))
                return PTR_ERR(name);

        fd = get_unused_fd_flags(how->flags);
        if (fd < 0)
                goto out_putname;

        file = do_filp_open(dfd, name, &op);
        if (IS_ERR(file)) {
                put_unused_fd(fd);
                fd = PTR_ERR(file);
        } else {
                fd_install(fd, file);
        }

out_putname:
        putname(name);
        return fd;
}

/*
 * Legacy-style open: convert @flags/@mode into a struct open_how with
 * build_open_how() and hand it to do_sys_openat2().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
        struct open_how how;

        how = build_open_how(flags, mode);
        return do_sys_openat2(dfd, filename, &how);
}


/*
 * open(2) entry point. O_LARGEFILE is added when force_o_largefile()
 * says so; the path is looked up with AT_FDCWD as the directory fd.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
        const int lfs = force_o_largefile() ? O_LARGEFILE : 0;

        return do_sys_open(AT_FDCWD, filename, flags | lfs, mode);
}

/*
 * openat(2) entry point. Identical to open(2) above except that the
 * caller supplies the directory fd.
 */
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
                umode_t, mode)
{
        const int lfs = force_o_largefile() ? O_LARGEFILE : 0;

        return do_sys_open(dfd, filename, flags | lfs, mode);
}

/*
 * openat2(2) entry point.
 *
 * @usize is the size of the structure userspace passed in; anything
 * smaller than OPEN_HOW_SIZE_VER0 is rejected with -EINVAL. The structure
 * is copied in with copy_struct_from_user(), reported to audit and then
 * passed to do_sys_openat2().
 */
SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
                struct open_how __user *, how, size_t, usize)
{
        struct open_how khow;
        int err;

        BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0);
        BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST);

        if (unlikely(usize < OPEN_HOW_SIZE_VER0))
                return -EINVAL;

        err = copy_struct_from_user(&khow, sizeof(khow), how, usize);
        if (err)
                return err;

        audit_openat2_how(&khow);

        /* O_LARGEFILE is only allowed for non-O_PATH. */
        if (!(khow.flags & O_PATH)) {
                if (force_o_largefile())
                        khow.flags |= O_LARGEFILE;
        }

        return do_sys_openat2(dfd, filename, &khow);
}

#ifdef CONFIG_COMPAT
/*
 * Compat variant of sys_open(): behaves exactly the same, but never
 * sets O_LARGEFILE on behalf of the caller.
 */
COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
        long ret = do_sys_open(AT_FDCWD, filename, flags, mode);

        return ret;
}

/*
 * Compat variant of sys_openat(): behaves exactly the same, but never
 * sets O_LARGEFILE on behalf of the caller.
 */
COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
{
        long ret = do_sys_open(dfd, filename, flags, mode);

        return ret;
}
#endif

#ifndef __alpha__

/*
 * creat(2): open(2) with O_CREAT | O_WRONLY | O_TRUNC (plus O_LARGEFILE
 * when force_o_largefile() says so).
 *
 * For backward compatibility?  Maybe this should be moved
 * into arch/i386 instead?
 */
SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
{
        const int lfs = force_o_largefile() ? O_LARGEFILE : 0;
        int flags = O_CREAT | O_WRONLY | O_TRUNC | lfs;

        return do_sys_open(AT_FDCWD, pathname, flags, mode);
}
#endif

/*
 * Per-close work that has to happen before the file reference is dropped:
 * call the file's ->flush() method (if any) and, for non-O_PATH files,
 * drop dnotify state and POSIX locks belonging to @id.
 *
 * "id" is the POSIX thread ID. We use the files pointer for this.
 *
 * Returns the result of ->flush(), or 0 when there is no such method or
 * when the file count is already zero (reported as data corruption).
 */
static int filp_flush(struct file *filp, fl_owner_t id)
{
        int retval;

        if (CHECK_DATA_CORRUPTION(file_count(filp) == 0,
                        "VFS: Close: file count is 0 (f_op=%ps)",
                        filp->f_op))
                return 0;

        retval = filp->f_op->flush ? filp->f_op->flush(filp, id) : 0;

        if (likely(!(filp->f_mode & FMODE_PATH))) {
                dnotify_flush(filp, id);
                locks_remove_posix(filp, id);
        }

        return retval;
}

/*
 * Flush @filp on behalf of owner @id and drop one reference to it.
 * Returns the result of filp_flush(); the reference is dropped even if
 * the flush reported an error.
 */
int filp_close(struct file *filp, fl_owner_t id)
{
        int retval = filp_flush(filp, id);

        fput(filp);
        return retval;
}
EXPORT_SYMBOL(filp_close);

/*
 * close(2) entry point.
 *
 * Careful here! We test whether the file pointer is NULL before
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
        struct file *file = file_close_fd(fd);
        int retval;

        if (!file)
                return -EBADF;

        retval = filp_flush(file, current->files);

        /*
         * We're returning to user space. Don't bother
         * with any delayed fput() cases.
         */
        __fput_sync(file);

        /* can't restart close syscall because file table entry was cleared */
        switch (retval) {
        case -ERESTARTSYS:
        case -ERESTARTNOINTR:
        case -ERESTARTNOHAND:
        case -ERESTART_RESTARTBLOCK:
                retval = -EINTR;
                break;
        default:
                break;
        }

        return retval;
}

/**
 * sys_close_range() - Close all file descriptors in a given range.
 *
 * @fd:     starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags:  reserved for future extensions
 *
 * Closes every file descriptor from @fd up to and including @max_fd.
 * Currently, errors to close a given file descriptor are ignored.
 *
 * The work is done by __close_range(), whose result is returned.
 */
SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
                unsigned int, flags)
{
        int ret = __close_range(fd, max_fd, flags);

        return ret;
}

/*
 * This routine simulates a hangup on the tty, to arrange that users
 * are given clean terminals at login time.
 *
 * Requires CAP_SYS_TTY_CONFIG; returns -EPERM without it, 0 otherwise.
 */
SYSCALL_DEFINE0(vhangup)
{
        if (!capable(CAP_SYS_TTY_CONFIG))
                return -EPERM;

        tty_vhangup_self();
        return 0;
}

/*
 * Called when an inode is about to be open.
 * We use this to disallow opening large files on 32bit systems if
 * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
 * on this flag in sys_open.
 *
 * Returns -EOVERFLOW when the file is larger than MAX_NON_LFS and
 * O_LARGEFILE is not set, 0 otherwise.
 */
int generic_file_open(struct inode * inode, struct file * filp)
{
        /* With O_LARGEFILE any size is fine; the size is not even read. */
        if (filp->f_flags & O_LARGEFILE)
                return 0;

        if (i_size_read(inode) > MAX_NON_LFS)
                return -EOVERFLOW;

        return 0;
}

EXPORT_SYMBOL(generic_file_open);

/*
 * This is used by subsystems that don't want seekable file descriptors:
 * it clears FMODE_LSEEK, FMODE_PREAD and FMODE_PWRITE on @filp.
 *
 * The function is not supposed to ever fail; it only returns 'int'
 * rather than 'void' so that it can be plugged directly into a
 * file_operations structure.
 */
int nonseekable_open(struct inode *inode, struct file *filp)
{
        filp->f_mode = filp->f_mode &
                       ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
        return 0;
}

EXPORT_SYMBOL(nonseekable_open);

/*
 * stream_open is used by subsystems that want stream-like file descriptors.
 * Such file descriptors are not seekable and don't have notion of position
 * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
 * Contrary to file descriptors of other regular files, .read() and .write()
 * can run simultaneously.
 *
 * Concretely, FMODE_LSEEK, FMODE_PREAD, FMODE_PWRITE and FMODE_ATOMIC_POS
 * are cleared and FMODE_STREAM is set.
 *
 * stream_open never fails and is marked to return int so that it could be
 * directly used as file_operations.open .
 */
int stream_open(struct inode *inode, struct file *filp)
{
        filp->f_mode = (filp->f_mode &
                        ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS)) |
                       FMODE_STREAM;
        return 0;
}

EXPORT_SYMBOL(stream_open);























    9 



































   10 









































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM workqueue

#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WORKQUEUE_H

#include <linux/tracepoint.h>
#include <linux/workqueue.h>

struct pool_workqueue;

/**
 * workqueue_queue_work - called when a work gets queued
 * @req_cpu:        the requested cpu
 * @pwq:        pointer to struct pool_workqueue
 * @work:        pointer to struct work_struct
 *
 * This event occurs when a work is queued immediately or once a
 * delayed work is actually queued on a workqueue (ie: once the delay
 * has been reached).
 *
 * The record holds the work pointer, its callback (work->func), the
 * workqueue name (pwq->wq->name), the requested cpu and the cpu of the
 * pool backing @pwq (pwq->pool->cpu).
 */
TRACE_EVENT(workqueue_queue_work,

        TP_PROTO(int req_cpu, struct pool_workqueue *pwq,
                 struct work_struct *work),

        TP_ARGS(req_cpu, pwq, work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
                __string( workqueue,        pwq->wq->name)
                __field( int,        req_cpu        )
                __field( int,        cpu        )
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
                __assign_str(workqueue);
                __entry->req_cpu        = req_cpu;
                __entry->cpu                = pwq->pool->cpu;
        ),

        TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%d cpu=%d",
                  __entry->work, __entry->function, __get_str(workqueue),
                  __entry->req_cpu, __entry->cpu)
);

/**
 * workqueue_activate_work - called when a work gets activated
 * @work:        pointer to struct work_struct
 *
 * This event occurs when a queued work is put on the active queue,
 * which happens immediately after queueing unless @max_active limit
 * is reached.
 *
 * The record holds the work pointer and its callback (work->func).
 */
TRACE_EVENT(workqueue_activate_work,

        TP_PROTO(struct work_struct *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
        ),

        TP_printk("work struct %p function=%ps ", __entry->work, __entry->function)
);

/**
 * workqueue_execute_start - called immediately before the workqueue callback
 * @work:        pointer to struct work_struct
 *
 * Allows to track workqueue execution.
 *
 * The record holds the work pointer and its callback (work->func).
 */
TRACE_EVENT(workqueue_execute_start,

        TP_PROTO(struct work_struct *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/**
 * workqueue_execute_end - called immediately after the workqueue callback
 * @work:        pointer to struct work_struct
 * @function:   pointer to worker function
 *
 * Allows to track workqueue execution.
 *
 * Unlike the other workqueue events, the callback is taken from the
 * @function argument; @work is only recorded as a pointer value and is
 * not dereferenced here.
 */
TRACE_EVENT(workqueue_execute_end,

        TP_PROTO(struct work_struct *work, work_func_t function),

        TP_ARGS(work, function),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = function;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

#endif /*  _TRACE_WORKQUEUE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



















































    1 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ERR_H
#define _LINUX_ERR_H

#include <linux/compiler.h>
#include <linux/types.h>

#include <asm/errno.h>

/*
 * Kernel pointers have redundant information, so we can use a
 * scheme where we can return either an error code or a normal
 * pointer with the same return value.
 *
 * This should be a per-architecture thing, to allow different
 * error and pointer decisions.
 *
 * MAX_ERRNO is the largest errno value that can be encoded this way:
 * IS_ERR_VALUE() treats the top MAX_ERRNO values of the unsigned long
 * range as error codes.
 */
#define MAX_ERRNO        4095

#ifndef __ASSEMBLY__

/**
 * IS_ERR_VALUE - Detect an error pointer.
 * @x: The pointer to check.
 *
 * Like IS_ERR(), but does not generate a compiler warning if result is unused.
 *
 * True when @x, converted to unsigned long, is greater than or equal to
 * (unsigned long)-MAX_ERRNO, i.e. lies in the last MAX_ERRNO values of
 * the range. The result is wrapped in unlikely().
 */
#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO)

/**
 * ERR_PTR - Create an error pointer.
 * @error: A negative error code.
 *
 * Turns @error into a pointer value. The encoding is an implementation
 * detail: users should treat the result as opaque.
 *
 * Return: A pointer with @error encoded within its value.
 */
static inline void * __must_check ERR_PTR(long error)
{
        void *encoded = (void *) error;

        return encoded;
}

/**
 * PTR_ERR - Extract the error code from an error pointer.
 * @ptr: An error pointer.
 *
 * Return: The error code within @ptr.
 */
static inline long __must_check PTR_ERR(__force const void *ptr)
{
        long error = (long) ptr;

        return error;
}

/**
 * IS_ERR - Detect an error pointer.
 * @ptr: The pointer to check.
 *
 * Return: true if @ptr is an error pointer, false otherwise.
 */
static inline bool __must_check IS_ERR(__force const void *ptr)
{
        unsigned long value = (unsigned long)ptr;

        return IS_ERR_VALUE(value);
}

/**
 * IS_ERR_OR_NULL - Detect an error pointer or a null pointer.
 * @ptr: The pointer to check.
 *
 * Same test as IS_ERR(), except that a null pointer also yields true.
 */
static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
{
        if (unlikely(!ptr))
                return true;

        return IS_ERR_VALUE((unsigned long)ptr);
}

/**
 * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
 * @ptr: The pointer to cast.
 *
 * Returns @ptr as a plain "void *" so that an error pointer of one type
 * can be returned as another pointer type, while making it obvious at
 * the call site that this is what is going on.
 */
static inline void * __must_check ERR_CAST(__force const void *ptr)
{
        /* cast away the const */
        void *cast = (void *) ptr;

        return cast;
}

/**
 * PTR_ERR_OR_ZERO - Extract the error code from a pointer if it has one.
 * @ptr: A potential error pointer.
 *
 * Convenience helper for functions that return an error code and need to
 * propagate an error received as an error pointer.
 * For example, ``return PTR_ERR_OR_ZERO(ptr);`` replaces:
 *
 * .. code-block:: c
 *
 *        if (IS_ERR(ptr))
 *                return PTR_ERR(ptr);
 *        else
 *                return 0;
 *
 * Return: The error code within @ptr if it is an error pointer; 0 otherwise.
 */
static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
        return IS_ERR(ptr) ? PTR_ERR(ptr) : 0;
}

#endif

#endif /* _LINUX_ERR_H */





















   32 


    2 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/fs.h>

/*
 * Access bits: OR-ed together to form the 'access' argument of
 * devcgroup_check_permission().
 */
#define DEVCG_ACC_MKNOD 1
#define DEVCG_ACC_READ  2
#define DEVCG_ACC_WRITE 4
#define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE)

/* Device kinds: the 'type' argument of devcgroup_check_permission(). */
#define DEVCG_DEV_BLOCK 1
#define DEVCG_DEV_CHAR  2
#define DEVCG_DEV_ALL   4  /* this represents all devices */


#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF)
int devcgroup_check_permission(short type, u32 major, u32 minor,
                               short access);
/*
 * Translate an inode permission request into a devcgroup_check_permission()
 * query.
 *
 * Inodes with a zero i_rdev, and inodes that are neither block nor character
 * devices, return 0 without querying.  MAY_READ and MAY_WRITE in @mask are
 * mapped to DEVCG_ACC_READ and DEVCG_ACC_WRITE respectively; other bits of
 * @mask are ignored.  Otherwise the result of devcgroup_check_permission()
 * is returned unchanged.
 */
static inline int devcgroup_inode_permission(struct inode *inode, int mask)
{
        short dev_type;
        short wanted = 0;

        /* Common case: no device number attached to this inode. */
        if (likely(!inode->i_rdev))
                return 0;

        /* Only block and character device nodes are subject to the check. */
        if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
                return 0;

        dev_type = S_ISBLK(inode->i_mode) ? DEVCG_DEV_BLOCK : DEVCG_DEV_CHAR;

        if (mask & MAY_READ)
                wanted |= DEVCG_ACC_READ;
        if (mask & MAY_WRITE)
                wanted |= DEVCG_ACC_WRITE;

        return devcgroup_check_permission(dev_type, imajor(inode),
                                          iminor(inode), wanted);
}

/*
 * Ask devcgroup_check_permission() whether a device node described by @mode
 * and @dev may be created (DEVCG_ACC_MKNOD).
 *
 * Returns 0 without querying when @mode is neither a block nor a character
 * device, or when it is a character device whose number is WHITEOUT_DEV.
 * Otherwise returns the result of devcgroup_check_permission().
 */
static inline int devcgroup_inode_mknod(int mode, dev_t dev)
{
        short dev_type;

        if (S_ISBLK(mode)) {
                dev_type = DEVCG_DEV_BLOCK;
        } else if (S_ISCHR(mode)) {
                /* Whiteout character devices skip the check entirely. */
                if (dev == WHITEOUT_DEV)
                        return 0;
                dev_type = DEVCG_DEV_CHAR;
        } else {
                /* Not a device node at all. */
                return 0;
        }

        return devcgroup_check_permission(dev_type, MAJOR(dev), MINOR(dev),
                                          DEVCG_ACC_MKNOD);
}

#else
/*
 * Stub for builds with neither CONFIG_CGROUP_DEVICE nor CONFIG_CGROUP_BPF:
 * ignores its arguments and always returns 0.
 */
static inline int devcgroup_check_permission(short type, u32 major, u32 minor,
                                             short access)
{
        return 0;
}
/*
 * Stub for builds with neither CONFIG_CGROUP_DEVICE nor CONFIG_CGROUP_BPF:
 * @inode and @mask are not examined and 0 is always returned.
 */
static inline int devcgroup_inode_permission(struct inode *inode, int mask)
{
        return 0;
}
/*
 * Stub for builds with neither CONFIG_CGROUP_DEVICE nor CONFIG_CGROUP_BPF:
 * @mode and @dev are not examined and 0 is always returned.
 */
static inline int devcgroup_inode_mknod(int mode, dev_t dev)
{
        return 0;
}
#endif










































































































































































    1 
















    1 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/security.h>
#include <linux/xattr.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
#include <linux/btrfs.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/fileattr.h>
#include <linux/fsverity.h>
#include <linux/sched/xacct.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "locking.h"
#include "backref.h"
#include "send.h"
#include "dev-replace.h"
#include "props.h"
#include "sysfs.h"
#include "qgroup.h"
#include "tree-log.h"
#include "compression.h"
#include "space-info.h"
#include "block-group.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "dir-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
#include "file.h"
#include "scrub.h"
#include "super.h"

#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
 * structures are incorrect, as the timespec structure from userspace
 * is 4 bytes too small. We define these alternatives here to teach
 * the kernel about the 32-bit struct packing.
 */
/*
 * Timespec as laid out by 32-bit userspace.  The struct is packed, so the
 * 4-byte nsec directly follows the 8-byte sec with no tail padding.
 */
struct btrfs_ioctl_timespec_32 {
        __u64 sec;
        __u32 nsec;
} __attribute__ ((__packed__));

/*
 * Argument block for BTRFS_IOC_SET_RECEIVED_SUBVOL_32.  It uses the packed
 * 32-bit timespec variant for both timestamps, and the whole struct is
 * packed as well.  The in/out markers on each field are from the original
 * definition.
 */
struct btrfs_ioctl_received_subvol_args_32 {
        char        uuid[BTRFS_UUID_SIZE];        /* in */
        __u64        stransid;                /* in */
        __u64        rtransid;                /* out */
        struct btrfs_ioctl_timespec_32 stime; /* in */
        struct btrfs_ioctl_timespec_32 rtime; /* out */
        __u64        flags;                        /* in */
        __u64        reserved[16];                /* in */
} __attribute__ ((__packed__));

#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
                                struct btrfs_ioctl_received_subvol_args_32)
#endif

#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
/*
 * Send ioctl argument as seen from a compat (32-bit) caller: the
 * clone_sources pointer is a compat_uptr_t rather than a native pointer and
 * the structure is packed.
 */
struct btrfs_ioctl_send_args_32 {
        __s64 send_fd;                        /* in */
        __u64 clone_sources_count;        /* in */
        compat_uptr_t clone_sources;        /* in */
        __u64 parent_root;                /* in */
        __u64 flags;                        /* in */
        __u32 version;                        /* in */
        __u8  reserved[28];                /* in */
} __attribute__ ((__packed__));

/* Write-only ioctl number 38 sized for the compat structure above. */
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
                               struct btrfs_ioctl_send_args_32)

/*
 * Encoded I/O ioctl argument for compat callers: the iovec pointer and count
 * use the compat pointer/ulong types.  Unlike the other compat structures in
 * this file it is not declared packed.
 */
struct btrfs_ioctl_encoded_io_args_32 {
        compat_uptr_t iov;
        compat_ulong_t iovcnt;
        __s64 offset;
        __u64 flags;
        __u64 len;
        __u64 unencoded_len;
        __u64 unencoded_offset;
        __u32 compression;
        __u32 encryption;
        __u8 reserved[64];
};

/*
 * Both directions share command number 64; they differ only in the
 * direction encoded by _IOR versus _IOW.
 */
#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
                                       struct btrfs_ioctl_encoded_io_args_32)
#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \
                                        struct btrfs_ioctl_encoded_io_args_32)
#endif

/*
 * Restrict @flags to the subset that is meaningful for the type of @inode:
 * directories keep every flag, regular files lose FS_DIRSYNC_FL, and all
 * other inode types keep only FS_NODUMP_FL and FS_NOATIME_FL.
 */
static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
                unsigned int flags)
{
        if (S_ISDIR(inode->i_mode))
                return flags;

        if (!S_ISREG(inode->i_mode))
                return flags & (FS_NODUMP_FL | FS_NOATIME_FL);

        return flags & ~FS_DIRSYNC_FL;
}

/*
 * Translate the btrfs on-inode flag words of @binode into the FS_*_FL bits
 * reported through the FS_IOC_GETFLAGS ioctl.
 */
static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
{
        const u32 bflags = binode->flags;
        const u32 bro_flags = binode->ro_flags;
        unsigned int fsflags = 0;

        fsflags |= (bflags & BTRFS_INODE_SYNC) ? FS_SYNC_FL : 0;
        fsflags |= (bflags & BTRFS_INODE_IMMUTABLE) ? FS_IMMUTABLE_FL : 0;
        fsflags |= (bflags & BTRFS_INODE_APPEND) ? FS_APPEND_FL : 0;
        fsflags |= (bflags & BTRFS_INODE_NODUMP) ? FS_NODUMP_FL : 0;
        fsflags |= (bflags & BTRFS_INODE_NOATIME) ? FS_NOATIME_FL : 0;
        fsflags |= (bflags & BTRFS_INODE_DIRSYNC) ? FS_DIRSYNC_FL : 0;
        fsflags |= (bflags & BTRFS_INODE_NODATACOW) ? FS_NOCOW_FL : 0;
        fsflags |= (bro_flags & BTRFS_INODE_RO_VERITY) ? FS_VERITY_FL : 0;

        /* NOCOMPRESS takes precedence when both compression bits are set. */
        if (bflags & BTRFS_INODE_NOCOMPRESS)
                fsflags |= FS_NOCOMP_FL;
        else if (bflags & BTRFS_INODE_COMPRESS)
                fsflags |= FS_COMPR_FL;

        return fsflags;
}

/*
 * Mirror the btrfs internal inode flags into the VFS inode->i_flags word.
 * Only the S_* bits listed in the mask below are touched; every other bit of
 * i_flags is left as it was.
 */
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
{
        const unsigned int managed = S_SYNC | S_APPEND | S_IMMUTABLE |
                                     S_NOATIME | S_DIRSYNC | S_VERITY;
        struct btrfs_inode *bi = BTRFS_I(inode);
        unsigned int wanted = 0;

        wanted |= (bi->flags & BTRFS_INODE_SYNC) ? S_SYNC : 0;
        wanted |= (bi->flags & BTRFS_INODE_IMMUTABLE) ? S_IMMUTABLE : 0;
        wanted |= (bi->flags & BTRFS_INODE_APPEND) ? S_APPEND : 0;
        wanted |= (bi->flags & BTRFS_INODE_NOATIME) ? S_NOATIME : 0;
        wanted |= (bi->flags & BTRFS_INODE_DIRSYNC) ? S_DIRSYNC : 0;
        wanted |= (bi->ro_flags & BTRFS_INODE_RO_VERITY) ? S_VERITY : 0;

        set_mask_bits(&inode->i_flags, managed, wanted);
}

/*
 * Validate a requested set of FS_*_FL flags.
 *
 * @old_flags: flags currently in effect on the inode
 * @flags:     flags being requested
 *
 * Returns -EOPNOTSUPP if @flags contains a bit outside the supported set,
 * -EINVAL if the request is self-contradictory or conflicts with @old_flags,
 * and 0 otherwise.
 */
static int check_fsflags(unsigned int old_flags, unsigned int flags)
{
        const unsigned int supported = FS_IMMUTABLE_FL | FS_APPEND_FL |
                                       FS_NOATIME_FL | FS_NODUMP_FL |
                                       FS_SYNC_FL | FS_DIRSYNC_FL |
                                       FS_NOCOMP_FL | FS_COMPR_FL |
                                       FS_NOCOW_FL;
        const unsigned int comp_bits = FS_COMPR_FL | FS_NOCOMP_FL;
        const unsigned int compr_nocow = FS_COMPR_FL | FS_NOCOW_FL;

        if (flags & ~supported)
                return -EOPNOTSUPP;

        /*
         * COMPR and NOCOMP may differ between old and new flags, but a
         * single request must not carry both.
         */
        if ((flags & comp_bits) == comp_bits)
                return -EINVAL;

        /* Likewise a single request cannot ask for COMPR and NOCOW. */
        if ((flags & compr_nocow) == compr_nocow)
                return -EINVAL;

        /*
         * NOCOW and the compression options are mutually exclusive across
         * the old/new boundary too.
         */
        if ((old_flags & FS_NOCOW_FL) && (flags & comp_bits))
                return -EINVAL;
        if ((flags & FS_NOCOW_FL) && (old_flags & comp_bits))
                return -EINVAL;

        return 0;
}

/*
 * Check @flags against filesystem-wide restrictions: FS_NOCOW_FL is refused
 * with -EPERM on a zoned filesystem.  Returns 0 when the flags are allowed.
 */
static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
                                    unsigned int flags)
{
        if (!(flags & FS_NOCOW_FL))
                return 0;

        return btrfs_is_zoned(fs_info) ? -EPERM : 0;
}

/*
 * Verify that the name buffer of @vol_args contains a NUL byte somewhere
 * within its fixed size.  Returns 0 if so, -ENAMETOOLONG otherwise.
 */
int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args)
{
        const void *nul = memchr(vol_args->name, 0, sizeof(vol_args->name));

        return nul ? 0 : -ENAMETOOLONG;
}

/*
 * Verify that the name buffer of @vol_args2 contains a NUL byte somewhere
 * within its fixed size.  Returns 0 if so, -ENAMETOOLONG otherwise.
 */
static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2)
{
        const void *nul = memchr(vol_args2->name, 0, sizeof(vol_args2->name));

        return nul ? 0 : -ENAMETOOLONG;
}

/*
 * Fill @fa from the btrfs internal flags of the inode behind @dentry.  The
 * conversion to flags/xflags is delegated to fileattr_fill_flags().  Always
 * returns 0.
 */
int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);
        unsigned int fsflags = btrfs_inode_flags_to_fsflags(BTRFS_I(inode));

        fileattr_fill_flags(fa, fsflags);
        return 0;
}

/*
 * Apply the flags in @fa to the inode behind @dentry.
 *
 * The requested FS_*_FL flags are masked for the inode type, validated
 * against the current flags and the filesystem, translated into btrfs inode
 * flags and written back inside a transaction.  When the compression flags
 * are processed, the "btrfs.compression" property is set or cleared in the
 * same transaction.
 *
 * Returns 0 on success or a negative errno: -EROFS for a read-only root,
 * -EOPNOTSUPP for fsx-style requests or unsupported flags, -ETXTBSY when
 * compression is requested on a swapfile, or whatever the validation,
 * transaction and property helpers report.
 */
int btrfs_fileattr_set(struct mnt_idmap *idmap,
                       struct dentry *dentry, struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_inode *binode = BTRFS_I(inode);
        struct btrfs_root *root = binode->root;
        struct btrfs_trans_handle *trans;
        unsigned int fsflags, old_fsflags;
        int ret;
        /* Compression property value to store; NULL means "clear it". */
        const char *comp = NULL;
        u32 binode_flags;

        if (btrfs_root_readonly(root))
                return -EROFS;

        if (fileattr_has_fsx(fa))
                return -EOPNOTSUPP;

        fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
        old_fsflags = btrfs_inode_flags_to_fsflags(binode);
        ret = check_fsflags(old_fsflags, fsflags);
        if (ret)
                return ret;

        ret = check_fsflags_compatible(fs_info, fsflags);
        if (ret)
                return ret;

        /*
         * Work on a local copy; binode->flags is only updated once a
         * transaction has been started (see update_flags below).
         */
        binode_flags = binode->flags;
        if (fsflags & FS_SYNC_FL)
                binode_flags |= BTRFS_INODE_SYNC;
        else
                binode_flags &= ~BTRFS_INODE_SYNC;
        if (fsflags & FS_IMMUTABLE_FL)
                binode_flags |= BTRFS_INODE_IMMUTABLE;
        else
                binode_flags &= ~BTRFS_INODE_IMMUTABLE;
        if (fsflags & FS_APPEND_FL)
                binode_flags |= BTRFS_INODE_APPEND;
        else
                binode_flags &= ~BTRFS_INODE_APPEND;
        if (fsflags & FS_NODUMP_FL)
                binode_flags |= BTRFS_INODE_NODUMP;
        else
                binode_flags &= ~BTRFS_INODE_NODUMP;
        if (fsflags & FS_NOATIME_FL)
                binode_flags |= BTRFS_INODE_NOATIME;
        else
                binode_flags &= ~BTRFS_INODE_NOATIME;

        /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
        if (!fa->flags_valid) {
                /* 1 item for the inode */
                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans))
                        return PTR_ERR(trans);
                goto update_flags;
        }

        if (fsflags & FS_DIRSYNC_FL)
                binode_flags |= BTRFS_INODE_DIRSYNC;
        else
                binode_flags &= ~BTRFS_INODE_DIRSYNC;
        if (fsflags & FS_NOCOW_FL) {
                if (S_ISREG(inode->i_mode)) {
                        /*
                         * It's safe to turn csums off here, no extents exist.
                         * Otherwise we want the flag to reflect the real COW
                         * status of the file and will not set it.
                         */
                        if (inode->i_size == 0)
                                binode_flags |= BTRFS_INODE_NODATACOW |
                                                BTRFS_INODE_NODATASUM;
                } else {
                        binode_flags |= BTRFS_INODE_NODATACOW;
                }
        } else {
                /*
                 * Revert back under same assumptions as above
                 */
                if (S_ISREG(inode->i_mode)) {
                        if (inode->i_size == 0)
                                binode_flags &= ~(BTRFS_INODE_NODATACOW |
                                                  BTRFS_INODE_NODATASUM);
                } else {
                        binode_flags &= ~BTRFS_INODE_NODATACOW;
                }
        }

        /*
         * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
         * flag may be changed automatically if compression code won't make
         * things smaller.
         */
        if (fsflags & FS_NOCOMP_FL) {
                binode_flags &= ~BTRFS_INODE_COMPRESS;
                binode_flags |= BTRFS_INODE_NOCOMPRESS;
        } else if (fsflags & FS_COMPR_FL) {

                /* No transaction has been started yet, so a plain return is fine. */
                if (IS_SWAPFILE(inode))
                        return -ETXTBSY;

                binode_flags |= BTRFS_INODE_COMPRESS;
                binode_flags &= ~BTRFS_INODE_NOCOMPRESS;

                /*
                 * Use the filesystem's configured compression type name and
                 * fall back to the zlib name if that yields no usable string.
                 */
                comp = btrfs_compress_type2str(fs_info->compress_type);
                if (!comp || comp[0] == 0)
                        comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
        } else {
                binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
        }

        /*
         * 1 for inode item
         * 2 for properties
         */
        trans = btrfs_start_transaction(root, 3);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        if (comp) {
                ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
                                     strlen(comp), 0);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_end_trans;
                }
        } else {
                /* Clearing the property; -ENODATA is tolerated here. */
                ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL,
                                     0, 0);
                if (ret && ret != -ENODATA) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_end_trans;
                }
        }

update_flags:
        /* Publish the new flags and propagate them to the VFS inode. */
        binode->flags = binode_flags;
        btrfs_sync_inode_flags_to_i_flags(inode);
        inode_inc_iversion(inode);
        inode_set_ctime_current(inode);
        ret = btrfs_update_inode(trans, BTRFS_I(inode));

 out_end_trans:
        btrfs_end_transaction(trans);
        return ret;
}

/*
 * Try to claim the exclusive operation slot for @type.
 *
 * The slot is taken only if no exclusive operation is currently recorded.
 * Returns true when @type was recorded, false when another operation already
 * holds the slot.
 */
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
                        enum btrfs_exclusive_operation type)
{
        bool started;

        spin_lock(&fs_info->super_lock);
        started = (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE);
        if (started)
                fs_info->exclusive_operation = type;
        spin_unlock(&fs_info->super_lock);

        return started;
}

/*
 * Conditionally enter an exclusive operation that is compatible with the one
 * already running.  On success (true) fs_info->super_lock is left HELD and
 * the caller must release it with btrfs_exclop_start_unlock(); pair this with
 * btrfs_exclop_finish() as well.  On failure (false) the lock is released
 * before returning.
 *
 * Compatibility:
 * - the same type is already running
 * - when trying to add a device and balance has been paused
 * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
 *   must check the condition first that would allow none -> @type
 */
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
                                 enum btrfs_exclusive_operation type)
{
        bool compatible;

        spin_lock(&fs_info->super_lock);

        compatible = fs_info->exclusive_operation == type;
        if (!compatible)
                compatible = fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
                             type == BTRFS_EXCLOP_DEV_ADD;

        /* Keep the lock on success; the caller drops it. */
        if (!compatible)
                spin_unlock(&fs_info->super_lock);

        return compatible;
}

/*
 * Release fs_info->super_lock, which a successful
 * btrfs_exclop_start_try_lock() leaves held.
 */
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
{
        spin_unlock(&fs_info->super_lock);
}

void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
{
        spin_lock(&fs_info->super_lock);
        WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
        spin_unlock(&fs_info->super_lock);
        sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
}

/*
 * Switch the recorded exclusive operation between the running and paused
 * balance states.
 *
 * @op must be BTRFS_EXCLOP_BALANCE_PAUSED or BTRFS_EXCLOP_BALANCE; any other
 * value only produces a warning and changes nothing.  The current state is
 * sanity-checked with ASSERT() under super_lock before it is overwritten.
 */
void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
                          enum btrfs_exclusive_operation op)
{
        if (op != BTRFS_EXCLOP_BALANCE_PAUSED && op != BTRFS_EXCLOP_BALANCE) {
                btrfs_warn(fs_info,
                        "invalid exclop balance operation %d requested", op);
                return;
        }

        spin_lock(&fs_info->super_lock);
        if (op == BTRFS_EXCLOP_BALANCE_PAUSED) {
                ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
                       fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD ||
                       fs_info->exclusive_operation == BTRFS_EXCLOP_NONE ||
                       fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
        } else {
                ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
        }
        fs_info->exclusive_operation = op;
        spin_unlock(&fs_info->super_lock);
}

/*
 * Copy the generation number of @inode to the user-space integer at @arg.
 * Returns the result of put_user().
 */
static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
{
        int ret;

        ret = put_user(inode->i_generation, arg);
        return ret;
}

/*
 * Handle a trim request: read a struct fstrim_range from @arg, raise its
 * minlen to the smallest discard granularity among the discard-capable
 * devices, run btrfs_trim_fs() and copy the updated range back to @arg.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -EOPNOTSUPP on a zoned
 * filesystem or when no device supports discard, -EROFS when mounted with
 * nologreplay, -EINVAL for a range shorter than one sector, -EFAULT on a
 * failed user copy, or a negative error from btrfs_trim_fs().
 */
static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
                                        void __user *arg)
{
        struct btrfs_device *dev;
        struct fstrim_range range;
        u64 min_granularity = ULLONG_MAX;
        bool have_discard_dev = false;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        /*
         * btrfs_trim_block_group() depends on space cache, which is not
         * available in zoned filesystem. So, disallow fitrim on a zoned
         * filesystem for now.
         */
        if (btrfs_is_zoned(fs_info))
                return -EOPNOTSUPP;

        /*
         * If the fs is mounted with nologreplay, which requires it to be
         * mounted in RO mode as well, we can not allow discard on free space
         * inside block groups, because log trees refer to extents that are not
         * pinned in a block group's free space cache (pinning the extents is
         * precisely the first phase of replaying a log tree).
         */
        if (btrfs_test_opt(fs_info, NOLOGREPLAY))
                return -EROFS;

        /* Find the smallest discard granularity among usable devices. */
        rcu_read_lock();
        list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices,
                                dev_list) {
                if (dev->bdev && bdev_max_discard_sectors(dev->bdev)) {
                        have_discard_dev = true;
                        min_granularity = min_t(u64,
                                        bdev_discard_granularity(dev->bdev),
                                        min_granularity);
                }
        }
        rcu_read_unlock();

        if (!have_discard_dev)
                return -EOPNOTSUPP;
        if (copy_from_user(&range, arg, sizeof(range)))
                return -EFAULT;

        /*
         * NOTE: Don't truncate the range using super->total_bytes.  Bytenr of
         * block group is in the logical address space, which can be any
         * sectorsize aligned bytenr in  the range [0, U64_MAX].
         */
        if (range.len < fs_info->sectorsize)
                return -EINVAL;

        range.minlen = max(range.minlen, min_granularity);
        ret = btrfs_trim_fs(fs_info, &range);
        if (ret < 0)
                return ret;

        return copy_to_user(arg, &range, sizeof(range)) ? -EFAULT : 0;
}

/*
 * Return 1 if all BTRFS_UUID_SIZE bytes at @uuid are zero, 0 as soon as a
 * non-zero byte is found.
 */
int __pure btrfs_is_empty_uuid(u8 *uuid)
{
        const u8 *cur = uuid;
        const u8 *end = uuid + BTRFS_UUID_SIZE;

        while (cur < end) {
                if (*cur != 0)
                        return 0;
                cur++;
        }
        return 1;
}

/*
 * Number of transaction items to reserve when creating a subvolume or
 * snapshot.  The inode, the directory entries and the parent directory are
 * not included; callers account for those separately.
 *
 * @inherit may be NULL, in which case only the fixed part is returned.
 */
static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
{
        /*
         * Fixed part:
         * 1 to add root block
         * 1 to add root item
         * 1 to add root ref
         * 1 to add root backref
         * 1 to add UUID item
         * 1 to add qgroup info
         * 1 to add qgroup limit
         *
         * Ideally the last two would only be accounted if qgroups are enabled,
         * but that can change between now and the time we would insert them.
         */
        const unsigned int fixed_items = 7;

        if (!inherit)
                return fixed_items;

        /* 2 to add qgroup relations for each inherited qgroup */
        return fixed_items + 2 * inherit->num_qgroups;
}

/*
 * Create a new, empty subvolume at @dentry inside directory @dir.
 *
 * A root item and a fresh objectid are allocated, metadata space is reserved
 * into a temporary block reserve, and then inside a single transaction the
 * new root's first tree block is allocated, the root item is inserted into
 * the tree root, the new root is set up, a UUID tree entry is added and the
 * subvolume's inode is created and attached to @dentry.
 *
 * @inherit is passed to btrfs_qgroup_inherit() and also sizes the metadata
 * reservation via create_subvol_num_items(), which accepts NULL.
 *
 * Returns 0 on success or a negative errno.  The labels at the end of the
 * function release resources in the reverse order of their acquisition.
 */
static noinline int create_subvol(struct mnt_idmap *idmap,
                                  struct inode *dir, struct dentry *dentry,
                                  struct btrfs_qgroup_inherit *inherit)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
        struct btrfs_trans_handle *trans;
        struct btrfs_key key;
        struct btrfs_root_item *root_item;
        struct btrfs_inode_item *inode_item;
        struct extent_buffer *leaf;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_root *new_root;
        struct btrfs_block_rsv block_rsv;
        struct timespec64 cur_time = current_time(dir);
        struct btrfs_new_inode_args new_inode_args = {
                .dir = dir,
                .dentry = dentry,
                .subvol = true,
        };
        unsigned int trans_num_items;
        int ret;
        dev_t anon_dev;
        u64 objectid;
        /* Non-zero while a qgroup prealloc reservation still has to be freed. */
        u64 qgroup_reserved = 0;

        root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
        if (!root_item)
                return -ENOMEM;

        ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
        if (ret)
                goto out_root_item;

        /*
         * Don't create subvolume whose level is not zero. Or qgroup will be
         * screwed up since it assumes subvolume qgroup's level to be 0.
         */
        if (btrfs_qgroup_level(objectid)) {
                ret = -ENOSPC;
                goto out_root_item;
        }

        ret = get_anon_bdev(&anon_dev);
        if (ret < 0)
                goto out_root_item;

        new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir);
        if (!new_inode_args.inode) {
                ret = -ENOMEM;
                goto out_anon_dev;
        }
        ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
        if (ret)
                goto out_inode;
        /* Add the subvolume-specific items on top of what the inode needs. */
        trans_num_items += create_subvol_num_items(inherit);

        btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
        ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
                                               trans_num_items, false);
        if (ret)
                goto out_new_inode_args;
        qgroup_reserved = block_rsv.qgroup_rsv_reserved;

        /* Space was reserved above, so the transaction itself reserves none. */
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out_release_rsv;
        }
        ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
        if (ret)
                goto out;
        /*
         * From here on the qgroup reservation is converted, so the error
         * paths must no longer free it as prealloc.
         */
        btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
        qgroup_reserved = 0;
        trans->block_rsv = &block_rsv;
        trans->bytes_reserved = block_rsv.size;
        /* Tree log can't currently deal with an inode which is a new root. */
        btrfs_set_log_full_commit(trans);

        ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit);
        if (ret)
                goto out;

        leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
                                      0, BTRFS_NESTING_NORMAL);
        if (IS_ERR(leaf)) {
                ret = PTR_ERR(leaf);
                goto out;
        }

        btrfs_mark_buffer_dirty(trans, leaf);

        /* Fill in the inode item embedded in the root item. */
        inode_item = &root_item->inode;
        btrfs_set_stack_inode_generation(inode_item, 1);
        btrfs_set_stack_inode_size(inode_item, 3);
        btrfs_set_stack_inode_nlink(inode_item, 1);
        btrfs_set_stack_inode_nbytes(inode_item,
                                     fs_info->nodesize);
        btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);

        btrfs_set_root_flags(root_item, 0);
        btrfs_set_root_limit(root_item, 0);
        btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);

        /* The new root consists of the single leaf allocated above. */
        btrfs_set_root_bytenr(root_item, leaf->start);
        btrfs_set_root_generation(root_item, trans->transid);
        btrfs_set_root_level(root_item, 0);
        btrfs_set_root_refs(root_item, 1);
        btrfs_set_root_used(root_item, leaf->len);
        btrfs_set_root_last_snapshot(root_item, 0);

        btrfs_set_root_generation_v2(root_item,
                        btrfs_root_generation(root_item));
        generate_random_guid(root_item->uuid);
        btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
        btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
        root_item->ctime = root_item->otime;
        btrfs_set_root_ctransid(root_item, trans->transid);
        btrfs_set_root_otransid(root_item, trans->transid);

        btrfs_tree_unlock(leaf);

        btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);

        key.objectid = objectid;
        key.offset = 0;
        key.type = BTRFS_ROOT_ITEM_KEY;
        ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
                                root_item);
        if (ret) {
                /*
                 * Since we don't abort the transaction in this case, free the
                 * tree block so that we don't leak space and leave the
                 * filesystem in an inconsistent state (an extent item in the
                 * extent tree with a backreference for a root that does not
                 * exist).
                 */
                btrfs_tree_lock(leaf);
                btrfs_clear_buffer_dirty(trans, leaf);
                btrfs_tree_unlock(leaf);
                btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
                free_extent_buffer(leaf);
                goto out;
        }

        free_extent_buffer(leaf);
        leaf = NULL;

        /* Failures past this point abort the transaction. */
        new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev);
        if (IS_ERR(new_root)) {
                ret = PTR_ERR(new_root);
                btrfs_abort_transaction(trans, ret);
                goto out;
        }
        /* anon_dev is owned by new_root now. */
        anon_dev = 0;
        BTRFS_I(new_inode_args.inode)->root = new_root;
        /* ... and new_root is owned by new_inode_args.inode now. */

        ret = btrfs_record_root_in_trans(trans, new_root);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        ret = btrfs_uuid_tree_add(trans, root_item->uuid,
                                  BTRFS_UUID_KEY_SUBVOL, objectid);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        ret = btrfs_create_new_inode(trans, &new_inode_args);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        d_instantiate_new(dentry, new_inode_args.inode);
        /* Clear the pointer so the iput() in the cleanup path is a no-op. */
        new_inode_args.inode = NULL;

out:
        /* Detach the on-stack reserve before the handle is released. */
        trans->block_rsv = NULL;
        trans->bytes_reserved = 0;
        btrfs_end_transaction(trans);
out_release_rsv:
        btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
        if (qgroup_reserved)
                btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
out_new_inode_args:
        btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
        iput(new_inode_args.inode);
out_anon_dev:
        /* anon_dev is zero once ownership moved to new_root. */
        if (anon_dev)
                free_anon_bdev(anon_dev);
out_root_item:
        kfree(root_item);
        return ret;
}

/*
 * Create a snapshot of @root at @dentry inside directory @dir.
 *
 * The snapshot is not built directly here.  A btrfs_pending_snapshot is
 * filled in and attached to a transaction handle, and the transaction is
 * committed.  After the commit, pending_snapshot->error and
 * pending_snapshot->snap are read back as the result, orphan cleanup is run
 * on the new root and @dentry is instantiated with the looked-up inode.
 *
 * @readonly and @inherit are stored in the pending snapshot for use during
 * the commit.
 *
 * Returns 0 on success or a negative errno.
 */
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
                           struct dentry *dentry, bool readonly,
                           struct btrfs_qgroup_inherit *inherit)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
        struct inode *inode;
        struct btrfs_pending_snapshot *pending_snapshot;
        unsigned int trans_num_items;
        struct btrfs_trans_handle *trans;
        struct btrfs_block_rsv *block_rsv;
        /* Non-zero while a qgroup prealloc reservation still has to be freed. */
        u64 qgroup_reserved = 0;
        int ret;

        /* We do not support snapshotting right now. */
        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                btrfs_warn(fs_info,
                           "extent tree v2 doesn't support snapshotting yet");
                return -EOPNOTSUPP;
        }

        /* A source root with zero refs cannot be snapshotted. */
        if (btrfs_root_refs(&root->root_item) == 0)
                return -ENOENT;

        if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
                return -EINVAL;

        if (atomic_read(&root->nr_swapfiles)) {
                btrfs_warn(fs_info,
                           "cannot snapshot subvolume with active swapfile");
                return -ETXTBSY;
        }

        pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
        if (!pending_snapshot)
                return -ENOMEM;

        ret = get_anon_bdev(&pending_snapshot->anon_dev);
        if (ret < 0)
                goto free_pending;
        /* Both allocations are checked together below. */
        pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
                        GFP_KERNEL);
        pending_snapshot->path = btrfs_alloc_path();
        if (!pending_snapshot->root_item || !pending_snapshot->path) {
                ret = -ENOMEM;
                goto free_pending;
        }

        block_rsv = &pending_snapshot->block_rsv;
        btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP);
        /*
         * 1 to add dir item
         * 1 to add dir index
         * 1 to update parent inode item
         */
        trans_num_items = create_subvol_num_items(inherit) + 3;
        ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv,
                                               trans_num_items, false);
        if (ret)
                goto free_pending;
        qgroup_reserved = block_rsv->qgroup_rsv_reserved;

        pending_snapshot->dentry = dentry;
        pending_snapshot->root = root;
        pending_snapshot->readonly = readonly;
        pending_snapshot->dir = dir;
        pending_snapshot->inherit = inherit;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto fail;
        }
        ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
        if (ret) {
                btrfs_end_transaction(trans);
                goto fail;
        }
        /*
         * NOTE(review): the reservation above was taken against
         * BTRFS_I(dir)->root, while the conversion here and the free in the
         * failure path use @root - confirm both are meant to hit the same
         * qgroup accounting.
         */
        btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
        qgroup_reserved = 0;

        trans->pending_snapshot = pending_snapshot;

        ret = btrfs_commit_transaction(trans);
        if (ret)
                goto fail;

        /* The commit succeeded, but the snapshot itself may have failed. */
        ret = pending_snapshot->error;
        if (ret)
                goto fail;

        ret = btrfs_orphan_cleanup(pending_snapshot->snap);
        if (ret)
                goto fail;

        inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
                goto fail;
        }

        d_instantiate(dentry, inode);
        ret = 0;
        /* Success: keep the anon_dev from being freed in free_pending. */
        pending_snapshot->anon_dev = 0;
fail:
        /* Prevent double freeing of anon_dev */
        if (ret && pending_snapshot->snap)
                pending_snapshot->snap->anon_dev = 0;
        btrfs_put_root(pending_snapshot->snap);
        btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL);
        if (qgroup_reserved)
                btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
free_pending:
        if (pending_snapshot->anon_dev)
                free_anon_bdev(pending_snapshot->anon_dev);
        kfree(pending_snapshot->root_item);
        btrfs_free_path(pending_snapshot->path);
        kfree(pending_snapshot);

        return ret;
}

/*
 * Copy of may_delete() from fs/namei.c.
 *
 * Check whether link @victim may be removed from directory @dir and whether
 * the type of @victim matches what the caller asked for (@isdir).
 *
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *        a. be owner of dir, or
 *        b. be owner of victim, or
 *        c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
 *     links pointing to it.
 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 *  9. We can't remove a root or mountpoint.
 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 *
 * Returns 0 if the removal is allowed, a negative errno otherwise.
 */

static int btrfs_may_delete(struct mnt_idmap *idmap,
                            struct inode *dir, struct dentry *victim, int isdir)
{
        struct inode *victim_inode;
        int error;

        if (d_really_is_negative(victim))
                return -ENOENT;
        victim_inode = d_inode(victim);

        /* The @victim is not inside @dir. */
        if (d_inode(victim->d_parent) != dir)
                return -EINVAL;
        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

        error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;

        if (IS_APPEND(dir))
                return -EPERM;

        /* Sticky-bit ownership rules and protected victims. */
        if (check_sticky(idmap, dir, victim_inode))
                return -EPERM;
        if (IS_APPEND(victim_inode) || IS_IMMUTABLE(victim_inode))
                return -EPERM;
        if (IS_SWAPFILE(victim_inode))
                return -EPERM;

        /* The victim's type must match what the caller wants to remove. */
        if (isdir && !d_is_dir(victim))
                return -ENOTDIR;
        if (isdir && IS_ROOT(victim))
                return -EBUSY;
        if (!isdir && d_is_dir(victim))
                return -EISDIR;

        if (IS_DEADDIR(dir))
                return -ENOENT;
        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
                return -EBUSY;

        return 0;
}

/* copy of may_create in fs/namei.c() */
static inline int btrfs_may_create(struct mnt_idmap *idmap,
                                   struct inode *dir, struct dentry *child)
{
        if (d_really_is_positive(child))
                return -EEXIST;
        if (IS_DEADDIR(dir))
                return -ENOENT;
        if (!fsuidgid_has_mapping(dir->i_sb, idmap))
                return -EOVERFLOW;
        return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
}

/*
 * Create a new subvolume below @parent.  This is largely modeled after
 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
 * inside this filesystem so it's quite a bit simpler.
 *
 * @parent:   path of the directory that will contain the new entry
 * @idmap:    idmap of the mount the request came through
 * @name:     name of the new entry, @namelen bytes long
 * @snap_src: root to snapshot, or NULL to create an empty subvolume
 * @readonly: passed to create_snapshot(); unused for plain subvolumes
 * @inherit:  optional qgroup inheritance description
 *
 * The parent directory's i_rwsem is held for writing and fs_info->subvol_sem
 * for reading while the subvolume or snapshot is created.
 *
 * Returns 0 on success or a negative errno.  In particular -ENOENT is
 * returned when the root containing the parent directory has no references
 * left, so callers never see success without anything having been created.
 */
static noinline int btrfs_mksubvol(const struct path *parent,
                                   struct mnt_idmap *idmap,
                                   const char *name, int namelen,
                                   struct btrfs_root *snap_src,
                                   bool readonly,
                                   struct btrfs_qgroup_inherit *inherit)
{
        struct inode *dir = d_inode(parent->dentry);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
        struct dentry *dentry;
        struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen);
        int error;

        /* Any failure here means the lock was not taken. */
        error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
        if (error)
                return error;

        dentry = lookup_one(idmap, name, parent->dentry, namelen);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto out_unlock;

        error = btrfs_may_create(idmap, dir, dentry);
        if (error)
                goto out_dput;

        /*
         * even if this name doesn't exist, we may get hash collisions.
         * check for them now when we can safely fail
         */
        error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
                                               dir->i_ino, &name_str);
        if (error)
                goto out_dput;

        down_read(&fs_info->subvol_sem);

        /*
         * The root containing the parent directory has no references left,
         * so nothing can be created in it.  Report that instead of falling
         * through with error == 0.
         */
        if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) {
                error = -ENOENT;
                goto out_up_read;
        }

        if (snap_src)
                error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
        else
                error = create_subvol(idmap, dir, dentry, inherit);

        if (!error)
                fsnotify_mkdir(dir, dentry);
out_up_read:
        up_read(&fs_info->subvol_sem);
out_dput:
        dput(dentry);
out_unlock:
        btrfs_inode_unlock(BTRFS_I(dir), 0);
        return error;
}

/*
 * Snapshot @root as @name below @parent.
 *
 * Wraps btrfs_mksubvol(): delalloc is started on @root, the root's
 * snapshot_force_cow counter is raised while ordered extents are waited for
 * and the snapshot is created, all under the read side of
 * root->snapshot_lock.  Returns 0 or a negative errno.
 */
static noinline int btrfs_mksnapshot(const struct path *parent,
                                   struct mnt_idmap *idmap,
                                   const char *name, int namelen,
                                   struct btrfs_root *root,
                                   bool readonly,
                                   struct btrfs_qgroup_inherit *inherit)
{
        int ret;

        /*
         * Make new buffered writes reserve space even where NOCOW would be
         * possible, so that later writeback (running delalloc) does not fall
         * back to COW and unexpectedly fail with ENOSPC.
         */
        btrfs_drew_read_lock(&root->snapshot_lock);

        ret = btrfs_start_delalloc_snapshot(root, false);
        if (!ret) {
                /*
                 * Writeback of all earlier writes has been started in NOCOW
                 * mode; from here on, writes must fall back to COW until the
                 * snapshot has been created.
                 */
                atomic_inc(&root->snapshot_force_cow);

                btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);

                ret = btrfs_mksubvol(parent, idmap, name, namelen,
                                     root, readonly, inherit);

                atomic_dec(&root->snapshot_force_cow);
        }

        btrfs_drew_read_unlock(&root->snapshot_lock);
        return ret;
}

/*
 * Start exclusive operation @type, or request cancellation of a running one.
 *
 * Return:
 *   0           - normal mode, the exclusive operation is now claimed
 *  >0           - normal mode, another operation is running; the value is
 *                 BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS for user space
 *  -ECANCELED   - cancel mode, cancellation was requested
 *  -ENOTCONN    - cancel mode, @type is not running (anymore)
 */
static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
                        enum btrfs_exclusive_operation type, bool cancel)
{
        if (cancel) {
                /* Something else is running, or nothing at all. */
                if (!btrfs_exclop_start_try_lock(fs_info, type))
                        return -ENOTCONN;

                /*
                 * While the lock is held no exclop finish can reset the state
                 * to NONE, so the cancel request can be posted safely.  The
                 * operation is then either still running and gets waited
                 * for, or it already finished and there is nothing to wait
                 * on.
                 */
                atomic_inc(&fs_info->reloc_cancel_req);
                btrfs_exclop_start_unlock(fs_info);

                if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
                        wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING,
                                    TASK_INTERRUPTIBLE);

                return -ECANCELED;
        }

        /* Normal mode: try to claim the exclusive operation. */
        if (btrfs_exclop_start(fs_info, type))
                return 0;

        return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}

/*
 * Resize one device of the filesystem, or cancel a running resize.
 *
 * @arg points to a struct btrfs_ioctl_vol_args whose name field holds one of:
 *   "cancel"               - request cancellation instead of resizing
 *   "[devid:]max"          - grow to the size of the underlying block device
 *   "[devid:][+|-]size"    - absolute size, or a delta relative to the
 *                            current size; the number is parsed by memparse()
 * The devid defaults to 1 when no "devid:" prefix is given.
 *
 * Requires CAP_SYS_ADMIN.  Returns 0 on success, a negative errno on failure,
 * or the positive value from exclop_start_or_cancel_reloc() when another
 * exclusive operation is running.  In cancel mode the result of
 * exclop_start_or_cancel_reloc() is returned directly.
 */
static noinline int btrfs_ioctl_resize(struct file *file,
                                        void __user *arg)
{
        BTRFS_DEV_LOOKUP_ARGS(args);
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        u64 new_size;
        u64 old_size;
        u64 devid = 1;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ioctl_vol_args *vol_args;
        struct btrfs_trans_handle *trans;
        struct btrfs_device *device = NULL;
        char *sizestr;
        char *retptr;
        char *devstr = NULL;
        int ret = 0;
        int mod = 0;
        bool cancel;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        /*
         * Read the arguments before checking exclusivity to be able to
         * distinguish regular resize and cancel
         */
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
                goto out_drop;
        }
        ret = btrfs_check_ioctl_vol_args_path(vol_args);
        if (ret < 0)
                goto out_free;

        sizestr = vol_args->name;
        cancel = (strcmp("cancel", sizestr) == 0);
        ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
        if (ret)
                goto out_free;
        /* Exclusive operation is now claimed */

        /* Optional "devid:" prefix: split the string in place at the colon. */
        devstr = strchr(sizestr, ':');
        if (devstr) {
                sizestr = devstr + 1;
                *devstr = '\0';
                devstr = vol_args->name;
                ret = kstrtoull(devstr, 10, &devid);
                if (ret)
                        goto out_finish;
                if (!devid) {
                        ret = -EINVAL;
                        goto out_finish;
                }
                btrfs_info(fs_info, "resizing devid %llu", devid);
        }

        args.devid = devid;
        device = btrfs_find_device(fs_info->fs_devices, &args);
        if (!device) {
                btrfs_info(fs_info, "resizer unable to find device %llu",
                           devid);
                ret = -ENODEV;
                goto out_finish;
        }

        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                btrfs_info(fs_info,
                           "resizer unable to apply on readonly device %llu",
                       devid);
                ret = -EPERM;
                goto out_finish;
        }

        if (!strcmp(sizestr, "max"))
                new_size = bdev_nr_bytes(device->bdev);
        else {
                /* A leading sign turns the number into a relative change. */
                if (sizestr[0] == '-') {
                        mod = -1;
                        sizestr++;
                } else if (sizestr[0] == '+') {
                        mod = 1;
                        sizestr++;
                }
                new_size = memparse(sizestr, &retptr);
                /* Trailing garbage or a zero size is invalid. */
                if (*retptr != '\0' || new_size == 0) {
                        ret = -EINVAL;
                        goto out_finish;
                }
        }

        /* The target device of a running replace must not be resized. */
        if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                ret = -EPERM;
                goto out_finish;
        }

        old_size = btrfs_device_get_total_bytes(device);

        /* Turn a relative request into an absolute size, guarding wraparound. */
        if (mod < 0) {
                if (new_size > old_size) {
                        ret = -EINVAL;
                        goto out_finish;
                }
                new_size = old_size - new_size;
        } else if (mod > 0) {
                if (new_size > ULLONG_MAX - old_size) {
                        ret = -ERANGE;
                        goto out_finish;
                }
                new_size = old_size + new_size;
        }

        /* Enforce the 256M lower bound and the block device's size. */
        if (new_size < SZ_256M) {
                ret = -EINVAL;
                goto out_finish;
        }
        if (new_size > bdev_nr_bytes(device->bdev)) {
                ret = -EFBIG;
                goto out_finish;
        }

        new_size = round_down(new_size, fs_info->sectorsize);

        if (new_size > old_size) {
                int commit_ret;

                trans = btrfs_start_transaction(root, 0);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        goto out_finish;
                }
                ret = btrfs_grow_device(trans, device, new_size);
                /*
                 * The transaction is committed whether or not growing
                 * succeeded.  A grow error takes precedence; otherwise a
                 * failed commit must be reported instead of claiming that
                 * the resize succeeded.
                 */
                commit_ret = btrfs_commit_transaction(trans);
                if (!ret)
                        ret = commit_ret;
        } else if (new_size < old_size) {
                ret = btrfs_shrink_device(device, new_size);
        } /* equal, nothing need to do */

        if (ret == 0 && new_size != old_size)
                btrfs_info_in_rcu(fs_info,
                        "resize device %s (devid %llu) from %llu to %llu",
                        btrfs_dev_name(device), device->devid,
                        old_size, new_size);
out_finish:
        btrfs_exclop_finish(fs_info);
out_free:
        kfree(vol_args);
out_drop:
        mnt_drop_write_file(file);
        return ret;
}

/*
 * Common helper of the subvolume/snapshot creation ioctls.
 *
 * @file must refer to a directory; the new entry @name is created in it.
 * With @subvol set an empty subvolume is created, otherwise @fd names the
 * source of a snapshot.  @readonly and @inherit are passed through to the
 * creation helpers.  Returns 0 or a negative errno.
 */
static noinline int __btrfs_ioctl_snap_create(struct file *file,
                                struct mnt_idmap *idmap,
                                const char *name, unsigned long fd, int subvol,
                                bool readonly,
                                struct btrfs_qgroup_inherit *inherit)
{
        struct inode *dst_dir = file_inode(file);
        struct inode *src_inode;
        struct fd src;
        int namelen;
        int ret;

        if (!S_ISDIR(dst_dir->i_mode))
                return -ENOTDIR;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        /* Only a single path component is accepted. */
        if (strchr(name, '/')) {
                ret = -EINVAL;
                goto out_drop_write;
        }

        /* "." and ".." are reported as already existing. */
        namelen = strlen(name);
        if ((namelen == 1 && name[0] == '.') ||
            (namelen == 2 && name[0] == '.' && name[1] == '.')) {
                ret = -EEXIST;
                goto out_drop_write;
        }

        if (subvol) {
                ret = btrfs_mksubvol(&file->f_path, idmap, name,
                                     namelen, NULL, readonly, inherit);
                goto out_drop_write;
        }

        /* Snapshot: resolve and validate the source file descriptor. */
        src = fdget(fd);
        if (!src.file) {
                ret = -EINVAL;
                goto out_drop_write;
        }
        src_inode = file_inode(src.file);

        if (src_inode->i_sb != dst_dir->i_sb) {
                btrfs_info(BTRFS_I(dst_dir)->root->fs_info,
                           "Snapshot src from another FS");
                ret = -EXDEV;
        } else if (!inode_owner_or_capable(idmap, src_inode)) {
                /*
                 * Creating a subvolume is unrestricted, but a snapshot may
                 * only be taken of a subvolume the caller owns.
                 */
                ret = -EPERM;
        } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) {
                /*
                 * The source must be the subvolume inode itself.  Otherwise
                 * the ownership check above is meaningless: the caller could
                 * have permission on a lower directory without having it on
                 * the subvolume.
                 */
                ret = -EINVAL;
        } else {
                ret = btrfs_mksnapshot(&file->f_path, idmap,
                                       name, namelen,
                                       BTRFS_I(src_inode)->root,
                                       readonly, inherit);
        }
        fdput(src);

out_drop_write:
        mnt_drop_write_file(file);
        return ret;
}

/*
 * v1 subvolume/snapshot creation ioctl: copy struct btrfs_ioctl_vol_args
 * from user space, validate it and create a read-write subvolume (@subvol
 * set) or snapshot without qgroup inheritance.  Returns 0 or a negative
 * errno.
 */
static noinline int btrfs_ioctl_snap_create(struct file *file,
                                            void __user *arg, int subvol)
{
        struct btrfs_ioctl_vol_args *vol_args;
        int ret;

        if (!S_ISDIR(file_inode(file)->i_mode))
                return -ENOTDIR;

        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args))
                return PTR_ERR(vol_args);

        ret = btrfs_check_ioctl_vol_args_path(vol_args);
        if (ret >= 0)
                ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
                                                vol_args->name, vol_args->fd,
                                                subvol, false, NULL);

        kfree(vol_args);
        return ret;
}

/*
 * v2 subvolume/snapshot creation ioctl.
 *
 * Copies struct btrfs_ioctl_vol_args_v2 from user space and honours the
 * BTRFS_SUBVOL_RDONLY and BTRFS_SUBVOL_QGROUP_INHERIT flags; any flag outside
 * BTRFS_SUBVOL_CREATE_ARGS_MASK yields -EOPNOTSUPP.  Returns 0 or a negative
 * errno.
 */
static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
                                               void __user *arg, int subvol)
{
        struct btrfs_qgroup_inherit *inherit = NULL;
        struct btrfs_ioctl_vol_args_v2 *vol_args;
        bool readonly;
        int ret;

        if (!S_ISDIR(file_inode(file)->i_mode))
                return -ENOTDIR;

        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args))
                return PTR_ERR(vol_args);

        ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
        if (ret < 0)
                goto free_args;

        if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
                ret = -EOPNOTSUPP;
                goto free_args;
        }

        readonly = !!(vol_args->flags & BTRFS_SUBVOL_RDONLY);

        if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
                struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));

                /* The inherit blob must hold the header and fit in a page. */
                if (vol_args->size < sizeof(*inherit) ||
                    vol_args->size > PAGE_SIZE) {
                        ret = -EINVAL;
                        goto free_args;
                }

                inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
                if (IS_ERR(inherit)) {
                        ret = PTR_ERR(inherit);
                        goto free_args;
                }

                ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size);
                if (ret < 0)
                        goto free_inherit;
        }

        ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
                                        vol_args->name, vol_args->fd, subvol,
                                        readonly, inherit);

free_inherit:
        kfree(inherit);
free_args:
        kfree(vol_args);
        return ret;
}

/*
 * Report the flags of the subvolume @inode belongs to.
 *
 * @inode must be the subvolume's own inode (BTRFS_FIRST_FREE_OBJECTID),
 * otherwise -EINVAL is returned.  A u64 is copied to @arg; the only flag
 * reported is BTRFS_SUBVOL_RDONLY.  Returns 0, -EINVAL or -EFAULT.
 */
static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
                                                void __user *arg)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 flags = 0;

        if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
                return -EINVAL;

        /* Sample the read-only state under subvol_sem. */
        down_read(&fs_info->subvol_sem);
        if (btrfs_root_readonly(root))
                flags |= BTRFS_SUBVOL_RDONLY;
        up_read(&fs_info->subvol_sem);

        return copy_to_user(arg, &flags, sizeof(flags)) ? -EFAULT : 0;
}

/*
 * Set or clear the read-only flag of the subvolume @file refers to.
 *
 * @arg points to a u64 of flags; only BTRFS_SUBVOL_RDONLY is accepted, any
 * other bit yields -EOPNOTSUPP.  The caller must own the inode or be capable,
 * and @file must refer to the subvolume's own inode
 * (BTRFS_FIRST_FREE_OBJECTID).  A read-only to read-write switch is refused
 * with -EPERM while root->send_in_progress is non-zero.
 *
 * The root item is changed in memory first and then written out in a
 * transaction; if that fails the in-memory flags are restored.
 *
 * Returns 0 or a negative errno.
 */
static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
                                              void __user *arg)
{
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        u64 root_flags;
        u64 flags;
        int ret = 0;

        if (!inode_owner_or_capable(file_mnt_idmap(file), inode))
                return -EPERM;

        ret = mnt_want_write_file(file);
        if (ret)
                goto out;

        /* Only the subvolume's own inode may be used for this ioctl. */
        if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
                ret = -EINVAL;
                goto out_drop_write;
        }

        if (copy_from_user(&flags, arg, sizeof(flags))) {
                ret = -EFAULT;
                goto out_drop_write;
        }

        /* Reject every flag other than BTRFS_SUBVOL_RDONLY. */
        if (flags & ~BTRFS_SUBVOL_RDONLY) {
                ret = -EOPNOTSUPP;
                goto out_drop_write;
        }

        down_write(&fs_info->subvol_sem);

        /* nothing to do */
        if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
                goto out_drop_sem;

        /* Remember the current flags so they can be restored on failure. */
        root_flags = btrfs_root_flags(&root->root_item);
        if (flags & BTRFS_SUBVOL_RDONLY) {
                btrfs_set_root_flags(&root->root_item,
                                     root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
        } else {
                /*
                 * Block RO -> RW transition if this subvolume is involved in
                 * send
                 */
                spin_lock(&root->root_item_lock);
                if (root->send_in_progress == 0) {
                        btrfs_set_root_flags(&root->root_item,
                                     root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
                        spin_unlock(&root->root_item_lock);
                } else {
                        spin_unlock(&root->root_item_lock);
                        btrfs_warn(fs_info,
                                   "Attempt to set subvolume %llu read-write during send",
                                   btrfs_root_id(root));
                        ret = -EPERM;
                        /* The flags were not modified, so skip out_reset. */
                        goto out_drop_sem;
                }
        }

        /* NOTE(review): the 1 is presumably the number of items to reserve. */
        trans = btrfs_start_transaction(root, 1);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out_reset;
        }

        /* Write the modified root item into the tree of tree roots. */
        ret = btrfs_update_root(trans, fs_info->tree_root,
                                &root->root_key, &root->root_item);
        if (ret < 0) {
                btrfs_end_transaction(trans);
                goto out_reset;
        }

        ret = btrfs_commit_transaction(trans);

out_reset:
        /* Undo the in-memory flag change when it could not be written out. */
        if (ret)
                btrfs_set_root_flags(&root->root_item, root_flags);
out_drop_sem:
        up_write(&fs_info->subvol_sem);
out_drop_write:
        mnt_drop_write_file(file);
out:
        return ret;
}

static noinline int key_in_sk(struct btrfs_key *key,
                              struct btrfs_ioctl_search_key *sk)
{
        struct btrfs_key test;
        int ret;

        test.objectid = sk->min_objectid;
        test.type = sk->min_type;
        test.offset = sk->min_offset;

        ret = btrfs_comp_cpu_keys(key, &test);
        if (ret < 0)
                return 0;

        test.objectid = sk->max_objectid;
        test.type = sk->max_type;
        test.offset = sk->max_offset;

        ret = btrfs_comp_cpu_keys(key, &test);
        if (ret > 0)
                return 0;
        return 1;
}

/*
 * Copy the items of the leaf at path->nodes[0], starting at path->slots[0],
 * that match @sk into the user buffer @ubuf.
 *
 * @key:       in/out; set to the key of each item examined and, once the
 *             leaf is exhausted, advanced to the next key to search for
 * @buf_size:  size of @ubuf; set to the required size when a single item
 *             does not fit (-EOVERFLOW)
 * @sk_offset: in/out write offset into @ubuf
 * @num_found: in/out count of items copied so far
 *
 * Each result is a struct btrfs_ioctl_search_header followed by the item
 * data.  The return values are described at the 'out' label.
 */
static noinline int copy_to_sk(struct btrfs_path *path,
                               struct btrfs_key *key,
                               struct btrfs_ioctl_search_key *sk,
                               u64 *buf_size,
                               char __user *ubuf,
                               unsigned long *sk_offset,
                               int *num_found)
{
        u64 found_transid;
        struct extent_buffer *leaf;
        struct btrfs_ioctl_search_header sh;
        struct btrfs_key test;
        unsigned long item_off;
        unsigned long item_len;
        int nritems;
        int i;
        int slot;
        int ret = 0;

        leaf = path->nodes[0];
        slot = path->slots[0];
        nritems = btrfs_header_nritems(leaf);

        /* Skip the whole leaf when its generation exceeds max_transid. */
        if (btrfs_header_generation(leaf) > sk->max_transid) {
                i = nritems;
                goto advance_key;
        }
        found_transid = btrfs_header_generation(leaf);

        for (i = slot; i < nritems; i++) {
                item_off = btrfs_item_ptr_offset(leaf, i);
                item_len = btrfs_item_size(leaf, i);

                btrfs_item_key_to_cpu(leaf, key, i);
                if (!key_in_sk(key, sk))
                        continue;

                /* The item can never fit into a buffer of this size. */
                if (sizeof(sh) + item_len > *buf_size) {
                        if (*num_found) {
                                ret = 1;
                                goto out;
                        }

                        /*
                         * return one empty item back for v1, which does not
                         * handle -EOVERFLOW
                         */

                        *buf_size = sizeof(sh) + item_len;
                        item_len = 0;
                        ret = -EOVERFLOW;
                }

                /* Not enough room left after what was already copied. */
                if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
                        ret = 1;
                        goto out;
                }

                sh.objectid = key->objectid;
                sh.offset = key->offset;
                sh.type = key->type;
                sh.len = item_len;
                sh.transid = found_transid;

                /*
                 * Copy search result header. If we fault then loop again so we
                 * can fault in the pages and -EFAULT there if there's a
                 * problem. Otherwise we'll fault and then copy the buffer in
                 * properly this next time through
                 */
                if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
                        ret = 0;
                        goto out;
                }

                *sk_offset += sizeof(sh);

                if (item_len) {
                        char __user *up = ubuf + *sk_offset;
                        /*
                         * Copy the item, same behavior as above, but reset
                         * *sk_offset so we copy the full thing again.
                         */
                        if (read_extent_buffer_to_user_nofault(leaf, up,
                                                item_off, item_len)) {
                                ret = 0;
                                *sk_offset -= sizeof(sh);
                                goto out;
                        }

                        *sk_offset += item_len;
                }
                (*num_found)++;

                if (ret) /* -EOVERFLOW from above */
                        goto out;

                if (*num_found >= sk->nr_items) {
                        ret = 1;
                        goto out;
                }
        }
advance_key:
        /*
         * The leaf is done: stop if @key reached the upper bound, otherwise
         * step @key to the next possible key (offset, then type, then
         * objectid) for the next search.
         */
        ret = 0;
        test.objectid = sk->max_objectid;
        test.type = sk->max_type;
        test.offset = sk->max_offset;
        if (btrfs_comp_cpu_keys(key, &test) >= 0)
                ret = 1;
        else if (key->offset < (u64)-1)
                key->offset++;
        else if (key->type < (u8)-1) {
                key->offset = 0;
                key->type++;
        } else if (key->objectid < (u64)-1) {
                key->offset = 0;
                key->type = 0;
                key->objectid++;
        } else
                ret = 1;
out:
        /*
         *  0: all items from this leaf copied, continue with next; also
         *     returned when a nofault copy to user space failed, in which
         *     case @key still names the item that has to be retried
         *  1: * more items can be copied, but unused buffer is too small
         *     * all items were found
         *     Either way, it stops the loop which iterates to the next
         *     leaf
         *  -EOVERFLOW: item was too large for buffer
         */
        return ret;
}

/*
 * Core of the tree search ioctls.
 *
 * Searches the tree selected by sk->tree_id (0 means the root @inode belongs
 * to) for items matching @sk and copies them to @ubuf, which is *@buf_size
 * bytes large.  On return sk->nr_items holds the number of items copied.
 *
 * Returns 0 on success, -EOVERFLOW when the buffer cannot hold a single
 * result (with *@buf_size set to the size needed), -EFAULT when the user
 * buffer cannot be faulted in, or another negative errno.
 */
static noinline int search_ioctl(struct inode *inode,
                                 struct btrfs_ioctl_search_key *sk,
                                 u64 *buf_size,
                                 char __user *ubuf)
{
        struct btrfs_fs_info *info = inode_to_fs_info(inode);
        struct btrfs_root *root;
        struct btrfs_path *path;
        struct btrfs_key key;
        unsigned long sk_offset = 0;
        int num_found = 0;
        int ret;

        /* The buffer must hold at least one search header. */
        if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
                *buf_size = sizeof(struct btrfs_ioctl_search_header);
                return -EOVERFLOW;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        if (sk->tree_id != 0) {
                root = btrfs_get_fs_root(info, sk->tree_id, true);
                if (IS_ERR(root)) {
                        btrfs_free_path(path);
                        return PTR_ERR(root);
                }
        } else {
                /* search the root of the inode that was passed */
                root = btrfs_grab_root(BTRFS_I(inode)->root);
        }

        key.objectid = sk->min_objectid;
        key.type = sk->min_type;
        key.offset = sk->min_offset;

        for (;;) {
                /*
                 * Fault in the rest of the user buffer at sub-page
                 * granularity first; otherwise the nofault copies could fail
                 * forever and this loop would live-lock.
                 */
                if (fault_in_subpage_writeable(ubuf + sk_offset,
                                               *buf_size - sk_offset)) {
                        ret = -EFAULT;
                        break;
                }

                /* A positive result means nothing more was found. */
                ret = btrfs_search_forward(root, &key, path, sk->min_transid);
                if (ret)
                        break;

                ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
                                 &sk_offset, &num_found);
                btrfs_release_path(path);
                if (ret)
                        break;
        }

        /* Positive values only end the loop; they are not errors. */
        if (ret > 0)
                ret = 0;

        sk->nr_items = num_found;
        btrfs_put_root(root);
        btrfs_free_path(path);
        return ret;
}

/*
 * v1 tree search ioctl: the result buffer is the fixed-size 'buf' member of
 * struct btrfs_ioctl_search_args.  Requires CAP_SYS_ADMIN.  The search key,
 * with nr_items updated, is copied back to user space on success.
 */
static noinline int btrfs_ioctl_tree_search(struct inode *inode,
                                            void __user *argp)
{
        struct btrfs_ioctl_search_args __user *uargs = argp;
        struct btrfs_ioctl_search_key sk;
        u64 buf_size = sizeof(uargs->buf);
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
                return -EFAULT;

        ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);

        /*
         * The original interface signals an overflow through a search header
         * with a len of zero, so -EOVERFLOW is treated like success here.
         */
        if (ret && ret != -EOVERFLOW)
                return ret;

        if (copy_to_user(&uargs->key, &sk, sizeof(sk)))
                return -EFAULT;

        return 0;
}

/*
 * v2 tree search ioctl: the caller supplies the buffer size, which is capped
 * at 16MB.  Requires CAP_SYS_ADMIN.  On success the updated search key is
 * copied back; on -EOVERFLOW the required buffer size is copied back
 * instead.
 */
static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
                                               void __user *argp)
{
        struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
        struct btrfs_ioctl_search_args_v2 args;
        const u64 buf_limit = SZ_16M;
        u64 buf_size;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        /* copy search header and buffer size */
        if (copy_from_user(&args, uarg, sizeof(args)))
                return -EFAULT;

        /* limit result size to 16MB */
        buf_size = args.buf_size > buf_limit ? buf_limit : args.buf_size;

        ret = search_ioctl(inode, &args.key, &buf_size,
                           (char __user *)(&uarg->buf[0]));

        if (ret == 0) {
                if (copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
                        ret = -EFAULT;
        } else if (ret == -EOVERFLOW) {
                if (copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
                        ret = -EFAULT;
        }

        return ret;
}

/*
 * Build the path of directory 'dirid' inside tree 'tree_id' from its
 * INODE_REF items and store it in 'name'.
 *
 * The path is assembled backwards from the end of 'name' (a buffer of
 * BTRFS_INO_LOOKUP_PATH_MAX bytes), one component per parent, each followed
 * by '/', and is finally moved to the front and NUL terminated.  For the
 * subvolume's top directory the result is the empty string.
 *
 * Returns 0 on success, -ENOENT if a reference is missing, -ENAMETOOLONG if
 * the path does not fit, or another negative errno.
 */
static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
                                u64 tree_id, u64 dirid, char *name)
{
        struct btrfs_root *root;
        struct btrfs_path *path;
        struct btrfs_inode_ref *iref;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        char *ptr;
        int total_len = 0;
        int len;
        int ret;

        if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
                name[0] = '\0';
                return 0;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Start at the last byte; components are prepended from there. */
        ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];

        root = btrfs_get_fs_root(info, tree_id, true);
        if (IS_ERR(root)) {
                ret = PTR_ERR(root);
                root = NULL;
                goto out;
        }

        key.objectid = dirid;
        key.type = BTRFS_INODE_REF_KEY;
        key.offset = (u64)-1;

        for (;;) {
                ret = btrfs_search_backwards(root, &key, path);
                if (ret > 0)
                        ret = -ENOENT;
                if (ret < 0)
                        goto out;

                leaf = path->nodes[0];
                iref = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_inode_ref);
                len = btrfs_inode_ref_name_len(leaf, iref);

                /* Make room for this component and its trailing '/'. */
                ptr -= len + 1;
                total_len += len + 1;
                if (ptr < name) {
                        ret = -ENAMETOOLONG;
                        goto out;
                }

                /* The name bytes directly follow the inode ref structure. */
                ptr[len] = '/';
                read_extent_buffer(leaf, ptr, (unsigned long)(iref + 1), len);

                /* The parent is the subvolume's top directory: done. */
                if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
                        break;

                /* Continue with the parent directory. */
                btrfs_release_path(path);
                key.objectid = key.offset;
                key.offset = (u64)-1;
        }

        memmove(name, ptr, total_len);
        name[total_len] = '\0';
        ret = 0;
out:
        btrfs_put_root(root);
        btrfs_free_path(path);
        return ret;
}

/*
 * Path lookup used by the unprivileged ino_lookup ioctl.
 *
 * Builds in args->path the chain of directory names leading from the inode
 * the ioctl was issued on down to directory args->dirid (each component is
 * followed by '/'), checking read+exec permission on every directory visited,
 * and then copies into args->name the name stored in the ROOT_REF item keyed
 * by (root id of @inode, args->treeid) in the tree root.
 *
 * @idmap: idmapping handed to inode_permission() for the permission checks
 * @inode: inode the ioctl was called on; its location.objectid is the upper
 *         limit at which the upward walk stops
 * @args:  in: dirid and treeid; out: path and name
 *
 * Returns 0 on success or a negative errno: -ENOMEM if the path cannot be
 * allocated, -ENOENT when an expected item is missing, -ENAMETOOLONG when the
 * components do not fit in args->path, -EACCES when a directory fails the
 * permission check or the walk reaches the subvolume's top directory without
 * meeting the upper limit, -EINVAL when the ROOT_REF dirid differs from
 * args->dirid, or whatever error the called helpers return.
 */
static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
                                struct inode *inode,
                                struct btrfs_ioctl_ino_lookup_user_args *args)
{
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        struct super_block *sb = inode->i_sb;
        struct btrfs_key upper_limit = BTRFS_I(inode)->location;
        u64 treeid = btrfs_root_id(BTRFS_I(inode)->root);
        u64 dirid = args->dirid;
        unsigned long item_off;
        unsigned long item_len;
        struct btrfs_inode_ref *iref;
        struct btrfs_root_ref *rref;
        struct btrfs_root *root = NULL;
        struct btrfs_path *path;
        struct btrfs_key key, key2;
        struct extent_buffer *leaf;
        struct inode *temp_inode;
        char *ptr;
        int slot;
        int len;
        int total_len = 0;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * If the bottom subvolume does not exist directly under upper_limit,
         * construct the path in from the bottom up.
         */
        if (dirid != upper_limit.objectid) {
                /*
                 * Components are written backwards, starting from the last
                 * byte of the buffer, and moved to the front once complete.
                 */
                ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];

                root = btrfs_get_fs_root(fs_info, treeid, true);
                if (IS_ERR(root)) {
                        ret = PTR_ERR(root);
                        /* No root reference was taken, skip the put. */
                        goto out;
                }

                key.objectid = dirid;
                key.type = BTRFS_INODE_REF_KEY;
                key.offset = (u64)-1;
                while (1) {
                        /*
                         * NOTE(review): key.offset is used below as the parent
                         * directory's objectid, so btrfs_search_backwards() is
                         * presumably expected to update key to the item it
                         * found - confirm against its definition.
                         */
                        ret = btrfs_search_backwards(root, &key, path);
                        if (ret < 0)
                                goto out_put;
                        else if (ret > 0) {
                                ret = -ENOENT;
                                goto out_put;
                        }

                        leaf = path->nodes[0];
                        slot = path->slots[0];

                        iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
                        len = btrfs_inode_ref_name_len(leaf, iref);
                        /* Reserve room for the name plus its trailing '/'. */
                        ptr -= len + 1;
                        total_len += len + 1;
                        if (ptr < args->path) {
                                ret = -ENAMETOOLONG;
                                goto out_put;
                        }

                        *(ptr + len) = '/';
                        /* The name bytes start right after the inode ref header. */
                        read_extent_buffer(leaf, ptr,
                                        (unsigned long)(iref + 1), len);

                        /* Check the read+exec permission of this directory */
                        ret = btrfs_previous_item(root, path, dirid,
                                                  BTRFS_INODE_ITEM_KEY);
                        if (ret < 0) {
                                goto out_put;
                        } else if (ret > 0) {
                                ret = -ENOENT;
                                goto out_put;
                        }

                        leaf = path->nodes[0];
                        slot = path->slots[0];
                        btrfs_item_key_to_cpu(leaf, &key2, slot);
                        /* The item found must belong to the directory itself. */
                        if (key2.objectid != dirid) {
                                ret = -ENOENT;
                                goto out_put;
                        }

                        /*
                         * We don't need the path anymore, so release it and
                         * avoid deadlocks and lockdep warnings in case
                         * btrfs_iget() needs to lookup the inode from its root
                         * btree and lock the same leaf.
                         */
                        btrfs_release_path(path);
                        temp_inode = btrfs_iget(sb, key2.objectid, root);
                        if (IS_ERR(temp_inode)) {
                                ret = PTR_ERR(temp_inode);
                                goto out_put;
                        }
                        ret = inode_permission(idmap, temp_inode,
                                               MAY_READ | MAY_EXEC);
                        iput(temp_inode);
                        /* Any permission failure is reported as -EACCES. */
                        if (ret) {
                                ret = -EACCES;
                                goto out_put;
                        }

                        /* Stop once the parent is the inode the ioctl was called on. */
                        if (key.offset == upper_limit.objectid)
                                break;
                        /*
                         * Reached BTRFS_FIRST_FREE_OBJECTID without meeting the
                         * upper limit: the directory is not below the fd's inode.
                         */
                        if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
                                ret = -EACCES;
                                goto out_put;
                        }

                        /* Continue the walk from the parent directory. */
                        key.objectid = key.offset;
                        key.offset = (u64)-1;
                        dirid = key.objectid;
                }

                /* Shift the assembled path to the start of the buffer. */
                memmove(args->path, ptr, total_len);
                args->path[total_len] = '\0';
                btrfs_put_root(root);
                /* Cleared so the btrfs_put_root() at out_put is not a second put. */
                root = NULL;
                btrfs_release_path(path);
        }

        /* Get the bottom subvolume's name from ROOT_REF */
        key.objectid = treeid;
        key.type = BTRFS_ROOT_REF_KEY;
        key.offset = args->treeid;
        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
        if (ret < 0) {
                goto out;
        } else if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        leaf = path->nodes[0];
        slot = path->slots[0];
        btrfs_item_key_to_cpu(leaf, &key, slot);

        item_off = btrfs_item_ptr_offset(leaf, slot);
        item_len = btrfs_item_size(leaf, slot);
        /* Check if dirid in ROOT_REF corresponds to passed dirid */
        rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
        if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
                ret = -EINVAL;
                goto out;
        }

        /* Copy subvolume's name */
        /* The name follows the fixed-size btrfs_root_ref header in the item. */
        item_off += sizeof(struct btrfs_root_ref);
        item_len -= sizeof(struct btrfs_root_ref);
        read_extent_buffer(leaf, args->name, item_off, item_len);
        args->name[item_len] = 0;

out_put:
        btrfs_put_root(root);
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Handler for the ino_lookup ioctl.
 *
 * Copies the argument structure in from user space, resolves the requested
 * objectid to a path inside the given tree and copies the structure back.
 *
 * @root: root of the inode the ioctl was issued on; its id replaces a zero
 *        treeid in the arguments
 * @argp: user pointer to a struct btrfs_ioctl_ino_lookup_args
 *
 * Returns 0 on success, -EPERM when a real path lookup is requested without
 * CAP_SYS_ADMIN, -EFAULT when the result cannot be copied back, or the error
 * from memdup_user() / btrfs_search_path_in_tree().
 */
static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
                                           void __user *argp)
{
        struct btrfs_ioctl_ino_lookup_args *lookup;
        int err;

        lookup = memdup_user(argp, sizeof(*lookup));
        if (IS_ERR(lookup))
                return PTR_ERR(lookup);

        /* A zero treeid asks for the subvolume containing the ioctl's inode. */
        if (!lookup->treeid)
                lookup->treeid = btrfs_root_id(root);

        if (lookup->objectid == BTRFS_FIRST_FREE_OBJECTID) {
                /*
                 * Unprivileged query: only the root id is reported, with an
                 * empty name, and no capability is required.
                 */
                lookup->name[0] = 0;
                err = 0;
        } else if (capable(CAP_SYS_ADMIN)) {
                err = btrfs_search_path_in_tree(root->fs_info, lookup->treeid,
                                                lookup->objectid, lookup->name);
        } else {
                err = -EPERM;
        }

        /* Only hand the structure back when the lookup succeeded. */
        if (!err && copy_to_user(argp, lookup, sizeof(*lookup)))
                err = -EFAULT;

        kfree(lookup);
        return err;
}

/*
 * Unprivileged variant of the ino_lookup ioctl.
 *
 * Compared with the privileged ino_lookup ioctl:
 *
 *   1. Every directory visited while building the path is checked for
 *      read + exec permission with inode_permission(); a failure yields
 *      -EACCES.
 *   2. The path is only built up to the inode of the fd the ioctl was called
 *      on. If the requested directory does not live under that inode,
 *      -EACCES is returned.
 *   3. The name of the bottom subvolume is looked up and returned as well.
 *
 * @file: file the ioctl was issued on
 * @argp: user pointer to a struct btrfs_ioctl_ino_lookup_user_args
 *
 * Returns 0 on success, -EFAULT when the result cannot be copied back, or a
 * negative errno from memdup_user() / btrfs_search_path_in_tree_user().
 */
static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
{
        struct inode *inode = file_inode(file);
        struct btrfs_ioctl_ino_lookup_user_args *uargs;
        int err;

        uargs = memdup_user(argp, sizeof(*uargs));
        if (IS_ERR(uargs))
                return PTR_ERR(uargs);

        /*
         * A subvolume hanging off the top directory cannot be below the fd's
         * inode unless that inode is the top directory itself.
         */
        if (uargs->dirid == BTRFS_FIRST_FREE_OBJECTID &&
            BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
                err = -EACCES;
                goto free_args;
        }

        err = btrfs_search_path_in_tree_user(file_mnt_idmap(file), inode, uargs);
        if (!err && copy_to_user(argp, uargs, sizeof(*uargs)))
                err = -EFAULT;

free_args:
        kfree(uargs);
        return err;
}

/*
 * Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF.
 *
 * Fills a struct btrfs_ioctl_get_subvol_info_args for the subvolume that
 * contains @inode and copies it to user space. The root item supplies the
 * generation, flags, UUIDs and the c/o/s/r transaction ids and timestamps.
 * For any subvolume other than BTRFS_FS_TREE_OBJECTID the ROOT_BACKREF item
 * in the tree root additionally supplies parent_id, dirid and name.
 *
 * @inode: inode whose containing subvolume is described
 * @argp:  user pointer receiving the filled structure
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -ENOENT when no
 * ROOT_BACKREF exists for the subvolume, -EUCLEAN when the tree root ends
 * where the back reference was expected, -EFAULT when the copy to user space
 * fails, or a negative errno from the helpers called.
 */
static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
{
        struct btrfs_ioctl_get_subvol_info_args *subvol_info;
        struct btrfs_fs_info *fs_info;
        struct btrfs_root *root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_root_item *root_item;
        struct btrfs_root_ref *rref;
        struct extent_buffer *leaf;
        unsigned long item_off;
        unsigned long item_len;
        int slot;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Zeroed so fields that are not filled below read back as 0. */
        subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
        if (!subvol_info) {
                btrfs_free_path(path);
                return -ENOMEM;
        }

        fs_info = BTRFS_I(inode)->root->fs_info;

        /* Get root_item of inode's subvolume */
        key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
        root = btrfs_get_fs_root(fs_info, key.objectid, true);
        if (IS_ERR(root)) {
                ret = PTR_ERR(root);
                goto out_free;
        }
        root_item = &root->root_item;

        subvol_info->treeid = key.objectid;

        subvol_info->generation = btrfs_root_generation(root_item);
        subvol_info->flags = btrfs_root_flags(root_item);

        memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
        memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
                                                    BTRFS_UUID_SIZE);
        memcpy(subvol_info->received_uuid, root_item->received_uuid,
                                                    BTRFS_UUID_SIZE);

        subvol_info->ctransid = btrfs_root_ctransid(root_item);
        subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
        subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);

        subvol_info->otransid = btrfs_root_otransid(root_item);
        subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
        subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);

        subvol_info->stransid = btrfs_root_stransid(root_item);
        subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
        subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);

        subvol_info->rtransid = btrfs_root_rtransid(root_item);
        subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
        subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);

        if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
                /* Search root tree for ROOT_BACKREF of this subvolume */
                key.type = BTRFS_ROOT_BACKREF_KEY;
                key.offset = 0;
                ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
                if (ret < 0) {
                        goto out;
                } else if (path->slots[0] >=
                           btrfs_header_nritems(path->nodes[0])) {
                        /* The slot is past the end of this leaf, move on. */
                        ret = btrfs_next_leaf(fs_info->tree_root, path);
                        if (ret < 0) {
                                goto out;
                        } else if (ret > 0) {
                                ret = -EUCLEAN;
                                goto out;
                        }
                }

                leaf = path->nodes[0];
                slot = path->slots[0];
                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.objectid == subvol_info->treeid &&
                    key.type == BTRFS_ROOT_BACKREF_KEY) {
                        subvol_info->parent_id = key.offset;

                        rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
                        subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);

                        /* The name follows the fixed-size root ref header. */
                        item_off = btrfs_item_ptr_offset(leaf, slot)
                                        + sizeof(struct btrfs_root_ref);
                        item_len = btrfs_item_size(leaf, slot)
                                        - sizeof(struct btrfs_root_ref);
                        read_extent_buffer(leaf, subvol_info->name,
                                           item_off, item_len);

                        /*
                         * The lookup used offset 0, so btrfs_search_slot() may
                         * have returned 1 (no exact match). The item has been
                         * validated above, so clear that value instead of
                         * returning it from the ioctl.
                         */
                        ret = 0;
                } else {
                        ret = -ENOENT;
                        goto out;
                }
        }

        /* Release the path before the copy to user space. */
        btrfs_free_path(path);
        /* Cleared so the btrfs_free_path() at out_free sees NULL. */
        path = NULL;
        if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
                ret = -EFAULT;

out:
        btrfs_put_root(root);
out_free:
        btrfs_free_path(path);
        kfree(subvol_info);
        return ret;
}

/*
 * Report the ROOT_REF entries (everything except the subvolume names) of the
 * subvolume containing this inode.
 *
 * Iteration over the tree root starts at rootrefs->min_treeid supplied by the
 * caller. Each entry found is stored as a (treeid, dirid) pair. At most
 * BTRFS_MAX_ROOTREF_BUFFER_NUM entries are returned per call; when more
 * exist, -EOVERFLOW is returned and min_treeid is advanced so the caller can
 * continue from there.
 *
 * @root: root of the subvolume being queried
 * @argp: user pointer to a struct btrfs_ioctl_get_subvol_rootref_args
 *
 * Returns 0 when all entries were reported, -EOVERFLOW when the buffer filled
 * up (results are still copied out), -EUCLEAN when the tree root ends
 * unexpectedly, -EFAULT when the copy to user space fails, or another
 * negative errno.
 */
static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
                                          void __user *argp)
{
        struct btrfs_root *tree_root = root->fs_info->tree_root;
        const u64 subvol_id = btrfs_root_id(root);
        struct btrfs_ioctl_get_subvol_rootref_args *refs;
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;
        u8 nr = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        refs = memdup_user(argp, sizeof(*refs));
        if (IS_ERR(refs)) {
                btrfs_free_path(path);
                return PTR_ERR(refs);
        }

        key.objectid = subvol_id;
        key.type = BTRFS_ROOT_REF_KEY;
        key.offset = refs->min_treeid;

        ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        /* The slot may sit past the last item of the leaf; step forward. */
        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                ret = btrfs_next_leaf(tree_root, path);
                if (ret > 0)
                        ret = -EUCLEAN;
                if (ret < 0)
                        goto out;
        }

        for (;;) {
                struct extent_buffer *eb = path->nodes[0];
                int slot = path->slots[0];
                struct btrfs_root_ref *ref;

                btrfs_item_key_to_cpu(eb, &key, slot);
                /* Left the run of ROOT_REF items for this subvolume: done. */
                if (key.objectid != subvol_id ||
                    key.type != BTRFS_ROOT_REF_KEY) {
                        ret = 0;
                        break;
                }

                /* Another entry exists but the output array is full. */
                if (nr == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
                        ret = -EOVERFLOW;
                        break;
                }

                ref = btrfs_item_ptr(eb, slot, struct btrfs_root_ref);
                refs->rootref[nr].treeid = key.offset;
                refs->rootref[nr].dirid = btrfs_root_ref_dirid(eb, ref);
                nr++;

                ret = btrfs_next_item(tree_root, path);
                if (ret > 0)
                        ret = -EUCLEAN;
                if (ret < 0)
                        break;
        }

out:
        btrfs_free_path(path);

        /* Partial results are still returned when the buffer overflowed. */
        if (ret == 0 || ret == -EOVERFLOW) {
                refs->num_items = nr;
                /* Tell the caller where the next search should resume. */
                if (nr)
                        refs->min_treeid = refs->rootref[nr - 1].treeid + 1;
                if (copy_to_user(argp, refs, sizeof(*refs)))
                        ret = -EFAULT;
        }

        kfree(refs);

        return ret;
}

/*
 * Handler for the snapshot/subvolume destroy ioctls.
 *
 * @file:       file the ioctl was issued on; its dentry is the default parent
 *              directory in which the subvolume is looked up
 * @arg:        user pointer to a struct btrfs_ioctl_vol_args (v1) or a
 *              struct btrfs_ioctl_vol_args_v2 (v2)
 * @destroy_v2: true when the v2 argument layout is used
 *
 * With v2 and BTRFS_SUBVOL_SPEC_BY_ID the subvolume is identified by
 * vol_args2->subvolid and the parent directory is derived from it; otherwise
 * the subvolume is identified by name inside the directory behind @file.
 *
 * Returns 0 on success or a negative errno. The labels at the end undo the
 * steps taken so far in reverse order.
 */
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                                             void __user *arg,
                                             bool destroy_v2)
{
        struct dentry *parent = file->f_path.dentry;
        struct dentry *dentry;
        struct inode *dir = d_inode(parent);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
        struct inode *inode;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_root *dest = NULL;
        struct btrfs_ioctl_vol_args *vol_args = NULL;
        struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
        struct mnt_idmap *idmap = file_mnt_idmap(file);
        char *subvol_name, *subvol_name_ptr = NULL;
        int subvol_namelen;
        int ret = 0;
        bool destroy_parent = false;

        /* We don't support snapshots with extent tree v2 yet. */
        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                btrfs_err(fs_info,
                          "extent tree v2 doesn't support snapshot deletion yet");
                return -EOPNOTSUPP;
        }

        if (destroy_v2) {
                vol_args2 = memdup_user(arg, sizeof(*vol_args2));
                if (IS_ERR(vol_args2))
                        return PTR_ERR(vol_args2);

                /* Reject flags outside the supported mask. */
                if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
                        ret = -EOPNOTSUPP;
                        goto out;
                }

                /*
                 * If SPEC_BY_ID is not set, we are looking for the subvolume by
                 * name, same as v1 currently does.
                 */
                if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
                        ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2);
                        if (ret < 0)
                                goto out;
                        subvol_name = vol_args2->name;

                        ret = mnt_want_write_file(file);
                        if (ret)
                                goto out;
                } else {
                        struct inode *old_dir;

                        /* Ids below BTRFS_FIRST_FREE_OBJECTID are rejected. */
                        if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
                                ret = -EINVAL;
                                goto out;
                        }

                        ret = mnt_want_write_file(file);
                        if (ret)
                                goto out;

                        dentry = btrfs_get_dentry(fs_info->sb,
                                        BTRFS_FIRST_FREE_OBJECTID,
                                        vol_args2->subvolid, 0);
                        if (IS_ERR(dentry)) {
                                ret = PTR_ERR(dentry);
                                goto out_drop_write;
                        }

                        /*
                         * Change the default parent since the subvolume being
                         * deleted can be outside of the current mount point.
                         */
                        parent = btrfs_get_parent(dentry);

                        /*
                         * At this point dentry->d_name can point to '/' if the
                         * subvolume we want to destroy is outside of the
                         * current mount point, so we need to release the
                         * current dentry and execute the lookup to return a new
                         * one with ->d_name pointing to the
                         * <mount point>/subvol_name.
                         */
                        dput(dentry);
                        if (IS_ERR(parent)) {
                                ret = PTR_ERR(parent);
                                goto out_drop_write;
                        }
                        old_dir = dir;
                        dir = d_inode(parent);

                        /*
                         * If v2 was used with SPEC_BY_ID, a new parent was
                         * allocated since the subvolume can be outside of the
                         * current mount point. Later on we need to release this
                         * new parent dentry.
                         */
                        destroy_parent = true;

                        /*
                         * On idmapped mounts, deletion via subvolid is
                         * restricted to subvolumes that are immediate
                         * ancestors of the inode referenced by the file
                         * descriptor in the ioctl. Otherwise the idmapping
                         * could potentially be abused to delete subvolumes
                         * anywhere in the filesystem the user wouldn't be able
                         * to delete without an idmapped mount.
                         */
                        if (old_dir != dir && idmap != &nop_mnt_idmap) {
                                ret = -EOPNOTSUPP;
                                goto free_parent;
                        }

                        subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
                                                fs_info, vol_args2->subvolid);
                        if (IS_ERR(subvol_name_ptr)) {
                                ret = PTR_ERR(subvol_name_ptr);
                                goto free_parent;
                        }
                        /* subvol_name_ptr is already nul terminated */
                        subvol_name = (char *)kbasename(subvol_name_ptr);
                }
        } else {
                vol_args = memdup_user(arg, sizeof(*vol_args));
                if (IS_ERR(vol_args))
                        return PTR_ERR(vol_args);

                ret = btrfs_check_ioctl_vol_args_path(vol_args);
                if (ret < 0)
                        goto out;

                subvol_name = vol_args->name;

                ret = mnt_want_write_file(file);
                if (ret)
                        goto out;
        }

        subvol_namelen = strlen(subvol_name);

        /*
         * The name must be a single path component. The strncmp() compares
         * only subvol_namelen bytes against "..", so the empty string and "."
         * are rejected here as well as "..".
         */
        if (strchr(subvol_name, '/') ||
            strncmp(subvol_name, "..", subvol_namelen) == 0) {
                ret = -EINVAL;
                goto free_subvol_name;
        }

        if (!S_ISDIR(dir->i_mode)) {
                ret = -ENOTDIR;
                goto free_subvol_name;
        }

        /* Lock the parent directory; bail out if interrupted while waiting. */
        ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
        if (ret == -EINTR)
                goto free_subvol_name;
        dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen);
        if (IS_ERR(dentry)) {
                ret = PTR_ERR(dentry);
                goto out_unlock_dir;
        }

        if (d_really_is_negative(dentry)) {
                ret = -ENOENT;
                goto out_dput;
        }

        inode = d_inode(dentry);
        dest = BTRFS_I(inode)->root;
        if (!capable(CAP_SYS_ADMIN)) {
                /*
                 * Regular user.  Only allow this with a special mount
                 * option, when the user has write+exec access to the
                 * subvol root, and when rmdir(2) would have been
                 * allowed.
                 *
                 * Note that this is _not_ check that the subvol is
                 * empty or doesn't contain data that we wouldn't
                 * otherwise be able to delete.
                 *
                 * Users who want to delete empty subvols should try
                 * rmdir(2).
                 */
                ret = -EPERM;
                if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
                        goto out_dput;

                /*
                 * Do not allow deletion if the parent dir is the same
                 * as the dir to be deleted.  That means the ioctl
                 * must be called on the dentry referencing the root
                 * of the subvol, not a random directory contained
                 * within it.
                 */
                ret = -EINVAL;
                if (root == dest)
                        goto out_dput;

                ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
                if (ret)
                        goto out_dput;
        }

        /* check if subvolume may be deleted by a user */
        ret = btrfs_may_delete(idmap, dir, dentry, 1);
        if (ret)
                goto out_dput;

        /* Only an inode numbered BTRFS_FIRST_FREE_OBJECTID can be destroyed. */
        if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
                ret = -EINVAL;
                goto out_dput;
        }

        btrfs_inode_lock(BTRFS_I(inode), 0);
        ret = btrfs_delete_subvolume(BTRFS_I(dir), dentry);
        btrfs_inode_unlock(BTRFS_I(inode), 0);
        if (!ret)
                d_delete_notify(dir, dentry);

out_dput:
        dput(dentry);
out_unlock_dir:
        btrfs_inode_unlock(BTRFS_I(dir), 0);
free_subvol_name:
        /* NULL unless the name was looked up by subvolume id. */
        kfree(subvol_name_ptr);
free_parent:
        /* Only the SPEC_BY_ID path took its own reference on parent. */
        if (destroy_parent)
                dput(parent);
out_drop_write:
        mnt_drop_write_file(file);
out:
        /* At most one of these is non-NULL. */
        kfree(vol_args2);
        kfree(vol_args);
        return ret;
}

/*
 * Handler for the defrag ioctls.
 *
 * A directory triggers defragmentation of the whole subvolume root and needs
 * CAP_SYS_ADMIN. A regular file is defragmented over the range supplied in
 * @argp, or over the whole file when @argp is NULL. Any other file type is
 * rejected.
 *
 * @file: file the ioctl was issued on
 * @argp: optional user pointer to a struct btrfs_ioctl_defrag_range_args
 *
 * Returns 0 on success, -EROFS for a read-only root, -EPERM when the caller
 * lacks permission, -EFAULT on a bad user pointer, -EOPNOTSUPP for unknown
 * range flags, -EINVAL for unsupported file types, or a negative errno from
 * the helpers called.
 */
static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
        struct btrfs_ioctl_defrag_range_args range = {0};
        struct inode *inode = file_inode(file);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        if (btrfs_root_readonly(root)) {
                ret = -EROFS;
                goto drop_write;
        }

        switch (inode->i_mode & S_IFMT) {
        case S_IFDIR:
                if (capable(CAP_SYS_ADMIN))
                        ret = btrfs_defrag_root(root);
                else
                        ret = -EPERM;
                break;
        case S_IFREG:
                /*
                 * Write access is checked on the inode, not on the file
                 * descriptor: running executables cannot be defragmented,
                 * while files opened read-only can.
                 */
                if (!capable(CAP_SYS_ADMIN) &&
                    inode_permission(&nop_mnt_idmap, inode, MAY_WRITE)) {
                        ret = -EPERM;
                        break;
                }

                if (!argp) {
                        /*
                         * No range given: cover the whole file. All other
                         * fields stay zero from the initializer above.
                         */
                        range.len = (u64)-1;
                } else {
                        if (copy_from_user(&range, argp, sizeof(range))) {
                                ret = -EFAULT;
                                break;
                        }
                        if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
                                ret = -EOPNOTSUPP;
                                break;
                        }
                        /* Compression requires the IO to be started. */
                        if (range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
                                range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
                                range.extent_thresh = (u32)-1;
                        }
                }

                ret = btrfs_defrag_file(inode, &file->f_ra, &range,
                                        BTRFS_OLDEST_GENERATION, 0);
                /* A positive result is reported as plain success. */
                if (ret > 0)
                        ret = 0;
                break;
        default:
                ret = -EINVAL;
                break;
        }

drop_write:
        mnt_drop_write_file(file);
        return ret;
}

/*
 * Handler for the device add ioctl.
 *
 * @fs_info: filesystem the device is added to
 * @arg:     user pointer to a struct btrfs_ioctl_vol_args whose name field
 *           holds the device path
 *
 * Requires CAP_SYS_ADMIN. Not supported with extent tree v2 or on a
 * temp-fsid mount. Runs as an exclusive operation; if a paused balance holds
 * the exclusive slot, the slot is borrowed and handed back afterwards.
 *
 * Returns 0 on success, BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS when another
 * exclusive operation is running, or a negative errno.
 */
static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
{
        struct btrfs_ioctl_vol_args *vol_args;
        bool paused_balance = false;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                btrfs_err(fs_info, "device add not supported on extent tree v2 yet");
                return -EINVAL;
        }

        if (fs_info->fs_devices->temp_fsid) {
                btrfs_err(fs_info,
                          "device add not supported on cloned temp-fsid mount");
                return -EINVAL;
        }

        if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
                if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
                        return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;

                /*
                 * A paused balance owns the exclusive operation. Take it over
                 * for the device add and remember to restore the paused
                 * balance state when done.
                 */
                fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD;
                btrfs_exclop_start_unlock(fs_info);
                paused_balance = true;
        }

        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
                goto finish;
        }

        ret = btrfs_check_ioctl_vol_args_path(vol_args);
        if (ret >= 0) {
                ret = btrfs_init_new_device(fs_info, vol_args->name);
                if (ret == 0)
                        btrfs_info(fs_info, "disk added %s", vol_args->name);
        }

        kfree(vol_args);
finish:
        /* Give the exclusive slot back to whoever owned it before. */
        if (paused_balance)
                btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
        else
                btrfs_exclop_finish(fs_info);
        return ret;
}

/*
 * Handler for the v2 device remove ioctl.
 *
 * @file: file the ioctl was issued on
 * @arg:  user pointer to a struct btrfs_ioctl_vol_args_v2
 *
 * The device is selected by devid when BTRFS_DEVICE_SPEC_BY_ID is set and by
 * path otherwise. The special name "cancel" requests cancellation instead of
 * a removal. Requires CAP_SYS_ADMIN.
 *
 * Returns 0 on success, -EPERM without the capability, -EOPNOTSUPP for
 * unknown flags, or another error from the helpers called.
 */
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
        BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
        struct btrfs_ioctl_vol_args_v2 *vol_args;
        struct file *bdev_file = NULL;
        bool by_id;
        bool cancel = false;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args))
                return PTR_ERR(vol_args);

        if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
                ret = -EOPNOTSUPP;
                goto put_args;
        }

        ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
        if (ret < 0)
                goto put_args;

        by_id = vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID;
        if (by_id) {
                args.devid = vol_args->devid;
        } else if (strcmp("cancel", vol_args->name) == 0) {
                cancel = true;
        } else {
                ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
                if (ret)
                        goto put_args;
        }

        ret = mnt_want_write_file(file);
        if (ret)
                goto put_args;

        ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
                                           cancel);
        if (ret == 0) {
                /* Exclusive operation is now claimed */
                ret = btrfs_rm_device(fs_info, &args, &bdev_file);

                btrfs_exclop_finish(fs_info);

                if (ret == 0) {
                        if (by_id)
                                btrfs_info(fs_info, "device deleted: id %llu",
                                                vol_args->devid);
                        else
                                btrfs_info(fs_info, "device deleted: %s",
                                                vol_args->name);
                }
        }

        mnt_drop_write_file(file);
        /* Drop the block device file handed back by btrfs_rm_device(), if any. */
        if (bdev_file)
                fput(bdev_file);
put_args:
        btrfs_put_dev_args_from_path(&args);
        kfree(vol_args);
        return ret;
}

/*
 * Remove a device from the filesystem, using the ioctl variant that takes a
 * struct btrfs_ioctl_vol_args.
 *
 * @file: open file on the filesystem; a write reference is taken on its mount
 * @arg:  user pointer to a struct btrfs_ioctl_vol_args
 *
 * The name in the arguments is either the literal string "cancel", which is
 * forwarded as the cancel flag to exclop_start_or_cancel_reloc(), or a path
 * that is resolved with btrfs_get_dev_args_from_path().  The caller needs
 * CAP_SYS_ADMIN.
 *
 * Return: 0 on success, otherwise the value reported by the step that failed.
 */
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
        BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
        struct btrfs_ioctl_vol_args *vol_args;
        struct file *bdev_file = NULL;
        bool cancel;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args))
                return PTR_ERR(vol_args);

        ret = btrfs_check_ioctl_vol_args_path(vol_args);
        if (ret < 0)
                goto free_args;

        /* Only a real device path needs to be turned into lookup args. */
        cancel = (strcmp("cancel", vol_args->name) == 0);
        if (!cancel) {
                ret = btrfs_get_dev_args_from_path(fs_info, &args,
                                                   vol_args->name);
                if (ret)
                        goto put_dev_args;
        }

        ret = mnt_want_write_file(file);
        if (ret)
                goto put_dev_args;

        ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
                                           cancel);
        if (ret)
                goto drop_write;

        ret = btrfs_rm_device(fs_info, &args, &bdev_file);
        if (ret == 0)
                btrfs_info(fs_info, "disk deleted %s", vol_args->name);
        btrfs_exclop_finish(fs_info);

drop_write:
        mnt_drop_write_file(file);
        /* btrfs_rm_device() may hand back a file that we have to release. */
        if (bdev_file)
                fput(bdev_file);
put_dev_args:
        btrfs_put_dev_args_from_path(&args);
free_args:
        kfree(vol_args);
        return ret;
}

/*
 * Fill a struct btrfs_ioctl_fs_info_args for user space.
 *
 * @fs_info: the filesystem
 * @arg:     user pointer to a struct btrfs_ioctl_fs_info_args
 *
 * The flags supplied by the caller select which optional fields (checksum
 * info, generation, metadata UUID) are filled.  The flags written back name
 * the optional fields that were actually provided.  Everything else in the
 * structure is zeroed before being filled.
 *
 * Return: 0 on success, -EFAULT if the result cannot be copied out, or the
 * error from memdup_user().
 */
static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
                                void __user *arg)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_ioctl_fs_info_args *fi_args;
        struct btrfs_device *device;
        u64 requested;
        int ret;

        fi_args = memdup_user(arg, sizeof(*fi_args));
        if (IS_ERR(fi_args))
                return PTR_ERR(fi_args);

        /* Remember what was asked for, then start from a clean reply. */
        requested = fi_args->flags;
        memset(fi_args, 0, sizeof(*fi_args));

        rcu_read_lock();
        fi_args->num_devices = fs_devices->num_devices;
        /* max_id ends up as the largest devid on the device list. */
        list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
                if (fi_args->max_id < device->devid)
                        fi_args->max_id = device->devid;
        }
        rcu_read_unlock();

        memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid));
        fi_args->nodesize = fs_info->nodesize;
        fi_args->sectorsize = fs_info->sectorsize;
        fi_args->clone_alignment = fs_info->sectorsize;

        if (requested & BTRFS_FS_INFO_FLAG_CSUM_INFO) {
                fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
                fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy);
                fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO;
        }

        if (requested & BTRFS_FS_INFO_FLAG_GENERATION) {
                fi_args->generation = btrfs_get_fs_generation(fs_info);
                fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
        }

        if (requested & BTRFS_FS_INFO_FLAG_METADATA_UUID) {
                memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid,
                       sizeof(fi_args->metadata_uuid));
                fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID;
        }

        ret = copy_to_user(arg, fi_args, sizeof(*fi_args)) ? -EFAULT : 0;

        kfree(fi_args);
        return ret;
}

/*
 * Look up one device and report its properties to user space.
 *
 * @fs_info: the filesystem
 * @arg:     user pointer to a struct btrfs_ioctl_dev_info_args
 *
 * The device is searched by the devid given by the caller and, when the
 * supplied uuid is not empty, by that uuid as well.
 *
 * Return: 0 on success, -ENODEV if no device matches, -EFAULT if the result
 * cannot be copied out, or the error from memdup_user().
 */
static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
                                 void __user *arg)
{
        BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_ioctl_dev_info_args *di_args;
        struct btrfs_device *dev;
        int ret = 0;

        di_args = memdup_user(arg, sizeof(*di_args));
        if (IS_ERR(di_args))
                return PTR_ERR(di_args);

        args.devid = di_args->devid;
        if (!btrfs_is_empty_uuid(di_args->uuid))
                args.uuid = di_args->uuid;

        /* The lookup and every access to the device happen under RCU. */
        rcu_read_lock();
        dev = btrfs_find_device(fs_info->fs_devices, &args);
        if (!dev) {
                rcu_read_unlock();
                kfree(di_args);
                return -ENODEV;
        }

        di_args->devid = dev->devid;
        di_args->bytes_used = btrfs_device_get_bytes_used(dev);
        di_args->total_bytes = btrfs_device_get_total_bytes(dev);
        memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
        memcpy(di_args->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
        /* A device without a name is reported with an empty path. */
        if (!dev->name)
                di_args->path[0] = '\0';
        else
                strscpy(di_args->path, btrfs_dev_name(dev),
                        sizeof(di_args->path));
        rcu_read_unlock();

        if (copy_to_user(arg, di_args, sizeof(*di_args)))
                ret = -EFAULT;

        kfree(di_args);
        return ret;
}

/*
 * Point the "default" directory item of the tree root at another root.
 *
 * @file: open file on the filesystem; a write reference is taken on its mount
 * @argp: user pointer to a u64 root object id
 *
 * An object id of zero is replaced by BTRFS_FS_TREE_OBJECTID.  The target
 * must satisfy is_fstree().  On success the DEFAULT_SUBVOL incompat flag is
 * set.  The caller needs CAP_SYS_ADMIN.
 *
 * Return: 0 on success, negative errno otherwise.
 */
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct fscrypt_str name = FSTR_INIT("default", 7);
        struct btrfs_path *path = NULL;
        struct btrfs_trans_handle *trans;
        struct btrfs_disk_key disk_key;
        struct btrfs_root *new_root;
        struct btrfs_dir_item *di;
        u64 objectid = 0;
        u64 dir_id;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        if (copy_from_user(&objectid, argp, sizeof(objectid))) {
                ret = -EFAULT;
                goto drop_write;
        }

        if (objectid == 0)
                objectid = BTRFS_FS_TREE_OBJECTID;

        new_root = btrfs_get_fs_root(fs_info, objectid, true);
        if (IS_ERR(new_root)) {
                ret = PTR_ERR(new_root);
                goto drop_write;
        }

        if (!is_fstree(btrfs_root_id(new_root))) {
                ret = -ENOENT;
                goto put_root;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto put_root;
        }

        trans = btrfs_start_transaction(root, 1);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto put_root;
        }

        /* Find the "default" entry in the super block's root directory. */
        dir_id = btrfs_super_root_dir(fs_info->super_copy);
        di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
                                   dir_id, &name, 1);
        if (IS_ERR_OR_NULL(di)) {
                btrfs_release_path(path);
                btrfs_end_transaction(trans);
                btrfs_err(fs_info,
                          "Umm, you don't have the default diritem, this isn't going to work");
                ret = -ENOENT;
                goto put_root;
        }

        /* Rewrite the item's key so that it refers to the new root. */
        btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
        btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
        btrfs_mark_buffer_dirty(trans, path->nodes[0]);
        btrfs_release_path(path);

        btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
        btrfs_end_transaction(trans);
        /* ret is still 0 from mnt_want_write_file() on this path. */
put_root:
        btrfs_put_root(new_root);
        btrfs_free_path(path);
drop_write:
        mnt_drop_write_file(file);
        return ret;
}

static void get_block_group_info(struct list_head *groups_list,
                                 struct btrfs_ioctl_space_info *space)
{
        struct btrfs_block_group *block_group;

        space->total_bytes = 0;
        space->used_bytes = 0;
        space->flags = 0;
        list_for_each_entry(block_group, groups_list, list) {
                space->flags = block_group->flags;
                space->total_bytes += block_group->length;
                space->used_bytes += block_group->used;
        }
}

/*
 * Report space usage per block group type and RAID profile.
 *
 * @fs_info: the filesystem
 * @arg:     user pointer to a struct btrfs_ioctl_space_args, immediately
 *           followed by room for space_slots struct btrfs_ioctl_space_info
 *
 * The function works in two passes.  The first pass counts how many records
 * exist: one for every non-empty block group list of the space infos matching
 * the known type combinations, plus one for the global block reserve.  When
 * the caller passes space_slots == 0 only that count is returned in
 * total_spaces.  Otherwise the second pass fills at most
 * min(space_slots, count) records and total_spaces is set to the number of
 * records actually filled.
 *
 * Return: 0 on success, -EFAULT on a failed user copy, -ENOMEM if the reply
 * buffer would exceed a page or cannot be allocated.
 */
static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
                                   void __user *arg)
{
        struct btrfs_ioctl_space_args space_args = { 0 };
        struct btrfs_ioctl_space_info space;
        struct btrfs_ioctl_space_info *dest;
        struct btrfs_ioctl_space_info *dest_orig;
        struct btrfs_ioctl_space_info __user *user_dest;
        struct btrfs_space_info *info;
        static const u64 types[] = {
                BTRFS_BLOCK_GROUP_DATA,
                BTRFS_BLOCK_GROUP_SYSTEM,
                BTRFS_BLOCK_GROUP_METADATA,
                BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA
        };
        int num_types = 4;
        int alloc_size;
        int ret = 0;
        u64 slot_count = 0;
        int i, c;

        if (copy_from_user(&space_args,
                           (struct btrfs_ioctl_space_args __user *)arg,
                           sizeof(space_args)))
                return -EFAULT;

        /* First pass: count the records that currently exist. */
        for (i = 0; i < num_types; i++) {
                struct btrfs_space_info *tmp;

                info = NULL;
                list_for_each_entry(tmp, &fs_info->space_info, list) {
                        if (tmp->flags == types[i]) {
                                info = tmp;
                                break;
                        }
                }

                if (!info)
                        continue;

                down_read(&info->groups_sem);
                for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
                        if (!list_empty(&info->block_groups[c]))
                                slot_count++;
                }
                up_read(&info->groups_sem);
        }

        /*
         * Global block reserve, exported as a space_info
         */
        slot_count++;

        /* space_slots == 0 means they are asking for a count */
        if (space_args.space_slots == 0) {
                space_args.total_spaces = slot_count;
                goto out;
        }

        slot_count = min_t(u64, space_args.space_slots, slot_count);

        alloc_size = sizeof(*dest) * slot_count;

        /* we generally have at most 6 or so space infos, one for each raid
         * level.  So, a whole page should be more than enough for everyone
         */
        if (alloc_size > PAGE_SIZE)
                return -ENOMEM;

        space_args.total_spaces = 0;
        /*
         * The buffer must be zeroed: groups_sem is dropped between the two
         * passes, so the second pass may find fewer non-empty lists than were
         * counted.  The whole buffer (alloc_size bytes) is copied to user
         * space below, and unfilled records would otherwise expose
         * uninitialized kernel memory.
         */
        dest = kzalloc(alloc_size, GFP_KERNEL);
        if (!dest)
                return -ENOMEM;
        dest_orig = dest;

        /* now we have a buffer to copy into */
        for (i = 0; i < num_types; i++) {
                struct btrfs_space_info *tmp;

                if (!slot_count)
                        break;

                info = NULL;
                list_for_each_entry(tmp, &fs_info->space_info, list) {
                        if (tmp->flags == types[i]) {
                                info = tmp;
                                break;
                        }
                }

                if (!info)
                        continue;
                down_read(&info->groups_sem);
                for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
                        if (!list_empty(&info->block_groups[c])) {
                                get_block_group_info(&info->block_groups[c],
                                                     &space);
                                memcpy(dest, &space, sizeof(space));
                                dest++;
                                space_args.total_spaces++;
                                slot_count--;
                        }
                        /* Stop as soon as the caller's slots are used up. */
                        if (!slot_count)
                                break;
                }
                up_read(&info->groups_sem);
        }

        /*
         * Add global block reserve
         */
        if (slot_count) {
                struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;

                spin_lock(&block_rsv->lock);
                space.total_bytes = block_rsv->size;
                space.used_bytes = block_rsv->size - block_rsv->reserved;
                spin_unlock(&block_rsv->lock);
                space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
                memcpy(dest, &space, sizeof(space));
                space_args.total_spaces++;
        }

        /* The records live directly behind the args header in user memory. */
        user_dest = (struct btrfs_ioctl_space_info __user *)
                (arg + sizeof(struct btrfs_ioctl_space_args));

        if (copy_to_user(user_dest, dest_orig, alloc_size))
                ret = -EFAULT;

        kfree(dest_orig);
out:
        if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
                ret = -EFAULT;

        return ret;
}

/*
 * Kick off an asynchronous transaction commit and report its id.
 *
 * @root: root the ioctl was issued on
 * @argp: optional user pointer to a u64 that receives the transaction id
 *
 * If no transaction is running (-ENOENT from the attach), nothing is
 * committed and the id of the last committed transaction is reported.
 *
 * Return: 0 on success, -EFAULT if the id cannot be copied out, or the error
 * from btrfs_attach_transaction_barrier().
 */
static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
                                            void __user *argp)
{
        struct btrfs_trans_handle *trans;
        u64 transid;

        /*
         * Make sure orphan cleanup for this root has been started.  Its
         * return value is not checked here; errors are dealt with by the
         * other functions involved in the transaction commit.
         */
        btrfs_orphan_cleanup(root);

        trans = btrfs_attach_transaction_barrier(root);
        if (!IS_ERR(trans)) {
                transid = trans->transid;
                btrfs_commit_transaction_async(trans);
        } else if (PTR_ERR(trans) == -ENOENT) {
                /* No running transaction, don't bother */
                transid = btrfs_get_last_trans_committed(root->fs_info);
        } else {
                return PTR_ERR(trans);
        }

        if (argp && copy_to_user(argp, &transid, sizeof(transid)))
                return -EFAULT;

        return 0;
}

/*
 * Wait for a transaction to be committed.
 *
 * @fs_info: the filesystem
 * @argp:    optional user pointer to the u64 transaction id to wait for
 *
 * Without @argp the id passed to btrfs_wait_for_commit() is 0, i.e. the
 * current transaction.
 *
 * Return: -EFAULT if the id cannot be read, otherwise the result of
 * btrfs_wait_for_commit().
 */
static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
                                           void __user *argp)
{
        u64 transid = 0;

        if (argp && copy_from_user(&transid, argp, sizeof(transid)))
                return -EFAULT;

        return btrfs_wait_for_commit(fs_info, transid);
}

/*
 * Run a scrub on one device.
 *
 * @file: open file on the filesystem
 * @arg:  user pointer to a struct btrfs_ioctl_scrub_args
 *
 * A write reference on the mount is only taken when BTRFS_SCRUB_READONLY is
 * not set.  The caller needs CAP_SYS_ADMIN, and scrub is rejected on
 * filesystems with the EXTENT_TREE_V2 incompat flag.
 *
 * Return: the result of btrfs_scrub_dev(), replaced by -EFAULT if the
 * arguments cannot be copied back; other negative errno on early failure.
 */
static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
        struct btrfs_ioctl_scrub_args *sa;
        bool need_write;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet");
                return -EINVAL;
        }

        sa = memdup_user(arg, sizeof(*sa));
        if (IS_ERR(sa))
                return PTR_ERR(sa);

        if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) {
                ret = -EOPNOTSUPP;
                goto free_args;
        }

        /* The flags are not modified below, so this stays valid. */
        need_write = !(sa->flags & BTRFS_SCRUB_READONLY);
        if (need_write) {
                ret = mnt_want_write_file(file);
                if (ret)
                        goto free_args;
        }

        ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
                              &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
                              0);

        /*
         * The arguments go back to user space regardless of the scrub result,
         * so that the caller can see how far the scrub got.  A canceled scrub,
         * for instance, ends with -ECANCELED; user space can read the progress
         * out of struct btrfs_ioctl_scrub_args and later resume from there
         * (btrfs-progs does this).  If the copy itself fails, report -EFAULT:
         * the structure either was not written or may be partially written
         * and therefore cannot be trusted.
         */
        if (copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;

        if (need_write)
                mnt_drop_write_file(file);
free_args:
        kfree(sa);
        return ret;
}

/*
 * Cancel a scrub via btrfs_scrub_cancel().  Requires CAP_SYS_ADMIN.
 *
 * Return: -EPERM without the capability, otherwise the result of
 * btrfs_scrub_cancel().
 */
static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
{
        if (capable(CAP_SYS_ADMIN))
                return btrfs_scrub_cancel(fs_info);

        return -EPERM;
}

/*
 * Report the progress of a scrub on the device named by the caller.
 *
 * @fs_info: the filesystem
 * @arg:     user pointer to a struct btrfs_ioctl_scrub_args; devid is read,
 *           progress is filled in
 *
 * Unlike the scrub ioctl itself, the arguments are only copied back when
 * btrfs_scrub_progress() succeeds.  Requires CAP_SYS_ADMIN.
 *
 * Return: 0 on success, negative errno otherwise.
 */
static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
                                       void __user *arg)
{
        struct btrfs_ioctl_scrub_args *sa;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        sa = memdup_user(arg, sizeof(*sa));
        if (IS_ERR(sa))
                return PTR_ERR(sa);

        ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
        if (ret)
                goto free_args;

        if (copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;

free_args:
        kfree(sa);
        return ret;
}

/*
 * Read the device statistics, optionally resetting them.
 *
 * @fs_info: the filesystem
 * @arg:     user pointer to a struct btrfs_ioctl_get_dev_stats
 *
 * CAP_SYS_ADMIN is only required when BTRFS_DEV_STATS_RESET is set in the
 * flags.
 *
 * Return: 0 on success, -EPERM for a reset without the capability, -EFAULT on
 * a failed copy, or the error from btrfs_get_dev_stats().
 */
static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
                                      void __user *arg)
{
        struct btrfs_ioctl_get_dev_stats *sa;
        int ret;

        sa = memdup_user(arg, sizeof(*sa));
        if (IS_ERR(sa))
                return PTR_ERR(sa);

        if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
                ret = -EPERM;
                goto free_args;
        }

        ret = btrfs_get_dev_stats(fs_info, sa);
        if (ret)
                goto free_args;

        if (copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;

free_args:
        kfree(sa);
        return ret;
}

/*
 * Dispatch the device replace sub-commands (start, status, cancel).
 *
 * @fs_info: the filesystem
 * @arg:     user pointer to a struct btrfs_ioctl_dev_replace_args
 *
 * The arguments are copied back to user space only when the command ended
 * with 0 or -ECANCELED.  Requires CAP_SYS_ADMIN.  Rejected on filesystems
 * with the EXTENT_TREE_V2 incompat flag.
 *
 * Return: 0, a negative errno, or BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS when
 * the exclusive operation cannot be started.
 */
static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
                                    void __user *arg)
{
        struct btrfs_ioctl_dev_replace_args *p;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                btrfs_err(fs_info, "device replace not supported on extent tree v2 yet");
                return -EINVAL;
        }

        p = memdup_user(arg, sizeof(*p));
        if (IS_ERR(p))
                return PTR_ERR(p);

        switch (p->cmd) {
        case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
                if (sb_rdonly(fs_info->sb)) {
                        /* Not 0 or -ECANCELED, so nothing is copied back. */
                        ret = -EROFS;
                } else if (!btrfs_exclop_start(fs_info,
                                               BTRFS_EXCLOP_DEV_REPLACE)) {
                        ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                } else {
                        ret = btrfs_dev_replace_by_ioctl(fs_info, p);
                        btrfs_exclop_finish(fs_info);
                }
                break;
        case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
                btrfs_dev_replace_status(fs_info, p);
                ret = 0;
                break;
        case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
                /* The cancel outcome travels in the result field. */
                p->result = btrfs_dev_replace_cancel(fs_info);
                ret = 0;
                break;
        default:
                ret = -EINVAL;
                break;
        }

        if (ret == 0 || ret == -ECANCELED) {
                if (copy_to_user(arg, p, sizeof(*p)))
                        ret = -EFAULT;
        }

        kfree(p);
        return ret;
}

/*
 * Resolve an inode number to its paths within @root.
 *
 * @root: root to search in
 * @arg:  user pointer to a struct btrfs_ioctl_ino_path_args
 *
 * The result container is limited to 4096 bytes.  Before it is copied to the
 * user buffer named by ipa->fspath, each val[] entry is rewritten as an
 * offset relative to the start of the val[] array.  Requires
 * CAP_DAC_READ_SEARCH.
 *
 * Return: 0 on success, negative errno otherwise.
 */
static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
{
        struct btrfs_ioctl_ino_path_args *ipa = NULL;
        struct inode_fs_paths *ipath = NULL;
        struct btrfs_path *path;
        u64 rel_ptr;
        int size;
        int ret = 0;
        int i;

        if (!capable(CAP_DAC_READ_SEARCH))
                return -EPERM;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        ipa = memdup_user(arg, sizeof(*ipa));
        if (IS_ERR(ipa)) {
                ret = PTR_ERR(ipa);
                /* Do not hand an error pointer to kfree() below. */
                ipa = NULL;
                goto out;
        }

        size = min_t(u32, ipa->size, 4096);
        ipath = init_ipath(size, root, path);
        if (IS_ERR(ipath)) {
                ret = PTR_ERR(ipath);
                /* Likewise for free_ipath(). */
                ipath = NULL;
                goto out;
        }

        ret = paths_from_inode(ipa->inum, ipath);
        if (ret < 0)
                goto out;

        /* Turn the stored values into offsets from the start of val[]. */
        for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
                rel_ptr = ipath->fspath->val[i] -
                          (u64)(unsigned long)ipath->fspath->val;
                ipath->fspath->val[i] = rel_ptr;
        }

        /* Free the path before copying to user space. */
        btrfs_free_path(path);
        path = NULL;

        ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
                           ipath->fspath, size) ? -EFAULT : 0;

out:
        btrfs_free_path(path);
        free_ipath(ipath);
        kfree(ipa);

        return ret;
}

/*
 * Resolve a logical address to the inodes referencing it.
 *
 * @fs_info: the filesystem
 * @arg:     user pointer to a struct btrfs_ioctl_logical_ino_args
 * @version: 1 for the original ioctl; any other value selects the variant
 *           that validates the reserved fields and flags
 *
 * Version 1 caps the result container at 64K and never ignores offsets.  The
 * other variant caps it at 16M and honours
 * BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET.  -EINVAL from the iteration is
 * reported as -ENOENT.  Requires CAP_SYS_ADMIN.
 *
 * Return: 0 on success, negative errno otherwise.
 */
static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
                                        void __user *arg, int version)
{
        struct btrfs_ioctl_logical_ino_args *loi;
        struct btrfs_data_container *inodes = NULL;
        struct btrfs_path *path = NULL;
        bool ignore_offset;
        int size;
        int ret = 0;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        loi = memdup_user(arg, sizeof(*loi));
        if (IS_ERR(loi))
                return PTR_ERR(loi);

        if (version != 1) {
                /*
                 * All reserved bits must be 0 for now, and only flags that
                 * are defined so far are accepted.
                 */
                if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved)) ||
                    (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET))) {
                        ret = -EINVAL;
                        goto free_loi;
                }
                ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
                size = min_t(u32, loi->size, SZ_16M);
        } else {
                ignore_offset = false;
                size = min_t(u32, loi->size, SZ_64K);
        }

        inodes = init_data_container(size);
        if (IS_ERR(inodes)) {
                ret = PTR_ERR(inodes);
                goto free_loi;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto free_inodes;
        }
        ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
                                          inodes, ignore_offset);
        btrfs_free_path(path);
        if (ret == -EINVAL)
                ret = -ENOENT;
        if (ret < 0)
                goto free_inodes;

        ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes,
                           size) ? -EFAULT : 0;

free_inodes:
        kvfree(inodes);
free_loi:
        kfree(loi);

        return ret;
}

void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
                               struct btrfs_ioctl_balance_args *bargs)
{
        struct btrfs_balance_control *bctl = fs_info->balance_ctl;

        bargs->flags = bctl->flags;

        if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags))
                bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
        if (atomic_read(&fs_info->balance_pause_req))
                bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
        if (atomic_read(&fs_info->balance_cancel_req))
                bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;

        memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
        memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
        memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));

        spin_lock(&fs_info->balance_lock);
        memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
        spin_unlock(&fs_info->balance_lock);
}

/*
 * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXCLOP_BALANCE as
 * required.
 *
 * @fs_info:       the filesystem
 * @excl_acquired: ptr to boolean value which is set to true when this call
 *                 started the exclusive operation itself, and to false when
 *                 balance is being resumed or on failure
 *
 * Return 0 on success in which case fs_info::balance_mutex is held and
 * exclusive ops are blocked. In case of failure the mutex is not held and an
 * error code is returned: -EINPROGRESS if balance is already running, or
 * BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS if another exclusive operation is.
 */
static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired)
{
        int ret;

        /*
         * Exclusive operation is locked. Three possibilities:
         *   (1) some other op is running
         *   (2) balance is running
         *   (3) balance is paused -- special case (think resume)
         */
        while (1) {
                /* Fast path: nothing else is running, we own the exclop. */
                if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
                        *excl_acquired = true;
                        mutex_lock(&fs_info->balance_mutex);
                        return 0;
                }

                mutex_lock(&fs_info->balance_mutex);
                if (fs_info->balance_ctl) {
                        /* This is either (2) or (3) */
                        if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
                                /* This is (2) */
                                ret = -EINPROGRESS;
                                goto out_failure;

                        } else {
                                mutex_unlock(&fs_info->balance_mutex);
                                /*
                                 * Lock released to allow other waiters to
                                 * continue, we'll reexamine the status again.
                                 */
                                mutex_lock(&fs_info->balance_mutex);

                                if (fs_info->balance_ctl &&
                                    !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
                                        /* This is (3) */
                                        *excl_acquired = false;
                                        /* Return with balance_mutex held. */
                                        return 0;
                                }
                        }
                } else {
                        /* This is (1) */
                        ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
                        goto out_failure;
                }

                /* State changed while the mutex was dropped, start over. */
                mutex_unlock(&fs_info->balance_mutex);
        }

out_failure:
        mutex_unlock(&fs_info->balance_mutex);
        *excl_acquired = false;
        return ret;
}

/*
 * Start a new balance or resume a paused one.
 *
 * @file: open file on the filesystem; a write reference is taken on its mount
 * @arg:  user pointer to a struct btrfs_ioctl_balance_args
 *
 * With BTRFS_BALANCE_RESUME set the existing fs_info->balance_ctl is reused,
 * otherwise a new control structure is allocated and filled from the user
 * arguments. The arguments are copied back to user space when btrfs_balance()
 * returns 0 or -ECANCELED. Requires CAP_SYS_ADMIN.
 *
 * Return 0 on success, otherwise an error code from the step that failed.
 */
static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
        struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_ioctl_balance_args *bargs;
        struct btrfs_balance_control *bctl;
        /* Whether this function still has to finish the exclusive operation. */
        bool need_unlock = true;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        bargs = memdup_user(arg, sizeof(*bargs));
        if (IS_ERR(bargs)) {
                ret = PTR_ERR(bargs);
                /* Make the kfree() at the out label a no-op. */
                bargs = NULL;
                goto out;
        }

        /* On failure balance_mutex is not held, so skip out_unlock. */
        ret = btrfs_try_lock_balance(fs_info, &need_unlock);
        if (ret)
                goto out;

        lockdep_assert_held(&fs_info->balance_mutex);

        if (bargs->flags & BTRFS_BALANCE_RESUME) {
                if (!fs_info->balance_ctl) {
                        ret = -ENOTCONN;
                        goto out_unlock;
                }

                bctl = fs_info->balance_ctl;
                spin_lock(&fs_info->balance_lock);
                bctl->flags |= BTRFS_BALANCE_RESUME;
                spin_unlock(&fs_info->balance_lock);
                btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);

                goto do_balance;
        }

        /* Reject flag bits outside the known argument and type masks. */
        if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        /* A balance control already exists but resume was not requested. */
        if (fs_info->balance_ctl) {
                ret = -EINPROGRESS;
                goto out_unlock;
        }

        bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
        if (!bctl) {
                ret = -ENOMEM;
                goto out_unlock;
        }

        memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
        memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
        memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));

        bctl->flags = bargs->flags;
do_balance:
        /*
         * Ownership of bctl and exclusive operation goes to btrfs_balance.
         * bctl is freed in reset_balance_state, or, if restriper was paused
         * all the way until unmount, in free_fs_info.  The flag should be
         * cleared after reset_balance_state.
         */
        need_unlock = false;

        ret = btrfs_balance(fs_info, bctl, bargs);
        /* bctl is no longer ours, drop the local reference. */
        bctl = NULL;

        if (ret == 0 || ret == -ECANCELED) {
                if (copy_to_user(arg, bargs, sizeof(*bargs)))
                        ret = -EFAULT;
        }

        /* bctl is NULL at this point, so this kfree() does nothing. */
        kfree(bctl);
out_unlock:
        mutex_unlock(&fs_info->balance_mutex);
        if (need_unlock)
                btrfs_exclop_finish(fs_info);
out:
        mnt_drop_write_file(file);
        kfree(bargs);
        return ret;
}

/*
 * Pause or cancel balance.
 *
 * @fs_info: the filesystem
 * @cmd:     BTRFS_BALANCE_CTL_PAUSE or BTRFS_BALANCE_CTL_CANCEL
 *
 * Return: -EPERM without CAP_SYS_ADMIN, -EINVAL for an unknown command,
 * otherwise the result of the pause or cancel helper.
 */
static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
{
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (cmd == BTRFS_BALANCE_CTL_PAUSE)
                return btrfs_pause_balance(fs_info);

        if (cmd == BTRFS_BALANCE_CTL_CANCEL)
                return btrfs_cancel_balance(fs_info);

        return -EINVAL;
}

/*
 * Report the state of the current balance to user space.
 *
 * @fs_info: the filesystem
 * @arg:     user pointer that receives a struct btrfs_ioctl_balance_args
 *
 * fs_info->balance_mutex is held for the whole operation, including the copy
 * to user space.  Requires CAP_SYS_ADMIN.
 *
 * Return: 0 on success, -ENOTCONN if there is no balance control, -ENOMEM or
 * -EFAULT on allocation or copy failure.
 */
static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
                                         void __user *arg)
{
        struct btrfs_ioctl_balance_args *bargs;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        mutex_lock(&fs_info->balance_mutex);

        if (!fs_info->balance_ctl) {
                ret = -ENOTCONN;
                goto unlock;
        }

        /* Zeroed, so fields the update helper leaves alone read as 0. */
        bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
        if (!bargs) {
                ret = -ENOMEM;
                goto unlock;
        }

        btrfs_update_ioctl_balance_args(fs_info, bargs);

        ret = copy_to_user(arg, bargs, sizeof(*bargs)) ? -EFAULT : 0;

        kfree(bargs);
unlock:
        mutex_unlock(&fs_info->balance_mutex);
        return ret;
}

/*
 * Enable or disable quotas.
 *
 * @file: open file on the filesystem; a write reference is taken on its mount
 * @arg:  user pointer to a struct btrfs_ioctl_quota_ctl_args
 *
 * Both operations run with fs_info->subvol_sem held for writing.  Requires
 * CAP_SYS_ADMIN.
 *
 * Return: 0 on success, -EINVAL for an unknown command, or the error from
 * the failing step.
 */
static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
        struct btrfs_ioctl_quota_ctl_args *sa;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        sa = memdup_user(arg, sizeof(*sa));
        if (IS_ERR(sa)) {
                ret = PTR_ERR(sa);
                goto drop_write;
        }

        if (sa->cmd == BTRFS_QUOTA_CTL_ENABLE ||
            sa->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA) {
                down_write(&fs_info->subvol_sem);
                ret = btrfs_quota_enable(fs_info, sa);
                up_write(&fs_info->subvol_sem);
        } else if (sa->cmd == BTRFS_QUOTA_CTL_DISABLE) {
                /*
                 * The cleaner mutex guards against races with concurrent
                 * relocation: relocation may be building backrefs for blocks
                 * of the quota root while the root is being deleted.  This is
                 * the same protection that dropping the fs roots of deleted
                 * snapshots/subvolumes needs.
                 *
                 * It also serializes concurrent attempts to disable quotas,
                 * since qgroup_ioctl_lock is unlocked and relocked across
                 * BTRFS_FS_QUOTA_ENABLED changes.
                 *
                 * The mutex is taken at this point because rename creates
                 * the dependency
                 *
                 * inode_lock -> subvol_sem
                 *
                 * and relocation can prealloc extents, which extends the
                 * chain to
                 *
                 * cleaner_mutex -> inode_lock -> subvol_sem
                 *
                 * Hence cleaner_mutex has to be taken before subvol_sem.  The
                 * deadlock can't actually happen, but this quiets lockdep.
                 */
                mutex_lock(&fs_info->cleaner_mutex);
                down_write(&fs_info->subvol_sem);
                ret = btrfs_quota_disable(fs_info);
                up_write(&fs_info->subvol_sem);
                mutex_unlock(&fs_info->cleaner_mutex);
        } else {
                ret = -EINVAL;
        }

        kfree(sa);
drop_write:
        mnt_drop_write_file(file);
        return ret;
}

/*
 * BTRFS_IOC_QGROUP_ASSIGN handler: add or delete the relation between the
 * qgroups given in the user supplied arguments.
 *
 * The caller needs CAP_SYS_ADMIN and write access to the mount.  After the
 * relation has been changed the qgroups are run under qgroup_ioctl_lock; a
 * failure there is reported through btrfs_handle_fs_error() but does not
 * change the return value.
 *
 * Returns the result of the add/delete operation, or, if that succeeded, the
 * result of ending the transaction.  Setup failures are returned as is.
 */
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ioctl_qgroup_assign_args *args;
        struct btrfs_trans_handle *trans;
        int run_ret;
        int end_ret;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        args = memdup_user(arg, sizeof(*args));
        if (IS_ERR(args)) {
                ret = PTR_ERR(args);
                goto drop_write;
        }

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto free_args;
        }

        ret = args->assign ?
              btrfs_add_qgroup_relation(trans, args->src, args->dst) :
              btrfs_del_qgroup_relation(trans, args->src, args->dst);

        /* Refresh qgroup status and info, regardless of the result above. */
        mutex_lock(&fs_info->qgroup_ioctl_lock);
        run_ret = btrfs_run_qgroups(trans);
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        if (run_ret < 0)
                btrfs_handle_fs_error(fs_info, run_ret,
                                      "failed to update qgroup status and info");

        /* The first error wins: only report the end error if none so far. */
        end_ret = btrfs_end_transaction(trans);
        if (!ret && end_ret)
                ret = end_ret;

free_args:
        kfree(args);
drop_write:
        mnt_drop_write_file(file);
        return ret;
}

/*
 * BTRFS_IOC_QGROUP_CREATE handler: create or remove the qgroup named in the
 * user supplied arguments, depending on their 'create' field.
 *
 * The caller needs CAP_SYS_ADMIN and write access to the mount.  A zero
 * qgroupid is rejected with -EINVAL, and so is a create request for an id
 * for which is_fstree() is true.
 *
 * Returns the result of the create/remove operation, or, if that succeeded,
 * the result of ending the transaction.
 */
static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
{
        struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
        struct btrfs_ioctl_qgroup_create_args *args;
        struct btrfs_trans_handle *trans;
        int end_ret;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        args = memdup_user(arg, sizeof(*args));
        if (IS_ERR(args)) {
                ret = PTR_ERR(args);
                goto drop_write;
        }

        /* Validate the request before touching a transaction. */
        if (!args->qgroupid || (args->create && is_fstree(args->qgroupid))) {
                ret = -EINVAL;
                goto free_args;
        }

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto free_args;
        }

        ret = args->create ? btrfs_create_qgroup(trans, args->qgroupid) :
                             btrfs_remove_qgroup(trans, args->qgroupid);

        /* The first error wins: only report the end error if none so far. */
        end_ret = btrfs_end_transaction(trans);
        if (!ret && end_ret)
                ret = end_ret;

free_args:
        kfree(args);
drop_write:
        mnt_drop_write_file(file);
        return ret;
}

/*
 * BTRFS_IOC_QGROUP_LIMIT handler: apply the limits from the user supplied
 * arguments to a qgroup.
 *
 * The caller needs CAP_SYS_ADMIN and write access to the mount.  A qgroupid
 * of zero in the arguments selects the id of the root that @file belongs to.
 *
 * Returns the result of btrfs_limit_qgroup(), or, if that succeeded, the
 * result of ending the transaction.
 */
static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
{
        struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
        struct btrfs_ioctl_qgroup_limit_args *args;
        struct btrfs_trans_handle *trans;
        u64 target;
        int end_ret;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        args = memdup_user(arg, sizeof(*args));
        if (IS_ERR(args)) {
                ret = PTR_ERR(args);
                goto drop_write;
        }

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto free_args;
        }

        /* No explicit qgroup given: fall back to the current subvolume. */
        target = args->qgroupid ? args->qgroupid : btrfs_root_id(root);

        ret = btrfs_limit_qgroup(trans, target, &args->lim);

        /* The first error wins: only report the end error if none so far. */
        end_ret = btrfs_end_transaction(trans);
        if (!ret && end_ret)
                ret = end_ret;

free_args:
        kfree(args);
drop_write:
        mnt_drop_write_file(file);
        return ret;
}

/*
 * BTRFS_IOC_QUOTA_RESCAN handler: start a qgroup rescan.
 *
 * The caller needs CAP_SYS_ADMIN and write access to the mount.  The 'flags'
 * field of the user supplied arguments must be zero, otherwise -EINVAL is
 * returned.  On a valid request the result of btrfs_qgroup_rescan() is
 * returned.
 */
static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
        struct btrfs_ioctl_quota_rescan_args *args;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        args = memdup_user(arg, sizeof(*args));
        if (IS_ERR(args)) {
                ret = PTR_ERR(args);
                goto drop_write;
        }

        if (args->flags)
                ret = -EINVAL;
        else
                ret = btrfs_qgroup_rescan(fs_info);

        kfree(args);
drop_write:
        mnt_drop_write_file(file);
        return ret;
}

/*
 * BTRFS_IOC_QUOTA_RESCAN_STATUS handler: report whether a qgroup rescan is
 * flagged as running.
 *
 * Requires CAP_SYS_ADMIN.  The reply is all zeroes unless
 * BTRFS_QGROUP_STATUS_FLAG_RESCAN is set in fs_info->qgroup_flags, in which
 * case 'flags' is set to 1 and 'progress' to the objectid of the current
 * rescan progress key.
 *
 * Returns 0 on success, -EPERM or -EFAULT on failure.
 */
static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
                                                void __user *arg)
{
        struct btrfs_ioctl_quota_rescan_args status = {0};

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
                status.progress = fs_info->qgroup_rescan_progress.objectid;
                status.flags = 1;
        }

        return copy_to_user(arg, &status, sizeof(status)) ? -EFAULT : 0;
}

/*
 * BTRFS_IOC_QUOTA_RESCAN_WAIT handler: wait for qgroup rescan completion.
 *
 * Requires CAP_SYS_ADMIN; @arg is not used.  Returns -EPERM or the result of
 * btrfs_qgroup_wait_for_completion().
 */
static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
                                                void __user *arg)
{
        if (capable(CAP_SYS_ADMIN))
                return btrfs_qgroup_wait_for_completion(fs_info, true);

        return -EPERM;
}

/*
 * Common implementation behind the "set received subvolume" ioctls.
 *
 * Stores the received UUID, the send/receive transaction ids and the
 * send/receive timestamps from @sa in the root item of the root that @file
 * belongs to, and keeps the BTRFS_UUID_KEY_RECEIVED_SUBVOL entry in the uuid
 * tree in sync with the new received UUID.
 *
 * @file:  file the ioctl was issued on; its inode number must be
 *         BTRFS_FIRST_FREE_OBJECTID
 * @idmap: idmap used for the inode_owner_or_capable() check
 * @sa:    kernel copy of the arguments; rtransid and rtime are overwritten
 *         here with the current transaction id and the current time
 *
 * Returns 0 on success or a negative errno: -EPERM if the ownership check
 * fails, -EINVAL for the wrong inode, -EROFS for a read-only root, or an
 * error from the transaction, root update or uuid tree operations.
 */
static long _btrfs_ioctl_set_received_subvol(struct file *file,
                                            struct mnt_idmap *idmap,
                                            struct btrfs_ioctl_received_subvol_args *sa)
{
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_root_item *root_item = &root->root_item;
        struct btrfs_trans_handle *trans;
        struct timespec64 ct = current_time(inode);
        int ret = 0;
        int received_uuid_changed;

        if (!inode_owner_or_capable(idmap, inode))
                return -EPERM;

        ret = mnt_want_write_file(file);
        if (ret < 0)
                return ret;

        /* Held for writing until the 'out' label. */
        down_write(&fs_info->subvol_sem);

        if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
                ret = -EINVAL;
                goto out;
        }

        if (btrfs_root_readonly(root)) {
                ret = -EROFS;
                goto out;
        }

        /*
         * 1 - root item
         * 2 - uuid items (received uuid + subvol uuid)
         */
        trans = btrfs_start_transaction(root, 3);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
                goto out;
        }

        /* The receive side values are generated here, not taken from @sa. */
        sa->rtransid = trans->transid;
        sa->rtime.sec = ct.tv_sec;
        sa->rtime.nsec = ct.tv_nsec;

        /* Non-zero memcmp() result means the received UUID is changing. */
        received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
                                       BTRFS_UUID_SIZE);
        if (received_uuid_changed &&
            !btrfs_is_empty_uuid(root_item->received_uuid)) {
                /* Drop the uuid tree entry of the previous received UUID. */
                ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
                                          BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                          btrfs_root_id(root));
                /* A missing entry (-ENOENT) is tolerated. */
                if (ret && ret != -ENOENT) {
                        btrfs_abort_transaction(trans, ret);
                        btrfs_end_transaction(trans);
                        goto out;
                }
        }
        memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
        btrfs_set_root_stransid(root_item, sa->stransid);
        btrfs_set_root_rtransid(root_item, sa->rtransid);
        btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
        btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
        btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
        btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);

        ret = btrfs_update_root(trans, fs_info->tree_root,
                                &root->root_key, &root->root_item);
        if (ret < 0) {
                /* Note: this path ends the transaction without aborting it. */
                btrfs_end_transaction(trans);
                goto out;
        }
        if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
                /* Register the new received UUID in the uuid tree. */
                ret = btrfs_uuid_tree_add(trans, sa->uuid,
                                          BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                          btrfs_root_id(root));
                /* An already existing entry (-EEXIST) is tolerated. */
                if (ret < 0 && ret != -EEXIST) {
                        btrfs_abort_transaction(trans, ret);
                        btrfs_end_transaction(trans);
                        goto out;
                }
        }
        ret = btrfs_commit_transaction(trans);
out:
        up_write(&fs_info->subvol_sem);
        mnt_drop_write_file(file);
        return ret;
}

#ifdef CONFIG_64BIT
/*
 * BTRFS_IOC_SET_RECEIVED_SUBVOL_32 handler: variant of the set received
 * subvolume ioctl that takes the _32 layout of the argument structure.
 *
 * The user arguments are converted field by field into the native structure,
 * handed to _btrfs_ioctl_set_received_subvol(), and on success converted back
 * and copied out to user space.
 *
 * Returns 0 on success or a negative errno (-ENOMEM, -EFAULT, or whatever the
 * common implementation or memdup_user() reported).
 */
static long btrfs_ioctl_set_received_subvol_32(struct file *file,
                                                void __user *arg)
{
        struct btrfs_ioctl_received_subvol_args_32 *compat_args;
        struct btrfs_ioctl_received_subvol_args *native_args = NULL;
        int ret;

        compat_args = memdup_user(arg, sizeof(*compat_args));
        if (IS_ERR(compat_args))
                return PTR_ERR(compat_args);

        native_args = kmalloc(sizeof(*native_args), GFP_KERNEL);
        if (!native_args) {
                ret = -ENOMEM;
                goto out;
        }

        /* 32-bit layout -> native layout. */
        memcpy(native_args->uuid, compat_args->uuid, BTRFS_UUID_SIZE);
        native_args->stransid = compat_args->stransid;
        native_args->rtransid = compat_args->rtransid;
        native_args->stime.sec = compat_args->stime.sec;
        native_args->stime.nsec = compat_args->stime.nsec;
        native_args->rtime.sec = compat_args->rtime.sec;
        native_args->rtime.nsec = compat_args->rtime.nsec;
        native_args->flags = compat_args->flags;

        ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file),
                                               native_args);
        if (ret)
                goto out;

        /* Native layout -> 32-bit layout, for the reply. */
        memcpy(compat_args->uuid, native_args->uuid, BTRFS_UUID_SIZE);
        compat_args->stransid = native_args->stransid;
        compat_args->rtransid = native_args->rtransid;
        compat_args->stime.sec = native_args->stime.sec;
        compat_args->stime.nsec = native_args->stime.nsec;
        compat_args->rtime.sec = native_args->rtime.sec;
        compat_args->rtime.nsec = native_args->rtime.nsec;
        compat_args->flags = native_args->flags;

        if (copy_to_user(arg, compat_args, sizeof(*compat_args)))
                ret = -EFAULT;

out:
        kfree(compat_args);
        kfree(native_args);
        return ret;
}
#endif

/*
 * BTRFS_IOC_SET_RECEIVED_SUBVOL handler.
 *
 * Copies the arguments in from user space, lets
 * _btrfs_ioctl_set_received_subvol() do the work and, only if that
 * succeeded, copies the (updated) arguments back out.
 *
 * Returns 0 on success or a negative errno.
 */
static long btrfs_ioctl_set_received_subvol(struct file *file,
                                            void __user *arg)
{
        struct btrfs_ioctl_received_subvol_args *args;
        int ret;

        args = memdup_user(arg, sizeof(*args));
        if (IS_ERR(args))
                return PTR_ERR(args);

        ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), args);
        if (!ret && copy_to_user(arg, args, sizeof(*args)))
                ret = -EFAULT;

        kfree(args);
        return ret;
}

/*
 * FS_IOC_GETFSLABEL handler: copy the filesystem label to user space.
 *
 * The label is snapshotted from the super block copy under super_lock.  If
 * it fills all BTRFS_LABEL_SIZE bytes without a NUL, a warning is logged and
 * it is truncated to BTRFS_LABEL_SIZE - 1 bytes.
 *
 * The label is always copied out together with its terminating NUL, so user
 * space gets a valid C string even if it did not zero its buffer first.  At
 * most BTRFS_LABEL_SIZE bytes are written to @arg.
 *
 * Returns 0 on success or -EFAULT if the copy to user space fails.
 */
static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
                                        void __user *arg)
{
        size_t len;
        char label[BTRFS_LABEL_SIZE];

        spin_lock(&fs_info->super_lock);
        memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE);
        spin_unlock(&fs_info->super_lock);

        len = strnlen(label, BTRFS_LABEL_SIZE);

        if (len == BTRFS_LABEL_SIZE) {
                /*
                 * Adjust the length outside of the logging call so that it
                 * does not depend on how the macro evaluates its arguments.
                 */
                len--;
                btrfs_warn(fs_info,
                           "label is too long, return the first %zu bytes",
                           len);
        }

        /* len <= BTRFS_LABEL_SIZE - 1 here, so this stays in bounds. */
        label[len] = '\0';

        if (copy_to_user(arg, label, len + 1))
                return -EFAULT;

        return 0;
}

/*
 * FS_IOC_SETFSLABEL handler: replace the filesystem label.
 *
 * Requires CAP_SYS_ADMIN.  BTRFS_LABEL_SIZE bytes are read from @arg and
 * must contain a NUL terminator, otherwise -EINVAL is returned.  The new
 * label is written into the super block copy under super_lock and the
 * transaction is committed.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
{
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_super_block *sb = fs_info->super_copy;
        struct btrfs_trans_handle *trans;
        char new_label[BTRFS_LABEL_SIZE];
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (copy_from_user(new_label, arg, sizeof(new_label)))
                return -EFAULT;

        /* No NUL within the buffer means the label is too long. */
        if (strnlen(new_label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
                btrfs_err(fs_info,
                          "unable to set label with more than %d bytes",
                          BTRFS_LABEL_SIZE - 1);
                return -EINVAL;
        }

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
        } else {
                /* new_label is known to be NUL terminated at this point. */
                spin_lock(&fs_info->super_lock);
                strcpy(sb->label, new_label);
                spin_unlock(&fs_info->super_lock);
                ret = btrfs_commit_transaction(trans);
        }

        mnt_drop_write_file(file);
        return ret;
}

/*
 * Initializer for one struct btrfs_ioctl_feature_flags entry, built from the
 * BTRFS_FEATURE_{COMPAT,COMPAT_RO,INCOMPAT}_<suffix> constants.
 */
#define INIT_FEATURE_FLAGS(suffix)                                        \
        {                                                                \
                .compat_flags = BTRFS_FEATURE_COMPAT_##suffix,                \
                .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix,        \
                .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix        \
        }

/*
 * BTRFS_IOC_GET_SUPPORTED_FEATURES handler.
 *
 * Copies three feature flag sets to @arg, in this order: the supported
 * flags, the flags that are safe to set and the flags that are safe to
 * clear.
 *
 * Returns 0 on success or -EFAULT if the copy to user space fails.
 */
int btrfs_ioctl_get_supported_features(void __user *arg)
{
        static const struct btrfs_ioctl_feature_flags supported[3] = {
                INIT_FEATURE_FLAGS(SUPP),
                INIT_FEATURE_FLAGS(SAFE_SET),
                INIT_FEATURE_FLAGS(SAFE_CLEAR)
        };

        return copy_to_user(arg, supported, sizeof(supported)) ? -EFAULT : 0;
}

/*
 * BTRFS_IOC_GET_FEATURES handler: report the compat, compat_ro and incompat
 * feature flags currently stored in the super block copy.
 *
 * Returns 0 on success or -EFAULT if the copy to user space fails.
 */
static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
                                        void __user *arg)
{
        struct btrfs_super_block *sb = fs_info->super_copy;
        struct btrfs_ioctl_feature_flags current_flags = {
                .compat_flags = btrfs_super_compat_flags(sb),
                .compat_ro_flags = btrfs_super_compat_ro_flags(sb),
                .incompat_flags = btrfs_super_incompat_flags(sb),
        };

        return copy_to_user(arg, &current_flags, sizeof(current_flags)) ?
               -EFAULT : 0;
}

/*
 * Validate a requested change of one feature flag set.
 *
 * @fs_info:         filesystem, used for the warning messages
 * @set:             which feature set is being changed
 * @change_mask:     bits the caller wants to change
 * @flags:           requested values for the bits in @change_mask
 * @supported_flags: bits this kernel supports
 * @safe_set:        bits that may be set while mounted
 * @safe_clear:      bits that may be cleared while mounted
 *
 * Returns 0 if the change is acceptable, -EOPNOTSUPP if a bit to be set is
 * not supported, and -EPERM if a bit to be set or cleared is not in the
 * respective safe mask.  Each rejection logs a warning, using the feature
 * names when btrfs_printable_features() provides them and the raw bits
 * otherwise.
 */
static int check_feature_bits(struct btrfs_fs_info *fs_info,
                              enum btrfs_feature_set set,
                              u64 change_mask, u64 flags, u64 supported_flags,
                              u64 safe_set, u64 safe_clear)
{
        const char *type = btrfs_feature_set_name(set);
        const u64 to_set = change_mask & flags;
        const u64 to_clear = change_mask & ~flags;
        char *names;
        u64 bad;

        /* Bits to be set that this kernel does not know about. */
        bad = to_set & ~supported_flags;
        if (bad) {
                names = btrfs_printable_features(set, bad);
                if (!names) {
                        btrfs_warn(fs_info,
                                   "this kernel does not support %s bits 0x%llx",
                                   type, bad);
                } else {
                        btrfs_warn(fs_info,
                                   "this kernel does not support the %s feature bit%s",
                                   names, strchr(names, ',') ? "s" : "");
                        kfree(names);
                }
                return -EOPNOTSUPP;
        }

        /* Bits to be set that must not be set on a mounted filesystem. */
        bad = to_set & ~safe_set;
        if (bad) {
                names = btrfs_printable_features(set, bad);
                if (!names) {
                        btrfs_warn(fs_info,
                                   "can't set %s bits 0x%llx while mounted",
                                   type, bad);
                } else {
                        btrfs_warn(fs_info,
                                   "can't set the %s feature bit%s while mounted",
                                   names, strchr(names, ',') ? "s" : "");
                        kfree(names);
                }
                return -EPERM;
        }

        /* Bits to be cleared that must not be cleared while mounted. */
        bad = to_clear & ~safe_clear;
        if (bad) {
                names = btrfs_printable_features(set, bad);
                if (!names) {
                        btrfs_warn(fs_info,
                                   "can't clear %s bits 0x%llx while mounted",
                                   type, bad);
                } else {
                        btrfs_warn(fs_info,
                                   "can't clear the %s feature bit%s while mounted",
                                   names, strchr(names, ',') ? "s" : "");
                        kfree(names);
                }
                return -EPERM;
        }

        return 0;
}

/*
 * Call check_feature_bits() for the feature set named by @mask_base (COMPAT,
 * COMPAT_RO or INCOMPAT), filling in the matching FEAT_* selector and the
 * BTRFS_FEATURE_<mask_base>_{SUPP,SAFE_SET,SAFE_CLEAR} masks.
 */
#define check_feature(fs_info, change_mask, flags, mask_base)                \
        check_feature_bits((fs_info), FEAT_##mask_base,                        \
                           (change_mask), (flags),                        \
                           BTRFS_FEATURE_ ## mask_base ## _SUPP,        \
                           BTRFS_FEATURE_ ## mask_base ## _SAFE_SET,        \
                           BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)

/*
 * BTRFS_IOC_SET_FEATURES handler: set and clear feature flags in the super
 * block copy.
 *
 * User space passes two struct btrfs_ioctl_feature_flags: the first is the
 * mask of bits to change, the second holds the requested values for those
 * bits.  Requires CAP_SYS_ADMIN.  An all-zero mask is a successful no-op.
 * Every flag set is validated with check_feature() before anything is
 * modified; the update is done under super_lock and then committed.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
{
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_super_block *sb = fs_info->super_copy;
        struct btrfs_ioctl_feature_flags request[2];
        const struct btrfs_ioctl_feature_flags *mask = &request[0];
        const struct btrfs_ioctl_feature_flags *want = &request[1];
        struct btrfs_trans_handle *trans;
        u64 updated;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (copy_from_user(request, arg, sizeof(request)))
                return -EFAULT;

        /* Nothing to do */
        if (!(mask->compat_flags | mask->compat_ro_flags |
              mask->incompat_flags))
                return 0;

        ret = check_feature(fs_info, mask->compat_flags,
                            want->compat_flags, COMPAT);
        if (ret)
                return ret;

        ret = check_feature(fs_info, mask->compat_ro_flags,
                            want->compat_ro_flags, COMPAT_RO);
        if (ret)
                return ret;

        ret = check_feature(fs_info, mask->incompat_flags,
                            want->incompat_flags, INCOMPAT);
        if (ret)
                return ret;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto drop_write;
        }

        /*
         * For each set: turn on the masked bits that are requested, turn off
         * the masked bits that are not, leave everything else untouched.
         */
        spin_lock(&fs_info->super_lock);

        updated = btrfs_super_compat_flags(sb);
        updated |= mask->compat_flags & want->compat_flags;
        updated &= ~(mask->compat_flags & ~want->compat_flags);
        btrfs_set_super_compat_flags(sb, updated);

        updated = btrfs_super_compat_ro_flags(sb);
        updated |= mask->compat_ro_flags & want->compat_ro_flags;
        updated &= ~(mask->compat_ro_flags & ~want->compat_ro_flags);
        btrfs_set_super_compat_ro_flags(sb, updated);

        updated = btrfs_super_incompat_flags(sb);
        updated |= mask->incompat_flags & want->incompat_flags;
        updated &= ~(mask->incompat_flags & ~want->incompat_flags);
        btrfs_set_super_incompat_flags(sb, updated);

        spin_unlock(&fs_info->super_lock);

        ret = btrfs_commit_transaction(trans);
drop_write:
        mnt_drop_write_file(file);

        return ret;
}

/*
 * Entry point for BTRFS_IOC_SEND and BTRFS_IOC_SEND_32.
 *
 * Builds a kernel copy of struct btrfs_ioctl_send_args, either directly from
 * user space or, when @compat is true, by converting the _32 layout field by
 * field, then calls btrfs_ioctl_send() and frees the copy.
 *
 * With @compat set on a kernel without both CONFIG_64BIT and CONFIG_COMPAT,
 * -ENOTTY is returned.  Otherwise returns -EFAULT / -ENOMEM / the
 * memdup_user() error on setup failure, or the result of btrfs_ioctl_send().
 */
static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat)
{
        struct btrfs_ioctl_send_args *arg;
        int ret;

        if (!compat) {
                arg = memdup_user(argp, sizeof(*arg));
                if (IS_ERR(arg))
                        return PTR_ERR(arg);
        } else {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
                struct btrfs_ioctl_send_args_32 args32 = { 0 };

                if (copy_from_user(&args32, argp, sizeof(args32)))
                        return -EFAULT;

                arg = kzalloc(sizeof(*arg), GFP_KERNEL);
                if (!arg)
                        return -ENOMEM;

                /* Translate the 32-bit layout into the native one. */
                arg->send_fd = args32.send_fd;
                arg->clone_sources_count = args32.clone_sources_count;
                arg->clone_sources = compat_ptr(args32.clone_sources);
                arg->parent_root = args32.parent_root;
                arg->flags = args32.flags;
                arg->version = args32.version;
                memcpy(arg->reserved, args32.reserved,
                       sizeof(args32.reserved));
#else
                return -ENOTTY;
#endif
        }

        ret = btrfs_ioctl_send(inode, arg);
        kfree(arg);
        return ret;
}

/*
 * BTRFS_IOC_ENCODED_READ handler.
 *
 * @file:   file to read from
 * @argp:   user pointer to the encoded I/O arguments
 * @compat: true if @argp uses the _32 layout of the argument structure
 *
 * Requires CAP_SYS_ADMIN.  Only the leading part of the arguments, up to and
 * including 'flags', is read from user space; 'flags' must be zero.  After a
 * successful btrfs_encoded_read() the part of the arguments following
 * 'flags' is copied back to user space.
 *
 * Returns the result of btrfs_encoded_read() (0 if the iovec is empty) or a
 * negative errno.  Except for the -ENOTTY return on kernels without compat
 * support, every exit goes through the read accounting at out_acct.
 */
static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
                                    bool compat)
{
        struct btrfs_ioctl_encoded_io_args args = { 0 };
        /* End of the input part of the native argument structure. */
        size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
                                             flags);
        /* End of the input part in the layout user space actually uses. */
        size_t copy_end;
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        loff_t pos;
        struct kiocb kiocb;
        ssize_t ret;

        if (!capable(CAP_SYS_ADMIN)) {
                ret = -EPERM;
                goto out_acct;
        }

        if (compat) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
                struct btrfs_ioctl_encoded_io_args_32 args32;

                copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
                                       flags);
                if (copy_from_user(&args32, argp, copy_end)) {
                        ret = -EFAULT;
                        goto out_acct;
                }
                /* Only the fields covered by the partial copy are used. */
                args.iov = compat_ptr(args32.iov);
                args.iovcnt = args32.iovcnt;
                args.offset = args32.offset;
                args.flags = args32.flags;
#else
                return -ENOTTY;
#endif
        } else {
                copy_end = copy_end_kernel;
                if (copy_from_user(&args, argp, copy_end)) {
                        ret = -EFAULT;
                        goto out_acct;
                }
        }
        if (args.flags != 0) {
                ret = -EINVAL;
                goto out_acct;
        }

        ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
                           &iov, &iter);
        if (ret < 0)
                goto out_acct;

        /* Nothing to read into: succeed without touching the file. */
        if (iov_iter_count(&iter) == 0) {
                ret = 0;
                goto out_iov;
        }
        pos = args.offset;
        ret = rw_verify_area(READ, file, &pos, args.len);
        if (ret < 0)
                goto out_iov;

        init_sync_kiocb(&kiocb, file);
        kiocb.ki_pos = pos;

        ret = btrfs_encoded_read(&kiocb, &iter, &args);
        if (ret >= 0) {
                fsnotify_access(file);
                /*
                 * Copy back everything after 'flags'.  The user offset uses
                 * the layout user space passed in, the kernel offset uses
                 * the native layout.
                 */
                if (copy_to_user(argp + copy_end,
                                 (char *)&args + copy_end_kernel,
                                 sizeof(args) - copy_end_kernel))
                        ret = -EFAULT;
        }

out_iov:
        kfree(iov);
out_acct:
        /* Account the bytes read (if any) and the read call itself. */
        if (ret > 0)
                add_rchar(current, ret);
        inc_syscr(current);
        return ret;
}

/*
 * BTRFS_IOC_ENCODED_WRITE handler.
 *
 * @file:   file to write to, must be open for writing
 * @argp:   user pointer to the encoded I/O arguments
 * @compat: true if @argp uses the _32 layout of the argument structure
 *
 * Requires CAP_SYS_ADMIN.  The full argument structure is read from user
 * space and validated before the write is handed to btrfs_do_write_iter().
 *
 * Returns the result of btrfs_do_write_iter() (0 if the iovec is empty) or a
 * negative errno: -EPERM, -EBADF, -EFAULT, -EINVAL for invalid arguments, or
 * an error from the iovec import / area verification.  Except for the
 * -ENOTTY return on kernels without compat support, every exit goes through
 * the write accounting at out_acct.
 */
static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
{
        struct btrfs_ioctl_encoded_io_args args;
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        loff_t pos;
        struct kiocb kiocb;
        ssize_t ret;

        if (!capable(CAP_SYS_ADMIN)) {
                ret = -EPERM;
                goto out_acct;
        }

        if (!(file->f_mode & FMODE_WRITE)) {
                ret = -EBADF;
                goto out_acct;
        }

        if (compat) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
                struct btrfs_ioctl_encoded_io_args_32 args32;

                if (copy_from_user(&args32, argp, sizeof(args32))) {
                        ret = -EFAULT;
                        goto out_acct;
                }
                /* Convert the 32-bit layout into the native one. */
                args.iov = compat_ptr(args32.iov);
                args.iovcnt = args32.iovcnt;
                args.offset = args32.offset;
                args.flags = args32.flags;
                args.len = args32.len;
                args.unencoded_len = args32.unencoded_len;
                args.unencoded_offset = args32.unencoded_offset;
                args.compression = args32.compression;
                args.encryption = args32.encryption;
                memcpy(args.reserved, args32.reserved, sizeof(args.reserved));
#else
                return -ENOTTY;
#endif
        } else {
                if (copy_from_user(&args, argp, sizeof(args))) {
                        ret = -EFAULT;
                        goto out_acct;
                }
        }

        /* Every failed check below returns -EINVAL. */
        ret = -EINVAL;
        if (args.flags != 0)
                goto out_acct;
        /* Reserved bytes must all be zero. */
        if (memchr_inv(args.reserved, 0, sizeof(args.reserved)))
                goto out_acct;
        /* Data that is neither compressed nor encrypted is rejected. */
        if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE &&
            args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE)
                goto out_acct;
        if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES ||
            args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES)
                goto out_acct;
        /*
         * The first check makes sure the subtraction in the second one
         * cannot wrap around.
         */
        if (args.unencoded_offset > args.unencoded_len)
                goto out_acct;
        if (args.len > args.unencoded_len - args.unencoded_offset)
                goto out_acct;

        ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
                           &iov, &iter);
        if (ret < 0)
                goto out_acct;

        /* Nothing to write: succeed without touching the file. */
        if (iov_iter_count(&iter) == 0) {
                ret = 0;
                goto out_iov;
        }
        pos = args.offset;
        ret = rw_verify_area(WRITE, file, &pos, args.len);
        if (ret < 0)
                goto out_iov;

        init_sync_kiocb(&kiocb, file);
        ret = kiocb_set_rw_flags(&kiocb, 0);
        if (ret)
                goto out_iov;
        kiocb.ki_pos = pos;

        file_start_write(file);

        ret = btrfs_do_write_iter(&kiocb, &iter, &args);
        if (ret > 0)
                fsnotify_modify(file);

        file_end_write(file);
out_iov:
        kfree(iov);
out_acct:
        /* Account the bytes written (if any) and the write call itself. */
        if (ret > 0)
                add_wchar(current, ret);
        inc_syscw(current);
        return ret;
}

long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
{
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        void __user *argp = (void __user *)arg;

        switch (cmd) {
        case FS_IOC_GETVERSION:
                return btrfs_ioctl_getversion(inode, argp);
        case FS_IOC_GETFSLABEL:
                return btrfs_ioctl_get_fslabel(fs_info, argp);
        case FS_IOC_SETFSLABEL:
                return btrfs_ioctl_set_fslabel(file, argp);
        case FITRIM:
                return btrfs_ioctl_fitrim(fs_info, argp);
        case BTRFS_IOC_SNAP_CREATE:
                return btrfs_ioctl_snap_create(file, argp, 0);
        case BTRFS_IOC_SNAP_CREATE_V2:
                return btrfs_ioctl_snap_create_v2(file, argp, 0);
        case BTRFS_IOC_SUBVOL_CREATE:
                return btrfs_ioctl_snap_create(file, argp, 1);
        case BTRFS_IOC_SUBVOL_CREATE_V2:
                return btrfs_ioctl_snap_create_v2(file, argp, 1);
        case BTRFS_IOC_SNAP_DESTROY:
                return btrfs_ioctl_snap_destroy(file, argp, false);
        case BTRFS_IOC_SNAP_DESTROY_V2:
                return btrfs_ioctl_snap_destroy(file, argp, true);
        case BTRFS_IOC_SUBVOL_GETFLAGS:
                return btrfs_ioctl_subvol_getflags(inode, argp);
        case BTRFS_IOC_SUBVOL_SETFLAGS:
                return btrfs_ioctl_subvol_setflags(file, argp);
        case BTRFS_IOC_DEFAULT_SUBVOL:
                return btrfs_ioctl_default_subvol(file, argp);
        case BTRFS_IOC_DEFRAG:
                return btrfs_ioctl_defrag(file, NULL);
        case BTRFS_IOC_DEFRAG_RANGE:
                return btrfs_ioctl_defrag(file, argp);
        case BTRFS_IOC_RESIZE:
                return btrfs_ioctl_resize(file, argp);
        case BTRFS_IOC_ADD_DEV:
                return btrfs_ioctl_add_dev(fs_info, argp);
        case BTRFS_IOC_RM_DEV:
                return btrfs_ioctl_rm_dev(file, argp);
        case BTRFS_IOC_RM_DEV_V2:
                return btrfs_ioctl_rm_dev_v2(file, argp);
        case BTRFS_IOC_FS_INFO:
                return btrfs_ioctl_fs_info(fs_info, argp);
        case BTRFS_IOC_DEV_INFO:
                return btrfs_ioctl_dev_info(fs_info, argp);
        case BTRFS_IOC_TREE_SEARCH:
                return btrfs_ioctl_tree_search(inode, argp);
        case BTRFS_IOC_TREE_SEARCH_V2:
                return btrfs_ioctl_tree_search_v2(inode, argp);
        case BTRFS_IOC_INO_LOOKUP:
                return btrfs_ioctl_ino_lookup(root, argp);
        case BTRFS_IOC_INO_PATHS:
                return btrfs_ioctl_ino_to_path(root, argp);
        case BTRFS_IOC_LOGICAL_INO:
                return btrfs_ioctl_logical_to_ino(fs_info, argp, 1);
        case BTRFS_IOC_LOGICAL_INO_V2:
                return btrfs_ioctl_logical_to_ino(fs_info, argp, 2);
        case BTRFS_IOC_SPACE_INFO:
                return btrfs_ioctl_space_info(fs_info, argp);
        case BTRFS_IOC_SYNC: {
                int ret;

                ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
                if (ret)
                        return ret;
                ret = btrfs_sync_fs(inode->i_sb, 1);
                /*
                 * The transaction thread may want to do more work,
                 * namely it pokes the cleaner kthread that will start
                 * processing uncleaned subvols.
                 */
                wake_up_process(fs_info->transaction_kthread);
                return ret;
        }
        case BTRFS_IOC_START_SYNC:
                return btrfs_ioctl_start_sync(root, argp);
        case BTRFS_IOC_WAIT_SYNC:
                return btrfs_ioctl_wait_sync(fs_info, argp);
        case BTRFS_IOC_SCRUB:
                return btrfs_ioctl_scrub(file, argp);
        case BTRFS_IOC_SCRUB_CANCEL:
                return btrfs_ioctl_scrub_cancel(fs_info);
        case BTRFS_IOC_SCRUB_PROGRESS:
                return btrfs_ioctl_scrub_progress(fs_info, argp);
        case BTRFS_IOC_BALANCE_V2:
                return btrfs_ioctl_balance(file, argp);
        case BTRFS_IOC_BALANCE_CTL:
                return btrfs_ioctl_balance_ctl(fs_info, arg);
        case BTRFS_IOC_BALANCE_PROGRESS:
                return btrfs_ioctl_balance_progress(fs_info, argp);
        case BTRFS_IOC_SET_RECEIVED_SUBVOL:
                return btrfs_ioctl_set_received_subvol(file, argp);
#ifdef CONFIG_64BIT
        case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
                return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
        case BTRFS_IOC_SEND:
                return _btrfs_ioctl_send(inode, argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
        case BTRFS_IOC_SEND_32:
                return _btrfs_ioctl_send(inode, argp, true);
#endif
        case BTRFS_IOC_GET_DEV_STATS:
                return btrfs_ioctl_get_dev_stats(fs_info, argp);
        case BTRFS_IOC_QUOTA_CTL:
                return btrfs_ioctl_quota_ctl(file, argp);
        case BTRFS_IOC_QGROUP_ASSIGN:
                return btrfs_ioctl_qgroup_assign(file, argp);
        case BTRFS_IOC_QGROUP_CREATE:
                return btrfs_ioctl_qgroup_create(file, argp);
        case BTRFS_IOC_QGROUP_LIMIT:
                return btrfs_ioctl_qgroup_limit(file, argp);
        case BTRFS_IOC_QUOTA_RESCAN:
                return btrfs_ioctl_quota_rescan(file, argp);
        case BTRFS_IOC_QUOTA_RESCAN_STATUS:
                return btrfs_ioctl_quota_rescan_status(fs_info, argp);
        case BTRFS_IOC_QUOTA_RESCAN_WAIT:
                return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
        case BTRFS_IOC_DEV_REPLACE:
                return btrfs_ioctl_dev_replace(fs_info, argp);
        case BTRFS_IOC_GET_SUPPORTED_FEATURES:
                return btrfs_ioctl_get_supported_features(argp);
        case BTRFS_IOC_GET_FEATURES:
                return btrfs_ioctl_get_features(fs_info, argp);
        case BTRFS_IOC_SET_FEATURES:
                return btrfs_ioctl_set_features(file, argp);
        case BTRFS_IOC_GET_SUBVOL_INFO:
                return btrfs_ioctl_get_subvol_info(inode, argp);
        case BTRFS_IOC_GET_SUBVOL_ROOTREF:
                return btrfs_ioctl_get_subvol_rootref(root, argp);
        case BTRFS_IOC_INO_LOOKUP_USER:
                return btrfs_ioctl_ino_lookup_user(file, argp);
        case FS_IOC_ENABLE_VERITY:
                return fsverity_ioctl_enable(file, (const void __user *)argp);
        case FS_IOC_MEASURE_VERITY:
                return fsverity_ioctl_measure(file, argp);
        case BTRFS_IOC_ENCODED_READ:
                return btrfs_ioctl_encoded_read(file, argp, false);
        case BTRFS_IOC_ENCODED_WRITE:
                return btrfs_ioctl_encoded_write(file, argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
        case BTRFS_IOC_ENCODED_READ_32:
                return btrfs_ioctl_encoded_read(file, argp, true);
        case BTRFS_IOC_ENCODED_WRITE_32:
                return btrfs_ioctl_encoded_write(file, argp, true);
#endif
        }

        return -ENOTTY;
}

#ifdef CONFIG_COMPAT
/*
 * Compat (32-bit userspace) ioctl entry point.
 *
 * Rewrites the one 32-bit specific command number handled here to its native
 * equivalent, then forwards the call to btrfs_ioctl() with @arg converted
 * through compat_ptr().
 */
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        /*
         * FS_IOC32_GETVERSION accesses a 32-bit value anyway, so renaming
         * the command is all the handling it needs.
         */
        if (cmd == FS_IOC32_GETVERSION)
                cmd = FS_IOC_GETVERSION;

        return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif




































































    1 
    2 






























































































































    1 


    1 
































































































































































































    1 
    1 















    1 
    1 
























































































































































































































































































    1 

    2 

    1 

    1 









    1 





    1 
    1 






























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "messages.h"
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only support 64K page size for now
 *   This is to make metadata handling easier, as 64K page would ensure
 *   all nodesize would fit inside one page, thus we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundary
 *   btrfs-progs and kernel have done that for a while, thus only ancient
 *   filesystems could have such problem.  For such case, do a graceful
 *   rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   Meaning when reading one tree block will only trigger the read for the
 *   needed range, other unrelated range in the same page will not be touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *   Page offset
 *   0         16K         32K         48K        64K
 *   |/////////|           |///////////|
 *        \- Tree block A        \- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page.  This provides the extra
 *   granularity needed.
 *
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on page
 *   locking anymore, or we will have greatly reduced concurrency or even
 *   deadlocks (hold one tree lock while trying to lock another tree lock in
 *   the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking only.
 *   This means a slightly higher tree locking latency.
 */

/*
 * Decide whether folios belonging to @mapping go through the subpage routine.
 *
 * Return false whenever sectorsize >= PAGE_SIZE.  Otherwise return true for
 * folios without a mapping or host inode, for folios of a data inode, and
 * for metadata when nodesize < PAGE_SIZE.
 */
bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping)
{
        /* A sector that covers at least a full page never needs subpage. */
        if (fs_info->sectorsize >= PAGE_SIZE)
                return false;

        /*
         * Only data pages (either through DIO or compression) can have no
         * mapping, and sectorsize >= PAGE_SIZE has been ruled out above, so
         * such pages and pages of a data inode are always subpage.
         */
        if (!mapping)
                return true;
        if (!mapping->host)
                return true;
        if (is_data_inode(mapping->host))
                return true;

        /* What is left is metadata: subpage only if a node is below a page. */
        return fs_info->nodesize < PAGE_SIZE;
}

/*
 * Fill @subpage_info with the layout of the per-folio subpage bitmaps.
 *
 * Every bitmap has PAGE_SIZE / @sectorsize bits.  The bitmaps are laid out
 * back to back in this order: uptodate, dirty, writeback, ordered, checked,
 * locked.  total_nr_bits is the combined size of all six.
 */
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
{
        unsigned int bits_per_bitmap;

        ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));

        bits_per_bitmap = PAGE_SIZE / sectorsize;
        subpage_info->bitmap_nr_bits = bits_per_bitmap;

        /* Each bitmap starts right where the previous one ends. */
        subpage_info->uptodate_offset = 0;
        subpage_info->dirty_offset = bits_per_bitmap;
        subpage_info->writeback_offset = 2 * bits_per_bitmap;
        subpage_info->ordered_offset = 3 * bits_per_bitmap;
        subpage_info->checked_offset = 4 * bits_per_bitmap;
        subpage_info->locked_offset = 5 * bits_per_bitmap;

        subpage_info->total_nr_bits = 6 * bits_per_bitmap;
}

/*
 * Allocate a btrfs_subpage structure and attach it as private of @folio.
 *
 * Nothing is done (and 0 is returned) when the folio does not take the
 * subpage routine or already has private attached.
 *
 * Return 0 on success, or the error from btrfs_alloc_subpage().
 */
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
                         struct folio *folio, enum btrfs_subpage_type type)
{
        struct btrfs_subpage *new_subpage;

        /*
         * Mapped folios must be locked by the caller.  Unmapped ones, like a
         * dummy extent buffer page, don't need to be locked.
         */
        if (folio->mapping)
                ASSERT(folio_test_locked(folio));

        /* Not subpage at all: nothing to attach. */
        if (!btrfs_is_subpage(fs_info, folio->mapping))
                return 0;
        /* Private is already attached: keep it. */
        if (folio_test_private(folio))
                return 0;

        new_subpage = btrfs_alloc_subpage(fs_info, type);
        if (IS_ERR(new_subpage))
                return PTR_ERR(new_subpage);

        folio_attach_private(folio, new_subpage);
        return 0;
}

/*
 * Detach the btrfs_subpage structure stored as private of @folio and free it.
 *
 * Does nothing when the folio does not take the subpage routine or has no
 * private attached.
 */
void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
        struct btrfs_subpage *subpage;

        /* Not subpage: there is no subpage structure to release. */
        if (!btrfs_is_subpage(fs_info, folio->mapping))
                return;
        /* No private attached: nothing to detach. */
        if (!folio_test_private(folio))
                return;

        subpage = folio_detach_private(folio);
        ASSERT(subpage);
        btrfs_free_subpage(subpage);
}

/*
 * Allocate and initialize a zeroed btrfs_subpage structure.
 *
 * @fs_info: filesystem info; subpage_info->total_nr_bits determines how many
 *           bitmap longs trail the structure.  sectorsize must be smaller
 *           than PAGE_SIZE.
 * @type:    BTRFS_SUBPAGE_METADATA initializes eb_refs, any other type
 *           initializes the readers and writers counters.
 *
 * The allocation uses GFP_NOFS.
 *
 * Return: the new structure, or ERR_PTR(-ENOMEM) when the allocation fails.
 * The structure is released with btrfs_free_subpage().
 */
struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
                                          enum btrfs_subpage_type type)
{
        struct btrfs_subpage *ret;
        /*
         * struct_size() yields a size_t and saturates to SIZE_MAX on
         * overflow.  Keep the full width so that an overflowed size is
         * rejected by the allocator instead of being truncated into a
         * smaller, bogus value.
         */
        size_t real_size;

        ASSERT(fs_info->sectorsize < PAGE_SIZE);

        real_size = struct_size(ret, bitmaps,
                        BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits));
        ret = kzalloc(real_size, GFP_NOFS);
        if (!ret)
                return ERR_PTR(-ENOMEM);

        spin_lock_init(&ret->lock);
        if (type == BTRFS_SUBPAGE_METADATA) {
                atomic_set(&ret->eb_refs, 0);
        } else {
                atomic_set(&ret->readers, 0);
                atomic_set(&ret->writers, 0);
        }
        return ret;
}

/* Release a btrfs_subpage structure by passing it to kfree(). */
void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
        kfree(subpage);
}

/*
 * Take one extent buffer reference on the subpage structure of @folio.
 *
 * This matters for eb allocation, where we can race with the freeing of the
 * last eb of the same page.  Bumping eb_refs before the eb is inserted into
 * the radix tree keeps detach_extent_buffer_page() from detaching the folio
 * private while the extent buffer is still being allocated.
 *
 * Does nothing when the folio does not take the subpage routine.  Otherwise
 * the folio must be mapped, have private attached, and the caller must hold
 * the mapping's i_private_lock.
 */
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
        struct btrfs_subpage *sp;

        if (!btrfs_is_subpage(fs_info, folio->mapping))
                return;

        ASSERT(folio_test_private(folio) && folio->mapping);
        lockdep_assert_held(&folio->mapping->i_private_lock);

        sp = folio_get_private(folio);
        atomic_inc(&sp->eb_refs);
}

/*
 * Drop one extent buffer reference from the subpage structure of @folio.
 *
 * Counterpart of btrfs_folio_inc_eb_refs().  Does nothing when the folio does
 * not take the subpage routine.  Otherwise the folio must be mapped, have
 * private attached, and the caller must hold the mapping's i_private_lock.
 */
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
        struct btrfs_subpage *subpage;

        if (!btrfs_is_subpage(fs_info, folio->mapping))
                return;

        ASSERT(folio_test_private(folio) && folio->mapping);
        lockdep_assert_held(&folio->mapping->i_private_lock);

        subpage = folio_get_private(folio);

        /* There must be a reference left to drop. */
        ASSERT(atomic_read(&subpage->eb_refs));
        atomic_dec(&subpage->eb_refs);
}

/*
 * Sanity check a subpage range before it is used to index the bitmaps.
 *
 * Asserts that @folio is a single page with a non-NULL private attached, that
 * @start and @len are both sectorsize aligned and, for mapped folios, that
 * [@start, @start + @len) lies inside the folio.
 */
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
                                 struct folio *folio, u64 start, u32 len)
{
        /* For subpage support, the folio must be single page. */
        ASSERT(folio_order(folio) == 0);

        /* Basic checks: private must be attached and the range aligned. */
        ASSERT(folio_test_private(folio) && folio_get_private(folio));
        ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
               IS_ALIGNED(len, fs_info->sectorsize));
        /*
         * The range check only works for mapped pages; we can still have
         * unmapped pages like dummy extent buffer pages.
         */
        if (folio->mapping)
                ASSERT(folio_pos(folio) <= start &&
                       start + len <= folio_pos(folio) + PAGE_SIZE);
}

/*
 * Compute the index of the first bit covering @start inside the @name bitmap.
 *
 * The range is validated with btrfs_subpage_assert() first.  The result is
 * the sector index of @start within the page plus the offset of the @name
 * bitmap (fs_info->subpage_info->@name_offset), so it can be used directly
 * on subpage::bitmaps.
 */
#define subpage_calc_start_bit(fs_info, folio, name, start, len)        \
({                                                                        \
        unsigned int start_bit;                                                \
                                                                        \
        btrfs_subpage_assert(fs_info, folio, start, len);                \
        start_bit = offset_in_page(start) >> fs_info->sectorsize_bits;        \
        start_bit += fs_info->subpage_info->name##_offset;                \
        start_bit;                                                        \
})

/*
 * Mark [@start, @start + @len) of @folio as locked for a read and add the
 * number of covered sectors to subpage::readers.
 */
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
                                struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *subpage;
        unsigned long irq_flags;
        int start_bit;
        int nbits;

        subpage = folio_get_private(folio);
        start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
        nbits = len >> fs_info->sectorsize_bits;

        btrfs_subpage_assert(fs_info, folio, start, len);

        spin_lock_irqsave(&subpage->lock, irq_flags);

        /*
         * Even though it's just for reading the page, no one should have
         * locked the subpage range.
         */
        ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));

        bitmap_set(subpage->bitmaps, start_bit, nbits);
        atomic_add(nbits, &subpage->readers);

        spin_unlock_irqrestore(&subpage->lock, irq_flags);
}

/*
 * Release the read lock on [@start, @start + @len) of @folio and subtract the
 * covered sectors from subpage::readers.
 *
 * For folios of a data inode, the folio itself is unlocked once the readers
 * counter drops to zero.
 */
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
                              struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *subpage;
        unsigned long irq_flags;
        int start_bit;
        int nbits;
        bool is_data;
        bool last;

        subpage = folio_get_private(folio);
        start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
        nbits = len >> fs_info->sectorsize_bits;

        btrfs_subpage_assert(fs_info, folio, start, len);
        is_data = is_data_inode(folio->mapping->host);

        spin_lock_irqsave(&subpage->lock, irq_flags);

        /* The range should have already been locked. */
        ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits));
        ASSERT(atomic_read(&subpage->readers) >= nbits);

        bitmap_clear(subpage->bitmaps, start_bit, nbits);

        /*
         * Keep the counter update in its own statement rather than folding it
         * into the condition below: atomic_sub_and_test() has to run no
         * matter what @is_data is.
         */
        last = atomic_sub_and_test(nbits, &subpage->readers);

        /* For data we need to unlock the page if the last read has finished. */
        if (is_data && last)
                folio_unlock(folio);

        spin_unlock_irqrestore(&subpage->lock, irq_flags);
}

/*
 * Clamp the range [*@start, *@start + *@len) to the page covered by @folio.
 *
 * On return *@start is never before the folio position and *@len is cut at
 * the end of the page.  If the range ends at or before the folio position,
 * *@len is set to 0.
 */
static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
{
        const u64 folio_start = folio_pos(folio);
        const u64 folio_end = folio_start + PAGE_SIZE;
        const u64 range_start = *start;
        const u64 range_end = range_start + *len;

        *start = max_t(u64, folio_start, range_start);

        /*
         * For certain call sites like btrfs_drop_pages(), we may have pages
         * beyond the target range. In that case, just set @len to 0, subpage
         * helpers can handle @len == 0 without any problem.
         */
        if (folio_start >= range_end) {
                *len = 0;
                return;
        }

        *len = min_t(u64, folio_end, range_end) - *start;
}

/*
 * Mark [@start, @start + @len) of @folio as locked for a write and add the
 * number of covered sectors to subpage::writers.
 *
 * There must be no reader and no other writer on the folio at this point.
 */
static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
                                       struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *subpage;
        unsigned long irq_flags;
        int start_bit;
        int nbits;
        int ret;

        subpage = folio_get_private(folio);
        start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
        nbits = (len >> fs_info->sectorsize_bits);

        btrfs_subpage_assert(fs_info, folio, start, len);

        spin_lock_irqsave(&subpage->lock, irq_flags);

        /* Nobody may be reading, and the target range must be unlocked. */
        ASSERT(atomic_read(&subpage->readers) == 0);
        ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));

        bitmap_set(subpage->bitmaps, start_bit, nbits);

        /* The counter started at zero, so it must now equal our own bits. */
        ret = atomic_add_return(nbits, &subpage->writers);
        ASSERT(ret == nbits);

        spin_unlock_irqrestore(&subpage->lock, irq_flags);
}

/*
 * Release the write lock on [@start, @start + @len) of @folio and subtract
 * the covered sectors from subpage::writers.
 *
 * Return true if the writers counter was already zero or dropped to zero,
 * false if other writers remain.
 */
static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
                                              struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *subpage;
        unsigned long irq_flags;
        int start_bit;
        int nbits;
        bool last = true;

        subpage = folio_get_private(folio);
        start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
        nbits = (len >> fs_info->sectorsize_bits);

        btrfs_subpage_assert(fs_info, folio, start, len);

        spin_lock_irqsave(&subpage->lock, irq_flags);
        /*
         * We have call sites passing @locked_page into
         * extent_clear_unlock_delalloc() for compression path.
         *
         * This @locked_page is locked by plain lock_page(), thus its
         * subpage::writers is 0.  In that case leave the bitmap and the
         * counter alone and report the writer as the last one.
         */
        if (atomic_read(&subpage->writers) != 0) {
                ASSERT(atomic_read(&subpage->writers) >= nbits);
                /* The target range should have been locked. */
                ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits));
                bitmap_clear(subpage->bitmaps, start_bit, nbits);
                last = atomic_sub_and_test(nbits, &subpage->writers);
        }
        spin_unlock_irqrestore(&subpage->lock, irq_flags);

        return last;
}

/*
 * Lock a folio for delalloc page writeback.
 *
 * Without @fs_info, or when the folio does not take the subpage routine, this
 * is a plain folio_lock().  Otherwise the folio is locked and the part of
 * [@start, @start + @len) that falls inside it is accounted as a writer.
 *
 * Return 0 with the folio locked and, for subpage, the writer counter updated.
 * Return -EAGAIN with the folio unlocked if it has no private attached, i.e.
 * it is not properly initialized.
 *
 * Even when 0 is returned the caller still has to verify that it got the
 * correct page: it uses filemap_get_folios_contig(), which can race with
 * page invalidation.
 */
int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
                                  struct folio *folio, u64 start, u32 len)
{
        /* Evaluated before taking the folio lock, as the mapping is read. */
        const bool is_subpage = likely(fs_info) &&
                                btrfs_is_subpage(fs_info, folio->mapping);

        folio_lock(folio);
        if (!is_subpage)
                return 0;

        if (folio_test_private(folio) && folio_get_private(folio)) {
                btrfs_subpage_clamp_range(folio, &start, &len);
                btrfs_subpage_start_writer(fs_info, folio, start, len);
                return 0;
        }

        /* No subpage structure attached: back off and let the caller retry. */
        folio_unlock(folio);
        return -EAGAIN;
}

/*
 * Counterpart of btrfs_folio_start_writer_lock().
 *
 * Without @fs_info, or when the folio does not take the subpage routine, the
 * folio is simply unlocked.  Otherwise the part of [@start, @start + @len)
 * inside the folio is released, and the folio is unlocked only when
 * btrfs_subpage_end_and_test_writer() reports that no writer is left.
 */
void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
                                 struct folio *folio, u64 start, u32 len)
{
        if (likely(fs_info) && btrfs_is_subpage(fs_info, folio->mapping)) {
                btrfs_subpage_clamp_range(folio, &start, &len);
                /* Other writers still hold part of the folio: keep it locked. */
                if (!btrfs_subpage_end_and_test_writer(fs_info, folio, start, len))
                        return;
        }
        folio_unlock(folio);
}

/*
 * Test whether every bit of the @name bitmap of @subpage is set, i.e. the
 * whole bitmap_nr_bits range starting at the @name offset.
 */
#define subpage_test_bitmap_all_set(fs_info, subpage, name)                \
        bitmap_test_range_all_set(subpage->bitmaps,                        \
                        fs_info->subpage_info->name##_offset,                \
                        fs_info->subpage_info->bitmap_nr_bits)

/*
 * Test whether every bit of the @name bitmap of @subpage is clear, i.e. the
 * whole bitmap_nr_bits range starting at the @name offset.
 */
#define subpage_test_bitmap_all_zero(fs_info, subpage, name)                \
        bitmap_test_range_all_zero(subpage->bitmaps,                        \
                        fs_info->subpage_info->name##_offset,                \
                        fs_info->subpage_info->bitmap_nr_bits)

/*
 * Set the uptodate bits for [@start, @start + @len) of @folio.
 *
 * Once every sector of the page is uptodate, the folio itself is marked
 * uptodate as well.
 */
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
                                struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *sp;
        unsigned int first_bit;
        unsigned int nr_sectors;
        unsigned long irq_flags;

        sp = folio_get_private(folio);
        first_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len);
        nr_sectors = len >> fs_info->sectorsize_bits;

        spin_lock_irqsave(&sp->lock, irq_flags);
        bitmap_set(sp->bitmaps, first_bit, nr_sectors);
        if (subpage_test_bitmap_all_set(fs_info, sp, uptodate))
                folio_mark_uptodate(folio);
        spin_unlock_irqrestore(&sp->lock, irq_flags);
}

/*
 * Clear the uptodate bits for [@start, @start + @len) of @folio.
 *
 * The folio uptodate flag is cleared unconditionally.
 */
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
                                  struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *sp;
        unsigned int first_bit;
        unsigned int nr_sectors;
        unsigned long irq_flags;

        sp = folio_get_private(folio);
        first_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len);
        nr_sectors = len >> fs_info->sectorsize_bits;

        spin_lock_irqsave(&sp->lock, irq_flags);
        bitmap_clear(sp->bitmaps, first_bit, nr_sectors);
        folio_clear_uptodate(folio);
        spin_unlock_irqrestore(&sp->lock, irq_flags);
}

/*
 * Set the dirty bits for [@start, @start + @len) of @folio and mark the folio
 * dirty.
 *
 * folio_mark_dirty() is called after the subpage lock has been dropped.
 */
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
                             struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *sp;
        unsigned int first_bit;
        unsigned int nr_sectors;
        unsigned long irq_flags;

        sp = folio_get_private(folio);
        first_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len);
        nr_sectors = len >> fs_info->sectorsize_bits;

        spin_lock_irqsave(&sp->lock, irq_flags);
        bitmap_set(sp->bitmaps, first_bit, nr_sectors);
        spin_unlock_irqrestore(&sp->lock, irq_flags);

        folio_mark_dirty(folio);
}

/*
 * Clear the dirty bits for [@start, @start + @len) of @folio and report
 * whether the dirty bitmap is now empty.
 *
 * Return true if no dirty bit is left in the page after the clear.
 * Return false otherwise.
 *
 * NOTE: The folio dirty flag is not touched here.  Callers should manually
 * clear page dirty for the true case, as we have extra handling for tree
 * blocks.
 */
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
                                        struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *sp;
        unsigned int first_bit;
        unsigned int nr_sectors;
        unsigned long irq_flags;
        bool none_left;

        sp = folio_get_private(folio);
        first_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len);
        nr_sectors = len >> fs_info->sectorsize_bits;

        spin_lock_irqsave(&sp->lock, irq_flags);
        bitmap_clear(sp->bitmaps, first_bit, nr_sectors);
        none_left = subpage_test_bitmap_all_zero(fs_info, sp, dirty);
        spin_unlock_irqrestore(&sp->lock, irq_flags);

        return none_left;
}

/*
 * Clear the dirty bits for [@start, @start + @len) of @folio.
 *
 * When that leaves no dirty sector in the page, the folio dirty flag is
 * cleared too through folio_clear_dirty_for_io().
 */
void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
                               struct folio *folio, u64 start, u32 len)
{
        if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, start, len))
                folio_clear_dirty_for_io(folio);
}

/*
 * Set the writeback bits for [@start, @start + @len) of @folio.
 *
 * Writeback is started on the folio unless it is already under writeback.
 */
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
                                 struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *sp;
        unsigned int first_bit;
        unsigned int nr_sectors;
        unsigned long irq_flags;

        sp = folio_get_private(folio);
        first_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len);
        nr_sectors = len >> fs_info->sectorsize_bits;

        spin_lock_irqsave(&sp->lock, irq_flags);
        bitmap_set(sp->bitmaps, first_bit, nr_sectors);
        if (!folio_test_writeback(folio))
                folio_start_writeback(folio);
        spin_unlock_irqrestore(&sp->lock, irq_flags);
}

/*
 * Clear the writeback bits for [@start, @start + @len) of @folio.
 *
 * When no sector of the page is under writeback anymore, writeback is ended
 * on the folio.
 */
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
                                   struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *sp;
        unsigned int first_bit;
        unsigned int nr_sectors;
        unsigned long irq_flags;

        sp = folio_get_private(folio);
        first_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len);
        nr_sectors = len >> fs_info->sectorsize_bits;

        spin_lock_irqsave(&sp->lock, irq_flags);
        bitmap_clear(sp->bitmaps, first_bit, nr_sectors);
        if (subpage_test_bitmap_all_zero(fs_info, sp, writeback)) {
                /* The folio flag must still be set while any bit was set. */
                ASSERT(folio_test_writeback(folio));
                folio_end_writeback(folio);
        }
        spin_unlock_irqrestore(&sp->lock, irq_flags);
}

/*
 * Set the ordered bits for [@start, @start + @len) of @folio and set the
 * ordered flag on the folio.
 */
void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
                               struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *sp;
        unsigned int first_bit;
        unsigned int nr_sectors;
        unsigned long irq_flags;

        sp = folio_get_private(folio);
        first_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len);
        nr_sectors = len >> fs_info->sectorsize_bits;

        spin_lock_irqsave(&sp->lock, irq_flags);
        bitmap_set(sp->bitmaps, first_bit, nr_sectors);
        folio_set_ordered(folio);
        spin_unlock_irqrestore(&sp->lock, irq_flags);
}

/*
 * Clear the ordered bits for [@start, @start + @len) of @folio.
 *
 * The ordered flag of the folio is cleared once no sector of the page has its
 * ordered bit set anymore.
 */
void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
                                 struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *sp;
        unsigned int first_bit;
        unsigned int nr_sectors;
        unsigned long irq_flags;

        sp = folio_get_private(folio);
        first_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len);
        nr_sectors = len >> fs_info->sectorsize_bits;

        spin_lock_irqsave(&sp->lock, irq_flags);
        bitmap_clear(sp->bitmaps, first_bit, nr_sectors);
        if (subpage_test_bitmap_all_zero(fs_info, sp, ordered))
                folio_clear_ordered(folio);
        spin_unlock_irqrestore(&sp->lock, irq_flags);
}

/*
 * Set the checked bits for [@start, @start + @len) of @folio.
 *
 * The checked flag of the folio is set once every sector of the page has its
 * checked bit set.
 */
void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
                               struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *sp;
        unsigned int first_bit;
        unsigned int nr_sectors;
        unsigned long irq_flags;

        sp = folio_get_private(folio);
        first_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len);
        nr_sectors = len >> fs_info->sectorsize_bits;

        spin_lock_irqsave(&sp->lock, irq_flags);
        bitmap_set(sp->bitmaps, first_bit, nr_sectors);
        if (subpage_test_bitmap_all_set(fs_info, sp, checked))
                folio_set_checked(folio);
        spin_unlock_irqrestore(&sp->lock, irq_flags);
}

/*
 * Clear the checked bits for [@start, @start + @len) of @folio.
 *
 * The checked flag of the folio is cleared unconditionally.
 */
void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
                                 struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *sp;
        unsigned int first_bit;
        unsigned int nr_sectors;
        unsigned long irq_flags;

        sp = folio_get_private(folio);
        first_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len);
        nr_sectors = len >> fs_info->sectorsize_bits;

        spin_lock_irqsave(&sp->lock, irq_flags);
        bitmap_clear(sp->bitmaps, first_bit, nr_sectors);
        folio_clear_checked(folio);
        spin_unlock_irqrestore(&sp->lock, irq_flags);
}

/*
 * Generate btrfs_subpage_test_<name>() for one subpage bitmap.
 *
 * Unlike set/clear, which depend on the status of each page, all bits are
 * tested in the same way: the generated function returns true only if every
 * bit of the @name bitmap covering [@start, @start + @len) is set.  The test
 * is done under subpage::lock.
 *
 * Instantiated below for the uptodate, dirty, writeback, ordered and checked
 * bitmaps.
 */
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)                                \
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,        \
                               struct folio *folio, u64 start, u32 len)        \
{                                                                        \
        struct btrfs_subpage *subpage = folio_get_private(folio);        \
        unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,        \
                                                name, start, len);        \
        unsigned long flags;                                                \
        bool ret;                                                        \
                                                                        \
        spin_lock_irqsave(&subpage->lock, flags);                        \
        ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit,        \
                                len >> fs_info->sectorsize_bits);        \
        spin_unlock_irqrestore(&subpage->lock, flags);                        \
        return ret;                                                        \
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);

/*
 * Generate the btrfs_folio_{set,clear,test}_<name>() helpers and their
 * btrfs_folio_clamp_{set,clear,test}_<name>() variants for one folio state.
 *
 * Each helper acts on the whole folio through the supplied folio_*_func when
 * @fs_info is NULL or btrfs_is_subpage() is false for the folio's mapping.
 * Otherwise it forwards the [start, start + len) range to the matching
 * btrfs_subpage_{set,clear,test}_<name>() helper.  The clamp variants pass
 * @start and @len through btrfs_subpage_clamp_range() before forwarding.
 *
 * Note that, in selftests (extent-io-tests), we can have empty fs_info passed
 * in.  We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
 * back to regular sectorsize branch.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, folio_set_func,                        \
                                 folio_clear_func, folio_test_func)        \
void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info,        \
                            struct folio *folio, u64 start, u32 len)        \
{                                                                        \
        if (unlikely(!fs_info) ||                                        \
            !btrfs_is_subpage(fs_info, folio->mapping)) {                \
                folio_set_func(folio);                                        \
                return;                                                        \
        }                                                                \
        btrfs_subpage_set_##name(fs_info, folio, start, len);                \
}                                                                        \
void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info,        \
                              struct folio *folio, u64 start, u32 len)        \
{                                                                        \
        if (unlikely(!fs_info) ||                                        \
            !btrfs_is_subpage(fs_info, folio->mapping)) {                \
                folio_clear_func(folio);                                \
                return;                                                        \
        }                                                                \
        btrfs_subpage_clear_##name(fs_info, folio, start, len);                \
}                                                                        \
bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info,        \
                             struct folio *folio, u64 start, u32 len)        \
{                                                                        \
        if (unlikely(!fs_info) ||                                        \
            !btrfs_is_subpage(fs_info, folio->mapping))                        \
                return folio_test_func(folio);                                \
        return btrfs_subpage_test_##name(fs_info, folio, start, len);        \
}                                                                        \
void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info,        \
                                  struct folio *folio, u64 start, u32 len) \
{                                                                        \
        if (unlikely(!fs_info) ||                                        \
            !btrfs_is_subpage(fs_info, folio->mapping)) {                \
                folio_set_func(folio);                                        \
                return;                                                        \
        }                                                                \
        btrfs_subpage_clamp_range(folio, &start, &len);                        \
        btrfs_subpage_set_##name(fs_info, folio, start, len);                \
}                                                                        \
void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
                                    struct folio *folio, u64 start, u32 len) \
{                                                                        \
        if (unlikely(!fs_info) ||                                        \
            !btrfs_is_subpage(fs_info, folio->mapping)) {                \
                folio_clear_func(folio);                                \
                return;                                                        \
        }                                                                \
        btrfs_subpage_clamp_range(folio, &start, &len);                        \
        btrfs_subpage_clear_##name(fs_info, folio, start, len);                \
}                                                                        \
bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info,        \
                                   struct folio *folio, u64 start, u32 len) \
{                                                                        \
        if (unlikely(!fs_info) ||                                        \
            !btrfs_is_subpage(fs_info, folio->mapping))                        \
                return folio_test_func(folio);                                \
        btrfs_subpage_clamp_range(folio, &start, &len);                        \
        return btrfs_subpage_test_##name(fs_info, folio, start, len);        \
}
/*
 * Instantiate the folio helpers for every tracked state, pairing each state
 * name with the whole-folio set/clear/test functions used on the non-subpage
 * path.
 */
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate,
                         folio_test_uptodate);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io,
                         folio_test_dirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback,
                         folio_test_writeback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
                         folio_test_ordered);
IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
                         folio_test_checked);

/*
 * Debug check that @folio carries no dirty state at all: neither the folio
 * dirty flag nor, for subpage mappings, any bit of the subpage dirty bitmap.
 *
 * Does nothing unless CONFIG_BTRFS_ASSERT is enabled.
 */
void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
        struct btrfs_subpage *subpage;

        if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
                return;

        ASSERT(!folio_test_dirty(folio));

        /* Only subpage mappings carry the extra per-sector dirty bitmap. */
        if (btrfs_is_subpage(fs_info, folio->mapping)) {
                ASSERT(folio_test_private(folio) && folio_get_private(folio));
                subpage = folio_get_private(folio);
                ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
        }
}

/*
 * Unlock a locked folio, whichever way it was locked:
 *
 * - Folio locked by plain lock_page()
 *   It has no subpage::writers count and is released with folio_unlock().
 *   This is the most common case, for __extent_writepage() called inside
 *   extent_write_cache_pages().  A rarer one is the @locked_page from
 *   extent_write_locked_range().
 *
 * - Folio locked by lock_delalloc_pages()
 *   Only one caller: every page except @locked_page for
 *   extent_write_locked_range().  These carry a writers count and must be
 *   released through the subpage helper.
 */
void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
                               struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage *sp;

        ASSERT(folio_test_locked(folio));

        /* Non-subpage case: a plain unlock is all that is needed. */
        if (!btrfs_is_subpage(fs_info, folio->mapping)) {
                folio_unlock(folio);
                return;
        }

        ASSERT(folio_test_private(folio) && folio_get_private(folio));
        sp = folio_get_private(folio);

        /*
         * Subpage case: the folio is locked either with or without a writers
         * count.
         *
         * Since we own the page lock, no one else could touch subpage::writers
         * and we are safe to do several atomic operations without spinlock.
         */
        if (atomic_read(&sp->writers) != 0) {
                /* Have writers, use proper subpage helper to end it */
                btrfs_folio_end_writer_lock(fs_info, folio, start, len);
        } else {
                /* No writers, locked by plain lock_page() */
                folio_unlock(folio);
        }
}

/*
 * Extract the @name bitmap of @subpage into the caller-provided @dst by
 * calling bitmap_cut() on subpage->bitmaps, with the bitmap's offset
 * (subpage_info->name##_offset) as the cut length and
 * subpage_info->bitmap_nr_bits as the bit count.
 *
 * NOTE(review): the caller in this file passes the address of a single
 * unsigned long as @dst, so this presumably relies on bitmap_nr_bits not
 * exceeding BITS_PER_LONG - confirm.
 */
#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst)                \
        bitmap_cut(dst, subpage->bitmaps, 0,                                \
                   subpage_info->name##_offset, subpage_info->bitmap_nr_bits)

/*
 * Dump @folio and a snapshot of all of its subpage bitmaps to the kernel log.
 *
 * @fs_info: filesystem the folio belongs to; must have subpage_info set
 * @folio:   folio whose private data is a struct btrfs_subpage
 * @start:   only echoed in the message, to identify the offending range
 * @len:     only echoed in the message, to identify the offending range
 *
 * The bitmaps are copied under subpage->lock so the printed values are
 * mutually consistent; the printing itself happens after the lock is dropped.
 */
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
                                      struct folio *folio, u64 start, u32 len)
{
        struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
        struct btrfs_subpage *subpage;
        unsigned long uptodate_bitmap;
        unsigned long dirty_bitmap;
        unsigned long locked_bitmap;
        unsigned long writeback_bitmap;
        unsigned long ordered_bitmap;
        unsigned long checked_bitmap;
        unsigned long flags;

        ASSERT(folio_test_private(folio) && folio_get_private(folio));
        ASSERT(subpage_info);
        subpage = folio_get_private(folio);

        spin_lock_irqsave(&subpage->lock, flags);
        GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap);
        GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap);
        GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap);
        GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap);
        GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap);
        /*
         * The locked bitmap gets its own destination; writing it into
         * checked_bitmap would make the "checked" field report lock state.
         */
        GET_SUBPAGE_BITMAP(subpage, subpage_info, locked, &locked_bitmap);
        spin_unlock_irqrestore(&subpage->lock, flags);

        dump_page(folio_page(folio, 0), "btrfs subpage dump");
        /* Print only bitmaps that were actually filled in above. */
        btrfs_warn(fs_info,
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
                    start, len, folio_pos(folio),
                    subpage_info->bitmap_nr_bits, &uptodate_bitmap,
                    subpage_info->bitmap_nr_bits, &dirty_bitmap,
                    subpage_info->bitmap_nr_bits, &locked_bitmap,
                    subpage_info->bitmap_nr_bits, &writeback_bitmap,
                    subpage_info->bitmap_nr_bits, &ordered_bitmap,
                    subpage_info->bitmap_nr_bits, &checked_bitmap);
}





















































































































































































    5 













    5 




    5 
    4 
    5 





    5 

















    3 


































































































































































































































































































































































































































































    3 



















    3 







































    3 






    3 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>
#include <linux/rculist.h>
#include <trace/events/vmscan.h>

#include "internal.h"

/* Global list of shrinkers; the global shrink path in shrink_slab() walks it. */
LIST_HEAD(shrinker_list);
/*
 * Taken around shrinker id allocation and around allocating, expanding and
 * reparenting the per-memcg shrinker_info structures.
 */
DEFINE_MUTEX(shrinker_mutex);

#ifdef CONFIG_MEMCG
/*
 * Number of shrinker ids each shrinker_info is currently sized for; used to
 * size new allocations and raised by expand_shrinker_info().
 */
static int shrinker_nr_max;

/*
 * Size in bytes of the unit pointer array needed to cover @nr_items shrinker
 * ids, one pointer per SHRINKER_UNIT_BITS ids (rounded up).
 */
static inline int shrinker_unit_size(int nr_items)
{
        int nr_units = DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS);

        return nr_units * sizeof(struct shrinker_info_unit *);
}

/*
 * Free the units of @info from slot @start onwards and clear their slots.
 * A NULL @info is ignored.  The walk ends at the first empty slot or at the
 * end of the array, whichever comes first.
 */
static inline void shrinker_unit_free(struct shrinker_info *info, int start)
{
        int idx, nr_units;

        if (!info)
                return;

        nr_units = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);

        for (idx = start; idx < nr_units && info->unit[idx]; idx++) {
                kfree(info->unit[idx]);
                info->unit[idx] = NULL;
        }
}

/*
 * Allocate the units of @new that @old does not already provide.
 *
 * Slots below the unit count of @old (or below 0 when @old is NULL) are left
 * untouched; every slot from there up to the unit count of @new receives a
 * zeroed unit allocated on node @nid.
 *
 * Returns 0 on success.  On allocation failure the units allocated by this
 * call are freed again and -ENOMEM is returned.
 */
static inline int shrinker_unit_alloc(struct shrinker_info *new,
                                       struct shrinker_info *old, int nid)
{
        int first = 0;
        int end = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
        int idx;

        if (old)
                first = DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS);

        for (idx = first; idx < end; idx++) {
                struct shrinker_info_unit *fresh;

                fresh = kzalloc_node(sizeof(*fresh), GFP_KERNEL, nid);
                if (fresh) {
                        new->unit[idx] = fresh;
                        continue;
                }

                shrinker_unit_free(new, first);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Release the shrinker_info of @memcg on every node: all of its units, then
 * the structure itself, and finally reset the per-node pointer to NULL.
 */
void free_shrinker_info(struct mem_cgroup *memcg)
{
        int nid;

        for_each_node(nid) {
                struct mem_cgroup_per_node *node_info = memcg->nodeinfo[nid];
                struct shrinker_info *victim;

                victim = rcu_dereference_protected(node_info->shrinker_info, true);
                shrinker_unit_free(victim, 0);
                kvfree(victim);
                rcu_assign_pointer(node_info->shrinker_info, NULL);
        }
}

/*
 * Allocate and publish a shrinker_info, sized for shrinker_nr_max ids, on
 * every node of @memcg.
 *
 * Runs under shrinker_mutex.
 *
 * Returns 0 on success.  On failure everything already published for @memcg
 * is torn down again through free_shrinker_info() and -ENOMEM is returned.
 */
int alloc_shrinker_info(struct mem_cgroup *memcg)
{
        int nid;
        int array_size;

        mutex_lock(&shrinker_mutex);
        array_size = shrinker_unit_size(shrinker_nr_max);
        for_each_node(nid) {
                struct shrinker_info *info;

                info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
                if (!info)
                        goto err;
                info->map_nr_max = shrinker_nr_max;
                if (shrinker_unit_alloc(info, NULL, nid)) {
                        /*
                         * @info has not been published yet, so the
                         * free_shrinker_info() call below cannot reach it.
                         * Free it here or it is leaked.
                         */
                        kvfree(info);
                        goto err;
                }
                rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
        }
        mutex_unlock(&shrinker_mutex);

        return 0;

err:
        mutex_unlock(&shrinker_mutex);
        free_shrinker_info(memcg);
        return -ENOMEM;
}

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
                                                     int nid)
{
        return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
                                         lockdep_is_held(&shrinker_mutex));
}

/*
 * Grow the shrinker_info of @memcg on every node so it covers @new_nr_max
 * shrinker ids.
 *
 * @new_size and @old_size are the byte sizes of the new and the current unit
 * pointer arrays.  The existing unit pointers are copied over, the missing
 * units are allocated, the new structure is published and the old one is
 * handed to kvfree_rcu().
 *
 * Returns 0 on success, or as soon as a node without shrinker_info is seen;
 * returns -ENOMEM when an allocation fails.
 */
static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
                                    int old_size, int new_nr_max)
{
        int nid;

        for_each_node(nid) {
                struct mem_cgroup_per_node *node_info = memcg->nodeinfo[nid];
                struct shrinker_info *cur = shrinker_info_protected(memcg, nid);
                struct shrinker_info *grown;

                /* Not yet online memcg */
                if (!cur)
                        return 0;

                /* Already expanded this shrinker_info */
                if (cur->map_nr_max >= new_nr_max)
                        continue;

                grown = kvzalloc_node(sizeof(*grown) + new_size, GFP_KERNEL, nid);
                if (!grown)
                        return -ENOMEM;

                grown->map_nr_max = new_nr_max;

                /* Existing units are shared with the new array, not copied. */
                memcpy(grown->unit, cur->unit, old_size);
                if (shrinker_unit_alloc(grown, cur, nid)) {
                        kvfree(grown);
                        return -ENOMEM;
                }

                rcu_assign_pointer(node_info->shrinker_info, grown);
                kvfree_rcu(cur, rcu);
        }

        return 0;
}

/*
 * Make room for shrinker id @new_id in the shrinker_info of every memcg.
 *
 * The new capacity is @new_id + 1 rounded up to a multiple of
 * SHRINKER_UNIT_BITS.  Must be called with shrinker_mutex held.
 *
 * Returns 0 and updates shrinker_nr_max on success (including when there is
 * no root memcg yet); otherwise returns the error from
 * expand_one_shrinker_info() and leaves shrinker_nr_max alone.
 */
static int expand_shrinker_info(int new_id)
{
        const int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
        struct mem_cgroup *iter;
        int new_size, old_size;
        int err;

        if (root_mem_cgroup) {
                lockdep_assert_held(&shrinker_mutex);

                new_size = shrinker_unit_size(new_nr_max);
                old_size = shrinker_unit_size(shrinker_nr_max);

                iter = mem_cgroup_iter(NULL, NULL, NULL);
                do {
                        err = expand_one_shrinker_info(iter, new_size, old_size,
                                                       new_nr_max);
                        if (err) {
                                mem_cgroup_iter_break(NULL, iter);
                                return err;
                        }
                        iter = mem_cgroup_iter(NULL, iter, NULL);
                } while (iter != NULL);
        }

        shrinker_nr_max = new_nr_max;
        return 0;
}

/* Index of the unit, within shrinker_info::unit[], that holds @shrinker_id. */
static inline int shrinker_id_to_index(int shrinker_id)
{
        int unit_index = shrinker_id / SHRINKER_UNIT_BITS;

        return unit_index;
}

/* Position of @shrinker_id inside the unit that holds it. */
static inline int shrinker_id_to_offset(int shrinker_id)
{
        int unit_offset = shrinker_id % SHRINKER_UNIT_BITS;

        return unit_offset;
}

/* Rebuild a shrinker id from a unit @index and an @offset within that unit. */
static inline int calc_shrinker_id(int index, int offset)
{
        int unit_base = index * SHRINKER_UNIT_BITS;

        return unit_base + offset;
}

/*
 * Mark shrinker @shrinker_id as having objects for @memcg on node @nid by
 * setting its bit in the memcg's shrinker map.
 *
 * Nothing is done for a negative id, a NULL @memcg or the root memcg.  An id
 * at or beyond the current map size triggers a one-time warning and is
 * otherwise ignored.
 */
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
        struct shrinker_info *info;
        struct shrinker_info_unit *unit;

        if (shrinker_id < 0 || !memcg || mem_cgroup_is_root(memcg))
                return;

        rcu_read_lock();
        info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
        if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
                /*
                 * Index the unit array only after the bounds check, so an
                 * out-of-range id never reads past the end of info->unit[].
                 */
                unit = info->unit[shrinker_id_to_index(shrinker_id)];
                /* Pairs with smp mb in shrink_slab() */
                smp_mb__before_atomic();
                set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
        }
        rcu_read_unlock();
}

/*
 * Maps a shrinker id to its struct shrinker.  Ids are handed out in
 * shrinker_memcg_alloc() and looked up in shrink_slab_memcg().
 */
static DEFINE_IDR(shrinker_idr);

/*
 * Assign an id to @shrinker and make sure every memcg's shrinker_info has
 * room for it.
 *
 * Returns 0 with shrinker->id set on success, -ENOSYS when memcg is disabled,
 * and -ENOMEM when either the id allocation or the expansion fails (the id is
 * released again in the latter case).
 */
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
        int new_id;
        int err = -ENOMEM;

        if (mem_cgroup_disabled())
                return -ENOSYS;

        mutex_lock(&shrinker_mutex);
        new_id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
        if (new_id >= 0) {
                if (new_id >= shrinker_nr_max && expand_shrinker_info(new_id)) {
                        idr_remove(&shrinker_idr, new_id);
                } else {
                        shrinker->id = new_id;
                        err = 0;
                }
        }
        mutex_unlock(&shrinker_mutex);

        return err;
}

/*
 * Give the id of @shrinker back to shrinker_idr.  The caller must hold
 * shrinker_mutex; a negative id is a fatal bug.
 */
static void shrinker_memcg_remove(struct shrinker *shrinker)
{
        BUG_ON(shrinker->id < 0);

        lockdep_assert_held(&shrinker_mutex);

        idr_remove(&shrinker_idr, shrinker->id);
}

/*
 * Atomically fetch and zero the deferred-work counter that @memcg keeps for
 * @shrinker on node @nid.  Returns the value the counter held.
 */
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
                                   struct mem_cgroup *memcg)
{
        const int idx = shrinker_id_to_index(shrinker->id);
        const int off = shrinker_id_to_offset(shrinker->id);
        struct shrinker_info *info;
        long prev;

        /* The shrinker_info is looked up under the RCU read lock. */
        rcu_read_lock();
        info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
        prev = atomic_long_xchg(&info->unit[idx]->nr_deferred[off], 0);
        rcu_read_unlock();

        return prev;
}

/*
 * Atomically add @nr to the deferred-work counter that @memcg keeps for
 * @shrinker on node @nid.  Returns the counter value after the addition.
 */
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
                                  struct mem_cgroup *memcg)
{
        const int idx = shrinker_id_to_index(shrinker->id);
        const int off = shrinker_id_to_offset(shrinker->id);
        struct shrinker_info *info;
        long updated;

        /* The shrinker_info is looked up under the RCU read lock. */
        rcu_read_lock();
        info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
        updated = atomic_long_add_return(nr, &info->unit[idx]->nr_deferred[off]);
        rcu_read_unlock();

        return updated;
}

/*
 * Add every deferred-work counter of @memcg to the matching counter of its
 * parent (or of the root memcg when there is no parent), on all nodes.
 */
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
        struct mem_cgroup *dest = parent_mem_cgroup(memcg);
        int nid;

        if (!dest)
                dest = root_mem_cgroup;

        /* Prevent from concurrent shrinker_info expand */
        mutex_lock(&shrinker_mutex);
        for_each_node(nid) {
                struct shrinker_info *from = shrinker_info_protected(memcg, nid);
                struct shrinker_info *to = shrinker_info_protected(dest, nid);
                int index;

                for (index = 0; index < shrinker_id_to_index(from->map_nr_max); index++) {
                        struct shrinker_info_unit *src = from->unit[index];
                        struct shrinker_info_unit *dst = to->unit[index];
                        int offset;

                        for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
                                long pending = atomic_long_read(&src->nr_deferred[offset]);

                                atomic_long_add(pending, &dst->nr_deferred[offset]);
                        }
                }
        }
        mutex_unlock(&shrinker_mutex);
}
#else
/* !CONFIG_MEMCG stub: no per-memcg shrinker ids exist, always -ENOSYS. */
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
        return -ENOSYS;
}

/* !CONFIG_MEMCG stub: nothing was allocated, so nothing to remove. */
static void shrinker_memcg_remove(struct shrinker *shrinker)
{
}

/* !CONFIG_MEMCG stub: there is no per-memcg deferred counter, report 0. */
static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
                                   struct mem_cgroup *memcg)
{
        return 0;
}

/* !CONFIG_MEMCG stub: @nr is discarded and 0 is returned. */
static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
                                  struct mem_cgroup *memcg)
{
        return 0;
}
#endif /* CONFIG_MEMCG */

static long xchg_nr_deferred(struct shrinker *shrinker,
                             struct shrink_control *sc)
{
        int nid = sc->nid;

        if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
                nid = 0;

        if (sc->memcg &&
            (shrinker->flags & SHRINKER_MEMCG_AWARE))
                return xchg_nr_deferred_memcg(nid, shrinker,
                                              sc->memcg);

        return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}


static long add_nr_deferred(long nr, struct shrinker *shrinker,
                            struct shrink_control *sc)
{
        int nid = sc->nid;

        if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
                nid = 0;

        if (sc->memcg &&
            (shrinker->flags & SHRINKER_MEMCG_AWARE))
                return add_nr_deferred_memcg(nr, nid, shrinker,
                                             sc->memcg);

        return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}

/* Batch size used by do_shrink_slab() when a shrinker leaves ->batch at 0. */
#define SHRINK_BATCH 128

/*
 * Run one shrinker for the node/memcg described by @shrinkctl.
 *
 * Asks the shrinker how many objects are freeable, derives a scan target from
 * that count, @priority and the previously deferred work, then calls
 * ->scan_objects() in batches until the target is used up or the shrinker
 * returns SHRINK_STOP.  Work that was planned but not performed is added back
 * to the deferred counter.
 *
 * Returns the number of objects freed, or the raw ->count_objects() result
 * when that is 0 or SHRINK_EMPTY.
 */
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                                    struct shrinker *shrinker, int priority)
{
        unsigned long freed = 0;
        unsigned long long delta;
        long total_scan;
        long freeable;
        long nr;
        long new_nr;
        long batch_size = shrinker->batch ? shrinker->batch
                                          : SHRINK_BATCH;
        long scanned = 0, next_deferred;

        freeable = shrinker->count_objects(shrinker, shrinkctl);
        if (freeable == 0 || freeable == SHRINK_EMPTY)
                return freeable;

        /*
         * copy the current shrinker scan count into a local variable
         * and zero it so that other concurrent shrinker invocations
         * don't also do this scanning work.
         */
        nr = xchg_nr_deferred(shrinker, shrinkctl);

        if (shrinker->seeks) {
                /* New work: (freeable >> priority) * 4 / seeks. */
                delta = freeable >> priority;
                delta *= 4;
                do_div(delta, shrinker->seeks);
        } else {
                /*
                 * These objects don't require any IO to create. Trim
                 * them aggressively under memory pressure to keep
                 * them from causing refetches in the IO caches.
                 */
                delta = freeable / 2;
        }

        /* Deferred work is scaled by priority too; the sum is capped at 2 * freeable. */
        total_scan = nr >> priority;
        total_scan += delta;
        total_scan = min(total_scan, (2 * freeable));

        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
                                   freeable, delta, total_scan, priority);

        /*
         * Normally, we should not scan less than batch_size objects in one
         * pass to avoid too frequent shrinker calls, but if the slab has less
         * than batch_size objects in total and we are really tight on memory,
         * we will try to reclaim all available objects, otherwise we can end
         * up failing allocations although there are plenty of reclaimable
         * objects spread over several slabs with usage less than the
         * batch_size.
         *
         * We detect the "tight on memory" situations by looking at the total
         * number of objects we want to scan (total_scan). If it is greater
         * than the total number of objects on slab (freeable), we must be
         * scanning at high prio and therefore should try to reclaim as much as
         * possible.
         */
        while (total_scan >= batch_size ||
               total_scan >= freeable) {
                unsigned long ret;
                unsigned long nr_to_scan = min(batch_size, total_scan);

                /*
                 * nr_scanned is preset to the request and read back after the
                 * call for all of the accounting below.
                 */
                shrinkctl->nr_to_scan = nr_to_scan;
                shrinkctl->nr_scanned = nr_to_scan;
                ret = shrinker->scan_objects(shrinker, shrinkctl);
                if (ret == SHRINK_STOP)
                        break;
                freed += ret;

                count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
                total_scan -= shrinkctl->nr_scanned;
                scanned += shrinkctl->nr_scanned;

                cond_resched();
        }

        /*
         * The deferred work is increased by any new work (delta) that wasn't
         * done, decreased by old deferred work that was done now.
         *
         * And it is capped to two times of the freeable items.
         */
        next_deferred = max_t(long, (nr + delta - scanned), 0);
        next_deferred = min(next_deferred, (2 * freeable));

        /*
         * move the unused scan count back into the shrinker in a
         * manner that handles concurrent updates.
         */
        new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

        trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
        return freed;
}

#ifdef CONFIG_MEMCG
/*
 * Run every shrinker whose bit is set in the shrinker map of @memcg for node
 * @nid.
 *
 * @gfp_mask: allocation context, passed to the shrinkers
 * @nid:      node to shrink
 * @memcg:    memory cgroup to shrink; nothing is done if it is not online
 * @priority: reclaim priority, passed on to do_shrink_slab()
 *
 * Returns the total number of objects the shrinkers reported as freed.
 */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        struct mem_cgroup *memcg, int priority)
{
        struct shrinker_info *info;
        unsigned long ret, freed = 0;
        int offset, index = 0;

        if (!mem_cgroup_online(memcg))
                return 0;

        /*
         * lockless algorithm of memcg shrink.
         *
         * The shrinker_info may be freed asynchronously via RCU in the
         * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used
         * to ensure the existence of the shrinker_info.
         *
         * The shrinker_info_unit is never freed unless its corresponding memcg
         * is destroyed. Here we already hold the refcount of memcg, so the
         * memcg will not be destroyed, and of course shrinker_info_unit will
         * not be freed.
         *
         * So in the memcg shrink:
         *  step 1: use rcu_read_lock() to guarantee existence of the
         *          shrinker_info.
         *  step 2: after getting shrinker_info_unit we can safely release the
         *          RCU lock.
         *  step 3: traverse the bitmap and calculate shrinker_id
         *  step 4: use rcu_read_lock() to guarantee existence of the shrinker.
         *  step 5: use shrinker_id to find the shrinker, then use
         *          shrinker_try_get() to guarantee existence of the shrinker,
         *          then we can release the RCU lock to do do_shrink_slab() that
         *          may sleep.
         *  step 6: do shrinker_put() paired with step 5 to put the refcount,
         *          if the refcount reaches 0, then wake up the waiter in
         *          shrinker_free() by calling complete().
         *          Note: here is different from the global shrink, we don't
         *                need to acquire the RCU lock to guarantee existence of
         *                the shrinker, because we don't need to use this
         *                shrinker to traverse the next shrinker in the bitmap.
         *  step 7: we have already exited the read-side of rcu critical section
         *          before calling do_shrink_slab(), the shrinker_info may be
         *          released in expand_one_shrinker_info(), so go back to step 1
         *          to reacquire the shrinker_info.
         */
again:
        rcu_read_lock();
        info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
        if (unlikely(!info))
                goto unlock;

        if (index < shrinker_id_to_index(info->map_nr_max)) {
                struct shrinker_info_unit *unit;

                unit = info->unit[index];

                rcu_read_unlock();

                for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
                        struct shrink_control sc = {
                                .gfp_mask = gfp_mask,
                                .nid = nid,
                                .memcg = memcg,
                        };
                        struct shrinker *shrinker;
                        int shrinker_id = calc_shrinker_id(index, offset);

                        rcu_read_lock();
                        shrinker = idr_find(&shrinker_idr, shrinker_id);
                        if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
                                clear_bit(offset, unit->map);
                                rcu_read_unlock();
                                continue;
                        }
                        rcu_read_unlock();

                        /* Call non-slab shrinkers even though kmem is disabled */
                        if (!memcg_kmem_online() &&
                            !(shrinker->flags & SHRINKER_NONSLAB)) {
                                /*
                                 * Drop the reference taken by
                                 * shrinker_try_get() above; skipping the
                                 * shrinker must not leak it.
                                 */
                                shrinker_put(shrinker);
                                continue;
                        }

                        ret = do_shrink_slab(&sc, shrinker, priority);
                        if (ret == SHRINK_EMPTY) {
                                clear_bit(offset, unit->map);
                                /*
                                 * After the shrinker reported that it had no objects to
                                 * free, but before we cleared the corresponding bit in
                                 * the memcg shrinker map, a new object might have been
                                 * added. To make sure, we have the bit set in this
                                 * case, we invoke the shrinker one more time and reset
                                 * the bit if it reports that it is not empty anymore.
                                 * The memory barrier here pairs with the barrier in
                                 * set_shrinker_bit():
                                 *
                                 * list_lru_add()     shrink_slab_memcg()
                                 *   list_add_tail()    clear_bit()
                                 *   <MB>               <MB>
                                 *   set_bit()          do_shrink_slab()
                                 */
                                smp_mb__after_atomic();
                                ret = do_shrink_slab(&sc, shrinker, priority);
                                if (ret == SHRINK_EMPTY)
                                        ret = 0;
                                else
                                        set_shrinker_bit(memcg, nid, shrinker_id);
                        }
                        freed += ret;
                        shrinker_put(shrinker);
                }

                index++;
                goto again;
        }
unlock:
        rcu_read_unlock();
        return freed;
}
#else /* !CONFIG_MEMCG */
/*
 * Stub for !CONFIG_MEMCG builds: performs no memcg-aware shrinking and
 * reports zero objects freed.
 */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        struct mem_cgroup *memcg, int priority)
{
        return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
                          int priority)
{
        unsigned long ret, freed = 0;
        struct shrinker *shrinker;

        /*
         * The root memcg might be allocated even though memcg is disabled
         * via "cgroup_disable=memory" boot parameter.  This could make
         * mem_cgroup_is_root() return false, then just run memcg slab
         * shrink, but skip global shrink.  This may result in premature
         * oom.
         */
        if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
                return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

        /*
         * Lockless algorithm of global shrink.
         *
         * In the unregistration step, the shrinker will be freed asynchronously
         * via RCU after its refcount reaches 0. So both rcu_read_lock() and
         * shrinker_try_get() can be used to ensure the existence of the shrinker.
         *
         * So in the global shrink:
         *  step 1: use rcu_read_lock() to guarantee existence of the shrinker
         *          and the validity of the shrinker_list walk.
         *  step 2: use shrinker_try_get() to try get the refcount, if successful,
         *          then the existence of the shrinker can also be guaranteed,
         *          so we can release the RCU lock to do do_shrink_slab() that
         *          may sleep.
         *  step 3: *MUST* reacquire the RCU lock before calling shrinker_put(),
         *          which ensures that neither this shrinker nor the next shrinker
         *          will be freed in the next traversal operation.
         *  step 4: do shrinker_put() paired with step 2 to put the refcount,
         *          if the refcount reaches 0, then wake up the waiter in
         *          shrinker_free() by calling complete().
         */
        rcu_read_lock();
        list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
                struct shrink_control sc = {
                        .gfp_mask = gfp_mask,
                        .nid = nid,
                        .memcg = memcg,
                };

                /* Step 2: skip shrinkers whose reference cannot be taken. */
                if (!shrinker_try_get(shrinker))
                        continue;

                rcu_read_unlock();

                ret = do_shrink_slab(&sc, shrinker, priority);
                /* Treat SHRINK_EMPTY as zero objects freed. */
                if (ret == SHRINK_EMPTY)
                        ret = 0;
                freed += ret;

                /* Steps 3 and 4: retake the RCU lock, then drop the reference. */
                rcu_read_lock();
                shrinker_put(shrinker);
        }

        rcu_read_unlock();
        cond_resched();
        return freed;
}

/**
 * shrinker_alloc - allocate and pre-initialise a shrinker
 * @flags: SHRINKER_* flags requested by the caller
 * @fmt: printf-style format handed to shrinker_debugfs_name_alloc()
 *
 * The new shrinker is zero-allocated, marked SHRINKER_ALLOCATED and given
 * DEFAULT_SEEKS.  A SHRINKER_MEMCG_AWARE request goes through
 * shrinker_memcg_alloc(); if that reports -ENOSYS the shrinker silently
 * degrades to a non-memcg-aware one.
 *
 * Returns the shrinker, or NULL if any allocation step failed.
 */
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
        struct shrinker *s;
        unsigned int deferred_size;
        va_list args;
        int rc;

        s = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
        if (!s)
                return NULL;

        va_start(args, fmt);
        rc = shrinker_debugfs_name_alloc(s, fmt, args);
        va_end(args);
        if (rc)
                goto free_shrinker;

        s->flags = flags | SHRINKER_ALLOCATED;
        s->seeks = DEFAULT_SEEKS;

        if (flags & SHRINKER_MEMCG_AWARE) {
                rc = shrinker_memcg_alloc(s);
                if (!rc)
                        return s;

                if (rc != -ENOSYS)
                        goto free_name;

                /* Memcg is not supported, fallback to non-memcg-aware shrinker. */
                s->flags &= ~SHRINKER_MEMCG_AWARE;
        }

        /*
         * Memcg aware shrinkers keep nr_deferred at the per-memcg level, so
         * the array below is only allocated for:
         *  - non-memcg-aware shrinkers
         *  - !CONFIG_MEMCG
         *  - memcg is disabled by kernel command line
         */
        deferred_size = sizeof(*s->nr_deferred);
        if (flags & SHRINKER_NUMA_AWARE)
                deferred_size *= nr_node_ids;

        s->nr_deferred = kzalloc(deferred_size, GFP_KERNEL);
        if (!s->nr_deferred)
                goto free_name;

        return s;

free_name:
        shrinker_debugfs_name_free(s);
free_shrinker:
        kfree(s);
        return NULL;
}
EXPORT_SYMBOL_GPL(shrinker_alloc);

/**
 * shrinker_register - publish a shrinker obtained from shrinker_alloc()
 * @shrinker: shrinker to add to the global shrinker_list
 *
 * A shrinker without SHRINKER_ALLOCATED is rejected with a warning and left
 * unregistered.  Otherwise it is appended to shrinker_list and added to
 * debugfs under shrinker_mutex, its completion is initialised and its
 * refcount is set to 1, after which shrinker_try_get() lookups may use it.
 */
void shrinker_register(struct shrinker *shrinker)
{
        if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
                /* Terminate the message so it is emitted as a complete log line. */
                pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker\n");
                return;
        }

        mutex_lock(&shrinker_mutex);
        list_add_tail_rcu(&shrinker->list, &shrinker_list);
        shrinker->flags |= SHRINKER_REGISTERED;
        shrinker_debugfs_add(shrinker);
        mutex_unlock(&shrinker_mutex);

        init_completion(&shrinker->done);
        /*
         * Now the shrinker is fully set up, take the first reference to it to
         * indicate that lookup operations are now allowed to use it via
         * shrinker_try_get().
         */
        refcount_set(&shrinker->refcount, 1);
}
EXPORT_SYMBOL_GPL(shrinker_register);

/*
 * RCU callback queued by shrinker_free(): releases the nr_deferred array
 * and then the shrinker that embeds @head.
 */
static void shrinker_free_rcu_cb(struct rcu_head *head)
{
        struct shrinker *victim;

        victim = container_of(head, struct shrinker, rcu);
        kfree(victim->nr_deferred);
        kfree(victim);
}

/**
 * shrinker_free - unregister (if needed) and free a shrinker
 * @shrinker: shrinker to release; NULL is a no-op
 *
 * For a registered shrinker the initial reference is dropped and the caller
 * waits on shrinker->done before the shrinker is unlinked from
 * shrinker_list and detached from debugfs under shrinker_mutex.  The
 * debugfs name and, for memcg aware shrinkers, the memcg state are released
 * under the same mutex.  The structure itself is freed later through
 * call_rcu() by shrinker_free_rcu_cb().
 */
void shrinker_free(struct shrinker *shrinker)
{
        struct dentry *debugfs_entry = NULL;
        int debugfs_id;

        if (!shrinker)
                return;

        if (shrinker->flags & SHRINKER_REGISTERED) {
                /* drop the initial refcount */
                shrinker_put(shrinker);
                /*
                 * Wait for all lookups of the shrinker to complete, after that,
                 * no shrinker is running or will run again, then we can safely
                 * free it asynchronously via RCU and safely free the structure
                 * where the shrinker is located, such as super_block etc.
                 */
                wait_for_completion(&shrinker->done);
        }

        mutex_lock(&shrinker_mutex);
        if (shrinker->flags & SHRINKER_REGISTERED) {
                /*
                 * Now we can safely remove it from the shrinker_list and then
                 * free it.
                 */
                list_del_rcu(&shrinker->list);
                debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
                shrinker->flags &= ~SHRINKER_REGISTERED;
        }

        shrinker_debugfs_name_free(shrinker);

        if (shrinker->flags & SHRINKER_MEMCG_AWARE)
                shrinker_memcg_remove(shrinker);
        mutex_unlock(&shrinker_mutex);

        /* The detached debugfs entry is removed outside shrinker_mutex. */
        if (debugfs_entry)
                shrinker_debugfs_remove(debugfs_entry, debugfs_id);

        call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);





























































































































































   33 































    8 





    8 















































































    4 
























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_KERNEL_FPU_XSTATE_H
#define __X86_KERNEL_FPU_XSTATE_H

#include <asm/cpufeature.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/xcr.h>

#ifdef CONFIG_X86_64
DECLARE_PER_CPU(u64, xfd_state);
#endif

/*
 * Initialise the xcomp_bv header field of @xsave from @mask.  Nothing is
 * written unless X86_FEATURE_XCOMPACTED is enabled.
 */
static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask)
{
        if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED))
                return;

        /*
         * XRSTORS requires these bits set in xcomp_bv, or it will
         * trigger #GP:
         */
        xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | mask;
}

/*
 * Read the xstate permission bitmap stored in the FPU of the current
 * task's group leader: the guest permissions when @guest is true, the
 * host permissions otherwise.
 */
static inline u64 xstate_get_group_perm(bool guest)
{
        struct fpu *leader_fpu = &current->group_leader->thread.fpu;
        struct fpu_state_perm *selected;

        if (guest)
                selected = &leader_fpu->guest_perm;
        else
                selected = &leader_fpu->perm;

        /* Pairs with WRITE_ONCE() in xstate_request_perm() */
        return READ_ONCE(selected->__state_perm);
}

/* Convenience wrapper: xstate_get_group_perm() for the host (non-guest) permissions. */
static inline u64 xstate_get_host_group_perm(void)
{
        return xstate_get_group_perm(false);
}

/*
 * Copy mode selector taken by copy_xstate_to_uabi_buf() and
 * __copy_xstate_to_uabi_buf().
 * NOTE(review): the names suggest FP-only, FXSAVE-layout and full XSAVE
 * copies respectively - confirm against the implementation.
 */
enum xstate_copy_mode {
        XSTATE_COPY_FP,
        XSTATE_COPY_FX,
        XSTATE_COPY_XSAVE,
};

struct membuf;
extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
                                      u64 xfeatures, u32 pkru_val,
                                      enum xstate_copy_mode copy_mode);
extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
                                    enum xstate_copy_mode mode);
extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru);
extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf);


extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system_xstate(unsigned int legacy_size);

/*
 * Return the subset of fpu_kernel_cfg.max_features that is also part of
 * XFEATURE_MASK_SUPERVISOR_SUPPORTED.
 */
static inline u64 xfeatures_mask_supervisor(void)
{
        return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
}

/*
 * Return XFEATURE_MASK_INDEPENDENT, with XFEATURE_MASK_LBR stripped when
 * X86_FEATURE_ARCH_LBR is not enabled.
 */
static inline u64 xfeatures_mask_independent(void)
{
        u64 features = XFEATURE_MASK_INDEPENDENT;

        if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
                features &= ~XFEATURE_MASK_LBR;

        return features;
}

/* XSAVE/XRSTOR wrapper functions */

/*
 * On 64-bit builds every opcode string below is prefixed with the byte
 * 0x48; on 32-bit builds no prefix byte is emitted.
 */
#ifdef CONFIG_X86_64
#define REX_PREFIX        "0x48, "
#else
#define REX_PREFIX
#endif

/*
 * Hand-encoded instructions, emitted as raw ".byte" sequences.
 * These macros all use (%edi)/(%rdi) as the single memory argument.
 */
#define XSAVE                ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT        ".byte " REX_PREFIX "0x0f,0xae,0x37"
#define XSAVEC                ".byte " REX_PREFIX "0x0f,0xc7,0x27"
#define XSAVES                ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
#define XRSTOR                ".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS                ".byte " REX_PREFIX "0x0f,0xc7,0x1f"

/*
 * Execute one xstate instruction with exception fixup.
 *
 * @op:    instruction string (one of the opcode macros above)
 * @st:    pointer to the xsave area; passed in the "D" register and also
 *         listed as a memory operand
 * @lmask: low 32 bits of the feature mask, passed in the "a" register
 * @hmask: high 32 bits of the feature mask, passed in the "d" register
 * @err:   output, returned in the "a" register
 *
 * After this @err contains 0 on success or the trap number when the
 * operation raises an exception.  The zeroing "xor" sits between labels 1
 * and 2, so it is skipped when the exception table entry redirects a fault
 * at label 1 to label 2.
 */
#define XSTATE_OP(op, st, lmask, hmask, err)                                \
        asm volatile("1:" op "\n\t"                                        \
                     "xor %[err], %[err]\n"                                \
                     "2:\n\t"                                                \
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE)        \
                     : [err] "=a" (err)                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * Save xstate to @st with the best available XSAVE variant.
 *
 * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor
 * states in addition to XSAVEC.
 *
 * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports
 * compacted storage format in addition to XSAVEOPT.
 *
 * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
 * supports modified optimization which is not supported by XSAVE.
 *
 * We use XSAVE as a fallback.
 *
 * The 661 label is defined in the ALTERNATIVE* macros as the address of the
 * original instruction which gets replaced. We need to use it here as the
 * address of the instruction where we might get an exception at.
 *
 * @err is zeroed when the instruction completes without faulting.  On a
 * fault the EX_TYPE_EFAULT_REG fixup resumes at label 3 with @err as its
 * register operand.
 * NOTE(review): presumably the fixup stores -EFAULT in @err - confirm
 * against the extable handler.
 */
#define XSTATE_XSAVE(st, lmask, hmask, err)                                \
        asm volatile(ALTERNATIVE_3(XSAVE,                                \
                                   XSAVEOPT, X86_FEATURE_XSAVEOPT,        \
                                   XSAVEC,   X86_FEATURE_XSAVEC,        \
                                   XSAVES,   X86_FEATURE_XSAVES)        \
                     "\n"                                                \
                     "xor %[err], %[err]\n"                                \
                     "3:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
                     : [err] "=r" (err)                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
 * XSAVE area format.
 *
 * Unlike XSTATE_OP/XSTATE_XSAVE this macro has no error output: a fault at
 * the instruction (label 661) is handed to the EX_TYPE_FPU_RESTORE fixup,
 * which resumes at label 3.
 */
#define XSTATE_XRESTORE(st, lmask, hmask)                                \
        asm volatile(ALTERNATIVE(XRSTOR,                                \
                                 XRSTORS, X86_FEATURE_XSAVES)                \
                     "\n"                                                \
                     "3:\n"                                                \
                     _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE)        \
                     :                                                        \
                     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)        \
                     : "memory")

/*
 * xfd_validate_state() has an out-of-line implementation only on 64-bit
 * kernels built with CONFIG_X86_DEBUG_FPU; in every other configuration it
 * is an empty inline stub.
 */
#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU)
extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor);
#else
static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { }
#endif

#ifdef CONFIG_X86_64
/*
 * Write @xfd to MSR_IA32_XFD and record the same value in the per-CPU
 * xfd_state variable.
 */
static inline void xfd_set_state(u64 xfd)
{
        wrmsrl(MSR_IA32_XFD, xfd);
        __this_cpu_write(xfd_state, xfd);
}

/*
 * Bring this CPU's XFD state in line with @fpstate->xfd.  Only acts when
 * fpu_state_size_dynamic() is true, and only writes when the per-CPU
 * xfd_state value differs from the wanted one.
 */
static inline void xfd_update_state(struct fpstate *fpstate)
{
        u64 wanted;

        if (!fpu_state_size_dynamic())
                return;

        wanted = fpstate->xfd;
        if (__this_cpu_read(xfd_state) == wanted)
                return;

        xfd_set_state(wanted);
}

extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
#else
/*
 * !CONFIG_X86_64 stubs: the XFD state helpers are no-ops and
 * __xfd_enable_feature() always fails with -EPERM.
 */
static inline void xfd_set_state(u64 xfd) { }

static inline void xfd_update_state(struct fpstate *fpstate) { }

static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) {
        return -EPERM;
}
#endif

/*
 * Save processor xstate to xsave area.
 *
 * Uses XSAVE, XSAVEOPT, XSAVEC or XSAVES (see XSTATE_XSAVE) depending on
 * the CPU features and command line options. The choice is permanent
 * until the next reboot.
 *
 * The features saved are those in @fpstate->xfeatures, split into the low
 * and high 32-bit halves expected by the instruction.
 */
static inline void os_xsave(struct fpstate *fpstate)
{
        u64 mask = fpstate->xfeatures;
        u32 lmask = mask;
        u32 hmask = mask >> 32;
        int err;

        WARN_ON_FPU(!alternatives_patched);
        xfd_validate_state(fpstate, mask, false);

        XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err);

        /* We should never fault when copying to a kernel buffer: */
        WARN_ON_FPU(err);
}

/*
 * Restore processor xstate from xsave area.
 *
 * Uses XRSTORS when XSAVES is used, XRSTOR otherwise.
 *
 * @mask selects the features to restore; it is split into the low and high
 * 32-bit halves passed to XSTATE_XRESTORE.  No error is returned.
 */
static inline void os_xrstor(struct fpstate *fpstate, u64 mask)
{
        u32 lmask = mask;
        u32 hmask = mask >> 32;

        xfd_validate_state(fpstate, mask, true);
        XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask);
}

/*
 * Restore of supervisor state. Does not require XFD.
 *
 * The feature mask is xfeatures_mask_supervisor(); unlike os_xrstor() no
 * xfd_validate_state() call is made.
 */
static inline void os_xrstor_supervisor(struct fpstate *fpstate)
{
        u64 mask = xfeatures_mask_supervisor();
        u32 lmask = mask;
        u32 hmask = mask >> 32;

        XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask);
}

/*
 * XSAVE itself always writes all requested xfeatures.  Removing features
 * from the request bitmap reduces the features which are written.
 * Generate a mask of features which must be written to a sigframe.  The
 * unset features can be optimized away and not written.
 *
 * This optimization is user-visible.  Only use for states where
 * uninitialized sigframe contents are tolerable, like dynamic features.
 *
 * Users of buffers produced with this optimization must check XSTATE_BV
 * to determine which features have been optimized out.
 */
static inline u64 xfeatures_need_sigframe_write(void)
{
        /* Start from the features currently in use; those must be written. */
        u64 must_write = xfeatures_in_use();

        /* Add every user feature that is not eligible for the init optimization. */
        must_write |= XFEATURE_MASK_USER_SUPPORTED &
                      ~XFEATURE_MASK_SIGFRAME_INITOPT;

        return must_write;
}

/*
 * Save xstate to user space xsave area.
 *
 * We don't use modified optimization because xrstor/xrstors might track
 * a different application.
 *
 * We don't use compacted format xsave area for backward compatibility for
 * old applications which don't understand the compacted format of the
 * xsave area.
 *
 * The caller has to zero buf::header before calling this because XSAVE*
 * does not touch the reserved fields in the header.
 *
 * Returns the error value produced by XSTATE_OP: 0 on success or the trap
 * number when the XSAVE raised an exception.
 */
static inline int xsave_to_user_sigframe(struct xregs_state __user *buf)
{
        /*
         * Include the features which are not xsaved/rstored by the kernel
         * internally, e.g. PKRU. That's user space ABI and also required
         * to allow the signal handler to modify PKRU.
         */
        struct fpstate *fpstate = current->thread.fpu.fpstate;
        u64 mask = fpstate->user_xfeatures;
        u32 lmask;
        u32 hmask;
        int err;

        /* Optimize away writing unnecessary xfeatures: */
        if (fpu_state_size_dynamic())
                mask &= xfeatures_need_sigframe_write();

        lmask = mask;
        hmask = mask >> 32;
        xfd_validate_state(fpstate, mask, false);

        /* The user buffer is accessed only between stac() and clac(). */
        stac();
        XSTATE_OP(XSAVE, buf, lmask, hmask, err);
        clac();

        return err;
}

/*
 * Restore xstate from user space xsave area.
 *
 * @buf:  user space xsave area to restore from
 * @mask: features to restore
 *
 * Returns the error value produced by XSTATE_OP: 0 on success or the trap
 * number when the XRSTOR raised an exception.
 */
static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask)
{
        /* Drop the __user annotation so the pointer can be handed to XSTATE_OP. */
        struct xregs_state *xstate = ((__force struct xregs_state *)buf);
        u32 lmask = mask;
        u32 hmask = mask >> 32;
        int err;

        xfd_validate_state(current->thread.fpu.fpstate, mask, true);

        /* The user buffer is accessed only between stac() and clac(). */
        stac();
        XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
        clac();

        return err;
}

/*
 * Restore xstate from kernel space xsave area, return an error code instead of
 * an exception.
 *
 * XRSTORS is used when X86_FEATURE_XSAVES is enabled, XRSTOR otherwise.
 * The return value is the one produced by XSTATE_OP: 0 on success or the
 * trap number when the instruction raised an exception.
 */
static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask)
{
        struct xregs_state *xstate = &fpstate->regs.xsave;
        u32 lmask = mask;
        u32 hmask = mask >> 32;
        int err;

        /* Ensure that XFD is up to date */
        xfd_update_state(fpstate);

        if (cpu_feature_enabled(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
        else
                XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);

        return err;
}


#endif
































































    1 
    1 




















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
// SPDX-License-Identifier: GPL-2.0-or-later
/*  Paravirtualization interfaces
    Copyright (C) 2006 Rusty Russell IBM Corporation


    2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
*/

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/highmem.h>
#include <linux/kprobes.h>
#include <linux/pgtable.h>
#include <linux/static_call.h>

#include <asm/bug.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
#include <asm/irq.h>
#include <asm/delay.h>
#include <asm/fixmap.h>
#include <asm/apic.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/special_insns.h>
#include <asm/tlb.h>
#include <asm/io_bitmap.h>
#include <asm/gsseg.h>

/* stub always returning 0. */
DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text);

/* Log, at KERN_INFO level, the platform name stored in pv_info.name. */
void __init default_banner(void)
{
        printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
               pv_info.name);
}

#ifdef CONFIG_PARAVIRT_XXL
DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text);
DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text);
DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
#endif

DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);

/*
 * Turn off virt_spin_lock_key when paravirt spinlocks are configured and
 * the boot CPU does not report X86_FEATURE_HYPERVISOR.
 */
void __init native_pv_lock_init(void)
{
        if (!IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS))
                return;

        if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
                return;

        static_branch_disable(&virt_spin_lock_key);
}

/*
 * Native implementation of the mmu.tlb_remove_table hook: hands @table to
 * tlb_remove_page().
 */
static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        tlb_remove_page(tlb, table);
}

struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;

/* Default target of the pv_steal_clock static call: always reports 0 for @cpu. */
static u64 native_steal_clock(int cpu)
{
        return 0;
}

DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock);

/* Retarget the pv_sched_clock static call to @func. */
void paravirt_set_sched_clock(u64 (*func)(void))
{
        static_call_update(pv_sched_clock, func);
}

/*
 * Busy I/O resource spanning the whole port range [0, IO_SPACE_LIMIT];
 * claimed by paravirt_disable_iospace().
 */
static struct resource reserve_ioports = {
        .start = 0,
        .end = IO_SPACE_LIMIT,
        .name = "paravirt-ioport",
        .flags = IORESOURCE_IO | IORESOURCE_BUSY,
};

/*
 * Reserve the whole legacy IO space to prevent any legacy drivers
 * from wasting time probing for their hardware.  This is a fairly
 * brute-force approach to disabling all non-virtual drivers.
 *
 * Note that this must be called very early to have any effect.
 *
 * Returns the result of request_resource() for reserve_ioports.
 */
int paravirt_disable_iospace(void)
{
        return request_resource(&ioport_resource, &reserve_ioports);
}

#ifdef CONFIG_PARAVIRT_XXL
/* noinstr wrapper around native_write_cr2(); installed as pv_ops.mmu.write_cr2. */
static noinstr void pv_native_write_cr2(unsigned long val)
{
        native_write_cr2(val);
}

/* noinstr wrapper around native_get_debugreg(); installed as pv_ops.cpu.get_debugreg. */
static noinstr unsigned long pv_native_get_debugreg(int regno)
{
        return native_get_debugreg(regno);
}

/* noinstr wrapper around native_set_debugreg(); installed as pv_ops.cpu.set_debugreg. */
static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
{
        native_set_debugreg(regno, val);
}

/* noinstr wrapper around native_wbinvd(); installed as pv_ops.cpu.wbinvd. */
noinstr void pv_native_wbinvd(void)
{
        native_wbinvd();
}

/* noinstr wrapper around native_safe_halt(); installed as pv_ops.irq.safe_halt. */
static noinstr void pv_native_safe_halt(void)
{
        native_safe_halt();
}
#endif

/*
 * Default platform description.  The name is what default_banner() prints;
 * with CONFIG_PARAVIRT_XXL the extra 64-bit user code segment defaults to
 * __USER_CS.
 */
struct pv_info pv_info = {
        .name = "bare hardware",
#ifdef CONFIG_PARAVIRT_XXL
        .extra_user_64bit_cs = __USER_CS,
#endif
};

/* 64-bit pagetable entries */
#define PTE_IDENT        __PV_IS_CALLEE_SAVE(_paravirt_ident_64)

/*
 * Default paravirt operations table: every hook is initialised to its
 * native implementation, to paravirt_nop where nothing needs doing, or to
 * PTE_IDENT for the page table value conversions.  Most of the cpu, irq
 * and mmu hooks exist only with CONFIG_PARAVIRT_XXL; the lock hooks only
 * with CONFIG_PARAVIRT_SPINLOCKS and CONFIG_SMP.
 */
struct paravirt_patch_template pv_ops = {
        /* Cpu ops. */
        .cpu.io_delay                = native_io_delay,

#ifdef CONFIG_PARAVIRT_XXL
        .cpu.cpuid                = native_cpuid,
        .cpu.get_debugreg        = pv_native_get_debugreg,
        .cpu.set_debugreg        = pv_native_set_debugreg,
        .cpu.read_cr0                = native_read_cr0,
        .cpu.write_cr0                = native_write_cr0,
        .cpu.write_cr4                = native_write_cr4,
        .cpu.wbinvd                = pv_native_wbinvd,
        .cpu.read_msr                = native_read_msr,
        .cpu.write_msr                = native_write_msr,
        .cpu.read_msr_safe        = native_read_msr_safe,
        .cpu.write_msr_safe        = native_write_msr_safe,
        .cpu.read_pmc                = native_read_pmc,
        .cpu.load_tr_desc        = native_load_tr_desc,
        .cpu.set_ldt                = native_set_ldt,
        .cpu.load_gdt                = native_load_gdt,
        .cpu.load_idt                = native_load_idt,
        .cpu.store_tr                = native_store_tr,
        .cpu.load_tls                = native_load_tls,
        .cpu.load_gs_index        = native_load_gs_index,
        .cpu.write_ldt_entry        = native_write_ldt_entry,
        .cpu.write_gdt_entry        = native_write_gdt_entry,
        .cpu.write_idt_entry        = native_write_idt_entry,

        .cpu.alloc_ldt                = paravirt_nop,
        .cpu.free_ldt                = paravirt_nop,

        .cpu.load_sp0                = native_load_sp0,

#ifdef CONFIG_X86_IOPL_IOPERM
        .cpu.invalidate_io_bitmap        = native_tss_invalidate_io_bitmap,
        .cpu.update_io_bitmap                = native_tss_update_io_bitmap,
#endif

        .cpu.start_context_switch        = paravirt_nop,
        .cpu.end_context_switch                = paravirt_nop,

        /* Irq ops. */
        .irq.save_fl                = __PV_IS_CALLEE_SAVE(pv_native_save_fl),
        .irq.irq_disable        = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
        .irq.irq_enable                = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
        .irq.safe_halt                = pv_native_safe_halt,
        .irq.halt                = native_halt,
#endif /* CONFIG_PARAVIRT_XXL */

        /* Mmu ops. */
        .mmu.flush_tlb_user        = native_flush_tlb_local,
        .mmu.flush_tlb_kernel        = native_flush_tlb_global,
        .mmu.flush_tlb_one_user        = native_flush_tlb_one_user,
        .mmu.flush_tlb_multi        = native_flush_tlb_multi,
        .mmu.tlb_remove_table        = native_tlb_remove_table,

        .mmu.exit_mmap                = paravirt_nop,
        .mmu.notify_page_enc_status_changed        = paravirt_nop,

#ifdef CONFIG_PARAVIRT_XXL
        .mmu.read_cr2                = __PV_IS_CALLEE_SAVE(pv_native_read_cr2),
        .mmu.write_cr2                = pv_native_write_cr2,
        .mmu.read_cr3                = __native_read_cr3,
        .mmu.write_cr3                = native_write_cr3,

        .mmu.pgd_alloc                = __paravirt_pgd_alloc,
        .mmu.pgd_free                = paravirt_nop,

        .mmu.alloc_pte                = paravirt_nop,
        .mmu.alloc_pmd                = paravirt_nop,
        .mmu.alloc_pud                = paravirt_nop,
        .mmu.alloc_p4d                = paravirt_nop,
        .mmu.release_pte        = paravirt_nop,
        .mmu.release_pmd        = paravirt_nop,
        .mmu.release_pud        = paravirt_nop,
        .mmu.release_p4d        = paravirt_nop,

        .mmu.set_pte                = native_set_pte,
        .mmu.set_pmd                = native_set_pmd,

        .mmu.ptep_modify_prot_start        = __ptep_modify_prot_start,
        .mmu.ptep_modify_prot_commit        = __ptep_modify_prot_commit,

        .mmu.set_pud                = native_set_pud,

        .mmu.pmd_val                = PTE_IDENT,
        .mmu.make_pmd                = PTE_IDENT,

        .mmu.pud_val                = PTE_IDENT,
        .mmu.make_pud                = PTE_IDENT,

        .mmu.set_p4d                = native_set_p4d,

#if CONFIG_PGTABLE_LEVELS >= 5
        .mmu.p4d_val                = PTE_IDENT,
        .mmu.make_p4d                = PTE_IDENT,

        .mmu.set_pgd                = native_set_pgd,
#endif /* CONFIG_PGTABLE_LEVELS >= 5 */

        .mmu.pte_val                = PTE_IDENT,
        .mmu.pgd_val                = PTE_IDENT,

        .mmu.make_pte                = PTE_IDENT,
        .mmu.make_pgd                = PTE_IDENT,

        .mmu.enter_mmap                = paravirt_nop,

        .mmu.lazy_mode = {
                .enter                = paravirt_nop,
                .leave                = paravirt_nop,
                .flush                = paravirt_nop,
        },

        .mmu.set_fixmap                = native_set_fixmap,
#endif /* CONFIG_PARAVIRT_XXL */

#if defined(CONFIG_PARAVIRT_SPINLOCKS)
        /* Lock ops. */
#ifdef CONFIG_SMP
        .lock.queued_spin_lock_slowpath        = native_queued_spin_lock_slowpath,
        .lock.queued_spin_unlock        =
                                PV_CALLEE_SAVE(__native_queued_spin_unlock),
        .lock.wait                        = paravirt_nop,
        .lock.kick                        = paravirt_nop,
        .lock.vcpu_is_preempted                =
                                PV_CALLEE_SAVE(__native_vcpu_is_preempted),
#endif /* SMP */
#endif
};

#ifdef CONFIG_PARAVIRT_XXL
NOKPROBE_SYMBOL(native_load_idt);
#endif

EXPORT_SYMBOL(pv_ops);
EXPORT_SYMBOL_GPL(pv_info);














































































































    1 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#ifndef BTRFS_DISK_IO_H
#define BTRFS_DISK_IO_H

#include <linux/sizes.h>
#include <linux/compiler_types.h>
#include "ctree.h"
#include "fs.h"

struct block_device;
struct super_block;
struct extent_buffer;
struct btrfs_device;
struct btrfs_fs_devices;
struct btrfs_fs_info;
struct btrfs_super_block;
struct btrfs_trans_handle;
struct btrfs_tree_parent_check;
struct btrfs_transaction;

#define BTRFS_SUPER_MIRROR_MAX         3
#define BTRFS_SUPER_MIRROR_SHIFT 12

/*
 * Fixed blocksize for all devices, applies to specific ways of reading
 * metadata like superblock. Must meet the set_blocksize requirements.
 *
 * Do not change.
 */
#define BTRFS_BDEV_BLOCKSIZE        (4096)

/*
 * Return the byte offset of superblock copy number @mirror.
 *
 * Copy 0 lives at BTRFS_SUPER_INFO_OFFSET.  Every further copy is located at
 * SZ_16K shifted left by (BTRFS_SUPER_MIRROR_SHIFT * @mirror) bits, i.e. with
 * a 16 KiB base and a shift of 12 that is 64 MiB for copy 1 and 256 GiB for
 * copy 2.
 */
static inline u64 btrfs_sb_offset(int mirror)
{
        const u64 base = SZ_16K;

        if (!mirror)
                return BTRFS_SUPER_INFO_OFFSET;

        return base << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
}

void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
                                      struct btrfs_tree_parent_check *check);
struct extent_buffer *btrfs_find_create_tree_block(
                                                struct btrfs_fs_info *fs_info,
                                                u64 bytenr, u64 owner_root,
                                                int level);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
                           const struct btrfs_super_block *disk_sb);
int __cold open_ctree(struct super_block *sb,
               struct btrfs_fs_devices *fs_devices,
               char *options);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
int btrfs_validate_super(struct btrfs_fs_info *fs_info,
                         struct btrfs_super_block *sb, int mirror_num);
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
                                                   int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
                                        struct btrfs_key *key);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
                         struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);

struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
                                     u64 objectid, bool check_ref);
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
                                         u64 objectid, dev_t *anon_dev);
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
                                                 struct btrfs_path *path,
                                                 u64 objectid);
int btrfs_global_root_insert(struct btrfs_root *root);
void btrfs_global_root_delete(struct btrfs_root *root);
struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
                                     struct btrfs_key *key);
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info);

void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
                                 struct btrfs_root *root);
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
                                 struct btrfs_tree_parent_check *check);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif

/*
 * Take a reference on @root so that it is not freed while the caller is
 * using it.  This does not guarantee that the tree itself is not dropped.
 *
 * If you want to ensure the whole tree is safe, you should use
 *         fs_info->subvol_srcu
 *
 * Returns @root when the reference was taken, or NULL when @root is NULL or
 * its reference count had already reached zero.
 */
static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
{
        /* Only hand the root back if its refcount could be raised from non-zero. */
        if (root && refcount_inc_not_zero(&root->refs))
                return root;

        return NULL;
}

void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
                             struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
                          int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
                             struct btrfs_tree_parent_check *check);

blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root);
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans,
                             struct btrfs_fs_info *fs_info);
void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
                                  struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
                                     u64 objectid);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
int btrfs_init_root_free_objectid(struct btrfs_root *root);

#endif

































































    3 








    3 








    3 














    2 













    2 























    1 









    1 


















    4 




    1 

    4 


















































    5 



    3 




    5 



    4 




    4 



    4 



















    2 




    2 




    2 




    2 










    6 
    5 


    2 



































































































































































   14 
















   15 







    2 




   15 

















    2 

    2 

    5 







    3 
   16 

    3 




   16 
    1 
    1 



   15 















   16 






   14 





   16 
    3 
   16 



   16 

































    2 
















   30 


   27 




    2 

















   31 

   30 

   25 
    5 









































































   28 



   32 








































































































































































































































    1 







    1 








































































































































































































































































































































































































































































































































































































































































































































    2 




























































































































































































































































































































































































































































































































































































































































































































































    3 







































    2 


















    1 

























































   29 

























   32 





   26 















    3 












   31 








   30 















   30 


   13 












    9 

    3 



   28 



    3 








    8 



































   13 












    4 




















































   32 


















   32 








   30 
   10 



















    3 






    3 



    3 








   29 



    3 

    3 


























   13 


























    2 


    5 







   29 
   16 



























    3 







    2 





















   14 




































































    3 































































































































































































































































































































































































































    2 













    1 




    2 


































    2 







    2 





























    2 














































































































    2 







    2 

    2 




    1 



































    1 






    1 















    2 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/lib/vsprintf.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/* vsprintf.c -- Lars Wirzenius & Linus Torvalds. */
/*
 * Wirzenius wrote this portably, Torvalds fucked it up :-)
 */

/*
 * Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel@datastacks.com>
 * - changed to provide snprintf and vsnprintf functions
 * So Feb  1 16:51:32 CET 2004 Juergen Quade <quade@hsnr.de>
 * - scnprintf and vscnprintf
 */

#include <linux/stdarg.h>
#include <linux/build_bug.h>
#include <linux/clk.h>
#include <linux/clk-provider.h>
#include <linux/errname.h>
#include <linux/module.h>        /* for KSYM_SYMBOL_LEN */
#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
#include <linux/kallsyms.h>
#include <linux/math64.h>
#include <linux/uaccess.h>
#include <linux/ioport.h>
#include <linux/dcache.h>
#include <linux/cred.h>
#include <linux/rtc.h>
#include <linux/sprintf.h>
#include <linux/time.h>
#include <linux/uuid.h>
#include <linux/of.h>
#include <net/addrconf.h>
#include <linux/siphash.h>
#include <linux/compiler.h>
#include <linux/property.h>
#include <linux/notifier.h>
#ifdef CONFIG_BLOCK
#include <linux/blkdev.h>
#endif

#include "../mm/internal.h"        /* For the trace_print_flags arrays */

#include <asm/page.h>                /* for PAGE_SIZE */
#include <asm/byteorder.h>        /* cpu_to_le16 */
#include <asm/unaligned.h>

#include <linux/string_helpers.h>
#include "kstrtox.h"

/* Disable pointer hashing if requested */
bool no_hash_pointers __ro_after_init;
EXPORT_SYMBOL_GPL(no_hash_pointers);

/*
 * Convert at most @max_chars characters of @startp to an unsigned long long.
 *
 * @base is first passed through _parse_integer_fixup_radix(), which may
 * adjust it and returns a pointer past any radix prefix.  If the prefix
 * already uses up the whole field, nothing is converted and the end pointer
 * is placed @max_chars characters after @startp.  When @endp is non-NULL it
 * receives the position at which parsing stopped.
 */
noinline
static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars)
{
        unsigned long long value = 0ULL;
        const char *digits = _parse_integer_fixup_radix(startp, &base);
        size_t prefix_len = digits - startp;
        const char *stop;

        if (prefix_len >= max_chars) {
                /* Field too short for prefix + digit, skip over without converting */
                stop = startp + max_chars;
        } else {
                unsigned int consumed;

                consumed = _parse_integer_limit(digits, base, &value, max_chars - prefix_len);
                /*
                 * FIXME: the KSTRTOX_OVERFLOW bit is masked off so the rest
                 * can be used as a character count; it is not otherwise
                 * acted upon here.
                 */
                stop = digits + (consumed & ~KSTRTOX_OVERFLOW);
        }

        if (endp)
                *endp = (char *)stop;

        return value;
}

/**
 * simple_strtoull - parse an unsigned long long from a string
 * @cp: Beginning of the text to parse
 * @endp: If non-NULL, receives a pointer to where parsing stopped
 * @base: Numeric base to interpret the digits in
 *
 * Wrapper around simple_strntoull() with a field width of INT_MAX.
 *
 * This function has caveats. Please use kstrtoull instead.
 */
noinline
unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
{
        unsigned long long value;

        /* INT_MAX is used as the character limit for the bounded helper. */
        value = simple_strntoull(cp, endp, base, INT_MAX);

        return value;
}
EXPORT_SYMBOL(simple_strtoull);

/**
 * simple_strtoul - parse an unsigned long from a string
 * @cp: Beginning of the text to parse
 * @endp: If non-NULL, receives a pointer to where parsing stopped
 * @base: Numeric base to interpret the digits in
 *
 * The value is parsed as an unsigned long long by simple_strtoull() and then
 * converted to unsigned long.
 *
 * This function has caveats. Please use kstrtoul instead.
 */
unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base)
{
        unsigned long long wide = simple_strtoull(cp, endp, base);

        /* Same narrowing conversion the implicit return conversion performs. */
        return (unsigned long)wide;
}
EXPORT_SYMBOL(simple_strtoul);

/**
 * simple_strtol - parse a signed long from a string
 * @cp: Beginning of the text to parse
 * @endp: If non-NULL, receives a pointer to where parsing stopped
 * @base: Numeric base to interpret the digits in
 *
 * A leading '-' is consumed here and the magnitude parsed by
 * simple_strtoul() is negated; any other first character is handed to
 * simple_strtoul() unchanged.
 *
 * This function has caveats. Please use kstrtol instead.
 */
long simple_strtol(const char *cp, char **endp, unsigned int base)
{
        const int negative = (*cp == '-');
        unsigned long magnitude;

        /* Skip the sign character, if any, before parsing the digits. */
        magnitude = simple_strtoul(negative ? cp + 1 : cp, endp, base);

        /* Negation is done in unsigned arithmetic, then converted to long. */
        return negative ? -magnitude : magnitude;
}
EXPORT_SYMBOL(simple_strtol);

/*
 * Signed counterpart of simple_strntoull(): parse at most @max_chars
 * characters of @cp, accepting one leading '-'.
 */
noinline
static long long simple_strntoll(const char *cp, char **endp, unsigned int base, size_t max_chars)
{
        /*
         * A '-' only counts as a sign when the field has room for it
         * (max_chars > 0).  With cp[0] == '-' and max_chars == 1 the helper
         * is called with max_chars == 0, which simple_strntoull() handles
         * safely.  With max_chars == 0 on entry the value is passed straight
         * through and the sign is never consumed.
         */
        if (*cp != '-' || max_chars == 0)
                return simple_strntoull(cp, endp, base, max_chars);

        /* Consume the sign: one character fewer is left for the digits. */
        return -simple_strntoull(cp + 1, endp, base, max_chars - 1);
}

/**
 * simple_strtoll - parse a signed long long from a string
 * @cp: Beginning of the text to parse
 * @endp: If non-NULL, receives a pointer to where parsing stopped
 * @base: Numeric base to interpret the digits in
 *
 * Wrapper around simple_strntoll() with a field width of INT_MAX.
 *
 * This function has caveats. Please use kstrtoll instead.
 */
long long simple_strtoll(const char *cp, char **endp, unsigned int base)
{
        long long value;

        /* INT_MAX is used as the character limit for the bounded helper. */
        value = simple_strntoll(cp, endp, base, INT_MAX);

        return value;
}
EXPORT_SYMBOL(simple_strtoll);

/*
 * Parse the run of decimal digits at *s and advance *s past it.
 *
 * The first character is consumed unconditionally (it is not checked
 * with isdigit()), so the caller must already know it is a digit.
 * Returns the accumulated value.
 */
static noinline_for_stack
int skip_atoi(const char **s)
{
        const char *cursor = *s;
        int value = 0;

        do {
                value = value * 10 + *cursor - '0';
                cursor++;
        } while (isdigit(*cursor));

        *s = cursor;
        return value;
}

/*
 * Decimal conversion is by far the most typical, and is used for
 * /proc and /sys data. This directly impacts e.g. top performance
 * with many processes running. We optimize it for speed by emitting
 * two characters at a time, using a 200 byte lookup table. This
 * roughly halves the number of multiplications compared to computing
 * the digits one at a time. Implementation strongly inspired by the
 * previous version, which in turn used ideas described at
 * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
 * from the author, Douglas W. Jones).
 *
 * It turns out there is precisely one 26 bit fixed-point
 * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
 * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
 * range happens to be somewhat larger (x <= 1073741898), but that's
 * irrelevant for our purpose.
 *
 * For dividing a number in the range [10^4, 10^6-1] by 100, we still
 * need a 32x32->64 bit multiply, so we simply use the same constant.
 *
 * For dividing a number in the range [100, 10^4-1] by 100, there are
 * several options. The simplest is (x * 0x147b) >> 19, which is valid
 * for all x <= 43698.
 */

/*
 * decpair[x], for 0 <= x <= 99, holds the two ASCII digits of x packed
 * into one u16: before the cpu_to_le16() conversion the low byte is
 * '0' + (x % 10) and the high byte is '0' + (x / 10) (the 0x3030 term
 * adds ASCII '0' to both bytes at once).
 *
 * NOTE(review): cpu_to_le16() presumably guarantees that the ones digit
 * ends up at the lower address regardless of CPU endianness, so that a
 * single u16 store emits the pair least-significant digit first -
 * confirm against the byteorder helpers.
 */
static const u16 decpair[100] = {
#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
        _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
        _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
        _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
        _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
        _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
        _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
        _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
        _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
        _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
        _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
#undef _
};

/*
 * Emit the decimal digits of r (r < 10^8) into buf, two digits per
 * decpair[] store, without leading zeros. The digits are produced from
 * the least significant pair upwards, i.e. the string is built in
 * reverse order. Returns the position just past the last digit that is
 * accounted for.
 *
 * Every store goes through a u16 pointer, so buf is expected to be
 * 2-byte aligned. The final store always writes two bytes even when only
 * one digit is accounted for, so the buffer needs room for that extra
 * byte.
 *
 * This will print a single '0' even if r == 0, since we would
 * immediately jump to out_r where two 0s would be written but only
 * one of them accounted for in buf. This is needed by ip4_string
 * below. All other callers pass a non-zero value of r.
 */
static noinline_for_stack
char *put_dec_trunc8(char *buf, unsigned r)
{
        unsigned q;

        /* 1 <= r < 10^8 */
        if (r < 100)
                goto out_r;

        /* 100 <= r < 10^8 */
        /* q = r / 100 via fixed-point multiply; r - 100*q is r % 100 */
        q = (r * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 1 <= q < 10^6 */
        if (q < 100)
                goto out_q;

        /*  100 <= q < 10^6 */
        r = (q * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[q - 100*r];
        buf += 2;

        /* 1 <= r < 10^4 */
        if (r < 100)
                goto out_r;

        /* 100 <= r < 10^4 */
        /* small enough for the cheaper (x * 0x147b) >> 19 division by 100 */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;
out_q:
        /* 1 <= q < 100 */
        r = q;
out_r:
        /* 1 <= r < 100 */
        *((u16 *)buf) = decpair[r];
        /* only count the tens digit when there is one (no leading zero) */
        buf += r < 10 ? 1 : 2;
        return buf;
}

#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
/*
 * Emit exactly eight decimal digits of r (0 <= r < 10^8) into buf,
 * including leading zeros, as four decpair[] stores starting with the
 * least significant pair. Returns buf + 8.
 *
 * The stores go through a u16 pointer, so buf is expected to be 2-byte
 * aligned.
 */
static noinline_for_stack
char *put_dec_full8(char *buf, unsigned r)
{
        unsigned q;

        /* 0 <= r < 10^8 */
        /* q = r / 100 via fixed-point multiply; r - 100*q is r % 100 */
        q = (r * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 0 <= q < 10^6 */
        r = (q * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[q - 100*r];
        buf += 2;

        /* 0 <= r < 10^4 */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 0 <= q < 100 */
        *((u16 *)buf) = decpair[q];
        buf += 2;
        return buf;
}

/*
 * Emit the decimal digits of n into buf in reverse order (least
 * significant digit first) and return the position past the last digit.
 *
 * The value is peeled off in groups of eight digits: each group below
 * the most significant one is written with leading zeros by
 * put_dec_full8(), and the most significant group is written without
 * leading zeros by put_dec_trunc8().
 *
 * NOTE(review): do_div(n, d) is presumed to divide n in place and
 * return the remainder - confirm against its definition.
 */
static noinline_for_stack
char *put_dec(char *buf, unsigned long long n)
{
        if (n >= 100*1000*1000)
                buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
        /* 1 <= n < 1.85e11, since (2^64 - 1) / 10^8 is about 1.845e11 */
        if (n >= 100*1000*1000)
                buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
        /* 1 <= n < 1e8 */
        return put_dec_trunc8(buf, n);
}

#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64

/*
 * Emit exactly four decimal digits of r (0 <= r < 10^4) into buf,
 * including leading zeros, as two decpair[] stores with the least
 * significant pair first. Nothing is returned; the caller knows that
 * four bytes were written.
 */
static void
put_dec_full4(char *buf, unsigned r)
{
        unsigned q;

        /* 0 <= r < 10^4 */
        /* q = r / 100; r - 100*q is r % 100 */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;
        /* 0 <= q < 100 */
        *((u16 *)buf) = decpair[q];
}

/*
 * Call put_dec_full4 on x % 10000, return x / 10000.
 * The approximation x/10000 == (x * 0x346DC5D7) >> 43
 * holds for all x < 1,128,869,999.  The largest value this
 * helper will ever be asked to convert is 1,125,520,955.
 * (second call in the put_dec code, assuming n is all-ones).
 *
 * @buf: where the four digits of x % 10000 are written
 * @x:   value to split
 *
 * The returned quotient is used by the caller as the carry into the
 * next group of four digits.
 */
static noinline_for_stack
unsigned put_dec_helper4(char *buf, unsigned x)
{
        /* q = x / 10000 without a division instruction */
        uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;

        /* x - q * 10000 is x % 10000 */
        put_dec_full4(buf, x - q * 10000);
        return q;
}

/* Based on code by Douglas W. Jones found at
 * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
 * (with permission from the author).
 * Performs no 64-bit division and hence should be fast on 32-bit machines.
 *
 * Emits the decimal digits of n into buf in reverse order (least
 * significant digit first) and returns the position past the last digit.
 * Values below 10^8 are handed to put_dec_trunc8() directly. Larger
 * values are split into four 16-bit pieces d0..d3, whose contributions
 * to each group of four decimal digits are summed using the decimal
 * expansions of 2^16, 2^32 and 2^48 shown below.
 */
static
char *put_dec(char *buf, unsigned long long n)
{
        uint32_t d3, d2, d1, q, h;

        if (n < 100*1000*1000)
                return put_dec_trunc8(buf, n);

        d1  = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
        h   = (n >> 32);
        d2  = (h      ) & 0xffff;
        d3  = (h >> 16); /* implicit "& 0xffff" */

        /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
             = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
        /* lowest four decimal digits; the helper returns the carry */
        q   = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
        q = put_dec_helper4(buf, q);

        /* digits 5-8, plus the carry from the previous group */
        q += 7671 * d3 + 9496 * d2 + 6 * d1;
        q = put_dec_helper4(buf+4, q);

        /* digits 9-12 */
        q += 4749 * d3 + 42 * d2;
        q = put_dec_helper4(buf+8, q);

        /* whatever remains is the most significant part */
        q += 281 * d3;
        buf += 12;
        if (q)
                buf = put_dec_trunc8(buf, q);
        /*
         * Otherwise the top group was zero: the output is reversed, so
         * trailing '0' bytes here are leading zeros of the number and
         * are dropped.
         */
        else while (buf[-1] == '0')
                --buf;

        return buf;
}

#endif

/*
 * Convert passed number to decimal string.
 * Returns the length of string.  On buffer overflow, returns 0.
 *
 * If speed is not important, use snprintf(). It's easy to read the code.
 */
int num_to_str(char *buf, int size, unsigned long long num, unsigned int width)
{
        /* put_dec requires 2-byte alignment of the buffer. */
        char tmp[sizeof(num) * 3] __aligned(2);
        int idx, len;

        /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
        if (num <= 9) {
                tmp[0] = '0' + num;
                len = 1;
        } else {
                len = put_dec(tmp, num) - tmp;
        }

        if (len > size || width > size)
                return 0;

        if (width > len) {
                width = width - len;
                for (idx = 0; idx < width; idx++)
                        buf[idx] = ' ';
        } else {
                width = 0;
        }

        for (idx = 0; idx < len; ++idx)
                buf[idx + width] = tmp[len - idx - 1];

        return len + width;
}

/*
 * Bits for printf_spec.flags, consumed by number().
 *
 * Two of the values double as character arithmetic and are pinned by the
 * static_asserts below:
 *  - ZEROPAD is added to ' ' to obtain the padding character, so it must
 *    equal '0' - ' '.
 *  - SMALL is ORed into upper-case hex digits and into 'X' to lower-case
 *    them, so it must equal 'a' ^ 'A'.
 */
#define SIGN        1                /* unsigned/signed, must be 1 */
#define LEFT        2                /* left justified */
#define PLUS        4                /* show plus */
#define SPACE        8                /* space if plus */
#define ZEROPAD        16                /* pad with zero, must be 16 == '0' - ' ' */
#define SMALL        32                /* use lowercase in hex (must be 32 == 0x20) */
#define SPECIAL        64                /* prefix hex with "0x", octal with "0" */

static_assert(SIGN == 1);
static_assert(ZEROPAD == ('0' - ' '));
static_assert(SMALL == ('a' ^ 'A'));

/*
 * Kind of a parsed piece of a format string; stored in the 8-bit
 * printf_spec.type field.
 */
enum format_type {
        FORMAT_TYPE_NONE, /* Just a string part */
        FORMAT_TYPE_WIDTH,
        FORMAT_TYPE_PRECISION,
        FORMAT_TYPE_CHAR,
        FORMAT_TYPE_STR,
        FORMAT_TYPE_PTR,
        FORMAT_TYPE_PERCENT_CHAR,
        FORMAT_TYPE_INVALID,
        FORMAT_TYPE_LONG_LONG,
        FORMAT_TYPE_ULONG,
        FORMAT_TYPE_LONG,
        FORMAT_TYPE_UBYTE,
        FORMAT_TYPE_BYTE,
        FORMAT_TYPE_USHORT,
        FORMAT_TYPE_SHORT,
        FORMAT_TYPE_UINT,
        FORMAT_TYPE_INT,
        FORMAT_TYPE_SIZE_T,
        FORMAT_TYPE_PTRDIFF
};

/*
 * Parsed conversion specification, passed by value to the formatting
 * helpers. The bitfields are sized so that the packed structure is
 * exactly 8 bytes (checked by the static_assert below).
 */
struct printf_spec {
        unsigned int        type:8;                /* format_type enum */
        signed int        field_width:24;        /* width of output field */
        unsigned int        flags:8;        /* flags to number() */
        unsigned int        base:8;                /* number base, 8, 10 or 16 only */
        signed int        precision:16;        /* # of digits/chars */
} __packed;
static_assert(sizeof(struct printf_spec) == 8);

/* Largest values that fit in the signed 24-bit and 16-bit fields above. */
#define FIELD_WIDTH_MAX ((1 << 23) - 1)
#define PRECISION_MAX ((1 << 15) - 1)

/*
 * Format an integer into the output buffer according to @spec.
 *
 * @buf:  current output position
 * @end:  end of the output buffer; positions at or beyond @end are never
 *        written, but @buf keeps advancing past them
 * @num:  value to format; reinterpreted as signed when SIGN is set
 * @spec: base, flags, field width and precision
 *
 * The output is assembled in this order: leading spaces, sign, "0x"/"0"
 * prefix, zero/space padding, precision zeros, digits, trailing spaces.
 * field_width is decremented as each part is accounted for, so whatever
 * is left at each padding step is the amount of padding still owed.
 *
 * Returns the output position after everything that would have been
 * emitted.
 */
static noinline_for_stack
char *number(char *buf, char *end, unsigned long long num,
             struct printf_spec spec)
{
        /* put_dec requires 2-byte alignment of the buffer. */
        char tmp[3 * sizeof(num)] __aligned(2);
        char sign;
        char locase;
        int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
        int i;
        bool is_zero = num == 0LL;
        int field_width = spec.field_width;
        int precision = spec.precision;

        /* locase = 0 or 0x20. ORing digits or letters with 'locase'
         * produces same digits or (maybe lowercased) letters */
        locase = (spec.flags & SMALL);
        /* left justification wins over zero padding */
        if (spec.flags & LEFT)
                spec.flags &= ~ZEROPAD;
        sign = 0;
        if (spec.flags & SIGN) {
                if ((signed long long)num < 0) {
                        sign = '-';
                        num = -(signed long long)num;
                        field_width--;
                } else if (spec.flags & PLUS) {
                        sign = '+';
                        field_width--;
                } else if (spec.flags & SPACE) {
                        sign = ' ';
                        field_width--;
                }
        }
        /* reserve room for "0x" (always for hex) or "0" (octal, non-zero) */
        if (need_pfx) {
                if (spec.base == 16)
                        field_width -= 2;
                else if (!is_zero)
                        field_width--;
        }

        /* generate full string in tmp[], in reverse order */
        i = 0;
        /* single digit: table lookup, which also covers num == 0 */
        if (num < spec.base)
                tmp[i++] = hex_asc_upper[num] | locase;
        else if (spec.base != 10) { /* 8 or 16 */
                int mask = spec.base - 1;
                int shift = 3;

                if (spec.base == 16)
                        shift = 4;
                do {
                        tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase);
                        num >>= shift;
                } while (num);
        } else { /* base 10 */
                i = put_dec(tmp, num) - tmp;
        }

        /* printing 100 using %2d gives "100", not "00" */
        if (i > precision)
                precision = i;
        /* leading space padding */
        field_width -= precision;
        if (!(spec.flags & (ZEROPAD | LEFT))) {
                while (--field_width >= 0) {
                        if (buf < end)
                                *buf = ' ';
                        ++buf;
                }
        }
        /* sign */
        if (sign) {
                if (buf < end)
                        *buf = sign;
                ++buf;
        }
        /* "0x" / "0" prefix */
        if (need_pfx) {
                if (spec.base == 16 || !is_zero) {
                        if (buf < end)
                                *buf = '0';
                        ++buf;
                }
                if (spec.base == 16) {
                        if (buf < end)
                                *buf = ('X' | locase);
                        ++buf;
                }
        }
        /* zero or space padding */
        if (!(spec.flags & LEFT)) {
                /* ' ' + ZEROPAD == '0', so this is '0' or ' ' */
                char c = ' ' + (spec.flags & ZEROPAD);

                while (--field_width >= 0) {
                        if (buf < end)
                                *buf = c;
                        ++buf;
                }
        }
        /* hmm even more zero padding? */
        while (i <= --precision) {
                if (buf < end)
                        *buf = '0';
                ++buf;
        }
        /* actual digits of result */
        while (--i >= 0) {
                if (buf < end)
                        *buf = tmp[i];
                ++buf;
        }
        /* trailing space padding */
        while (--field_width >= 0) {
                if (buf < end)
                        *buf = ' ';
                ++buf;
        }

        return buf;
}

static noinline_for_stack
char *special_hex_number(char *buf, char *end, unsigned long long num, int size)
{
        struct printf_spec spec;

        spec.type = FORMAT_TYPE_PTR;
        spec.field_width = 2 + 2 * size;        /* 0x + hex */
        spec.flags = SPECIAL | SMALL | ZEROPAD;
        spec.base = 16;
        spec.precision = -1;

        return number(buf, end, num, spec);
}

/*
 * Shift the @len bytes at @buf to the right by @spaces positions and
 * fill the gap with spaces, never touching memory at or beyond @end.
 * Bytes that would be pushed past @end are dropped.
 */
static void move_right(char *buf, char *end, unsigned len, unsigned spaces)
{
        size_t room, tail;

        if (end <= buf)        /* nowhere to put anything */
                return;

        room = end - buf;
        if (spaces >= room) {
                /* the padding alone fills everything that is left */
                memset(buf, ' ', room);
                return;
        }

        /* room left for the moved text once the padding is in place */
        tail = room - spaces;
        if (len > tail)
                len = tail;
        if (len != 0)
                memmove(buf + spaces, buf, len);
        memset(buf, ' ', spaces);
}

/*
 * Pad an already emitted string out to the requested field width.
 * @buf: current buffer position (just past the string)
 * @n: length of string
 * @end: end of output buffer
 * @spec: for field width and flags
 *
 * With LEFT set the padding is appended after the string; otherwise the
 * string is shifted right and the padding is placed in front of it.
 * Returns: new buffer position after padding.
 */
static noinline_for_stack
char *widen_string(char *buf, int n, char *end, struct printf_spec spec)
{
        unsigned pad, k;

        if (likely(n >= spec.field_width))
                return buf;

        /* we want to pad the sucker */
        pad = spec.field_width - n;

        if (spec.flags & LEFT) {
                for (k = 0; k < pad; k++, buf++) {
                        if (buf < end)
                                *buf = ' ';
                }
                return buf;
        }

        move_right(buf - n, end, n, pad);
        return buf + pad;
}

/*
 * Handle string from a well known address.
 *
 * Copies characters from @s until its NUL terminator or until
 * spec.precision characters have been consumed (a precision of zero
 * copies nothing), then applies field-width padding via widen_string().
 * Positions at or beyond @end are counted but not written.
 */
static char *string_nocheck(char *buf, char *end, const char *s,
                            struct printf_spec spec)
{
        int remaining;
        int copied = 0;

        for (remaining = spec.precision; remaining != 0; remaining--) {
                char ch = s[copied];

                if (ch == '\0')
                        break;
                if (buf < end)
                        *buf = ch;
                buf++;
                copied++;
        }

        return widen_string(buf, copied, end, spec);
}

/*
 * Print an error pointer: by symbolic name when errname() knows the
 * error code, otherwise as a signed decimal number.
 */
static char *err_ptr(char *buf, char *end, void *ptr,
                     struct printf_spec spec)
{
        int err = PTR_ERR(ptr);
        const char *sym = errname(err);

        if (!sym) {
                /*
                 * Somebody passed ERR_PTR(-1234) or some other non-existing
                 * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to
                 * printing it as its decimal representation.
                 */
                spec.flags |= SIGN;
                spec.base = 10;
                return number(buf, end, err, spec);
        }

        return string_nocheck(buf, end, sym, spec);
}

/*
 * Print a short error message in place of a value.
 *
 * Be careful: error messages must fit into the given buffer.
 */
static char *error_string(char *buf, char *end, const char *s,
                          struct printf_spec spec)
{
        /*
         * Hard limit to avoid completely insane messages. It actually
         * works pretty well because most error messages are in
         * the many pointer format modifiers.
         */
        const int default_limit = 2 * sizeof(void *);

        if (spec.precision == -1)
                spec.precision = default_limit;

        return string_nocheck(buf, end, s, spec);
}

/*
 * Classify a pointer before it gets dereferenced: returns a short
 * message for NULL, for addresses below PAGE_SIZE and for error values,
 * or NULL when the pointer passes these checks.
 *
 * Do not call any complex external code here. Nested printk()/vsprintf()
 * might cause infinite loops. Failures might break printk() and would
 * be hard to debug.
 */
static const char *check_pointer_msg(const void *ptr)
{
        const char *msg = NULL;

        if (!ptr)
                msg = "(null)";
        else if ((unsigned long)ptr < PAGE_SIZE || IS_ERR_VALUE(ptr))
                msg = "(efault)";

        return msg;
}

/*
 * Validate @ptr with check_pointer_msg(). For a bad pointer the message
 * is printed at *@buf, *@buf is advanced past it and -EFAULT is
 * returned; for a good pointer nothing is printed and 0 is returned.
 */
static int check_pointer(char **buf, char *end, const void *ptr,
                         struct printf_spec spec)
{
        const char *msg = check_pointer_msg(ptr);

        if (!msg)
                return 0;

        *buf = error_string(*buf, end, msg, spec);
        return -EFAULT;
}

/*
 * Print the string @s after validating the pointer; a bad pointer gets
 * its error message printed instead.
 */
static noinline_for_stack
char *string(char *buf, char *end, const char *s,
             struct printf_spec spec)
{
        int bad = check_pointer(&buf, end, s, spec);

        return bad ? buf : string_nocheck(buf, end, s, spec);
}

/*
 * Print the raw value of @ptr as lower-case hex. When no field width
 * was given (-1), the value is zero-padded to two hex digits per byte
 * of a pointer.
 */
static char *pointer_string(char *buf, char *end,
                            const void *ptr,
                            struct printf_spec spec)
{
        unsigned long int addr = (unsigned long int)ptr;

        if (spec.field_width == -1) {
                spec.field_width = 2 * sizeof(ptr);
                spec.flags |= ZEROPAD;
        }
        spec.flags |= SMALL;
        spec.base = 16;

        return number(buf, end, addr, spec);
}

/*
 * Make pointers available for printing early in the boot sequence.
 *
 * When set, ptr_to_id() hashes pointers with hash_long() instead of the
 * keyed siphash, so it does not have to wait for ptr_key to be filled.
 */
static int debug_boot_weak_hash __ro_after_init;

/*
 * Handler for the "debug_boot_weak_hash" early parameter: turns the
 * weak hash on and logs that it did so. The parameter value in @str is
 * not looked at.
 */
static int __init debug_boot_weak_hash_enable(char *str)
{
        debug_boot_weak_hash = 1;
        pr_info("debug_boot_weak_hash enabled\n");
        return 0;
}
early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable);

/*
 * Key used to hash pointers, and a flag telling readers that the key has
 * been filled with random bytes.
 */
static bool filled_random_ptr_key __read_mostly;
static siphash_key_t ptr_key __read_mostly;

/*
 * Notifier callback that fills ptr_key with random bytes and then
 * publishes that fact through filled_random_ptr_key. The notifier
 * arguments are not used. Always returns NOTIFY_DONE.
 */
static int fill_ptr_key(struct notifier_block *nb, unsigned long action, void *data)
{
        get_random_bytes(&ptr_key, sizeof(ptr_key));

        /* Pairs with smp_rmb() before reading ptr_key. */
        /* The key must be visible before the flag that announces it. */
        smp_wmb();
        WRITE_ONCE(filled_random_ptr_key, true);
        return NOTIFY_DONE;
}

/*
 * Initcall that hands the fill_ptr_key() notifier to
 * execute_with_initialized_rng().
 *
 * NOTE(review): presumably that helper invokes the notifier once the
 * random number generator is initialized (immediately if it already
 * is) - confirm against its definition.
 */
static int __init vsprintf_init_hashval(void)
{
        static struct notifier_block fill_ptr_key_nb = { .notifier_call = fill_ptr_key };
        execute_with_initialized_rng(&fill_ptr_key_nb);
        return 0;
}
subsys_initcall(vsprintf_init_hashval)

/*
 * Maps a pointer to a 32 bit unique identifier.
 *
 * @ptr:         pointer to hash
 * @hashval_out: receives the hash on success; left untouched on failure
 *
 * Returns 0 on success, or -EBUSY while ptr_key has not been filled yet.
 */
static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
        unsigned long hashval;

        if (!READ_ONCE(filled_random_ptr_key))
                return -EBUSY;

        /* Pairs with smp_wmb() after writing ptr_key. */
        smp_rmb();

#ifdef CONFIG_64BIT
        hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);
        /*
         * Mask off the first 32 bits, this makes explicit that we have
         * modified the address (and 32 bits is plenty for a unique ID).
         */
        hashval = hashval & 0xffffffff;
#else
        hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
#endif
        *hashval_out = hashval;
        return 0;
}

/*
 * Non-static entry point for hashing a pointer: stores the hash of @ptr
 * in *@hashval_out and returns 0, or returns -EBUSY while the hashing
 * key is not available yet.
 */
int ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
        int ret = __ptr_to_hashval(ptr, hashval_out);

        return ret;
}

/*
 * Print a hashed identifier in place of the real value of @ptr.
 *
 * NULL and error pointers are printed as they are. If the hash cannot
 * be computed yet, a fixed "(ptrval)"-style placeholder is printed
 * instead.
 */
static char *ptr_to_id(char *buf, char *end, const void *ptr,
                       struct printf_spec spec)
{
        unsigned long id;

        /*
         * Print the real pointer value for NULL and error pointers,
         * as they are not actual addresses.
         */
        if (IS_ERR_OR_NULL(ptr))
                return pointer_string(buf, end, ptr, spec);

        if (unlikely(debug_boot_weak_hash)) {
                /* When debugging early boot use non-cryptographically secure hash. */
                id = hash_long((unsigned long)ptr, 32);
        } else if (__ptr_to_hashval(ptr, &id)) {
                const char *placeholder = sizeof(ptr) == 8 ?
                                          "(____ptrval____)" : "(ptrval)";

                spec.field_width = 2 * sizeof(ptr);
                /* string length must be less than default_width */
                return error_string(buf, end, placeholder, spec);
        }

        return pointer_string(buf, end, (const void *)id, spec);
}

/*
 * Print a pointer the default way.
 *
 * default is to _not_ leak addresses, so hash before printing,
 * unless no_hash_pointers is specified on the command line.
 */
static char *default_pointer(char *buf, char *end, const void *ptr,
                             struct printf_spec spec)
{
        return unlikely(no_hash_pointers) ?
                pointer_string(buf, end, ptr, spec) :
                ptr_to_id(buf, end, ptr, spec);
}

/*
 * Policy for %pK, as interpreted by restricted_pointer():
 *   0          - treat like %p (hashed)
 *   1          - real value only for sufficiently privileged readers,
 *                zeros otherwise
 *   2 or other - always zeros
 */
int kptr_restrict __read_mostly;

/*
 * Print a pointer subject to the kptr_restrict policy above.
 *
 * In mode 1 the real value is printed only when the current task has
 * CAP_SYSLOG and its effective uid/gid equal its real uid/gid; in
 * hardirq, softirq or NMI context the text "pK-error" is printed
 * instead.
 */
static noinline_for_stack
char *restricted_pointer(char *buf, char *end, const void *ptr,
                         struct printf_spec spec)
{
        switch (kptr_restrict) {
        case 0:
                /* Handle as %p, hash and do _not_ leak addresses. */
                return default_pointer(buf, end, ptr, spec);
        case 1: {
                const struct cred *cred;

                /*
                 * kptr_restrict==1 cannot be used in IRQ context
                 * because its test for CAP_SYSLOG would be meaningless.
                 */
                if (in_hardirq() || in_serving_softirq() || in_nmi()) {
                        if (spec.field_width == -1)
                                spec.field_width = 2 * sizeof(ptr);
                        return error_string(buf, end, "pK-error", spec);
                }

                /*
                 * Only print the real pointer value if the current
                 * process has CAP_SYSLOG and is running with the
                 * same credentials it started with. This is because
                 * access to files is checked at open() time, but %pK
                 * checks permission at read() time. We don't want to
                 * leak pointer values if a binary opens a file using
                 * %pK and then elevates privileges before reading it.
                 */
                cred = current_cred();
                if (!has_capability_noaudit(current, CAP_SYSLOG) ||
                    !uid_eq(cred->euid, cred->uid) ||
                    !gid_eq(cred->egid, cred->gid))
                        ptr = NULL;
                break;
        }
        case 2:
        default:
                /* Always print 0's for %pK */
                ptr = NULL;
                break;
        }

        /* ptr is either the real value or NULL at this point */
        return pointer_string(buf, end, ptr, spec);
}

/*
 * Print the last path components of a dentry.
 *
 * @d:   dentry to start from
 * @fmt: fmt[1] of '2', '3' or '4' selects how many trailing components
 *       to print; anything else means one
 *
 * The parent chain is walked under rcu_read_lock(), collecting name
 * pointers from the innermost component outwards. The components are
 * then emitted outermost first, separated by '/', stopping after
 * spec.precision characters when a precision was given. Field-width
 * padding is applied last by widen_string().
 */
static noinline_for_stack
char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec,
                  const char *fmt)
{
        const char *array[4], *s;
        const struct dentry *p;
        int depth;
        int i, n;

        switch (fmt[1]) {
                case '2': case '3': case '4':
                        depth = fmt[1] - '0';
                        break;
                default:
                        depth = 1;
        }

        rcu_read_lock();
        for (i = 0; i < depth; i++, d = p) {
                if (check_pointer(&buf, end, d, spec)) {
                        rcu_read_unlock();
                        return buf;
                }

                p = READ_ONCE(d->d_parent);
                array[i] = READ_ONCE(d->d_name.name);
                /* a dentry that is its own parent ends the walk */
                if (p == d) {
                        /*
                         * Reached after at least one component: use an
                         * empty name so the output starts with '/'.
                         */
                        if (i)
                                array[i] = "";
                        i++;
                        break;
                }
        }
        /* start with the outermost collected component */
        s = array[--i];
        for (n = 0; n != spec.precision; n++, buf++) {
                char c = *s++;
                if (!c) {
                        /* end of the innermost component: done */
                        if (!i)
                                break;
                        c = '/';
                        s = array[--i];
                }
                if (buf < end)
                        *buf = c;
        }
        rcu_read_unlock();
        return widen_string(buf, n, end, spec);
}

/*
 * Print the name of the dentry behind a struct file, after validating
 * the file pointer. @fmt is passed through to dentry_name().
 */
static noinline_for_stack
char *file_dentry_name(char *buf, char *end, const struct file *f,
                        struct printf_spec spec, const char *fmt)
{
        const struct dentry *dentry;

        if (check_pointer(&buf, end, f, spec))
                return buf;

        dentry = f->f_path.dentry;
        return dentry_name(buf, end, dentry, spec, fmt);
}
#ifdef CONFIG_BLOCK
/*
 * Print the name of a block device: the disk name, followed for
 * partitions by the partition number. A 'p' is inserted between the two
 * when the disk name itself ends in a digit.
 */
static noinline_for_stack
char *bdev_name(char *buf, char *end, struct block_device *bdev,
                struct printf_spec spec, const char *fmt)
{
        struct gendisk *disk;

        if (check_pointer(&buf, end, bdev, spec))
                return buf;

        disk = bdev->bd_disk;
        buf = string(buf, end, disk->disk_name, spec);
        if (!bdev_is_partition(bdev))
                return buf;

        /* keep the disk's trailing digit apart from the partition number */
        if (isdigit(disk->disk_name[strlen(disk->disk_name)-1])) {
                if (buf < end)
                        *buf = 'p';
                buf++;
        }

        return number(buf, end, bdev_partno(bdev), spec);
}
#endif

/*
 * Print an address symbolically.
 *
 * @ptr: address to resolve
 * @fmt: format characters selecting the variant, as tested below:
 *       "Bb"          - sprint_backtrace_build_id()
 *       "B"           - sprint_backtrace()
 *       "Sb" / "SRb"  - sprint_symbol_build_id()
 *       's'           - sprint_symbol_no_offset()
 *       anything else - sprint_symbol()
 *       fmt[1] == 'R' additionally passes the address through
 *       __builtin_extract_return_addr() first.
 *
 * Without CONFIG_KALLSYMS the address is printed as a hex number
 * instead.
 */
static noinline_for_stack
char *symbol_string(char *buf, char *end, void *ptr,
                    struct printf_spec spec, const char *fmt)
{
        unsigned long value;
#ifdef CONFIG_KALLSYMS
        char sym[KSYM_SYMBOL_LEN];
#endif

        if (fmt[1] == 'R')
                ptr = __builtin_extract_return_addr(ptr);
        value = (unsigned long)ptr;

#ifdef CONFIG_KALLSYMS
        if (*fmt == 'B' && fmt[1] == 'b')
                sprint_backtrace_build_id(sym, value);
        else if (*fmt == 'B')
                sprint_backtrace(sym, value);
        else if (*fmt == 'S' && (fmt[1] == 'b' || (fmt[1] == 'R' && fmt[2] == 'b')))
                sprint_symbol_build_id(sym, value);
        else if (*fmt != 's')
                sprint_symbol(sym, value);
        else
                sprint_symbol_no_offset(sym, value);

        return string_nocheck(buf, end, sym, spec);
#else
        return special_hex_number(buf, end, value, sizeof(void *));
#endif
}

/*
 * Ready-made conversion specifications shared by the helpers in this
 * file. A field_width or precision of -1 means "not specified".
 */

/* String with no width and no precision limit. */
static const struct printf_spec default_str_spec = {
        .field_width = -1,
        .precision = -1,
};

/* Lower-case hex with "0x" prefix, used for flag words. */
static const struct printf_spec default_flag_spec = {
        .base = 16,
        .precision = -1,
        .flags = SPECIAL | SMALL,
};

/* Plain decimal, no padding. */
static const struct printf_spec default_dec_spec = {
        .base = 10,
        .precision = -1,
};

/* Decimal, zero-padded to two digits. */
static const struct printf_spec default_dec02_spec = {
        .base = 10,
        .field_width = 2,
        .precision = -1,
        .flags = ZEROPAD,
};

/* Decimal, zero-padded to four digits. */
static const struct printf_spec default_dec04_spec = {
        .base = 10,
        .field_width = 4,
        .precision = -1,
        .flags = ZEROPAD,
};

/*
 * Print a struct resource as "[<type> <range><flags>]".
 *
 * @res: resource to print
 * @fmt: fmt[0] == 'R' selects decoded flags (" 64bit", " pref",
 *       " window", " disabled"); otherwise the raw flags word is
 *       printed as " flags 0x...".
 *
 * The text is first assembled in the on-stack buffer sym[], whose size
 * is derived from the longest decoded and raw forms, and then emitted
 * through string_nocheck() so that the caller's width and precision
 * apply to the whole string.
 */
static noinline_for_stack
char *resource_string(char *buf, char *end, struct resource *res,
                      struct printf_spec spec, const char *fmt)
{
#ifndef IO_RSRC_PRINTK_SIZE
#define IO_RSRC_PRINTK_SIZE        6
#endif

#ifndef MEM_RSRC_PRINTK_SIZE
#define MEM_RSRC_PRINTK_SIZE        10
#endif
        /* per-type number formats for the start/end values */
        static const struct printf_spec io_spec = {
                .base = 16,
                .field_width = IO_RSRC_PRINTK_SIZE,
                .precision = -1,
                .flags = SPECIAL | SMALL | ZEROPAD,
        };
        static const struct printf_spec mem_spec = {
                .base = 16,
                .field_width = MEM_RSRC_PRINTK_SIZE,
                .precision = -1,
                .flags = SPECIAL | SMALL | ZEROPAD,
        };
        static const struct printf_spec bus_spec = {
                .base = 16,
                .field_width = 2,
                .precision = -1,
                .flags = SMALL | ZEROPAD,
        };
        /* format for the fixed keywords; each is at most 10 characters */
        static const struct printf_spec str_spec = {
                .field_width = -1,
                .precision = 10,
                .flags = LEFT,
        };

        /* 32-bit res (sizeof==4): 10 chars in dec, 10 in hex ("0x" + 8)
         * 64-bit res (sizeof==8): 20 chars in dec, 18 in hex ("0x" + 16) */
#define RSRC_BUF_SIZE                ((2 * sizeof(resource_size_t)) + 4)
#define FLAG_BUF_SIZE                (2 * sizeof(res->flags))
#define DECODED_BUF_SIZE        sizeof("[mem - 64bit pref window disabled]")
#define RAW_BUF_SIZE                sizeof("[mem - flags 0x]")
        char sym[max(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE,
                     2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)];

        char *p = sym, *pend = sym + sizeof(sym);
        int decode = (fmt[0] == 'R') ? 1 : 0;
        const struct printf_spec *specp;

        if (check_pointer(&buf, end, res, spec))
                return buf;

        *p++ = '[';
        /* resource type keyword, and the number format that goes with it */
        if (res->flags & IORESOURCE_IO) {
                p = string_nocheck(p, pend, "io  ", str_spec);
                specp = &io_spec;
        } else if (res->flags & IORESOURCE_MEM) {
                p = string_nocheck(p, pend, "mem ", str_spec);
                specp = &mem_spec;
        } else if (res->flags & IORESOURCE_IRQ) {
                p = string_nocheck(p, pend, "irq ", str_spec);
                specp = &default_dec_spec;
        } else if (res->flags & IORESOURCE_DMA) {
                p = string_nocheck(p, pend, "dma ", str_spec);
                specp = &default_dec_spec;
        } else if (res->flags & IORESOURCE_BUS) {
                p = string_nocheck(p, pend, "bus ", str_spec);
                specp = &bus_spec;
        } else {
                /* unknown type: always fall back to the raw flags form */
                p = string_nocheck(p, pend, "??? ", str_spec);
                specp = &mem_spec;
                decode = 0;
        }
        if (decode && res->flags & IORESOURCE_UNSET) {
                /* in decoded form an unset resource shows its size only */
                p = string_nocheck(p, pend, "size ", str_spec);
                p = number(p, pend, resource_size(res), *specp);
        } else {
                p = number(p, pend, res->start, *specp);
                /* a single-value resource is printed without "-end" */
                if (res->start != res->end) {
                        *p++ = '-';
                        p = number(p, pend, res->end, *specp);
                }
        }
        if (decode) {
                if (res->flags & IORESOURCE_MEM_64)
                        p = string_nocheck(p, pend, " 64bit", str_spec);
                if (res->flags & IORESOURCE_PREFETCH)
                        p = string_nocheck(p, pend, " pref", str_spec);
                if (res->flags & IORESOURCE_WINDOW)
                        p = string_nocheck(p, pend, " window", str_spec);
                if (res->flags & IORESOURCE_DISABLED)
                        p = string_nocheck(p, pend, " disabled", str_spec);
        } else {
                p = string_nocheck(p, pend, " flags ", str_spec);
                p = number(p, pend, res->flags, default_flag_spec);
        }
        *p++ = ']';
        *p = '\0';

        return string_nocheck(buf, end, sym, spec);
}

/*
 * Print a small buffer as hex bytes.
 *
 * @addr: bytes to dump
 * @spec: spec.field_width is the number of bytes to print, capped at
 *        64; a width of 0 prints nothing and a negative width prints a
 *        single byte
 * @fmt:  fmt[1] selects the separator between bytes: 'C' for ':',
 *        'D' for '-', 'N' for none, anything else for a space
 */
static noinline_for_stack
char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
                 const char *fmt)
{
        /* if we pass '%ph[CDN]', field width remains
           negative value, fallback to the default */
        int count = 1;
        int idx;
        char sep;

        if (spec.field_width == 0)
                /* nothing to print */
                return buf;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'C')
                sep = ':';
        else if (fmt[1] == 'D')
                sep = '-';
        else if (fmt[1] == 'N')
                sep = 0;
        else
                sep = ' ';

        if (spec.field_width > 0)
                count = min_t(int, spec.field_width, 64);

        for (idx = 0; idx < count; idx++) {
                /* a separator goes in front of every byte but the first */
                if (sep && idx != 0) {
                        if (buf < end)
                                *buf = sep;
                        ++buf;
                }

                if (buf < end)
                        *buf = hex_asc_hi(addr[idx]);
                ++buf;
                if (buf < end)
                        *buf = hex_asc_lo(addr[idx]);
                ++buf;
        }

        return buf;
}

/*
 * Print @bitmap as comma-separated hexadecimal chunks of up to 32 bits
 * each, starting with the chunk that holds the highest bit positions.
 *
 * The number of bits is taken from spec.field_width; a negative width is
 * treated as zero bits, in which case nothing is emitted.  @fmt is unused.
 */
static noinline_for_stack
char *bitmap_string(char *buf, char *end, const unsigned long *bitmap,
                    struct printf_spec spec, const char *fmt)
{
        const int CHUNKSZ = 32;
        int nr_bits = max_t(int, spec.field_width, 0);
        int start, width;
        bool need_comma = false;

        if (check_pointer(&buf, end, bitmap, spec))
                return buf;

        /* from here on spec only describes how each chunk is printed */
        spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 };

        /* only the first chunk printed can be narrower than CHUNKSZ */
        width = nr_bits & (CHUNKSZ - 1);
        if (!width)
                width = CHUNKSZ;

        for (start = ALIGN(nr_bits, CHUNKSZ) - CHUNKSZ; start >= 0;
             start -= CHUNKSZ) {
                u32 chunkmask = ((1ULL << width) - 1);
                u32 val = (bitmap[start / BITS_PER_LONG] >>
                           (start % BITS_PER_LONG)) & chunkmask;

                if (need_comma) {
                        if (buf < end)
                                *buf = ',';
                        buf++;
                }
                need_comma = true;

                /* one hex digit per four bits of the chunk */
                spec.field_width = DIV_ROUND_UP(width, 4);
                buf = number(buf, end, val, spec);

                width = CHUNKSZ;
        }
        return buf;
}

static noinline_for_stack
char *bitmap_list_string(char *buf, char *end, const unsigned long *bitmap,
                         struct printf_spec spec, const char *fmt)
{
        int nr_bits = max_t(int, spec.field_width, 0);
        bool first = true;
        int rbot, rtop;

        if (check_pointer(&buf, end, bitmap, spec))
                return buf;

        for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) {
                if (!first) {
                        if (buf < end)
                                *buf = ',';
                        buf++;
                }
                first = false;

                buf = number(buf, end, rbot, default_dec_spec);
                if (rtop == rbot + 1)
                        continue;

                if (buf < end)
                        *buf = '-';
                buf = number(++buf, end, rtop - 1, default_dec_spec);
        }
        return buf;
}

/*
 * Format the six bytes at @addr as lower-case hex pairs.
 *
 * fmt[0] == 'M' inserts a separator between the pairs; any other value
 * prints them back to back.  fmt[1] == 'F' selects '-' as separator instead
 * of ':', and fmt[1] == 'R' prints the bytes in reverse order.
 */
static noinline_for_stack
char *mac_address_string(char *buf, char *end, u8 *addr,
                         struct printf_spec spec, const char *fmt)
{
        char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")];
        char *out = mac_addr;
        bool reversed, with_sep;
        char separator;
        int n;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        reversed = (fmt[1] == 'R');
        separator = (fmt[1] == 'F') ? '-' : ':';
        with_sep = (fmt[0] == 'M');

        for (n = 0; n < 6; n++) {
                out = hex_byte_pack(out, addr[reversed ? 5 - n : n]);

                if (with_sep && n != 5)
                        *out++ = separator;
        }
        *out = '\0';

        return string_nocheck(buf, end, mac_addr, spec);
}

static noinline_for_stack
char *ip4_string(char *p, const u8 *addr, const char *fmt)
{
        int i;
        bool leading_zeros = (fmt[0] == 'i');
        int index;
        int step;

        switch (fmt[2]) {
        case 'h':
#ifdef __BIG_ENDIAN
                index = 0;
                step = 1;
#else
                index = 3;
                step = -1;
#endif
                break;
        case 'l':
                index = 3;
                step = -1;
                break;
        case 'n':
        case 'b':
        default:
                index = 0;
                step = 1;
                break;
        }
        for (i = 0; i < 4; i++) {
                char temp[4] __aligned(2);        /* hold each IP quad in reverse order */
                int digits = put_dec_trunc8(temp, addr[index]) - temp;
                if (leading_zeros) {
                        if (digits < 3)
                                *p++ = '0';
                        if (digits < 2)
                                *p++ = '0';
                }
                /* reverse the digits in the quad */
                while (digits--)
                        *p++ = temp[digits];
                if (i < 3)
                        *p++ = '.';
                index += step;
        }
        *p = '\0';

        return p;
}

/*
 * Write the 16-byte IPv6 address at @addr to @p in compressed form: each
 * 16-bit group is printed without leading zeros and the longest run of two
 * or more zero groups is collapsed to "::".  When the address is
 * v4-mapped or ISATAP, only the first six groups are handled that way and
 * the last four bytes are appended in dotted-decimal notation.
 *
 * The result is NUL-terminated; the return value points at the NUL.  No
 * bounds checking is done here.
 */
static noinline_for_stack
char *ip6_compressed_string(char *p, const char *addr)
{
        unsigned char zerolength[8] = { 0 };
        struct in6_addr in6;
        bool needcolon = false;
        bool useIPv4;
        int longest = 1;
        int colonpos = -1;
        int range, i, j;

        memcpy(&in6, addr, sizeof(struct in6_addr));

        useIPv4 = ipv6_addr_v4mapped(&in6) || ipv6_addr_is_isatap(&in6);
        range = useIPv4 ? 6 : 8;

        /* zerolength[i] = number of consecutive zero groups starting at i */
        for (i = 0; i < range; i++) {
                j = i;
                while (j < range && in6.s6_addr16[j] == 0)
                        j++;
                zerolength[i] = j - i;
        }

        /* pick the first run that is strictly longer than all before it */
        for (i = 0; i < range; i++) {
                if (zerolength[i] > longest) {
                        longest = zerolength[i];
                        colonpos = i;
                }
        }
        if (longest == 1)                /* don't compress a single 0 */
                colonpos = -1;

        /* emit address */
        i = 0;
        while (i < range) {
                u16 word;
                u8 hi, lo;

                if (i == colonpos) {
                        if (needcolon || i == 0)
                                *p++ = ':';
                        *p++ = ':';
                        needcolon = false;
                        /* jump over the whole collapsed run */
                        i += longest;
                        continue;
                }

                if (needcolon)
                        *p++ = ':';

                /* hex u16 without leading 0s */
                word = ntohs(in6.s6_addr16[i]);
                hi = word >> 8;
                lo = word & 0xff;
                if (hi > 0x0f) {
                        p = hex_byte_pack(p, hi);
                        p = hex_byte_pack(p, lo);
                } else if (hi) {
                        *p++ = hex_asc_lo(hi);
                        p = hex_byte_pack(p, lo);
                } else if (lo > 0x0f) {
                        p = hex_byte_pack(p, lo);
                } else {
                        *p++ = hex_asc_lo(lo);
                }

                needcolon = true;
                i++;
        }

        if (useIPv4) {
                if (needcolon)
                        *p++ = ':';
                p = ip4_string(p, &in6.s6_addr[12], "I4");
        }
        *p = '\0';

        return p;
}

static noinline_for_stack
char *ip6_string(char *p, const char *addr, const char *fmt)
{
        int i;

        for (i = 0; i < 8; i++) {
                p = hex_byte_pack(p, *addr++);
                p = hex_byte_pack(p, *addr++);
                if (fmt[0] == 'I' && i != 7)
                        *p++ = ':';
        }
        *p = '\0';

        return p;
}

static noinline_for_stack
char *ip6_addr_string(char *buf, char *end, const u8 *addr,
                      struct printf_spec spec, const char *fmt)
{
        char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")];

        if (fmt[0] == 'I' && fmt[2] == 'c')
                ip6_compressed_string(ip6_addr, addr);
        else
                ip6_string(ip6_addr, addr, fmt);

        return string_nocheck(buf, end, ip6_addr, spec);
}

/*
 * Render the IPv4 address at @addr with ip4_string() into a local buffer,
 * then emit that text through string_nocheck() using @spec.
 */
static noinline_for_stack
char *ip4_addr_string(char *buf, char *end, const u8 *addr,
                      struct printf_spec spec, const char *fmt)
{
        /* large enough for the longest dotted quad plus the NUL */
        char text[sizeof("255.255.255.255")];

        ip4_string(text, addr, fmt);

        return string_nocheck(buf, end, text, spec);
}

/*
 * Format the address held in a struct sockaddr_in6, optionally followed by
 * port, flow info and scope id.
 *
 * The first two characters of @fmt are skipped; the alphabetic characters
 * after them are option letters:
 *   'p' - append ":<port>"
 *   'f' - append "/<flowinfo>"
 *   's' - append "%<scope id>"
 *   'c' - use the compressed address form (only honoured when fmt[0] is 'I')
 * Unknown letters are ignored.  If any of 'p', 'f' or 's' is given, the
 * address itself is wrapped in square brackets.
 */
static noinline_for_stack
char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa,
                         struct printf_spec spec, const char *fmt)
{
        char ip6_addr[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") +
                      sizeof(":12345") + sizeof("/123456789") +
                      sizeof("%1234567890")];
        char *p = ip6_addr, *pend = ip6_addr + sizeof(ip6_addr);
        const u8 *addr = (const u8 *) &sa->sin6_addr;
        char fmt6[2] = { fmt[0], '6' };
        bool have_p = false, have_s = false, have_f = false, have_c = false;
        bool bracketed;

        for (fmt += 2; isalpha(*fmt); fmt++) {
                if (*fmt == 'p')
                        have_p = true;
                else if (*fmt == 'f')
                        have_f = true;
                else if (*fmt == 's')
                        have_s = true;
                else if (*fmt == 'c')
                        have_c = true;
        }

        /* any trailing component requires the address to be bracketed */
        bracketed = have_p || have_s || have_f;
        if (bracketed)
                *p++ = '[';

        if (fmt6[0] == 'I' && have_c)
                p = ip6_compressed_string(p, addr);
        else
                p = ip6_string(p, addr, fmt6);

        if (bracketed)
                *p++ = ']';

        if (have_p) {
                *p++ = ':';
                p = number(p, pend, ntohs(sa->sin6_port), spec);
        }
        if (have_f) {
                *p++ = '/';
                p = number(p, pend, ntohl(sa->sin6_flowinfo &
                                          IPV6_FLOWINFO_MASK), spec);
        }
        if (have_s) {
                *p++ = '%';
                p = number(p, pend, sa->sin6_scope_id, spec);
        }
        *p = '\0';

        return string_nocheck(buf, end, ip6_addr, spec);
}

/*
 * Format the address held in a struct sockaddr_in, optionally followed by
 * ":<port>".
 *
 * The first two characters of @fmt are skipped; the alphabetic characters
 * after them are option letters.  'p' requests the port, while 'h', 'l',
 * 'n' and 'b' are forwarded to ip4_string() as its byte-order modifier
 * (the last one seen wins).  Unknown letters are ignored.
 */
static noinline_for_stack
char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa,
                         struct printf_spec spec, const char *fmt)
{
        char *p, ip4_addr[sizeof("255.255.255.255") + sizeof(":12345")];
        char *pend = ip4_addr + sizeof(ip4_addr);
        const u8 *addr = (const u8 *) &sa->sin_addr.s_addr;
        char fmt4[3] = { fmt[0], '4', 0 };
        bool have_p = false;

        for (fmt += 2; isalpha(*fmt); fmt++) {
                if (*fmt == 'p')
                        have_p = true;
                else if (*fmt == 'h' || *fmt == 'l' ||
                         *fmt == 'n' || *fmt == 'b')
                        fmt4[2] = *fmt;
        }

        p = ip4_string(ip4_addr, addr, fmt4);
        if (have_p) {
                *p++ = ':';
                p = number(p, pend, ntohs(sa->sin_port), spec);
        }
        *p = '\0';

        return string_nocheck(buf, end, ip4_addr, spec);
}

/*
 * Entry point for IP address formatting; dispatches on fmt[1]:
 *   '6' - @ptr is a raw IPv6 address
 *   '4' - @ptr is a raw IPv4 address
 *   'S' - @ptr is a socket address; sa_family picks the v4 or v6 formatter
 *         and any other family yields "(einval)"
 * Any other fmt[1] produces "(%pi?)" or "(%pI?)", depending on fmt[0].
 */
static noinline_for_stack
char *ip_addr_string(char *buf, char *end, const void *ptr,
                     struct printf_spec spec, const char *fmt)
{
        if (check_pointer(&buf, end, ptr, spec))
                return buf;

        if (fmt[1] == '6')
                return ip6_addr_string(buf, end, ptr, spec, fmt);

        if (fmt[1] == '4')
                return ip4_addr_string(buf, end, ptr, spec, fmt);

        if (fmt[1] == 'S') {
                const union {
                        struct sockaddr                raw;
                        struct sockaddr_in        v4;
                        struct sockaddr_in6        v6;
                } *sa = ptr;

                if (sa->raw.sa_family == AF_INET)
                        return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt);
                if (sa->raw.sa_family == AF_INET6)
                        return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt);

                return error_string(buf, end, "(einval)", spec);
        }

        return error_string(buf, end,
                            fmt[0] == 'i' ? "(%pi?)" : "(%pI?)", spec);
}

/*
 * Emit the buffer at @addr with escaping applied by string_escape_mem().
 *
 * The input length is spec.field_width: zero prints nothing and a negative
 * value means a single byte.  The characters following fmt[0] select the
 * escape classes ('a', 'c', 'h', 'n', 'o', 'p', 's'); parsing stops at the
 * first character that is not one of them.  With no class selected,
 * ESCAPE_ANY_NP is used.
 */
static noinline_for_stack
char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
                     const char *fmt)
{
        unsigned int flags = 0;
        const char *opt;
        bool more = true;
        int len;

        if (!spec.field_width)
                return buf;                                /* nothing to print */

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        for (opt = fmt + 1; more; opt++) {
                switch (*opt) {
                case 'a':
                        flags |= ESCAPE_ANY;
                        break;
                case 'c':
                        flags |= ESCAPE_SPECIAL;
                        break;
                case 'h':
                        flags |= ESCAPE_HEX;
                        break;
                case 'n':
                        flags |= ESCAPE_NULL;
                        break;
                case 'o':
                        flags |= ESCAPE_OCTAL;
                        break;
                case 'p':
                        flags |= ESCAPE_NP;
                        break;
                case 's':
                        flags |= ESCAPE_SPACE;
                        break;
                default:
                        more = false;
                        break;
                }
        }

        if (!flags)
                flags = ESCAPE_ANY_NP;

        if (spec.field_width < 0)
                len = 1;
        else
                len = spec.field_width;

        /*
         * string_escape_mem() fills as much of the remaining space as it
         * can and reports the size the complete output would have needed,
         * so buf may end up beyond end.
         */
        buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL);

        return buf;
}

/*
 * Expand the nested format string and argument list carried by @va_fmt
 * into the output buffer.  The argument list is copied first, so the
 * caller's va_list is left untouched.  @fmt is unused.
 */
static char *va_format(char *buf, char *end, struct va_format *va_fmt,
                       struct printf_spec spec, const char *fmt)
{
        va_list args;
        int written;

        if (check_pointer(&buf, end, va_fmt, spec))
                return buf;

        va_copy(args, *va_fmt->va);
        written = vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, args);
        va_end(args);

        return buf + written;
}

/*
 * Format the 16 bytes at @addr as a dash-separated UUID string.
 *
 * fmt[1] selects byte order and case:
 *   'l' - guid_index ordering, lower case
 *   'L' - guid_index ordering, upper case
 *   'B' - uuid_index ordering, upper case
 *   anything else - uuid_index ordering, lower case
 */
static noinline_for_stack
char *uuid_string(char *buf, char *end, const u8 *addr,
                  struct printf_spec spec, const char *fmt)
{
        char uuid[UUID_STRING_LEN + 1];
        const u8 *index = uuid_index;
        char *out = uuid;
        bool uc = false;
        int i;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'L' || fmt[1] == 'B')
                uc = true;
        if (fmt[1] == 'L' || fmt[1] == 'l')
                index = guid_index;

        for (i = 0; i < 16; i++) {
                u8 byte = addr[index[i]];

                if (uc)
                        out = hex_byte_pack_upper(out, byte);
                else
                        out = hex_byte_pack(out, byte);

                /* dashes follow the 4th, 6th, 8th and 10th byte */
                if (i == 3 || i == 5 || i == 7 || i == 9)
                        *out++ = '-';
        }
        *out = 0;

        return string_nocheck(buf, end, uuid, spec);
}

/*
 * Print a netdev-related bit field as a hex number.  Only fmt[1] == 'F'
 * (a netdev_features_t read through @addr) is supported; any other
 * modifier yields "(%pN?)".
 */
static noinline_for_stack
char *netdev_bits(char *buf, char *end, const void *addr,
                  struct printf_spec spec,  const char *fmt)
{
        unsigned long long num;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] != 'F')
                return error_string(buf, end, "(%pN?)", spec);

        num = *(const netdev_features_t *)addr;

        return special_hex_number(buf, end, num, sizeof(netdev_features_t));
}

/*
 * Format a 32-bit four-character code as
 * "<4 chars> <little-endian|big-endian> (<hex value>)".
 *
 * The four characters are the bytes of the value (with bit 31 cleared),
 * lowest byte first; bytes that are not printable ASCII are shown as '.'.
 * Bit 31 of the original value selects the "big-endian" label.  fmt[1] and
 * fmt[2] must both be 'c', otherwise "(%p4?)" is produced.
 */
static noinline_for_stack
char *fourcc_string(char *buf, char *end, const u32 *fourcc,
                    struct printf_spec spec, const char *fmt)
{
        char output[sizeof("0123 little-endian (0x01234567)")];
        const char *endianness;
        char *p = output;
        unsigned int i;
        u32 orig, val;

        if (fmt[1] != 'c' || fmt[2] != 'c')
                return error_string(buf, end, "(%p4?)", spec);

        if (check_pointer(&buf, end, fourcc, spec))
                return buf;

        orig = get_unaligned(fourcc);
        val = orig & ~BIT(31);

        for (i = 0; i < sizeof(u32); i++) {
                unsigned char c = val >> (i * 8);

                /* Print non-control ASCII characters as-is, dot otherwise */
                if (isascii(c) && isprint(c))
                        *p++ = c;
                else
                        *p++ = '.';
        }

        *p++ = ' ';
        endianness = orig & BIT(31) ? "big-endian" : "little-endian";
        strcpy(p, endianness);
        p += strlen(endianness);

        *p++ = ' ';
        *p++ = '(';
        /* the limit leaves room for the closing ')' and the NUL */
        p = special_hex_number(p, output + sizeof(output) - 2, orig, sizeof(u32));
        *p++ = ')';
        *p = '\0';

        return string(buf, end, output, spec);
}

/*
 * Print an address value read through @addr as a hex number.
 * fmt[1] == 'd' reads a dma_addr_t; 'p' and every other modifier read a
 * phys_addr_t.
 */
static noinline_for_stack
char *address_val(char *buf, char *end, const void *addr,
                  struct printf_spec spec, const char *fmt)
{
        unsigned long long num;
        int size;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'd') {
                num = *(const dma_addr_t *)addr;
                size = sizeof(dma_addr_t);
        } else {
                num = *(const phys_addr_t *)addr;
                size = sizeof(phys_addr_t);
        }

        return special_hex_number(buf, end, num, size);
}

/*
 * Emit the date part of @tm as "<year>-<mon>-<mday>".
 *
 * Unless @r (raw) is set, 1900 is added to tm_year and 1 to tm_mon before
 * printing.
 */
static noinline_for_stack
char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r)
{
        int year = tm->tm_year;
        int mon = tm->tm_mon;

        if (!r) {
                year += 1900;
                mon += 1;
        }

        buf = number(buf, end, year, default_dec04_spec);
        if (buf < end)
                *buf = '-';
        ++buf;

        buf = number(buf, end, mon, default_dec02_spec);
        if (buf < end)
                *buf = '-';
        ++buf;

        buf = number(buf, end, tm->tm_mday, default_dec02_spec);

        return buf;
}

/*
 * Emit the time part of @tm as "<hour>:<min>:<sec>".  @r is accepted for
 * symmetry with date_str() but has no effect here.
 */
static noinline_for_stack
char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r)
{
        /* hours */
        buf = number(buf, end, tm->tm_hour, default_dec02_spec);
        if (buf < end)
                *buf = ':';
        ++buf;

        /* minutes */
        buf = number(buf, end, tm->tm_min, default_dec02_spec);
        if (buf < end)
                *buf = ':';
        ++buf;

        /* seconds */
        buf = number(buf, end, tm->tm_sec, default_dec02_spec);

        return buf;
}

/*
 * Format a struct rtc_time as date, time, or both.
 *
 * Modifiers are read starting at fmt[2]:
 *   'd' - date only, or 't' - time only (at most one of these, first)
 *   'r' - raw: print tm_year/tm_mon without the 1900/1 adjustment
 *   's' - separate date and time with ' ' instead of 'T'
 * 'r' and 's' may appear in any order and parsing stops at the first
 * character that is neither.
 */
static noinline_for_stack
char *rtc_str(char *buf, char *end, const struct rtc_time *tm,
              struct printf_spec spec, const char *fmt)
{
        bool have_t = true, have_d = true;
        bool raw = false, iso8601_separator = true;
        const char *opt = fmt + 2;

        if (check_pointer(&buf, end, tm, spec))
                return buf;

        if (*opt == 'd') {
                have_t = false;
                opt++;
        } else if (*opt == 't') {
                have_d = false;
                opt++;
        }

        for (;; opt++) {
                if (*opt == 'r')
                        raw = true;
                else if (*opt == 's')
                        iso8601_separator = false;
                else
                        break;
        }

        if (have_d)
                buf = date_str(buf, end, tm, raw);

        /* the separator is only needed when both halves are printed */
        if (have_d && have_t) {
                if (buf < end)
                        *buf = iso8601_separator ? 'T' : ' ';
                buf++;
        }

        if (have_t)
                buf = time_str(buf, end, tm, raw);

        return buf;
}

static noinline_for_stack
char *time64_str(char *buf, char *end, const time64_t time,
                 struct printf_spec spec, const char *fmt)
{
        struct rtc_time rtc_time;
        struct tm tm;

        time64_to_tm(time, 0, &tm);

        rtc_time.tm_sec = tm.tm_sec;
        rtc_time.tm_min = tm.tm_min;
        rtc_time.tm_hour = tm.tm_hour;
        rtc_time.tm_mday = tm.tm_mday;
        rtc_time.tm_mon = tm.tm_mon;
        rtc_time.tm_year = tm.tm_year;
        rtc_time.tm_wday = tm.tm_wday;
        rtc_time.tm_yday = tm.tm_yday;

        rtc_time.tm_isdst = 0;

        return rtc_str(buf, end, &rtc_time, spec, fmt);
}

/*
 * Dispatch time/date formatting on fmt[1]:
 *   'R' - @ptr points to a struct rtc_time
 *   'T' - @ptr points to a time64_t
 * Any other modifier yields "(%pt?)".
 */
static noinline_for_stack
char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec,
                    const char *fmt)
{
        switch (fmt[1]) {
        case 'R':
                /* rtc_str() runs check_pointer() on the pointer itself */
                return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt);
        case 'T':
                /*
                 * The time64_t is read right here, before any helper gets
                 * a chance to look at the pointer, so it has to be checked
                 * first, the same way the other pointer formatters do.
                 */
                if (check_pointer(&buf, end, ptr, spec))
                        return buf;
                return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt);
        default:
                return error_string(buf, end, "(%pt?)", spec);
        }
}

/*
 * Format a struct clk pointer.
 *
 * Without CONFIG_HAVE_CLK the result is "(%pC?)".  Otherwise the clock's
 * name is printed when CONFIG_COMMON_CLK is set, and ptr_to_id() is used
 * when it is not.  The only modifier, 'n', selects the same output as the
 * default, so @fmt does not influence the result.
 */
static noinline_for_stack
char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
            const char *fmt)
{
        if (!IS_ENABLED(CONFIG_HAVE_CLK))
                return error_string(buf, end, "(%pC?)", spec);

        if (check_pointer(&buf, end, clk, spec))
                return buf;

#ifdef CONFIG_COMMON_CLK
        return string(buf, end, __clk_get_name(clk), spec);
#else
        return ptr_to_id(buf, end, clk, spec);
#endif
}

/*
 * Decode @flags using the @names table, which is terminated by an entry
 * whose name is NULL.
 *
 * Each entry whose mask is fully contained in the remaining flags has its
 * name printed and its bits cleared; names are joined with '|'.  Bits left
 * over once the table is exhausted are printed as a number.
 */
static
char *format_flags(char *buf, char *end, unsigned long flags,
                                        const struct trace_print_flags *names)
{
        const struct trace_print_flags *entry;

        for (entry = names; flags && entry->name; entry++) {
                unsigned long mask = entry->mask;

                if ((flags & mask) == mask) {
                        buf = string(buf, end, entry->name, default_str_spec);

                        flags &= ~mask;
                        /* more output follows only if bits remain */
                        if (flags) {
                                if (buf < end)
                                        *buf = '|';
                                buf++;
                        }
                }
        }

        if (flags)
                buf = number(buf, end, flags, default_flag_spec);

        return buf;
}

/*
 * Describes one numeric field stored in the page flags word next to the
 * individual flag bits.
 */
struct page_flags_fields {
        int width;                        /* zero means the field is skipped */
        int shift;                        /* right shift that brings the field to bit 0 */
        int mask;                        /* applied after shifting */
        const struct printf_spec *spec;        /* how the value is printed */
        const char *name;                /* label printed before '=' */
};

/* Fields printed by format_page_flags(), in output order. */
static const struct page_flags_fields pff[] = {
        {
                .width = SECTIONS_WIDTH,
                .shift = SECTIONS_PGSHIFT,
                .mask  = SECTIONS_MASK,
                .spec  = &default_dec_spec,
                .name  = "section",
        },
        {
                .width = NODES_WIDTH,
                .shift = NODES_PGSHIFT,
                .mask  = NODES_MASK,
                .spec  = &default_dec_spec,
                .name  = "node",
        },
        {
                .width = ZONES_WIDTH,
                .shift = ZONES_PGSHIFT,
                .mask  = ZONES_MASK,
                .spec  = &default_dec_spec,
                .name  = "zone",
        },
        {
                .width = LAST_CPUPID_WIDTH,
                .shift = LAST_CPUPID_PGSHIFT,
                .mask  = LAST_CPUPID_MASK,
                .spec  = &default_flag_spec,
                .name  = "lastcpupid",
        },
        {
                .width = KASAN_TAG_WIDTH,
                .shift = KASAN_TAG_PGSHIFT,
                .mask  = KASAN_TAG_MASK,
                .spec  = &default_flag_spec,
                .name  = "kasantag",
        },
};

static
char *format_page_flags(char *buf, char *end, unsigned long flags)
{
        unsigned long main_flags = flags & PAGEFLAGS_MASK;
        bool append = false;
        int i;

        buf = number(buf, end, flags, default_flag_spec);
        if (buf < end)
                *buf = '(';
        buf++;

        /* Page flags from the main area. */
        if (main_flags) {
                buf = format_flags(buf, end, main_flags, pageflag_names);
                append = true;
        }

        /* Page flags from the fields area */
        for (i = 0; i < ARRAY_SIZE(pff); i++) {
                /* Skip undefined fields. */
                if (!pff[i].width)
                        continue;

                /* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */
                if (append) {
                        if (buf < end)
                                *buf = '|';
                        buf++;
                }

                buf = string(buf, end, pff[i].name, default_str_spec);
                if (buf < end)
                        *buf = '=';
                buf++;
                buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask,
                             *pff[i].spec);

                append = true;
        }
        if (buf < end)
                *buf = ')';
        buf++;

        return buf;
}

static
char *format_page_type(char *buf, char *end, unsigned int page_type)
{
        buf = number(buf, end, page_type, default_flag_spec);

        if (buf < end)
                *buf = '(';
        buf++;

        if (page_type_has_type(page_type))
                buf = format_flags(buf, end, ~page_type, pagetype_names);

        if (buf < end)
                *buf = ')';
        buf++;

        return buf;
}

/*
 * Decode a flags value read through @flags_ptr; fmt[1] selects its kind:
 *   'p' - page flags (unsigned long), via format_page_flags()
 *   't' - page type (unsigned int), via format_page_type()
 *   'v' - unsigned long decoded with vmaflag_names
 *   'g' - gfp_t decoded with gfpflag_names
 * Any other modifier yields "(%pG?)".
 */
static noinline_for_stack
char *flags_string(char *buf, char *end, void *flags_ptr,
                   struct printf_spec spec, const char *fmt)
{
        if (check_pointer(&buf, end, flags_ptr, spec))
                return buf;

        if (fmt[1] == 'p')
                return format_page_flags(buf, end, *(unsigned long *)flags_ptr);

        if (fmt[1] == 't')
                return format_page_type(buf, end, *(unsigned int *)flags_ptr);

        if (fmt[1] == 'v')
                return format_flags(buf, end, *(unsigned long *)flags_ptr,
                                    vmaflag_names);

        if (fmt[1] == 'g')
                return format_flags(buf, end,
                                    (__force unsigned long)(*(gfp_t *)flags_ptr),
                                    gfpflag_names);

        return error_string(buf, end, "(%pG?)", spec);
}

/*
 * Print the full name of @fwnode: for every node from the most distant
 * ancestor down to @fwnode itself, its name prefix followed by its name.
 */
static noinline_for_stack
char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf,
                              char *end)
{
        int depth = fwnode_count_parents(fwnode);

        /* Walk from the root node (largest depth) to the node itself (0). */
        while (depth >= 0) {
                struct fwnode_handle *node;

                /*
                 * Only get a reference for other nodes (i.e. parent nodes).
                 * fwnode refcount may be 0 here.
                 */
                if (depth)
                        node = fwnode_get_nth_parent(fwnode, depth);
                else
                        node = fwnode;

                buf = string(buf, end, fwnode_get_name_prefix(node),
                             default_str_spec);
                buf = string(buf, end, fwnode_get_name(node),
                             default_str_spec);

                /* drop the reference taken for a parent node above */
                if (depth)
                        fwnode_handle_put(node);

                depth--;
        }

        return buf;
}

/*
 * Format a struct device_node.
 *
 * fmt[0] must be 'F', otherwise "(%pO?)" is produced; without CONFIG_OF
 * the result is "(%pOF?)".  The characters after the 'F' select what is
 * printed, one item per character, with ':' between items:
 *   'f' - full name            'n' - name (up to, not including, '@')
 *   'p' - phandle              'P' - path-spec
 *   'F' - flags                'c' - first "compatible" string
 *   'C' - all "compatible" strings, each quoted, joined with ','
 * If no selector follows, or the first following character is not a
 * selector, the full name is printed.
 *
 * Individual items ignore spec.field_width; the width is applied once to
 * the combined output through widen_string().
 */
static noinline_for_stack
char *device_node_string(char *buf, char *end, struct device_node *dn,
                         struct printf_spec spec, const char *fmt)
{
        char tbuf[sizeof("xxxx") + 1];
        const char *p;
        int ret;
        char *buf_start = buf;
        struct property *prop;
        bool has_mult, pass;

        /* per-item spec: same as the caller's, but without a field width */
        struct printf_spec str_spec = spec;
        str_spec.field_width = -1;

        if (fmt[0] != 'F')
                return error_string(buf, end, "(%pO?)", spec);

        if (!IS_ENABLED(CONFIG_OF))
                return error_string(buf, end, "(%pOF?)", spec);

        if (check_pointer(&buf, end, dn, spec))
                return buf;

        /* simple case without any more format specifiers */
        fmt++;
        if (fmt[0] == '\0' || strcspn(fmt,"fnpPFcC") > 0)
                fmt = "f";

        /* one iteration per selector; stops at the first non-selector */
        for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) {
                int precision;
                /* every item after the first is preceded by ':' */
                if (pass) {
                        if (buf < end)
                                *buf = ':';
                        buf++;
                }

                switch (*fmt) {
                case 'f':        /* full_name */
                        buf = fwnode_full_name_string(of_fwnode_handle(dn), buf,
                                                      end);
                        break;
                case 'n':        /* name */
                        p = fwnode_get_name(of_fwnode_handle(dn));
                        /* temporarily limit the precision to the part before '@' */
                        precision = str_spec.precision;
                        str_spec.precision = strchrnul(p, '@') - p;
                        buf = string(buf, end, p, str_spec);
                        str_spec.precision = precision;
                        break;
                case 'p':        /* phandle */
                        buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec);
                        break;
                case 'P':        /* path-spec */
                        p = fwnode_get_name(of_fwnode_handle(dn));
                        /* a name with no second character is printed as "/" */
                        if (!p[1])
                                p = "/";
                        buf = string(buf, end, p, str_spec);
                        break;
                case 'F':        /* flags */
                        /* one character per flag, '-' when the flag is clear */
                        tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-';
                        tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-';
                        tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-';
                        tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-';
                        tbuf[4] = 0;
                        buf = string_nocheck(buf, end, tbuf, str_spec);
                        break;
                case 'c':        /* major compatible string */
                        ret = of_property_read_string(dn, "compatible", &p);
                        /* nothing is printed if the property cannot be read */
                        if (!ret)
                                buf = string(buf, end, p, str_spec);
                        break;
                case 'C':        /* full compatible string */
                        has_mult = false;
                        of_property_for_each_string(dn, "compatible", prop, p) {
                                /* entries after the first are preceded by ',' */
                                if (has_mult)
                                        buf = string_nocheck(buf, end, ",", str_spec);
                                buf = string_nocheck(buf, end, "\"", str_spec);
                                buf = string(buf, end, p, str_spec);
                                buf = string_nocheck(buf, end, "\"", str_spec);

                                has_mult = true;
                        }
                        break;
                default:
                        break;
                }
        }

        /* apply the caller's field width to everything emitted above */
        return widen_string(buf, buf - buf_start, end, spec);
}

/*
 * Format a struct fwnode_handle.
 *
 * fmt[0] must be 'w', otherwise "(%pf?)" is produced.  fmt[1] == 'P'
 * prints only the node's own name; 'f' and anything else print the full
 * name.  The caller's field width is applied to the finished output via
 * widen_string() rather than to the name itself.
 */
static noinline_for_stack
char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode,
                    struct printf_spec spec, const char *fmt)
{
        struct printf_spec str_spec = spec;
        char *buf_start = buf;

        str_spec.field_width = -1;

        if (fmt[0] != 'w')
                return error_string(buf, end, "(%pf?)", spec);

        if (check_pointer(&buf, end, fwnode, spec))
                return buf;

        if (fmt[1] == 'P')
                buf = string(buf, end, fwnode_get_name(fwnode), str_spec);
        else
                buf = fwnode_full_name_string(fwnode, buf, end);

        return widen_string(buf, buf - buf_start, end, spec);
}

/*
 * Handler for the "no_hash_pointers" early boot parameter.
 *
 * The first call sets the no_hash_pointers flag and prints a prominent
 * warning banner; any later call is a no-op.  @str is not looked at.
 * Always returns 0.
 */
int __init no_hash_pointers_enable(char *str)
{
        if (!no_hash_pointers) {
                no_hash_pointers = true;

                pr_warn("**********************************************************\n");
                pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
                pr_warn("**                                                      **\n");
                pr_warn("** This system shows unhashed kernel memory addresses   **\n");
                pr_warn("** via the console, logs, and other interfaces. This    **\n");
                pr_warn("** might reduce the security of your system.            **\n");
                pr_warn("**                                                      **\n");
                pr_warn("** If you see this message and you are not debugging    **\n");
                pr_warn("** the kernel, report this immediately to your system   **\n");
                pr_warn("** administrator!                                       **\n");
                pr_warn("**                                                      **\n");
                pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
                pr_warn("**********************************************************\n");
        }

        return 0;
}
early_param("no_hash_pointers", no_hash_pointers_enable);

/*
 * Used for Rust formatting ('%pA').
 *
 * Only the prototype lives here; pointer() calls this function for '%pA'
 * when CONFIG_RUST is enabled.  The definition is not in this part of the
 * file -- presumably it is provided by the Rust side of the kernel (see
 * rust/kernel/print.rs); confirm before relying on that.
 */
char *rust_fmt_argument(char *buf, char *end, void *ptr);

/*
 * Show a '%p' thing.  A kernel extension is that the '%p' is followed
 * by an extra set of alphanumeric characters that are extended format
 * specifiers.
 *
 * Please update scripts/checkpatch.pl when adding/removing conversion
 * characters.  (Search for "check for vsprintf extension").
 *
 * Right now we handle:
 *
 * - 'S' For symbolic direct pointers (or function descriptors) with offset
 * - 's' For symbolic direct pointers (or function descriptors) without offset
 * - '[Ss]R' as above with __builtin_extract_return_addr() translation
 * - 'S[R]b' as above with module build ID (for use in backtraces)
 * - '[Ff]' %pf and %pF were obsoleted and later removed in favor of
 *            %ps and %pS. Be careful when re-using these specifiers.
 * - 'B' For backtraced symbolic direct pointers with offset
 * - 'Bb' as above with module build ID (for use in backtraces)
 * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref]
 * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201]
 * - 'b[l]' For a bitmap, the number of bits is determined by the field
 *       width which must be explicitly specified either as part of the
 *       format string '%32b[l]' or through '%*b[l]', [l] selects
 *       range-list format instead of hex format
 * - 'M' For a 6-byte MAC address, it prints the address in the
 *       usual colon-separated hex notation
 * - 'm' For a 6-byte MAC address, it prints the hex address without colons
 * - 'MF' For a 6-byte MAC FDDI address, it prints the address
 *       with a dash-separated hex notation
 * - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth)
 * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way
 *       IPv4 uses dot-separated decimal without leading 0's (1.2.3.4)
 *       IPv6 uses colon separated network-order 16 bit hex with leading 0's
 *       [S][pfs]
 *       Generic IPv4/IPv6 address (struct sockaddr *) that falls back to
 *       [4] or [6] and is able to print port [p], flowinfo [f], scope [s]
 * - 'i' [46] for 'raw' IPv4/IPv6 addresses
 *       IPv6 omits the colons (01020304...0f)
 *       IPv4 uses dot-separated decimal with leading 0's (010.123.045.006)
 *       [S][pfs]
 *       Generic IPv4/IPv6 address (struct sockaddr *) that falls back to
 *       [4] or [6] and is able to print port [p], flowinfo [f], scope [s]
 * - '[Ii][4S][hnbl]' IPv4 addresses in host, network, big or little endian order
 * - 'I[6S]c' for IPv6 addresses printed as specified by
 *       https://tools.ietf.org/html/rfc5952
 * - 'E[achnops]' For an escaped buffer, where rules are defined by combination
 *                of the following flags (see string_escape_mem() for the
 *                details):
 *                  a - ESCAPE_ANY
 *                  c - ESCAPE_SPECIAL
 *                  h - ESCAPE_HEX
 *                  n - ESCAPE_NULL
 *                  o - ESCAPE_OCTAL
 *                  p - ESCAPE_NP
 *                  s - ESCAPE_SPACE
 *                By default ESCAPE_ANY_NP is used.
 * - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form
 *       "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
 *       Options for %pU are:
 *         b big endian lower case hex (default)
 *         B big endian UPPER case hex
 *         l little endian lower case hex
 *         L little endian UPPER case hex
 *           big endian output byte order is:
 *             [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15]
 *           little endian output byte order is:
 *             [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15]
 * - 'V' For a struct va_format which contains a format string * and va_list *,
 *       call vsnprintf(->format, *->va_list).
 *       Implements a "recursive vsnprintf".
 *       Do not use this feature without some mechanism to verify the
 *       correctness of the format string and va_list arguments.
 * - 'K' For a kernel pointer that should be hidden from unprivileged users.
 *       Use only for procfs, sysfs and similar files, not printk(); please
 *       read the documentation (path below) first.
 * - 'NF' For a netdev_features_t
 * - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value.
 * - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with
 *            a certain separator (' ' by default):
 *              C colon
 *              D dash
 *              N no separator
 *            The maximum supported length is 64 bytes of the input. Consider
 *            to use print_hex_dump() for the larger input.
 * - 'a[pd]' For address types [p] phys_addr_t, [d] dma_addr_t and derivatives
 *           (default assumed to be phys_addr_t, passed by reference)
 * - 'd[234]' For a dentry name (optionally 2-4 last components)
 * - 'D[234]' Same as 'd' but for a struct file
 * - 'g' For block_device name (gendisk + partition number)
 * - 't[RT][dt][r][s]' For time and date as represented by:
 *      R    struct rtc_time
 *      T    time64_t
 * - 'C' For a clock, it prints the name (Common Clock Framework) or address
 *       (legacy clock framework) of the clock
 * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address
 *        (legacy clock framework) of the clock
 * - 'G' For flags to be printed as a collection of symbolic strings that would
 *       construct the specific value. Supported flags given by option:
 *       p page flags (see struct page) given as pointer to unsigned long
 *       g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t
 *       v vma flags (VM_*) given as pointer to unsigned long
 * - 'OF[fnpPcCF]'  For a device tree object
 *                  Without any optional arguments prints the full_name
 *                  f device node full_name
 *                  n device node name
 *                  p device node phandle
 *                  P device node path spec (name + @unit)
 *                  F device node flags
 *                  c major compatible string
 *                  C full compatible string
 * - 'fw[fP]'        For a firmware node (struct fwnode_handle) pointer
 *                Without an option prints the full name of the node
 *                f full name
 *                P node name, including a possible unit address
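 * - 'x' For printing the address unmodified. Equivalent to "%lx".
 *       Please read the documentation (path below) before using!
 * - 'e' For an error pointer (ERR_PTR); a pointer that is not an error
 *       pointer is treated as plain '%p'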
 * - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of
 *           bpf_trace_printk() where [ku] prefix specifies either kernel (k)
 *           or user (u) memory to probe, and:
 *              s a string, equivalent to "%s" on direct vsnprintf() use
 *
 * ** When making changes please also update:
 *        Documentation/core-api/printk-formats.rst
 *
 * Note: The default behaviour (unadorned %p) is to hash the address,
 * rendering it useful as a unique identifier.
 *
 * There is also a '%pA' format specifier, but it is only intended to be used
 * from Rust code to format core::fmt::Arguments. Do *not* use it from C.
 * See rust/kernel/print.rs for details.
 */
static noinline_for_stack
char *pointer(const char *fmt, char *buf, char *end, void *ptr,
              struct printf_spec spec)
{
        /*
         * fmt[0] is the first extension character following "%p".  Every
         * arm below returns directly, so the arms are independent of each
         * other; they are grouped by topic.  Anything without a dedicated
         * handler ends up in default_pointer().
         */
        switch (fmt[0]) {
        /* Symbols: 'S'/'s' resolve a function descriptor first, 'B' does not */
        case 'S':
        case 's':
                return symbol_string(buf, end,
                                     dereference_symbol_descriptor(ptr),
                                     spec, fmt);
        case 'B':
                return symbol_string(buf, end, ptr, spec, fmt);

        /* Plain pointer values */
        case 'K':
                return restricted_pointer(buf, end, ptr, spec);
        case 'x':
                return pointer_string(buf, end, ptr, spec);
        case 'e':
                /* %pe with a non-ERR_PTR gets treated as plain %p */
                if (IS_ERR(ptr))
                        return err_ptr(buf, end, ptr, spec);
                return default_pointer(buf, end, ptr, spec);

        /* Resources, raw buffers and bitmaps */
        case 'R':
        case 'r':
                return resource_string(buf, end, ptr, spec, fmt);
        case 'h':
                return hex_string(buf, end, ptr, spec, fmt);
        case 'b':
                if (fmt[1] == 'l')
                        return bitmap_list_string(buf, end, ptr, spec, fmt);
                return bitmap_string(buf, end, ptr, spec, fmt);
        case 'E':
                return escaped_string(buf, end, ptr, spec, fmt);
        case 'U':
                return uuid_string(buf, end, ptr, spec, fmt);
        case '4':
                return fourcc_string(buf, end, ptr, spec, fmt);
        case 'a':
                return address_val(buf, end, ptr, spec, fmt);

        /* Networking */
        case 'M':                        /* Colon separated: 00:01:02:03:04:05 */
        case 'm':                        /* Contiguous: 000102030405 */
                                        /* [mM]F (FDDI) */
                                        /* [mM]R (Reverse order; Bluetooth) */
                return mac_address_string(buf, end, ptr, spec, fmt);
        case 'I':                        /* Formatted IP supported
                                         * 4:        1.2.3.4
                                         * 6:        0001:0203:...:0708
                                         * 6c:        1::708 or 1::1.2.3.4
                                         */
        case 'i':                        /* Contiguous:
                                         * 4:        001.002.003.004
                                         * 6:   000102...0f
                                         */
                return ip_addr_string(buf, end, ptr, spec, fmt);
        case 'N':
                return netdev_bits(buf, end, ptr, spec, fmt);

        /* Filesystem and block layer objects */
        case 'd':
                return dentry_name(buf, end, ptr, spec, fmt);
        case 'D':
                return file_dentry_name(buf, end, ptr, spec, fmt);
#ifdef CONFIG_BLOCK
        case 'g':
                return bdev_name(buf, end, ptr, spec, fmt);
#endif

        /* Time, clocks and flag words */
        case 't':
                return time_and_date(buf, end, ptr, spec, fmt);
        case 'C':
                return clock(buf, end, ptr, spec, fmt);
        case 'G':
                return flags_string(buf, end, ptr, spec, fmt);

        /* Device tree and firmware nodes: the handler starts one char later */
        case 'O':
                return device_node_string(buf, end, ptr, spec, fmt + 1);
        case 'f':
                return fwnode_string(buf, end, ptr, spec, fmt + 1);

        /* Nested formatting */
        case 'V':
                return va_format(buf, end, ptr, spec, fmt);
        case 'A':
                if (IS_ENABLED(CONFIG_RUST))
                        return rust_fmt_argument(buf, end, ptr);
                WARN_ONCE(1, "Please remove %%pA from non-Rust code\n");
                return error_string(buf, end, "(%pA?)", spec);

        /* BPF/tracing: only the 's' sub-specifier is accepted */
        case 'u':
        case 'k':
                if (fmt[1] == 's')
                        return string(buf, end, ptr, spec);
                return error_string(buf, end, "(einval)", spec);

        default:
                return default_pointer(buf, end, ptr, spec);
        }
}

/*
 * Helper function to decode a printf style format.
 * Each call decodes one token from the format and returns the
 * number of characters read (or likely the delta where it wants
 * to go on the next call).
 * The decoded token is returned through @spec.
 *
 * 'h', 'l', or 'L' for integer fields
 * 'z' support added 23/7/1999 S.H.
 * 'z' changed to 'Z' --davidm 1/25/99
 * 'Z' changed to 'z' --adobriyan 2017-01-25
 * 't' added for ptrdiff_t
 *
 * @fmt: the format string
 * @spec: decoder state and result.  The fields written here are:
 *        type        - type of the token returned
 *        flags       - various flags such as +, -, # tokens..
 *        field_width - overwritten width
 *        base        - base of the number (octal, hex, ...)
 *        precision   - precision of a number
 *        The length qualifier of a number (long, size_t, ...) is not
 *        stored separately; it is folded into the returned type.
 *
 * A '*' width or precision makes the function return early with type
 * FORMAT_TYPE_WIDTH or FORMAT_TYPE_PRECISION.  When it is called again with
 * that type still set in @spec, decoding resumes at the precision or
 * qualifier step respectively, using the width/precision value found in
 * @spec at that point.
 */
static noinline_for_stack
int format_decode(const char *fmt, struct printf_spec *spec)
{
        const char *start = fmt;
        char qualifier;

        /* we finished early by reading the field width */
        if (spec->type == FORMAT_TYPE_WIDTH) {
                /* a negative '*' width means left-justify with its magnitude */
                if (spec->field_width < 0) {
                        spec->field_width = -spec->field_width;
                        spec->flags |= LEFT;
                }
                spec->type = FORMAT_TYPE_NONE;
                goto precision;
        }

        /* we finished early by reading the precision */
        if (spec->type == FORMAT_TYPE_PRECISION) {
                /* a negative '*' precision is taken as 0 */
                if (spec->precision < 0)
                        spec->precision = 0;

                spec->type = FORMAT_TYPE_NONE;
                goto qualifier;
        }

        /* By default */
        spec->type = FORMAT_TYPE_NONE;

        /* scan the literal text up to the next '%' or the end of string */
        for (; *fmt ; ++fmt) {
                if (*fmt == '%')
                        break;
        }

        /* Return the current non-format string */
        if (fmt != start || !*fmt)
                return fmt - start;

        /* Process flags */
        spec->flags = 0;

        while (1) { /* this also skips first '%' */
                bool found = true;

                ++fmt;

                switch (*fmt) {
                case '-': spec->flags |= LEFT;    break;
                case '+': spec->flags |= PLUS;    break;
                case ' ': spec->flags |= SPACE;   break;
                case '#': spec->flags |= SPECIAL; break;
                case '0': spec->flags |= ZEROPAD; break;
                default:  found = false;
                }

                if (!found)
                        break;
        }

        /* get field width */
        spec->field_width = -1;

        if (isdigit(*fmt))
                spec->field_width = skip_atoi(&fmt);
        else if (*fmt == '*') {
                /* it's the next argument */
                spec->type = FORMAT_TYPE_WIDTH;
                return ++fmt - start;
        }

precision:
        /* get the precision */
        spec->precision = -1;
        if (*fmt == '.') {
                ++fmt;
                if (isdigit(*fmt)) {
                        spec->precision = skip_atoi(&fmt);
                        if (spec->precision < 0)
                                spec->precision = 0;
                } else if (*fmt == '*') {
                        /* it's the next argument */
                        spec->type = FORMAT_TYPE_PRECISION;
                        return ++fmt - start;
                }
        }

qualifier:
        /* get the conversion qualifier */
        qualifier = 0;
        if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
            *fmt == 'z' || *fmt == 't') {
                qualifier = *fmt++;
                /* a doubled qualifier: "ll" is recorded as 'L', "hh" as 'H' */
                if (unlikely(qualifier == *fmt)) {
                        if (qualifier == 'l') {
                                qualifier = 'L';
                                ++fmt;
                        } else if (qualifier == 'h') {
                                qualifier = 'H';
                                ++fmt;
                        }
                }
        }

        /* default base */
        spec->base = 10;
        switch (*fmt) {
        case 'c':
                spec->type = FORMAT_TYPE_CHAR;
                return ++fmt - start;

        case 's':
                spec->type = FORMAT_TYPE_STR;
                return ++fmt - start;

        case 'p':
                spec->type = FORMAT_TYPE_PTR;
                return ++fmt - start;

        case '%':
                spec->type = FORMAT_TYPE_PERCENT_CHAR;
                return ++fmt - start;

        /* integer number formats - set up the flags and "break" */
        case 'o':
                spec->base = 8;
                break;

        case 'x':
                spec->flags |= SMALL;
                fallthrough;

        case 'X':
                spec->base = 16;
                break;

        case 'd':
        case 'i':
                spec->flags |= SIGN;
                break;
        case 'u':
                break;

        case 'n':
                /*
                 * Since %n poses a greater security risk than
                 * utility, treat it as any other invalid or
                 * unsupported format specifier.
                 */
                fallthrough;

        default:
                WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt);
                spec->type = FORMAT_TYPE_INVALID;
                /* the offending character itself is not consumed */
                return fmt - start;
        }

        /*
         * Map the qualifier to an integer type.  Where a signed and an
         * unsigned variant exist, the BUILD_BUG_ON()s check that the signed
         * one equals the unsigned one plus SIGN, so adding the SIGN flag
         * bit selects between the two.
         */
        if (qualifier == 'L')
                spec->type = FORMAT_TYPE_LONG_LONG;
        else if (qualifier == 'l') {
                BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG);
                spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN);
        } else if (qualifier == 'z') {
                spec->type = FORMAT_TYPE_SIZE_T;
        } else if (qualifier == 't') {
                spec->type = FORMAT_TYPE_PTRDIFF;
        } else if (qualifier == 'H') {
                BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE);
                spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN);
        } else if (qualifier == 'h') {
                BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT);
                spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN);
        } else {
                BUILD_BUG_ON(FORMAT_TYPE_UINT + SIGN != FORMAT_TYPE_INT);
                spec->type = FORMAT_TYPE_UINT + (spec->flags & SIGN);
        }

        return ++fmt - start;
}

/*
 * Store @width as the field width of @spec.
 *
 * The value is written and then read back; if the stored value differs
 * (presumably because the field is narrower than int -- confirm against
 * struct printf_spec), a one-time warning is emitted and the width is
 * clamped to [-FIELD_WIDTH_MAX, FIELD_WIDTH_MAX] instead.
 */
static void
set_field_width(struct printf_spec *spec, int width)
{
        spec->field_width = width;
        if (!WARN_ONCE(spec->field_width != width, "field width %d too large", width))
                return;

        spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX);
}

/*
 * Store @prec as the precision of @spec.
 *
 * The value is written and then read back; if the stored value differs
 * (presumably because the field is narrower than int -- confirm against
 * struct printf_spec), a one-time warning is emitted and the precision is
 * clamped to [0, PRECISION_MAX] instead.
 */
static void
set_precision(struct printf_spec *spec, int prec)
{
        spec->precision = prec;
        if (!WARN_ONCE(spec->precision != prec, "precision %d too large", prec))
                return;

        spec->precision = clamp(prec, 0, PRECISION_MAX);
}

/**
 * vsnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * This function generally follows C99 vsnprintf, but has some
 * extensions and a few limitations:
 *
 *  - ``%n`` is unsupported
 *  - ``%p*`` is handled by pointer()
 *
 * See pointer() or Documentation/core-api/printk-formats.rst for more
 * extensive description.
 *
 * **Please update the documentation in both places when making changes**
 *
 * The return value is the number of characters which would
 * be generated for the given input, excluding the trailing
 * '\0', as per ISO C99. If you want to have the exact
 * number of characters written into @buf as return value
 * (not including the trailing '\0'), use vscnprintf(). If the
 * return is greater than or equal to @size, the resulting
 * string is truncated.
 *
 * A @size larger than INT_MAX is rejected with a one-time warning and a
 * return value of 0. Processing stops at the first invalid or unsupported
 * conversion specifier; the output produced up to that point is kept.
 *
 * If you're not already dealing with a va_list consider using snprintf().
 */
int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
        unsigned long long num;
        char *str, *end;
        struct printf_spec spec = {0};

        /* Reject out-of-range values early.  Large positive sizes are
           used for unknown buffer sizes. */
        if (WARN_ON_ONCE(size > INT_MAX))
                return 0;

        /*
         * str is the output cursor.  It keeps advancing even once it has
         * passed end, so that str - buf is the would-be output length;
         * every store below is guarded by a str < end check.
         */
        str = buf;
        end = buf + size;

        /* Make sure end is always >= buf */
        if (end < buf) {
                end = ((void *)-1);
                size = end - buf;
        }

        while (*fmt) {
                const char *old_fmt = fmt;
                /* decode one token; spec carries state between calls */
                int read = format_decode(fmt, &spec);

                fmt += read;

                switch (spec.type) {
                case FORMAT_TYPE_NONE: {
                        /* literal text: copy what fits, count all of it */
                        int copy = read;
                        if (str < end) {
                                if (copy > end - str)
                                        copy = end - str;
                                memcpy(str, old_fmt, copy);
                        }
                        str += read;
                        break;
                }

                /*
                 * '*' width/precision: take the value from the argument
                 * list; the next format_decode() call resumes this
                 * conversion.
                 */
                case FORMAT_TYPE_WIDTH:
                        set_field_width(&spec, va_arg(args, int));
                        break;

                case FORMAT_TYPE_PRECISION:
                        set_precision(&spec, va_arg(args, int));
                        break;

                case FORMAT_TYPE_CHAR: {
                        char c;

                        /* right-justified: pad with spaces before the char */
                        if (!(spec.flags & LEFT)) {
                                while (--spec.field_width > 0) {
                                        if (str < end)
                                                *str = ' ';
                                        ++str;

                                }
                        }
                        c = (unsigned char) va_arg(args, int);
                        if (str < end)
                                *str = c;
                        ++str;
                        /*
                         * Left-justified: pad after the char.  After the
                         * loop above field_width is already <= 0, so this
                         * only pads in the LEFT case.
                         */
                        while (--spec.field_width > 0) {
                                if (str < end)
                                        *str = ' ';
                                ++str;
                        }
                        break;
                }

                case FORMAT_TYPE_STR:
                        str = string(str, end, va_arg(args, char *), spec);
                        break;

                case FORMAT_TYPE_PTR:
                        str = pointer(fmt, str, end, va_arg(args, void *),
                                      spec);
                        /* skip the alphanumeric extension chars after %p */
                        while (isalnum(*fmt))
                                fmt++;
                        break;

                case FORMAT_TYPE_PERCENT_CHAR:
                        if (str < end)
                                *str = '%';
                        ++str;
                        break;

                case FORMAT_TYPE_INVALID:
                        /*
                         * Presumably the arguments passed gcc's type
                         * checking, but there is no safe or sane way
                         * for us to continue parsing the format and
                         * fetching from the va_list; the remaining
                         * specifiers and arguments would be out of
                         * sync.
                         */
                        goto out;

                default:
                        /*
                         * Integer conversions: fetch the argument with the
                         * type recorded by format_decode().  Types narrower
                         * than int are fetched as int and cast back to
                         * their own width.
                         */
                        switch (spec.type) {
                        case FORMAT_TYPE_LONG_LONG:
                                num = va_arg(args, long long);
                                break;
                        case FORMAT_TYPE_ULONG:
                                num = va_arg(args, unsigned long);
                                break;
                        case FORMAT_TYPE_LONG:
                                num = va_arg(args, long);
                                break;
                        case FORMAT_TYPE_SIZE_T:
                                if (spec.flags & SIGN)
                                        num = va_arg(args, ssize_t);
                                else
                                        num = va_arg(args, size_t);
                                break;
                        case FORMAT_TYPE_PTRDIFF:
                                num = va_arg(args, ptrdiff_t);
                                break;
                        case FORMAT_TYPE_UBYTE:
                                num = (unsigned char) va_arg(args, int);
                                break;
                        case FORMAT_TYPE_BYTE:
                                num = (signed char) va_arg(args, int);
                                break;
                        case FORMAT_TYPE_USHORT:
                                num = (unsigned short) va_arg(args, int);
                                break;
                        case FORMAT_TYPE_SHORT:
                                num = (short) va_arg(args, int);
                                break;
                        case FORMAT_TYPE_INT:
                                num = (int) va_arg(args, int);
                                break;
                        default:
                                num = va_arg(args, unsigned int);
                        }

                        str = number(str, end, num, spec);
                }
        }

out:
        /*
         * Always terminate when there is room for at least one byte: at
         * the cursor if the output fit, otherwise over the last byte of
         * the buffer (the output is then truncated).
         */
        if (size > 0) {
                if (str < end)
                        *str = '\0';
                else
                        end[-1] = '\0';
        }

        /* the trailing null byte doesn't count towards the total */
        return str-buf;

}
EXPORT_SYMBOL(vsnprintf);

/**
 * vscnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * Unlike vsnprintf(), the return value is the number of characters that
 * were actually written into @buf, not including the trailing '\0': when
 * the output was truncated this is @size - 1. If @size is == 0 the
 * function returns 0 and does not format anything.
 *
 * If you're not already dealing with a va_list consider using scnprintf().
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
        int would_write;

        if (unlikely(size == 0))
                return 0;

        would_write = vsnprintf(buf, size, fmt, args);

        /* truncated: everything but the terminator's slot was used */
        if (unlikely(would_write >= size))
                return size - 1;

        return would_write;
}
EXPORT_SYMBOL(vscnprintf);

/**
 * snprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end to vsnprintf(). The return value is the number of
 * characters which would be generated for the given input, excluding the
 * trailing null, as per ISO C99.  If the return is greater than or equal
 * to @size, the resulting string is truncated.
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int snprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int len;

        va_start(ap, fmt);
        len = vsnprintf(buf, size, fmt, ap);
        va_end(ap);

        return len;
}
EXPORT_SYMBOL(snprintf);

/**
 * scnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end to vscnprintf(). The return value is the number of
 * characters written into @buf not including the trailing '\0'. If @size
 * is == 0 the function returns 0.
 */
int scnprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int written;

        va_start(ap, fmt);
        written = vscnprintf(buf, size, fmt, ap);
        va_end(ap);

        return written;
}
EXPORT_SYMBOL(scnprintf);

/**
 * vsprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * The function returns the number of characters written
 * into @buf. No real buffer size is known here: the output is
 * only bounded by INT_MAX, so use vsnprintf() or vscnprintf()
 * in order to avoid buffer overflows.
 *
 * If you're not already dealing with a va_list consider using sprintf().
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int vsprintf(char *buf, const char *fmt, va_list args)
{
        int len = vsnprintf(buf, INT_MAX, fmt, args);

        return len;
}
EXPORT_SYMBOL(vsprintf);

/**
 * sprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * The function returns the number of characters written
 * into @buf. No real buffer size is known here: the output is
 * only bounded by INT_MAX, so use snprintf() or scnprintf()
 * in order to avoid buffer overflows.
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int sprintf(char *buf, const char *fmt, ...)
{
        va_list ap;
        int len;

        va_start(ap, fmt);
        len = vsnprintf(buf, INT_MAX, fmt, ap);
        va_end(ap);

        return len;
}
EXPORT_SYMBOL(sprintf);

#ifdef CONFIG_BINARY_PRINTF
/*
 * bprintf service:
 * vbin_printf() - VA arguments to binary data
 * bstr_printf() - Binary data to text string
 */

/**
 * vbin_printf - Parse a format string and place args' binary value in a buffer
 * @bin_buf: The buffer to place args' binary value
 * @size: The size of the buffer(by words(32bits), not characters)
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * The format follows C99 vsnprintf, except %n is ignored, and its argument
 * is skipped.
 *
 * Integer-like arguments are stored in binary form. %s arguments are stored
 * as a copy of the string including its terminating NUL. %p arguments are
 * stored as the raw pointer value, except for alphanumeric extensions other
 * than 'S', 's', 'x', 'K' and 'e': those are rendered to text immediately
 * via pointer() and stored as a NUL-terminated string.
 *
 * The return value is the number of words(32bits) which would be generated for
 * the given input.
 *
 * NOTE:
 * If the return value is greater than @size, the resulting bin_buf is NOT
 * valid for bstr_printf().
 */
int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args)
{
        struct printf_spec spec = {0};
        char *str, *end;
        int width;

        /* Work in bytes: str is the write cursor, end is one past the buffer. */
        str = (char *)bin_buf;
        end = (char *)(bin_buf + size);

/*
 * save_arg(type): fetch the next variadic argument, store it at the
 * (aligned) write cursor if it still fits before 'end', and advance the
 * cursor by sizeof(type) whether or not it was stored. Evaluates to the
 * fetched value.
 *
 * 8-byte types are fetched as unsigned long long and written as two u32
 * halves, aligned only to sizeof(u32). Everything else is fetched as int
 * (the default argument promotion) and stored aligned to sizeof(type).
 */
#define save_arg(type)                                                        \
({                                                                        \
        unsigned long long value;                                        \
        if (sizeof(type) == 8) {                                        \
                unsigned long long val8;                                \
                str = PTR_ALIGN(str, sizeof(u32));                        \
                val8 = va_arg(args, unsigned long long);                \
                if (str + sizeof(type) <= end) {                        \
                        *(u32 *)str = *(u32 *)&val8;                        \
                        *(u32 *)(str + 4) = *((u32 *)&val8 + 1);        \
                }                                                        \
                value = val8;                                                \
        } else {                                                        \
                unsigned int val4;                                        \
                str = PTR_ALIGN(str, sizeof(type));                        \
                val4 = va_arg(args, int);                                \
                if (str + sizeof(type) <= end)                                \
                        *(typeof(type) *)str = (type)(long)val4;        \
                value = (unsigned long long)val4;                        \
        }                                                                \
        str += sizeof(type);                                                \
        value;                                                                \
})

        while (*fmt) {
                int read = format_decode(fmt, &spec);

                fmt += read;

                switch (spec.type) {
                /* Literal text and "%%" consume no argument: nothing to store. */
                case FORMAT_TYPE_NONE:
                case FORMAT_TYPE_PERCENT_CHAR:
                        break;
                case FORMAT_TYPE_INVALID:
                        goto out;

                /* '*' width/precision: the int argument itself is stored. */
                case FORMAT_TYPE_WIDTH:
                case FORMAT_TYPE_PRECISION:
                        width = (int)save_arg(int);
                        /* Pointers may require the width */
                        if (*fmt == 'p')
                                set_field_width(&spec, width);
                        break;

                case FORMAT_TYPE_CHAR:
                        save_arg(char);
                        break;

                case FORMAT_TYPE_STR: {
                        const char *save_str = va_arg(args, char *);
                        const char *err_msg;
                        size_t len;

                        /*
                         * If check_pointer_msg() returns a message, that
                         * message is stored in place of the argument string.
                         */
                        err_msg = check_pointer_msg(save_str);
                        if (err_msg)
                                save_str = err_msg;

                        /* Copy the text including its NUL; count it even if it did not fit. */
                        len = strlen(save_str) + 1;
                        if (str + len < end)
                                memcpy(str, save_str, len);
                        str += len;
                        break;
                }

                case FORMAT_TYPE_PTR:
                        /* Dereferenced pointers must be done now */
                        switch (*fmt) {
                        /* Dereference of functions is still OK */
                        case 'S':
                        case 's':
                        case 'x':
                        case 'K':
                        case 'e':
                                save_arg(void *);
                                break;
                        default:
                                /* Plain %p (no alphanumeric extension): store the raw pointer. */
                                if (!isalnum(*fmt)) {
                                        save_arg(void *);
                                        break;
                                }
                                /*
                                 * Any other extension: render the text right
                                 * now and store it as a NUL-terminated string.
                                 */
                                str = pointer(fmt, str, end, va_arg(args, void *),
                                              spec);
                                /*
                                 * NOTE(review): with size == 0, end equals
                                 * bin_buf and end[-1] addresses the byte before
                                 * the buffer - confirm callers never pass 0.
                                 */
                                if (str + 1 < end)
                                        *str++ = '\0';
                                else
                                        end[-1] = '\0'; /* Must be nul terminated */
                        }
                        /* skip all alphanumeric pointer suffixes */
                        while (isalnum(*fmt))
                                fmt++;
                        break;

                /* Remaining types are integers: store with the width of the C type. */
                default:
                        switch (spec.type) {

                        case FORMAT_TYPE_LONG_LONG:
                                save_arg(long long);
                                break;
                        case FORMAT_TYPE_ULONG:
                        case FORMAT_TYPE_LONG:
                                save_arg(unsigned long);
                                break;
                        case FORMAT_TYPE_SIZE_T:
                                save_arg(size_t);
                                break;
                        case FORMAT_TYPE_PTRDIFF:
                                save_arg(ptrdiff_t);
                                break;
                        case FORMAT_TYPE_UBYTE:
                        case FORMAT_TYPE_BYTE:
                                save_arg(char);
                                break;
                        case FORMAT_TYPE_USHORT:
                        case FORMAT_TYPE_SHORT:
                                save_arg(short);
                                break;
                        default:
                                save_arg(int);
                        }
                }
        }

out:
        /* Round the byte cursor up to a whole u32 and report the word count. */
        return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf;
#undef save_arg
}
EXPORT_SYMBOL_GPL(vbin_printf);

/**
 * bstr_printf - Format a string from binary arguments and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @bin_buf: Binary arguments for the format string
 *
 * This function works like C99 vsnprintf, but the difference is that
 * vsnprintf gets its arguments from the stack, while bstr_printf gets them
 * from @bin_buf, a binary buffer that was generated by vbin_printf.
 *
 * The format follows C99 vsnprintf, but has some extensions:
 *  see vsnprintf comment for details.
 *
 * The return value is the number of characters which would
 * be generated for the given input, excluding the trailing
 * '\0', as per ISO C99. If you want to have the exact
 * number of characters written into @buf as return value
 * (not including the trailing '\0'), use vscnprintf(). If the
 * return is greater than or equal to @size, the resulting
 * string is truncated.
 *
 * If @size is larger than INT_MAX the WARN_ON_ONCE() check fires and 0 is
 * returned without touching @buf.
 */
int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
{
        struct printf_spec spec = {0};
        char *str, *end;
        const char *args = (const char *)bin_buf;

        if (WARN_ON_ONCE(size > INT_MAX))
                return 0;

        /* str is the output cursor, end is one past the output buffer. */
        str = buf;
        end = buf + size;

/*
 * get_arg(type): read the next stored argument from the binary buffer and
 * advance the read cursor past it. This mirrors how vbin_printf stored it:
 * 8-byte values are read as two u32 halves aligned to sizeof(u32), anything
 * else is read directly at sizeof(type) alignment.
 */
#define get_arg(type)                                                        \
({                                                                        \
        typeof(type) value;                                                \
        if (sizeof(type) == 8) {                                        \
                args = PTR_ALIGN(args, sizeof(u32));                        \
                *(u32 *)&value = *(u32 *)args;                                \
                *((u32 *)&value + 1) = *(u32 *)(args + 4);                \
        } else {                                                        \
                args = PTR_ALIGN(args, sizeof(type));                        \
                value = *(typeof(type) *)args;                                \
        }                                                                \
        args += sizeof(type);                                                \
        value;                                                                \
})

        /* Make sure end is always >= buf */
        if (end < buf) {
                end = ((void *)-1);
                size = end - buf;
        }

        while (*fmt) {
                const char *old_fmt = fmt;
                int read = format_decode(fmt, &spec);

                fmt += read;

                switch (spec.type) {
                case FORMAT_TYPE_NONE: {
                        /* Literal text: copy what fits, but count all of it. */
                        int copy = read;
                        if (str < end) {
                                if (copy > end - str)
                                        copy = end - str;
                                memcpy(str, old_fmt, copy);
                        }
                        str += read;
                        break;
                }

                case FORMAT_TYPE_WIDTH:
                        set_field_width(&spec, get_arg(int));
                        break;

                case FORMAT_TYPE_PRECISION:
                        set_precision(&spec, get_arg(int));
                        break;

                case FORMAT_TYPE_CHAR: {
                        char c;

                        /* Right-justified: emit the padding before the character. */
                        if (!(spec.flags & LEFT)) {
                                while (--spec.field_width > 0) {
                                        if (str < end)
                                                *str = ' ';
                                        ++str;
                                }
                        }
                        c = (unsigned char) get_arg(char);
                        if (str < end)
                                *str = c;
                        ++str;
                        /* Left-justified: emit whatever padding is still owed. */
                        while (--spec.field_width > 0) {
                                if (str < end)
                                        *str = ' ';
                                ++str;
                        }
                        break;
                }

                case FORMAT_TYPE_STR: {
                        /* The string text itself lives in the binary buffer. */
                        const char *str_arg = args;
                        args += strlen(str_arg) + 1;
                        str = string(str, end, (char *)str_arg, spec);
                        break;
                }

                case FORMAT_TYPE_PTR: {
                        bool process = false;
                        int copy, len;
                        /* Non function dereferences were already done */
                        switch (*fmt) {
                        case 'S':
                        case 's':
                        case 'x':
                        case 'K':
                        case 'e':
                                process = true;
                                break;
                        default:
                                if (!isalnum(*fmt)) {
                                        process = true;
                                        break;
                                }
                                /*
                                 * Pointer dereference was already processed:
                                 * the binary buffer holds the rendered text.
                                 *
                                 * The stored string must be consumed and
                                 * counted even when the output is already
                                 * full. Otherwise every later argument is
                                 * decoded from the wrong offset and the
                                 * returned would-be length comes up short.
                                 * Only the copy itself depends on free space.
                                 */
                                len = copy = strlen(args);
                                if (str < end) {
                                        if (copy > end - str)
                                                copy = end - str;
                                        memcpy(str, args, copy);
                                }
                                str += len;
                                args += len + 1;
                        }
                        if (process)
                                str = pointer(fmt, str, end, get_arg(void *), spec);

                        /* skip all alphanumeric pointer suffixes */
                        while (isalnum(*fmt))
                                fmt++;
                        break;
                }

                case FORMAT_TYPE_PERCENT_CHAR:
                        if (str < end)
                                *str = '%';
                        ++str;
                        break;

                case FORMAT_TYPE_INVALID:
                        goto out;

                default: {
                        unsigned long long num;

                        /* Integer conversions: read back with the stored C type. */
                        switch (spec.type) {

                        case FORMAT_TYPE_LONG_LONG:
                                num = get_arg(long long);
                                break;
                        case FORMAT_TYPE_ULONG:
                        case FORMAT_TYPE_LONG:
                                num = get_arg(unsigned long);
                                break;
                        case FORMAT_TYPE_SIZE_T:
                                num = get_arg(size_t);
                                break;
                        case FORMAT_TYPE_PTRDIFF:
                                num = get_arg(ptrdiff_t);
                                break;
                        case FORMAT_TYPE_UBYTE:
                                num = get_arg(unsigned char);
                                break;
                        case FORMAT_TYPE_BYTE:
                                num = get_arg(signed char);
                                break;
                        case FORMAT_TYPE_USHORT:
                                num = get_arg(unsigned short);
                                break;
                        case FORMAT_TYPE_SHORT:
                                num = get_arg(short);
                                break;
                        case FORMAT_TYPE_UINT:
                                num = get_arg(unsigned int);
                                break;
                        default:
                                num = get_arg(int);
                        }

                        str = number(str, end, num, spec);
                } /* default: */
                } /* switch(spec.type) */
        } /* while(*fmt) */

out:
        /* Terminate inside the buffer; on truncation overwrite the last byte. */
        if (size > 0) {
                if (str < end)
                        *str = '\0';
                else
                        end[-1] = '\0';
        }

#undef get_arg

        /* the trailing null byte doesn't count towards the total */
        return str - buf;
}
EXPORT_SYMBOL_GPL(bstr_printf);

/**
 * bprintf - Parse a format string and place args' binary value in a buffer
 * @bin_buf: The buffer to place args' binary value
 * @size: The size of the buffer(by words(32bits), not characters)
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end for vbin_printf(): the arguments are gathered into a
 * va_list and passed on unchanged.
 *
 * Return: the value returned by vbin_printf(), a count of words(u32).
 */
int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int nwords;

        va_start(ap, fmt);
        nwords = vbin_printf(bin_buf, size, fmt, ap);
        va_end(ap);

        return nwords;
}
EXPORT_SYMBOL_GPL(bprintf);

#endif /* CONFIG_BINARY_PRINTF */

/**
 * vsscanf - Unformat a buffer into a list of arguments
 * @buf:        input buffer
 * @fmt:        format of buffer
 * @args:        arguments
 *
 * Walks @fmt and @buf in parallel. White space in @fmt matches any amount of
 * white space in @buf, other literal characters must match exactly, and each
 * '%' conversion stores its result through the next pointer in @args.
 * Parsing stops at the first mismatch, invalid conversion or end of input.
 *
 * Return: the number of conversions that were stored. %n, "%%" and skipped
 * "%*" conversions are not counted.
 */
int vsscanf(const char *buf, const char *fmt, va_list args)
{
        const char *str = buf;
        char *next;
        char digit;
        int num = 0;
        u8 qualifier;
        unsigned int base;
        union {
                long long s;
                unsigned long long u;
        } val;
        s16 field_width;
        bool is_sign;

        while (*fmt) {
                /* skip any white space in format */
                /* white space in format matches any amount of
                 * white space, including none, in the input.
                 */
                if (isspace(*fmt)) {
                        fmt = skip_spaces(++fmt);
                        str = skip_spaces(str);
                }

                /* anything that is not a conversion must match exactly */
                if (*fmt != '%' && *fmt) {
                        if (*fmt++ != *str++)
                                break;
                        continue;
                }

                /* format ended (possibly after trailing white space) */
                if (!*fmt)
                        break;
                ++fmt;

                /* skip this conversion.
                 * advance both strings to next white space
                 */
                if (*fmt == '*') {
                        if (!*str)
                                break;
                        while (!isspace(*fmt) && *fmt != '%' && *fmt) {
                                /* '%*[' not yet supported, invalid format */
                                if (*fmt == '[')
                                        return num;
                                fmt++;
                        }
                        while (!isspace(*str) && *str)
                                str++;
                        continue;
                }

                /* get field width */
                /* -1 means "no width given"; an explicit width must be > 0 */
                field_width = -1;
                if (isdigit(*fmt)) {
                        field_width = skip_atoi(&fmt);
                        if (field_width <= 0)
                                break;
                }

                /* get conversion qualifier */
                /*
                 * qualifier is a u8, so -1 is stored as 255; that matches none
                 * of the cases in the final switch and selects the plain
                 * int / unsigned int store. A doubled 'hh' is recorded as 'H'
                 * and a doubled 'll' as 'L'.
                 */
                qualifier = -1;
                if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
                    *fmt == 'z') {
                        qualifier = *fmt++;
                        if (unlikely(qualifier == *fmt)) {
                                if (qualifier == 'h') {
                                        qualifier = 'H';
                                        fmt++;
                                } else if (qualifier == 'l') {
                                        qualifier = 'L';
                                        fmt++;
                                }
                        }
                }

                if (!*fmt)
                        break;

                /* %n is handled before the end-of-input check and is not counted */
                if (*fmt == 'n') {
                        /* return number of characters read so far */
                        *va_arg(args, int *) = str - buf;
                        ++fmt;
                        continue;
                }

                /* every remaining conversion needs at least one input character */
                if (!*str)
                        break;

                base = 10;
                is_sign = false;

                switch (*fmt++) {
                case 'c':
                {
                        /*
                         * Copies field_width characters (one by default), or
                         * fewer if the input ends first. No NUL terminator is
                         * written and leading white space is not skipped.
                         */
                        char *s = (char *)va_arg(args, char*);
                        if (field_width == -1)
                                field_width = 1;
                        do {
                                *s++ = *str++;
                        } while (--field_width > 0 && *str);
                        num++;
                }
                continue;
                case 's':
                {
                        /* without an explicit width, at most SHRT_MAX characters are copied */
                        char *s = (char *)va_arg(args, char *);
                        if (field_width == -1)
                                field_width = SHRT_MAX;
                        /* first, skip leading white space in buffer */
                        str = skip_spaces(str);

                        /* now copy until next white space */
                        while (*str && !isspace(*str) && field_width--)
                                *s++ = *str++;
                        *s = '\0';
                        num++;
                }
                continue;
                /*
                 * Warning: This implementation of the '[' conversion specifier
                 * deviates from its glibc counterpart in the following ways:
                 * (1) It does NOT support ranges i.e. '-' is NOT a special
                 *     character
                 * (2) It cannot match the closing bracket ']' itself
                 * (3) A field width is required
                 * (4) '%*[' (discard matching input) is currently not supported
                 *
                 * Example usage:
                 * ret = sscanf("00:0a:95","%2[^:]:%2[^:]:%2[^:]",
                 *                buf1, buf2, buf3);
                 * if (ret < 3)
                 *    // etc..
                 */
                case '[':
                {
                        char *s = (char *)va_arg(args, char *);
                        /* one bit per possible byte value: set bit == accepted character */
                        DECLARE_BITMAP(set, 256) = {0};
                        unsigned int len = 0;
                        bool negate = (*fmt == '^');

                        /* field width is required */
                        if (field_width == -1)
                                return num;

                        if (negate)
                                ++fmt;

                        for ( ; *fmt && *fmt != ']'; ++fmt, ++len)
                                __set_bit((u8)*fmt, set);

                        /* no ']' or no character set found */
                        if (!*fmt || !len)
                                return num;
                        ++fmt;

                        if (negate) {
                                bitmap_complement(set, set, 256);
                                /* exclude null '\0' byte */
                                __clear_bit(0, set);
                        }

                        /* match must be non-empty */
                        if (!test_bit((u8)*str, set))
                                return num;

                        while (test_bit((u8)*str, set) && field_width--)
                                *s++ = *str++;
                        *s = '\0';
                        ++num;
                }
                continue;
                /* integer conversions only select base/signedness here and fall out of the switch */
                case 'o':
                        base = 8;
                        break;
                case 'x':
                case 'X':
                        base = 16;
                        break;
                case 'i':
                        base = 0;
                        fallthrough;
                case 'd':
                        is_sign = true;
                        fallthrough;
                case 'u':
                        break;
                case '%':
                        /* looking for '%' in str */
                        if (*str++ != '%')
                                return num;
                        continue;
                default:
                        /* invalid format; stop here */
                        return num;
                }

                /* have some sort of integer conversion.
                 * first, skip white space in buffer.
                 */
                str = skip_spaces(str);

                /*
                 * 'digit' is the character that has to be a valid digit for
                 * the conversion to start; for a signed conversion with a
                 * leading '-', that is the character after the sign. A width
                 * of 1 leaves no room for a digit after the sign.
                 */
                digit = *str;
                if (is_sign && digit == '-') {
                        if (field_width == 1)
                                break;

                        digit = *(str + 1);
                }

                /* base 0 (%i) only requires a decimal digit at this point */
                if (!digit
                    || (base == 16 && !isxdigit(digit))
                    || (base == 10 && !isdigit(digit))
                    || (base == 8 && !isodigit(digit))
                    || (base == 0 && !isdigit(digit)))
                        break;

                /* no explicit width: pass INT_MAX as the character limit */
                if (is_sign)
                        val.s = simple_strntoll(str, &next, base,
                                                field_width >= 0 ? field_width : INT_MAX);
                else
                        val.u = simple_strntoull(str, &next, base,
                                                 field_width >= 0 ? field_width : INT_MAX);

                /* store through a pointer of the width selected by the qualifier */
                switch (qualifier) {
                case 'H':        /* that's 'hh' in format */
                        if (is_sign)
                                *va_arg(args, signed char *) = val.s;
                        else
                                *va_arg(args, unsigned char *) = val.u;
                        break;
                case 'h':
                        if (is_sign)
                                *va_arg(args, short *) = val.s;
                        else
                                *va_arg(args, unsigned short *) = val.u;
                        break;
                case 'l':
                        if (is_sign)
                                *va_arg(args, long *) = val.s;
                        else
                                *va_arg(args, unsigned long *) = val.u;
                        break;
                case 'L':
                        if (is_sign)
                                *va_arg(args, long long *) = val.s;
                        else
                                *va_arg(args, unsigned long long *) = val.u;
                        break;
                case 'z':
                        *va_arg(args, size_t *) = val.u;
                        break;
                default:
                        if (is_sign)
                                *va_arg(args, int *) = val.s;
                        else
                                *va_arg(args, unsigned int *) = val.u;
                        break;
                }
                num++;

                /* continue scanning from where the number parser stopped */
                if (!next)
                        break;
                str = next;
        }

        return num;
}
EXPORT_SYMBOL(vsscanf);

/**
 * sscanf - Unformat a buffer into a list of arguments
 * @buf:        input buffer
 * @fmt:        formatting of buffer
 * @...:        resulting arguments
 *
 * Variadic front end for vsscanf(): the result pointers are gathered into a
 * va_list and passed on unchanged.
 *
 * Return: the value returned by vsscanf().
 */
int sscanf(const char *buf, const char *fmt, ...)
{
        va_list ap;
        int converted;

        va_start(ap, fmt);
        converted = vsscanf(buf, fmt, ap);
        va_end(ap);

        return converted;
}
EXPORT_SYMBOL(sscanf);












































































































































































































































































































































































































































































































































































































































































































    2 






    2 

    2 
    1 

    2 
































    2 










    1 















    1 











    1 

    1 













































































































































































































































































































































































































































































































































































































































































































   12 







   11 





































































































   12 













































    2 















    2 

















    4 



































    9 






    8 


    3 


    7 

























































    7 





    7 








































    1 




    1 

    1 












































































    5 






    5 


    1 


    4 











































    2 






    2 




















































































































































































    1 





    1 















































































    1 




    1 

































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
// SPDX-License-Identifier: GPL-2.0
/*
 * Implementation of the diskquota system for the LINUX operating system. QUOTA
 * is implemented using the BSD system call interface as the means of
 * communication with the user level. This file contains the generic routines
 * called by the different filesystems on allocation of an inode or block.
 * These routines take care of the administration needed to have a consistent
 * diskquota tracking system. The ideas of both user and group quotas are based
 * on the Melbourne quota system as used on BSD derived systems. The internal
 * implementation is based on one of the several variants of the LINUX
 * inode-subsystem with added complexity of the diskquota system.
 *
 * Author:        Marco van Wieringen <mvw@planets.elm.net>
 *
 * Fixes:   Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96
 *
 *                Revised list management to avoid races
 *                -- Bill Hawes, <whawes@star.net>, 9/98
 *
 *                Fixed races in dquot_transfer(), dqget() and dquot_alloc_...().
 *                As the consequence the locking was moved from dquot_decr_...(),
 *                dquot_incr_...() to calling functions.
 *                invalidate_dquots() now writes modified dquots.
 *                Serialized quota_off() and quota_on() for mount point.
 *                Fixed a few bugs in grow_dquots().
 *                Fixed deadlock in write_dquot() - we no longer account quotas on
 *                quota files
 *                remove_dquot_ref() moved to inode.c - it now traverses through inodes
 *                add_dquot_ref() restarts after blocking
 *                Added check for bogus uid and fixed check for group in quotactl.
 *                Jan Kara, <jack@suse.cz>, sponsored by SuSE CR, 10-11/99
 *
 *                Used struct list_head instead of own list struct
 *                Invalidation of referenced dquots is no longer possible
 *                Improved free_dquots list management
 *                Quota and i_blocks are now updated in one place to avoid races
 *                Warnings are now delayed so we won't block in critical section
 *                Write updated not to require dquot lock
 *                Jan Kara, <jack@suse.cz>, 9/2000
 *
 *                Added dynamic quota structure allocation
 *                Jan Kara <jack@suse.cz> 12/2000
 *
 *                Rewritten quota interface. Implemented new quota format and
 *                formats registering.
 *                Jan Kara, <jack@suse.cz>, 2001,2002
 *
 *                New SMP locking.
 *                Jan Kara, <jack@suse.cz>, 10/2002
 *
 *                Added journalled quota support, fix lock inversion problems
 *                Jan Kara, <jack@suse.cz>, 2003,2004
 *
 * (C) Copyright 1994 - 1997 Marco van Wieringen
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/mm.h>
#include <linux/time.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/tty.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/kmod.h>
#include <linux/namei.h>
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include "../internal.h" /* ugh */

#include <linux/uaccess.h>

/*
 * There are five quota SMP locks:
 * * dq_list_lock protects all lists with quotas and quota formats.
 * * dquot->dq_dqb_lock protects data from dq_dqb
 * * inode->i_lock protects inode->i_blocks, i_bytes and also guards
 *   consistency of dquot->dq_dqb with inode->i_blocks, i_bytes so that
 *   dquot_transfer() can stabilize amount it transfers
 * * dq_data_lock protects mem_dqinfo structures and modifications of dquot
 *   pointers in the inode
 * * dq_state_lock protects modifications of quota state (on quotaon and
 *   quotaoff) and readers who care about latest values take it as well.
 *
 * The spinlock ordering is hence:
 *   dq_data_lock > dq_list_lock > i_lock > dquot->dq_dqb_lock,
 *   dq_list_lock > dq_state_lock
 *
 * Note that some things (e.g. sb pointer, type, id) don't change during
 * the life of the dquot structure and so needn't be protected by a lock
 *
 * Operation accessing dquots via inode pointers are protected by dquot_srcu.
 * Operation of reading pointer needs srcu_read_lock(&dquot_srcu), and
 * synchronize_srcu(&dquot_srcu) is called after clearing pointers from
 * inode and before dropping dquot references to avoid use of dquots after
 * they are freed. dq_data_lock is used to serialize the pointer setting and
 * clearing operations.
 * Special care needs to be taken about S_NOQUOTA inode flag (marking that
 * inode is a quota file). Functions adding pointers from inode to dquots have
 * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they
 * have to do all pointer modifications before dropping dq_data_lock. This makes
 * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
 * then drops all pointers to dquots from an inode.
 *
 * Each dquot has its dq_lock mutex.  Dquot is locked when it is being read to
 * memory (or space for it is being allocated) on the first dqget(), when it is
 * being written out, and when it is being released on the last dqput(). The
 * allocation and release operations are serialized by the dq_lock and by
 * checking the use count in dquot_release().
 *
 * Lock ordering (including related VFS locks) is the following:
 *   s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_sem
 */

/* Protects all lists with quotas and quota formats */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
/* Protects modifications of quota state (quotaon and quotaoff) */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
/*
 * Protects mem_dqinfo structures and modifications of dquot pointers in the
 * inode. Exported, so it is also available outside this file.
 */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
EXPORT_SYMBOL(dq_data_lock);
/* Protects operations accessing dquots via inode pointers */
DEFINE_STATIC_SRCU(dquot_srcu);

/* invalidate_dquots() sleeps here until a dquot's use count drops to 1 */
static DECLARE_WAIT_QUEUE_HEAD(dquot_ref_wq);

/*
 * Log a quota error message, subject to printk rate limiting.
 *
 * @sb:   superblock the error relates to; its s_id is printed as the device
 * @func: name of the reporting function, printed in front of the message
 * @fmt:  printf-style format of the message, followed by its arguments
 */
void __quota_error(struct super_block *sb, const char *func,
                   const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        /* Stay silent once the rate limit has been reached */
        if (!printk_ratelimit())
                return;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
               sb->s_id, func, &vaf);
        va_end(args);
}
EXPORT_SYMBOL(__quota_error);

/* Quota type names; only compiled in for debug and warning code */
#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
static char *quotatypes[] = INITQFNAMES;
#endif
static struct quota_format_type *quota_formats;        /* List of registered formats */
/*
 * Table mapping quota format ids to module names. find_quota_format() walks
 * it until an entry with qm_fmt_id == 0 to find the module to request.
 */
static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;

/* SLAB cache for dquot structures */
static struct kmem_cache *dquot_cachep;

/*
 * Add @fmt to the head of the list of registered quota formats. The list is
 * modified under dq_list_lock. Always returns 0.
 */
int register_quota_format(struct quota_format_type *fmt)
{
        spin_lock(&dq_list_lock);
        /* Link the new format in front of the current list head */
        fmt->qf_next = quota_formats;
        quota_formats = fmt;
        spin_unlock(&dq_list_lock);
        return 0;
}
EXPORT_SYMBOL(register_quota_format);

/*
 * Remove @fmt from the list of registered quota formats. Nothing happens
 * when @fmt is not on the list.
 */
void unregister_quota_format(struct quota_format_type *fmt)
{
        struct quota_format_type **link = &quota_formats;

        spin_lock(&dq_list_lock);
        /* Follow the links until one points at @fmt or the list ends */
        while (*link && *link != fmt)
                link = &(*link)->qf_next;
        /* Unlink by making the predecessor's link skip over the entry */
        if (*link)
                *link = (*link)->qf_next;
        spin_unlock(&dq_list_lock);
}
EXPORT_SYMBOL(unregister_quota_format);

/*
 * Look up the registered quota format with id @id and take a reference on
 * its owning module.
 *
 * When the format is not registered, or the module reference cannot be taken,
 * the module listed for @id in module_names[] is requested and the lookup is
 * repeated once.
 *
 * Returns the format with a module reference held, or NULL when no such
 * format could be found or loaded.
 */
static struct quota_format_type *find_quota_format(int id)
{
        struct quota_format_type *fmt;
        int idx;

        spin_lock(&dq_list_lock);
        fmt = quota_formats;
        while (fmt && fmt->qf_fmt_id != id)
                fmt = fmt->qf_next;
        /* Fast path: format already registered and its module pinned */
        if (fmt && try_module_get(fmt->qf_owner))
                goto out_unlock;
        /* request_module() must not be called with the spinlock held */
        spin_unlock(&dq_list_lock);

        /* The module table ends with an entry whose qm_fmt_id is 0 */
        idx = 0;
        while (module_names[idx].qm_fmt_id &&
               module_names[idx].qm_fmt_id != id)
                idx++;
        if (!module_names[idx].qm_fmt_id)
                return NULL;
        if (request_module(module_names[idx].qm_mod_name))
                return NULL;

        /* The module load may have registered the format; look again */
        spin_lock(&dq_list_lock);
        fmt = quota_formats;
        while (fmt && fmt->qf_fmt_id != id)
                fmt = fmt->qf_next;
        if (fmt && !try_module_get(fmt->qf_owner))
                fmt = NULL;
out_unlock:
        spin_unlock(&dq_list_lock);
        return fmt;
}

/*
 * Drop a reference on the module owning @fmt (find_quota_format() takes such
 * a reference with try_module_get()).
 */
static void put_quota_format(struct quota_format_type *fmt)
{
        module_put(fmt->qf_owner);
}

/*
 * Dquot List Management:
 * The quota code uses five lists for dquot management: the inuse_list,
 * releasing_dquots, free_dquots, dqi_dirty_list, and dquot_hash[] array.
 * A single dquot structure may be on some of those lists, depending on
 * its current state.
 *
 * All dquots are placed to the end of inuse_list when first created, and this
 * list is used for invalidate operation, which must look at every dquot.
 *
 * When the last reference of a dquot is dropped, the dquot is added to
 * releasing_dquots. We'll then queue work item which will call
 * synchronize_srcu() and after that perform the final cleanup of all the
 * dquots on the list. Each cleaned up dquot is moved to free_dquots list.
 * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot
 * struct.
 *
 * Unused and cleaned up dquots are in the free_dquots list and this list is
 * searched whenever we need an available dquot. Dquots are removed from the
 * list as soon as they are used again and dqstats.free_dquots gives the number
 * of dquots on the list. When dquot is invalidated it's completely released
 * from memory.
 *
 * Dirty dquots are added to the dqi_dirty_list of quota_info when mark
 * dirtied, and this list is searched when writing dirty dquots back to
 * quota file. Note that some filesystems do dirty dquot tracking on their
 * own (e.g. in a journal) and thus don't use dqi_dirty_list.
 *
 * Dquots with a specific identity (device, type and id) are placed on
 * one of the dquot_hash[] hash chains. This provides an efficient search
 * mechanism to locate a specific dquot.
 */

/* Every dquot, appended when first created; walked by invalidation */
static LIST_HEAD(inuse_list);
/* Unused and cleaned up dquots, searched when an available dquot is needed */
static LIST_HEAD(free_dquots);
/* Dquots whose last reference was dropped and that await final cleanup */
static LIST_HEAD(releasing_dquots);
/* Shift and mask applied by hashfn() to produce a dquot_hash[] index */
static unsigned int dq_hash_bits, dq_hash_mask;
/* Hash chains of dquots, indexed by hashfn() */
static struct hlist_head *dquot_hash;

/* Quota statistics, updated through dqstats_inc() / dqstats_dec() */
struct dqstats dqstats;
EXPORT_SYMBOL(dqstats);

/* Forward declarations of static helpers used before their definition */
static qsize_t inode_get_rsv_space(struct inode *inode);
static qsize_t __inode_get_rsv_space(struct inode *inode);
static int __dquot_initialize(struct inode *inode, int type);

/* Delayed work doing the final cleanup of dquots on releasing_dquots */
static void quota_release_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(quota_release_work, quota_release_workfn);

/*
 * Compute the dquot_hash[] index for the dquot identified by superblock @sb
 * and quota id @qid.
 */
static inline unsigned int
hashfn(const struct super_block *sb, struct kqid qid)
{
        unsigned long hash = (unsigned long)sb >> L1_CACHE_SHIFT;
        int type = qid.type;

        /* Mix in the numeric id (as seen from init_user_ns) and the type */
        hash ^= from_kqid(&init_user_ns, qid);
        hash *= MAXQUOTAS - type;

        /* Fold the upper bits down before applying dq_hash_mask */
        return (hash + (hash >> dq_hash_bits)) & dq_hash_mask;
}

/*
 * Following list functions expect dq_list_lock to be held
 */
/* Put @dquot at the head of the hash chain selected by its sb and id */
static inline void insert_dquot_hash(struct dquot *dquot)
{
        unsigned int bucket = hashfn(dquot->dq_sb, dquot->dq_id);

        hlist_add_head(&dquot->dq_hash, &dquot_hash[bucket]);
}

/* Take @dquot off its hash chain and reinitialize its dq_hash node */
static inline void remove_dquot_hash(struct dquot *dquot)
{
        hlist_del_init(&dquot->dq_hash);
}

static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
                                struct kqid qid)
{
        struct dquot *dquot;

        hlist_for_each_entry(dquot, dquot_hash+hashent, dq_hash)
                if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid))
                        return dquot;

        return NULL;
}

/*
 * Add a dquot to the tail of the free list and count it in the
 * DQST_FREE_DQUOTS statistic.
 */
static inline void put_dquot_last(struct dquot *dquot)
{
        list_add_tail(&dquot->dq_free, &free_dquots);
        dqstats_inc(DQST_FREE_DQUOTS);
}

/*
 * Queue @dquot at the tail of releasing_dquots, using its dq_free list head,
 * and flag it with DQ_RELEASING_B. remove_free_dquot() tests that flag to
 * decide whether the free-dquot statistic has to be adjusted.
 */
static inline void put_releasing_dquots(struct dquot *dquot)
{
        list_add_tail(&dquot->dq_free, &releasing_dquots);
        set_bit(DQ_RELEASING_B, &dquot->dq_flags);
}

/*
 * Take @dquot off whichever list its dq_free head is linked into. A dquot
 * flagged DQ_RELEASING_B only has that flag cleared; any other dquot is
 * taken out of the DQST_FREE_DQUOTS statistic. Does nothing when dq_free is
 * not on a list.
 */
static inline void remove_free_dquot(struct dquot *dquot)
{
        if (!list_empty(&dquot->dq_free)) {
                list_del_init(&dquot->dq_free);
                if (test_bit(DQ_RELEASING_B, &dquot->dq_flags))
                        clear_bit(DQ_RELEASING_B, &dquot->dq_flags);
                else
                        dqstats_dec(DQST_FREE_DQUOTS);
        }
}

/* Link @dquot into inuse_list and count it in DQST_ALLOC_DQUOTS */
static inline void put_inuse(struct dquot *dquot)
{
        /* We add to the back of inuse list so we don't have to restart
         * when traversing this list and we block */
        list_add_tail(&dquot->dq_inuse, &inuse_list);
        dqstats_inc(DQST_ALLOC_DQUOTS);
}

/* Unlink @dquot from inuse_list and drop it from DQST_ALLOC_DQUOTS */
static inline void remove_inuse(struct dquot *dquot)
{
        dqstats_dec(DQST_ALLOC_DQUOTS);
        list_del(&dquot->dq_inuse);
}
/*
 * End of list functions needing dq_list_lock
 */

/*
 * Wait until nobody holds @dquot's dq_lock: the mutex is acquired and
 * released again right away, so this returns once any current holder is done.
 */
static void wait_on_dquot(struct dquot *dquot)
{
        mutex_lock(&dquot->dq_lock);
        mutex_unlock(&dquot->dq_lock);
}

/*
 * Return nonzero when @dquot has DQ_ACTIVE_B set. The flag is set at the end
 * of a successful dquot_acquire() and cleared by dquot_release().
 */
static inline int dquot_active(struct dquot *dquot)
{
        return test_bit(DQ_ACTIVE_B, &dquot->dq_flags);
}

/* Return nonzero when @dquot has its dirty flag (DQ_MOD_B) set */
static inline int dquot_dirty(struct dquot *dquot)
{
        return test_bit(DQ_MOD_B, &dquot->dq_flags);
}

/*
 * Mark @dquot dirty through the ->mark_dirty() quota operation of its
 * superblock and return that operation's result.
 */
static inline int mark_dquot_dirty(struct dquot *dquot)
{
        return dquot->dq_sb->dq_op->mark_dirty(dquot);
}

/*
 * Mark dquot dirty in atomic manner, and return its old dirty flag state.
 *
 * An inactive dquot is left untouched and 0 is returned. When the superblock
 * has DQUOT_NOLIST_DIRTY set only the DQ_MOD_B flag is updated; otherwise a
 * newly dirtied dquot is also put on the dqi_dirty_list of its quota type.
 */
int dquot_mark_dquot_dirty(struct dquot *dquot)
{
        if (!dquot_active(dquot))
                return 0;

        if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
                return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags);

        /* If quota is dirty already, we don't have to acquire dq_list_lock */
        if (dquot_dirty(dquot))
                return 1;

        spin_lock(&dq_list_lock);
        /* Recheck under the lock: somebody may have dirtied it meanwhile */
        if (test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
                spin_unlock(&dq_list_lock);
                return 1;
        }
        list_add(&dquot->dq_dirty,
                 &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_dirty_list);
        spin_unlock(&dq_list_lock);
        return 0;
}
EXPORT_SYMBOL(dquot_mark_dquot_dirty);

/*
 * Dirtify all the dquots - this can block when journalling.
 *
 * @dquots is an array of MAXQUOTAS dquot pointers read with
 * srcu_dereference(); NULL entries are skipped. Returns 0, or the first
 * negative value returned by mark_dquot_dirty().
 */
static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots)
{
        int cnt, err = 0;

        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                struct dquot *dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                int ret;

                if (!dquot)
                        continue;
                /* Even in case of error we have to continue */
                ret = mark_dquot_dirty(dquot);
                /* Only the first failure is reported to the caller */
                if (ret < 0 && !err)
                        err = ret;
        }
        return err;
}

/* Call dqput() on each of the MAXQUOTAS entries of the array @dquot */
static inline void dqput_all(struct dquot **dquot)
{
        struct dquot **end = dquot + MAXQUOTAS;

        while (dquot < end)
                dqput(*dquot++);
}

/*
 * Clear the dirty flag of @dquot and report whether it was set.
 *
 * With DQUOT_NOLIST_DIRTY only the DQ_MOD_B flag is touched. Otherwise the
 * flag is cleared under dq_list_lock and a previously dirty dquot is also
 * taken off its dirty list. Returns 1 if it was dirty, 0 if it was not.
 */
static inline int clear_dquot_dirty(struct dquot *dquot)
{
        int was_dirty;

        if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
                return test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags);

        spin_lock(&dq_list_lock);
        was_dirty = test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags) ? 1 : 0;
        if (was_dirty)
                list_del_init(&dquot->dq_dirty);
        spin_unlock(&dq_list_lock);
        return was_dirty;
}

/*
 * Flag the quota info of quota type @type on superblock @sb as dirty by
 * setting DQF_INFO_DIRTY in its dqi_flags, under dq_data_lock.
 */
void mark_info_dirty(struct super_block *sb, int type)
{
        spin_lock(&dq_data_lock);
        sb_dqopt(sb)->info[type].dqi_flags |= DQF_INFO_DIRTY;
        spin_unlock(&dq_data_lock);
}
EXPORT_SYMBOL(mark_info_dirty);

/*
 *        Read dquot from disk and alloc space for it
 *
 * The whole operation runs under dquot->dq_lock and inside a
 * memalloc_nofs_save() / memalloc_nofs_restore() section. On success
 * DQ_READ_B and DQ_ACTIVE_B are set on the dquot. Returns a negative error
 * when reading the dquot, committing it, or writing the quota file info
 * fails; DQ_ACTIVE_B is not set in that case.
 */

int dquot_acquire(struct dquot *dquot)
{
        int ret = 0, ret2 = 0;
        unsigned int memalloc;
        struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);

        mutex_lock(&dquot->dq_lock);
        memalloc = memalloc_nofs_save();
        /* Read the dquot only if that has not been done yet */
        if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
                ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
                if (ret < 0)
                        goto out_iolock;
        }
        /* Make sure flags update is visible after dquot has been filled */
        smp_mb__before_atomic();
        set_bit(DQ_READ_B, &dquot->dq_flags);
        /* Instantiate dquot if needed */
        if (!dquot_active(dquot) && !dquot->dq_off) {
                ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
                /* Write the info if needed */
                if (info_dirty(&dqopt->info[dquot->dq_id.type])) {
                        ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info(
                                        dquot->dq_sb, dquot->dq_id.type);
                }
                /* A commit error takes precedence over an info write error */
                if (ret < 0)
                        goto out_iolock;
                if (ret2 < 0) {
                        ret = ret2;
                        goto out_iolock;
                }
        }
        /*
         * Make sure flags update is visible after on-disk struct has been
         * allocated. Paired with smp_rmb() in dqget().
         */
        smp_mb__before_atomic();
        set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_iolock:
        memalloc_nofs_restore(memalloc);
        mutex_unlock(&dquot->dq_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_acquire);

/*
 *        Write dquot to disk
 *
 * Runs under dquot->dq_lock. A dquot that was not dirty is left alone and 0
 * is returned. Otherwise the dirty flag is cleared and the result of the
 * format's commit_dqblk() is returned, or -EIO when the dquot is not active.
 */
int dquot_commit(struct dquot *dquot)
{
        struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
        unsigned int memalloc;
        int ret = 0;

        mutex_lock(&dquot->dq_lock);
        memalloc = memalloc_nofs_save();
        if (clear_dquot_dirty(dquot)) {
                /*
                 * A dquot can be inactive here only if there was an error
                 * during read/init, so we had better not write it.
                 */
                if (!dquot_active(dquot))
                        ret = -EIO;
                else
                        ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
        }
        memalloc_nofs_restore(memalloc);
        mutex_unlock(&dquot->dq_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_commit);

/*
 *        Release dquot
 *
 * Runs under dquot->dq_lock. A busy dquot is left untouched and 0 is
 * returned. Otherwise the format's release_dqblk() is called if it exists,
 * dirty quota file info is written out, and DQ_ACTIVE_B is cleared. Returns
 * the release error if there was one, else the result of the info write.
 */
int dquot_release(struct dquot *dquot)
{
        struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
        unsigned int memalloc;
        int ret = 0, ret2 = 0;

        mutex_lock(&dquot->dq_lock);
        memalloc = memalloc_nofs_save();
        /* Check whether we are not racing with some other dqget() */
        if (!dquot_is_busy(dquot)) {
                if (dqopt->ops[dquot->dq_id.type]->release_dqblk) {
                        ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot);
                        /* Write the info */
                        if (info_dirty(&dqopt->info[dquot->dq_id.type]))
                                ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info(
                                                dquot->dq_sb, dquot->dq_id.type);
                        /* Keep a release error; otherwise report the write */
                        if (ret >= 0)
                                ret = ret2;
                }
                clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
        }
        memalloc_nofs_restore(memalloc);
        mutex_unlock(&dquot->dq_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_release);

/* Free @dquot back to the dquot slab cache (dquot_cachep) */
void dquot_destroy(struct dquot *dquot)
{
        kmem_cache_free(dquot_cachep, dquot);
}
EXPORT_SYMBOL(dquot_destroy);

/* Destroy @dquot through its superblock's ->destroy_dquot() operation */
static inline void do_destroy_dquot(struct dquot *dquot)
{
        dquot->dq_sb->dq_op->destroy_dquot(dquot);
}

/* Invalidate all dquots on the list. Note that this function is called after
 * quota is disabled and pointers from inodes removed so there cannot be new
 * quota users. There can still be some users of quotas due to inodes being
 * just deleted or pruned by prune_icache() (those are not attached to any
 * list) or parallel quotactl call. We have to wait for such users.
 *
 * Only dquots of superblock @sb and quota type @type are touched. Each of
 * them is removed from the hash, the free list and inuse_list and then
 * destroyed. The scan restarts from the beginning whenever dq_list_lock had
 * to be dropped.
 */
static void invalidate_dquots(struct super_block *sb, int type)
{
        struct dquot *dquot, *tmp;

restart:
        /* Let pending cleanup of released dquots finish before scanning */
        flush_delayed_work(&quota_release_work);

        spin_lock(&dq_list_lock);
        /* The _safe iterator is needed as entries are destroyed in the loop */
        list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) {
                if (dquot->dq_sb != sb)
                        continue;
                if (dquot->dq_id.type != type)
                        continue;
                /* Wait for dquot users */
                if (atomic_read(&dquot->dq_count)) {
                        /* Hold our own reference while sleeping unlocked */
                        atomic_inc(&dquot->dq_count);
                        spin_unlock(&dq_list_lock);
                        /*
                         * Once dqput() wakes us up, we know it's time to free
                         * the dquot.
                         * IMPORTANT: we rely on the fact that there is always
                         * at most one process waiting for dquot to free.
                         * Otherwise dq_count would be > 1 and we would never
                         * wake up.
                         */
                        wait_event(dquot_ref_wq,
                                   atomic_read(&dquot->dq_count) == 1);
                        dqput(dquot);
                        /* At this moment the dquot need not exist (it could
                         * be reclaimed by prune_dqcache()). Hence we must
                         * restart. */
                        goto restart;
                }
                /*
                 * The last user already dropped its reference but dquot didn't
                 * get fully cleaned up yet. Restart the scan which flushes the
                 * work cleaning up released dquots.
                 */
                if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
                        spin_unlock(&dq_list_lock);
                        goto restart;
                }
                /*
                 * Quota now has no users and it has been written on last
                 * dqput()
                 */
                remove_dquot_hash(dquot);
                remove_free_dquot(dquot);
                remove_inuse(dquot);
                do_destroy_dquot(dquot);
        }
        spin_unlock(&dq_list_lock);
}

/*
 * Call callback for every active dquot on given filesystem
 *
 * @sb:   superblock whose dquots are visited; a warning is issued once if
 *        its s_umount is not locked
 * @fn:   callback invoked with each active dquot and @priv
 * @priv: opaque value handed through to @fn
 *
 * The scan stops at the first negative value returned by @fn and returns it.
 * Otherwise the result of the last @fn call is returned, or 0 when @fn was
 * never called.
 */
int dquot_scan_active(struct super_block *sb,
                      int (*fn)(struct dquot *dquot, unsigned long priv),
                      unsigned long priv)
{
        struct dquot *dquot, *old_dquot = NULL;
        int ret = 0;

        WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount));

        spin_lock(&dq_list_lock);
        list_for_each_entry(dquot, &inuse_list, dq_inuse) {
                if (!dquot_active(dquot))
                        continue;
                if (dquot->dq_sb != sb)
                        continue;
                /* Now we have active dquot so we can just increase use count */
                atomic_inc(&dquot->dq_count);
                spin_unlock(&dq_list_lock);
                /*
                 * Drop the reference to the previously visited dquot only
                 * now, with dq_list_lock released, and remember the current
                 * one so it is dropped on the next round or at the end.
                 */
                dqput(old_dquot);
                old_dquot = dquot;
                /*
                 * ->release_dquot() can be racing with us. Our reference
                 * protects us from new calls to it so just wait for any
                 * outstanding call and recheck the DQ_ACTIVE_B after that.
                 */
                wait_on_dquot(dquot);
                if (dquot_active(dquot)) {
                        ret = fn(dquot, priv);
                        /* dq_list_lock is not held on this exit path */
                        if (ret < 0)
                                goto out;
                }
                spin_lock(&dq_list_lock);
                /* We are safe to continue now because our dquot could not
                 * be moved out of the inuse list while we hold the reference */
        }
        spin_unlock(&dq_list_lock);
out:
        dqput(old_dquot);
        return ret;
}
EXPORT_SYMBOL(dquot_scan_active);

/*
 * Write one dquot through the filesystem's ->write_dquot() operation.
 *
 * A negative result is reported via quota_error() and the dquot's dirty bit
 * is cleared all the same. The value returned by ->write_dquot() is passed
 * back to the caller unchanged.
 */
static inline int dquot_write_dquot(struct dquot *dquot)
{
        int err;

        err = dquot->dq_sb->dq_op->write_dquot(dquot);
        if (err >= 0)
                return err;

        quota_error(dquot->dq_sb, "Can't write quota structure "
                    "(error %d). Quota may get out of sync!", err);
        /* Drop the dirty bit regardless, otherwise we could loop forever. */
        clear_dquot_dirty(dquot);
        return err;
}

/*
 * Write all dquot structures to quota files.
 *
 * @sb:   superblock to write dquots for; a one-time warning is emitted if
 *        sb->s_umount is not locked
 * @type: quota type to write, or -1 for every type
 *
 * Only types for which quota is active on @sb are processed. Returns 0 or the
 * first error reported while writing a dquot; the return value of
 * ->write_info() is not propagated.
 */
int dquot_writeback_dquots(struct super_block *sb, int type)
{
        struct list_head dirty;
        struct dquot *dquot;
        struct quota_info *dqopt = sb_dqopt(sb);
        int cnt;
        int err, ret = 0;

        WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount));

        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                if (!sb_has_quota_active(sb, cnt))
                        continue;
                spin_lock(&dq_list_lock);
                /* Move list away to avoid livelock. */
                list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty);
                while (!list_empty(&dirty)) {
                        dquot = list_first_entry(&dirty, struct dquot,
                                                 dq_dirty);

                        WARN_ON(!dquot_active(dquot));
                        /* If the dquot is releasing we should not touch it */
                        if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
                                /*
                                 * Let the release work run to completion
                                 * (without holding dq_list_lock) and then
                                 * look at the head of the list again.
                                 */
                                spin_unlock(&dq_list_lock);
                                flush_delayed_work(&quota_release_work);
                                spin_lock(&dq_list_lock);
                                continue;
                        }

                        /* Now we have active dquot from which someone is
                         * holding reference so we can safely just increase
                         * use count */
                        dqgrab(dquot);
                        spin_unlock(&dq_list_lock);
                        err = dquot_write_dquot(dquot);
                        /* Remember only the first failure */
                        if (err && !ret)
                                ret = err;
                        dqput(dquot);
                        spin_lock(&dq_list_lock);
                }
                spin_unlock(&dq_list_lock);
        }

        /* Write out dirty per-type quota info for the selected types */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
                    && info_dirty(&dqopt->info[cnt]))
                        sb->dq_op->write_info(sb, cnt);
        dqstats_inc(DQST_SYNCS);

        return ret;
}
EXPORT_SYMBOL(dquot_writeback_dquots);

/*
 * Write all dquot structures to disk and make them visible from userspace.
 *
 * @sb:   superblock to sync
 * @type: quota type to sync, or -1 for all types
 *
 * Returns 0 on success or the first non-zero result of the writeback, the
 * filesystem sync or the block device sync. Nothing beyond the dquot
 * writeback is done when DQUOT_QUOTA_SYS_FILE is set in the quota flags.
 */
int dquot_quota_sync(struct super_block *sb, int type)
{
        struct quota_info *qinfo = sb_dqopt(sb);
        int qtype;
        int err;

        err = dquot_writeback_dquots(sb, type);
        if (err)
                return err;
        if (qinfo->flags & DQUOT_QUOTA_SYS_FILE)
                return 0;

        /*
         * Neither clever nor fast, but there is no other simple way known
         * here to push quota data to disk, and it has to be there for
         * userspace to see it.
         */
        if (sb->s_op->sync_fs)
                err = sb->s_op->sync_fs(sb, 1);
        if (!err)
                err = sync_blockdev(sb->s_bdev);
        if (err)
                return err;

        /*
         * Everything is written now, so throw away the page cache of the
         * quota files to let userspace observe the new contents.
         */
        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
                if ((type != -1 && qtype != type) ||
                    !sb_has_quota_active(sb, qtype))
                        continue;
                inode_lock(qinfo->files[qtype]);
                truncate_inode_pages(&qinfo->files[qtype]->i_data, 0);
                inode_unlock(qinfo->files[qtype]);
        }

        return 0;
}
EXPORT_SYMBOL(dquot_quota_sync);

/*
 * Shrinker scan callback: destroy dquots from the head of free_dquots until
 * the list is empty or sc->nr_to_scan reaches zero. sc->nr_to_scan is
 * decremented once per destroyed dquot. Returns the number destroyed.
 */
static unsigned long
dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
        unsigned long nr_freed = 0;

        spin_lock(&dq_list_lock);
        for (; !list_empty(&free_dquots) && sc->nr_to_scan;
             sc->nr_to_scan--, nr_freed++) {
                struct dquot *victim;

                victim = list_first_entry(&free_dquots, struct dquot, dq_free);
                /* Unlink from every list before destroying it */
                remove_dquot_hash(victim);
                remove_free_dquot(victim);
                remove_inuse(victim);
                do_destroy_dquot(victim);
        }
        spin_unlock(&dq_list_lock);

        return nr_freed;
}

/*
 * Shrinker count callback: report the DQST_FREE_DQUOTS statistic, scaled by
 * vfs_pressure_ratio().
 */
static unsigned long
dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
        unsigned long nr_free;

        nr_free = percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]);
        return vfs_pressure_ratio(nr_free);
}

/*
 * Safely release dquot and put reference to dquot.
 *
 * Work function that processes the dquots queued on releasing_dquots: each
 * one is written out if dirty, released through ->release_dquot() if still
 * active, and finally moved to the free list via put_dquot_last().
 */
static void quota_release_workfn(struct work_struct *work)
{
        struct dquot *dquot;
        struct list_head rls_head;

        spin_lock(&dq_list_lock);
        /* Exchange the list head to avoid livelock. */
        list_replace_init(&releasing_dquots, &rls_head);
        spin_unlock(&dq_list_lock);
        /* Wait for an SRCU grace period on dquot_srcu before processing */
        synchronize_srcu(&dquot_srcu);

restart:
        spin_lock(&dq_list_lock);
        while (!list_empty(&rls_head)) {
                dquot = list_first_entry(&rls_head, struct dquot, dq_free);
                WARN_ON_ONCE(atomic_read(&dquot->dq_count));
                /*
                 * Note that DQ_RELEASING_B protects us from racing with
                 * invalidate_dquots() calls so we are safe to work with the
                 * dquot even after we drop dq_list_lock.
                 */
                if (dquot_dirty(dquot)) {
                        spin_unlock(&dq_list_lock);
                        /* Commit dquot before releasing */
                        dquot_write_dquot(dquot);
                        /* The dquot stays at the list head; re-examine it */
                        goto restart;
                }
                if (dquot_active(dquot)) {
                        spin_unlock(&dq_list_lock);
                        dquot->dq_sb->dq_op->release_dquot(dquot);
                        /* Re-check the same dquot with the lock retaken */
                        goto restart;
                }
                /* Dquot is inactive and clean, now move it to free list */
                remove_free_dquot(dquot);
                put_dquot_last(dquot);
        }
        spin_unlock(&dq_list_lock);
}

/*
 * Put reference to dquot
 *
 * @dquot: dquot to drop a reference on; NULL is accepted and ignored.
 *
 * If other references remain, the count is simply decremented (and a waiter
 * on dquot_ref_wq is woken when quota of this type is no longer active and
 * exactly one reference is left). When the last reference goes away the dquot
 * is queued for release and quota_release_work is scheduled.
 */
void dqput(struct dquot *dquot)
{
        if (!dquot)
                return;
#ifdef CONFIG_QUOTA_DEBUG
        /* Dropping a reference that was never held is a fatal bug */
        if (!atomic_read(&dquot->dq_count)) {
                quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
                            quotatypes[dquot->dq_id.type],
                            from_kqid(&init_user_ns, dquot->dq_id));
                BUG();
        }
#endif
        dqstats_inc(DQST_DROPS);

        spin_lock(&dq_list_lock);
        if (atomic_read(&dquot->dq_count) > 1) {
                /* We have more than one user... nothing to do */
                atomic_dec(&dquot->dq_count);
                /* Releasing dquot during quotaoff phase? */
                if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) &&
                    atomic_read(&dquot->dq_count) == 1)
                        wake_up(&dquot_ref_wq);
                spin_unlock(&dq_list_lock);
                return;
        }

        /* Need to release dquot? */
        WARN_ON_ONCE(!list_empty(&dquot->dq_free));
        /* Queue the dquot for release before giving up the last reference */
        put_releasing_dquots(dquot);
        atomic_dec(&dquot->dq_count);
        spin_unlock(&dq_list_lock);
        /* Actual release happens later from quota_release_work */
        queue_delayed_work(system_unbound_wq, &quota_release_work, 1);
}
EXPORT_SYMBOL(dqput);

/*
 * Allocate a zero-filled dquot from dquot_cachep with GFP_NOFS.
 * @sb and @type are not used by this allocator.
 */
struct dquot *dquot_alloc(struct super_block *sb, int type)
{
        struct dquot *dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS);

        return dquot;
}
EXPORT_SYMBOL(dquot_alloc);

/*
 * Allocate a dquot through the filesystem's ->alloc_dquot() and set up its
 * locks, list linkage, owning superblock, an invalid id of the given type and
 * an initial reference count of one. Returns NULL if the allocation failed.
 */
static struct dquot *get_empty_dquot(struct super_block *sb, int type)
{
        struct dquot *fresh = sb->dq_op->alloc_dquot(sb, type);

        if (fresh) {
                /* Identity and initial reference */
                fresh->dq_sb = sb;
                fresh->dq_id = make_kqid_invalid(type);
                atomic_set(&fresh->dq_count, 1);

                /* Locks */
                mutex_init(&fresh->dq_lock);
                spin_lock_init(&fresh->dq_dqb_lock);

                /* List and hash linkage starts out empty */
                INIT_LIST_HEAD(&fresh->dq_free);
                INIT_LIST_HEAD(&fresh->dq_inuse);
                INIT_HLIST_NODE(&fresh->dq_hash);
                INIT_LIST_HEAD(&fresh->dq_dirty);
        }

        return fresh;
}

/*
 * Get reference to dquot
 *
 * @sb:  superblock the dquot belongs to
 * @qid: quota id to look up
 *
 * Returns a referenced dquot, ERR_PTR(-EINVAL) if @qid has no mapping in
 * sb->s_user_ns, ERR_PTR(-ESRCH) if quota of that type is not active on @sb,
 * or ERR_PTR() of a negative ->acquire_dquot() result.
 *
 * Locking is slightly tricky here. We are guarded from parallel quotaoff()
 * destroying our dquot by:
 *   a) checking for quota flags under dq_list_lock and
 *   b) getting a reference to dquot before we release dq_list_lock
 */
struct dquot *dqget(struct super_block *sb, struct kqid qid)
{
        unsigned int hashent = hashfn(sb, qid);
        struct dquot *dquot, *empty = NULL;

        if (!qid_has_mapping(sb->s_user_ns, qid))
                return ERR_PTR(-EINVAL);

        /* Cheap unlocked check; repeated under the locks below */
        if (!sb_has_quota_active(sb, qid.type))
                return ERR_PTR(-ESRCH);
we_slept:
        spin_lock(&dq_list_lock);
        spin_lock(&dq_state_lock);
        if (!sb_has_quota_active(sb, qid.type)) {
                spin_unlock(&dq_state_lock);
                spin_unlock(&dq_list_lock);
                dquot = ERR_PTR(-ESRCH);
                goto out;
        }
        spin_unlock(&dq_state_lock);

        dquot = find_dquot(hashent, sb, qid);
        if (!dquot) {
                if (!empty) {
                        /*
                         * Allocate a candidate with dq_list_lock dropped,
                         * then redo the lookup since the hash may have
                         * changed meanwhile.
                         */
                        spin_unlock(&dq_list_lock);
                        empty = get_empty_dquot(sb, qid.type);
                        if (!empty)
                                schedule();        /* Try to wait for a moment... */
                        goto we_slept;
                }
                /* Still not in the hash: publish our preallocated dquot */
                dquot = empty;
                empty = NULL;
                dquot->dq_id = qid;
                /* all dquots go on the inuse_list */
                put_inuse(dquot);
                /* hash it first so it can be found */
                insert_dquot_hash(dquot);
                spin_unlock(&dq_list_lock);
                dqstats_inc(DQST_LOOKUPS);
        } else {
                /* An unreferenced dquot is taken back off the free list */
                if (!atomic_read(&dquot->dq_count))
                        remove_free_dquot(dquot);
                atomic_inc(&dquot->dq_count);
                spin_unlock(&dq_list_lock);
                dqstats_inc(DQST_CACHE_HITS);
                dqstats_inc(DQST_LOOKUPS);
        }
        /* Wait for dq_lock - after this we know that either dquot_release() is
         * already finished or it will be canceled due to dq_count > 0 test */
        wait_on_dquot(dquot);
        /* Read the dquot / allocate space in quota file */
        if (!dquot_active(dquot)) {
                int err;

                err = sb->dq_op->acquire_dquot(dquot);
                if (err < 0) {
                        dqput(dquot);
                        dquot = ERR_PTR(err);
                        goto out;
                }
        }
        /*
         * Make sure following reads see filled structure - paired with
         * smp_mb__before_atomic() in dquot_acquire().
         */
        smp_rmb();
        /* Has somebody invalidated entry under us? */
        WARN_ON_ONCE(hlist_unhashed(&dquot->dq_hash));
out:
        /* A preallocated dquot that ended up unused is destroyed here */
        if (empty)
                do_destroy_dquot(empty);

        return dquot;
}
EXPORT_SYMBOL(dqget);

/*
 * Return the inode's array of dquot pointers, as provided by the
 * ->get_dquots() superblock operation of its filesystem.
 */
static inline struct dquot __rcu **i_dquot(struct inode *inode)
{
        struct dquot __rcu **dquots;

        dquots = inode->i_sb->s_op->get_dquots(inode);
        return dquots;
}

static int dqinit_needed(struct inode *inode, int type)
{
        struct dquot __rcu * const *dquots;
        int cnt;

        if (IS_NOQUOTA(inode))
                return 0;

        dquots = i_dquot(inode);
        if (type != -1)
                return !dquots[type];
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (!dquots[cnt])
                        return 1;
        return 0;
}

/*
 * Initialize quota pointers of the inodes on @sb's inode list that need it
 * (@type selects the quota type, -1 meaning any - see dqinit_needed()).
 *
 * Returns 0 or the first error from __dquot_initialize(), at which point the
 * walk is abandoned.
 *
 * This routine is guarded by s_umount semaphore
 */
static int add_dquot_ref(struct super_block *sb, int type)
{
        struct inode *inode, *old_inode = NULL;
#ifdef CONFIG_QUOTA_DEBUG
        int reserved = 0;
#endif
        int err = 0;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                spin_lock(&inode->i_lock);
                /*
                 * Skip inodes being set up or torn down, inodes with a zero
                 * i_writecount, and inodes that need no initialization.
                 */
                if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
                    !atomic_read(&inode->i_writecount) ||
                    !dqinit_needed(inode, type)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&sb->s_inode_list_lock);

#ifdef CONFIG_QUOTA_DEBUG
                /* Remember that some inode already carried reserved space */
                if (unlikely(inode_get_rsv_space(inode) > 0))
                        reserved = 1;
#endif
                /* Safe to drop the previous inode now that no lock is held */
                iput(old_inode);
                err = __dquot_initialize(inode, type);
                if (err) {
                        iput(inode);
                        goto out;
                }

                /*
                 * We hold a reference to 'inode' so it couldn't have been
                 * removed from s_inodes list while we dropped the
                 * s_inode_list_lock. We cannot iput the inode now as we can be
                 * holding the last reference and we cannot iput it under
                 * s_inode_list_lock. So we keep the reference and iput it
                 * later.
                 */
                old_inode = inode;
                cond_resched();
                spin_lock(&sb->s_inode_list_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);
        iput(old_inode);
out:
#ifdef CONFIG_QUOTA_DEBUG
        if (reserved) {
                quota_error(sb, "Writes happened before quota was turned on "
                        "thus quota information is probably inconsistent. "
                        "Please run quotacheck(8)");
        }
#endif
        return err;
}

/*
 * Clear the dquot pointer of quota type @type in every inode on @sb's inode
 * list (except IS_NOQUOTA inodes) and drop the reference it held.
 */
static void remove_dquot_ref(struct super_block *sb, int type)
{
        struct inode *inode;
#ifdef CONFIG_QUOTA_DEBUG
        int reserved = 0;
#endif

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                /*
                 *  We have to scan also I_NEW inodes because they can already
                 *  have quota pointer initialized. Luckily, we need to touch
                 *  only quota pointers and these have separate locking
                 *  (dq_data_lock).
                 */
                spin_lock(&dq_data_lock);
                if (!IS_NOQUOTA(inode)) {
                        struct dquot __rcu **dquots = i_dquot(inode);
                        struct dquot *dquot = srcu_dereference_check(
                                dquots[type], &dquot_srcu,
                                lockdep_is_held(&dq_data_lock));

#ifdef CONFIG_QUOTA_DEBUG
                        /* Remember that some inode still had reserved space */
                        if (unlikely(inode_get_rsv_space(inode) > 0))
                                reserved = 1;
#endif
                        /* Detach the pointer first, then drop its reference */
                        rcu_assign_pointer(dquots[type], NULL);
                        if (dquot)
                                dqput(dquot);
                }
                spin_unlock(&dq_data_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);
#ifdef CONFIG_QUOTA_DEBUG
        if (reserved) {
                printk(KERN_WARNING "VFS (%s): Writes happened after quota"
                        " was disabled thus quota information is probably "
                        "inconsistent. Please run quotacheck(8).\n", sb->s_id);
        }
#endif
}

/*
 * Gather all references from inodes and drop them.
 * Nothing is done for a superblock that has no quota operations set.
 */
static void drop_dquot_ref(struct super_block *sb, int type)
{
        if (!sb->dq_op)
                return;

        remove_dquot_ref(sb, type);
}

/*
 * Give back @number bytes of reserved space of @dquot.
 *
 * Releasing more than is reserved triggers a one-time warning and clamps the
 * reservation to zero. The block grace time is reset once current plus
 * reserved space is within the soft limit, and DQ_BLKS_B is cleared.
 */
static inline
void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
{
        qsize_t reserved = dquot->dq_dqb.dqb_rsvspace;

        if (reserved < number) {
                WARN_ON_ONCE(1);
                reserved = 0;
        } else {
                reserved -= number;
        }
        dquot->dq_dqb.dqb_rsvspace = reserved;

        if (dquot->dq_dqb.dqb_curspace + reserved <=
            dquot->dq_dqb.dqb_bsoftlimit)
                dquot->dq_dqb.dqb_btime = (time64_t) 0;
        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}

/*
 * Lower the inode usage of @dquot by @number.
 *
 * Usage is clamped at zero unless DQUOT_NEGATIVE_USAGE is set for the
 * superblock. The inode grace time is reset once usage is within the soft
 * limit, and DQ_INODES_B is cleared.
 */
static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
{
        qsize_t used = dquot->dq_dqb.dqb_curinodes;

        if (!(sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE) &&
            used < number)
                used = 0;
        else
                used -= number;
        dquot->dq_dqb.dqb_curinodes = used;

        if (used <= dquot->dq_dqb.dqb_isoftlimit)
                dquot->dq_dqb.dqb_itime = (time64_t) 0;
        clear_bit(DQ_INODES_B, &dquot->dq_flags);
}

/*
 * Lower the space usage of @dquot by @number.
 *
 * Usage is clamped at zero unless DQUOT_NEGATIVE_USAGE is set for the
 * superblock. The block grace time is reset once current plus reserved space
 * is within the soft limit, and DQ_BLKS_B is cleared.
 */
static void dquot_decr_space(struct dquot *dquot, qsize_t number)
{
        qsize_t used = dquot->dq_dqb.dqb_curspace;

        if (!(sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE) &&
            used < number)
                used = 0;
        else
                used -= number;
        dquot->dq_dqb.dqb_curspace = used;

        if (used + dquot->dq_dqb.dqb_rsvspace <=
            dquot->dq_dqb.dqb_bsoftlimit)
                dquot->dq_dqb.dqb_btime = (time64_t) 0;
        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}

/*
 * Description of one pending quota warning, filled in by prepare_warning()
 * and consumed by flush_warnings().
 */
struct dquot_warn {
        struct super_block *w_sb;       /* superblock of the dquot (dq_sb) */
        struct kqid w_dq_id;            /* id of the dquot (dq_id) */
        short w_type;                   /* QUOTA_NL_* warning type; entries
                                         * with QUOTA_NL_NOWARN are skipped */
};

/*
 * Test-and-set the "warning already issued" flag matching @warntype.
 *
 * Block hard-limit and soft-limit-too-long warnings use DQ_BLKS_B, the inode
 * counterparts use DQ_INODES_B. Returns the previous state of that bit, or 0
 * for warning types that have no such flag.
 */
static int warning_issued(struct dquot *dquot, const int warntype)
{
        int bit;

        switch (warntype) {
        case QUOTA_NL_BHARDWARN:
        case QUOTA_NL_BSOFTLONGWARN:
                bit = DQ_BLKS_B;
                break;
        case QUOTA_NL_IHARDWARN:
        case QUOTA_NL_ISOFTLONGWARN:
                bit = DQ_INODES_B;
                break;
        default:
                bit = 0;
                break;
        }

        if (bit == 0)
                return 0;
        return test_and_set_bit(bit, &dquot->dq_flags);
}

#ifdef CONFIG_PRINT_QUOTA_WARNING
/* When zero, need_print_warning() suppresses all printed quota warnings */
static int flag_print_warnings = 1;

/*
 * Decide whether the warning should be printed for the current task.
 *
 * User quota warnings are printed when the id equals the current fsuid,
 * group quota warnings when in_group_p() accepts the group, project quota
 * warnings always. Nothing is printed when flag_print_warnings is zero or
 * for any other quota type.
 */
static int need_print_warning(struct dquot_warn *warn)
{
        int qtype;

        if (!flag_print_warnings)
                return 0;

        qtype = warn->w_dq_id.type;
        if (qtype == USRQUOTA)
                return uid_eq(current_fsuid(), warn->w_dq_id.uid);
        if (qtype == GRPQUOTA)
                return in_group_p(warn->w_dq_id.gid);
        return qtype == PRJQUOTA;
}

/*
 * Print warning to user which exceeded quota.
 *
 * The message is written to the current tty as
 * "<s_id>: warning, |write failed, <quota type name><reason>".
 * The "below limit" notifications are never printed, and nothing is printed
 * when need_print_warning() declines or there is no current tty.
 */
static void print_warning(struct dquot_warn *warn)
{
        int warntype = warn->w_type;
        struct tty_struct *tty;
        char *msg;
        int soft;

        switch (warntype) {
        case QUOTA_NL_IHARDBELOW:
        case QUOTA_NL_ISOFTBELOW:
        case QUOTA_NL_BHARDBELOW:
        case QUOTA_NL_BSOFTBELOW:
                return;
        }
        if (!need_print_warning(warn))
                return;

        tty = get_current_tty();
        if (!tty)
                return;

        /* Pick the reason text for this warning type */
        if (warntype == QUOTA_NL_IHARDWARN)
                msg = " file limit reached.\r\n";
        else if (warntype == QUOTA_NL_ISOFTLONGWARN)
                msg = " file quota exceeded too long.\r\n";
        else if (warntype == QUOTA_NL_ISOFTWARN)
                msg = " file quota exceeded.\r\n";
        else if (warntype == QUOTA_NL_BHARDWARN)
                msg = " block limit reached.\r\n";
        else if (warntype == QUOTA_NL_BSOFTLONGWARN)
                msg = " block quota exceeded too long.\r\n";
        else if (warntype == QUOTA_NL_BSOFTWARN)
                msg = " block quota exceeded.\r\n";
        else
                msg = NULL;

        /* Plain soft-limit hits are warnings, everything else is a failure */
        soft = (warntype == QUOTA_NL_ISOFTWARN ||
                warntype == QUOTA_NL_BSOFTWARN);

        tty_write_message(tty, warn->w_sb->s_id);
        if (soft)
                tty_write_message(tty, ": warning, ");
        else
                tty_write_message(tty, ": write failed, ");
        tty_write_message(tty, quotatypes[warn->w_dq_id.type]);
        tty_write_message(tty, msg);
        tty_kref_put(tty);
}
#endif

/*
 * Record a warning of @warntype for @dquot in @warn, unless
 * warning_issued() reports that it has been issued already.
 */
static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot,
                            int warntype)
{
        if (!warning_issued(dquot, warntype)) {
                warn->w_sb = dquot->dq_sb;
                warn->w_dq_id = dquot->dq_id;
                warn->w_type = warntype;
        }
}

/*
 * Write warnings to the console and send warning messages over netlink.
 *
 * Note that this function can call into tty and networking code.
 */
static void flush_warnings(struct dquot_warn *warn)
{
        int i;

        for (i = 0; i < MAXQUOTAS; i++) {
                if (warn[i].w_type == QUOTA_NL_NOWARN)
                        continue;
#ifdef CONFIG_PRINT_QUOTA_WARNING
                print_warning(&warn[i]);
#endif
                quota_send_warning(warn[i].w_dq_id,
                                   warn[i].w_sb->s_dev, warn[i].w_type);
        }
}

/*
 * Tell whether the current task may exceed limits of @dquot.
 *
 * That requires CAP_SYS_RESOURCE and, for the QFMT_VFS_OLD format only, that
 * DQF_ROOT_SQUASH is not set in the quota info flags.
 */
static int ignore_hardlimit(struct dquot *dquot)
{
        struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];

        if (!capable(CAP_SYS_RESOURCE))
                return 0;
        if (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD)
                return 1;
        return !(info->dqi_flags & DQF_ROOT_SQUASH);
}

/*
 * Charge @inodes inodes to @dquot, enforcing inode limits.
 *
 * Runs under dquot->dq_dqb_lock. Limits are checked only when they are
 * enabled for the quota type and the dquot is not flagged DQ_FAKE_B.
 * Returns 0 after updating the usage, or -EDQUOT (usage untouched) when the
 * hard limit or an expired soft limit would be exceeded and the caller may
 * not ignore it. Warnings to be delivered are recorded in @warn.
 */
static int dquot_add_inodes(struct dquot *dquot, qsize_t inodes,
                            struct dquot_warn *warn)
{
        qsize_t total;
        int err = 0;

        spin_lock(&dquot->dq_dqb_lock);
        total = dquot->dq_dqb.dqb_curinodes + inodes;

        if (sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) &&
            !test_bit(DQ_FAKE_B, &dquot->dq_flags)) {
                /* Hard limit */
                if (dquot->dq_dqb.dqb_ihardlimit &&
                    total > dquot->dq_dqb.dqb_ihardlimit &&
                    !ignore_hardlimit(dquot)) {
                        prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN);
                        err = -EDQUOT;
                        goto unlock;
                }

                /* Soft limit */
                if (dquot->dq_dqb.dqb_isoftlimit &&
                    total > dquot->dq_dqb.dqb_isoftlimit) {
                        if (dquot->dq_dqb.dqb_itime) {
                                /* Grace period already running: expired? */
                                if (ktime_get_real_seconds() >=
                                    dquot->dq_dqb.dqb_itime &&
                                    !ignore_hardlimit(dquot)) {
                                        prepare_warning(warn, dquot,
                                                QUOTA_NL_ISOFTLONGWARN);
                                        err = -EDQUOT;
                                        goto unlock;
                                }
                        } else {
                                /* First time over: start the grace period */
                                prepare_warning(warn, dquot,
                                                QUOTA_NL_ISOFTWARN);
                                dquot->dq_dqb.dqb_itime =
                                    ktime_get_real_seconds() +
                                    sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace;
                        }
                }
        }

        dquot->dq_dqb.dqb_curinodes = total;

unlock:
        spin_unlock(&dquot->dq_dqb_lock);
        return err;
}

/*
 * Charge @space bytes of usage and @rsv_space bytes of reservation to
 * @dquot, enforcing block limits.
 *
 * Runs under dquot->dq_dqb_lock. Limits are checked only when they are
 * enabled for the quota type and the dquot is not flagged DQ_FAKE_B.
 * Warnings are recorded in @warn only with DQUOT_SPACE_WARN in @flags; without
 * it, going over the soft limit for the first time is refused. With
 * DQUOT_SPACE_NOFAIL the charge is always applied and 0 returned. Otherwise
 * returns 0 on success or -EDQUOT with the counters left untouched.
 */
static int dquot_add_space(struct dquot *dquot, qsize_t space,
                           qsize_t rsv_space, unsigned int flags,
                           struct dquot_warn *warn)
{
        struct super_block *sb = dquot->dq_sb;
        qsize_t total;
        int err = 0;

        spin_lock(&dquot->dq_dqb_lock);
        if (sb_has_quota_limits_enabled(sb, dquot->dq_id.type) &&
            !test_bit(DQ_FAKE_B, &dquot->dq_flags)) {
                total = dquot->dq_dqb.dqb_curspace +
                        dquot->dq_dqb.dqb_rsvspace + space + rsv_space;

                if (dquot->dq_dqb.dqb_bhardlimit &&
                    total > dquot->dq_dqb.dqb_bhardlimit &&
                    !ignore_hardlimit(dquot)) {
                        /* Hard limit */
                        if (flags & DQUOT_SPACE_WARN)
                                prepare_warning(warn, dquot,
                                                QUOTA_NL_BHARDWARN);
                        err = -EDQUOT;
                } else if (dquot->dq_dqb.dqb_bsoftlimit &&
                           total > dquot->dq_dqb.dqb_bsoftlimit) {
                        if (dquot->dq_dqb.dqb_btime) {
                                /* Grace period already running: expired? */
                                if (ktime_get_real_seconds() >=
                                    dquot->dq_dqb.dqb_btime &&
                                    !ignore_hardlimit(dquot)) {
                                        if (flags & DQUOT_SPACE_WARN)
                                                prepare_warning(warn, dquot,
                                                        QUOTA_NL_BSOFTLONGWARN);
                                        err = -EDQUOT;
                                }
                        } else if (flags & DQUOT_SPACE_WARN) {
                                /* First time over: start the grace period */
                                prepare_warning(warn, dquot,
                                                QUOTA_NL_BSOFTWARN);
                                dquot->dq_dqb.dqb_btime =
                                    ktime_get_real_seconds() +
                                    sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace;
                        } else {
                                /*
                                 * Preallocation is not allowed to exceed the
                                 * soft limit, so that exceeding it is always
                                 * reported.
                                 */
                                err = -EDQUOT;
                        }
                }
        }

        /*
         * DQUOT_SPACE_NOFAIL is honoured only at this point so that warning
         * generation and grace time setup above still take place.
         */
        if (flags & DQUOT_SPACE_NOFAIL)
                err = 0;
        if (!err) {
                dquot->dq_dqb.dqb_rsvspace += rsv_space;
                dquot->dq_dqb.dqb_curspace += space;
        }
        spin_unlock(&dquot->dq_dqb_lock);
        return err;
}

/*
 * Work out which "dropped below limit" notification, if any, applies when
 * @inodes inodes are freed from @dquot.
 *
 * Returns QUOTA_NL_ISOFTBELOW, QUOTA_NL_IHARDBELOW or QUOTA_NL_NOWARN. No
 * notification is produced for DQ_FAKE_B dquots, when usage is already within
 * the soft limit, or when limits are not enabled for the quota type.
 */
static int info_idq_free(struct dquot *dquot, qsize_t inodes)
{
        qsize_t remaining;

        if (test_bit(DQ_FAKE_B, &dquot->dq_flags))
                return QUOTA_NL_NOWARN;
        if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
                return QUOTA_NL_NOWARN;
        if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type))
                return QUOTA_NL_NOWARN;

        remaining = dquot->dq_dqb.dqb_curinodes - inodes;
        if (remaining <= dquot->dq_dqb.dqb_isoftlimit)
                return QUOTA_NL_ISOFTBELOW;
        if (dquot->dq_dqb.dqb_curinodes < dquot->dq_dqb.dqb_ihardlimit ||
            remaining >= dquot->dq_dqb.dqb_ihardlimit)
                return QUOTA_NL_NOWARN;
        return QUOTA_NL_IHARDBELOW;
}

/*
 * Work out which "dropped below limit" notification, if any, applies when
 * @space bytes are freed from @dquot. Current and reserved space are counted
 * together.
 *
 * Returns QUOTA_NL_BSOFTBELOW, QUOTA_NL_BHARDBELOW or QUOTA_NL_NOWARN. No
 * notification is produced for DQ_FAKE_B dquots or when the total is already
 * within the soft limit.
 */
static int info_bdq_free(struct dquot *dquot, qsize_t space)
{
        qsize_t before = dquot->dq_dqb.dqb_curspace +
                         dquot->dq_dqb.dqb_rsvspace;
        qsize_t after;

        if (test_bit(DQ_FAKE_B, &dquot->dq_flags))
                return QUOTA_NL_NOWARN;
        if (before <= dquot->dq_dqb.dqb_bsoftlimit)
                return QUOTA_NL_NOWARN;

        after = before - space;
        if (after <= dquot->dq_dqb.dqb_bsoftlimit)
                return QUOTA_NL_BSOFTBELOW;
        if (before < dquot->dq_dqb.dqb_bhardlimit ||
            after >= dquot->dq_dqb.dqb_bhardlimit)
                return QUOTA_NL_NOWARN;
        return QUOTA_NL_BHARDBELOW;
}

/*
 * Non-zero when some quota type is loaded and not suspended on the inode's
 * superblock; always 0 for IS_NOQUOTA inodes.
 */
static int inode_quota_active(const struct inode *inode)
{
        struct super_block *sb;

        if (IS_NOQUOTA(inode))
                return 0;

        sb = inode->i_sb;
        return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
}

/*
 * Initialize quota pointers in inode
 *
 * @inode: inode whose dquot pointers should be set up
 * @type:  quota type to initialize, or -1 for all types
 *
 * Works in two phases: first the needed dquots are obtained with dqget()
 * without holding dq_data_lock, then they are installed into the inode under
 * dq_data_lock. References that end up unused are dropped at the end.
 *
 * Returns 0 on success (including when nothing had to be done) or the dqget()
 * error; -ESRCH from dqget() is not treated as an error.
 *
 * It is better to call this function outside of any transaction as it
 * might need a lot of space in journal for dquot structure allocation.
 */
static int __dquot_initialize(struct inode *inode, int type)
{
        int cnt, init_needed = 0;
        struct dquot __rcu **dquots;
        struct dquot *got[MAXQUOTAS] = {};
        struct super_block *sb = inode->i_sb;
        qsize_t rsv;
        int ret = 0;

        if (!inode_quota_active(inode))
                return 0;

        dquots = i_dquot(inode);

        /* First get references to structures we might need. */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                struct kqid qid;
                kprojid_t projid;
                int rc;
                struct dquot *dquot;

                if (type != -1 && cnt != type)
                        continue;
                /*
                 * The i_dquot should have been initialized in most cases,
                 * we check it without locking here to avoid unnecessary
                 * dqget()/dqput() calls.
                 */
                if (dquots[cnt])
                        continue;

                if (!sb_has_quota_active(sb, cnt))
                        continue;

                init_needed = 1;

                /* Build the quota id for this type from the inode */
                switch (cnt) {
                case USRQUOTA:
                        qid = make_kqid_uid(inode->i_uid);
                        break;
                case GRPQUOTA:
                        qid = make_kqid_gid(inode->i_gid);
                        break;
                case PRJQUOTA:
                        rc = inode->i_sb->dq_op->get_projid(inode, &projid);
                        /* No project id available: skip project quota */
                        if (rc)
                                continue;
                        qid = make_kqid_projid(projid);
                        break;
                }
                dquot = dqget(sb, qid);
                if (IS_ERR(dquot)) {
                        /* We raced with somebody turning quotas off... */
                        if (PTR_ERR(dquot) != -ESRCH) {
                                ret = PTR_ERR(dquot);
                                goto out_put;
                        }
                        dquot = NULL;
                }
                got[cnt] = dquot;
        }

        /* All required i_dquot pointers have already been initialized */
        if (!init_needed)
                return 0;

        spin_lock(&dq_data_lock);
        if (IS_NOQUOTA(inode))
                goto out_lock;
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                /* Avoid races with quotaoff() */
                if (!sb_has_quota_active(sb, cnt))
                        continue;
                /* We could race with quotaon or dqget() could have failed */
                if (!got[cnt])
                        continue;
                /*
                 * Install our dquot only if the slot is still empty; if it
                 * is not, got[cnt] keeps the reference and it is dropped
                 * below.
                 */
                if (!dquots[cnt]) {
                        rcu_assign_pointer(dquots[cnt], got[cnt]);
                        got[cnt] = NULL;
                        /*
                         * Make quota reservation system happy if someone
                         * did a write before quota was turned on
                         */
                        rsv = inode_get_rsv_space(inode);
                        if (unlikely(rsv)) {
                                struct dquot *dquot = srcu_dereference_check(
                                        dquots[cnt], &dquot_srcu,
                                        lockdep_is_held(&dq_data_lock));

                                spin_lock(&inode->i_lock);
                                /* Get reservation again under proper lock */
                                rsv = __inode_get_rsv_space(inode);
                                spin_lock(&dquot->dq_dqb_lock);
                                dquot->dq_dqb.dqb_rsvspace += rsv;
                                spin_unlock(&dquot->dq_dqb_lock);
                                spin_unlock(&inode->i_lock);
                        }
                }
        }
out_lock:
        spin_unlock(&dq_data_lock);
out_put:
        /* Drop unused references */
        dqput_all(got);

        return ret;
}

/*
 * Set up the quota pointers of @inode for all quota types.
 *
 * Thin wrapper around __dquot_initialize(): a type argument of -1 makes it
 * process every quota type instead of a single one. Returns its result.
 */
int dquot_initialize(struct inode *inode)
{
        const int all_types = -1;

        return __dquot_initialize(inode, all_types);
}
EXPORT_SYMBOL(dquot_initialize);

bool dquot_initialize_needed(struct inode *inode)
{
        struct dquot __rcu **dquots;
        int i;

        if (!inode_quota_active(inode))
                return false;

        dquots = i_dquot(inode);
        for (i = 0; i < MAXQUOTAS; i++)
                if (!dquots[i] && sb_has_quota_active(inode->i_sb, i))
                        return true;
        return false;
}
EXPORT_SYMBOL(dquot_initialize_needed);

/*
 * Release all quotas referenced by inode.
 *
 * This function only be called on inode free or converting
 * a file to quota file, no other users for the i_dquot in
 * both cases, so we needn't call synchronize_srcu() after
 * clearing i_dquot.
 */
static void __dquot_drop(struct inode *inode)
{
        int cnt;
        struct dquot __rcu **dquots = i_dquot(inode);
        struct dquot *put[MAXQUOTAS];

        spin_lock(&dq_data_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                put[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu,
                                        lockdep_is_held(&dq_data_lock));
                rcu_assign_pointer(dquots[cnt], NULL);
        }
        spin_unlock(&dq_data_lock);
        dqput_all(put);
}

void dquot_drop(struct inode *inode)
{
        struct dquot __rcu * const *dquots;
        int cnt;

        if (IS_NOQUOTA(inode))
                return;

        /*
         * Test before calling to rule out calls from proc and such
         * where we are not allowed to block. Note that this is
         * actually reliable test even without the lock - the caller
         * must assure that nobody can come after the DQUOT_DROP and
         * add quota pointers back anyway.
         */
        dquots = i_dquot(inode);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (dquots[cnt])
                        break;
        }

        if (cnt < MAXQUOTAS)
                __dquot_drop(inode);
}
EXPORT_SYMBOL(dquot_drop);

/*
 * inode_reserved_space is managed internally by quota, and protected by
 * i_lock similar to i_blocks+i_bytes.
 */
static qsize_t *inode_reserved_space(struct inode * inode)
{
        /* Filesystem must explicitly define it's own method in order to use
         * quota reservation interface */
        BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
        return inode->i_sb->dq_op->get_reserved_space(inode);
}

/*
 * Read the reserved space of @inode without taking any lock.
 *
 * Returns 0 when the filesystem provides no ->get_reserved_space() method.
 * Callers visible in this file hold inode->i_lock around the call.
 */
static qsize_t __inode_get_rsv_space(struct inode *inode)
{
        if (inode->i_sb->dq_op->get_reserved_space)
                return *inode_reserved_space(inode);
        return 0;
}

/*
 * Read the reserved space of @inode under inode->i_lock.
 *
 * Returns 0, without touching the lock, when the filesystem provides no
 * ->get_reserved_space() method.
 */
static qsize_t inode_get_rsv_space(struct inode *inode)
{
        qsize_t rsv = 0;

        if (inode->i_sb->dq_op->get_reserved_space) {
                spin_lock(&inode->i_lock);
                rsv = __inode_get_rsv_space(inode);
                spin_unlock(&inode->i_lock);
        }
        return rsv;
}

/*
 * This function updates i_blocks+i_bytes fields and quota information
 * (together with appropriate checks).
 *
 * NOTE: We absolutely rely on the fact that caller dirties the inode
 * (usually helpers in quotaops.h care about this) and holds a handle for
 * the current transaction so that dquot write and inode write go into the
 * same transaction.
 */

/*
 * Charge @number bytes to @inode and to every dquot attached to it.
 *
 * When DQUOT_SPACE_RESERVE is set in @flags the bytes are accounted as
 * reserved space, otherwise as used space / inode bytes. If charging one of
 * the dquots fails, the charges already made to the preceding dquots are
 * backed out, the inode is left unchanged and the error returned by
 * dquot_add_space() is passed back.
 *
 * Returns 0 when quota is not active for the inode or when a reservation
 * succeeded; for a successful non-reserve allocation it returns the result
 * of mark_all_dquot_dirty().
 *
 * This operation can block, but only after everything is updated
 */
int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
{
        int cnt, ret = 0, index;
        struct dquot_warn warn[MAXQUOTAS];
        int reserve = flags & DQUOT_SPACE_RESERVE;
        struct dquot __rcu **dquots;
        struct dquot *dquot;

        /* No quota for this inode: only the inode's own counters change */
        if (!inode_quota_active(inode)) {
                if (reserve) {
                        spin_lock(&inode->i_lock);
                        *inode_reserved_space(inode) += number;
                        spin_unlock(&inode->i_lock);
                } else {
                        inode_add_bytes(inode, number);
                }
                goto out;
        }

        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warn[cnt].w_type = QUOTA_NL_NOWARN;

        dquots = i_dquot(inode);
        /* SRCU read section covers every dereference of dquots[] below */
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;
                /* Third argument is reserved space, second is used space */
                if (reserve) {
                        ret = dquot_add_space(dquot, 0, number, flags, &warn[cnt]);
                } else {
                        ret = dquot_add_space(dquot, number, 0, flags, &warn[cnt]);
                }
                if (ret) {
                        /* Back out changes we already did */
                        /* cnt is the failing type; walk back over earlier ones */
                        for (cnt--; cnt >= 0; cnt--) {
                                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                                if (!dquot)
                                        continue;
                                spin_lock(&dquot->dq_dqb_lock);
                                if (reserve)
                                        dquot_free_reserved_space(dquot, number);
                                else
                                        dquot_decr_space(dquot, number);
                                spin_unlock(&dquot->dq_dqb_lock);
                        }
                        spin_unlock(&inode->i_lock);
                        goto out_flush_warn;
                }
        }
        /* All dquots accepted the charge; now account it on the inode */
        if (reserve)
                *inode_reserved_space(inode) += number;
        else
                __inode_add_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        /* The reserve path skips marking the dquots dirty */
        if (reserve)
                goto out_flush_warn;
        ret = mark_all_dquot_dirty(dquots);
out_flush_warn:
        srcu_read_unlock(&dquot_srcu, index);
        /* Warnings are flushed only after i_lock and the SRCU section are released */
        flush_warnings(warn);
out:
        return ret;
}
EXPORT_SYMBOL(__dquot_alloc_space);

/*
 * Charge one inode to every dquot attached to @inode.
 *
 * Returns 0 when quota is not active for the inode. If dquot_add_inodes()
 * fails for some quota type, the charges made to the preceding types are
 * undone and that error is returned. Otherwise the result of
 * mark_all_dquot_dirty() is returned.
 *
 * This operation can block, but only after everything is updated
 */
int dquot_alloc_inode(struct inode *inode)
{
        struct dquot __rcu * const *dquots;
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot *dquot;
        int type, undo, srcu_idx;
        int ret = 0;

        if (!inode_quota_active(inode))
                return 0;

        for (type = 0; type < MAXQUOTAS; type++)
                warn[type].w_type = QUOTA_NL_NOWARN;

        dquots = i_dquot(inode);
        srcu_idx = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (type = 0; type < MAXQUOTAS; type++) {
                dquot = srcu_dereference(dquots[type], &dquot_srcu);
                if (!dquot)
                        continue;
                ret = dquot_add_inodes(dquot, 1, &warn[type]);
                if (!ret)
                        continue;

                /* Back out changes we already did */
                for (undo = type - 1; undo >= 0; undo--) {
                        dquot = srcu_dereference(dquots[undo], &dquot_srcu);
                        if (!dquot)
                                continue;
                        spin_lock(&dquot->dq_dqb_lock);
                        dquot_decr_inodes(dquot, 1);
                        spin_unlock(&dquot->dq_dqb_lock);
                }
                break;
        }
        spin_unlock(&inode->i_lock);

        /* Dquots are marked dirty only when every charge went through */
        if (!ret)
                ret = mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, srcu_idx);
        flush_warnings(warn);
        return ret;
}
EXPORT_SYMBOL(dquot_alloc_inode);

/*
 * Convert in-memory reserved quotas to real consumed quotas
 *
 * Moves @number bytes from the reserved counters to the used counters, both
 * in every dquot attached to @inode and in the inode itself. If a dquot has
 * less reserved space than @number, a one-time warning is issued and @number
 * is clamped to that dquot's reserved space for the rest of the function.
 */
void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
        struct dquot __rcu **dquots;
        struct dquot *dquot;
        int type, srcu_idx;

        /* Without active quota only the inode's own counters are moved */
        if (!inode_quota_active(inode)) {
                spin_lock(&inode->i_lock);
                *inode_reserved_space(inode) -= number;
                __inode_add_bytes(inode, number);
                spin_unlock(&inode->i_lock);
                return;
        }

        dquots = i_dquot(inode);
        srcu_idx = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);

        /* Claim reserved quotas to allocated quotas */
        for (type = 0; type < MAXQUOTAS; type++) {
                dquot = srcu_dereference(dquots[type], &dquot_srcu);
                if (!dquot)
                        continue;

                spin_lock(&dquot->dq_dqb_lock);
                if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number))
                        number = dquot->dq_dqb.dqb_rsvspace;
                dquot->dq_dqb.dqb_rsvspace -= number;
                dquot->dq_dqb.dqb_curspace += number;
                spin_unlock(&dquot->dq_dqb_lock);
        }

        /* Update inode bytes */
        *inode_reserved_space(inode) -= number;
        __inode_add_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, srcu_idx);
}
EXPORT_SYMBOL(dquot_claim_space_nodirty);

/*
 * Convert allocated space back to in-memory reserved quotas
 *
 * Moves @number bytes from the used counters to the reserved counters, both
 * in every dquot attached to @inode and in the inode itself. If a dquot has
 * less used space than @number, a one-time warning is issued and @number is
 * clamped to that dquot's used space for the rest of the function.
 */
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
{
        struct dquot __rcu **dquots;
        struct dquot *dquot;
        int type, srcu_idx;

        /* Without active quota only the inode's own counters are moved */
        if (!inode_quota_active(inode)) {
                spin_lock(&inode->i_lock);
                *inode_reserved_space(inode) += number;
                __inode_sub_bytes(inode, number);
                spin_unlock(&inode->i_lock);
                return;
        }

        dquots = i_dquot(inode);
        srcu_idx = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);

        /* Reclaim allocated quotas back to reserved quotas */
        for (type = 0; type < MAXQUOTAS; type++) {
                dquot = srcu_dereference(dquots[type], &dquot_srcu);
                if (!dquot)
                        continue;

                spin_lock(&dquot->dq_dqb_lock);
                if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
                        number = dquot->dq_dqb.dqb_curspace;
                dquot->dq_dqb.dqb_curspace -= number;
                dquot->dq_dqb.dqb_rsvspace += number;
                spin_unlock(&dquot->dq_dqb_lock);
        }

        /* Update inode bytes */
        *inode_reserved_space(inode) += number;
        __inode_sub_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, srcu_idx);
}
EXPORT_SYMBOL(dquot_reclaim_space_nodirty);

/*
 * Release @number bytes from @inode and from every dquot attached to it.
 *
 * With DQUOT_SPACE_RESERVE set in @flags the bytes are taken out of the
 * reserved-space counters; otherwise they are taken out of the inode's byte
 * count and the dquots' used space, and the dquots are marked dirty.
 *
 * This operation can block, but only after everything is updated
 */
void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
{
        const int reserve = flags & DQUOT_SPACE_RESERVE;
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot __rcu **dquots;
        struct dquot *dquot;
        unsigned int type;
        int srcu_idx, wtype;

        /* No quota for this inode: only the inode's own counters change */
        if (!inode_quota_active(inode)) {
                if (!reserve) {
                        inode_sub_bytes(inode, number);
                        return;
                }
                spin_lock(&inode->i_lock);
                *inode_reserved_space(inode) -= number;
                spin_unlock(&inode->i_lock);
                return;
        }

        dquots = i_dquot(inode);
        srcu_idx = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (type = 0; type < MAXQUOTAS; type++) {
                warn[type].w_type = QUOTA_NL_NOWARN;
                dquot = srcu_dereference(dquots[type], &dquot_srcu);
                if (!dquot)
                        continue;

                spin_lock(&dquot->dq_dqb_lock);
                /* Work out the warning before the usage is decreased */
                wtype = info_bdq_free(dquot, number);
                if (wtype != QUOTA_NL_NOWARN)
                        prepare_warning(&warn[type], dquot, wtype);
                if (reserve)
                        dquot_free_reserved_space(dquot, number);
                else
                        dquot_decr_space(dquot, number);
                spin_unlock(&dquot->dq_dqb_lock);
        }
        if (reserve)
                *inode_reserved_space(inode) -= number;
        else
                __inode_sub_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        /* The reserve path skips marking the dquots dirty */
        if (!reserve)
                mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, srcu_idx);
        flush_warnings(warn);
}
EXPORT_SYMBOL(__dquot_free_space);

/*
 * Release one inode from every dquot attached to @inode.
 *
 * Does nothing when quota is not active for the inode. Otherwise each
 * attached dquot has its inode count decreased by one, the dquots are
 * marked dirty and any prepared warnings are flushed.
 *
 * This operation can block, but only after everything is updated
 */
void dquot_free_inode(struct inode *inode)
{
        struct dquot __rcu * const *dquots;
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot *dquot;
        unsigned int type;
        int srcu_idx, wtype;

        if (!inode_quota_active(inode))
                return;

        dquots = i_dquot(inode);
        srcu_idx = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (type = 0; type < MAXQUOTAS; type++) {
                warn[type].w_type = QUOTA_NL_NOWARN;
                dquot = srcu_dereference(dquots[type], &dquot_srcu);
                if (!dquot)
                        continue;

                spin_lock(&dquot->dq_dqb_lock);
                /* Work out the warning before the usage is decreased */
                wtype = info_idq_free(dquot, 1);
                if (wtype != QUOTA_NL_NOWARN)
                        prepare_warning(&warn[type], dquot, wtype);
                dquot_decr_inodes(dquot, 1);
                spin_unlock(&dquot->dq_dqb_lock);
        }
        spin_unlock(&inode->i_lock);

        mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, srcu_idx);
        flush_warnings(warn);
}
EXPORT_SYMBOL(dquot_free_inode);

/*
 * Transfer the number of inode and blocks from one diskquota to an other.
 * On success, dquot references in transfer_to are consumed and references
 * to original dquots that need to be released are placed there. On failure,
 * references are kept untouched.
 *
 * This operation can block, but only after everything is updated
 * A transaction must be started when entering this function.
 *
 * We are holding reference on transfer_from & transfer_to, no need to
 * protect them by srcu_read_lock().
 *
 * @inode:       inode whose usage is moved
 * @transfer_to: array of MAXQUOTAS target dquots; a NULL entry means that
 *               quota type is skipped
 *
 * Returns 0 for IS_NOQUOTA() inodes and on plain success. Otherwise returns
 * the error from ->get_inode_usage(), from dquot_add_inodes() or
 * dquot_add_space() (after backing out), or a negative value reported by
 * mark_all_dquot_dirty().
 */
int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
{
        qsize_t cur_space;
        qsize_t rsv_space = 0;
        qsize_t inode_usage = 1;
        struct dquot __rcu **dquots;
        struct dquot *transfer_from[MAXQUOTAS] = {};
        int cnt, index, ret = 0, err;
        /* is_valid[cnt] marks the quota types actually being transferred */
        char is_valid[MAXQUOTAS] = {};
        struct dquot_warn warn_to[MAXQUOTAS];
        struct dquot_warn warn_from_inodes[MAXQUOTAS];
        struct dquot_warn warn_from_space[MAXQUOTAS];

        if (IS_NOQUOTA(inode))
                return 0;

        /* inode_usage stays 1 unless the filesystem supplies its own value */
        if (inode->i_sb->dq_op->get_inode_usage) {
                ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage);
                if (ret)
                        return ret;
        }

        /* Initialize the arrays */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                warn_to[cnt].w_type = QUOTA_NL_NOWARN;
                warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
                warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
        }

        /* Lock order: dq_data_lock first, then the inode's i_lock */
        spin_lock(&dq_data_lock);
        spin_lock(&inode->i_lock);
        /* Re-check under the locks; the first test above was lockless */
        if (IS_NOQUOTA(inode)) {        /* File without quota accounting? */
                spin_unlock(&inode->i_lock);
                spin_unlock(&dq_data_lock);
                return 0;
        }
        cur_space = __inode_get_bytes(inode);
        rsv_space = __inode_get_rsv_space(inode);
        dquots = i_dquot(inode);
        /*
         * Build the transfer_from list, check limits, and update usage in
         * the target structures.
         */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                /*
                 * Skip changes for same uid or gid or for turned off quota-type.
                 */
                if (!transfer_to[cnt])
                        continue;
                /* Avoid races with quotaoff() */
                if (!sb_has_quota_active(inode->i_sb, cnt))
                        continue;
                is_valid[cnt] = 1;
                transfer_from[cnt] = srcu_dereference_check(dquots[cnt],
                                &dquot_srcu, lockdep_is_held(&dq_data_lock));
                ret = dquot_add_inodes(transfer_to[cnt], inode_usage,
                                       &warn_to[cnt]);
                if (ret)
                        goto over_quota;
                ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space,
                                      DQUOT_SPACE_WARN, &warn_to[cnt]);
                if (ret) {
                        /* Undo this type's inode charge before the common back-out */
                        spin_lock(&transfer_to[cnt]->dq_dqb_lock);
                        dquot_decr_inodes(transfer_to[cnt], inode_usage);
                        spin_unlock(&transfer_to[cnt]->dq_dqb_lock);
                        goto over_quota;
                }
        }

        /* Decrease usage for source structures and update quota pointers */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (!is_valid[cnt])
                        continue;
                /* Due to IO error we might not have transfer_from[] structure */
                if (transfer_from[cnt]) {
                        int wtype;

                        spin_lock(&transfer_from[cnt]->dq_dqb_lock);
                        /* Warnings are computed before the usage is decreased */
                        wtype = info_idq_free(transfer_from[cnt], inode_usage);
                        if (wtype != QUOTA_NL_NOWARN)
                                prepare_warning(&warn_from_inodes[cnt],
                                                transfer_from[cnt], wtype);
                        wtype = info_bdq_free(transfer_from[cnt],
                                              cur_space + rsv_space);
                        if (wtype != QUOTA_NL_NOWARN)
                                prepare_warning(&warn_from_space[cnt],
                                                transfer_from[cnt], wtype);
                        dquot_decr_inodes(transfer_from[cnt], inode_usage);
                        dquot_decr_space(transfer_from[cnt], cur_space);
                        dquot_free_reserved_space(transfer_from[cnt],
                                                  rsv_space);
                        spin_unlock(&transfer_from[cnt]->dq_dqb_lock);
                }
                /* The inode now points at the target dquot for this type */
                rcu_assign_pointer(dquots[cnt], transfer_to[cnt]);
        }
        spin_unlock(&inode->i_lock);
        spin_unlock(&dq_data_lock);

        /*
         * These arrays are local and we hold dquot references so we don't need
         * the srcu protection but still take dquot_srcu to avoid warning in
         * mark_all_dquot_dirty().
         */
        index = srcu_read_lock(&dquot_srcu);
        err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_from);
        if (err < 0)
                ret = err;
        err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_to);
        if (err < 0)
                ret = err;
        srcu_read_unlock(&dquot_srcu, index);

        flush_warnings(warn_to);
        flush_warnings(warn_from_inodes);
        flush_warnings(warn_from_space);
        /* Pass back references to put */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (is_valid[cnt])
                        transfer_to[cnt] = transfer_from[cnt];
        return ret;
over_quota:
        /* Back out changes we already did */
        /*
         * cnt is the type that failed. Only the types before it are backed
         * out here; an inode charge made for the failing type itself was
         * already undone at the failure site above.
         */
        for (cnt--; cnt >= 0; cnt--) {
                if (!is_valid[cnt])
                        continue;
                spin_lock(&transfer_to[cnt]->dq_dqb_lock);
                dquot_decr_inodes(transfer_to[cnt], inode_usage);
                dquot_decr_space(transfer_to[cnt], cur_space);
                dquot_free_reserved_space(transfer_to[cnt], rsv_space);
                spin_unlock(&transfer_to[cnt]->dq_dqb_lock);
        }
        spin_unlock(&inode->i_lock);
        spin_unlock(&dq_data_lock);
        /* Only the target-side warnings can have been prepared on this path */
        flush_warnings(warn_to);
        return ret;
}
EXPORT_SYMBOL(__dquot_transfer);

/*
 * Wrapper for transferring ownership of an inode for uid/gid only.
 * Called from FSXXX_setattr()
 *
 * Looks up the dquots of the new owner and/or group named in @iattr and
 * hands them to __dquot_transfer(). A dqget() failure of -ESRCH is tolerated
 * and leaves that quota type out of the transfer; any other dqget() error is
 * returned. Whatever references remain in the local array afterwards are
 * dropped.
 */
int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode,
                   struct iattr *iattr)
{
        struct dquot *transfer_to[MAXQUOTAS] = {};
        struct super_block *sb = inode->i_sb;
        struct dquot *target;
        int ret;

        if (!inode_quota_active(inode))
                return 0;

        if (i_uid_needs_update(idmap, iattr, inode)) {
                kuid_t kuid = from_vfsuid(idmap, i_user_ns(inode),
                                          iattr->ia_vfsuid);

                target = dqget(sb, make_kqid_uid(kuid));
                if (IS_ERR(target)) {
                        ret = PTR_ERR(target);
                        if (ret != -ESRCH)
                                goto out_put;
                        target = NULL;
                }
                transfer_to[USRQUOTA] = target;
        }

        if (i_gid_needs_update(idmap, iattr, inode)) {
                kgid_t kgid = from_vfsgid(idmap, i_user_ns(inode),
                                          iattr->ia_vfsgid);

                target = dqget(sb, make_kqid_gid(kgid));
                if (IS_ERR(target)) {
                        ret = PTR_ERR(target);
                        if (ret != -ESRCH)
                                goto out_put;
                        target = NULL;
                }
                transfer_to[GRPQUOTA] = target;
        }

        ret = __dquot_transfer(inode, transfer_to);
out_put:
        dqput_all(transfer_to);
        return ret;
}
EXPORT_SYMBOL(dquot_transfer);

/*
 * Write info of quota file to disk
 */
int dquot_commit_info(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);

        return dqopt->ops[type]->write_file_info(sb, type);
}
EXPORT_SYMBOL(dquot_commit_info);

/*
 * Ask the quota format of qid->type for the next id via its ->get_next_id()
 * method.
 *
 * Returns -ESRCH when that quota type is not active on @sb, -ENOSYS when the
 * format has no ->get_next_id() method, otherwise the method's result.
 */
int dquot_get_next_id(struct super_block *sb, struct kqid *qid)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        int type = qid->type;

        if (!sb_has_quota_active(sb, type))
                return -ESRCH;
        if (dqopt->ops[type]->get_next_id)
                return dqopt->ops[type]->get_next_id(sb, qid);
        return -ENOSYS;
}
EXPORT_SYMBOL(dquot_get_next_id);

/*
 * Definitions of diskquota operations.
 *
 * Operations table wiring each method to the generic dquot_* implementation
 * of the same purpose.
 */
const struct dquot_operations dquot_operations = {
        .alloc_dquot    = dquot_alloc,
        .destroy_dquot  = dquot_destroy,
        .acquire_dquot  = dquot_acquire,
        .release_dquot  = dquot_release,
        .mark_dirty     = dquot_mark_dquot_dirty,
        .write_dquot    = dquot_commit,
        .write_info     = dquot_commit_info,
        .get_next_id    = dquot_get_next_id,
};
EXPORT_SYMBOL(dquot_operations);

/*
 * Generic helper for ->open on filesystems supporting disk quotas.
 */
int dquot_file_open(struct inode *inode, struct file *file)
{
        int error;

        error = generic_file_open(inode, file);
        if (!error && (file->f_mode & FMODE_WRITE))
                error = dquot_initialize(inode);
        return error;
}
EXPORT_SYMBOL(dquot_file_open);

/*
 * Detach the quota file inode recorded for @type on @sb and drop the
 * reference held on it. Does nothing when no inode is recorded.
 *
 * Unless DQUOT_QUOTA_SYS_FILE is set in the quota flags, S_NOQUOTA is
 * cleared from the inode under its inode lock first.
 */
static void vfs_cleanup_quota_inode(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        struct inode *qf_inode = dqopt->files[type];

        if (qf_inode == NULL)
                return;

        if ((dqopt->flags & DQUOT_QUOTA_SYS_FILE) == 0) {
                inode_lock(qf_inode);
                qf_inode->i_flags &= ~S_NOQUOTA;
                inode_unlock(qf_inode);
        }

        /* Forget the inode before the reference is released */
        dqopt->files[type] = NULL;
        iput(qf_inode);
}

/*
 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
 *
 * @sb:    superblock to operate on
 * @type:  quota type to act on, or -1 for every type
 * @flags: DQUOT_USAGE_ENABLED / DQUOT_LIMITS_ENABLED bits naming the state
 *         to turn off, or DQUOT_SUSPENDED to suspend instead
 *
 * Returns -EINVAL for an unsupported combination of @flags, otherwise 0.
 */
int dquot_disable(struct super_block *sb, int type, unsigned int flags)
{
        int cnt;
        struct quota_info *dqopt = sb_dqopt(sb);

        /* s_umount should be held in exclusive mode */
        /*
         * If the read trylock succeeds nobody holds s_umount exclusively:
         * warn once and release the read lock that was just taken.
         */
        if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
                up_read(&sb->s_umount);

        /* Cannot turn off usage accounting without turning off limits, or
         * suspend quotas and simultaneously turn quotas off. */
        if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
            || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
            DQUOT_USAGE_ENABLED)))
                return -EINVAL;

        /*
         * Skip everything if there's nothing to do. We have to do this because
         * sometimes we are called when fill_super() failed and calling
         * sync_fs() in such cases does no good.
         */
        if (!sb_any_quota_loaded(sb))
                return 0;

        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                if (!sb_has_quota_loaded(sb, cnt))
                        continue;

                /* State flags are only changed under dq_state_lock */
                if (flags & DQUOT_SUSPENDED) {
                        spin_lock(&dq_state_lock);
                        dqopt->flags |=
                                dquot_state_flag(DQUOT_SUSPENDED, cnt);
                        spin_unlock(&dq_state_lock);
                } else {
                        spin_lock(&dq_state_lock);
                        dqopt->flags &= ~dquot_state_flag(flags, cnt);
                        /* Turning off suspended quotas? */
                        if (!sb_has_quota_loaded(sb, cnt) &&
                            sb_has_quota_suspended(sb, cnt)) {
                                dqopt->flags &=        ~dquot_state_flag(
                                                        DQUOT_SUSPENDED, cnt);
                                spin_unlock(&dq_state_lock);
                                /* Only the quota inode is cleaned up on this path */
                                vfs_cleanup_quota_inode(sb, cnt);
                                continue;
                        }
                        spin_unlock(&dq_state_lock);
                }

                /* We still have to keep quota loaded? */
                if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
                        continue;

                /* Note: these are blocking operations */
                drop_dquot_ref(sb, cnt);
                invalidate_dquots(sb, cnt);
                /*
                 * Now all dquots should be invalidated, all writes done so we
                 * should be only users of the info. No locks needed.
                 */
                if (info_dirty(&dqopt->info[cnt]))
                        sb->dq_op->write_info(sb, cnt);
                /* ->free_file_info() is optional for a quota format */
                if (dqopt->ops[cnt]->free_file_info)
                        dqopt->ops[cnt]->free_file_info(sb, cnt);
                put_quota_format(dqopt->info[cnt].dqi_format);
                /* Reset the per-type info and detach the format operations */
                dqopt->info[cnt].dqi_flags = 0;
                dqopt->info[cnt].dqi_igrace = 0;
                dqopt->info[cnt].dqi_bgrace = 0;
                dqopt->ops[cnt] = NULL;
        }

        /* Skip syncing and setting flags if quota files are hidden */
        if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
                goto put_inodes;

        /* Sync the superblock so that buffers with quota data are written to
         * disk (and so userspace sees correct data afterwards). */
        if (sb->s_op->sync_fs)
                sb->s_op->sync_fs(sb, 1);
        sync_blockdev(sb->s_bdev);
        /* Now the quota files are just ordinary files and we can set the
         * inode flags back. Moreover we discard the pagecache so that
         * userspace sees the writes we did bypassing the pagecache. We
         * must also discard the blockdev buffers so that we see the
         * changes done by userspace on the next quotaon() */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) {
                        inode_lock(dqopt->files[cnt]);
                        truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
                        inode_unlock(dqopt->files[cnt]);
                }
        if (sb->s_bdev)
                invalidate_bdev(sb->s_bdev);
put_inodes:
        /* We are done when suspending quotas */
        if (flags & DQUOT_SUSPENDED)
                return 0;

        /* Release the quota inode of every type that is no longer loaded */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (!sb_has_quota_loaded(sb, cnt))
                        vfs_cleanup_quota_inode(sb, cnt);
        return 0;
}
EXPORT_SYMBOL(dquot_disable);

/*
 * Turn off both usage accounting and limit enforcement for @type on @sb by
 * calling dquot_disable() with both state flags. Returns its result.
 */
int dquot_quota_off(struct super_block *sb, int type)
{
        const unsigned int off_flags = DQUOT_USAGE_ENABLED |
                                       DQUOT_LIMITS_ENABLED;

        return dquot_disable(sb, type, off_flags);
}
EXPORT_SYMBOL(dquot_quota_off);

/*
 *        Turn quotas on on a device
 */

static int vfs_setup_quota_inode(struct inode *inode, int type)
{
        struct super_block *sb = inode->i_sb;
        struct quota_info *dqopt = sb_dqopt(sb);

        if (is_bad_inode(inode))
                return -EUCLEAN;
        if (!S_ISREG(inode->i_mode))
                return -EACCES;
        if (IS_RDONLY(inode))
                return -EROFS;
        if (sb_has_quota_loaded(sb, type))
                return -EBUSY;

        /*
         * Quota files should never be encrypted.  They should be thought of as
         * filesystem metadata, not user data.  New-style internal quota files
         * cannot be encrypted by users anyway, but old-style external quota
         * files could potentially be incorrectly created in an encrypted
         * directory, hence this explicit check.  Some reasons why encrypted
         * quota files don't work include: (1) some filesystems that support
         * encryption don't handle it in their quota_read and quota_write, and
         * (2) cleaning up encrypted quota files at unmount would need special
         * consideration, as quota files are cleaned up later than user files.
         */
        if (IS_ENCRYPTED(inode))
                return -EINVAL;

        dqopt->files[type] = igrab(inode);
        if (!dqopt->files[type])
                return -EIO;
        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
                /* We don't want quota and atime on quota files (deadlocks
                 * possible) Also nobody should write to the file - we use
                 * special IO operations which ignore the immutable bit. */
                inode_lock(inode);
                inode->i_flags |= S_NOQUOTA;
                inode_unlock(inode);
                /*
                 * When S_NOQUOTA is set, remove dquot references as no more
                 * references can be added
                 */
                __dquot_drop(inode);
        }
        return 0;
}

/*
 * Load quota of the given @type on @sb using quota format @format_id and
 * switch on the state requested in @flags.
 *
 * The caller must hold sb->s_umount for writing (asserted via lockdep).
 *
 * Returns 0 on success, or:
 *   -EINVAL  @flags contains DQUOT_SUSPENDED, the superblock lacks quota
 *            operations (or get_projid for PRJQUOTA), the filesystem is not
 *            in init_user_ns, DQUOT_USAGE_ENABLED is missing from @flags, or
 *            the format rejects the quota file
 *   -ESRCH   no quota format with @format_id was found
 *   -EBUSY   quota of this type is already loaded
 *   or the error returned by the format's read_file_info() or by
 *   add_dquot_ref().
 *
 * The format obtained from find_quota_format() is handed back with
 * put_quota_format() on every failure that happens before the quota state
 * flags are set.
 */
int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
        unsigned int flags)
{
        struct quota_format_type *fmt;
        struct quota_info *dqopt = sb_dqopt(sb);
        int error;

        lockdep_assert_held_write(&sb->s_umount);

        /*
         * Just unsuspend quotas?  That is not supported here.  Bail out
         * before the quota format is looked up so that this early return
         * does not leave a format reference behind.
         */
        if (WARN_ON_ONCE(flags & DQUOT_SUSPENDED))
                return -EINVAL;

        fmt = find_quota_format(format_id);
        if (!fmt)
                return -ESRCH;
        if (!sb->dq_op || !sb->s_qcop ||
            (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) {
                error = -EINVAL;
                goto out_fmt;
        }
        /* Filesystems outside of init_user_ns not yet supported */
        if (sb->s_user_ns != &init_user_ns) {
                error = -EINVAL;
                goto out_fmt;
        }
        /* Usage always has to be set... */
        if (!(flags & DQUOT_USAGE_ENABLED)) {
                error = -EINVAL;
                goto out_fmt;
        }
        if (sb_has_quota_loaded(sb, type)) {
                error = -EBUSY;
                goto out_fmt;
        }

        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
                /* As we bypass the pagecache we must now flush all the
                 * dirty data and invalidate caches so that kernel sees
                 * changes from userspace. It is not enough to just flush
                 * the quota file since if blocksize < pagesize, invalidation
                 * of the cache could fail because of other unrelated dirty
                 * data */
                sync_filesystem(sb);
                invalidate_bdev(sb->s_bdev);
        }

        error = -EINVAL;
        if (!fmt->qf_ops->check_quota_file(sb, type))
                goto out_fmt;

        /* Publish the format for this type, then read its on-disk info. */
        dqopt->ops[type] = fmt->qf_ops;
        dqopt->info[type].dqi_format = fmt;
        dqopt->info[type].dqi_fmt_id = format_id;
        INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
        error = dqopt->ops[type]->read_file_info(sb, type);
        if (error < 0)
                goto out_fmt;
        if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) {
                spin_lock(&dq_data_lock);
                dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
                spin_unlock(&dq_data_lock);
        }
        spin_lock(&dq_state_lock);
        dqopt->flags |= dquot_state_flag(flags, type);
        spin_unlock(&dq_state_lock);

        /*
         * From here on the quota type is marked enabled, so a failure is
         * undone through dquot_disable() rather than through out_fmt.
         */
        error = add_dquot_ref(sb, type);
        if (error)
                dquot_disable(sb, type,
                              DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);

        return error;
out_fmt:
        put_quota_format(fmt);

        return error;
}
EXPORT_SYMBOL(dquot_load_quota_sb);

/*
 * More powerful function for turning on quotas on given quota inode allowing
 * setting of individual quota flags.
 *
 * Sets up @inode as the quota file for @type and then loads quota on its
 * superblock.  If loading fails, the quota inode setup is undone again.
 * Returns 0 on success or the negative error of the failing step.
 */
int dquot_load_quota_inode(struct inode *inode, int type, int format_id,
        unsigned int flags)
{
        int rc = vfs_setup_quota_inode(inode, type);

        if (rc < 0)
                return rc;

        rc = dquot_load_quota_sb(inode->i_sb, type, format_id, flags);
        if (rc >= 0)
                return rc;

        /* Loading failed: detach the quota inode again. */
        vfs_cleanup_quota_inode(inode->i_sb, type);
        return rc;
}
EXPORT_SYMBOL(dquot_load_quota_inode);

/*
 * Reenable quotas on remount RW.
 *
 * For every quota type (or only @type when it is not -1) that is currently
 * suspended, the recorded usage/limits state is taken out of the superblock
 * quota flags and quota is loaded again with that state.  Returns the result
 * of the last load attempt, or 0 when nothing had to be resumed.
 */
int dquot_resume(struct super_block *sb, int type)
{
        struct quota_info *qinfo = sb_dqopt(sb);
        unsigned int state;
        int qtype;
        int rc = 0;

        /* s_umount should be held in exclusive mode */
        if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
                up_read(&sb->s_umount);

        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
                if ((type != -1 && qtype != type) ||
                    !sb_has_quota_suspended(sb, qtype))
                        continue;

                /* Remember what was enabled, then wipe this type's state. */
                spin_lock(&dq_state_lock);
                state = qinfo->flags &
                        dquot_state_flag(DQUOT_USAGE_ENABLED |
                                         DQUOT_LIMITS_ENABLED, qtype);
                qinfo->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, qtype);
                spin_unlock(&dq_state_lock);

                state = dquot_generic_flag(state, qtype);
                rc = dquot_load_quota_sb(sb, qtype,
                                         qinfo->info[qtype].dqi_fmt_id,
                                         state);
                if (rc < 0)
                        vfs_cleanup_quota_inode(sb, qtype);
        }

        return rc;
}
EXPORT_SYMBOL(dquot_resume);

int dquot_quota_on(struct super_block *sb, int type, int format_id,
                   const struct path *path)
{
        int error = security_quota_on(path->dentry);
        if (error)
                return error;
        /* Quota file not on the same filesystem? */
        if (path->dentry->d_sb != sb)
                error = -EXDEV;
        else
                error = dquot_load_quota_inode(d_inode(path->dentry), type,
                                             format_id, DQUOT_USAGE_ENABLED |
                                             DQUOT_LIMITS_ENABLED);
        return error;
}
EXPORT_SYMBOL(dquot_quota_on);

/*
 * This function is used when filesystem needs to initialize quotas
 * during mount time.
 *
 * Looks up @qf_name below the root of @sb and, if the security hook allows
 * it, loads quota @type from that file with usage accounting and limits
 * enabled.  Returns 0 on success or a negative error from the lookup, the
 * security hook or the quota load.
 */
int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
                int format_id, int type)
{
        struct dentry *qf_dentry;
        int rc;

        qf_dentry = lookup_positive_unlocked(qf_name, sb->s_root,
                                             strlen(qf_name));
        if (IS_ERR(qf_dentry))
                return PTR_ERR(qf_dentry);

        rc = security_quota_on(qf_dentry);
        if (rc)
                goto out_put;

        rc = dquot_load_quota_inode(d_inode(qf_dentry), type, format_id,
                                    DQUOT_USAGE_ENABLED |
                                    DQUOT_LIMITS_ENABLED);
out_put:
        /* The dentry reference from the lookup is dropped in all cases. */
        dput(qf_dentry);
        return rc;
}
EXPORT_SYMBOL(dquot_quota_on_mount);

/*
 * Switch on limit enforcement for the quota types selected in @flags.
 *
 * Only available when quota lives in system files (-ENOSYS otherwise).
 * Accounting bits in @flags are ignored; if nothing remains, -EINVAL is
 * returned.  A type whose usage accounting is not enabled yields -EINVAL, a
 * type whose limits are already enforced yields -EEXIST.  On failure the
 * enforcement already switched on by this call is switched off again.
 */
static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
{
        struct quota_info *qinfo = sb_dqopt(sb);
        int qtype;
        int err = 0;

        if (!(qinfo->flags & DQUOT_QUOTA_SYS_FILE))
                return -ENOSYS;
        /* Accounting cannot be turned on while fs is mounted */
        flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT);
        if (!flags)
                return -EINVAL;

        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
                if (!(flags & qtype_enforce_flag(qtype)))
                        continue;
                /* Can't enforce without accounting */
                if (!sb_has_quota_usage_enabled(sb, qtype)) {
                        err = -EINVAL;
                        break;
                }
                /*
                 * Already enforced.  Reported as -EEXIST rather than -EBUSY
                 * for better compatibility with XFS.
                 */
                if (sb_has_quota_limits_enabled(sb, qtype)) {
                        err = -EEXIST;
                        break;
                }
                spin_lock(&dq_state_lock);
                qinfo->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, qtype);
                spin_unlock(&dq_state_lock);
        }
        if (!err)
                return 0;

        /* Backout enforcement enablement we already did */
        while (--qtype >= 0) {
                if (flags & qtype_enforce_flag(qtype))
                        dquot_disable(sb, qtype, DQUOT_LIMITS_ENABLED);
        }
        return err;
}

/*
 * Switch off limit enforcement for the quota types selected in @flags.
 *
 * Only available when quota lives in system files (-ENOSYS otherwise).
 * Requests to turn off accounting are refused with -EOPNOTSUPP.  Types whose
 * limits are not enforced are dropped from the request; if none remain,
 * -EEXIST is returned.  If disabling one type fails, enforcement is switched
 * back on for the types this call had already handled and the error is
 * returned.
 */
static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
{
        struct quota_info *qinfo = sb_dqopt(sb);
        int qtype;
        int err = 0;

        if (!(qinfo->flags & DQUOT_QUOTA_SYS_FILE))
                return -ENOSYS;
        /*
         * We don't support turning off accounting via quotactl. In principle
         * quota infrastructure can do this but filesystems don't expect
         * userspace to be able to do it.
         */
        if (flags &
            (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT))
                return -EOPNOTSUPP;

        /* Filter out limits not enabled */
        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
                if (sb_has_quota_limits_enabled(sb, qtype))
                        continue;
                flags &= ~qtype_enforce_flag(qtype);
        }
        /* Nothing left? */
        if (!flags)
                return -EEXIST;

        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
                if (!(flags & qtype_enforce_flag(qtype)))
                        continue;
                err = dquot_disable(sb, qtype, DQUOT_LIMITS_ENABLED);
                if (err < 0)
                        break;
        }
        /* The loop only stops early on failure. */
        if (qtype == MAXQUOTAS)
                return 0;

        /* Backout enforcement disabling we already did */
        while (--qtype >= 0) {
                if (!(flags & qtype_enforce_flag(qtype)))
                        continue;
                spin_lock(&dq_state_lock);
                qinfo->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, qtype);
                spin_unlock(&dq_state_lock);
        }
        return err;
}

/*
 * Generic routine for getting common part of quota structure.
 *
 * Zeroes @di and then copies limits, usage and grace times out of @dquot
 * while holding its dq_dqb_lock.  The reported space is the sum of current
 * and reserved space.
 */
static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di)
{
        struct mem_dqblk *src = &dquot->dq_dqb;

        memset(di, 0, sizeof(*di));

        spin_lock(&dquot->dq_dqb_lock);
        /* Space: limits, usage (including reservations), grace time. */
        di->d_spc_hardlimit = src->dqb_bhardlimit;
        di->d_spc_softlimit = src->dqb_bsoftlimit;
        di->d_space = src->dqb_curspace + src->dqb_rsvspace;
        di->d_spc_timer = src->dqb_btime;
        /* Inodes: limits, usage, grace time. */
        di->d_ino_hardlimit = src->dqb_ihardlimit;
        di->d_ino_softlimit = src->dqb_isoftlimit;
        di->d_ino_count = src->dqb_curinodes;
        di->d_ino_timer = src->dqb_itime;
        spin_unlock(&dquot->dq_dqb_lock);
}

/*
 * Fill @di with the quota information of the owner identified by @qid on
 * @sb.  Returns 0 on success or the error encoded in the pointer returned by
 * dqget().
 */
int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
                    struct qc_dqblk *di)
{
        struct dquot *dq = dqget(sb, qid);

        if (IS_ERR(dq))
                return PTR_ERR(dq);

        do_get_dqblk(dq, di);
        dqput(dq);
        return 0;
}
EXPORT_SYMBOL(dquot_get_dqblk);

/*
 * Ask the filesystem's get_next_id operation to update *@qid and fill @di
 * with the quota information of the resulting id.
 *
 * Returns -ENOSYS when the filesystem has no get_next_id operation, the
 * negative error of get_next_id or dqget() on failure, and 0 on success.
 */
int dquot_get_next_dqblk(struct super_block *sb, struct kqid *qid,
                         struct qc_dqblk *di)
{
        struct dquot *dq;
        int rc;

        if (!sb->dq_op->get_next_id)
                return -ENOSYS;

        rc = sb->dq_op->get_next_id(sb, qid);
        if (rc < 0)
                return rc;

        dq = dqget(sb, *qid);
        if (IS_ERR(dq))
                return PTR_ERR(dq);

        do_get_dqblk(dq, di);
        dqput(dq);
        return 0;
}
EXPORT_SYMBOL(dquot_get_next_dqblk);

/* Fields of struct qc_dqblk that the generic set routine accepts. */
#define VFS_QC_MASK \
        (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | QC_SPC_TIMER | \
         QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | QC_INO_TIMER)

/*
 * Generic routine for setting common part of quota structure.
 *
 * Applies the fields selected by di->d_fieldmask to @dquot and marks the
 * dquot dirty.
 *
 * Returns -EINVAL when the field mask contains bits outside VFS_QC_MASK,
 * -ERANGE when a requested soft or hard limit exceeds the maximum recorded
 * in the quota info of this type, a negative error from mark_dquot_dirty(),
 * or 0 on success.
 */
static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
{
        struct mem_dqblk *dm = &dquot->dq_dqb;
        /* Set when space / inode state changed and grace must be rechecked. */
        int check_blim = 0, check_ilim = 0;
        struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
        int ret;

        if (di->d_fieldmask & ~VFS_QC_MASK)
                return -EINVAL;

        /* Only limits that are actually being set are range-checked. */
        if (((di->d_fieldmask & QC_SPC_SOFT) &&
             di->d_spc_softlimit > dqi->dqi_max_spc_limit) ||
            ((di->d_fieldmask & QC_SPC_HARD) &&
             di->d_spc_hardlimit > dqi->dqi_max_spc_limit) ||
            ((di->d_fieldmask & QC_INO_SOFT) &&
             (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) ||
            ((di->d_fieldmask & QC_INO_HARD) &&
             (di->d_ino_hardlimit > dqi->dqi_max_ino_limit)))
                return -ERANGE;

        spin_lock(&dquot->dq_dqb_lock);
        if (di->d_fieldmask & QC_SPACE) {
                /* d_space includes reserved space; store only the rest. */
                dm->dqb_curspace = di->d_space - dm->dqb_rsvspace;
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_SPC_SOFT)
                dm->dqb_bsoftlimit = di->d_spc_softlimit;
        if (di->d_fieldmask & QC_SPC_HARD)
                dm->dqb_bhardlimit = di->d_spc_hardlimit;
        if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) {
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_INO_COUNT) {
                dm->dqb_curinodes = di->d_ino_count;
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_INO_SOFT)
                dm->dqb_isoftlimit = di->d_ino_softlimit;
        if (di->d_fieldmask & QC_INO_HARD)
                dm->dqb_ihardlimit = di->d_ino_hardlimit;
        if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) {
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_SPC_TIMER) {
                dm->dqb_btime = di->d_spc_timer;
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_INO_TIMER) {
                dm->dqb_itime = di->d_ino_timer;
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
        }

        /*
         * Re-evaluate the space grace time: it is cleared when there is no
         * soft limit or usage (current + reserved) is within it, otherwise a
         * fresh grace period starts unless the caller supplied a timer.
         */
        if (check_blim) {
                if (!dm->dqb_bsoftlimit ||
                    dm->dqb_curspace + dm->dqb_rsvspace <= dm->dqb_bsoftlimit) {
                        dm->dqb_btime = 0;
                        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
                } else if (!(di->d_fieldmask & QC_SPC_TIMER))
                        /* Set grace only if user hasn't provided his own... */
                        dm->dqb_btime = ktime_get_real_seconds() + dqi->dqi_bgrace;
        }
        /* Same re-evaluation for the inode grace time. */
        if (check_ilim) {
                if (!dm->dqb_isoftlimit ||
                    dm->dqb_curinodes <= dm->dqb_isoftlimit) {
                        dm->dqb_itime = 0;
                        clear_bit(DQ_INODES_B, &dquot->dq_flags);
                } else if (!(di->d_fieldmask & QC_INO_TIMER))
                        /* Set grace only if user hasn't provided his own... */
                        dm->dqb_itime = ktime_get_real_seconds() + dqi->dqi_igrace;
        }
        /* DQ_FAKE_B is set exactly when all four limits are zero. */
        if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit ||
            dm->dqb_isoftlimit)
                clear_bit(DQ_FAKE_B, &dquot->dq_flags);
        else
                set_bit(DQ_FAKE_B, &dquot->dq_flags);
        spin_unlock(&dquot->dq_dqb_lock);
        /* Non-negative results of mark_dquot_dirty() are reported as 0. */
        ret = mark_dquot_dirty(dquot);
        if (ret < 0)
                return ret;
        return 0;
}

/*
 * Apply the settings in @di to the quota of the owner identified by @qid on
 * @sb.  Returns the error encoded in the pointer returned by dqget() if the
 * dquot cannot be obtained, otherwise the result of do_set_dqblk().
 */
int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
                  struct qc_dqblk *di)
{
        struct dquot *dq = dqget(sb, qid);
        int rc;

        if (IS_ERR(dq))
                return PTR_ERR(dq);

        rc = do_set_dqblk(dq, di);
        dqput(dq);
        return rc;
}
EXPORT_SYMBOL(dquot_set_dqblk);

/* Generic routine for getting common part of quota file information */
int dquot_get_state(struct super_block *sb, struct qc_state *state)
{
        struct mem_dqinfo *mi;
        struct qc_type_state *tstate;
        struct quota_info *dqopt = sb_dqopt(sb);
        int type;

        memset(state, 0, sizeof(*state));
        for (type = 0; type < MAXQUOTAS; type++) {
                if (!sb_has_quota_active(sb, type))
                        continue;
                tstate = state->s_state + type;
                mi = sb_dqopt(sb)->info + type;
                tstate->flags = QCI_ACCT_ENABLED;
                spin_lock(&dq_data_lock);
                if (mi->dqi_flags & DQF_SYS_FILE)
                        tstate->flags |= QCI_SYSFILE;
                if (mi->dqi_flags & DQF_ROOT_SQUASH)
                        tstate->flags |= QCI_ROOT_SQUASH;
                if (sb_has_quota_limits_enabled(sb, type))
                        tstate->flags |= QCI_LIMITS_ENFORCED;
                tstate->spc_timelimit = mi->dqi_bgrace;
                tstate->ino_timelimit = mi->dqi_igrace;
                if (dqopt->files[type]) {
                        tstate->ino = dqopt->files[type]->i_ino;
                        tstate->blocks = dqopt->files[type]->i_blocks;
                }
                tstate->nextents = 1;        /* We don't know... */
                spin_unlock(&dq_data_lock);
        }
        return 0;
}
EXPORT_SYMBOL(dquot_get_state);

/*
 * Generic routine for setting common part of quota file information.
 *
 * Updates the grace periods and the root-squash flag of quota @type from
 * @ii, marks the info dirty and writes it out through the filesystem's
 * write_info operation.
 *
 * Returns -EINVAL when @ii selects warning limits or the realtime space
 * timer, or requests root squash on a format other than QFMT_VFS_OLD;
 * -ESRCH when quota @type is not active; otherwise the result of
 * write_info().
 */
int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii)
{
        struct mem_dqinfo *info;

        if (ii->i_fieldmask & (QC_WARNS_MASK | QC_RT_SPC_TIMER))
                return -EINVAL;
        if (!sb_has_quota_active(sb, type))
                return -ESRCH;

        info = &sb_dqopt(sb)->info[type];
        if ((ii->i_fieldmask & QC_FLAGS) &&
            (ii->i_flags & QCI_ROOT_SQUASH) &&
            info->dqi_format->qf_fmt_id != QFMT_VFS_OLD)
                return -EINVAL;

        spin_lock(&dq_data_lock);
        if (ii->i_fieldmask & QC_SPC_TIMER)
                info->dqi_bgrace = ii->i_spc_timelimit;
        if (ii->i_fieldmask & QC_INO_TIMER)
                info->dqi_igrace = ii->i_ino_timelimit;
        if (ii->i_fieldmask & QC_FLAGS) {
                if (ii->i_flags & QCI_ROOT_SQUASH)
                        info->dqi_flags |= DQF_ROOT_SQUASH;
                else
                        info->dqi_flags &= ~DQF_ROOT_SQUASH;
        }
        spin_unlock(&dq_data_lock);

        mark_info_dirty(sb, type);
        /* Force write to disk */
        return sb->dq_op->write_info(sb, type);
}
EXPORT_SYMBOL(dquot_set_dqinfo);

/*
 * quotactl operations built from the generic dquot routines; enable and
 * disable are the variants that require quota to be kept in system files.
 */
const struct quotactl_ops dquot_quotactl_sysfile_ops = {
        /* state control */
        .quota_enable        = dquot_quota_enable,
        .quota_disable        = dquot_quota_disable,
        .quota_sync        = dquot_quota_sync,
        /* per-type information */
        .get_state        = dquot_get_state,
        .set_info        = dquot_set_dqinfo,
        /* per-id quota blocks */
        .get_dqblk        = dquot_get_dqblk,
        .get_nextdqblk        = dquot_get_next_dqblk,
        .set_dqblk        = dquot_set_dqblk,
};
EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);

/*
 * sysctl handler for the dquot statistics entries.
 *
 * The statistic index is derived from where table->data points inside
 * dqstats.stat.  The per-cpu counter for that index is summed, stored into
 * dqstats.stat and then handed to proc_doulongvec_minmax().
 */
static int do_proc_dqstats(struct ctl_table *table, int write,
                     void *buffer, size_t *lenp, loff_t *ppos)
{
        unsigned long *slot = table->data;
        unsigned int type = slot - dqstats.stat;
        s64 total = percpu_counter_sum(&dqstats.counter[type]);

        /* Filter negative values for non-monotonic counters */
        if (total < 0) {
                switch (type) {
                case DQST_ALLOC_DQUOTS:
                case DQST_FREE_DQUOTS:
                        total = 0;
                        break;
                default:
                        break;
                }
        }

        /* Update global table */
        dqstats.stat[type] = total;
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

static struct ctl_table fs_dqstats_table[] = {
        {
                .procname        = "lookups",
                .data                = &dqstats.stat[DQST_LOOKUPS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "drops",
                .data                = &dqstats.stat[DQST_DROPS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "reads",
                .data                = &dqstats.stat[DQST_READS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "writes",
                .data                = &dqstats.stat[DQST_WRITES],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "cache_hits",
                .data                = &dqstats.stat[DQST_CACHE_HITS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "allocated_dquots",
                .data                = &dqstats.stat[DQST_ALLOC_DQUOTS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "free_dquots",
                .data                = &dqstats.stat[DQST_FREE_DQUOTS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "syncs",
                .data                = &dqstats.stat[DQST_SYNCS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
#ifdef CONFIG_PRINT_QUOTA_WARNING
        {
                .procname        = "warnings",
                .data                = &flag_print_warnings,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif
};

/*
 * Boot-time initialisation of the quota subsystem: registers the fs/quota
 * sysctl table, creates the dquot slab cache, allocates and initialises the
 * dquot hash table, sets up the statistics counters and registers the dquot
 * cache shrinker.  Failures to allocate the hash table, the counters or the
 * shrinker panic; otherwise 0 is returned.
 */
static int __init dquot_init(void)
{
        int i, ret;
        unsigned long nr_hash, order;
        struct shrinker *dqcache_shrinker;

        printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);

        register_sysctl_init("fs/quota", fs_dqstats_table);

        /* No NULL check here: the cache is created with SLAB_PANIC. */
        dquot_cachep = kmem_cache_create("dquot",
                        sizeof(struct dquot), sizeof(unsigned long) * 4,
                        (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
                                SLAB_PANIC),
                        NULL);

        /* The hash table is allocated with page order 0. */
        order = 0;
        dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order);
        if (!dquot_hash)
                panic("Cannot create dquot hash table");

        /* One per-cpu counter per statistic, all starting at 0. */
        ret = percpu_counter_init_many(dqstats.counter, 0, GFP_KERNEL,
                                       _DQST_DQSTAT_LAST);
        if (ret)
                panic("Cannot create dquot stat counters");

        /* Find power-of-two hlist_heads which can fit into allocation */
        nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
        dq_hash_bits = ilog2(nr_hash);

        /* Recompute the entry count from the bit count and derive the mask. */
        nr_hash = 1UL << dq_hash_bits;
        dq_hash_mask = nr_hash - 1;
        for (i = 0; i < nr_hash; i++)
                INIT_HLIST_HEAD(dquot_hash + i);

        pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
                " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));

        dqcache_shrinker = shrinker_alloc(0, "dquota-cache");
        if (!dqcache_shrinker)
                panic("Cannot allocate dquot shrinker");

        /* Callbacks must be in place before the shrinker is registered. */
        dqcache_shrinker->count_objects = dqcache_shrink_count;
        dqcache_shrinker->scan_objects = dqcache_shrink_scan;

        shrinker_register(dqcache_shrinker);

        return 0;
}
fs_initcall(dquot_init);


































    1 















    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/truncate.h
 *
 * Common inline functions needed for truncate support
 */

/*
 * Truncate blocks that were not used by write. We have to truncate the
 * pagecache as well so that corresponding buffers get properly unmapped.
 *
 * Both the page cache truncation (from i_size onwards) and ext4_truncate()
 * run with the mapping's invalidate lock held.
 */
static inline void ext4_truncate_failed_write(struct inode *inode)
{
        struct address_space *pagecache = inode->i_mapping;

        /*
         * We don't need to call ext4_break_layouts() because the blocks we
         * are truncating were never visible to userspace.
         */
        filemap_invalidate_lock(pagecache);

        truncate_inode_pages(pagecache, inode->i_size);
        ext4_truncate(inode);

        filemap_invalidate_unlock(pagecache);
}

/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */
static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
{
        ext4_lblk_t needed;

        needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

        /* Give ourselves just enough room to cope with inodes in which
         * i_blocks is corrupt: we've seen disk corruptions in the past
         * which resulted in random data in an inode which looked enough
         * like a regular file for ext4 to try to delete it.  Things
         * will go a bit crazy if that happens, but at least we should
         * try not to panic the whole kernel. */
        if (needed < 2)
                needed = 2;

        /* But we need to bound the transaction so we don't overflow the
         * journal. */
        if (needed > EXT4_MAX_TRANS_DATA)
                needed = EXT4_MAX_TRANS_DATA;

        return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}




























































































    2 






















    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Derived from arch/ppc/mm/extable.c and arch/i386/mm/extable.c.
 *
 * Copyright (C) 2004 Paul Mackerras, IBM Corp.
 */

#include <linux/bsearch.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sort.h>
#include <linux/uaccess.h>
#include <linux/extable.h>

/*
 * ex_to_insn() yields the instruction address recorded in an exception
 * table entry.  With ARCH_HAS_RELATIVE_EXTABLE the insn field is an offset
 * relative to the field's own address; otherwise it is used as is.
 */
#ifdef ARCH_HAS_RELATIVE_EXTABLE
static inline unsigned long ex_to_insn(const struct exception_table_entry *x)
{
        return (unsigned long)&x->insn + x->insn;
}
#else
#define ex_to_insn(x)        ((x)->insn)
#endif

/*
 * swap_ex is the swap callback handed to sort() for the exception table.
 * With relative entries each offset has to be adjusted by the distance
 * between the two slots when they trade places; without them swap_ex is
 * NULL.
 */
#ifdef ARCH_HAS_RELATIVE_EXTABLE
static void swap_ex(void *a, void *b, int size)
{
        struct exception_table_entry *first = a, *second = b, saved;
        int delta = (char *)b - (char *)a;

        saved = *first;
        first->insn = second->insn + delta;
        second->insn = saved.insn - delta;

#ifdef swap_ex_entry_fixup
        /* Architecture-specific handling of the remaining fields. */
        swap_ex_entry_fixup(first, second, saved, delta);
#else
        first->fixup = second->fixup + delta;
        second->fixup = saved.fixup - delta;
#endif
}
#else
#define swap_ex                NULL
#endif /* ARCH_HAS_RELATIVE_EXTABLE */

/*
 * The exception table needs to be sorted so that the binary
 * search that we use to find entries in it works properly.
 * This is used both for the kernel exception table and for
 * the exception tables of modules that get loaded.
 *
 * Comparison callback for that sort: orders two entries by their
 * instruction address.
 */
static int cmp_ex_sort(const void *a, const void *b)
{
        const struct exception_table_entry *lhs = a, *rhs = b;

        /* Compare instead of subtracting to avoid overflow. */
        if (ex_to_insn(lhs) == ex_to_insn(rhs))
                return 0;
        return ex_to_insn(lhs) > ex_to_insn(rhs) ? 1 : -1;
}

/*
 * Sort the exception table entries in [@start, @finish) by instruction
 * address.
 */
void sort_extable(struct exception_table_entry *start,
                  struct exception_table_entry *finish)
{
        size_t nr_entries = finish - start;

        sort(start, nr_entries, sizeof(*start), cmp_ex_sort, swap_ex);
}

#ifdef CONFIG_MODULES
/*
 * If the exception table is sorted, any entries referring to the module
 * init code will be at the beginning or the end, so they can be trimmed
 * from both ends of @m's table.
 */
void trim_init_extable(struct module *m)
{
        /* Trim the beginning: advance the table past leading init entries. */
        for (; m->num_exentries; m->extable++, m->num_exentries--) {
                if (!within_module_init(ex_to_insn(&m->extable[0]), m))
                        break;
        }

        /* Trim the end: shrink the count while the last entry is in init. */
        for (; m->num_exentries; m->num_exentries--) {
                if (!within_module_init(
                                ex_to_insn(&m->extable[m->num_exentries - 1]),
                                m))
                        break;
        }
}
#endif /* CONFIG_MODULES */

/*
 * Comparison callback for the binary search: @key points to the instruction
 * address being looked up, @elt to the table entry it is compared against.
 */
static int cmp_ex_search(const void *key, const void *elt)
{
        const struct exception_table_entry *entry = elt;
        unsigned long addr = *(unsigned long *)key;

        /* Compare instead of subtracting to avoid overflow. */
        if (addr == ex_to_insn(entry))
                return 0;
        return addr > ex_to_insn(entry) ? 1 : -1;
}

/*
 * Search one exception table for an entry corresponding to the
 * given instruction address, and return the address of the entry,
 * or NULL if none is found.
 * We use a binary search, and thus we assume that the table is
 * already sorted.
 */
const struct exception_table_entry *
search_extable(const struct exception_table_entry *base,
               const size_t num,
               unsigned long value)
{
        return bsearch(&value, base, num,
                       sizeof(struct exception_table_entry), cmp_ex_search);
}





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 





































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
/*
  FUSE: Filesystem in Userspace
  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>

  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
*/

#include "fuse_i.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/miscdevice.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
#include <linux/sched.h>

/* Module aliases: one keyed on the FUSE misc-device minor, one on the "fuse" device name */
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");

/*
 * Ordinary requests have even IDs, while interrupt IDs are odd.
 *
 * FUSE_INT_REQ_BIT is bit 0 of a unique ID; fuse_req_hash() masks it off
 * before hashing.  FUSE_REQ_ID_STEP is the increment fuse_get_unique()
 * applies to the request counter, which leaves bit 0 untouched.
 */
#define FUSE_INT_REQ_BIT (1ULL << 0)
#define FUSE_REQ_ID_STEP (1ULL << 1)

/* Slab cache used by fuse_request_alloc() / fuse_request_free() for struct fuse_req */
static struct kmem_cache *fuse_req_cachep;

/* Return the fuse_dev stored in @file's private_data. */
static struct fuse_dev *fuse_get_dev(struct file *file)
{
        struct fuse_dev *fud;

        /*
         * No lock is taken here: file->private_data is set once during
         * mount and stays valid until the file is released.
         */
        fud = READ_ONCE(file->private_data);

        return fud;
}

/*
 * Set up a freshly allocated request: bind it to mount @fm, give it a
 * single reference, mark it FR_PENDING and initialise its list heads and
 * wait queue.
 */
static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
{
        req->fm = fm;
        refcount_set(&req->count, 1);
        __set_bit(FR_PENDING, &req->flags);

        init_waitqueue_head(&req->waitq);
        INIT_LIST_HEAD(&req->intr_entry);
        INIT_LIST_HEAD(&req->list);
}

/*
 * Allocate a zeroed request from fuse_req_cachep using @flags and
 * initialise it for mount @fm.
 *
 * Returns the new request, or NULL if the allocation failed.
 */
static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
{
        struct fuse_req *req;

        req = kmem_cache_zalloc(fuse_req_cachep, flags);
        if (!req)
                return NULL;

        fuse_request_init(fm, req);
        return req;
}

/* Return @req to the fuse_req_cachep slab cache it was allocated from. */
static void fuse_request_free(struct fuse_req *req)
{
        kmem_cache_free(fuse_req_cachep, req);
}

/* Take an additional reference on @req. */
static void __fuse_get_request(struct fuse_req *req)
{
        refcount_inc(&req->count);
}

/*
 * Drop one reference on @req without ever freeing it.
 *
 * Must be called with > 1 refcount: this only decrements the count and
 * does not check whether it reached zero.
 */
static void __fuse_put_request(struct fuse_req *req)
{
        refcount_dec(&req->count);
}

/*
 * Mark connection @fc as initialized.
 *
 * The write barrier orders all earlier stores before the store to
 * fc->initialized; fuse_get_req() issues the matching smp_rmb() after it
 * has observed the flag.
 */
void fuse_set_initialized(struct fuse_conn *fc)
{
        /* Make sure stores before this are seen on another CPU */
        smp_wmb();
        fc->initialized = 1;
}

/*
 * Decide whether a request allocation has to wait.
 *
 * Every allocation waits until the connection is initialized; background
 * allocations additionally wait while fc->blocked is set.
 */
static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
{
        if (!fc->initialized)
                return true;

        return for_background && fc->blocked;
}

/*
 * Drop one count from fc->num_waiting.  If that was the last one and the
 * connection is no longer connected, wake every sleeper on
 * fc->blocked_waitq.
 */
static void fuse_drop_waiting(struct fuse_conn *fc)
{
        /*
         * lockless check of fc->connected is okay, because atomic_dec_and_test()
         * provides a memory barrier matched with the one in fuse_wait_aborted()
         * to ensure no wake-up is missed.
         */
        if (atomic_dec_and_test(&fc->num_waiting) &&
            !READ_ONCE(fc->connected)) {
                /* wake up aborters */
                wake_up_all(&fc->blocked_waitq);
        }
}

/* Forward declaration: fuse_get_req() calls this before its definition below */
static void fuse_put_request(struct fuse_req *req);

/*
 * Allocate and pre-fill a request for mount @fm.
 *
 * The caller is counted in fc->num_waiting for the lifetime of the request
 * (FR_WAITING is set on success).  If allocation is currently blocked (see
 * fuse_block_alloc()) the caller sleeps killably until it is not.
 *
 * Returns the request, or an ERR_PTR():
 *   -EINTR         the killable wait was interrupted
 *   -ENOTCONN      the connection is not connected
 *   -ECONNREFUSED  fc->conn_error is set
 *   -ENOMEM        request allocation failed
 *   -EOVERFLOW     the caller's fsuid/fsgid maps to (uid_t)-1 / (gid_t)-1
 *                  in fc->user_ns
 */
static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background)
{
        struct fuse_conn *fc = fm->fc;
        struct fuse_req *req;
        int err;

        /* Count ourselves up front; the "out" path undoes this */
        atomic_inc(&fc->num_waiting);

        if (fuse_block_alloc(fc, for_background) &&
            wait_event_killable_exclusive(fc->blocked_waitq,
                                !fuse_block_alloc(fc, for_background))) {
                err = -EINTR;
                goto out;
        }

        /* Matches smp_wmb() in fuse_set_initialized() */
        smp_rmb();

        if (!fc->connected) {
                err = -ENOTCONN;
                goto out;
        }

        if (fc->conn_error) {
                err = -ECONNREFUSED;
                goto out;
        }

        req = fuse_request_alloc(fm, GFP_KERNEL);
        if (!req) {
                /*
                 * NOTE(review): presumably passes the exclusive wake-up on
                 * to the next background waiter, since we will not use it.
                 */
                if (for_background)
                        wake_up(&fc->blocked_waitq);
                err = -ENOMEM;
                goto out;
        }

        /* Record the caller's identity as seen from the connection's namespaces */
        req->in.h.uid = from_kuid(fc->user_ns, current_fsuid());
        req->in.h.gid = from_kgid(fc->user_ns, current_fsgid());
        req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);

        __set_bit(FR_WAITING, &req->flags);
        if (for_background)
                __set_bit(FR_BACKGROUND, &req->flags);

        /*
         * FR_WAITING is already set, so fuse_put_request() also takes care
         * of dropping the num_waiting count on this failure path.
         */
        if (unlikely(req->in.h.uid == ((uid_t)-1) ||
                     req->in.h.gid == ((gid_t)-1))) {
                fuse_put_request(req);
                return ERR_PTR(-EOVERFLOW);
        }

        return req;

 out:
        fuse_drop_waiting(fc);
        return ERR_PTR(err);
}

/*
 * Drop a reference on @req and free it once the last reference is gone.
 *
 * On the final put, a request that still carries FR_BACKGROUND triggers a
 * wake-up on fc->blocked_waitq (unless fc->blocked is set), and a request
 * that still carries FR_WAITING gives back its fc->num_waiting count.
 */
static void fuse_put_request(struct fuse_req *req)
{
        struct fuse_conn *fc = req->fm->fc;

        if (!refcount_dec_and_test(&req->count))
                return;

        if (test_bit(FR_BACKGROUND, &req->flags)) {
                /*
                 * Unlikely case: a background request was allocated but
                 * never sent.
                 */
                spin_lock(&fc->bg_lock);
                if (!fc->blocked)
                        wake_up(&fc->blocked_waitq);
                spin_unlock(&fc->bg_lock);
        }

        if (test_bit(FR_WAITING, &req->flags)) {
                __clear_bit(FR_WAITING, &req->flags);
                fuse_drop_waiting(fc);
        }

        fuse_request_free(req);
}

unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args)
{
        unsigned nbytes = 0;
        unsigned i;

        for (i = 0; i < numargs; i++)
                nbytes += args[i].size;

        return nbytes;
}
EXPORT_SYMBOL_GPL(fuse_len_args);

/*
 * Advance the request counter of @fiq by FUSE_REQ_ID_STEP and return the
 * new value.  There is no locking here; the callers visible in this file
 * hold fiq->lock around the call.
 */
u64 fuse_get_unique(struct fuse_iqueue *fiq)
{
        return fiq->reqctr += FUSE_REQ_ID_STEP;
}
EXPORT_SYMBOL_GPL(fuse_get_unique);

/*
 * Hash a request ID into FUSE_PQ_HASH_BITS bits.  The interrupt bit is
 * masked off first, so a request ID and the same ID with
 * FUSE_INT_REQ_BIT set hash to the same value.
 */
static unsigned int fuse_req_hash(u64 unique)
{
        u64 base_id = unique & ~FUSE_INT_REQ_BIT;

        return hash_long(base_id, FUSE_PQ_HASH_BITS);
}

/*
 * A new request is available, wake fiq->waitq
 *
 * Also sends SIGIO/POLL_IN to the fiq->fasync list, then releases
 * fiq->lock, which the caller must hold on entry.
 */
static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq)
__releases(fiq->lock)
{
        wake_up(&fiq->waitq);
        kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
        spin_unlock(&fiq->lock);
}

/*
 * Input-queue operations for the FUSE device: forgets, interrupts and
 * pending requests all use the same wake-and-unlock handler.
 */
const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
        .wake_forget_and_unlock                = fuse_dev_wake_and_unlock,
        .wake_interrupt_and_unlock        = fuse_dev_wake_and_unlock,
        .wake_pending_and_unlock        = fuse_dev_wake_and_unlock,
};
EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);

/*
 * Fill in the total input length of @req (header plus all input
 * arguments), append it to fiq->pending and hand off to the queue's
 * wake_pending_and_unlock() hook, which releases fiq->lock.
 */
static void queue_request_and_unlock(struct fuse_iqueue *fiq,
                                     struct fuse_req *req)
__releases(fiq->lock)
{
        struct fuse_args *args = req->args;
        unsigned int payload;

        payload = fuse_len_args(args->in_numargs,
                                (struct fuse_arg *) args->in_args);
        req->in.h.len = sizeof(struct fuse_in_header) + payload;

        list_add_tail(&req->list, &fiq->pending);
        fiq->ops->wake_pending_and_unlock(fiq);
}

/*
 * Queue a forget for (@nodeid, @nlookup) using the caller-supplied link
 * @forget.
 *
 * If the input queue is still connected, the link is appended to the
 * forget list and the queue's wake_forget_and_unlock() hook is invoked;
 * otherwise the link is freed with kfree().
 */
void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
                       u64 nodeid, u64 nlookup)
{
        struct fuse_iqueue *fiq = &fc->iq;

        forget->forget_one.nlookup = nlookup;
        forget->forget_one.nodeid = nodeid;

        spin_lock(&fiq->lock);
        if (!fiq->connected) {
                kfree(forget);
                spin_unlock(&fiq->lock);
                return;
        }

        /* Append to the tail of the singly linked forget list */
        fiq->forget_list_tail->next = forget;
        fiq->forget_list_tail = forget;
        /* The hook drops fiq->lock for us */
        fiq->ops->wake_forget_and_unlock(fiq);
}

/*
 * Move requests from fc->bg_queue to the input queue for as long as the
 * number of active background requests stays below fc->max_background.
 *
 * The callers visible in this file hold fc->bg_lock; fiq->lock is taken
 * here for each request and released by queue_request_and_unlock().
 */
static void flush_bg_queue(struct fuse_conn *fc)
{
        struct fuse_iqueue *fiq = &fc->iq;

        for (;;) {
                struct fuse_req *req;

                if (fc->active_background >= fc->max_background)
                        break;
                if (list_empty(&fc->bg_queue))
                        break;

                req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
                list_del(&req->list);
                fc->active_background++;

                spin_lock(&fiq->lock);
                req->in.h.unique = fuse_get_unique(fiq);
                queue_request_and_unlock(fiq, req);
        }
}

/*
 * This function is called when a request is finished.  Either a reply
 * has arrived or it was aborted (and not yet sent) or some error
 * occurred during communication with userspace, or the device file
 * was closed.  The requester thread is woken up (if still waiting),
 * the 'end' callback is called if the request is FR_ASYNC, and finally
 * the caller's reference to the request is released.
 *
 * If FR_FINISHED was already set on entry, only the reference is dropped.
 */
void fuse_request_end(struct fuse_req *req)
{
        struct fuse_mount *fm = req->fm;
        struct fuse_conn *fc = fm->fc;
        struct fuse_iqueue *fiq = &fc->iq;

        /* Only the first caller to set FR_FINISHED does the completion work */
        if (test_and_set_bit(FR_FINISHED, &req->flags))
                goto put_request;

        /*
         * test_and_set_bit() implies smp_mb() between bit
         * changing and below FR_INTERRUPTED check. Pairs with
         * smp_mb() from queue_interrupt().
         */
        if (test_bit(FR_INTERRUPTED, &req->flags)) {
                /* Take the request off the interrupt list, if it is on it */
                spin_lock(&fiq->lock);
                list_del_init(&req->intr_entry);
                spin_unlock(&fiq->lock);
        }
        /* A finished request is expected to be neither pending nor sent */
        WARN_ON(test_bit(FR_PENDING, &req->flags));
        WARN_ON(test_bit(FR_SENT, &req->flags));
        if (test_bit(FR_BACKGROUND, &req->flags)) {
                spin_lock(&fc->bg_lock);
                clear_bit(FR_BACKGROUND, &req->flags);
                if (fc->num_background == fc->max_background) {
                        /* About to drop below the limit: unblock allocations */
                        fc->blocked = 0;
                        wake_up(&fc->blocked_waitq);
                } else if (!fc->blocked) {
                        /*
                         * Wake up next waiter, if any.  It's okay to use
                         * waitqueue_active(), as we've already synced up
                         * fc->blocked with waiters with the wake_up() call
                         * above.
                         */
                        if (waitqueue_active(&fc->blocked_waitq))
                                wake_up(&fc->blocked_waitq);
                }

                /* Update accounting, then push more queued background requests */
                fc->num_background--;
                fc->active_background--;
                flush_bg_queue(fc);
                spin_unlock(&fc->bg_lock);
        } else {
                /* Wake up waiter sleeping in request_wait_answer() */
                wake_up(&req->waitq);
        }

        /* Asynchronous requests get their completion callback invoked */
        if (test_bit(FR_ASYNC, &req->flags))
                req->args->end(fm, req->args, req->out.h.error);
put_request:
        fuse_put_request(req);
}
EXPORT_SYMBOL_GPL(fuse_request_end);

/*
 * Put @req on the input queue's interrupt list so that an interrupt can be
 * delivered for it.
 *
 * Returns -EINVAL if FR_INTERRUPTED is not set on the request, 0 otherwise
 * (including when the request was already on the list or had already
 * finished).
 */
static int queue_interrupt(struct fuse_req *req)
{
        struct fuse_iqueue *fiq = &req->fm->fc->iq;

        spin_lock(&fiq->lock);
        /* Check that an interrupt has actually been requested for this req */
        if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
                spin_unlock(&fiq->lock);
                return -EINVAL;
        }

        /* An empty intr_entry means the request is not queued for interrupt yet */
        if (list_empty(&req->intr_entry)) {
                list_add_tail(&req->intr_entry, &fiq->interrupts);
                /*
                 * Pairs with smp_mb() implied by test_and_set_bit()
                 * from fuse_request_end().
                 */
                smp_mb();
                if (test_bit(FR_FINISHED, &req->flags)) {
                        /* Request finished in the meantime: undo the queueing */
                        list_del_init(&req->intr_entry);
                        spin_unlock(&fiq->lock);
                        return 0;
                }
                /* The hook releases fiq->lock */
                fiq->ops->wake_interrupt_and_unlock(fiq);
        } else {
                spin_unlock(&fiq->lock);
        }
        return 0;
}

/*
 * Sleep until @req has FR_FINISHED set, in up to three stages:
 *
 *  1. unless fc->no_interrupt is set, an interruptible wait; if a signal
 *     ends it, the request is marked FR_INTERRUPTED and, if it was already
 *     FR_SENT, queued for an interrupt;
 *  2. unless the request is FR_FORCE, a killable wait; if a fatal signal
 *     ends it while the request is still FR_PENDING, the request is
 *     removed from its list and fails with -EINTR;
 *  3. otherwise an uninterruptible wait for completion.
 */
static void request_wait_answer(struct fuse_req *req)
{
        struct fuse_conn *fc = req->fm->fc;
        struct fuse_iqueue *fiq = &fc->iq;
        int err;

        if (!fc->no_interrupt) {
                /* Any signal may interrupt this */
                err = wait_event_interruptible(req->waitq,
                                        test_bit(FR_FINISHED, &req->flags));
                if (!err)
                        return;

                set_bit(FR_INTERRUPTED, &req->flags);
                /* matches barrier in fuse_dev_do_read() */
                smp_mb__after_atomic();
                if (test_bit(FR_SENT, &req->flags))
                        queue_interrupt(req);
        }

        if (!test_bit(FR_FORCE, &req->flags)) {
                /* Only fatal signals may interrupt this */
                err = wait_event_killable(req->waitq,
                                        test_bit(FR_FINISHED, &req->flags));
                if (!err)
                        return;

                spin_lock(&fiq->lock);
                /* Request is not yet in userspace, bail out */
                if (test_bit(FR_PENDING, &req->flags)) {
                        list_del(&req->list);
                        spin_unlock(&fiq->lock);
                        /*
                         * Drops one reference without freeing; presumably
                         * the extra one __fuse_request_send() took before
                         * queueing -- TODO confirm.
                         */
                        __fuse_put_request(req);
                        req->out.h.error = -EINTR;
                        return;
                }
                spin_unlock(&fiq->lock);
        }

        /*
         * Either request is already in userspace, or it was forced.
         * Wait it out.
         */
        wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
}

/*
 * Send a foreground request and wait for it to finish.
 *
 * If the input queue is not connected the request fails immediately with
 * -ENOTCONN in req->out.h.error.  Otherwise it gets a fresh unique ID, is
 * queued as pending and the caller sleeps in request_wait_answer().
 * Must not be used for FR_BACKGROUND requests.
 */
static void __fuse_request_send(struct fuse_req *req)
{
        struct fuse_iqueue *fiq = &req->fm->fc->iq;

        BUG_ON(test_bit(FR_BACKGROUND, &req->flags));

        spin_lock(&fiq->lock);
        if (!fiq->connected) {
                spin_unlock(&fiq->lock);
                req->out.h.error = -ENOTCONN;
                return;
        }

        req->in.h.unique = fuse_get_unique(fiq);
        /*
         * Take an extra reference: the request is still needed here after
         * fuse_request_end() has dropped its own.
         */
        __fuse_get_request(req);
        queue_request_and_unlock(fiq, req);

        request_wait_answer(req);
        /*
         * Pairs with smp_wmb() in fuse_request_end()
         * NOTE(review): no explicit smp_wmb() appears in
         * fuse_request_end() in this file -- confirm what this pairs with.
         */
        smp_rmb();
}

/*
 * Shrink argument sizes in @args to their compatibility values when the
 * connection's fc->minor is below the version that changed them:
 *
 *   minor < 4   STATFS reply size
 *   minor < 9   entry-out reply size (LOOKUP, CREATE, MKNOD, MKDIR,
 *               SYMLINK, LINK) and attr-out reply size (GETATTR, SETATTR)
 *   minor < 12  first input argument size of CREATE and MKNOD
 */
static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
{
        if (fc->minor < 4 && args->opcode == FUSE_STATFS)
                args->out_args[0].size = FUSE_COMPAT_STATFS_SIZE;

        if (fc->minor < 9) {
                if (args->opcode == FUSE_LOOKUP ||
                    args->opcode == FUSE_CREATE ||
                    args->opcode == FUSE_MKNOD ||
                    args->opcode == FUSE_MKDIR ||
                    args->opcode == FUSE_SYMLINK ||
                    args->opcode == FUSE_LINK)
                        args->out_args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
                else if (args->opcode == FUSE_GETATTR ||
                         args->opcode == FUSE_SETATTR)
                        args->out_args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
        }

        if (fc->minor < 12) {
                if (args->opcode == FUSE_CREATE)
                        args->in_args[0].size = sizeof(struct fuse_open_in);
                else if (args->opcode == FUSE_MKNOD)
                        args->in_args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE;
        }
}

/*
 * Fill the uid/gid/pid of @req's input header from the current task.
 * Uses the *_munged ID conversions, in contrast to the plain from_kuid() /
 * from_kgid() calls in fuse_get_req().
 */
static void fuse_force_creds(struct fuse_req *req)
{
        struct fuse_conn *fc = req->fm->fc;

        req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
        req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
        req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
}

/*
 * Attach @args to @req and copy opcode and nodeid into the input header.
 * Extended arguments set total_extlen (in units of 8 bytes); an 'end'
 * callback marks the request FR_ASYNC.
 */
static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
{
        req->args = args;
        req->in.h.nodeid = args->nodeid;
        req->in.h.opcode = args->opcode;

        if (args->end)
                __set_bit(FR_ASYNC, &req->flags);

        if (args->is_ext)
                req->in.h.total_extlen = args->in_args[args->ext_idx].size / 8;
}

/*
 * Send the request described by @args synchronously and wait for it to
 * finish.
 *
 * With args->force the request is allocated with __GFP_NOFAIL, bypasses
 * fuse_get_req() and is marked FR_FORCE; credentials are filled in unless
 * args->nocreds is set.  Otherwise the request comes from fuse_get_req().
 *
 * Returns the error from fuse_get_req(), or the request's out.h.error; on
 * success with args->out_argvar set, the size of the last output argument
 * is returned instead.
 */
ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args)
{
        struct fuse_conn *fc = fm->fc;
        struct fuse_req *req;
        ssize_t ret;

        if (!args->force) {
                WARN_ON(args->nocreds);
                req = fuse_get_req(fm, false);
                if (IS_ERR(req))
                        return PTR_ERR(req);
        } else {
                atomic_inc(&fc->num_waiting);
                /* No NULL check: the allocation is made with __GFP_NOFAIL */
                req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL);

                if (!args->nocreds)
                        fuse_force_creds(req);

                __set_bit(FR_WAITING, &req->flags);
                __set_bit(FR_FORCE, &req->flags);
        }

        /* Needs to be done after fuse_get_req() so that fc->minor is valid */
        fuse_adjust_compat(fc, args);
        fuse_args_to_req(req, args);

        if (!args->noreply)
                __set_bit(FR_ISREPLY, &req->flags);

        __fuse_request_send(req);

        ret = req->out.h.error;
        if (ret == 0 && args->out_argvar) {
                BUG_ON(args->out_numargs == 0);
                ret = args->out_args[args->out_numargs - 1].size;
        }

        fuse_put_request(req);
        return ret;
}

/*
 * Queue a background request.
 *
 * Ensures the request is accounted in fc->num_waiting (FR_WAITING), marks
 * it FR_ISREPLY and, if the connection is still connected, adds it to
 * fc->bg_queue and flushes the queue.
 *
 * Returns true if the request was queued, false if the connection was not
 * connected (the request is left untouched on no list in that case).
 */
static bool fuse_request_queue_background(struct fuse_req *req)
{
        struct fuse_mount *fm = req->fm;
        struct fuse_conn *fc = fm->fc;
        bool queued = false;

        WARN_ON(!test_bit(FR_BACKGROUND, &req->flags));
        /* Requests not obtained via fuse_get_req() are not yet counted as waiting */
        if (!test_bit(FR_WAITING, &req->flags)) {
                __set_bit(FR_WAITING, &req->flags);
                atomic_inc(&fc->num_waiting);
        }
        __set_bit(FR_ISREPLY, &req->flags);
        spin_lock(&fc->bg_lock);
        if (likely(fc->connected)) {
                fc->num_background++;
                /* Reaching the limit blocks further background allocations */
                if (fc->num_background == fc->max_background)
                        fc->blocked = 1;
                list_add_tail(&req->list, &fc->bg_queue);
                flush_bg_queue(fc);
                queued = true;
        }
        spin_unlock(&fc->bg_lock);

        return queued;
}

/*
 * Send the request described by @args in the background.
 *
 * With args->force the request is allocated directly with @gfp_flags
 * (args->nocreds is expected to be set); otherwise it comes from
 * fuse_get_req() as a background request.
 *
 * Returns 0 once the request is queued, -ENOMEM if the forced allocation
 * failed, the error from fuse_get_req(), or -ENOTCONN if the connection
 * was not connected at queueing time.
 */
int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args,
                            gfp_t gfp_flags)
{
        struct fuse_req *req;

        if (!args->force) {
                WARN_ON(args->nocreds);
                req = fuse_get_req(fm, true);
                if (IS_ERR(req))
                        return PTR_ERR(req);
        } else {
                WARN_ON(!args->nocreds);
                req = fuse_request_alloc(fm, gfp_flags);
                if (!req)
                        return -ENOMEM;
                __set_bit(FR_BACKGROUND, &req->flags);
        }

        fuse_args_to_req(req, args);

        if (fuse_request_queue_background(req))
                return 0;

        /* Not queued: give the request back */
        fuse_put_request(req);
        return -ENOTCONN;
}
EXPORT_SYMBOL_GPL(fuse_simple_background);

/*
 * Queue a request that carries the caller-chosen ID @unique and for which
 * no reply is expected (FR_ISREPLY is cleared).
 *
 * Returns 0 once queued, the error from fuse_get_req(), or -ENODEV if the
 * input queue is not connected.
 */
static int fuse_simple_notify_reply(struct fuse_mount *fm,
                                    struct fuse_args *args, u64 unique)
{
        struct fuse_iqueue *fiq = &fm->fc->iq;
        struct fuse_req *req;

        req = fuse_get_req(fm, false);
        if (IS_ERR(req))
                return PTR_ERR(req);

        __clear_bit(FR_ISREPLY, &req->flags);
        req->in.h.unique = unique;

        fuse_args_to_req(req, args);

        spin_lock(&fiq->lock);
        if (!fiq->connected) {
                spin_unlock(&fiq->lock);
                fuse_put_request(req);
                return -ENODEV;
        }

        /* Releases fiq->lock */
        queue_request_and_unlock(fiq, req);
        return 0;
}

/*
 * Lock the request.  Up to the next unlock_request() there mustn't be
 * anything that could cause a page-fault.  If the request was already
 * aborted bail out.
 *
 * A NULL @req is accepted and treated as success.
 * Returns 0 on success, -ENOENT if the request has FR_ABORTED set.
 */
static int lock_request(struct fuse_req *req)
{
        int err = 0;

        if (!req)
                return 0;

        spin_lock(&req->waitq.lock);
        if (!test_bit(FR_ABORTED, &req->flags))
                set_bit(FR_LOCKED, &req->flags);
        else
                err = -ENOENT;
        spin_unlock(&req->waitq.lock);

        return err;
}

/*
 * Unlock request.  If it was aborted while locked, caller is responsible
 * for unlocking and ending the request.
 *
 * A NULL @req is accepted and treated as success.
 * Returns 0 on success, -ENOENT if the request has FR_ABORTED set (in
 * which case FR_LOCKED is left as it is).
 */
static int unlock_request(struct fuse_req *req)
{
        int err = 0;

        if (!req)
                return 0;

        spin_lock(&req->waitq.lock);
        if (!test_bit(FR_ABORTED, &req->flags))
                clear_bit(FR_LOCKED, &req->flags);
        else
                err = -ENOENT;
        spin_unlock(&req->waitq.lock);

        return err;
}

/*
 * State carried across the steps of copying request data to or from the
 * userspace buffer (an iov_iter or a set of pipe buffers).
 */
struct fuse_copy_state {
        /* non-zero when data is being written into the buffer/pages */
        int write;
        /* request unlocked/locked around page refills; may be NULL */
        struct fuse_req *req;
        /* iterator supplied to fuse_copy_init() */
        struct iov_iter *iter;
        /* next pipe buffer to use; NULL when not operating on a pipe */
        struct pipe_buffer *pipebufs;
        /* pipe buffer currently in use, if any */
        struct pipe_buffer *currbuf;
        /* pipe the buffers belong to */
        struct pipe_inode_info *pipe;
        /* pipe-buffer count: decremented when reading, checked against pipe->max_usage when writing */
        unsigned long nr_segs;
        /* page currently being copied to/from */
        struct page *pg;
        /* bytes remaining in the current page/buffer */
        unsigned len;
        /* offset into the current page */
        unsigned offset;
        /* NOTE(review): not used in the code visible here; presumably enables moving pages instead of copying -- confirm */
        unsigned move_pages:1;
};

/*
 * Reset @cs to all zeroes, then record the copy direction (@write) and
 * the iterator (@iter) to operate on.
 */
static void fuse_copy_init(struct fuse_copy_state *cs, int write,
                           struct iov_iter *iter)
{
        memset(cs, 0, sizeof(*cs));
        cs->write = write;
        cs->iter = iter;
}

/*
 * Release whatever the copy state currently holds: either the current
 * pipe buffer, or the current page of the userspace buffer (which is
 * flushed and dirtied first when writing).
 */
static void fuse_copy_finish(struct fuse_copy_state *cs)
{
        struct pipe_buffer *buf = cs->currbuf;

        if (buf) {
                /* When writing, record how much of the page was filled */
                if (cs->write)
                        buf->len = PAGE_SIZE - cs->len;
                cs->currbuf = NULL;
        } else if (cs->pg) {
                if (cs->write) {
                        flush_dcache_page(cs->pg);
                        set_page_dirty_lock(cs->pg);
                }
                put_page(cs->pg);
        }

        cs->pg = NULL;
}

/*
 * Advance to the next page of the userspace buffer and lock the request.
 *
 * The request is unlocked first, the previous page is finished with
 * fuse_copy_finish(), and then cs->pg/cs->offset/cs->len are set up for
 * the next page, which comes from one of three places:
 *   - the next pipe buffer (pipe mode, reading from the pipe),
 *   - a freshly allocated page attached to the next pipe buffer
 *     (pipe mode, writing into the pipe),
 *   - the iov_iter (no pipe buffers).
 *
 * Returns 0 with the request locked again, or a negative error; on the
 * error paths the request is not re-locked.
 */
static int fuse_copy_fill(struct fuse_copy_state *cs)
{
        struct page *page;
        int err;

        /* Fails with -ENOENT if the request was aborted while locked */
        err = unlock_request(cs->req);
        if (err)
                return err;

        fuse_copy_finish(cs);
        if (cs->pipebufs) {
                struct pipe_buffer *buf = cs->pipebufs;

                if (!cs->write) {
                        /* Consume the next buffer that was taken from the pipe */
                        err = pipe_buf_confirm(cs->pipe, buf);
                        if (err)
                                return err;

                        BUG_ON(!cs->nr_segs);
                        cs->currbuf = buf;
                        cs->pg = buf->page;
                        cs->offset = buf->offset;
                        cs->len = buf->len;
                        cs->pipebufs++;
                        cs->nr_segs--;
                } else {
                        /* Never queue more buffers than the pipe may hold */
                        if (cs->nr_segs >= cs->pipe->max_usage)
                                return -EIO;

                        page = alloc_page(GFP_HIGHUSER);
                        if (!page)
                                return -ENOMEM;

                        /* buf->len is fixed up later by fuse_copy_finish() */
                        buf->page = page;
                        buf->offset = 0;
                        buf->len = 0;

                        cs->currbuf = buf;
                        cs->pg = page;
                        cs->offset = 0;
                        cs->len = PAGE_SIZE;
                        cs->pipebufs++;
                        cs->nr_segs++;
                }
        } else {
                size_t off;
                /*
                 * Ask for a single page, at most PAGE_SIZE bytes; a positive
                 * return value is used as the number of usable bytes.
                 */
                err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off);
                if (err < 0)
                        return err;
                BUG_ON(!err);
                cs->len = err;
                cs->offset = off;
                cs->pg = page;
        }

        return lock_request(cs->req);
}

/*
 * Copy as much as fits between *val and the current buffer page.
 *
 * The amount is limited by both *size and the space left in the page.
 * When val is NULL nothing is copied, but the buffer position still
 * advances.  *val, *size and the copy state are all advanced by the
 * number of bytes handled, which is also the return value.
 */
static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
{
        unsigned chunk = min(*size, cs->len);

        if (val) {
                void *mapped = kmap_local_page(cs->pg);
                void *bufpos = mapped + cs->offset;
                void *dst = cs->write ? bufpos : *val;
                void *src = cs->write ? *val : bufpos;

                memcpy(dst, src, chunk);
                kunmap_local(mapped);
                *val += chunk;
        }

        *size -= chunk;
        cs->len -= chunk;
        cs->offset += chunk;

        return chunk;
}

/*
 * Sanity check a folio before it is used as a replacement page.
 *
 * Returns 0 if the folio is not mapped, has no ->mapping and carries no
 * PAGE_FLAGS_CHECK_AT_PREP bits besides the ones tolerated below.
 * Otherwise the page is dumped and 1 is returned.
 */
static int fuse_check_folio(struct folio *folio)
{
        if (folio_mapped(folio))
                goto weird;

        if (folio->mapping != NULL)
                goto weird;

        if (folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
            ~(1 << PG_locked |
              1 << PG_referenced |
              1 << PG_uptodate |
              1 << PG_lru |
              1 << PG_active |
              1 << PG_workingset |
              1 << PG_reclaim |
              1 << PG_waiters |
              LRU_GEN_MASK | LRU_REFS_MASK))
                goto weird;

        return 0;

weird:
        dump_page(&folio->page, "fuse: trying to steal weird page");
        return 1;
}

/*
 * Try to satisfy a full-page copy by stealing the page of the next pipe
 * buffer and putting it into the page cache in place of *pagep, instead
 * of copying the data.
 *
 * Returns:
 *   0   the folio was replaced; *pagep now points at the stolen page and
 *       cs->len is 0
 *   1   the page could not be stolen; the copy state has been pointed at
 *       the pipe buffer and the request re-locked, so the caller should
 *       fall back to a normal copy
 *   <0  error (request aborted, pipe buffer confirmation failed, ...)
 */
static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
{
        int err;
        struct folio *oldfolio = page_folio(*pagep);
        struct folio *newfolio;
        struct pipe_buffer *buf = cs->pipebufs;

        /* Extra reference held for the duration of this function */
        folio_get(oldfolio);
        err = unlock_request(cs->req);
        if (err)
                goto out_put_old;

        fuse_copy_finish(cs);

        err = pipe_buf_confirm(cs->pipe, buf);
        if (err)
                goto out_put_old;

        /* Consume the buffer; from here on the fallback path can copy from it */
        BUG_ON(!cs->nr_segs);
        cs->currbuf = buf;
        cs->len = buf->len;
        cs->pipebufs++;
        cs->nr_segs--;

        /* Only whole pages can be moved */
        if (cs->len != PAGE_SIZE)
                goto out_fallback;

        if (!pipe_buf_try_steal(cs->pipe, buf))
                goto out_fallback;

        newfolio = page_folio(buf->page);

        if (!folio_test_uptodate(newfolio))
                folio_mark_uptodate(newfolio);

        folio_clear_mappedtodisk(newfolio);

        if (fuse_check_folio(newfolio) != 0)
                goto out_fallback_unlock;

        /*
         * This is a new and locked page, it shouldn't be mapped or
         * have any special flags on it
         */
        if (WARN_ON(folio_mapped(oldfolio)))
                goto out_fallback_unlock;
        if (WARN_ON(folio_has_private(oldfolio)))
                goto out_fallback_unlock;
        if (WARN_ON(folio_test_dirty(oldfolio) ||
                                folio_test_writeback(oldfolio)))
                goto out_fallback_unlock;
        if (WARN_ON(folio_test_mlocked(oldfolio)))
                goto out_fallback_unlock;

        replace_page_cache_folio(oldfolio, newfolio);

        folio_get(newfolio);

        if (!(buf->flags & PIPE_BUF_FLAG_LRU))
                folio_add_lru(newfolio);

        /*
         * Release while we have extra ref on stolen page.  Otherwise
         * anon_pipe_buf_release() might think the page can be reused.
         */
        pipe_buf_release(cs->pipe, buf);

        /* Publish the new page only if the request has not been aborted */
        err = 0;
        spin_lock(&cs->req->waitq.lock);
        if (test_bit(FR_ABORTED, &cs->req->flags))
                err = -ENOENT;
        else
                *pagep = &newfolio->page;
        spin_unlock(&cs->req->waitq.lock);

        if (err) {
                folio_unlock(newfolio);
                folio_put(newfolio);
                goto out_put_old;
        }

        folio_unlock(oldfolio);
        /* Drop ref for ap->pages[] array */
        folio_put(oldfolio);
        cs->len = 0;

        err = 0;
out_put_old:
        /* Drop ref obtained in this function */
        folio_put(oldfolio);
        return err;

out_fallback_unlock:
        folio_unlock(newfolio);
out_fallback:
        /* Let the caller copy from the pipe buffer's page instead */
        cs->pg = buf->page;
        cs->offset = buf->offset;

        err = lock_request(cs->req);
        if (!err)
                err = 1;

        goto out_put_old;
}

/*
 * Hand a request page to the pipe by reference instead of copying it.
 *
 * Takes a reference on the page and stores it, together with the given
 * offset and length, in the next pipe buffer slot.  Fails with -EIO
 * when the pipe buffer array is already at pipe->max_usage, or with the
 * error from unlock_request(), in which case the page reference is
 * dropped again.  The request is not re-locked on return.
 */
static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
                         unsigned offset, unsigned count)
{
        struct pipe_buffer *slot;
        int ret;

        if (cs->nr_segs >= cs->pipe->max_usage)
                return -EIO;

        get_page(page);
        ret = unlock_request(cs->req);
        if (ret)
                goto drop_ref;

        fuse_copy_finish(cs);

        slot = cs->pipebufs++;
        slot->page = page;
        slot->offset = offset;
        slot->len = count;

        cs->nr_segs++;
        cs->len = 0;
        return 0;

drop_ref:
        put_page(page);
        return ret;
}

/*
 * Copy a page in the request to/from the userspace buffer.  Must be
 * done atomically
 *
 * @cs:      copy state
 * @pagep:   page of the request; *pagep may be replaced when the page is
 *           moved by fuse_try_move_page().  *pagep may be NULL, in which
 *           case @count bytes of the buffer are skipped.
 * @offset:  starting offset inside the page
 * @count:   number of bytes to transfer
 * @zeroing: if set and @count is less than a full page, the page is
 *           cleared before anything is copied into it
 *
 * Returns 0 on success or a negative error.
 */
static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
                          unsigned offset, unsigned count, int zeroing)
{
        int err;
        struct page *page = *pagep;

        if (page && zeroing && count < PAGE_SIZE)
                clear_highpage(page);

        while (count) {
                if (cs->write && cs->pipebufs && page) {
                        /*
                         * Can't control lifetime of pipe buffers, so always
                         * copy user pages.
                         */
                        if (cs->req->args->user_pages) {
                                err = fuse_copy_fill(cs);
                                if (err)
                                        return err;
                        } else {
                                /* Pass the page to the pipe by reference; done */
                                return fuse_ref_page(cs, page, offset, count);
                        }
                } else if (!cs->len) {
                        /* Current buffer page exhausted: move or refill */
                        if (cs->move_pages && page &&
                            offset == 0 && count == PAGE_SIZE) {
                                err = fuse_try_move_page(cs, pagep);
                                /* 0: page moved, <0: error, >0: fall back to copy */
                                if (err <= 0)
                                        return err;
                        } else {
                                err = fuse_copy_fill(cs);
                                if (err)
                                        return err;
                        }
                }
                if (page) {
                        void *mapaddr = kmap_local_page(page);
                        void *buf = mapaddr + offset;
                        offset += fuse_copy_do(cs, &buf, &count);
                        kunmap_local(mapaddr);
                } else
                        /* No page: just advance the buffer position */
                        offset += fuse_copy_do(cs, NULL, &count);
        }
        /* The page was written through a kernel mapping */
        if (page && !cs->write)
                flush_dcache_page(page);
        return 0;
}

/*
 * Copy the page array of the request to/from the userspace buffer.
 *
 * Pages are processed in order, each for at most its descriptor length,
 * until nbytes is used up.  With zeroing set the remaining pages are
 * still visited (with a zero count) so that they can be cleared.
 */
static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
                           int zeroing)
{
        struct fuse_args_pages *ap =
                container_of(cs->req->args, typeof(*ap), args);
        unsigned idx = 0;

        while (idx < ap->num_pages) {
                unsigned int chunk;
                int ret;

                if (!nbytes && !zeroing)
                        break;

                chunk = min(nbytes, ap->descs[idx].length);
                ret = fuse_copy_page(cs, &ap->pages[idx],
                                     ap->descs[idx].offset, chunk, zeroing);
                if (ret)
                        return ret;

                nbytes -= chunk;
                idx++;
        }

        return 0;
}

/*
 * Copy one in-kernel argument of the given size to/from the userspace
 * buffer, fetching further buffer pages as they are needed.
 */
static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
{
        int ret;

        while (size) {
                /* Refill only when the current page has been used up */
                ret = cs->len ? 0 : fuse_copy_fill(cs);
                if (ret)
                        return ret;

                fuse_copy_do(cs, &val, &size);
        }

        return 0;
}

/*
 * Copy all request arguments to/from the userspace buffer.
 *
 * When argpages is set the last argument is carried in the request's
 * page array and goes through fuse_copy_pages(); every other argument
 * is copied from/to its value pointer.  Stops at the first error.
 */
static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
                          unsigned argpages, struct fuse_arg *args,
                          int zeroing)
{
        unsigned idx;

        for (idx = 0; idx < numargs; idx++) {
                struct fuse_arg *cur = &args[idx];
                int ret;

                if (argpages && idx == numargs - 1)
                        ret = fuse_copy_pages(cs, cur->size, zeroing);
                else
                        ret = fuse_copy_one(cs, cur->value, cur->size);

                if (ret)
                        return ret;
        }

        return 0;
}

/* Return 1 if at least one forget is queued on the input queue, else 0 */
static int forget_pending(struct fuse_iqueue *fiq)
{
        if (fiq->forget_list_head.next == NULL)
                return 0;

        return 1;
}

/*
 * Return 1 if the input queue has anything to deliver: a pending
 * request, an interrupt or a forget.  Returns 0 otherwise.
 */
static int request_pending(struct fuse_iqueue *fiq)
{
        if (!list_empty(&fiq->pending))
                return 1;

        if (!list_empty(&fiq->interrupts))
                return 1;

        return forget_pending(fiq) != 0;
}

/*
 * Deliver an interrupt for the given request to userspace.
 *
 * The FUSE_INTERRUPT message is built on the stack, so no separate
 * fuse_req structure has to be allocated for it.  The request is taken
 * off the interrupt list before the queue lock is dropped.
 *
 * Called with fiq->lock held, releases it.
 *
 * Returns the message size on success, -EINVAL if the buffer is too
 * small for the message, or the error from copying.
 */
static int fuse_read_interrupt(struct fuse_iqueue *fiq,
                               struct fuse_copy_state *cs,
                               size_t nbytes, struct fuse_req *req)
__releases(fiq->lock)
{
        struct fuse_in_header hdr;
        struct fuse_interrupt_in intr;
        unsigned total = sizeof(hdr) + sizeof(intr);
        int ret;

        list_del_init(&req->intr_entry);

        memset(&hdr, 0, sizeof(hdr));
        hdr.len = total;
        hdr.opcode = FUSE_INTERRUPT;
        hdr.unique = (req->in.h.unique | FUSE_INT_REQ_BIT);

        memset(&intr, 0, sizeof(intr));
        intr.unique = req->in.h.unique;

        spin_unlock(&fiq->lock);

        if (nbytes < total)
                return -EINVAL;

        ret = fuse_copy_one(cs, &hdr, sizeof(hdr));
        if (ret == 0)
                ret = fuse_copy_one(cs, &intr, sizeof(intr));
        fuse_copy_finish(cs);

        if (ret)
                return ret;

        return total;
}

/*
 * Detach up to max entries from the front of the forget list.
 *
 * Returns the head of the detached, NULL-terminated chain (NULL if
 * nothing was detached).  If countp is not NULL it receives the number
 * of detached entries.  When the list becomes empty the tail pointer is
 * reset to the list head.
 */
struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
                                             unsigned int max,
                                             unsigned int *countp)
{
        struct fuse_forget_link *first = fiq->forget_list_head.next;
        struct fuse_forget_link **cut = &first;
        struct fuse_forget_link *rest;
        unsigned taken = 0;

        /* Walk to the link at which the chain is split */
        while (*cut && taken < max) {
                cut = &(*cut)->next;
                taken++;
        }

        rest = *cut;
        *cut = NULL;

        fiq->forget_list_head.next = rest;
        if (!rest)
                fiq->forget_list_tail = &fiq->forget_list_head;

        if (countp)
                *countp = taken;

        return first;
}
EXPORT_SYMBOL(fuse_dequeue_forget);

/*
 * Deliver one queued forget to userspace as a FUSE_FORGET message.
 *
 * The forget entry is dequeued and freed even when the buffer turns out
 * to be too small.  Called with fiq->lock held, releases it.
 *
 * Returns the message length, -EINVAL if the buffer is too small, or
 * the error from copying.
 */
static int fuse_read_single_forget(struct fuse_iqueue *fiq,
                                   struct fuse_copy_state *cs,
                                   size_t nbytes)
__releases(fiq->lock)
{
        struct fuse_forget_link *entry = fuse_dequeue_forget(fiq, 1, NULL);
        struct fuse_forget_in body = {
                .nlookup = entry->forget_one.nlookup,
        };
        struct fuse_in_header hdr = {
                .opcode = FUSE_FORGET,
                .nodeid = entry->forget_one.nodeid,
                .unique = fuse_get_unique(fiq),
                .len = sizeof(hdr) + sizeof(body),
        };
        int ret;

        spin_unlock(&fiq->lock);
        kfree(entry);

        if (nbytes < hdr.len)
                return -EINVAL;

        ret = fuse_copy_one(cs, &hdr, sizeof(hdr));
        if (ret == 0)
                ret = fuse_copy_one(cs, &body, sizeof(body));
        fuse_copy_finish(cs);

        if (ret)
                return ret;

        return hdr.len;
}

/*
 * Deliver as many queued forgets as fit into the buffer in a single
 * FUSE_BATCH_FORGET message.
 *
 * Called with fiq->lock held, releases it.  Every dequeued entry is
 * freed, whether or not copying it succeeded.
 *
 * Returns the message length, -EINVAL if the buffer cannot even hold
 * the fixed part of the message, or the error from copying.
 */
static int fuse_read_batch_forget(struct fuse_iqueue *fiq,
                                   struct fuse_copy_state *cs, size_t nbytes)
__releases(fiq->lock)
{
        struct fuse_batch_forget_in body = { .count = 0 };
        struct fuse_in_header hdr = {
                .opcode = FUSE_BATCH_FORGET,
                .unique = fuse_get_unique(fiq),
                .len = sizeof(hdr) + sizeof(body),
        };
        struct fuse_forget_link *cur;
        struct fuse_forget_link *following;
        unsigned capacity;
        unsigned taken;
        int ret;

        if (nbytes < hdr.len) {
                spin_unlock(&fiq->lock);
                return -EINVAL;
        }

        /* How many entries fit behind the fixed part of the message */
        capacity = (nbytes - hdr.len) / sizeof(struct fuse_forget_one);
        cur = fuse_dequeue_forget(fiq, capacity, &taken);
        spin_unlock(&fiq->lock);

        body.count = taken;
        hdr.len += taken * sizeof(struct fuse_forget_one);

        ret = fuse_copy_one(cs, &hdr, sizeof(hdr));
        if (ret == 0)
                ret = fuse_copy_one(cs, &body, sizeof(body));

        /* After an error keep walking the chain, but only to free it */
        for (; cur; cur = following) {
                following = cur->next;
                if (ret == 0)
                        ret = fuse_copy_one(cs, &cur->forget_one,
                                            sizeof(cur->forget_one));
                kfree(cur);
        }

        fuse_copy_finish(cs);

        if (ret)
                return ret;

        return hdr.len;
}

/*
 * Deliver queued forgets to userspace.
 *
 * The batched form is used only when the connection's minor version is
 * at least 16 and more than one forget is queued; otherwise a single
 * forget is sent.  Called with fiq->lock held, releases it.
 */
static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq,
                            struct fuse_copy_state *cs,
                            size_t nbytes)
__releases(fiq->lock)
{
        if (fc->minor >= 16 && fiq->forget_list_head.next->next != NULL)
                return fuse_read_batch_forget(fiq, cs, nbytes);

        return fuse_read_single_forget(fiq, cs, nbytes);
}

/*
 * Read a single request into the userspace filesystem's buffer.  This
 * function waits until a request is available, then removes it from
 * the pending list and copies request data to userspace buffer.  If
 * no reply is needed (FORGET) or request has been aborted or there
 * was an error during the copying then it's finished by calling
 * fuse_request_end().  Otherwise add it to the processing list, and set
 * the 'sent' flag.
 *
 * Interrupts are delivered before forgets, and forgets are interleaved
 * with regular requests.
 *
 * Returns the number of bytes placed in the buffer, or a negative error:
 * -EINVAL if the buffer is smaller than the required minimum, -EAGAIN
 * for a non-blocking read with nothing queued, -ENODEV/-ECONNABORTED
 * when the connection is gone, or the error from waiting/copying.
 */
static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
                                struct fuse_copy_state *cs, size_t nbytes)
{
        ssize_t err;
        struct fuse_conn *fc = fud->fc;
        struct fuse_iqueue *fiq = &fc->iq;
        struct fuse_pqueue *fpq = &fud->pq;
        struct fuse_req *req;
        struct fuse_args *args;
        unsigned reqsize;
        unsigned int hash;

        /*
         * Require sane minimum read buffer - that has capacity for fixed part
         * of any request header + negotiated max_write room for data.
         *
         * Historically libfuse reserves 4K for fixed header room, but e.g.
         * GlusterFS reserves only 80 bytes
         *
         *        = `sizeof(fuse_in_header) + sizeof(fuse_write_in)`
         *
         * which is the absolute minimum any sane filesystem should be using
         * for header room.
         */
        if (nbytes < max_t(size_t, FUSE_MIN_READ_BUFFER,
                           sizeof(struct fuse_in_header) +
                           sizeof(struct fuse_write_in) +
                           fc->max_write))
                return -EINVAL;

 restart:
        /* Leave this loop with fiq->lock held */
        for (;;) {
                spin_lock(&fiq->lock);
                if (!fiq->connected || request_pending(fiq))
                        break;
                spin_unlock(&fiq->lock);

                if (file->f_flags & O_NONBLOCK)
                        return -EAGAIN;
                err = wait_event_interruptible_exclusive(fiq->waitq,
                                !fiq->connected || request_pending(fiq));
                if (err)
                        return err;
        }

        if (!fiq->connected) {
                err = fc->aborted ? -ECONNABORTED : -ENODEV;
                goto err_unlock;
        }

        /* Interrupts first; fuse_read_interrupt() drops fiq->lock */
        if (!list_empty(&fiq->interrupts)) {
                req = list_entry(fiq->interrupts.next, struct fuse_req,
                                 intr_entry);
                return fuse_read_interrupt(fiq, cs, nbytes, req);
        }

        /*
         * Forgets are served when no regular request is pending or while
         * the forget_batch counter is still positive.  Once the counter
         * has dropped to -8 it is reset to 16.
         */
        if (forget_pending(fiq)) {
                if (list_empty(&fiq->pending) || fiq->forget_batch-- > 0)
                        return fuse_read_forget(fc, fiq, cs, nbytes);

                if (fiq->forget_batch <= -8)
                        fiq->forget_batch = 16;
        }

        /* Take the first pending request off the input queue */
        req = list_entry(fiq->pending.next, struct fuse_req, list);
        clear_bit(FR_PENDING, &req->flags);
        list_del_init(&req->list);
        spin_unlock(&fiq->lock);

        args = req->args;
        reqsize = req->in.h.len;

        /* If request is too large, reply with an error and restart the read */
        if (nbytes < reqsize) {
                req->out.h.error = -EIO;
                /* SETXATTR is special, since it may contain too large data */
                if (args->opcode == FUSE_SETXATTR)
                        req->out.h.error = -E2BIG;
                fuse_request_end(req);
                goto restart;
        }
        spin_lock(&fpq->lock);
        /*
         *  Must not put request on fpq->io queue after having been shut down by
         *  fuse_abort_conn()
         */
        if (!fpq->connected) {
                req->out.h.error = err = -ECONNABORTED;
                goto out_end;

        }
        /* The request sits on fpq->io while it is copied without fpq->lock */
        list_add(&req->list, &fpq->io);
        spin_unlock(&fpq->lock);
        cs->req = req;
        err = fuse_copy_one(cs, &req->in.h, sizeof(req->in.h));
        if (!err)
                err = fuse_copy_args(cs, args->in_numargs, args->in_pages,
                                     (struct fuse_arg *) args->in_args, 0);
        fuse_copy_finish(cs);
        spin_lock(&fpq->lock);
        clear_bit(FR_LOCKED, &req->flags);
        if (!fpq->connected) {
                err = fc->aborted ? -ECONNABORTED : -ENODEV;
                goto out_end;
        }
        if (err) {
                req->out.h.error = -EIO;
                goto out_end;
        }
        /* No reply expected: the request is complete once it has been read */
        if (!test_bit(FR_ISREPLY, &req->flags)) {
                err = reqsize;
                goto out_end;
        }
        /* Move to the processing hash bucket until the reply arrives */
        hash = fuse_req_hash(req->in.h.unique);
        list_move_tail(&req->list, &fpq->processing[hash]);
        __fuse_get_request(req);
        set_bit(FR_SENT, &req->flags);
        spin_unlock(&fpq->lock);
        /* matches barrier in request_wait_answer() */
        smp_mb__after_atomic();
        if (test_bit(FR_INTERRUPTED, &req->flags))
                queue_interrupt(req);
        fuse_put_request(req);

        return reqsize;

out_end:
        /* Entered with fpq->lock held */
        if (!test_bit(FR_PRIVATE, &req->flags))
                list_del_init(&req->list);
        spin_unlock(&fpq->lock);
        fuse_request_end(req);
        return err;

 err_unlock:
        spin_unlock(&fiq->lock);
        return err;
}

/*
 * Open handler for the fuse device: the new file starts out with
 * private_data cleared.  Always returns 0.
 */
static int fuse_dev_open(struct inode *inode, struct file *file)
{
        /*
         * The fuse device's file's private_data is used to hold
         * the fuse_conn(ection) when it is mounted, and is used to
         * keep track of whether the file has been mounted already.
         */
        file->private_data = NULL;
        return 0;
}

static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
{
        struct fuse_copy_state cs;
        struct file *file = iocb->ki_filp;
        struct fuse_dev *fud = fuse_get_dev(file);

        if (!fud)
                return -EPERM;

        if (!user_backed_iter(to))
                return -EINVAL;

        fuse_copy_init(&cs, 1, to);

        return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to));
}

/*
 * splice_read handler for the fuse device.
 *
 * One request is read into a temporary array of pipe buffers (sized to
 * pipe->max_usage), and the filled buffers are then added to the pipe.
 *
 * Returns the number of bytes added to the pipe, or a negative error
 * (-EPERM without an attached fuse device, -ENOMEM, -EIO if the buffers
 * would not fit into the pipe, or the error from reading the request).
 */
static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
                                    struct pipe_inode_info *pipe,
                                    size_t len, unsigned int flags)
{
        int total, ret;
        int page_nr = 0;
        struct pipe_buffer *bufs;
        struct fuse_copy_state cs;
        struct fuse_dev *fud = fuse_get_dev(in);

        if (!fud)
                return -EPERM;

        bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer),
                              GFP_KERNEL);
        if (!bufs)
                return -ENOMEM;

        /* Pipe mode: fill bufs[] instead of an iov_iter */
        fuse_copy_init(&cs, 1, NULL);
        cs.pipebufs = bufs;
        cs.pipe = pipe;
        ret = fuse_dev_do_read(fud, in, &cs, len);
        if (ret < 0)
                goto out;

        /* cs.nr_segs is the number of buffers that were filled */
        if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) {
                ret = -EIO;
                goto out;
        }

        for (ret = total = 0; page_nr < cs.nr_segs; total += ret) {
                /*
                 * Need to be careful about this.  Having buf->ops in module
                 * code can Oops if the buffer persists after module unload.
                 */
                bufs[page_nr].ops = &nosteal_pipe_buf_ops;
                bufs[page_nr].flags = 0;
                ret = add_to_pipe(pipe, &bufs[page_nr++]);
                if (unlikely(ret < 0))
                        break;
        }
        /* Report what was queued if anything was, else the last error */
        if (total)
                ret = total;
out:
        /* Drop the pages of buffers that were not handed to add_to_pipe() */
        for (; page_nr < cs.nr_segs; page_nr++)
                put_page(bufs[page_nr].page);

        kvfree(bufs);
        return ret;
}

static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
                            struct fuse_copy_state *cs)
{
        struct fuse_notify_poll_wakeup_out outarg;
        int err = -EINVAL;

        if (size != sizeof(outarg))
                goto err;

        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
        if (err)
                goto err;

        fuse_copy_finish(cs);
        return fuse_notify_poll_wakeup(fc, &outarg);

err:
        fuse_copy_finish(cs);
        return err;
}

static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
                                   struct fuse_copy_state *cs)
{
        struct fuse_notify_inval_inode_out outarg;
        int err = -EINVAL;

        if (size != sizeof(outarg))
                goto err;

        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
        if (err)
                goto err;
        fuse_copy_finish(cs);

        down_read(&fc->killsb);
        err = fuse_reverse_inval_inode(fc, outarg.ino,
                                       outarg.off, outarg.len);
        up_read(&fc->killsb);
        return err;

err:
        fuse_copy_finish(cs);
        return err;
}

static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
                                   struct fuse_copy_state *cs)
{
        struct fuse_notify_inval_entry_out outarg;
        int err = -ENOMEM;
        char *buf;
        struct qstr name;

        buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
        if (!buf)
                goto err;

        err = -EINVAL;
        if (size < sizeof(outarg))
                goto err;

        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
        if (err)
                goto err;

        err = -ENAMETOOLONG;
        if (outarg.namelen > FUSE_NAME_MAX)
                goto err;

        err = -EINVAL;
        if (size != sizeof(outarg) + outarg.namelen + 1)
                goto err;

        name.name = buf;
        name.len = outarg.namelen;
        err = fuse_copy_one(cs, buf, outarg.namelen + 1);
        if (err)
                goto err;
        fuse_copy_finish(cs);
        buf[outarg.namelen] = 0;

        down_read(&fc->killsb);
        err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags);
        up_read(&fc->killsb);
        kfree(buf);
        return err;

err:
        kfree(buf);
        fuse_copy_finish(cs);
        return err;
}

static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
                              struct fuse_copy_state *cs)
{
        struct fuse_notify_delete_out outarg;
        int err = -ENOMEM;
        char *buf;
        struct qstr name;

        buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
        if (!buf)
                goto err;

        err = -EINVAL;
        if (size < sizeof(outarg))
                goto err;

        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
        if (err)
                goto err;

        err = -ENAMETOOLONG;
        if (outarg.namelen > FUSE_NAME_MAX)
                goto err;

        err = -EINVAL;
        if (size != sizeof(outarg) + outarg.namelen + 1)
                goto err;

        name.name = buf;
        name.len = outarg.namelen;
        err = fuse_copy_one(cs, buf, outarg.namelen + 1);
        if (err)
                goto err;
        fuse_copy_finish(cs);
        buf[outarg.namelen] = 0;

        down_read(&fc->killsb);
        err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0);
        up_read(&fc->killsb);
        kfree(buf);
        return err;

err:
        kfree(buf);
        fuse_copy_finish(cs);
        return err;
}

/*
 * Handle a store notification: transfer outarg.size bytes of payload
 * through fuse_copy_page() to the page cache pages of the inode named by
 * outarg.nodeid, starting at file offset outarg.offset.
 *
 * Returns 0 on success, -EINVAL on a size mismatch, -ENOENT if the inode
 * is not found, -ENOMEM if a page cannot be obtained, or a copy error.
 * The copy state is finished on every path.
 */
static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
                             struct fuse_copy_state *cs)
{
        struct fuse_notify_store_out outarg;
        struct inode *inode;
        struct address_space *mapping;
        u64 nodeid;
        int err;
        pgoff_t index;
        unsigned int offset;
        unsigned int num;
        loff_t file_size;
        loff_t end;

        err = -EINVAL;
        if (size < sizeof(outarg))
                goto out_finish;

        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
        if (err)
                goto out_finish;

        /* The payload must be exactly as long as the header announces */
        err = -EINVAL;
        if (size - sizeof(outarg) != outarg.size)
                goto out_finish;

        nodeid = outarg.nodeid;

        down_read(&fc->killsb);

        err = -ENOENT;
        inode = fuse_ilookup(fc, nodeid,  NULL);
        if (!inode)
                goto out_up_killsb;

        mapping = inode->i_mapping;
        index = outarg.offset >> PAGE_SHIFT;
        offset = outarg.offset & ~PAGE_MASK;
        file_size = i_size_read(inode);
        end = outarg.offset + outarg.size;
        /* Storing past the current end of file extends the file */
        if (end > file_size) {
                file_size = end;
                fuse_write_update_attr(inode, file_size, outarg.size);
        }

        num = outarg.size;
        while (num) {
                struct page *page;
                unsigned int this_num;

                err = -ENOMEM;
                page = find_or_create_page(mapping, index,
                                           mapping_gfp_mask(mapping));
                if (!page)
                        goto out_iput;

                /* Only the first page can start at a non-zero offset */
                this_num = min_t(unsigned, num, PAGE_SIZE - offset);
                err = fuse_copy_page(cs, &page, offset, this_num, 0);
                /*
                 * Mark the page up to date only if it was filled from its
                 * start and either completely or up to the end of file.
                 */
                if (!err && offset == 0 &&
                    (this_num == PAGE_SIZE || file_size == end))
                        SetPageUptodate(page);
                unlock_page(page);
                put_page(page);

                if (err)
                        goto out_iput;

                num -= this_num;
                offset = 0;
                index++;
        }

        err = 0;

out_iput:
        iput(inode);
out_up_killsb:
        up_read(&fc->killsb);
out_finish:
        fuse_copy_finish(cs);
        return err;
}

/*
 * Arguments of a retrieve reply.  fuse_retrieve() allocates the page
 * pointer and page descriptor arrays directly behind this structure, and
 * fuse_retrieve_end() frees the whole allocation.
 */
struct fuse_retrieve_args {
        /* page-carrying request arguments; container_of() target in fuse_retrieve_end() */
        struct fuse_args_pages ap;
        /* first input argument of the reply (offset and size) */
        struct fuse_notify_retrieve_in inarg;
};

static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args,
                              int error)
{
        struct fuse_retrieve_args *ra =
                container_of(args, typeof(*ra), ap.args);

        release_pages(ra->ap.pages, ra->ap.num_pages);
        kfree(ra);
}

/*
 * Build and send a FUSE_NOTIFY_REPLY carrying the inode's cached pages
 * for the range requested in @outarg.
 *
 * The amount is limited to fc->max_write bytes, to the current file size
 * and to fc->max_pages pages, and collection stops at the first page that
 * find_get_page() does not return.
 *
 * Returns 0 on success, -ENOMEM if the arguments cannot be allocated, or
 * the error from fuse_simple_notify_reply(); in that case the pages and
 * arguments are released here.
 */
static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
                         struct fuse_notify_retrieve_out *outarg)
{
        int err;
        struct address_space *mapping = inode->i_mapping;
        pgoff_t index;
        loff_t file_size;
        unsigned int num;
        unsigned int offset;
        size_t total_len = 0;
        unsigned int num_pages;
        struct fuse_conn *fc = fm->fc;
        struct fuse_retrieve_args *ra;
        size_t args_size = sizeof(*ra);
        struct fuse_args_pages *ap;
        struct fuse_args *args;

        offset = outarg->offset & ~PAGE_MASK;
        file_size = i_size_read(inode);

        /* Clamp the byte count to max_write and to the end of the file */
        num = min(outarg->size, fc->max_write);
        if (outarg->offset > file_size)
                num = 0;
        else if (outarg->offset + num > file_size)
                num = file_size - outarg->offset;

        num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
        num_pages = min(num_pages, fc->max_pages);

        /* One allocation: the struct, then pages[], then descs[] */
        args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0]));

        ra = kzalloc(args_size, GFP_KERNEL);
        if (!ra)
                return -ENOMEM;

        ap = &ra->ap;
        ap->pages = (void *) (ra + 1);
        ap->descs = (void *) (ap->pages + num_pages);

        args = &ap->args;
        args->nodeid = outarg->nodeid;
        args->opcode = FUSE_NOTIFY_REPLY;
        args->in_numargs = 2;
        args->in_pages = true;
        args->end = fuse_retrieve_end;

        index = outarg->offset >> PAGE_SHIFT;

        while (num && ap->num_pages < num_pages) {
                struct page *page;
                unsigned int this_num;

                /* Stop at the first page that is not found */
                page = find_get_page(mapping, index);
                if (!page)
                        break;

                this_num = min_t(unsigned, num, PAGE_SIZE - offset);
                ap->pages[ap->num_pages] = page;
                ap->descs[ap->num_pages].offset = offset;
                ap->descs[ap->num_pages].length = this_num;
                ap->num_pages++;

                offset = 0;
                num -= this_num;
                total_len += this_num;
                index++;
        }
        /* Report how much was actually collected */
        ra->inarg.offset = outarg->offset;
        ra->inarg.size = total_len;
        args->in_args[0].size = sizeof(ra->inarg);
        args->in_args[0].value = &ra->inarg;
        args->in_args[1].size = total_len;

        err = fuse_simple_notify_reply(fm, args, outarg->notify_unique);
        /* On failure release the pages and arguments ourselves */
        if (err)
                fuse_retrieve_end(fm, args, err);

        return err;
}

/*
 * Handle a FUSE_NOTIFY_RETRIEVE notification.
 *
 * The payload must be exactly one struct fuse_notify_retrieve_out.  After it
 * has been copied in, the copy state is finished, the inode is looked up by
 * node ID while holding fc->killsb for reading, and the work is handed over
 * to fuse_retrieve().
 *
 * Returns -EINVAL on a payload size mismatch, the fuse_copy_one() error if
 * the copy fails, -ENOENT if no inode matches the node ID, and otherwise the
 * result of fuse_retrieve().
 */
static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
                                struct fuse_copy_state *cs)
{
        struct fuse_notify_retrieve_out outarg;
        struct fuse_mount *fm;
        struct inode *inode;
        int err;

        if (size != sizeof(outarg)) {
                fuse_copy_finish(cs);
                return -EINVAL;
        }

        err = fuse_copy_one(cs, &outarg, sizeof(outarg));
        /* The copy state is finished whether or not the copy succeeded */
        fuse_copy_finish(cs);
        if (err)
                return err;

        down_read(&fc->killsb);
        inode = fuse_ilookup(fc, outarg.nodeid, &fm);
        if (!inode) {
                err = -ENOENT;
        } else {
                err = fuse_retrieve(fm, inode, &outarg);
                iput(inode);
        }
        up_read(&fc->killsb);

        return err;
}

/*
 * Resend all requests on the processing queues.
 *
 * When a FUSE daemon panics and fails over, some inflight requests may be
 * lost and never answered. As a result, applications awaiting replies would
 * be stuck forever. To address this, a notification can be used to trigger
 * resending of these requests to the FUSE daemon, ensuring they are
 * processed again.
 *
 * Please note that this strategy is applicable only to idempotent requests or
 * if the FUSE daemon takes careful measures to avoid processing duplicated
 * non-idempotent requests.
 *
 * The requests are collected from every device's processing hash under
 * fc->lock and fpq->lock, marked pending again and tagged with
 * FUSE_UNIQUE_RESEND, then put at the head of the input queue's pending list.
 */
static void fuse_resend(struct fuse_conn *fc)
{
        struct fuse_dev *fud;
        struct fuse_req *req, *next;
        struct fuse_iqueue *fiq = &fc->iq;
        LIST_HEAD(to_queue);
        unsigned int i;

        spin_lock(&fc->lock);
        if (!fc->connected) {
                spin_unlock(&fc->lock);
                return;
        }

        list_for_each_entry(fud, &fc->devices, entry) {
                struct fuse_pqueue *fpq = &fud->pq;

                spin_lock(&fpq->lock);
                for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
                        list_splice_tail_init(&fpq->processing[i], &to_queue);
                spin_unlock(&fpq->lock);
        }
        spin_unlock(&fc->lock);

        list_for_each_entry_safe(req, next, &to_queue, list) {
                set_bit(FR_PENDING, &req->flags);
                clear_bit(FR_SENT, &req->flags);
                /* mark the request as resend request */
                req->in.h.unique |= FUSE_UNIQUE_RESEND;
        }

        spin_lock(&fiq->lock);
        /*
         * The connection may have been aborted after fc->lock was dropped.
         * fuse_abort_conn() only ends requests it finds on the processing
         * and pending lists, so requests parked on the private to_queue
         * list would be queued after the abort and never completed.  End
         * them here the same way the abort path does: clear FR_PENDING and
         * fail them with -ECONNABORTED.
         */
        if (!fiq->connected) {
                spin_unlock(&fiq->lock);
                list_for_each_entry_safe(req, next, &to_queue, list) {
                        clear_bit(FR_PENDING, &req->flags);
                        req->out.h.error = -ECONNABORTED;
                        list_del_init(&req->list);
                        fuse_request_end(req);
                }
                return;
        }
        /* iq and pq requests are both oldest to newest */
        list_splice(&to_queue, &fiq->pending);
        /* Drops fiq->lock, as the callback's name indicates */
        fiq->ops->wake_pending_and_unlock(fiq);
}

/*
 * Handle a FUSE_NOTIFY_RESEND notification: requeue the requests that are on
 * the processing queues via fuse_resend().  Always returns 0.
 */
static int fuse_notify_resend(struct fuse_conn *fc)
{
        fuse_resend(fc);
        return 0;
}

/*
 * Dispatch an unsolicited notification written by the daemon.
 *
 * @fc:   connection the notification arrived on
 * @code: notification code taken from the reply header
 * @size: number of payload bytes following the header
 * @cs:   copy state positioned right after the header
 *
 * Returns the handler's result, or -EINVAL for an unknown code.  The copy
 * state is finished on every path: the payload handlers are handed @cs and
 * are expected to finish it themselves (fuse_notify_retrieve() does), while
 * the cases that never touch @cs finish it here.
 */
static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
                       unsigned int size, struct fuse_copy_state *cs)
{
        /* Don't try to move pages (yet) */
        cs->move_pages = 0;

        switch (code) {
        case FUSE_NOTIFY_POLL:
                return fuse_notify_poll(fc, size, cs);

        case FUSE_NOTIFY_INVAL_INODE:
                return fuse_notify_inval_inode(fc, size, cs);

        case FUSE_NOTIFY_INVAL_ENTRY:
                return fuse_notify_inval_entry(fc, size, cs);

        case FUSE_NOTIFY_STORE:
                return fuse_notify_store(fc, size, cs);

        case FUSE_NOTIFY_RETRIEVE:
                return fuse_notify_retrieve(fc, size, cs);

        case FUSE_NOTIFY_DELETE:
                return fuse_notify_delete(fc, size, cs);

        case FUSE_NOTIFY_RESEND:
                /*
                 * The resend handler takes no payload and is not given @cs,
                 * so nothing downstream would finish the copy state; do it
                 * here, as the default case does.
                 */
                fuse_copy_finish(cs);
                return fuse_notify_resend(fc);

        default:
                fuse_copy_finish(cs);
                return -EINVAL;
        }
}

/*
 * Find a request on the processing queue by its unique ID.
 *
 * Only the hash bucket selected by fuse_req_hash(@unique) is searched.
 * Returns the matching request, or NULL if there is none.
 */
static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
{
        struct list_head *bucket = &fpq->processing[fuse_req_hash(unique)];
        struct fuse_req *candidate;

        list_for_each_entry(candidate, bucket, list)
                if (candidate->in.h.unique == unique)
                        return candidate;

        return NULL;
}

/*
 * Copy the reply arguments for a request out of the write buffer.
 *
 * @nbytes is the total reply length including the out header.  It may never
 * exceed the size the request expects.  It may be shorter only when the
 * request allows a variable sized last argument (args->out_argvar); in that
 * case the last argument's size is reduced by the shortfall, provided the
 * shortfall fits inside that argument.
 *
 * Returns -EINVAL on a size mismatch, otherwise the result of
 * fuse_copy_args().
 */
static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
                         unsigned nbytes)
{
        unsigned expected = sizeof(struct fuse_out_header);

        expected += fuse_len_args(args->out_numargs, args->out_args);

        /* More data than the request has room for */
        if (expected < nbytes)
                return -EINVAL;

        if (expected > nbytes) {
                struct fuse_arg *last;
                unsigned shortfall;

                /* A short reply is only valid for variable sized output */
                if (!args->out_argvar)
                        return -EINVAL;

                last = &args->out_args[args->out_numargs-1];
                shortfall = expected - nbytes;
                if (shortfall > last->size)
                        return -EINVAL;

                last->size -= shortfall;
        }

        return fuse_copy_args(cs, args->out_numargs, args->out_pages,
                              args->out_args, args->page_zeroing);
}

/*
 * Write a single reply to a request.  First the header is copied from
 * the write buffer.  The request is then searched on the processing
 * list by the unique ID found in the header.  If found, then remove
 * it from the list and copy the rest of the buffer to the request.
 * The request is finished by calling fuse_request_end().
 *
 * A header with a zero unique ID is not a reply but an unsolicited
 * notification and is handed to fuse_notify().
 *
 * Returns @nbytes on success or a negative error code.
 */
static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
                                 struct fuse_copy_state *cs, size_t nbytes)
{
        int err;
        struct fuse_conn *fc = fud->fc;
        struct fuse_pqueue *fpq = &fud->pq;
        struct fuse_req *req;
        struct fuse_out_header oh;

        /* The write must at least contain a complete header */
        err = -EINVAL;
        if (nbytes < sizeof(struct fuse_out_header))
                goto out;

        err = fuse_copy_one(cs, &oh, sizeof(oh));
        if (err)
                goto copy_finish;

        /* The length in the header must match the size of the write */
        err = -EINVAL;
        if (oh.len != nbytes)
                goto copy_finish;

        /*
         * Zero oh.unique indicates unsolicited notification message
         * and error contains notification code.
         */
        if (!oh.unique) {
                err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
                goto out;
        }

        /* Only zero or a negative error greater than -512 is accepted */
        err = -EINVAL;
        if (oh.error <= -512 || oh.error > 0)
                goto copy_finish;

        /* Look the request up with the interrupt bit masked off */
        spin_lock(&fpq->lock);
        req = NULL;
        if (fpq->connected)
                req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT);

        err = -ENOENT;
        if (!req) {
                spin_unlock(&fpq->lock);
                goto copy_finish;
        }

        /* Is it an interrupt reply ID? */
        if (oh.unique & FUSE_INT_REQ_BIT) {
                /* Hold a reference so the request can be used unlocked */
                __fuse_get_request(req);
                spin_unlock(&fpq->lock);

                /*
                 * An interrupt reply carries no payload.  -ENOSYS records
                 * that interrupts are not supported, -EAGAIN queues the
                 * interrupt again.
                 */
                err = 0;
                if (nbytes != sizeof(struct fuse_out_header))
                        err = -EINVAL;
                else if (oh.error == -ENOSYS)
                        fc->no_interrupt = 1;
                else if (oh.error == -EAGAIN)
                        err = queue_interrupt(req);

                fuse_put_request(req);

                goto copy_finish;
        }

        /*
         * Move the request to the io list and mark it locked while the
         * reply is copied without holding fpq->lock.
         */
        clear_bit(FR_SENT, &req->flags);
        list_move(&req->list, &fpq->io);
        req->out.h = oh;
        set_bit(FR_LOCKED, &req->flags);
        spin_unlock(&fpq->lock);
        cs->req = req;
        if (!req->args->page_replace)
                cs->move_pages = 0;

        /* An error reply must consist of the header only */
        if (oh.error)
                err = nbytes != sizeof(oh) ? -EINVAL : 0;
        else
                err = copy_out_args(cs, req->args, nbytes);
        fuse_copy_finish(cs);

        /*
         * Unlock the request.  If the queue was disconnected meanwhile the
         * write fails with -ENOENT; otherwise a copy failure is reported to
         * the waiter as -EIO.  A request marked FR_PRIVATE is left on
         * whatever list it is currently on.
         */
        spin_lock(&fpq->lock);
        clear_bit(FR_LOCKED, &req->flags);
        if (!fpq->connected)
                err = -ENOENT;
        else if (err)
                req->out.h.error = -EIO;
        if (!test_bit(FR_PRIVATE, &req->flags))
                list_del_init(&req->list);
        spin_unlock(&fpq->lock);

        fuse_request_end(req);
out:
        return err ? err : nbytes;

copy_finish:
        fuse_copy_finish(cs);
        goto out;
}

static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct fuse_copy_state cs;
        struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp);

        if (!fud)
                return -EPERM;

        if (!user_backed_iter(from))
                return -EINVAL;

        fuse_copy_init(&cs, 0, from);

        return fuse_dev_do_write(fud, &cs, iov_iter_count(from));
}

/*
 * splice_write handler of the fuse device.
 *
 * Collects pipe buffers covering exactly @len bytes into a private array and
 * feeds them to fuse_dev_do_write() as one message.
 *
 * Returns the result of fuse_dev_do_write(), -EPERM if @out has no fuse
 * device attached, -ENOMEM if the buffer array cannot be allocated, or
 * -EINVAL if the pipe holds fewer than @len bytes or a buffer reference
 * cannot be taken.
 */
static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
                                     struct file *out, loff_t *ppos,
                                     size_t len, unsigned int flags)
{
        unsigned int head, tail, mask, count;
        unsigned nbuf;
        unsigned idx;
        struct pipe_buffer *bufs;
        struct fuse_copy_state cs;
        struct fuse_dev *fud;
        size_t rem;
        ssize_t ret;

        fud = fuse_get_dev(out);
        if (!fud)
                return -EPERM;

        pipe_lock(pipe);

        head = pipe->head;
        tail = pipe->tail;
        mask = pipe->ring_size - 1;
        /* Number of ring slots between tail and head */
        count = head - tail;

        bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL);
        if (!bufs) {
                pipe_unlock(pipe);
                return -ENOMEM;
        }

        /* First pass: make sure the pipe contains at least @len bytes */
        nbuf = 0;
        rem = 0;
        for (idx = tail; idx != head && rem < len; idx++)
                rem += pipe->bufs[idx & mask].len;

        ret = -EINVAL;
        if (rem < len)
                goto out_free;

        /* Second pass: take @len bytes worth of buffers out of the pipe */
        rem = len;
        while (rem) {
                struct pipe_buffer *ibuf;
                struct pipe_buffer *obuf;

                if (WARN_ON(nbuf >= count || tail == head))
                        goto out_free;

                ibuf = &pipe->bufs[tail & mask];
                obuf = &bufs[nbuf];

                if (rem >= ibuf->len) {
                        /*
                         * The whole buffer is consumed: copy it, clear the
                         * ops of the pipe's slot and advance the tail.
                         */
                        *obuf = *ibuf;
                        ibuf->ops = NULL;
                        tail++;
                        pipe->tail = tail;
                } else {
                        /*
                         * Only the front of the buffer is needed: take an
                         * extra reference and split it, leaving the
                         * remainder in the pipe.
                         */
                        if (!pipe_buf_get(pipe, ibuf))
                                goto out_free;

                        *obuf = *ibuf;
                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
                        obuf->len = rem;
                        ibuf->offset += obuf->len;
                        ibuf->len -= obuf->len;
                }
                nbuf++;
                rem -= obuf->len;
        }
        pipe_unlock(pipe);

        /* No iov_iter here; the copy state reads from the pipe buffers */
        fuse_copy_init(&cs, 0, NULL);
        cs.pipebufs = bufs;
        cs.nr_segs = nbuf;
        cs.pipe = pipe;

        if (flags & SPLICE_F_MOVE)
                cs.move_pages = 1;

        ret = fuse_dev_do_write(fud, &cs, len);

        /* out_free is entered with the pipe locked, so lock it again here */
        pipe_lock(pipe);
out_free:
        /* Release every collected buffer that still has ops set */
        for (idx = 0; idx < nbuf; idx++) {
                struct pipe_buffer *buf = &bufs[idx];

                if (buf->ops)
                        pipe_buf_release(pipe, buf);
        }
        pipe_unlock(pipe);

        kvfree(bufs);
        return ret;
}

/*
 * poll handler of the fuse device.
 *
 * The device is always reported writable.  It is reported readable when a
 * request is pending on the input queue.  EPOLLERR alone is returned when
 * the file has no fuse device attached or the input queue is disconnected.
 */
static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
{
        struct fuse_dev *fud = fuse_get_dev(file);
        struct fuse_iqueue *fiq;
        __poll_t events;

        if (!fud)
                return EPOLLERR;

        fiq = &fud->fc->iq;
        poll_wait(file, &fiq->waitq, wait);

        spin_lock(&fiq->lock);
        if (!fiq->connected)
                events = EPOLLERR;
        else if (request_pending(fiq))
                events = EPOLLOUT | EPOLLWRNORM | EPOLLIN | EPOLLRDNORM;
        else
                events = EPOLLOUT | EPOLLWRNORM;
        spin_unlock(&fiq->lock);

        return events;
}

/* Abort all requests on the given list (pending or processing) */
static void end_requests(struct list_head *head)
{
        while (!list_empty(head)) {
                struct fuse_req *req;
                req = list_entry(head->next, struct fuse_req, list);
                req->out.h.error = -ECONNABORTED;
                clear_bit(FR_SENT, &req->flags);
                list_del_init(&req->list);
                fuse_request_end(req);
        }
}

/* Wake up all waiters of every file in the connection's polled_files tree */
static void end_polls(struct fuse_conn *fc)
{
        struct rb_node *node;

        for (node = rb_first(&fc->polled_files); node; node = rb_next(node)) {
                struct fuse_file *ff;

                ff = rb_entry(node, struct fuse_file, polled_node);
                wake_up_interruptible_all(&ff->poll_wait);
        }
}

/*
 * Abort all requests.
 *
 * Emergency exit in case of a malicious or accidental deadlock, or just a hung
 * filesystem.
 *
 * The same effect is usually achievable through killing the filesystem daemon
 * and all users of the filesystem.  The exception is the combination of an
 * asynchronous request and the tricky deadlock (see
 * Documentation/filesystems/fuse.rst).
 *
 * Aborting requests under I/O goes as follows:
 *
 *  1: Separate out unlocked requests, they should be finished off
 *     immediately.  Locked requests will be finished after unlock; see
 *     unlock_request().
 *
 *  2: Finish off the unlocked requests.  It is possible that some request
 *     will finish before we can.  This is OK, the request will in that case
 *     be removed from the list before we touch it.
 *
 * Nothing is done if the connection is already disconnected.
 */
void fuse_abort_conn(struct fuse_conn *fc)
{
        struct fuse_iqueue *fiq = &fc->iq;
        struct fuse_req *req, *next;
        struct fuse_dev *fud;
        LIST_HEAD(to_end);
        unsigned int i;

        spin_lock(&fc->lock);
        if (!fc->connected) {
                spin_unlock(&fc->lock);
                return;
        }

        /* Background queuing checks fc->connected under bg_lock */
        spin_lock(&fc->bg_lock);
        fc->connected = 0;
        spin_unlock(&fc->bg_lock);

        fuse_set_initialized(fc);

        /* Disconnect each device's processing queue and collect its requests */
        list_for_each_entry(fud, &fc->devices, entry) {
                struct fuse_pqueue *fpq = &fud->pq;

                spin_lock(&fpq->lock);
                fpq->connected = 0;
                list_for_each_entry_safe(req, next, &fpq->io, list) {
                        req->out.h.error = -ECONNABORTED;
                        spin_lock(&req->waitq.lock);
                        set_bit(FR_ABORTED, &req->flags);
                        /* Requests that are not locked can be ended by us */
                        if (!test_bit(FR_LOCKED, &req->flags)) {
                                set_bit(FR_PRIVATE, &req->flags);
                                __fuse_get_request(req);
                                list_move(&req->list, &to_end);
                        }
                        spin_unlock(&req->waitq.lock);
                }
                for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
                        list_splice_tail_init(&fpq->processing[i], &to_end);
                spin_unlock(&fpq->lock);
        }

        /* Lift the background limits and flush the background queue */
        spin_lock(&fc->bg_lock);
        fc->blocked = 0;
        fc->max_background = UINT_MAX;
        flush_bg_queue(fc);
        spin_unlock(&fc->bg_lock);

        /* Disconnect the input queue and take over its pending requests */
        spin_lock(&fiq->lock);
        fiq->connected = 0;
        list_for_each_entry(req, &fiq->pending, list)
                clear_bit(FR_PENDING, &req->flags);
        list_splice_tail_init(&fiq->pending, &to_end);
        while (forget_pending(fiq))
                kfree(fuse_dequeue_forget(fiq, 1, NULL));
        wake_up_all(&fiq->waitq);
        spin_unlock(&fiq->lock);

        kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
        end_polls(fc);
        wake_up_all(&fc->blocked_waitq);
        spin_unlock(&fc->lock);

        /* With all locks dropped, end the collected requests */
        end_requests(&to_end);
}
EXPORT_SYMBOL_GPL(fuse_abort_conn);

/* Sleep on fc->blocked_waitq until fc->num_waiting has dropped to zero */
void fuse_wait_aborted(struct fuse_conn *fc)
{
        /* matches implicit memory barrier in fuse_drop_waiting() */
        smp_mb();
        wait_event(fc->blocked_waitq, !atomic_read(&fc->num_waiting));
}

/*
 * release handler of the fuse device.
 *
 * Ends every request still on this device's processing queue, aborts the
 * whole connection if this was the last open device, and frees the device.
 * A file without an attached fuse device needs no cleanup.  Always
 * returns 0.
 */
int fuse_dev_release(struct inode *inode, struct file *file)
{
        struct fuse_dev *fud = fuse_get_dev(file);
        struct fuse_pqueue *fpq;
        struct fuse_conn *fc;
        LIST_HEAD(to_end);
        unsigned int i;

        if (!fud)
                return 0;

        fc = fud->fc;
        fpq = &fud->pq;

        spin_lock(&fpq->lock);
        WARN_ON(!list_empty(&fpq->io));
        for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
                list_splice_init(&fpq->processing[i], &to_end);
        spin_unlock(&fpq->lock);

        end_requests(&to_end);

        /* Are we the last open device? */
        if (atomic_dec_and_test(&fc->dev_count)) {
                WARN_ON(fc->iq.fasync != NULL);
                fuse_abort_conn(fc);
        }
        fuse_dev_free(fud);

        return 0;
}
EXPORT_SYMBOL_GPL(fuse_dev_release);

static int fuse_dev_fasync(int fd, struct file *file, int on)
{
        struct fuse_dev *fud = fuse_get_dev(file);

        if (!fud)
                return -EPERM;

        /* No locking - fasync_helper does its own locking */
        return fasync_helper(fd, file, on, &fud->fc->iq.fasync);
}

/*
 * Attach a newly allocated fuse device for connection @fc to the file @new.
 *
 * Returns -EINVAL if @new already has private data, -ENOMEM if the device
 * cannot be allocated, and 0 on success, in which case fc->dev_count has
 * been incremented.
 */
static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
{
        struct fuse_dev *clone;

        if (new->private_data)
                return -EINVAL;

        clone = fuse_dev_alloc_install(fc);
        if (clone) {
                new->private_data = clone;
                atomic_inc(&fc->dev_count);
                return 0;
        }

        return -ENOMEM;
}

/*
 * FUSE_DEV_IOC_CLONE: bind @file to the connection of the fuse device whose
 * file descriptor is read from user space at @argp.
 *
 * Returns -EFAULT if the descriptor cannot be read, -EINVAL if it does not
 * refer to an open file, uses different file operations than @file, or has
 * no fuse device attached; otherwise the result of fuse_device_clone().
 */
static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
{
        struct fuse_dev *fud;
        struct fd f;
        int oldfd;
        int res = -EINVAL;

        if (get_user(oldfd, argp))
                return -EFAULT;

        f = fdget(oldfd);
        if (!f.file)
                return -EINVAL;

        /*
         * Check against file->f_op because CUSE
         * uses the same ioctl handler.
         */
        if (f.file->f_op == file->f_op) {
                fud = fuse_get_dev(f.file);
                if (fud) {
                        mutex_lock(&fuse_mutex);
                        res = fuse_device_clone(fud->fc, file);
                        mutex_unlock(&fuse_mutex);
                }
        }

        fdput(f);
        return res;
}

/*
 * FUSE_DEV_IOC_BACKING_OPEN: copy a struct fuse_backing_map from user space
 * and pass it to fuse_backing_open().
 *
 * Returns -EPERM if @file has no fuse device attached, -EOPNOTSUPP when
 * CONFIG_FUSE_PASSTHROUGH is disabled, -EFAULT if the map cannot be copied
 * in, otherwise the result of fuse_backing_open().
 */
static long fuse_dev_ioctl_backing_open(struct file *file,
                                        struct fuse_backing_map __user *argp)
{
        struct fuse_backing_map map;
        struct fuse_dev *fud;

        fud = fuse_get_dev(file);
        if (!fud)
                return -EPERM;

        if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
                return -EOPNOTSUPP;

        if (copy_from_user(&map, argp, sizeof(map)) != 0)
                return -EFAULT;

        return fuse_backing_open(fud->fc, &map);
}

/*
 * FUSE_DEV_IOC_BACKING_CLOSE: read a backing ID from user space and pass it
 * to fuse_backing_close().
 *
 * Returns -EPERM if @file has no fuse device attached, -EOPNOTSUPP when
 * CONFIG_FUSE_PASSTHROUGH is disabled, -EFAULT if the ID cannot be read,
 * otherwise the result of fuse_backing_close().
 */
static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
{
        struct fuse_dev *fud;
        int backing_id;

        fud = fuse_get_dev(file);
        if (!fud)
                return -EPERM;

        if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
                return -EOPNOTSUPP;

        if (get_user(backing_id, argp) != 0)
                return -EFAULT;

        return fuse_backing_close(fud->fc, backing_id);
}

static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
                           unsigned long arg)
{
        void __user *argp = (void __user *)arg;

        switch (cmd) {
        case FUSE_DEV_IOC_CLONE:
                return fuse_dev_ioctl_clone(file, argp);

        case FUSE_DEV_IOC_BACKING_OPEN:
                return fuse_dev_ioctl_backing_open(file, argp);

        case FUSE_DEV_IOC_BACKING_CLOSE:
                return fuse_dev_ioctl_backing_close(file, argp);

        default:
                return -ENOTTY;
        }
}

/*
 * File operations of the fuse device.  Replies and notifications from the
 * daemon arrive through write_iter/splice_write (both end up in
 * fuse_dev_do_write()).  The same f_op pointer is also compared against in
 * fuse_dev_ioctl_clone() to recognise fuse device files.
 */
const struct file_operations fuse_dev_operations = {
        .owner                = THIS_MODULE,
        .open                = fuse_dev_open,
        .llseek                = no_llseek,
        .read_iter        = fuse_dev_read,
        .splice_read        = fuse_dev_splice_read,
        .write_iter        = fuse_dev_write,
        .splice_write        = fuse_dev_splice_write,
        .poll                = fuse_dev_poll,
        .release        = fuse_dev_release,
        .fasync                = fuse_dev_fasync,
        .unlocked_ioctl = fuse_dev_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
};
EXPORT_SYMBOL_GPL(fuse_dev_operations);

/*
 * Misc device named "fuse" with the fixed minor FUSE_MINOR, backed by
 * fuse_dev_operations.  Registered in fuse_dev_init() and removed again in
 * fuse_dev_cleanup().
 */
static struct miscdevice fuse_miscdevice = {
        .minor = FUSE_MINOR,
        .name  = "fuse",
        .fops = &fuse_dev_operations,
};

/*
 * Create the "fuse_request" slab cache and register the fuse misc device.
 *
 * Returns 0 on success, -ENOMEM if the cache cannot be created, or the
 * misc_register() error, in which case the cache is destroyed again.
 */
int __init fuse_dev_init(void)
{
        int err;

        fuse_req_cachep = kmem_cache_create("fuse_request",
                                            sizeof(struct fuse_req),
                                            0, 0, NULL);
        if (!fuse_req_cachep)
                return -ENOMEM;

        err = misc_register(&fuse_miscdevice);
        if (err)
                kmem_cache_destroy(fuse_req_cachep);

        return err;
}

/*
 * Undo fuse_dev_init(): deregister the misc device, then destroy the
 * request slab cache.
 */
void fuse_dev_cleanup(void)
{
        misc_deregister(&fuse_miscdevice);
        kmem_cache_destroy(fuse_req_cachep);
}






























































































    2 































































































































































































































































































































































































































































































































   12 
   13 













































































































































































































































































































































































    1 







    1 
    1 

    1 






    1 
    1 

    1 

    1 


    1 
















    1 




    1 
    1 







































































































































































































































































































































































































































































































































   17 


   18 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
 *   Copyright (C) 2004 IBM Corporation, Rusty Russell.
 *   Copyright (C) 2009 Red Hat, Inc.
 *
 * Creation is done via kthreadd, so that we get a clean environment
 * even if we're invoked from userspace (think modprobe, hotplug cpu,
 * etc.).
 */
#include <uapi/linux/sched/types.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
#include <linux/sched/isolation.h>
#include <trace/events/sched.h>


/*
 * List of outstanding creation requests; presumably protected by
 * kthread_create_lock and linked through kthread_create_info.list
 * (NOTE(review): the users are outside this excerpt - confirm).
 */
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
/* Presumably the task_struct of kthreadd itself - TODO confirm */
struct task_struct *kthreadd_task;

/*
 * One kthread creation request: the parameters handed to the new thread plus
 * the fields through which the result is reported back to the creator.
 */
struct kthread_create_info
{
        /* Information passed to kthread() from kthreadd. */
        char *full_name;
        int (*threadfn)(void *data);
        void *data;
        int node;

        /* Result passed back to kthread_create() from kthreadd. */
        struct task_struct *result;
        struct completion *done;

        /* List linkage; presumably on kthread_create_list - TODO confirm */
        struct list_head list;
};

/*
 * Per-kthread state.  Allocated by set_kthread_struct(), reachable through
 * task_struct::worker_private and freed by free_kthread_struct().
 */
struct kthread {
        /* Bit field indexed by enum KTHREAD_BITS */
        unsigned long flags;
        unsigned int cpu;
        int result;
        /* Presumably the thread function and its argument - TODO confirm */
        int (*threadfn)(void *);
        void *data;
        struct completion parked;
        /* The task's vfork_done points at this completion */
        struct completion exited;
#ifdef CONFIG_BLK_CGROUP
        /* Expected to be NULL by the time the struct is freed */
        struct cgroup_subsys_state *blkcg_css;
#endif
        /* To store the full name if task comm is truncated. */
        char *full_name;
};

/* Bit numbers for kthread::flags, used with the test_bit() family */
enum KTHREAD_BITS {
        KTHREAD_IS_PER_CPU = 0,
        KTHREAD_SHOULD_STOP,
        KTHREAD_SHOULD_PARK,
};

/*
 * Return the struct kthread stored in @k->worker_private.  Warns if @k does
 * not have PF_KTHREAD set, but returns the pointer regardless.
 */
static inline struct kthread *to_kthread(struct task_struct *k)
{
        WARN_ON(!(k->flags & PF_KTHREAD));
        return k->worker_private;
}

/*
 * Variant of to_kthread() that doesn't assume @p is a kthread.
 *
 * Per construction; when:
 *
 *   (p->flags & PF_KTHREAD) && p->worker_private
 *
 * the task is both a kthread and struct kthread is persistent. However
 * PF_KTHREAD on its own is not, kernel_thread() can exec() (See umh.c and
 * begin_new_exec()).
 *
 * Returns NULL unless both conditions hold.
 */
static inline struct kthread *__to_kthread(struct task_struct *p)
{
        struct kthread *kt = p->worker_private;

        if (!kt)
                return NULL;

        return (p->flags & PF_KTHREAD) ? kt : NULL;
}

/*
 * Copy the name of kthread @tsk into @buf (at most @buf_size bytes).
 *
 * The untruncated name kept in struct kthread is preferred; if there is no
 * struct kthread or no full name, fall back to __get_task_comm().
 */
void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
        struct kthread *kthread = to_kthread(tsk);

        if (kthread && kthread->full_name)
                strscpy_pad(buf, kthread->full_name, buf_size);
        else
                __get_task_comm(buf, buf_size, tsk);
}

/*
 * Allocate a zeroed struct kthread and attach it to @p.
 *
 * Both completions are initialised and @p->vfork_done is pointed at the
 * "exited" completion.
 *
 * Returns true on success, false if @p already has a struct kthread
 * (with a one-time warning) or if the allocation fails.
 */
bool set_kthread_struct(struct task_struct *p)
{
        struct kthread *new_kthread;

        if (WARN_ON_ONCE(to_kthread(p)))
                return false;

        new_kthread = kzalloc(sizeof(*new_kthread), GFP_KERNEL);
        if (!new_kthread)
                return false;

        init_completion(&new_kthread->parked);
        init_completion(&new_kthread->exited);

        p->worker_private = new_kthread;
        p->vfork_done = &new_kthread->exited;
        return true;
}

/*
 * Detach and free the struct kthread of @k together with its full name.
 * Does nothing when @k has no struct kthread.
 */
void free_kthread_struct(struct task_struct *k)
{
        struct kthread *victim = to_kthread(k);

        /* NULL if the allocation in set_kthread_struct() failed. */
        if (victim) {
#ifdef CONFIG_BLK_CGROUP
                WARN_ON_ONCE(victim->blkcg_css);
#endif
                k->worker_private = NULL;
                kfree(victim->full_name);
                kfree(victim);
        }
}

/**
 * kthread_should_stop - should this kthread return now?
 *
 * When someone calls kthread_stop() on your kthread, it will be woken
 * and this will return true.  You should then return, and your return
 * value will be passed through to kthread_stop().
 */
bool kthread_should_stop(void)
{
        return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
}
EXPORT_SYMBOL(kthread_should_stop);

static bool __kthread_should_park(struct task_struct *k)
{
        return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
}

/**
 * kthread_should_park - should this kthread park now?
 *
 * Reports whether KTHREAD_SHOULD_PARK is set for the calling kthread.
 * kthread_park() sets that bit and wakes the thread; the thread should
 * then do whatever cleanup it needs and call kthread_parkme().
 *
 * Unlike a stop request, a park request keeps the thread alive: it
 * sleeps in kthread_parkme() until kthread_unpark() clears the bit.
 */
bool kthread_should_park(void)
{
        struct task_struct *self = current;

        return __kthread_should_park(self);
}
EXPORT_SYMBOL_GPL(kthread_should_park);

/*
 * Report whether the current task is a kthread with either a stop or a
 * park request pending.  Returns false for tasks without a struct kthread.
 */
bool kthread_should_stop_or_park(void)
{
        const unsigned long mask = BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK);
        struct kthread *kthread = __to_kthread(current);

        return kthread && (kthread->flags & mask);
}

/**
 * kthread_freezable_should_stop - should this freezable kthread return now?
 * @was_frozen: optional out parameter, indicates whether %current was frozen
 *
 * kthread_should_stop() for freezable kthreads.  If %current is being
 * frozen it enters the refrigerator first; the value returned by
 * __refrigerator() is reported through @was_frozen when that is non-NULL
 * (false if no freeze was attempted).  This function is safe from
 * kthread_stop() / freezer deadlock and freezable kthreads should use it
 * instead of calling try_to_freeze() directly.
 *
 * May sleep.
 */
bool kthread_freezable_should_stop(bool *was_frozen)
{
        bool frozen;

        might_sleep();

        /* __refrigerator() is only entered when a freeze is in progress. */
        frozen = unlikely(freezing(current)) && __refrigerator(true);
        if (was_frozen)
                *was_frozen = frozen;

        return kthread_should_stop();
}
EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);

/**
 * kthread_func - return the function specified on kthread creation
 * @task: kthread task in question
 *
 * Returns the thread function recorded for @task, or NULL if the task is
 * not a kthread.
 */
void *kthread_func(struct task_struct *task)
{
        struct kthread *kthread = __to_kthread(task);

        if (!kthread)
                return NULL;

        return kthread->threadfn;
}
EXPORT_SYMBOL_GPL(kthread_func);

/**
 * kthread_data - return data value specified on kthread creation
 * @task: kthread task in question
 *
 * Return the data value specified when kthread @task was created.
 * The caller is responsible for ensuring the validity of @task when
 * calling this function.
 */
void *kthread_data(struct task_struct *task)
{
        return to_kthread(task)->data;
}
EXPORT_SYMBOL_GPL(kthread_data);

/**
 * kthread_probe_data - speculative version of kthread_data()
 * @task: possible kthread task in question
 *
 * @task could be a kthread task.  Return the data value specified when it
 * was created if accessible.  If @task isn't a kthread task or its data is
 * inaccessible for any reason, %NULL is returned.  This function requires
 * that @task itself is safe to dereference.
 */
void *kthread_probe_data(struct task_struct *task)
{
        struct kthread *kthread = __to_kthread(task);
        void *data = NULL;

        if (!kthread)
                return NULL;

        /* Read through the nofault helper; the pointer may not be readable. */
        copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
        return data;
}

/*
 * Park the calling kthread for as long as KTHREAD_SHOULD_PARK is set in
 * @self->flags.
 *
 * Each pass through the loop signals @self->parked and then sleeps in
 * TASK_PARKED.  Once the bit is found clear the loop ends and the task
 * state is put back to TASK_RUNNING.
 */
static void __kthread_parkme(struct kthread *self)
{
        for (;;) {
                /*
                 * TASK_PARKED is a special state; we must serialize against
                 * possible pending wakeups to avoid store-store collisions on
                 * task->state.
                 *
                 * Such a collision might possibly result in the task state
                 * changing from TASK_PARKED and us failing the
                 * wait_task_inactive() in kthread_park().
                 */
                set_special_state(TASK_PARKED);
                if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
                        break;

                /*
                 * Thread is going to call schedule(), do not preempt it,
                 * or the caller of kthread_park() may spend more time in
                 * wait_task_inactive().
                 */
                preempt_disable();
                complete(&self->parked);
                schedule_preempt_disabled();
                preempt_enable();
        }
        __set_current_state(TASK_RUNNING);
}

/*
 * Park the calling kthread if a park request is pending; returns once
 * KTHREAD_SHOULD_PARK is no longer set.
 */
void kthread_parkme(void)
{
        struct kthread *self = to_kthread(current);

        __kthread_parkme(self);
}
EXPORT_SYMBOL_GPL(kthread_parkme);

/**
 * kthread_exit - Make the current kthread return @result to kthread_stop().
 * @result: The integer value to return to kthread_stop().
 *
 * Records @result in the calling thread's struct kthread (the field is an
 * int, so the value is converted) and terminates the thread via do_exit().
 *
 * While kthread_exit can be called directly, it exists so that
 * functions which do some additional work in non-modular code such as
 * module_put_and_kthread_exit can be implemented.
 *
 * Does not return.
 */
void __noreturn kthread_exit(long result)
{
        to_kthread(current)->result = result;
        do_exit(0);
}
EXPORT_SYMBOL(kthread_exit);

/**
 * kthread_complete_and_exit - Exit the current kthread.
 * @comp: Completion to complete, may be NULL
 * @code: The integer value to return to kthread_stop().
 *
 * If @comp is non-NULL it is completed first; the thread then exits
 * through kthread_exit() with @code.
 *
 * A kernel thread whose module may be removed after the completion of
 * @comp can use this function to exit safely.
 *
 * Does not return.
 */
void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
{
        struct completion *pending = comp;

        if (pending)
                complete(pending);

        kthread_exit(code);
}
EXPORT_SYMBOL(kthread_complete_and_exit);

/*
 * Entry point of every thread started by create_kthread().
 *
 * @_create is the struct kthread_create_info describing the request.  The
 * thread claims create->done with xchg(): if it is already NULL the
 * creator gave up, so the request is freed here and the thread exits with
 * -EINTR.  Otherwise the thread records its name, function and data in
 * its struct kthread, resets scheduling policy and CPU mask, publishes
 * itself through create->result, completes @done and sleeps until woken.
 *
 * After the wakeup the thread function is called unless a stop was
 * requested in the meantime, in which case the exit code is -EINTR.
 */
static int kthread(void *_create)
{
        static const struct sched_param param = { .sched_priority = 0 };
        /*
         * Take local copies: the creator frees @create once @done has been
         * completed, so it must not be dereferenced after that point.
         */
        struct kthread_create_info *create = _create;
        int (*threadfn)(void *data) = create->threadfn;
        void *data = create->data;
        struct completion *done;
        struct kthread *self;
        int ret;

        self = to_kthread(current);

        /* Release the structure when caller killed by a fatal signal. */
        done = xchg(&create->done, NULL);
        if (!done) {
                kfree(create->full_name);
                kfree(create);
                kthread_exit(-EINTR);
        }

        /* The name buffer now belongs to struct kthread. */
        self->full_name = create->full_name;
        self->threadfn = threadfn;
        self->data = data;

        /*
         * The new thread inherited kthreadd's priority and CPU mask. Reset
         * back to default in case they have been changed.
         */
        sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
        set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));

        /* OK, tell user we're spawned, wait for stop or wakeup */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        create->result = current;
        /*
         * Thread is going to call schedule(), do not preempt it,
         * or the creator may spend more time in wait_task_inactive().
         */
        preempt_disable();
        complete(done);
        schedule_preempt_disabled();
        preempt_enable();

        /* Default exit code when stopped before the function ever ran. */
        ret = -EINTR;
        if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
                cgroup_kthread_ready();
                /* Honour a park request issued before the first wakeup. */
                __kthread_parkme(self);
                ret = threadfn(data);
        }
        kthread_exit(ret);
}

/*
 * Called from kernel_clone() to get node information for the task that is
 * about to be created.  Only kthreadd carries a preferred node (and only
 * with CONFIG_NUMA); every other caller gets NUMA_NO_NODE.
 */
int tsk_fork_get_node(struct task_struct *tsk)
{
        int node = NUMA_NO_NODE;

#ifdef CONFIG_NUMA
        if (tsk == kthreadd_task)
                node = tsk->pref_node_fork;
#endif
        return node;
}

/*
 * Spawn the thread described by @create.
 *
 * On success nothing more is done here: the new thread, running
 * kthread(), takes over the request.  If kernel_thread() fails, the name
 * is freed and the error is either reported to the waiting creator
 * through create->result, or, if the creator already gave up (create->done
 * is NULL), the request itself is freed.
 */
static void create_kthread(struct kthread_create_info *create)
{
        struct completion *done;
        int pid;

#ifdef CONFIG_NUMA
        current->pref_node_fork = create->node;
#endif
        /* We want our own signal handler (we take no signals by default). */
        pid = kernel_thread(kthread, create, create->full_name,
                            CLONE_FS | CLONE_FILES | SIGCHLD);
        if (pid >= 0)
                return;

        /* Release the structure when caller killed by a fatal signal. */
        done = xchg(&create->done, NULL);

        kfree(create->full_name);
        if (!done) {
                kfree(create);
                return;
        }

        create->result = ERR_PTR(pid);
        complete(done);
}

/*
 * Common implementation behind kthread_create_on_node() and
 * __kthread_create_worker().
 *
 * Builds a creation request (name formatted from @namefmt/@args), queues
 * it on kthread_create_list, wakes kthreadd and waits for the outcome.
 *
 * Returns the new task, or an ERR_PTR(): -ENOMEM if the request or its
 * name cannot be allocated, -EINTR if the caller is killed while waiting,
 * or the error reported by create_kthread() through create->result.
 */
static __printf(4, 0)
struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
                                                    void *data, int node,
                                                    const char namefmt[],
                                                    va_list args)
{
        DECLARE_COMPLETION_ONSTACK(done);
        struct task_struct *task;
        struct kthread_create_info *create = kmalloc(sizeof(*create),
                                                     GFP_KERNEL);

        if (!create)
                return ERR_PTR(-ENOMEM);
        create->threadfn = threadfn;
        create->data = data;
        create->node = node;
        create->done = &done;
        create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
        if (!create->full_name) {
                task = ERR_PTR(-ENOMEM);
                goto free_create;
        }

        spin_lock(&kthread_create_lock);
        list_add_tail(&create->list, &kthread_create_list);
        spin_unlock(&kthread_create_lock);

        wake_up_process(kthreadd_task);
        /*
         * Wait for completion in killable state, for I might be chosen by
         * the OOM killer while kthreadd is trying to allocate memory for
         * new kernel thread.
         */
        if (unlikely(wait_for_completion_killable(&done))) {
                /*
                 * If I was killed by a fatal signal before kthreadd (or new
                 * kernel thread) calls complete(), leave the cleanup of this
                 * structure to that thread.
                 */
                if (xchg(&create->done, NULL))
                        return ERR_PTR(-EINTR);
                /*
                 * kthreadd (or new kernel thread) will call complete()
                 * shortly.
                 */
                wait_for_completion(&done);
        }
        task = create->result;
free_create:
        /* full_name is not freed here: it was handed on or already released. */
        kfree(create);
        return task;
}

/**
 * kthread_create_on_node - create a kthread.
 * @threadfn: the function the new thread runs.
 * @data: data ptr for @threadfn.
 * @node: task and thread structures for the thread are allocated on this node
 * @namefmt: printf-style name for the thread.
 *
 * Description: This helper function creates and names a kernel
 * thread.  The thread will be stopped: use wake_up_process() to start
 * it.  See also kthread_run().  The new thread has SCHED_NORMAL policy and
 * its CPU affinity is reset to the HK_TYPE_KTHREAD housekeeping mask.
 *
 * If thread is going to be bound on a particular cpu, give its node
 * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
 * When woken, the thread will run @threadfn() with @data as its
 * argument. @threadfn() can either return directly if it is a
 * standalone thread for which no one will call kthread_stop(), or
 * return when 'kthread_should_stop()' is true (which means
 * kthread_stop() has been called).  The return value should be zero
 * or a negative error number; it will be passed to kthread_stop().
 *
 * Returns a task_struct, ERR_PTR(-ENOMEM), ERR_PTR(-EINTR), or the
 * ERR_PTR() of the error with which spawning the thread failed.
 */
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
                                           void *data, int node,
                                           const char namefmt[],
                                           ...)
{
        struct task_struct *new_task;
        va_list ap;

        va_start(ap, namefmt);
        new_task = __kthread_create_on_node(threadfn, data, node, namefmt, ap);
        va_end(ap);

        return new_task;
}
EXPORT_SYMBOL(kthread_create_on_node);

/*
 * Restrict @p to @mask and set PF_NO_SETAFFINITY on it.
 *
 * @p must be inactive in scheduler state @state; if wait_task_inactive()
 * reports otherwise a warning is issued and nothing is changed.
 */
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
        unsigned long irq_flags;

        if (WARN_ON(!wait_task_inactive(p, state)))
                return;

        /* It's safe because the task is inactive. */
        raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
        do_set_cpus_allowed(p, mask);
        p->flags |= PF_NO_SETAFFINITY;
        raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
}

/* Bind @p, expected to be inactive in @state, to the single CPU @cpu. */
static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
{
        const struct cpumask *single_cpu = cpumask_of(cpu);

        __kthread_bind_mask(p, single_cpu, state);
}

/*
 * Bind kthread @p to the CPUs in @mask.  @p is expected to be inactive in
 * TASK_UNINTERRUPTIBLE.
 */
void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
{
        const unsigned int expected_state = TASK_UNINTERRUPTIBLE;

        __kthread_bind_mask(p, mask, expected_state);
}

/**
 * kthread_bind - bind a just-created kthread to a cpu.
 * @p: thread created by kthread_create().
 * @cpu: cpu (might not be online, must be possible) for @p to run on.
 *
 * Description: This function is equivalent to set_cpus_allowed(),
 * except that @cpu doesn't need to be online, and the thread must be
 * stopped (i.e., just returned from kthread_create()).  The thread is
 * also marked PF_NO_SETAFFINITY.
 */
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
        const unsigned int expected_state = TASK_UNINTERRUPTIBLE;

        __kthread_bind(p, cpu, expected_state);
}
EXPORT_SYMBOL(kthread_bind);

/**
 * kthread_create_on_cpu - Create a cpu bound kthread
 * @threadfn: the function the new thread runs.
 * @data: data ptr for @threadfn.
 * @cpu: The cpu on which the thread should be bound,
 * @namefmt: printf-style name for the thread. Format is restricted
 *             to "name.*%u". Code fills in cpu number.
 *
 * Description: This helper function creates and names a kernel thread
 * on the node of @cpu, binds it to @cpu and records @cpu in its
 * struct kthread.
 *
 * Returns the new task, or the ERR_PTR() from kthread_create_on_node().
 */
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
                                          void *data, unsigned int cpu,
                                          const char *namefmt)
{
        struct task_struct *p;

        /* @cpu is the single argument consumed by @namefmt. */
        p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
                                   cpu);
        if (!IS_ERR(p)) {
                kthread_bind(p, cpu);
                /* CPU hotplug need to bind once again when unparking the thread. */
                to_kthread(p)->cpu = cpu;
        }

        return p;
}
EXPORT_SYMBOL(kthread_create_on_cpu);

/*
 * Mark kthread @k as per-CPU on @cpu, or clear the per-CPU marking when
 * @cpu is negative.  Does nothing if @k has no struct kthread.  Warns
 * once if @k is not flagged PF_NO_SETAFFINITY.
 */
void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
        struct kthread *kthread = to_kthread(k);

        if (!kthread)
                return;

        WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));

        if (cpu >= 0) {
                kthread->cpu = cpu;
                set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
        } else {
                clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
        }
}

/*
 * Report whether @p is a kthread marked per-CPU by kthread_set_per_cpu().
 * Returns false for tasks without a struct kthread.
 */
bool kthread_is_per_cpu(struct task_struct *p)
{
        struct kthread *kthread = __to_kthread(p);

        return kthread && test_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
}

/**
 * kthread_unpark - unpark a thread created by kthread_create().
 * @k:                thread created by kthread_create().
 *
 * Sets kthread_should_park() for @k to return false and wakes it if it
 * is sleeping in TASK_PARKED.  This function does not wait for the
 * thread to resume.  If the thread is marked percpu then it is bound to
 * its cpu again first.
 *
 * Calling this on a thread that has no park request pending is a no-op.
 */
void kthread_unpark(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        /*
         * Nothing to undo if no park was requested.  Without this check a
         * per-CPU thread that is not parked (e.g. when reached through
         * kthread_stop()) would be handed to __kthread_bind() below, whose
         * wait_task_inactive(k, TASK_PARKED) then fails and warns.
         */
        if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))
                return;

        /*
         * Newly created kthread was parked when the CPU was offline.
         * The binding was lost and we need to set it again.
         */
        if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
                __kthread_bind(k, kthread->cpu, TASK_PARKED);

        clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        /*
         * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
         */
        wake_up_state(k, TASK_PARKED);
}
EXPORT_SYMBOL_GPL(kthread_unpark);

/**
 * kthread_park - park a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_park() for @k to return true, wakes it, and
 * waits for it to return. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will park without
 * calling threadfn().
 *
 * Returns 0 if the thread is parked, -ENOSYS if the thread exited
 * (PF_EXITING is set), or -EBUSY if a park request is already pending
 * for @k.  Both error cases also trigger a warning.
 * If called by the kthread itself just the park bit is set.
 */
int kthread_park(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        if (WARN_ON(k->flags & PF_EXITING))
                return -ENOSYS;

        if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
                return -EBUSY;

        set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        if (k != current) {
                wake_up_process(k);
                /*
                 * Wait for __kthread_parkme() to complete(), this means we
                 * _will_ have TASK_PARKED and are about to call schedule().
                 */
                wait_for_completion(&kthread->parked);
                /*
                 * Now wait for that schedule() to complete and the task to
                 * get scheduled out.
                 */
                WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
        }

        return 0;
}
EXPORT_SYMBOL_GPL(kthread_park);

/**
 * kthread_stop - stop a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_stop() for @k to return true, wakes it, and
 * waits for it to exit. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will exit without
 * calling threadfn().
 *
 * If threadfn() may call kthread_exit() itself, the caller must ensure
 * task_struct can't go away.
 *
 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
 * was never called.
 */
int kthread_stop(struct task_struct *k)
{
        struct kthread *kthread;
        int ret;

        trace_sched_kthread_stop(k);

        /* Hold a reference on @k for the duration of the call. */
        get_task_struct(k);
        kthread = to_kthread(k);
        set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
        /* Release the thread from a parked state so it can notice the stop. */
        kthread_unpark(k);
        set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL);
        wake_up_process(k);
        /* "exited" is the completion installed as the task's vfork_done. */
        wait_for_completion(&kthread->exited);
        /* Value recorded by kthread_exit(). */
        ret = kthread->result;
        put_task_struct(k);

        trace_sched_kthread_stop_ret(ret);
        return ret;
}
EXPORT_SYMBOL(kthread_stop);

/**
 * kthread_stop_put - stop a thread and put its task struct
 * @k: thread created by kthread_create().
 *
 * Stops a thread created by kthread_create() and then drops one
 * task_struct reference.  Only use when holding an extra task struct
 * reference obtained by calling get_task_struct().
 *
 * Returns whatever kthread_stop() returned.
 */
int kthread_stop_put(struct task_struct *k)
{
        int ret = kthread_stop(k);

        put_task_struct(k);
        return ret;
}
EXPORT_SYMBOL(kthread_stop_put);

/*
 * Main loop of the kthreadd task: sleep until creation requests appear on
 * kthread_create_list, then spawn one thread per request through
 * create_kthread().  The loop never terminates.
 */
int kthreadd(void *unused)
{
        struct task_struct *tsk = current;

        /* Setup a clean context for our children to inherit. */
        set_task_comm(tsk, "kthreadd");
        ignore_signals(tsk);
        set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
        set_mems_allowed(node_states[N_MEMORY]);

        current->flags |= PF_NOFREEZE;
        cgroup_init_kthreadd();

        for (;;) {
                /*
                 * Set the sleeping state before checking the list so that a
                 * wake_up_process() issued after a request was queued is
                 * not lost.
                 */
                set_current_state(TASK_INTERRUPTIBLE);
                if (list_empty(&kthread_create_list))
                        schedule();
                __set_current_state(TASK_RUNNING);

                spin_lock(&kthread_create_lock);
                while (!list_empty(&kthread_create_list)) {
                        struct kthread_create_info *create;

                        create = list_entry(kthread_create_list.next,
                                            struct kthread_create_info, list);
                        list_del_init(&create->list);
                        /* The lock is dropped while the thread is being spawned. */
                        spin_unlock(&kthread_create_lock);

                        create_kthread(create);

                        spin_lock(&kthread_create_lock);
                }
                spin_unlock(&kthread_create_lock);
        }

        return 0;
}

/*
 * Reset @worker to a pristine state: all fields zeroed, the lock
 * initialised with lockdep class @key / @name, and both work lists empty.
 */
void __kthread_init_worker(struct kthread_worker *worker,
                                const char *name,
                                struct lock_class_key *key)
{
        memset(worker, 0, sizeof(*worker));

        raw_spin_lock_init(&worker->lock);
        lockdep_set_class_and_name(&worker->lock, key, name);

        INIT_LIST_HEAD(&worker->delayed_work_list);
        INIT_LIST_HEAD(&worker->work_list);
}
EXPORT_SYMBOL_GPL(__kthread_init_worker);

/**
 * kthread_worker_fn - kthread function to process kthread_worker
 * @worker_ptr: pointer to initialized kthread_worker
 *
 * This function implements the main cycle of kthread worker. It processes
 * work_list until it is stopped with kthread_stop(). It sleeps when the queue
 * is empty.
 *
 * The works are not allowed to keep any locks, disable preemption or interrupts
 * when they finish. There is defined a safe point for freezing when one work
 * finishes and before a new one is started.
 *
 * Also the works must not be handled by more than one worker at the same time,
 * see also kthread_queue_work().
 *
 * Returns 0 once a stop has been requested.
 */
int kthread_worker_fn(void *worker_ptr)
{
        struct kthread_worker *worker = worker_ptr;
        struct kthread_work *work;

        /*
         * FIXME: Update the check and remove the assignment when all kthread
         * worker users are created using kthread_create_worker*() functions.
         */
        WARN_ON(worker->task && worker->task != current);
        worker->task = current;

        if (worker->flags & KTW_FREEZABLE)
                set_freezable();

repeat:
        set_current_state(TASK_INTERRUPTIBLE);        /* mb paired w/ kthread_stop */

        if (kthread_should_stop()) {
                __set_current_state(TASK_RUNNING);
                raw_spin_lock_irq(&worker->lock);
                worker->task = NULL;
                raw_spin_unlock_irq(&worker->lock);
                return 0;
        }

        /* Take the first pending work, if any, off the list. */
        work = NULL;
        raw_spin_lock_irq(&worker->lock);
        if (!list_empty(&worker->work_list)) {
                work = list_first_entry(&worker->work_list,
                                        struct kthread_work, node);
                list_del_init(&work->node);
        }
        worker->current_work = work;
        raw_spin_unlock_irq(&worker->lock);

        if (work) {
                kthread_work_func_t func = work->func;
                __set_current_state(TASK_RUNNING);
                trace_sched_kthread_work_execute_start(work);
                work->func(work);
                /*
                 * Avoid dereferencing work after this point.  The trace
                 * event only cares about the address.
                 */
                trace_sched_kthread_work_execute_end(work, func);
        } else if (!freezing(current)) {
                schedule();
        } else {
                /*
                 * Nothing to run and a freeze is in progress: the task is
                 * still TASK_INTERRUPTIBLE here.  Put it back to
                 * TASK_RUNNING before try_to_freeze() below, which must
                 * not be entered in a sleeping state.
                 */
                __set_current_state(TASK_RUNNING);
        }

        try_to_freeze();
        cond_resched();
        goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);

/*
 * Allocate a kthread_worker, create the kthread that serves it and start
 * that thread.
 *
 * A non-negative @cpu selects the NUMA node used for the thread and binds
 * the thread to that CPU; a negative @cpu leaves both unconstrained.
 *
 * Returns the worker, ERR_PTR(-ENOMEM) if the worker cannot be allocated,
 * or the error returned by __kthread_create_on_node().
 */
static __printf(3, 0) struct kthread_worker *
__kthread_create_worker(int cpu, unsigned int flags,
                        const char namefmt[], va_list args)
{
        const bool bound = cpu >= 0;
        struct kthread_worker *worker;
        struct task_struct *task;
        int node;

        worker = kzalloc(sizeof(*worker), GFP_KERNEL);
        if (!worker)
                return ERR_PTR(-ENOMEM);

        kthread_init_worker(worker);

        node = bound ? cpu_to_node(cpu) : NUMA_NO_NODE;

        task = __kthread_create_on_node(kthread_worker_fn, worker,
                                        node, namefmt, args);
        if (IS_ERR(task)) {
                kfree(worker);
                return ERR_CAST(task);
        }

        if (bound)
                kthread_bind(task, cpu);

        worker->flags = flags;
        worker->task = task;
        wake_up_process(task);
        return worker;
}

/**
 * kthread_create_worker - create a kthread worker
 * @flags: flags modifying the default behavior of the worker
 * @namefmt: printf-style name for the kthread worker (task).
 *
 * The worker thread is not bound to any particular CPU.
 *
 * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
 * when the caller was killed by a fatal signal.
 */
struct kthread_worker *
kthread_create_worker(unsigned int flags, const char namefmt[], ...)
{
        struct kthread_worker *new_worker;
        va_list ap;

        va_start(ap, namefmt);
        /* A negative CPU means "do not bind". */
        new_worker = __kthread_create_worker(-1, flags, namefmt, ap);
        va_end(ap);

        return new_worker;
}
EXPORT_SYMBOL(kthread_create_worker);

/**
 * kthread_create_worker_on_cpu - create a kthread worker and bind it
 *        to a given CPU and the associated NUMA node.
 * @cpu: CPU number
 * @flags: flags modifying the default behavior of the worker
 * @namefmt: printf-style name for the kthread worker (task).
 *
 * Use a valid CPU number if you want to bind the kthread worker
 * to the given CPU and the associated NUMA node.
 *
 * A good practice is to add the cpu number also into the worker name.
 * For example, use kthread_create_worker_on_cpu(cpu, 0, "helper/%d", cpu).
 *
 * CPU hotplug:
 * The kthread worker API is simple and generic. It just provides a way
 * to create, use, and destroy workers.
 *
 * It is up to the API user how to handle CPU hotplug. They have to decide
 * how to handle pending work items, prevent queuing new ones, and
 * restore the functionality when the CPU goes off and on. There are a
 * few catches:
 *
 *    - CPU affinity gets lost when it is scheduled on an offline CPU.
 *
 *    - The worker might not exist when the CPU was off when the user
 *      created the workers.
 *
 * Good practice is to implement two CPU hotplug callbacks and to
 * destroy/create the worker when the CPU goes down/up.
 *
 * Return:
 * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
 * when the caller was killed by a fatal signal.
 */
struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
                             const char namefmt[], ...)
{
        struct kthread_worker *new_worker;
        va_list ap;

        va_start(ap, namefmt);
        new_worker = __kthread_create_worker(cpu, flags, namefmt, ap);
        va_end(ap);

        return new_worker;
}
EXPORT_SYMBOL(kthread_create_worker_on_cpu);

/*
 * Returns true when the work could not be queued at the moment.
 * It happens when it is already pending in a worker list
 * or when it is being cancelled.
 *
 * Must be called with @worker->lock held.
 */
static inline bool queuing_blocked(struct kthread_worker *worker,
                                   struct kthread_work *work)
{
        lockdep_assert_held(&worker->lock);

        if (!list_empty(&work->node))
                return true;

        return work->canceling;
}

/*
 * Warn (once each) if @work is already on a list or is associated with a
 * worker other than @worker.  Must be called with @worker->lock held.
 */
static void kthread_insert_work_sanity_check(struct kthread_worker *worker,
                                             struct kthread_work *work)
{
        struct kthread_worker *owner;

        lockdep_assert_held(&worker->lock);
        WARN_ON_ONCE(!list_empty(&work->node));

        /* Do not use a work with >1 worker, see kthread_queue_work() */
        owner = work->worker;
        WARN_ON_ONCE(owner && owner != worker);
}

/*
 * Insert @work before @pos in @worker and associate it with @worker.
 * The worker task is woken only when it is not currently executing a
 * work item and a task is attached.
 */
static void kthread_insert_work(struct kthread_worker *worker,
                                struct kthread_work *work,
                                struct list_head *pos)
{
        kthread_insert_work_sanity_check(worker, work);

        trace_sched_kthread_work_queue_work(worker, work);

        list_add_tail(&work->node, pos);
        work->worker = worker;

        /* A busy worker picks the new item up on its next iteration. */
        if (worker->current_work)
                return;

        if (likely(worker->task))
                wake_up_process(worker->task);
}

/**
 * kthread_queue_work - queue a kthread_work
 * @worker: target kthread_worker
 * @work: kthread_work to queue
 *
 * Queue @work on @worker for async execution.  @worker should have been
 * created with kthread_create_worker().  Returns %true if @work was
 * successfully queued, %false if it was already pending or is being
 * cancelled.
 *
 * Reinitialize the work if it needs to be used by another worker.
 * For example, when the worker was stopped and started again.
 */
bool kthread_queue_work(struct kthread_worker *worker,
                        struct kthread_work *work)
{
        unsigned long irq_flags;
        bool queued;

        raw_spin_lock_irqsave(&worker->lock, irq_flags);
        queued = !queuing_blocked(worker, work);
        if (queued)
                kthread_insert_work(worker, work, &worker->work_list);
        raw_spin_unlock_irqrestore(&worker->lock, irq_flags);

        return queued;
}
EXPORT_SYMBOL_GPL(kthread_queue_work);

/**
 * kthread_delayed_work_timer_fn - callback that queues the associated kthread
 *        delayed work when the timer expires.
 * @t: pointer to the expired timer
 *
 * The format of the function is defined by struct timer_list.
 * It should have been called from irqsafe timer with irq already off.
 *
 * The work is taken off the worker's delayed list and, unless it is being
 * cancelled, appended to the worker's work list.
 */
void kthread_delayed_work_timer_fn(struct timer_list *t)
{
        struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
        struct kthread_work *work = &dwork->work;
        struct kthread_worker *worker = work->worker;
        unsigned long flags;

        /*
         * This might happen when a pending work is reinitialized.
         * It means that it is used a wrong way.
         */
        if (WARN_ON_ONCE(!worker))
                return;

        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        /* Move the work from worker->delayed_work_list. */
        WARN_ON_ONCE(list_empty(&work->node));
        list_del_init(&work->node);
        /* A work that is being cancelled is dropped instead of queued. */
        if (!work->canceling)
                kthread_insert_work(worker, work, &worker->work_list);

        raw_spin_unlock_irqrestore(&worker->lock, flags);
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);

/*
 * Queue @dwork on @worker, either right away (@delay == 0) or through its
 * timer. The callers visible in this file invoke it with worker->lock held.
 */
static void __kthread_queue_delayed_work(struct kthread_worker *worker,
                                         struct kthread_delayed_work *dwork,
                                         unsigned long delay)
{
        struct kthread_work *work = &dwork->work;
        struct timer_list *tmr = &dwork->timer;

        WARN_ON_ONCE(tmr->function != kthread_delayed_work_timer_fn);

        /*
         * A zero @delay bypasses the timer and queues @dwork->work at once.
         * This matters for correctness as well as speed: a timer cannot
         * expire before the next tick, and delayed_work users rely on
         * there being no such latency when @delay is 0.
         */
        if (delay == 0) {
                kthread_insert_work(worker, work, &worker->work_list);
                return;
        }

        /* Paranoia: try to catch possible races as early as possible. */
        kthread_insert_work_sanity_check(worker, work);

        /* Park the work on the delayed list, then arm the timer. */
        list_add(&work->node, &worker->delayed_work_list);
        work->worker = worker;
        tmr->expires = jiffies + delay;
        add_timer(tmr);
}

/**
 * kthread_queue_delayed_work - queue the associated kthread work
 *        after a delay.
 * @worker: target kthread_worker
 * @dwork: kthread_delayed_work to queue
 * @delay: number of jiffies to wait before queuing
 *
 * When @dwork is not pending yet, a timer is started that queues the work
 * once @delay has elapsed. A zero @delay queues the work immediately.
 *
 * Return: %false if @dwork has already been pending, meaning that either
 * its timer was running or the work was queued. %true otherwise.
 */
bool kthread_queue_delayed_work(struct kthread_worker *worker,
                                struct kthread_delayed_work *dwork,
                                unsigned long delay)
{
        unsigned long irq_flags;
        bool queued;

        raw_spin_lock_irqsave(&worker->lock, irq_flags);

        queued = !queuing_blocked(worker, &dwork->work);
        if (queued)
                __kthread_queue_delayed_work(worker, dwork, delay);

        raw_spin_unlock_irqrestore(&worker->lock, irq_flags);

        return queued;
}
EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);

/*
 * Helper item used by the flush functions in this file:
 * @work is the item inserted into a worker's list, and @done is the
 * completion that kthread_flush_work_fn() signals when @work runs.
 */
struct kthread_flush_work {
        struct kthread_work        work;
        struct completion        done;
};

/* Work callback of a flush item: signal whoever waits on its completion. */
static void kthread_flush_work_fn(struct kthread_work *work)
{
        complete(&container_of(work, struct kthread_flush_work, work)->done);
}

/**
 * kthread_flush_work - flush a kthread_work
 * @work: work to flush
 *
 * If @work is queued or executing, wait for it to finish execution.
 */
void kthread_flush_work(struct kthread_work *work)
{
        struct kthread_flush_work fwork = {
                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
        };
        struct kthread_worker *worker;
        bool noop = false;

        worker = work->worker;
        if (!worker)
                return;

        raw_spin_lock_irq(&worker->lock);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        if (!list_empty(&work->node))
                kthread_insert_work(worker, &fwork.work, work->node.next);
        else if (worker->current_work == work)
                kthread_insert_work(worker, &fwork.work,
                                    worker->work_list.next);
        else
                noop = true;

        raw_spin_unlock_irq(&worker->lock);

        if (!noop)
                wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_work);

/*
 * Make sure that the timer is neither set nor running and can
 * no longer manipulate the work list_head.
 *
 * The function is called under worker->lock. The lock is temporarily
 * released, but the timer can't be set again in the meantime.
 *
 * @work:  the work embedded in a struct kthread_delayed_work
 * @flags: the caller's saved irq flags for worker->lock; they are
 *         rewritten when the lock is re-acquired
 */
static void kthread_cancel_delayed_work_timer(struct kthread_work *work,
                                              unsigned long *flags)
{
        struct kthread_delayed_work *dwork =
                container_of(work, struct kthread_delayed_work, work);
        struct kthread_worker *worker = work->worker;

        /*
         * del_timer_sync() must be called to make sure that the timer
         * callback is not running. The lock must be temporarily released
         * to avoid a deadlock with the callback. In the meantime,
         * any queuing is blocked by setting the canceling counter.
         */
        work->canceling++;
        raw_spin_unlock_irqrestore(&worker->lock, *flags);
        del_timer_sync(&dwork->timer);
        raw_spin_lock_irqsave(&worker->lock, *flags);
        work->canceling--;
}

/*
 * Take the work off whichever worker list it currently sits on.
 *
 * Must be called under worker->lock. The caller is responsible for making
 * sure that the timer of a delayed work is not running, e.g. by calling
 * kthread_cancel_delayed_work_timer() first.
 *
 * The work might still be in use when this function returns; see the
 * current_work being processed by the worker.
 *
 * Return: %true if @work was pending and successfully canceled,
 *        %false if @work was not pending
 */
static bool __kthread_cancel_work(struct kthread_work *work)
{
        /* Not linked anywhere: the work was not pending. */
        if (list_empty(&work->node))
                return false;

        /*
         * The node is linked into either worker->work_list or
         * worker->delayed_work_list; unlink it from there.
         */
        list_del_init(&work->node);
        return true;
}

/**
 * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work
 * @worker: kthread worker to use
 * @dwork: kthread delayed work to queue
 * @delay: number of jiffies to wait before queuing
 *
 * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise,
 * modify @dwork's timer so that it expires after @delay. If @delay is zero,
 * @dwork is guaranteed to be queued immediately.
 *
 * Return: %false if @dwork was idle and queued, %true otherwise.
 *
 * A special case is when the work is being canceled in parallel.
 * It might be caused either by the real kthread_cancel_delayed_work_sync()
 * or yet another kthread_mod_delayed_work() call. We let the other command
 * win and return %true here. The return value can be used for reference
 * counting and the number of queued works stays the same. Anyway, the caller
 * is supposed to synchronize these operations in a reasonable way.
 *
 * This function is safe to call from any context including IRQ handler.
 * See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
 * for details.
 */
bool kthread_mod_delayed_work(struct kthread_worker *worker,
                              struct kthread_delayed_work *dwork,
                              unsigned long delay)
{
        struct kthread_work *work = &dwork->work;
        unsigned long flags;
        bool ret;

        raw_spin_lock_irqsave(&worker->lock, flags);

        /* Do not bother with canceling when never queued. */
        if (!work->worker) {
                ret = false;
                goto fast_queue;
        }

        /* Work must not be used with >1 worker, see kthread_queue_work() */
        WARN_ON_ONCE(work->worker != worker);

        /*
         * Temporarily cancel the work but do not fight with another command
         * that is canceling the work as well.
         *
         * It is a bit tricky because of possible races with another
         * mod_delayed_work() and cancel_delayed_work() callers.
         *
         * The timer must be canceled first because worker->lock is released
         * when doing so. But the work can be removed from the queue (list)
         * only when it can be queued again so that the return value can
         * be used for reference counting.
         */
        kthread_cancel_delayed_work_timer(work, &flags);
        if (work->canceling) {
                /* The number of works in the queue does not change. */
                ret = true;
                goto out;
        }
        ret = __kthread_cancel_work(work);

fast_queue:
        __kthread_queue_delayed_work(worker, dwork, delay);
out:
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
}
EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);

/*
 * Common implementation of kthread_cancel_work_sync() and
 * kthread_cancel_delayed_work_sync().
 *
 * @work:     the work to cancel
 * @is_dwork: true when @work is embedded in a struct kthread_delayed_work,
 *            in which case its timer is shut down first
 *
 * Return: %true if @work was pending, %false otherwise (including the
 * case where @work has no worker assigned).
 */
static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
{
        struct kthread_worker *worker = work->worker;
        unsigned long flags;
        bool ret;

        /* No worker assigned: there is nothing to cancel or wait for. */
        if (!worker)
                return false;

        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        if (is_dwork)
                kthread_cancel_delayed_work_timer(work, &flags);

        ret = __kthread_cancel_work(work);

        if (worker->current_work == work) {
                /*
                 * The work is in progress and we need to wait with the lock
                 * released. In the meantime, block any queuing by setting
                 * the canceling counter.
                 */
                work->canceling++;
                raw_spin_unlock_irqrestore(&worker->lock, flags);
                kthread_flush_work(work);
                raw_spin_lock_irqsave(&worker->lock, flags);
                work->canceling--;
        }

        raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
}

/**
 * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
 * @work: the kthread work to cancel
 *
 * Cancel @work and wait until its execution has finished. This works even
 * when the work re-queues itself. Once this function returns, @work is
 * guaranteed to be neither pending nor executing on any CPU.
 *
 * Do not call kthread_cancel_work_sync(&delayed_work->work) for
 * delayed_work's; kthread_cancel_delayed_work_sync() is the right call there.
 *
 * It is up to the caller to guarantee that the worker on which @work was
 * last queued cannot be destroyed before this function returns.
 *
 * Return: %true if @work was pending, %false otherwise.
 */
bool kthread_cancel_work_sync(struct kthread_work *work)
{
        const bool is_dwork = false;

        return __kthread_cancel_work_sync(work, is_dwork);
}
EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);

/**
 * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and
 *        wait for it to finish.
 * @dwork: the kthread delayed work to cancel
 *
 * The delayed-work counterpart of kthread_cancel_work_sync().
 *
 * Return: %true if @dwork was pending, %false otherwise.
 */
bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork)
{
        struct kthread_work *work = &dwork->work;

        return __kthread_cancel_work_sync(work, true);
}
EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync);

/**
 * kthread_flush_worker - flush all current works on a kthread_worker
 * @worker: worker to flush
 *
 * Wait until all currently executing or pending works on @worker are
 * finished.
 */
void kthread_flush_worker(struct kthread_worker *worker)
{
        struct kthread_flush_work fwork = {
                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
        };

        kthread_queue_work(worker, &fwork.work);
        wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_worker);

/**
 * kthread_destroy_worker - destroy a kthread worker
 * @worker: worker to be destroyed
 *
 * Flush and destroy @worker.  A plain flush is sufficient because the
 * kthread worker API is used only in trivial scenarios; no multi-step
 * state machines are needed.
 *
 * Note that this function does not handle delayed work. The caller is
 * responsible for queuing or canceling all delayed work items before
 * invoking this function.
 */
void kthread_destroy_worker(struct kthread_worker *worker)
{
        struct task_struct *task = worker->task;

        if (WARN_ON(!task))
                return;

        kthread_flush_worker(worker);
        kthread_stop(task);

        /* Both lists are expected to be empty at this point. */
        WARN_ON(!list_empty(&worker->delayed_work_list));
        WARN_ON(!list_empty(&worker->work_list));

        kfree(worker);
}
EXPORT_SYMBOL(kthread_destroy_worker);

/**
 * kthread_use_mm - make the calling kthread operate on an address space
 * @mm: address space to operate on
 *
 * The caller is expected to be a kernel thread (PF_KTHREAD) that has no
 * mm attached yet; a one-time warning is emitted otherwise. A reference
 * on @mm is taken with mmgrab(), and the lazy-TLB reference on the
 * previously active mm is dropped at the end.
 */
void kthread_use_mm(struct mm_struct *mm)
{
        struct mm_struct *active_mm;
        struct task_struct *tsk = current;

        WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
        WARN_ON_ONCE(tsk->mm);

        /*
         * It is possible for mm to be the same as tsk->active_mm, but
         * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm),
         * because these references are not equivalent.
         */
        mmgrab(mm);

        task_lock(tsk);
        /* Hold off tlb flush IPIs while switching mm's */
        local_irq_disable();
        /* Remember the old active mm so its lazy reference can be dropped. */
        active_mm = tsk->active_mm;
        tsk->active_mm = mm;
        tsk->mm = mm;
        membarrier_update_current_mm(mm);
        switch_mm_irqs_off(active_mm, mm, tsk);
        local_irq_enable();
        task_unlock(tsk);
        /* Optional hook: only called when the macro is defined. */
#ifdef finish_arch_post_lock_switch
        finish_arch_post_lock_switch();
#endif

        /*
         * When a kthread starts operating on an address space, the loop
         * in membarrier_{private,global}_expedited() may not observe
         * that tsk->mm, and not issue an IPI. Membarrier requires a
         * memory barrier after storing to tsk->mm, before accessing
         * user-space memory. A full memory barrier for membarrier
         * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
         * mmdrop_lazy_tlb().
         */
        mmdrop_lazy_tlb(active_mm);
}
EXPORT_SYMBOL_GPL(kthread_use_mm);

/**
 * kthread_unuse_mm - reverse the effect of kthread_use_mm()
 * @mm: address space to operate on
 *
 * The caller is expected to be a kernel thread (PF_KTHREAD) that currently
 * has an mm attached; a one-time warning is emitted otherwise. tsk->mm is
 * cleared, a lazy-TLB reference on @mm is taken, and finally one reference
 * on @mm is dropped with mmdrop().
 */
void kthread_unuse_mm(struct mm_struct *mm)
{
        struct task_struct *tsk = current;

        WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
        WARN_ON_ONCE(!tsk->mm);

        task_lock(tsk);
        /*
         * When a kthread stops operating on an address space, the loop
         * in membarrier_{private,global}_expedited() may not observe
         * that tsk->mm, and not issue an IPI. Membarrier requires a
         * memory barrier after accessing user-space memory, before
         * clearing tsk->mm.
         */
        smp_mb__after_spinlock();
        local_irq_disable();
        tsk->mm = NULL;
        membarrier_update_current_mm(NULL);
        /* Take the lazy-TLB reference before the mmdrop() below. */
        mmgrab_lazy_tlb(mm);
        /* active_mm is still 'mm' */
        enter_lazy_tlb(mm, tsk);
        local_irq_enable();
        task_unlock(tsk);

        /* Pairs with the mmgrab() done in kthread_use_mm(). */
        mmdrop(mm);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);

#ifdef CONFIG_BLK_CGROUP
/**
 * kthread_associate_blkcg - associate blkcg to current kthread
 * @css: the cgroup info
 *
 * Current thread must be a kthread. Such a thread runs jobs on behalf of
 * other threads, and in some cases those jobs should carry the cgroup info
 * of the original thread rather than that of the current one. This function
 * stores the original thread's cgroup info in the current kthread context
 * so that it can be retrieved later. A previously stored css is released;
 * passing a NULL @css only clears the association.
 */
void kthread_associate_blkcg(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys_state *old_css;
        struct kthread *self;

        if (!(current->flags & PF_KTHREAD))
                return;

        self = to_kthread(current);
        if (!self)
                return;

        /* Drop the reference held on the previous association, if any. */
        old_css = self->blkcg_css;
        if (old_css) {
                css_put(old_css);
                self->blkcg_css = NULL;
        }

        if (!css)
                return;

        /* Pin the new css for as long as it stays associated. */
        css_get(css);
        self->blkcg_css = css;
}
EXPORT_SYMBOL(kthread_associate_blkcg);

/**
 * kthread_blkcg - get associated blkcg css of current kthread
 *
 * Current thread must be a kthread.
 *
 * Return: the css stored in the current kthread context, or NULL when the
 * current task is not a kthread or has no kthread context.
 */
struct cgroup_subsys_state *kthread_blkcg(void)
{
        struct kthread *self;

        if (!(current->flags & PF_KTHREAD))
                return NULL;

        self = to_kthread(current);
        return self ? self->blkcg_css : NULL;
}
#endif



















































































    1 
















































































































































































































































































































































































































































































    1 


















    1 






    1 

    1 










    1 









    1 







    1 
































    1 
















    1 




















    1 


    1 




















    1 




    1 













    1 



















    1 




    1 




















    1 



















    1 







    1 






    1 




    1 









    1 
















    1 




    1 






















    1 




















    1 











































    1 



    1 













    1 


















    1 






























    1 







































    1 













    1 







    1 




















    1 























































































































    1 

    1 

    1 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
// SPDX-License-Identifier: GPL-2.0
/*
 *        mm/mremap.c
 *
 *        (C) Copyright 1996 Linus Torvalds
 *
 *        Address space accounting code        <alan@lxorguk.ukuu.org.uk>
 *        (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>

#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>

#include "internal.h"

/*
 * Walk @mm's page tables down to the PUD entry covering @addr.
 * Returns NULL as soon as one of the pgd/p4d/pud *_none_or_clear_bad()
 * checks reports true for the entry at that level.
 */
static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd_entry;
        p4d_t *p4d_entry;
        pud_t *pud_entry;

        pgd_entry = pgd_offset(mm, addr);
        if (pgd_none_or_clear_bad(pgd_entry))
                return NULL;

        p4d_entry = p4d_offset(pgd_entry, addr);
        if (p4d_none_or_clear_bad(p4d_entry))
                return NULL;

        pud_entry = pud_offset(p4d_entry, addr);
        return pud_none_or_clear_bad(pud_entry) ? NULL : pud_entry;
}

/*
 * Look up the PMD entry covering @addr in @mm.
 * Returns NULL when there is no usable PUD or when the PMD is none.
 */
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
        pud_t *pud_entry = get_old_pud(mm, addr);
        pmd_t *pmd_entry;

        if (!pud_entry)
                return NULL;

        pmd_entry = pmd_offset(pud_entry, addr);
        return pmd_none(*pmd_entry) ? NULL : pmd_entry;
}

/*
 * Obtain the PUD entry for @addr in @mm via p4d_alloc()/pud_alloc().
 * Returns NULL when either of them fails. @vma is not used here.
 */
static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
                            unsigned long addr)
{
        pgd_t *pgd_entry = pgd_offset(mm, addr);
        p4d_t *p4d_entry = p4d_alloc(mm, pgd_entry, addr);

        return p4d_entry ? pud_alloc(mm, p4d_entry, addr) : NULL;
}

/*
 * Obtain the PMD entry for @addr in @mm via alloc_new_pud()/pmd_alloc().
 * Returns NULL when either of them fails. The resulting PMD must not be
 * a transparent huge one (checked with VM_BUG_ON).
 */
static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                            unsigned long addr)
{
        pud_t *pud_entry = alloc_new_pud(mm, vma, addr);
        pmd_t *pmd_entry;

        if (!pud_entry)
                return NULL;

        pmd_entry = pmd_alloc(mm, pud_entry, addr);
        if (pmd_entry)
                VM_BUG_ON(pmd_trans_huge(*pmd_entry));

        return pmd_entry;
}

static void take_rmap_locks(struct vm_area_struct *vma)
{
        if (vma->vm_file)
                i_mmap_lock_write(vma->vm_file->f_mapping);
        if (vma->anon_vma)
                anon_vma_lock_write(vma->anon_vma);
}

/*
 * Release the locks taken by take_rmap_locks(), in the reverse order:
 * the anon_vma lock first, then the file mapping's i_mmap lock.
 */
static void drop_rmap_locks(struct vm_area_struct *vma)
{
        struct anon_vma *anon = vma->anon_vma;

        if (anon)
                anon_vma_unlock_write(anon);
        if (vma->vm_file)
                i_mmap_unlock_write(vma->vm_file->f_mapping);
}

/*
 * Mark a moved pte as soft dirty, so that userspace can notice that
 * the ptes were moved. Without CONFIG_MEM_SOFT_DIRTY the pte is
 * returned unchanged.
 */
static pte_t move_soft_dirty_pte(pte_t pte)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
        if (pte_present(pte))
                return pte_mksoft_dirty(pte);
        if (is_swap_pte(pte))
                return pte_swp_mksoft_dirty(pte);
#endif
        return pte;
}

/*
 * Move the ptes covering [old_addr, old_end) under @old_pmd to the
 * corresponding slots under @new_pmd, starting at @new_addr.
 *
 * Each non-none source pte is cleared, passed through move_pte() and
 * move_soft_dirty_pte(), and installed at the destination. If any moved
 * pte was present, the old range is flushed from the TLB before the page
 * table locks are dropped.
 *
 * Returns 0 on success, or -EAGAIN when the old or the new page table
 * could not be mapped (in which case nothing has been moved).
 */
static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                unsigned long old_addr, unsigned long old_end,
                struct vm_area_struct *new_vma, pmd_t *new_pmd,
                unsigned long new_addr, bool need_rmap_locks)
{
        struct mm_struct *mm = vma->vm_mm;
        pte_t *old_pte, *new_pte, pte;
        spinlock_t *old_ptl, *new_ptl;
        bool force_flush = false;
        unsigned long len = old_end - old_addr;
        int err = 0;

        /*
         * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
         * locks to ensure that rmap will always observe either the old or the
         * new ptes. This is the easiest way to avoid races with
         * truncate_pagecache(), page migration, etc...
         *
         * When need_rmap_locks is false, we use other ways to avoid
         * such races:
         *
         * - During exec() shift_arg_pages(), we use a specially tagged vma
         *   which rmap call sites look for using vma_is_temporary_stack().
         *
         * - During mremap(), new_vma is often known to be placed after vma
         *   in rmap traversal order. This ensures rmap will always observe
         *   either the old pte, or the new pte, or both (the page table locks
         *   serialize access to individual ptes, but only rmap traversal
         *   order guarantees that we won't miss both the old and new ptes).
         */
        if (need_rmap_locks)
                take_rmap_locks(vma);

        /*
         * We don't have to worry about the ordering of src and dst
         * pte locks because exclusive mmap_lock prevents deadlock.
         */
        old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
        if (!old_pte) {
                err = -EAGAIN;
                goto out;
        }
        new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
        if (!new_pte) {
                pte_unmap_unlock(old_pte, old_ptl);
                err = -EAGAIN;
                goto out;
        }
        /* Source and destination may share one lock; take it only once. */
        if (new_ptl != old_ptl)
                spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
        flush_tlb_batched_pending(vma->vm_mm);
        arch_enter_lazy_mmu_mode();

        for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
                                   new_pte++, new_addr += PAGE_SIZE) {
                if (pte_none(ptep_get(old_pte)))
                        continue;

                pte = ptep_get_and_clear(mm, old_addr, old_pte);
                /*
                 * If we are remapping a valid PTE, make sure
                 * to flush TLB before we drop the PTL for the
                 * PTE.
                 *
                 * NOTE! Both old and new PTL matter: the old one
                 * for racing with page_mkclean(), the new one to
                 * make sure the physical page stays valid until
                 * the TLB entry for the old mapping has been
                 * flushed.
                 */
                if (pte_present(pte))
                        force_flush = true;
                pte = move_pte(pte, old_addr, new_addr);
                pte = move_soft_dirty_pte(pte);
                set_pte_at(mm, new_addr, new_pte, pte);
        }

        arch_leave_lazy_mmu_mode();
        /* old_addr has advanced to old_end, so recompute the start via len. */
        if (force_flush)
                flush_tlb_range(vma, old_end - len, old_end);
        if (new_ptl != old_ptl)
                spin_unlock(new_ptl);
        /* The loop left both pointers one past the last entry it visited. */
        pte_unmap(new_pte - 1);
        pte_unmap_unlock(old_pte - 1, old_ptl);
out:
        if (need_rmap_locks)
                drop_rmap_locks(vma);
        return err;
}

#ifndef arch_supports_page_table_move
#define arch_supports_page_table_move arch_supports_page_table_move
static inline bool arch_supports_page_table_move(void)
{
        return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
                IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
}
#endif

#ifdef CONFIG_HAVE_MOVE_PMD
/*
 * Move a whole page table by re-pointing @new_pmd at the table that
 * @old_pmd referenced, then flush the old PMD_SIZE range from the TLB.
 * Returns true if the PMD was moved, false if the caller has to fall back.
 */
static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *src_ptl, *dst_ptl;
        pmd_t entry;

        if (!arch_supports_page_table_move())
                return false;

        /*
         * The destination pmd is not supposed to be established;
         * free_pgtables() should have released it.
         *
         * There is one exception: execve() uses mremap to move the
         * initial stack, and there the target area may overlap the
         * source area (always moving down).
         *
         * That is fine as long as everything is PMD-aligned, since moving
         * each pmd down clears the source pmd. But when a few 4kB-only
         * pages are moved down first and only then the remainder becomes
         * PMD-aligned and is handled one pmd at a time, the old PMD (now
         * empty of any 4kB pages, but still there) remains in the page
         * table tree.
         *
         * Warn about it once - we really should figure out how to do this
         * better - and then refuse to move this pmd.
         *
         * One alternative might be to just unmap the target pmd at
         * this point, and verify that it really is empty. We'll see.
         */
        if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
                return false;

        /*
         * Lock ordering between source and destination ptlocks is not a
         * concern: the exclusive mmap_lock prevents deadlock.
         */
        src_ptl = pmd_lock(mm, old_pmd);
        dst_ptl = pmd_lockptr(mm, new_pmd);
        if (dst_ptl != src_ptl)
                spin_lock_nested(dst_ptl, SINGLE_DEPTH_NESTING);

        /* Detach the page table from the source pmd. */
        entry = *old_pmd;
        pmd_clear(old_pmd);

        VM_BUG_ON(!pmd_none(*new_pmd));

        /* Attach it to the destination and flush the old range. */
        pmd_populate(mm, new_pmd, pmd_pgtable(entry));
        flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);

        if (dst_ptl != src_ptl)
                spin_unlock(dst_ptl);
        spin_unlock(src_ptl);

        return true;
}
#else
/*
 * Stub used in the #else branch of the surrounding preprocessor conditional:
 * it never touches either pmd and always reports that nothing was moved.
 */
static inline bool move_normal_pmd(struct vm_area_struct *vma,
                unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
                pmd_t *new_pmd)
{
        return false;
}
#endif

#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
/*
 * Move one whole PUD entry from @old_pud to @new_pud in a single step, so
 * that the lower-level table it references is re-parented rather than
 * having its contents moved piecemeal.
 *
 * Returns true if the entry was moved. Returns false, leaving both slots
 * untouched, when the architecture does not support moving page tables or
 * when the destination slot is unexpectedly populated.
 */
static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
                  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *src_ptl, *dst_ptl;
        pud_t entry;

        if (!arch_supports_page_table_move())
                return false;

        /*
         * free_pgtables() is expected to have released anything that used
         * to live at the destination. If the slot is still in use, warn
         * once and decline to move.
         */
        if (WARN_ON_ONCE(!pud_none(*new_pud)))
                return false;

        /*
         * Source lock first, then the destination lock if it is a different
         * one. The order does not matter for deadlock avoidance because the
         * exclusive mmap_lock already serialises us.
         */
        src_ptl = pud_lock(mm, old_pud);
        dst_ptl = pud_lockptr(mm, new_pud);
        if (dst_ptl != src_ptl)
                spin_lock_nested(dst_ptl, SINGLE_DEPTH_NESTING);

        /* Snapshot the source entry, then clear its slot. */
        entry = *old_pud;
        pud_clear(old_pud);

        VM_BUG_ON(!pud_none(*new_pud));

        /* Install the detached table at the destination and flush the old range. */
        pud_populate(mm, new_pud, pud_pgtable(entry));
        flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);

        if (dst_ptl != src_ptl)
                spin_unlock(dst_ptl);
        spin_unlock(src_ptl);

        return true;
}
#else
/*
 * Stub built when the condition guarding the real implementation
 * (CONFIG_PGTABLE_LEVELS > 2 && CONFIG_HAVE_MOVE_PUD) does not hold:
 * it never touches either pud and always reports that nothing was moved.
 */
static inline bool move_normal_pud(struct vm_area_struct *vma,
                unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
                pud_t *new_pud)
{
        return false;
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/*
 * Move a huge PUD entry from @old_pud to @new_pud as a single unit.
 *
 * Returns true if the entry was moved, or false (leaving both slots
 * untouched) when the destination slot is unexpectedly populated.
 */
static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
                          unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *src_ptl, *dst_ptl;
        pud_t entry;

        /*
         * free_pgtables() is expected to have released anything that used
         * to live at the destination. If the slot is still in use, warn
         * once and decline to move.
         */
        if (WARN_ON_ONCE(!pud_none(*new_pud)))
                return false;

        /*
         * Source lock first, then the destination lock if it is a different
         * one. The order does not matter for deadlock avoidance because the
         * exclusive mmap_lock already serialises us.
         */
        src_ptl = pud_lock(mm, old_pud);
        dst_ptl = pud_lockptr(mm, new_pud);
        if (dst_ptl != src_ptl)
                spin_lock_nested(dst_ptl, SINGLE_DEPTH_NESTING);

        /* Snapshot the source entry, then clear its slot. */
        entry = *old_pud;
        pud_clear(old_pud);

        VM_BUG_ON(!pud_none(*new_pud));

        /*
         * Write the saved entry into the destination slot.
         * TODO: mark it soft-dirty once PUD-level soft-dirty support exists.
         */
        set_pud_at(mm, new_addr, new_pud, entry);
        flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);

        if (dst_ptl != src_ptl)
                spin_unlock(dst_ptl);
        spin_unlock(src_ptl);

        return true;
}
#else
/*
 * Stub built when CONFIG_TRANSPARENT_HUGEPAGE and
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD are not both defined.
 * Warns once if it is ever called, and always reports that nothing
 * was moved.
 */
static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
                          unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
        WARN_ON_ONCE(1);
        return false;

}
#endif

/*
 * Kind of page table entry a whole-entry move operates on. Used to select
 * the step size in get_extent() and the mover in move_pgt_entry().
 */
enum pgt_entry {
        NORMAL_PMD,     /* regular PMD entry; handled by move_normal_pmd() */
        HPAGE_PMD,      /* huge PMD entry; handled by move_huge_pmd() */
        NORMAL_PUD,     /* regular PUD entry; handled by move_normal_pud() */
        HPAGE_PUD,      /* huge PUD entry; handled by move_huge_pud() */
};

/*
 * Compute how many bytes the next move step may cover for the page table
 * level selected by @entry.
 *
 * The result is the smallest of:
 *   - the distance from @old_addr to the next PMD/PUD boundary,
 *   - the distance from @new_addr to the next PMD/PUD boundary,
 *   - the number of bytes left in the source range (@old_end - @old_addr).
 *
 * It therefore equals the full entry size only when both addresses are
 * aligned to that size and at least one full entry remains to be moved.
 */
static __always_inline unsigned long get_extent(enum pgt_entry entry,
                        unsigned long old_addr, unsigned long old_end,
                        unsigned long new_addr)
{
        unsigned long mask, size;
        unsigned long src_room, dst_room, remaining, step;

        switch (entry) {
        case NORMAL_PMD:
        case HPAGE_PMD:
                size = PMD_SIZE;
                mask = PMD_MASK;
                break;
        case NORMAL_PUD:
        case HPAGE_PUD:
                size = PUD_SIZE;
                mask = PUD_MASK;
                break;
        default:
                BUILD_BUG();
                break;
        }

        /*
         * Bytes up to the next boundary on each side. If the boundary
         * computation wraps past the top of the address space, the
         * unsigned subtraction still yields the correct distance.
         */
        src_room = ((old_addr + size) & mask) - old_addr;
        dst_room = ((new_addr + size) & mask) - new_addr;
        remaining = old_end - old_addr;

        /* Take the tightest of the three limits. */
        step = src_room;
        if (step > remaining)
                step = remaining;
        if (step > dst_room)
                step = dst_room;

        return step;
}

/*
 * Try to move a whole page table entry of the kind given by @entry from
 * @old_entry to @new_entry, instead of moving the entries beneath it
 * individually.
 *
 * When @need_rmap_locks is set the rmap locks of @vma are held around the
 * attempt. Returns true if the entry was moved, false otherwise (including
 * for the huge variants when CONFIG_TRANSPARENT_HUGEPAGE is disabled).
 */
static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
                        unsigned long old_addr, unsigned long new_addr,
                        void *old_entry, void *new_entry, bool need_rmap_locks)
{
        bool done = false;

        /* See comment in move_ptes() */
        if (need_rmap_locks)
                take_rmap_locks(vma);

        switch (entry) {
        case HPAGE_PUD:
                if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                        done = move_huge_pud(vma, old_addr, new_addr,
                                             old_entry, new_entry);
                break;
        case HPAGE_PMD:
                if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                        done = move_huge_pmd(vma, old_addr, new_addr,
                                             old_entry, new_entry);
                break;
        case NORMAL_PUD:
                done = move_normal_pud(vma, old_addr, new_addr, old_entry,
                                       new_entry);
                break;
        case NORMAL_PMD:
                done = move_normal_pmd(vma, old_addr, new_addr, old_entry,
                                       new_entry);
                break;
        default:
                /* Unknown entry kind: complain once and report failure. */
                WARN_ON_ONCE(1);
                break;
        }

        if (need_rmap_locks)
                drop_rmap_locks(vma);

        return done;
}

/*
 * Decide whether @addr_to_align may be rounded down to the boundary given
 * by @mask without landing on an existing mapping.
 *
 * Normally the address must be the very start of @vma and the gap between
 * the rounded-down address and the VMA start must be unmapped. When moving
 * the stack down (@for_stack), the move happens inside a VMA that was
 * created to span both source and destination, so a rounded-down address
 * that still lies within the VMA is accepted as well.
 */
static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
                            unsigned long mask, bool for_stack)
{
        unsigned long aligned = addr_to_align & mask;

        if (for_stack) {
                /* In-VMA alignment is explicitly allowed for the stack. */
                if (aligned >= vma->vm_start)
                        return true;
        } else if (addr_to_align != vma->vm_start) {
                /*
                 * Rounding down from the middle of a VMA would clobber the
                 * part of the mapping that precedes the address.
                 */
                return false;
        }

        /* The range [aligned, vm_start) must not intersect any mapping. */
        return !find_vma_intersection(vma->vm_mm, aligned, vma->vm_start);
}

/*
 * Opportunistically round *@old_addr and *@new_addr down to the boundary
 * described by @mask so that the copy can work on whole entries.
 *
 * Both addresses are left unchanged unless all of the following hold:
 * the source is not already aligned, source and destination share the same
 * offset below the boundary, and can_align_down() approves both sides.
 */
static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
                             unsigned long *new_addr, struct vm_area_struct *new_vma,
                             unsigned long mask, bool for_stack)
{
        unsigned long old_off = *old_addr & ~mask;
        unsigned long new_off = *new_addr & ~mask;

        /* Nothing to do when the source already sits on the boundary. */
        if (!old_off)
                return;

        /* Source and destination must be misaligned by the same amount. */
        if (old_off != new_off)
                return;

        /* Rounding down must not run into an existing mapping on either side. */
        if (!can_align_down(old_vma, *old_addr, mask, for_stack))
                return;
        if (!can_align_down(new_vma, *new_addr, mask, for_stack))
                return;

        *old_addr &= mask;
        *new_addr &= mask;
}

/*
 * Move the page table entries that map [@old_addr, @old_addr + @len) in
 * @vma so that they map the range starting at @new_addr in @new_vma.
 *
 * Whole PUD or PMD entries are moved where alignment and configuration
 * allow it; everything else goes through move_ptes(). hugetlb VMAs are
 * handed to move_hugetlb_page_tables().
 *
 * @need_rmap_locks: hold the rmap locks while moving huge entries and PTEs.
 * @for_stack:       passed to try_realign_addr() to permit rounding down
 *                   within the VMA (stack move).
 *
 * Returns the number of bytes of the requested range that were moved:
 * @len on success, less if a page table allocation failed part-way, and
 * 0 if @len is 0 or nothing of the requested range was moved.
 */
unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len,
                bool need_rmap_locks, bool for_stack)
{
        unsigned long extent, old_end;
        struct mmu_notifier_range range;
        pmd_t *old_pmd, *new_pmd;
        pud_t *old_pud, *new_pud;

        if (!len)
                return 0;

        /* Computed before any realignment, so it is the true end of the request. */
        old_end = old_addr + len;

        if (is_vm_hugetlb_page(vma))
                return move_hugetlb_page_tables(vma, new_vma, old_addr,
                                                new_addr, len);

        /*
         * If possible, realign addresses to PMD boundary for faster copy.
         * Only realign if the mremap copying hits a PMD boundary.
         */
        if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
                try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
                                 for_stack);

        flush_cache_range(vma, old_addr, old_end);
        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
                                old_addr, old_end);
        mmu_notifier_invalidate_range_start(&range);

        for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
                cond_resched();
                /*
                 * If extent is PUD-sized try to speed up the move by moving at the
                 * PUD level if possible.
                 */
                extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);

                old_pud = get_old_pud(vma->vm_mm, old_addr);
                if (!old_pud)
                        continue;
                new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
                if (!new_pud)
                        break;
                if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
                        if (extent == HPAGE_PUD_SIZE) {
                                move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
                                               old_pud, new_pud, need_rmap_locks);
                                /* We ignore and continue on error? */
                                continue;
                        }
                } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {

                        if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
                                           old_pud, new_pud, true))
                                continue;
                }

                /* PUD-level move not possible or not done: retry at PMD granularity. */
                extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
                old_pmd = get_old_pmd(vma->vm_mm, old_addr);
                if (!old_pmd)
                        continue;
                new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
                if (!new_pmd)
                        break;
again:
                if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
                    pmd_devmap(*old_pmd)) {
                        if (extent == HPAGE_PMD_SIZE &&
                            move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
                                           old_pmd, new_pmd, need_rmap_locks))
                                continue;
                        /* Could not move it whole: split and fall back to PTEs. */
                        split_huge_pmd(vma, old_pmd, old_addr);
                } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
                           extent == PMD_SIZE) {
                        /*
                         * If the extent is PMD-sized, try to speed the move by
                         * moving at the PMD level if possible.
                         */
                        if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
                                           old_pmd, new_pmd, true))
                                continue;
                }
                if (pmd_none(*old_pmd))
                        continue;
                if (pte_alloc(new_vma->vm_mm, new_pmd))
                        break;
                /* A negative result from move_ptes() means re-evaluate this pmd. */
                if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
                              new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
                        goto again;
        }

        mmu_notifier_invalidate_range_end(&range);

        /*
         * Prevent negative return values when {old,new}_addr was realigned
         * but we broke out of the above loop for the first PMD itself.
         *
         * old_end - len is the originally requested start address and cannot
         * wrap. The previous form, len + old_addr < old_end, could wrap once
         * the cursor had reached old_end near the top of the address space,
         * wrongly reporting that nothing was moved after a complete move.
         */
        if (old_addr < old_end - len)
                return 0;

        return len + old_addr - old_end;        /* how much done */
}

/*
 * Move the mapping [@old_addr, @old_addr + @old_len) of @vma to @new_addr
 * with length @new_len: copy the VMA, move the page tables, then unmap the
 * old range (unless MREMAP_DONTUNMAP is set in @flags).
 *
 * @locked:   set to true when the moved VMA carries VM_LOCKED.
 * @uf:       userfaultfd context handed to mremap_userfaultfd_prep().
 * @uf_unmap: list handed to do_vmi_munmap() for the old range.
 *
 * Returns @new_addr on success. On failure returns a negative errno cast to
 * unsigned long; if the failure happens after the VMA was copied, the page
 * tables are moved back and the new area is unmapped instead of the old one.
 */
static unsigned long move_vma(struct vm_area_struct *vma,
                unsigned long old_addr, unsigned long old_len,
                unsigned long new_len, unsigned long new_addr,
                bool *locked, unsigned long flags,
                struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
        long to_account = new_len - old_len;
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
        unsigned long vm_flags = vma->vm_flags;
        unsigned long new_pgoff;
        unsigned long moved_len;
        /*
         * Whether a piece of the VMA is left before/after the unmapped range
         * and needs VM_ACCOUNT restored. These are real booleans: recording
         * the piece's address instead would misread a VMA starting at
         * address 0 as "no piece left".
         */
        bool account_start = false;
        bool account_end = false;
        unsigned long hiwater_vm;
        int err = 0;
        bool need_rmap_locks;
        struct vma_iterator vmi;

        /*
         * We'd prefer to avoid failure later on in do_munmap:
         * which may split one vma into three before unmapping.
         */
        if (mm->map_count >= sysctl_max_map_count - 3)
                return -ENOMEM;

        /* With DONTUNMAP the old range stays, so the whole new range is new memory. */
        if (unlikely(flags & MREMAP_DONTUNMAP))
                to_account = new_len;

        /* Ask the VMA's owner whether splitting at either end of the range is allowed. */
        if (vma->vm_ops && vma->vm_ops->may_split) {
                if (vma->vm_start != old_addr)
                        err = vma->vm_ops->may_split(vma, old_addr);
                if (!err && vma->vm_end != old_addr + old_len)
                        err = vma->vm_ops->may_split(vma, old_addr + old_len);
                if (err)
                        return err;
        }

        /*
         * Advise KSM to break any KSM pages in the area to be moved:
         * it would be confusing if they were to turn up at the new
         * location, where they happen to coincide with different KSM
         * pages recently unmapped.  But leave vma->vm_flags as it was,
         * so KSM can come around to merge on vma and new_vma afterwards.
         */
        err = ksm_madvise(vma, old_addr, old_addr + old_len,
                                                MADV_UNMERGEABLE, &vm_flags);
        if (err)
                return err;

        if (vm_flags & VM_ACCOUNT) {
                if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
                        return -ENOMEM;
        }

        vma_start_write(vma);
        new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
        new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
                           &need_rmap_locks);
        if (!new_vma) {
                /* Undo the charge taken above. */
                if (vm_flags & VM_ACCOUNT)
                        vm_unacct_memory(to_account >> PAGE_SHIFT);
                return -ENOMEM;
        }

        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
                                     need_rmap_locks, false);
        if (moved_len < old_len) {
                err = -ENOMEM;
        } else if (vma->vm_ops && vma->vm_ops->mremap) {
                err = vma->vm_ops->mremap(new_vma);
        }

        if (unlikely(err)) {
                /*
                 * On error, move entries back from new area to old,
                 * which will succeed since page tables still there,
                 * and then proceed to unmap new area instead of old.
                 */
                move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
                                 true, false);
                vma = new_vma;
                old_len = new_len;
                old_addr = new_addr;
                new_addr = err;
        } else {
                mremap_userfaultfd_prep(new_vma, uf);
        }

        if (is_vm_hugetlb_page(vma)) {
                clear_vma_resv_huge_pages(vma);
        }

        /* Conceal VM_ACCOUNT so old reservation is not undone */
        if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
                vm_flags_clear(vma, VM_ACCOUNT);
                if (vma->vm_start < old_addr)
                        account_start = true;
                if (vma->vm_end > old_addr + old_len)
                        account_end = true;
        }

        /*
         * If we failed to move page tables we still do total_vm increment
         * since do_munmap() will decrement it by old_len == new_len.
         *
         * Since total_vm is about to be raised artificially high for a
         * moment, we need to restore high watermark afterwards: if stats
         * are taken meanwhile, total_vm and hiwater_vm appear too high.
         * If this were a serious issue, we'd add a flag to do_munmap().
         */
        hiwater_vm = mm->hiwater_vm;
        vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);

        /* Tell pfnmap has moved from this vma */
        if (unlikely(vma->vm_flags & VM_PFNMAP))
                untrack_pfn_clear(vma);

        if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
                /* We always clear VM_LOCKED[ONFAULT] on the old vma */
                vm_flags_clear(vma, VM_LOCKED_MASK);

                /*
                 * anon_vma links of the old vma is no longer needed after its page
                 * table has been moved.
                 */
                if (new_vma != vma && vma->vm_start == old_addr &&
                        vma->vm_end == (old_addr + old_len))
                        unlink_anon_vmas(vma);

                /* Because we won't unmap we don't need to touch locked_vm */
                return new_addr;
        }

        vma_iter_init(&vmi, mm, old_addr);
        if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
                /* OOM: unable to split vma, just get accounts right */
                if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
                        vm_acct_memory(old_len >> PAGE_SHIFT);
                account_start = account_end = false;
        }

        if (vm_flags & VM_LOCKED) {
                mm->locked_vm += new_len >> PAGE_SHIFT;
                *locked = true;
        }

        mm->hiwater_vm = hiwater_vm;

        /* Restore VM_ACCOUNT if one or two pieces of vma left */
        if (account_start) {
                vma = vma_prev(&vmi);
                vm_flags_set(vma, VM_ACCOUNT);
        }

        if (account_end) {
                vma = vma_next(&vmi);
                vm_flags_set(vma, VM_ACCOUNT);
        }

        return new_addr;
}

/*
 * Look up the VMA containing @addr in the current process and check that
 * remapping @old_len bytes of it to @new_len bytes is permitted.
 *
 * Returns the VMA on success, or an ERR_PTR():
 *   -EFAULT  no VMA at @addr, the range crosses the end of the VMA, or a
 *            resize was requested on a VM_DONTEXPAND/VM_PFNMAP mapping
 *   -EINVAL  zero @old_len on a private mapping, MREMAP_DONTUNMAP on a
 *            VM_DONTEXPAND/VM_PFNMAP mapping, or page offset overflow
 *   -EAGAIN  mlock_future_ok() rejected the size change
 *   -ENOMEM  may_expand_vm() rejected the size change
 */
static struct vm_area_struct *vma_to_resize(unsigned long addr,
        unsigned long old_len, unsigned long new_len, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = vma_lookup(mm, addr);
        unsigned long first_pgoff, growth;

        if (!vma)
                return ERR_PTR(-EFAULT);

        /*
         * A zero old_len asks to 'duplicate' the mapping. For a private
         * mapping that would just produce a fresh mapping with no relation
         * to the original, which defeats the purpose of mremap and has no
         * known users, so it is rejected.
         */
        if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
                pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
                return ERR_PTR(-EINVAL);
        }

        if ((flags & MREMAP_DONTUNMAP) &&
            (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
                return ERR_PTR(-EINVAL);

        /* The range to remap must not extend past the end of this VMA. */
        if (old_len > vma->vm_end - addr)
                return ERR_PTR(-EFAULT);

        /* Same size: none of the resize checks below apply. */
        if (new_len == old_len)
                return vma;

        /* The file page offset of the resized range must not wrap. */
        first_pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
        if (first_pgoff + (new_len >> PAGE_SHIFT) < first_pgoff)
                return ERR_PTR(-EINVAL);

        if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
                return ERR_PTR(-EFAULT);

        growth = new_len - old_len;

        if (!mlock_future_ok(mm, vma->vm_flags, growth))
                return ERR_PTR(-EAGAIN);

        if (!may_expand_vm(mm, vma->vm_flags, growth >> PAGE_SHIFT))
                return ERR_PTR(-ENOMEM);

        return vma;
}

/*
 * Move the mapping at [@addr, @addr + @old_len) to @new_addr with length
 * @new_len. With MREMAP_FIXED, whatever is mapped at the destination is
 * unmapped first; otherwise @new_addr is only a hint and the address chosen
 * by get_unmapped_area() is used.
 *
 * Returns the new address on success or a negative errno cast to
 * unsigned long on failure.
 */
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
                unsigned long new_addr, unsigned long new_len, bool *locked,
                unsigned long flags, struct vm_userfaultfd_ctx *uf,
                struct list_head *uf_unmap_early,
                struct list_head *uf_unmap)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long map_flags = 0;
        unsigned long pgoff, ret;

        /* The destination must be page aligned and fit below TASK_SIZE. */
        if (offset_in_page(new_addr))
                return -EINVAL;

        if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
                return -EINVAL;

        /* Ensure the old/new locations do not overlap */
        if (addr + old_len > new_addr && new_addr + new_len > addr)
                return -EINVAL;

        /*
         * move_vma() needs us to stay 4 maps below the threshold, otherwise
         * it bails out at the very beginning. That would be a problem if we
         * had already unmapped regions here (at new_addr and old_addr),
         * because userspace could not tell what state the vmas are in after
         * getting -ENOMEM.
         * So pre-compute whether the whole operation is likely to succeed
         * map-wise. In the worst case both vmas (new_addr and old_addr) get
         * split in 3 before unmapping, i.e. 2 more maps (1 for each) than we
         * hold now. If the current map count plus 2 does not leave us 4 maps
         * below the threshold, return -ENOMEM here to be safe.
         */
        if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
                return -ENOMEM;

        /*
         * Moving a VMA elsewhere: refuse if the source range is sealed.
         *
         * The check sits here because mremap_to() validates the address
         * range itself, and sealing is only checked once that has passed.
         *
         * can_modify_mm assumes we have acquired the lock on MM.
         */
        if (unlikely(!can_modify_mm(mm, addr, addr + old_len)))
                return -EPERM;

        if (flags & MREMAP_FIXED) {
                /*
                 * The destination is unmapped before the move; do_munmap
                 * will check whether the destination is sealed.
                 */
                ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
                if (ret)
                        return ret;
        }

        /* Shrinking: drop the tail of the source first, then move the rest. */
        if (old_len > new_len) {
                ret = do_munmap(mm, addr + new_len, old_len - new_len, uf_unmap);
                if (ret)
                        return ret;
                old_len = new_len;
        }

        vma = vma_to_resize(addr, old_len, new_len, flags);
        if (IS_ERR(vma))
                return PTR_ERR(vma);

        /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
        if ((flags & MREMAP_DONTUNMAP) &&
            !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT))
                return -ENOMEM;

        if (flags & MREMAP_FIXED)
                map_flags |= MAP_FIXED;

        if (vma->vm_flags & VM_MAYSHARE)
                map_flags |= MAP_SHARED;

        /* Page offset corresponding to @addr within the VMA's backing object. */
        pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
        ret = get_unmapped_area(vma->vm_file, new_addr, new_len, pgoff,
                                map_flags);
        if (IS_ERR_VALUE(ret))
                return ret;

        /* We got a new mapping */
        if (!(flags & MREMAP_FIXED))
                new_addr = ret;

        return move_vma(vma, addr, old_len, new_len, new_addr, locked, flags,
                        uf, uf_unmap);
}

/*
 * Decide whether @vma can be grown in place by @delta bytes.
 *
 * Returns 1 when the range [vm_end, vm_end + delta) does not wrap, does not
 * intersect another mapping, and get_unmapped_area() returns a page-aligned
 * value for the enlarged range at the current start address; 0 otherwise.
 */
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
        unsigned long grown_end = vma->vm_end + delta;
        unsigned long area;

        /* Reject a wrapped end address or a neighbour that is in the way. */
        if (grown_end < vma->vm_end ||
            find_vma_intersection(vma->vm_mm, vma->vm_end, grown_end))
                return 0;

        /* A result with low bits set is an error value, not an address. */
        area = get_unmapped_area(NULL, vma->vm_start,
                                 grown_end - vma->vm_start, 0, MAP_FIXED);
        return !(area & ~PAGE_MASK);
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 * (As implemented below, the caller must pass both flags: MREMAP_FIXED
 * without MREMAP_MAYMOVE is rejected with -EINVAL.)
 *
 * @addr:     start of the existing range; must be page aligned. Any pointer
 *            tag is stripped before use.
 * @old_len:  current length; rounded up to a page multiple. Zero is allowed.
 * @new_len:  requested length; rounded up to a page multiple. Must not be
 *            zero after rounding.
 * @flags:    any combination of MREMAP_MAYMOVE, MREMAP_FIXED and
 *            MREMAP_DONTUNMAP; any other bit is rejected.
 * @new_addr: destination address, handed to mremap_to() when MREMAP_FIXED or
 *            MREMAP_DONTUNMAP is set. Its pointer tag is NOT stripped.
 *
 * Returns the address of the resulting mapping on success (@addr itself when
 * the mapping is resized in place), or a negative errno value carried in an
 * unsigned long:
 *   -EINVAL  bad flag combination, unaligned @addr, zero @new_len, or a
 *            hugetlb mapping with misaligned addresses or a grow request
 *   -EINTR   the mmap write lock could not be taken (killable wait)
 *   -EFAULT  no VMA contains @addr
 *   -EPERM   can_modify_mm() refused the source range (sealed mapping)
 *   -ENOMEM  the memory accounting check or the VMA extension failed, or the
 *            mapping cannot grow in place and MREMAP_MAYMOVE was not given
 * Values returned by vma_to_resize(), get_unmapped_area(), do_vmi_munmap(),
 * move_vma() and mremap_to() are passed through unchanged.
 */
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
                unsigned long, new_len, unsigned long, flags,
                unsigned long, new_addr)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        /* Default result: every early 'return ret' / 'goto out' is -EINVAL. */
        unsigned long ret = -EINVAL;
        /* Set when the grown area must be populated after unlocking. */
        bool locked = false;
        struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
        /* userfaultfd unmap events, completed once the lock is dropped. */
        LIST_HEAD(uf_unmap_early);
        LIST_HEAD(uf_unmap);

        /*
         * There is a deliberate asymmetry here: we strip the pointer tag
         * from the old address but leave the new address alone. This is
         * for consistency with mmap(), where we prevent the creation of
         * aliasing mappings in userspace by leaving the tag bits of the
         * mapping address intact. A non-zero tag will cause the subsequent
         * range checks to reject the address as invalid.
         *
         * See Documentation/arch/arm64/tagged-address-abi.rst for more
         * information.
         */
        addr = untagged_addr(addr);

        /* Reject unknown flag bits. */
        if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
                return ret;

        /* MREMAP_FIXED is only valid together with MREMAP_MAYMOVE. */
        if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
                return ret;

        /*
         * MREMAP_DONTUNMAP is always a move and it does not allow resizing
         * in the process.
         */
        if (flags & MREMAP_DONTUNMAP &&
                        (!(flags & MREMAP_MAYMOVE) || old_len != new_len))
                return ret;


        /* The source address must be page aligned. */
        if (offset_in_page(addr))
                return ret;

        old_len = PAGE_ALIGN(old_len);
        new_len = PAGE_ALIGN(new_len);

        /*
         * We allow a zero old-len as a special case
         * for DOS-emu "duplicate shm area" thing. But
         * a zero new-len is nonsensical.
         */
        if (!new_len)
                return ret;

        /* Everything from here to 'out' runs under the mmap write lock. */
        if (mmap_write_lock_killable(current->mm))
                return -EINTR;
        vma = vma_lookup(mm, addr);
        if (!vma) {
                ret = -EFAULT;
                goto out;
        }

        if (is_vm_hugetlb_page(vma)) {
                struct hstate *h __maybe_unused = hstate_vma(vma);

                /* Lengths are rounded up to the huge page size. */
                old_len = ALIGN(old_len, huge_page_size(h));
                new_len = ALIGN(new_len, huge_page_size(h));

                /* addrs must be huge page aligned */
                /* ret is still -EINVAL for the three 'goto out' below. */
                if (addr & ~huge_page_mask(h))
                        goto out;
                if (new_addr & ~huge_page_mask(h))
                        goto out;

                /*
                 * Don't allow remap expansion, because the underlying hugetlb
                 * reservation is not yet capable to handle split reservation.
                 */
                if (new_len > old_len)
                        goto out;
        }

        /*
         * Moves to a caller-supplied address and DONTUNMAP moves are handled
         * entirely by mremap_to(); its result is returned as is.
         */
        if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
                ret = mremap_to(addr, old_len, new_addr, new_len,
                                &locked, flags, &uf, &uf_unmap_early,
                                &uf_unmap);
                goto out;
        }

        /*
         * Below is shrink/expand case (not mremap_to())
         * Check if src address is sealed, if so, reject.
         * In other words, prevent shrinking or expanding a sealed VMA.
         *
         * Place can_modify_mm here so we can keep the logic related to
         * shrink/expand together.
         */
        if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) {
                ret = -EPERM;
                goto out;
        }

        /*
         * Always allow a shrinking remap: that just unmaps
         * the unnecessary pages..
         * do_vmi_munmap does all the needed commit accounting, and
         * unlocks the mmap_lock if so directed.
         */
        if (old_len >= new_len) {
                VMA_ITERATOR(vmi, mm, addr + new_len);

                /* Same size and no move requested: nothing to do. */
                if (old_len == new_len) {
                        ret = addr;
                        goto out;
                }

                /* Unmap the tail [addr + new_len, addr + old_len). */
                ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
                                    &uf_unmap, true);
                /*
                 * NOTE(review): the failure path unlocks at 'out', which
                 * assumes do_vmi_munmap() keeps the lock held on error --
                 * confirm against its definition.
                 */
                if (ret)
                        goto out;

                /*
                 * Success: the final 'true' argument asked do_vmi_munmap()
                 * to drop the mmap lock, so skip the unlock at 'out'.
                 */
                ret = addr;
                goto out_unlocked;
        }

        /*
         * Ok, we need to grow..
         */
        vma = vma_to_resize(addr, old_len, new_len, flags);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
        }

        /* old_len exactly to the end of the area..
         */
        if (old_len == vma->vm_end - addr) {
                unsigned long delta = new_len - old_len;

                /* can we just expand the current mapping? */
                if (vma_expandable(vma, delta)) {
                        long pages = delta >> PAGE_SHIFT;
                        VMA_ITERATOR(vmi, mm, vma->vm_end);
                        /* Pages charged so far; undone if the merge fails. */
                        long charged = 0;

                        if (vma->vm_flags & VM_ACCOUNT) {
                                if (security_vm_enough_memory_mm(mm, pages)) {
                                        ret = -ENOMEM;
                                        goto out;
                                }
                                charged = pages;
                        }

                        /*
                         * Function vma_merge_extend() is called on the
                         * extension we are adding to the already existing vma,
                         * vma_merge_extend() will merge this extension with the
                         * already existing vma (expand operation itself) and
                         * possibly also with the next vma if it becomes
                         * adjacent to the expanded vma and otherwise
                         * compatible.
                         */
                        vma = vma_merge_extend(&vmi, vma, delta);
                        if (!vma) {
                                vm_unacct_memory(charged);
                                ret = -ENOMEM;
                                goto out;
                        }

                        vm_stat_account(mm, vma->vm_flags, pages);
                        if (vma->vm_flags & VM_LOCKED) {
                                mm->locked_vm += pages;
                                locked = true;
                                /*
                                 * Grown in place: point new_addr at addr so
                                 * the mm_populate() call at 'out' covers
                                 * [addr + old_len, addr + new_len).
                                 */
                                new_addr = addr;
                        }
                        ret = addr;
                        goto out;
                }
        }

        /*
         * We weren't able to just expand or shrink the area,
         * we need to create a new one and move it..
         */
        /* Without MREMAP_MAYMOVE this -ENOMEM is the final result. */
        ret = -ENOMEM;
        if (flags & MREMAP_MAYMOVE) {
                unsigned long map_flags = 0;
                if (vma->vm_flags & VM_MAYSHARE)
                        map_flags |= MAP_SHARED;

                /*
                 * Ask for a free range of new_len bytes with no address hint,
                 * using the file offset that corresponds to addr.
                 */
                new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
                                        vma->vm_pgoff +
                                        ((addr - vma->vm_start) >> PAGE_SHIFT),
                                        map_flags);
                if (IS_ERR_VALUE(new_addr)) {
                        ret = new_addr;
                        goto out;
                }

                ret = move_vma(vma, addr, old_len, new_len, new_addr,
                               &locked, flags, &uf, &uf_unmap);
        }
out:
        /*
         * A result that is not page aligned is treated as an error value
         * rather than an address, so nothing is populated in that case.
         */
        if (offset_in_page(ret))
                locked = false;
        mmap_write_unlock(current->mm);
        /* Populate only the newly added tail, after dropping the lock. */
        if (locked && new_len > old_len)
                mm_populate(new_addr + old_len, new_len - old_len);
out_unlocked:
        /*
         * The userfaultfd completion calls run on every path that reaches
         * this point, success or failure.
         */
        userfaultfd_unmap_complete(mm, &uf_unmap_early);
        mremap_userfaultfd_complete(&uf, addr, ret, old_len);
        userfaultfd_unmap_complete(mm, &uf_unmap);
        return ret;
}


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 























































































































































































































































































































































































































































   21 






































































































    4 



































































































































































































   11 

   13 
























   19 














































    1 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   19 








































































    3 





























    3 






































































































































































































































































































































    2 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-fallback.sh
// DO NOT MODIFY THIS FILE DIRECTLY

#ifndef _LINUX_ATOMIC_FALLBACK_H
#define _LINUX_ATOMIC_FALLBACK_H

#include <linux/compiler.h>

/*
 * raw_xchg() and its _acquire/_release/_relaxed variants.
 *
 * Each variant is bound to the first available implementation in its
 * #if chain. When nothing suitable is defined, the macro expands to a call
 * to a raw_xchg*_not_implemented() function that is only declared here
 * (presumably left undefined so that any use fails to link).
 */

/* Unsuffixed form: arch_xchg(), else arch_xchg_relaxed() via __atomic_op_fence(). */
#ifdef arch_xchg
#define raw_xchg arch_xchg
#elif defined(arch_xchg_relaxed)
#define raw_xchg(...) __atomic_op_fence(arch_xchg, __VA_ARGS__)
#else
extern void raw_xchg_not_implemented(void);
#define raw_xchg(...) raw_xchg_not_implemented()
#endif /* raw_xchg */

/* Acquire form: native, else relaxed via __atomic_op_acquire(), else arch_xchg(). */
#ifdef arch_xchg_acquire
#define raw_xchg_acquire arch_xchg_acquire
#elif defined(arch_xchg_relaxed)
#define raw_xchg_acquire(...) __atomic_op_acquire(arch_xchg, __VA_ARGS__)
#elif defined(arch_xchg)
#define raw_xchg_acquire arch_xchg
#else
extern void raw_xchg_acquire_not_implemented(void);
#define raw_xchg_acquire(...) raw_xchg_acquire_not_implemented()
#endif /* raw_xchg_acquire */

/* Release form: native, else relaxed via __atomic_op_release(), else arch_xchg(). */
#ifdef arch_xchg_release
#define raw_xchg_release arch_xchg_release
#elif defined(arch_xchg_relaxed)
#define raw_xchg_release(...) __atomic_op_release(arch_xchg, __VA_ARGS__)
#elif defined(arch_xchg)
#define raw_xchg_release arch_xchg
#else
extern void raw_xchg_release_not_implemented(void);
#define raw_xchg_release(...) raw_xchg_release_not_implemented()
#endif /* raw_xchg_release */

/* Relaxed form: native, else fall back to arch_xchg(). */
#ifdef arch_xchg_relaxed
#define raw_xchg_relaxed arch_xchg_relaxed
#elif defined(arch_xchg)
#define raw_xchg_relaxed arch_xchg
#else
extern void raw_xchg_relaxed_not_implemented(void);
#define raw_xchg_relaxed(...) raw_xchg_relaxed_not_implemented()
#endif /* raw_xchg_relaxed */

/*
 * raw_cmpxchg() and its _acquire/_release/_relaxed variants.
 *
 * Each variant is bound to the first available implementation in its
 * #if chain. When nothing suitable is defined, the macro expands to a call
 * to a raw_cmpxchg*_not_implemented() function that is only declared here
 * (presumably left undefined so that any use fails to link).
 */

/* Unsuffixed form: arch_cmpxchg(), else the relaxed op via __atomic_op_fence(). */
#ifdef arch_cmpxchg
#define raw_cmpxchg arch_cmpxchg
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg(...) __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
#else
extern void raw_cmpxchg_not_implemented(void);
#define raw_cmpxchg(...) raw_cmpxchg_not_implemented()
#endif /* raw_cmpxchg */

/* Acquire form: native, else relaxed via __atomic_op_acquire(), else arch_cmpxchg(). */
#ifdef arch_cmpxchg_acquire
#define raw_cmpxchg_acquire arch_cmpxchg_acquire
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg_acquire(...) __atomic_op_acquire(arch_cmpxchg, __VA_ARGS__)
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_acquire arch_cmpxchg
#else
extern void raw_cmpxchg_acquire_not_implemented(void);
#define raw_cmpxchg_acquire(...) raw_cmpxchg_acquire_not_implemented()
#endif /* raw_cmpxchg_acquire */

/* Release form: native, else relaxed via __atomic_op_release(), else arch_cmpxchg(). */
#ifdef arch_cmpxchg_release
#define raw_cmpxchg_release arch_cmpxchg_release
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg_release(...) __atomic_op_release(arch_cmpxchg, __VA_ARGS__)
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_release arch_cmpxchg
#else
extern void raw_cmpxchg_release_not_implemented(void);
#define raw_cmpxchg_release(...) raw_cmpxchg_release_not_implemented()
#endif /* raw_cmpxchg_release */

/* Relaxed form: native, else fall back to arch_cmpxchg(). */
#ifdef arch_cmpxchg_relaxed
#define raw_cmpxchg_relaxed arch_cmpxchg_relaxed
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_relaxed arch_cmpxchg
#else
extern void raw_cmpxchg_relaxed_not_implemented(void);
#define raw_cmpxchg_relaxed(...) raw_cmpxchg_relaxed_not_implemented()
#endif /* raw_cmpxchg_relaxed */

/*
 * raw_cmpxchg64() and its _acquire/_release/_relaxed variants.
 *
 * Each variant is bound to the first available implementation in its
 * #if chain. When nothing suitable is defined, the macro expands to a call
 * to a raw_cmpxchg64*_not_implemented() function that is only declared here
 * (presumably left undefined so that any use fails to link).
 */

/* Unsuffixed form: arch_cmpxchg64(), else the relaxed op via __atomic_op_fence(). */
#ifdef arch_cmpxchg64
#define raw_cmpxchg64 arch_cmpxchg64
#elif defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64(...) __atomic_op_fence(arch_cmpxchg64, __VA_ARGS__)
#else
extern void raw_cmpxchg64_not_implemented(void);
#define raw_cmpxchg64(...) raw_cmpxchg64_not_implemented()
#endif /* raw_cmpxchg64 */

/* Acquire form: native, else relaxed via __atomic_op_acquire(), else arch_cmpxchg64(). */
#ifdef arch_cmpxchg64_acquire
#define raw_cmpxchg64_acquire arch_cmpxchg64_acquire
#elif defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64_acquire(...) __atomic_op_acquire(arch_cmpxchg64, __VA_ARGS__)
#elif defined(arch_cmpxchg64)
#define raw_cmpxchg64_acquire arch_cmpxchg64
#else
extern void raw_cmpxchg64_acquire_not_implemented(void);
#define raw_cmpxchg64_acquire(...) raw_cmpxchg64_acquire_not_implemented()
#endif /* raw_cmpxchg64_acquire */

/* Release form: native, else relaxed via __atomic_op_release(), else arch_cmpxchg64(). */
#ifdef arch_cmpxchg64_release
#define raw_cmpxchg64_release arch_cmpxchg64_release
#elif defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64_release(...) __atomic_op_release(arch_cmpxchg64, __VA_ARGS__)
#elif defined(arch_cmpxchg64)
#define raw_cmpxchg64_release arch_cmpxchg64
#else
extern void raw_cmpxchg64_release_not_implemented(void);
#define raw_cmpxchg64_release(...) raw_cmpxchg64_release_not_implemented()
#endif /* raw_cmpxchg64_release */

/* Relaxed form: native, else fall back to arch_cmpxchg64(). */
#ifdef arch_cmpxchg64_relaxed
#define raw_cmpxchg64_relaxed arch_cmpxchg64_relaxed
#elif defined(arch_cmpxchg64)
#define raw_cmpxchg64_relaxed arch_cmpxchg64
#else
extern void raw_cmpxchg64_relaxed_not_implemented(void);
#define raw_cmpxchg64_relaxed(...) raw_cmpxchg64_relaxed_not_implemented()
#endif /* raw_cmpxchg64_relaxed */

/*
 * raw_cmpxchg128() and its _acquire/_release/_relaxed variants.
 *
 * Each variant is bound to the first available implementation in its
 * #if chain. When nothing suitable is defined, the macro expands to a call
 * to a raw_cmpxchg128*_not_implemented() function that is only declared here
 * (presumably left undefined so that any use fails to link).
 */

/* Unsuffixed form: arch_cmpxchg128(), else the relaxed op via __atomic_op_fence(). */
#ifdef arch_cmpxchg128
#define raw_cmpxchg128 arch_cmpxchg128
#elif defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128(...) __atomic_op_fence(arch_cmpxchg128, __VA_ARGS__)
#else
extern void raw_cmpxchg128_not_implemented(void);
#define raw_cmpxchg128(...) raw_cmpxchg128_not_implemented()
#endif /* raw_cmpxchg128 */

/* Acquire form: native, else relaxed via __atomic_op_acquire(), else arch_cmpxchg128(). */
#ifdef arch_cmpxchg128_acquire
#define raw_cmpxchg128_acquire arch_cmpxchg128_acquire
#elif defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128_acquire(...) __atomic_op_acquire(arch_cmpxchg128, __VA_ARGS__)
#elif defined(arch_cmpxchg128)
#define raw_cmpxchg128_acquire arch_cmpxchg128
#else
extern void raw_cmpxchg128_acquire_not_implemented(void);
#define raw_cmpxchg128_acquire(...) raw_cmpxchg128_acquire_not_implemented()
#endif /* raw_cmpxchg128_acquire */

/* Release form: native, else relaxed via __atomic_op_release(), else arch_cmpxchg128(). */
#ifdef arch_cmpxchg128_release
#define raw_cmpxchg128_release arch_cmpxchg128_release
#elif defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128_release(...) __atomic_op_release(arch_cmpxchg128, __VA_ARGS__)
#elif defined(arch_cmpxchg128)
#define raw_cmpxchg128_release arch_cmpxchg128
#else
extern void raw_cmpxchg128_release_not_implemented(void);
#define raw_cmpxchg128_release(...) raw_cmpxchg128_release_not_implemented()
#endif /* raw_cmpxchg128_release */

/* Relaxed form: native, else fall back to arch_cmpxchg128(). */
#ifdef arch_cmpxchg128_relaxed
#define raw_cmpxchg128_relaxed arch_cmpxchg128_relaxed
#elif defined(arch_cmpxchg128)
#define raw_cmpxchg128_relaxed arch_cmpxchg128
#else
extern void raw_cmpxchg128_relaxed_not_implemented(void);
#define raw_cmpxchg128_relaxed(...) raw_cmpxchg128_relaxed_not_implemented()
#endif /* raw_cmpxchg128_relaxed */

/*
 * raw_try_cmpxchg() and its _acquire/_release/_relaxed variants.
 *
 * An architecture-provided arch_try_cmpxchg*() is used when one is defined.
 * Otherwise the operation is emulated on top of the matching raw_cmpxchg*():
 * the value at @_oldp is used as the expected value, and when the exchange
 * does not happen the value actually found is written back through @_oldp.
 * The emulation evaluates to non-zero on success and zero on failure.
 */

/* Unsuffixed form. */
#ifdef arch_try_cmpxchg
#define raw_try_cmpxchg arch_try_cmpxchg
#elif defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg(...) __atomic_op_fence(arch_try_cmpxchg, __VA_ARGS__)
#else
#define raw_try_cmpxchg(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg */

/* Acquire form. */
#ifdef arch_try_cmpxchg_acquire
#define raw_try_cmpxchg_acquire arch_try_cmpxchg_acquire
#elif defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg_acquire(...) __atomic_op_acquire(arch_try_cmpxchg, __VA_ARGS__)
#elif defined(arch_try_cmpxchg)
#define raw_try_cmpxchg_acquire arch_try_cmpxchg
#else
#define raw_try_cmpxchg_acquire(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg_acquire((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg_acquire */

/* Release form. */
#ifdef arch_try_cmpxchg_release
#define raw_try_cmpxchg_release arch_try_cmpxchg_release
#elif defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg_release(...) __atomic_op_release(arch_try_cmpxchg, __VA_ARGS__)
#elif defined(arch_try_cmpxchg)
#define raw_try_cmpxchg_release arch_try_cmpxchg
#else
#define raw_try_cmpxchg_release(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg_release((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg_release */

/* Relaxed form. */
#ifdef arch_try_cmpxchg_relaxed
#define raw_try_cmpxchg_relaxed arch_try_cmpxchg_relaxed
#elif defined(arch_try_cmpxchg)
#define raw_try_cmpxchg_relaxed arch_try_cmpxchg
#else
#define raw_try_cmpxchg_relaxed(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg_relaxed((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg_relaxed */

/*
 * raw_try_cmpxchg64() and its _acquire/_release/_relaxed variants.
 *
 * An architecture-provided arch_try_cmpxchg64*() is used when one is defined.
 * Otherwise the operation is emulated on top of the matching raw_cmpxchg64*():
 * the value at @_oldp is used as the expected value, and when the exchange
 * does not happen the value actually found is written back through @_oldp.
 * The emulation evaluates to non-zero on success and zero on failure.
 */

/* Unsuffixed form. */
#ifdef arch_try_cmpxchg64
#define raw_try_cmpxchg64 arch_try_cmpxchg64
#elif defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64(...) __atomic_op_fence(arch_try_cmpxchg64, __VA_ARGS__)
#else
#define raw_try_cmpxchg64(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg64((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg64 */

/* Acquire form. */
#ifdef arch_try_cmpxchg64_acquire
#define raw_try_cmpxchg64_acquire arch_try_cmpxchg64_acquire
#elif defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64_acquire(...) __atomic_op_acquire(arch_try_cmpxchg64, __VA_ARGS__)
#elif defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64_acquire arch_try_cmpxchg64
#else
#define raw_try_cmpxchg64_acquire(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg64_acquire((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg64_acquire */

/* Release form. */
#ifdef arch_try_cmpxchg64_release
#define raw_try_cmpxchg64_release arch_try_cmpxchg64_release
#elif defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64_release(...) __atomic_op_release(arch_try_cmpxchg64, __VA_ARGS__)
#elif defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64_release arch_try_cmpxchg64
#else
#define raw_try_cmpxchg64_release(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg64_release((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg64_release */

/* Relaxed form. */
#ifdef arch_try_cmpxchg64_relaxed
#define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64_relaxed
#elif defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64
#else
#define raw_try_cmpxchg64_relaxed(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg64_relaxed((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg64_relaxed */

/*
 * raw_try_cmpxchg128() and its _acquire/_release/_relaxed variants.
 *
 * An architecture-provided arch_try_cmpxchg128*() is used when one is defined.
 * Otherwise the operation is emulated on top of the matching raw_cmpxchg128*():
 * the value at @_oldp is used as the expected value, and when the exchange
 * does not happen the value actually found is written back through @_oldp.
 * The emulation evaluates to non-zero on success and zero on failure.
 */

/* Unsuffixed form. */
#ifdef arch_try_cmpxchg128
#define raw_try_cmpxchg128 arch_try_cmpxchg128
#elif defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128(...) __atomic_op_fence(arch_try_cmpxchg128, __VA_ARGS__)
#else
#define raw_try_cmpxchg128(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg128((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg128 */

/* Acquire form. */
#ifdef arch_try_cmpxchg128_acquire
#define raw_try_cmpxchg128_acquire arch_try_cmpxchg128_acquire
#elif defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128_acquire(...) __atomic_op_acquire(arch_try_cmpxchg128, __VA_ARGS__)
#elif defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128_acquire arch_try_cmpxchg128
#else
#define raw_try_cmpxchg128_acquire(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg128_acquire((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg128_acquire */

/* Release form. */
#ifdef arch_try_cmpxchg128_release
#define raw_try_cmpxchg128_release arch_try_cmpxchg128_release
#elif defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128_release(...) __atomic_op_release(arch_try_cmpxchg128, __VA_ARGS__)
#elif defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128_release arch_try_cmpxchg128
#else
#define raw_try_cmpxchg128_release(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg128_release((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg128_release */

/* Relaxed form. */
#ifdef arch_try_cmpxchg128_relaxed
#define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128_relaxed
#elif defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128
#else
#define raw_try_cmpxchg128_relaxed(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg128_relaxed((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif /* raw_try_cmpxchg128_relaxed */

/* raw_cmpxchg_local() is a direct alias for the architecture's implementation. */
#define raw_cmpxchg_local arch_cmpxchg_local

/*
 * raw_try_cmpxchg_local(): use arch_try_cmpxchg_local() when it exists,
 * otherwise emulate it with raw_cmpxchg_local(). The emulation takes the
 * expected value from @_oldp, writes the value actually found back through
 * @_oldp when the exchange does not happen, and evaluates to non-zero only
 * on success.
 */
#ifndef arch_try_cmpxchg_local
#define raw_try_cmpxchg_local(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg_local((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#else
#define raw_try_cmpxchg_local arch_try_cmpxchg_local
#endif /* arch_try_cmpxchg_local */

/* raw_cmpxchg64_local() is a direct alias for the architecture's implementation. */
#define raw_cmpxchg64_local arch_cmpxchg64_local

/*
 * raw_try_cmpxchg64_local(): use arch_try_cmpxchg64_local() when it exists,
 * otherwise emulate it with raw_cmpxchg64_local(). The emulation takes the
 * expected value from @_oldp, writes the value actually found back through
 * @_oldp when the exchange does not happen, and evaluates to non-zero only
 * on success.
 */
#ifndef arch_try_cmpxchg64_local
#define raw_try_cmpxchg64_local(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg64_local((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#else
#define raw_try_cmpxchg64_local arch_try_cmpxchg64_local
#endif /* arch_try_cmpxchg64_local */

/* raw_cmpxchg128_local() is a direct alias for the architecture's implementation. */
#define raw_cmpxchg128_local arch_cmpxchg128_local

/*
 * raw_try_cmpxchg128_local(): use arch_try_cmpxchg128_local() when it exists,
 * otherwise emulate it with raw_cmpxchg128_local(). The emulation takes the
 * expected value from @_oldp, writes the value actually found back through
 * @_oldp when the exchange does not happen, and evaluates to non-zero only
 * on success.
 */
#ifndef arch_try_cmpxchg128_local
#define raw_try_cmpxchg128_local(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_cmpxchg128_local((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#else
#define raw_try_cmpxchg128_local arch_try_cmpxchg128_local
#endif /* arch_try_cmpxchg128_local */

/* raw_sync_cmpxchg() is a direct alias for the architecture's implementation. */
#define raw_sync_cmpxchg arch_sync_cmpxchg

/*
 * raw_sync_try_cmpxchg(): use arch_sync_try_cmpxchg() when it exists,
 * otherwise emulate it with raw_sync_cmpxchg(). The emulation takes the
 * expected value from @_oldp, writes the value actually found back through
 * @_oldp when the exchange does not happen, and evaluates to non-zero only
 * on success.
 */
#ifndef arch_sync_try_cmpxchg
#define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp); \
        typeof(*(_ptr)) ___o = *___op; \
        typeof(*(_ptr)) ___r; \
        \
        ___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#else
#define raw_sync_try_cmpxchg arch_sync_try_cmpxchg
#endif /* arch_sync_try_cmpxchg */

/**
 * raw_atomic_read() - relaxed atomic load of an atomic_t
 * @v: pointer to atomic_t
 *
 * Atomically loads the value of @v with relaxed ordering. This is a direct
 * wrapper around arch_atomic_read().
 *
 * Safe to use in noinstr code; prefer atomic_read() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline int raw_atomic_read(const atomic_t *v)
{
        return arch_atomic_read(v);
}

/**
 * raw_atomic_read_acquire() - atomic load of an atomic_t with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically loads the value of @v with acquire ordering. When the
 * architecture does not define arch_atomic_read_acquire(), the load is done
 * with smp_load_acquire() if atomic_t is a native word, and otherwise with a
 * relaxed load followed by __atomic_acquire_fence().
 *
 * Safe to use in noinstr code; prefer atomic_read_acquire() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline int raw_atomic_read_acquire(const atomic_t *v)
{
#ifdef arch_atomic_read_acquire
        return arch_atomic_read_acquire(v);
#else
        int val;

        /* Native-word counter: one load-acquire is enough. */
        if (__native_word(atomic_t))
                return smp_load_acquire(&v->counter);

        /* Otherwise order a relaxed load with a trailing acquire fence. */
        val = raw_atomic_read(v);
        __atomic_acquire_fence();
        return val;
#endif
}

/**
 * raw_atomic_set() - relaxed atomic store to an atomic_t
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Atomically sets @v to @i with relaxed ordering. This is a direct wrapper
 * around arch_atomic_set().
 *
 * Safe to use in noinstr code; prefer atomic_set() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_set(atomic_t *v, int i)
{
        arch_atomic_set(v, i);
}

/**
 * raw_atomic_set_release() - atomic store to an atomic_t with release ordering
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Atomically sets @v to @i with release ordering. When the architecture does
 * not define arch_atomic_set_release(), the store is done with
 * smp_store_release() if atomic_t is a native word, and otherwise with
 * __atomic_release_fence() followed by a relaxed store.
 *
 * Safe to use in noinstr code; prefer atomic_set_release() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_set_release(atomic_t *v, int i)
{
#ifdef arch_atomic_set_release
        arch_atomic_set_release(v, i);
#else
        /* Non-native counter: release fence first, then a relaxed store. */
        if (!__native_word(atomic_t)) {
                __atomic_release_fence();
                raw_atomic_set(v, i);
                return;
        }

        smp_store_release(&v->counter, i);
#endif
}

/**
 * raw_atomic_add() - relaxed atomic add to an atomic_t
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering. This is a direct
 * wrapper around arch_atomic_add().
 *
 * Safe to use in noinstr code; prefer atomic_add() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_add(int i, atomic_t *v)
{
        arch_atomic_add(i, v);
}

/**
 * raw_atomic_add_return() - atomic add returning the new value, full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with full ordering. Uses
 * arch_atomic_add_return() when the architecture defines it; otherwise the
 * relaxed variant is wrapped in __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic_add_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return(int i, atomic_t *v)
{
#if !defined(arch_atomic_add_return) && !defined(arch_atomic_add_return_relaxed)
#error "Unable to define raw_atomic_add_return"
#elif defined(arch_atomic_add_return)
        return arch_atomic_add_return(i, v);
#else
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_add_return_relaxed(i, v);
        __atomic_post_full_fence();
        return val;
#endif
}

/**
 * raw_atomic_add_return_acquire() - atomic add returning the new value, acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering. Preference order:
 * arch_atomic_add_return_acquire(), then the relaxed variant followed by
 * __atomic_acquire_fence(), then arch_atomic_add_return(). The build fails
 * if none is defined.
 *
 * Safe to use in noinstr code; prefer atomic_add_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_add_return_acquire
        return arch_atomic_add_return_acquire(i, v);
#elif defined(arch_atomic_add_return_relaxed)
        int val;

        val = arch_atomic_add_return_relaxed(i, v);
        __atomic_acquire_fence();
        return val;
#elif defined(arch_atomic_add_return)
        return arch_atomic_add_return(i, v);
#else
#error "Unable to define raw_atomic_add_return_acquire"
#endif
}

/**
 * raw_atomic_add_return_release() - atomic add returning the new value, release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with release ordering. Preference order:
 * arch_atomic_add_return_release(), then __atomic_release_fence() followed by
 * the relaxed variant, then arch_atomic_add_return(). The build fails if
 * none is defined.
 *
 * Safe to use in noinstr code; prefer atomic_add_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return_release(int i, atomic_t *v)
{
#ifdef arch_atomic_add_return_release
        return arch_atomic_add_return_release(i, v);
#elif defined(arch_atomic_add_return_relaxed)
        int val;

        __atomic_release_fence();
        val = arch_atomic_add_return_relaxed(i, v);
        return val;
#elif defined(arch_atomic_add_return)
        return arch_atomic_add_return(i, v);
#else
#error "Unable to define raw_atomic_add_return_release"
#endif
}

/**
 * raw_atomic_add_return_relaxed() - atomic add returning the new value, relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering. Uses
 * arch_atomic_add_return_relaxed() when defined, otherwise falls back to
 * arch_atomic_add_return(). The build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic_add_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return_relaxed(int i, atomic_t *v)
{
#if !defined(arch_atomic_add_return_relaxed) && !defined(arch_atomic_add_return)
#error "Unable to define raw_atomic_add_return_relaxed"
#elif defined(arch_atomic_add_return_relaxed)
        return arch_atomic_add_return_relaxed(i, v);
#else
        return arch_atomic_add_return(i, v);
#endif
}

/**
 * raw_atomic_fetch_add() - atomic add returning the old value, full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with full ordering. Uses
 * arch_atomic_fetch_add() when the architecture defines it; otherwise the
 * relaxed variant is wrapped in __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_add() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_add(int i, atomic_t *v)
{
#if !defined(arch_atomic_fetch_add) && !defined(arch_atomic_fetch_add_relaxed)
#error "Unable to define raw_atomic_fetch_add"
#elif defined(arch_atomic_fetch_add)
        return arch_atomic_fetch_add(i, v);
#else
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_fetch_add_relaxed(i, v);
        __atomic_post_full_fence();
        return val;
#endif
}

/**
 * raw_atomic_fetch_add_acquire() - atomic add returning the old value, acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering. Preference order:
 * arch_atomic_fetch_add_acquire(), then the relaxed variant followed by
 * __atomic_acquire_fence(), then arch_atomic_fetch_add(). The build fails if
 * none is defined.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_add_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_add_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_add_acquire
        return arch_atomic_fetch_add_acquire(i, v);
#elif defined(arch_atomic_fetch_add_relaxed)
        int val;

        val = arch_atomic_fetch_add_relaxed(i, v);
        __atomic_acquire_fence();
        return val;
#elif defined(arch_atomic_fetch_add)
        return arch_atomic_fetch_add(i, v);
#else
#error "Unable to define raw_atomic_fetch_add_acquire"
#endif
}

/**
 * raw_atomic_fetch_add_release() - atomic add returning the old value, release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with release ordering. Preference order:
 * arch_atomic_fetch_add_release(), then __atomic_release_fence() followed by
 * the relaxed variant, then arch_atomic_fetch_add(). The build fails if none
 * is defined.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_add_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_add_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_add_release
        return arch_atomic_fetch_add_release(i, v);
#elif defined(arch_atomic_fetch_add_relaxed)
        int val;

        __atomic_release_fence();
        val = arch_atomic_fetch_add_relaxed(i, v);
        return val;
#elif defined(arch_atomic_fetch_add)
        return arch_atomic_fetch_add(i, v);
#else
#error "Unable to define raw_atomic_fetch_add_release"
#endif
}

/**
 * raw_atomic_fetch_add_relaxed() - atomic add returning the old value, relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering. Uses
 * arch_atomic_fetch_add_relaxed() when defined, otherwise falls back to
 * arch_atomic_fetch_add(). The build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_add_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_add_relaxed(int i, atomic_t *v)
{
#if !defined(arch_atomic_fetch_add_relaxed) && !defined(arch_atomic_fetch_add)
#error "Unable to define raw_atomic_fetch_add_relaxed"
#elif defined(arch_atomic_fetch_add_relaxed)
        return arch_atomic_fetch_add_relaxed(i, v);
#else
        return arch_atomic_fetch_add(i, v);
#endif
}

/**
 * raw_atomic_sub() - relaxed atomic subtract from an atomic_t
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering. This is a direct
 * wrapper around arch_atomic_sub().
 *
 * Safe to use in noinstr code; prefer atomic_sub() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_sub(int i, atomic_t *v)
{
        arch_atomic_sub(i, v);
}

/**
 * raw_atomic_sub_return() - atomic subtract returning the new value, full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with full ordering. Uses
 * arch_atomic_sub_return() when the architecture defines it; otherwise the
 * relaxed variant is wrapped in __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic_sub_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_sub_return(int i, atomic_t *v)
{
#if !defined(arch_atomic_sub_return) && !defined(arch_atomic_sub_return_relaxed)
#error "Unable to define raw_atomic_sub_return"
#elif defined(arch_atomic_sub_return)
        return arch_atomic_sub_return(i, v);
#else
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_sub_return_relaxed(i, v);
        __atomic_post_full_fence();
        return val;
#endif
}

/**
 * raw_atomic_sub_return_acquire() - atomic subtract returning the new value, acquire ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering. Preference order:
 * arch_atomic_sub_return_acquire(), then the relaxed variant followed by
 * __atomic_acquire_fence(), then arch_atomic_sub_return(). The build fails
 * if none is defined.
 *
 * Safe to use in noinstr code; prefer atomic_sub_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_sub_return_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_return_acquire
        return arch_atomic_sub_return_acquire(i, v);
#elif defined(arch_atomic_sub_return_relaxed)
        int val;

        val = arch_atomic_sub_return_relaxed(i, v);
        __atomic_acquire_fence();
        return val;
#elif defined(arch_atomic_sub_return)
        return arch_atomic_sub_return(i, v);
#else
#error "Unable to define raw_atomic_sub_return_acquire"
#endif
}

/**
 * raw_atomic_sub_return_release() - atomic subtract returning the new value, release ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with release ordering. Preference order:
 * arch_atomic_sub_return_release(), then __atomic_release_fence() followed by
 * the relaxed variant, then arch_atomic_sub_return(). The build fails if
 * none is defined.
 *
 * Safe to use in noinstr code; prefer atomic_sub_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_sub_return_release(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_return_release
        return arch_atomic_sub_return_release(i, v);
#elif defined(arch_atomic_sub_return_relaxed)
        int val;

        __atomic_release_fence();
        val = arch_atomic_sub_return_relaxed(i, v);
        return val;
#elif defined(arch_atomic_sub_return)
        return arch_atomic_sub_return(i, v);
#else
#error "Unable to define raw_atomic_sub_return_release"
#endif
}

/**
 * raw_atomic_sub_return_relaxed() - atomic subtract returning the new value, relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering. Uses
 * arch_atomic_sub_return_relaxed() when defined, otherwise falls back to
 * arch_atomic_sub_return(). The build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic_sub_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_sub_return_relaxed(int i, atomic_t *v)
{
#if !defined(arch_atomic_sub_return_relaxed) && !defined(arch_atomic_sub_return)
#error "Unable to define raw_atomic_sub_return_relaxed"
#elif defined(arch_atomic_sub_return_relaxed)
        return arch_atomic_sub_return_relaxed(i, v);
#else
        return arch_atomic_sub_return(i, v);
#endif
}

/**
 * raw_atomic_fetch_sub() - atomic subtract returning the old value, full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with full ordering. Uses
 * arch_atomic_fetch_sub() when the architecture defines it; otherwise the
 * relaxed variant is wrapped in __atomic_pre_full_fence() and
 * __atomic_post_full_fence(). The build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_sub() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_sub(int i, atomic_t *v)
{
#if !defined(arch_atomic_fetch_sub) && !defined(arch_atomic_fetch_sub_relaxed)
#error "Unable to define raw_atomic_fetch_sub"
#elif defined(arch_atomic_fetch_sub)
        return arch_atomic_fetch_sub(i, v);
#else
        int val;

        __atomic_pre_full_fence();
        val = arch_atomic_fetch_sub_relaxed(i, v);
        __atomic_post_full_fence();
        return val;
#endif
}

/**
 * raw_atomic_fetch_sub_acquire() - atomic subtract returning the old value, acquire ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering. Preference order:
 * arch_atomic_fetch_sub_acquire(), then the relaxed variant followed by
 * __atomic_acquire_fence(), then arch_atomic_fetch_sub(). The build fails if
 * none is defined.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_sub_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_sub_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_sub_acquire
        return arch_atomic_fetch_sub_acquire(i, v);
#elif defined(arch_atomic_fetch_sub_relaxed)
        int val;

        val = arch_atomic_fetch_sub_relaxed(i, v);
        __atomic_acquire_fence();
        return val;
#elif defined(arch_atomic_fetch_sub)
        return arch_atomic_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic_fetch_sub_acquire"
#endif
}

/**
 * raw_atomic_fetch_sub_release() - atomic subtract returning the old value, release ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with release ordering. Preference order:
 * arch_atomic_fetch_sub_release(), then __atomic_release_fence() followed by
 * the relaxed variant, then arch_atomic_fetch_sub(). The build fails if none
 * is defined.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_sub_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_sub_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_sub_release
        return arch_atomic_fetch_sub_release(i, v);
#elif defined(arch_atomic_fetch_sub_relaxed)
        int val;

        __atomic_release_fence();
        val = arch_atomic_fetch_sub_relaxed(i, v);
        return val;
#elif defined(arch_atomic_fetch_sub)
        return arch_atomic_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic_fetch_sub_release"
#endif
}

/**
 * raw_atomic_fetch_sub_relaxed() - relaxed atomic fetch-and-subtract
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v - @i) in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_sub_relaxed() is
 * the preferred interface.
 *
 * Return: The value @v held before the subtraction.
 */
static __always_inline int
raw_atomic_fetch_sub_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_sub_relaxed
        return arch_atomic_fetch_sub_relaxed(i, v);
#elif defined(arch_atomic_fetch_sub)
        /* No relaxed form is defined: use the plain op instead. */
        return arch_atomic_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic_fetch_sub_relaxed"
#endif
}

/**
 * raw_atomic_inc() - relaxed atomic increment
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v + 1) in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_inc() is the preferred
 * interface.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_inc(atomic_t *v)
{
#ifndef arch_atomic_inc
        /* No dedicated increment op is defined: add 1 instead. */
        raw_atomic_add(1, v);
#else
        arch_atomic_inc(v);
#endif
}

/**
 * raw_atomic_inc_return() - fully ordered atomic increment-and-return
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v + 1) in one atomic step, with full ordering.
 *
 * May be called from noinstr code; elsewhere atomic_inc_return() is the
 * preferred interface.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline int
raw_atomic_inc_return(atomic_t *v)
{
#ifdef arch_atomic_inc_return
        return arch_atomic_inc_return(v);
#elif defined(arch_atomic_inc_return_relaxed)
        int updated;

        /* Only the relaxed op is defined: bracket it with the full fences. */
        __atomic_pre_full_fence();
        updated = arch_atomic_inc_return_relaxed(v);
        __atomic_post_full_fence();

        return updated;
#else
        /* No increment op at all: express it as an add of 1. */
        return raw_atomic_add_return(1, v);
#endif
}

/**
 * raw_atomic_inc_return_acquire() - acquire-ordered atomic increment-and-return
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v + 1) in one atomic step, with acquire ordering.
 *
 * May be called from noinstr code; elsewhere atomic_inc_return_acquire() is
 * the preferred interface.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline int
raw_atomic_inc_return_acquire(atomic_t *v)
{
#ifdef arch_atomic_inc_return_acquire
        return arch_atomic_inc_return_acquire(v);
#elif defined(arch_atomic_inc_return_relaxed)
        int updated;

        /* Relaxed op first, then the acquire fence. */
        updated = arch_atomic_inc_return_relaxed(v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic_inc_return)
        /* Neither acquire nor relaxed form is defined: use the plain op. */
        return arch_atomic_inc_return(v);
#else
        /* No increment op at all: express it as an acquire add of 1. */
        return raw_atomic_add_return_acquire(1, v);
#endif
}

/**
 * raw_atomic_inc_return_release() - release-ordered atomic increment-and-return
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v + 1) in one atomic step, with release ordering.
 *
 * May be called from noinstr code; elsewhere atomic_inc_return_release() is
 * the preferred interface.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline int
raw_atomic_inc_return_release(atomic_t *v)
{
#ifdef arch_atomic_inc_return_release
        return arch_atomic_inc_return_release(v);
#elif defined(arch_atomic_inc_return_relaxed)
        /* The release fence goes ahead of the relaxed op. */
        __atomic_release_fence();
        return arch_atomic_inc_return_relaxed(v);
#elif defined(arch_atomic_inc_return)
        /* Neither release nor relaxed form is defined: use the plain op. */
        return arch_atomic_inc_return(v);
#else
        /* No increment op at all: express it as a release add of 1. */
        return raw_atomic_add_return_release(1, v);
#endif
}

/**
 * raw_atomic_inc_return_relaxed() - relaxed atomic increment-and-return
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v + 1) in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_inc_return_relaxed() is
 * the preferred interface.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline int
raw_atomic_inc_return_relaxed(atomic_t *v)
{
#ifdef arch_atomic_inc_return_relaxed
        return arch_atomic_inc_return_relaxed(v);
#elif defined(arch_atomic_inc_return)
        /* No relaxed form is defined: use the plain op instead. */
        return arch_atomic_inc_return(v);
#else
        /* No increment op at all: express it as a relaxed add of 1. */
        return raw_atomic_add_return_relaxed(1, v);
#endif
}

/**
 * raw_atomic_fetch_inc() - fully ordered atomic fetch-and-increment
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v + 1) in one atomic step, with full ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_inc() is the
 * preferred interface.
 *
 * Return: The value @v held before the increment.
 */
static __always_inline int
raw_atomic_fetch_inc(atomic_t *v)
{
#ifdef arch_atomic_fetch_inc
        return arch_atomic_fetch_inc(v);
#elif defined(arch_atomic_fetch_inc_relaxed)
        int old;

        /* Only the relaxed op is defined: bracket it with the full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic_fetch_inc_relaxed(v);
        __atomic_post_full_fence();

        return old;
#else
        /* No increment op at all: express it as a fetch-add of 1. */
        return raw_atomic_fetch_add(1, v);
#endif
}

/**
 * raw_atomic_fetch_inc_acquire() - acquire-ordered atomic fetch-and-increment
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v + 1) in one atomic step, with acquire ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_inc_acquire() is
 * the preferred interface.
 *
 * Return: The value @v held before the increment.
 */
static __always_inline int
raw_atomic_fetch_inc_acquire(atomic_t *v)
{
#ifdef arch_atomic_fetch_inc_acquire
        return arch_atomic_fetch_inc_acquire(v);
#elif defined(arch_atomic_fetch_inc_relaxed)
        int old;

        /* Relaxed op first, then the acquire fence. */
        old = arch_atomic_fetch_inc_relaxed(v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic_fetch_inc)
        /* Neither acquire nor relaxed form is defined: use the plain op. */
        return arch_atomic_fetch_inc(v);
#else
        /* No increment op at all: express it as an acquire fetch-add of 1. */
        return raw_atomic_fetch_add_acquire(1, v);
#endif
}

/**
 * raw_atomic_fetch_inc_release() - release-ordered atomic fetch-and-increment
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v + 1) in one atomic step, with release ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_inc_release() is
 * the preferred interface.
 *
 * Return: The value @v held before the increment.
 */
static __always_inline int
raw_atomic_fetch_inc_release(atomic_t *v)
{
#ifdef arch_atomic_fetch_inc_release
        return arch_atomic_fetch_inc_release(v);
#elif defined(arch_atomic_fetch_inc_relaxed)
        /* The release fence goes ahead of the relaxed op. */
        __atomic_release_fence();
        return arch_atomic_fetch_inc_relaxed(v);
#elif defined(arch_atomic_fetch_inc)
        /* Neither release nor relaxed form is defined: use the plain op. */
        return arch_atomic_fetch_inc(v);
#else
        /* No increment op at all: express it as a release fetch-add of 1. */
        return raw_atomic_fetch_add_release(1, v);
#endif
}

/**
 * raw_atomic_fetch_inc_relaxed() - relaxed atomic fetch-and-increment
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v + 1) in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_inc_relaxed() is
 * the preferred interface.
 *
 * Return: The value @v held before the increment.
 */
static __always_inline int
raw_atomic_fetch_inc_relaxed(atomic_t *v)
{
#ifdef arch_atomic_fetch_inc_relaxed
        return arch_atomic_fetch_inc_relaxed(v);
#elif defined(arch_atomic_fetch_inc)
        /* No relaxed form is defined: use the plain op instead. */
        return arch_atomic_fetch_inc(v);
#else
        /* No increment op at all: express it as a relaxed fetch-add of 1. */
        return raw_atomic_fetch_add_relaxed(1, v);
#endif
}

/**
 * raw_atomic_dec() - relaxed atomic decrement
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v - 1) in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_dec() is the preferred
 * interface.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_dec(atomic_t *v)
{
#ifndef arch_atomic_dec
        /* No dedicated decrement op is defined: subtract 1 instead. */
        raw_atomic_sub(1, v);
#else
        arch_atomic_dec(v);
#endif
}

/**
 * raw_atomic_dec_return() - fully ordered atomic decrement-and-return
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v - 1) in one atomic step, with full ordering.
 *
 * May be called from noinstr code; elsewhere atomic_dec_return() is the
 * preferred interface.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline int
raw_atomic_dec_return(atomic_t *v)
{
#ifdef arch_atomic_dec_return
        return arch_atomic_dec_return(v);
#elif defined(arch_atomic_dec_return_relaxed)
        int updated;

        /* Only the relaxed op is defined: bracket it with the full fences. */
        __atomic_pre_full_fence();
        updated = arch_atomic_dec_return_relaxed(v);
        __atomic_post_full_fence();

        return updated;
#else
        /* No decrement op at all: express it as a subtraction of 1. */
        return raw_atomic_sub_return(1, v);
#endif
}

/**
 * raw_atomic_dec_return_acquire() - acquire-ordered atomic decrement-and-return
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v - 1) in one atomic step, with acquire ordering.
 *
 * May be called from noinstr code; elsewhere atomic_dec_return_acquire() is
 * the preferred interface.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline int
raw_atomic_dec_return_acquire(atomic_t *v)
{
#ifdef arch_atomic_dec_return_acquire
        return arch_atomic_dec_return_acquire(v);
#elif defined(arch_atomic_dec_return_relaxed)
        int updated;

        /* Relaxed op first, then the acquire fence. */
        updated = arch_atomic_dec_return_relaxed(v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic_dec_return)
        /* Neither acquire nor relaxed form is defined: use the plain op. */
        return arch_atomic_dec_return(v);
#else
        /* No decrement op at all: express it as an acquire subtraction of 1. */
        return raw_atomic_sub_return_acquire(1, v);
#endif
}

/**
 * raw_atomic_dec_return_release() - release-ordered atomic decrement-and-return
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v - 1) in one atomic step, with release ordering.
 *
 * May be called from noinstr code; elsewhere atomic_dec_return_release() is
 * the preferred interface.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline int
raw_atomic_dec_return_release(atomic_t *v)
{
#ifdef arch_atomic_dec_return_release
        return arch_atomic_dec_return_release(v);
#elif defined(arch_atomic_dec_return_relaxed)
        /* The release fence goes ahead of the relaxed op. */
        __atomic_release_fence();
        return arch_atomic_dec_return_relaxed(v);
#elif defined(arch_atomic_dec_return)
        /* Neither release nor relaxed form is defined: use the plain op. */
        return arch_atomic_dec_return(v);
#else
        /* No decrement op at all: express it as a release subtraction of 1. */
        return raw_atomic_sub_return_release(1, v);
#endif
}

/**
 * raw_atomic_dec_return_relaxed() - relaxed atomic decrement-and-return
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v - 1) in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_dec_return_relaxed() is
 * the preferred interface.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline int
raw_atomic_dec_return_relaxed(atomic_t *v)
{
#ifdef arch_atomic_dec_return_relaxed
        return arch_atomic_dec_return_relaxed(v);
#elif defined(arch_atomic_dec_return)
        /* No relaxed form is defined: use the plain op instead. */
        return arch_atomic_dec_return(v);
#else
        /* No decrement op at all: express it as a relaxed subtraction of 1. */
        return raw_atomic_sub_return_relaxed(1, v);
#endif
}

/**
 * raw_atomic_fetch_dec() - fully ordered atomic fetch-and-decrement
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v - 1) in one atomic step, with full ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_dec() is the
 * preferred interface.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline int
raw_atomic_fetch_dec(atomic_t *v)
{
#ifdef arch_atomic_fetch_dec
        return arch_atomic_fetch_dec(v);
#elif defined(arch_atomic_fetch_dec_relaxed)
        int old;

        /* Only the relaxed op is defined: bracket it with the full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic_fetch_dec_relaxed(v);
        __atomic_post_full_fence();

        return old;
#else
        /* No decrement op at all: express it as a fetch-sub of 1. */
        return raw_atomic_fetch_sub(1, v);
#endif
}

/**
 * raw_atomic_fetch_dec_acquire() - acquire-ordered atomic fetch-and-decrement
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v - 1) in one atomic step, with acquire ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_dec_acquire() is
 * the preferred interface.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline int
raw_atomic_fetch_dec_acquire(atomic_t *v)
{
#ifdef arch_atomic_fetch_dec_acquire
        return arch_atomic_fetch_dec_acquire(v);
#elif defined(arch_atomic_fetch_dec_relaxed)
        int old;

        /* Relaxed op first, then the acquire fence. */
        old = arch_atomic_fetch_dec_relaxed(v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic_fetch_dec)
        /* Neither acquire nor relaxed form is defined: use the plain op. */
        return arch_atomic_fetch_dec(v);
#else
        /* No decrement op at all: express it as an acquire fetch-sub of 1. */
        return raw_atomic_fetch_sub_acquire(1, v);
#endif
}

/**
 * raw_atomic_fetch_dec_release() - release-ordered atomic fetch-and-decrement
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v - 1) in one atomic step, with release ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_dec_release() is
 * the preferred interface.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline int
raw_atomic_fetch_dec_release(atomic_t *v)
{
#ifdef arch_atomic_fetch_dec_release
        return arch_atomic_fetch_dec_release(v);
#elif defined(arch_atomic_fetch_dec_relaxed)
        /* The release fence goes ahead of the relaxed op. */
        __atomic_release_fence();
        return arch_atomic_fetch_dec_relaxed(v);
#elif defined(arch_atomic_fetch_dec)
        /* Neither release nor relaxed form is defined: use the plain op. */
        return arch_atomic_fetch_dec(v);
#else
        /* No decrement op at all: express it as a release fetch-sub of 1. */
        return raw_atomic_fetch_sub_release(1, v);
#endif
}

/**
 * raw_atomic_fetch_dec_relaxed() - relaxed atomic fetch-and-decrement
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v - 1) in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_dec_relaxed() is
 * the preferred interface.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline int
raw_atomic_fetch_dec_relaxed(atomic_t *v)
{
#ifdef arch_atomic_fetch_dec_relaxed
        return arch_atomic_fetch_dec_relaxed(v);
#elif defined(arch_atomic_fetch_dec)
        /* No relaxed form is defined: use the plain op instead. */
        return arch_atomic_fetch_dec(v);
#else
        /* No decrement op at all: express it as a relaxed fetch-sub of 1. */
        return raw_atomic_fetch_sub_relaxed(1, v);
#endif
}

/**
 * raw_atomic_and() - atomic bitwise AND with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_and() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_and(int i, atomic_t *v)
{
        /* No fallback path here: arch_atomic_and() is called unconditionally. */
        arch_atomic_and(i, v);
}

/**
 * raw_atomic_fetch_and() - fully ordered atomic fetch-and-AND
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v & @i) in one atomic step, with full ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_and() is the
 * preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_and(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_and
        return arch_atomic_fetch_and(i, v);
#elif defined(arch_atomic_fetch_and_relaxed)
        int old;

        /* Only the relaxed op is defined: bracket it with the full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic_fetch_and_relaxed(i, v);
        __atomic_post_full_fence();

        return old;
#else
#error "Unable to define raw_atomic_fetch_and"
#endif
}

/**
 * raw_atomic_fetch_and_acquire() - acquire-ordered atomic fetch-and-AND
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v & @i) in one atomic step, with acquire ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_and_acquire() is
 * the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_and_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_and_acquire
        return arch_atomic_fetch_and_acquire(i, v);
#elif defined(arch_atomic_fetch_and_relaxed)
        int old;

        /* Relaxed op first, then the acquire fence. */
        old = arch_atomic_fetch_and_relaxed(i, v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic_fetch_and)
        /* Neither acquire nor relaxed form is defined: use the plain op. */
        return arch_atomic_fetch_and(i, v);
#else
#error "Unable to define raw_atomic_fetch_and_acquire"
#endif
}

/**
 * raw_atomic_fetch_and_release() - release-ordered atomic fetch-and-AND
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v & @i) in one atomic step, with release ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_and_release() is
 * the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_and_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_and_release
        return arch_atomic_fetch_and_release(i, v);
#elif defined(arch_atomic_fetch_and_relaxed)
        /* The release fence goes ahead of the relaxed op. */
        __atomic_release_fence();
        return arch_atomic_fetch_and_relaxed(i, v);
#elif defined(arch_atomic_fetch_and)
        /* Neither release nor relaxed form is defined: use the plain op. */
        return arch_atomic_fetch_and(i, v);
#else
#error "Unable to define raw_atomic_fetch_and_release"
#endif
}

/**
 * raw_atomic_fetch_and_relaxed() - relaxed atomic fetch-and-AND
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v & @i) in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_and_relaxed() is
 * the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_and_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_and_relaxed
        return arch_atomic_fetch_and_relaxed(i, v);
#elif defined(arch_atomic_fetch_and)
        /* No relaxed form is defined: use the plain op instead. */
        return arch_atomic_fetch_and(i, v);
#else
#error "Unable to define raw_atomic_fetch_and_relaxed"
#endif
}

/**
 * raw_atomic_andnot() - relaxed atomic bitwise AND NOT
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v & ~@i) in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_andnot() is the
 * preferred interface.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_andnot(int i, atomic_t *v)
{
#ifndef arch_atomic_andnot
        /* No dedicated AND NOT op is defined: AND with the complement of @i. */
        raw_atomic_and(~i, v);
#else
        arch_atomic_andnot(i, v);
#endif
}

/**
 * raw_atomic_fetch_andnot() - fully ordered atomic fetch-and-AND-NOT
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v & ~@i) in one atomic step, with full ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_andnot() is the
 * preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_andnot(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_andnot
        return arch_atomic_fetch_andnot(i, v);
#elif defined(arch_atomic_fetch_andnot_relaxed)
        int old;

        /* Only the relaxed op is defined: bracket it with the full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic_fetch_andnot_relaxed(i, v);
        __atomic_post_full_fence();

        return old;
#else
        /* No AND NOT op at all: AND with the complement of @i. */
        return raw_atomic_fetch_and(~i, v);
#endif
}

/**
 * raw_atomic_fetch_andnot_acquire() - acquire-ordered atomic fetch-and-AND-NOT
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v & ~@i) in one atomic step, with acquire ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_andnot_acquire()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_andnot_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_andnot_acquire
        return arch_atomic_fetch_andnot_acquire(i, v);
#elif defined(arch_atomic_fetch_andnot_relaxed)
        int old;

        /* Relaxed op first, then the acquire fence. */
        old = arch_atomic_fetch_andnot_relaxed(i, v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic_fetch_andnot)
        /* Neither acquire nor relaxed form is defined: use the plain op. */
        return arch_atomic_fetch_andnot(i, v);
#else
        /* No AND NOT op at all: acquire AND with the complement of @i. */
        return raw_atomic_fetch_and_acquire(~i, v);
#endif
}

/**
 * raw_atomic_fetch_andnot_release() - release-ordered atomic fetch-and-AND-NOT
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v & ~@i) in one atomic step, with release ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_andnot_release()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_andnot_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_andnot_release
        return arch_atomic_fetch_andnot_release(i, v);
#elif defined(arch_atomic_fetch_andnot_relaxed)
        /* The release fence goes ahead of the relaxed op. */
        __atomic_release_fence();
        return arch_atomic_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic_fetch_andnot)
        /* Neither release nor relaxed form is defined: use the plain op. */
        return arch_atomic_fetch_andnot(i, v);
#else
        /* No AND NOT op at all: release AND with the complement of @i. */
        return raw_atomic_fetch_and_release(~i, v);
#endif
}

/**
 * raw_atomic_fetch_andnot_relaxed() - relaxed atomic fetch-and-AND-NOT
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v & ~@i) in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_andnot_relaxed()
 * is the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_andnot_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_andnot_relaxed
        return arch_atomic_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic_fetch_andnot)
        /* No relaxed form is defined: use the plain op instead. */
        return arch_atomic_fetch_andnot(i, v);
#else
        /* No AND NOT op at all: relaxed AND with the complement of @i. */
        return raw_atomic_fetch_and_relaxed(~i, v);
#endif
}

/**
 * raw_atomic_or() - atomic bitwise OR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_or() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_or(int i, atomic_t *v)
{
        /* No fallback path here: arch_atomic_or() is called unconditionally. */
        arch_atomic_or(i, v);
}

/**
 * raw_atomic_fetch_or() - fully ordered atomic fetch-and-OR
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v | @i) in one atomic step, with full ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_or() is the
 * preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_or(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_or
        return arch_atomic_fetch_or(i, v);
#elif defined(arch_atomic_fetch_or_relaxed)
        int old;

        /* Only the relaxed op is defined: bracket it with the full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic_fetch_or_relaxed(i, v);
        __atomic_post_full_fence();

        return old;
#else
#error "Unable to define raw_atomic_fetch_or"
#endif
}

/**
 * raw_atomic_fetch_or_acquire() - acquire-ordered atomic fetch-and-OR
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v | @i) in one atomic step, with acquire ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_or_acquire() is
 * the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_or_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_or_acquire
        return arch_atomic_fetch_or_acquire(i, v);
#elif defined(arch_atomic_fetch_or_relaxed)
        int old;

        /* Relaxed op first, then the acquire fence. */
        old = arch_atomic_fetch_or_relaxed(i, v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic_fetch_or)
        /* Neither acquire nor relaxed form is defined: use the plain op. */
        return arch_atomic_fetch_or(i, v);
#else
#error "Unable to define raw_atomic_fetch_or_acquire"
#endif
}

/**
 * raw_atomic_fetch_or_release() - release-ordered atomic fetch-and-OR
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v | @i) in one atomic step, with release ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_or_release() is
 * the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_or_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_or_release
        return arch_atomic_fetch_or_release(i, v);
#elif defined(arch_atomic_fetch_or_relaxed)
        /* The release fence goes ahead of the relaxed op. */
        __atomic_release_fence();
        return arch_atomic_fetch_or_relaxed(i, v);
#elif defined(arch_atomic_fetch_or)
        /* Neither release nor relaxed form is defined: use the plain op. */
        return arch_atomic_fetch_or(i, v);
#else
#error "Unable to define raw_atomic_fetch_or_release"
#endif
}

/**
 * raw_atomic_fetch_or_relaxed() - relaxed atomic fetch-and-OR
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v | @i) in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_or_relaxed() is
 * the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_or_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_or_relaxed
        return arch_atomic_fetch_or_relaxed(i, v);
#elif defined(arch_atomic_fetch_or)
        /* No relaxed form is defined: use the plain op instead. */
        return arch_atomic_fetch_or(i, v);
#else
#error "Unable to define raw_atomic_fetch_or_relaxed"
#endif
}

/**
 * raw_atomic_xor() - atomic bitwise XOR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_xor() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_xor(int i, atomic_t *v)
{
        /* No fallback path here: arch_atomic_xor() is called unconditionally. */
        arch_atomic_xor(i, v);
}

/**
 * raw_atomic_fetch_xor() - fully ordered atomic fetch-and-XOR
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v ^ @i) in one atomic step, with full ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_xor() is the
 * preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_xor(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_xor
        return arch_atomic_fetch_xor(i, v);
#elif defined(arch_atomic_fetch_xor_relaxed)
        int old;

        /* Only the relaxed op is defined: bracket it with the full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic_fetch_xor_relaxed(i, v);
        __atomic_post_full_fence();

        return old;
#else
#error "Unable to define raw_atomic_fetch_xor"
#endif
}

/**
 * raw_atomic_fetch_xor_acquire() - acquire-ordered atomic fetch-and-XOR
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v ^ @i) in one atomic step, with acquire ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_xor_acquire() is
 * the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_xor_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_xor_acquire
        return arch_atomic_fetch_xor_acquire(i, v);
#elif defined(arch_atomic_fetch_xor_relaxed)
        int old;

        /* Relaxed op first, then the acquire fence. */
        old = arch_atomic_fetch_xor_relaxed(i, v);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic_fetch_xor)
        /* Neither acquire nor relaxed form is defined: use the plain op. */
        return arch_atomic_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic_fetch_xor_acquire"
#endif
}

/**
 * raw_atomic_fetch_xor_release() - release-ordered atomic fetch-and-XOR
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v ^ @i) in one atomic step, with release ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_xor_release() is
 * the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_xor_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_xor_release
        return arch_atomic_fetch_xor_release(i, v);
#elif defined(arch_atomic_fetch_xor_relaxed)
        /* The release fence goes ahead of the relaxed op. */
        __atomic_release_fence();
        return arch_atomic_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic_fetch_xor)
        /* Neither release nor relaxed form is defined: use the plain op. */
        return arch_atomic_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic_fetch_xor_release"
#endif
}

/**
 * raw_atomic_fetch_xor_relaxed() - relaxed atomic fetch-and-XOR
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Replaces @v with (@v ^ @i) in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_fetch_xor_relaxed() is
 * the preferred interface.
 *
 * Return: The value @v held before the update.
 */
static __always_inline int
raw_atomic_fetch_xor_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_xor_relaxed
        return arch_atomic_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic_fetch_xor)
        /* No relaxed form is defined: use the plain op instead. */
        return arch_atomic_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic_fetch_xor_relaxed"
#endif
}

/**
 * raw_atomic_xchg() - fully ordered atomic exchange
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Stores @new into @v in one atomic step, with full ordering.
 *
 * May be called from noinstr code; elsewhere atomic_xchg() is the preferred
 * interface.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline int
raw_atomic_xchg(atomic_t *v, int new)
{
#ifdef arch_atomic_xchg
        return arch_atomic_xchg(v, new);
#elif defined(arch_atomic_xchg_relaxed)
        int old;

        /* Only the relaxed op is defined: bracket it with the full fences. */
        __atomic_pre_full_fence();
        old = arch_atomic_xchg_relaxed(v, new);
        __atomic_post_full_fence();

        return old;
#else
        /* No atomic_t-specific op: exchange the counter member directly. */
        return raw_xchg(&v->counter, new);
#endif
}

/**
 * raw_atomic_xchg_acquire() - acquire-ordered atomic exchange
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Stores @new into @v in one atomic step, with acquire ordering.
 *
 * May be called from noinstr code; elsewhere atomic_xchg_acquire() is the
 * preferred interface.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline int
raw_atomic_xchg_acquire(atomic_t *v, int new)
{
#ifdef arch_atomic_xchg_acquire
        return arch_atomic_xchg_acquire(v, new);
#elif defined(arch_atomic_xchg_relaxed)
        int old;

        /* Relaxed op first, then the acquire fence. */
        old = arch_atomic_xchg_relaxed(v, new);
        __atomic_acquire_fence();

        return old;
#elif defined(arch_atomic_xchg)
        /* Neither acquire nor relaxed form is defined: use the plain op. */
        return arch_atomic_xchg(v, new);
#else
        /* No atomic_t-specific op: exchange the counter member directly. */
        return raw_xchg_acquire(&v->counter, new);
#endif
}

/**
 * raw_atomic_xchg_release() - release-ordered atomic exchange
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Stores @new into @v in one atomic step, with release ordering.
 *
 * May be called from noinstr code; elsewhere atomic_xchg_release() is the
 * preferred interface.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline int
raw_atomic_xchg_release(atomic_t *v, int new)
{
#ifdef arch_atomic_xchg_release
        return arch_atomic_xchg_release(v, new);
#elif defined(arch_atomic_xchg_relaxed)
        /* The release fence goes ahead of the relaxed op. */
        __atomic_release_fence();
        return arch_atomic_xchg_relaxed(v, new);
#elif defined(arch_atomic_xchg)
        /* Neither release nor relaxed form is defined: use the plain op. */
        return arch_atomic_xchg(v, new);
#else
        /* No atomic_t-specific op: exchange the counter member directly. */
        return raw_xchg_release(&v->counter, new);
#endif
}

/**
 * raw_atomic_xchg_relaxed() - relaxed atomic exchange
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Stores @new into @v in one atomic step, with relaxed ordering.
 *
 * May be called from noinstr code; elsewhere atomic_xchg_relaxed() is the
 * preferred interface.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline int
raw_atomic_xchg_relaxed(atomic_t *v, int new)
{
#ifdef arch_atomic_xchg_relaxed
        return arch_atomic_xchg_relaxed(v, new);
#elif defined(arch_atomic_xchg)
        /* No relaxed form is defined: use the plain op instead. */
        return arch_atomic_xchg(v, new);
#else
        /* No atomic_t-specific op: exchange the counter member directly. */
        return raw_xchg_relaxed(&v->counter, new);
#endif
}

/**
 * raw_atomic_cmpxchg() - fully ordered atomic compare-and-exchange
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * When @v equals @old, @v is atomically set to @new with full ordering.
 * When it does not, @v is left untouched and only relaxed ordering is
 * provided.
 *
 * May be called from noinstr code; elsewhere atomic_cmpxchg() is the
 * preferred interface.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
raw_atomic_cmpxchg(atomic_t *v, int old, int new)
{
#ifdef arch_atomic_cmpxchg
        return arch_atomic_cmpxchg(v, old, new);
#elif defined(arch_atomic_cmpxchg_relaxed)
        int prev;

        /* Only the relaxed op is defined: bracket it with the full fences. */
        __atomic_pre_full_fence();
        prev = arch_atomic_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();

        return prev;
#else
        /* No atomic_t-specific op: operate on the counter member directly. */
        return raw_cmpxchg(&v->counter, old, new);
#endif
}

/**
 * raw_atomic_cmpxchg_acquire() - acquire-ordered atomic compare-and-exchange
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * When @v equals @old, @v is atomically set to @new with acquire ordering.
 * When it does not, @v is left untouched and only relaxed ordering is
 * provided.
 *
 * May be called from noinstr code; elsewhere atomic_cmpxchg_acquire() is the
 * preferred interface.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
raw_atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
{
#ifdef arch_atomic_cmpxchg_acquire
        return arch_atomic_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic_cmpxchg_relaxed)
        int prev;

        /* Relaxed op first, then the acquire fence. */
        prev = arch_atomic_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_cmpxchg)
        /* Neither acquire nor relaxed form is defined: use the plain op. */
        return arch_atomic_cmpxchg(v, old, new);
#else
        /* No atomic_t-specific op: operate on the counter member directly. */
        return raw_cmpxchg_acquire(&v->counter, old, new);
#endif
}

/**
 * raw_atomic_cmpxchg_release() - release-ordered atomic compare-and-exchange
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * When @v equals @old, @v is atomically set to @new with release ordering.
 * When it does not, @v is left untouched and only relaxed ordering is
 * provided.
 *
 * May be called from noinstr code; elsewhere atomic_cmpxchg_release() is the
 * preferred interface.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
raw_atomic_cmpxchg_release(atomic_t *v, int old, int new)
{
#ifdef arch_atomic_cmpxchg_release
        return arch_atomic_cmpxchg_release(v, old, new);
#elif defined(arch_atomic_cmpxchg_relaxed)
        /* The release fence goes ahead of the relaxed op. */
        __atomic_release_fence();
        return arch_atomic_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic_cmpxchg)
        /* Neither release nor relaxed form is defined: use the plain op. */
        return arch_atomic_cmpxchg(v, old, new);
#else
        /* No atomic_t-specific op: operate on the counter member directly. */
        return raw_cmpxchg_release(&v->counter, old, new);
#endif
}

/**
 * raw_atomic_cmpxchg_relaxed() - relaxed atomic compare-and-exchange
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * When @v equals @old, @v is atomically set to @new with relaxed ordering.
 * When it does not, @v is left untouched and relaxed ordering is provided.
 *
 * May be called from noinstr code; elsewhere atomic_cmpxchg_relaxed() is the
 * preferred interface.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline int
raw_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
{
#ifdef arch_atomic_cmpxchg_relaxed
        return arch_atomic_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic_cmpxchg)
        /* No relaxed form is defined: use the plain op instead. */
        return arch_atomic_cmpxchg(v, old, new);
#else
        /* No atomic_t-specific op: operate on the counter member directly. */
        return raw_cmpxchg_relaxed(&v->counter, old, new);
#endif
}

/**
 * raw_atomic_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg
        return arch_atomic_try_cmpxchg(v, old, new);
#elif defined(arch_atomic_try_cmpxchg_relaxed)
        bool swapped;

        /* Bracket the relaxed operation with the pre/post full fences. */
        __atomic_pre_full_fence();
        swapped = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();
        return swapped;
#else
        /* Emulate try_cmpxchg on top of the value-returning cmpxchg. */
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg(v, expected, new);

        if (likely(seen == expected))
                return true;

        /* Mismatch: hand the value actually found back through @old. */
        *old = seen;
        return false;
#endif
}

/**
 * raw_atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg_acquire() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg_acquire
        return arch_atomic_try_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic_try_cmpxchg_relaxed)
        bool swapped;

        /* Relaxed operation first, acquire fence afterwards. */
        swapped = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();
        return swapped;
#elif defined(arch_atomic_try_cmpxchg)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic_try_cmpxchg(v, old, new);
#else
        /* Emulate try_cmpxchg on top of the value-returning cmpxchg. */
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg_acquire(v, expected, new);

        if (likely(seen == expected))
                return true;

        /* Mismatch: hand the value actually found back through @old. */
        *old = seen;
        return false;
#endif
}

/**
 * raw_atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg_release() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg_release
        return arch_atomic_try_cmpxchg_release(v, old, new);
#elif defined(arch_atomic_try_cmpxchg_relaxed)
        bool swapped;

        /* Release fence first, then the relaxed operation. */
        __atomic_release_fence();
        swapped = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        return swapped;
#elif defined(arch_atomic_try_cmpxchg)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic_try_cmpxchg(v, old, new);
#else
        /* Emulate try_cmpxchg on top of the value-returning cmpxchg. */
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg_release(v, expected, new);

        if (likely(seen == expected))
                return true;

        /* Mismatch: hand the value actually found back through @old. */
        *old = seen;
        return false;
#endif
}

/**
 * raw_atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg_relaxed() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg_relaxed
        return arch_atomic_try_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic_try_cmpxchg)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic_try_cmpxchg(v, old, new);
#else
        /* Emulate try_cmpxchg on top of the value-returning cmpxchg. */
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg_relaxed(v, expected, new);

        if (likely(seen == expected))
                return true;

        /* Mismatch: hand the value actually found back through @old. */
        *old = seen;
        return false;
#endif
}

/**
 * raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_sub_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_sub_and_test(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_and_test
        return arch_atomic_sub_and_test(i, v);
#else
        /* Zero test on the value produced by the subtraction. */
        return !raw_atomic_sub_return(i, v);
#endif
}

/**
 * raw_atomic_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_dec_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_dec_and_test(atomic_t *v)
{
#ifdef arch_atomic_dec_and_test
        return arch_atomic_dec_and_test(v);
#else
        /* Zero test on the value produced by the decrement. */
        return !raw_atomic_dec_return(v);
#endif
}

/**
 * raw_atomic_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_inc_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_inc_and_test(atomic_t *v)
{
#ifdef arch_atomic_inc_and_test
        return arch_atomic_inc_and_test(v);
#else
        /* Zero test on the value produced by the increment. */
        return !raw_atomic_inc_return(v);
#endif
}

/**
 * raw_atomic_add_negative() - atomic add and test if negative with full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative
        return arch_atomic_add_negative(i, v);
#elif defined(arch_atomic_add_negative_relaxed)
        bool negative;

        /* Bracket the relaxed operation with the pre/post full fences. */
        __atomic_pre_full_fence();
        negative = arch_atomic_add_negative_relaxed(i, v);
        __atomic_post_full_fence();
        return negative;
#else
        /* Sign test on the value produced by the addition. */
        return raw_atomic_add_return(i, v) < 0;
#endif
}

/**
 * raw_atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative_acquire() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative_acquire
        return arch_atomic_add_negative_acquire(i, v);
#elif defined(arch_atomic_add_negative_relaxed)
        bool negative;

        /* Relaxed operation first, acquire fence afterwards. */
        negative = arch_atomic_add_negative_relaxed(i, v);
        __atomic_acquire_fence();
        return negative;
#elif defined(arch_atomic_add_negative)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic_add_negative(i, v);
#else
        /* Sign test on the value produced by the acquire addition. */
        return raw_atomic_add_return_acquire(i, v) < 0;
#endif
}

/**
 * raw_atomic_add_negative_release() - atomic add and test if negative with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative_release() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative_release(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative_release
        return arch_atomic_add_negative_release(i, v);
#elif defined(arch_atomic_add_negative_relaxed)
        bool negative;

        /* Release fence first, then the relaxed operation. */
        __atomic_release_fence();
        negative = arch_atomic_add_negative_relaxed(i, v);
        return negative;
#elif defined(arch_atomic_add_negative)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic_add_negative(i, v);
#else
        /* Sign test on the value produced by the release addition. */
        return raw_atomic_add_return_release(i, v) < 0;
#endif
}

/**
 * raw_atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative_relaxed() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative_relaxed
        return arch_atomic_add_negative_relaxed(i, v);
#elif defined(arch_atomic_add_negative)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic_add_negative(i, v);
#else
        /* Sign test on the value produced by the relaxed addition. */
        return raw_atomic_add_return_relaxed(i, v) < 0;
#endif
}

/**
 * raw_atomic_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_add_unless() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
#ifdef arch_atomic_fetch_add_unless
        return arch_atomic_fetch_add_unless(v, a, u);
#else
        int cur = raw_atomic_read(v);

        /*
         * Retry until the add lands or the excluded value @u is seen.
         * A failed try_cmpxchg refreshes cur with the value it found.
         */
        for (;;) {
                if (unlikely(cur == u))
                        break;
                if (raw_atomic_try_cmpxchg(v, &cur, cur + a))
                        break;
        }

        return cur;
#endif
}

/**
 * raw_atomic_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_add_unless() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_unless(atomic_t *v, int a, int u)
{
#ifdef arch_atomic_add_unless
        return arch_atomic_add_unless(v, a, u);
#else
        /* The add happened exactly when the fetched value was not @u. */
        int prev = raw_atomic_fetch_add_unless(v, a, u);

        return prev != u;
#endif
}

/**
 * raw_atomic_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_inc_not_zero() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_inc_not_zero(atomic_t *v)
{
#ifdef arch_atomic_inc_not_zero
        return arch_atomic_inc_not_zero(v);
#else
        /* Expressed as "add 1 unless the value is 0". */
        return raw_atomic_add_unless(v, 1, 0);
#endif
}

/**
 * raw_atomic_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_inc_unless_negative() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_inc_unless_negative(atomic_t *v)
{
#ifdef arch_atomic_inc_unless_negative
        return arch_atomic_inc_unless_negative(v);
#else
        int cur = raw_atomic_read(v);

        /* A failed try_cmpxchg refreshes cur, so each pass re-tests it. */
        for (;;) {
                if (unlikely(cur < 0))
                        return false;
                if (raw_atomic_try_cmpxchg(v, &cur, cur + 1))
                        return true;
        }
#endif
}

/**
 * raw_atomic_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_dec_unless_positive() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_dec_unless_positive(atomic_t *v)
{
#ifdef arch_atomic_dec_unless_positive
        return arch_atomic_dec_unless_positive(v);
#else
        int cur = raw_atomic_read(v);

        /* A failed try_cmpxchg refreshes cur, so each pass re-tests it. */
        for (;;) {
                if (unlikely(cur > 0))
                        return false;
                if (raw_atomic_try_cmpxchg(v, &cur, cur - 1))
                        return true;
        }
#endif
}

/**
 * raw_atomic_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_dec_if_positive() elsewhere.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline int
raw_atomic_dec_if_positive(atomic_t *v)
{
#ifdef arch_atomic_dec_if_positive
        return arch_atomic_dec_if_positive(v);
#else
        int cur = raw_atomic_read(v);
        int next;

        /*
         * Stop without storing when the decremented value would be
         * negative; otherwise retry until the store succeeds. Either
         * way the candidate value (cur - 1) is what gets returned.
         */
        for (;;) {
                next = cur - 1;
                if (unlikely(next < 0))
                        break;
                if (raw_atomic_try_cmpxchg(v, &cur, next))
                        break;
        }

        return next;
#endif
}

#ifdef CONFIG_GENERIC_ATOMIC64
#include <asm-generic/atomic64.h>
#endif

/**
 * raw_atomic64_read() - atomic load with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_read() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
raw_atomic64_read(const atomic64_t *v)
{
        /* Unconditional pass-through to arch_atomic64_read(); no fallback is selected here. */
        return arch_atomic64_read(v);
}

/**
 * raw_atomic64_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_read_acquire() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
raw_atomic64_read_acquire(const atomic64_t *v)
{
#ifdef arch_atomic64_read_acquire
        return arch_atomic64_read_acquire(v);
#else
        s64 val;

        /* Native-word case: a single load-acquire on the counter field. */
        if (__native_word(atomic64_t)) {
                val = smp_load_acquire(&(v)->counter);
                return val;
        }

        /* Otherwise: relaxed read, followed by an acquire fence. */
        val = raw_atomic64_read(v);
        __atomic_acquire_fence();
        return val;
#endif
}

/**
 * raw_atomic64_set() - atomic set with relaxed ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_set() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_set(atomic64_t *v, s64 i)
{
        /* Unconditional pass-through to arch_atomic64_set(); no fallback is selected here. */
        arch_atomic64_set(v, i);
}

/**
 * raw_atomic64_set_release() - atomic set with release ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_set_release() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_set_release(atomic64_t *v, s64 i)
{
#ifdef arch_atomic64_set_release
        arch_atomic64_set_release(v, i);
#else
        /* Native-word case: a single store-release on the counter field. */
        if (__native_word(atomic64_t)) {
                smp_store_release(&(v)->counter, i);
                return;
        }

        /* Otherwise: release fence first, then the relaxed set. */
        __atomic_release_fence();
        raw_atomic64_set(v, i);
#endif
}

/**
 * raw_atomic64_add() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_add(s64 i, atomic64_t *v)
{
        /* Unconditional pass-through to arch_atomic64_add(); no fallback is selected here. */
        arch_atomic64_add(i, v);
}

/**
 * raw_atomic64_add_return() - atomic add with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_return
        return arch_atomic64_add_return(i, v);
#elif defined(arch_atomic64_add_return_relaxed)
        s64 sum;

        /* Bracket the relaxed operation with the pre/post full fences. */
        __atomic_pre_full_fence();
        sum = arch_atomic64_add_return_relaxed(i, v);
        __atomic_post_full_fence();
        return sum;
#else
        /* Neither arch form exists: fail the build. */
#error "Unable to define raw_atomic64_add_return"
#endif
}

/**
 * raw_atomic64_add_return_acquire() - atomic add with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_return_acquire
        return arch_atomic64_add_return_acquire(i, v);
#elif defined(arch_atomic64_add_return_relaxed)
        s64 sum;

        /* Relaxed operation first, acquire fence afterwards. */
        sum = arch_atomic64_add_return_relaxed(i, v);
        __atomic_acquire_fence();
        return sum;
#elif defined(arch_atomic64_add_return)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_add_return(i, v);
#else
        /* No arch form exists: fail the build. */
#error "Unable to define raw_atomic64_add_return_acquire"
#endif
}

/**
 * raw_atomic64_add_return_release() - atomic add with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_return_release
        return arch_atomic64_add_return_release(i, v);
#elif defined(arch_atomic64_add_return_relaxed)
        s64 sum;

        /* Release fence first, then the relaxed operation. */
        __atomic_release_fence();
        sum = arch_atomic64_add_return_relaxed(i, v);
        return sum;
#elif defined(arch_atomic64_add_return)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_add_return(i, v);
#else
        /* No arch form exists: fail the build. */
#error "Unable to define raw_atomic64_add_return_release"
#endif
}

/**
 * raw_atomic64_add_return_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_return_relaxed
        return arch_atomic64_add_return_relaxed(i, v);
#elif defined(arch_atomic64_add_return)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_add_return(i, v);
#else
        /* No arch form exists: fail the build. */
#error "Unable to define raw_atomic64_add_return_relaxed"
#endif
}

/**
 * raw_atomic64_fetch_add() - atomic add with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_add
        return arch_atomic64_fetch_add(i, v);
#elif defined(arch_atomic64_fetch_add_relaxed)
        s64 prev;

        /* Bracket the relaxed operation with the pre/post full fences. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_add_relaxed(i, v);
        __atomic_post_full_fence();
        return prev;
#else
        /* Neither arch form exists: fail the build. */
#error "Unable to define raw_atomic64_fetch_add"
#endif
}

/**
 * raw_atomic64_fetch_add_acquire() - atomic add with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_add_acquire
        return arch_atomic64_fetch_add_acquire(i, v);
#elif defined(arch_atomic64_fetch_add_relaxed)
        s64 prev;

        /* Relaxed operation first, acquire fence afterwards. */
        prev = arch_atomic64_fetch_add_relaxed(i, v);
        __atomic_acquire_fence();
        return prev;
#elif defined(arch_atomic64_fetch_add)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_fetch_add(i, v);
#else
        /* No arch form exists: fail the build. */
#error "Unable to define raw_atomic64_fetch_add_acquire"
#endif
}

/**
 * raw_atomic64_fetch_add_release() - atomic add with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_add_release
        return arch_atomic64_fetch_add_release(i, v);
#elif defined(arch_atomic64_fetch_add_relaxed)
        s64 prev;

        /* Release fence first, then the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_add_relaxed(i, v);
        return prev;
#elif defined(arch_atomic64_fetch_add)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_fetch_add(i, v);
#else
        /* No arch form exists: fail the build. */
#error "Unable to define raw_atomic64_fetch_add_release"
#endif
}

/**
 * raw_atomic64_fetch_add_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_add_relaxed
        return arch_atomic64_fetch_add_relaxed(i, v);
#elif defined(arch_atomic64_fetch_add)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_fetch_add(i, v);
#else
        /* No arch form exists: fail the build. */
#error "Unable to define raw_atomic64_fetch_add_relaxed"
#endif
}

/**
 * raw_atomic64_sub() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_sub() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_sub(s64 i, atomic64_t *v)
{
        /* Unconditional pass-through to arch_atomic64_sub(); no fallback is selected here. */
        arch_atomic64_sub(i, v);
}

/**
 * raw_atomic64_sub_return() - atomic subtract with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_sub_return(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_sub_return
        return arch_atomic64_sub_return(i, v);
#elif defined(arch_atomic64_sub_return_relaxed)
        s64 diff;

        /* Bracket the relaxed operation with the pre/post full fences. */
        __atomic_pre_full_fence();
        diff = arch_atomic64_sub_return_relaxed(i, v);
        __atomic_post_full_fence();
        return diff;
#else
        /* Neither arch form exists: fail the build. */
#error "Unable to define raw_atomic64_sub_return"
#endif
}

/**
 * raw_atomic64_sub_return_acquire() - atomic subtract with acquire ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_sub_return_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_sub_return_acquire
        return arch_atomic64_sub_return_acquire(i, v);
#elif defined(arch_atomic64_sub_return_relaxed)
        s64 diff;

        /* Relaxed operation first, acquire fence afterwards. */
        diff = arch_atomic64_sub_return_relaxed(i, v);
        __atomic_acquire_fence();
        return diff;
#elif defined(arch_atomic64_sub_return)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_sub_return(i, v);
#else
        /* No arch form exists: fail the build. */
#error "Unable to define raw_atomic64_sub_return_acquire"
#endif
}

/**
 * raw_atomic64_sub_return_release() - atomic subtract with release ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_sub_return_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_sub_return_release
        return arch_atomic64_sub_return_release(i, v);
#elif defined(arch_atomic64_sub_return_relaxed)
        s64 diff;

        /* Release fence first, then the relaxed operation. */
        __atomic_release_fence();
        diff = arch_atomic64_sub_return_relaxed(i, v);
        return diff;
#elif defined(arch_atomic64_sub_return)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_sub_return(i, v);
#else
        /* No arch form exists: fail the build. */
#error "Unable to define raw_atomic64_sub_return_release"
#endif
}

/**
 * raw_atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_sub_return_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_sub_return_relaxed
        return arch_atomic64_sub_return_relaxed(i, v);
#elif defined(arch_atomic64_sub_return)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_sub_return(i, v);
#else
        /* No arch form exists: fail the build. */
#error "Unable to define raw_atomic64_sub_return_relaxed"
#endif
}

/**
 * raw_atomic64_fetch_sub() - atomic subtract with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_sub() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_sub(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_sub
        return arch_atomic64_fetch_sub(i, v);
#elif defined(arch_atomic64_fetch_sub_relaxed)
        s64 prev;

        /* Bracket the relaxed operation with the pre/post full fences. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_sub_relaxed(i, v);
        __atomic_post_full_fence();
        return prev;
#else
        /* Neither arch form exists: fail the build. */
#error "Unable to define raw_atomic64_fetch_sub"
#endif
}

/**
 * raw_atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_sub_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_sub_acquire
        return arch_atomic64_fetch_sub_acquire(i, v);
#elif defined(arch_atomic64_fetch_sub_relaxed)
        s64 prev;

        /* Relaxed operation first, acquire fence afterwards. */
        prev = arch_atomic64_fetch_sub_relaxed(i, v);
        __atomic_acquire_fence();
        return prev;
#elif defined(arch_atomic64_fetch_sub)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_fetch_sub(i, v);
#else
        /* No arch form exists: fail the build. */
#error "Unable to define raw_atomic64_fetch_sub_acquire"
#endif
}

/**
 * raw_atomic64_fetch_sub_release() - atomic subtract with release ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_sub_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_sub_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_sub_release
        return arch_atomic64_fetch_sub_release(i, v);
#elif defined(arch_atomic64_fetch_sub_relaxed)
        s64 prev;

        /* Release fence first, then the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_sub_relaxed(i, v);
        return prev;
#elif defined(arch_atomic64_fetch_sub)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_fetch_sub(i, v);
#else
        /* No arch form exists: fail the build. */
#error "Unable to define raw_atomic64_fetch_sub_release"
#endif
}

/**
 * raw_atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_sub_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_sub_relaxed
        return arch_atomic64_fetch_sub_relaxed(i, v);
#elif defined(arch_atomic64_fetch_sub)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_fetch_sub(i, v);
#else
        /* No arch form exists: fail the build. */
#error "Unable to define raw_atomic64_fetch_sub_relaxed"
#endif
}

/**
 * raw_atomic64_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_inc() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_inc(atomic64_t *v)
{
#ifdef arch_atomic64_inc
        /* The architecture supplies a dedicated increment. */
        arch_atomic64_inc(v);
#else
        /* Expressed as an add of 1. */
        raw_atomic64_add(1, v);
#endif
}

/**
 * raw_atomic64_inc_return() - atomic increment with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_inc_return(atomic64_t *v)
{
#ifdef arch_atomic64_inc_return
        return arch_atomic64_inc_return(v);
#elif defined(arch_atomic64_inc_return_relaxed)
        s64 val;

        /* Bracket the relaxed operation with the pre/post full fences. */
        __atomic_pre_full_fence();
        val = arch_atomic64_inc_return_relaxed(v);
        __atomic_post_full_fence();
        return val;
#else
        /* Expressed as add_return of 1. */
        return raw_atomic64_add_return(1, v);
#endif
}

/**
 * raw_atomic64_inc_return_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_inc_return_acquire(atomic64_t *v)
{
#ifdef arch_atomic64_inc_return_acquire
        return arch_atomic64_inc_return_acquire(v);
#elif defined(arch_atomic64_inc_return_relaxed)
        s64 val;

        /* Relaxed operation first, acquire fence afterwards. */
        val = arch_atomic64_inc_return_relaxed(v);
        __atomic_acquire_fence();
        return val;
#elif defined(arch_atomic64_inc_return)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_inc_return(v);
#else
        /* Expressed as the acquire add_return of 1. */
        return raw_atomic64_add_return_acquire(1, v);
#endif
}

/**
 * raw_atomic64_inc_return_release() - atomic increment with release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_inc_return_release(atomic64_t *v)
{
#ifdef arch_atomic64_inc_return_release
        return arch_atomic64_inc_return_release(v);
#elif defined(arch_atomic64_inc_return_relaxed)
        s64 val;

        /* Release fence first, then the relaxed operation. */
        __atomic_release_fence();
        val = arch_atomic64_inc_return_relaxed(v);
        return val;
#elif defined(arch_atomic64_inc_return)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_inc_return(v);
#else
        /* Expressed as the release add_return of 1. */
        return raw_atomic64_add_return_release(1, v);
#endif
}

/**
 * raw_atomic64_inc_return_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_inc_return_relaxed(atomic64_t *v)
{
#ifdef arch_atomic64_inc_return_relaxed
        return arch_atomic64_inc_return_relaxed(v);
#elif defined(arch_atomic64_inc_return)
        /* Only the unsuffixed arch operation is available. */
        return arch_atomic64_inc_return(v);
#else
        /* Expressed as the relaxed add_return of 1. */
        return raw_atomic64_add_return_relaxed(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc() - atomic fetch-and-increment, full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + 1) with full ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_inc().
 *
 * Return: The value @v held before the increment.
 */
static __always_inline s64
raw_atomic64_fetch_inc(atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_inc
        prev = arch_atomic64_fetch_inc(v);
#elif defined(arch_atomic64_fetch_inc_relaxed)
        /* Bracket the relaxed op with the pre/post full-fence helpers. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_inc_relaxed(v);
        __atomic_post_full_fence();
#else
        /* No arch increment at all: fetch-and-add the constant 1. */
        prev = raw_atomic64_fetch_add(1, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_inc_acquire() - atomic fetch-and-increment, acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + 1) with acquire ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_inc_acquire().
 *
 * Return: The value @v held before the increment.
 */
static __always_inline s64
raw_atomic64_fetch_inc_acquire(atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_inc_acquire
        prev = arch_atomic64_fetch_inc_acquire(v);
#elif defined(arch_atomic64_fetch_inc_relaxed)
        /* Relaxed op first, then the acquire fence. */
        prev = arch_atomic64_fetch_inc_relaxed(v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_fetch_inc)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_inc(v);
#else
        /* No arch increment at all: fetch-and-add the constant 1. */
        prev = raw_atomic64_fetch_add_acquire(1, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_inc_release() - atomic fetch-and-increment, release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + 1) with release ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_inc_release().
 *
 * Return: The value @v held before the increment.
 */
static __always_inline s64
raw_atomic64_fetch_inc_release(atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_inc_release
        prev = arch_atomic64_fetch_inc_release(v);
#elif defined(arch_atomic64_fetch_inc_relaxed)
        /* Release fence first, then the relaxed op. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_inc_relaxed(v);
#elif defined(arch_atomic64_fetch_inc)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_inc(v);
#else
        /* No arch increment at all: fetch-and-add the constant 1. */
        prev = raw_atomic64_fetch_add_release(1, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_inc_relaxed() - atomic fetch-and-increment, relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v + 1) with relaxed ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_inc_relaxed().
 *
 * Return: The value @v held before the increment.
 */
static __always_inline s64
raw_atomic64_fetch_inc_relaxed(atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_inc_relaxed
        prev = arch_atomic64_fetch_inc_relaxed(v);
#elif defined(arch_atomic64_fetch_inc)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_inc(v);
#else
        /* No arch increment at all: fetch-and-add the constant 1. */
        prev = raw_atomic64_fetch_add_relaxed(1, v);
#endif
        return prev;
}

/**
 * raw_atomic64_dec() - atomic decrement without a return value, relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with relaxed ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_dec().
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_dec(atomic64_t *v)
{
#ifndef arch_atomic64_dec
        /* No dedicated arch decrement: subtract the constant 1 instead. */
        raw_atomic64_sub(1, v);
#else
        arch_atomic64_dec(v);
#endif
}

/**
 * raw_atomic64_dec_return() - atomic decrement-and-return, full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with full ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_dec_return().
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline s64
raw_atomic64_dec_return(atomic64_t *v)
{
        s64 updated;

#ifdef arch_atomic64_dec_return
        updated = arch_atomic64_dec_return(v);
#elif defined(arch_atomic64_dec_return_relaxed)
        /* Bracket the relaxed op with the pre/post full-fence helpers. */
        __atomic_pre_full_fence();
        updated = arch_atomic64_dec_return_relaxed(v);
        __atomic_post_full_fence();
#else
        /* No arch decrement at all: subtract the constant 1 instead. */
        updated = raw_atomic64_sub_return(1, v);
#endif
        return updated;
}

/**
 * raw_atomic64_dec_return_acquire() - atomic decrement-and-return, acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with acquire ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_dec_return_acquire().
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline s64
raw_atomic64_dec_return_acquire(atomic64_t *v)
{
        s64 updated;

#ifdef arch_atomic64_dec_return_acquire
        updated = arch_atomic64_dec_return_acquire(v);
#elif defined(arch_atomic64_dec_return_relaxed)
        /* Relaxed op first, then the acquire fence. */
        updated = arch_atomic64_dec_return_relaxed(v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_dec_return)
        /* Only the unsuffixed arch op is defined; use it. */
        updated = arch_atomic64_dec_return(v);
#else
        /* No arch decrement at all: subtract the constant 1 instead. */
        updated = raw_atomic64_sub_return_acquire(1, v);
#endif
        return updated;
}

/**
 * raw_atomic64_dec_return_release() - atomic decrement-and-return, release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with release ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_dec_return_release().
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline s64
raw_atomic64_dec_return_release(atomic64_t *v)
{
        s64 updated;

#ifdef arch_atomic64_dec_return_release
        updated = arch_atomic64_dec_return_release(v);
#elif defined(arch_atomic64_dec_return_relaxed)
        /* Release fence first, then the relaxed op. */
        __atomic_release_fence();
        updated = arch_atomic64_dec_return_relaxed(v);
#elif defined(arch_atomic64_dec_return)
        /* Only the unsuffixed arch op is defined; use it. */
        updated = arch_atomic64_dec_return(v);
#else
        /* No arch decrement at all: subtract the constant 1 instead. */
        updated = raw_atomic64_sub_return_release(1, v);
#endif
        return updated;
}

/**
 * raw_atomic64_dec_return_relaxed() - atomic decrement-and-return, relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with relaxed ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_dec_return_relaxed().
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline s64
raw_atomic64_dec_return_relaxed(atomic64_t *v)
{
        s64 updated;

#ifdef arch_atomic64_dec_return_relaxed
        updated = arch_atomic64_dec_return_relaxed(v);
#elif defined(arch_atomic64_dec_return)
        /* Only the unsuffixed arch op is defined; use it. */
        updated = arch_atomic64_dec_return(v);
#else
        /* No arch decrement at all: subtract the constant 1 instead. */
        updated = raw_atomic64_sub_return_relaxed(1, v);
#endif
        return updated;
}

/**
 * raw_atomic64_fetch_dec() - atomic fetch-and-decrement, full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with full ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_dec().
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline s64
raw_atomic64_fetch_dec(atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_dec
        prev = arch_atomic64_fetch_dec(v);
#elif defined(arch_atomic64_fetch_dec_relaxed)
        /* Bracket the relaxed op with the pre/post full-fence helpers. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_dec_relaxed(v);
        __atomic_post_full_fence();
#else
        /* No arch decrement at all: fetch-and-subtract the constant 1. */
        prev = raw_atomic64_fetch_sub(1, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_dec_acquire() - atomic fetch-and-decrement, acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with acquire ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_dec_acquire().
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline s64
raw_atomic64_fetch_dec_acquire(atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_dec_acquire
        prev = arch_atomic64_fetch_dec_acquire(v);
#elif defined(arch_atomic64_fetch_dec_relaxed)
        /* Relaxed op first, then the acquire fence. */
        prev = arch_atomic64_fetch_dec_relaxed(v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_fetch_dec)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_dec(v);
#else
        /* No arch decrement at all: fetch-and-subtract the constant 1. */
        prev = raw_atomic64_fetch_sub_acquire(1, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_dec_release() - atomic fetch-and-decrement, release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with release ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_dec_release().
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline s64
raw_atomic64_fetch_dec_release(atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_dec_release
        prev = arch_atomic64_fetch_dec_release(v);
#elif defined(arch_atomic64_fetch_dec_relaxed)
        /* Release fence first, then the relaxed op. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_dec_relaxed(v);
#elif defined(arch_atomic64_fetch_dec)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_dec(v);
#else
        /* No arch decrement at all: fetch-and-subtract the constant 1. */
        prev = raw_atomic64_fetch_sub_release(1, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_dec_relaxed() - atomic fetch-and-decrement, relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v - 1) with relaxed ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_dec_relaxed().
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline s64
raw_atomic64_fetch_dec_relaxed(atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_dec_relaxed
        prev = arch_atomic64_fetch_dec_relaxed(v);
#elif defined(arch_atomic64_fetch_dec)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_dec(v);
#else
        /* No arch decrement at all: fetch-and-subtract the constant 1. */
        prev = raw_atomic64_fetch_sub_relaxed(1, v);
#endif
        return prev;
}

/**
 * raw_atomic64_and() - atomic bitwise AND with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * There is no preprocessor fallback here: the call is forwarded
 * unconditionally to arch_atomic64_and().
 *
 * Safe to use in noinstr code; prefer atomic64_and() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_and(s64 i, atomic64_t *v)
{
        /* Direct pass-through; the previous value of @v is not reported. */
        arch_atomic64_and(i, v);
}

/**
 * raw_atomic64_fetch_and() - atomic fetch-and-AND, full ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v & @i) with full ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_and().
 *
 * Return: The value @v held before the AND.
 */
static __always_inline s64
raw_atomic64_fetch_and(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_and
        prev = arch_atomic64_fetch_and(i, v);
#elif defined(arch_atomic64_fetch_and_relaxed)
        /* Bracket the relaxed op with the pre/post full-fence helpers. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_and_relaxed(i, v);
        __atomic_post_full_fence();
#else
        /* No generic emulation exists for this op: fail the build. */
#error "Unable to define raw_atomic64_fetch_and"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_and_acquire() - atomic fetch-and-AND, acquire ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v & @i) with acquire ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_and_acquire().
 *
 * Return: The value @v held before the AND.
 */
static __always_inline s64
raw_atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_and_acquire
        prev = arch_atomic64_fetch_and_acquire(i, v);
#elif defined(arch_atomic64_fetch_and_relaxed)
        /* Relaxed op first, then the acquire fence. */
        prev = arch_atomic64_fetch_and_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_fetch_and)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_and(i, v);
#else
        /* No generic emulation exists for this op: fail the build. */
#error "Unable to define raw_atomic64_fetch_and_acquire"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_and_release() - atomic fetch-and-AND, release ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v & @i) with release ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_and_release().
 *
 * Return: The value @v held before the AND.
 */
static __always_inline s64
raw_atomic64_fetch_and_release(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_and_release
        prev = arch_atomic64_fetch_and_release(i, v);
#elif defined(arch_atomic64_fetch_and_relaxed)
        /* Release fence first, then the relaxed op. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_and_relaxed(i, v);
#elif defined(arch_atomic64_fetch_and)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_and(i, v);
#else
        /* No generic emulation exists for this op: fail the build. */
#error "Unable to define raw_atomic64_fetch_and_release"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_and_relaxed() - atomic fetch-and-AND, relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v & @i) with relaxed ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_and_relaxed().
 *
 * Return: The value @v held before the AND.
 */
static __always_inline s64
raw_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_and_relaxed
        prev = arch_atomic64_fetch_and_relaxed(i, v);
#elif defined(arch_atomic64_fetch_and)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_and(i, v);
#else
        /* No generic emulation exists for this op: fail the build. */
#error "Unable to define raw_atomic64_fetch_and_relaxed"
#endif
        return prev;
}

/**
 * raw_atomic64_andnot() - atomic bitwise AND NOT without a return value, relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v & ~@i) with relaxed ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_andnot().
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_andnot(s64 i, atomic64_t *v)
{
#ifndef arch_atomic64_andnot
        /* No dedicated arch op: AND with the complement of @i instead. */
        raw_atomic64_and(~i, v);
#else
        arch_atomic64_andnot(i, v);
#endif
}

/**
 * raw_atomic64_fetch_andnot() - atomic fetch-and-AND-NOT, full ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v & ~@i) with full ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_andnot().
 *
 * Return: The value @v held before the AND NOT.
 */
static __always_inline s64
raw_atomic64_fetch_andnot(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_andnot
        prev = arch_atomic64_fetch_andnot(i, v);
#elif defined(arch_atomic64_fetch_andnot_relaxed)
        /* Bracket the relaxed op with the pre/post full-fence helpers. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_andnot_relaxed(i, v);
        __atomic_post_full_fence();
#else
        /* No arch AND NOT at all: fetch-and-AND with the complement of @i. */
        prev = raw_atomic64_fetch_and(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_andnot_acquire() - atomic fetch-and-AND-NOT, acquire ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v & ~@i) with acquire ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_andnot_acquire().
 *
 * Return: The value @v held before the AND NOT.
 */
static __always_inline s64
raw_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_andnot_acquire
        prev = arch_atomic64_fetch_andnot_acquire(i, v);
#elif defined(arch_atomic64_fetch_andnot_relaxed)
        /* Relaxed op first, then the acquire fence. */
        prev = arch_atomic64_fetch_andnot_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_fetch_andnot)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_andnot(i, v);
#else
        /* No arch AND NOT at all: fetch-and-AND with the complement of @i. */
        prev = raw_atomic64_fetch_and_acquire(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_andnot_release() - atomic fetch-and-AND-NOT, release ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v & ~@i) with release ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_andnot_release().
 *
 * Return: The value @v held before the AND NOT.
 */
static __always_inline s64
raw_atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_andnot_release
        prev = arch_atomic64_fetch_andnot_release(i, v);
#elif defined(arch_atomic64_fetch_andnot_relaxed)
        /* Release fence first, then the relaxed op. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic64_fetch_andnot)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_andnot(i, v);
#else
        /* No arch AND NOT at all: fetch-and-AND with the complement of @i. */
        prev = raw_atomic64_fetch_and_release(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_andnot_relaxed() - atomic fetch-and-AND-NOT, relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v & ~@i) with relaxed ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_andnot_relaxed().
 *
 * Return: The value @v held before the AND NOT.
 */
static __always_inline s64
raw_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_andnot_relaxed
        prev = arch_atomic64_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic64_fetch_andnot)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_andnot(i, v);
#else
        /* No arch AND NOT at all: fetch-and-AND with the complement of @i. */
        prev = raw_atomic64_fetch_and_relaxed(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic64_or() - atomic bitwise OR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * There is no preprocessor fallback here: the call is forwarded
 * unconditionally to arch_atomic64_or().
 *
 * Safe to use in noinstr code; prefer atomic64_or() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_or(s64 i, atomic64_t *v)
{
        /* Direct pass-through; the previous value of @v is not reported. */
        arch_atomic64_or(i, v);
}

/**
 * raw_atomic64_fetch_or() - atomic fetch-and-OR, full ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v | @i) with full ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_or().
 *
 * Return: The value @v held before the OR.
 */
static __always_inline s64
raw_atomic64_fetch_or(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_or
        prev = arch_atomic64_fetch_or(i, v);
#elif defined(arch_atomic64_fetch_or_relaxed)
        /* Bracket the relaxed op with the pre/post full-fence helpers. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_or_relaxed(i, v);
        __atomic_post_full_fence();
#else
        /* No generic emulation exists for this op: fail the build. */
#error "Unable to define raw_atomic64_fetch_or"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_or_acquire() - atomic fetch-and-OR, acquire ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v | @i) with acquire ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_or_acquire().
 *
 * Return: The value @v held before the OR.
 */
static __always_inline s64
raw_atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_or_acquire
        prev = arch_atomic64_fetch_or_acquire(i, v);
#elif defined(arch_atomic64_fetch_or_relaxed)
        /* Relaxed op first, then the acquire fence. */
        prev = arch_atomic64_fetch_or_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_fetch_or)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_or(i, v);
#else
        /* No generic emulation exists for this op: fail the build. */
#error "Unable to define raw_atomic64_fetch_or_acquire"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_or_release() - atomic fetch-and-OR, release ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v | @i) with release ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_or_release().
 *
 * Return: The value @v held before the OR.
 */
static __always_inline s64
raw_atomic64_fetch_or_release(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_or_release
        prev = arch_atomic64_fetch_or_release(i, v);
#elif defined(arch_atomic64_fetch_or_relaxed)
        /* Release fence first, then the relaxed op. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_or_relaxed(i, v);
#elif defined(arch_atomic64_fetch_or)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_or(i, v);
#else
        /* No generic emulation exists for this op: fail the build. */
#error "Unable to define raw_atomic64_fetch_or_release"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_or_relaxed() - atomic fetch-and-OR, relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v | @i) with relaxed ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_or_relaxed().
 *
 * Return: The value @v held before the OR.
 */
static __always_inline s64
raw_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_or_relaxed
        prev = arch_atomic64_fetch_or_relaxed(i, v);
#elif defined(arch_atomic64_fetch_or)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_or(i, v);
#else
        /* No generic emulation exists for this op: fail the build. */
#error "Unable to define raw_atomic64_fetch_or_relaxed"
#endif
        return prev;
}

/**
 * raw_atomic64_xor() - atomic bitwise XOR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * There is no preprocessor fallback here: the call is forwarded
 * unconditionally to arch_atomic64_xor().
 *
 * Safe to use in noinstr code; prefer atomic64_xor() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_xor(s64 i, atomic64_t *v)
{
        /* Direct pass-through; the previous value of @v is not reported. */
        arch_atomic64_xor(i, v);
}

/**
 * raw_atomic64_fetch_xor() - atomic fetch-and-XOR, full ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v ^ @i) with full ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_xor().
 *
 * Return: The value @v held before the XOR.
 */
static __always_inline s64
raw_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_xor
        prev = arch_atomic64_fetch_xor(i, v);
#elif defined(arch_atomic64_fetch_xor_relaxed)
        /* Bracket the relaxed op with the pre/post full-fence helpers. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_xor_relaxed(i, v);
        __atomic_post_full_fence();
#else
        /* No generic emulation exists for this op: fail the build. */
#error "Unable to define raw_atomic64_fetch_xor"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_xor_acquire() - atomic fetch-and-XOR, acquire ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v ^ @i) with acquire ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_xor_acquire().
 *
 * Return: The value @v held before the XOR.
 */
static __always_inline s64
raw_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_xor_acquire
        prev = arch_atomic64_fetch_xor_acquire(i, v);
#elif defined(arch_atomic64_fetch_xor_relaxed)
        /* Relaxed op first, then the acquire fence. */
        prev = arch_atomic64_fetch_xor_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_fetch_xor)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_xor(i, v);
#else
        /* No generic emulation exists for this op: fail the build. */
#error "Unable to define raw_atomic64_fetch_xor_acquire"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_xor_release() - atomic fetch-and-XOR, release ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v ^ @i) with release ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_xor_release().
 *
 * Return: The value @v held before the XOR.
 */
static __always_inline s64
raw_atomic64_fetch_xor_release(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_xor_release
        prev = arch_atomic64_fetch_xor_release(i, v);
#elif defined(arch_atomic64_fetch_xor_relaxed)
        /* Release fence first, then the relaxed op. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic64_fetch_xor)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_xor(i, v);
#else
        /* No generic emulation exists for this op: fail the build. */
#error "Unable to define raw_atomic64_fetch_xor_release"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_xor_relaxed() - atomic fetch-and-XOR, relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically sets @v to (@v ^ @i) with relaxed ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_fetch_xor_relaxed().
 *
 * Return: The value @v held before the XOR.
 */
static __always_inline s64
raw_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_xor_relaxed
        prev = arch_atomic64_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic64_fetch_xor)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_fetch_xor(i, v);
#else
        /* No generic emulation exists for this op: fail the build. */
#error "Unable to define raw_atomic64_fetch_xor_relaxed"
#endif
        return prev;
}

/**
 * raw_atomic64_xchg() - atomic exchange, full ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically stores @new into @v with full ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_xchg().
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
raw_atomic64_xchg(atomic64_t *v, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_xchg
        prev = arch_atomic64_xchg(v, new);
#elif defined(arch_atomic64_xchg_relaxed)
        /* Bracket the relaxed op with the pre/post full-fence helpers. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_xchg_relaxed(v, new);
        __atomic_post_full_fence();
#else
        /* No arch atomic64 xchg: use raw_xchg() on the counter field. */
        prev = raw_xchg(&v->counter, new);
#endif
        return prev;
}

/**
 * raw_atomic64_xchg_acquire() - atomic exchange, acquire ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically stores @new into @v with acquire ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_xchg_acquire().
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
raw_atomic64_xchg_acquire(atomic64_t *v, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_xchg_acquire
        prev = arch_atomic64_xchg_acquire(v, new);
#elif defined(arch_atomic64_xchg_relaxed)
        /* Relaxed op first, then the acquire fence. */
        prev = arch_atomic64_xchg_relaxed(v, new);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_xchg)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_xchg(v, new);
#else
        /* No arch atomic64 xchg: use raw_xchg_acquire() on the counter field. */
        prev = raw_xchg_acquire(&v->counter, new);
#endif
        return prev;
}

/**
 * raw_atomic64_xchg_release() - atomic exchange, release ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically stores @new into @v with release ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_xchg_release().
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
raw_atomic64_xchg_release(atomic64_t *v, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_xchg_release
        prev = arch_atomic64_xchg_release(v, new);
#elif defined(arch_atomic64_xchg_relaxed)
        /* Release fence first, then the relaxed op. */
        __atomic_release_fence();
        prev = arch_atomic64_xchg_relaxed(v, new);
#elif defined(arch_atomic64_xchg)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_xchg(v, new);
#else
        /* No arch atomic64 xchg: use raw_xchg_release() on the counter field. */
        prev = raw_xchg_release(&v->counter, new);
#endif
        return prev;
}

/**
 * raw_atomic64_xchg_relaxed() - atomic exchange, relaxed ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to assign
 *
 * Atomically stores @new into @v with relaxed ordering.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_xchg_relaxed().
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
raw_atomic64_xchg_relaxed(atomic64_t *v, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_xchg_relaxed
        prev = arch_atomic64_xchg_relaxed(v, new);
#elif defined(arch_atomic64_xchg)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_xchg(v, new);
#else
        /* No arch atomic64 xchg: use raw_xchg_relaxed() on the counter field. */
        prev = raw_xchg_relaxed(&v->counter, new);
#endif
        return prev;
}

/**
 * raw_atomic64_cmpxchg() - atomic compare-and-exchange, full ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * When (@v == @old), atomically stores @new into @v with full ordering.
 * When the comparison fails, @v is left untouched and only relaxed ordering
 * is provided.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_cmpxchg().
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
raw_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_cmpxchg
        prev = arch_atomic64_cmpxchg(v, old, new);
#elif defined(arch_atomic64_cmpxchg_relaxed)
        /* Bracket the relaxed op with the pre/post full-fence helpers. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();
#else
        /* No arch atomic64 cmpxchg: use raw_cmpxchg() on the counter field. */
        prev = raw_cmpxchg(&v->counter, old, new);
#endif
        return prev;
}

/**
 * raw_atomic64_cmpxchg_acquire() - atomic compare-and-exchange, acquire ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * When (@v == @old), atomically stores @new into @v with acquire ordering.
 * When the comparison fails, @v is left untouched and only relaxed ordering
 * is provided.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_cmpxchg_acquire().
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
raw_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_cmpxchg_acquire
        prev = arch_atomic64_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic64_cmpxchg_relaxed)
        /* Relaxed op first, then the acquire fence. */
        prev = arch_atomic64_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_cmpxchg)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_cmpxchg(v, old, new);
#else
        /* No arch atomic64 cmpxchg: use raw_cmpxchg_acquire() on the counter. */
        prev = raw_cmpxchg_acquire(&v->counter, old, new);
#endif
        return prev;
}

/**
 * raw_atomic64_cmpxchg_release() - atomic compare-and-exchange, release ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * When (@v == @old), atomically stores @new into @v with release ordering.
 * When the comparison fails, @v is left untouched and only relaxed ordering
 * is provided.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_cmpxchg_release().
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
raw_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_cmpxchg_release
        prev = arch_atomic64_cmpxchg_release(v, old, new);
#elif defined(arch_atomic64_cmpxchg_relaxed)
        /* Release fence first, then the relaxed op. */
        __atomic_release_fence();
        prev = arch_atomic64_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_cmpxchg)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_cmpxchg(v, old, new);
#else
        /* No arch atomic64 cmpxchg: use raw_cmpxchg_release() on the counter. */
        prev = raw_cmpxchg_release(&v->counter, old, new);
#endif
        return prev;
}

/**
 * raw_atomic64_cmpxchg_relaxed() - atomic compare-and-exchange, relaxed ordering
 * @v: pointer to atomic64_t
 * @old: s64 value to compare with
 * @new: s64 value to assign
 *
 * When (@v == @old), atomically stores @new into @v with relaxed ordering.
 * When the comparison fails, @v is left untouched and relaxed ordering is
 * provided as well.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_cmpxchg_relaxed().
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
raw_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_cmpxchg_relaxed
        prev = arch_atomic64_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_cmpxchg)
        /* Only the unsuffixed arch op is defined; use it. */
        prev = arch_atomic64_cmpxchg(v, old, new);
#else
        /* No arch atomic64 cmpxchg: use raw_cmpxchg_relaxed() on the counter. */
        prev = raw_cmpxchg_relaxed(&v->counter, old, new);
#endif
        return prev;
}

/**
 * raw_atomic64_try_cmpxchg() - atomic try-compare-and-exchange, full ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * When (@v == @old), atomically stores @new into @v with full ordering.
 * When the comparison fails, @v is left untouched, @old is overwritten with
 * the current value of @v, and only relaxed ordering is provided.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_try_cmpxchg().
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

#ifdef arch_atomic64_try_cmpxchg
        swapped = arch_atomic64_try_cmpxchg(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg_relaxed)
        /* Bracket the relaxed op with the pre/post full-fence helpers. */
        __atomic_pre_full_fence();
        swapped = arch_atomic64_try_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();
#else
        /* Emulate with the value-returning cmpxchg of the same ordering. */
        const s64 expected = *old;
        const s64 seen = raw_atomic64_cmpxchg(v, expected, new);

        /* On mismatch, hand the observed value back through @old. */
        if (unlikely(seen != expected))
                *old = seen;
        swapped = likely(seen == expected);
#endif
        return swapped;
}

/**
 * raw_atomic64_try_cmpxchg_acquire() - atomic try-compare-and-exchange, acquire ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * When (@v == @old), atomically stores @new into @v with acquire ordering.
 * When the comparison fails, @v is left untouched, @old is overwritten with
 * the current value of @v, and only relaxed ordering is provided.
 *
 * This raw variant is safe in noinstr code; other callers should prefer
 * atomic64_try_cmpxchg_acquire().
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

#ifdef arch_atomic64_try_cmpxchg_acquire
        swapped = arch_atomic64_try_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg_relaxed)
        /* Relaxed op first, then the acquire fence. */
        swapped = arch_atomic64_try_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_try_cmpxchg)
        /* Only the unsuffixed arch op is defined; use it. */
        swapped = arch_atomic64_try_cmpxchg(v, old, new);
#else
        /* Emulate with the value-returning cmpxchg of the same ordering. */
        const s64 expected = *old;
        const s64 seen = raw_atomic64_cmpxchg_acquire(v, expected, new);

        /* On mismatch, hand the observed value back through @old. */
        if (unlikely(seen != expected))
                *old = seen;
        swapped = likely(seen == expected);
#endif
        return swapped;
}

/**
 * raw_atomic64_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_release() elsewhere.
 *
 * Return: @true if the exchange occured, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
#if defined(arch_atomic64_try_cmpxchg_release)
        /* The architecture supplies the release variant directly. */
        return arch_atomic64_try_cmpxchg_release(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg_relaxed)
        /* Release fence first, then the relaxed operation. */
        __atomic_release_fence();
        return arch_atomic64_try_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg)
        /* Only the unsuffixed arch operation exists; use it as is. */
        return arch_atomic64_try_cmpxchg(v, old, new);
#else
        /* Emulate try_cmpxchg on top of cmpxchg_release. */
        const s64 expected = *old;
        const s64 seen = raw_atomic64_cmpxchg_release(v, expected, new);
        const bool swapped = likely(seen == expected);

        /* On failure, hand the value actually found in @v back via @old. */
        if (!swapped)
                *old = seen;
        return swapped;
#endif
}

/**
 * raw_atomic64_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @old: pointer to s64 value to compare with
 * @new: s64 value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_relaxed() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
{
#if defined(arch_atomic64_try_cmpxchg_relaxed)
        /* The architecture supplies the relaxed variant directly. */
        return arch_atomic64_try_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg)
        /* Only the unsuffixed arch operation exists; use it as is. */
        return arch_atomic64_try_cmpxchg(v, old, new);
#else
        /* Emulate try_cmpxchg on top of cmpxchg_relaxed. */
        const s64 expected = *old;
        const s64 seen = raw_atomic64_cmpxchg_relaxed(v, expected, new);
        const bool swapped = likely(seen == expected);

        /* On failure, hand the value actually found in @v back via @old. */
        if (!swapped)
                *old = seen;
        return swapped;
#endif
}

/**
 * raw_atomic64_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_sub_and_test)
        return arch_atomic64_sub_and_test(i, v);
#else
        /* Generic fallback: true exactly when the new value is zero. */
        return !raw_atomic64_sub_return(i, v);
#endif
}

/**
 * raw_atomic64_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic64_dec_and_test(atomic64_t *v)
{
#if defined(arch_atomic64_dec_and_test)
        return arch_atomic64_dec_and_test(v);
#else
        /* Generic fallback: true exactly when the new value is zero. */
        return !raw_atomic64_dec_return(v);
#endif
}

/**
 * raw_atomic64_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic64_inc_and_test(atomic64_t *v)
{
#if defined(arch_atomic64_inc_and_test)
        return arch_atomic64_inc_and_test(v);
#else
        /* Generic fallback: true exactly when the new value is zero. */
        return !raw_atomic64_inc_return(v);
#endif
}

/**
 * raw_atomic64_add_negative() - atomic add and test if negative with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_negative)
        return arch_atomic64_add_negative(i, v);
#elif defined(arch_atomic64_add_negative_relaxed)
        /* Bracket the relaxed operation with the pre/post full fences. */
        bool negative;

        __atomic_pre_full_fence();
        negative = arch_atomic64_add_negative_relaxed(i, v);
        __atomic_post_full_fence();
        return negative;
#else
        /* Generic fallback: test the sign of the value add_return yields. */
        const s64 sum = raw_atomic64_add_return(i, v);

        return sum < 0;
#endif
}

/**
 * raw_atomic64_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative_acquire() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_negative_acquire)
        return arch_atomic64_add_negative_acquire(i, v);
#elif defined(arch_atomic64_add_negative_relaxed)
        /* Relaxed operation first, acquire fence afterwards. */
        bool negative = arch_atomic64_add_negative_relaxed(i, v);

        __atomic_acquire_fence();
        return negative;
#elif defined(arch_atomic64_add_negative)
        /* Only the unsuffixed arch operation exists; use it as is. */
        return arch_atomic64_add_negative(i, v);
#else
        /* Generic fallback: test the sign of add_return_acquire's result. */
        const s64 sum = raw_atomic64_add_return_acquire(i, v);

        return sum < 0;
#endif
}

/**
 * raw_atomic64_add_negative_release() - atomic add and test if negative with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative_release() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative_release(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_negative_release)
        return arch_atomic64_add_negative_release(i, v);
#elif defined(arch_atomic64_add_negative_relaxed)
        /* Release fence first, then the relaxed operation. */
        __atomic_release_fence();
        return arch_atomic64_add_negative_relaxed(i, v);
#elif defined(arch_atomic64_add_negative)
        /* Only the unsuffixed arch operation exists; use it as is. */
        return arch_atomic64_add_negative(i, v);
#else
        /* Generic fallback: test the sign of add_return_release's result. */
        const s64 sum = raw_atomic64_add_return_release(i, v);

        return sum < 0;
#endif
}

/**
 * raw_atomic64_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative_relaxed() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_negative_relaxed)
        return arch_atomic64_add_negative_relaxed(i, v);
#elif defined(arch_atomic64_add_negative)
        /* Only the unsuffixed arch operation exists; use it as is. */
        return arch_atomic64_add_negative(i, v);
#else
        /* Generic fallback: test the sign of add_return_relaxed's result. */
        const s64 sum = raw_atomic64_add_return_relaxed(i, v);

        return sum < 0;
#endif
}

/**
 * raw_atomic64_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic64_t
 * @a: s64 value to add
 * @u: s64 value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_unless() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
{
#if defined(arch_atomic64_fetch_add_unless)
        return arch_atomic64_fetch_add_unless(v, a, u);
#else
        s64 cur = raw_atomic64_read(v);

        /*
         * Retry until either @v is seen to hold @u (nothing is written) or
         * the compare-and-exchange installs cur + a.  A failed exchange
         * refreshes cur with the value currently in @v.
         */
        while (likely(cur != u)) {
                if (raw_atomic64_try_cmpxchg(v, &cur, cur + a))
                        break;
        }

        return cur;
#endif
}

/**
 * raw_atomic64_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic64_t
 * @a: s64 value to add
 * @u: s64 value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_add_unless() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
{
#if defined(arch_atomic64_add_unless)
        return arch_atomic64_add_unless(v, a, u);
#else
        /* The add happened exactly when the fetched old value was not @u. */
        const s64 prev = raw_atomic64_fetch_add_unless(v, a, u);

        return prev != u;
#endif
}

/**
 * raw_atomic64_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_not_zero() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic64_inc_not_zero(atomic64_t *v)
{
#if defined(arch_atomic64_inc_not_zero)
        return arch_atomic64_inc_not_zero(v);
#else
        /* "Increment unless zero" is add_unless with a == 1 and u == 0. */
        const bool incremented = raw_atomic64_add_unless(v, 1, 0);

        return incremented;
#endif
}

/**
 * raw_atomic64_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_unless_negative() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic64_inc_unless_negative(atomic64_t *v)
{
#if defined(arch_atomic64_inc_unless_negative)
        return arch_atomic64_inc_unless_negative(v);
#else
        s64 cur = raw_atomic64_read(v);

        /* Keep trying while the observed value is still non-negative. */
        while (likely(cur >= 0)) {
                /* A failed exchange reloads cur from @v. */
                if (raw_atomic64_try_cmpxchg(v, &cur, cur + 1))
                        return true;
        }

        return false;
#endif
}

/**
 * raw_atomic64_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_unless_positive() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic64_dec_unless_positive(atomic64_t *v)
{
#if defined(arch_atomic64_dec_unless_positive)
        return arch_atomic64_dec_unless_positive(v);
#else
        s64 cur = raw_atomic64_read(v);

        /* Keep trying while the observed value is still zero or negative. */
        while (likely(cur <= 0)) {
                /* A failed exchange reloads cur from @v. */
                if (raw_atomic64_try_cmpxchg(v, &cur, cur - 1))
                        return true;
        }

        return false;
#endif
}

/**
 * raw_atomic64_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic64_t
 *
 * If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_if_positive() elsewhere.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline s64
raw_atomic64_dec_if_positive(atomic64_t *v)
{
#if defined(arch_atomic64_dec_if_positive)
        return arch_atomic64_dec_if_positive(v);
#else
        s64 cur = raw_atomic64_read(v);
        s64 next;

        for (;;) {
                next = cur - 1;
                /* Decrementing would go below zero: leave @v untouched. */
                if (unlikely(next < 0))
                        break;
                /* A failed exchange reloads cur, and next is recomputed. */
                if (raw_atomic64_try_cmpxchg(v, &cur, next))
                        break;
        }

        /* Observed value minus one, whether or not it was stored. */
        return next;
#endif
}

#endif /* _LINUX_ATOMIC_FALLBACK_H */
// b565db590afeeff0d7c9485ccbca5bb6e155749f

















































    5 











    7 


































































































    6 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIME64_H
#define _LINUX_TIME64_H

#include <linux/math64.h>
#include <vdso/time64.h>

/*
 * 64-bit signed and unsigned scalar types for time values; time64_t is the
 * type of timespec64.tv_sec below.
 */
typedef __s64 time64_t;
typedef __u64 timeu64_t;

#include <uapi/linux/time.h>

/*
 * A point in time or a duration, split into whole seconds (64-bit on every
 * architecture) and a nanosecond part.
 */
struct timespec64 {
        time64_t        tv_sec;                        /* seconds */
        long                tv_nsec;                /* nanoseconds */
};

/*
 * Pair of timespec64 values describing a timer setting.
 * NOTE(review): presumably the 64-bit counterpart of POSIX struct itimerspec
 * (it_interval = reload period, it_value = time until next expiry) - confirm.
 */
struct itimerspec64 {
        struct timespec64 it_interval;
        struct timespec64 it_value;
};

/* Parameters used to convert the timespec values: */
#define PSEC_PER_NSEC                        1000L

/*
 * Located here for timespec[64]_valid_strict.
 *
 * ~((u64)1 << 63) has every bit set except the top one, i.e. the largest
 * positive s64 value; the MIN counterparts are that value negated, minus one.
 */
#define TIME64_MAX                        ((s64)~((u64)1 << 63))
#define TIME64_MIN                        (-TIME64_MAX - 1)

/* Same s64 limits for ktime values, plus those limits divided by NSEC_PER_SEC */
#define KTIME_MAX                        ((s64)~((u64)1 << 63))
#define KTIME_MIN                        (-KTIME_MAX - 1)
#define KTIME_SEC_MAX                        (KTIME_MAX / NSEC_PER_SEC)
#define KTIME_SEC_MIN                        (KTIME_MIN / NSEC_PER_SEC)

/*
 * Limits for settimeofday():
 *
 * To prevent setting the time close to the wraparound point time setting
 * is limited so a reasonable uptime can be accommodated. Uptime of 30 years
 * should be really sufficient, which means the cutoff is 2232. At that
 * point the cutoff is just a small part of the larger problem.
 *
 * TIME_UPTIME_SEC_MAX is 30 years of 365 days expressed in seconds;
 * TIME_SETTOD_SEC_MAX leaves that much headroom below KTIME_SEC_MAX.
 */
#define TIME_UPTIME_SEC_MAX                (30LL * 365 * 24 *3600)
#define TIME_SETTOD_SEC_MAX                (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX)

/*
 * Returns 1 when both the seconds and the nanoseconds fields of @a and @b
 * match, 0 otherwise.
 */
static inline int timespec64_equal(const struct timespec64 *a,
                                   const struct timespec64 *b)
{
        if (a->tv_sec != b->tv_sec)
                return 0;
        return a->tv_nsec == b->tv_nsec;
}

/*
 * Three-way comparison of two timespec64 values, ordered by tv_sec first
 * and by tv_nsec second.
 *
 * lhs < rhs:  return <0
 * lhs == rhs: return 0
 * lhs > rhs:  return >0
 *
 * The nanosecond fields are compared rather than subtracted: tv_nsec is a
 * long, so the difference can overflow, and narrowing it to the int return
 * type can flip its sign (or turn it into 0) when the values are not
 * normalized.
 */
static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs)
{
        if (lhs->tv_sec < rhs->tv_sec)
                return -1;
        if (lhs->tv_sec > rhs->tv_sec)
                return 1;
        if (lhs->tv_nsec < rhs->tv_nsec)
                return -1;
        if (lhs->tv_nsec > rhs->tv_nsec)
                return 1;
        return 0;
}

extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);

static inline struct timespec64 timespec64_add(struct timespec64 lhs,
                                                struct timespec64 rhs)
{
        struct timespec64 ts_delta;
        set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec,
                                lhs.tv_nsec + rhs.tv_nsec);
        return ts_delta;
}

/*
 * sub = lhs - rhs, in normalized form
 */
static inline struct timespec64 timespec64_sub(struct timespec64 lhs,
                                                struct timespec64 rhs)
{
        struct timespec64 ts_delta;
        set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec,
                                lhs.tv_nsec - rhs.tv_nsec);
        return ts_delta;
}

/*
 * Returns true if the timespec64 is normalized, false if it is denormalized:
 */
static inline bool timespec64_valid(const struct timespec64 *ts)
{
        /*
         * Dates before 1970 are bogus, and there can't be more nanoseconds
         * than a second.  The unsigned cast turns a negative tv_nsec into a
         * very large value, so the same comparison covers that case too.
         */
        return ts->tv_sec >= 0 &&
               (unsigned long)ts->tv_nsec < NSEC_PER_SEC;
}

/*
 * Like timespec64_valid(), but additionally rejects tv_sec values that could
 * overflow ktime_t (tv_sec must stay below KTIME_SEC_MAX).
 */
static inline bool timespec64_valid_strict(const struct timespec64 *ts)
{
        return timespec64_valid(ts) &&
               (unsigned long long)ts->tv_sec < KTIME_SEC_MAX;
}

/*
 * Like timespec64_valid(), but additionally rejects tv_sec values which cause
 * overflow issues vs. CLOCK_REALTIME (tv_sec must stay below
 * TIME_SETTOD_SEC_MAX).
 */
static inline bool timespec64_valid_settod(const struct timespec64 *ts)
{
        return timespec64_valid(ts) &&
               (unsigned long long)ts->tv_sec < TIME_SETTOD_SEC_MAX;
}

/**
 * timespec64_to_ns - Convert timespec64 to nanoseconds
 * @ts:                pointer to the timespec64 variable to be converted
 *
 * Returns the scalar nanosecond representation of the timespec64
 * parameter.
 */
static inline s64 timespec64_to_ns(const struct timespec64 *ts)
{
        /* Prevent multiplication overflow / underflow */
        if (ts->tv_sec >= KTIME_SEC_MAX)
                return KTIME_MAX;

        if (ts->tv_sec <= KTIME_SEC_MIN)
                return KTIME_MIN;

        return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
}

/**
 * ns_to_timespec64 - Convert nanoseconds to timespec64
 * @nsec:        the nanoseconds value to be converted
 *
 * Returns the timespec64 representation of the nsec parameter.
 */
extern struct timespec64 ns_to_timespec64(s64 nsec);

/**
 * timespec64_add_ns - Adds nanoseconds to a timespec64
 * @a:                pointer to timespec64 to be incremented
 * @ns:                unsigned nanoseconds value to be added
 *
 * This must always be inlined because it is used from the x86-64 vdso,
 * which cannot call other kernel functions.
 */
static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
{
        u64 rem;

        /*
         * Split (tv_nsec + ns) by NSEC_PER_SEC: the value returned by
         * __iter_div_u64_rem() is added to tv_sec, and the remainder it
         * writes back becomes the new tv_nsec.
         */
        a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &rem);
        a->tv_nsec = rem;
}

/*
 * timespec64_add_safe assumes both values are positive and checks for
 * overflow. It will return TIME64_MAX in case of overflow.
 */
extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
                                         const struct timespec64 rhs);

#endif /* _LINUX_TIME64_H */



























































































    5 
    2 








































































    1 
    3 













    2 



    3 
    3 































































    6 




    1 




    1 
    1 


































































    8 










    8 












    2 





























































    1 
















































    5 


















    5 
    4 





























    1 







    1 
    1 




    1 




    1 
    2 

    2 














    1 
    1 














































































    2 
    2 


























































































































    1 

    1 



































































    1 

    1 
























































    1 



































































    1 





























































































































































    1 
    1 


















































































    5 










    1 










    2 








    1 



    1 








    1 
    1 





















    5 



    5 









    6 





    5 















    2 









    1 






    1 
































    7 

    7 










    5 


























































































































































































































    1 


    1 






    1 




















































    1 





    1 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>
#include <linux/iov_iter.h>

/*
 * Step function copying @len bytes from the kernel buffer @from (offset by
 * @progress) to the user address @iter_to.  Returns the number of bytes NOT
 * copied; @priv2 is unused.
 */
static __always_inline
size_t copy_to_user_iter(void __user *iter_to, size_t progress,
                         size_t len, void *from, void *priv2)
{
        const void *src;

        /* Injected failure: report the whole chunk as not copied. */
        if (should_fail_usercopy())
                return len;
        /* A range that fails access_ok() is likewise left entirely uncopied. */
        if (!access_ok(iter_to, len))
                return len;

        src = from + progress;
        instrument_copy_to_user(iter_to, src, len);
        return raw_copy_to_user(iter_to, src, len);
}

/*
 * Variant of copy_to_user_iter() built on copy_to_user_nofault().  A negative
 * result from that helper is reported as "all @len bytes not copied";
 * @priv2 is unused.
 */
static __always_inline
size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
                                 size_t len, void *from, void *priv2)
{
        ssize_t res;

        /* Injected failure: report the whole chunk as not copied. */
        if (should_fail_usercopy())
                return len;

        res = copy_to_user_nofault(iter_to, from + progress, len);
        if (res < 0)
                return len;
        return res;
}

/*
 * Step function copying @len bytes from the user address @iter_from into the
 * kernel buffer @to (offset by @progress).  Returns the number of bytes NOT
 * copied; @priv2 is unused.
 */
static __always_inline
size_t copy_from_user_iter(void __user *iter_from, size_t progress,
                           size_t len, void *to, void *priv2)
{
        size_t left;

        /* Injected failure: report the whole chunk as not copied. */
        if (should_fail_usercopy())
                return len;
        /* A range that fails access_ok() is likewise left entirely uncopied. */
        if (!access_ok(iter_from, len))
                return len;

        to += progress;
        instrument_copy_from_user_before(to, iter_from, len);
        left = raw_copy_from_user(to, iter_from, len);
        instrument_copy_from_user_after(to, iter_from, len, left);
        return left;
}

/*
 * Step function for kernel-memory destinations: copies @len bytes from
 * @from + @progress to @iter_to.  Always returns 0 (nothing left uncopied);
 * @priv2 is unused.
 */
static __always_inline
size_t memcpy_to_iter(void *iter_to, size_t progress,
                      size_t len, void *from, void *priv2)
{
        const void *src = from + progress;

        memcpy(iter_to, src, len);
        return 0;
}

/*
 * Step function for kernel-memory sources: copies @len bytes from
 * @iter_from to @to + @progress.  Always returns 0 (nothing left uncopied);
 * @priv2 is unused.
 */
static __always_inline
size_t memcpy_from_iter(void *iter_from, size_t progress,
                        size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        memcpy(dst, iter_from, len);
        return 0;
}

/*
 * fault_in_iov_iter_readable - fault in iov iterator for reading
 * @i: iterator
 * @size: maximum length
 *
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * @size.  For each iovec, fault in each page that constitutes the iovec.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 *
 * Always returns 0 for non-userspace iterators.
 */
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
{
        const struct iovec *seg;
        size_t remaining, offset;

        if (iter_is_ubuf(i)) {
                /* Single user buffer: one fault-in call covers it. */
                size_t want = min(size, iov_iter_count(i));
                size_t missed = fault_in_readable(i->ubuf + i->iov_offset, want);

                return size - (want - missed);
        }
        if (!iter_is_iovec(i))
                return 0;

        remaining = min(size, iov_iter_count(i));
        /* From here on, size holds the part beyond the iterator's count. */
        size -= remaining;
        /* Only the first segment starts at a non-zero offset. */
        offset = i->iov_offset;
        for (seg = iter_iov(i); remaining; seg++, offset = 0) {
                size_t chunk = min(remaining, seg->iov_len - offset);
                size_t missed;

                /* Zero-length chunks are skipped. */
                if (likely(chunk)) {
                        missed = fault_in_readable(seg->iov_base + offset, chunk);
                        remaining -= chunk - missed;
                        /* Stop at the first segment that could not be fully faulted in. */
                        if (missed)
                                break;
                }
        }
        return remaining + size;
}
EXPORT_SYMBOL(fault_in_iov_iter_readable);

/*
 * fault_in_iov_iter_writeable - fault in iov iterator for writing
 * @i: iterator
 * @size: maximum length
 *
 * Faults in the iterator using get_user_pages(), i.e., without triggering
 * hardware page faults.  This is primarily useful when we already know that
 * some or all of the pages in @i aren't in memory.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 *
 * Always returns 0 for non-user-space iterators.
 */
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
{
        const struct iovec *seg;
        size_t remaining, offset;

        if (iter_is_ubuf(i)) {
                /* Single user buffer: one fault-in call covers it. */
                size_t want = min(size, iov_iter_count(i));
                size_t missed = fault_in_safe_writeable(i->ubuf + i->iov_offset, want);

                return size - (want - missed);
        }
        if (!iter_is_iovec(i))
                return 0;

        remaining = min(size, iov_iter_count(i));
        /* From here on, size holds the part beyond the iterator's count. */
        size -= remaining;
        /* Only the first segment starts at a non-zero offset. */
        offset = i->iov_offset;
        for (seg = iter_iov(i); remaining; seg++, offset = 0) {
                size_t chunk = min(remaining, seg->iov_len - offset);
                size_t missed;

                /* Zero-length chunks are skipped. */
                if (likely(chunk)) {
                        missed = fault_in_safe_writeable(seg->iov_base + offset, chunk);
                        remaining -= chunk - missed;
                        /* Stop at the first segment that could not be fully faulted in. */
                        if (missed)
                                break;
                }
        }
        return remaining + size;
}
EXPORT_SYMBOL(fault_in_iov_iter_writeable);

/*
 * Initialise @i as an ITER_IOVEC iterator over the @nr_segs entries of @iov,
 * describing @count bytes and starting at offset 0.  @direction is stored as
 * the iterator's data_source; any bit outside READ | WRITE triggers a warning.
 */
void iov_iter_init(struct iov_iter *i, unsigned int direction,
                        const struct iovec *iov, unsigned long nr_segs,
                        size_t count)
{
        const struct iov_iter fresh = {
                .iter_type = ITER_IOVEC,
                .nofault = false,
                .data_source = direction,
                .__iov = iov,
                .nr_segs = nr_segs,
                .iov_offset = 0,
                .count = count
        };

        WARN_ON(direction & ~(READ | WRITE));
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_init);

/*
 * Copy @bytes bytes from the kernel buffer @addr into iterator @i via
 * iterate_and_advance().  An iterator flagged as a data source is rejected
 * with a one-time warning and 0 is returned.
 */
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        void *src = (void *)addr;

        if (WARN_ON_ONCE(i->data_source))
                return 0;

        /* Only user-backed iterators can fault during the copy. */
        if (user_backed_iter(i))
                might_fault();

        return iterate_and_advance(i, bytes, src,
                                   copy_to_user_iter, memcpy_to_iter);
}
EXPORT_SYMBOL(_copy_to_iter);

#ifdef CONFIG_ARCH_HAS_COPY_MC
/*
 * User-space step for _copy_mc_to_iter(): copy @len bytes from @from +
 * @progress to @iter_to with copy_mc_to_user().  Returns that helper's
 * result, or @len unchanged when access_ok() rejects the destination range.
 */
static __always_inline
size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress,
                            size_t len, void *from, void *priv2)
{
        if (!access_ok(iter_to, len))
                return len;

        from += progress;
        instrument_copy_to_user(iter_to, from, len);
        return copy_mc_to_user(iter_to, from, len);
}

/*
 * Kernel-memory step for _copy_mc_to_iter(): copy @len bytes from @from +
 * @progress to @iter_to with copy_mc_to_kernel() and pass its result back.
 */
static __always_inline
size_t memcpy_to_iter_mc(void *iter_to, size_t progress,
                         size_t len, void *from, void *priv2)
{
        void *src = from + progress;

        return copy_mc_to_kernel(iter_to, src, len);
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and typical _copy_to_iter():
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC and ITER_BVEC can return short copies.  Compare to
 *   copy_to_iter() where only ITER_IOVEC attempts might return a short copy.
 *
 * An iterator flagged as a data source is rejected with a one-time warning.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        void *src = (void *)addr;

        if (WARN_ON_ONCE(i->data_source))
                return 0;

        if (user_backed_iter(i))
                might_fault();

        return iterate_and_advance(i, bytes, src,
                                   copy_to_user_iter_mc, memcpy_to_iter_mc);
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

/*
 * Unchecked core of _copy_from_iter(): run iterate_and_advance() over @i
 * with the copy-from-user and memcpy-from-iter steps, filling @addr.
 * Callers are responsible for validating the iterator direction.
 */
static __always_inline
size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        size_t copied;

        copied = iterate_and_advance(i, bytes, addr,
                                     copy_from_user_iter, memcpy_from_iter);
        return copied;
}

/*
 * Copy @bytes bytes out of the source iterator @i into the kernel buffer
 * @addr.  An iterator that is not flagged as a data source is rejected
 * with a one-time warning and 0 is returned.
 */
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        /* Only user-backed iterators can fault during the copy. */
        if (user_backed_iter(i))
                might_fault();

        return __copy_from_iter(addr, bytes, i);
}
EXPORT_SYMBOL(_copy_from_iter);

/*
 * User-space step for _copy_from_iter_nocache(): copy @len bytes from
 * @iter_from to @to + @progress with __copy_from_user_inatomic_nocache()
 * and hand back that helper's result.
 */
static __always_inline
size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress,
                                   size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        return __copy_from_user_inatomic_nocache(dst, iter_from, len);
}

/*
 * Like _copy_from_iter(), but user-space segments are copied with the
 * nocache step; kernel-memory segments still use memcpy_from_iter.
 * An iterator that is not a data source is rejected with a one-time
 * warning and 0 is returned.
 */
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        size_t copied;

        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        copied = iterate_and_advance(i, bytes, addr,
                                     copy_from_user_iter_nocache,
                                     memcpy_from_iter);
        return copied;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/*
 * User-space step for _copy_from_iter_flushcache(): copy @len bytes from
 * @iter_from to @to + @progress with __copy_from_user_flushcache() and
 * hand back that helper's result.
 */
static __always_inline
size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress,
                                      size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        return __copy_from_user_flushcache(dst, iter_from, len);
}

/*
 * Kernel-memory step for _copy_from_iter_flushcache(): copy @len bytes from
 * @iter_from to @to + @progress with memcpy_flushcache().  Always reports
 * 0 bytes left over.
 */
static __always_inline
size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress,
                                   size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        memcpy_flushcache(dst, iter_from, len);
        return 0;
}

/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache. It differs from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
 * all iterator types. _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty data in the cache.
 *
 * An iterator that is not a data source is rejected with a one-time warning.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
        size_t copied;

        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        copied = iterate_and_advance(i, bytes, addr,
                                     copy_from_user_iter_flushcache,
                                     memcpy_from_iter_flushcache);
        return copied;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

/*
 * Sanity-check that the range [@offset, @offset + @n) lies inside the
 * (possibly compound) page that @page belongs to.  Returns true when the
 * range is acceptable; otherwise warns and returns false.
 */
static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
        size_t v = n + offset;
        struct page *head;

        /*
         * Fast path: a range that does not wrap and ends within PAGE_SIZE
         * fits a page of any order, so the compound head (and a possible
         * cache line miss reading the page order) can be avoided.  Order-0
         * pages are the common case.
         */
        if (n <= v && v <= PAGE_SIZE)
                return true;

        /* General case: measure from the start of the compound page. */
        head = compound_head(page);
        v += (page - head) << PAGE_SHIFT;

        return !WARN_ON(n > v || v > page_size(head));
}

/*
 * Copy @bytes bytes starting at @offset within @page (which may be a
 * compound page) into the destination iterator @i, one PAGE_SIZE-bounded
 * chunk per temporary mapping.  Returns the number of bytes copied, which
 * is short if a chunk makes no progress.
 */
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t copied = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(i->data_source))
                return 0;

        /* Step to the subpage that contains @offset. */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;

        for (;;) {
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);
                void *kaddr = kmap_local_page(page);
                size_t done = _copy_to_iter(kaddr + offset, chunk, i);

                kunmap_local(kaddr);
                copied += done;
                bytes -= done;
                if (!bytes || !done)
                        return copied;

                offset += done;
                if (offset == PAGE_SIZE) {
                        /* Finished this subpage; continue with the next one. */
                        offset = 0;
                        page++;
                }
        }
}
EXPORT_SYMBOL(copy_page_to_iter);

/*
 * Variant of copy_page_to_iter() whose user-space step is
 * copy_to_user_iter_nofault.  Copies @bytes bytes starting at @offset within
 * @page into the destination iterator @i, one PAGE_SIZE-bounded chunk per
 * temporary mapping, and returns the number of bytes copied.
 */
size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
                                 struct iov_iter *i)
{
        size_t copied = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(i->data_source))
                return 0;

        /* Step to the subpage that contains @offset. */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;

        for (;;) {
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);
                void *kaddr = kmap_local_page(page);

                chunk = iterate_and_advance(i, chunk, kaddr + offset,
                                            copy_to_user_iter_nofault,
                                            memcpy_to_iter);
                kunmap_local(kaddr);

                copied += chunk;
                bytes -= chunk;
                if (!bytes || !chunk)
                        return copied;

                offset += chunk;
                if (offset == PAGE_SIZE) {
                        /* Finished this subpage; continue with the next one. */
                        offset = 0;
                        page++;
                }
        }
}
EXPORT_SYMBOL(copy_page_to_iter_nofault);

/*
 * Copy @bytes bytes from the source iterator @i into @page starting at
 * @offset (the page may be compound), one PAGE_SIZE-bounded chunk per
 * temporary mapping.  Returns the number of bytes copied, which is short
 * if a chunk makes no progress.
 */
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t copied = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;

        /* Step to the subpage that contains @offset. */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;

        for (;;) {
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);
                void *kaddr = kmap_local_page(page);
                size_t done = _copy_from_iter(kaddr + offset, chunk, i);

                kunmap_local(kaddr);
                copied += done;
                bytes -= done;
                if (!bytes || !done)
                        return copied;

                offset += done;
                if (offset == PAGE_SIZE) {
                        /* Finished this subpage; continue with the next one. */
                        offset = 0;
                        page++;
                }
        }
}
EXPORT_SYMBOL(copy_page_from_iter);

/*
 * User-space step for iov_iter_zero(): clear @len bytes at @iter_to with
 * clear_user() and hand back that helper's result.  @progress and the
 * private pointers are unused.
 */
static __always_inline
size_t zero_to_user_iter(void __user *iter_to, size_t progress,
                         size_t len, void *priv, void *priv2)
{
        size_t left = clear_user(iter_to, len);

        return left;
}

/*
 * Kernel-memory step for iov_iter_zero(): fill @len bytes at @iter_to with
 * zeroes.  Always reports 0 bytes left over; @progress and the private
 * pointers are unused.
 */
static __always_inline
size_t zero_to_iter(void *iter_to, size_t progress,
                    size_t len, void *priv, void *priv2)
{
        const size_t left_over = 0;

        memset(iter_to, 0, len);
        return left_over;
}

/*
 * Zero @bytes bytes of iterator @i by running iterate_and_advance() with the
 * zeroing steps; no source buffer is involved, so NULL is passed as the
 * private pointer.  Returns iterate_and_advance()'s result.
 */
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
        size_t zeroed;

        zeroed = iterate_and_advance(i, bytes, NULL,
                                     zero_to_user_iter, zero_to_iter);
        return zeroed;
}
EXPORT_SYMBOL(iov_iter_zero);

/*
 * Copy up to @bytes bytes from the source iterator @i into @page at @offset,
 * using kmap_atomic() mappings.
 *
 * For a highmem page the copy proceeds one PAGE_SIZE-bounded chunk per
 * mapping, stepping to the subpage that contains the current offset each
 * time round the loop.  For any other page a single pass over the whole
 * request is made.
 *
 * Returns the number of bytes copied; 0 if the range fails page_copy_sane()
 * or @i is not a data source (the latter warns once).
 */
size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
                size_t bytes, struct iov_iter *i)
{
        size_t n, copied = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        do {
                char *p;

                /* Whatever is still outstanding ... */
                n = bytes - copied;
                if (PageHighMem(page)) {
                        /* ... limited to the rest of the current subpage. */
                        page += offset / PAGE_SIZE;
                        offset %= PAGE_SIZE;
                        n = min_t(size_t, n, PAGE_SIZE - offset);
                }

                p = kmap_atomic(page) + offset;
                n = __copy_from_iter(p, n, i);
                kunmap_atomic(p);
                copied += n;
                offset += n;
                /* Only highmem pages iterate; stop when done or on no progress. */
        } while (PageHighMem(page) && copied != bytes && n > 0);

        return copied;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);

/*
 * Advance an ITER_BVEC iterator by @size bytes: consume whole segments
 * until the remaining distance falls inside one, then record the new
 * current segment, the offset into it and the reduced segment count.
 * Does nothing when the iterator is already empty.
 */
static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
        const struct bio_vec *cur = i->bvec;
        const struct bio_vec *const last = cur + i->nr_segs;

        if (!i->count)
                return;
        i->count -= size;

        /* Measure from the beginning of the current segment. */
        size += i->iov_offset;

        while (cur < last && unlikely(size >= cur->bv_len)) {
                size -= cur->bv_len;
                cur++;
        }

        i->iov_offset = size;
        i->nr_segs -= cur - i->bvec;
        i->bvec = cur;
}

/*
 * Advance an iovec-layout iterator by @size bytes: consume whole segments
 * until the remaining distance falls inside one, then record the new
 * current segment, the offset into it and the reduced segment count.
 * Does nothing when the iterator is already empty.
 */
static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
        const struct iovec *const first = iter_iov(i);
        const struct iovec *const last = first + i->nr_segs;
        const struct iovec *cur = first;

        if (!i->count)
                return;
        i->count -= size;

        /* Measure from the beginning of the current segment. */
        size += i->iov_offset;

        while (cur < last && unlikely(size >= cur->iov_len)) {
                size -= cur->iov_len;
                cur++;
        }

        i->iov_offset = size;
        i->nr_segs -= cur - first;
        i->__iov = cur;
}

/*
 * Move iterator @i forward by @size bytes, clamped to the bytes it still
 * holds, dispatching on the iterator type.
 */
void iov_iter_advance(struct iov_iter *i, size_t size)
{
        if (unlikely(i->count < size))
                size = i->count;

        if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
                /* Flat ranges: only the offset and the count move. */
                i->iov_offset += size;
                i->count -= size;
                return;
        }

        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
                /* iovec and kvec have identical layouts */
                iov_iter_iovec_advance(i, size);
                return;
        }

        if (iov_iter_is_bvec(i)) {
                iov_iter_bvec_advance(i, size);
                return;
        }

        if (iov_iter_is_discard(i))
                i->count -= size;
}
EXPORT_SYMBOL(iov_iter_advance);

/*
 * Move iterator @i backwards by @unroll bytes.
 *
 * A zero @unroll is a no-op; an @unroll larger than MAX_RW_COUNT warns and
 * is ignored.  Otherwise the byte count grows by @unroll and, except for
 * discard iterators, the position is moved back: first within the current
 * segment, then - for bvec and iovec/kvec iterators - across preceding
 * segments.  xarray and ubuf iterators must never be reverted past their
 * starting point.
 */
void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
        if (!unroll)
                return;
        if (WARN_ON(unroll > MAX_RW_COUNT))
                return;
        i->count += unroll;
        /* Discard iterators only track the count. */
        if (unlikely(iov_iter_is_discard(i)))
                return;
        /* Easy case: the revert stays inside the current segment. */
        if (unroll <= i->iov_offset) {
                i->iov_offset -= unroll;
                return;
        }
        /* Consume the current segment's offset; the rest crosses segments. */
        unroll -= i->iov_offset;
        if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
                BUG(); /* We should never go beyond the start of the specified
                        * range since we might then be straying into pages that
                        * aren't pinned.
                        */
        } else if (iov_iter_is_bvec(i)) {
                const struct bio_vec *bvec = i->bvec;
                /* Walk backwards until a segment can absorb what is left. */
                while (1) {
                        size_t n = (--bvec)->bv_len;
                        i->nr_segs++;
                        if (unroll <= n) {
                                i->bvec = bvec;
                                i->iov_offset = n - unroll;
                                return;
                        }
                        unroll -= n;
                }
        } else { /* same logics for iovec and kvec */
                const struct iovec *iov = iter_iov(i);
                /* Walk backwards until a segment can absorb what is left. */
                while (1) {
                        size_t n = (--iov)->iov_len;
                        i->nr_segs++;
                        if (unroll <= n) {
                                i->__iov = iov;
                                i->iov_offset = n - unroll;
                                return;
                        }
                        unroll -= n;
                }
        }
}
EXPORT_SYMBOL(iov_iter_revert);

/*
 * Return the number of bytes available in just the current segment of @i.
 * With at most one segment left, or for iterator types without a segment
 * array, that is simply the remaining count.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
        if (i->nr_segs <= 1)
                return i->count;

        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return min(i->count, iter_iov(i)->iov_len - i->iov_offset);

        if (iov_iter_is_bvec(i))
                return min(i->count, i->bvec->bv_len - i->iov_offset);

        return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

/*
 * Initialise @i as an ITER_KVEC iterator over the @nr_segs entries of @kvec,
 * describing @count bytes and starting at offset 0.  @direction is stored as
 * the iterator's data_source; any bit outside READ | WRITE triggers a warning.
 */
void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
                        const struct kvec *kvec, unsigned long nr_segs,
                        size_t count)
{
        const struct iov_iter fresh = {
                .iter_type = ITER_KVEC,
                .data_source = direction,
                .kvec = kvec,
                .nr_segs = nr_segs,
                .iov_offset = 0,
                .count = count
        };

        WARN_ON(direction & ~(READ | WRITE));
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_kvec);

/*
 * Initialise @i as an ITER_BVEC iterator over the @nr_segs entries of @bvec,
 * describing @count bytes and starting at offset 0.  @direction is stored as
 * the iterator's data_source; any bit outside READ | WRITE triggers a warning.
 */
void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
                        const struct bio_vec *bvec, unsigned long nr_segs,
                        size_t count)
{
        const struct iov_iter fresh = {
                .iter_type = ITER_BVEC,
                .data_source = direction,
                .bvec = bvec,
                .nr_segs = nr_segs,
                .iov_offset = 0,
                .count = count
        };

        WARN_ON(direction & ~(READ | WRITE));
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_bvec);

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 *
 * A @direction with any bit other than bit 0 set is a fatal error.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
                     struct xarray *xarray, loff_t start, size_t count)
{
        const struct iov_iter fresh = {
                .iter_type = ITER_XARRAY,
                .data_source = direction,
                .xarray = xarray,
                .xarray_start = start,
                .count = count,
                .iov_offset = 0
        };

        BUG_ON(direction & ~1);
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_xarray);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator; any other @direction is a fatal
 * error.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
        const struct iov_iter fresh = {
                .iter_type = ITER_DISCARD,
                .data_source = false,
                .count = count,
                .iov_offset = 0
        };

        BUG_ON(direction != READ);
        *i = fresh;
}
EXPORT_SYMBOL(iov_iter_discard);

/*
 * Check every segment of the iovec-layout iterator @i that is covered by
 * i->count: returns false as soon as a segment's (remaining) length
 * intersects @len_mask or its start address intersects @addr_mask, true
 * otherwise.  The first segment is considered from i->iov_offset onwards.
 */
static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
                                   unsigned len_mask)
{
        const struct iovec *seg = iter_iov(i);
        size_t offset = i->iov_offset;
        size_t left = i->count;

        do {
                size_t chunk = min(seg->iov_len - offset, left);

                if (chunk & len_mask)
                        return false;
                if ((unsigned long)(seg->iov_base + offset) & addr_mask)
                        return false;

                left -= chunk;
                offset = 0;
                seg++;
        } while (left);

        return true;
}

/*
 * Check every segment of the ITER_BVEC iterator @i that is covered by
 * i->count against the supplied masks.
 *
 * The first segment is only considered from i->iov_offset onwards: both its
 * starting offset and its remaining length exclude the bytes already
 * consumed, matching iov_iter_aligned_iovec() and iov_iter_alignment_bvec().
 *
 * Returns false as soon as a segment's length intersects @len_mask or its
 * starting offset (bv_offset plus the skipped bytes) intersects @addr_mask;
 * true otherwise.
 */
static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
                                  unsigned len_mask)
{
        const struct bio_vec *bvec = i->bvec;
        unsigned skip = i->iov_offset;
        size_t size = i->count;

        do {
                /*
                 * Only the unconsumed tail of the first segment belongs to
                 * the iterator.  Using the full bv_len here would over-count
                 * that segment, test the wrong length against @len_mask and
                 * could end the walk before later segments are checked.
                 */
                size_t len = bvec->bv_len - skip;

                if (len > size)
                        len = size;
                if (len & len_mask)
                        return false;
                if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
                        return false;

                bvec++;
                size -= len;
                skip = 0;       /* later segments start at their beginning */
        } while (size);

        return true;
}

/**
 * iov_iter_is_aligned() - Check if the addresses and lengths of each segment
 *         are aligned to the parameters.
 *
 * @i: &struct iov_iter to check
 * @addr_mask: bit mask to check against the iov element's addresses
 * @len_mask: bit mask to check against the iov element's lengths
 *
 * Iterator types that are not handled explicitly are reported as aligned.
 *
 * Return: false if any addresses or lengths intersect with the provided masks
 */
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
                         unsigned len_mask)
{
        /* A ubuf is a single flat range: one length and one address. */
        if (likely(iter_is_ubuf(i)))
                return !(i->count & len_mask) &&
                       !((unsigned long)(i->ubuf + i->iov_offset) & addr_mask);

        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_iter_aligned_iovec(i, addr_mask, len_mask);

        if (iov_iter_is_bvec(i))
                return iov_iter_aligned_bvec(i, addr_mask, len_mask);

        /* For an xarray the "address" is the current position. */
        if (iov_iter_is_xarray(i))
                return !(i->count & len_mask) &&
                       !((i->xarray_start + i->iov_offset) & addr_mask);

        return true;
}
EXPORT_SYMBOL_GPL(iov_iter_is_aligned);

/*
 * OR together the start address and (count-clamped) length of every
 * non-empty segment of the iovec-layout iterator @i that is covered by
 * i->count.  The first segment is taken from i->iov_offset onwards.
 */
static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
        const struct iovec *seg = iter_iov(i);
        size_t offset = i->iov_offset;
        size_t left = i->count;
        unsigned long bits = 0;

        do {
                size_t avail = seg->iov_len - offset;

                if (avail) {
                        size_t chunk = min(avail, left);

                        bits |= (unsigned long)seg->iov_base + offset;
                        bits |= chunk;
                        left -= chunk;
                }
                offset = 0;
                seg++;
        } while (left);

        return bits;
}

/*
 * OR together the starting offset (bv_offset plus skipped bytes) and the
 * count-clamped length of every segment of the ITER_BVEC iterator @i that is
 * covered by i->count.  The first segment is taken from i->iov_offset
 * onwards.
 */
static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
        const struct bio_vec *seg = i->bvec;
        unsigned offset = i->iov_offset;
        size_t left = i->count;
        unsigned bits = 0;

        do {
                size_t chunk = seg->bv_len - offset;

                if (chunk > left)
                        chunk = left;

                bits |= (unsigned long)seg->bv_offset + offset;
                bits |= chunk;

                left -= chunk;
                offset = 0;
                seg++;
        } while (left);

        return bits;
}

/*
 * Return the bitwise OR of the start addresses (or positions) and lengths
 * of everything @i still covers, so that callers can test it against an
 * alignment mask.  Returns 0 for an empty ubuf and for iterator types that
 * are not handled explicitly.
 */
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
        if (likely(iter_is_ubuf(i))) {
                size_t left = i->count;

                return left ? ((unsigned long)i->ubuf + i->iov_offset) | left
                            : 0;
        }

        /* iovec and kvec have identical layouts */
        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_iter_alignment_iovec(i);

        if (iov_iter_is_bvec(i))
                return iov_iter_alignment_bvec(i);

        if (iov_iter_is_xarray(i))
                return (i->xarray_start + i->iov_offset) | i->count;

        return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);

/*
 * Return the bitwise OR of every boundary between consecutive non-empty
 * segments of @i: the end address of one segment and the start address of
 * the next.  A ubuf has no such boundaries, so 0 is returned for it; any
 * iterator type other than ubuf or iovec warns and returns ~0U.
 */
unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
        unsigned long gaps = 0;
        unsigned long prev_end = 0;
        size_t left = i->count;
        unsigned k;

        if (iter_is_ubuf(i))
                return 0;

        if (WARN_ON(!iter_is_iovec(i)))
                return ~0U;

        for (k = 0; k < i->nr_segs; k++) {
                const struct iovec *seg = iter_iov(i) + k;
                unsigned long base;

                if (!seg->iov_len)
                        continue;

                base = (unsigned long)seg->iov_base;
                /* Nothing to pair the very first segment's start with. */
                if (prev_end)
                        gaps |= base | prev_end;
                prev_end = base + seg->iov_len;

                /* Stop at the segment in which the byte count runs out. */
                if (left <= seg->iov_len)
                        break;
                left -= seg->iov_len;
        }
        return gaps;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

/*
 * Work out how many page pointers are needed to cover @size bytes starting
 * @start bytes into the first page, capped at @maxpages, and make sure *@res
 * points at an array to hold them: a caller-supplied array is used as is,
 * otherwise one is allocated with kvmalloc_array().
 *
 * Returns the page count, or 0 if the allocation fails.
 */
static int want_pages_array(struct page ***res, size_t size,
                            size_t start, unsigned int maxpages)
{
        unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE);

        count = min(count, maxpages);
        WARN_ON(!count);        // caller should've prevented that

        if (*res)
                return count;

        *res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
        return *res ? count : 0;
}

/*
 * Fill @pages with up to @nr_pages page pointers taken from consecutive
 * entries of the xarray @xa, starting at @index.  A reference is taken on
 * every page stored.  The walk runs under rcu_read_lock() and stops at the
 * first empty entry or once @nr_pages pages have been collected.
 *
 * Returns the number of pages stored in @pages.
 */
static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
                                          pgoff_t index, unsigned int nr_pages)
{
        XA_STATE(xas, xa, index);
        struct page *page;
        unsigned int ret = 0;

        rcu_read_lock();
        for (page = xas_load(&xas); page; page = xas_next(&xas)) {
                /* Skip entries that xas_retry() asks us to retry. */
                if (xas_retry(&xas, page))
                        continue;

                /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas))) {
                        xas_reset(&xas);
                        continue;
                }

                /* Store the subpage for this index and take a reference on it. */
                pages[ret] = find_subpage(page, xas.xa_index);
                get_page(pages[ret]);
                if (++ret == nr_pages)
                        break;
        }
        rcu_read_unlock();
        return ret;
}

/*
 * Collect pages backing the next part of the ITER_XARRAY iterator @i.
 *
 * The offset of the current position within its page is stored in
 * *@_start_offset.  Up to @maxpages page pointers covering at most @maxsize
 * bytes are placed in *@pages (allocated by want_pages_array() when NULL),
 * and the iterator is advanced past the bytes they cover.
 *
 * Returns the number of bytes covered, 0 if no page was found, or -ENOMEM
 * if want_pages_array() reports no usable array.
 */
static ssize_t iter_xarray_get_pages(struct iov_iter *i,
                                     struct page ***pages, size_t maxsize,
                                     unsigned maxpages, size_t *_start_offset)
{
        loff_t pos = i->xarray_start + i->iov_offset;
        pgoff_t index = pos >> PAGE_SHIFT;
        unsigned offset = pos & ~PAGE_MASK;
        unsigned count, nr;

        *_start_offset = offset;

        count = want_pages_array(pages, maxsize, offset, maxpages);
        if (!count)
                return -ENOMEM;

        nr = iter_xarray_populate_pages(*pages, i->xarray, index, count);
        if (!nr)
                return 0;

        /* Never report more than the pages actually obtained can hold. */
        maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
        i->count -= maxsize;
        i->iov_offset += maxsize;
        return maxsize;
}

/*
 * Return the user address at which the first non-empty segment of @i starts
 * (taking i->iov_offset into account) and clamp *@size to what is left of
 * that segment.  For ITER_UBUF the address is returned and *@size is left
 * untouched.
 *
 * Must be done on a non-empty ITER_UBUF or ITER_IOVEC iterator; running out
 * of segments is a fatal error.
 */
static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
{
        size_t skip;
        long k;

        if (iter_is_ubuf(i))
                return (unsigned long)i->ubuf + i->iov_offset;

        skip = i->iov_offset;
        for (k = 0; k < i->nr_segs; k++) {
                const struct iovec *seg = iter_iov(i) + k;
                size_t avail = seg->iov_len - skip;

                if (likely(avail)) {
                        *size = min(*size, avail);
                        return (unsigned long)seg->iov_base + skip;
                }
                /* Only the first segment is entered part-way through. */
                skip = 0;
        }
        BUG(); // if it had been empty, we wouldn't get called
}

/*
 * Locate the current position of the ITER_BVEC iterator @i: clamp *@size to
 * what is left of the current segment, store the offset within the page in
 * *@start and return the page that contains the position.
 *
 * Must be done on a non-empty ITER_BVEC iterator.
 */
static struct page *first_bvec_segment(const struct iov_iter *i,
                                       size_t *size, size_t *start)
{
        const struct bio_vec *seg = i->bvec;
        size_t pos = i->iov_offset;
        size_t avail = seg->bv_len - pos;

        if (*size > avail)
                *size = avail;

        /* Position relative to the segment's first page. */
        pos += seg->bv_offset;
        *start = pos % PAGE_SIZE;
        return seg->bv_page + pos / PAGE_SIZE;
}

/*
 * Common worker for iov_iter_get_pages2() and iov_iter_get_pages_alloc2().
 *
 * Obtains references to the pages backing the next part of @i, stores them
 * in *@pages (allocated by want_pages_array() when *@pages is NULL), stores
 * the offset into the first page in *@start and advances the iterator past
 * the bytes covered.  @maxsize is clamped to the iterator's count and to
 * MAX_RW_COUNT; at most @maxpages pages are used.
 *
 * Returns the number of bytes covered, 0 when there is nothing to do,
 * -ENOMEM when no page array is available, a non-positive
 * get_user_pages_fast() result, or -EFAULT for iterator types that are not
 * supported here.
 */
static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
                   struct page ***pages, size_t maxsize,
                   unsigned int maxpages, size_t *start)
{
        unsigned int n, gup_flags = 0;

        if (maxsize > i->count)
                maxsize = i->count;
        if (!maxsize)
                return 0;
        if (maxsize > MAX_RW_COUNT)
                maxsize = MAX_RW_COUNT;

        if (likely(user_backed_iter(i))) {
                unsigned long addr;
                int res;

                /* Not a WRITE iterator: request writable pages from GUP. */
                if (iov_iter_rw(i) != WRITE)
                        gup_flags |= FOLL_WRITE;
                if (i->nofault)
                        gup_flags |= FOLL_NOFAULT;

                /* Split the start address into page base and in-page offset. */
                addr = first_iovec_segment(i, &maxsize);
                *start = addr % PAGE_SIZE;
                addr &= PAGE_MASK;
                n = want_pages_array(pages, maxsize, *start, maxpages);
                if (!n)
                        return -ENOMEM;
                res = get_user_pages_fast(addr, n, gup_flags, *pages);
                if (unlikely(res <= 0))
                        return res;
                /* Fewer pages than requested may have been obtained. */
                maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
                iov_iter_advance(i, maxsize);
                return maxsize;
        }
        if (iov_iter_is_bvec(i)) {
                struct page **p;
                struct page *page;

                page = first_bvec_segment(i, &maxsize, start);
                n = want_pages_array(pages, maxsize, *start, maxpages);
                if (!n)
                        return -ENOMEM;
                p = *pages;
                /* Take a reference on each consecutive page of the segment. */
                for (int k = 0; k < n; k++)
                        get_page(p[k] = page + k);
                maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
                i->count -= maxsize;
                i->iov_offset += maxsize;
                /* Current segment fully consumed: step to the next one. */
                if (i->iov_offset == i->bvec->bv_len) {
                        i->iov_offset = 0;
                        i->bvec++;
                        i->nr_segs--;
                }
                return maxsize;
        }
        if (iov_iter_is_xarray(i))
                return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
        return -EFAULT;
}

/*
 * Get references to the pages backing the next part of @i into the
 * caller-supplied array @pages, which has room for @maxpages entries, and
 * advance the iterator.  The offset into the first page is stored in
 * *@start.  A zero @maxpages yields 0; a NULL @pages is a fatal error.
 *
 * Returns the result of __iov_iter_get_pages_alloc().
 */
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
                size_t maxsize, unsigned maxpages, size_t *start)
{
        struct page **array = pages;

        if (!maxpages)
                return 0;
        BUG_ON(!pages);

        return __iov_iter_get_pages_alloc(i, &array, maxsize, maxpages, start);
}
EXPORT_SYMBOL(iov_iter_get_pages2);

/*
 * Like iov_iter_get_pages2(), but the page array is allocated on the
 * caller's behalf and returned through *@pages, with no limit on the number
 * of pages other than @maxsize.
 *
 * Returns the result of __iov_iter_get_pages_alloc().  When that is zero or
 * negative the array is freed again and *@pages is left NULL.
 */
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
                struct page ***pages, size_t maxsize, size_t *start)
{
        ssize_t len;

        *pages = NULL;

        len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start);
        if (len > 0)
                return len;

        /* Nothing was obtained: do not hand an array back to the caller. */
        kvfree(*pages);
        *pages = NULL;
        return len;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc2);

/*
 * Count the pages touched by the part of the iovec-layout iterator @i that
 * i->count covers, segment by segment.  As soon as the running total
 * exceeds @maxpages, @maxpages is returned.
 */
static int iov_npages(const struct iov_iter *i, int maxpages)
{
        const struct iovec *seg = iter_iov(i);
        size_t offset = i->iov_offset;
        size_t left = i->count;
        int total = 0;

        while (left) {
                size_t chunk = min(seg->iov_len - offset, left);

                if (chunk) {
                        unsigned in_page = offset_in_page(seg->iov_base + offset);

                        left -= chunk;
                        total += DIV_ROUND_UP(in_page + chunk, PAGE_SIZE);
                        if (unlikely(total > maxpages))
                                return maxpages;
                }
                /* Only the first segment is entered part-way through. */
                offset = 0;
                seg++;
        }
        return total;
}

/*
 * Count the pages touched by the part of the ITER_BVEC iterator @i that
 * i->count covers, segment by segment.  As soon as the running total
 * exceeds @maxpages, @maxpages is returned.
 */
static int bvec_npages(const struct iov_iter *i, int maxpages)
{
        const struct bio_vec *seg = i->bvec;
        size_t offset = i->iov_offset;
        size_t left = i->count;
        int total = 0;

        while (left) {
                unsigned in_page = (seg->bv_offset + offset) % PAGE_SIZE;
                size_t chunk = min(seg->bv_len - offset, left);

                left -= chunk;
                total += DIV_ROUND_UP(in_page + chunk, PAGE_SIZE);
                if (unlikely(total > maxpages))
                        return maxpages;

                /* Only the first segment is entered part-way through. */
                offset = 0;
                seg++;
        }
        return total;
}

/*
 * Return the number of pages spanned by what is left in @i, capped at
 * @maxpages.  An empty iterator, or one of a type not handled here,
 * yields 0.
 */
int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
        if (unlikely(!i->count))
                return 0;

        if (likely(iter_is_ubuf(i))) {
                unsigned in_page = offset_in_page(i->ubuf + i->iov_offset);
                int total = DIV_ROUND_UP(in_page + i->count, PAGE_SIZE);

                return min(total, maxpages);
        }

        /* iovec and kvec have identical layouts */
        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_npages(i, maxpages);

        if (iov_iter_is_bvec(i))
                return bvec_npages(i, maxpages);

        if (iov_iter_is_xarray(i)) {
                unsigned in_page = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
                int total = DIV_ROUND_UP(in_page + i->count, PAGE_SIZE);

                return min(total, maxpages);
        }

        return 0;
}
EXPORT_SYMBOL(iov_iter_npages);

/*
 * Copy iterator @old into @new and, for bvec, kvec and iovec iterators,
 * give @new its own kmemdup()'ed copy of the segment array, allocated with
 * @flags.
 *
 * Returns the duplicated array (NULL if the allocation failed), or NULL for
 * iterator types that carry no segment array.
 */
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
        *new = *old;

        if (iov_iter_is_bvec(new)) {
                new->bvec = kmemdup(new->bvec,
                                    new->nr_segs * sizeof(struct bio_vec),
                                    flags);
                return new->bvec;
        }

        if (iov_iter_is_kvec(new) || iter_is_iovec(new)) {
                /* iovec and kvec have identical layout */
                new->__iov = kmemdup(new->__iov,
                                     new->nr_segs * sizeof(struct iovec),
                                     flags);
                return new->__iov;
        }

        return NULL;
}
EXPORT_SYMBOL(dup_iter);

/*
 * Read @nr_segs compat-layout iovecs from the user array @uvec and store them
 * in @iov as native struct iovec entries.
 *
 * Returns 0 on success, -EINVAL if an entry's length is negative when read as
 * compat_ssize_t, and -EFAULT if user_access_begin() refuses the range.  The
 * unsafe_get_user() calls are handed uaccess_end as their bail-out label, at
 * which point ret still holds its initial -EFAULT.
 */
static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
                const struct iovec __user *uvec, u32 nr_segs)
{
        /* The user array is really made of compat_iovec entries. */
        const struct compat_iovec __user *uiov =
                (const struct compat_iovec __user *)uvec;
        int ret = -EFAULT;
        u32 i;

        if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
                return -EFAULT;

        for (i = 0; i < nr_segs; i++) {
                compat_uptr_t buf;
                compat_ssize_t len;

                unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
                unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

                /* check for compat_size_t not fitting in compat_ssize_t .. */
                if (len < 0) {
                        ret = -EINVAL;
                        goto uaccess_end;
                }
                /* Widen the compat pointer and length into the native entry. */
                iov[i].iov_base = compat_ptr(buf);
                iov[i].iov_len = len;
        }

        ret = 0;
uaccess_end:
        /* Every path that passed user_access_begin() leaves through here. */
        user_access_end();
        return ret;
}

/*
 * Read @nr_segs native iovecs from the user array @uiov into @iov.
 *
 * Returns 0 on success, -EINVAL if an entry's length is negative when read as
 * ssize_t, and -EFAULT if user_access_begin() refuses the range.  The
 * unsafe_get_user() calls are handed uaccess_end as their bail-out label, at
 * which point ret still holds its initial -EFAULT.
 *
 * The do/while loop always processes at least one entry, so @nr_segs must be
 * non-zero; the callers in this file pass either 1 or a count already checked
 * against zero.
 */
static __noclone int copy_iovec_from_user(struct iovec *iov,
                const struct iovec __user *uiov, unsigned long nr_segs)
{
        int ret = -EFAULT;

        if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
                return -EFAULT;

        do {
                void __user *buf;
                ssize_t len;

                unsafe_get_user(len, &uiov->iov_len, uaccess_end);
                unsafe_get_user(buf, &uiov->iov_base, uaccess_end);

                /* check for size_t not fitting in ssize_t .. */
                if (unlikely(len < 0)) {
                        ret = -EINVAL;
                        goto uaccess_end;
                }
                iov->iov_base = buf;
                iov->iov_len = len;

                /* Advance the source and destination cursors together. */
                uiov++; iov++;
        } while (--nr_segs);

        ret = 0;
uaccess_end:
        /* Every path that passed user_access_begin() leaves through here. */
        user_access_end();
        return ret;
}

/*
 * Fetch @nr_segs iovecs from the user array @uvec, decoding the compat layout
 * when @compat is set.
 *
 * @fast_iov is the destination when it has room (@nr_segs <= @fast_segs);
 * otherwise an array is obtained with kmalloc_array().  Returns the array that
 * was filled in, or an ERR_PTR(): -EINVAL when @nr_segs exceeds UIO_MAXIOV,
 * -ENOMEM when the allocation fails, or the error from the copy helper.  An
 * array allocated here is freed again before an error is returned.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvec,
                unsigned long nr_segs, unsigned long fast_segs,
                struct iovec *fast_iov, bool compat)
{
        struct iovec *vec = fast_iov;
        int err;

        /*
         * SuS says "The readv() function *may* fail if the iovcnt argument was
         * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
         * traditionally returned zero for zero segments, so the caller's
         * array is handed back untouched in that case.
         */
        if (!nr_segs)
                return vec;
        if (nr_segs > UIO_MAXIOV)
                return ERR_PTR(-EINVAL);

        if (nr_segs > fast_segs) {
                vec = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
                if (!vec)
                        return ERR_PTR(-ENOMEM);
        }

        if (unlikely(compat))
                err = copy_compat_iovec_from_user(vec, uvec, nr_segs);
        else
                err = copy_iovec_from_user(vec, uvec, nr_segs);
        if (!err)
                return vec;

        /* Only release what was allocated above, never the caller's array. */
        if (vec != fast_iov)
                kfree(vec);
        return ERR_PTR(err);
}

/*
 * Single segment iovec supplied by the user, import it as ITER_UBUF.
 *
 * The array in *@iovp is only borrowed as scratch space for the one iovec
 * read from @uvec; nothing is ever allocated here.  *@iovp is therefore
 * cleared before anything can fail, so that it is NULL on every return path
 * and a caller following import_iovec()'s documented contract ("may call
 * kfree() on the result regardless of error") never frees its own array.
 *
 * Returns the byte count of the initialised iterator on success, or the
 * negative error from the copy helper or import_ubuf().
 */
static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
                                   struct iovec **iovp, struct iov_iter *i,
                                   bool compat)
{
        struct iovec *iov = *iovp;
        ssize_t ret;

        /* Nothing is allocated on this path, so there is never anything to free. */
        *iovp = NULL;

        if (compat)
                ret = copy_compat_iovec_from_user(iov, uvec, 1);
        else
                ret = copy_iovec_from_user(iov, uvec, 1);
        if (unlikely(ret))
                return ret;

        ret = import_ubuf(type, iov->iov_base, iov->iov_len, i);
        if (unlikely(ret))
                return ret;
        return i->count;
}

/*
 * Import the user iovec array @uvec into the iterator @i.
 *
 * A single segment is delegated to __import_iovec_ubuf().  Otherwise the
 * segments are fetched with iovec_from_user(), each one is checked with
 * access_ok(), and the running total is clamped to MAX_RW_COUNT by shortening
 * segments.  On return *@iovp is NULL unless a new array had to be allocated,
 * in which case it points at that array.
 *
 * Returns the total number of bytes imported or a negative error code.
 */
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i, bool compat)
{
        struct iovec *vec;
        ssize_t total = 0;
        unsigned long n;

        if (nr_segs == 1)
                return __import_iovec_ubuf(type, uvec, iovp, i, compat);

        vec = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
        if (IS_ERR(vec)) {
                *iovp = NULL;
                return PTR_ERR(vec);
        }

        /*
         * According to the Single Unix Specification we should return EINVAL if
         * an element length is < 0 when cast to ssize_t or if the total length
         * would overflow the ssize_t return value of the system call.
         *
         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
         * overflow case.
         */
        for (n = 0; n < nr_segs; n++) {
                ssize_t len = (ssize_t)vec[n].iov_len;

                if (!access_ok(vec[n].iov_base, len))
                        goto efault;

                /* Shorten this segment if it would push us past the cap. */
                if (len > MAX_RW_COUNT - total) {
                        len = MAX_RW_COUNT - total;
                        vec[n].iov_len = len;
                }
                total += len;
        }

        iov_iter_init(i, type, vec, nr_segs, total);
        /* Report an array back only if it is one the caller has to free. */
        *iovp = (vec == *iovp) ? NULL : vec;
        return total;

efault:
        if (vec != *iovp)
                kfree(vec);
        *iovp = NULL;
        return -EFAULT;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in *@iovp.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs,
                 struct iovec **iovp, struct iov_iter *i)
{
        /* The compat iovec layout is selected by in_compat_syscall(). */
        return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
                              in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);

/*
 * Initialise @i as a user-buffer iterator over @buf in direction @rw.
 *
 * @len is clamped to MAX_RW_COUNT first; the clamped range is then checked
 * with access_ok().  Returns 0 on success or -EFAULT if that check fails.
 */
int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{
        len = min_t(size_t, len, MAX_RW_COUNT);

        if (unlikely(!access_ok(buf, len)))
                return -EFAULT;

        iov_iter_ubuf(i, rw, buf, len);
        return 0;
}
EXPORT_SYMBOL_GPL(import_ubuf);

/**
 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
 *     iov_iter_save_state() was called.
 *
 * @i: &struct iov_iter to restore
 * @state: state to restore from
 *
 * Used after iov_iter_save_state() to restore @i, if operations may
 * have advanced it.
 *
 * Note: only works on ITER_IOVEC, ITER_BVEC, ITER_KVEC and ITER_UBUF; any
 * other iterator type triggers a one-time warning and is left untouched.
 */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
        /*
         * All four supported types must be tested inside the WARN so that a
         * kvec iterator neither warns nor bails out.
         */
        if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
                         !iter_is_ubuf(i) && !iov_iter_is_kvec(i)))
                return;
        i->iov_offset = state->iov_offset;
        i->count = state->count;
        /* A ubuf iterator has no segment array to rewind. */
        if (iter_is_ubuf(i))
                return;
        /*
         * For the *vec iters, nr_segs + iov is constant - if we increment
         * the vec, then we also decrement the nr_segs count. Hence we don't
         * need to track both of these, just one is enough and we can deduce
         * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
         * size, so we can just move the iov pointer as they are unionized.
         * ITER_BVEC _may_ be the same size on some archs, but on others it is
         * not. Be safe and handle it separately.
         */
        BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
        if (iov_iter_is_bvec(i))
                i->bvec -= state->nr_segs - i->nr_segs;
        else
                i->__iov -= state->nr_segs - i->nr_segs;
        i->nr_segs = state->nr_segs;
}

/*
 * Extract a list of contiguous pages from an ITER_XARRAY iterator.  This does not
 * get references on the pages, nor does it get a pin on them.
 *
 * Fills (*@pages) with up to @maxpages entries starting at the iterator's
 * current position, stores the offset into the first page in *@offset0,
 * advances @i by the amount extracted and returns that amount, or -ENOMEM if
 * want_pages_array() yields no usable page list.  @extraction_flags is not
 * consulted here.
 */
static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
                                             struct page ***pages, size_t maxsize,
                                             unsigned int maxpages,
                                             iov_iter_extraction_t extraction_flags,
                                             size_t *offset0)
{
        struct page *page, **p;
        unsigned int nr = 0, offset;
        /* Absolute position of the iterator, and the index of its page. */
        loff_t pos = i->xarray_start + i->iov_offset;
        pgoff_t index = pos >> PAGE_SHIFT;
        XA_STATE(xas, i->xarray, index);

        /* Offset of the current position within its page. */
        offset = pos & ~PAGE_MASK;
        *offset0 = offset;

        maxpages = want_pages_array(pages, maxsize, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;
        p = *pages;

        /* The walk stops at the first empty slot or once the list is full. */
        rcu_read_lock();
        for (page = xas_load(&xas); page; page = xas_next(&xas)) {
                if (xas_retry(&xas, page))
                        continue;

                /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas))) {
                        xas_reset(&xas);
                        continue;
                }

                p[nr++] = find_subpage(page, xas.xa_index);
                if (nr == maxpages)
                        break;
        }
        rcu_read_unlock();

        /*
         * Limit the result to what the collected pages cover.
         * NOTE(review): if no page was collected (nr == 0) while offset is
         * non-zero, "nr * PAGE_SIZE - offset" wraps and maxsize is left
         * unchanged - confirm callers only pass populated ranges.
         */
        maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
        iov_iter_advance(i, maxsize);
        return maxsize;
}

/*
 * List the physically contiguous pages behind the current segment of an
 * ITER_BVEC iterator.  No references and no pins are taken on the pages.
 *
 * At most @maxsize bytes and @maxpages pages are covered, all from a single
 * bio_vec.  The offset into the first page is stored in *@offset0, the
 * iterator is advanced by the amount covered and that amount is returned.
 * Returns 0 when no segment with data is left and -ENOMEM when
 * want_pages_array() yields no usable page list.  @extraction_flags is not
 * consulted here.
 */
static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
                                           struct page ***pages, size_t maxsize,
                                           unsigned int maxpages,
                                           iov_iter_extraction_t extraction_flags,
                                           size_t *offset0)
{
        struct page **list, *first;
        size_t skip = i->iov_offset, offset, size = 0;
        int idx;

        /* Step past segments that have nothing left to give. */
        while (i->nr_segs) {
                size = min(maxsize, i->bvec->bv_len - skip);
                if (size)
                        break;
                i->iov_offset = 0;
                i->nr_segs--;
                i->bvec++;
                skip = 0;
        }
        if (!size)
                return 0;

        /* Locate the first page and the offset within it. */
        skip += i->bvec->bv_offset;
        first = i->bvec->bv_page + skip / PAGE_SIZE;
        offset = skip % PAGE_SIZE;
        *offset0 = offset;

        maxpages = want_pages_array(pages, size, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;

        list = *pages;
        for (idx = 0; idx < maxpages; idx++)
                list[idx] = first + idx;

        /* The page list may cover less than was asked for. */
        size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
        iov_iter_advance(i, size);
        return size;
}

/*
 * Extract a list of virtually contiguous pages from an ITER_KVEC iterator.
 * This does not get references on the pages, nor does it get a pin on them.
 *
 * At most @maxsize bytes and @maxpages pages are covered, all from a single
 * kvec.  The offset into the first page is stored in *@offset0, the iterator
 * is advanced by the amount covered and that amount is returned.  Returns 0
 * when no segment with data is left and -ENOMEM when want_pages_array()
 * yields no usable page list.  @extraction_flags is not consulted here.
 */
static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i,
                                           struct page ***pages, size_t maxsize,
                                           unsigned int maxpages,
                                           iov_iter_extraction_t extraction_flags,
                                           size_t *offset0)
{
        struct page **p;
        const void *kaddr;
        size_t skip = i->iov_offset, offset, size;
        unsigned int k;

        /* Step past segments that have nothing left to give. */
        for (;;) {
                if (i->nr_segs == 0)
                        return 0;
                size = min(maxsize, i->kvec->iov_len - skip);
                if (size)
                        break;
                i->iov_offset = 0;
                i->nr_segs--;
                i->kvec++;
                skip = 0;
        }

        kaddr = i->kvec->iov_base + skip;
        offset = (unsigned long)kaddr & ~PAGE_MASK;
        *offset0 = offset;

        maxpages = want_pages_array(pages, size, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;
        p = *pages;

        /*
         * Walk one page at a time from the start of the page holding kaddr;
         * each address is translated according to the mapping it lives in.
         */
        kaddr -= offset;
        for (k = 0; k < maxpages; k++, kaddr += PAGE_SIZE) {
                if (is_vmalloc_or_module_addr(kaddr))
                        p[k] = vmalloc_to_page(kaddr);
                else
                        p[k] = virt_to_page(kaddr);
        }

        /* The page list may cover less than was asked for. */
        size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
        iov_iter_advance(i, size);
        return size;
}

/*
 * Extract a list of contiguous pages from a user iterator and get a pin on
 * each of them.  This should only be used if the iterator is user-backed
 * (ITER_IOVEC/ITER_UBUF).
 *
 * It does not get refs on the pages, but the pages must be unpinned by the
 * caller once the transfer is complete.
 *
 * This is safe to be used where background IO/DMA *is* going to be modifying
 * the buffer; using a pin rather than a ref forces fork() to give the child
 * a copy of the page.
 *
 * The offset into the first page is stored in *@offset0.  Returns the number
 * of bytes covered by the pinned pages (the iterator is advanced by as much),
 * -ENOMEM when want_pages_array() yields no usable page list, or the
 * non-positive result of pin_user_pages_fast().
 */
static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
                                           struct page ***pages,
                                           size_t maxsize,
                                           unsigned int maxpages,
                                           iov_iter_extraction_t extraction_flags,
                                           size_t *offset0)
{
        unsigned int gup_flags = 0;
        unsigned long addr;
        size_t offset;
        int pinned;

        /* Translate iterator and caller settings into GUP flags. */
        if (i->nofault)
                gup_flags |= FOLL_NOFAULT;
        if (extraction_flags & ITER_ALLOW_P2PDMA)
                gup_flags |= FOLL_PCI_P2PDMA;
        if (i->data_source == ITER_DEST)
                gup_flags |= FOLL_WRITE;

        /* Split the start address into page base and in-page offset. */
        addr = first_iovec_segment(i, &maxsize);
        offset = addr % PAGE_SIZE;
        *offset0 = offset;
        addr &= PAGE_MASK;

        maxpages = want_pages_array(pages, maxsize, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;

        pinned = pin_user_pages_fast(addr, maxpages, gup_flags, *pages);
        if (unlikely(pinned <= 0))
                return pinned;

        /* Fewer pages than requested may have been pinned. */
        maxsize = min_t(size_t, maxsize, pinned * PAGE_SIZE - offset);
        iov_iter_advance(i, maxsize);
        return maxsize;
}

/**
 * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
 * @i: The iterator to extract from
 * @pages: Where to return the list of pages
 * @maxsize: The maximum amount of iterator to extract
 * @maxpages: The maximum size of the list of pages
 * @extraction_flags: Flags to qualify request
 * @offset0: Where to return the starting offset into (*@pages)[0]
 *
 * Extract a list of contiguous pages from the current point of the iterator,
 * advancing the iterator.  The maximum number of pages and the maximum amount
 * of page contents can be set.
 *
 * If *@pages is NULL, a page list will be allocated to the required size and
 * *@pages will be set to its base.  If *@pages is not NULL, it will be assumed
 * that the caller allocated a page list at least @maxpages in size and this
 * will be filled in.
 *
 * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
 * be allowed on the pages extracted.
 *
 * The iov_iter_extract_will_pin() function can be used to query how cleanup
 * should be performed.
 *
 * Extra refs or pins on the pages may be obtained as follows:
 *
 *  (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
 *      added to the pages, but refs will not be taken.
 *      iov_iter_extract_will_pin() will return true.
 *
 *  (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are
 *      merely listed; no extra refs or pins are obtained.
 *      iov_iter_extract_will_pin() will return 0.
 *
 * Note also:
 *
 *  (*) Use with ITER_DISCARD is not supported as that has no content.
 *
 * On success, the function sets *@pages to the new pagelist, if allocated, and
 * sets *offset0 to the offset into the first page.
 *
 * It may also return -ENOMEM and -EFAULT.
 */
ssize_t iov_iter_extract_pages(struct iov_iter *i,
                               struct page ***pages,
                               size_t maxsize,
                               unsigned int maxpages,
                               iov_iter_extraction_t extraction_flags,
                               size_t *offset0)
{
        /* Never ask for more than the iterator holds, nor for more than MAX_RW_COUNT. */
        maxsize = min_t(size_t, maxsize, i->count);
        maxsize = min_t(size_t, maxsize, MAX_RW_COUNT);
        if (maxsize == 0)
                return 0;

        /* Hand off to the extractor that matches the iterator's backing store. */
        if (likely(user_backed_iter(i)))
                return iov_iter_extract_user_pages(i, pages, maxsize,
                                                   maxpages, extraction_flags,
                                                   offset0);

        if (iov_iter_is_kvec(i))
                return iov_iter_extract_kvec_pages(i, pages, maxsize,
                                                   maxpages, extraction_flags,
                                                   offset0);

        if (iov_iter_is_bvec(i))
                return iov_iter_extract_bvec_pages(i, pages, maxsize,
                                                   maxpages, extraction_flags,
                                                   offset0);

        if (iov_iter_is_xarray(i))
                return iov_iter_extract_xarray_pages(i, pages, maxsize,
                                                     maxpages, extraction_flags,
                                                     offset0);

        /* No extractor exists for any other iterator type. */
        return -EFAULT;
}
EXPORT_SYMBOL_GPL(iov_iter_extract_pages);





















    4 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  include/linux/signalfd.h
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */
#ifndef _LINUX_SIGNALFD_H
#define _LINUX_SIGNALFD_H

#include <uapi/linux/signalfd.h>
#include <linux/sched/signal.h>

#ifdef CONFIG_SIGNALFD

/*
 * Wake whatever is waiting on @tsk's signalfd wait queue.  The wake-up is
 * skipped when waitqueue_active() reports that the queue has no waiters.
 * @sig is not used by this helper.
 */
static inline void signalfd_notify(struct task_struct *tsk, int sig)
{
        /* Common case: nobody is waiting, nothing to do. */
        if (!unlikely(waitqueue_active(&tsk->sighand->signalfd_wqh)))
                return;

        wake_up(&tsk->sighand->signalfd_wqh);
}

extern void signalfd_cleanup(struct sighand_struct *sighand);

#else /* CONFIG_SIGNALFD */

/* !CONFIG_SIGNALFD stub: there is nothing to notify, so this is a no-op. */
static inline void signalfd_notify(struct task_struct *tsk, int sig) { }

/* !CONFIG_SIGNALFD stub: there is nothing to clean up, so this is a no-op. */
static inline void signalfd_cleanup(struct sighand_struct *sighand) { }

#endif /* CONFIG_SIGNALFD */

#endif /* _LINUX_SIGNALFD_H */














































    2 

























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/* SPDX-License-Identifier: GPL-2.0 */
/*
  File: linux/posix_acl.h

  (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/


#ifndef __LINUX_POSIX_ACL_H
#define __LINUX_POSIX_ACL_H

#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <uapi/linux/posix_acl.h>

struct user_namespace;

/*
 * One entry of a POSIX ACL, as stored in struct posix_acl's a_entries[].
 */
struct posix_acl_entry {
        /* Entry type tag - presumably selects which union member below is meaningful; verify against users. */
        short                        e_tag;
        /* Presumably the permission bits granted by this entry - TODO confirm. */
        unsigned short                e_perm;
        /* Qualifier of the entry: a kernel uid or a kernel gid sharing one slot. */
        union {
                kuid_t                e_uid;
                kgid_t                e_gid;
        };
};

/*
 * A reference-counted ACL: a header followed by a_count entries.
 */
struct posix_acl {
        /* Taken by posix_acl_dup(), dropped by posix_acl_release(). */
        refcount_t                a_refcount;
        /* Handed to kfree_rcu() by posix_acl_release() on the final put. */
        struct rcu_head                a_rcu;
        /* Number of entries in a_entries[]; FOREACH_ACL_ENTRY() stops there. */
        unsigned int                a_count;
        /* Flexible array holding the entries themselves. */
        struct posix_acl_entry        a_entries[];
};

/*
 * Loop header that walks every entry of @acl: @pa is the cursor and @pe is set
 * to one past the last entry.  Both must be assignable entry pointers supplied
 * by the caller.  @acl is evaluated twice, so it should be free of side
 * effects.
 */
#define FOREACH_ACL_ENTRY(pa, acl, pe) \
        for(pa=(acl)->a_entries, pe=pa+(acl)->a_count; pa<pe; pa++)


/*
 * Take an extra reference on @acl and hand the same pointer back.
 * A NULL @acl is passed through untouched.
 */
static inline struct posix_acl *
posix_acl_dup(struct posix_acl *acl)
{
        if (!acl)
                return acl;

        refcount_inc(&acl->a_refcount);
        return acl;
}

/*
 * Drop one reference on @acl.  When the count reaches zero the ACL is
 * released through kfree_rcu().  A NULL @acl is ignored.
 */
static inline void
posix_acl_release(struct posix_acl *acl)
{
        if (!acl)
                return;

        if (refcount_dec_and_test(&acl->a_refcount))
                kfree_rcu(acl, a_rcu);
}


/* posix_acl.c */

extern void posix_acl_init(struct posix_acl *, int);
extern struct posix_acl *posix_acl_alloc(int, gfp_t);
extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t);

extern struct posix_acl *get_posix_acl(struct inode *, int);
int set_posix_acl(struct mnt_idmap *, struct dentry *, int,
                  struct posix_acl *);

struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags);

#ifdef CONFIG_FS_POSIX_ACL
int posix_acl_chmod(struct mnt_idmap *, struct dentry *, umode_t);
extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
                struct posix_acl **);
int posix_acl_update_mode(struct mnt_idmap *, struct inode *, umode_t *,
                          struct posix_acl **);

int simple_set_acl(struct mnt_idmap *, struct dentry *,
                   struct posix_acl *, int);
extern int simple_acl_create(struct inode *, struct inode *);

struct posix_acl *get_cached_acl(struct inode *inode, int type);
void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
void forget_cached_acl(struct inode *inode, int type);
void forget_all_cached_acls(struct inode *inode);
int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
int posix_acl_permission(struct mnt_idmap *, struct inode *,
                         const struct posix_acl *, int);

/*
 * Set both ACL pointers of @inode (i_acl and i_default_acl) to NULL.
 * Going by the name, this records "no ACLs" in the inode's ACL cache -
 * confirm against the cache helpers.
 */
static inline void cache_no_acl(struct inode *inode)
{
        inode->i_default_acl = NULL;
        inode->i_acl = NULL;
}

int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                const char *acl_name, struct posix_acl *kacl);
struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap,
                              struct dentry *dentry, const char *acl_name);
int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                   const char *acl_name);
int posix_acl_listxattr(struct inode *inode, char **buffer,
                        ssize_t *remaining_size);
#else
/* !CONFIG_FS_POSIX_ACL stub: does nothing and reports success (0). */
static inline int posix_acl_chmod(struct mnt_idmap *idmap,
                                  struct dentry *dentry, umode_t mode)
{
        return 0;
}

/* !CONFIG_FS_POSIX_ACL: the helper does not exist, so the name expands to NULL. */
#define simple_set_acl                NULL

/* !CONFIG_FS_POSIX_ACL stub: does nothing and reports success (0). */
static inline int simple_acl_create(struct inode *dir, struct inode *inode)
{
        return 0;
}
/* !CONFIG_FS_POSIX_ACL stub: no-op. */
static inline void cache_no_acl(struct inode *inode)
{
}

/*
 * !CONFIG_FS_POSIX_ACL stub: stores NULL through both @default_acl and @acl
 * and returns 0.  @inode and @mode are left alone.
 */
static inline int posix_acl_create(struct inode *inode, umode_t *mode,
                struct posix_acl **default_acl, struct posix_acl **acl)
{
        *acl = NULL;
        *default_acl = NULL;
        return 0;
}

/* !CONFIG_FS_POSIX_ACL stub: no-op. */
static inline void forget_all_cached_acls(struct inode *inode)
{
}

/* !CONFIG_FS_POSIX_ACL stub: setting an ACL always fails with -EOPNOTSUPP. */
static inline int vfs_set_acl(struct mnt_idmap *idmap,
                              struct dentry *dentry, const char *name,
                              struct posix_acl *acl)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_POSIX_ACL stub: always returns ERR_PTR(-EOPNOTSUPP). */
static inline struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap,
                                            struct dentry *dentry,
                                            const char *acl_name)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* !CONFIG_FS_POSIX_ACL stub: removing an ACL always fails with -EOPNOTSUPP. */
static inline int vfs_remove_acl(struct mnt_idmap *idmap,
                                 struct dentry *dentry, const char *acl_name)
{
        return -EOPNOTSUPP;
}
/*
 * !CONFIG_FS_POSIX_ACL stub: returns 0 without touching @buffer or
 * @remaining_size.
 */
static inline int posix_acl_listxattr(struct inode *inode, char **buffer,
                                      ssize_t *remaining_size)
{
        return 0;
}
#endif /* CONFIG_FS_POSIX_ACL */

struct posix_acl *get_inode_acl(struct inode *inode, int type);

#endif  /* __LINUX_POSIX_ACL_H */



































    1 















    1 

    1 




    1 
    1 












    1 



    1 














    1 



































    1 





























    1 














    1 



    1 







    1 









































    2 
    2 









    1 




    1 







    1 



    1 


    1 
















































































































































































































































    1 
    1 



















































































































































































































































































































































































































    2 















    1 

    2 








    2 





    2 


















    1 







    2 




    1 




    2 


















    2 










    2 


    1 
    1 
































    2 






















































    2 




















    2 





























    2 















    2 



























































































































































    1 














































    2 











    2 




























    2 













    2 





































































































    2 







    2 









    2 
































    2 







    2 






    1 
    1 

    1 























    2 








    1 


































    1 










    2 

    2 































































































































































    1 








    1 
























    1 







    1 



































































    1 





    1 







    1 



























    3 










    2 









    2 

























































































    3 








    3 






    3 






















    2 





    1 



    3 




















































































































































































    3 











    1 
    3 

    3 


















































































































































































































    1 




    1 

























    1 













    1 

























    1 





































































































































































































    1 
















    1 





















    1 


















    1 

































































    1 


    1 











    1 


    1 

















    1 

















    1 


    1 




    1 

    1 
















































































































































































































































































    1 





























    1 


























    1 




























    1 



































    1 














    1 
    1 
    1 










    1 







    1 












    1 


    1 










    1 


















    1 


























































































    1 


    1 





























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/secretmem.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>

#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

/*
 * Small bundle of state shared by the page-lookup helpers in this file.
 * NOTE(review): the users of these fields are outside the part of the file
 * reviewed here; the per-field notes are inferred from the names and types
 * only and should be confirmed against the callers.
 */
struct follow_page_context {
        /* presumably a cached device pagemap for ZONE_DEVICE lookups -- TODO confirm */
        struct dev_pagemap *pgmap;
        /* presumably (number of pages covered by the last lookup) - 1 -- TODO confirm */
        unsigned int page_mask;
};

/*
 * Debug-only consistency check for an array of pinned pages.
 *
 * Does nothing unless CONFIG_DEBUG_VM is enabled. Otherwise every anonymous,
 * non-zero page in @pages is checked for the PageAnonExclusive() marker and
 * VM_BUG_ON_PAGE() fires if it is missing.
 */
static inline void sanity_check_pinned_pages(struct page **pages,
                                             unsigned long npages)
{
        unsigned long idx;

        if (!IS_ENABLED(CONFIG_DEBUG_VM))
                return;

        /*
         * Anonymous pages are only ever pinned while exclusive, and once
         * pinned they cannot become possibly-shared again, so
         * PageAnonExclusive() must stay set until the page is freed.
         *
         * For anon THP we do not know how the folio was mapped at pin time:
         * a PTE-mapped THP carries the marker on the given page, a PMD-mapped
         * THP carries it on the head page. Accept either; if neither has it,
         * something is definitely wrong.
         */
        for (idx = 0; idx < npages; idx++) {
                struct page *page = pages[idx];
                struct folio *folio = page_folio(page);

                if (is_zero_page(page))
                        continue;
                if (!folio_test_anon(folio))
                        continue;

                if (folio_test_large(folio) && !folio_test_hugetlb(folio)) {
                        /* Either a PTE-mapped or a PMD-mapped THP. */
                        VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
                                       !PageAnonExclusive(page), page);
                } else {
                        /* Small folio or hugetlb: only the head page counts. */
                        VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
                }
        }
}

/*
 * Take @refs references on the folio that currently contains @page.
 *
 * Returns that folio on success. Returns NULL if the folio's refcount is
 * negative (with a one-time warning) or if folio_ref_try_add_rcu() fails.
 */
static inline struct folio *try_get_folio(struct page *page, int refs)
{
        for (;;) {
                struct folio *folio = page_folio(page);

                if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
                        return NULL;
                if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
                        return NULL;

                /*
                 * The reference is now stable, but the folio may have been
                 * split between the page_folio() lookup above and the
                 * refcount increment. In that case the references we hold
                 * are on a folio that no longer has anything to do with
                 * @page: drop them and start over with a fresh lookup.
                 */
                if (unlikely(page_folio(page) != folio)) {
                        if (!put_devmap_managed_folio_refs(folio, refs))
                                folio_put_refs(folio, refs);
                        continue;
                }

                return folio;
        }
}

/**
 * try_grab_folio() - Attempt to get or pin a folio.
 * @page:  pointer to page to be grabbed
 * @refs:  the value to (effectively) add to the folio's refcount
 * @flags: gup flags: these are the FOLL_* flag values.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount".
 *
 * Either FOLL_PIN or FOLL_GET must be set, but not both at the same time.
 * (That's true throughout the get_user_pages*() and pin_user_pages*() APIs.)
 * Cases:
 *
 *    FOLL_GET: folio's refcount will be incremented by @refs.
 *
 *    FOLL_PIN on large folios: folio's refcount will be incremented by
 *    @refs, and its pincount will be incremented by @refs.
 *
 *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
 *    @refs * GUP_PIN_COUNTING_BIAS.
 *
 *    FOLL_PIN on the zero page: the folio is returned with no refcount or
 *    pincount change.
 *
 * Return: The folio containing @page (with refcount appropriately
 * incremented) for success, or NULL upon failure. If neither FOLL_GET
 * nor FOLL_PIN was set, that's considered failure, and furthermore,
 * a likely bug in the caller, so a warning is also emitted. NULL is also
 * returned for a PCI P2PDMA page when FOLL_PCI_P2PDMA is not set, and for
 * FOLL_PIN with FOLL_LONGTERM when the folio is not long-term pinnable (any
 * references taken are dropped again first).
 */
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
{
        struct folio *folio;

        /* A caller that asked for neither get nor pin is buggy. */
        if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
                return NULL;

        /* P2PDMA pages are only handed out to callers that opted in. */
        if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
                return NULL;

        if (flags & FOLL_GET)
                return try_get_folio(page, refs);

        /* FOLL_PIN is set */

        /*
         * Don't take a pin on the zero page - it's not going anywhere
         * and it is used in a *lot* of places.
         */
        if (is_zero_page(page))
                return page_folio(page);

        folio = try_get_folio(page, refs);
        if (!folio)
                return NULL;

        /*
         * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
         * right zone, so fail and let the caller fall back to the slow
         * path.
         */
        if (unlikely((flags & FOLL_LONGTERM) &&
                     !folio_is_longterm_pinnable(folio))) {
                /* Undo the references taken by try_get_folio() above. */
                if (!put_devmap_managed_folio_refs(folio, refs))
                        folio_put_refs(folio, refs);
                return NULL;
        }

        /*
         * When pinning a large folio, use an exact count to track it.
         *
         * However, be sure to *also* increment the normal folio
         * refcount field at least once, so that the folio really
         * is pinned.  That's why the refcount from the earlier
         * try_get_folio() is left intact.
         *
         * For a single-page folio, try_get_folio() already added @refs, so
         * adding @refs * (GUP_PIN_COUNTING_BIAS - 1) here brings the total
         * to @refs * GUP_PIN_COUNTING_BIAS.
         */
        if (folio_test_large(folio))
                atomic_add(refs, &folio->_pincount);
        else
                folio_ref_add(folio,
                                refs * (GUP_PIN_COUNTING_BIAS - 1));
        /*
         * Adjust the pincount before re-checking the PTE for changes.
         * This is essentially a smp_mb() and is paired with a memory
         * barrier in folio_try_share_anon_rmap_*().
         */
        smp_mb__after_atomic();

        /* Account the newly acquired pins in the node statistics. */
        node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);

        return folio;
}

/*
 * Release @refs references (FOLL_GET) or @refs pins (FOLL_PIN) on @folio.
 *
 * For FOLL_PIN the zero folio is left alone. Otherwise the release is
 * accounted in NR_FOLL_PIN_RELEASED, a large folio has its pincount reduced
 * by @refs, and a single-page folio has the number of references to drop
 * scaled by GUP_PIN_COUNTING_BIAS.
 */
static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
        int nr_put = refs;

        if (flags & FOLL_PIN) {
                if (is_zero_folio(folio))
                        return;

                node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);

                if (folio_test_large(folio))
                        atomic_sub(refs, &folio->_pincount);
                else
                        nr_put = refs * GUP_PIN_COUNTING_BIAS;
        }

        /* Devmap-managed folios have their own put path. */
        if (put_devmap_managed_folio_refs(folio, nr_put))
                return;

        folio_put_refs(folio, nr_put);
}

/**
 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
 * @page:    pointer to page to be grabbed
 * @flags:   gup flags: these are the FOLL_* flag values.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount".
 *
 * FOLL_GET and FOLL_PIN must not both be set. When neither is set, the
 * refcount is left untouched. For the individual cases please see the
 * try_grab_folio() documentation, with "refs=1".
 *
 * Return: 0 for success, or if no action was required (neither FOLL_PIN
 * nor FOLL_GET was set, or FOLL_PIN was requested on the zero page).
 * A negative error code for failure:
 *
 *   -ENOMEM     the folio's refcount was not positive, so the page could
 *               not be grabbed (a warning is emitted once).
 *   -EREMOTEIO  @page is a PCI P2PDMA page and FOLL_PCI_P2PDMA was not set.
 */
int __must_check try_grab_page(struct page *page, unsigned int flags)
{
        struct folio *folio = page_folio(page);

        if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
                return -ENOMEM;

        if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
                return -EREMOTEIO;

        if (flags & FOLL_GET) {
                folio_ref_inc(folio);
                return 0;
        }

        /* Neither get nor pin requested: nothing to do. */
        if (!(flags & FOLL_PIN))
                return 0;

        /*
         * The zero page is never pinned - it's not going anywhere and it
         * is used in a *lot* of places.
         */
        if (is_zero_page(page))
                return 0;

        /*
         * As in try_grab_folio(): the normal refcount must *also* go up at
         * least once, so that the page really is pinned.
         */
        if (folio_test_large(folio)) {
                folio_ref_add(folio, 1);
                atomic_add(1, &folio->_pincount);
        } else {
                folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
        }

        node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
        return 0;
}

/**
 * unpin_user_page() - release a dma-pinned page
 * @page:            pointer to page to be released
 *
 * Drops a single FOLL_PIN pin on the folio containing @page.
 *
 * Pages pinned through pin_user_pages*() must be released with this function
 * or with one of the unpin_user_pages*() routines, so that such pages can be
 * tracked separately and handled uniquely. In particular, interactions with
 * RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
        struct folio *folio;

        sanity_check_pinned_pages(&page, 1);

        folio = page_folio(page);
        gup_put_folio(folio, 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);

/**
 * folio_add_pin - Get an additional pin on an already pinned folio
 * @folio: The folio to be pinned
 *
 * Takes one more pin on a folio the caller already holds a pin on. A warning
 * is emitted once if the folio does not look pinned. The zero folio is left
 * unchanged.
 */
void folio_add_pin(struct folio *folio)
{
        if (is_zero_folio(folio))
                return;

        /*
         * As in try_grab_folio(): the normal refcount must *also* go up at
         * least once, so that the folio really is pinned.
         */
        if (!folio_test_large(folio)) {
                /* Single-page folio: pins live in the biased refcount. */
                WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
                folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
                return;
        }

        /* Large folio: one reference plus one exact pincount. */
        WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
        folio_ref_inc(folio);
        atomic_inc(&folio->_pincount);
}

/*
 * Find the folio holding page number @i of the physically consecutive range
 * that begins at @start and spans @npages pages.
 *
 * *@ntails is set to how many pages of the range, counted from index @i,
 * belong to that folio: 1 for a small folio, otherwise the smaller of the
 * pages left in the range and the pages left in the folio.
 */
static inline struct folio *gup_folio_range_next(struct page *start,
                unsigned long npages, unsigned long i, unsigned int *ntails)
{
        struct page *cur = nth_page(start, i);
        struct folio *folio = page_folio(cur);

        if (!folio_test_large(folio)) {
                *ntails = 1;
                return folio;
        }

        *ntails = min_t(unsigned int, npages - i,
                        folio_nr_pages(folio) - folio_page_idx(folio, cur));
        return folio;
}

/*
 * Find the folio holding @list[@i] and count how many consecutive entries of
 * @list, starting at index @i, map to that same folio. The count (at least
 * 1) is stored in *@ntails.
 */
static inline struct folio *gup_folio_next(struct page **list,
                unsigned long npages, unsigned long i, unsigned int *ntails)
{
        struct folio *folio = page_folio(list[i]);
        unsigned int nr = i + 1;

        /* Extend the run while the following entries share the folio. */
        while (nr < npages && page_folio(list[nr]) == folio)
                nr++;

        *ntails = nr - i;
        return folio;
}

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * When @make_dirty is false this is simply unpin_user_pages().
 *
 * Otherwise, entries of @pages are processed in runs that share a folio. Each
 * folio that is not already dirty is locked, marked dirty and unlocked, and
 * then the pins for the whole run are released.
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * The folio is dirtied under the folio lock. If dirtying without the lock is
 * required instead, the caller should a) verify that this is really correct,
 * because the locked variant is usually required, and b) hand code the
 * dirtying followed by unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                                 bool make_dirty)
{
        unsigned long idx = 0;
        unsigned int batch;

        if (!make_dirty) {
                unpin_user_pages(pages, npages);
                return;
        }

        sanity_check_pinned_pages(pages, npages);

        while (idx < npages) {
                struct folio *folio;

                folio = gup_folio_next(pages, npages, idx, &batch);

                /*
                 * Testing the dirty flag here may race with
                 * clear_page_dirty_for_io(), but that's OK. Two key
                 * cases:
                 *
                 * 1) We observe the folio as already dirty and skip
                 * dirtying it. That can happen because
                 * clear_page_dirty_for_io() called page_mkclean(),
                 * followed by set_page_dirty(). The folio is about to
                 * be written back anyway, which is all that dirtying
                 * it was meant to achieve: clear_page_dirty_for_io()
                 * goes on to call TestClearPageDirty() and writes the
                 * page back.
                 *
                 * 2) We observe the folio as clean and dirty it. It
                 * stays dirty despite being written back, so it is
                 * written back once more in the next writeback cycle.
                 * This is harmless.
                 */
                if (!folio_test_dirty(folio)) {
                        folio_lock(folio);
                        folio_mark_dirty(folio);
                        folio_unlock(folio);
                }

                gup_put_folio(folio, batch, FOLL_PIN);
                idx += batch;
        }
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);

/**
 * unpin_user_page_range_dirty_lock() - release and optionally dirty
 * gup-pinned page range
 *
 * @page:  the starting page of a range maybe marked dirty, and definitely released.
 * @npages: number of consecutive pages to release.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page range" refers to a range of pages that has had one of the
 * pin_user_pages() variants called on it.
 *
 * The @npages consecutive pages starting at @page are processed folio by
 * folio. If @make_dirty is true, each folio that is not already dirty is
 * locked, marked dirty and unlocked. In every case the pins on the pages
 * are then released.
 *
 * The folio is dirtied under the folio lock. If dirtying without the lock is
 * required instead, the caller should a) verify that this is really correct,
 * because the locked variant is usually required, and b) hand code the
 * dirtying followed by unpin_user_page().
 *
 */
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
                                      bool make_dirty)
{
        unsigned long idx = 0;
        unsigned int batch;

        while (idx < npages) {
                struct folio *folio;

                folio = gup_folio_range_next(page, npages, idx, &batch);

                if (make_dirty && !folio_test_dirty(folio)) {
                        folio_lock(folio);
                        folio_mark_dirty(folio);
                        folio_unlock(folio);
                }

                gup_put_folio(folio, batch, FOLL_PIN);
                idx += batch;
        }
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);

/*
 * Release the FOLL_PIN pins on every page in @pages, one same-folio run at a
 * time, without running sanity_check_pinned_pages().
 */
static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages)
{
        unsigned long idx = 0;
        unsigned int batch;

        /*
         * No sanity checks on purpose: we may have raced with fork(), so
         * some anonymous pages might by now really be shared -- which is
         * exactly why they are being unpinned.
         */
        while (idx < npages) {
                struct folio *folio;

                folio = gup_folio_next(pages, npages, idx, &batch);
                gup_put_folio(folio, batch, FOLL_PIN);
                idx += batch;
        }
}

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be released.
 * @npages: number of pages in the @pages array.
 *
 * Releases the FOLL_PIN pin on each page in the @pages array. Entries that
 * share a folio are released together. No page is marked dirty.
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
        unsigned long idx = 0;
        unsigned int batch;

        /*
         * If this WARN_ON() fires, the system *might* be leaking pages (by
         * leaving them pinned), but probably is not. The more likely story
         * is that gup/pup returned a hard -ERRNO error and the caller
         * erroneously passed that value here as the page count.
         */
        if (WARN_ON(IS_ERR_VALUE(npages)))
                return;

        sanity_check_pinned_pages(pages, npages);

        while (idx < npages) {
                struct folio *folio;

                folio = gup_folio_next(pages, npages, idx, &batch);
                gup_put_folio(folio, batch, FOLL_PIN);
                idx += batch;
        }
}
EXPORT_SYMBOL(unpin_user_pages);

/*
 * Make sure MMF_HAS_PINNED is set in @mm_flags.  Once set, the flag stays for
 * the lifetime of the mm.  The bit is tested first and only written when it
 * is still clear: writing unconditionally might cause write cache bouncing on
 * large SMP machines when many gups pin concurrently.
 */
static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
{
        /* Nothing to do if the flag is already there. */
        if (test_bit(MMF_HAS_PINNED, mm_flags))
                return;

        set_bit(MMF_HAS_PINNED, mm_flags);
}

#ifdef CONFIG_MMU

#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_GUP_FAST)
/*
 * Store into @pages the consecutive subpages backing the addresses from @addr
 * up to (not including) @end, one entry per PAGE_SIZE step.  @page is the
 * first page of a mapping of size @sz (used as a mask, so presumably a power
 * of two).  Returns the number of entries written.
 *
 * The walk stops only when @addr equals @end exactly, so the caller has to
 * pass a range that is a whole number of pages.
 */
static int record_subpages(struct page *page, unsigned long sz,
                           unsigned long addr, unsigned long end,
                           struct page **pages)
{
        /* Subpage matching @addr's offset inside the @sz-sized mapping. */
        struct page *first = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
        int count = 0;

        while (addr != end) {
                pages[count] = nth_page(first, count);
                count++;
                addr += PAGE_SIZE;
        }

        return count;
}
#endif        /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_GUP_FAST */

#ifdef CONFIG_ARCH_HAS_HUGEPD
/*
 * Return the end of the current step when walking [@addr, @end) in units of
 * @sz: the next @sz-aligned boundary above @addr, or @end if that comes
 * first.  Both sides of the comparison are decremented by one so that a
 * value that wrapped around to 0 compares as the largest possible address.
 */
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
                                      unsigned long sz)
{
        unsigned long boundary = (addr + sz) & ~(sz - 1);

        if (boundary - 1 < end - 1)
                return boundary;

        return end;
}

/*
 * Grab the pages mapped by one huge PTE for the part of [@addr, @end) that
 * the PTE covers, storing them in @pages starting at index *@nr.
 *
 * Returns 1 if succeeded, 0 if failed, -EMLINK if unshare needed.
 * *@nr is advanced by the number of pages stored only on success.
 *
 * NOTE: for the same entry, gup-fast and gup-slow can return different
 * results (0 v.s. -EMLINK) depending on whether vma is available.  This is
 * the expected behavior, where we simply want gup-fast to fallback to
 * gup-slow to take the vma reference first.
 */
static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz,
                       unsigned long addr, unsigned long end, unsigned int flags,
                       struct page **pages, int *nr)
{
        unsigned long pte_end;
        struct page *page;
        struct folio *folio;
        pte_t pte;
        int refs;

        /* Clamp @end to the end of the @sz-aligned region containing @addr. */
        pte_end = (addr + sz) & ~(sz-1);
        if (pte_end < end)
                end = pte_end;

        pte = huge_ptep_get(ptep);

        if (!pte_access_permitted(pte, flags & FOLL_WRITE))
                return 0;

        /* hugepages are never "special" */
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

        /*
         * Fill in the page pointers first; *nr itself is only bumped at the
         * very end, so a failure below leaves the caller's count untouched.
         */
        page = pte_page(pte);
        refs = record_subpages(page, sz, addr, end, pages + *nr);

        /* One reference per recorded subpage, taken on the folio in one go. */
        folio = try_grab_folio(page, refs, flags);
        if (!folio)
                return 0;

        /*
         * Re-read the entry now that the references are held: if it no longer
         * matches the value sampled above, back out and report failure.
         */
        if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
                gup_put_folio(folio, refs, flags);
                return 0;
        }

        /* A non-writable entry may need unsharing before it can be used. */
        if (!pte_write(pte) && gup_must_unshare(vma, flags, &folio->page)) {
                gup_put_folio(folio, refs, flags);
                return -EMLINK;
        }

        *nr += refs;
        folio_set_referenced(folio);
        return 1;
}

/*
 * Walk the huge PTEs of @hugepd that cover [@addr, @end) and grab their pages
 * through gup_hugepte().  Returns 1 when every step succeeded; otherwise the
 * first non-1 result of gup_hugepte() is passed straight back.
 *
 * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file
 * systems on Power, which does not have issue with folio writeback against
 * GUP updates.  Once hugepd is extended to non-hugetlbfs or even anonymous
 * memory, the extra checks applied to most other folios will be needed here
 * as well.  See writable_file_mapping_allowed() and
 * gup_fast_folio_allowed() for more information.
 */
static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
                      unsigned long addr, unsigned int pdshift,
                      unsigned long end, unsigned int flags,
                      struct page **pages, int *nr)
{
        unsigned long sz = 1UL << hugepd_shift(hugepd);
        unsigned long next;
        pte_t *ptep;
        int ret;

        ptep = hugepte_offset(hugepd, addr, pdshift);

        for (;;) {
                next = hugepte_addr_end(addr, end, sz);

                ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr);
                if (ret != 1)
                        return ret;

                /* Step to the next huge PTE; done once @end is reached. */
                ptep++;
                addr = next;
                if (addr == end)
                        break;
        }

        return 1;
}

/*
 * Slow-path lookup of the single page at @addr inside a hugepd.  Returns the
 * page on success (with @ctx->page_mask set from the hstate's page order),
 * NULL if gup_hugepd() reported 0, or an ERR_PTR() for a negative result.
 */
static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
                                  unsigned long addr, unsigned int pdshift,
                                  unsigned int flags,
                                  struct follow_page_context *ctx)
{
        struct hstate *h;
        struct page *page;
        spinlock_t *ptl;
        pte_t *ptep;
        int grabbed = 0;
        int ret;

        /* Only hugetlb supports hugepd */
        if (WARN_ON_ONCE(!is_vm_hugetlb_page(vma)))
                return ERR_PTR(-EFAULT);

        h = hstate_vma(vma);
        ptep = hugepte_offset(hugepd, addr, pdshift);

        /* Ask for exactly one PAGE_SIZE worth while holding the huge PTE lock. */
        ptl = huge_pte_lock(h, vma->vm_mm, ptep);
        ret = gup_hugepd(vma, hugepd, addr, pdshift, addr + PAGE_SIZE,
                         flags, &page, &grabbed);
        spin_unlock(ptl);

        /* ret can be either 0 (translates to NULL) or negative */
        if (ret != 1)
                return ERR_PTR(ret);

        /* GUP succeeded: exactly one page must have been recorded. */
        WARN_ON_ONCE(grabbed != 1);
        ctx->page_mask = (1U << huge_page_order(h)) - 1;
        return page;
}
#else /* CONFIG_ARCH_HAS_HUGEPD */
/*
 * Stub for builds without CONFIG_ARCH_HAS_HUGEPD: there is nothing to walk,
 * so no argument is used and 0 is always returned.
 */
static inline int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
                             unsigned long addr, unsigned int pdshift,
                             unsigned long end, unsigned int flags,
                             struct page **pages, int *nr)
{
        return 0;
}

/*
 * Stub for builds without CONFIG_ARCH_HAS_HUGEPD: no argument is used and
 * NULL is always returned.
 */
static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
                                  unsigned long addr, unsigned int pdshift,
                                  unsigned int flags,
                                  struct follow_page_context *ctx)
{
        return NULL;
}
#endif /* CONFIG_ARCH_HAS_HUGEPD */


/*
 * Result to hand back when the walk found nothing mapped at @address.
 *
 * Normally that is NULL.  Under FOLL_DUMP it can instead be ERR_PTR(-EFAULT):
 * when core dumping, we don't want to allocate unnecessary pages or page
 * tables, so an error is returned to skip handle_mm_fault() and
 * get_dump_page() will then return NULL to leave a hole in the dump.  This
 * shortcut is only taken where a hole would surely be zero-filled if
 * handle_mm_fault() actually did handle it.
 */
static struct page *no_page_table(struct vm_area_struct *vma,
                                  unsigned int flags, unsigned long address)
{
        if (!(flags & FOLL_DUMP))
                return NULL;

        /* hugetlb: a hole is only safe if nothing is in the page cache. */
        if (is_vm_hugetlb_page(vma)) {
                if (hugetlbfs_pagecache_present(hstate_vma(vma), vma, address))
                        return NULL;
                return ERR_PTR(-EFAULT);
        }

        /* Anonymous memory, or a mapping without a ->fault handler. */
        if (vma_is_anonymous(vma) || !vma->vm_ops->fault)
                return ERR_PTR(-EFAULT);

        return NULL;
}

#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
/*
 * Look up the page for @addr behind the huge PUD entry at @pudp.  The caller
 * must hold the PUD lock (this is asserted).
 *
 * Returns the page with a reference taken as requested by @flags and
 * @ctx->page_mask set to HPAGE_PUD_NR - 1, NULL if the entry is not present
 * or not writable for a FOLL_WRITE lookup, or an ERR_PTR() otherwise.
 */
static struct page *follow_huge_pud(struct vm_area_struct *vma,
                                    unsigned long addr, pud_t *pudp,
                                    int flags, struct follow_page_context *ctx)
{
        pud_t pud = *pudp;
        unsigned long pfn = pud_pfn(pud);
        struct page *page;
        int ret;

        assert_spin_locked(pud_lockptr(vma->vm_mm, pudp));

        if (((flags & FOLL_WRITE) && !pud_write(pud)) || !pud_present(pud))
                return NULL;

        /* Offset of @addr inside the PUD-sized mapping, in pages. */
        pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;

        if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
            pud_devmap(pud)) {
                /*
                 * Device mapped pages can only be handed out when the caller
                 * manages the page reference count, i.e. at least one of
                 * FOLL_GET | FOLL_PIN must be set.
                 */
                if (!(flags & (FOLL_GET | FOLL_PIN)))
                        return ERR_PTR(-EEXIST);

                if (flags & FOLL_TOUCH)
                        touch_pud(vma, addr, pudp, flags & FOLL_WRITE);

                ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
                if (!ctx->pgmap)
                        return ERR_PTR(-EFAULT);
        }

        page = pfn_to_page(pfn);

        /* Read-only, non-devmap entries may have to be unshared first. */
        if (!pud_devmap(pud) && !pud_write(pud) &&
            gup_must_unshare(vma, flags, page))
                return ERR_PTR(-EMLINK);

        ret = try_grab_page(page, flags);
        if (ret)
                return ERR_PTR(ret);

        ctx->page_mask = HPAGE_PUD_NR - 1;
        return page;
}

/*
 * May a FOLL_WRITE lookup use @pmd as it stands?  A writable PMD always
 * qualifies; FOLL_FORCE can additionally write to unwritable PMDs in COW
 * mappings, subject to the checks below.
 */
static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
                                        struct vm_area_struct *vma,
                                        unsigned int flags)
{
        /* A writable pmd needs no further justification. */
        if (pmd_write(pmd))
                return true;

        /* Without FOLL_FORCE a read-only pmd is never good enough. */
        if (!(flags & FOLL_FORCE))
                return false;

        /*
         * FOLL_FORCE has no effect on shared mappings, on read-only private
         * ones, or on already writable ones that just need to take a write
         * fault.
         */
        if ((vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) ||
            !(vma->vm_flags & VM_MAYWRITE) ||
            (vma->vm_flags & VM_WRITE))
                return false;

        /*
         * See can_change_pte_writable(): we broke COW and could map the page
         * writable if we have an exclusive anonymous page ...
         */
        if (!(page && PageAnon(page) && PageAnonExclusive(page)))
                return false;

        /* ... and a write-fault isn't required for other reasons. */
        if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
                return false;

        return !userfaultfd_huge_pmd_wp(vma, pmd);
}

/*
 * Look up the page for @addr behind the huge PMD entry at @pmd.  The caller
 * must hold the PMD lock (this is asserted).
 *
 * Returns the subpage for @addr with a reference taken as requested by
 * @flags and @ctx->page_mask set to HPAGE_PMD_NR - 1; NULL when the entry
 * cannot be followed with @flags (write not allowed, or a protnone entry that
 * must not be followed); or an ERR_PTR():
 *   -EFAULT  huge zero page hit while FOLL_DUMP is set;
 *   -EMLINK  gup_must_unshare() requires unsharing first;
 *   other    error returned by try_grab_page().
 */
static struct page *follow_huge_pmd(struct vm_area_struct *vma,
                                    unsigned long addr, pmd_t *pmd,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t pmdval = *pmd;
        struct page *page;
        int ret;

        assert_spin_locked(pmd_lockptr(mm, pmd));

        page = pmd_page(pmdval);
        if ((flags & FOLL_WRITE) &&
            !can_follow_write_pmd(pmdval, page, vma, flags))
                return NULL;

        /* Avoid dumping huge zero page */
        if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmdval))
                return ERR_PTR(-EFAULT);

        /*
         * Test the snapshot taken above instead of dereferencing @pmd a
         * second time, so that every decision in this function is based on
         * one and the same entry value.
         */
        if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
                return NULL;

        if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page))
                return ERR_PTR(-EMLINK);

        VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
                        !PageAnonExclusive(page), page);

        ret = try_grab_page(page, flags);
        if (ret)
                return ERR_PTR(ret);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(pmdval) && (flags & FOLL_TOUCH))
                touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
#endif        /* CONFIG_TRANSPARENT_HUGEPAGE */

        /* Step from the head page to the subpage that maps @addr. */
        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
        ctx->page_mask = HPAGE_PMD_NR - 1;

        return page;
}

#else  /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
/*
 * Stub for builds without CONFIG_PGTABLE_HAS_HUGE_LEAVES: no argument is used
 * and NULL is always returned.
 */
static struct page *follow_huge_pud(struct vm_area_struct *vma,
                                    unsigned long addr, pud_t *pudp,
                                    int flags, struct follow_page_context *ctx)
{
        return NULL;
}

/*
 * Stub for builds without CONFIG_PGTABLE_HAS_HUGE_LEAVES: no argument is used
 * and NULL is always returned.
 */
static struct page *follow_huge_pmd(struct vm_area_struct *vma,
                                    unsigned long addr, pmd_t *pmd,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        return NULL;
}
#endif        /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */

/*
 * Handle a PTE that has no struct page behind it.  With FOLL_TOUCH the entry
 * is marked young (and dirty as well for FOLL_WRITE) and written back if that
 * changed it.  Always returns -EEXIST.
 */
static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
                pte_t *pte, unsigned int flags)
{
        pte_t old, updated;

        /* Proper page table entry exists, but no corresponding struct page */
        if (!(flags & FOLL_TOUCH))
                return -EEXIST;

        old = ptep_get(pte);
        updated = (flags & FOLL_WRITE) ? pte_mkdirty(old) : old;
        updated = pte_mkyoung(updated);

        /* Only touch the page table when the bits really changed. */
        if (!pte_same(old, updated)) {
                set_pte_at(vma->vm_mm, address, pte, updated);
                update_mmu_cache(vma, address, pte);
        }

        return -EEXIST;
}

/*
 * May a FOLL_WRITE lookup use @pte as it stands?  A writable PTE always
 * qualifies; FOLL_FORCE can additionally write to unwritable PTEs in COW
 * mappings, subject to the checks below.
 */
static inline bool can_follow_write_pte(pte_t pte, struct page *page,
                                        struct vm_area_struct *vma,
                                        unsigned int flags)
{
        /* A writable pte needs no further justification. */
        if (pte_write(pte))
                return true;

        /* Without FOLL_FORCE a read-only pte is never good enough. */
        if (!(flags & FOLL_FORCE))
                return false;

        /*
         * FOLL_FORCE has no effect on shared mappings, on read-only private
         * ones, or on already writable ones that just need to take a write
         * fault.
         */
        if ((vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) ||
            !(vma->vm_flags & VM_MAYWRITE) ||
            (vma->vm_flags & VM_WRITE))
                return false;

        /*
         * See can_change_pte_writable(): we broke COW and could map the page
         * writable if we have an exclusive anonymous page ...
         */
        if (!(page && PageAnon(page) && PageAnonExclusive(page)))
                return false;

        /* ... and a write-fault isn't required for other reasons. */
        if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
                return false;

        return !userfaultfd_pte_wp(vma, pte);
}

/*
 * Look up the page mapped at @address by the PTE found under @pmd.
 *
 * The PTE is mapped and locked for the duration of the lookup.  Returns the
 * page (after try_grab_page() has been applied with @flags), NULL when the
 * entry cannot be followed with @flags, or an ERR_PTR():
 *   -EINVAL  both FOLL_GET and FOLL_PIN were passed;
 *   -EFAULT  a page-less mapping was hit under FOLL_DUMP (or
 *            no_page_table() decided so);
 *   -EMLINK  gup_must_unshare() requires unsharing first;
 *   other    whatever follow_pfn_pte(), try_grab_page() or
 *            arch_make_page_accessible() returned.
 *
 * @pgmap caches the dev_pagemap reference obtained for devmap PTEs.
 */
static struct page *follow_page_pte(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd, unsigned int flags,
                struct dev_pagemap **pgmap)
{
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
        spinlock_t *ptl;
        pte_t *ptep, pte;
        int ret;

        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
        if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
                         (FOLL_PIN | FOLL_GET)))
                return ERR_PTR(-EINVAL);

        /* No PTE table to map: treat it like a missing page table. */
        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!ptep)
                return no_page_table(vma, flags, address);
        pte = ptep_get(ptep);
        if (!pte_present(pte))
                goto no_page;
        if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags))
                goto no_page;

        page = vm_normal_page(vma, address, pte);

        /*
         * We only care about anon pages in can_follow_write_pte() and don't
         * have to worry about pte_devmap() because they are never anon.
         */
        if ((flags & FOLL_WRITE) &&
            !can_follow_write_pte(pte, page, vma, flags)) {
                page = NULL;
                goto out;
        }

        if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
                /*
                 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
                 * case since they are only valid while holding the pgmap
                 * reference.
                 */
                *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
                if (*pgmap)
                        page = pte_page(pte);
                else
                        goto no_page;
        } else if (unlikely(!page)) {
                if (flags & FOLL_DUMP) {
                        /* Avoid special (like zero) pages in core dumps */
                        page = ERR_PTR(-EFAULT);
                        goto out;
                }

                /* The zero page is usable; anything else has no struct page. */
                if (is_zero_pfn(pte_pfn(pte))) {
                        page = pte_page(pte);
                } else {
                        ret = follow_pfn_pte(vma, address, ptep, flags);
                        page = ERR_PTR(ret);
                        goto out;
                }
        }

        /* A read-only entry may have to be unshared before it can be used. */
        if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
                page = ERR_PTR(-EMLINK);
                goto out;
        }

        VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
                       !PageAnonExclusive(page), page);

        /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
        ret = try_grab_page(page, flags);
        if (unlikely(ret)) {
                page = ERR_PTR(ret);
                goto out;
        }

        /*
         * We need to make the page accessible if and only if we are going
         * to access its content (the FOLL_PIN case).  Please see
         * Documentation/core-api/pin_user_pages.rst for details.
         */
        if (flags & FOLL_PIN) {
                ret = arch_make_page_accessible(page);
                if (ret) {
                        /* Undo the pin taken by try_grab_page() above. */
                        unpin_user_page(page);
                        page = ERR_PTR(ret);
                        goto out;
                }
        }
        if (flags & FOLL_TOUCH) {
                if ((flags & FOLL_WRITE) &&
                    !pte_dirty(pte) && !PageDirty(page))
                        set_page_dirty(page);
                /*
                 * pte_mkyoung() would be more correct here, but atomic care
                 * is needed to avoid losing the dirty bit: it is easier to use
                 * mark_page_accessed().
                 */
                mark_page_accessed(page);
        }
out:
        /* Common exit: @page is the final result (page, NULL or ERR_PTR). */
        pte_unmap_unlock(ptep, ptl);
        return page;
no_page:
        /*
         * Nothing followable here.  A non-empty entry yields NULL; an empty
         * one is handled like a missing page table.
         */
        pte_unmap_unlock(ptep, ptl);
        if (!pte_none(pte))
                return NULL;
        return no_page_table(vma, flags, address);
}

/*
 * PMD level of the follow_page_mask() walk for @address.
 *
 * The entry is first sampled locklessly to pick a path: missing entries go
 * to no_page_table(), hugepd entries to follow_hugepd(), devmap entries to
 * follow_devmap_pmd() (under the PMD lock) and non-leaf entries to
 * follow_page_pte().  For a leaf entry the PMD lock is taken and the entry is
 * read again, because it may have changed since the lockless sample, before
 * it is handed to follow_huge_pmd() (or split first for FOLL_SPLIT_PMD).
 *
 * Returns whatever the chosen helper returns: a page, NULL or an ERR_PTR().
 */
static struct page *follow_pmd_mask(struct vm_area_struct *vma,
                                    unsigned long address, pud_t *pudp,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        pmd_t *pmd, pmdval;
        spinlock_t *ptl;
        struct page *page;
        struct mm_struct *mm = vma->vm_mm;

        pmd = pmd_offset(pudp, address);
        /* Lockless sample; re-validated under the lock further down. */
        pmdval = pmdp_get_lockless(pmd);
        if (pmd_none(pmdval))
                return no_page_table(vma, flags, address);
        if (!pmd_present(pmdval))
                return no_page_table(vma, flags, address);
        if (unlikely(is_hugepd(__hugepd(pmd_val(pmdval)))))
                return follow_hugepd(vma, __hugepd(pmd_val(pmdval)),
                                     address, PMD_SHIFT, flags, ctx);
        if (pmd_devmap(pmdval)) {
                ptl = pmd_lock(mm, pmd);
                page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
                spin_unlock(ptl);
                if (page)
                        return page;
                return no_page_table(vma, flags, address);
        }
        /* Common case: the PMD points to a PTE table. */
        if (likely(!pmd_leaf(pmdval)))
                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

        if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
                return no_page_table(vma, flags, address);

        /* Leaf entry: take the lock and look at the entry again. */
        ptl = pmd_lock(mm, pmd);
        pmdval = *pmd;
        if (unlikely(!pmd_present(pmdval))) {
                spin_unlock(ptl);
                return no_page_table(vma, flags, address);
        }
        if (unlikely(!pmd_leaf(pmdval))) {
                spin_unlock(ptl);
                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
        }
        if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) {
                spin_unlock(ptl);
                split_huge_pmd(vma, pmd, address);
                /* If pmd was left empty, stuff a page table in there quickly */
                return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
                        follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
        }
        page = follow_huge_pmd(vma, address, pmd, flags, ctx);
        spin_unlock(ptl);
        return page;
}

/*
 * PUD level of the follow_page_mask() walk for @address.  Depending on the
 * entry read from the PUD table this returns the result of no_page_table(),
 * follow_hugepd(), follow_huge_pud() (called under the PUD lock) or
 * follow_pmd_mask().
 */
static struct page *follow_pud_mask(struct vm_area_struct *vma,
                                    unsigned long address, p4d_t *p4dp,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        pud_t *pudp = pud_offset(p4dp, address);
        pud_t pud = READ_ONCE(*pudp);
        struct page *page;
        spinlock_t *ptl;

        if (!pud_present(pud))
                return no_page_table(vma, flags, address);

        if (unlikely(is_hugepd(__hugepd(pud_val(pud)))))
                return follow_hugepd(vma, __hugepd(pud_val(pud)),
                                     address, PUD_SHIFT, flags, ctx);

        /* Not a leaf: descend to the PMD level unless the entry is bad. */
        if (!pud_leaf(pud)) {
                if (unlikely(pud_bad(pud)))
                        return no_page_table(vma, flags, address);

                return follow_pmd_mask(vma, address, pudp, flags, ctx);
        }

        /* Leaf entry: resolve it while holding the PUD lock. */
        ptl = pud_lock(vma->vm_mm, pudp);
        page = follow_huge_pud(vma, address, pudp, flags, ctx);
        spin_unlock(ptl);

        return page ? page : no_page_table(vma, flags, address);
}

/*
 * P4D level of the follow_page_mask() walk for @address.  Hugepd entries are
 * handed to follow_hugepd(), missing or bad entries to no_page_table(), and
 * everything else continues at the PUD level.
 */
static struct page *follow_p4d_mask(struct vm_area_struct *vma,
                                    unsigned long address, pgd_t *pgdp,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        p4d_t *p4dp = p4d_offset(pgdp, address);
        p4d_t p4d = READ_ONCE(*p4dp);

        /* Leaf entries at this level are rejected at build time. */
        BUILD_BUG_ON(p4d_leaf(p4d));

        if (unlikely(is_hugepd(__hugepd(p4d_val(p4d)))))
                return follow_hugepd(vma, __hugepd(p4d_val(p4d)),
                                     address, P4D_SHIFT, flags, ctx);

        if (!p4d_present(p4d))
                return no_page_table(vma, flags, address);
        if (p4d_bad(p4d))
                return no_page_table(vma, flags, address);

        return follow_pud_mask(vma, address, p4dp, flags, ctx);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * For pages coming from ZONE_DEVICE memory, @ctx->pgmap caches the device's
 * dev_pagemap metadata so that expensive lookups need not be repeated.
 *
 * If an anonymous page is looked up and the caller first has to trigger
 * unsharing of a shared anonymous page, -EMLINK is returned; the caller is
 * then expected to trigger a fault with FAULT_FLAG_UNSHARE set. Note that
 * unsharing is only relevant with FOLL_PIN and !FOLL_WRITE.
 *
 * @ctx->page_mask is reset to 0 on entry and, on output, is set according to
 * the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
                              unsigned long address, unsigned int flags,
                              struct follow_page_context *ctx)
{
        struct page *page;
        pgd_t *pgd;

        vma_pgtable_walk_begin(vma);

        ctx->page_mask = 0;
        pgd = pgd_offset(vma->vm_mm, address);

        /* Pick the handler for the top-level entry. */
        if (unlikely(is_hugepd(__hugepd(pgd_val(*pgd))))) {
                page = follow_hugepd(vma, __hugepd(pgd_val(*pgd)),
                                     address, PGDIR_SHIFT, flags, ctx);
        } else if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) {
                page = no_page_table(vma, flags, address);
        } else {
                page = follow_p4d_mask(vma, address, pgd, flags, ctx);
        }

        vma_pgtable_walk_end(vma);

        return page;
}

/*
 * Look up the page mapped at @address in @vma via follow_page_mask().
 * Returns NULL right away for secretmem VMAs and, with a one-time warning,
 * when FOLL_PIN is passed.  Any dev_pagemap reference collected during the
 * lookup is dropped again before returning.
 */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                         unsigned int foll_flags)
{
        struct follow_page_context ctx = { NULL };
        struct page *found;

        if (vma_is_secretmem(vma) || WARN_ON_ONCE(foll_flags & FOLL_PIN))
                return NULL;

        /*
         * FOLL_HONOR_NUMA_FAULT is never set here: callers of this function
         * don't expect to fail on PROT_NONE-mapped pages.
         */
        found = follow_page_mask(vma, address, foll_flags, &ctx);

        if (ctx.pgmap)
                put_dev_pagemap(ctx.pgmap);

        return found;
}

/*
 * Resolve @address through the page tables used for the gate area.
 *
 * Returns 0 on success, with *@vma set to get_gate_vma(@mm) and, if @page is
 * non-NULL, *@page set to the page after try_grab_page() succeeded on it.
 * Returns -EFAULT for FOLL_WRITE requests, when any level of the walk is
 * empty, or when the entry has no usable page; an error from try_grab_page()
 * is passed through.  Note that *@vma may already have been written when a
 * later step fails.
 */
static int get_gate_page(struct mm_struct *mm, unsigned long address,
                unsigned int gup_flags, struct vm_area_struct **vma,
                struct page **page)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        pte_t entry;
        int ret = -EFAULT;

        /* user gate pages are read-only */
        if (gup_flags & FOLL_WRITE)
                return -EFAULT;
        /* Addresses above TASK_SIZE are looked up in the kernel page tables. */
        if (address > TASK_SIZE)
                pgd = pgd_offset_k(address);
        else
                pgd = pgd_offset_gate(mm, address);
        if (pgd_none(*pgd))
                return -EFAULT;
        p4d = p4d_offset(pgd, address);
        if (p4d_none(*p4d))
                return -EFAULT;
        pud = pud_offset(p4d, address);
        if (pud_none(*pud))
                return -EFAULT;
        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return -EFAULT;
        pte = pte_offset_map(pmd, address);
        if (!pte)
                return -EFAULT;
        /* From here on every exit has to go through pte_unmap(). */
        entry = ptep_get(pte);
        if (pte_none(entry))
                goto unmap;
        *vma = get_gate_vma(mm);
        /* Caller only wanted the vma. */
        if (!page)
                goto out;
        *page = vm_normal_page(*vma, address, entry);
        if (!*page) {
                /* Without a normal page only the zero page is acceptable, and not for dumps. */
                if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry)))
                        goto unmap;
                *page = pte_page(entry);
        }
        ret = try_grab_page(*page, gup_flags);
        if (unlikely(ret))
                goto unmap;
out:
        ret = 0;
unmap:
        pte_unmap(pte);
        return ret;
}

/*
 * Fault in the page at @address by calling handle_mm_fault() with fault flags
 * derived from *@flags (which is only read here) and @unshare.
 *
 * mmap_lock must be held on entry.  If @flags has FOLL_UNLOCKABLE but not
 * FOLL_NOWAIT, the mmap_lock may be released.  If it is, *@locked will be set
 * to 0 and either -EBUSY (the fault has to be retried) or -EAGAIN (the fault
 * completed, but the lock must be taken again) is returned.
 *
 * Returns 0 when the fault was handled with the lock still held, -EFAULT if
 * FOLL_NOFAULT is set, or the errno that vm_fault_to_errno() derives from a
 * fault error.
 */
static int faultin_page(struct vm_area_struct *vma,
                unsigned long address, unsigned int *flags, bool unshare,
                int *locked)
{
        unsigned int fault_flags = 0;
        vm_fault_t ret;

        /* Translate FOLL_* request flags into FAULT_FLAG_* for the handler. */
        if (*flags & FOLL_NOFAULT)
                return -EFAULT;
        if (*flags & FOLL_WRITE)
                fault_flags |= FAULT_FLAG_WRITE;
        if (*flags & FOLL_REMOTE)
                fault_flags |= FAULT_FLAG_REMOTE;
        if (*flags & FOLL_UNLOCKABLE) {
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
                /*
                 * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
                 * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
                 * That's because some callers may not be prepared to
                 * handle early exits caused by non-fatal signals.
                 */
                if (*flags & FOLL_INTERRUPTIBLE)
                        fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
        }
        if (*flags & FOLL_NOWAIT)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
        if (*flags & FOLL_TRIED) {
                /*
                 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
                 * can co-exist
                 */
                fault_flags |= FAULT_FLAG_TRIED;
        }
        if (unshare) {
                fault_flags |= FAULT_FLAG_UNSHARE;
                /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
                VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
        }

        ret = handle_mm_fault(vma, address, fault_flags, NULL);

        if (ret & VM_FAULT_COMPLETED) {
                /*
                 * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
                 * mmap lock in the page fault handler. Sanity check this.
                 */
                WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
                *locked = 0;

                /*
                 * We should do the same as VM_FAULT_RETRY, but let's not
                 * return -EBUSY since that's not reflecting the reality of
                 * what has happened - we've just fully completed a page
                 * fault, with the mmap lock released.  Use -EAGAIN to show
                 * that we want to take the mmap lock _again_.
                 */
                return -EAGAIN;
        }

        if (ret & VM_FAULT_ERROR) {
                int err = vm_fault_to_errno(ret, *flags);

                if (err)
                        return err;
                /* An error bit was set but no errno could be derived from it. */
                BUG();
        }

        if (ret & VM_FAULT_RETRY) {
                /* With RETRY_NOWAIT the lock is still held, so leave *locked alone. */
                if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
                        *locked = 0;
                return -EBUSY;
        }

        return 0;
}

/*
 * Writing through GUP to file-backed mappings that require folio dirty
 * tracking is a fundamentally broken operation, because kernel write access
 * to GUP mappings does not adhere to the semantics a file system expects.
 *
 * Consider the following scenario:-
 *
 * 1. A folio is written to via GUP which write-faults the memory, notifying
 *    the file system and dirtying the folio.
 * 2. Later, writeback is triggered, resulting in the folio being cleaned and
 *    the PTE being marked read-only.
 * 3. The GUP caller writes to the folio, as it is mapped read/write via the
 *    direct mapping.
 * 4. The GUP caller, now done with the page, unpins it and sets it dirty
 *    (though it does not have to).
 *
 * The outcome is data written to a folio without writenotify, plus a folio
 * that gets dirtied unexpectedly (if the caller decides to do so).
 *
 * Returns true if a write through GUP with @gup_flags is acceptable for @vma.
 */
static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
                                          unsigned long gup_flags)
{
        const unsigned long longterm_pin = FOLL_PIN | FOLL_LONGTERM;

        /*
         * Only a long term pin is disallowed, as it is the most egregious
         * case; and even then only when the VMA requires dirty tracking,
         * since otherwise no problematic write can occur.
         */
        if ((gup_flags & longterm_pin) == longterm_pin)
                return !vma_needs_dirty_tracking(vma);

        /* Not a long term pin: nothing is disallowed here. */
        return true;
}

/*
 * Decide whether a GUP request described by @gup_flags may operate on @vma.
 *
 * Returns 0 when the access is acceptable, -EOPNOTSUPP for a FOLL_LONGTERM
 * request on an fsdax VMA, and -EFAULT for every other rejection.
 */
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
        vm_flags_t vm_flags = vma->vm_flags;
        int write = (gup_flags & FOLL_WRITE);
        int foreign = (gup_flags & FOLL_REMOTE);
        bool vma_anon = vma_is_anonymous(vma);

        /* VM_IO and VM_PFNMAP mappings are rejected outright. */
        if (vm_flags & (VM_IO | VM_PFNMAP))
                return -EFAULT;

        /* FOLL_ANON callers only accept anonymous VMAs. */
        if (!vma_anon && (gup_flags & FOLL_ANON))
                return -EFAULT;

        if (vma_is_fsdax(vma) && (gup_flags & FOLL_LONGTERM))
                return -EOPNOTSUPP;

        if (vma_is_secretmem(vma))
                return -EFAULT;

        if (write) {
                bool needs_force = !(vm_flags & VM_WRITE) ||
                                   (vm_flags & VM_SHADOW_STACK);

                if (!vma_anon &&
                    !writable_file_mapping_allowed(vma, gup_flags))
                        return -EFAULT;

                /*
                 * Writing to a VMA that is not plainly writable needs
                 * FOLL_FORCE, is never done for hugetlb, and is limited to
                 * COW mappings: the historical write,force COW in a
                 * VM_MAYWRITE VM_SHARED !VM_WRITE vma is no longer allowed.
                 */
                if (needs_force &&
                    (!(gup_flags & FOLL_FORCE) ||
                     is_vm_hugetlb_page(vma) ||
                     !is_cow_mapping(vm_flags)))
                        return -EFAULT;
        } else if (!(vm_flags & VM_READ)) {
                /*
                 * Reading an unreadable VMA needs FOLL_FORCE and at least
                 * VM_MAYREAD on the VMA.
                 */
                if (!(gup_flags & FOLL_FORCE) || !(vm_flags & VM_MAYREAD))
                        return -EFAULT;
        }

        /*
         * GUP is always a data access, never an instruction fetch, hence
         * execute=false.
         */
        if (arch_vma_access_permitted(vma, write, false, foreign))
                return 0;

        return -EFAULT;
}

/*
 * Behaves like vma_lookup(): returns the VMA containing @addr or NULL.
 * On configurations without CONFIG_STACK_GROWSUP it additionally emits a
 * rate-limited warning when @addr lies just below a VM_GROWSDOWN VMA, i.e.
 * where the GUP code would historically have expanded the stack.
 */
static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm,
         unsigned long addr)
{
#ifdef CONFIG_STACK_GROWSUP
        return vma_lookup(mm, addr);
#else
        static volatile unsigned long next_warn;
        struct vm_area_struct *vma = find_vma(mm, addr);
        unsigned long now, deadline;

        if (!vma)
                return NULL;

        /* find_vma() may return a VMA that starts above addr. */
        if (vma->vm_start <= addr)
                return vma;

        /* Warn only for accesses within 64KiB below a grows-down VMA. */
        if (!(vma->vm_flags & VM_GROWSDOWN) ||
            vma->vm_start - addr > 65536)
                return NULL;

        /* Limit the warning to at most one per hour. */
        now = jiffies;
        deadline = next_warn;
        if (deadline && time_before(now, deadline))
                return NULL;
        next_warn = now + 60*60*HZ;

        /* Tell the user that the behaviour may have changed. */
        pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n",
                current->comm, task_pid_nr(current),
                vma->vm_start, vma->vm_end, addr);
        dump_stack();
        return NULL;
#endif
}

/**
 * __get_user_pages() - pin user pages in memory
 * @mm:                mm_struct of target mm
 * @start:        starting user address
 * @nr_pages:        number of pages from start to pin
 * @gup_flags:        flags modifying pin behaviour
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_pages long. Or NULL, if caller
 *                only intends to ensure the pages are faulted in.
 * @locked:     whether we're still with the mmap_lock held
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 * -- 0 return value is possible when the fault would need to be retried.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * Must be called with mmap_lock held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
 * be released. If this happens *@locked will be set to 0 on return.
 *
 * A caller using such a combination of @gup_flags must therefore hold the
 * mmap_lock for reading only, and recognize when it's been released. Otherwise,
 * it must be held for either reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct mm_struct *mm,
                unsigned long start, unsigned long nr_pages,
                unsigned int gup_flags, struct page **pages,
                int *locked)
{
        /*
         * ret carries the most recent status code, i counts the pages handled
         * so far. On exit a non-zero i takes precedence over ret.
         */
        long ret = 0, i = 0;
        struct vm_area_struct *vma = NULL;
        struct follow_page_context ctx = { NULL };

        if (!nr_pages)
                return 0;

        start = untagged_addr_remote(mm, start);

        /* A pages array is expected exactly when FOLL_GET or FOLL_PIN is set. */
        VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

        do {
                struct page *page;
                /*
                 * Working copy of the flags for this iteration; its address is
                 * handed to faultin_page(). It is initialised from gup_flags
                 * at the top of each loop iteration, not on "goto retry".
                 */
                unsigned int foll_flags = gup_flags;
                unsigned int page_increm;

                /* first iteration or cross vma bound */
                if (!vma || start >= vma->vm_end) {
                        /*
                         * MADV_POPULATE_(READ|WRITE) wants to handle VMA
                         * lookups+error reporting differently.
                         *
                         * Here a missing VMA yields -ENOMEM and any
                         * check_vma_flags() failure yields -EINVAL; the
                         * gate-area handling below is skipped entirely.
                         */
                        if (gup_flags & FOLL_MADV_POPULATE) {
                                vma = vma_lookup(mm, start);
                                if (!vma) {
                                        ret = -ENOMEM;
                                        goto out;
                                }
                                if (check_vma_flags(vma, gup_flags)) {
                                        ret = -EINVAL;
                                        goto out;
                                }
                                goto retry;
                        }
                        vma = gup_vma_lookup(mm, start);
                        /*
                         * No VMA but the address is in the gate area: let
                         * get_gate_page() resolve it and skip the regular
                         * lookup/fault path for this page.
                         */
                        if (!vma && in_gate_area(mm, start)) {
                                ret = get_gate_page(mm, start & PAGE_MASK,
                                                gup_flags, &vma,
                                                pages ? &page : NULL);
                                if (ret)
                                        goto out;
                                /* Zero mask makes page_increm below equal 1. */
                                ctx.page_mask = 0;
                                goto next_page;
                        }

                        if (!vma) {
                                ret = -EFAULT;
                                goto out;
                        }
                        ret = check_vma_flags(vma, gup_flags);
                        if (ret)
                                goto out;
                }
retry:
                /*
                 * If we have a pending SIGKILL, don't keep faulting pages and
                 * potentially allocating memory.
                 */
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        goto out;
                }
                cond_resched();

                page = follow_page_mask(vma, start, foll_flags, &ctx);
                /*
                 * NULL or -EMLINK from follow_page_mask() means the page has
                 * to be faulted in first; the -EMLINK case is forwarded to
                 * faultin_page() as its fourth argument.
                 */
                if (!page || PTR_ERR(page) == -EMLINK) {
                        ret = faultin_page(vma, start, &foll_flags,
                                           PTR_ERR(page) == -EMLINK, locked);
                        switch (ret) {
                        case 0:
                                /* Fault handled: look the page up again. */
                                goto retry;
                        case -EBUSY:
                        case -EAGAIN:
                                /*
                                 * Not reported as an error: stop here and
                                 * return the pages done so far (possibly 0).
                                 */
                                ret = 0;
                                fallthrough;
                        case -EFAULT:
                        case -ENOMEM:
                        case -EHWPOISON:
                                goto out;
                        }
                        /* Any other faultin_page() result is unexpected. */
                        BUG();
                } else if (PTR_ERR(page) == -EEXIST) {
                        /*
                         * Proper page table entry exists, but no corresponding
                         * struct page. If the caller expects **pages to be
                         * filled in, bail out now, because that can't be done
                         * for this page.
                         */
                        if (pages) {
                                ret = PTR_ERR(page);
                                goto out;
                        }
                } else if (IS_ERR(page)) {
                        ret = PTR_ERR(page);
                        goto out;
                }
next_page:
                /*
                 * Number of pages this step covers: 1 plus the pages left up
                 * to the boundary implied by ctx.page_mask, capped to what the
                 * caller still wants.
                 */
                page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
                if (page_increm > nr_pages)
                        page_increm = nr_pages;

                if (pages) {
                        struct page *subpage;
                        unsigned int j;

                        /*
                         * This must be a large folio (and doesn't need to
                         * be the whole folio; it can be part of it), do
                         * the refcount work for all the subpages too.
                         *
                         * NOTE: here the page may not be the head page
                         * e.g. when start addr is not thp-size aligned.
                         * try_grab_folio() should have taken care of tail
                         * pages.
                         */
                        if (page_increm > 1) {
                                struct folio *folio;

                                /*
                                 * Since we already hold refcount on the
                                 * large folio, this should never fail.
                                 */
                                folio = try_grab_folio(page, page_increm - 1,
                                                       foll_flags);
                                if (WARN_ON_ONCE(!folio)) {
                                        /*
                                         * Release the 1st page ref if the
                                         * folio is problematic, fail hard.
                                         */
                                        gup_put_folio(page_folio(page), 1,
                                                      foll_flags);
                                        ret = -EFAULT;
                                        goto out;
                                }
                        }

                        /* Record each covered page and flush its caches. */
                        for (j = 0; j < page_increm; j++) {
                                subpage = nth_page(page, j);
                                pages[i + j] = subpage;
                                flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
                                flush_dcache_page(subpage);
                        }
                }

                /* Advance past the pages handled in this step. */
                i += page_increm;
                start += page_increm * PAGE_SIZE;
                nr_pages -= page_increm;
        } while (nr_pages);
out:
        /* Drop the pagemap reference left in ctx by the lookups, if any. */
        if (ctx.pgmap)
                put_dev_pagemap(ctx.pgmap);
        /* Partial progress wins over the status code. */
        return i ? i : ret;
}

/*
 * Check whether a fault described by @fault_flags is permitted on @vma:
 * the VMA must carry VM_WRITE for a write fault (VM_READ otherwise) and the
 * architecture-level access check must agree.
 */
static bool vma_permits_fault(struct vm_area_struct *vma,
                              unsigned int fault_flags)
{
        bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
        bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
        vm_flags_t required = write ? VM_WRITE : VM_READ;

        /*
         * Besides the read/write bit, the architecture may have a further
         * hardware protection mechanism that can deny access. gup is always
         * a data access rather than an instruction fetch, so execute=false.
         */
        return (vma->vm_flags & required) &&
               arch_vma_access_permitted(vma, write, false, foreign);
}

/**
 * fixup_user_fault() - manually resolve a user page fault
 * @mm:                mm_struct of target mm
 * @address:        user address
 * @fault_flags:flags to pass down to handle_mm_fault()
 * @unlocked:        did we unlock the mmap_lock while retrying, maybe NULL if caller
 *                does not allow retry. If NULL, the caller must guarantee
 *                that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
 *
 * This is meant to be called in the specific scenario where for locking reasons
 * we try to access user memory in atomic context (within a pagefault_disable()
 * section), this returns -EFAULT, and we want to resolve the user fault before
 * trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software.  On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This function will not return with an unlocked mmap_lock, so it does not
 * have the same semantics wrt the @mm->mmap_lock as filemap_fault() does.
 */
int fixup_user_fault(struct mm_struct *mm,
                     unsigned long address, unsigned int fault_flags,
                     bool *unlocked)
{
        address = untagged_addr_remote(mm, address);

        /* A caller that can observe an unlock opts in to killable retries. */
        if (unlocked)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

        /* Each pass looks the VMA up afresh and attempts the fault once. */
        for (;;) {
                struct vm_area_struct *vma = gup_vma_lookup(mm, address);
                vm_fault_t ret;

                if (!vma || !vma_permits_fault(vma, fault_flags))
                        return -EFAULT;

                if (fatal_signal_pending(current) &&
                    (fault_flags & FAULT_FLAG_KILLABLE))
                        return -EINTR;

                ret = handle_mm_fault(vma, address, fault_flags, NULL);

                if (ret & VM_FAULT_COMPLETED) {
                        /*
                         * The lock has to be retaken here only to pair with
                         * the unlock() done by the callers.
                         */
                        mmap_read_lock(mm);
                        *unlocked = true;
                        return 0;
                }

                if (ret & VM_FAULT_ERROR) {
                        int err = vm_fault_to_errno(ret, 0);

                        /* An error bit that maps to no errno is a bug. */
                        if (!err)
                                BUG();
                        return err;
                }

                if (!(ret & VM_FAULT_RETRY))
                        return 0;

                /*
                 * Retry requested: retake the lock, tell the caller it was
                 * dropped, and go round again with FAULT_FLAG_TRIED set.
                 */
                mmap_read_lock(mm);
                *unlocked = true;
                fault_flags |= FAULT_FLAG_TRIED;
        }
}
EXPORT_SYMBOL_GPL(fixup_user_fault);

/*
 * Report whether GUP should stop because of a pending signal. A fatal
 * signal always counts; any other pending signal counts only when the
 * caller passed FOLL_INTERRUPTIBLE, in which case that caller is expected
 * to deal with the interruption.
 */
static bool gup_signal_pending(unsigned int flags)
{
        if (fatal_signal_pending(current))
                return true;

        return (flags & FOLL_INTERRUPTIBLE) && signal_pending(current);
}

/*
 * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
 * the caller. This function may drop the mmap_lock. If it does so, then it will
 * set (*locked = 0).
 *
 * (*locked == 0) means that the caller expects this function to acquire and
 * drop the mmap_lock. Therefore, the value of *locked will still be zero when
 * the function returns, even though it may have changed temporarily during
 * function execution.
 *
 * Please note that this function, unlike __get_user_pages(), will not return 0
 * for nr_pages > 0, unless FOLL_NOWAIT is used.
 */
static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
                                                unsigned long start,
                                                unsigned long nr_pages,
                                                struct page **pages,
                                                int *locked,
                                                unsigned int flags)
{
        /*
         * pages_done is the value eventually returned: the running count of
         * pages handled, or an error code if nothing was handled.
         * must_unlock records that this function has to drop the mmap_lock
         * before returning if it is still held at that point.
         */
        long ret, pages_done;
        bool must_unlock = false;

        if (!nr_pages)
                return 0;

        /*
         * The internal caller expects GUP to manage the lock internally and the
         * lock must be released when this returns.
         */
        if (!*locked) {
                if (mmap_read_lock_killable(mm))
                        return -EAGAIN;
                must_unlock = true;
                *locked = 1;
        }
        else
                mmap_assert_locked(mm);

        if (flags & FOLL_PIN)
                mm_set_has_pinned_flag(&mm->flags);

        /*
         * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
         * is to set FOLL_GET if the caller wants pages[] filled in (but has
         * carelessly failed to specify FOLL_GET), so keep doing that, but only
         * for FOLL_GET, not for the newer FOLL_PIN.
         *
         * FOLL_PIN always expects pages to be non-null, but no need to assert
         * that here, as any failures will be obvious enough.
         */
        if (pages && !(flags & FOLL_PIN))
                flags |= FOLL_GET;

        pages_done = 0;
        for (;;) {
                /* Bulk attempt on everything that is still outstanding. */
                ret = __get_user_pages(mm, start, nr_pages, flags, pages,
                                       locked);
                if (!(flags & FOLL_UNLOCKABLE)) {
                        /* VM_FAULT_RETRY couldn't trigger, bypass */
                        pages_done = ret;
                        break;
                }

                /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
                if (!*locked) {
                        BUG_ON(ret < 0);
                        BUG_ON(ret >= nr_pages);
                }

                /* Account for whatever the bulk attempt managed to handle. */
                if (ret > 0) {
                        nr_pages -= ret;
                        pages_done += ret;
                        if (!nr_pages)
                                break;
                }
                if (*locked) {
                        /*
                         * VM_FAULT_RETRY didn't trigger or it was a
                         * FOLL_NOWAIT.
                         */
                        if (!pages_done)
                                pages_done = ret;
                        break;
                }
                /*
                 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
                 * For the prefault case (!pages) we only update counts.
                 */
                if (likely(pages))
                        pages += ret;
                start += ret << PAGE_SHIFT;

                /* The lock was temporarily dropped, so we must unlock later */
                must_unlock = true;

retry:
                /*
                 * Repeat on the address that fired VM_FAULT_RETRY
                 * with both FAULT_FLAG_ALLOW_RETRY and
                 * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
                 * by fatal signals or even common signals, depending on
                 * the caller's request. So we need to check it before we
                 * start trying again otherwise it can loop forever.
                 */
                if (gup_signal_pending(flags)) {
                        if (!pages_done)
                                pages_done = -EINTR;
                        break;
                }

                /* The lock is not held at this point; take it again. */
                ret = mmap_read_lock_killable(mm);
                if (ret) {
                        BUG_ON(ret > 0);
                        if (!pages_done)
                                pages_done = ret;
                        break;
                }

                /* Retry exactly one page, this time with FOLL_TRIED set. */
                *locked = 1;
                ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
                                       pages, locked);
                if (!*locked) {
                        /* Continue to retry until we succeeded */
                        BUG_ON(ret != 0);
                        goto retry;
                }
                /*
                 * Lock still held but the single page was not obtained: ret is
                 * 0 or an error, reported only if nothing was done before.
                 */
                if (ret != 1) {
                        BUG_ON(ret > 1);
                        if (!pages_done)
                                pages_done = ret;
                        break;
                }
                /* One page done; step past it and resume the bulk attempt. */
                nr_pages--;
                pages_done++;
                if (!nr_pages)
                        break;
                if (likely(pages))
                        pages++;
                start += PAGE_SIZE;
        }
        if (must_unlock && *locked) {
                /*
                 * We either temporarily dropped the lock, or the caller
                 * requested that we both acquire and drop the lock. Either way,
                 * we must now unlock, and notify the caller of that state.
                 */
                mmap_read_unlock(mm);
                *locked = 0;
        }

        /*
         * Failing to pin anything implies something has gone wrong (except when
         * FOLL_NOWAIT is specified).
         */
        if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT)))
                return -EFAULT;

        return pages_done;
}

/**
 * populate_vma_page_range() -  populate a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @locked: whether the mmap_lock is still held
 *
 * This takes care of mlocking the pages too if VM_LOCKED is set.
 *
 * Return either number of pages pinned in the vma, or a negative error
 * code on error.
 *
 * vma->vm_mm->mmap_lock must be held.
 *
 * If @locked is NULL, it may be held for read or write and will
 * be unperturbed.
 *
 * If @locked is non-NULL, it must be held for read only and may be
 * released.  If it's released, *@locked will be set to 0.
 */
long populate_vma_page_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, int *locked)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long nr_pages = (end - start) / PAGE_SIZE;
        int local_locked = 1;
        /* Without a caller-provided lock state, use a private dummy. */
        int *lock_state = locked ? locked : &local_locked;
        int gup_flags = FOLL_TOUCH;
        long ret;

        VM_BUG_ON(!PAGE_ALIGNED(start));
        VM_BUG_ON(!PAGE_ALIGNED(end));
        VM_BUG_ON_VMA(start < vma->vm_start, vma);
        VM_BUG_ON_VMA(end   > vma->vm_end, vma);
        mmap_assert_locked(mm);

        /*
         * VM_LOCKONFAULT has, rightly or wrongly, never gone through
         * faultin_page() to break COW: report the whole range as done.
         */
        if (vma->vm_flags & VM_LOCKONFAULT)
                return nr_pages;

        /* PROT_NONE pages have likewise never been faulted in. */
        if (!vma_is_accessible(vma))
                return -EFAULT;

        /*
         * Writable private mappings get a write fault so that COW is broken.
         * Shared mappings do not COW and should not be dirtied for nothing,
         * so they (and non-writable mappings) get a read fault with
         * FOLL_FORCE in case the mapping is not readable.
         */
        if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) != VM_WRITE)
                gup_flags |= FOLL_FORCE;
        else
                gup_flags |= FOLL_WRITE;

        if (locked)
                gup_flags |= FOLL_UNLOCKABLE;

        /*
         * The range was asserted above to lie inside @vma, so this will not
         * trigger a stack expansion that recurses back here.
         */
        ret = __get_user_pages(mm, start, nr_pages, gup_flags,
                               NULL, lock_state);
        lru_add_drain();
        return ret;
}

/*
 * faultin_page_range() - populate (prefault) page tables inside the
 *                          given range readable/writable
 *
 * This takes care of mlocking the pages, too, if VM_LOCKED is set.
 *
 * @mm: the mm to populate page tables in
 * @start: start address
 * @end: end address
 * @write: whether to prefault readable or writable
 * @locked: whether the mmap_lock is still held
 *
 * Returns either number of processed pages in the MM, or a negative error
 * code on error (see __get_user_pages()). Note that this function reports
 * errors related to VMAs, such as incompatible mappings, as expected by
 * MADV_POPULATE_(READ|WRITE).
 *
 * The range must be page-aligned.
 *
 * mm->mmap_lock must be held. If it's released, *@locked will be set to 0.
 */
long faultin_page_range(struct mm_struct *mm, unsigned long start,
                        unsigned long end, bool write, int *locked)
{
        unsigned long nr_pages = (end - start) / PAGE_SIZE;
        /*
         * FOLL_TOUCH:    mark the page accessed (young); together with
         *                FOLL_WRITE it also marks the page dirty, which makes
         *                no difference without FOLL_FORCE because the page is
         *                then writable in the page table anyway.
         * FOLL_HWPOISON: report -EHWPOISON rather than -EFAULT for a
         *                poisoned page.
         * FOLL_FORCE is deliberately absent: proper access permissions are
         * required.
         */
        int gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE |
                        FOLL_MADV_POPULATE;
        long ret;

        VM_BUG_ON(!PAGE_ALIGNED(start));
        VM_BUG_ON(!PAGE_ALIGNED(end));
        mmap_assert_locked(mm);

        if (write)
                gup_flags |= FOLL_WRITE;

        /* Prefault only: no pages array is passed. */
        ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked,
                                      gup_flags);
        lru_add_drain();
        return ret;
}

/*
 * __mm_populate - populate and/or mlock pages within a range of address space.
 *
 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
 * flags. VMAs must be already marked with the desired vm_flags, and
 * mmap_lock must not be held.
 */
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
        struct mm_struct *mm = current->mm;
        unsigned long end = start + len;
        unsigned long nstart = start;
        unsigned long nend;
        struct vm_area_struct *vma = NULL;
        int locked = 0;
        long ret = 0;

        while (nstart < end) {
                /*
                 * Find the first VMA intersecting [nstart; end). The lock is
                 * (re)taken whenever it is not currently held, and the VMA is
                 * then looked up from scratch.
                 */
                if (!locked) {
                        locked = 1;
                        mmap_read_lock(mm);
                        vma = find_vma_intersection(mm, nstart, end);
                } else if (nstart >= vma->vm_end) {
                        vma = find_vma_intersection(mm, vma->vm_end, end);
                }

                if (!vma)
                        break;

                /* Clamp this step to the part of the range inside the VMA. */
                nend = min(end, vma->vm_end);

                /* VM_IO and VM_PFNMAP VMAs are skipped without populating. */
                if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
                        if (nstart < vma->vm_start)
                                nstart = vma->vm_start;

                        /*
                         * populate_vma_page_range() rechecks the vma flags,
                         * so pages are not mlocked if the vma was munlocked
                         * in the meantime.
                         */
                        ret = populate_vma_page_range(vma, nstart, nend,
                                                      &locked);
                        if (ret < 0) {
                                if (!ignore_errors)
                                        break;
                                /* Ignore the error; move on to the next VMA. */
                                ret = 0;
                        } else {
                                /* Resume right after the pages populated. */
                                nend = nstart + ret * PAGE_SIZE;
                                ret = 0;
                        }
                }

                nstart = nend;
        }

        if (locked)
                mmap_read_unlock(mm);
        return ret;        /* 0 or negative error code */
}
#else /* CONFIG_MMU */
/*
 * !CONFIG_MMU variant: walk the range one page at a time, check each page's
 * VMA against the requested access, and optionally take a reference on the
 * page. Returns the number of pages handled, or -EFAULT if there were none.
 */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
                unsigned long nr_pages, struct page **pages,
                int *locked, unsigned int foll_flags)
{
        struct vm_area_struct *vma;
        bool must_unlock = false;
        unsigned long wanted;
        long done;

        if (!nr_pages)
                return 0;

        /*
         * When entered without the lock, GUP manages it internally and has
         * to release it again before returning.
         */
        if (!*locked) {
                if (mmap_read_lock_killable(mm))
                        return -EAGAIN;
                must_unlock = true;
                *locked = 1;
        }

        /*
         * Work out which permission bits the VMA must have: the read or the
         * write pair, narrowed to the "MAY" bit with FOLL_FORCE and to the
         * plain bit without it.
         */
        if (foll_flags & FOLL_WRITE)
                wanted = VM_WRITE | VM_MAYWRITE;
        else
                wanted = VM_READ | VM_MAYREAD;

        if (foll_flags & FOLL_FORCE)
                wanted &= VM_MAYREAD | VM_MAYWRITE;
        else
                wanted &= VM_READ | VM_WRITE;

        for (done = 0; done < nr_pages; done++) {
                vma = find_vma(mm, start);

                /* Stop at the first hole, I/O or PFN mapping, or denial. */
                if (!vma ||
                    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
                    !(wanted & vma->vm_flags))
                        break;

                if (pages) {
                        pages[done] = virt_to_page((void *)start);
                        if (pages[done])
                                get_page(pages[done]);
                }

                /* Advance to the start of the next page. */
                start = (start + PAGE_SIZE) & PAGE_MASK;
        }

        if (must_unlock && *locked) {
                mmap_read_unlock(mm);
                *locked = 0;
        }

        return done ? done : -EFAULT;
}
#endif /* !CONFIG_MMU */

/**
 * fault_in_writeable - fault in userspace address range for writing
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Touches the range by storing a zero byte at @uaddr (when it is not page
 * aligned) and at each following page boundary inside the range, stopping at
 * the first store that faults.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
        char __user *first = uaddr;
        char __user *limit;
        size_t covered;

        if (unlikely(!size))
                return 0;
        if (!user_write_access_begin(uaddr, size))
                return size;

        /* An unaligned head gets its own probe before going page by page. */
        if (!PAGE_ALIGNED(uaddr)) {
                unsafe_put_user(0, uaddr, out);
                uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
        }

        /* If the aligned end wrapped around, walk up to address zero. */
        limit = (char __user *)PAGE_ALIGN((unsigned long)first + size);
        if (unlikely(limit < first))
                limit = NULL;

        for (; uaddr != limit; uaddr += PAGE_SIZE) {
                unsafe_put_user(0, uaddr, out);
        }

out:
        user_write_access_end();

        /* uaddr is the first address that was not successfully probed. */
        covered = uaddr - first;
        if (size > covered)
                return size - covered;
        return 0;
}
EXPORT_SYMBOL(fault_in_writeable);

/**
 * fault_in_subpage_writeable - fault in an address range for writing
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Fault in a user address range for writing while checking for permissions at
 * sub-page granularity (e.g. arm64 MTE). This function should be used when
 * the caller cannot guarantee forward progress of a copy_to_user() loop.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
{
        /*
         * Start with a page-granular pass so that page table permissions are
         * checked; the arch-specific probe_subpage_writeable() functions may
         * not check for this.
         */
        size_t ok = size - fault_in_writeable(uaddr, size);

        /* Then let the sub-page probe trim whatever the first pass accepted. */
        if (ok != 0)
                ok -= probe_subpage_writeable(uaddr, ok);

        return size - ok;
}
EXPORT_SYMBOL(fault_in_subpage_writeable);

/*
 * fault_in_safe_writeable - fault in an address range for writing
 * @uaddr: start of address range
 * @size: length of address range
 *
 * Faults in an address range for writing.  This is primarily useful when we
 * already know that some or all of the pages in the address range aren't in
 * memory.
 *
 * Unlike fault_in_writeable(), this function is non-destructive.
 *
 * Note that we don't pin or otherwise hold the pages referenced that we fault
 * in.  There's no guarantee that they'll stay in memory for any duration of
 * time.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 */
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
        unsigned long start = (unsigned long)uaddr, end;
        struct mm_struct *mm = current->mm;
        bool unlocked = false;

        if (unlikely(size == 0))
                return 0;

        /* Page-aligned end of the range; 0 stands for "wrapped around". */
        end = PAGE_ALIGN(start + size);
        if (end < start)
                end = 0;

        mmap_read_lock(mm);
        do {
                if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
                        break;
                start = (start + PAGE_SIZE) & PAGE_MASK;
        } while (start != end);
        mmap_read_unlock(mm);

        /*
         * 'start' has been advanced past every page that was faulted in, so
         * the number of bytes covered is start - uaddr (not the reverse, which
         * would wrap and make a partial failure look like full success).
         */
        if (size > start - (unsigned long)uaddr)
                return size - (start - (unsigned long)uaddr);
        return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);

/**
 * fault_in_readable - fault in userspace address range for reading
 * @uaddr: start of user address range
 * @size: size of user address range
 *
 * Touches the range by loading one byte at @uaddr (when it is not page
 * aligned) and at each following page boundary inside the range, stopping at
 * the first load that faults.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_readable(const char __user *uaddr, size_t size)
{
        const char __user *first = uaddr;
        const char __user *limit;
        volatile char sink;
        size_t covered;

        if (unlikely(!size))
                return 0;
        if (!user_read_access_begin(uaddr, size))
                return size;

        /* An unaligned head gets its own probe before going page by page. */
        if (!PAGE_ALIGNED(uaddr)) {
                unsafe_get_user(sink, uaddr, out);
                uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
        }

        /* If the aligned end wrapped around, walk up to address zero. */
        limit = (const char __user *)PAGE_ALIGN((unsigned long)first + size);
        if (unlikely(limit < first))
                limit = NULL;

        for (; uaddr != limit; uaddr += PAGE_SIZE) {
                unsafe_get_user(sink, uaddr, out);
        }

out:
        user_read_access_end();
        /* The loaded byte itself is of no interest. */
        (void)sink;

        /* uaddr is the first address that was not successfully probed. */
        covered = uaddr - first;
        if (size > covered)
                return size - covered;
        return 0;
}
EXPORT_SYMBOL(fault_in_readable);

/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers; and also returns
 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 * allowing a hole to be left in the corefile to save disk space.
 *
 * Called without mmap_lock (takes and releases the mmap_lock by itself).
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
        const unsigned int dump_flags = FOLL_FORCE | FOLL_DUMP | FOLL_GET;
        struct page *page;
        /* Start unlocked: the helper takes and releases mmap_lock itself. */
        int locked = 0;
        int got;

        got = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
                                      dump_flags);
        /* Anything other than exactly one page is treated as failure. */
        if (got != 1)
                return NULL;

        return page;
}
#endif /* CONFIG_ELF_CORE */

#ifdef CONFIG_MIGRATION
/*
 * Walk @pages and count the folios that may not stay pinned long-term,
 * isolating the ones that can be isolated onto @movable_page_list.
 *
 * Returns the number of such folios found (never negative).
 */
static unsigned long collect_longterm_unpinnable_pages(
                                        struct list_head *movable_page_list,
                                        unsigned long nr_pages,
                                        struct page **pages)
{
        struct folio *last_seen = NULL;
        unsigned long nr_unpinnable = 0;
        bool may_drain = true;
        unsigned long idx;

        for (idx = 0; idx < nr_pages; idx++) {
                struct folio *folio = page_folio(pages[idx]);

                /* Back-to-back pages of one folio are only looked at once. */
                if (folio == last_seen)
                        continue;
                last_seen = folio;

                if (folio_is_longterm_pinnable(folio))
                        continue;

                nr_unpinnable++;

                /* Device coherent folios are counted but not isolated here. */
                if (folio_is_device_coherent(folio))
                        continue;

                if (folio_test_hugetlb(folio)) {
                        isolate_hugetlb(folio, movable_page_list);
                        continue;
                }

                /* Drain the LRU caches at most once per call. */
                if (!folio_test_lru(folio) && may_drain) {
                        lru_add_drain_all();
                        may_drain = false;
                }

                if (folio_isolate_lru(folio)) {
                        list_add_tail(&folio->lru, movable_page_list);
                        node_stat_mod_folio(folio,
                                            NR_ISOLATED_ANON + folio_is_file_lru(folio),
                                            folio_nr_pages(folio));
                }
        }

        return nr_unpinnable;
}

/*
 * Unpins all pages and migrates device coherent pages and movable_page_list.
 * Returns -EAGAIN if all pages were successfully migrated or -errno for failure
 * (or partial success).
 *
 * Each entry of @pages is set to NULL once its pin has been dropped (or, for
 * device coherent folios, converted to a plain reference), so the error path
 * can tell which entries still carry a pin.
 *
 * Failure codes visible here: -EBUSY when migrate_device_coherent_page()
 * fails, -ENOMEM when migrate_pages() reports a non-zero result.
 */
static int migrate_longterm_unpinnable_pages(
                                        struct list_head *movable_page_list,
                                        unsigned long nr_pages,
                                        struct page **pages)
{
        /* Only read on the err path, where it has always been assigned. */
        int ret;
        unsigned long i;

        for (i = 0; i < nr_pages; i++) {
                struct folio *folio = page_folio(pages[i]);

                if (folio_is_device_coherent(folio)) {
                        /*
                         * Migration will fail if the page is pinned, so convert
                         * the pin on the source page to a normal reference.
                         */
                        pages[i] = NULL;
                        folio_get(folio);
                        gup_put_folio(folio, 1, FOLL_PIN);

                        if (migrate_device_coherent_page(&folio->page)) {
                                ret = -EBUSY;
                                goto err;
                        }

                        continue;
                }

                /*
                 * We can't migrate pages with unexpected references, so drop
                 * the reference obtained by __get_user_pages_locked().
                 * Migrating pages have been added to movable_page_list after
                 * calling folio_isolate_lru() which takes a reference so the
                 * page won't be freed if it's migrating.
                 */
                unpin_user_page(pages[i]);
                pages[i] = NULL;
        }

        if (!list_empty(movable_page_list)) {
                struct migration_target_control mtc = {
                        .nid = NUMA_NO_NODE,
                        .gfp_mask = GFP_USER | __GFP_NOWARN,
                        .reason = MR_LONGTERM_PIN,
                };

                if (migrate_pages(movable_page_list, alloc_migration_target,
                                  NULL, (unsigned long)&mtc, MIGRATE_SYNC,
                                  MR_LONGTERM_PIN, NULL)) {
                        ret = -ENOMEM;
                        goto err;
                }
        }

        /* Hand back whatever is still sitting on the list. */
        putback_movable_pages(movable_page_list);

        /* Everything was unpinned above; the caller has to pin again. */
        return -EAGAIN;

err:
        /* Drop the pins that the loop above did not get to. */
        for (i = 0; i < nr_pages; i++)
                if (pages[i])
                        unpin_user_page(pages[i]);
        putback_movable_pages(movable_page_list);

        return ret;
}

/*
 * Check whether all pages are *allowed* to be pinned. Rather confusingly, all
 * pages in the range are required to be pinned via FOLL_PIN, before calling
 * this routine.
 *
 * If any pages in the range are not allowed to be pinned, then this routine
 * will migrate those pages away, unpin all the pages in the range and return
 * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then
 * call this routine again.
 *
 * If an error other than -EAGAIN occurs, this indicates a migration failure.
 * The caller should give up, and propagate the error back up the call stack.
 *
 * If everything is OK and all pages in the range are allowed to be pinned, then
 * this routine leaves all pages pinned and returns zero for success.
 */
static long check_and_migrate_movable_pages(unsigned long nr_pages,
                                            struct page **pages)
{
        LIST_HEAD(movable_page_list);

        /* No unpinnable folio found: every page may stay pinned. */
        if (collect_longterm_unpinnable_pages(&movable_page_list, nr_pages,
                                              pages) == 0)
                return 0;

        /* Otherwise unpin the range and try to migrate the offenders away. */
        return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages,
                                                 pages);
}
#else
static long check_and_migrate_movable_pages(unsigned long nr_pages,
                                            struct page **pages)
{
        /*
         * !CONFIG_MIGRATION variant: no check or migration is performed and
         * the arguments are ignored; always returns 0.
         */
        return 0;
}
#endif /* CONFIG_MIGRATION */

/*
 * __gup_longterm_locked() wraps __get_user_pages_locked() and adds the
 * handling needed for FOLL_LONGTERM: after pinning, the pages are checked
 * (and migrated if necessary), and the pin is retried for as long as that
 * check asks for it with -EAGAIN.
 */
static long __gup_longterm_locked(struct mm_struct *mm,
                                  unsigned long start,
                                  unsigned long nr_pages,
                                  struct page **pages,
                                  int *locked,
                                  unsigned int gup_flags)
{
        unsigned int pin_ctx;
        long pinned, err;

        /* Without FOLL_LONGTERM this is a plain pass-through. */
        if (!(gup_flags & FOLL_LONGTERM))
                return __get_user_pages_locked(mm, start, nr_pages, pages,
                                               locked, gup_flags);

        pin_ctx = memalloc_pin_save();
        for (;;) {
                pinned = __get_user_pages_locked(mm, start, nr_pages,
                                                 pages, locked,
                                                 gup_flags);
                if (pinned <= 0) {
                        err = pinned;
                        break;
                }

                /* FOLL_LONGTERM implies FOLL_PIN */
                err = check_and_migrate_movable_pages(pinned, pages);
                if (err != -EAGAIN)
                        break;
        }
        memalloc_pin_restore(pin_ctx);

        if (err)
                return err;
        return pinned;
}

/*
 * Validate the flags handed to one of the exported gup/pup entry points and
 * fold in the flags that entry point always requires (@to_set). On success
 * the combined flags are written back through @gup_flags_p and true is
 * returned; on any violation a one-time warning fires, false is returned and
 * *@gup_flags_p is left untouched.
 */
static bool is_valid_gup_args(struct page **pages, int *locked,
                              unsigned int *gup_flags_p, unsigned int to_set)
{
        const unsigned int get_and_pin = FOLL_PIN | FOLL_GET;
        unsigned int flags = *gup_flags_p;

        /*
         * These flags not allowed to be specified externally to the gup
         * interfaces:
         * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
         * - FOLL_REMOTE is internal only and used on follow_page()
         * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
         */
        if (WARN_ON_ONCE(flags & INTERNAL_GUP_FLAGS))
                return false;

        flags |= to_set;

        if (locked) {
                /* At the external interface locked must be set */
                if (WARN_ON_ONCE(*locked != 1))
                        return false;

                flags |= FOLL_UNLOCKABLE;
        }

        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
        if (WARN_ON_ONCE((flags & get_and_pin) == get_and_pin))
                return false;

        /* LONGTERM can only be specified when pinning */
        if (WARN_ON_ONCE((flags & FOLL_LONGTERM) && !(flags & FOLL_PIN)))
                return false;

        /* Pages input must be given if using GET/PIN */
        if (WARN_ON_ONCE(!pages && (flags & get_and_pin)))
                return false;

        /* We want to allow the pgmap to be hot-unplugged at all times */
        if (WARN_ON_ONCE((flags & FOLL_PCI_P2PDMA) &&
                         (flags & FOLL_LONGTERM)))
                return false;

        *gup_flags_p = flags;
        return true;
}

#ifdef CONFIG_MMU
/**
 * get_user_pages_remote() - pin user pages in memory
 * @mm:                mm_struct of target mm
 * @start:        starting user address
 * @nr_pages:        number of pages from start to pin
 * @gup_flags:        flags modifying lookup behaviour
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_pages long. Or NULL, if caller
 *                only intends to ensure the pages are faulted in.
 * @locked:        pointer to lock flag indicating whether lock is held and
 *                subsequently whether VM_FAULT_RETRY functionality can be
 *                utilised. Lock must initially be held.
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * Must be called with mmap_lock held for read or write.
 *
 * get_user_pages_remote walks a process's page tables and takes a reference
 * to each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages_remote returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
 * be called after the page is finished with, and before put_page is called.
 *
 * get_user_pages_remote is typically used for fewer-copy IO operations,
 * to get a handle on the memory by some means other than accesses
 * via the user virtual addresses. The pages may be submitted for
 * DMA to devices or accessed via their kernel linear mapping (via the
 * kmap APIs). Care should be taken to use the correct cache flushing APIs.
 *
 * See also get_user_pages_fast, for performance critical applications.
 *
 * get_user_pages_remote should be phased out in favor of
 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
 * should use get_user_pages_remote because it cannot pass
 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
 */
long get_user_pages_remote(struct mm_struct *mm,
                unsigned long start, unsigned long nr_pages,
                unsigned int gup_flags, struct page **pages,
                int *locked)
{
        /* Stand-in lock state for callers that pass no @locked pointer. */
        int held = 1;
        int *lock_state = locked;

        if (!is_valid_gup_args(pages, locked, &gup_flags,
                               FOLL_TOUCH | FOLL_REMOTE))
                return -EINVAL;

        if (!lock_state)
                lock_state = &held;

        return __get_user_pages_locked(mm, start, nr_pages, pages,
                                       lock_state, gup_flags);
}
EXPORT_SYMBOL(get_user_pages_remote);

#else /* CONFIG_MMU */
/*
 * !CONFIG_MMU variant of get_user_pages_remote(): all arguments are ignored,
 * nothing is pinned and 0 is always returned.
 */
long get_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked)
{
        return 0;
}
#endif /* !CONFIG_MMU */

/**
 * get_user_pages() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying lookup behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long. Or NULL, if caller
 *              only intends to ensure the pages are faulted in.
 *
 * This is the same as get_user_pages_remote(), just with a less-flexible
 * calling convention where we assume that the mm being operated on belongs to
 * the current task, and doesn't allow passing of a locked parameter.  We also
 * obviously don't pass FOLL_REMOTE in here.
 */
long get_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages)
{
        /* Passed to the helper as "lock already held". */
        int mmap_held = 1;

        if (is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
                return __get_user_pages_locked(current->mm, start, nr_pages,
                                               pages, &mmap_held, gup_flags);

        return -EINVAL;
}
EXPORT_SYMBOL(get_user_pages);

/*
 * get_user_pages_unlocked() is suitable to replace the form:
 *
 *      mmap_read_lock(mm);
 *      get_user_pages(..., gup_flags, pages);
 *      mmap_read_unlock(mm);
 *
 *  with:
 *
 *      get_user_pages_unlocked(..., pages, gup_flags);
 *
 * It is functionally equivalent to get_user_pages_fast so
 * get_user_pages_fast should be used instead if specific gup_flags
 * (e.g. FOLL_FORCE) are not required.
 */
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                             struct page **pages, unsigned int gup_flags)
{
        const unsigned int extra = FOLL_TOUCH | FOLL_UNLOCKABLE;
        /* Passed to the helper as "lock not held". */
        int mmap_held = 0;

        if (is_valid_gup_args(pages, NULL, &gup_flags, extra))
                return __get_user_pages_locked(current->mm, start, nr_pages,
                                               pages, &mmap_held, gup_flags);

        return -EINVAL;
}
EXPORT_SYMBOL(get_user_pages_unlocked);

/*
 * GUP-fast
 *
 * get_user_pages_fast attempts to pin user pages by walking the page
 * tables directly and avoids taking locks. Thus the walker needs to be
 * protected from page table pages being freed from under it, and should
 * block any THP splits.
 *
 * One way to achieve this is to have the walker disable interrupts, and
 * rely on IPIs from the TLB flushing code blocking before the page table
 * pages are freed. This is unsuitable for architectures that do not need
 * to broadcast an IPI when invalidating TLBs.
 *
 * Another way to achieve this is to batch up page table containing pages
 * belonging to more than one mm_user, then rcu_sched a callback to free those
 * pages. Disabling interrupts will allow the gup_fast() walker to both block
 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
 * (which is a relatively rare event). The code below adopts this strategy.
 *
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
 *  free pages containing page tables or TLB flushing requires IPI broadcast.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) access_ok is sufficient to validate userspace address ranges.
 *
 * The last two assumptions can be relaxed by the addition of helper functions.
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_GUP_FAST

/*
 * Used in the GUP-fast path to determine whether GUP is permitted to work on
 * a specific folio.
 *
 * This call assumes the caller has pinned the folio, that the lowest page table
 * level still points to this folio, and that interrupts have been disabled.
 *
 * GUP-fast must reject all secretmem folios.
 *
 * Writing to pinned file-backed dirty tracked folios is inherently problematic
 * (see comment describing the writable_file_mapping_allowed() function). We
 * therefore try to avoid the most egregious case of a long-term mapping doing
 * so.
 *
 * This function cannot be as thorough as that one as the VMA is not available
 * in the fast path, so instead we whitelist known good cases and if in doubt,
 * fall back to the slow path.
 */
static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
{
        const unsigned int longterm_write = FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE;
        /*
         * If we aren't pinning then no problematic write can occur. A long term
         * pin is the most egregious case so this is the one we disallow.
         */
        const bool no_file_backed = (flags & longterm_write) == longterm_write;
        struct address_space *mapping;
        unsigned long type_bits;
        bool maybe_secretmem;

        /* We hold a folio reference, so we can safely access folio fields. */

        /* secretmem folios are always order-0 folios. */
        maybe_secretmem = IS_ENABLED(CONFIG_SECRETMEM) &&
                          !folio_test_large(folio);

        /* Nothing to check for this folio/flags combination. */
        if (!no_file_backed && !maybe_secretmem)
                return true;

        if (WARN_ON_ONCE(folio_test_slab(folio)))
                return false;

        /* hugetlb neither requires dirty-tracking nor can be secretmem. */
        if (folio_test_hugetlb(folio))
                return true;

        /*
         * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods
         * cannot proceed, which means no actions performed under RCU can
         * proceed either.
         *
         * inodes and thus their mappings are freed under RCU, which means the
         * mapping cannot be freed beneath us and thus we can safely dereference
         * it.
         */
        lockdep_assert_irqs_disabled();

        /*
         * However, there may be operations which _alter_ the mapping, so ensure
         * we read it once and only once.
         */
        mapping = READ_ONCE(folio->mapping);

        /*
         * The mapping may have been truncated, in any case we cannot determine
         * if this mapping is safe - fall back to slow path to determine how to
         * proceed.
         */
        if (!mapping)
                return false;

        /*
         * Low flag bits set in the pointer: only the anonymous kind is
         * accepted. Anonymous folios pose no problem.
         */
        type_bits = (unsigned long)mapping & PAGE_MAPPING_FLAGS;
        if (type_bits)
                return type_bits & PAGE_MAPPING_ANON;

        /*
         * At this point, we know the mapping is non-null and points to an
         * address_space object.
         */
        if (maybe_secretmem && secretmem_mapping(mapping))
                return false;

        if (!no_file_backed)
                return true;

        /* The only remaining allowed file system is shmem. */
        return shmem_mapping(mapping);
}

static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start,
                unsigned int flags, struct page **pages)
{
        /* Release entries from the tail until *nr is back at nr_start. */
        while (*nr != nr_start) {
                struct folio *folio;

                (*nr)--;
                folio = page_folio(pages[*nr]);

                folio_clear_referenced(folio);
                gup_put_folio(folio, 1, flags);
        }
}

#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
/*
 * GUP-fast relies on pte change detection to avoid concurrent pgtable
 * operations.
 *
 * To pin the page, GUP-fast needs to do below in order:
 * (1) pin the page (by prefetching pte), then (2) check pte not changed.
 *
 * For the rest of pgtable operations where pgtable updates can be racy
 * with GUP-fast, we need to do (1) clear pte, then (2) check whether page
 * is pinned.
 *
 * Above will work for all pte-level operations, including THP split.
 *
 * For THP collapse, it's a bit more complicated because GUP-fast may be
 * walking a pgtable page that is being freed (pte is still valid but pmd
 * can be cleared already).  To avoid race in such condition, we need to
 * also check pmd here to make sure pmd doesn't change (corresponds to
 * pmdp_collapse_flush() in the THP collapse code path).
 *
 * Returns 1 when every pte in [addr, end) was processed and its page stored
 * in @pages (advancing *@nr), and 0 as soon as any pte cannot be handled
 * here. On a 0 return, pages grabbed before the failing pte remain recorded
 * in @pages / *@nr, except where the devmap path explicitly undoes them.
 */
static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        struct dev_pagemap *pgmap = NULL;
        int nr_start = *nr, ret = 0;
        pte_t *ptep, *ptem;

        ptem = ptep = pte_offset_map(&pmd, addr);
        if (!ptep)
                return 0;
        do {
                pte_t pte = ptep_get_lockless(ptep);
                struct page *page;
                struct folio *folio;

                /*
                 * Always fallback to ordinary GUP on PROT_NONE-mapped pages:
                 * pte_access_permitted() better should reject these pages
                 * either way: otherwise, GUP-fast might succeed in
                 * cases where ordinary GUP would fail due to VMA access
                 * permissions.
                 */
                if (pte_protnone(pte))
                        goto pte_unmap;

                if (!pte_access_permitted(pte, flags & FOLL_WRITE))
                        goto pte_unmap;

                if (pte_devmap(pte)) {
                        if (unlikely(flags & FOLL_LONGTERM))
                                goto pte_unmap;

                        pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
                        if (unlikely(!pgmap)) {
                                gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
                                goto pte_unmap;
                        }
                } else if (pte_special(pte))
                        goto pte_unmap;

                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);

                folio = try_grab_folio(page, 1, flags);
                if (!folio)
                        goto pte_unmap;

                /* Re-check pmd and pte now that the folio is held. */
                if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
                    unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
                        gup_put_folio(folio, 1, flags);
                        goto pte_unmap;
                }

                if (!gup_fast_folio_allowed(folio, flags)) {
                        gup_put_folio(folio, 1, flags);
                        goto pte_unmap;
                }

                if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
                        gup_put_folio(folio, 1, flags);
                        goto pte_unmap;
                }

                /*
                 * We need to make the page accessible if and only if we are
                 * going to access its content (the FOLL_PIN case).  Please
                 * see Documentation/core-api/pin_user_pages.rst for
                 * details.
                 *
                 * The error code is deliberately not stored in 'ret': this
                 * function reports 0 for failure, and a non-zero error value
                 * would otherwise be returned as if the range had succeeded.
                 */
                if ((flags & FOLL_PIN) && arch_make_page_accessible(page)) {
                        gup_put_folio(folio, 1, flags);
                        goto pte_unmap;
                }
                folio_set_referenced(folio);
                pages[*nr] = page;
                (*nr)++;
        } while (ptep++, addr += PAGE_SIZE, addr != end);

        ret = 1;

pte_unmap:
        if (pgmap)
                put_dev_pagemap(pgmap);
        pte_unmap(ptem);
        return ret;
}
#else

/*
 * If we can't determine whether or not a pte is special, then fail immediately
 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
 * to be special.
 *
 * For a futex to be placed on a THP tail page, get_futex_key requires a
 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
 * useful to have gup_fast_pmd_leaf even if we can't operate on ptes.
 */
static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        /*
         * !CONFIG_ARCH_HAS_PTE_SPECIAL variant: no pte is examined, @pages and
         * *@nr are left untouched, and 0 is always returned.
         */
        return 0;
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * Grab one page per PAGE_SIZE step of [addr, end), starting at @pfn, storing
 * each in @pages and advancing *@nr. If any step fails, everything recorded
 * by this call is undone and the walk stops.
 *
 * Returns non-zero only if the walk reached @end.
 */
static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
        unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
        struct dev_pagemap *pgmap = NULL;
        const int first_slot = *nr;

        do {
                struct page *page = pfn_to_page(pfn);
                struct folio *folio;

                /*
                 * Three things can stop us, checked in this order: no pagemap
                 * for the pfn, a PCI P2PDMA page without FOLL_PCI_P2PDMA, or
                 * failing to grab the folio.
                 */
                pgmap = get_dev_pagemap(pfn, pgmap);
                if (unlikely(!pgmap))
                        folio = NULL;
                else if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))
                        folio = NULL;
                else
                        folio = try_grab_folio(page, 1, flags);

                if (!folio) {
                        gup_fast_undo_dev_pagemap(nr, first_slot, flags, pages);
                        break;
                }

                folio_set_referenced(folio);
                pages[*nr] = page;
                (*nr)++;
                pfn++;
        } while (addr += PAGE_SIZE, addr != end);

        put_dev_pagemap(pgmap);
        return addr == end;
}

/*
 * GUP-fast handling of a device-mapped leaf PMD entry.
 *
 * @orig is the entry value the caller sampled, @pmdp points at the live entry.
 * The pages covering [addr, end) are grabbed through gup_fast_devmap_leaf();
 * afterwards the live entry is compared against @orig and, if it differs, the
 * work done by this call is handed to gup_fast_undo_dev_pagemap().
 *
 * Returns 1 on success, 0 otherwise.
 */
static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        /* Value of *nr before anything is added, for the undo helper. */
        int first = *nr;
        /* pfn for @addr: the entry's pfn plus the page offset within the PMD. */
        unsigned long pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);

        if (!gup_fast_devmap_leaf(pfn, addr, end, flags, pages, nr))
                return 0;

        /* The entry changed while we were grabbing pages: back out. */
        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
                gup_fast_undo_dev_pagemap(nr, first, flags, pages);
                return 0;
        }

        return 1;
}

/*
 * GUP-fast handling of a device-mapped leaf PUD entry.
 *
 * @orig is the entry value the caller sampled, @pudp points at the live entry.
 * The pages covering [addr, end) are grabbed through gup_fast_devmap_leaf();
 * afterwards the live entry is compared against @orig and, if it differs, the
 * work done by this call is handed to gup_fast_undo_dev_pagemap().
 *
 * Returns 1 on success, 0 otherwise.
 */
static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        /* Value of *nr before anything is added, for the undo helper. */
        int first = *nr;
        /* pfn for @addr: the entry's pfn plus the page offset within the PUD. */
        unsigned long pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);

        if (!gup_fast_devmap_leaf(pfn, addr, end, flags, pages, nr))
                return 0;

        /* The entry changed while we were grabbing pages: back out. */
        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
                gup_fast_undo_dev_pagemap(nr, first, flags, pages);
                return 0;
        }

        return 1;
}
#else
/*
 * Placeholder built unless both CONFIG_ARCH_HAS_PTE_DEVMAP and
 * CONFIG_TRANSPARENT_HUGEPAGE are defined. The BUILD_BUG() is there so that a
 * call which survives into the generated code is caught at build time
 * (NOTE(review): relies on the usual BUILD_BUG() semantics; the macro is
 * defined outside this file).
 */
static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        BUILD_BUG();
        return 0;
}

/*
 * Placeholder built unless both CONFIG_ARCH_HAS_PTE_DEVMAP and
 * CONFIG_TRANSPARENT_HUGEPAGE are defined. The BUILD_BUG() is there so that a
 * call which survives into the generated code is caught at build time
 * (NOTE(review): relies on the usual BUILD_BUG() semantics; the macro is
 * defined outside this file).
 */
static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        BUILD_BUG();
        return 0;
}
#endif

/*
 * GUP-fast handling of one leaf PMD entry for the range [addr, end).
 *
 * @orig is the entry value sampled by the caller; @pmdp points at the live
 * entry, which is re-read after the folio has been grabbed. Device-mapped
 * entries are delegated to gup_fast_devmap_pmd_leaf(), except that
 * FOLL_LONGTERM requests on them are refused.
 *
 * Returns 1 on success, in which case *nr has been advanced by the count
 * returned from record_subpages(). Returns 0 when the fast path has to give
 * up; on the non-devmap path *nr is then left untouched.
 */
static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        struct folio *folio;
        struct page *entry_page;
        int nr_refs;

        if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
                return 0;

        if (pmd_devmap(orig)) {
                if (unlikely(flags & FOLL_LONGTERM))
                        return 0;
                return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags,
                                                pages, nr);
        }

        entry_page = pmd_page(orig);
        nr_refs = record_subpages(entry_page, PMD_SIZE, addr, end,
                                  pages + *nr);

        folio = try_grab_folio(entry_page, nr_refs, flags);
        if (!folio)
                return 0;

        /* The entry must still hold the value we based the grab on. */
        if (unlikely(pmd_val(orig) != pmd_val(*pmdp)))
                goto drop_refs;

        if (!gup_fast_folio_allowed(folio, flags))
                goto drop_refs;

        if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page))
                goto drop_refs;

        *nr += nr_refs;
        folio_set_referenced(folio);
        return 1;

drop_refs:
        /* Hand back exactly what try_grab_folio() was asked for. */
        gup_put_folio(folio, nr_refs, flags);
        return 0;
}

/*
 * GUP-fast handling of one leaf PUD entry for the range [addr, end).
 *
 * @orig is the entry value sampled by the caller; @pudp points at the live
 * entry, which is re-read after the folio has been grabbed. Device-mapped
 * entries are delegated to gup_fast_devmap_pud_leaf(), except that
 * FOLL_LONGTERM requests on them are refused.
 *
 * Returns 1 on success, in which case *nr has been advanced by the count
 * returned from record_subpages(). Returns 0 when the fast path has to give
 * up; on the non-devmap path *nr is then left untouched.
 */
static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        struct folio *folio;
        struct page *entry_page;
        int nr_refs;

        if (!pud_access_permitted(orig, flags & FOLL_WRITE))
                return 0;

        if (pud_devmap(orig)) {
                if (unlikely(flags & FOLL_LONGTERM))
                        return 0;
                return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags,
                                                pages, nr);
        }

        entry_page = pud_page(orig);
        nr_refs = record_subpages(entry_page, PUD_SIZE, addr, end,
                                  pages + *nr);

        folio = try_grab_folio(entry_page, nr_refs, flags);
        if (!folio)
                return 0;

        /* The entry must still hold the value we based the grab on. */
        if (unlikely(pud_val(orig) != pud_val(*pudp)))
                goto drop_refs;

        if (!gup_fast_folio_allowed(folio, flags))
                goto drop_refs;

        if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page))
                goto drop_refs;

        *nr += nr_refs;
        folio_set_referenced(folio);
        return 1;

drop_refs:
        /* Hand back exactly what try_grab_folio() was asked for. */
        gup_put_folio(folio, nr_refs, flags);
        return 0;
}

/*
 * GUP-fast handling of one leaf PGD entry for the range [addr, end).
 *
 * @orig is the entry value sampled by the caller; @pgdp points at the live
 * entry, which is re-read after the folio has been grabbed. Device-mapped PGD
 * entries are ruled out at build time.
 *
 * Returns 1 on success, in which case *nr has been advanced by the count
 * returned from record_subpages(). Returns 0 when the fast path has to give
 * up, leaving *nr untouched.
 */
static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        struct folio *folio;
        struct page *entry_page;
        int nr_refs;

        if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
                return 0;

        BUILD_BUG_ON(pgd_devmap(orig));

        entry_page = pgd_page(orig);
        nr_refs = record_subpages(entry_page, PGDIR_SIZE, addr, end,
                                  pages + *nr);

        folio = try_grab_folio(entry_page, nr_refs, flags);
        if (!folio)
                return 0;

        /* The entry must still hold the value we based the grab on. */
        if (unlikely(pgd_val(orig) != pgd_val(*pgdp)))
                goto drop_refs;

        if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page))
                goto drop_refs;

        if (!gup_fast_folio_allowed(folio, flags))
                goto drop_refs;

        *nr += nr_refs;
        folio_set_referenced(folio);
        return 1;

drop_refs:
        /* Hand back exactly what try_grab_folio() was asked for. */
        gup_put_folio(folio, nr_refs, flags);
        return 0;
}

/*
 * Walk the PMD entries below @pudp that cover [addr, end) and dispatch each
 * one: leaf entries go to gup_fast_pmd_leaf(), hugepd entries to gup_hugepd(),
 * everything else to gup_fast_pte_range().
 *
 * Returns 1 when every entry in the range was handled, 0 as soon as one entry
 * is not present or its handler reports failure.
 */
static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        pmd_t *pmdp = pmd_offset_lockless(pudp, pud, addr);
        unsigned long next;

        do {
                pmd_t pmd = pmdp_get_lockless(pmdp);

                next = pmd_addr_end(addr, end);
                if (!pmd_present(pmd))
                        return 0;

                if (unlikely(pmd_leaf(pmd))) {
                        /* See gup_fast_pte_range() */
                        if (pmd_protnone(pmd))
                                return 0;
                        if (!gup_fast_pmd_leaf(pmd, pmdp, addr, next, flags,
                                               pages, nr))
                                return 0;
                        continue;
                }

                if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
                        /*
                         * Some architectures use a different format for
                         * hugetlbfs PMDs than for THP PMDs.
                         */
                        if (gup_hugepd(NULL, __hugepd(pmd_val(pmd)), addr,
                                       PMD_SHIFT, next, flags, pages, nr) != 1)
                                return 0;
                        continue;
                }

                if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags,
                                        pages, nr))
                        return 0;
        } while (pmdp++, addr = next, addr != end);

        return 1;
}

/*
 * Walk the PUD entries below @p4dp that cover [addr, end) and dispatch each
 * one: leaf entries go to gup_fast_pud_leaf(), hugepd entries to gup_hugepd(),
 * everything else to gup_fast_pmd_range().
 *
 * Returns 1 when every entry in the range was handled, 0 as soon as one entry
 * is not present or its handler reports failure.
 */
static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        pud_t *pudp = pud_offset_lockless(p4dp, p4d, addr);
        unsigned long next;

        do {
                pud_t pud = READ_ONCE(*pudp);

                next = pud_addr_end(addr, end);
                if (unlikely(!pud_present(pud)))
                        return 0;

                if (unlikely(pud_leaf(pud))) {
                        if (!gup_fast_pud_leaf(pud, pudp, addr, next, flags,
                                               pages, nr))
                                return 0;
                        continue;
                }

                if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
                        if (gup_hugepd(NULL, __hugepd(pud_val(pud)), addr,
                                       PUD_SHIFT, next, flags, pages, nr) != 1)
                                return 0;
                        continue;
                }

                if (!gup_fast_pmd_range(pudp, pud, addr, next, flags,
                                        pages, nr))
                        return 0;
        } while (pudp++, addr = next, addr != end);

        return 1;
}

/*
 * Walk the P4D entries below @pgdp that cover [addr, end). Leaf P4D entries
 * are ruled out at build time; hugepd entries go to gup_hugepd() and
 * everything else to gup_fast_pud_range().
 *
 * Returns 1 when every entry in the range was handled, 0 as soon as one entry
 * is not present or its handler reports failure.
 */
static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        p4d_t *p4dp = p4d_offset_lockless(pgdp, pgd, addr);
        unsigned long next;

        do {
                p4d_t p4d = READ_ONCE(*p4dp);

                next = p4d_addr_end(addr, end);
                if (!p4d_present(p4d))
                        return 0;
                BUILD_BUG_ON(p4d_leaf(p4d));

                if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
                        if (gup_hugepd(NULL, __hugepd(p4d_val(p4d)), addr,
                                       P4D_SHIFT, next, flags, pages, nr) != 1)
                                return 0;
                        continue;
                }

                if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
                                        pages, nr))
                        return 0;
        } while (p4dp++, addr = next, addr != end);

        return 1;
}

/*
 * Top level of the GUP-fast walk: iterate over the PGD entries of current->mm
 * that cover [addr, end) and dispatch each one to gup_fast_pgd_leaf(),
 * gup_hugepd() or gup_fast_p4d_range().
 *
 * Nothing is returned; the walk simply stops at the first empty entry or the
 * first handler failure. Progress is visible to the caller only through
 * @pages and *nr, which are passed down to the handlers.
 */
static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
                unsigned int flags, struct page **pages, int *nr)
{
        pgd_t *pgdp = pgd_offset(current->mm, addr);
        unsigned long next;

        do {
                pgd_t pgd = READ_ONCE(*pgdp);

                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        return;

                if (unlikely(pgd_leaf(pgd))) {
                        if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags,
                                               pages, nr))
                                return;
                        continue;
                }

                if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
                        if (gup_hugepd(NULL, __hugepd(pgd_val(pgd)), addr,
                                       PGDIR_SHIFT, next, flags, pages, nr) != 1)
                                return;
                        continue;
                }

                if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
                                        pages, nr))
                        return;
        } while (pgdp++, addr = next, addr != end);
}
#else
/*
 * Empty variant used when CONFIG_HAVE_GUP_FAST is not set: nothing is walked
 * and *nr is left as the caller initialised it.
 */
static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end,
                unsigned int flags, struct page **pages, int *nr)
{
}
#endif /* CONFIG_HAVE_GUP_FAST */

#ifndef gup_fast_permitted
/*
 * Check if it's allowed to use get_user_pages_fast_only() for the range, or
 * we need to fall back to the slow version.
 *
 * This default is only compiled when no gup_fast_permitted macro has been
 * defined beforehand (see the surrounding #ifndef); it permits every range.
 */
static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
        return true;
}
#endif

/*
 * Lockless attempt to grab the pages backing [start, end) in current->mm.
 *
 * Returns the number of entries filled in @pages. The result is 0 when
 * CONFIG_HAVE_GUP_FAST is disabled, when gup_fast_permitted() rejects the
 * range, or - for FOLL_PIN requests - when mm->write_protect_seq is odd on
 * entry or has changed by the time the walk has finished.
 */
static unsigned long gup_fast(unsigned long start, unsigned long end,
                unsigned int gup_flags, struct page **pages)
{
        unsigned long flags;
        int nr_pinned = 0;
        /* Only assigned and read when FOLL_PIN is set. */
        unsigned seq;

        if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) ||
            !gup_fast_permitted(start, end))
                return 0;

        if (gup_flags & FOLL_PIN) {
                /* Sample the sequence count; an odd value makes us bail out. */
                seq = raw_read_seqcount(&current->mm->write_protect_seq);
                if (seq & 1)
                        return 0;
        }

        /*
         * Disable interrupts. The nested form is used, in order to allow full,
         * general purpose use of this routine.
         *
         * With interrupts disabled, we block page table pages from being freed
         * from under us. See struct mmu_table_batch comments in
         * include/asm-generic/tlb.h for more details.
         *
         * We do not adopt an rcu_read_lock() here as we also want to block IPIs
         * that come from THPs splitting.
         */
        local_irq_save(flags);
        gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
        local_irq_restore(flags);

        /*
         * When pinning pages for DMA there could be a concurrent write protect
         * from fork() via copy_page_range(), in this case always fail GUP-fast.
         */
        if (gup_flags & FOLL_PIN) {
                if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
                        /* Give back everything the walk collected. */
                        gup_fast_unpin_user_pages(pages, nr_pinned);
                        return 0;
                } else {
                        sanity_check_pinned_pages(pages, nr_pinned);
                }
        }
        return nr_pinned;
}

/*
 * Common implementation behind the *_fast() entry points in this file.
 *
 * Validates @gup_flags and the address range, tries gup_fast() first and,
 * unless FOLL_FAST_ONLY is set or everything was already obtained, asks
 * __gup_longterm_locked() for the remaining pages.
 *
 * Returns:
 *   -EINVAL     if @gup_flags contains a flag outside the accepted set
 *               (this also triggers a one-time warning);
 *   -EOVERFLOW  if start + (nr_pages << PAGE_SHIFT) overflows;
 *   -EFAULT     if the range ends above TASK_SIZE_MAX or fails access_ok();
 *   otherwise the number of pages obtained. An error from the slow path is
 *   only returned when the fast path obtained nothing.
 */
static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
                unsigned int gup_flags, struct page **pages)
{
        unsigned long len, end;
        unsigned long nr_pinned;
        int locked = 0;
        int ret;

        if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
                                       FOLL_FORCE | FOLL_PIN | FOLL_GET |
                                       FOLL_FAST_ONLY | FOLL_NOFAULT |
                                       FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
                return -EINVAL;

        if (gup_flags & FOLL_PIN)
                mm_set_has_pinned_flag(&current->mm->flags);

        /* The slow path below is only reachable without FOLL_FAST_ONLY. */
        if (!(gup_flags & FOLL_FAST_ONLY))
                might_lock_read(&current->mm->mmap_lock);

        /* Strip the address tag and round down to a page boundary. */
        start = untagged_addr(start) & PAGE_MASK;
        len = nr_pages << PAGE_SHIFT;
        if (check_add_overflow(start, len, &end))
                return -EOVERFLOW;
        if (end > TASK_SIZE_MAX)
                return -EFAULT;
        if (unlikely(!access_ok((void __user *)start, len)))
                return -EFAULT;

        nr_pinned = gup_fast(start, end, gup_flags, pages);
        if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
                return nr_pinned;

        /* Slow path: try to get the remaining pages with get_user_pages */
        start += nr_pinned << PAGE_SHIFT;
        pages += nr_pinned;
        ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
                                    pages, &locked,
                                    gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
        if (ret < 0) {
                /*
                 * The caller has to unpin the pages we already pinned so
                 * returning -errno is not an option
                 */
                if (nr_pinned)
                        return nr_pinned;
                return ret;
        }
        return ret + nr_pinned;
}

/**
 * get_user_pages_fast_only() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
 * the regular GUP.
 *
 * If the architecture does not support this function, simply return with no
 * pages pinned.
 *
 * Careful, careful! COW breaking can go either way, so a non-write
 * access can get ambiguous page results. If you call this function without
 * 'write' set, you'd better be sure that you're ok with that ambiguity.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
                             unsigned int gup_flags, struct page **pages)
{
        /*
         * Flags this entry point forces on top of the caller's:
         *
         * FOLL_GET, because inside mm/gup.c every gup fast variant is a "pin
         * with a +1 page refcount" request.
         *
         * FOLL_FAST_ONLY, because the API promises that there is no fall back
         * to regular ("slow") GUP.
         */
        const unsigned int forced = FOLL_GET | FOLL_FAST_ONLY;

        if (!is_valid_gup_args(pages, NULL, &gup_flags, forced))
                return -EINVAL;

        return gup_fast_fallback(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast_only);

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_lock.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number requested.
 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
 * -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages)
{
        /*
         * Callers are free to pass FOLL_GET or leave it out. It is forced
         * here either way, because inside mm/gup.c every gup fast variant is
         * a "pin with a +1 page refcount" request.
         */
        const unsigned int forced = FOLL_GET;

        if (!is_valid_gup_args(pages, NULL, &gup_flags, forced))
                return -EINVAL;

        return gup_fast_fallback(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);

/**
 * pin_user_pages_fast() - pin user pages in memory without taking locks
 *
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
 * get_user_pages_fast() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for further details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page() will not remove pins from it.
 */
int pin_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages)
{
        /* FOLL_PIN is what distinguishes this from get_user_pages_fast(). */
        const unsigned int forced = FOLL_PIN;

        if (!is_valid_gup_args(pages, NULL, &gup_flags, forced))
                return -EINVAL;

        return gup_fast_fallback(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);

/**
 * pin_user_pages_remote() - pin pages of a remote process
 *
 * @mm:                mm_struct of target mm
 * @start:        starting user address
 * @nr_pages:        number of pages from start to pin
 * @gup_flags:        flags modifying lookup behaviour
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_pages long.
 * @locked:        pointer to lock flag indicating whether lock is held and
 *                subsequently whether VM_FAULT_RETRY functionality can be
 *                utilised. Lock must initially be held.
 *
 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
 * get_user_pages_remote() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked)
{
        /* Stand-in lock flag, used only when the caller passes no @locked. */
        int local_locked = 1;
        int *lock_state = locked ? locked : &local_locked;
        const unsigned int forced = FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE;

        /* Invalid arguments are reported as "0 pages", not as an errno. */
        if (!is_valid_gup_args(pages, locked, &gup_flags, forced))
                return 0;

        return __gup_longterm_locked(mm, start, nr_pages, pages, lock_state,
                                     gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_remote);

/**
 * pin_user_pages() - pin user pages in memory for use by other devices
 *
 * @start:        starting user address
 * @nr_pages:        number of pages from start to pin
 * @gup_flags:        flags modifying lookup behaviour
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
 * FOLL_PIN is set.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages)
{
        /* Lock flag handed to __gup_longterm_locked(), starting out as 1. */
        int locked = 1;
        const unsigned int forced = FOLL_PIN;

        /* Invalid arguments are reported as "0 pages", not as an errno. */
        if (!is_valid_gup_args(pages, NULL, &gup_flags, forced))
                return 0;

        return __gup_longterm_locked(current->mm, start, nr_pages, pages,
                                     &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);

/*
 * pin_user_pages_unlocked() is the FOLL_PIN variant of
 * get_user_pages_unlocked(). Behavior is the same, except that this one sets
 * FOLL_PIN and rejects FOLL_GET.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                             struct page **pages, unsigned int gup_flags)
{
        /* Lock flag handed to __gup_longterm_locked(), starting out as 0. */
        int locked = 0;
        const unsigned int forced = FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE;

        /* Invalid arguments are reported as "0 pages", not as an errno. */
        if (!is_valid_gup_args(pages, NULL, &gup_flags, forced))
                return 0;

        return __gup_longterm_locked(current->mm, start, nr_pages, pages,
                                     &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);




















































































































































































































































































































































































































































































    1 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
/*
 * linux/fs/nls/mac-cyrillic.c
 *
 * Charset maccyrillic translation tables.
 * Generated automatically from the Unicode and charset
 * tables from the Unicode Organization (www.unicode.org).
 * The Unicode to charset table has only exact mappings.
 */

/*
 * COPYRIGHT AND PERMISSION NOTICE
 *
 * Copyright 1991-2012 Unicode, Inc.  All rights reserved.  Distributed under
 * the Terms of Use in http://www.unicode.org/copyright.html.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of the Unicode data files and any associated documentation (the "Data
 * Files") or Unicode software and any associated documentation (the
 * "Software") to deal in the Data Files or Software without restriction,
 * including without limitation the rights to use, copy, modify, merge,
 * publish, distribute, and/or sell copies of the Data Files or Software, and
 * to permit persons to whom the Data Files or Software are furnished to do
 * so, provided that (a) the above copyright notice(s) and this permission
 * notice appear with all copies of the Data Files or Software, (b) both the
 * above copyright notice(s) and this permission notice appear in associated
 * documentation, and (c) there is clear notice in each modified Data File or
 * in the Software as well as in the documentation associated with the Data
 * File(s) or Software that the data or software has been modified.
 *
 * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
 * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
 * THIRD PARTY RIGHTS.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
 * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
 * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
 *
 * Except as contained in this notice, the name of a copyright holder shall
 * not be used in advertising or otherwise to promote the sale, use or other
 * dealings in these Data Files or Software without prior written
 * authorization of the copyright holder.
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/nls.h>
#include <linux/errno.h>

/*
 * Mac Cyrillic byte -> Unicode code point, indexed by the charset byte.
 * Bytes 0x00-0x7f map to the identical code points; 0x80-0xff carry the
 * Cyrillic letters and assorted symbols.
 */
static const wchar_t charset2uni[256] = {
        0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, /* 0x00-0x07 */
        0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x08-0x0f */
        0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, /* 0x10-0x17 */
        0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x18-0x1f */
        0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, /* 0x20-0x27 */
        0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x28-0x2f */
        0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 0x30-0x37 */
        0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x38-0x3f */
        0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 0x40-0x47 */
        0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x48-0x4f */
        0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, /* 0x50-0x57 */
        0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x58-0x5f */
        0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 0x60-0x67 */
        0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x68-0x6f */
        0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, /* 0x70-0x77 */
        0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x78-0x7f */
        0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, /* 0x80-0x87 */
        0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* 0x88-0x8f */
        0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, /* 0x90-0x97 */
        0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* 0x98-0x9f */
        0x2020, 0x00b0, 0x0490, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x0406, /* 0xa0-0xa7 */
        0x00ae, 0x00a9, 0x2122, 0x0402, 0x0452, 0x2260, 0x0403, 0x0453, /* 0xa8-0xaf */
        0x221e, 0x00b1, 0x2264, 0x2265, 0x0456, 0x00b5, 0x0491, 0x0408, /* 0xb0-0xb7 */
        0x0404, 0x0454, 0x0407, 0x0457, 0x0409, 0x0459, 0x040a, 0x045a, /* 0xb8-0xbf */
        0x0458, 0x0405, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, /* 0xc0-0xc7 */
        0x00bb, 0x2026, 0x00a0, 0x040b, 0x045b, 0x040c, 0x045c, 0x0455, /* 0xc8-0xcf */
        0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x201e, /* 0xd0-0xd7 */
        0x040e, 0x045e, 0x040f, 0x045f, 0x2116, 0x0401, 0x0451, 0x044f, /* 0xd8-0xdf */
        0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, /* 0xe0-0xe7 */
        0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* 0xe8-0xef */
        0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, /* 0xf0-0xf7 */
        0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x20ac, /* 0xf8-0xff */
};

/*
 * Unicode -> Mac Cyrillic for code points U+0000-U+00FF, indexed by the low
 * byte of the code point. Apart from index 0, a value of 0x00 means the code
 * point has no Mac Cyrillic equivalent. Presumably selected by the high byte
 * of the code point - the lookup code is outside this table.
 */
static const unsigned char page00[256] = {
        /* U+0000-U+007F map to the identical byte. */
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        /* U+0080-U+00FF: only the entries below are mapped, the rest are 0. */
        [0xa0] = 0xca,  /* U+00A0 NO-BREAK SPACE */
        [0xa3] = 0xa3,  /* U+00A3 POUND SIGN */
        [0xa7] = 0xa4,  /* U+00A7 SECTION SIGN */
        [0xa9] = 0xa9,  /* U+00A9 COPYRIGHT SIGN */
        [0xab] = 0xc7,  /* U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
        [0xac] = 0xc2,  /* U+00AC NOT SIGN */
        [0xae] = 0xa8,  /* U+00AE REGISTERED SIGN */
        [0xb0] = 0xa1,  /* U+00B0 DEGREE SIGN */
        [0xb1] = 0xb1,  /* U+00B1 PLUS-MINUS SIGN */
        [0xb5] = 0xb5,  /* U+00B5 MICRO SIGN */
        [0xb6] = 0xa6,  /* U+00B6 PILCROW SIGN */
        [0xbb] = 0xc8,  /* U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
        [0xf7] = 0xd6,  /* U+00F7 DIVISION SIGN */
};

/*
 * Unicode -> charset bytes for code points whose high byte is 0x01
 * (this table sits in slot 0x01 of page_uni2charset[]).
 * The index is the low byte of the code point; a zero entry means
 * "no mapping" and makes uni2char() return -EINVAL.
 * Only one code point on this page is representable.
 */
static const unsigned char page01[256] = {
        [0x92] = 0xc4,
};

/*
 * Unicode -> charset bytes for code points whose high byte is 0x04
 * (this table sits in slot 0x04 of page_uni2charset[]).
 * The index is the low byte of the code point; a zero entry means
 * "no mapping" and makes uni2char() return -EINVAL.
 * Each row below starts at the index given by its designator; every
 * index that is not listed stays zero.
 */
static const unsigned char page04[256] = {
        [0x00] = 0x00, 0xdd, 0xab, 0xae, 0xb8, 0xc1, 0xa7, 0xba,
        [0x08] = 0xb7, 0xbc, 0xbe, 0xcb, 0xcd, 0x00, 0xd8, 0xda,
        [0x10] = 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
        [0x18] = 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
        [0x20] = 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
        [0x28] = 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
        [0x30] = 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
        [0x38] = 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
        [0x40] = 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
        [0x48] = 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf,
        [0x50] = 0x00, 0xde, 0xac, 0xaf, 0xb9, 0xcf, 0xb4, 0xbb,
        [0x58] = 0xc0, 0xbd, 0xbf, 0xcc, 0xce, 0x00, 0xd9, 0xdb,
        [0x90] = 0xa2, 0xb6,
};

/*
 * Unicode -> charset bytes for code points whose high byte is 0x20
 * (this table sits in slot 0x20 of page_uni2charset[]).
 * The index is the low byte of the code point; a zero entry means
 * "no mapping" and makes uni2char() return -EINVAL.
 * Only the representable code points are listed; the rest stay zero.
 */
static const unsigned char page20[256] = {
        [0x13] = 0xd0,
        [0x14] = 0xd1,
        [0x18] = 0xd4,
        [0x19] = 0xd5,
        [0x1c] = 0xd2,
        [0x1d] = 0xd3,
        [0x1e] = 0xd7,
        [0x20] = 0xa0,
        [0x22] = 0xa5,
        [0x26] = 0xc9,
        [0xac] = 0xff,
};

/*
 * Unicode -> charset bytes for code points whose high byte is 0x21
 * (this table sits in slot 0x21 of page_uni2charset[]).
 * The index is the low byte of the code point; a zero entry means
 * "no mapping" and makes uni2char() return -EINVAL.
 * Only the representable code points are listed; the rest stay zero.
 */
static const unsigned char page21[256] = {
        [0x16] = 0xdc,
        [0x22] = 0xaa,
};

/*
 * Unicode -> charset bytes for code points whose high byte is 0x22
 * (this table sits in slot 0x22 of page_uni2charset[]).
 * The index is the low byte of the code point; a zero entry means
 * "no mapping" and makes uni2char() return -EINVAL.
 * Only the representable code points are listed; the rest stay zero.
 */
static const unsigned char page22[256] = {
        [0x06] = 0xc6,
        [0x1a] = 0xc3,
        [0x1e] = 0xb0,
        [0x48] = 0xc5,
        [0x60] = 0xad,
        [0x64] = 0xb2,
        [0x65] = 0xb3,
};

/*
 * First-level lookup for uni2char(): indexed by the high byte of the
 * Unicode value, each slot points at the 256-entry table for that page.
 * Slots that are not listed are NULL, meaning no code point with that
 * high byte can be represented in this charset.
 */
static const unsigned char *const page_uni2charset[256] = {
        [0x00] = page00,
        [0x01] = page01,
        [0x04] = page04,
        [0x20] = page20,
        [0x21] = page21,
        [0x22] = page22,
};

/*
 * Lower-case table handed to the NLS core through table.charset2lower.
 * Every one of the 256 entries holds 0xff.
 *
 * NOTE(review): a uniform 0xff table is unusual for a case-mapping
 * table; confirm how the NLS core (nls_tolower()) interprets it before
 * relying on case folding with this charset.
 */
static const unsigned char charset2lower[256] = {
        [0 ... 255] = 0xff,
};

/*
 * Upper-case table handed to the NLS core through table.charset2upper.
 * Every one of the 256 entries holds 0xff.
 *
 * NOTE(review): a uniform 0xff table is unusual for a case-mapping
 * table; confirm how the NLS core (nls_toupper()) interprets it before
 * relying on case folding with this charset.
 */
static const unsigned char charset2upper[256] = {
        [0 ... 255] = 0xff,
};

static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
{
        const unsigned char *uni2charset;
        unsigned char cl = uni & 0x00ff;
        unsigned char ch = (uni & 0xff00) >> 8;

        if (boundlen <= 0)
                return -ENAMETOOLONG;

        uni2charset = page_uni2charset[ch];
        if (uni2charset && uni2charset[cl])
                out[0] = uni2charset[cl];
        else
                return -EINVAL;
        return 1;
}

/*
 * char2uni - convert one charset byte to its Unicode value.
 * @rawstring: input bytes; only the first byte is read.
 * @boundlen:  not consulted by this implementation.
 * @uni:       receives the charset2uni[] entry for the byte (it is
 *             written even when the lookup fails).
 *
 * Returns 1 (the number of input bytes consumed) on success, or
 * -EINVAL when the table entry is 0x0000, i.e. the byte is unmapped.
 */
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
{
        wchar_t mapped = charset2uni[*rawstring];

        *uni = mapped;
        return mapped == 0x0000 ? -EINVAL : 1;
}

/*
 * NLS descriptor for the "maccyrillic" charset: bundles the two
 * conversion callbacks with the case-mapping tables defined above.
 * Registered and unregistered by the module init/exit hooks.
 */
static struct nls_table table = {
        .charset = "maccyrillic",

        /* Case-mapping tables. */
        .charset2lower = charset2lower,
        .charset2upper = charset2upper,

        /* Conversion callbacks. */
        .char2uni = char2uni,
        .uni2char = uni2char,
};

/*
 * Module load hook: registers the maccyrillic table with the NLS core
 * and hands register_nls()'s result back to the caller unchanged.
 */
static int __init init_nls_maccyrillic(void)
{
        int err = register_nls(&table);

        return err;
}

/*
 * Module unload hook: unregisters the table that
 * init_nls_maccyrillic() registered.
 */
static void __exit exit_nls_maccyrillic(void)
{
        unregister_nls(&table);
}

/* Hook the load/unload functions above into the kernel module machinery. */
module_init(init_nls_maccyrillic)
module_exit(exit_nls_maccyrillic)

/* License string declared for this module. */
MODULE_LICENSE("Dual BSD/GPL");





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 * on-disk ntfs structs
 */

// clang-format off
#ifndef _LINUX_NTFS3_NTFS_H
#define _LINUX_NTFS3_NTFS_H

#include <linux/blkdev.h>
#include <linux/build_bug.h>
#include <linux/kernel.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>

#include "debug.h"

/* TODO: Check 4K MFT record and 512 bytes cluster. */

/* Check each run for marked clusters. */
#define NTFS3_CHECK_FREE_CLST

/* NOTE(review): presumably the maximum name length in 16-bit units (fits the u8 'len' of cpu_str/le_str) - confirm. */
#define NTFS_NAME_LEN 255

/*
 * ntfs.sys limits a file to 500 links, although the on-disk struct
 * allows up to 0xffff.
 * xfstest generic/041 creates 3003 hardlinks.
 */
#define NTFS_LINK_MAX 4000

/*
 * Activate to use 64 bit clusters instead of the 32 bit ones used by
 * ntfs.sys. Logical and virtual cluster numbers may then be redefined
 * to use 64 bit values if needed.
 */
//#define CONFIG_NTFS3_64BIT_CLUSTER

/*
 * LZNT compression limits.
 * NTFS_LZNT_CLUSTERS is 1 << NTFS_LZNT_CUNIT, i.e. 16.
 * NOTE(review): presumably the number of clusters in one compression
 * unit and the largest supported cluster size in bytes - confirm at use sites.
 */
#define NTFS_LZNT_MAX_CLUSTER        4096
#define NTFS_LZNT_CUNIT                4
#define NTFS_LZNT_CLUSTERS        (1u<<NTFS_LZNT_CUNIT)

/*
 * GUID as laid out on disk: 4 + 2 + 2 + 8 = 16 bytes.
 * The three integer fields are little-endian; Data4 is a raw byte array.
 */
struct GUID {
        __le32 Data1;
        __le16 Data2;
        __le16 Data3;
        u8 Data4[8];
};

/*
 * This struct repeats the layout of ATTR_FILE_NAME
 * at offset 0x40.
 * It is used to store the global constants NAME_MFT/NAME_MIRROR...;
 * most constant names are shorter than 10.
 */
struct cpu_str {
        /* NOTE(review): presumably the number of 16-bit units in name[] - confirm. */
        u8 len;
        u8 unused;
        /* Name in CPU byte order; flexible array member, sized by the allocator. */
        u16 name[];
};

/*
 * Same field layout as struct cpu_str, but the name is held as
 * little-endian 16-bit units.
 */
struct le_str {
        /* NOTE(review): presumably the number of 16-bit units in name[] - confirm. */
        u8 len;
        u8 unused;
        /* Name as little-endian 16-bit units; flexible array member. */
        __le16 name[];
};

/* The build fails unless sectors are 512 bytes (SECTOR_SHIFT == 9). */
static_assert(SECTOR_SHIFT == 9);

/*
 * CLST is the cluster-number type: 64 bits wide when
 * CONFIG_NTFS3_64BIT_CLUSTER is defined (which also requires a 64-bit
 * size_t), 32 bits wide otherwise.
 */
#ifdef CONFIG_NTFS3_64BIT_CLUSTER
typedef u64 CLST;
static_assert(sizeof(size_t) == 8);
#else
typedef u32 CLST;
#endif

/*
 * Sentinel cluster numbers taken from the very top of the u64 / CLST
 * value range (-1, -2, -3 cast to the unsigned type).
 * NOTE(review): names suggest they mark sparse, resident and compressed
 * runs respectively - confirm at use sites.
 */
#define SPARSE_LCN64   ((u64)-1)
#define SPARSE_LCN     ((CLST)-1)
#define RESIDENT_LCN   ((CLST)-2)
#define COMPRESSED_LCN ((CLST)-3)

/*
 * NOTE(review): these carry the same values as NTFS_LZNT_CUNIT (4) and
 * NTFS_LZNT_MAX_CLUSTER (4096 == 0x1000) - confirm whether they are
 * meant to stay in sync.
 */
#define COMPRESSION_UNIT     4
#define COMPRESS_MAX_CLUSTER 0x1000

/*
 * Fixed MFT record numbers.
 * NOTE(review): the names suggest the well-known NTFS system files
 * (compare the NAME_* constants declared in this header) - confirm the
 * individual meanings against the on-disk format documentation.
 * The numbering has gaps: 13-15 and 17-23 have no named constant here.
 */
enum RECORD_NUM {
        MFT_REC_MFT                = 0,
        MFT_REC_MIRR                = 1,
        MFT_REC_LOG                = 2,
        MFT_REC_VOL                = 3,
        MFT_REC_ATTR                = 4,
        MFT_REC_ROOT                = 5,
        MFT_REC_BITMAP                = 6,
        MFT_REC_BOOT                = 7,
        MFT_REC_BADCLUST        = 8,
        MFT_REC_SECURE                = 9,
        MFT_REC_UPCASE                = 10,
        MFT_REC_EXTEND                = 11,
        MFT_REC_RESERVED        = 12,
        MFT_REC_FREE                = 16,
        MFT_REC_USER                = 24,
};

/*
 * Attribute type codes.
 * Every constant is passed through cpu_to_le32(), so the enum values
 * already hold the little-endian representation and can be compared
 * directly with on-disk fields without further conversion.
 * Codes advance in steps of 0x10 from 0x00 to 0x100; ATTR_END is
 * 0xFFFFFFFF.
 */
enum ATTR_TYPE {
        ATTR_ZERO                = cpu_to_le32(0x00),
        ATTR_STD                = cpu_to_le32(0x10),
        ATTR_LIST                = cpu_to_le32(0x20),
        ATTR_NAME                = cpu_to_le32(0x30),
        ATTR_ID                        = cpu_to_le32(0x40),
        ATTR_SECURE                = cpu_to_le32(0x50),
        ATTR_LABEL                = cpu_to_le32(0x60),
        ATTR_VOL_INFO                = cpu_to_le32(0x70),
        ATTR_DATA                = cpu_to_le32(0x80),
        ATTR_ROOT                = cpu_to_le32(0x90),
        ATTR_ALLOC                = cpu_to_le32(0xA0),
        ATTR_BITMAP                = cpu_to_le32(0xB0),
        ATTR_REPARSE                = cpu_to_le32(0xC0),
        ATTR_EA_INFO                = cpu_to_le32(0xD0),
        ATTR_EA                        = cpu_to_le32(0xE0),
        ATTR_PROPERTYSET        = cpu_to_le32(0xF0),
        ATTR_LOGGED_UTILITY_STREAM = cpu_to_le32(0x100),
        ATTR_END                = cpu_to_le32(0xFFFFFFFF)
};

/* The enum must be exactly 4 bytes wide. */
static_assert(sizeof(enum ATTR_TYPE) == 4);

/*
 * File attribute flag bits.
 * Every constant is passed through cpu_to_le32(), so the enum values
 * already hold the little-endian representation.
 * FILE_ATTRIBUTE_VALID_FLAGS (0x7fb7) is the OR of all flags listed
 * above it plus bits 0x10 and 0x80, which have no named constant here.
 * FILE_ATTRIBUTE_DIRECTORY and FILE_ATTRIBUTE_INDEX lie outside that mask.
 */
enum FILE_ATTRIBUTE {
        FILE_ATTRIBUTE_READONLY                = cpu_to_le32(0x00000001),
        FILE_ATTRIBUTE_HIDDEN                = cpu_to_le32(0x00000002),
        FILE_ATTRIBUTE_SYSTEM                = cpu_to_le32(0x00000004),
        FILE_ATTRIBUTE_ARCHIVE                = cpu_to_le32(0x00000020),
        FILE_ATTRIBUTE_DEVICE                = cpu_to_le32(0x00000040),
        FILE_ATTRIBUTE_TEMPORARY        = cpu_to_le32(0x00000100),
        FILE_ATTRIBUTE_SPARSE_FILE        = cpu_to_le32(0x00000200),
        FILE_ATTRIBUTE_REPARSE_POINT        = cpu_to_le32(0x00000400),
        FILE_ATTRIBUTE_COMPRESSED        = cpu_to_le32(0x00000800),
        FILE_ATTRIBUTE_OFFLINE                = cpu_to_le32(0x00001000),
        FILE_ATTRIBUTE_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000),
        FILE_ATTRIBUTE_ENCRYPTED        = cpu_to_le32(0x00004000),
        FILE_ATTRIBUTE_VALID_FLAGS        = cpu_to_le32(0x00007fb7),
        FILE_ATTRIBUTE_DIRECTORY        = cpu_to_le32(0x10000000),
        FILE_ATTRIBUTE_INDEX                = cpu_to_le32(0x20000000)
};

/* The enum must be exactly 4 bytes wide. */
static_assert(sizeof(enum FILE_ATTRIBUTE) == 4);

/*
 * Global name constants held as cpu_str (CPU byte order); they are
 * only declared here and defined elsewhere.
 */
extern const struct cpu_str NAME_MFT;
extern const struct cpu_str NAME_MIRROR;
extern const struct cpu_str NAME_LOGFILE;
extern const struct cpu_str NAME_VOLUME;
extern const struct cpu_str NAME_ATTRDEF;
extern const struct cpu_str NAME_ROOT;
extern const struct cpu_str NAME_BITMAP;
extern const struct cpu_str NAME_BOOT;
extern const struct cpu_str NAME_BADCLUS;
extern const struct cpu_str NAME_QUOTA;
extern const struct cpu_str NAME_SECURE;
extern const struct cpu_str NAME_UPCASE;
extern const struct cpu_str NAME_EXTEND;
extern const struct cpu_str NAME_OBJID;
extern const struct cpu_str NAME_REPARSE;
extern const struct cpu_str NAME_USNJRNL;

/*
 * Fixed-length little-endian 16-bit name arrays.
 * NOTE(review): presumably index/stream names whose length equals the
 * array size (no terminator) - confirm against the definitions.
 */
extern const __le16 I30_NAME[4];
extern const __le16 SII_NAME[4];
extern const __le16 SDH_NAME[4];
extern const __le16 SO_NAME[2];
extern const __le16 SQ_NAME[2];
extern const __le16 SR_NAME[2];

extern const __le16 BAD_NAME[4];
extern const __le16 SDS_NAME[4];
extern const __le16 WOF_NAME[17];        /* WofCompressedData */

/*
 * MFT record number structure.
 * A 48-bit record number (low 32 bits + high 16 bits) followed by a
 * 16-bit sequence number, all little-endian; 8 bytes in total.
 */
struct MFT_REF {
        __le32 low;        // The low part of the number.
        __le16 high;        // The high part of the number.
        __le16 seq;        // The sequence number of MFT record.
};

/* The reference must occupy exactly the space of one __le64. */
static_assert(sizeof(__le64) == sizeof(struct MFT_REF));

/*
 * ino_get - Extract the MFT record number from a reference (CPU order).
 *
 * The high 16 bits of the reference are folded in only when the driver
 * is built with CONFIG_NTFS3_64BIT_CLUSTER; otherwise just the low
 * 32 bits are returned. The sequence number is never part of the result.
 */
static inline CLST ino_get(const struct MFT_REF *ref)
{
#ifdef CONFIG_NTFS3_64BIT_CLUSTER
        u64 high_part = (u64)le16_to_cpu(ref->high) << 32;

        return le32_to_cpu(ref->low) | high_part;
#else
        return le32_to_cpu(ref->low);
#endif
}

/*
 * Layout of the NTFS boot sector. The structure is exactly 0x200 bytes
 * (checked below); the hex value in each comment is the field offset.
 */
struct NTFS_BOOT {
        u8 jump_code[3];        // 0x00: Jump to boot code.
        u8 system_id[8];        // 0x03: System ID, equals "NTFS    "

        // NOTE: This member is not aligned(!), hence the byte array.
        // bytes_per_sector[0] must be 0.
        // bytes_per_sector[1] must be multiplied by 256.
        u8 bytes_per_sector[2];        // 0x0B: Bytes per sector.

        u8 sectors_per_clusters;// 0x0D: Sectors per cluster.
        u8 unused1[7];
        u8 media_type;                // 0x15: Media type (0xF8 - harddisk)
        u8 unused2[2];
        __le16 sct_per_track;        // 0x18: Number of sectors per track.
        __le16 heads;                // 0x1A: Number of heads per cylinder.
        __le32 hidden_sectors;        // 0x1C: Number of 'hidden' sectors.
        u8 unused3[4];
        u8 bios_drive_num;        // 0x24: BIOS drive number =0x80.
        u8 unused4;
        u8 signature_ex;        // 0x26: Extended BOOT signature =0x80.
        u8 unused5;
        __le64 sectors_per_volume;// 0x28: Size of volume in sectors.
        __le64 mft_clst;        // 0x30: First cluster of $MFT
        __le64 mft2_clst;        // 0x38: First cluster of $MFTMirr
        s8 record_size;                // 0x40: Size of MFT record in clusters(sectors).
        u8 unused6[3];
        s8 index_size;                // 0x44: Size of INDX record in clusters(sectors).
        u8 unused7[3];
        __le64 serial_num;        // 0x48: Volume serial number
        __le32 check_sum;        // 0x50: Simple additive checksum of all
                                // of the u32's which precede the 'check_sum'.

        // Sized so that boot_magic lands on the last two bytes of the sector.
        u8 boot_code[0x200 - 0x50 - 2 - 4]; // 0x54:
        u8 boot_magic[2];        // 0x1FE: Boot signature =0x55 + 0xAA
};

static_assert(sizeof(struct NTFS_BOOT) == 0x200);

/*
 * Four-byte record signatures, stored little-endian so they can be
 * compared directly with the 'sign' field of struct NTFS_RECORD_HEADER.
 * Each value spells the ASCII tag shown in its comment when read as bytes.
 */
enum NTFS_SIGNATURE {
        NTFS_FILE_SIGNATURE = cpu_to_le32(0x454C4946), // 'FILE'
        NTFS_INDX_SIGNATURE = cpu_to_le32(0x58444E49), // 'INDX'
        NTFS_CHKD_SIGNATURE = cpu_to_le32(0x444B4843), // 'CHKD'
        NTFS_RSTR_SIGNATURE = cpu_to_le32(0x52545352), // 'RSTR'
        NTFS_RCRD_SIGNATURE = cpu_to_le32(0x44524352), // 'RCRD'
        NTFS_BAAD_SIGNATURE = cpu_to_le32(0x44414142), // 'BAAD'
        NTFS_HOLE_SIGNATURE = cpu_to_le32(0x454C4F48), // 'HOLE'
        NTFS_FFFF_SIGNATURE = cpu_to_le32(0xffffffff),
};

/* The enum is used as a 4-byte field inside fixed-layout structures. */
static_assert(sizeof(enum NTFS_SIGNATURE) == 4);

/*
 * Common record header. It starts both struct MFT_REC ('FILE') and
 * struct INDEX_BUFFER ('INDX'). Exactly 0x10 bytes (checked below).
 */
struct NTFS_RECORD_HEADER {
        /* Record magic number, equals 'FILE'/'INDX'/'RSTR'/'RCRD'. */
        enum NTFS_SIGNATURE sign; // 0x00: See enum NTFS_SIGNATURE.
        __le16 fix_off;                // 0x04: Offset of the fixup array from the record start.
        __le16 fix_num;                // 0x06: Number of entries in the fixup array.
        __le64 lsn;                // 0x08: Log file sequence number.
};

static_assert(sizeof(struct NTFS_RECORD_HEADER) == 0x10);

static inline int is_baad(const struct NTFS_RECORD_HEADER *hdr)
{
        return hdr->sign == NTFS_BAAD_SIGNATURE;
}

/*
 * Possible bits in struct MFT_REC.flags.
 * Values are little-endian 16-bit constants, so they can be tested
 * against the __le16 'flags' field without byte swapping.
 */
enum RECORD_FLAG {
        RECORD_FLAG_IN_USE        = cpu_to_le16(0x0001),
        RECORD_FLAG_DIR                = cpu_to_le16(0x0002),
        RECORD_FLAG_SYSTEM        = cpu_to_le16(0x0004),
        RECORD_FLAG_INDEX        = cpu_to_le16(0x0008),
};

/*
 * MFT record structure.
 *
 * Begins with the common record header, followed by the fixed fields
 * below. The fixup array position is given by rhdr.fix_off and may be
 * either at 'res' (0x2A) or at 'fixups' (0x30); see
 * MFTRECORD_FIXUP_OFFSET_1/_3 and is_mft_rec5().
 */
struct MFT_REC {
        struct NTFS_RECORD_HEADER rhdr; // 0x00: 'FILE'

        __le16 seq;                // 0x10: Sequence number for this record.
        __le16 hard_links;        // 0x12: The number of hard links to record.
        __le16 attr_off;        // 0x14: Offset to attributes.
        __le16 flags;                // 0x16: See RECORD_FLAG.
        __le32 used;                // 0x18: The size of used part.
        __le32 total;                // 0x1C: Total record size.

        struct MFT_REF parent_ref; // 0x20: Parent MFT record (all zero for a base record).
        __le16 next_attr_id;        // 0x28: The next attribute Id.

        __le16 res;                // 0x2A: High part of MFT record?
        __le32 mft_record;        // 0x2C: Current MFT record number.
        __le16 fixups[];        // 0x30:
};

/* The two possible positions of the fixup array inside struct MFT_REC. */
#define MFTRECORD_FIXUP_OFFSET_1 offsetof(struct MFT_REC, res)
#define MFTRECORD_FIXUP_OFFSET_3 offsetof(struct MFT_REC, fixups)
/*
 * Define MFTRECORD_FIXUP_OFFSET as MFTRECORD_FIXUP_OFFSET_3 (0x30)
 * to format new mft records with the bigger header (as current ntfs.sys does).
 *
 * Define MFTRECORD_FIXUP_OFFSET as MFTRECORD_FIXUP_OFFSET_1 (0x2A)
 * to format new mft records with the smaller header (as old ntfs.sys did).
 * Both variants are valid.
 */
#define MFTRECORD_FIXUP_OFFSET  MFTRECORD_FIXUP_OFFSET_1

/* Pin the numeric values quoted in the comment above. */
static_assert(MFTRECORD_FIXUP_OFFSET_1 == 0x2A);
static_assert(MFTRECORD_FIXUP_OFFSET_3 == 0x30);

static inline bool is_rec_base(const struct MFT_REC *rec)
{
        const struct MFT_REF *r = &rec->parent_ref;

        return !r->low && !r->high && !r->seq;
}

static inline bool is_mft_rec5(const struct MFT_REC *rec)
{
        return le16_to_cpu(rec->rhdr.fix_off) >=
               offsetof(struct MFT_REC, fixups);
}

/* is_rec_inuse - Return true if RECORD_FLAG_IN_USE is set in the record flags. */
static inline bool is_rec_inuse(const struct MFT_REC *rec)
{
        return (rec->flags & RECORD_FLAG_IN_USE) != 0;
}

/*
 * clear_rec_inuse - Clear RECORD_FLAG_IN_USE in the record flags.
 *
 * Returns true if any other flag bit is still set after the clear,
 * false if the flags field became zero.
 */
static inline bool clear_rec_inuse(struct MFT_REC *rec)
{
        rec->flags &= ~RECORD_FLAG_IN_USE;

        return rec->flags;
}

/* Possible values of ATTR_RESIDENT.flags */
#define RESIDENT_FLAG_INDEXED 0x01

/*
 * Resident-specific part of an attribute header. It lives in the union
 * inside struct ATTRIB, so the offsets in the comments are counted from
 * the start of struct ATTRIB (the part itself is 8 bytes).
 */
struct ATTR_RESIDENT {
        __le32 data_size;        // 0x10: The size of data.
        __le16 data_off;        // 0x14: Offset to data.
        u8 flags;                // 0x16: Resident flags ( 1 - indexed ).
        u8 res;                        // 0x17:
}; // Full resident header (common part + this) = 0x18, see SIZEOF_RESIDENT

/*
 * Non-resident-specific part of an attribute header. It lives in the
 * union inside struct ATTRIB, so the offsets in the comments are counted
 * from the start of struct ATTRIB (the part itself is 0x38 bytes).
 */
struct ATTR_NONRESIDENT {
        __le64 svcn;                // 0x10: Starting VCN of this segment.
        __le64 evcn;                // 0x18: End VCN of this segment.
        __le16 run_off;                // 0x20: Offset to packed runs.
        // Unit of Compression size for this stream, expressed
        // as a log of the cluster size.
        //
        // 0 means file is not compressed
        // 1, 2, 3, and 4 are potentially legal values if the
        // stream is compressed, however the implementation
        // may only choose to use 4, or possibly 3.
        // Note that 4 means cluster size times 16.
        // If convenient the implementation may wish to accept a
        // reasonable range of legal values here (1-5?),
        // even if the implementation only generates
        // a smaller set of values itself.
        u8 c_unit;                // 0x22:
        u8 res1[5];                // 0x23:
        __le64 alloc_size;        // 0x28: The allocated size of attribute in bytes.
                                // (multiple of cluster size)
        __le64 data_size;        // 0x30: The size of attribute in bytes <= alloc_size.
        __le64 valid_size;        // 0x38: The size of valid part in bytes <= data_size.
        __le64 total_size;        // 0x40: The sum of the allocated clusters for a file.
                                // (present only for the first segment (0 == vcn)
                                // of compressed attribute)

}; // Full non-resident header = 0x40, or 0x48 when total_size is present (if compressed)

/*
 * Possible values of ATTRIB.flags.
 * All constants are little-endian 16-bit, matching the __le16 field.
 */
#define ATTR_FLAG_COMPRESSED          cpu_to_le16(0x0001)
#define ATTR_FLAG_COMPRESSED_MASK cpu_to_le16(0x00FF)
#define ATTR_FLAG_ENCRYPTED          cpu_to_le16(0x4000)
#define ATTR_FLAG_SPARSED          cpu_to_le16(0x8000)

/*
 * Attribute header.
 *
 * A common 0x10-byte part is followed by either the resident or the
 * non-resident specific part; 'non_res' tells which union member is valid.
 */
struct ATTRIB {
        enum ATTR_TYPE type;        // 0x00: The type of this attribute.
        __le32 size;                // 0x04: The size of this attribute.
        u8 non_res;                // 0x08: Is this attribute non-resident?
        u8 name_len;                // 0x09: This attribute name length.
        __le16 name_off;        // 0x0A: Offset to the attribute name.
        __le16 flags;                // 0x0C: See ATTR_FLAG_XXX.
        __le16 id;                // 0x0E: Unique id (per record).

        union {
                struct ATTR_RESIDENT res;     // 0x10: Valid when non_res == 0.
                struct ATTR_NONRESIDENT nres; // 0x10: Valid when non_res != 0.
        };
};

/*
 * Attribute header sizes (common part of struct ATTRIB plus the
 * resident / non-resident specific part). The _EX variant includes
 * ATTR_NONRESIDENT.total_size.
 */
#define SIZEOF_RESIDENT                        0x18
#define SIZEOF_NONRESIDENT_EX                0x48
#define SIZEOF_NONRESIDENT                0x40

/* The same sizes as little-endian 16-bit constants. */
#define SIZEOF_RESIDENT_LE                cpu_to_le16(0x18)
#define SIZEOF_NONRESIDENT_EX_LE        cpu_to_le16(0x48)
#define SIZEOF_NONRESIDENT_LE                cpu_to_le16(0x40)

/*
 * attr_ondisk_size - Size the attribute data occupies, in bytes.
 *
 * Resident:                 data_size rounded up to 8 bytes.
 * Non-resident, compressed
 * or sparsed:               nres.total_size.
 * Other non-resident:       nres.alloc_size.
 */
static inline u64 attr_ondisk_size(const struct ATTRIB *attr)
{
        if (!attr->non_res)
                return ALIGN(le32_to_cpu(attr->res.data_size), 8);

        if (attr->flags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED))
                return le64_to_cpu(attr->nres.total_size);

        return le64_to_cpu(attr->nres.alloc_size);
}

/*
 * attr_size - Data size of the attribute in bytes (CPU order).
 *
 * Reads nres.data_size for a non-resident attribute and res.data_size
 * for a resident one.
 */
static inline u64 attr_size(const struct ATTRIB *attr)
{
        if (attr->non_res)
                return le64_to_cpu(attr->nres.data_size);

        return le32_to_cpu(attr->res.data_size);
}

/* is_attr_encrypted - Return true if ATTR_FLAG_ENCRYPTED is set. */
static inline bool is_attr_encrypted(const struct ATTRIB *attr)
{
        return (attr->flags & ATTR_FLAG_ENCRYPTED) != 0;
}

/* is_attr_sparsed - Return true if ATTR_FLAG_SPARSED is set. */
static inline bool is_attr_sparsed(const struct ATTRIB *attr)
{
        return (attr->flags & ATTR_FLAG_SPARSED) != 0;
}

/* is_attr_compressed - Return true if ATTR_FLAG_COMPRESSED is set. */
static inline bool is_attr_compressed(const struct ATTRIB *attr)
{
        return (attr->flags & ATTR_FLAG_COMPRESSED) != 0;
}

static inline bool is_attr_ext(const struct ATTRIB *attr)
{
        return attr->flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED);
}

static inline bool is_attr_indexed(const struct ATTRIB *attr)
{
        return !attr->non_res && (attr->res.flags & RESIDENT_FLAG_INDEXED);
}

static inline __le16 const *attr_name(const struct ATTRIB *attr)
{
        return Add2Ptr(attr, le16_to_cpu(attr->name_off));
}

/*
 * attr_svcn - Starting VCN of the attribute segment.
 *
 * Returns nres.svcn for a non-resident attribute and 0 for a resident one.
 */
static inline u64 attr_svcn(const struct ATTRIB *attr)
{
        if (!attr->non_res)
                return 0;

        return le64_to_cpu(attr->nres.svcn);
}

/*
 * Layout checks for struct ATTRIB: the whole header is 0x48 bytes,
 * the resident part 0x08 bytes and the non-resident part 0x38 bytes.
 */
static_assert(sizeof(struct ATTRIB) == 0x48);
static_assert(sizeof(((struct ATTRIB *)NULL)->res) == 0x08);
static_assert(sizeof(((struct ATTRIB *)NULL)->nres) == 0x38);

/*
 * resident_data_ex - Bounds-checked access to resident attribute data.
 * @attr:     attribute header to read from.
 * @datasize: minimum number of data bytes the caller needs.
 *
 * Return: pointer to the data (attr + res.data_off), or NULL if
 *  - the attribute is non-resident,
 *  - data_off + @datasize does not fit inside attr->size, or
 *  - res.data_size is smaller than @datasize.
 */
static inline void *resident_data_ex(const struct ATTRIB *attr, u32 datasize)
{
        u32 asize, rsize;
        u16 off;

        if (attr->non_res)
                return NULL;

        asize = le32_to_cpu(attr->size);
        off = le16_to_cpu(attr->res.data_off);

        /*
         * Same test as 'asize < datasize + off', but written with a
         * subtraction so a huge 'datasize' cannot wrap the u32 sum and
         * slip past the check.
         */
        if (off > asize || datasize > asize - off)
                return NULL;

        rsize = le32_to_cpu(attr->res.data_size);
        if (rsize < datasize)
                return NULL;

        return Add2Ptr(attr, off);
}

static inline void *resident_data(const struct ATTRIB *attr)
{
        return Add2Ptr(attr, le16_to_cpu(attr->res.data_off));
}

static inline void *attr_run(const struct ATTRIB *attr)
{
        return Add2Ptr(attr, le16_to_cpu(attr->nres.run_off));
}

/*
 * Standard information attribute (0x10), short form.
 * Exactly 0x30 bytes (checked below); struct ATTR_STD_INFO5 is the
 * extended form that starts with the same fields.
 */
struct ATTR_STD_INFO {
        __le64 cr_time;                // 0x00: File creation time.
        __le64 m_time;                // 0x08: File modification time.
        __le64 c_time;                // 0x10: Last time any attribute was modified.
        __le64 a_time;                // 0x18: File last access time.
        enum FILE_ATTRIBUTE fa;        // 0x20: Standard DOS attributes & more.
        __le32 max_ver_num;        // 0x24: Maximum Number of Versions.
        __le32 ver_num;                // 0x28: Version Number.
        __le32 class_id;        // 0x2C: Class Id from bidirectional Class Id index.
};

static_assert(sizeof(struct ATTR_STD_INFO) == 0x30);

/*
 * Special security id values (host byte order).
 * NOTE(review): presumably compared against ATTR_STD_INFO5.security_id
 * after conversion from little-endian - confirm at the call sites.
 */
#define SECURITY_ID_INVALID 0x00000000
#define SECURITY_ID_FIRST 0x00000100

/*
 * Standard information attribute (0x10), extended form.
 * The first 0x30 bytes match struct ATTR_STD_INFO; owner, security,
 * quota and USN fields follow. Exactly 0x48 bytes (checked below).
 */
struct ATTR_STD_INFO5 {
        __le64 cr_time;                // 0x00: File creation time.
        __le64 m_time;                // 0x08: File modification time.
        __le64 c_time;                // 0x10: Last time any attribute was modified.
        __le64 a_time;                // 0x18: File last access time.
        enum FILE_ATTRIBUTE fa;        // 0x20: Standard DOS attributes & more.
        __le32 max_ver_num;        // 0x24: Maximum Number of Versions.
        __le32 ver_num;                // 0x28: Version Number.
        __le32 class_id;        // 0x2C: Class Id from bidirectional Class Id index.

        __le32 owner_id;        // 0x30: Owner Id of the user owning the file.
        __le32 security_id;        // 0x34: The Security Id is a key in the $SII Index and $SDS.
        __le64 quota_charge;        // 0x38:
        __le64 usn;                // 0x40: Last Update Sequence Number of the file. This is a direct
                                // index into the file $UsnJrnl. If zero, the USN Journal is
                                // disabled.
};

static_assert(sizeof(struct ATTR_STD_INFO5) == 0x48);

/*
 * Attribute list entry structure (0x20).
 *
 * Variable-length: the fixed part is followed by the attribute name.
 * The name must be located through 'name_off' (see le_name()), not by
 * assuming it starts at the 'name' member. See le_size() for the size
 * of an entry with a given name length.
 */
struct ATTR_LIST_ENTRY {
        enum ATTR_TYPE type;        // 0x00: The type of attribute.
        __le16 size;                // 0x04: The size of this record.
        u8 name_len;                // 0x06: The length of attribute name.
        u8 name_off;                // 0x07: The offset to attribute name.
        __le64 vcn;                // 0x08: Starting VCN of this attribute.
        struct MFT_REF ref;        // 0x10: MFT record number with attribute.
        __le16 id;                // 0x18: struct ATTRIB ID.
        __le16 name[];                // 0x1A: To get real name use name_off.

}; // sizeof() = 0x20

/*
 * le_size - Size in bytes of an attribute list entry whose name has
 * @name_len 16-bit characters: the offset of 'name' plus the name
 * bytes, rounded up to a multiple of 8.
 */
static inline u32 le_size(u8 name_len)
{
        size_t name_bytes = name_len * sizeof(short);

        return ALIGN(offsetof(struct ATTR_LIST_ENTRY, name) + name_bytes, 8);
}

/*
 * le_cmp - Compare an attribute list entry with an attribute.
 * @le:   attribute list entry.
 * @attr: attribute header.
 *
 * Return: 0 if 'attr' has the same type and name as 'le', 1 otherwise.
 *
 * The names are compared bytewise, name_len 16-bit units each. The
 * previous code ran memcmp() only when name_len was zero, where it
 * trivially reports equality, so two different names of equal length
 * were treated as the same.
 */
static inline int le_cmp(const struct ATTR_LIST_ENTRY *le,
                         const struct ATTRIB *attr)
{
        if (le->type != attr->type || le->name_len != attr->name_len)
                return 1;

        /* Both unnamed: nothing more to compare. */
        if (!le->name_len)
                return 0;

        return memcmp(Add2Ptr(le, le->name_off),
                      Add2Ptr(attr, le16_to_cpu(attr->name_off)),
                      le->name_len * sizeof(short)) != 0;
}

static inline __le16 const *le_name(const struct ATTR_LIST_ENTRY *le)
{
        return Add2Ptr(le, le->name_off);
}

/*
 * File name types (the field type in struct ATTR_FILE_NAME).
 * UNICODE and DOS are single bits; FILE_NAME_UNICODE_AND_DOS is both.
 */
#define FILE_NAME_POSIX   0
#define FILE_NAME_UNICODE 1
#define FILE_NAME_DOS          2
#define FILE_NAME_UNICODE_AND_DOS (FILE_NAME_DOS | FILE_NAME_UNICODE)

/*
 * Filename attribute structure (0x30): duplicated file information.
 * This 0x38-byte block is embedded in struct ATTR_FILE_NAME at offset
 * 0x08; the offsets in the comments are relative to this structure.
 */
struct NTFS_DUP_INFO {
        __le64 cr_time;                // 0x00: File creation time.
        __le64 m_time;                // 0x08: File modification time.
        __le64 c_time;                // 0x10: Last time any attribute was modified.
        __le64 a_time;                // 0x18: File last access time.
        __le64 alloc_size;        // 0x20: Data attribute allocated size, multiple of cluster size.
        __le64 data_size;        // 0x28: Data attribute size <= alloc_size.
        enum FILE_ATTRIBUTE fa;        // 0x30: Standard DOS attributes & more.
        __le16 ea_size;                // 0x34: Packed EAs.
        __le16 reparse;                // 0x36: Used by Reparse.

}; // sizeof() = 0x38

/*
 * File name attribute value: parent directory reference, duplicated
 * file information and the variable-length name itself.
 */
struct ATTR_FILE_NAME {
        struct MFT_REF home;        // 0x00: MFT record for directory.
        struct NTFS_DUP_INFO dup;// 0x08:
        u8 name_len;                // 0x40: File name length in words.
        u8 type;                // 0x41: File name type, see FILE_NAME_XXX.
        __le16 name[];                // 0x42: File name.
};

static_assert(sizeof(((struct ATTR_FILE_NAME *)NULL)->dup) == 0x38);
static_assert(offsetof(struct ATTR_FILE_NAME, name) == 0x42);
/* Fixed part (0x42) plus one 16-bit name character. */
#define SIZEOF_ATTRIBUTE_FILENAME     0x44
/* Fixed part (0x42) plus the longest name: 255 16-bit characters. */
#define SIZEOF_ATTRIBUTE_FILENAME_MAX (0x42 + 255 * 2)

/*
 * attr_from_name - Get the attribute header that owns a file name.
 *
 * Steps back SIZEOF_RESIDENT bytes from @fname, i.e. it assumes the
 * file name is the data of a resident attribute placed directly after
 * the resident header.
 */
static inline struct ATTRIB *attr_from_name(struct ATTR_FILE_NAME *fname)
{
        char *hdr_start = (char *)fname - SIZEOF_RESIDENT;

        return (struct ATTRIB *)hdr_start;
}

static inline u16 fname_full_size(const struct ATTR_FILE_NAME *fname)
{
        /* Don't return struct_size(fname, name, fname->name_len); */
        return offsetof(struct ATTR_FILE_NAME, name) +
               fname->name_len * sizeof(short);
}

/*
 * paired_name - Name type that forms a pair with @type.
 *
 * FILE_NAME_UNICODE <-> FILE_NAME_DOS; every other value (including
 * FILE_NAME_UNICODE_AND_DOS) maps to FILE_NAME_POSIX.
 */
static inline u8 paired_name(u8 type)
{
        switch (type) {
        case FILE_NAME_UNICODE:
                return FILE_NAME_DOS;
        case FILE_NAME_DOS:
                return FILE_NAME_UNICODE;
        default:
                return FILE_NAME_POSIX;
        }
}

/*
 * Index entry flags (the field 'flags' in struct NTFS_DE).
 * Little-endian 16-bit constants, matching the __le16 field.
 */
#define NTFS_IE_HAS_SUBNODES        cpu_to_le16(1)
#define NTFS_IE_LAST                cpu_to_le16(2)

/*
 * Directory (index) entry structure.
 *
 * Fixed 0x10-byte header (checked below). The key follows the header,
 * and when NTFS_IE_HAS_SUBNODES is set the last 8 bytes of the entry
 * (at offset 'size' - 8) hold the VBN of the subnode; see de_get_vbn().
 */
struct NTFS_DE {
        union {
                struct MFT_REF ref; // 0x00: MFT record number with this file.
                struct {
                        __le16 data_off;  // 0x00:
                        __le16 data_size; // 0x02:
                        __le32 res;          // 0x04: Must be 0.
                } view;
        };
        __le16 size;                // 0x08: The size of this entry.
        __le16 key_size;        // 0x0A: The size of File name length in bytes + 0x42.
        __le16 flags;                // 0x0C: Entry flags: NTFS_IE_XXX.
        __le16 res;                // 0x0E:

        // Here any indexed attribute can be placed.
        // One of them is:
        // struct ATTR_FILE_NAME AttrFileName;
        //

        // The last 8 bytes of this structure contain
        // the VBN of subnode.
        // !!! Note !!!
        // This field is present only if (flags & NTFS_IE_HAS_SUBNODES)
        // __le64 vbn;
};

static_assert(sizeof(struct NTFS_DE) == 0x10);

/*
 * de_set_vbn_le - Store an already little-endian subnode VBN in the
 * last 8 bytes of the entry (offset e->size - 8). The caller must make
 * sure the entry really has room for it.
 */
static inline void de_set_vbn_le(struct NTFS_DE *e, __le64 vcn)
{
        size_t vbn_off = le16_to_cpu(e->size) - sizeof(__le64);
        __le64 *slot = Add2Ptr(e, vbn_off);

        *slot = vcn;
}

/*
 * de_set_vbn - Store a CPU-order subnode VBN, converted to
 * little-endian, in the last 8 bytes of the entry (offset e->size - 8).
 */
static inline void de_set_vbn(struct NTFS_DE *e, CLST vcn)
{
        size_t vbn_off = le16_to_cpu(e->size) - sizeof(__le64);
        __le64 *slot = Add2Ptr(e, vbn_off);

        *slot = cpu_to_le64(vcn);
}

static inline __le64 de_get_vbn_le(const struct NTFS_DE *e)
{
        return *(__le64 *)Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64));
}

static inline CLST de_get_vbn(const struct NTFS_DE *e)
{
        __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64));

        return le64_to_cpu(*v);
}

static inline struct NTFS_DE *de_get_next(const struct NTFS_DE *e)
{
        return Add2Ptr(e, le16_to_cpu(e->size));
}

/*
 * de_get_fname - File name key stored right after the entry header.
 *
 * Returns NULL when key_size is smaller than SIZEOF_ATTRIBUTE_FILENAME,
 * i.e. the key is too small to be a file name.
 */
static inline struct ATTR_FILE_NAME *de_get_fname(const struct NTFS_DE *e)
{
        if (le16_to_cpu(e->key_size) < SIZEOF_ATTRIBUTE_FILENAME)
                return NULL;

        return Add2Ptr(e, sizeof(struct NTFS_DE));
}

/* de_is_last - Return true if the entry has NTFS_IE_LAST set. */
static inline bool de_is_last(const struct NTFS_DE *e)
{
        return (e->flags & NTFS_IE_LAST) != 0;
}

/* de_has_vcn - Return true if the entry has NTFS_IE_HAS_SUBNODES set. */
static inline bool de_has_vcn(const struct NTFS_DE *e)
{
        return (e->flags & NTFS_IE_HAS_SUBNODES) != 0;
}

static inline bool de_has_vcn_ex(const struct NTFS_DE *e)
{
        return (e->flags & NTFS_IE_HAS_SUBNODES) &&
               (u64)(-1) != *((u64 *)Add2Ptr(e, le16_to_cpu(e->size) -
                                                        sizeof(__le64)));
}

/*
 * Largest size of a file name index entry: entry header, fixed part of
 * the file name attribute and NTFS_NAME_LEN 16-bit characters, rounded
 * up to a multiple of 8.
 */
#define MAX_BYTES_PER_NAME_ENTRY \
        ALIGN(sizeof(struct NTFS_DE) + \
              offsetof(struct ATTR_FILE_NAME, name) + \
              NTFS_NAME_LEN * sizeof(short), 8)

/*
 * Index header. It precedes the entries of both an index root and an
 * index buffer; all offsets and sizes are relative to the start of this
 * structure. Exactly 0x10 bytes (checked below).
 */
struct INDEX_HDR {
        __le32 de_off;        // 0x00: The offset from the start of this structure
                        // to the first NTFS_DE.
        __le32 used;        // 0x04: The size of this structure plus all
                        // entries (quad-word aligned).
        __le32 total;        // 0x08: The allocated size for this structure plus all entries.
        u8 flags;        // 0x0C: 0x00 = Small directory, 0x01 = Large directory.
        u8 res[3];

        //
        // de_off + used <= total
        //
};

static_assert(sizeof(struct INDEX_HDR) == 0x10);

/*
 * hdr_first_de - Get the first entry of an index header, validated.
 *
 * Return: the entry at hdr + de_off, or NULL if
 *  - de_off is not below 'used',
 *  - an entry header does not fit between de_off and 'used',
 *  - the entry size is smaller than an entry header, or
 *  - the entry itself extends past 'used'.
 */
static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr)
{
        u32 first_off = le32_to_cpu(hdr->de_off);
        u32 used_bytes = le32_to_cpu(hdr->used);
        struct NTFS_DE *first;
        u16 first_size;

        if (first_off >= used_bytes)
                return NULL;

        if (first_off + sizeof(struct NTFS_DE) > used_bytes)
                return NULL;

        first = Add2Ptr(hdr, first_off);
        first_size = le16_to_cpu(first->size);

        if (first_size < sizeof(struct NTFS_DE))
                return NULL;

        if (first_off + first_size > used_bytes)
                return NULL;

        return first;
}

/*
 * hdr_next_de - Get the entry that follows @e, validated.
 *
 * Return: e + e->size, or NULL if
 *  - @e does not start below 'used',
 *  - the size of @e is smaller than an entry header, or
 *  - there is no room for another entry header after @e within 'used'.
 */
static inline struct NTFS_DE *hdr_next_de(const struct INDEX_HDR *hdr,
                                          const struct NTFS_DE *e)
{
        size_t cur_off = PtrOffset(hdr, e);
        u32 used_bytes = le32_to_cpu(hdr->used);
        u16 cur_size;

        if (cur_off >= used_bytes)
                return NULL;

        cur_size = le16_to_cpu(e->size);

        if (cur_size < sizeof(struct NTFS_DE))
                return NULL;

        if (cur_off + cur_size + sizeof(struct NTFS_DE) > used_bytes)
                return NULL;

        return Add2Ptr(e, cur_size);
}

/* hdr_has_subnode - Return true if bit 0 of the index header flags is set. */
static inline bool hdr_has_subnode(const struct INDEX_HDR *hdr)
{
        return (hdr->flags & 1) != 0;
}

/*
 * Index buffer: common record header, the buffer's own number and the
 * index header that describes the entries. Exactly 0x28 bytes
 * (checked below).
 */
struct INDEX_BUFFER {
        struct NTFS_RECORD_HEADER rhdr; // 0x00: 'INDX'
        __le64 vbn; // 0x10: vcn if index >= cluster or vsn if index < cluster
        struct INDEX_HDR ihdr; // 0x18:
};

static_assert(sizeof(struct INDEX_BUFFER) == 0x28);

static inline bool ib_is_empty(const struct INDEX_BUFFER *ib)
{
        const struct NTFS_DE *first = hdr_first_de(&ib->ihdr);

        return !first || de_is_last(first);
}

/* ib_is_leaf - Return true if bit 0 of the index header flags is clear. */
static inline bool ib_is_leaf(const struct INDEX_BUFFER *ib)
{
        return (ib->ihdr.flags & 1) == 0;
}

/*
 * Collation rules stored in struct INDEX_ROOT.rule and
 * struct ATTR_DEF_ENTRY.rule. Values are little-endian 32-bit; the
 * comment above each value names the indexes that use it.
 */
enum COLLATION_RULE {
        NTFS_COLLATION_TYPE_BINARY        = cpu_to_le32(0),
        // $I30
        NTFS_COLLATION_TYPE_FILENAME        = cpu_to_le32(0x01),
        // $SII of $Secure and $Q of Quota
        NTFS_COLLATION_TYPE_UINT        = cpu_to_le32(0x10),
        // $O of Quota
        NTFS_COLLATION_TYPE_SID                = cpu_to_le32(0x11),
        // $SDH of $Secure
        NTFS_COLLATION_TYPE_SECURITY_HASH = cpu_to_le32(0x12),
        // $O of ObjId and "$R" for Reparse
        NTFS_COLLATION_TYPE_UINTS        = cpu_to_le32(0x13)
};

/* The enum is used as a 4-byte field inside fixed-layout structures. */
static_assert(sizeof(enum COLLATION_RULE) == 4);

/*
 * Index root structure ( 0x90 ).
 * A 0x10-byte fixed part followed by the index header; 0x20 bytes in
 * total (checked below).
 */
struct INDEX_ROOT {
        enum ATTR_TYPE type;        // 0x00: The type of attribute to index on.
        enum COLLATION_RULE rule; // 0x04: The rule.
        __le32 index_block_size;// 0x08: The size of index record.
        u8 index_block_clst;        // 0x0C: The number of clusters or sectors per index.
        u8 res[3];
        struct INDEX_HDR ihdr;        // 0x10:
};

static_assert(sizeof(struct INDEX_ROOT) == 0x20);
static_assert(offsetof(struct INDEX_ROOT, ihdr) == 0x10);

/* Bits of VOLUME_INFO.flags (little-endian 16-bit constants). */
#define VOLUME_FLAG_DIRTY            cpu_to_le16(0x0001)
#define VOLUME_FLAG_RESIZE_LOG_FILE cpu_to_le16(0x0002)

/* Volume information: NTFS version and volume flags. */
struct VOLUME_INFO {
        __le64 res1;        // 0x00
        u8 major_ver;        // 0x08: NTFS major version number (before .)
        u8 minor_ver;        // 0x09: NTFS minor version number (after .)
        __le16 flags;        // 0x0A: Volume flags, see VOLUME_FLAG_XXX

}; // The fields end at 0xC

/*
 * Size of the fields above.
 * NOTE(review): sizeof(struct VOLUME_INFO) may be larger than this
 * because of tail padding after the 8-byte member - confirm that
 * callers use this constant rather than sizeof().
 */
#define SIZEOF_ATTRIBUTE_VOLUME_INFO 0xc

/* Maximum label length in 16-bit characters (0x100 bytes). */
#define NTFS_LABEL_MAX_LENGTH                (0x100 / sizeof(short))
/* Bits of ATTR_DEF_ENTRY.flags (little-endian 32-bit constants). */
#define NTFS_ATTR_INDEXABLE                cpu_to_le32(0x00000002)
#define NTFS_ATTR_DUPALLOWED                cpu_to_le32(0x00000004)
#define NTFS_ATTR_MUST_BE_INDEXED        cpu_to_le32(0x00000010)
#define NTFS_ATTR_MUST_BE_NAMED                cpu_to_le32(0x00000020)
#define NTFS_ATTR_MUST_BE_RESIDENT        cpu_to_le32(0x00000040)
#define NTFS_ATTR_LOG_ALWAYS                cpu_to_le32(0x00000080)

/*
 * $AttrDef file entry: describes one attribute type.
 * Exactly 0xa0 bytes (checked below).
 */
struct ATTR_DEF_ENTRY {
        __le16 name[0x40];        // 0x00: Attr name (0x40 16-bit characters).
        enum ATTR_TYPE type;        // 0x80: struct ATTRIB type.
        __le32 res;                // 0x84:
        enum COLLATION_RULE rule; // 0x88:
        __le32 flags;                // 0x8C: NTFS_ATTR_XXX (see above).
        __le64 min_sz;                // 0x90: Minimum attribute data size.
        __le64 max_sz;                // 0x98: Maximum attribute data size.
};

static_assert(sizeof(struct ATTR_DEF_ENTRY) == 0xa0);

/*
 * Object ID (0x40): four GUIDs, 0x40 bytes in total (checked below).
 */
struct OBJECT_ID {
        struct GUID ObjId;        // 0x00: Unique Id assigned to file.

        // Birth Volume Id is the Object Id of the Volume on
        // which the Object Id was allocated. It never changes.
        struct GUID BirthVolumeId; // 0x10:

        // Birth Object Id is the first Object Id that was
        // ever assigned to this MFT Record. I.e. If the Object Id
        // is changed for some reason, this field will reflect the
        // original value of the Object Id.
        struct GUID BirthObjectId; // 0x20:

        // Domain Id is currently unused but it is intended to be
        // used in a network environment where the local machine is
        // part of a Windows 2000 Domain. This may be used in a Windows
        // 2000 Advanced Server managed domain.
        struct GUID DomainId;        // 0x30:
};

static_assert(sizeof(struct OBJECT_ID) == 0x40);

/*
 * O Directory entry structure ( rule = 0x13 ).
 * Entry header, the object id key, then the owning record reference
 * and the birth ids. Exactly 0x58 bytes (checked below).
 */
struct NTFS_DE_O {
        struct NTFS_DE de;
        struct GUID ObjId;        // 0x10: Unique Id assigned to file.
        struct MFT_REF ref;        // 0x20: MFT record number with this file.

        // Birth Volume Id is the Object Id of the Volume on
        // which the Object Id was allocated. It never changes.
        struct GUID BirthVolumeId; // 0x28:

        // Birth Object Id is the first Object Id that was
        // ever assigned to this MFT Record. I.e. If the Object Id
        // is changed for some reason, this field will reflect the
        // original value of the Object Id.
        // This field is valid if data_size == 0x48.
        struct GUID BirthObjectId; // 0x38:

        // Domain Id is currently unused but it is intended
        // to be used in a network environment where the local
        // machine is part of a Windows 2000 Domain. This may be
        // used in a Windows 2000 Advanced Server managed domain.
        struct GUID BirthDomainId; // 0x48:
};

static_assert(sizeof(struct NTFS_DE_O) == 0x58);

/*
 * Q Directory entry structure ( rule = 0x11 )
 * NOTE(review): the COLLATION_RULE comments list $Q of Quota under
 * NTFS_COLLATION_TYPE_UINT (0x10), not 0x11 - confirm which is right.
 *
 * Packed because the 64-bit quota fields are only 4-byte aligned.
 */
struct NTFS_DE_Q {
        struct NTFS_DE de;
        __le32 owner_id;        // 0x10: Unique Id assigned to file

        /* here is 0x30 bytes of user quota. NOTE: 4 byte aligned! */
        __le32 Version;                // 0x14: 0x02
        __le32 Flags;                // 0x18: Quota flags, see above
        __le64 BytesUsed;        // 0x1C:
        __le64 ChangeTime;        // 0x24:
        __le64 WarningLimit;        // 0x2C:
        __le64 HardLimit;        // 0x34:
        __le64 ExceededTime;        // 0x3C:

        // SID is placed here
}__packed; // sizeof() = 0x44

static_assert(sizeof(struct NTFS_DE_Q) == 0x44);

/*
 * Security descriptor storage limits.
 * Log2OfSecurityDescriptorsBlockSize must stay in sync with
 * SecurityDescriptorsBlockSize (1 << 18 == 0x40000).
 */
#define SecurityDescriptorsBlockSize 0x40000 // 256K
#define SecurityDescriptorMaxSize    0x20000 // 128K
#define Log2OfSecurityDescriptorsBlockSize 18

/* Key of a security descriptor: its hash plus its security id (8 bytes). */
struct SECURITY_KEY {
        __le32 hash; // 0x00: Hash value for descriptor
        __le32 sec_id; // 0x04: Security Id (guaranteed unique)
};

/*
 * Security descriptors (the content of $Secure::SDS data stream).
 * Header placed in front of each descriptor. Packed to exactly 0x14
 * bytes (checked below), so no padding follows 'size'.
 */
struct SECURITY_HDR {
        struct SECURITY_KEY key;        // 0x00: Security Key.
        __le64 off;                        // 0x08: Offset of this entry in the file.
        __le32 size;                        // 0x10: Size of this entry, 8 byte aligned.
        /*
         * Security descriptor itself is placed here.
         * Total size is 16 byte aligned.
         */
} __packed;

static_assert(sizeof(struct SECURITY_HDR) == 0x14);

/*
 * SII Directory entry structure: entry header, the security id key and
 * the security header as data. Packed to exactly 0x28 bytes
 * (checked below).
 */
struct NTFS_DE_SII {
        struct NTFS_DE de;
        __le32 sec_id;                        // 0x10: Key: sizeof(security_id) = wKeySize
        struct SECURITY_HDR sec_hdr;        // 0x14: Data
} __packed;

static_assert(offsetof(struct NTFS_DE_SII, sec_hdr) == 0x14);
static_assert(sizeof(struct NTFS_DE_SII) == 0x28);

/*
 * SDH Directory entry structure: entry header, the hash/id key, the
 * security header as data and a trailing magic value.
 */
struct NTFS_DE_SDH {
        struct NTFS_DE de;
        struct SECURITY_KEY key;        // 0x10: Key
        struct SECURITY_HDR sec_hdr;        // 0x18: Data
        __le16 magic[2];                // 0x2C: 0x00490049 "I I"
};

/* Size of an SDH entry: the last field ends at 0x2C + 4. */
#define SIZEOF_SDH_DIRENTRY 0x30

/* Key of a reparse index entry: the reparse tag plus the owning record. */
struct REPARSE_KEY {
        __le32 ReparseTag;                // 0x00: Reparse Tag
        struct MFT_REF ref;                // 0x04: MFT record number with this file
}; // sizeof() = 0x0C

/* 'ref' must follow the tag without padding. */
static_assert(offsetof(struct REPARSE_KEY, ref) == 0x04);
#define SIZEOF_REPARSE_KEY 0x0C

/*
 * Reparse Directory entry structure: entry header, the reparse key and
 * a 4-byte field that pads the entry to 0x20 bytes (checked below).
 */
struct NTFS_DE_R {
        struct NTFS_DE de;
        struct REPARSE_KEY key;                // 0x10: Reparse Key.
        u32 zero;                        // 0x1c:
}; // sizeof() = 0x20

static_assert(sizeof(struct NTFS_DE_R) == 0x20);

/*
 * Little-endian 32-bit constants for the CompressReparseBuffer fields
 * named in the comments below.
 */
/* CompressReparseBuffer.WofVersion */
#define WOF_CURRENT_VERSION                cpu_to_le32(1)
/* CompressReparseBuffer.WofProvider */
#define WOF_PROVIDER_WIM                cpu_to_le32(1)
/* CompressReparseBuffer.WofProvider */
#define WOF_PROVIDER_SYSTEM                cpu_to_le32(2)
/* CompressReparseBuffer.ProviderVer */
#define WOF_PROVIDER_CURRENT_VERSION        cpu_to_le32(1)

/* Compression formats; the trailing comment gives the size each one uses. */
#define WOF_COMPRESSION_XPRESS4K        cpu_to_le32(0) // 4k
#define WOF_COMPRESSION_LZX32K                cpu_to_le32(1) // 32k
#define WOF_COMPRESSION_XPRESS8K        cpu_to_le32(2) // 8k
#define WOF_COMPRESSION_XPRESS16K        cpu_to_le32(3) // 16k

/*
 * ATTR_REPARSE (0xC0)
 *
 * The reparse struct GUID structure is used by all 3rd party layered drivers to
 * store data in a reparse point. For non-Microsoft tags, the struct GUID field
 * cannot be GUID_NULL.
 * The constraints on reparse tags are defined below.
 * Microsoft tags can also be used with this format of the reparse point buffer.
 *
 * The header is exactly 0x18 bytes (checked below); the reparse data
 * follows it.
 */
struct REPARSE_POINT {
        __le32 ReparseTag;        // 0x00:
        __le16 ReparseDataLength;// 0x04:
        __le16 Reserved;        // 0x06:

        struct GUID Guid;        // 0x08:

        //
        // Here GenericReparseBuffer is placed
        //
};

static_assert(sizeof(struct REPARSE_POINT) == 0x18);

/* Maximum allowed size of the reparse data, in bytes (16 KiB). */
#define MAXIMUM_REPARSE_DATA_BUFFER_SIZE        (16 * 1024)

/*
 * The value of the following constant needs to satisfy the following
 * conditions:
 *  (1) Be at least as large as the largest of the reserved tags.
 *  (2) Be strictly smaller than all the tags in use.
 *
 * It is the lower bound used by IsReparseTagValid().
 */
#define IO_REPARSE_TAG_RESERVED_RANGE                1

/*
 * The reparse tags are a ULONG. The 32 bits are laid out as follows:
 *
 *   3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
 *   1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
 *  +-+-+-+-+-----------------------+-------------------------------+
 *  |M|R|N|R|     Reserved bits     |       Reparse Tag Value       |
 *  +-+-+-+-+-----------------------+-------------------------------+
 *
 * M is the Microsoft bit. When set to 1, it denotes a tag owned by Microsoft.
 *   All ISVs must use a tag with a 0 in this position.
 *   Note: If a Microsoft tag is used by non-Microsoft software, the
 *   behavior is not defined.
 *
 * R is reserved.  Must be zero for non-Microsoft tags.
 *
 * N is name surrogate. When set to 1, the file represents another named
 *   entity in the system.
 *
 * The M and N bits are OR-able.
 * The following macros check for the M and N bit values:
 */

/*
 * Macro to determine whether a reparse point tag corresponds to a tag
 * owned by Microsoft.
 *
 * IO_REPARSE_TAG_MICROSOFT is a little-endian constant, so _tag must be
 * in little-endian form as well.
 */
#define IsReparseTagMicrosoft(_tag)        (((_tag)&IO_REPARSE_TAG_MICROSOFT))

/*
 * Macro to determine whether a reparse point tag is a name surrogate.
 * As above, _tag must be in little-endian form.
 */
#define IsReparseTagNameSurrogate(_tag)        (((_tag)&IO_REPARSE_TAG_NAME_SURROGATE))

/*
 * The following constant represents the bits that are valid to use in
 * reparse tags.
 */
#define IO_REPARSE_TAG_VALID_VALUES        0xF000FFFF

/*
 * Macro to determine whether a reparse tag is a valid tag: no bits
 * outside IO_REPARSE_TAG_VALID_VALUES and a value above the reserved
 * range. Evaluates _tag twice.
 *
 * NOTE(review): both constants used here are plain host-order values,
 * unlike the little-endian ones used by the two macros above - confirm
 * which byte order callers pass.
 */
#define IsReparseTagValid(_tag)                                                       \
        (!((_tag) & ~IO_REPARSE_TAG_VALID_VALUES) &&                               \
         ((_tag) > IO_REPARSE_TAG_RESERVED_RANGE))

/* Microsoft tags for reparse points. */

/*
 * Reparse tag constants. Every value is wrapped in cpu_to_le32().
 * IO_REPARSE_TAG_NAME_SURROGATE and IO_REPARSE_TAG_MICROSOFT are the
 * single-bit masks tested by IsReparseTagNameSurrogate() and
 * IsReparseTagMicrosoft(); the other entries are complete tag values.
 */
enum IO_REPARSE_TAG {
        IO_REPARSE_TAG_SYMBOLIC_LINK        = cpu_to_le32(0),
        IO_REPARSE_TAG_NAME_SURROGATE        = cpu_to_le32(0x20000000),
        IO_REPARSE_TAG_MICROSOFT        = cpu_to_le32(0x80000000),
        IO_REPARSE_TAG_MOUNT_POINT        = cpu_to_le32(0xA0000003),
        IO_REPARSE_TAG_SYMLINK                = cpu_to_le32(0xA000000C),
        IO_REPARSE_TAG_HSM                = cpu_to_le32(0xC0000004),
        IO_REPARSE_TAG_SIS                = cpu_to_le32(0x80000007),
        IO_REPARSE_TAG_DEDUP                = cpu_to_le32(0x80000013),
        IO_REPARSE_TAG_COMPRESS                = cpu_to_le32(0x80000017),

        /*
         * The reparse tag 0x80000008 is reserved for Microsoft internal use.
         * May be published in the future.
         */

        /* Microsoft reparse tag reserved for DFS */
        IO_REPARSE_TAG_DFS        = cpu_to_le32(0x8000000A),

        /* Microsoft reparse tag reserved for the file system filter manager. */
        IO_REPARSE_TAG_FILTER_MANAGER        = cpu_to_le32(0x8000000B),

        /* Non-Microsoft tags for reparse points */

        /* Tag allocated to CONGRUENT, May 2000. Used by IFSTEST. */
        IO_REPARSE_TAG_IFSTEST_CONGRUENT = cpu_to_le32(0x00000009),

        /* Tag allocated to ARKIVIO. */
        IO_REPARSE_TAG_ARKIVIO        = cpu_to_le32(0x0000000C),

        /* Tag allocated to SOLUTIONSOFT. */
        IO_REPARSE_TAG_SOLUTIONSOFT        = cpu_to_le32(0x2000000D),

        /* Tag allocated to COMMVAULT. */
        IO_REPARSE_TAG_COMMVAULT        = cpu_to_le32(0x0000000E),

        /*
         * Cloud tags; the variants differ only in bits 12-15 (0x0..0xF).
         * NOTE(review): presumably used by OneDrive - unconfirmed. These
         * have the Microsoft bit set although they sit below the
         * "Non-Microsoft tags" heading.
         */
        IO_REPARSE_TAG_CLOUD        = cpu_to_le32(0x9000001A),
        IO_REPARSE_TAG_CLOUD_1        = cpu_to_le32(0x9000101A),
        IO_REPARSE_TAG_CLOUD_2        = cpu_to_le32(0x9000201A),
        IO_REPARSE_TAG_CLOUD_3        = cpu_to_le32(0x9000301A),
        IO_REPARSE_TAG_CLOUD_4        = cpu_to_le32(0x9000401A),
        IO_REPARSE_TAG_CLOUD_5        = cpu_to_le32(0x9000501A),
        IO_REPARSE_TAG_CLOUD_6        = cpu_to_le32(0x9000601A),
        IO_REPARSE_TAG_CLOUD_7        = cpu_to_le32(0x9000701A),
        IO_REPARSE_TAG_CLOUD_8        = cpu_to_le32(0x9000801A),
        IO_REPARSE_TAG_CLOUD_9        = cpu_to_le32(0x9000901A),
        IO_REPARSE_TAG_CLOUD_A        = cpu_to_le32(0x9000A01A),
        IO_REPARSE_TAG_CLOUD_B        = cpu_to_le32(0x9000B01A),
        IO_REPARSE_TAG_CLOUD_C        = cpu_to_le32(0x9000C01A),
        IO_REPARSE_TAG_CLOUD_D        = cpu_to_le32(0x9000D01A),
        IO_REPARSE_TAG_CLOUD_E        = cpu_to_le32(0x9000E01A),
        IO_REPARSE_TAG_CLOUD_F        = cpu_to_le32(0x9000F01A),

};

#define SYMLINK_FLAG_RELATIVE                1

/* Microsoft reparse buffer. (see DDK for details) */
/*
 * Common 8-byte header followed by a union whose layout is selected by
 * ReparseTag. All multi-byte fields are little-endian.
 */
struct REPARSE_DATA_BUFFER {
        __le32 ReparseTag;                // 0x00: Selects the union member below.
        __le16 ReparseDataLength;        // 0x04: presumably the byte length of the union payload - TODO confirm.
        __le16 Reserved;                // 0x06:

        union {
                /* If ReparseTag == 0xA0000003 (IO_REPARSE_TAG_MOUNT_POINT) */
                struct {
                        __le16 SubstituteNameOffset; // 0x08
                        __le16 SubstituteNameLength; // 0x0A
                        __le16 PrintNameOffset;      // 0x0C
                        __le16 PrintNameLength;      // 0x0E
                        __le16 PathBuffer[];             // 0x10: 16-bit units; presumably holds both names - TODO confirm.
                } MountPointReparseBuffer;

                /*
                 * If ReparseTag == 0xA000000C (IO_REPARSE_TAG_SYMLINK)
                 * https://msdn.microsoft.com/en-us/library/cc232006.aspx
                 */
                struct {
                        __le16 SubstituteNameOffset; // 0x08
                        __le16 SubstituteNameLength; // 0x0A
                        __le16 PrintNameOffset;      // 0x0C
                        __le16 PrintNameLength;      // 0x0E
                        // 0 - absolute path, 1 - relative path (SYMLINK_FLAG_RELATIVE).
                        __le32 Flags;                     // 0x10
                        __le16 PathBuffer[];             // 0x14: 16-bit units; presumably holds both names - TODO confirm.
                } SymbolicLinkReparseBuffer;

                /* If ReparseTag == 0x80000017U (IO_REPARSE_TAG_COMPRESS) */
                struct {
                        __le32 WofVersion;  // 0x08 == 1
                        /*
                         * 1 - WIM backing provider ("WIMBoot"),
                         * 2 - System compressed file provider
                         */
                        __le32 WofProvider; // 0x0C:
                        __le32 ProviderVer; // 0x10: == 1 WOF_FILE_PROVIDER_CURRENT_VERSION == 1
                        __le32 CompressionFormat; // 0x14: 0, 1, 2, 3. See WOF_COMPRESSION_XXX
                } CompressReparseBuffer;

                /* Byte-wise view of the payload. */
                struct {
                        u8 DataBuffer[1];   // 0x08: declared with one element; presumably the data extends past it - TODO confirm.
                } GenericReparseBuffer;
        };
};

/* ATTR_EA_INFO (0xD0) */

#define FILE_NEED_EA 0x80 // See ntifs.h
/*
 * FILE_NEED_EA, indicates that the file to which the EA belongs cannot be
 * interpreted without understanding the associated extended attributes.
 */
/* Fixed 8-byte summary of a file's extended attributes (ATTR_EA_INFO). */
struct EA_INFO {
        __le16 size_pack;        // 0x00: Size of buffer to hold in packed form.
        __le16 count;                // 0x02: Count of EA's with FILE_NEED_EA bit set.
        __le32 size;                // 0x04: Size of buffer to hold in unpacked form.
};

static_assert(sizeof(struct EA_INFO) == 8);

/* ATTR_EA (0xE0) */
/* Header of one extended attribute entry; the name starts at offset 8. */
struct EA_FULL {
        __le32 size;                // 0x00: (not in packed)
        u8 flags;                // 0x04: presumably may carry FILE_NEED_EA - TODO confirm.
        u8 name_len;                // 0x05: presumably the length of name[] - TODO confirm.
        __le16 elength;                // 0x06: presumably the length of the EA value - TODO confirm.
        u8 name[];                // 0x08: Flexible array; variable-length data follows the header.
};

static_assert(offsetof(struct EA_FULL, name) == 8);

#define ACL_REVISION        2
#define ACL_REVISION_DS 4

#define SE_SELF_RELATIVE cpu_to_le16(0x8000)

/*
 * Fixed 0x14-byte header of a security descriptor.
 * NOTE(review): Owner/Group/Sacl/Dacl are presumably byte offsets from the
 * start of this structure (self-relative form, cf. SE_SELF_RELATIVE) -
 * confirm against the users of this struct.
 */
struct SECURITY_DESCRIPTOR_RELATIVE {
        u8 Revision;
        u8 Sbz1;
        __le16 Control;                // Control flags, e.g. SE_SELF_RELATIVE.
        __le32 Owner;
        __le32 Group;
        __le32 Sacl;
        __le32 Dacl;
};
static_assert(sizeof(struct SECURITY_DESCRIPTOR_RELATIVE) == 0x14);

/* 4-byte header of an access control entry (ACE). */
struct ACE_HEADER {
        u8 AceType;
        u8 AceFlags;
        __le16 AceSize;                // presumably the size of the whole ACE in bytes - TODO confirm.
};
static_assert(sizeof(struct ACE_HEADER) == 4);

/* 8-byte header of an access control list. */
struct ACL {
        u8 AclRevision;                // presumably ACL_REVISION or ACL_REVISION_DS - TODO confirm.
        u8 Sbz1;
        __le16 AclSize;
        __le16 AceCount;
        __le16 Sbz2;
};
static_assert(sizeof(struct ACL) == 8);

/* Security identifier: 8-byte fixed part followed by 32-bit sub-authorities. */
struct SID {
        u8 Revision;
        u8 SubAuthorityCount;        // presumably the number of SubAuthority[] entries - TODO confirm.
        u8 IdentifierAuthority[6];
        __le32 SubAuthority[];        // Flexible array starting at offset 8.
};
static_assert(offsetof(struct SID, SubAuthority) == 8);

#endif /* _LINUX_NTFS3_NTFS_H */
// clang-format on



























    2 



    2 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
// SPDX-License-Identifier: GPL-2.0
/*
 * x86 specific code for irq_work
 *
 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
 */

#include <linux/kernel.h>
#include <linux/irq_work.h>
#include <linux/hardirq.h>
#include <asm/apic.h>
#include <asm/idtentry.h>
#include <asm/trace/irq_vectors.h>
#include <linux/interrupt.h>

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Handler defined through DEFINE_IDTENTRY_SYSVEC for sysvec_irq_work.
 * It calls apic_eoi() first, then bumps the apic_irq_work_irqs statistic
 * and runs the pending irq_work via irq_work_run(), bracketed by the
 * irq_work entry/exit tracepoints for IRQ_WORK_VECTOR.
 */
DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work)
{
        apic_eoi();
        trace_irq_work_entry(IRQ_WORK_VECTOR);
        inc_irq_stat(apic_irq_work_irqs);
        irq_work_run();
        trace_irq_work_exit(IRQ_WORK_VECTOR);
}

/*
 * Send IRQ_WORK_VECTOR to the current CPU as a self-IPI and wait for the
 * ICR to go idle. Does nothing when arch_irq_work_has_interrupt() reports
 * that no interrupt is available.
 */
void arch_irq_work_raise(void)
{
        if (arch_irq_work_has_interrupt()) {
                __apic_send_IPI_self(IRQ_WORK_VECTOR);
                apic_wait_icr_idle();
        }
}
#endif











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Dynamic loading of modules into the kernel.
 *
 * Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996
 * Rewritten again by Rusty Russell, 2002
 */

#ifndef _LINUX_MODULE_H
#define _LINUX_MODULE_H

#include <linux/list.h>
#include <linux/stat.h>
#include <linux/buildid.h>
#include <linux/compiler.h>
#include <linux/cache.h>
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/elf.h>
#include <linux/stringify.h>
#include <linux/kobject.h>
#include <linux/moduleparam.h>
#include <linux/jump_label.h>
#include <linux/export.h>
#include <linux/rbtree_latch.h>
#include <linux/error-injection.h>
#include <linux/tracepoint-defs.h>
#include <linux/srcu.h>
#include <linux/static_call_types.h>
#include <linux/dynamic_debug.h>

#include <linux/percpu.h>
#include <asm/module.h>

#define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN

/* One (crc, name) pair; the name buffer is MODULE_NAME_LEN bytes. */
struct modversion_info {
        unsigned long crc;        /* presumably the CRC recorded for 'name' - TODO confirm */
        char name[MODULE_NAME_LEN];
};

struct module;
struct exception_table_entry;

/*
 * kobject wrapper embedded in struct module as 'mkobj' (listed there under
 * "Sysfs stuff"). Field order is randomized by __randomize_layout.
 */
struct module_kobject {
        struct kobject kobj;
        struct module *mod;        /* presumably the owning module - TODO confirm */
        struct kobject *drivers_dir;
        struct module_param_attrs *mp;
        struct completion *kobj_completion;
} __randomize_layout;

/*
 * A module attribute: a struct attribute plus callbacks.
 * show/store receive the attribute, the module_kobject and a character
 * buffer (store also gets a byte count); setup/test/free receive the
 * module itself.
 */
struct module_attribute {
        struct attribute attr;
        ssize_t (*show)(struct module_attribute *, struct module_kobject *,
                        char *);
        ssize_t (*store)(struct module_attribute *, struct module_kobject *,
                         const char *, size_t count);
        void (*setup)(struct module *, const char *);
        int (*test)(struct module *);
        void (*free)(struct module *);
};

/*
 * Attribute object emitted by the built-in variant of MODULE_VERSION():
 * a "version" attribute shown through __modver_version_show, together
 * with the module name (KBUILD_MODNAME) and the version string.
 */
struct module_version_attribute {
        struct module_attribute mattr;
        const char *module_name;
        const char *version;
};

extern ssize_t __modver_version_show(struct module_attribute *,
                                     struct module_kobject *, char *);

extern struct module_attribute module_uevent;

/* These are either module local, or the kernel's dummy ones. */
extern int init_module(void);
extern void cleanup_module(void);

#ifndef MODULE
/**
 * module_init() - driver initialization entry point
 * @x: function to be run at kernel boot time or module insertion
 *
 * module_init() will either be called during do_initcalls() (if
 * builtin) or at module insertion time (if a module).  There can only
 * be one per module.
 */
#define module_init(x)        __initcall(x);

/**
 * module_exit() - driver exit entry point
 * @x: function to be run when driver is removed
 *
 * module_exit() will wrap the driver clean-up code
 * with cleanup_module() when used with rmmod when
 * the driver is a module.  If the driver is statically
 * compiled into the kernel, module_exit() has no effect.
 * There can only be one per module.
 */
#define module_exit(x)        __exitcall(x);

#else /* MODULE */

/*
 * In most cases loadable modules do not need custom
 * initcall levels. There are still some valid cases where
 * a driver may be needed early if built in, and does not
 * matter when built as a loadable module. Like bus
 * snooping debug drivers.
 */
#define early_initcall(fn)                module_init(fn)
#define core_initcall(fn)                module_init(fn)
#define core_initcall_sync(fn)                module_init(fn)
#define postcore_initcall(fn)                module_init(fn)
#define postcore_initcall_sync(fn)        module_init(fn)
#define arch_initcall(fn)                module_init(fn)
#define subsys_initcall(fn)                module_init(fn)
#define subsys_initcall_sync(fn)        module_init(fn)
#define fs_initcall(fn)                        module_init(fn)
#define fs_initcall_sync(fn)                module_init(fn)
#define rootfs_initcall(fn)                module_init(fn)
#define device_initcall(fn)                module_init(fn)
#define device_initcall_sync(fn)        module_init(fn)
#define late_initcall(fn)                module_init(fn)
#define late_initcall_sync(fn)                module_init(fn)

#define console_initcall(fn)                module_init(fn)

/*
 * Each module must use one module_init().
 *
 * __inittest() returns initfn as an initcall_t, so an init function of an
 * incompatible type is diagnosed by the compiler. init_module() is then
 * declared as an alias of initfn (with initfn's attributes applied via
 * __copy()).
 */
#define module_init(initfn)                                        \
        static inline initcall_t __maybe_unused __inittest(void)                \
        { return initfn; }                                        \
        int init_module(void) __copy(initfn)                        \
                __attribute__((alias(#initfn)));                \
        ___ADDRESSABLE(init_module, __initdata);

/*
 * This is only required if you want to be unloadable.
 *
 * Same construction as module_init(): __exittest() type-checks exitfn
 * against exitcall_t and cleanup_module() becomes an alias of exitfn.
 */
#define module_exit(exitfn)                                        \
        static inline exitcall_t __maybe_unused __exittest(void)                \
        { return exitfn; }                                        \
        void cleanup_module(void) __copy(exitfn)                \
                __attribute__((alias(#exitfn)));                \
        ___ADDRESSABLE(cleanup_module, __exitdata);

#endif

/* This means "can be init if no module support, otherwise module load
   may call it." */
#ifdef CONFIG_MODULES
#define __init_or_module
#define __initdata_or_module
#define __initconst_or_module
#define __INIT_OR_MODULE        .text
#define __INITDATA_OR_MODULE        .data
#define __INITRODATA_OR_MODULE        .section ".rodata","a",%progbits
#else
#define __init_or_module __init
#define __initdata_or_module __initdata
#define __initconst_or_module __initconst
#define __INIT_OR_MODULE __INIT
#define __INITDATA_OR_MODULE __INITDATA
#define __INITRODATA_OR_MODULE __INITRODATA
#endif /*CONFIG_MODULES*/

/* Generic info of form tag = "info" */
#define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info)

/* For userspace: you can also call me... */
#define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias)

/* Soft module dependencies. See man modprobe.d for details.
 * Example: MODULE_SOFTDEP("pre: module-foo module-bar post: module-baz")
 */
#define MODULE_SOFTDEP(_softdep) MODULE_INFO(softdep, _softdep)

/*
 * MODULE_FILE is used for generating modules.builtin
 * So, make it no-op when this is being built as a module
 */
#ifdef MODULE
#define MODULE_FILE
#else
#define MODULE_FILE        MODULE_INFO(file, KBUILD_MODFILE);
#endif

/*
 * The following license idents are currently accepted as indicating free
 * software modules
 *
 *        "GPL"                                [GNU Public License v2]
 *        "GPL v2"                        [GNU Public License v2]
 *        "GPL and additional rights"        [GNU Public License v2 rights and more]
 *        "Dual BSD/GPL"                        [GNU Public License v2
 *                                         or BSD license choice]
 *        "Dual MIT/GPL"                        [GNU Public License v2
 *                                         or MIT license choice]
 *        "Dual MPL/GPL"                        [GNU Public License v2
 *                                         or Mozilla license choice]
 *
 * The following other idents are available
 *
 *        "Proprietary"                        [Non free products]
 *
 * Both "GPL v2" and "GPL" (the latter also in dual licensed strings) are
 * merely stating that the module is licensed under the GPL v2, but are not
 * telling whether "GPL v2 only" or "GPL v2 or later". The reason why there
 * are two variants is a historic and failed attempt to convey more
 * information in the MODULE_LICENSE string. For module loading the
 * "only/or later" distinction is completely irrelevant and neither
 * replaces the proper license identifiers in the corresponding source file
 * nor amends them in any way. The sole purpose is to make the
 * 'Proprietary' flagging work and to refuse to bind symbols which are
 * exported with EXPORT_SYMBOL_GPL when a non free module is loaded.
 *
 * In the same way "BSD" is not clear license information. It merely
 * states that the module is licensed under one of the compatible BSD
 * license variants. The detailed and correct license information is again
 * to be found in the corresponding source files.
 *
 * There are dual licensed components, but when running with Linux it is the
 * GPL that is relevant so this is a non issue. Similarly LGPL linked with GPL
 * is a GPL combined work.
 *
 * This exists for several reasons
 * 1.        So modinfo can show license info for users wanting to vet their setup
 *        is free
 * 2.        So the community can ignore bug reports including proprietary modules
 * 3.        So vendors can do likewise based on their own policies
 */
#define MODULE_LICENSE(_license) MODULE_FILE MODULE_INFO(license, _license)

/*
 * Author(s), use "Name <email>" or just "Name", for multiple
 * authors use multiple MODULE_AUTHOR() statements/lines.
 */
#define MODULE_AUTHOR(_author) MODULE_INFO(author, _author)

/* What your module does. */
#define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description)

#ifdef MODULE
/* Creates an alias so file2alias.c can find device table. */
#define MODULE_DEVICE_TABLE(type, name)                                        \
extern typeof(name) __mod_##type##__##name##_device_table                \
  __attribute__ ((unused, alias(__stringify(name))))
#else  /* !MODULE */
#define MODULE_DEVICE_TABLE(type, name)
#endif

/* Version of form [<epoch>:]<version>[-<extra-version>].
 * Or for CVS/RCS ID version, everything but the number is stripped.
 * <epoch>: A (small) unsigned integer which allows you to start versions
 * anew. If not mentioned, it's zero.  eg. "2:1.0" is after
 * "1:2.0".

 * <version>: The <version> may contain only alphanumerics and the
 * character `.'.  Ordered by numeric sort for numeric parts,
 * ascii sort for ascii parts (as per RPM or DEB algorithm).

 * <extraversion>: Like <version>, but inserted for local
 * customizations, eg "rh3" or "rusty1".

 * Using this automatically adds a checksum of the .c files and the
 * local headers in "srcversion".
 */

#if defined(MODULE) || !defined(CONFIG_SYSFS)
#define MODULE_VERSION(_version) MODULE_INFO(version, _version)
#else
#define MODULE_VERSION(_version)                                        \
        MODULE_INFO(version, _version);                                        \
        static struct module_version_attribute __modver_attr                \
                __used __section("__modver")                                \
                __aligned(__alignof__(struct module_version_attribute)) \
                = {                                                        \
                        .mattr        = {                                        \
                                .attr        = {                                \
                                        .name        = "version",                \
                                        .mode        = S_IRUGO,                \
                                },                                        \
                                .show        = __modver_version_show,        \
                        },                                                \
                        .module_name        = KBUILD_MODNAME,                \
                        .version        = _version,                        \
                }
#endif

/* Optional firmware file (or files) needed by the module
 * format is simply firmware file name.  Multiple firmware
 * files require multiple MODULE_FIRMWARE() specifiers */
#define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware)

#define MODULE_IMPORT_NS(ns)        MODULE_INFO(import_ns, __stringify(ns))

struct notifier_block;

#ifdef CONFIG_MODULES

extern int modules_disabled; /* for sysctl */
/* Get/put a kernel symbol (calls must be symmetric) */
void *__symbol_get(const char *symbol);
void *__symbol_get_gpl(const char *symbol);
#define symbol_get(x) ((typeof(&x))(__symbol_get(__stringify(x))))

/*
 * Modules using other modules: kdb wants to see this.
 *
 * One record per (source, target) module pair.
 * NOTE(review): source_list/target_list presumably link the record into
 * the source_list/target_list heads of struct module - confirm against
 * the code that allocates these records.
 */
struct module_use {
        struct list_head source_list;
        struct list_head target_list;
        struct module *source, *target;
};

/* Lifecycle state of a module; stored in the 'state' member of struct module. */
enum module_state {
        MODULE_STATE_LIVE,        /* Normal state. */
        MODULE_STATE_COMING,        /* Fully formed, running module_init. */
        MODULE_STATE_GOING,        /* Going away. */
        MODULE_STATE_UNFORMED,        /* Still setting it up. */
};

/*
 * Latch-tree node paired with a module pointer; embedded in
 * struct module_memory when CONFIG_MODULES_TREE_LOOKUP is set.
 */
struct mod_tree_node {
        struct module *mod;
        struct latch_tree_node node;
};

/*
 * Kinds of memory region a module is split into. The values index arrays
 * of MOD_MEM_NUM_TYPES entries (such as the mem[] member of struct module),
 * so MOD_MEM_NUM_TYPES has to stay directly after the last real type.
 */
enum mod_mem_type {
        MOD_TEXT = 0,
        MOD_DATA,
        MOD_RODATA,
        MOD_RO_AFTER_INIT,
        MOD_INIT_TEXT,
        MOD_INIT_DATA,
        MOD_INIT_RODATA,

        MOD_MEM_NUM_TYPES,
        MOD_INVALID = -1,
};

/* True for the three MOD_INIT_* types. */
#define mod_mem_type_is_init(type)        \
        ((type) == MOD_INIT_TEXT ||        \
         (type) == MOD_INIT_DATA ||        \
         (type) == MOD_INIT_RODATA)

/* True for every type that is not an init type. */
#define mod_mem_type_is_core(type) (!mod_mem_type_is_init(type))

/* True for MOD_TEXT and MOD_INIT_TEXT. */
#define mod_mem_type_is_text(type)        \
         ((type) == MOD_TEXT ||                \
          (type) == MOD_INIT_TEXT)

/* True for every type that is not a text type. */
#define mod_mem_type_is_data(type) (!mod_mem_type_is_text(type))

/* True for types that are both core and data. */
#define mod_mem_type_is_core_data(type)        \
        (mod_mem_type_is_core(type) &&        \
         mod_mem_type_is_data(type))

/*
 * Iterate over all real memory types. @type names a loop variable that
 * the macro itself declares as enum mod_mem_type.
 */
#define for_each_mod_mem_type(type)                        \
        for (enum mod_mem_type (type) = 0;                \
             (type) < MOD_MEM_NUM_TYPES; (type)++)

/*
 * Iterate over the memory types for which mod_mem_type_is_<class>() holds
 * (class is one of init, core, text, data, core_data).
 *
 * The filter is written as "if (!cond) {} else" rather than "if (cond)"
 * so that the hidden if already owns an else: an else written by the
 * caller after the loop statement can then no longer attach to it.
 */
#define for_class_mod_mem_type(type, class)                \
        for_each_mod_mem_type(type)                        \
                if (!mod_mem_type_is_##class(type)) {} else

/*
 * One memory region of a module; struct module holds one per
 * enum mod_mem_type value in its mem[] array.
 */
struct module_memory {
        void *base;                /* start of the region */
        unsigned int size;        /* extent of the region, counted from base */

#ifdef CONFIG_MODULES_TREE_LOOKUP
        struct mod_tree_node mtn;
#endif
};

#ifdef CONFIG_MODULES_TREE_LOOKUP
/* Only touch one cacheline for common rbtree-for-core-layout case. */
#define __module_memory_align ____cacheline_aligned
#else
#define __module_memory_align
#endif

/*
 * Symbol information of a module as used for kallsyms.
 * NOTE(review): strtab/typetab are presumably the string table and a
 * per-symbol type table - confirm against the kallsyms code.
 */
struct mod_kallsyms {
        Elf_Sym *symtab;
        unsigned int num_symtab;        /* presumably the number of entries in symtab - TODO confirm */
        char *strtab;
        char *typetab;
};

#ifdef CONFIG_LIVEPATCH
/**
 * struct klp_modinfo - ELF information preserved from the livepatch module
 *
 * @hdr: ELF header
 * @sechdrs: Section header table
 * @secstrings: String table for the section headers
 * @symndx: The symbol table section index
 */
struct klp_modinfo {
        Elf_Ehdr hdr;
        Elf_Shdr *sechdrs;
        char *secstrings;
        unsigned int symndx;
};
#endif

/*
 * Descriptor of a module. Most members are only present when the matching
 * CONFIG_* option is enabled; the structure is cacheline aligned and its
 * field order is randomized by __randomize_layout.
 */
struct module {
        enum module_state state;

        /* Member of list of modules */
        struct list_head list;

        /* Unique handle for this module */
        char name[MODULE_NAME_LEN];

#ifdef CONFIG_STACKTRACE_BUILD_ID
        /* Module build ID */
        unsigned char build_id[BUILD_ID_SIZE_MAX];
#endif

        /* Sysfs stuff. */
        struct module_kobject mkobj;
        struct module_attribute *modinfo_attrs;
        const char *version;
        const char *srcversion;
        struct kobject *holders_dir;

        /* Exported symbols */
        const struct kernel_symbol *syms;
        const s32 *crcs;
        unsigned int num_syms;

#ifdef CONFIG_ARCH_USES_CFI_TRAPS
        s32 *kcfi_traps;
        s32 *kcfi_traps_end;
#endif

        /* Kernel parameters. */
#ifdef CONFIG_SYSFS
        struct mutex param_lock;
#endif
        struct kernel_param *kp;
        unsigned int num_kp;

        /* GPL-only exported symbols. */
        unsigned int num_gpl_syms;
        const struct kernel_symbol *gpl_syms;
        const s32 *gpl_crcs;
        bool using_gplonly_symbols;

#ifdef CONFIG_MODULE_SIG
        /* Signature was verified. */
        bool sig_ok;
#endif

        bool async_probe_requested;

        /* Exception table */
        unsigned int num_exentries;
        struct exception_table_entry *extable;

        /* Startup function. */
        int (*init)(void);

        /* One region per enum mod_mem_type value. */
        struct module_memory mem[MOD_MEM_NUM_TYPES] __module_memory_align;

        /* Arch-specific module values */
        struct mod_arch_specific arch;

        unsigned long taints;        /* same bits as kernel:taint_flags */

#ifdef CONFIG_GENERIC_BUG
        /* Support for BUG */
        unsigned num_bugs;
        struct list_head bug_list;
        struct bug_entry *bug_table;
#endif

#ifdef CONFIG_KALLSYMS
        /* Protected by RCU and/or module_mutex: use rcu_dereference() */
        struct mod_kallsyms __rcu *kallsyms;
        struct mod_kallsyms core_kallsyms;

        /* Section attributes */
        struct module_sect_attrs *sect_attrs;

        /* Notes attributes */
        struct module_notes_attrs *notes_attrs;
#endif

        /* The command line arguments (may be mangled).  People like
           keeping pointers to this stuff */
        char *args;

#ifdef CONFIG_SMP
        /* Per-cpu data. */
        void __percpu *percpu;
        unsigned int percpu_size;
#endif
        /* presumably start and size of the module's noinstr text - TODO confirm */
        void *noinstr_text_start;
        unsigned int noinstr_text_size;

#ifdef CONFIG_TRACEPOINTS
        unsigned int num_tracepoints;
        tracepoint_ptr_t *tracepoints_ptrs;
#endif
#ifdef CONFIG_TREE_SRCU
        unsigned int num_srcu_structs;
        struct srcu_struct **srcu_struct_ptrs;
#endif
#ifdef CONFIG_BPF_EVENTS
        unsigned int num_bpf_raw_events;
        struct bpf_raw_event_map *bpf_raw_events;
#endif
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
        unsigned int btf_data_size;
        void *btf_data;
#endif
#ifdef CONFIG_JUMP_LABEL
        struct jump_entry *jump_entries;
        unsigned int num_jump_entries;
#endif
#ifdef CONFIG_TRACING
        unsigned int num_trace_bprintk_fmt;
        const char **trace_bprintk_fmt_start;
#endif
#ifdef CONFIG_EVENT_TRACING
        struct trace_event_call **trace_events;
        unsigned int num_trace_events;
        struct trace_eval_map **trace_evals;
        unsigned int num_trace_evals;
#endif
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
        unsigned int num_ftrace_callsites;
        unsigned long *ftrace_callsites;
#endif
#ifdef CONFIG_KPROBES
        void *kprobes_text_start;
        unsigned int kprobes_text_size;
        unsigned long *kprobe_blacklist;
        unsigned int num_kprobe_blacklist;
#endif
#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
        int num_static_call_sites;
        struct static_call_site *static_call_sites;
#endif
#if IS_ENABLED(CONFIG_KUNIT)
        int num_kunit_init_suites;
        struct kunit_suite **kunit_init_suites;
        int num_kunit_suites;
        struct kunit_suite **kunit_suites;
#endif


#ifdef CONFIG_LIVEPATCH
        bool klp; /* Is this a livepatch module? */
        bool klp_alive;

        /* ELF information */
        struct klp_modinfo *klp_info;
#endif

#ifdef CONFIG_PRINTK_INDEX
        unsigned int printk_index_size;
        struct pi_entry **printk_index_start;
#endif

#ifdef CONFIG_MODULE_UNLOAD
        /* What modules depend on me? */
        struct list_head source_list;
        /* What modules do I depend on? */
        struct list_head target_list;

        /* Destruction function. */
        void (*exit)(void);

        atomic_t refcnt;
#endif

#ifdef CONFIG_CONSTRUCTORS
        /* Constructor functions. */
        ctor_fn_t *ctors;
        unsigned int num_ctors;
#endif

#ifdef CONFIG_FUNCTION_ERROR_INJECTION
        struct error_injection_entry *ei_funcs;
        unsigned int num_ei_funcs;
#endif
#ifdef CONFIG_DYNAMIC_DEBUG_CORE
        struct _ddebug_info dyndbg_info;
#endif
} ____cacheline_aligned __randomize_layout;
#ifndef MODULE_ARCH_INIT
#define MODULE_ARCH_INIT {}
#endif

#ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE
/*
 * Generic symbol-value accessor: hands back the st_value field of @sym
 * unchanged.  Only built when HAVE_ARCH_KALLSYMS_SYMBOL_VALUE is not defined.
 */
static inline unsigned long
kallsyms_symbol_value(const Elf_Sym *sym)
{
        unsigned long value = sym->st_value;

        return value;
}
#endif

/*
 * FIXME: It would be nice to isolate modules during init as well, so that
 * they are not used before they (possibly) fail.  At present too much code
 * (IDE & SCSI) requires entry into the module during init.
 *
 * A module is reported as live in every state except MODULE_STATE_GOING.
 */
static inline bool module_is_live(struct module *mod)
{
        if (mod->state == MODULE_STATE_GOING)
                return false;

        return true;
}

/* Report whether @mod is currently in MODULE_STATE_COMING. */
static inline bool module_is_coming(struct module *mod)
{
        if (mod->state == MODULE_STATE_COMING)
                return true;

        return false;
}

struct module *__module_text_address(unsigned long addr);
struct module *__module_address(unsigned long addr);
bool is_module_address(unsigned long addr);
bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
bool is_module_percpu_address(unsigned long addr);
bool is_module_text_address(unsigned long addr);

/*
 * Test whether @addr lies inside the @type memory region of @mod, as
 * described by mod->mem[type].base and mod->mem[type].size.
 *
 * The check is a single unsigned comparison of the offset from the region
 * base against the region size; an @addr below the base wraps around in the
 * unsigned subtraction instead of going negative.
 */
static inline bool within_module_mem_type(unsigned long addr,
                                          const struct module *mod,
                                          enum mod_mem_type type)
{
        const unsigned long start = (unsigned long)mod->mem[type].base;
        const unsigned long len = mod->mem[type].size;
        const unsigned long offset = addr - start;

        return offset < len;
}

/*
 * Return true when @addr lies in any memory region of @mod that
 * for_class_mod_mem_type() visits for the "core" class, false otherwise.
 */
static inline bool within_module_core(unsigned long addr,
                                      const struct module *mod)
{
        for_class_mod_mem_type(type, core) {
                /* A hit in any single region is sufficient. */
                if (within_module_mem_type(addr, mod, type))
                        return true;
        }
        return false;
}

/*
 * Return true when @addr lies in any memory region of @mod that
 * for_class_mod_mem_type() visits for the "init" class, false otherwise.
 */
static inline bool within_module_init(unsigned long addr,
                                      const struct module *mod)
{
        for_class_mod_mem_type(type, init) {
                /* A hit in any single region is sufficient. */
                if (within_module_mem_type(addr, mod, type))
                        return true;
        }
        return false;
}

/*
 * Return true when @addr belongs to @mod, checking the init regions first
 * and the core regions second.
 */
static inline bool within_module(unsigned long addr, const struct module *mod)
{
        if (within_module_init(addr, mod))
                return true;

        return within_module_core(addr, mod);
}

/* Search for module by name: must be in a RCU-sched critical section. */
struct module *find_module(const char *name);

extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
                        long code);
#define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)

#ifdef CONFIG_MODULE_UNLOAD
int module_refcount(struct module *mod);
void __symbol_put(const char *symbol);
#define symbol_put(x) __symbol_put(__stringify(x))
void symbol_put_addr(void *addr);

/* Sometimes we know we already have a refcount, and it's easier not
   to handle the error case (which only happens with rmmod --wait). */
extern void __module_get(struct module *module);

/**
 * try_module_get() - take module refcount unless module is being removed
 * @module: the module we should check for
 *
 * Only try to get a module reference count if the module is not being removed.
 * This call will fail if the module is in the process of being removed.
 *
 * Care must also be taken to ensure the module exists and is alive prior to
 * usage of this call. This can be guaranteed through two means:
 *
 * 1) Direct protection: you know an earlier caller must have increased the
 *    module reference through __module_get(). This can typically be achieved
 *    by having another entity other than the module itself increment the
 *    module reference count.
 *
 * 2) Implied protection: there is an implied protection against module
 *    removal. An example of this is the implied protection used by kernfs /
 *    sysfs. The sysfs store / read file operations are guaranteed to exist
 *    through the use of kernfs's active reference (see kernfs_active()) and a
 *    sysfs / kernfs file removal cannot happen unless the same file is not
 *    active. Therefore, if a sysfs file is being read or written to the module
 *    which created it must still exist. It is therefore safe to use
 *    try_module_get() on module sysfs store / read ops.
 *
 * One of the real values to try_module_get() is the module_is_live() check
 * which ensures that the caller of try_module_get() can yield to userspace
 * module removal requests and gracefully fail if the module is on its way out.
 *
 * Returns true if the reference count was successfully incremented.
 */
extern bool try_module_get(struct module *module);

/**
 * module_put() - release a reference count to a module
 * @module: the module we should release a reference count for
 *
 * If you successfully bump a reference count to a module with try_module_get(),
 * when you are finished you must call module_put() to release that reference
 * count.
 */
extern void module_put(struct module *module);

#else /*!CONFIG_MODULE_UNLOAD*/
/*
 * Without CONFIG_MODULE_UNLOAD no reference count is kept here: taking a
 * reference succeeds for a NULL @module or for any module that
 * module_is_live() accepts.
 */
static inline bool try_module_get(struct module *module)
{
        if (!module)
                return true;

        return module_is_live(module);
}

/* Nothing to release when modules cannot be unloaded. */
static inline void module_put(struct module *module)
{
}

/* Nothing to take when modules cannot be unloaded. */
static inline void __module_get(struct module *module)
{
}
#define symbol_put(x) do { } while (0)
#define symbol_put_addr(p) do { } while (0)

#endif /* CONFIG_MODULE_UNLOAD */

/*
 * module_name(mod) - name string for @mod, or "kernel" when @mod is NULL.
 *
 * This is a #define so the string doesn't get put in every .o file.
 * @mod is evaluated exactly once (it is copied into a local first).
 */
#define module_name(mod)                        \
({                                                \
        struct module *__mod = (mod);                \
        __mod ? __mod->name : "kernel";                \
})

/* Dereference module function descriptor */
void *dereference_module_function_descriptor(struct module *mod, void *ptr);

int register_module_notifier(struct notifier_block *nb);
int unregister_module_notifier(struct notifier_block *nb);

extern void print_modules(void);

/*
 * Report the async_probe_requested flag of @module; a NULL @module is
 * treated as "not requested".
 */
static inline bool module_requested_async_probing(struct module *module)
{
        if (!module)
                return false;

        return module->async_probe_requested;
}

/*
 * Report whether @mod is flagged as a livepatch module (mod->klp).
 * Always false when CONFIG_LIVEPATCH is not set.
 */
static inline bool is_livepatch_module(struct module *mod)
{
#ifndef CONFIG_LIVEPATCH
        return false;
#else
        return mod->klp;
#endif
}

void set_module_sig_enforced(void);

#else /* !CONFIG_MODULES... */

/*
 * !CONFIG_MODULES stubs: with no loadable modules, no address can belong
 * to one, so every lookup reports "not found".
 */

/* No module can contain @addr. */
static inline struct module *
__module_address(unsigned long addr)
{
        return NULL;
}

/* No module text can contain @addr. */
static inline struct module *
__module_text_address(unsigned long addr)
{
        return NULL;
}

/* Never a module address. */
static inline bool
is_module_address(unsigned long addr)
{
        return false;
}

/* Never a module per-cpu address. */
static inline bool
is_module_percpu_address(unsigned long addr)
{
        return false;
}

/* Never a module per-cpu address; @can_addr is left untouched. */
static inline bool
__is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
{
        return false;
}

/* Never a module text address. */
static inline bool
is_module_text_address(unsigned long addr)
{
        return false;
}

/* !CONFIG_MODULES: no address is ever inside a module's core regions. */
static inline bool
within_module_core(unsigned long addr, const struct module *mod)
{
        return false;
}

/* !CONFIG_MODULES: no address is ever inside a module's init regions. */
static inline bool
within_module_init(unsigned long addr, const struct module *mod)
{
        return false;
}

/* !CONFIG_MODULES: no address is ever inside a module. */
static inline bool
within_module(unsigned long addr, const struct module *mod)
{
        return false;
}

/* Get/put a kernel symbol (calls should be symmetric) */
#define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); })
#define symbol_put(x) do { } while (0)
#define symbol_put_addr(x) do { } while (0)

/* !CONFIG_MODULES: there is no reference count to bump. */
static inline void
__module_get(struct module *module)
{
}

/* !CONFIG_MODULES: taking a reference always succeeds. */
static inline bool
try_module_get(struct module *module)
{
        return true;
}

/* !CONFIG_MODULES: there is no reference count to drop. */
static inline void
module_put(struct module *module)
{
}

#define module_name(mod) "kernel"

/*
 * !CONFIG_MODULES: no module events will ever be delivered, so registering
 * a notifier can always report success.
 */
static inline int
register_module_notifier(struct notifier_block *nb)
{
        return 0;
}

/* !CONFIG_MODULES: nothing was registered, so unregistering is a no-op. */
static inline int
unregister_module_notifier(struct notifier_block *nb)
{
        return 0;
}

#define module_put_and_kthread_exit(code) kthread_exit(code)

/* !CONFIG_MODULES: there are no modules to print. */
static inline void
print_modules(void)
{
}

/* !CONFIG_MODULES: async probing is never requested. */
static inline bool
module_requested_async_probing(struct module *module)
{
        return false;
}

/* !CONFIG_MODULES: nothing to enforce. */
static inline void
set_module_sig_enforced(void)
{
}

/* Dereference module function descriptor: without modules, @ptr is returned as is. */
static inline void *
dereference_module_function_descriptor(struct module *mod, void *ptr)
{
        return ptr;
}

/* !CONFIG_MODULES: no module is ever in the "coming" state. */
static inline bool
module_is_coming(struct module *mod)
{
        return false;
}
#endif /* CONFIG_MODULES */

#ifdef CONFIG_SYSFS
extern struct kset *module_kset;
extern const struct kobj_type module_ktype;
#endif /* CONFIG_SYSFS */

#define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x)

/* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */

#define __MODULE_STRING(x) __stringify(x)

#ifdef CONFIG_GENERIC_BUG
void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
                         struct module *);
void module_bug_cleanup(struct module *);

#else        /* !CONFIG_GENERIC_BUG */

/* !CONFIG_GENERIC_BUG: nothing to finalize for @mod. */
static inline void
module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
                    struct module *mod)
{
}

/* !CONFIG_GENERIC_BUG: nothing to clean up for @mod. */
static inline void
module_bug_cleanup(struct module *mod)
{
}
#endif        /* CONFIG_GENERIC_BUG */

#ifdef CONFIG_MITIGATION_RETPOLINE
extern bool retpoline_module_ok(bool has_retpoline);
#else
/*
 * !CONFIG_MITIGATION_RETPOLINE: every module is acceptable, whatever
 * @has_retpoline says.
 */
static inline bool
retpoline_module_ok(bool has_retpoline)
{
        return true;
}
#endif

#ifdef CONFIG_MODULE_SIG
bool is_module_sig_enforced(void);

/* Report the sig_ok flag recorded in @module. */
static inline bool
module_sig_ok(struct module *module)
{
        const bool ok = module->sig_ok;

        return ok;
}
#else        /* !CONFIG_MODULE_SIG */
/* !CONFIG_MODULE_SIG: signature enforcement is never active. */
static inline bool
is_module_sig_enforced(void)
{
        return false;
}

/* !CONFIG_MODULE_SIG: every module is reported as having an OK signature. */
static inline bool
module_sig_ok(struct module *module)
{
        return true;
}
#endif        /* CONFIG_MODULE_SIG */

#if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS)
int module_kallsyms_on_each_symbol(const char *modname,
                                   int (*fn)(void *, const char *, unsigned long),
                                   void *data);

/* For kallsyms to ask for address resolution.  namebuf should be at
 * least KSYM_NAME_LEN long: a pointer to namebuf is returned if
 * found, otherwise NULL.
 */
const char *module_address_lookup(unsigned long addr,
                                  unsigned long *symbolsize,
                                  unsigned long *offset,
                                  char **modname, const unsigned char **modbuildid,
                                  char *namebuf);
int lookup_module_symbol_name(unsigned long addr, char *symname);
int lookup_module_symbol_attrs(unsigned long addr,
                               unsigned long *size,
                               unsigned long *offset,
                               char *modname,
                               char *name);

/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if
 * symnum out of range.
 */
int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                       char *name, char *module_name, int *exported);

/* Look for this name: can be of form module:name. */
unsigned long module_kallsyms_lookup_name(const char *name);

unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);

#else        /* CONFIG_MODULES && CONFIG_KALLSYMS */

/*
 * Stubs used unless both CONFIG_MODULES and CONFIG_KALLSYMS are set: there
 * are no module symbols to walk or resolve.
 */

/* Iteration is unsupported; @fn is never called. */
static inline int
module_kallsyms_on_each_symbol(const char *modname,
                               int (*fn)(void *, const char *, unsigned long),
                               void *data)
{
        return -EOPNOTSUPP;
}

/* For kallsyms to ask for address resolution.  NULL means not found. */
static inline const char *
module_address_lookup(unsigned long addr, unsigned long *symbolsize,
                      unsigned long *offset, char **modname,
                      const unsigned char **modbuildid, char *namebuf)
{
        return NULL;
}

/* No symbol can match @addr; @symname is left untouched. */
static inline int
lookup_module_symbol_name(unsigned long addr, char *symname)
{
        return -ERANGE;
}

/* Every @symnum is out of range; no output argument is written. */
static inline int
module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                   char *name, char *module_name, int *exported)
{
        return -ERANGE;
}

/* No name can be resolved; 0 means not found. */
static inline unsigned long
module_kallsyms_lookup_name(const char *name)
{
        return 0;
}

/* No symbol value can be found; 0 means not found. */
static inline unsigned long
find_kallsyms_symbol_value(struct module *mod, const char *name)
{
        return 0;
}

#endif  /* CONFIG_MODULES && CONFIG_KALLSYMS */

#endif /* _LINUX_MODULE_H */






























    1 



    1 




    1 

























    1 













    1 






















































































    1 







    1 













































































    1 













    1 




    1 












































    1 






    1 




















































    1 









    1 

























    1 



    1 




    1 




    1 




    1 























































    1 

















































































    1 
    1 








    1 











    1 











































    1 
    1 














    1 






    1 




    1 





























    1 




    1 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Copyright (C) 1996  Gertjan van Wingerde
 *        Minix V2 fs support.
 *
 *  Modified for 680x0 by Andreas Schwab
 *  Updated to filesystem version 3 by Daniel Aragones
 */

#include <linux/module.h>
#include "minix.h"
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/highuid.h>
#include <linux/mpage.h>
#include <linux/vfs.h>
#include <linux/writeback.h>
#include <linux/fs_context.h>

static int minix_write_inode(struct inode *inode,
                struct writeback_control *wbc);
static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);

/*
 * Evict @inode from memory.  Its page cache is dropped first; an inode with
 * no remaining links is additionally truncated to size 0 before teardown and
 * handed to minix_free_inode() afterwards.
 */
static void minix_evict_inode(struct inode *inode)
{
        truncate_inode_pages_final(&inode->i_data);

        /* Unlinked: shrink the file to nothing before tearing it down. */
        if (inode->i_nlink == 0) {
                inode->i_size = 0;
                minix_truncate(inode);
        }

        invalidate_inode_buffers(inode);
        clear_inode(inode);

        if (inode->i_nlink == 0)
                minix_free_inode(inode);
}

/*
 * Tear down the per-superblock state of @sb: write the saved mount state
 * back (read-write mounts only), release all bitmap buffers and the
 * superblock buffer, then free the bookkeeping memory.
 */
static void minix_put_super(struct super_block *sb)
{
        struct minix_sb_info *info = minix_sb(sb);
        int n;

        if (!sb_rdonly(sb)) {
                /* s_state is now out from V3 sb, so only V1/V2 restore it. */
                if (info->s_version != MINIX_V3)
                        info->s_ms->s_state = info->s_mount_state;
                mark_buffer_dirty(info->s_sbh);
        }

        for (n = 0; n < info->s_imap_blocks; n++)
                brelse(info->s_imap[n]);
        for (n = 0; n < info->s_zmap_blocks; n++)
                brelse(info->s_zmap[n]);
        brelse(info->s_sbh);

        /* s_imap is the start of the map array that s_zmap also points into. */
        kfree(info->s_imap);
        sb->s_fs_info = NULL;
        kfree(info);
}

static struct kmem_cache * minix_inode_cachep;

/*
 * Allocate a minix_inode_info from the inode cache for @sb and return its
 * embedded VFS inode, or NULL when the allocation fails.
 */
static struct inode *minix_alloc_inode(struct super_block *sb)
{
        struct minix_inode_info *info;

        info = alloc_inode_sb(sb, minix_inode_cachep, GFP_KERNEL);

        return info ? &info->vfs_inode : NULL;
}

static void minix_free_in_core_inode(struct inode *inode)
{
        kmem_cache_free(minix_inode_cachep, minix_i(inode));
}

/*
 * Object initializer handed to kmem_cache_create(): @foo is a
 * struct minix_inode_info whose embedded VFS inode gets its one-time setup.
 */
static void init_once(void *foo)
{
        struct minix_inode_info *info = foo;

        inode_init_once(&info->vfs_inode);
}

/*
 * Create the slab cache that backs struct minix_inode_info.
 *
 * Returns 0 on success or -ENOMEM when the cache cannot be created.
 */
static int __init init_inodecache(void)
{
        minix_inode_cachep = kmem_cache_create("minix_inode_cache",
                                sizeof(struct minix_inode_info), 0,
                                (SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT),
                                init_once);

        return minix_inode_cachep ? 0 : -ENOMEM;
}

/* Destroy the inode slab cache created by init_inodecache(). */
static void destroy_inodecache(void)
{
        /*
         * Inodes may still be waiting on a delayed RCU free; wait for all
         * of those callbacks to run before the cache goes away.
         */
        rcu_barrier();

        kmem_cache_destroy(minix_inode_cachep);
}

/* Superblock operations installed on every mounted minix filesystem. */
static const struct super_operations minix_sops = {
        /* in-core inode lifetime */
        .alloc_inode = minix_alloc_inode,
        .free_inode = minix_free_in_core_inode,
        .evict_inode = minix_evict_inode,
        /* writeback */
        .write_inode = minix_write_inode,
        /* superblock-level operations */
        .put_super = minix_put_super,
        .statfs = minix_statfs,
};

static int minix_reconfigure(struct fs_context *fc)
{
        struct minix_super_block * ms;
        struct super_block *sb = fc->root->d_sb;
        struct minix_sb_info * sbi = sb->s_fs_info;

        sync_filesystem(sb);
        ms = sbi->s_ms;
        if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
                return 0;
        if (fc->sb_flags & SB_RDONLY) {
                if (ms->s_state & MINIX_VALID_FS ||
                    !(sbi->s_mount_state & MINIX_VALID_FS))
                        return 0;
                /* Mounting a rw partition read-only. */
                if (sbi->s_version != MINIX_V3)
                        ms->s_state = sbi->s_mount_state;
                mark_buffer_dirty(sbi->s_sbh);
        } else {
                  /* Mount a partition which is read-only, read-write. */
                if (sbi->s_version != MINIX_V3) {
                        sbi->s_mount_state = ms->s_state;
                        ms->s_state &= ~MINIX_VALID_FS;
                } else {
                        sbi->s_mount_state = MINIX_VALID_FS;
                }
                mark_buffer_dirty(sbi->s_sbh);

                if (!(sbi->s_mount_state & MINIX_VALID_FS))
                        printk("MINIX-fs warning: remounting unchecked fs, "
                                "running fsck is recommended\n");
                else if ((sbi->s_mount_state & MINIX_ERROR_FS))
                        printk("MINIX-fs warning: remounting fs with errors, "
                                "running fsck is recommended\n");
        }
        return 0;
}

/*
 * Sanity-check the superblock values already copied into @sb and its
 * minix_sb_info.  Returns true when they are usable, false otherwise.
 */
static bool minix_check_superblock(struct super_block *sb)
{
        struct minix_sb_info *info = minix_sb(sb);

        /* Both bitmaps must occupy at least one block. */
        if (!info->s_imap_blocks || !info->s_zmap_blocks)
                return false;

        /* Nothing further to verify on V2/V3 (see below). */
        if (info->s_version != MINIX_V1)
                return true;

        /*
         * s_max_size must not exceed the block mapping limitation.  This check
         * is only needed for V1 filesystems, since V2/V3 support an extra level
         * of indirect blocks which places the limit well above U32_MAX.
         */
        return !(sb->s_maxbytes > (7 + 512 + 512*512) * BLOCK_SIZE);
}

static int minix_fill_super(struct super_block *s, struct fs_context *fc)
{
        struct buffer_head *bh;
        struct buffer_head **map;
        struct minix_super_block *ms;
        struct minix3_super_block *m3s = NULL;
        unsigned long i, block;
        struct inode *root_inode;
        struct minix_sb_info *sbi;
        int ret = -EINVAL;
        int silent = fc->sb_flags & SB_SILENT;

        sbi = kzalloc(sizeof(struct minix_sb_info), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
        s->s_fs_info = sbi;

        BUILD_BUG_ON(32 != sizeof (struct minix_inode));
        BUILD_BUG_ON(64 != sizeof(struct minix2_inode));

        if (!sb_set_blocksize(s, BLOCK_SIZE))
                goto out_bad_hblock;

        if (!(bh = sb_bread(s, 1)))
                goto out_bad_sb;

        ms = (struct minix_super_block *) bh->b_data;
        sbi->s_ms = ms;
        sbi->s_sbh = bh;
        sbi->s_mount_state = ms->s_state;
        sbi->s_ninodes = ms->s_ninodes;
        sbi->s_nzones = ms->s_nzones;
        sbi->s_imap_blocks = ms->s_imap_blocks;
        sbi->s_zmap_blocks = ms->s_zmap_blocks;
        sbi->s_firstdatazone = ms->s_firstdatazone;
        sbi->s_log_zone_size = ms->s_log_zone_size;
        s->s_maxbytes = ms->s_max_size;
        s->s_magic = ms->s_magic;
        if (s->s_magic == MINIX_SUPER_MAGIC) {
                sbi->s_version = MINIX_V1;
                sbi->s_dirsize = 16;
                sbi->s_namelen = 14;
                s->s_max_links = MINIX_LINK_MAX;
        } else if (s->s_magic == MINIX_SUPER_MAGIC2) {
                sbi->s_version = MINIX_V1;
                sbi->s_dirsize = 32;
                sbi->s_namelen = 30;
                s->s_max_links = MINIX_LINK_MAX;
        } else if (s->s_magic == MINIX2_SUPER_MAGIC) {
                sbi->s_version = MINIX_V2;
                sbi->s_nzones = ms->s_zones;
                sbi->s_dirsize = 16;
                sbi->s_namelen = 14;
                s->s_max_links = MINIX2_LINK_MAX;
        } else if (s->s_magic == MINIX2_SUPER_MAGIC2) {
                sbi->s_version = MINIX_V2;
                sbi->s_nzones = ms->s_zones;
                sbi->s_dirsize = 32;
                sbi->s_namelen = 30;
                s->s_max_links = MINIX2_LINK_MAX;
        } else if ( *(__u16 *)(bh->b_data + 24) == MINIX3_SUPER_MAGIC) {
                m3s = (struct minix3_super_block *) bh->b_data;
                s->s_magic = m3s->s_magic;
                sbi->s_imap_blocks = m3s->s_imap_blocks;
                sbi->s_zmap_blocks = m3s->s_zmap_blocks;
                sbi->s_firstdatazone = m3s->s_firstdatazone;
                sbi->s_log_zone_size = m3s->s_log_zone_size;
                s->s_maxbytes = m3s->s_max_size;
                sbi->s_ninodes = m3s->s_ninodes;
                sbi->s_nzones = m3s->s_zones;
                sbi->s_dirsize = 64;
                sbi->s_namelen = 60;
                sbi->s_version = MINIX_V3;
                sbi->s_mount_state = MINIX_VALID_FS;
                sb_set_blocksize(s, m3s->s_blocksize);
                s->s_max_links = MINIX2_LINK_MAX;
        } else
                goto out_no_fs;

        if (!minix_check_superblock(s))
                goto out_illegal_sb;

        /*
         * Allocate the buffer map to keep the superblock small.
         */
        i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh);
        map = kzalloc(i, GFP_KERNEL);
        if (!map)
                goto out_no_map;
        sbi->s_imap = &map[0];
        sbi->s_zmap = &map[sbi->s_imap_blocks];

        block=2;
        for (i=0 ; i < sbi->s_imap_blocks ; i++) {
                if (!(sbi->s_imap[i]=sb_bread(s, block)))
                        goto out_no_bitmap;
                block++;
        }
        for (i=0 ; i < sbi->s_zmap_blocks ; i++) {
                if (!(sbi->s_zmap[i]=sb_bread(s, block)))
                        goto out_no_bitmap;
                block++;
        }

        minix_set_bit(0,sbi->s_imap[0]->b_data);
        minix_set_bit(0,sbi->s_zmap[0]->b_data);

        /* Apparently minix can create filesystems that allocate more blocks for
         * the bitmaps than needed.  We simply ignore that, but verify it didn't
         * create one with not enough blocks and bail out if so.
         */
        block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
        if (sbi->s_imap_blocks < block) {
                printk("MINIX-fs: file system does not have enough "
                                "imap blocks allocated.  Refusing to mount.\n");
                goto out_no_bitmap;
        }

        block = minix_blocks_needed(
                        (sbi->s_nzones - sbi->s_firstdatazone + 1),
                        s->s_blocksize);
        if (sbi->s_zmap_blocks < block) {
                printk("MINIX-fs: file system does not have enough "
                                "zmap blocks allocated.  Refusing to mount.\n");
                goto out_no_bitmap;
        }

        /* set up enough so that it can read an inode */
        s->s_op = &minix_sops;
        s->s_time_min = 0;
        s->s_time_max = U32_MAX;
        root_inode = minix_iget(s, MINIX_ROOT_INO);
        if (IS_ERR(root_inode)) {
                ret = PTR_ERR(root_inode);
                goto out_no_root;
        }

        ret = -ENOMEM;
        s->s_root = d_make_root(root_inode);
        if (!s->s_root)
                goto out_no_root;

        if (!sb_rdonly(s)) {
                if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */
                        ms->s_state &= ~MINIX_VALID_FS;
                mark_buffer_dirty(bh);
        }
        if (!(sbi->s_mount_state & MINIX_VALID_FS))
                printk("MINIX-fs: mounting unchecked file system, "
                        "running fsck is recommended\n");
        else if (sbi->s_mount_state & MINIX_ERROR_FS)
                printk("MINIX-fs: mounting file system with errors, "
                        "running fsck is recommended\n");

        return 0;

out_no_root:
        if (!silent)
                printk("MINIX-fs: get root inode failed\n");
        goto out_freemap;

out_no_bitmap:
        printk("MINIX-fs: bad superblock or unable to read bitmaps\n");
out_freemap:
        for (i = 0; i < sbi->s_imap_blocks; i++)
                brelse(sbi->s_imap[i]);
        for (i = 0; i < sbi->s_zmap_blocks; i++)
                brelse(sbi->s_zmap[i]);
        kfree(sbi->s_imap);
        goto out_release;

out_no_map:
        ret = -ENOMEM;
        if (!silent)
                printk("MINIX-fs: can't allocate map\n");
        goto out_release;

out_illegal_sb:
        if (!silent)
                printk("MINIX-fs: bad superblock\n");
        goto out_release;

out_no_fs:
        if (!silent)
                printk("VFS: Can't find a Minix filesystem V1 | V2 | V3 "
                       "on device %s.\n", s->s_id);
out_release:
        brelse(bh);
        goto out;

out_bad_hblock:
        printk("MINIX-fs: blocksize too small for device\n");
        goto out;

out_bad_sb:
        printk("MINIX-fs: unable to read superblock\n");
out:
        s->s_fs_info = NULL;
        kfree(sbi);
        return ret;
}

/*
 * get_tree handler: hands @fc to get_tree_bdev() with minix_fill_super()
 * as the superblock fill callback and returns its result.
 */
static int minix_get_tree(struct fs_context *fc)
{
        int err = get_tree_bdev(fc, minix_fill_super);

        return err;
}

/* fs_context callbacks: remount handling and superblock creation. */
static const struct fs_context_operations minix_context_ops = {
        .reconfigure = minix_reconfigure,
        .get_tree = minix_get_tree,
};

/* Install the minix fs_context operations on @fc.  Always returns 0. */
static int minix_init_fs_context(struct fs_context *fc)
{
        fc->ops = &minix_context_ops;
        return 0;
}

/*
 * Fill @buf with filesystem statistics for the superblock behind @dentry.
 * Always returns 0.
 */
static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct super_block *sb = dentry->d_sb;
        struct minix_sb_info *info = minix_sb(sb);
        u64 dev_id = huge_encode_dev(sb->s_bdev->bd_dev);

        buf->f_type = sb->s_magic;
        buf->f_bsize = sb->s_blocksize;

        /* Zones past the first data zone, scaled by the zone size shift. */
        buf->f_blocks = (info->s_nzones - info->s_firstdatazone) << info->s_log_zone_size;
        buf->f_bfree = minix_count_free_blocks(sb);
        /* The same free count is reported as available. */
        buf->f_bavail = buf->f_bfree;

        buf->f_files = info->s_ninodes;
        buf->f_ffree = minix_count_free_inodes(sb);

        buf->f_namelen = info->s_namelen;
        buf->f_fsid = u64_to_fsid(dev_id);

        return 0;
}

/*
 * Dispatch a block-mapping request to the V1 helper for V1 inodes and to
 * the V2 helper for every other inode version.
 */
static int minix_get_block(struct inode *inode, sector_t block,
                    struct buffer_head *bh_result, int create)
{
        if (INODE_VERSION(inode) != MINIX_V1)
                return V2_minix_get_block(inode, block, bh_result, create);

        return V1_minix_get_block(inode, block, bh_result, create);
}

/* writepages: delegate to mpage_writepages() with the minix block mapper. */
static int minix_writepages(struct address_space *mapping,
                struct writeback_control *wbc)
{
        int err = mpage_writepages(mapping, wbc, minix_get_block);

        return err;
}

/*
 * read_folio: delegate to block_read_full_folio() with the minix block
 * mapper.  @file is not used.
 */
static int minix_read_folio(struct file *file, struct folio *folio)
{
        int err = block_read_full_folio(folio, minix_get_block);

        return err;
}

/*
 * Prepare @len bytes at @pos in @page for writing by calling
 * __block_write_begin() with the minix block mapper; returns its result.
 */
int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
{
        int err = __block_write_begin(page, pos, len, minix_get_block);

        return err;
}

/*
 * Undo the effects of a failed write that was meant to end at offset @to:
 * if that lies beyond the current i_size, drop page cache past i_size and
 * call minix_truncate() on the inode.
 */
static void minix_write_failed(struct address_space *mapping, loff_t to)
{
        struct inode *host = mapping->host;

        /* The write stayed within the file: nothing to undo. */
        if (!(to > host->i_size))
                return;

        truncate_pagecache(host, host->i_size);
        minix_truncate(host);
}

/*
 * ->write_begin: set up a buffered write of @len bytes at @pos via
 * block_write_begin().  On failure, minix_write_failed() is invoked with
 * the intended end offset of the write.  Returns the result of
 * block_write_begin().  @file and @fsdata are not used.
 */
static int minix_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct page **pagep, void **fsdata)
{
        int err = block_write_begin(mapping, pos, len, pagep, minix_get_block);

        if (unlikely(err))
                minix_write_failed(mapping, pos + len);
        return err;
}

/*
 * ->bmap: translate a file block number via generic_block_bmap(),
 * using minix_get_block() as the mapping callback.
 */
static sector_t minix_bmap(struct address_space *mapping, sector_t block)
{
        return generic_block_bmap(mapping, block, minix_get_block);
}

/*
 * Address-space operations installed by minix_set_inode() on regular
 * files, directories and symlinks.  The minix_* entries are thin wrappers
 * defined above that supply minix_get_block() to the generic helpers.
 */
static const struct address_space_operations minix_aops = {
        .dirty_folio        = block_dirty_folio,
        .invalidate_folio = block_invalidate_folio,
        .read_folio = minix_read_folio,
        .writepages = minix_writepages,
        .write_begin = minix_write_begin,
        .write_end = generic_write_end,
        .migrate_folio = buffer_migrate_folio,
        .bmap = minix_bmap,
        .direct_IO = noop_direct_IO
};

/*
 * Inode operations installed on symlinks by minix_set_inode(): link
 * lookup goes through page_get_link, attributes through minix_getattr().
 */
static const struct inode_operations minix_symlink_inode_operations = {
        .get_link        = page_get_link,
        .getattr        = minix_getattr,
};

/*
 * Install the operation tables that match the inode's file type.
 *
 * Regular files, directories and symlinks all use minix_aops for their
 * mapping; regular files and directories additionally get their file
 * operations, and symlinks are marked with inode_nohighmem().  Every
 * other type is handed to init_special_inode() together with @rdev.
 */
void minix_set_inode(struct inode *inode, dev_t rdev)
{
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &minix_file_inode_operations;
                inode->i_fop = &minix_file_operations;
                inode->i_mapping->a_ops = &minix_aops;
                return;
        }

        if (S_ISDIR(inode->i_mode)) {
                inode->i_op = &minix_dir_inode_operations;
                inode->i_fop = &minix_dir_operations;
                inode->i_mapping->a_ops = &minix_aops;
                return;
        }

        if (S_ISLNK(inode->i_mode)) {
                inode->i_op = &minix_symlink_inode_operations;
                inode_nohighmem(inode);
                inode->i_mapping->a_ops = &minix_aops;
                return;
        }

        /* Anything else is treated as a special inode. */
        init_special_inode(inode, inode->i_mode, rdev);
}

/*
 * Fill an in-core inode from its on-disk minix V1 inode.
 *
 * Called from minix_iget() for inodes still flagged I_NEW.  On success the
 * inode is released with unlock_new_inode() and returned.  On failure the
 * inode is discarded with iget_failed() and an ERR_PTR is returned: -EIO
 * when the raw inode cannot be obtained, -ESTALE when its link count is 0.
 */
static struct inode *V1_minix_iget(struct inode *inode)
{
        struct minix_inode_info *info = minix_i(inode);
        struct minix_inode *raw;
        struct buffer_head *bh;
        int err;
        int zone;

        raw = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh);
        if (!raw) {
                err = -EIO;
                goto fail;
        }
        if (raw->i_nlinks == 0) {
                printk("MINIX-fs: deleted inode referenced: %lu\n",
                       inode->i_ino);
                brelse(bh);
                err = -ESTALE;
                goto fail;
        }

        inode->i_mode = raw->i_mode;
        i_uid_write(inode, raw->i_uid);
        i_gid_write(inode, raw->i_gid);
        set_nlink(inode, raw->i_nlinks);
        inode->i_size = raw->i_size;
        /* V1 keeps one timestamp; it feeds ctime, atime and mtime alike. */
        inode_set_mtime_to_ts(inode,
                              inode_set_atime_to_ts(inode,
                                                    inode_set_ctime(inode, raw->i_time, 0)));
        inode->i_blocks = 0;
        for (zone = 0; zone < 9; zone++)
                info->u.i1_data[zone] = raw->i_zone[zone];
        /* The first zone slot is also decoded as the device number. */
        minix_set_inode(inode, old_decode_dev(raw->i_zone[0]));
        brelse(bh);
        unlock_new_inode(inode);
        return inode;

fail:
        iget_failed(inode);
        return ERR_PTR(err);
}

/*
 * Fill an in-core inode from its on-disk minix V2 inode.
 *
 * Called from minix_iget() for inodes still flagged I_NEW.  On success the
 * inode is released with unlock_new_inode() and returned.  On failure the
 * inode is discarded with iget_failed() and an ERR_PTR is returned: -EIO
 * when the raw inode cannot be obtained, -ESTALE when its link count is 0.
 */
static struct inode *V2_minix_iget(struct inode *inode)
{
        struct minix_inode_info *info = minix_i(inode);
        struct minix2_inode *raw;
        struct buffer_head *bh;
        int err;
        int zone;

        raw = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh);
        if (!raw) {
                err = -EIO;
                goto fail;
        }
        if (raw->i_nlinks == 0) {
                printk("MINIX-fs: deleted inode referenced: %lu\n",
                       inode->i_ino);
                brelse(bh);
                err = -ESTALE;
                goto fail;
        }

        inode->i_mode = raw->i_mode;
        i_uid_write(inode, raw->i_uid);
        i_gid_write(inode, raw->i_gid);
        set_nlink(inode, raw->i_nlinks);
        inode->i_size = raw->i_size;
        /* V2 stores separate modification, access and change times. */
        inode_set_mtime(inode, raw->i_mtime, 0);
        inode_set_atime(inode, raw->i_atime, 0);
        inode_set_ctime(inode, raw->i_ctime, 0);
        inode->i_blocks = 0;
        for (zone = 0; zone < 10; zone++)
                info->u.i2_data[zone] = raw->i_zone[zone];
        /* The first zone slot is also decoded as the device number. */
        minix_set_inode(inode, old_decode_dev(raw->i_zone[0]));
        brelse(bh);
        unlock_new_inode(inode);
        return inode;

fail:
        iget_failed(inode);
        return ERR_PTR(err);
}

/*
 * Look up (or read in) the inode numbered @ino on @sb.
 *
 * Returns ERR_PTR(-ENOMEM) if iget_locked() yields no inode.  An inode
 * that is not flagged I_NEW is returned as is; a new one is filled in by
 * the V1 or V2 reader, whose result (inode or ERR_PTR) is passed through.
 */
struct inode *minix_iget(struct super_block *sb, unsigned long ino)
{
        struct inode *inode = iget_locked(sb, ino);

        if (!inode)
                return ERR_PTR(-ENOMEM);
        if (!(inode->i_state & I_NEW))
                return inode;

        return INODE_VERSION(inode) == MINIX_V1 ? V1_minix_iget(inode)
                                                : V2_minix_iget(inode);
}

/*
 * Copy the in-core inode into its on-disk minix V1 inode.
 *
 * Returns the buffer head that holds the raw inode, already marked dirty,
 * or NULL if the raw inode could not be obtained.  The buffer is not
 * released here; minix_write_inode() calls brelse() on it.
 */
static struct buffer_head * V1_minix_update_inode(struct inode * inode)
{
        struct minix_inode_info *info = minix_i(inode);
        struct minix_inode *raw;
        struct buffer_head *bh;
        int zone;

        raw = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh);
        if (!raw)
                return NULL;

        raw->i_mode = inode->i_mode;
        raw->i_uid = fs_high2lowuid(i_uid_read(inode));
        raw->i_gid = fs_high2lowgid(i_gid_read(inode));
        raw->i_nlinks = inode->i_nlink;
        raw->i_size = inode->i_size;
        /* V1 has a single timestamp field; it receives the mtime seconds. */
        raw->i_time = inode_get_mtime_sec(inode);

        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
                /* Device nodes keep the encoded device number in zone 0. */
                raw->i_zone[0] = old_encode_dev(inode->i_rdev);
        } else {
                for (zone = 0; zone < 9; zone++)
                        raw->i_zone[zone] = info->u.i1_data[zone];
        }

        mark_buffer_dirty(bh);
        return bh;
}

/*
 * Copy the in-core inode into its on-disk minix V2 inode.
 *
 * Returns the buffer head that holds the raw inode, already marked dirty,
 * or NULL if the raw inode could not be obtained.  The buffer is not
 * released here; minix_write_inode() calls brelse() on it.
 */
static struct buffer_head * V2_minix_update_inode(struct inode * inode)
{
        struct minix_inode_info *info = minix_i(inode);
        struct minix2_inode *raw;
        struct buffer_head *bh;
        int zone;

        raw = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh);
        if (!raw)
                return NULL;

        raw->i_mode = inode->i_mode;
        raw->i_uid = fs_high2lowuid(i_uid_read(inode));
        raw->i_gid = fs_high2lowgid(i_gid_read(inode));
        raw->i_nlinks = inode->i_nlink;
        raw->i_size = inode->i_size;
        /* Only the seconds part of each timestamp is written out. */
        raw->i_mtime = inode_get_mtime_sec(inode);
        raw->i_atime = inode_get_atime_sec(inode);
        raw->i_ctime = inode_get_ctime_sec(inode);

        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
                /* Device nodes keep the encoded device number in zone 0. */
                raw->i_zone[0] = old_encode_dev(inode->i_rdev);
        } else {
                for (zone = 0; zone < 10; zone++)
                        raw->i_zone[zone] = info->u.i2_data[zone];
        }

        mark_buffer_dirty(bh);
        return bh;
}

/*
 * Write the in-core inode back to its on-disk inode.
 *
 * The version-specific updater copies the fields and returns the dirty
 * buffer; -EIO is returned if it yields no buffer.  For WB_SYNC_ALL
 * writeback the buffer is synced immediately, and -EIO is returned if it
 * was submitted but did not end up uptodate.  Otherwise returns 0.
 */
static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
{
        struct buffer_head *bh;
        int ret = 0;

        if (INODE_VERSION(inode) != MINIX_V1)
                bh = V2_minix_update_inode(inode);
        else
                bh = V1_minix_update_inode(inode);
        if (!bh)
                return -EIO;

        if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) {
                sync_dirty_buffer(bh);
                if (buffer_req(bh) && !buffer_uptodate(bh)) {
                        printk("IO error syncing minix inode [%s:%08lx]\n",
                                inode->i_sb->s_id, inode->i_ino);
                        ret = -EIO;
                }
        }

        brelse(bh);
        return ret;
}

/*
 * ->getattr: fill @stat from the inode, then override the block count and
 * block size with minix-specific values.
 *
 * generic_fillattr() is called with nop_mnt_idmap; the @idmap argument is
 * not consulted, and @flags is unused.  The block count is expressed in
 * 512-byte units: V1 scales by BLOCK_SIZE, V2 by the superblock's block
 * size.  Always returns 0.
 */
int minix_getattr(struct mnt_idmap *idmap, const struct path *path,
                  struct kstat *stat, u32 request_mask, unsigned int flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct super_block *sb = path->dentry->d_sb;

        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);

        if (INODE_VERSION(inode) != MINIX_V1)
                stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb);
        else
                stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb);

        stat->blksize = sb->s_blocksize;
        return 0;
}

/*
 * Truncate an inode's data using the version-specific implementation.
 * Only regular files, directories and symlinks are handled; any other
 * inode type is left untouched.
 */
void minix_truncate(struct inode * inode)
{
        if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
            !S_ISLNK(inode->i_mode))
                return;

        if (INODE_VERSION(inode) != MINIX_V1)
                V2_minix_truncate(inode);
        else
                V1_minix_truncate(inode);
}

/*
 * Filesystem type descriptor for "minix".  It is registered by
 * init_minix_fs() and unregistered by exit_minix_fs(); new contexts are
 * set up through minix_init_fs_context().
 */
static struct file_system_type minix_fs_type = {
        .owner                        = THIS_MODULE,
        .name                        = "minix",
        .kill_sb                = kill_block_super,
        .fs_flags                = FS_REQUIRES_DEV,
        .init_fs_context        = minix_init_fs_context,
};
MODULE_ALIAS_FS("minix");

/*
 * Module init: create the inode cache, then register the filesystem type.
 * If registration fails the inode cache is destroyed again.  Returns 0 on
 * success or the error from the step that failed.
 */
static int __init init_minix_fs(void)
{
        int err;

        err = init_inodecache();
        if (err)
                return err;

        err = register_filesystem(&minix_fs_type);
        if (err)
                destroy_inodecache();

        return err;
}

/*
 * Module exit: unregister the filesystem type first, then destroy the
 * inode cache (the reverse of init_minix_fs()).
 */
static void __exit exit_minix_fs(void)
{
        unregister_filesystem(&minix_fs_type);
        destroy_inodecache();
}

/* Module entry and exit hooks, and the module's license declaration. */
module_init(init_minix_fs)
module_exit(exit_minix_fs)
MODULE_LICENSE("GPL");






























































    1 
    3 







    2 
    2 

    2 
    2 















































































































































































































































    2 









    2 









   12 




   10 








   12 










   11 


   12 



























   10 






    3 







































   12 






   12 


   10 


   11 














    3 



    3 




   11 

    3 
   10 

   11 





   10 



























    3 
   11 

   12 
















































































































































































    1 

    1 











    1 





    1 
    1 



    1 


    1 

    1 




    1 


    1 

    1 


    1 


    1 
    1 


























    1 




    1 
    1 





























































































    1 










    1 








    1 


    1 












    1 



    1 











    1 


    1 



    1 

    1 












    1 








































    1 
    1 












    1 
    1 
    1 

    1 




   11 
   13 









   11 





   12 

   12 






    1 

    1 




    1 

    1 

    1 

    1 










    1 


    1 












































































































    1 






























    1 







    1 




    1 




    1 















    1 

    1 












   11 


   11 



   11 


   11 


   10 






   10 


   11 

















   11 
   11 


   12 
    3 
    2 






   11 





   12 
    3 


















































































































































































    2 






































































































































































































































































    3 




    3 





    3 








    3 


































































































































   10 































    3 








    3 

























































    3 



















    3 
    3 



    3 













    3 










    3 




    2 




































































    3 

































    1 










































    2 


    1 














    3 





    5 

    3 









    4 



    4 


    5 




    3 












































































































































































































    2 










    1 







    2 









































































   10 
    3 






   11 























    7 


















    8 


    8 
    8 
















































    8 







    9 

    7 



    9 





    7 


    1 
    8 









    8 













    8 








    1 



















    4 




    4 

    3 












    3 



    2 





   14 









    8 



    4 


    9 








    6 
    1 

    1 



    9 






    5 
    4 






































   10 
    3 

    3 

















   11 


   13 











   10 













   11 









































   11 











   11 





















   13 



   12 

    4 





   13 


   11 



   12 
   13 






   13 

   11 

   12 

   13 







   11 


   12 



   11 


















    3 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
// SPDX-License-Identifier: GPL-2.0
/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/llist.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
#include <linux/sched/isolation.h>

#include <trace/events/block.h>

#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"

/*
 * Per-CPU state: one lockless list head and one call_single_data_t per CPU.
 * NOTE(review): the names suggest these back request completion handling;
 * their users are not in this part of the file -- confirm there.
 */
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);

/* Forward declarations of static helpers whose bodies come later. */
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
static void blk_mq_request_bypass_insert(struct request *rq,
                blk_insert_t flags);
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
                struct list_head *list);
static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
                         struct io_comp_batch *iob, unsigned int flags);

/*
 * Check if any of the ctx, dispatch list or elevator
 * have pending work in this hardware queue.
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
        return !list_empty_careful(&hctx->dispatch) ||
                sbitmap_any_bit_set(&hctx->ctx_map) ||
                        blk_mq_sched_has_work(hctx);
}

/*
 * Flag @ctx in @hctx's ctx_map so that the hardware queue sees this
 * software queue as having pending work.
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
                                     struct blk_mq_ctx *ctx)
{
        const int ctx_bit = ctx->index_hw[hctx->type];

        /* Only write the bit when it is not already set. */
        if (sbitmap_test_bit(&hctx->ctx_map, ctx_bit))
                return;

        sbitmap_set_bit(&hctx->ctx_map, ctx_bit);
}

/*
 * Clear @ctx's bit in @hctx's ctx_map, the inverse of
 * blk_mq_hctx_mark_pending().
 */
static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
                                      struct blk_mq_ctx *ctx)
{
        sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw[hctx->type]);
}

/*
 * Accumulator handed to blk_mq_check_inflight() through the tag iterator's
 * private-data pointer.
 */
struct mq_inflight {
        /* device being queried; if it is a partition, only its requests count */
        struct block_device *part;
        /* number of in-flight requests, indexed by rq_data_dir() */
        unsigned int inflight[2];
};

/*
 * Tag-iterator callback used by blk_mq_in_flight() and
 * blk_mq_in_flight_rw().  @priv points to a struct mq_inflight.
 *
 * A request is counted when it has a partition, takes part in I/O
 * statistics, matches the queried device (any request matches when the
 * queried device is not a partition) and is in the MQ_RQ_IN_FLIGHT state.
 * Always returns true.
 */
static bool blk_mq_check_inflight(struct request *rq, void *priv)
{
        struct mq_inflight *counts = priv;

        if (!rq->part || !blk_do_io_stat(rq))
                return true;
        if (bdev_is_partition(counts->part) && rq->part != counts->part)
                return true;
        if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
                return true;

        counts->inflight[rq_data_dir(rq)]++;
        return true;
}

unsigned int blk_mq_in_flight(struct request_queue *q,
                struct block_device *part)
{
        struct mq_inflight mi = { .part = part };

        blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);

        return mi.inflight[0] + mi.inflight[1];
}

void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
                unsigned int inflight[2])
{
        struct mq_inflight mi = { .part = part };

        blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
        inflight[0] = mi.inflight[0];
        inflight[1] = mi.inflight[1];
}

void blk_freeze_queue_start(struct request_queue *q)
{
        mutex_lock(&q->mq_freeze_lock);
        if (++q->mq_freeze_depth == 1) {
                percpu_ref_kill(&q->q_usage_counter);
                mutex_unlock(&q->mq_freeze_lock);
                if (queue_is_mq(q))
                        blk_mq_run_hw_queues(q, false);
        } else {
                mutex_unlock(&q->mq_freeze_lock);
        }
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);

/*
 * Sleep on q->mq_freeze_wq until q->q_usage_counter reads zero.  This only
 * waits; it does not start a freeze itself (blk_freeze_queue() calls
 * blk_freeze_queue_start() before calling this).
 */
void blk_mq_freeze_queue_wait(struct request_queue *q)
{
        wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);

/*
 * Bounded variant of blk_mq_freeze_queue_wait(): wait on q->mq_freeze_wq
 * for q->q_usage_counter to read zero, giving up after @timeout.
 *
 * @timeout is handed to wait_event_timeout() unchanged, and the value
 * returned is whatever wait_event_timeout() returns.
 */
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
                                     unsigned long timeout)
{
        return wait_event_timeout(q->mq_freeze_wq,
                                        percpu_ref_is_zero(&q->q_usage_counter),
                                        timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);

/*
 * Freeze @q and wait for the freeze to take effect: start the freeze, then
 * block until q_usage_counter has dropped to zero.
 *
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
void blk_freeze_queue(struct request_queue *q)
{
        /*
         * In the !blk_mq case we are only calling this to kill the
         * q_usage_counter, otherwise this increases the freeze depth
         * and waits for it to return to zero.  For this reason there is
         * no blk_unfreeze_queue(), and blk_freeze_queue() is not
         * exported to drivers as the only user for unfreeze is blk_mq.
         */
        blk_freeze_queue_start(q);
        blk_mq_freeze_queue_wait(q);
}

/*
 * Exported blk_mq_* name for blk_freeze_queue(): freeze @q and wait until
 * the freeze has taken effect.  Pair with blk_mq_unfreeze_queue().
 */
void blk_mq_freeze_queue(struct request_queue *q)
{
        /*
         * ...just an alias to keep freeze and unfreeze actions balanced
         * in the blk_mq_* namespace
         */
        blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

/*
 * Undo one blk_freeze_queue_start(): decrement mq_freeze_depth and, once it
 * reaches zero, resurrect q_usage_counter and wake everybody sleeping on
 * mq_freeze_wq.  A depth that goes negative triggers a one-time warning.
 *
 * When @force_atomic is true the force_atomic field of the reference's
 * data is set before the depth is decremented.
 */
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
        mutex_lock(&q->mq_freeze_lock);

        if (force_atomic)
                q->q_usage_counter.data->force_atomic = true;

        q->mq_freeze_depth--;
        WARN_ON_ONCE(q->mq_freeze_depth < 0);
        if (q->mq_freeze_depth != 0)
                goto out_unlock;

        /* last freezer is gone */
        percpu_ref_resurrect(&q->q_usage_counter);
        wake_up_all(&q->mq_freeze_wq);

out_unlock:
        mutex_unlock(&q->mq_freeze_lock);
}

/*
 * Drop one freeze level on @q; equivalent to __blk_mq_unfreeze_queue()
 * with force_atomic == false.
 */
void blk_mq_unfreeze_queue(struct request_queue *q)
{
        __blk_mq_unfreeze_queue(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);

/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
        unsigned long flags;

        spin_lock_irqsave(&q->queue_lock, flags);
        if (!q->quiesce_depth++)
                blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
        spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

/**
 * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
 * @set: tag_set to wait on
 *
 * Waits with synchronize_srcu() on @set->srcu when the tag set has
 * BLK_MQ_F_BLOCKING set, and with synchronize_rcu() otherwise.
 *
 * Note: it is the driver's responsibility to make sure that quiesce has
 * been started on one or more of the request_queues of the tag_set.  This
 * function only waits for the quiesce on those request_queues that had
 * the quiesce flag set using blk_mq_quiesce_queue_nowait.
 */
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
{
        if (!(set->flags & BLK_MQ_F_BLOCKING)) {
                synchronize_rcu();
                return;
        }

        synchronize_srcu(set->srcu);
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);

/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Marks the queue quiesced and, for blk-mq queues, waits on the queue's
 * tag set via blk_mq_wait_quiesce_done().
 *
 * Note: this function does not prevent the struct request end_io()
 * callback function from being invoked. Once this function has returned,
 * no dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
        blk_mq_quiesce_queue_nowait(q);

        /* nothing to wait for non-mq queues */
        if (!queue_is_mq(q))
                return;

        blk_mq_wait_quiesce_done(q->tag_set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

/*
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * Drops one level of quiescing.  When the last level is dropped,
 * QUEUE_FLAG_QUIESCED is cleared and the hardware queues are run so that
 * requests inserted while the queue was quiesced get dispatched.  A call
 * with quiesce_depth already <= 0 triggers a one-time warning and changes
 * nothing.
 */
void blk_mq_unquiesce_queue(struct request_queue *q)
{
        unsigned long irq_flags;
        bool last_level = false;

        spin_lock_irqsave(&q->queue_lock, irq_flags);
        if (WARN_ON_ONCE(q->quiesce_depth <= 0))
                goto out_unlock;
        if (--q->quiesce_depth == 0) {
                blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
                last_level = true;
        }
out_unlock:
        spin_unlock_irqrestore(&q->queue_lock, irq_flags);

        /* dispatch requests which are inserted during quiescing */
        if (last_level)
                blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);

/*
 * Quiesce every request queue on @set's tag_list, except those for which
 * blk_queue_skip_tagset_quiesce() is true, and then wait once for the whole
 * set.  tag_list_lock is held for the walk and for the wait.
 */
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
{
        struct request_queue *queue;

        mutex_lock(&set->tag_list_lock);
        list_for_each_entry(queue, &set->tag_list, tag_set_list) {
                if (blk_queue_skip_tagset_quiesce(queue))
                        continue;
                blk_mq_quiesce_queue_nowait(queue);
        }
        blk_mq_wait_quiesce_done(set);
        mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);

/*
 * Counterpart of blk_mq_quiesce_tagset(): unquiesce every request queue on
 * @set's tag_list, except those for which blk_queue_skip_tagset_quiesce()
 * is true.  tag_list_lock is held for the walk.
 */
void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
{
        struct request_queue *queue;

        mutex_lock(&set->tag_list_lock);
        list_for_each_entry(queue, &set->tag_list, tag_set_list) {
                if (blk_queue_skip_tagset_quiesce(queue))
                        continue;
                blk_mq_unquiesce_queue(queue);
        }
        mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);

void blk_mq_wake_waiters(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        queue_for_each_hw_ctx(q, hctx, i)
                if (blk_mq_hw_queue_mapped(hctx))
                        blk_mq_tag_wakeup_all(hctx->tags, true);
}

/*
 * Reset @rq to a pristine state bound to @q.  The whole structure is zeroed
 * first; after that the linkage nodes, both tags, the sector, the start
 * time and the crypto fields receive their "unset" defaults.
 */
void blk_rq_init(struct request_queue *q, struct request *rq)
{
        memset(rq, 0, sizeof(*rq));

        /* owner and position */
        rq->q = q;
        rq->part = NULL;
        rq->__sector = (sector_t) -1;

        /* neither a driver tag nor a scheduler tag yet */
        rq->tag = BLK_MQ_NO_TAG;
        rq->internal_tag = BLK_MQ_NO_TAG;

        /* list, hash and rb-tree linkage */
        INIT_LIST_HEAD(&rq->queuelist);
        INIT_HLIST_NODE(&rq->hash);
        RB_CLEAR_NODE(&rq->rb_node);

        rq->start_time_ns = blk_time_get_ns();
        blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);

/*
 * Set start and alloc time when the allocated request is actually used.
 *
 * start_time_ns is the current time if the request needs a time stamp and
 * 0 otherwise.  With CONFIG_BLK_RQ_ALLOC_TIME, alloc_time_ns is
 * @alloc_time_ns (falling back to the start time when that is 0) if the
 * queue tracks allocation time, and 0 otherwise.
 */
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
        rq->start_time_ns = blk_mq_need_time_stamp(rq) ? blk_time_get_ns() : 0;

#ifdef CONFIG_BLK_RQ_ALLOC_TIME
        if (!blk_queue_rq_alloc_time(rq->q))
                rq->alloc_time_ns = 0;
        else if (alloc_time_ns)
                rq->alloc_time_ns = alloc_time_ns;
        else
                rq->alloc_time_ns = rq->start_time_ns;
#endif
}

/*
 * Initialize the preallocated request tags->static_rqs[tag] for the
 * allocation described by @data and return it.
 *
 * Besides filling in the request, this ORs RQF_PM and/or RQF_IO_STAT into
 * data->rq_flags, so the caller's @data is modified as well.
 */
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                struct blk_mq_tags *tags, unsigned int tag)
{
        struct blk_mq_ctx *ctx = data->ctx;
        struct blk_mq_hw_ctx *hctx = data->hctx;
        struct request_queue *q = data->q;
        struct request *rq = tags->static_rqs[tag];

        rq->q = q;
        rq->mq_ctx = ctx;
        rq->mq_hctx = hctx;
        rq->cmd_flags = data->cmd_flags;

        /* derive the remaining request flags before copying them over */
        if (data->flags & BLK_MQ_REQ_PM)
                data->rq_flags |= RQF_PM;
        if (blk_queue_io_stat(q))
                data->rq_flags |= RQF_IO_STAT;
        rq->rq_flags = data->rq_flags;

        /*
         * @tag is a scheduler tag when RQF_SCHED_TAGS is set and a driver
         * tag otherwise; the other tag field is marked as unset.
         */
        if (data->rq_flags & RQF_SCHED_TAGS) {
                rq->tag = BLK_MQ_NO_TAG;
                rq->internal_tag = tag;
        } else {
                rq->tag = tag;
                rq->internal_tag = BLK_MQ_NO_TAG;
        }
        rq->timeout = 0;

        rq->part = NULL;
        rq->io_start_time_ns = 0;
        rq->stats_sectors = 0;
        rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
        rq->nr_integrity_segments = 0;
#endif
        rq->end_io = NULL;
        rq->end_io_data = NULL;

        blk_crypto_rq_set_defaults(rq);
        INIT_LIST_HEAD(&rq->queuelist);
        /* tag was already set */
        WRITE_ONCE(rq->deadline, 0);
        req_ref_set(rq, 1);

        /*
         * Requests handled by the I/O scheduler get their hash and rb-tree
         * nodes reset, and the elevator's prepare_request hook is called
         * if it provides one.
         */
        if (rq->rq_flags & RQF_USE_SCHED) {
                struct elevator_queue *e = data->q->elevator;

                INIT_HLIST_NODE(&rq->hash);
                RB_CLEAR_NODE(&rq->rb_node);

                if (e->type->ops.prepare_request)
                        e->type->ops.prepare_request(rq);
        }

        return rq;
}

/*
 * Try to obtain up to data->nr_tags tags with a single blk_mq_get_tags()
 * call.  Every tag obtained has its request initialized and added to
 * data->cached_rq; one request is then popped off that list and returned.
 *
 * data->nr_tags is reduced by the number of requests set up.  Returns NULL
 * when no tag at all could be obtained.
 */
static inline struct request *
__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
{
        unsigned int tag, tag_offset;
        struct blk_mq_tags *tags;
        struct request *rq;
        unsigned long tag_mask;
        int i, nr = 0;

        /* bit i of tag_mask set means tag (tag_offset + i) was obtained */
        tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
        if (unlikely(!tag_mask))
                return NULL;

        tags = blk_mq_tags_from_data(data);
        /* walk the mask, clearing bits as they are consumed, until empty */
        for (i = 0; tag_mask; i++) {
                if (!(tag_mask & (1UL << i)))
                        continue;
                tag = tag_offset + i;
                prefetch(tags->static_rqs[tag]);
                tag_mask &= ~(1UL << i);
                rq = blk_mq_rq_ctx_init(data, tags, tag);
                rq_list_add(data->cached_rq, rq);
                nr++;
        }
        /* active-request accounting is skipped when scheduler tags are in use */
        if (!(data->rq_flags & RQF_SCHED_TAGS))
                blk_mq_add_active_requests(data->hctx, nr);
        /* caller already holds a reference, add for remainder */
        percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
        data->nr_tags -= nr;

        return rq_list_pop(data->cached_rq);
}

/*
 * Allocate and initialize one request for @data->q, or a batch of them when
 * @data->nr_tags is greater than one (the batch path falls back to a single
 * tag if it obtains nothing).
 *
 * Sets up the scheduler-related rq_flags, selects a software queue through
 * blk_mq_get_ctx() and the hardware queue it maps to, and obtains a tag.
 * When no tag is available, BLK_MQ_REQ_NOWAIT allocations return NULL;
 * all others sleep briefly and retry from the queue-selection step.
 */
static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
{
        struct request_queue *q = data->q;
        u64 alloc_time_ns = 0;
        struct request *rq;
        unsigned int tag;

        /* alloc_time includes depth and tag waits */
        if (blk_queue_rq_alloc_time(q))
                alloc_time_ns = blk_time_get_ns();

        /* a REQ_NOWAIT operation makes the allocation itself non-blocking */
        if (data->cmd_flags & REQ_NOWAIT)
                data->flags |= BLK_MQ_REQ_NOWAIT;

        if (q->elevator) {
                /*
                 * All requests use scheduler tags when an I/O scheduler is
                 * enabled for the queue.
                 */
                data->rq_flags |= RQF_SCHED_TAGS;

                /*
                 * Flush/passthrough requests are special and go directly to the
                 * dispatch list.
                 */
                if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
                    !blk_op_is_passthrough(data->cmd_flags)) {
                        struct elevator_mq_ops *ops = &q->elevator->type->ops;

                        WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);

                        data->rq_flags |= RQF_USE_SCHED;
                        if (ops->limit_depth)
                                ops->limit_depth(data->cmd_flags, data);
                }
        }

retry:
        data->ctx = blk_mq_get_ctx(q);
        data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
        if (!(data->rq_flags & RQF_SCHED_TAGS))
                blk_mq_tag_busy(data->hctx);

        if (data->flags & BLK_MQ_REQ_RESERVED)
                data->rq_flags |= RQF_RESV;

        /*
         * Try batched alloc if we want more than 1 tag.
         */
        if (data->nr_tags > 1) {
                rq = __blk_mq_alloc_requests_batch(data);
                if (rq) {
                        blk_mq_rq_time_init(rq, alloc_time_ns);
                        return rq;
                }
                /* the batch attempt got nothing: fall back to a single tag */
                data->nr_tags = 1;
        }

        /*
         * Waiting allocations only fail because of an inactive hctx.  In that
         * case just retry the hctx assignment and tag allocation as CPU hotplug
         * should have migrated us to an online CPU by now.
         */
        tag = blk_mq_get_tag(data);
        if (tag == BLK_MQ_NO_TAG) {
                if (data->flags & BLK_MQ_REQ_NOWAIT)
                        return NULL;
                /*
                 * Give up the CPU and sleep for a short time to ensure
                 * that threads using a realtime scheduling class are
                 * migrated off the CPU, and thus off the hctx that is
                 * going away.
                 */
                msleep(3);
                goto retry;
        }

        /* active-request accounting is skipped when scheduler tags are in use */
        if (!(data->rq_flags & RQF_SCHED_TAGS))
                blk_mq_inc_active_requests(data->hctx);
        rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
        blk_mq_rq_time_init(rq, alloc_time_ns);
        return rq;
}

/*
 * Refill @plug's request cache: enter @q and allocate plug->nr_ios requests
 * in one go, with the extras landing on plug->cached_rq.  plug->nr_ios is
 * reset to 1 once the queue has been entered.
 *
 * Returns one allocated request, or NULL if the queue could not be entered
 * or nothing could be allocated (the queue reference is dropped again in
 * the latter case).
 */
static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
                                            struct blk_plug *plug,
                                            blk_opf_t opf,
                                            blk_mq_req_flags_t flags)
{
        struct blk_mq_alloc_data alloc = {
                .q = q,
                .flags = flags,
                .cmd_flags = opf,
                .nr_tags = plug->nr_ios,
                .cached_rq = &plug->cached_rq,
        };
        struct request *first;

        if (blk_queue_enter(q, flags))
                return NULL;

        /* the batch size was already captured in alloc.nr_tags */
        plug->nr_ios = 1;

        first = __blk_mq_alloc_requests(&alloc);
        if (unlikely(!first))
                blk_queue_exit(q);

        return first;
}

/*
 * Try to satisfy an allocation from the current task's plug.
 *
 * With a non-empty cache, the request at its head is taken if it belongs
 * to @q, sits on a hardware queue of the type @opf maps to and agrees with
 * @opf on being a flush; otherwise NULL is returned and the cache is left
 * alone.  With an empty cache, a refill is attempted unless plug->nr_ios
 * is 1.
 *
 * Returns the request with cmd_flags set to @opf, or NULL when there is no
 * plug or no suitable request.
 */
static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
                                                   blk_opf_t opf,
                                                   blk_mq_req_flags_t flags)
{
        struct blk_plug *plug = current->plug;
        struct request *rq;

        if (!plug)
                return NULL;

        if (!rq_list_empty(plug->cached_rq)) {
                rq = rq_list_peek(&plug->cached_rq);
                if (!rq || rq->q != q ||
                    blk_mq_get_hctx_type(opf) != rq->mq_hctx->type ||
                    op_is_flush(rq->cmd_flags) != op_is_flush(opf))
                        return NULL;

                /* unlink it from the cache and stamp it for use */
                plug->cached_rq = rq_list_next(rq);
                blk_mq_rq_time_init(rq, 0);
        } else {
                if (plug->nr_ios == 1)
                        return NULL;

                rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
                if (!rq)
                        return NULL;
        }

        rq->cmd_flags = opf;
        INIT_LIST_HEAD(&rq->queuelist);
        return rq;
}

/*
 * Allocate a request for @q with operation and flags @opf.
 *
 * A request cached on the current task's plug is used when one fits;
 * otherwise the queue is entered and a fresh request is allocated.  The
 * request is returned with no data attached: length 0, sector -1 and no
 * bios.
 *
 * Returns the request, ERR_PTR() of the blk_queue_enter() error, or
 * ERR_PTR(-EWOULDBLOCK) when no request could be allocated.
 */
struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
                blk_mq_req_flags_t flags)
{
        struct request *rq = blk_mq_alloc_cached_request(q, opf, flags);

        if (!rq) {
                struct blk_mq_alloc_data alloc = {
                        .q = q,
                        .flags = flags,
                        .cmd_flags = opf,
                        .nr_tags = 1,
                };
                int err = blk_queue_enter(q, flags);

                if (err)
                        return ERR_PTR(err);

                rq = __blk_mq_alloc_requests(&alloc);
                if (!rq) {
                        blk_queue_exit(q);
                        return ERR_PTR(-EWOULDBLOCK);
                }
        }

        rq->bio = rq->biotail = NULL;
        rq->__sector = (sector_t) -1;
        rq->__data_len = 0;
        return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_request);

/*
 * Allocate a request on the hardware queue with index @hctx_idx of @q.
 *
 * @flags must contain both BLK_MQ_REQ_NOWAIT and BLK_MQ_REQ_RESERVED.  The
 * request is returned with no data attached: length 0, sector -1 and no
 * bios.
 *
 * Returns the request or an ERR_PTR():
 *   -EINVAL       a required flag is missing (also warns once)
 *   -EIO          @hctx_idx is not below q->nr_hw_queues
 *   (other)       the error returned by blk_queue_enter()
 *   -EXDEV        the hardware queue is not mapped, or none of the CPUs in
 *                 its cpumask is online
 *   -EWOULDBLOCK  no tag was available
 */
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
        blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
        struct blk_mq_alloc_data data = {
                .q                = q,
                .flags                = flags,
                .cmd_flags        = opf,
                .nr_tags        = 1,
        };
        u64 alloc_time_ns = 0;
        struct request *rq;
        unsigned int cpu;
        unsigned int tag;
        int ret;

        /* alloc_time includes depth and tag waits */
        if (blk_queue_rq_alloc_time(q))
                alloc_time_ns = blk_time_get_ns();

        /*
         * If the tag allocator sleeps we could get an allocation for a
         * different hardware context.  No need to complicate the low level
         * allocator for this for the rare use case of a command tied to
         * a specific queue.
         */
        if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
            WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
                return ERR_PTR(-EINVAL);

        if (hctx_idx >= q->nr_hw_queues)
                return ERR_PTR(-EIO);

        ret = blk_queue_enter(q, flags);
        if (ret)
                return ERR_PTR(ret);

        /*
         * Check if the hardware context is actually mapped to anything.
         * If not tell the caller that it should skip this queue.
         */
        ret = -EXDEV;
        data.hctx = xa_load(&q->hctx_table, hctx_idx);
        if (!blk_mq_hw_queue_mapped(data.hctx))
                goto out_queue_exit;
        /* use the software queue of the first online CPU in the hctx's mask */
        cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
        if (cpu >= nr_cpu_ids)
                goto out_queue_exit;
        data.ctx = __blk_mq_get_ctx(q, cpu);

        if (q->elevator)
                data.rq_flags |= RQF_SCHED_TAGS;
        else
                blk_mq_tag_busy(data.hctx);

        if (flags & BLK_MQ_REQ_RESERVED)
                data.rq_flags |= RQF_RESV;

        ret = -EWOULDBLOCK;
        tag = blk_mq_get_tag(&data);
        if (tag == BLK_MQ_NO_TAG)
                goto out_queue_exit;
        /* active-request accounting is skipped when scheduler tags are in use */
        if (!(data.rq_flags & RQF_SCHED_TAGS))
                blk_mq_inc_active_requests(data.hctx);
        rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
        blk_mq_rq_time_init(rq, alloc_time_ns);
        rq->__data_len = 0;
        rq->__sector = (sector_t) -1;
        rq->bio = rq->biotail = NULL;
        return rq;

out_queue_exit:
        /* drop the reference taken by blk_queue_enter() above */
        blk_queue_exit(q);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

/*
 * Completion-side bookkeeping for @rq: run the zone finish hook and, for a
 * request flagged RQF_USE_SCHED, the elevator's finish_request hook.
 */
static void blk_mq_finish_request(struct request *rq)
{
        struct request_queue *q = rq->q;

        blk_zone_finish_request(rq);

        if (!(rq->rq_flags & RQF_USE_SCHED))
                return;

        q->elevator->type->ops.finish_request(rq);
        /*
         * For postflush request that may need to be
         * completed twice, we should clear this flag
         * to avoid double finish_request() on the rq.
         */
        rq->rq_flags &= ~RQF_USE_SCHED;
}

/*
 * Release the resources held by @rq: crypto state, the driver tag and/or
 * scheduler tag, and the queue reference.  blk_mq_free_request() calls this
 * once req_ref_put_and_test() reports true.
 */
static void __blk_mq_free_request(struct request *rq)
{
        /*
         * Cache what is needed below: rq->mq_hctx is cleared before the
         * tags are put back.
         */
        struct request_queue *q = rq->q;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        const int sched_tag = rq->internal_tag;

        blk_crypto_free_request(rq);
        blk_pm_mark_last_busy(rq);
        rq->mq_hctx = NULL;

        /* a driver tag also counted as an active request on the hctx */
        if (rq->tag != BLK_MQ_NO_TAG) {
                blk_mq_dec_active_requests(hctx);
                blk_mq_put_tag(hctx->tags, ctx, rq->tag);
        }
        if (sched_tag != BLK_MQ_NO_TAG)
                blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
        blk_mq_sched_restart(hctx);
        blk_queue_exit(q);
}

/*
 * Give up the caller's reference to @rq.
 *
 * Runs the finish hooks, the laptop-mode completion notification (for
 * non-passthrough requests while laptop_mode is set) and the rq-qos "done"
 * hook, then marks the request MQ_RQ_IDLE.  The request's resources are
 * released only if this was the last reference.
 */
void blk_mq_free_request(struct request *rq)
{
        struct request_queue *q = rq->q;

        blk_mq_finish_request(rq);

        if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
                laptop_io_completion(q->disk->bdi);

        rq_qos_done(q, rq);

        WRITE_ONCE(rq->state, MQ_RQ_IDLE);
        if (!req_ref_put_and_test(rq))
                return;

        __blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

/*
 * Empty @plug's request cache, freeing each cached request with
 * blk_mq_free_request().
 */
void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
        for (;;) {
                struct request *rq = rq_list_pop(&plug->cached_rq);

                if (rq == NULL)
                        break;
                blk_mq_free_request(rq);
        }
}

void blk_dump_rq_flags(struct request *rq, char *msg)
{
        printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
                rq->q->disk ? rq->q->disk->disk_name : "?",
                (__force unsigned long long) rq->cmd_flags);

        printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
               (unsigned long long)blk_rq_pos(rq),
               blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
        printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
               rq->bio, rq->biotail, blk_rq_bytes(rq));
}
EXPORT_SYMBOL(blk_dump_rq_flags);

/*
 * Add @bytes (converted to 512-byte sectors) to the per-partition sector
 * statistics of @req, when the request has a partition and I/O statistics
 * are enabled for it.
 */
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
        int stat_group;

        if (!req->part || !blk_do_io_stat(req))
                return;

        stat_group = op_stat_group(req_op(req));

        part_stat_lock();
        part_stat_add(req->part, sectors[stat_group], bytes >> 9);
        part_stat_unlock();
}

/*
 * Emit a rate-limited KERN_ERR line describing a failed request: status,
 * device, sector, operation, remaining command flags, segment count and
 * I/O priority class.
 */
static void blk_print_req_error(struct request *req, blk_status_t status)
{
        const char *disk_name = "?";

        if (req->q->disk)
                disk_name = req->q->disk->disk_name;

        printk_ratelimited(KERN_ERR
                "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
                "phys_seg %u prio class %u\n",
                blk_status_to_str(status),
                disk_name,
                blk_rq_pos(req),
                (__force u32)req_op(req),
                blk_op_str(req_op(req)),
                (__force u32)(req->cmd_flags & ~REQ_OP_MASK),
                req->nr_phys_segments,
                IOPRIO_PRIO_CLASS(req->ioprio));
}

/*
 * Fully end IO on a request. Does not support partial completions, or
 * errors.
 *
 * Every bio attached to @req is ended (unless the request is part of a flush
 * sequence) and the whole request length is accounted as completed.
 */
static void blk_complete_request(struct request *req)
{
        const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0;
        int total_bytes = blk_rq_bytes(req);
        struct bio *bio = req->bio;

        trace_block_rq_complete(req, BLK_STS_OK, total_bytes);

        /* Nothing attached: only the trace event above is emitted. */
        if (!bio)
                return;

#ifdef CONFIG_BLK_DEV_INTEGRITY
        if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
                req->q->integrity.profile->complete_fn(req, total_bytes);
#endif

        /*
         * Upper layers may call blk_crypto_evict_key() anytime after the last
         * bio_endio().  Therefore, the keyslot must be released before that.
         */
        blk_crypto_rq_put_keyslot(req);

        blk_account_io_completion(req, total_bytes);

        do {
                /* Fetch the successor before this bio is handed to bio_endio(). */
                struct bio *next = bio->bi_next;

                /* Completion has already been traced */
                bio_clear_flag(bio, BIO_TRACE_COMPLETION);

                blk_zone_update_request_bio(req, bio);

                /* Bios of a flush sequence are not ended here. */
                if (!is_flush)
                        bio_endio(bio);
                bio = next;
        } while (bio);

        /*
         * Reset counters so that the request stacking driver
         * can find how many bytes remain in the request
         * later.
         */
        if (!req->end_io) {
                req->bio = NULL;
                req->__data_len = 0;
        }
}

/**
 * blk_update_request - Complete multiple bytes without completing the request
 * @req:      the request being processed
 * @error:    block status code
 * @nr_bytes: number of bytes to complete for @req
 *
 * Description:
 *     Ends I/O on a number of bytes attached to @req, but doesn't complete
 *     the request structure even if @req doesn't have leftover.
 *     If @req has leftover, sets it up for the next range of segments.
 *
 *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
 *     %false return from this function.
 *
 * Note:
 *     The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
 *     except in the consistency check at the end of this function.
 *
 * Return:
 *     %false - this request doesn't have any more data
 *     %true  - this request has more data
 **/
bool blk_update_request(struct request *req, blk_status_t error,
                unsigned int nr_bytes)
{
        bool is_flush = req->rq_flags & RQF_FLUSH_SEQ;
        bool quiet = req->rq_flags & RQF_QUIET;
        int total_bytes;

        trace_block_rq_complete(req, error, nr_bytes);

        /* No bios attached: there is no data left to complete. */
        if (!req->bio)
                return false;

#ifdef CONFIG_BLK_DEV_INTEGRITY
        if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
            error == BLK_STS_OK)
                req->q->integrity.profile->complete_fn(req, nr_bytes);
#endif

        /*
         * Upper layers may call blk_crypto_evict_key() anytime after the last
         * bio_endio().  Therefore, the keyslot must be released before that.
         */
        if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
                __blk_crypto_rq_put_keyslot(req);

        /* Report the error unless it is passthrough, quiet, or the disk is dead. */
        if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) &&
            !test_bit(GD_DEAD, &req->q->disk->state)) {
                blk_print_req_error(req, error);
                trace_block_rq_error(req, error, nr_bytes);
        }

        blk_account_io_completion(req, nr_bytes);

        /* Walk the bio chain, consuming at most @nr_bytes in total. */
        total_bytes = 0;
        while (req->bio) {
                struct bio *bio = req->bio;
                unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);

                if (unlikely(error))
                        bio->bi_status = error;

                if (bio_bytes == bio->bi_iter.bi_size) {
                        /* The whole bio is covered: unlink it from the request. */
                        req->bio = bio->bi_next;
                } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) {
                        /*
                         * Partial zone append completions cannot be supported
                         * as the BIO fragments may end up not being written
                         * sequentially.
                         */
                        bio->bi_status = BLK_STS_IOERR;
                }

                /* Completion has already been traced */
                bio_clear_flag(bio, BIO_TRACE_COMPLETION);
                if (unlikely(quiet))
                        bio_set_flag(bio, BIO_QUIET);

                bio_advance(bio, bio_bytes);

                /* Don't actually finish bio if it's part of flush sequence */
                if (!bio->bi_iter.bi_size) {
                        blk_zone_update_request_bio(req, bio);
                        if (!is_flush)
                                bio_endio(bio);
                }

                total_bytes += bio_bytes;
                nr_bytes -= bio_bytes;

                if (!nr_bytes)
                        break;
        }

        /*
         * completely done
         */
        if (!req->bio) {
                /*
                 * Reset counters so that the request stacking driver
                 * can find how many bytes remain in the request
                 * later.
                 */
                req->__data_len = 0;
                return false;
        }

        /* Partial completion: shrink the request by what was consumed. */
        req->__data_len -= total_bytes;

        /* update sector only for requests with clear definition of sector */
        if (!blk_rq_is_passthrough(req))
                req->__sector += total_bytes >> 9;

        /* mixed attributes always follow the first bio */
        if (req->rq_flags & RQF_MIXED_MERGE) {
                req->cmd_flags &= ~REQ_FAILFAST_MASK;
                req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
        }

        if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
                /*
                 * If total number of sectors is less than the first segment
                 * size, something has gone terribly wrong.
                 */
                if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
                        blk_dump_rq_flags(req, "request botched");
                        req->__data_len = blk_rq_cur_bytes(req);
                }

                /* recalculate the number of segments */
                req->nr_phys_segments = blk_recalc_rq_segments(req);
        }

        return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);

/*
 * Account the completion of @req in the partition statistics: I/O ticks,
 * I/O count, service time (@now minus the request start time) and the
 * in-flight counter.
 */
static inline void blk_account_io_done(struct request *req, u64 now)
{
        int stat_group;

        trace_block_io_done(req);

        /*
         * Account IO completion.  flush_rq isn't accounted as a
         * normal IO on queueing nor completion.  Accounting the
         * containing request is enough.
         */
        if (!blk_do_io_stat(req) || !req->part ||
            (req->rq_flags & RQF_FLUSH_SEQ))
                return;

        stat_group = op_stat_group(req_op(req));

        part_stat_lock();
        update_io_ticks(req->part, jiffies, true);
        part_stat_inc(req->part, ios[stat_group]);
        part_stat_add(req->part, nsecs[stat_group], now - req->start_time_ns);
        part_stat_local_dec(req->part,
                            in_flight[op_is_write(req_op(req))]);
        part_stat_unlock();
}

/*
 * Start I/O accounting for @req: pick the partition to charge, update its
 * I/O ticks and bump its in-flight counter.  Does nothing beyond tracing
 * when I/O statistics are disabled for the request.
 */
static inline void blk_account_io_start(struct request *req)
{
        trace_block_io_start(req);

        if (!blk_do_io_stat(req))
                return;

        /*
         * All non-passthrough requests are created from a bio with one
         * exception: when a flush command that is part of a flush sequence
         * generated by the state machine in blk-flush.c is cloned onto the
         * lower device by dm-multipath we can get here without a bio.
         */
        req->part = req->bio ? req->bio->bi_bdev : req->q->disk->part0;

        part_stat_lock();
        update_io_ticks(req->part, jiffies, false);
        part_stat_local_inc(req->part,
                            in_flight[op_is_write(req_op(req))]);
        part_stat_unlock();
}

/*
 * Run the timestamp-dependent completion accounting for @rq: block stats
 * (only when RQF_STATS is set), the scheduler completion hook and the
 * partition I/O accounting, all using the same @now.
 */
static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
{
        const bool collect_stats = rq->rq_flags & RQF_STATS;

        if (collect_stats)
                blk_stat_add(rq, now);

        blk_mq_sched_completed_request(rq, now);
        blk_account_io_done(rq, now);
}

/*
 * Finish @rq after its data has been completed.  Without an ->end_io handler
 * the request is simply freed; with one, the handler is called with @error
 * and the request is freed only if it returns RQ_END_IO_FREE.
 */
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
        if (blk_mq_need_time_stamp(rq))
                __blk_mq_end_request_acct(rq, blk_time_get_ns());

        blk_mq_finish_request(rq);

        if (!rq->end_io) {
                blk_mq_free_request(rq);
                return;
        }

        rq_qos_done(rq->q, rq);
        if (rq->end_io(rq, error) == RQ_END_IO_FREE)
                blk_mq_free_request(rq);
}
EXPORT_SYMBOL(__blk_mq_end_request);

/*
 * Complete all remaining bytes of @rq with status @error and end the
 * request.  It is a bug if blk_update_request() still reports leftover data
 * after being handed the full request size.
 */
void blk_mq_end_request(struct request *rq, blk_status_t error)
{
        const bool has_leftover = blk_update_request(rq, error,
                                                     blk_rq_bytes(rq));

        if (has_leftover)
                BUG();
        __blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);

/* Capacity of the on-stack tag array used by blk_mq_end_request_batch(). */
#define TAG_COMP_BATCH                32

/*
 * Release @nr_tags driver tags from @tag_array that all belong to @hctx:
 * lower the active-request count, return the tags, and drop one queue usage
 * reference per tag.
 */
static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
                                          int *tag_array, int nr_tags)
{
        struct request_queue *queue = hctx->queue;

        blk_mq_sub_active_requests(hctx, nr_tags);
        blk_mq_put_tags(hctx->tags, tag_array, nr_tags);

        percpu_ref_put_many(&queue->q_usage_counter, nr_tags);
}

/*
 * Complete every request queued on @iob->req_list.  Driver tags of requests
 * that end up being freed are gathered in a local array and handed to
 * blk_mq_flush_tag_batch() per hardware context, at most TAG_COMP_BATCH at
 * a time.
 */
void blk_mq_end_request_batch(struct io_comp_batch *iob)
{
        int tags[TAG_COMP_BATCH], nr_tags = 0;
        struct blk_mq_hw_ctx *cur_hctx = NULL;
        struct request *rq;
        u64 now = 0;

        /* One timestamp is shared by all requests of the batch. */
        if (iob->need_ts)
                now = blk_time_get_ns();

        while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
                prefetch(rq->bio);
                prefetch(rq->rq_next);

                blk_complete_request(rq);
                if (iob->need_ts)
                        __blk_mq_end_request_acct(rq, now);

                blk_mq_finish_request(rq);

                rq_qos_done(rq->q, rq);

                /*
                 * If end_io handler returns NONE, then it still has
                 * ownership of the request.
                 */
                if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
                        continue;

                /* Only the holder of the last reference releases the tag. */
                WRITE_ONCE(rq->state, MQ_RQ_IDLE);
                if (!req_ref_put_and_test(rq))
                        continue;

                blk_crypto_free_request(rq);
                blk_pm_mark_last_busy(rq);

                /* Flush when the array is full or the hardware context changes. */
                if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
                        if (cur_hctx)
                                blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
                        nr_tags = 0;
                        cur_hctx = rq->mq_hctx;
                }
                tags[nr_tags++] = rq->tag;
        }

        /* Release whatever is left from the final, partially filled batch. */
        if (nr_tags)
                blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
}
EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);

static void blk_complete_reqs(struct llist_head *list)
{
        struct llist_node *entry = llist_reverse_order(llist_del_all(list));
        struct request *rq, *next;

        llist_for_each_entry_safe(rq, next, entry, ipi_list)
                rq->q->mq_ops->complete(rq);
}

/* Softirq handler: complete the requests queued on this CPU's done list. */
static __latent_entropy void blk_done_softirq(struct softirq_action *h)
{
        struct llist_head *local_done = this_cpu_ptr(&blk_cpu_done);

        blk_complete_reqs(local_done);
}

static int blk_softirq_cpu_dead(unsigned int cpu)
{
        blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
        return 0;
}

/*
 * Raise BLOCK_SOFTIRQ on the CPU executing this function.  @data is unused.
 * NOTE(review): presumably the callback behind blk_cpu_csd, which
 * blk_mq_complete_send_ipi() fires at the target CPU - confirm at the csd
 * initialisation site.
 */
static void __blk_mq_complete_request_remote(void *data)
{
        __raise_softirq_irqoff(BLOCK_SOFTIRQ);
}

/*
 * Decide whether completing @rq should be redirected with an IPI to the CPU
 * recorded in its software context.  Returns true only when such a redirect
 * is wanted and that CPU is online.
 */
static inline bool blk_mq_complete_need_ipi(struct request *rq)
{
        const int this_cpu = raw_smp_processor_id();
        unsigned int submit_cpu;

        if (!IS_ENABLED(CONFIG_SMP))
                return false;
        if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
                return false;

        /*
         * With force threaded interrupts enabled, raising softirq from an SMP
         * function call will always result in waking the ksoftirqd thread.
         * This is probably worse than completing the request on a different
         * cache domain.
         */
        if (force_irqthreads())
                return false;

        submit_cpu = rq->mq_ctx->cpu;

        /* same CPU?  Complete locally */
        if (this_cpu == submit_cpu)
                return false;

        /* same cache domain and capacity, and not forced?  Complete locally */
        if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
            cpus_share_cache(this_cpu, submit_cpu) &&
            cpus_equal_capacity(this_cpu, submit_cpu))
                return false;

        /* don't try to IPI to an offline CPU */
        return cpu_online(submit_cpu);
}

/*
 * Queue @rq on the done list of the CPU recorded in its software context.
 * The async function call is sent only when llist_add() returns true.
 */
static void blk_mq_complete_send_ipi(struct request *rq)
{
        /* Read the target CPU before @rq becomes visible on the list. */
        const unsigned int target = rq->mq_ctx->cpu;

        if (!llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, target)))
                return;

        smp_call_function_single_async(target, &per_cpu(blk_cpu_csd, target));
}

/*
 * Queue @rq on the current CPU's done list and raise BLOCK_SOFTIRQ when
 * llist_add() returns true.  Preemption is disabled around the per-CPU
 * access.
 */
static void blk_mq_raise_softirq(struct request *rq)
{
        preempt_disable();
        if (llist_add(&rq->ipi_list, this_cpu_ptr(&blk_cpu_done)))
                raise_softirq(BLOCK_SOFTIRQ);
        preempt_enable();
}

/*
 * Mark @rq complete and try to defer the actual completion work.
 *
 * Returns true when the request was queued for completion via IPI or
 * softirq, false when the caller has to complete it itself.
 */
bool blk_mq_complete_request_remote(struct request *rq)
{
        WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);

        /*
         * For request which hctx has only one ctx mapping,
         * or a polled request, always complete locally,
         * it's pointless to redirect the completion.
         */
        if (rq->mq_hctx->nr_ctx == 1 &&
            rq->mq_ctx->cpu == raw_smp_processor_id())
                return false;
        if (rq->cmd_flags & REQ_POLLED)
                return false;

        if (blk_mq_complete_need_ipi(rq)) {
                blk_mq_complete_send_ipi(rq);
                return true;
        }

        /* The softirq path is only taken for single hardware queue setups. */
        if (rq->q->nr_hw_queues != 1)
                return false;

        blk_mq_raise_softirq(rq);
        return true;
}
EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:                the request being processed
 *
 * Description:
 *        Complete a request: either blk_mq_complete_request_remote() queues
 *        it for deferred completion, or the queue's ->complete() operation
 *        is invoked directly from here.
 **/
void blk_mq_complete_request(struct request *rq)
{
        if (blk_mq_complete_request_remote(rq))
                return;

        rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);

/**
 * blk_mq_start_request - Start processing a request
 * @rq: Pointer to request to be started
 *
 * Function used by device drivers to notify the block layer that a request
 * is going to be processed now, so blk layer can do proper initializations
 * such as starting the timeout timer.
 */
void blk_mq_start_request(struct request *rq)
{
        struct request_queue *q = rq->q;

        trace_block_rq_issue(rq);

        /* Record issue time and size when queue statistics are enabled. */
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
            !blk_rq_is_passthrough(rq)) {
                rq->io_start_time_ns = blk_time_get_ns();
                rq->stats_sectors = blk_rq_sectors(rq);
                rq->rq_flags |= RQF_STATS;
                rq_qos_issue(q, rq);
        }

        /* A request must be idle when it is started. */
        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);

        /* The timer is armed before the request is marked in flight. */
        blk_add_timer(rq);
        WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
        /* Make the request reachable through its driver tag. */
        rq->mq_hctx->tags->rqs[rq->tag] = rq;

#ifdef CONFIG_BLK_DEV_INTEGRITY
        if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
                q->integrity.profile->prepare_fn(rq);
#endif
        /* For polled bios the cookie is the hardware queue number. */
        if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
                WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num);
}
EXPORT_SYMBOL(blk_mq_start_request);

/*
 * Upper bound on the number of requests held in a plug.  Plugs spanning
 * multiple queues get 2x BLK_MAX_REQUEST_COUNT, which is important for md
 * arrays to benefit from merging requests.
 */
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
        return plug->multiple_queues ? BLK_MAX_REQUEST_COUNT * 2 :
                                       BLK_MAX_REQUEST_COUNT;
}

/*
 * Add @rq to @plug's mq_list.  The plug is flushed first when it already
 * holds blk_plug_max_rq_count() requests, or when merging is allowed on the
 * queue and the request returned by rq_list_peek() has reached
 * BLK_PLUG_FLUSH_SIZE bytes.
 */
static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
{
        struct request *last = rq_list_peek(&plug->mq_list);

        if (!plug->rq_count) {
                trace_block_plug(rq->q);
        } else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
                   (!blk_queue_nomerges(rq->q) &&
                    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
                blk_mq_flush_plug_list(plug, false);
                /* The peeked request was flushed; do not compare against it. */
                last = NULL;
                trace_block_plug(rq->q);
        }

        /* Note when the plug starts holding requests for different queues. */
        if (!plug->multiple_queues && last && last->q != rq->q)
                plug->multiple_queues = true;
        /*
         * Any request allocated from sched tags can't be issued to
         * ->queue_rqs() directly
         */
        if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
                plug->has_elevator = true;
        rq->rq_next = NULL;
        rq_list_add(&plug->mq_list, rq);
        plug->rq_count++;
}

/**
 * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
 * @rq:                request to insert
 * @at_head:    insert request at head or tail of queue
 *
 * Description:
 *    Insert a fully prepared passthrough request for execution without
 *    waiting for completion.  When the current task has a plug and
 *    @at_head is false, the request is only added to that plug; otherwise
 *    it is inserted right away and the hardware queue is run.
 *
 * Note:
 *    NOTE(review): this comment used to mention a @done callback being
 *    invoked when the queue is dead; the function has no such parameter.
 */
void blk_execute_rq_nowait(struct request *rq, bool at_head)
{
        /* Look up the hardware context before @rq is handed off below. */
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        struct blk_plug *plug = current->plug;

        WARN_ON(irqs_disabled());
        WARN_ON(!blk_rq_is_passthrough(rq));

        blk_account_io_start(rq);

        if (!at_head && plug) {
                blk_add_rq_to_plug(plug, rq);
                return;
        }

        blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
        blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
}
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);

/* Wait context that blk_execute_rq() attaches to a request as end_io_data. */
struct blk_rq_wait {
        struct completion done;         /* completed by blk_end_sync_rq() */
        blk_status_t ret;               /* status passed to blk_end_sync_rq() */
};

/*
 * ->end_io handler for synchronous execution: store @ret in the wait
 * context found in rq->end_io_data and signal its completion.  Returns
 * RQ_END_IO_NONE.
 */
static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
{
        struct blk_rq_wait *waiter = rq->end_io_data;

        waiter->ret = ret;
        complete(&waiter->done);

        return RQ_END_IO_NONE;
}

/*
 * Return true when @rq is bound to a hardware context whose type is
 * HCTX_TYPE_POLL, false otherwise (including when it has no hctx).
 */
bool blk_rq_is_poll(struct request *rq)
{
        return rq->mq_hctx && rq->mq_hctx->type == HCTX_TYPE_POLL;
}
EXPORT_SYMBOL_GPL(blk_rq_is_poll);

/*
 * Poll the hardware context of @rq until @wait is completed, calling
 * cond_resched() after every poll.  At least one poll is always issued.
 */
static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
{
        for (;;) {
                blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0);
                cond_resched();

                if (completion_done(wait))
                        break;
        }
}

/**
 * blk_execute_rq - insert a request into queue for execution
 * @rq:                request to insert
 * @at_head:    insert request at head or tail of queue
 *
 * Description:
 *    Insert a fully prepared passthrough request for execution and wait
 *    for its completion, polling the hardware context when the request
 *    sits on a poll queue.
 * Return: The blk_status_t result provided to blk_mq_end_request().
 */
blk_status_t blk_execute_rq(struct request *rq, bool at_head)
{
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        struct blk_rq_wait wait = {
                .done = COMPLETION_INITIALIZER_ONSTACK(wait.done),
        };

        WARN_ON(irqs_disabled());
        WARN_ON(!blk_rq_is_passthrough(rq));

        /* Completion is reported through the on-stack wait context. */
        rq->end_io = blk_end_sync_rq;
        rq->end_io_data = &wait;

        blk_account_io_start(rq);
        blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
        blk_mq_run_hw_queue(hctx, false);

        if (!blk_rq_is_poll(rq))
                blk_wait_io(&wait.done);
        else
                blk_rq_poll_completion(rq, &wait.done);

        return wait.ret;
}
EXPORT_SYMBOL(blk_execute_rq);

/*
 * Prepare @rq for being requeued: give back its driver tag, notify tracing
 * and rq-qos, and, if the request had been started, return it to the idle
 * state with RQF_TIMED_OUT cleared.
 */
static void __blk_mq_requeue_request(struct request *rq)
{
        blk_mq_put_driver_tag(rq);

        trace_block_rq_requeue(rq);
        rq_qos_requeue(rq->q, rq);

        if (!blk_mq_request_started(rq))
                return;

        WRITE_ONCE(rq->state, MQ_RQ_IDLE);
        rq->rq_flags &= ~RQF_TIMED_OUT;
}

/*
 * Put @rq on its queue's requeue list.  When @kick_requeue_list is true the
 * requeue work is kicked immediately.
 */
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
        struct request_queue *queue = rq->q;
        unsigned long irq_flags;

        __blk_mq_requeue_request(rq);

        /* this request will be re-inserted to io scheduler queue */
        blk_mq_sched_requeue_request(rq);

        spin_lock_irqsave(&queue->requeue_lock, irq_flags);
        list_add_tail(&rq->queuelist, &queue->requeue_list);
        spin_unlock_irqrestore(&queue->requeue_lock, irq_flags);

        if (!kick_requeue_list)
                return;

        blk_mq_kick_requeue_list(queue);
}
EXPORT_SYMBOL(blk_mq_requeue_request);

static void blk_mq_requeue_work(struct work_struct *work)
{
        struct request_queue *q =
                container_of(work, struct request_queue, requeue_work.work);
        LIST_HEAD(rq_list);
        LIST_HEAD(flush_list);
        struct request *rq;

        spin_lock_irq(&q->requeue_lock);
        list_splice_init(&q->requeue_list, &rq_list);
        list_splice_init(&q->flush_list, &flush_list);
        spin_unlock_irq(&q->requeue_lock);

        while (!list_empty(&rq_list)) {
                rq = list_entry(rq_list.next, struct request, queuelist);
                /*
                 * If RQF_DONTPREP ist set, the request has been started by the
                 * driver already and might have driver-specific data allocated
                 * already.  Insert it into the hctx dispatch list to avoid
                 * block layer merges for the request.
                 */
                if (rq->rq_flags & RQF_DONTPREP) {
                        list_del_init(&rq->queuelist);
                        blk_mq_request_bypass_insert(rq, 0);
                } else {
                        list_del_init(&rq->queuelist);
                        blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
                }
        }

        while (!list_empty(&flush_list)) {
                rq = list_entry(flush_list.next, struct request, queuelist);
                list_del_init(&rq->queuelist);
                blk_mq_insert_request(rq, 0);
        }

        blk_mq_run_hw_queues(q, false);
}

/* Schedule @q's requeue work on kblockd with no delay, on any CPU. */
void blk_mq_kick_requeue_list(struct request_queue *q)
{
        kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);

/* Schedule @q's requeue work on kblockd after @msecs milliseconds. */
void blk_mq_delay_kick_requeue_list(struct request_queue *q,
                                    unsigned long msecs)
{
        const unsigned long delay = msecs_to_jiffies(msecs);

        kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, delay);
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);

static bool blk_is_flush_data_rq(struct request *rq)
{
        return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq);
}

/*
 * Busy-tag iterator callback for blk_mq_queue_inflight().  @priv points to a
 * bool that is set once a request counting as in flight is seen.
 */
static bool blk_mq_rq_inflight(struct request *rq, void *priv)
{
        bool *busy = priv;

        /*
         * If we find a request that isn't idle we know the queue is busy
         * as it's checked in the iter.
         */
        if (!blk_mq_request_started(rq))
                return true;

        /*
         * In case of queue quiesce, if one flush data request is completed,
         * don't count it as inflight given the flush sequence is suspended,
         * and the original flush data request is invisible to driver, just
         * like other pending requests because of quiesce
         */
        if (blk_queue_quiesced(rq->q) && blk_is_flush_data_rq(rq) &&
            blk_mq_request_completed(rq))
                return true;

        /* Return false to stop the iteration. */
        *busy = true;
        return false;
}

/*
 * Return true when blk_mq_rq_inflight() flags at least one request of @q
 * during a busy-tag iteration.
 */
bool blk_mq_queue_inflight(struct request_queue *q)
{
        bool found_inflight = false;

        blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &found_inflight);

        return found_inflight;
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);

static void blk_mq_rq_timed_out(struct request *req)
{
        req->rq_flags |= RQF_TIMED_OUT;
        if (req->q->mq_ops->timeout) {
                enum blk_eh_timer_return ret;

                ret = req->q->mq_ops->timeout(req);
                if (ret == BLK_EH_DONE)
                        return;
                WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
        }

        blk_add_timer(req);
}

/* State shared between blk_mq_timeout_work() and its tag iterator callbacks. */
struct blk_expired_data {
        bool has_timedout_rq;           /* set by blk_mq_check_expired() */
        unsigned long next;             /* earliest unexpired deadline seen, 0 if none */
        unsigned long timeout_start;    /* jiffies sampled when the scan started */
};

/*
 * Return true when @rq is in flight, not yet flagged RQF_TIMED_OUT, and its
 * deadline is at or before @expired->timeout_start.  For in-flight requests
 * that have not expired, @expired->next is lowered to the earliest deadline
 * seen so far.
 */
static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired)
{
        unsigned long rq_deadline;

        if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT ||
            (rq->rq_flags & RQF_TIMED_OUT))
                return false;

        rq_deadline = READ_ONCE(rq->deadline);
        if (time_after_eq(expired->timeout_start, rq_deadline))
                return true;

        /* 0 means "no deadline recorded yet". */
        if (expired->next == 0 || time_after(expired->next, rq_deadline))
                expired->next = rq_deadline;

        return false;
}

/*
 * Drop a reference on @rq.  Flush requests go through their ->end_io handler
 * and are freed when it returns RQ_END_IO_FREE; any other request is freed
 * when its reference count reaches zero.
 */
void blk_mq_put_rq_ref(struct request *rq)
{
        if (!is_flush_rq(rq)) {
                if (req_ref_put_and_test(rq))
                        __blk_mq_free_request(rq);
                return;
        }

        if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
                blk_mq_free_request(rq);
}

/*
 * Busy-tag iterator callback: record in @priv (a struct blk_expired_data)
 * that an expired request exists and stop iterating at the first hit.
 */
static bool blk_mq_check_expired(struct request *rq, void *priv)
{
        struct blk_expired_data *state = priv;

        /*
         * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
         * be reallocated underneath the timeout handler's processing, then
         * the expire check is reliable. If the request is not expired, then
         * it was completed and reallocated as a new request after returning
         * from blk_mq_check_expired().
         */
        if (!blk_mq_req_expired(rq, state))
                return true;

        state->has_timedout_rq = true;
        return false;
}

/*
 * Busy-tag iterator callback: run timeout handling for @rq if it has
 * expired.  Always returns true so that the iteration continues.
 */
static bool blk_mq_handle_expired(struct request *rq, void *priv)
{
        struct blk_expired_data *state = priv;
        const bool timed_out = blk_mq_req_expired(rq, state);

        if (timed_out)
                blk_mq_rq_timed_out(rq);

        return true;
}

/*
 * Work handler for request timeouts: scan the queue's busy tags for expired
 * requests, run timeout handling on them, and re-arm q->timeout for the
 * earliest remaining deadline (or mark the hardware contexts idle when
 * there is none).
 */
static void blk_mq_timeout_work(struct work_struct *work)
{
        struct request_queue *q =
                container_of(work, struct request_queue, timeout_work);
        struct blk_expired_data expired = {
                .timeout_start = jiffies,
        };
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        /* A deadlock might occur if a request is stuck requiring a
         * timeout at the same time a queue freeze is waiting
         * completion, since the timeout code would not be able to
         * acquire the queue reference here.
         *
         * That's why we don't use blk_queue_enter here; instead, we use
         * percpu_ref_tryget directly, because we need to be able to
         * obtain a reference even in the short window between the queue
         * starting to freeze, by dropping the first reference in
         * blk_freeze_queue_start, and the moment the last request is
         * consumed, marked by the instant q_usage_counter reaches
         * zero.
         */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;

        /* check if there is any timed-out request */
        blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
        if (expired.has_timedout_rq) {
                /*
                 * Before walking tags, we must ensure any submit started
                 * before the current time has finished. Since the submit
                 * uses srcu or rcu, wait for a synchronization point to
                 * ensure all running submits have finished
                 */
                blk_mq_wait_quiesce_done(q->tag_set);

                /* The first pass stopped early; recompute the next deadline. */
                expired.next = 0;
                blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
        }

        if (expired.next != 0) {
                mod_timer(&q->timeout, expired.next);
        } else {
                /*
                 * Request timeouts are handled as a forward rolling timer. If
                 * we end up here it means that no requests are pending and
                 * also that no request has been pending for a while. Mark
                 * each hctx as idle.
                 */
                queue_for_each_hw_ctx(q, hctx, i) {
                        /* the hctx may be unmapped, so check it here */
                        if (blk_mq_hw_queue_mapped(hctx))
                                blk_mq_tag_idle(hctx);
                }
        }
        /* Pairs with the percpu_ref_tryget() above. */
        blk_queue_exit(q);
}

/* Arguments passed from blk_mq_flush_busy_ctxs() to flush_busy_ctx(). */
struct flush_busy_ctx_data {
        struct blk_mq_hw_ctx *hctx;     /* hardware context being flushed */
        struct list_head *list;         /* destination list for spliced requests */
};

/*
 * sbitmap iterator callback: move all requests queued on the software
 * context at index @bitnr (for this hctx type) to the tail of the
 * destination list and clear the context's bit.  Always returns true.
 */
static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
        struct flush_busy_ctx_data *args = data;
        struct blk_mq_hw_ctx *hctx = args->hctx;
        struct blk_mq_ctx *sw_ctx = hctx->ctxs[bitnr];

        spin_lock(&sw_ctx->lock);
        list_splice_tail_init(&sw_ctx->rq_lists[hctx->type], args->list);
        sbitmap_clear_bit(sb, bitnr);
        spin_unlock(&sw_ctx->lock);

        return true;
}

/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch
 */
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
        struct flush_busy_ctx_data data = {
                .hctx = hctx,
                .list = list,
        };

        sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);

/* In/out arguments shared by blk_mq_dequeue_from_ctx() and its iterator. */
struct dispatch_rq_data {
        struct blk_mq_hw_ctx *hctx;     /* hardware context being scanned */
        struct request *rq;             /* dequeued request, NULL while none found */
};

static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
                void *data)
{
        struct dispatch_rq_data *dispatch_data = data;
        struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
        struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
        enum hctx_type type = hctx->type;

        spin_lock(&ctx->lock);
        if (!list_empty(&ctx->rq_lists[type])) {
                dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
                list_del_init(&dispatch_data->rq->queuelist);
                if (list_empty(&ctx->rq_lists[type]))
                        sbitmap_clear_bit(sb, bitnr);
        }
        spin_unlock(&ctx->lock);

        return !dispatch_data->rq;
}

struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
                                        struct blk_mq_ctx *start)
{
        unsigned off = start ? start->index_hw[hctx->type] : 0;
        struct dispatch_rq_data data = {
                .hctx = hctx,
                .rq   = NULL,
        };

        __sbitmap_for_each_set(&hctx->ctx_map, off,
                               dispatch_rq_from_ctx, &data);

        return data.rq;
}

bool __blk_mq_alloc_driver_tag(struct request *rq)
{
        struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
        unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
        int tag;

        blk_mq_tag_busy(rq->mq_hctx);

        if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
                bt = &rq->mq_hctx->tags->breserved_tags;
                tag_offset = 0;
        } else {
                if (!hctx_may_queue(rq->mq_hctx, bt))
                        return false;
        }

        tag = __sbitmap_queue_get(bt);
        if (tag == BLK_MQ_NO_TAG)
                return false;

        rq->tag = tag + tag_offset;
        blk_mq_inc_active_requests(rq->mq_hctx);
        return true;
}

/*
 * Wake callback for the wait entry embedded in a hardware queue as
 * hctx->dispatch_wait.
 *
 * If the entry is still linked on a wait queue it is unlinked under
 * dispatch_wait_lock and ws_active of the hctx's bitmap_tags is decremented.
 * The hardware queue is then run asynchronously.  @mode, @flags and @key are
 * not used.  Always returns 1.
 */
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
                                int flags, void *key)
{
        struct blk_mq_hw_ctx *hctx =
                container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);

        spin_lock(&hctx->dispatch_wait_lock);
        if (!list_empty(&wait->entry)) {
                list_del_init(&wait->entry);
                atomic_dec(&hctx->tags->bitmap_tags.ws_active);
        }
        spin_unlock(&hctx->dispatch_wait_lock);

        blk_mq_run_hw_queue(hctx, true);
        return 1;
}

/*
 * Mark us waiting for a tag. For shared tags, this involves hooking us into
 * the tag wakeups. For non-shared tags, we can simply mark us needing a
 * restart. For both cases, take care to check the condition again after
 * marking us as waiting.
 *
 * Returns true if the re-check obtained a driver tag for @rq, false
 * otherwise.  In the shared-tag case a false return leaves
 * hctx->dispatch_wait queued on the tag wait queue (either because it was
 * already queued on entry or because it was added here).
 */
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
                                 struct request *rq)
{
        struct sbitmap_queue *sbq;
        struct wait_queue_head *wq;
        wait_queue_entry_t *wait;
        bool ret;

        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
            !(blk_mq_is_shared_tags(hctx->flags))) {
                blk_mq_sched_mark_restart_hctx(hctx);

                /*
                 * It's possible that a tag was freed in the window between the
                 * allocation failure and adding the hardware queue to the wait
                 * queue.
                 *
                 * Don't clear RESTART here, someone else could have set it.
                 * At most this will cost an extra queue run.
                 */
                return blk_mq_get_driver_tag(rq);
        }

        /* Lockless early exit: the hctx is already waiting for a tag. */
        wait = &hctx->dispatch_wait;
        if (!list_empty_careful(&wait->entry))
                return false;

        /* Wait on the same tag pool the driver tag would be taken from. */
        if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag))
                sbq = &hctx->tags->breserved_tags;
        else
                sbq = &hctx->tags->bitmap_tags;
        wq = &bt_wait_ptr(sbq, hctx)->wait;

        /*
         * Lock order: the wait queue lock (with interrupts disabled) is
         * taken first, dispatch_wait_lock nests inside it.  Re-check the
         * entry now that both are held.
         */
        spin_lock_irq(&wq->lock);
        spin_lock(&hctx->dispatch_wait_lock);
        if (!list_empty(&wait->entry)) {
                spin_unlock(&hctx->dispatch_wait_lock);
                spin_unlock_irq(&wq->lock);
                return false;
        }

        atomic_inc(&sbq->ws_active);
        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(wq, wait);

        /*
         * Add one explicit barrier since blk_mq_get_driver_tag() may
         * not imply barrier in case of failure.
         *
         * Order adding us to wait queue and allocating driver tag.
         *
         * The pair is the one implied in sbitmap_queue_wake_up() which
         * orders clearing sbitmap tag bits and waitqueue_active() in
         * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless
         *
         * Otherwise, re-order of adding wait queue and getting driver tag
         * may cause __sbitmap_queue_wake_up() to wake up nothing because
         * the waitqueue_active() may not observe us in wait queue.
         */
        smp_mb();

        /*
         * It's possible that a tag was freed in the window between the
         * allocation failure and adding the hardware queue to the wait
         * queue.
         */
        ret = blk_mq_get_driver_tag(rq);
        if (!ret) {
                /* Still no tag: stay queued on the wait queue and report failure. */
                spin_unlock(&hctx->dispatch_wait_lock);
                spin_unlock_irq(&wq->lock);
                return false;
        }

        /*
         * We got a tag, remove ourselves from the wait queue to ensure
         * someone else gets the wakeup.
         */
        list_del_init(&wait->entry);
        atomic_dec(&sbq->ws_active);
        spin_unlock(&hctx->dispatch_wait_lock);
        spin_unlock_irq(&wq->lock);

        return true;
}

#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
/*
 * Fold one dispatch outcome into hctx->dispatch_busy, kept as an
 * Exponential Weighted Moving Average (EWMA):
 * - EWMA is one simple way to compute a running average value
 * - the old value is weighted 7/8 and the new sample 1/8, so the average
 *   decays exponentially while the queue is not busy
 * - a busy sample is scaled by 1 << 4 so the result does not collapse to
 *   0 too quickly; the exact factor doesn't matter because the EWMA
 *   decreases exponentially
 */
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
        unsigned int avg = hctx->dispatch_busy;

        /* Idle before and idle now: the average stays at zero. */
        if (!busy && !avg)
                return;

        avg = avg * (BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1) +
              (busy ? 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR : 0);
        avg /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;

        hctx->dispatch_busy = avg;
}

#define BLK_MQ_RESOURCE_DELAY        3                /* ms units */

/*
 * Put @rq back at the head of @list and requeue it via
 * __blk_mq_requeue_request().  Called from blk_mq_dispatch_rq_list() when
 * ->queue_rq() returned BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE.
 */
static void blk_mq_handle_dev_resource(struct request *rq,
                                       struct list_head *list)
{
        list_add(&rq->queuelist, list);
        __blk_mq_requeue_request(rq);
}

/* Result of blk_mq_prep_dispatch_rq(). */
enum prep_dispatch {
        PREP_DISPATCH_OK,               /* budget (if requested) and driver tag are held */
        PREP_DISPATCH_NO_TAG,           /* no driver tag could be obtained */
        PREP_DISPATCH_NO_BUDGET,        /* no dispatch budget could be obtained */
};

/*
 * Acquire what @rq needs before it can be passed to the driver: a dispatch
 * budget (only when @need_budget is true) and a driver tag.
 *
 * Returns PREP_DISPATCH_OK when both are held, PREP_DISPATCH_NO_BUDGET when
 * the budget could not be taken (the driver tag is put in that case), or
 * PREP_DISPATCH_NO_TAG when no driver tag was available even after marking
 * the hctx as waiting for one.
 */
static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
                                                  bool need_budget)
{
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        int token = -1;

        if (need_budget) {
                token = blk_mq_get_dispatch_budget(rq->q);
                if (token < 0) {
                        blk_mq_put_driver_tag(rq);
                        return PREP_DISPATCH_NO_BUDGET;
                }
                blk_mq_set_rq_budget_token(rq, token);
        }

        if (blk_mq_get_driver_tag(rq))
                return PREP_DISPATCH_OK;

        /*
         * The initial allocation attempt failed, so we need to rerun the
         * hardware queue when a tag is freed. The waitqueue takes care of
         * that. If the queue is run before we add this entry back on the
         * dispatch list, we'll re-run it below.
         */
        if (blk_mq_mark_tag_wait(hctx, rq))
                return PREP_DISPATCH_OK;

        /*
         * Only a budget taken by this function is returned here; all
         * budgets not got from this function will be put together during
         * handling of the partial dispatch.
         */
        if (need_budget)
                blk_mq_put_dispatch_budget(rq->q, token);
        return PREP_DISPATCH_NO_TAG;
}

/*
 * Release all budgets that were allocated before calling
 * blk_mq_dispatch_rq_list(): every request on @list that carries a
 * non-negative budget token has that token returned to @q.
 */
static void blk_mq_release_budgets(struct request_queue *q,
                struct list_head *list)
{
        struct request *cur;
        int token;

        list_for_each_entry(cur, list, queuelist) {
                token = blk_mq_get_rq_budget_token(cur);
                if (token < 0)
                        continue;

                blk_mq_put_dispatch_budget(q, token);
        }
}

/*
 * blk_mq_commit_rqs will notify the driver using bd->last that there are no
 * more requests. (See the comment in struct blk_mq_ops for commit_rqs for
 * details.)
 * Attention, we should explicitly call this in unusual cases:
 *  1) did not queue everything initially scheduled to queue
 *  2) the last attempt to queue a request failed
 *
 * Nothing happens when the driver has no ->commit_rqs() or when @queued is
 * zero.
 */
static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
                              bool from_schedule)
{
        struct request_queue *q = hctx->queue;

        if (!q->mq_ops->commit_rqs || !queued)
                return;

        trace_block_unplug(q, queued, !from_schedule);
        q->mq_ops->commit_rqs(hctx);
}

/*
 * Hand the requests on @list to the driver's ->queue_rq(), one at a time.
 *
 * @hctx: hardware queue the requests belong to
 * @list: requests to dispatch; whatever could not be dispatched is moved to
 *        hctx->dispatch before returning
 * @nr_budgets: when non-zero, budget is not acquired per request here and
 *        the budget tokens of undispatched requests are released; when zero,
 *        blk_mq_prep_dispatch_rq() acquires budget for each request
 *
 * Returns true if we did some work AND can potentially do more, i.e. the
 * whole list was consumed.  Returns false if @list was empty on entry or if
 * requests had to be left on hctx->dispatch.
 */
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
                             unsigned int nr_budgets)
{
        enum prep_dispatch prep;
        struct request_queue *q = hctx->queue;
        struct request *rq;
        int queued;
        blk_status_t ret = BLK_STS_OK;
        bool needs_resource = false;

        if (list_empty(list))
                return false;

        /*
         * Now process all the entries, sending them to the driver.
         */
        queued = 0;
        do {
                struct blk_mq_queue_data bd;

                rq = list_first_entry(list, struct request, queuelist);

                WARN_ON_ONCE(hctx != rq->mq_hctx);
                /* Acquire budget here only if the caller did not pre-allocate it. */
                prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
                if (prep != PREP_DISPATCH_OK)
                        break;

                list_del_init(&rq->queuelist);

                bd.rq = rq;
                bd.last = list_empty(list);

                /*
                 * once the request is queued to lld, no need to cover the
                 * budget any more
                 */
                if (nr_budgets)
                        nr_budgets--;
                ret = q->mq_ops->queue_rq(hctx, &bd);
                switch (ret) {
                case BLK_STS_OK:
                        queued++;
                        break;
                case BLK_STS_RESOURCE:
                        needs_resource = true;
                        fallthrough;
                case BLK_STS_DEV_RESOURCE:
                        /* Put the request back on @list and stop dispatching. */
                        blk_mq_handle_dev_resource(rq, list);
                        goto out;
                default:
                        /* Any other status completes the request with that status. */
                        blk_mq_end_request(rq, ret);
                }
        } while (!list_empty(list));
out:
        /* If we didn't flush the entire list, we could have told the driver
         * there was more coming, but that turned out to be a lie.
         */
        if (!list_empty(list) || ret != BLK_STS_OK)
                blk_mq_commit_rqs(hctx, queued, false);

        /*
         * Any items that need requeuing? Stuff them into hctx->dispatch,
         * that is where we will continue on next queue run.
         */
        if (!list_empty(list)) {
                bool needs_restart;
                /* For non-shared tags, the RESTART check will suffice */
                bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
                        ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
                        blk_mq_is_shared_tags(hctx->flags));

                /* Return the caller-allocated budgets of undispatched requests. */
                if (nr_budgets)
                        blk_mq_release_budgets(q, list);

                spin_lock(&hctx->lock);
                list_splice_tail_init(list, &hctx->dispatch);
                spin_unlock(&hctx->lock);

                /*
                 * Order adding requests to hctx->dispatch and checking
                 * SCHED_RESTART flag. The pair of this smp_mb() is the one
                 * in blk_mq_sched_restart(). Avoid restart code path to
                 * miss the new added requests to hctx->dispatch, meantime
                 * SCHED_RESTART is observed here.
                 */
                smp_mb();

                /*
                 * If SCHED_RESTART was set by the caller of this function and
                 * it is no longer set that means that it was cleared by another
                 * thread and hence that a queue rerun is needed.
                 *
                 * If 'no_tag' is set, that means that we failed getting
                 * a driver tag with an I/O scheduler attached. If our dispatch
                 * waitqueue is no longer active, ensure that we run the queue
                 * AFTER adding our entries back to the list.
                 *
                 * If no I/O scheduler has been configured it is possible that
                 * the hardware queue got stopped and restarted before requests
                 * were pushed back onto the dispatch list. Rerun the queue to
                 * avoid starvation. Notes:
                 * - blk_mq_run_hw_queue() checks whether or not a queue has
                 *   been stopped before rerunning a queue.
                 * - Some but not all block drivers stop a queue before
                 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
                 *   and dm-rq.
                 *
                 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
                 * bit is set, run queue after a delay to avoid IO stalls
                 * that could otherwise occur if the queue is idle.  We'll do
                 * similar if we couldn't get budget or couldn't lock a zone
                 * and SCHED_RESTART is set.
                 */
                needs_restart = blk_mq_sched_needs_restart(hctx);
                if (prep == PREP_DISPATCH_NO_BUDGET)
                        needs_resource = true;
                if (!needs_restart ||
                    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
                        blk_mq_run_hw_queue(hctx, true);
                else if (needs_resource)
                        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);

                /* Requests were left over: record this dispatch as busy. */
                blk_mq_update_dispatch_busy(hctx, true);
                return false;
        }

        /* Everything was consumed: record this dispatch as not busy. */
        blk_mq_update_dispatch_busy(hctx, false);
        return true;
}

/*
 * Return the first CPU in hctx->cpumask that is also online.  If no CPU of
 * the mask is online, fall back to the first CPU of hctx->cpumask
 * regardless of its online state.
 */
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
{
        int online = cpumask_first_and(hctx->cpumask, cpu_online_mask);

        if (online < nr_cpu_ids)
                return online;

        return cpumask_first(hctx->cpumask);
}

/*
 * ->next_cpu is always calculated from hctx->cpumask, so simply use
 * it for speeding up the check
 *
 * Returns true when hctx->next_cpu is not a valid CPU number
 * (>= nr_cpu_ids), which is taken to mean that hctx->cpumask is empty.
 */
static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx)
{
        return hctx->next_cpu >= nr_cpu_ids;
}

/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 *
 * Returns the CPU on which the next run_work should be queued, or
 * WORK_CPU_UNBOUND when the queue has a single hardware queue, the hctx has
 * no mapped CPU, or no online CPU could be selected.  Updates
 * hctx->next_cpu and hctx->next_cpu_batch as a side effect.
 */
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
        bool tried = false;
        int next_cpu = hctx->next_cpu;

        /* Switch to unbound if no allowable CPUs in this hctx */
        if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx))
                return WORK_CPU_UNBOUND;

        /*
         * Batch exhausted: advance to the next online CPU of the mask,
         * wrapping around to the first mapped CPU.  The select_cpu label is
         * also the retry target of the offline check further down.
         */
        if (--hctx->next_cpu_batch <= 0) {
select_cpu:
                next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
                                cpu_online_mask);
                if (next_cpu >= nr_cpu_ids)
                        next_cpu = blk_mq_first_mapped_cpu(hctx);
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }

        /*
         * Do unbound schedule if we can't find a online CPU for this hctx,
         * and it should only happen in the path of handling CPU DEAD.
         */
        if (!cpu_online(next_cpu)) {
                /* Retry the selection exactly once before giving up. */
                if (!tried) {
                        tried = true;
                        goto select_cpu;
                }

                /*
                 * Make sure to re-select CPU next time once after CPUs
                 * in hctx->cpumask become online again.
                 */
                hctx->next_cpu = next_cpu;
                hctx->next_cpu_batch = 1;
                return WORK_CPU_UNBOUND;
        }

        hctx->next_cpu = next_cpu;
        return next_cpu;
}

/**
 * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
 * @hctx: Pointer to the hardware queue to run.
 * @msecs: Milliseconds of delay to wait before running the queue.
 *
 * Run a hardware queue asynchronously with a delay of @msecs by (re)arming
 * its run_work on the CPU chosen by blk_mq_hctx_next_cpu(). Nothing is
 * scheduled if the hardware queue is stopped.
 */
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
        int target_cpu;
        unsigned long delay;

        if (unlikely(blk_mq_hctx_stopped(hctx)))
                return;

        target_cpu = blk_mq_hctx_next_cpu(hctx);
        delay = msecs_to_jiffies(msecs);
        kblockd_mod_delayed_work_on(target_cpu, &hctx->run_work, delay);
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);

/**
 * blk_mq_run_hw_queue - Start to run a hardware queue.
 * @hctx: Pointer to the hardware queue to run.
 * @async: If we want to run the queue asynchronously.
 *
 * Check if the request queue is not in a quiesced state and if there are
 * pending requests to be sent. If this is true, run the queue to send requests
 * to hardware: inline when @async is false and the current CPU belongs to
 * hctx->cpumask, otherwise through blk_mq_delay_run_hw_queue() with no delay.
 */
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
        bool need_run;

        /*
         * We can't run the queue inline with interrupts disabled.
         */
        WARN_ON_ONCE(!async && in_interrupt());

        might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);

        /*
         * While the queue is quiesced we may be switching the io scheduler,
         * updating nr_hw_queues, or doing other things; the queue can't be
         * run any more, and even __blk_mq_hctx_has_pending() can't be called
         * safely.
         *
         * A quiesced queue will be rerun in blk_mq_unquiesce_queue().
         */
        __blk_mq_run_dispatch_ops(hctx->queue, false,
                need_run = !blk_queue_quiesced(hctx->queue) &&
                blk_mq_hctx_has_pending(hctx));

        if (!need_run)
                return;

        /* Inline dispatch only for a sync request issued from a mapped CPU. */
        if (!async && cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
                blk_mq_run_dispatch_ops(hctx->queue,
                                        blk_mq_sched_dispatch_requests(hctx));
                return;
        }

        blk_mq_delay_run_hw_queue(hctx, 0);
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);

/*
 * Return the preferred queue to dispatch from (if any) for a non-mq aware
 * IO scheduler.
 *
 * If the IO scheduler does not respect hardware queues when dispatching, we
 * just don't bother with multiple HW queues and dispatch from the
 * HCTX_TYPE_DEFAULT hctx of the current CPU's ctx, since running multiple
 * queues just causes lock contention inside the scheduler and pointless
 * cache bouncing.
 *
 * Returns NULL if that hctx is stopped.
 */
static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
{
        struct blk_mq_ctx *cur_ctx = blk_mq_get_ctx(q);
        struct blk_mq_hw_ctx *candidate = cur_ctx->hctxs[HCTX_TYPE_DEFAULT];

        return blk_mq_hctx_stopped(candidate) ? NULL : candidate;
}

/**
 * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
 * @q: Pointer to the request queue to run.
 * @async: If we want to run the queue asynchronously.
 *
 * Stopped hardware queues are skipped.
 */
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
        struct blk_mq_hw_ctx *preferred = NULL;
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        if (blk_queue_sq_sched(q))
                preferred = blk_mq_get_sq_hctx(q);

        queue_for_each_hw_ctx(q, hctx, i) {
                if (blk_mq_hctx_stopped(hctx))
                        continue;

                /*
                 * With a scheduler-preferred hctx, any other hctx is only
                 * run if it holds requests that bypass the scheduler (its
                 * dispatch list is not empty).
                 */
                if (preferred && preferred != hctx &&
                    list_empty_careful(&hctx->dispatch))
                        continue;

                blk_mq_run_hw_queue(hctx, async);
        }
}
EXPORT_SYMBOL(blk_mq_run_hw_queues);

/**
 * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
 * @q: Pointer to the request queue to run.
 * @msecs: Milliseconds of delay to wait before running the queues.
 *
 * Stopped hardware queues and those that already have run_work pending are
 * skipped.
 */
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
        struct blk_mq_hw_ctx *preferred = NULL;
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        if (blk_queue_sq_sched(q))
                preferred = blk_mq_get_sq_hctx(q);

        queue_for_each_hw_ctx(q, hctx, i) {
                if (blk_mq_hctx_stopped(hctx))
                        continue;

                /*
                 * If there is already a run_work pending, leave the
                 * pending delay untouched. Otherwise, a hctx can stall
                 * if another hctx keeps re-delaying its work before the
                 * work executes.
                 */
                if (delayed_work_pending(&hctx->run_work))
                        continue;

                /*
                 * With a scheduler-preferred hctx, any other hctx is only
                 * run if it holds requests that bypass the scheduler (its
                 * dispatch list is not empty).
                 */
                if (preferred && preferred != hctx &&
                    list_empty_careful(&hctx->dispatch))
                        continue;

                blk_mq_delay_run_hw_queue(hctx, msecs);
        }
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);

/*
 * This function is often used for pausing .queue_rq() by driver when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queue() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 *
 * Cancels the hctx's pending run_work (without waiting for it) and sets
 * BLK_MQ_S_STOPPED in hctx->state.
 */
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
        cancel_delayed_work(&hctx->run_work);

        set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);

/*
 * This function is often used for pausing .queue_rq() by driver when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queues() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queues(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);

void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
        clear_bit(BLK_MQ_S_STOPPED, &hctx->state);

        blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);

void blk_mq_start_hw_queues(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_start_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);

/*
 * If @hctx is currently stopped, clear BLK_MQ_S_STOPPED and run it
 * (asynchronously when @async is true).  A hardware queue that is not
 * stopped is left alone.
 */
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
        if (blk_mq_hctx_stopped(hctx)) {
                clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
                blk_mq_run_hw_queue(hctx, async);
        }
}
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);

/*
 * Restart every stopped hardware queue of @q.  Each one is run
 * asynchronously if @async is true or if it has BLK_MQ_F_BLOCKING set.
 */
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
        unsigned long idx;
        struct blk_mq_hw_ctx *cur;

        queue_for_each_hw_ctx(q, cur, idx) {
                bool run_async = async || (cur->flags & BLK_MQ_F_BLOCKING);

                blk_mq_start_stopped_hw_queue(cur, run_async);
        }
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);

/*
 * Work function for hctx->run_work: recover the hardware queue that embeds
 * @work and dispatch its requests via blk_mq_sched_dispatch_requests().
 */
static void blk_mq_run_work_fn(struct work_struct *work)
{
        struct blk_mq_hw_ctx *hctx =
                container_of(work, struct blk_mq_hw_ctx, run_work.work);

        blk_mq_run_dispatch_ops(hctx->queue,
                                blk_mq_sched_dispatch_requests(hctx));
}

/**
 * blk_mq_request_bypass_insert - Insert a request at dispatch list.
 * @rq: Pointer to request to be inserted.
 * @flags: BLK_MQ_INSERT_*
 *
 * Should only be used carefully, when the caller knows we want to
 * bypass a potential IO scheduler on the target device.
 *
 * The request goes to the head of hctx->dispatch when @flags contains
 * BLK_MQ_INSERT_AT_HEAD and to the tail otherwise.
 */
static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
{
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        struct list_head *dispatch = &hctx->dispatch;

        spin_lock(&hctx->lock);
        if (!(flags & BLK_MQ_INSERT_AT_HEAD))
                list_add_tail(&rq->queuelist, dispatch);
        else
                list_add(&rq->queuelist, dispatch);
        spin_unlock(&hctx->lock);
}

/*
 * Insert the requests on @list, all of which must belong to software queue
 * @ctx, into @hctx and run the hardware queue.
 *
 * When the hardware queue is not busy and @run_queue_async is false, the
 * requests are first issued directly; whatever remains on @list afterwards
 * is spliced onto the tail of the ctx list.  The final queue run is
 * asynchronous if @run_queue_async is true or any inserted request has
 * REQ_NOWAIT set.
 */
static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
                struct blk_mq_ctx *ctx, struct list_head *list,
                bool run_queue_async)
{
        enum hctx_type type = hctx->type;
        struct request *rq;
        bool try_direct = !hctx->dispatch_busy && !run_queue_async;

        /*
         * Try to issue requests directly if the hw queue isn't busy to save an
         * extra enqueue & dequeue to the sw queue.
         */
        if (try_direct) {
                blk_mq_run_dispatch_ops(hctx->queue,
                        blk_mq_try_issue_list_directly(hctx, list));
        }

        /* Queue on the sw queue unless direct issue consumed everything. */
        if (!try_direct || !list_empty(list)) {
                /*
                 * preemption doesn't flush plug list, so it's possible
                 * ctx->cpu is offline now
                 */
                list_for_each_entry(rq, list, queuelist) {
                        BUG_ON(rq->mq_ctx != ctx);
                        trace_block_rq_insert(rq);
                        if (rq->cmd_flags & REQ_NOWAIT)
                                run_queue_async = true;
                }

                spin_lock(&ctx->lock);
                list_splice_tail_init(list, &ctx->rq_lists[type]);
                blk_mq_hctx_mark_pending(hctx, ctx);
                spin_unlock(&ctx->lock);
        }

        blk_mq_run_hw_queue(hctx, run_queue_async);
}

/*
 * Queue @rq for later dispatch.  Depending on the request it ends up on
 * hctx->dispatch (passthrough and flush requests), in the I/O scheduler
 * (when the queue has an elevator), or on the software queue of its ctx.
 * @flags is a set of BLK_MQ_INSERT_* flags.  The hardware queue is not run
 * here.
 */
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
{
        struct request_queue *q = rq->q;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

        if (blk_rq_is_passthrough(rq)) {
                /*
                 * Passthrough requests have to be added to hctx->dispatch
                 * directly.  The device may be in a situation where it can't
                 * handle FS requests and always returns BLK_STS_RESOURCE for
                 * them, which gets them added to hctx->dispatch.
                 *
                 * If a passthrough request is required to unblock the queues
                 * and it were added to the scheduler queue, there would be no
                 * chance to dispatch it, given that we prioritize requests in
                 * hctx->dispatch.
                 */
                blk_mq_request_bypass_insert(rq, flags);
                return;
        }

        if (req_op(rq) == REQ_OP_FLUSH) {
                /*
                 * Firstly, normal IO requests are inserted into the scheduler
                 * queue or sw queue, while flush requests are added to the
                 * dispatch queue (hctx->dispatch) directly, and there is at
                 * most one in-flight flush request for each hw queue, so it
                 * doesn't matter whether the flush request is added to the
                 * tail or the front of the dispatch queue.
                 *
                 * Secondly, in case of NCQ, a flush request is a non-NCQ
                 * command, and queueing it will fail while there is any
                 * in-flight normal IO request (NCQ command). Adding the flush
                 * rq to the front of hctx->dispatch more easily introduces
                 * extra latency for the flush rq because of S_SCHED_RESTART
                 * than adding it to the tail does, so the chance of flush
                 * merging is increased and fewer flush requests are issued
                 * to the controller. It is observed that ~10% time is saved
                 * in blktests block/004 on a disk attached to an AHCI/NCQ
                 * drive when adding the flush rq to the front of
                 * hctx->dispatch.
                 *
                 * Simply queue the flush rq at the front of hctx->dispatch so
                 * that intensive flush workloads can benefit in case of NCQ
                 * HW.
                 */
                blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD);
                return;
        }

        if (q->elevator) {
                LIST_HEAD(list);

                /* A request handed to the scheduler must not hold a driver tag. */
                WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG);

                list_add(&rq->queuelist, &list);
                q->elevator->type->ops.insert_requests(hctx, &list, flags);
                return;
        }

        /* No scheduler: queue on the ctx list for this hctx type. */
        trace_block_rq_insert(rq);

        spin_lock(&ctx->lock);
        if (flags & BLK_MQ_INSERT_AT_HEAD)
                list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
        else
                list_add_tail(&rq->queuelist, &ctx->rq_lists[hctx->type]);
        blk_mq_hctx_mark_pending(hctx, ctx);
        spin_unlock(&ctx->lock);
}

/*
 * Initialise @rq from @bio, which spans @nr_segs segments: copy the start
 * sector and write hint, attach the bio via blk_rq_bio_prep(), set up the
 * crypto context and start I/O accounting.
 */
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
                unsigned int nr_segs)
{
        int err;

        /* Readahead bios get every failfast flag set on the request. */
        if (bio->bi_opf & REQ_RAHEAD)
                rq->cmd_flags |= REQ_FAILFAST_MASK;

        rq->__sector = bio->bi_iter.bi_sector;
        rq->write_hint = bio->bi_write_hint;
        blk_rq_bio_prep(rq, bio, nr_segs);

        /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
        err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
        WARN_ON_ONCE(err);

        blk_account_io_start(rq);
}

/*
 * Pass @rq straight to the driver's ->queue_rq() on @hctx, with @last
 * forwarded as bd.last, and update the hctx's dispatch-busy average from the
 * outcome.
 *
 * For BLK_STS_OK we are done. For BLK_STS_RESOURCE / BLK_STS_DEV_RESOURCE
 * (busy) the request is requeued here and the caller is expected to add it
 * to a list as we previously would have done. For any other error the
 * caller may kill the request.
 *
 * Returns the status reported by ->queue_rq().
 */
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
                                            struct request *rq, bool last)
{
        struct blk_mq_queue_data bd = {
                .rq = rq,
                .last = last,
        };
        blk_status_t ret;

        ret = rq->q->mq_ops->queue_rq(hctx, &bd);
        if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                blk_mq_update_dispatch_busy(hctx, true);
                __blk_mq_requeue_request(rq);
        } else {
                /* Success and hard errors both count as "not busy". */
                blk_mq_update_dispatch_busy(hctx, false);
        }

        return ret;
}

/*
 * Acquire both a dispatch budget and a driver tag for @rq.
 *
 * Returns true when both are held.  Returns false otherwise; a budget that
 * was obtained is given back if the driver tag cannot be had.
 */
static bool blk_mq_get_budget_and_tag(struct request *rq)
{
        int token = blk_mq_get_dispatch_budget(rq->q);

        if (token < 0)
                return false;

        blk_mq_set_rq_budget_token(rq, token);
        if (blk_mq_get_driver_tag(rq))
                return true;

        blk_mq_put_dispatch_budget(rq->q, token);
        return false;
}

/**
 * blk_mq_try_issue_directly - Try to send a request directly to device driver.
 * @hctx: Pointer of the associated hardware queue.
 * @rq: Pointer to request to be sent.
 *
 * If the device has enough resources to accept a new request now, send the
 * request directly to device driver. Else, insert at hctx->dispatch queue, so
 * we can try send it another time in the future. Requests inserted at this
 * queue have higher priority.
 */
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                struct request *rq)
{
        blk_status_t ret;

        /* Stopped or quiesced: just queue the request, do not run the queue. */
        if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
                blk_mq_insert_request(rq, 0);
                return;
        }

        /*
         * Requests that must go through the scheduler, or for which no
         * budget and tag are available, take the normal insert path.
         */
        if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) {
                blk_mq_insert_request(rq, 0);
                blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT);
                return;
        }

        ret = __blk_mq_issue_directly(hctx, rq, true);
        if (ret == BLK_STS_OK)
                return;

        if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                blk_mq_request_bypass_insert(rq, 0);
                blk_mq_run_hw_queue(hctx, false);
                return;
        }

        blk_mq_end_request(rq, ret);
}

/*
 * Issue @rq directly on its own hardware queue (rq->mq_hctx).
 *
 * Returns BLK_STS_OK after merely inserting the request when the hctx is
 * stopped or the queue is quiesced, BLK_STS_RESOURCE when no budget or driver
 * tag could be obtained, and otherwise whatever __blk_mq_issue_directly()
 * reports. @last is passed through to __blk_mq_issue_directly().
 */
static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

        if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
                blk_mq_insert_request(rq, 0);
                return BLK_STS_OK;
        }

        if (blk_mq_get_budget_and_tag(rq))
                return __blk_mq_issue_directly(hctx, rq, last);

        return BLK_STS_RESOURCE;
}

/*
 * Pop requests off plug->mq_list one at a time and issue each directly.
 *
 * Successfully issued requests are counted per hardware queue; the count is
 * committed whenever the hardware queue changes. A resource shortage
 * bypass-inserts the current request, reruns the queue and stops the loop,
 * leaving any remaining requests on plug->mq_list.
 */
static void blk_mq_plug_issue_direct(struct blk_plug *plug)
{
        struct blk_mq_hw_ctx *cur_hctx = NULL;
        blk_status_t status = BLK_STS_OK;
        struct request *req;
        int nr_queued = 0;

        for (;;) {
                bool is_last;

                req = rq_list_pop(&plug->mq_list);
                if (!req)
                        break;
                is_last = rq_list_empty(plug->mq_list);

                /* Moving to another hctx: commit what the old one got. */
                if (cur_hctx != req->mq_hctx) {
                        if (cur_hctx) {
                                blk_mq_commit_rqs(cur_hctx, nr_queued, false);
                                nr_queued = 0;
                        }
                        cur_hctx = req->mq_hctx;
                }

                status = blk_mq_request_issue_directly(req, is_last);
                if (status == BLK_STS_OK) {
                        nr_queued++;
                } else if (status == BLK_STS_RESOURCE ||
                           status == BLK_STS_DEV_RESOURCE) {
                        blk_mq_request_bypass_insert(req, 0);
                        blk_mq_run_hw_queue(cur_hctx, false);
                        break;
                } else {
                        blk_mq_end_request(req, status);
                }
        }

        /* The last issue attempt did not succeed: commit explicitly. */
        if (status != BLK_STS_OK)
                blk_mq_commit_rqs(cur_hctx, nr_queued, false);
}

/*
 * Pass the whole plug list to the driver's ->queue_rqs() hook, unless the
 * queue is quiesced, in which case nothing is done.
 */
static void __blk_mq_flush_plug_list(struct request_queue *q,
                                     struct blk_plug *plug)
{
        if (!blk_queue_quiesced(q))
                q->mq_ops->queue_rqs(&plug->mq_list);
}

static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
{
        struct blk_mq_hw_ctx *this_hctx = NULL;
        struct blk_mq_ctx *this_ctx = NULL;
        struct request *requeue_list = NULL;
        struct request **requeue_lastp = &requeue_list;
        unsigned int depth = 0;
        bool is_passthrough = false;
        LIST_HEAD(list);

        do {
                struct request *rq = rq_list_pop(&plug->mq_list);

                if (!this_hctx) {
                        this_hctx = rq->mq_hctx;
                        this_ctx = rq->mq_ctx;
                        is_passthrough = blk_rq_is_passthrough(rq);
                } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
                           is_passthrough != blk_rq_is_passthrough(rq)) {
                        rq_list_add_tail(&requeue_lastp, rq);
                        continue;
                }
                list_add(&rq->queuelist, &list);
                depth++;
        } while (!rq_list_empty(plug->mq_list));

        plug->mq_list = requeue_list;
        trace_block_unplug(this_hctx->queue, depth, !from_sched);

        percpu_ref_get(&this_hctx->queue->q_usage_counter);
        /* passthrough requests should never be issued to the I/O scheduler */
        if (is_passthrough) {
                spin_lock(&this_hctx->lock);
                list_splice_tail_init(&list, &this_hctx->dispatch);
                spin_unlock(&this_hctx->lock);
                blk_mq_run_hw_queue(this_hctx, from_sched);
        } else if (this_hctx->queue->elevator) {
                this_hctx->queue->elevator->type->ops.insert_requests(this_hctx,
                                &list, 0);
                blk_mq_run_hw_queue(this_hctx, from_sched);
        } else {
                blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched);
        }
        percpu_ref_put(&this_hctx->queue->q_usage_counter);
}

/*
 * Flush every request held on @plug's mq_list.
 *
 * @plug: plug whose mq_list is to be drained.
 * @from_schedule: forwarded to blk_mq_dispatch_plug_list(); when true the
 *                 direct-issue fast paths below are skipped.
 *
 * Does nothing when plug->rq_count is zero.
 */
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
        struct request *rq;

        /*
         * We may have been called recursively midway through handling
         * plug->mq_list via a schedule() in the driver's queue_rq() callback.
         * To avoid mq_list changing under our feet, clear rq_count early and
         * bail out specifically if rq_count is 0 rather than checking
         * whether the mq_list is empty.
         */
        if (plug->rq_count == 0)
                return;
        plug->rq_count = 0;

        /* Fast path: single queue, no elevator, not called from schedule. */
        if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
                struct request_queue *q;

                rq = rq_list_peek(&plug->mq_list);
                q = rq->q;

                /*
                 * Peek first request and see if we have a ->queue_rqs() hook.
                 * If we do, we can dispatch the whole plug list in one go. We
                 * already know at this point that all requests belong to the
                 * same queue, caller must ensure that's the case.
                 */
                if (q->mq_ops->queue_rqs) {
                        blk_mq_run_dispatch_ops(q,
                                __blk_mq_flush_plug_list(q, plug));
                        if (rq_list_empty(plug->mq_list))
                                return;
                }

                /* Whatever is still on the list is issued one by one. */
                blk_mq_run_dispatch_ops(q,
                                blk_mq_plug_issue_direct(plug));
                if (rq_list_empty(plug->mq_list))
                        return;
        }

        /* Slow path: dispatch in batches until the list is empty. */
        do {
                blk_mq_dispatch_plug_list(plug, from_schedule);
        } while (!rq_list_empty(plug->mq_list));
}

/*
 * Issue the requests on @list directly, front to back.
 *
 * Each request is unlinked from @list before it is issued. A resource
 * shortage bypass-inserts the current request and ends the loop; the hardware
 * queue is rerun only if that request was the last one on @list. Requests
 * not yet processed stay on @list.
 */
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
                struct list_head *list)
{
        blk_status_t status = BLK_STS_OK;
        int nr_queued = 0;

        while (!list_empty(list)) {
                struct request *req = list_first_entry(list, struct request,
                                queuelist);

                list_del_init(&req->queuelist);
                status = blk_mq_request_issue_directly(req, list_empty(list));
                if (status == BLK_STS_OK) {
                        nr_queued++;
                } else if (status == BLK_STS_RESOURCE ||
                           status == BLK_STS_DEV_RESOURCE) {
                        blk_mq_request_bypass_insert(req, 0);
                        if (list_empty(list))
                                blk_mq_run_hw_queue(hctx, false);
                        break;
                } else {
                        blk_mq_end_request(req, status);
                }
        }

        /* The last issue attempt did not succeed: commit explicitly. */
        if (status != BLK_STS_OK)
                blk_mq_commit_rqs(hctx, nr_queued, false);
}

/*
 * Try to merge @bio into an existing request.
 *
 * Returns true when either blk_attempt_plug_merge() or
 * blk_mq_sched_bio_merge() reports success. Returns false without trying when
 * the queue has merging disabled or the bio is not mergeable.
 */
static bool blk_mq_attempt_bio_merge(struct request_queue *q,
                                     struct bio *bio, unsigned int nr_segs)
{
        if (blk_queue_nomerges(q) || !bio_mergeable(bio))
                return false;

        /* The plug merge is attempted first; the second call only on failure. */
        return blk_attempt_plug_merge(q, bio, nr_segs) ||
               blk_mq_sched_bio_merge(q, bio, nr_segs);
}

static struct request *blk_mq_get_new_requests(struct request_queue *q,
                                               struct blk_plug *plug,
                                               struct bio *bio,
                                               unsigned int nsegs)
{
        struct blk_mq_alloc_data data = {
                .q                = q,
                .nr_tags        = 1,
                .cmd_flags        = bio->bi_opf,
        };
        struct request *rq;

        rq_qos_throttle(q, bio);

        if (plug) {
                data.nr_tags = plug->nr_ios;
                plug->nr_ios = 1;
                data.cached_rq = &plug->cached_rq;
        }

        rq = __blk_mq_alloc_requests(&data);
        if (rq)
                return rq;
        rq_qos_cleanup(q, bio);
        if (bio->bi_opf & REQ_NOWAIT)
                bio_wouldblock_error(bio);
        return NULL;
}

/*
 * Look at the first request cached on @plug and return it if it suits a bio
 * with flags @opf on queue @q. Nothing is removed from the cache.
 *
 * Returns NULL when there is no plug, no cached request, the request belongs
 * to a different queue, the hctx type is incompatible, or the flush-ness of
 * the cached request differs from that of @opf.
 */
static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
                struct request_queue *q, blk_opf_t opf)
{
        enum hctx_type wanted = blk_mq_get_hctx_type(opf);
        enum hctx_type cached;
        struct request *rq;

        if (!plug)
                return NULL;

        rq = rq_list_peek(&plug->cached_rq);
        if (!rq || rq->q != q)
                return NULL;

        /* Types must match, except that READ may use a DEFAULT request. */
        cached = rq->mq_hctx->type;
        if (wanted != cached &&
            !(wanted == HCTX_TYPE_READ && cached == HCTX_TYPE_DEFAULT))
                return NULL;

        if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
                return NULL;

        return rq;
}

/*
 * Take @rq, which must be the head of plug->cached_rq, out of the cache and
 * prepare it for @bio: the qos throttle is applied, the request time is
 * initialised, cmd_flags is copied from the bio and queuelist is reset.
 * Warns once if @rq is not the head of the cache.
 */
static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
                struct bio *bio)
{
        WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);

        /*
         * If any qos ->throttle() end up blocking, we will have flushed the
         * plug and hence killed the cached_rq list as well. Pop this entry
         * before we throttle.
         */
        plug->cached_rq = rq_list_next(rq);
        rq_qos_throttle(rq->q, bio);

        blk_mq_rq_time_init(rq, 0);
        rq->cmd_flags = bio->bi_opf;
        INIT_LIST_HEAD(&rq->queuelist);
}

/**
 * blk_mq_submit_bio - Create and send a request to block device.
 * @bio: Bio pointer.
 *
 * Builds up a request structure from @bio and the queue of its block device,
 * and sends it to the device. The request may not be queued directly to
 * hardware if:
 * * This request can be merged with another one
 * * We want to place request at plug queue for possible future merging
 * * There is an IO scheduler active at this queue
 *
 * It will not queue the request if there is an error with the bio, or at the
 * request creation.
 */
void blk_mq_submit_bio(struct bio *bio)
{
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        struct blk_plug *plug = current->plug;
        const int is_sync = op_is_sync(bio->bi_opf);
        struct blk_mq_hw_ctx *hctx;
        unsigned int nr_segs = 1;
        struct request *rq;
        blk_status_t ret;

        /*
         * If the plug has a cached request for this queue, try to use it.
         */
        rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);

        /*
         * A BIO that was released from a zone write plug has already been
         * through the preparation in this function, already holds a reference
         * on the queue usage counter, and is the only write BIO in-flight for
         * the target zone. Go straight to preparing a request for it.
         */
        if (bio_zone_write_plugging(bio)) {
                nr_segs = bio->__bi_nr_segments;
                /*
                 * Both the BIO and the cached request hold a queue
                 * reference; drop one of the two.
                 */
                if (rq)
                        blk_queue_exit(q);
                goto new_request;
        }

        bio = blk_queue_bounce(bio, q);

        /*
         * The cached request already holds a q_usage_counter reference and we
         * don't have to acquire a new one if we use it.
         */
        if (!rq) {
                if (unlikely(bio_queue_enter(bio)))
                        return;
        }

        /* Split the bio when it may exceed the queue limits. */
        if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
                bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
                if (!bio)
                        goto queue_exit;
        }
        if (!bio_integrity_prep(bio))
                goto queue_exit;

        /* A successful merge means no new request is needed. */
        if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
                goto queue_exit;

        if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
                goto queue_exit;

new_request:
        if (!rq) {
                rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
                if (unlikely(!rq))
                        goto queue_exit;
        } else {
                blk_mq_use_cached_rq(rq, plug, bio);
        }

        trace_block_getrq(bio);

        rq_qos_track(q, rq, bio);

        blk_mq_bio_to_request(rq, bio, nr_segs);

        /* On keyslot failure, end the bio with that status and free rq. */
        ret = blk_crypto_rq_get_keyslot(rq);
        if (ret != BLK_STS_OK) {
                bio->bi_status = ret;
                bio_endio(bio);
                blk_mq_free_request(rq);
                return;
        }

        if (bio_zone_write_plugging(bio))
                blk_zone_write_plug_init_request(rq);

        /* Nothing more to do here when blk_insert_flush() returns true. */
        if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
                return;

        /* With a plug active the request is only added to the plug. */
        if (plug) {
                blk_add_rq_to_plug(plug, rq);
                return;
        }

        /*
         * Insert and run the queue when the request uses the scheduler, or
         * when the hctx is busy and either there is a single hardware queue
         * or the bio is not sync. Otherwise try to issue directly.
         */
        hctx = rq->mq_hctx;
        if ((rq->rq_flags & RQF_USE_SCHED) ||
            (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) {
                blk_mq_insert_request(rq, 0);
                blk_mq_run_hw_queue(hctx, true);
        } else {
                blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
        }
        return;

queue_exit:
        /*
         * Don't drop the queue reference if we were trying to use a cached
         * request and thus didn't acquire one.
         */
        if (!rq)
                blk_queue_exit(q);
}

#ifdef CONFIG_BLK_MQ_STACKING
/**
 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
 * @rq: the request being queued
 *
 * Checks @rq against the sector and segment limits of its queue, obtains a
 * crypto keyslot, starts I/O accounting and issues the request directly.
 *
 * Return: BLK_STS_NOTSUPP when the sector limit for this operation is zero,
 * BLK_STS_IOERR when a limit is exceeded or a failure is injected, otherwise
 * the status of the keyslot lookup or of the direct issue.
 */
blk_status_t blk_insert_cloned_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        unsigned int sector_limit = blk_queue_get_max_sectors(q, req_op(rq));
        unsigned int segment_limit = blk_rq_get_max_segments(rq);
        blk_status_t status;

        if (blk_rq_sectors(rq) > sector_limit) {
                /*
                 * SCSI devices cannot reliably report whether Write
                 * Same/Zero is supported. When a device rejects a
                 * non-read/write command (discard, write same, etc.) the
                 * low-level driver sets the relevant queue limit to 0 so
                 * that blk-lib stops issuing the offending operation.
                 * Commands queued before the limit was reset must be
                 * completed with BLK_STS_NOTSUPP so that no I/O error is
                 * propagated to upper layers.
                 */
                if (!sector_limit)
                        return BLK_STS_NOTSUPP;

                printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
                        __func__, blk_rq_sectors(rq), sector_limit);
                return BLK_STS_IOERR;
        }

        /*
         * Segment counting settings of this queue may differ from those of
         * the original queue, so count again.
         */
        rq->nr_phys_segments = blk_recalc_rq_segments(rq);
        if (rq->nr_phys_segments > segment_limit) {
                printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n",
                        __func__, rq->nr_phys_segments, segment_limit);
                return BLK_STS_IOERR;
        }

        if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
                return BLK_STS_IOERR;

        status = blk_crypto_rq_get_keyslot(rq);
        if (status != BLK_STS_OK)
                return status;

        blk_account_io_start(rq);

        /*
         * The top device already has a scheduler attached, so bypass any
         * scheduler on the bottom device when inserting.
         */
        blk_mq_run_dispatch_ops(q,
                        status = blk_mq_request_issue_directly(rq, true));
        if (status != BLK_STS_OK)
                blk_account_io_done(rq, blk_time_get_ns());
        return status;
}
EXPORT_SYMBOL_GPL(blk_insert_cloned_request);

/**
 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
 * @rq: the clone request to be cleaned up
 *
 * Description:
 *     Detach every bio chained on @rq and drop a reference on each with
 *     bio_put(). rq->bio is NULL on return.
 */
void blk_rq_unprep_clone(struct request *rq)
{
        struct bio *bio = rq->bio;

        while (bio) {
                /* Unlink first, so rq->bio never points at a released bio. */
                rq->bio = bio->bi_next;
                bio_put(bio);
                bio = rq->bio;
        }
}
EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);

/**
 * blk_rq_prep_clone - Helper function to setup clone request
 * @rq: the request to be setup
 * @rq_src: original request to be cloned
 * @bs: bio_set that bios for clone are allocated from; &fs_bio_set is used
 *      when this is NULL
 * @gfp_mask: memory allocation mask for bio
 * @bio_ctr: setup function to be called for each clone bio.
 *           Returns %0 for success, non %0 for failure.
 * @data: private data to be passed to @bio_ctr
 *
 * Description:
 *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
 *     Also, pages which the original bios are pointing to are not copied
 *     and the cloned bios just point same pages.
 *     So cloned bios must be completed before original bios, which means
 *     the caller must complete @rq before @rq_src.
 *
 * Return: %0 on success. On any failure (bio allocation, @bio_ctr, or crypto
 * preparation) all bios already attached to @rq are released and -ENOMEM is
 * returned.
 */
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
                      struct bio_set *bs, gfp_t gfp_mask,
                      int (*bio_ctr)(struct bio *, struct bio *, void *),
                      void *data)
{
        /*
         * bio must start out NULL: the free_and_out path tests it, and that
         * path can be reached without the loop below having run at all.
         */
        struct bio *bio = NULL, *bio_src;

        if (!bs)
                bs = &fs_bio_set;

        __rq_for_each_bio(bio_src, rq_src) {
                bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask,
                                      bs);
                if (!bio)
                        goto free_and_out;

                if (bio_ctr && bio_ctr(bio, bio_src, data))
                        goto free_and_out;

                /* Append the clone to the tail of @rq's bio chain. */
                if (rq->bio) {
                        rq->biotail->bi_next = bio;
                        rq->biotail = bio;
                } else {
                        rq->bio = rq->biotail = bio;
                }
                /* Now owned by @rq; must not be put again on failure. */
                bio = NULL;
        }

        /* Copy attributes of the original request to the clone request. */
        rq->__sector = blk_rq_pos(rq_src);
        rq->__data_len = blk_rq_bytes(rq_src);
        if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
                rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
                rq->special_vec = rq_src->special_vec;
        }
        rq->nr_phys_segments = rq_src->nr_phys_segments;
        rq->ioprio = rq_src->ioprio;
        rq->write_hint = rq_src->write_hint;

        if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
                goto free_and_out;

        return 0;

free_and_out:
        /* A clone that was allocated but never linked into @rq. */
        if (bio)
                bio_put(bio);
        blk_rq_unprep_clone(rq);

        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
#endif /* CONFIG_BLK_MQ_STACKING */

/*
 * Move all bios of @rq onto the tail of @list and leave the request empty.
 * The request must not have been partially completed before.
 */
void blk_steal_bios(struct bio_list *list, struct request *rq)
{
        struct bio *first = rq->bio;

        if (first) {
                /* Chain behind the current tail, or start the list. */
                if (!list->tail)
                        list->head = first;
                else
                        list->tail->bi_next = first;
                list->tail = rq->biotail;

                rq->bio = NULL;
                rq->biotail = NULL;
        }

        /* The data length is cleared even when there was no bio. */
        rq->__data_len = 0;
}
EXPORT_SYMBOL_GPL(blk_steal_bios);

/* Size in bytes of a page allocation of the given order. */
static size_t order_to_size(unsigned int order)
{
        size_t page_bytes = PAGE_SIZE;

        return page_bytes << order;
}

/*
 * Called before freeing the request pool in @tags.
 *
 * Every entry of drv_tags->rqs[] that points into one of the pages on
 * tags->page_list is reset to NULL.
 */
static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
                                    struct blk_mq_tags *tags)
{
        unsigned long flags;
        struct page *pg;

        /*
         * Nothing to clear when the driver tags are not initialized, or when
         * the mapping belongs to the driver tags themselves.
         */
        if (!drv_tags || drv_tags == tags)
                return;

        list_for_each_entry(pg, &tags->page_list, lru) {
                unsigned long first = (unsigned long)page_address(pg);
                unsigned long limit = first + order_to_size(pg->private);
                int i;

                for (i = 0; i < drv_tags->nr_tags; i++) {
                        struct request *rq = drv_tags->rqs[i];
                        unsigned long addr = (unsigned long)rq;

                        /* Not inside this page allocation. */
                        if (addr < first || addr >= limit)
                                continue;

                        WARN_ON_ONCE(req_ref_read(rq) != 0);
                        cmpxchg(&drv_tags->rqs[i], rq, NULL);
                }
        }

        /*
         * Wait until all pending iteration is done.
         *
         * Request reference is cleared and it is guaranteed to be observed
         * after the ->lock is released.
         */
        spin_lock_irqsave(&drv_tags->lock, flags);
        spin_unlock_irqrestore(&drv_tags->lock, flags);
}

/*
 * Release the requests backing @tags for hardware queue @hctx_idx.
 *
 * Calls the driver's ->exit_request() (when provided) on each static request,
 * clears references to those requests from the driver tags, then frees every
 * page on tags->page_list. Does nothing when the page list is empty.
 */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx)
{
        struct blk_mq_tags *drv_tags;

        if (list_empty(&tags->page_list))
                return;

        drv_tags = blk_mq_is_shared_tags(set->flags) ?
                        set->shared_tags : set->tags[hctx_idx];

        if (tags->static_rqs && set->ops->exit_request) {
                int i;

                for (i = 0; i < tags->nr_tags; i++) {
                        struct request *rq = tags->static_rqs[i];

                        if (rq) {
                                set->ops->exit_request(set, rq, hctx_idx);
                                tags->static_rqs[i] = NULL;
                        }
                }
        }

        blk_mq_clear_rq_mapping(drv_tags, tags);

        while (!list_empty(&tags->page_list)) {
                struct page *pg = list_first_entry(&tags->page_list,
                                                   struct page, lru);

                list_del_init(&pg->lru);
                /*
                 * Drop the kmemleak object that was registered for this
                 * allocation in blk_mq_alloc_rqs().
                 */
                kmemleak_free(page_address(pg));
                __free_pages(pg, pg->private);
        }
}

/*
 * Free the rqs[] and static_rqs[] pointer arrays of @tags, reset both
 * pointers to NULL, then release @tags itself through blk_mq_free_tags().
 */
void blk_mq_free_rq_map(struct blk_mq_tags *tags)
{
        kfree(tags->rqs);
        kfree(tags->static_rqs);
        tags->rqs = NULL;
        tags->static_rqs = NULL;

        blk_mq_free_tags(tags);
}

/*
 * Find which of the tag set's queue maps covers hardware queue index
 * @hctx_idx. The first map whose [queue_offset, queue_offset + nr_queues)
 * range contains the index wins; HCTX_TYPE_DEFAULT is returned when no map
 * does.
 */
static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
                unsigned int hctx_idx)
{
        int type;

        for (type = 0; type < set->nr_maps; type++) {
                unsigned int first = set->map[type].queue_offset;
                unsigned int past_last = first + set->map[type].nr_queues;

                if (hctx_idx >= first && hctx_idx < past_last)
                        return type;
        }

        return HCTX_TYPE_DEFAULT;
}

/*
 * Return what blk_mq_hw_queue_to_node() reports for hardware queue
 * @hctx_idx, using the queue map selected by hctx_idx_to_type().
 */
static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set,
                unsigned int hctx_idx)
{
        return blk_mq_hw_queue_to_node(
                        &set->map[hctx_idx_to_type(set, hctx_idx)], hctx_idx);
}

/*
 * Allocate a tags structure for hardware queue @hctx_idx together with its
 * rqs[] and static_rqs[] pointer arrays of @nr_tags entries each.
 *
 * Allocations use the node reported for the hardware queue, falling back to
 * set->numa_node when that is NUMA_NO_NODE. Returns NULL on failure, with
 * everything allocated so far released.
 */
static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
                                               unsigned int hctx_idx,
                                               unsigned int nr_tags,
                                               unsigned int reserved_tags)
{
        struct blk_mq_tags *tags;
        int node;

        node = blk_mq_get_hctx_node(set, hctx_idx);
        if (node == NUMA_NO_NODE)
                node = set->numa_node;

        tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
                                BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
        if (!tags)
                return NULL;

        tags->rqs = kcalloc_node(nr_tags, sizeof(*tags->rqs),
                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                 node);
        if (!tags->rqs)
                goto out_free_tags;

        tags->static_rqs = kcalloc_node(nr_tags, sizeof(*tags->static_rqs),
                                        GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                        node);
        if (!tags->static_rqs)
                goto out_free_rqs;

        return tags;

out_free_rqs:
        kfree(tags->rqs);
out_free_tags:
        blk_mq_free_tags(tags);
        return NULL;
}

/*
 * Initialise one request: run the driver's ->init_request() hook when there
 * is one, then set the request state to MQ_RQ_IDLE.
 *
 * Returns 0 on success, or the hook's non-zero return value, in which case
 * the state is left untouched.
 */
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
                               unsigned int hctx_idx, int node)
{
        if (set->ops->init_request) {
                int err = set->ops->init_request(set, rq, hctx_idx, node);

                if (err)
                        return err;
        }

        WRITE_ONCE(rq->state, MQ_RQ_IDLE);
        return 0;
}

/*
 * Allocate and initialise @depth requests for hardware queue @hctx_idx.
 *
 * Requests are carved out of page allocations that are linked on
 * tags->page_list; each request pointer is stored in tags->static_rqs[].
 * Returns 0 on success. On any failure everything set up so far is released
 * through blk_mq_free_rqs() and -ENOMEM is returned.
 */
static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
                            struct blk_mq_tags *tags,
                            unsigned int hctx_idx, unsigned int depth)
{
        unsigned int i, j, entries_per_page, max_order = 4;
        int node = blk_mq_get_hctx_node(set, hctx_idx);
        size_t rq_size, left;

        if (node == NUMA_NO_NODE)
                node = set->numa_node;

        INIT_LIST_HEAD(&tags->page_list);

        /*
         * rq_size is the size of the request plus driver payload, rounded
         * to the cacheline size
         */
        rq_size = round_up(sizeof(struct request) + set->cmd_size,
                                cache_line_size());
        /* Bytes still needed for the requests not yet placed. */
        left = rq_size * depth;

        /* i counts placed requests and only advances in the inner loop. */
        for (i = 0; i < depth; ) {
                int this_order = max_order;
                struct page *page;
                int to_do;
                void *p;

                /* Shrink the order while the next smaller one still fits. */
                while (this_order && left < order_to_size(this_order - 1))
                        this_order--;

                /*
                 * Try to allocate, stepping down one order per failure.
                 * Give up after order 0 fails, or once an allocation of the
                 * reduced order could not hold a single request.
                 */
                do {
                        page = alloc_pages_node(node,
                                GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
                                this_order);
                        if (page)
                                break;
                        if (!this_order--)
                                break;
                        if (order_to_size(this_order) < rq_size)
                                break;
                } while (1);

                if (!page)
                        goto fail;

                /* Remember the order; it is read back when freeing. */
                page->private = this_order;
                list_add_tail(&page->lru, &tags->page_list);

                p = page_address(page);
                /*
                 * Allow kmemleak to scan these pages as they contain pointers
                 * to additional allocations like via ops->init_request().
                 */
                kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
                entries_per_page = order_to_size(this_order) / rq_size;
                to_do = min(entries_per_page, depth - i);
                left -= to_do * rq_size;
                for (j = 0; j < to_do; j++) {
                        struct request *rq = p;

                        tags->static_rqs[i] = rq;
                        if (blk_mq_init_request(set, rq, hctx_idx, node)) {
                                /* Clear the slot again before bailing out. */
                                tags->static_rqs[i] = NULL;
                                goto fail;
                        }

                        p += rq_size;
                        i++;
                }
        }
        return 0;

fail:
        blk_mq_free_rqs(set, tags, hctx_idx);
        return -ENOMEM;
}

/* Argument block passed to blk_mq_has_request() through the tag iterator. */
struct rq_iter_data {
        /* hardware queue whose requests are being looked for */
        struct blk_mq_hw_ctx *hctx;
        /* set to true once a request belonging to @hctx has been seen */
        bool has_rq;
};

/*
 * Tag iterator callback: when @rq belongs to the hctx named in @data (a
 * struct rq_iter_data), record that and return false; otherwise return true.
 */
static bool blk_mq_has_request(struct request *rq, void *data)
{
        struct rq_iter_data *iter = data;

        if (rq->mq_hctx == iter->hctx) {
                iter->has_rq = true;
                return false;
        }

        return true;
}

static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
{
        struct blk_mq_tags *tags = hctx->sched_tags ?
                        hctx->sched_tags : hctx->tags;
        struct rq_iter_data data = {
                .hctx        = hctx,
        };

        blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
        return data.has_rq;
}

/*
 * Return true when some online CPU other than @this_cpu is mapped to @hctx.
 */
static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx,
                unsigned int this_cpu)
{
        int cpu;

        /*
         * hctx->cpumask has to rule out isolated CPUs, yet userspace may
         * still submit IO from those CPUs. So consult the queue map to find
         * out whether every CPU mapped to this hctx is offline.
         */
        for_each_online_cpu(cpu) {
                struct blk_mq_hw_ctx *mapped = blk_mq_map_queue_type(
                                hctx->queue, hctx->type, cpu);

                /* An online CPU of this hctx that is not @this_cpu. */
                if (mapped == hctx && this_cpu != cpu)
                        return true;
        }

        return false;
}

/*
 * CPU hotplug callback for @cpu going offline.
 *
 * If no other online CPU is mapped to the hctx, mark it inactive and wait
 * until it has no more requests. Always returns 0.
 */
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
{
        struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
                        struct blk_mq_hw_ctx, cpuhp_online);

        if (blk_mq_hctx_has_online_cpu(hctx, cpu))
                return 0;

        /*
         * Prevent new request from being allocated on the current hctx.
         *
         * The smp_mb__after_atomic() Pairs with the implied barrier in
         * test_and_set_bit_lock in sbitmap_get().  Ensures the inactive flag is
         * seen once we return from the tag allocator.
         */
        set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
        smp_mb__after_atomic();

        /*
         * If no queue reference can be taken, the queue has been frozen and
         * there are no requests to wait for.
         */
        if (!percpu_ref_tryget(&hctx->queue->q_usage_counter))
                return 0;

        /* Poll until no request belongs to this hctx any more. */
        while (blk_mq_hctx_has_requests(hctx))
                msleep(5);
        percpu_ref_put(&hctx->queue->q_usage_counter);

        return 0;
}

/*
 * Check whether @cpu is mapped to @hctx according to the queue map.
 *
 * hctx->cpumask excludes isolated CPUs and is meant only for scheduling the
 * kworker. Any other code that needs to know whether a CPU belongs to a
 * given hctx should call this helper instead.
 */
static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu,
                const struct blk_mq_hw_ctx *hctx)
{
        return blk_mq_map_queue_type(hctx->queue, hctx->type, cpu) == hctx;
}

/*
 * CPU hotplug callback for @cpu coming online: clear the inactive flag of
 * the hctx when @cpu is mapped to it. Always returns 0.
 */
static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
{
        struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
                        struct blk_mq_hw_ctx, cpuhp_online);

        if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx))
                return 0;

        clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
        return 0;
}

/*
 * 'cpu' is going away. splice any existing rq_list entries from this
 * software queue to the hw queue dispatch list, and ensure that it
 * gets run.
 */
static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
{
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        LIST_HEAD(tmp);
        enum hctx_type type;

        hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
        if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx))
                return 0;

        ctx = __blk_mq_get_ctx(hctx->queue, cpu);
        type = hctx->type;

        spin_lock(&ctx->lock);
        if (!list_empty(&ctx->rq_lists[type])) {
                list_splice_init(&ctx->rq_lists[type], &tmp);
                blk_mq_hctx_clear_pending(hctx, ctx);
        }
        spin_unlock(&ctx->lock);

        if (list_empty(&tmp))
                return 0;

        spin_lock(&hctx->lock);
        list_splice_tail_init(&tmp, &hctx->dispatch);
        spin_unlock(&hctx->lock);

        blk_mq_run_hw_queue(hctx, true);
        return 0;
}

static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
{
        if (!(hctx->flags & BLK_MQ_F_STACKING))
                cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
                                                    &hctx->cpuhp_online);
        cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
                                            &hctx->cpuhp_dead);
}

/*
 * Scrub every reference to @flush_rq out of tags->rqs[] before the hw
 * queue is freed, so that nobody can reach the flush request through the
 * tag table afterwards (potential use-after-free otherwise).
 *
 * @tags:        tag table to scrub; may be NULL
 * @queue_depth: number of rqs[] slots to visit
 * @flush_rq:    the flush request whose references are dropped
 */
static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
                unsigned int queue_depth, struct request *flush_rq)
{
        unsigned long irq_flags;
        int slot = 0;

        /* The hw queue may not be mapped yet, in which case there is no table */
        if (!tags)
                return;

        WARN_ON_ONCE(req_ref_read(flush_rq) != 0);

        /* Only slots that still point at flush_rq are reset to NULL. */
        while (slot < queue_depth) {
                cmpxchg(&tags->rqs[slot], flush_rq, NULL);
                slot++;
        }

        /*
         * Wait until all pending iteration is done: taking and dropping
         * ->lock guarantees the cleared references are observed once the
         * lock has been released.
         */
        spin_lock_irqsave(&tags->lock, irq_flags);
        spin_unlock_irqrestore(&tags->lock, irq_flags);
}

/* hctx->ctxs will be freed in queue's release handler */
static void blk_mq_exit_hctx(struct request_queue *q,
                struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        struct request *flush_rq = hctx->fq->flush_rq;

        if (blk_mq_hw_queue_mapped(hctx))
                blk_mq_tag_idle(hctx);

        if (blk_queue_init_done(q))
                blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
                                set->queue_depth, flush_rq);
        if (set->ops->exit_request)
                set->ops->exit_request(set, flush_rq, hctx_idx);

        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);

        blk_mq_remove_cpuhp(hctx);

        xa_erase(&q->hctx_table, hctx_idx);

        spin_lock(&q->unused_hctx_lock);
        list_add(&hctx->hctx_list, &q->unused_hctx_list);
        spin_unlock(&q->unused_hctx_lock);
}

/*
 * Run blk_mq_exit_hctx() on the hardware queues of @q, walking them in
 * iteration order and stopping when the iteration reaches index @nr_queue.
 */
static void blk_mq_exit_hw_queues(struct request_queue *q,
                struct blk_mq_tag_set *set, int nr_queue)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long idx;

        queue_for_each_hw_ctx(q, hctx, idx) {
                if (idx == nr_queue)
                        break;

                blk_mq_exit_hctx(q, set, hctx, idx);
        }
}

static int blk_mq_init_hctx(struct request_queue *q,
                struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
        hctx->queue_num = hctx_idx;

        if (!(hctx->flags & BLK_MQ_F_STACKING))
                cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
                                &hctx->cpuhp_online);
        cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);

        hctx->tags = set->tags[hctx_idx];

        if (set->ops->init_hctx &&
            set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
                goto unregister_cpu_notifier;

        if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
                                hctx->numa_node))
                goto exit_hctx;

        if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
                goto exit_flush_rq;

        return 0;

 exit_flush_rq:
        if (set->ops->exit_request)
                set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
 exit_hctx:
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
 unregister_cpu_notifier:
        blk_mq_remove_cpuhp(hctx);
        return -1;
}

/*
 * Allocate a hardware queue context for @q and initialise everything that
 * does not depend on its final index: cpumask, locks, lists, the run work,
 * the per-CPU ctx pointer array, the ctx bitmap, the dispatch wait entry,
 * the flush queue and the kobject.
 *
 * @q:    queue the hctx will belong to
 * @set:  tag set supplying flags, cmd_size and the fallback NUMA node
 * @node: preferred NUMA node, or NUMA_NO_NODE
 *
 * Returns the new hctx, or NULL if any allocation fails (everything
 * allocated up to that point is released again).
 */
static struct blk_mq_hw_ctx *
blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
                int node)
{
        struct blk_mq_hw_ctx *hctx;
        gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;

        /* Note: these two allocations use @node as passed in. */
        hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
        if (!hctx)
                goto fail_alloc_hctx;

        if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
                goto free_hctx;

        atomic_set(&hctx->nr_active, 0);
        /* From here on fall back to the tag set's node if none was given. */
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
        hctx->numa_node = node;

        INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
        spin_lock_init(&hctx->lock);
        INIT_LIST_HEAD(&hctx->dispatch);
        hctx->queue = q;
        /* The shared-tag flag is not inherited from the set here. */
        hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;

        INIT_LIST_HEAD(&hctx->hctx_list);

        /*
         * Allocate space for all possible cpus to avoid allocation at
         * runtime
         */
        hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
                        gfp, node);
        if (!hctx->ctxs)
                goto free_cpumask;

        if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
                                gfp, node, false, false))
                goto free_ctxs;
        hctx->nr_ctx = 0;

        spin_lock_init(&hctx->dispatch_wait_lock);
        init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
        INIT_LIST_HEAD(&hctx->dispatch_wait.entry);

        hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
        if (!hctx->fq)
                goto free_bitmap;

        blk_mq_hctx_kobj_init(hctx);

        return hctx;

        /* Unwind in reverse order of allocation. */
 free_bitmap:
        sbitmap_free(&hctx->ctx_map);
 free_ctxs:
        kfree(hctx->ctxs);
 free_cpumask:
        free_cpumask_var(hctx->cpumask);
 free_hctx:
        kfree(hctx);
 fail_alloc_hctx:
        return NULL;
}

static void blk_mq_init_cpu_queues(struct request_queue *q,
                                   unsigned int nr_hw_queues)
{
        struct blk_mq_tag_set *set = q->tag_set;
        unsigned int i, j;

        for_each_possible_cpu(i) {
                struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
                struct blk_mq_hw_ctx *hctx;
                int k;

                __ctx->cpu = i;
                spin_lock_init(&__ctx->lock);
                for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
                        INIT_LIST_HEAD(&__ctx->rq_lists[k]);

                __ctx->queue = q;

                /*
                 * Set local node, IFF we have more than one hw queue. If
                 * not, we remain on the home node of the device
                 */
                for (j = 0; j < set->nr_maps; j++) {
                        hctx = blk_mq_map_queue_type(q, j, i);
                        if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
                                hctx->numa_node = cpu_to_node(i);
                }
        }
}

/*
 * Allocate a tag map of @depth entries for hardware queue @hctx_idx and
 * the requests that go with it.
 *
 * Returns the tag map, or NULL if either allocation fails; a map whose
 * requests could not be allocated is freed again before returning.
 */
struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
                                             unsigned int hctx_idx,
                                             unsigned int depth)
{
        struct blk_mq_tags *tags =
                blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);

        if (!tags)
                return NULL;

        if (blk_mq_alloc_rqs(set, tags, hctx_idx, depth) == 0)
                return tags;

        blk_mq_free_rq_map(tags);
        return NULL;
}

static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
                                       int hctx_idx)
{
        if (blk_mq_is_shared_tags(set->flags)) {
                set->tags[hctx_idx] = set->shared_tags;

                return true;
        }

        set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
                                                       set->queue_depth);

        return set->tags[hctx_idx];
}

/*
 * Release the requests of @tags and then the tag map itself.
 * A NULL @tags is accepted and ignored.
 */
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
                             struct blk_mq_tags *tags,
                             unsigned int hctx_idx)
{
        if (!tags)
                return;

        blk_mq_free_rqs(set, tags, hctx_idx);
        blk_mq_free_rq_map(tags);
}

static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
                                      unsigned int hctx_idx)
{
        if (!blk_mq_is_shared_tags(set->flags))
                blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);

        set->tags[hctx_idx] = NULL;
}

/*
 * Rebuild the mapping between the per-CPU software queues of @q and its
 * hardware queues from the tag set's queue maps.
 *
 * The work is done in three passes:
 *  1. reset cpumask, nr_ctx and dispatch_from of every hctx;
 *  2. for every possible CPU and every map, resolve the hctx, record it in
 *     ctx->hctxs[] and append the ctx to hctx->ctxs[], allocating tags for
 *     a hardware queue index that has none yet;
 *  3. for every hctx, either drop the tags of a queue nobody maps to, or
 *     publish set->tags[i] in hctx->tags, size ctx_map to the number of
 *     mapped ctxs, strip isolated CPUs from hctx->cpumask and seed the
 *     round-robin CPU selection.
 */
static void blk_mq_map_swqueue(struct request_queue *q)
{
        unsigned int j, hctx_idx;
        unsigned long i;
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        struct blk_mq_tag_set *set = q->tag_set;

        /* Pass 1: forget the previous mapping. */
        queue_for_each_hw_ctx(q, hctx, i) {
                cpumask_clear(hctx->cpumask);
                hctx->nr_ctx = 0;
                hctx->dispatch_from = NULL;
        }

        /*
         * Map software to hardware queues.
         *
         * If the cpu isn't present, the cpu is mapped to first hctx.
         */
        for_each_possible_cpu(i) {

                ctx = per_cpu_ptr(q->queue_ctx, i);
                for (j = 0; j < set->nr_maps; j++) {
                        /* A map without queues falls back to the default type. */
                        if (!set->map[j].nr_queues) {
                                ctx->hctxs[j] = blk_mq_map_queue_type(q,
                                                HCTX_TYPE_DEFAULT, i);
                                continue;
                        }
                        hctx_idx = set->map[j].mq_map[i];
                        /* unmapped hw queue can be remapped after CPU topo changed */
                        if (!set->tags[hctx_idx] &&
                            !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
                                /*
                                 * If tags initialization fail for some hctx,
                                 * that hctx won't be brought online.  In this
                                 * case, remap the current ctx to hctx[0] which
                                 * is guaranteed to always have tags allocated
                                 */
                                set->map[j].mq_map[i] = 0;
                        }

                        hctx = blk_mq_map_queue_type(q, j, i);
                        ctx->hctxs[j] = hctx;
                        /*
                         * If the CPU is already set in the mask, then we've
                         * mapped this one already. This can happen if
                         * devices share queues across queue maps.
                         */
                        if (cpumask_test_cpu(i, hctx->cpumask))
                                continue;

                        /* First time this CPU is seen for this hctx: link them. */
                        cpumask_set_cpu(i, hctx->cpumask);
                        hctx->type = j;
                        ctx->index_hw[hctx->type] = hctx->nr_ctx;
                        hctx->ctxs[hctx->nr_ctx++] = ctx;

                        /*
                         * If the nr_ctx type overflows, we have exceeded the
                         * amount of sw queues we can support.
                         */
                        BUG_ON(!hctx->nr_ctx);
                }

                /* Map types the set does not provide use the default queue. */
                for (; j < HCTX_MAX_TYPES; j++)
                        ctx->hctxs[j] = blk_mq_map_queue_type(q,
                                        HCTX_TYPE_DEFAULT, i);
        }

        /* Pass 3: finalise each hardware queue. */
        queue_for_each_hw_ctx(q, hctx, i) {
                int cpu;

                /*
                 * If no software queues are mapped to this hardware queue,
                 * disable it and free the request entries.
                 */
                if (!hctx->nr_ctx) {
                        /* Never unmap queue 0.  We need it as a
                         * fallback in case of a new remap fails
                         * allocation
                         */
                        if (i)
                                __blk_mq_free_map_and_rqs(set, i);

                        hctx->tags = NULL;
                        continue;
                }

                hctx->tags = set->tags[i];
                WARN_ON(!hctx->tags);

                /*
                 * Set the map size to the number of mapped software queues.
                 * This is more accurate and more efficient than looping
                 * over all possibly mapped software queues.
                 */
                sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);

                /*
                 * Rule out isolated CPUs from hctx->cpumask to avoid
                 * running block kworker on isolated CPUs
                 */
                for_each_cpu(cpu, hctx->cpumask) {
                        if (cpu_is_isolated(cpu))
                                cpumask_clear_cpu(cpu, hctx->cpumask);
                }

                /*
                 * Initialize batch roundrobin counts
                 */
                hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }
}

/*
 * Set or clear BLK_MQ_F_TAG_QUEUE_SHARED on every hardware queue of @q.
 * When clearing, each hctx is first passed to blk_mq_tag_idle().
 *
 * Caller needs to ensure that we're either frozen/quiesced, or that
 * the queue isn't live yet.
 */
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long idx;

        queue_for_each_hw_ctx(q, hctx, idx) {
                if (!shared) {
                        blk_mq_tag_idle(hctx);
                        hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
                        continue;
                }

                hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
        }
}

/*
 * Propagate the shared/unshared state to every queue on set->tag_list,
 * freezing each queue around the update.
 *
 * Must be called with set->tag_list_lock held.
 */
static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
                                         bool shared)
{
        struct request_queue *queue;

        lockdep_assert_held(&set->tag_list_lock);

        list_for_each_entry(queue, &set->tag_list, tag_set_list) {
                blk_mq_freeze_queue(queue);
                queue_set_hctx_shared(queue, shared);
                blk_mq_unfreeze_queue(queue);
        }
}

/*
 * Unlink @q from its tag set's queue list.  If exactly one queue remains
 * afterwards, the set stops being shared and the remaining queue is
 * updated accordingly.
 */
static void blk_mq_del_queue_tag_set(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;
        struct mutex *list_lock = &set->tag_list_lock;

        mutex_lock(list_lock);

        list_del(&q->tag_set_list);
        if (list_is_singular(&set->tag_list)) {
                /* just transitioned to unshared: fix up the set ... */
                set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
                /* ... and the one queue still using it */
                blk_mq_update_tag_set_shared(set, false);
        }

        mutex_unlock(list_lock);

        /* Leave the queue's list node in a clean, self-linked state. */
        INIT_LIST_HEAD(&q->tag_set_list);
}

/*
 * Link @q onto @set's queue list.  Adding a queue to a set that already
 * has one and is not yet marked shared flips the set (and the queues
 * already on it) to shared; @q itself is marked shared whenever the set is.
 */
static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
                                     struct request_queue *q)
{
        struct mutex *list_lock = &set->tag_list_lock;

        mutex_lock(list_lock);

        /*
         * Check to see if we're transitioning to shared (from 1 to 2 queues).
         */
        if (!(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
            !list_empty(&set->tag_list)) {
                set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
                /* bring the queues already on the list up to date */
                blk_mq_update_tag_set_shared(set, true);
        }

        if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
                queue_set_hctx_shared(q, true);

        list_add_tail(&q->tag_set_list, &set->tag_list);

        mutex_unlock(list_lock);
}

/* All allocations will be freed in release handler of q->mq_kobj */
static int blk_mq_alloc_ctxs(struct request_queue *q)
{
        struct blk_mq_ctxs *ctxs;
        int cpu;

        ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
        if (!ctxs)
                return -ENOMEM;

        ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
        if (!ctxs->queue_ctx)
                goto fail;

        for_each_possible_cpu(cpu) {
                struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
                ctx->ctxs = ctxs;
        }

        q->mq_kobj = &ctxs->kobj;
        q->queue_ctx = ctxs->queue_ctx;

        return 0;
 fail:
        kfree(ctxs);
        return -ENOMEM;
}

/*
 * It is the actual release handler for mq, but we do it from
 * request queue's release handler for avoiding use-after-free
 * and headache because q->mq_kobj shouldn't have been introduced,
 * but we can't group ctx/kctx kobj without it.
 *
 * Drops the kobject reference of every hctx parked on
 * q->unused_hctx_list, destroys q->hctx_table and finally tears down the
 * mq sysfs objects.
 */
void blk_mq_release(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx, *tmp;
        unsigned long idx;

        /* Any hctx still in the table should already be on a list. */
        queue_for_each_hw_ctx(q, hctx, idx)
                WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));

        /* all hctx are in .unused_hctx_list now */
        list_for_each_entry_safe(hctx, tmp, &q->unused_hctx_list, hctx_list) {
                list_del_init(&hctx->hctx_list);
                kobject_put(&hctx->kobj);
        }

        xa_destroy(&q->hctx_table);

        /*
         * release .mq_kobj and sw queue's kobject now because
         * both share lifetime with request queue.
         */
        blk_mq_sysfs_deinit(q);
}

/*
 * Allocate a request queue on @set's NUMA node and initialise it for
 * blk-mq use with @set.
 *
 * @set:       tag set the queue will use
 * @lim:       queue limits, or NULL to use zero-initialised defaults
 * @queuedata: stored in q->queuedata
 *
 * Returns the queue, or an ERR_PTR() on failure.
 */
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
                struct queue_limits *lim, void *queuedata)
{
        struct queue_limits default_lim = { };
        struct request_queue *q;
        int ret;

        if (!lim)
                lim = &default_lim;

        q = blk_alloc_queue(lim, set->numa_node);
        if (IS_ERR(q))
                return q;

        q->queuedata = queuedata;

        ret = blk_mq_init_allocated_queue(set, q);
        if (!ret)
                return q;

        /* Initialisation failed: give back the reference we just got. */
        blk_put_queue(q);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL(blk_mq_alloc_queue);

/**
 * blk_mq_destroy_queue - shutdown a request queue
 * @q: request queue to shutdown
 *
 * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future
 * requests will be failed with -ENODEV. The caller is responsible for dropping
 * the reference from blk_mq_alloc_queue() by calling blk_put_queue().
 *
 * Context: can sleep
 */
void blk_mq_destroy_queue(struct request_queue *q)
{
        /* Warn (once) about non-mq queues and queues that are still registered. */
        WARN_ON_ONCE(!queue_is_mq(q));
        WARN_ON_ONCE(blk_queue_registered(q));

        might_sleep();

        /* Mark the queue dying, start draining it and wait for the freeze. */
        blk_queue_flag_set(QUEUE_FLAG_DYING, q);
        blk_queue_start_drain(q);
        blk_mq_freeze_queue_wait(q);

        /* Sync the queue and cancel its work before the hw queues are torn down. */
        blk_sync_queue(q);
        blk_mq_cancel_work_sync(q);
        blk_mq_exit_queue(q);
}
EXPORT_SYMBOL(blk_mq_destroy_queue);

/*
 * Allocate a blk-mq request queue on @set together with a gendisk that
 * owns it (GD_OWNS_QUEUE is set in disk->state).
 *
 * Returns the disk, the queue allocation's error as an ERR_PTR(), or
 * ERR_PTR(-ENOMEM) if the disk cannot be allocated; in the latter case
 * the queue is destroyed and its reference dropped.
 */
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
                struct queue_limits *lim, void *queuedata,
                struct lock_class_key *lkclass)
{
        struct request_queue *q = blk_mq_alloc_queue(set, lim, queuedata);
        struct gendisk *disk;

        if (IS_ERR(q))
                return ERR_CAST(q);

        disk = __alloc_disk_node(q, set->numa_node, lkclass);
        if (disk) {
                set_bit(GD_OWNS_QUEUE, &disk->state);
                return disk;
        }

        blk_mq_destroy_queue(q);
        blk_put_queue(q);
        return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(__blk_mq_alloc_disk);

/*
 * Allocate a gendisk for the already existing queue @q.  A reference on
 * the queue is taken first and dropped again if the disk allocation
 * fails.
 *
 * Returns the disk, or NULL if the reference could not be taken or the
 * allocation failed.
 */
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
                struct lock_class_key *lkclass)
{
        struct gendisk *disk = NULL;

        if (blk_get_queue(q)) {
                disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass);
                if (!disk)
                        blk_put_queue(q);
        }

        return disk;
}
EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue);

/*
 * Obtain a hctx for hardware queue @hctx_idx of @q and initialise it.
 *
 * A hctx parked on q->unused_hctx_list whose numa_node equals @node is
 * reused when available; otherwise a fresh one is allocated.
 *
 * Returns the initialised hctx, or NULL if allocation or initialisation
 * fails (the hctx's kobject reference is dropped in the latter case).
 */
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
                struct blk_mq_tag_set *set, struct request_queue *q,
                int hctx_idx, int node)
{
        struct blk_mq_hw_ctx *hctx = NULL, *cand;

        /* reuse dead hctx first */
        spin_lock(&q->unused_hctx_lock);
        list_for_each_entry(cand, &q->unused_hctx_list, hctx_list) {
                if (cand->numa_node != node)
                        continue;

                /* Safe to unlink here: the walk stops right after. */
                list_del_init(&cand->hctx_list);
                hctx = cand;
                break;
        }
        spin_unlock(&q->unused_hctx_lock);

        if (!hctx) {
                hctx = blk_mq_alloc_hctx(q, set, node);
                if (!hctx)
                        return NULL;
        }

        if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) {
                kobject_put(&hctx->kobj);
                return NULL;
        }

        return hctx;
}

/*
 * (Re)build the hardware queue contexts of @q so that they match
 * set->nr_hw_queues.
 *
 * For every index below set->nr_hw_queues an existing hctx is exited and
 * a hctx for the index's NUMA node is set up in its place.  If that fails
 * and there was a previous hctx, one on the previous node is set up
 * instead; if there was none, the loop stops early.
 *
 * q->nr_hw_queues is only updated when every index succeeded.  Finally,
 * every hctx in q->hctx_table at or above the resulting count is exited.
 * The whole operation runs under q->sysfs_lock.
 */
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                                                struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i, j;

        /* protect against switching io scheduler  */
        mutex_lock(&q->sysfs_lock);
        for (i = 0; i < set->nr_hw_queues; i++) {
                /* old_node is only assigned, and only read, when old_hctx exists */
                int old_node;
                int node = blk_mq_get_hctx_node(set, i);
                struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);

                if (old_hctx) {
                        old_node = old_hctx->numa_node;
                        blk_mq_exit_hctx(q, set, old_hctx, i);
                }

                if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
                        /* A brand new index failed: give up on growing. */
                        if (!old_hctx)
                                break;
                        pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
                                        node, old_node);
                        hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
                        WARN_ON_ONCE(!hctx);
                }
        }
        /*
         * Increasing nr_hw_queues fails. Free the newly allocated
         * hctxs and keep the previous q->nr_hw_queues.
         */
        if (i != set->nr_hw_queues) {
                j = q->nr_hw_queues;
        } else {
                j = i;
                q->nr_hw_queues = set->nr_hw_queues;
        }

        /* Exit every hctx whose index is at or beyond the final count. */
        xa_for_each_start(&q->hctx_table, j, hctx, j)
                blk_mq_exit_hctx(q, set, hctx, j);
        mutex_unlock(&q->sysfs_lock);
}

static void blk_mq_update_poll_flag(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;

        if (set->nr_maps > HCTX_TYPE_POLL &&
            set->map[HCTX_TYPE_POLL].nr_queues)
                blk_queue_flag_set(QUEUE_FLAG_POLL, q);
        else
                blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
}

/*
 * Turn the already allocated request queue @q into a blk-mq queue backed
 * by @set: allocate the software queues, create the hardware queue
 * contexts, initialise the timeout/requeue machinery, register @q with the
 * tag set and build the software-to-hardware queue mapping.
 *
 * Returns 0 on success, or -ENOMEM if the software queues cannot be
 * allocated or no hardware queue could be set up.  On failure q->mq_ops is
 * reset to NULL.
 */
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                struct request_queue *q)
{
        /* mark the queue as mq asap */
        q->mq_ops = set->ops;

        if (blk_mq_alloc_ctxs(q))
                goto err_exit;

        /* init q->mq_kobj and sw queues' kobjects */
        blk_mq_sysfs_init(q);

        INIT_LIST_HEAD(&q->unused_hctx_list);
        spin_lock_init(&q->unused_hctx_lock);

        xa_init(&q->hctx_table);

        /* q->nr_hw_queues stays 0 if not every hctx could be set up. */
        blk_mq_realloc_hw_ctxs(set, q);
        if (!q->nr_hw_queues)
                goto err_hctxs;

        INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
        /* Use the set's timeout if it has one, otherwise 30 seconds. */
        blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);

        q->tag_set = set;

        q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
        blk_mq_update_poll_flag(q);

        INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
        INIT_LIST_HEAD(&q->flush_list);
        INIT_LIST_HEAD(&q->requeue_list);
        spin_lock_init(&q->requeue_lock);

        q->nr_requests = set->queue_depth;

        blk_mq_init_cpu_queues(q, set->nr_hw_queues);
        blk_mq_add_queue_tag_set(set, q);
        blk_mq_map_swqueue(q);
        return 0;

err_hctxs:
        blk_mq_release(q);
err_exit:
        q->mq_ops = NULL;
        return -ENOMEM;
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);

/*
 * Exit all hardware queues of @q and detach it from its tag set.
 *
 * tags can _not_ be used after returning from blk_mq_exit_queue
 */
void blk_mq_exit_queue(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;
        const int nr_queues = set->nr_hw_queues;

        /*
         * Order matters: exiting the hw queues checks
         * hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED, and removing the queue
         * from the tag set may clear that flag in hctx->flags.
         */
        blk_mq_exit_hw_queues(q, set, nr_queues);
        blk_mq_del_queue_tag_set(q);
}

/*
 * Allocate the tag maps and requests for every hardware queue of @set at
 * the current set->queue_depth.  With shared tags, the single shared map
 * is allocated first.
 *
 * Returns 0 on success.  On failure everything allocated by this call is
 * released again and -ENOMEM is returned.
 */
static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
        int idx;

        if (blk_mq_is_shared_tags(set->flags)) {
                set->shared_tags = blk_mq_alloc_map_and_rqs(set,
                                                BLK_MQ_NO_HCTX_IDX,
                                                set->queue_depth);
                if (!set->shared_tags)
                        return -ENOMEM;
        }

        for (idx = 0; idx < set->nr_hw_queues; idx++) {
                if (!__blk_mq_alloc_map_and_rqs(set, idx))
                        break;
                cond_resched();
        }

        /* The loop ran to completion: every queue has its tags. */
        if (idx == set->nr_hw_queues)
                return 0;

        /* Queue @idx failed; release the ones before it, newest first. */
        while (idx-- > 0)
                __blk_mq_free_map_and_rqs(set, idx);

        if (blk_mq_is_shared_tags(set->flags)) {
                blk_mq_free_map_and_rqs(set, set->shared_tags,
                                        BLK_MQ_NO_HCTX_IDX);
        }

        return -ENOMEM;
}

/*
 * Allocate the request maps associated with this tag_set. Note that this
 * may reduce the depth asked for, if memory is tight: after each failed
 * attempt the depth is halved, until it would drop below
 * set->reserved_tags + BLK_MQ_TAG_MIN.  set->queue_depth will be updated
 * to reflect the allocated depth.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
{
        const unsigned int requested = set->queue_depth;
        int err;

        for (;;) {
                err = __blk_mq_alloc_rq_maps(set);
                if (!err)
                        break;

                set->queue_depth >>= 1;
                if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
                        err = -ENOMEM;
                        break;
                }
                if (!set->queue_depth)
                        break;
        }

        if (err || !set->queue_depth) {
                pr_err("blk-mq: failed to allocate request map\n");
                return -ENOMEM;
        }

        if (set->queue_depth != requested) {
                pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
                                                requested, set->queue_depth);
        }

        return 0;
}

/*
 * Recompute the CPU to hardware queue maps of @set, using the driver's
 * ->map_queues callback when it has one and blk_mq_map_queues() on the
 * default map otherwise (only legal with a single map).
 */
static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
        int map;

        /*
         * blk_mq_map_queues() and multiple .map_queues() implementations
         * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
         * number of hardware queues.
         */
        if (set->nr_maps == 1)
                set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;

        if (!set->ops->map_queues) {
                BUG_ON(set->nr_maps > 1);
                blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
                return;
        }

        /*
         * transport .map_queues is usually done in the following
         * way:
         *
         * for (queue = 0; queue < set->nr_hw_queues; queue++) {
         *         mask = get_cpu_mask(queue)
         *         for_each_cpu(cpu, mask)
         *                 set->map[x].mq_map[cpu] = queue;
         * }
         *
         * When we need to remap, the table has to be cleared for
         * killing stale mapping since one CPU may not be mapped
         * to any hw queue.
         */
        for (map = 0; map < set->nr_maps; map++)
                blk_mq_clear_mq_map(&set->map[map]);

        set->ops->map_queues(set);
}

/*
 * Resize @set to @new_nr_hw_queues hardware queues.
 *
 * When growing, set->tags[] is replaced by a larger array carrying over
 * the existing entries, and tags are allocated for each added index.
 * When not growing, only set->nr_hw_queues is updated.
 *
 * Returns 0 on success.  On -ENOMEM set->nr_hw_queues is left unchanged
 * and tags allocated for added indices are released; the enlarged array,
 * if it was already installed, stays in place.
 */
static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
                                       int new_nr_hw_queues)
{
        struct blk_mq_tags **grown;
        int idx;

        /* Nothing to allocate unless the queue count goes up. */
        if (new_nr_hw_queues <= set->nr_hw_queues)
                goto commit;

        grown = kcalloc_node(new_nr_hw_queues, sizeof(*grown),
                             GFP_KERNEL, set->numa_node);
        if (!grown)
                return -ENOMEM;

        if (set->tags) {
                memcpy(grown, set->tags,
                       set->nr_hw_queues * sizeof(*set->tags));
        }
        kfree(set->tags);
        set->tags = grown;

        for (idx = set->nr_hw_queues; idx < new_nr_hw_queues; idx++) {
                if (__blk_mq_alloc_map_and_rqs(set, idx)) {
                        cond_resched();
                        continue;
                }

                /* Roll back only the indices added by this call. */
                while (--idx >= set->nr_hw_queues)
                        __blk_mq_free_map_and_rqs(set, idx);
                return -ENOMEM;
        }

commit:
        set->nr_hw_queues = new_nr_hw_queues;
        return 0;
}

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
 * requested depth down, if it's too large. In that case, the set
 * value will be stored in set->queue_depth.
 *
 * Besides validating and clamping the caller's parameters this allocates,
 * in order: the SRCU structure (BLK_MQ_F_BLOCKING only), the set->tags[]
 * array, one mq_map per queue map, and finally the tag maps and requests.
 *
 * Returns 0 on success or a negative errno.  On failure everything
 * allocated here is released and the corresponding pointers in @set
 * (tags, mq_map, srcu) are reset to NULL so that no dangling pointer is
 * left behind in the caller-owned structure.
 */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
        int i, ret;

        BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

        if (!set->nr_hw_queues)
                return -EINVAL;
        if (!set->queue_depth)
                return -EINVAL;
        if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
                return -EINVAL;

        if (!set->ops->queue_rq)
                return -EINVAL;

        /* get_budget and put_budget must be provided together or not at all */
        if (!set->ops->get_budget ^ !set->ops->put_budget)
                return -EINVAL;

        if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
                pr_info("blk-mq: reduced tag depth to %u\n",
                        BLK_MQ_MAX_DEPTH);
                set->queue_depth = BLK_MQ_MAX_DEPTH;
        }

        if (!set->nr_maps)
                set->nr_maps = 1;
        else if (set->nr_maps > HCTX_MAX_TYPES)
                return -EINVAL;

        /*
         * If a crashdump is active, then we are potentially in a very
         * memory constrained environment. Limit us to  64 tags to prevent
         * using too much memory.
         */
        if (is_kdump_kernel())
                set->queue_depth = min(64U, set->queue_depth);

        /*
         * There is no use for more h/w queues than cpus if we just have
         * a single map
         */
        if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
                set->nr_hw_queues = nr_cpu_ids;

        if (set->flags & BLK_MQ_F_BLOCKING) {
                set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
                if (!set->srcu)
                        return -ENOMEM;
                ret = init_srcu_struct(set->srcu);
                if (ret)
                        goto out_free_srcu;
        }

        ret = -ENOMEM;
        set->tags = kcalloc_node(set->nr_hw_queues,
                                 sizeof(struct blk_mq_tags *), GFP_KERNEL,
                                 set->numa_node);
        if (!set->tags)
                goto out_cleanup_srcu;

        for (i = 0; i < set->nr_maps; i++) {
                set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
                                                  sizeof(set->map[i].mq_map[0]),
                                                  GFP_KERNEL, set->numa_node);
                if (!set->map[i].mq_map)
                        goto out_free_mq_map;
                set->map[i].nr_queues = set->nr_hw_queues;
        }

        blk_mq_update_queue_map(set);

        ret = blk_mq_alloc_set_map_and_rqs(set);
        if (ret)
                goto out_free_mq_map;

        mutex_init(&set->tag_list_lock);
        INIT_LIST_HEAD(&set->tag_list);

        return 0;

out_free_mq_map:
        for (i = 0; i < set->nr_maps; i++) {
                kfree(set->map[i].mq_map);
                set->map[i].mq_map = NULL;
        }
        kfree(set->tags);
        set->tags = NULL;
out_cleanup_srcu:
        if (set->flags & BLK_MQ_F_BLOCKING)
                cleanup_srcu_struct(set->srcu);
out_free_srcu:
        if (set->flags & BLK_MQ_F_BLOCKING) {
                kfree(set->srcu);
                /* Same treatment as tags/mq_map above: leave nothing dangling. */
                set->srcu = NULL;
        }
        return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);

/*
 * Allocate and initialize a tagset for a simple single-queue device.
 *
 * @set is zeroed first, then filled in with one hardware queue, one queue
 * map, no NUMA node preference and the given @ops, @queue_depth and
 * @set_flags before being handed to blk_mq_alloc_tag_set(), whose return
 * value is passed through.
 */
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
                const struct blk_mq_ops *ops, unsigned int queue_depth,
                unsigned int set_flags)
{
        memset(set, 0, sizeof(*set));

        /* caller-supplied parameters */
        set->ops = ops;
        set->flags = set_flags;
        set->queue_depth = queue_depth;

        /* fixed single-queue topology */
        set->numa_node = NUMA_NO_NODE;
        set->nr_maps = 1;
        set->nr_hw_queues = 1;

        return blk_mq_alloc_tag_set(set);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);

/*
 * Release the resources held by a tag set.
 *
 * Frees the map and requests of every hardware queue index, the shared tags
 * when the set uses them, each per-map mq_map array and the tags pointer
 * array.  The freed pointers are reset to NULL.  For sets flagged
 * BLK_MQ_F_BLOCKING the SRCU structure is cleaned up and freed as well.
 */
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
        int hctx_idx, map_idx;

        for (hctx_idx = 0; hctx_idx < set->nr_hw_queues; hctx_idx++)
                __blk_mq_free_map_and_rqs(set, hctx_idx);

        if (blk_mq_is_shared_tags(set->flags))
                blk_mq_free_map_and_rqs(set, set->shared_tags,
                                        BLK_MQ_NO_HCTX_IDX);

        for (map_idx = 0; map_idx < set->nr_maps; map_idx++) {
                kfree(set->map[map_idx].mq_map);
                set->map[map_idx].mq_map = NULL;
        }

        kfree(set->tags);
        set->tags = NULL;

        /* Only blocking tag sets carry an SRCU structure. */
        if (!(set->flags & BLK_MQ_F_BLOCKING))
                return;

        cleanup_srcu_struct(set->srcu);
        kfree(set->srcu);
}
EXPORT_SYMBOL(blk_mq_free_tag_set);

int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
        struct blk_mq_tag_set *set = q->tag_set;
        struct blk_mq_hw_ctx *hctx;
        int ret;
        unsigned long i;

        if (!set)
                return -EINVAL;

        if (q->nr_requests == nr)
                return 0;

        blk_mq_freeze_queue(q);
        blk_mq_quiesce_queue(q);

        ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx->tags)
                        continue;
                /*
                 * If we're using an MQ scheduler, just update the scheduler
                 * queue depth. This is similar to what the old code would do.
                 */
                if (hctx->sched_tags) {
                        ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
                                                      nr, true);
                } else {
                        ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
                                                      false);
                }
                if (ret)
                        break;
                if (q->elevator && q->elevator->type->ops.depth_updated)
                        q->elevator->type->ops.depth_updated(hctx);
        }
        if (!ret) {
                q->nr_requests = nr;
                if (blk_mq_is_shared_tags(set->flags)) {
                        if (q->elevator)
                                blk_mq_tag_update_sched_shared_tags(q);
                        else
                                blk_mq_tag_resize_shared_tags(set, nr);
                }
        }

        blk_mq_unquiesce_queue(q);
        blk_mq_unfreeze_queue(q);

        return ret;
}

/*
 * request_queue and elevator_type pair.
 * It is used only by __blk_mq_update_nr_hw_queues() (through its helpers)
 * to cache the elevator_type associated with a request_queue while the
 * queue is temporarily switched to no scheduler.
 */
struct blk_mq_qe_pair {
        struct list_head node;          /* link in the caller's list of pairs */
        struct request_queue *q;        /* queue the cached type belongs to */
        struct elevator_type *type;     /* elevator type to switch back to */
};

/*
 * Remember the elevator type of @q in a new qe pair on @head and disable
 * the queue's elevator.
 *
 * Returns false only when the qe pair cannot be allocated.  A queue that
 * has no elevator is left alone and still yields true; nothing is added
 * to @head in that case.
 */
static bool blk_mq_elv_switch_none(struct list_head *head,
                struct request_queue *q)
{
        struct blk_mq_qe_pair *pair;

        pair = kmalloc(sizeof(*pair), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
        if (!pair)
                return false;

        /* q->elevator may only be inspected while holding ->sysfs_lock */
        mutex_lock(&q->sysfs_lock);

        if (q->elevator) {
                INIT_LIST_HEAD(&pair->node);
                pair->q = q;
                pair->type = q->elevator->type;
                /* hold a reference on the elevator, we will switch back */
                __elevator_get(pair->type);
                list_add(&pair->node, head);
                elevator_disable(q);
        } else {
                /* nothing to cache for this queue */
                kfree(pair);
        }

        mutex_unlock(&q->sysfs_lock);

        return true;
}

static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
                                                struct request_queue *q)
{
        struct blk_mq_qe_pair *qe;

        list_for_each_entry(qe, head, node)
                if (qe->q == q)
                        return qe;

        return NULL;
}

/*
 * Switch @q back to the elevator type cached for it on @head, if any.
 *
 * The qe pair is unlinked and freed before the switch; queues without a
 * cached pair are left untouched.
 */
static void blk_mq_elv_switch_back(struct list_head *head,
                                  struct request_queue *q)
{
        struct blk_mq_qe_pair *pair = blk_lookup_qe_pair(head, q);
        struct elevator_type *cached_type;

        if (!pair)
                return;

        /* Take what we need out of the pair before releasing it. */
        cached_type = pair->type;
        list_del(&pair->node);
        kfree(pair);

        mutex_lock(&q->sysfs_lock);
        elevator_switch(q, cached_type);
        /* release the reference taken in blk_mq_elv_switch_none */
        elevator_put(cached_type);
        mutex_unlock(&q->sysfs_lock);
}

/*
 * Change the number of hardware queues of @set to @nr_hw_queues for every
 * request queue on set->tag_list.
 *
 * The caller must hold set->tag_list_lock (asserted below).  The request is
 * silently ignored when the (possibly clamped) value is below 1, or when a
 * single-map set already has that many hardware queues.
 *
 * All queues sharing the set are frozen, their elevators are switched off
 * and their hctx sysfs/debugfs entries are unregistered while the tag set
 * and hardware contexts are reallocated; everything is then restored in
 * reverse order.
 */
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
                                                        int nr_hw_queues)
{
        struct request_queue *q;
        LIST_HEAD(head);        /* cached (queue, elevator type) pairs */
        int prev_nr_hw_queues = set->nr_hw_queues;
        int i;

        lockdep_assert_held(&set->tag_list_lock);

        /* A single map cannot make use of more hw queues than CPU ids. */
        if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
                nr_hw_queues = nr_cpu_ids;
        if (nr_hw_queues < 1)
                return;
        if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
                return;

        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_freeze_queue(q);
        /*
         * Switch IO scheduler to 'none', cleaning up the data associated
         * with the previous scheduler. We will switch back once we are done
         * updating the new sw to hw queue mappings.
         */
        list_for_each_entry(q, &set->tag_list, tag_set_list)
                if (!blk_mq_elv_switch_none(&head, q))
                        /*
                         * Could not allocate a qe pair: give up, restore
                         * the elevators switched so far and unfreeze.
                         */
                        goto switch_back;

        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_debugfs_unregister_hctxs(q);
                blk_mq_sysfs_unregister_hctxs(q);
        }

        /* On failure keep the old queue count and just re-register. */
        if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
                goto reregister;

fallback:
        blk_mq_update_queue_map(set);
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_realloc_hw_ctxs(set, q);
                blk_mq_update_poll_flag(q);
                /*
                 * The queue did not end up with the set's hw queue count:
                 * free the maps/requests above the previous count, restore
                 * the previous count and redo the mapping from scratch.
                 */
                if (q->nr_hw_queues != set->nr_hw_queues) {
                        /* NOTE: shadows the function-scope 'i' */
                        int i = prev_nr_hw_queues;

                        pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
                                        nr_hw_queues, prev_nr_hw_queues);
                        for (; i < set->nr_hw_queues; i++)
                                __blk_mq_free_map_and_rqs(set, i);

                        set->nr_hw_queues = prev_nr_hw_queues;
                        goto fallback;
                }
                blk_mq_map_swqueue(q);
        }

reregister:
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_sysfs_register_hctxs(q);
                blk_mq_debugfs_register_hctxs(q);
        }

switch_back:
        /* Queues without a cached pair are skipped by the helper. */
        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_elv_switch_back(&head, q);

        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_unfreeze_queue(q);

        /* Free the excess tags when nr_hw_queues shrink. */
        for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
                __blk_mq_free_map_and_rqs(set, i);
}

/*
 * Locked wrapper around __blk_mq_update_nr_hw_queues(): takes
 * set->tag_list_lock for the duration of the update.
 */
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
        struct mutex *tag_list_lock = &set->tag_list_lock;

        mutex_lock(tag_list_lock);
        __blk_mq_update_nr_hw_queues(set, nr_hw_queues);
        mutex_unlock(tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);

/*
 * Poll @hctx for completions through the driver's ->poll() callback.
 *
 * Returns the callback's value as soon as it is positive, 1 when the
 * current task is (or has been made) runnable, and 0 when polling stops
 * because the callback reported an error, BLK_POLL_ONESHOT was requested
 * or a reschedule is needed.  The task state is TASK_RUNNING on every
 * return path.
 */
static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
                         struct io_comp_batch *iob, unsigned int flags)
{
        /* task state as it was when polling started */
        long entry_state = get_current_state();
        int found;

        for (;;) {
                found = q->mq_ops->poll(hctx, iob);
                if (found > 0) {
                        __set_current_state(TASK_RUNNING);
                        return found;
                }

                /* a signal relevant to the entry state makes us runnable */
                if (signal_pending_state(entry_state, current))
                        __set_current_state(TASK_RUNNING);
                if (task_is_running(current))
                        return 1;

                if (found < 0 || (flags & BLK_POLL_ONESHOT))
                        break;

                cpu_relax();
                if (need_resched())
                        break;
        }

        __set_current_state(TASK_RUNNING);
        return 0;
}

/*
 * Poll the hardware context that @cookie selects in q->hctx_table.
 * Returns the result of blk_hctx_poll() for that context.
 */
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
                struct io_comp_batch *iob, unsigned int flags)
{
        struct blk_mq_hw_ctx *hctx;

        hctx = xa_load(&q->hctx_table, cookie);
        return blk_hctx_poll(q, hctx, iob, flags);
}

/*
 * Poll for completion of the polled request @rq.
 *
 * Returns 0 without polling when @rq is not a polled request or when no
 * reference on the queue's usage counter can be obtained; otherwise
 * returns the result of blk_hctx_poll() on the request's hardware context.
 * The queue reference is dropped again before returning.
 */
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
                unsigned int poll_flags)
{
        struct request_queue *q = rq->q;
        int found;

        /* the reference is only attempted for requests that are polled */
        if (!blk_rq_is_poll(rq) || !percpu_ref_tryget(&q->q_usage_counter))
                return 0;

        found = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags);
        blk_queue_exit(q);

        return found;
}
EXPORT_SYMBOL_GPL(blk_rq_poll);

unsigned int blk_mq_rq_cpu(struct request *rq)
{
        return rq->mq_ctx->cpu;
}
EXPORT_SYMBOL(blk_mq_rq_cpu);

void blk_mq_cancel_work_sync(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        cancel_delayed_work_sync(&q->requeue_work);

        queue_for_each_hw_ctx(q, hctx, i)
                cancel_delayed_work_sync(&hctx->run_work);
}

/*
 * Boot-time setup for blk-mq, run as a subsys initcall.
 *
 * Initializes the per-CPU completion list and call-single data for every
 * possible CPU, installs the block softirq handler and registers the CPU
 * hotplug callbacks used by this file.  Always returns 0; the return
 * values of the cpuhp_setup_state_*() calls are not checked.
 */
static int __init blk_mq_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                init_llist_head(&per_cpu(blk_cpu_done, cpu));
        }
        for_each_possible_cpu(cpu) {
                INIT_CSD(&per_cpu(blk_cpu_csd, cpu),
                         __blk_mq_complete_request_remote, NULL);
        }
        open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);

        /* only a teardown callback is supplied for the softirq state */
        cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
                                  "block/softirq:dead", NULL,
                                  blk_softirq_cpu_dead);
        /* multi-instance states for the hardware-context callbacks */
        cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                blk_mq_hctx_notify_dead);
        cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
                                blk_mq_hctx_notify_online,
                                blk_mq_hctx_notify_offline);
        return 0;
}
subsys_initcall(blk_mq_init);


































































































































   14 

   13 









    3 

    3 
    3 

    3 

    3 









    4 


    4 



























    4 


    4 

    1 

    3 


















    3 










    3 




    3 



    3 




    4 


    4 

    4 











    2 


    2 




    2 



























    2 
    2 













    1 
    1 




    2 






    2 






    2 







    2 
    2 




    2 

    2 



    2 
    2 



    5 


    5 


    4 











    1 
















    8 


    4 



    7 

























    1 

















    1 





















    2 

















































    7 







    5 





    3 


    2 
    5 


    4 

    4 






















    3 












































    1 






















    1 

    1 

















































    3 


















    4 






































    6 





    6 





































    5 














    4 

























































   14 






   14 
   11 
   14 

   15 
   15 

   14 











   13 














   14 




   13 























   14 

























   15 











   11 




   12 


   14 













   15 
   14 









   14 













   10 










































































































    2 





    1 










    2 




























    1 




    1 




















    2 
    2 



























    1 

    5 











    4 








    4 
    2 

































    4 
    5 
    4 










    5 











    5 




    4 







    5 












    1 


    1 











    5 








































































































    4 

























































   17 



   18 
   18 
    2 


















    1 




    1 
    1 



    1 







































































    1 
    1 








    1 











    1 

    1 
    1 








    1 




























































































































































































   20 



   20 



   17 











   18 




   18 






















   16 









   15 






    1 








    1 



   13 
    5 

    1 



   15 
    1 

    9 



    8 







   11 


   10 




   12 













    4 
    7 
















   10 
















    3 

    6 

   10 












    2 
































    4 



    4 
    3 

    2 


    4 

    4 



    2 

    2 



























    3 



    3 
    3 



    3 

    3 


    3 

    3 



    3 





    1 

    3 

    3 



    3 

    3 





















    4 





















    1 




    1 

    1 
















    1 
















    1 
    1 


    1 

























    7 



    9 
    9 







    7 
    2 















    9 




































    1 



    1 
    1 












    1 

    1 

    1 

    1 






    1 




    1 
    1 








    1 




    1 





    1 
    1 

    1 

















































































































































    1 











    1 



    1 
















    1 
    1 




    1 






































































































































































    1 










    1 
    1 






















































































    1 







    1 










































    1 










    1 








































    1 


























    1 
    1 


    1 
    1 






























































































































    2 






































    2 





















































    2 






































    2 


















    1 













    1 



    1 



























    2 























    2 

    2 









    2 









    2 





    2 







    2 















    2 


























    2 


    2 













    2 




























    2 

    2 

    2 
















    1 









    1 







    1 




    1 





    1 






    1 

    1 



    1 


    1 

























































































    1 










    1 











    1 











    1 




    1 












    1 






    1 


    1 
    1 







    1 


    1 

    1 












    1 

    1 


    1 












    1 


    1 




















































    2 




    2 







    1 









    2 






















    1 





























































    2 





    2 




    2 

















































    1 


    1 































































    5 






























    5 




    1 












    5 





















    4 



    5 


























    1 




























    1 

















    1 











    1 






















    4 


    4 
    4 

    4 


    1 
    4 


















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
// SPDX-License-Identifier: GPL-2.0-only
/*
 *        linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

#include "swap.h"

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem                (truncate_pagecache)
 *    ->private_lock                (__free_pte->block_dirty_folio)
 *      ->swap_lock                (exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_rwsem
 *    ->invalidate_lock                (acquired by fs in truncate path)
 *      ->i_mmap_rwsem                (truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock        (various, mainly in memory.c)
 *        ->i_pages lock        (arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->invalidate_lock                (filemap_fault)
 *      ->lock_page                (filemap_fault, access_process_vm)
 *
 *  ->i_rwsem                        (generic_perform_write)
 *    ->mmap_lock                (fault_in_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock                        (fs/fs-writeback.c)
 *    ->i_pages lock                (__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock                (vma_merge)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock        (anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock                (try_to_unmap_one)
 *    ->private_lock                (try_to_unmap_one)
 *    ->i_pages lock                (try_to_unmap_one)
 *    ->lruvec->lru_lock        (follow_page->mark_page_accessed)
 *    ->lruvec->lru_lock        (check_pte_range->isolate_lru_page)
 *    ->private_lock                (folio_remove_rmap_pte->set_page_dirty)
 *    ->i_pages lock                (folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock                (folio_remove_rmap_pte->set_page_dirty)
 *    ->inode->i_lock                (folio_remove_rmap_pte->set_page_dirty)
 *    ->memcg->move_lock        (folio_remove_rmap_pte->folio_memcg_lock)
 *    bdi.wb->list_lock                (zap_pte_range->set_page_dirty)
 *    ->inode->i_lock                (zap_pte_range->set_page_dirty)
 *    ->private_lock                (zap_pte_range->block_dirty_folio)
 */

/*
 * Hook workingset tracking into @xas for @mapping.
 *
 * DAX and shmem mappings are left alone; for every other mapping the
 * xa_state is given workingset_update_node as its update callback and
 * shadow_nodes as its lru.
 */
static void mapping_set_update(struct xa_state *xas,
                struct address_space *mapping)
{
        if (!dax_mapping(mapping) && !shmem_mapping(mapping)) {
                xas_set_update(xas, workingset_update_node);
                xas_set_lru(xas, &shadow_nodes);
        }
}

/*
 * Overwrite @folio's entry in @mapping->i_pages with @shadow.
 *
 * The xa_state is widened to the folio's order before the store, the marks
 * are re-initialised afterwards, the folio is detached from the mapping and
 * mapping->nrpages is reduced by the folio's page count.  The folio must be
 * locked.
 */
static void page_cache_delete(struct address_space *mapping,
                                   struct folio *folio, void *shadow)
{
        XA_STATE(xas, &mapping->i_pages, folio->index);
        long nr_pages;

        mapping_set_update(&xas, mapping);

        /* Operate on the folio's whole order, not just its first index. */
        xas_set_order(&xas, folio->index, folio_order(folio));
        nr_pages = folio_nr_pages(folio);

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        xas_store(&xas, shadow);
        xas_init_marks(&xas);

        /* Leave page->index set: truncation lookup relies upon it */
        folio->mapping = NULL;
        mapping->nrpages -= nr_pages;
}

/*
 * Undo the page cache statistics accounted for @folio in @mapping.
 *
 * Both removal paths in this file call this before the folio's xarray
 * entry is replaced.  A folio that is still mapped is reported as a bug;
 * hugetlb folios are skipped for the statistics part.
 */
static void filemap_unaccount_folio(struct address_space *mapping,
                struct folio *folio)
{
        long nr;

        VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
        /*
         * On kernels built without CONFIG_DEBUG_VM, report a still-mapped
         * folio loudly, taint the kernel and then try to limit the damage.
         */
        if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
                pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
                         current->comm, folio_pfn(folio));
                dump_page(&folio->page, "still mapped when deleted");
                dump_stack();
                add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

                /* Only small folios of a mapping that is going away are repaired. */
                if (mapping_exiting(mapping) && !folio_test_large(folio)) {
                        int mapcount = folio_mapcount(folio);

                        if (folio_ref_count(folio) >= mapcount + 2) {
                                /*
                                 * All vmas have already been torn down, so it's
                                 * a good bet that actually the page is unmapped
                                 * and we'd rather not leak it: if we're wrong,
                                 * another bad page check should catch it later.
                                 */
                                page_mapcount_reset(&folio->page);
                                folio_ref_sub(folio, mapcount);
                        }
                }
        }

        /* hugetlb folios do not participate in page cache accounting. */
        if (folio_test_hugetlb(folio))
                return;

        nr = folio_nr_pages(folio);

        /*
         * Every folio counts towards NR_FILE_PAGES.  Swap-backed folios
         * additionally count as NR_SHMEM, and PMD-mappable folios have a
         * THP counter of their own (shmem or file, depending on backing).
         */
        __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
        if (folio_test_swapbacked(folio)) {
                __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
                if (folio_test_pmd_mappable(folio))
                        __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
        } else if (folio_test_pmd_mappable(folio)) {
                __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
                filemap_nr_thps_dec(mapping);
        }

        /*
         * At this point folio must be either written or cleaned by
         * truncate.  Dirty folio here signals a bug and loss of
         * unwritten data - on ordinary filesystems.
         *
         * But it's harmless on in-memory filesystems like tmpfs; and can
         * occur when a driver which did get_user_pages() sets page dirty
         * before putting it, while the inode is being finally evicted.
         *
         * Below fixes dirty accounting after removing the folio entirely
         * but leaves the dirty flag set: it has no effect for truncated
         * folio and anyway will be cleared before returning folio to
         * buddy allocator.
         */
        if (WARN_ON_ONCE(folio_test_dirty(folio) &&
                         mapping_can_writeback(mapping)))
                folio_account_cleaned(folio, inode_to_wb(mapping->host));
}

/*
 * Take @folio out of the page cache of the mapping it currently belongs to,
 * leaving @shadow in its slot.  The removal is traced and the folio's
 * statistics are unaccounted first.  Dropping the page cache's references
 * on the folio is left to the caller (see filemap_remove_folio()).
 *
 * The caller has to make sure the folio is locked and that nobody else
 * uses it - or that usage is safe.  The caller must hold the i_pages lock.
 */
void __filemap_remove_folio(struct folio *folio, void *shadow)
{
        /* Read the owner up front: page_cache_delete() clears folio->mapping. */
        struct address_space *owner = folio->mapping;

        trace_mm_filemap_delete_from_page_cache(folio);
        filemap_unaccount_folio(owner, folio);
        page_cache_delete(owner, folio, shadow);
}

/*
 * Release @folio on behalf of @mapping.
 *
 * The mapping's ->free_folio hook is called when one is provided.  After
 * that one reference is dropped for a small folio, or folio_nr_pages()
 * references for a large one.
 */
void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
        void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
        int refs;

        if (free_folio)
                free_folio(folio);

        refs = folio_test_large(folio) ? folio_nr_pages(folio) : 1;
        folio_put_refs(folio, refs);
}

/**
 * filemap_remove_folio - Remove folio from page cache.
 * @folio: The folio.
 *
 * This must be called only on folios that are locked and have been
 * verified to be in the page cache.  It will never put the folio into
 * the free list because the caller has a reference on the page.
 */
void filemap_remove_folio(struct folio *folio)
{
        /* Sample the mapping now: the removal below clears folio->mapping. */
        struct address_space *mapping = folio->mapping;

        BUG_ON(!folio_test_locked(folio));
        /* The inode's i_lock is taken outside the i_pages lock. */
        spin_lock(&mapping->host->i_lock);
        xa_lock_irq(&mapping->i_pages);
        __filemap_remove_folio(folio, NULL);
        xa_unlock_irq(&mapping->i_pages);
        /* Still under i_lock: a shrinkable mapping's inode goes on the LRU. */
        if (mapping_shrinkable(mapping))
                inode_add_lru(mapping->host);
        spin_unlock(&mapping->host->i_lock);

        /* Both locks dropped: run ->free_folio and put the page cache refs. */
        filemap_free_folio(mapping, folio);
}

/*
 * page_cache_delete_batch - delete several folios from page cache
 * @mapping: the mapping to which folios belong
 * @fbatch: batch of folios to delete
 *
 * The function walks over mapping->i_pages and removes folios passed in
 * @fbatch from the mapping. The function expects @fbatch to be sorted
 * by page index and is optimised for it to be dense.
 * It tolerates holes in @fbatch (mapping entries at those indices are not
 * modified).
 *
 * The function expects the i_pages lock to be held.
 */
static void page_cache_delete_batch(struct address_space *mapping,
                             struct folio_batch *fbatch)
{
        /* Start the walk at the index of the first folio in the batch. */
        XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
        long total_pages = 0;
        /* Position in @fbatch of the next folio we expect to meet. */
        int i = 0;
        struct folio *folio;

        mapping_set_update(&xas, mapping);
        xas_for_each(&xas, folio, ULONG_MAX) {
                /* Every folio of the batch has been handled. */
                if (i >= folio_batch_count(fbatch))
                        break;

                /* A swap/dax/shadow entry got inserted? Skip it. */
                if (xa_is_value(folio))
                        continue;
                /*
                 * A page got inserted in our range? Skip it. We have our
                 * pages locked so they are protected from being removed.
                 * If we see a page whose index is higher than ours, it
                 * means our page has been removed, which shouldn't be
                 * possible because we're holding the PageLock.
                 */
                if (folio != fbatch->folios[i]) {
                        VM_BUG_ON_FOLIO(folio->index >
                                        fbatch->folios[i]->index, folio);
                        continue;
                }

                WARN_ON_ONCE(!folio_test_locked(folio));

                folio->mapping = NULL;
                /* Leave folio->index set: truncation lookup relies on it */

                i++;
                /* Unlike page_cache_delete(), no shadow entry is left behind. */
                xas_store(&xas, NULL);
                total_pages += folio_nr_pages(folio);
        }
        /* One nrpages update for the whole batch. */
        mapping->nrpages -= total_pages;
}

/*
 * Remove every folio in @fbatch from @mapping's page cache and release the
 * page cache's references on them.  An empty batch is a no-op.
 *
 * Tracing, unaccounting and the xarray removal run under
 * mapping->host->i_lock and the i_pages lock; the folios are handed to
 * filemap_free_folio() only after both locks have been dropped.
 */
void delete_from_page_cache_batch(struct address_space *mapping,
                                  struct folio_batch *fbatch)
{
        int n;

        if (folio_batch_count(fbatch) == 0)
                return;

        spin_lock(&mapping->host->i_lock);
        xa_lock_irq(&mapping->i_pages);
        for (n = 0; n < folio_batch_count(fbatch); n++) {
                trace_mm_filemap_delete_from_page_cache(fbatch->folios[n]);
                filemap_unaccount_folio(mapping, fbatch->folios[n]);
        }
        page_cache_delete_batch(mapping, fbatch);
        xa_unlock_irq(&mapping->i_pages);
        if (mapping_shrinkable(mapping))
                inode_add_lru(mapping->host);
        spin_unlock(&mapping->host->i_lock);

        for (n = 0; n < folio_batch_count(fbatch); n++)
                filemap_free_folio(mapping, fbatch->folios[n]);
}

/*
 * Report and consume the AS_EIO / AS_ENOSPC bits in @mapping->flags.
 *
 * Each bit that is found set is cleared.  Returns -EIO if AS_EIO was set,
 * otherwise -ENOSPC if AS_ENOSPC was set, otherwise 0.
 */
int filemap_check_errors(struct address_space *mapping)
{
        int err = 0;

        /*
         * Peek with test_bit() first so that test_and_clear_bit() is only
         * issued when the bit looks set.
         */
        if (test_bit(AS_ENOSPC, &mapping->flags)) {
                if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
                        err = -ENOSPC;
        }
        /* Checked last so that -EIO overrides -ENOSPC. */
        if (test_bit(AS_EIO, &mapping->flags)) {
                if (test_and_clear_bit(AS_EIO, &mapping->flags))
                        err = -EIO;
        }
        return err;
}
EXPORT_SYMBOL(filemap_check_errors);

/*
 * Look at the AS_EIO / AS_ENOSPC bits in @mapping->flags without clearing
 * them.  Returns -EIO if AS_EIO is set, otherwise -ENOSPC if AS_ENOSPC is
 * set, otherwise 0.
 */
static int filemap_check_and_keep_errors(struct address_space *mapping)
{
        if (test_bit(AS_EIO, &mapping->flags))
                return -EIO;
        return test_bit(AS_ENOSPC, &mapping->flags) ? -ENOSPC : 0;
}

/**
 * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
 * @mapping:        address space structure to write
 * @wbc:        the writeback_control controlling the writeout
 *
 * Call writepages on the mapping using the provided wbc to control the
 * writeout.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_wbc(struct address_space *mapping,
                           struct writeback_control *wbc)
{
        int ret = 0;

        /*
         * Nothing to do unless the mapping can do writeback and carries the
         * dirty tag.
         */
        if (mapping_can_writeback(mapping) &&
            mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                /* @wbc is attached to the inode only around the writeout. */
                wbc_attach_fdatawrite_inode(wbc, mapping->host);
                ret = do_writepages(mapping, wbc);
                wbc_detach_inode(wbc);
        }
        return ret;
}
EXPORT_SYMBOL(filemap_fdatawrite_wbc);

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping:        address space structure to write
 * @start:        offset in bytes where the range starts
 * @end:        offset in bytes where the range ends (inclusive)
 * @sync_mode:        enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                                loff_t end, int sync_mode)
{
        /*
         * Build a writeback_control for the byte range with the largest
         * possible page budget (LONG_MAX); members not named here are
         * zero-initialised.
         */
        struct writeback_control wbc = {
                .sync_mode = sync_mode,
                .nr_to_write = LONG_MAX,
                .range_start = start,
                .range_end = end,
        };

        return filemap_fdatawrite_wbc(mapping, &wbc);
}

/*
 * Start writeback in @sync_mode over the whole file, expressed as the byte
 * range [0, LLONG_MAX].
 */
static inline int __filemap_fdatawrite(struct address_space *mapping,
        int sync_mode)
{
        return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

/*
 * Start WB_SYNC_ALL (data integrity) writeback on all of @mapping's dirty
 * pages.  This only starts the writeout; it does not wait for it.
 * Returns 0 on success or a negative error code.
 */
int filemap_fdatawrite(struct address_space *mapping)
{
        return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

/*
 * Start WB_SYNC_ALL (data integrity) writeback on @mapping's dirty pages
 * within the byte offsets @start..@end, inclusive.  This only starts the
 * writeout; it does not wait for it.  Returns 0 on success or a negative
 * error code.
 */
int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                                loff_t end)
{
        return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:        target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_flush(struct address_space *mapping)
{
        /* Whole-file writeback in WB_SYNC_NONE mode, i.e. not data integrity. */
        return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback.
 *
 * Return: %true if at least one page exists in the specified range,
 * %false otherwise.
 */
bool filemap_range_has_page(struct address_space *mapping,
                           loff_t start_byte, loff_t end_byte)
{
        struct folio *folio;
        XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
        pgoff_t last_index = end_byte >> PAGE_SHIFT;

        /* An inverted range is empty. */
        if (end_byte < start_byte)
                return false;

        rcu_read_lock();
        /*
         * Keep searching past retry entries and past value entries (shadow
         * entries don't count) until a folio turns up or the range is
         * exhausted.
         *
         * We don't need to try to pin this page; we're about to release the
         * RCU lock anyway.  It is enough to know that there was a page here
         * recently.
         */
        do {
                folio = xas_find(&xas, last_index);
        } while (xas_retry(&xas, folio) || xa_is_value(folio));
        rcu_read_unlock();

        return folio != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);

/*
 * Wait for writeback on the folios of @mapping that carry
 * PAGECACHE_TAG_WRITEBACK and lie in the page range derived from
 * @start_byte..@end_byte, clearing each folio's error flag after the wait.
 *
 * The mapping's own error state is neither read nor changed here; callers
 * check it afterwards with the helper of their choice.
 */
static void __filemap_fdatawait_range(struct address_space *mapping,
                                     loff_t start_byte, loff_t end_byte)
{
        struct folio_batch fbatch;
        pgoff_t index = start_byte >> PAGE_SHIFT;
        pgoff_t end = end_byte >> PAGE_SHIFT;

        folio_batch_init(&fbatch);

        while (index <= end) {
                unsigned found, n;

                /* @index is passed by reference so the lookup can move it on. */
                found = filemap_get_folios_tag(mapping, &index, end,
                                PAGECACHE_TAG_WRITEBACK, &fbatch);
                if (found == 0)
                        break;

                for (n = 0; n < found; n++) {
                        folio_wait_writeback(fbatch.folios[n]);
                        folio_clear_error(fbatch.folios[n]);
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }
}

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:                address space structure to wait for
 * @start_byte:                offset in bytes where the range starts
 * @end_byte:                offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.  Check error status of
 * the address space and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
                            loff_t end_byte)
{
        /* Wait first, then report and clear the mapping's AS_EIO/AS_ENOSPC bits. */
        __filemap_fdatawait_range(mapping, start_byte, end_byte);
        return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 * @mapping:                address space structure to wait for
 * @start_byte:                offset in bytes where the range starts
 * @end_byte:                offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space in the
 * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
 * this function does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
                loff_t start_byte, loff_t end_byte)
{
        /* Wait first, then report AS_EIO/AS_ENOSPC while leaving them set. */
        __filemap_fdatawait_range(mapping, start_byte, end_byte);
        return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:                file pointing to address space structure to wait for
 * @start_byte:                offset in bytes where the range starts
 * @end_byte:                offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them.  Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
        struct address_space *mapping = file->f_mapping;

        /* Wait on the file's mapping, then report via the file's f_wb_err cursor. */
        __filemap_fdatawait_range(mapping, start_byte, end_byte);
        return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
        /* Whole file: byte range [0, LLONG_MAX]. */
        __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
        return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
        /* True whenever the mapping has any pages at all (nrpages != 0). */
        return mapping->nrpages;
}

/*
 * Return true if some folio in the page range derived from
 * @start_byte..@end_byte is dirty, locked or under writeback.  Returns
 * false if no such folio is found or if @end_byte < @start_byte.
 *
 * The walk runs under rcu_read_lock() and takes no reference on the folios
 * it inspects.
 */
bool filemap_range_has_writeback(struct address_space *mapping,
                                 loff_t start_byte, loff_t end_byte)
{
        XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
        pgoff_t last_index = end_byte >> PAGE_SHIFT;
        struct folio *folio;

        if (end_byte < start_byte)
                return false;

        rcu_read_lock();
        xas_for_each(&xas, folio, last_index) {
                /* Retry entries and value entries are not folios. */
                if (xas_retry(&xas, folio) || xa_is_value(folio))
                        continue;
                /* Clean, unlocked and idle: keep looking. */
                if (!folio_test_dirty(folio) && !folio_test_locked(folio) &&
                    !folio_test_writeback(folio))
                        continue;
                break;
        }
        rcu_read_unlock();

        /* folio is NULL here only when the walk ended without a break. */
        return folio != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:        the address_space for the pages
 * @lstart:        offset in bytes where the range starts
 * @lend:        offset in bytes where the range ends (inclusive)
 *
 * Start data-integrity (WB_SYNC_ALL) writeback for file offsets
 * lstart->lend, inclusive, and wait for it to finish.
 *
 * Note that @lend is inclusive (describes the last byte to be written).
 * A range with @lend < @lstart is treated as empty: the function returns 0
 * without starting writeback or checking the mapping's error state.  A
 * negative @lend such as -1 therefore does not mean "up to end-of-file";
 * the whole-file helpers in this file use LLONG_MAX for that.
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
                                 loff_t lstart, loff_t lend)
{
        int write_err = 0;
        int flag_err;

        if (lend < lstart)
                return 0;

        if (mapping_needs_writeback(mapping)) {
                write_err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                                       WB_SYNC_ALL);
                /*
                 * Even if the above returned error, the pages may be
                 * written partially (e.g. -ENOSPC), so we wait for it.
                 * But the -EIO is special case, it may indicate the worst
                 * thing (e.g. bug) happened, so we avoid waiting for it.
                 */
                if (write_err != -EIO)
                        __filemap_fdatawait_range(mapping, lstart, lend);
        }

        /*
         * Always consume the mapping's error bits, but let an error from
         * starting the writeback take precedence in the return value.
         */
        flag_err = filemap_check_errors(mapping);
        return write_err ? write_err : flag_err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

/*
 * Record @err in @mapping->wb_err through errseq_set() and pass the
 * resulting errseq_t value to the filemap_set_wb_err tracepoint.
 */
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
        errseq_t eseq = errseq_set(&mapping->wb_err, err);

        trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
 * file_check_and_advance_wb_err - report any wb error not yet seen via this
 *                                    file and advance f_wb_err to the current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
        int err = 0;
        errseq_t old = READ_ONCE(file->f_wb_err);
        struct address_space *mapping = file->f_mapping;

        /* Locklessly handle the common case where nothing has changed */
        if (errseq_check(&mapping->wb_err, old)) {
                /* Something changed, must use slow path */
                spin_lock(&file->f_lock);
                old = file->f_wb_err;
                err = errseq_check_and_advance(&mapping->wb_err,
                                                &file->f_wb_err);
                trace_file_check_and_advance_wb_err(file, old);
                spin_unlock(&file->f_lock);
        }

        /*
         * We're mostly using this function as a drop in replacement for
         * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
         * that the legacy code would have had on these flags.
         */
        clear_bit(AS_EIO, &mapping->flags);
        clear_bit(AS_ENOSPC, &mapping->flags);
        return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);

/**
 * file_write_and_wait_range - write out & wait on a file range
 * @file:        file pointing to address_space with pages
 * @lstart:        offset in bytes where the range starts
 * @lend:        offset in bytes where the range ends (inclusive)
 *
 * Write out the file offsets lstart->lend (inclusive) and wait for them.
 *
 * @lend names the last byte to be written rather than one past it, which
 * lets callers cover everything up to end-of-file by passing end = -1.
 * An empty range (@lend < @lstart) is a no-op that returns 0.
 *
 * Once the data has been written and waited on, the f_wb_err cursor is
 * checked and advanced to the latest value; an error found there is
 * returned unless starting the writeback already failed.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
        struct address_space *mapping = file->f_mapping;
        int write_err = 0;
        int wb_err;

        if (lend < lstart)
                return 0;

        if (mapping_needs_writeback(mapping)) {
                write_err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                                       WB_SYNC_ALL);
                /*
                 * A failed write may still have written pages partially
                 * (e.g. -ENOSPC), so wait for them. -EIO is the exception:
                 * it may mean something went badly wrong, so don't wait.
                 */
                if (write_err != -EIO)
                        __filemap_fdatawait_range(mapping, lstart, lend);
        }

        /* Always advance the cursor, even when the write error wins */
        wb_err = file_check_and_advance_wb_err(file);

        return write_err ? write_err : wb_err;
}
EXPORT_SYMBOL(file_write_and_wait_range);

/**
 * replace_page_cache_folio - replace a pagecache folio with a new one
 * @old:        folio to be replaced
 * @new:        folio to replace with
 *
 * Swap @old for @new in the pagecache at @old's index.  The pagecache
 * reference is taken on @new and released on @old.  The caller must hold
 * the lock on both folios, and @new must not belong to a mapping yet.
 * @new is not put on the LRU here; that is left to the caller.
 *
 * The remove + add is atomic.  This function cannot fail.
 */
void replace_page_cache_folio(struct folio *old, struct folio *new)
{
        struct address_space *mapping = old->mapping;
        void (*free_fn)(struct folio *) = mapping->a_ops->free_folio;
        pgoff_t index = old->index;
        XA_STATE(xas, &mapping->i_pages, index);

        VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
        VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
        VM_BUG_ON_FOLIO(new->mapping, new);

        /* Pagecache reference for @new, then give it @old's identity */
        folio_get(new);
        new->mapping = mapping;
        new->index = index;

        mem_cgroup_replace_folio(old, new);

        /* Store and accounting happen under the i_pages lock */
        xas_lock_irq(&xas);
        xas_store(&xas, new);

        old->mapping = NULL;
        /* hugetlb folios are excluded from page cache accounting. */
        if (!folio_test_hugetlb(old))
                __lruvec_stat_sub_folio(old, NR_FILE_PAGES);
        if (!folio_test_hugetlb(new))
                __lruvec_stat_add_folio(new, NR_FILE_PAGES);
        if (folio_test_swapbacked(old))
                __lruvec_stat_sub_folio(old, NR_SHMEM);
        if (folio_test_swapbacked(new))
                __lruvec_stat_add_folio(new, NR_SHMEM);
        xas_unlock_irq(&xas);

        /* Let the filesystem release @old, then drop the cache's reference */
        if (free_fn)
                free_fn(old);
        folio_put(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_folio);

/*
 * __filemap_add_folio - insert a locked folio into a mapping's i_pages
 * @mapping: address_space to insert into
 * @folio: locked, non-swapbacked folio to insert
 * @index: page offset of the folio; must be aligned to the folio's size
 * @gfp: allocation flags (only the GFP_RECLAIM_MASK bits are used)
 * @shadowp: if non-NULL, receives the value (shadow) entry that was
 *           replaced, when there was one
 *
 * Takes folio_nr_pages() references on @folio and sets folio->mapping and
 * folio->index before storing it.  A conflicting non-value entry fails the
 * insert with -EEXIST.  A conflicting value entry of a larger order than
 * @folio is split first; the memory for the split is allocated with the
 * lock dropped and the whole lookup is then retried.
 *
 * On success mapping->nrpages and, except for hugetlb folios, the
 * NR_FILE_PAGES/NR_FILE_THPS statistics are updated.
 *
 * Returns 0 on success.  On failure folio->mapping is reset to NULL, the
 * references taken here are dropped and the xa_state error is returned.
 */
noinline int __filemap_add_folio(struct address_space *mapping,
                struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
        XA_STATE(xas, &mapping->i_pages, index);
        /* value entry and order that the split memory was allocated for */
        void *alloced_shadow = NULL;
        int alloced_order = 0;
        bool huge;
        long nr;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
        mapping_set_update(&xas, mapping);

        /* @index must be a multiple of the number of pages in the folio */
        VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
        xas_set_order(&xas, index, folio_order(folio));
        huge = folio_test_hugetlb(folio);
        nr = folio_nr_pages(folio);

        gfp &= GFP_RECLAIM_MASK;
        /* One reference per page; dropped again on the error path below */
        folio_ref_add(folio, nr);
        folio->mapping = mapping;
        folio->index = xas.xa_index;

        for (;;) {
                /* order of the conflicting entry, -1 while none was seen */
                int order = -1, split_order = 0;
                void *entry, *old = NULL;

                xas_lock_irq(&xas);
                xas_for_each_conflict(&xas, entry) {
                        old = entry;
                        /* Only value entries may be replaced */
                        if (!xa_is_value(entry)) {
                                xas_set_err(&xas, -EEXIST);
                                goto unlock;
                        }
                        /*
                         * If a larger entry exists,
                         * it will be the first and only entry iterated.
                         */
                        if (order == -1)
                                order = xas_get_order(&xas);
                }

                /* entry may have changed before we re-acquire the lock */
                if (alloced_order && (old != alloced_shadow || order != alloced_order)) {
                        /* The earlier split allocation no longer fits: discard it */
                        xas_destroy(&xas);
                        alloced_order = 0;
                }

                if (old) {
                        if (order > 0 && order > folio_order(folio)) {
                                /* How to handle large swap entries? */
                                BUG_ON(shmem_mapping(mapping));
                                if (!alloced_order) {
                                        /*
                                         * No split memory yet: leave via
                                         * unlock, allocate there and retry.
                                         */
                                        split_order = order;
                                        goto unlock;
                                }
                                xas_split(&xas, old, order);
                                xas_reset(&xas);
                        }
                        /* Hand the replaced value entry back to the caller */
                        if (shadowp)
                                *shadowp = old;
                }

                xas_store(&xas, folio);
                if (xas_error(&xas))
                        goto unlock;

                mapping->nrpages += nr;

                /* hugetlb pages do not participate in page cache accounting */
                if (!huge) {
                        __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
                        if (folio_test_pmd_mappable(folio))
                                __lruvec_stat_mod_folio(folio,
                                                NR_FILE_THPS, nr);
                }

unlock:
                xas_unlock_irq(&xas);

                /* split needed, alloc here and retry. */
                if (split_order) {
                        xas_split_alloc(&xas, old, split_order, gfp);
                        if (xas_error(&xas))
                                goto error;
                        /* Remember what the allocation was made for */
                        alloced_shadow = old;
                        alloced_order = split_order;
                        xas_reset(&xas);
                        continue;
                }

                /*
                 * xas_nomem() decides whether another pass is needed;
                 * presumably it allocates memory after an -ENOMEM store
                 * (confirm against the XArray API).
                 */
                if (!xas_nomem(&xas, gfp))
                        break;
        }

        if (xas_error(&xas))
                goto error;

        trace_mm_filemap_add_to_page_cache(folio);
        return 0;
error:
        folio->mapping = NULL;
        /* Leave page->index set: truncation relies upon it */
        folio_put_refs(folio, nr);
        return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);

/*
 * Charge @folio to the memory cgroup, lock it and insert it into @mapping
 * at @index, then put it on the LRU.
 *
 * Returns 0 on success.  On failure the charge and the lock bit are undone
 * and the error from mem_cgroup_charge() or __filemap_add_folio() is
 * returned.
 */
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
                                pgoff_t index, gfp_t gfp)
{
        void *shadow = NULL;
        int err;

        err = mem_cgroup_charge(folio, NULL, gfp);
        if (err)
                return err;

        __folio_set_locked(folio);
        err = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
        if (unlikely(err)) {
                mem_cgroup_uncharge(folio);
                __folio_clear_locked(folio);
                return err;
        }

        /*
         * A shadow entry means the folio was evicted from the cache only
         * recently, so it should be activated like any other repeatedly
         * accessed folio.  Folios that are about to be rewritten are the
         * exception: pushing other data out of the working set just to
         * cache data that gets overwritten anyway wastes memory.
         */
        WARN_ON_ONCE(folio_test_active(folio));
        if (shadow && !(gfp & __GFP_WRITE))
                workingset_refault(folio, shadow);
        folio_add_lru(folio);

        return 0;
}
EXPORT_SYMBOL_GPL(filemap_add_folio);

#ifdef CONFIG_NUMA
/*
 * Allocate a folio of the given @order for the page cache.
 *
 * When cpuset page spreading is active, the folio is allocated on the node
 * picked by cpuset_mem_spread_node(); a failed attempt is repeated for as
 * long as read_mems_allowed_retry() asks for it.  Otherwise this is a plain
 * folio_alloc_noprof().
 */
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
        unsigned int cookie;
        struct folio *folio;
        int nid;

        if (!cpuset_do_page_mem_spread())
                return folio_alloc_noprof(gfp, order);

        do {
                cookie = read_mems_allowed_begin();
                nid = cpuset_mem_spread_node();
                folio = __folio_alloc_node_noprof(gfp, order, nid);
        } while (!folio && read_mems_allowed_retry(cookie));

        return folio;
}
EXPORT_SYMBOL(filemap_alloc_folio_noprof);
#endif

/*
 * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
 *
 * Take invalidate_lock exclusively on each passed mapping that is not NULL.
 * The mapping with the lower address is locked first and the other one is
 * taken as a nested lock; a mapping passed twice is locked only once.
 *
 * @mapping1: the first mapping to lock
 * @mapping2: the second mapping to lock
 */
void filemap_invalidate_lock_two(struct address_space *mapping1,
                                 struct address_space *mapping2)
{
        struct address_space *first = mapping1;
        struct address_space *second = mapping2;

        /* Order the pair by address */
        if (first > second) {
                first = mapping2;
                second = mapping1;
        }

        if (first)
                down_write(&first->invalidate_lock);
        if (second && second != first)
                down_write_nested(&second->invalidate_lock, 1);
}
EXPORT_SYMBOL(filemap_invalidate_lock_two);

/*
 * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
 *
 * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to unlock
 * @mapping2: the second mapping to unlock
 */
void filemap_invalidate_unlock_two(struct address_space *mapping1,
                                   struct address_space *mapping2)
{
        if (mapping1)
                up_write(&mapping1->invalidate_lock);
        if (mapping2 && mapping1 != mapping2)
                up_write(&mapping2->invalidate_lock);
}
EXPORT_SYMBOL(filemap_invalidate_unlock_two);

/*
 * Waiting for a page to become available requires a waitqueue associated
 * with that page.  Rather than one queue per page, a hash table of
 * waitqueues is used: all waiters hashing to a bucket share its queue,
 * every one of them is woken when any page of that bucket becomes
 * available, and each woken context checks whether the page it cares about
 * is the one that became available.  This saves space at the cost of a
 * "thundering herd" during the rare hash collisions.
 */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

/* Map @folio to its bucket in folio_wait_table by hashing the pointer. */
static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
        unsigned int bucket = hash_ptr(folio, PAGE_WAIT_TABLE_BITS);

        return folio_wait_table + bucket;
}

/*
 * Boot-time setup of the page cache: initialise every bucket of the hashed
 * folio wait table, then initialise page writeback.
 */
void __init pagecache_init(void)
{
        int i;

        /* One waitqueue head per hash bucket */
        for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
                init_waitqueue_head(&folio_wait_table[i]);

        page_writeback_init();
}

/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *        We're just waiting for the bit to be released, and when a waker
 *        calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *        and remove it from the wait queue.
 *
 *        Simple and straightforward.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *        The waiter is waiting to get the lock, and only one waiter should
 *        be woken up to avoid any thundering herd behavior. We'll set the
 *        WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *
 *        This is the traditional exclusive wait.
 *
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
 *
 *        The waiter is waiting to get the bit, and additionally wants the
 *        lock to be transferred to it for fair lock behavior. If the lock
 *        cannot be taken, we stop walking the wait queue without waking
 *        the waiter.
 *
 *        This is the "fair lock handoff" case, and in addition to setting
 *        WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *        that it now has the lock.
 *
 * wake_page_function() is the wake callback installed on page waiters;
 * @arg is the struct wait_page_key describing the folio/bit being woken.
 *
 * Return values:
 *   0 - the entry does not match the key (per wake_page_match()), or a
 *       non-exclusive waiter was woken and removed from the queue.
 *   1 - an exclusive waiter was woken and removed from the queue.
 *  -1 - the waiter is exclusive and the bit is (still) set; the waiter is
 *       left queued and is not woken.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
        unsigned int flags;
        struct wait_page_key *key = arg;
        struct wait_page_queue *wait_page
                = container_of(wait, struct wait_page_queue, wait);

        /* Not a waiter for this folio/bit: skip it */
        if (!wake_page_match(wait_page, key))
                return 0;

        /*
         * If it's a lock handoff wait, we get the bit for it, and
         * stop walking (and do not wake it up) if we can't.
         */
        flags = wait->flags;
        if (flags & WQ_FLAG_EXCLUSIVE) {
                /* Bit already taken again: nothing to hand out */
                if (test_bit(key->bit_nr, &key->folio->flags))
                        return -1;
                if (flags & WQ_FLAG_CUSTOM) {
                        /* Fair handoff: take the bit on the waiter's behalf */
                        if (test_and_set_bit(key->bit_nr, &key->folio->flags))
                                return -1;
                        flags |= WQ_FLAG_DONE;
                }
        }

        /*
         * We are holding the wait-queue lock, but the waiter that
         * is waiting for this will be checking the flags without
         * any locking.
         *
         * So update the flags atomically, and wake up the waiter
         * afterwards to avoid any races. This store-release pairs
         * with the load-acquire in folio_wait_bit_common().
         */
        smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
        wake_up_state(wait->private, mode);

        /*
         * Ok, we have successfully done what we're waiting for,
         * and we can unconditionally remove the wait entry.
         *
         * Note that this pairs with the "finish_wait()" in the
         * waiter, and has to be the absolute last thing we do.
         * After this list_del_init(&wait->entry) the wait entry
         * might be de-allocated and the process might even have
         * exited.
         */
        list_del_init_careful(&wait->entry);
        /* Uses the local snapshot: *wait must not be touched any more */
        return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

static void folio_wake_bit(struct folio *folio, int bit_nr)
{
        wait_queue_head_t *q = folio_waitqueue(folio);
        struct wait_page_key key;
        unsigned long flags;

        key.folio = folio;
        key.bit_nr = bit_nr;
        key.page_match = 0;

        spin_lock_irqsave(&q->lock, flags);
        __wake_up_locked_key(q, TASK_NORMAL, &key);

        /*
         * It's possible to miss clearing waiters here, when we woke our page
         * waiters, but the hashed waitqueue has waiters for other pages on it.
         * That's okay, it's a rare case. The next waker will clear it.
         *
         * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
         * other), the flag may be cleared in the course of freeing the page;
         * but that is not required for correctness.
         */
        if (!waitqueue_active(q) || !key.page_match)
                folio_clear_waiters(folio);

        spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * A choice of three behaviors for folio_wait_bit_common(), selecting what
 * the waiter does with its folio reference and with the bit once woken:
 */
enum behavior {
        EXCLUSIVE,        /* Hold ref to folio and take the bit when woken, like
                         * __folio_lock() waiting on then setting PG_locked.
                         */
        SHARED,                /* Hold ref to folio and check the bit when woken, like
                         * folio_wait_writeback() waiting on PG_writeback.
                         */
        DROP,                /* Drop ref to folio before wait, no check when woken,
                         * like folio_put_wait_locked() on PG_locked.
                         */
};

/*
 * Try to take the folio flag (exclusive waiter) or check that it is clear
 * (any other waiter).  On success the wait entry is marked as woken and
 * done and true is returned; false means the bit is still set.
 */
static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
                                        struct wait_queue_entry *wait)
{
        bool busy;

        if (wait->flags & WQ_FLAG_EXCLUSIVE)
                busy = test_and_set_bit(bit_nr, &folio->flags);
        else
                busy = test_bit(bit_nr, &folio->flags);

        if (busy)
                return false;

        wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
        return true;
}

/*
 * How many times do we accept lock stealing from under a waiter?
 *
 * folio_wait_bit_common() starts each exclusive wait with this count and
 * decrements it on every (re)try; once it drops below zero the waiter
 * sets WQ_FLAG_CUSTOM, i.e. asks for the lock to be handed off to it.
 */
int sysctl_page_lock_unfairness = 5;

/*
 * folio_wait_bit_common - sleep until a folio flag bit is released
 * @folio: folio whose flag is waited on
 * @bit_nr: number of the flag bit (e.g. PG_locked)
 * @state: task state to sleep in (e.g. TASK_UNINTERRUPTIBLE, TASK_KILLABLE)
 * @behavior: EXCLUSIVE, SHARED or DROP, see enum behavior
 *
 * Common implementation behind the folio lock and folio bit wait helpers.
 * A wait on PG_locked of a folio that is not uptodate but is marked
 * workingset is accounted as thrashing (delayacct) and as a PSI memory
 * stall for its whole duration.
 *
 * Returns 0 if the wait completed: for EXCLUSIVE that means the bit was
 * acquired (WQ_FLAG_DONE), otherwise that we were woken (WQ_FLAG_WOKEN).
 * Returns -EINTR if the loop was left because of a pending signal before
 * that happened.
 */
static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
                int state, enum behavior behavior)
{
        wait_queue_head_t *q = folio_waitqueue(folio);
        int unfairness = sysctl_page_lock_unfairness;
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
        bool thrashing = false;
        unsigned long pflags;
        bool in_thrashing;

        if (bit_nr == PG_locked &&
            !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
                delayacct_thrashing_start(&in_thrashing);
                psi_memstall_enter(&pflags);
                thrashing = true;
        }

        init_wait(wait);
        wait->func = wake_page_function;
        wait_page.folio = folio;
        wait_page.bit_nr = bit_nr;

repeat:
        wait->flags = 0;
        if (behavior == EXCLUSIVE) {
                wait->flags = WQ_FLAG_EXCLUSIVE;
                /* Too many retries: request a fair lock handoff */
                if (--unfairness < 0)
                        wait->flags |= WQ_FLAG_CUSTOM;
        }

        /*
         * Do one last check whether we can get the
         * page bit synchronously.
         *
         * Do the folio_set_waiters() marking before that
         * to let any waker we _just_ missed know they
         * need to wake us up (otherwise they'll never
         * even go to the slow case that looks at the
         * page queue), and add ourselves to the wait
         * queue if we need to sleep.
         *
         * This part needs to be done under the queue
         * lock to avoid races.
         */
        spin_lock_irq(&q->lock);
        folio_set_waiters(folio);
        if (!folio_trylock_flag(folio, bit_nr, wait))
                __add_wait_queue_entry_tail(q, wait);
        spin_unlock_irq(&q->lock);

        /*
         * From now on, all the logic will be based on
         * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
         * see whether the page bit testing has already
         * been done by the wake function.
         *
         * We can drop our reference to the folio.
         */
        if (behavior == DROP)
                folio_put(folio);

        /*
         * Note that until the "finish_wait()", or until
         * we see the WQ_FLAG_WOKEN flag, we need to
         * be very careful with the 'wait->flags', because
         * we may race with a waker that sets them.
         */
        for (;;) {
                unsigned int flags;

                set_current_state(state);

                /* Loop until we've been woken or interrupted */
                flags = smp_load_acquire(&wait->flags);
                if (!(flags & WQ_FLAG_WOKEN)) {
                        if (signal_pending_state(state, current))
                                break;

                        io_schedule();
                        continue;
                }

                /* If we were non-exclusive, we're done */
                if (behavior != EXCLUSIVE)
                        break;

                /* If the waker got the lock for us, we're done */
                if (flags & WQ_FLAG_DONE)
                        break;

                /*
                 * Otherwise, if we're getting the lock, we need to
                 * try to get it ourselves.
                 *
                 * And if that fails, we'll have to retry this all.
                 */
                if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
                        goto repeat;

                wait->flags |= WQ_FLAG_DONE;
                break;
        }

        /*
         * If a signal happened, this 'finish_wait()' may remove the last
         * waiter from the wait-queues, but the folio waiters bit will remain
         * set. That's ok. The next wakeup will take care of it, and trying
         * to do it here would be difficult and prone to races.
         */
        finish_wait(q, wait);

        if (thrashing) {
                delayacct_thrashing_end(&in_thrashing);
                psi_memstall_leave(&pflags);
        }

        /*
         * NOTE! The wait->flags weren't stable until we've done the
         * 'finish_wait()', and we could have exited the loop above due
         * to a signal, and had a wakeup event happen after the signal
         * test but before the 'finish_wait()'.
         *
         * So only after the finish_wait() can we reliably determine
         * if we got woken up or not, so we can now figure out the final
         * return value based on that state without races.
         *
         * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
         * waiter, but an exclusive one requires WQ_FLAG_DONE.
         */
        if (behavior == EXCLUSIVE)
                return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

        return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

#ifdef CONFIG_MIGRATION
/**
 * migration_entry_wait_on_locked - Wait for a migration entry to be removed
 * @entry: migration swap entry.
 * @ptl: already locked ptl. This function will drop the lock.
 *
 * Wait for a migration entry referencing the given folio to be removed. This
 * is equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
 * this can be called without taking a reference on the folio. Instead this
 * should be called while holding the ptl for the migration entry referencing
 * the folio.
 *
 * Returns after unlocking the ptl.
 *
 * This follows the same logic as folio_wait_bit_common() so see the comments
 * there.
 */
void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
        __releases(ptl)
{
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
        bool thrashing = false;
        unsigned long pflags;
        bool in_thrashing;
        wait_queue_head_t *q;
        struct folio *folio = pfn_swap_entry_folio(entry);

        q = folio_waitqueue(folio);
        /* Account the wait as thrashing / memory stall, as for PG_locked waits */
        if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
                delayacct_thrashing_start(&in_thrashing);
                psi_memstall_enter(&pflags);
                thrashing = true;
        }

        /* Non-exclusive waiter on PG_locked (no WQ_FLAG_* bits set) */
        init_wait(wait);
        wait->func = wake_page_function;
        wait_page.folio = folio;
        wait_page.bit_nr = PG_locked;
        wait->flags = 0;

        /* Queue ourselves unless the folio is already unlocked */
        spin_lock_irq(&q->lock);
        folio_set_waiters(folio);
        if (!folio_trylock_flag(folio, PG_locked, wait))
                __add_wait_queue_entry_tail(q, wait);
        spin_unlock_irq(&q->lock);

        /*
         * If a migration entry exists for the page the migration path must hold
         * a valid reference to the page, and it must take the ptl to remove the
         * migration entry. So the page is valid until the ptl is dropped.
         */
        spin_unlock(ptl);

        /* @folio must not be dereferenced from here on; only *wait is used */
        for (;;) {
                unsigned int flags;

                set_current_state(TASK_UNINTERRUPTIBLE);

                /* Loop until we've been woken or interrupted */
                flags = smp_load_acquire(&wait->flags);
                if (!(flags & WQ_FLAG_WOKEN)) {
                        if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
                                break;

                        io_schedule();
                        continue;
                }
                break;
        }

        finish_wait(q, wait);

        if (thrashing) {
                delayacct_thrashing_end(&in_thrashing);
                psi_memstall_leave(&pflags);
        }
}
#endif

/**
 * folio_wait_bit - Wait uninterruptibly on a folio flag bit.
 * @folio: The folio to wait on.
 * @bit_nr: The flag bit to wait for.
 *
 * Sleeps in TASK_UNINTERRUPTIBLE as a SHARED (non-exclusive) waiter until
 * @bit_nr is found clear or a wakeup for that bit arrives.  The caller keeps
 * its reference on @folio.  The result of the wait is not reported; callers
 * that care re-check the bit themselves.
 */
void folio_wait_bit(struct folio *folio, int bit_nr)
{
        folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit);

/**
 * folio_wait_bit_killable - Wait on a folio flag bit in TASK_KILLABLE state.
 * @folio: The folio to wait on.
 * @bit_nr: The flag bit to wait for.
 *
 * Same as folio_wait_bit(), but sleeps in TASK_KILLABLE and reports how the
 * wait ended.
 *
 * Return: 0 if woken (or the bit was already clear), -EINTR if the wait was
 * interrupted by a signal first.
 */
int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
        return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit_killable);

/**
 * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
 * @folio: The folio to wait for.
 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
 *
 * The caller should hold a reference on @folio.  They expect the folio to
 * become unlocked relatively soon, but do not wish to hold up migration
 * (for example) by holding the reference while waiting for the folio to
 * come unlocked.  After this function returns, the caller should not
 * dereference @folio.
 *
 * Implemented as a DROP wait on PG_locked: the reference is released once
 * the waiter has been queued, before it goes to sleep.
 *
 * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
 */
static int folio_put_wait_locked(struct folio *folio, int state)
{
        return folio_wait_bit_common(folio, PG_locked, state, DROP);
}

/**
 * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
 * @folio: Folio defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Queue @waiter at the tail of the hashed wait queue that @folio maps to
 * and mark the folio as having waiters, both under the queue lock.
 */
void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
{
        wait_queue_head_t *head = folio_waitqueue(folio);
        unsigned long irqflags;

        spin_lock_irqsave(&head->lock, irqflags);
        __add_wait_queue_entry_tail(head, waiter);
        folio_set_waiters(folio);
        spin_unlock_irqrestore(&head->lock, irqflags);
}
EXPORT_SYMBOL_GPL(folio_add_wait_queue);

/**
 * folio_unlock - Unlock a locked folio.
 * @folio: The folio.
 *
 * Releases the folio lock and, if the folio has waiters, wakes the threads
 * sleeping on the page lock.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_unlock(struct folio *folio)
{
        /* Bit 7 allows x86 to check the byte's sign bit */
        BUILD_BUG_ON(PG_waiters != 7);
        BUILD_BUG_ON(PG_locked > 7);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        /* Flip PG_locked; nothing more to do when nobody is waiting */
        if (!folio_xor_flags_has_waiters(folio, 1 << PG_locked))
                return;

        folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_unlock);

/**
 * folio_end_read - End read on a folio.
 * @folio: The folio.
 * @success: True if all reads completed successfully.
 *
 * Filesystems call this once every read against a folio has completed, so
 * that the pagecache knows no more reads are outstanding.  The folio is
 * unlocked and any thread sleeping on the lock is woken.  If all reads
 * succeeded the folio is marked uptodate as part of the same flag update.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_end_read(struct folio *folio, bool success)
{
        unsigned long toggle;

        /* Must be in bottom byte for x86 to work */
        BUILD_BUG_ON(PG_uptodate > 7);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);

        /* Both bits are flipped by a single xor of the flags word */
        toggle = 1 << PG_locked;
        if (likely(success))
                toggle |= 1 << PG_uptodate;

        if (!folio_xor_flags_has_waiters(folio, toggle))
                return;

        folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_end_read);

/**
 * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
 * @folio: The folio.
 *
 * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
 * it.  The folio reference held for PG_private_2 being set is released.
 * PG_private_2 must be set on entry.
 *
 * This is, for example, used when a netfs folio is being written to a local
 * disk cache, thereby allowing writes to the cache for the same folio to be
 * serialised.
 */
void folio_end_private_2(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
        /* Clear the bit before waking, so woken waiters see it cleared */
        clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
        folio_wake_bit(folio, PG_private_2);
        /* Last: drop the reference that went with PG_private_2 */
        folio_put(folio);
}
EXPORT_SYMBOL(folio_end_private_2);

/**
 * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Wait uninterruptibly for PG_private_2 to be cleared on a folio.  Returns
 * immediately if the bit is not set.
 */
void folio_wait_private_2(struct folio *folio)
{
        /* Re-test after every wakeup; the bit may have been set again */
        while (folio_test_private_2(folio))
                folio_wait_bit(folio, PG_private_2);
}
EXPORT_SYMBOL(folio_wait_private_2);

/**
 * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Wait until PG_private_2 is clear on a folio, giving up early if the
 * calling task receives a fatal signal.
 *
 * Return:
 * - 0 if successful.
 * - -EINTR if a fatal signal was encountered.
 */
int folio_wait_private_2_killable(struct folio *folio)
{
        while (folio_test_private_2(folio)) {
                int err = folio_wait_bit_killable(folio, PG_private_2);

                if (err < 0)
                        return err;
        }

        return 0;
}
EXPORT_SYMBOL(folio_wait_private_2_killable);

/**
 * folio_end_writeback - End writeback against a folio.
 * @folio: The folio.
 *
 * The folio must actually be under writeback.  Waiters on PG_writeback are
 * woken if __folio_end_writeback() reports that there are any to wake.
 *
 * Context: May be called from process or interrupt context.
 */
void folio_end_writeback(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);

        /*
         * folio_test_clear_reclaim() could be used here but it is an
         * atomic operation and overkill in this particular case. Failing
         * to shuffle a folio marked for immediate reclaim is too mild
         * a gain to justify taking an atomic operation penalty at the
         * end of every folio writeback.
         */
        if (folio_test_reclaim(folio)) {
                folio_clear_reclaim(folio);
                folio_rotate_reclaimable(folio);
        }

        /*
         * Writeback does not hold a folio reference of its own, relying
         * on truncation to wait for the clearing of PG_writeback.
         * But here we must make sure that the folio is not freed and
         * reused before the folio_wake_bit().
         */
        folio_get(folio);
        if (__folio_end_writeback(folio))
                folio_wake_bit(folio, PG_writeback);
        acct_reclaim_writeback(folio);
        /* Pairs with the folio_get() above */
        folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);

/**
 * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
 * @folio: The folio to lock
 */
void __folio_lock(struct folio *folio)
{
        /*
         * Wait on PG_locked as an EXCLUSIVE waiter in
         * TASK_UNINTERRUPTIBLE state.  The value returned by the wait
         * is not used.
         */
        folio_wait_bit_common(folio, PG_locked,
                              TASK_UNINTERRUPTIBLE, EXCLUSIVE);
}
EXPORT_SYMBOL(__folio_lock);

/*
 * Same wait as __folio_lock(), but performed in TASK_KILLABLE state.
 * The result of folio_wait_bit_common() is handed back unchanged; callers
 * in this file treat any non-zero value as "folio not locked".
 */
int __folio_lock_killable(struct folio *folio)
{
        int err;

        err = folio_wait_bit_common(folio, PG_locked,
                                    TASK_KILLABLE, EXCLUSIVE);
        return err;
}
EXPORT_SYMBOL_GPL(__folio_lock_killable);

/*
 * Try to lock @folio; if that is not possible right now, leave @wait queued
 * on the folio's waitqueue and return -EIOCBQUEUED.  Returns 0 with the
 * folio locked and @wait no longer queued otherwise.
 */
static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
        struct wait_queue_head *head = folio_waitqueue(folio);
        int ret = -EIOCBQUEUED;

        wait->folio = folio;
        wait->bit_nr = PG_locked;

        spin_lock_irq(&head->lock);
        __add_wait_queue_entry_tail(head, &wait->wait);
        folio_set_waiters(folio);
        if (folio_trylock(folio)) {
                /*
                 * We got the lock while still holding the waitqueue lock,
                 * so our entry is still queued and its callback cannot
                 * have fired: it is safe to unqueue it and report success.
                 */
                __remove_wait_queue(head, &wait->wait);
                ret = 0;
        }
        spin_unlock_irq(&head->lock);

        return ret;
}

/*
 * Return values:
 * 0 - folio is locked.
 * non-zero - folio is not locked.
 *     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
 *     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
 *     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
 */
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
{
        const unsigned int flags = vmf->flags;
        const bool killable = !!(flags & FAULT_FLAG_KILLABLE);

        if (!fault_flag_allow_retry_first(flags)) {
                /* Retrying is not an option: take the folio lock here. */
                if (!killable) {
                        __folio_lock(folio);
                        return 0;
                }
                if (!__folio_lock_killable(folio))
                        return 0;

                /* The killable lock attempt failed: release and retry. */
                release_fault_lock(vmf);
                return VM_FAULT_RETRY;
        }

        /*
         * CAUTION! With FAULT_FLAG_RETRY_NOWAIT the mmap_lock/per-VMA
         * lock is not released even though VM_FAULT_RETRY is returned.
         */
        if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
                release_fault_lock(vmf);
                if (killable)
                        folio_wait_locked_killable(folio);
                else
                        folio_wait_locked(folio);
        }

        return VM_FAULT_RETRY;
}

/**
 * page_cache_next_miss() - Find the next gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 * gap with the lowest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 5, then subsequently a gap is
 * created at index 10, page_cache_next_miss covering both indices may
 * return 10 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'return - index >= max_scan' will be true).
 * In the rare case of index wrap-around, 0 will be returned.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
{
        XA_STATE(xas, &mapping->i_pages, index);
        unsigned long left;

        for (left = max_scan; left; left--) {
                void *entry = xas_next(&xas);

                /*
                 * Stop on a gap (nothing there, or only a value entry),
                 * or once the walk has wrapped around to index 0.
                 */
                if (!entry || xa_is_value(entry) || xas.xa_index == 0)
                        break;
        }

        return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_next_miss);

/**
 * page_cache_prev_miss() - Find the previous gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [max(index - max_scan + 1, 0), index] for the
 * gap with the highest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 10, then subsequently a gap is
 * created at index 5, page_cache_prev_miss() covering both indices may
 * return 5 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'index - return >= max_scan' will be true).
 * In the rare case of wrap-around, ULONG_MAX will be returned.
 */
pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
{
        XA_STATE(xas, &mapping->i_pages, index);
        unsigned long left;

        for (left = max_scan; left; left--) {
                void *entry = xas_prev(&xas);

                /*
                 * Stop on a gap (nothing there, or only a value entry),
                 * or once the walk has wrapped around to ULONG_MAX.
                 */
                if (!entry || xa_is_value(entry) || xas.xa_index == ULONG_MAX)
                        break;
        }

        return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);

/*
 * Lockless page cache protocol:
 * On the lookup side:
 * 1. Load the folio from i_pages
 * 2. Increment the refcount if it's not zero
 * 3. If the folio is not found by xas_reload(), put the refcount and retry
 *
 * On the removal side:
 * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
 * B. Remove the page from i_pages
 * C. Return the page to the page allocator
 *
 * This means that any page may have its reference count temporarily
 * increased by a speculative page cache (or GUP-fast) lookup as it can
 * be allocated by another user before the RCU grace period expires.
 * Because the refcount temporarily acquired here may end up being the
 * last refcount on the page, any page allocation must be freeable by
 * folio_put().
 */

/*
 * filemap_get_entry - Get a page cache entry.
 * @mapping: the address_space to search
 * @index: The page cache index.
 *
 * Looks up the page cache entry at @mapping & @index.  If it is a folio,
 * it is returned with an increased refcount.  If it is a shadow entry
 * of a previously evicted folio, or a swap entry from shmem/tmpfs,
 * it is returned without further action.
 *
 * Return: The folio, swap or shadow entry, %NULL if nothing is found.
 */
void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct folio *folio;

        rcu_read_lock();
        for (;;) {
                xas_reset(&xas);
                folio = xas_load(&xas);
                if (xas_retry(&xas, folio))
                        continue;

                /*
                 * Nothing there, or a value entry (shadow entry of an
                 * evicted page, or shmem/tmpfs swap entry): hand it back
                 * as is, without touching any refcount.
                 */
                if (!folio || xa_is_value(folio))
                        break;

                /* Could not take a reference: look the slot up again. */
                if (!folio_try_get_rcu(folio))
                        continue;

                /* The slot still holds the folio we pinned: done. */
                if (likely(folio == xas_reload(&xas)))
                        break;

                /* The slot changed under us: unpin and start over. */
                folio_put(folio);
        }
        rcu_read_unlock();

        return folio;
}

/**
 * __filemap_get_folio - Find and get a reference to a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 * @fgp_flags: %FGP flags modify how the folio is returned.
 * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
 *
 * Looks up the page cache entry at @mapping & @index.
 *
 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
 * if the %GFP flags specified for %FGP_CREAT are atomic.
 *
 * If this function returns a folio, it is returned with an increased refcount.
 *
 * Return: The found folio or an ERR_PTR() otherwise.
 */
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp)
{
        struct folio *folio;

repeat:
        folio = filemap_get_entry(mapping, index);
        /* A value (shadow/swap) entry is handled exactly like "not present". */
        if (xa_is_value(folio))
                folio = NULL;
        if (!folio)
                goto no_page;

        if (fgp_flags & FGP_LOCK) {
                if (fgp_flags & FGP_NOWAIT) {
                        /* Do not sleep on the lock: drop our ref and bail. */
                        if (!folio_trylock(folio)) {
                                folio_put(folio);
                                return ERR_PTR(-EAGAIN);
                        }
                } else {
                        folio_lock(folio);
                }

                /* Has the page been truncated? */
                if (unlikely(folio->mapping != mapping)) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto repeat;
                }
                VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
        }

        /* FGP_ACCESSED takes precedence over the FGP_WRITE idle handling. */
        if (fgp_flags & FGP_ACCESSED)
                folio_mark_accessed(folio);
        else if (fgp_flags & FGP_WRITE) {
                /* Clear idle flag for buffer write */
                if (folio_test_idle(folio))
                        folio_clear_idle(folio);
        }

        if (fgp_flags & FGP_STABLE)
                folio_wait_stable(folio);
no_page:
        /* Reached both after a successful lookup and after a miss. */
        if (!folio && (fgp_flags & FGP_CREAT)) {
                unsigned order = FGF_GET_ORDER(fgp_flags);
                int err;

                /* Adjust the allocation flags according to the FGP flags. */
                if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
                        gfp |= __GFP_WRITE;
                if (fgp_flags & FGP_NOFS)
                        gfp &= ~__GFP_FS;
                if (fgp_flags & FGP_NOWAIT) {
                        gfp &= ~GFP_KERNEL;
                        gfp |= GFP_NOWAIT | __GFP_NOWARN;
                }
                /*
                 * FGP_CREAT callers are expected to pass FGP_LOCK or
                 * FGP_FOR_MMAP; warn once and force FGP_LOCK otherwise.
                 */
                if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
                        fgp_flags |= FGP_LOCK;

                /* Clamp the requested order to what the mapping supports. */
                if (!mapping_large_folio_support(mapping))
                        order = 0;
                if (order > MAX_PAGECACHE_ORDER)
                        order = MAX_PAGECACHE_ORDER;
                /* If we're not aligned, allocate a smaller folio */
                if (index & ((1UL << order) - 1))
                        order = __ffs(index);

                /*
                 * Try the chosen order first and fall back to smaller
                 * orders, down to 0, when allocation or insertion fails.
                 */
                do {
                        gfp_t alloc_gfp = gfp;

                        /* Stays -ENOMEM if the allocation below fails. */
                        err = -ENOMEM;
                        if (order > 0)
                                alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
                        folio = filemap_alloc_folio(alloc_gfp, order);
                        /* 'continue' goes to the 'order-- > 0' test below. */
                        if (!folio)
                                continue;

                        /* Init accessed so as to avoid an atomic mark_page_accessed later */
                        if (fgp_flags & FGP_ACCESSED)
                                __folio_set_referenced(folio);

                        err = filemap_add_folio(mapping, folio, index, gfp);
                        if (!err)
                                break;
                        folio_put(folio);
                        folio = NULL;
                } while (order-- > 0);

                /*
                 * -EEXIST restarts the lookup (presumably another task
                 * inserted a folio at this index first); any other error
                 * is returned to the caller.
                 */
                if (err == -EEXIST)
                        goto repeat;
                if (err)
                        return ERR_PTR(err);
                /*
                 * filemap_add_folio locks the page, and for mmap
                 * we expect an unlocked page.
                 */
                if (folio && (fgp_flags & FGP_FOR_MMAP))
                        folio_unlock(folio);
        }

        /* Lookup missed and nothing was created. */
        if (!folio)
                return ERR_PTR(-ENOENT);
        return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);

/*
 * Advance @xas to the next entry up to @max (restricted to entries carrying
 * @mark unless @mark is XA_PRESENT) and return it.  Folios are returned with
 * a reference held; value entries and NULL are returned untouched.
 */
static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
                xa_mark_t mark)
{
        struct folio *folio;

        for (;;) {
                folio = (mark == XA_PRESENT) ?
                                xas_find(xas, max) :
                                xas_find_marked(xas, max, mark);

                if (xas_retry(xas, folio))
                        continue;

                /*
                 * End of the walk, or a value entry (shadow entry of a
                 * recently evicted page, shmem/tmpfs swap entry, or DAX
                 * entry): return it without raising any page count.
                 */
                if (!folio || xa_is_value(folio))
                        return folio;

                if (folio_try_get_rcu(folio)) {
                        /* Still in the slot after pinning: it is ours. */
                        if (likely(folio == xas_reload(xas)))
                                return folio;
                        folio_put(folio);
                }

                /* Lost a race on this slot: reset the state and retry. */
                xas_reset(xas);
        }
}

/**
 * find_get_entries - gang pagecache lookup
 * @mapping:        The address_space to search
 * @start:        The starting page cache index
 * @end:        The final page index (inclusive).
 * @fbatch:        Where the resulting entries are placed.
 * @indices:        The cache indices corresponding to the entries in @fbatch
 *
 * find_get_entries() will search for and return a batch of entries in
 * the mapping.  The entries are placed in @fbatch.  find_get_entries()
 * takes a reference on any actual folios it returns.
 *
 * The entries have ascending indexes.  The indices may not be consecutive
 * due to not-present entries or large folios.
 *
 * Any shadow entries of evicted folios, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * Return: The number of entries which were found.
 */
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        struct folio *folio;
        unsigned int found;

        rcu_read_lock();
        for (;;) {
                folio = find_get_entry(&xas, end, XA_PRESENT);
                if (!folio)
                        break;
                /* Record the index in the slot the entry is about to take. */
                indices[fbatch->nr] = xas.xa_index;
                if (!folio_batch_add(fbatch, folio))
                        break;
        }
        rcu_read_unlock();

        found = folio_batch_count(fbatch);
        if (found) {
                unsigned int last = found - 1;
                unsigned long span;

                /*
                 * Resume after the last entry returned: a value entry
                 * covers a single index, a folio covers all its pages.
                 */
                folio = fbatch->folios[last];
                span = xa_is_value(folio) ? 1 : folio_nr_pages(folio);
                *start = indices[last] + span;
        }

        return found;
}

/**
 * find_lock_entries - Find a batch of pagecache entries.
 * @mapping:        The address_space to search.
 * @start:        The starting page cache index.
 * @end:        The final page index (inclusive).
 * @fbatch:        Where the resulting entries are placed.
 * @indices:        The cache indices of the entries in @fbatch.
 *
 * find_lock_entries() will return a batch of entries from @mapping.
 * Swap, shadow and DAX entries are included.  Folios are returned
 * locked and with an incremented refcount.  Folios which are locked
 * by somebody else or under writeback are skipped.  Folios which are
 * partially outside the range are not returned.
 *
 * The entries have ascending indexes.  The indices may not be consecutive
 * due to not-present entries, large folios, folios which could not be
 * locked or folios under writeback.
 *
 * Return: The number of entries which were found.
 */
unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        struct folio *folio;

        rcu_read_lock();
        while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
                /* Value entries skip all checks and are recorded as they are. */
                if (!xa_is_value(folio)) {
                        /* Skip folios that begin before *start ... */
                        if (folio->index < *start)
                                goto put;
                        /* ... or whose last index lies beyond @end. */
                        if (folio_next_index(folio) - 1 > end)
                                goto put;
                        /* Never wait for the lock: skip if it is contended. */
                        if (!folio_trylock(folio))
                                goto put;
                        /*
                         * Now that the folio is locked, skip it if it no
                         * longer belongs to @mapping or is under writeback.
                         */
                        if (folio->mapping != mapping ||
                            folio_test_writeback(folio))
                                goto unlock;
                        VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
                                        folio);
                }
                /* Record the index in the slot the entry is about to take. */
                indices[fbatch->nr] = xas.xa_index;
                if (!folio_batch_add(fbatch, folio))
                        break;
                continue;
unlock:
                folio_unlock(folio);
put:
                /* Drop the reference taken by find_get_entry(). */
                folio_put(folio);
        }
        rcu_read_unlock();

        /*
         * Resume after the last entry returned: one index for a value
         * entry, folio_nr_pages() indices for a folio.  With an empty
         * batch *start is left unchanged.
         */
        if (folio_batch_count(fbatch)) {
                unsigned long nr = 1;
                int idx = folio_batch_count(fbatch) - 1;

                folio = fbatch->folios[idx];
                if (!xa_is_value(folio))
                        nr = folio_nr_pages(folio);
                *start = indices[idx] + nr;
        }
        return folio_batch_count(fbatch);
}

/**
 * filemap_get_folios - Get a batch of folios
 * @mapping:        The address_space to search
 * @start:        The starting page index
 * @end:        The final page index (inclusive)
 * @fbatch:        The batch to fill.
 *
 * Search for and return a batch of folios in the mapping starting at
 * index @start and up to index @end (inclusive).  The folios are returned
 * in @fbatch with an elevated reference count.
 *
 * Return: The number of folios which were found.
 * We also update @start to index the next folio for the traversal.
 */
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch)
{
        unsigned nr_found;

        /* An untagged lookup is the tagged lookup with XA_PRESENT as the tag. */
        nr_found = filemap_get_folios_tag(mapping, start, end,
                                          XA_PRESENT, fbatch);
        return nr_found;
}
EXPORT_SYMBOL(filemap_get_folios);

/**
 * filemap_get_folios_contig - Get a batch of contiguous folios
 * @mapping:        The address_space to search
 * @start:        The starting page index
 * @end:        The final page index (inclusive)
 * @fbatch:        The batch to fill
 *
 * filemap_get_folios_contig() works exactly like filemap_get_folios(),
 * except the returned folios are guaranteed to be contiguous. This may
 * not return all contiguous folios if the batch gets filled up.
 *
 * Return: The number of folios found.
 * Also update @start to be positioned for traversal of the next folio.
 */

unsigned filemap_get_folios_contig(struct address_space *mapping,
                pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        unsigned long nr;
        struct folio *folio;

        rcu_read_lock();

        /* The walk ends at the first empty slot or once past @end. */
        for (folio = xas_load(&xas); folio && xas.xa_index <= end;
                        folio = xas_next(&xas)) {
                if (xas_retry(&xas, folio))
                        continue;
                /*
                 * If the entry has been swapped out, we can stop looking.
                 * No current caller is looking for DAX entries.
                 */
                if (xa_is_value(folio))
                        goto update_start;

                /* Could not take a reference: reset the walk state. */
                if (!folio_try_get_rcu(folio))
                        goto retry;

                /* The slot changed after we pinned the folio: unpin, reset. */
                if (unlikely(folio != xas_reload(&xas)))
                        goto put_folio;

                /*
                 * folio_batch_add() returned 0: stop here and make *start
                 * point just past the folio that was handed to the batch.
                 */
                if (!folio_batch_add(fbatch, folio)) {
                        nr = folio_nr_pages(folio);
                        *start = folio->index + nr;
                        goto out;
                }
                continue;
put_folio:
                folio_put(folio);

retry:
                xas_reset(&xas);
        }

update_start:
        /* 'nr' is reused here as the number of folios in the batch. */
        nr = folio_batch_count(fbatch);

        /* With an empty batch *start is left unchanged. */
        if (nr) {
                folio = fbatch->folios[nr - 1];
                *start = folio_next_index(folio);
        }
out:
        rcu_read_unlock();
        return folio_batch_count(fbatch);
}
EXPORT_SYMBOL(filemap_get_folios_contig);

/**
 * filemap_get_folios_tag - Get a batch of folios matching @tag
 * @mapping:    The address_space to search
 * @start:      The starting page index
 * @end:        The final page index (inclusive)
 * @tag:        The tag index
 * @fbatch:     The batch to fill
 *
 * The first folio may start before @start; if it does, it will contain
 * @start.  The final folio may extend beyond @end; if it does, it will
 * contain @end.  The folios have ascending indices.  There may be gaps
 * between the folios if there are indices which have no folio in the
 * page cache.  If folios are added to or removed from the page cache
 * while this is running, they may or may not be found by this call.
 * Only returns folios that are tagged with @tag.
 *
 * Return: The number of folios found.
 * Also update @start to index the next folio for traversal.
 */
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
                        pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        struct folio *folio;
        bool batch_full = false;

        rcu_read_lock();
        for (;;) {
                folio = find_get_entry(&xas, end, tag);
                if (!folio)
                        break;

                /*
                 * Shadow entries should never be tagged, but the walk is
                 * lockless, so page reclaim may evict a page after we saw
                 * it tagged.  Skip such entries.
                 */
                if (xa_is_value(folio))
                        continue;

                if (folio_batch_add(fbatch, folio))
                        continue;

                /* Batch exhausted: the next traversal resumes after this folio. */
                *start = folio->index + folio_nr_pages(folio);
                batch_full = true;
                break;
        }

        /*
         * The walk ran out of entries up to @end.  Take care not to
         * overflow the index @start, as that confuses some of the callers.
         * This breaks the iteration when there is a page at index -1, but
         * that case is already broken anyway.
         */
        if (!batch_full)
                *start = (end == (pgoff_t)-1) ? (pgoff_t)-1 : end + 1;

        rcu_read_unlock();

        return folio_batch_count(fbatch);
}
EXPORT_SYMBOL(filemap_get_folios_tag);

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file_ra_state *ra)
{
        /* Cut the readahead window down to a quarter of its current size. */
        ra->ra_pages = ra->ra_pages / 4;
}

/*
 * filemap_get_read_batch - Get a batch of folios for read
 *
 * Get a batch of folios which represent a contiguous range of bytes in
 * the file.  No exceptional entries will be returned.  If @index is in
 * the middle of a folio, the entire folio will be returned.  The last
 * folio in the batch may have the readahead flag set or the uptodate flag
 * clear so that the caller can take the appropriate action.
 */
static void filemap_get_read_batch(struct address_space *mapping,
                pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct folio *folio;

        rcu_read_lock();
        /* The walk ends at the first empty slot. */
        for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
                if (xas_retry(&xas, folio))
                        continue;
                /* Stop past the end of the range or at a value entry. */
                if (xas.xa_index > max || xa_is_value(folio))
                        break;
                if (xa_is_sibling(folio))
                        break;
                /* Could not take a reference: reset the walk state. */
                if (!folio_try_get_rcu(folio))
                        goto retry;

                /* The slot changed after we pinned the folio: unpin, reset. */
                if (unlikely(folio != xas_reload(&xas)))
                        goto put_folio;

                if (!folio_batch_add(fbatch, folio))
                        break;
                /*
                 * A folio that is not uptodate, or that has the readahead
                 * flag set, ends the batch but remains in it as the last
                 * entry.
                 */
                if (!folio_test_uptodate(folio))
                        break;
                if (folio_test_readahead(folio))
                        break;
                /*
                 * Move the walk to the last index covered by this folio
                 * so that the next step continues after it.
                 */
                xas_advance(&xas, folio_next_index(folio) - 1);
                continue;
put_folio:
                folio_put(folio);
retry:
                xas_reset(&xas);
        }
        rcu_read_unlock();
}

/*
 * Run @filler on @folio and wait (killably) for the folio to be unlocked.
 * Returns 0 if the folio ended up uptodate, the error from @filler or from
 * the wait, or -EIO if the read finished without making the folio uptodate.
 */
static int filemap_read_folio(struct file *file, filler_t filler,
                struct folio *folio)
{
        const bool in_workingset = folio_test_workingset(folio);
        unsigned long psi_flags;
        int err;

        /*
         * A previous I/O error may have been due to temporary failures,
         * eg. multipath errors.  PG_error will be set again if the read
         * fails.
         */
        folio_clear_error(folio);

        /* Start the actual read. The read will unlock the page. */
        if (unlikely(in_workingset))
                psi_memstall_enter(&psi_flags);
        err = filler(file, folio);
        if (unlikely(in_workingset))
                psi_memstall_leave(&psi_flags);

        if (!err)
                err = folio_wait_locked_killable(folio);
        if (err)
                return err;

        if (!folio_test_uptodate(folio)) {
                /* The read completed but left the folio not uptodate. */
                if (file)
                        shrink_readahead_size_eio(&file->f_ra);
                return -EIO;
        }

        return 0;
}

/*
 * Report whether the part of @folio covered by the byte range starting at
 * @pos and spanning @count bytes can be used for a read: either the whole
 * folio is uptodate, or the mapping's ->is_partially_uptodate() says that
 * the covered part is.
 */
static bool filemap_range_uptodate(struct address_space *mapping,
                loff_t pos, size_t count, struct folio *folio,
                bool need_uptodate)
{
        loff_t start;

        if (folio_test_uptodate(folio))
                return true;

        /*
         * No partial answer is possible when the caller insists on a fully
         * uptodate folio (pipes can't handle partially uptodate pages),
         * when the mapping has no ->is_partially_uptodate(), or when a
         * block is at least as large as the folio.
         */
        if (need_uptodate ||
            !mapping->a_ops->is_partially_uptodate ||
            mapping->host->i_blkbits >= folio_shift(folio))
                return false;

        /* Turn (@pos, @count) into an offset and length within the folio. */
        start = folio_pos(folio);
        if (start > pos) {
                count -= start - pos;
                pos = 0;
        } else {
                pos -= start;
        }

        return mapping->a_ops->is_partially_uptodate(folio, pos, count);
}

/*
 * Make sure @folio holds usable data for the read described by @iocb and
 * @count, reading it in if that is allowed by the iocb flags.
 *
 * Returns 0 on success, a negative errno, or AOP_TRUNCATED_PAGE.
 */
static int filemap_update_page(struct kiocb *iocb,
                struct address_space *mapping, size_t count,
                struct folio *folio, bool need_uptodate)
{
        int error;

        /* Take the invalidate lock shared, without blocking for NOWAIT. */
        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!filemap_invalidate_trylock_shared(mapping))
                        return -EAGAIN;
        } else {
                filemap_invalidate_lock_shared(mapping);
        }

        if (!folio_trylock(folio)) {
                error = -EAGAIN;
                if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
                        goto unlock_mapping;
                if (!(iocb->ki_flags & IOCB_WAITQ)) {
                        filemap_invalidate_unlock_shared(mapping);
                        /*
                         * This is where we usually end up waiting for a
                         * previously submitted readahead to finish.
                         */
                        /*
                         * NOTE(review): this return bypasses the
                         * folio_put() at the end of the function, so
                         * folio_put_wait_locked() presumably drops the
                         * folio reference itself - confirm.
                         */
                        folio_put_wait_locked(folio, TASK_KILLABLE);
                        return AOP_TRUNCATED_PAGE;
                }
                /* IOCB_WAITQ: lock now or queue iocb->ki_waitq on the folio. */
                error = __folio_lock_async(folio, iocb->ki_waitq);
                if (error)
                        goto unlock_mapping;
        }

        /* The folio is locked from here on. */
        error = AOP_TRUNCATED_PAGE;
        if (!folio->mapping)
                goto unlock;

        error = 0;
        if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
                                   need_uptodate))
                goto unlock;

        /* A read is needed; refuse if the iocb flags do not allow one here. */
        error = -EAGAIN;
        if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
                goto unlock;

        /*
         * filemap_read_folio() notes that the read unlocks the folio, so
         * the folio_unlock() below is skipped on this path.
         */
        error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
                        folio);
        goto unlock_mapping;
unlock:
        folio_unlock(folio);
unlock_mapping:
        filemap_invalidate_unlock_shared(mapping);
        /* On AOP_TRUNCATED_PAGE the folio reference is dropped here. */
        if (error == AOP_TRUNCATED_PAGE)
                folio_put(folio);
        return error;
}

/*
 * Allocate an order-0 folio, insert it into @mapping at @index, read it in
 * and, on success, add it to @fbatch.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, AOP_TRUNCATED_PAGE
 * if filemap_add_folio() reports -EEXIST, or the error returned by
 * filemap_add_folio() / filemap_read_folio().
 */
static int filemap_create_folio(struct file *file,
                struct address_space *mapping, pgoff_t index,
                struct folio_batch *fbatch)
{
        struct folio *folio;
        int error;

        folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
        if (!folio)
                return -ENOMEM;

        /*
         * Protect against truncate / hole punch. Grabbing invalidate_lock
         * here assures we cannot instantiate and bring uptodate new
         * pagecache folios after evicting page cache during truncate
         * and before actually freeing blocks.  Note that we could
         * release invalidate_lock after inserting the folio into
         * the page cache as the locked folio would then be enough to
         * synchronize with hole punching. But there are code paths
         * such as filemap_update_page() filling in partially uptodate
         * pages or ->readahead() that need to hold invalidate_lock
         * while mapping blocks for IO so let's hold the lock here as
         * well to keep locking rules simple.
         */
        filemap_invalidate_lock_shared(mapping);
        error = filemap_add_folio(mapping, folio, index,
                        mapping_gfp_constraint(mapping, GFP_KERNEL));
        /* -EEXIST is reported to the caller as AOP_TRUNCATED_PAGE. */
        if (error == -EEXIST)
                error = AOP_TRUNCATED_PAGE;
        if (error)
                goto error;

        error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
        if (error)
                goto error;

        filemap_invalidate_unlock_shared(mapping);
        /* Success: the folio reference is handed over to the batch. */
        folio_batch_add(fbatch, folio);
        return 0;
error:
        filemap_invalidate_unlock_shared(mapping);
        folio_put(folio);
        return error;
}

/*
 * Start asynchronous readahead from @folio up to @last_index, unless the
 * iocb forbids I/O (IOCB_NOIO), in which case -EAGAIN is returned.
 */
static int filemap_readahead(struct kiocb *iocb, struct file *file,
                struct address_space *mapping, struct folio *folio,
                pgoff_t last_index)
{
        DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);

        if (!(iocb->ki_flags & IOCB_NOIO)) {
                page_cache_async_ra(&ractl, folio,
                                    last_index - folio->index);
                return 0;
        }

        return -EAGAIN;
}

/*
 * Fill @fbatch with folios for a read of @count bytes at iocb->ki_pos,
 * triggering readahead or creating and reading a folio when the page cache
 * has nothing to offer.
 *
 * Returns 0 when @fbatch holds at least one folio, otherwise a negative
 * errno.
 */
static int filemap_get_pages(struct kiocb *iocb, size_t count,
                struct folio_batch *fbatch, bool need_uptodate)
{
        struct file *filp = iocb->ki_filp;
        struct address_space *mapping = filp->f_mapping;
        struct file_ra_state *ra = &filp->f_ra;
        pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
        pgoff_t last_index;
        struct folio *folio;
        int err = 0;

        /* "last_index" is the index of the page beyond the end of the read */
        last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
        if (fatal_signal_pending(current))
                return -EINTR;

        /* First attempt: whatever is already in the page cache. */
        filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        if (!folio_batch_count(fbatch)) {
                /* Nothing cached: kick off synchronous readahead and retry. */
                if (iocb->ki_flags & IOCB_NOIO)
                        return -EAGAIN;
                page_cache_sync_readahead(mapping, ra, filp, index,
                                last_index - index);
                filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        }
        if (!folio_batch_count(fbatch)) {
                /* Still nothing: create and read a folio ourselves. */
                if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
                        return -EAGAIN;
                err = filemap_create_folio(filp, mapping,
                                iocb->ki_pos >> PAGE_SHIFT, fbatch);
                if (err == AOP_TRUNCATED_PAGE)
                        goto retry;
                return err;
        }

        /* Only the last folio of the batch may need further attention. */
        folio = fbatch->folios[folio_batch_count(fbatch) - 1];
        if (folio_test_readahead(folio)) {
                err = filemap_readahead(iocb, filp, mapping, folio, last_index);
                if (err)
                        goto err;
        }
        if (!folio_test_uptodate(folio)) {
                /*
                 * With earlier folios already in the batch, switch a
                 * WAITQ request to NOWAIT before updating the last folio
                 * (presumably so the folios already found can be returned
                 * instead of queueing on the last one - confirm).
                 */
                if ((iocb->ki_flags & IOCB_WAITQ) &&
                    folio_batch_count(fbatch) > 1)
                        iocb->ki_flags |= IOCB_NOWAIT;
                err = filemap_update_page(iocb, mapping, count, folio,
                                          need_uptodate);
                if (err)
                        goto err;
        }

        return 0;
err:
        /*
         * Negative errors still hold the reference on the last folio;
         * for AOP_TRUNCATED_PAGE filemap_update_page() already dropped it.
         */
        if (err < 0)
                folio_put(folio);
        /* Remove the last folio; if others remain, report success. */
        if (likely(--fbatch->nr))
                return 0;
        if (err == AOP_TRUNCATED_PAGE)
                goto retry;
        return err;
}

/*
 * Tell whether two file positions fall into the same naturally aligned
 * window whose size is given by the shift of @folio.
 */
static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
{
        const unsigned int bits = folio_shift(folio);
        const loff_t first_slot = pos1 >> bits;
        const loff_t second_slot = pos2 >> bits;

        return first_slot == second_slot;
}

/**
 * filemap_read - Read data from the page cache.
 * @iocb: The iocb to read.
 * @iter: Destination for the data.
 * @already_read: Number of bytes already read by the caller.
 *
 * Copies data from the page cache.  If the data is not currently present,
 * uses the readahead and read_folio address_space operations to fetch it.
 *
 * The request is clamped so that it cannot extend beyond the superblock's
 * s_maxbytes limit, measured from the current position in @iocb.
 *
 * Return: Total number of bytes copied, including those already read by
 * the caller.  If an error happens before any bytes are copied, returns
 * a negative error number.
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
                ssize_t already_read)
{
        struct file *filp = iocb->ki_filp;
        struct file_ra_state *ra = &filp->f_ra;
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
        struct folio_batch fbatch;
        int i, error = 0;
        bool writably_mapped;
        loff_t isize, end_offset;
        loff_t last_pos = ra->prev_pos;

        if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
                return 0;
        if (unlikely(!iov_iter_count(iter)))
                return 0;

        /*
         * Clamp the request to the room left below s_maxbytes.  The limit
         * applies to the end position (ki_pos + count), not to the byte
         * count alone, so the current position has to be subtracted; the
         * check above guarantees the difference is positive.
         */
        iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
        folio_batch_init(&fbatch);

        do {
                cond_resched();

                /*
                 * If we've already successfully copied some data, then we
                 * can no longer safely return -EIOCBQUEUED. Hence mark
                 * an async read NOWAIT at that point.
                 */
                if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
                        iocb->ki_flags |= IOCB_NOWAIT;

                if (unlikely(iocb->ki_pos >= i_size_read(inode)))
                        break;

                error = filemap_get_pages(iocb, iter->count, &fbatch, false);
                if (error < 0)
                        break;

                /*
                 * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Sampling i_size at this point lets end_offset bound the
                 * copy to the current file size, which means the zero-filled
                 * part of the page is not copied back to userspace (unless
                 * another truncate extends the file - this is desired though).
                 */
                isize = i_size_read(inode);
                if (unlikely(iocb->ki_pos >= isize))
                        goto put_folios;
                end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

                /*
                 * Once we start copying data, we don't want to be touching any
                 * cachelines that might be contended:
                 */
                writably_mapped = mapping_writably_mapped(mapping);

                /*
                 * When a read accesses the same folio several times, only
                 * mark it as accessed the first time.
                 */
                if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
                                    fbatch.folios[0]))
                        folio_mark_accessed(fbatch.folios[0]);

                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];
                        size_t fsize = folio_size(folio);
                        size_t offset = iocb->ki_pos & (fsize - 1);
                        size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
                                             fsize - offset);
                        size_t copied;

                        /* This folio starts beyond the range we may copy. */
                        if (end_offset < folio_pos(folio))
                                break;
                        /* The first folio was handled before the loop. */
                        if (i > 0)
                                folio_mark_accessed(folio);
                        /*
                         * If users can be writing to this folio using arbitrary
                         * virtual addresses, take care of potential aliasing
                         * before reading the folio on the kernel side.
                         */
                        if (writably_mapped)
                                flush_dcache_folio(folio);

                        copied = copy_folio_to_iter(folio, offset, bytes, iter);

                        already_read += copied;
                        iocb->ki_pos += copied;
                        last_pos = iocb->ki_pos;

                        /* A short copy is reported as a fault on @iter. */
                        if (copied < bytes) {
                                error = -EFAULT;
                                break;
                        }
                }
put_folios:
                /* Drop the references held by the batch and reset it. */
                for (i = 0; i < folio_batch_count(&fbatch); i++)
                        folio_put(fbatch.folios[i]);
                folio_batch_init(&fbatch);
        } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

        file_accessed(filp);
        ra->prev_pos = last_pos;
        return already_read ? already_read : error;
}
EXPORT_SYMBOL_GPL(filemap_read);

/*
 * Make sure the byte range [ki_pos, ki_pos + count - 1] of the file behind
 * @iocb has no writeback outstanding.  A blocking caller writes the range
 * out and waits for it; an IOCB_NOWAIT caller only checks and gets -EAGAIN
 * when the range still needs writeback.
 */
int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        loff_t first = iocb->ki_pos;
        loff_t last = first + count - 1;

        if (!(iocb->ki_flags & IOCB_NOWAIT))
                return filemap_write_and_wait_range(mapping, first, last);

        if (filemap_range_needs_writeback(mapping, first, last))
                return -EAGAIN;

        return 0;
}
EXPORT_SYMBOL_GPL(kiocb_write_and_wait);

/*
 * Flush and then invalidate the page cache over the byte range
 * [ki_pos, ki_pos + count - 1] of the file behind @iocb.  With IOCB_NOWAIT
 * set, -EAGAIN is returned as soon as any page is present in the range,
 * because dealing with it could block.
 */
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        loff_t first = iocb->ki_pos;
        loff_t last = first + count - 1;

        if (!(iocb->ki_flags & IOCB_NOWAIT)) {
                int err = filemap_write_and_wait_range(mapping, first, last);

                if (err)
                        return err;
        } else if (filemap_range_has_page(mapping, first, last)) {
                /* we could block if there are any pages in the range */
                return -EAGAIN;
        }

        /*
         * Buffered reads issued after the write must go to disk to see the
         * new data, so clean cached pages in the region about to be written
         * are invalidated.  Doing this *before* the write means the return
         * value of ->direct_IO() (e.g. -EIOCBQUEUED) is never clobbered.
         */
        return invalidate_inode_pages2_range(mapping, first >> PAGE_SHIFT,
                                             last >> PAGE_SHIFT);
}
EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);

/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:        kernel I/O control block
 * @iter:        destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * With IOCB_NOWAIT set in iocb->ki_flags, -EAGAIN shall be returned when
 * no data can be read without waiting for I/O requests to complete;
 * readahead is not prevented by this flag.
 *
 * With IOCB_NOIO set in iocb->ki_flags, no new I/O requests shall be made
 * for the read or for readahead.  When no data can be read, -EAGAIN shall
 * be returned.  When readahead would be triggered, a partial, possibly
 * empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        size_t remaining = iov_iter_count(iter);
        struct address_space *mapping;
        struct inode *inode;
        struct file *file;
        ssize_t done;

        if (remaining == 0)
                return 0; /* skip atime */

        /* Plain buffered read: nothing has been read yet. */
        if (!(iocb->ki_flags & IOCB_DIRECT))
                return filemap_read(iocb, iter, 0);

        file = iocb->ki_filp;
        mapping = file->f_mapping;
        inode = mapping->host;

        done = kiocb_write_and_wait(iocb, remaining);
        if (done < 0)
                return done;
        file_accessed(file);

        done = mapping->a_ops->direct_IO(iocb, iter);
        if (done >= 0) {
                iocb->ki_pos += done;
                remaining -= done;
        }
        /* Unless the I/O was queued, rewind @iter to match what was read. */
        if (done != -EIOCBQUEUED)
                iov_iter_revert(iter, remaining - iov_iter_count(iter));

        /*
         * A direct read may come up short (Btrfs does this when it meets
         * compressed extents).  Return right away on error, when everything
         * requested has been read, or when the short read was caused by
         * EOF; otherwise let buffered I/O handle the remainder.  Buffered
         * reads will not work for DAX files, so don't bother trying.
         */
        if (done < 0 || remaining == 0 || IS_DAX(inode))
                return done;
        if (iocb->ki_pos >= i_size_read(inode))
                return done;

        return filemap_read(iocb, iter, done);
}
EXPORT_SYMBOL(generic_file_read_iter);

/*
 * Splice subpages from a folio into a pipe.
 *
 * Starting at file position @fpos inside @folio, one pipe buffer is filled
 * per page until @size bytes have been queued, the end of the folio has
 * been reached or the pipe is full.  A folio reference is taken for every
 * buffer added.  Returns the number of bytes queued.
 */
size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
                              struct folio *folio, loff_t fpos, size_t size)
{
        size_t in_folio = offset_in_folio(folio, fpos);
        struct page *cur = folio_page(folio, in_folio / PAGE_SIZE);
        size_t in_page = in_folio % PAGE_SIZE;
        size_t queued;

        /* Never run past the end of the folio. */
        size = min(size, folio_size(folio) - in_folio);

        /* Only the first page can start at a non-zero offset. */
        for (queued = 0; queued < size; cur++, in_page = 0) {
                struct pipe_buffer *slot;
                size_t chunk;

                if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                        break;

                slot = pipe_head_buf(pipe);
                chunk = min_t(size_t, PAGE_SIZE - in_page, size - queued);

                *slot = (struct pipe_buffer) {
                        .ops        = &page_cache_pipe_buf_ops,
                        .page        = cur,
                        .offset        = in_page,
                        .len        = chunk,
                };
                folio_get(folio);
                pipe->head++;
                queued += chunk;
        }

        return queued;
}

/**
 * filemap_splice_read -  Splice data from a file's pagecache into a pipe
 * @in: The file to read from
 * @ppos: Pointer to the file position to read from
 * @pipe: The pipe to splice into
 * @len: The amount to splice
 * @flags: The SPLICE_F_* flags
 *
 * This function gets folios from a file's pagecache and splices them into the
 * pipe.  Readahead will be called as necessary to fill more folios.  This may
 * be used for blockdevs also.
 *
 * Return: On success, the number of bytes read will be returned and *@ppos
 * will be updated if appropriate; 0 will be returned if there is no more data
 * to be read; -EAGAIN will be returned if the pipe had no space, and some
 * other negative error code will be returned on error.  A short read may occur
 * if the pipe has insufficient space, we reach the end of the data or we hit a
 * hole.
 */
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
                            struct pipe_inode_info *pipe,
                            size_t len, unsigned int flags)
{
        struct folio_batch fbatch;
        struct kiocb iocb;
        size_t total_spliced = 0, used, npages;
        loff_t isize, end_offset;
        bool writably_mapped;
        int i, error = 0;

        /* Nothing can be read at or beyond the filesystem's size limit. */
        if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
                return 0;

        init_sync_kiocb(&iocb, in);
        iocb.ki_pos = *ppos;

        /* Work out how much data we can actually add into the pipe */
        used = pipe_occupancy(pipe->head, pipe->tail);
        npages = max_t(ssize_t, pipe->max_usage - used, 0);
        len = min_t(size_t, len, npages * PAGE_SIZE);

        folio_batch_init(&fbatch);

        do {
                cond_resched();

                if (*ppos >= i_size_read(in->f_mapping->host))
                        break;

                /* *ppos advances as data is spliced; keep the iocb in step. */
                iocb.ki_pos = *ppos;
                error = filemap_get_pages(&iocb, len, &fbatch, true);
                if (error < 0)
                        break;

                /*
                 * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Sampling i_size at this point lets end_offset (and the
                 * per-folio length below) bound the splice to the current
                 * file size, so the zero-filled part of the last page is
                 * not handed to the pipe (unless another truncate extends
                 * the file - this is desired though).
                 */
                isize = i_size_read(in->f_mapping->host);
                if (unlikely(*ppos >= isize))
                        break;
                end_offset = min_t(loff_t, isize, *ppos + len);

                /*
                 * Once we start copying data, we don't want to be touching any
                 * cachelines that might be contended:
                 */
                writably_mapped = mapping_writably_mapped(in->f_mapping);

                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];
                        size_t n;

                        /* Folios starting at or past end_offset are not wanted. */
                        if (folio_pos(folio) >= end_offset)
                                goto out;
                        folio_mark_accessed(folio);

                        /*
                         * If users can be writing to this folio using arbitrary
                         * virtual addresses, take care of potential aliasing
                         * before reading the folio on the kernel side.
                         */
                        if (writably_mapped)
                                flush_dcache_folio(folio);

                        n = min_t(loff_t, len, isize - *ppos);
                        n = splice_folio_into_pipe(pipe, folio, *ppos, n);
                        /* Nothing could be queued from this folio: stop. */
                        if (!n)
                                goto out;
                        len -= n;
                        total_spliced += n;
                        *ppos += n;
                        in->f_ra.prev_pos = *ppos;
                        if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                                goto out;
                }

                folio_batch_release(&fbatch);
        } while (len);

out:
        /* Also reached via break/goto, possibly with folios still batched. */
        folio_batch_release(&fbatch);
        file_accessed(in);

        return total_spliced ? total_spliced : error;
}
EXPORT_SYMBOL(filemap_splice_read);

/*
 * Helper for mapping_seek_hole_data(): examine one page cache entry.
 *
 * @xas: iteration state; it is paused before the RCU read lock is dropped.
 * @mapping: address space the entry was found in.
 * @folio: the entry, either a folio or an xarray value entry.
 * @start: position to start looking from.
 * @end: position returned when nothing matching is found in the quick checks.
 * @seek_data: true when looking for data, false when looking for a hole.
 *
 * Value entries and uptodate folios are treated as all data; folios that are
 * not uptodate are treated as all hole when the mapping has no
 * ->is_partially_uptodate operation.  Otherwise the folio is locked and
 * probed one block (i_blocksize) at a time.
 *
 * Called with the RCU read lock held; the lock is dropped and re-taken on
 * the slow path, and is held again on return.
 *
 * Return: the (possibly advanced) @start position, or @end.
 */
static inline loff_t folio_seek_hole_data(struct xa_state *xas,
                struct address_space *mapping, struct folio *folio,
                loff_t start, loff_t end, bool seek_data)
{
        const struct address_space_operations *ops = mapping->a_ops;
        size_t offset, bsz = i_blocksize(mapping->host);

        /* Entire entry counts as data. */
        if (xa_is_value(folio) || folio_test_uptodate(folio))
                return seek_data ? start : end;
        /* No way to look inside the folio: entire entry counts as a hole. */
        if (!ops->is_partially_uptodate)
                return seek_data ? end : start;

        /* folio_lock() is called outside the RCU read section. */
        xas_pause(xas);
        rcu_read_unlock();
        folio_lock(folio);
        /* The folio no longer belongs to this mapping: leave start as is. */
        if (unlikely(folio->mapping != mapping))
                goto unlock;

        /* Round the in-folio offset down to a block boundary. */
        offset = offset_in_folio(folio, start) & ~(bsz - 1);

        do {
                /* Stop at the first block whose state is the one sought. */
                if (ops->is_partially_uptodate(folio, offset, bsz) ==
                                                        seek_data)
                        break;
                /* Advance start to the beginning of the next block. */
                start = (start + bsz) & ~(bsz - 1);
                offset += bsz;
        } while (offset < folio_size(folio));
unlock:
        folio_unlock(folio);
        rcu_read_lock();
        return start;
}

/*
 * Size in bytes covered by a page cache entry: the folio size for a real
 * folio, or PAGE_SIZE scaled by the xarray order for a value entry.
 */
static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
{
        if (!xa_is_value(folio))
                return folio_size(folio);

        return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
}

/**
 * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
 * @mapping: Address space to search.
 * @start: First byte to consider.
 * @end: Limit of search (exclusive).
 * @whence: Either SEEK_HOLE or SEEK_DATA.
 *
 * If the page cache knows which blocks contain holes and which blocks
 * contain data, your filesystem can use this function to implement
 * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
 * entirely memory-based such as tmpfs, and filesystems which support
 * unwritten extents.
 *
 * Return: The requested offset on success, or -ENXIO if @whence specifies
 * SEEK_DATA and there is no data after @start.  There is an implicit hole
 * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
 * and @end contain data.
 */
loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
                loff_t end, int whence)
{
        XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
        /* Index of the last page that lies inside [start, end). */
        pgoff_t max = (end - 1) >> PAGE_SHIFT;
        /* Any @whence other than SEEK_DATA is handled as a hole search. */
        bool seek_data = (whence == SEEK_DATA);
        struct folio *folio;

        /* Empty or inverted range. */
        if (end <= start)
                return -ENXIO;

        rcu_read_lock();
        while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
                /* Byte position of the index the iterator stopped at. */
                loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
                size_t seek_size;

                /*
                 * Nothing is cached between start and this entry: a hole
                 * search ends at start, a data search skips to the entry.
                 */
                if (start < pos) {
                        if (!seek_data)
                                goto unlock;
                        start = pos;
                }

                seek_size = seek_folio_size(&xas, folio);
                /* pos becomes the first byte after this entry. */
                pos = round_up((u64)pos + 1, seek_size);
                start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
                                seek_data);
                /* The helper stopped inside this entry: that is the answer. */
                if (start < pos)
                        goto unlock;
                /* Search limit reached; the entry is released after the loop. */
                if (start >= end)
                        break;
                /* Entry spans several pages: restart the walk just past it. */
                if (seek_size > PAGE_SIZE)
                        xas_set(&xas, pos >> PAGE_SHIFT);
                if (!xa_is_value(folio))
                        folio_put(folio);
        }
        /* Getting here means a data search found nothing. */
        if (seek_data)
                start = -ENXIO;
unlock:
        rcu_read_unlock();
        /* folio is NULL when the loop ran out of entries. */
        if (folio && !xa_is_value(folio))
                folio_put(folio);
        /* Never report a position beyond the search limit. */
        if (start > end)
                return end;
        return start;
}

#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS  (100)
/*
 * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
 * @vmf - the vm_fault for this fault.
 * @folio - the folio to lock.
 * @fpin - the pointer to the file we may pin (or is already pinned).
 *
 * Similar to lock_folio_or_retry in that the mmap_lock may be dropped.
 * Unlike it, 1 is returned with the folio locked and 0 is returned when
 * the folio could not be locked.  If the mmap_lock had to be dropped,
 * fpin points to the pinned file, which must be fput()'ed at a later
 * point.
 */
static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
                                     struct file **fpin)
{
        /* Fast path: the folio lock is free. */
        if (folio_trylock(folio))
                return 1;

        /*
         * NOTE! This will make us return with VM_FAULT_RETRY, but with
         * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
         * is supposed to work. We have way too many special cases..
         */
        if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
                return 0;

        *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);

        /* Non-killable fault: sleep until the lock is ours. */
        if (!(vmf->flags & FAULT_FLAG_KILLABLE)) {
                __folio_lock(folio);
                return 1;
        }

        if (!__folio_lock_killable(folio))
                return 1;

        /*
         * The killable wait was interrupted.  We didn't have the right
         * flags to drop the fault lock, but all fault_handlers only check
         * for fatal signals if we return VM_FAULT_RETRY, so the fault lock
         * has to be dropped here when there is no fpin.
         */
        if (!*fpin)
                release_fault_lock(vmf);
        return 0;
}

/*
 * Synchronous readahead happens when we don't even find a page in the page
 * cache at all.  We don't want to perform IO under the mmap sem, so if we have
 * to drop the mmap sem we return the file that was pinned in order for us to do
 * that.  If we didn't pin a file then we return NULL.  The file that is
 * returned needs to be fput()'ed when we're done with it.
 *
 * Depending on the VMA flags this either reads PMD-sized folios
 * (VM_HUGEPAGE), does nothing (VM_RAND_READ or readahead disabled), runs
 * ordinary sequential readahead (VM_SEQ_READ), or reads a window placed
 * around the faulting page.
 */
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
        struct file *file = vmf->vma->vm_file;
        struct file_ra_state *ra = &file->f_ra;
        struct address_space *mapping = file->f_mapping;
        DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
        struct file *fpin = NULL;
        unsigned long vm_flags = vmf->vma->vm_flags;
        unsigned int mmap_miss;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* Use the readahead code, even if readahead is disabled */
        if (vm_flags & VM_HUGEPAGE) {
                fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                /* Align the starting index down to a PMD-sized boundary. */
                ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
                ra->size = HPAGE_PMD_NR;
                /*
                 * Fetch two PMD folios, so we get the chance to actually
                 * readahead, unless we've been told not to.
                 */
                if (!(vm_flags & VM_RAND_READ))
                        ra->size *= 2;
                ra->async_size = HPAGE_PMD_NR;
                page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
                return fpin;
        }
#endif

        /* If we don't want any read-ahead, don't bother */
        if (vm_flags & VM_RAND_READ)
                return fpin;
        if (!ra->ra_pages)
                return fpin;

        if (vm_flags & VM_SEQ_READ) {
                fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                page_cache_sync_ra(&ractl, ra->ra_pages);
                return fpin;
        }

        /*
         * Avoid banging the cache line if not needed: the miss counter
         * stops being written once it reaches MMAP_LOTSAMISS * 10.
         */
        mmap_miss = READ_ONCE(ra->mmap_miss);
        if (mmap_miss < MMAP_LOTSAMISS * 10)
                WRITE_ONCE(ra->mmap_miss, ++mmap_miss);

        /*
         * Do we miss much more than hit in this file? If so,
         * stop bothering with read-ahead. It will only hurt.
         */
        if (mmap_miss > MMAP_LOTSAMISS)
                return fpin;

        /*
         * mmap read-around: the window is ra_pages long and starts half a
         * window before the faulting page (clamped at index 0); the last
         * quarter of it is the async part.
         */
        fpin = maybe_unlock_mmap_for_io(vmf, fpin);
        ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
        ra->size = ra->ra_pages;
        ra->async_size = ra->ra_pages / 4;
        ractl._index = ra->start;
        page_cache_ra_order(&ractl, ra, 0);
        return fpin;
}

/*
 * Asynchronous readahead is triggered when the faulting folio is found in
 * the page cache and carries the readahead flag, so that the readahead
 * window may be extended further.  Every call that is not filtered out
 * also takes one off the file's mmap miss counter (if it is non-zero).
 * The file that was pinned is returned if the mmap_lock had to be dropped
 * in order to do IO, NULL otherwise.
 */
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
                                            struct folio *folio)
{
        struct file *file = vmf->vma->vm_file;
        struct file_ra_state *ra = &file->f_ra;
        DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
        struct file *pinned;
        unsigned int misses;

        /* If we don't want any read-ahead, don't bother */
        if ((vmf->vma->vm_flags & VM_RAND_READ) || ra->ra_pages == 0)
                return NULL;

        misses = READ_ONCE(ra->mmap_miss);
        if (misses > 0)
                WRITE_ONCE(ra->mmap_miss, misses - 1);

        if (!folio_test_readahead(folio))
                return NULL;

        pinned = maybe_unlock_mmap_for_io(vmf, NULL);
        page_cache_async_ra(&ractl, folio, ra->ra_pages);
        return pinned;
}

/*
 * Re-check, for mlocked VMAs only, that the faulting PTE is really still
 * "none" before filemap_fault() goes on to account and service a major
 * fault.
 *
 * Return: 0 when the fault should proceed, VM_FAULT_NOPAGE when the PTE
 * turned out to be populated or the page table could not be mapped.
 */
static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret = 0;
        pte_t *ptep;

        /*
         * We might have COW'ed a pagecache folio and might now have an mlocked
         * anon folio mapped. The original pagecache folio is not mlocked and
         * might have been evicted. During a read+clear/modify/write update of
         * the PTE, such as done in do_numa_page()/change_pte_range(), we
         * temporarily clear the PTE under PT lock and might detect it here as
         * "none" when not holding the PT lock.
         *
         * Not rechecking the PTE under PT lock could result in an unexpected
         * major fault in an mlock'ed region. Recheck only for this special
         * scenario while holding the PT lock, to not degrade non-mlocked
         * scenarios. Recheck the PTE without PT lock firstly, thereby reducing
         * the number of times we hold PT lock.
         */
        if (!(vma->vm_flags & VM_LOCKED))
                return 0;

        if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
                return 0;

        /*
         * Map the PTE and refresh vmf->ptl in the same step.  Whatever
         * value vmf->ptl held before this call may belong to a page table
         * that has since been replaced, so it must not be trusted to
         * protect the table mapped here.
         */
        ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
                                     &vmf->ptl);
        if (unlikely(!ptep))
                return VM_FAULT_NOPAGE;

        if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
                ret = VM_FAULT_NOPAGE;
        } else {
                /* Looks empty without the lock; confirm under the PT lock. */
                spin_lock(vmf->ptl);
                if (unlikely(!pte_none(ptep_get(ptep))))
                        ret = VM_FAULT_NOPAGE;
                spin_unlock(vmf->ptl);
        }
        pte_unmap(ptep);
        return ret;
}

/**
 * filemap_fault - read in file data for page fault handling
 * @vmf:        struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * vma->vm_mm->mmap_lock must be held on entry.
 *
 * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
 * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
 *
 * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
 * has not been released.
 *
 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
 *
 * On success the folio is returned locked, with vmf->page pointing at the
 * page for the faulting index and VM_FAULT_LOCKED set in the return value.
 *
 * Return: bitwise-OR of %VM_FAULT_ codes.
 */
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
        int error;
        struct file *file = vmf->vma->vm_file;
        /* Non-NULL once the mmap_lock was dropped and the file pinned. */
        struct file *fpin = NULL;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        pgoff_t max_idx, index = vmf->pgoff;
        struct folio *folio;
        vm_fault_t ret = 0;
        /* Tracks whether the shared invalidate lock is currently held. */
        bool mapping_locked = false;

        /* Faults at or beyond the last page covered by i_size get SIGBUS. */
        max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        if (unlikely(index >= max_idx))
                return VM_FAULT_SIGBUS;

        /*
         * Do we have something in the page cache already?
         */
        folio = filemap_get_folio(mapping, index);
        if (likely(!IS_ERR(folio))) {
                /*
                 * We found the page, so try async readahead before waiting for
                 * the lock.
                 */
                if (!(vmf->flags & FAULT_FLAG_TRIED))
                        fpin = do_async_mmap_readahead(vmf, folio);
                /* A read may be needed below; take the invalidate lock now. */
                if (unlikely(!folio_test_uptodate(folio))) {
                        filemap_invalidate_lock_shared(mapping);
                        mapping_locked = true;
                }
        } else {
                ret = filemap_fault_recheck_pte_none(vmf);
                if (unlikely(ret))
                        return ret;

                /* No page in the page cache at all */
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
                ret = VM_FAULT_MAJOR;
                fpin = do_sync_mmap_readahead(vmf);
retry_find:
                /*
                 * See comment in filemap_create_folio() why we need
                 * invalidate_lock
                 */
                if (!mapping_locked) {
                        filemap_invalidate_lock_shared(mapping);
                        mapping_locked = true;
                }
                folio = __filemap_get_folio(mapping, index,
                                          FGP_CREAT|FGP_FOR_MMAP,
                                          vmf->gfp_mask);
                if (IS_ERR(folio)) {
                        /* mmap_lock already dropped: retry instead of OOM. */
                        if (fpin)
                                goto out_retry;
                        filemap_invalidate_unlock_shared(mapping);
                        return VM_FAULT_OOM;
                }
        }

        if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
                goto out_retry;

        /* Did it get truncated? */
        if (unlikely(folio->mapping != mapping)) {
                folio_unlock(folio);
                folio_put(folio);
                goto retry_find;
        }
        VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);

        /*
         * We have a locked folio in the page cache, now we need to check
         * that it's up-to-date. If not, it is going to be due to an error,
         * or because readahead was otherwise unable to retrieve it.
         */
        if (unlikely(!folio_test_uptodate(folio))) {
                /*
                 * If the invalidate lock is not held, the folio was in cache
                 * and uptodate and now it is not. Strange but possible since we
                 * didn't hold the page lock all the time. Let's drop
                 * everything, get the invalidate lock and try again.
                 */
                if (!mapping_locked) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto retry_find;
                }

                /*
                 * OK, the folio is really not uptodate. This can be because the
                 * VMA has the VM_RAND_READ flag set, or because an error
                 * arose. Let's read it in directly.
                 */
                goto page_not_uptodate;
        }

        /*
         * We've made it this far and we had to drop our mmap_lock, now is the
         * time to return to the upper layer and have it re-find the vma and
         * redo the fault.
         */
        if (fpin) {
                folio_unlock(folio);
                goto out_retry;
        }
        if (mapping_locked)
                filemap_invalidate_unlock_shared(mapping);

        /*
         * Found the page and have a reference on it.
         * We must recheck i_size under page lock.
         */
        max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        if (unlikely(index >= max_idx)) {
                folio_unlock(folio);
                folio_put(folio);
                return VM_FAULT_SIGBUS;
        }

        /* Success: hand the locked folio's page back to the fault handler. */
        vmf->page = folio_file_page(folio, index);
        return ret | VM_FAULT_LOCKED;

page_not_uptodate:
        /*
         * Umm, take care of errors if the page isn't up-to-date.
         * Try to re-read it _once_. We do this synchronously,
         * because there really aren't any performance issues here
         * and we need to check for errors.
         */
        fpin = maybe_unlock_mmap_for_io(vmf, fpin);
        error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
        if (fpin)
                goto out_retry;
        folio_put(folio);

        /* Read succeeded or the folio was truncated: look it up again. */
        if (!error || error == AOP_TRUNCATED_PAGE)
                goto retry_find;
        filemap_invalidate_unlock_shared(mapping);

        return VM_FAULT_SIGBUS;

out_retry:
        /*
         * We dropped the mmap_lock, we need to return to the fault handler to
         * re-find the vma and come back and find our hopefully still populated
         * page.
         */
        if (!IS_ERR(folio))
                folio_put(folio);
        if (mapping_locked)
                filemap_invalidate_unlock_shared(mapping);
        if (fpin)
                fput(fpin);
        return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);

/*
 * Try to deal with the fault at PMD level before falling back to PTEs.
 *
 * Returns true when the caller has nothing left to map; on that path the
 * folio has already been unlocked.  Returns false when the caller should go
 * on to map individual PTEs, in which case the folio is left untouched.
 */
static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
                pgoff_t start)
{
        /* A huge mapping is already in place, so this folio is not needed. */
        if (pmd_trans_huge(*vmf->pmd)) {
                folio_unlock(folio);
                folio_put(folio);
                return true;
        }

        if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
                /* On success the new mapping consumes our folio reference. */
                if (!do_set_pmd(vmf, folio_file_page(folio, start))) {
                        folio_unlock(folio);
                        return true;
                }
        }

        /* PMD still empty: hand over the preallocated page table, if any. */
        if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)
                pmd_install(vmf->vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);

        return false;
}

/*
 * Advance @xas to the next folio of @mapping, up to @end_pgoff, that is
 * suitable for mapping right now.
 *
 * Entries are passed over when they are xarray retry or value entries, when
 * the folio is already locked, when a reference or the folio lock cannot be
 * taken without waiting, or when the folio is not uptodate, has the
 * readahead flag set, no longer belongs to @mapping, or sits at or beyond
 * the current i_size.
 *
 * Return: the folio, locked and with a reference held, or NULL once the walk
 * finds no further entry.
 */
static struct folio *next_uptodate_folio(struct xa_state *xas,
                struct address_space *mapping, pgoff_t end_pgoff)
{
        struct folio *folio = xas_next_entry(xas, end_pgoff);
        unsigned long max_idx;

        do {
                if (!folio)
                        return NULL;
                /*
                 * Each "continue" below jumps to the loop condition, which
                 * fetches the next entry; no reference is held at this point.
                 */
                if (xas_retry(xas, folio))
                        continue;
                if (xa_is_value(folio))
                        continue;
                if (folio_test_locked(folio))
                        continue;
                if (!folio_try_get_rcu(folio))
                        continue;
                /* From here on a reference is held: leave via skip/unlock. */
                /* Has the page moved or been split? */
                if (unlikely(folio != xas_reload(xas)))
                        goto skip;
                if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
                        goto skip;
                if (!folio_trylock(folio))
                        goto skip;
                /* Recheck under the folio lock. */
                if (folio->mapping != mapping)
                        goto unlock;
                if (!folio_test_uptodate(folio))
                        goto unlock;
                /* Do not hand out folios at or past the end of the file. */
                max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
                if (xas->xa_index >= max_idx)
                        goto unlock;
                return folio;
unlock:
                folio_unlock(folio);
skip:
                folio_put(folio);
        } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);

        return NULL;
}

/*
 * Map page range [start_page, start_page + nr_pages) of folio.
 * start_page is gotten from start by folio_page(folio, start)
 *
 * Pages are installed in batches: consecutive pages that are not hwpoisoned
 * and whose PTE slot is still empty are counted, and the batch is installed
 * with a single set_pte_range() call when a page has to be skipped or the
 * range ends.  Every mapped page adds one to *rss and one folio reference.
 *
 * *mmap_miss is incremented for each non-poisoned page visited while the
 * folio does not have the workingset flag set.
 *
 * vmf->pte is advanced during the walk and restored before returning.
 *
 * Return: VM_FAULT_NOPAGE if one of the mapped batches covers vmf->address,
 * otherwise 0.
 */
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
                        struct folio *folio, unsigned long start,
                        unsigned long addr, unsigned int nr_pages,
                        unsigned long *rss, unsigned int *mmap_miss)
{
        vm_fault_t ret = 0;
        struct page *page = folio_page(folio, start);
        /* Length of the current batch; page/vmf->pte/addr point at its start. */
        unsigned int count = 0;
        pte_t *old_ptep = vmf->pte;

        do {
                if (PageHWPoison(page + count))
                        goto skip;

                /*
                 * If there are too many folios that are recently evicted
                 * in a file, they will probably continue to be evicted.
                 * In such situation, read-ahead is only a waste of IO.
                 * Don't decrease mmap_miss in this scenario to make sure
                 * we can stop read-ahead.
                 */
                if (!folio_test_workingset(folio))
                        (*mmap_miss)++;

                /*
                 * NOTE: If there're PTE markers, we'll leave them to be
                 * handled in the specific fault path, and it'll prohibit the
                 * fault-around logic.
                 */
                if (!pte_none(ptep_get(&vmf->pte[count])))
                        goto skip;

                /* This page joins the pending batch. */
                count++;
                continue;
skip:
                /* Install whatever was batched before the page being skipped. */
                if (count) {
                        set_pte_range(vmf, folio, page, count, addr);
                        *rss += count;
                        folio_ref_add(folio, count);
                        if (in_range(vmf->address, addr, count * PAGE_SIZE))
                                ret = VM_FAULT_NOPAGE;
                }

                /*
                 * Step over the batch plus the skipped page itself, then
                 * start a fresh batch from the following page.
                 */
                count++;
                page += count;
                vmf->pte += count;
                addr += count * PAGE_SIZE;
                count = 0;
        } while (--nr_pages > 0);

        /* Install the batch still pending when the range ran out. */
        if (count) {
                set_pte_range(vmf, folio, page, count, addr);
                *rss += count;
                folio_ref_add(folio, count);
                if (in_range(vmf->address, addr, count * PAGE_SIZE))
                        ret = VM_FAULT_NOPAGE;
        }

        vmf->pte = old_ptep;

        return ret;
}

/*
 * Map the single page of a small folio at @addr through vmf->pte.
 *
 * Nothing is mapped if the page is hwpoisoned or the PTE slot is already
 * occupied.  On success *rss is bumped and an extra folio reference is taken
 * for the new mapping.
 *
 * Return: VM_FAULT_NOPAGE if the page was mapped at vmf->address, otherwise 0.
 */
static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
                struct folio *folio, unsigned long addr,
                unsigned long *rss, unsigned int *mmap_miss)
{
        struct page *page = &folio->page;

        if (PageHWPoison(page))
                return 0;

        /*
         * Folios without the workingset flag count towards *mmap_miss; for
         * recently evicted (workingset) folios the counter is left alone so
         * that read-ahead can still be stopped.
         */
        if (!folio_test_workingset(folio))
                *mmap_miss += 1;

        /*
         * A populated slot, PTE markers included, is left for the specific
         * fault path to handle; fault-around does not touch it.
         */
        if (!pte_none(ptep_get(vmf->pte)))
                return 0;

        set_pte_range(vmf, folio, page, 1, addr);
        *rss += 1;
        folio_ref_inc(folio);

        return vmf->address == addr ? VM_FAULT_NOPAGE : 0;
}

/*
 * Fault-around helper: map the folios that next_uptodate_folio() yields for
 * page offsets @start_pgoff..@end_pgoff of the file backing vmf->vma.
 *
 * The whole walk runs under rcu_read_lock().  A PMD-level mapping is tried
 * first; otherwise the PTE table is mapped and locked once and each folio is
 * mapped through filemap_map_order0_folio() or filemap_map_folio_range().
 *
 * Before returning, the number of mmap_miss events collected by the helpers
 * is subtracted from file->f_ra.mmap_miss, which is clamped at zero.
 *
 * Return: 0, or VM_FAULT_NOPAGE if the PMD path handled the fault or one of
 * the helpers reported a mapping covering vmf->address.
 */
vm_fault_t filemap_map_pages(struct vm_fault *vmf,
                             pgoff_t start_pgoff, pgoff_t end_pgoff)
{
        struct vm_area_struct *vma = vmf->vma;
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        pgoff_t last_pgoff = start_pgoff;
        unsigned long addr;
        XA_STATE(xas, &mapping->i_pages, start_pgoff);
        struct folio *folio;
        vm_fault_t ret = 0;
        unsigned long rss = 0;
        unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;

        rcu_read_lock();
        folio = next_uptodate_folio(&xas, mapping, end_pgoff);
        if (!folio)
                goto out;

        /* filemap_map_pmd() has already unlocked the folio when it returns true. */
        if (filemap_map_pmd(vmf, folio, start_pgoff)) {
                ret = VM_FAULT_NOPAGE;
                goto out;
        }

        addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
        if (!vmf->pte) {
                folio_unlock(folio);
                folio_put(folio);
                goto out;
        }

        /* The counter type of the first folio is used for the whole batch. */
        folio_type = mm_counter_file(folio);
        do {
                unsigned long end;

                /* Move addr and the PTE cursor forward to this folio's index. */
                addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
                vmf->pte += xas.xa_index - last_pgoff;
                last_pgoff = xas.xa_index;
                /* Map up to the folio's last page, but not beyond end_pgoff. */
                end = folio_next_index(folio) - 1;
                nr_pages = min(end, end_pgoff) - xas.xa_index + 1;

                if (!folio_test_large(folio))
                        ret |= filemap_map_order0_folio(vmf,
                                        folio, addr, &rss, &mmap_miss);
                else
                        ret |= filemap_map_folio_range(vmf, folio,
                                        xas.xa_index - folio->index, addr,
                                        nr_pages, &rss, &mmap_miss);

                /* Drop the lock and reference taken by next_uptodate_folio(). */
                folio_unlock(folio);
                folio_put(folio);
        } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
        add_mm_counter(vma->vm_mm, folio_type, rss);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
        rcu_read_unlock();

        /* Lower the file's mmap_miss count by what was seen, never below 0. */
        mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
        if (mmap_miss >= mmap_miss_saved)
                WRITE_ONCE(file->f_ra.mmap_miss, 0);
        else
                WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);

        return ret;
}
EXPORT_SYMBOL(filemap_map_pages);

/*
 * Generic ->page_mkwrite handler.
 *
 * Updates the file times, locks the folio behind vmf->page and, provided it
 * still belongs to the file's mapping, dirties it and waits for it to become
 * stable.  The work is bracketed by sb_start_pagefault()/sb_end_pagefault().
 *
 * Return: VM_FAULT_LOCKED with the folio left locked, or VM_FAULT_NOPAGE
 * (folio unlocked again) if the folio no longer belongs to the mapping.
 */
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        struct folio *folio = page_folio(vmf->page);
        vm_fault_t ret;

        sb_start_pagefault(mapping->host->i_sb);
        file_update_time(vmf->vma->vm_file);
        folio_lock(folio);
        if (folio->mapping == mapping) {
                /*
                 * Dirty the folio right away: should a freeze be in
                 * progress, writeback during freezing is then guaranteed to
                 * see the dirty folio and write-protect it again.
                 */
                folio_mark_dirty(folio);
                folio_wait_stable(folio);
                ret = VM_FAULT_LOCKED;
        } else {
                folio_unlock(folio);
                ret = VM_FAULT_NOPAGE;
        }
        sb_end_pagefault(mapping->host->i_sb);
        return ret;
}

/*
 * vm_operations_struct built from the generic filemap handlers;
 * generic_file_mmap() installs it as vma->vm_ops.
 */
const struct vm_operations_struct generic_file_vm_ops = {
        .fault                = filemap_fault,
        .map_pages        = filemap_map_pages,
        .page_mkwrite        = filemap_page_mkwrite,
};

/*
 * General-purpose mmap handler for a disk file.
 *
 * Refuses the mapping with -ENOEXEC when the address space has no
 * ->read_folio method.  Otherwise marks the file accessed, installs
 * generic_file_vm_ops on the VMA and returns 0.
 */
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        if (!file->f_mapping->a_ops->read_folio)
                return -ENOEXEC;

        file_accessed(file);
        vma->vm_ops = &generic_file_vm_ops;
        return 0;
}

/*
 * mmap handler for filesystems which do not implement ->writepage.
 *
 * A VMA for which vma_is_shared_maywrite() holds is rejected with -EINVAL;
 * everything else is handed to generic_file_mmap().
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
        return vma_is_shared_maywrite(vma) ? -EINVAL :
                                             generic_file_mmap(file, vma);
}
#else
/* Stub for the #else side of the CONFIG_MMU conditional: always VM_FAULT_SIGBUS. */
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
        return VM_FAULT_SIGBUS;
}
/* Stub for the #else side of the CONFIG_MMU conditional: always fails with -ENOSYS. */
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        return -ENOSYS;
}
/* Stub for the #else side of the CONFIG_MMU conditional: always fails with -ENOSYS. */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
        return -ENOSYS;
}
#endif /* CONFIG_MMU */

/*
 * Exported after the CONFIG_MMU conditional so that the exports apply to
 * whichever set of definitions was compiled.
 */
EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);

/*
 * Common helper behind read_cache_folio() and mapping_read_folio_gfp().
 *
 * Looks up the folio at @index in @mapping; if none is cached, an order-0
 * folio is allocated with @gfp and added to the mapping.  Unless the folio
 * is already uptodate it is read in through filemap_read_folio() with
 * @filler (NULL selects mapping->a_ops->read_folio) and @file.
 *
 * The lookup is restarted when the insertion hits -EEXIST, when the folio
 * was locked by somebody else, when it was truncated from the mapping, or
 * when the read reports AOP_TRUNCATED_PAGE.
 *
 * Return: the folio, after folio_mark_accessed(), or an ERR_PTR() carrying
 * -ENOMEM, the insertion error or the read error.
 */
static struct folio *do_read_cache_folio(struct address_space *mapping,
                pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
{
        struct folio *folio;
        int err;

        if (!filler)
                filler = mapping->a_ops->read_folio;
repeat:
        folio = filemap_get_folio(mapping, index);
        if (IS_ERR(folio)) {
                /* Nothing cached at this index: allocate and insert a folio. */
                folio = filemap_alloc_folio(gfp, 0);
                if (!folio)
                        return ERR_PTR(-ENOMEM);
                err = filemap_add_folio(mapping, folio, index, gfp);
                if (unlikely(err)) {
                        folio_put(folio);
                        /* Somebody else inserted a folio first; look it up. */
                        if (err == -EEXIST)
                                goto repeat;
                        /* Presumably ENOMEM for xarray node */
                        return ERR_PTR(err);
                }

                goto filler;
        }
        /* Fast path: already uptodate, no need to take the folio lock. */
        if (folio_test_uptodate(folio))
                goto out;

        if (!folio_trylock(folio)) {
                /* Wait for the current lock holder, then start over. */
                folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
                goto repeat;
        }

        /* Folio was truncated from mapping */
        if (!folio->mapping) {
                folio_unlock(folio);
                folio_put(folio);
                goto repeat;
        }

        /* Someone else locked and filled the page in a very small window */
        if (folio_test_uptodate(folio)) {
                folio_unlock(folio);
                goto out;
        }

filler:
        err = filemap_read_folio(file, filler, folio);
        if (err) {
                folio_put(folio);
                if (err == AOP_TRUNCATED_PAGE)
                        goto repeat;
                return ERR_PTR(err);
        }

out:
        folio_mark_accessed(folio);
        return folio;
}

/**
 * read_cache_folio - Read into page cache, fill it if needed.
 * @mapping: The address_space to read from.
 * @index: The index to read.
 * @filler: Function to perform the read, or NULL to use aops->read_folio().
 * @file: Passed to filler function, may be NULL if not required.
 *
 * Reads one page into the page cache, allocating with the mapping's own
 * gfp mask.  On success the returned folio contains @index, although
 * @index need not be the first page of that folio.
 *
 * An error returned by the filler function is passed back to the caller.
 *
 * Context: May sleep.  Expects mapping->invalidate_lock to be held.
 * Return: An uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
                filler_t filler, struct file *file)
{
        gfp_t gfp = mapping_gfp_mask(mapping);

        return do_read_cache_folio(mapping, index, filler, file, gfp);
}
EXPORT_SYMBOL(read_cache_folio);

/**
 * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
 * @mapping:        The address_space for the folio.
 * @index:        The index that the allocated folio will contain.
 * @gfp:        The page allocator flags to use if allocating.
 *
 * Equivalent to "read_cache_folio(mapping, index, NULL, NULL)", except that
 * any new memory is allocated with the flags given in @gfp.
 *
 * EIO is the most likely error, but ENOMEM and EINTR are possible too.  Any
 * other error returned by ->read_folio is passed back to the caller.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: Uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *mapping_read_folio_gfp(struct address_space *mapping,
                pgoff_t index, gfp_t gfp)
{
        struct folio *folio;

        /* A NULL filler selects ->read_folio; no struct file is passed on. */
        folio = do_read_cache_folio(mapping, index, NULL, NULL, gfp);
        return folio;
}
EXPORT_SYMBOL(mapping_read_folio_gfp);

/*
 * Page-returning wrapper around do_read_cache_folio(): yields the page of
 * the folio that corresponds to @index, or passes the error value through.
 */
static struct page *do_read_cache_page(struct address_space *mapping,
                pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
{
        struct folio *folio = do_read_cache_folio(mapping, index, filler,
                                                  file, gfp);

        /*
         * NOTE(review): &folio->page on an ERR_PTR presumably relies on the
         * page being at offset 0 of struct folio, so the error value is
         * returned unchanged - confirm against the struct definition.
         */
        return IS_ERR(folio) ? &folio->page : folio_file_page(folio, index);
}

/*
 * Page flavour of read_cache_folio(): reads the page at @index with @filler,
 * allocating with the mapping's own gfp mask.  Returns the page or an
 * ERR_PTR().
 */
struct page *read_cache_page(struct address_space *mapping,
                        pgoff_t index, filler_t *filler, struct file *file)
{
        gfp_t gfp = mapping_gfp_mask(mapping);

        return do_read_cache_page(mapping, index, filler, file, gfp);
}
EXPORT_SYMBOL(read_cache_page);

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping:        the page's address_space
 * @index:        the page index
 * @gfp:        the page allocator flags to use if allocating
 *
 * Behaves like "read_mapping_page(mapping, index, NULL)", except that any
 * new page is allocated with the flags given in @gfp.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
                                pgoff_t index,
                                gfp_t gfp)
{
        struct page *page;

        /* A NULL filler selects ->read_folio; no struct file is passed on. */
        page = do_read_cache_page(mapping, index, NULL, NULL, gfp);
        return page;
}
EXPORT_SYMBOL(read_cache_page_gfp);

/*
 * Report a page cache invalidation failure during a direct I/O write.
 *
 * -EIO is always recorded in the mapping's wb_err.  The log messages are
 * rate limited; when they are emitted they name the file (or "(unknown)" if
 * its path cannot be resolved) and the current task.
 */
static void dio_warn_stale_pagecache(struct file *filp)
{
        static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
        char buf[128];
        char *name;

        errseq_set(&filp->f_mapping->wb_err, -EIO);
        if (!__ratelimit(&_rs))
                return;

        name = file_path(filp, buf, sizeof(buf));
        if (IS_ERR(name))
                name = "(unknown)";
        pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
        pr_crit("File: %s PID: %d Comm: %.20s\n", name, current->pid,
                current->comm);
}

/*
 * After a direct write of @count bytes starting at iocb->ki_pos, invalidate
 * the page cache pages covering that byte range.  Nothing is done for a
 * mapping without pages; a failed invalidation is reported through
 * dio_warn_stale_pagecache().
 */
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;

        if (!mapping->nrpages)
                return;

        if (!invalidate_inode_pages2_range(mapping,
                        iocb->ki_pos >> PAGE_SHIFT,
                        (iocb->ki_pos + count - 1) >> PAGE_SHIFT))
                return;

        dio_warn_stale_pagecache(iocb->ki_filp);
}

/*
 * Perform a direct I/O write of @from at iocb->ki_pos through the mapping's
 * ->direct_IO method.
 *
 * Return: the value returned by ->direct_IO, 0 if the page cache could not
 * be invalidated beforehand because it was busy (so that the caller can fall
 * back to a buffered write), or another error from that invalidation.
 */
ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        size_t remaining = iov_iter_count(from);
        ssize_t ret;

        /*
         * A page that can not be invalidated (-EBUSY) is not an error here:
         * report 0 bytes written so the buffered path takes over.
         */
        ret = kiocb_invalidate_pages(iocb, remaining);
        if (ret)
                return ret == -EBUSY ? 0 : ret;

        ret = mapping->a_ops->direct_IO(iocb, from);

        /*
         * Invalidate once more: clean pages may have been cached meanwhile
         * by non-direct readahead, or faulted in by get_user_pages() when
         * the source of the write is an mmap'ed region of the very file
         * being written.  Both are pretty crazy things to do and are not
         * supported 100%; if this invalidation fails, the write still
         * worked.
         *
         * Usually dio_complete() already does this for us, but some file
         * systems never reach dio_complete(), so the call stays.  A
         * noticeable example is blkdev_direct_IO().
         *
         * Async writes and mappings without pages skip the invalidation.
         */
        if (ret > 0) {
                struct inode *inode = mapping->host;
                loff_t end_pos = iocb->ki_pos + ret;

                kiocb_invalidate_post_direct_write(iocb, ret);
                remaining -= ret;
                /* Extend i_size if the write went past it (not for block devices). */
                if (end_pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
                        i_size_write(inode, end_pos);
                        mark_inode_dirty(inode);
                }
                iocb->ki_pos = end_pos;
        }

        /* Unless the I/O was queued, leave @from holding exactly what was not written. */
        if (ret != -EIOCBQUEUED)
                iov_iter_revert(from, remaining - iov_iter_count(from));
        return ret;
}
EXPORT_SYMBOL(generic_file_direct_write);

/*
 * Buffered write: copy the data described by @i into the page cache of
 * iocb->ki_filp, starting at iocb->ki_pos, at most one page per iteration,
 * using the address space's ->write_begin() and ->write_end() methods.
 *
 * Return: the number of bytes written, in which case iocb->ki_pos has been
 * advanced by that amount; if nothing was written, the last status (0 or a
 * negative error code).
 */
ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
        struct file *file = iocb->ki_filp;
        loff_t pos = iocb->ki_pos;
        struct address_space *mapping = file->f_mapping;
        const struct address_space_operations *a_ops = mapping->a_ops;
        long status = 0;
        ssize_t written = 0;

        do {
                struct page *page;
                unsigned long offset;        /* Offset into pagecache page */
                unsigned long bytes;        /* Bytes to write to page */
                size_t copied;                /* Bytes copied from user */
                void *fsdata = NULL;

                /* Never cross a page boundary within a single iteration. */
                offset = (pos & (PAGE_SIZE - 1));
                bytes = min_t(unsigned long, PAGE_SIZE - offset,
                                                iov_iter_count(i));

again:
                /*
                 * Bring in the user page that we will copy from _first_.
                 * Otherwise there's a nasty deadlock on copying from the
                 * same page as we're writing to, without it being marked
                 * up-to-date.
                 */
                /*
                 * NOTE(review): a return value equal to bytes is treated as
                 * "nothing could be faulted in"; presumably the helper
                 * returns the number of bytes NOT faulted in - confirm.
                 */
                if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
                        status = -EFAULT;
                        break;
                }

                if (fatal_signal_pending(current)) {
                        status = -EINTR;
                        break;
                }

                status = a_ops->write_begin(file, mapping, pos, bytes,
                                                &page, &fsdata);
                if (unlikely(status < 0))
                        break;

                if (mapping_writably_mapped(mapping))
                        flush_dcache_page(page);

                copied = copy_page_from_iter_atomic(page, offset, bytes, i);
                flush_dcache_page(page);

                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                page, fsdata);
                if (unlikely(status != copied)) {
                        /*
                         * Hand back to the iterator the bytes that were
                         * copied but not accepted by ->write_end().
                         */
                        iov_iter_revert(i, copied - max(status, 0L));
                        if (unlikely(status < 0))
                                break;
                }
                cond_resched();

                if (unlikely(status == 0)) {
                        /*
                         * A short copy made ->write_end() reject the
                         * thing entirely.  Might be memory poisoning
                         * halfway through, might be a race with munmap,
                         * might be severe memory pressure.
                         */
                        /* Retry with the length that did copy, if any. */
                        if (copied)
                                bytes = copied;
                        goto again;
                }
                pos += status;
                written += status;

                balance_dirty_pages_ratelimited(mapping);
        } while (iov_iter_count(i));

        /* Nothing written at all: report the status instead of a byte count. */
        if (!written)
                return status;
        iocb->ki_pos += written;
        return written;
}
EXPORT_SYMBOL(generic_perform_write);

/**
 * __generic_file_write_iter - write data to a file
 * @iocb:        IO state structure (file, offset, etc.)
 * @from:        iov_iter with data to write
 *
 * Does the actual work of writing data to a file: removes SUID from the
 * file, updates the modification times and then dispatches to the direct IO
 * or the standard buffered write path.
 *
 * i_rwsem is expected to be held by the caller, unless the target is a block
 * device or a similar object which does not need locking at all.
 *
 * Syncing the data for an O_SYNC write is *not* done here; that is left to
 * the caller, mainly because syncing under i_rwsem is to be avoided.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        ssize_t err, direct_ret, buffered_ret;

        err = file_remove_privs(file);
        if (err)
                return err;

        err = file_update_time(file);
        if (err)
                return err;

        if (!(iocb->ki_flags & IOCB_DIRECT))
                return generic_perform_write(iocb, from);

        direct_ret = generic_file_direct_write(iocb, from);
        /*
         * A direct write that stopped short falls back to buffered writes;
         * some filesystems do this for writes to holes, for example.  DAX
         * files are excluded: a buffered write will not succeed there (and
         * even if it did, DAX does not handle dirty page-cache pages
         * correctly).
         */
        if (direct_ret < 0 || !iov_iter_count(from) ||
            IS_DAX(file->f_mapping->host))
                return direct_ret;

        buffered_ret = generic_perform_write(iocb, from);
        return direct_write_fallback(iocb, from, direct_ret, buffered_ret);
}
EXPORT_SYMBOL(__generic_file_write_iter);

/**
 * generic_file_write_iter - write data to a file
 * @iocb:        IO state structure
 * @from:        iov_iter with data to write
 *
 * Wrapper around __generic_file_write_iter() meant for use by most
 * filesystems.  It acquires i_rwsem around the write and takes care of
 * syncing the file in case of an O_SYNC file.
 * Return:
 * * negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        ssize_t written;

        inode_lock(inode);
        written = generic_write_checks(iocb, from);
        if (written > 0)
                written = __generic_file_write_iter(iocb, from);
        inode_unlock(inode);

        /* Only sync when something was written; done outside the inode lock. */
        if (written <= 0)
                return written;
        return generic_write_sync(iocb, written);
}
EXPORT_SYMBOL(generic_file_write_iter);

/**
 * filemap_release_folio() - Release fs-specific metadata on a folio.
 * @folio: The folio which the kernel is trying to free.
 * @gfp: Memory allocation flags (and I/O mode).
 *
 * Asks the owning address_space to release whatever data it has attached to
 * the folio (presumably at folio->private).
 *
 * This is also called if the private_2 flag is set on a page, indicating
 * that the folio has other metadata associated with it.
 *
 * @gfp tells whether I/O may be performed to release this page (__GFP_IO)
 * and whether the call may block (__GFP_RECLAIM & __GFP_FS).
 *
 * The folio must be locked by the caller.
 *
 * Return: %true if the release was successful, otherwise %false.
 */
bool filemap_release_folio(struct folio *folio, gfp_t gfp)
{
        struct address_space * const owner = folio->mapping;

        BUG_ON(!folio_test_locked(folio));

        /* Nothing attached: trivially released. */
        if (!folio_needs_release(folio))
                return true;
        /* A folio under writeback cannot be released. */
        if (folio_test_writeback(folio))
                return false;

        /* Without a mapping or a ->release_folio method, use the buffer helper. */
        if (!owner || !owner->a_ops->release_folio)
                return try_to_free_buffers(folio);
        return owner->a_ops->release_folio(folio, gfp);
}
EXPORT_SYMBOL(filemap_release_folio);

/**
 * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache
 * @inode: The inode to flush
 * @flush: Set to write back rather than simply invalidate.
 * @start: First byte in range.
 * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start
 *       onwards.
 *
 * Invalidate all the folios on an inode that contribute to the specified
 * range, possibly writing them back first.  Whilst the operation is
 * undertaken, the invalidate lock is held to prevent new folios from being
 * installed.
 *
 * Return: 0 if @inode has no mapping; otherwise the result of
 * filemap_check_errors() on the inode's mapping.
 */
int filemap_invalidate_inode(struct inode *inode, bool flush,
                             loff_t start, loff_t end)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t first = start >> PAGE_SHIFT;
        pgoff_t last = end >> PAGE_SHIFT;
        pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1;

        /*
         * Without a mapping there is nothing to invalidate and nothing to
         * query for errors.  Return here rather than through the common
         * exit, which would hand the NULL mapping to filemap_check_errors().
         */
        if (!mapping)
                return 0;

        /* Empty page cache or empty range: only report pending errors. */
        if (!mapping->nrpages || end < start)
                goto out;

        /* Prevent new folios from being added to the inode. */
        filemap_invalidate_lock(mapping);

        /* Recheck now that the lock is held. */
        if (!mapping->nrpages)
                goto unlock;

        unmap_mapping_pages(mapping, first, nr, false);

        /* Write back the data if we're asked to. */
        if (flush) {
                struct writeback_control wbc = {
                        .sync_mode        = WB_SYNC_ALL,
                        .nr_to_write        = LONG_MAX,
                        .range_start        = start,
                        .range_end        = end,
                };

                filemap_fdatawrite_wbc(mapping, &wbc);
        }

        /* Wait for writeback to complete on all folios and discard. */
        truncate_inode_pages_range(mapping, start, end);

unlock:
        filemap_invalidate_unlock(mapping);
out:
        return filemap_check_errors(mapping);
}
EXPORT_SYMBOL_GPL(filemap_invalidate_inode);

#ifdef CONFIG_CACHESTAT_SYSCALL
/**
 * filemap_cachestat() - compute the page cache statistics of a mapping
 * @mapping:        The mapping to compute the statistics for.
 * @first_index:        The starting page cache index.
 * @last_index:        The final page index (inclusive).
 * @cs:        the cachestat struct to write the result to.
 *
 * This will query the page cache statistics of a mapping in the
 * page range of [first_index, last_index] (inclusive). The statistics
 * queried include: number of dirty pages, number of pages marked for
 * writeback, and the number of (recently) evicted pages.
 *
 * The counters in @cs are only ever added to; they are not reset here.
 * The walk is done under rcu_read_lock() without taking folio references.
 */
static void filemap_cachestat(struct address_space *mapping,
                pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
{
        XA_STATE(xas, &mapping->i_pages, first_index);
        struct folio *folio;

        rcu_read_lock();
        xas_for_each(&xas, folio, last_index) {
                int order;
                unsigned long nr_pages;
                pgoff_t folio_first_index, folio_last_index;

                /*
                 * Don't deref the folio. It is not pinned, and might
                 * get freed (and reused) underneath us.
                 *
                 * We *could* pin it, but that would be expensive for
                 * what should be a fast and lightweight syscall.
                 *
                 * Instead, derive all information of interest from
                 * the rcu-protected xarray.
                 */

                if (xas_retry(&xas, folio))
                        continue;

                /* Size and extent of the entry, taken from the xarray itself. */
                order = xa_get_order(xas.xa, xas.xa_index);
                nr_pages = 1 << order;
                folio_first_index = round_down(xas.xa_index, 1 << order);
                folio_last_index = folio_first_index + nr_pages - 1;

                /* Folios might straddle the range boundaries, only count covered pages */
                if (folio_first_index < first_index)
                        nr_pages -= first_index - folio_first_index;

                if (folio_last_index > last_index)
                        nr_pages -= folio_last_index - last_index;

                if (xa_is_value(folio)) {
                        /* page is evicted */
                        void *shadow = (void *)folio;
                        bool workingset; /* not used */

                        cs->nr_evicted += nr_pages;

#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
                        if (shmem_mapping(mapping)) {
                                /* shmem file - in swap cache */
                                swp_entry_t swp = radix_to_swp_entry(folio);

                                /* swapin error results in poisoned entry */
                                if (non_swap_entry(swp))
                                        goto resched;

                                /*
                                 * Getting a swap entry from the shmem
                                 * inode means we beat
                                 * shmem_unuse(). rcu_read_lock()
                                 * ensures swapoff waits for us before
                                 * freeing the swapper space. However,
                                 * we can race with swapping and
                                 * invalidation, so there might not be
                                 * a shadow in the swapcache (yet).
                                 */
                                shadow = get_shadow_from_swap_cache(swp);
                                if (!shadow)
                                        goto resched;
                        }
#endif
                        if (workingset_test_recent(shadow, true, &workingset))
                                cs->nr_recently_evicted += nr_pages;

                        goto resched;
                }

                /* page is in cache */
                cs->nr_cache += nr_pages;

                /* Dirty and writeback state come from the xarray marks. */
                if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
                        cs->nr_dirty += nr_pages;

                if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
                        cs->nr_writeback += nr_pages;

resched:
                /* Pause the walk before giving the scheduler a chance to run. */
                if (need_resched()) {
                        xas_pause(&xas);
                        cond_resched_rcu();
                }
        }
        rcu_read_unlock();
}

/*
 * The cachestat(2) system call.
 *
 * cachestat() returns the page cache statistics of a file in the
 * bytes range specified by `off` and `len`: number of cached pages,
 * number of dirty pages, number of pages marked for writeback,
 * number of evicted pages, and number of recently evicted pages.
 *
 * An evicted page is a page that was previously in the page cache
 * but has been evicted since. A page is recently evicted if its last
 * eviction was recent enough that its reentry to the cache would
 * indicate that it is actively being used by the system, and that
 * there is memory pressure on the system.
 *
 * `off` and `len` must be non-negative integers. If `len` > 0,
 * the queried range is [`off`, `off` + `len`). If `len` == 0,
 * we will query in the range from `off` to the end of the file.
 *
 * The `flags` argument is unused for now, but is included for future
 * extensibility. Users should pass 0 (i.e. no flags specified).
 *
 * Currently, hugetlbfs is not supported.
 *
 * Because the status of a page can change after cachestat() checks it
 * but before it returns to the application, the returned values may
 * contain stale information.
 *
 * return values:
 *  zero        - success
 *  -EFAULT     - cstat or cstat_range points to an illegal address
 *  -EINVAL     - invalid flags
 *  -EBADF      - invalid file descriptor
 *  -EOPNOTSUPP - file descriptor is of a hugetlbfs file
 */
SYSCALL_DEFINE4(cachestat, unsigned int, fd,
                struct cachestat_range __user *, cstat_range,
                struct cachestat __user *, cstat, unsigned int, flags)
{
        struct fd f = fdget(fd);
        struct cachestat_range csr;
        struct cachestat cs;
        pgoff_t first_index, last_index;
        long err;

        if (!f.file)
                return -EBADF;

        /*
         * Argument validation. The checks run in this order: fetch the
         * requested range from user space, reject hugetlbfs files, then
         * reject any non-zero flags. The first failing check decides the
         * error code.
         */
        if (copy_from_user(&csr, cstat_range, sizeof(csr)))
                err = -EFAULT;
        else if (is_file_hugepages(f.file))
                err = -EOPNOTSUPP;
        else if (flags != 0)
                err = -EINVAL;
        else
                err = 0;

        if (err) {
                /* Drop the file reference taken by fdget() before bailing. */
                fdput(f);
                return err;
        }

        /*
         * Convert the byte range into page indices. A zero length means
         * "everything from off onwards", expressed as the largest index.
         */
        first_index = csr.off >> PAGE_SHIFT;
        if (csr.len == 0)
                last_index = ULONG_MAX;
        else
                last_index = (csr.off + csr.len - 1) >> PAGE_SHIFT;

        /* Start from an all-zero result and let the walker accumulate. */
        memset(&cs, 0, sizeof(cs));
        filemap_cachestat(f.file->f_mapping, first_index, last_index, &cs);
        fdput(f);

        /* Hand the statistics back to the caller. */
        if (copy_to_user(cstat, &cs, sizeof(cs)))
                return -EFAULT;

        return 0;
}
#endif /* CONFIG_CACHESTAT_SYSCALL */















    1 























































    1 











    1 


    1 









    1 
    1 
    1 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 */

#include <linux/kernel.h>
#include <linux/types.h>

#include "ntfs_fs.h"

/*
 * Map a single 16-bit character to its upper-case form.
 *
 * Values above 'z' are looked up in the @upcase table, which is indexed
 * by the character value itself. 'a'..'z' are converted arithmetically
 * and never touch the table; anything below 'a' is returned unchanged.
 */
static inline u16 upcase_unicode_char(const u16 *upcase, u16 chr)
{
        if (chr > 'z')
                return upcase[chr];

        return chr >= 'a' ? chr - ('a' - 'A') : chr;
}

/*
 * ntfs_cmp_names - compare two little-endian 16-bit names.
 *
 * Thanks Kari Argillander <kari.argillander@gmail.com> for idea and implementation 'bothcase'
 *
 * @s1/@l1 and @s2/@l2 are the names and their lengths in 16-bit units.
 *
 * Comparison modes:
 * - @upcase == NULL: plain case-sensitive comparison.
 * - @upcase != NULL, !@bothcase: case-insensitive comparison only.
 * - @upcase != NULL, @bothcase: case-insensitive comparison first; if the
 *   names are equal that way, the case-sensitive difference decides.
 *
 * The straightforward implementation of the last mode would scan the names
 * twice in the worst case. Here the names are scanned only once: the
 * case-sensitive scan runs until the first mismatch, remembers that
 * difference, and the case-insensitive scan continues from the same spot.
 *
 * Returns 0 if the names are equal, otherwise the non-zero difference of
 * the first mismatching units or of the lengths.
 */
int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2,
                   const u16 *upcase, bool bothcase)
{
        size_t left = min(l1, l2);
        int exact = 0;
        int folded;

        /* Pure case-insensitive mode: skip the exact scan entirely. */
        if (upcase && !bothcase)
                goto fold_case;

        while (left) {
                exact = le16_to_cpu(*s1) - le16_to_cpu(*s2);
                if (exact) {
                        if (!(bothcase && upcase))
                                return exact;

                        /* Keep 'exact' as the tie-breaker and resume here. */
                        goto fold_case;
                }

                s1++;
                s2++;
                left--;
        }
        return l1 - l2;

fold_case:
        while (left) {
                folded = upcase_unicode_char(upcase, le16_to_cpu(*s1)) -
                         upcase_unicode_char(upcase, le16_to_cpu(*s2));
                if (folded)
                        return folded;

                s1++;
                s2++;
                left--;
        }

        /* Common prefix matches when folded: length first, then exact diff. */
        folded = l1 - l2;
        if (folded)
                return folded;

        return exact;
}

/*
 * ntfs_cmp_names_cpu - compare a CPU-endian name with a little-endian name.
 *
 * Same algorithm as ntfs_cmp_names(), except that the units of @uni1 are
 * used as they are while the units of @uni2 go through le16_to_cpu().
 *
 * - @upcase == NULL: plain case-sensitive comparison.
 * - @upcase != NULL, !@bothcase: case-insensitive comparison only.
 * - @upcase != NULL, @bothcase: case-insensitive comparison first; if the
 *   names are equal that way, the case-sensitive difference decides.
 *
 * Returns 0 if the names are equal, otherwise the non-zero difference of
 * the first mismatching units or of the lengths.
 */
int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2,
                       const u16 *upcase, bool bothcase)
{
        const u16 *cpu = uni1->name;
        const __le16 *le = uni2->name;
        size_t cpu_len = uni1->len;
        size_t le_len = uni2->len;
        size_t left = min(cpu_len, le_len);
        int exact = 0;
        int folded;

        /* Pure case-insensitive mode: skip the exact scan entirely. */
        if (upcase && !bothcase)
                goto fold_case;

        while (left) {
                exact = *cpu - le16_to_cpu(*le);
                if (exact) {
                        if (!(bothcase && upcase))
                                return exact;

                        /* Keep 'exact' as the tie-breaker and resume here. */
                        goto fold_case;
                }

                cpu++;
                le++;
                left--;
        }
        return cpu_len - le_len;

fold_case:
        while (left) {
                folded = upcase_unicode_char(upcase, *cpu) -
                         upcase_unicode_char(upcase, le16_to_cpu(*le));
                if (folded)
                        return folded;

                cpu++;
                le++;
                left--;
        }

        /* Common prefix matches when folded: length first, then exact diff. */
        folded = cpu_len - le_len;
        if (folded)
                return folded;

        return exact;
}

/*
 * Helper function for ntfs_d_hash.
 *
 * Passes each of the @len units of @name through upcase_unicode_char()
 * and mixes the result into @hash with partial_name_hash(). Returns the
 * updated hash; with @len == 0 the incoming @hash is returned unchanged.
 */
unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase,
                              unsigned long hash)
{
        size_t i;

        for (i = 0; i < len; i++) {
                unsigned int folded = upcase_unicode_char(upcase, name[i]);

                hash = partial_name_hash(folded, hash);
        }

        return hash;
}




































































































































































































    3 




    1 



    1 

















    4 















    4 



































































































































































































































    5 
































































    5 












    5 
















































    7 


    3 
















    4 

    2 



















    5 

    5 















   17 

   14 












































    7 

















    7 

    4 






































   16 

   15 












    5 
    2 




   15 
    2 





   17 





    7 












    7 













    7 










   17 




    5 










    1 






























































































































































    4 







    4 


    5 

    5 
    5 






























    5 




















    5 
    5 









    5 























































    5 

    3 
    5 







    4 

















    4 




















































    4 

    5 
    5 







    4 









    5 
















    4 











































    3 















    4 
    2 














    5 
















   17 
    7 







   17 






   17 







    5 






    4 

































    7 

   16 

    1 























    5 
    5 





    5 

    5 

    1 












    5 










    5 























    4 






    5 




    4 




    1 



    4 

    4 


    4 















































    4 







    5 





































































    6 







    2 





    5 


    5 














































    3 

















    4 
    3 
    3 












































































































































































































































    4 












































































































































































































































    6 
    7 
    1 


    7 



    7 















































































































































































































































































































































































































































































































































   14 

















   15 
   14 


   14 






   13 
   14 








   15 
   15 


   15 

   14 


































































































































































































































































































































































































































































































































































































































































































    1 
    1 






    1 











    1 











    1 











    1 






























    4 




    5 

















    6 

    7 
    7 



















    6 




    7 


    7 

    7 


    1 

































































    4 































    1 



    1 

    1 

    1 
    1 


    1 












































































































































































    3 



    4 




    3 
    1 




    4 



    4 






    3 
    4 





    2 






    4 


















    4 


    4 











































































    3 
















    3 



    1 






    4 





    3 



    4 
    4 




































    3 






    2 
    2 

























    2 
























    7 




    6 


    3 








    6 



    5 


    4 


    4 

















    5 




    4 











    4 

















    2 





















    1 

















    2 

























    2 









    2 




    2 










    2 





































    5 








    5 




































    1 
    1 
    1 




    1 

    1 



    1 






    5 














    4 











    4 



    2 
    3 




    3 







    5 















    1 



    5 
    5 




    5 



    3 

























    5 









    5 















    5 






    4 

    5 



    5 













    4 


    4 


    4 
    4 



    5 







    5 















    4 











    5 



    5 

    4 






    5 
    2 





    5 


    5 
    4 




















    5 



    5 
    5 



    4 




    4 
    4 





















































    2 


    2 





    2 



    2 

    2 
    1 

    2 
    2 


    2 
    2 
    2 


    2 


    1 






    1 









    2 






    2 






    2 


    2 



























    2 



    2 
    2 
    2 
    2 


    2 




    2 





    2 




































    5 





    4 
    7 





    3 




    3 









































































    2 


















































    2 












    2 






    1 





    2 










    2 
    2 
    2 


    2 














    2 









    2 








    2 






    2 
    2 















    2 


    2 

























































































































































































































    4 
























    4 


    4 




























































    3 


    4 






    4 













    3 


    4 


    3 













    4 






    4 


    3 













    4 


    4 



    4 








    3 










    4 




    4 

    4 




















    7 









    6 












    6 

















































































    4 



    4 



    3 




























































    4 



    4 


















    2 






    3 




























    5 





    5 

































































    4 





    3 
















    3 





    4 

























































    5 















































    5 

    4 


    5 
    5 















    3 



















    3 


    5 

























































































































































































































    5 

    5 

































    1 




    1 












    1 
    1 


    1 



















































































































































































    2 








    2 





























































































































































































































































































































































    4 
    4 


    4 





































   12 











   14 


   13 


   13 





    2 





    2 


   11 
    2 


   13 

































































































    5 
    5 











    5 


    4 











    5 























































































































































































































    5 
















    4 



    3 
    5 




    5 

    4 










    4 
    5 




    5 
    5 

    4 




    5 





    5 
    5 







    5 


    5 

    5 
    4 















    3 





    5 
    5 



    5 
    4 

    5 


    4 
    5 










    5 




    5 


    5 
    5 

    5 





    4 






    4 






    5 














    5 


    5 




    5 





    5 





    5 




    5 









    5 





    5 
    4 

    5 





    5 


    5 




















    4 


    5 

    5 




    5 

    5 

    4 


    4 


    5 


    5 








    5 



    4 




    5 


    5 
    5 
    5 
    4 




    5 
    5 
    5 

    5 
    5 

    5 

    5 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
// SPDX-License-Identifier: GPL-2.0+
/*
 * Maple Tree implementation
 * Copyright (c) 2018-2022 Oracle Corporation
 * Authors: Liam R. Howlett <Liam.Howlett@oracle.com>
 *            Matthew Wilcox <willy@infradead.org>
 * Copyright (c) 2023 ByteDance
 * Author: Peng Zhang <zhangpeng.00@bytedance.com>
 */

/*
 * DOC: Interesting implementation details of the Maple Tree
 *
 * Each node type has a number of slots for entries and a number of slots for
 * pivots.  In the case of dense nodes, the pivots are implied by the position
 * and are simply the slot index + the minimum of the node.
 *
 * In regular B-Tree terms, pivots are called keys.  The term pivot is used to
 * indicate that the tree is specifying ranges.  Pivots may appear in the
 * subtree with an entry attached to the value whereas keys are unique to a
 * specific position of a B-tree.  Pivot values are inclusive of the slot with
 * the same index.
 *
 *
 * The following illustrates the layout of a range64 node's slots and pivots.
 *
 *
 *  Slots -> | 0 | 1 | 2 | ... | 12 | 13 | 14 | 15 |
 *           ┬   ┬   ┬   ┬     ┬    ┬    ┬    ┬    ┬
 *           │   │   │   │     │    │    │    │    └─ Implied maximum
 *           │   │   │   │     │    │    │    └─ Pivot 14
 *           │   │   │   │     │    │    └─ Pivot 13
 *           │   │   │   │     │    └─ Pivot 12
 *           │   │   │   │     └─ Pivot 11
 *           │   │   │   └─ Pivot 2
 *           │   │   └─ Pivot 1
 *           │   └─ Pivot 0
 *           └─  Implied minimum
 *
 * Slot contents:
 *  Internal (non-leaf) nodes contain pointers to other nodes.
 *  Leaf nodes contain entries.
 *
 * The location of interest is often referred to as an offset.  All offsets have
 * a slot, but the last offset has an implied pivot from the node above (or
 * ULONG_MAX for the root node).
 *
 * Ranges complicate certain write activities.  When modifying any of
 * the B-tree variants, it is known that one entry will either be added or
 * deleted.  When modifying the Maple Tree, one store operation may overwrite
 * the entire data set, or one half of the tree, or the middle half of the tree.
 *
 */


#include <linux/maple_tree.h>
#include <linux/xarray.h>
#include <linux/types.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/limits.h>
#include <asm/barrier.h>

#define CREATE_TRACE_POINTS
#include <trace/events/maple_tree.h>

/*
 * Low bit of a node's parent field.  mas_tree_parent() sets it on the tree
 * pointer and ma_is_root() tests it to recognise the root node.
 */
#define MA_ROOT_PARENT 1

/*
 * Maple state flags
 * * MA_STATE_BULK                - Bulk insert mode
 * * MA_STATE_REBALANCE                - Indicate a rebalance during bulk insert
 * * MA_STATE_PREALLOC                - Preallocated nodes, WARN_ON allocation
 *
 * Each flag is a distinct single bit, so the flags can be combined with
 * bitwise OR and tested with bitwise AND.
 */
#define MA_STATE_BULK                1
#define MA_STATE_REBALANCE        2
#define MA_STATE_PREALLOC        4

/*
 * Pointer conversion helpers.  Every macro argument is parenthesized so the
 * macros expand correctly when handed an arbitrary expression (for example
 * "&mas" or "cond ? a : b"), not just a plain identifier.
 */

/* View @x as a parent-node pointer (struct maple_pnode *). */
#define ma_parent_ptr(x) ((struct maple_pnode *)(x))
/*
 * Parent value for the root node: the tree pointer of maple state @x, as an
 * unsigned long, with the MA_ROOT_PARENT bit set.
 */
#define mas_tree_parent(x) ((unsigned long)((x)->tree) | MA_ROOT_PARENT)
/* View @x as a bare maple node pointer (struct maple_node *). */
#define ma_mnode_ptr(x) ((struct maple_node *)(x))
/* View @x as an encoded maple node pointer (struct maple_enode *). */
#define ma_enode_ptr(x) ((struct maple_enode *)(x))
/* Slab cache that the mt_alloc_*() and mt_free_*() helpers operate on. */
static struct kmem_cache *maple_node_cache;

#ifdef CONFIG_DEBUG_MAPLE_TREE
/*
 * Debug-only table indexed by enum maple_type.  mt_node_max() looks up the
 * entry that matches the type of an encoded node.
 * NOTE(review): the values look like the largest value each node type can
 * cover - confirm against the debug users of mt_node_max().
 */
static const unsigned long mt_max[] = {
        [maple_dense]                = MAPLE_NODE_SLOTS,
        [maple_leaf_64]                = ULONG_MAX,
        [maple_range_64]        = ULONG_MAX,
        [maple_arange_64]        = ULONG_MAX,
};
#define mt_node_max(x) mt_max[mte_node_type(x)]
#endif

/*
 * Number of slots per node type, indexed by enum maple_type.
 * mt_slot_count() looks up the entry for the type of an encoded node.
 */
static const unsigned char mt_slots[] = {
        [maple_dense]                = MAPLE_NODE_SLOTS,
        [maple_leaf_64]                = MAPLE_RANGE64_SLOTS,
        [maple_range_64]        = MAPLE_RANGE64_SLOTS,
        [maple_arange_64]        = MAPLE_ARANGE64_SLOTS,
};
#define mt_slot_count(x) mt_slots[mte_node_type(x)]

/*
 * Number of pivots per node type, indexed by enum maple_type: one fewer than
 * the slot count for the range node types, and zero for dense nodes (their
 * pivots are implied by position, see the DOC comment at the top of the file).
 * mt_pivot_count() looks up the entry for the type of an encoded node.
 */
static const unsigned char mt_pivots[] = {
        [maple_dense]                = 0,
        [maple_leaf_64]                = MAPLE_RANGE64_SLOTS - 1,
        [maple_range_64]        = MAPLE_RANGE64_SLOTS - 1,
        [maple_arange_64]        = MAPLE_ARANGE64_SLOTS - 1,
};
#define mt_pivot_count(x) mt_pivots[mte_node_type(x)]

/*
 * Per-type minimum slot count, indexed by enum maple_type.  Each value is
 * derived from half of the type's slot count.
 * NOTE(review): presumably the occupancy below which a node is considered
 * underfull - confirm against the users of mt_min_slot_count().
 */
static const unsigned char mt_min_slots[] = {
        [maple_dense]                = MAPLE_NODE_SLOTS / 2,
        [maple_leaf_64]                = (MAPLE_RANGE64_SLOTS / 2) - 2,
        [maple_range_64]        = (MAPLE_RANGE64_SLOTS / 2) - 2,
        [maple_arange_64]        = (MAPLE_ARANGE64_SLOTS / 2) - 1,
};
#define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)]

/*
 * Array sizes used by struct maple_big_node: slightly more than twice the
 * slot count of a range64 node, and slightly more than twice the slot count
 * of an arange64 node for the gap array.
 */
#define MAPLE_BIG_NODE_SLOTS        (MAPLE_RANGE64_SLOTS * 2 + 2)
#define MAPLE_BIG_NODE_GAPS        (MAPLE_ARANGE64_SLOTS * 2 + 1)

/*
 * struct maple_big_node - node-shaped buffer larger than a regular node.
 *
 * Holds MAPLE_BIG_NODE_SLOTS slots and one fewer pivots.  The slot array
 * shares its storage with a padding/gap pair through the anonymous union, so
 * gap[] lives in the part of that storage that follows the first
 * MAPLE_BIG_NODE_GAPS words.
 * NOTE(review): presumably used as scratch space while splitting or
 * rebalancing - confirm against the users of this struct.
 */
struct maple_big_node {
        struct maple_pnode *parent;
        unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1];
        union {
                struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS];
                struct {
                        /* Skips the leading part of the shared storage */
                        unsigned long padding[MAPLE_BIG_NODE_GAPS];
                        unsigned long gap[MAPLE_BIG_NODE_GAPS];
                };
        };
        unsigned char b_end;        /* NOTE(review): looks like the end offset of the used data - confirm */
        enum maple_type type;        /* Node type the contents are laid out as */
};

/*
 * The maple_subtree_state is used to build a tree to replace a segment of an
 * existing tree in a more atomic way.  Any walkers of the older tree will hit a
 * dead node and restart on updates.
 *
 * It only holds pointers: maple states for the original and the new sides of
 * the subtree, two topiary lists, and a big node.
 */
struct maple_subtree_state {
        struct ma_state *orig_l;        /* Original left side of subtree */
        struct ma_state *orig_r;        /* Original right side of subtree */
        struct ma_state *l;                /* New left side of subtree */
        struct ma_state *m;                /* New middle of subtree (rare) */
        struct ma_state *r;                /* New right side of subtree */
        struct ma_topiary *free;        /* nodes to be freed */
        struct ma_topiary *destroy;        /* Nodes to be destroyed (walked and freed) */
        struct maple_big_node *bn;        /* Big node; NOTE(review): presumably the data being redistributed - confirm */
};

/*
 * noinline_for_kasan - inlining annotation that depends on CONFIG_KASAN_STACK.
 * With KASAN stack instrumentation it expands to noinline_for_stack,
 * otherwise to plain inline.
 */
#ifdef CONFIG_KASAN_STACK
/* Prevent mas_wr_bnode() from exceeding the stack frame limit */
#define noinline_for_kasan noinline_for_stack
#else
#define noinline_for_kasan inline
#endif

/* Functions */
/*
 * mt_alloc_one() - Allocate a single maple node.
 * @gfp: The allocation flags handed to the slab allocator
 *
 * Return: whatever kmem_cache_alloc() returns for maple_node_cache.
 */
static inline struct maple_node *mt_alloc_one(gfp_t gfp)
{
        struct maple_node *node;

        node = kmem_cache_alloc(maple_node_cache, gfp);

        return node;
}

/*
 * mt_alloc_bulk() - Allocate several maple nodes with one call.
 * @gfp: The allocation flags handed to the slab allocator
 * @size: The number of nodes requested
 * @nodes: Array that receives the allocated pointers
 *
 * Return: the result of kmem_cache_alloc_bulk() on maple_node_cache.
 */
static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes)
{
        int ret;

        ret = kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes);

        return ret;
}

/*
 * mt_free_one() - Return a single maple node to the node cache.
 * @node: The node to free
 *
 * The node is freed immediately, without any RCU deferral.
 */
static inline void mt_free_one(struct maple_node *node)
{
        struct kmem_cache *cache = maple_node_cache;

        kmem_cache_free(cache, node);
}

/*
 * mt_free_bulk() - Return several maple nodes to the node cache with one call.
 * @size: The number of pointers in @nodes
 * @nodes: Array of node pointers to free
 *
 * The __rcu annotation is cast away before the array is handed to the slab
 * allocator.
 */
static inline void mt_free_bulk(size_t size, void __rcu **nodes)
{
        void **objs = (void **)nodes;

        kmem_cache_free_bulk(maple_node_cache, size, objs);
}

static void mt_free_rcu(struct rcu_head *head)
{
        struct maple_node *node = container_of(head, struct maple_node, rcu);

        kmem_cache_free(maple_node_cache, node);
}

/*
 * ma_free_rcu() - Free a maple node through an RCU callback.
 * @node: The node to free
 *
 * A node whose parent pointer points back at the node itself is marked as no
 * longer in use.  Warn if @node has not been marked that way, then queue it
 * for freeing by mt_free_rcu().
 */
static void ma_free_rcu(struct maple_node *node)
{
        struct maple_pnode *self = ma_parent_ptr(node);

        WARN_ON(node->parent != self);
        call_rcu(&node->rcu, mt_free_rcu);
}

static void mas_set_height(struct ma_state *mas)
{
        unsigned int new_flags = mas->tree->ma_flags;

        new_flags &= ~MT_FLAGS_HEIGHT_MASK;
        MAS_BUG_ON(mas, mas->depth > MAPLE_HEIGHT_MAX);
        new_flags |= mas->depth << MT_FLAGS_HEIGHT_OFFSET;
        mas->tree->ma_flags = new_flags;
}

/*
 * mas_mt_height() - Get the height of the tree a maple state operates on.
 * @mas: The maple state
 *
 * Return: mt_height() of mas->tree.
 */
static unsigned int mas_mt_height(struct ma_state *mas)
{
        struct maple_tree *mt = mas->tree;

        return mt_height(mt);
}

/*
 * mt_attr() - Get the tree flags without the height.
 * @mt: The maple tree
 *
 * Return: mt->ma_flags with the MT_FLAGS_HEIGHT_MASK bits cleared.
 */
static inline unsigned int mt_attr(struct maple_tree *mt)
{
        unsigned int attrs = mt->ma_flags;

        attrs &= ~MT_FLAGS_HEIGHT_MASK;

        return attrs;
}

/*
 * mte_node_type() - Extract the node type from an encoded node.
 * @entry: The maple encoded node
 *
 * The type is kept in the pointer bits selected by MAPLE_NODE_TYPE_SHIFT and
 * MAPLE_NODE_TYPE_MASK.
 *
 * Return: the maple_type stored in @entry.
 */
static __always_inline enum maple_type mte_node_type(
                const struct maple_enode *entry)
{
        unsigned long bits = (unsigned long)entry;

        bits >>= MAPLE_NODE_TYPE_SHIFT;

        return bits & MAPLE_NODE_TYPE_MASK;
}

/*
 * ma_is_dense() - Check if a node type is a dense type.
 * @type: The maple node type
 *
 * Return: true if @type sorts before maple_leaf_64 in enum maple_type.
 */
static __always_inline bool ma_is_dense(const enum maple_type type)
{
        return !(type >= maple_leaf_64);
}

/*
 * ma_is_leaf() - Check if a node type is a leaf type.
 * @type: The maple node type
 *
 * Return: true if @type sorts before maple_range_64 in enum maple_type.
 */
static __always_inline bool ma_is_leaf(const enum maple_type type)
{
        return !(type >= maple_range_64);
}

static __always_inline bool mte_is_leaf(const struct maple_enode *entry)
{
        return ma_is_leaf(mte_node_type(entry));
}

/*
 * mt_is_reserved() - Check if an entry value is reserved.
 * @entry: The entry to check
 *
 * We also reserve values with the bottom two bits set to '10' which are
 * below 4096.
 *
 * Return: true if @entry is below MAPLE_RESERVED_RANGE and xa_is_internal()
 * reports it as internal.
 */
static __always_inline bool mt_is_reserved(const void *entry)
{
        const unsigned long value = (unsigned long)entry;

        return value < MAPLE_RESERVED_RANGE && xa_is_internal(entry);
}

/*
 * mas_set_err() - Put a maple state into the error state.
 * @mas: The maple state
 * @err: The error code to record
 *
 * Sets the status to ma_error and stores @err, encoded with MA_ERROR(), in
 * the node field.
 */
static __always_inline void mas_set_err(struct ma_state *mas, long err)
{
        mas->status = ma_error;
        mas->node = MA_ERROR(err);
}

/*
 * mas_is_ptr() - Check if the maple state status is ma_root.
 * @mas: The maple state
 *
 * Return: true if mas->status is ma_root, false otherwise.
 */
static __always_inline bool mas_is_ptr(const struct ma_state *mas)
{
        const bool is_root_status = (ma_root == mas->status);

        return is_root_status;
}

static __always_inline bool mas_is_start(const struct ma_state *mas)
{
        return mas->status == ma_start;
}

static __always_inline bool mas_is_none(const struct ma_state *mas)
{
        return mas->status == ma_none;
}

/*
 * mas_is_paused() - Check if the maple state status is ma_pause.
 * @mas: The maple state
 *
 * Return: true if mas->status is ma_pause, false otherwise.
 */
static __always_inline bool mas_is_paused(const struct ma_state *mas)
{
        const bool paused = (ma_pause == mas->status);

        return paused;
}

static __always_inline bool mas_is_overflow(struct ma_state *mas)
{
        return mas->status == ma_overflow;
}

static inline bool mas_is_underflow(struct ma_state *mas)
{
        return mas->status == ma_underflow;
}

/*
 * mte_to_node() - Convert a maple encoded node to a maple node.
 * @entry: The maple encoded node
 *
 * Clears the MAPLE_NODE_MASK bits of the encoded pointer.
 *
 * Return: the maple node (not encoded - bare pointer).
 */
static __always_inline struct maple_node *mte_to_node(
                const struct maple_enode *entry)
{
        unsigned long addr = (unsigned long)entry;

        addr &= ~MAPLE_NODE_MASK;

        return (struct maple_node *)addr;
}

/*
 * mte_to_mat() - Convert a maple encoded node to a maple topiary node.
 * @entry: The maple encoded node
 *
 * Clears the MAPLE_NODE_MASK bits of the encoded pointer and views the
 * result as a topiary node.
 *
 * Return: a maple topiary pointer
 */
static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry)
{
        unsigned long addr = (unsigned long)entry;

        addr &= ~MAPLE_NODE_MASK;

        return (struct maple_topiary *)addr;
}

/*
 * mas_mn() - Get the maple node of the maple state.
 * @mas: The maple state
 *
 * Return: @mas->node decoded by mte_to_node() (bare pointer, not encoded).
 */
static inline struct maple_node *mas_mn(const struct ma_state *mas)
{
        const struct maple_enode *enode = mas->node;

        return mte_to_node(enode);
}

/*
 * mte_set_node_dead() - Set a maple encoded node as dead.
 * @mn: The maple encoded node.
 */
static inline void mte_set_node_dead(struct maple_enode *mn)
{
        mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn));
        smp_wmb(); /* Needed for RCU */
}

/*
 * Low bits of an encoded node pointer (struct maple_enode *), as used by
 * mt_mk_node() and the mte_*_root() / mte_*_full() / mte_has_null() helpers.
 */
/* Bit 1 indicates the root is a node; set by mte_mk_root() */
#define MAPLE_ROOT_NODE                        0x02
/* maple_type is stored starting at bit 3 (bits 3-6) */
#define MAPLE_ENODE_TYPE_SHIFT                0x03
/* Bit 2 means a NULL somewhere below; cleared by mte_set_full() */
#define MAPLE_ENODE_NULL                0x04

/*
 * mt_mk_node() - Encode a maple node pointer together with its type.
 * @node: The bare maple node
 * @type: The maple node type
 *
 * Return: @node with @type shifted into the type bits and MAPLE_ENODE_NULL
 * set.
 */
static inline struct maple_enode *mt_mk_node(const struct maple_node *node,
                                             enum maple_type type)
{
        unsigned long encoded = (unsigned long)node;

        encoded |= (type << MAPLE_ENODE_TYPE_SHIFT);
        encoded |= MAPLE_ENODE_NULL;
        return (void *)encoded;
}

/*
 * mte_mk_root() - Tag an encoded node as the tree root.
 * @node: The encoded maple node
 *
 * Return: @node with MAPLE_ROOT_NODE set.
 */
static inline void *mte_mk_root(const struct maple_enode *node)
{
        unsigned long encoded = (unsigned long)node;

        encoded |= MAPLE_ROOT_NODE;
        return (void *)encoded;
}

/*
 * mte_safe_root() - Strip the root tag from an encoded node.
 * @node: The encoded maple node
 *
 * Return: @node with MAPLE_ROOT_NODE cleared.
 */
static inline void *mte_safe_root(const struct maple_enode *node)
{
        unsigned long encoded = (unsigned long)node;

        encoded &= ~MAPLE_ROOT_NODE;
        return (void *)encoded;
}

/*
 * mte_set_full() - Clear the "NULL somewhere below" bit of an encoded node.
 * @node: The encoded maple node
 *
 * Return: @node with MAPLE_ENODE_NULL cleared.
 */
static inline void *mte_set_full(const struct maple_enode *node)
{
        unsigned long encoded = (unsigned long)node;

        encoded &= ~MAPLE_ENODE_NULL;
        return (void *)encoded;
}

/*
 * mte_clear_full() - Set the "NULL somewhere below" bit of an encoded node.
 * @node: The encoded maple node
 *
 * Return: @node with MAPLE_ENODE_NULL set.
 */
static inline void *mte_clear_full(const struct maple_enode *node)
{
        unsigned long encoded = (unsigned long)node;

        encoded |= MAPLE_ENODE_NULL;
        return (void *)encoded;
}

/*
 * mte_has_null() - Test the "NULL somewhere below" bit of an encoded node.
 * @node: The encoded maple node
 *
 * Return: true if MAPLE_ENODE_NULL is set in @node.
 */
static inline bool mte_has_null(const struct maple_enode *node)
{
        const unsigned long null_bit = (unsigned long)node & MAPLE_ENODE_NULL;

        return null_bit != 0;
}

/*
 * ma_is_root() - Check if a maple node is the root node.
 * @node: The maple node
 *
 * Return: true if MA_ROOT_PARENT is set in @node->parent.
 */
static __always_inline bool ma_is_root(struct maple_node *node)
{
        const unsigned long parent = (unsigned long)node->parent;

        return (parent & MA_ROOT_PARENT) != 0;
}

/*
 * mte_is_root() - Check if an encoded maple node is the root node.
 * @node: The encoded maple node
 *
 * Return: The result of ma_is_root() on the decoded node.
 */
static __always_inline bool mte_is_root(const struct maple_enode *node)
{
        struct maple_node *bare = mte_to_node(node);

        return ma_is_root(bare);
}

static inline bool mas_is_root_limits(const struct ma_state *mas)
{
        return !mas->min && mas->max == ULONG_MAX;
}

/*
 * mt_is_alloc() - Check if the tree has the allocation-range flag set.
 * @mt: The maple tree
 *
 * Return: true if MT_FLAGS_ALLOC_RANGE is set in @mt->ma_flags.
 */
static __always_inline bool mt_is_alloc(struct maple_tree *mt)
{
        return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE) != 0;
}

/*
 * The Parent Pointer
 * Excluding root, the parent pointer is 256B aligned like all other tree nodes.
 * When storing a 32 or 64 bit values, the offset can fit into 5 bits.  The 16
 * bit values need an extra bit to store the offset.  This extra bit comes from
 * a reuse of the last bit in the node type.  This is possible by using bit 1 to
 * indicate if bit 2 is part of the type or the slot.
 *
 * Node types (low bits of the parent pointer, in binary):
 *  0b??1 = Root
 *  0b?00 = 16 bit nodes
 *  0b010 = 32 bit nodes
 *  0b110 = 64 bit nodes
 *
 * Slot size and alignment
 *  0b??1 : Root
 *  0b?00 : 16 bit values, type in 0-1, slot in 2-7
 *  0b010 : 32 bit values, type in 0-2, slot in 3-7
 *  0b110 : 64 bit values, type in 0-2, slot in 3-7
 */

/* Bit 0 of the parent pointer: tested by mas_parent_type() to detect root */
#define MAPLE_PARENT_ROOT                0x01

/* Slot position and mask returned when bit 1 of the parent pointer is set */
#define MAPLE_PARENT_SLOT_SHIFT                0x03
#define MAPLE_PARENT_SLOT_MASK                0xF8

/* Slot position and mask returned when bit 1 is clear (16B parent) */
#define MAPLE_PARENT_16B_SLOT_SHIFT        0x02
#define MAPLE_PARENT_16B_SLOT_MASK        0xFC

/*
 * Parent type bits.  Bit 1 (MAPLE_PARENT_NOT_RANGE16) selects between the two
 * slot layouts above; see mte_parent_shift() and mte_parent_slot_mask().
 * NOTE(review): MAPLE_PARENT_RANGE32 (0x04) has bit 1 clear, which the helpers
 * treat as the 16B case - confirm against the layout comment above.
 */
#define MAPLE_PARENT_RANGE64                0x06
#define MAPLE_PARENT_RANGE32                0x04
#define MAPLE_PARENT_NOT_RANGE16        0x02

/*
 * mte_parent_shift() - Get the parent shift for the slot storage.
 * @parent: The parent pointer cast as an unsigned long
 *
 * Return: The shift into that pointer to the start of the slot.
 */
static inline unsigned long mte_parent_shift(unsigned long parent)
{
        /* Bit 1 clear means a 16B parent, which uses the smaller shift */
        return likely(parent & MAPLE_PARENT_NOT_RANGE16) ?
                MAPLE_PARENT_SLOT_SHIFT : MAPLE_PARENT_16B_SLOT_SHIFT;
}

/*
 * mte_parent_slot_mask() - Get the slot mask for the parent.
 * @parent: The parent pointer cast as an unsigned long.
 *
 * Return: The slot mask for that parent.
 */
static inline unsigned long mte_parent_slot_mask(unsigned long parent)
{
        /* Bit 1 clear means a 16B parent, which uses the wider mask */
        return likely(parent & MAPLE_PARENT_NOT_RANGE16) ?
                MAPLE_PARENT_SLOT_MASK : MAPLE_PARENT_16B_SLOT_MASK;
}

/*
 * mas_parent_type() - Return the maple_type of the parent from the stored
 * parent type.
 * @mas: The maple state
 * @enode: The maple_enode to extract the parent's enum
 *
 * Warns and returns 0 if the parent pointer of @enode is tagged as root.
 *
 * Return: maple_arange_64 or maple_range_64 (depending on mt_is_alloc()) when
 * the stored type bits are MAPLE_PARENT_RANGE64, 0 otherwise.
 */
static inline
enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode)
{
        unsigned long bits = (unsigned long)mte_to_node(enode)->parent;

        if (WARN_ON(bits & MAPLE_PARENT_ROOT))
                return 0;

        /* Keep only the metadata bits, then drop the slot bits */
        bits &= MAPLE_NODE_MASK;
        bits &= ~mte_parent_slot_mask(bits);

        /* MAPLE_PARENT_RANGE64 also covers MAPLE_PARENT_ARANGE64 */
        if (bits != MAPLE_PARENT_RANGE64)
                return 0;

        return mt_is_alloc(mas->tree) ? maple_arange_64 : maple_range_64;
}

/*
 * mas_set_parent() - Set the parent node and encode the slot
 * @mas: The maple state (used for the MAS_BUG_ON() checks)
 * @enode: The encoded maple node.
 * @parent: The encoded maple node that is the parent of @enode.
 * @slot: The slot that @enode resides in @parent.
 *
 * Slot number is encoded in the enode->parent bit 3-6 or 2-6, depending on the
 * parent type.
 */
static inline
void mas_set_parent(struct ma_state *mas, struct maple_enode *enode,
                    const struct maple_enode *parent, unsigned char slot)
{
        enum maple_type p_type = mte_node_type(parent);
        unsigned long val = (unsigned long)parent;
        unsigned long shift = 0;
        unsigned long type = 0;

        MAS_BUG_ON(mas, p_type == maple_dense);
        MAS_BUG_ON(mas, p_type == maple_leaf_64);

        /* Only range and allocation-range parents carry slot/type bits */
        if (p_type == maple_range_64 || p_type == maple_arange_64) {
                shift = MAPLE_PARENT_SLOT_SHIFT;
                type = MAPLE_PARENT_RANGE64;
        }

        val &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */
        val |= (slot << shift) | type;
        mte_to_node(enode)->parent = ma_parent_ptr(val);
}

/*
 * mte_parent_slot() - get the parent slot of @enode.
 * @enode: The encoded maple node.
 *
 * Return: The slot in the parent node where @enode resides, or 0 when the
 * parent pointer is tagged with MA_ROOT_PARENT.
 */
static __always_inline
unsigned int mte_parent_slot(const struct maple_enode *enode)
{
        const unsigned long parent = (unsigned long)mte_to_node(enode)->parent;
        unsigned long slot_bits;

        if (unlikely(parent & MA_ROOT_PARENT))
                return 0;

        /*
         * Okay to use MAPLE_PARENT_16B_SLOT_MASK as the last bit will be lost
         * by shift if the parent shift is MAPLE_PARENT_SLOT_SHIFT
         */
        slot_bits = parent & MAPLE_PARENT_16B_SLOT_MASK;
        return slot_bits >> mte_parent_shift(parent);
}

/*
 * mte_parent() - Get the parent of @enode.
 * @enode: The encoded maple node.
 *
 * Return: The parent maple node (parent pointer with the MAPLE_NODE_MASK bits
 * cleared).
 */
static __always_inline
struct maple_node *mte_parent(const struct maple_enode *enode)
{
        unsigned long parent = (unsigned long)(mte_to_node(enode)->parent);

        parent &= ~MAPLE_NODE_MASK;
        return (void *)parent;
}

/*
 * ma_dead_node() - check if the @node is dead.
 * @node: The maple node
 *
 * A node is dead when its parent pointer, with the metadata bits masked off,
 * refers to the node itself.
 *
 * Return: true if dead, false otherwise.
 */
static __always_inline bool ma_dead_node(const struct maple_node *node)
{
        unsigned long parent;

        /* Do not reorder reads from the node prior to the parent check */
        smp_rmb();
        parent = (unsigned long)node->parent & ~MAPLE_NODE_MASK;
        return parent == (unsigned long)node;
}

/*
 * mte_dead_node() - check if the @enode is dead.
 * @enode: The encoded maple node
 *
 * A node is dead when mte_parent() of @enode is the decoded node itself.
 *
 * Return: true if dead, false otherwise.
 */
static __always_inline bool mte_dead_node(const struct maple_enode *enode)
{
        const struct maple_node *self = mte_to_node(enode);

        /* Do not reorder reads from the node prior to the parent check */
        smp_rmb();
        return mte_parent(enode) == self;
}

/*
 * mas_allocated() - Get the number of nodes allocated in a maple state.
 * @mas: The maple state
 *
 * The ma_state alloc member is overloaded to hold a pointer to the first
 * allocated node or to the number of requested nodes to allocate.  If bit 0 is
 * set, then the alloc contains the number of requested nodes.  If there is an
 * allocated node, then the total allocated nodes is in that node.
 *
 * Return: The total number of nodes allocated
 */
static inline unsigned long mas_allocated(const struct ma_state *mas)
{
        const unsigned long raw = (unsigned long)mas->alloc;

        /* NULL, or an encoded request count rather than a node pointer */
        if (!raw || (raw & 0x1))
                return 0;

        return mas->alloc->total;
}

/*
 * mas_set_alloc_req() - Set the requested number of allocations.
 * @mas: the maple state
 * @count: the number of allocations.
 *
 * The requested number of allocations is either in the first allocated node,
 * located in @mas->alloc->request_count, or directly in @mas->alloc if there is
 * no allocated node.  Set the request either in the node or do the necessary
 * encoding to store in @mas->alloc directly.
 */
static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count)
{
        if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) {
                if (!count)
                        mas->alloc = NULL;
                else
                        mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U);
                return;
        }

        mas->alloc->request_count = count;
}

/*
 * mas_alloc_req() - get the requested number of allocations.
 * @mas: The maple state
 *
 * The alloc count is either stored directly in @mas, or in
 * @mas->alloc->request_count if there is at least one node allocated.  Decode
 * the request count if it's stored directly in @mas->alloc.
 *
 * Return: The allocation request count.
 */
static inline unsigned int mas_alloc_req(const struct ma_state *mas)
{
        if (!mas->alloc)
                return 0;

        /* Bit 0 set: the count is encoded in the pointer itself */
        if ((unsigned long)mas->alloc & 0x1)
                return (unsigned long)(mas->alloc) >> 1;

        return mas->alloc->request_count;
}

/*
 * ma_pivots() - Get a pointer to the maple node pivots.
 * @node: the maple node
 * @type: the node type
 *
 * In the event of a dead node, this array may be %NULL
 *
 * Return: A pointer to the maple node pivots, or %NULL for types without
 * pivots (such as maple_dense).
 */
static inline unsigned long *ma_pivots(struct maple_node *node,
                                           enum maple_type type)
{
        if (type == maple_arange_64)
                return node->ma64.pivot;

        if (type == maple_range_64 || type == maple_leaf_64)
                return node->mr64.pivot;

        return NULL;
}

/*
 * ma_gaps() - Get a pointer to the maple node gaps.
 * @node: the maple node
 * @type: the node type
 *
 * Return: A pointer to the maple node gaps for maple_arange_64 nodes, %NULL
 * for every other type.
 */
static inline unsigned long *ma_gaps(struct maple_node *node,
                                     enum maple_type type)
{
        return type == maple_arange_64 ? node->ma64.gap : NULL;
}

/*
 * mas_safe_pivot() - get the pivot at @piv or mas->max.
 * @mas: The maple state
 * @pivots: The pointer to the maple node pivots
 * @piv: The pivot to fetch
 * @type: The maple node type
 *
 * Return: The pivot at @piv within the limit of the @pivots array, @mas->max
 * otherwise.
 */
static __always_inline unsigned long
mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots,
               unsigned char piv, enum maple_type type)
{
        /* mt_pivots[type] bounds the valid pivot indexes for this type */
        return piv < mt_pivots[type] ? pivots[piv] : mas->max;
}

/*
 * mas_safe_min() - Return the minimum for a given offset.
 * @mas: The maple state
 * @pivots: The pointer to the maple node pivots
 * @offset: The offset into the pivot array
 *
 * Return: The minimum range value that is contained in @offset: one past the
 * previous pivot, or @mas->min for offset 0.
 */
static inline unsigned long
mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset)
{
        return likely(offset) ? pivots[offset - 1] + 1 : mas->min;
}

/*
 * mte_set_pivot() - Set a pivot to a value in an encoded maple node.
 * @mn: The encoded maple node
 * @piv: The pivot offset
 * @val: The value of the pivot
 *
 * BUGs if @piv is outside the pivot array for the node type.  Nothing is
 * stored for maple_dense nodes.
 */
static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv,
                                unsigned long val)
{
        enum maple_type type = mte_node_type(mn);
        struct maple_node *node = mte_to_node(mn);

        BUG_ON(piv >= mt_pivots[type]);

        if (type == maple_arange_64)
                node->ma64.pivot[piv] = val;
        else if (type == maple_range_64 || type == maple_leaf_64)
                node->mr64.pivot[piv] = val;
}

/*
 * ma_slots() - Get a pointer to the maple node slots.
 * @mn: The maple node
 * @mt: The maple node type
 *
 * Return: A pointer to the maple node slots, or %NULL for an unknown type.
 */
static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt)
{
        if (mt == maple_arange_64)
                return mn->ma64.slot;

        if (mt == maple_range_64 || mt == maple_leaf_64)
                return mn->mr64.slot;

        if (mt == maple_dense)
                return mn->slot;

        return NULL;
}

/*
 * mt_write_locked() - Check if the tree's write lock is held.
 * @mt: The maple tree
 *
 * Return: mt_write_lock_is_held() for trees using an external lock, otherwise
 * whether @mt->ma_lock is held according to lockdep.
 */
static inline bool mt_write_locked(const struct maple_tree *mt)
{
        if (mt_external_lock(mt))
                return mt_write_lock_is_held(mt);

        return lockdep_is_held(&mt->ma_lock);
}

/*
 * mt_locked() - Check if the tree's lock is held.
 * @mt: The maple tree
 *
 * Return: mt_lock_is_held() for trees using an external lock, otherwise
 * whether @mt->ma_lock is held according to lockdep.
 */
static __always_inline bool mt_locked(const struct maple_tree *mt)
{
        if (mt_external_lock(mt))
                return mt_lock_is_held(mt);

        return lockdep_is_held(&mt->ma_lock);
}

/*
 * mt_slot() - Get the slot value, checking RCU or lock protection.
 * @mt: The maple tree
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Return: The entry stored in @slots at the @offset.
 */
static __always_inline void *mt_slot(const struct maple_tree *mt,
                void __rcu **slots, unsigned char offset)
{
        void __rcu **slot = &slots[offset];

        return rcu_dereference_check(*slot, mt_locked(mt));
}

/*
 * mt_slot_locked() - Get the slot value when holding the maple tree lock.
 * @mt: The maple tree
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Return: The entry stored in @slots at the @offset.
 */
static __always_inline void *mt_slot_locked(struct maple_tree *mt,
                void __rcu **slots, unsigned char offset)
{
        void __rcu **slot = &slots[offset];

        return rcu_dereference_protected(*slot, mt_write_locked(mt));
}
/*
 * mas_slot_locked() - Get the slot value when holding the maple tree lock.
 * @mas: The maple state
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Thin wrapper around mt_slot_locked() using the tree of @mas.
 *
 * Return: The entry stored in @slots at the @offset.
 */
static __always_inline void *mas_slot_locked(struct ma_state *mas,
                void __rcu **slots, unsigned char offset)
{
        struct maple_tree *tree = mas->tree;

        return mt_slot_locked(tree, slots, offset);
}

/*
 * mas_slot() - Get the slot value when not holding the maple tree lock.
 * @mas: The maple state
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Thin wrapper around mt_slot() using the tree of @mas.
 *
 * Return: The entry stored in @slots at the @offset
 */
static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots,
                unsigned char offset)
{
        const struct maple_tree *tree = mas->tree;

        return mt_slot(tree, slots, offset);
}

/*
 * mas_root() - Get the maple tree root.
 * @mas: The maple state.
 *
 * Return: The pointer to the root of the tree
 */
static __always_inline void *mas_root(struct ma_state *mas)
{
        return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree));
}

/*
 * mt_root_locked() - Get the maple tree root when holding the write lock.
 * @mt: The maple tree
 *
 * Return: The pointer to the root of the tree
 */
static inline void *mt_root_locked(struct maple_tree *mt)
{
        void *root;

        root = rcu_dereference_protected(mt->ma_root, mt_write_locked(mt));
        return root;
}

/*
 * mas_root_locked() - Get the maple tree root when holding the maple tree lock.
 * @mas: The maple state.
 *
 * Thin wrapper around mt_root_locked() using the tree of @mas.
 *
 * Return: The pointer to the root of the tree
 */
static inline void *mas_root_locked(struct ma_state *mas)
{
        struct maple_tree *tree = mas->tree;

        return mt_root_locked(tree);
}

/*
 * ma_meta() - Get a pointer to the metadata of a node.
 * @mn: The maple node
 * @mt: The maple node type
 *
 * Return: The ma64 metadata for maple_arange_64 nodes, the mr64 metadata for
 * every other type.
 */
static inline struct maple_metadata *ma_meta(struct maple_node *mn,
                                             enum maple_type mt)
{
        if (mt == maple_arange_64)
                return &mn->ma64.meta;

        return &mn->mr64.meta;
}

/*
 * ma_set_meta() - Set the metadata information of a node.
 * @mn: The maple node
 * @mt: The maple node type
 * @offset: The offset of the highest sub-gap in this node.
 * @end: The end of the data in this node.
 *
 * Stores @offset in the metadata gap field and @end in the metadata end field.
 */
static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt,
                               unsigned char offset, unsigned char end)
{
        struct maple_metadata *md = ma_meta(mn, mt);

        md->gap = offset;
        md->end = end;
}

/*
 * mt_clear_meta() - clear the metadata information of a node, if it exists
 * @mt: The maple tree
 * @mn: The maple node
 * @type: The maple node type
 *
 * Zeroes the gap and end metadata fields for maple_range_64 and
 * maple_arange_64 nodes.  Other node types are left untouched.  A
 * maple_range_64 node whose last slot appears to hold a node is also left
 * untouched, as that position then holds no metadata.
 */
static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn,
                                  enum maple_type type)
{
        struct maple_metadata *meta;
        unsigned long *pivots;
        void __rcu **slots;
        void *next;

        switch (type) {
        case maple_range_64:
                pivots = mn->mr64.pivot;
                /* A non-zero last pivot means the last slot may be in use */
                if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) {
                        slots = mn->mr64.slot;
                        next = mt_slot_locked(mt, slots,
                                              MAPLE_RANGE64_SLOTS - 1);
                        /* Non-NULL node pointer with a non-zero type */
                        if (unlikely((mte_to_node(next) &&
                                      mte_node_type(next))))
                                return; /* no metadata, could be node */
                }
                fallthrough;
        case maple_arange_64:
                meta = ma_meta(mn, type);
                break;
        default:
                /* No metadata to clear for other node types */
                return;
        }

        meta->gap = 0;
        meta->end = 0;
}

/*
 * ma_meta_end() - Get the data end of a node from the metadata
 * @mn: The maple node
 * @mt: The maple node type
 *
 * Return: The end field of the node metadata.
 */
static inline unsigned char ma_meta_end(struct maple_node *mn,
                                        enum maple_type mt)
{
        return ma_meta(mn, mt)->end;
}

/*
 * ma_meta_gap() - Get the largest gap location of a node from the metadata
 * @mn: The maple node
 */
static inline unsigned char ma_meta_gap(struct maple_node *mn)
{
        return mn->ma64.meta.gap;
}

/*
 * ma_set_meta_gap() - Set the largest gap location in a node's metadata
 * @mn: The maple node
 * @mt: The maple node type
 * @offset: The location of the largest gap.
 */
static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt,
                                   unsigned char offset)
{
        ma_meta(mn, mt)->gap = offset;
}

/*
 * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes.
 * @mat: the ma_topiary, a linked list of dead nodes.
 * @dead_enode: the node to be marked as dead and added to the tail of the list
 *
 * Marks @dead_enode dead, terminates its next link and appends it to the
 * linked list in @mat.
 */
static inline void mat_add(struct ma_topiary *mat,
                           struct maple_enode *dead_enode)
{
        mte_set_node_dead(dead_enode);
        mte_to_mat(dead_enode)->next = NULL;

        /* Empty list: the node becomes the head; otherwise link after tail */
        if (mat->tail)
                mte_to_mat(mat->tail)->next = dead_enode;
        else
                mat->head = dead_enode;

        mat->tail = dead_enode;
}

/* Defined later in this file; needed by mas_mat_destroy() below. */
static void mt_free_walk(struct rcu_head *head);
static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
                            bool free);
/*
 * mas_mat_destroy() - Free all nodes and subtrees in a dead list.
 * @mas: the maple state
 * @mat: the ma_topiary linked list of dead nodes to free.
 *
 * Destroy walk a dead list.  Each entry is passed to mt_destroy_walk(); when
 * the tree is in RCU mode the walk is told not to free and the node is handed
 * to call_rcu() with mt_free_walk() instead.  @mat->head is advanced until the
 * list is empty.
 */
static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat)
{
        struct maple_enode *next;
        struct maple_node *node;
        bool in_rcu = mt_in_rcu(mas->tree);

        while (mat->head) {
                /*
                 * Fetch the next link and the bare node before the walk;
                 * presumably because the walk may free the node when its
                 * @free argument is true.
                 */
                next = mte_to_mat(mat->head)->next;
                node = mte_to_node(mat->head);
                mt_destroy_walk(mat->head, mas->tree, !in_rcu);
                if (in_rcu)
                        call_rcu(&node->rcu, mt_free_walk);
                mat->head = next;
        }
}
/*
 * mas_descend() - Descend into the slot stored in the ma_state.
 * @mas - the maple state.
 *
 * Note: Not RCU safe, only use in write side or debug code.
 */
static inline void mas_descend(struct ma_state *mas)
{
        enum maple_type type;
        unsigned long *pivots;
        struct maple_node *node;
        void __rcu **slots;

        node = mas_mn(mas);
        type = mte_node_type(mas->node);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);

        if (mas->offset)
                mas->min = pivots[mas->offset - 1] + 1;
        mas->max = mas_safe_pivot(mas, pivots, mas->offset, type);
        mas->node = mas_slot(mas, slots, mas->offset);
}

/*
 * mte_set_gap() - Set a maple node gap.
 * @mn: The encoded maple node
 * @gap: The offset of the gap to set
 * @val: The gap value
 *
 * Only maple_arange_64 nodes store gaps; other node types are ignored.
 */
static inline void mte_set_gap(const struct maple_enode *mn,
                                 unsigned char gap, unsigned long val)
{
        if (mte_node_type(mn) == maple_arange_64)
                mte_to_node(mn)->ma64.gap[gap] = val;
}

/*
 * mas_ascend() - Walk up a level of the tree.
 * @mas: The maple state
 *
 * Moves @mas->node to its parent and sets @mas->offset to the slot the old
 * node occupied in that parent.  If @mas->node is already the root, only
 * @mas->offset is reset to 0.
 *
 * Sets the @mas->max and @mas->min to the correct values when walking up.  This
 * may cause several levels of walking up to find the correct min and max.
 * May find a dead node which will cause a premature return.
 * Return: 1 on dead node, 0 otherwise
 */
static int mas_ascend(struct ma_state *mas)
{
        struct maple_enode *p_enode; /* parent enode. */
        struct maple_enode *a_enode; /* ancestor enode. */
        struct maple_node *a_node; /* ancestor node. */
        struct maple_node *p_node; /* parent node. */
        unsigned char a_slot;
        enum maple_type a_type;
        unsigned long min, max;
        unsigned long *pivots;
        bool set_max = false, set_min = false;

        a_node = mas_mn(mas);
        if (ma_is_root(a_node)) {
                mas->offset = 0;
                return 0;
        }

        p_node = mte_parent(mas->node);
        /* A node whose parent is itself is dead, see mte_set_node_dead() */
        if (unlikely(a_node == p_node))
                return 1;

        a_type = mas_parent_type(mas, mas->node);
        mas->offset = mte_parent_slot(mas->node);
        a_enode = mt_mk_node(p_node, a_type);

        /* Check to make sure all parent information is still accurate */
        if (p_node != mte_parent(mas->node))
                return 1;

        mas->node = a_enode;

        /* The root always covers the full range */
        if (mte_is_root(a_enode)) {
                mas->max = ULONG_MAX;
                mas->min = 0;
                return 0;
        }

        min = 0;
        max = ULONG_MAX;
        /* The old node was in slot 0: the current minimum is kept */
        if (!mas->offset) {
                min = mas->min;
                set_min = true;
        }

        /* The current maximum is already ULONG_MAX: keep it */
        if (mas->max == ULONG_MAX)
                set_max = true;

        /*
         * Walk up the ancestors, reading the pivots on either side of the slot
         * taken at each level, until both limits are known or the root is
         * reached.
         */
        do {
                p_enode = a_enode;
                a_type = mas_parent_type(mas, p_enode);
                a_node = mte_parent(p_enode);
                a_slot = mte_parent_slot(p_enode);
                a_enode = mt_mk_node(a_node, a_type);
                pivots = ma_pivots(a_node, a_type);

                if (unlikely(ma_dead_node(a_node)))
                        return 1;

                if (!set_min && a_slot) {
                        set_min = true;
                        min = pivots[a_slot - 1] + 1;
                }

                if (!set_max && a_slot < mt_pivots[a_type]) {
                        set_max = true;
                        max = pivots[a_slot];
                }

                /* Re-check after the pivot reads above */
                if (unlikely(ma_dead_node(a_node)))
                        return 1;

                if (unlikely(ma_is_root(a_node)))
                        break;

        } while (!set_min || !set_max);

        mas->max = max;
        mas->min = min;
        return 0;
}

/*
 * mas_pop_node() - Get a previously allocated maple node from the maple state.
 * @mas: The maple state
 *
 * Removes one node from the allocations held in @mas->alloc, zeroes it and
 * returns it.  If an allocation request was pending, the request count is
 * increased by one.  Warns and returns %NULL if no nodes are allocated.
 *
 * Return: A pointer to a maple node.
 */
static inline struct maple_node *mas_pop_node(struct ma_state *mas)
{
        struct maple_alloc *ret, *node = mas->alloc;
        unsigned long total = mas_allocated(mas);
        /* Read the pending request before mas->alloc is modified below */
        unsigned int req = mas_alloc_req(mas);

        /* nothing or a request pending. */
        if (WARN_ON(!total))
                return NULL;

        if (total == 1) {
                /* single allocation in this ma_state */
                mas->alloc = NULL;
                ret = node;
                goto single_node;
        }

        if (node->node_count == 1) {
                /* Single allocation in this node. */
                /* The head itself is returned; slot[0] becomes the new head */
                mas->alloc = node->slot[0];
                mas->alloc->total = node->total - 1;
                ret = node;
                goto new_head;
        }
        /* Otherwise take the last node stored in the head's slot array */
        node->total--;
        ret = node->slot[--node->node_count];
        node->slot[node->node_count] = NULL;

single_node:
new_head:
        /* Re-store the request, since mas->alloc may have changed above */
        if (req) {
                req++;
                mas_set_alloc_req(mas, req);
        }

        /* Hand out a zeroed allocation structure */
        memset(ret, 0, sizeof(*ret));
        return (struct maple_node *)ret;
}

/*
 * mas_push_node() - Push a node back on the maple state allocation.
 * @mas: The maple state
 * @used: The used maple node
 *
 * Stores the maple node back into @mas->alloc for reuse.  Updates allocated and
 * requested node count as necessary.
 */
static inline void mas_push_node(struct ma_state *mas, struct maple_node *used)
{
        struct maple_alloc *reuse = (struct maple_alloc *)used;
        struct maple_alloc *head = mas->alloc;
        unsigned long count;
        unsigned int requested = mas_alloc_req(mas);

        count = mas_allocated(mas);

        reuse->request_count = 0;
        reuse->node_count = 0;
        /* Room left in the head node's slot array: store the node there */
        if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) {
                head->slot[head->node_count++] = reuse;
                head->total++;
                goto done;
        }

        /* Otherwise the pushed node becomes the new head */
        reuse->total = 1;
        /* Chain the old head only if it is a node, not an encoded request */
        if ((head) && !((unsigned long)head & 0x1)) {
                reuse->slot[0] = head;
                reuse->node_count = 1;
                reuse->total += head->total;
        }

        mas->alloc = reuse;
done:
        /*
         * One fewer node is needed now.
         * NOTE(review): a pending request of exactly 1 is not rewritten here;
         * on the slot-array path above it stays at 1 - confirm this is
         * intended.
         */
        if (requested > 1)
                mas_set_alloc_req(mas, requested - 1);
}

/*
 * mas_alloc_nodes() - Allocate nodes into a maple state
 * @mas: The maple state
 * @gfp: The GFP Flags
 *
 * Allocates the number of nodes currently requested in @mas (see
 * mas_alloc_req()) and stores them in @mas->alloc.  Does nothing if no nodes
 * are requested.  On allocation failure the outstanding request count is
 * stored back in @mas and @mas is set to the -ENOMEM error state.
 */
static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
{
        struct maple_alloc *node;
        unsigned long allocated = mas_allocated(mas);
        unsigned int requested = mas_alloc_req(mas);
        unsigned int count;
        void **slots = NULL;
        unsigned int max_req = 0;

        if (!requested)
                return;

        mas_set_alloc_req(mas, 0);
        if (mas->mas_flags & MA_STATE_PREALLOC) {
                if (allocated)
                        return;
                /*
                 * NOTE(review): allocated is always 0 here, so BUG_ON() fires
                 * unconditionally and the WARN_ON() below looks unreachable -
                 * confirm which of the two is intended.
                 */
                BUG_ON(!allocated);
                WARN_ON(!allocated);
        }

        /* No head node yet, or the head's slot array is full: add a new head */
        if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) {
                node = (struct maple_alloc *)mt_alloc_one(gfp);
                if (!node)
                        goto nomem_one;

                if (allocated) {
                        node->slot[0] = mas->alloc;
                        node->node_count = 1;
                } else {
                        node->node_count = 0;
                }

                mas->alloc = node;
                node->total = ++allocated;
                requested--;
        }

        node = mas->alloc;
        node->request_count = 0;
        while (requested) {
                /* Fill the free tail of this node's slot array in bulk */
                max_req = MAPLE_ALLOC_SLOTS - node->node_count;
                slots = (void **)&node->slot[node->node_count];
                max_req = min(requested, max_req);
                count = mt_alloc_bulk(gfp, max_req, slots);
                if (!count)
                        goto nomem_bulk;

                /* slot[0] was just allocated; initialise its counters */
                if (node->node_count == 0) {
                        node->slot[0]->node_count = 0;
                        node->slot[0]->request_count = 0;
                }

                node->node_count += count;
                allocated += count;
                /* Continue filling in the node stored in slot[0] */
                node = node->slot[0];
                requested -= count;
        }
        mas->alloc->total = allocated;
        return;

nomem_bulk:
        /* Clean up potential freed allocations on bulk failure */
        memset(slots, 0, max_req * sizeof(unsigned long));
nomem_one:
        /* Remember how many nodes are still outstanding */
        mas_set_alloc_req(mas, requested);
        if (mas->alloc && !(((unsigned long)mas->alloc & 0x1)))
                mas->alloc->total = allocated;
        mas_set_err(mas, -ENOMEM);
}

/*
 * mas_free() - Free an encoded maple node
 * @mas: The maple state
 * @used: The encoded maple node to free.
 *
 * Uses rcu free if the tree is in RCU mode, pushes @used back on the maple
 * state allocations otherwise.
 */
static inline void mas_free(struct ma_state *mas, struct maple_enode *used)
{
        struct maple_node *node = mte_to_node(used);

        if (!mt_in_rcu(mas->tree)) {
                mas_push_node(mas, node);
                return;
        }

        ma_free_rcu(node);
}

/*
 * mas_node_count_gfp() - Check if enough nodes are allocated and request more
 * if there is not enough nodes.
 * @mas: The maple state
 * @count: The number of nodes needed
 * @gfp: the gfp flags
 *
 * When fewer than @count nodes are allocated, the shortfall is recorded as the
 * allocation request and mas_alloc_nodes() is called.
 */
static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp)
{
        unsigned long have = mas_allocated(mas);

        if (have >= count)
                return;

        mas_set_alloc_req(mas, count - have);
        mas_alloc_nodes(mas, gfp);
}

/*
 * mas_node_count() - Check if enough nodes are allocated and request more if
 * there is not enough nodes.
 * @mas: The maple state
 * @count: The number of nodes needed
 *
 * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags.
 */
static void mas_node_count(struct ma_state *mas, int count)
{
        /* Plain call: a void function must not return an expression */
        mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN);
}

/*
 * mas_start() - Sets up maple state for operations.
 * @mas: The maple state.
 *
 * If mas->status == ma_start, then set the min, max and depth to
 * defaults.
 *
 * Return:
 * - If mas->status is not ma_start (e.g. an error state): NULL, @mas is left
 *   untouched.
 * - If it's an empty tree:     NULL & mas->status == ma_none
 * - If it's a single entry:    The entry (NULL when mas->index > 0) &
 *                              mas->status == ma_root
 * - If it's a tree:            NULL & mas->status == ma_active with
 *                              mas->node set to the safe root node.
 */
static inline struct maple_enode *mas_start(struct ma_state *mas)
{
        if (likely(mas_is_start(mas))) {
                struct maple_enode *root;

                mas->min = 0;
                mas->max = ULONG_MAX;

retry:
                mas->depth = 0;
                root = mas_root(mas);
                /* Tree with nodes */
                if (likely(xa_is_node(root))) {
                        mas->depth = 1;
                        mas->status = ma_active;
                        mas->node = mte_safe_root(root);
                        mas->offset = 0;
                        /* The root node is dead: re-read the tree root */
                        if (mte_dead_node(mas->node))
                                goto retry;

                        return NULL;
                }

                /* empty tree */
                if (unlikely(!root)) {
                        mas->node = NULL;
                        mas->status = ma_none;
                        mas->offset = MAPLE_NODE_SLOTS;
                        return NULL;
                }

                /* Single entry tree */
                mas->status = ma_root;
                mas->offset = MAPLE_NODE_SLOTS;

                /* The single entry is only returned for index 0 */
                if (mas->index > 0)
                        return NULL;

                return root;
        }

        return NULL;
}

/*
 * ma_data_end() - Locate the last slot holding data in a node.
 * @node: The maple node
 * @type: The maple node type
 * @pivots: The array of pivots in the node
 * @max: The maximum value in the node
 *
 * The node metadata supplies the answer for maple_arange_64 nodes and for
 * nodes whose last pivot is unset.  Otherwise the last pivot decides: if it
 * equals @max the data ends on that pivot, else the data runs into the final
 * slot.
 *
 * Return: The zero indexed last slot with data (may be null), 0 if @pivots is
 * NULL.
 */
static __always_inline unsigned char ma_data_end(struct maple_node *node,
                enum maple_type type, unsigned long *pivots, unsigned long max)
{
        if (!pivots)
                return 0;

        if (type != maple_arange_64) {
                unsigned char last_piv = mt_pivots[type] - 1;

                /* A set last pivot means the metadata is not consulted. */
                if (unlikely(pivots[last_piv])) {
                        if (likely(pivots[last_piv] == max))
                                return last_piv;

                        return mt_pivots[type];
                }
        }

        return ma_meta_end(node, type);
}

/*
 * mas_data_end() - Find the end of the data (slot).
 * @mas: the maple state
 *
 * This method is optimized to check the metadata of a node if the node type
 * supports data end metadata.
 *
 * Return: The zero indexed last slot with data (may be null).
 */
static inline unsigned char mas_data_end(struct ma_state *mas)
{
        enum maple_type type;
        struct maple_node *node;
        unsigned char offset;
        unsigned long *pivots;

        type = mte_node_type(mas->node);
        node = mas_mn(mas);
        if (type == maple_arange_64)
                return ma_meta_end(node, type);

        pivots = ma_pivots(node, type);
        if (unlikely(ma_dead_node(node)))
                return 0;

        offset = mt_pivots[type] - 1;
        if (likely(!pivots[offset]))
                return ma_meta_end(node, type);

        if (likely(pivots[offset] == mas->max))
                return offset;

        return mt_pivots[type];
}

/*
 * mas_leaf_max_gap() - Returns the largest gap in a leaf node
 * @mas: the maple state
 *
 * A gap is a run of indices whose slot is empty (NULL).
 *
 * Return: The maximum gap in the leaf.
 */
static unsigned long mas_leaf_max_gap(struct ma_state *mas)
{
        enum maple_type mt;
        unsigned long pstart, gap, max_gap;
        struct maple_node *mn;
        unsigned long *pivots;
        void __rcu **slots;
        unsigned char i;
        unsigned char max_piv;

        mt = mte_node_type(mas->node);
        mn = mas_mn(mas);
        slots = ma_slots(mn, mt);
        max_gap = 0;
        if (unlikely(ma_is_dense(mt))) {
                /* Dense node: count the longest run of consecutive empty slots. */
                gap = 0;
                for (i = 0; i < mt_slots[mt]; i++) {
                        if (slots[i]) {
                                if (gap > max_gap)
                                        max_gap = gap;
                                gap = 0;
                        } else {
                                gap++;
                        }
                }
                /* Account for a run that reaches the end of the node. */
                if (gap > max_gap)
                        max_gap = gap;
                return max_gap;
        }

        /*
         * Check the first implied pivot optimizes the loop below and slot 1 may
         * be skipped if there is a gap in slot 0.
         */
        pivots = ma_pivots(mn, mt);
        if (likely(!slots[0])) {
                /* Slot 0 covers [mas->min, pivots[0]]. */
                max_gap = pivots[0] - mas->min + 1;
                i = 2;
        } else {
                i = 1;
        }

        /* reduce max_piv as the special case is checked before the loop */
        /*
         * NOTE(review): if ma_data_end() returns 0, max_piv wraps to 255
         * (unsigned char); presumably never the case for leaves passed here -
         * confirm against callers.
         */
        max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1;
        /*
         * Check end implied pivot which can only be a gap on the right most
         * node.
         */
        if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) {
                /* The last slot covers (pivots[max_piv], ULONG_MAX]. */
                gap = ULONG_MAX - pivots[max_piv];
                if (gap > max_gap)
                        max_gap = gap;

                /*
                 * Everything left to scan lies within
                 * [mas->min, pivots[max_piv]]; a gap already larger than that
                 * span cannot be beaten.
                 */
                if (max_gap > pivots[max_piv] - mas->min)
                        return max_gap;
        }

        for (; i <= max_piv; i++) {
                /* data == no gap. */
                if (likely(slots[i]))
                        continue;

                /* Slot i covers (pivots[i - 1], pivots[i]]. */
                pstart = pivots[i - 1];
                gap = pivots[i] - pstart;
                if (gap > max_gap)
                        max_gap = gap;

                /* There cannot be two gaps in a row. */
                i++;
        }
        return max_gap;
}

/*
 * ma_max_gap() - Get the maximum gap in a maple node (non-leaf)
 * @node: The maple node
 * @gaps: The pointer to the gaps
 * @mt: The maple node type
 * @off: Pointer to store the offset location of the gap.
 *
 * Walks the gaps backwards from the metadata data end down to offset 0.  Only
 * a strictly larger gap replaces the current candidate, so @off stays at the
 * data end when every gap is zero.
 *
 * Return: The maximum gap value
 */
static inline unsigned long
ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt,
            unsigned char *off)
{
        unsigned char best, pos;
        unsigned long largest = 0;

        best = pos = ma_meta_end(node, mt);
        for (;;) {
                if (gaps[pos] > largest) {
                        largest = gaps[pos];
                        best = pos;
                }

                /* Offset 0 is the last one examined. */
                if (!pos)
                        break;
                pos--;
        }

        *off = best;
        return largest;
}

/*
 * mas_max_gap() - find the largest gap in a non-leaf node and set the slot.
 * @mas: The maple state.
 *
 * Return: The gap value.
 */
static inline unsigned long mas_max_gap(struct ma_state *mas)
{
        unsigned long *gaps;
        unsigned char offset;
        enum maple_type mt;
        struct maple_node *node;

        mt = mte_node_type(mas->node);
        if (ma_is_leaf(mt))
                return mas_leaf_max_gap(mas);

        node = mas_mn(mas);
        MAS_BUG_ON(mas, mt != maple_arange_64);
        offset = ma_meta_gap(node);
        gaps = ma_gaps(node, mt);
        return gaps[offset];
}

/*
 * mas_parent_gap() - Set the parent gap and any gaps above, as needed
 * @mas: The maple state
 * @offset: The gap offset in the parent to set
 * @new: The new gap value.
 *
 * Set the parent gap then continue to set the gap upwards, using the metadata
 * of the parent to see if it is necessary to check the node above.
 */
static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset,
                unsigned long new)
{
        unsigned long meta_gap = 0;
        struct maple_node *pnode;
        struct maple_enode *penode;
        unsigned long *pgaps;
        unsigned char meta_offset;
        enum maple_type pmt;

        pnode = mte_parent(mas->node);
        pmt = mas_parent_type(mas, mas->node);
        penode = mt_mk_node(pnode, pmt);
        pgaps = ma_gaps(pnode, pmt);

ascend:
        MAS_BUG_ON(mas, pmt != maple_arange_64);
        meta_offset = ma_meta_gap(pnode);
        meta_gap = pgaps[meta_offset];

        pgaps[offset] = new;

        if (meta_gap == new)
                return;

        if (offset != meta_offset) {
                if (meta_gap > new)
                        return;

                ma_set_meta_gap(pnode, pmt, offset);
        } else if (new < meta_gap) {
                new = ma_max_gap(pnode, pgaps, pmt, &meta_offset);
                ma_set_meta_gap(pnode, pmt, meta_offset);
        }

        if (ma_is_root(pnode))
                return;

        /* Go to the parent node. */
        pnode = mte_parent(penode);
        pmt = mas_parent_type(mas, penode);
        pgaps = ma_gaps(pnode, pmt);
        offset = mte_parent_slot(penode);
        penode = mt_mk_node(pnode, pmt);
        goto ascend;
}

/*
 * mas_update_gap() - Update a node's gap and propagate up if necessary.
 * @mas: the maple state.
 *
 * Nothing is done for trees that are not allocation trees or when the node is
 * the root.  Otherwise the parent's recorded gap for this node is compared
 * with the node's current largest gap and fixed up through mas_parent_gap()
 * when they differ.
 */
static inline void mas_update_gap(struct ma_state *mas)
{
        unsigned long *parent_gaps;
        unsigned long node_gap;
        unsigned char slot;

        if (!mt_is_alloc(mas->tree) || mte_is_root(mas->node))
                return;

        node_gap = mas_max_gap(mas);

        slot = mte_parent_slot(mas->node);
        parent_gaps = ma_gaps(mte_parent(mas->node),
                              mas_parent_type(mas, mas->node));

        if (parent_gaps[slot] == node_gap)
                return;

        mas_parent_gap(mas, slot, node_gap);
}

/*
 * mas_adopt_children() - Point every child stored in @parent back at @parent,
 * encoding the slot each child occupies.
 * @mas: the maple state (for the tree)
 * @parent: the maple encoded node containing the children.
 *
 * Children are visited from the data end down to slot 0.
 */
static inline void mas_adopt_children(struct ma_state *mas,
                struct maple_enode *parent)
{
        enum maple_type type = mte_node_type(parent);
        struct maple_node *node = mte_to_node(parent);
        void __rcu **slots = ma_slots(node, type);
        unsigned long *pivots = ma_pivots(node, type);
        int pos;

        for (pos = ma_data_end(node, type, pivots, mas->max); pos >= 0; pos--) {
                struct maple_enode *child = mas_slot_locked(mas, slots, pos);

                mas_set_parent(mas, child, parent, pos);
        }
}

/*
 * mas_put_in_tree() - Put a new node in the tree, publishing it with
 * rcu_assign_pointer(), and mark the old node as dead.
 * @mas: the maple state with the new node
 * @old_enode: The old maple encoded node to replace.
 *
 * The old node is only marked dead here; it is not freed.
 */
static inline void mas_put_in_tree(struct ma_state *mas,
                struct maple_enode *old_enode)
        __must_hold(mas->tree->ma_lock)
{
        unsigned char offset;
        void __rcu **slots;

        if (mte_is_root(mas->node)) {
                /* New root: set its parent, publish it, then update the height. */
                mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas));
                rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
                mas_set_height(mas);
        } else {

                /* Publish the new node in the slot its parent encoding names. */
                offset = mte_parent_slot(mas->node);
                slots = ma_slots(mte_parent(mas->node),
                                 mas_parent_type(mas, mas->node));
                rcu_assign_pointer(slots[offset], mas->node);
        }

        /* Mark the old node dead only after the new one is in place. */
        mte_set_node_dead(old_enode);
}

/*
 * mas_replace_node() - Replace a node by putting the new node in the tree,
 * marking the old node dead, and freeing the old node.
 * @mas: the ma_state with @mas->node pointing to the new node.
 * @old_enode: The old maple encoded node.
 *
 * The new node is placed by mas_put_in_tree(); the old node is released
 * through mas_free().
 */
static inline void mas_replace_node(struct ma_state *mas,
                struct maple_enode *old_enode)
        __must_hold(mas->tree->ma_lock)
{
        mas_put_in_tree(mas, old_enode);
        mas_free(mas, old_enode);
}

/*
 * mas_find_child() - Find the next child that names @mas->node as its parent.
 * @mas: the maple state with the parent.
 * @child: the maple state to store the child.
 *
 * The scan starts at @mas->offset.  On success @mas->offset is moved just past
 * the child that was found, and @child is a copy of @mas descended into that
 * child with its offset reset to 0.
 *
 * Return: true if a child was found, false otherwise.
 */
static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child)
        __must_hold(mas->tree->ma_lock)
{
        enum maple_type type = mte_node_type(mas->node);
        struct maple_node *parent = mas_mn(mas);
        void __rcu **slots = ma_slots(parent, type);
        unsigned long *pivots = ma_pivots(parent, type);
        unsigned char last = ma_data_end(parent, type, pivots, mas->max);
        unsigned char pos;

        for (pos = mas->offset; pos <= last; pos++) {
                struct maple_enode *candidate;

                candidate = mas_slot_locked(mas, slots, pos);
                if (mte_parent(candidate) != parent)
                        continue;

                *child = *mas;
                mas->offset = pos + 1;
                child->offset = pos;
                mas_descend(child);
                child->offset = 0;
                return true;
        }

        return false;
}

/*
 * mab_shift_right() - Move the first b_end entries of a maple_big_node @shift
 * positions to the right.
 * @b_node: the maple_big_node
 * @shift: the shift count
 *
 * Pivots and slots are always moved; gaps only for maple_arange_64 data.  The
 * vacated entries are not cleared and b_node->b_end is not updated.
 */
static inline void mab_shift_right(struct maple_big_node *b_node,
                                 unsigned char shift)
{
        const unsigned long bytes = sizeof(unsigned long) * b_node->b_end;

        memmove(b_node->pivot + shift, b_node->pivot, bytes);
        memmove(b_node->slot + shift, b_node->slot, bytes);
        if (b_node->type != maple_arange_64)
                return;

        memmove(b_node->gap + shift, b_node->gap, bytes);
}

/*
 * mab_middle_node() - Check if a middle node is needed (unlikely)
 * @b_node: the maple_big_node that contains the data.
 * @split: the potential split location
 * @slot_count: the size that can be stored in a single node being considered.
 *
 * A middle node is needed when b_node->b_end is at least twice @slot_count, or
 * one less than that while the slot at @split is NULL.
 *
 * Return: true if a middle node is required.
 */
static inline bool mab_middle_node(struct maple_big_node *b_node, int split,
                                   unsigned char slot_count)
{
        const int two_nodes = 2 * slot_count;
        unsigned char size = b_node->b_end;

        if (size >= two_nodes)
                return true;

        return !b_node->slot[split] && size >= two_nodes - 1;
}

/*
 * mab_no_null_split() - Nudge a split location off of a NULL entry.
 * @b_node: the maple_big_node with the data
 * @split: the suggested split location
 * @slot_count: the number of slots in the node being considered.
 *
 * A split that lands on a non-NULL slot is returned untouched.  Otherwise the
 * split moves one to the right when it is below the last slot and the right
 * side would still hold more than the minimum number of slots; if not, it
 * moves one to the left.
 *
 * Return: the split location.
 */
static inline int mab_no_null_split(struct maple_big_node *b_node,
                                    unsigned char split, unsigned char slot_count)
{
        bool move_right;

        if (b_node->slot[split])
                return split;

        move_right = (split < slot_count - 1) &&
                     (b_node->b_end - split) > (mt_min_slots[b_node->type]);
        if (move_right)
                split++;
        else
                split--;

        return split;
}

/*
 * mab_calc_split() - Calculate the split location and if there needs to be two
 * splits.
 * @mas: The maple state; mas_flags is checked for MA_STATE_BULK and gains
 * MA_STATE_REBALANCE when a leaf is split in bulk mode.
 * @bn: The maple_big_node with the data
 * @mid_split: The second split, if required.  0 otherwise.
 * @min: Subtracted from the pivot at the split when sizing the left range;
 * presumably the lowest index covered by @bn - confirm against callers.
 *
 * Return: The first split location.  The middle split is set in @mid_split.
 */
static inline int mab_calc_split(struct ma_state *mas,
         struct maple_big_node *bn, unsigned char *mid_split, unsigned long min)
{
        unsigned char b_end = bn->b_end;
        int split = b_end / 2; /* Assume equal split. */
        unsigned char slot_min, slot_count = mt_slots[bn->type];

        /*
         * To support gap tracking, all NULL entries are kept together and a node cannot
         * end on a NULL entry, with the exception of the left-most leaf.  The
         * limitation means that the split of a node must be checked for this condition
         * and be able to put more data in one direction or the other.
         */
        if (unlikely((mas->mas_flags & MA_STATE_BULK))) {
                /* Bulk mode: leave only the minimum number of slots on the right. */
                *mid_split = 0;
                split = b_end - mt_min_slots[bn->type];

                if (!ma_is_leaf(bn->type))
                        return split;

                mas->mas_flags |= MA_STATE_REBALANCE;
                /* Do not end the left leaf on a NULL entry. */
                if (!bn->slot[split])
                        split--;
                return split;
        }

        /*
         * Although extremely rare, it is possible to enter what is known as the 3-way
         * split scenario.  The 3-way split comes about by means of a store of a range
         * that overwrites the end and beginning of two full nodes.  The result is a set
         * of entries that cannot be stored in 2 nodes.  Sometimes, these two nodes can
         * also be located in different parent nodes which are also full.  This can
         * carry upwards all the way to the root in the worst case.
         */
        if (unlikely(mab_middle_node(bn, split, slot_count))) {
                split = b_end / 3;
                *mid_split = split * 2;
        } else {
                slot_min = mt_min_slots[bn->type];

                *mid_split = 0;
                /*
                 * Avoid having a range less than the slot count unless it
                 * causes one node to be deficient.
                 * NOTE: mt_min_slots is 1 based, b_end and split are zero.
                 */
                while ((split < slot_count - 1) &&
                       ((bn->pivot[split] - min) < slot_count - 1) &&
                       (b_end - split > slot_min))
                        split++;
        }

        /* Avoid ending a node on a NULL entry */
        split = mab_no_null_split(bn, split, slot_count);

        /* The same rule applies to the second split of a 3-way split. */
        if (unlikely(*mid_split))
                *mid_split = mab_no_null_split(bn, *mid_split, slot_count);

        return split;
}

/*
 * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node
 * and set @b_node->b_end to the next free slot.
 * @mas: The maple state
 * @mas_start: The starting slot to copy
 * @mas_end: The end slot to copy (inclusively)
 * @b_node: The maple_big_node to place the data
 * @mab_start: The starting location in maple_big_node to store the data.
 *
 * Pivots are copied one at a time; copying stops early on an unset (zero)
 * pivot or once a pivot equal to mas->max has been copied.  Slots, and gaps
 * for non-leaf nodes of allocation trees, are then copied in bulk.
 */
static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start,
                        unsigned char mas_end, struct maple_big_node *b_node,
                        unsigned char mab_start)
{
        enum maple_type mt;
        struct maple_node *node;
        void __rcu **slots;
        unsigned long *pivots, *gaps;
        int i = mas_start, j = mab_start;
        unsigned char piv_end;

        node = mas_mn(mas);
        mt = mte_node_type(mas->node);
        pivots = ma_pivots(node, mt);
        /*
         * Pivot 0 is copied unconditionally, outside the loop that treats a
         * zero pivot as the end of the data.
         */
        if (!i) {
                b_node->pivot[j] = pivots[i++];
                if (unlikely(i > mas_end))
                        goto complete;
                j++;
        }

        piv_end = min(mas_end, mt_pivots[mt]);
        for (; i < piv_end; i++, j++) {
                b_node->pivot[j] = pivots[i];
                /* An unset pivot marks the end of the pivots in use. */
                if (unlikely(!b_node->pivot[j]))
                        break;

                /* Nothing can follow the pivot that reaches the node maximum. */
                if (unlikely(mas->max == b_node->pivot[j]))
                        goto complete;
        }

        /* Final entry: its pivot is obtained through mas_safe_pivot(). */
        if (likely(i <= mas_end))
                b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt);

complete:
        /* b_end is the next free slot; j becomes the number of entries copied. */
        b_node->b_end = ++j;
        j -= mab_start;
        slots = ma_slots(node, mt);
        memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j);
        if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) {
                gaps = ma_gaps(node, mt);
                memcpy(b_node->gap + mab_start, gaps + mas_start,
                       sizeof(unsigned long) * j);
        }
}

/*
 * mas_leaf_set_meta() - Record the node end in a leaf's metadata when @end is
 * below the last slot of the node type.
 * @node: The maple node
 * @mt: The maple type
 * @end: The node end
 *
 * The gap offset of the metadata is written as 0.
 */
static inline void mas_leaf_set_meta(struct maple_node *node,
                enum maple_type mt, unsigned char end)
{
        /* No metadata is written once the data reaches the last slot. */
        if (end >= mt_slots[mt] - 1)
                return;

        ma_set_meta(node, mt, 0, end);
}

/*
 * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node.
 * @b_node: the maple_big_node that has the data
 * @mab_start: the start location in @b_node.
 * @mab_end: The end location in @b_node (inclusively)
 * @mas: The maple state with the maple encoded node.
 * @new_max: When true, mas->max is set to the last pivot copied.
 *
 * Pivots and slots are copied first, then either the gaps and gap metadata
 * (non-leaf nodes of allocation trees) or the leaf metadata are written.
 */
static inline void mab_mas_cp(struct maple_big_node *b_node,
                              unsigned char mab_start, unsigned char mab_end,
                              struct ma_state *mas, bool new_max)
{
        int i, j = 0;
        enum maple_type mt = mte_node_type(mas->node);
        struct maple_node *node = mte_to_node(mas->node);
        void __rcu **slots = ma_slots(node, mt);
        unsigned long *pivots = ma_pivots(node, mt);
        unsigned long *gaps = NULL;
        unsigned char end;

        /* Never copy more pivots than the node type has. */
        if (mab_end - mab_start > mt_pivots[mt])
                mab_end--;

        /* Last pivot unset in the destination: clear the final slot as well. */
        if (!pivots[mt_pivots[mt] - 1])
                slots[mt_pivots[mt]] = NULL;

        /* Copy pivots until @mab_end or until an unset pivot is next. */
        i = mab_start;
        do {
                pivots[j++] = b_node->pivot[i++];
        } while (i <= mab_end && likely(b_node->pivot[i]));

        memcpy(slots, b_node->slot + mab_start,
               sizeof(void *) * (i - mab_start));

        if (new_max)
                mas->max = b_node->pivot[i - 1];

        /* j is the number of entries copied, so the last offset is j - 1. */
        end = j - 1;
        if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) {
                unsigned long max_gap = 0;
                unsigned char offset = 0;

                /* Copy the gaps backwards while tracking the largest one. */
                gaps = ma_gaps(node, mt);
                do {
                        gaps[--j] = b_node->gap[--i];
                        if (gaps[j] > max_gap) {
                                offset = j;
                                max_gap = gaps[j];
                        }
                } while (j);

                ma_set_meta(node, mt, offset, end);
        } else {
                mas_leaf_set_meta(node, mt, end);
        }
}

/*
 * mas_bulk_rebalance() - Drop the pending rebalance of a bulk insert once the
 * node holds enough data.
 * @mas: The maple state
 * @end: The maple node end
 * @mt: The maple node type
 *
 * Only acts when @mas is in bulk mode and the node is not the root:
 * MA_STATE_REBALANCE is cleared if @end is above the minimum slot count of
 * @mt.
 */
static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end,
                                      enum maple_type mt)
{
        if (!(mas->mas_flags & MA_STATE_BULK) || mte_is_root(mas->node))
                return;

        if (end > mt_min_slots[mt])
                mas->mas_flags &= ~MA_STATE_REBALANCE;
}

/*
 * mas_store_b_node() - Store an @entry into the b_node while also copying the
 * data from a maple encoded node.
 * @wr_mas: the maple write state
 * @b_node: the maple_big_node to fill with data
 * @offset_end: the offset to end copying
 *
 * Nothing is returned; on exit @b_node->b_end holds the offset of the last
 * entry stored in @b_node and mas->offset the offset of the new entry.
 */
static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas,
                struct maple_big_node *b_node, unsigned char offset_end)
{
        unsigned char slot;
        unsigned char b_end;
        /* Possible underflow of piv will wrap back to 0 before use. */
        unsigned long piv;
        struct ma_state *mas = wr_mas->mas;

        b_node->type = wr_mas->type;
        b_end = 0;
        slot = mas->offset;
        if (slot) {
                /* Copy start data up to insert. */
                mas_mab_cp(mas, 0, slot - 1, b_node, 0);
                b_end = b_node->b_end;
                piv = b_node->pivot[b_end - 1];
        } else
                piv = mas->min - 1;

        /* piv is now the last index before the range being written into. */
        if (piv + 1 < mas->index) {
                /* Handle range starting after old range */
                b_node->slot[b_end] = wr_mas->content;
                if (!wr_mas->content)
                        b_node->gap[b_end] = mas->index - 1 - piv;
                b_node->pivot[b_end++] = mas->index - 1;
        }

        /* Store the new entry. */
        mas->offset = b_end;
        b_node->slot[b_end] = wr_mas->entry;
        b_node->pivot[b_end] = mas->last;

        /* Appended. */
        if (mas->last >= mas->max)
                goto b_end;

        /* Handle new range ending before old range ends */
        piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type);
        if (piv > mas->last) {
                if (piv == ULONG_MAX)
                        mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type);

                /* The tail belongs to the entry at @offset_end, not the start. */
                if (offset_end != slot)
                        wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
                                                          offset_end);

                b_node->slot[++b_end] = wr_mas->content;
                /*
                 * NOTE(review): the tail covers (mas->last, piv], which is
                 * piv - mas->last indices; confirm the + 1 is intended.
                 */
                if (!wr_mas->content)
                        b_node->gap[b_end] = piv - mas->last + 1;
                b_node->pivot[b_end] = piv;
        }

        slot = offset_end + 1;
        if (slot > mas->end)
                goto b_end;

        /* Copy end data to the end of the node. */
        mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end);
        /* mas_mab_cp() left b_end at the next free slot; step back to the last entry. */
        b_node->b_end--;
        return;

b_end:
        b_node->b_end = b_end;
}

/*
 * mas_prev_sibling() - Move to the previous node under the same parent.
 * @mas: the maple state
 *
 * On success @mas points at the sibling in the slot immediately before the
 * original node.  @mas is left untouched when the node is the root or already
 * sits in slot 0 of its parent.
 *
 * Return: True if there is a previous sibling, false otherwise.
 */
static inline bool mas_prev_sibling(struct ma_state *mas)
{
        unsigned int slot = mte_parent_slot(mas->node);

        if (mte_is_root(mas->node) || !slot)
                return false;

        mas_ascend(mas);
        mas->offset = slot - 1;
        mas_descend(mas);
        return true;
}

/*
 * mas_next_sibling() - Find the next node with the same parent.
 * @mas: the maple state
 *
 * Return: true if there is a next sibling, false otherwise.
 */
static inline bool mas_next_sibling(struct ma_state *mas)
{
        MA_STATE(parent, mas->tree, mas->index, mas->last);

        if (mte_is_root(mas->node))
                return false;

        parent = *mas;
        mas_ascend(&parent);
        parent.offset = mte_parent_slot(mas->node) + 1;
        if (parent.offset > mas_data_end(&parent))
                return false;

        *mas = parent;
        mas_descend(mas);
        return true;
}

/*
 * mas_node_or_none() - Set the enode and state.
 * @mas: The maple state.
 * @enode: The encoded maple node.
 *
 * Store @enode in the maple state.  The status becomes ma_active for a
 * non-NULL @enode and ma_none for a NULL one.
 */
static inline void mas_node_or_none(struct ma_state *mas,
                struct maple_enode *enode)
{
        mas->node = enode;
        mas->status = enode ? ma_active : ma_none;
}

/*
 * mas_wr_node_walk() - Find the correct offset for the index in the @mas.
 * @wr_mas: The maple write state
 *
 * Starting from mas->offset, advance to the first offset whose pivot is not
 * below mas->index, then record that offset together with the range
 * (r_min, r_max) it covers.  mas->end is set to the node's data end.  Dense
 * nodes are handled separately and need no pivot search.
 */
static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned long *pivots;
        unsigned char end, pos;

        if (unlikely(ma_is_dense(wr_mas->type))) {
                wr_mas->r_min = mas->index;
                wr_mas->r_max = wr_mas->r_min;
                mas->index = mas->min;
                mas->offset = mas->index;
                return;
        }

        wr_mas->node = mas_mn(mas);
        pivots = ma_pivots(wr_mas->node, wr_mas->type);
        wr_mas->pivots = pivots;
        end = ma_data_end(wr_mas->node, wr_mas->type, pivots, mas->max);
        mas->end = end;

        for (pos = mas->offset; pos < end; pos++) {
                if (mas->index <= pivots[pos])
                        break;
        }

        /* The last offset has no pivot of its own; it ends at mas->max. */
        if (pos < end)
                wr_mas->r_max = pivots[pos];
        else
                wr_mas->r_max = mas->max;

        wr_mas->r_min = mas_safe_min(mas, pivots, pos);
        mas->offset = pos;
        wr_mas->offset_end = pos;
}

/*
 * mast_rebalance_next() - Rebalance against the next node
 * @mast: The maple subtree state
 * @old_r: The encoded maple node to the right (next node).
 */
static inline void mast_rebalance_next(struct maple_subtree_state *mast)
{
        unsigned char b_end = mast->bn->b_end;

        mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node),
                   mast->bn, b_end);
        mast->orig_r->last = mast->orig_r->max;
}

/*
 * mast_rebalance_prev() - Rebalance against the previous node
 * @mast: The maple subtree state
 * @old_l: The encoded maple node to the left (previous node)
 */
static inline void mast_rebalance_prev(struct maple_subtree_state *mast)
{
        unsigned char end = mas_data_end(mast->orig_l) + 1;
        unsigned char b_end = mast->bn->b_end;

        mab_shift_right(mast->bn, end);
        mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0);
        mast->l->min = mast->orig_l->min;
        mast->orig_l->index = mast->orig_l->min;
        mast->bn->b_end = end + b_end;
        mast->l->offset += end;
}

/*
 * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring
 * the node to the right.  Checking the nodes to the right then the left at each
 * level upwards until root is reached.
 * Data is copied into the @mast->bn.
 * @mast: The maple_subtree_state.
 *
 * Return: true if a neighbour was found and its data copied into @mast->bn,
 * false if the root was reached without finding one (orig_l and orig_r are
 * then restored to their state on entry).
 */
static inline
bool mast_spanning_rebalance(struct maple_subtree_state *mast)
{
        /* Snapshots used to restore whichever side does not move. */
        struct ma_state r_tmp = *mast->orig_r;
        struct ma_state l_tmp = *mast->orig_l;
        unsigned char depth = 0;

        do {
                mas_ascend(mast->orig_r);
                mas_ascend(mast->orig_l);
                depth++;
                if (mast->orig_r->offset < mas_data_end(mast->orig_r)) {
                        /* Step right, then descend @depth levels via slot 0. */
                        mast->orig_r->offset++;
                        do {
                                mas_descend(mast->orig_r);
                                mast->orig_r->offset = 0;
                        } while (--depth);

                        mast_rebalance_next(mast);
                        *mast->orig_l = l_tmp;
                        return true;
                } else if (mast->orig_l->offset != 0) {
                        /* Step left, then descend @depth levels via the last slot. */
                        mast->orig_l->offset--;
                        do {
                                mas_descend(mast->orig_l);
                                mast->orig_l->offset =
                                        mas_data_end(mast->orig_l);
                        } while (--depth);

                        mast_rebalance_prev(mast);
                        *mast->orig_r = r_tmp;
                        return true;
                }
        } while (!mte_is_root(mast->orig_r->node));

        /* No neighbour on either side at any level: undo the ascent. */
        *mast->orig_r = r_tmp;
        *mast->orig_l = l_tmp;
        return false;
}

/*
 * mast_ascend() - Ascend the original left and right maple states.
 * @mast: the maple subtree state.
 *
 * Ascend the original left and right sides.  Set the offsets to point to the
 * data already in the new tree (@mast->l and @mast->r).
 *
 * @mast->bn->type is set to the node type of the ascended left side.
 */
static inline void mast_ascend(struct maple_subtree_state *mast)
{
        MA_WR_STATE(wr_mas, mast->orig_r,  NULL);
        mas_ascend(mast->orig_l);
        mas_ascend(mast->orig_r);

        /* Right side: search from offset 0 for the slot holding mast->r->max. */
        mast->orig_r->offset = 0;
        mast->orig_r->index = mast->r->max;
        /* last should be larger than or equal to index */
        if (mast->orig_r->last < mast->orig_r->index)
                mast->orig_r->last = mast->orig_r->index;

        wr_mas.type = mte_node_type(mast->orig_r->node);
        mas_wr_node_walk(&wr_mas);
        /* Set up the left side of things */
        mast->orig_l->offset = 0;
        mast->orig_l->index = mast->l->min;
        /* Reuse the same write state for the left side walk. */
        wr_mas.mas = mast->orig_l;
        wr_mas.type = mte_node_type(mast->orig_l->node);
        mas_wr_node_walk(&wr_mas);

        mast->bn->type = wr_mas.type;
}

/*
 * mas_new_ma_node() - Create and return a new maple node.  Helper function.
 * @mas: the maple state with the allocations.
 * @b_node: the maple_big_node with the type encoding.
 *
 * Pop a node from the allocations of @mas and encode it with the node type
 * carried by @b_node.
 *
 * Return: A new maple encoded node
 */
static inline struct maple_enode
*mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node)
{
        struct maple_node *fresh = ma_mnode_ptr(mas_pop_node(mas));

        return mt_mk_node(fresh, b_node->type);
}

/*
 * mas_mab_to_node() - Allocate the nodes that a big node will be copied into.
 *
 * @mas: the maple state that contains the allocations.
 * @b_node: the big node which contains the data.
 * @left: set to the new left node (always allocated)
 * @right: set to the new right node, or NULL when @b_node->b_end is below the
 *         slot count of the node type
 * @middle: set to the new middle node, or NULL when @mid_split stays 0 (rare)
 * @mid_split: set to the split location for the middle node, 0 if none
 * @min: the minimum handed to mab_calc_split()
 *
 * Return: the split of left.
 */
static inline unsigned char mas_mab_to_node(struct ma_state *mas,
        struct maple_big_node *b_node, struct maple_enode **left,
        struct maple_enode **right, struct maple_enode **middle,
        unsigned char *mid_split, unsigned long min)
{
        unsigned char split;

        *left = mas_new_ma_node(mas, b_node);
        *right = NULL;
        *middle = NULL;
        *mid_split = 0;

        if (b_node->b_end >= mt_slots[b_node->type]) {
                /* Too much data for one node: calculate where to cut. */
                split = mab_calc_split(mas, b_node, mid_split, min);
                *right = mas_new_ma_node(mas, b_node);
        } else {
                split = b_node->b_end;
        }

        /* mab_calc_split() may have asked for a three-way split. */
        if (*mid_split)
                *middle = mas_new_ma_node(mas, b_node);

        return split;
}

/*
 * mab_set_b_end() - Append an entry to the big node at b_node->b_end and
 * advance the end.
 * @b_node: the big node to add the entry to
 * @mas: the maple state supplying the pivot (mas->max) and, for allocation
 *       trees, the gap (mas_max_gap())
 * @entry: the entry to add, if NULL nothing happens.
 */
static inline void mab_set_b_end(struct maple_big_node *b_node,
                                 struct ma_state *mas,
                                 void *entry)
{
        unsigned char end;

        if (!entry)
                return;

        end = b_node->b_end;
        b_node->slot[end] = entry;
        if (mt_is_alloc(mas->tree))
                b_node->gap[end] = mas_max_gap(mas);
        b_node->pivot[end] = mas->max;
        b_node->b_end = end + 1;
}

/*
 * mas_set_split_parent() - combine_then_separate helper function.  Sets the
 * parent of @mas->node to either @left or @right, depending on @slot and @split
 *
 * @mas: the maple state with the node that needs a parent; ignored when it is
 *       in the none state
 * @left: possible parent 1, used when *@slot <= @split
 * @right: possible parent 2 (may be NULL, in which case no parent is set)
 * @slot: the slot the mas->node was placed; incremented on return unless @mas
 *        is none
 * @split: the split location between @left and @right
 */
static inline void mas_set_split_parent(struct ma_state *mas,
                                        struct maple_enode *left,
                                        struct maple_enode *right,
                                        unsigned char *slot, unsigned char split)
{
        unsigned char pos;

        if (mas_is_none(mas))
                return;

        pos = *slot;
        if (pos > split) {
                /* Slots in the right node are counted from just past the split. */
                if (right)
                        mas_set_parent(mas, mas->node, right, pos - split - 1);
        } else {
                mas_set_parent(mas, mas->node, left, pos);
        }

        *slot = pos + 1;
}

/*
 * mte_mid_split_check() - Check if the next node passes the mid-split
 * @l: Pointer to the current left encoded maple node.
 * @r: Pointer to the current right encoded maple node (may be the middle).
 * @right: The right-most encoded maple node.
 * @slot: The offset
 * @split: Pointer to the split location.
 * @mid_split: The middle split.
 *
 * Once @slot reaches @mid_split and *@r is not yet @right, the window is moved
 * over by one node: *@l takes the old *@r, *@r becomes @right and *@split
 * becomes @mid_split.  Otherwise nothing is changed.
 */
static inline void mte_mid_split_check(struct maple_enode **l,
                                       struct maple_enode **r,
                                       struct maple_enode *right,
                                       unsigned char slot,
                                       unsigned char *split,
                                       unsigned char mid_split)
{
        if (*r == right || slot < mid_split)
                return;

        *l = *r;
        *r = right;
        *split = mid_split;
}

/*
 * mast_set_split_parents() - Helper function to set three nodes parents.  Slot
 * is taken from @mast->l.
 * @mast: the maple subtree state
 * @left: the left node
 * @middle: the middle node, or NULL
 * @right: the right node
 * @split: the split location.
 * @mid_split: the split location between @middle and @right
 *
 * Nothing is done when @mast->l is in the none state.  Otherwise @mast->l,
 * @mast->m and @mast->r are handled in that order, each preceded by a mid-split
 * check.
 */
static inline void mast_set_split_parents(struct maple_subtree_state *mast,
                                          struct maple_enode *left,
                                          struct maple_enode *middle,
                                          struct maple_enode *right,
                                          unsigned char split,
                                          unsigned char mid_split)
{
        struct ma_state *children[3];
        struct maple_enode *l = left;
        struct maple_enode *r = right;
        unsigned char slot;
        int i;

        if (mas_is_none(mast->l))
                return;

        /* With a middle node, the first pair of candidates is left/middle. */
        if (middle)
                r = middle;

        children[0] = mast->l;
        children[1] = mast->m;
        children[2] = mast->r;
        slot = mast->l->offset;

        for (i = 0; i < 3; i++) {
                mte_mid_split_check(&l, &r, right, slot, &split, mid_split);
                mas_set_split_parent(children[i], l, r, &slot, split);
        }
}

/*
 * mas_topiary_node() - Dispose of a single node
 * @mas: The maple state for pushing nodes
 * @tmp_mas: The maple state whose node is disposed of; skipped if it is none
 * @in_rcu: If the tree is in rcu mode
 *
 * The node is marked dead and then either RCU freed or pushed back on @mas.
 */
static inline void mas_topiary_node(struct ma_state *mas,
                struct ma_state *tmp_mas, bool in_rcu)
{
        struct maple_enode *enode;
        struct maple_node *node;

        if (mas_is_none(tmp_mas))
                return;

        enode = tmp_mas->node;
        node = mte_to_node(enode);
        mte_set_node_dead(enode);

        if (!in_rcu) {
                mas_push_node(mas, node);
                return;
        }

        ma_free_rcu(node);
}

/*
 * mas_topiary_replace() - Replace the data with new data, then repair the
 * parent links within the new tree.  Iterate over the dead sub-tree and collect
 * the dead subtrees and topiary the nodes that are no longer of use.
 *
 * The new tree will have up to three children with the correct parent.  Keep
 * track of the new entries as they need to be followed to find the next level
 * of new entries.
 *
 * The old tree will have up to three children with the old parent.  Keep track
 * of the old entries as they may have more nodes below replaced.  Nodes within
 * [index, last] are dead subtrees, others need to be freed and followed.
 *
 * @mas: The maple state pointing at the new data
 * @old_enode: The maple encoded node being replaced
 *
 * If @old_enode is a leaf it is simply freed with mas_free() after the new
 * data has been put in the tree and the new parent links repaired.
 */
static inline void mas_topiary_replace(struct ma_state *mas,
                struct maple_enode *old_enode)
{
        struct ma_state tmp[3], tmp_next[3];
        MA_TOPIARY(subtrees, mas->tree);
        bool in_rcu;
        int i, n;

        /* Place data in tree & then mark node as old */
        mas_put_in_tree(mas, old_enode);

        /*
         * Update the parent pointers in the tree.  tmp[] holds the (up to
         * three) states of the current level, tmp_next[] collects the states
         * found for the level below.
         */
        tmp[0] = *mas;
        tmp[0].offset = 0;
        tmp[1].status = ma_none;
        tmp[2].status = ma_none;
        while (!mte_is_leaf(tmp[0].node)) {
                n = 0;
                for (i = 0; i < 3; i++) {
                        if (mas_is_none(&tmp[i]))
                                continue;

                        /* Gather children until none is found or 3 are held. */
                        while (n < 3) {
                                if (!mas_find_child(&tmp[i], &tmp_next[n]))
                                        break;
                                n++;
                        }

                        mas_adopt_children(&tmp[i], tmp[i].node);
                }

                if (MAS_WARN_ON(mas, n == 0))
                        break;

                /* Mark the unused entries of the next level as none. */
                while (n < 3)
                        tmp_next[n++].status = ma_none;

                for (i = 0; i < 3; i++)
                        tmp[i] = tmp_next[i];
        }

        /* Collect the old nodes that need to be discarded */
        if (mte_is_leaf(old_enode)) {
                /*
                 * ISO C does not allow "return expr;" in a void function, so
                 * call and return separately.
                 */
                mas_free(mas, old_enode);
                return;
        }

        tmp[0] = *mas;
        tmp[0].offset = 0;
        tmp[0].node = old_enode;
        tmp[1].status = ma_none;
        tmp[2].status = ma_none;
        in_rcu = mt_in_rcu(mas->tree);
        do {
                n = 0;
                for (i = 0; i < 3; i++) {
                        if (mas_is_none(&tmp[i]))
                                continue;

                        while (n < 3) {
                                if (!mas_find_child(&tmp[i], &tmp_next[n]))
                                        break;

                                /*
                                 * A child entirely inside [index, last] is a
                                 * dead subtree: queue it on the topiary list
                                 * and reuse this tmp_next[] entry.  Any other
                                 * child is kept so it is followed next level.
                                 * NOTE(review): index/last are read from
                                 * tmp_next[0], not tmp_next[n]; presumably all
                                 * copies carry the same values - confirm
                                 * against mas_find_child().
                                 */
                                if ((tmp_next[n].min >= tmp_next->index) &&
                                    (tmp_next[n].max <= tmp_next->last)) {
                                        mat_add(&subtrees, tmp_next[n].node);
                                        tmp_next[n].status = ma_none;
                                } else {
                                        n++;
                                }
                        }
                }

                if (MAS_WARN_ON(mas, n == 0))
                        break;

                while (n < 3)
                        tmp_next[n++].status = ma_none;

                /* Dispose of this level's old nodes, then step down a level. */
                for (i = 0; i < 3; i++) {
                        mas_topiary_node(mas, &tmp[i], in_rcu);
                        tmp[i] = tmp_next[i];
                }
        } while (!mte_is_leaf(tmp[0].node));

        /* Dispose of whatever is still held in tmp[] after the loop. */
        for (i = 0; i < 3; i++)
                mas_topiary_node(mas, &tmp[i], in_rcu);

        mas_mat_destroy(mas, &subtrees);
}

/*
 * mas_wmb_replace() - Replace a node in the tree and fix up the gaps
 * @mas: The maple state
 * @old_enode: The old maple encoded node that is being replaced.
 *
 * The replacement itself is done by mas_topiary_replace().  The gap is updated
 * afterwards unless @mas->node is a leaf.
 */
static inline void mas_wmb_replace(struct ma_state *mas,
                struct maple_enode *old_enode)
{
        /* Insert the new data in the tree */
        mas_topiary_replace(mas, old_enode);

        if (!mte_is_leaf(mas->node))
                mas_update_gap(mas);
}

/*
 * mast_cp_to_nodes() - Copy data out to nodes.
 * @mast: The maple subtree state
 * @left: The left encoded maple node
 * @middle: The middle encoded maple node
 * @right: The right encoded maple node
 * @split: The location to split between left and (middle ? middle : right)
 * @mid_split: The location to split between middle and right.
 */
static inline void mast_cp_to_nodes(struct maple_subtree_state *mast,
        struct maple_enode *left, struct maple_enode *middle,
        struct maple_enode *right, unsigned char split, unsigned char mid_split)
{
        bool new_lmax = true;

        mas_node_or_none(mast->l, left);
        mas_node_or_none(mast->m, middle);
        mas_node_or_none(mast->r, right);

        mast->l->min = mast->orig_l->min;
        if (split == mast->bn->b_end) {
                mast->l->max = mast->orig_r->max;
                new_lmax = false;
        }

        mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax);

        if (middle) {
                mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true);
                mast->m->min = mast->bn->pivot[split] + 1;
                split = mid_split;
        }

        mast->r->max = mast->orig_r->max;
        if (right) {
                mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false);
                mast->r->min = mast->bn->pivot[split] + 1;
        }
}

/*
 * mast_combine_cp_left - Copy the slots before the original left offset into
 * the start of the maple subtree state big node.
 * @mast: The maple subtree state
 *
 * Nothing is copied when @mast->orig_l->offset is 0.
 */
static inline void mast_combine_cp_left(struct maple_subtree_state *mast)
{
        unsigned char l_slot = mast->orig_l->offset;

        if (l_slot)
                mas_mab_cp(mast->orig_l, 0, l_slot - 1, mast->bn, 0);
}

/*
 * mast_combine_cp_right: Copy in the original right side of the tree into the
 * combined data set in the maple subtree state big node.
 * @mast: The maple subtree state
 */
static inline void mast_combine_cp_right(struct maple_subtree_state *mast)
{
        if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max)
                return;

        mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1,
                   mt_slot_count(mast->orig_r->node), mast->bn,
                   mast->bn->b_end);
        mast->orig_r->last = mast->orig_r->max;
}

/*
 * mast_sufficient: Check if the maple subtree state has enough data in the big
 * node to create at least one sufficient node
 * @mast: the maple subtree state
 *
 * Return: true if the big node end is above the minimum slot count of the
 * original left node, false otherwise.
 */
static inline bool mast_sufficient(struct maple_subtree_state *mast)
{
        return mast->bn->b_end > mt_min_slot_count(mast->orig_l->node);
}

/*
 * mast_overflow: Check if there is too much data in the subtree state for a
 * single node.
 * @mast: The maple subtree state
 *
 * Return: true if the big node end has reached the slot count of the original
 * left node, false otherwise.
 */
static inline bool mast_overflow(struct maple_subtree_state *mast)
{
        return mast->bn->b_end >= mt_slot_count(mast->orig_l->node);
}

/*
 * mtree_range_walk() - Walk from @mas->node down to the leaf slot that covers
 * @mas->index.
 * @mas: The maple state.  mas->node, mas->min and mas->max give the starting
 *       node and its limits; mas->index is the value searched for.
 *
 * At every level the first pivot that is >= @mas->index selects the slot to
 * follow.  On success the maple state describes what was found: mas->node is
 * the leaf, mas->min/mas->max are the limits of that leaf, mas->offset and
 * mas->end are the slot found and the data end of the leaf, and
 * mas->index/mas->last are set to the range covered by the slot.
 *
 * Return: The contents of the slot found in the leaf.  If a dead node is
 * encountered the maple state is reset and NULL is returned.
 */
static inline void *mtree_range_walk(struct ma_state *mas)
{
        unsigned long *pivots;
        unsigned char offset;
        struct maple_node *node;
        struct maple_enode *next, *last;
        enum maple_type type;
        void __rcu **slots;
        unsigned char end;
        unsigned long max, min;
        unsigned long prev_max, prev_min;

        next = mas->node;
        min = mas->min;
        max = mas->max;
        do {
                last = next;
                node = mte_to_node(next);
                type = mte_node_type(next);
                pivots = ma_pivots(node, type);
                end = ma_data_end(node, type, pivots, max);
                /* Remember the limits of the node being examined. */
                prev_min = min;
                prev_max = max;
                /* Slot 0: the minimum is inherited, only the max narrows. */
                if (pivots[0] >= mas->index) {
                        offset = 0;
                        max = pivots[0];
                        goto next;
                }

                /*
                 * Scan the remaining pivots.  If none matches, offset ends up
                 * at end and max keeps the node's own maximum.
                 */
                offset = 1;
                while (offset < end) {
                        if (pivots[offset] >= mas->index) {
                                max = pivots[offset];
                                break;
                        }
                        offset++;
                }

                min = pivots[offset - 1] + 1;
next:
                slots = ma_slots(node, type);
                next = mt_slot(mas->tree, slots, offset);
                /* Checked after the slot has been read. */
                if (unlikely(ma_dead_node(node)))
                        goto dead_node;
        } while (!ma_is_leaf(type));

        mas->end = end;
        mas->offset = offset;
        mas->index = min;
        mas->last = max;
        mas->min = prev_min;
        mas->max = prev_max;
        mas->node = last;
        return (void *)next;

dead_node:
        mas_reset(mas);
        return NULL;
}

/*
 * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers.
 * @mas: The starting maple state.  New nodes are popped from it and, on
 *       return, it has been pointed at the new sub-tree and re-walked.
 * @mast: The maple_subtree_state.  @mast->orig_l, @mast->orig_r and @mast->bn
 *        must be set up by the caller; @mast->l, @mast->m and @mast->r are
 *        pointed at maple states local to this function.
 * @count: The estimated count of iterations needed.
 *
 * Each iteration splits @mast->bn into up to three new nodes (left, middle,
 * right), ascends the original left and right states one level and refills
 * @mast->bn with the surrounding data of that level plus the new nodes.  The
 * loop ends when @count is used up, when the new left node covers the root
 * limits, or when the original left state covers the root limits and the data
 * is neither sufficient nor overflowing.  The depth of the local left state
 * tracks the height of the new sub-tree and is copied to @mas->depth.  The
 * new sub-tree then replaces the old one via mas_wmb_replace().
 *
 * Return: @mast->bn->b_end as it stands when the function returns.
 */
static int mas_spanning_rebalance(struct ma_state *mas,
                struct maple_subtree_state *mast, unsigned char count)
{
        unsigned char split, mid_split;
        unsigned char slot = 0;
        struct maple_enode *left = NULL, *middle = NULL, *right = NULL;
        struct maple_enode *old_enode;

        MA_STATE(l_mas, mas->tree, mas->index, mas->index);
        MA_STATE(r_mas, mas->tree, mas->index, mas->last);
        MA_STATE(m_mas, mas->tree, mas->index, mas->index);

        /*
         * The tree needs to be rebalanced and leaves need to be kept at the same level.
         * Rebalancing is done by use of the ``struct maple_topiary``.
         */
        mast->l = &l_mas;
        mast->m = &m_mas;
        mast->r = &r_mas;
        l_mas.status = r_mas.status = m_mas.status = ma_none;

        /* Check if this is not root and has sufficient data.  */
        if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) &&
            unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type]))
                mast_spanning_rebalance(mast);

        l_mas.depth = 0;

        /*
         * Each level of the tree is examined and balanced, pushing data to the left or
         * right, or rebalancing against left or right nodes is employed to avoid
         * rippling up the tree to limit the amount of churn.  Once a new sub-section of
         * the tree is created, there may be a mix of new and old nodes.  The old nodes
         * will have the incorrect parent pointers and currently be in two trees: the
         * original tree and the partially new tree.  To remedy the parent pointers in
         * the old tree, the new data is swapped into the active tree and a walk down
         * the tree is performed and the parent pointers are updated.
         * See mas_topiary_replace() for more information.
         */
        while (count--) {
                mast->bn->b_end--;
                mast->bn->type = mte_node_type(mast->orig_l->node);
                /* Allocate left/middle/right and work out the split points. */
                split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle,
                                        &mid_split, mast->orig_l->min);
                /* Re-parent the previous iteration's nodes to the new ones. */
                mast_set_split_parents(mast, left, middle, right, split,
                                       mid_split);
                mast_cp_to_nodes(mast, left, middle, right, split, mid_split);

                /*
                 * The big node has been copied out; clear it so it can be
                 * refilled with the data of the next level up.
                 */
                memset(mast->bn, 0, sizeof(struct maple_big_node));
                mast->bn->type = mte_node_type(left);
                l_mas.depth++;

                /* Root already stored in l->node. */
                if (mas_is_root_limits(mast->l))
                        goto new_root;

                /*
                 * Rebuild the big node for the parent level: data left of the
                 * original left offset, the new nodes, then data right of the
                 * original right offset.
                 */
                mast_ascend(mast);
                mast_combine_cp_left(mast);
                l_mas.offset = mast->bn->b_end;
                mab_set_b_end(mast->bn, &l_mas, left);
                mab_set_b_end(mast->bn, &m_mas, middle);
                mab_set_b_end(mast->bn, &r_mas, right);

                /* Copy anything necessary out of the right node. */
                mast_combine_cp_right(mast);
                mast->orig_l->last = mast->orig_l->max;

                if (mast_sufficient(mast))
                        continue;

                if (mast_overflow(mast))
                        continue;

                /* May be a new root stored in mast->bn */
                if (mas_is_root_limits(mast->orig_l))
                        break;

                mast_spanning_rebalance(mast);

                /* rebalancing from other nodes may require another loop. */
                if (!count)
                        count++;
        }

        /* Build the top node of the new sub-tree from what is left in bn. */
        l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)),
                                mte_node_type(mast->orig_l->node));
        l_mas.depth++;
        mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true);
        mas_set_parent(mas, left, l_mas.node, slot);
        if (middle)
                mas_set_parent(mas, middle, l_mas.node, ++slot);

        if (right)
                mas_set_parent(mas, right, l_mas.node, ++slot);

        if (mas_is_root_limits(mast->l)) {
                /* The loop jumps straight here when the new left node is a root. */
new_root:
                mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas));
                /* Ascend to the old root so that it is the node replaced. */
                while (!mte_is_root(mast->orig_l->node))
                        mast_ascend(mast);
        } else {
                mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent;
        }

        /* Point mas at the new sub-tree and swap it in for the old node. */
        old_enode = mast->orig_l->node;
        mas->depth = l_mas.depth;
        mas->node = l_mas.node;
        mas->min = l_mas.min;
        mas->max = l_mas.max;
        mas->offset = l_mas.offset;
        mas_wmb_replace(mas, old_enode);
        mtree_range_walk(mas);
        return mast->bn->b_end;
}

/*
 * mas_rebalance() - Rebalance a given node.
 * @mas: The maple state
 * @b_node: The big maple node.
 *
 * The data in @b_node is combined with the data of a sibling and handed to
 * mas_spanning_rebalance(), which continues upwards as needed.
 *
 * Return: 0 if the node allocation failed, otherwise the value returned by
 * mas_spanning_rebalance().
 */
static inline int mas_rebalance(struct ma_state *mas,
                                struct maple_big_node *b_node)
{
        char height = mas_mt_height(mas);
        struct maple_subtree_state mast;
        unsigned char b_end = ++b_node->b_end;
        unsigned char shift;

        MA_STATE(l_mas, mas->tree, mas->index, mas->last);
        MA_STATE(r_mas, mas->tree, mas->index, mas->last);

        trace_ma_op(__func__, mas);

        /*
         * Rebalancing occurs if a node is insufficient.  The node to the right
         * is the preferred partner; only if there is none is the node to the
         * left used.  Request the nodes up front, sized from the tree height.
         */
        mas_node_count(mas, height * 2 - 1);
        if (mas_is_err(mas))
                return 0;

        mast.orig_l = &l_mas;
        mast.orig_r = &r_mas;
        mast.bn = b_node;
        b_node->type = mte_node_type(mas->node);

        r_mas = *mas;
        l_mas = r_mas;

        if (!mas_next_sibling(&r_mas)) {
                /* No right sibling: put the left sibling's data in front. */
                mas_prev_sibling(&l_mas);
                shift = mas_data_end(&l_mas) + 1;
                mab_shift_right(b_node, shift);
                mas->offset += shift;
                mas_mab_cp(&l_mas, 0, shift - 1, b_node, 0);
                b_node->b_end = shift + b_end;
                l_mas.last = l_mas.min;
                l_mas.index = l_mas.min;
        } else {
                /* Append the right sibling's data after the existing data. */
                mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node, b_end);
                r_mas.index = r_mas.max;
                r_mas.last = r_mas.max;
        }

        return mas_spanning_rebalance(mas, &mast, height);
}

/*
 * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple
 * state.
 * @mas: The maple state
 * @end: The end of the left-most node.
 *
 * During a mass-insert event (such as forking), it may be necessary to
 * rebalance the left-most node when it is not sufficient.
 *
 * Entries after a split point in the previous sibling are moved in front of
 * the first @end entries of @mas->node.  When the tree is not in RCU mode the
 * two nodes and the parent pivot are modified in place; in RCU mode new copies
 * of both nodes and of the parent are built and the parent is replaced.  If
 * the RCU-mode allocation fails the function returns without changing the tree.
 */
static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end)
{
        enum maple_type mt = mte_node_type(mas->node);
        /* reuse is an on-stack scratch node for the non-RCU path. */
        struct maple_node reuse, *newnode, *parent, *new_left, *left, *node;
        struct maple_enode *eparent, *old_eparent;
        unsigned char offset, tmp, split = mt_slots[mt] / 2;
        void __rcu **l_slots, **slots;
        unsigned long *l_pivs, *pivs, gap;
        bool in_rcu = mt_in_rcu(mas->tree);

        MA_STATE(l_mas, mas->tree, mas->index, mas->last);

        l_mas = *mas;
        mas_prev_sibling(&l_mas);

        /* set up node. */
        if (in_rcu) {
                /* Allocate for both left and right as well as parent. */
                mas_node_count(mas, 3);
                if (mas_is_err(mas))
                        return;

                newnode = mas_pop_node(mas);
        } else {
                newnode = &reuse;
        }

        node = mas_mn(mas);
        newnode->parent = node->parent;
        slots = ma_slots(newnode, mt);
        pivs = ma_pivots(newnode, mt);
        left = mas_mn(&l_mas);
        l_slots = ma_slots(left, mt);
        l_pivs = ma_pivots(left, mt);
        /* Move the split up by one if the slot at the split is empty. */
        if (!l_slots[split])
                split++;
        /* tmp: number of entries taken from the left node (after split). */
        tmp = mas_data_end(&l_mas) - split;

        memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp);
        memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp);
        /*
         * NOTE(review): pivs[tmp] is overwritten by the pivot memcpy below
         * whenever end > 0 - confirm the intended index.
         */
        pivs[tmp] = l_mas.max;
        memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end);
        memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end);

        /* The left node now ends at the split; this node starts just after. */
        l_mas.max = l_pivs[split];
        mas->min = l_mas.max + 1;
        old_eparent = mt_mk_node(mte_parent(l_mas.node),
                             mas_parent_type(&l_mas, l_mas.node));
        /* tmp becomes the total number of entries in the new node. */
        tmp += end;
        if (!in_rcu) {
                unsigned char max_p = mt_pivots[mt];
                unsigned char max_s = mt_slots[mt];

                /* Zero the unused tail of the scratch node before copying. */
                if (tmp < max_p)
                        memset(pivs + tmp, 0,
                               sizeof(unsigned long) * (max_p - tmp));

                if (tmp < mt_slots[mt])
                        memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp));

                /* Overwrite the live node in place with the scratch node. */
                memcpy(node, newnode, sizeof(struct maple_node));
                ma_set_meta(node, mt, 0, tmp - 1);
                mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node),
                              l_pivs[split]);

                /* Remove data from l_pivs. */
                tmp = split + 1;
                memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp));
                memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp));
                ma_set_meta(left, mt, 0, split);
                eparent = old_eparent;

                goto done;
        }

        /* RCU requires replacing both l_mas, mas, and parent. */
        mas->node = mt_mk_node(newnode, mt);
        /*
         * NOTE(review): the in-place path above passes tmp - 1 here - confirm
         * which value the meta end should hold.
         */
        ma_set_meta(newnode, mt, 0, tmp);

        new_left = mas_pop_node(mas);
        new_left->parent = left->parent;
        mt = mte_node_type(l_mas.node);
        slots = ma_slots(new_left, mt);
        pivs = ma_pivots(new_left, mt);
        /*
         * NOTE(review): split entries are copied while the meta end is set to
         * split; the in-place path keeps slots 0..split - confirm.
         */
        memcpy(slots, l_slots, sizeof(void *) * split);
        memcpy(pivs, l_pivs, sizeof(unsigned long) * split);
        ma_set_meta(new_left, mt, 0, split);
        l_mas.node = mt_mk_node(new_left, mt);

        /* replace parent. */
        offset = mte_parent_slot(mas->node);
        mt = mas_parent_type(&l_mas, l_mas.node);
        parent = mas_pop_node(mas);
        slots = ma_slots(parent, mt);
        pivs = ma_pivots(parent, mt);
        /* Start from a copy of the old parent, then patch in both children. */
        memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node));
        rcu_assign_pointer(slots[offset], mas->node);
        rcu_assign_pointer(slots[offset - 1], l_mas.node);
        pivs[offset - 1] = l_mas.max;
        eparent = mt_mk_node(parent, mt);
done:
        /* Record the gaps of both children in the (possibly new) parent. */
        gap = mas_leaf_max_gap(mas);
        mte_set_gap(eparent, mte_parent_slot(mas->node), gap);
        gap = mas_leaf_max_gap(&l_mas);
        mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap);
        mas_ascend(mas);

        if (in_rcu) {
                mas_replace_node(mas, old_eparent);
                mas_adopt_children(mas, mas->node);
        }

        mas_update_gap(mas);
}

/*
 * mas_split_final_node() - Split the final node in a subtree operation.
 * @mast: the maple subtree state
 * @mas: The maple state
 * @height: The height of the tree in case it's a new root.
 */
static inline void mas_split_final_node(struct maple_subtree_state *mast,
                                        struct ma_state *mas, int height)
{
        struct maple_enode *ancestor;

        if (mte_is_root(mas->node)) {
                if (mt_is_alloc(mas->tree))
                        mast->bn->type = maple_arange_64;
                else
                        mast->bn->type = maple_range_64;
                mas->depth = height;
        }
        /*
         * Only a single node is used here, could be root.
         * The Big_node data should just fit in a single node.
         */
        ancestor = mas_new_ma_node(mas, mast->bn);
        mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset);
        mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset);
        mte_to_node(ancestor)->parent = mas_mn(mas)->parent;

        mast->l->node = ancestor;
        mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true);
        mas->offset = mast->bn->b_end - 1;
}

/*
 * mast_fill_bnode() - Copy data into the big node in the subtree state
 * @mast: The maple subtree state
 * @mas: the maple state
 * @skip: The number of entries to skip for new nodes insertion.
 *
 * The big node is cleared and then filled with: the parent's entries before
 * @mast->l->offset, the nodes of @mast->l and @mast->r, and the parent's
 * entries from @skip past the insertion point onwards.  Unless @mas->node is
 * the root, @mas is ascended to the parent first.  No parent data is copied
 * when @mas->node is the root, and none is appended when the new nodes
 * already reach @mas->max.
 */
static inline void mast_fill_bnode(struct maple_subtree_state *mast,
                                         struct ma_state *mas,
                                         unsigned char skip)
{
        bool cp = true;
        unsigned char split;

        /*
         * Size each memset from the array itself so the length is right
         * whatever the element type is (slot holds pointers, not longs).
         */
        memset(mast->bn->gap, 0, sizeof(mast->bn->gap));
        memset(mast->bn->slot, 0, sizeof(mast->bn->slot));
        memset(mast->bn->pivot, 0, sizeof(mast->bn->pivot));
        mast->bn->b_end = 0;

        if (mte_is_root(mas->node)) {
                cp = false;
        } else {
                mas_ascend(mas);
                mas->offset = mte_parent_slot(mas->node);
        }

        /* Parent entries that precede the new left node. */
        if (cp && mast->l->offset)
                mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0);

        /* split: index in the big node where the new nodes begin. */
        split = mast->bn->b_end;
        mab_set_b_end(mast->bn, mast->l, mast->l->node);
        mast->r->offset = mast->bn->b_end;
        mab_set_b_end(mast->bn, mast->r, mast->r->node);
        /* Nothing follows if the new nodes already reach the parent's max. */
        if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max)
                cp = false;

        if (cp)
                mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1,
                           mast->bn, mast->bn->b_end);

        mast->bn->b_end--;
        mast->bn->type = mte_node_type(mas->node);
}

/*
 * mast_split_data() - Split the data in the subtree state big node into regular
 * nodes.
 * @mast: The maple subtree state
 * @mas: The maple state
 * @split: The location to split the big node
 */
static inline void mast_split_data(struct maple_subtree_state *mast,
           struct ma_state *mas, unsigned char split)
{
        unsigned char p_slot;

        mab_mas_cp(mast->bn, 0, split, mast->l, true);
        mte_set_pivot(mast->r->node, 0, mast->r->max);
        mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false);
        mast->l->offset = mte_parent_slot(mas->node);
        mast->l->max = mast->bn->pivot[split];
        mast->r->min = mast->l->max + 1;
        if (mte_is_leaf(mas->node))
                return;

        p_slot = mast->orig_l->offset;
        mas_set_split_parent(mast->orig_l, mast->l->node, mast->r->node,
                             &p_slot, split);
        mas_set_split_parent(mast->orig_r, mast->l->node, mast->r->node,
                             &p_slot, split);
}

/*
 * mas_push_data() - Instead of splitting a node, it is beneficial to push the
 * data to the right or left node if there is room.
 * @mas: The maple state
 * @height: The current height of the maple state
 * @mast: The maple subtree state
 * @left: Push left or not.
 *
 * Keeping the height of the tree low means faster lookups.
 *
 * The sibling's data is combined with @mast->bn and the result is divided
 * again between @mast->l and @mast->r.  Nothing reachable through @mas or
 * @mast is modified when false is returned.  When pushing left succeeds, @mas
 * is switched to the previous sibling.
 *
 * Return: True if pushed, false otherwise.
 */
static inline bool mas_push_data(struct ma_state *mas, int height,
                                 struct maple_subtree_state *mast, bool left)
{
        unsigned char slot_total = mast->bn->b_end;
        unsigned char end, space, split;

        /* Walk to the sibling with a copy of the state, not @mas itself. */
        MA_STATE(tmp_mas, mas->tree, mas->index, mas->last);
        tmp_mas = *mas;
        tmp_mas.depth = mast->l->depth;

        /* Without a sibling on the requested side there is nowhere to push. */
        if (left && !mas_prev_sibling(&tmp_mas))
                return false;
        else if (!left && !mas_next_sibling(&tmp_mas))
                return false;

        /* Total is the big node end plus the sibling's data end. */
        end = mas_data_end(&tmp_mas);
        slot_total += end;
        /* -2 instead of -1 to ensure there isn't a triple split */
        space = 2 * mt_slot_count(mas->node) - 2;
        /* One less is allowed for leaves ... */
        if (ma_is_leaf(mast->bn->type))
                space--;

        /* ... and one less again for a node that ends at ULONG_MAX. */
        if (mas->max == ULONG_MAX)
                space--;

        /* The combined data would not fit in two nodes with room to spare. */
        if (slot_total >= space)
                return false;

        /* Get the data; Fill mast->bn */
        mast->bn->b_end++;
        if (left) {
                /* Make room at the front of the big node for the sibling. */
                mab_shift_right(mast->bn, end + 1);
                mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0);
                mast->bn->b_end = slot_total + 1;
        } else {
                /* The sibling's data follows the existing big node data. */
                mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end);
        }

        /* Configure mast for splitting of mast->bn */
        split = mt_slots[mast->bn->type] - 2;
        if (left) {
                /*  Switch mas to prev node  */
                *mas = tmp_mas;
                /* Start using mast->l for the left side. */
                tmp_mas.node = mast->l->node;
                *mast->l = tmp_mas;
        } else {
                /* The sibling state, with the new right node, becomes mast->r. */
                tmp_mas.node = mast->r->node;
                *mast->r = tmp_mas;
                split = slot_total - split;
        }
        /* Adjust the split so that it is acceptable to mab_no_null_split(). */
        split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]);
        /* Update parent slot for split calculation. */
        if (left)
                mast->orig_l->offset += end + 1;

        mast_split_data(mast, mas, split);
        mast_fill_bnode(mast, mas, 2);
        mas_split_final_node(mast, mas, height + 1);
        return true;
}

/*
 * mas_split() - Split data that is too big for one node into two.
 * @mas: The maple state
 * @b_node: The maple big node
 *
 * Nodes are requested up front (1 + 2 per level of the tree height), so the
 * only failure reported here is that allocation.
 *
 * Return: 1 on success, 0 on failure.
 */
static int mas_split(struct ma_state *mas, struct maple_big_node *b_node)
{
        struct maple_subtree_state mast;
        int height = 0;
        unsigned char mid_split, split = 0;
        struct maple_enode *old;

        /*
         * Splitting is handled differently from any other B-tree; the Maple
         * Tree splits upwards.  Splitting up means that the split operation
         * occurs when the walk of the tree hits the leaves and not on the way
         * down.  The reason for splitting up is that it is impossible to know
         * how much space will be needed until the leaf is (or leaves are)
         * reached.  Since overwriting data is allowed and a range could
         * overwrite more than one range or result in changing one entry into 3
         * entries, it is impossible to know if a split is required until the
         * data is examined.
         *
         * Splitting is a balancing act between keeping allocations to a minimum
         * and avoiding a 'jitter' event where a tree is expanded to make room
         * for an entry followed by a contraction when the entry is removed.  To
         * accomplish the balance, there are empty slots remaining in both left
         * and right nodes after a split.
         */
        MA_STATE(l_mas, mas->tree, mas->index, mas->last);
        MA_STATE(r_mas, mas->tree, mas->index, mas->last);
        MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last);
        MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last);

        trace_ma_op(__func__, mas);
        mas->depth = mas_mt_height(mas);
        /* Allocation failures will happen early. */
        mas_node_count(mas, 1 + mas->depth * 2);
        if (mas_is_err(mas))
                return 0;

        /* l/r are the nodes being built, orig_l/orig_r those of the last pass. */
        mast.l = &l_mas;
        mast.r = &r_mas;
        mast.orig_l = &prev_l_mas;
        mast.orig_r = &prev_r_mas;
        mast.bn = b_node;

        /* One pass per level; height is already incremented inside the body. */
        while (height++ <= mas->depth) {
                /* The big node now fits in a single node: finish here. */
                if (mt_slots[b_node->type] > b_node->b_end) {
                        mas_split_final_node(&mast, mas, height);
                        break;
                }

                l_mas = r_mas = *mas;
                l_mas.node = mas_new_ma_node(mas, b_node);
                r_mas.node = mas_new_ma_node(mas, b_node);
                /*
                 * Another way that 'jitter' is avoided is to terminate a split
                 * up early if the left or right node has space to spare.  This
                 * is referred to as "pushing left" or "pushing right" and is
                 * similar to the B* tree, except the nodes left or right can
                 * rarely be reused due to RCU, but the ripple upwards is halted
                 * which is a significant savings.
                 */
                /* Try to push left. */
                if (mas_push_data(mas, height, &mast, true))
                        break;
                /* Try to push right. */
                if (mas_push_data(mas, height, &mast, false))
                        break;

                /* No room in either sibling: split this level. */
                split = mab_calc_split(mas, b_node, &mid_split, prev_l_mas.min);
                mast_split_data(&mast, mas, split);
                /*
                 * Usually correct, mab_mas_cp in the above call overwrites
                 * r->max.
                 */
                mast.r->max = mas->max;
                mast_fill_bnode(&mast, mas, 1);
                /* Remember this level's nodes for the next pass. */
                prev_l_mas = *mast.l;
                prev_r_mas = *mast.r;
        }

        /* Set the original node as dead */
        old = mas->node;
        mas->node = l_mas.node;
        mas_wmb_replace(mas, old);
        /* Walk @mas again now that the old node has been replaced. */
        mtree_range_walk(mas);
        return 1;
}

/*
 * mas_reuse_node() - Reuse the node to store the data.
 * @wr_mas: The maple write state
 * @bn: The maple big node
 * @end: The end of the data.
 *
 * Will always return false in RCU mode.
 *
 * When the existing data end (@end) is beyond the big node end, the slots and
 * pivots from the big node end onwards are zeroed before the big node is
 * copied back, so no stale entries are left behind the new data.
 *
 * Return: True if node was reused, false otherwise.
 */
static inline bool mas_reuse_node(struct ma_wr_state *wr_mas,
                          struct maple_big_node *bn, unsigned char end)
{
        /* Need to be rcu safe. */
        if (mt_in_rcu(wr_mas->mas->tree))
                return false;

        if (end > bn->b_end) {
                int clear = mt_slots[wr_mas->type] - bn->b_end;

                /*
                 * One fewer pivot than slots is cleared.  Each array is sized
                 * by its own element type: slots hold pointers, pivots hold
                 * unsigned long.
                 */
                memset(wr_mas->slots + bn->b_end, 0, sizeof(void *) * clear);
                memset(wr_mas->pivots + bn->b_end, 0,
                       sizeof(unsigned long) * (clear - 1));
        }
        mab_mas_cp(bn, 0, bn->b_end, wr_mas->mas, false);
        return true;
}

/*
 * mas_commit_b_node() - Commit the big node into the tree.
 * @wr_mas: The maple write state
 * @b_node: The maple big node
 * @end: The end of the data.
 */
static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas,
                            struct maple_big_node *b_node, unsigned char end)
{
        struct maple_node *node;
        struct maple_enode *old_enode;
        unsigned char b_end = b_node->b_end;
        enum maple_type b_type = b_node->type;

        old_enode = wr_mas->mas->node;
        if ((b_end < mt_min_slots[b_type]) &&
            (!mte_is_root(old_enode)) &&
            (mas_mt_height(wr_mas->mas) > 1))
                return mas_rebalance(wr_mas->mas, b_node);

        if (b_end >= mt_slots[b_type])
                return mas_split(wr_mas->mas, b_node);

        if (mas_reuse_node(wr_mas, b_node, end))
                goto reuse_node;

        mas_node_count(wr_mas->mas, 1);
        if (mas_is_err(wr_mas->mas))
                return 0;

        node = mas_pop_node(wr_mas->mas);
        node->parent = mas_mn(wr_mas->mas)->parent;
        wr_mas->mas->node = mt_mk_node(node, b_type);
        mab_mas_cp(b_node, 0, b_end, wr_mas->mas, false);
        mas_replace_node(wr_mas->mas, old_enode);
reuse_node:
        mas_update_gap(wr_mas->mas);
        wr_mas->mas->end = b_end;
        return 1;
}

/*
 * mas_root_expand() - Expand a root to a node
 * @mas: The maple state
 * @entry: The entry to store into the tree
 */
static inline int mas_root_expand(struct ma_state *mas, void *entry)
{
        void *contents = mas_root_locked(mas);
        enum maple_type type = maple_leaf_64;
        struct maple_node *node;
        void __rcu **slots;
        unsigned long *pivots;
        int slot = 0;

        mas_node_count(mas, 1);
        if (unlikely(mas_is_err(mas)))
                return 0;

        node = mas_pop_node(mas);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        node->parent = ma_parent_ptr(mas_tree_parent(mas));
        mas->node = mt_mk_node(node, type);
        mas->status = ma_active;

        if (mas->index) {
                if (contents) {
                        rcu_assign_pointer(slots[slot], contents);
                        if (likely(mas->index > 1))
                                slot++;
                }
                pivots[slot++] = mas->index - 1;
        }

        rcu_assign_pointer(slots[slot], entry);
        mas->offset = slot;
        pivots[slot] = mas->last;
        if (mas->last != ULONG_MAX)
                pivots[++slot] = ULONG_MAX;

        mas->depth = 1;
        mas_set_height(mas);
        ma_set_meta(node, maple_leaf_64, 0, slot);
        /* swap the new root into the tree */
        rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
        return slot;
}

/*
 * mas_store_root() - Store an entry while the tree has no root node.
 * @mas: The maple state
 * @entry: The entry to store
 *
 * The root is expanded into a node unless the store is exactly 0 - 0 and the
 * low two bits of @entry are not 0b10; in that case @entry is placed directly
 * in the root pointer and the state is set back to ma_start.
 * NOTE(review): the 0b10 pattern presumably collides with the encoding used
 * for node pointers - confirm.
 */
static inline void mas_store_root(struct ma_state *mas, void *entry)
{
        if (likely(mas->last != 0 || mas->index != 0) ||
            ((unsigned long)entry & 3) == 2) {
                mas_root_expand(mas, entry);
                return;
        }

        rcu_assign_pointer(mas->tree->ma_root, entry);
        mas->status = ma_start;
}

/*
 * mas_is_span_wr() - Check if the write needs to be treated as a write that
 * spans the node.
 * @mas: The maple state
 * @piv: The pivot value being written
 * @type: The maple node type
 * @entry: The data to write
 *
 * Spanning writes are writes that start in one node and end in another OR if
 * the write of a %NULL will cause the node to end with a %NULL.
 *
 * Return: True if this is a spanning write, false otherwise.
 */
static bool mas_is_span_wr(struct ma_wr_state *wr_mas)
{
        unsigned long max = wr_mas->r_max;
        unsigned long last = wr_mas->mas->last;
        enum maple_type type = wr_mas->type;
        void *entry = wr_mas->entry;

        /* Contained in this pivot, fast path */
        if (last < max)
                return false;

        if (ma_is_leaf(type)) {
                max = wr_mas->mas->max;
                if (last < max)
                        return false;
        }

        if (last == max) {
                /*
                 * The last entry of leaf node cannot be NULL unless it is the
                 * rightmost node (writing ULONG_MAX), otherwise it spans slots.
                 */
                if (entry || last == ULONG_MAX)
                        return false;
        }

        trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry);
        return true;
}

static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas)
{
        wr_mas->type = mte_node_type(wr_mas->mas->node);
        mas_wr_node_walk(wr_mas);
        wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type);
}

static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas)
{
        wr_mas->mas->max = wr_mas->r_max;
        wr_mas->mas->min = wr_mas->r_min;
        wr_mas->mas->node = wr_mas->content;
        wr_mas->mas->offset = 0;
        wr_mas->mas->depth++;
}
/*
 * mas_wr_walk() - Walk the tree for a write.
 * @wr_mas: The maple write state
 *
 * Uses mas_slot_locked() and does not need to worry about dead nodes.
 *
 * Descends one level at a time until a leaf is reached, stopping early as soon
 * as the write is found to span the current node.
 *
 * Return: True if it's contained in a node, false on spanning write.
 */
static bool mas_wr_walk(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        for (;;) {
                mas_wr_walk_descend(wr_mas);
                if (unlikely(mas_is_span_wr(wr_mas)))
                        return false;

                wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
                                                  mas->offset);
                if (ma_is_leaf(wr_mas->type))
                        break;

                mas_wr_walk_traverse(wr_mas);
        }

        return true;
}

/*
 * mas_wr_walk_index() - Walk the tree to a leaf for a write without checking
 * for spanning writes.
 * @wr_mas: The maple write state
 *
 * Return: Always true, once a leaf has been reached.
 */
static bool mas_wr_walk_index(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        for (;;) {
                mas_wr_walk_descend(wr_mas);
                wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
                                                  mas->offset);
                if (ma_is_leaf(wr_mas->type))
                        break;

                mas_wr_walk_traverse(wr_mas);
        }

        return true;
}
/*
 * mas_extend_spanning_null() - Extend a store of a %NULL to include
 * surrounding %NULLs.
 * @l_wr_mas: The left maple write state
 * @r_wr_mas: The right maple write state
 *
 * On the left, the index is pulled back to the start of the current range if
 * that range is already %NULL, and further to the start of the previous range
 * if that one is %NULL as well.  On the right, the last value is pushed out to
 * the end of the current range if it is %NULL, or to the end of the following
 * range when the store ends exactly on a range boundary and the following
 * range is %NULL.  The offsets are adjusted to match.
 */
static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas,
                                            struct ma_wr_state *r_wr_mas)
{
        struct ma_state *l_mas = l_wr_mas->mas;
        struct ma_state *r_mas = r_wr_mas->mas;
        unsigned char l_off = l_mas->offset;

        if (!l_wr_mas->content)
                l_mas->index = l_wr_mas->r_min;

        if (l_off && l_mas->index == l_wr_mas->r_min &&
            !mas_slot_locked(l_mas, l_wr_mas->slots, l_off - 1)) {
                /* The previous range starts after the pivot two slots back. */
                l_mas->index = (l_off > 1) ?
                                l_wr_mas->pivots[l_off - 2] + 1 : l_mas->min;
                l_mas->offset = l_off - 1;
        }

        if (!r_wr_mas->content) {
                if (r_mas->last < r_wr_mas->r_max)
                        r_mas->last = r_wr_mas->r_max;
                r_mas->offset++;
                return;
        }

        /* Only look at the next range when ending exactly on this one. */
        if (r_mas->last != r_wr_mas->r_max)
                return;

        if (r_mas->last >= r_mas->max)
                return;

        if (mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1))
                return;

        r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots,
                                     r_wr_mas->type, r_mas->offset + 1);
        r_mas->offset++;
}

/*
 * mas_state_walk() - Start the maple state and walk to the entry.
 * @mas: The maple state
 *
 * Return: %NULL if the state is none after mas_start(), the value returned by
 * mas_start() if the state is a pointer state, otherwise the result of
 * mtree_range_walk().
 */
static inline void *mas_state_walk(struct ma_state *mas)
{
        void *root_entry = mas_start(mas);

        if (mas_is_none(mas))
                return NULL;

        return mas_is_ptr(mas) ? root_entry : mtree_range_walk(mas);
}

/*
 * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up
 * to date.
 *
 * @mas: The maple state.
 *
 * Note: Leaves mas in undesirable state.
 * Return: The entry for @mas->index or %NULL on dead node.
 */
static inline void *mtree_lookup_walk(struct ma_state *mas)
{
        unsigned long *pivots;
        unsigned char offset;
        struct maple_node *node;
        struct maple_enode *next;
        enum maple_type type;
        void __rcu **slots;
        unsigned char end;

        next = mas->node;
        do {
                node = mte_to_node(next);
                type = mte_node_type(next);
                pivots = ma_pivots(node, type);
                end = mt_pivots[type];
                offset = 0;
                do {
                        if (pivots[offset] >= mas->index)
                                break;
                } while (++offset < end);

                slots = ma_slots(node, type);
                next = mt_slot(mas->tree, slots, offset);
                if (unlikely(ma_dead_node(node)))
                        goto dead_node;
        } while (!ma_is_leaf(type));

        return (void *)next;

dead_node:
        mas_reset(mas);
        return NULL;
}

static void mte_destroy_walk(struct maple_enode *, struct maple_tree *);
/*
 * mas_new_root() - Create a new root node that only contains the entry passed
 * in.
 * @mas: The maple state
 * @entry: The entry to store.
 *
 * Only valid when the index == 0 and the last == ULONG_MAX
 *
 * A %NULL @entry over the whole range empties the tree: the height is set to
 * 0 and the root pointer is cleared without allocating.  Otherwise a single
 * maple_leaf_64 node holding @entry becomes the root.  In both cases the
 * previous root is destroyed if it was a node.
 *
 * Return 0 on error, 1 on success.
 */
static inline int mas_new_root(struct ma_state *mas, void *entry)
{
        struct maple_enode *old_root = mas_root_locked(mas);

        if (!entry && !mas->index && mas->last == ULONG_MAX) {
                mas->depth = 0;
                mas_set_height(mas);
                rcu_assign_pointer(mas->tree->ma_root, entry);
                mas->status = ma_start;
        } else {
                enum maple_type type = maple_leaf_64;
                struct maple_node *node;
                unsigned long *pivots;
                void __rcu **slots;

                mas_node_count(mas, 1);
                if (mas_is_err(mas))
                        return 0;

                node = mas_pop_node(mas);
                pivots = ma_pivots(node, type);
                slots = ma_slots(node, type);
                node->parent = ma_parent_ptr(mas_tree_parent(mas));
                mas->node = mt_mk_node(node, type);
                mas->status = ma_active;
                rcu_assign_pointer(slots[0], entry);
                pivots[0] = mas->last;
                mas->depth = 1;
                mas_set_height(mas);
                rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
        }

        /* The old tree is torn down only after the new root is in place. */
        if (xa_is_node(old_root))
                mte_destroy_walk(old_root, mas->tree);

        return 1;
}
/*
 * mas_wr_spanning_store() - Create a subtree with the store operation completed
 * and new nodes where necessary, then place the sub-tree in the actual tree.
 * Note that mas is expected to point to the node which caused the store to
 * span.
 * @wr_mas: The maple write state
 *
 * A store that covers 0 - ULONG_MAX (directly, or after a %NULL store has been
 * extended over neighbouring %NULLs) is handed to mas_new_root() instead.
 *
 * Return: 0 on error, positive on success.
 */
static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas)
{
        struct maple_subtree_state mast;
        struct maple_big_node b_node;
        struct ma_state *mas;
        unsigned char height;

        /* Left and Right side of spanning store */
        MA_STATE(l_mas, NULL, 0, 0);
        MA_STATE(r_mas, NULL, 0, 0);
        MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry);
        MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry);

        /*
         * A store operation that spans multiple nodes is called a spanning
         * store and is handled early in the store call stack by the function
         * mas_is_span_wr().  When a spanning store is identified, the maple
         * state is duplicated.  The first maple state walks the left tree path
         * to ``index``, the duplicate walks the right tree path to ``last``.
         * The data in the two nodes are combined into a single node, two nodes,
         * or possibly three nodes (see the 3-way split above).  A ``NULL``
         * written to the last entry of a node is considered a spanning store as
         * a rebalance is required for the operation to complete and an overflow
         * of data may happen.
         */
        mas = wr_mas->mas;
        trace_ma_op(__func__, mas);

        /* The store replaces the whole tree. */
        if (unlikely(!mas->index && mas->last == ULONG_MAX))
                return mas_new_root(mas, wr_mas->entry);
        /*
         * Node rebalancing may occur due to this store, so there may be three new
         * entries per level plus a new root.
         */
        height = mas_mt_height(mas);
        mas_node_count(mas, 1 + height * 3);
        if (mas_is_err(mas))
                return 0;

        /*
         * Set up right side.  Need to get to the next offset after the spanning
         * store to ensure it's not NULL and to combine both the next node and
         * the node with the start together.
         */
        r_mas = *mas;
        /* Avoid overflow, walk to next slot in the tree. */
        if (r_mas.last + 1)
                r_mas.last++;

        r_mas.index = r_mas.last;
        mas_wr_walk_index(&r_wr_mas);
        /* Restore the real end of the store now that the walk is done. */
        r_mas.last = r_mas.index = mas->last;

        /* Set up left side. */
        l_mas = *mas;
        mas_wr_walk_index(&l_wr_mas);

        if (!wr_mas->entry) {
                /* Storing NULL: absorb adjacent NULL ranges on both sides. */
                mas_extend_spanning_null(&l_wr_mas, &r_wr_mas);
                mas->offset = l_mas.offset;
                mas->index = l_mas.index;
                mas->last = l_mas.last = r_mas.last;
        }

        /* expanding NULLs may make this cover the entire range */
        if (!l_mas.index && r_mas.last == ULONG_MAX) {
                mas_set_range(mas, 0, ULONG_MAX);
                return mas_new_root(mas, wr_mas->entry);
        }

        memset(&b_node, 0, sizeof(struct maple_big_node));
        /* Copy l_mas and store the value in b_node. */
        mas_store_b_node(&l_wr_mas, &b_node, l_mas.end);
        /* Copy r_mas into b_node. */
        if (r_mas.offset <= r_mas.end)
                mas_mab_cp(&r_mas, r_mas.offset, r_mas.end,
                           &b_node, b_node.b_end + 1);
        else
                /* Nothing left to copy from the right node. */
                b_node.b_end++;

        /* Stop spanning searches by searching for just index. */
        l_mas.index = l_mas.last = mas->index;

        mast.bn = &b_node;
        mast.orig_l = &l_mas;
        mast.orig_r = &r_mas;
        /* Combine l_mas and r_mas and split them up evenly again. */
        return mas_spanning_rebalance(mas, &mast, height + 1);
}

/*
 * mas_wr_node_store() - Attempt to store the value in a node
 * @wr_mas: The maple write state
 * @new_end: The end of the node after the modification
 *
 * Attempts to reuse the node, but may allocate.
 *
 * The new node contents are assembled in a separate node: a newly allocated
 * one in RCU mode, which then replaces the old node, or an on-stack scratch
 * node otherwise, which is copied over the existing node at the end.
 *
 * Return: True if stored, false otherwise
 */
static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas,
                                     unsigned char new_end)
{
        struct ma_state *mas = wr_mas->mas;
        void __rcu **dst_slots;
        unsigned long *dst_pivots;
        unsigned char dst_offset, offset_end = wr_mas->offset_end;
        struct maple_node reuse, *newnode;
        unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type];
        bool in_rcu = mt_in_rcu(mas->tree);

        /* Check if there is enough data. The room is enough. */
        if (!mte_is_root(mas->node) && (new_end <= mt_min_slots[wr_mas->type]) &&
            !(mas->mas_flags & MA_STATE_BULK))
                return false;

        if (mas->last == wr_mas->end_piv)
                offset_end++; /* don't copy this offset */
        else if (unlikely(wr_mas->r_max == ULONG_MAX))
                mas_bulk_rebalance(mas, mas->end, wr_mas->type);

        /* set up node. */
        if (in_rcu) {
                mas_node_count(mas, 1);
                /* The caller can detect this failure through mas_is_err(). */
                if (mas_is_err(mas))
                        return false;

                newnode = mas_pop_node(mas);
        } else {
                memset(&reuse, 0, sizeof(struct maple_node));
                newnode = &reuse;
        }

        newnode->parent = mas_mn(mas)->parent;
        dst_pivots = ma_pivots(newnode, wr_mas->type);
        dst_slots = ma_slots(newnode, wr_mas->type);
        /* Copy from start to insert point */
        memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset);
        memcpy(dst_slots, wr_mas->slots, sizeof(void *) * mas->offset);

        /* Handle insert of new range starting after old range */
        if (wr_mas->r_min < mas->index) {
                rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content);
                dst_pivots[mas->offset++] = mas->index - 1;
        }

        /* Store the new entry and range end. */
        if (mas->offset < node_pivots)
                dst_pivots[mas->offset] = mas->last;
        rcu_assign_pointer(dst_slots[mas->offset], wr_mas->entry);

        /*
         * this range wrote to the end of the node or it overwrote the rest of
         * the data
         */
        if (offset_end > mas->end)
                goto done;

        dst_offset = mas->offset + 1;
        /* Copy to the end of node if necessary. */
        copy_size = mas->end - offset_end + 1;
        memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end,
               sizeof(void *) * copy_size);
        /* One fewer pivot than slots is copied. */
        memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end,
               sizeof(unsigned long) * (copy_size - 1));

        /* The last used pivot, when one exists, is the node maximum. */
        if (new_end < node_pivots)
                dst_pivots[new_end] = mas->max;

done:
        /*
         * NOTE(review): the type is hard-coded to maple_leaf_64 here while the
         * rest of the function uses wr_mas->type - confirm this is intended.
         */
        mas_leaf_set_meta(newnode, maple_leaf_64, new_end);
        if (in_rcu) {
                struct maple_enode *old_enode = mas->node;

                mas->node = mt_mk_node(newnode, wr_mas->type);
                mas_replace_node(mas, old_enode);
        } else {
                /* Overwrite the existing node with the assembled contents. */
                memcpy(wr_mas->node, newnode, sizeof(struct maple_node));
        }
        trace_ma_write(__func__, mas, 0, wr_mas->entry);
        mas_update_gap(mas);
        mas->end = new_end;
        return true;
}

/*
 * mas_wr_slot_store: Attempt to store a value in a slot.
 * @wr_mas: the maple write state
 *
 * Handles stores that can be completed by rewriting a slot and its neighbouring
 * pivot(s) in place.  A store that touches more than two existing ranges is
 * only done in place when the tree is not in RCU mode.
 *
 * Return: True if stored, false otherwise
 */
static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        void __rcu **slots = wr_mas->slots;
        unsigned char off = mas->offset;
        bool two_ranges = (wr_mas->offset_end - off == 1);
        bool has_gap = false;

        has_gap |= !mt_slot_locked(mas->tree, slots, off);
        has_gap |= !mt_slot_locked(mas->tree, slots, off + 1);

        /* Rewriting two pivots in place is only done outside of RCU mode. */
        if (!two_ranges && mt_in_rcu(mas->tree))
                return false;

        if (!two_ranges) {
                /*
                 * Expand the range, only partially overwriting the previous and
                 * next ranges
                 */
                has_gap |= !mt_slot_locked(mas->tree, slots, off + 2);
                rcu_assign_pointer(slots[off + 1], wr_mas->entry);
                wr_mas->pivots[off] = mas->index - 1;
                wr_mas->pivots[off + 1] = mas->last;
                mas->offset++; /* Keep mas accurate. */
        } else if (mas->index == wr_mas->r_min) {
                /* Overwriting the range and a part of the next one */
                rcu_assign_pointer(slots[off], wr_mas->entry);
                wr_mas->pivots[off] = mas->last;
        } else {
                /* Overwriting a part of the range and the next one */
                rcu_assign_pointer(slots[off + 1], wr_mas->entry);
                wr_mas->pivots[off] = mas->index - 1;
                mas->offset++; /* Keep mas accurate. */
        }

        trace_ma_write(__func__, mas, 0, wr_mas->entry);
        /*
         * Only update gap when the new entry is empty or there is an empty
         * entry in the original two ranges.
         */
        if (has_gap || !wr_mas->entry)
                mas_update_gap(mas);

        return true;
}

/*
 * mas_wr_extend_null() - Extend a %NULL store over neighbouring %NULL ranges
 * within the node.
 * @wr_mas: The maple write state
 *
 * The end of the store is moved to the end of the last range it touches if
 * that range is %NULL, or to the end of the following range if the store ends
 * on a boundary and that range is %NULL.  The start of the store is handled
 * the same way in the other direction.  offset_end, end_piv, r_min, r_max and
 * the maple state offset are kept in step.
 */
static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        if (!wr_mas->slots[wr_mas->offset_end]) {
                /* If this one is null, the next and prev are not */
                mas->last = wr_mas->end_piv;
        } else if ((mas->last == wr_mas->end_piv) &&
                   (mas->end != wr_mas->offset_end) &&
                   !wr_mas->slots[wr_mas->offset_end + 1]) {
                /* Overwriting the end: take in the following NULL range. */
                wr_mas->offset_end++;
                mas->last = (wr_mas->offset_end == mas->end) ?
                                mas->max : wr_mas->pivots[wr_mas->offset_end];
                wr_mas->end_piv = mas->last;
        }

        if (!wr_mas->content) {
                /* If this one is null, the next and prev are not */
                mas->index = wr_mas->r_min;
                return;
        }

        /* Check prev slot if we are overwriting the start */
        if (mas->index != wr_mas->r_min || !mas->offset ||
            wr_mas->slots[mas->offset - 1])
                return;

        mas->offset--;
        mas->index = mas_safe_min(mas, wr_mas->pivots, mas->offset);
        wr_mas->r_min = mas->index;
        wr_mas->r_max = wr_mas->pivots[mas->offset];
}

static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas)
{
        while ((wr_mas->offset_end < wr_mas->mas->end) &&
               (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end]))
                wr_mas->offset_end++;

        if (wr_mas->offset_end < wr_mas->mas->end)
                wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end];
        else
                wr_mas->end_piv = wr_mas->mas->max;

        if (!wr_mas->entry)
                mas_wr_extend_null(wr_mas);
}

/*
 * mas_wr_new_end() - Calculate the end of the node data after the store.
 * @wr_mas: The maple write state
 *
 * Starts from the current end plus two, subtracts the number of offsets
 * between the start and the end of the store, then one more for each side of
 * the store that lines up with an existing range boundary.
 *
 * Return: The new end offset of the node.
 */
static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char new_end;

        new_end = mas->end + 2 - (wr_mas->offset_end - mas->offset);
        new_end -= (wr_mas->r_min == mas->index);
        new_end -= (wr_mas->end_piv == mas->last);

        return new_end;
}

/*
 * mas_wr_append: Attempt to append
 * @wr_mas: the maple write state
 * @new_end: The end of the node after the modification
 *
 * This is currently unsafe in rcu mode since the end of the node may be cached
 * by readers while the node contents may be updated which could result in
 * inaccurate information.
 *
 * Only applies when the store lands in the last used offset of the node.
 *
 * Return: True if appended, false otherwise
 */
static inline bool mas_wr_append(struct ma_wr_state *wr_mas,
                unsigned char new_end)
{
        struct ma_state *mas = wr_mas->mas;
        void __rcu **slots = wr_mas->slots;
        unsigned char end = mas->end;

        if (mt_in_rcu(mas->tree) || mas->offset != end)
                return false;

        if (new_end < mt_pivots[wr_mas->type]) {
                /* The old final pivot moves out to the new end. */
                wr_mas->pivots[new_end] = wr_mas->pivots[end];
                ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end);
        }

        if (new_end != end + 1) {
                /* Append to the range without touching any boundaries. */
                rcu_assign_pointer(slots[new_end], wr_mas->content);
                wr_mas->pivots[end + 1] = mas->last;
                rcu_assign_pointer(slots[end + 1], wr_mas->entry);
                wr_mas->pivots[end] = mas->index - 1;
                mas->offset = end + 1;
        } else if (mas->last == wr_mas->r_max) {
                /* Append to end of range */
                rcu_assign_pointer(slots[new_end], wr_mas->entry);
                wr_mas->pivots[end] = mas->index - 1;
                mas->offset = new_end;
        } else {
                /* Append to start of range */
                rcu_assign_pointer(slots[new_end], wr_mas->content);
                wr_mas->pivots[end] = mas->last;
                rcu_assign_pointer(slots[end], wr_mas->entry);
        }

        if (!wr_mas->content || !wr_mas->entry)
                mas_update_gap(mas);

        mas->end = new_end;
        trace_ma_write(__func__, mas, new_end, wr_mas->entry);
        return true;
}

/*
 * mas_wr_bnode() - Slow path for a modification.
 * @wr_mas: The write maple state
 *
 * This is where split, rebalance end up.
 */
static void mas_wr_bnode(struct ma_wr_state *wr_mas)
{
        struct maple_big_node b_node;

        trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry);
        memset(&b_node, 0, sizeof(struct maple_big_node));
        mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end);
        mas_commit_b_node(wr_mas, &b_node, wr_mas->mas->end);
}

/*
 * mas_wr_modify() - Apply a write to the node the write state points at.
 * @wr_mas: The write maple state
 *
 * The options are tried in order: replacing the slot when the write covers
 * exactly the existing range, appending, a slot store, and a node store.
 * Whatever is left, as well as any write whose new end does not fit in the
 * node, is handed to mas_wr_bnode().
 */
static inline void mas_wr_modify(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char new_end;

        /* Same range as the existing one: only the slot contents change. */
        if (mas->index == wr_mas->r_min && mas->last == wr_mas->r_max) {
                rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry);
                /* The gap is only updated on a NULL <-> non-NULL transition. */
                if (!wr_mas->entry != !wr_mas->content)
                        mas_update_gap(mas);
                return;
        }

        new_end = mas_wr_new_end(wr_mas);

        /*
         * The fast paths are only usable while new_end stays within the size
         * of the maple node.
         */
        if (new_end < mt_slots[wr_mas->type]) {
                if (mas_wr_append(wr_mas, new_end))
                        return;

                if (new_end == mas->end && mas_wr_slot_store(wr_mas))
                        return;

                if (mas_wr_node_store(wr_mas, new_end))
                        return;

                /* Do not fall through to the slow path with an error set. */
                if (mas_is_err(mas))
                        return;
        }

        mas_wr_bnode(wr_mas);
}

/*
 * mas_wr_store_entry() - Internal call to store a value
 * @wr_mas: The write maple state.  The value to store is @wr_mas->entry and
 * the maple state is @wr_mas->mas.
 *
 * Return: @wr_mas->content, the contents that was found at the index.
 */
static inline void *mas_wr_store_entry(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        wr_mas->content = mas_start(mas);
        if (mas_is_none(mas) || mas_is_ptr(mas)) {
                /* No node to walk: the store happens at the root. */
                mas_store_root(mas, wr_mas->entry);
        } else if (unlikely(!mas_wr_walk(wr_mas))) {
                /* The walk could not reach a single leaf for this write. */
                mas_wr_spanning_store(wr_mas);
        } else {
                /* At this point, we are at the leaf node that needs to be altered. */
                mas_wr_end_piv(wr_mas);
                if (unlikely(!mas->index && mas->last == ULONG_MAX)) {
                        /* New root for a single pointer */
                        mas_new_root(mas, wr_mas->entry);
                } else {
                        mas_wr_modify(wr_mas);
                }
        }

        return wr_mas->content;
}

/**
 * mas_insert() - Internal call to insert a value
 * @mas: The maple state
 * @entry: The entry to store
 *
 * Return: %NULL or the contents that already exists at the requested index
 * otherwise.  The maple state needs to be checked for error conditions.
 */
static inline void *mas_insert(struct ma_state *mas, void *entry)
{
        MA_WR_STATE(wr_mas, mas, entry);

        /*
         * Inserting a new range inserts either 0, 1, or 2 pivots within the
         * tree.  If the insert fits exactly into an existing gap with a value
         * of NULL, then the slot only needs to be written with the new value.
         * If the range being inserted is adjacent to another range, then only a
         * single pivot needs to be inserted (as well as writing the entry).  If
         * the new range is within a gap but does not touch any other ranges,
         * then two pivots need to be inserted: the start - 1, and the end.  As
         * usual, the entry must be written.  Most operations require a new node
         * to be allocated and replace an existing node to ensure RCU safety,
         * when in RCU mode.  The exception to requiring a newly allocated node
         * is when inserting at the end of a node (appending).  When done
         * carefully, appending can reuse the node in place.
         */
        wr_mas.content = mas_start(mas);
        if (wr_mas.content)
                goto exists;

        if (mas_is_none(mas) || mas_is_ptr(mas)) {
                mas_store_root(mas, entry);
                return NULL;
        }

        /* spanning writes always overwrite something */
        if (!mas_wr_walk(&wr_mas))
                goto exists;

        /* At this point, we are at the leaf node that needs to be altered. */
        wr_mas.offset_end = mas->offset;
        wr_mas.end_piv = wr_mas.r_max;

        if (wr_mas.content || (mas->last > wr_mas.r_max))
                goto exists;

        if (!entry)
                return NULL;

        mas_wr_modify(&wr_mas);
        return wr_mas.content;

exists:
        mas_set_err(mas, -EEXIST);
        return wr_mas.content;

}

/**
 * mas_alloc_cyclic() - Internal call to find somewhere to store an entry
 * @mas: The maple state.
 * @startp: Pointer to ID.
 * @entry: The entry to store.
 * @range_lo: Lower bound of range to search.
 * @range_hi: Upper bound of range to search.
 * @next: Pointer to next ID to allocate.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * The search for a free index first runs from max(@range_lo, *@next) to
 * @range_hi.  If that fails and it started above @range_lo, the maple state is
 * reset and the search is repeated from @range_lo.  On success *@startp is set
 * to the allocated index and *@next to the index after it.
 *
 * Return: 0 if the allocation succeeded without wrapping, 1 if the
 * allocation succeeded after wrapping, or -EBUSY if there are no
 * free entries.  Other negative values from the search or from the insert are
 * returned unchanged.
 */
int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp)
{
        unsigned long min = range_lo;
        int ret;

        range_lo = max(min, *next);
        ret = mas_empty_area(mas, range_lo, range_hi, 1);
        /* A previous allocation wrapped *@next to 0: report the wrap now. */
        if ((mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED) && ret == 0) {
                mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED;
                ret = 1;
        }
        if (ret < 0 && range_lo > min) {
                /*
                 * The failed search may have left @mas in an error state, and
                 * mas_empty_area() does not search from a state that is in
                 * error.  Reset the state so the second search starts over
                 * and has a chance to succeed.
                 */
                mas_reset(mas);
                ret = mas_empty_area(mas, min, range_hi, 1);
                if (ret == 0)
                        ret = 1;
        }
        if (ret < 0)
                return ret;

        /* Retry the insert for as long as mas_nomem() asks for another try. */
        do {
                mas_insert(mas, entry);
        } while (mas_nomem(mas, gfp));
        if (mas_is_err(mas))
                return xa_err(mas->node);

        *startp = mas->index;
        *next = *startp + 1;
        /* *@next overflowed: remember it so the next call reports the wrap. */
        if (*next == 0)
                mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED;

        return ret;
}
EXPORT_SYMBOL(mas_alloc_cyclic);

/*
 * mas_rewalk() - Point the maple state at @index and walk again.
 * @mas: The maple state
 * @index: The index to walk to
 *
 * The set-and-walk is repeated for as long as the walk leaves the maple state
 * in the start state.
 */
static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index)
{
        do {
                mas_set(mas, index);
                mas_state_walk(mas);
        } while (mas_is_start(mas));
}

/*
 * mas_rewalk_if_dead() - Re-walk to @index when @node turns out to be dead.
 * @mas: The maple state
 * @node: The node to check
 * @index: The index to walk to if @node is dead
 *
 * Return: true if @node was dead and the maple state was re-walked, false
 * otherwise.
 */
static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas,
                struct maple_node *node, const unsigned long index)
{
        if (likely(!ma_dead_node(node)))
                return false;

        mas_rewalk(mas, index);
        return true;
}

/*
 * mas_prev_node() - Move the maple state to the previous node at the same
 * level in the tree.
 * @mas: The maple state
 * @min: The lower limit to search
 *
 * On success @mas->node is the previous node, @mas->max is the value
 * @mas->min had on entry minus one, and @mas->offset and @mas->end are both
 * set to the data end of that node.  If there is no previous node at or above
 * @min, the status is set to ma_underflow instead.
 *
 * Return: 1 if the node is dead, 0 otherwise.
 */
static int mas_prev_node(struct ma_state *mas, unsigned long min)
{
        enum maple_type mt;
        int offset, level;
        void __rcu **slots;
        struct maple_node *node;
        unsigned long *pivots;
        unsigned long max;

        node = mas_mn(mas);
        /* A node that starts at 0 has nothing before it. */
        if (!mas->min)
                goto no_entry;

        /* The previous node ends right below this one; stop if that is under @min. */
        max = mas->min - 1;
        if (max < min)
                goto no_entry;

        level = 0;
        do {
                if (ma_is_root(node))
                        goto no_entry;

                /* Walk up. */
                if (unlikely(mas_ascend(mas)))
                        return 1;
                offset = mas->offset;
                level++;
                node = mas_mn(mas);
        } while (!offset); /* Slot 0 has no left sibling, keep ascending. */

        /* Step to the slot on the left of the one we came up from. */
        offset--;
        mt = mte_node_type(mas->node);
        /* Walk back down, taking the slot reported by ma_data_end() each time. */
        while (level > 1) {
                level--;
                slots = ma_slots(node, mt);
                mas->node = mas_slot(mas, slots, offset);
                /* The slot was read from @node; discard it if @node died. */
                if (unlikely(ma_dead_node(node)))
                        return 1;

                mt = mte_node_type(mas->node);
                node = mas_mn(mas);
                pivots = ma_pivots(node, mt);
                offset = ma_data_end(node, mt, pivots, max);
                if (unlikely(ma_dead_node(node)))
                        return 1;
        }

        /* Final step down into the node at the starting level. */
        slots = ma_slots(node, mt);
        mas->node = mas_slot(mas, slots, offset);
        pivots = ma_pivots(node, mt);
        if (unlikely(ma_dead_node(node)))
                return 1;

        /* mas->min is not written here when offset is 0. */
        if (likely(offset))
                mas->min = pivots[offset - 1] + 1;
        mas->max = max;
        mas->offset = mas_data_end(mas);
        if (unlikely(mte_dead_node(mas->node)))
                return 1;

        mas->end = mas->offset;
        return 0;

no_entry:
        /* Only report underflow if the node we based the decision on is alive. */
        if (unlikely(ma_dead_node(node)))
                return 1;

        mas->status = ma_underflow;
        return 0;
}

/*
 * mas_prev_slot() - Get the entry in the previous slot
 *
 * @mas: The maple state
 * @min: The minimum starting range
 * @empty: Can be empty
 *
 * Moves @mas->offset, @mas->index and @mas->last to the previous slot, moving
 * to the previous node when the current one is exhausted.  When @empty is
 * false, NULL slots are skipped.  When the lower limit @min is reached, the
 * status is set to ma_underflow and NULL is returned.  If a dead node is
 * detected, the state is re-walked to the @mas->index seen on entry and the
 * operation restarts.
 *
 * Return: The entry in the previous slot which is possibly NULL
 */
static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty)
{
        void *entry;
        void __rcu **slots;
        unsigned long pivot;
        enum maple_type type;
        unsigned long *pivots;
        struct maple_node *node;
        /* Index to re-walk to if a dead node is encountered. */
        unsigned long save_point = mas->index;

retry:
        node = mas_mn(mas);
        type = mte_node_type(mas->node);
        pivots = ma_pivots(node, type);
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        /* This node reaches down to @min: check if the current slot already does. */
        if (mas->min <= min) {
                pivot = mas_safe_min(mas, pivots, mas->offset);

                if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                        goto retry;

                if (pivot <= min)
                        goto underflow;
        }

again:
        if (likely(mas->offset)) {
                /* Previous slot is in this node. */
                mas->offset--;
                mas->last = mas->index - 1;
                mas->index = mas_safe_min(mas, pivots, mas->offset);
        } else  {
                /* Slot 0: the previous slot lives in the previous node. */
                if (mas->index <= min)
                        goto underflow;

                if (mas_prev_node(mas, min)) {
                        /* Dead node seen during the move: start over. */
                        mas_rewalk(mas, save_point);
                        goto retry;
                }

                if (WARN_ON_ONCE(mas_is_underflow(mas)))
                        return NULL;

                /* mas_prev_node() left mas->offset at the data end of the new node. */
                mas->last = mas->max;
                node = mas_mn(mas);
                type = mte_node_type(mas->node);
                pivots = ma_pivots(node, type);
                mas->index = pivots[mas->offset - 1] + 1;
        }

        slots = ma_slots(node, type);
        entry = mas_slot(mas, slots, mas->offset);
        /* The entry was read from @node; discard it if @node died. */
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;


        if (likely(entry))
                return entry;

        /* NULL slot: keep going backwards unless empty slots are acceptable. */
        if (!empty) {
                if (mas->index <= min) {
                        mas->status = ma_underflow;
                        return NULL;
                }

                goto again;
        }

        return entry;

underflow:
        mas->status = ma_underflow;
        return NULL;
}

/*
 * mas_next_node() - Get the next node at the same level in the tree.
 * @mas: The maple state
 * @node: The maple node for the current position of @mas (the caller in this
 * file passes mas_mn(@mas))
 * @max: The maximum pivot value to check.
 *
 * On success @mas->node is the next node, @mas->min is the value @mas->max
 * had on entry plus one, and @mas->max and @mas->end are updated for the new
 * node.  If there is no next node within @max, the status is set to
 * ma_overflow instead.
 *
 * Return: 1 on dead node, 0 otherwise.
 */
static int mas_next_node(struct ma_state *mas, struct maple_node *node,
                unsigned long max)
{
        unsigned long min;
        unsigned long *pivots;
        struct maple_enode *enode;
        struct maple_node *tmp;
        int level = 0;
        unsigned char node_end;
        enum maple_type mt;
        void __rcu **slots;

        /* The current node already reaches the limit. */
        if (mas->max >= max)
                goto overflow;

        /* The next node starts right after this one. */
        min = mas->max + 1;
        level = 0;
        do {
                if (ma_is_root(node))
                        goto overflow;

                /* Walk up. */
                if (unlikely(mas_ascend(mas)))
                        return 1;

                level++;
                node = mas_mn(mas);
                mt = mte_node_type(mas->node);
                pivots = ma_pivots(node, mt);
                node_end = ma_data_end(node, mt, pivots, mas->max);
                if (unlikely(ma_dead_node(node)))
                        return 1;

        } while (unlikely(mas->offset == node_end)); /* Last slot: ascend again. */

        /* Step to the slot on the right of the one we came up from. */
        slots = ma_slots(node, mt);
        mas->offset++;
        enode = mas_slot(mas, slots, mas->offset);
        if (unlikely(ma_dead_node(node)))
                return 1;

        if (level > 1)
                mas->offset = 0;

        /* Walk back down through slot 0 until one level above the start. */
        while (unlikely(level > 1)) {
                level--;
                mas->node = enode;
                node = mas_mn(mas);
                mt = mte_node_type(mas->node);
                slots = ma_slots(node, mt);
                enode = mas_slot(mas, slots, 0);
                if (unlikely(ma_dead_node(node)))
                        return 1;
        }

        /* After descending, pivots must come from the node descended into. */
        if (!mas->offset)
                pivots = ma_pivots(node, mt);

        mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt);
        tmp = mte_to_node(enode);
        mt = mte_node_type(enode);
        pivots = ma_pivots(tmp, mt);
        mas->end = ma_data_end(tmp, mt, pivots, mas->max);
        if (unlikely(ma_dead_node(node)))
                return 1;

        mas->node = enode;
        mas->min = min;
        return 0;

overflow:
        /* Only report overflow if the node we based the decision on is alive. */
        if (unlikely(ma_dead_node(node)))
                return 1;

        mas->status = ma_overflow;
        return 0;
}

/*
 * mas_next_slot() - Get the entry in the next slot
 *
 * @mas: The maple state
 * @max: The maximum starting range
 * @empty: Can be empty
 *
 * Moves @mas->offset, @mas->index and @mas->last to the next slot, moving to
 * the next node when the current one is exhausted.  When @empty is false,
 * NULL slots are skipped.  When the upper limit @max is reached, the status
 * is set to ma_overflow and NULL is returned.  If a dead node is detected,
 * the state is re-walked to the @mas->last seen on entry and the operation
 * restarts.
 *
 * Return: The entry in the next slot which is possibly NULL
 */
static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty)
{
        void __rcu **slots;
        unsigned long *pivots;
        unsigned long pivot;
        enum maple_type type;
        struct maple_node *node;
        /* Index to re-walk to if a dead node is encountered. */
        unsigned long save_point = mas->last;
        void *entry;

retry:
        node = mas_mn(mas);
        type = mte_node_type(mas->node);
        pivots = ma_pivots(node, type);
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        /* This node reaches up to @max: check if the current slot already does. */
        if (mas->max >= max) {
                if (likely(mas->offset < mas->end))
                        pivot = pivots[mas->offset];
                else
                        pivot = mas->max;

                if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                        goto retry;

                if (pivot >= max) { /* Was at the limit, next will extend beyond */
                        mas->status = ma_overflow;
                        return NULL;
                }
        }

        if (likely(mas->offset < mas->end)) {
                /* Next slot is in this node. */
                mas->index = pivots[mas->offset] + 1;
again:
                mas->offset++;
                /* The last slot is bounded by mas->max rather than by a pivot. */
                if (likely(mas->offset < mas->end))
                        mas->last = pivots[mas->offset];
                else
                        mas->last = mas->max;
        } else  {
                /* Last slot of the node: the next slot lives in the next node. */
                if (mas->last >= max) {
                        mas->status = ma_overflow;
                        return NULL;
                }

                if (mas_next_node(mas, node, max)) {
                        /* Dead node seen during the move: start over. */
                        mas_rewalk(mas, save_point);
                        goto retry;
                }

                if (WARN_ON_ONCE(mas_is_overflow(mas)))
                        return NULL;

                /* Start at the first slot of the new node. */
                mas->offset = 0;
                mas->index = mas->min;
                node = mas_mn(mas);
                type = mte_node_type(mas->node);
                pivots = ma_pivots(node, type);
                mas->last = pivots[0];
        }

        slots = ma_slots(node, type);
        entry = mt_slot(mas->tree, slots, mas->offset);
        /* The entry was read from @node; discard it if @node died. */
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        if (entry)
                return entry;


        /* NULL slot: keep going forwards unless empty slots are acceptable. */
        if (!empty) {
                if (mas->last >= max) {
                        mas->status = ma_overflow;
                        return NULL;
                }

                mas->index = mas->last + 1;
                goto again;
        }

        return entry;
}

/*
 * mas_next_entry() - Internal function to get the next entry.
 * @mas: The maple state
 * @limit: The maximum range start.
 *
 * Advance @mas to the next entry, skipping empty slots, without looking beyond
 * @limit.  @mas->index and @mas->last are set to the range of the entry.  If
 * @mas->last is already at or past @limit, the status becomes ma_overflow and
 * @mas->index and @mas->last are left untouched.
 * Restarts on dead nodes.
 *
 * Return: the next entry or %NULL.
 */
static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit)
{
        if (mas->last < limit)
                return mas_next_slot(mas, limit, false);

        mas->status = ma_overflow;
        return NULL;
}

/*
 * mas_rev_awalk() - Internal function.  Reverse allocation walk.  Find the
 * highest gap address of a given size in a given node and descend.
 * @mas: The maple state
 * @size: The needed size.
 * @gap_min: Set to the start of the gap when one is found in a leaf.
 * @gap_max: Set to the end of the gap when one is found in a leaf.
 *
 * The search window is @mas->index to @mas->last.  One node is examined per
 * call, scanning slots from @mas->offset downwards.  When no usable gap is
 * left, the maple state is set to -EBUSY.
 *
 * Return: True if found in a leaf, false otherwise.  True is also returned
 * when @mas is already in an error state on entry and for dense nodes.
 *
 */
static bool mas_rev_awalk(struct ma_state *mas, unsigned long size,
                unsigned long *gap_min, unsigned long *gap_max)
{
        enum maple_type type = mte_node_type(mas->node);
        struct maple_node *node = mas_mn(mas);
        unsigned long *pivots, *gaps;
        void __rcu **slots;
        unsigned long gap = 0;
        unsigned long max, min;
        unsigned char offset;

        /* Returning true ends the caller's loop; the caller then checks the error. */
        if (unlikely(mas_is_err(mas)))
                return true;

        if (ma_is_dense(type)) {
                /* dense nodes. */
                mas->offset = (unsigned char)(mas->index - mas->min);
                return true;
        }

        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        gaps = ma_gaps(node, type);
        offset = mas->offset;
        min = mas_safe_min(mas, pivots, offset);
        /* Skip out of bounds. */
        while (mas->last < min)
                min = mas_safe_min(mas, pivots, --offset);

        max = mas_safe_pivot(mas, pivots, offset, type);
        while (mas->index <= max) {
                gap = 0;
                /* Use the recorded gap if there is a gaps array, else an empty slot. */
                if (gaps)
                        gap = gaps[offset];
                else if (!mas_slot(mas, slots, offset))
                        gap = max - min + 1;

                if (gap) {
                        /* Large enough, and enough of it lies at or below mas->last. */
                        if ((size <= gap) && (size <= mas->last - min + 1))
                                break;

                        if (!gaps) {
                                /* Skip the next slot, it cannot be a gap. */
                                if (offset < 2)
                                        goto ascend;

                                offset -= 2;
                                max = pivots[offset];
                                min = mas_safe_min(mas, pivots, offset);
                                continue;
                        }
                }

                if (!offset)
                        goto ascend;

                /* Move one slot down; its range ends right below the current one. */
                offset--;
                max = min - 1;
                min = mas_safe_min(mas, pivots, offset);
        }

        /* The slot is below the window, or too little room above mas->index. */
        if (unlikely((mas->index > max) || (size - 1 > max - mas->index)))
                goto no_space;

        if (unlikely(ma_is_leaf(type))) {
                mas->offset = offset;
                *gap_min = min;
                *gap_max = min + gap - 1;
                return true;
        }

        /* descend, only happens under lock. */
        mas->node = mas_slot(mas, slots, offset);
        mas->min = min;
        mas->max = max;
        mas->offset = mas_data_end(mas);
        return false;

ascend:
        /* Not the root: return false with mas->node unchanged. */
        if (!mte_is_root(mas->node))
                return false;

no_space:
        mas_set_err(mas, -EBUSY);
        return false;
}

/*
 * mas_anode_descend() - Internal function.  Forward allocation walk step.
 * @mas: The maple state
 * @size: The needed size.
 *
 * Scan the slots of the current node, starting at @mas->offset, for a gap of
 * at least @size inside the window @mas->index to @mas->last.  In a leaf, a
 * hit leaves @mas->offset at the slot holding the gap.  In a non-leaf node, a
 * hit moves @mas into the child at that slot (with @mas->offset set to 0) so
 * the next call can search it.
 *
 * Return: true when the walk should stop: a gap was found in a leaf, the node
 * is dense, -EBUSY was set because the window is exhausted, or the scan
 * finished on the root.  false when the caller should call again.
 */
static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size)
{
        enum maple_type type = mte_node_type(mas->node);
        unsigned long pivot, min, gap = 0;
        unsigned char offset, data_end;
        unsigned long *gaps, *pivots;
        void __rcu **slots;
        struct maple_node *node;
        bool found = false;

        if (ma_is_dense(type)) {
                mas->offset = (unsigned char)(mas->index - mas->min);
                return true;
        }

        node = mas_mn(mas);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        gaps = ma_gaps(node, type);
        offset = mas->offset;
        min = mas_safe_min(mas, pivots, offset);
        data_end = ma_data_end(node, type, pivots, mas->max);
        for (; offset <= data_end; offset++) {
                pivot = mas_safe_pivot(mas, pivots, offset, type);

                /* Not within lower bounds */
                if (mas->index > pivot)
                        goto next_slot;

                /*
                 * Use the recorded gap if there is a gaps array.  Otherwise
                 * only an empty slot counts, clipped to the search window.
                 */
                if (gaps)
                        gap = gaps[offset];
                else if (!mas_slot(mas, slots, offset))
                        gap = min(pivot, mas->last) - max(mas->index, min) + 1;
                else
                        goto next_slot;

                if (gap >= size) {
                        if (ma_is_leaf(type)) {
                                found = true;
                                goto done;
                        }
                        /* Non-leaf: descend into the child that holds the gap. */
                        if (mas->index <= pivot) {
                                mas->node = mas_slot(mas, slots, offset);
                                mas->min = min;
                                mas->max = pivot;
                                offset = 0;
                                break;
                        }
                }
next_slot:
                min = pivot + 1;
                /* The window ends within this slot: nothing further to search. */
                if (mas->last <= pivot) {
                        mas_set_err(mas, -EBUSY);
                        return true;
                }
        }

        /* After a descent mas->node is the child, so this tests the child. */
        if (mte_is_root(mas->node))
                found = true;
done:
        mas->offset = offset;
        return found;
}

/**
 * mas_walk() - Search for @mas->index in the tree.
 * @mas: The maple state.
 *
 * mas->index and mas->last will be set to the range if there is a value.
 * Unless the maple state is both active and in the start state, its status is
 * set to ma_start before walking.
 *
 * When the walk ends in the none state, the range is set to 0 - ULONG_MAX.
 * When it ends in the pointer state, index 0 yields the entry with the range
 * 0 - 0; any other index yields %NULL with the range 1 - ULONG_MAX and the
 * status set to ma_none.
 *
 * Return: the entry at the location or %NULL.
 */
void *mas_walk(struct ma_state *mas)
{
        void *found;

        if (!(mas_is_active(mas) && mas_is_start(mas)))
                mas->status = ma_start;

        /* Walk again for as long as the walk leaves the state at ma_start. */
        do {
                found = mas_state_walk(mas);
        } while (mas_is_start(mas));

        if (mas_is_none(mas)) {
                mas->index = 0;
                mas->last = ULONG_MAX;
                return found;
        }

        if (!mas_is_ptr(mas))
                return found;

        /* Pointer state: only index 0 holds the entry. */
        if (!mas->index) {
                mas->last = 0;
                return found;
        }

        mas->index = 1;
        mas->last = ULONG_MAX;
        mas->status = ma_none;
        return NULL;
}
EXPORT_SYMBOL_GPL(mas_walk);

/*
 * mas_rewind_node() - Internal function.  Step back to the previous slot,
 * ascending while the current offset is 0.
 * @mas: The maple state.
 *
 * Return: true if @mas->offset was moved back by one, false if the root was
 * reached with an offset of 0.
 */
static inline bool mas_rewind_node(struct ma_state *mas)
{
        unsigned char offset;

        for (;;) {
                if (!mte_is_root(mas->node))
                        mas_ascend(mas);
                else if (!mas->offset)
                        return false;

                offset = mas->offset;
                if (offset)
                        break;
        }

        mas->offset = offset - 1;
        return true;
}

/*
 * mas_skip_node() - Internal function.  Skip over a node.
 * @mas: The maple state.
 *
 * Ascend while the current offset is at or beyond the data end of the node,
 * then advance the offset by one.  If the root is reached with no slot left,
 * the maple state is set to -EBUSY.
 *
 * Return: true if there is another node, false otherwise.
 */
static inline bool mas_skip_node(struct ma_state *mas)
{
        if (mas_is_err(mas))
                return false;

        for (;;) {
                if (!mte_is_root(mas->node)) {
                        mas_ascend(mas);
                } else if (mas->offset >= mas_data_end(mas)) {
                        /* Nothing to the right of the last slot of the root. */
                        mas_set_err(mas, -EBUSY);
                        return false;
                }

                if (mas->offset < mas_data_end(mas))
                        break;
        }

        mas->offset++;
        return true;
}

/*
 * mas_awalk() - Allocation walk.  Search from low address to high, for a gap of
 * @size
 * @mas: The maple state
 * @size: The size of the gap required
 *
 * Search between @mas->index and @mas->last for a gap of @size.  The outcome
 * is left in @mas: either an error is set or the state points at the result.
 */
static inline void mas_awalk(struct ma_state *mas, unsigned long size)
{
        struct maple_enode *prev = NULL;

        for (;;) {
                /* Stop on error, or once the descend step says it is done. */
                if (mas_is_err(mas))
                        return;

                if (mas_anode_descend(mas, size))
                        return;

                /* The step moved to another node: search that one next. */
                if (mas->node != prev) {
                        prev = mas->node;
                        continue;
                }

                /* Same node as last time: move past it. */
                mas_skip_node(mas);
        }
}

/*
 * mas_sparse_area() - Internal function.  Return upper or lower limit when
 * searching for a gap in an empty tree.
 * @mas: The maple state
 * @min: the minimum range
 * @max: The maximum range
 * @size: The size of the gap
 * @fwd: Searching forward or back
 *
 * @mas->index and @mas->last are set to the lowest (@fwd) or highest (!@fwd)
 * range of @size within @min - @max.
 *
 * Return: 0 on success, -EBUSY if the range no longer fits once @min had to
 * be moved off index 0.
 */
static inline int mas_sparse_area(struct ma_state *mas, unsigned long min,
                                unsigned long max, unsigned long size, bool fwd)
{
        /* mas_is_ptr: the search has to start above index 0. */
        if (likely(!mas_is_none(mas)) && !min) {
                min = 1;
                /* min moved up, so recheck that the size is still satisfied. */
                if (max < min || max - min + 1 < size)
                        return -EBUSY;
        }

        if (!fwd) {
                mas->last = max;
                mas->index = max - size + 1;
                return 0;
        }

        mas->index = min;
        mas->last = min + size - 1;
        return 0;
}

/*
 * mas_empty_area() - Get the lowest address within the range that is
 * sufficient for the size requested.
 * @mas: The maple state
 * @min: The lowest value of the range
 * @max: The highest value of the range
 * @size: The size needed
 */
int mas_empty_area(struct ma_state *mas, unsigned long min,
                unsigned long max, unsigned long size)
{
        unsigned char offset;
        unsigned long *pivots;
        enum maple_type mt;
        struct maple_node *node;

        if (min > max)
                return -EINVAL;

        if (size == 0 || max - min < size - 1)
                return -EINVAL;

        if (mas_is_start(mas))
                mas_start(mas);
        else if (mas->offset >= 2)
                mas->offset -= 2;
        else if (!mas_skip_node(mas))
                return -EBUSY;

        /* Empty set */
        if (mas_is_none(mas) || mas_is_ptr(mas))
                return mas_sparse_area(mas, min, max, size, true);

        /* The start of the window can only be within these values */
        mas->index = min;
        mas->last = max;
        mas_awalk(mas, size);

        if (unlikely(mas_is_err(mas)))
                return xa_err(mas->node);

        offset = mas->offset;
        if (unlikely(offset == MAPLE_NODE_SLOTS))
                return -EBUSY;

        node = mas_mn(mas);
        mt = mte_node_type(mas->node);
        pivots = ma_pivots(node, mt);
        min = mas_safe_min(mas, pivots, offset);
        if (mas->index < min)
                mas->index = min;
        mas->last = mas->index + size - 1;
        mas->end = ma_data_end(node, mt, pivots, mas->max);
        return 0;
}
EXPORT_SYMBOL_GPL(mas_empty_area);

/*
 * mas_empty_area_rev() - Get the highest address within the range that is
 * sufficient for the size requested.
 * @mas: The maple state
 * @min: The lowest value of the range
 * @max: The highest value of the range
 * @size: The size needed
 *
 * On success @mas->index and @mas->last describe a range of @size.
 *
 * Return: 0 on success, -EINVAL if the arguments cannot describe a valid
 * search, -EBUSY if no gap was found, or the error stored in @mas.
 */
int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
                unsigned long max, unsigned long size)
{
        struct maple_enode *prev = mas->node;

        if (min > max)
                return -EINVAL;

        /* A zero size, or a size larger than the whole range, is invalid. */
        if (!size || size - 1 > max - min)
                return -EINVAL;

        if (mas_is_start(mas)) {
                mas_start(mas);
        } else if (mas->offset < 2) {
                if (!mas_rewind_node(mas))
                        return -EBUSY;
        }

        if (unlikely(mas_is_none(mas) || mas_is_ptr(mas)))
                return mas_sparse_area(mas, min, max, size, false);

        if (mas->offset < 2)
                mas->offset = mas_data_end(mas);
        else
                mas->offset -= 2;

        /* The start of the window can only be within these values. */
        mas->index = min;
        mas->last = max;

        /* min and max are reused to receive the bounds of the gap found. */
        while (!mas_rev_awalk(mas, size, &min, &max)) {
                /* The walk moved to another node: search that one next. */
                if (mas->node != prev) {
                        prev = mas->node;
                        continue;
                }

                /* Same node as last time: step back out of it. */
                if (!mas_rewind_node(mas))
                        return -EBUSY;
        }

        if (mas_is_err(mas))
                return xa_err(mas->node);

        if (unlikely(mas->offset == MAPLE_NODE_SLOTS))
                return -EBUSY;

        /* Trim the upper limit to the max. */
        if (mas->last > max)
                mas->last = max;

        mas->index = mas->last - size + 1;
        mas->end = mas_data_end(mas);
        return 0;
}
EXPORT_SYMBOL_GPL(mas_empty_area_rev);

/*
 * mte_dead_leaves() - Mark all leaves of a node as dead.
 * @enode: The encoded node; its slot count bounds the walk
 * @mt: The maple tree
 * @slots: Pointer to the slot array
 *
 * Must hold the write lock.
 *
 * Every slot visited is marked dead, has its type stored in the node itself,
 * and is rewritten to hold the node pointer.  The walk stops at the first
 * slot that decodes to no node or no type.
 *
 * Return: The number of leaves marked as dead.
 */
static inline
unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt,
                              void __rcu **slots)
{
        int count = 0;

        while (count < mt_slot_count(enode)) {
                void *child = mt_slot(mt, slots, count);
                enum maple_type type = mte_node_type(child);
                struct maple_node *node = mte_to_node(child);

                /* Use both node and type to catch LE & BE metadata */
                if (!node || !type)
                        break;

                mte_set_node_dead(child);
                node->type = type;
                rcu_assign_pointer(slots[count], node);
                count++;
        }

        return count;
}

/**
 * mte_dead_walk() - Walk down a dead tree to just before the leaves
 * @enode: Pointer to the maple encoded node to start from.  It is updated on
 * each level and, on return, refers to the last node visited, which is the
 * one whose followed slot holds a leaf.
 * @offset: The starting offset.  Only used in the first node; slot 0 is
 * followed on every level below that.
 *
 * Note: This can only be used from the RCU callback context.
 *
 * Return: The slot array of the node left in @enode.
 */
static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset)
{
        struct maple_node *node, *next;
        void __rcu **slots = NULL;

        next = mte_to_node(*enode);
        do {
                *enode = ma_enode_ptr(next);
                node = mte_to_node(*enode);
                /* The type is read from the node itself (node->type). */
                slots = ma_slots(node, node->type);
                next = rcu_dereference_protected(slots[offset],
                                        lock_is_held(&rcu_callback_map));
                offset = 0;
        } while (!ma_is_leaf(next->type)); /* Stop once the child is a leaf. */

        return slots;
}

/**
 * mt_free_walk() - Walk & free a tree in the RCU callback context
 * @head: The RCU head that's within the node.
 *
 * The node containing @head is the top of the walk.  A leaf is handed
 * straight to mt_free_rcu().  Otherwise the walk goes down to just above the
 * leaves, passes each slot array to mt_free_bulk(), and moves back up through
 * the parent recorded in the node until it is back at the top node.
 *
 * Note: This can only be used from the RCU callback context.
 */
static void mt_free_walk(struct rcu_head *head)
{
        void __rcu **slots;
        struct maple_node *node, *start;
        struct maple_enode *enode;
        unsigned char offset;
        enum maple_type type;

        node = container_of(head, struct maple_node, rcu);

        if (ma_is_leaf(node->type))
                goto free_leaf;

        start = node;
        enode = mt_mk_node(node, node->type);
        /* Go down through slot 0 to the node just above the leaves. */
        slots = mte_dead_walk(&enode, 0);
        node = mte_to_node(enode);
        do {
                mt_free_bulk(node->slot_len, slots);
                /* Continue in the parent, at the slot after the one just handled. */
                offset = node->parent_slot + 1;
                enode = node->piv_parent;
                /* The recorded parent is the node itself: nothing above it. */
                if (mte_to_node(enode) == node)
                        goto free_leaf;

                type = mte_node_type(enode);
                slots = ma_slots(mte_to_node(enode), type);
                /* The parent has another populated slot: descend into it. */
                if ((offset < mt_slots[type]) &&
                    rcu_dereference_protected(slots[offset],
                                              lock_is_held(&rcu_callback_map)))
                        slots = mte_dead_walk(&enode, offset);
                node = mte_to_node(enode);
        } while ((node != start) || (node->slot_len < offset));

        /* Back at the top node: hand its own slot array to mt_free_bulk(). */
        slots = ma_slots(node, node->type);
        mt_free_bulk(node->slot_len, slots);

free_leaf:
        mt_free_rcu(&node->rcu);
}

/*
 * mte_destroy_descend() - Walk down from a node until the selected child is a
 * leaf, marking every node visited as dead.
 * @enode: In: the encoded node to start from.  Out: the last node visited.
 * @mt: The maple tree
 * @prev: The encoded parent to record in the first node visited
 * @offset: The parent slot to record in the first node visited
 *
 * Each visited node gets its type, its encoded parent (->piv_parent) and its
 * slot in that parent (->parent_slot) stored in the node.
 *
 * Return: The slot array of the last node visited.
 */
static inline void __rcu **mte_destroy_descend(struct maple_enode **enode,
        struct maple_tree *mt, struct maple_enode *prev, unsigned char offset)
{
        struct maple_node *node;
        struct maple_enode *next = *enode;
        void __rcu **slots = NULL;
        enum maple_type type;
        unsigned char next_offset = 0;

        do {
                *enode = next;
                node = mte_to_node(*enode);
                type = mte_node_type(*enode);
                slots = ma_slots(node, type);
                next = mt_slot_locked(mt, slots, next_offset);
                /* Slot 0 holds a dead node: use the following slot instead. */
                if ((mte_dead_node(next)))
                        next = mt_slot_locked(mt, slots, ++next_offset);

                mte_set_node_dead(*enode);
                /* Record what is needed to climb back up from this node. */
                node->type = type;
                node->piv_parent = prev;
                node->parent_slot = offset;
                /* The child visited next sits at next_offset in this node. */
                offset = next_offset;
                next_offset = 0;
                prev = *enode;
        } while (!mte_is_leaf(next));

        return slots;
}

/*
 * mt_destroy_walk() - Mark a whole subtree as dead and optionally free it.
 * @enode: The encoded node at the top of the subtree
 * @mt: The maple tree
 * @free: When true the nodes are freed during the walk.  When false they are
 * only marked dead and the metadata of the top node is cleared.
 *
 * The number of children found in each node is stored in ->slot_len.
 */
static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
                            bool free)
{
        void __rcu **slots;
        struct maple_node *node = mte_to_node(enode);
        struct maple_enode *start;

        /* A single leaf: record its type and handle just that node. */
        if (mte_is_leaf(enode)) {
                node->type = mte_node_type(enode);
                goto free_leaf;
        }

        start = enode;
        /* The top node records itself as its own parent. */
        slots = mte_destroy_descend(&enode, mt, start, 0);
        node = mte_to_node(enode); // Updated in the above call.
        do {
                enum maple_type type;
                unsigned char offset;
                struct maple_enode *parent, *tmp;

                /* Mark this node's children dead and remember how many. */
                node->slot_len = mte_dead_leaves(enode, mt, slots);
                if (free)
                        mt_free_bulk(node->slot_len, slots);
                /* Step up and look at the next slot in the parent. */
                offset = node->parent_slot + 1;
                enode = node->piv_parent;
                /* The recorded parent is the node itself: nothing above. */
                if (mte_to_node(enode) == node)
                        goto free_leaf;

                type = mte_node_type(enode);
                slots = ma_slots(mte_to_node(enode), type);
                /* No more slots in the parent; keep climbing. */
                if (offset >= mt_slots[type])
                        goto next;

                tmp = mt_slot_locked(mt, slots, offset);
                /* A populated sibling slot: descend into that subtree. */
                if (mte_node_type(tmp) && mte_to_node(tmp)) {
                        parent = enode;
                        enode = tmp;
                        slots = mte_destroy_descend(&enode, mt, parent, offset);
                }
next:
                node = mte_to_node(enode);
        } while (start != enode);

        /* Back at the top node: process its direct children. */
        node = mte_to_node(enode);
        node->slot_len = mte_dead_leaves(enode, mt, slots);
        if (free)
                mt_free_bulk(node->slot_len, slots);

free_leaf:
        if (free)
                mt_free_rcu(&node->rcu);
        else
                mt_clear_meta(mt, node, node->type);
}

/*
 * mte_destroy_walk() - Free a tree or sub-tree.
 * @enode: the encoded maple node (maple_enode) to start
 * @mt: the tree to free - needed for node types.
 *
 * When the tree is in RCU mode the nodes are only marked dead here and the
 * freeing is handed to mt_free_walk() through call_rcu().  Otherwise the
 * nodes are freed as part of the walk.
 *
 * Must hold the write lock.
 */
static inline void mte_destroy_walk(struct maple_enode *enode,
                                    struct maple_tree *mt)
{
        struct maple_node *top = mte_to_node(enode);

        if (!mt_in_rcu(mt)) {
                mt_destroy_walk(enode, mt, true);
                return;
        }

        mt_destroy_walk(enode, mt, false);
        call_rcu(&top->rcu, mt_free_walk);
}

static void mas_wr_store_setup(struct ma_wr_state *wr_mas)
{
        if (!mas_is_active(wr_mas->mas)) {
                if (mas_is_start(wr_mas->mas))
                        return;

                if (unlikely(mas_is_paused(wr_mas->mas)))
                        goto reset;

                if (unlikely(mas_is_none(wr_mas->mas)))
                        goto reset;

                if (unlikely(mas_is_overflow(wr_mas->mas)))
                        goto reset;

                if (unlikely(mas_is_underflow(wr_mas->mas)))
                        goto reset;
        }

        /*
         * A less strict version of mas_is_span_wr() where we allow spanning
         * writes within this node.  This is to stop partial walks in
         * mas_prealloc() from being reset.
         */
        if (wr_mas->mas->last > wr_mas->mas->max)
                goto reset;

        if (wr_mas->entry)
                return;

        if (mte_is_leaf(wr_mas->mas->node) &&
            wr_mas->mas->last == wr_mas->mas->max)
                goto reset;

        return;

reset:
        mas_reset(wr_mas->mas);
}

/* Interface */

/**
 * mas_store() - Store an @entry.
 * @mas: The maple state.
 * @entry: The entry to store.
 *
 * The @mas->index and @mas->last are used to set the range for the @entry.
 * Note: The @mas should have pre-allocated entries to ensure there is memory to
 * store the entry.  Please see mas_expected_entries()/mas_destroy() for more details.
 *
 * Unlike mas_store_gfp(), this function does not call mas_nomem() to retry
 * after an allocation failure.
 *
 * With CONFIG_DEBUG_MAPLE_TREE enabled, a range with @mas->index greater than
 * @mas->last is reported, -EINVAL is set in @mas and %NULL is returned.
 *
 * Return: the first entry between mas->index and mas->last or %NULL.
 */
void *mas_store(struct ma_state *mas, void *entry)
{
        MA_WR_STATE(wr_mas, mas, entry);

        trace_ma_write(__func__, mas, 0, entry);
#ifdef CONFIG_DEBUG_MAPLE_TREE
        /* Debug builds only: reject an inverted range before writing. */
        if (MAS_WARN_ON(mas, mas->index > mas->last))
                pr_err("Error %lX > %lX %p\n", mas->index, mas->last, entry);

        if (mas->index > mas->last) {
                mas_set_err(mas, -EINVAL);
                return NULL;
        }

#endif

        /*
         * Storing is the same operation as insert with the added caveat that it
         * can overwrite entries.  Although this seems simple enough, one may
         * want to examine what happens if a single store operation was to
         * overwrite multiple entries within a self-balancing B-Tree.
         */
        mas_wr_store_setup(&wr_mas);
        mas_wr_store_entry(&wr_mas);
        return wr_mas.content;
}
EXPORT_SYMBOL_GPL(mas_store);

/**
 * mas_store_gfp() - Store a value into the tree.
 * @mas: The maple state
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations if necessary.
 *
 * The store is repeated for as long as mas_nomem() reports that it allocated
 * memory for a retry.
 *
 * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not
 * be allocated.
 */
int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp)
{
        MA_WR_STATE(wr_mas, mas, entry);

        mas_wr_store_setup(&wr_mas);
        trace_ma_write(__func__, mas, 0, entry);

        do {
                mas_wr_store_entry(&wr_mas);
        } while (unlikely(mas_nomem(mas, gfp)));

        return unlikely(mas_is_err(mas)) ? xa_err(mas->node) : 0;
}
EXPORT_SYMBOL_GPL(mas_store_gfp);

/**
 * mas_store_prealloc() - Store a value into the tree using memory
 * preallocated in the maple state.
 * @mas: The maple state
 * @entry: The entry to store.
 *
 * The store is not retried: an error left in @mas after the write trips
 * MAS_WR_BUG_ON().  mas_destroy() is called on @mas before returning.
 */
void mas_store_prealloc(struct ma_state *mas, void *entry)
{
        MA_WR_STATE(wr_mas, mas, entry);

        mas_wr_store_setup(&wr_mas);
        trace_ma_write(__func__, mas, 0, entry);
        mas_wr_store_entry(&wr_mas);
        /* The write is expected to succeed with the preallocated nodes. */
        MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas));
        mas_destroy(mas);
}
EXPORT_SYMBOL_GPL(mas_store_prealloc);

/**
 * mas_preallocate() - Preallocate enough nodes for a store operation
 * @mas: The maple state
 * @entry: The entry that will be stored
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Walks to the location of the store to estimate how many nodes the write
 * will need, then requests that many.  The default request is a single node;
 * spanning stores, splits and possible rebalances use a height-based worst
 * case.  The paths that need no node return 0 early, without requesting any
 * allocation and without setting MA_STATE_PREALLOC.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated.
 */
int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp)
{
        MA_WR_STATE(wr_mas, mas, entry);
        unsigned char node_size;
        int request = 1;
        int ret;


        /* Storing over the entire range: a single node is requested. */
        if (unlikely(!mas->index && mas->last == ULONG_MAX))
                goto ask_now;

        mas_wr_store_setup(&wr_mas);
        wr_mas.content = mas_start(mas);
        /* Root expand */
        if (unlikely(mas_is_none(mas) || mas_is_ptr(mas)))
                goto ask_now;

        if (unlikely(!mas_wr_walk(&wr_mas))) {
                /* Spanning store, use worst case for now */
                request = 1 + mas_mt_height(mas) * 3;
                goto ask_now;
        }

        /* At this point, we are at the leaf node that needs to be altered. */
        /* Exact fit, no nodes needed. */
        if (wr_mas.r_min == mas->index && wr_mas.r_max == mas->last)
                return 0;

        mas_wr_end_piv(&wr_mas);
        node_size = mas_wr_new_end(&wr_mas);

        /* Slot store, does not require additional nodes */
        if (node_size == mas->end) {
                /* reuse node */
                if (!mt_in_rcu(mas->tree))
                        return 0;
                /* shifting boundary */
                if (wr_mas.offset_end - mas->offset == 1)
                        return 0;
        }

        if (node_size >= mt_slots[wr_mas.type]) {
                /* Split, worst case for now. */
                request = 1 + mas_mt_height(mas) * 2;
                goto ask_now;
        }

        /* New root needs a single node */
        if (unlikely(mte_is_root(mas->node)))
                goto ask_now;

        /* Potential spanning rebalance collapsing a node, use worst-case */
        if (node_size  - 1 <= mt_min_slots[wr_mas.type])
                request = mas_mt_height(mas) * 2 - 1;

        /* node store, slot store needs one node */
ask_now:
        mas_node_count_gfp(mas, request, gfp);
        mas->mas_flags |= MA_STATE_PREALLOC;
        if (likely(!mas_is_err(mas)))
                return 0;

        /* Allocation failed: drop the pending request and anything allocated. */
        mas_set_alloc_req(mas, 0);
        ret = xa_err(mas->node);
        /*
         * NOTE(review): mas_reset() runs both before and after mas_destroy();
         * presumably the first clears the error for mas_destroy() and the
         * second leaves a clean state for the caller - confirm.
         */
        mas_reset(mas);
        mas_destroy(mas);
        mas_reset(mas);
        return ret;
}
EXPORT_SYMBOL_GPL(mas_preallocate);

/*
 * mas_destroy() - destroy a maple state.
 * @mas: The maple state
 *
 * Upon completion, check the left-most node and rebalance against the node to
 * the right if necessary.  Frees any allocated nodes associated with this maple
 * state.  The MA_STATE_REBALANCE, MA_STATE_BULK and MA_STATE_PREALLOC flags
 * are cleared.
 */
void mas_destroy(struct ma_state *mas)
{
        unsigned long remaining;

        /*
         * When using mas_for_each() to insert an expected number of elements,
         * it is possible that the number inserted is less than the expected
         * number.  To fix an invalid final node, a check is performed here to
         * rebalance the previous node with the final node.
         */
        if (mas->mas_flags & MA_STATE_REBALANCE) {
                unsigned char used;

                mas_start(mas);
                mtree_range_walk(mas);
                used = mas->end + 1;
                if (used < mt_min_slot_count(mas->node) - 1)
                        mas_destroy_rebalance(mas, used);

                mas->mas_flags &= ~MA_STATE_REBALANCE;
        }
        mas->mas_flags &= ~(MA_STATE_BULK | MA_STATE_PREALLOC);

        /*
         * Each allocation chunk links to the next one through slot[0] and
         * carries its additional nodes from slot[1] onwards.
         */
        for (remaining = mas_allocated(mas); remaining; remaining--) {
                struct maple_alloc *chunk = mas->alloc;

                mas->alloc = chunk->slot[0];
                if (chunk->node_count > 1) {
                        size_t extra = chunk->node_count - 1;

                        mt_free_bulk(extra, (void __rcu **)&chunk->slot[1]);
                        remaining -= extra;
                }
                /* The loop step accounts for the chunk itself. */
                mt_free_one(ma_mnode_ptr(chunk));
        }

        mas->alloc = NULL;
}
EXPORT_SYMBOL_GPL(mas_destroy);

/*
 * mas_expected_entries() - Set the expected number of entries that will be inserted.
 * @mas: The maple state
 * @nr_entries: The number of expected entries.
 *
 * This will attempt to pre-allocate enough nodes to store the expected number
 * of entries.  The allocations will occur using the bulk allocator interface
 * for speed.  Please call mas_destroy() on the @mas after inserting the entries
 * to ensure any unused nodes are freed.
 *
 * Sets MA_STATE_BULK and MA_STATE_PREALLOC in @mas.  On failure the node that
 * @mas pointed to on entry is restored and mas_destroy() is called.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated.
 */
int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries)
{
        struct maple_enode *saved_node = mas->node;
        int nonleaf_cap;
        int nr_nodes;
        int err;

        /*
         * Sometimes it is necessary to duplicate a tree to a new tree, such as
         * forking a process and duplicating the VMAs from one tree to a new
         * tree.  When such a situation arises, it is known that the new tree is
         * not going to be used until the entire tree is populated.  For
         * performance reasons, it is best to use a bulk load with RCU disabled.
         * This allows for optimistic splitting that favours the left and reuse
         * of nodes during the operation.
         */

        /* Optimize splitting for bulk insert in-order */
        mas->mas_flags |= MA_STATE_BULK;

        /*
         * Avoid overflow, assume a gap between each entry and a trailing null.
         * If this is wrong, it just means allocation can happen during
         * insertion of entries.
         */
        nr_nodes = max(nr_entries, nr_entries * 2 + 1);

        /* Non-leaf capacity depends on whether the tree tracks gaps. */
        if (mt_is_alloc(mas->tree))
                nonleaf_cap = MAPLE_ARANGE64_SLOTS - 2;
        else
                nonleaf_cap = MAPLE_RANGE64_SLOTS - 2;

        /* Leaves; reduce slots to keep space for expansion */
        nr_nodes = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 2);
        /* Internal nodes */
        nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap);
        /* Add working room for split (2 nodes) + new parents */
        mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL);

        /* Detect if allocations run out */
        mas->mas_flags |= MA_STATE_PREALLOC;

        if (mas_is_err(mas)) {
                err = xa_err(mas->node);
                mas->node = saved_node;
                mas_destroy(mas);
                return err;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(mas_expected_entries);

static bool mas_next_setup(struct ma_state *mas, unsigned long max,
                void **entry)
{
        bool was_none = mas_is_none(mas);

        if (unlikely(mas->last >= max)) {
                mas->status = ma_overflow;
                return true;
        }

        switch (mas->status) {
        case ma_active:
                return false;
        case ma_none:
                fallthrough;
        case ma_pause:
                mas->status = ma_start;
                fallthrough;
        case ma_start:
                mas_walk(mas); /* Retries on dead nodes handled by mas_walk */
                break;
        case ma_overflow:
                /* Overflowed before, but the max changed */
                mas->status = ma_active;
                break;
        case ma_underflow:
                /* The user expects the mas to be one before where it is */
                mas->status = ma_active;
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        if (likely(mas_is_active(mas))) /* Fast path */
                return false;

        if (mas_is_ptr(mas)) {
                *entry = NULL;
                if (was_none && mas->index == 0) {
                        mas->index = mas->last = 0;
                        return true;
                }
                mas->index = 1;
                mas->last = ULONG_MAX;
                mas->status = ma_none;
                return true;
        }

        if (mas_is_none(mas))
                return true;

        return false;
}

/**
 * mas_next() - Get the next entry.
 * @mas: The maple state
 * @max: The maximum index to check.
 *
 * Returns the next entry after @mas->index.
 * Must hold rcu_read_lock or the write lock.
 * Can return the zero entry.
 *
 * Return: The next entry or %NULL
 */
void *mas_next(struct ma_state *mas, unsigned long max)
{
        void *found = NULL;

        /* Retries on dead nodes handled by mas_next_slot */
        if (!mas_next_setup(mas, max, &found))
                found = mas_next_slot(mas, max, false);

        return found;
}
EXPORT_SYMBOL_GPL(mas_next);

/**
 * mas_next_range() - Advance the maple state to the next range
 * @mas: The maple state
 * @max: The maximum index to check.
 *
 * Sets @mas->index and @mas->last to the range.
 * Must hold rcu_read_lock or the write lock.
 * Can return the zero entry.
 *
 * Return: The next entry or %NULL
 */
void *mas_next_range(struct ma_state *mas, unsigned long max)
{
        void *found = NULL;

        /* Retries on dead nodes handled by mas_next_slot */
        if (!mas_next_setup(mas, max, &found))
                found = mas_next_slot(mas, max, true);

        return found;
}
EXPORT_SYMBOL_GPL(mas_next_range);

/**
 * mt_next() - get the next value in the maple tree
 * @mt: The maple tree
 * @index: The start index
 * @max: The maximum index to check
 *
 * Takes RCU read lock internally to protect the search, which does not
 * protect the returned pointer after dropping RCU read lock.
 * See also: Documentation/core-api/maple_tree.rst
 *
 * Return: The entry higher than @index or %NULL if nothing is found.
 */
void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max)
{
        MA_STATE(mas, mt, index, index);
        void *next;

        rcu_read_lock();
        next = mas_next(&mas, max);
        rcu_read_unlock();

        return next;
}
EXPORT_SYMBOL_GPL(mt_next);

/*
 * mas_prev_setup() - Internal function to set up mas_prev*().
 * @mas: The maple state
 * @min: The minimum index
 * @entry: Pointer to the entry
 *
 * Return: True if @entry already holds the answer, false if the caller still
 * has to step to the previous slot.
 */
static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry)
{
        if (unlikely(mas->index <= min)) {
                mas->status = ma_underflow;
                return true;
        }

        switch (mas->status) {
        case ma_error:
                return true;
        case ma_active:
                return false;
        case ma_start:
        case ma_root:
                break;
        case ma_none:
        case ma_pause:
                mas->status = ma_start;
                break;
        case ma_underflow:
                /* underflowed before but the min changed */
                mas->status = ma_active;
                break;
        case ma_overflow:
                /* User expects mas to be one after where it is */
                mas->status = ma_active;
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        }

        if (mas_is_start(mas))
                mas_walk(mas);

        if (unlikely(mas_is_ptr(mas))) {
                if (mas->index) {
                        mas->last = 0;
                        mas->index = 0;
                        *entry = mas_root(mas);
                } else {
                        mas->status = ma_none;
                }
                return true;
        }

        if (mas_is_none(mas)) {
                if (mas->index) {
                        /* Walked to out-of-range pointer? */
                        mas->last = 0;
                        mas->index = 0;
                        mas->status = ma_root;
                        *entry = mas_root(mas);
                }
                return true;
        }

        return false;
}

/**
 * mas_prev() - Get the previous entry
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * Will reset mas to ma_start if the status is ma_none.  Will stop on not
 * searchable nodes.
 *
 * Return: the previous value or %NULL.
 */
void *mas_prev(struct ma_state *mas, unsigned long min)
{
        void *found = NULL;

        if (!mas_prev_setup(mas, min, &found))
                found = mas_prev_slot(mas, min, false);

        return found;
}
EXPORT_SYMBOL_GPL(mas_prev);

/**
 * mas_prev_range() - Advance to the previous range
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Sets @mas->index and @mas->last to the range.
 * Must hold rcu_read_lock or the write lock.
 * Will reset mas to ma_start if the status is ma_none.  Will stop on not
 * searchable nodes.
 *
 * Return: the previous value or %NULL.
 */
void *mas_prev_range(struct ma_state *mas, unsigned long min)
{
        void *found = NULL;

        if (!mas_prev_setup(mas, min, &found))
                found = mas_prev_slot(mas, min, true);

        return found;
}
EXPORT_SYMBOL_GPL(mas_prev_range);

/**
 * mt_prev() - get the previous value in the maple tree
 * @mt: The maple tree
 * @index: The start index
 * @min: The minimum index to check
 *
 * Takes RCU read lock internally to protect the search, which does not
 * protect the returned pointer after dropping RCU read lock.
 * See also: Documentation/core-api/maple_tree.rst
 *
 * Return: The entry before @index or %NULL if nothing is found.
 */
void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min)
{
        MA_STATE(mas, mt, index, index);
        void *prev;

        rcu_read_lock();
        prev = mas_prev(&mas, min);
        rcu_read_unlock();

        return prev;
}
EXPORT_SYMBOL_GPL(mt_prev);

/**
 * mas_pause() - Pause a mas_find/mas_for_each to drop the lock.
 * @mas: The maple state to pause
 *
 * Some users need to pause a walk and drop the lock they're holding in
 * order to yield to a higher priority thread or carry out an operation
 * on an entry.  Those users should call this function before they drop
 * the lock.  It resets the @mas to be suitable for the next iteration
 * of the loop after the user has reacquired the lock.  If most entries
 * found during a walk require you to call mas_pause(), the mt_for_each()
 * iterator may be more appropriate.
 *
 * Only the status and the node pointer are changed; @mas->index and
 * @mas->last are left as they are.
 */
void mas_pause(struct ma_state *mas)
{
        mas->status = ma_pause;
        /* Drop the node reference so it is not reused after the pause. */
        mas->node = NULL;
}
EXPORT_SYMBOL_GPL(mas_pause);

/**
 * mas_find_setup() - Internal function to set up mas_find*().
 * @mas: The maple state
 * @max: The maximum index
 * @entry: Pointer to the entry
 *
 * Brings @mas from whatever status it is in to a point where the caller can
 * simply advance to the next slot, or reports that the answer is already
 * known.
 *
 * Returns: True if entry is the answer, false otherwise.
 */
static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry)
{
        switch (mas->status) {
        case ma_error:
                return true;
        case ma_active:
                /* Nothing more to find once the limit has been reached. */
                return mas->last >= max;
        case ma_start:
        case ma_root:
                break;
        case ma_pause:
                if (unlikely(mas->last >= max))
                        return true;

                /* Resume right after the range that was last returned. */
                mas->last++;
                mas->index = mas->last;
                mas->status = ma_start;
                break;
        case ma_none:
                if (unlikely(mas->last >= max))
                        return true;

                mas->index = mas->last;
                mas->status = ma_start;
                break;
        case ma_underflow:
                /* mas is pointing at entry before unable to go lower */
                if (unlikely(mas->index >= max)) {
                        mas->status = ma_overflow;
                        return true;
                }

                mas->status = ma_active;
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_overflow:
                if (unlikely(mas->last >= max))
                        return true;

                mas->status = ma_active;
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        }

        if (mas_is_start(mas)) {
                /* First run or continue */
                if (mas->index > max)
                        return true;

                *entry = mas_walk(mas);
                if (*entry)
                        return true;
        }

        if (unlikely(mas_is_ptr(mas))) {
                /* Pointer out of range */
                mas->status = ma_none;
                mas->index = 1;
                mas->last = ULONG_MAX;
                return true;
        }

        if (unlikely(mas_is_none(mas)))
                return true;

        return mas->index == max;
}

/**
 * mas_find() - On the first call, find the entry at or after mas->index up to
 * %max.  Otherwise, find the entry after mas->index.
 * @mas: The maple state
 * @max: The maximum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_overflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find(struct ma_state *mas, unsigned long max)
{
        void *found = NULL;

        if (!mas_find_setup(mas, max, &found)) {
                /* Retries on dead nodes handled by mas_next_slot */
                found = mas_next_slot(mas, max, false);
                /* Ignore overflow */
                mas->status = ma_active;
        }

        return found;
}
EXPORT_SYMBOL_GPL(mas_find);

/**
 * mas_find_range() - On the first call, find the entry at or after
 * mas->index up to %max.  Otherwise, advance to the next slot after
 * mas->index.
 * @mas: The maple state
 * @max: The maximum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_overflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find_range(struct ma_state *mas, unsigned long max)
{
        void *found = NULL;

        /* Retries on dead nodes handled by mas_next_slot */
        if (!mas_find_setup(mas, max, &found))
                found = mas_next_slot(mas, max, true);

        return found;
}
EXPORT_SYMBOL_GPL(mas_find_range);

/**
 * mas_find_rev_setup() - Internal function to set up mas_find_*_rev()
 * @mas: The maple state
 * @min: The minimum index
 * @entry: Pointer to the entry
 *
 * Brings @mas from whatever status it is in to a point where the caller can
 * simply step to the previous slot, or reports that the answer is already
 * known.
 *
 * Returns: True if entry is the answer, false otherwise.
 */
static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min,
                void **entry)
{
        switch (mas->status) {
        case ma_error:
                return true;
        case ma_active:
                return mas->index < min;
        case ma_start:
        case ma_root:
                break;
        case ma_pause:
                if (unlikely(mas->index <= min)) {
                        mas->status = ma_underflow;
                        return true;
                }

                /* Resume right before the range that was last returned. */
                mas->index--;
                mas->last = mas->index;
                mas->status = ma_start;
                break;
        case ma_none:
                if (mas->index <= min) {
                        mas->status = ma_none;
                        return true;
                }

                mas->last = mas->index;
                mas->status = ma_start;
                break;
        case ma_overflow: /* user expects the mas to be one after where it is */
                if (unlikely(mas->index <= min)) {
                        mas->status = ma_underflow;
                        return true;
                }

                mas->status = ma_active;
                break;
        case ma_underflow: /* user expects the mas to be one before where it is */
                if (unlikely(mas->index <= min))
                        return true;

                mas->status = ma_active;
                break;
        }

        if (mas_is_start(mas)) {
                /* First run or continue */
                if (mas->index < min)
                        return true;

                *entry = mas_walk(mas);
                if (*entry)
                        return true;
        }

        if (unlikely(mas_is_ptr(mas))) {
                mas->status = ma_none;
                return true;
        }

        if (unlikely(mas_is_none(mas))) {
                /*
                 * Walked to the location, and there was nothing so the previous
                 * location is 0.
                 */
                mas->index = 0;
                mas->last = 0;
                mas->status = ma_root;
                *entry = mas_root(mas);
                return true;
        }

        return mas->index < min;
}

/**
 * mas_find_rev: On the first call, find the first non-null entry at or below
 * mas->index down to %min.  Otherwise find the first non-null entry below
 * mas->index down to %min.
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_underflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find_rev(struct ma_state *mas, unsigned long min)
{
        void *found = NULL;

        /* Retries on dead nodes handled by mas_prev_slot */
        if (!mas_find_rev_setup(mas, min, &found))
                found = mas_prev_slot(mas, min, false);

        return found;
}
EXPORT_SYMBOL_GPL(mas_find_rev);

/**
 * mas_find_range_rev: On the first call, find the first non-null entry at or
 * below mas->index down to %min.  Otherwise advance to the previous slot before
 * mas->index down to %min.
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_underflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find_range_rev(struct ma_state *mas, unsigned long min)
{
        void *found = NULL;

        /* Retries on dead nodes handled by mas_prev_slot */
        if (!mas_find_rev_setup(mas, min, &found))
                found = mas_prev_slot(mas, min, true);

        return found;
}
EXPORT_SYMBOL_GPL(mas_find_range_rev);

/**
 * mas_erase() - Find the range in which index resides and erase the entire
 * range.
 * @mas: The maple state
 *
 * Must hold the write lock.
 * Searches for @mas->index, sets @mas->index and @mas->last to the range and
 * erases that range by storing %NULL over it.  The store is repeated for as
 * long as mas_nomem() reports that it allocated memory for a retry.
 *
 * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated.
 */
void *mas_erase(struct ma_state *mas)
{
        MA_WR_STATE(wr_mas, mas, NULL);
        void *erased;

        /*
         * NOTE(review): a state cannot presumably be both active and start, so
         * this condition looks like it always holds - confirm intent.
         */
        if (!mas_is_active(mas) || !mas_is_start(mas))
                mas->status = ma_start;

        /* Retry unnecessary when holding the write lock. */
        erased = mas_state_walk(mas);
        if (!erased)
                return NULL;

        do {
                /* Must reset to ensure spanning writes of last slot are detected */
                mas_reset(mas);
                mas_wr_store_setup(&wr_mas);
                mas_wr_store_entry(&wr_mas);
        } while (mas_nomem(mas, GFP_KERNEL));

        return erased;
}
EXPORT_SYMBOL_GPL(mas_erase);

/**
 * mas_nomem() - Check if there was an error allocating and do the allocation
 * if necessary If there are allocations, then free them.
 * @mas: The maple state
 * @gfp: The GFP_FLAGS to use for allocations
 * Return: true on allocation, false otherwise.
 */
bool mas_nomem(struct ma_state *mas, gfp_t gfp)
        __must_hold(mas->tree->ma_lock)
{
        /* Common case: no -ENOMEM is recorded, so clean up and stop. */
        if (likely(mas->node != MA_ERROR(-ENOMEM))) {
                mas_destroy(mas);
                return false;
        }

        if (!gfpflags_allow_blocking(gfp) || mt_external_lock(mas->tree)) {
                /* Allocate with the tree lock still held. */
                mas_alloc_nodes(mas, gfp);
        } else {
                /*
                 * @gfp may block and the tree does not use an external
                 * lock: release the tree lock around the allocation.
                 */
                mtree_unlock(mas->tree);
                mas_alloc_nodes(mas, gfp);
                mtree_lock(mas->tree);
        }

        if (mas_allocated(mas)) {
                /* Put the state back to ma_start for the caller's retry. */
                mas->status = ma_start;
                return true;
        }

        return false;
}

/*
 * maple_tree_init() - Create the slab cache used for maple nodes.
 *
 * The cache objects are sizeof(struct maple_node) bytes and are aligned to
 * that same size.  SLAB_PANIC is passed, so no return value is checked here.
 */
void __init maple_tree_init(void)
{
        const unsigned int node_size = sizeof(struct maple_node);

        maple_node_cache = kmem_cache_create("maple_node", node_size,
                                             node_size, SLAB_PANIC, NULL);
}

/**
 * mtree_load() - Load a value stored in a maple tree
 * @mt: The maple tree
 * @index: The index to load
 *
 * Return: the entry or %NULL
 */
void *mtree_load(struct maple_tree *mt, unsigned long index)
{
        MA_STATE(mas, mt, index, index);
        void *val;

        trace_ma_read(__func__, &mas);
        rcu_read_lock();
        for (;;) {
                val = mas_start(&mas);
                if (unlikely(mas_is_none(&mas)))
                        break;

                if (unlikely(mas_is_ptr(&mas))) {
                        /* The root pointer entry is only returned for index 0. */
                        if (index != 0)
                                val = NULL;

                        break;
                }

                val = mtree_lookup_walk(&mas);
                /*
                 * Walk again only when nothing was found and the state
                 * was put back to ma_start.
                 */
                if (val || likely(!mas_is_start(&mas)))
                        break;
        }
        rcu_read_unlock();

        /* Zero entries are reported to the caller as NULL. */
        return xa_is_zero(val) ? NULL : val;
}
EXPORT_SYMBOL(mtree_load);

/**
 * mtree_store_range() - Store an entry at a given range.
 * @mt: The maple tree
 * @index: The start of the range
 * @last: The end of the range
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not
 * be allocated.
 */
int mtree_store_range(struct maple_tree *mt, unsigned long index,
                unsigned long last, void *entry, gfp_t gfp)
{
        MA_STATE(mas, mt, index, last);
        MA_WR_STATE(wr_mas, &mas, entry);

        trace_ma_write(__func__, &mas, 0, entry);
        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;

        if (index > last)
                return -EINVAL;

        mtree_lock(mt);
        /* Repeat the store while mas_nomem() allocated nodes for a retry. */
        do {
                mas_wr_store_entry(&wr_mas);
        } while (mas_nomem(&mas, gfp));
        mtree_unlock(mt);

        return mas_is_err(&mas) ? xa_err(mas.node) : 0;
}
EXPORT_SYMBOL(mtree_store_range);

/**
 * mtree_store() - Store an entry at a given index.
 * @mt: The maple tree
 * @index: The index to store the value
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not
 * be allocated.
 */
int mtree_store(struct maple_tree *mt, unsigned long index, void *entry,
                 gfp_t gfp)
{
        /* A single index is stored as the one-element range [index, index]. */
        return mtree_store_range(mt, index, index, entry, gfp);
}
EXPORT_SYMBOL(mtree_store);

/**
 * mtree_insert_range() - Insert an entry at a given range if there is no value.
 * @mt: The maple tree
 * @first: The start of the range
 * @last: The end of the range
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Return: 0 on success, -EEXIST if the range is occupied, -EINVAL on invalid
 * request, -ENOMEM if memory could not be allocated.
 */
int mtree_insert_range(struct maple_tree *mt, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp)
{
        MA_STATE(mas, mt, first, last);

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;

        if (first > last)
                return -EINVAL;

        mtree_lock(mt);
        /* Repeat the insert while mas_nomem() allocated nodes for a retry. */
        do {
                mas_insert(&mas, entry);
        } while (mas_nomem(&mas, gfp));
        mtree_unlock(mt);

        return mas_is_err(&mas) ? xa_err(mas.node) : 0;
}
EXPORT_SYMBOL(mtree_insert_range);

/**
 * mtree_insert() - Insert an entry at a given index if there is no value.
 * @mt: The maple tree
 * @index: The index to store the value
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Return: 0 on success, -EEXIST if the range is occupied, -EINVAL on invalid
 * request, -ENOMEM if memory could not be allocated.
 */
int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry,
                 gfp_t gfp)
{
        /* A single index is inserted as the one-element range [index, index]. */
        return mtree_insert_range(mt, index, index, entry, gfp);
}
EXPORT_SYMBOL(mtree_insert);

/**
 * mtree_alloc_range() - Find an empty area and store an entry there.
 * @mt: The maple tree
 * @startp: Set to the start index of the stored range on success
 * @entry: The entry to store
 * @size: The size passed to mas_empty_area()
 * @min: The lower bound passed to mas_empty_area()
 * @max: The upper bound passed to mas_empty_area()
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Takes and releases the tree lock.
 *
 * Return: 0 on success, -EINVAL if @mt is not an allocation tree or @entry
 * is reserved, the non-zero result of mas_empty_area() if the search failed,
 * or the error recorded in the maple state by the insert.
 */
int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp)
{
        MA_STATE(mas, mt, 0, 0);
        int err;

        if (!mt_is_alloc(mt))
                return -EINVAL;

        if (WARN_ON_ONCE(mt_is_reserved(entry)))
                return -EINVAL;

        mtree_lock(mt);
        /*
         * mas_nomem() may release the lock, causing the allocated area
         * to be unavailable, so every retry searches for a free area again.
         */
        do {
                err = mas_empty_area(&mas, min, max, size);
                if (err)
                        goto out_unlock;

                mas_insert(&mas, entry);
        } while (mas_nomem(&mas, gfp));

        if (mas_is_err(&mas))
                err = xa_err(mas.node);
        else
                *startp = mas.index;

out_unlock:
        mtree_unlock(mt);
        return err;
}
EXPORT_SYMBOL(mtree_alloc_range);

/**
 * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree.
 * @mt: The maple tree.
 * @startp: Pointer to ID.
 * @range_lo: Lower bound of range to search.
 * @range_hi: Upper bound of range to search.
 * @entry: The entry to store.
 * @next: Pointer to next ID to allocate.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Finds an empty entry in @mt after @next, stores the new index into
 * the @startp pointer, stores the entry at that index, then updates @next.
 *
 * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag.
 *
 * Context: Any context.  Takes and releases the mt.lock.  May sleep if
 * the @gfp flags permit.
 *
 * Return: 0 if the allocation succeeded without wrapping, 1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no
 * free entries.
 */
int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp)
{
        MA_STATE(mas, mt, 0, 0);
        int result;

        /* The reserved-entry warning is only evaluated for allocation trees. */
        if (!mt_is_alloc(mt) || WARN_ON_ONCE(mt_is_reserved(entry)))
                return -EINVAL;

        mtree_lock(mt);
        result = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi,
                                  next, gfp);
        mtree_unlock(mt);

        return result;
}
EXPORT_SYMBOL(mtree_alloc_cyclic);

/**
 * mtree_alloc_rrange() - Find an empty area with a reverse search and store
 * an entry there.
 * @mt: The maple tree
 * @startp: Set to the start index of the stored range on success
 * @entry: The entry to store
 * @size: The size passed to mas_empty_area_rev()
 * @min: The lower bound passed to mas_empty_area_rev()
 * @max: The upper bound passed to mas_empty_area_rev()
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Takes and releases the tree lock.
 *
 * Return: 0 on success, -EINVAL if @mt is not an allocation tree or @entry
 * is reserved, the non-zero result of mas_empty_area_rev() if the search
 * failed, or the error recorded in the maple state by the insert.
 */
int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp)
{
        MA_STATE(mas, mt, 0, 0);
        int err;

        if (!mt_is_alloc(mt))
                return -EINVAL;

        if (WARN_ON_ONCE(mt_is_reserved(entry)))
                return -EINVAL;

        mtree_lock(mt);
        /*
         * mas_nomem() may release the lock, causing the allocated area
         * to be unavailable, so every retry searches for a free area again.
         */
        do {
                err = mas_empty_area_rev(&mas, min, max, size);
                if (err)
                        goto out_unlock;

                mas_insert(&mas, entry);
        } while (mas_nomem(&mas, gfp));

        if (mas_is_err(&mas))
                err = xa_err(mas.node);
        else
                *startp = mas.index;

out_unlock:
        mtree_unlock(mt);
        return err;
}
EXPORT_SYMBOL(mtree_alloc_rrange);

/**
 * mtree_erase() - Find an index and erase the entire range.
 * @mt: The maple tree
 * @index: The index to erase
 *
 * Erasing is the same as a walk to an entry then a store of a NULL to that
 * ENTIRE range.  In fact, it is implemented as such using the advanced API.
 *
 * Return: The entry stored at the @index or %NULL
 */
void *mtree_erase(struct maple_tree *mt, unsigned long index)
{
        MA_STATE(mas, mt, index, index);
        void *erased;

        trace_ma_op(__func__, &mas);

        /* mas_erase() runs under the tree lock taken here. */
        mtree_lock(mt);
        erased = mas_erase(&mas);
        mtree_unlock(mt);

        return erased;
}
EXPORT_SYMBOL(mtree_erase);

/*
 * mas_dup_free() - Free an incomplete duplication of a tree.
 * @mas: The maple state of an incomplete tree.
 *
 * The parameter @mas->node passed in indicates that the allocation failed on
 * this node. This function frees all nodes starting from @mas->node in the
 * reverse order of mas_dup_build(). There is no need to hold the source tree
 * lock at this time.
 */
static void mas_dup_free(struct ma_state *mas)
{
        struct maple_node *node;
        enum maple_type type;
        void __rcu **slots;
        unsigned char count, i;

        /* Maybe the first node allocation failed. */
        if (mas_is_none(mas))
                return;

        /* Each pass frees all children of one node, until the root is reached. */
        while (!mte_is_root(mas->node)) {
                mas_ascend(mas);
                /*
                 * NOTE(review): presumably mas_ascend() leaves mas->offset at
                 * the slot we came up from - confirm.  A non-zero offset then
                 * means earlier child subtrees exist: step to the previous
                 * child, follow the last slot at every level down to a leaf,
                 * and go back up to that leaf's parent.
                 */
                if (mas->offset) {
                        mas->offset--;
                        do {
                                mas_descend(mas);
                                mas->offset = mas_data_end(mas);
                        } while (!mte_is_leaf(mas->node));

                        mas_ascend(mas);
                }

                node = mte_to_node(mas->node);
                type = mte_node_type(mas->node);
                slots = ma_slots(node, type);
                count = mas_data_end(mas) + 1;
                /* Clear the MAPLE_NODE_MASK bits in each slot before the bulk free. */
                for (i = 0; i < count; i++)
                        ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK;
                mt_free_bulk(count, slots);
        }

        /* Only the root node is left at this point. */
        node = mte_to_node(mas->node);
        mt_free_one(node);
}

/*
 * mas_copy_node() - Copy a maple node and replace the parent.
 * @mas: The maple state of source tree.
 * @new_mas: The maple state of new tree.
 * @parent: The parent of the new node.
 *
 * Copy @mas->node to @new_mas->node, set @parent to be the parent of
 * @new_mas->node. No memory is allocated here; @new_mas->node must already
 * refer to an allocated node.
 */
static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas,
                struct maple_pnode *parent)
{
        struct maple_node *src = mte_to_node(mas->node);
        struct maple_node *dst = mte_to_node(new_mas->node);
        unsigned long parent_bits;

        /* Start from a byte-for-byte copy of the source node. */
        memcpy(dst, src, sizeof(*dst));

        /*
         * Keep the MAPLE_NODE_MASK bits of the source's parent field and
         * combine them with the address of the new parent.
         */
        parent_bits = (unsigned long)src->parent & MAPLE_NODE_MASK;
        dst->parent = ma_parent_ptr(parent_bits | (unsigned long)parent);
}

/*
 * mas_dup_alloc() - Allocate child nodes for a maple node.
 * @mas: The maple state of source tree.
 * @new_mas: The maple state of new tree.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * This function allocates child nodes for @new_mas->node during the duplication
 * process. If memory allocation fails, @mas is set to -ENOMEM.
 */
static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas,
                gfp_t gfp)
{
        struct maple_node *src = mte_to_node(mas->node);
        struct maple_node *dst = mte_to_node(new_mas->node);
        enum maple_type type = mte_node_type(mas->node);
        void __rcu **dst_slots = ma_slots(dst, type);
        unsigned char wanted = mas_data_end(mas) + 1;
        unsigned char got, slot;
        void __rcu **src_slots;
        unsigned long type_bits;

        /* Allocate one child per used slot, directly into the new slot array. */
        got = mt_alloc_bulk(gfp, wanted, (void **)dst_slots);
        if (unlikely(got < wanted)) {
                /* Leave no stale pointers in the requested slots. */
                memset(dst_slots, 0, wanted * sizeof(void *));
                mas_set_err(mas, -ENOMEM);
                return;
        }

        /* Copy the MAPLE_NODE_MASK bits of each source slot onto the new pointer. */
        src_slots = ma_slots(src, type);
        for (slot = 0; slot < got; slot++) {
                type_bits = (unsigned long)mt_slot_locked(mas->tree,
                                                          src_slots, slot);
                type_bits &= MAPLE_NODE_MASK;
                ((unsigned long *)dst_slots)[slot] |= type_bits;
        }
}

/*
 * mas_dup_build() - Build a new maple tree from a source tree
 * @mas: The maple state of source tree, needs to be in the ma_start state.
 * @new_mas: The maple state of new tree, needs to be in the ma_start state.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * This function builds a new tree in DFS preorder. If the memory allocation
 * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the
 * last node. mas_dup_free() will free the incomplete duplication of a tree.
 *
 * Note that the attributes of the two trees need to be exactly the same, and the
 * new tree needs to be empty, otherwise -EINVAL will be set in @mas.
 */
static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas,
                gfp_t gfp)
{
        struct maple_node *node;
        struct maple_pnode *parent = NULL;
        struct maple_enode *root;
        enum maple_type type;

        /* Both trees must have the same attributes and the target must be empty. */
        if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) ||
            unlikely(!mtree_empty(new_mas->tree))) {
                mas_set_err(mas, -EINVAL);
                return;
        }

        /*
         * With no nodes in the source (pointer root or none), whatever
         * mas_start() returned is published as the new root without copying.
         */
        root = mas_start(mas);
        if (mas_is_ptr(mas) || mas_is_none(mas))
                goto set_new_tree;

        node = mt_alloc_one(gfp);
        if (!node) {
                /* ma_none makes mas_dup_free() return without freeing anything. */
                new_mas->status = ma_none;
                mas_set_err(mas, -ENOMEM);
                return;
        }

        type = mte_node_type(mas->node);
        root = mt_mk_node(node, type);
        new_mas->node = root;
        new_mas->min = 0;
        new_mas->max = ULONG_MAX;
        /* @root is only used again after the loop, at done/set_new_tree. */
        root = mte_mk_root(root);
        while (1) {
                /* Copy the current source node into the current new node. */
                mas_copy_node(mas, new_mas, parent);
                if (!mte_is_leaf(mas->node)) {
                        /* Only allocate child nodes for non-leaf nodes. */
                        mas_dup_alloc(mas, new_mas, gfp);
                        if (unlikely(mas_is_err(mas)))
                                return;
                } else {
                        /*
                         * This is the last leaf node and duplication is
                         * completed.
                         */
                        if (mas->max == ULONG_MAX)
                                goto done;

                        /* This is not the last leaf node and needs to go up. */
                        do {
                                mas_ascend(mas);
                                mas_ascend(new_mas);
                        } while (mas->offset == mas_data_end(mas));

                        /* Move to the next subtree. */
                        mas->offset++;
                        new_mas->offset++;
                }

                /*
                 * Descend both states in lockstep; the new node we descend
                 * from becomes the parent used by the next mas_copy_node().
                 */
                mas_descend(mas);
                parent = ma_parent_ptr(mte_to_node(new_mas->node));
                mas_descend(new_mas);
                mas->offset = 0;
                new_mas->offset = 0;
        }
done:
        /* Specially handle the parent of the root node. */
        mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas));
set_new_tree:
        /* Make them the same height */
        new_mas->tree->ma_flags = mas->tree->ma_flags;
        rcu_assign_pointer(new_mas->tree->ma_root, root);
}

/**
 * __mt_dup(): Duplicate an entire maple tree
 * @mt: The source maple tree
 * @new: The new maple tree
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * This function duplicates a maple tree in Depth-First Search (DFS) pre-order
 * traversal. It uses memcpy() to copy nodes in the source tree and allocate
 * new child nodes in non-leaf nodes. The new node is exactly the same as the
 * source node except for all the addresses stored in it. It will be faster than
 * traversing all elements in the source tree and inserting them one by one into
 * the new tree.
 * The user needs to ensure that the attributes of the source tree and the new
 * tree are the same, and the new tree needs to be an empty tree, otherwise
 * -EINVAL will be returned.
 * Note that the user needs to manually lock the source tree and the new tree.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL if
 * the attributes of the two trees are different or the new tree is not an empty
 * tree.
 */
int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp)
{
        MA_STATE(mas, mt, 0, 0);
        MA_STATE(new_mas, new, 0, 0);
        int err;

        mas_dup_build(&mas, &new_mas, gfp);
        if (likely(!mas_is_err(&mas)))
                return 0;

        err = xa_err(mas.node);
        /* Only an allocation failure leaves a partial copy to free. */
        if (err == -ENOMEM)
                mas_dup_free(&new_mas);

        return err;
}
EXPORT_SYMBOL(__mt_dup);

/**
 * mtree_dup(): Duplicate an entire maple tree
 * @mt: The source maple tree
 * @new: The new maple tree
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * This function duplicates a maple tree in Depth-First Search (DFS) pre-order
 * traversal. It uses memcpy() to copy nodes in the source tree and allocate
 * new child nodes in non-leaf nodes. The new node is exactly the same as the
 * source node except for all the addresses stored in it. It will be faster than
 * traversing all elements in the source tree and inserting them one by one into
 * the new tree.
 * The user needs to ensure that the attributes of the source tree and the new
 * tree are the same, and the new tree needs to be an empty tree, otherwise
 * -EINVAL will be returned.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL if
 * the attributes of the two trees are different or the new tree is not an empty
 * tree.
 */
int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp)
{
        MA_STATE(mas, mt, 0, 0);
        MA_STATE(new_mas, new, 0, 0);
        int err = 0;

        /* The new tree is locked first, then the source as a nested lock. */
        mas_lock(&new_mas);
        mas_lock_nested(&mas, SINGLE_DEPTH_NESTING);
        mas_dup_build(&mas, &new_mas, gfp);
        mas_unlock(&mas);

        if (likely(!mas_is_err(&mas)))
                goto out_unlock;

        err = xa_err(mas.node);
        /* The partial copy is freed with only the new tree's lock held. */
        if (err == -ENOMEM)
                mas_dup_free(&new_mas);

out_unlock:
        mas_unlock(&new_mas);
        return err;
}
EXPORT_SYMBOL(mtree_dup);

/**
 * __mt_destroy() - Walk and free all nodes of a locked maple tree.
 * @mt: The maple tree
 *
 * Note: Does not handle locking.
 */
void __mt_destroy(struct maple_tree *mt)
{
        void *old_root = mt_root_locked(mt);

        /* Clear the root pointer before walking and freeing the old nodes. */
        rcu_assign_pointer(mt->ma_root, NULL);

        /* A root that is not a node has no nodes to free. */
        if (xa_is_node(old_root))
                mte_destroy_walk(old_root, mt);

        /* Reduce the flags to what mt_attr() reports for this tree. */
        mt->ma_flags = mt_attr(mt);
}
EXPORT_SYMBOL_GPL(__mt_destroy);

/**
 * mtree_destroy() - Destroy a maple tree
 * @mt: The maple tree
 *
 * Frees all resources used by the tree.  Handles locking.
 */
void mtree_destroy(struct maple_tree *mt)
{
        /* __mt_destroy() does no locking itself, so take the tree lock around it. */
        mtree_lock(mt);
        __mt_destroy(mt);
        mtree_unlock(mt);
}
EXPORT_SYMBOL(mtree_destroy);

/**
 * mt_find() - Search from the start up until an entry is found.
 * @mt: The maple tree
 * @index: Pointer which contains the start location of the search
 * @max: The maximum value of the search range
 *
 * Takes RCU read lock internally to protect the search, which does not
 * protect the returned pointer after dropping RCU read lock.
 * See also: Documentation/core-api/maple_tree.rst
 *
 * If an entry is found, @index is updated to point to the next possible
 * entry, regardless of whether the found entry occupies a single index or
 * a range of indices.
 *
 * Return: The entry at or after the @index or %NULL
 */
void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max)
{
        MA_STATE(mas, mt, *index, *index);
        void *found;
#ifdef CONFIG_DEBUG_MAPLE_TREE
        unsigned long start = *index;
#endif

        trace_ma_read(__func__, &mas);

        if ((*index) > max)
                return NULL;

        rcu_read_lock();
        /* Walk again for as long as the walk leaves the state at ma_start. */
        do {
                found = mas_state_walk(&mas);
        } while (mas_is_start(&mas));

        if (unlikely(xa_is_zero(found)))
                found = NULL;

        if (!found) {
                /* Nothing usable at *index: step forward up to @max. */
                while (mas_is_active(&mas) && (mas.last < max)) {
                        found = mas_next_entry(&mas, max);
                        if (likely(found && !xa_is_zero(found)))
                                break;
                }

                if (unlikely(xa_is_zero(found)))
                        found = NULL;
        }
        rcu_read_unlock();

        if (unlikely(!found))
                return NULL;

        /* Advance the caller's cursor past the range that was found. */
        *index = mas.last + 1;
#ifdef CONFIG_DEBUG_MAPLE_TREE
        if (MT_WARN_ON(mt, (*index) && ((*index) <= start)))
                pr_err("index not increased! %lx <= %lx\n",
                       *index, start);
#endif

        return found;
}
EXPORT_SYMBOL(mt_find);

/**
 * mt_find_after() - Search from the start up until an entry is found.
 * @mt: The maple tree
 * @index: Pointer which contains the start location of the search
 * @max: The maximum value to check
 *
 * Same as mt_find() except that it checks @index for 0 before
 * searching. If @index == 0, the search is aborted. This covers a wrap
 * around of @index to 0 in an iterator loop.
 *
 * Return: The entry at or after the @index or %NULL
 */
void *mt_find_after(struct maple_tree *mt, unsigned long *index,
                    unsigned long max)
{
        /* An @index of 0 aborts the search (wrap-around in an iterator loop). */
        return *index ? mt_find(mt, index, max) : NULL;
}
EXPORT_SYMBOL(mt_find_after);

#ifdef CONFIG_DEBUG_MAPLE_TREE
/*
 * Test counters, only built with CONFIG_DEBUG_MAPLE_TREE and exported to
 * GPL modules.  NOTE(review): presumably updated by the maple tree test
 * code - they are not modified in this part of the file.
 */
atomic_t maple_tree_tests_run;
EXPORT_SYMBOL_GPL(maple_tree_tests_run);
atomic_t maple_tree_tests_passed;
EXPORT_SYMBOL_GPL(maple_tree_tests_passed);

#ifndef __KERNEL__
/*
 * Userspace-only test hook (built when __KERNEL__ is not defined): forwards
 * @val to kmem_cache_set_non_kernel() for the maple node cache.
 */
extern void kmem_cache_set_non_kernel(struct kmem_cache *, unsigned int);
void mt_set_non_kernel(unsigned int val)
{
        kmem_cache_set_non_kernel(maple_node_cache, val);
}

/*
 * Userspace-only test hook: returns what kmem_cache_get_alloc() reports for
 * the maple node cache.
 */
extern unsigned long kmem_cache_get_alloc(struct kmem_cache *);
unsigned long mt_get_alloc_size(void)
{
        return kmem_cache_get_alloc(maple_node_cache);
}

/*
 * Userspace-only test hook: calls kmem_cache_zero_nr_tallocated() on the
 * maple node cache.
 */
extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *);
void mt_zero_nr_tallocated(void)
{
        kmem_cache_zero_nr_tallocated(maple_node_cache);
}

/*
 * Userspace-only test hook: returns what kmem_cache_nr_tallocated() reports
 * for the maple node cache.
 */
extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *);
unsigned int mt_nr_tallocated(void)
{
        return kmem_cache_nr_tallocated(maple_node_cache);
}

/*
 * Userspace-only test hook: returns what kmem_cache_nr_allocated() reports
 * for the maple node cache.
 */
extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *);
unsigned int mt_nr_allocated(void)
{
        return kmem_cache_nr_allocated(maple_node_cache);
}

/* Userspace build: intentionally a no-op; the kernel build has its own version. */
void mt_cache_shrink(void)
{
}
#else
/*
 * mt_cache_shrink() - For testing, don't use this.
 *
 * Certain testcases can trigger an OOM when combined with other memory
 * debugging configuration options.  This function is used to reduce the
 * possibility of an out of memory event due to kmem_cache objects remaining
 * around for longer than usual.
 */
void mt_cache_shrink(void)
{
        /* Shrink the maple node cache. */
        kmem_cache_shrink(maple_node_cache);
}
EXPORT_SYMBOL_GPL(mt_cache_shrink);

#endif /* not defined __KERNEL__ */
/*
 * mas_get_slot() - Get the entry in the maple state node stored at @offset.
 * @mas: The maple state
 * @offset: The offset into the slot array to fetch.
 *
 * Return: The entry stored at @offset.
 */
static inline struct maple_enode *mas_get_slot(struct ma_state *mas,
                unsigned char offset)
{
        return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)),
                        offset);
}

/* Depth first search, post-order */
/*
 * mas_dfs_postorder() - Advance @mas by one step of a depth first, post-order
 * walk.
 * @mas: The maple state
 * @max: The limit passed to mas_next_node()
 *
 * If mas_next_node() does not overflow, @mas is left wherever it moved to.
 * Otherwise, unless the starting node was the root, @mas is moved up one level
 * and then back with mas_prev_node() until it underflows; the last node
 * visited before the underflow, with its min and max, is restored.
 */
static void mas_dfs_postorder(struct ma_state *mas, unsigned long max)
{

        struct maple_enode *p, *mn = mas->node;
        unsigned long p_min, p_max;

        mas_next_node(mas, mas_mn(mas), max);
        if (!mas_is_overflow(mas))
                return;

        /* Overflowed from the root: nothing is changed any further. */
        if (mte_is_root(mn))
                return;

        /* Go back to the starting node and climb to its parent. */
        mas->node = mn;
        mas_ascend(mas);
        /* Remember each position before stepping back, until underflow. */
        do {
                p = mas->node;
                p_min = mas->min;
                p_max = mas->max;
                mas_prev_node(mas, 0);
        } while (!mas_is_underflow(mas));

        /* Restore the last position recorded before the underflow. */
        mas->node = p;
        mas->max = p_max;
        mas->min = p_min;
}

/* Tree validations */
static void mt_dump_node(const struct maple_tree *mt, void *entry,
                unsigned long min, unsigned long max, unsigned int depth,
                enum mt_dump_format format);
/*
 * mt_dump_range() - Print the indented "min-max: " prefix of a dump line.
 * @min: First index of the range
 * @max: Last index of the range
 * @depth: Nesting depth; two spaces of indent are printed per level
 * @format: Hexadecimal or decimal output
 *
 * A range with @min == @max is printed as a single index.
 */
static void mt_dump_range(unsigned long min, unsigned long max,
                          unsigned int depth, enum mt_dump_format format)
{
        static const char spaces[] = "                                ";
        const unsigned int indent = depth * 2;
        const bool single = (min == max);

        switch (format) {
        case mt_dump_hex:
                if (single)
                        pr_info("%.*s%lx: ", indent, spaces, min);
                else
                        pr_info("%.*s%lx-%lx: ", indent, spaces, min, max);
                break;
        case mt_dump_dec:
                if (single)
                        pr_info("%.*s%lu: ", indent, spaces, min);
                else
                        pr_info("%.*s%lu-%lu: ", indent, spaces, min, max);
                break;
        }
}

/*
 * mt_dump_entry() - Print one entry, prefixed by its range.
 * @entry: The entry to print
 * @min: First index of the range
 * @max: Last index of the range
 * @depth: Nesting depth used for the indent
 * @format: Hexadecimal or decimal output for the range prefix
 *
 * Value entries, zero entries and reserved entries each get their own
 * wording; anything else is printed as a plain pointer.
 */
static void mt_dump_entry(void *entry, unsigned long min, unsigned long max,
                          unsigned int depth, enum mt_dump_format format)
{
        mt_dump_range(min, max, depth, format);

        if (xa_is_value(entry)) {
                pr_cont("value %ld (0x%lx) [%p]\n", xa_to_value(entry),
                                xa_to_value(entry), entry);
                return;
        }

        if (xa_is_zero(entry)) {
                pr_cont("zero (%ld)\n", xa_to_internal(entry));
                return;
        }

        if (mt_is_reserved(entry)) {
                pr_cont("UNKNOWN ENTRY (%p)\n", entry);
                return;
        }

        pr_cont("%p\n", entry);
}

/*
 * mt_dump_range64() - Dump a range64 node (leaf or internal) and its children.
 * @mt: The maple tree
 * @entry: The encoded node to dump
 * @min: Lowest index covered by the node
 * @max: Highest index covered by the node
 * @depth: Nesting depth of the node; children are printed at @depth + 1
 * @format: Hexadecimal or decimal output
 *
 * First prints the raw slot/pivot contents on one line, then walks the slots:
 * leaf slots are printed as entries, non-NULL internal slots are dumped
 * recursively.
 */
static void mt_dump_range64(const struct maple_tree *mt, void *entry,
                unsigned long min, unsigned long max, unsigned int depth,
                enum mt_dump_format format)
{
        struct maple_range_64 *node = &mte_to_node(entry)->mr64;
        bool leaf = mte_is_leaf(entry);
        unsigned long first = min;
        int i;

        pr_cont(" contents: ");
        /* Every slot but the last has a pivot stored next to it. */
        for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) {
                switch(format) {
                case mt_dump_hex:
                        pr_cont("%p %lX ", node->slot[i], node->pivot[i]);
                        break;
                case mt_dump_dec:
                        pr_cont("%p %lu ", node->slot[i], node->pivot[i]);
                }
        }
        pr_cont("%p\n", node->slot[i]);
        for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) {
                /* The final slot has no pivot; its range ends at @max. */
                unsigned long last = max;

                if (i < (MAPLE_RANGE64_SLOTS - 1))
                        last = node->pivot[i];
                else if (!node->slot[i] && max != mt_node_max(entry))
                        break;
                /* A zero pivot past the first slot ends the walk. */
                if (last == 0 && i > 0)
                        break;
                if (leaf)
                        mt_dump_entry(mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);
                else if (node->slot[i])
                        mt_dump_node(mt, mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);

                if (last == max)
                        break;
                /* Report a pivot beyond the node's max; the walk continues. */
                if (last > max) {
                        switch(format) {
                        case mt_dump_hex:
                                pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n",
                                        node, last, max, i);
                                break;
                        case mt_dump_dec:
                                pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n",
                                        node, last, max, i);
                        }
                }
                first = last + 1;
        }
}

/*
 * mt_dump_arange64() - Dump an arange64 node and its children.
 * @mt: The maple tree
 * @entry: The encoded node to dump
 * @min: Lowest index covered by the node
 * @max: Highest index covered by the node
 * @depth: Nesting depth of the node; children are printed at @depth + 1
 * @format: Hexadecimal or decimal output
 *
 * Prints the gap array, the meta end/gap bytes and the raw slot/pivot
 * contents on one line, then walks the slots: leaf slots are printed as
 * entries, non-NULL internal slots are dumped recursively.  A pivot larger
 * than @max is reported in the requested @format and ends the walk.
 */
static void mt_dump_arange64(const struct maple_tree *mt, void *entry,
        unsigned long min, unsigned long max, unsigned int depth,
        enum mt_dump_format format)
{
        struct maple_arange_64 *node = &mte_to_node(entry)->ma64;
        bool leaf = mte_is_leaf(entry);
        unsigned long first = min;
        int i;

        pr_cont(" contents: ");
        for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) {
                switch (format) {
                case mt_dump_hex:
                        pr_cont("%lx ", node->gap[i]);
                        break;
                case mt_dump_dec:
                        pr_cont("%lu ", node->gap[i]);
                }
        }
        pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap);
        /* Every slot but the last has a pivot stored next to it. */
        for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) {
                switch (format) {
                case mt_dump_hex:
                        pr_cont("%p %lX ", node->slot[i], node->pivot[i]);
                        break;
                case mt_dump_dec:
                        pr_cont("%p %lu ", node->slot[i], node->pivot[i]);
                }
        }
        pr_cont("%p\n", node->slot[i]);
        for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) {
                /* The final slot has no pivot; its range ends at @max. */
                unsigned long last = max;

                if (i < (MAPLE_ARANGE64_SLOTS - 1))
                        last = node->pivot[i];
                else if (!node->slot[i])
                        break;
                /* A zero pivot past the first slot ends the walk. */
                if (last == 0 && i > 0)
                        break;
                if (leaf)
                        mt_dump_entry(mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);
                else if (node->slot[i])
                        mt_dump_node(mt, mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);

                if (last == max)
                        break;
                if (last > max) {
                        /*
                         * Honor @format like mt_dump_range64() does, so the
                         * numbers match the rest of a hexadecimal dump.
                         */
                        switch (format) {
                        case mt_dump_hex:
                                pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n",
                                        node, last, max, i);
                                break;
                        case mt_dump_dec:
                                pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n",
                                        node, last, max, i);
                                break;
                        }
                        break;
                }
                first = last + 1;
        }
}

/*
 * mt_dump_node() - Print a node header and dispatch on the node type.
 * @mt: The maple tree
 * @entry: The encoded node to dump
 * @min: Lowest index covered by the node
 * @max: Highest index covered by the node
 * @depth: Nesting depth of the node
 * @format: Hexadecimal or decimal output
 */
static void mt_dump_node(const struct maple_tree *mt, void *entry,
                unsigned long min, unsigned long max, unsigned int depth,
                enum mt_dump_format format)
{
        struct maple_node *node = mte_to_node(entry);
        unsigned int type = mte_node_type(entry);
        unsigned int slot;

        mt_dump_range(min, max, depth, format);

        pr_cont("node %p depth %d type %d parent %p", node, depth, type,
                        node ? node->parent : NULL);

        switch (type) {
        case maple_arange_64:
                mt_dump_arange64(mt, entry, min, max, depth, format);
                break;
        case maple_leaf_64:
        case maple_range_64:
                mt_dump_range64(mt, entry, min, max, depth, format);
                break;
        case maple_dense:
                pr_cont("\n");
                /* Dense nodes hold one entry per consecutive index from @min. */
                for (slot = 0; slot < MAPLE_NODE_SLOTS; slot++) {
                        const unsigned long index = min + slot;

                        if (index > max)
                                pr_cont("OUT OF RANGE: ");
                        mt_dump_entry(mt_slot(mt, node->slot, slot),
                                        index, index, depth, format);
                }
                break;
        default:
                pr_cont(" UNKNOWN TYPE\n");
        }
}

/*
 * Dump a whole tree to the kernel log.
 * @mt: tree to dump
 * @format: output format handed to the entry/node printers
 *
 * Prints a header with the tree's flags, height and root, then either the
 * single root entry (when the root is not a node) or the root node and
 * everything below it.
 */
void mt_dump(const struct maple_tree *mt, enum mt_dump_format format)
{
        void *root = rcu_dereference_check(mt->ma_root, mt_locked(mt));

        pr_info("maple_tree(%p) flags %X, height %u root %p\n",
                 mt, mt->ma_flags, mt_height(mt), root);

        if (!xa_is_node(root)) {
                mt_dump_entry(root, 0, 0, 0, format);
                return;
        }

        if (root)
                mt_dump_node(mt, root, 0, mt_node_max(root), 0, format);
}
EXPORT_SYMBOL_GPL(mt_dump);

/*
 * Calculate the maximum gap in a node and check if that's what is reported in
 * the parent (unless root).
 *
 * @mas: maple state whose ->node is the node to check; ->min and ->max are
 *       used as the index range covered by that node.
 *
 * Every inconsistency is reported with pr_err() and then MT_BUG_ON().
 */
static void mas_validate_gaps(struct ma_state *mas)
{
        struct maple_enode *mte = mas->node;
        struct maple_node *p_mn, *node = mte_to_node(mte);
        enum maple_type mt = mte_node_type(mas->node);
        unsigned long gap = 0, max_gap = 0;
        unsigned long p_end, p_start = mas->min;
        unsigned char p_slot, offset;
        unsigned long *gaps = NULL;
        unsigned long *pivots = ma_pivots(node, mt);
        unsigned int i;

        if (ma_is_dense(mt)) {
                /*
                 * Dense node: count runs of consecutive empty slots. A run is
                 * folded into max_gap when an occupied slot ends it.
                 * NOTE(review): a run that extends to the last slot is never
                 * compared against max_gap - confirm this is intended.
                 */
                for (i = 0; i < mt_slot_count(mte); i++) {
                        if (mas_get_slot(mas, i)) {
                                if (gap > max_gap)
                                        max_gap = gap;
                                gap = 0;
                                continue;
                        }
                        gap++;
                }
                goto counted;
        }

        /* gaps may be NULL; both cases are handled per slot below. */
        gaps = ma_gaps(node, mt);
        for (i = 0; i < mt_slot_count(mte); i++) {
                p_end = mas_safe_pivot(mas, pivots, i, mt);

                if (!gaps) {
                        /* No gap array: an empty slot is a gap of its range. */
                        if (!mas_get_slot(mas, i))
                                gap = p_end - p_start + 1;
                } else {
                        void *entry = mas_get_slot(mas, i);

                        /*
                         * Gap array present: the slot must be populated and
                         * its recorded gap cannot exceed the slot's range.
                         */
                        gap = gaps[i];
                        MT_BUG_ON(mas->tree, !entry);

                        if (gap > p_end - p_start + 1) {
                                pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n",
                                       mas_mn(mas), i, gap, p_end, p_start,
                                       p_end - p_start + 1);
                                MT_BUG_ON(mas->tree, gap > p_end - p_start + 1);
                        }
                }

                if (gap > max_gap)
                        max_gap = gap;

                /* Stop once the node's max is reached; i stays on that slot. */
                p_start = p_end + 1;
                if (p_end >= mas->max)
                        break;
        }

counted:
        if (mt == maple_arange_64) {
                /*
                 * The gap offset stored in the node metadata must not lie past
                 * the last slot visited and must index the largest gap.
                 */
                MT_BUG_ON(mas->tree, !gaps);
                offset = ma_meta_gap(node);
                if (offset > i) {
                        pr_err("gap offset %p[%u] is invalid\n", node, offset);
                        MT_BUG_ON(mas->tree, 1);
                }

                if (gaps[offset] != max_gap) {
                        pr_err("gap %p[%u] is not the largest gap %lu\n",
                               node, offset, max_gap);
                        MT_BUG_ON(mas->tree, 1);
                }

                /* Gap entries after the last slot visited must all be zero. */
                for (i++ ; i < mt_slot_count(mte); i++) {
                        if (gaps[i] != 0) {
                                pr_err("gap %p[%u] beyond node limit != 0\n",
                                       node, i);
                                MT_BUG_ON(mas->tree, 1);
                        }
                }
        }

        if (mte_is_root(mte))
                return;

        /* The parent's gap entry for this node must equal the computed max. */
        p_slot = mte_parent_slot(mas->node);
        p_mn = mte_parent(mte);
        MT_BUG_ON(mas->tree, max_gap > mas->max);
        if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) {
                pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap);
                mt_dump(mas->tree, mt_dump_hex);
                MT_BUG_ON(mas->tree, 1);
        }
}

/*
 * Check that the parent of mas->node refers to it from exactly one slot: the
 * slot recorded for the node must hold it, and no other parent slot may.
 * The root is skipped.
 */
static void mas_validate_parent_slot(struct ma_state *mas)
{
        struct maple_node *parent;
        struct maple_enode *node;
        enum maple_type p_type;
        unsigned char p_slot;
        void __rcu **slots;
        int i;

        if (mte_is_root(mas->node))
                return;

        parent = mte_parent(mas->node);
        p_type = mas_parent_type(mas, mas->node);
        p_slot = mte_parent_slot(mas->node);
        slots = ma_slots(parent, p_type);
        MT_BUG_ON(mas->tree, mas_mn(mas) == parent);

        for (i = 0; i < mt_slots[p_type]; i++) {
                node = mas_slot(mas, slots, i);

                if (i != p_slot) {
                        /* Any other slot must not point back at this node. */
                        if (node != mas->node)
                                continue;

                        pr_err("Invalid child %p at parent %p[%u] p_slot %u\n",
                               mas_mn(mas), parent, i, p_slot);
                        MT_BUG_ON(mas->tree, node == mas->node);
                        continue;
                }

                /* The recorded slot must hold this node. */
                if (node != mas->node)
                        pr_err("parent %p[%u] does not have %p\n",
                                parent, i, mas_mn(mas));
                MT_BUG_ON(mas->tree, node != mas->node);
        }
}

/*
 * For a non-leaf node, check every child up to the one whose pivot equals
 * mas->max: the child must exist, must record the slot it was found in, and
 * must name this node as its parent.
 */
static void mas_validate_child_slot(struct ma_state *mas)
{
        enum maple_type type = mte_node_type(mas->node);
        struct maple_node *self = mte_to_node(mas->node);
        void __rcu **slots = ma_slots(self, type);
        unsigned long *pivots = ma_pivots(self, type);
        struct maple_enode *child;
        unsigned char i;

        if (mte_is_leaf(mas->node))
                return;

        for (i = 0; i < mt_slots[type]; i++) {
                child = mas_slot(mas, slots, i);

                if (!child) {
                        pr_err("Non-leaf node lacks child at %p[%u]\n",
                               mas_mn(mas), i);
                        MT_BUG_ON(mas->tree, 1);
                }

                if (mte_parent_slot(child) != i) {
                        pr_err("Slot error at %p[%u]: child %p has pslot %u\n",
                               mas_mn(mas), i, mte_to_node(child),
                               mte_parent_slot(child));
                        MT_BUG_ON(mas->tree, 1);
                }

                if (mte_parent(child) != self) {
                        pr_err("child %p has parent %p not %p\n",
                               mte_to_node(child), mte_parent(child),
                               self);
                        MT_BUG_ON(mas->tree, 1);
                }

                /* Slots without a pivot of their own cannot end the walk. */
                if (i >= mt_pivots[type])
                        continue;

                if (pivots[i] == mas->max)
                        break;
        }
}

/*
 * Validate all pivots are within mas->min and mas->max, check metadata ends
 * where the maximum ends and ensure there are no slots or pivots set outside
 * of the end of the data.
 *
 * @mas: maple state whose ->node is checked against ->min and ->max.
 *
 * Pivot problems are reported with pr_err() + MAS_WARN_ON(); a wrong data end
 * or a stray slot entry is reported with pr_err() + MT_BUG_ON().
 */
static void mas_validate_limits(struct ma_state *mas)
{
        int i;
        unsigned long prev_piv = 0;
        enum maple_type type = mte_node_type(mas->node);
        void __rcu **slots = ma_slots(mte_to_node(mas->node), type);
        unsigned long *pivots = ma_pivots(mas_mn(mas), type);

        /* Walk the pivots in use; the walk ends on the pivot equal to max. */
        for (i = 0; i < mt_slots[type]; i++) {
                unsigned long piv;

                piv = mas_safe_pivot(mas, pivots, i, type);

                /* Only the first pivot may legitimately be zero. */
                if (!piv && (i != 0)) {
                        pr_err("Missing node limit pivot at %p[%u]\n",
                               mas_mn(mas), i);
                        MAS_WARN_ON(mas, 1);
                }

                /* Pivots must not decrease. */
                if (prev_piv > piv) {
                        pr_err("%p[%u] piv %lu < prev_piv %lu\n",
                                mas_mn(mas), i, piv, prev_piv);
                        MAS_WARN_ON(mas, piv < prev_piv);
                }

                if (piv < mas->min) {
                        pr_err("%p[%u] %lu < %lu\n", mas_mn(mas), i,
                                piv, mas->min);
                        MAS_WARN_ON(mas, piv < mas->min);
                }
                if (piv > mas->max) {
                        pr_err("%p[%u] %lu > %lu\n", mas_mn(mas), i,
                                piv, mas->max);
                        MAS_WARN_ON(mas, piv > mas->max);
                }
                prev_piv = piv;
                if (piv == mas->max)
                        break;
        }

        /* The recorded data end must be the slot where the walk stopped. */
        if (mas_data_end(mas) != i) {
                pr_err("node%p: data_end %u != the last slot offset %u\n",
                       mas_mn(mas), mas_data_end(mas), i);
                MT_BUG_ON(mas->tree, 1);
        }

        /* Everything past the data end is expected to be clear. */
        for (i += 1; i < mt_slots[type]; i++) {
                void *entry = mas_slot(mas, slots, i);

                /* The very last slot is exempt from the entry check. */
                if (entry && (i != mt_slots[type] - 1)) {
                        pr_err("%p[%u] should not have entry %p\n", mas_mn(mas),
                               i, entry);
                        MT_BUG_ON(mas->tree, entry != NULL);
                }

                if (i < mt_pivots[type]) {
                        unsigned long piv = pivots[i];

                        if (!piv)
                                continue;

                        /* Always logged; warns for all but the last pivot. */
                        pr_err("%p[%u] should not have piv %lu\n",
                               mas_mn(mas), i, piv);
                        MAS_WARN_ON(mas, i < mt_pivots[type] - 1);
                }
        }
}

/*
 * Walk the leaf slots of the tree in order and complain if two consecutive
 * slots are both NULL, including across a node boundary.
 */
static void mt_validate_nulls(struct maple_tree *mt)
{
        /* Non-NULL sentinel so the first slot is never a "second" NULL. */
        void *entry, *last = (void *)1;
        unsigned char offset = 0;
        void __rcu **slots;
        MA_STATE(mas, mt, 0, 0);

        mas_start(&mas);
        if (mas_is_none(&mas) || (mas_is_ptr(&mas)))
                return;

        while (!mte_is_leaf(mas.node))
                mas_descend(&mas);

        slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node));
        for (;;) {
                entry = mas_slot(&mas, slots, offset);
                if (!last && !entry) {
                        pr_err("Sequential nulls end at %p[%u]\n",
                                mas_mn(&mas), offset);
                }
                MT_BUG_ON(mt, !last && !entry);
                last = entry;

                if (offset != mas_data_end(&mas)) {
                        offset++;
                } else {
                        /* End of this node: continue in the next one. */
                        mas_next_node(&mas, mas_mn(&mas), ULONG_MAX);
                        if (mas_is_overflow(&mas))
                                return;
                        offset = 0;
                        slots = ma_slots(mte_to_node(mas.node),
                                         mte_node_type(mas.node));
                }

                if (mas_is_overflow(&mas))
                        break;
        }
}

/*
 * Validate a maple tree. Under rcu_read_lock(), every node is visited in the
 * order given by mas_dfs_postorder(), starting from the first leaf reached by
 * descending, and checked for:
 * 1. not being a dead node and holding at least the minimum slot count
 *    (unless the node's max is ULONG_MAX)
 * 2. a correct parent slot
 * 3. the limits (pivots are within mas->min to mas->max)
 * 4. correct child slots
 * 5. the gap being correctly set in the parents (allocation trees only)
 * Finally the leaves are checked for sequential NULL entries.
 *
 * Nothing is checked when the state is not active after mas_start().
 */
void mt_validate(struct maple_tree *mt)
{
        unsigned char end;

        MA_STATE(mas, mt, 0, 0);
        rcu_read_lock();
        mas_start(&mas);
        if (mas_is_active(&mas)) {
                while (!mte_is_leaf(mas.node))
                        mas_descend(&mas);

                for (; !mas_is_overflow(&mas);
                     mas_dfs_postorder(&mas, ULONG_MAX)) {
                        MAS_WARN_ON(&mas, mte_dead_node(mas.node));
                        end = mas_data_end(&mas);
                        if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) &&
                                        (mas.max != ULONG_MAX))) {
                                pr_err("Invalid size %u of %p\n", end, mas_mn(&mas));
                        }

                        mas_validate_parent_slot(&mas);
                        mas_validate_limits(&mas);
                        mas_validate_child_slot(&mas);
                        if (mt_is_alloc(mt))
                                mas_validate_gaps(&mas);
                }
                mt_validate_nulls(mt);
        }
        rcu_read_unlock();

}
EXPORT_SYMBOL_GPL(mt_validate);

void mas_dump(const struct ma_state *mas)
{
        pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node);
        switch (mas->status) {
        case ma_active:
                pr_err("(ma_active)");
                break;
        case ma_none:
                pr_err("(ma_none)");
                break;
        case ma_root:
                pr_err("(ma_root)");
                break;
        case ma_start:
                pr_err("(ma_start) ");
                break;
        case ma_pause:
                pr_err("(ma_pause) ");
                break;
        case ma_overflow:
                pr_err("(ma_overflow) ");
                break;
        case ma_underflow:
                pr_err("(ma_underflow) ");
                break;
        case ma_error:
                pr_err("(ma_error) ");
                break;
        }

        pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end,
               mas->index, mas->last);
        pr_err("     min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n",
               mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags);
        if (mas->index > mas->last)
                pr_err("Check index & last\n");
}
EXPORT_SYMBOL_GPL(mas_dump);

void mas_wr_dump(const struct ma_wr_state *wr_mas)
{
        pr_err("WR_MAS: node=%p r_min=%lx r_max=%lx\n",
               wr_mas->node, wr_mas->r_min, wr_mas->r_max);
        pr_err("        type=%u off_end=%u, node_end=%u, end_piv=%lx\n",
               wr_mas->type, wr_mas->offset_end, wr_mas->mas->end,
               wr_mas->end_piv);
}
EXPORT_SYMBOL_GPL(mas_wr_dump);

#endif /* CONFIG_DEBUG_MAPLE_TREE */


















    2 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PATH_H
#define _LINUX_PATH_H

struct dentry;
struct vfsmount;

/*
 * A (vfsmount, dentry) pair. Two paths compare equal only when both members
 * are the same pointers (see path_equal()).
 */
struct path {
        struct vfsmount *mnt;
        struct dentry *dentry;
} __randomize_layout;

extern void path_get(const struct path *);
extern void path_put(const struct path *);

/* Return non-zero when both paths have the same mnt and the same dentry. */
static inline int path_equal(const struct path *path1, const struct path *path2)
{
        if (path1->mnt != path2->mnt)
                return 0;

        return path1->dentry == path2->dentry;
}

static inline void path_put_init(struct path *path)
{
        path_put(path);
        *path = (struct path) { };
}

#endif  /* _LINUX_PATH_H */














    4 




















    4 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// SPDX-License-Identifier: GPL-2.0
#include <linux/err.h>
#include <linux/mm.h>
#include <asm/current.h>
#include <asm/traps.h>
#include <asm/vdso.h>

/*
 * One vDSO exception table entry. Both members are offsets that get added to
 * the exception table base address by fixup_vdso_exception().
 */
struct vdso_exception_table_entry {
        int insn;       /* offset of the instruction that may fault */
        int fixup;      /* offset of the code to resume at instead */
};

/*
 * Try to handle an exception through the current task's vDSO exception table.
 * @regs: register state at the time of the exception
 * @trapnr: trap number
 * @error_code: error code of the exception
 * @fault_addr: faulting address
 *
 * When regs->ip matches an entry's instruction address, regs->ip is moved to
 * that entry's fixup address and @trapnr, @error_code and @fault_addr are
 * placed in regs->di, regs->si and regs->dx.
 *
 * Return: true if a fixup was applied, false otherwise.
 */
bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
                          unsigned long error_code, unsigned long fault_addr)
{
        const struct vdso_image *image = current->mm->context.vdso_image;
        const struct vdso_exception_table_entry *extable;
        unsigned int nr_entries, i;
        unsigned long base;

        /*
         * #DB and #BP are never fixed up: there is no way to tell whether
         * they originated from within an SGX enclave, and SGX enclaves are
         * currently the only use case for vDSO fixup.
         */
        if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP)
                return false;

        if (!current->mm->context.vdso)
                return false;

        extable = image->extable;
        nr_entries = image->extable_len / (sizeof(*extable));
        base =  (unsigned long)current->mm->context.vdso + image->extable_base;

        for (i = 0; i < nr_entries; i++) {
                const struct vdso_exception_table_entry *hit = &extable[i];

                if (regs->ip != base + hit->insn)
                        continue;

                regs->ip = base + hit->fixup;
                regs->di = trapnr;
                regs->si = error_code;
                regs->dx = fault_addr;
                return true;
        }

        return false;
}






































































































































































































































































































































































































































    1 







    9 













   14 





   14 


   14 
    1 
























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FILELOCK_H
#define _LINUX_FILELOCK_H

#include <linux/fs.h>

/*
 * Lock flag bits. Every flag is a distinct single bit, so flags can be OR-ed
 * together (as FL_CLOSE_POSIX does).
 */
#define FL_POSIX                0x0001
#define FL_FLOCK                0x0002
#define FL_DELEG                0x0004  /* NFSv4 delegation */
#define FL_ACCESS               0x0008  /* not trying to lock, just looking */
#define FL_EXISTS               0x0010  /* when unlocking, test for existence */
#define FL_LEASE                0x0020  /* lease held on this file */
#define FL_CLOSE                0x0040  /* unlock on close */
#define FL_SLEEP                0x0080  /* A blocking lock */
#define FL_DOWNGRADE_PENDING    0x0100  /* Lease is being downgraded */
#define FL_UNLOCK_PENDING       0x0200  /* Lease is being broken */
#define FL_OFDLCK               0x0400  /* lock is "owned" by struct file */
#define FL_LAYOUT               0x0800  /* outstanding pNFS layout */
#define FL_RECLAIM              0x1000  /* reclaiming from a reboot server */

#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)

/*
 * Special return value from posix_lock_file() and vfs_lock_file() for
 * asynchronous locking.
 */
#define FILE_LOCK_DEFERRED 1

struct file_lock;
struct file_lease;

/*
 * Callbacks attached to a struct file_lock through its fl_ops member
 * ("Callbacks for filesystems").
 */
struct file_lock_operations {
        /* NOTE(review): argument order is presumably (new, old) - confirm */
        void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
        void (*fl_release_private)(struct file_lock *);
};

/*
 * Callbacks attached to a struct file_lock through its fl_lmops member
 * ("Callbacks for lockmanagers").
 */
struct lock_manager_operations {
        void *lm_mod_owner;
        /* owner reference hooks: both take and/or return an fl_owner_t */
        fl_owner_t (*lm_get_owner)(fl_owner_t);
        void (*lm_put_owner)(fl_owner_t);
        void (*lm_notify)(struct file_lock *);        /* unblock callback */
        int (*lm_grant)(struct file_lock *, int);
        bool (*lm_lock_expirable)(struct file_lock *cfl);
        void (*lm_expire_lock)(void);
};

/*
 * Callbacks attached to a struct file_lease through its fl_lmops member
 * ("Callbacks for lease managers").
 */
struct lease_manager_operations {
        bool (*lm_break)(struct file_lease *);
        /* same parameter types as lease_modify() */
        int (*lm_change)(struct file_lease *, int, struct list_head *);
        void (*lm_setup)(struct file_lease *, void **);
        bool (*lm_breaker_owns_lease)(struct file_lease *);
};

/*
 * A lock manager as passed to locks_start_grace() and locks_end_grace().
 */
struct lock_manager {
        /* list linkage; the list head itself is not visible in this header */
        struct list_head list;
        /*
         * NFSv4 and up also want opens blocked during the grace period;
         * NLM doesn't care:
         */
        bool block_opens;
};

struct net;
/*
 * Grace period interface. Starting a grace period takes a struct net and a
 * lock manager, ending one takes only the lock manager; the two *_in_grace()
 * queries are per struct net.
 */
void locks_start_grace(struct net *, struct lock_manager *);
void locks_end_grace(struct lock_manager *);
bool locks_in_grace(struct net *);
bool opens_in_grace(struct net *);

/*
 * struct file_lock has a union that some filesystems use to track
 * their own private info. The NFS side of things is defined here:
 */
#include <linux/nfs_fs_i.h>

/*
 * struct file_lock represents a generic "file lock". It's used to represent
 * POSIX byte range locks, BSD (flock) locks, and leases. It's important to
 * note that the same struct is used to represent both a request for a lock and
 * the lock itself, but the same object is never used for both.
 *
 * FIXME: should we create a separate "struct lock_request" to help distinguish
 * these two uses?
 *
 * The various i_flctx lists are ordered by:
 *
 * 1) lock owner
 * 2) lock range start
 * 3) lock range end
 *
 * Obviously, the last two criteria only matter for POSIX locks.
 */

/*
 * Members common to struct file_lock and struct file_lease; both embed this
 * structure as their member "c".
 */
struct file_lock_core {
        struct file_lock_core *flc_blocker;        /* The lock that is blocking us */
        struct list_head flc_list;        /* link into file_lock_context */
        struct hlist_node flc_link;        /* node in global lists */
        struct list_head flc_blocked_requests;        /* list of requests with
                                                 * ->flc_blocker pointing here
                                                 */
        struct list_head flc_blocked_member;        /* node in
                                                 * ->flc_blocker->flc_blocked_requests
                                                 */
        fl_owner_t flc_owner;
        unsigned int flc_flags;                /* presumably FL_* bits - confirm */
        unsigned char flc_type;                /* compared with F_UNLCK/F_RDLCK/F_WRLCK by lock_is_*() */
        pid_t flc_pid;
        int flc_link_cpu;                /* what cpu's list is this on? */
        wait_queue_head_t flc_wait;        /* woken by locks_wake_up() */
        struct file *flc_file;
};

/*
 * A file lock: the common core plus a byte range, two callback tables and a
 * union of filesystem-private data.
 */
struct file_lock {
        struct file_lock_core c;
        loff_t fl_start;
        loff_t fl_end;

        const struct file_lock_operations *fl_ops;        /* Callbacks for filesystems */
        const struct lock_manager_operations *fl_lmops;        /* Callbacks for lockmanagers */
        /* private per-filesystem data: NFS, NFSv4, AFS or ceph */
        union {
                struct nfs_lock_info        nfs_fl;
                struct nfs4_lock_info        nfs4_fl;
                struct {
                        struct list_head link;        /* link in AFS vnode's pending_locks list */
                        int state;                /* state of grant or error if -ve */
                        unsigned int        debug_id;
                } afs;
                struct {
                        struct inode *inode;
                } ceph;
        } fl_u;
} __randomize_layout;

/*
 * A lease: the common core plus lease-break state and the lease manager's
 * callback table.
 */
struct file_lease {
        struct file_lock_core c;
        struct fasync_struct *        fl_fasync; /* for lease break notifications */
        /* for lease breaks: */
        unsigned long fl_break_time;
        unsigned long fl_downgrade_time;
        const struct lease_manager_operations *fl_lmops; /* Callbacks for lease managers */
} __randomize_layout;

/*
 * Per-inode lock state, reached through inode->i_flctx: one list each for
 * flock, POSIX and lease entries, and a spinlock.
 */
struct file_lock_context {
        spinlock_t                flc_lock;        /* presumably protects the lists below - confirm */
        struct list_head        flc_flock;
        struct list_head        flc_posix;
        struct list_head        flc_lease;        /* tested for emptiness by break_lease() and friends */
};

#ifdef CONFIG_FILE_LOCKING
/*
 * fcntl lock and lease entry points. The *64 variants taking struct flock64
 * are only declared when BITS_PER_LONG is 32.
 */
int fcntl_getlk(struct file *, unsigned int, struct flock *);
int fcntl_setlk(unsigned int, struct file *, unsigned int,
                        struct flock *);

#if BITS_PER_LONG == 32
int fcntl_getlk64(struct file *, unsigned int, struct flock64 *);
int fcntl_setlk64(unsigned int, struct file *, unsigned int,
                        struct flock64 *);
#endif

int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
int fcntl_getlease(struct file *filp);

/* Return true when @fl's type is F_UNLCK. */
static inline bool lock_is_unlock(struct file_lock *fl)
{
        return fl->c.flc_type == F_UNLCK;
}

/* Return true when @fl's type is F_RDLCK. */
static inline bool lock_is_read(struct file_lock *fl)
{
        return fl->c.flc_type == F_RDLCK;
}

/* Return true when @fl's type is F_WRLCK. */
static inline bool lock_is_write(struct file_lock *fl)
{
        return fl->c.flc_type == F_WRLCK;
}

/* Wake up waiters on @fl's wait queue (c.flc_wait). */
static inline void locks_wake_up(struct file_lock *fl)
{
        wake_up(&fl->c.flc_wait);
}

/* fs/locks.c */

/* struct file_lock based interface */
void locks_free_lock_context(struct inode *inode);
void locks_free_lock(struct file_lock *fl);
void locks_init_lock(struct file_lock *);
struct file_lock *locks_alloc_lock(void);
void locks_copy_lock(struct file_lock *, struct file_lock *);
void locks_copy_conflock(struct file_lock *, struct file_lock *);
void locks_remove_posix(struct file *, fl_owner_t);
void locks_remove_file(struct file *);
void locks_release_private(struct file_lock *);
void posix_test_lock(struct file *, struct file_lock *);
int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
int locks_delete_block(struct file_lock *);
int vfs_test_lock(struct file *, struct file_lock *);
int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
bool vfs_inode_has_locks(struct inode *inode);
int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);

/* struct file_lease based interface */
void locks_init_lease(struct file_lease *);
void locks_free_lease(struct file_lease *fl);
struct file_lease *locks_alloc_lease(void);
int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
void lease_get_mtime(struct inode *, struct timespec64 *time);
int generic_setlease(struct file *, int, struct file_lease **, void **priv);
int kernel_setlease(struct file *, int, struct file_lease **, void **);
int vfs_setlease(struct file *, int, struct file_lease **, void **);
int lease_modify(struct file_lease *, int, struct list_head *);

/* lease notifier registration */
struct notifier_block;
int lease_register_notifier(struct notifier_block *);
void lease_unregister_notifier(struct notifier_block *);

struct files_struct;
void show_fd_locks(struct seq_file *f,
                         struct file *filp, struct files_struct *files);
bool locks_owner_has_blockers(struct file_lock_context *flctx,
                        fl_owner_t owner);

/*
 * Return @inode's file_lock_context pointer (inode->i_flctx), loaded with
 * acquire ordering.
 * NOTE(review): presumably pairs with a release store where the context is
 * installed - confirm against fs/locks.c.
 */
static inline struct file_lock_context *
locks_inode_context(const struct inode *inode)
{
        return smp_load_acquire(&inode->i_flctx);
}

#else /* !CONFIG_FILE_LOCKING */
/* !CONFIG_FILE_LOCKING stub: always fails with -EINVAL. */
static inline int fcntl_getlk(struct file *file, unsigned int cmd,
                              struct flock __user *user)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -EACCES. */
static inline int fcntl_setlk(unsigned int fd, struct file *file,
                              unsigned int cmd, struct flock __user *user)
{
        return -EACCES;
}

#if BITS_PER_LONG == 32
/* !CONFIG_FILE_LOCKING stub (32-bit only): always fails with -EINVAL. */
static inline int fcntl_getlk64(struct file *file, unsigned int cmd,
                                struct flock64 *user)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub (32-bit only): always fails with -EACCES. */
static inline int fcntl_setlk64(unsigned int fd, struct file *file,
                                unsigned int cmd, struct flock64 *user)
{
        return -EACCES;
}
#endif
/* !CONFIG_FILE_LOCKING stub: always fails with -EINVAL. */
static inline int fcntl_setlease(unsigned int fd, struct file *filp, int arg)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always reports F_UNLCK. */
static inline int fcntl_getlease(struct file *filp)
{
        return F_UNLCK;
}

/* !CONFIG_FILE_LOCKING stub: always false; @fl is not examined. */
static inline bool lock_is_unlock(struct file_lock *fl)
{
        return false;
}

/* !CONFIG_FILE_LOCKING stub: always false; @fl is not examined. */
static inline bool lock_is_read(struct file_lock *fl)
{
        return false;
}

/* !CONFIG_FILE_LOCKING stub: always false; @fl is not examined. */
static inline bool lock_is_write(struct file_lock *fl)
{
        return false;
}

/* !CONFIG_FILE_LOCKING stub: does nothing. */
static inline void locks_wake_up(struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: does nothing. */
static inline void
locks_free_lock_context(struct inode *inode)
{
}

/* !CONFIG_FILE_LOCKING stub: does nothing. */
static inline void locks_init_lock(struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: does nothing. */
static inline void locks_init_lease(struct file_lease *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: does nothing; @new is left untouched. */
static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: does nothing; @new is left untouched. */
static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: does nothing. */
static inline void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
}

/* !CONFIG_FILE_LOCKING stub: does nothing. */
static inline void locks_remove_file(struct file *filp)
{
}

/* !CONFIG_FILE_LOCKING stub: does nothing; @fl is left untouched. */
static inline void posix_test_lock(struct file *filp, struct file_lock *fl)
{
}

/* !CONFIG_FILE_LOCKING stub: always fails with -ENOLCK. */
static inline int posix_lock_file(struct file *filp, struct file_lock *fl,
                                  struct file_lock *conflock)
{
        return -ENOLCK;
}

/* !CONFIG_FILE_LOCKING stub: always returns -ENOENT. */
static inline int locks_delete_block(struct file_lock *waiter)
{
        return -ENOENT;
}

/* !CONFIG_FILE_LOCKING stub: always returns 0; @fl is left untouched. */
static inline int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -ENOLCK. */
static inline int vfs_lock_file(struct file *filp, unsigned int cmd,
                                struct file_lock *fl, struct file_lock *conf)
{
        return -ENOLCK;
}

/* !CONFIG_FILE_LOCKING stub: always returns 0. */
static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: always false. */
static inline bool vfs_inode_has_locks(struct inode *inode)
{
        return false;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -ENOLCK. */
static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        return -ENOLCK;
}

/* !CONFIG_FILE_LOCKING stub: always returns 0. */
static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: does nothing; @time is left untouched. */
static inline void lease_get_mtime(struct inode *inode,
                                   struct timespec64 *time)
{
}

/* !CONFIG_FILE_LOCKING stub: always fails with -EINVAL. */
static inline int generic_setlease(struct file *filp, int arg,
                                    struct file_lease **flp, void **priv)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -EINVAL. */
static inline int kernel_setlease(struct file *filp, int arg,
                               struct file_lease **lease, void **priv)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -EINVAL. */
static inline int vfs_setlease(struct file *filp, int arg,
                               struct file_lease **lease, void **priv)
{
        return -EINVAL;
}

/* !CONFIG_FILE_LOCKING stub: always fails with -EINVAL. */
static inline int lease_modify(struct file_lease *fl, int arg,
                               struct list_head *dispose)
{
        return -EINVAL;
}

struct files_struct;
/* !CONFIG_FILE_LOCKING stub: does nothing; nothing is written to @f. */
static inline void show_fd_locks(struct seq_file *f,
                        struct file *filp, struct files_struct *files) {}
/* !CONFIG_FILE_LOCKING stub: always false. */
static inline bool locks_owner_has_blockers(struct file_lock_context *flctx,
                        fl_owner_t owner)
{
        return false;
}

/* !CONFIG_FILE_LOCKING stub: always NULL; @inode is not examined. */
static inline struct file_lock_context *
locks_inode_context(const struct inode *inode)
{
        return NULL;
}

#endif /* !CONFIG_FILE_LOCKING */

/*
 * for walking lists of file_locks linked by c.flc_list
 * @_fl: struct file_lock pointer used as the loop cursor
 * @_head: list head to walk
 */
#define for_each_file_lock(_fl, _head)        list_for_each_entry(_fl, _head, c.flc_list)

/*
 * Call locks_lock_inode_wait() on the inode behind @filp and return its
 * result.
 */
static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
{
        return locks_lock_inode_wait(file_inode(filp), fl);
}

#ifdef CONFIG_FILE_LOCKING
/*
 * Break leases of type FL_LEASE on @inode, if it has any.
 * @mode is passed through to __break_lease().
 *
 * Return: 0 when the inode has no lock context or an empty lease list,
 * otherwise the result of __break_lease().
 */
static inline int break_lease(struct inode *inode, unsigned int mode)
{
        /*
         * The check below is lockless, so any refcounts taken must be
         * visible before i_flctx->flc_lease is examined. Without the barrier
         * this could race with a task setting a new lease on the file.
         */
        smp_mb();
        if (!inode->i_flctx || list_empty_careful(&inode->i_flctx->flc_lease))
                return 0;

        return __break_lease(inode, mode, FL_LEASE);
}

/*
 * Break leases of type FL_DELEG on @inode, if it has any.
 * @mode is passed through to __break_lease().
 *
 * Return: 0 when the inode has no lock context or an empty lease list,
 * otherwise the result of __break_lease().
 */
static inline int break_deleg(struct inode *inode, unsigned int mode)
{
        /*
         * The check below is lockless, so any refcounts taken must be
         * visible before i_flctx->flc_lease is examined. Without the barrier
         * this could race with a task setting a new lease on the file.
         */
        smp_mb();
        if (!inode->i_flctx || list_empty_careful(&inode->i_flctx->flc_lease))
                return 0;

        return __break_lease(inode, mode, FL_DELEG);
}

/*
 * Try to break a delegation on @inode without blocking.  If that would
 * block and the caller supplied @delegated_inode, hand back a held
 * reference to the inode there so the caller can wait for it later.
 */
static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
{
        int ret = break_deleg(inode, O_WRONLY|O_NONBLOCK);

        if (ret != -EWOULDBLOCK || !delegated_inode)
                return ret;

        *delegated_inode = inode;
        ihold(inode);
        return ret;
}

/*
 * Break the delegation on *@delegated_inode (without O_NONBLOCK), then
 * drop the inode reference and clear the caller's pointer.
 */
static inline int break_deleg_wait(struct inode **delegated_inode)
{
        struct inode *inode = *delegated_inode;
        int ret = break_deleg(inode, O_WRONLY);

        iput(inode);
        *delegated_inode = NULL;
        return ret;
}

/*
 * Break any layout lease (FL_LAYOUT) on @inode.  With @wait false the
 * break is attempted with O_NONBLOCK.  Uses the same lockless
 * "barrier, then check the lease list" pattern as break_lease().
 */
static inline int break_layout(struct inode *inode, bool wait)
{
        unsigned int mode;

        smp_mb();
        if (!inode->i_flctx || list_empty_careful(&inode->i_flctx->flc_lease))
                return 0;

        mode = wait ? O_WRONLY : O_WRONLY | O_NONBLOCK;
        return __break_lease(inode, mode, FL_LAYOUT);
}

#else /* !CONFIG_FILE_LOCKING */
/* !CONFIG_FILE_LOCKING stub: there are no leases to break. */
static inline int
break_lease(struct inode *inode, unsigned int mode)
{
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: there are no delegations to break. */
static inline int
break_deleg(struct inode *inode, unsigned int mode)
{
        return 0;
}

/*
 * !CONFIG_FILE_LOCKING stub: nothing to break; @delegated_inode is left
 * untouched.
 */
static inline int
try_break_deleg(struct inode *inode, struct inode **delegated_inode)
{
        return 0;
}

/*
 * !CONFIG_FILE_LOCKING stub: calling this is treated as a bug.
 * NOTE(review): presumably unreachable because the try_break_deleg()
 * stub never stores a delegated inode - confirm against callers.
 */
static inline int
break_deleg_wait(struct inode **delegated_inode)
{
        BUG();
        return 0;
}

/* !CONFIG_FILE_LOCKING stub: there are no layout leases to break. */
static inline int
break_layout(struct inode *inode, bool wait)
{
        return 0;
}

#endif /* CONFIG_FILE_LOCKING */

#endif /* _LINUX_FILELOCK_H */








































































































































































































































    9 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/bad_inode.c
 *
 *  Copyright (C) 1997, Stephen Tweedie
 *
 *  Provide stub functions for unreadable inodes
 *
 *  Fabian Frederick : August 2003 - All file operations assigned to EIO
 */

#include <linux/fs.h>
#include <linux/export.h>
#include <linux/stat.h>
#include <linux/time.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/fiemap.h>

/* ->open for bad inodes: opening always fails with -EIO. */
static int
bad_file_open(struct inode *inode, struct file *filp)
{
        return -EIO;
}

/* File operations for bad inodes: only ->open is set, and it returns -EIO. */
static const struct file_operations bad_file_ops = {
        .open = bad_file_open,
};

/* ->create stub for bad inodes: always fails with -EIO. */
static int
bad_inode_create(struct mnt_idmap *idmap, struct inode *dir,
                 struct dentry *dentry, umode_t mode, bool excl)
{
        return -EIO;
}

/* ->lookup stub for bad inodes: always returns ERR_PTR(-EIO). */
static struct dentry *
bad_inode_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        return ERR_PTR(-EIO);
}

/* ->link stub for bad inodes: always fails with -EIO. */
static int bad_inode_link(struct dentry *old_dentry, struct inode *dir,
                          struct dentry *dentry)
{
        return -EIO;
}

/* ->unlink stub for bad inodes: always fails with -EIO. */
static int
bad_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        return -EIO;
}

/* ->symlink stub for bad inodes: always fails with -EIO. */
static int
bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir,
                  struct dentry *dentry, const char *symname)
{
        return -EIO;
}

/* ->mkdir stub for bad inodes: always fails with -EIO. */
static int
bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                struct dentry *dentry, umode_t mode)
{
        return -EIO;
}

/* ->rmdir stub for bad inodes: always fails with -EIO. */
static int bad_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        return -EIO;
}

/* ->mknod stub for bad inodes: always fails with -EIO. */
static int
bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir,
                struct dentry *dentry, umode_t mode, dev_t rdev)
{
        return -EIO;
}

/* ->rename stub for bad inodes: always fails with -EIO. */
static int
bad_inode_rename2(struct mnt_idmap *idmap,
                  struct inode *old_dir, struct dentry *old_dentry,
                  struct inode *new_dir, struct dentry *new_dentry,
                  unsigned int flags)
{
        return -EIO;
}

/* ->readlink stub for bad inodes: always fails with -EIO. */
static int
bad_inode_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        return -EIO;
}

/* ->permission stub for bad inodes: every access check fails with -EIO. */
static int
bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask)
{
        return -EIO;
}

/* ->getattr stub for bad inodes: always fails with -EIO. */
static int
bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
                  struct kstat *stat, u32 request_mask,
                  unsigned int query_flags)
{
        return -EIO;
}

/* ->setattr stub for bad inodes: always fails with -EIO. */
static int
bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry,
                  struct iattr *attrs)
{
        return -EIO;
}

/* ->listxattr stub for bad inodes: always fails with -EIO. */
static ssize_t
bad_inode_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
        return -EIO;
}

/* ->get_link stub for bad inodes: always returns ERR_PTR(-EIO). */
static const char *
bad_inode_get_link(struct dentry *dentry, struct inode *inode,
                   struct delayed_call *done)
{
        return ERR_PTR(-EIO);
}

/* ->get_inode_acl stub for bad inodes: always returns ERR_PTR(-EIO). */
static struct posix_acl *
bad_inode_get_acl(struct inode *inode, int type, bool rcu)
{
        return ERR_PTR(-EIO);
}

/* ->fiemap stub for bad inodes: always fails with -EIO. */
static int
bad_inode_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 u64 start, u64 len)
{
        return -EIO;
}

/* ->update_time stub for bad inodes: always fails with -EIO. */
static int
bad_inode_update_time(struct inode *inode, int flags)
{
        return -EIO;
}

/* ->atomic_open stub for bad inodes: always fails with -EIO. */
static int
bad_inode_atomic_open(struct inode *inode, struct dentry *dentry,
                      struct file *file, unsigned int open_flag,
                      umode_t create_mode)
{
        return -EIO;
}

/* ->tmpfile stub for bad inodes: always fails with -EIO. */
static int
bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode,
                  struct file *file, umode_t mode)
{
        return -EIO;
}

/* ->set_acl stub for bad inodes: always fails with -EIO. */
static int
bad_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                  struct posix_acl *acl, int type)
{
        return -EIO;
}

static const struct inode_operations bad_inode_ops =
{
        .create                = bad_inode_create,
        .lookup                = bad_inode_lookup,
        .link                = bad_inode_link,
        .unlink                = bad_inode_unlink,
        .symlink        = bad_inode_symlink,
        .mkdir                = bad_inode_mkdir,
        .rmdir                = bad_inode_rmdir,
        .mknod                = bad_inode_mknod,
        .rename                = bad_inode_rename2,
        .readlink        = bad_inode_readlink,
        .permission        = bad_inode_permission,
        .getattr        = bad_inode_getattr,
        .setattr        = bad_inode_setattr,
        .listxattr        = bad_inode_listxattr,
        .get_link        = bad_inode_get_link,
        .get_inode_acl        = bad_inode_get_acl,
        .fiemap                = bad_inode_fiemap,
        .update_time        = bad_inode_update_time,
        .atomic_open        = bad_inode_atomic_open,
        .tmpfile        = bad_inode_tmpfile,
        .set_acl        = bad_inode_set_acl,
};


/*
 * When a filesystem is unable to read an inode due to an I/O error in
 * its read_inode() function, it can call make_bad_inode() to return a
 * set of stubs which will return EIO errors as required. 
 *
 * We only need to do limited initialisation: all other fields are
 * preinitialised to zero automatically.
 */
 
/**
 *        make_bad_inode - mark an inode bad due to an I/O error
 *        @inode: Inode to mark bad
 *
 *        When an inode cannot be read due to a media or remote network
 *        failure this function makes the inode "bad" and causes I/O operations
 *        on it to fail from this point on.
 */
 
void make_bad_inode(struct inode *inode)
{
        /* Take the inode out of the inode hash before rewiring it. */
        remove_inode_hash(inode);

        /* Present it as a plain regular file and (re)initialise its timestamps. */
        inode->i_mode = S_IFREG;
        simple_inode_init_ts(inode);

        /*
         * Point inode and file operations at the -EIO stubs and clear
         * IOP_XATTR from the cached operation flags.
         */
        inode->i_op = &bad_inode_ops;
        inode->i_opflags &= ~IOP_XATTR;
        inode->i_fop = &bad_file_ops;
}
EXPORT_SYMBOL(make_bad_inode);

/*
 * This tests whether an inode has been flagged as bad. The test uses
 * &bad_inode_ops to cover the case of invalidated inodes as well as
 * those created by make_bad_inode() above.
 */
 
/**
 *        is_bad_inode - is an inode errored
 *        @inode: inode to test
 *
 *        Returns true if the inode in question has been marked as bad.
 */
 
bool is_bad_inode(struct inode *inode)
{
        /* A bad inode is recognised purely by its operations table. */
        return inode->i_op == &bad_inode_ops;
}

EXPORT_SYMBOL(is_bad_inode);

/**
 * iget_failed - Mark an under-construction inode as dead and release it
 * @inode: The inode to discard
 *
 * Mark an under-construction inode as dead and release it.
 */
void iget_failed(struct inode *inode)
{
        /* Mark it bad first, so it is already bad when it gets unlocked. */
        make_bad_inode(inode);

        /* Unlock the new inode, then drop our reference to it. */
        unlock_new_inode(inode);
        iput(inode);
}
EXPORT_SYMBOL(iget_failed);




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2020 ARM Ltd.
 */
#ifndef __ASM_VDSO_PROCESSOR_H
#define __ASM_VDSO_PROCESSOR_H

#ifndef __ASSEMBLY__

/*
 * Emit "rep; nop" (PAUSE), which is a good thing to insert into
 * busy-wait loops.  The "memory" clobber also makes this a compiler
 * barrier.
 */
static __always_inline void rep_nop(void)
{
        asm volatile("rep; nop"
                     : /* no outputs */
                     : /* no inputs */
                     : "memory");
}

/* Relax the CPU inside a spin loop; implemented as a single rep_nop(). */
static __always_inline void
cpu_relax(void)
{
        rep_nop();
}

struct getcpu_cache;

notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);

#endif /* __ASSEMBLY__ */

#endif /* __ASM_VDSO_PROCESSOR_H */




































































    1 









    2 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#ifndef _LINUX_HASH_H
#define _LINUX_HASH_H
/* Fast hashing routine for ints,  longs and pointers.
   (C) 2002 Nadia Yvette Chambers, IBM */

#include <asm/types.h>
#include <linux/compiler.h>

/*
 * The "GOLDEN_RATIO_PRIME" is used in fs/btrfs/btrfs_inode.h and
 * fs/inode.c.  It's not actually prime any more (the previous primes
 * were actively bad for hashing), but the name remains.
 */
/* Pick the word-sized constant and hash routine for this architecture. */
#if BITS_PER_LONG == 32
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
#define hash_long(val, bits) hash_32(val, bits)
#elif BITS_PER_LONG == 64
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64
#define hash_long(val, bits) hash_64(val, bits)
#else
#error Wordsize not 32 or 64
#endif

/*
 * This hash multiplies the input by a large odd number and takes the
 * high bits.  Since multiplication propagates changes to the most
 * significant end only, it is essential that the high bits of the
 * product be used for the hash value.
 *
 * Chuck Lever verified the effectiveness of this technique:
 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
 *
 * Although a random odd number will do, it turns out that the golden
 * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
 * properties.  (See Knuth vol 3, section 6.4, exercise 9.)
 *
 * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2,
 * which is very slightly easier to multiply by and makes no
 * difference to the hash distribution.
 */
/* Multipliers used by __hash_32_generic() and hash_64_generic() respectively. */
#define GOLDEN_RATIO_32 0x61C88647
#define GOLDEN_RATIO_64 0x61C8864680B583EBull

#ifdef CONFIG_HAVE_ARCH_HASH
/* This header may use the GOLDEN_RATIO_xx constants */
#include <asm/hash.h>
#endif

/*
 * The _generic versions exist only so lib/test_hash.c can compare
 * the arch-optimized versions with the generic.
 *
 * Note that if you change these, any <asm/hash.h> that aren't updated
 * to match need to have their HAVE_ARCH_* define values updated so the
 * self-test will not false-positive.
 */
/* Without an arch override, __hash_32() is the generic implementation. */
#ifndef HAVE_ARCH__HASH_32
#define __hash_32 __hash_32_generic
#endif
/* Multiply by GOLDEN_RATIO_32; the product wraps modulo 2^32. */
static inline u32 __hash_32_generic(u32 val)
{
        u32 product = val * GOLDEN_RATIO_32;

        return product;
}

/* Hash @val down to @bits bits. */
static inline u32 hash_32(u32 val, unsigned int bits)
{
        /* The high bits of the product are the more random ones; keep those. */
        unsigned int shift = 32 - bits;

        return __hash_32(val) >> shift;
}

/* Without an arch override, hash_64() is the generic implementation. */
#ifndef HAVE_ARCH_HASH_64
#define hash_64 hash_64_generic
#endif
/* Hash the 64-bit @val down to @bits bits. */
static __always_inline u32 hash_64_generic(u64 val, unsigned int bits)
{
#if BITS_PER_LONG == 64
        /* 64x64-bit multiply is efficient on all 64-bit processors */
        u64 product = val * GOLDEN_RATIO_64;

        return product >> (64 - bits);
#else
        /*
         * Hash 64 bits using only 32x32-bit multiply: mix the hashed
         * upper word into the lower word, then hash that.
         */
        u32 folded = (u32)val ^ __hash_32(val >> 32);

        return hash_32(folded, bits);
#endif
}

/* Hash a pointer value down to @bits bits via hash_long(). */
static inline u32 hash_ptr(const void *ptr, unsigned int bits)
{
        unsigned long addr = (unsigned long)ptr;

        return hash_long(addr, bits);
}

/*
 * Fold a pointer into 32 bits.  Despite the name this does no hashing to
 * speak of; fold32_ptr would be a more accurate name.
 */
static inline u32 hash32_ptr(const void *ptr)
{
        unsigned long val = (unsigned long)ptr;

#if BITS_PER_LONG == 64
        /* XOR the upper half into the lower half before truncating. */
        val = val ^ (val >> 32);
#endif
        return (u32)val;
}

#endif /* _LINUX_HASH_H */








































    4 


























    5 




    5 


    4 

    4 






















    3 




























    2 

    1 


    1 






























   10 







    7 
    2 









    7 
    3 



    7 
    3 



    5 








    3 


    3 




   10 






   10 













































    1 






































    8 


    9 
    9 
    7 
    1 
    7 
    2 
    1 
    8 
    6 














    3 
    6 







    9 



    1 






































   10 






   10 

   10 



    6 


















    3 







   10 



   10 



   10 


















    9 





    9 















   10 



    9 







   10 


   10 











    9 




   10 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/attr.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  changes by Thomas Schoebel-Theuer
 */

#include <linux/export.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/sched/signal.h>
#include <linux/capability.h>
#include <linux/fsnotify.h>
#include <linux/fcntl.h>
#include <linux/filelock.h>
#include <linux/security.h>

#include "internal.h"

/**
 * setattr_should_drop_sgid - determine whether the setgid bit needs to be
 *                            removed
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        inode to check
 *
 * This function determines whether the setgid bit needs to be removed.
 * We retain backwards compatibility and require setgid bit to be removed
 * unconditionally if S_IXGRP is set. Otherwise we have the exact same
 * requirements as setattr_prepare() and setattr_copy().
 *
 * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise.
 */
int setattr_should_drop_sgid(struct mnt_idmap *idmap,
                             const struct inode *inode)
{
        umode_t mode = inode->i_mode;

        if (!(mode & S_ISGID))
                return 0;
        if (mode & S_IXGRP)
                return ATTR_KILL_SGID;
        if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode)))
                return ATTR_KILL_SGID;
        return 0;
}
EXPORT_SYMBOL(setattr_should_drop_sgid);

/**
 * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to
 *                               be dropped
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        inode to check
 *
 * This function determines whether the set{g,u}id bits need to be removed.
 * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the
 * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both
 * set{g,u}id bits need to be removed the corresponding mask of both flags is
 * returned.
 *
 * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits
 * to remove, 0 otherwise.
 */
int setattr_should_drop_suidgid(struct mnt_idmap *idmap,
                                struct inode *inode)
{
        const umode_t mode = inode->i_mode;
        /* suid always must be killed */
        int kill = unlikely(mode & S_ISUID) ? ATTR_KILL_SUID : 0;

        kill |= setattr_should_drop_sgid(idmap, inode);

        /*
         * Only report the bits for regular files, and only when the
         * caller lacks CAP_FSETID.  capable() is consulted only if there
         * is something to kill, and before the file type is looked at.
         */
        if (unlikely(kill)) {
                if (!capable(CAP_FSETID) && S_ISREG(mode))
                        return kill;
        }

        return 0;
}
EXPORT_SYMBOL(setattr_should_drop_suidgid);

/**
 * chown_ok - verify permissions to chown inode
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        inode to check permissions on
 * @ia_vfsuid:        uid to chown @inode to
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 */
static bool chown_ok(struct mnt_idmap *idmap,
                     const struct inode *inode, vfsuid_t ia_vfsuid)
{
        const vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);

        /*
         * Allowed when, checked in this order:
         *  - the caller owns the inode and the uid does not change, or
         *  - the caller has CAP_CHOWN with respect to the inode, or
         *  - the inode's uid has no valid mapping and the caller has
         *    CAP_CHOWN in the superblock's user namespace.
         */
        return (vfsuid_eq_kuid(vfsuid, current_fsuid()) &&
                vfsuid_eq(ia_vfsuid, vfsuid)) ||
               capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN) ||
               (!vfsuid_valid(vfsuid) &&
                ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN));
}

/**
 * chgrp_ok - verify permissions to chgrp inode
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        inode to check permissions on
 * @ia_vfsgid:        gid to chown @inode to
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 */
static bool chgrp_ok(struct mnt_idmap *idmap,
                     const struct inode *inode, vfsgid_t ia_vfsgid)
{
        const vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
        const vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);

        /*
         * The owner may keep the current group or switch to any group
         * the caller is a member of.
         */
        if (vfsuid_eq_kuid(vfsuid, current_fsuid()) &&
            (vfsgid_eq(ia_vfsgid, vfsgid) || vfsgid_in_group_p(ia_vfsgid)))
                return true;

        /* CAP_CHOWN with respect to the inode always suffices. */
        if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN))
                return true;

        /*
         * An inode whose gid has no valid mapping may still be changed
         * with CAP_CHOWN in the superblock's user namespace.
         */
        return !vfsgid_valid(vfsgid) &&
               ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN);
}

/**
 * setattr_prepare - check if attribute changes to a dentry are allowed
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        dentry to check
 * @attr:        attributes to change
 *
 * Check if we are allowed to change the attributes contained in @attr
 * in the given dentry.  This includes the normal unix access permission
 * checks, as well as checks for rlimits and others. The function also clears
 * SGID bit from mode if user is not allowed to set it. Also file capabilities
 * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Should be called as the first thing in ->setattr implementations,
 * possibly after taking additional locks.
 */
int setattr_prepare(struct mnt_idmap *idmap, struct dentry *dentry,
                    struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        unsigned int ia_valid = attr->ia_valid;

        /*
         * First check size constraints.  These can't be overriden using
         * ATTR_FORCE.
         */
        if (ia_valid & ATTR_SIZE) {
                int error = inode_newsize_ok(inode, attr->ia_size);
                if (error)
                        return error;
        }

        /* If force is set do it anyway. */
        if (ia_valid & ATTR_FORCE)
                goto kill_priv;

        /* Make sure a caller can chown. */
        if ((ia_valid & ATTR_UID) &&
            !chown_ok(idmap, inode, attr->ia_vfsuid))
                return -EPERM;

        /* Make sure caller can chgrp. */
        if ((ia_valid & ATTR_GID) &&
            !chgrp_ok(idmap, inode, attr->ia_vfsgid))
                return -EPERM;

        /* Make sure a caller can chmod. */
        if (ia_valid & ATTR_MODE) {
                vfsgid_t vfsgid;

                if (!inode_owner_or_capable(idmap, inode))
                        return -EPERM;

                if (ia_valid & ATTR_GID)
                        vfsgid = attr->ia_vfsgid;
                else
                        vfsgid = i_gid_into_vfsgid(idmap, inode);

                /* Also check the setgid bit! */
                if (!in_group_or_capable(idmap, inode, vfsgid))
                        attr->ia_mode &= ~S_ISGID;
        }

        /* Check for setting the inode time. */
        if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
                if (!inode_owner_or_capable(idmap, inode))
                        return -EPERM;
        }

kill_priv:
        /* User has permission for the change */
        if (ia_valid & ATTR_KILL_PRIV) {
                int error;

                error = security_inode_killpriv(idmap, dentry);
                if (error)
                        return error;
        }

        return 0;
}
EXPORT_SYMBOL(setattr_prepare);

/**
 * inode_newsize_ok - may this inode be truncated to a given size
 * @inode:        the inode to be truncated
 * @offset:        the new size to assign to the inode
 *
 * inode_newsize_ok must be called with i_mutex held.
 *
 * inode_newsize_ok will check filesystem limits and ulimits to check that the
 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
 * when necessary. Caller must not proceed with inode size change if failure is
 * returned. @inode must be a file (not directory), with appropriate
 * permissions to allow truncate (inode_newsize_ok does NOT check these
 * conditions).
 *
 * Return: 0 on success, -ve errno on failure
 */
int inode_newsize_ok(const struct inode *inode, loff_t offset)
{
        unsigned long limit;

        if (offset < 0)
                return -EINVAL;

        if (inode->i_size >= offset) {
                /*
                 * Not growing.  Truncation of in-use swapfiles is
                 * disallowed - it would cause subsequent swapout to
                 * scribble on the now-freed blocks.
                 */
                return IS_SWAPFILE(inode) ? -ETXTBSY : 0;
        }

        /* Growing: the RLIMIT_FSIZE check comes first and raises SIGXFSZ. */
        limit = rlimit(RLIMIT_FSIZE);
        if (limit != RLIM_INFINITY && offset > limit) {
                send_sig(SIGXFSZ, current, 0);
                return -EFBIG;
        }

        /* Then the filesystem's own maximum file size. */
        if (offset > inode->i_sb->s_maxbytes)
                return -EFBIG;

        return 0;
}
EXPORT_SYMBOL(inode_newsize_ok);

/**
 * setattr_copy - copy simple metadata updates into the generic inode
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        the inode to be updated
 * @attr:        the new attributes
 *
 * setattr_copy must be called with i_mutex held.
 *
 * setattr_copy updates the inode's metadata with that specified
 * in attr on idmapped mounts. Necessary permission checks to determine
 * whether or not the S_ISGID property needs to be removed are performed with
 * the correct idmapped mount permission helpers.
 * Noticeably missing is inode size update, which is more complex
 * as it requires pagecache updates.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * The inode is not marked as dirty after this operation. The rationale is
 * that for "simple" filesystems, the struct inode is the inode storage.
 * The caller is free to mark the inode dirty afterwards if needed.
 */
void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
                  const struct iattr *attr)
{
        const unsigned int ia_valid = attr->ia_valid;
        umode_t mode;

        /* Ownership goes first: the setgid check below sees the new gid. */
        i_uid_update(idmap, attr, inode);
        i_gid_update(idmap, attr, inode);

        if (ia_valid & ATTR_ATIME)
                inode_set_atime_to_ts(inode, attr->ia_atime);
        if (ia_valid & ATTR_MTIME)
                inode_set_mtime_to_ts(inode, attr->ia_mtime);
        if (ia_valid & ATTR_CTIME)
                inode_set_ctime_to_ts(inode, attr->ia_ctime);

        if (!(ia_valid & ATTR_MODE))
                return;

        /* Strip setgid unless the caller is in the group or capable. */
        mode = attr->ia_mode;
        if (!in_group_or_capable(idmap, inode,
                                 i_gid_into_vfsgid(idmap, inode)))
                mode &= ~S_ISGID;
        inode->i_mode = mode;
}
EXPORT_SYMBOL(setattr_copy);

int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
                unsigned int ia_valid)
{
        /* Mode, owner and explicit-time changes are refused on immutable or append-only inodes. */
        if ((ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) &&
            (IS_IMMUTABLE(inode) || IS_APPEND(inode)))
                return -EPERM;

        /*
         * If utimes(2) and friends are called with times == NULL (or both
         * times are UTIME_NOW), then we need to check for write permission
         */
        if (!(ia_valid & ATTR_TOUCH))
                return 0;

        if (IS_IMMUTABLE(inode))
                return -EPERM;

        /* Owners and capable callers pass; everyone else needs MAY_WRITE. */
        if (inode_owner_or_capable(idmap, inode))
                return 0;

        return inode_permission(idmap, inode, MAY_WRITE);
}
EXPORT_SYMBOL(may_setattr);

/**
 * notify_change - modify attributes of a filesystem object
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        object affected
 * @attr:        new attributes
 * @delegated_inode: returns inode, if the inode is delegated
 *
 * The caller must hold the i_mutex on the affected object.
 *
 * If notify_change discovers a delegation in need of breaking,
 * it will return -EWOULDBLOCK and return a reference to the inode in
 * delegated_inode.  The caller should then break the delegation and
 * retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.  Also, passing NULL is fine for callers holding
 * the file open for write, as there can be no conflicting delegation in
 * that case.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Note that @attr is modified in place: the timestamps are filled in and
 * @attr->ia_valid / @attr->ia_mode may be adjusted before the attributes
 * are handed to the filesystem.
 *
 * Return: 0 on success (including the case where nothing besides the
 * ATTR_KILL_S*ID flags remains to be done), otherwise a negative errno
 * from one of the checks or from the ->setattr implementation.
 */
int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
                  struct iattr *attr, struct inode **delegated_inode)
{
        struct inode *inode = dentry->d_inode;
        /* Snapshot of the mode at entry; used below for the S_ISUID/S_ISGID tests. */
        umode_t mode = inode->i_mode;
        int error;
        struct timespec64 now;
        /* Local copy of the request mask; kept in sync with attr->ia_valid below. */
        unsigned int ia_valid = attr->ia_valid;

        /* Only warns; the function carries on even if the lock is not held. */
        WARN_ON_ONCE(!inode_is_locked(inode));

        error = may_setattr(idmap, inode, ia_valid);
        if (error)
                return error;

        if ((ia_valid & ATTR_MODE)) {
                /*
                 * Don't allow changing the mode of symlinks:
                 *
                 * (1) The vfs doesn't take the mode of symlinks into account
                 *     during permission checking.
                 * (2) This has never worked correctly. Most major filesystems
                 *     did return EOPNOTSUPP due to interactions with POSIX ACLs
                 *     but still updated the mode of the symlink.
                 *     This inconsistency led system call wrapper providers such
                 *     as libc to block changing the mode of symlinks with
                 *     EOPNOTSUPP already.
                 * (3) To even do this in the first place one would have to use
                 *     specific file descriptors and quite some effort.
                 */
                if (S_ISLNK(inode->i_mode))
                        return -EOPNOTSUPP;

                /* Flag setting protected by i_mutex */
                if (is_sxid(attr->ia_mode))
                        inode->i_flags &= ~S_NOSEC;
        }

        /*
         * ctime is always set to the current time.  atime and mtime default
         * to the current time as well, unless the caller supplied explicit
         * values (ATTR_ATIME_SET / ATTR_MTIME_SET), in which case the
         * supplied value is passed through timestamp_truncate().
         */
        now = current_time(inode);

        attr->ia_ctime = now;
        if (!(ia_valid & ATTR_ATIME_SET))
                attr->ia_atime = now;
        else
                attr->ia_atime = timestamp_truncate(attr->ia_atime, inode);
        if (!(ia_valid & ATTR_MTIME_SET))
                attr->ia_mtime = now;
        else
                attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode);

        /*
         * Ask the security layer whether privileges need killing: a negative
         * value is an error, zero means there is nothing to kill and the
         * flag is dropped from both attr->ia_valid and the local copy.
         */
        if (ia_valid & ATTR_KILL_PRIV) {
                error = security_inode_need_killpriv(dentry);
                if (error < 0)
                        return error;
                if (error == 0)
                        ia_valid = attr->ia_valid &= ~ATTR_KILL_PRIV;
        }

        /*
         * We now pass ATTR_KILL_S*ID to the lower level setattr function so
         * that the function has the ability to reinterpret a mode change
         * that's due to these bits. This adds an implicit restriction that
         * no function will ever call notify_change with both ATTR_MODE and
         * ATTR_KILL_S*ID set.
         */
        if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) &&
            (ia_valid & ATTR_MODE))
                BUG();

        /*
         * Translate the kill requests into an explicit mode change, but only
         * if the corresponding bit was actually set in the mode at entry.
         */
        if (ia_valid & ATTR_KILL_SUID) {
                if (mode & S_ISUID) {
                        ia_valid = attr->ia_valid |= ATTR_MODE;
                        attr->ia_mode = (inode->i_mode & ~S_ISUID);
                }
        }
        if (ia_valid & ATTR_KILL_SGID) {
                if (mode & S_ISGID) {
                        /*
                         * ATTR_MODE may already have been added by the
                         * ATTR_KILL_SUID branch above; if so, keep that
                         * ia_mode and just clear S_ISGID from it as well.
                         */
                        if (!(ia_valid & ATTR_MODE)) {
                                ia_valid = attr->ia_valid |= ATTR_MODE;
                                attr->ia_mode = inode->i_mode;
                        }
                        attr->ia_mode &= ~S_ISGID;
                }
        }
        /* Nothing requested apart from the kill flags themselves: done. */
        if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
                return 0;

        /*
         * Verify that uid/gid changes are valid in the target
         * namespace of the superblock.
         */
        if (ia_valid & ATTR_UID &&
            !vfsuid_has_fsmapping(idmap, inode->i_sb->s_user_ns,
                                  attr->ia_vfsuid))
                return -EOVERFLOW;
        if (ia_valid & ATTR_GID &&
            !vfsgid_has_fsmapping(idmap, inode->i_sb->s_user_ns,
                                  attr->ia_vfsgid))
                return -EOVERFLOW;

        /* Don't allow modifications of files with invalid uids or
         * gids unless those uids & gids are being made valid.
         */
        if (!(ia_valid & ATTR_UID) &&
            !vfsuid_valid(i_uid_into_vfsuid(idmap, inode)))
                return -EOVERFLOW;
        if (!(ia_valid & ATTR_GID) &&
            !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
                return -EOVERFLOW;

        /* Security hook first, then the delegation check (see above). */
        error = security_inode_setattr(idmap, dentry, attr);
        if (error)
                return error;
        error = try_break_deleg(inode, delegated_inode);
        if (error)
                return error;

        /* Filesystems without their own ->setattr get simple_setattr(). */
        if (inode->i_op->setattr)
                error = inode->i_op->setattr(idmap, dentry, attr);
        else
                error = simple_setattr(idmap, dentry, attr);

        /* Notifications and the post-setattr hook run on success only. */
        if (!error) {
                fsnotify_change(dentry, ia_valid);
                security_inode_post_setattr(idmap, dentry, ia_valid);
        }

        return error;
}
EXPORT_SYMBOL(notify_change);


























































    1 



    1 

























    1 
    1 








































    1 



























































































































































    1 


    1 







    1 




    1 




    1 



    1 

    1 










































   11 



    1 


    2 







   10 









   10 









    1 
    1 





    1 


























































    2 
















    1 

    1 


    1 
    1 












    4 
    4 




    1 
    2 






































































    4 














    4 

    1 

    4 


    2 
    2 



    4 










    2 



    2 


    1 




    1 
    1 




    2 





    3 





    1 
    2 



    1 
    1 
    2 





    3 



















    1 





    1 


    1 
























    2 





    2 


    2 












    3 



    3 
    3 


    3 






    1 
    2 












    1 



    1 


    1 









    2 



































































































































    4 






    1 










    1 







    1 

    1 

    1 
    3 






    3 
























    3 

    2 

    1 

    3 
    3 







    2 



    1 

    1 




    1 

    1 



    1 






    1 






    1 




    1 
    1 



    1 
























    2 


    3 















    1 



    2 



    2 




    2 





    1 





    2 




    2 


    1 








    1 


    1 




    1 


    2 








    2 




























































































    2 



















    1 


    1 



    1 




















    1 

























    1 








    1 
    1 













    1 

    2 























    2 













    1 















































































    1 





















    1 





    1 















    1 







    1 















    1 










    1 












    1 



























    1 

    1 










    1 






















    1 



    1 































    1 

    1 





































    5 









    4 
    1 

    5 




    5 











    5 







    5 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/read_write.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/sched/xacct.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include "internal.h"

#include <linux/uaccess.h>
#include <asm/unistd.h>

/*
 * File operations table offering only read-side methods: read_iter,
 * splice_read, a read-only mmap and llseek are wired to the generic
 * helpers; no write methods are provided.
 */
const struct file_operations generic_ro_fops = {
        .read_iter        = generic_file_read_iter,
        .splice_read        = filemap_splice_read,
        .mmap                = generic_file_readonly_mmap,
        .llseek                = generic_file_llseek,
};

EXPORT_SYMBOL(generic_ro_fops);

/* Report whether @file has FMODE_UNSIGNED_OFFSET set in its f_mode. */
static inline bool unsigned_offsets(struct file *file)
{
        return (file->f_mode & FMODE_UNSIGNED_OFFSET) != 0;
}

/**
 * vfs_setpos - validate and store a new file position
 * @file:        file whose position is being changed
 * @offset:        requested new position
 * @maxsize:        largest position that is accepted
 *
 * Low-level helper for ->llseek implementations.  A negative @offset is
 * rejected unless the file uses unsigned offsets, and an @offset beyond
 * @maxsize is always rejected.  When the position really changes, f_pos
 * is updated and f_version is reset to 0; otherwise the file is left
 * untouched.
 *
 * Return: @offset on success, -EINVAL if @offset is not acceptable.
 */
loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
{
        if ((offset < 0 && !unsigned_offsets(file)) || offset > maxsize)
                return -EINVAL;

        /* Position unchanged: nothing to write back. */
        if (offset == file->f_pos)
                return offset;

        file->f_pos = offset;
        file->f_version = 0;
        return offset;
}
EXPORT_SYMBOL(vfs_setpos);

/**
 * generic_file_llseek_size - generic llseek with caller-supplied limits
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @maxsize:        largest offset that may be set on this file
 * @eof:        offset treated as end of file (used for SEEK_END,
 *                SEEK_DATA and SEEK_HOLE)
 *
 * Variant of generic_file_llseek() for callers that need a custom maximum
 * file size and a custom EOF position, e.g. hashed directories.
 *
 * Synchronization:
 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 * read/writes behave like SEEK_SET against seeks.
 *
 * Return: the resulting offset, -ENXIO for SEEK_DATA/SEEK_HOLE at or past
 * @eof, or -EINVAL when vfs_setpos() rejects the offset.
 */
loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
                loff_t maxsize, loff_t eof)
{
        loff_t newpos;

        if (whence == SEEK_CUR) {
                /*
                 * lseek(fd, 0, SEEK_CUR) is a pure position query.  Do not
                 * write the "same" f_pos back: a concurrent read(), write()
                 * or lseek() might have altered it in the meantime.
                 */
                if (!offset)
                        return file->f_pos;

                /*
                 * f_lock makes the read/modify/write of f_pos atomic with
                 * respect to other SEEK_CURs.  Parallel reads and writes
                 * behave like SEEK_SET.
                 */
                spin_lock(&file->f_lock);
                newpos = vfs_setpos(file, file->f_pos + offset, maxsize);
                spin_unlock(&file->f_lock);
                return newpos;
        }

        switch (whence) {
        case SEEK_END:
                offset += eof;
                break;
        case SEEK_DATA:
        case SEEK_HOLE:
                /*
                 * Generic case: the whole file is data, followed by a
                 * virtual hole at the end.  An offset at or beyond the end
                 * therefore has neither data nor a hole after it.
                 */
                if ((unsigned long long)offset >= eof)
                        return -ENXIO;
                /* SEEK_DATA keeps offset as is; SEEK_HOLE jumps to the end. */
                if (whence == SEEK_HOLE)
                        offset = eof;
                break;
        }

        return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);

/**
 * generic_file_llseek - generic llseek implementation for regular files
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 *
 * Generic implementation of ->llseek usable by all normal local
 * filesystems.  It forwards to generic_file_llseek_size(), using the
 * superblock's s_maxbytes as the size limit and the current i_size of the
 * mapping's host inode as the EOF position.
 */
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *host = file->f_mapping->host;
        loff_t limit = host->i_sb->s_maxbytes;
        loff_t size = i_size_read(host);

        return generic_file_llseek_size(file, offset, whence, limit, size);
}
EXPORT_SYMBOL(generic_file_llseek);

/**
 * fixed_size_llseek - llseek implementation for fixed-sized devices
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @size:        size of the file
 *
 * Only SEEK_SET, SEEK_CUR and SEEK_END are supported; @size serves both
 * as the maximum offset and as the EOF position.  Any other @whence
 * yields -EINVAL.
 */
loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
{
        if (whence == SEEK_SET || whence == SEEK_CUR || whence == SEEK_END)
                return generic_file_llseek_size(file, offset, whence,
                                                size, size);

        return -EINVAL;
}
EXPORT_SYMBOL(fixed_size_llseek);

/**
 * no_seek_end_llseek - llseek implementation without SEEK_END support
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 *
 * Accepts SEEK_SET and SEEK_CUR only, with OFFSET_MAX as the largest
 * permitted offset.  Every other @whence yields -EINVAL.
 */
loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
{
        if (whence == SEEK_SET || whence == SEEK_CUR)
                return generic_file_llseek_size(file, offset, whence,
                                                OFFSET_MAX, 0);

        return -EINVAL;
}
EXPORT_SYMBOL(no_seek_end_llseek);

/**
 * no_seek_end_llseek_size - size-limited llseek without SEEK_END support
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @size:        maximal offset allowed
 *
 * Accepts SEEK_SET and SEEK_CUR only, with @size as the largest permitted
 * offset.  Every other @whence yields -EINVAL.
 */
loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
{
        if (whence == SEEK_SET || whence == SEEK_CUR)
                return generic_file_llseek_size(file, offset, whence,
                                                size, 0);

        return -EINVAL;
}
EXPORT_SYMBOL(no_seek_end_llseek_size);

/**
 * noop_llseek - llseek implementation that does nothing
 * @file:        file structure to seek on
 * @offset:        file offset to seek to (ignored)
 * @whence:        type of seek (ignored)
 *
 * For the rare special case where userspace expects a seek to succeed but
 * the (device) file cannot actually seek.  Use this instead of falling
 * back to the default ->llseek implementation.  The file position is not
 * modified.
 *
 * Return: the current, unchanged file position.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int whence)
{
        loff_t cur = file->f_pos;

        return cur;
}
EXPORT_SYMBOL(noop_llseek);

/*
 * default_llseek - llseek implementation that runs under the inode lock.
 *
 * Computes the new position from @offset and @whence and stores it in
 * f_pos (resetting f_version) if it differs from the current one.
 * Negative results are only accepted for files with unsigned offsets.
 *
 * Returns the new position, -ENXIO for SEEK_DATA/SEEK_HOLE at or beyond
 * i_size, or -EINVAL for an unacceptable negative offset.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file_inode(file);
        loff_t retval = -EINVAL;

        inode_lock(inode);

        if (whence == SEEK_END) {
                offset += i_size_read(inode);
        } else if (whence == SEEK_CUR) {
                /* Pure position query: report f_pos without touching it. */
                if (offset == 0) {
                        retval = file->f_pos;
                        goto out;
                }
                offset += file->f_pos;
        } else if (whence == SEEK_DATA || whence == SEEK_HOLE) {
                /*
                 * Generic case: the entire file is data, followed by a
                 * virtual hole at the end.  At or past i_size there is
                 * neither further data nor a further hole.
                 */
                if (offset >= inode->i_size) {
                        retval = -ENXIO;
                        goto out;
                }
                /* SEEK_DATA keeps offset; SEEK_HOLE moves to i_size. */
                if (whence == SEEK_HOLE)
                        offset = inode->i_size;
        }

        if (offset >= 0 || unsigned_offsets(file)) {
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        file->f_version = 0;
                }
                retval = offset;
        }
out:
        inode_unlock(inode);
        return retval;
}
EXPORT_SYMBOL(default_llseek);

/*
 * vfs_llseek - dispatch a seek to the file's ->llseek method.
 *
 * Files without FMODE_LSEEK in f_mode are reported as unseekable
 * (-ESPIPE) without calling into the file operations.
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
        if (file->f_mode & FMODE_LSEEK)
                return file->f_op->llseek(file, offset, whence);

        return -ESPIPE;
}
EXPORT_SYMBOL(vfs_llseek);

/*
 * ksys_lseek - common body of the lseek system calls.
 *
 * Looks up @fd, rejects @whence values above SEEK_MAX with -EINVAL and
 * otherwise seeks via vfs_llseek().  A result that does not survive the
 * round trip through off_t is reported as -EOVERFLOW.
 */
static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
        struct fd f = fdget_pos(fd);
        off_t retval = -EINVAL;

        if (!f.file)
                return -EBADF;

        if (whence <= SEEK_MAX) {
                loff_t pos = vfs_llseek(f.file, offset, whence);

                retval = pos;
                /* LFS: should only happen on 32 bit platforms */
                if ((loff_t)retval != pos)
                        retval = -EOVERFLOW;
        }

        fdput_pos(f);
        return retval;
}

/* lseek system call entry point: thin wrapper around ksys_lseek(). */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
        return ksys_lseek(fd, offset, whence);
}

#ifdef CONFIG_COMPAT
/*
 * Compat lseek entry point: takes the offset as compat_off_t and hands it
 * to the same ksys_lseek() helper as the native system call.
 */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
        return ksys_lseek(fd, offset, whence);
}
#endif

#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
        defined(__ARCH_WANT_SYS_LLSEEK)
/*
 * llseek system call: the 64-bit offset arrives split into @offset_high
 * and @offset_low, and the resulting position is copied to the user
 * pointer @result instead of being returned.
 *
 * Returns 0 on success, -EBADF for a bad @fd, -EINVAL for @whence above
 * SEEK_MAX, the (negative) vfs_llseek() error, or -EFAULT if the result
 * could not be copied to user space.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
                unsigned long, offset_low, loff_t __user *, result,
                unsigned int, whence)
{
        struct fd f = fdget_pos(fd);
        loff_t pos;
        int retval = -EINVAL;

        if (!f.file)
                return -EBADF;

        if (whence <= SEEK_MAX) {
                pos = vfs_llseek(f.file,
                                 ((loff_t) offset_high << 32) | offset_low,
                                 whence);

                if (pos < 0)
                        retval = (int)pos;
                else if (copy_to_user(result, &pos, sizeof(pos)))
                        retval = -EFAULT;
                else
                        retval = 0;
        }

        fdput_pos(f);
        return retval;
}
#endif

/*
 * rw_verify_area - sanity and permission checks ahead of a read or write.
 * @read_write:        READ selects MAY_READ, anything else selects MAY_WRITE
 * @file:        file being accessed
 * @ppos:        position of the access, or NULL to skip the range checks
 * @count:        number of bytes requested
 *
 * Returns 0 if the access may proceed, -EINVAL or -EOVERFLOW if the
 * position/count combination is unacceptable, or the error returned by
 * security_file_permission() / fsnotify_file_area_perm().
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
        int mask = read_write == READ ? MAY_READ : MAY_WRITE;
        int ret;

        /* A count that is negative when viewed as ssize_t is never valid. */
        if (unlikely((ssize_t) count < 0))
                return -EINVAL;

        if (ppos) {
                loff_t pos = *ppos;

                /*
                 * Negative positions, and ranges whose end wraps negative,
                 * are only tolerated for files using unsigned offsets.
                 */
                if (unlikely(pos < 0)) {
                        if (!unsigned_offsets(file))
                                return -EINVAL;
                        if (count >= -pos) /* both values are in 0..LLONG_MAX */
                                return -EOVERFLOW;
                } else if (unlikely((loff_t) (pos + count) < 0)) {
                        if (!unsigned_offsets(file))
                                return -EINVAL;
                }
        }

        /* Security hook first; the fsnotify permission check has the last word. */
        ret = security_file_permission(file, mask);
        if (ret)
                return ret;

        return fsnotify_file_area_perm(file, mask, ppos, count);
}
EXPORT_SYMBOL(rw_verify_area);

/*
 * Synchronous read through ->read_iter: wraps the user buffer in a
 * single-buffer iterator and a sync kiocb starting at *@ppos (or 0 when
 * @ppos is NULL).  The position is written back through @ppos regardless
 * of the result.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
        struct iov_iter to;
        struct kiocb iocb;
        ssize_t nread;

        init_sync_kiocb(&iocb, filp);
        if (ppos)
                iocb.ki_pos = *ppos;
        else
                iocb.ki_pos = 0;
        iov_iter_ubuf(&to, ITER_DEST, buf, len);

        nread = filp->f_op->read_iter(&iocb, &to);
        /* A sync kiocb must never be queued for async completion. */
        BUG_ON(nread == -EIOCBQUEUED);

        if (ppos)
                *ppos = iocb.ki_pos;
        return nread;
}

/*
 * Emit a rate-limited warning that the in-kernel operation @op is not
 * supported on @file, naming the current task.  Always returns -EINVAL so
 * callers can return the result directly.
 */
static int warn_unsupported(struct file *file, const char *op)
{
        pr_warn_ratelimited("kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
                            op, file, current->pid, current->comm);

        return -EINVAL;
}

/*
 * __kernel_read - read from @file into the kernel buffer @buf.
 *
 * At most MAX_RW_COUNT bytes are requested.  The read starts at *@pos (or
 * 0 when @pos is NULL); *@pos is only updated when data was actually
 * read.  No rw_verify_area() check is done here.
 *
 * Returns the ->read_iter result, or -EINVAL if the file is not readable
 * this way.
 */
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
        struct kvec vec = {
                .iov_base        = buf,
                .iov_len        = min_t(size_t, count, MAX_RW_COUNT),
        };
        struct iov_iter to;
        struct kiocb iocb;
        ssize_t nread;

        if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
                return -EINVAL;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;
        /*
         * Require ->read_iter, and refuse files that also wire up ->read:
         * having both implies very convoluted semantics.
         */
        if (unlikely(!file->f_op->read_iter || file->f_op->read))
                return warn_unsupported(file, "read");

        init_sync_kiocb(&iocb, file);
        if (pos)
                iocb.ki_pos = *pos;
        else
                iocb.ki_pos = 0;
        iov_iter_kvec(&to, ITER_DEST, &vec, 1, vec.iov_len);

        nread = file->f_op->read_iter(&iocb, &to);
        if (nread > 0) {
                if (pos)
                        *pos = iocb.ki_pos;
                fsnotify_access(file);
                add_rchar(current, nread);
        }
        /* The call is accounted whether or not it returned data. */
        inc_syscr(current);
        return nread;
}

/*
 * kernel_read - checked variant of __kernel_read().
 *
 * Runs rw_verify_area() for a READ of @count bytes at @pos first and only
 * performs the read when that check returns 0.
 */
ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
        ssize_t err = rw_verify_area(READ, file, pos, count);

        if (err)
                return err;

        return __kernel_read(file, buf, count, pos);
}
EXPORT_SYMBOL(kernel_read);

/*
 * vfs_read - read from @file into the user buffer @buf.
 *
 * Validates the file mode, the user buffer and the requested area, clamps
 * @count to MAX_RW_COUNT and then uses ->read if present, otherwise
 * ->read_iter via new_sync_read().
 *
 * Returns the number of bytes read or a negative errno: -EBADF if the
 * file is not open for reading, -EINVAL if it cannot be read, -EFAULT for
 * a bad user buffer, or whatever the checks / read methods return.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
        ssize_t nread;

        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;
        if (unlikely(!access_ok(buf, count)))
                return -EFAULT;

        nread = rw_verify_area(READ, file, pos, count);
        if (nread)
                return nread;

        if (count > MAX_RW_COUNT)
                count = MAX_RW_COUNT;

        /* ->read takes precedence over ->read_iter when both exist. */
        if (file->f_op->read) {
                nread = file->f_op->read(file, buf, count, pos);
        } else if (file->f_op->read_iter) {
                nread = new_sync_read(file, buf, count, pos);
        } else {
                nread = -EINVAL;
        }

        if (nread > 0) {
                fsnotify_access(file);
                add_rchar(current, nread);
        }
        /* The call is accounted whether or not it returned data. */
        inc_syscr(current);
        return nread;
}

/*
 * Perform a synchronous write of the user buffer @buf through the file's
 * ->write_iter method.
 *
 * The starting position is *@ppos, or 0 when @ppos is NULL.  *@ppos is only
 * updated when the method reports that more than zero bytes were written.
 */
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
        struct iov_iter from;
        struct kiocb iocb;
        ssize_t written;

        init_sync_kiocb(&iocb, filp);
        if (ppos)
                iocb.ki_pos = *ppos;
        else
                iocb.ki_pos = 0;
        iov_iter_ubuf(&from, ITER_SOURCE, (void __user *)buf, len);

        written = filp->f_op->write_iter(&iocb, &from);
        /* A sync kiocb was used, so a queued result is treated as fatal. */
        BUG_ON(written == -EIOCBQUEUED);
        if (ppos && written > 0)
                *ppos = iocb.ki_pos;

        return written;
}

/*
 * Write the contents of @from to @file via ->write_iter.
 *
 * The caller is responsible for file_start_write()/file_end_write().
 *
 * Starts at *@pos (0 if @pos is NULL) and stores the new position back only
 * when more than zero bytes were written; in that case fsnotify_modify() and
 * add_wchar() are called as well.  inc_syscw() is called once ->write_iter
 * has been invoked, whatever it returned.
 */
ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
{
        struct kiocb iocb;
        ssize_t written;

        /* Writing a file that lacks FMODE_WRITE warns once. */
        if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;
        /*
         * Refuse files without ->write_iter, and also files that wire up
         * both ->write_iter and ->write, as that implies very convoluted
         * semantics.
         */
        if (unlikely(!file->f_op->write_iter || file->f_op->write))
                return warn_unsupported(file, "write");

        init_sync_kiocb(&iocb, file);
        iocb.ki_pos = 0;
        if (pos)
                iocb.ki_pos = *pos;

        written = file->f_op->write_iter(&iocb, from);
        if (written > 0) {
                if (pos)
                        *pos = iocb.ki_pos;
                fsnotify_modify(file);
                add_wchar(current, written);
        }
        inc_syscw(current);

        return written;
}

/*
 * Write up to @count bytes from the kernel buffer @buf to @file.
 *
 * The caller is responsible for file_start_write()/file_end_write().
 *
 * The buffer is wrapped in a single-segment kvec iterator whose length is
 * capped at MAX_RW_COUNT, and the actual write is done by
 * __kernel_write_iter().
 */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
        struct kvec vec = {
                .iov_base        = (void *)buf,
                .iov_len        = min_t(size_t, count, MAX_RW_COUNT),
        };
        struct iov_iter from;

        iov_iter_kvec(&from, ITER_SOURCE, &vec, 1, vec.iov_len);

        return __kernel_write_iter(file, &from, pos);
}
/*
 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
 * but autofs is one of the few internal kernel users that actually
 * wants this _and_ can be built as a module. So we need to export
 * this symbol for autofs, even though it really isn't appropriate
 * for any other kernel modules.
 */
EXPORT_SYMBOL_GPL(__kernel_write);

/*
 * Write up to @count bytes from the kernel buffer @buf to @file.
 *
 * A non-zero rw_verify_area() result is returned unchanged.  Otherwise the
 * write is performed by __kernel_write(), bracketed by file_start_write()
 * and file_end_write().
 */
ssize_t kernel_write(struct file *file, const void *buf, size_t count,
                            loff_t *pos)
{
        ssize_t written = rw_verify_area(WRITE, file, pos, count);

        if (written)
                return written;

        file_start_write(file);
        written = __kernel_write(file, buf, count, pos);
        file_end_write(file);

        return written;
}
EXPORT_SYMBOL(kernel_write);

/*
 * Write up to @count bytes from the user buffer @buf to @file.
 *
 * Fails with -EBADF when FMODE_WRITE is missing, -EINVAL when
 * FMODE_CAN_WRITE is missing or the file has neither ->write nor
 * ->write_iter, and -EFAULT when access_ok() rejects the user range.  A
 * non-zero rw_verify_area() result is returned as-is.  Requests above
 * MAX_RW_COUNT are clamped.
 *
 * The file method runs between file_start_write() and file_end_write().
 * When at least one byte was written, fsnotify_modify() and add_wchar() are
 * called; inc_syscw() is called for every request that passed validation.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
        ssize_t written;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;
        if (unlikely(!access_ok(buf, count)))
                return -EFAULT;

        written = rw_verify_area(WRITE, file, pos, count);
        if (written)
                return written;

        /* Never pass more than MAX_RW_COUNT bytes to the file method. */
        if (count > MAX_RW_COUNT)
                count = MAX_RW_COUNT;

        file_start_write(file);

        /* ->write is preferred over ->write_iter when both are present. */
        if (file->f_op->write) {
                written = file->f_op->write(file, buf, count, pos);
        } else if (file->f_op->write_iter) {
                written = new_sync_write(file, buf, count, pos);
        } else {
                written = -EINVAL;
        }

        if (written > 0) {
                fsnotify_modify(file);
                add_wchar(current, written);
        }
        inc_syscw(current);

        file_end_write(file);

        return written;
}

/*
 * file_ppos - pick the position pointer to use for @file
 *
 * Returns NULL when the file has FMODE_STREAM set, otherwise the address of
 * file->f_pos.
 */
static inline loff_t *file_ppos(struct file *file)
{
        if (file->f_mode & FMODE_STREAM)
                return NULL;

        return &file->f_pos;
}

/*
 * Read from descriptor @fd into the user buffer @buf.
 *
 * Returns -EBADF when @fd does not resolve to a file.  For files that have
 * a position (file_ppos() is non-NULL) the read works on a local copy of
 * f_pos, which is written back only when vfs_read() did not fail.
 */
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
        struct fd f = fdget_pos(fd);
        loff_t local_pos, *ppos;
        ssize_t nread;

        if (!f.file)
                return -EBADF;

        ppos = file_ppos(f.file);
        if (ppos) {
                /* Work on a copy; commit it below on success only. */
                local_pos = *ppos;
                ppos = &local_pos;
        }

        nread = vfs_read(f.file, buf, count, ppos);
        if (ppos && nread >= 0)
                f.file->f_pos = local_pos;
        fdput_pos(f);

        return nread;
}

/* read system call entry point: forwards its arguments to ksys_read(). */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
        return ksys_read(fd, buf, count);
}

/*
 * Write the user buffer @buf to descriptor @fd.
 *
 * Returns -EBADF when @fd does not resolve to a file.  For files that have
 * a position (file_ppos() is non-NULL) the write works on a local copy of
 * f_pos, which is written back only when vfs_write() did not fail.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
        struct fd f = fdget_pos(fd);
        loff_t local_pos, *ppos;
        ssize_t written;

        if (!f.file)
                return -EBADF;

        ppos = file_ppos(f.file);
        if (ppos) {
                /* Work on a copy; commit it below on success only. */
                local_pos = *ppos;
                ppos = &local_pos;
        }

        written = vfs_write(f.file, buf, count, ppos);
        if (ppos && written >= 0)
                f.file->f_pos = local_pos;
        fdput_pos(f);

        return written;
}

/* write system call entry point: forwards its arguments to ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
                size_t, count)
{
        return ksys_write(fd, buf, count);
}

/*
 * Read from descriptor @fd at the explicit offset @pos.
 *
 * Returns -EINVAL for a negative @pos, -EBADF when @fd does not resolve to
 * a file and -ESPIPE when the file lacks FMODE_PREAD.  @pos is a by-value
 * argument, so the file's own f_pos is not touched here.
 */
ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
                     loff_t pos)
{
        struct fd f;
        ssize_t nread;

        if (pos < 0)
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;

        if (f.file->f_mode & FMODE_PREAD)
                nread = vfs_read(f.file, buf, count, &pos);
        else
                nread = -ESPIPE;
        fdput(f);

        return nread;
}

/* pread64 system call entry point: forwards its arguments to ksys_pread64(). */
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
                        size_t, count, loff_t, pos)
{
        return ksys_pread64(fd, buf, count, pos);
}

/*
 * Compat pread64 entry point, built only when CONFIG_COMPAT is enabled and
 * the architecture defines __ARCH_WANT_COMPAT_PREAD64.  The offset arrives
 * through compat_arg_u64_dual() and is reassembled with
 * compat_arg_u64_glue() before being handed to ksys_pread64().
 */
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
                       size_t, count, compat_arg_u64_dual(pos))
{
        return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif

/*
 * Write the user buffer @buf to descriptor @fd at the explicit offset @pos.
 *
 * Returns -EINVAL for a negative @pos, -EBADF when @fd does not resolve to
 * a file and -ESPIPE when the file lacks FMODE_PWRITE.  @pos is a by-value
 * argument, so the file's own f_pos is not touched here.
 */
ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
                      size_t count, loff_t pos)
{
        struct fd f;
        ssize_t written;

        if (pos < 0)
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;

        if (f.file->f_mode & FMODE_PWRITE)
                written = vfs_write(f.file, buf, count, &pos);
        else
                written = -ESPIPE;
        fdput(f);

        return written;
}

/* pwrite64 system call entry point: forwards its arguments to ksys_pwrite64(). */
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
                         size_t, count, loff_t, pos)
{
        return ksys_pwrite64(fd, buf, count, pos);
}

/*
 * Compat pwrite64 entry point, built only when CONFIG_COMPAT is enabled and
 * the architecture defines __ARCH_WANT_COMPAT_PWRITE64.  The offset arrives
 * through compat_arg_u64_dual() and is reassembled with
 * compat_arg_u64_glue() before being handed to ksys_pwrite64().
 */
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
                       size_t, count, compat_arg_u64_dual(pos))
{
        return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif

/*
 * Issue one synchronous ->read_iter() or ->write_iter() call on @filp.
 *
 * A sync kiocb is prepared and @flags are applied to it with
 * kiocb_set_rw_flags(); a non-zero result from that is returned as-is.  The
 * starting position is *@ppos, or 0 when @ppos is NULL.  @type selects the
 * method: READ uses ->read_iter, anything else uses ->write_iter.  After the
 * call the kiocb position is stored back through @ppos (when given),
 * regardless of the method's return value.
 */
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
                loff_t *ppos, int type, rwf_t flags)
{
        struct kiocb iocb;
        ssize_t res;

        init_sync_kiocb(&iocb, filp);
        res = kiocb_set_rw_flags(&iocb, flags);
        if (res)
                return res;

        iocb.ki_pos = 0;
        if (ppos)
                iocb.ki_pos = *ppos;

        if (type == READ) {
                res = filp->f_op->read_iter(&iocb, iter);
        } else {
                res = filp->f_op->write_iter(&iocb, iter);
        }
        /* A sync kiocb was used, so a queued result is treated as fatal. */
        BUG_ON(res == -EIOCBQUEUED);

        if (ppos)
                *ppos = iocb.ki_pos;

        return res;
}

/*
 * Do it by hand, with file-ops: walk @iter one segment at a time and call
 * the file's ->read (for READ) or ->write (otherwise) on each segment.
 *
 * Any flag other than RWF_HIPRI yields -EOPNOTSUPP.  The loop stops at the
 * first error or short transfer.  The return value is the number of bytes
 * transferred so far, or the error code if nothing was transferred before
 * the error occurred.
 */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
                loff_t *ppos, int type, rwf_t flags)
{
        ssize_t total = 0;

        if (flags & ~RWF_HIPRI)
                return -EOPNOTSUPP;

        while (iov_iter_count(iter)) {
                ssize_t nr;

                if (type == READ)
                        nr = filp->f_op->read(filp, iter_iov_addr(iter),
                                                iter_iov_len(iter), ppos);
                else
                        nr = filp->f_op->write(filp, iter_iov_addr(iter),
                                                iter_iov_len(iter), ppos);

                /* Report the error only if no data was moved before it. */
                if (nr < 0)
                        return total ? total : nr;

                total += nr;

                /* A short transfer on this segment ends the walk. */
                if (nr != iter_iov_len(iter))
                        return total;

                iov_iter_advance(iter, nr);
        }

        return total;
}

/*
 * Read into @iter from @file using the caller-supplied @iocb.
 *
 * Returns -EINVAL when the file has no ->read_iter or lacks FMODE_CAN_READ,
 * and -EBADF when it lacks FMODE_READ.  An empty iterator skips both the
 * rw_verify_area() check and the ->read_iter call and yields 0.
 * fsnotify_access() is called whenever the final result is non-negative,
 * which includes the empty-iterator case.
 */
ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
                           struct iov_iter *iter)
{
        ssize_t nread = 0;
        size_t len;

        if (!file->f_op->read_iter)
                return -EINVAL;
        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        len = iov_iter_count(iter);
        if (len) {
                nread = rw_verify_area(READ, file, &iocb->ki_pos, len);
                if (nread < 0)
                        return nread;

                nread = file->f_op->read_iter(iocb, iter);
        }

        if (nread >= 0)
                fsnotify_access(file);

        return nread;
}
EXPORT_SYMBOL(vfs_iocb_iter_read);

/*
 * Read into @iter from @file at *@ppos, synchronously.
 *
 * Returns -EINVAL when the file has no ->read_iter or lacks FMODE_CAN_READ,
 * and -EBADF when it lacks FMODE_READ.  An empty iterator skips both the
 * rw_verify_area() check and the read itself and yields 0.  The read is
 * carried out by do_iter_readv_writev().  fsnotify_access() is called
 * whenever the final result is non-negative, which includes the
 * empty-iterator case.
 */
ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
                      rwf_t flags)
{
        ssize_t nread = 0;
        size_t len;

        if (!file->f_op->read_iter)
                return -EINVAL;
        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        len = iov_iter_count(iter);
        if (len) {
                nread = rw_verify_area(READ, file, ppos, len);
                if (nread < 0)
                        return nread;

                nread = do_iter_readv_writev(file, iter, ppos, READ, flags);
        }

        if (nread >= 0)
                fsnotify_access(file);

        return nread;
}
EXPORT_SYMBOL(vfs_iter_read);

/*
 * Write the contents of @iter to @file using the caller-supplied @iocb.
 *
 * Caller is responsible for calling kiocb_end_write() on completion
 * if async iocb was queued.
 *
 * Returns -EINVAL when the file has no ->write_iter or lacks
 * FMODE_CAN_WRITE, -EBADF when it lacks FMODE_WRITE, and 0 for an empty
 * iterator.  fsnotify_modify() is called when more than zero bytes were
 * written.
 */
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                            struct iov_iter *iter)
{
        ssize_t written;
        size_t len;

        if (!file->f_op->write_iter)
                return -EINVAL;
        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;

        len = iov_iter_count(iter);
        if (!len)
                return 0;

        written = rw_verify_area(WRITE, file, &iocb->ki_pos, len);
        if (written < 0)
                return written;

        kiocb_start_write(iocb);
        written = file->f_op->write_iter(iocb, iter);
        /* A queued request is ended later by the caller, not here. */
        if (written != -EIOCBQUEUED)
                kiocb_end_write(iocb);

        if (written > 0)
                fsnotify_modify(file);

        return written;
}
EXPORT_SYMBOL(vfs_iocb_iter_write);

/*
 * Write the contents of @iter to @file at *@ppos, synchronously.
 *
 * Returns -EBADF when the file lacks FMODE_WRITE, -EINVAL when it lacks
 * FMODE_CAN_WRITE or has no ->write_iter, and 0 for an empty iterator.  The
 * write is carried out by do_iter_readv_writev() between file_start_write()
 * and file_end_write(); fsnotify_modify() is called when more than zero
 * bytes were written.
 */
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
                       rwf_t flags)
{
        ssize_t written;
        size_t len;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;
        if (!file->f_op->write_iter)
                return -EINVAL;

        len = iov_iter_count(iter);
        if (!len)
                return 0;

        written = rw_verify_area(WRITE, file, ppos, len);
        if (written < 0)
                return written;

        file_start_write(file);
        written = do_iter_readv_writev(file, iter, ppos, WRITE, flags);
        if (written > 0)
                fsnotify_modify(file);
        file_end_write(file);

        return written;
}
EXPORT_SYMBOL(vfs_iter_write);

/*
 * Vectored read from @file into the user iovec array @vec of @vlen entries.
 *
 * Returns -EBADF when the file lacks FMODE_READ and -EINVAL when it lacks
 * FMODE_CAN_READ; a failing import_iovec() result is returned as-is.  The
 * data is read through ->read_iter when the file provides it, otherwise
 * segment by segment via do_loop_readv_writev().  fsnotify_access() is
 * called whenever the final result is non-negative.  After a successful
 * import, iov is passed to kfree() on every path.
 */
static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
                         unsigned long vlen, loff_t *pos, rwf_t flags)
{
        struct iovec fast_iov[UIO_FASTIOV];
        struct iovec *iov = fast_iov;
        struct iov_iter iter;
        size_t len;
        ssize_t nread;

        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        nread = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(fast_iov), &iov,
                             &iter);
        if (nread < 0)
                return nread;

        len = iov_iter_count(&iter);
        if (len) {
                nread = rw_verify_area(READ, file, pos, len);
                if (nread >= 0) {
                        if (file->f_op->read_iter)
                                nread = do_iter_readv_writev(file, &iter, pos,
                                                             READ, flags);
                        else
                                nread = do_loop_readv_writev(file, &iter, pos,
                                                             READ, flags);
                }
        }

        if (nread >= 0)
                fsnotify_access(file);
        kfree(iov);

        return nread;
}

/*
 * Vectored write to @file from the user iovec array @vec of @vlen entries.
 *
 * Returns -EBADF when the file lacks FMODE_WRITE and -EINVAL when it lacks
 * FMODE_CAN_WRITE; a failing import_iovec() result is returned as-is.  The
 * data is written through ->write_iter when the file provides it, otherwise
 * segment by segment via do_loop_readv_writev(), in both cases between
 * file_start_write() and file_end_write().  fsnotify_modify() is called
 * when more than zero bytes were written.  After a successful import, iov is
 * passed to kfree() on every path.
 */
static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
                          unsigned long vlen, loff_t *pos, rwf_t flags)
{
        struct iovec fast_iov[UIO_FASTIOV];
        struct iovec *iov = fast_iov;
        struct iov_iter iter;
        size_t len;
        ssize_t written;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;

        written = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(fast_iov),
                               &iov, &iter);
        if (written < 0)
                return written;

        len = iov_iter_count(&iter);
        if (len) {
                written = rw_verify_area(WRITE, file, pos, len);
                if (written >= 0) {
                        file_start_write(file);
                        if (file->f_op->write_iter)
                                written = do_iter_readv_writev(file, &iter,
                                                               pos, WRITE,
                                                               flags);
                        else
                                written = do_loop_readv_writev(file, &iter,
                                                               pos, WRITE,
                                                               flags);
                        if (written > 0)
                                fsnotify_modify(file);
                        file_end_write(file);
                }
        }

        kfree(iov);

        return written;
}

/*
 * Vectored read on descriptor @fd using the file's own position.
 *
 * Yields -EBADF when @fd does not resolve to a file.  For files that have a
 * position the read works on a local copy of f_pos, written back only when
 * vfs_readv() did not fail.  add_rchar() is called for positive results and
 * inc_syscr() is called on every invocation, including the -EBADF case.
 */
static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
                        unsigned long vlen, rwf_t flags)
{
        struct fd f = fdget_pos(fd);
        ssize_t nread = -EBADF;

        if (f.file) {
                loff_t *ppos = file_ppos(f.file);
                loff_t local_pos;

                if (ppos) {
                        /* Work on a copy; commit it below on success only. */
                        local_pos = *ppos;
                        ppos = &local_pos;
                }
                nread = vfs_readv(f.file, vec, vlen, ppos, flags);
                if (ppos && nread >= 0)
                        f.file->f_pos = local_pos;
                fdput_pos(f);
        }

        if (nread > 0)
                add_rchar(current, nread);
        inc_syscr(current);

        return nread;
}

/*
 * Vectored write on descriptor @fd using the file's own position.
 *
 * Yields -EBADF when @fd does not resolve to a file.  For files that have a
 * position the write works on a local copy of f_pos, written back only when
 * vfs_writev() did not fail.  add_wchar() is called for positive results and
 * inc_syscw() is called on every invocation, including the -EBADF case.
 */
static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
                         unsigned long vlen, rwf_t flags)
{
        struct fd f = fdget_pos(fd);
        ssize_t written = -EBADF;

        if (f.file) {
                loff_t *ppos = file_ppos(f.file);
                loff_t local_pos;

                if (ppos) {
                        /* Work on a copy; commit it below on success only. */
                        local_pos = *ppos;
                        ppos = &local_pos;
                }
                written = vfs_writev(f.file, vec, vlen, ppos, flags);
                if (ppos && written >= 0)
                        f.file->f_pos = local_pos;
                fdput_pos(f);
        }

        if (written > 0)
                add_wchar(current, written);
        inc_syscw(current);

        return written;
}

static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}

/*
 * Vectored read on descriptor @fd at the explicit offset @pos.
 *
 * A negative @pos returns -EINVAL immediately, before any accounting.
 * Otherwise the result is -EBADF when @fd does not resolve to a file,
 * -ESPIPE when the file lacks FMODE_PREAD, or whatever vfs_readv() returns.
 * add_rchar() is called for positive results and inc_syscr() is called once
 * the offset check has passed.
 */
static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
                         unsigned long vlen, loff_t pos, rwf_t flags)
{
        ssize_t nread = -EBADF;
        struct fd f;

        if (pos < 0)
                return -EINVAL;

        f = fdget(fd);
        if (f.file) {
                if (f.file->f_mode & FMODE_PREAD)
                        nread = vfs_readv(f.file, vec, vlen, &pos, flags);
                else
                        nread = -ESPIPE;
                fdput(f);
        }

        if (nread > 0)
                add_rchar(current, nread);
        inc_syscr(current);

        return nread;
}

/*
 * Vectored write on descriptor @fd at the explicit offset @pos.
 *
 * A negative @pos returns -EINVAL immediately, before any accounting.
 * Otherwise the result is -EBADF when @fd does not resolve to a file,
 * -ESPIPE when the file lacks FMODE_PWRITE, or whatever vfs_writev()
 * returns.  add_wchar() is called for positive results and inc_syscw() is
 * called once the offset check has passed.
 */
static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
                          unsigned long vlen, loff_t pos, rwf_t flags)
{
        ssize_t written = -EBADF;
        struct fd f;

        if (pos < 0)
                return -EINVAL;

        f = fdget(fd);
        if (f.file) {
                if (f.file->f_mode & FMODE_PWRITE)
                        written = vfs_writev(f.file, vec, vlen, &pos, flags);
                else
                        written = -ESPIPE;
                fdput(f);
        }

        if (written > 0)
                add_wchar(current, written);
        inc_syscw(current);

        return written;
}

/* readv system call entry point: calls do_readv() with no RWF flags. */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen)
{
        return do_readv(fd, vec, vlen, 0);
}

/* writev system call entry point: calls do_writev() with no RWF flags. */
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen)
{
        return do_writev(fd, vec, vlen, 0);
}

/*
 * preadv system call entry point.  The offset is supplied as two unsigned
 * longs (@pos_l, @pos_h) that are combined by pos_from_hilo(); the read is
 * done by do_preadv() with no RWF flags.
 */
SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
        return do_preadv(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}

/*
 * preadv2 system call entry point.  The offset is supplied as two unsigned
 * longs (@pos_l, @pos_h) that are combined by pos_from_hilo().  A combined
 * offset of -1 routes the request to do_readv(), which works on the file's
 * own position; any other value goes to do_preadv().
 */
SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
                rwf_t, flags)
{
        const loff_t offset = pos_from_hilo(pos_h, pos_l);

        return offset == -1 ? do_readv(fd, vec, vlen, flags)
                            : do_preadv(fd, vec, vlen, offset, flags);
}

/*
 * pwritev system call entry point.  The offset is supplied as two unsigned
 * longs (@pos_l, @pos_h) that are combined by pos_from_hilo(); the write is
 * done by do_pwritev() with no RWF flags.
 */
SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
        return do_pwritev(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}

/*
 * pwritev2 system call entry point.  The offset is supplied as two unsigned
 * longs (@pos_l, @pos_h) that are combined by pos_from_hilo().  A combined
 * offset of -1 routes the request to do_writev(), which works on the file's
 * own position; any other value goes to do_pwritev().
 */
SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
                rwf_t, flags)
{
        const loff_t offset = pos_from_hilo(pos_h, pos_l);

        return offset == -1 ? do_writev(fd, vec, vlen, flags)
                            : do_pwritev(fd, vec, vlen, offset, flags);
}

/*
 * Various compat syscalls.  Note that they all pretend to take a native
 * iovec - import_iovec will properly treat those as compat_iovecs based on
 * in_compat_syscall().
 */
#ifdef CONFIG_COMPAT
/*
 * Compat preadv64 entry point, built only when the architecture defines
 * __ARCH_WANT_COMPAT_SYS_PREADV64.  The offset arrives as a single loff_t
 * and is passed to do_preadv() with no RWF flags.
 */
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos)
{
        return do_preadv(fd, vec, vlen, pos, 0);
}
#endif

/*
 * Compat preadv entry point.  The 64-bit offset is rebuilt from the two
 * 32-bit halves @pos_high and @pos_low and passed to do_preadv() with no
 * RWF flags.
 */
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
        loff_t offset = (loff_t)pos_high << 32;

        offset |= pos_low;

        return do_preadv(fd, vec, vlen, offset, 0);
}

/*
 * Compat preadv64v2 entry point, built only when the architecture defines
 * __ARCH_WANT_COMPAT_SYS_PREADV64V2.  An offset of -1 routes the request to
 * do_readv(), which works on the file's own position; any other value goes
 * to do_preadv().
 */
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
        return pos == -1 ? do_readv(fd, vec, vlen, flags)
                         : do_preadv(fd, vec, vlen, pos, flags);
}
#endif

/*
 * Compat preadv2 entry point.  The 64-bit offset is rebuilt from the two
 * 32-bit halves @pos_high and @pos_low.  A combined offset of -1 routes the
 * request to do_readv(), which works on the file's own position; any other
 * value goes to do_preadv().
 */
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
                rwf_t, flags)
{
        loff_t offset = (loff_t)pos_high << 32;

        offset |= pos_low;

        return offset == -1 ? do_readv(fd, vec, vlen, flags)
                            : do_preadv(fd, vec, vlen, offset, flags);
}

/*
 * Compat pwritev64 entry point, built only when the architecture defines
 * __ARCH_WANT_COMPAT_SYS_PWRITEV64.  The offset arrives as a single loff_t
 * and is passed to do_pwritev() with no RWF flags.
 */
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos)
{
        return do_pwritev(fd, vec, vlen, pos, 0);
}
#endif

/*
 * Compat pwritev entry point.  The 64-bit offset is rebuilt from the two
 * 32-bit halves @pos_high and @pos_low and passed to do_pwritev() with no
 * RWF flags.
 */
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
        loff_t offset = (loff_t)pos_high << 32;

        offset |= pos_low;

        return do_pwritev(fd, vec, vlen, offset, 0);
}

/*
 * Compat pwritev64v2 entry point, built only when the architecture defines
 * __ARCH_WANT_COMPAT_SYS_PWRITEV64V2.  An offset of -1 routes the request
 * to do_writev(), which works on the file's own position; any other value
 * goes to do_pwritev().
 */
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
        return pos == -1 ? do_writev(fd, vec, vlen, flags)
                         : do_pwritev(fd, vec, vlen, pos, flags);
}
#endif

/*
 * Compat pwritev2 entry point.  The 64-bit offset is rebuilt from the two
 * 32-bit halves @pos_high and @pos_low.  A combined offset of -1 routes the
 * request to do_writev(), which works on the file's own position; any other
 * value goes to do_pwritev().
 */
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
        loff_t offset = (loff_t)pos_high << 32;

        offset |= pos_low;

        return offset == -1 ? do_writev(fd, vec, vlen, flags)
                            : do_pwritev(fd, vec, vlen, offset, flags);
}
#endif /* CONFIG_COMPAT */

/*
 * Common implementation behind the sendfile system call variants.
 *
 * @out_fd: descriptor that receives the data; must have FMODE_WRITE
 * @in_fd:  descriptor the data is read from; must have FMODE_READ
 * @ppos:   when non-NULL, the input offset to start from (the input file
 *          must then have FMODE_PREAD); it is updated on success.  When
 *          NULL, the input file's f_pos is used and updated instead.
 * @count:  number of bytes requested; clamped to MAX_RW_COUNT and to the
 *          room left below @max
 * @max:    upper bound for the input offset; 0 selects the smaller of the
 *          two superblocks' s_maxbytes
 *
 * The transfer goes through splice_file_to_pipe() when get_pipe_info()
 * returns a pipe for the output file, and through do_splice_direct()
 * otherwise.
 *
 * Returns the helper's result, or a negative errno: -EBADF for a bad or
 * wrongly-opened descriptor, -ESPIPE when @ppos is given but the input file
 * lacks FMODE_PREAD, -EOVERFLOW when the starting offset is already at or
 * beyond @max or the final offset ends up beyond it.
 */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
                           size_t count, loff_t max)
{
        struct fd in, out;
        struct inode *in_inode, *out_inode;
        struct pipe_inode_info *opipe;
        loff_t pos;
        loff_t out_pos;
        ssize_t retval;
        int fl;

        /*
         * Get input file, and verify that it is ok..
         */
        retval = -EBADF;
        in = fdget(in_fd);
        if (!in.file)
                goto out;
        if (!(in.file->f_mode & FMODE_READ))
                goto fput_in;
        retval = -ESPIPE;
        if (!ppos) {
                pos = in.file->f_pos;
        } else {
                pos = *ppos;
                if (!(in.file->f_mode & FMODE_PREAD))
                        goto fput_in;
        }
        retval = rw_verify_area(READ, in.file, &pos, count);
        if (retval < 0)
                goto fput_in;
        if (count > MAX_RW_COUNT)
                count =  MAX_RW_COUNT;

        /*
         * Get output file, and verify that it is ok..
         */
        retval = -EBADF;
        out = fdget(out_fd);
        if (!out.file)
                goto fput_in;
        if (!(out.file->f_mode & FMODE_WRITE))
                goto fput_out;
        in_inode = file_inode(in.file);
        out_inode = file_inode(out.file);
        out_pos = out.file->f_pos;

        /* No explicit limit given: use the tighter of the two filesystems. */
        if (!max)
                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

        /* Shorten the request so that it ends at @max, or fail outright. */
        if (unlikely(pos + count > max)) {
                retval = -EOVERFLOW;
                if (pos >= max)
                        goto fput_out;
                count = max - pos;
        }

        fl = 0;
#if 0
        /*
         * We need to debate whether we can enable this or not. The
         * man page documents EAGAIN return for the output at least,
         * and the application is arguably buggy if it doesn't expect
         * EAGAIN on a non-blocking file descriptor.
         */
        if (in.file->f_flags & O_NONBLOCK)
                fl = SPLICE_F_NONBLOCK;
#endif
        opipe = get_pipe_info(out.file, true);
        if (!opipe) {
                /* Output is not a pipe: verify the write range, then splice. */
                retval = rw_verify_area(WRITE, out.file, &out_pos, count);
                if (retval < 0)
                        goto fput_out;
                retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
                                          count, fl);
        } else {
                /* Output is a pipe: honour its O_NONBLOCK setting. */
                if (out.file->f_flags & O_NONBLOCK)
                        fl |= SPLICE_F_NONBLOCK;

                retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
        }

        /* Accounting and position updates happen only if data was moved. */
        if (retval > 0) {
                add_rchar(current, retval);
                add_wchar(current, retval);
                fsnotify_access(in.file);
                fsnotify_modify(out.file);
                out.file->f_pos = out_pos;
                if (ppos)
                        *ppos = pos;
                else
                        in.file->f_pos = pos;
        }

        inc_syscr(current);
        inc_syscw(current);
        if (pos > max)
                retval = -EOVERFLOW;

        /* Unwind in reverse order of acquisition. */
fput_out:
        fdput(out);
fput_in:
        fdput(in);
out:
        return retval;
}

/*
 * sendfile system call entry point (off_t offset variant).
 *
 * Without @offset the transfer uses the input file's own position and no
 * explicit offset limit.  With @offset the starting position is fetched
 * from user space, the transfer is limited to MAX_NON_LFS, and the resulting
 * position is stored back to @offset; a failure of either user access
 * returns -EFAULT.
 */
SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
        loff_t pos;
        off_t off;
        ssize_t ret;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(get_user(off, offset)))
                return -EFAULT;
        pos = off;

        ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);

        /* The position is written back whatever do_sendfile() returned. */
        if (unlikely(put_user(pos, offset)))
                return -EFAULT;

        return ret;
}

/*
 * sendfile64 system call entry point (loff_t offset variant).
 *
 * Without @offset the transfer uses the input file's own position.  With
 * @offset the starting position is copied in from user space and the
 * resulting position is stored back to @offset; a failure of either user
 * access returns -EFAULT.  No explicit offset limit is passed in either
 * case.
 */
SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
{
        loff_t pos;
        ssize_t ret;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
                return -EFAULT;

        ret = do_sendfile(out_fd, in_fd, &pos, count, 0);

        /* The position is written back whatever do_sendfile() returned. */
        if (unlikely(put_user(pos, offset)))
                return -EFAULT;

        return ret;
}

#ifdef CONFIG_COMPAT
/*
 * Compat sendfile entry point (compat_off_t offset variant).
 *
 * Without @offset the transfer uses the input file's own position and no
 * explicit offset limit.  With @offset the starting position is fetched
 * from user space, the transfer is limited to MAX_NON_LFS, and the resulting
 * position is stored back to @offset; a failure of either user access
 * returns -EFAULT.
 */
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
                compat_off_t __user *, offset, compat_size_t, count)
{
        loff_t pos;
        off_t off;
        ssize_t ret;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(get_user(off, offset)))
                return -EFAULT;
        pos = off;

        ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);

        /* The position is written back whatever do_sendfile() returned. */
        if (unlikely(put_user(pos, offset)))
                return -EFAULT;

        return ret;
}

/*
 * Compat sendfile64 entry point (compat_loff_t offset variant).
 *
 * Without @offset the transfer uses the input file's own position.  With
 * @offset the starting position is copied in from user space and the
 * resulting position is stored back to @offset; a failure of either user
 * access returns -EFAULT.  No explicit offset limit is passed in either
 * case.
 */
COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
                compat_loff_t __user *, offset, compat_size_t, count)
{
        loff_t pos;
        ssize_t ret;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
                return -EFAULT;

        ret = do_sendfile(out_fd, in_fd, &pos, count, 0);

        /* The position is written back whatever do_sendfile() returned. */
        if (unlikely(put_user(pos, offset)))
                return -EFAULT;

        return ret;
}
#endif

/*
 * Performs necessary checks before doing a file copy
 *
 * Can adjust amount of bytes to copy via @req_count argument.
 * Returns appropriate error code that caller should return or
 * zero in case the copy should be allowed.
 *
 * @file_in:   source file
 * @pos_in:    starting offset in @file_in
 * @file_out:  destination file
 * @pos_out:   starting offset in @file_out
 * @req_count: in: bytes requested; out (on success only): bytes allowed
 *             after shortening to the source's EOF and to the write limits
 * @flags:     only COPY_FILE_SPLICE is examined here
 *
 * Errors produced directly by this function: -EXDEV (files not eligible for
 * a cross-file copy), -EPERM (destination inode immutable), -ETXTBSY
 * (either inode is a swapfile), -EOVERFLOW (offset + count wraps) and
 * -EINVAL (overlapping ranges within one inode).  Errors from
 * generic_file_rw_checks() and generic_write_check_limits() are passed
 * through.
 */
static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
                                    struct file *file_out, loff_t pos_out,
                                    size_t *req_count, unsigned int flags)
{
        struct inode *inode_in = file_inode(file_in);
        struct inode *inode_out = file_inode(file_out);
        uint64_t count = *req_count;
        loff_t size_in;
        int ret;

        ret = generic_file_rw_checks(file_in, file_out);
        if (ret)
                return ret;

        /*
         * We allow some filesystems to handle cross sb copy, but passing
         * a file of the wrong filesystem type to filesystem driver can result
         * in an attempt to dereference the wrong type of ->private_data, so
         * avoid doing that until we really have a good reason.
         *
         * nfs and cifs define several different file_system_type structures
         * and several different sets of file_operations, but they all end up
         * using the same ->copy_file_range() function pointer.
         */
        if (flags & COPY_FILE_SPLICE) {
                /* cross sb splice is allowed */
        } else if (file_out->f_op->copy_file_range) {
                /* Both files must share the very same ->copy_file_range. */
                if (file_in->f_op->copy_file_range !=
                    file_out->f_op->copy_file_range)
                        return -EXDEV;
        } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
                /* No ->copy_file_range on the output: same sb required. */
                return -EXDEV;
        }

        /* Don't touch certain kinds of inodes */
        if (IS_IMMUTABLE(inode_out))
                return -EPERM;

        if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
                return -ETXTBSY;

        /* Ensure offsets don't wrap. */
        if (pos_in + count < pos_in || pos_out + count < pos_out)
                return -EOVERFLOW;

        /* Shorten the copy to EOF */
        size_in = i_size_read(inode_in);
        if (pos_in >= size_in)
                count = 0;
        else
                count = min(count, size_in - (uint64_t)pos_in);

        /* May shorten count further; see generic_write_check_limits(). */
        ret = generic_write_check_limits(file_out, pos_out, &count);
        if (ret)
                return ret;

        /* Don't allow overlapped copying within the same file. */
        if (inode_in == inode_out &&
            pos_out + count > pos_in &&
            pos_out < pos_in + count)
                return -EINVAL;

        /* Only now is the (possibly shortened) count reported back. */
        *req_count = count;
        return 0;
}

/*
 * copy_file_range() differs from regular file read and write in that it
 * specifically allows return partial success.  When it does so is up to
 * the copy_file_range method.
 *
 * @file_in:  source file
 * @pos_in:   starting offset in @file_in
 * @file_out: destination file
 * @pos_out:  starting offset in @file_out
 * @len:      number of bytes requested; may be shortened by
 *            generic_copy_file_checks()
 * @flags:    only COPY_FILE_SPLICE is accepted; any other bit yields -EINVAL
 *
 * Returns the number of bytes copied (0 when the effective length is zero)
 * or a negative errno.  When more than zero bytes were copied, fsnotify
 * events are raised for both files and add_rchar()/add_wchar() are called.
 */
ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
                            struct file *file_out, loff_t pos_out,
                            size_t len, unsigned int flags)
{
        ssize_t ret;
        bool splice = flags & COPY_FILE_SPLICE;
        bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb;

        if (flags & ~COPY_FILE_SPLICE)
                return -EINVAL;

        ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
                                       flags);
        if (unlikely(ret))
                return ret;

        ret = rw_verify_area(READ, file_in, &pos_in, len);
        if (unlikely(ret))
                return ret;

        ret = rw_verify_area(WRITE, file_out, &pos_out, len);
        if (unlikely(ret))
                return ret;

        if (len == 0)
                return 0;

        file_start_write(file_out);

        /*
         * Cloning is supported by more file systems, so we implement copy on
         * same sb using clone, but for filesystems where both clone and copy
         * are supported (e.g. nfs,cifs), we only call the copy method.
         */
        if (!splice && file_out->f_op->copy_file_range) {
                ret = file_out->f_op->copy_file_range(file_in, pos_in,
                                                      file_out, pos_out,
                                                      len, flags);
        } else if (!splice && file_in->f_op->remap_file_range && samesb) {
                ret = file_in->f_op->remap_file_range(file_in, pos_in,
                                file_out, pos_out,
                                min_t(loff_t, MAX_RW_COUNT, len),
                                REMAP_FILE_CAN_SHORTEN);
                /* fallback to splice */
                if (ret <= 0)
                        splice = true;
        } else if (samesb) {
                /* Fallback to splice for same sb copy for backward compat */
                splice = true;
        }

        file_end_write(file_out);

        /* Done unless one of the branches above asked for the splice path. */
        if (!splice)
                goto done;

        /*
         * We can get here for same sb copy of filesystems that do not implement
         * ->copy_file_range() in case filesystem does not support clone or in
         * case filesystem supports clone but rejected the clone request (e.g.
         * because it was not block aligned).
         *
         * In both cases, fall back to kernel copy so we are able to maintain a
         * consistent story about which filesystems support copy_file_range()
         * and which filesystems do not, that will allow userspace tools to
         * make consistent decisions w.r.t using copy_file_range().
         *
         * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
         * for server-side-copy between any two sb.
         *
         * In any case, we call do_splice_direct() and not splice_file_range(),
         * without file_start_write() held, to avoid possible deadlocks related
         * to splicing from input file, while file_start_write() is held on
         * the output file on a different sb.
         */
        ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
                               min_t(size_t, len, MAX_RW_COUNT), 0);
done:
        if (ret > 0) {
                fsnotify_access(file_in);
                add_rchar(current, ret);
                fsnotify_modify(file_out);
                add_wchar(current, ret);
        }

        inc_syscr(current);
        inc_syscw(current);

        return ret;
}
EXPORT_SYMBOL(vfs_copy_file_range);

/*
 * copy_file_range system call entry point.
 *
 * For each side, a non-NULL offset pointer (@off_in / @off_out) supplies the
 * starting position from user space and receives the updated position
 * afterwards; a NULL pointer means the file's own f_pos is used and updated.
 * Positions are only advanced when more than zero bytes were copied.
 *
 * Returns the number of bytes copied or a negative errno: -EBADF when either
 * descriptor does not resolve to a file, -EFAULT when an offset cannot be
 * copied from or to user space, -EINVAL when @flags is non-zero, otherwise
 * the result of vfs_copy_file_range().
 */
SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
                int, fd_out, loff_t __user *, off_out,
                size_t, len, unsigned int, flags)
{
        loff_t pos_in;
        loff_t pos_out;
        struct fd f_in;
        struct fd f_out;
        ssize_t ret = -EBADF;

        f_in = fdget(fd_in);
        if (!f_in.file)
                goto out2;

        f_out = fdget(fd_out);
        if (!f_out.file)
                goto out1;

        /* Any failure while fetching the offsets is reported as -EFAULT. */
        ret = -EFAULT;
        if (off_in) {
                if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
                        goto out;
        } else {
                pos_in = f_in.file->f_pos;
        }

        if (off_out) {
                if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
                        goto out;
        } else {
                pos_out = f_out.file->f_pos;
        }

        /* No flags are accepted from user space. */
        ret = -EINVAL;
        if (flags != 0)
                goto out;

        ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
                                  flags);
        if (ret > 0) {
                pos_in += ret;
                pos_out += ret;

                /* Publish the advanced positions to wherever they came from. */
                if (off_in) {
                        if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
                                ret = -EFAULT;
                } else {
                        f_in.file->f_pos = pos_in;
                }

                if (off_out) {
                        if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
                                ret = -EFAULT;
                } else {
                        f_out.file->f_pos = pos_out;
                }
        }

        /* Release the descriptors in reverse order of acquisition. */
out:
        fdput(f_out);
out1:
        fdput(f_in);
out2:
        return ret;
}

/*
 * Don't operate on ranges the page cache doesn't support, and don't exceed the
 * LFS limits.  If pos is under the limit it becomes a short access.  If it
 * exceeds the limit we return -EFBIG.
 */
int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
{
        const struct inode *host = file->f_mapping->host;
        loff_t size_cap = host->i_sb->s_maxbytes;
        loff_t fsize_rlimit = rlimit(RLIMIT_FSIZE);

        /* RLIMIT_FSIZE: starting at or past it raises SIGXFSZ and fails. */
        if (fsize_rlimit != RLIM_INFINITY) {
                if (pos >= fsize_rlimit) {
                        send_sig(SIGXFSZ, current, 0);
                        return -EFBIG;
                }
                *count = min(*count, fsize_rlimit - pos);
        }

        /* Files opened without O_LARGEFILE are capped at MAX_NON_LFS. */
        if (!(file->f_flags & O_LARGEFILE))
                size_cap = MAX_NON_LFS;

        if (unlikely(pos >= size_cap))
                return -EFBIG;

        /* A write that starts below the cap is shortened to fit under it. */
        *count = min(*count, size_cap - pos);
        return 0;
}
EXPORT_SYMBOL_GPL(generic_write_check_limits);

/* Like generic_write_checks(), but takes size of write instead of iter. */
int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
{
        struct file *filp = iocb->ki_filp;
        struct inode *host = filp->f_mapping->host;

        if (IS_SWAPFILE(host))
                return -ETXTBSY;

        /* Zero-length writes need no further checking. */
        if (*count == 0)
                return 0;

        /* Appending writes always start at the current end of file. */
        if (iocb->ki_flags & IOCB_APPEND)
                iocb->ki_pos = i_size_read(host);

        /*
         * IOCB_NOWAIT is only accepted together with IOCB_DIRECT or on files
         * whose operations advertise FOP_BUFFER_WASYNC.
         */
        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!(iocb->ki_flags & IOCB_DIRECT) &&
                    !(filp->f_op->fop_flags & FOP_BUFFER_WASYNC))
                        return -EINVAL;
        }

        return generic_write_check_limits(filp, iocb->ki_pos, count);
}
EXPORT_SYMBOL(generic_write_checks_count);

/*
 * Performs necessary checks before doing a write
 *
 * Can adjust writing position or amount of bytes to write.
 * Returns appropriate error code that caller should return or
 * zero in case that write should be allowed.
 */
ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
        loff_t allowed = iov_iter_count(from);
        int err = generic_write_checks_count(iocb, &allowed);

        if (err)
                return err;

        /* Shrink the iterator to the permitted size and report what is left. */
        iov_iter_truncate(from, allowed);
        return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);

/*
 * Performs common checks before doing a file copy/clone
 * from @file_in to @file_out.
 */
int generic_file_rw_checks(struct file *file_in, struct file *file_out)
{
        struct inode *inode_in = file_inode(file_in);
        struct inode *inode_out = file_inode(file_out);

        /* Don't copy dirs, pipes, sockets... */
        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
                return -EINVAL;

        if (!(file_in->f_mode & FMODE_READ) ||
            !(file_out->f_mode & FMODE_WRITE) ||
            (file_out->f_flags & O_APPEND))
                return -EBADF;

        return 0;
}




















































































































































































































































































































































































































































































































































































































































































































































    2 


    1 











































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
// SPDX-License-Identifier: GPL-2.0
/*
 * drivers/base/power/wakeup.c - System wakeup events framework
 *
 * Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
 */
#define pr_fmt(fmt) "PM: " fmt

#include <linux/device.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/capability.h>
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pm_wakeirq.h>
#include <trace/events/power.h>

#include "power.h"

/*
 * Iterate over an RCU-protected list; the extra argument passed to
 * list_for_each_entry_rcu() is the condition that wakeup_srcu is held for
 * reading.
 */
#define list_for_each_entry_rcu_locked(pos, head, member) \
        list_for_each_entry_rcu(pos, head, member, \
                srcu_read_lock_held(&wakeup_srcu))
/*
 * If set, the suspend/hibernate code will abort transitions to a sleep state
 * if wakeup events are registered during or immediately before the transition.
 */
bool events_check_enabled __read_mostly;

/* First wakeup IRQ seen by the kernel in the last cycle. */
static unsigned int wakeup_irq[2] __read_mostly;
static DEFINE_RAW_SPINLOCK(wakeup_irq_lock);

/* If greater than 0 and the system is suspending, terminate the suspend. */
static atomic_t pm_abort_suspend __read_mostly;

/*
 * Combined counters of registered wakeup events and wakeup events in progress.
 * They need to be modified together atomically, so it's better to use one
 * atomic variable to hold them both.
 */
static atomic_t combined_event_count = ATOMIC_INIT(0);

/*
 * Layout of combined_event_count: the low IN_PROGRESS_BITS bits count wakeup
 * events in progress, the remaining high bits count registered wakeup events
 * (see split_counters()).
 */
#define IN_PROGRESS_BITS        (sizeof(int) * 4)
#define MAX_IN_PROGRESS                ((1 << IN_PROGRESS_BITS) - 1)

/* Take one snapshot of combined_event_count and unpack its two halves. */
static void split_counters(unsigned int *cnt, unsigned int *inpr)
{
        unsigned int snapshot = atomic_read(&combined_event_count);
        unsigned int registered = snapshot >> IN_PROGRESS_BITS;
        unsigned int in_progress = snapshot & MAX_IN_PROGRESS;

        *cnt = registered;
        *inpr = in_progress;
}

/* A preserved old value of the events counter. */
static unsigned int saved_count;

static DEFINE_RAW_SPINLOCK(events_lock);

static void pm_wakeup_timer_fn(struct timer_list *t);

static LIST_HEAD(wakeup_sources);

static DECLARE_WAIT_QUEUE_HEAD(wakeup_count_wait_queue);

DEFINE_STATIC_SRCU(wakeup_srcu);

static struct wakeup_source deleted_ws = {
        .name = "deleted",
        .lock =  __SPIN_LOCK_UNLOCKED(deleted_ws.lock),
};

static DEFINE_IDA(wakeup_ida);

/**
 * wakeup_source_create - Create a struct wakeup_source object.
 * @name: Name of the new wakeup source.
 */
struct wakeup_source *wakeup_source_create(const char *name)
{
        struct wakeup_source *src;
        int ida_id;

        src = kzalloc(sizeof(*src), GFP_KERNEL);
        if (!src)
                return NULL;

        /* The object keeps its own copy of the name. */
        src->name = kstrdup_const(name, GFP_KERNEL);
        if (!src->name)
                goto free_src;

        ida_id = ida_alloc(&wakeup_ida, GFP_KERNEL);
        if (ida_id < 0)
                goto free_name;

        src->id = ida_id;
        return src;

free_name:
        kfree_const(src->name);
free_src:
        kfree(src);
        return NULL;
}
EXPORT_SYMBOL_GPL(wakeup_source_create);

/*
 * Record wakeup_source statistics being deleted into a dummy wakeup_source.
 */
static void wakeup_source_record(struct wakeup_source *ws)
{
        unsigned long irqflags;

        spin_lock_irqsave(&deleted_ws.lock, irqflags);

        /* A source that never saw an event has nothing to contribute. */
        if (!ws->event_count)
                goto unlock;

        deleted_ws.total_time = ktime_add(deleted_ws.total_time,
                                          ws->total_time);
        deleted_ws.prevent_sleep_time = ktime_add(deleted_ws.prevent_sleep_time,
                                                  ws->prevent_sleep_time);

        /* Keep the larger of the two maxima. */
        if (ktime_compare(deleted_ws.max_time, ws->max_time) <= 0)
                deleted_ws.max_time = ws->max_time;

        deleted_ws.event_count += ws->event_count;
        deleted_ws.active_count += ws->active_count;
        deleted_ws.relax_count += ws->relax_count;
        deleted_ws.expire_count += ws->expire_count;
        deleted_ws.wakeup_count += ws->wakeup_count;

unlock:
        spin_unlock_irqrestore(&deleted_ws.lock, irqflags);
}

/*
 * Release everything wakeup_source_create() allocated: the ID, the copy of
 * the name and the object itself.
 */
static void wakeup_source_free(struct wakeup_source *ws)
{
        ida_free(&wakeup_ida, ws->id);
        kfree_const(ws->name);
        kfree(ws);
}

/**
 * wakeup_source_destroy - Destroy a struct wakeup_source object.
 * @ws: Wakeup source to destroy.
 *
 * Use only for wakeup source objects created with wakeup_source_create().
 */
void wakeup_source_destroy(struct wakeup_source *ws)
{
        if (ws) {
                /*
                 * Deactivate the source if it is still active, fold its
                 * statistics into deleted_ws, then release the memory.
                 */
                __pm_relax(ws);
                wakeup_source_record(ws);
                wakeup_source_free(ws);
        }
}
EXPORT_SYMBOL_GPL(wakeup_source_destroy);

/**
 * wakeup_source_add - Add given object to the list of wakeup sources.
 * @ws: Wakeup source object to add to the list.
 */
void wakeup_source_add(struct wakeup_source *ws)
{
        unsigned long flags;

        if (WARN_ON(!ws))
                return;

        /*
         * Set up the lock and the timer before the object becomes visible on
         * the list.  The timer callback installed here is also what
         * wakeup_source_not_registered() checks for.
         */
        spin_lock_init(&ws->lock);
        timer_setup(&ws->timer, pm_wakeup_timer_fn, 0);
        ws->active = false;

        /* Updates of the wakeup_sources list are done under events_lock. */
        raw_spin_lock_irqsave(&events_lock, flags);
        list_add_rcu(&ws->entry, &wakeup_sources);
        raw_spin_unlock_irqrestore(&events_lock, flags);
}
EXPORT_SYMBOL_GPL(wakeup_source_add);

/**
 * wakeup_source_remove - Remove given object from the wakeup sources list.
 * @ws: Wakeup source object to remove from the list.
 */
void wakeup_source_remove(struct wakeup_source *ws)
{
        unsigned long flags;

        if (WARN_ON(!ws))
                return;

        /* Unlink under events_lock, then wait for SRCU readers of the list. */
        raw_spin_lock_irqsave(&events_lock, flags);
        list_del_rcu(&ws->entry);
        raw_spin_unlock_irqrestore(&events_lock, flags);
        synchronize_srcu(&wakeup_srcu);

        /* Make sure the timer is neither pending nor running any more. */
        del_timer_sync(&ws->timer);
        /*
         * Clear timer.function to make wakeup_source_not_registered() treat
         * this wakeup source as not registered.
         */
        ws->timer.function = NULL;
}
EXPORT_SYMBOL_GPL(wakeup_source_remove);

/**
 * wakeup_source_register - Create wakeup source and add it to the list.
 * @dev: Device this wakeup source is associated with (or NULL if virtual).
 * @name: Name of the wakeup source to register.
 */
struct wakeup_source *wakeup_source_register(struct device *dev,
                                             const char *name)
{
        struct wakeup_source *ws = wakeup_source_create(name);

        if (!ws)
                return NULL;

        /*
         * The sysfs entry is created for virtual sources (no @dev) and for
         * devices that are already registered; a failure there undoes the
         * creation.
         */
        if (!dev || device_is_registered(dev)) {
                if (wakeup_source_sysfs_add(dev, ws)) {
                        wakeup_source_free(ws);
                        return NULL;
                }
        }

        wakeup_source_add(ws);
        return ws;
}
EXPORT_SYMBOL_GPL(wakeup_source_register);

/**
 * wakeup_source_unregister - Remove wakeup source from the list and remove it.
 * @ws: Wakeup source object to unregister.
 */
void wakeup_source_unregister(struct wakeup_source *ws)
{
        if (!ws)
                return;

        /* Take it off the list first, then drop sysfs state and the object. */
        wakeup_source_remove(ws);

        if (ws->dev)
                wakeup_source_sysfs_remove(ws);

        wakeup_source_destroy(ws);
}
EXPORT_SYMBOL_GPL(wakeup_source_unregister);

/**
 * wakeup_sources_read_lock - Lock wakeup source list for read.
 *
 * Returns an index of srcu lock for struct wakeup_srcu.
 * This index must be passed to the matching wakeup_sources_read_unlock().
 */
int wakeup_sources_read_lock(void)
{
        /* The index identifies this read-side section for the unlock call. */
        int idx = srcu_read_lock(&wakeup_srcu);

        return idx;
}
EXPORT_SYMBOL_GPL(wakeup_sources_read_lock);

/**
 * wakeup_sources_read_unlock - Unlock wakeup source list.
 * @idx: return value from corresponding wakeup_sources_read_lock()
 */
void wakeup_sources_read_unlock(int idx)
{
        /* Leave the wakeup_srcu read-side section identified by @idx. */
        srcu_read_unlock(&wakeup_srcu, idx);
}
EXPORT_SYMBOL_GPL(wakeup_sources_read_unlock);

/**
 * wakeup_sources_walk_start - Begin a walk on wakeup source list
 *
 * Returns the first object of the list of wakeup sources, or NULL if the
 * list is empty.
 *
 * Note that to be safe, wakeup sources list needs to be locked by calling
 * wakeup_sources_read_lock() for this.
 */
struct wakeup_source *wakeup_sources_walk_start(void)
{
        struct list_head *ws_head = &wakeup_sources;

        /*
         * With an empty list, ws_head->next points back at the list head, and
         * list_entry_rcu() would turn that into a bogus wakeup_source pointer.
         * Return NULL instead, consistent with wakeup_sources_walk_next().
         */
        return list_first_or_null_rcu(ws_head, struct wakeup_source, entry);
}
EXPORT_SYMBOL_GPL(wakeup_sources_walk_start);

/**
 * wakeup_sources_walk_next - Get next wakeup source from the list
 * @ws: Previous wakeup source object
 *
 * Returns the object following @ws on the list of wakeup sources, or NULL
 * when @ws is the last one (the result of list_next_or_null_rcu()).
 *
 * Note that to be safe, wakeup sources list needs to be locked by calling
 * wakeup_sources_read_lock() for this.
 */
struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws)
{
        struct list_head *ws_head = &wakeup_sources;

        return list_next_or_null_rcu(ws_head, &ws->entry,
                                struct wakeup_source, entry);
}
EXPORT_SYMBOL_GPL(wakeup_sources_walk_next);

/**
 * device_wakeup_attach - Attach a wakeup source object to a device object.
 * @dev: Device to handle.
 * @ws: Wakeup source object to attach to @dev.
 *
 * This causes @dev to be treated as a wakeup device.
 */
static int device_wakeup_attach(struct device *dev, struct wakeup_source *ws)
{
        int ret = 0;

        spin_lock_irq(&dev->power.lock);

        if (dev->power.wakeup) {
                /* Only one wakeup source may be attached to a device. */
                ret = -EEXIST;
        } else {
                dev->power.wakeup = ws;
                /* Hand over a wakeirq that was set up before the source. */
                if (dev->power.wakeirq)
                        device_wakeup_attach_irq(dev, dev->power.wakeirq);
        }

        spin_unlock_irq(&dev->power.lock);
        return ret;
}

/**
 * device_wakeup_enable - Enable given device to be a wakeup source.
 * @dev: Device to handle.
 *
 * Create a wakeup source object, register it and attach it to @dev.
 */
int device_wakeup_enable(struct device *dev)
{
        struct wakeup_source *src;
        int err;

        if (!(dev && dev->power.can_wakeup))
                return -EINVAL;

        if (pm_suspend_target_state != PM_SUSPEND_ON)
                dev_dbg(dev, "Suspicious %s() during system transition!\n", __func__);

        src = wakeup_source_register(dev, dev_name(dev));
        if (!src)
                return -ENOMEM;

        err = device_wakeup_attach(dev, src);
        if (!err)
                return 0;

        /* Attaching failed, so the freshly registered source is not needed. */
        wakeup_source_unregister(src);
        return err;
}
EXPORT_SYMBOL_GPL(device_wakeup_enable);

/**
 * device_wakeup_attach_irq - Attach a wakeirq to a wakeup source
 * @dev: Device to handle
 * @wakeirq: Device specific wakeirq entry
 *
 * Attach a device wakeirq to the wakeup source so the device
 * wake IRQ can be configured automatically for suspend and
 * resume.
 *
 * Call under the device's power.lock lock.
 */
void device_wakeup_attach_irq(struct device *dev,
                             struct wake_irq *wakeirq)
{
        struct wakeup_source *src = dev->power.wakeup;

        /* Nothing to attach to if the device has no wakeup source. */
        if (!src)
                return;

        /* An existing wakeirq is overridden, but that is reported. */
        if (src->wakeirq)
                dev_err(dev, "Leftover wakeup IRQ found, overriding\n");

        src->wakeirq = wakeirq;
}

/**
 * device_wakeup_detach_irq - Detach a wakeirq from a wakeup source
 * @dev: Device to handle
 *
 * Removes a device wakeirq from the wakeup source.
 *
 * Call under the device's power.lock lock.
 */
void device_wakeup_detach_irq(struct device *dev)
{
        struct wakeup_source *src = dev->power.wakeup;

        if (!src)
                return;

        src->wakeirq = NULL;
}

/**
 * device_wakeup_arm_wake_irqs - Arm the wake IRQs of all wakeup sources.
 *
 * Walks the list of wakeup sources under the wakeup_srcu read lock and calls
 * dev_pm_arm_wake_irq() on the wakeirq of each of them.
 */
void device_wakeup_arm_wake_irqs(void)
{
        struct wakeup_source *ws;
        int srcuidx;

        srcuidx = srcu_read_lock(&wakeup_srcu);
        list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry)
                dev_pm_arm_wake_irq(ws->wakeirq);
        srcu_read_unlock(&wakeup_srcu, srcuidx);
}

/**
 * device_wakeup_disarm_wake_irqs - Disarm the wake IRQs of all wakeup sources.
 *
 * Walks the list of wakeup sources under the wakeup_srcu read lock and calls
 * dev_pm_disarm_wake_irq() on the wakeirq of each of them.
 */
void device_wakeup_disarm_wake_irqs(void)
{
        struct wakeup_source *ws;
        int srcuidx;

        srcuidx = srcu_read_lock(&wakeup_srcu);
        list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry)
                dev_pm_disarm_wake_irq(ws->wakeirq);
        srcu_read_unlock(&wakeup_srcu, srcuidx);
}

/**
 * device_wakeup_detach - Detach a device's wakeup source object from it.
 * @dev: Device to detach the wakeup source object from.
 *
 * After it returns, @dev will not be treated as a wakeup device any more.
 */
static struct wakeup_source *device_wakeup_detach(struct device *dev)
{
        struct wakeup_source *detached;

        /* Swap the pointer out under power.lock and hand it to the caller. */
        spin_lock_irq(&dev->power.lock);
        detached = dev->power.wakeup;
        dev->power.wakeup = NULL;
        spin_unlock_irq(&dev->power.lock);

        return detached;
}

/**
 * device_wakeup_disable - Do not regard a device as a wakeup source any more.
 * @dev: Device to handle.
 *
 * Detach the @dev's wakeup source object from it, unregister this wakeup source
 * object and destroy it.
 */
void device_wakeup_disable(struct device *dev)
{
        if (!(dev && dev->power.can_wakeup))
                return;

        /* wakeup_source_unregister() copes with a NULL (never attached) source. */
        wakeup_source_unregister(device_wakeup_detach(dev));
}
EXPORT_SYMBOL_GPL(device_wakeup_disable);

/**
 * device_set_wakeup_capable - Set/reset device wakeup capability flag.
 * @dev: Device to handle.
 * @capable: Whether or not @dev is capable of waking up the system from sleep.
 *
 * If @capable is set, set the @dev's power.can_wakeup flag and add its
 * wakeup-related attributes to sysfs.  Otherwise, unset the @dev's
 * power.can_wakeup flag and remove its wakeup-related attributes from sysfs.
 *
 * This function may sleep and it can't be called from any context where
 * sleeping is not allowed.
 */
void device_set_wakeup_capable(struct device *dev, bool capable)
{
        /* Nothing to do if the flag already has the requested value. */
        if (!!dev->power.can_wakeup == !!capable)
                return;

        dev->power.can_wakeup = capable;

        /* sysfs is only touched for registered devices on the power list. */
        if (!device_is_registered(dev) || list_empty(&dev->power.entry))
                return;

        if (!capable) {
                wakeup_sysfs_remove(dev);
                return;
        }

        if (wakeup_sysfs_add(dev))
                dev_info(dev, "Wakeup sysfs attributes not added\n");
}
EXPORT_SYMBOL_GPL(device_set_wakeup_capable);

/**
 * device_set_wakeup_enable - Enable or disable a device to wake up the system.
 * @dev: Device to handle.
 * @enable: enable/disable flag
 */
int device_set_wakeup_enable(struct device *dev, bool enable)
{
        /* Disabling cannot fail; enabling reports device_wakeup_enable()'s result. */
        if (!enable) {
                device_wakeup_disable(dev);
                return 0;
        }

        return device_wakeup_enable(dev);
}
EXPORT_SYMBOL_GPL(device_set_wakeup_enable);

/**
 * wakeup_source_not_registered - validate the given wakeup source.
 * @ws: Wakeup source to be validated.
 */
static bool wakeup_source_not_registered(struct wakeup_source *ws)
{
        /*
         * wakeup_source_add() installs pm_wakeup_timer_fn as the timer
         * callback and wakeup_source_remove() clears it again, so any other
         * value means the source is not currently registered.
         */
        return !(ws->timer.function == pm_wakeup_timer_fn);
}

/*
 * The functions below use the observation that each wakeup event starts a
 * period in which the system should not be suspended.  The moment this period
 * will end depends on how the wakeup event is going to be processed after being
 * detected and all of the possible cases can be divided into two distinct
 * groups.
 *
 * First, a wakeup event may be detected by the same functional unit that will
 * carry out the entire processing of it and possibly will pass it to user space
 * for further processing.  In that case the functional unit that has detected
 * the event may later "close" the "no suspend" period associated with it
 * directly as soon as it has been dealt with.  The pair of pm_stay_awake() and
 * pm_relax(), balanced with each other, is supposed to be used in such
 * situations.
 *
 * Second, a wakeup event may be detected by one functional unit and processed
 * by another one.  In that case the unit that has detected it cannot really
 * "close" the "no suspend" period associated with it, unless it knows in
 * advance what's going to happen to the event during processing.  This
 * knowledge, however, may not be available to it, so it can simply specify time
 * to wait before the system can be suspended and pass it as the second
 * argument of pm_wakeup_event().
 *
 * It is valid to call pm_relax() after pm_wakeup_event(), in which case the
 * "no suspend" period will be ended either by the pm_relax(), or by the timer
 * function executed when the timer expires, whichever comes first.
 */

/**
 * wakeup_source_activate - Mark given wakeup source as active.
 * @ws: Wakeup source to handle.
 *
 * Update the @ws' statistics and, if @ws has just been activated, notify the PM
 * core of the event by incrementing the counter of the wakeup events being
 * processed.
 */
static void wakeup_source_activate(struct wakeup_source *ws)
{
        unsigned int cec;

        /* Refuse to activate a source that is not (or no longer) registered. */
        if (WARN_ONCE(wakeup_source_not_registered(ws),
                        "unregistered wakeup source\n"))
                return;

        ws->active = true;
        ws->active_count++;
        /* Start of this active period; used to compute its duration later. */
        ws->last_time = ktime_get();
        if (ws->autosleep_enabled)
                ws->start_prevent_time = ws->last_time;

        /* Increment the counter of events in progress. */
        cec = atomic_inc_return(&combined_event_count);

        trace_wakeup_source_activate(ws->name, cec);
}

/**
 * wakeup_source_report_event - Report wakeup event using the given source.
 * @ws: Wakeup source to report the event for.
 * @hard: If set, abort suspends in progress and wake up from suspend-to-idle.
 */
static void wakeup_source_report_event(struct wakeup_source *ws, bool hard)
{
        ws->event_count++;
        /* This is racy, but the counter is approximate anyway. */
        if (events_check_enabled)
                ws->wakeup_count++;

        /* An already active source is left as is; only the counters change. */
        if (!ws->active)
                wakeup_source_activate(ws);

        if (hard)
                pm_system_wakeup();
}

/**
 * __pm_stay_awake - Notify the PM core of a wakeup event.
 * @ws: Wakeup source object associated with the source of the event.
 *
 * It is safe to call this function from interrupt context.
 */
void __pm_stay_awake(struct wakeup_source *ws)
{
        unsigned long flags;

        if (!ws)
                return;

        spin_lock_irqsave(&ws->lock, flags);

        wakeup_source_report_event(ws, false);
        /*
         * Cancel any pending timeout: with timer_expires cleared,
         * pm_wakeup_timer_fn() will not deactivate this source.
         */
        del_timer(&ws->timer);
        ws->timer_expires = 0;

        spin_unlock_irqrestore(&ws->lock, flags);
}
EXPORT_SYMBOL_GPL(__pm_stay_awake);

/**
 * pm_stay_awake - Notify the PM core that a wakeup event is being processed.
 * @dev: Device the wakeup event is related to.
 *
 * Notify the PM core of a wakeup event (signaled by @dev) by calling
 * __pm_stay_awake for the @dev's wakeup source object.
 *
 * Call this function after detecting of a wakeup event if pm_relax() is going
 * to be called directly after processing the event (and possibly passing it to
 * user space for further processing).
 */
void pm_stay_awake(struct device *dev)
{
        unsigned long flags;

        if (!dev)
                return;

        /*
         * dev->power.wakeup is read under power.lock, the same lock under
         * which it is attached and detached; __pm_stay_awake() ignores NULL.
         */
        spin_lock_irqsave(&dev->power.lock, flags);
        __pm_stay_awake(dev->power.wakeup);
        spin_unlock_irqrestore(&dev->power.lock, flags);
}
EXPORT_SYMBOL_GPL(pm_stay_awake);

#ifdef CONFIG_PM_AUTOSLEEP
/* Add the time elapsed since start_prevent_time to prevent_sleep_time. */
static void update_prevent_sleep_time(struct wakeup_source *ws, ktime_t now)
{
        ws->prevent_sleep_time = ktime_add(ws->prevent_sleep_time,
                                           ktime_sub(now, ws->start_prevent_time));
}
#else
/* Without CONFIG_PM_AUTOSLEEP there is nothing to account for. */
static inline void update_prevent_sleep_time(struct wakeup_source *ws,
                                             ktime_t now)
{
}
#endif

/**
 * wakeup_source_deactivate - Mark given wakeup source as inactive.
 * @ws: Wakeup source to handle.
 *
 * Update the @ws' statistics and notify the PM core that the wakeup source has
 * become inactive by decrementing the counter of wakeup events being processed
 * and incrementing the counter of registered wakeup events.
 */
static void wakeup_source_deactivate(struct wakeup_source *ws)
{
        unsigned int cnt, inpr, cec;
        ktime_t duration;
        ktime_t now;

        ws->relax_count++;
        /*
         * __pm_relax() may be called directly or from a timer function.
         * If it is called directly right after the timer function has been
         * started, but before the timer function calls __pm_relax(), it is
         * possible that __pm_stay_awake() will be called in the meantime and
         * will set ws->active.  Then, ws->active may be cleared immediately
         * by the __pm_relax() called from the timer function, but in such a
         * case ws->relax_count will be different from ws->active_count.
         */
        if (ws->relax_count != ws->active_count) {
                /* Undo the increment above; this call does not end a period. */
                ws->relax_count--;
                return;
        }

        ws->active = false;

        /* Account the active period that started at ws->last_time. */
        now = ktime_get();
        duration = ktime_sub(now, ws->last_time);
        ws->total_time = ktime_add(ws->total_time, duration);
        if (ktime_to_ns(duration) > ktime_to_ns(ws->max_time))
                ws->max_time = duration;

        ws->last_time = now;
        del_timer(&ws->timer);
        ws->timer_expires = 0;

        if (ws->autosleep_enabled)
                update_prevent_sleep_time(ws, now);

        /*
         * Increment the counter of registered wakeup events and decrement the
         * counter of wakeup events in progress simultaneously.
         *
         * Adding MAX_IN_PROGRESS (2^IN_PROGRESS_BITS - 1) is the same as
         * adding one to the upper half and subtracting one from the lower
         * half of the combined counter.
         */
        cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count);
        trace_wakeup_source_deactivate(ws->name, cec);

        /* Wake up waiters once no wakeup events are in progress any more. */
        split_counters(&cnt, &inpr);
        if (!inpr && waitqueue_active(&wakeup_count_wait_queue))
                wake_up(&wakeup_count_wait_queue);
}

/**
 * __pm_relax - Notify the PM core that processing of a wakeup event has ended.
 * @ws: Wakeup source object associated with the source of the event.
 *
 * Call this function for wakeup events whose processing started with calling
 * __pm_stay_awake().
 *
 * It is safe to call it from interrupt context.
 */
void __pm_relax(struct wakeup_source *ws)
{
        unsigned long flags;

        if (!ws)
                return;

        spin_lock_irqsave(&ws->lock, flags);
        /* Relaxing a source that is not active is a no-op. */
        if (ws->active)
                wakeup_source_deactivate(ws);
        spin_unlock_irqrestore(&ws->lock, flags);
}
EXPORT_SYMBOL_GPL(__pm_relax);

/**
 * pm_relax - Notify the PM core that processing of a wakeup event has ended.
 * @dev: Device that signaled the event.
 *
 * Execute __pm_relax() for the @dev's wakeup source object.
 */
void pm_relax(struct device *dev)
{
        unsigned long flags;

        if (!dev)
                return;

        /*
         * dev->power.wakeup is read under power.lock, the same lock under
         * which it is attached and detached; __pm_relax() ignores NULL.
         */
        spin_lock_irqsave(&dev->power.lock, flags);
        __pm_relax(dev->power.wakeup);
        spin_unlock_irqrestore(&dev->power.lock, flags);
}
EXPORT_SYMBOL_GPL(pm_relax);

/**
 * pm_wakeup_timer_fn - Delayed finalization of a wakeup event.
 * @t: Timer embedded in the wakeup source to finalize.
 *
 * Call wakeup_source_deactivate() for the wakeup source containing @t if it
 * is currently active, its timer has not been canceled and the expiration
 * time of the timer is not in the future.
 */
static void pm_wakeup_timer_fn(struct timer_list *t)
{
        struct wakeup_source *ws = from_timer(ws, t, timer);
        unsigned long flags;

        spin_lock_irqsave(&ws->lock, flags);

        /*
         * timer_expires == 0 means the timeout was canceled, and a value
         * still in the future means a later deadline has been set since.
         */
        if (ws->active && ws->timer_expires
            && time_after_eq(jiffies, ws->timer_expires)) {
                wakeup_source_deactivate(ws);
                ws->expire_count++;
        }

        spin_unlock_irqrestore(&ws->lock, flags);
}

/**
 * pm_wakeup_ws_event - Report a wakeup event to the PM core.
 * @ws: Wakeup source object the event belongs to (NULL is ignored).
 * @msec: Expected processing time of the event, in milliseconds.
 * @hard: If set, abort suspends in progress and wake up from suspend-to-idle.
 *
 * The event is reported for @ws via wakeup_source_report_event().  With a
 * zero @msec the source is deactivated again right away.  Otherwise the
 * @ws timer is (re)armed so that pm_wakeup_timer_fn() runs once the
 * anticipated processing time has passed; an already pending later deadline
 * is never shortened.
 *
 * It is safe to call this function from interrupt context.
 */
void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard)
{
        unsigned long irqflags;

        if (!ws)
                return;

        spin_lock_irqsave(&ws->lock, irqflags);

        wakeup_source_report_event(ws, hard);

        if (msec) {
                unsigned long deadline = jiffies + msecs_to_jiffies(msec);

                /* Zero in timer_expires means "nothing recorded"; avoid it. */
                if (deadline == 0)
                        deadline = 1;

                if (!ws->timer_expires ||
                    time_after(deadline, ws->timer_expires)) {
                        mod_timer(&ws->timer, deadline);
                        ws->timer_expires = deadline;
                }
        } else {
                wakeup_source_deactivate(ws);
        }

        spin_unlock_irqrestore(&ws->lock, irqflags);
}
EXPORT_SYMBOL_GPL(pm_wakeup_ws_event);

/**
 * pm_wakeup_dev_event - Report a wakeup event for a device to the PM core.
 * @dev: Device the wakeup event is related to (NULL is ignored).
 * @msec: Expected processing time of the event, in milliseconds.
 * @hard: If set, abort suspends in progress and wake up from suspend-to-idle.
 *
 * Forward the event to pm_wakeup_ws_event() for the wakeup source attached
 * to @dev, with @dev's power lock held across the call.
 */
void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard)
{
        unsigned long irqflags;

        if (dev) {
                spin_lock_irqsave(&dev->power.lock, irqflags);
                pm_wakeup_ws_event(dev->power.wakeup, msec, hard);
                spin_unlock_irqrestore(&dev->power.lock, irqflags);
        }
}
EXPORT_SYMBOL_GPL(pm_wakeup_dev_event);

void pm_print_active_wakeup_sources(void)
{
        struct wakeup_source *ws;
        int srcuidx, active = 0;
        struct wakeup_source *last_activity_ws = NULL;

        srcuidx = srcu_read_lock(&wakeup_srcu);
        list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) {
                if (ws->active) {
                        pm_pr_dbg("active wakeup source: %s\n", ws->name);
                        active = 1;
                } else if (!active &&
                           (!last_activity_ws ||
                            ktime_to_ns(ws->last_time) >
                            ktime_to_ns(last_activity_ws->last_time))) {
                        last_activity_ws = ws;
                }
        }

        if (!active && last_activity_ws)
                pm_pr_dbg("last active wakeup source: %s\n",
                        last_activity_ws->name);
        srcu_read_unlock(&wakeup_srcu, srcuidx);
}
EXPORT_SYMBOL_GPL(pm_print_active_wakeup_sources);

/**
 * pm_wakeup_pending - Check if power transition in progress should be aborted.
 *
 * Compare the current number of registered wakeup events with its preserved
 * value from the past and return true if new wakeup events have been registered
 * since the old value was stored.  Also return true if the current number of
 * wakeup events being processed is different from zero.
 */
bool pm_wakeup_pending(void)
{
        unsigned long flags;
        bool ret = false;

        raw_spin_lock_irqsave(&events_lock, flags);
        if (events_check_enabled) {
                unsigned int cnt, inpr;

                split_counters(&cnt, &inpr);
                ret = (cnt != saved_count || inpr > 0);
                events_check_enabled = !ret;
        }
        raw_spin_unlock_irqrestore(&events_lock, flags);

        if (ret) {
                pm_pr_dbg("Wakeup pending, aborting suspend\n");
                pm_print_active_wakeup_sources();
        }

        return ret || atomic_read(&pm_abort_suspend) > 0;
}
EXPORT_SYMBOL_GPL(pm_wakeup_pending);

/*
 * Request a system wakeup: increment pm_abort_suspend (which makes
 * pm_wakeup_pending() return true while it stays positive) and then call
 * s2idle_wake().
 */
void pm_system_wakeup(void)
{
        atomic_inc(&pm_abort_suspend);
        s2idle_wake();
}
EXPORT_SYMBOL_GPL(pm_system_wakeup);

/*
 * Undo one pm_system_wakeup() request by decrementing pm_abort_suspend,
 * but only if the counter is currently positive.
 */
void pm_system_cancel_wakeup(void)
{
        atomic_dec_if_positive(&pm_abort_suspend);
}

/*
 * Clear recorded wakeup IRQ state.
 *
 * If @irq_number is nonzero and matches the first recorded wakeup IRQ, the
 * second recorded IRQ is moved into the first slot; in every other case the
 * first slot is zeroed.  The second slot is always zeroed.  A zero
 * @irq_number additionally resets pm_abort_suspend to 0.
 */
void pm_wakeup_clear(unsigned int irq_number)
{
        raw_spin_lock_irq(&wakeup_irq_lock);

        wakeup_irq[0] = (irq_number && wakeup_irq[0] == irq_number) ?
                        wakeup_irq[1] : 0;
        wakeup_irq[1] = 0;

        raw_spin_unlock_irq(&wakeup_irq_lock);

        if (irq_number == 0)
                atomic_set(&pm_abort_suspend, 0);
}

/*
 * Record @irq_number as a wakeup IRQ and trigger a system wakeup.
 *
 * The IRQ is stored in the first free slot of wakeup_irq[] under
 * wakeup_irq_lock.  If both slots are already occupied the IRQ is dropped:
 * the local @irq_number is zeroed (so the debug message reports IRQ 0) and
 * pm_system_wakeup() is not called.
 */
void pm_system_irq_wakeup(unsigned int irq_number)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&wakeup_irq_lock, flags);

        if (wakeup_irq[0] == 0)
                wakeup_irq[0] = irq_number;
        else if (wakeup_irq[1] == 0)
                wakeup_irq[1] = irq_number;
        else
                irq_number = 0;

        /* irq_number is unsigned int, so it is printed with %u, not %d. */
        pm_pr_dbg("Triggering wakeup from IRQ %u\n", irq_number);

        raw_spin_unlock_irqrestore(&wakeup_irq_lock, flags);

        if (irq_number)
                pm_system_wakeup();
}

/*
 * Return the first recorded wakeup IRQ (wakeup_irq[0]); the value is read
 * without taking wakeup_irq_lock.  The slot is zero when pm_wakeup_clear()
 * has reset it.
 */
unsigned int pm_wakeup_irq(void)
{
        return wakeup_irq[0];
}

/**
 * pm_get_wakeup_count - Read the number of registered wakeup events.
 * @count: Address to store the value at.
 * @block: Whether or not to block.
 *
 * Store the number of registered wakeup events at the address in @count.  If
 * @block is set, block until the current number of wakeup events being
 * processed is zero.  The wait is interruptible: a pending signal ends it
 * early, in which case events may still be in progress on return.
 *
 * Return 'false' if the current number of wakeup events being processed is
 * nonzero.  Otherwise return 'true'.
 */
bool pm_get_wakeup_count(unsigned int *count, bool block)
{
        unsigned int cnt, inpr;

        if (block) {
                DEFINE_WAIT(wait);

                for (;;) {
                        /* Queue on the wait queue before sampling the counters. */
                        prepare_to_wait(&wakeup_count_wait_queue, &wait,
                                        TASK_INTERRUPTIBLE);
                        split_counters(&cnt, &inpr);
                        if (inpr == 0 || signal_pending(current))
                                break;
                        /* Log what is keeping us waiting, then sleep. */
                        pm_print_active_wakeup_sources();
                        schedule();
                }
                finish_wait(&wakeup_count_wait_queue, &wait);
        }

        /* Re-sample so the reported values are current in both modes. */
        split_counters(&cnt, &inpr);
        *count = cnt;
        return !inpr;
}

/**
 * pm_save_wakeup_count - Save the current number of registered wakeup events.
 * @count: Value to compare with the current number of registered wakeup events.
 *
 * Wakeup event detection is switched off first.  Then, if @count still
 * matches the current number of registered wakeup events and no wakeup event
 * is being processed, @count is stored as the reference value later checked
 * by pm_wakeup_pending() and detection is switched back on.
 *
 * Return the resulting state of wakeup event detection: 'true' if it has
 * been enabled, 'false' otherwise.
 */
bool pm_save_wakeup_count(unsigned int count)
{
        unsigned int registered, in_progress;
        unsigned long irqflags;

        events_check_enabled = false;
        raw_spin_lock_irqsave(&events_lock, irqflags);
        split_counters(&registered, &in_progress);
        if (in_progress == 0 && registered == count) {
                saved_count = count;
                events_check_enabled = true;
        }
        raw_spin_unlock_irqrestore(&events_lock, irqflags);
        return events_check_enabled;
}

#ifdef CONFIG_PM_AUTOSLEEP
/**
 * pm_wakep_autosleep_enabled - Modify autosleep_enabled for all wakeup sources.
 * @set: Whether to set or to clear the autosleep_enabled flags.
 *
 * Walk all registered wakeup sources and update the flag of each one whose
 * value differs from @set.  For a source that is active at that moment,
 * either start a new prevent-sleep interval at the current time (when
 * setting) or call update_prevent_sleep_time() for it (when clearing).
 */
void pm_wakep_autosleep_enabled(bool set)
{
        ktime_t now = ktime_get();
        struct wakeup_source *ws;
        int idx;

        idx = srcu_read_lock(&wakeup_srcu);
        list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) {
                spin_lock_irq(&ws->lock);
                if (ws->autosleep_enabled != set) {
                        ws->autosleep_enabled = set;
                        if (ws->active && set)
                                ws->start_prevent_time = now;
                        else if (ws->active)
                                update_prevent_sleep_time(ws, now);
                }
                spin_unlock_irq(&ws->lock);
        }
        srcu_read_unlock(&wakeup_srcu, idx);
}
#endif /* CONFIG_PM_AUTOSLEEP */

/**
 * print_wakeup_source_stats - Print wakeup source statistics information.
 * @m: seq_file to print the statistics into.
 * @ws: Wakeup source object to print the statistics for.
 *
 * A snapshot of the counters is taken under @ws' lock.  For a source that is
 * active at that moment, the running activation is folded into the reported
 * active/total/max times and, with autosleep enabled, into the
 * prevent-sleep time.  All times are printed in milliseconds.
 *
 * Always returns 0.
 */
static int print_wakeup_source_stats(struct seq_file *m,
                                     struct wakeup_source *ws)
{
        ktime_t active_for = 0;
        ktime_t total, longest, blocked;
        unsigned long activations;
        unsigned long irqflags;

        spin_lock_irqsave(&ws->lock, irqflags);

        total = ws->total_time;
        longest = ws->max_time;
        blocked = ws->prevent_sleep_time;
        activations = ws->active_count;

        if (ws->active) {
                ktime_t now = ktime_get();

                /* Account for the activation that is still in progress. */
                active_for = ktime_sub(now, ws->last_time);
                total = ktime_add(total, active_for);
                if (active_for > longest)
                        longest = active_for;

                if (ws->autosleep_enabled)
                        blocked = ktime_add(blocked,
                                ktime_sub(now, ws->start_prevent_time));
        }

        seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
                   ws->name, activations, ws->event_count,
                   ws->wakeup_count, ws->expire_count,
                   ktime_to_ms(active_for), ktime_to_ms(total),
                   ktime_to_ms(longest), ktime_to_ms(ws->last_time),
                   ktime_to_ms(blocked));

        spin_unlock_irqrestore(&ws->lock, irqflags);

        return 0;
}

/*
 * seq_file ->start: print the column header when starting at position 0,
 * take the wakeup_srcu read lock (its index is stored in the int that
 * m->private points to) and return the wakeup source at position *pos, or
 * NULL if the list has fewer entries.  The SRCU read lock stays held on
 * return; wakeup_sources_stats_seq_stop() drops it.
 */
static void *wakeup_sources_stats_seq_start(struct seq_file *m,
                                        loff_t *pos)
{
        int *srcu_idx = m->private;
        struct wakeup_source *ws;
        loff_t skip = *pos;

        if (skip == 0) {
                seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
                        "expire_count\tactive_since\ttotal_time\tmax_time\t"
                        "last_change\tprevent_suspend_time\n");
        }

        *srcu_idx = srcu_read_lock(&wakeup_srcu);
        list_for_each_entry_rcu_locked(ws, &wakeup_sources, entry) {
                if (skip <= 0)
                        return ws;
                skip--;
        }

        return NULL;
}

/*
 * seq_file ->next: advance *pos and return the wakeup source following @v.
 * When @v was the last list entry, print the statistics of deleted_ws as a
 * final line and return NULL to end the iteration.
 */
static void *wakeup_sources_stats_seq_next(struct seq_file *m,
                                        void *v, loff_t *pos)
{
        struct wakeup_source *ws = v;

        ++(*pos);

        /* The first entry after @v, if any, is the one we want. */
        list_for_each_entry_continue_rcu(ws, &wakeup_sources, entry) {
                return ws;
        }

        print_wakeup_source_stats(m, &deleted_ws);

        return NULL;
}

/*
 * seq_file ->stop: release the wakeup_srcu read lock using the index that
 * wakeup_sources_stats_seq_start() saved in the int m->private points to.
 */
static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v)
{
        srcu_read_unlock(&wakeup_srcu, *(int *)m->private);
}

/**
 * wakeup_sources_stats_seq_show - Print wakeup sources statistics information.
 * @m: seq_file to print the statistics into.
 * @v: wakeup_source of the current iteration step
 *
 * Print one statistics line for @v.  Always returns 0.
 */
static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v)
{
        print_wakeup_source_stats(m, v);

        return 0;
}

/* seq_file iterator callbacks used to walk the wakeup_sources list. */
static const struct seq_operations wakeup_sources_stats_seq_ops = {
        .start = wakeup_sources_stats_seq_start,
        .next  = wakeup_sources_stats_seq_next,
        .stop  = wakeup_sources_stats_seq_stop,
        .show  = wakeup_sources_stats_seq_show,
};

/*
 * Open handler: set up a seq_file using wakeup_sources_stats_seq_ops with
 * sizeof(int) bytes of private data, which the start/stop callbacks use to
 * keep the SRCU read-lock index.
 */
static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
{
        return seq_open_private(file, &wakeup_sources_stats_seq_ops, sizeof(int));
}

/*
 * File operations for the wakeup sources statistics file; release uses
 * seq_release_private to match the seq_open_private() call in ->open.
 */
static const struct file_operations wakeup_sources_stats_fops = {
        .owner = THIS_MODULE,
        .open = wakeup_sources_stats_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = seq_release_private,
};

/*
 * Create the "wakeup_sources" debugfs file (mode 0444, NULL parent, no
 * private data) backed by wakeup_sources_stats_fops.  Registered as a
 * postcore initcall; always returns 0.
 */
static int __init wakeup_sources_debugfs_init(void)
{
        debugfs_create_file("wakeup_sources", 0444, NULL, NULL,
                            &wakeup_sources_stats_fops);
        return 0;
}

postcore_initcall(wakeup_sources_debugfs_init);






























































   18 




















   18 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef IOPRIO_H
#define IOPRIO_H

#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/iocontext.h>

#include <uapi/linux/ioprio.h>

/*
 * Default IO priority.
 */
#define IOPRIO_DEFAULT        IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0)

/*
 * Check that a priority value has a valid class.
 */
static inline bool ioprio_valid(unsigned short ioprio)
{
        unsigned short class = IOPRIO_PRIO_CLASS(ioprio);

        return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE;
}

/*
 * Derive an I/O priority level from the CPU scheduler nice value of @task,
 * computed as (nice + 20) / 5.  Used when the task has not set an I/O
 * priority explicitly.
 */
static inline int task_nice_ioprio(struct task_struct *task)
{
        int nice = task_nice(task);

        return (nice + 20) / 5;
}

/*
 * Pick an I/O class for a task that has not asked for a specific one:
 * SCHED_IDLE tasks get the idle class, realtime tasks the RT class and
 * everything else the best-effort class.
 */
static inline int task_nice_ioclass(struct task_struct *task)
{
        if (task->policy == SCHED_IDLE)
                return IOPRIO_CLASS_IDLE;

        if (task_is_realtime(task))
                return IOPRIO_CLASS_RT;

        return IOPRIO_CLASS_BE;
}

#ifdef CONFIG_BLOCK
/*
 * Return the effective I/O priority of @p.
 *
 * A task without an io_context gets IOPRIO_DEFAULT.  Otherwise the priority
 * stored in the io_context is used, unless its class is IOPRIO_CLASS_NONE,
 * in which case a value is derived from the task's scheduling policy and
 * nice level.
 *
 * Expected to be called for current task or with task_lock() held to keep
 * io_context stable.
 */
static inline int __get_task_ioprio(struct task_struct *p)
{
        struct io_context *ioc = p->io_context;
        int prio;

        if (!ioc)
                return IOPRIO_DEFAULT;

        if (p != current)
                lockdep_assert_held(&p->alloc_lock);

        prio = ioc->ioprio;
        if (IOPRIO_PRIO_CLASS(prio) != IOPRIO_CLASS_NONE)
                return prio;

        return IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
                                 task_nice_ioprio(p));
}
#else
/* Without CONFIG_BLOCK every task reports the default I/O priority. */
static inline int __get_task_ioprio(struct task_struct *p)
{
        return IOPRIO_DEFAULT;
}
#endif /* CONFIG_BLOCK */

/* Return the effective I/O priority of the current task. */
static inline int get_current_ioprio(void)
{
        return __get_task_ioprio(current);
}

extern int set_task_ioprio(struct task_struct *task, int ioprio);

#ifdef CONFIG_BLOCK
extern int ioprio_check_cap(int ioprio);
#else
/* Stub used without CONFIG_BLOCK: always fails with -ENOTBLK. */
static inline int ioprio_check_cap(int ioprio)
{
        return -ENOTBLK;
}
#endif /* CONFIG_BLOCK */

#endif





























    1 














    1 

























































































































































































































































































































































    1 











    1 






    1 






































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Red Hat.  All rights reserved.
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/rwsem.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/posix_acl_xattr.h>
#include <linux/iversion.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "xattr.h"
#include "disk-io.h"
#include "props.h"
#include "locking.h"
#include "accessors.h"
#include "dir-item.h"

/*
 * Read the value of the xattr @name of @inode.
 *
 * With @size == 0 nothing is copied and the length of the stored value is
 * returned, so callers can size their buffer.  Otherwise the value is copied
 * into @buffer and its length is returned.
 *
 * Returns -ENOMEM if no path could be allocated, -ENODATA if the xattr does
 * not exist, -ERANGE if the value does not fit into @size bytes, or the
 * error reported by the lookup.
 */
int btrfs_getxattr(struct inode *inode, const char *name,
                                void *buffer, size_t size)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_buffer *leaf;
        struct btrfs_dir_item *di;
        struct btrfs_path *path;
        unsigned long data_ptr;
        u16 data_len;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Find the dir item that carries this xattr name. */
        di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)),
                        name, strlen(name), 0);
        if (IS_ERR(di)) {
                ret = PTR_ERR(di);
                goto out;
        }
        if (!di) {
                ret = -ENODATA;
                goto out;
        }

        leaf = path->nodes[0];
        data_len = btrfs_dir_data_len(leaf, di);

        /* A zero size is a query for the length of the value only. */
        if (size == 0) {
                ret = data_len;
                goto out;
        }

        if (data_len > size) {
                ret = -ERANGE;
                goto out;
        }

        /*
         * Layout inside the leaf item:
         * |struct btrfs_dir_item|name|data|
         * The name is the full xattr name (e.g. security.foo) and the data
         * is the xattr value, so the value starts right after the name that
         * follows the dir item header.
         */
        data_ptr = (unsigned long)((char *)(di + 1) +
                                   btrfs_dir_name_len(leaf, di));
        read_extent_buffer(leaf, buffer, data_ptr, data_len);
        ret = data_len;

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Create, replace or delete the xattr @name of @inode inside the running
 * transaction @trans.
 *
 * @value == NULL requests deletion.  @flags may contain XATTR_CREATE (fail
 * with -EEXIST if the xattr already exists) or XATTR_REPLACE (fail with
 * -ENODATA if it does not exist).
 *
 * Returns 0 on success, -ENOSPC if name plus value exceed the maximum xattr
 * size or the leaf cannot hold the new value, -ENOMEM if no path could be
 * allocated, or another negative error from the tree operations.  On
 * success BTRFS_INODE_COPY_EVERYTHING is set and BTRFS_INODE_NO_XATTRS is
 * cleared in the inode's runtime flags.
 */
int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
                   const char *name, const void *value, size_t size, int flags)
{
        struct btrfs_dir_item *di = NULL;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        size_t name_len = strlen(name);
        int ret = 0;

        ASSERT(trans);

        if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info))
                return -ENOSPC;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->skip_release_on_error = 1;

        /* Deletion request: remove the item if present, then we are done. */
        if (!value) {
                di = btrfs_lookup_xattr(trans, root, path,
                                btrfs_ino(BTRFS_I(inode)), name, name_len, -1);
                if (!di && (flags & XATTR_REPLACE))
                        ret = -ENODATA;
                else if (IS_ERR(di))
                        ret = PTR_ERR(di);
                else if (di)
                        ret = btrfs_delete_one_dir_name(trans, root, path, di);
                goto out;
        }

        /*
         * For a replace we can't just do the insert blindly.
         * Do a lookup first (read-only btrfs_search_slot), and return if xattr
         * doesn't exist. If it exists, fall down below to the insert/replace
         * path - we can't race with a concurrent xattr delete, because the VFS
         * locks the inode's i_mutex before calling setxattr or removexattr.
         */
        if (flags & XATTR_REPLACE) {
                ASSERT(inode_is_locked(inode));
                di = btrfs_lookup_xattr(NULL, root, path,
                                btrfs_ino(BTRFS_I(inode)), name, name_len, 0);
                if (!di)
                        ret = -ENODATA;
                else if (IS_ERR(di))
                        ret = PTR_ERR(di);
                if (ret)
                        goto out;
                btrfs_release_path(path);
                di = NULL;
        }

        ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(BTRFS_I(inode)),
                                      name, name_len, value, size);
        if (ret == -EOVERFLOW) {
                /*
                 * We have an existing item in a leaf, split_leaf couldn't
                 * expand it. That item might have or not a dir_item that
                 * matches our target xattr, so lets check.
                 */
                ret = 0;
                btrfs_assert_tree_write_locked(path->nodes[0]);
                di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
                if (!di && !(flags & XATTR_REPLACE)) {
                        ret = -ENOSPC;
                        goto out;
                }
        } else if (ret == -EEXIST) {
                ret = 0;
                di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
                ASSERT(di); /* logic error */
        } else if (ret) {
                goto out;
        }

        /* An existing xattr was found but the caller asked for create-only. */
        if (di && (flags & XATTR_CREATE)) {
                ret = -EEXIST;
                goto out;
        }

        if (di) {
                /*
                 * We're doing a replace, and it must be atomic, that is, at
                 * any point in time we have either the old or the new xattr
                 * value in the tree. We don't want readers (getxattr and
                 * listxattrs) to miss a value, this is specially important
                 * for ACLs.
                 */
                const int slot = path->slots[0];
                struct extent_buffer *leaf = path->nodes[0];
                const u16 old_data_len = btrfs_dir_data_len(leaf, di);
                const u32 item_size = btrfs_item_size(leaf, slot);
                const u32 data_size = sizeof(*di) + name_len + size;
                unsigned long data_ptr;
                char *ptr;

                /* Growing the value needs that much free space in the leaf. */
                if (size > old_data_len) {
                        if (btrfs_leaf_free_space(leaf) <
                            (size - old_data_len)) {
                                ret = -ENOSPC;
                                goto out;
                        }
                }

                if (old_data_len + name_len + sizeof(*di) == item_size) {
                        /* No other xattrs packed in the same leaf item. */
                        if (size > old_data_len)
                                btrfs_extend_item(trans, path, size - old_data_len);
                        else if (size < old_data_len)
                                btrfs_truncate_item(trans, path, data_size, 1);
                } else {
                        /* There are other xattrs packed in the same item. */
                        ret = btrfs_delete_one_dir_name(trans, root, path, di);
                        if (ret)
                                goto out;
                        btrfs_extend_item(trans, path, data_size);
                }

                /* Our dir_item occupies the last data_size bytes of the item. */
                ptr = btrfs_item_ptr(leaf, slot, char);
                ptr += btrfs_item_size(leaf, slot) - data_size;
                di = (struct btrfs_dir_item *)ptr;
                btrfs_set_dir_data_len(leaf, di, size);
                data_ptr = ((unsigned long)(di + 1)) + name_len;
                write_extent_buffer(leaf, value, data_ptr, size);
                btrfs_mark_buffer_dirty(trans, leaf);
        } else {
                /*
                 * Insert, and we had space for the xattr, so path->slots[0] is
                 * where our xattr dir_item is and btrfs_insert_xattr_item()
                 * filled it.
                 */
        }
out:
        btrfs_free_path(path);
        if (!ret) {
                set_bit(BTRFS_INODE_COPY_EVERYTHING,
                        &BTRFS_I(inode)->runtime_flags);
                clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags);
        }
        return ret;
}

/*
 * Set, replace or remove an xattr via btrfs_setxattr() and, on success, bump
 * the inode version, refresh its ctime and update the inode item.  A
 * transaction is started (and ended) here unless the task already has one
 * attached in current->journal_info.
 *
 * @value: "" makes the attribute to empty, NULL removes it
 */
int btrfs_setxattr_trans(struct inode *inode, const char *name,
                         const void *value, size_t size, int flags)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        const bool own_trans = !current->journal_info;
        struct btrfs_trans_handle *trans;
        int ret;

        if (!own_trans) {
                /*
                 * We are already inside a transaction.  This is the case when
                 * smack is enabled and a directory is being created:
                 * d_instantiate_new() calls smack_d_instantiate(), which calls
                 * __vfs_setxattr() to set the transmute xattr
                 * (XATTR_NAME_SMACKTRANSMUTE) on the inode. Space for the
                 * xattr and the inode update was already reserved at
                 * btrfs_mkdir(), so the existing handle is simply reused.
                 * Joining or starting a transaction is avoided because that
                 * would reset the block_rsv of the handle and trigger a
                 * warning for the start case.
                 */
                ASSERT(strncmp(name, XATTR_SECURITY_PREFIX,
                               XATTR_SECURITY_PREFIX_LEN) == 0);
                trans = current->journal_info;
        } else {
                /*
                 * 1 unit for inserting/updating/deleting the xattr
                 * 1 unit for the inode item update
                 */
                trans = btrfs_start_transaction(root, 2);
                if (IS_ERR(trans))
                        return PTR_ERR(trans);
        }

        ret = btrfs_setxattr(trans, inode, name, value, size, flags);
        if (!ret) {
                inode_inc_iversion(inode);
                inode_set_ctime_current(inode);
                ret = btrfs_update_inode(trans, BTRFS_I(inode));
                if (ret)
                        btrfs_abort_transaction(trans, ret);
        }

        /* Only end the transaction if it was started here. */
        if (own_trans)
                btrfs_end_transaction(trans);
        return ret;
}

/*
 * List the names of all xattrs of the inode behind @dentry.
 *
 * Each name is copied into @buffer followed by a NUL byte.  With @size == 0
 * nothing is copied and only the total number of bytes needed is computed.
 *
 * Returns the total size of all names (including their NUL terminators),
 * -ENOMEM if no path could be allocated, -ERANGE if @size is nonzero and
 * @buffer is NULL or too small, or a negative error from the tree iteration.
 */
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
        struct btrfs_key found_key;
        struct btrfs_key key;
        struct inode *inode = d_inode(dentry);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_path *path;
        int iter_ret = 0;
        int ret = 0;
        size_t total_size = 0, size_left = size;

        /*
         * ok we want all objects associated with this id.
         * NOTE: we set key.offset = 0; because we want to start with the
         * first xattr that we find and walk forward
         */
        key.objectid = btrfs_ino(BTRFS_I(inode));
        key.type = BTRFS_XATTR_ITEM_KEY;
        key.offset = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = READA_FORWARD;

        /* search for our xattrs */
        btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
                struct extent_buffer *leaf;
                int slot;
                struct btrfs_dir_item *di;
                u32 item_size;
                u32 cur;

                leaf = path->nodes[0];
                slot = path->slots[0];

                /* check to make sure this item is what we want */
                if (found_key.objectid != key.objectid)
                        break;
                if (found_key.type > BTRFS_XATTR_ITEM_KEY)
                        break;
                if (found_key.type < BTRFS_XATTR_ITEM_KEY)
                        continue;

                /* One item may pack several dir_items; walk all of them. */
                di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
                item_size = btrfs_item_size(leaf, slot);
                cur = 0;
                while (cur < item_size) {
                        u16 name_len = btrfs_dir_name_len(leaf, di);
                        u16 data_len = btrfs_dir_data_len(leaf, di);
                        u32 this_len = sizeof(*di) + name_len + data_len;
                        unsigned long name_ptr = (unsigned long)(di + 1);

                        total_size += name_len + 1;
                        /*
                         * We are just looking for how big our buffer needs to
                         * be.
                         */
                        if (!size)
                                goto next;

                        /* This break only leaves the inner while loop. */
                        if (!buffer || (name_len + 1) > size_left) {
                                iter_ret = -ERANGE;
                                break;
                        }

                        read_extent_buffer(leaf, buffer, name_ptr, name_len);
                        buffer[name_len] = '\0';

                        size_left -= name_len + 1;
                        buffer += name_len + 1;
next:
                        cur += this_len;
                        di = (struct btrfs_dir_item *)((char *)di + this_len);
                }
        }

        if (iter_ret < 0)
                ret = iter_ret;
        else
                ret = total_size;

        btrfs_free_path(path);

        return ret;
}

/*
 * Generic ->get callback shared by several handlers in this file.
 *
 * The name is first passed through xattr_full_name() together with the
 * handler, and the result is what gets looked up with btrfs_getxattr().
 * The return value of btrfs_getxattr() is passed straight back.
 */
static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
                                   struct dentry *unused, struct inode *inode,
                                   const char *name, void *buffer, size_t size)
{
        const char *full_name = xattr_full_name(handler, name);

        return btrfs_getxattr(inode, full_name, buffer, size);
}

/*
 * Generic ->set callback shared by several handlers in this file.
 *
 * Refuses with -EROFS when the inode's root is read-only; otherwise the
 * name is passed through xattr_full_name() and the value is stored with
 * btrfs_setxattr_trans(), whose result is returned unchanged.
 */
static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
                                   struct mnt_idmap *idmap,
                                   struct dentry *unused, struct inode *inode,
                                   const char *name, const void *buffer,
                                   size_t size, int flags)
{
        const char *full_name;

        /* Check the root before doing anything else. */
        if (btrfs_root_readonly(BTRFS_I(inode)->root))
                return -EROFS;

        full_name = xattr_full_name(handler, name);

        return btrfs_setxattr_trans(inode, full_name, buffer, size, flags);
}

/*
 * ->get callback for the security handler.
 *
 * Behaves like a plain btrfs_getxattr() lookup, except that a miss on
 * XATTR_NAME_CAPS is remembered in the inode's runtime flags
 * (BTRFS_INODE_NO_CAP_XATTR) and answered with -ENODATA from then on
 * without a lookup.
 */
static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler,
                                            struct dentry *unused,
                                            struct inode *inode,
                                            const char *name, void *buffer,
                                            size_t size)
{
        const char *full_name = xattr_full_name(handler, name);
        const bool is_cap = !strcmp(full_name, XATTR_NAME_CAPS);
        int ret;

        /*
         * security.capability doesn't cache the results, so we get called
         * constantly to check whether a capability xattr exists.  If an
         * earlier lookup already came back empty, answer from the cached
         * flag instead of searching again.
         */
        if (is_cap &&
            test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags))
                return -ENODATA;

        ret = btrfs_getxattr(inode, full_name, buffer, size);

        /* Remember the miss so the next capability lookup is short-circuited. */
        if (is_cap && ret == -ENODATA)
                set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);

        return ret;
}

/*
 * ->set callback for the security handler.
 *
 * Returns -EROFS for a read-only root.  Otherwise stores the value with
 * btrfs_setxattr_trans() and returns its result.  When the attribute is
 * XATTR_NAME_CAPS the cached BTRFS_INODE_NO_CAP_XATTR flag is dropped
 * before the write is attempted (and stays dropped even if the write
 * fails), so the next get does a real lookup.
 */
static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler,
                                            struct mnt_idmap *idmap,
                                            struct dentry *unused,
                                            struct inode *inode,
                                            const char *name,
                                            const void *buffer,
                                            size_t size, int flags)
{
        const char *full_name;

        if (btrfs_root_readonly(BTRFS_I(inode)->root))
                return -EROFS;

        full_name = xattr_full_name(handler, name);

        if (!strcmp(full_name, XATTR_NAME_CAPS))
                clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);

        return btrfs_setxattr_trans(inode, full_name, buffer, size, flags);
}

/*
 * ->set callback for the btrfs property handler.
 *
 * The value is validated with btrfs_validate_prop() first; a validation
 * error is returned as is.  Properties that btrfs_ignore_prop() reports
 * for this inode are accepted without doing anything (returns 0).
 *
 * Otherwise a transaction is started, the property is set with
 * btrfs_set_prop() and, on success, the inode's iversion and ctime are
 * bumped and the inode item is updated.  A failing inode update aborts
 * the transaction.  The transaction handle is always ended before
 * returning.
 */
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
                                        struct mnt_idmap *idmap,
                                        struct dentry *unused, struct inode *inode,
                                        const char *name, const void *value,
                                        size_t size, int flags)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        const char *full_name = xattr_full_name(handler, name);
        struct btrfs_trans_handle *trans;
        int ret;

        ret = btrfs_validate_prop(BTRFS_I(inode), full_name, value, size);
        if (ret)
                return ret;

        if (btrfs_ignore_prop(BTRFS_I(inode), full_name))
                return 0;

        trans = btrfs_start_transaction(root, 2);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        ret = btrfs_set_prop(trans, inode, full_name, value, size, flags);
        if (ret)
                goto out;

        inode_inc_iversion(inode);
        inode_set_ctime_current(inode);
        ret = btrfs_update_inode(trans, BTRFS_I(inode));
        if (ret)
                btrfs_abort_transaction(trans, ret);

out:
        btrfs_end_transaction(trans);

        return ret;
}

/*
 * Handler for names under XATTR_SECURITY_PREFIX.  Uses the security
 * specific callbacks, which special-case XATTR_NAME_CAPS.
 */
static const struct xattr_handler btrfs_security_xattr_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
        .get = btrfs_xattr_handler_get_security,
        .set = btrfs_xattr_handler_set_security,
};

/* Handler for names under XATTR_TRUSTED_PREFIX; uses the generic get/set. */
static const struct xattr_handler btrfs_trusted_xattr_handler = {
        .prefix = XATTR_TRUSTED_PREFIX,
        .get = btrfs_xattr_handler_get,
        .set = btrfs_xattr_handler_set,
};

/* Handler for names under XATTR_USER_PREFIX; uses the generic get/set. */
static const struct xattr_handler btrfs_user_xattr_handler = {
        .prefix = XATTR_USER_PREFIX,
        .get = btrfs_xattr_handler_get,
        .set = btrfs_xattr_handler_set,
};

/*
 * Handler for names under XATTR_BTRFS_PREFIX.  Reads go through the generic
 * get callback; writes go through the property-aware set callback.
 */
static const struct xattr_handler btrfs_btrfs_xattr_handler = {
        .prefix = XATTR_BTRFS_PREFIX,
        .get = btrfs_xattr_handler_get,
        .set = btrfs_xattr_handler_set_prop,
};

/*
 * NULL-terminated table of the xattr handlers defined above.  Unlike the
 * individual handlers this symbol is not static, so it is visible outside
 * this file.
 */
const struct xattr_handler * const btrfs_xattr_handlers[] = {
        &btrfs_security_xattr_handler,
        &btrfs_trusted_xattr_handler,
        &btrfs_user_xattr_handler,
        &btrfs_btrfs_xattr_handler,
        NULL,
};

/*
 * Store every entry of @xattr_array on @inode, with XATTR_SECURITY_PREFIX
 * prepended to each entry's name.
 *
 * @xattr_array is terminated by an entry whose name is NULL.  @fs_private
 * is the transaction handle the attributes are written under.  Processing
 * stops at the first allocation failure (-ENOMEM) or the first negative
 * result from btrfs_setxattr(); otherwise the result of the last
 * btrfs_setxattr() call is returned (0 for an empty array).
 */
static int btrfs_initxattrs(struct inode *inode,
                            const struct xattr *xattr_array, void *fs_private)
{
        struct btrfs_trans_handle *trans = fs_private;
        const struct xattr *cur = xattr_array;
        unsigned int nofs_flag;
        int ret = 0;

        /*
         * We're holding a transaction handle, so use a NOFS memory allocation
         * context to avoid deadlock if reclaim happens.
         */
        nofs_flag = memalloc_nofs_save();

        while (cur->name != NULL) {
                char *full_name;

                /* Room for the prefix, the entry's name and the trailing NUL. */
                full_name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
                                    strlen(cur->name) + 1, GFP_KERNEL);
                if (!full_name) {
                        ret = -ENOMEM;
                        break;
                }
                strcpy(full_name, XATTR_SECURITY_PREFIX);
                strcpy(full_name + XATTR_SECURITY_PREFIX_LEN, cur->name);

                /* A capability xattr is about to exist: drop the cached miss. */
                if (!strcmp(full_name, XATTR_NAME_CAPS))
                        clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);

                ret = btrfs_setxattr(trans, inode, full_name, cur->value,
                                     cur->value_len, 0);
                kfree(full_name);
                if (ret < 0)
                        break;

                cur++;
        }

        memalloc_nofs_restore(nofs_flag);

        return ret;
}

/*
 * Initialize the security xattrs of a new inode.
 *
 * Thin wrapper around security_inode_init_security(): btrfs_initxattrs() is
 * passed as the callback and @trans as its private data, so the attributes
 * are written under the caller's transaction handle.  The return value of
 * security_inode_init_security() is passed back unchanged.
 */
int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
                              struct inode *inode, struct inode *dir,
                              const struct qstr *qstr)
{
        return security_inode_init_security(inode, dir, qstr,
                                            &btrfs_initxattrs, trans);
}













































    1 


















    3 














































    2 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * kref.h - library routines for handling generic reference counted objects
 *
 * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2004 IBM Corp.
 *
 * based on kobject.h which was:
 * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (C) 2002-2003 Open Source Development Labs
 */

#ifndef _KREF_H_
#define _KREF_H_

#include <linux/spinlock.h>
#include <linux/refcount.h>

/*
 * Reference counter meant to be embedded in an object; it wraps a single
 * refcount_t and is manipulated through the kref_*() helpers below.
 */
struct kref {
        refcount_t refcount;
};

/* Static initializer for a struct kref whose count starts at @n. */
#define KREF_INIT(n)        { .refcount = REFCOUNT_INIT(n), }

/**
 * kref_init - initialize object.
 * @kref: object in question.
 *
 * Sets the reference count to 1, i.e. the caller ends up holding the
 * initial reference.
 */
static inline void kref_init(struct kref *kref)
{
        refcount_set(&kref->refcount, 1);
}

/**
 * kref_read - read the current reference count of an object.
 * @kref: object.
 *
 * Return: the value reported by refcount_read() for the embedded counter.
 */
static inline unsigned int kref_read(const struct kref *kref)
{
        return refcount_read(&kref->refcount);
}

/**
 * kref_get - increment refcount for object.
 * @kref: object.
 *
 * The increment is unconditional and nothing is returned; see
 * kref_get_unless_zero() for a variant that reports whether the
 * increment took place.
 */
static inline void kref_get(struct kref *kref)
{
        refcount_inc(&kref->refcount);
}

/**
 * kref_put - decrement refcount for object.
 * @kref: object.
 * @release: pointer to the function that will clean up the object when the
 *             last reference to the object is released.
 *             This pointer is required, and it is not acceptable to pass kfree
 *             in as this function.
 *
 * Decrement the refcount, and if it drops to 0, call release().
 *
 * Return 1 if the object was removed, otherwise return 0.  Beware: a return
 * value of 0 does not mean the kref is still in memory, since another holder
 * may drop the last reference at any time.  Only use the return value to
 * learn that the kref is now gone, never to conclude that it is still
 * present.
 */
static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
        /* Other references remain: nothing to clean up. */
        if (!refcount_dec_and_test(&kref->refcount))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_put_mutex - decrement refcount for object, with a mutex involved.
 * @kref: object.
 * @release: function called to clean up the object once
 *             refcount_dec_and_mutex_lock() reports success.
 * @lock: mutex handed to refcount_dec_and_mutex_lock().
 *
 * NOTE(review): presumably refcount_dec_and_mutex_lock() returns non-zero
 * only when the count reached 0 and @lock has been taken, which would make
 * release() run with @lock held and responsible for dropping it - confirm
 * against the refcount API before relying on this.
 *
 * Return 1 if release() was called, otherwise return 0.
 */
static inline int kref_put_mutex(struct kref *kref,
                                 void (*release)(struct kref *kref),
                                 struct mutex *lock)
{
        if (!refcount_dec_and_mutex_lock(&kref->refcount, lock))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_put_lock - decrement refcount for object, with a spinlock involved.
 * @kref: object.
 * @release: function called to clean up the object once
 *             refcount_dec_and_lock() reports success.
 * @lock: spinlock handed to refcount_dec_and_lock().
 *
 * NOTE(review): presumably refcount_dec_and_lock() returns non-zero only
 * when the count reached 0 and @lock has been taken, which would make
 * release() run with @lock held and responsible for dropping it - confirm
 * against the refcount API before relying on this.
 *
 * Return 1 if release() was called, otherwise return 0.
 */
static inline int kref_put_lock(struct kref *kref,
                                void (*release)(struct kref *kref),
                                spinlock_t *lock)
{
        if (!refcount_dec_and_lock(&kref->refcount, lock))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_get_unless_zero - Increment refcount for object unless it is zero.
 * @kref: object.
 *
 * Return non-zero if the increment succeeded. Otherwise return 0.
 * The result is marked __must_check: callers have to look at it.
 *
 * This function is intended to simplify locking around refcounting for
 * objects that can be looked up from a lookup structure, and which are
 * removed from that lookup structure in the object destructor.
 * Without it, operations on such objects require at least a read lock around
 * lookup + kref_get, and a write lock around kref_put + remove from lookup
 * structure. Furthermore, RCU implementations become extremely tricky.
 * With a lookup followed by a kref_get_unless_zero *with return value check*,
 * locking in the kref_put path can be deferred to the actual removal from
 * the lookup structure, and RCU lookups become trivial.
 */
static inline int __must_check kref_get_unless_zero(struct kref *kref)
{
        return refcount_inc_not_zero(&kref->refcount);
}
#endif /* _KREF_H_ */





























































    1 



































































































































































































































































































































































































































































































































































































































































    3 
    3 

    3 





    5 















    2 


    5 


    5 














    4 












    4 




    2 



    3 




























































































    3 








































































































































































































































   14 















   26 































































































































































    8 


    7 
    8 















    4 
    4 























   35 














   14 







































    3 












































































    2 





































    1 


    1 

















    2 
   22 
    1 


















    4 











































    2 
    2 





    2 

    2 



















































































    1 






















































































































































































































   13 




    1 




   13 




    1 



















































































































































































   28 
























    6 

































    1 
















    1 











   17 
   21 










































































    5 







































































































































































































































































































































































































































    4 
































    2 
    3 




    3 
    2 





























































































































































   12 





   11 































































































































    2 



















































































































































































































































































































































































































































































































































































































    2 





















































































    2 







































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_H
#define _LINUX_MM_H

#include <linux/errno.h>
#include <linux/mmdebug.h>
#include <linux/gfp.h>
#include <linux/pgalloc_tag.h>
#include <linux/bug.h>
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/debug_locks.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/range.h>
#include <linux/pfn.h>
#include <linux/percpu-refcount.h>
#include <linux/bit_spinlock.h>
#include <linux/shrinker.h>
#include <linux/resource.h>
#include <linux/page_ext.h>
#include <linux/err.h>
#include <linux/page-flags.h>
#include <linux/page_ref.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/kasan.h>
#include <linux/memremap.h>
#include <linux/slab.h>

struct mempolicy;
struct anon_vma;
struct anon_vma_chain;
struct user_struct;
struct pt_regs;
struct folio_batch;

extern int sysctl_page_lock_unfairness;

void mm_core_init(void);
void init_mm_internals(void);

#ifndef CONFIG_NUMA                /* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;

/*
 * set_max_mapnr - store @limit in the global max_mapnr.
 * This variant is only built when CONFIG_NUMA is not set.
 */
static inline void set_max_mapnr(unsigned long limit)
{
        max_mapnr = limit;
}
#else
/* With CONFIG_NUMA max_mapnr is not declared here, so this is a no-op. */
static inline void set_max_mapnr(unsigned long limit) { }
#endif

extern atomic_long_t _totalram_pages;
/*
 * totalram_pages - atomically read the global _totalram_pages counter.
 *
 * Return: the counter value converted to unsigned long.
 */
static inline unsigned long totalram_pages(void)
{
        long nr_pages = atomic_long_read(&_totalram_pages);

        return (unsigned long)nr_pages;
}

/* Atomically increment the global _totalram_pages counter by one. */
static inline void totalram_pages_inc(void)
{
        atomic_long_inc(&_totalram_pages);
}

/* Atomically decrement the global _totalram_pages counter by one. */
static inline void totalram_pages_dec(void)
{
        atomic_long_dec(&_totalram_pages);
}

/*
 * Atomically add @count to the global _totalram_pages counter.
 * @count is signed, so a negative value subtracts.
 */
static inline void totalram_pages_add(long count)
{
        atomic_long_add(count, &_totalram_pages);
}

extern void * high_memory;
extern int page_cluster;
extern const int page_cluster_max;

#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
#define sysctl_legacy_va_layout 0
#endif

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
extern const int mmap_rnd_bits_min;
extern int mmap_rnd_bits_max __ro_after_init;
extern int mmap_rnd_bits __read_mostly;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
extern const int mmap_rnd_compat_bits_min;
extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif

#include <asm/page.h>
#include <asm/processor.h>

#ifndef __pa_symbol
#define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
#endif

#ifndef page_to_virt
#define page_to_virt(x)        __va(PFN_PHYS(page_to_pfn(x)))
#endif

#ifndef lm_alias
#define lm_alias(x)        __va(__pa_symbol(x))
#endif

/*
 * To prevent common memory management code establishing
 * a zero page mapping on a read fault.
 * This macro should be defined within <asm/pgtable.h>.
 * s390 does this to prevent multiplexing of hardware bits
 * related to the physical page in case of virtualization.
 */
#ifndef mm_forbids_zeropage
#define mm_forbids_zeropage(X)        (0)
#endif

/*
 * On some architectures it is expensive to call memset() for small sizes.
 * If an architecture decides to implement their own version of
 * mm_zero_struct_page they should wrap the defines below in a #ifndef and
 * define their own version of this macro in <asm/pgtable.h>
 */
#if BITS_PER_LONG == 64
/* This function must be updated when the size of struct page grows above 96
 * or reduces below 56. The idea is that the compiler optimizes out the
 * switch() statement, and only leaves move/store instructions. Also, the
 * compiler can combine write statements if they are both assignments and can
 * be reordered; this can result in several of the writes here being dropped.
 */
#define        mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
/*
 * Zero a struct page with explicit word stores instead of memset().
 *
 * The switch is on the compile-time constant sizeof(struct page). Each case
 * clears one unsigned long slot and falls through to the cases for smaller
 * sizes, so every word of the structure ends up zero whichever of the
 * supported sizes applies.
 */
static inline void __mm_zero_struct_page(struct page *page)
{
        /* View the page structure as an array of unsigned long words. */
        unsigned long *_pp = (void *)page;

         /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */
        BUILD_BUG_ON(sizeof(struct page) & 7);
        BUILD_BUG_ON(sizeof(struct page) < 56);
        BUILD_BUG_ON(sizeof(struct page) > 96);

        /* Enter at the case matching the structure size, highest word first. */
        switch (sizeof(struct page)) {
        case 96:
                _pp[11] = 0;
                fallthrough;
        case 88:
                _pp[10] = 0;
                fallthrough;
        case 80:
                _pp[9] = 0;
                fallthrough;
        case 72:
                _pp[8] = 0;
                fallthrough;
        case 64:
                _pp[7] = 0;
                fallthrough;
        case 56:
                /* The first seven words exist for every supported size. */
                _pp[6] = 0;
                _pp[5] = 0;
                _pp[4] = 0;
                _pp[3] = 0;
                _pp[2] = 0;
                _pp[1] = 0;
                _pp[0] = 0;
        }
}
#else
#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
#endif

/*
 * Default maximum number of active map areas, this limits the number of vmas
 * per mm struct. Users can overwrite this number by sysctl but there is a
 * problem.
 *
 * When a program's coredump is generated in ELF format, one section is created
 * per vma. In ELF, the number of sections is represented as an unsigned short.
 * This means the number of sections should be smaller than 65535 at coredump.
 * Because the kernel adds some informative sections to the image of the
 * program when generating a coredump, we need some margin. The number of
 * extra sections is 1-3 now and depends on the arch. We use "5" as a safe
 * margin here.
 *
 * ELF extended numbering allows more than 65535 sections, so 16-bit bound is
 * not a hard limit any more. Although some userspace tools can be surprised by
 * that.
 */
#define MAPCOUNT_ELF_CORE_MARGIN        (5)
#define DEFAULT_MAX_MAP_COUNT        (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)

extern int sysctl_max_map_count;

extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;

extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;

int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
#define folio_page_idx(folio, p)        (page_to_pfn(p) - folio_pfn(folio))
#else
#define nth_page(page,n) ((page) + (n))
#define folio_page_idx(folio, p)        ((p) - &(folio)->page)
#endif

/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)

/* to align the pointer to the (prev) page boundary */
#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE)

/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr)        IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)

/*
 * lru_to_folio - map the last entry of a list to its folio.
 * @head: list head whose ->prev entry is embedded in a folio as ->lru.
 *
 * Return: the struct folio containing head->prev.
 */
static inline struct folio *lru_to_folio(struct list_head *head)
{
        struct list_head *tail = head->prev;

        return list_entry(tail, struct folio, lru);
}

void setup_initial_init_mm(void *start_code, void *end_code,
                           void *end_data, void *brk);

/*
 * Linux kernel virtual memory manager primitives.
 * The idea being to have a "virtual" mm in the same way
 * we have a virtual fs - giving a cleaner interface to the
 * mm details, and allowing different kinds of memory mappings
 * (from shared memory to executable loading to arbitrary
 * mmap() functions).
 */

struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);
/* Use only if VMA has no other users */
void __vm_area_free(struct vm_area_struct *vma);

#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
extern struct rw_semaphore nommu_region_sem;

extern unsigned int kobjsize(const void *objp);
#endif

/*
 * vm_flags in vm_area_struct, see mm_types.h.
 * When changing, update also include/trace/events/mmflags.h
 */
#define VM_NONE                0x00000000

#define VM_READ                0x00000001        /* currently active flags */
#define VM_WRITE        0x00000002
#define VM_EXEC                0x00000004
#define VM_SHARED        0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD        0x00000010        /* limits for mprotect() etc */
#define VM_MAYWRITE        0x00000020
#define VM_MAYEXEC        0x00000040
#define VM_MAYSHARE        0x00000080

#define VM_GROWSDOWN        0x00000100        /* general info on the segment */
#ifdef CONFIG_MMU
#define VM_UFFD_MISSING        0x00000200        /* missing pages tracking */
#else /* CONFIG_MMU */
#define VM_MAYOVERLAY        0x00000200        /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
#define VM_UFFD_MISSING        0
#endif /* CONFIG_MMU */
#define VM_PFNMAP        0x00000400        /* Page-ranges managed without "struct page", just pure PFN */
#define VM_UFFD_WP        0x00001000        /* wrprotect pages tracking */

#define VM_LOCKED        0x00002000
#define VM_IO           0x00004000        /* Memory mapped I/O or similar */

                                        /* Used by sys_madvise() */
#define VM_SEQ_READ        0x00008000        /* App will access data sequentially */
#define VM_RAND_READ        0x00010000        /* App will not benefit from clustered reads */

#define VM_DONTCOPY        0x00020000      /* Do not copy this vma on fork */
#define VM_DONTEXPAND        0x00040000        /* Cannot expand with mremap() */
#define VM_LOCKONFAULT        0x00080000        /* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT        0x00100000        /* Is a VM accounted object */
#define VM_NORESERVE        0x00200000        /* should the VM suppress accounting */
#define VM_HUGETLB        0x00400000        /* Huge TLB Page VM */
#define VM_SYNC                0x00800000        /* Synchronous page faults */
#define VM_ARCH_1        0x01000000        /* Architecture-specific flag */
#define VM_WIPEONFORK        0x02000000        /* Wipe VMA contents in child. */
#define VM_DONTDUMP        0x04000000        /* Do not include in the core dump */

#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY        0x08000000        /* Not soft dirty clean area */
#else
# define VM_SOFTDIRTY        0
#endif

#define VM_MIXEDMAP        0x10000000        /* Can contain "struct page" and pure PFN pages */
#define VM_HUGEPAGE        0x20000000        /* MADV_HUGEPAGE marked this vma */
#define VM_NOHUGEPAGE        0x40000000        /* MADV_NOHUGEPAGE marked this vma */
#define VM_MERGEABLE        0x80000000        /* KSM may merge identical pages */

#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0        32        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1        33        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_2        34        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3        35        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4        36        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_5        37        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0        BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1        BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2        BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3        BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4        BIT(VM_HIGH_ARCH_BIT_4)
#define VM_HIGH_ARCH_5        BIT(VM_HIGH_ARCH_BIT_5)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */

#ifdef CONFIG_ARCH_HAS_PKEYS
# define VM_PKEY_SHIFT        VM_HIGH_ARCH_BIT_0
# define VM_PKEY_BIT0        VM_HIGH_ARCH_0        /* A protection key is a 4-bit value */
# define VM_PKEY_BIT1        VM_HIGH_ARCH_1        /* on x86 and 5-bit value on ppc64   */
# define VM_PKEY_BIT2        VM_HIGH_ARCH_2
# define VM_PKEY_BIT3        VM_HIGH_ARCH_3
#ifdef CONFIG_PPC
# define VM_PKEY_BIT4  VM_HIGH_ARCH_4
#else
# define VM_PKEY_BIT4  0
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */

#ifdef CONFIG_X86_USER_SHADOW_STACK
/*
 * VM_SHADOW_STACK should not be set with VM_SHARED because core mm lacks
 * support for it.
 *
 * These VMAs will get a single end guard page. This helps userspace protect
 * itself from attacks. A single page is enough for current shadow stack archs
 * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c
 * for more details on the guard size.
 */
# define VM_SHADOW_STACK        VM_HIGH_ARCH_5
#else
# define VM_SHADOW_STACK        VM_NONE
#endif

#if defined(CONFIG_X86)
# define VM_PAT                VM_ARCH_1        /* PAT reserves whole VMA at once (x86) */
#elif defined(CONFIG_PPC)
# define VM_SAO                VM_ARCH_1        /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP        VM_ARCH_1
#elif defined(CONFIG_SPARC64)
# define VM_SPARC_ADI        VM_ARCH_1        /* Uses ADI tag for access control */
# define VM_ARCH_CLEAR        VM_SPARC_ADI
#elif defined(CONFIG_ARM64)
# define VM_ARM64_BTI        VM_ARCH_1        /* BTI guarded page, a.k.a. GP bit */
# define VM_ARCH_CLEAR        VM_ARM64_BTI
#elif !defined(CONFIG_MMU)
# define VM_MAPPED_COPY        VM_ARCH_1        /* T if mapped copy of data (nommu mmap) */
#endif

#if defined(CONFIG_ARM64_MTE)
# define VM_MTE                VM_HIGH_ARCH_0        /* Use Tagged memory for access control */
# define VM_MTE_ALLOWED        VM_HIGH_ARCH_1        /* Tagged memory permitted */
#else
# define VM_MTE                VM_NONE
# define VM_MTE_ALLOWED        VM_NONE
#endif

#ifndef VM_GROWSUP
# define VM_GROWSUP        VM_NONE
#endif

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
# define VM_UFFD_MINOR_BIT        38
# define VM_UFFD_MINOR                BIT(VM_UFFD_MINOR_BIT)        /* UFFD minor faults */
#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
# define VM_UFFD_MINOR                VM_NONE
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */

/*
 * This flag is used to connect VFIO to arch specific KVM code. It
 * indicates that the memory under this VMA is safe for use with any
 * non-cachable memory type inside KVM. Some VFIO devices, on some
 * platforms, are thought to be unsafe and can cause machine crashes
 * if KVM does not lock down the memory type.
 */
#ifdef CONFIG_64BIT
#define VM_ALLOW_ANY_UNCACHED_BIT        39
#define VM_ALLOW_ANY_UNCACHED                BIT(VM_ALLOW_ANY_UNCACHED_BIT)
#else
#define VM_ALLOW_ANY_UNCACHED                VM_NONE
#endif

#ifdef CONFIG_64BIT
/* VM is sealed, in vm_flags */
#define VM_SEALED        _BITUL(63)
#endif

/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)

#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)

/* Common data flag combinations */
#define VM_DATA_FLAGS_TSK_EXEC        (VM_READ | VM_WRITE | TASK_EXEC | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_NON_EXEC        (VM_READ | VM_WRITE | VM_MAYREAD | \
                                 VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_EXEC        (VM_READ | VM_WRITE | VM_EXEC | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)

#ifndef VM_DATA_DEFAULT_FLAGS                /* arch can override this */
#define VM_DATA_DEFAULT_FLAGS  VM_DATA_FLAGS_EXEC
#endif

#ifndef VM_STACK_DEFAULT_FLAGS                /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif

#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)

#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK        VM_GROWSUP
#define VM_STACK_EARLY        VM_GROWSDOWN
#else
#define VM_STACK        VM_GROWSDOWN
#define VM_STACK_EARLY        0
#endif

#define VM_STACK_FLAGS        (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)

/* VMA basic access permission flags */
#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)


/*
 * Special vmas that are non-mergable, non-mlock()able.
 */
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)

/* This mask prevents VMA from being scanned with khugepaged */
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)

/* This mask defines which mm->def_flags a process can inherit from its parent */
#define VM_INIT_DEF_MASK        VM_NOHUGEPAGE

/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK        (VM_LOCKED | VM_LOCKONFAULT)

/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR        VM_NONE
#endif
#define VM_FLAGS_CLEAR        (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)

/*
 * Mapping from the currently active vm_flags protection bits (the
 * low four bits) to a page protection mask.
 */

/*
 * The default fault flags that should be used by most of the
 * arch-specific page fault handlers.
 */
#define FAULT_FLAG_DEFAULT  (FAULT_FLAG_ALLOW_RETRY | \
                             FAULT_FLAG_KILLABLE | \
                             FAULT_FLAG_INTERRUPTIBLE)

/**
 * fault_flag_allow_retry_first - check ALLOW_RETRY the first time
 * @flags: Fault flags.
 *
 * Useful where the caller would rather not hold the mmap_lock for a long
 * time while waiting for some other condition to change: on the first
 * attempt it can politely drop the mmap_lock so that other processes
 * wanting the lock are not starved.
 *
 * Return: true if the page fault allows retry and this is the first
 * attempt of the fault handling; false otherwise.
 */
static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
{
        /* Retrying is not permitted at all. */
        if (!(flags & FAULT_FLAG_ALLOW_RETRY))
                return false;

        /* Retrying is permitted: first attempt unless TRIED is already set. */
        return !(flags & FAULT_FLAG_TRIED);
}

#define FAULT_FLAG_TRACE \
        { FAULT_FLAG_WRITE,                "WRITE" }, \
        { FAULT_FLAG_MKWRITE,                "MKWRITE" }, \
        { FAULT_FLAG_ALLOW_RETRY,        "ALLOW_RETRY" }, \
        { FAULT_FLAG_RETRY_NOWAIT,        "RETRY_NOWAIT" }, \
        { FAULT_FLAG_KILLABLE,                "KILLABLE" }, \
        { FAULT_FLAG_TRIED,                "TRIED" }, \
        { FAULT_FLAG_USER,                "USER" }, \
        { FAULT_FLAG_REMOTE,                "REMOTE" }, \
        { FAULT_FLAG_INSTRUCTION,        "INSTRUCTION" }, \
        { FAULT_FLAG_INTERRUPTIBLE,        "INTERRUPTIBLE" }, \
        { FAULT_FLAG_VMA_LOCK,                "VMA_LOCK" }

/*
 * vm_fault is filled by the pagefault handler and passed to the vma's
 * ->fault function. The vma's ->fault is responsible for returning a bitmask
 * of VM_FAULT_xxx flags that give details about how the fault was handled.
 *
 * MM layer fills up gfp_mask for page allocations but fault handler might
 * alter it if its implementation requires a different allocation context.
 *
 * pgoff should be used in favour of virtual_address, if possible.
 */
/* Per-fault state handed to the vm_operations_struct fault callbacks. */
struct vm_fault {
        /*
         * Description of the fault. The members live in a const anonymous
         * struct, so they cannot be assigned through a struct vm_fault.
         */
        const struct {
                struct vm_area_struct *vma;        /* Target VMA */
                gfp_t gfp_mask;                        /* gfp mask to be used for allocations */
                pgoff_t pgoff;                        /* Logical page offset based on vma */
                unsigned long address;                /* Faulting virtual address - masked */
                unsigned long real_address;        /* Faulting virtual address - unmasked */
        };
        enum fault_flag flags;                /* FAULT_FLAG_xxx flags
                                         * XXX: should really be 'const' */
        pmd_t *pmd;                        /* Pointer to pmd entry matching
                                         * the 'address' */
        pud_t *pud;                        /* Pointer to pud entry matching
                                         * the 'address'
                                         */
        /* Snapshot of the entry at fault time; only one member is meaningful. */
        union {
                pte_t orig_pte;                /* Value of PTE at the time of fault */
                pmd_t orig_pmd;                /* Value of PMD at the time of fault,
                                         * used by PMD fault only.
                                         */
        };

        struct page *cow_page;                /* Page handler may use for COW fault */
        struct page *page;                /* ->fault handlers should return a
                                         * page here, unless VM_FAULT_NOPAGE
                                         * is set (which is also implied by
                                         * VM_FAULT_ERROR).
                                         */
        /* These three entries are valid only while holding ptl lock */
        pte_t *pte;                        /* Pointer to pte entry matching
                                         * the 'address'. NULL if the page
                                         * table hasn't been allocated.
                                         */
        spinlock_t *ptl;                /* Page table lock.
                                         * Protects pte page table if 'pte'
                                         * is not NULL, otherwise pmd.
                                         */
        pgtable_t prealloc_pte;                /* Pre-allocated pte page table.
                                         * vm_ops->map_pages() sets up a page
                                         * table from atomic context.
                                         * do_fault_around() pre-allocates
                                         * page table to avoid allocation from
                                         * atomic context.
                                         */
};

/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 */
/* Table of optional per-VMA callbacks supplied by the owner of a mapping. */
struct vm_operations_struct {
        /*
         * NOTE(review): presumably the counterpart of ->close, invoked when
         * an area starts being used by an MM - confirm against the callers.
         */
        void (*open)(struct vm_area_struct * area);
        /**
         * @close: Called when the VMA is being removed from the MM.
         * Context: User context.  May sleep.  Caller holds mmap_lock.
         */
        void (*close)(struct vm_area_struct * area);
        /* Called any time before splitting to check if it's allowed */
        int (*may_split)(struct vm_area_struct *area, unsigned long addr);
        /*
         * NOTE(review): presumably invoked on the mremap() path for @area;
         * the meaning of the return value is not visible here - confirm.
         */
        int (*mremap)(struct vm_area_struct *area);
        /*
         * Called by mprotect() to make driver-specific permission
         * checks before mprotect() is finalised.   The VMA must not
         * be modified.  Returns 0 if mprotect() can proceed.
         */
        int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, unsigned long newflags);
        /*
         * Handle the fault described by @vmf. Returns a bitmask of
         * VM_FAULT_xxx flags describing how the fault was handled.
         */
        vm_fault_t (*fault)(struct vm_fault *vmf);
        /*
         * Like ->fault but additionally passed @order.
         * NOTE(review): presumably the order of the huge mapping being
         * faulted - confirm against the callers.
         */
        vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
        /*
         * Passed a range of page offsets (@start_pgoff, @end_pgoff). Per the
         * prealloc_pte comment in struct vm_fault, this sets up a page table
         * from atomic context.
         */
        vm_fault_t (*map_pages)(struct vm_fault *vmf,
                        pgoff_t start_pgoff, pgoff_t end_pgoff);
        /* NOTE(review): presumably reports the page size backing @area - confirm. */
        unsigned long (*pagesize)(struct vm_area_struct * area);

        /* notification that a previously read-only page is about to become
         * writable, if an error is returned it will cause a SIGBUS */
        vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);

        /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
        vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);

        /* called by access_process_vm when get_user_pages() fails, typically
         * for use by special VMAs. See also generic_access_phys() for a generic
         * implementation useful for any iomem mapping.
         */
        int (*access)(struct vm_area_struct *vma, unsigned long addr,
                      void *buf, int len, int write);

        /* Called by the /proc/PID/maps code to ask the vma whether it
         * has a special name.  Returning non-NULL will also cause this
         * vma to be dumped unconditionally. */
        const char *(*name)(struct vm_area_struct *vma);

#ifdef CONFIG_NUMA
        /*
         * set_policy() op must add a reference to any non-NULL @new mempolicy
         * to hold the policy upon return.  Caller should pass NULL @new to
         * remove a policy and fall back to surrounding context--i.e. do not
         * install a MPOL_DEFAULT policy, nor the task or system default
         * mempolicy.
         */
        int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

        /*
         * get_policy() op must add reference [mpol_get()] to any policy at
         * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
         * in mm/mempolicy.c will do this automatically.
         * get_policy() must NOT add a ref if the policy at (vma,addr) is not
         * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
         * If no [shared/vma] mempolicy exists at the addr, get_policy() op
         * must return NULL--i.e., do not "fallback" to task or system default
         * policy.
         */
        struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
                                        unsigned long addr, pgoff_t *ilx);
#endif
        /*
         * Called by vm_normal_page() for special PTEs to find the
         * page for @addr.  This is useful if the default behavior
         * (using pte_page()) would not find the correct page.
         */
        struct page *(*find_special_page)(struct vm_area_struct *vma,
                                          unsigned long addr);
};

#ifdef CONFIG_NUMA_BALANCING
/* Start @vma with no NUMA-balancing state attached. */
static inline void vma_numab_state_init(struct vm_area_struct *vma)
{
        vma->numab_state = NULL;
}
/* kfree() @vma's numab_state; the pointer itself is left unchanged. */
static inline void vma_numab_state_free(struct vm_area_struct *vma)
{
        kfree(vma->numab_state);
}
#else
/* Without CONFIG_NUMA_BALANCING both helpers are empty. */
static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_PER_VMA_LOCK
/*
 * Try to read-lock a vma. The function is allowed to occasionally yield false
 * locked result to avoid performance overhead, in which case we fall back to
 * using mmap_lock. The function should never yield false unlocked result.
 *
 * Return: true with vma->vm_lock->lock held for reading; false, with no lock
 * held, when the VMA looks write-locked or the trylock fails.
 */
static inline bool vma_start_read(struct vm_area_struct *vma)
{
        /*
         * Check before locking. A race might cause false locked result.
         * We can use READ_ONCE() for the mm_lock_seq here, and don't need
         * ACQUIRE semantics, because this is just a lockless check whose result
         * we don't rely on for anything - the mm_lock_seq read against which we
         * need ordering is below.
         */
        if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
                return false;

        /* Never block here: a failed trylock is reported as "locked". */
        if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
                return false;

        /*
         * Overflow might produce false locked result.
         * False unlocked result is impossible because we modify and check
         * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
         * modification invalidates all existing locks.
         *
         * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
         * racing with vma_end_write_all(), we only start reading from the VMA
         * after it has been unlocked.
         * This pairs with RELEASE semantics in vma_end_write_all().
         */
        if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
                /* Write-locked after all: drop the read lock taken above. */
                up_read(&vma->vm_lock->lock);
                return false;
        }
        return true;
}

/*
 * Release the per-VMA read lock on @vma. The up_read() is wrapped in an RCU
 * read-side critical section so the vma stays alive until it completes.
 */
static inline void vma_end_read(struct vm_area_struct *vma)
{
        rcu_read_lock(); /* keeps vma alive till the end of up_read */
        up_read(&vma->vm_lock->lock);
        rcu_read_unlock();
}

/*
 * __is_vma_write_locked - check whether @vma is already write-locked.
 * @vma: VMA to check.
 * @mm_lock_seq: output; always set to the owning mm's current mm_lock_seq so
 *               that callers can reuse the value without re-reading it.
 *
 * WARNING! Can only be used if mmap_lock is expected to be write-locked.
 *
 * Declared inline like the other helpers in this header, so that translation
 * units including the header do not each carry a plain static definition.
 *
 * Return: true if vma->vm_lock_seq equals the mm's mm_lock_seq.
 */
static inline bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
{
        mmap_assert_write_locked(vma->vm_mm);

        /*
         * current task is holding mmap_write_lock, both vma->vm_lock_seq and
         * mm->mm_lock_seq can't be concurrently modified.
         */
        *mm_lock_seq = vma->vm_mm->mm_lock_seq;
        return (vma->vm_lock_seq == *mm_lock_seq);
}

/*
 * Begin writing to a VMA.
 * Exclude concurrent readers under the per-VMA lock until the currently
 * write-locked mmap_lock is dropped or downgraded.
 *
 * The rwsem is held for writing only across the vm_lock_seq update. After
 * up_write(), readers are kept out by the sequence comparison done in
 * vma_start_read().
 */
static inline void vma_start_write(struct vm_area_struct *vma)
{
        int mm_lock_seq;

        /* Already write-locked: nothing to do. */
        if (__is_vma_write_locked(vma, &mm_lock_seq))
                return;

        down_write(&vma->vm_lock->lock);
        /*
         * We should use WRITE_ONCE() here because we can have concurrent reads
         * from the early lockless pessimistic check in vma_start_read().
         * We don't really care about the correctness of that early check, but
         * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
         */
        WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
        up_write(&vma->vm_lock->lock);
}

/*
 * Assert, through VM_BUG_ON_VMA(), that __is_vma_write_locked() holds for
 * @vma. The sequence number fetched by the helper is not used here.
 */
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
        int mm_lock_seq;

        VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}

/*
 * Assert that @vma is locked. A held vma->vm_lock->lock rwsem is accepted
 * as is; otherwise the vma has to pass vma_assert_write_locked().
 */
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
        if (rwsem_is_locked(&vma->vm_lock->lock))
                return;

        vma_assert_write_locked(vma);
}

/*
 * Record in vma->detached whether @vma is detached.
 * Marking it detached first asserts that the vma is write-locked; clearing
 * the flag performs no lock check.
 */
static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
{
        /* When detaching vma should be write-locked */
        if (detached)
                vma_assert_write_locked(vma);
        vma->detached = detached;
}

/*
 * Drop the lock a fault was handled under: the per-VMA read lock when
 * FAULT_FLAG_VMA_LOCK is set in vmf->flags, the mmap read lock otherwise.
 */
static inline void release_fault_lock(struct vm_fault *vmf)
{
        if (!(vmf->flags & FAULT_FLAG_VMA_LOCK)) {
                mmap_read_unlock(vmf->vma->vm_mm);
                return;
        }

        vma_end_read(vmf->vma);
}

/*
 * Assert that the lock matching the fault mode is held: the VMA lock when
 * FAULT_FLAG_VMA_LOCK is set in vmf->flags, the mmap lock otherwise.
 */
static inline void assert_fault_locked(struct vm_fault *vmf)
{
        if (!(vmf->flags & FAULT_FLAG_VMA_LOCK)) {
                mmap_assert_locked(vmf->vma->vm_mm);
                return;
        }

        vma_assert_locked(vmf->vma);
}

struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address);

#else /* CONFIG_PER_VMA_LOCK */

/*
 * !CONFIG_PER_VMA_LOCK stubs: vma_start_read() always reports failure,
 * vma_end_read(), vma_start_write() and vma_mark_detached() do nothing, and
 * vma_assert_write_locked() asserts that mmap_lock is write-locked instead.
 */
static inline bool vma_start_read(struct vm_area_struct *vma)
                { return false; }
static inline void vma_end_read(struct vm_area_struct *vma) {}
static inline void vma_start_write(struct vm_area_struct *vma) {}
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
                { mmap_assert_write_locked(vma->vm_mm); }
static inline void vma_mark_detached(struct vm_area_struct *vma,
                                     bool detached) {}

/* !CONFIG_PER_VMA_LOCK stub: never finds a VMA, always returns NULL. */
static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                unsigned long address)
{
        return NULL;
}

/* !CONFIG_PER_VMA_LOCK variant: asserts that the owning mm's mmap_lock is held. */
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
        mmap_assert_locked(vma->vm_mm);
}

/* !CONFIG_PER_VMA_LOCK variant: always drops the mmap read lock. */
static inline void release_fault_lock(struct vm_fault *vmf)
{
        mmap_read_unlock(vmf->vma->vm_mm);
}

/* !CONFIG_PER_VMA_LOCK variant: always asserts that the mmap lock is held. */
static inline void assert_fault_locked(struct vm_fault *vmf)
{
        mmap_assert_locked(vmf->vma->vm_mm);
}

#endif /* CONFIG_PER_VMA_LOCK */

extern const struct vm_operations_struct vma_dummy_vm_ops;

/*
 * WARNING: vma_init does not initialize vma->vm_lock.
 * Use vm_area_alloc()/vm_area_free() if vma needs locking.
 *
 * Zeroes *vma, then sets its mm to @mm, installs vma_dummy_vm_ops as vm_ops,
 * initializes the anon_vma_chain list head, marks the vma as not detached
 * and calls vma_numab_state_init().
 */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
        /* Must come first: everything below writes into the zeroed struct. */
        memset(vma, 0, sizeof(*vma));
        vma->vm_mm = mm;
        vma->vm_ops = &vma_dummy_vm_ops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);
        vma_mark_detached(vma, false);
        vma_numab_state_init(vma);
}

/*
 * Use when VMA is not part of the VMA tree and needs no locking.
 * Plain store of @flags into the private __vm_flags field; no lock is taken
 * or asserted.
 */
static inline void vm_flags_init(struct vm_area_struct *vma,
                                 vm_flags_t flags)
{
        ACCESS_PRIVATE(vma, __vm_flags) = flags;
}

/*
 * Use when VMA is part of the VMA tree and modifications need coordination
 * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
 * it should be locked explicitly beforehand.
 *
 * Asserts that @vma is write-locked, then replaces all of its flags with
 * @flags.
 */
static inline void vm_flags_reset(struct vm_area_struct *vma,
                                  vm_flags_t flags)
{
        vma_assert_write_locked(vma);
        vm_flags_init(vma, flags);
}

/*
 * Replace all flags of @vma with @flags using a single WRITE_ONCE() store.
 * The vma is not locked here; it is only asserted to be write-locked.
 */
static inline void vm_flags_reset_once(struct vm_area_struct *vma,
                                       vm_flags_t flags)
{
        vma_assert_write_locked(vma);
        WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
}

/* Write-lock @vma via vma_start_write(), then OR @flags into its flags. */
static inline void vm_flags_set(struct vm_area_struct *vma,
                                vm_flags_t flags)
{
        vma_start_write(vma);
        ACCESS_PRIVATE(vma, __vm_flags) |= flags;
}

/* Write-lock @vma via vma_start_write(), then clear the bits in @flags. */
static inline void vm_flags_clear(struct vm_area_struct *vma,
                                  vm_flags_t flags)
{
        vma_start_write(vma);
        ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
}

/*
 * Apply @set, then drop @clear, as one flags update. No lock is taken: use
 * only if the VMA is not part of the VMA tree or has no other users and
 * therefore needs no locking.
 */
static inline void __vm_flags_mod(struct vm_area_struct *vma,
                                  vm_flags_t set, vm_flags_t clear)
{
        vm_flags_t updated = vma->vm_flags | set;

        /* Bits present in both @set and @clear end up cleared. */
        updated &= ~clear;
        vm_flags_init(vma, updated);
}

/*
 * Use only when the order of set/clear operations is unimportant, otherwise
 * use vm_flags_{set|clear} explicitly.
 *
 * Write-locks @vma via vma_start_write() and then applies @set and @clear
 * through __vm_flags_mod().
 */
static inline void vm_flags_mod(struct vm_area_struct *vma,
                                vm_flags_t set, vm_flags_t clear)
{
        vma_start_write(vma);
        __vm_flags_mod(vma, set, clear);
}

/* Mark @vma anonymous by clearing vm_ops, the field vma_is_anonymous() tests. */
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
        vma->vm_ops = NULL;
}

/* A VMA counts as anonymous exactly when it has no vm_ops. */
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
        return vma->vm_ops == NULL;
}

/*
 * Indicate if the VMA is a heap for the given task; for
 * /proc/PID/maps that is the heap of the main task.
 *
 * True when the VMA starts below mm->brk and ends above mm->start_brk.
 */
static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
{
        if (vma->vm_start >= vma->vm_mm->brk)
                return false;

        return vma->vm_end > vma->vm_mm->start_brk;
}

/*
 * Indicate if the VMA is a stack for the given task; for
 * /proc/PID/maps that is the stack of the main task.
 *
 * True when mm->start_stack lies within [vm_start, vm_end], both bounds
 * included.
 */
static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
{
        /*
         * We make no effort to guess what a given thread considers to be
         * its "stack".  It's not even well-defined for programs written
         * in languages like Go.
         */
        if (vma->vm_start > vma->vm_mm->start_stack)
                return false;

        return vma->vm_end >= vma->vm_mm->start_stack;
}

/*
 * True for a VMA that has at least one of VM_GROWSDOWN/VM_GROWSUP set and
 * also carries every bit of VM_STACK_INCOMPLETE_SETUP.
 */
static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
{
        int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);

        return maybe_stack &&
                (vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
                                                VM_STACK_INCOMPLETE_SETUP;
}

/*
 * A VMA is "foreign" when the current task has no mm, or when the current
 * task's mm is not the one @vma belongs to.
 */
static inline bool vma_is_foreign(struct vm_area_struct *vma)
{
        return !current->mm || current->mm != vma->vm_mm;
}

/* True when at least one of the VM_ACCESS_FLAGS bits is set on @vma. */
static inline bool vma_is_accessible(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_ACCESS_FLAGS) != 0;
}

/* True only when both VM_SHARED and VM_MAYWRITE are set in @vm_flags. */
static inline bool is_shared_maywrite(vm_flags_t vm_flags)
{
        const vm_flags_t required = VM_SHARED | VM_MAYWRITE;

        return (vm_flags & required) == required;
}

/* is_shared_maywrite() applied to the flags of @vma. */
static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
{
        return is_shared_maywrite(vma->vm_flags);
}

/*
 * mas_find() on the iterator's maple state with a limit of @max - 1, i.e.
 * @max is handled as an exclusive end address.
 */
static inline
struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
{
        return mas_find(&vmi->mas, max - 1);
}

/* Return what mas_find() yields for the iterator with a limit of ULONG_MAX. */
static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
{
        /*
         * Uses mas_find() to get the first VMA when the iterator starts.
         * Calling mas_next() could skip the first entry.
         */
        return mas_find(&vmi->mas, ULONG_MAX);
}

/* mas_next_range() on the iterator's maple state, limit ULONG_MAX. */
static inline
struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
{
        return mas_next_range(&vmi->mas, ULONG_MAX);
}


/* mas_prev() on the iterator's maple state, with a lower limit of 0. */
static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
{
        return mas_prev(&vmi->mas, 0);
}

/* mas_prev_range() on the iterator's maple state, with a lower limit of 0. */
static inline
struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
{
        return mas_prev_range(&vmi->mas, 0);
}

/* Current start index of the iterator's maple state (mas.index). */
static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
{
        return vmi->mas.index;
}

/* One past the iterator's mas.last, i.e. the end as an exclusive address. */
static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
{
        return vmi->mas.last + 1;
}
/*
 * Forward @count to mas_expected_entries() for the iterator's maple state
 * and return its result unchanged.
 */
static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
                                      unsigned long count)
{
        return mas_expected_entries(&vmi->mas, count);
}

/*
 * Store NULL over [@start, @end) in the tree behind @vmi, allocating with
 * @gfp.
 *
 * Return: 0 on success, -ENOMEM if the maple state reports an error.
 */
static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
                        unsigned long start, unsigned long end, gfp_t gfp)
{
        /* The maple state takes an inclusive last index, hence end - 1. */
        __mas_set_range(&vmi->mas, start, end - 1);
        mas_store_gfp(&vmi->mas, NULL, gfp);

        return unlikely(mas_is_err(&vmi->mas)) ? -ENOMEM : 0;
}

/* Free any unused preallocations by calling mas_destroy() on the maple state. */
static inline void vma_iter_free(struct vma_iterator *vmi)
{
        mas_destroy(&vmi->mas);
}

/*
 * Point the iterator at [vma->vm_start, vma->vm_end - 1] and store @vma
 * there with mas_store().
 *
 * Return: 0 on success, -ENOMEM if the maple state reports an error.
 */
static inline int vma_iter_bulk_store(struct vma_iterator *vmi,
                                      struct vm_area_struct *vma)
{
        vmi->mas.index = vma->vm_start;
        vmi->mas.last = vma->vm_end - 1;
        mas_store(&vmi->mas, vma);

        return unlikely(mas_is_err(&vmi->mas)) ? -ENOMEM : 0;
}

/* Invalidate the iterator's position by calling mas_pause() on its maple state. */
static inline void vma_iter_invalidate(struct vma_iterator *vmi)
{
        mas_pause(&vmi->mas);
}

/* Reposition the iterator at @addr via mas_set(). */
static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
{
        mas_set(&vmi->mas, addr);
}

/* Loop header: assigns vma_next(&__vmi) to __vma until it returns NULL. */
#define for_each_vma(__vmi, __vma)                                        \
        while (((__vma) = vma_next(&(__vmi))) != NULL)

/*
 * The MM code likes to work with exclusive end addresses.
 * Loop header: assigns vma_find(&__vmi, __end) to __vma until it returns NULL.
 */
#define for_each_vma_range(__vmi, __vma, __end)                                \
        while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)

#ifdef CONFIG_SHMEM
/*
 * The vma_is_shmem is not inline because it is used only by slow
 * paths in userfault.
 */
bool vma_is_shmem(struct vm_area_struct *vma);
bool vma_is_anon_shmem(struct vm_area_struct *vma);
#else
/* !CONFIG_SHMEM stubs: no VMA is ever reported as shmem or anon shmem. */
static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; }
#endif

int vma_is_stack_for_current(struct vm_area_struct *vma);

/*
 * flush_tlb_range() takes a vma, not a mm, and can care about flags.
 * Expands to a designated initializer that sets only .vm_mm and .vm_flags.
 */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }

struct mmu_gather;
struct inode;

/*
 * compound_order() may be called without holding a reference, so niceties
 * like page_folio() don't work here.  Callers should be prepared to handle
 * wild return values: PG_head may be set before the order is initialised,
 * or this may be a tail page.  See compaction.c for some good examples.
 *
 * Returns 0 unless PG_head is set, else the low byte of folio->_flags_1.
 */
static inline unsigned int compound_order(struct page *page)
{
        struct folio *folio = (struct folio *)page;

        return test_bit(PG_head, &folio->flags) ? folio->_flags_1 & 0xff : 0;
}

/**
 * folio_order - The allocation order of a folio.
 * @folio: The folio.
 *
 * A folio is composed of 2^order pages.  See get_order() for the definition
 * of order.
 *
 * Return: 0 for a folio that is not large, otherwise the order kept in the
 * low byte of folio->_flags_1.
 */
static inline unsigned int folio_order(struct folio *folio)
{
        return folio_test_large(folio) ? folio->_flags_1 & 0xff : 0;
}

#include <linux/huge_mm.h>

/*
 * Methods to modify the page usage count.
 *
 * What counts for a page usage:
 * - cache mapping   (page->mapping)
 * - private data    (page->private)
 * - page mapped in a task's page tables, each mapping
 *   is counted separately
 *
 * Also, many kernel routines increase the page count before a critical
 * routine so they can be sure the page doesn't go away from under them.
 */

/*
 * Drop a ref, return true if the refcount fell to zero (the page has no users)
 *
 * VM_BUG_ON_PAGE() flags a page whose refcount is already zero on entry.
 */
static inline int put_page_testzero(struct page *page)
{
        VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
        return page_ref_dec_and_test(page);
}

/* Folio variant: put_page_testzero() applied to the folio's embedded page. */
static inline int folio_put_testzero(struct folio *folio)
{
        return put_page_testzero(&folio->page);
}

/*
 * Try to grab a ref unless the page has a refcount of zero, return false if
 * that is the case.
 * This can be called when MMU is off so it must not access
 * any of the virtual mappings.
 *
 * Implemented as page_ref_add_unless(page, 1, 0).
 */
static inline bool get_page_unless_zero(struct page *page)
{
        return page_ref_add_unless(page, 1, 0);
}

/*
 * Try to take a reference on @page with get_page_unless_zero().
 *
 * Return: @page cast to a folio pointer when the reference was obtained,
 * NULL otherwise.
 */
static inline struct folio *folio_get_nontail_page(struct page *page)
{
        if (likely(get_page_unless_zero(page)))
                return (struct folio *)page;

        return NULL;
}

extern int page_is_ram(unsigned long pfn);

/*
 * Implicitly numbered: REGION_INTERSECTS = 0, REGION_DISJOINT = 1,
 * REGION_MIXED = 2.
 * NOTE(review): presumably the result codes of region_intersects(), which is
 * declared right after this enum; confirm against its implementation.
 */
enum {
        REGION_INTERSECTS,
        REGION_DISJOINT,
        REGION_MIXED,
};

int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
                      unsigned long desc);

/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
unsigned long vmalloc_to_pfn(const void *addr);

/*
 * Determine if an address is within the vmalloc range
 *
 * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
 * is no special casing required.
 */
#ifdef CONFIG_MMU
extern bool is_vmalloc_addr(const void *x);
extern int is_vmalloc_or_module_addr(const void *x);
#else
/* !CONFIG_MMU stub: no address is ever reported as a vmalloc address. */
static inline bool is_vmalloc_addr(const void *x)
{
        return false;
}
/* !CONFIG_MMU stub: always returns 0. */
static inline int is_vmalloc_or_module_addr(const void *x)
{
        return 0;
}
#endif

/*
 * How many times the entire folio is mapped as a single unit (eg by a
 * PMD or PUD entry).  This is probably not what you want, except for
 * debugging purposes - it does not include PTE-mapped sub-pages; look
 * at folio_mapcount() or page_mapcount() instead.
 *
 * Returns the stored _entire_mapcount plus one. VM_BUG_ON_FOLIO() flags a
 * folio that is not large.
 */
static inline int folio_entire_mapcount(const struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
        return atomic_read(&folio->_entire_mapcount) + 1;
}

/*
 * The atomic page->_mapcount, starts from -1: so that transitions
 * both from it and to it can be tracked, using atomic_inc_and_test
 * and atomic_add_negative(-1).
 *
 * Resets page->_mapcount to that initial value of -1.
 */
static inline void page_mapcount_reset(struct page *page)
{
        atomic_set(&(page)->_mapcount, -1);
}

/**
 * page_mapcount() - Number of times this precise page is mapped.
 * @page: The page.
 *
 * The number of times this page is mapped.  If this page is part of
 * a large folio, it includes the number of times this page is mapped
 * as part of that folio.
 *
 * Will report 0 for pages which cannot be mapped into userspace, eg
 * slab, page tables and similar.
 */
static inline int page_mapcount(struct page *page)
{
        int nr = atomic_read(&page->_mapcount) + 1;

        /* page_has_type() pages: values below the reserve count as 0 */
        if (nr < PAGE_MAPCOUNT_RESERVE + 1)
                nr = 0;

        if (likely(!PageCompound(page)))
                return nr;

        /* Compound page: add the mappings of the folio as a whole. */
        return nr + folio_entire_mapcount(page_folio(page));
}

/*
 * Returns the stored _large_mapcount plus one. VM_WARN_ON_FOLIO() flags a
 * folio that is not large.
 */
static inline int folio_large_mapcount(const struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_large(folio), folio);
        return atomic_read(&folio->_large_mapcount) + 1;
}

/**
 * folio_mapcount() - Number of mappings of this folio.
 * @folio: The folio.
 *
 * The folio mapcount corresponds to the number of present user page table
 * entries that reference any part of a folio. Each such present user page
 * table entry must be paired with exactly one folio reference.
 *
 * For ordinary folios, each user page table entry (PTE/PMD/PUD/...) counts
 * exactly once.
 *
 * For hugetlb folios, each abstracted "hugetlb" user page table entry that
 * references the entire folio counts exactly once, even when such special
 * page table entries are comprised of multiple ordinary page table entries.
 *
 * Will report 0 for pages which cannot be mapped into userspace, such as
 * slab, page tables and similar.
 *
 * Return: The number of times this folio is mapped.
 */
static inline int folio_mapcount(const struct folio *folio)
{
        int mapcount;

        if (unlikely(folio_test_large(folio)))
                return folio_large_mapcount(folio);

        mapcount = atomic_read(&folio->_mapcount) + 1;
        /* page_has_type() pages: values below the reserve count as 0 */
        if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
                return 0;

        return mapcount;
}

/**
 * folio_mapped - Is this folio mapped into userspace?
 * @folio: The folio.
 *
 * Return: True if any page in this folio is referenced by user page tables,
 * i.e. if folio_mapcount() is positive.
 */
static inline bool folio_mapped(const struct folio *folio)
{
        return folio_mapcount(folio) > 0;
}

/*
 * Return true if this page is mapped into pagetables.
 * For compound page it returns true if any sub-page of compound page is mapped,
 * even if this particular sub-page is not itself mapped by any PTE or PMD.
 *
 * Implemented as folio_mapped() on page_folio(page).
 */
static inline bool page_mapped(const struct page *page)
{
        return folio_mapped(page_folio(page));
}

/* Translate @x with virt_to_page() and return compound_head() of the result. */
static inline struct page *virt_to_head_page(const void *x)
{
        return compound_head(virt_to_page(x));
}

/* Translate @x with virt_to_page() and return page_folio() of the result. */
static inline struct folio *virt_to_folio(const void *x)
{
        return page_folio(virt_to_page(x));
}

void __folio_put(struct folio *folio);

void put_pages_list(struct list_head *pages);

void split_page(struct page *page, unsigned int order);
void folio_copy(struct folio *dst, struct folio *src);

unsigned long nr_free_buffer_pages(void);

/*
 * Returns the number of bytes in this potentially compound page:
 * PAGE_SIZE shifted left by compound_order(page).
 */
static inline unsigned long page_size(struct page *page)
{
        unsigned int order = compound_order(page);

        return PAGE_SIZE << order;
}

/*
 * Returns the number of bits needed for the number of bytes in a page:
 * PAGE_SHIFT plus compound_order(page).
 */
static inline unsigned int page_shift(struct page *page)
{
        unsigned int order = compound_order(page);

        return PAGE_SHIFT + order;
}

/**
 * thp_order - Order of a transparent huge page.
 * @page: Head page of a transparent huge page.
 *
 * VM_BUG_ON_PGFLAGS() flags a tail page being passed in.
 *
 * Return: compound_order() of @page.
 */
static inline unsigned int thp_order(struct page *page)
{
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        return compound_order(page);
}

/**
 * thp_size - Size of a transparent huge page.
 * @page: Head page of a transparent huge page.
 *
 * Return: Number of bytes in this page, i.e. PAGE_SIZE << thp_order(@page).
 */
static inline unsigned long thp_size(struct page *page)
{
        unsigned int order = thp_order(page);

        return PAGE_SIZE << order;
}

#ifdef CONFIG_MMU
/*
 * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
 * servicing faults for write access.  In the normal case, we do always want
 * pte_mkwrite.  But get_user_pages can cause write faults for mappings
 * that do not have writing enabled, when used by access_process_vm.
 *
 * Returns @pte unchanged when VM_WRITE is not set on @vma.
 */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        if (unlikely(!(vma->vm_flags & VM_WRITE)))
                return pte;

        return pte_mkwrite(pte, vma);
}

vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
                struct page *page, unsigned int nr, unsigned long addr);

vm_fault_t finish_fault(struct vm_fault *vmf);
#endif

/*
 * Multiple processes may "see" the same page. E.g. for untouched
 * mappings of /dev/null, all processes see the same page full of
 * zeroes, and text pages of executables and shared libraries have
 * only one copy in memory, at most, normally.
 *
 * For the non-reserved pages, page_count(page) denotes a reference count.
 *   page_count() == 0 means the page is free. page->lru is then used for
 *   freelist management in the buddy allocator.
 *   page_count() > 0  means the page has been allocated.
 *
 * Pages are allocated by the slab allocator in order to provide memory
 * to kmalloc and kmem_cache_alloc. In this case, the management of the
 * page, and the fields in 'struct page' are the responsibility of mm/slab.c
 * unless a particular usage is carefully commented. (the responsibility of
 * freeing the kmalloc memory is the caller's, of course).
 *
 * A page may be used by anyone else who does a __get_free_page().
 * In this case, page_count still tracks the references, and should only
 * be used through the normal accessor functions. The top bits of page->flags
 * and page->virtual store page management information, but all other fields
 * are unused and could be used privately, carefully. The management of this
 * page is the responsibility of the one who allocated it, and those who have
 * subsequently been given references to it.
 *
 * The other pages (we may call them "pagecache pages") are completely
 * managed by the Linux memory manager: I/O, buffers, swapping etc.
 * The following discussion applies only to them.
 *
 * A pagecache page contains an opaque `private' member, which belongs to the
 * page's address_space. Usually, this is the address of a circular list of
 * the page's disk buffers. PG_private must be set to tell the VM to call
 * into the filesystem to release these pages.
 *
 * A page may belong to an inode's memory mapping. In this case, page->mapping
 * is the pointer to the inode, and page->index is the file offset of the page,
 * in units of PAGE_SIZE.
 *
 * If pagecache pages are not associated with an inode, they are said to be
 * anonymous pages. These may become associated with the swapcache, and in that
 * case PG_swapcache is set, and page->private is an offset into the swapcache.
 *
 * In either case (swapcache or inode backed), the pagecache itself holds one
 * reference to the page. Setting PG_private should also increment the
 * refcount. Each user mapping also has a reference to the page.
 *
 * The pagecache pages are stored in a per-mapping radix tree, which is
 * rooted at mapping->i_pages, and indexed by offset.
 * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
 * lists, we instead now tag pages as dirty/writeback in the radix tree.
 *
 * All pagecache pages may be subject to I/O:
 * - inode pages may need to be read from disk,
 * - inode pages which have been modified and are MAP_SHARED may need
 *   to be written back to the inode on disk,
 * - anonymous pages (including MAP_PRIVATE file mappings) which have been
 *   modified may need to be swapped out to swap space and (later) to be read
 *   back into memory.
 */

#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);

bool __put_devmap_managed_folio_refs(struct folio *folio, int refs);
/*
 * Hand @refs reference drops to __put_devmap_managed_folio_refs(), but only
 * when the devmap_managed_key static branch is enabled and @folio is a
 * zone-device folio.
 *
 * Return: false when either precondition fails (nothing was done), otherwise
 * whatever __put_devmap_managed_folio_refs() returns.
 */
static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
{
        if (static_branch_unlikely(&devmap_managed_key) &&
            folio_is_zone_device(folio))
                return __put_devmap_managed_folio_refs(folio, refs);

        return false;
}
#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
/*
 * Stub used unless both CONFIG_ZONE_DEVICE and CONFIG_FS_DAX are defined:
 * never handles the put, always returns false.
 */
static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
{
        return false;
}
#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */

/*
 * 127: arbitrary random number, small enough to assemble well.
 *
 * The unsigned add-and-compare is true when the refcount is 0 or, viewed as
 * a signed value, lies in -127..-1 (within 127 of wrapping around to zero).
 */
#define folio_ref_zero_or_close_to_overflow(folio) \
        ((unsigned int) folio_ref_count(folio) + 127u <= 127u)

/**
 * folio_get - Increment the reference count on a folio.
 * @folio: The folio.
 *
 * Context: May be called in any context, as long as you know that
 * you have a refcount on the folio.  If you do not already have one,
 * folio_try_get() may be the right interface for you to use.
 */
static inline void folio_get(struct folio *folio)
{
        /* Sanity check: the count must be neither zero nor close to overflow. */
        VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);
        folio_ref_inc(folio);
}

/* Take a reference via folio_get() on page_folio(page). */
static inline void get_page(struct page *page)
{
        folio_get(page_folio(page));
}

/*
 * Take a reference on the compound head of @page, unless its refcount is
 * zero or negative, in which case a one-time warning is issued.
 *
 * Return: true if the reference was taken, false otherwise.
 */
static inline __must_check bool try_get_page(struct page *page)
{
        struct page *head = compound_head(page);

        if (WARN_ON_ONCE(page_ref_count(head) <= 0))
                return false;

        page_ref_inc(head);
        return true;
}

/**
 * folio_put - Decrement the reference count on a folio.
 * @folio: The folio.
 *
 * If the folio's reference count reaches zero, the memory will be
 * released back to the page allocator and may be used by another
 * allocation immediately.  Do not access the memory or the struct folio
 * after calling folio_put() unless you can be sure that it wasn't the
 * last reference.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folio_put(struct folio *folio)
{
        if (!folio_put_testzero(folio))
                return;

        /* That was the last reference. */
        __folio_put(folio);
}

/**
 * folio_put_refs - Reduce the reference count on a folio.
 * @folio: The folio.
 * @refs: The amount to subtract from the folio's reference count.
 *
 * If the folio's reference count reaches zero, the memory will be
 * released back to the page allocator and may be used by another
 * allocation immediately.  Do not access the memory or the struct folio
 * after calling folio_put_refs() unless you can be sure that these weren't
 * the last references.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folio_put_refs(struct folio *folio, int refs)
{
        if (!folio_ref_sub_and_test(folio, refs))
                return;

        /* Those were the last references. */
        __folio_put(folio);
}

void folios_put_refs(struct folio_batch *folios, unsigned int *refs);

/*
 * union release_pages_arg - an array of pages or folios
 *
 * release_pages() releases a simple array of multiple pages, and
 * accepts various different forms of said page array: either
 * a regular old boring array of pages, an array of folios, or
 * an array of encoded page pointers.
 *
 * The transparent union syntax for this kind of "any of these
 * argument types" is all kinds of ugly, so look away.
 */
typedef union {
        struct page **pages;                    /* array of page pointers */
        struct folio **folios;                  /* array of folio pointers */
        struct encoded_page **encoded_pages;    /* array of encoded page pointers */
} release_pages_arg __attribute__ ((__transparent_union__));

void release_pages(release_pages_arg, int nr);

/**
 * folios_put - Decrement the reference count on an array of folios.
 * @folios: The folios.
 *
 * Like folio_put(), but for a batch of folios.  This is more efficient
 * than writing the loop yourself as it will optimise the locks which need
 * to be taken if the folios are freed.  The folios batch is returned
 * empty and ready to be reused for another batch; there is no need to
 * reinitialise it.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folios_put(struct folio_batch *folios)
{
        /* Same as folios_put_refs() with no per-folio refs array. */
        folios_put_refs(folios, NULL);
}

/*
 * Drop one reference on the folio of @page. The devmap-managed path gets
 * the first shot; the regular folio_put() runs only if that path reports
 * it did not handle the put.
 */
static inline void put_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        /*
         * For some devmap managed pages we need to catch refcount transition
         * from 2 to 1:
         */
        if (!put_devmap_managed_folio_refs(folio, 1))
                folio_put(folio);
}

/*
 * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
 * the page's refcount so that two separate items are tracked: the original page
 * reference count, and also a new count of how many pin_user_pages() calls were
 * made against the page. ("gup-pinned" is another term for the latter).
 *
 * With this scheme, pin_user_pages() becomes special: such pages are marked as
 * distinct from normal pages. As such, the unpin_user_page() call (and its
 * variants) must be used in order to release gup-pinned pages.
 *
 * Choice of value:
 *
 * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
 * counts with respect to pin_user_pages() and unpin_user_page() becomes
 * simpler, due to the fact that adding an even power of two to the page
 * refcount has the effect of using only the upper N bits, for the code that
 * counts up using the bias value. This means that the lower bits are left for
 * the exclusive use of the original code that increments and decrements by one
 * (or at least, by much smaller values than the bias value).
 *
 * Of course, once the lower bits overflow into the upper bits (and this is
 * OK, because subtraction recovers the original values), then visual inspection
 * no longer suffices to directly view the separate counts. However, for normal
 * applications that don't have huge page reference counts, this won't be an
 * issue.
 *
 * Locking: the lockless algorithm described in folio_try_get_rcu()
 * provides safe operation for get_user_pages(), page_mkclean() and
 * other calls that race to set up page table entries.
 */
/* 1U << 10 == 1024, a power of two as the choice-of-value note calls for. */
#define GUP_PIN_COUNTING_BIAS (1U << 10)

void unpin_user_page(struct page *page);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                                 bool make_dirty);
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
                                      bool make_dirty);
void unpin_user_pages(struct page **pages, unsigned long npages);

/*
 * True when, of the two bits VM_SHARED and VM_MAYWRITE, exactly VM_MAYWRITE
 * is set in @flags.
 */
static inline bool is_cow_mapping(vm_flags_t flags)
{
        const vm_flags_t mask = VM_SHARED | VM_MAYWRITE;

        return (flags & mask) == VM_MAYWRITE;
}

#ifndef CONFIG_MMU
/* True when VM_MAYSHARE or VM_MAYOVERLAY is set in @flags. */
static inline bool is_nommu_shared_mapping(vm_flags_t flags)
{
        /*
         * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected
         * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of
         * a file mapping. R/O MAP_PRIVATE mappings might still modify
         * underlying memory if ptrace is active, so this is only possible if
         * ptrace does not apply. Note that there is no mprotect() to upgrade
         * write permissions later.
         */
        return (flags & (VM_MAYSHARE | VM_MAYOVERLAY)) != 0;
}
#endif

/* Defined only for CONFIG_SPARSEMEM without CONFIG_SPARSEMEM_VMEMMAP. */
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif

/*
 * The identification function is mainly used by the buddy allocator for
 * determining if two pages could be buddies. We are not really identifying
 * the zone since we could be using the section number id if we do not have
 * node id available in page flags.
 * We only guarantee that it will return the same value for two combinable
 * pages in a zone.
 *
 * The id is the ZONEID_MASK-wide field at bit ZONEID_PGSHIFT of page->flags.
 */
static inline int page_zone_id(struct page *page)
{
        return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
}

#ifdef NODE_NOT_IN_PAGE_FLAGS
int page_to_nid(const struct page *page);
#else
/*
 * Node id kept in page->flags: the NODES_MASK-wide field at bit
 * NODES_PGSHIFT. The page is passed through PF_POISONED_CHECK() first.
 */
static inline int page_to_nid(const struct page *page)
{
        return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif

/* page_to_nid() applied to the folio's embedded page. */
static inline int folio_nid(const struct folio *folio)
{
        return page_to_nid(&folio->page);
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * page access time bits needs to hold at least 4 seconds
 *
 * PAGE_ACCESS_TIME_BUCKETS is the number of bits by which LAST_CPUPID_SHIFT
 * falls short of PAGE_ACCESS_TIME_MIN_BITS (0 if it does not), and
 * PAGE_ACCESS_TIME_MASK is LAST_CPUPID_MASK shifted left by that amount.
 */
#define PAGE_ACCESS_TIME_MIN_BITS        12
#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS
#define PAGE_ACCESS_TIME_BUCKETS                                \
        (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT)
#else
#define PAGE_ACCESS_TIME_BUCKETS        0
#endif

#define PAGE_ACCESS_TIME_MASK                                \
        (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS)

/*
 * Pack @cpu and @pid into one cpupid value: the masked cpu goes above
 * LAST__PID_SHIFT, the masked pid occupies the low bits.
 */
static inline int cpu_pid_to_cpupid(int cpu, int pid)
{
        int cpu_bits = (cpu & LAST__CPU_MASK) << LAST__PID_SHIFT;
        int pid_bits = pid & LAST__PID_MASK;

        return cpu_bits | pid_bits;
}

/* Extract the pid field: the bits of @cpupid selected by LAST__PID_MASK. */
static inline int cpupid_to_pid(int cpupid)
{
        return cpupid & LAST__PID_MASK;
}

/*
 * Extract the cpu field: @cpupid shifted right by LAST__PID_SHIFT and masked
 * with LAST__CPU_MASK.
 */
static inline int cpupid_to_cpu(int cpupid)
{
        return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK;
}

/* Node of the cpu encoded in @cpupid, looked up with cpu_to_node(). */
static inline int cpupid_to_nid(int cpupid)
{
        int cpu = cpupid_to_cpu(cpupid);

        return cpu_to_node(cpu);
}

/*
 * True when the pid field of @cpupid equals (-1 & LAST__PID_MASK), i.e. every
 * bit selected by the mask is set.
 */
static inline bool cpupid_pid_unset(int cpupid)
{
        return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK);
}

/*
 * True when the cpu field of @cpupid equals (-1 & LAST__CPU_MASK), i.e. every
 * bit selected by the mask is set.
 */
static inline bool cpupid_cpu_unset(int cpupid)
{
        return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK);
}

/*
 * Compare @task_pid, truncated with LAST__PID_MASK, against the pid field of
 * @cpupid. Only the masked bits take part, so distinct pids can compare equal.
 */
static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
{
        return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
}

/*
 * Does the pid field of @cpupid match @task's pid (see __cpupid_match_pid())?
 * Both arguments are parenthesized so that arbitrary expressions expand
 * correctly.
 */
#define cpupid_match_pid(task, cpupid) __cpupid_match_pid((task)->pid, (cpupid))
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
        /* Store only the bits covered by LAST_CPUPID_MASK; hand back the old value. */
        int masked = cpupid & LAST_CPUPID_MASK;

        return xchg(&folio->_last_cpupid, masked);
}

static inline int folio_last_cpupid(struct folio *folio)
{
        /* In this configuration the value is kept in its own folio field. */
        int last = folio->_last_cpupid;

        return last;
}
static inline void page_cpupid_reset_last(struct page *page)
{
        /* "Unset" is encoded as all mask bits set. */
        const int unset = -1 & LAST_CPUPID_MASK;

        page->_last_cpupid = unset;
}
#else
static inline int folio_last_cpupid(struct folio *folio)
{
        /* In this configuration the value is a bit field inside folio->flags. */
        unsigned long shifted = folio->flags >> LAST_CPUPID_PGSHIFT;

        return shifted & LAST_CPUPID_MASK;
}

int folio_xchg_last_cpupid(struct folio *folio, int cpupid);

static inline void page_cpupid_reset_last(struct page *page)
{
        /* Set every bit of the last-cpupid field inside page->flags. */
        page->flags = page->flags | (LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */

static inline int folio_xchg_access_time(struct folio *folio, int time)
{
        /*
         * The access time shares storage with the last cpupid: drop the low
         * PAGE_ACCESS_TIME_BUCKETS bits on the way in and restore the scale
         * of the previous value on the way out.
         */
        int bucketed = time >> PAGE_ACCESS_TIME_BUCKETS;
        int previous = folio_xchg_last_cpupid(folio, bucketed);

        return previous << PAGE_ACCESS_TIME_BUCKETS;
}

static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
        unsigned int bit;

        /* Nothing to record when the VMA carries no numab_state. */
        if (!vma->numab_state)
                return;

        /* Hash the current task's pid down to a bit index within one long. */
        bit = hash_32(current->pid, ilog2(BITS_PER_LONG));

        /* Test first so that an already-set bit is not written again. */
        if (test_bit(bit, &vma->numab_state->pids_active[1]))
                return;

        __set_bit(bit, &vma->numab_state->pids_active[1]);
}
#else /* !CONFIG_NUMA_BALANCING */
static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
        /* NUMA balancing disabled: nothing is stored; report the folio's node. */
        int nid = folio_nid(folio); /* XXX */

        return nid;
}

static inline int folio_xchg_access_time(struct folio *folio, int time)
{
        /* NUMA balancing disabled: no access time is tracked. */
        const int no_previous_time = 0;

        return no_previous_time;
}

static inline int folio_last_cpupid(struct folio *folio)
{
        /* NUMA balancing disabled: the folio's node id stands in for a cpupid. */
        int nid = folio_nid(folio); /* XXX */

        return nid;
}

static inline int cpupid_to_nid(int cpupid)
{
        /* NUMA balancing disabled: no node can be derived from a cpupid. */
        const int no_node = -1;

        return no_node;
}

static inline int cpupid_to_pid(int cpupid)
{
        /* NUMA balancing disabled: a cpupid carries no PID. */
        const int no_pid = -1;

        return no_pid;
}

static inline int cpupid_to_cpu(int cpupid)
{
        /* NUMA balancing disabled: a cpupid carries no CPU. */
        const int no_cpu = -1;

        return no_cpu;
}

static inline int cpu_pid_to_cpupid(int nid, int pid)
{
        /* NUMA balancing disabled: every combination encodes to -1. */
        const int no_cpupid = -1;

        return no_cpupid;
}

static inline bool cpupid_pid_unset(int cpupid)
{
        /* NUMA balancing disabled: the PID is always reported as unset. */
        const bool always_unset = true;

        return always_unset;
}

/* NUMA balancing disabled: there is no last-cpupid state to reset. */
static inline void page_cpupid_reset_last(struct page *page)
{
}

/* NUMA balancing disabled: a cpupid never matches any task. */
static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
{
        return false;
}

/* NUMA balancing disabled: accessing PIDs are not recorded on the VMA. */
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
}
#endif /* CONFIG_NUMA_BALANCING */

#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)

/*
 * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid
 * setting tags for all pages to native kernel tag value 0xff, as the default
 * value 0x00 maps to 0xff.
 */

static inline u8 page_kasan_tag(const struct page *page)
{
        u8 stored;

        /* With KASAN switched off every page reports the kernel tag. */
        if (!kasan_enabled())
                return KASAN_TAG_KERNEL;

        /* The field holds the tag xor'ed with 0xff; undo that on read. */
        stored = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;

        return stored ^ 0xff;
}

static inline void page_kasan_tag_set(struct page *page, u8 tag)
{
        unsigned long expected, desired;

        if (!kasan_enabled())
                return;

        /* Tags are stored xor'ed with 0xff. */
        tag ^= 0xff;

        /*
         * Replace only the tag field of page->flags.  The update is retried
         * until the compare-and-exchange succeeds; the loop relies on
         * try_cmpxchg() leaving the current flags value in @expected when
         * it fails.
         */
        expected = READ_ONCE(page->flags);
        for (;;) {
                desired = expected & ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
                desired |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;

                if (likely(try_cmpxchg(&page->flags, &expected, desired)))
                        break;
        }
}

static inline void page_kasan_tag_reset(struct page *page)
{
        /* Nothing to reset while KASAN is disabled. */
        if (!kasan_enabled())
                return;

        page_kasan_tag_set(page, KASAN_TAG_KERNEL);
}

#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */

static inline u8 page_kasan_tag(const struct page *page)
{
        /* No tag-based KASAN mode configured: always report 0xff. */
        const u8 fixed_tag = 0xff;

        return fixed_tag;
}

/* No tag-based KASAN mode configured: setting and resetting tags are no-ops. */
static inline void page_kasan_tag_set(struct page *page, u8 tag) { }
static inline void page_kasan_tag_reset(struct page *page) { }

#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */

static inline struct zone *page_zone(const struct page *page)
{
        return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}

static inline pg_data_t *page_pgdat(const struct page *page)
{
        /* The node data is selected by the node id recorded for the page. */
        int nid = page_to_nid(page);

        return NODE_DATA(nid);
}

static inline struct zone *folio_zone(const struct folio *folio)
{
        /* Delegate to the page-based helper using the folio's embedded page. */
        const struct page *page = &folio->page;

        return page_zone(page);
}

static inline pg_data_t *folio_pgdat(const struct folio *folio)
{
        /* Delegate to the page-based helper using the folio's embedded page. */
        const struct page *page = &folio->page;

        return page_pgdat(page);
}

#ifdef SECTION_IN_PAGE_FLAGS
static inline void set_page_section(struct page *page, unsigned long section)
{
        unsigned long flags = page->flags;

        /* Clear the old section field, then insert the new (masked) value. */
        flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
        flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;

        page->flags = flags;
}

static inline unsigned long page_to_section(const struct page *page)
{
        /* Shift the section field down to bit 0 and mask it out. */
        unsigned long shifted = page->flags >> SECTIONS_PGSHIFT;

        return shifted & SECTIONS_MASK;
}
#endif

/**
 * folio_pfn - Return the Page Frame Number of a folio.
 * @folio: The folio.
 *
 * A folio may contain multiple pages.  The pages have consecutive
 * Page Frame Numbers.
 *
 * Return: The Page Frame Number of the first page in the folio.
 */
static inline unsigned long folio_pfn(struct folio *folio)
{
        /* The folio's PFN is that of the page embedded at its start. */
        struct page *first = &folio->page;

        return page_to_pfn(first);
}

static inline struct folio *pfn_folio(unsigned long pfn)
{
        /* PFN -> page -> folio containing that page. */
        struct page *page = pfn_to_page(pfn);

        return page_folio(page);
}

/**
 * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
 * @folio: The folio.
 *
 * This function checks if a folio has been pinned via a call to
 * a function in the pin_user_pages() family.
 *
 * For small folios, the return value is partially fuzzy: false is not fuzzy,
 * because it means "definitely not pinned for DMA", but true means "probably
 * pinned for DMA, but possibly a false positive due to having at least
 * GUP_PIN_COUNTING_BIAS worth of normal folio references".
 *
 * False positives are OK, because: a) it's unlikely for a folio to
 * get that many refcounts, and b) all the callers of this routine are
 * expected to be able to deal gracefully with a false positive.
 *
 * For large folios, the result will be exactly correct. That's because
 * we have more tracking data available: the _pincount field is used
 * instead of the GUP_PIN_COUNTING_BIAS scheme.
 *
 * For more information, please see Documentation/core-api/pin_user_pages.rst.
 *
 * Return: True, if it is likely that the page has been "dma-pinned".
 * False, if the page is definitely not dma-pinned.
 */
static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
        unsigned int refs;

        /* Large folios keep a dedicated pin counter. */
        if (folio_test_large(folio))
                return atomic_read(&folio->_pincount) > 0;

        /*
         * folio_ref_count() is signed and turns negative once the refcount
         * overflows.  Comparing it as an unsigned value lets the sign bit
         * count as magnitude, so an overflowed refcount still compares
         * above the bias.
         */
        refs = (unsigned int)folio_ref_count(folio);

        return refs >= GUP_PIN_COUNTING_BIAS;
}

static inline bool page_maybe_dma_pinned(struct page *page)
{
        /* Pin state is tracked per folio, so answer for the containing folio. */
        struct folio *folio = page_folio(page);

        return folio_maybe_dma_pinned(folio);
}

/*
 * This should most likely only be called during fork() to see whether we
 * should break the cow immediately for an anon page on the src mm.
 *
 * The caller has to hold the PT lock and the vma->vm_mm->write_protect_seq.
 */
static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
                                          struct folio *folio)
{
        VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));

        if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
                return false;

        return folio_maybe_dma_pinned(folio);
}

/**
 * is_zero_page - Query if a page is a zero page
 * @page: The page to query
 *
 * This returns true if @page is one of the permanent zero pages.
 */
static inline bool is_zero_page(const struct page *page)
{
        /* The check is made on the page frame number. */
        unsigned long pfn = page_to_pfn(page);

        return is_zero_pfn(pfn);
}

/**
 * is_zero_folio - Query if a folio is a zero page
 * @folio: The folio to query
 *
 * This returns true if @folio is one of the permanent zero pages.
 */
static inline bool is_zero_folio(const struct folio *folio)
{
        /* Delegate to the page-based check using the folio's embedded page. */
        const struct page *page = &folio->page;

        return is_zero_page(page);
}

/* MIGRATE_CMA and ZONE_MOVABLE do not allow folios to be long-term pinned */
#ifdef CONFIG_MIGRATION
static inline bool folio_is_longterm_pinnable(struct folio *folio)
{
#ifdef CONFIG_CMA
        /* Folios with a CMA or isolated migratetype are rejected outright. */
        switch (folio_migratetype(folio)) {
        case MIGRATE_CMA:
        case MIGRATE_ISOLATE:
                return false;
        default:
                break;
        }
#endif
        /* The zero page can be "pinned" but gets special handling. */
        if (is_zero_folio(folio))
                return true;

        /*
         * Coherent device memory must always allow eviction, and folios in
         * the movable zone cannot be pinned either; anything else can.
         */
        return !(folio_is_device_coherent(folio) ||
                 folio_is_zone_movable(folio));
}
#else
static inline bool folio_is_longterm_pinnable(struct folio *folio)
{
        /* CONFIG_MIGRATION disabled: every folio is reported as pinnable. */
        const bool pinnable = true;

        return pinnable;
}
#endif

static inline void set_page_zone(struct page *page, enum zone_type zone)
{
        unsigned long flags = page->flags;

        /* Clear the old zone field, then insert the new (masked) value. */
        flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
        flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;

        page->flags = flags;
}

static inline void set_page_node(struct page *page, unsigned long node)
{
        unsigned long flags = page->flags;

        /* Clear the old node field, then insert the new (masked) value. */
        flags &= ~(NODES_MASK << NODES_PGSHIFT);
        flags |= (node & NODES_MASK) << NODES_PGSHIFT;

        page->flags = flags;
}

static inline void set_page_links(struct page *page, enum zone_type zone,
        unsigned long node, unsigned long pfn)
{
#ifdef SECTION_IN_PAGE_FLAGS
        /* The section number is only recorded when it lives in page->flags. */
        unsigned long section_nr = pfn_to_section_nr(pfn);
#endif

        set_page_zone(page, zone);
        set_page_node(page, node);
#ifdef SECTION_IN_PAGE_FLAGS
        set_page_section(page, section_nr);
#endif
}

/**
 * folio_nr_pages - The number of pages in the folio.
 * @folio: The folio.
 *
 * Return: A positive power of two.
 */
static inline long folio_nr_pages(const struct folio *folio)
{
        /* A folio that is not large consists of exactly one page. */
        long nr = 1;

        if (folio_test_large(folio)) {
#ifdef CONFIG_64BIT
                /* 64-bit builds read the page count from a dedicated field. */
                nr = folio->_folio_nr_pages;
#else
                /* Otherwise derive it from the low byte of _flags_1. */
                nr = 1L << (folio->_flags_1 & 0xff);
#endif
        }

        return nr;
}

/*
 * Only hugetlbfs can allocate folios larger than MAX_ORDER.
 *
 * MAX_FOLIO_NR_PAGES is 1 << PUD_ORDER pages when the architecture has
 * gigantic pages, and MAX_ORDER_NR_PAGES otherwise.
 */
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
#define MAX_FOLIO_NR_PAGES        (1UL << PUD_ORDER)
#else
#define MAX_FOLIO_NR_PAGES        MAX_ORDER_NR_PAGES
#endif

/*
 * compound_nr() returns the number of pages in this potentially compound
 * page.  compound_nr() can be called on a tail page, and is defined to
 * return 1 in that case.
 */
static inline unsigned long compound_nr(struct page *page)
{
        struct folio *folio = (struct folio *)page;
        /* Anything without PG_head set counts as a single page. */
        unsigned long nr = 1;

        if (test_bit(PG_head, &folio->flags)) {
#ifdef CONFIG_64BIT
                nr = folio->_folio_nr_pages;
#else
                nr = 1L << (folio->_flags_1 & 0xff);
#endif
        }

        return nr;
}

/**
 * thp_nr_pages - The number of regular pages in this huge page.
 * @page: The head page of a huge page.
 */
static inline int thp_nr_pages(struct page *page)
{
        /* The head page is reinterpreted as its folio. */
        struct folio *folio = (struct folio *)page;

        return folio_nr_pages(folio);
}

/**
 * folio_next - Move to the next physical folio.
 * @folio: The folio we're currently operating on.
 *
 * If you have physically contiguous memory which may span more than
 * one folio (eg a &struct bio_vec), use this function to move from one
 * folio to the next.  Do not use it if the memory is only virtually
 * contiguous as the folios are almost certainly not adjacent to each
 * other.  This is the folio equivalent to writing ``page++``.
 *
 * Context: We assume that the folios are refcounted and/or locked at a
 * higher level and do not adjust the reference counts.
 * Return: The next struct folio.
 */
static inline struct folio *folio_next(struct folio *folio)
{
        /* Step past every page of this folio to reach the following one. */
        long nr = folio_nr_pages(folio);

        return (struct folio *)folio_page(folio, nr);
}

/**
 * folio_shift - The size of the memory described by this folio.
 * @folio: The folio.
 *
 * A folio represents a number of bytes which is a power-of-two in size.
 * This function tells you which power-of-two the folio is.  See also
 * folio_size() and folio_order().
 *
 * Context: The caller should have a reference on the folio to prevent
 * it from being split.  It is not necessary for the folio to be locked.
 * Return: The base-2 logarithm of the size of this folio.
 */
static inline unsigned int folio_shift(struct folio *folio)
{
        /* log2(bytes) = log2(page size) + log2(pages in the folio). */
        unsigned int order = folio_order(folio);

        return PAGE_SHIFT + order;
}

/**
 * folio_size - The number of bytes in a folio.
 * @folio: The folio.
 *
 * Context: The caller should have a reference on the folio to prevent
 * it from being split.  It is not necessary for the folio to be locked.
 * Return: The number of bytes in this folio.
 */
static inline size_t folio_size(struct folio *folio)
{
        /* One page's worth of bytes, scaled by the folio's order. */
        unsigned int order = folio_order(folio);

        return PAGE_SIZE << order;
}

/**
 * folio_likely_mapped_shared - Estimate if the folio is mapped into the page
 *                                tables of more than one MM
 * @folio: The folio.
 *
 * This function checks if the folio is currently mapped into more than one
 * MM ("mapped shared"), or if the folio is only mapped into a single MM
 * ("mapped exclusively").
 *
 * As precise information is not easily available for all folios, this function
 * estimates the number of MMs ("sharers") that are currently mapping a folio
 * using the number of times the first page of the folio is currently mapped
 * into page tables.
 *
 * For small anonymous folios (except KSM folios) and anonymous hugetlb folios,
 * the return value will be exactly correct, because they can only be mapped
 * at most once into an MM, and they cannot be partially mapped.
 *
 * For other folios, the result can be fuzzy:
 *    #. For partially-mappable large folios (THP), the return value can wrongly
 *       indicate "mapped exclusively" (false negative) when the folio is
 *       only partially mapped into at least one MM.
 *    #. For pagecache folios (including hugetlb), the return value can wrongly
 *       indicate "mapped shared" (false positive) when two VMAs in the same MM
 *       cover the same file range.
 *    #. For (small) KSM folios, the return value can wrongly indicate "mapped
 *       shared" (false positive), when the folio is mapped multiple times into
 *       the same MM.
 *
 * Further, this function only considers current page table mappings that
 * are tracked using the folio mapcount(s).
 *
 * This function does not consider:
 *    #. If the folio might get mapped in the (near) future (e.g., swapcache,
 *       pagecache, temporary unmapping for migration).
 *    #. If the folio is mapped differently (VM_PFNMAP).
 *    #. If hugetlb page table sharing applies. Callers might want to check
 *       hugetlb_pmd_shared().
 *
 * Return: Whether the folio is estimated to be mapped into more than one MM.
 */
static inline bool folio_likely_mapped_shared(struct folio *folio)
{
        const int nr_maps = folio_mapcount(folio);

        /* At most one mapping always means "mapped exclusively". */
        if (nr_maps <= 1)
                return false;

        /*
         * Folios that are not partially mappable (small or hugetlb) need no
         * further care: more than one mapping means "mapped shared".
         */
        if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
                return true;

        /* If any page is mapped more than once we treat it "mapped shared". */
        if (folio_entire_mapcount(folio) || nr_maps > folio_nr_pages(folio))
                return true;

        /* Otherwise guess from the mapcount of the first subpage. */
        return atomic_read(&folio->_mapcount) > 0;
}

#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
static inline int arch_make_page_accessible(struct page *page)
{
        /* Default when the architecture supplies no hook: report success. */
        const int success = 0;

        return success;
}
#endif

#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
/*
 * Default implementation: make every page of @folio accessible by calling
 * arch_make_page_accessible() on each page in order.
 *
 * Return: 0 if every call returned 0, otherwise the first non-zero value
 * returned by arch_make_page_accessible(); later pages are not visited.
 */
static inline int arch_make_folio_accessible(struct folio *folio)
{
        /*
         * Start from success so the return value is well defined even if
         * the loop body never runs; otherwise @ret would be read
         * uninitialized and compilers may warn about it.
         */
        int ret = 0;
        long i, nr = folio_nr_pages(folio);

        for (i = 0; i < nr; i++) {
                ret = arch_make_page_accessible(folio_page(folio, i));
                if (ret)
                        break;
        }

        return ret;
}
#endif

/*
 * Some inline functions in vmstat.h depend on page_zone()
 */
#include <linux/vmstat.h>

#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif

#if defined(WANT_PAGE_VIRTUAL)
static inline void *page_address(const struct page *page)
{
        /* WANT_PAGE_VIRTUAL: the address is cached in the page itself. */
        void *vaddr = page->virtual;

        return vaddr;
}
/* WANT_PAGE_VIRTUAL: record @address in the page's ->virtual field. */
static inline void set_page_address(struct page *page, void *address)
{
        page->virtual = address;
}
#define page_address_init()  do { } while(0)
#endif

#if defined(HASHED_PAGE_VIRTUAL)
void *page_address(const struct page *page);
void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif

static __always_inline void *lowmem_page_address(const struct page *page)
{
        /* Plain page -> virtual address translation via page_to_virt(). */
        void *vaddr = page_to_virt(page);

        return vaddr;
}

#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
#define set_page_address(page, address)  do { } while(0)
#define page_address_init()  do { } while(0)
#endif

static inline void *folio_address(const struct folio *folio)
{
        /* The folio's address is the address of its embedded first page. */
        const struct page *first = &folio->page;

        return page_address(first);
}

extern pgoff_t __page_file_index(struct page *page);

/*
 * Return the pagecache index of the passed page.  Regular pagecache pages
 * use ->index whereas swapcache pages use swp_offset(->private)
 */
static inline pgoff_t page_index(struct page *page)
{
        /* Common case: a regular pagecache page stores its index directly. */
        if (likely(!PageSwapCache(page)))
                return page->index;

        /* Swapcache pages need the out-of-line lookup. */
        return __page_file_index(page);
}

/*
 * Return true only if the page has been allocated with
 * ALLOC_NO_WATERMARKS and the low watermark was not
 * met implying that the system is under some pressure.
 */
static inline bool page_is_pfmemalloc(const struct page *page)
{
        /*
         * lru.next has bit 1 set if the page is allocated from the
         * pfmemalloc reserves.  Callers may simply overwrite it if
         * they do not need to preserve that information.
         */
        return (uintptr_t)page->lru.next & BIT(1);
}

/*
 * Return true only if the folio has been allocated with
 * ALLOC_NO_WATERMARKS and the low watermark was not
 * met implying that the system is under some pressure.
 */
static inline bool folio_is_pfmemalloc(const struct folio *folio)
{
        /*
         * lru.next has bit 1 set if the page is allocated from the
         * pfmemalloc reserves.  Callers may simply overwrite it if
         * they do not need to preserve that information.
         */
        return (uintptr_t)folio->lru.next & BIT(1);
}

/*
 * Only to be called by the page allocator on a freshly allocated
 * page.
 */
static inline void set_page_pfmemalloc(struct page *page)
{
        /* The marker is bit 1 of lru.next; the whole pointer is replaced. */
        void *marker = (void *)BIT(1);

        page->lru.next = marker;
}

/* Drop the pfmemalloc marker by resetting lru.next to NULL. */
static inline void clear_page_pfmemalloc(struct page *page)
{
        page->lru.next = NULL;
}

/*
 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
 */
extern void pagefault_out_of_memory(void);

/* Offset of address @p within its page: the bits cleared by PAGE_MASK. */
#define offset_in_page(p)        ((unsigned long)(p) & ~PAGE_MASK)
/* Offset of @p within the huge page @page; masks with thp_size(page) - 1. */
#define offset_in_thp(page, p)        ((unsigned long)(p) & (thp_size(page) - 1))
/* Offset of @p within @folio; masks with folio_size(folio) - 1. */
#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))

/*
 * Parameter block passed down to zap_pte_range in exceptional cases.
 *
 * Callers that need no special handling may pass NULL instead, as
 * zap_vma_pages() does when calling zap_page_range_single().
 */
struct zap_details {
        struct folio *single_folio;        /* Locked folio to be unmapped */
        bool even_cows;                        /* Zap COWed private pages too? */
        zap_flags_t zap_flags;                /* Extra flags for zapping */
};

/*
 * Whether to drop the pte markers, for example, the uffd-wp information for
 * file-backed memory.  This should only be specified when we will completely
 * drop the page in the mm, either by truncation or unmapping of the vma.  By
 * default, the flag is not set.
 */
#define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
/* Set in unmap_vmas() to indicate a final unmap call.  Only used by hugetlb */
#define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))

#ifdef CONFIG_SCHED_MM_CID
void sched_mm_cid_before_execve(struct task_struct *t);
void sched_mm_cid_after_execve(struct task_struct *t);
void sched_mm_cid_fork(struct task_struct *t);
void sched_mm_cid_exit_signals(struct task_struct *t);
static inline int task_mm_cid(struct task_struct *t)
{
        /* CONFIG_SCHED_MM_CID: the id is stored on the task itself. */
        int cid = t->mm_cid;

        return cid;
}
#else
/* CONFIG_SCHED_MM_CID disabled: the mm_cid lifecycle hooks are no-ops. */
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
static inline void sched_mm_cid_fork(struct task_struct *t) { }
static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
static inline int task_mm_cid(struct task_struct *t)
{
        /*
         * With the mm cid feature disabled, fall back to the processor id.
         * Per-cpu data structure accesses in user-space stay functional,
         * although the memory usage benefits are not provided.
         */
        int cpu = raw_smp_processor_id();

        return cpu;
}
#endif

#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
/* Without CONFIG_MMU the answer is always "no". */
static inline bool can_do_mlock(void)
{
        return false;
}
#endif
extern int user_shm_lock(size_t, struct ucounts *);
extern void user_shm_unlock(size_t, struct ucounts *);

struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
                                  unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd);

void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                  unsigned long size);
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
                           unsigned long size, struct zap_details *details);
static inline void zap_vma_pages(struct vm_area_struct *vma)
{
        /* Cover the whole VMA: [vm_start, vm_end), with no zap_details. */
        unsigned long start = vma->vm_start;
        unsigned long size = vma->vm_end - start;

        zap_page_range_single(vma, start, size, NULL);
}
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                struct vm_area_struct *start_vma, unsigned long start,
                unsigned long end, unsigned long tree_end, bool mm_wr_locked);

struct mmu_notifier_range;

void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
int follow_pte(struct vm_area_struct *vma, unsigned long address,
               pte_t **ptepp, spinlock_t **ptlp);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write);

extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
int generic_error_remove_folio(struct address_space *mapping,
                struct folio *folio);

struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                unsigned long address, struct pt_regs *regs);

#ifdef CONFIG_MMU
extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                                  unsigned long address, unsigned int flags,
                                  struct pt_regs *regs);
extern int fixup_user_fault(struct mm_struct *mm,
                            unsigned long address, unsigned int fault_flags,
                            bool *unlocked);
void unmap_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows);
#else
/*
 * !CONFIG_MMU stub: reaching this is a bug.  The return statement follows
 * BUG() and supplies the function's return value.
 */
static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                                         unsigned long address, unsigned int flags,
                                         struct pt_regs *regs)
{
        /* should never happen if there's no MMU */
        BUG();
        return VM_FAULT_SIGBUS;
}
/* !CONFIG_MMU stub: reaching this is a bug; -EFAULT follows BUG(). */
static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
                unsigned int fault_flags, bool *unlocked)
{
        /* should never happen if there's no MMU */
        BUG();
        return -EFAULT;
}
/* !CONFIG_MMU stubs: the unmap helpers below do nothing. */
static inline void unmap_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows) { }
#endif

static inline void unmap_shared_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen)
{
        /* Same as unmap_mapping_range(), with even_cows passed as 0. */
        const int even_cows = 0;

        unmap_mapping_range(mapping, holebegin, holelen, even_cows);
}

static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm,
                                                unsigned long addr);

extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);

long get_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked);
long pin_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked);

/*
 * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
 */
static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
                                                    unsigned long addr,
                                                    int gup_flags,
                                                    struct vm_area_struct **vmap)
{
        struct vm_area_struct *found;
        struct page *pinned;
        int nr;

        /* FOLL_NOWAIT is rejected up front (with a one-time warning). */
        if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
                return ERR_PTR(-EINVAL);

        nr = get_user_pages_remote(mm, addr, 1, gup_flags, &pinned, NULL);
        if (nr < 0)
                return ERR_PTR(nr);

        /* Look up the VMA covering @addr and report it through @vmap. */
        found = vma_lookup(mm, addr);
        if (!WARN_ON_ONCE(!found)) {
                *vmap = found;
                return pinned;
        }

        /* No VMA: drop the page reference before failing. */
        put_page(pinned);
        return ERR_PTR(-EINVAL);
}

long get_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages);
long pin_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);

int get_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages);
int pin_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages);
void folio_add_pin(struct folio *folio);

int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        struct task_struct *task, bool bypass_rlim);

struct kvec;
struct page *get_dump_page(unsigned long addr);

bool folio_mark_dirty(struct folio *folio);
bool set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);

int get_cmdline(struct task_struct *task, char *buffer, int buflen);

extern unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
                unsigned long new_addr, unsigned long len,
                bool need_rmap_locks, bool for_stack);

/*
 * Flags used by change_protection().  For now we make it a bitmap so
 * that we can pass in multiple flags just like parameters.  However
 * for now all the callers only use one of the flags at the same
 * time.
 */
/*
 * Whether we should manually check if we can map individual PTEs writable,
 * because something (e.g., COW, uffd-wp) blocks that from happening for all
 * PTEs automatically in a writable mapping.
 */
#define  MM_CP_TRY_CHANGE_WRITABLE           (1UL << 0)
/* Whether this protection change is for NUMA hints */
#define  MM_CP_PROT_NUMA                   (1UL << 1)
/* Whether this change is for write protecting */
#define  MM_CP_UFFD_WP                     (1UL << 2) /* do wp */
#define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
#define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
                                            MM_CP_UFFD_WP_RESOLVE)

bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
        /*
         * Individual PTEs have to be checked by hand whenever they cannot
         * all be made writable automatically.  In a private mapping that is
         * the case as soon as it has write permission, because COW must be
         * handled properly.
         */
        if (!(vma->vm_flags & VM_SHARED))
                return (vma->vm_flags & VM_WRITE) != 0;

        /* Shared mappings: decided by vma_wants_writenotify(). */
        return vma_wants_writenotify(vma, vma->vm_page_prot);
}
bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
extern long change_protection(struct mmu_gather *tlb,
                              struct vm_area_struct *vma, unsigned long start,
                              unsigned long end, unsigned long cp_flags);
extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
          struct vm_area_struct *vma, struct vm_area_struct **pprev,
          unsigned long start, unsigned long end, unsigned long newflags);

/*
 * doesn't attempt to fault and will return short.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
                             unsigned int gup_flags, struct page **pages);

static inline bool get_user_page_fast_only(unsigned long addr,
                        unsigned int gup_flags, struct page **pagep)
{
        /* Ask for exactly one page; success means exactly one was returned. */
        int nr = get_user_pages_fast_only(addr, 1, gup_flags, pagep);

        return nr == 1;
}
/*
 * per-process(per-mm_struct) statistics.
 */
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
        /* Each rss_stat slot is a percpu counter; read its positive value. */
        struct percpu_counter *counter = &mm->rss_stat[member];

        return percpu_counter_read_positive(counter);
}

void mm_trace_rss_stat(struct mm_struct *mm, int member);

static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
        struct percpu_counter *counter = &mm->rss_stat[member];

        /* Adjust the counter, then call the rss_stat trace hook. */
        percpu_counter_add(counter, value);
        mm_trace_rss_stat(mm, member);
}

static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
        struct percpu_counter *counter = &mm->rss_stat[member];

        /* Bump the counter by one, then call the rss_stat trace hook. */
        percpu_counter_inc(counter);
        mm_trace_rss_stat(mm, member);
}

static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
        struct percpu_counter *counter = &mm->rss_stat[member];

        /* Drop the counter by one, then call the rss_stat trace hook. */
        percpu_counter_dec(counter);
        mm_trace_rss_stat(mm, member);
}

/* Optimized variant when folio is already known not to be anon */
static inline int mm_counter_file(struct folio *folio)
{
        /* Swap-backed folios are counted as shmem, everything else as file. */
        return folio_test_swapbacked(folio) ? MM_SHMEMPAGES : MM_FILEPAGES;
}

static inline int mm_counter(struct folio *folio)
{
        /* Non-anon folios are classified by mm_counter_file(). */
        if (!folio_test_anon(folio))
                return mm_counter_file(folio);

        return MM_ANONPAGES;
}

/* Sum of the file, anonymous and shmem page counters of @mm. */
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
        unsigned long file_pages = get_mm_counter(mm, MM_FILEPAGES);
        unsigned long anon_pages = get_mm_counter(mm, MM_ANONPAGES);
        unsigned long shmem_pages = get_mm_counter(mm, MM_SHMEMPAGES);

        return file_pages + anon_pages + shmem_pages;
}

/* Larger of the recorded mm->hiwater_rss and the current get_mm_rss(). */
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
        unsigned long recorded = mm->hiwater_rss;
        unsigned long current_rss = get_mm_rss(mm);

        return recorded > current_rss ? recorded : current_rss;
}

/* Larger of the recorded mm->hiwater_vm and the current mm->total_vm. */
static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
{
        unsigned long recorded = mm->hiwater_vm;
        unsigned long total = mm->total_vm;

        return recorded > total ? recorded : total;
}

/* Raise mm->hiwater_rss to the current RSS if the current RSS is higher. */
static inline void update_hiwater_rss(struct mm_struct *mm)
{
        unsigned long rss = get_mm_rss(mm);

        if (rss > mm->hiwater_rss)
                mm->hiwater_rss = rss;
}

/* Raise mm->hiwater_vm to mm->total_vm if total_vm is higher. */
static inline void update_hiwater_vm(struct mm_struct *mm)
{
        if (mm->total_vm > mm->hiwater_vm)
                mm->hiwater_vm = mm->total_vm;
}

/* Restart high-water tracking: set mm->hiwater_rss to the current RSS. */
static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
{
        unsigned long rss = get_mm_rss(mm);

        mm->hiwater_rss = rss;
}

/* Raise *@maxrss to @mm's high-water RSS when the latter is larger. */
static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
                                         struct mm_struct *mm)
{
        unsigned long peak = get_mm_hiwater_rss(mm);

        if (peak > *maxrss)
                *maxrss = peak;
}

/*
 * Fallbacks used when CONFIG_ARCH_HAS_PTE_SPECIAL is not defined:
 * pte_special() always reports 0 and pte_mkspecial() returns its
 * argument unchanged.
 */
#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
static inline int pte_special(pte_t pte)
{
        return 0;
}

static inline pte_t pte_mkspecial(pte_t pte)
{
        return pte;
}
#endif

/*
 * Fallback used when CONFIG_ARCH_HAS_PTE_DEVMAP is not defined:
 * pte_devmap() always reports 0.
 */
#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
static inline int pte_devmap(pte_t pte)
{
        return 0;
}
#endif

extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                               spinlock_t **ptl);
/*
 * Wrapper around __get_locked_pte() that routes the call through
 * __cond_lock() on *@ptl and returns the pointer __get_locked_pte() produced.
 * NOTE(review): __cond_lock() is presumably a static-analysis annotation for
 * a conditionally acquired lock - confirm against its definition.
 */
static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
                                    spinlock_t **ptl)
{
        pte_t *ptep;
        __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl));
        return ptep;
}

/*
 * __p4d_alloc(): when __PAGETABLE_P4D_FOLDED is defined this is an inline
 * stub that always returns 0; otherwise it is an out-of-line function.
 * p4d_alloc() treats a non-zero return as failure.
 */
#ifdef __PAGETABLE_P4D_FOLDED
static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                                                unsigned long address)
{
        return 0;
}
#else
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
#endif

#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
/* PUD level folded at build time (or no MMU): always returns 0. */
static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                              unsigned long address)
{
        return 0;
}

/* No PUD accounting is done in this configuration. */
static inline void mm_inc_nr_puds(struct mm_struct *mm)
{
}

static inline void mm_dec_nr_puds(struct mm_struct *mm)
{
}

#else
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);

/*
 * Add the size of one PUD table (PTRS_PER_PUD entries) to
 * mm->pgtables_bytes, unless mm_pud_folded() says the level is folded.
 */
static inline void mm_inc_nr_puds(struct mm_struct *mm)
{
        if (!mm_pud_folded(mm))
                atomic_long_add(PTRS_PER_PUD * sizeof(pud_t),
                                &mm->pgtables_bytes);
}

/* Inverse of mm_inc_nr_puds(): subtract the size of one PUD table. */
static inline void mm_dec_nr_puds(struct mm_struct *mm)
{
        if (!mm_pud_folded(mm))
                atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t),
                                &mm->pgtables_bytes);
}
#endif

#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
/* PMD level folded at build time (or no MMU): always returns 0. */
static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
                              unsigned long address)
{
        return 0;
}

/* No PMD accounting is done in this configuration. */
static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
}

static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
}

#else
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);

/*
 * Add the size of one PMD table (PTRS_PER_PMD entries) to
 * mm->pgtables_bytes, unless mm_pmd_folded() says the level is folded.
 */
static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
        if (!mm_pmd_folded(mm))
                atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t),
                                &mm->pgtables_bytes);
}

/* Inverse of mm_inc_nr_pmds(): subtract the size of one PMD table. */
static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
        if (!mm_pmd_folded(mm))
                atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t),
                                &mm->pgtables_bytes);
}
#endif

/*
 * Accounting of page-table memory in mm->pgtables_bytes.
 * With CONFIG_MMU the counter is an atomic long that is zeroed by
 * mm_pgtables_bytes_init(), read by mm_pgtables_bytes(), and adjusted by the
 * size of one PTE table (PTRS_PER_PTE entries) in mm_{inc,dec}_nr_ptes().
 * Without CONFIG_MMU all helpers are no-ops and the reported size is 0.
 */
#ifdef CONFIG_MMU
static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
{
        atomic_long_set(&mm->pgtables_bytes, 0);
}

static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
        return atomic_long_read(&mm->pgtables_bytes);
}

static inline void mm_inc_nr_ptes(struct mm_struct *mm)
{
        atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
}

static inline void mm_dec_nr_ptes(struct mm_struct *mm)
{
        atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
}
#else

static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
        return 0;
}

static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
#endif

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
int __pte_alloc_kernel(pmd_t *pmd);

#if defined(CONFIG_MMU)

/*
 * Return the P4D entry for @address under @pgd. __p4d_alloc() is called
 * only when *@pgd is none; a non-zero result from it yields NULL.
 */
static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                unsigned long address)
{
        if (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address))
                return NULL;

        return p4d_offset(pgd, address);
}

/*
 * Return the PUD entry for @address under @p4d. __pud_alloc() is called
 * only when *@p4d is none; a non-zero result from it yields NULL.
 */
static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                unsigned long address)
{
        if (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address))
                return NULL;

        return pud_offset(p4d, address);
}

/*
 * Return the PMD entry for @address under @pud. __pmd_alloc() is called
 * only when *@pud is none; a non-zero result from it yields NULL.
 */
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        if (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))
                return NULL;

        return pmd_offset(pud, address);
}
#endif /* CONFIG_MMU */

/* Map address @x to its page via virt_to_page(), then to that page's ptdesc. */
static inline struct ptdesc *virt_to_ptdesc(const void *x)
{
        struct page *page = virt_to_page(x);

        return page_ptdesc(page);
}

/* Convert @pt to its page, then to an address via page_to_virt(). */
static inline void *ptdesc_to_virt(const struct ptdesc *pt)
{
        const struct page *page = ptdesc_page(pt);

        return page_to_virt(page);
}

/* Convert @pt to its folio and return that folio's folio_address(). */
static inline void *ptdesc_address(const struct ptdesc *pt)
{
        const struct folio *folio = ptdesc_folio(pt);

        return folio_address(folio);
}

/* Report whether the folio behind @pt tests as reserved. */
static inline bool pagetable_is_reserved(struct ptdesc *pt)
{
        struct folio *folio = ptdesc_folio(pt);

        return folio_test_reserved(folio);
}

/**
 * pagetable_alloc - Allocate pagetables
 * @gfp:    GFP flags
 * @order:  desired pagetable order
 *
 * Allocates the memory for page tables together with the page table
 * descriptor describing it. The pages are requested with __GFP_COMP OR-ed
 * into @gfp.
 *
 * Return: The ptdesc describing the allocated page tables.
 */
static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
{
        return page_ptdesc(alloc_pages_noprof(gfp | __GFP_COMP, order));
}
#define pagetable_alloc(...)        alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))

/**
 * pagetable_free - Free pagetables
 * @pt:        The page table descriptor
 *
 * Releases the memory of every page table described by @pt as well as the
 * memory of the descriptor itself. The order passed to __free_pages() is
 * taken from compound_order() of the underlying page.
 */
static inline void pagetable_free(struct ptdesc *pt)
{
        struct page *page = ptdesc_page(pt);
        unsigned int order = compound_order(page);

        __free_pages(page, order);
}

#if USE_SPLIT_PTE_PTLOCKS
/*
 * Two layouts for the split page-table lock:
 *  - ALLOC_SPLIT_PTLOCKS: ptdesc->ptl is itself a spinlock pointer
 *    (ptlock_ptr() returns it as-is) and ptlock_cache_init(),
 *    ptlock_alloc() and ptlock_free() are out-of-line functions.
 *  - otherwise: the spinlock is embedded in the ptdesc (ptlock_ptr() returns
 *    its address), ptlock_alloc() always succeeds and the other helpers are
 *    no-ops.
 */
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
bool ptlock_alloc(struct ptdesc *ptdesc);
void ptlock_free(struct ptdesc *ptdesc);

static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
        return ptdesc->ptl;
}
#else /* ALLOC_SPLIT_PTLOCKS */
static inline void ptlock_cache_init(void)
{
}

static inline bool ptlock_alloc(struct ptdesc *ptdesc)
{
        return true;
}

static inline void ptlock_free(struct ptdesc *ptdesc)
{
}

static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
        return &ptdesc->ptl;
}
#endif /* ALLOC_SPLIT_PTLOCKS */

/*
 * Split-lock variant: the lock comes from the ptdesc of the page that *@pmd
 * refers to (pmd_page()). @mm is not used here.
 */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        struct ptdesc *ptdesc = page_ptdesc(pmd_page(*pmd));

        return ptlock_ptr(ptdesc);
}

/*
 * Set up the split page-table lock of @ptdesc: check that the ptl storage is
 * still zero, obtain lock storage via ptlock_alloc(), then initialise the
 * spinlock. Returns false only when ptlock_alloc() fails.
 */
static inline bool ptlock_init(struct ptdesc *ptdesc)
{
        /*
         * prep_new_page() initializes page->private (and therefore page->ptl)
         * with 0. Make sure nobody started using it in between.
         *
         * This can happen if an arch tries to use slab for page table
         * allocation: slab code uses page->slab_cache, which shares storage
         * with page->ptl.
         */
        VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc));
        if (!ptlock_alloc(ptdesc))
                return false;
        spin_lock_init(ptlock_ptr(ptdesc));
        return true;
}

#else        /* !USE_SPLIT_PTE_PTLOCKS */
/*
 * We use mm->page_table_lock to guard all pagetable pages of the mm.
 * Consequently pte_lockptr() ignores @pmd, ptlock_init() always succeeds and
 * the remaining ptlock helpers are no-ops.
 */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return &mm->page_table_lock;
}
static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
#endif /* USE_SPLIT_PTE_PTLOCKS */

/*
 * Constructor for a PTE-level page table: initialise its lock, mark the
 * folio as a page table and add it to the NR_PAGETABLE statistic.
 * Returns false (with nothing else done) when ptlock_init() fails.
 */
static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
{
        struct folio *folio;

        if (!ptlock_init(ptdesc))
                return false;

        folio = ptdesc_folio(ptdesc);
        __folio_set_pgtable(folio);
        lruvec_stat_add_folio(folio, NR_PAGETABLE);
        return true;
}

/*
 * Destructor for a PTE-level page table: free its lock, clear the folio's
 * page-table mark and remove it from the NR_PAGETABLE statistic.
 */
static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
{
        struct folio *folio;

        ptlock_free(ptdesc);

        folio = ptdesc_folio(ptdesc);
        __folio_clear_pgtable(folio);
        lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}

pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);

/* __pte_offset_map() with a NULL @pmdvalp. */
static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
{
        pte_t *pte = __pte_offset_map(pmd, addr, NULL);

        return pte;
}

pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, spinlock_t **ptlp);
/*
 * Wrapper around __pte_offset_map_lock() that routes the call through
 * __cond_lock() on *@ptlp and returns the pointer that call produced.
 * NOTE(review): __cond_lock() is presumably a static-analysis annotation for
 * a conditionally acquired lock - confirm against its definition.
 */
static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, spinlock_t **ptlp)
{
        pte_t *pte;

        __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp));
        return pte;
}

pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, spinlock_t **ptlp);

/* Release @ptl with spin_unlock() first, then pte_unmap() @pte. */
#define pte_unmap_unlock(pte, ptl)        do {                \
        spin_unlock(ptl);                                \
        pte_unmap(pte);                                        \
} while (0)

/*
 * pte_alloc() is true (failure) only when *@pmd is none and __pte_alloc()
 * returns non-zero.
 */
#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))

/*
 * NULL when pte_alloc() fails, otherwise the result of pte_offset_map().
 * Note that @pmd is expanded more than once.
 */
#define pte_alloc_map(mm, pmd, address)                        \
        (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))

/*
 * NULL when pte_alloc() fails, otherwise the result of
 * pte_offset_map_lock(). Note that @mm and @pmd are expanded more than once.
 */
#define pte_alloc_map_lock(mm, pmd, address, ptlp)        \
        (pte_alloc(mm, pmd) ?                        \
                 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))

/*
 * Kernel variant: NULL when *@pmd is none and __pte_alloc_kernel() returns
 * non-zero, otherwise pte_offset_kernel(). @pmd is expanded more than once.
 */
#define pte_alloc_kernel(pmd, address)                        \
        ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
                NULL: pte_offset_kernel(pmd, address))

#if USE_SPLIT_PMD_PTLOCKS

/*
 * Round the address of @pmd down to a multiple of the PMD table size
 * (PTRS_PER_PMD entries) and return the page for that address.
 */
static inline struct page *pmd_pgtable_page(pmd_t *pmd)
{
        unsigned long table_bytes = PTRS_PER_PMD * sizeof(pmd_t);
        unsigned long table_base = (unsigned long)pmd & ~(table_bytes - 1);

        return virt_to_page((void *)table_base);
}

/* ptdesc of the page returned by pmd_pgtable_page() for @pmd. */
static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
{
        struct page *table_page = pmd_pgtable_page(pmd);

        return page_ptdesc(table_page);
}

/* Split-lock variant: the lock is taken from @pmd's ptdesc; @mm is unused. */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        struct ptdesc *ptdesc = pmd_ptdesc(pmd);

        return ptlock_ptr(ptdesc);
}

/*
 * Initialise the lock of a PMD-level table via ptlock_init(). With
 * CONFIG_TRANSPARENT_HUGEPAGE the ptdesc's pmd_huge_pte is reset to NULL
 * first. Returns ptlock_init()'s result.
 */
static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        ptdesc->pmd_huge_pte = NULL;
#endif
        return ptlock_init(ptdesc);
}

/*
 * Free the lock of a PMD-level table via ptlock_free(). With
 * CONFIG_TRANSPARENT_HUGEPAGE a still-set pmd_huge_pte trips
 * VM_BUG_ON_PAGE() first.
 */
static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
#endif
        ptlock_free(ptdesc);
}

/* Split-lock variant: pmd_huge_pte lives in @pmd's ptdesc; @mm is unused. */
#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)

#else

/*
 * Non-split variant: every PMD is guarded by mm->page_table_lock, so @pmd is
 * ignored, pmd_ptlock_init() always succeeds and pmd_ptlock_free() is a
 * no-op.
 */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return &mm->page_table_lock;
}

static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {}

/* Non-split variant: pmd_huge_pte lives in the mm itself; @pmd is unused. */
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)

#endif

/*
 * Take the lock that pmd_lockptr() selects for @pmd and return it so the
 * caller can release it later.
 */
static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
        spinlock_t *lock;

        lock = pmd_lockptr(mm, pmd);
        spin_lock(lock);

        return lock;
}

/*
 * Constructor for a PMD-level page table: initialise its lock, mark the
 * folio as a page table and add it to the NR_PAGETABLE statistic.
 * Returns false (with nothing else done) when pmd_ptlock_init() fails.
 */
static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
{
        struct folio *folio;

        if (!pmd_ptlock_init(ptdesc))
                return false;

        folio = ptdesc_folio(ptdesc);
        __folio_set_pgtable(folio);
        lruvec_stat_add_folio(folio, NR_PAGETABLE);
        return true;
}

/*
 * Destructor for a PMD-level page table: free its lock, clear the folio's
 * page-table mark and remove it from the NR_PAGETABLE statistic.
 */
static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc)
{
        struct folio *folio;

        pmd_ptlock_free(ptdesc);

        folio = ptdesc_folio(ptdesc);
        __folio_clear_pgtable(folio);
        lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}

/*
 * No scalability reason to split PUD locks yet, but follow the same pattern
 * as the PMD locks to make it easier if we decide to.  The VM should not be
 * considered ready to switch to split PUD locks yet; there may be places
 * which need to be converted from page_table_lock.
 *
 * For now the PUD lock is always mm->page_table_lock; @pud is ignored.
 */
static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
{
        return &mm->page_table_lock;
}

/*
 * Take the lock that pud_lockptr() selects for @pud and return it so the
 * caller can release it later.
 */
static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
{
        spinlock_t *lock;

        lock = pud_lockptr(mm, pud);
        spin_lock(lock);

        return lock;
}

/*
 * Constructor for a PUD-level page table: mark the folio as a page table
 * and add it to the NR_PAGETABLE statistic. No lock is set up here.
 */
static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
{
        struct folio *folio;

        folio = ptdesc_folio(ptdesc);
        __folio_set_pgtable(folio);
        lruvec_stat_add_folio(folio, NR_PAGETABLE);
}

/*
 * Destructor for a PUD-level page table: clear the folio's page-table mark
 * and remove it from the NR_PAGETABLE statistic.
 */
static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
{
        struct folio *folio;

        folio = ptdesc_folio(ptdesc);
        __folio_clear_pgtable(folio);
        lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}

extern void __init pagecache_init(void);
extern void free_initmem(void);

/*
 * Free the reserved pages within the range [PAGE_ALIGN(start), end & PAGE_MASK)
 * into the buddy system. The freed pages are poisoned with the pattern
 * "poison" if it is within the range [0, UCHAR_MAX].
 * Returns the number of pages freed into the buddy system.
 */
extern unsigned long free_reserved_area(void *start, void *end,
                                        int poison, const char *s);

extern void adjust_managed_page_count(struct page *page, long count);

extern void reserve_bootmem_region(phys_addr_t start,
                                   phys_addr_t end, int nid);

/*
 * Free the reserved page into the buddy system, so it gets managed.
 *
 * Steps, in order: when memory allocation profiling is enabled, mark the
 * page's code tag reference (if any) as empty; clear the Reserved flag;
 * reinitialise the page count; free the page; finally add one page to the
 * managed page count.
 */
static inline void free_reserved_page(struct page *page)
{
        if (mem_alloc_profiling_enabled()) {
                union codetag_ref *ref = get_page_tag_ref(page);

                /* get_page_tag_ref() may yield no reference; skip then. */
                if (ref) {
                        set_codetag_empty(ref);
                        put_page_tag_ref(ref);
                }
        }
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
        adjust_managed_page_count(page, 1);
}
/* free_highmem_page() is simply an alias for free_reserved_page(). */
#define free_highmem_page(page) free_reserved_page(page)

/*
 * Set the Reserved flag on @page and subtract one page from the managed
 * page count.
 */
static inline void mark_page_reserved(struct page *page)
{
        SetPageReserved(page);
        adjust_managed_page_count(page, -1);
}

/* free_reserved_page() applied to the page behind @pt. */
static inline void free_reserved_ptdesc(struct ptdesc *pt)
{
        struct page *page = ptdesc_page(pt);

        free_reserved_page(page);
}

/*
 * Default way of handing all __init memory, i.e. the region between
 * __init_begin and __init_end, to the buddy system through
 * free_reserved_area(). The freed pages are poisoned with the pattern
 * "poison" if it is within the range [0, UCHAR_MAX].
 * Returns the number of pages freed into the buddy system.
 */
static inline unsigned long free_initmem_default(int poison)
{
        extern char __init_begin[], __init_end[];
        unsigned long nr_freed;

        nr_freed = free_reserved_area(&__init_begin, &__init_end, poison,
                                      "unused kernel image (initmem)");
        return nr_freed;
}

static inline unsigned long get_num_physpages(void)
{
        int nid;
        unsigned long phys_pages = 0;

        for_each_online_node(nid)
                phys_pages += node_present_pages(nid);

        return phys_pages;
}

/*
 * Using memblock node mappings, an architecture may initialise its
 * zones, allocate the backing mem_map and account for memory holes in an
 * architecture-independent manner.
 *
 * An architecture is expected to register the ranges of page frames backed by
 * physical memory with memblock_add[_node]() before calling
 * free_area_init(), passing in the PFN each zone ends at. In its basic
 * usage, an architecture is expected to do something like
 *
 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
 *                                                          max_highmem_pfn};
 * for_each_valid_physical_page_range()
 *        memblock_add_node(base, size, nid, MEMBLOCK_NONE)
 * free_area_init(max_zone_pfns);
 */
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
                                                unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
                        unsigned long *start_pfn, unsigned long *end_pfn);

/*
 * early_pfn_to_nid(): without CONFIG_NUMA every PFN maps to node 0;
 * with CONFIG_NUMA the function is provided out of line.
 */
#ifndef CONFIG_NUMA
static inline int early_pfn_to_nid(unsigned long pfn)
{
        return 0;
}
#else
/* please see mm/page_alloc.c */
extern int __meminit early_pfn_to_nid(unsigned long pfn);
#endif

extern void mem_init(void);
extern void __init mmap_init(void);

extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);

/*
 * Call __show_mem() with flags 0, a NULL nodemask and the highest zone
 * index (MAX_NR_ZONES - 1).
 */
static inline void show_mem(void)
{
        const int last_zone_idx = MAX_NR_ZONES - 1;

        __show_mem(0, NULL, last_zone_idx);
}
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);

extern __printf(3, 4)
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);

extern void setup_per_cpu_pageset(void);

/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);

/* interval_tree.c */
void vma_interval_tree_insert(struct vm_area_struct *node,
                              struct rb_root_cached *root);
void vma_interval_tree_insert_after(struct vm_area_struct *node,
                                    struct vm_area_struct *prev,
                                    struct rb_root_cached *root);
void vma_interval_tree_remove(struct vm_area_struct *node,
                              struct rb_root_cached *root);
struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
                                unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
                                unsigned long start, unsigned long last);

/*
 * Loop with @vma set to each entry that vma_interval_tree_iter_first() and
 * vma_interval_tree_iter_next() return for [@start, @last] in @root; the
 * loop ends when either returns NULL. @start and @last are expanded on
 * every iteration.
 */
#define vma_interval_tree_foreach(vma, root, start, last)                \
        for (vma = vma_interval_tree_iter_first(root, start, last);        \
             vma; vma = vma_interval_tree_iter_next(vma, start, last))

void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                   struct rb_root_cached *root);
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
                                   struct rb_root_cached *root);
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
                                  unsigned long start, unsigned long last);
struct anon_vma_chain *anon_vma_interval_tree_iter_next(
        struct anon_vma_chain *node, unsigned long start, unsigned long last);
#ifdef CONFIG_DEBUG_VM_RB
void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
#endif

/*
 * Loop with @avc set to each entry that anon_vma_interval_tree_iter_first()
 * and anon_vma_interval_tree_iter_next() return for [@start, @last] in
 * @root; the loop ends when either returns NULL. @start and @last are
 * expanded on every iteration.
 */
#define anon_vma_interval_tree_foreach(avc, root, start, last)                 \
        for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
             avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))

/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
                      unsigned long start, unsigned long end, pgoff_t pgoff,
                      struct vm_area_struct *next);
extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
                       unsigned long start, unsigned long end, pgoff_t pgoff);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
                                  struct vm_area_struct *prev,
                                  struct vm_area_struct *vma,
                                  unsigned long start, unsigned long end,
                                  unsigned long vm_flags,
                                  struct mempolicy *policy,
                                  struct vm_userfaultfd_ctx uffd_ctx,
                                  struct anon_vma_name *anon_name);

/*
 * We are about to modify the VMA's flags: call vma_modify() with @new_flags
 * while passing through the policy, userfaultfd context and anon name that
 * @vma currently has.
 */
static inline struct vm_area_struct
*vma_modify_flags(struct vma_iterator *vmi,
                  struct vm_area_struct *prev,
                  struct vm_area_struct *vma,
                  unsigned long start, unsigned long end,
                  unsigned long new_flags)
{
        struct mempolicy *policy = vma_policy(vma);
        struct anon_vma_name *name = anon_vma_name(vma);

        return vma_modify(vmi, prev, vma, start, end, new_flags, policy,
                          vma->vm_userfaultfd_ctx, name);
}

/*
 * We are about to modify the VMA's flags and/or anon_name: call vma_modify()
 * with @new_flags and @new_name while passing through @vma's current policy
 * and userfaultfd context.
 */
static inline struct vm_area_struct
*vma_modify_flags_name(struct vma_iterator *vmi,
                       struct vm_area_struct *prev,
                       struct vm_area_struct *vma,
                       unsigned long start,
                       unsigned long end,
                       unsigned long new_flags,
                       struct anon_vma_name *new_name)
{
        struct mempolicy *policy = vma_policy(vma);

        return vma_modify(vmi, prev, vma, start, end, new_flags, policy,
                          vma->vm_userfaultfd_ctx, new_name);
}

/*
 * We are about to modify the VMA's memory policy: call vma_modify() with
 * @new_pol while passing through @vma's current flags, userfaultfd context
 * and anon name.
 */
static inline struct vm_area_struct
*vma_modify_policy(struct vma_iterator *vmi,
                   struct vm_area_struct *prev,
                   struct vm_area_struct *vma,
                   unsigned long start, unsigned long end,
                   struct mempolicy *new_pol)
{
        struct anon_vma_name *name = anon_vma_name(vma);

        return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, new_pol,
                          vma->vm_userfaultfd_ctx, name);
}

/*
 * We are about to modify the VMA's flags and/or uffd context: call
 * vma_modify() with @new_flags and @new_ctx while passing through @vma's
 * current policy and anon name.
 */
static inline struct vm_area_struct
*vma_modify_flags_uffd(struct vma_iterator *vmi,
                       struct vm_area_struct *prev,
                       struct vm_area_struct *vma,
                       unsigned long start, unsigned long end,
                       unsigned long new_flags,
                       struct vm_userfaultfd_ctx new_ctx)
{
        struct mempolicy *policy = vma_policy(vma);
        struct anon_vma_name *name = anon_vma_name(vma);

        return vma_modify(vmi, prev, vma, start, end, new_flags, policy,
                          new_ctx, name);
}

/*
 * Check (@new - @start) + (@end_data - @start_data) against @rlim.
 *
 * Returns -ENOSPC when that sum exceeds @rlim, and 0 otherwise. No check is
 * made at all when @rlim is not below RLIM_INFINITY.
 */
static inline int check_data_rlimit(unsigned long rlim,
                                    unsigned long new,
                                    unsigned long start,
                                    unsigned long end_data,
                                    unsigned long start_data)
{
        unsigned long new_span, data_span;

        if (!(rlim < RLIM_INFINITY))
                return 0;

        new_span = new - start;
        data_span = end_data - start_data;

        return (new_span + data_span > rlim) ? -ENOSPC : 0;
}

extern int mm_take_all_locks(struct mm_struct *mm);
extern void mm_drop_all_locks(struct mm_struct *mm);

extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern struct file *get_mm_exe_file(struct mm_struct *mm);
extern struct file *get_task_exe_file(struct task_struct *task);

extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);

extern bool vma_is_special_mapping(const struct vm_area_struct *vma,
                                   const struct vm_special_mapping *sm);
extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
                                   unsigned long addr, unsigned long len,
                                   unsigned long flags,
                                   const struct vm_special_mapping *spec);
/* This is an obsolete alternative to _install_special_mapping. */
extern int install_special_mapping(struct mm_struct *mm,
                                   unsigned long addr, unsigned long len,
                                   unsigned long flags, struct page **pages);

unsigned long randomize_stack_top(unsigned long stack_top);
unsigned long randomize_page(unsigned long start, unsigned long range);

unsigned long
__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                    unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags);

/* __get_unmapped_area() with 0 passed as its vm_flags argument. */
static inline unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                  unsigned long pgoff, unsigned long flags)
{
        unsigned long area;

        area = __get_unmapped_area(file, addr, len, pgoff, flags, 0);
        return area;
}

extern unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
        struct list_head *uf);
extern unsigned long do_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot, unsigned long flags,
        vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
        struct list_head *uf);
extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
                         unsigned long start, size_t len, struct list_head *uf,
                         bool unlock);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
                     struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);

#ifdef CONFIG_MMU
extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
                         unsigned long start, unsigned long end,
                         struct list_head *uf, bool unlock);
extern int __mm_populate(unsigned long addr, unsigned long len,
                         int ignore_errors);

/*
 * Call __mm_populate() on [@addr, @addr + @len) with ignore_errors set and
 * discard its return value.
 */
static inline void mm_populate(unsigned long addr, unsigned long len)
{
        const int ignore_errors = 1;

        /* The result is deliberately dropped. */
        (void)__mm_populate(addr, len, ignore_errors);
}
#else
/* Without CONFIG_MMU, mm_populate() does nothing. */
static inline void mm_populate(unsigned long addr, unsigned long len)
{
}
#endif

/* This takes the mm semaphore itself */
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
extern int vm_munmap(unsigned long, size_t);
extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

/*
 * Argument block handed to vm_unmapped_area().
 * NOTE(review): the per-field notes below are inferred from the field names
 * only - confirm against the implementation of vm_unmapped_area().
 */
struct vm_unmapped_area_info {
/* Value for @flags; presumably requests a top-down search - TODO confirm. */
#define VM_UNMAPPED_AREA_TOPDOWN 1
        unsigned long flags;
        unsigned long length;           /* presumably the size wanted */
        unsigned long low_limit;        /* presumably lowest usable address */
        unsigned long high_limit;       /* presumably highest usable address */
        unsigned long align_mask;       /* presumably alignment constraint */
        unsigned long align_offset;     /* presumably offset within alignment */
        unsigned long start_gap;        /* presumably gap kept before result */
};

extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);

/* truncate.c */
extern void truncate_inode_pages(struct address_space *, loff_t);
extern void truncate_inode_pages_range(struct address_space *,
                                       loff_t lstart, loff_t lend);
extern void truncate_inode_pages_final(struct address_space *);

/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);

extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);

/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
int expand_downwards(struct vm_area_struct *vma, unsigned long address);

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
                                             struct vm_area_struct **pprev);

/*
 * Look up the first VMA which intersects the interval [start_addr, end_addr)
 * NULL if none.  Assume start_addr < end_addr.
 */
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
                        unsigned long start_addr, unsigned long end_addr);

/**
 * vma_lookup() - Find a VMA at a specific address
 * @mm: The process address space.
 * @addr: The user address.
 *
 * Looks @addr up in @mm's mm_mt tree with mtree_load().
 *
 * Return: The vm_area_struct at the given address, %NULL otherwise.
 */
static inline
struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma = mtree_load(&mm->mm_mt, addr);

        return vma;
}

/*
 * Gap to keep below @vma's start: stack_guard_gap for VM_GROWSDOWN
 * mappings, PAGE_SIZE for VM_SHADOW_STACK mappings, 0 for everything else.
 * VM_GROWSDOWN takes precedence when both flags are set.
 */
static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
{
        unsigned long gap = 0;

        if (vma->vm_flags & VM_GROWSDOWN)
                gap = stack_guard_gap;
        else if (vma->vm_flags & VM_SHADOW_STACK)
                /* See reasoning around the VM_SHADOW_STACK definition */
                gap = PAGE_SIZE;

        return gap;
}

/*
 * Start address of @vma lowered by its stack_guard_start_gap(), clamped to
 * 0 when the subtraction would wrap below zero.
 */
static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
{
        unsigned long gap = stack_guard_start_gap(vma);
        unsigned long start = vma->vm_start;

        if (gap > start)
                return 0;

        return start - gap;
}

/*
 * End address of @vma; for VM_GROWSUP mappings it is raised by
 * stack_guard_gap, and replaced by -PAGE_SIZE when that addition wraps.
 */
static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
{
        unsigned long end = vma->vm_end;
        unsigned long padded;

        if (!(vma->vm_flags & VM_GROWSUP))
                return end;

        padded = end + stack_guard_gap;
        if (padded < end)
                padded = -PAGE_SIZE;

        return padded;
}

/* Length of @vma (vm_end - vm_start) shifted right by PAGE_SHIFT. */
static inline unsigned long vma_pages(struct vm_area_struct *vma)
{
        unsigned long span = vma->vm_end - vma->vm_start;

        return span >> PAGE_SHIFT;
}

/*
 * Look up the VMA at @vm_start and return it only when it spans exactly
 * @vm_start ... @vm_end; NULL otherwise.
 */
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
                                unsigned long vm_start, unsigned long vm_end)
{
        struct vm_area_struct *vma = vma_lookup(mm, vm_start);

        if (!vma)
                return NULL;

        if (vma->vm_start == vm_start && vma->vm_end == vm_end)
                return vma;

        return NULL;
}

/*
 * True when @vma is non-NULL and [@start, @end] lies within
 * [vma->vm_start, vma->vm_end].
 */
static inline bool range_in_vma(struct vm_area_struct *vma,
                                unsigned long start, unsigned long end)
{
        if (!vma)
                return false;

        return start >= vma->vm_start && end <= vma->vm_end;
}

/*
 * With CONFIG_MMU both helpers are provided out of line. Without it,
 * vm_get_page_prot() always yields __pgprot(0) and vma_set_page_prot()
 * stores that value in vma->vm_page_prot.
 */
#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
void vma_set_page_prot(struct vm_area_struct *vma);
#else
static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
        return __pgprot(0);
}
static inline void vma_set_page_prot(struct vm_area_struct *vma)
{
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
}
#endif

/* Prototype only; the definition is not part of this header. */
void vma_set_file(struct vm_area_struct *vma, struct file *file);

#ifdef CONFIG_NUMA_BALANCING
/* Declared only when CONFIG_NUMA_BALANCING is enabled. */
unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end);
#endif

/*
 * Prototypes for helpers that look up VMAs or insert pages/PFNs into a VMA.
 * The vmf_* variants return a vm_fault_t; the others return an int.
 * vm_insert_page()'s int result is treated as 0 or a negative errno by
 * vmf_insert_page() below.
 */
struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
                unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
                        unsigned long pfn, unsigned long size, pgprot_t);
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num);
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);

/*
 * Call vm_insert_page() and translate its int result into a vm_fault_t:
 * -ENOMEM becomes VM_FAULT_OOM, any other negative value except -EBUSY
 * becomes VM_FAULT_SIGBUS, everything else becomes VM_FAULT_NOPAGE.
 */
static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
                                unsigned long addr, struct page *page)
{
        int rc = vm_insert_page(vma, addr, page);

        /* Non-negative results and -EBUSY are both reported as NOPAGE. */
        if (rc >= 0 || rc == -EBUSY)
                return VM_FAULT_NOPAGE;

        return rc == -ENOMEM ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
}

#ifndef io_remap_pfn_range
/*
 * Fallback used when no io_remap_pfn_range macro has been defined already:
 * forwards to remap_pfn_range() with @prot passed through pgprot_decrypted().
 */
static inline int io_remap_pfn_range(struct vm_area_struct *vma,
                                     unsigned long addr, unsigned long pfn,
                                     unsigned long size, pgprot_t prot)
{
        pgprot_t decrypted = pgprot_decrypted(prot);

        return remap_pfn_range(vma, addr, pfn, size, decrypted);
}
#endif

/*
 * Map an errno-style value to a fault code: -ENOMEM gives VM_FAULT_OOM,
 * -EHWPOISON gives VM_FAULT_HWPOISON, anything else gives VM_FAULT_SIGBUS.
 */
static inline vm_fault_t vmf_error(int err)
{
        switch (err) {
        case -ENOMEM:
                return VM_FAULT_OOM;
        case -EHWPOISON:
                return VM_FAULT_HWPOISON;
        default:
                return VM_FAULT_SIGBUS;
        }
}

/*
 * Translate an errno into the value returned from ->page_mkwrite() calls.
 *
 * Ideally this would be folded into vmf_error() above, but doing so needs a
 * careful audit of every vmf_error() caller first.
 */
static inline vm_fault_t vmf_fs_error(int err)
{
        switch (err) {
        case 0:
                return VM_FAULT_LOCKED;
        case -EFAULT:
        case -EAGAIN:
                return VM_FAULT_NOPAGE;
        case -ENOMEM:
                return VM_FAULT_OOM;
        default:
                /* -ENOSPC, -EDQUOT, -EIO ... */
                return VM_FAULT_SIGBUS;
        }
}

/*
 * Prototype only. @foll_flags presumably carries FOLL_* bits such as the
 * ones tested in vm_fault_to_errno() below - confirm against the definition.
 */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                         unsigned int foll_flags);

/*
 * Convert fault bits in @vm_fault to a negative errno, checked in this order:
 * OOM -> -ENOMEM; HWPOISON/HWPOISON_LARGE -> -EHWPOISON when @foll_flags has
 * FOLL_HWPOISON set, else -EFAULT; SIGBUS/SIGSEGV -> -EFAULT. Returns 0 when
 * none of those bits is set.
 */
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
        if (vm_fault & VM_FAULT_OOM)
                return -ENOMEM;

        if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) {
                if (foll_flags & FOLL_HWPOISON)
                        return -EHWPOISON;
                return -EFAULT;
        }

        if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
                return -EFAULT;

        return 0;
}

/*
 * Tells whether GUP may follow a PROT_NONE mapped page directly, or whether
 * a (NUMA hinting) fault has to be taken first.
 */
static inline bool gup_can_follow_protnone(struct vm_area_struct *vma,
                                           unsigned int flags)
{
        /*
         * Two cases allow following the page:
         *
         * - The caller did not ask to honor NUMA hinting faults
         *   (FOLL_HONOR_NUMA_FAULT clear), so there is no need to work out
         *   whether such a fault would actually be required.
         *
         * - The VMA is inaccessible (PROT_NONE). NUMA hinting faults do not
         *   apply there, and demanding one would leave FOLL_FORCE unable to
         *   make progress because handle_mm_fault() refuses to process NUMA
         *   hinting faults in inaccessible VMAs.
         */
        return !(flags & FOLL_HONOR_NUMA_FAULT) || !vma_is_accessible(vma);
}

/*
 * Callback type taking a pte pointer, an address and an opaque @data pointer.
 * Both functions below accept such a callback together with a @data cookie.
 */
typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
                               unsigned long size, pte_fn_t fn, void *data);
extern int apply_to_existing_page_range(struct mm_struct *mm,
                                   unsigned long address, unsigned long size,
                                   pte_fn_t fn, void *data);

/*
 * Page poisoning helpers. With CONFIG_PAGE_POISONING the out-of-line
 * __kernel_{poison,unpoison}_pages() are wrapped by inline helpers gated on a
 * static key; without it every helper is an empty stub.
 */
#ifdef CONFIG_PAGE_POISONING
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
extern bool _page_poisoning_enabled_early;
DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled);
/* Reads the plain boolean _page_poisoning_enabled_early, not the static key. */
static inline bool page_poisoning_enabled(void)
{
        return _page_poisoning_enabled_early;
}
/*
 * For use in fast paths after init_mem_debugging() has run, or when a
 * false negative result is not harmful when called too early.
 */
static inline bool page_poisoning_enabled_static(void)
{
        return static_branch_unlikely(&_page_poisoning_enabled);
}
/* Calls __kernel_poison_pages() only when the static key is enabled. */
static inline void kernel_poison_pages(struct page *page, int numpages)
{
        if (page_poisoning_enabled_static())
                __kernel_poison_pages(page, numpages);
}
/* Calls __kernel_unpoison_pages() only when the static key is enabled. */
static inline void kernel_unpoison_pages(struct page *page, int numpages)
{
        if (page_poisoning_enabled_static())
                __kernel_unpoison_pages(page, numpages);
}
#else
/* !CONFIG_PAGE_POISONING: poisoning is reported as off and the helpers do nothing. */
static inline bool page_poisoning_enabled(void) { return false; }
static inline bool page_poisoning_enabled_static(void) { return false; }
static inline void __kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_unpoison_pages(struct page *page, int numpages) { }
#endif

DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
/*
 * True when the init_on_alloc static key is enabled, or when the caller's
 * gfp @flags contain __GFP_ZERO.
 */
static inline bool want_init_on_alloc(gfp_t flags)
{
        bool zero_requested = flags & __GFP_ZERO;

        if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
                                &init_on_alloc))
                return true;

        return zero_requested;
}

DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
/* Returns the current state of the init_on_free static key. */
static inline bool want_init_on_free(void)
{
        return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
                                   &init_on_free);
}

extern bool _debug_pagealloc_enabled_early;
DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);

static inline bool debug_pagealloc_enabled(void)
{
        return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
                _debug_pagealloc_enabled_early;
}

/*
 * Static-key variant of debug_pagealloc_enabled(). Intended for fast paths
 * once mem_debugging_and_hardening_init() has run, or for callers where a
 * false negative from calling too early does no harm.
 */
static inline bool debug_pagealloc_enabled_static(void)
{
        return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
                static_branch_unlikely(&_debug_pagealloc_enabled);
}

/*
 * To support DEBUG_PAGEALLOC architecture must ensure that
 * __kernel_map_pages() never fails
 */
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
#ifdef CONFIG_DEBUG_PAGEALLOC
/* Calls __kernel_map_pages(..., 1) when debug_pagealloc_enabled_static(). */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
        if (debug_pagealloc_enabled_static())
                __kernel_map_pages(page, numpages, 1);
}

/* Calls __kernel_map_pages(..., 0) when debug_pagealloc_enabled_static(). */
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
        if (debug_pagealloc_enabled_static())
                __kernel_map_pages(page, numpages, 0);
}

extern unsigned int _debug_guardpage_minorder;
DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

/* Accessor for _debug_guardpage_minorder. */
static inline unsigned int debug_guardpage_minorder(void)
{
        return _debug_guardpage_minorder;
}

/* State of the _debug_guardpage_enabled static key. */
static inline bool debug_guardpage_enabled(void)
{
        return static_branch_unlikely(&_debug_guardpage_enabled);
}

/* PageGuard(@page), but only consulted while guard pages are enabled. */
static inline bool page_is_guard(struct page *page)
{
        if (debug_guardpage_enabled())
                return PageGuard(page);

        return false;
}

bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order);
/*
 * Forward to __set_page_guard() when guard pages are enabled; otherwise
 * return false without touching @page.
 */
static inline bool set_page_guard(struct zone *zone, struct page *page,
                                  unsigned int order)
{
        if (debug_guardpage_enabled())
                return __set_page_guard(zone, page, order);

        return false;
}

void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order);
/* Forward to __clear_page_guard() only while guard pages are enabled. */
static inline void clear_page_guard(struct zone *zone, struct page *page,
                                    unsigned int order)
{
        if (debug_guardpage_enabled())
                __clear_page_guard(zone, page, order);
}

#else        /* CONFIG_DEBUG_PAGEALLOC */
/*
 * !CONFIG_DEBUG_PAGEALLOC stubs: the map/unmap and clear helpers do nothing,
 * the queries return 0 / false.
 */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {}
static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
static inline bool page_is_guard(struct page *page) { return false; }
static inline bool set_page_guard(struct zone *zone, struct page *page,
                        unsigned int order) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
                                unsigned int order) {}
#endif        /* CONFIG_DEBUG_PAGEALLOC */

/*
 * Gate-area hooks. When __HAVE_ARCH_GATE_AREA is defined only prototypes are
 * given here; otherwise the stubs below report that there is no gate area
 * (NULL vma, 0 for the membership tests).
 */
#ifdef __HAVE_ARCH_GATE_AREA
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
extern int in_gate_area_no_mm(unsigned long addr);
extern int in_gate_area(struct mm_struct *mm, unsigned long addr);
#else
static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
        return NULL;
}
static inline int in_gate_area_no_mm(unsigned long addr) { return 0; }
static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
        return 0;
}
#endif        /* __HAVE_ARCH_GATE_AREA */

/* Prototype only; defined outside this header. */
extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);

#ifdef CONFIG_SYSCTL
/* drop_caches sysctl state and handler, declared only with CONFIG_SYSCTL. */
extern int sysctl_drop_caches;
int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *,
                loff_t *);
#endif

void drop_slab(void);

/* Without CONFIG_MMU randomize_va_space is the compile-time constant 0. */
#ifndef CONFIG_MMU
#define randomize_va_space 0
#else
extern int randomize_va_space;
#endif

const char * arch_vma_name(struct vm_area_struct *vma);
#ifdef CONFIG_MMU
void print_vma_addr(char *prefix, unsigned long rip);
#else
/* !CONFIG_MMU stub: does nothing. */
static inline void print_vma_addr(char *prefix, unsigned long rip)
{
}
#endif

/*
 * Prototypes for the sparse/vmemmap population helpers. Only declarations
 * are given here; the definitions live outside this header.
 */
void *sparse_buffer_alloc(unsigned long size);
struct page * __populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
                struct dev_pagemap *pgmap);
void pmd_init(void *addr);
void pud_init(void *addr);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
                            struct vmem_altmap *altmap, struct page *reuse);
void *vmemmap_alloc_block(unsigned long size, int node);
/* NOTE(review): forward declaration appears after earlier uses of the tag above. */
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
                              struct vmem_altmap *altmap);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
void vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
                     unsigned long addr, unsigned long next);
int vmemmap_check_pmd(pmd_t *pmd, int node,
                      unsigned long addr, unsigned long next);
int vmemmap_populate_basepages(unsigned long start, unsigned long end,
                               int node, struct vmem_altmap *altmap);
int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
                               int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
/* Declared only when CONFIG_MEMORY_HOTPLUG is enabled. */
void vmemmap_free(unsigned long start, unsigned long end,
                struct vmem_altmap *altmap);
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Number of pfns from base where pfn_to_page() is valid: reserve + free of
 * @altmap, or 0 when @altmap is NULL.
 */
static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
        return altmap ? altmap->reserve + altmap->free : 0;
}

/* Subtract @nr_pfns from altmap->alloc. @altmap is dereferenced unchecked. */
static inline void vmem_altmap_free(struct vmem_altmap *altmap,
                                    unsigned long nr_pfns)
{
        altmap->alloc = altmap->alloc - nr_pfns;
}
#else
/* !CONFIG_SPARSEMEM_VMEMMAP stubs: offset is always 0 and free does nothing. */
static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
        return 0;
}

static inline void vmem_altmap_free(struct vmem_altmap *altmap,
                                    unsigned long nr_pfns)
{
}
#endif

#define VMEMMAP_RESERVE_NR        2
#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
/*
 * Generic rule for deciding whether the vmemmap of @pgmap can be optimized.
 * Requires a non-NULL @pgmap, a power-of-2 sizeof(struct page), no @altmap,
 * and more than VMEMMAP_RESERVE_NR pages' worth of struct page.
 */
static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
                                          struct dev_pagemap *pgmap)
{
        unsigned long memmap_bytes;
        unsigned long memmap_pages;

        if (!pgmap)
                return false;
        if (!is_power_of_2(sizeof(struct page)))
                return false;

        memmap_bytes = pgmap_vmemmap_nr(pgmap) * sizeof(struct page);
        memmap_pages = memmap_bytes >> PAGE_SHIFT;

        if (altmap)
                return false;

        /*
         * For vmemmap optimization with DAX we need minimum 2 vmemmap
         * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
         */
        return memmap_pages > VMEMMAP_RESERVE_NR;
}
/*
 * Use the generic rule unless the architecture supplied its own
 * vmemmap_can_optimize.
 */
#ifndef vmemmap_can_optimize
#define vmemmap_can_optimize __vmemmap_can_optimize
#endif

#else
/* !CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP: optimization is never possible. */
static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
                                           struct dev_pagemap *pgmap)
{
        return false;
}
#endif

/* Prototype only; defined outside this header. */
void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
                                  unsigned long nr_pages);

/*
 * Single-bit flags (bits 0..7) that can be OR-ed together. Presumably these
 * are the values carried by the int 'flags' / 'mf_flags' arguments of the
 * memory-failure functions declared after this enum - confirm against the
 * callers.
 */
enum mf_flags {
        MF_COUNT_INCREASED = 1 << 0,
        MF_ACTION_REQUIRED = 1 << 1,
        MF_MUST_KILL = 1 << 2,
        MF_SOFT_OFFLINE = 1 << 3,
        MF_UNPOISON = 1 << 4,
        MF_SW_SIMULATED = 1 << 5,
        MF_NO_RETRY = 1 << 6,
        MF_MEM_PRE_REMOVE = 1 << 7,
};
/* Declared regardless of CONFIG_MEMORY_FAILURE. */
int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
                      unsigned long count, int mf_flags);
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Sysfs entries for memory failure handling statistics.
 */
extern const struct attribute_group memory_failure_attr_group;
extern void memory_failure_queue(unsigned long pfn, int flags);
extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
#else
/*
 * !CONFIG_MEMORY_FAILURE stubs: the void helpers do nothing and
 * __get_huge_page_for_hwpoison() returns 0. No stub is provided here for
 * task_early_kill() or memory_failure_attr_group.
 */
static inline void memory_failure_queue(unsigned long pfn, int flags)
{
}

static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared)
{
        return 0;
}

static inline void num_poisoned_pages_inc(unsigned long pfn)
{
}

static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
{
}
#endif

/* Declared only when both CONFIG_MEMORY_FAILURE and CONFIG_KSM are enabled. */
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
                     struct vm_area_struct *vma, struct list_head *to_kill,
                     unsigned long ksm_addr);
#endif

/*
 * Real implementations exist only with both CONFIG_MEMORY_FAILURE and
 * CONFIG_MEMORY_HOTPLUG; otherwise these are empty stubs.
 */
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
extern void memblk_nr_poison_inc(unsigned long pfn);
extern void memblk_nr_poison_sub(unsigned long pfn, long i);
#else
static inline void memblk_nr_poison_inc(unsigned long pfn)
{
}

static inline void memblk_nr_poison_sub(unsigned long pfn, long i)
{
}
#endif

/* Default used when no arch_memory_failure macro is defined: always -ENXIO. */
#ifndef arch_memory_failure
static inline int arch_memory_failure(unsigned long pfn, int flags)
{
        return -ENXIO;
}
#endif

/* Default used when no arch_is_platform_page macro is defined: always false. */
#ifndef arch_is_platform_page
static inline bool arch_is_platform_page(u64 paddr)
{
        return false;
}
#endif

/*
 * Error handlers for various types of pages.
 *
 * enum mf_result lists the possible outcomes; see the per-value comments.
 */
enum mf_result {
        MF_IGNORED,        /* Error: cannot be handled */
        MF_FAILED,        /* Error: handling failed */
        MF_DELAYED,        /* Will be handled later */
        MF_RECOVERED,        /* Successfully recovered */
};

/*
 * Page-type categories, one enumerator per kind of page. Judging by the
 * MF_MSG_ prefix these presumably select a message when reporting a
 * memory-failure action - confirm against the users of this enum.
 */
enum mf_action_page_type {
        MF_MSG_KERNEL,
        MF_MSG_KERNEL_HIGH_ORDER,
        MF_MSG_SLAB,
        MF_MSG_DIFFERENT_COMPOUND,
        MF_MSG_HUGE,
        MF_MSG_FREE_HUGE,
        MF_MSG_UNMAP_FAILED,
        MF_MSG_DIRTY_SWAPCACHE,
        MF_MSG_CLEAN_SWAPCACHE,
        MF_MSG_DIRTY_MLOCKED_LRU,
        MF_MSG_CLEAN_MLOCKED_LRU,
        MF_MSG_DIRTY_UNEVICTABLE_LRU,
        MF_MSG_CLEAN_UNEVICTABLE_LRU,
        MF_MSG_DIRTY_LRU,
        MF_MSG_CLEAN_LRU,
        MF_MSG_TRUNCATED_LRU,
        MF_MSG_BUDDY,
        MF_MSG_DAX,
        MF_MSG_UNSPLIT_THP,
        MF_MSG_UNKNOWN,
};

/*
 * Huge page / large folio clear and copy helpers, available when either
 * CONFIG_TRANSPARENT_HUGEPAGE or CONFIG_HUGETLBFS is enabled. Prototypes only.
 */
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
extern void clear_huge_page(struct page *page,
                            unsigned long addr_hint,
                            unsigned int pages_per_huge_page);
int copy_user_large_folio(struct folio *dst, struct folio *src,
                          unsigned long addr_hint,
                          struct vm_area_struct *vma);
long copy_folio_from_user(struct folio *dst_folio,
                           const void __user *usr_src,
                           bool allow_pagefault);

/**
 * vma_is_special_huge - Are transhuge page-table entries considered special?
 * @vma: Pointer to the struct vm_area_struct to consider
 *
 * Whether transhuge page-table entries are considered "special" following
 * the definition in vm_normal_page().
 *
 * Return: true if transhuge page-table entries should be considered special,
 * false otherwise.
 */
static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
{
        return vma_is_dax(vma) || (vma->vm_file &&
                                   (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

/* Real function only when MAX_NUMNODES > 1; otherwise an empty stub. */
#if MAX_NUMNODES > 1
void __init setup_nr_node_ids(void);
#else
static inline void setup_nr_node_ids(void) {}
#endif

extern int memcmp_pages(struct page *page1, struct page *page2);

/* Returns 1 when memcmp_pages() reports 0 for the two pages, else 0. */
static inline int pages_identical(struct page *page1, struct page *page2)
{
        return memcmp_pages(page1, page2) == 0;
}

/* Prototypes available only with CONFIG_MAPPING_DIRTY_HELPERS. */
#ifdef CONFIG_MAPPING_DIRTY_HELPERS
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
                                                pgoff_t first_index, pgoff_t nr,
                                                pgoff_t bitmap_pgoff,
                                                unsigned long *bitmap,
                                                pgoff_t *start,
                                                pgoff_t *end);

unsigned long wp_shared_mapping_range(struct address_space *mapping,
                                      pgoff_t first_index, pgoff_t nr);
#endif

extern int sysctl_nr_trim_pages;

/* Real function only with CONFIG_PRINTK; otherwise an empty stub. */
#ifdef CONFIG_PRINTK
void mem_dump_obj(void *object);
#else
static inline void mem_dump_obj(void *object) {}
#endif

/**
 * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
 *                    handle them.
 * @seals: the seals to check
 * @vma: the vma to operate on
 *
 * When either write seal is set, reject shared writable mappings and strip
 * VM_MAYWRITE from shared non-writable ones. Return 0 if check pass, or <0
 * for errors.
 */
static inline int seal_check_write(int seals, struct vm_area_struct *vma)
{
        /* Nothing to enforce unless one of the write seals is active. */
        if (!(seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)))
                return 0;

        /*
         * Private mappings are left alone: VM_MAYWRITE is not masked for
         * them because they should remain COW-writable.
         */
        if (!(vma->vm_flags & VM_SHARED))
                return 0;

        /*
         * New PROT_WRITE and MAP_SHARED mmaps are not allowed when write
         * seals are active.
         */
        if (vma->vm_flags & VM_WRITE)
                return -EPERM;

        /*
         * An F_SEAL_[FUTURE_]WRITE sealed memfd can still be mapped
         * MAP_SHARED and read-only, so make sure mprotect cannot later
         * revert the protections on such a mapping.
         */
        vm_flags_clear(vma, VM_MAYWRITE);

        return 0;
}

#ifdef CONFIG_ANON_VMA_NAME
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
                          unsigned long len_in,
                          struct anon_vma_name *anon_name);
#else
/* !CONFIG_ANON_VMA_NAME stub: ignores its arguments and returns 0. */
static inline int madvise_set_anon_name(struct mm_struct *mm,
                                        unsigned long start,
                                        unsigned long len_in,
                                        struct anon_vma_name *anon_name)
{
        return 0;
}
#endif

/*
 * Unaccepted-memory interface. With CONFIG_UNACCEPTED_MEMORY only prototypes
 * are given here; otherwise the stubs report no unaccepted memory and
 * accept_memory() does nothing.
 */
#ifdef CONFIG_UNACCEPTED_MEMORY

bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end);
void accept_memory(phys_addr_t start, phys_addr_t end);

#else

static inline bool range_contains_unaccepted_memory(phys_addr_t start,
                                                    phys_addr_t end)
{
        return false;
}

static inline void accept_memory(phys_addr_t start, phys_addr_t end)
{
}

#endif

static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
{
        phys_addr_t paddr = pfn << PAGE_SHIFT;

        return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
}

/* Begin/end pair of prototypes; definitions are outside this header. */
void vma_pgtable_walk_begin(struct vm_area_struct *vma);
void vma_pgtable_walk_end(struct vm_area_struct *vma);

#endif /* _LINUX_MM_H */




































































































































    1 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 




    1 













































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
// SPDX-License-Identifier: GPL-2.0+
/*
 * 2002-10-15  Posix Clocks & timers
 *                           by George Anzinger george@mvista.com
 *                             Copyright (C) 2002 2003 by MontaVista Software.
 *
 * 2004-06-01  Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug.
 *                             Copyright (C) 2004 Boris Hu
 *
 * These are all the functions necessary to implement POSIX clocks & timers
 */
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/mutex.h>
#include <linux/sched/task.h>

#include <linux/uaccess.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/hash.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/export.h>
#include <linux/hashtable.h>
#include <linux/compat.h>
#include <linux/nospec.h>
#include <linux/time_namespace.h>

#include "timekeeping.h"
#include "posix-timers.h"

/* Slab cache pointer; presumably backs struct k_itimer allocations - confirm. */
static struct kmem_cache *posix_timers_cache;

/*
 * Timers are managed in a hash table for lockless lookup. The hash key is
 * constructed from current::signal and the timer ID and the timer is
 * matched against current::signal and the timer ID when walking the hash
 * bucket list.
 *
 * This allows checkpoint/restore to reconstruct the exact timer IDs for
 * a process.
 */
/* 9 hash bits, i.e. 512 buckets; hash_lock is taken when adding timers. */
static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
static DEFINE_SPINLOCK(hash_lock);

/* Forward declarations of clock tables/objects defined later in the file. */
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;

/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
                        ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif

static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);

/*
 * lock_timer - look up the timer with ID @tid and lock it
 *
 * Evaluates to the struct k_itimer pointer returned by __lock_timer(), which
 * is NULL when the lookup or validation failed. On success timer::it_lock is
 * held and the saved interrupt state is stored in *@flags; release it with
 * unlock_timer().
 *
 * NOTE(review): __cond_lock() is presumably only a static checker annotation
 * for the conditional acquisition of it_lock - confirm against its definition.
 */
#define lock_timer(tid, flags)                                                   \
({        struct k_itimer *__timr;                                           \
        __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags));  \
        __timr;                                                                   \
})

/*
 * Compute the index into posix_timers_hashtable for the timer with ID @nr
 * which is owned by @sig. The 32-bit hash of the signal struct pointer is
 * XORed with the ID and the result is reduced to the table's bit width.
 */
static int hash(struct signal_struct *sig, unsigned int nr)
{
        unsigned int key = hash32_ptr(sig) ^ nr;

        return hash_32(key, HASH_BITS(posix_timers_hashtable));
}

/*
 * Search the hash bucket @head for the timer which is owned by @sig and
 * carries the ID @id.
 *
 * The callers visible in this file either hold hash_lock or are inside an
 * RCU read side section.
 *
 * Returns the matching timer or NULL when the bucket contains none.
 */
static struct k_itimer *__posix_timers_find(struct hlist_head *head,
                                            struct signal_struct *sig,
                                            timer_t id)
{
        struct k_itimer *pos;

        hlist_for_each_entry_rcu(pos, head, t_hash, lockdep_is_held(&hash_lock)) {
                /* it_signal can be written concurrently, so read it once */
                if (READ_ONCE(pos->it_signal) != sig)
                        continue;
                if (pos->it_id == id)
                        return pos;
        }
        return NULL;
}

/*
 * Look up the timer with ID @id which belongs to the signal struct of the
 * current task. Returns the timer or NULL when there is no such timer.
 */
static struct k_itimer *posix_timer_by_id(timer_t id)
{
        struct signal_struct *owner = current->signal;

        return __posix_timers_find(&posix_timers_hashtable[hash(owner, id)],
                                   owner, id);
}

/*
 * Allocate a timer ID for current::signal and insert @timer into the hash.
 *
 * Candidate IDs are taken from signal_struct::next_posix_timer_id one at a
 * time. Each candidate is checked under hash_lock against the timers of this
 * process which are already in the hash bucket; the first unused candidate
 * wins and @timer is hashed under it.
 *
 * Returns the allocated ID on success or -EAGAIN when no candidate was free.
 */
static int posix_timer_add(struct k_itimer *timer)
{
        struct signal_struct *sig = current->signal;
        struct hlist_head *head;
        unsigned int cnt, id;

        /*
         * FIXME: Replace this by a per signal struct xarray once there is
         * a plan to handle the resulting CRIU regression gracefully.
         */
        for (cnt = 0; cnt <= INT_MAX; cnt++) {
                /* The lock is taken and dropped for every candidate ID */
                spin_lock(&hash_lock);
                id = sig->next_posix_timer_id;

                /* Write the next ID back. Clamp it to the positive space */
                sig->next_posix_timer_id = (id + 1) & INT_MAX;

                head = &posix_timers_hashtable[hash(sig, id)];
                if (!__posix_timers_find(head, sig, id)) {
                        /* ID is unused by this process: publish the timer */
                        hlist_add_head_rcu(&timer->t_hash, head);
                        spin_unlock(&hash_lock);
                        return id;
                }
                spin_unlock(&hash_lock);
        }
        /* POSIX return code when no timer ID could be allocated */
        return -EAGAIN;
}

/*
 * Release @timr::it_lock and restore the interrupt state saved in @flags.
 * Counterpart of lock_timer().
 */
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
{
        spin_unlock_irqrestore(&timr->it_lock, flags);
}

/*
 * Store the current time obtained from ktime_get_real_ts64() in @tp.
 * @which_clock is not used. Always returns 0.
 */
static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
        ktime_get_real_ts64(tp);
        return 0;
}

/* Return the value of ktime_get_real(). @which_clock is not used. */
static ktime_t posix_get_realtime_ktime(clockid_t which_clock)
{
        return ktime_get_real();
}

/*
 * Set the time to @tp by handing it to do_sys_settimeofday64() without a
 * timezone argument. @which_clock is not used. The return value of
 * do_sys_settimeofday64() is passed through.
 */
static int posix_clock_realtime_set(const clockid_t which_clock,
                                    const struct timespec64 *tp)
{
        return do_sys_settimeofday64(tp, NULL);
}

/*
 * Forward the adjustment request @t to do_adjtimex() and pass its return
 * value through. @which_clock is not used.
 */
static int posix_clock_realtime_adj(const clockid_t which_clock,
                                    struct __kernel_timex *t)
{
        return do_adjtimex(t);
}

/*
 * Store the time obtained from ktime_get_ts64() in @tp and let
 * timens_add_monotonic() adjust it. @which_clock is not used.
 * Always returns 0.
 */
static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
        ktime_get_ts64(tp);
        timens_add_monotonic(tp);
        return 0;
}

/*
 * Return the value of ktime_get(). In contrast to the timespec variant no
 * timens_add_monotonic() adjustment is applied here. @which_clock is not
 * used.
 */
static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
{
        return ktime_get();
}

/*
 * Store the time obtained from ktime_get_raw_ts64() in @tp and let
 * timens_add_monotonic() adjust it. @which_clock is not used.
 * Always returns 0.
 */
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
        ktime_get_raw_ts64(tp);
        timens_add_monotonic(tp);
        return 0;
}

/*
 * Store the time obtained from ktime_get_coarse_real_ts64() in @tp.
 * @which_clock is not used. Always returns 0.
 */
static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
{
        ktime_get_coarse_real_ts64(tp);
        return 0;
}

/*
 * Store the time obtained from ktime_get_coarse_ts64() in @tp and let
 * timens_add_monotonic() adjust it. @which_clock is not used.
 * Always returns 0.
 */
static int posix_get_monotonic_coarse(clockid_t which_clock,
                                                struct timespec64 *tp)
{
        ktime_get_coarse_ts64(tp);
        timens_add_monotonic(tp);
        return 0;
}

/*
 * Report KTIME_LOW_RES, converted to a timespec64, as the resolution in @tp.
 * @which_clock is not used. Always returns 0.
 */
static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp)
{
        *tp = ktime_to_timespec64(KTIME_LOW_RES);
        return 0;
}

/*
 * Store the time obtained from ktime_get_boottime_ts64() in @tp and let
 * timens_add_boottime() adjust it. @which_clock is not used.
 * Always returns 0.
 */
static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp)
{
        ktime_get_boottime_ts64(tp);
        timens_add_boottime(tp);
        return 0;
}

/*
 * Return the value of ktime_get_boottime(). In contrast to the timespec
 * variant no timens_add_boottime() adjustment is applied here.
 * @which_clock is not used.
 */
static ktime_t posix_get_boottime_ktime(const clockid_t which_clock)
{
        return ktime_get_boottime();
}

/*
 * Store the time obtained from ktime_get_clocktai_ts64() in @tp.
 * @which_clock is not used. Always returns 0.
 */
static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp)
{
        ktime_get_clocktai_ts64(tp);
        return 0;
}

/* Return the value of ktime_get_clocktai(). @which_clock is not used. */
static ktime_t posix_get_tai_ktime(clockid_t which_clock)
{
        return ktime_get_clocktai();
}

/*
 * Report the resolution in @tp: zero seconds and hrtimer_resolution
 * nanoseconds. @which_clock is not used. Always returns 0.
 */
static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
{
        tp->tv_sec = 0;
        tp->tv_nsec = hrtimer_resolution;
        return 0;
}

/*
 * Create the slab cache from which alloc_posix_timer() allocates struct
 * k_itimer objects. Registered via __initcall(). Always returns 0.
 *
 * NOTE(review): the result is not checked; presumably SLAB_PANIC makes a
 * failed cache creation fatal - confirm against the slab documentation.
 */
static __init int init_posix_timers(void)
{
        posix_timers_cache = kmem_cache_create("posix_timers_cache",
                                        sizeof(struct k_itimer), 0,
                                        SLAB_PANIC | SLAB_ACCOUNT, NULL);
        return 0;
}
__initcall(init_posix_timers);

/*
 * The siginfo si_overrun field and the return value of timer_getoverrun(2)
 * are of type int. Clamp the overrun value to INT_MAX
 */
static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval)
{
        s64 sum = timr->it_overrun_last + (s64)baseval;

        return sum > (s64)INT_MAX ? INT_MAX : (int)sum;
}

static void common_hrtimer_rearm(struct k_itimer *timr)
{
        struct hrtimer *timer = &timr->it.real.timer;

        timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
                                            timr->it_interval);
        hrtimer_restart(timer);
}

/*
 * This function is called from the signal delivery code if
 * info->si_sys_private is not zero, which indicates that the timer has to
 * be rearmed. Restart the timer and update info::si_overrun.
 */
void posixtimer_rearm(struct kernel_siginfo *info)
{
        struct k_itimer *timr;
        unsigned long flags;

        /* The lookup fails when the timer does not exist (anymore) */
        timr = lock_timer(info->si_tid, &flags);
        if (!timr)
                return;

        /*
         * Rearm only interval timers, and only when the requeue sequence
         * stored in the signal still matches the one of the timer.
         */
        if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) {
                timr->kclock->timer_rearm(timr);

                timr->it_active = 1;
                /*
                 * Cache the overruns accumulated so far for reporting and
                 * reset the running counter to -1, the same value
                 * do_timer_create() starts with.
                 */
                timr->it_overrun_last = timr->it_overrun;
                timr->it_overrun = -1LL;
                ++timr->it_requeue_pending;

                /* Reported value: cached overruns plus info::si_overrun, clamped */
                info->si_overrun = timer_overrun_to_int(timr, info->si_overrun);
        }

        unlock_timer(timr, flags);
}

int posix_timer_event(struct k_itimer *timr, int si_private)
{
        enum pid_type type;
        int ret;
        /*
         * FIXME: if ->sigq is queued we can race with
         * dequeue_signal()->posixtimer_rearm().
         *
         * If dequeue_signal() sees the "right" value of
         * si_sys_private it calls posixtimer_rearm().
         * We re-queue ->sigq and drop ->it_lock().
         * posixtimer_rearm() locks the timer
         * and re-schedules it while ->sigq is pending.
         * Not really bad, but not that we want.
         */
        timr->sigq->info.si_sys_private = si_private;

        type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID;
        ret = send_sigqueue(timr->sigq, timr->it_pid, type);
        /* If we failed to send the signal the timer stops. */
        return ret > 0;
}

/*
 * This function gets called when a POSIX.1b interval timer expires from
 * the HRTIMER interrupt (soft interrupt on RT kernels).
 *
 * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI
 * based timers.
 */
static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
{
        enum hrtimer_restart ret = HRTIMER_NORESTART;
        struct k_itimer *timr;
        unsigned long flags;
        int si_private = 0;

        /* @timer is embedded in the k_itimer, get the container */
        timr = container_of(timer, struct k_itimer, it.real.timer);
        spin_lock_irqsave(&timr->it_lock, flags);

        timr->it_active = 0;
        /*
         * Interval timers hand the incremented requeue sequence to the
         * signal so that posixtimer_rearm() can match it later. One-shot
         * timers pass 0.
         */
        if (timr->it_interval != 0)
                si_private = ++timr->it_requeue_pending;

        if (posix_timer_event(timr, si_private)) {
                /*
                 * The signal was not queued due to SIG_IGN. As a
                 * consequence the timer is not going to be rearmed from
                 * the signal delivery path. But as a real signal handler
                 * can be installed later the timer must be rearmed here.
                 */
                if (timr->it_interval != 0) {
                        ktime_t now = hrtimer_cb_get_time(timer);

                        /*
                         * FIXME: What we really want, is to stop this
                         * timer completely and restart it in case the
                         * SIG_IGN is removed. This is a non trivial
                         * change to the signal handling code.
                         *
                         * For now let timers with an interval less than a
                         * jiffy expire every jiffy and recheck for a
                         * valid signal handler.
                         *
                         * This avoids interrupt starvation in case of a
                         * very small interval, which would expire the
                         * timer immediately again.
                         *
                         * Moving now ahead of time by one jiffy tricks
                         * hrtimer_forward() to expire the timer later,
                         * while it still maintains the overrun accuracy
                         * for the price of a slight inconsistency in the
                         * timer_gettime() case. This is at least better
                         * than a timer storm.
                         *
                         * Only required when high resolution timers are
                         * enabled as the periodic tick based timers are
                         * automatically aligned to the next tick.
                         */
                        if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) {
                                ktime_t kj = TICK_NSEC;

                                if (timr->it_interval < kj)
                                        now = ktime_add(now, kj);
                        }

                        timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval);
                        /* Tell the hrtimer core to requeue the timer */
                        ret = HRTIMER_RESTART;
                        ++timr->it_requeue_pending;
                        timr->it_active = 1;
                }
        }

        unlock_timer(timr, flags);
        return ret;
}

/*
 * Validate the notification request in @event and determine the signal
 * target.
 *
 * SIGEV_NONE needs no further validation. SIGEV_SIGNAL and SIGEV_THREAD
 * require a signal number in the range 1..SIGRTMAX and target the thread
 * group of the current task. SIGEV_SIGNAL | SIGEV_THREAD_ID additionally
 * requires that sigev_notify_thread_id resolves to a task in the thread
 * group of the current task, which then becomes the target.
 *
 * Returns the target pid, or NULL when @event is invalid. No reference is
 * taken on the returned pid; do_timer_create() calls this under
 * rcu_read_lock() and takes the reference itself.
 */
static struct pid *good_sigevent(sigevent_t *event)
{
        const int notify = event->sigev_notify;
        struct pid *target = task_tgid(current);
        struct task_struct *thread;

        /* No signal is sent, so there is nothing else to check */
        if (notify == SIGEV_NONE)
                return target;

        if (notify == (SIGEV_SIGNAL | SIGEV_THREAD_ID)) {
                target = find_vpid(event->sigev_notify_thread_id);
                thread = pid_task(target, PIDTYPE_PID);
                if (!thread || !same_thread_group(thread, current))
                        return NULL;
        } else if (notify != SIGEV_SIGNAL && notify != SIGEV_THREAD) {
                /* Unknown notification mode */
                return NULL;
        }

        if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
                return NULL;

        return target;
}

/*
 * Allocate a zeroed struct k_itimer from posix_timers_cache together with
 * its signal queue entry. The siginfo of the queue entry is cleared.
 *
 * Returns the new timer or NULL when either allocation failed.
 */
static struct k_itimer *alloc_posix_timer(void)
{
        struct k_itimer *tmr;

        tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
        if (!tmr)
                return NULL;

        tmr->sigq = sigqueue_alloc();
        if (unlikely(!tmr->sigq)) {
                /* Without a sigqueue the timer is useless, undo the allocation */
                kmem_cache_free(posix_timers_cache, tmr);
                return NULL;
        }

        clear_siginfo(&tmr->sigq->info);
        return tmr;
}

/*
 * RCU callback queued by posix_timer_free(): hand the k_itimer which embeds
 * @head back to posix_timers_cache.
 */
static void k_itimer_rcu_free(struct rcu_head *head)
{
        struct k_itimer *tmr = container_of(head, struct k_itimer, rcu);

        kmem_cache_free(posix_timers_cache, tmr);
}

/*
 * Release the resources attached to @tmr: drop the reference on it_pid,
 * free the signal queue entry and queue the timer itself for freeing via
 * call_rcu(), so it is handed back to the cache by k_itimer_rcu_free().
 *
 * This does not remove @tmr from the hash; see
 * posix_timer_unhash_and_free() for that.
 */
static void posix_timer_free(struct k_itimer *tmr)
{
        put_pid(tmr->it_pid);
        sigqueue_free(tmr->sigq);
        call_rcu(&tmr->rcu, k_itimer_rcu_free);
}

/*
 * Remove @tmr from the timer hash under hash_lock and then release it via
 * posix_timer_free().
 */
static void posix_timer_unhash_and_free(struct k_itimer *tmr)
{
        spin_lock(&hash_lock);
        hlist_del_rcu(&tmr->t_hash);
        spin_unlock(&hash_lock);
        posix_timer_free(tmr);
}

/*
 * Initialize the hrtimer embedded in @new_timer for the clock
 * @new_timer::it_clock with mode 0. Always returns 0.
 */
static int common_timer_create(struct k_itimer *new_timer)
{
        hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
        return 0;
}

/*
 * Create a POSIX.1b interval timer.
 *
 * @which_clock:        clock ID the timer is based on
 * @event:                kernel copy of the user supplied sigevent, or NULL
 *                        for the default (SIGALRM to the thread group with
 *                        the timer ID as signal value)
 * @created_timer_id:        user space pointer which receives the new timer ID
 *
 * Returns 0 on success, -EINVAL for an unknown clock or an invalid @event,
 * -EOPNOTSUPP when the clock has no timer_create callback, -EAGAIN when
 * the timer or its ID could not be allocated, -EFAULT when the ID could
 * not be copied to user space, or the error returned by the clock specific
 * timer_create callback.
 */
static int do_timer_create(clockid_t which_clock, struct sigevent *event,
                           timer_t __user *created_timer_id)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct k_itimer *new_timer;
        int error, new_timer_id;

        if (!kc)
                return -EINVAL;
        if (!kc->timer_create)
                return -EOPNOTSUPP;

        new_timer = alloc_posix_timer();
        if (unlikely(!new_timer))
                return -EAGAIN;

        spin_lock_init(&new_timer->it_lock);

        /*
         * Add the timer to the hash table. The timer is not yet valid
         * because new_timer::it_signal is still NULL. The timer id is also
         * not yet visible to user space.
         */
        new_timer_id = posix_timer_add(new_timer);
        if (new_timer_id < 0) {
                /* Not hashed, so freeing without unhashing is sufficient */
                posix_timer_free(new_timer);
                return new_timer_id;
        }

        new_timer->it_id = (timer_t) new_timer_id;
        new_timer->it_clock = which_clock;
        new_timer->kclock = kc;
        new_timer->it_overrun = -1LL;

        if (event) {
                /* good_sigevent() returns an unreferenced pid, hence RCU */
                rcu_read_lock();
                new_timer->it_pid = get_pid(good_sigevent(event));
                rcu_read_unlock();
                if (!new_timer->it_pid) {
                        error = -EINVAL;
                        goto out;
                }
                new_timer->it_sigev_notify     = event->sigev_notify;
                new_timer->sigq->info.si_signo = event->sigev_signo;
                new_timer->sigq->info.si_value = event->sigev_value;
        } else {
                /* No sigevent supplied: SIGALRM carrying the timer ID */
                new_timer->it_sigev_notify     = SIGEV_SIGNAL;
                new_timer->sigq->info.si_signo = SIGALRM;
                memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t));
                new_timer->sigq->info.si_value.sival_int = new_timer->it_id;
                new_timer->it_pid = get_pid(task_tgid(current));
        }

        new_timer->sigq->info.si_tid   = new_timer->it_id;
        new_timer->sigq->info.si_code  = SI_TIMER;

        if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) {
                error = -EFAULT;
                goto out;
        }
        /*
         * After successful copy out, the timer ID is visible to user space
         * now but not yet valid because new_timer::it_signal is still NULL.
         *
         * Complete the initialization with the clock specific create
         * callback.
         */
        error = kc->timer_create(new_timer);
        if (error)
                goto out;

        spin_lock_irq(&current->sighand->siglock);
        /* This makes the timer valid in the hash table */
        WRITE_ONCE(new_timer->it_signal, current->signal);
        list_add(&new_timer->list, &current->signal->posix_timers);
        spin_unlock_irq(&current->sighand->siglock);
        /*
         * After unlocking sighand::siglock @new_timer is subject to
         * concurrent removal and cannot be touched anymore
         */
        return 0;
out:
        /* The timer is hashed at this point, so it has to be unhashed too */
        posix_timer_unhash_and_free(new_timer);
        return error;
}

/*
 * timer_create(2): copy the optional sigevent from user space and hand the
 * request over to do_timer_create(). A NULL @timer_event_spec selects the
 * default notification. Returns -EFAULT when the sigevent cannot be read,
 * otherwise the result of do_timer_create().
 */
SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
                struct sigevent __user *, timer_event_spec,
                timer_t __user *, created_timer_id)
{
        sigevent_t event;

        if (!timer_event_spec)
                return do_timer_create(which_clock, NULL, created_timer_id);

        if (copy_from_user(&event, timer_event_spec, sizeof(event)))
                return -EFAULT;

        return do_timer_create(which_clock, &event, created_timer_id);
}

#ifdef CONFIG_COMPAT
/*
 * Compat variant of timer_create(2): convert the optional compat sigevent
 * with get_compat_sigevent() and hand the request over to
 * do_timer_create(). Returns -EFAULT when the conversion fails, otherwise
 * the result of do_timer_create().
 */
COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
                       struct compat_sigevent __user *, timer_event_spec,
                       timer_t __user *, created_timer_id)
{
        sigevent_t event;

        if (!timer_event_spec)
                return do_timer_create(which_clock, NULL, created_timer_id);

        if (get_compat_sigevent(&event, timer_event_spec))
                return -EFAULT;

        return do_timer_create(which_clock, &event, created_timer_id);
}
#endif

/*
 * Look up the timer with ID @timer_id of the current process and lock it.
 *
 * Returns the timer with timer::it_lock held and the saved interrupt state
 * stored in *@flags, or NULL when @timer_id is out of range, no such timer
 * is hashed, or the timer is not (or no longer) valid for current::signal.
 * Use the lock_timer() wrapper rather than calling this directly.
 */
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
        struct k_itimer *timr;

        /*
         * timer_t could be any type >= int and we want to make sure any
         * @timer_id outside positive int range fails lookup.
         */
        if ((unsigned long long)timer_id > INT_MAX)
                return NULL;

        /*
         * The hash lookup and the timers are RCU protected.
         *
         * Timers are added to the hash in invalid state where
         * timr::it_signal == NULL. timer::it_signal is only set after the
         * rest of the initialization succeeded.
         *
         * Timer destruction happens in steps:
         *  1) Set timr::it_signal to NULL with timr::it_lock held
         *  2) Release timr::it_lock
         *  3) Remove from the hash under hash_lock
         *  4) Call RCU for removal after the grace period
         *
         * Holding rcu_read_lock() across the lookup ensures that
         * the timer cannot be freed.
         *
         * The lookup validates locklessly that timr::it_signal ==
         * current::signal and timr::it_id == @timer_id. timr::it_id
         * can't change, but timr::it_signal becomes NULL during
         * destruction.
         */
        rcu_read_lock();
        timr = posix_timer_by_id(timer_id);
        if (timr) {
                spin_lock_irqsave(&timr->it_lock, *flags);
                /*
                 * Validate under timr::it_lock that timr::it_signal is
                 * still valid. Pairs with #1 above.
                 */
                if (timr->it_signal == current->signal) {
                        /* it_lock is held now, RCU protection can be dropped */
                        rcu_read_unlock();
                        return timr;
                }
                spin_unlock_irqrestore(&timr->it_lock, *flags);
        }
        rcu_read_unlock();

        return NULL;
}

/*
 * Return the time which remains until the hrtimer backing @timr expires,
 * relative to @now, as computed by __hrtimer_expires_remaining_adjusted().
 */
static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now)
{
        return __hrtimer_expires_remaining_adjusted(&timr->it.real.timer, now);
}

/*
 * Forward the hrtimer backing @timr by multiples of the timer interval
 * relative to @now. Returns the value of hrtimer_forward(), which the
 * callers add to the overrun count.
 */
static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now)
{
        return hrtimer_forward(&timr->it.real.timer, now, timr->it_interval);
}

/*
 * Get the time remaining on a POSIX.1b interval timer.
 *
 * Two issues to handle here:
 *
 *  1) The timer has a requeue pending. The return value must appear as
 *     if the timer has been requeued right now.
 *
 *  2) The timer is a SIGEV_NONE timer. These timers are never enqueued
 *     into the hrtimer queue and therefore never expired. Emulate expiry
 *     here taking #1 into account.
 */
void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
        const struct k_clock *kc = timr->kclock;
        ktime_t now, remaining, iv;
        bool sig_none;

        /*
         * Not every path below writes every field of @cur_setting. The
         * callers visible in this file zero it beforehand, so untouched
         * fields read back as 0.
         */
        sig_none = timr->it_sigev_notify == SIGEV_NONE;
        iv = timr->it_interval;

        /* interval timer ? */
        if (iv) {
                cur_setting->it_interval = ktime_to_timespec64(iv);
        } else if (!timr->it_active) {
                /*
                 * SIGEV_NONE oneshot timers are never queued and therefore
                 * timr->it_active is always false. The check below
                 * vs. remaining time will handle this case.
                 *
                 * For all other timers there is nothing to update here, so
                 * return.
                 */
                if (!sig_none)
                        return;
        }

        /* Sample the clock once; forward and remaining both use this value */
        now = kc->clock_get_ktime(timr->it_clock);

        /*
         * If this is an interval timer and either has requeue pending or
         * is a SIGEV_NONE timer move the expiry time forward by intervals,
         * so expiry is > now.
         */
        if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none))
                timr->it_overrun += kc->timer_forward(timr, now);

        remaining = kc->timer_remaining(timr, now);
        /*
         * As @now is retrieved before a possible timer_forward() and
         * cannot be reevaluated by the compiler @remaining is based on the
         * same @now value. Therefore @remaining is consistent vs. @now.
         *
         * Consequently all interval timers, i.e. @iv > 0, cannot have a
         * remaining time <= 0 because timer_forward() guarantees to move
         * them forward so that the next timer expiry is > @now.
         */
        if (remaining <= 0) {
                /*
                 * A single shot SIGEV_NONE timer must return 0, when it is
                 * expired! Timers which have a real signal delivery mode
                 * must return a remaining time greater than 0 because the
                 * signal has not yet been delivered.
                 */
                if (!sig_none)
                        cur_setting->it_value.tv_nsec = 1;
        } else {
                cur_setting->it_value = ktime_to_timespec64(remaining);
        }
}

/*
 * Common backend of the timer_gettime() syscall variants.
 *
 * Locks the timer with ID @timer_id, zeroes @setting and lets the clock
 * specific timer_get callback fill it in.
 *
 * Returns 0 on success or -EINVAL when the timer does not exist or its
 * clock lacks a timer_get callback. @setting is left untouched when the
 * timer does not exist.
 */
static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
{
        const struct k_clock *kc;
        struct k_itimer *timr;
        unsigned long irqflags;
        int ret = -EINVAL;

        timr = lock_timer(timer_id, &irqflags);
        if (!timr)
                return -EINVAL;

        memset(setting, 0, sizeof(*setting));

        kc = timr->kclock;
        if (!WARN_ON_ONCE(!kc || !kc->timer_get)) {
                kc->timer_get(timr, setting);
                ret = 0;
        }

        unlock_timer(timr, irqflags);
        return ret;
}

/*
 * timer_gettime(2): get the time remaining on a POSIX.1b interval timer and
 * copy it to user space. Returns the error of do_timer_gettime(), -EFAULT
 * when the result cannot be written to @setting, or 0 on success.
 */
SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
                struct __kernel_itimerspec __user *, setting)
{
        struct itimerspec64 cur_setting;
        int ret;

        ret = do_timer_gettime(timer_id, &cur_setting);
        if (ret)
                return ret;

        if (put_itimerspec64(&cur_setting, setting))
                return -EFAULT;

        return 0;
}

#ifdef CONFIG_COMPAT_32BIT_TIME

/*
 * Variant of timer_gettime(2) which reports the result as struct
 * old_itimerspec32. Returns the error of do_timer_gettime(), -EFAULT when
 * the result cannot be written to @setting, or 0 on success.
 */
SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
                struct old_itimerspec32 __user *, setting)
{
        struct itimerspec64 cur_setting;
        int ret;

        ret = do_timer_gettime(timer_id, &cur_setting);
        if (ret)
                return ret;

        if (put_old_itimerspec32(&cur_setting, setting))
                return -EFAULT;

        return 0;
}

#endif

/**
 * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer
 * @timer_id:        The timer ID which identifies the timer
 *
 * The "overrun count" of a timer is one plus the number of expiration
 * intervals which have elapsed between the first expiry, which queues the
 * signal and the actual signal delivery. On signal delivery the "overrun
 * count" is calculated and cached, so it can be returned directly here.
 *
 * As this is relative to the last queued signal the returned overrun count
 * is meaningless outside of the signal delivery path and even there it
 * does not accurately reflect the current state when user space evaluates
 * it.
 *
 * The cached count starts out as 0 for a newly allocated timer and is reset
 * to 0 by common_timer_set(), so 0 is a possible return value.
 *
 * Returns:
 *        -EINVAL                @timer_id is invalid
 *        0..INT_MAX        The number of overruns related to the last delivered signal
 */
SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
{
        struct k_itimer *timr;
        unsigned long flags;
        int overrun;

        timr = lock_timer(timer_id, &flags);
        if (!timr)
                return -EINVAL;

        /* Base value 0: report the cached count only, clamped to INT_MAX */
        overrun = timer_overrun_to_int(timr, 0);
        unlock_timer(timr, flags);

        return overrun;
}

/*
 * Set up and, unless the timer is SIGEV_NONE, start the hrtimer backing
 * @timr.
 *
 * @timr:        The POSIX timer to arm
 * @expires:        Expiry time; relative to the current time of the hrtimer
 *                clock base unless @absolute is set
 * @absolute:        @expires is an absolute time
 * @sigev_none:        Only record the expiry time, do not enqueue the hrtimer
 */
static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
                               bool absolute, bool sigev_none)
{
        struct hrtimer *hrt = &timr->it.real.timer;
        enum hrtimer_mode mode = HRTIMER_MODE_REL;

        if (absolute)
                mode = HRTIMER_MODE_ABS;

        /*
         * Posix magic: Relative CLOCK_REALTIME timers are not affected by
         * clock modifications, so they become CLOCK_MONOTONIC based under
         * the hood. See hrtimer_init(). Keep timr->kclock in sync, so the
         * generic functions which use timr->kclock->clock_get_*() work.
         *
         * Note: it_clock stays unmodified, because the next timer_set()
         * might use ABSTIME, so it needs to switch back.
         */
        if (timr->it_clock == CLOCK_REALTIME) {
                if (absolute)
                        timr->kclock = &clock_realtime;
                else
                        timr->kclock = &clock_monotonic;
        }

        hrtimer_init(hrt, timr->it_clock, mode);
        hrt->function = posix_timer_fn;

        /* Convert a relative expiry into an absolute one */
        if (!absolute)
                expires = ktime_add_safe(expires, hrt->base->get_time());
        hrtimer_set_expires(hrt, expires);

        if (!sigev_none)
                hrtimer_start_expires(hrt, HRTIMER_MODE_ABS);
}

/*
 * Try to cancel the hrtimer backing @timr and pass the return value of
 * hrtimer_try_to_cancel() through. The callers in this file treat a
 * negative value as "retry".
 */
static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
{
        return hrtimer_try_to_cancel(&timr->it.real.timer);
}

/*
 * Hand the hrtimer backing @timer to hrtimer_cancel_wait_running(), i.e.
 * wait for a concurrently running expiry callback of that hrtimer.
 */
static void common_timer_wait_running(struct k_itimer *timer)
{
        hrtimer_cancel_wait_running(&timer->it.real.timer);
}

/*
 * On PREEMPT_RT this prevents priority inversion and a potential livelock
 * against the ksoftirqd thread in case that ksoftirqd gets preempted while
 * executing a hrtimer callback.
 *
 * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this
 * just results in a cpu_relax().
 *
 * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is
 * just a cpu_relax(). With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this
 * prevents spinning on an eventually scheduled out task and a livelock
 * when the task which tries to delete or disarm the timer has preempted
 * the task which runs the expiry in task work context.
 */
static struct k_itimer *timer_wait_running(struct k_itimer *timer,
                                           unsigned long *flags)
{
        /* Snapshot what is needed later while timer::it_lock is still held */
        const struct k_clock *kc = READ_ONCE(timer->kclock);
        timer_t timer_id = READ_ONCE(timer->it_id);

        /* Prevent kfree(timer) after dropping the lock */
        rcu_read_lock();
        unlock_timer(timer, *flags);

        /*
         * kc->timer_wait_running() might drop RCU lock. So @timer
         * cannot be touched anymore after the function returns!
         */
        if (!WARN_ON_ONCE(!kc->timer_wait_running))
                kc->timer_wait_running(timer);

        rcu_read_unlock();
        /*
         * Relock the timer. It might be no longer hashed, in which case
         * the lookup by ID fails and NULL is returned.
         */
        return lock_timer(timer_id, flags);
}

/*
 * Set a POSIX.1b interval timer.
 *
 * @timr:        The timer to (re)program
 * @flags:        TIMER_ABSTIME or 0
 * @new_setting: The new expiry value and interval
 * @old_setting: When not NULL, filled with the previous setting via
 *                 common_timer_get() before anything is modified
 *
 * Returns 0 on success or TIMER_RETRY when the timer could not be
 * cancelled; the caller then has to wait and retry.
 */
int common_timer_set(struct k_itimer *timr, int flags,
                     struct itimerspec64 *new_setting,
                     struct itimerspec64 *old_setting)
{
        const struct k_clock *kc = timr->kclock;
        bool sigev_none;
        ktime_t expires;

        if (old_setting)
                common_timer_get(timr, old_setting);

        /* Prevent rearming by clearing the interval */
        timr->it_interval = 0;
        /*
         * Careful here. On SMP systems the timer expiry function could be
         * active and spinning on timr->it_lock.
         */
        if (kc->timer_try_to_cancel(timr) < 0)
                return TIMER_RETRY;

        timr->it_active = 0;
        /* Advance the requeue sequence and clear the REQUEUE_PENDING bit */
        timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
                ~REQUEUE_PENDING;
        timr->it_overrun_last = 0;

        /* Switch off the timer when it_value is zero */
        if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
                return 0;

        timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
        expires = timespec64_to_ktime(new_setting->it_value);
        /* Absolute expiry values are passed through timens_ktime_to_host() */
        if (flags & TIMER_ABSTIME)
                expires = timens_ktime_to_host(timr->it_clock, expires);
        sigev_none = timr->it_sigev_notify == SIGEV_NONE;

        kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
        /* SIGEV_NONE timers are never marked active */
        timr->it_active = !sigev_none;
        return 0;
}

/*
 * Common backend of the timer_settime() syscall variants.
 *
 * @timer_id:        ID of the timer to program
 * @tmr_flags:        Flags handed to the clock specific timer_set callback
 * @new_spec64:        The new setting
 * @old_spec64:        When not NULL, zeroed and then filled with the old setting
 *
 * Returns -EINVAL when a timespec in @new_spec64 is invalid, the timer
 * does not exist (anymore) or its clock lacks a timer_set callback.
 * Otherwise the return value of the timer_set callback is returned, except
 * for TIMER_RETRY, which is handled here by waiting and retrying.
 */
static int do_timer_settime(timer_t timer_id, int tmr_flags,
                            struct itimerspec64 *new_spec64,
                            struct itimerspec64 *old_spec64)
{
        const struct k_clock *kc;
        struct k_itimer *timr;
        unsigned long flags;
        int error = 0;

        if (!timespec64_valid(&new_spec64->it_interval) ||
            !timespec64_valid(&new_spec64->it_value))
                return -EINVAL;

        if (old_spec64)
                memset(old_spec64, 0, sizeof(*old_spec64));

        timr = lock_timer(timer_id, &flags);
        /*
         * The label sits behind the lock operation because
         * timer_wait_running() relocks the timer itself, but in front of
         * the NULL check because that relock can fail.
         */
retry:
        if (!timr)
                return -EINVAL;

        kc = timr->kclock;
        if (WARN_ON_ONCE(!kc || !kc->timer_set))
                error = -EINVAL;
        else
                error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64);

        if (error == TIMER_RETRY) {
                // We already got the old time...
                old_spec64 = NULL;
                /* Unlocks and relocks the timer if it still exists */
                timr = timer_wait_running(timr, &flags);
                goto retry;
        }
        unlock_timer(timr, flags);

        return error;
}

/*
 * timer_settime(2): set a POSIX.1b interval timer.
 *
 * Returns -EINVAL when @new_setting is NULL, -EFAULT when the new setting
 * cannot be read or the old setting cannot be written back, otherwise the
 * result of do_timer_settime(). The old setting is only copied out when
 * @old_setting is not NULL and the timer was programmed successfully.
 */
SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
                const struct __kernel_itimerspec __user *, new_setting,
                struct __kernel_itimerspec __user *, old_setting)
{
        struct itimerspec64 new_spec, old_spec;
        int error;

        if (!new_setting)
                return -EINVAL;

        if (get_itimerspec64(&new_spec, new_setting))
                return -EFAULT;

        error = do_timer_settime(timer_id, flags, &new_spec,
                                 old_setting ? &old_spec : NULL);
        if (error || !old_setting)
                return error;

        if (put_itimerspec64(&old_spec, old_setting))
                return -EFAULT;

        return 0;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Variant of timer_settime(2) which exchanges the settings as struct
 * old_itimerspec32.
 *
 * Returns -EINVAL when @new is NULL, -EFAULT when the new setting cannot be
 * read or the old setting cannot be written back, otherwise the result of
 * do_timer_settime(). The old setting is only copied out when @old is not
 * NULL and the timer was programmed successfully.
 */
SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags,
                struct old_itimerspec32 __user *, new,
                struct old_itimerspec32 __user *, old)
{
        struct itimerspec64 new_spec, old_spec;
        int error;

        if (!new)
                return -EINVAL;

        if (get_old_itimerspec32(&new_spec, new))
                return -EFAULT;

        error = do_timer_settime(timer_id, flags, &new_spec,
                                 old ? &old_spec : NULL);
        if (error || !old)
                return error;

        if (put_old_itimerspec32(&old_spec, old))
                return -EFAULT;

        return 0;
}
#endif

/*
 * Stop @timer for deletion: clear the interval so it is not rearmed and
 * try to cancel it via the clock specific callback.
 *
 * Returns 0 when the timer is stopped or TIMER_RETRY when the cancel
 * attempt failed and the caller has to wait and retry.
 */
int common_timer_del(struct k_itimer *timer)
{
        const struct k_clock *kc = timer->kclock;
        int cancelled;

        timer->it_interval = 0;

        cancelled = kc->timer_try_to_cancel(timer);
        if (cancelled < 0)
                return TIMER_RETRY;

        timer->it_active = 0;
        return 0;
}

/*
 * Invoke the timer_del() callback of the clock which backs @timer.
 *
 * Warns once and returns -EINVAL when the timer has no clock or the
 * clock has no timer_del() callback. Otherwise returns whatever the
 * callback returns.
 */
static inline int timer_delete_hook(struct k_itimer *timer)
{
        const struct k_clock *kc = timer->kclock;
        int ret = -EINVAL;

        if (!WARN_ON_ONCE(!kc || !kc->timer_del))
                ret = kc->timer_del(timer);

        return ret;
}

/*
 * Delete a POSIX.1b interval timer.
 *
 * Looks up and locks the timer identified by @timer_id, deletes it via
 * the clock specific hook, unlinks it from its list under the caller's
 * siglock and finally unhashes and frees it.
 *
 * Returns 0 on success and -EINVAL when no timer could be locked, either
 * initially or after waiting for a concurrently running expiry.
 */
SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
{
        struct k_itimer *timer;
        unsigned long flags;

        timer = lock_timer(timer_id, &flags);

retry_delete:
        /* Checked again after every timer_wait_running() round */
        if (!timer)
                return -EINVAL;

        /* Only TIMER_RETRY is acted upon; any other hook result proceeds */
        if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) {
                /* Unlocks and relocks the timer if it still exists */
                timer = timer_wait_running(timer, &flags);
                goto retry_delete;
        }

        /* Unlink from the list while holding the caller's siglock */
        spin_lock(&current->sighand->siglock);
        list_del(&timer->list);
        spin_unlock(&current->sighand->siglock);
        /*
         * A concurrent lookup could check timer::it_signal lockless. It
         * will reevaluate with timer::it_lock held and observe the NULL.
         */
        WRITE_ONCE(timer->it_signal, NULL);

        unlock_timer(timer, flags);
        posix_timer_unhash_and_free(timer);
        return 0;
}

/*
 * Delete a timer if it is armed, remove it from the hash and schedule it
 * for RCU freeing.
 *
 * @timer is locked with timer::it_lock for the duration of the deletion.
 * If waiting for a concurrent expiry unexpectedly yields a different
 * timer pointer, a warning is emitted once and the function returns
 * without unlinking or freeing @timer.
 */
static void itimer_delete(struct k_itimer *timer)
{
        unsigned long flags;

        /*
         * irqsave is required to make timer_wait_running() work.
         */
        spin_lock_irqsave(&timer->it_lock, flags);

retry_delete:
        /*
         * Even if the timer is no longer accessible from other tasks
         * it still might be armed and queued in the underlying timer
         * mechanism. Worse, that timer mechanism might run the expiry
         * function concurrently.
         */
        if (timer_delete_hook(timer) == TIMER_RETRY) {
                /*
                 * Timer is expired concurrently, prevent livelocks
                 * and pointless spinning on RT.
                 *
                 * timer_wait_running() drops timer::it_lock, which opens
                 * the possibility for another task to delete the timer.
                 *
                 * That's not possible here because this is invoked from
                 * do_exit() only for the last thread of the thread group.
                 * So no other task can access and delete that timer.
                 */
                if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer))
                        return;

                goto retry_delete;
        }
        /* Unlink the timer from the list it is queued on */
        list_del(&timer->list);

        /*
         * Setting timer::it_signal to NULL is technically not required
         * here as nothing can access the timer anymore legitimately via
         * the hash table. Set it to NULL nevertheless so that all deletion
         * paths are consistent.
         */
        WRITE_ONCE(timer->it_signal, NULL);

        spin_unlock_irqrestore(&timer->it_lock, flags);
        posix_timer_unhash_and_free(timer);
}

/*
 * Invoked from do_exit() when the last thread of a thread group exits.
 * At that point no other task can access the timers of the dying
 * task anymore.
 */
void exit_itimers(struct task_struct *tsk)
{
        struct list_head timers;
        struct k_itimer *tmr;

        if (list_empty(&tsk->signal->posix_timers))
                return;

        /* Protect against concurrent read via /proc/$PID/timers */
        spin_lock_irq(&tsk->sighand->siglock);
        list_replace_init(&tsk->signal->posix_timers, &timers);
        spin_unlock_irq(&tsk->sighand->siglock);

        /* The timers are not longer accessible via tsk::signal */
        while (!list_empty(&timers)) {
                tmr = list_first_entry(&timers, struct k_itimer, list);
                itimer_delete(tmr);
        }
}

/*
 * Set the time of @which_clock from the timespec supplied by user space.
 *
 * Returns -EINVAL when the clock id does not resolve to a clock or the
 * clock has no clock_set() callback, -EFAULT when @tp cannot be read,
 * otherwise the result of the clock's setter.
 */
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
                const struct __kernel_timespec __user *, tp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct timespec64 ts;

        if (!kc)
                return -EINVAL;
        if (!kc->clock_set)
                return -EINVAL;

        if (get_timespec64(&ts, tp))
                return -EFAULT;

        /*
         * Permission checks have to be done inside the clock specific
         * setter callback.
         */
        return kc->clock_set(which_clock, &ts);
}

/*
 * Read @which_clock and store the result in the user space timespec @tp.
 *
 * Returns -EINVAL when the clock id does not resolve to a clock, the
 * error of the clock's clock_get_timespec() callback if it fails, and
 * -EFAULT when the result cannot be copied to @tp.
 */
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
                struct __kernel_timespec __user *, tp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct timespec64 now;
        int ret;

        if (!kc)
                return -EINVAL;

        ret = kc->clock_get_timespec(which_clock, &now);
        if (ret)
                return ret;

        return put_timespec64(&now, tp) ? -EFAULT : 0;
}

/*
 * Forward an adjtime request to the clock_adj() callback of @which_clock.
 *
 * Returns -EINVAL when the clock id does not resolve to a clock,
 * -EOPNOTSUPP when the clock has no clock_adj() callback, otherwise the
 * callback's return value.
 */
int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);

        if (kc && kc->clock_adj)
                return kc->clock_adj(which_clock, ktx);

        return kc ? -EOPNOTSUPP : -EINVAL;
}

/*
 * Adjust @which_clock according to the user supplied timex structure
 * and copy the updated structure back to user space.
 *
 * The structure is only written back when do_clock_adjtime() returns a
 * non-negative value. Returns -EFAULT when either user copy fails,
 * otherwise the result of do_clock_adjtime().
 */
SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
                struct __kernel_timex __user *, utx)
{
        struct __kernel_timex ktx;
        int ret;

        if (copy_from_user(&ktx, utx, sizeof(ktx)))
                return -EFAULT;

        ret = do_clock_adjtime(which_clock, &ktx);
        if (ret < 0)
                return ret;

        if (copy_to_user(utx, &ktx, sizeof(ktx)))
                return -EFAULT;

        return ret;
}

/**
 * sys_clock_getres - Get the resolution of a clock
 * @which_clock:        The clock to get the resolution for
 * @tp:                        Pointer to a user space timespec64 for storage
 *
 * POSIX defines:
 *
 * "The clock_getres() function shall return the resolution of any
 * clock. Clock resolutions are implementation-defined and cannot be set by
 * a process. If the argument res is not NULL, the resolution of the
 * specified clock shall be stored in the location pointed to by res. If
 * res is NULL, the clock resolution is not returned. If the time argument
 * of clock_settime() is not a multiple of res, then the value is truncated
 * to a multiple of res."
 *
 * Due to the various hardware constraints the real resolution can vary
 * wildly and even change during runtime when the underlying devices are
 * replaced. The kernel also can use hardware devices with different
 * resolutions for reading the time and for arming timers.
 *
 * The kernel therefore deviates from the POSIX spec in various aspects:
 *
 * 1) The resolution returned to user space
 *
 *    For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI,
 *    CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALARM and CLOCK_MONOTONIC_RAW
 *    the kernel differentiates only two cases:
 *
 *    I)  Low resolution mode:
 *
 *          When high resolution timers are disabled at compile or runtime
 *          the resolution returned is nanoseconds per tick, which represents
 *          the precision at which timers expire.
 *
 *    II) High resolution mode:
 *
 *          When high resolution timers are enabled the resolution returned
 *          is always one nanosecond independent of the actual resolution of
 *          the underlying hardware devices.
 *
 *          For CLOCK_*_ALARM the actual resolution depends on system
 *          state. When system is running the resolution is the same as the
 *          resolution of the other clocks. During suspend the actual
 *          resolution is the resolution of the underlying RTC device which
 *          might be way less precise than the clockevent device used during
 *          running state.
 *
 *   For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution
 *   returned is always nanoseconds per tick.
 *
 *   For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution
 *   returned is always one nanosecond under the assumption that the
 *   underlying scheduler clock has a better resolution than nanoseconds
 *   per tick.
 *
 *   For dynamic POSIX clocks (PTP devices) the resolution returned is
 *   always one nanosecond.
 *
 * 2) Effect on sys_clock_settime()
 *
 *    The kernel does not truncate the time which is handed in to
 *    sys_clock_settime(). The kernel internal timekeeping is always using
 *    nanoseconds precision independent of the clocksource device which is
 *    used to read the time from. The resolution of that device only
 *    affects the precision of the time returned by sys_clock_gettime().
 *
 * Returns:
 *        0                Success. @tp contains the resolution
 *        -EINVAL                @which_clock is not a valid clock ID
 *        -EFAULT                Copying the resolution to @tp faulted
 *        -ENODEV                Dynamic POSIX clock is not backed by a device
 *        -EOPNOTSUPP        Dynamic POSIX clock does not support getres()
 */
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
                struct __kernel_timespec __user *, tp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct timespec64 res;
        int ret;

        if (!kc)
                return -EINVAL;

        ret = kc->clock_getres(which_clock, &res);

        /* A NULL @tp is allowed: the resolution is then not copied out */
        if (ret || !tp)
                return ret;

        return put_timespec64(&res, tp) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT_32BIT_TIME

/*
 * Variant of clock_settime() for user space which passes the time as
 * struct old_timespec32.
 *
 * Returns -EINVAL when the clock id does not resolve to a clock or the
 * clock has no clock_set() callback, -EFAULT when @tp cannot be read,
 * otherwise the result of the clock's setter.
 */
SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock,
                struct old_timespec32 __user *, tp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct timespec64 new_tp;

        if (!kc)
                return -EINVAL;
        if (!kc->clock_set)
                return -EINVAL;

        if (get_old_timespec32(&new_tp, tp))
                return -EFAULT;

        return kc->clock_set(which_clock, &new_tp);
}

/*
 * Variant of clock_gettime() for user space which receives the time as
 * struct old_timespec32.
 *
 * Returns -EINVAL when the clock id does not resolve to a clock, the
 * error of the clock's clock_get_timespec() callback if it fails, and
 * -EFAULT when the result cannot be copied to @tp.
 */
SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
                struct old_timespec32 __user *, tp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct timespec64 now;
        int ret;

        if (!kc)
                return -EINVAL;

        ret = kc->clock_get_timespec(which_clock, &now);
        if (ret)
                return ret;

        return put_old_timespec32(&now, tp) ? -EFAULT : 0;
}

/*
 * Variant of clock_adjtime() for user space which passes the request as
 * struct old_timex32.
 *
 * A failure of get_old_timex32() is returned as is. The structure is
 * only written back when do_clock_adjtime() returns a non-negative
 * value; a failing write back results in -EFAULT.
 */
SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock,
                struct old_timex32 __user *, utp)
{
        struct __kernel_timex ktx;
        int ret;

        ret = get_old_timex32(&ktx, utp);
        if (ret)
                return ret;

        ret = do_clock_adjtime(which_clock, &ktx);
        if (ret < 0)
                return ret;

        if (put_old_timex32(utp, &ktx))
                return -EFAULT;

        return ret;
}

/*
 * Variant of clock_getres() for user space which receives the
 * resolution as struct old_timespec32.
 *
 * Returns -EINVAL when the clock id does not resolve to a clock, the
 * error of the clock's clock_getres() callback if it fails, and -EFAULT
 * when the result cannot be copied to a non-NULL @tp.
 */
SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
                struct old_timespec32 __user *, tp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct timespec64 res;
        int ret;

        if (!kc)
                return -EINVAL;

        ret = kc->clock_getres(which_clock, &res);

        /* A NULL @tp is allowed: the resolution is then not copied out */
        if (ret || !tp)
                return ret;

        return put_old_timespec32(&res, tp) ? -EFAULT : 0;
}

#endif

/*
 * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI
 *
 * Converts @rqtp to ktime_t and sleeps via hrtimer_nanosleep(), in
 * absolute mode when TIMER_ABSTIME is set in @flags and in relative
 * mode otherwise.
 */
static int common_nsleep(const clockid_t which_clock, int flags,
                         const struct timespec64 *rqtp)
{
        ktime_t expires = timespec64_to_ktime(*rqtp);

        if (flags & TIMER_ABSTIME)
                return hrtimer_nanosleep(expires, HRTIMER_MODE_ABS,
                                         which_clock);

        return hrtimer_nanosleep(expires, HRTIMER_MODE_REL, which_clock);
}

/*
 * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME
 *
 * Absolute nanosleeps for these clocks are time-namespace adjusted:
 * the expiry is passed through timens_ktime_to_host() before sleeping.
 * Relative sleeps use the requested time unmodified.
 */
static int common_nsleep_timens(const clockid_t which_clock, int flags,
                                const struct timespec64 *rqtp)
{
        ktime_t expires = timespec64_to_ktime(*rqtp);

        if (!(flags & TIMER_ABSTIME))
                return hrtimer_nanosleep(expires, HRTIMER_MODE_REL,
                                         which_clock);

        expires = timens_ktime_to_host(which_clock, expires);

        return hrtimer_nanosleep(expires, HRTIMER_MODE_ABS, which_clock);
}

/*
 * Sleep on @which_clock for the time given in @rqtp.
 *
 * Returns -EINVAL when the clock id does not resolve to a clock or the
 * requested time is not a valid timespec, -EOPNOTSUPP when the clock
 * has no nsleep() callback, -EFAULT when @rqtp cannot be read,
 * otherwise the result of the clock's nsleep() callback.
 */
SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
                const struct __kernel_timespec __user *, rqtp,
                struct __kernel_timespec __user *, rmtp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct timespec64 req;

        if (!kc)
                return -EINVAL;
        if (!kc->nsleep)
                return -EOPNOTSUPP;

        if (get_timespec64(&req, rqtp))
                return -EFAULT;
        if (!timespec64_valid(&req))
                return -EINVAL;

        /* Absolute sleeps do not use the remaining time pointer */
        if (flags & TIMER_ABSTIME)
                rmtp = NULL;

        /* Record the user pointer and its type tag in the restart block */
        current->restart_block.fn = do_no_restart_syscall;
        if (rmtp)
                current->restart_block.nanosleep.type = TT_NATIVE;
        else
                current->restart_block.nanosleep.type = TT_NONE;
        current->restart_block.nanosleep.rmtp = rmtp;

        return kc->nsleep(which_clock, flags, &req);
}

#ifdef CONFIG_COMPAT_32BIT_TIME

/*
 * Variant of clock_nanosleep() for user space which passes the times as
 * struct old_timespec32.
 *
 * Returns -EINVAL when the clock id does not resolve to a clock or the
 * requested time is not a valid timespec, -EOPNOTSUPP when the clock
 * has no nsleep() callback, -EFAULT when @rqtp cannot be read,
 * otherwise the result of the clock's nsleep() callback.
 */
SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
                struct old_timespec32 __user *, rqtp,
                struct old_timespec32 __user *, rmtp)
{
        const struct k_clock *kc = clockid_to_kclock(which_clock);
        struct timespec64 req;

        if (!kc)
                return -EINVAL;
        if (!kc->nsleep)
                return -EOPNOTSUPP;

        if (get_old_timespec32(&req, rqtp))
                return -EFAULT;
        if (!timespec64_valid(&req))
                return -EINVAL;

        /* Absolute sleeps do not use the remaining time pointer */
        if (flags & TIMER_ABSTIME)
                rmtp = NULL;

        /* Record the user pointer and its type tag in the restart block */
        current->restart_block.fn = do_no_restart_syscall;
        if (rmtp)
                current->restart_block.nanosleep.type = TT_COMPAT;
        else
                current->restart_block.nanosleep.type = TT_NONE;
        current->restart_block.nanosleep.compat_rmtp = rmtp;

        return kc->nsleep(which_clock, flags, &req);
}

#endif

static const struct k_clock clock_realtime = {
        .clock_getres                = posix_get_hrtimer_res,
        .clock_get_timespec        = posix_get_realtime_timespec,
        .clock_get_ktime        = posix_get_realtime_ktime,
        .clock_set                = posix_clock_realtime_set,
        .clock_adj                = posix_clock_realtime_adj,
        .nsleep                        = common_nsleep,
        .timer_create                = common_timer_create,
        .timer_set                = common_timer_set,
        .timer_get                = common_timer_get,
        .timer_del                = common_timer_del,
        .timer_rearm                = common_hrtimer_rearm,
        .timer_forward                = common_hrtimer_forward,
        .timer_remaining        = common_hrtimer_remaining,
        .timer_try_to_cancel        = common_hrtimer_try_to_cancel,
        .timer_wait_running        = common_timer_wait_running,
        .timer_arm                = common_hrtimer_arm,
};

/*
 * Clock operations installed for CLOCK_MONOTONIC in posix_clocks[].
 * No clock_set/clock_adj callbacks are provided.
 */
static const struct k_clock clock_monotonic = {
        /* Reading the clock */
        .clock_getres           = posix_get_hrtimer_res,
        .clock_get_ktime        = posix_get_monotonic_ktime,
        .clock_get_timespec     = posix_get_monotonic_timespec,

        /* Sleeping, time-namespace adjusted for absolute sleeps */
        .nsleep                 = common_nsleep_timens,

        /* Interval timers: common and hrtimer based helpers */
        .timer_create           = common_timer_create,
        .timer_set              = common_timer_set,
        .timer_get              = common_timer_get,
        .timer_del              = common_timer_del,
        .timer_arm              = common_hrtimer_arm,
        .timer_rearm            = common_hrtimer_rearm,
        .timer_forward          = common_hrtimer_forward,
        .timer_remaining        = common_hrtimer_remaining,
        .timer_try_to_cancel    = common_hrtimer_try_to_cancel,
        .timer_wait_running     = common_timer_wait_running,
};

/*
 * Clock operations installed for CLOCK_MONOTONIC_RAW in posix_clocks[].
 * Only reading the time and the resolution is provided.
 */
static const struct k_clock clock_monotonic_raw = {
        .clock_get_timespec     = posix_get_monotonic_raw,
        .clock_getres           = posix_get_hrtimer_res,
};

/*
 * Clock operations installed for CLOCK_REALTIME_COARSE in posix_clocks[].
 * Only reading the time and the resolution is provided.
 */
static const struct k_clock clock_realtime_coarse = {
        .clock_get_timespec     = posix_get_realtime_coarse,
        .clock_getres           = posix_get_coarse_res,
};

/*
 * Clock operations installed for CLOCK_MONOTONIC_COARSE in posix_clocks[].
 * Only reading the time and the resolution is provided.
 */
static const struct k_clock clock_monotonic_coarse = {
        .clock_get_timespec     = posix_get_monotonic_coarse,
        .clock_getres           = posix_get_coarse_res,
};

/*
 * Clock operations installed for CLOCK_TAI in posix_clocks[].
 * No clock_set/clock_adj callbacks are provided.
 */
static const struct k_clock clock_tai = {
        /* Reading the clock */
        .clock_getres           = posix_get_hrtimer_res,
        .clock_get_timespec     = posix_get_tai_timespec,
        .clock_get_ktime        = posix_get_tai_ktime,

        /* Sleeping */
        .nsleep                 = common_nsleep,

        /* Interval timers: common and hrtimer based helpers */
        .timer_create           = common_timer_create,
        .timer_set              = common_timer_set,
        .timer_get              = common_timer_get,
        .timer_del              = common_timer_del,
        .timer_arm              = common_hrtimer_arm,
        .timer_rearm            = common_hrtimer_rearm,
        .timer_forward          = common_hrtimer_forward,
        .timer_remaining        = common_hrtimer_remaining,
        .timer_try_to_cancel    = common_hrtimer_try_to_cancel,
        .timer_wait_running     = common_timer_wait_running,
};

/*
 * Clock operations installed for CLOCK_BOOTTIME in posix_clocks[].
 * No clock_set/clock_adj callbacks are provided.
 */
static const struct k_clock clock_boottime = {
        /* Reading the clock */
        .clock_getres           = posix_get_hrtimer_res,
        .clock_get_timespec     = posix_get_boottime_timespec,
        .clock_get_ktime        = posix_get_boottime_ktime,

        /* Sleeping, time-namespace adjusted for absolute sleeps */
        .nsleep                 = common_nsleep_timens,

        /* Interval timers: common and hrtimer based helpers */
        .timer_create           = common_timer_create,
        .timer_set              = common_timer_set,
        .timer_get              = common_timer_get,
        .timer_del              = common_timer_del,
        .timer_arm              = common_hrtimer_arm,
        .timer_rearm            = common_hrtimer_rearm,
        .timer_forward          = common_hrtimer_forward,
        .timer_remaining        = common_hrtimer_remaining,
        .timer_try_to_cancel    = common_hrtimer_try_to_cancel,
        .timer_wait_running     = common_timer_wait_running,
};

/*
 * Lookup table from non-negative clock ids to their clock operations,
 * consumed by clockid_to_kclock(). Indices which are not listed here
 * are NULL.
 */
static const struct k_clock * const posix_clocks[] = {
        /* Wall clock and its relatives */
        [CLOCK_REALTIME]                = &clock_realtime,
        [CLOCK_REALTIME_COARSE]         = &clock_realtime_coarse,
        [CLOCK_TAI]                     = &clock_tai,

        /* Monotonic clocks */
        [CLOCK_MONOTONIC]               = &clock_monotonic,
        [CLOCK_MONOTONIC_RAW]           = &clock_monotonic_raw,
        [CLOCK_MONOTONIC_COARSE]        = &clock_monotonic_coarse,
        [CLOCK_BOOTTIME]                = &clock_boottime,

        /* CPU time clocks */
        [CLOCK_PROCESS_CPUTIME_ID]      = &clock_process,
        [CLOCK_THREAD_CPUTIME_ID]       = &clock_thread,

        /* Alarm clocks share one set of operations */
        [CLOCK_REALTIME_ALARM]          = &alarm_clock,
        [CLOCK_BOOTTIME_ALARM]          = &alarm_clock,
};

/*
 * Translate a clock id into the matching clock operations.
 *
 * Negative ids select either the dynamic POSIX clock operations (when
 * the id carries the CLOCKFD tag) or the POSIX CPU clock operations.
 * Non-negative ids index posix_clocks[]; ids beyond the table yield
 * NULL, and so do table slots which have no entry.
 */
static const struct k_clock *clockid_to_kclock(const clockid_t id)
{
        clockid_t idx = id;

        if (id < 0) {
                if ((id & CLOCKFD_MASK) == CLOCKFD)
                        return &clock_posix_dynamic;
                return &clock_posix_cpu;
        }

        if (id >= ARRAY_SIZE(posix_clocks))
                return NULL;

        /* Clamp the index so it cannot be used out of bounds speculatively */
        idx = array_index_nospec(idx, ARRAY_SIZE(posix_clocks));

        return posix_clocks[idx];
}



























    3 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_MISC_H
#define BTRFS_MISC_H

#include <linux/types.h>
#include <linux/bitmap.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/math64.h>
#include <linux/rbtree.h>

/*
 * Enumerate bits using enum autoincrement. Define the @name as the n-th bit.
 *
 * To be used inside an enum definition. Each use expands to three
 * enumerators:
 *
 *   __<name>_BIT  takes the next auto-incremented value, the bit number
 *   <name>        is the mask (1U << __<name>_BIT)
 *   __<name>_SEQ  is set back to the bit number, so that the enumerator
 *                 following the macro continues counting at bit number + 1
 *                 rather than at mask + 1
 */
#define ENUM_BIT(name)                                  \
        __ ## name ## _BIT,                             \
        name = (1U << __ ## name ## _BIT),              \
        __ ## name ## _SEQ = __ ## name ## _BIT

/* Wake up the waiters on @wq, but only if the queue has a sleeper. */
static inline void cond_wake_up(struct wait_queue_head *wq)
{
        /*
         * The wq_has_sleeper() check implies a full smp_mb barrier, see
         * the comments for waitqueue_active for why one is needed.
         */
        if (!wq_has_sleeper(wq))
                return;

        wake_up(wq);
}

/*
 * Wake up the waiters on @wq if the queue is active, without issuing a
 * memory barrier for the check.
 */
static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
{
        /*
         * Special case for conditional wakeup where the barrier required
         * for waitqueue_active is implied by some of the preceding code,
         * e.g. an atomic operation (atomic_dec_and_return, ...) or an
         * unlock/lock sequence.
         */
        if (!waitqueue_active(wq))
                return;

        wake_up(wq);
}

/*
 * Return @percent percent of @num, computed as (@num * @percent) / 100.
 *
 * The product is formed in u64 arithmetic before the division, so it
 * wraps if @num * @percent does not fit into 64 bits.
 */
static inline u64 mult_perc(u64 num, u32 percent)
{
        const u64 scaled = num * percent;

        return div_u64(scaled, 100);
}
/*
 * Copy of is_power_of_two that is 64bit safe.
 *
 * Returns true when exactly one bit is set in @n; zero is not a power
 * of two.
 */
static inline bool is_power_of_two_u64(u64 n)
{
        /* Clearing the lowest set bit must leave nothing behind */
        return n && !(n & (n - 1));
}

/* Return true when exactly one bit is set in @n. */
static inline bool has_single_bit_set(u64 n)
{
        /* Same test as is_power_of_two_u64(), spelled out here */
        return n != 0 && (n & (n - 1)) == 0;
}

/*
 * Simple bytenr based rb_tree related structures
 *
 * Any structure that wants to use bytenr as its single search index should
 * start with these members.
 */
struct rb_simple_node {
        /* Tree linkage; the rb_simple_*() helpers map it back with rb_entry() */
        struct rb_node rb_node;
        /* Search key compared by the rb_simple_*() helpers */
        u64 bytenr;
};

/*
 * Search @root for the entry whose bytenr equals @bytenr.
 *
 * @root:        the root to search.
 * @bytenr:        bytenr to look for.
 *
 * Return the matching rb_node, or NULL if there is no exact match.
 */
static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
{
        struct rb_node *cur = root->rb_node;

        while (cur) {
                struct rb_simple_node *item;

                item = rb_entry(cur, struct rb_simple_node, rb_node);
                if (bytenr == item->bytenr)
                        return cur;

                /* Smaller keys live in the left subtree, larger in the right */
                cur = bytenr < item->bytenr ? cur->rb_left : cur->rb_right;
        }

        return NULL;
}

/*
 * Search @root for the first entry that starts at or comes after @bytenr.
 *
 * @root:        the root to search.
 * @bytenr:        bytenr to search from.
 *
 * Return the rb_node that starts at or after @bytenr.  If there is no entry
 * at or after @bytenr return NULL.
 */
static inline struct rb_node *rb_simple_search_first(struct rb_root *root,
                                                     u64 bytenr)
{
        struct rb_simple_node *closest = NULL;
        struct rb_node *cur = root->rb_node;
        struct rb_node *best = NULL;

        while (cur) {
                struct rb_simple_node *item;

                item = rb_entry(cur, struct rb_simple_node, rb_node);

                /* Exact match: nothing can be closer */
                if (bytenr == item->bytenr)
                        return cur;

                if (bytenr > item->bytenr) {
                        cur = cur->rb_right;
                        continue;
                }

                /* Entry lies after @bytenr: keep the lowest one seen so far */
                if (!best || item->bytenr < closest->bytenr) {
                        best = cur;
                        closest = item;
                }
                cur = cur->rb_left;
        }

        return best;
}

/*
 * Insert @node into @root, keyed by @bytenr.
 *
 * @root:        the root to insert into.
 * @bytenr:        key of the new node.
 * @node:        the rb_node to link in.
 *
 * Return NULL after a successful insertion. If an entry with the same
 * bytenr already exists, nothing is inserted and the existing rb_node is
 * returned.
 */
static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
                                               struct rb_node *node)
{
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct rb_simple_node *item;

                parent = *link;
                item = rb_entry(parent, struct rb_simple_node, rb_node);

                if (bytenr == item->bytenr)
                        return parent;

                /* Descend towards the slot where @bytenr belongs */
                link = bytenr < item->bytenr ? &parent->rb_left
                                             : &parent->rb_right;
        }

        rb_link_node(node, parent, link);
        rb_insert_color(node, root);

        return NULL;
}

/*
 * Check whether all @nbits bits of @addr starting at bit @start are set.
 *
 * Searches for a zero bit from @start, bounded by @start + @nbits, and
 * returns true when the search comes back with that bound, i.e. without
 * having found a clear bit in the range.
 */
static inline bool bitmap_test_range_all_set(const unsigned long *addr,
                                             unsigned long start,
                                             unsigned long nbits)
{
        const unsigned long end = start + nbits;

        return find_next_zero_bit(addr, end, start) == end;
}

/*
 * Check whether all @nbits bits of @addr starting at bit @start are clear.
 *
 * Searches for a set bit from @start, bounded by @start + @nbits, and
 * returns true when the search comes back with that bound, i.e. without
 * having found a set bit in the range.
 */
static inline bool bitmap_test_range_all_zero(const unsigned long *addr,
                                              unsigned long start,
                                              unsigned long nbits)
{
        const unsigned long end = start + nbits;

        return find_next_bit(addr, end, start) == end;
}

#endif









































































































































    1 




























    1 





    1 






    1 



























    1 



    2 




    2 








    1 
    1 




    1 
    1 











    1 




    1 

    1 

    1 













   15 





















































    1 


























    2 



    2 
    1 
    2 





























    1 

    1 









    1 








    1 






































    1 
    2 







































































































    1 

    1 






    1 













































































































































































































    2 
    2 
    2 


    2 
    2 
    2 






    2 







    2 






    2 








    2 


































































































































    1 







    1 
























    1 















    1 




    1 

    1 
    1 









    1 




    1 














    1 












    1 

    1 
















    1 
























































    1 













































    1 










    1 















    1 








    1 














    1 

    1 




    1 
    1 






    1 




    1 






    1 





    1 



    1 







    1 












    1 
    1 

























































































































































































































































































    3 















































































    2 

    2 







    2 






























    2 

    2 


    2 
    2 



    2 















    2 


























    2 
















































































































































































































































    3 
    3 











    3 




    3 


    3 

























    1 

    1 













    1 








    2 




    2 
















    2 









    2 






















    2 









































































































































































































































































































    2 





























































    2 

















































    3 





















    2 

    2 





    3 













    3 


    1 











    1 











    3 



    3 








    3 















































































































































    1 







    1 











    1 














    1 
    1 



    1 

    1 
    1 




    1 



    1 




























































































































    1 














    1 













    2 







































    1 























    1 








































    1 








    1 



    1 

    1 


    1 















































































































































































































    2 












    2 










    2 

















































    2 



























































































































































    1 
















    1 






















    1 








    1 










    1 

    1 




    1 
































    1 









    1 



    1 




    1 




















    2 

















    2 






    2 




































    2 




















































































































































































































































































































































































































    1 


    1 


































































































































































































































    1 





















    1 

    1 

    1 












    1 



































    1 






















    1 









    1 




    1 



    1 

















    3 















    1 

    1 
    1 




    2 










































































































































    1 






    1 



































































































































































































    1 









    1 


















    1 



























    1 






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *                 2000 Transmeta Corp.
 *                 2000-2001 Christoph Rohland
 *                 2000-2001 SAP AG
 *                 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/fileattr.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
#include <linux/iversion.h>
#include "swap.h"

static struct vfsmount *shm_mnt __ro_after_init;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
#include <linux/quotaops.h>
#include <linux/rcupdate_wait.h>

#include <linux/uaccess.h>

#include "internal.h"

/* Number of 512-byte units that make up one page (PAGE_SIZE / 512). */
#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
/*
 * Convert a byte count into a page count: @size is first passed through
 * PAGE_ALIGN() and the result is shifted right by PAGE_SHIFT.
 */
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Pretend that one inode + its dentry occupy this much memory */
#define BOGO_INODE_SIZE 1024

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_rwsem making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
/*
 * Progress record for one fallocate operation.  All offsets and counters
 * are expressed as pgoff_t values; the per-field comments give their roles.
 */
struct shmem_falloc {
        wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
        pgoff_t start;                /* start of range currently being fallocated */
        pgoff_t next;                /* the next page offset to be fallocated */
        pgoff_t nr_falloced;        /* how many new pages have been fallocated */
        pgoff_t nr_unswapped;        /* how often writepage refused to swap out */
};

/*
 * Bundle of tmpfs option values.
 *
 * NOTE(review): the code that fills and consumes this structure is not
 * visible in this part of the file; the field notes below are inferred from
 * names and types and should be confirmed against the option parser.
 */
struct shmem_options {
        unsigned long long blocks;        /* presumably the requested block limit */
        unsigned long long inodes;        /* presumably the requested inode limit */
        struct mempolicy *mpol;                /* memory policy pointer */
        kuid_t uid;
        kgid_t gid;
        umode_t mode;
        bool full_inums;
        int huge;
        int seen;                        /* presumably a mask of SHMEM_SEEN_* bits */
        bool noswap;
        unsigned short quota_types;
        struct shmem_quota_limits qlimits;
/* Distinct single-bit values, one per option group. */
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_NOSWAP 16
#define SHMEM_SEEN_QUOTA 32
};

#ifdef CONFIG_TMPFS
/* Default block limit: half of the page count reported by totalram_pages(). */
static unsigned long shmem_default_max_blocks(void)
{
        unsigned long ram_pages = totalram_pages();

        return ram_pages / 2;
}

/*
 * Default inode limit: the smallest of
 *   - total RAM pages minus high-memory pages,
 *   - half of the total RAM pages,
 *   - ULONG_MAX / BOGO_INODE_SIZE.
 */
static unsigned long shmem_default_max_inodes(void)
{
        unsigned long total = totalram_pages();
        unsigned long lowmem = total - totalhigh_pages();
        unsigned long half = total / 2;
        unsigned long ceiling = ULONG_MAX / BOGO_INODE_SIZE;

        return min3(lowmem, half, ceiling);
}
#endif

static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                        struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
                        struct mm_struct *fault_mm, vm_fault_t *fault_type);

/* Return the shmem_sb_info stored in the superblock's s_fs_info pointer. */
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
        struct shmem_sb_info *sbinfo = sb->s_fs_info;

        return sbinfo;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
/*
 * Charge VM_ACCT(@size) pages against current->mm unless @flags carries
 * VM_NORESERVE, in which case nothing is charged and 0 is returned.
 * Otherwise returns the result of security_vm_enough_memory_mm().
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
        if (flags & VM_NORESERVE)
                return 0;

        return security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

/* Release VM_ACCT(@size) pages, unless @flags carries VM_NORESERVE. */
static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
        if (flags & VM_NORESERVE)
                return;

        vm_unacct_memory(VM_ACCT(size));
}

/*
 * Adjust the pre-accounted charge when an object changes size from
 * @oldsize to @newsize.  Nothing happens when @flags carries VM_NORESERVE.
 * Growing charges the page difference and returns the result of
 * security_vm_enough_memory_mm(); shrinking releases the page difference;
 * an unchanged page count is a no-op.  Returns 0 in every non-growing case.
 */
static inline int shmem_reacct_size(unsigned long flags,
                loff_t oldsize, loff_t newsize)
{
        if (flags & VM_NORESERVE)
                return 0;

        if (VM_ACCT(newsize) > VM_ACCT(oldsize))
                return security_vm_enough_memory_mm(current->mm,
                                VM_ACCT(newsize) - VM_ACCT(oldsize));

        if (VM_ACCT(newsize) < VM_ACCT(oldsize))
                vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));

        return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
/*
 * Incremental accounting of @pages pages: only objects whose @flags carry
 * VM_NORESERVE are charged here; for all others this returns 0 without
 * charging.  Otherwise returns the result of security_vm_enough_memory_mm().
 */
static inline int shmem_acct_blocks(unsigned long flags, long pages)
{
        if (flags & VM_NORESERVE)
                return security_vm_enough_memory_mm(current->mm,
                                pages * VM_ACCT(PAGE_SIZE));

        return 0;
}

/* Release @pages pages, but only when @flags carries VM_NORESERVE. */
static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
        if (!(flags & VM_NORESERVE))
                return;

        vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

static int shmem_inode_acct_blocks(struct inode *inode, long pages)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        int err = -ENOSPC;

        if (shmem_acct_blocks(info->flags, pages))
                return err;

        might_sleep();        /* when quotas */
        if (sbinfo->max_blocks) {
                if (!percpu_counter_limited_add(&sbinfo->used_blocks,
                                                sbinfo->max_blocks, pages))
                        goto unacct;

                err = dquot_alloc_block_nodirty(inode, pages);
                if (err) {
                        percpu_counter_sub(&sbinfo->used_blocks, pages);
                        goto unacct;
                }
        } else {
                err = dquot_alloc_block_nodirty(inode, pages);
                if (err)
                        goto unacct;
        }

        return 0;

unacct:
        shmem_unacct_blocks(info->flags, pages);
        return err;
}

/*
 * Undo the accounting of @pages blocks for @inode: release the quota
 * charge, then the superblock's used_blocks counter (only when max_blocks
 * is non-zero), then the shmem_unacct_blocks() charge.
 */
static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        struct shmem_inode_info *info = SHMEM_I(inode);

        might_sleep();        /* when quotas */

        dquot_free_block_nodirty(inode, pages);
        if (sbinfo->max_blocks != 0)
                percpu_counter_sub(&sbinfo->used_blocks, pages);
        shmem_unacct_blocks(info->flags, pages);
}

/*
 * Forward declarations of the file-local operation tables and the
 * filesystem type, so that code below can refer to them by address
 * (see shmem_mapping(), vma_is_anon_shmem() and vma_is_shmem()).
 */
static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;

/* Return true when @mapping uses shmem_aops as its address-space operations. */
bool shmem_mapping(struct address_space *mapping)
{
        const struct address_space_operations *ops = mapping->a_ops;

        return ops == &shmem_aops;
}
EXPORT_SYMBOL_GPL(shmem_mapping);

/* Return true when @vma's vm_ops is shmem_anon_vm_ops. */
bool vma_is_anon_shmem(struct vm_area_struct *vma)
{
        const struct vm_operations_struct *ops = vma->vm_ops;

        return ops == &shmem_anon_vm_ops;
}

bool vma_is_shmem(struct vm_area_struct *vma)
{
        return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}

/*
 * File-local list head and the mutex defined alongside it.
 * NOTE(review): presumably shmem_swaplist_mutex serializes access to
 * shmem_swaplist -- confirm against the users of both.
 */
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

#ifdef CONFIG_TMPFS_QUOTA

/*
 * Turn on usage tracking and limit enforcement for every quota type whose bit
 * is set in @quota_types.
 *
 * Returns 0 on success.  On failure the error from dquot_load_quota_sb() is
 * returned after a warning is logged and quota is switched off again for all
 * lower-numbered types.
 */
static int shmem_enable_quotas(struct super_block *sb,
                               unsigned short quota_types)
{
        int type;
        int err = 0;

        sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;

        for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
                if (quota_types & (1 << type)) {
                        err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
                                                  DQUOT_USAGE_ENABLED |
                                                  DQUOT_LIMITS_ENABLED);
                        if (err)
                                goto out_err;
                }
        }

        return 0;

out_err:
        pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
                type, err);

        /* Unwind: walk back down from the type just below the failed one */
        while (--type >= 0)
                dquot_quota_off(sb, type);

        return err;
}

/* Switch quota off on @sb for every quota type, in ascending type order. */
static void shmem_disable_quotas(struct super_block *sb)
{
        int type = 0;

        while (type < SHMEM_MAXQUOTAS) {
                dquot_quota_off(sb, type);
                type++;
        }
}

/* Hand back the i_dquot array embedded in @inode's shmem_inode_info. */
static struct dquot __rcu **shmem_get_dquots(struct inode *inode)
{
        struct shmem_inode_info *info = SHMEM_I(inode);

        return info->i_dquot;
}
#endif /* CONFIG_TMPFS_QUOTA */

/*
 * shmem_reserve_inode() does the bookkeeping needed to reserve a shmem inode
 * and hands back a fresh inode number for it through @inop.
 *
 * It is also used when creating a hard link, to account for the space taken
 * by the extra dentry.  No new inode number is wanted in that case (those come
 * from a different pool, currently the global get_next_ino()), which the
 * caller signals by passing NULL as @inop.
 *
 * Returns 0 on success, or -ENOSPC when the mount has an inode limit and not
 * enough inode space is left.
 */
#define SHMEM_INO_BATCH 1024
static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
        ino_t ino;

        if (sb->s_flags & SB_KERNMOUNT) {
                ino_t *cpu_next;

                if (!inop)
                        return 0;

                /*
                 * __shmem_file_setup, one of our callers, is lock-free: with
                 * max_inodes always 0 on these mounts it never needs
                 * stat_lock here, and it may be invoked from contexts we do
                 * not know.  So inode numbers come from a per-cpu batched
                 * allocator which only takes the per-sb stat_lock when a
                 * batch boundary is reached.
                 *
                 * inode{32,64} is not a concern: SB_KERNMOUNT shmem mounts
                 * are not exposed to userspace, so things like glibc
                 * compatibility do not matter.
                 */
                cpu_next = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
                ino = *cpu_next;
                if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
                        /* Batch used up (or never started): grab a new one */
                        raw_spin_lock(&sbinfo->stat_lock);
                        ino = sbinfo->next_ino;
                        sbinfo->next_ino += SHMEM_INO_BATCH;
                        raw_spin_unlock(&sbinfo->stat_lock);
                        if (unlikely(is_zero_ino(ino)))
                                ino++;
                }
                *inop = ino;
                *cpu_next = ino + 1;
                put_cpu();

                return 0;
        }

        raw_spin_lock(&sbinfo->stat_lock);

        if (sbinfo->max_inodes) {
                if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
                        raw_spin_unlock(&sbinfo->stat_lock);
                        return -ENOSPC;
                }
                sbinfo->free_ispace -= BOGO_INODE_SIZE;
        }

        if (inop) {
                ino = sbinfo->next_ino++;
                /* Skip a number that is_zero_ino() rejects */
                if (unlikely(is_zero_ino(ino)))
                        ino = sbinfo->next_ino++;

                if (unlikely(!sbinfo->full_inums && ino > UINT_MAX)) {
                        /*
                         * Mimic the uint wraparound of get_next_ino for the
                         * sake of compatibility.
                         */
                        if (IS_ENABLED(CONFIG_64BIT))
                                pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
                                        __func__, MINOR(sb->s_dev));
                        sbinfo->next_ino = 1;
                        ino = sbinfo->next_ino++;
                }
                *inop = ino;
        }

        raw_spin_unlock(&sbinfo->stat_lock);

        return 0;
}

/*
 * Give one inode's reservation (BOGO_INODE_SIZE) plus @freed_ispace back to
 * the superblock's free inode space.  Nothing is done when the mount has no
 * inode limit (max_inodes == 0).
 */
static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

        if (!sbinfo->max_inodes)
                return;

        raw_spin_lock(&sbinfo->stat_lock);
        sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
        raw_spin_unlock(&sbinfo->stat_lock);
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 * @alloced: the change in number of pages allocated to inode
 * @swapped: the change in number of pages swapped from inode
 *
 * The mm can drop undirtied hole pages behind our back, so the number of
 * freed blocks has to be worked out here.
 *
 * Normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * so what the mm freed is
 *            info->alloced - (inode->i_mapping->nrpages + info->swapped)
 */
static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        long freed;

        spin_lock(&info->lock);

        info->alloced += alloced;
        info->swapped += swapped;

        freed = info->alloced - info->swapped -
                READ_ONCE(inode->i_mapping->nrpages);

        /*
         * Special case: usually shmem_recalc_inode() runs after
         * i_mapping->nrpages has already been adjusted (up or down), but
         * shmem_writepage() must raise swapped before nrpages is lowered,
         * so that a racing shmem_recalc_inode() does not conclude that a
         * page has been freed.  Compensate here rather than requiring a
         * followup call.
         */
        if (swapped > 0)
                freed += swapped;

        if (freed <= 0) {
                spin_unlock(&info->lock);
                return;
        }

        info->alloced -= freed;
        spin_unlock(&info->lock);

        /* Done outside info->lock: the quota case may block */
        shmem_inode_unacct_blocks(inode, freed);
}

/*
 * Account @pages more pages to @inode.
 *
 * Returns false when shmem_inode_acct_blocks() refuses the charge; otherwise
 * bumps the mapping's nrpages, rebalances the inode's counters and returns
 * true.
 */
bool shmem_charge(struct inode *inode, long pages)
{
        struct address_space *mapping = inode->i_mapping;
        int err;

        err = shmem_inode_acct_blocks(inode, pages);
        if (err)
                return false;

        /* Adjust nrpages first; shmem_recalc_inode() expects it balanced */
        xa_lock_irq(&mapping->i_pages);
        mapping->nrpages += pages;
        xa_unlock_irq(&mapping->i_pages);

        shmem_recalc_inode(inode, pages, 0);

        return true;
}

/*
 * Rebalance @inode's accounting after pages have been removed from it.
 *
 * @pages is currently unused and only kept as a debugging aid.  The nrpages
 * adjustment itself is done by __filemap_remove_folio() or by the caller, so
 * all that remains here is to let shmem_recalc_inode() pick up the change.
 */
void shmem_uncharge(struct inode *inode, long pages)
{
        shmem_recalc_inode(inode, 0, 0);
}

/*
 * Swap the xarray item at @index from @expected to @replacement; the caller
 * holds xa_lock.  Both pointers must be non-NULL.
 *
 * Returns 0 on success, or -ENOENT (storing nothing) when the slot does not
 * currently hold @expected.
 */
static int shmem_replace_entry(struct address_space *mapping,
                        pgoff_t index, void *expected, void *replacement)
{
        XA_STATE(xas, &mapping->i_pages, index);

        VM_BUG_ON(!expected);
        VM_BUG_ON(!replacement);

        if (xas_load(&xas) != expected)
                return -ENOENT;

        xas_store(&xas, replacement);
        return 0;
}

/*
 * Before deciding whether to go ahead or give up, we sometimes need to be
 * sure that a racing thread has not already brought the entry back from swap.
 *
 * Looking at the page alone is not sufficient: by the time a SwapCache page
 * has been locked it may have been reused and be SwapCache once more, with
 * the very same swap entry as before.  So compare what the mapping holds at
 * @index against the value entry for @swap.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
                               pgoff_t index, swp_entry_t swap)
{
        void *wanted = swp_to_radix_entry(swap);
        void *present = xa_load(&mapping->i_pages, index);

        return present == wanted;
}

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option.
 *
 * SHMEM_HUGE_NEVER:
 *        disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *        enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *        only allocates huge pages if the page will be fully within i_size,
 *        and also respects fadvise()/madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *        only allocates huge pages if requested with fadvise()/madvise().
 */

#define SHMEM_HUGE_NEVER        0
#define SHMEM_HUGE_ALWAYS        1
#define SHMEM_HUGE_WITHIN_SIZE        2
#define SHMEM_HUGE_ADVISE        3

/*
 * Special values.
 * These can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *        disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *        enables huge on shm_mnt and all mounts, without needing the mount
 *        option, for testing.
 */
#define SHMEM_HUGE_DENY                (-1)
#define SHMEM_HUGE_FORCE        (-2)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */

/*
 * Global huge-page policy, one of the SHMEM_HUGE_* values.  shmem_is_huge()
 * checks it for the SHMEM_HUGE_DENY / SHMEM_HUGE_FORCE overrides before it
 * looks at the per-mount setting.  Starts out as SHMEM_HUGE_NEVER.
 */
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;

/*
 * Decide whether a huge page may be used for @index of @inode.
 *
 * @shmem_huge_force: caller-requested override, honoured unless the global
 *                    policy is SHMEM_HUGE_DENY.
 * @mm, @vm_flags:    when @mm is non-NULL, VM_NOHUGEPAGE or MMF_DISABLE_THP
 *                    veto huge pages and VM_HUGEPAGE acts as the advise hint.
 *
 * Only regular files qualify.  The global shmem_huge DENY/FORCE values take
 * precedence over the per-mount policy.
 */
bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
                   struct mm_struct *mm, unsigned long vm_flags)
{
        int policy;

        if (!S_ISREG(inode->i_mode))
                return false;

        if (mm) {
                if (vm_flags & VM_NOHUGEPAGE)
                        return false;
                if (test_bit(MMF_DISABLE_THP, &mm->flags))
                        return false;
        }

        if (shmem_huge == SHMEM_HUGE_DENY)
                return false;
        if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
                return true;

        policy = SHMEM_SB(inode->i_sb)->huge;

        if (policy == SHMEM_HUGE_ALWAYS)
                return true;

        if (policy == SHMEM_HUGE_WITHIN_SIZE) {
                /* End (in pages) of the huge page that would hold @index */
                pgoff_t huge_end = round_up(index + 1, HPAGE_PMD_NR);
                loff_t i_size = round_up(i_size_read(inode), PAGE_SIZE);

                if (i_size >> PAGE_SHIFT >= huge_end)
                        return true;
                /* Not fully within i_size: fall back to the advise check */
        } else if (policy != SHMEM_HUGE_ADVISE) {
                return false;
        }

        return mm && (vm_flags & VM_HUGEPAGE);
}

#if defined(CONFIG_SYSFS)
/*
 * Translate a huge-policy keyword into its SHMEM_HUGE_* value.
 * The match is exact and case-sensitive; unknown strings yield -EINVAL.
 */
static int shmem_parse_huge(const char *str)
{
        static const struct {
                const char *name;
                int value;
        } policies[] = {
                { "never",        SHMEM_HUGE_NEVER },
                { "always",       SHMEM_HUGE_ALWAYS },
                { "within_size",  SHMEM_HUGE_WITHIN_SIZE },
                { "advise",       SHMEM_HUGE_ADVISE },
                { "deny",         SHMEM_HUGE_DENY },
                { "force",        SHMEM_HUGE_FORCE },
        };
        size_t i;

        for (i = 0; i < sizeof(policies) / sizeof(policies[0]); i++) {
                if (strcmp(str, policies[i].name) == 0)
                        return policies[i].value;
        }

        return -EINVAL;
}
#endif

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
/*
 * Translate a SHMEM_HUGE_* value into its keyword.  A value that is none of
 * the known policies trips VM_BUG_ON() and is reported as "bad_val".
 */
static const char *shmem_format_huge(int huge)
{
        if (huge == SHMEM_HUGE_NEVER)
                return "never";
        if (huge == SHMEM_HUGE_ALWAYS)
                return "always";
        if (huge == SHMEM_HUGE_WITHIN_SIZE)
                return "within_size";
        if (huge == SHMEM_HUGE_ADVISE)
                return "advise";
        if (huge == SHMEM_HUGE_DENY)
                return "deny";
        if (huge == SHMEM_HUGE_FORCE)
                return "force";

        VM_BUG_ON(1);
        return "bad_val";
}
#endif

/*
 * Scan @sbinfo's shrinklist and try to split the huge folio found at the end
 * of each listed inode's file.
 *
 * @sc:          shrinker control; when non-NULL its nr_to_scan bounds how many
 *               list entries are examined, otherwise 128 are.
 * @nr_to_split: when non-zero, stop splitting once this many folios have been
 *               split; remaining candidates are put back on the shrinklist.
 *
 * Returns the number of folios split, or SHRINK_STOP if the shrinklist was
 * empty on entry.
 */
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
                struct shrink_control *sc, unsigned long nr_to_split)
{
        LIST_HEAD(list), *pos, *next;
        LIST_HEAD(to_remove);
        struct inode *inode;
        struct shmem_inode_info *info;
        struct folio *folio;
        unsigned long batch = sc ? sc->nr_to_scan : 128;
        int split = 0;

        /* Unlocked peek: nothing queued, nothing to do */
        if (list_empty(&sbinfo->shrinklist))
                return SHRINK_STOP;

        /*
         * Phase 1, under shrinklist_lock: take up to 'batch' entries off the
         * shrinklist, sorting them into 'to_remove' (nothing to gain) and
         * 'list' (split candidates).  Entries on both local lists carry the
         * inode reference taken by igrab().
         */
        spin_lock(&sbinfo->shrinklist_lock);
        list_for_each_safe(pos, next, &sbinfo->shrinklist) {
                info = list_entry(pos, struct shmem_inode_info, shrinklist);

                /* pin the inode */
                inode = igrab(&info->vfs_inode);

                /* inode is about to be evicted */
                if (!inode) {
                        list_del_init(&info->shrinklist);
                        goto next;
                }

                /*
                 * Check if there's anything to gain: if i_size rounds up to
                 * the same value at page and at huge-page granularity, there
                 * is no slack beyond i_size to reclaim.
                 */
                if (round_up(inode->i_size, PAGE_SIZE) ==
                                round_up(inode->i_size, HPAGE_PMD_SIZE)) {
                        list_move(&info->shrinklist, &to_remove);
                        goto next;
                }

                list_move(&info->shrinklist, &list);
next:
                /* Whichever path was taken, the entry left the shrinklist */
                sbinfo->shrinklist_len--;
                if (!--batch)
                        break;
        }
        spin_unlock(&sbinfo->shrinklist_lock);

        /* Phase 2: unlink and unpin the inodes with nothing to gain */
        list_for_each_safe(pos, next, &to_remove) {
                info = list_entry(pos, struct shmem_inode_info, shrinklist);
                inode = &info->vfs_inode;
                list_del_init(&info->shrinklist);
                iput(inode);
        }

        /* Phase 3: try to split the tail folio of each candidate */
        list_for_each_safe(pos, next, &list) {
                int ret;
                pgoff_t index;

                info = list_entry(pos, struct shmem_inode_info, shrinklist);
                inode = &info->vfs_inode;

                /* Split quota reached: requeue the rest untouched */
                if (nr_to_split && split >= nr_to_split)
                        goto move_back;

                /* Page index of the huge-page-aligned start covering i_size */
                index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
                folio = filemap_get_folio(inode->i_mapping, index);
                if (IS_ERR(folio))
                        goto drop;

                /* No huge page at the end of the file: nothing to split */
                if (!folio_test_large(folio)) {
                        folio_put(folio);
                        goto drop;
                }

                /*
                 * Move the inode on the list back to shrinklist if we failed
                 * to lock the page at this time.
                 *
                 * Waiting for the lock may lead to deadlock in the
                 * reclaim path.
                 */
                if (!folio_trylock(folio)) {
                        folio_put(folio);
                        goto move_back;
                }

                ret = split_folio(folio);
                folio_unlock(folio);
                folio_put(folio);

                /* If split failed move the inode on the list back to shrinklist */
                if (ret)
                        goto move_back;

                split++;
drop:
                /* Done with this inode: take it off the local list */
                list_del_init(&info->shrinklist);
                goto put;
move_back:
                /*
                 * Make sure the inode is either on the global list or deleted
                 * from any local list before iput() since it could be deleted
                 * in another thread once we put the inode (then the local list
                 * is corrupted).
                 */
                spin_lock(&sbinfo->shrinklist_lock);
                list_move(&info->shrinklist, &sbinfo->shrinklist);
                sbinfo->shrinklist_len++;
                spin_unlock(&sbinfo->shrinklist_lock);
put:
                /* Drop the reference taken by igrab() in phase 1 */
                iput(inode);
        }

        return split;
}

/*
 * Scan entry point: run shmem_unused_huge_shrink() over @sb's shrinklist with
 * no cap on the number of splits, or return SHRINK_STOP straight away when
 * the list length is recorded as zero.
 */
static long shmem_unused_huge_scan(struct super_block *sb,
                struct shrink_control *sc)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

        if (READ_ONCE(sbinfo->shrinklist_len) == 0)
                return SHRINK_STOP;

        return shmem_unused_huge_shrink(sbinfo, sc, 0);
}

/* Count entry point: report the current (locklessly read) shrinklist length. */
static long shmem_unused_huge_count(struct super_block *sb,
                struct shrink_control *sc)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
        long nr_queued = READ_ONCE(sbinfo->shrinklist_len);

        return nr_queued;
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE the global policy is the compile-time
 * constant SHMEM_HUGE_DENY, so code testing shmem_huge still builds.
 */
#define shmem_huge SHMEM_HUGE_DENY

/* Stub for !CONFIG_TRANSPARENT_HUGEPAGE: nothing to split, always returns 0. */
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
                struct shrink_control *sc, unsigned long nr_to_split)
{
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Somewhat like filemap_add_folio, but error if expected item has gone.
 *
 * Insert the locked, swap-backed @folio into @mapping at @index, which must
 * be aligned to the folio's size in pages.  @expected is what the slot must
 * currently contain (NULL for an empty slot); an @expected entry is only
 * allowed together with a small folio.
 *
 * Returns 0 on success.  On failure the xarray error is returned (-EEXIST
 * when the slot contents did not match), folio->mapping is reset to NULL and
 * the references added here are dropped again.
 */
static int shmem_add_to_page_cache(struct folio *folio,
                                   struct address_space *mapping,
                                   pgoff_t index, void *expected, gfp_t gfp)
{
        XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
        long nr = folio_nr_pages(folio);

        VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
        VM_BUG_ON(expected && folio_test_large(folio));

        /* One reference per page on behalf of the page cache */
        folio_ref_add(folio, nr);
        folio->mapping = mapping;
        folio->index = index;

        gfp &= GFP_RECLAIM_MASK;
        folio_throttle_swaprate(folio, gfp);

        /*
         * Attempt the store under the xarray lock; the loop repeats for as
         * long as xas_nomem() asks for a retry after the lock was dropped.
         */
        do {
                xas_lock_irq(&xas);
                /* The first conflicting entry must be exactly @expected */
                if (expected != xas_find_conflict(&xas)) {
                        xas_set_err(&xas, -EEXIST);
                        goto unlock;
                }
                /* ... and there must be no further conflict after it */
                if (expected && xas_find_conflict(&xas)) {
                        xas_set_err(&xas, -EEXIST);
                        goto unlock;
                }
                xas_store(&xas, folio);
                if (xas_error(&xas))
                        goto unlock;
                /* Stored: update statistics and the mapping's page count */
                if (folio_test_pmd_mappable(folio))
                        __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
                __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
                __lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
                mapping->nrpages += nr;
unlock:
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));

        if (xas_error(&xas)) {
                /* Undo the setup done before the loop */
                folio->mapping = NULL;
                folio_ref_sub(folio, nr);
                return xas_error(&xas);
        }

        return 0;
}

/*
 * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
 *
 * Under the mapping's xa_lock: put @radswap into the slot at folio->index in
 * place of @folio, detach the folio from the mapping, and subtract its pages
 * from nrpages and from the NR_FILE_PAGES / NR_SHMEM statistics.  One folio
 * reference is dropped afterwards.
 */
static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
        struct address_space *mapping = folio->mapping;
        long nr = folio_nr_pages(folio);
        int error;

        xa_lock_irq(&mapping->i_pages);
        error = shmem_replace_entry(mapping, folio->index, folio, radswap);
        folio->mapping = NULL;
        mapping->nrpages -= nr;
        __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
        __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
        xa_unlock_irq(&mapping->i_pages);
        folio_put(folio);
        /* A failed replace means @folio was not in its slot: fatal */
        BUG_ON(error);
}

/*
 * Clear the swap entry @radswap from @mapping at @index, then free the swap
 * and its page cache.
 *
 * Returns 0 on success, or -ENOENT (freeing nothing) when the slot no longer
 * holds @radswap.
 */
static int shmem_free_swap(struct address_space *mapping,
                           pgoff_t index, void *radswap)
{
        /* Only clear the slot if it still contains exactly @radswap */
        if (xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0) !=
            radswap)
                return -ENOENT;

        free_swap_and_cache(radix_to_swp_entry(radswap));
        return 0;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given offsets are swapped out.
 *
 * @start is the first page index examined and @end is exclusive (the last
 * index examined is @end - 1).
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
                                                pgoff_t start, pgoff_t end)
{
        XA_STATE(xas, &mapping->i_pages, start);
        struct page *page;
        unsigned long swapped = 0;
        unsigned long max = end - 1;

        rcu_read_lock();
        xas_for_each(&xas, page, max) {
                if (xas_retry(&xas, page))
                        continue;
                /* Value entries are what this function counts as swapped */
                if (xa_is_value(page))
                        swapped++;
                /* Stop explicitly once the last index has been examined */
                if (xas.xa_index == max)
                        break;
                /* Pause the walk so we can reschedule on long ranges */
                if (need_resched()) {
                        xas_pause(&xas);
                        cond_resched_rcu();
                }
        }
        rcu_read_unlock();

        /* Convert the page count to bytes */
        return swapped << PAGE_SHIFT;
}

/*
 * Work out (in bytes) how much of the shmem object mapped by @vma is
 * currently swapped out.
 *
 * Thanks to RCU this may be called without i_rwsem or the i_pages lock,
 * provided the inode stays around and racy results are acceptable.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
        struct inode *inode = file_inode(vma->vm_file);
        struct shmem_inode_info *info = SHMEM_I(inode);
        unsigned long swapped;

        /* info->lock is not held, so take a single careful snapshot */
        swapped = READ_ONCE(info->swapped);

        /*
         * Two easy cases can be answered from the stats we already keep:
         * nothing of the object is in swap, or the vma maps all of it.
         */
        if (swapped == 0)
                return 0;

        if (vma->vm_pgoff == 0 &&
            vma->vm_end - vma->vm_start >= inode->i_size)
                return swapped << PAGE_SHIFT;

        /* Otherwise count the swap entries inside the vma's page range */
        return shmem_partial_swap_usage(inode->i_mapping, vma->vm_pgoff,
                                        vma->vm_pgoff + vma_pages(vma));
}

/*
 * SysV IPC SHM_UNLOCK: move Unevictable pages of @mapping back to their
 * evictable lists, one folio batch at a time.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
        struct folio_batch fbatch;
        pgoff_t index = 0;

        folio_batch_init(&fbatch);

        for (;;) {
                /*
                 * Minor point, but there is no reason to carry on if someone
                 * else SHM_LOCKs the mapping in the meantime.
                 */
                if (mapping_unevictable(mapping))
                        break;

                /* No more folios from @index onwards: all done */
                if (!filemap_get_folios(mapping, &index, ~0UL, &fbatch))
                        break;

                check_move_unevictable_folios(&fbatch);
                folio_batch_release(&fbatch);
                cond_resched();
        }
}

/*
 * Look up the folio at @index of @inode for partial truncation.
 *
 * Returns NULL when the slot is empty; a folio found in the page cache is
 * returned locked.  Otherwise the result is whatever
 * shmem_get_folio(SGP_READ) leaves in the folio pointer, which may still be
 * NULL.
 */
static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
        struct address_space *mapping = inode->i_mapping;
        struct folio *folio;

        /*
         * Do not start with shmem_get_folio(,,,SGP_READ): it fails beyond
         * i_size and reports fallocated folios as holes.
         */
        folio = filemap_get_entry(mapping, index);
        if (!folio)
                return NULL;

        if (!xa_is_value(folio)) {
                folio_lock(folio);
                if (folio->mapping == inode->i_mapping)
                        return folio;

                /* The folio has been swapped out while we waited */
                folio_unlock(folio);
                folio_put(folio);
        }

        /*
         * Read the folio back from swap if any part of it lies within i_size
         * (even though in some cases that is just wasted effort).
         */
        folio = NULL;
        shmem_get_folio(inode, index, &folio, SGP_READ);

        return folio;
}

/*
 * Remove range of pages and swap entries from page cache, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 *
 * @lstart and @lend are byte offsets and @lend is inclusive; @lend == -1
 * means "to the end of the file".  Internally the range becomes the page
 * indices [start, end): @lstart rounded up to a page boundary, and the page
 * containing @lend + 1 as the exclusive end.
 *
 * The number of swap entries freed is handed to shmem_recalc_inode() at the
 * end so that the inode's accounting follows.
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                                                                 bool unfalloc)
{
        struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
        pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
        pgoff_t end = (lend + 1) >> PAGE_SHIFT;
        struct folio_batch fbatch;
        pgoff_t indices[PAGEVEC_SIZE];
        struct folio *folio;
        bool same_folio;
        long nr_swaps_freed = 0;
        pgoff_t index;
        int i;

        if (lend == -1)
                end = -1;        /* unsigned, so actually very big */

        /* Pull fallocend back when it lies inside the range being removed */
        if (info->fallocend > start && info->fallocend <= end && !unfalloc)
                info->fallocend = start;

        /*
         * First pass: deal with the entries find_lock_entries() hands back.
         * Folios in the batch are only unlocked here, never locked.
         */
        folio_batch_init(&fbatch);
        index = start;
        while (index < end && find_lock_entries(mapping, &index, end - 1,
                        &fbatch, indices)) {
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        folio = fbatch.folios[i];

                        if (xa_is_value(folio)) {
                                /* Swap entries are left alone when unfalloc */
                                if (unfalloc)
                                        continue;
                                /* shmem_free_swap() returns 0 on success */
                                nr_swaps_freed += !shmem_free_swap(mapping,
                                                        indices[i], folio);
                                continue;
                        }

                        if (!unfalloc || !folio_test_uptodate(folio))
                                truncate_inode_folio(mapping, folio);
                        folio_unlock(folio);
                }
                folio_batch_remove_exceptionals(&fbatch);
                folio_batch_release(&fbatch);
                cond_resched();
        }

        /*
         * When undoing a failed fallocate, we want none of the partial folio
         * zeroing and splitting below, but shall want to truncate the whole
         * folio when !uptodate indicates that it was added by this fallocate,
         * even when [lstart, lend] covers only a part of the folio.
         */
        if (unfalloc)
                goto whole_folios;

        /* Partial folio at the start of the range */
        same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
        folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
        if (folio) {
                /* Refine: does this folio also contain lend? */
                same_folio = lend < folio_pos(folio) + folio_size(folio);
                folio_mark_dirty(folio);
                if (!truncate_inode_partial_folio(folio, lstart, lend)) {
                        /* Folio stays: exclude it from the whole-folio pass */
                        start = folio_next_index(folio);
                        if (same_folio)
                                end = folio->index;
                }
                folio_unlock(folio);
                folio_put(folio);
                folio = NULL;
        }

        /* Partial folio at the end of the range, if it is a different one */
        if (!same_folio)
                folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
        if (folio) {
                folio_mark_dirty(folio);
                if (!truncate_inode_partial_folio(folio, lstart, lend))
                        end = folio->index;
                folio_unlock(folio);
                folio_put(folio);
        }

whole_folios:

        /*
         * Second pass: entries are looked up unlocked here and each folio is
         * locked individually, restarting the scan whenever an entry turns
         * out to have changed underneath us.
         */
        index = start;
        while (index < end) {
                cond_resched();

                if (!find_get_entries(mapping, &index, end - 1, &fbatch,
                                indices)) {
                        /* If all gone or hole-punch or unfalloc, we're done */
                        if (index == start || end != -1)
                                break;
                        /* But if truncating, restart to make sure all gone */
                        index = start;
                        continue;
                }
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        folio = fbatch.folios[i];

                        if (xa_is_value(folio)) {
                                if (unfalloc)
                                        continue;
                                if (shmem_free_swap(mapping, indices[i], folio)) {
                                        /* Swap was replaced by page: retry */
                                        index = indices[i];
                                        break;
                                }
                                nr_swaps_freed++;
                                continue;
                        }

                        folio_lock(folio);

                        if (!unfalloc || !folio_test_uptodate(folio)) {
                                if (folio_mapping(folio) != mapping) {
                                        /* Page was replaced by swap: retry */
                                        folio_unlock(folio);
                                        index = indices[i];
                                        break;
                                }
                                VM_BUG_ON_FOLIO(folio_test_writeback(folio),
                                                folio);

                                if (!folio_test_large(folio)) {
                                        truncate_inode_folio(mapping, folio);
                                } else if (truncate_inode_partial_folio(folio, lstart, lend)) {
                                        /*
                                         * If we split a page, reset the loop so
                                         * that we pick up the new sub pages.
                                         * Otherwise the THP was entirely
                                         * dropped or the target range was
                                         * zeroed, so just continue the loop as
                                         * is.
                                         */
                                        if (!folio_test_large(folio)) {
                                                folio_unlock(folio);
                                                index = start;
                                                break;
                                        }
                                }
                        }
                        folio_unlock(folio);
                }
                folio_batch_remove_exceptionals(&fbatch);
                folio_batch_release(&fbatch);
        }

        /* Swap count goes down by the number of entries freed above */
        shmem_recalc_inode(inode, 0, -nr_swaps_freed);
}

/*
 * Truncate or punch a hole in @inode over the byte range [@lstart, @lend]
 * (@lend inclusive, -1 for "to the end"), then set ctime to the current time,
 * set mtime to that same timestamp and bump the inode version.
 */
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
        shmem_undo_range(inode, lstart, lend, false);
        /* mtime is given the timestamp returned by the ctime update */
        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

/*
 * getattr for shmem inodes: fill @stat through generic_fillattr() and add the
 * shmem specifics - the append/immutable/nodump attribute bits taken from
 * fsflags, a huge-page block size where shmem_is_huge() says so, and the
 * creation time when STATX_BTIME is requested.  Always returns 0.
 */
static int shmem_getattr(struct mnt_idmap *idmap,
                         const struct path *path, struct kstat *stat,
                         u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = path->dentry->d_inode;
        struct shmem_inode_info *info = SHMEM_I(inode);

        /* Counters out of step with the page cache: rebalance them first */
        if (info->alloced - info->swapped != inode->i_mapping->nrpages)
                shmem_recalc_inode(inode, 0, 0);

        /* Advertise the attributes we can report, then report those set */
        stat->attributes_mask |= (STATX_ATTR_APPEND |
                        STATX_ATTR_IMMUTABLE |
                        STATX_ATTR_NODUMP);
        if (info->fsflags & FS_APPEND_FL)
                stat->attributes |= STATX_ATTR_APPEND;
        if (info->fsflags & FS_IMMUTABLE_FL)
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        if (info->fsflags & FS_NODUMP_FL)
                stat->attributes |= STATX_ATTR_NODUMP;

        generic_fillattr(idmap, request_mask, inode, stat);

        if (shmem_is_huge(inode, 0, false, NULL, 0))
                stat->blksize = HPAGE_PMD_SIZE;

        if (!(request_mask & STATX_BTIME))
                return 0;

        stat->result_mask |= STATX_BTIME;
        stat->btime.tv_sec = info->i_crtime.tv_sec;
        stat->btime.tv_nsec = info->i_crtime.tv_nsec;

        return 0;
}

/*
 * Apply the attribute changes described by @attr to a shmem inode.
 *
 * Once setattr_prepare() has accepted the request this enforces the inode's
 * seals (F_SEAL_EXEC against flipping execute bits, F_SEAL_SHRINK and
 * F_SEAL_GROW against size changes), handles a size change on a regular
 * file, calls dquot_initialize()/dquot_transfer() when the change affects
 * quota or ownership, copies the attributes into the inode and finally
 * updates the POSIX ACL (on a mode change) and the timestamps.
 *
 * Returns 0 on success, -EPERM when a seal forbids the change, or the error
 * returned by one of the helpers called along the way.
 */
static int shmem_setattr(struct mnt_idmap *idmap,
                         struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        struct shmem_inode_info *info = SHMEM_I(inode);
        int error;
        bool update_mtime = false;
        bool update_ctime = true;

        error = setattr_prepare(idmap, dentry, attr);
        if (error)
                return error;

        /* With F_SEAL_EXEC set, a mode change may not flip any execute bit */
        if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
                if ((inode->i_mode ^ attr->ia_mode) & 0111) {
                        return -EPERM;
                }
        }

        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
                loff_t oldsize = inode->i_size;
                loff_t newsize = attr->ia_size;

                /* protected by i_rwsem */
                if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
                    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
                        return -EPERM;

                if (newsize != oldsize) {
                        /* shmem_reacct_size() may refuse the new size */
                        error = shmem_reacct_size(SHMEM_I(inode)->flags,
                                        oldsize, newsize);
                        if (error)
                                return error;
                        i_size_write(inode, newsize);
                        update_mtime = true;
                } else {
                        /*
                         * Same size requested: this suppresses the
                         * ctime/i_version update at the end of the function.
                         */
                        update_ctime = false;
                }
                if (newsize <= oldsize) {
                        /* first page boundary at or above the new size */
                        loff_t holebegin = round_up(newsize, PAGE_SIZE);
                        if (oldsize > holebegin)
                                unmap_mapping_range(inode->i_mapping,
                                                        holebegin, 0, 1);
                        /* nothing to truncate unless something is allocated */
                        if (info->alloced)
                                shmem_truncate_range(inode,
                                                        newsize, (loff_t)-1);
                        /* unmap again to remove racily COWed private pages */
                        if (oldsize > holebegin)
                                unmap_mapping_range(inode->i_mapping,
                                                        holebegin, 0, 1);
                }
        }

        if (is_quota_modification(idmap, inode, attr)) {
                error = dquot_initialize(inode);
                if (error)
                        return error;
        }

        /* Transfer quota accounting */
        if (i_uid_needs_update(idmap, attr, inode) ||
            i_gid_needs_update(idmap, attr, inode)) {
                error = dquot_transfer(idmap, inode, attr);
                if (error)
                        return error;
        }

        setattr_copy(idmap, inode, attr);
        if (attr->ia_valid & ATTR_MODE)
                error = posix_acl_chmod(idmap, dentry, inode->i_mode);
        /*
         * Timestamps are only touched when nothing failed and the size
         * branch above did not clear update_ctime; mtime follows ctime
         * only when the size actually changed.
         */
        if (!error && update_ctime) {
                inode_set_ctime_current(inode);
                if (update_mtime)
                        inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
                inode_inc_iversion(inode);
        }
        return error;
}

/*
 * Tear down a shmem inode that is being evicted.
 *
 * When the inode's mapping is a shmem mapping: un-account its size, zero
 * i_size, mark the mapping as exiting, truncate the whole file, and unlink
 * the inode from the superblock's shrinklist and from shmem_swaplist
 * (waiting until shmem_unuse() has let go of it).  In every case the xattrs
 * are freed, shmem_free_inode() is called, the inode is cleared and, with
 * CONFIG_TMPFS_QUOTA, dquot_free_inode() and dquot_drop() are called.
 */
static void shmem_evict_inode(struct inode *inode)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        size_t freed = 0;

        if (shmem_mapping(inode->i_mapping)) {
                shmem_unacct_size(info->flags, inode->i_size);
                inode->i_size = 0;
                mapping_set_exiting(inode->i_mapping);
                shmem_truncate_range(inode, 0, (loff_t)-1);
                /* Unlocked peek first, then re-check under shrinklist_lock */
                if (!list_empty(&info->shrinklist)) {
                        spin_lock(&sbinfo->shrinklist_lock);
                        if (!list_empty(&info->shrinklist)) {
                                list_del_init(&info->shrinklist);
                                sbinfo->shrinklist_len--;
                        }
                        spin_unlock(&sbinfo->shrinklist_lock);
                }
                /* Loop until the inode really is off shmem_swaplist */
                while (!list_empty(&info->swaplist)) {
                        /* Wait while shmem_unuse() is scanning this inode... */
                        wait_var_event(&info->stop_eviction,
                                       !atomic_read(&info->stop_eviction));
                        mutex_lock(&shmem_swaplist_mutex);
                        /* ...but beware of the race if we peeked too early */
                        if (!atomic_read(&info->stop_eviction))
                                list_del_init(&info->swaplist);
                        mutex_unlock(&shmem_swaplist_mutex);
                }
        }

        /*
         * The freed amount is only collected when the superblock has a
         * non-zero max_inodes; it is then handed on to shmem_free_inode().
         */
        simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
        shmem_free_inode(inode->i_sb, freed);
        /* After the full truncate no blocks should remain accounted */
        WARN_ON(inode->i_blocks);
        clear_inode(inode);
#ifdef CONFIG_TMPFS_QUOTA
        dquot_free_inode(inode);
        dquot_drop(inode);
#endif
}

/*
 * Walk @mapping from index @start under rcu_read_lock() and collect the
 * value entries whose swap type equals @type.  Each match is added to
 * @fbatch and its index is stored in the corresponding slot of @indices;
 * the walk ends when the batch is full or the mapping is exhausted.
 *
 * Returns xas.xa_index as left by the walk, converted to int (the caller in
 * this file ignores the return value and looks at the batch instead).
 */
static int shmem_find_swap_entries(struct address_space *mapping,
                                   pgoff_t start, struct folio_batch *fbatch,
                                   pgoff_t *indices, unsigned int type)
{
        XA_STATE(xas, &mapping->i_pages, start);
        struct folio *slot;

        rcu_read_lock();
        xas_for_each(&xas, slot, ULONG_MAX) {
                /* Skip retry entries and anything that is not a value entry. */
                if (xas_retry(&xas, slot) || !xa_is_value(slot))
                        continue;

                /*
                 * Entries of another swap type are skipped.  That includes
                 * the swapin error entries which may be found in the
                 * mapping: they are deliberately ignored here, as everything
                 * that can be done for them has been done.
                 */
                if (swp_type(radix_to_swp_entry(slot)) != type)
                        continue;

                /* Record the index before the batch count moves on. */
                indices[folio_batch_count(fbatch)] = xas.xa_index;
                if (!folio_batch_add(fbatch, slot))
                        break;

                if (!need_resched())
                        continue;

                /* Pause the walk so that we may reschedule under RCU. */
                xas_pause(&xas);
                cond_resched_rcu();
        }
        rcu_read_unlock();

        return xas.xa_index;
}

/*
 * Swap the batched entries of an inode back into the page cache.
 *
 * Every value entry in @fbatch is passed to shmem_swapin_folio() at the
 * index recorded in @indices.  -ENOMEM aborts the batch and is returned;
 * any other failure just skips that entry.  Otherwise the number of folios
 * that were swapped in is returned.
 */
static int shmem_unuse_swap_entries(struct inode *inode,
                struct folio_batch *fbatch, pgoff_t *indices)
{
        struct address_space *mapping = inode->i_mapping;
        int swapped_in = 0;
        int i;

        for (i = 0; i < folio_batch_count(fbatch); i++) {
                struct folio *folio = fbatch->folios[i];
                int err;

                if (!xa_is_value(folio))
                        continue;

                err = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
                                         mapping_gfp_mask(mapping), NULL, NULL);
                if (err == -ENOMEM)
                        return err;
                if (err != 0)
                        continue;

                /* Swapped in: release the folio shmem_swapin_folio() gave us. */
                folio_unlock(folio);
                folio_put(folio);
                swapped_in++;
        }

        return swapped_in;
}

/*
 * Bring the swap entries of type @type found in @inode back into the page
 * cache, one batch at a time.
 *
 * Returns 0 once a scan finds no more matching entries, or the negative
 * error reported by shmem_unuse_swap_entries().
 */
static int shmem_unuse_inode(struct inode *inode, unsigned int type)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t indices[PAGEVEC_SIZE];
        struct folio_batch fbatch;
        pgoff_t start = 0;
        int ret;

        for (;;) {
                folio_batch_init(&fbatch);
                shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
                if (!folio_batch_count(&fbatch))
                        return 0;

                ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
                if (ret < 0)
                        return ret;

                /* The next scan starts at the last index of this batch. */
                start = indices[folio_batch_count(&fbatch) - 1];
        }
}

/*
 * Read all the shared memory data that resides in the swap
 * device 'type' back into memory, so the swap device can be
 * unused.
 *
 * Walks shmem_swaplist, pruning inodes that no longer have anything swapped
 * and calling shmem_unuse_inode() on the others.  Returns 0, or the first
 * non-zero value returned by shmem_unuse_inode(), at which point the walk
 * stops.
 */
int shmem_unuse(unsigned int type)
{
        struct shmem_inode_info *info, *next;
        int error = 0;

        /* Unlocked check: nothing to do when the swaplist is empty */
        if (list_empty(&shmem_swaplist))
                return 0;

        mutex_lock(&shmem_swaplist_mutex);
        list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
                if (!info->swapped) {
                        list_del_init(&info->swaplist);
                        continue;
                }
                /*
                 * Drop the swaplist mutex while searching the inode for swap;
                 * but before doing so, make sure shmem_evict_inode() will not
                 * remove placeholder inode from swaplist, nor let it be freed
                 * (igrab() would protect from unlink, but not from unmount).
                 */
                atomic_inc(&info->stop_eviction);
                mutex_unlock(&shmem_swaplist_mutex);

                error = shmem_unuse_inode(&info->vfs_inode, type);
                cond_resched();

                mutex_lock(&shmem_swaplist_mutex);
                /*
                 * The successor cached by list_for_each_entry_safe() was
                 * read before the mutex was dropped: look it up again now
                 * that the mutex is held, and before info may be unlinked.
                 */
                next = list_next_entry(info, swaplist);
                if (!info->swapped)
                        list_del_init(&info->swaplist);
                /* Last holder of stop_eviction wakes a waiting eviction */
                if (atomic_dec_and_test(&info->stop_eviction))
                        wake_up_var(&info->stop_eviction);
                if (error)
                        break;
        }
        mutex_unlock(&shmem_swaplist_mutex);

        return error;
}

/*
 * Move the page from the page cache to the swap cache.
 *
 * On success the folio has been added to the swap cache, handed to
 * shmem_delete_from_page_cache() together with the radix form of its swap
 * entry, and the result of swap_writepage() is returned.
 *
 * On every failure path the folio is marked dirty again: when called for
 * reclaim AOP_WRITEPAGE_ACTIVATE is returned with the folio still locked,
 * otherwise the folio is unlocked and 0 is returned.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
        struct folio *folio = page_folio(page);
        struct address_space *mapping = folio->mapping;
        struct inode *inode = mapping->host;
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        swp_entry_t swap;
        pgoff_t index;

        /*
         * Our capabilities prevent regular writeback or sync from ever calling
         * shmem_writepage; but a stacking filesystem might use ->writepage of
         * its underlying filesystem, in which case tmpfs should write out to
         * swap only in response to memory pressure, and not for the writeback
         * threads or sync.
         */
        if (WARN_ON_ONCE(!wbc->for_reclaim))
                goto redirty;

        /* Neither VM_LOCKED inodes nor noswap mounts are expected here */
        if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
                goto redirty;

        if (!total_swap_pages)
                goto redirty;

        /*
         * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
         * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
         * and its shmem_writeback() needs them to be split when swapping.
         */
        if (folio_test_large(folio)) {
                /* Ensure the subpages are still dirty */
                folio_test_set_dirty(folio);
                if (split_huge_page(page) < 0)
                        goto redirty;
                /* Look up the folio that holds this page after the split */
                folio = page_folio(page);
                folio_clear_dirty(folio);
        }

        index = folio->index;

        /*
         * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
         * value into swapfile.c, the only way we can correctly account for a
         * fallocated folio arriving here is now to initialize it and write it.
         *
         * That's okay for a folio already fallocated earlier, but if we have
         * not yet completed the fallocation, then (a) we want to keep track
         * of this folio in case we have to undo it, and (b) it may not be a
         * good idea to continue anyway, once we're pushing into swap.  So
         * reactivate the folio, and let shmem_fallocate() quit when too many.
         */
        if (!folio_test_uptodate(folio)) {
                if (inode->i_private) {
                        struct shmem_falloc *shmem_falloc;
                        /* i_private is re-read and examined under i_lock */
                        spin_lock(&inode->i_lock);
                        shmem_falloc = inode->i_private;
                        if (shmem_falloc &&
                            !shmem_falloc->waitq &&
                            index >= shmem_falloc->start &&
                            index < shmem_falloc->next)
                                shmem_falloc->nr_unswapped++;
                        else
                                shmem_falloc = NULL;
                        spin_unlock(&inode->i_lock);
                        /* Non-NULL only if the folio was counted just above */
                        if (shmem_falloc)
                                goto redirty;
                }
                folio_zero_range(folio, 0, folio_size(folio));
                flush_dcache_folio(folio);
                folio_mark_uptodate(folio);
        }

        swap = folio_alloc_swap(folio);
        if (!swap.val)
                goto redirty;

        /*
         * Add inode to shmem_unuse()'s list of swapped-out inodes,
         * if it's not already there.  Do it now before the folio is
         * moved to swap cache, when its pagelock no longer protects
         * the inode from eviction.  But don't unlock the mutex until
         * we've incremented swapped, because shmem_unuse_inode() will
         * prune a !swapped inode from the swaplist under this mutex.
         */
        mutex_lock(&shmem_swaplist_mutex);
        if (list_empty(&info->swaplist))
                list_add(&info->swaplist, &shmem_swaplist);

        if (add_to_swap_cache(folio, swap,
                        __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
                        NULL) == 0) {
                shmem_recalc_inode(inode, 0, 1);
                swap_shmem_alloc(swap);
                shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));

                mutex_unlock(&shmem_swaplist_mutex);
                BUG_ON(folio_mapped(folio));
                return swap_writepage(&folio->page, wbc);
        }

        /* Could not add to the swap cache: give the swap entry back */
        mutex_unlock(&shmem_swaplist_mutex);
        put_swap_folio(folio, swap);
redirty:
        folio_mark_dirty(folio);
        if (wbc->for_reclaim)
                return AOP_WRITEPAGE_ACTIVATE;        /* Return with folio locked */
        folio_unlock(folio);
        return 0;
}

#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
/*
 * Emit ",mpol=<policy>" to @seq for the given mempolicy.  Nothing is
 * printed when @mpol is NULL or its mode is MPOL_DEFAULT.
 */
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
        char policy_str[64];

        if (mpol && mpol->mode != MPOL_DEFAULT) {
                mpol_to_str(policy_str, sizeof(policy_str), mpol);
                seq_printf(seq, ",mpol=%s", policy_str);
        }
}

/*
 * Return the superblock's mempolicy after calling mpol_get() on it, or NULL
 * when the superblock has none.
 */
static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
        struct mempolicy *policy;

        if (!sbinfo->mpol)
                return NULL;

        /* stat_lock prevents replace/use races on sbinfo->mpol */
        raw_spin_lock(&sbinfo->stat_lock);
        policy = sbinfo->mpol;
        mpol_get(policy);
        raw_spin_unlock(&sbinfo->stat_lock);

        return policy;
}
#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
/* Stub used unless both CONFIG_NUMA and CONFIG_TMPFS are set: shows nothing. */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
/* Stub used unless both CONFIG_NUMA and CONFIG_TMPFS are set: no policy. */
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
        return NULL;
}
#endif /* CONFIG_NUMA && CONFIG_TMPFS */

/*
 * Forward declaration: the swapin and folio allocation helpers below call
 * this to obtain the mempolicy (and the index returned through @ilx) for a
 * page offset of the given order.
 */
static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
                        pgoff_t index, unsigned int order, pgoff_t *ilx);

/*
 * Read in the folio for @swap through swap_cluster_readahead(), using the
 * mempolicy that shmem_get_pgoff_policy() returns for @index.  Returns
 * whatever swap_cluster_readahead() returned.
 */
static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
                        struct shmem_inode_info *info, pgoff_t index)
{
        pgoff_t ilx;
        struct mempolicy *policy = shmem_get_pgoff_policy(info, index, 0, &ilx);
        struct folio *swapped_in;

        swapped_in = swap_cluster_readahead(swap, gfp, policy, ilx);
        mpol_cond_put(policy);

        return swapped_in;
}

/*
 * Build a gfp mask from huge_gfp that is never more permissive than
 * limit_gfp.  Some gfp flags grant permissions while others impose
 * limitations, so the two groups are combined differently:
 *
 *  - zone bits come from limit_gfp only;
 *  - "deny" flags (__GFP_NOWARN, __GFP_NORETRY) set in limit_gfp are added;
 *  - "allow" flags (__GFP_IO, __GFP_FS, __GFP_RECLAIM) survive only when
 *    set in both masks;
 *  - every other bit of huge_gfp is kept as it is.
 */
static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
{
        gfp_t allowed = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
        gfp_t denied = __GFP_NOWARN | __GFP_NORETRY;
        gfp_t mask;

        /* Start from huge_gfp without its allow flags and zone bits. */
        mask = huge_gfp & ~(allowed | GFP_ZONEMASK);

        /* Allow allocations only from the originally specified zones. */
        mask |= limit_gfp & GFP_ZONEMASK;

        /* Union of the deny flags, intersection of the allow flags. */
        mask |= limit_gfp & denied;
        mask |= (huge_gfp & limit_gfp) & allowed;

        return mask;
}

/*
 * Allocate pages of order HPAGE_PMD_ORDER for @index under the mempolicy
 * that shmem_get_pgoff_policy() returns for it, and return the result of
 * page_rmappable_folio() on that allocation.
 */
static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
                struct shmem_inode_info *info, pgoff_t index)
{
        pgoff_t ilx;
        struct mempolicy *policy;
        struct page *hpage;

        policy = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx);
        hpage = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, policy, ilx,
                                 numa_node_id());
        mpol_cond_put(policy);

        return page_rmappable_folio(hpage);
}

/*
 * Allocate one order-0 page for @index under the mempolicy that
 * shmem_get_pgoff_policy() returns for it.  The page pointer is returned
 * cast to a folio pointer; the allocator's result is passed through
 * without being checked here.
 */
static struct folio *shmem_alloc_folio(gfp_t gfp,
                struct shmem_inode_info *info, pgoff_t index)
{
        pgoff_t ilx;
        struct mempolicy *policy;
        struct page *new_page;

        policy = shmem_get_pgoff_policy(info, index, 0, &ilx);
        new_page = alloc_pages_mpol(gfp, 0, policy, ilx, numa_node_id());
        mpol_cond_put(policy);

        return (struct folio *)new_page;
}

/*
 * Allocate a folio (HPAGE_PMD_NR pages when @huge, otherwise one page),
 * charge it to the memcg, insert it into the inode's page cache at @index
 * and account its blocks to the inode.
 *
 * On success the folio is returned still locked, flagged swapbacked and
 * added to the LRU.  On failure an ERR_PTR() is returned:
 *   -E2BIG   the huge extent already contains an entry;
 *   -ENOMEM  no folio could be allocated;
 *   -EEXIST  the memcg charge failed and the range is now populated;
 *   or the error from mem_cgroup_charge(), shmem_add_to_page_cache() or
 *   shmem_inode_acct_blocks().
 */
static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
                struct inode *inode, pgoff_t index,
                struct mm_struct *fault_mm, bool huge)
{
        struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct folio *folio;
        long pages;
        int error;

        /* Without CONFIG_TRANSPARENT_HUGEPAGE never attempt a huge folio */
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                huge = false;

        if (huge) {
                pages = HPAGE_PMD_NR;
                index = round_down(index, HPAGE_PMD_NR);

                /*
                 * Check for conflict before waiting on a huge allocation.
                 * Conflict might be that a huge page has just been allocated
                 * and added to page cache by a racing thread, or that there
                 * is already at least one small page in the huge extent.
                 * Be careful to retry when appropriate, but not forever!
                 * Elsewhere -EEXIST would be the right code, but not here.
                 */
                if (xa_find(&mapping->i_pages, &index,
                                index + HPAGE_PMD_NR - 1, XA_PRESENT))
                        return ERR_PTR(-E2BIG);

                folio = shmem_alloc_hugefolio(gfp, info, index);
                if (!folio)
                        count_vm_event(THP_FILE_FALLBACK);
        } else {
                pages = 1;
                folio = shmem_alloc_folio(gfp, info, index);
        }
        if (!folio)
                return ERR_PTR(-ENOMEM);

        __folio_set_locked(folio);
        __folio_set_swapbacked(folio);

        /* From here on only the GFP_RECLAIM_MASK bits of gfp are used */
        gfp &= GFP_RECLAIM_MASK;
        error = mem_cgroup_charge(folio, fault_mm, gfp);
        if (error) {
                /* Report -EEXIST instead if the range has been populated */
                if (xa_find(&mapping->i_pages, &index,
                                index + pages - 1, XA_PRESENT)) {
                        error = -EEXIST;
                } else if (huge) {
                        count_vm_event(THP_FILE_FALLBACK);
                        count_vm_event(THP_FILE_FALLBACK_CHARGE);
                }
                goto unlock;
        }

        error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
        if (error)
                goto unlock;

        error = shmem_inode_acct_blocks(inode, pages);
        if (error) {
                struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
                long freed;
                /*
                 * Try to reclaim some space by splitting a few
                 * large folios beyond i_size on the filesystem.
                 */
                shmem_unused_huge_shrink(sbinfo, NULL, 2);
                /*
                 * And do a shmem_recalc_inode() to account for freed pages:
                 * except our folio is there in cache, so not quite balanced.
                 */
                spin_lock(&info->lock);
                /* "pages +" offsets our folio, already counted in nrpages */
                freed = pages + info->alloced - info->swapped -
                        READ_ONCE(mapping->nrpages);
                if (freed > 0)
                        info->alloced -= freed;
                spin_unlock(&info->lock);
                if (freed > 0)
                        shmem_inode_unacct_blocks(inode, freed);
                /* Second and last attempt at accounting the blocks */
                error = shmem_inode_acct_blocks(inode, pages);
                if (error) {
                        /* The folio is in the page cache by now: take it out */
                        filemap_remove_folio(folio);
                        goto unlock;
                }
        }

        shmem_recalc_inode(inode, pages, 0);
        folio_add_lru(folio);
        return folio;

unlock:
        folio_unlock(folio);
        folio_put(folio);
        return ERR_PTR(error);
}

/*
 * When a page is moved from swapcache to shmem filecache (either by the
 * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
 * shmem_unuse_inode()), it may have been read in earlier from swap, in
 * ignorance of the mapping it belongs to.  If that mapping has special
 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
 * we may need to copy to a suitable page before moving to filecache.
 *
 * In a future release, this may well be extended to respect cpuset and
 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
 * but for now it is a simple matter of zone.
 */
static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
{
        /* Replace only when the folio sits in a zone above what gfp permits. */
        return gfp_zone(gfp) < folio_zonenum(folio);
}

/*
 * Replace the swapcache folio *foliop by a newly allocated copy obtained
 * with @gfp (minus the GFP_CONSTRAINT_MASK bits).
 *
 * On success *foliop is updated to the new folio, which is locked,
 * uptodate, swapbacked, in the swapcache under the same swap entry and on
 * the LRU; the old folio is taken out of the swapcache, unlocked and has
 * two references dropped.
 *
 * Returns 0 on success, -ENOMEM when no new folio could be allocated, or
 * the error from shmem_replace_entry().  In that last case *foliop is left
 * unchanged and it is the new folio that gets cleared and released.
 */
static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
                                struct shmem_inode_info *info, pgoff_t index)
{
        struct folio *old, *new;
        struct address_space *swap_mapping;
        swp_entry_t entry;
        pgoff_t swap_index;
        int error;

        old = *foliop;
        entry = old->swap;
        swap_index = swp_offset(entry);
        swap_mapping = swap_address_space(entry);

        /*
         * We have arrived here because our zones are constrained, so don't
         * limit chance of success by further cpuset and node constraints.
         */
        gfp &= ~GFP_CONSTRAINT_MASK;
        VM_BUG_ON_FOLIO(folio_test_large(old), old);
        new = shmem_alloc_folio(gfp, info, index);
        if (!new)
                return -ENOMEM;

        /* Extra reference on the new folio, on top of the allocation's */
        folio_get(new);
        folio_copy(new, old);
        flush_dcache_folio(new);

        /* Give the copy the same state and swap entry as the original */
        __folio_set_locked(new);
        __folio_set_swapbacked(new);
        folio_mark_uptodate(new);
        new->swap = entry;
        folio_set_swapcache(new);

        /*
         * Our caller will very soon move newpage out of swapcache, but it's
         * a nice clean interface for us to replace oldpage by newpage there.
         */
        xa_lock_irq(&swap_mapping->i_pages);
        error = shmem_replace_entry(swap_mapping, swap_index, old, new);
        if (!error) {
                /* Move the memcg charge and the per-lruvec stats across */
                mem_cgroup_replace_folio(old, new);
                __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
                __lruvec_stat_mod_folio(new, NR_SHMEM, 1);
                __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
                __lruvec_stat_mod_folio(old, NR_SHMEM, -1);
        }
        xa_unlock_irq(&swap_mapping->i_pages);

        if (unlikely(error)) {
                /*
                 * Is this possible?  I think not, now that our callers check
                 * both PageSwapCache and page_private after getting page lock;
                 * but be defensive.  Reverse old to newpage for clear and free.
                 */
                old = new;
        } else {
                folio_add_lru(new);
                *foliop = new;
        }

        /* "old" is whichever folio is being discarded (see above) */
        folio_clear_swapcache(old);
        old->private = NULL;

        folio_unlock(old);
        folio_put_refs(old, 2);
        return error;
}

/*
 * Record a swapin failure for @index: replace the swap entry stored in the
 * inode's mapping by a poisoned swap entry, then remove @folio from the
 * swap cache and free the swap entry.  Nothing is done when the mapping no
 * longer holds @swap at @index.
 */
static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
                                         struct folio *folio, swp_entry_t swap)
{
        struct address_space *mapping = inode->i_mapping;
        void *expected = swp_to_radix_entry(swap);
        void *poisoned = swp_to_radix_entry(make_poisoned_swp_entry());

        /* Somebody else changed the slot first: leave everything alone. */
        if (xa_cmpxchg_irq(&mapping->i_pages, index,
                           expected, poisoned, 0) != expected)
                return;

        folio_wait_writeback(folio);
        delete_from_swap_cache(folio);

        /*
         * A swapin error folio must not stay counted as alloced: otherwise
         * inode->i_blocks would be non-zero when the inode is released,
         * which triggers the WARN_ON(i_blocks) in shmem_evict_inode().
         */
        shmem_recalc_inode(inode, -1, -1);
        swap_free(swap);
}

/*
 * Swap in the folio pointed to by *foliop.
 * Caller has to make sure that *foliop contains a valid swapped folio.
 * Returns 0 and the folio in foliop if success. On failure, returns the
 * error code and NULL in *foliop.
 *
 * On success the returned folio is locked, has been added to the inode's
 * page cache in place of the swap entry, removed from the swap cache and
 * marked dirty, and the swap entry has been freed.
 *
 * Error codes produced here:
 *   -EIO     the entry is a poisoned swap entry, or the folio read from
 *            swap is not uptodate (the mapping's entry is then replaced
 *            through shmem_set_folio_swapin_error());
 *   -EEXIST  the mapping no longer holds @swap at @index, or the folio
 *            found no longer belongs to that swap entry;
 *   -EINVAL  the swap device could not be obtained although the mapping
 *            still holds the entry;
 *   -ENOMEM  shmem_swapin_cluster() returned no folio;
 *   or the error from shmem_replace_folio() / shmem_add_to_page_cache().
 */
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                             struct folio **foliop, enum sgp_type sgp,
                             gfp_t gfp, struct mm_struct *fault_mm,
                             vm_fault_t *fault_type)
{
        struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct swap_info_struct *si;
        struct folio *folio = NULL;
        swp_entry_t swap;
        int error;

        VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
        swap = radix_to_swp_entry(*foliop);
        /* From here on *foliop is NULL unless we succeed */
        *foliop = NULL;

        if (is_poisoned_swp_entry(swap))
                return -EIO;

        si = get_swap_device(swap);
        if (!si) {
                if (!shmem_confirm_swap(mapping, index, swap))
                        return -EEXIST;
                else
                        return -EINVAL;
        }

        /* Look it up and read it in.. */
        folio = swap_cache_get_folio(swap, NULL, 0);
        if (!folio) {
                /* Or update major stats only when swapin succeeds?? */
                if (fault_type) {
                        *fault_type |= VM_FAULT_MAJOR;
                        count_vm_event(PGMAJFAULT);
                        count_memcg_event_mm(fault_mm, PGMAJFAULT);
                }
                /* Here we actually start the io */
                folio = shmem_swapin_cluster(swap, gfp, info, index);
                if (!folio) {
                        error = -ENOMEM;
                        goto failed;
                }
        }

        /* We have to do this with folio locked to prevent races */
        folio_lock(folio);
        if (!folio_test_swapcache(folio) ||
            folio->swap.val != swap.val ||
            !shmem_confirm_swap(mapping, index, swap)) {
                error = -EEXIST;
                goto unlock;
        }
        if (!folio_test_uptodate(folio)) {
                error = -EIO;
                goto failed;
        }
        folio_wait_writeback(folio);

        /*
         * Some architectures may have to restore extra metadata to the
         * folio after reading from swap.
         */
        arch_swap_restore(folio_swap(swap, folio), folio);

        /* May substitute a new folio for the one we hold, via &folio */
        if (shmem_should_replace_folio(folio, gfp)) {
                error = shmem_replace_folio(&folio, gfp, info, index);
                if (error)
                        goto failed;
        }

        /* Insert the folio where the mapping holds the swap entry */
        error = shmem_add_to_page_cache(folio, mapping, index,
                                        swp_to_radix_entry(swap), gfp);
        if (error)
                goto failed;

        shmem_recalc_inode(inode, 0, -1);

        if (sgp == SGP_WRITE)
                folio_mark_accessed(folio);

        delete_from_swap_cache(folio);
        folio_mark_dirty(folio);
        swap_free(swap);
        put_swap_device(si);

        *foliop = folio;
        return 0;
failed:
        /* If the entry changed under us, report -EEXIST instead */
        if (!shmem_confirm_swap(mapping, index, swap))
                error = -EEXIST;
        if (error == -EIO)
                shmem_set_folio_swapin_error(inode, index, folio, swap);
unlock:
        /* folio is still NULL when shmem_swapin_cluster() failed */
        if (folio) {
                folio_unlock(folio);
                folio_put(folio);
        }
        put_swap_device(si);

        return error;
}

/*
 * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache.
 *
 * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
 */
static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
                struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
                struct vm_fault *vmf, vm_fault_t *fault_type)
{
        struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
        struct mm_struct *fault_mm;
        struct folio *folio;
        int error;
        bool alloced;

        /* Only shmem mappings are handled here: warn once on anything else */
        if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
                return -EINVAL;

        /* Refuse indices beyond the largest supported file size */
        if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
                return -EFBIG;
repeat:
        /*
         * Every retry restarts here, so i_size is rechecked each time.
         * Only sgp values up to and including SGP_CACHE are refused at or
         * beyond i_size.
         */
        if (sgp <= SGP_CACHE &&
            ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
                return -EINVAL;

        alloced = false;
        fault_mm = vma ? vma->vm_mm : NULL;

        folio = filemap_get_entry(inode->i_mapping, index);
        /*
         * An entry exists (a folio or a value entry) and the vma has
         * userfaultfd minor mode: put a real folio and pass the fault on.
         * The outcome is reported through *fault_type; we return 0.
         */
        if (folio && vma && userfaultfd_minor(vma)) {
                if (!xa_is_value(folio))
                        folio_put(folio);
                *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
                return 0;
        }

        /*
         * A value entry rather than a folio: let shmem_swapin_folio()
         * resolve it. -EEXIST from there means start the lookup again;
         * any other result (including success) is returned as is.
         */
        if (xa_is_value(folio)) {
                error = shmem_swapin_folio(inode, index, &folio,
                                           sgp, gfp, fault_mm, fault_type);
                if (error == -EEXIST)
                        goto repeat;

                *foliop = folio;
                return error;
        }

        if (folio) {
                folio_lock(folio);

                /* Has the folio been truncated or swapped out? */
                if (unlikely(folio->mapping != inode->i_mapping)) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto repeat;
                }
                if (sgp == SGP_WRITE)
                        folio_mark_accessed(folio);
                /* Uptodate folio: return it locked */
                if (folio_test_uptodate(folio))
                        goto out;
                /* fallocated folio */
                if (sgp != SGP_READ)
                        goto clear;
                /*
                 * SGP_READ treats a not-uptodate folio like a hole:
                 * release it and fall through to the NULL-folio return.
                 */
                folio_unlock(folio);
                folio_put(folio);
        }

        /*
         * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
         * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
         */
        *foliop = NULL;
        if (sgp == SGP_READ)
                return 0;
        if (sgp == SGP_NOALLOC)
                return -ENOENT;

        /*
         * Fast cache lookup and swap lookup did not find it: allocate.
         */

        /* userfaultfd missing mode takes over instead of allocating */
        if (vma && userfaultfd_missing(vma)) {
                *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
                return 0;
        }

        /*
         * Try the huge allocation first when shmem_is_huge() allows it.
         * -EEXIST restarts the lookup; any other failure falls back to
         * the second allocation attempt below.
         * NOTE(review): the last argument of shmem_alloc_and_add_folio()
         * presumably selects a huge folio - confirm against its definition.
         */
        if (shmem_is_huge(inode, index, false, fault_mm,
                          vma ? vma->vm_flags : 0)) {
                gfp_t huge_gfp;

                huge_gfp = vma_thp_gfp_mask(vma);
                huge_gfp = limit_gfp_mask(huge_gfp, gfp);
                folio = shmem_alloc_and_add_folio(huge_gfp,
                                inode, index, fault_mm, true);
                if (!IS_ERR(folio)) {
                        count_vm_event(THP_FILE_ALLOC);
                        goto alloced;
                }
                if (PTR_ERR(folio) == -EEXIST)
                        goto repeat;
        }

        folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false);
        if (IS_ERR(folio)) {
                error = PTR_ERR(folio);
                if (error == -EEXIST)
                        goto repeat;
                /* Nothing to unlock or put on the error path */
                folio = NULL;
                goto unlock;
        }

alloced:
        /* Remembered so the error path removes the new folio again */
        alloced = true;
        if (folio_test_pmd_mappable(folio) &&
            DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
                                        folio_next_index(folio) - 1) {
                struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
                struct shmem_inode_info *info = SHMEM_I(inode);
                /*
                 * Part of the large folio is beyond i_size: subject
                 * to shrink under memory pressure.
                 */
                spin_lock(&sbinfo->shrinklist_lock);
                /*
                 * _careful to defend against unlocked access to
                 * ->shrink_list in shmem_unused_huge_shrink()
                 */
                if (list_empty_careful(&info->shrinklist)) {
                        list_add_tail(&info->shrinklist,
                                      &sbinfo->shrinklist);
                        sbinfo->shrinklist_len++;
                }
                spin_unlock(&sbinfo->shrinklist_lock);
        }

        if (sgp == SGP_WRITE)
                folio_set_referenced(folio);
        /*
         * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
         */
        if (sgp == SGP_FALLOC)
                sgp = SGP_WRITE;
clear:
        /*
         * Let SGP_WRITE caller clear ends if write does not fill folio;
         * but SGP_FALLOC on a folio fallocated earlier must initialize
         * it now, lest undo on failure cancel our earlier guarantee.
         */
        if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
                long i, n = folio_nr_pages(folio);

                /* Zero every page of the folio before marking it uptodate */
                for (i = 0; i < n; i++)
                        clear_highpage(folio_page(folio, i));
                flush_dcache_folio(folio);
                folio_mark_uptodate(folio);
        }

        /* Perhaps the file has been truncated since we checked */
        if (sgp <= SGP_CACHE &&
            ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
                error = -EINVAL;
                goto unlock;
        }
out:
        /* Success: the folio is handed back still locked */
        *foliop = folio;
        return 0;

        /*
         * Error recovery.
         */
unlock:
        /*
         * A folio added by this call is taken out of the page cache again.
         * folio is NULL here only when the allocation itself failed.
         */
        if (alloced)
                filemap_remove_folio(folio);
        shmem_recalc_inode(inode, 0, 0);
        if (folio) {
                folio_unlock(folio);
                folio_put(folio);
        }
        return error;
}

/**
 * shmem_get_folio - find, and lock a shmem folio.
 * @inode:        inode to search
 * @index:        the page index.
 * @foliop:        pointer to the folio if found
 * @sgp:        SGP_* flags to control behavior
 *
 * Looks up the page cache entry at @inode & @index.  If a folio is
 * present, it is returned locked with an increased refcount.
 *
 * If the caller modifies data in the folio, it must call folio_mark_dirty()
 * before unlocking the folio to ensure that the folio is not reclaimed.
 * There is no need to reserve space before calling folio_mark_dirty().
 *
 * When no folio is found, the behavior depends on @sgp:
 *  - for SGP_READ, *@foliop is %NULL and 0 is returned
 *  - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
 *  - for all other flags a new folio is allocated, inserted into the
 *    page cache and returned locked in @foliop.
 *
 * Context: May sleep.
 * Return: 0 if successful, else a negative error code.
 */
int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
                enum sgp_type sgp)
{
        /* Use the mapping's own gfp mask; no vmf or fault_type is passed. */
        gfp_t gfp = mapping_gfp_mask(inode->i_mapping);

        return shmem_get_folio_gfp(inode, index, foliop, sgp, gfp, NULL, NULL);
}
EXPORT_SYMBOL_GPL(shmem_get_folio);

/*
 * This is like autoremove_wake_function, but it removes the wait queue
 * entry unconditionally - even if something else had already woken the
 * target.
 */
static int synchronous_wake_function(wait_queue_entry_t *wait,
                        unsigned int mode, int sync, void *key)
{
        /* Wake first; the entry is unlinked whatever the wakeup reported. */
        const int woken = default_wake_function(wait, mode, sync, key);

        list_del_init(&wait->entry);
        return woken;
}

/*
 * Trinity finds that probing a hole which tmpfs is punching can
 * prevent the hole-punch from ever completing: which in turn
 * locks writers out with its hold on i_rwsem.  So refrain from
 * faulting pages into the hole while it's being punched.  Although
 * shmem_undo_range() does remove the additions, it may be unable to
 * keep up, as each new page needs its own unmap_mapping_range() call,
 * and the i_mmap tree grows ever slower to scan if new vmas are added.
 *
 * It does not matter if we sometimes reach this check just before the
 * hole-punch begins, so that one fault then races with the punch:
 * we just need to make racing faults a rare case.
 *
 * The implementation below would be much simpler if we just used a
 * standard mutex or completion: but we cannot take i_rwsem in fault,
 * and bloating every shmem inode for this unlikely case would be sad.
 */
static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
{
        struct shmem_falloc *shmem_falloc;
        struct file *fpin = NULL;
        vm_fault_t ret = 0;

        /* inode->i_private and the shmem_falloc fields are read under i_lock */
        spin_lock(&inode->i_lock);
        shmem_falloc = inode->i_private;
        /*
         * Wait only if a shmem_falloc with a waitq is installed and the
         * faulting offset lies within [start, next).
         */
        if (shmem_falloc &&
            shmem_falloc->waitq &&
            vmf->pgoff >= shmem_falloc->start &&
            vmf->pgoff < shmem_falloc->next) {
                wait_queue_head_t *shmem_falloc_waitq;
                DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

                /*
                 * VM_FAULT_NOPAGE unless maybe_unlock_mmap_for_io() hands
                 * back a file, in which case it becomes VM_FAULT_RETRY below.
                 */
                ret = VM_FAULT_NOPAGE;
                fpin = maybe_unlock_mmap_for_io(vmf, NULL);
                /* Queue ourselves before dropping i_lock, then sleep */
                shmem_falloc_waitq = shmem_falloc->waitq;
                prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
                                TASK_UNINTERRUPTIBLE);
                spin_unlock(&inode->i_lock);
                schedule();

                /*
                 * shmem_falloc_waitq points into the shmem_fallocate()
                 * stack of the hole-punching task: shmem_falloc_waitq
                 * is usually invalid by the time we reach here, but
                 * finish_wait() does not dereference it in that case;
                 * though i_lock needed lest racing with wake_up_all().
                 */
                spin_lock(&inode->i_lock);
                finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
        }
        spin_unlock(&inode->i_lock);
        /* Drop the file returned by maybe_unlock_mmap_for_io() and retry */
        if (fpin) {
                fput(fpin);
                ret = VM_FAULT_RETRY;
        }
        /* 0 means no wait was needed and the caller may proceed */
        return ret;
}

static vm_fault_t shmem_fault(struct vm_fault *vmf)
{
        struct inode *inode = file_inode(vmf->vma->vm_file);
        gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
        struct folio *folio = NULL;
        vm_fault_t ret = 0;
        int err;

        /*
         * Trinity finds that probing a hole which tmpfs is punching can
         * prevent the hole-punch from ever completing: noted in i_private.
         */
        if (unlikely(inode->i_private)) {
                ret = shmem_falloc_wait(vmf, inode);
                if (ret)
                        return ret;
        }

        WARN_ON_ONCE(vmf->page != NULL);
        err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
                                  gfp, vmf, &ret);
        if (err)
                return vmf_error(err);
        if (folio) {
                vmf->page = folio_file_page(folio, vmf->pgoff);
                ret |= VM_FAULT_LOCKED;
        }
        return ret;
}

/*
 * Pick an address for a mapping of @len bytes at file offset @pgoff.
 * Starts from the plain mm_get_unmapped_area() result and, when huge pages
 * may be used, tries to find an address whose offset within a
 * HPAGE_PMD_SIZE unit matches that of the file offset.  Every bail-out
 * returns the plain result.
 */
unsigned long shmem_get_unmapped_area(struct file *file,
                                      unsigned long uaddr, unsigned long len,
                                      unsigned long pgoff, unsigned long flags)
{
        unsigned long base;
        unsigned long want_off;
        unsigned long padded_len;
        unsigned long padded_addr;
        unsigned long got_off;

        if (len > TASK_SIZE)
                return -ENOMEM;

        base = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
                                    flags);

        /* Must stay a separate, first test: nothing below applies without THP */
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return base;

        /* Errors, unaligned or out-of-range results are passed straight on */
        if (IS_ERR_VALUE(base) || (base & ~PAGE_MASK) ||
            base > TASK_SIZE - len)
                return base;

        /* Huge pages denied, mapping too small, or address fixed by caller */
        if (shmem_huge == SHMEM_HUGE_DENY || len < HPAGE_PMD_SIZE ||
            (flags & MAP_FIXED))
                return base;

        /*
         * Our priority is to support MAP_SHARED mapped hugely;
         * and support MAP_PRIVATE mapped hugely too, until it is COWed.
         * But if caller specified an address hint and we allocated area there
         * successfully, respect that as before.
         */
        if (uaddr == base)
                return base;

        if (shmem_huge != SHMEM_HUGE_FORCE) {
                struct super_block *sb;

                if (!file) {
                        /*
                         * Called directly from mm/mmap.c, or drivers/char/mem.c
                         * for "/dev/zero", to create a shared anonymous object.
                         */
                        if (IS_ERR(shm_mnt))
                                return base;
                        sb = shm_mnt->mnt_sb;
                } else {
                        VM_BUG_ON(file->f_op != &shmem_file_operations);
                        sb = file_inode(file)->i_sb;
                }
                if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
                        return base;
        }

        /* Offset of the file position within a HPAGE_PMD_SIZE unit */
        want_off = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
        if (want_off && want_off + len < 2 * HPAGE_PMD_SIZE)
                return base;
        /* The plain result already has the wanted offset */
        if ((base & (HPAGE_PMD_SIZE-1)) == want_off)
                return base;

        /* Search again with room to slide; give up on overflow or excess */
        padded_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
        if (padded_len > TASK_SIZE || padded_len < len)
                return base;

        padded_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
                                           padded_len, 0, flags);
        if (IS_ERR_VALUE(padded_addr) || (padded_addr & ~PAGE_MASK))
                return base;

        /* Slide forward inside the padded area to reach the wanted offset */
        got_off = padded_addr & (HPAGE_PMD_SIZE-1);
        padded_addr += want_off - got_off;
        if (got_off > want_off)
                padded_addr += HPAGE_PMD_SIZE;

        return padded_addr > TASK_SIZE - len ? base : padded_addr;
}

#ifdef CONFIG_NUMA
static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
        struct inode *inode = file_inode(vma->vm_file);
        return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
                                          unsigned long addr, pgoff_t *ilx)
{
        struct inode *inode = file_inode(vma->vm_file);
        struct shmem_inode_info *info = SHMEM_I(inode);
        /* Page offset of @addr inside the vma */
        pgoff_t vma_page = (addr - vma->vm_start) >> PAGE_SHIFT;

        /*
         * Bias interleave by inode number to distribute better across nodes;
         * but this interface is independent of which page order is used, so
         * supplies only that bias, letting caller apply the offset (adjusted
         * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
         */
        *ilx = inode->i_ino;

        /* The shared policy is looked up by file page index */
        return mpol_shared_policy_lookup(&info->policy,
                                         vma_page + vma->vm_pgoff);
}

/*
 * Look up the shared policy for @index, falling back to the current task's
 * policy when none is found.  *@ilx receives the interleave bias.
 */
static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
                        pgoff_t index, unsigned int order, pgoff_t *ilx)
{
        struct mempolicy *policy;

        /* Bias interleave by inode number to distribute better across nodes */
        *ilx = info->vfs_inode.i_ino + (index >> order);

        policy = mpol_shared_policy_lookup(&info->policy, index);
        if (!policy)
                policy = get_task_policy(current);
        return policy;
}
#else
/*
 * !CONFIG_NUMA variant: reports no policy.  *@ilx is set to 0 and NULL is
 * returned; @info, @index and @order are not used.
 */
static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
                        pgoff_t index, unsigned int order, pgoff_t *ilx)
{
        *ilx = 0;
        return NULL;
}
#endif /* CONFIG_NUMA */

int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
        struct inode *inode = file_inode(file);
        struct shmem_inode_info *info = SHMEM_I(inode);
        int retval = -ENOMEM;

        /*
         * What serializes the accesses to info->flags?
         * ipc_lock_object() when called from shmctl_do_lock(),
         * no serialization needed when called from shm_destroy().
         */
        if (lock && !(info->flags & VM_LOCKED)) {
                if (!user_shm_lock(inode->i_size, ucounts))
                        goto out_nomem;
                info->flags |= VM_LOCKED;
                mapping_set_unevictable(file->f_mapping);
        }
        if (!lock && (info->flags & VM_LOCKED) && ucounts) {
                user_shm_unlock(inode->i_size, ucounts);
                info->flags &= ~VM_LOCKED;
                mapping_clear_unevictable(file->f_mapping);
        }
        retval = 0;

out_nomem:
        return retval;
}

static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file_inode(file);
        struct shmem_inode_info *info = SHMEM_I(inode);
        int ret;

        ret = seal_check_write(info->seals, vma);
        if (ret)
                return ret;

        /* arm64 - allow memory tagging on RAM-based files */
        vm_flags_set(vma, VM_MTE_ALLOWED);

        file_accessed(file);
        /* This is anonymous shared memory if it is unlinked at the time of mmap */
        if (inode->i_nlink)
                vma->vm_ops = &shmem_vm_ops;
        else
                vma->vm_ops = &shmem_anon_vm_ops;
        return 0;
}

/*
 * Open handler: sets FMODE_CAN_ODIRECT in the file mode, then leaves the
 * rest of the open (and the return value) to generic_file_open().
 */
static int shmem_file_open(struct inode *inode, struct file *file)
{
        file->f_mode |= FMODE_CAN_ODIRECT;
        return generic_file_open(inode, file);
}

#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);

/*
 * chattr's fsflags are unrelated to extended attributes,
 * but tmpfs has chosen to enable them under the same config option.
 */
static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
{
        /* The only i_flags bits this function ever sets or clears */
        const unsigned int mask = S_NOATIME | S_APPEND | S_IMMUTABLE;
        unsigned int i_flags;

        /*
         * Translate each FS_*_FL bit into its S_* counterpart.
         * FS_NODUMP_FL does not require any action in i_flags.
         */
        i_flags  = (fsflags & FS_NOATIME_FL) ? S_NOATIME : 0;
        i_flags |= (fsflags & FS_APPEND_FL) ? S_APPEND : 0;
        i_flags |= (fsflags & FS_IMMUTABLE_FL) ? S_IMMUTABLE : 0;

        inode_set_flags(inode, i_flags, mask);
}
#else
/* !CONFIG_TMPFS_XATTR variant: fsflags are not applied to the inode. */
static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
{
}
#define shmem_initxattrs NULL
#endif

static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
{
        return &SHMEM_I(inode)->dir_offsets;
}

/*
 * Allocate and initialise a new inode on @sb.
 *
 * Reserves an inode number, allocates the inode, initialises the generic
 * and shmem-private fields, then sets the operations according to the file
 * type in @mode.  @dir, when not NULL, supplies the inherited fsflags.
 *
 * Return: the new inode, or an ERR_PTR(): the shmem_reserve_inode() error,
 * or -ENOSPC when new_inode() fails.
 */
static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
                                             struct super_block *sb,
                                             struct inode *dir, umode_t mode,
                                             dev_t dev, unsigned long flags)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
        struct shmem_inode_info *info;
        struct inode *inode;
        ino_t ino;
        int error;

        error = shmem_reserve_inode(sb, &ino);
        if (error)
                return ERR_PTR(error);

        inode = new_inode(sb);
        if (!inode) {
                /* NOTE(review): presumably gives back the reservation above */
                shmem_free_inode(sb, 0);
                return ERR_PTR(-ENOSPC);
        }

        /* Generic inode fields */
        inode->i_ino = ino;
        inode_init_owner(idmap, inode, dir, mode);
        inode->i_blocks = 0;
        simple_inode_init_ts(inode);
        inode->i_generation = get_random_u32();

        /* Zero the shmem-private part that precedes the embedded vfs inode */
        info = SHMEM_I(inode);
        memset(info, 0, (char *)inode - (char *)info);
        spin_lock_init(&info->lock);
        atomic_set(&info->stop_eviction, 0);
        info->seals = F_SEAL_SEAL;
        info->flags = flags & VM_NORESERVE;
        info->i_crtime = inode_get_mtime(inode);
        if (dir)
                info->fsflags = SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
        else
                info->fsflags = 0;
        if (info->fsflags)
                shmem_set_inode_flags(inode, info->fsflags);
        INIT_LIST_HEAD(&info->shrinklist);
        INIT_LIST_HEAD(&info->swaplist);
        simple_xattrs_init(&info->xattrs);
        cache_no_acl(inode);
        if (sbinfo->noswap)
                mapping_set_unevictable(inode->i_mapping);
        mapping_set_large_folios(inode->i_mapping);

        /* Per-type operations */
        switch (mode & S_IFMT) {
        case S_IFREG:
                inode->i_mapping->a_ops = &shmem_aops;
                inode->i_op = &shmem_inode_operations;
                inode->i_fop = &shmem_file_operations;
                mpol_shared_policy_init(&info->policy,
                                         shmem_get_sbmpol(sbinfo));
                break;
        case S_IFDIR:
                inc_nlink(inode);
                /* Some things misbehave if size == 0 on a directory */
                inode->i_size = 2 * BOGO_DIRENT_SIZE;
                inode->i_op = &shmem_dir_inode_operations;
                inode->i_fop = &simple_offset_dir_operations;
                simple_offset_init(shmem_get_offset_ctx(inode));
                break;
        case S_IFLNK:
                /*
                 * Must not load anything in the rbtree,
                 * mpol_free_shared_policy will not be called.
                 */
                mpol_shared_policy_init(&info->policy, NULL);
                break;
        default:
                inode->i_op = &shmem_special_inode_operations;
                init_special_inode(inode, mode, dev);
                break;
        }

        lockdep_annotate_inode_mutex_key(inode);
        return inode;
}

#ifdef CONFIG_TMPFS_QUOTA
/*
 * CONFIG_TMPFS_QUOTA variant: create the inode with __shmem_get_inode(),
 * then initialise quota for it and charge the inode to quota.  If either
 * quota step fails the inode is marked S_NOQUOTA, released with iput(),
 * and the error is returned as an ERR_PTR().
 */
static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
                                     struct super_block *sb, struct inode *dir,
                                     umode_t mode, dev_t dev, unsigned long flags)
{
        struct inode *inode;
        int error;

        inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
        if (IS_ERR(inode))
                return inode;

        error = dquot_initialize(inode);
        if (!error) {
                error = dquot_alloc_inode(inode);
                if (!error)
                        return inode;
                /* Quota was initialised but the charge failed */
                dquot_drop(inode);
        }

        inode->i_flags |= S_NOQUOTA;
        iput(inode);
        return ERR_PTR(error);
}
#else
/*
 * !CONFIG_TMPFS_QUOTA variant: no quota handling, so this just forwards
 * all arguments to __shmem_get_inode() and returns its result.
 */
static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
                                     struct super_block *sb, struct inode *dir,
                                     umode_t mode, dev_t dev, unsigned long flags)
{
        return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
}
#endif /* CONFIG_TMPFS_QUOTA */

#ifdef CONFIG_USERFAULTFD
/*
 * Fill one page of a shmem mapping on behalf of userfaultfd.
 *
 * @dst_pmd, @dst_vma, @dst_addr: where the new page is to be mapped
 * @src_addr: user address to copy from (MFILL_ATOMIC_COPY only)
 * @flags: uffd flags; the mode selects copy or zero-fill
 * @foliop: in/out.  If *@foliop is set on entry, that folio is used as the
 *          already filled source and *@foliop is cleared.  If the user copy
 *          fails with page faults disabled, the freshly allocated folio is
 *          stored in *@foliop and -ENOENT is returned so that the caller can
 *          redo the copy outside mmap_lock and call again.
 *
 * One block is accounted to the inode up front and unaccounted again on
 * every failure path.
 *
 * Return: 0 on success; -ENOMEM if accounting or allocation fails; -ENOENT
 * as described above; -EFAULT if the page offset is at or beyond i_size;
 * otherwise the error from mem_cgroup_charge(), shmem_add_to_page_cache()
 * or mfill_atomic_install_pte().
 */
int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
                           struct vm_area_struct *dst_vma,
                           unsigned long dst_addr,
                           unsigned long src_addr,
                           uffd_flags_t flags,
                           struct folio **foliop)
{
        struct inode *inode = file_inode(dst_vma->vm_file);
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct address_space *mapping = inode->i_mapping;
        gfp_t gfp = mapping_gfp_mask(mapping);
        pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
        void *page_kaddr;
        struct folio *folio;
        int ret;
        pgoff_t max_off;

        if (shmem_inode_acct_blocks(inode, 1)) {
                /*
                 * We may have got a page, returned -ENOENT triggering a retry,
                 * and now we find ourselves with -ENOMEM. Release the page, to
                 * avoid a BUG_ON in our caller.
                 */
                if (unlikely(*foliop)) {
                        folio_put(*foliop);
                        *foliop = NULL;
                }
                return -ENOMEM;
        }

        if (!*foliop) {
                /* First attempt: allocate a folio and fill it here */
                ret = -ENOMEM;
                folio = shmem_alloc_folio(gfp, info, pgoff);
                if (!folio)
                        goto out_unacct_blocks;

                if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
                        page_kaddr = kmap_local_folio(folio, 0);
                        /*
                         * The read mmap_lock is held here.  Despite the
                         * mmap_lock being read recursive a deadlock is still
                         * possible if a writer has taken a lock.  For example:
                         *
                         * process A thread 1 takes read lock on own mmap_lock
                         * process A thread 2 calls mmap, blocks taking write lock
                         * process B thread 1 takes page fault, read lock on own mmap lock
                         * process B thread 2 calls mmap, blocks taking write lock
                         * process A thread 1 blocks taking read lock on process B
                         * process B thread 1 blocks taking read lock on process A
                         *
                         * Disable page faults to prevent potential deadlock
                         * and retry the copy outside the mmap_lock.
                         */
                        pagefault_disable();
                        ret = copy_from_user(page_kaddr,
                                             (const void __user *)src_addr,
                                             PAGE_SIZE);
                        pagefault_enable();
                        kunmap_local(page_kaddr);

                        /* fallback to copy_from_user outside mmap_lock */
                        if (unlikely(ret)) {
                                *foliop = folio;
                                ret = -ENOENT;
                                /* don't free the page */
                                goto out_unacct_blocks;
                        }

                        flush_dcache_folio(folio);
                } else {                /* ZEROPAGE */
                        clear_user_highpage(&folio->page, dst_addr);
                }
        } else {
                /* Retry: take over the folio handed back by the caller */
                folio = *foliop;
                VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
                *foliop = NULL;
        }

        /*
         * The non-atomic __folio_* setters are used here, before the folio
         * is added to the page cache below.
         */
        VM_BUG_ON(folio_test_locked(folio));
        VM_BUG_ON(folio_test_swapbacked(folio));
        __folio_set_locked(folio);
        __folio_set_swapbacked(folio);
        __folio_mark_uptodate(folio);

        /* Refuse to fill at or beyond the end of the file */
        ret = -EFAULT;
        max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        if (unlikely(pgoff >= max_off))
                goto out_release;

        ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
        if (ret)
                goto out_release;
        ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
        if (ret)
                goto out_release;

        ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
                                       &folio->page, true, flags);
        if (ret)
                goto out_delete_from_cache;

        /* Success: the folio reference is not dropped here */
        shmem_recalc_inode(inode, 1, 0);
        folio_unlock(folio);
        return 0;
        /* Error unwinding, in reverse order of the steps above */
out_delete_from_cache:
        filemap_remove_folio(folio);
out_release:
        folio_unlock(folio);
        folio_put(folio);
out_unacct_blocks:
        shmem_inode_unacct_blocks(inode, 1);
        return ret;
}
#endif /* CONFIG_USERFAULTFD */

#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;

/*
 * Prepare the page covering @pos for a write of @len bytes.
 *
 * Fails with -EPERM if the inode's seals forbid the write (any write under
 * F_SEAL_WRITE or F_SEAL_FUTURE_WRITE, a growing write under F_SEAL_GROW),
 * and with -EIO if the page is hardware-poisoned.  On success *@pagep is the
 * page within the folio obtained from shmem_get_folio(), left locked.
 */
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;
        struct shmem_inode_info *info = SHMEM_I(inode);
        const pgoff_t index = pos >> PAGE_SHIFT;
        struct folio *folio;
        struct page *page;
        int error;

        /* i_rwsem is held by caller */
        if (unlikely(info->seals & (F_SEAL_GROW |
                                   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
                if ((info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) ||
                    ((info->seals & F_SEAL_GROW) &&
                     pos + len > inode->i_size))
                        return -EPERM;
        }

        error = shmem_get_folio(inode, index, &folio, SGP_WRITE);
        if (error)
                return error;

        page = folio_file_page(folio, index);
        if (PageHWPoison(page)) {
                /* Do not hand out a poisoned page: release the folio */
                folio_unlock(folio);
                folio_put(folio);
                *pagep = NULL;
                return -EIO;
        }

        *pagep = page;
        return 0;
}

/*
 * Finish a write of @copied bytes at @pos: extend i_size if the write went
 * past it, bring the folio uptodate (zeroing whatever the write did not
 * cover), mark it dirty, then unlock and release it.  Returns @copied.
 */
static int
shmem_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct folio *folio = page_folio(page);
        struct inode *inode = mapping->host;
        const loff_t new_end = pos + copied;

        if (new_end > inode->i_size)
                i_size_write(inode, new_end);

        if (!folio_test_uptodate(folio)) {
                const size_t fsize = folio_size(folio);

                if (copied < fsize) {
                        /* Zero the folio before and after the written span */
                        const size_t start = offset_in_folio(folio, pos);

                        folio_zero_segments(folio, 0, start,
                                            start + copied, fsize);
                }
                folio_mark_uptodate(folio);
        }

        folio_mark_dirty(folio);
        folio_unlock(folio);
        folio_put(folio);

        return copied;
}

/*
 * Read from the file at iocb->ki_pos into @to, one page-sized step at a
 * time, until @to is full, i_size is reached or an error occurs.
 *
 * Each step looks the page up with SGP_READ, so holes come back as a NULL
 * folio and are read as zeroes.  iocb->ki_pos is advanced past the bytes
 * copied.
 *
 * Return: the number of bytes copied if any; otherwise 0 at end of file or
 * a negative error (-EIO for a hardware-poisoned page, -EFAULT for a short
 * copy, or the error from shmem_get_folio()).
 */
static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct address_space *mapping = inode->i_mapping;
        pgoff_t index;
        unsigned long offset;
        int error = 0;
        ssize_t retval = 0;
        loff_t *ppos = &iocb->ki_pos;

        /* Split the position into a page index and an offset in that page */
        index = *ppos >> PAGE_SHIFT;
        offset = *ppos & ~PAGE_MASK;

        for (;;) {
                struct folio *folio = NULL;
                struct page *page = NULL;
                pgoff_t end_index;
                unsigned long nr, ret;
                loff_t i_size = i_size_read(inode);

                /* Stop at or beyond i_size before looking anything up */
                end_index = i_size >> PAGE_SHIFT;
                if (index > end_index)
                        break;
                if (index == end_index) {
                        nr = i_size & ~PAGE_MASK;
                        if (nr <= offset)
                                break;
                }

                error = shmem_get_folio(inode, index, &folio, SGP_READ);
                if (error) {
                        /*
                         * -EINVAL is not reported as an error.
                         * NOTE(review): presumably this is the lookup's
                         * beyond-i_size result after a racing truncate -
                         * confirm.
                         */
                        if (error == -EINVAL)
                                error = 0;
                        break;
                }
                if (folio) {
                        /* Keep the reference but drop the lock for the copy */
                        folio_unlock(folio);

                        page = folio_file_page(folio, index);
                        if (PageHWPoison(page)) {
                                folio_put(folio);
                                error = -EIO;
                                break;
                        }
                }

                /*
                 * We must evaluate after, since reads (unlike writes)
                 * are called without i_rwsem protection against truncate
                 */
                nr = PAGE_SIZE;
                i_size = i_size_read(inode);
                end_index = i_size >> PAGE_SHIFT;
                if (index == end_index) {
                        nr = i_size & ~PAGE_MASK;
                        if (nr <= offset) {
                                if (folio)
                                        folio_put(folio);
                                break;
                        }
                }
                /* nr is now the number of bytes available from offset */
                nr -= offset;

                if (folio) {
                        /*
                         * If users can be writing to this page using arbitrary
                         * virtual addresses, take care about potential aliasing
                         * before reading the page on the kernel side.
                         */
                        if (mapping_writably_mapped(mapping))
                                flush_dcache_page(page);
                        /*
                         * Mark the page accessed if we read the beginning.
                         */
                        if (!offset)
                                folio_mark_accessed(folio);
                        /*
                         * Ok, we have the page, and it's up-to-date, so
                         * now we can copy it to user space...
                         */
                        ret = copy_page_to_iter(page, offset, nr, to);
                        folio_put(folio);

                } else if (user_backed_iter(to)) {
                        /*
                         * Copy to user tends to be so well optimized, but
                         * clear_user() not so much, that it is noticeably
                         * faster to copy the zero page instead of clearing.
                         */
                        ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
                } else {
                        /*
                         * But submitting the same page twice in a row to
                         * splice() - or others? - can result in confusion:
                         * so don't attempt that optimization on pipes etc.
                         */
                        ret = iov_iter_zero(nr, to);
                }

                /* Advance by the bytes copied, carrying into the page index */
                retval += ret;
                offset += ret;
                index += offset >> PAGE_SHIFT;
                offset &= ~PAGE_MASK;

                if (!iov_iter_count(to))
                        break;
                /* A short copy with room left in the iterator is a fault */
                if (ret < nr) {
                        error = -EFAULT;
                        break;
                }
                cond_resched();
        }

        *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
        file_accessed(file);
        /* Progress wins over an error */
        return retval ? retval : error;
}

/*
 * Buffered write to a tmpfs file.
 *
 * With the inode lock held, run the generic write checks, strip privileges
 * and update the file times, then hand the data to generic_perform_write().
 * Returns the result of generic_perform_write(), or the zero/negative result
 * of the first preparatory step that stopped the write.
 */
static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *filp = iocb->ki_filp;
        struct inode *host = filp->f_mapping->host;
        ssize_t res;

        inode_lock(host);

        res = generic_write_checks(iocb, from);
        if (res > 0) {
                /* Each step runs only if every earlier one succeeded. */
                res = file_remove_privs(filp);
                if (!res)
                        res = file_update_time(filp);
                if (!res)
                        res = generic_perform_write(iocb, from);
        }

        inode_unlock(host);
        return res;
}

/*
 * ->get() for zero-page pipe buffers: does nothing to @buf and always
 * reports success.
 */
static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
                              struct pipe_buffer *buf)
{
        return true;
}

/* ->release() for zero-page pipe buffers: intentionally a no-op. */
static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
                                  struct pipe_buffer *buf)
{
}

/* ->try_steal() for zero-page pipe buffers: always refuses. */
static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                                    struct pipe_buffer *buf)
{
        return false;
}

/*
 * Operations attached to the ZERO_PAGE(0) buffers that
 * splice_zeropage_into_pipe() places in a pipe.
 */
static const struct pipe_buf_operations zero_pipe_buf_ops = {
        .release        = zero_pipe_buf_release,
        .try_steal        = zero_pipe_buf_try_steal,
        .get                = zero_pipe_buf_get,
};

/*
 * Queue a reference to the zero page in @pipe to stand in for a hole.
 *
 * @pipe: pipe to append to
 * @fpos: file position being read; only its offset within the page is used
 * @size: number of bytes wanted
 *
 * At most the remainder of the page containing @fpos is queued.
 *
 * Returns the number of bytes actually queued.  That is 0 when the pipe has
 * no free slot, so the caller never accounts for data that was not placed
 * in the pipe.
 */
static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
                                        loff_t fpos, size_t size)
{
        size_t offset = fpos & ~PAGE_MASK;
        struct pipe_buffer *buf;

        /* Nothing can be queued, so nothing may be reported as spliced. */
        if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                return 0;

        /* Never run past the end of the page that contains @fpos. */
        size = min_t(size_t, size, PAGE_SIZE - offset);

        buf = pipe_head_buf(pipe);
        *buf = (struct pipe_buffer) {
                .ops        = &zero_pipe_buf_ops,
                .page        = ZERO_PAGE(0),
                .offset        = offset,
                .len        = size,
        };
        pipe->head++;

        return size;
}

/*
 * Splice data from a tmpfs file into a pipe.
 *
 * @in:    file to read from
 * @ppos:  file position; advanced by the number of bytes spliced
 * @pipe:  destination pipe
 * @len:   maximum number of bytes to splice
 * @flags: splice flags (not examined here)
 *
 * Folios found by shmem_get_folio(SGP_READ) are spliced in directly; where
 * it returns no folio, the zero page is spliced instead.  Returns the number
 * of bytes spliced if that is non-zero, otherwise the error code (0 at or
 * beyond end of file, -EIO for a hwpoisoned folio).
 */
static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
                                      struct pipe_inode_info *pipe,
                                      size_t len, unsigned int flags)
{
        struct inode *inode = file_inode(in);
        struct address_space *mapping = inode->i_mapping;
        struct folio *folio = NULL;
        size_t total_spliced = 0, used, npages, n, part;
        loff_t isize;
        int error = 0;

        /* Work out how much data we can actually add into the pipe */
        used = pipe_occupancy(pipe->head, pipe->tail);
        npages = max_t(ssize_t, pipe->max_usage - used, 0);
        len = min_t(size_t, len, npages * PAGE_SIZE);

        do {
                if (*ppos >= i_size_read(inode))
                        break;

                error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
                                        SGP_READ);
                if (error) {
                        /* -EINVAL ends the read without being reported */
                        if (error == -EINVAL)
                                error = 0;
                        break;
                }
                if (folio) {
                        folio_unlock(folio);

                        /*
                         * The reference is still held here; it is dropped
                         * by the folio_put() after the loop.
                         */
                        if (folio_test_hwpoison(folio) ||
                            (folio_test_large(folio) &&
                             folio_test_has_hwpoisoned(folio))) {
                                error = -EIO;
                                break;
                        }
                }

                /*
                 * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Re-reading i_size at this point lets us calculate the
                 * correct value for "part", which means the zero-filled
                 * part of the page is not spliced into the pipe (unless
                 * another truncate extends the file - this is desired though).
                 */
                isize = i_size_read(inode);
                if (unlikely(*ppos >= isize))
                        break;
                part = min_t(loff_t, isize - *ppos, len);

                if (folio) {
                        /*
                         * If users can be writing to this page using arbitrary
                         * virtual addresses, take care about potential aliasing
                         * before reading the page on the kernel side.
                         */
                        if (mapping_writably_mapped(mapping))
                                flush_dcache_folio(folio);
                        folio_mark_accessed(folio);
                        /*
                         * Ok, we have the page, and it's up-to-date, so we can
                         * now splice it into the pipe.
                         */
                        n = splice_folio_into_pipe(pipe, folio, *ppos, part);
                        folio_put(folio);
                        folio = NULL;
                } else {
                        /* No folio was returned: splice the zero page */
                        n = splice_zeropage_into_pipe(pipe, *ppos, part);
                }

                if (!n)
                        break;
                len -= n;
                total_spliced += n;
                *ppos += n;
                in->f_ra.prev_pos = *ppos;
                if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                        break;

                cond_resched();
        } while (len);

        /* Drop the reference left behind by an early break out of the loop */
        if (folio)
                folio_put(folio);

        file_accessed(in);
        return total_spliced ? total_spliced : error;
}

/*
 * llseek for tmpfs files.
 *
 * SEEK_DATA and SEEK_HOLE are answered from the page cache by
 * mapping_seek_hole_data() with the inode lock held; every other @whence is
 * passed to generic_file_llseek_size().  A negative @offset for
 * SEEK_DATA/SEEK_HOLE yields -ENXIO.
 */
static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        loff_t pos;

        switch (whence) {
        case SEEK_DATA:
        case SEEK_HOLE:
                break;
        default:
                return generic_file_llseek_size(file, offset, whence,
                                                MAX_LFS_FILESIZE,
                                                i_size_read(inode));
        }

        if (offset < 0)
                return -ENXIO;

        inode_lock(inode);
        /* i_rwsem is held, so i_size may be read directly */
        pos = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
        if (pos >= 0)
                pos = vfs_setpos(file, pos, MAX_LFS_FILESIZE);
        inode_unlock(inode);

        return pos;
}

/*
 * fallocate() for tmpfs files.
 *
 * @file:   file to operate on
 * @mode:   FALLOC_FL_KEEP_SIZE and/or FALLOC_FL_PUNCH_HOLE; any other bit
 *          yields -EOPNOTSUPP
 * @offset: start of the byte range
 * @len:    length of the byte range
 *
 * Hole punching unmaps and truncates the range.  Otherwise folios are
 * allocated across the range with SGP_FALLOC; if that fails part way, the
 * folios added so far are undone.  While either operation runs, a
 * struct shmem_falloc describing it is published through inode->i_private
 * under inode->i_lock.  The whole call runs with the inode lock held.
 *
 * Returns 0 on success or a negative errno.
 */
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                                                         loff_t len)
{
        struct inode *inode = file_inode(file);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_falloc shmem_falloc;
        pgoff_t start, index, end, undo_fallocend;
        int error;

        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;

        inode_lock(inode);

        if (mode & FALLOC_FL_PUNCH_HOLE) {
                struct address_space *mapping = file->f_mapping;
                /* Only whole pages inside the range are unmapped */
                loff_t unmap_start = round_up(offset, PAGE_SIZE);
                loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
                DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);

                /* protected by i_rwsem */
                if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
                        error = -EPERM;
                        goto out;
                }

                /* Publish the range being punched, with a waitqueue */
                shmem_falloc.waitq = &shmem_falloc_waitq;
                shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
                shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
                spin_lock(&inode->i_lock);
                inode->i_private = &shmem_falloc;
                spin_unlock(&inode->i_lock);

                if ((u64)unmap_end > (u64)unmap_start)
                        unmap_mapping_range(mapping, unmap_start,
                                            1 + unmap_end - unmap_start, 0);
                shmem_truncate_range(inode, offset, offset + len - 1);
                /* No need to unmap again: hole-punching leaves COWed pages */

                /* Withdraw the on-stack descriptor and wake any waiters */
                spin_lock(&inode->i_lock);
                inode->i_private = NULL;
                wake_up_all(&shmem_falloc_waitq);
                WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
                spin_unlock(&inode->i_lock);
                error = 0;
                goto out;
        }

        /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
        error = inode_newsize_ok(inode, offset + len);
        if (error)
                goto out;

        if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
                error = -EPERM;
                goto out;
        }

        /* Page index range [start, end) covering the byte range */
        start = offset >> PAGE_SHIFT;
        end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        /* Try to avoid a swapstorm if len is impossible to satisfy */
        if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
                error = -ENOSPC;
                goto out;
        }

        /* Publish allocation progress; no waitqueue for this mode */
        shmem_falloc.waitq = NULL;
        shmem_falloc.start = start;
        shmem_falloc.next  = start;
        shmem_falloc.nr_falloced = 0;
        shmem_falloc.nr_unswapped = 0;
        spin_lock(&inode->i_lock);
        inode->i_private = &shmem_falloc;
        spin_unlock(&inode->i_lock);

        /*
         * info->fallocend is only relevant when huge pages might be
         * involved: to prevent split_huge_page() freeing fallocated
         * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
         */
        undo_fallocend = info->fallocend;
        if (info->fallocend < end)
                info->fallocend = end;

        for (index = start; index < end; ) {
                struct folio *folio;

                /*
                 * Good, the fallocate(2) manpage permits EINTR: we may have
                 * been interrupted because we are using up too much memory.
                 */
                if (signal_pending(current))
                        error = -EINTR;
                else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
                        error = -ENOMEM;
                else
                        error = shmem_get_folio(inode, index, &folio,
                                                SGP_FALLOC);
                if (error) {
                        info->fallocend = undo_fallocend;
                        /* Remove the !uptodate folios we added */
                        if (index > start) {
                                shmem_undo_range(inode,
                                    (loff_t)start << PAGE_SHIFT,
                                    ((loff_t)index << PAGE_SHIFT) - 1, true);
                        }
                        goto undone;
                }

                /*
                 * Here is a more important optimization than it appears:
                 * a second SGP_FALLOC on the same large folio will clear it,
                 * making it uptodate and un-undoable if we fail later.
                 */
                index = folio_next_index(folio);
                /* Beware 32-bit wraparound */
                if (!index)
                        index--;

                /*
                 * Inform shmem_writepage() how far we have reached.
                 * No need for lock or barrier: we have the page lock.
                 */
                if (!folio_test_uptodate(folio))
                        shmem_falloc.nr_falloced += index - shmem_falloc.next;
                shmem_falloc.next = index;

                /*
                 * If !uptodate, leave it that way so that freeable folios
                 * can be recognized if we need to rollback on error later.
                 * But mark it dirty so that memory pressure will swap rather
                 * than free the folios we are allocating (and SGP_CACHE folios
                 * might still be clean: we now need to mark those dirty too).
                 */
                folio_mark_dirty(folio);
                folio_unlock(folio);
                folio_put(folio);
                cond_resched();
        }

        /* Grow i_size unless the caller asked to keep it */
        if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
                i_size_write(inode, offset + len);
undone:
        spin_lock(&inode->i_lock);
        inode->i_private = NULL;
        spin_unlock(&inode->i_lock);
out:
        if (!error)
                file_modified(file);
        inode_unlock(inode);
        return error;
}

static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);

        buf->f_type = TMPFS_MAGIC;
        buf->f_bsize = PAGE_SIZE;
        buf->f_namelen = NAME_MAX;
        if (sbinfo->max_blocks) {
                buf->f_blocks = sbinfo->max_blocks;
                buf->f_bavail =
                buf->f_bfree  = sbinfo->max_blocks -
                                percpu_counter_sum(&sbinfo->used_blocks);
        }
        if (sbinfo->max_inodes) {
                buf->f_files = sbinfo->max_inodes;
                buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
        }
        /* else leave those fields 0 like simple_statfs */

        buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);

        return 0;
}

/*
 * Create a new inode of type @mode in @dir and bind it to @dentry.
 *
 * The inode gets its ACLs and security xattrs, a directory offset is
 * assigned to @dentry, and the directory's size, timestamps and version are
 * updated.  An extra dentry reference is taken to keep the entry in core.
 * Returns 0 on success or a negative errno, in which case the new inode is
 * released again.
 */
static int
shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
            struct dentry *dentry, umode_t mode, dev_t dev)
{
        struct inode *inode;
        int err;

        inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        err = simple_acl_create(dir, inode);
        if (err)
                goto drop_inode;

        /* -EOPNOTSUPP from the security layer is tolerated */
        err = security_inode_init_security(inode, dir, &dentry->d_name,
                                           shmem_initxattrs, NULL);
        if (err && err != -EOPNOTSUPP)
                goto drop_inode;

        err = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
        if (err)
                goto drop_inode;

        dir->i_size += BOGO_DIRENT_SIZE;
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        inode_inc_iversion(dir);
        d_instantiate(dentry, inode);
        /* Extra count - pin the dentry in core */
        dget(dentry);
        return 0;

drop_inode:
        iput(inode);
        return err;
}

/*
 * Create an inode in @dir that is attached to @file through d_tmpfile()
 * instead of being given a directory entry.
 *
 * If the inode cannot be allocated, the error is passed to
 * finish_open_simple().  If setting up security xattrs or ACLs fails, the
 * inode is released and the error is returned directly.
 */
static int
shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
              struct file *file, umode_t mode)
{
        struct inode *inode;
        int err;

        inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
        if (IS_ERR(inode))
                return finish_open_simple(file, PTR_ERR(inode));

        /* -EOPNOTSUPP from the security layer is tolerated */
        err = security_inode_init_security(inode, dir, NULL,
                                           shmem_initxattrs, NULL);
        if (err && err != -EOPNOTSUPP)
                goto drop_inode;

        err = simple_acl_create(dir, inode);
        if (err)
                goto drop_inode;

        d_tmpfile(file, inode);
        return finish_open_simple(file, 0);

drop_inode:
        iput(inode);
        return err;
}

/*
 * Create a directory: make the inode through shmem_mknod() and, on success,
 * bump the parent's link count.  Returns 0 or a negative errno.
 */
static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                       struct dentry *dentry, umode_t mode)
{
        int err = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);

        if (!err)
                inc_nlink(dir);
        return err;
}

/*
 * Create a regular file by handing off to shmem_mknod() with S_IFREG set.
 * @excl is not consulted here.
 */
static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
                        struct dentry *dentry, umode_t mode, bool excl)
{
        umode_t file_mode = mode | S_IFREG;

        return shmem_mknod(idmap, dir, dentry, file_mode, 0);
}

/*
 * Create a hard link: make @dentry in @dir refer to the inode behind
 * @old_dentry.  Returns 0 on success or a negative errno.
 */
static int shmem_link(struct dentry *old_dentry, struct inode *dir,
                      struct dentry *dentry)
{
        struct inode *inode = d_inode(old_dentry);
        int err;

        /*
         * No ordinary (disk based) filesystem counts links as inodes;
         * but each new link needs a new dentry, pinning lowmem, and
         * tmpfs dentries cannot be pruned until they are unlinked.
         * But if an O_TMPFILE file is linked into the tmpfs, the
         * first link must skip that, to get the accounting right.
         */
        if (inode->i_nlink) {
                err = shmem_reserve_inode(inode->i_sb, NULL);
                if (err)
                        return err;
        }

        err = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
        if (err) {
                /* Give back the reservation made above, if there was one */
                if (inode->i_nlink)
                        shmem_free_inode(inode->i_sb, 0);
                return err;
        }

        dir->i_size += BOGO_DIRENT_SIZE;
        inode_set_mtime_to_ts(dir,
                              inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
        inode_inc_iversion(dir);
        inc_nlink(inode);
        /* New dentry reference */
        ihold(inode);
        /* Extra pinning count for the created dentry */
        dget(dentry);
        d_instantiate(dentry, inode);

        return 0;
}

/*
 * Remove the directory entry @dentry from @dir.
 *
 * For a non-directory that still has other links, the inode-space
 * reservation that shmem_link() makes per extra link is handed back.
 * Always returns 0.
 */
static int shmem_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_inode(dentry);

        if (!S_ISDIR(victim->i_mode) && victim->i_nlink > 1)
                shmem_free_inode(victim->i_sb, 0);

        simple_offset_remove(shmem_get_offset_ctx(dir), dentry);

        dir->i_size -= BOGO_DIRENT_SIZE;
        inode_set_mtime_to_ts(dir,
                              inode_set_ctime_to_ts(dir, inode_set_ctime_current(victim)));
        inode_inc_iversion(dir);
        drop_nlink(victim);
        /* Undo the count from "create" - does all the work */
        dput(dentry);

        return 0;
}

/*
 * Remove the directory @dentry from @dir.
 *
 * Fails with -ENOTEMPTY unless simple_offset_empty() reports it empty.
 * Otherwise one link is dropped from the directory and one from its parent
 * before shmem_unlink() does the rest.
 */
static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
{
        if (simple_offset_empty(dentry)) {
                drop_nlink(d_inode(dentry));
                drop_nlink(dir);
                return shmem_unlink(dir, dentry);
        }

        return -ENOTEMPTY;
}

/*
 * Create a whiteout character device in @old_dir under the same name as
 * @old_dentry.  Returns 0 on success, -ENOMEM if no dentry could be
 * allocated, or the error from shmem_mknod().
 */
static int shmem_whiteout(struct mnt_idmap *idmap,
                          struct inode *old_dir, struct dentry *old_dentry)
{
        struct dentry *wh;
        int err;

        wh = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
        if (!wh)
                return -ENOMEM;

        err = shmem_mknod(idmap, old_dir, wh,
                          S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
        dput(wh);

        /*
         * Cheat and hash the whiteout while the old dentry is still in
         * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
         *
         * d_lookup() will consistently find one of them at this point,
         * not sure which one, but that isn't even important.
         */
        if (!err)
                d_rehash(wh);

        return err;
}

/*
 * The VFS layer already does all the dentry stuff for rename,
 * we just have to decrement the usage count for the target if
 * it exists so that the VFS layer correctly frees it when it
 * gets overwritten.
 *
 * Supported @flags are RENAME_NOREPLACE, RENAME_EXCHANGE and
 * RENAME_WHITEOUT; anything else yields -EINVAL.  Returns 0 on
 * success or a negative errno.
 */
static int shmem_rename2(struct mnt_idmap *idmap,
                         struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
{
        struct inode *inode = d_inode(old_dentry);
        int they_are_dirs = S_ISDIR(inode->i_mode);
        int error;

        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                return -EINVAL;

        /* An exchange is handled entirely by the simple_offset helper */
        if (flags & RENAME_EXCHANGE)
                return simple_offset_rename_exchange(old_dir, old_dentry,
                                                     new_dir, new_dentry);

        if (!simple_offset_empty(new_dentry))
                return -ENOTEMPTY;

        if (flags & RENAME_WHITEOUT) {
                error = shmem_whiteout(idmap, old_dir, old_dentry);
                if (error)
                        return error;
        }

        error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
        if (error)
                return error;

        if (d_really_is_positive(new_dentry)) {
                /* An existing target is unlinked first */
                (void) shmem_unlink(new_dir, new_dentry);
                if (they_are_dirs) {
                        drop_nlink(d_inode(new_dentry));
                        drop_nlink(old_dir);
                }
        } else if (they_are_dirs) {
                /* A directory moves: its link goes from old to new parent */
                drop_nlink(old_dir);
                inc_nlink(new_dir);
        }

        old_dir->i_size -= BOGO_DIRENT_SIZE;
        new_dir->i_size += BOGO_DIRENT_SIZE;
        simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
        inode_inc_iversion(old_dir);
        inode_inc_iversion(new_dir);
        return 0;
}

/*
 * Create a symbolic link @dentry in @dir pointing at @symname.
 *
 * The target, including its terminating NUL, must fit in PAGE_SIZE or
 * -ENAMETOOLONG is returned.  Targets of at most SHORT_SYMLINK_LEN bytes
 * are duplicated into inode->i_link; longer ones are copied into folio 0
 * of the inode's mapping.  Returns 0 on success or a negative errno, in
 * which case the new inode is released.
 */
static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
                         struct dentry *dentry, const char *symname)
{
        int error;
        int len;
        struct inode *inode;
        struct folio *folio;

        /* len counts the terminating NUL */
        len = strlen(symname) + 1;
        if (len > PAGE_SIZE)
                return -ENAMETOOLONG;

        inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
                                VM_NORESERVE);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        /* -EOPNOTSUPP from the security layer is tolerated */
        error = security_inode_init_security(inode, dir, &dentry->d_name,
                                             shmem_initxattrs, NULL);
        if (error && error != -EOPNOTSUPP)
                goto out_iput;

        error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
        if (error)
                goto out_iput;

        /* i_size is the target length without the NUL */
        inode->i_size = len-1;
        if (len <= SHORT_SYMLINK_LEN) {
                inode->i_link = kmemdup(symname, len, GFP_KERNEL);
                if (!inode->i_link) {
                        error = -ENOMEM;
                        goto out_remove_offset;
                }
                inode->i_op = &shmem_short_symlink_operations;
        } else {
                inode_nohighmem(inode);
                inode->i_mapping->a_ops = &shmem_aops;
                error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
                if (error)
                        goto out_remove_offset;
                inode->i_op = &shmem_symlink_inode_operations;
                memcpy(folio_address(folio), symname, len);
                folio_mark_uptodate(folio);
                folio_mark_dirty(folio);
                folio_unlock(folio);
                folio_put(folio);
        }
        dir->i_size += BOGO_DIRENT_SIZE;
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        inode_inc_iversion(dir);
        d_instantiate(dentry, inode);
        /* Extra reference on the dentry, as shmem_mknod() also takes */
        dget(dentry);
        return 0;

out_remove_offset:
        simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
out_iput:
        iput(inode);
        return error;
}

/*
 * Delayed-call callback registered by shmem_get_link(): mark the folio
 * passed as @arg accessed and drop the reference held on it.
 */
static void shmem_put_link(void *arg)
{
        struct folio *folio = arg;

        folio_mark_accessed(folio);
        folio_put(folio);
}

/*
 * Return the target of a symlink whose body is stored in folio 0 of
 * @inode's mapping.
 *
 * With a NULL @dentry only an already cached, uptodate folio is used and
 * -ECHILD is returned otherwise (presumably the RCU-walk case; confirm
 * against the VFS ->get_link() contract).  With a @dentry the folio is
 * obtained through shmem_get_folio().  On success the folio reference is
 * handed to @done, to be dropped later by shmem_put_link().
 */
static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
                                  struct delayed_call *done)
{
        struct folio *folio = NULL;
        int error;

        if (!dentry) {
                folio = filemap_get_folio(inode->i_mapping, 0);
                if (IS_ERR(folio))
                        return ERR_PTR(-ECHILD);
                if (PageHWPoison(folio_page(folio, 0)) ||
                    !folio_test_uptodate(folio)) {
                        folio_put(folio);
                        return ERR_PTR(-ECHILD);
                }
        } else {
                error = shmem_get_folio(inode, 0, &folio, SGP_READ);
                if (error)
                        return ERR_PTR(error);
                if (!folio)
                        return ERR_PTR(-ECHILD);
                /* Unlock and release the folio before failing on poison */
                if (PageHWPoison(folio_page(folio, 0))) {
                        folio_unlock(folio);
                        folio_put(folio);
                        return ERR_PTR(-ECHILD);
                }
                folio_unlock(folio);
        }
        set_delayed_call(done, shmem_put_link, folio);
        return folio_address(folio);
}

#ifdef CONFIG_TMPFS_XATTR

/*
 * Report the inode's user-visible flags (fsflags masked with
 * SHMEM_FL_USER_VISIBLE) in @fa.  Always returns 0.
 */
static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);
        struct shmem_inode_info *info = SHMEM_I(inode);

        fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
        return 0;
}

/*
 * Apply the flags in @fa to the inode behind @dentry.
 *
 * Requests in fsx form, or with any flag outside SHMEM_FL_USER_MODIFIABLE,
 * are rejected with -EOPNOTSUPP.  Otherwise the modifiable bits of fsflags
 * are replaced, the inode flags are refreshed, and ctime and the inode
 * version are updated.  Returns 0 on success.
 */
static int shmem_fileattr_set(struct mnt_idmap *idmap,
                              struct dentry *dentry, struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);
        struct shmem_inode_info *info = SHMEM_I(inode);
        unsigned int kept, requested;

        if (fileattr_has_fsx(fa) ||
            (fa->flags & ~SHMEM_FL_USER_MODIFIABLE))
                return -EOPNOTSUPP;

        /* Keep the non-modifiable bits, take the rest from the request */
        kept = info->fsflags & ~SHMEM_FL_USER_MODIFIABLE;
        requested = fa->flags & SHMEM_FL_USER_MODIFIABLE;
        info->fsflags = kept | requested;

        shmem_set_inode_flags(inode, info->fsflags);
        inode_set_ctime_current(inode);
        inode_inc_iversion(inode);

        return 0;
}

/*
 * Superblocks without xattr inode operations may get some security.* xattr
 * support from the LSM "for free". As soon as we have any other xattrs
 * like ACLs, we also need to implement the security.* handlers at
 * filesystem level, though.
 */

/*
 * Callback for security_inode_init_security() for acquiring xattrs.
 *
 * @inode:       inode receiving the xattrs
 * @xattr_array: array of xattrs, terminated by an entry with a NULL name
 * @fs_info:     not used
 *
 * Each entry is stored on the inode under its name prefixed with
 * XATTR_SECURITY_PREFIX.  When the superblock limits inodes, the space for
 * all entries is first taken from sbinfo->free_ispace.  Returns 0 on
 * success, -ENOSPC if that space is not available, or -ENOMEM if an
 * allocation fails (the reservation is then returned and the inode's
 * xattrs are freed).
 */
static int shmem_initxattrs(struct inode *inode,
                            const struct xattr *xattr_array, void *fs_info)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        const struct xattr *xattr;
        struct simple_xattr *new_xattr;
        size_t ispace = 0;
        size_t len;

        if (sbinfo->max_inodes) {
                /* Total up the space all entries will need */
                for (xattr = xattr_array; xattr->name != NULL; xattr++) {
                        ispace += simple_xattr_space(xattr->name,
                                xattr->value_len + XATTR_SECURITY_PREFIX_LEN);
                }
                if (ispace) {
                        /* Reserve it; ispace == 0 afterwards means failure */
                        raw_spin_lock(&sbinfo->stat_lock);
                        if (sbinfo->free_ispace < ispace)
                                ispace = 0;
                        else
                                sbinfo->free_ispace -= ispace;
                        raw_spin_unlock(&sbinfo->stat_lock);
                        if (!ispace)
                                return -ENOSPC;
                }
        }

        for (xattr = xattr_array; xattr->name != NULL; xattr++) {
                new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
                if (!new_xattr)
                        break;

                /* len includes the terminating NUL of the name */
                len = strlen(xattr->name) + 1;
                new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
                                          GFP_KERNEL_ACCOUNT);
                if (!new_xattr->name) {
                        kvfree(new_xattr);
                        break;
                }

                memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
                       XATTR_SECURITY_PREFIX_LEN);
                memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
                       xattr->name, len);

                simple_xattr_add(&info->xattrs, new_xattr);
        }

        /* The loop stopped early: an allocation failed, so roll back */
        if (xattr->name != NULL) {
                if (ispace) {
                        raw_spin_lock(&sbinfo->stat_lock);
                        sbinfo->free_ispace += ispace;
                        raw_spin_unlock(&sbinfo->stat_lock);
                }
                simple_xattrs_free(&info->xattrs, NULL);
                return -ENOMEM;
        }

        return 0;
}

/*
 * ->get() shared by the tmpfs xattr handlers: expand @name to its full,
 * prefixed form and look it up in the inode's simple xattr store.
 * Returns whatever simple_xattr_get() returns.
 */
static int shmem_xattr_handler_get(const struct xattr_handler *handler,
                                   struct dentry *unused, struct inode *inode,
                                   const char *name, void *buffer, size_t size)
{
        const char *full_name = xattr_full_name(handler, name);

        return simple_xattr_get(&SHMEM_I(inode)->xattrs, full_name,
                                buffer, size);
}

/*
 * ->set() shared by the tmpfs xattr handlers.
 *
 * Sets the xattr @name (expanded to its full, prefixed form) on @inode to
 * @value/@size through simple_xattr_set().  When the superblock limits
 * inodes, space for a non-NULL @value is first taken from
 * sbinfo->free_ispace, and -ENOSPC is returned if it is not available.
 *
 * Returns 0 on success or the error encoded in the pointer returned by
 * simple_xattr_set().
 */
static int shmem_xattr_handler_set(const struct xattr_handler *handler,
                                   struct mnt_idmap *idmap,
                                   struct dentry *unused, struct inode *inode,
                                   const char *name, const void *value,
                                   size_t size, int flags)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        struct simple_xattr *old_xattr;
        size_t ispace = 0;

        name = xattr_full_name(handler, name);
        if (value && sbinfo->max_inodes) {
                /* Reserve space; ispace == 0 afterwards means failure */
                ispace = simple_xattr_space(name, size);
                raw_spin_lock(&sbinfo->stat_lock);
                if (sbinfo->free_ispace < ispace)
                        ispace = 0;
                else
                        sbinfo->free_ispace -= ispace;
                raw_spin_unlock(&sbinfo->stat_lock);
                if (!ispace)
                        return -ENOSPC;
        }

        old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags);
        if (!IS_ERR(old_xattr)) {
                /*
                 * Success: the reservation is now in use, so ispace is
                 * reused for the space of the xattr that was replaced.
                 */
                ispace = 0;
                if (old_xattr && sbinfo->max_inodes)
                        ispace = simple_xattr_space(old_xattr->name,
                                                    old_xattr->size);
                simple_xattr_free(old_xattr);
                /* NULL makes the PTR_ERR() below evaluate to 0 */
                old_xattr = NULL;
                inode_set_ctime_current(inode);
                inode_inc_iversion(inode);
        }
        /*
         * Hand back either the unused reservation (on failure) or the
         * space of the replaced xattr (on success).
         */
        if (ispace) {
                raw_spin_lock(&sbinfo->stat_lock);
                sbinfo->free_ispace += ispace;
                raw_spin_unlock(&sbinfo->stat_lock);
        }
        return PTR_ERR(old_xattr);
}

/* Handler for xattr names starting with XATTR_SECURITY_PREFIX. */
static const struct xattr_handler shmem_security_xattr_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
        .get = shmem_xattr_handler_get,
        .set = shmem_xattr_handler_set,
};

/* Handler for xattr names starting with XATTR_TRUSTED_PREFIX. */
static const struct xattr_handler shmem_trusted_xattr_handler = {
        .prefix = XATTR_TRUSTED_PREFIX,
        .get = shmem_xattr_handler_get,
        .set = shmem_xattr_handler_set,
};

/* Handler for xattr names starting with XATTR_USER_PREFIX. */
static const struct xattr_handler shmem_user_xattr_handler = {
        .prefix = XATTR_USER_PREFIX,
        .get = shmem_xattr_handler_get,
        .set = shmem_xattr_handler_set,
};

/* NULL-terminated table of the xattr handlers defined above. */
static const struct xattr_handler * const shmem_xattr_handlers[] = {
        &shmem_security_xattr_handler,
        &shmem_trusted_xattr_handler,
        &shmem_user_xattr_handler,
        NULL
};

/*
 * List the xattr names held in the inode's simple xattr store into
 * @buffer.  Returns whatever simple_xattr_list() returns.
 */
static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
        struct inode *inode = d_inode(dentry);

        return simple_xattr_list(inode, &SHMEM_I(inode)->xattrs, buffer, size);
}
#endif /* CONFIG_TMPFS_XATTR */

/*
 * Inode operations for symlinks whose target is kept in inode->i_link
 * (installed by shmem_symlink() for targets up to SHORT_SYMLINK_LEN).
 */
static const struct inode_operations shmem_short_symlink_operations = {
        .getattr        = shmem_getattr,
        .setattr        = shmem_setattr,
        .get_link        = simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
#endif
};

/*
 * Inode operations for symlinks whose target is stored in a folio
 * (installed by shmem_symlink() for targets over SHORT_SYMLINK_LEN).
 */
static const struct inode_operations shmem_symlink_inode_operations = {
        .getattr        = shmem_getattr,
        .setattr        = shmem_setattr,
        .get_link        = shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
#endif
};

/* ->get_parent() for export: parent lookup always fails with -ESTALE. */
static struct dentry *shmem_get_parent(struct dentry *child)
{
        return ERR_PTR(-ESTALE);
}

static int shmem_match(struct inode *ino, void *vfh)
{
        __u32 *fh = vfh;
        __u64 inum = fh[2];
        inum = (inum << 32) | fh[1];
        return ino->i_ino == inum && fh[0] == ino->i_generation;
}

/*
 * Return a dentry for @inode: the result of d_find_alias() if there is
 * one, otherwise whatever d_find_any_alias() returns.
 */
static struct dentry *shmem_find_alias(struct inode *inode)
{
        struct dentry *alias = d_find_alias(inode);

        if (!alias)
                alias = d_find_any_alias(inode);

        return alias;
}

static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
                struct fid *fid, int fh_len, int fh_type)
{
        struct inode *inode;
        struct dentry *dentry = NULL;
        u64 inum;

        if (fh_len < 3)
                return NULL;

        inum = fid->raw[2];
        inum = (inum << 32) | fid->raw[1];

        inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
                        shmem_match, fid->raw);
        if (inode) {
                dentry = shmem_find_alias(inode);
                iput(inode);
        }

        return dentry;
}

/*
 * Export-ops ->encode_fh: write a three-word file handle for @inode.
 *
 * On entry *@len is the space available in @fh, counted in 32-bit words.
 * If it is less than 3, *@len is set to the required 3 and
 * FILEID_INVALID is returned.  Otherwise the handle is filled in as
 * [0] generation, [1] low 32 bits of i_ino, [2] high 32 bits of i_ino,
 * *@len is set to 3 and 1 is returned.  @parent is not used.
 */
static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
                                struct inode *parent)
{
        /*
         * Inodes are hashed lazily here rather than at creation time.
         * Since insert_inode_hash is not idempotent, this lock makes
         * sure the insertion is only attempted once per inode.
         */
        static DEFINE_SPINLOCK(hash_lock);

        if (*len < 3) {
                *len = 3;
                return FILEID_INVALID;
        }

        if (inode_unhashed(inode)) {
                spin_lock(&hash_lock);
                /* Re-check under the lock: someone may have beaten us. */
                if (inode_unhashed(inode))
                        __insert_inode_hash(inode,
                                            inode->i_ino + inode->i_generation);
                spin_unlock(&hash_lock);
        }

        fh[0] = inode->i_generation;
        fh[1] = inode->i_ino;
        fh[2] = ((__u64)inode->i_ino) >> 32;
        *len = 3;

        return 1;
}

/*
 * File-handle export operations; installed as sb->s_export_op by
 * shmem_fill_super() when CONFIG_TMPFS is enabled.
 */
static const struct export_operations shmem_export_ops = {
        .get_parent     = shmem_get_parent,
        .encode_fh      = shmem_encode_fh,
        .fh_to_dentry        = shmem_fh_to_dentry,
};

/*
 * Identifiers for the mount parameters listed in shmem_fs_parameters[].
 * fs_parse() hands one of these back to shmem_parse_one(), which
 * switches on it.
 */
enum shmem_param {
        Opt_gid,
        Opt_huge,
        Opt_mode,
        Opt_mpol,
        Opt_nr_blocks,
        Opt_nr_inodes,
        Opt_size,
        Opt_uid,
        Opt_inode32,
        Opt_inode64,
        Opt_noswap,
        Opt_quota,
        Opt_usrquota,
        Opt_grpquota,
        Opt_usrquota_block_hardlimit,
        Opt_usrquota_inode_hardlimit,
        Opt_grpquota_block_hardlimit,
        Opt_grpquota_inode_hardlimit,
};

/*
 * Accepted string values for the "huge=" mount parameter and the
 * SHMEM_HUGE_* constant each one maps to.  Terminated by an empty entry.
 */
static const struct constant_table shmem_param_enums_huge[] = {
        {"never",        SHMEM_HUGE_NEVER },
        {"always",        SHMEM_HUGE_ALWAYS },
        {"within_size",        SHMEM_HUGE_WITHIN_SIZE },
        {"advise",        SHMEM_HUGE_ADVISE },
        {}
};

/*
 * Mount parameter specification handed to fs_parse() by
 * shmem_parse_one().  Each entry ties a parameter name and value type to
 * an enum shmem_param identifier.  The quota parameters are only
 * recognised when CONFIG_TMPFS_QUOTA is enabled.  Terminated by an empty
 * entry.
 */
const struct fs_parameter_spec shmem_fs_parameters[] = {
        fsparam_u32   ("gid",                Opt_gid),
        fsparam_enum  ("huge",                Opt_huge,  shmem_param_enums_huge),
        fsparam_u32oct("mode",                Opt_mode),
        fsparam_string("mpol",                Opt_mpol),
        fsparam_string("nr_blocks",        Opt_nr_blocks),
        fsparam_string("nr_inodes",        Opt_nr_inodes),
        fsparam_string("size",                Opt_size),
        fsparam_u32   ("uid",                Opt_uid),
        fsparam_flag  ("inode32",        Opt_inode32),
        fsparam_flag  ("inode64",        Opt_inode64),
        fsparam_flag  ("noswap",        Opt_noswap),
#ifdef CONFIG_TMPFS_QUOTA
        fsparam_flag  ("quota",                Opt_quota),
        fsparam_flag  ("usrquota",        Opt_usrquota),
        fsparam_flag  ("grpquota",        Opt_grpquota),
        fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
        fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
        fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
        fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
#endif
        {}
};

/*
 * ->parse_param: parse a single mount parameter into the shmem_options
 * stored in fc->fs_private.
 *
 * Returns 0 when the parameter was accepted, the negative value from
 * fs_parse() when it could not be matched, or the result of invalfc()
 * when the value is rejected.  Options that later code must tell apart
 * from "not given" also set a SHMEM_SEEN_* bit in ctx->seen.
 */
static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
{
        struct shmem_options *ctx = fc->fs_private;
        struct fs_parse_result result;
        unsigned long long size;
        char *rest;
        int opt;
        kuid_t kuid;
        kgid_t kgid;

        opt = fs_parse(fc, shmem_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        switch (opt) {
        case Opt_size:
                size = memparse(param->string, &rest);
                /*
                 * A trailing '%' makes the number a percentage of total
                 * RAM: scale by page size and page count, divide by 100.
                 */
                if (*rest == '%') {
                        size <<= PAGE_SHIFT;
                        size *= totalram_pages();
                        do_div(size, 100);
                        rest++;
                }
                /* Anything left after the number (and '%') is an error. */
                if (*rest)
                        goto bad_value;
                /* Stored as a page count, rounded up. */
                ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
                ctx->seen |= SHMEM_SEEN_BLOCKS;
                break;
        case Opt_nr_blocks:
                ctx->blocks = memparse(param->string, &rest);
                if (*rest || ctx->blocks > LONG_MAX)
                        goto bad_value;
                ctx->seen |= SHMEM_SEEN_BLOCKS;
                break;
        case Opt_nr_inodes:
                ctx->inodes = memparse(param->string, &rest);
                /* Bound the count so inodes * BOGO_INODE_SIZE fits in an unsigned long. */
                if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
                        goto bad_value;
                ctx->seen |= SHMEM_SEEN_INODES;
                break;
        case Opt_mode:
                /* Keep only the low twelve mode bits (07777). */
                ctx->mode = result.uint_32 & 07777;
                break;
        case Opt_uid:
                kuid = make_kuid(current_user_ns(), result.uint_32);
                if (!uid_valid(kuid))
                        goto bad_value;

                /*
                 * The requested uid must be representable in the
                 * filesystem's idmapping.
                 */
                if (!kuid_has_mapping(fc->user_ns, kuid))
                        goto bad_value;

                ctx->uid = kuid;
                break;
        case Opt_gid:
                kgid = make_kgid(current_user_ns(), result.uint_32);
                if (!gid_valid(kgid))
                        goto bad_value;

                /*
                 * The requested gid must be representable in the
                 * filesystem's idmapping.
                 */
                if (!kgid_has_mapping(fc->user_ns, kgid))
                        goto bad_value;

                ctx->gid = kgid;
                break;
        case Opt_huge:
                ctx->huge = result.uint_32;
                /*
                 * Any setting other than "never" needs THP support both
                 * compiled in and reported by has_transparent_hugepage().
                 */
                if (ctx->huge != SHMEM_HUGE_NEVER &&
                    !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
                      has_transparent_hugepage()))
                        goto unsupported_parameter;
                ctx->seen |= SHMEM_SEEN_HUGE;
                break;
        case Opt_mpol:
                if (IS_ENABLED(CONFIG_NUMA)) {
                        /* Drop any policy parsed from an earlier mpol= first. */
                        mpol_put(ctx->mpol);
                        ctx->mpol = NULL;
                        if (mpol_parse_str(param->string, &ctx->mpol))
                                goto bad_value;
                        break;
                }
                /* Without CONFIG_NUMA the option is rejected outright. */
                goto unsupported_parameter;
        case Opt_inode32:
                ctx->full_inums = false;
                ctx->seen |= SHMEM_SEEN_INUMS;
                break;
        case Opt_inode64:
                if (sizeof(ino_t) < 8) {
                        return invalfc(fc,
                                       "Cannot use inode64 with <64bit inums in kernel\n");
                }
                ctx->full_inums = true;
                ctx->seen |= SHMEM_SEEN_INUMS;
                break;
        case Opt_noswap:
                /* Only allowed in the initial user namespace with CAP_SYS_ADMIN. */
                if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
                        return invalfc(fc,
                                       "Turning off swap in unprivileged tmpfs mounts unsupported");
                }
                ctx->noswap = true;
                ctx->seen |= SHMEM_SEEN_NOSWAP;
                break;
        case Opt_quota:
                /* "quota" enables both user and group quota types. */
                if (fc->user_ns != &init_user_ns)
                        return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
                ctx->seen |= SHMEM_SEEN_QUOTA;
                ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP);
                break;
        case Opt_usrquota:
                if (fc->user_ns != &init_user_ns)
                        return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
                ctx->seen |= SHMEM_SEEN_QUOTA;
                ctx->quota_types |= QTYPE_MASK_USR;
                break;
        case Opt_grpquota:
                if (fc->user_ns != &init_user_ns)
                        return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
                ctx->seen |= SHMEM_SEEN_QUOTA;
                ctx->quota_types |= QTYPE_MASK_GRP;
                break;
        /*
         * The four hard-limit options share one shape: a non-zero
         * memparse() value with no trailing characters, capped by
         * SHMEM_QUOTA_MAX_SPC_LIMIT (block limits) or
         * SHMEM_QUOTA_MAX_INO_LIMIT (inode limits).
         */
        case Opt_usrquota_block_hardlimit:
                size = memparse(param->string, &rest);
                if (*rest || !size)
                        goto bad_value;
                if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
                        return invalfc(fc,
                                       "User quota block hardlimit too large.");
                ctx->qlimits.usrquota_bhardlimit = size;
                break;
        case Opt_grpquota_block_hardlimit:
                size = memparse(param->string, &rest);
                if (*rest || !size)
                        goto bad_value;
                if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
                        return invalfc(fc,
                                       "Group quota block hardlimit too large.");
                ctx->qlimits.grpquota_bhardlimit = size;
                break;
        case Opt_usrquota_inode_hardlimit:
                size = memparse(param->string, &rest);
                if (*rest || !size)
                        goto bad_value;
                if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
                        return invalfc(fc,
                                       "User quota inode hardlimit too large.");
                ctx->qlimits.usrquota_ihardlimit = size;
                break;
        case Opt_grpquota_inode_hardlimit:
                size = memparse(param->string, &rest);
                if (*rest || !size)
                        goto bad_value;
                if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
                        return invalfc(fc,
                                       "Group quota inode hardlimit too large.");
                ctx->qlimits.grpquota_ihardlimit = size;
                break;
        }
        return 0;

        /* Shared error exits; both report the offending parameter key. */
unsupported_parameter:
        return invalfc(fc, "Unsupported parameter '%s'", param->key);
bad_value:
        return invalfc(fc, "Bad value for '%s'", param->key);
}

/*
 * ->parse_monolithic: split a comma-separated option string and feed
 * each "key[=value]" piece to vfs_parse_fs_string().
 *
 * @data is modified in place: separators are overwritten with NULs.
 * A comma directly followed by a digit is NOT treated as a separator,
 * because mpol's nodelist may itself contain commas.
 *
 * Returns 0 on success (including a NULL @data), the error from
 * security_sb_eat_lsm_opts(), or the first negative value returned by
 * vfs_parse_fs_string().
 */
static int shmem_parse_options(struct fs_context *fc, void *data)
{
        char *cursor = data;
        int err;

        if (!cursor)
                return 0;

        /* Let the LSM consume its own options first. */
        err = security_sb_eat_lsm_opts(cursor, &fc->security);
        if (err)
                return err;

        while (cursor) {
                char *key = cursor;
                char *value;
                size_t len;

                /*
                 * Find the end of this option and NUL-terminate it.
                 * Commas followed by a digit belong to the current
                 * value, so keep scanning past them.
                 */
                while ((cursor = strchr(cursor, ',')) != NULL) {
                        cursor++;
                        if (!isdigit(*cursor)) {
                                cursor[-1] = '\0';
                                break;
                        }
                }

                /* Skip empty options such as those produced by ",,". */
                if (!*key)
                        continue;

                value = strchr(key, '=');
                if (value) {
                        *value++ = '\0';
                        len = strlen(value);
                } else {
                        len = 0;
                }

                err = vfs_parse_fs_string(fc, key, value, len);
                if (err < 0)
                        return err;
        }
        return 0;
}

/*
 * Reconfigure a shmem filesystem.
 *
 * ->reconfigure: validate the options parsed into fc->fs_private against
 * the live superblock state and, only if every check passes, apply them.
 * Validation and update both run under sbinfo->stat_lock.
 *
 * Returns 0 on success, or the result of invalfc() carrying a message
 * that names the first check that failed; nothing is changed in that
 * case.
 */
static int shmem_reconfigure(struct fs_context *fc)
{
        struct shmem_options *ctx = fc->fs_private;
        struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
        unsigned long used_isp;
        struct mempolicy *mpol = NULL;
        const char *err;

        raw_spin_lock(&sbinfo->stat_lock);
        /* Inode space currently in use: configured total minus what is free. */
        used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;

        /* A new non-zero block limit needs an existing limit and must cover current use. */
        if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
                if (!sbinfo->max_blocks) {
                        err = "Cannot retroactively limit size";
                        goto out;
                }
                if (percpu_counter_compare(&sbinfo->used_blocks,
                                           ctx->blocks) > 0) {
                        err = "Too small a size for current use";
                        goto out;
                }
        }
        /* Same rules for a new non-zero inode limit. */
        if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
                if (!sbinfo->max_inodes) {
                        err = "Cannot retroactively limit inodes";
                        goto out;
                }
                if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
                        err = "Too few inodes for current use";
                        goto out;
                }
        }

        /* Cannot drop to 32-bit inums once next_ino has passed UINT_MAX. */
        if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
            sbinfo->next_ino > UINT_MAX) {
                err = "Current inum too high to switch to 32-bit inums";
                goto out;
        }
        /* noswap requested now, but the mount currently has swap enabled. */
        if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
                err = "Cannot disable swap on remount";
                goto out;
        }
        /* noswap omitted now, but the mount currently has swap disabled. */
        if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
                err = "Cannot enable swap on remount if it was disabled on first mount";
                goto out;
        }

        /* Quota options are only accepted if some quota is already loaded. */
        if (ctx->seen & SHMEM_SEEN_QUOTA &&
            !sb_any_quota_loaded(fc->root->d_sb)) {
                err = "Cannot enable quota on remount";
                goto out;
        }

#ifdef CONFIG_TMPFS_QUOTA
/*
 * True when the named limit was given on remount (non-zero) and differs
 * from the value stored in sbinfo.
 * NOTE(review): the macro is not #undef'd after use - confirm intended.
 */
#define CHANGED_LIMIT(name)                                                \
        (ctx->qlimits.name## hardlimit &&                                \
        (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))

        if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) ||
            CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) {
                err = "Cannot change global quota limit on remount";
                goto out;
        }
#endif /* CONFIG_TMPFS_QUOTA */

        /* All checks passed: apply only the options that were actually given. */
        if (ctx->seen & SHMEM_SEEN_HUGE)
                sbinfo->huge = ctx->huge;
        if (ctx->seen & SHMEM_SEEN_INUMS)
                sbinfo->full_inums = ctx->full_inums;
        if (ctx->seen & SHMEM_SEEN_BLOCKS)
                sbinfo->max_blocks  = ctx->blocks;
        if (ctx->seen & SHMEM_SEEN_INODES) {
                sbinfo->max_inodes  = ctx->inodes;
                /* Recompute free space so the in-use amount is preserved. */
                sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
        }

        /*
         * Preserve previous mempolicy unless mpol remount option was specified.
         */
        if (ctx->mpol) {
                mpol = sbinfo->mpol;
                sbinfo->mpol = ctx->mpol;        /* transfers initial ref */
                ctx->mpol = NULL;
        }

        if (ctx->noswap)
                sbinfo->noswap = true;

        raw_spin_unlock(&sbinfo->stat_lock);
        /* Release the replaced policy (if any) after dropping the lock. */
        mpol_put(mpol);
        return 0;
out:
        raw_spin_unlock(&sbinfo->stat_lock);
        return invalfc(fc, "%s", err);
}

/*
 * ->show_options: print the mount options of the tmpfs instance rooted
 * at @root into @seq.  Size, inode count, mode, uid and gid are only
 * printed when they differ from the defaults they are compared against
 * below.  Always returns 0.
 */
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
{
        struct super_block *sb = root->d_sb;
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
        struct mempolicy *mpol;

        if (sbinfo->max_blocks != shmem_default_max_blocks())
                seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks));

        if (sbinfo->max_inodes != shmem_default_max_inodes())
                seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);

        if (sbinfo->mode != (0777 | S_ISVTX))
                seq_printf(seq, ",mode=%03ho", sbinfo->mode);

        if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
                seq_printf(seq, ",uid=%u",
                                from_kuid_munged(&init_user_ns, sbinfo->uid));

        if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
                seq_printf(seq, ",gid=%u",
                                from_kgid_munged(&init_user_ns, sbinfo->gid));

        /*
         * inode{64,32} is worth showing even when it matches the system
         * default: it saves people from cross-checking this output with
         * /proc/config.gz (which may not exist without IKCONFIG_PROC) to
         * confirm that 64-bit inums really took effect.
         *
         * It is hidden only when inode64 is not the default and 32-bit
         * inodes are in use, since then the feature is probably not
         * under consideration at all:
         *
         *                     +-----------------+-----------------+
         *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
         *  +------------------+-----------------+-----------------+
         *  | full_inums=true  | show            | show            |
         *  | full_inums=false | show            | hide            |
         *  +------------------+-----------------+-----------------+
         */
        if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
                seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
        if (sbinfo->huge)
                seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
#endif

        /* Take the policy, print it, then give it back. */
        mpol = shmem_get_sbmpol(sbinfo);
        shmem_show_mpol(seq, mpol);
        mpol_put(mpol);

        if (sbinfo->noswap)
                seq_printf(seq, ",noswap");

#ifdef CONFIG_TMPFS_QUOTA
        if (sb_has_quota_active(sb, USRQUOTA))
                seq_printf(seq, ",usrquota");
        if (sb_has_quota_active(sb, GRPQUOTA))
                seq_printf(seq, ",grpquota");

        /* Hard limits are only printed when non-zero. */
        if (sbinfo->qlimits.usrquota_bhardlimit)
                seq_printf(seq, ",usrquota_block_hardlimit=%lld",
                           sbinfo->qlimits.usrquota_bhardlimit);
        if (sbinfo->qlimits.grpquota_bhardlimit)
                seq_printf(seq, ",grpquota_block_hardlimit=%lld",
                           sbinfo->qlimits.grpquota_bhardlimit);
        if (sbinfo->qlimits.usrquota_ihardlimit)
                seq_printf(seq, ",usrquota_inode_hardlimit=%lld",
                           sbinfo->qlimits.usrquota_ihardlimit);
        if (sbinfo->qlimits.grpquota_ihardlimit)
                seq_printf(seq, ",grpquota_inode_hardlimit=%lld",
                           sbinfo->qlimits.grpquota_ihardlimit);
#endif
        return 0;
}

#endif /* CONFIG_TMPFS */

/*
 * ->put_super: release everything hanging off the shmem superblock info
 * and clear sb->s_fs_info.  Also used by shmem_fill_super() as its
 * failure path.
 */
static void shmem_put_super(struct super_block *sb)
{
        struct shmem_sb_info *sbi = SHMEM_SB(sb);

#ifdef CONFIG_TMPFS_QUOTA
        shmem_disable_quotas(sb);
#endif
        /* The three resources below are independent of one another. */
        mpol_put(sbi->mpol);
        percpu_counter_destroy(&sbi->used_blocks);
        free_percpu(sbi->ino_batch);

        kfree(sbi);
        sb->s_fs_info = NULL;
}

/*
 * Fill in a new shmem superblock from the options in fc->fs_private.
 *
 * Allocates the shmem_sb_info, copies the mount options into it, sets
 * up the generic superblock fields and creates the root directory.
 *
 * Returns 0 on success.  On failure everything is torn down through
 * shmem_put_super() and @error is returned: -ENOMEM unless
 * shmem_get_inode() supplied a different error.
 */
static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
{
        struct shmem_options *ctx = fc->fs_private;
        struct inode *inode;
        struct shmem_sb_info *sbinfo;
        int error = -ENOMEM;

        /* Round up to L1_CACHE_BYTES to resist false sharing */
        sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
                                L1_CACHE_BYTES), GFP_KERNEL);
        if (!sbinfo)
                return error;

        sb->s_fs_info = sbinfo;

#ifdef CONFIG_TMPFS
        /*
         * Per default we only allow half of the physical ram per
         * tmpfs instance, limiting inodes to one per page of lowmem;
         * but the internal instance is left unlimited.
         */
        if (!(sb->s_flags & SB_KERNMOUNT)) {
                /* Fill in defaults for whatever the user did not specify. */
                if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
                        ctx->blocks = shmem_default_max_blocks();
                if (!(ctx->seen & SHMEM_SEEN_INODES))
                        ctx->inodes = shmem_default_max_inodes();
                if (!(ctx->seen & SHMEM_SEEN_INUMS))
                        ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
                sbinfo->noswap = ctx->noswap;
        } else {
                sb->s_flags |= SB_NOUSER;
        }
        sb->s_export_op = &shmem_export_ops;
        sb->s_flags |= SB_NOSEC | SB_I_VERSION;
#else
        sb->s_flags |= SB_NOUSER;
#endif
        sbinfo->max_blocks = ctx->blocks;
        sbinfo->max_inodes = ctx->inodes;
        sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
        /* Only kernel-internal mounts get the per-cpu ino_batch. */
        if (sb->s_flags & SB_KERNMOUNT) {
                sbinfo->ino_batch = alloc_percpu(ino_t);
                if (!sbinfo->ino_batch)
                        goto failed;
        }
        sbinfo->uid = ctx->uid;
        sbinfo->gid = ctx->gid;
        sbinfo->full_inums = ctx->full_inums;
        sbinfo->mode = ctx->mode;
        sbinfo->huge = ctx->huge;
        /* Hand the mempolicy over to sbinfo; ctx no longer refers to it. */
        sbinfo->mpol = ctx->mpol;
        ctx->mpol = NULL;

        raw_spin_lock_init(&sbinfo->stat_lock);
        if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
                goto failed;
        spin_lock_init(&sbinfo->shrinklist_lock);
        INIT_LIST_HEAD(&sbinfo->shrinklist);

        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize = PAGE_SIZE;
        sb->s_blocksize_bits = PAGE_SHIFT;
        sb->s_magic = TMPFS_MAGIC;
        sb->s_op = &shmem_ops;
        sb->s_time_gran = 1;
#ifdef CONFIG_TMPFS_XATTR
        sb->s_xattr = shmem_xattr_handlers;
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
        sb->s_flags |= SB_POSIXACL;
#endif
        /* Give every instance a freshly generated UUID. */
        uuid_t uuid;
        uuid_gen(&uuid);
        super_set_uuid(sb, uuid.b, sizeof(uuid));

#ifdef CONFIG_TMPFS_QUOTA
        if (ctx->seen & SHMEM_SEEN_QUOTA) {
                sb->dq_op = &shmem_quota_operations;
                sb->s_qcop = &dquot_quotactl_sysfile_ops;
                sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;

                /* Copy the default limits from ctx into sbinfo */
                memcpy(&sbinfo->qlimits, &ctx->qlimits,
                       sizeof(struct shmem_quota_limits));

                /*
                 * NOTE(review): the return value is only tested, so a
                 * failure here is reported as -ENOMEM - confirm intended.
                 */
                if (shmem_enable_quotas(sb, ctx->quota_types))
                        goto failed;
        }
#endif /* CONFIG_TMPFS_QUOTA */

        /* Create the root directory and hand it to the dcache. */
        inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
                                S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
        if (IS_ERR(inode)) {
                error = PTR_ERR(inode);
                goto failed;
        }
        inode->i_uid = sbinfo->uid;
        inode->i_gid = sbinfo->gid;
        sb->s_root = d_make_root(inode);
        if (!sb->s_root)
                goto failed;
        return 0;

failed:
        /* Single teardown path regardless of how far setup got. */
        shmem_put_super(sb);
        return error;
}

/*
 * ->get_tree: obtain the superblock via get_tree_nodev(), with
 * shmem_fill_super() as the fill callback.  Returns its result.
 */
static int shmem_get_tree(struct fs_context *fc)
{
        return get_tree_nodev(fc, shmem_fill_super);
}

/*
 * ->free: dispose of the shmem_options attached to @fc, dropping the
 * mempolicy reference it may still hold.  A context with no private
 * data is left alone.
 */
static void shmem_free_fc(struct fs_context *fc)
{
        struct shmem_options *opts = fc->fs_private;

        if (!opts)
                return;

        mpol_put(opts->mpol);
        kfree(opts);
}

/*
 * fs_context operations installed by shmem_init_fs_context().  Option
 * parsing and reconfiguration are only available with CONFIG_TMPFS.
 */
static const struct fs_context_operations shmem_fs_context_ops = {
        .free                        = shmem_free_fc,
        .get_tree                = shmem_get_tree,
#ifdef CONFIG_TMPFS
        .parse_monolithic        = shmem_parse_options,
        .parse_param                = shmem_parse_one,
        .reconfigure                = shmem_reconfigure,
#endif
};

/* Slab cache for struct shmem_inode_info; created by shmem_init_inodecache(). */
static struct kmem_cache *shmem_inode_cachep __ro_after_init;

/*
 * ->alloc_inode: allocate a shmem_inode_info from shmem_inode_cachep and
 * return its embedded VFS inode, or NULL if the allocation failed.
 */
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
        struct shmem_inode_info *info =
                alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);

        return info ? &info->vfs_inode : NULL;
}

static void shmem_free_in_core_inode(struct inode *inode)
{
        if (S_ISLNK(inode->i_mode))
                kfree(inode->i_link);
        kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}

/*
 * ->destroy_inode: per-type teardown before the inode is freed.
 * Directories release their offset context; regular files release
 * their shared mempolicy.  Other inode types need nothing here.
 */
static void shmem_destroy_inode(struct inode *inode)
{
        if (S_ISDIR(inode->i_mode))
                simple_offset_destroy(shmem_get_offset_ctx(inode));
        else if (S_ISREG(inode->i_mode))
                mpol_free_shared_policy(&SHMEM_I(inode)->policy);
}

/*
 * Slab constructor for shmem_inode_cachep: @foo is a struct
 * shmem_inode_info whose embedded VFS inode gets its one-time
 * initialisation.
 */
static void shmem_init_inode(void *foo)
{
        inode_init_once(&((struct shmem_inode_info *)foo)->vfs_inode);
}

/*
 * Create the "shmem_inode_cache" slab for struct shmem_inode_info, with
 * shmem_init_inode() as constructor.  The result is not checked here;
 * the cache is created with SLAB_PANIC.
 */
static void __init shmem_init_inodecache(void)
{
        shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
                                sizeof(struct shmem_inode_info),
                                0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
}

/* Destroy the inode slab cache; used on shmem_init()'s failure path. */
static void __init shmem_destroy_inodecache(void)
{
        kmem_cache_destroy(shmem_inode_cachep);
}

/*
 * ->error_remove_folio: keep the page in page cache instead of
 * truncating it.  Does nothing with @mapping or @folio and always
 * returns 0.
 */
static int shmem_error_remove_folio(struct address_space *mapping,
                                   struct folio *folio)
{
        return 0;
}

/*
 * Address-space operations for shmem mappings.  write_begin/write_end
 * are only provided with CONFIG_TMPFS, and migrate_folio only with
 * CONFIG_MIGRATION.
 */
static const struct address_space_operations shmem_aops = {
        .writepage        = shmem_writepage,
        .dirty_folio        = noop_dirty_folio,
#ifdef CONFIG_TMPFS
        .write_begin        = shmem_write_begin,
        .write_end        = shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
        .migrate_folio        = migrate_folio,
#endif
        .error_remove_folio = shmem_error_remove_folio,
};

/*
 * File operations for shmem regular files.  mmap, open and
 * get_unmapped_area are always present; the read/write/seek/splice/
 * fallocate entry points are only provided with CONFIG_TMPFS.
 */
static const struct file_operations shmem_file_operations = {
        .mmap                = shmem_mmap,
        .open                = shmem_file_open,
        .get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
        .llseek                = shmem_file_llseek,
        .read_iter        = shmem_file_read_iter,
        .write_iter        = shmem_file_write_iter,
        .fsync                = noop_fsync,
        .splice_read        = shmem_file_splice_read,
        .splice_write        = iter_file_splice_write,
        .fallocate        = shmem_fallocate,
#endif
};

/*
 * Inode operations for shmem regular files.  Everything except
 * getattr/setattr is compiled in only with CONFIG_TMPFS_XATTR.
 */
static const struct inode_operations shmem_inode_operations = {
        .getattr        = shmem_getattr,
        .setattr        = shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
        .set_acl        = simple_set_acl,
        .fileattr_get        = shmem_fileattr_get,
        .fileattr_set        = shmem_fileattr_set,
#endif
};

/*
 * Inode operations for shmem directories.  The table is assembled from
 * three independently configurable groups: namespace operations
 * (CONFIG_TMPFS), xattr/fileattr hooks (CONFIG_TMPFS_XATTR), and
 * setattr/set_acl (CONFIG_TMPFS_POSIX_ACL).  With none of them enabled
 * the table is empty.
 */
static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
        .getattr        = shmem_getattr,
        .create                = shmem_create,
        .lookup                = simple_lookup,
        .link                = shmem_link,
        .unlink                = shmem_unlink,
        .symlink        = shmem_symlink,
        .mkdir                = shmem_mkdir,
        .rmdir                = shmem_rmdir,
        .mknod                = shmem_mknod,
        .rename                = shmem_rename2,
        .tmpfile        = shmem_tmpfile,
        .get_offset_ctx        = shmem_get_offset_ctx,
#endif
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
        .fileattr_get        = shmem_fileattr_get,
        .fileattr_set        = shmem_fileattr_set,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
        .setattr        = shmem_setattr,
        .set_acl        = simple_set_acl,
#endif
};

/*
 * Inode operations table named for "special" inodes.  getattr is always
 * present; listxattr needs CONFIG_TMPFS_XATTR, and setattr/set_acl need
 * CONFIG_TMPFS_POSIX_ACL.
 */
static const struct inode_operations shmem_special_inode_operations = {
        .getattr        = shmem_getattr,
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
        .setattr        = shmem_setattr,
        .set_acl        = simple_set_acl,
#endif
};

/*
 * Superblock operations; installed as sb->s_op by shmem_fill_super().
 * statfs/show_options need CONFIG_TMPFS, get_dquots needs
 * CONFIG_TMPFS_QUOTA, and the cached-object hooks need
 * CONFIG_TRANSPARENT_HUGEPAGE.
 */
static const struct super_operations shmem_ops = {
        .alloc_inode        = shmem_alloc_inode,
        .free_inode        = shmem_free_in_core_inode,
        .destroy_inode        = shmem_destroy_inode,
#ifdef CONFIG_TMPFS
        .statfs                = shmem_statfs,
        .show_options        = shmem_show_options,
#endif
#ifdef CONFIG_TMPFS_QUOTA
        .get_dquots        = shmem_get_dquots,
#endif
        .evict_inode        = shmem_evict_inode,
        .drop_inode        = generic_delete_inode,
        .put_super        = shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        .nr_cached_objects        = shmem_unused_huge_count,
        .free_cached_objects        = shmem_unused_huge_scan,
#endif
};

/*
 * VM operations for shmem mappings.  The mempolicy hooks are only
 * present with CONFIG_NUMA.  The initialiser is field-for-field the
 * same as shmem_anon_vm_ops.
 */
static const struct vm_operations_struct shmem_vm_ops = {
        .fault                = shmem_fault,
        .map_pages        = filemap_map_pages,
#ifdef CONFIG_NUMA
        .set_policy     = shmem_set_policy,
        .get_policy     = shmem_get_policy,
#endif
};

/*
 * VM operations table named for anonymous shmem mappings.  The
 * mempolicy hooks are only present with CONFIG_NUMA.  The initialiser
 * is field-for-field the same as shmem_vm_ops; it is a distinct object,
 * so the two have different addresses.
 */
static const struct vm_operations_struct shmem_anon_vm_ops = {
        .fault                = shmem_fault,
        .map_pages        = filemap_map_pages,
#ifdef CONFIG_NUMA
        .set_policy     = shmem_set_policy,
        .get_policy     = shmem_get_policy,
#endif
};

/*
 * ->init_fs_context: allocate a zeroed shmem_options for @fc and seed
 * it with defaults: mode 0777 plus the sticky bit, owned by the
 * caller's fsuid/fsgid.  Also installs shmem_fs_context_ops.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
int shmem_init_fs_context(struct fs_context *fc)
{
        struct shmem_options *opts = kzalloc(sizeof(*opts), GFP_KERNEL);

        if (!opts)
                return -ENOMEM;

        opts->uid = current_fsuid();
        opts->gid = current_fsgid();
        opts->mode = 0777 | S_ISVTX;

        fc->ops = &shmem_fs_context_ops;
        fc->fs_private = opts;
        return 0;
}

/*
 * The "tmpfs" filesystem type (full shmem implementation).  The
 * parameter table is only advertised with CONFIG_TMPFS.  Registered and
 * kern_mounted by shmem_init().
 */
static struct file_system_type shmem_fs_type = {
        .owner                = THIS_MODULE,
        .name                = "tmpfs",
        .init_fs_context = shmem_init_fs_context,
#ifdef CONFIG_TMPFS
        .parameters        = shmem_fs_parameters,
#endif
        .kill_sb        = kill_litter_super,
        .fs_flags        = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
};

/*
 * Boot-time initialisation of shmem/tmpfs: create the inode cache,
 * register the quota format (CONFIG_TMPFS_QUOTA) and the filesystem
 * type, then create the internal kernel mount stored in shm_mnt.
 *
 * Nothing is returned.  On failure, the steps already completed are
 * undone in reverse order and shm_mnt is left holding ERR_PTR(error).
 */
void __init shmem_init(void)
{
        int error;

        shmem_init_inodecache();

#ifdef CONFIG_TMPFS_QUOTA
        error = register_quota_format(&shmem_quota_format);
        if (error < 0) {
                pr_err("Could not register quota format\n");
                goto out3;
        }
#endif

        error = register_filesystem(&shmem_fs_type);
        if (error) {
                pr_err("Could not register tmpfs\n");
                goto out2;
        }

        shm_mnt = kern_mount(&shmem_fs_type);
        if (IS_ERR(shm_mnt)) {
                error = PTR_ERR(shm_mnt);
                pr_err("Could not kern_mount tmpfs\n");
                goto out1;
        }

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /*
         * Apply shmem_huge to the internal mount only if THP is usable
         * and the setting is above SHMEM_HUGE_DENY; otherwise reset it.
         */
        if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
                SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
        else
                shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
#endif
        return;

        /* Unwind ladder: each label undoes the step before the failing one. */
out1:
        unregister_filesystem(&shmem_fs_type);
out2:
#ifdef CONFIG_TMPFS_QUOTA
        unregister_quota_format(&shmem_quota_format);
out3:
#endif
        shmem_destroy_inodecache();
        shm_mnt = ERR_PTR(error);
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
/*
 * sysfs show handler for shmem_enabled: print every known huge-page
 * policy name on one line, separated by spaces, with the one equal to
 * shmem_huge wrapped in square brackets.  Returns the number of bytes
 * emitted into @buf.
 */
static ssize_t shmem_enabled_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
{
        static const int values[] = {
                SHMEM_HUGE_ALWAYS,
                SHMEM_HUGE_WITHIN_SIZE,
                SHMEM_HUGE_ADVISE,
                SHMEM_HUGE_NEVER,
                SHMEM_HUGE_DENY,
                SHMEM_HUGE_FORCE,
        };
        int len = 0;
        int i;

        for (i = 0; i < ARRAY_SIZE(values); i++) {
                /* No separator in front of the first entry. */
                const char *sep = i ? " " : "";
                const char *name = shmem_format_huge(values[i]);

                if (shmem_huge == values[i])
                        len += sysfs_emit_at(buf, len, "%s[%s]", sep, name);
                else
                        len += sysfs_emit_at(buf, len, "%s%s", sep, name);
        }
        len += sysfs_emit_at(buf, len, "\n");

        return len;
}

/*
 * sysfs store handler: parse a huge page policy name from @buf and install
 * it as the global shmem_huge setting.
 *
 * Returns @count on success, or -EINVAL when the input does not fit the
 * local buffer, is not recognised by shmem_parse_huge(), or selects a policy
 * other than SHMEM_HUGE_NEVER / SHMEM_HUGE_DENY while
 * has_transparent_hugepage() reports false.
 */
static ssize_t shmem_enabled_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
{
        char word[16];
        int policy;

        /* The copy below needs one extra byte for the terminating NUL. */
        if (count + 1 > sizeof(word))
                return -EINVAL;

        memcpy(word, buf, count);
        word[count] = '\0';

        /* Strip a single trailing newline, if present. */
        if (count > 0 && word[count - 1] == '\n')
                word[count - 1] = '\0';

        policy = shmem_parse_huge(word);
        if (policy == -EINVAL)
                return -EINVAL;

        if (!has_transparent_hugepage() &&
            !(policy == SHMEM_HUGE_NEVER || policy == SHMEM_HUGE_DENY))
                return -EINVAL;

        shmem_huge = policy;

        /* Only policies above SHMEM_HUGE_DENY are copied to the mount. */
        if (shmem_huge > SHMEM_HUGE_DENY)
                SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;

        return count;
}

/*
 * Read/write kobject attribute named after the shmem_enabled prefix;
 * presumably __ATTR_RW() binds it to shmem_enabled_show() and
 * shmem_enabled_store() above - confirm against the macro definition.
 */
struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */

#else /* !CONFIG_SHMEM */

/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small systems where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * their complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */

/*
 * Filesystem type for the tiny (!CONFIG_SHMEM) build: it is still called
 * "tmpfs", but every operation is delegated to the ramfs implementation.
 */
static struct file_system_type shmem_fs_type = {
        .name            = "tmpfs",
        .init_fs_context = ramfs_init_fs_context,
        .parameters      = ramfs_fs_parameters,
        .kill_sb         = ramfs_kill_sb,
        .fs_flags        = FS_USERNS_MOUNT,
};

/*
 * Boot-time initialisation of the tiny (!CONFIG_SHMEM) implementation:
 * register the filesystem type and create the internal mount shm_mnt.
 * Either step failing is fatal (BUG_ON).
 */
void __init shmem_init(void)
{
        int err;

        err = register_filesystem(&shmem_fs_type);
        BUG_ON(err != 0);

        shm_mnt = kern_mount(&shmem_fs_type);
        BUG_ON(IS_ERR(shm_mnt));
}

/*
 * Tiny (!CONFIG_SHMEM) stub: does nothing and always reports success (0),
 * whatever @type is.
 */
int shmem_unuse(unsigned int type)
{
        (void)type;

        return 0;
}

/*
 * Tiny (!CONFIG_SHMEM) stub: ignores all of its arguments and always
 * returns 0.
 */
int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
        (void)file;
        (void)lock;
        (void)ucounts;

        return 0;
}

/* Tiny (!CONFIG_SHMEM) stub: intentionally a no-op. */
void shmem_unlock_mapping(struct address_space *mapping)
{
        (void)mapping;
}

#ifdef CONFIG_MMU
/*
 * Tiny (!CONFIG_SHMEM) variant: hand the request straight to
 * mm_get_unmapped_area() for the current task's mm and return its result.
 */
unsigned long shmem_get_unmapped_area(struct file *file,
                                      unsigned long addr, unsigned long len,
                                      unsigned long pgoff, unsigned long flags)
{
        struct mm_struct *mm = current->mm;

        return mm_get_unmapped_area(mm, file, addr, len, pgoff, flags);
}
#endif

/*
 * Tiny (!CONFIG_SHMEM) variant: pass @lstart and @lend unchanged to
 * truncate_inode_pages_range() on @inode's mapping.
 */
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
        struct address_space *mapping = inode->i_mapping;

        truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

/*
 * Tiny (!CONFIG_SHMEM) aliases used by the common code below: the vm and
 * file operations map onto the generic/ramfs ones, and size accounting is
 * compiled out (charging always "succeeds" with 0, uncharging is a no-op).
 */
#define shmem_vm_ops                                generic_file_vm_ops
#define shmem_anon_vm_ops                        generic_file_vm_ops
#define shmem_file_operations                        ramfs_file_operations
#define shmem_acct_size(flags, size)                0
#define shmem_unacct_size(flags, size)                do {} while (0)

/*
 * Tiny (!CONFIG_SHMEM) variant: allocate the inode through
 * ramfs_get_inode(); @idmap and @flags are not used here.
 *
 * Returns the new inode, or ERR_PTR(-ENOSPC) when ramfs_get_inode()
 * returned NULL.
 */
static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
                                struct super_block *sb, struct inode *dir,
                                umode_t mode, dev_t dev, unsigned long flags)
{
        struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);

        if (!inode)
                return ERR_PTR(-ENOSPC);

        return inode;
}

#endif /* CONFIG_SHMEM */

/* common code */

/*
 * Common helper behind the shmem_*file_setup*() entry points: create an
 * unlinked regular file of @size bytes on the tmpfs mount @mnt.
 *
 * @mnt:     mount to create the file on; an ERR_PTR() is passed straight back
 * @name:    name handed to alloc_file_pseudo() for the dentry
 * @size:    initial i_size; must be within 0..MAX_LFS_FILESIZE
 * @flags:   forwarded to shmem_acct_size() and shmem_get_inode()
 * @i_flags: extra inode flags OR-ed into inode->i_flags (e.g. S_PRIVATE)
 *
 * Returns the new file, or an ERR_PTR(): -EINVAL for a bad @size or an
 * idmapped @mnt, -ENOMEM when size accounting fails, or whatever error
 * shmem_get_inode(), ramfs_nommu_expand_for_mapping() or alloc_file_pseudo()
 * reported.
 */
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
                        loff_t size, unsigned long flags, unsigned int i_flags)
{
        struct inode *inode;
        struct file *res;

        if (IS_ERR(mnt))
                return ERR_CAST(mnt);

        if (size < 0 || size > MAX_LFS_FILESIZE)
                return ERR_PTR(-EINVAL);

        /*
         * Reject idmapped mounts before charging anything: checking after
         * shmem_acct_size() would return without the matching
         * shmem_unacct_size() and leak the accounted size.
         */
        if (is_idmapped_mnt(mnt))
                return ERR_PTR(-EINVAL);

        if (shmem_acct_size(flags, size))
                return ERR_PTR(-ENOMEM);

        inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
                                S_IFREG | S_IRWXUGO, 0, flags);
        if (IS_ERR(inode)) {
                /* No inode was created, so undo the charge by hand. */
                shmem_unacct_size(flags, size);
                return ERR_CAST(inode);
        }
        inode->i_flags |= i_flags;
        inode->i_size = size;
        clear_nlink(inode);        /* It is unlinked */

        /* A zero return becomes ERR_PTR(0), i.e. NULL, which is not IS_ERR(). */
        res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
        if (!IS_ERR(res))
                res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
                                &shmem_file_operations);
        if (IS_ERR(res))
                iput(inode);
        return res;
}

/**
 * shmem_kernel_file_setup - get an unlinked, kernel-internal file in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created with S_PRIVATE, so there will be NO LSM permission
 * checks against the underlying inode.  Users of this interface must do LSM
 * checks at a higher layer.  The users are the big_key and shm
 * implementations; LSM checks are provided at the key or shm level rather
 * than the inode.
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size,
                                     unsigned long flags)
{
        const unsigned int i_flags = S_PRIVATE;

        return __shmem_file_setup(shm_mnt, name, size, flags, i_flags);
}
EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);

/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created on the internal shm_mnt mount with no extra inode
 * flags.
 */
struct file *shmem_file_setup(const char *name, loff_t size,
                              unsigned long flags)
{
        const unsigned int i_flags = 0;

        return __shmem_file_setup(shm_mnt, name, size, flags, i_flags);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);

/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * Same as shmem_file_setup(), except that the caller chooses the mount.
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
                                       loff_t size, unsigned long flags)
{
        const unsigned int i_flags = 0;

        return __shmem_file_setup(mnt, name, size, flags, i_flags);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);

/**
 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped is prepared by do_mmap
 *
 * Backs @vma with a new tmpfs file sized to the vma, replacing (and
 * dropping the reference on) any file the vma already had.
 *
 * Return: 0 on success, or the error from shmem_kernel_file_setup().
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
        loff_t size = vma->vm_end - vma->vm_start;
        struct file *zero_file;

        /*
         * Creating a new file while holding mmap_lock causes a lock ordering
         * conflict between XFS directory reading and selinux.  This file is
         * reachable by the user only through its mapping, so create it with
         * S_PRIVATE (via shmem_kernel_file_setup()) to bypass file security.
         */
        zero_file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
        if (IS_ERR(zero_file))
                return PTR_ERR(zero_file);

        if (vma->vm_file)
                fput(vma->vm_file);

        vma->vm_file = zero_file;
        vma->vm_ops = &shmem_anon_vm_ops;

        return 0;
}

/**
 * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
 * @mapping:        the folio's address_space
 * @index:        the folio index
 * @gfp:        the page allocator flags to use if allocating
 *
 * This is the tmpfs counterpart of "read_cache_page_gfp(mapping, index, gfp)":
 * any new page allocation is done with the given flags.
 * read_cache_page_gfp() itself goes through the ->read_folio() method, which
 * does not suit tmpfs: pages may be in swapcache and tmpfs has to find those
 * for itself.  drivers/gpu/drm i915 and ttm rely upon this support.
 *
 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
 *
 * Return: with CONFIG_SHMEM, the folio after folio_unlock(), or an ERR_PTR()
 * carrying the shmem_get_folio_gfp() error; without CONFIG_SHMEM, whatever
 * mapping_read_folio_gfp() returns.
 */
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
                pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
        struct folio *folio;
        int err;

        err = shmem_get_folio_gfp(mapping->host, index, &folio, SGP_CACHE,
                                  gfp, NULL, NULL);
        if (err)
                return ERR_PTR(err);

        folio_unlock(folio);
        return folio;
#else
        /* The tiny !SHMEM case uses ramfs without swap. */
        return mapping_read_folio_gfp(mapping, index, gfp);
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);

/**
 * shmem_read_mapping_page_gfp - page-based wrapper for shmem_read_folio_gfp()
 * @mapping:        the address_space to read from
 * @index:        the page index
 * @gfp:        the page allocator flags to use if allocating
 *
 * Return: the page at @index within the folio obtained from
 * shmem_read_folio_gfp(), the same error as an ERR_PTR() when that lookup
 * fails, or ERR_PTR(-EIO) when the page is marked hardware-poisoned (the
 * folio reference is dropped in that case).
 */
struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
                                         pgoff_t index, gfp_t gfp)
{
        struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
        struct page *page;

        /*
         * Forward the error explicitly instead of taking the address of a
         * member of an error pointer, which only works while 'page' is at
         * offset 0 of struct folio.
         */
        if (IS_ERR(folio))
                return ERR_CAST(folio);

        page = folio_file_page(folio, index);
        if (PageHWPoison(page)) {
                folio_put(folio);
                return ERR_PTR(-EIO);
        }

        return page;
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);



















    2 


















    5 




    5 













    5 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/xattr_security.c
 * Handler for storing security labels as extended attributes.
 */

#include <linux/string.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/slab.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"

/*
 * ->get handler for the security xattr namespace: look up @name under
 * EXT4_XATTR_INDEX_SECURITY on @inode and return ext4_xattr_get()'s result.
 * @handler and the dentry argument are not used.
 */
static int
ext4_xattr_security_get(const struct xattr_handler *handler,
                        struct dentry *unused, struct inode *inode,
                        const char *name, void *buffer, size_t size)
{
        const int name_index = EXT4_XATTR_INDEX_SECURITY;

        return ext4_xattr_get(inode, name_index, name, buffer, size);
}

/*
 * ->set handler for the security xattr namespace: store @value under @name
 * with EXT4_XATTR_INDEX_SECURITY on @inode and return ext4_xattr_set()'s
 * result.  @handler, @idmap and the dentry argument are not used.
 */
static int
ext4_xattr_security_set(const struct xattr_handler *handler,
                        struct mnt_idmap *idmap,
                        struct dentry *unused, struct inode *inode,
                        const char *name, const void *value,
                        size_t size, int flags)
{
        const int name_index = EXT4_XATTR_INDEX_SECURITY;

        return ext4_xattr_set(inode, name_index, name, value, size, flags);
}

/*
 * Callback passed to security_inode_init_security(): create each xattr in
 * @xattr_array (terminated by an entry whose name is NULL) on @inode in the
 * security namespace, using XATTR_CREATE.
 *
 * @fs_info is the journal handle supplied by ext4_init_security().
 *
 * Returns the result of the last ext4_xattr_set_handle() call, stopping at
 * the first negative one; 0 when the array is empty.
 */
static int
ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
                void *fs_info)
{
        const struct xattr *entry = xattr_array;
        handle_t *handle = fs_info;
        int err = 0;

        while (entry->name != NULL) {
                err = ext4_xattr_set_handle(handle, inode,
                                            EXT4_XATTR_INDEX_SECURITY,
                                            entry->name, entry->value,
                                            entry->value_len, XATTR_CREATE);
                if (err < 0)
                        return err;
                entry++;
        }

        return err;
}

/*
 * Ask the security layer to initialise @inode (being created in @dir under
 * the name @qstr); ext4_initxattrs() is the callback that stores the
 * resulting xattrs, and @handle is passed through to it.
 *
 * Returns the result of security_inode_init_security().
 */
int
ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
                   const struct qstr *qstr)
{
        void *fs_info = handle;

        return security_inode_init_security(inode, dir, qstr,
                                            ext4_initxattrs, fs_info);
}

/* xattr handler for names carrying the XATTR_SECURITY_PREFIX prefix. */
const struct xattr_handler ext4_xattr_security_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
        .get    = ext4_xattr_security_get,
        .set    = ext4_xattr_security_set,
};


































































































































































































































































    4 


















    2 

























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Operations on the network namespace
 */
#ifndef __NET_NET_NAMESPACE_H
#define __NET_NET_NAMESPACE_H

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/sysctl.h>
#include <linux/uidgid.h>

#include <net/flow.h>
#include <net/netns/core.h>
#include <net/netns/mib.h>
#include <net/netns/unix.h>
#include <net/netns/packet.h>
#include <net/netns/ipv4.h>
#include <net/netns/ipv6.h>
#include <net/netns/nexthop.h>
#include <net/netns/ieee802154_6lowpan.h>
#include <net/netns/sctp.h>
#include <net/netns/netfilter.h>
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#include <net/netns/conntrack.h>
#endif
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
#include <net/netns/flow_table.h>
#endif
#include <net/netns/nftables.h>
#include <net/netns/xfrm.h>
#include <net/netns/mpls.h>
#include <net/netns/can.h>
#include <net/netns/xdp.h>
#include <net/netns/smc.h>
#include <net/netns/bpf.h>
#include <net/netns/mctp.h>
#include <net/net_trackers.h>
#include <linux/ns_common.h>
#include <linux/idr.h>
#include <linux/skbuff.h>
#include <linux/notifier.h>
#include <linux/xarray.h>

struct user_namespace;
struct proc_dir_entry;
struct net_device;
struct sock;
struct ctl_table_header;
struct net_generic;
struct uevent_sock;
struct netns_ipvs;
struct bpf_prog;


/* Hash size as a bit count, and the resulting entry count (1 << 8 = 256). */
#define NETDEV_HASHBITS    8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)

/*
 * State of one network namespace.  Core fields come first, followed by the
 * per-subsystem netns_* blocks, most of which exist only when the matching
 * config option is enabled.  The layout is randomized (__randomize_layout).
 */
struct net {
        /* First cache line can be often dirtied.
         * Do not place here read-mostly fields.
         */
        refcount_t                passive;        /* To decide when the network
                                                 * namespace should be freed.
                                                 */
        spinlock_t                rules_mod_lock;

        unsigned int                dev_base_seq;        /* protected by rtnl_mutex */
        u32                        ifindex;

        spinlock_t                nsid_lock;
        atomic_t                fnhe_genid;

        struct list_head        list;                /* list of network namespaces */
        struct list_head        exit_list;        /* Linked in order to call pernet
                                                 * exit methods on a dead net
                                                 * (pernet_ops_rwsem read locked),
                                                 * or to unregister pernet ops
                                                 * (pernet_ops_rwsem write locked).
                                                 */
        struct llist_node        cleanup_list;        /* namespaces on death row */

#ifdef CONFIG_KEYS
        struct key_tag                *key_domain;        /* Key domain of operation tag */
#endif
        struct user_namespace   *user_ns;        /* Owning user namespace */
        struct ucounts                *ucounts;
        struct idr                netns_ids;

        struct ns_common        ns;
        struct ref_tracker_dir  refcnt_tracker;
        struct ref_tracker_dir  notrefcnt_tracker; /* tracker for objects not
                                                    * refcounted against netns
                                                    */
        struct list_head         dev_base_head;
        struct proc_dir_entry         *proc_net;
        struct proc_dir_entry         *proc_net_stat;

#ifdef CONFIG_SYSCTL
        struct ctl_table_set        sysctls;
#endif

        struct sock                 *rtnl;                        /* rtnetlink socket */
        struct sock                *genl_sock;

        struct uevent_sock        *uevent_sock;                /* uevent socket */

        struct hlist_head         *dev_name_head;
        struct hlist_head        *dev_index_head;
        struct xarray                dev_by_index;
        struct raw_notifier_head        netdev_chain;

        /* Note that @hash_mix can be read millions times per second,
         * it is critical that it is on a read_mostly cache line.
         */
        u32                        hash_mix;

        struct net_device       *loopback_dev;          /* The loopback */

        /* core fib_rules */
        struct list_head        rules_ops;

        struct netns_core        core;
        struct netns_mib        mib;
        struct netns_packet        packet;
#if IS_ENABLED(CONFIG_UNIX)
        struct netns_unix        unx;
#endif
        struct netns_nexthop        nexthop;
        struct netns_ipv4        ipv4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netns_ipv6        ipv6;
#endif
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
        struct netns_ieee802154_lowpan        ieee802154_lowpan;
#endif
#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
        struct netns_sctp        sctp;
#endif
#ifdef CONFIG_NETFILTER
        struct netns_nf                nf;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        struct netns_ct                ct;
#endif
#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
        struct netns_nftables        nft;
#endif
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
        struct netns_ft ft;
#endif
#endif
#ifdef CONFIG_WEXT_CORE
        struct sk_buff_head        wext_nlevents;
#endif
        struct net_generic __rcu        *gen;

        /* Used to store attached BPF programs */
        struct netns_bpf        bpf;

        /* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM
        struct netns_xfrm        xfrm;
#endif

        u64                        net_cookie; /* written once */

#if IS_ENABLED(CONFIG_IP_VS)
        struct netns_ipvs        *ipvs;
#endif
#if IS_ENABLED(CONFIG_MPLS)
        struct netns_mpls        mpls;
#endif
#if IS_ENABLED(CONFIG_CAN)
        struct netns_can        can;
#endif
#ifdef CONFIG_XDP_SOCKETS
        struct netns_xdp        xdp;
#endif
#if IS_ENABLED(CONFIG_MCTP)
        struct netns_mctp        mctp;
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        struct sock                *crypto_nlsk;
#endif
        struct sock                *diag_nlsk;
#if IS_ENABLED(CONFIG_SMC)
        struct netns_smc        smc;
#endif
} __randomize_layout;

#include <linux/seq_file_net.h>

/* Init's network namespace */
extern struct net init_net;

#ifdef CONFIG_NET_NS
struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns,
                        struct net *old_net);

void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid);

void net_ns_barrier(void);

struct ns_common *get_net_ns(struct ns_common *ns);
struct net *get_net_ns_by_fd(int fd);
#else /* CONFIG_NET_NS */
#include <linux/sched.h>
#include <linux/nsproxy.h>
/*
 * !CONFIG_NET_NS stub: asking for a new namespace (CLONE_NEWNET) fails with
 * -EINVAL; any other request simply gets @old_net back.
 */
static inline struct net *copy_net_ns(unsigned long flags,
        struct user_namespace *user_ns, struct net *old_net)
{
        if (!(flags & CLONE_NEWNET))
                return old_net;

        return ERR_PTR(-EINVAL);
}

/*
 * !CONFIG_NET_NS stub: always report global root as the owner, storing
 * GLOBAL_ROOT_UID in @uid and GLOBAL_ROOT_GID in @gid.  @net is not used.
 */
static inline void net_ns_get_ownership(const struct net *net,
                                        kuid_t *uid, kgid_t *gid)
{
        *gid = GLOBAL_ROOT_GID;
        *uid = GLOBAL_ROOT_UID;
}

/* !CONFIG_NET_NS stub: nothing to do. */
static inline void net_ns_barrier(void)
{
}

/* !CONFIG_NET_NS stub: always fails with ERR_PTR(-EINVAL); @ns is not used. */
static inline struct ns_common *get_net_ns(struct ns_common *ns)
{
        struct ns_common *err = ERR_PTR(-EINVAL);

        return err;
}

/* !CONFIG_NET_NS stub: always fails with ERR_PTR(-EINVAL); @fd is not used. */
static inline struct net *get_net_ns_by_fd(int fd)
{
        struct net *err = ERR_PTR(-EINVAL);

        return err;
}
#endif /* CONFIG_NET_NS */


extern struct list_head net_namespace_list;

struct net *get_net_ns_by_pid(pid_t pid);

/*
 * IPX sysctl hooks: real functions (defined elsewhere) with CONFIG_SYSCTL,
 * otherwise macros that expand to nothing.
 */
#ifdef CONFIG_SYSCTL
void ipx_register_sysctl(void);
void ipx_unregister_sysctl(void);
#else
#define ipx_register_sysctl()
#define ipx_unregister_sysctl()
#endif

#ifdef CONFIG_NET_NS
void __put_net(struct net *net);

/*
 * Take a reference on @net (increments net->ns.count) and return @net.
 * Try using get_net_track() instead.
 */
static inline struct net *get_net(struct net *net)
{
        refcount_inc(&net->ns.count);

        return net;
}

/*
 * Take a reference on @net unless its reference count is already zero.
 *
 * For callers that know the struct net exists but cannot guarantee that a
 * reference is still held.  Returns @net on success, NULL when the count
 * was zero.
 */
static inline struct net *maybe_get_net(struct net *net)
{
        return refcount_inc_not_zero(&net->ns.count) ? net : NULL;
}

/*
 * Drop a reference on @net; when the count reaches zero, call __put_net().
 * Try using put_net_track() instead.
 */
static inline void put_net(struct net *net)
{
        if (!refcount_dec_and_test(&net->ns.count))
                return;

        __put_net(net);
}

/* Return 1 when @net1 and @net2 are the same namespace pointer, else 0. */
static inline
int net_eq(const struct net *net1, const struct net *net2)
{
        if (net1 != net2)
                return 0;

        return 1;
}

/* Return 1 while @net's reference count (net->ns.count) is non-zero, else 0. */
static inline int check_net(const struct net *net)
{
        return !!refcount_read(&net->ns.count);
}

void net_drop_ns(void *);

#else

/* !CONFIG_NET_NS stub: no reference counting, @net is returned unchanged. */
static inline struct net *get_net(struct net *net)
{
        struct net *same = net;

        return same;
}

/* !CONFIG_NET_NS stub: no reference counting, so nothing to drop. */
static inline void put_net(struct net *net)
{
        (void)net;
}

/* !CONFIG_NET_NS stub: always "succeeds" and returns @net unchanged. */
static inline struct net *maybe_get_net(struct net *net)
{
        struct net *same = net;

        return same;
}

/* !CONFIG_NET_NS stub: always returns 1, whatever the arguments are. */
static inline
int net_eq(const struct net *net1, const struct net *net2)
{
        (void)net1;
        (void)net2;

        return 1;
}

/* !CONFIG_NET_NS stub: always returns 1; @net is not examined. */
static inline int check_net(const struct net *net)
{
        (void)net;

        return 1;
}

#define net_drop_ns NULL
#endif


/*
 * Record @tracker in one of @net's reference tracker directories:
 * refcnt_tracker when @refcounted is true, notrefcnt_tracker otherwise.
 * Compiles to nothing without CONFIG_NET_NS_REFCNT_TRACKER.
 */
static inline void __netns_tracker_alloc(struct net *net,
                                         netns_tracker *tracker,
                                         bool refcounted,
                                         gfp_t gfp)
{
#ifdef CONFIG_NET_NS_REFCNT_TRACKER
        struct ref_tracker_dir *dir;

        if (refcounted)
                dir = &net->refcnt_tracker;
        else
                dir = &net->notrefcnt_tracker;

        ref_tracker_alloc(dir, tracker, gfp);
#endif
}

/* Track @tracker as a refcounted user of @net. */
static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker,
                                       gfp_t gfp)
{
        const bool refcounted = true;

        __netns_tracker_alloc(net, tracker, refcounted, gfp);
}

/*
 * Release @tracker from one of @net's reference tracker directories:
 * refcnt_tracker when @refcounted is true, notrefcnt_tracker otherwise.
 * Compiles to nothing without CONFIG_NET_NS_REFCNT_TRACKER.
 */
static inline void __netns_tracker_free(struct net *net,
                                        netns_tracker *tracker,
                                        bool refcounted)
{
#ifdef CONFIG_NET_NS_REFCNT_TRACKER
        struct ref_tracker_dir *dir;

        if (refcounted)
                dir = &net->refcnt_tracker;
        else
                dir = &net->notrefcnt_tracker;

        ref_tracker_free(dir, tracker);
#endif
}

/*
 * get_net() plus reference tracking: take the reference first, then record
 * @tracker against @net.  Returns @net.
 */
static inline struct net *get_net_track(struct net *net,
                                        netns_tracker *tracker, gfp_t gfp)
{
        struct net *held = get_net(net);

        netns_tracker_alloc(net, tracker, gfp);

        return held;
}

/*
 * Counterpart of get_net_track(): release @tracker first, then drop the
 * reference on @net.
 */
static inline void put_net_track(struct net *net, netns_tracker *tracker)
{
        const bool refcounted = true;

        __netns_tracker_free(net, tracker, refcounted);
        put_net(net);
}

/*
 * Holder for a network namespace pointer.  With CONFIG_NET_NS it wraps an
 * RCU-annotated struct net pointer; without it the struct is empty.
 * Access it through write_pnet(), read_pnet() and read_pnet_rcu().
 */
typedef struct {
#ifdef CONFIG_NET_NS
        struct net __rcu *net;
#endif
} possible_net_t;

/*
 * Store @net in @pnet with rcu_assign_pointer().  Without CONFIG_NET_NS
 * there is nothing to store and this is a no-op.
 */
static inline void write_pnet(possible_net_t *pnet, struct net *net)
{
#ifdef CONFIG_NET_NS
        rcu_assign_pointer(pnet->net, net);
#else
        /* possible_net_t is empty in this configuration. */
#endif
}

/*
 * Return the namespace held in @pnet, read with
 * rcu_dereference_protected(..., true).  Without CONFIG_NET_NS the answer
 * is always &init_net.
 */
static inline struct net *read_pnet(const possible_net_t *pnet)
{
#ifndef CONFIG_NET_NS
        return &init_net;
#else
        return rcu_dereference_protected(pnet->net, true);
#endif
}

static inline struct net *read_pnet_rcu(possible_net_t *pnet)
{
#ifdef CONFIG_NET_NS
        return rcu_dereference(pnet->net);
#else
        return &init_net;
#endif
}

/*
 * Iterators over net_namespace_list (linked through struct net's 'list'
 * member): forward, reverse-continue and RCU variants.
 * Protected by net_rwsem.
 */
#define for_each_net(VAR)                                \
        list_for_each_entry(VAR, &net_namespace_list, list)
#define for_each_net_continue_reverse(VAR)                \
        list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list)
#define for_each_net_rcu(VAR)                                \
        list_for_each_entry_rcu(VAR, &net_namespace_list, list)

/*
 * Section annotations for per-namespace init/exit code and data.  With
 * CONFIG_NET_NS they expand to nothing; without it they map onto __init,
 * __ref, __initdata and __initconst.
 */
#ifdef CONFIG_NET_NS
#define __net_init
#define __net_exit
#define __net_initdata
#define __net_initconst
#else
#define __net_init        __init
#define __net_exit        __ref
#define __net_initdata        __initdata
#define __net_initconst        __initconst
#endif

int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp);
int peernet2id(const struct net *net, struct net *peer);
bool peernet_has_id(const struct net *net, struct net *peer);
struct net *get_net_ns_by_id(const struct net *net, int id);

/*
 * Set of per-network-namespace callbacks, registered with
 * register_pernet_subsys() or register_pernet_device().
 */
struct pernet_operations {
        struct list_head list;
        /*
         * The methods below are called without any exclusive locks.
         * More than one net may be constructed and destructed
         * in parallel on several cpus. Every pernet_operations
         * has to keep all other pernet_operations in mind and
         * introduce locking if they share common resources.
         *
         * The only time they are called with an exclusive lock is
         * from register_pernet_subsys(), unregister_pernet_subsys(),
         * register_pernet_device() and unregister_pernet_device().
         *
         * Exit methods using blocking RCU primitives, such as
         * synchronize_rcu(), should be implemented via exit_batch.
         * Then, destruction of a group of nets requires a single
         * synchronize_rcu() for these pernet_operations,
         * instead of a separate synchronize_rcu() for every net.
         * Please avoid synchronize_rcu() altogether where possible.
         *
         * Note that a combination of pre_exit() and exit() can
         * be used, since a synchronize_rcu() is guaranteed between
         * the calls.
         */
        int (*init)(struct net *net);
        void (*pre_exit)(struct net *net);
        void (*exit)(struct net *net);
        void (*exit_batch)(struct list_head *net_exit_list);
        /* Following method is called with RTNL held. */
        void (*exit_batch_rtnl)(struct list_head *net_exit_list,
                                struct list_head *dev_kill_list);
        unsigned int *id;
        size_t size;
};

/*
 * Use these carefully.  If you implement a network device and it
 * needs per network namespace operations use device pernet operations,
 * otherwise use pernet subsys operations.
 *
 * Network interfaces need to be removed from a dying netns _before_
 * subsys notifiers can be called, as most of the network code cleanup
 * (which is done from subsys notifiers) runs with the assumption that
 * dev_remove_pack has been called so no new packets will arrive during
 * and after the cleanup functions have been called.  dev_remove_pack
 * is not per namespace so instead the guarantee of no more packets
 * arriving in a network namespace is provided by ensuring that all
 * network devices and all sockets have left the network namespace
 * before the cleanup methods are called.
 *
 * For the longest time the ipv4 icmp code was registered as a pernet
 * device which caused kernel oops, and panics during network
 * namespace cleanup.   So please don't get this wrong.
 */
int register_pernet_subsys(struct pernet_operations *);
void unregister_pernet_subsys(struct pernet_operations *);
int register_pernet_device(struct pernet_operations *);
void unregister_pernet_device(struct pernet_operations *);

struct ctl_table;

/*
 * Convenience wrapper around register_net_sysctl_sz() that takes the table
 * size from ARRAY_SIZE(), so @table must be an actual array, not a pointer.
 */
#define register_net_sysctl(net, path, table)        \
        register_net_sysctl_sz(net, path, table, ARRAY_SIZE(table))
#ifdef CONFIG_SYSCTL
int net_sysctl_init(void);
struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path,
                                             struct ctl_table *table, size_t table_size);
void unregister_net_sysctl_table(struct ctl_table_header *header);
#else
/* !CONFIG_SYSCTL stub: nothing to set up, always returns 0. */
static inline int net_sysctl_init(void)
{
        return 0;
}
/* !CONFIG_SYSCTL stub: registers nothing and always returns NULL. */
static inline struct ctl_table_header *register_net_sysctl_sz(struct net *net,
        const char *path, struct ctl_table *table, size_t table_size)
{
        struct ctl_table_header *none = NULL;

        return none;
}
/* !CONFIG_SYSCTL stub: nothing was registered, so nothing to undo. */
static inline void unregister_net_sysctl_table(struct ctl_table_header *header)
{
        (void)header;
}
#endif

/* Return the current value of @net's IPv4 generation counter (ipv4.rt_genid). */
static inline int rt_genid_ipv4(const struct net *net)
{
        int genid = atomic_read(&net->ipv4.rt_genid);

        return genid;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Return the current value of @net's IPv6 counter (ipv6.fib6_sernum). */
static inline int rt_genid_ipv6(const struct net *net)
{
        int sernum = atomic_read(&net->ipv6.fib6_sernum);

        return sernum;
}
#endif

/* Atomically increment @net's IPv4 generation counter (ipv4.rt_genid). */
static inline void rt_genid_bump_ipv4(struct net *net)
{
        atomic_inc(&net->ipv4.rt_genid);
}

extern void (*__fib6_flush_trees)(struct net *net);
/*
 * IPv6 counterpart of rt_genid_bump_ipv4(): call the __fib6_flush_trees
 * hook for @net when one is installed; do nothing when the hook is NULL.
 */
static inline void rt_genid_bump_ipv6(struct net *net)
{
        if (!__fib6_flush_trees)
                return;

        __fib6_flush_trees(net);
}

#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
/* Return a pointer to the ieee802154_lowpan block embedded in @net. */
static inline struct netns_ieee802154_lowpan *
net_ieee802154_lowpan(struct net *net)
{
        struct netns_ieee802154_lowpan *lowpan = &net->ieee802154_lowpan;

        return lowpan;
}
#endif

/*
 * Bump both address families, IPv4 first and then IPv6, for callers who
 * don't really care about whether it's IPv4 or IPv6.
 */
static inline void rt_genid_bump_all(struct net *net)
{
        rt_genid_bump_ipv4(net);

        rt_genid_bump_ipv6(net);
}

/* Return the current value of @net's fnhe_genid counter. */
static inline int fnhe_genid(const struct net *net)
{
        int genid = atomic_read(&net->fnhe_genid);

        return genid;
}

/* Atomically increment @net's fnhe_genid counter. */
static inline void fnhe_genid_bump(struct net *net)
{
        atomic_inc(&net->fnhe_genid);
}

#ifdef CONFIG_NET
void net_ns_init(void);
#else
/* !CONFIG_NET stub: nothing to initialise. */
static inline void net_ns_init(void)
{
}
#endif

#endif /* __NET_NET_NAMESPACE_H */







































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Common interrupt code for 32 and 64 bit
 */
#include <linux/cpu.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/of.h>
#include <linux/seq_file.h>
#include <linux/smp.h>
#include <linux/ftrace.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/irq.h>

#include <asm/irq_stack.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/irq.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/thermal.h>
#include <asm/posted_intr.h>
#include <asm/irq_remapping.h>

#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>

/*
 * Per-CPU interrupt statistics. Accessed through irq_stats() below when
 * building the arch rows of /proc/interrupts and the /proc/stat sums.
 */
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);

/*
 * Global (not per-CPU) error counter: printed as the "ERR" row by
 * arch_show_interrupts() and returned by arch_irq_stat().
 */
atomic_t irq_err_count;

/*
 * 'what should we do if we get a hw irq event on an illegal vector'.
 * each architecture has to answer this themselves.
 */
void ack_bad_irq(unsigned int irq)
{
        /* Rate limited so that a storm of bad vectors cannot flood the log. */
        if (printk_ratelimit())
                pr_err("unexpected IRQ trap at vector %02x\n", irq);

        /*
         * Currently unexpected vectors happen only on SMP and APIC.
         * We _must_ ack these because every local APIC has only N
         * irq slots per priority level, and a 'hanging, unacked' IRQ
         * holds up an irq slot - in excessive cases (when multiple
         * unexpected vectors occur) that might lock up the APIC
         * completely.
         * But only ack when the APIC is enabled -AK
         *
         * NOTE(review): no "APIC enabled" check is visible here; the call
         * below is unconditional. Presumably apic_eoi() copes with a
         * disabled APIC itself - confirm.
         */
        apic_eoi();
}

#define irq_stats(x)                (&per_cpu(irq_stat, x))
/*
 * Emit the architecture specific rows of /proc/interrupts.
 *
 * @p:    seq_file to print into
 * @prec: field width used for the row label column
 *
 * Every per-CPU row has the same shape: a right aligned three letter tag,
 * one "%10u " column for each online CPU and a trailing description. The
 * ERR and MIS rows are single global counters. Rows are compiled in, or
 * shown at runtime, depending on the configuration checks around them.
 *
 * Always returns 0.
 */
int arch_show_interrupts(struct seq_file *p, int prec)
{
        int cpu;

        /* Unconditional: NMI count */
        seq_printf(p, "%*s: ", prec, "NMI");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->__nmi_count);
        seq_puts(p, "  Non-maskable interrupts\n");

#ifdef CONFIG_X86_LOCAL_APIC
        /* Local APIC sourced counters */
        seq_printf(p, "%*s: ", prec, "LOC");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->apic_timer_irqs);
        seq_puts(p, "  Local timer interrupts\n");

        seq_printf(p, "%*s: ", prec, "SPU");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->irq_spurious_count);
        seq_puts(p, "  Spurious interrupts\n");

        seq_printf(p, "%*s: ", prec, "PMI");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->apic_perf_irqs);
        seq_puts(p, "  Performance monitoring interrupts\n");

        seq_printf(p, "%*s: ", prec, "IWI");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->apic_irq_work_irqs);
        seq_puts(p, "  IRQ work interrupts\n");

        seq_printf(p, "%*s: ", prec, "RTR");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->icr_read_retry_count);
        seq_puts(p, "  APIC ICR read retries\n");

        /* Only shown once a platform IPI callback has been installed */
        if (x86_platform_ipi_callback) {
                seq_printf(p, "%*s: ", prec, "PLT");
                for_each_online_cpu(cpu)
                        seq_printf(p, "%10u ", irq_stats(cpu)->x86_platform_ipis);
                seq_puts(p, "  Platform interrupts\n");
        }
#endif

#ifdef CONFIG_SMP
        /* Cross-CPU interrupts */
        seq_printf(p, "%*s: ", prec, "RES");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->irq_resched_count);
        seq_puts(p, "  Rescheduling interrupts\n");

        seq_printf(p, "%*s: ", prec, "CAL");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->irq_call_count);
        seq_puts(p, "  Function call interrupts\n");

        seq_printf(p, "%*s: ", prec, "TLB");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->irq_tlb_count);
        seq_puts(p, "  TLB shootdowns\n");
#endif

#ifdef CONFIG_X86_THERMAL_VECTOR
        seq_printf(p, "%*s: ", prec, "TRM");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->irq_thermal_count);
        seq_puts(p, "  Thermal event interrupts\n");
#endif

#ifdef CONFIG_X86_MCE_THRESHOLD
        seq_printf(p, "%*s: ", prec, "THR");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->irq_threshold_count);
        seq_puts(p, "  Threshold APIC interrupts\n");
#endif

#ifdef CONFIG_X86_MCE_AMD
        seq_printf(p, "%*s: ", prec, "DFR");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->irq_deferred_error_count);
        seq_puts(p, "  Deferred Error APIC interrupts\n");
#endif

#ifdef CONFIG_X86_MCE
        /* Machine check counters live in their own per-CPU variables */
        seq_printf(p, "%*s: ", prec, "MCE");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", per_cpu(mce_exception_count, cpu));
        seq_puts(p, "  Machine check exceptions\n");

        seq_printf(p, "%*s: ", prec, "MCP");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", per_cpu(mce_poll_count, cpu));
        seq_puts(p, "  Machine check polls\n");
#endif

#ifdef CONFIG_X86_HV_CALLBACK_VECTOR
        /* Shown only when the vector is marked in system_vectors */
        if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) {
                seq_printf(p, "%*s: ", prec, "HYP");
                for_each_online_cpu(cpu)
                        seq_printf(p, "%10u ",
                                   irq_stats(cpu)->irq_hv_callback_count);
                seq_puts(p, "  Hypervisor callback interrupts\n");
        }
#endif

#if IS_ENABLED(CONFIG_HYPERV)
        if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) {
                seq_printf(p, "%*s: ", prec, "HRE");
                for_each_online_cpu(cpu)
                        seq_printf(p, "%10u ",
                                   irq_stats(cpu)->irq_hv_reenlightenment_count);
                seq_puts(p, "  Hyper-V reenlightenment interrupts\n");
        }

        if (test_bit(HYPERV_STIMER0_VECTOR, system_vectors)) {
                seq_printf(p, "%*s: ", prec, "HVS");
                for_each_online_cpu(cpu)
                        seq_printf(p, "%10u ",
                                   irq_stats(cpu)->hyperv_stimer0_count);
                seq_puts(p, "  Hyper-V stimer0 interrupts\n");
        }
#endif

        /* Global counters: one value per row, no per-CPU columns */
        seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
        seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif

#if IS_ENABLED(CONFIG_KVM)
        /* KVM posted interrupt vectors */
        seq_printf(p, "%*s: ", prec, "PIN");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", irq_stats(cpu)->kvm_posted_intr_ipis);
        seq_puts(p, "  Posted-interrupt notification event\n");

        seq_printf(p, "%*s: ", prec, "NPI");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ",
                           irq_stats(cpu)->kvm_posted_intr_nested_ipis);
        seq_puts(p, "  Nested posted-interrupt event\n");

        seq_printf(p, "%*s: ", prec, "PIW");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ",
                           irq_stats(cpu)->kvm_posted_intr_wakeup_ipis);
        seq_puts(p, "  Posted-interrupt wakeup event\n");
#endif

#ifdef CONFIG_X86_POSTED_MSI
        seq_printf(p, "%*s: ", prec, "PMN");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ",
                           irq_stats(cpu)->posted_msi_notification_count);
        seq_puts(p, "  Posted MSI notification event\n");
#endif

        return 0;
}

/*
 * /proc/stat helpers
 */
/*
 * Sum the arch specific interrupt counters of one CPU.
 *
 * @cpu: CPU whose per-CPU statistics are summed
 *
 * Returns the total of the counters selected by the configuration checks
 * below. The platform IPI count is only included while a platform IPI
 * callback is installed.
 */
u64 arch_irq_stat_cpu(unsigned int cpu)
{
        u64 total;

        total = irq_stats(cpu)->__nmi_count;

#ifdef CONFIG_X86_LOCAL_APIC
        total += irq_stats(cpu)->apic_timer_irqs;
        total += irq_stats(cpu)->irq_spurious_count;
        total += irq_stats(cpu)->apic_perf_irqs;
        total += irq_stats(cpu)->apic_irq_work_irqs;
        total += irq_stats(cpu)->icr_read_retry_count;
        if (x86_platform_ipi_callback)
                total += irq_stats(cpu)->x86_platform_ipis;
#endif

#ifdef CONFIG_SMP
        total += irq_stats(cpu)->irq_resched_count;
        total += irq_stats(cpu)->irq_call_count;
#endif

#ifdef CONFIG_X86_THERMAL_VECTOR
        total += irq_stats(cpu)->irq_thermal_count;
#endif

#ifdef CONFIG_X86_MCE_THRESHOLD
        total += irq_stats(cpu)->irq_threshold_count;
#endif

#ifdef CONFIG_X86_HV_CALLBACK_VECTOR
        total += irq_stats(cpu)->irq_hv_callback_count;
#endif

#if IS_ENABLED(CONFIG_HYPERV)
        total += irq_stats(cpu)->irq_hv_reenlightenment_count;
        total += irq_stats(cpu)->hyperv_stimer0_count;
#endif

#ifdef CONFIG_X86_MCE
        /* Machine check counters are separate per-CPU variables */
        total += per_cpu(mce_exception_count, cpu);
        total += per_cpu(mce_poll_count, cpu);
#endif

        return total;
}

/* Return the global (not per-CPU) part of the arch interrupt statistics. */
u64 arch_irq_stat(void)
{
        return atomic_read(&irq_err_count);
}

/*
 * Run the handler of @desc. 64-bit builds call generic_handle_irq_desc()
 * directly; all other builds go through __handle_irq(), which also gets
 * the interrupted register frame.
 */
static __always_inline void handle_irq(struct irq_desc *desc,
                                       struct pt_regs *regs)
{
        if (!IS_ENABLED(CONFIG_X86_64)) {
                __handle_irq(desc, regs);
                return;
        }

        generic_handle_irq_desc(desc);
}

/*
 * Dispatch @vector through this CPU's vector_irq[] table.
 *
 * Returns 0 when a descriptor was installed and its handler was invoked,
 * -EINVAL otherwise. In the failure case an unused vector is reported
 * (rate limited), while any other NULL/error-encoded marker is reset to
 * VECTOR_UNUSED without a message.
 */
static __always_inline int call_irq_handler(int vector, struct pt_regs *regs)
{
        struct irq_desc *desc = __this_cpu_read(vector_irq[vector]);

        /* Fast path: a real descriptor is installed for this vector */
        if (likely(!IS_ERR_OR_NULL(desc))) {
                handle_irq(desc, regs);
                return 0;
        }

        if (desc == VECTOR_UNUSED) {
                pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
                                     __func__, smp_processor_id(),
                                     vector);
        } else {
                __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
        }

        return -EINVAL;
}

/*
 * common_interrupt() handles all normal device IRQ's (the special SMP
 * cross-CPU interrupts have their own entry points).
 *
 * The vector is dispatched through this CPU's vector_irq[] table by
 * call_irq_handler(). When that reports failure the interrupt is
 * acknowledged here.
 */
DEFINE_IDTENTRY_IRQ(common_interrupt)
{
        /* Install the interrupted register frame; restored before returning. */
        struct pt_regs *old_regs = set_irq_regs(regs);

        /* entry code tells RCU that we're not quiescent.  Check it. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");

        /*
         * Non-zero return: no handler was invoked for this vector.
         * NOTE(review): presumably the EOI on the success path is issued
         * by the invoked flow handler - confirm.
         */
        if (unlikely(call_irq_handler(vector, regs)))
                apic_eoi();

        set_irq_regs(old_regs);
}

#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Function pointer for generic interrupt vector handling.
 *
 * While this is NULL the "PLT" row is omitted from /proc/interrupts and
 * the platform IPI count is left out of arch_irq_stat_cpu().
 */
void (*x86_platform_ipi_callback)(void) = NULL;
/*
 * Handler for X86_PLATFORM_IPI_VECTOR.
 *
 * Acknowledges the interrupt first, then counts it in the per-CPU
 * x86_platform_ipis statistic and invokes the registered callback, if
 * there is one. The count is bumped even when no callback is installed.
 */
DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi)
{
        /* Install the interrupted register frame; restored before returning. */
        struct pt_regs *old_regs = set_irq_regs(regs);

        apic_eoi();
        trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
        inc_irq_stat(x86_platform_ipis);
        if (x86_platform_ipi_callback)
                x86_platform_ipi_callback();
        trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
        set_irq_regs(old_regs);
}
#endif

#if IS_ENABLED(CONFIG_KVM)
/* No-op default, so the wakeup vector always has a valid function to call. */
static void dummy_handler(void) {}
static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;

/*
 * Install @handler as the posted-interrupt wakeup callback, or restore the
 * no-op default when @handler is NULL.
 *
 * On removal the function additionally waits in synchronize_rcu() before
 * returning.
 * NOTE(review): presumably this lets in-flight invocations of the previous
 * handler finish before the caller tears it down - confirm.
 */
void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
{
        if (!handler) {
                kvm_posted_intr_wakeup_handler = dummy_handler;
                synchronize_rcu();
                return;
        }

        kvm_posted_intr_wakeup_handler = handler;
}
EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);

/*
 * Handler for POSTED_INTERRUPT_VECTOR.
 *
 * Only acknowledges the interrupt and bumps the per-CPU
 * kvm_posted_intr_ipis counter (the "PIN" row of /proc/interrupts);
 * no callback is invoked.
 */
DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi)
{
        apic_eoi();
        inc_irq_stat(kvm_posted_intr_ipis);
}

/*
 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
 *
 * Acknowledges the interrupt, bumps the per-CPU
 * kvm_posted_intr_wakeup_ipis counter (the "PIW" row of /proc/interrupts)
 * and calls the handler installed via
 * kvm_set_posted_intr_wakeup_handler(). The pointer is never NULL: it
 * holds dummy_handler whenever no real handler is installed.
 */
DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi)
{
        apic_eoi();
        inc_irq_stat(kvm_posted_intr_wakeup_ipis);
        kvm_posted_intr_wakeup_handler();
}

/*
 * Handler for POSTED_INTERRUPT_NESTED_VECTOR.
 *
 * Only acknowledges the interrupt and bumps the per-CPU
 * kvm_posted_intr_nested_ipis counter (the "NPI" row of /proc/interrupts);
 * no callback is invoked.
 */
DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi)
{
        apic_eoi();
        inc_irq_stat(kvm_posted_intr_nested_ipis);
}
#endif

#ifdef CONFIG_X86_POSTED_MSI

/* Posted Interrupt Descriptors for coalesced MSIs to be posted */
DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);

/*
 * Initialise the calling CPU's posted MSI descriptor: program the
 * notification vector and the notification destination derived from this
 * CPU's APIC ID.
 */
void intel_posted_msi_init(void)
{
        u32 apic_id, ndst;

        this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR);

        apic_id = this_cpu_read(x86_cpu_to_apicid);

        /*
         * In x2APIC mode the APIC ID is used as is. In XAPIC mode the
         * destination ID is stored in bit 8:15, hence the shift.
         * VT-d spec. CH 9.11
         */
        if (x2apic_enabled())
                ndst = apic_id;
        else
                ndst = apic_id << 8;

        this_cpu_write(posted_msi_pi_desc.ndst, ndst);
}

/*
 * De-multiplexing posted interrupts is on the performance path, the code
 * below is written to optimize the cache performance based on the following
 * considerations:
 * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
 *   accessed by both CPU and IOMMU.
 * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
 *   for checking and clearing posted interrupt request (PIR), a 256 bit field
 *   within the PID.
 * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
 *   line when posting interrupts and setting control bits.
 * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
 * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
 *   cache line. The cache line states after each operation are as follows:
 *   CPU                IOMMU                        PID Cache line state
 *   ---------------------------------------------------------------
 *...read64                                        exclusive
 *...lock xchg64                                modified
 *...                        post/atomic swap        invalid
 *...-------------------------------------------------------------
 *
 * To reduce L1 data cache miss, it is important to avoid contention with
 * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
 * to dispatch interrupt handlers.
 *
 * In addition, the code is trying to keep the cache line state consistent
 * as much as possible. e.g. when making a copy and clearing the PIR
 * (assuming non-zero PIR bits are present in the entire PIR), it does:
 *                read, read, read, read, xchg, xchg, xchg, xchg
 * instead of:
 *                read, xchg, read, xchg, read, xchg, read, xchg
 */
/*
 * Harvest and dispatch the interrupts currently posted in @pir.
 *
 * @pir:  the four 64-bit PIR words of a posted interrupt descriptor
 * @regs: interrupted register frame, handed on to call_irq_handler()
 *
 * Returns true if at least one PIR word was found non-zero, i.e. there
 * was something to dispatch.
 */
static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
{
        int i, vec = FIRST_EXTERNAL_VECTOR;
        unsigned long pir_copy[4];
        bool handled = false;

        /*
         * Pass 1: loads only, so that all reads happen back to back before
         * any xchg (the "read, read, read, read, xchg, ..." pattern).
         * NOTE(review): these are plain loads; consider whether READ_ONCE()
         * is wanted so the compiler cannot re-read pir[i] in pass 2.
         */
        for (i = 0; i < 4; i++)
                pir_copy[i] = pir[i];

        /*
         * Pass 2: atomically fetch-and-clear only the words that were
         * non-zero. The xchg result replaces the earlier snapshot, so the
         * bits dispatched below are exactly the bits that were cleared.
         */
        for (i = 0; i < 4; i++) {
                if (!pir_copy[i])
                        continue;

                pir_copy[i] = arch_xchg(&pir[i], 0);
                handled = true;
        }

        /*
         * Dispatch from the local copy, starting at FIRST_EXTERNAL_VECTOR
         * and bounded by FIRST_SYSTEM_VECTOR. The return value of
         * call_irq_handler() is ignored here.
         */
        if (handled) {
                for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
                        call_irq_handler(vec, regs);
        }

        return handled;
}

/*
 * Upper bound on the number of handle_pending_pir() rounds per
 * notification. Performance data shows that 3 is good enough to harvest
 * 90+% of the benefit on high IRQ rate workload.
 */
#define MAX_POSTED_MSI_COALESCING_LOOP 3

/*
 * Notification handler for MSIs that are delivered as posted interrupts.
 * When such MSIs arrive in high frequency bursts, several of them can be
 * served by one notification (coalescing).
 */
DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
{
        struct pt_regs *old_regs = set_irq_regs(regs);
        struct pi_desc *pid = this_cpu_ptr(&posted_msi_pi_desc);
        int round;

        inc_irq_stat(posted_msi_notification_count);
        irq_enter();

        /*
         * The final handle_pending_pir() call further down counts towards
         * the maximum, so this loop runs at most
         * MAX_POSTED_MSI_COALESCING_LOOP - 1 rounds. It stops early as soon
         * as a round finds nothing pending.
         */
        for (round = 1; round < MAX_POSTED_MSI_COALESCING_LOOP; round++) {
                if (!handle_pending_pir(pid->pir64, regs))
                        break;
        }

        /*
         * Clearing the outstanding notification bit allows new IRQ
         * notifications. It is done as late as possible to keep the
         * coalescing window wide.
         */
        pi_clear_on(pid);

        /*
         * A notification may race with the clearing of the ON bit. Process
         * the PIR one last time so that handling of such interrupts is not
         * delayed until the next IRQ.
         */
        handle_pending_pir(pid->pir64, regs);

        apic_eoi();
        irq_exit();
        set_irq_regs(old_regs);
}
#endif /* X86_POSTED_MSI */

#ifdef CONFIG_HOTPLUG_CPU
/*
 * A cpu has been removed from cpu_online_mask.  Reset irq affinities.
 *
 * Runs on the outgoing CPU: all per-CPU accesses below are to this CPU's
 * vector_irq[] table. After the migration request, every vector that still
 * has a descriptor installed is either retriggered (when it is pending and
 * the irq chip offers irq_retrigger) and marked VECTOR_RETRIGGERED, or
 * marked VECTOR_UNUSED.
 */
void fixup_irqs(void)
{
        unsigned int vector;
        struct irq_desc *desc;
        struct irq_data *data;
        struct irq_chip *chip;

        irq_migrate_all_off_this_cpu();

        /*
         * We can remove mdelay() and then send spurious interrupts to
         * new cpu targets for all the irqs that were handled previously by
         * this cpu. While it works, I have seen spurious interrupt messages
         * (nothing wrong but still...).
         *
         * So for now, retain mdelay(1) and check the IRR and then send those
         * interrupts to new targets as this cpu is already offlined...
         */
        mdelay(1);

        /*
         * We can walk the vector array of this cpu without holding
         * vector_lock because the cpu is already marked !online, so
         * nothing else will touch it.
         */
        for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
                /* Skip slots holding NULL or an error-encoded marker. */
                if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
                        continue;

                if (is_vector_pending(vector)) {
                        desc = __this_cpu_read(vector_irq[vector]);

                        /* The chip callback is invoked under desc->lock. */
                        raw_spin_lock(&desc->lock);
                        data = irq_desc_get_irq_data(desc);
                        chip = irq_data_get_irq_chip(data);
                        if (chip->irq_retrigger) {
                                chip->irq_retrigger(data);
                                __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
                        }
                        raw_spin_unlock(&desc->lock);
                }
                /*
                 * Not pending, or pending but not retriggerable: release
                 * the slot. The slot is re-read on purpose, it may just
                 * have been set to VECTOR_RETRIGGERED above.
                 */
                if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
                        __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
        }
}
#endif

#ifdef CONFIG_X86_THERMAL_VECTOR
/*
 * Forward a thermal LVT interrupt to intel_thermal_interrupt() when thermal
 * handling is enabled; otherwise log the interrupt as unexpected.
 */
static void smp_thermal_vector(void)
{
        if (!x86_thermal_enabled()) {
                pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
                       smp_processor_id());
                return;
        }

        intel_thermal_interrupt();
}

/*
 * Handler for THERMAL_APIC_VECTOR.
 *
 * Counts the interrupt in the per-CPU irq_thermal_count statistic and runs
 * smp_thermal_vector() between the entry and exit tracepoints. The
 * interrupt is acknowledged last, after the handler has run.
 */
DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
{
        trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
        inc_irq_stat(irq_thermal_count);
        smp_thermal_vector();
        trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
        apic_eoi();
}
#endif




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 



















































    1 





































    1 


    1 



    1 



















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011 Novell Inc.
 * Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/fs.h>
#include <linux/cred.h>
#include <linux/ctype.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/exportfs.h>
#include "overlayfs.h"

#include "../internal.h"        /* for vfs_path_lookup */

/*
 * State carried through a lookup across overlay layers.
 * Only the fields whose use is visible in the helpers below are described;
 * NOTE(review): the remaining flags are set/consumed elsewhere - document
 * them next to that code.
 */
struct ovl_lookup_data {
        /* Overlay superblock; passed to OVL_FS() to reach the ovl_fs */
        struct super_block *sb;
        /* Layer being looked up; layer->idx == 0 is treated as the upper layer */
        const struct ovl_layer *layer;
        /* Name being looked up; points into ->redirect once a redirect was followed */
        struct qstr name;
        bool is_dir;
        bool opaque;
        bool xwhiteouts;
        /* Stop further lookup in lower layers; reset when an absolute redirect is found */
        bool stop;
        bool last;
        /* Allocated redirect path owned by this struct; kfree()d when replaced */
        char *redirect;
        int metacopy;
        /* Referring to last redirect xattr */
        bool absolute_redirect;
};

/*
 * Read the redirect xattr of @path and, if present, make it the name to
 * look up next.
 *
 * @path:   object whose redirect xattr is read
 * @d:      lookup state; ->redirect, ->name, ->absolute_redirect and
 *          possibly ->stop are updated
 * @prelen: number of leading bytes of the current name to keep in front of
 *          a relative redirect
 * @post:   remainder of the path, appended after the redirect
 *
 * Returns 0 on success and also when there is no redirect xattr (NULL
 * buffer), or the negative error encoded in the returned pointer.
 */
static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d,
                              size_t prelen, const char *post)
{
        struct ovl_fs *ofs = OVL_FS(d->sb);
        char *redirect;

        d->absolute_redirect = false;

        /* Extra room for the prefix and @post is requested up front */
        redirect = ovl_get_redirect_xattr(ofs, path, prelen + strlen(post));
        if (IS_ERR_OR_NULL(redirect))
                return PTR_ERR(redirect);

        if (redirect[0] != '/') {
                /*
                 * Relative redirect: shift the value (including its NUL)
                 * up by @prelen and put the first @prelen bytes of the
                 * current name in front of it.
                 */
                memmove(redirect + prelen, redirect, strlen(redirect) + 1);
                memcpy(redirect, d->name.name, prelen);
        } else {
                d->absolute_redirect = true;
                /*
                 * An opaque ancestor met earlier in an absolute path
                 * lookup in ovl_lookup_layer() may have set d->stop to end
                 * the lookup in lower layers. An absolute redirect found
                 * in a descendant overrides that: the lookup has to
                 * continue in lower layers.
                 */
                d->stop = false;
        }

        strcat(redirect, post);

        /* Release the previous redirect only after its last use above */
        kfree(d->redirect);
        d->redirect = redirect;
        d->name.name = d->redirect;
        d->name.len = strlen(d->redirect);

        return 0;
}

/*
 * Acceptability callback used when decoding a file handle.
 *
 * @ctx:    the struct vfsmount of the layer being decoded from
 * @dentry: candidate dentry
 *
 * Returns non-zero if @dentry is acceptable, 0 otherwise.
 */
static int ovl_acceptable(void *ctx, struct dentry *dentry)
{
        struct vfsmount *mnt = ctx;

        /*
         * Non-directories are always accepted: a disconnected non-dir
         * origin is fine, it is only needed for its unique inode number.
         */
        if (!d_is_dir(dentry))
                return 1;

        /* A deleted empty directory must not be decoded */
        if (d_unhashed(dentry))
                return 0;

        /* The directory has to live under the root of the decoding layer */
        return is_subdir(dentry, mnt->mnt_root);
}

/*
 * Check validity of an overlay file handle buffer.
 *
 * Return 0 for a valid file handle.
 * Return -ENODATA for "origin unknown".
 * Return <0 for an invalid file handle.
 */
int ovl_check_fb_len(struct ovl_fb *fb, int fb_len)
{
        /* The buffer has to hold the fixed header ... */
        if (fb_len < sizeof(struct ovl_fb))
                return -EINVAL;

        /* ... and everything the header claims to contain */
        if (fb_len < fb->len)
                return -EINVAL;

        if (fb->magic != OVL_FH_MAGIC)
                return -EINVAL;

        /* A newer version or unknown flags mean "origin unknown" */
        if (fb->version > OVL_FH_VERSION)
                return -ENODATA;
        if (fb->flags & ~OVL_FH_FLAG_ALL)
                return -ENODATA;

        /* Handles flagged as endian neutral are always usable */
        if (fb->flags & OVL_FH_FLAG_ANY_ENDIAN)
                return 0;

        /* An endianness mismatch also means "origin unknown" */
        if ((fb->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN)
                return -ENODATA;

        return 0;
}

/*
 * Read an overlay file handle from xattr @ox of @upperdentry.
 *
 * Returns a newly allocated file handle that the caller has to kfree(),
 * NULL when there is no usable handle (xattr missing, unsupported, empty,
 * "origin unknown", invalid, or the read failed), or ERR_PTR(-ENOMEM) when
 * the buffer could not be allocated. Read failures and invalid handles are
 * reported with a rate limited warning.
 */
static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *upperdentry,
                                 enum ovl_xattr ox)
{
        int res, err;
        struct ovl_fh *fh = NULL;

        /* First call with a NULL buffer: only asks for the value size */
        res = ovl_getxattr_upper(ofs, upperdentry, ox, NULL, 0);
        if (res < 0) {
                if (res == -ENODATA || res == -EOPNOTSUPP)
                        return NULL;
                goto fail;
        }
        /* Zero size value means "copied up but origin unknown" */
        if (res == 0)
                return NULL;

        /*
         * The value is stored at fh->buf, which starts OVL_FH_WIRE_OFFSET
         * bytes into the allocation.
         */
        fh = kzalloc(res + OVL_FH_WIRE_OFFSET, GFP_KERNEL);
        if (!fh)
                return ERR_PTR(-ENOMEM);

        res = ovl_getxattr_upper(ofs, upperdentry, ox, fh->buf, res);
        if (res < 0)
                goto fail;

        err = ovl_check_fb_len(&fh->fb, res);
        if (err < 0) {
                /* -ENODATA is "origin unknown": not worth a warning */
                if (err == -ENODATA)
                        goto out;
                goto invalid;
        }

        return fh;

out:
        /* fh may still be NULL here; kfree(NULL) is a no-op */
        kfree(fh);
        return NULL;

fail:
        pr_warn_ratelimited("failed to get origin (%i)\n", res);
        goto out;
invalid:
        /*
         * Dump the bytes that were actually read: they live at fh->buf,
         * not at the start of the allocation. Dumping from fh would print
         * the leading padding and cut off the tail of the handle.
         */
        pr_warn_ratelimited("invalid origin (%*phN)\n", res, fh->buf);
        goto out;
}

/*
 * Decode overlay file handle @fh on the layer mounted at @mnt.
 *
 * @ofs:       overlay filesystem info
 * @fh:        file handle to decode
 * @mnt:       mount of the layer to decode from
 * @connected: if true, decoded dentries are filtered by ovl_acceptable()
 *
 * Returns the decoded dentry, NULL when the handle cannot or must not be
 * decoded here (missing capability, uuid mismatch, stale non-upper handle,
 * "weird" dentry), or an ERR_PTR from exportfs_decode_fh().
 */
struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
                                  struct vfsmount *mnt, bool connected)
{
        struct dentry *real;
        int fid_bytes;

        if (!capable(CAP_DAC_READ_SEARCH))
                return NULL;

        /*
         * The uuid stored in the handle has to match the uuid of the layer
         * the handle is decoded on. With the uuid=off option the stored
         * uuid has to be null instead.
         */
        if (ovl_origin_uuid(ofs)) {
                if (!uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid))
                        return NULL;
        } else if (!uuid_is_null(&fh->fb.uuid)) {
                return NULL;
        }

        /* Size of the fid that follows the ovl_fb header; passed on in 32bit words */
        fid_bytes = (fh->fb.len - offsetof(struct ovl_fb, fid));
        real = exportfs_decode_fh(mnt, (struct fid *)fh->fb.fid,
                                  fid_bytes >> 2, (int)fh->fb.type,
                                  connected ? ovl_acceptable : NULL, mnt);

        if (!IS_ERR(real)) {
                if (ovl_dentry_weird(real)) {
                        dput(real);
                        return NULL;
                }
                return real;
        }

        /*
         * A stale handle to a lower file is treated as "origin unknown".
         * For an upper file handle the error is passed on: it becomes
         * stale when the upper file is unlinked, and callers need to know
         * that to handle stale index entries correctly.
         */
        if (real == ERR_PTR(-ESTALE) &&
            !(fh->fb.flags & OVL_FH_FLAG_PATH_UPPER))
                return NULL;

        return real;
}

/*
 * Look up @name (of length @len) under @base in the layer d->layer and
 * return the result only if it is positive.
 *
 * A negative dentry is released and reported as ERR_PTR(-ENOENT).  When
 * @drop_negative is set and its reference count is 1, the negative dentry
 * is also passed to __d_drop() before being released.  Lookup errors are
 * returned unchanged.
 */
static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d,
                                                   const char *name,
                                                   struct dentry *base, int len,
                                                   bool drop_negative)
{
        struct dentry *found = lookup_one_unlocked(mnt_idmap(d->layer->mnt),
                                                   name, base, len);

        if (IS_ERR(found))
                return found;

        if (!d_flags_negative(smp_load_acquire(&found->d_flags)))
                return found;

        if (drop_negative && found->d_lockref.count == 1) {
                spin_lock(&found->d_lock);
                /* The unlocked test above was racy; repeat it under d_lock */
                if (d_is_negative(found) && found->d_lockref.count == 1)
                        __d_drop(found);
                spin_unlock(&found->d_lock);
        }
        dput(found);

        return ERR_PTR(-ENOENT);
}

/*
 * Look up a single path component in the layer d->layer.
 *
 * @base:          directory dentry to look up in
 * @d:             lookup state; stop/opaque/metacopy/is_dir/xwhiteouts are
 *                 updated according to what was found
 * @name/@namelen: the component to look up
 * @prelen/@post:  passed on to ovl_check_redirect(); an empty @post marks
 *                 this component as the last element of the path
 * @ret:           on success, set to the dentry found or to NULL
 * @drop_negative: passed on to ovl_lookup_positive_unlocked()
 *
 * Returns 0 with *@ret set, or a negative error with *@ret left untouched.
 * A missing entry (-ENOENT) or a too long name (-ENAMETOOLONG) is not an
 * error: *@ret is set to NULL and 0 is returned.  The same happens for a
 * whiteout and for the file type mismatches handled below.
 */
static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
                             const char *name, unsigned int namelen,
                             size_t prelen, const char *post,
                             struct dentry **ret, bool drop_negative)
{
        struct ovl_fs *ofs = OVL_FS(d->sb);
        struct dentry *this;
        struct path path;
        int err;
        bool last_element = !post[0];
        bool is_upper = d->layer->idx == 0;
        char val;

        this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative);
        if (IS_ERR(this)) {
                err = PTR_ERR(this);
                /* Nothing to dput() on either exit path */
                this = NULL;
                if (err == -ENOENT || err == -ENAMETOOLONG)
                        goto out;
                goto out_err;
        }

        if (ovl_dentry_weird(this)) {
                /* Don't support traversing automounts and other weirdness */
                err = -EREMOTE;
                goto out_err;
        }

        path.dentry = this;
        path.mnt = d->layer->mnt;
        /* A whiteout ends the lookup and marks the result opaque */
        if (ovl_path_is_whiteout(ofs, &path)) {
                d->stop = d->opaque = true;
                goto put_and_out;
        }
        /*
         * This dentry should be a regular file if previous layer lookup
         * found a metacopy dentry.
         */
        if (last_element && d->metacopy && !d_is_reg(this)) {
                d->stop = true;
                goto put_and_out;
        }

        if (!d_can_lookup(this)) {
                /*
                 * Not a directory: unusable if a directory was already found
                 * (d->is_dir) or if more path components follow.
                 */
                if (d->is_dir || !last_element) {
                        d->stop = true;
                        goto put_and_out;
                }
                err = ovl_check_metacopy_xattr(ofs, &path, NULL);
                if (err < 0)
                        goto out_err;

                /*
                 * The non-negative result is recorded as the metacopy state;
                 * a zero result stops the lookup at this dentry.
                 */
                d->metacopy = err;
                d->stop = !d->metacopy;
                if (!d->metacopy || d->last)
                        goto out;
        } else {
                if (ovl_lookup_trap_inode(d->sb, this)) {
                        /* Caught in a trap of overlapping layers */
                        err = -ELOOP;
                        goto out_err;
                }

                if (last_element)
                        d->is_dir = true;
                /* d->last set: skip the opaque and redirect checks */
                if (d->last)
                        goto out;

                /* overlay.opaque=x means xwhiteouts directory */
                val = ovl_get_opaquedir_val(ofs, &path);
                if (last_element && !is_upper && val == 'x') {
                        d->xwhiteouts = true;
                        ovl_layer_set_xwhiteouts(ofs, d->layer);
                } else if (val == 'y') {
                        /* 'y' stops the lookup; opaque only on the last element */
                        d->stop = true;
                        if (last_element)
                                d->opaque = true;
                        goto out;
                }
        }
        err = ovl_check_redirect(&path, d, prelen, post);
        if (err)
                goto out_err;
out:
        /* Success: hand the reference on @this (possibly NULL) to the caller */
        *ret = this;
        return 0;

put_and_out:
        /* Success, but the dentry found is not usable: return NULL */
        dput(this);
        this = NULL;
        goto out;

out_err:
        dput(this);
        return err;
}

/*
 * Look up d->name in the layer d->layer, starting at directory @base.
 *
 * A name that does not start with '/' is looked up as a single component
 * directly in @base.  A name that starts with '/' (presumably an absolute
 * redirect path - confirm against the code that sets d->name) is walked one
 * component at a time, each result becoming the base of the next lookup.
 *
 * Returns 0 and stores in *@ret the dentry returned by the last component
 * lookup (which may be NULL), or a negative error with *@ret untouched.
 */
static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
                            struct dentry **ret, bool drop_negative)
{
        /* Counting down from the end, since the prefix can change */
        size_t rem = d->name.len - 1;
        struct dentry *dentry = NULL;
        int err;

        if (d->name.name[0] != '/')
                return ovl_lookup_single(base, d, d->name.name, d->name.len,
                                         0, "", ret, drop_negative);

        while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) {
                const char *s = d->name.name + d->name.len - rem;
                const char *next = strchrnul(s, '/');
                size_t thislen = next - s;
                bool end = !next[0];

                /* Verify we did not go off the rails */
                if (WARN_ON(s[-1] != '/')) {
                        err = -EIO;
                        goto out_put;
                }

                err = ovl_lookup_single(base, d, s, thislen,
                                        d->name.len - rem, next, &base,
                                        drop_negative);
                /* The previous component is no longer needed either way */
                dput(dentry);
                if (err)
                        return err;
                dentry = base;
                if (end)
                        break;

                rem -= thislen + 1;

                /* rem is unsigned: this catches an underflow above */
                if (WARN_ON(rem >= d->name.len)) {
                        err = -EIO;
                        goto out_put;
                }
        }
        *ret = dentry;
        return 0;

out_put:
        /*
         * Sanity check failed in the middle of the walk: drop the reference
         * obtained by the previous component lookup instead of leaking it.
         * dput(NULL) is a no-op, so this is also fine on the first iteration.
         */
        dput(dentry);
        return err;
}

/*
 * Look up the path @redirect from the root of @layer.
 *
 * On success returns 0 with a referenced path stored in @datapath.  A
 * "weird" dentry yields -EREMOTE and anything but a regular file yields
 * -ENOENT; in both cases the path reference is dropped again.  Errors from
 * vfs_path_lookup() are returned unchanged.  @dentry is only used for the
 * debug message.
 */
static int ovl_lookup_data_layer(struct dentry *dentry, const char *redirect,
                                 const struct ovl_layer *layer,
                                 struct path *datapath)
{
        int err = vfs_path_lookup(layer->mnt->mnt_root, layer->mnt, redirect,
                        LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | LOOKUP_NO_XDEV,
                        datapath);

        pr_debug("lookup lowerdata (%pd2, redirect=\"%s\", layer=%d, err=%i)\n",
                 dentry, redirect, layer->idx, err);
        if (err)
                return err;

        if (ovl_dentry_weird(datapath->dentry)) {
                err = -EREMOTE;
        } else if (!d_is_reg(datapath->dentry)) {
                /* Only regular file is acceptable as lower data */
                err = -ENOENT;
        } else {
                return 0;
        }

        path_put(datapath);
        return err;
}

/* Lookup in data-only layers by absolute redirect to layer root */
static int ovl_lookup_data_layers(struct dentry *dentry, const char *redirect,
                                  struct ovl_path *lowerdata)
{
        struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
        const struct ovl_layer *layer;
        struct path datapath;
        int err = -ENOENT;
        int i;

        layer = &ofs->layers[ofs->numlayer - ofs->numdatalayer];
        for (i = 0; i < ofs->numdatalayer; i++, layer++) {
                err = ovl_lookup_data_layer(dentry, redirect, layer, &datapath);
                if (!err) {
                        mntput(datapath.mnt);
                        lowerdata->dentry = datapath.dentry;
                        lowerdata->layer = layer;
                        return 0;
                }
        }

        return err;
}

/*
 * Decode the origin file handle @fh on the lower layers, in layer order, and
 * store the first result in **@stackp.
 *
 * If *@stackp is NULL, a single struct ovl_path is allocated for the result.
 * If @upperdentry is given and is not a whiteout, its file type has to match
 * the file type of the decoded origin.
 *
 * Returns 0 on success, -ESTALE if no layer could decode the handle or the
 * file types mismatch, -ENOMEM on allocation failure, or the error returned
 * by ovl_decode_real_fh().
 */
int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
                        struct dentry *upperdentry, struct ovl_path **stackp)
{
        struct dentry *origin = NULL;
        struct ovl_path *slot;
        int i;

        for (i = 1; i <= ovl_numlowerlayer(ofs); i++) {
                /*
                 * fh->uuid cannot be matched to a layer whose fs uuid is
                 * not unique among the lower filesystems.
                 */
                if (ofs->layers[i].fsid &&
                    ofs->layers[i].fs->bad_uuid)
                        continue;

                origin = ovl_decode_real_fh(ofs, fh, ofs->layers[i].mnt,
                                            connected);
                /* Stop on the first hit and on the first hard error */
                if (origin)
                        break;
        }

        if (!origin)
                return -ESTALE;
        if (IS_ERR(origin))
                return PTR_ERR(origin);

        if (upperdentry && !ovl_upper_is_whiteout(ofs, upperdentry) &&
            inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode)) {
                pr_warn_ratelimited("invalid origin (%pd2, ftype=%x, origin ftype=%x).\n",
                                    upperdentry,
                                    d_inode(upperdentry)->i_mode & S_IFMT,
                                    d_inode(origin)->i_mode & S_IFMT);
                dput(origin);
                return -ESTALE;
        }

        slot = *stackp;
        if (!slot) {
                slot = kmalloc(sizeof(struct ovl_path), GFP_KERNEL);
                *stackp = slot;
                if (!slot) {
                        dput(origin);
                        return -ENOMEM;
                }
        }

        /* The reference on @origin is handed over to the stack entry */
        *slot = (struct ovl_path){
                .dentry = origin,
                .layer = &ofs->layers[i]
        };

        return 0;
}

/*
 * Read the origin file handle xattr of @upperdentry and decode it into
 * **@stackp using ovl_check_origin_fh().
 *
 * Returns 0 on success, when there is no origin file handle, or when the
 * origin is stale (-ESTALE is not treated as an error here); otherwise a
 * negative error.
 */
static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry,
                            struct ovl_path **stackp)
{
        struct ovl_fh *fh = ovl_get_fh(ofs, upperdentry, OVL_XATTR_ORIGIN);
        int err;

        /* PTR_ERR(NULL) is 0: a missing file handle is not an error */
        if (IS_ERR_OR_NULL(fh))
                return PTR_ERR(fh);

        err = ovl_check_origin_fh(ofs, fh, false, upperdentry, stackp);
        kfree(fh);

        return err == -ESTALE ? 0 : err;
}

/*
 * Verify that @fh matches the file handle stored in xattr @ox of @dentry.
 *
 * Return 0 on match, -ESTALE on mismatch, -ENODATA when ovl_get_fh() finds
 * no stored file handle, or the error returned by ovl_get_fh().
 */
static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry,
                         enum ovl_xattr ox, const struct ovl_fh *fh)
{
        struct ovl_fh *stored = ovl_get_fh(ofs, dentry, ox);
        int err;

        if (IS_ERR(stored))
                return PTR_ERR(stored);
        if (!stored)
                return -ENODATA;

        /* Handles match only if both length and content are identical */
        if (fh->fb.len == stored->fb.len &&
            !memcmp(&fh->fb, &stored->fb, fh->fb.len))
                err = 0;
        else
                err = -ESTALE;

        kfree(stored);
        return err;
}

/*
 * Verify that @fh matches the file handle stored in xattr @ox of @dentry.
 * If @set is true and no file handle is stored yet, store @fh in the xattr.
 *
 * Returns the result of ovl_verify_fh(), or the result of ovl_setxattr()
 * when the file handle had to be stored.  @is_upper is not used here.
 */
int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
                      enum ovl_xattr ox, const struct ovl_fh *fh,
                      bool is_upper, bool set)
{
        int err = ovl_verify_fh(ofs, dentry, ox, fh);

        if (!set || err != -ENODATA)
                return err;

        return ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len);
}

/*
 * Verify that the file handle of the @real dentry matches the file handle
 * stored in xattr @ox of @dentry.
 *
 * If @set is true and there is no stored file handle, the encoded file
 * handle of @real is stored in xattr @ox.
 *
 * Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error.
 * Every non-zero result is reported with a rate limited warning.
 */
int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry,
                            enum ovl_xattr ox, struct dentry *real,
                            bool is_upper, bool set)
{
        struct ovl_fh *fh = ovl_encode_real_fh(ofs, real, is_upper);
        struct inode *inode;
        int err;

        if (IS_ERR(fh)) {
                err = PTR_ERR(fh);
                /* Nothing to free below */
                fh = NULL;
        } else {
                err = ovl_verify_set_fh(ofs, dentry, ox, fh, is_upper, set);
        }

        if (err) {
                inode = d_inode(real);
                pr_warn_ratelimited("failed to verify %s (%pd2, ino=%lu, err=%i)\n",
                                    is_upper ? "upper" : "origin", real,
                                    inode ? inode->i_ino : 0, err);
        }

        kfree(fh);
        return err;
}


/*
 * Get upper dentry from index.
 *
 * A non-directory index is returned itself, with an extra reference.  For a
 * directory index, the file handle stored in its OVL_XATTR_UPPER xattr is
 * decoded on the upper mount.
 *
 * Returns the referenced upper dentry, NULL when the index has no 'upper'
 * file handle, ERR_PTR(-ESTALE) when the handle could not be decoded,
 * ERR_PTR(-EIO) when it decodes to a non-directory, or another ERR_PTR().
 */
struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
                               bool connected)
{
        struct dentry *upper;
        struct ovl_fh *fh;

        if (!d_is_dir(index))
                return dget(index);

        fh = ovl_get_fh(ofs, index, OVL_XATTR_UPPER);
        if (IS_ERR_OR_NULL(fh))
                return ERR_CAST(fh);

        upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), connected);
        kfree(fh);

        if (IS_ERR_OR_NULL(upper)) {
                /* A handle that decodes to nothing is reported as stale */
                if (!upper)
                        upper = ERR_PTR(-ESTALE);
                return upper;
        }

        if (d_is_dir(upper))
                return upper;

        pr_warn_ratelimited("invalid index upper (%pd2, upper=%pd2).\n",
                            index, upper);
        dput(upper);

        return ERR_PTR(-EIO);
}

/*
 * Verify that an index entry name matches the origin file handle stored in
 * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path.
 *
 * Return 0 on match (also for a negative index dentry, for a whiteout index
 * and for a directory index when NFS export is disabled), -ESTALE on
 * mismatch or stale origin, -ENOENT for an orphan index entry, < 0 on error.
 */
int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index)
{
        struct ovl_fh *fh = NULL;
        size_t len;
        struct ovl_path origin = { };
        /* Point at the on-stack entry so ovl_check_origin_fh() won't allocate */
        struct ovl_path *stack = &origin;
        struct dentry *upper = NULL;
        int err;

        if (!d_inode(index))
                return 0;

        /* The name is a hex string: two characters per file handle byte */
        err = -EINVAL;
        if (index->d_name.len < sizeof(struct ovl_fb)*2)
                goto fail;

        err = -ENOMEM;
        len = index->d_name.len / 2;
        fh = kzalloc(len + OVL_FH_WIRE_OFFSET, GFP_KERNEL);
        if (!fh)
                goto fail;

        err = -EINVAL;
        if (hex2bin(fh->buf, index->d_name.name, len))
                goto fail;

        err = ovl_check_fb_len(&fh->fb, len);
        if (err)
                goto fail;

        /*
         * Whiteout index entries are used as an indication that an exported
         * overlay file handle should be treated as stale (i.e. after unlink
         * of the overlay inode). These entries contain no origin xattr.
         * err is 0 at this point, so a whiteout index verifies successfully.
         */
        if (ovl_is_whiteout(index))
                goto out;

        /*
         * Verifying directory index entries are not stale is expensive, so
         * only verify stale dir index if NFS export is enabled.
         */
        if (d_is_dir(index) && !ofs->config.nfs_export)
                goto out;

        /*
         * Directory index entries should have 'upper' xattr pointing to the
         * real upper dir. Non-dir index entries are hardlinks to the upper
         * real inode. For non-dir index, we can read the copy up origin xattr
         * directly from the index dentry, but for dir index we first need to
         * decode the upper directory.
         */
        upper = ovl_index_upper(ofs, index, false);
        if (IS_ERR_OR_NULL(upper)) {
                /* PTR_ERR(NULL) is 0, i.e. no 'upper' xattr at all */
                err = PTR_ERR(upper);
                /*
                 * Directory index entries with no 'upper' xattr need to be
                 * removed. When dir index entry has a stale 'upper' xattr,
                 * we assume that upper dir was removed and we treat the dir
                 * index as orphan entry that needs to be whited out.
                 */
                if (err == -ESTALE)
                        goto orphan;
                else if (!err)
                        err = -ESTALE;
                goto fail;
        }

        /* The index name must equal the origin fh stored on the upper inode */
        err = ovl_verify_fh(ofs, upper, OVL_XATTR_ORIGIN, fh);
        dput(upper);
        if (err)
                goto fail;

        /* Check if non-dir index is orphan and don't warn before cleaning it */
        if (!d_is_dir(index) && d_inode(index)->i_nlink == 1) {
                /* On success this stores the decoded dentry in origin.dentry */
                err = ovl_check_origin_fh(ofs, fh, false, index, &stack);
                if (err)
                        goto fail;

                if (ovl_get_nlink(ofs, origin.dentry, index, 0) == 0)
                        goto orphan;
        }

out:
        /* origin.dentry is NULL unless ovl_check_origin_fh() succeeded */
        dput(origin.dentry);
        kfree(fh);
        return err;

fail:
        pr_warn_ratelimited("failed to verify index (%pd2, ftype=%x, err=%i)\n",
                            index, d_inode(index)->i_mode & S_IFMT, err);
        goto out;

orphan:
        pr_warn_ratelimited("orphan index entry (%pd2, ftype=%x, nlink=%u)\n",
                            index, d_inode(index)->i_mode & S_IFMT,
                            d_inode(index)->i_nlink);
        err = -ENOENT;
        goto out;
}

/*
 * Build the index entry name for file handle @fh: the hex representation of
 * the fh->fb.len bytes at fh->buf.
 *
 * The buffer holds exactly two characters per byte, so use name->len rather
 * than relying on a terminating NUL.  On success the caller owns name->name
 * and has to kfree() it.  Returns 0 or -ENOMEM.
 */
int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name)
{
        char *hex, *end;

        hex = kcalloc(fh->fb.len, 2, GFP_KERNEL);
        if (!hex)
                return -ENOMEM;

        end = bin2hex(hex, fh->buf, fh->fb.len);
        *name = (struct qstr) QSTR_INIT(hex, end - hex);

        return 0;
}

/*
 * Get the name of the index entry of a lower real inode or a copy up origin
 * inode, for lookup in indexdir. The index entry name is the hex
 * representation of the lower inode file handle.
 *
 * If the index dentry is negative, then either no lower aliases have been
 * copied up yet, or aliases have been copied up in older kernels and are
 * not indexed.
 *
 * If the index dentry for a copy up origin inode is positive, but points
 * to an inode different than the upper inode, then either the upper inode
 * has been copied up and not indexed or it was indexed, but since then
 * index dir was cleared. Either way, that index cannot be used to identify
 * the overlay inode.
 *
 * Returns 0 with an allocated name in @name, or a negative error from
 * encoding the file handle or from building the name.
 */
int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
                       struct qstr *name)
{
        struct ovl_fh *fh = ovl_encode_real_fh(ofs, origin, false);
        int err;

        if (IS_ERR(fh))
                return PTR_ERR(fh);

        err = ovl_get_index_name_fh(fh, name);
        kfree(fh);

        return err;
}

/*
 * Lookup index by file handle for NFS export.
 *
 * Returns the positive index dentry, NULL when there is no such index entry,
 * ERR_PTR(-ESTALE) when the index is a whiteout, ERR_PTR(-EIO) when it is a
 * "weird" dentry, or another ERR_PTR() on failure.
 */
struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh)
{
        struct dentry *index;
        struct qstr name;
        int err;

        err = ovl_get_index_name_fh(fh, &name);
        if (err)
                return ERR_PTR(err);

        index = lookup_positive_unlocked(name.name, ofs->workdir, name.len);
        kfree(name.name);
        if (IS_ERR(index))
                return PTR_ERR(index) == -ENOENT ? NULL : index;

        if (ovl_is_whiteout(index)) {
                dput(index);
                return ERR_PTR(-ESTALE);
        }

        if (ovl_dentry_weird(index)) {
                dput(index);
                return ERR_PTR(-EIO);
        }

        return index;
}

/*
 * Look up the index entry of @origin in ofs->workdir.
 *
 * @upper:  upper dentry to match the index against, or NULL
 * @origin: lower dentry whose file handle names the index entry
 * @verify: whether a directory index has to be verified against @upper;
 *          with !verify a whiteout index is silently reported as stale
 *
 * Returns the referenced index dentry, NULL when there is no index entry or
 * the index inode is not the inode of @upper, ERR_PTR(-ESTALE) for a
 * whiteout index with !verify, ERR_PTR(-EIO) for a bad index, or the
 * ERR_PTR() of a failed name encoding or lookup.
 */
struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
                                struct dentry *origin, bool verify)
{
        struct dentry *index;
        struct inode *inode;
        struct qstr name;
        bool is_dir = d_is_dir(origin);
        int err;

        err = ovl_get_index_name(ofs, origin, &name);
        if (err)
                return ERR_PTR(err);

        index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name,
                                             ofs->workdir, name.len);
        if (IS_ERR(index)) {
                err = PTR_ERR(index);
                /* No index entry is not an error: report it as NULL */
                if (err == -ENOENT) {
                        index = NULL;
                        goto out;
                }
                /* Any other failure is returned to the caller as ERR_PTR() */
                pr_warn_ratelimited("failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n"
                                    "overlayfs: mount with '-o index=off' to disable inodes index.\n",
                                    d_inode(origin)->i_ino, name.len, name.name,
                                    err);
                goto out;
        }

        inode = d_inode(index);
        if (ovl_is_whiteout(index) && !verify) {
                /*
                 * When index lookup is called with !verify for decoding an
                 * overlay file handle, a whiteout index implies that decode
                 * should treat file handle as stale and no need to print a
                 * warning about it.
                 */
                dput(index);
                index = ERR_PTR(-ESTALE);
                goto out;
        } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) ||
                   inode_wrong_type(inode, d_inode(origin)->i_mode)) {
                /*
                 * Index should always be of the same file type as origin
                 * except for the case of a whiteout index. A whiteout
                 * index should only exist if all lower aliases have been
                 * unlinked, which means that finding a lower origin on lookup
                 * whose index is a whiteout should be treated as an error.
                 */
                pr_warn_ratelimited("bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n",
                                    index, d_inode(index)->i_mode & S_IFMT,
                                    d_inode(origin)->i_mode & S_IFMT);
                goto fail;
        } else if (is_dir && verify) {
                /* A verified directory index requires a matching upper dir */
                if (!upper) {
                        pr_warn_ratelimited("suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n",
                                            origin, index);
                        goto fail;
                }

                /* Verify that dir index 'upper' xattr points to upper dir */
                err = ovl_verify_upper(ofs, index, upper, false);
                if (err) {
                        if (err == -ESTALE) {
                                pr_warn_ratelimited("suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n",
                                                    upper, origin, index);
                        }
                        goto fail;
                }
        } else if (upper && d_inode(upper) != inode) {
                /* Index belongs to a different inode: silently ignore it */
                goto out_dput;
        }
out:
        /* Common exit: the index name is no longer needed */
        kfree(name.name);
        return index;

out_dput:
        /* Unusable, but not an error: drop the index and return NULL */
        dput(index);
        index = NULL;
        goto out;

fail:
        /* Bad index: drop it and report -EIO */
        dput(index);
        index = ERR_PTR(-EIO);
        goto out;
}

/*
 * Fill @path and @layer with the layer at position @idx of @dentry's stack,
 * where 0 is the upper layer and 1..numlower are the lower layers.  If @idx
 * is 0 and there is no upper dentry, the first lower layer is used instead.
 *
 * Returns next layer in stack starting from top.
 * Returns -1 if this is the last layer.
 */
int ovl_path_next(int idx, struct dentry *dentry, struct path *path,
                  const struct ovl_layer **layer)
{
        struct ovl_entry *oe = OVL_E(dentry);
        struct ovl_path *lowerstack = ovl_lowerstack(oe);
        struct ovl_path *lower;

        BUG_ON(idx < 0);
        if (!idx) {
                ovl_path_upper(dentry, path);
                if (path->dentry) {
                        *layer = &OVL_FS(dentry->d_sb)->layers[0];
                        if (ovl_numlower(oe))
                                return 1;
                        return -1;
                }
                /* No upper dentry: fall through to the first lower layer */
                idx = 1;
        }
        BUG_ON(idx > ovl_numlower(oe));

        /* Lower stack entries are 0-based, @idx is 1-based */
        lower = &lowerstack[idx - 1];
        path->dentry = lower->dentry;
        *layer = lower->layer;
        path->mnt = (*layer)->mnt;

        if (idx < ovl_numlower(oe))
                return idx + 1;

        return -1;
}

/*
 * Fix missing 'origin' xattr.
 *
 * If ovl_check_origin_xattr() reports nothing on @upper, store the origin
 * file handle of @lower on it and, if that worked, call ovl_set_impure()
 * on the parent directory.  Write access to the overlay is taken around the
 * update.
 *
 * Returns 0 when nothing had to be done or on success, < 0 on error.
 */
static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry,
                          struct dentry *lower, struct dentry *upper)
{
        const struct ovl_fh *fh;
        int err;

        if (ovl_check_origin_xattr(ofs, upper))
                return 0;

        fh = ovl_get_origin_fh(ofs, lower);
        if (IS_ERR(fh))
                return PTR_ERR(fh);

        err = ovl_want_write(dentry);
        if (!err) {
                err = ovl_set_origin_fh(ofs, fh, upper);
                if (!err)
                        err = ovl_set_impure(dentry->d_parent,
                                             upper->d_parent);

                ovl_drop_write(dentry);
        }

        kfree(fh);
        return err;
}

/*
 * Validate the verity digest of a metacopy @dentry, unless verity is off,
 * the dentry is not a metacopy dentry, or the digest was already verified.
 *
 * A metacopy file without a digest is only an error (-EIO) in
 * OVL_VERITY_REQUIRE mode.  The validation itself runs under the overlay
 * inode lock and with the credentials from ovl_override_creds(); on success
 * OVL_VERIFIED_DIGEST is set on the inode so that it is done only once.
 *
 * Returns 0 when there is nothing to do or on success, < 0 on error.
 */
static int ovl_maybe_validate_verity(struct dentry *dentry)
{
        struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
        struct inode *inode = d_inode(dentry);
        struct path datapath, metapath;
        const struct cred *old_cred;
        int err;

        if (!ofs->config.verity_mode)
                return 0;
        if (!ovl_is_metacopy_dentry(dentry))
                return 0;
        if (ovl_test_flag(OVL_VERIFIED_DIGEST, inode))
                return 0;

        if (!ovl_test_flag(OVL_HAS_DIGEST, inode)) {
                if (ofs->config.verity_mode != OVL_VERITY_REQUIRE)
                        return 0;

                pr_warn_ratelimited("metacopy file '%pd' has no digest specified\n",
                                    dentry);
                return -EIO;
        }

        ovl_path_lowerdata(dentry, &datapath);
        if (!datapath.dentry)
                return -EIO;

        ovl_path_real(dentry, &metapath);
        if (!metapath.dentry)
                return -EIO;

        err = ovl_inode_lock_interruptible(inode);
        if (err)
                return err;

        /* Recheck under the lock: someone may have verified it meanwhile */
        if (!ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) {
                old_cred = ovl_override_creds(dentry->d_sb);

                err = ovl_validate_verity(ofs, &metapath, &datapath);
                if (!err)
                        ovl_set_flag(OVL_VERIFIED_DIGEST, inode);

                revert_creds(old_cred);
        }

        ovl_inode_unlock(inode);

        return err;
}

/*
 * Lazy lookup of lowerdata.
 *
 * Does nothing if the inode has no lowerdata redirect or the lowerdata
 * dentry is already set.  Otherwise the redirect, which has to be an
 * absolute path, is looked up in the data-only layers under the overlay
 * inode lock and the result is set as the lowerdata of @dentry.
 *
 * Returns 0 when there is nothing to do or on success, -EIO for a redirect
 * that is not absolute, or another negative error.
 */
static int ovl_maybe_lookup_lowerdata(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        const char *redirect = ovl_lowerdata_redirect(inode);
        struct ovl_path datapath = {};
        const struct cred *old_cred;
        int err;

        if (!redirect || ovl_dentry_lowerdata(dentry))
                return 0;

        if (redirect[0] != '/')
                return -EIO;

        err = ovl_inode_lock_interruptible(inode);
        if (err)
                return err;

        /* Nothing left to do if someone got here before us */
        if (!ovl_dentry_lowerdata(dentry)) {
                old_cred = ovl_override_creds(dentry->d_sb);
                err = ovl_lookup_data_layers(dentry, redirect, &datapath);
                revert_creds(old_cred);

                if (!err)
                        err = ovl_dentry_set_lowerdata(dentry, &datapath);
                if (err)
                        pr_warn_ratelimited("lazy lowerdata lookup failed (%pd2, err=%i)\n",
                                            dentry, err);
        }

        ovl_inode_unlock(inode);
        /* Drop the lookup reference; datapath.dentry is NULL if none was taken */
        dput(datapath.dentry);

        return err;
}

/*
 * Make sure the lowerdata of @dentry is looked up and, if that succeeded,
 * validate its verity digest.  Returns 0 on success, < 0 on error.
 */
int ovl_verify_lowerdata(struct dentry *dentry)
{
        int err = ovl_maybe_lookup_lowerdata(dentry);

        if (!err)
                err = ovl_maybe_validate_verity(dentry);

        return err;
}

/*
 * Look up @dentry in the overlay.
 *
 * The name is first looked up in the upper dentry of the parent (when the
 * parent has one), then in each entry of the parent's lower stack until the
 * lookup data says to stop.  Lower dentries that are found are collected in
 * a temporary stack, an index entry is looked up when applicable, and an
 * overlay inode is obtained for the result.
 *
 * Returns the result of d_splice_alias() on success.  On failure returns
 * ERR_PTR() of the error, e.g. -ENAMETOOLONG when the name exceeds
 * ofs->namelen, -ENOMEM on allocation failure, -EPERM when a metacopy origin
 * or redirect must not be followed, -EIO when a metacopy has no lower data.
 *
 * All work between ovl_override_creds() and revert_creds() runs with the
 * overlay's overridden credentials; every exit path after the override
 * reverts them.
 */
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
                          unsigned int flags)
{
        struct ovl_entry *oe = NULL;
        const struct cred *old_cred;
        struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
        struct ovl_entry *poe = OVL_E(dentry->d_parent);
        struct ovl_entry *roe = OVL_E(dentry->d_sb->s_root);
        struct ovl_path *stack = NULL, *origin_path = NULL;
        struct dentry *upperdir, *upperdentry = NULL;
        struct dentry *origin = NULL;
        struct dentry *index = NULL;
        unsigned int ctr = 0;
        struct inode *inode = NULL;
        bool upperopaque = false;
        char *upperredirect = NULL;
        struct dentry *this;
        unsigned int i;
        int err;
        bool uppermetacopy = false;
        int metacopy_size = 0;
        struct ovl_lookup_data d = {
                .sb = dentry->d_sb,
                .name = dentry->d_name,
                .is_dir = false,
                .opaque = false,
                .stop = false,
                .last = ovl_redirect_follow(ofs) ? false : !ovl_numlower(poe),
                .redirect = NULL,
                .metacopy = 0,
        };

        /* Refuse names longer than the limit recorded for this overlay */
        if (dentry->d_name.len > ofs->namelen)
                return ERR_PTR(-ENAMETOOLONG);

        old_cred = ovl_override_creds(dentry->d_sb);
        /* Upper layer lookup, only when the parent has an upper dentry */
        upperdir = ovl_dentry_upper(dentry->d_parent);
        if (upperdir) {
                d.layer = &ofs->layers[0];
                err = ovl_lookup_layer(upperdir, &d, &upperdentry, true);
                if (err)
                        goto out;

                /* An upper dentry flagged DCACHE_OP_REAL is rejected */
                if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) {
                        dput(upperdentry);
                        err = -EREMOTE;
                        goto out;
                }
                if (upperdentry && !d.is_dir) {
                        /*
                         * Lookup copy up origin by decoding origin file handle.
                         * We may get a disconnected dentry, which is fine,
                         * because we only need to hold the origin inode in
                         * cache and use its inode number.  We may even get a
                         * connected dentry, that is not under any of the lower
                         * layers root.  That is also fine for using its inode
                         * number - it's the same as if we held a reference
                         * to a dentry in lower layer that was moved under us.
                         */
                        err = ovl_check_origin(ofs, upperdentry, &origin_path);
                        if (err)
                                goto out_put_upper;

                        if (d.metacopy)
                                uppermetacopy = true;
                        metacopy_size = d.metacopy;
                }

                if (d.redirect) {
                        /* Keep a private copy; d.redirect may change below */
                        err = -ENOMEM;
                        upperredirect = kstrdup(d.redirect, GFP_KERNEL);
                        if (!upperredirect)
                                goto out_put_upper;
                        /* Absolute redirect: continue from the root entry */
                        if (d.redirect[0] == '/')
                                poe = roe;
                }
                upperopaque = d.opaque;
        }

        /* Temporary stack for the lower dentries collected by the loop below */
        if (!d.stop && ovl_numlower(poe)) {
                err = -ENOMEM;
                stack = ovl_stack_alloc(ofs->numlayer - 1);
                if (!stack)
                        goto out_put_upper;
        }

        for (i = 0; !d.stop && i < ovl_numlower(poe); i++) {
                struct ovl_path lower = ovl_lowerstack(poe)[i];

                if (!ovl_redirect_follow(ofs))
                        d.last = i == ovl_numlower(poe) - 1;
                else if (d.is_dir || !ofs->numdatalayer)
                        d.last = lower.layer->idx == ovl_numlower(roe);

                d.layer = lower.layer;
                err = ovl_lookup_layer(lower.dentry, &d, &this, false);
                if (err)
                        goto out_put;

                /* Nothing found in this layer, try the next one */
                if (!this)
                        continue;

                if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) {
                        dput(this);
                        err = -EPERM;
                        pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry);
                        goto out_put;
                }

                /*
                 * If no origin fh is stored in upper of a merge dir, store fh
                 * of lower dir and set upper parent "impure".
                 */
                if (upperdentry && !ctr && !ofs->noxattr && d.is_dir) {
                        err = ovl_fix_origin(ofs, dentry, this, upperdentry);
                        if (err) {
                                dput(this);
                                goto out_put;
                        }
                }

                /*
                 * When "verify_lower" feature is enabled, do not merge with a
                 * lower dir that does not match a stored origin xattr. In any
                 * case, only verified origin is used for index lookup.
                 *
                 * For non-dir dentry, if index=on, then ensure origin
                 * matches the dentry found using path based lookup,
                 * otherwise error out.
                 */
                if (upperdentry && !ctr &&
                    ((d.is_dir && ovl_verify_lower(dentry->d_sb)) ||
                     (!d.is_dir && ofs->config.index && origin_path))) {
                        err = ovl_verify_origin(ofs, upperdentry, this, false);
                        if (err) {
                                dput(this);
                                /* Mismatching dir: stop merging, not fatal */
                                if (d.is_dir)
                                        break;
                                goto out_put;
                        }
                        origin = this;
                }

                if (!upperdentry && !d.is_dir && !ctr && d.metacopy)
                        metacopy_size = d.metacopy;

                if (d.metacopy && ctr) {
                        /*
                         * Do not store intermediate metacopy dentries in
                         * lower chain, except top most lower metacopy dentry.
                         * Continue the loop so that if there is an absolute
                         * redirect on this dentry, poe can be reset to roe.
                         */
                        dput(this);
                        this = NULL;
                } else {
                        stack[ctr].dentry = this;
                        stack[ctr].layer = lower.layer;
                        ctr++;
                }

                /*
                 * Following redirects can have security consequences: it's like
                 * a symlink into the lower layer without the permission checks.
                 * This is only a problem if the upper layer is untrusted (e.g.
                 * comes from a USB drive).  This can allow a non-readable file
                 * or directory to become readable.
                 *
                 * Only following redirects when redirects are enabled disables
                 * this attack vector when not necessary.
                 */
                err = -EPERM;
                if (d.redirect && !ovl_redirect_follow(ofs)) {
                        pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n",
                                            dentry);
                        goto out_put;
                }

                if (d.stop)
                        break;

                if (d.redirect && d.redirect[0] == '/' && poe != roe) {
                        poe = roe;
                        /* Find the current layer on the root dentry */
                        i = lower.layer->idx - 1;
                }
        }

        /* Defer lookup of lowerdata in data-only layers to first access */
        if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) {
                d.metacopy = 0;
                /* Reserve a stack slot for the not yet looked up lowerdata */
                ctr++;
        }

        /*
         * For regular non-metacopy upper dentries, there is no lower
         * path based lookup, hence ctr will be zero. If a dentry is found
         * using ORIGIN xattr on upper, install it in stack.
         *
         * For metacopy dentry, path based lookup will find lower dentries.
         * Just make sure a corresponding data dentry has been found.
         */
        if (d.metacopy || (uppermetacopy && !ctr)) {
                pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n",
                                    dentry);
                err = -EIO;
                goto out_put;
        } else if (!d.is_dir && upperdentry && !ctr && origin_path) {
                if (WARN_ON(stack != NULL)) {
                        err = -EIO;
                        goto out_put;
                }
                /* origin_path becomes the stack; ownership moves with it */
                stack = origin_path;
                ctr = 1;
                origin = origin_path->dentry;
                origin_path = NULL;
        }

        /*
         * Always lookup index if there is no-upperdentry.
         *
         * For the case of upperdentry, we have set origin by now if it
         * needed to be set. There are basically three cases.
         *
         * For directories, lookup index by lower inode and verify it matches
         * upper inode. We only trust dir index if we verified that lower dir
         * matches origin, otherwise dir index entries may be inconsistent
         * and we ignore them.
         *
         * For regular upper, we already set origin if upper had ORIGIN
         * xattr. There is no verification though as there is no path
         * based dentry lookup in lower in this case.
         *
         * For metacopy upper, we set a verified origin already if index
         * is enabled and if upper had an ORIGIN xattr.
         *
         */
        if (!upperdentry && ctr)
                origin = stack[0].dentry;

        if (origin && ovl_indexdir(dentry->d_sb) &&
            (!d.is_dir || ovl_index_all(dentry->d_sb))) {
                index = ovl_lookup_index(ofs, upperdentry, origin, true);
                if (IS_ERR(index)) {
                        err = PTR_ERR(index);
                        /* Clear it so the error path's dput(index) is safe */
                        index = NULL;
                        goto out_put;
                }
        }

        /* Copy the collected lower stack into a newly allocated entry */
        if (ctr) {
                oe = ovl_alloc_entry(ctr);
                err = -ENOMEM;
                if (!oe)
                        goto out_put;

                ovl_stack_cpy(ovl_lowerstack(oe), stack, ctr);
        }

        if (upperopaque)
                ovl_dentry_set_opaque(dentry);
        if (d.xwhiteouts)
                ovl_dentry_set_xwhiteouts(dentry);

        if (upperdentry)
                ovl_dentry_set_upper_alias(dentry);
        else if (index) {
                /* No upper alias: use the index entry as the upper dentry */
                struct path upperpath = {
                        .dentry = upperdentry = dget(index),
                        .mnt = ovl_upper_mnt(ofs),
                };

                /*
                 * It's safe to assign upperredirect here: the previous
                 * assignment happens only if upperdentry is non-NULL, and
                 * this one only if upperdentry is NULL.
                 */
                upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
                if (IS_ERR(upperredirect)) {
                        err = PTR_ERR(upperredirect);
                        /* Clear it so the error path's kfree() is safe */
                        upperredirect = NULL;
                        goto out_free_oe;
                }
                err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL);
                if (err < 0)
                        goto out_free_oe;
                /* Non-negative result doubles as metacopy flag and size */
                uppermetacopy = err;
                metacopy_size = err;
        }

        if (upperdentry || ctr) {
                struct ovl_inode_params oip = {
                        .upperdentry = upperdentry,
                        .oe = oe,
                        .index = index,
                        .redirect = upperredirect,
                };

                /* Store lowerdata redirect for lazy lookup */
                if (ctr > 1 && !d.is_dir && !stack[ctr - 1].dentry) {
                        oip.lowerdata_redirect = d.redirect;
                        /* Handed over to oip; skip the kfree(d.redirect) below */
                        d.redirect = NULL;
                }
                inode = ovl_get_inode(dentry->d_sb, &oip);
                err = PTR_ERR(inode);
                if (IS_ERR(inode))
                        goto out_free_oe;
                if (upperdentry && !uppermetacopy)
                        ovl_set_flag(OVL_UPPERDATA, inode);

                if (metacopy_size > OVL_METACOPY_MIN_SIZE)
                        ovl_set_flag(OVL_HAS_DIGEST, inode);
        }

        ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode));

        /* Success: drop the temporaries that were not handed over */
        revert_creds(old_cred);
        if (origin_path) {
                dput(origin_path->dentry);
                kfree(origin_path);
        }
        dput(index);
        ovl_stack_free(stack, ctr);
        kfree(d.redirect);
        return d_splice_alias(inode, dentry);

        /* Error unwinding: each label also runs everything below it */
out_free_oe:
        ovl_free_entry(oe);
out_put:
        dput(index);
        ovl_stack_free(stack, ctr);
out_put_upper:
        if (origin_path) {
                dput(origin_path->dentry);
                kfree(origin_path);
        }
        dput(upperdentry);
        kfree(upperredirect);
out:
        kfree(d.redirect);
        revert_creds(old_cred);
        return ERR_PTR(err);
}

/*
 * Tell whether something other than a whiteout exists below @dentry.
 *
 * The parent's lower layers are searched in order for an entry with the same
 * name.  The search ends at the first layer where the name is found, or at
 * the first lookup error other than -ENOENT / -ENAMETOOLONG.
 */
bool ovl_lower_positive(struct dentry *dentry)
{
        struct ovl_entry *poe = OVL_E(dentry->d_parent);
        const struct qstr *name = &dentry->d_name;
        const struct cred *saved_cred;
        bool positive = false;
        unsigned int idx;

        /*
         * A negative dentry has a positive lower iff it is a whiteout.
         */
        if (!dentry->d_inode)
                return ovl_dentry_is_opaque(dentry);

        /* Without an upper dentry the lower must be positive */
        if (!ovl_dentry_upper(dentry))
                return true;

        saved_cred = ovl_override_creds(dentry->d_sb);
        /* Upper is positive, so the lower layers have to be consulted */
        for (idx = 0; idx < ovl_numlower(poe); idx++) {
                struct ovl_path *parentpath = &ovl_lowerstack(poe)[idx];
                struct dentry *this;
                struct path path;

                this = lookup_one_positive_unlocked(
                                mnt_idmap(parentpath->layer->mnt),
                                name->name, parentpath->dentry, name->len);
                if (IS_ERR(this)) {
                        long lookup_err = PTR_ERR(this);

                        /* Name not present in this layer: keep looking */
                        if (lookup_err == -ENOENT ||
                            lookup_err == -ENAMETOOLONG)
                                continue;

                        /*
                         * Any other failure: assume an entry exists that
                         * simply could not be accessed.
                         */
                        positive = true;
                        break;
                }

                /* First hit decides: positive unless it is a whiteout */
                path.dentry = this;
                path.mnt = parentpath->layer->mnt;
                positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path);
                dput(this);
                break;
        }
        revert_creds(saved_cred);

        return positive;
}

































































































































    2 








    1 





    1 


















    1 






























    5 



    3 



    1 


    1 





    7 
















    4 











    3 











    3 






    3 




    3 


























































































































































    3 

    3 























    2 



    2 




    1 






    1 










































































































    7 




































































































































































    2 









    2 
































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Integrity Measurement Architecture
 *
 * Copyright (C) 2005,2006,2007,2008 IBM Corporation
 *
 * Authors:
 * Reiner Sailer <sailer@watson.ibm.com>
 * Serge Hallyn <serue@us.ibm.com>
 * Kylene Hall <kylene@us.ibm.com>
 * Mimi Zohar <zohar@us.ibm.com>
 *
 * File: ima_main.c
 *        implements the IMA hooks: ima_bprm_check, ima_file_mmap,
 *        and ima_file_check.
 */

#include <linux/module.h>
#include <linux/file.h>
#include <linux/binfmts.h>
#include <linux/kernel_read_file.h>
#include <linux/mount.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include <linux/ima.h>
#include <linux/fs.h>
#include <linux/iversion.h>
#include <linux/evm.h>

#include "ima.h"

/*
 * Appraisal mode: starts as IMA_APPRAISE_ENFORCE when CONFIG_IMA_APPRAISE is
 * set, otherwise left zero-initialized.
 */
#ifdef CONFIG_IMA_APPRAISE
int ima_appraise = IMA_APPRAISE_ENFORCE;
#else
int ima_appraise;
#endif

/* Hash algorithm in use; SHA1 unless changed by hash_setup() */
int __ro_after_init ima_hash_algo = HASH_ALGO_SHA1;
/* Set once hash_setup() has accepted a value; later calls are ignored */
static int hash_setup_done;

/* Notifier block whose callback is ima_lsm_policy_change() */
static struct notifier_block ima_lsm_policy_notifier = {
        .notifier_call = ima_lsm_policy_change,
};

/*
 * hash_setup - handler for the "ima_hash=" boot parameter
 * @str: value given on the command line
 *
 * Selects ima_hash_algo.  Only the first accepted value takes effect: once
 * hash_setup_done is set, later calls return without changing anything.
 *
 * When the current template is IMA_TEMPLATE_IMA_NAME, only values starting
 * with "sha1" or "md5" are accepted.  For any other template the value is
 * matched against hash_algo_name[].  An invalid value is reported with
 * pr_err() and leaves both ima_hash_algo and hash_setup_done untouched.
 *
 * Always returns 1.
 */
static int __init hash_setup(char *str)
{
        struct ima_template_desc *template_desc = ima_template_desc_current();
        int i;

        if (hash_setup_done)
                return 1;

        if (strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) == 0) {
                /* Prefix comparison, kept as-is for compatibility */
                if (strncmp(str, "sha1", 4) == 0) {
                        ima_hash_algo = HASH_ALGO_SHA1;
                } else if (strncmp(str, "md5", 3) == 0) {
                        ima_hash_algo = HASH_ALGO_MD5;
                } else {
                        /* Terminate the message so it forms a complete log line */
                        pr_err("invalid hash algorithm \"%s\" for template \"%s\"\n",
                                str, IMA_TEMPLATE_IMA_NAME);
                        return 1;
                }
                goto out;
        }

        i = match_string(hash_algo_name, HASH_ALGO__LAST, str);
        if (i < 0) {
                /* Terminate the message so it forms a complete log line */
                pr_err("invalid hash algorithm \"%s\"\n", str);
                return 1;
        }

        ima_hash_algo = i;
out:
        hash_setup_done = 1;
        return 1;
}
__setup("ima_hash=", hash_setup);

enum hash_algo ima_get_current_hash_algo(void)
{
        return ima_hash_algo;
}

/*
 * Refuse to mmap a file for execute while it is already mapped writable.
 *
 * Only the MMAP_CHECK and MMAP_CHECK_REQPROT hooks are checked.  On a
 * violation an audit message is emitted and -ETXTBSY is returned; otherwise
 * the result is 0.
 */
static int mmap_violation_check(enum ima_hooks func, struct file *file,
                                char **pathbuf, const char **pathname,
                                char *filename)
{
        const int busy = -ETXTBSY;
        struct inode *inode;

        if ((func != MMAP_CHECK && func != MMAP_CHECK_REQPROT) ||
            !mapping_writably_mapped(file->f_mapping))
                return 0;

        inode = file_inode(file);

        /* The path may already have been resolved by ima_rdwr_violation */
        if (!*pathbuf)
                *pathname = ima_d_path(&file->f_path, pathbuf, filename);

        integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, *pathname,
                            "mmap_file", "mmapped_writers", busy, 0);

        return busy;
}

/*
 * ima_rdwr_violation_check
 *
 * Only invalidate the PCR for measured files:
 *        - Opening a file for write when already open for read,
 *          results in a time of measure, time of use (ToMToU) error.
 *        - Opening a file for read when already open for write,
 *          could result in a file measurement error.
 *
 */
static void ima_rdwr_violation_check(struct file *file,
                                     struct ima_iint_cache *iint,
                                     int must_measure,
                                     char **pathbuf,
                                     const char **pathname,
                                     char *filename)
{
        struct inode *inode = file_inode(file);
        fmode_t mode = file->f_mode;
        bool send_tomtou = false, send_writers = false;

        if (mode & FMODE_WRITE) {
                if (atomic_read(&inode->i_readcount) && IS_IMA(inode)) {
                        if (!iint)
                                iint = ima_iint_find(inode);
                        /* IMA_MEASURE is set from reader side */
                        if (iint && test_bit(IMA_MUST_MEASURE,
                                                &iint->atomic_flags))
                                send_tomtou = true;
                }
        } else {
                if (must_measure)
                        set_bit(IMA_MUST_MEASURE, &iint->atomic_flags);
                if (inode_is_open_for_write(inode) && must_measure)
                        send_writers = true;
        }

        if (!send_tomtou && !send_writers)
                return;

        *pathname = ima_d_path(&file->f_path, pathbuf, filename);

        if (send_tomtou)
                ima_add_violation(file, *pathname, iint,
                                  "invalid_pcr", "ToMToU");
        if (send_writers)
                ima_add_violation(file, *pathname, iint,
                                  "invalid_pcr", "open_writers");
}

/*
 * Called when a file is being released.  If @file was open for write and
 * the inode's write count is exactly 1, decide whether the cached integrity
 * state is stale; if so, reset it and, when an xattr update was pending,
 * write the xattr out.  All of this runs under iint->mutex.
 */
static void ima_check_last_writer(struct ima_iint_cache *iint,
                                  struct inode *inode, struct file *file)
{
        struct kstat stat;
        bool xattr_pending;
        bool stale;

        if (!(file->f_mode & FMODE_WRITE))
                return;

        mutex_lock(&iint->mutex);

        if (atomic_read(&inode->i_writecount) != 1)
                goto unlock;

        xattr_pending = test_and_clear_bit(IMA_UPDATE_XATTR,
                                           &iint->atomic_flags);

        /*
         * Stale when the file is new, the change cookie cannot be obtained,
         * or the cookie differs from the version recorded in the iint.
         */
        stale = (iint->flags & IMA_NEW_FILE) ||
                vfs_getattr_nosec(&file->f_path, &stat,
                                  STATX_CHANGE_COOKIE,
                                  AT_STATX_SYNC_AS_STAT) ||
                !(stat.result_mask & STATX_CHANGE_COOKIE) ||
                stat.change_cookie != iint->real_inode.version;
        if (!stale)
                goto unlock;

        iint->flags &= ~(IMA_DONE_MASK | IMA_NEW_FILE);
        iint->measured_pcrs = 0;
        if (xattr_pending)
                ima_update_xattr(iint, file);

unlock:
        mutex_unlock(&iint->mutex);
}

/**
 * ima_file_free - called on __fput()
 * @file: pointer to file structure being freed
 *
 * Flag files that changed, based on i_version.  Nothing is done when no IMA
 * policy flag is set, when the inode is not a regular file, or when the
 * inode has no integrity cache entry.
 */
static void ima_file_free(struct file *file)
{
        struct inode *inode = file_inode(file);
        struct ima_iint_cache *iint;

        if (ima_policy_flag && S_ISREG(inode->i_mode)) {
                iint = ima_iint_find(inode);
                if (iint)
                        ima_check_last_writer(iint, inode, file);
        }
}

/*
 * process_measurement - measure, appraise and/or audit a file per policy
 * @file: file being accessed
 * @cred: credentials used for the policy lookup
 * @secid: security id used for the policy lookup
 * @buf: in-memory copy of the file contents, or NULL
 * @size: number of bytes in @buf
 * @mask: requested access (MAY_* bits)
 * @func: IMA hook on whose behalf this is called
 *
 * Looks up the policy action for @file, performs the ToMToU/open-writers
 * violation check for the FILE_CHECK and MMAP_CHECK* hooks, and then
 * collects, stores, appraises and audits the measurement as the action
 * bitmask requires, skipping whatever the cached iint flags record as
 * already done.
 *
 * Return: 0, or -EACCES when the policy requires appraisal, an error was
 * encountered, and IMA-appraisal is in enforcing mode.
 */
static int process_measurement(struct file *file, const struct cred *cred,
                               u32 secid, char *buf, loff_t size, int mask,
                               enum ima_hooks func)
{
        struct inode *real_inode, *inode = file_inode(file);
        struct ima_iint_cache *iint = NULL;
        struct ima_template_desc *template_desc = NULL;
        struct inode *metadata_inode;
        char *pathbuf = NULL;
        char filename[NAME_MAX];
        const char *pathname = NULL;
        int rc = 0, action, must_appraise = 0;
        int pcr = CONFIG_IMA_MEASURE_PCR_IDX;
        struct evm_ima_xattr_data *xattr_value = NULL;
        struct modsig *modsig = NULL;
        int xattr_len = 0;
        bool violation_check;
        enum hash_algo hash_algo;
        unsigned int allowed_algos = 0;

        if (!ima_policy_flag || !S_ISREG(inode->i_mode))
                return 0;

        /* Return an IMA_MEASURE, IMA_APPRAISE, IMA_AUDIT action
         * bitmask based on the appraise/audit/measurement policy.
         * Included is the appraise submask.
         */
        action = ima_get_action(file_mnt_idmap(file), inode, cred, secid,
                                mask, func, &pcr, &template_desc, NULL,
                                &allowed_algos);
        violation_check = ((func == FILE_CHECK || func == MMAP_CHECK ||
                            func == MMAP_CHECK_REQPROT) &&
                           (ima_policy_flag & IMA_MEASURE));
        if (!action && !violation_check)
                return 0;

        must_appraise = action & IMA_APPRAISE;

        /*  Is the appraise rule hook specific?  */
        if (action & IMA_FILE_APPRAISE)
                func = FILE_CHECK;

        inode_lock(inode);

        if (action) {
                iint = ima_inode_get(inode);
                if (!iint)
                        rc = -ENOMEM;
        }

        if (!rc && violation_check)
                ima_rdwr_violation_check(file, iint, action & IMA_MEASURE,
                                         &pathbuf, &pathname, filename);

        inode_unlock(inode);

        if (rc)
                goto out;
        if (!action)
                goto out;

        mutex_lock(&iint->mutex);

        if (test_and_clear_bit(IMA_CHANGE_ATTR, &iint->atomic_flags))
                /* reset appraisal flags if ima_inode_post_setattr was called */
                iint->flags &= ~(IMA_APPRAISE | IMA_APPRAISED |
                                 IMA_APPRAISE_SUBMASK | IMA_APPRAISED_SUBMASK |
                                 IMA_NONACTION_FLAGS);

        /*
         * Re-evaluate the file if either the xattr has changed or the
         * kernel has no way of detecting file change on the filesystem.
         * (Limited to privileged mounted filesystems.)
         */
        if (test_and_clear_bit(IMA_CHANGE_XATTR, &iint->atomic_flags) ||
            ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) &&
             !(inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) &&
             !(action & IMA_FAIL_UNVERIFIABLE_SIGS))) {
                iint->flags &= ~IMA_DONE_MASK;
                iint->measured_pcrs = 0;
        }

        /*
         * On stacked filesystems, detect and re-evaluate file data and
         * metadata changes.
         */
        real_inode = d_real_inode(file_dentry(file));
        if (real_inode != inode &&
            (action & IMA_DO_MASK) && (iint->flags & IMA_DONE_MASK)) {
                if (!IS_I_VERSION(real_inode) ||
                    integrity_inode_attrs_changed(&iint->real_inode,
                                                  real_inode)) {
                        iint->flags &= ~IMA_DONE_MASK;
                        iint->measured_pcrs = 0;
                }

                /*
                 * Reset the EVM status when metadata changed.
                 */
                metadata_inode = d_inode(d_real(file_dentry(file),
                                         D_REAL_METADATA));
                if (evm_metadata_changed(inode, metadata_inode))
                        iint->flags &= ~(IMA_APPRAISED |
                                         IMA_APPRAISED_SUBMASK);
        }

        /* Determine if already appraised/measured based on bitmask
         * (IMA_MEASURE, IMA_MEASURED, IMA_XXXX_APPRAISE, IMA_XXXX_APPRAISED,
         *  IMA_AUDIT, IMA_AUDITED)
         */
        iint->flags |= action;
        action &= IMA_DO_MASK;
        action &= ~((iint->flags & (IMA_DONE_MASK ^ IMA_MEASURED)) >> 1);

        /* If target pcr is already measured, unset IMA_MEASURE action */
        if ((action & IMA_MEASURE) && (iint->measured_pcrs & (0x1 << pcr)))
                action ^= IMA_MEASURE;

        /* HASH sets the digital signature and update flags, nothing else */
        if ((action & IMA_HASH) &&
            !(test_bit(IMA_DIGSIG, &iint->atomic_flags))) {
                xattr_len = ima_read_xattr(file_dentry(file),
                                           &xattr_value, xattr_len);
                if ((xattr_value && xattr_len > 2) &&
                    (xattr_value->type == EVM_IMA_XATTR_DIGSIG))
                        set_bit(IMA_DIGSIG, &iint->atomic_flags);
                iint->flags |= IMA_HASHED;
                action ^= IMA_HASH;
                set_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);
        }

        /* Nothing to do, just return existing appraised status */
        if (!action) {
                if (must_appraise) {
                        rc = mmap_violation_check(func, file, &pathbuf,
                                                  &pathname, filename);
                        if (!rc)
                                rc = ima_get_cache_status(iint, func);
                }
                goto out_locked;
        }

        if ((action & IMA_APPRAISE_SUBMASK) ||
            strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0) {
                /* read 'security.ima' */
                xattr_len = ima_read_xattr(file_dentry(file),
                                           &xattr_value, xattr_len);

                /*
                 * Read the appended modsig if allowed by the policy, and allow
                 * an additional measurement list entry, if needed, based on the
                 * template format and whether the file was already measured.
                 */
                if (iint->flags & IMA_MODSIG_ALLOWED) {
                        rc = ima_read_modsig(func, buf, size, &modsig);

                        if (!rc && ima_template_has_modsig(template_desc) &&
                            iint->flags & IMA_MEASURED)
                                action |= IMA_MEASURE;
                }
        }

        hash_algo = ima_get_hash_algo(xattr_value, xattr_len);

        rc = ima_collect_measurement(iint, file, buf, size, hash_algo, modsig);
        /* -EBADF and -EINVAL do not stop the store/audit steps below */
        if (rc != 0 && rc != -EBADF && rc != -EINVAL)
                goto out_locked;

        if (!pathbuf)        /* ima_rdwr_violation possibly pre-fetched */
                pathname = ima_d_path(&file->f_path, &pathbuf, filename);

        if (action & IMA_MEASURE)
                ima_store_measurement(iint, file, pathname,
                                      xattr_value, xattr_len, modsig, pcr,
                                      template_desc);
        if (rc == 0 && (action & IMA_APPRAISE_SUBMASK)) {
                rc = ima_check_blacklist(iint, modsig, pcr);
                if (rc != -EPERM) {
                        inode_lock(inode);
                        rc = ima_appraise_measurement(func, iint, file,
                                                      pathname, xattr_value,
                                                      xattr_len, modsig);
                        inode_unlock(inode);
                }
                if (!rc)
                        rc = mmap_violation_check(func, file, &pathbuf,
                                                  &pathname, filename);
        }
        if (action & IMA_AUDIT)
                ima_audit_measurement(iint, pathname);

        if ((file->f_flags & O_DIRECT) && (iint->flags & IMA_PERMIT_DIRECTIO))
                rc = 0;

        /* Ensure the digest was generated using an allowed algorithm */
        if (rc == 0 && must_appraise && allowed_algos != 0 &&
            (allowed_algos & (1U << hash_algo)) == 0) {
                rc = -EACCES;

                integrity_audit_msg(AUDIT_INTEGRITY_DATA, file_inode(file),
                                    pathname, "collect_data",
                                    "denied-hash-algorithm", rc, 0);
        }
out_locked:
        if ((mask & MAY_WRITE) && test_bit(IMA_DIGSIG, &iint->atomic_flags) &&
             !(iint->flags & IMA_NEW_FILE))
                rc = -EACCES;
        mutex_unlock(&iint->mutex);
        kfree(xattr_value);
        ima_free_modsig(modsig);
out:
        if (pathbuf)
                __putname(pathbuf);
        if (must_appraise) {
                if (rc && (ima_appraise & IMA_APPRAISE_ENFORCE))
                        return -EACCES;
                /*
                 * iint is NULL here when ima_inode_get() failed above
                 * (rc == -ENOMEM); there is no cache entry to flag then.
                 */
                if (iint && (file->f_mode & FMODE_WRITE))
                        set_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);
        }
        return 0;
}

/**
 * ima_file_mmap - based on policy, collect/store measurement.
 * @file: pointer to the file to be measured (May be NULL)
 * @reqprot: protection requested by the application
 * @prot: protection that will be applied by the kernel
 * @flags: operational flags
 *
 * A file mapped with PROT_EXEC set in @reqprot is processed under the
 * MMAP_CHECK_REQPROT hook; if that succeeds and PROT_EXEC is set in @prot,
 * it is processed again under the MMAP_CHECK hook.  A NULL @file is
 * accepted and ignored.
 *
 * On success return 0.  On integrity appraisal error, assuming the file
 * is in policy and IMA-appraisal is in enforcing mode, return -EACCES.
 */
static int ima_file_mmap(struct file *file, unsigned long reqprot,
                         unsigned long prot, unsigned long flags)
{
        int ret = 0;
        u32 secid;

        if (!file)
                return 0;

        security_current_getsecid_subj(&secid);

        if (reqprot & PROT_EXEC)
                ret = process_measurement(file, current_cred(), secid, NULL,
                                          0, MAY_EXEC, MMAP_CHECK_REQPROT);

        /* The second hook only runs if the first one did not fail. */
        if (!ret && (prot & PROT_EXEC))
                ret = process_measurement(file, current_cred(), secid, NULL,
                                          0, MAY_EXEC, MMAP_CHECK);

        return ret;
}

/**
 * ima_file_mprotect - based on policy, limit mprotect change
 * @vma: vm_area_struct protection is set to
 * @reqprot: protection requested by the application
 * @prot: protection that will be applied by the kernel
 *
 * Files can be mmap'ed read/write and later changed to execute to circumvent
 * IMA's mmap appraisal policy rules.  Due to locking issues (mmap semaphore
 * would be taken before i_mutex), files can not be measured or appraised at
 * this point.  Eliminate this integrity gap by denying the mprotect
 * PROT_EXEC change, if an mmap appraise policy rule exists.
 *
 * An audit record is emitted whenever the file matches a measure or
 * appraise rule for either mmap hook, even if the change is permitted.
 *
 * On mprotect change success, return 0.  On failure, return -EPERM.
 */
static int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                             unsigned long prot)
{
        struct ima_template_desc *template = NULL;
        const char *pathname = NULL;
        char filename[NAME_MAX];
        char *pathbuf = NULL;
        struct inode *inode;
        struct file *file;
        int action;
        int result;
        u32 secid;
        int pcr;

        /* Is mprotect making an mmap'ed file executable? */
        if (!(ima_policy_flag & IMA_APPRAISE))
                return 0;
        if (!vma->vm_file || !(prot & PROT_EXEC) || (vma->vm_flags & VM_EXEC))
                return 0;

        file = vma->vm_file;
        security_current_getsecid_subj(&secid);
        inode = file_inode(file);

        /* Combine the policy decisions of both mmap hooks. */
        action = ima_get_action(file_mnt_idmap(file), inode,
                                current_cred(), secid, MAY_EXEC, MMAP_CHECK,
                                &pcr, &template, NULL, NULL);
        action |= ima_get_action(file_mnt_idmap(file), inode,
                                 current_cred(), secid, MAY_EXEC,
                                 MMAP_CHECK_REQPROT, &pcr, &template, NULL,
                                 NULL);

        /* Is the mmap'ed file in policy? */
        if (!(action & (IMA_MEASURE | IMA_APPRAISE_SUBMASK)))
                return 0;

        result = (action & IMA_APPRAISE_SUBMASK) ? -EPERM : 0;

        pathname = ima_d_path(&file->f_path, &pathbuf, filename);
        integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, pathname,
                            "collect_data", "failed-mprotect", result, 0);
        if (pathbuf)
                __putname(pathbuf);

        return result;
}

/**
 * ima_bprm_check - based on policy, collect/store measurement.
 * @bprm: contains the linux_binprm structure
 *
 * The OS protects against an executable file, already open for write,
 * from being executed in deny_write_access() and an executable file,
 * already open for execute, from being modified in get_write_access().
 * So we can be certain that what we verify and measure here is actually
 * what is being executed.
 *
 * The file is processed twice: under BPRM_CHECK with the current task's
 * credentials, then, if that succeeded, under CREDS_CHECK with bprm->cred.
 *
 * On success return 0.  On integrity appraisal error, assuming the file
 * is in policy and IMA-appraisal is in enforcing mode, return -EACCES.
 */
static int ima_bprm_check(struct linux_binprm *bprm)
{
        u32 secid;
        int ret;

        security_current_getsecid_subj(&secid);
        ret = process_measurement(bprm->file, current_cred(), secid, NULL, 0,
                                  MAY_EXEC, BPRM_CHECK);
        if (!ret) {
                security_cred_getsecid(bprm->cred, &secid);
                ret = process_measurement(bprm->file, bprm->cred, secid, NULL,
                                          0, MAY_EXEC, CREDS_CHECK);
        }

        return ret;
}

/**
 * ima_file_check - based on policy, collect/store measurement.
 * @file: pointer to the file to be measured
 * @mask: contains MAY_READ, MAY_WRITE, MAY_EXEC or MAY_APPEND
 *
 * Process @file under the FILE_CHECK hook with the current task's
 * credentials.  Bits of @mask other than the four listed above are
 * dropped before the policy is consulted.
 *
 * On success return 0.  On integrity appraisal error, assuming the file
 * is in policy and IMA-appraisal is in enforcing mode, return -EACCES.
 */
static int ima_file_check(struct file *file, int mask)
{
        const int access = mask & (MAY_READ | MAY_WRITE | MAY_EXEC |
                                   MAY_APPEND);
        u32 secid;

        security_current_getsecid_subj(&secid);

        return process_measurement(file, current_cred(), secid, NULL, 0,
                                   access, FILE_CHECK);
}

/*
 * __ima_inode_hash - copy out an inode's file hash, computing it if needed
 * @inode: inode whose integrity cache entry is looked up
 * @file: open file used to compute the hash when no collected hash is
 *        cached; may be NULL, in which case only the cache is consulted
 * @buf: destination for the digest, or NULL to only query the algorithm
 * @buf_size: capacity of @buf; the copy is truncated to this many bytes
 *
 * Return: the hash algorithm of the digest on success, -EOPNOTSUPP when no
 * hash is available or it could not be computed.
 */
static int __ima_inode_hash(struct inode *inode, struct file *file, char *buf,
                            size_t buf_size)
{
        struct ima_iint_cache *iint = NULL, tmp_iint;
        int rc, hash_algo;

        /* The integrity cache is only consulted while a policy is active. */
        if (ima_policy_flag) {
                iint = ima_iint_find(inode);
                if (iint)
                        mutex_lock(&iint->mutex);
        }

        /*
         * No cached entry, or one without a collected hash: if a file was
         * supplied, compute the hash into a temporary on-stack entry.
         */
        if ((!iint || !(iint->flags & IMA_COLLECTED)) && file) {
                /* Drop the cached entry's lock; tmp_iint is used from here. */
                if (iint)
                        mutex_unlock(&iint->mutex);

                memset(&tmp_iint, 0, sizeof(tmp_iint));
                mutex_init(&tmp_iint.mutex);

                rc = ima_collect_measurement(&tmp_iint, file, NULL, 0,
                                             ima_hash_algo, NULL);
                if (rc < 0) {
                        /* ima_hash could be allocated in case of failure. */
                        if (rc != -ENOMEM)
                                kfree(tmp_iint.ima_hash);

                        return -EOPNOTSUPP;
                }

                iint = &tmp_iint;
                mutex_lock(&iint->mutex);
        }

        /* Neither a cached entry nor a file to compute from. */
        if (!iint)
                return -EOPNOTSUPP;

        /*
         * ima_file_hash can be called when ima_collect_measurement has still
         * not been called, we might not always have a hash.
         */
        if (!iint->ima_hash || !(iint->flags & IMA_COLLECTED)) {
                mutex_unlock(&iint->mutex);
                return -EOPNOTSUPP;
        }

        if (buf) {
                size_t copied_size;

                /* Never write more than the caller's buffer can hold. */
                copied_size = min_t(size_t, iint->ima_hash->length, buf_size);
                memcpy(buf, iint->ima_hash->digest, copied_size);
        }
        hash_algo = iint->ima_hash->algo;
        mutex_unlock(&iint->mutex);

        /* The temporary entry's hash was allocated for this call only. */
        if (iint == &tmp_iint)
                kfree(iint->ima_hash);

        return hash_algo;
}

/**
 * ima_file_hash - return a measurement of the file
 * @file: pointer to the file
 * @buf: buffer in which to store the hash
 * @buf_size: length of the buffer
 *
 * On success, return the hash algorithm (as defined in the enum hash_algo).
 * If buf is not NULL, this function also outputs the hash into buf.
 * If the hash is larger than buf_size, then only buf_size bytes will be copied.
 * It generally just makes sense to pass a buffer capable of holding the largest
 * possible hash: IMA_MAX_DIGEST_SIZE.
 * The file hash returned is based on the entire file, including the appended
 * signature.
 *
 * If the measurement cannot be performed, return -EOPNOTSUPP.
 * If @file is NULL, return -EINVAL.
 */
int ima_file_hash(struct file *file, char *buf, size_t buf_size)
{
        struct inode *inode;

        if (!file)
                return -EINVAL;

        inode = file_inode(file);

        return __ima_inode_hash(inode, file, buf, buf_size);
}
EXPORT_SYMBOL_GPL(ima_file_hash);

/**
 * ima_inode_hash - return the stored measurement if the inode has been hashed
 * and is in the iint cache.
 * @inode: pointer to the inode
 * @buf: buffer in which to store the hash
 * @buf_size: length of the buffer
 *
 * On success, return the hash algorithm (as defined in the enum hash_algo).
 * If buf is not NULL, this function also outputs the hash into buf.
 * If the hash is larger than buf_size, then only buf_size bytes will be copied.
 * It generally just makes sense to pass a buffer capable of holding the largest
 * possible hash: IMA_MAX_DIGEST_SIZE.
 * The hash returned is based on the entire contents, including the appended
 * signature.
 *
 * No file is passed down, so a hash is never computed here; only an
 * already cached one is returned.
 *
 * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP.
 * If @inode is NULL, return -EINVAL.
 */
int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size)
{
        return inode ? __ima_inode_hash(inode, NULL, buf, buf_size) : -EINVAL;
}
EXPORT_SYMBOL_GPL(ima_inode_hash);

/**
 * ima_post_create_tmpfile - mark newly created tmpfile as new
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode of the newly created tmpfile
 *
 * No measuring, appraising or auditing of newly created tmpfiles is needed.
 * Skip calling process_measurement(), but indicate which newly created
 * tmpfiles are in policy: request an xattr update and record the file
 * status as INTEGRITY_PASS.
 */
static void ima_post_create_tmpfile(struct mnt_idmap *idmap,
                                    struct inode *inode)

{
        struct ima_iint_cache *iint;

        if (!ima_policy_flag || !S_ISREG(inode->i_mode))
                return;

        /* Only files covered by an appraise rule are of interest. */
        if (!ima_must_appraise(idmap, inode, MAY_ACCESS, FILE_CHECK))
                return;

        /* Nothing to do if we can't allocate memory */
        iint = ima_inode_get(inode);
        if (iint) {
                /* needed for writing the security xattrs */
                set_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);
                iint->ima_file_status = INTEGRITY_PASS;
        }
}

/**
 * ima_post_path_mknod - mark as a new inode
 * @idmap: idmap of the mount the inode was found from
 * @dentry: newly created dentry
 *
 * Mark files created via the mknodat syscall as new, so that the
 * file data can be written later.  Only regular files covered by an
 * appraise rule are marked.
 */
static void ima_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;
        struct ima_iint_cache *iint;

        if (!ima_policy_flag || !S_ISREG(inode->i_mode))
                return;

        if (!ima_must_appraise(idmap, inode, MAY_ACCESS, FILE_CHECK))
                return;

        /* Nothing to do if we can't allocate memory */
        iint = ima_inode_get(inode);
        if (iint) {
                /* needed for re-opening empty files */
                iint->flags |= IMA_NEW_FILE;
        }
}

/**
 * ima_read_file - pre-measure/appraise hook decision based on policy
 * @file: pointer to the file to be measured/appraised/audit
 * @read_id: caller identifier
 * @contents: whether a subsequent call will be made to ima_post_read_file()
 *
 * Permit reading a file based on policy. The policy rules are written
 * in terms of the policy identifier.  Appraising the integrity of
 * a file requires a file descriptor.
 *
 * For permission return 0, otherwise return -EACCES.
 */
static int ima_read_file(struct file *file, enum kernel_read_file_id read_id,
                         bool contents)
{
        enum ima_hooks func;
        u32 secid;

        /*
         * Do devices using pre-allocated memory run the risk of the
         * firmware being accessible to the device prior to the completion
         * of IMA's signature verification any more than when using two
         * buffers? It may be desirable to include the buffer address
         * in this API and walk all the dma_map_single() mappings to check.
         */

        /*
         * There will be a call made to ima_post_read_file() with
         * a filled buffer, so we don't need to perform an extra
         * read early here.
         */
        if (contents)
                return 0;

        /*
         * Read entire file for all partial reads.  Read ids without an
         * entry in read_idmap are handled as FILE_CHECK.
         */
        func = read_idmap[read_id];
        if (!func)
                func = FILE_CHECK;

        security_current_getsecid_subj(&secid);

        return process_measurement(file, current_cred(), secid, NULL,
                                   0, MAY_READ, func);
}

/*
 * Maps a kernel_read_file_id to the IMA hook used for policy lookup.
 * Read ids without an entry are zero, which the read hooks in this file
 * replace with FILE_CHECK.
 */
const int read_idmap[READING_MAX_ID] = {
        [READING_FIRMWARE]              = FIRMWARE_CHECK,
        [READING_MODULE]                = MODULE_CHECK,
        [READING_KEXEC_IMAGE]           = KEXEC_KERNEL_CHECK,
        [READING_KEXEC_INITRAMFS]       = KEXEC_INITRAMFS_CHECK,
        [READING_POLICY]                = POLICY_CHECK,
};

/**
 * ima_post_read_file - in memory collect/appraise/audit measurement
 * @file: pointer to the file to be measured/appraised/audit
 * @buf: pointer to in memory file contents
 * @size: size of in memory file contents
 * @read_id: caller identifier
 *
 * Measure/appraise/audit in memory file based on policy.  Policy rules
 * are written in terms of a policy identifier.
 *
 * On success return 0.  On integrity appraisal error, assuming the file
 * is in policy and IMA-appraisal is in enforcing mode, return -EACCES.
 */
static int ima_post_read_file(struct file *file, char *buf, loff_t size,
                              enum kernel_read_file_id read_id)
{
        enum ima_hooks func;
        u32 secid;

        /* permit signed certs */
        if (!file && read_id == READING_X509_CERTIFICATE)
                return 0;

        /* should never happen; refused only in enforcing mode */
        if (!file || !buf || size == 0)
                return (ima_appraise & IMA_APPRAISE_ENFORCE) ? -EACCES : 0;

        /* Read ids without an entry in read_idmap use FILE_CHECK. */
        func = read_idmap[read_id];
        if (!func)
                func = FILE_CHECK;

        security_current_getsecid_subj(&secid);

        return process_measurement(file, current_cred(), secid, buf, size,
                                   MAY_READ, func);
}

/**
 * ima_load_data - appraise decision based on policy
 * @id: kernel load data caller identifier
 * @contents: whether the full contents will be available in a later
 *              call to ima_post_load_data().
 *
 * Callers of this LSM hook can not measure, appraise, or audit the
 * data provided by userspace.  Enforce policy rules requiring a file
 * signature (eg. kexec'ed kernel image).
 *
 * For permission return 0, otherwise return -EACCES.
 */
static int ima_load_data(enum kernel_load_data_id id, bool contents)
{
        const bool enforcing =
                (ima_appraise & IMA_APPRAISE_ENFORCE) == IMA_APPRAISE_ENFORCE;
        bool sig_enforce;

        if (id == LOADING_KEXEC_IMAGE) {
                /* Refused under secure boot regardless of enforcing mode. */
                if (IS_ENABLED(CONFIG_KEXEC_SIG) &&
                    arch_ima_get_secureboot()) {
                        pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n");
                        return -EACCES;
                }

                if (enforcing && (ima_appraise & IMA_APPRAISE_KEXEC)) {
                        pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n");
                        return -EACCES;        /* INTEGRITY_UNKNOWN */
                }
        } else if (id == LOADING_FIRMWARE) {
                if (enforcing && (ima_appraise & IMA_APPRAISE_FIRMWARE) &&
                    !contents) {
                        pr_err("Prevent firmware sysfs fallback loading.\n");
                        return -EACCES;        /* INTEGRITY_UNKNOWN */
                }
        } else if (id == LOADING_MODULE) {
                sig_enforce = is_module_sig_enforced();

                if (enforcing && !sig_enforce &&
                    (ima_appraise & IMA_APPRAISE_MODULES)) {
                        pr_err("impossible to appraise a module without a file descriptor. sig_enforce kernel parameter might help\n");
                        return -EACCES;        /* INTEGRITY_UNKNOWN */
                }
        }

        /* Every other load id is permitted. */
        return 0;
}

/**
 * ima_post_load_data - appraise decision based on policy
 * @buf: pointer to in memory file contents
 * @size: size of in memory file contents
 * @load_id: kernel load data caller identifier
 * @description: @load_id-specific description of contents
 *
 * Firmware loads are refused when firmware appraisal is enabled in
 * enforcing mode.  For module loads the buffer is measured as critical
 * data; the result of that measurement does not affect the return value.
 *
 * Return 0, or -EACCES when a firmware load is refused.
 */
static int ima_post_load_data(char *buf, loff_t size,
                              enum kernel_load_data_id load_id,
                              char *description)
{
        switch (load_id) {
        case LOADING_FIRMWARE:
                if ((ima_appraise & IMA_APPRAISE_FIRMWARE) &&
                    (ima_appraise & IMA_APPRAISE_ENFORCE)) {
                        pr_err("Prevent firmware loading_store.\n");
                        return -EACCES; /* INTEGRITY_UNKNOWN */
                }
                break;
        case LOADING_MODULE:
                /*
                 * Measure the init_module syscall buffer containing the
                 * ELF image.
                 */
                ima_measure_critical_data("modules", "init_module",
                                          buf, size, true, NULL, 0);
                break;
        default:
                break;
        }

        return 0;
}

/**
 * process_buffer_measurement - Measure the buffer or the buffer data hash
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode associated with the object being measured (NULL for KEY_CHECK)
 * @buf: pointer to the buffer that needs to be added to the log.
 * @size: size of buffer(in bytes).
 * @eventname: event name to be used for the buffer entry.
 * @func: IMA hook
 * @pcr: pcr to extend the measurement
 * @func_data: func specific data, may be NULL
 * @buf_hash: measure buffer data hash
 * @digest: buffer digest will be written to
 * @digest_len: size of @digest in bytes; when @digest is not NULL it must
 *              be at least the digest size of ima_hash_algo
 *
 * Based on policy, either the buffer data or buffer data hash is measured
 *
 * Failures that reach the common exit path are reported through an
 * integrity audit message; the early -EINVAL (short @digest) and -ENOENT
 * returns are not.
 *
 * Return: 0 if the buffer has been successfully measured, 1 if the digest
 * has been written to the passed location but not added to a measurement entry,
 * a negative value otherwise.
 */
int process_buffer_measurement(struct mnt_idmap *idmap,
                               struct inode *inode, const void *buf, int size,
                               const char *eventname, enum ima_hooks func,
                               int pcr, const char *func_data,
                               bool buf_hash, u8 *digest, size_t digest_len)
{
        int ret = 0;
        const char *audit_cause = "ENOMEM";
        struct ima_template_entry *entry = NULL;
        struct ima_iint_cache iint = {};
        struct ima_event_data event_data = {.iint = &iint,
                                            .filename = eventname,
                                            .buf = buf,
                                            .buf_len = size};
        struct ima_template_desc *template;
        struct ima_max_digest_data hash;
        struct ima_digest_data *hash_hdr = container_of(&hash.hdr,
                                                struct ima_digest_data, hdr);
        char digest_hash[IMA_MAX_DIGEST_SIZE];
        int digest_hash_len = hash_digest_size[ima_hash_algo];
        int violation = 0;
        int action = 0;
        u32 secid;

        /* The caller's digest buffer must be able to hold a full digest. */
        if (digest && digest_len < digest_hash_len)
                return -EINVAL;

        /* Without a policy there is only work to do if a digest is wanted. */
        if (!ima_policy_flag && !digest)
                return -ENOENT;

        template = ima_template_desc_buf();
        if (!template) {
                ret = -EINVAL;
                audit_cause = "ima_template_desc_buf";
                goto out;
        }

        /*
         * Both LSM hooks and auxiliary based buffer measurements are
         * based on policy.  To avoid code duplication, differentiate
         * between the LSM hooks and auxiliary buffer measurements,
         * retrieving the policy rule information only for the LSM hook
         * buffer measurements.
         */
        if (func) {
                security_current_getsecid_subj(&secid);
                action = ima_get_action(idmap, inode, current_cred(),
                                        secid, 0, func, &pcr, &template,
                                        func_data, NULL);
                if (!(action & IMA_MEASURE) && !digest)
                        return -ENOENT;
        }

        /* A pcr of 0 selects the configured default measurement PCR. */
        if (!pcr)
                pcr = CONFIG_IMA_MEASURE_PCR_IDX;

        /* Point the on-stack iint at the on-stack digest storage. */
        iint.ima_hash = hash_hdr;
        iint.ima_hash->algo = ima_hash_algo;
        iint.ima_hash->length = hash_digest_size[ima_hash_algo];

        ret = ima_calc_buffer_hash(buf, size, iint.ima_hash);
        if (ret < 0) {
                audit_cause = "hashing_error";
                goto out;
        }

        if (buf_hash) {
                /*
                 * Log the digest of the buffer instead of the buffer: keep
                 * a copy of the first digest as the event data and hash it
                 * once more for the entry's own digest.
                 */
                memcpy(digest_hash, hash_hdr->digest, digest_hash_len);

                ret = ima_calc_buffer_hash(digest_hash, digest_hash_len,
                                           iint.ima_hash);
                if (ret < 0) {
                        audit_cause = "hashing_error";
                        goto out;
                }

                event_data.buf = digest_hash;
                event_data.buf_len = digest_hash_len;
        }

        if (digest)
                memcpy(digest, iint.ima_hash->digest, digest_hash_len);

        /* Digest-only request: nothing is added to the measurement list. */
        if (!ima_policy_flag || (func && !(action & IMA_MEASURE)))
                return 1;

        ret = ima_alloc_init_template(&event_data, &entry, template);
        if (ret < 0) {
                audit_cause = "alloc_entry";
                goto out;
        }

        ret = ima_store_template(entry, violation, NULL, event_data.buf, pcr);
        if (ret < 0) {
                audit_cause = "store_entry";
                ima_free_template_entry(entry);
        }

out:
        if (ret < 0)
                integrity_audit_message(AUDIT_INTEGRITY_PCR, NULL, eventname,
                                        func_measure_str(func),
                                        audit_cause, ret, 0, ret);

        return ret;
}

/**
 * ima_kexec_cmdline - measure kexec cmdline boot args
 * @kernel_fd: file descriptor of the kexec kernel being loaded
 * @buf: pointer to buffer
 * @size: size of buffer
 *
 * Buffers can only be measured, not appraised.  Nothing is measured when
 * @buf is NULL, @size is zero, or @kernel_fd does not resolve to a file.
 */
void ima_kexec_cmdline(int kernel_fd, const void *buf, int size)
{
        struct fd f;

        if (buf && size) {
                f = fdget(kernel_fd);
                if (f.file) {
                        process_buffer_measurement(file_mnt_idmap(f.file),
                                                   file_inode(f.file),
                                                   buf, size, "kexec-cmdline",
                                                   KEXEC_CMDLINE, 0,
                                                   NULL, false, NULL, 0);
                        fdput(f);
                }
        }
}

/**
 * ima_measure_critical_data - measure kernel integrity critical data
 * @event_label: unique event label for grouping and limiting critical data
 * @event_name: event name for the record in the IMA measurement list
 * @buf: pointer to buffer data
 * @buf_len: length of buffer data (in bytes)
 * @hash: measure buffer data hash
 * @digest: location the buffer digest will be written to
 * @digest_len: length of the @digest buffer
 *
 * Measure data critical to the integrity of the kernel into the IMA log
 * and extend the pcr.  Examples of critical data could be various data
 * structures, policies, and states stored in kernel memory that can
 * impact the integrity of the system.
 *
 * Return: 0 if the buffer has been successfully measured, 1 if the digest
 * has been written to the passed location but not added to a measurement entry,
 * a negative value otherwise.  In particular, -ENOPARAM is returned when
 * @event_name, @event_label or @buf is NULL, or when @buf_len is zero.
 */
int ima_measure_critical_data(const char *event_label,
                              const char *event_name,
                              const void *buf, size_t buf_len,
                              bool hash, u8 *digest, size_t digest_len)
{
        int rc;

        /* Every identifying string and a non-empty buffer are mandatory. */
        if (!(event_name && event_label && buf && buf_len))
                return -ENOPARAM;

        /* Critical data is not tied to a file: no inode, nop idmap. */
        rc = process_buffer_measurement(&nop_mnt_idmap, NULL, buf, buf_len,
                                        event_name, CRITICAL_DATA, 0,
                                        event_label, hash, digest,
                                        digest_len);
        return rc;
}
EXPORT_SYMBOL_GPL(ima_measure_critical_data);

#ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS

/**
 * ima_kernel_module_request - Prevent crypto-pkcs1pad(rsa,*) requests
 * @kmod_name: kernel module name
 *
 * Avoid a verification loop where verifying the signature of the modprobe
 * binary requires executing modprobe itself. Since the modprobe iint->mutex
 * is already held when the signature verification is performed, a deadlock
 * occurs as soon as modprobe is executed within the critical region, since
 * the same lock cannot be taken again.
 *
 * This happens when public_key_verify_signature(), in the case of the RSA
 * algorithm, uses alg_name to store internal information in order to
 * construct an algorithm on the fly, while crypto_larval_lookup() tries to
 * use alg_name in order to load a kernel module with the same name.
 *
 * Since we don't have any real "crypto-pkcs1pad(rsa,*)" kernel modules,
 * we are safe to fail such module requests from crypto_larval_lookup(), and
 * avoid the verification loop.
 *
 * Return: Zero if it is safe to load the kernel module, -EINVAL otherwise.
 */
static int ima_kernel_module_request(char *kmod_name)
{
        /* Requests whose module name starts with this prefix are refused. */
        static const char blocked_prefix[] = "crypto-pkcs1pad(rsa,";

        /*
         * Derive the compare length from the literal itself so the two can
         * never drift apart; sizeof() counts the trailing NUL, hence the -1.
         */
        if (strncmp(kmod_name, blocked_prefix, sizeof(blocked_prefix) - 1) == 0)
                return -EINVAL;

        return 0;
}

#endif /* CONFIG_INTEGRITY_ASYMMETRIC_KEYS */

/*
 * init_ima - bring up IMA and hook it into LSM policy notifications
 *
 * Parses the appraise command line, initialises the template list, selects
 * the configured default hash and calls ima_init().  If that fails while
 * ima_hash_algo names something other than CONFIG_IMA_DEFAULT_HASH, the hash
 * setup is redone with the default and ima_init() is tried once more.
 *
 * Returns 0 on success, otherwise the error from ima_init() or from
 * register_blocking_lsm_notifier().
 */
static int __init init_ima(void)
{
        int rc;

        ima_appraise_parse_cmdline();
        ima_init_template_list();
        hash_setup(CONFIG_IMA_DEFAULT_HASH);

        rc = ima_init();
        if (rc && strcmp(hash_algo_name[ima_hash_algo],
                         CONFIG_IMA_DEFAULT_HASH) != 0) {
                pr_info("Allocating %s failed, going to use default hash algorithm %s\n",
                        hash_algo_name[ima_hash_algo], CONFIG_IMA_DEFAULT_HASH);
                /*
                 * NOTE(review): clearing hash_setup_done presumably lets
                 * hash_setup() take effect a second time - confirm against
                 * hash_setup().
                 */
                hash_setup_done = 0;
                hash_setup(CONFIG_IMA_DEFAULT_HASH);
                rc = ima_init();
        }

        if (rc)
                return rc;

        rc = register_blocking_lsm_notifier(&ima_lsm_policy_notifier);
        if (rc) {
                pr_warn("Couldn't register LSM notifier, error %d\n", rc);
                return rc;
        }

        /* Policy flags are only refreshed once the notifier is in place. */
        ima_update_policy_flags();
        return 0;
}

/*
 * LSM hook table for IMA: each LSM_HOOK_INIT() entry names a hook and the
 * IMA function attached to it.  init_ima_lsm() hands the whole table to
 * security_add_hooks().
 */
static struct security_hook_list ima_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(bprm_check_security, ima_bprm_check),
        LSM_HOOK_INIT(file_post_open, ima_file_check),
        LSM_HOOK_INIT(inode_post_create_tmpfile, ima_post_create_tmpfile),
        LSM_HOOK_INIT(file_release, ima_file_free),
        LSM_HOOK_INIT(mmap_file, ima_file_mmap),
        LSM_HOOK_INIT(file_mprotect, ima_file_mprotect),
        LSM_HOOK_INIT(kernel_load_data, ima_load_data),
        LSM_HOOK_INIT(kernel_post_load_data, ima_post_load_data),
        LSM_HOOK_INIT(kernel_read_file, ima_read_file),
        LSM_HOOK_INIT(kernel_post_read_file, ima_post_read_file),
        LSM_HOOK_INIT(path_post_mknod, ima_post_path_mknod),
#ifdef CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS
        /* Present only when CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS is set. */
        LSM_HOOK_INIT(key_post_create_or_update, ima_post_key_create_or_update),
#endif
#ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS
        /* ima_kernel_module_request() is only defined under this option. */
        LSM_HOOK_INIT(kernel_module_request, ima_kernel_module_request),
#endif
        LSM_HOOK_INIT(inode_free_security, ima_inode_free),
};

/*
 * Name/id pair for IMA; init_ima_lsm() passes it to security_add_hooks()
 * and init_ima_appraise_lsm().
 */
static const struct lsm_id ima_lsmid = {
        .name = "ima",
        .id = LSM_ID_IMA,
};

/*
 * init_ima_lsm - init callback named by the DEFINE_LSM(ima) descriptor
 *
 * Sets up the iint cache, registers the ima_hooks table and then lets the
 * appraise side register under the same LSM id.  Always returns 0.
 */
static int __init init_ima_lsm(void)
{
        const struct lsm_id *id = &ima_lsmid;

        ima_iintcache_init();

        /* Main IMA hooks first, appraise registration afterwards. */
        security_add_hooks(ima_hooks, ARRAY_SIZE(ima_hooks), id);
        init_ima_appraise_lsm(id);

        return 0;
}

/*
 * Blob sizes referenced by the DEFINE_LSM(ima) descriptor: the inode blob is
 * sized to hold a single pointer to struct ima_iint_cache.
 */
struct lsm_blob_sizes ima_blob_sizes __ro_after_init = {
        .lbs_inode = sizeof(struct ima_iint_cache *),
};

/*
 * LSM descriptor for IMA: names the module, uses init_ima_lsm() as its init
 * callback and ima_blob_sizes for its blob sizes.
 * NOTE(review): LSM_ORDER_LAST presumably orders IMA after the other LSMs -
 * confirm against the LSM framework.
 */
DEFINE_LSM(ima) = {
        .name = "ima",
        .init = init_ima_lsm,
        .order = LSM_ORDER_LAST,
        .blobs = &ima_blob_sizes,
};

/*
 * init_ima() is registered here as a late initcall, separately from the
 * init_ima_lsm() callback named in the DEFINE_LSM(ima) descriptor.
 */
late_initcall(init_ima);        /* Start IMA after the TPM is available */







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 


    3 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
12132
12133
12134
12135
12136
12137
12138
12139
12140
12141
12142
12143
12144
12145
12146
12147
12148
12149
12150
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160
12161
12162
12163
12164
12165
12166
12167
12168
12169
12170
12171
12172
12173
12174
12175
12176
12177
12178
12179
12180
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190
12191
12192
12193
12194
12195
12196
12197
12198
12199
12200
12201
12202
12203
12204
12205
12206
12207
12208
12209
12210
12211
12212
12213
12214
12215
12216
12217
12218
12219
12220
12221
12222
12223
12224
12225
12226
12227
12228
12229
12230
12231
12232
12233
12234
12235
12236
12237
12238
12239
12240
12241
12242
12243
12244
12245
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
12300
12301
12302
12303
12304
12305
12306
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318
12319
12320
12321
12322
12323
12324
12325
12326
12327
12328
12329
12330
12331
12332
12333
12334
12335
12336
12337
12338
12339
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
12350
12351
12352
12353
12354
12355
12356
12357
12358
12359
12360
12361
12362
12363
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
12374
12375
12376
12377
12378
12379
12380
12381
12382
12383
12384
12385
12386
12387
12388
12389
12390
12391
12392
12393
12394
12395
12396
12397
12398
12399
12400
12401
12402
12403
12404
12405
12406
12407
12408
12409
12410
12411
12412
12413
12414
12415
12416
12417
12418
12419
12420
12421
12422
12423
12424
12425
12426
12427
12428
12429
12430
12431
12432
12433
12434
12435
12436
12437
12438
12439
12440
12441
12442
12443
12444
12445
12446
12447
12448
12449
12450
12451
12452
12453
12454
12455
12456
12457
12458
12459
12460
12461
12462
12463
12464
12465
12466
12467
12468
12469
12470
12471
12472
12473
12474
12475
12476
12477
12478
12479
12480
12481
12482
12483
12484
12485
12486
12487
12488
12489
12490
12491
12492
12493
12494
12495
12496
12497
12498
12499
12500
12501
12502
12503
12504
12505
12506
12507
12508
12509
12510
12511
12512
12513
12514
12515
12516
12517
12518
12519
12520
12521
12522
12523
12524
12525
12526
12527
12528
12529
12530
12531
12532
12533
12534
12535
12536
12537
12538
12539
12540
12541
12542
12543
12544
12545
12546
12547
12548
12549
12550
12551
12552
12553
12554
12555
12556
12557
12558
12559
12560
12561
12562
12563
12564
12565
12566
12567
12568
12569
12570
12571
12572
12573
12574
12575
12576
12577
12578
12579
12580
12581
12582
12583
12584
12585
12586
12587
12588
12589
12590
12591
12592
12593
12594
12595
12596
12597
12598
12599
12600
12601
12602
12603
12604
12605
12606
12607
12608
12609
12610
12611
12612
12613
12614
12615
12616
12617
12618
12619
12620
12621
12622
12623
12624
12625
12626
12627
12628
12629
12630
12631
12632
12633
12634
12635
12636
12637
12638
12639
12640
12641
12642
12643
12644
12645
12646
12647
12648
12649
12650
12651
12652
12653
12654
12655
12656
12657
12658
12659
12660
12661
12662
12663
12664
12665
12666
12667
12668
12669
12670
12671
12672
12673
12674
12675
12676
12677
12678
12679
12680
12681
12682
12683
12684
12685
12686
12687
12688
12689
12690
12691
12692
12693
12694
12695
12696
12697
12698
12699
12700
12701
12702
12703
12704
12705
12706
12707
12708
12709
12710
12711
12712
12713
12714
12715
12716
12717
12718
12719
12720
12721
12722
12723
12724
12725
12726
12727
12728
12729
12730
12731
12732
12733
12734
12735
12736
12737
12738
12739
12740
12741
12742
12743
12744
12745
12746
12747
12748
12749
12750
12751
12752
12753
12754
12755
12756
12757
12758
12759
12760
12761
12762
12763
12764
12765
12766
12767
12768
12769
12770
12771
12772
12773
12774
12775
12776
12777
12778
12779
12780
12781
12782
12783
12784
12785
12786
12787
12788
12789
12790
12791
12792
12793
12794
12795
12796
12797
12798
12799
12800
12801
12802
12803
12804
12805
12806
12807
12808
12809
12810
12811
12812
12813
12814
12815
12816
12817
12818
12819
12820
12821
12822
12823
12824
12825
12826
12827
12828
12829
12830
12831
12832
12833
12834
12835
12836
12837
12838
12839
12840
12841
12842
12843
12844
12845
12846
12847
12848
12849
12850
12851
12852
12853
12854
12855
12856
12857
12858
12859
12860
12861
12862
12863
12864
12865
12866
12867
12868
12869
12870
12871
12872
12873
12874
12875
12876
12877
12878
12879
12880
12881
12882
12883
12884
12885
12886
12887
12888
12889
12890
12891
12892
12893
12894
12895
12896
12897
12898
12899
12900
12901
12902
12903
12904
12905
12906
12907
12908
12909
12910
12911
12912
12913
12914
12915
12916
12917
12918
12919
12920
12921
12922
12923
12924
12925
12926
12927
12928
12929
12930
12931
12932
12933
12934
12935
12936
12937
12938
12939
12940
12941
12942
12943
12944
12945
12946
12947
12948
12949
12950
12951
12952
12953
12954
12955
12956
12957
12958
12959
12960
12961
12962
12963
12964
12965
12966
12967
12968
12969
12970
12971
12972
12973
12974
12975
12976
12977
12978
12979
12980
12981
12982
12983
12984
12985
12986
12987
12988
12989
12990
12991
12992
12993
12994
12995
12996
12997
12998
12999
13000
13001
13002
13003
13004
13005
13006
13007
13008
13009
13010
13011
13012
13013
13014
13015
13016
13017
13018
13019
13020
13021
13022
13023
13024
13025
13026
13027
13028
13029
13030
13031
13032
13033
13034
13035
13036
13037
13038
13039
13040
13041
13042
13043
13044
13045
13046
13047
13048
13049
13050
13051
13052
13053
13054
13055
13056
13057
13058
13059
13060
13061
13062
13063
13064
13065
13066
13067
13068
13069
13070
13071
13072
13073
13074
13075
13076
13077
13078
13079
13080
13081
13082
13083
13084
13085
13086
13087
13088
13089
13090
13091
13092
13093
13094
13095
13096
13097
13098
13099
13100
13101
13102
13103
13104
13105
13106
13107
13108
13109
13110
13111
13112
13113
13114
13115
13116
13117
13118
13119
13120
13121
13122
13123
13124
13125
13126
13127
13128
13129
13130
13131
13132
13133
13134
13135
13136
13137
13138
13139
13140
13141
13142
13143
13144
13145
13146
13147
13148
13149
13150
13151
13152
13153
13154
13155
13156
13157
13158
13159
13160
13161
13162
13163
13164
13165
13166
13167
13168
13169
13170
13171
13172
13173
13174
13175
13176
13177
13178
13179
13180
13181
13182
13183
13184
13185
13186
13187
13188
13189
13190
13191
13192
13193
13194
13195
13196
13197
13198
13199
13200
13201
13202
13203
13204
13205
13206
13207
13208
13209
13210
13211
13212
13213
13214
13215
13216
13217
13218
13219
13220
13221
13222
13223
13224
13225
13226
13227
13228
13229
13230
13231
13232
13233
13234
13235
13236
13237
13238
13239
13240
13241
13242
13243
13244
13245
13246
13247
13248
13249
13250
13251
13252
13253
13254
13255
13256
13257
13258
13259
13260
13261
13262
13263
13264
13265
13266
13267
13268
13269
13270
13271
13272
13273
13274
13275
13276
13277
13278
13279
13280
13281
13282
13283
13284
13285
13286
13287
13288
13289
13290
13291
13292
13293
13294
13295
13296
13297
13298
13299
13300
13301
13302
13303
13304
13305
13306
13307
13308
13309
13310
13311
13312
13313
13314
13315
13316
13317
13318
13319
13320
13321
13322
13323
13324
13325
13326
13327
13328
13329
13330
13331
13332
13333
13334
13335
13336
13337
13338
13339
13340
13341
13342
13343
13344
13345
13346
13347
13348
13349
13350
13351
13352
13353
13354
13355
13356
13357
13358
13359
13360
13361
13362
13363
13364
13365
13366
13367
13368
13369
13370
13371
13372
13373
13374
13375
13376
13377
13378
13379
13380
13381
13382
13383
13384
13385
13386
13387
13388
13389
13390
13391
13392
13393
13394
13395
13396
13397
13398
13399
13400
13401
13402
13403
13404
13405
13406
13407
13408
13409
13410
13411
13412
13413
13414
13415
13416
13417
13418
13419
13420
13421
13422
13423
13424
13425
13426
13427
13428
13429
13430
13431
13432
13433
13434
13435
13436
13437
13438
13439
13440
13441
13442
13443
13444
13445
13446
13447
13448
13449
13450
13451
13452
13453
13454
13455
13456
13457
13458
13459
13460
13461
13462
13463
13464
13465
13466
13467
13468
13469
13470
13471
13472
13473
13474
13475
13476
13477
13478
13479
13480
13481
13482
13483
13484
13485
13486
13487
13488
13489
13490
13491
13492
13493
13494
13495
13496
13497
13498
13499
13500
13501
13502
13503
13504
13505
13506
13507
13508
13509
13510
13511
13512
13513
13514
13515
13516
13517
13518
13519
13520
13521
13522
13523
13524
13525
13526
13527
13528
13529
13530
13531
13532
13533
13534
13535
13536
13537
13538
13539
13540
13541
13542
13543
13544
13545
13546
13547
13548
13549
13550
13551
13552
13553
13554
13555
13556
13557
13558
13559
13560
13561
13562
13563
13564
13565
13566
13567
13568
13569
13570
13571
13572
13573
13574
13575
13576
13577
13578
13579
13580
13581
13582
13583
13584
13585
13586
13587
13588
13589
13590
13591
13592
13593
13594
13595
13596
13597
13598
13599
13600
13601
13602
13603
13604
13605
13606
13607
13608
13609
13610
13611
13612
13613
13614
13615
13616
13617
13618
13619
13620
13621
13622
13623
13624
13625
13626
13627
13628
13629
13630
13631
13632
13633
13634
13635
13636
13637
13638
13639
13640
13641
13642
13643
13644
13645
13646
13647
13648
13649
13650
13651
13652
13653
13654
13655
13656
13657
13658
13659
13660
13661
13662
13663
13664
13665
13666
13667
13668
13669
13670
13671
13672
13673
13674
13675
13676
13677
13678
13679
13680
13681
13682
13683
13684
13685
13686
13687
13688
13689
13690
13691
13692
13693
13694
13695
13696
13697
13698
13699
13700
13701
13702
13703
13704
13705
13706
13707
13708
13709
13710
13711
13712
13713
13714
13715
13716
13717
13718
13719
13720
13721
13722
13723
13724
13725
13726
13727
13728
13729
13730
13731
13732
13733
13734
13735
13736
13737
13738
13739
13740
13741
13742
13743
13744
13745
13746
13747
13748
13749
13750
13751
13752
13753
13754
13755
13756
13757
13758
13759
13760
13761
13762
13763
13764
13765
13766
13767
13768
13769
13770
13771
13772
13773
13774
13775
13776
13777
13778
13779
13780
13781
13782
13783
13784
13785
13786
13787
13788
13789
13790
13791
13792
13793
13794
13795
13796
13797
13798
13799
13800
13801
13802
13803
13804
13805
13806
13807
13808
13809
13810
13811
13812
13813
13814
13815
13816
13817
13818
13819
13820
13821
13822
13823
13824
13825
13826
13827
13828
13829
13830
13831
13832
13833
13834
13835
13836
13837
13838
13839
13840
13841
13842
13843
13844
13845
13846
13847
13848
13849
13850
13851
13852
13853
13854
13855
13856
13857
13858
13859
13860
13861
13862
13863
13864
13865
13866
13867
13868
13869
13870
13871
13872
13873
13874
13875
13876
13877
13878
13879
13880
13881
13882
13883
13884
13885
13886
13887
13888
13889
13890
13891
13892
13893
13894
13895
13896
13897
13898
13899
13900
13901
13902
13903
13904
13905
13906
13907
13908
13909
13910
13911
13912
13913
13914
13915
// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>
#include <linux/task_work.h>

#include "internal.h"

#include <asm/irq_regs.h>

/* Callback type invoked by remote_function(); receives the opaque ->info pointer. */
typedef int (*remote_function_f)(void *);

/*
 * Argument bundle passed (as void *) to remote_function() through
 * smp_call_function_single().
 */
struct remote_function_call {
        /* Target task; NULL makes remote_function() skip its task checks. */
        struct task_struct        *p;
        /* Callback to run on the target CPU. */
        remote_function_f        func;
        /* Opaque argument handed to ->func. */
        void                        *info;
        /*
         * Result slot: preset by the caller, then overwritten by
         * remote_function() with -ESRCH or with ->func's return value.
         */
        int                        ret;
};

/*
 * Cross-call handler shared by task_function_call() and cpu_function_call().
 *
 * @data points at a struct remote_function_call.  When a task is attached to
 * the request, the callback only runs if that task is the one currently
 * executing on this CPU; with no task attached it runs unconditionally.
 * The outcome is reported through ->ret.
 */
static void remote_function(void *data)
{
        struct remote_function_call *call = data;
        struct task_struct *task = call->p;

        if (task) {
                /*
                 * The task is not on this CPU: bail out and leave ->ret as
                 * the caller preset it (-EAGAIN for task_function_call()).
                 */
                if (task_cpu(task) != smp_processor_id())
                        return;

                /*
                 * Now that we're on the right CPU with IRQs disabled, we can
                 * test if we hit the right task without races.
                 */
                if (task != current) {
                        /* No such (running) process */
                        call->ret = -ESRCH;
                        return;
                }
        }

        call->ret = call->func(call->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:                the task to evaluate
 * @func:        the function to be called
 * @info:        the function call argument
 *
 * Sends a cross-call (waiting for completion) to task_cpu(@p); the handler
 * runs @func(@info) there only if @p is the task currently running on that
 * CPU.  The attempt is repeated, with a cond_resched() between tries, for as
 * long as the outcome is -EAGAIN, e.g. because @p was no longer on the CPU
 * the call landed on.  Note that an -EAGAIN returned by @func itself also
 * triggers another attempt.
 *
 * returns: @func return value, -ESRCH when @p is not the running task on its
 * CPU, or any other error reported by smp_call_function_single() (e.g. -ENXIO).
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
        struct remote_function_call call = {
                .p        = p,
                .func        = func,
                .info        = info,
                /* Stays -EAGAIN if the handler bails out on a CPU mismatch. */
                .ret        = -EAGAIN,
        };
        int err;

        do {
                err = smp_call_function_single(task_cpu(p), remote_function,
                                               &call, 1);
                /* The cross-call itself succeeded: report the handler's verdict. */
                if (err == 0)
                        err = call.ret;

                /* About to retry; give the scheduler a chance first. */
                if (err == -EAGAIN)
                        cond_resched();
        } while (err == -EAGAIN);

        return err;
}

/**
 * cpu_function_call - call a function on the cpu
 * @cpu:        target cpu to queue this function
 * @func:        the function to be called
 * @info:        the function call argument
 *
 * Runs @func(@info) on @cpu through smp_call_function_single(), waiting for
 * completion.  No task is attached to the request, so the handler invokes
 * @func without any task checks.
 *
 * returns: @func return value, or -ENXIO when the handler never ran
 * (e.g. the cpu is offline)
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
        struct remote_function_call call = {
                .p        = NULL,
                .func        = func,
                .info        = info,
                /* No such CPU: survives only if the handler is never invoked. */
                .ret        = -ENXIO,
        };

        /* The cross-call's own return value is ignored; only ->ret is reported. */
        smp_call_function_single(cpu, remote_function, &call, 1);

        return call.ret;
}

/*
 * Take the cpu context lock and then, when @ctx is non-NULL, @ctx's lock.
 * perf_ctx_unlock() releases them in the opposite order.
 */
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
                          struct perf_event_context *ctx)
{
        raw_spin_lock(&cpuctx->ctx.lock);
        if (!ctx)
                return;

        raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx)
{
        if (ctx)
                raw_spin_unlock(&ctx->lock);
        raw_spin_unlock(&cpuctx->ctx.lock);
}

/*
 * Sentinel stored in task pointer fields (ctx->task, event->owner).  The
 * users visible here only compare against it and bail out; it is never
 * dereferenced as a task.
 */
#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
{
        return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}

/* One struct perf_cpu_context per CPU; reached via this_cpu_ptr()/per_cpu_ptr(). */
static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

struct perf_event_context *perf_cpu_task_ctx(void)
{
        lockdep_assert_irqs_disabled();
        return this_cpu_ptr(&perf_cpu_context)->task_ctx;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively straight
 *    forward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */

/*
 * Callback type used by event_function_call()/event_function_local():
 * (event, cpu context, event context, caller data).  event_function_call()
 * may pass a NULL cpu context when it runs the callback on an inactive
 * context.
 */
typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
                        struct perf_event_context *, void *);

/* Argument bundle handed to event_function() through the cross-call. */
struct event_function_struct {
        struct perf_event *event;        /* event the callback operates on */
        event_f func;                        /* callback to invoke */
        void *data;                        /* opaque argument forwarded to @func */
};

/*
 * Cross-call trampoline for event_function_call().
 * @info: pointer to a struct event_function_struct
 *
 * Runs efs->func for efs->event on the current CPU while holding the cpu
 * context lock and, if this CPU has a task context, that context's lock.
 *
 * Returns 0 after calling efs->func, or -ESRCH (without calling it) when the
 * event's context belongs to a task other than current.
 */
static int event_function(void *info)
{
        struct event_function_struct *efs = info;
        struct perf_event *event = efs->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        int ret = 0;

        lockdep_assert_irqs_disabled();

        perf_ctx_lock(cpuctx, task_ctx);
        /*
         * Since we do the IPI call without holding ctx->lock things can have
         * changed, double check we hit the task we set out to hit.
         */
        if (ctx->task) {
                if (ctx->task != current) {
                        ret = -ESRCH;
                        goto unlock;
                }

                /*
                 * We only use event_function_call() on established contexts,
                 * and event_function() is only ever called when active (or
                 * rather, we'll have bailed in task_function_call() or the
                 * above ctx->task != current test), therefore we must have
                 * ctx->is_active here.
                 */
                WARN_ON_ONCE(!ctx->is_active);
                /*
                 * And since we have ctx->is_active, cpuctx->task_ctx must
                 * match.
                 */
                WARN_ON_ONCE(task_ctx != ctx);
        } else {
                /* A context without a task must be this CPU's own context. */
                WARN_ON_ONCE(&cpuctx->ctx != ctx);
        }

        efs->func(event, cpuctx, ctx, efs->data);
unlock:
        perf_ctx_unlock(cpuctx, task_ctx);

        return ret;
}

/*
 * Run @func(@event, cpuctx, ctx, @data) with the appropriate context locks
 * held, on whichever CPU the event's context calls for:
 *
 *  - ctx->task is NULL: cross-call to event->cpu through cpu_function_call();
 *  - ctx->task is TASK_TOMBSTONE: nothing is done;
 *  - otherwise: try task_function_call().  If that fails, take ctx->lock and
 *    either retry (the context is active) or call @func directly with a NULL
 *    cpuctx (the context is inactive).
 */
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
        struct event_function_struct efs = {
                .event = event,
                .func = func,
                .data = data,
        };

        if (!event->parent) {
                /*
                 * If this is a !child event, we must hold ctx::mutex to
                 * stabilize the event->ctx relation. See
                 * perf_event_ctx_lock().
                 */
                lockdep_assert_held(&ctx->mutex);
        }

        if (!task) {
                cpu_function_call(event->cpu, event_function, &efs);
                return;
        }

        if (task == TASK_TOMBSTONE)
                return;

again:
        /* A zero return means event_function() ran @func remotely. */
        if (!task_function_call(task, event_function, &efs))
                return;

        raw_spin_lock_irq(&ctx->lock);
        /*
         * Reload the task pointer, it might have been changed by
         * a concurrent perf_event_context_sched_out().
         */
        task = ctx->task;
        if (task == TASK_TOMBSTONE) {
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        if (ctx->is_active) {
                /* Active again: go back and retry the cross-call. */
                raw_spin_unlock_irq(&ctx->lock);
                goto again;
        }
        /* Inactive context: run the callback here, under ctx->lock. */
        func(event, NULL, ctx, data);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU.
 *
 * Does nothing when the context's task is TASK_TOMBSTONE.  For a task
 * context that is active but does not belong to current, or is not this
 * CPU's task_ctx, a warning fires and @func is not called.
 */
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct task_struct *task = READ_ONCE(ctx->task);
        struct perf_event_context *task_ctx = NULL;

        lockdep_assert_irqs_disabled();

        if (task) {
                if (task == TASK_TOMBSTONE)
                        return;

                /* Task context: its lock is taken in addition to the cpu context's. */
                task_ctx = ctx;
        }

        perf_ctx_lock(cpuctx, task_ctx);

        /* Re-read under the lock; the unlocked read above may be stale. */
        task = ctx->task;
        if (task == TASK_TOMBSTONE)
                goto unlock;

        if (task) {
                /*
                 * We must be either inactive or active and the right task,
                 * otherwise we're screwed, since we cannot IPI to somewhere
                 * else.
                 */
                if (ctx->is_active) {
                        if (WARN_ON_ONCE(task != current))
                                goto unlock;

                        if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
                                goto unlock;
                }
        } else {
                WARN_ON_ONCE(&cpuctx->ctx != ctx);
        }

        func(event, cpuctx, ctx, data);
unlock:
        perf_ctx_unlock(cpuctx, task_ctx);
}

/* Union of the four PERF_FLAG_* bits listed below. */
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP |\
                       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks:
 * the kernel and hypervisor branch-sample bits.
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
        (PERF_SAMPLE_BRANCH_KERNEL |\
         PERF_SAMPLE_BRANCH_HV)

/*
 * Bit flags passed to ctx_sched_in()/ctx_sched_out(); values are OR-ed
 * together (e.g. EVENT_ALL|EVENT_CGROUP in perf_cgroup_switch()).
 */
enum event_type_t {
        EVENT_FLEXIBLE = 0x1,
        EVENT_PINNED = 0x2,
        EVENT_TIME = 0x4,
        /* see ctx_resched() for details */
        EVENT_CPU = 0x8,
        EVENT_CGROUP = 0x10,
        /* Both event classes; does not include EVENT_TIME/EVENT_CPU/EVENT_CGROUP. */
        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 */

static void perf_sched_delayed(struct work_struct *work);
/* Static key, initially false. */
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
/* Delayed work item whose handler is perf_sched_delayed(). */
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);

/*
 * One atomic counter per record type named in the identifier.
 * NOTE(review): presumably each counts the live events that asked for that
 * record type -- confirm against the code that updates them.
 */
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
static atomic_t nr_text_poke_events __read_mostly;
static atomic_t nr_build_id_events __read_mostly;

/* NOTE(review): list of PMUs with its mutex and SRCU domain, judging by the names -- confirm. */
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
static struct kmem_cache *perf_event_cache;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */

/*
 * max perf event sample rate
 *
 * DEFAULT_SAMPLE_PERIOD_NS is the period matching the default rate, and
 * DEFAULT_CPU_TIME_MAX_PERCENT is the default share of that period a sample
 * is allowed to take (see perf_sample_allowed_ns below).
 */
#define DEFAULT_MAX_SAMPLE_RATE                100000
#define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT        25

int sysctl_perf_event_sample_rate __read_mostly        = DEFAULT_MAX_SAMPLE_RATE;

/* Derived from sysctl_perf_event_sample_rate; recomputed whenever that changes. */
static int max_samples_per_tick __read_mostly        = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly        = DEFAULT_SAMPLE_PERIOD_NS;

/* Allowed average sample duration in ns; 0 disables perf_sample_event_took(). */
static int perf_sample_allowed_ns __read_mostly =
        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

static void update_perf_cpu_limits(void)
{
        u64 tmp = perf_sample_period_ns;

        tmp *= sysctl_perf_cpu_time_max_percent;
        tmp = div_u64(tmp, 100);
        if (!tmp)
                tmp = 1;

        WRITE_ONCE(perf_sample_allowed_ns, tmp);
}

/* Forward declaration; called from perf_mux_hrtimer_handler(). */
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);

/*
 * sysctl handler for the maximum sample rate.
 *
 * Writes are rejected with -EINVAL while sysctl_perf_cpu_time_max_percent is
 * 0 or 100 (throttling disabled).  After a successful write the values
 * derived from the rate -- max_samples_per_tick, perf_sample_period_ns and
 * the CPU time limit -- are recomputed.
 *
 * Returns 0 on success or the error from proc_dointvec_minmax().
 */
int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
                                       void *buffer, size_t *lenp, loff_t *ppos)
{
        int percent = sysctl_perf_cpu_time_max_percent;
        int err;

        /*
         * If throttling is disabled don't allow the write:
         */
        if (write) {
                if (percent == 0 || percent == 100)
                        return -EINVAL;
        }

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (err)
                return err;
        if (!write)
                return 0;

        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
        update_perf_cpu_limits();

        return 0;
}

/* Percentage used by update_perf_cpu_limits(); 0 and 100 disable throttling. */
int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

/*
 * sysctl handler for sysctl_perf_cpu_time_max_percent.
 *
 * After a successful write, a value of 0 or 100 disables dynamic throttling
 * (perf_sample_allowed_ns is set to 0 and a warning is printed); any other
 * value recomputes the limit through update_perf_cpu_limits().
 *
 * Returns 0 on success or the error from proc_dointvec_minmax().
 */
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int percent;
        int err;

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (err || !write)
                return err;

        percent = sysctl_perf_cpu_time_max_percent;
        if (percent != 100 && percent != 0) {
                update_perf_cpu_limits();
                return 0;
        }

        printk(KERN_WARNING
               "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
        WRITE_ONCE(perf_sample_allowed_ns, 0);

        return 0;
}

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
/* Divisor of the per-CPU decaying sum kept by perf_sample_event_took(). */
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);

/* Last average/limit pair that triggered throttling; read by the warning message. */
static u64 __report_avg;
static u64 __report_allowed;

/*
 * irq_work callback: print the (rate-limited) throttling notice using the
 * values recorded by perf_sample_event_took().  @w is not used.
 */
static void perf_duration_warn(struct irq_work *w)
{
        printk_ratelimited(KERN_INFO
                "perf: interrupt took too long (%lld > %lld), lowering "
                "kernel.perf_event_max_sample_rate to %d\n",
                __report_avg, __report_allowed,
                sysctl_perf_event_sample_rate);
}

/* Queued from perf_sample_event_took() so the message is printed from irq_work. */
static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

/*
 * Account the time one sample took and throttle the sample rate when the
 * running average exceeds the allowed duration.
 * @sample_len_ns: duration of the sample just taken
 *
 * Does nothing while perf_sample_allowed_ns is 0.  Otherwise a per-CPU
 * decaying sum is updated; if its average is above the limit, the limit is
 * raised, max_samples_per_tick / sysctl_perf_event_sample_rate /
 * perf_sample_period_ns are recomputed, and a warning is queued.
 */
void perf_sample_event_took(u64 sample_len_ns)
{
        u64 max_len = READ_ONCE(perf_sample_allowed_ns);
        u64 running_len;
        u64 avg_len;
        u32 max;

        if (max_len == 0)
                return;

        /* Decay the counter by 1 average sample. */
        running_len = __this_cpu_read(running_sample_length);
        running_len -= running_len/NR_ACCUMULATED_SAMPLES;
        running_len += sample_len_ns;
        __this_cpu_write(running_sample_length, running_len);

        /*
         * Note: this will be biased artificially low until we have
         * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
         * from having to maintain a count.
         */
        avg_len = running_len/NR_ACCUMULATED_SAMPLES;
        if (avg_len <= max_len)
                return;

        __report_avg = avg_len;
        __report_allowed = max_len;

        /*
         * Set the new allowed duration 25% above the measured average and
         * derive from it how many such samples fit in the per-tick budget.
         */
        avg_len += avg_len / 4;
        max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
        if (avg_len < max)
                max /= (u32)avg_len;
        else
                max = 1;

        WRITE_ONCE(perf_sample_allowed_ns, avg_len);
        WRITE_ONCE(max_samples_per_tick, max);

        sysctl_perf_event_sample_rate = max * HZ;
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

        /* If the irq_work could not be queued, fall back to early_printk(). */
        if (!irq_work_queue(&perf_duration_work)) {
                early_printk("perf: interrupt took too long (%lld > %lld), lowering "
                             "kernel.perf_event_max_sample_rate to %d\n",
                             __report_avg, __report_allowed,
                             sysctl_perf_event_sample_rate);
        }
}

/* NOTE(review): presumably the source of unique event ids -- confirm at the allocation site. */
static atomic64_t perf_event_id;

/* Forward declarations; perf_event_time() is used by perf_event_update_time(). */
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

/* Weak no-op default; a non-weak definition elsewhere replaces it at link time. */
void __weak perf_event_print_debug(void)        { }

/* Time base used for perf time accounting: simply local_clock(). */
static inline u64 perf_clock(void)
{
        u64 now = local_clock();

        return now;
}

/* Read the current time from the clock callback stored in @event. */
static inline u64 perf_event_clock(struct perf_event *event)
{
        u64 now = event->clock();

        return now;
}

/*
 * State based event timekeeping...
 *
 * The basic idea is to use event->state to determine which (if any) time
 * fields to increment with the current delta. This means we only need to
 * update timestamps when we change state or when they are explicitly requested
 * (read).
 *
 * Event groups make things a little more complicated, but not terribly so. The
 * rules for a group are that if the group leader is OFF the entire group is
 * OFF, irrespective of what the group member states are. This results in
 * __perf_effective_state().
 *
 * A further ramification is that when a group leader flips between OFF and
 * !OFF, we need to update all group member times.
 *
 *
 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 * need to make sure the relevant context time is updated before we try and
 * update our timestamps.
 */

/*
 * State of @event as seen through its group: when the group leader's state
 * is PERF_EVENT_STATE_OFF or lower, the leader's state wins; otherwise the
 * event's own state is returned.
 */
static __always_inline enum perf_event_state
__perf_effective_state(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;

        return (leader->state <= PERF_EVENT_STATE_OFF) ? leader->state
                                                       : event->state;
}

/*
 * Compute @event's enabled/running totals as of @now without modifying the
 * event itself (unless the caller points the outputs at its fields).
 *
 * The time since event->tstamp is added to *@enabled when the effective
 * state is at least INACTIVE, and to *@running when it is at least ACTIVE.
 */
static __always_inline void
__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
{
        enum perf_event_state state = __perf_effective_state(event);
        u64 elapsed = now - event->tstamp;
        u64 total;

        total = event->total_time_enabled;
        if (state >= PERF_EVENT_STATE_INACTIVE)
                total += elapsed;
        *enabled = total;

        total = event->total_time_running;
        if (state >= PERF_EVENT_STATE_ACTIVE)
                total += elapsed;
        *running = total;
}

/*
 * Fold the time elapsed since event->tstamp into the event's own
 * enabled/running totals and move tstamp forward to the current event time.
 */
static void perf_event_update_time(struct perf_event *event)
{
        u64 *enabled = &event->total_time_enabled;
        u64 *running = &event->total_time_running;
        u64 stamp = perf_event_time(event);

        __perf_update_times(event, stamp, enabled, running);
        event->tstamp = stamp;
}

static void perf_event_update_sibling_time(struct perf_event *leader)
{
        struct perf_event *sibling;

        for_each_sibling_event(sibling, leader)
                perf_event_update_time(sibling);
}

static void
perf_event_set_state(struct perf_event *event, enum perf_event_state state)
{
        if (event->state == state)
                return;

        perf_event_update_time(event);
        /*
         * If a group leader gets enabled/disabled all its siblings
         * are affected too.
         */
        if ((event->state < 0) ^ (state < 0))
                perf_event_update_sibling_time(event);

        WRITE_ONCE(event->state, state);
}

/*
 * UP store-release, load-acquire
 *
 * These use only a compiler barrier (barrier()), not a CPU memory barrier.
 */

/* Compiler barrier first, then a WRITE_ONCE() of @val to *@ptr. */
#define __store_release(ptr, val)                                        \
do {                                                                        \
        barrier();                                                        \
        WRITE_ONCE(*(ptr), (val));                                        \
} while (0)

/* READ_ONCE() of *@ptr followed by a compiler barrier; evaluates to the value read. */
#define __load_acquire(ptr)                                                \
({                                                                        \
        __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));        \
        barrier();                                                        \
        ___p;                                                                \
})

/*
 * Call perf_pmu_disable() for the PMU of every pmu context on @ctx.
 * With @cgroup set, pmu contexts whose nr_cgroups is 0 are left alone.
 */
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
        struct perf_event_pmu_context *epc;

        list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                if (!cgroup || epc->nr_cgroups)
                        perf_pmu_disable(epc->pmu);
        }
}

/*
 * Call perf_pmu_enable() for the PMU of every pmu context on @ctx.
 * With @cgroup set, pmu contexts whose nr_cgroups is 0 are left alone.
 */
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
        struct perf_event_pmu_context *epc;

        list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                if (!cgroup || epc->nr_cgroups)
                        perf_pmu_enable(epc->pmu);
        }
}

/* Forward declarations; both are used by perf_cgroup_switch(). */
static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);

#ifdef CONFIG_CGROUP_PERF

/*
 * Does @event apply to the cgroup currently associated with this CPU's
 * perf_cpu_context?
 */
static inline bool
perf_cgroup_match(struct perf_event *event)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

        /* An event without a cgroup matches everywhere. */
        if (!event->cgrp)
                return true;

        /*
         * Cgroup scoping is recursive: an event enabled for a cgroup is
         * also enabled for all of its descendants.  So it is a match when
         * the cpu context's cgroup is a descendant of the event's (the
         * test covers the identity case).
         */
        if (cpuctx->cgrp)
                return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
                                            event->cgrp->css.cgroup);

        /* The event wants a specific cgroup but this CPU has none. */
        return false;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
        css_put(&event->cgrp->css);
        event->cgrp = NULL;
}

/* Non-zero when @event is attached to a cgroup. */
static inline int is_cgroup_event(struct perf_event *event)
{
        return event->cgrp ? 1 : 0;
}

/* Stored cgroup time for @event's cgroup on the event's cpu. */
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        struct perf_cgroup_info *info = per_cpu_ptr(event->cgrp->info, event->cpu);

        return info->time;
}

/*
 * Cgroup time for @event at clock value @now: @now plus the recorded
 * offset while the cgroup info is marked active, the stored time otherwise.
 */
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
        struct perf_cgroup_info *info = per_cpu_ptr(event->cgrp->info, event->cpu);

        if (__load_acquire(&info->active))
                return now + READ_ONCE(info->timeoffset);

        return info->time;
}

/*
 * Set @info's timestamp to @now.  With @adv set, the time since the previous
 * timestamp is first added to info->time.  The time offset is republished
 * afterwards in either case.
 */
static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
{
        if (adv) {
                u64 elapsed = now - info->timestamp;

                info->time += elapsed;
        }
        info->timestamp = now;

        /*
         * see update_context_time()
         */
        WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}

/*
 * Advance the cgroup time of @cpuctx's cgroup and of each of its ancestors
 * on this CPU.  With @final set, each of them is also marked inactive.
 * Does nothing when @cpuctx has no cgroup.
 */
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
{
        struct perf_cgroup *cgrp = cpuctx->cgrp;
        struct cgroup_subsys_state *css;
        u64 now;

        if (!cgrp)
                return;

        now = perf_clock();
        css = &cgrp->css;
        while (css) {
                struct perf_cgroup *pos = container_of(css, struct perf_cgroup, css);
                struct perf_cgroup_info *info = this_cpu_ptr(pos->info);

                __update_cgrp_time(info, now, true);
                if (final)
                        __store_release(&info->active, 0);

                css = css->parent;
        }
}

/*
 * Advance the cgroup time of @event's cgroup on this CPU, provided the
 * event is a cgroup event and the cgroup info is currently active.
 */
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
        struct perf_cgroup_info *info;

        /*
         * Only touch cgroup data when it is needed and when the cgroup
         * is known to be pinned (css_get).
         */
        if (is_cgroup_event(event)) {
                info = this_cpu_ptr(event->cgrp->info);

                /* An inactive cgroup does not accumulate time. */
                if (info->active)
                        __update_cgrp_time(info, perf_clock(), true);
        }
}

/*
 * Restart cgroup time-keeping for @cpuctx's cgroup and each of its ancestors
 * on this CPU: set their timestamp to the cpu context's timestamp (without
 * advancing time) and mark them active.
 */
static inline void
perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
        struct perf_cgroup *cgrp = cpuctx->cgrp;
        struct cgroup_subsys_state *css;

        /*
         * ctx->lock is held by the caller.  Do not touch cgroup data
         * unless the cgroup is pinned (css_get).
         */
        if (!cgrp)
                return;

        WARN_ON_ONCE(!cpuctx->ctx.nr_cgroups);

        css = &cgrp->css;
        while (css) {
                struct perf_cgroup *pos = container_of(css, struct perf_cgroup, css);
                struct perf_cgroup_info *info = this_cpu_ptr(pos->info);

                __update_cgrp_time(info, cpuctx->ctx.timestamp, false);
                __store_release(&info->active, 1);

                css = css->parent;
        }
}

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * Nothing is done when this CPU's context has no cgroup, or when @task's
 * perf cgroup is the one already recorded.  Otherwise the cpu context is
 * scheduled out, cpuctx->cgrp is switched to @task's cgroup, and the
 * context is scheduled back in -- all with the context locks held and the
 * cgroup-carrying PMUs disabled.
 */
static void perf_cgroup_switch(struct task_struct *task)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_cgroup *cgrp;

        /*
         * cpuctx->cgrp is set when the first cgroup event enabled,
         * and is cleared when the last cgroup event disabled.
         */
        if (READ_ONCE(cpuctx->cgrp) == NULL)
                return;

        WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

        cgrp = perf_cgroup_from_task(task, NULL);
        if (READ_ONCE(cpuctx->cgrp) == cgrp)
                return;

        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_ctx_disable(&cpuctx->ctx, true);

        ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
        /*
         * must not be done before ctxswout due
         * to update_cgrp_time_from_cpuctx() in
         * ctx_sched_out()
         */
        cpuctx->cgrp = cgrp;
        /*
         * set cgrp before ctxsw in to allow
         * perf_cgroup_set_timestamp() in ctx_sched_in()
         * to not have to pass task around
         */
        ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);

        perf_ctx_enable(&cpuctx->ctx, true);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

/*
 * Make sure every possible CPU's perf_cpu_context has a heap array with
 * room for one entry per cgroup level of @css (the css itself and each
 * ancestor) plus one.  @event is not used.
 *
 * Returns 0 on success or -ENOMEM when an allocation fails; CPUs handled
 * before the failure keep their enlarged arrays.
 */
static int perf_cgroup_ensure_storage(struct perf_event *event,
                                struct cgroup_subsys_state *css)
{
        int wanted = 1;
        int cpu;

        /*
         * Allow storage to have sufficient space for an iterator for each
         * possibly nested cgroup plus an iterator for events with no cgroup.
         */
        for (; css; css = css->parent)
                wanted++;

        for_each_possible_cpu(cpu) {
                struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
                struct perf_event **spare;

                if (cpuctx->heap_size >= wanted)
                        continue;

                spare = kmalloc_node(wanted * sizeof(struct perf_event *),
                                     GFP_KERNEL, cpu_to_node(cpu));
                if (!spare)
                        return -ENOMEM;

                raw_spin_lock_irq(&cpuctx->ctx.lock);
                /* Re-check under the lock before installing the new array. */
                if (cpuctx->heap_size < wanted) {
                        swap(cpuctx->heap, spare);
                        /* The default array must not be freed below. */
                        if (spare == cpuctx->heap_default)
                                spare = NULL;
                        cpuctx->heap_size = wanted;
                }
                raw_spin_unlock_irq(&cpuctx->ctx.lock);

                /* Frees the old array, or the unused new one. */
                kfree(spare);
        }

        return 0;
}

/*
 * Attach @event to the perf cgroup identified by the cgroup directory @fd.
 * @fd:                file descriptor of the cgroup directory
 * @event:        event to attach
 * @attr:        not used here
 * @group_leader: leader of the group @event joins, or NULL
 *
 * On success event->cgrp holds a css reference, which is released later by
 * perf_detach_cgroup().  On every failure path no reference is left behind.
 *
 * Returns 0 on success, -EBADF for a bad @fd, the error from
 * css_tryget_online_from_dir() or perf_cgroup_ensure_storage(), or -EINVAL
 * when @group_leader monitors a different cgroup.
 */
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        struct perf_cgroup *cgrp;
        struct cgroup_subsys_state *css;
        struct fd f = fdget(fd);
        int ret = 0;

        if (!f.file)
                return -EBADF;

        css = css_tryget_online_from_dir(f.file->f_path.dentry,
                                         &perf_event_cgrp_subsys);
        if (IS_ERR(css)) {
                ret = PTR_ERR(css);
                goto out;
        }

        ret = perf_cgroup_ensure_storage(event, css);
        if (ret) {
                /*
                 * event->cgrp is not set yet, so nobody else will drop the
                 * reference taken above; release it here.
                 */
                css_put(css);
                goto out;
        }

        cgrp = container_of(css, struct perf_cgroup, css);
        event->cgrp = cgrp;

        /*
         * all events in a group must monitor
         * the same cgroup because a task belongs
         * to only one perf cgroup at a time
         */
        if (group_leader && group_leader->cgrp != cgrp) {
                /* Drops the css reference and clears event->cgrp. */
                perf_detach_cgroup(event);
                ret = -EINVAL;
        }
out:
        fdput(f);
        return ret;
}

/*
 * Account a cgroup event being added to @ctx: bump the cgroup counts of its
 * pmu context and of @ctx.  For the first cgroup event on @ctx, the cpu
 * context's cgroup is set from the current task.  Non-cgroup events are
 * ignored.
 */
static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
        if (!is_cgroup_event(event))
                return;

        event->pmu_ctx->nr_cgroups++;

        if (ctx->nr_cgroups++ == 0) {
                /*
                 * Because cgroup events are always per-cpu events,
                 * @ctx == &cpuctx->ctx.
                 */
                struct perf_cpu_context *cpuctx =
                        container_of(ctx, struct perf_cpu_context, ctx);

                cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
        }
}

/*
 * Account a cgroup event being removed from @ctx: drop the cgroup counts of
 * its pmu context and of @ctx.  When the last cgroup event on @ctx goes
 * away, the cpu context's cgroup is cleared.  Non-cgroup events are ignored.
 */
static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
        if (!is_cgroup_event(event))
                return;

        event->pmu_ctx->nr_cgroups--;

        if (--ctx->nr_cgroups == 0) {
                /*
                 * Because cgroup events are always per-cpu events,
                 * @ctx == &cpuctx->ctx.
                 */
                struct perf_cpu_context *cpuctx =
                        container_of(ctx, struct perf_cpu_context, ctx);

                cpuctx->cgrp = NULL;
        }
}

#else /* !CONFIG_CGROUP_PERF */

/* !CONFIG_CGROUP_PERF stub: without cgroup support every event matches. */
static inline bool
perf_cgroup_match(struct perf_event *event)
{
        return true;
}

/* !CONFIG_CGROUP_PERF stub: there is no cgroup reference to drop. */
static inline void perf_detach_cgroup(struct perf_event *event)
{}

/* !CONFIG_CGROUP_PERF stub: no event is ever a cgroup event. */
static inline int is_cgroup_event(struct perf_event *event)
{
        return 0;
}

/* !CONFIG_CGROUP_PERF stub: no cgroup time to update. */
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

/* !CONFIG_CGROUP_PERF stub: no cgroup time to update. */
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
                                                bool final)
{
}

/*
 * !CONFIG_CGROUP_PERF stub: attaching to a cgroup always fails with -EINVAL.
 * NOTE(review): the first parameter is declared "pid_t pid" here, while the
 * CONFIG_CGROUP_PERF version above declares it "int fd".
 */
static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        return -EINVAL;
}

/* !CONFIG_CGROUP_PERF stub: no cgroup timestamps to set. */
static inline void
perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
}

/* !CONFIG_CGROUP_PERF stub: cgroup time is always 0. */
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        return 0;
}

/* !CONFIG_CGROUP_PERF stub: cgroup time is always 0, regardless of @now. */
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
        return 0;
}

/* !CONFIG_CGROUP_PERF stub: no cgroup accounting when an event is added. */
static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
}

/* !CONFIG_CGROUP_PERF stub: no cgroup accounting when an event is removed. */
static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}

/* !CONFIG_CGROUP_PERF stub: nothing to reschedule on a cgroup change. */
static void perf_cgroup_switch(struct task_struct *task)
{
}
#endif

/*
 * set default to be dependent on timer tick just
 * like original code
 *
 * Used by __perf_mux_hrtimer_init() as the interval, in milliseconds, when
 * the PMU does not provide one.
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * hrtimer callback: rotate the context of the perf_cpu_pmu_context that
 * embeds @hr.  The timer is pushed forward by the configured interval and
 * restarted while perf_rotate_context() reports work; otherwise it is
 * marked inactive and not restarted.
 *
 * function must be called with interrupts disabled
 */
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
        struct perf_cpu_pmu_context *cpc =
                container_of(hr, struct perf_cpu_pmu_context, hrtimer);
        bool rearm;

        lockdep_assert_irqs_disabled();

        rearm = perf_rotate_context(cpc);

        raw_spin_lock(&cpc->hrtimer_lock);
        if (!rearm)
                cpc->hrtimer_active = 0;
        else
                hrtimer_forward_now(hr, cpc->hrtimer_interval);
        raw_spin_unlock(&cpc->hrtimer_lock);

        if (rearm)
                return HRTIMER_RESTART;

        return HRTIMER_NORESTART;
}

/*
 * Set up @cpc's multiplexing timer: choose the interval, initialise the
 * lock and the hrtimer, and install perf_mux_hrtimer_handler() as its
 * callback.  @cpu is not used.
 */
static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
{
        struct pmu *pmu = cpc->epc.pmu;
        u64 ms = pmu->hrtimer_interval_ms;

        /*
         * check default is sane, if not set then force to
         * default interval (1/tick); the PMU's setting is updated as well.
         */
        if (ms < 1) {
                pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
                ms = PERF_CPU_HRTIMER;
        }

        cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * ms);

        raw_spin_lock_init(&cpc->hrtimer_lock);
        hrtimer_init(&cpc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
        cpc->hrtimer.function = perf_mux_hrtimer_handler;
}

/*
 * Start @cpc's multiplexing timer unless it is already marked active.
 * Always returns 0.
 */
static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
        if (cpc->hrtimer_active)
                goto unlock;

        cpc->hrtimer_active = 1;
        hrtimer_forward_now(&cpc->hrtimer, cpc->hrtimer_interval);
        hrtimer_start_expires(&cpc->hrtimer, HRTIMER_MODE_ABS_PINNED_HARD);
unlock:
        raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);

        return 0;
}

/* void-pointer wrapper around perf_mux_hrtimer_restart(); @arg is the cpc. */
static int perf_mux_hrtimer_restart_ipi(void *arg)
{
        struct perf_cpu_pmu_context *cpc = arg;

        return perf_mux_hrtimer_restart(cpc);
}

/*
 * Nestable disable: increments this CPU's disable count for @pmu and calls
 * the PMU's pmu_disable() callback only on the 0 -> 1 transition.
 */
void perf_pmu_disable(struct pmu *pmu)
{
        int *nesting = this_cpu_ptr(pmu->pmu_disable_count);
        int previous = (*nesting)++;

        if (previous == 0)
                pmu->pmu_disable(pmu);
}

/*
 * Counterpart of perf_pmu_disable(): decrements this CPU's disable count
 * for @pmu and calls the PMU's pmu_enable() callback once it reaches 0.
 */
void perf_pmu_enable(struct pmu *pmu)
{
        int *nesting = this_cpu_ptr(pmu->pmu_disable_count);
        int remaining = --(*nesting);

        if (remaining == 0)
                pmu->pmu_enable(pmu);
}

/* Warn (once) if @pmu's disable count on this CPU is 0. */
static void perf_assert_pmu_disabled(struct pmu *pmu)
{
        int *nesting = this_cpu_ptr(pmu->pmu_disable_count);

        WARN_ON_ONCE(*nesting == 0);
}

/* Take a reference on @ctx; put_ctx() drops it. */
static void get_ctx(struct perf_event_context *ctx)
{
        refcount_inc(&ctx->refcount);
}

/*
 * Allocate a zeroed object from @pmu's task_ctx_cache.
 * Returns NULL when the PMU has no such cache or the allocation fails.
 */
static void *alloc_task_ctx_data(struct pmu *pmu)
{
        if (!pmu->task_ctx_cache)
                return NULL;

        return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
}

/*
 * Return @task_ctx_data to @pmu's task_ctx_cache.  A no-op when the PMU has
 * no cache or @task_ctx_data is NULL.
 */
static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
{
        if (!pmu->task_ctx_cache || !task_ctx_data)
                return;

        kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
}

/* RCU callback queued by put_ctx(): free the context that embeds @head. */
static void free_ctx(struct rcu_head *head)
{
        kfree(container_of(head, struct perf_event_context, rcu_head));
}

/*
 * Drop a reference on @ctx.
 *
 * When the last reference goes away this also drops the reference held on
 * the parent context (if any) and on the owning task (unless the task
 * pointer is NULL or TASK_TOMBSTONE), then frees @ctx via call_rcu().
 */
static void put_ctx(struct perf_event_context *ctx)
{
        if (!refcount_dec_and_test(&ctx->refcount))
                return;

        /* Last reference: release everything the context was holding. */
        if (ctx->parent_ctx)
                put_ctx(ctx->parent_ctx);

        if (ctx->task && ctx->task != TASK_TOMBSTONE)
                put_task_struct(ctx->task);

        call_rcu(&ctx->rcu_head, free_ctx);
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 * perf_pmu_migrate_context() we need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_context::mutex of the 'old' and 'new' ctx value.
 *
 * Lock ordering is by mutex address. There are two other sites where
 * perf_event_context::mutex nests and those are:
 *
 *  - perf_event_exit_task_context()        [ child , 0 ]
 *      perf_event_exit_event()
 *        put_event()                        [ parent, 1 ]
 *
 *  - perf_event_init_context()                [ parent, 0 ]
 *      inherit_task_group()
 *        inherit_group()
 *          inherit_event()
 *            perf_event_alloc()
 *              perf_init_event()
 *                perf_try_init_event()        [ child , 1 ]
 *
 * While it appears there is an obvious deadlock here -- the parent and child
 * nesting levels are inverted between the two. This is in fact safe because
 * life-time rules separate them. That is an exiting task cannot fork, and a
 * spawning task cannot (yet) exit.
 *
 * But remember that these are parent<->child context relations, and
 * migration does not affect children, therefore these two orderings should not
 * interact.
 *
 * The change in perf_event::ctx does not affect children (as claimed above)
 * because the sys_perf_event_open() case will install a new event and break
 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 * concerned with cpuctx and that doesn't have children.
 *
 * The places that change perf_event::ctx will issue:
 *
 *   perf_remove_from_context();
 *   synchronize_rcu();
 *   perf_install_in_context();
 *
 * to effect the change. The remove_from_context() + synchronize_rcu() should
 * quiesce the event, after which we can install it in the new location. This
 * means that only external vectors (perf_fops, prctl) can perturb the event
 * while in transit. Therefore all such accessors should also acquire
 * perf_event_context::mutex to serialize against this.
 *
 * However; because event->ctx can change while we're waiting to acquire
 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 * function.
 *
 * Lock order:
 *    exec_update_lock
 *        task_struct::perf_event_mutex
 *          perf_event_context::mutex
 *            perf_event::child_mutex;
 *              perf_event_context::lock
 *            perf_event::mmap_mutex
 *            mmap_lock
 *              perf_addr_filters_head::lock
 *
 *    cpu_hotplug_lock
 *      pmus_lock
 *          cpuctx->mutex / perf_event_context::mutex
 */
/*
 * Lock the context @event currently belongs to and return it.
 *
 * On return the caller holds ctx->mutex (acquired with lockdep subclass
 * @nesting) and a reference on the returned context; both are released by
 * perf_event_ctx_unlock().
 */
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
        struct perf_event_context *ctx;

again:
        /* Sample event->ctx under RCU and try to pin it with a reference. */
        rcu_read_lock();
        ctx = READ_ONCE(event->ctx);
        if (!refcount_inc_not_zero(&ctx->refcount)) {
                /* The refcount already dropped to zero; re-read and retry. */
                rcu_read_unlock();
                goto again;
        }
        rcu_read_unlock();

        mutex_lock_nested(&ctx->mutex, nesting);
        if (event->ctx != ctx) {
                /*
                 * event->ctx changed while we were waiting for the mutex;
                 * undo the lock and the reference and start over.
                 */
                mutex_unlock(&ctx->mutex);
                put_ctx(ctx);
                goto again;
        }

        return ctx;
}

/*
 * Convenience wrapper: perf_event_ctx_lock_nested() with nesting level 0.
 */
static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
        return perf_event_ctx_lock_nested(event, 0);
}

/*
 * Undo perf_event_ctx_lock{,_nested}(): release @ctx->mutex and drop the
 * context reference. @event is not used by the body.
 */
static void perf_event_ctx_unlock(struct perf_event *event,
                                  struct perf_event_context *ctx)
{
        mutex_unlock(&ctx->mutex);
        put_ctx(ctx);
}

/*
 * Detach @ctx from its parent context and bump ctx->generation.
 *
 * This must be done under the ctx->lock, such as to serialize against
 * context_equiv(). Because put_ctx() might end up taking scheduler related
 * locks, and ctx->lock nests inside those, the parent reference is not
 * dropped here: the former parent (or NULL) is returned and the caller is
 * responsible for the put_ctx().
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
        struct perf_event_context *old_parent = ctx->parent_ctx;

        lockdep_assert_held(&ctx->lock);

        if (old_parent)
                ctx->parent_ctx = NULL;

        /* The generation is bumped whether or not there was a parent. */
        ctx->generation += 1;

        return old_parent;
}

static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
                                enum pid_type type)
{
        u32 nr;
        /*
         * only top level events have the pid namespace they were created in
         */
        if (event->parent)
                event = event->parent;

        nr = __task_pid_nr_ns(p, type, event->ns);
        /* avoid -1 if it is idle thread or runs in another ns */
        if (!nr && !pid_alive(p))
                nr = -1;
        return nr;
}

/*
 * PIDTYPE_TGID number of task @p, resolved via perf_event_pid_type().
 */
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
        return perf_event_pid_type(event, p, PIDTYPE_TGID);
}

/*
 * PIDTYPE_PID number of task @p, resolved via perf_event_pid_type().
 */
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
        return perf_event_pid_type(event, p, PIDTYPE_PID);
}

/*
 * Id to report to userspace for @event: inherited events report the id of
 * their parent event, all others report their own id.
 */
static u64 primary_event_id(struct perf_event *event)
{
        if (event->parent)
                return event->parent->id;

        return event->id;
}

/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 *
 * On success the context is returned with ctx->lock held, interrupts
 * disabled (the previous state is saved in *@flags) and an extra reference
 * taken on it. NULL is returned, with interrupts restored and no lock held,
 * when the task has no context, the context is marked TASK_TOMBSTONE, or its
 * refcount has already dropped to zero.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
        struct perf_event_context *ctx;

retry:
        /*
         * One of the few rules of preemptible RCU is that one cannot do
         * rcu_read_unlock() while holding a scheduler (or nested) lock when
         * part of the read side critical section was irqs-enabled -- see
         * rcu_read_unlock_special().
         *
         * Since ctx->lock nests under rq->lock we must ensure the entire read
         * side critical section has interrupts disabled.
         */
        local_irq_save(*flags);
        rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
                 * get swapped for another underneath us by
                 * perf_event_task_sched_out, though the
                 * rcu_read_lock() protects us from any context
                 * getting freed.  Lock the context and check if it
                 * got swapped before we could get the lock, and retry
                 * if so.  If we locked the right context, then it
                 * can't get swapped on us any more.
                 */
                raw_spin_lock(&ctx->lock);
                if (ctx != rcu_dereference(task->perf_event_ctxp)) {
                        raw_spin_unlock(&ctx->lock);
                        rcu_read_unlock();
                        local_irq_restore(*flags);
                        goto retry;
                }

                /*
                 * Refuse contexts that are tombstoned or whose last
                 * reference is already gone; otherwise keep the lock and
                 * the freshly taken reference for the caller.
                 */
                if (ctx->task == TASK_TOMBSTONE ||
                    !refcount_inc_not_zero(&ctx->refcount)) {
                        raw_spin_unlock(&ctx->lock);
                        ctx = NULL;
                } else {
                        WARN_ON_ONCE(ctx->task != task);
                }
        }
        rcu_read_unlock();
        /* Interrupts stay disabled only when a locked context is returned. */
        if (!ctx)
                local_irq_restore(*flags);
        return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task)
{
        struct perf_event_context *ctx;
        unsigned long flags;

        ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
        return ctx;
}

/*
 * Lower @ctx->pin_count under ctx->lock; the counterpart of the increment
 * done by perf_pin_task_context(). The context reference is not touched.
 */
static void perf_unpin_context(struct perf_event_context *ctx)
{
        unsigned long irq_flags;

        raw_spin_lock_irqsave(&ctx->lock, irq_flags);
        ctx->pin_count--;
        raw_spin_unlock_irqrestore(&ctx->lock, irq_flags);
}

/*
 * Update the record of the current time in a context.
 *
 * @ctx: context to update; ctx->lock must be held.
 * @adv: when true, ctx->time is advanced by the time elapsed since
 *       ctx->timestamp; when false only the timestamp (and the derived
 *       timeoffset) are refreshed.
 */
static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
        u64 now = perf_clock();

        lockdep_assert_held(&ctx->lock);

        if (adv)
                ctx->time += now - ctx->timestamp;
        ctx->timestamp = now;

        /*
         * The above: time' = time + (now - timestamp), can be re-arranged
         * into: time' = now + (time - timestamp), which gives a single value
         * offset to compute future time without holding any locks.
         *
         * See perf_event_time_now(), which can be used from NMI context where
         * it's (obviously) not possible to acquire ctx->lock in order to read
         * both the above values in a consistent manner.
         */
        WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
}

/*
 * Advance the context time of @ctx up to now; shorthand for
 * __update_context_time() with @adv set.
 */
static void update_context_time(struct perf_event_context *ctx)
{
        __update_context_time(ctx, true);
}

/*
 * Time value to use for @event: 0 when the event has no context, the cgroup
 * event time for cgroup events, and the recorded ctx->time otherwise.
 */
static u64 perf_event_time(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        if (unlikely(!ctx))
                return 0;

        return is_cgroup_event(event) ? perf_cgroup_event_time(event)
                                      : ctx->time;
}

/*
 * Like perf_event_time(), but computed for the caller supplied clock value
 * @now instead of relying on the last recorded ctx->time.
 *
 * While the context has EVENT_TIME set in ctx->is_active the result is
 * @now + ctx->timeoffset (see __update_context_time()); otherwise the
 * recorded ctx->time is returned. Returns 0 when the event has no context.
 */
static u64 perf_event_time_now(struct perf_event *event, u64 now)
{
        struct perf_event_context *ctx = event->ctx;

        if (unlikely(!ctx))
                return 0;

        if (is_cgroup_event(event))
                return perf_cgroup_event_time_now(event, now);

        if (__load_acquire(&ctx->is_active) & EVENT_TIME)
                return now + READ_ONCE(ctx->timeoffset);

        return ctx->time;
}

/*
 * Compute the event_type_t flags describing @event.
 *
 * This is really the 'group type': the pinned/flexible decision is taken
 * from the group leader, because if the leader is pinned, so are its
 * siblings. EVENT_CPU is added when the event's context has no task.
 * Must be called with event->ctx->lock held.
 */
static enum event_type_t get_event_type(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *leader = event->group_leader;
        enum event_type_t event_type;

        lockdep_assert_held(&ctx->lock);

        if (leader->attr.pinned)
                event_type = EVENT_PINNED;
        else
                event_type = EVENT_FLEXIBLE;

        if (!ctx->task)
                event_type |= EVENT_CPU;

        return event_type;
}

/*
 * Helper function to initialize event group nodes: marks @event's
 * group_node as not being in any tree and resets its group_index.
 */
static void init_event_group(struct perf_event *event)
{
        RB_CLEAR_NODE(&event->group_node);
        event->group_index = 0;
}

/*
 * Pick the groups tree of @ctx that @event belongs to: pinned_groups when
 * the event's attr.pinned bit is set, flexible_groups otherwise.
 */
static struct perf_event_groups *
get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        return event->attr.pinned ? &ctx->pinned_groups : &ctx->flexible_groups;
}

/*
 * Helper function to initialize perf_event_groups trees: starts @groups
 * with an empty rb-tree and an index counter of zero.
 */
static void perf_event_groups_init(struct perf_event_groups *groups)
{
        groups->tree = RB_ROOT;
        groups->index = 0;
}

/*
 * Return the cgroup @event is attached to, or NULL when it has none (always
 * NULL when CONFIG_CGROUP_PERF is not set).
 */
static inline struct cgroup *event_cgroup(const struct perf_event *event)
{
#ifdef CONFIG_CGROUP_PERF
        if (event->cgrp)
                return event->cgrp->css.cgroup;
#endif

        return NULL;
}

/*
 * Compare function for event groups;
 *
 * Implements a complex key that sorts by CPU first, then by PMU (skipped
 * when @left_pmu is NULL), then -- with CONFIG_CGROUP_PERF -- by cgroup
 * (events without a cgroup come first, distinct cgroups are ordered by
 * cgroup id), and finally by the virtual group index, which provides
 * ordering when rotating groups for the same CPU.
 *
 * Returns -1, 0 or 1 when the left key sorts before, equal to or after
 * @right.
 */
static __always_inline int
perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
                      const struct cgroup *left_cgroup, const u64 left_group_index,
                      const struct perf_event *right)
{
        if (left_cpu != right->cpu)
                return left_cpu < right->cpu ? -1 : 1;

        if (left_pmu && left_pmu != right->pmu_ctx->pmu)
                return left_pmu < right->pmu_ctx->pmu ? -1 : 1;

#ifdef CONFIG_CGROUP_PERF
        {
                const struct cgroup *right_cgroup = event_cgroup(right);

                if (left_cgroup != right_cgroup) {
                        /* A side without a cgroup sorts before one with. */
                        if (!left_cgroup)
                                return -1;
                        if (!right_cgroup)
                                return 1;

                        /* Two dissimilar cgroups, order by id. */
                        return cgroup_id(left_cgroup) < cgroup_id(right_cgroup) ? -1 : 1;
                }
        }
#endif

        if (left_group_index != right->group_index)
                return left_group_index < right->group_index ? -1 : 1;

        return 0;
}

/* Map an rb-tree node back to the perf_event embedding it as group_node. */
#define __node_2_pe(node) \
        rb_entry((node), struct perf_event, group_node)

/*
 * rb_add() ordering callback: true when the event embedding node @a sorts
 * strictly before the event embedding node @b according to
 * perf_event_groups_cmp().
 */
static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
{
        struct perf_event *lhs = __node_2_pe(a);
        int order;

        order = perf_event_groups_cmp(lhs->cpu, lhs->pmu_ctx->pmu,
                                      event_cgroup(lhs), lhs->group_index,
                                      __node_2_pe(b));

        return order < 0;
}

/*
 * Partial lookup key for the groups tree, consumed by __group_cmp() and
 * __group_cmp_ignore_cgroup(). It carries no group_index, so it matches a
 * whole subtree rather than a single event.
 */
struct __group_key {
        int cpu;                /* compared against perf_event::cpu */
        struct pmu *pmu;        /* compared against pmu_ctx->pmu; NULL skips the PMU check */
        struct cgroup *cgroup;  /* compared against event_cgroup() of the event */
};

/*
 * rb-tree search callback comparing a struct __group_key against the event
 * embedding @node.
 *
 * Partial/subtree match on @cpu, @pmu and @cgroup; the group index is
 * neutralised by feeding the node's own group_index as the left-hand value.
 */
static inline int __group_cmp(const void *key, const struct rb_node *node)
{
        const struct __group_key *want = key;
        const struct perf_event *have = __node_2_pe(node);

        return perf_event_groups_cmp(want->cpu, want->pmu, want->cgroup,
                                     have->group_index, have);
}

/*
 * Like __group_cmp(), but matches on @cpu and @pmu only: both the cgroup and
 * the group index are neutralised by feeding the node's own values as the
 * left-hand side of the comparison.
 */
static inline int
__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
{
        const struct __group_key *want = key;
        const struct perf_event *have = __node_2_pe(node);

        return perf_event_groups_cmp(want->cpu, want->pmu, event_cgroup(have),
                                     have->group_index, have);
}

/*
 * Insert @event into @groups' tree.
 *
 * The key is {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event),
 * group_index}, where group_index is a freshly incremented @groups->index.
 * This places the event last inside its {cpu,pmu,cgroup} subtree.
 */
static void
perf_event_groups_insert(struct perf_event_groups *groups,
                         struct perf_event *event)
{
        groups->index++;
        event->group_index = groups->index;

        rb_add(&event->group_node, &groups->tree, __group_less);
}

/*
 * Helper function to insert @event into whichever of @ctx's groups trees
 * (pinned or flexible) get_event_groups() selects for it.
 */
static void
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        perf_event_groups_insert(get_event_groups(event, ctx), event);
}

/*
 * Delete a group from a tree.
 *
 * Warns (once) if @event is not currently linked into a tree or if @groups'
 * tree is empty. The event's group node state is reset afterwards.
 */
static void
perf_event_groups_delete(struct perf_event_groups *groups,
                         struct perf_event *event)
{
        WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
                     RB_EMPTY_ROOT(&groups->tree));

        rb_erase(&event->group_node, &groups->tree);
        /* Leave the node cleared so a later RB_EMPTY_NODE() check sees it. */
        init_event_group(event);
}

/*
 * Helper function to remove @event from whichever of @ctx's groups trees
 * (pinned or flexible) get_event_groups() selects for it.
 */
static void
del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        perf_event_groups_delete(get_event_groups(event, ctx), event);
}

/*
 * Get the leftmost event in the {cpu,pmu,cgroup} subtree of @groups, or
 * NULL when no event matches the key.
 */
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
                        struct pmu *pmu, struct cgroup *cgrp)
{
        struct __group_key key = {
                .cpu = cpu,
                .pmu = pmu,
                .cgroup = cgrp,
        };
        struct rb_node *first = rb_find_first(&key, &groups->tree, __group_cmp);

        return first ? __node_2_pe(first) : NULL;
}

/*
 * Get the event following @event that still matches the key
 * {@event->cpu, @pmu, event_cgroup(@event)}, or NULL when there is none.
 */
static struct perf_event *
perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
{
        struct __group_key key = {
                .cpu = event->cpu,
                .pmu = pmu,
                .cgroup = event_cgroup(event),
        };
        struct rb_node *succ = rb_next_match(&key, &event->group_node, __group_cmp);

        return succ ? __node_2_pe(succ) : NULL;
}

/*
 * Iterate over the events in @groups that match {@cpu, @pmu} and have no
 * cgroup (the lookup starts with a NULL cgroup key); @event is the cursor.
 */
#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu)                \
        for (event = perf_event_groups_first(groups, cpu, pmu, NULL);        \
             event; event = perf_event_groups_next(event, pmu))

/*
 * Iterate through the whole groups tree in rb-tree order; @event is the
 * cursor and becomes NULL once the tree is exhausted.
 */
#define perf_event_groups_for_each(event, groups)                        \
        for (event = rb_entry_safe(rb_first(&((groups)->tree)),                \
                                typeof(*event), group_node); event;        \
                event = rb_entry_safe(rb_next(&event->group_node),        \
                                typeof(*event), group_node))

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 *
 * Marks the event PERF_ATTACH_CONTEXT, links it into ctx->event_list,
 * updates the context and pmu_ctx counters and bumps ctx->generation.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
        lockdep_assert_held(&ctx->lock);

        /* Attaching twice is a bug; warn but carry on. */
        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
        event->attach_state |= PERF_ATTACH_CONTEXT;

        event->tstamp = perf_event_time(event);

        /*
         * If we're a stand alone event or group leader, we go to the context
         * list, group events are kept attached to the group so that
         * perf_group_detach can, at all times, locate all siblings.
         */
        if (event->group_leader == event) {
                event->group_caps = event->event_caps;
                add_event_to_groups(event, ctx);
        }

        list_add_rcu(&event->event_entry, &ctx->event_list);
        ctx->nr_events++;
        if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
                ctx->nr_user++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;

        /* Only events that are not OFF (or below) count as cgroup-enabled. */
        if (event->state > PERF_EVENT_STATE_OFF)
                perf_cgroup_event_enable(event, ctx);

        ctx->generation++;
        event->pmu_ctx->nr_events++;
}

/*
 * Initialize event state based on perf_event_attr::disabled: events created
 * disabled start out OFF, all others start out INACTIVE.
 */
static inline void perf_event__state_init(struct perf_event *event)
{
        if (event->attr.disabled)
                event->state = PERF_EVENT_STATE_OFF;
        else
                event->state = PERF_EVENT_STATE_INACTIVE;
}

/*
 * Size in bytes of the read() payload for an event using @read_format in a
 * group with @nr_siblings siblings.
 *
 * The payload consists of a fixed part (optional enabled/running times and,
 * for PERF_FORMAT_GROUP, one extra u64) plus one entry per reported event.
 * Each entry holds the value and, optionally, the id and lost count.
 * Siblings are only counted when PERF_FORMAT_GROUP is set.
 */
static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
        int per_event = sizeof(u64);        /* the counter value itself */
        int fixed = 0;
        int nr_entries = 1;

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                fixed += sizeof(u64);

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                fixed += sizeof(u64);

        if (read_format & PERF_FORMAT_ID)
                per_event += sizeof(u64);

        if (read_format & PERF_FORMAT_LOST)
                per_event += sizeof(u64);

        if (read_format & PERF_FORMAT_GROUP) {
                nr_entries += nr_siblings;
                fixed += sizeof(u64);
        }

        /*
         * Since perf_event_validate_size() limits this to 16k and inhibits
         * adding more siblings, this will never overflow.
         */
        return fixed + nr_entries * per_event;
}

/*
 * Compute event->header_size: the summed size of the sample fields selected
 * by @sample_type that are handled here. PERF_SAMPLE_READ contributes
 * event->read_size, so that must be up to date before calling this.
 */
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
        struct perf_sample_data *sample;        /* only used for sizeof() */
        u16 bytes = 0;

        if (sample_type & PERF_SAMPLE_IP)
                bytes += sizeof(sample->ip);

        if (sample_type & PERF_SAMPLE_ADDR)
                bytes += sizeof(sample->addr);

        if (sample_type & PERF_SAMPLE_PERIOD)
                bytes += sizeof(sample->period);

        if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
                bytes += sizeof(sample->weight.full);

        if (sample_type & PERF_SAMPLE_READ)
                bytes += event->read_size;

        if (sample_type & PERF_SAMPLE_DATA_SRC)
                bytes += sizeof(sample->data_src.val);

        if (sample_type & PERF_SAMPLE_TRANSACTION)
                bytes += sizeof(sample->txn);

        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                bytes += sizeof(sample->phys_addr);

        if (sample_type & PERF_SAMPLE_CGROUP)
                bytes += sizeof(sample->cgroup);

        if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
                bytes += sizeof(sample->data_page_size);

        if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
                bytes += sizeof(sample->code_page_size);

        event->header_size = bytes;
}

/*
 * Recompute event->read_size and event->header_size.
 *
 * Called at perf_event creation and when events are attached/detached from a
 * group, since the read size depends on the leader's sibling count.
 */
static void perf_event__header_size(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;

        /* read_size feeds into the header size, so compute it first. */
        event->read_size = __perf_event_read_size(event->attr.read_format,
                                                  leader->nr_siblings);

        __perf_event_header_size(event, event->attr.sample_type);
}

/*
 * Compute event->id_header_size: the summed size of the identification
 * fields (tid, time, identifier, id, stream id, cpu) selected by the event's
 * attr.sample_type.
 */
static void perf_event__id_header_size(struct perf_event *event)
{
        struct perf_sample_data *sample;        /* only used for sizeof() */
        u64 sample_type = event->attr.sample_type;
        u16 bytes = 0;

        if (sample_type & PERF_SAMPLE_TID)
                bytes += sizeof(sample->tid_entry);

        if (sample_type & PERF_SAMPLE_TIME)
                bytes += sizeof(sample->time);

        if (sample_type & PERF_SAMPLE_IDENTIFIER)
                bytes += sizeof(sample->id);

        if (sample_type & PERF_SAMPLE_ID)
                bytes += sizeof(sample->id);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                bytes += sizeof(sample->stream_id);

        if (sample_type & PERF_SAMPLE_CPU)
                bytes += sizeof(sample->cpu_entry);

        event->id_header_size = bytes;
}

/*
 * Check that adding an event to the group does not result in anybody
 * overflowing the 64k event limit imposed by the output buffer.
 *
 * Specifically, check that the read_size for the event does not exceed 16k,
 * read_size being the one term that grows with groups size. Since read_size
 * depends on per-event read_format, also (re)check the existing events.
 *
 * This leaves 48k for the constant size fields and things like callchains,
 * branch stacks and register sets.
 *
 * Returns true when the group may grow by one event, false otherwise.
 */
static bool perf_event_validate_size(struct perf_event *event)
{
        struct perf_event *group_leader = event->group_leader;
        struct perf_event *sibling;
        const int max_read_size = 16*1024;
        int nr_after = group_leader->nr_siblings + 1;

        if (__perf_event_read_size(event->attr.read_format, nr_after) > max_read_size)
                return false;

        if (__perf_event_read_size(group_leader->attr.read_format, nr_after) > max_read_size)
                return false;

        /*
         * When creating a new group leader, group_leader->ctx is initialized
         * after the size has been validated, but we cannot safely use
         * for_each_sibling_event() until group_leader->ctx is set. A new group
         * leader cannot have any siblings yet, so we can safely skip checking
         * the non-existent siblings.
         */
        if (event == group_leader)
                return true;

        for_each_sibling_event(sibling, group_leader) {
                if (__perf_event_read_size(sibling->attr.read_format, nr_after) > max_read_size)
                        return false;
        }

        return true;
}

/*
 * Attach @event to its group (event->group_leader).
 *
 * Marks the event PERF_ATTACH_GROUP; for a sibling it additionally links the
 * event onto the leader's sibling list, updates the leader's caps and
 * counters, and recomputes the header sizes of the whole group.
 * Must be called with event->ctx->lock held.
 */
static void perf_group_attach(struct perf_event *event)
{
        struct perf_event *group_leader = event->group_leader, *pos;

        lockdep_assert_held(&event->ctx->lock);

        /*
         * We can have double attach due to group movement (move_group) in
         * perf_event_open().
         */
        if (event->attach_state & PERF_ATTACH_GROUP)
                return;

        event->attach_state |= PERF_ATTACH_GROUP;

        /* A leader (or stand alone event) has no sibling list to join. */
        if (group_leader == event)
                return;

        WARN_ON_ONCE(group_leader->ctx != event->ctx);

        /* The group only keeps the capabilities shared by all its members. */
        group_leader->group_caps &= event->event_caps;

        list_add_tail(&event->sibling_list, &group_leader->sibling_list);
        group_leader->nr_siblings++;
        group_leader->group_generation++;

        /* nr_siblings changed, so every member's read/header size did too. */
        perf_event__header_size(group_leader);

        for_each_sibling_event(pos, group_leader)
                perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 *
 * Clears PERF_ATTACH_CONTEXT, unlinks the event from ctx->event_list (and
 * from the groups tree if it is a group leader), updates the context and
 * pmu_ctx counters and bumps ctx->generation.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);

        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_CONTEXT))
                return;

        event->attach_state &= ~PERF_ATTACH_CONTEXT;

        ctx->nr_events--;
        if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
                ctx->nr_user--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;

        list_del_rcu(&event->event_entry);

        /* Only group leaders live in the pinned/flexible groups trees. */
        if (event->group_leader == event)
                del_event_from_groups(event, ctx);

        /*
         * If event was in error state, then keep it
         * that way, otherwise bogus counts will be
         * returned on read(). The only way to get out
         * of error state is by explicit re-enabling
         * of the event
         */
        if (event->state > PERF_EVENT_STATE_OFF) {
                perf_cgroup_event_disable(event, ctx);
                perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        }

        ctx->generation++;
        event->pmu_ctx->nr_events--;
}

/*
 * Ask @event's PMU whether @aux_event is acceptable as the AUX event for
 * @event's aux_output.
 *
 * Returns 0 when @aux_event is not an AUX event (has_aux() fails) or when
 * the PMU provides no aux_output_match() callback; otherwise returns the
 * callback's verdict.
 */
static int
perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
{
        if (!has_aux(aux_event) || !event->pmu->aux_output_match)
                return 0;

        return event->pmu->aux_output_match(aux_event);
}

static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
                            struct perf_event_context *ctx);

/*
 * Tear down the AUX links involving @event; the counterpart of
 * perf_get_aux_event().
 *
 * If @event itself holds a link to an aux_event, clear it and drop the
 * reference taken on that aux_event. Otherwise treat @event as a potential
 * aux_event: for every sibling of its group leader that still points at it,
 * clear the link, drop the reference the sibling held on @event, schedule
 * the sibling out and move the sibling into the ERROR state.
 */
static void perf_put_aux_event(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *iter;

        /*
         * If event uses aux_event tear down the link
         */
        if (event->aux_event) {
                iter = event->aux_event;
                event->aux_event = NULL;
                put_event(iter);
                return;
        }

        /*
         * If the event is an aux_event, tear down all links to
         * it from other events.
         */
        for_each_sibling_event(iter, event->group_leader) {
                if (iter->aux_event != event)
                        continue;

                iter->aux_event = NULL;
                put_event(event);

                /*
                 * If it's ACTIVE, schedule it out and put it into ERROR
                 * state so that we don't try to schedule it again. Note
                 * that perf_event_enable() will clear the ERROR status.
                 *
                 * The state change must target @iter, the event that just
                 * lost its aux_event, not @event (the aux event being torn
                 * down).
                 */
                event_sched_out(iter, ctx);
                perf_event_set_state(iter, PERF_EVENT_STATE_ERROR);
        }
}

static bool perf_need_aux_event(struct perf_event *event)
{
        return !!event->attr.aux_output || !!event->attr.aux_sample_size;
}

/*
 * Try to link @event to @group_leader as its AUX event.
 *
 * Returns 1 when the link was established (a reference on @group_leader is
 * then held through event->aux_event), 0 when it was not.
 */
static int perf_get_aux_event(struct perf_event *event,
                              struct perf_event *group_leader)
{
        /*
         * Our group leader must be an aux event if we want to be
         * an aux_output. This way, the aux event will precede its
         * aux_output events in the group, and therefore will always
         * schedule first.
         */
        if (!group_leader)
                return 0;

        if (event->attr.aux_output) {
                /*
                 * aux_output and aux_sample_size are mutually exclusive.
                 */
                if (event->attr.aux_sample_size)
                        return 0;

                if (!perf_aux_output_match(event, group_leader))
                        return 0;
        }

        /* AUX sampling needs a leader whose PMU can snapshot the AUX data. */
        if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
                return 0;

        /* Pin the leader; fails if its last reference is already gone. */
        if (!atomic_long_inc_not_zero(&group_leader->refcount))
                return 0;

        /*
         * Link aux_outputs to their aux event; this is undone in
         * perf_group_detach() by perf_put_aux_event(). When the
         * group is torn down, the aux_output events lose their
         * link to the aux_event and can't schedule any more.
         */
        event->aux_event = group_leader;

        return 1;
}

/*
 * Active list of @event's pmu_ctx that the event belongs on: pinned_active
 * for pinned events, flexible_active for all others.
 */
static inline struct list_head *get_event_list(struct perf_event *event)
{
        if (event->attr.pinned)
                return &event->pmu_ctx->pinned_active;

        return &event->pmu_ctx->flexible_active;
}

/*
 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
 * cannot exist on their own, schedule them out and move them into the ERROR
 * state. Also see _perf_event_enable(), it will not be able to recover
 * this ERROR state.
 *
 * The capability check itself is done by the caller (perf_group_detach()).
 */
static inline void perf_remove_sibling_event(struct perf_event *event)
{
        event_sched_out(event, event->ctx);
        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}

/*
 * Detach @event from its group.
 *
 * A sibling is simply unlinked from its leader. When a leader is detached,
 * each of its siblings is promoted to a stand alone event (or, if it has
 * PERF_EV_CAP_SIBLING, first scheduled out and put into the ERROR state).
 * AUX links are torn down and the header sizes of the remaining group are
 * recomputed. Must be called with event->ctx->lock held.
 */
static void perf_group_detach(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event *sibling, *tmp;
        struct perf_event_context *ctx = event->ctx;

        lockdep_assert_held(&ctx->lock);

        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_GROUP))
                return;

        event->attach_state &= ~PERF_ATTACH_GROUP;

        perf_put_aux_event(event);

        /*
         * If this is a sibling, remove it from its group.
         */
        if (leader != event) {
                list_del_init(&event->sibling_list);
                event->group_leader->nr_siblings--;
                event->group_leader->group_generation++;
                goto out;
        }

        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {

                if (sibling->event_caps & PERF_EV_CAP_SIBLING)
                        perf_remove_sibling_event(sibling);

                /* From here on the sibling is its own group leader. */
                sibling->group_leader = sibling;
                list_del_init(&sibling->sibling_list);

                /* Inherit group flags from the previous leader */
                sibling->group_caps = event->group_caps;

                if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
                        /* New leaders must be findable via the groups tree. */
                        add_event_to_groups(sibling, event->ctx);

                        if (sibling->state == PERF_EVENT_STATE_ACTIVE)
                                list_add_tail(&sibling->active_list, get_event_list(sibling));
                }

                WARN_ON_ONCE(sibling->ctx != event->ctx);
        }

out:
        /* Group membership changed; refresh read/header sizes of the group. */
        for_each_sibling_event(tmp, leader)
                perf_event__header_size(tmp);

        perf_event__header_size(leader);
}

static void sync_child_event(struct perf_event *child_event);

/*
 * Unlink an inherited @event from its parent's child list.
 *
 * PERF_ATTACH_CHILD is cleared on the way, so a later call for the same
 * event returns immediately. The parent's child_mutex is expected to be
 * held (checked through lockdep).
 */
static void perf_child_detach(struct perf_event *event)
{
        struct perf_event *parent_event = event->parent;

        /* Not (or no longer) attached as a child: nothing to undo. */
        if ((event->attach_state & PERF_ATTACH_CHILD) == 0)
                return;

        event->attach_state &= ~PERF_ATTACH_CHILD;

        /* A child-attached event without a parent is unexpected. */
        if (WARN_ON_ONCE(!parent_event))
                return;

        lockdep_assert_held(&parent_event->child_mutex);

        /* Sync the child first, then drop it from the parent's list. */
        sync_child_event(event);
        list_del_init(&event->child_list);
}

/*
 * An event is treated as orphaned once its state is PERF_EVENT_STATE_DEAD,
 * the state __perf_remove_from_context() assigns for DETACH_DEAD.
 */
static bool is_orphaned_event(struct perf_event *event)
{
        return event->state == PERF_EVENT_STATE_DEAD;
}

/*
 * Decide whether @event may run on the current CPU.
 *
 * Returns 1 when the event is not bound to a CPU (cpu == -1) or is bound to
 * this one, and perf_cgroup_match() accepts it as well; 0 otherwise.
 */
static inline int
event_filter_match(struct perf_event *event)
{
        /* Bound to a specific CPU, and it is not the one we are running on. */
        if (event->cpu != -1 && event->cpu != smp_processor_id())
                return 0;

        return perf_cgroup_match(event) ? 1 : 0;
}

/*
 * Take a single ACTIVE @event off its PMU on this CPU.
 *
 * The event ends up INACTIVE, or OFF when a disable was pending. Events that
 * are not ACTIVE are left untouched. ctx->lock must be held and @ctx must be
 * the event's own context.
 */
static void
event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_event_pmu_context *epc = event->pmu_ctx;
        struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
        enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;

        // XXX cpc serialization, probably per-cpu IRQ disabled

        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        /*
         * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
         * we can schedule events _OUT_ individually through things like
         * __perf_remove_from_context().
         */
        list_del_init(&event->active_list);

        perf_pmu_disable(event->pmu);

        event->pmu->del(event, 0);
        event->oncpu = -1;

        /* A deferred disable is consumed here: the final state becomes OFF. */
        if (event->pending_disable) {
                event->pending_disable = 0;
                perf_cgroup_event_disable(event, ctx);
                state = PERF_EVENT_STATE_OFF;
        }

        /*
         * A pending sigtrap is either handed over to task work (taking an
         * extra event reference for it) or, when the event is going OFF or
         * task work is already queued, its nr_pending count is dropped here.
         */
        if (event->pending_sigtrap) {
                bool dec = true;

                event->pending_sigtrap = 0;
                if (state != PERF_EVENT_STATE_OFF &&
                    !event->pending_work) {
                        event->pending_work = 1;
                        dec = false;
                        WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
                        task_work_add(current, &event->pending_task, TWA_RESUME);
                }
                if (dec)
                        local_dec(&event->ctx->nr_pending);
        }

        perf_event_set_state(event, state);

        /* Reverse the accounting done by event_sched_in(). */
        if (!is_software_event(event))
                cpc->active_oncpu--;
        if (event->attr.freq && event->attr.sample_freq) {
                ctx->nr_freq--;
                epc->nr_freq--;
        }
        /* Exclusivity ends with an exclusive event or the last active one. */
        if (event->attr.exclusive || !cpc->active_oncpu)
                cpc->exclusive = 0;

        perf_pmu_enable(event->pmu);
}

/*
 * Schedule out a whole group: the leader @group_event first, then each of
 * its siblings. Groups whose leader is not ACTIVE are skipped. The PMU is
 * expected to be disabled already (see perf_assert_pmu_disabled()).
 */
static void
group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
{
        struct perf_event *sibling;

        /* Only an ACTIVE leader means there is something to take off. */
        if (group_event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);

        /* Leader goes first ... */
        event_sched_out(group_event, ctx);

        /* ... followed by every sibling, if there are any. */
        for_each_sibling_event(sibling, group_event)
                event_sched_out(sibling, ctx);
}

/*
 * Flag bits for perf_remove_from_context(); they reach
 * __perf_remove_from_context() through its opaque info argument.
 *
 *  DETACH_GROUP - also call perf_group_detach() on the event
 *  DETACH_CHILD - also call perf_child_detach() on the event
 *  DETACH_DEAD  - force the event OFF while scheduling it out and leave it
 *                 in PERF_EVENT_STATE_DEAD afterwards
 */
#define DETACH_GROUP        0x01UL
#define DETACH_CHILD        0x02UL
#define DETACH_DEAD        0x04UL

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static void
__perf_remove_from_context(struct perf_event *event,
                           struct perf_cpu_context *cpuctx,
                           struct perf_event_context *ctx,
                           void *info)
{
        struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
        /* @info is not a pointer: it carries the DETACH_* flag bits. */
        unsigned long flags = (unsigned long)info;

        if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_cpuctx(cpuctx, false);
        }

        /*
         * Ensure event_sched_out() switches to OFF, at the very least
         * this avoids raising perf_pending_task() at this time.
         */
        if (flags & DETACH_DEAD)
                event->pending_disable = 1;
        event_sched_out(event, ctx);
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
        if (flags & DETACH_CHILD)
                perf_child_detach(event);
        list_del_event(event, ctx);
        if (flags & DETACH_DEAD)
                event->state = PERF_EVENT_STATE_DEAD;

        /*
         * The per-PMU context has no events left: clear its rotation flag
         * and, for an active task context, drop this CPU's task_epc pointer.
         */
        if (!pmu_ctx->nr_events) {
                pmu_ctx->rotate_necessary = 0;

                if (ctx->task && ctx->is_active) {
                        struct perf_cpu_pmu_context *cpc;

                        cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
                        WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
                        cpc->task_epc = NULL;
                }
        }

        /*
         * The whole context is empty now: mark it inactive and, if it is a
         * task context, unhook it from the CPU context.
         */
        if (!ctx->nr_events && ctx->is_active) {
                if (ctx == &cpuctx->ctx)
                        update_cgrp_time_from_cpuctx(cpuctx, true);

                ctx->is_active = 0;
                if (ctx->task) {
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                        cpuctx->task_ctx = NULL;
                }
        }
}

/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
        struct perf_event_context *ctx = event->ctx;
        void *info = (void *)flags;

        lockdep_assert_held(&ctx->mutex);

        /*
         * perf_event_exit_task() means this function, unlike every other
         * event_function_call() user, has to cope with TASK_TOMBSTONE. An
         * inactive context is therefore handled right here under ctx->lock;
         * only an active one goes through event_function_call().
         */
        raw_spin_lock_irq(&ctx->lock);
        if (ctx->is_active) {
                raw_spin_unlock_irq(&ctx->lock);
                event_function_call(event, __perf_remove_from_context, info);
                return;
        }

        __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
                                   ctx, info);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to disable a performance event
 */
static void __perf_event_disable(struct perf_event *event,
                                 struct perf_cpu_context *cpuctx,
                                 struct perf_event_context *ctx,
                                 void *info)
{
        /* States below INACTIVE need no further disabling. */
        if (event->state < PERF_EVENT_STATE_INACTIVE)
                return;

        if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }

        perf_pmu_disable(event->pmu_ctx->pmu);

        /*
         * A group leader takes its siblings off the PMU with it; only the
         * event itself is moved to OFF below.
         */
        if (event == event->group_leader)
                group_sched_out(event, ctx);
        else
                event_sched_out(event, ctx);

        perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        perf_cgroup_event_disable(event, ctx);

        perf_pmu_enable(event->pmu_ctx->pmu);
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in perf_event_exit_event().
 *
 * When called from perf_pending_irq it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
static void _perf_event_disable(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        raw_spin_lock_irq(&ctx->lock);
        if (event->state <= PERF_EVENT_STATE_OFF) {
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        raw_spin_unlock_irq(&ctx->lock);

        event_function_call(event, __perf_event_disable, NULL);
}

/*
 * Disable @event by running __perf_event_disable() through
 * event_function_local() instead of event_function_call().
 */
void perf_event_disable_local(struct perf_event *event)
{
        event_function_local(event, __perf_event_disable, NULL);
}

/*
 * Strictly speaking kernel users cannot create groups and therefore this
 * interface does not need the perf_event_ctx_lock() magic.
 */
void perf_event_disable(struct perf_event *event)
{
        struct perf_event_context *ctx;

        /*
         * perf_event_ctx_lock() hands back the context that has to be passed
         * to the matching perf_event_ctx_unlock().
         */
        ctx = perf_event_ctx_lock(event);
        _perf_event_disable(event);
        perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_disable);

/*
 * Request a disable without carrying it out here: mark the event with
 * pending_disable and queue its pending_irq irq_work. event_sched_out()
 * turns a set pending_disable into PERF_EVENT_STATE_OFF when it runs.
 */
void perf_event_disable_inatomic(struct perf_event *event)
{
        event->pending_disable = 1;
        irq_work_queue(&event->pending_irq);
}

/*
 * Sentinel value of event->hw.interrupts: event_sched_in() treats it as
 * "throttled", logs the unthrottle and resets the counter to zero.
 */
#define MAX_INTERRUPTS (~0ULL)

/* Forward declarations; both are called from event_sched_in(). */
static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);

/*
 * Try to put a single @event onto its PMU on this CPU.
 *
 * Returns 0 when the event was added, or when it is in state OFF or lower and
 * there is nothing to do; -EAGAIN when pmu->add() refused it, in which case
 * the event is put back to INACTIVE. ctx->lock must be held.
 */
static int
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_event_pmu_context *epc = event->pmu_ctx;
        struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
        int ret = 0;

        WARN_ON_ONCE(event->ctx != ctx);

        lockdep_assert_held(&ctx->lock);

        if (event->state <= PERF_EVENT_STATE_OFF)
                return 0;

        WRITE_ONCE(event->oncpu, smp_processor_id());
        /*
         * Order event::oncpu write to happen before the ACTIVE state is
         * visible. This allows perf_event_{stop,read}() to observe the correct
         * ->oncpu if it sees ACTIVE.
         */
        smp_wmb();
        perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);

        /*
         * Unthrottle events, since we scheduled we might have missed several
         * ticks already, also for a heavily scheduling task there is little
         * guarantee it'll get a tick in a timely manner.
         */
        if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
                perf_log_throttle(event, 1);
                event->hw.interrupts = 0;
        }

        perf_pmu_disable(event->pmu);

        perf_log_itrace_start(event);

        /* The PMU refused the event: undo the state and oncpu changes. */
        if (event->pmu->add(event, PERF_EF_START)) {
                perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
                event->oncpu = -1;
                ret = -EAGAIN;
                goto out;
        }

        /* Accounting that event_sched_out() reverses later. */
        if (!is_software_event(event))
                cpc->active_oncpu++;
        if (event->attr.freq && event->attr.sample_freq) {
                ctx->nr_freq++;
                epc->nr_freq++;
        }
        if (event->attr.exclusive)
                cpc->exclusive = 1;

out:
        perf_pmu_enable(event->pmu);

        return ret;
}

/*
 * Schedule a whole group in as one unit inside a PMU transaction.
 *
 * Returns 0 when every member was added and the transaction committed, or
 * when the leader is OFF. Returns -EAGAIN when a member could not be added or
 * the commit failed; members that had already been added are scheduled out
 * again and the transaction is cancelled.
 */
static int
group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
{
        struct perf_event *event, *partial_group = NULL;
        struct pmu *pmu = group_event->pmu_ctx->pmu;

        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;

        pmu->start_txn(pmu, PERF_PMU_TXN_ADD);

        /* The leader failed, so nothing has been added that needs undoing. */
        if (event_sched_in(group_event, ctx))
                goto error;

        /*
         * Schedule in siblings as one group (if any):
         */
        for_each_sibling_event(event, group_event) {
                if (event_sched_in(event, ctx)) {
                        /* Remember where to stop the rollback below. */
                        partial_group = event;
                        goto group_error;
                }
        }

        /*
         * A non-zero commit result falls through to the rollback with
         * partial_group still NULL, so every sibling is scheduled out.
         */
        if (!pmu->commit_txn(pmu))
                return 0;

group_error:
        /*
         * Groups can be scheduled in as one unit only, so undo any
         * partial group before returning:
         * The events up to the failed event are scheduled out normally.
         */
        for_each_sibling_event(event, group_event) {
                if (event == partial_group)
                        break;

                event_sched_out(event, ctx);
        }
        event_sched_out(group_event, ctx);

error:
        pmu->cancel_txn(pmu);
        return -EAGAIN;
}

/*
 * Work out whether we can put this event group on the CPU now.
 */
static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
        struct perf_event_pmu_context *epc = event->pmu_ctx;
        struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);

        /* A group made up purely of software events is never refused. */
        if (event->group_caps & PERF_EV_CAP_SOFTWARE)
                return 1;

        /*
         * Exclusivity rules for everything else:
         *  - once an exclusive group is on, no other hardware events go on;
         *  - an exclusive group cannot go on while there are already events
         *    on the CPU.
         */
        if (cpc->exclusive ||
            (event->attr.exclusive && !list_empty(get_event_list(event))))
                return 0;

        /* Otherwise it goes on only if all previous groups were able to. */
        return can_add_hw;
}

/*
 * Link @event into @ctx: list_add_event() followed by perf_group_attach().
 * NOTE(review): the callers in this part of the file appear to hold the
 * relevant context lock around this call; confirm before adding new callers.
 */
static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
{
        list_add_event(event, ctx);
        perf_group_attach(event);
}

/*
 * Schedule out the @event_type events of task context @ctx on this CPU.
 *
 * Does nothing when the CPU has no task context installed. Warns once and
 * does nothing when the installed task context is not @ctx.
 */
static void task_ctx_sched_out(struct perf_event_context *ctx,
                                enum event_type_t event_type)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

        if (cpuctx->task_ctx && !WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                ctx_sched_out(ctx, event_type);
}

/*
 * Schedule events in following the priority order CPU pinned, task pinned,
 * CPU flexible, task flexible. @ctx is the task context and may be NULL, in
 * which case only the CPU context is handled.
 */
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx)
{
        struct perf_event_context *cpu_ctx = &cpuctx->ctx;

        /* Pinned groups: CPU context before task context. */
        ctx_sched_in(cpu_ctx, EVENT_PINNED);
        if (ctx)
                ctx_sched_in(ctx, EVENT_PINNED);

        /* Flexible groups, in the same CPU-then-task order. */
        ctx_sched_in(cpu_ctx, EVENT_FLEXIBLE);
        if (ctx)
                ctx_sched_in(ctx, EVENT_FLEXIBLE);
}

/*
 * We want to maintain the following priority of scheduling:
 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
 *  - task pinned (EVENT_PINNED)
 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
 *  - task flexible (EVENT_FLEXIBLE).
 *
 * In order to avoid unscheduling and scheduling back in everything every
 * time an event is added, only do it for the groups of equal priority and
 * below.
 *
 * This can be called after a batch operation on task events, in which case
 * event_type is a bit mask of the types of events involved. For CPU events,
 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
 */
/*
 * XXX: ctx_resched() reschedule entire perf_event_context while adding new
 * event to the context or enabling existing event in the context. We can
 * probably optimize it by rescheduling only affected pmu_ctx.
 */
static void ctx_resched(struct perf_cpu_context *cpuctx,
                        struct perf_event_context *task_ctx,
                        enum event_type_t event_type)
{
        struct perf_event_context *cpu_ctx = &cpuctx->ctx;
        /* Latch the CPU bit before event_type is reduced to EVENT_ALL bits. */
        const bool cpu_event = (event_type & EVENT_CPU) != 0;

        /* Rescheduling pinned groups drags the flexible groups along. */
        if (event_type & EVENT_PINNED)
                event_type |= EVENT_FLEXIBLE;
        event_type &= EVENT_ALL;

        perf_ctx_disable(cpu_ctx, false);
        if (task_ctx) {
                perf_ctx_disable(task_ctx, false);
                task_ctx_sched_out(task_ctx, event_type);
        }

        /*
         * What gets scheduled out of the CPU context depends on what
         * triggered the reschedule:
         *  - a CPU event: the corresponding groups;
         *  - a pinned task event: the flexible groups only;
         *  - anything else: nothing.
         */
        if (cpu_event) {
                ctx_sched_out(cpu_ctx, event_type);
        } else if (event_type & EVENT_PINNED) {
                ctx_sched_out(cpu_ctx, EVENT_FLEXIBLE);
        }

        perf_event_sched_in(cpuctx, task_ctx);

        perf_ctx_enable(cpu_ctx, false);
        if (task_ctx)
                perf_ctx_enable(task_ctx, false);
}

/*
 * Reschedule all event types (EVENT_ALL | EVENT_CPU) on the current CPU,
 * covering the CPU context and whichever task context is installed there.
 * @pmu is not referenced by this implementation.
 */
void perf_pmu_resched(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;

        perf_ctx_lock(cpuctx, task_ctx);
        ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
        perf_ctx_unlock(cpuctx, task_ctx);
}

/*
 * Cross CPU call to install and enable a performance event
 *
 * Very similar to remote_function() + event_function() but cannot assume that
 * things like ctx->is_active and cpuctx->task_ctx are set.
 */
static int  __perf_install_in_context(void *info)
{
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        bool reprogram = true;
        int ret = 0;

        /*
         * Lock order: the CPU context first, then a task context - either
         * the target @ctx or the task context installed on this CPU.
         * perf_ctx_unlock() at the end is handed both.
         */
        raw_spin_lock(&cpuctx->ctx.lock);
        if (ctx->task) {
                raw_spin_lock(&ctx->lock);
                task_ctx = ctx;

                /* Only reprogram when the target task is running right here. */
                reprogram = (ctx->task == current);

                /*
                 * If the task is running, it must be running on this CPU,
                 * otherwise we cannot reprogram things.
                 *
                 * If its not running, we don't care, ctx->lock will
                 * serialize against it becoming runnable.
                 */
                if (task_curr(ctx->task) && !reprogram) {
                        /* Returned as -ESRCH without installing the event. */
                        ret = -ESRCH;
                        goto unlock;
                }

                WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
        } else if (task_ctx) {
                raw_spin_lock(&task_ctx->lock);
        }

#ifdef CONFIG_CGROUP_PERF
        if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
                /*
                 * If the current cgroup doesn't match the event's
                 * cgroup, we should not try to schedule it.
                 */
                struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
                reprogram = cgroup_is_descendant(cgrp->css.cgroup,
                                        event->cgrp->css.cgroup);
        }
#endif

        /*
         * Either way the event is added to the context; with reprogram set
         * the affected event types are also rescheduled via ctx_resched().
         */
        if (reprogram) {
                ctx_sched_out(ctx, EVENT_TIME);
                add_event_to_ctx(event, ctx);
                ctx_resched(cpuctx, task_ctx, get_event_type(event));
        } else {
                add_event_to_ctx(event, ctx);
        }

unlock:
        perf_ctx_unlock(cpuctx, task_ctx);

        return ret;
}

/* Forward declaration; used by the sanity check in perf_install_in_context(). */
static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx);

/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call, see comment there.
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
                        struct perf_event *event,
                        int cpu)
{
        struct task_struct *task = READ_ONCE(ctx->task);

        lockdep_assert_held(&ctx->mutex);

        WARN_ON_ONCE(!exclusive_event_installable(event, ctx));

        /* A CPU-bound event must be installed on the CPU it is bound to. */
        if (event->cpu != -1)
                WARN_ON_ONCE(event->cpu != cpu);

        /*
         * Ensures that if we can observe event->ctx, both the event and ctx
         * will be 'complete'. See perf_iterate_sb_cpu().
         */
        smp_store_release(&event->ctx, ctx);

        /*
         * perf_event_attr::disabled events will not run and can be initialized
         * without IPI. Except when this is the first event for the context, in
         * that case we need the magic of the IPI to set ctx->is_active.
         *
         * The IOC_ENABLE that is sure to follow the creation of a disabled
         * event will issue the IPI and reprogram the hardware.
         */
        if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
            ctx->nr_events && !is_cgroup_event(event)) {
                raw_spin_lock_irq(&ctx->lock);
                /* The context is already dead: do not add the event. */
                if (ctx->task == TASK_TOMBSTONE) {
                        raw_spin_unlock_irq(&ctx->lock);
                        return;
                }
                add_event_to_ctx(event, ctx);
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }

        /* CPU context (no task): run the install on the target CPU. */
        if (!task) {
                cpu_function_call(cpu, __perf_install_in_context, event);
                return;
        }

        /*
         * Should not happen, we validate the ctx is still alive before calling.
         */
        if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
                return;

        /*
         * Installing events is tricky because we cannot rely on ctx->is_active
         * to be set in case this is the nr_events 0 -> 1 transition.
         *
         * Instead we use task_curr(), which tells us if the task is running.
         * However, since we use task_curr() outside of rq::lock, we can race
         * against the actual state. This means the result can be wrong.
         *
         * If we get a false positive, we retry, this is harmless.
         *
         * If we get a false negative, things are complicated. If we are after
         * perf_event_context_sched_in() ctx::lock will serialize us, and the
         * value must be correct. If we're before, it doesn't matter since
         * perf_event_context_sched_in() will program the counter.
         *
         * However, this hinges on the remote context switch having observed
         * our task->perf_event_ctxp[] store, such that it will in fact take
         * ctx::lock in perf_event_context_sched_in().
         *
         * We do this by task_function_call(), if the IPI fails to hit the task
         * we know any future context switch of task must see the
         * perf_event_ctxp[] store.
         */

        /*
         * This smp_mb() orders the task->perf_event_ctxp[] store with the
         * task_cpu() load, such that if the IPI then does not find the task
         * running, a future context switch of that task must observe the
         * store.
         */
        smp_mb();
again:
        /* A zero return means the cross call did the install; we are done. */
        if (!task_function_call(task, __perf_install_in_context, event))
                return;

        raw_spin_lock_irq(&ctx->lock);
        task = ctx->task;
        if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
                /*
                 * Cannot happen because we already checked above (which also
                 * cannot happen), and we hold ctx->mutex, which serializes us
                 * against perf_event_exit_task_context().
                 */
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        /*
         * If the task is not running, ctx->lock will avoid it becoming so,
         * thus we can safely install the event.
         */
        if (task_curr(task)) {
                raw_spin_unlock_irq(&ctx->lock);
                goto again;
        }
        add_event_to_ctx(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to enable a performance event
 */
static void __perf_event_enable(struct perf_event *event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                void *info)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event_context *task_ctx;

        /* Only states strictly between ERROR and INACTIVE get enabled here. */
        if (event->state >= PERF_EVENT_STATE_INACTIVE ||
            event->state <= PERF_EVENT_STATE_ERROR)
                return;

        /*
         * NOTE(review): presumably this stops the context clock so the state
         * change below is accounted consistently; each early return further
         * down pairs it with ctx_sched_in(ctx, EVENT_TIME). Confirm against
         * ctx_sched_out().
         */
        if (ctx->is_active)
                ctx_sched_out(ctx, EVENT_TIME);

        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
        perf_cgroup_event_enable(event, ctx);

        /* Inactive context: the state change is all that is needed. */
        if (!ctx->is_active)
                return;

        /* The event does not match this CPU/cgroup: nothing to schedule. */
        if (!event_filter_match(event)) {
                ctx_sched_in(ctx, EVENT_TIME);
                return;
        }

        /*
         * If the event is in a group and isn't the group leader,
         * then don't put it on unless the group is on.
         */
        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
                ctx_sched_in(ctx, EVENT_TIME);
                return;
        }

        task_ctx = cpuctx->task_ctx;
        if (ctx->task)
                WARN_ON_ONCE(task_ctx != ctx);

        ctx_resched(cpuctx, task_ctx, get_event_type(event));
}

/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
static void _perf_event_enable(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        raw_spin_lock_irq(&ctx->lock);

        /* Already enabled, or in a state below ERROR: leave it alone. */
        if (event->state >= PERF_EVENT_STATE_INACTIVE ||
            event->state <  PERF_EVENT_STATE_ERROR)
                goto unlock;

        /*
         * An event in error state has that cleared first. If it is then seen
         * in error state again later, we know it went back into error, as
         * distinct from the task having been scheduled away before the
         * cross-call arrived.
         */
        if (event->state == PERF_EVENT_STATE_ERROR) {
                /* Detached SIBLING events cannot leave ERROR state. */
                if (event->event_caps & PERF_EV_CAP_SIBLING &&
                    event->group_leader == event)
                        goto unlock;

                event->state = PERF_EVENT_STATE_OFF;
        }
        raw_spin_unlock_irq(&ctx->lock);

        event_function_call(event, __perf_event_enable, NULL);
        return;

unlock:
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * See perf_event_disable();
 */
void perf_event_enable(struct perf_event *event)
{
        struct perf_event_context *ctx;

        /*
         * perf_event_ctx_lock() hands back the context that has to be passed
         * to the matching perf_event_ctx_unlock().
         */
        ctx = perf_event_ctx_lock(event);
        _perf_event_enable(event);
        perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_enable);

/* Argument block that perf_event_stop() passes to __perf_event_stop(). */
struct stop_event_data {
        /* the event to stop */
        struct perf_event        *event;
        /* non-zero: start the event again right after stopping it */
        unsigned int                restart;
};

/*
 * Cross-call body for perf_event_stop(); @info is a struct stop_event_data.
 *
 * Returns 0 when the event was stopped (and restarted if requested) or was
 * not ACTIVE to begin with; -EAGAIN when the event is ACTIVE but not on the
 * CPU this is running on.
 */
static int __perf_event_stop(void *info)
{
        struct stop_event_data *sd = info;
        struct perf_event *event = sd->event;

        /* if it's already INACTIVE, do nothing */
        if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
                return 0;

        /* matches smp_wmb() in event_sched_in() */
        smp_rmb();

        /*
         * There is a window with interrupts enabled before we get here,
         * so we need to check again lest we try to stop another CPU's event.
         */
        if (READ_ONCE(event->oncpu) != smp_processor_id())
                return -EAGAIN;

        event->pmu->stop(event, PERF_EF_UPDATE);

        /*
         * May race with the actual stop (through perf_pmu_output_stop()),
         * but it is only used for events with AUX ring buffer, and such
         * events will refuse to restart because of rb::aux_mmap_count==0,
         * see comments in perf_aux_output_begin().
         *
         * Since this is happening on an event-local CPU, no trace is lost
         * while restarting.
         */
        if (sd->restart)
                event->pmu->start(event, 0);

        return 0;
}

/*
 * Stop @event on the CPU it is running on, optionally restarting it straight
 * away (@restart non-zero).
 *
 * Returns 0 when the event is not ACTIVE or was stopped; any other value is
 * the result of cpu_function_call(). The call is repeated for as long as it
 * reports -EAGAIN.
 */
static int perf_event_stop(struct perf_event *event, int restart)
{
        struct stop_event_data sd = {
                .event                = event,
                .restart        = restart,
        };
        int ret = 0;

        do {
                if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
                        return 0;

                /* matches smp_wmb() in event_sched_in() */
                smp_rmb();

                /*
                 * We only want to restart ACTIVE events, so if the event goes
                 * inactive here (event->oncpu==-1), there's nothing more to do;
                 * fall through with ret==-ENXIO.
                 */
                ret = cpu_function_call(READ_ONCE(event->oncpu),
                                        __perf_event_stop, &sd);
        } while (ret == -EAGAIN);

        return ret;
}

/*
 * In order to contain the amount of racy and tricky in the address filter
 * configuration management, it is a two part process:
 *
 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
 *      we update the addresses of corresponding vmas in
 *        event::addr_filter_ranges array and bump the event::addr_filters_gen;
 * (p2) when an event is scheduled in (pmu::add), it calls
 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
 *      if the generation has changed since the previous call.
 *
 * If (p1) happens while the event is active, we restart it to force (p2).
 *
 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
 *     pre-existing mappings, called once when new filters arrive via SET_FILTER
 *     ioctl;
 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
 *     registered mapping, called for every new mmap(), with mm::mmap_lock down
 *     for reading;
 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
 *     of exec.
 */
void perf_event_addr_filters_sync(struct perf_event *event)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);

        /* Events without address filters have nothing to synchronise. */
        if (!has_addr_filter(event))
                return;

        raw_spin_lock(&ifh->lock);

        /* Same generation as at the previous sync: the PMU is up to date. */
        if (event->addr_filters_gen == event->hw.addr_filters_gen)
                goto unlock;

        event->pmu->addr_filters_sync(event);
        event->hw.addr_filters_gen = event->addr_filters_gen;

unlock:
        raw_spin_unlock(&ifh->lock);
}
EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);

/*
 * Add @refresh to the event's event_limit and enable the event.
 *
 * Returns 0 on success, -EINVAL for inherited events and for events that
 * are not sampling events.
 */
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
        /* Not supported on inherited events. */
        if (event->attr.inherit)
                return -EINVAL;

        /* Only sampling events can be refreshed. */
        if (!is_sampling_event(event))
                return -EINVAL;

        atomic_add(refresh, &event->event_limit);
        _perf_event_enable(event);

        return 0;
}

/*
 * See perf_event_disable()
 */
int perf_event_refresh(struct perf_event *event, int refresh)
{
        struct perf_event_context *ctx;
        int ret;

        /*
         * Run _perf_event_refresh() between perf_event_ctx_lock() and the
         * matching unlock; its return value is passed through unchanged.
         */
        ctx = perf_event_ctx_lock(event);
        ret = _perf_event_refresh(event, refresh);
        perf_event_ctx_unlock(event, ctx);

        return ret;
}
EXPORT_SYMBOL_GPL(perf_event_refresh);

/*
 * Apply @attr to breakpoint event @bp. The event is disabled around the
 * call to modify_user_hw_breakpoint_check() and enabled again afterwards
 * unless bp->attr.disabled is set. Returns the result of the modification
 * call.
 */
static int perf_event_modify_breakpoint(struct perf_event *bp,
                                         struct perf_event_attr *attr)
{
        int ret;

        _perf_event_disable(bp);
        ret = modify_user_hw_breakpoint_check(bp, attr, true);

        /* Re-enabling does not depend on ret, only on the disabled bit. */
        if (bp->attr.disabled == 0)
                _perf_event_enable(bp);

        return ret;
}

/*
 * Copy event-type-independent attributes that may be modified.
 */
static void perf_event_modify_copy_attr(struct perf_event_attr *to,
                                        const struct perf_event_attr *from)
{
        /* sig_data is the only attribute copied here at present. */
        to->sig_data = from->sig_data;
}

/*
 * Modify the attributes of @event and of every event on its child_list.
 *
 * Returns -EINVAL when @attr->type differs from the event's type,
 * -EOPNOTSUPP when the event type has no modify handler, and otherwise the
 * first non-zero handler result (0 when every handler succeeded).
 */
static int perf_event_modify_attr(struct perf_event *event,
                                  struct perf_event_attr *attr)
{
        int (*modify)(struct perf_event *, struct perf_event_attr *);
        struct perf_event *child;
        int ret;

        if (attr->type != event->attr.type)
                return -EINVAL;

        switch (event->attr.type) {
        case PERF_TYPE_BREAKPOINT:
                modify = perf_event_modify_breakpoint;
                break;
        default:
                /* Place holder for future additions. */
                return -EOPNOTSUPP;
        }

        WARN_ON_ONCE(event->ctx->parent_ctx);

        mutex_lock(&event->child_mutex);

        /*
         * The event-type-independent attributes go in first: the
         * type-specific modification validates that the final attributes
         * match the source attributes once everything relevant is copied.
         */
        perf_event_modify_copy_attr(&event->attr, attr);
        ret = modify(event, attr);
        if (!ret) {
                /* Same treatment for each child; stop at the first failure. */
                list_for_each_entry(child, &event->child_list, child_list) {
                        perf_event_modify_copy_attr(&child->attr, attr);
                        ret = modify(child, attr);
                        if (ret)
                                break;
                }
        }

        mutex_unlock(&event->child_mutex);
        return ret;
}

/*
 * Schedule out the active groups of one PMU context.
 *
 * @pmu_ctx:    PMU context whose active lists are walked
 * @event_type: EVENT_PINNED and/or EVENT_FLEXIBLE; 0 only performs the
 *              task_epc unlinking below
 */
static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
                                enum event_type_t event_type)
{
        struct perf_event_context *ctx = pmu_ctx->ctx;
        struct pmu *pmu = pmu_ctx->pmu;
        struct perf_event *group, *next;

        /*
         * A task context that is no longer active gives up this CPU's
         * task_epc slot for the PMU.
         */
        if (ctx->task && !ctx->is_active) {
                struct perf_cpu_pmu_context *cpc =
                        this_cpu_ptr(pmu->cpu_pmu_context);

                WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
                cpc->task_epc = NULL;
        }

        if (!event_type)
                return;

        perf_pmu_disable(pmu);

        if (event_type & EVENT_PINNED) {
                list_for_each_entry_safe(group, next,
                                         &pmu_ctx->pinned_active, active_list)
                        group_sched_out(group, ctx);
        }

        if (event_type & EVENT_FLEXIBLE) {
                list_for_each_entry_safe(group, next,
                                         &pmu_ctx->flexible_active, active_list)
                        group_sched_out(group, ctx);

                /*
                 * Since we cleared EVENT_FLEXIBLE, also clear
                 * rotate_necessary; it will be reset by
                 * ctx_flexible_sched_in() when needed.
                 */
                pmu_ctx->rotate_necessary = 0;
        }

        perf_pmu_enable(pmu);
}

/*
 * Schedule out the event types selected by @event_type from @ctx on this CPU.
 *
 * @ctx:        context to schedule out; ctx->lock must be held
 * @event_type: EVENT_* bits to deactivate. EVENT_CGROUP is a modifier that
 *              is stripped from the mask; when it is set, PMU contexts
 *              without cgroup events (nr_cgroups == 0) are skipped.
 *
 * Only the bits that actually changed in ctx->is_active are passed down to
 * __pmu_ctx_sched_out().
 */
static void
ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_pmu_context *pmu_ctx;
        int is_active = ctx->is_active;
        bool cgroup = event_type & EVENT_CGROUP;

        event_type &= ~EVENT_CGROUP;

        lockdep_assert_held(&ctx->lock);

        if (likely(!ctx->nr_events)) {
                /*
                 * See __perf_remove_from_context().
                 */
                WARN_ON_ONCE(ctx->is_active);
                if (ctx->task)
                        WARN_ON_ONCE(cpuctx->task_ctx);
                return;
        }

        /*
         * Always update time if it was set; not only when it changes.
         * Otherwise we can 'forget' to update time for any but the last
         * context we sched out. For example:
         *
         *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
         *   ctx_sched_out(.event_type = EVENT_PINNED)
         *
         * would only update time for the pinned events.
         */
        if (is_active & EVENT_TIME) {
                /* update (and stop) ctx time */
                update_context_time(ctx);
                update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
                /*
                 * CPU-release for the below ->is_active store,
                 * see __load_acquire() in perf_event_time_now()
                 */
                barrier();
        }

        /*
         * Drop the requested bits; once none of the EVENT_ALL bits remain,
         * clear whatever else is still set as well.
         */
        ctx->is_active &= ~event_type;
        if (!(ctx->is_active & EVENT_ALL))
                ctx->is_active = 0;

        /* A fully inactive task context stops being this CPU's task_ctx. */
        if (ctx->task) {
                WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                if (!ctx->is_active)
                        cpuctx->task_ctx = NULL;
        }

        is_active ^= ctx->is_active; /* changed bits */

        list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                if (cgroup && !pmu_ctx->nr_cgroups)
                        continue;
                __pmu_ctx_sched_out(pmu_ctx, is_active);
        }
}

/*
 * Decide whether two contexts are equivalent, i.e. whether both were cloned
 * from the same version of the same context.
 *
 * The version is a generation number kept in the context and incremented on
 * every modification; see unclone_ctx(), list_add_event() and
 * list_del_event().
 *
 * Both ctx1->lock and ctx2->lock must be held. Returns 1 when the contexts
 * are equivalent and 0 otherwise.
 */
static int context_equiv(struct perf_event_context *ctx1,
                         struct perf_event_context *ctx2)
{
        lockdep_assert_held(&ctx1->lock);
        lockdep_assert_held(&ctx2->lock);

        /* Pinning (on either side) disables the swap optimization. */
        if (ctx1->pin_count || ctx2->pin_count)
                return 0;

        /* ctx2 is a clone of the current generation of ctx1. */
        if (ctx2->parent_ctx == ctx1 && ctx2->parent_gen == ctx1->generation)
                return 1;

        /* ctx1 is a clone of the current generation of ctx2. */
        if (ctx2 == ctx1->parent_ctx && ctx2->generation == ctx1->parent_gen)
                return 1;

        /*
         * Otherwise they only match when both are clones of the same
         * generation of a common parent; the parent hierarchy is flattened,
         * see perf_event_init_context().
         */
        return ctx1->parent_ctx &&
               ctx1->parent_ctx == ctx2->parent_ctx &&
               ctx1->parent_gen == ctx2->parent_gen;
}

/*
 * Exchange the counter value and the enabled/running times between @event
 * and @next_event. Does nothing unless @event has attr.inherit_stat set.
 */
static void __perf_event_sync_stat(struct perf_event *event,
                                     struct perf_event *next_event)
{
        u64 count;

        if (!event->attr.inherit_stat)
                return;

        /*
         * perf_event_read() is off limits here: we are in the middle of a
         * context switch with IRQs disabled, which upsets
         * smp_call_function_single(). The event must be on the current CPU
         * though, so the PMU can be read directly.
         */
        if (event->state == PERF_EVENT_STATE_ACTIVE)
                event->pmu->read(event);

        perf_event_update_time(event);

        /*
         * The contexts are being flipped, so flip the event values too in
         * order to keep the per-task stats reliable.
         */
        count = local64_xchg(&event->count, local64_read(&next_event->count));
        local64_set(&next_event->count, count);

        swap(event->total_time_enabled, next_event->total_time_enabled);
        swap(event->total_time_running, next_event->total_time_running);

        /* The values moved, so refresh the user visible data as well. */
        perf_event_update_userpage(event);
        perf_event_update_userpage(next_event);
}

/*
 * Walk the event lists of @ctx and @next_ctx in lockstep and sync the stats
 * of each pair of events. Stops at the end of the shorter list. Does nothing
 * when @ctx has no stat events (nr_stat == 0).
 */
static void perf_event_sync_stat(struct perf_event_context *ctx,
                                   struct perf_event_context *next_ctx)
{
        struct perf_event *event, *next_event;

        if (!ctx->nr_stat)
                return;

        update_context_time(ctx);

        for (event = list_first_entry(&ctx->event_list,
                                      struct perf_event, event_entry),
             next_event = list_first_entry(&next_ctx->event_list,
                                           struct perf_event, event_entry);
             &event->event_entry != &ctx->event_list &&
             &next_event->event_entry != &next_ctx->event_list;
             event = list_next_entry(event, event_entry),
             next_event = list_next_entry(next_event, event_entry))
                __perf_event_sync_stat(event, next_event);
}

/*
 * Iterate over two lists in lockstep.
 *
 * @pos1/@pos2:   cursors, one per list
 * @head1/@head2: the list heads
 * @member:       name of the list_head member, the same in both entry types
 *
 * The loop ends as soon as either cursor reaches its list head, so only as
 * many pairs as the shorter list has entries are visited.
 */
#define double_list_for_each_entry(pos1, pos2, head1, head2, member)        \
        for (pos1 = list_first_entry(head1, typeof(*pos1), member),        \
             pos2 = list_first_entry(head2, typeof(*pos2), member);        \
             !list_entry_is_head(pos1, head1, member) &&                \
             !list_entry_is_head(pos2, head2, member);                        \
             pos1 = list_next_entry(pos1, member),                        \
             pos2 = list_next_entry(pos2, member))

/*
 * Exchange the per-PMU task context data between two contexts, pairing up
 * the entries of both pmu_ctx_list lists in order. Does nothing when
 * @prev_ctx has no task data (nr_task_data == 0).
 */
static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
                                          struct perf_event_context *next_ctx)
{
        struct perf_event_pmu_context *prev_epc, *next_epc;

        if (!prev_ctx->nr_task_data)
                return;

        double_list_for_each_entry(prev_epc, next_epc,
                                   &prev_ctx->pmu_ctx_list,
                                   &next_ctx->pmu_ctx_list,
                                   pmu_ctx_entry) {
                struct pmu *pmu = prev_epc->pmu;

                /* Paired entries are expected to belong to the same PMU. */
                if (WARN_ON_ONCE(pmu != next_epc->pmu))
                        continue;

                /*
                 * PMU specific parts of task perf context can require
                 * additional synchronization; the Intel LBR call stack
                 * data profiling implementation is one example. Such PMUs
                 * supply their own swap routine.
                 */
                if (pmu->swap_task_ctx) {
                        pmu->swap_task_ctx(prev_epc, next_epc);
                        continue;
                }

                swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
        }
}

/*
 * Invoke the sched_task() callback of every PMU in @ctx that provides one
 * and has a non-zero sched_cb_usage on this CPU.
 *
 * @sched_in: passed through to the callback
 */
static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
{
        struct perf_event_pmu_context *epc;
        struct perf_cpu_pmu_context *cpc;

        list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                struct pmu *pmu = epc->pmu;

                cpc = this_cpu_ptr(pmu->cpu_pmu_context);
                if (!cpc->sched_cb_usage || !pmu->sched_task)
                        continue;

                pmu->sched_task(epc, sched_in);
        }
}

/*
 * Schedule out the perf context of @task as part of switching to @next.
 *
 * When both tasks carry contexts that context_equiv() reports as equivalent
 * and neither context has pending events, the two contexts are swapped
 * between the tasks instead of scheduling any events out. In every other
 * case all events of @task's context are scheduled out through
 * task_ctx_sched_out(ctx, EVENT_ALL).
 */
static void
perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
{
        struct perf_event_context *ctx = task->perf_event_ctxp;
        struct perf_event_context *next_ctx;
        struct perf_event_context *parent, *next_parent;
        int do_switch = 1;

        if (likely(!ctx))
                return;

        rcu_read_lock();
        next_ctx = rcu_dereference(next->perf_event_ctxp);
        if (!next_ctx)
                goto unlock;

        parent = rcu_dereference(ctx->parent_ctx);
        next_parent = rcu_dereference(next_ctx->parent_ctx);

        /* If neither context has a parent context, they cannot be clones. */
        if (!parent && !next_parent)
                goto unlock;

        if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
                /*
                 * Looks like the two contexts are clones, so we might be
                 * able to optimize the context switch.  We lock both
                 * contexts and check that they are clones under the
                 * lock (including re-checking that neither has been
                 * uncloned in the meantime).  It doesn't matter which
                 * order we take the locks because no other cpu could
                 * be trying to lock both of these tasks.
                 */
                raw_spin_lock(&ctx->lock);
                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                if (context_equiv(ctx, next_ctx)) {

                        perf_ctx_disable(ctx, false);

                        /* PMIs are disabled; ctx->nr_pending is stable. */
                        if (local_read(&ctx->nr_pending) ||
                            local_read(&next_ctx->nr_pending)) {
                                /*
                                 * Must not swap out ctx when there's pending
                                 * events that rely on the ctx->task relation.
                                 *
                                 * Only next_ctx->lock and the RCU read lock
                                 * are dropped here: ctx->lock stays held and
                                 * ctx stays disabled across the jump, both
                                 * are undone at the end of the inside_switch
                                 * path.
                                 */
                                raw_spin_unlock(&next_ctx->lock);
                                rcu_read_unlock();
                                goto inside_switch;
                        }

                        WRITE_ONCE(ctx->task, next);
                        WRITE_ONCE(next_ctx->task, task);

                        perf_ctx_sched_task_cb(ctx, false);
                        perf_event_swap_task_ctx_data(ctx, next_ctx);

                        perf_ctx_enable(ctx, false);

                        /*
                         * RCU_INIT_POINTER here is safe because we've not
                         * modified the ctx and the above modification of
                         * ctx->task and ctx->task_ctx_data are immaterial
                         * since those values are always verified under
                         * ctx->lock which we're now holding.
                         */
                        RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
                        RCU_INIT_POINTER(next->perf_event_ctxp, ctx);

                        /* The contexts were swapped; skip the slow path. */
                        do_switch = 0;

                        perf_event_sync_stat(ctx, next_ctx);
                }
                raw_spin_unlock(&next_ctx->lock);
                raw_spin_unlock(&ctx->lock);
        }
unlock:
        rcu_read_unlock();

        if (do_switch) {
                raw_spin_lock(&ctx->lock);
                perf_ctx_disable(ctx, false);

inside_switch:
                /* Entered by goto with ctx->lock held and ctx disabled. */
                perf_ctx_sched_task_cb(ctx, false);
                task_ctx_sched_out(ctx, EVENT_ALL);

                perf_ctx_enable(ctx, false);
                raw_spin_unlock(&ctx->lock);
        }
}

/*
 * Per-CPU list of the perf_cpu_pmu_context structures (linked through
 * sched_cb_entry) that currently have a non-zero sched_cb_usage; maintained
 * by perf_sched_cb_inc() and perf_sched_cb_dec().
 */
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
/*
 * Per-CPU count of perf_sched_cb_inc() calls not yet undone by
 * perf_sched_cb_dec(); the task sched in/out hooks only call
 * perf_pmu_sched_task() while it is non-zero.
 */
static DEFINE_PER_CPU(int, perf_sched_cb_usages);

/*
 * Drop one context-switch callback user of @pmu on this CPU. When the last
 * user goes away, the PMU's per-CPU context is taken off sched_cb_list.
 */
void perf_sched_cb_dec(struct pmu *pmu)
{
        struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);

        /* The per-CPU total goes down before the per-PMU bookkeeping. */
        this_cpu_dec(perf_sched_cb_usages);
        barrier();

        cpc->sched_cb_usage--;
        if (cpc->sched_cb_usage == 0)
                list_del(&cpc->sched_cb_entry);
}


/*
 * Register one context-switch callback user of @pmu on this CPU. The first
 * user puts the PMU's per-CPU context on sched_cb_list.
 */
void perf_sched_cb_inc(struct pmu *pmu)
{
        struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);

        if (cpc->sched_cb_usage++ == 0)
                list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));

        /* The per-CPU total goes up only after the list has been updated. */
        barrier();
        this_cpu_inc(perf_sched_cb_usages);
}

/*
 * Hand the context switch callback down to the lower code layer. This is
 * invoked ONLY when the context switch callback is enabled.
 *
 * The callback matters for per-cpu events too: multi event PEBS, for
 * example, needs it to provide PID/TID information, which requires that all
 * queued PEBS records are flushed before we context switch to a new task.
 */
static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct pmu *pmu = cpc->epc.pmu;

        /* software PMUs will not have sched_task */
        if (WARN_ON_ONCE(!pmu->sched_task))
                return;

        /* Run the callback with the contexts locked and the PMU disabled. */
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(pmu);

        pmu->sched_task(cpc->task_epc, sched_in);

        perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

/*
 * Run the sched_task callback for every entry on this CPU's sched_cb_list.
 * Nothing is done when @prev and @next are the same task or when this CPU
 * has a task context installed.
 */
static void perf_pmu_sched_task(struct task_struct *prev,
                                struct task_struct *next,
                                bool sched_in)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_cpu_pmu_context *cpc;

        if (prev == next)
                return;

        /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
        if (cpuctx->task_ctx)
                return;

        list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
                __perf_pmu_sched_task(cpc, sched_in);
}

/*
 * Forward declaration: perf_event_switch() is called from the task
 * sched in/out hooks below, ahead of its definition.
 */
static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in);

/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * not restart the event.
 *
 * @task: the task being switched out
 * @next: the task being switched to
 */
void __perf_event_task_sched_out(struct task_struct *task,
                                 struct task_struct *next)
{
        /* Only when a PMU on this CPU registered a sched callback. */
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(task, next, false);

        /* Only when at least one event asked for switch notifications. */
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, next, false);

        perf_event_context_sched_out(task, next);

        /*
         * if cgroup events exist on this CPU, then we need
         * to check if we have to switch out PMU state.
         * cgroup event are system-wide mode only
         */
        perf_cgroup_switch(next);
}

/*
 * min_heap "less" callback: @l and @r each point at a struct perf_event
 * pointer; order the events by ascending group_index.
 */
static bool perf_less_group_idx(const void *l, const void *r)
{
        const struct perf_event * const *lhs = l;
        const struct perf_event * const *rhs = r;

        return (*lhs)->group_index < (*rhs)->group_index;
}

/*
 * min_heap "swp" callback: @l and @r each point at a pointer-sized heap
 * slot; exchange the two stored pointers.
 */
static void swap_ptr(void *l, void *r)
{
        void **lhs = l;
        void **rhs = r;

        swap(*lhs, *rhs);
}

/*
 * min_heap callbacks for a heap whose elements are struct perf_event
 * pointers, ordered by the events' group_index.
 */
static const struct min_heap_callbacks perf_min_heap = {
        .elem_size = sizeof(struct perf_event *),
        .less = perf_less_group_idx,
        .swp = swap_ptr,
};

/*
 * Append @event to the heap's backing array and bump heap->nr. A NULL
 * @event is ignored. No bounds check and no heap ordering is done here.
 */
static void __heap_add(struct min_heap *heap, struct perf_event *event)
{
        struct perf_event **slots = heap->data;

        if (!event)
                return;

        slots[heap->nr++] = event;
}

/*
 * Record @pmu_ctx as this CPU's task_epc for its PMU. Only done for PMU
 * contexts that belong to a task context; warns when a different task_epc
 * is already installed.
 */
static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
{
        struct perf_cpu_pmu_context *cpc;

        if (pmu_ctx->ctx->task) {
                cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
                WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
                cpc->task_epc = pmu_ctx;
        }
}

/*
 * Visit the groups of @pmu in @groups that are eligible for @cpu, in
 * ascending group_index order, calling @func on each.
 *
 * Several per-key iterators are merged through a min-heap keyed on
 * group_index:
 *  - for a task context (ctx->task set): the "any CPU" (-1) groups and the
 *    @cpu groups, using the on-stack itrs[] as heap storage;
 *  - for a CPU context: the @cpu groups and, with CONFIG_CGROUP_PERF, one
 *    iterator per cgroup from cpuctx->cgrp up through its parents, using
 *    cpuctx->heap as heap storage.
 *
 * @ctx:    context the groups belong to
 * @groups: group tree to iterate
 * @cpu:    CPU whose groups are wanted
 * @pmu:    PMU to restrict the iteration to
 * @func:   callback; a non-zero return stops the iteration
 * @data:   opaque argument handed to @func
 *
 * Returns 0 when @pmu's filter rejects @cpu or when all groups were
 * visited, otherwise the first non-zero value returned by @func.
 */
static noinline int visit_groups_merge(struct perf_event_context *ctx,
                                struct perf_event_groups *groups, int cpu,
                                struct pmu *pmu,
                                int (*func)(struct perf_event *, void *),
                                void *data)
{
#ifdef CONFIG_CGROUP_PERF
        struct cgroup_subsys_state *css = NULL;
#endif
        struct perf_cpu_context *cpuctx = NULL;
        /* Space for per CPU and/or any CPU event iterators. */
        struct perf_event *itrs[2];
        struct min_heap event_heap;
        struct perf_event **evt;
        int ret;

        if (pmu->filter && pmu->filter(pmu, cpu))
                return 0;

        if (!ctx->task) {
                cpuctx = this_cpu_ptr(&perf_cpu_context);
                event_heap = (struct min_heap){
                        .data = cpuctx->heap,
                        .nr = 0,
                        .size = cpuctx->heap_size,
                };

                lockdep_assert_held(&cpuctx->ctx.lock);

#ifdef CONFIG_CGROUP_PERF
                if (cpuctx->cgrp)
                        css = &cpuctx->cgrp->css;
#endif
        } else {
                event_heap = (struct min_heap){
                        .data = itrs,
                        .nr = 0,
                        .size = ARRAY_SIZE(itrs),
                };
                /* Events not within a CPU context may be on any CPU. */
                __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
        }
        /* evt aliases slot 0 of the heap storage, i.e. the heap's top. */
        evt = event_heap.data;

        __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));

#ifdef CONFIG_CGROUP_PERF
        /* css is only non-NULL for a CPU context with a cgroup set. */
        for (; css; css = css->parent)
                __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
#endif

        if (event_heap.nr) {
                __link_epc((*evt)->pmu_ctx);
                perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
        }

        min_heapify_all(&event_heap, &perf_min_heap);

        while (event_heap.nr) {
                ret = func(*evt, data);
                if (ret)
                        return ret;

                /*
                 * Advance the iterator at the top of the heap: restore the
                 * heap order if it has a successor, drop it otherwise.
                 */
                *evt = perf_event_groups_next(*evt, pmu);
                if (*evt)
                        min_heapify(&event_heap, 0, &perf_min_heap);
                else
                        min_heap_pop(&event_heap, &perf_min_heap);
        }

        return 0;
}

/*
 * The userpage is strictly per-event: there is no concept of context, so
 * there cannot be a context indirection. Every userpage therefore has to be
 * updated when context time starts :-(
 *
 * IOW, we must not miss EVENT_TIME edges.
 *
 * Returns true when @event has a non-zero mmap_count and its time and
 * userpage were updated, false when nothing was done.
 */
static inline bool event_update_userpage(struct perf_event *event)
{
        if (unlikely(atomic_read(&event->mmap_count))) {
                perf_event_update_time(event);
                perf_event_update_userpage(event);
                return true;
        }

        return false;
}

/*
 * Update the userpage of @group_event and, only if the leader itself was
 * updated, of each of its siblings as well.
 */
static inline void group_update_userpage(struct perf_event *group_event)
{
        struct perf_event *sibling;

        if (event_update_userpage(group_event)) {
                for_each_sibling_event(sibling, group_event)
                        event_update_userpage(sibling);
        }
}

/*
 * visit_groups_merge() callback that tries to schedule in one group.
 *
 * @event: the group to schedule in
 * @data:  points at the caller's int can_add_hw flag; it is cleared once a
 *         group is left in the INACTIVE state
 *
 * Always returns 0, so the iteration is never cut short.
 */
static int merge_sched_in(struct perf_event *event, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_pmu_context *cpc;
        int *can_add_hw = data;

        if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event))
                return 0;

        if (group_can_go_on(event, *can_add_hw) && !group_sched_in(event, ctx))
                list_add_tail(&event->active_list, get_event_list(event));

        /* Anything but INACTIVE needs no further handling here. */
        if (event->state != PERF_EVENT_STATE_INACTIVE)
                return 0;

        *can_add_hw = 0;

        /* A pinned group that is still inactive is put into ERROR state. */
        if (event->attr.pinned) {
                perf_cgroup_event_disable(event, ctx);
                perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
                return 0;
        }

        /* A flexible group is flagged for rotation instead. */
        event->pmu_ctx->rotate_necessary = 1;
        cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
        perf_mux_hrtimer_restart(cpc);
        group_update_userpage(event);

        return 0;
}

/*
 * Schedule in the groups of @pmu found in @groups for the current CPU,
 * using merge_sched_in() as the per-group callback.
 */
static void pmu_groups_sched_in(struct perf_event_context *ctx,
                                struct perf_event_groups *groups,
                                struct pmu *pmu)
{
        /* Starts out set; merge_sched_in() clears it through its data pointer. */
        int hw_ok = 1;
        int cpu = smp_processor_id();

        visit_groups_merge(ctx, groups, cpu, pmu, merge_sched_in, &hw_ok);
}

/*
 * Schedule in @groups for every PMU context on @ctx's pmu_ctx_list.
 *
 * @cgroup: when true, PMU contexts without cgroup events (nr_cgroups == 0)
 *          are skipped
 */
static void ctx_groups_sched_in(struct perf_event_context *ctx,
                                struct perf_event_groups *groups,
                                bool cgroup)
{
        struct perf_event_pmu_context *epc;

        list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                if (!cgroup || epc->nr_cgroups)
                        pmu_groups_sched_in(ctx, groups, epc->pmu);
        }
}

/*
 * Schedule in the flexible groups of @ctx that belong to @pmu. Pinned
 * groups are not touched here.
 */
static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
                               struct pmu *pmu)
{
        pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
}

/*
 * Schedule in the event types selected by @event_type from @ctx on this CPU.
 *
 * @ctx:        context to schedule in; ctx->lock must be held
 * @event_type: EVENT_* bits to activate. EVENT_CGROUP is a modifier that is
 *              stripped from the mask and forwarded to ctx_groups_sched_in()
 *              as its cgroup flag.
 *
 * Only the bits that were newly set in ctx->is_active are acted upon, pinned
 * groups first and flexible groups second.
 */
static void
ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        int is_active = ctx->is_active;
        bool cgroup = event_type & EVENT_CGROUP;

        event_type &= ~EVENT_CGROUP;

        lockdep_assert_held(&ctx->lock);

        if (likely(!ctx->nr_events))
                return;

        if (!(is_active & EVENT_TIME)) {
                /* start ctx time */
                __update_context_time(ctx, false);
                perf_cgroup_set_timestamp(cpuctx);
                /*
                 * CPU-release for the below ->is_active store,
                 * see __load_acquire() in perf_event_time_now()
                 */
                barrier();
        }

        ctx->is_active |= (event_type | EVENT_TIME);
        if (ctx->task) {
                /* A previously inactive task context becomes the task_ctx. */
                if (!is_active)
                        cpuctx->task_ctx = ctx;
                else
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
        }

        is_active ^= ctx->is_active; /* changed bits */

        /*
         * First go through the list and put on any pinned groups
         * in order to give them the best chance of going on.
         */
        if (is_active & EVENT_PINNED)
                ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);

        /* Then walk through the lower prio flexible groups */
        if (is_active & EVENT_FLEXIBLE)
                ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
}

/*
 * Schedule in the perf context of @task on this CPU.
 *
 * When the task's context is already installed as this CPU's task_ctx, only
 * the PMU sched_task callbacks are run. Otherwise, provided the context has
 * events, the events are scheduled in through perf_event_sched_in(); if the
 * task context carries pinned groups, the CPU context's flexible events are
 * scheduled out first.
 */
static void perf_event_context_sched_in(struct task_struct *task)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *ctx;

        rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp);
        if (!ctx)
                goto rcu_unlock;

        /* Context already installed on this CPU: callbacks only. */
        if (cpuctx->task_ctx == ctx) {
                perf_ctx_lock(cpuctx, ctx);
                perf_ctx_disable(ctx, false);

                perf_ctx_sched_task_cb(ctx, true);

                perf_ctx_enable(ctx, false);
                perf_ctx_unlock(cpuctx, ctx);
                goto rcu_unlock;
        }

        perf_ctx_lock(cpuctx, ctx);
        /*
         * We must check ctx->nr_events while holding ctx->lock, such
         * that we serialize against perf_install_in_context().
         */
        if (!ctx->nr_events)
                goto unlock;

        perf_ctx_disable(ctx, false);
        /*
         * We want to keep the following priority order:
         * cpu pinned (that don't need to move), task pinned,
         * cpu flexible, task flexible.
         *
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
        if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
                perf_ctx_disable(&cpuctx->ctx, false);
                ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
        }

        perf_event_sched_in(cpuctx, ctx);

        perf_ctx_sched_task_cb(cpuctx->task_ctx, true);

        /* Pairs with the conditional perf_ctx_disable(&cpuctx->ctx) above. */
        if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
                perf_ctx_enable(&cpuctx->ctx, false);

        perf_ctx_enable(ctx, false);

unlock:
        perf_ctx_unlock(cpuctx, ctx);
rcu_unlock:
        rcu_read_unlock();
}

/*
 * Called from scheduler to add the events of the current task
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * keep the event running.
 *
 * @prev: the task that was switched out
 * @task: the task being switched in
 */
void __perf_event_task_sched_in(struct task_struct *prev,
                                struct task_struct *task)
{
        perf_event_context_sched_in(task);

        /* Only when at least one event asked for switch notifications. */
        if (atomic_read(&nr_switch_events))
                perf_event_switch(task, prev, true);

        /* Only when a PMU on this CPU registered a sched callback. */
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(prev, task, true);
}

/*
 * Compute the sample period that would produce event->attr.sample_freq
 * samples per second, given that @count events were observed during @nsec
 * nanoseconds.
 *
 * Both the dividend (@count * NSEC_PER_SEC) and the divisor
 * (@nsec * sample_freq) may exceed 64 bits, so precision is shaved off the
 * operands one bit at a time until each product fits in a u64.
 *
 * Returns the quotient, or the dividend unchanged when the divisor is zero.
 */
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
        u64 frequency = event->attr.sample_freq;
        u64 sec = NSEC_PER_SEC;
        u64 divisor, dividend;

        int count_fls, nsec_fls, frequency_fls, sec_fls;

        count_fls = fls64(count);
        nsec_fls = fls64(nsec);
        frequency_fls = fls64(frequency);
        /* Bit width of NSEC_PER_SEC: 2^29 < 10^9 < 2^30. */
        sec_fls = 30;

        /*
         * We got @count in @nsec, with a target of sample_freq HZ
         * the target period becomes:
         *
         *             @count * 10^9
         * period = -------------------
         *          @nsec * sample_freq
         *
         */

        /*
         * Reduce accuracy by one bit such that @a and @b converge
         * to a similar magnitude.
         */
#define REDUCE_FLS(a, b)                \
do {                                        \
        if (a##_fls > b##_fls) {        \
                a >>= 1;                \
                a##_fls--;                \
        } else {                        \
                b >>= 1;                \
                b##_fls--;                \
        }                                \
} while (0)

        /*
         * Reduce accuracy until either term fits in a u64, then proceed with
         * the other, so that finally we can do a u64/u64 division.
         */
        while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
                REDUCE_FLS(nsec, frequency);
                REDUCE_FLS(sec, count);
        }

        if (count_fls + sec_fls > 64) {
                /* The divisor fits; keep shrinking the dividend's operands. */
                divisor = nsec * frequency;

                while (count_fls + sec_fls > 64) {
                        REDUCE_FLS(count, sec);
                        /* Halve the other side too, to keep the ratio. */
                        divisor >>= 1;
                }

                dividend = count * sec;
        } else {
                /* The dividend fits; keep shrinking the divisor's operands. */
                dividend = count * sec;

                while (nsec_fls + frequency_fls > 64) {
                        REDUCE_FLS(nsec, frequency);
                        /* Halve the other side too, to keep the ratio. */
                        dividend >>= 1;
                }

                divisor = nsec * frequency;
        }

        /* Never divide by zero; hand back the dividend as is. */
        if (!divisor)
                return dividend;

        return div64_u64(dividend, divisor);
}

/*
 * Per-CPU throttling bookkeeping: an int counter and a u64 sequence value.
 * NOTE(review): the readers and writers are outside this part of the file;
 * confirm the exact semantics against them.
 */
static DEFINE_PER_CPU(int, perf_throttled_count);
static DEFINE_PER_CPU(u64, perf_throttled_seq);

/*
 * Move the event's sample period towards the period that matches its target
 * frequency.
 *
 * @event:   event to adjust
 * @nsec:    length of the observation window in nanoseconds
 * @count:   number of events counted during that window
 * @disable: when true the event is stopped and restarted around the
 *           period_left reset; callers that already stopped the event
 *           pass false
 */
static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
{
        struct hw_perf_event *hwc = &event->hw;
        s64 target, new_period;
        s64 step;

        target = perf_calculate_period(event, nsec, count);

        /* low pass filter: only move 1/8th of the way towards the target */
        step = (s64)(target - hwc->sample_period);
        step = (step + 7) / 8;

        new_period = hwc->sample_period + step;
        if (!new_period)
                new_period = 1;

        hwc->sample_period = new_period;

        /* The remaining period is within 8 new periods; leave it alone. */
        if (local64_read(&hwc->period_left) <= 8*new_period)
                return;

        if (disable) {
                event->pmu->stop(event, PERF_EF_UPDATE);
                local64_set(&hwc->period_left, 0);
                event->pmu->start(event, PERF_EF_RELOAD);
        } else {
                local64_set(&hwc->period_left, 0);
        }
}

/*
 * Walk one active list (@event_list, linked through ->active_list) and, for
 * each ACTIVE event that matches the event filter:
 *
 *  - unthrottle it if its interrupt count reached MAX_INTERRUPTS;
 *  - if it is a frequency-mode event (attr.freq with a non-zero
 *    attr.sample_freq), measure how much its count advanced since the last
 *    stamp and re-tune its sample period, using TICK_NSEC as the elapsed
 *    time.
 */
static void perf_adjust_freq_unthr_events(struct list_head *event_list)
{
        struct perf_event *event;
        struct hw_perf_event *hwc;
        u64 now, period = TICK_NSEC;
        s64 delta;

        list_for_each_entry(event, event_list, active_list) {
                if (event->state != PERF_EVENT_STATE_ACTIVE)
                        continue;

                // XXX use visit thingy to avoid the -1,cpu match
                if (!event_filter_match(event))
                        continue;

                hwc = &event->hw;

                /* throttled event: reset the counter and log the unthrottle */
                if (hwc->interrupts == MAX_INTERRUPTS) {
                        hwc->interrupts = 0;
                        perf_log_throttle(event, 1);
                        /*
                         * Non-frequency events are restarted here; frequency
                         * events are restarted at the bottom of the loop.
                         */
                        if (!event->attr.freq || !event->attr.sample_freq)
                                event->pmu->start(event, 0);
                }

                /* nothing more to do for fixed-period events */
                if (!event->attr.freq || !event->attr.sample_freq)
                        continue;

                /*
                 * stop the event and update event->count
                 */
                event->pmu->stop(event, PERF_EF_UPDATE);

                /* events counted since the previous adjustment */
                now = local64_read(&event->count);
                delta = now - hwc->freq_count_stamp;
                hwc->freq_count_stamp = now;

                /*
                 * restart the event
                 * reload only if value has changed
                 * we have stopped the event so tell that
                 * to perf_adjust_period() to avoid stopping it
                 * twice.
                 */
                if (delta > 0)
                        perf_adjust_period(event, period, delta, false);

                event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
        }
}

/*
 * Frequency adjustment and unthrottling are done in one walk over the
 * events so that two passes are not needed. The presence of frequency
 * events must not change how often unthrottling happens, since that would
 * introduce bias.
 */
static void
perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
{
        struct perf_event_pmu_context *epc;
        struct pmu *pmu;

        /*
         * Walking the events is only worthwhile when:
         * - the context has events in frequency mode (needs freq adjust), or
         * - there are events to unthrottle on this cpu
         */
        if (!ctx->nr_freq && !unthrottle)
                return;

        raw_spin_lock(&ctx->lock);

        list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                /* same test as above, per PMU context */
                if (!epc->nr_freq && !unthrottle)
                        continue;
                if (!perf_pmu_ctx_is_active(epc))
                        continue;

                pmu = epc->pmu;
                if (pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
                        continue;

                /* adjust both active lists with the PMU disabled */
                perf_pmu_disable(pmu);
                perf_adjust_freq_unthr_events(&epc->pinned_active);
                perf_adjust_freq_unthr_events(&epc->flexible_active);
                perf_pmu_enable(pmu);
        }

        raw_spin_unlock(&ctx->lock);
}

/*
 * Move @event to the tail of the @ctx's eligible events.
 */
static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
{
        /*
         * The inheritance code may have disabled rotation. Otherwise the
         * entry is re-queued in the non-pinned (flexible) groups by
         * deleting it and inserting it again.
         */
        if (!ctx->rotate_disable) {
                perf_event_groups_delete(&ctx->flexible_groups, event);
                perf_event_groups_insert(&ctx->flexible_groups, event);
        }
}

/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
{
        struct perf_event *event;
        struct rb_node *node;
        struct rb_root *tree;
        struct __group_key key = {
                .pmu = pmu_ctx->pmu,
        };

        /* pick the first active flexible event */
        event = list_first_entry_or_null(&pmu_ctx->flexible_active,
                                         struct perf_event, active_list);
        if (event)
                goto out;

        /* if no active flexible event, pick the first event */
        tree = &pmu_ctx->ctx->flexible_groups.tree;

        if (!pmu_ctx->ctx->task) {
                key.cpu = smp_processor_id();

                node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
                if (node)
                        event = __node_2_pe(node);
                goto out;
        }

        key.cpu = -1;
        node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
        if (node) {
                event = __node_2_pe(node);
                goto out;
        }

        key.cpu = smp_processor_id();
        node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
        if (node)
                event = __node_2_pe(node);

out:
        /*
         * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
         * finds there are unschedulable events, it will set it again.
         */
        pmu_ctx->rotate_necessary = 0;

        return event;
}

/*
 * Rotate the flexible events of the CPU PMU context embedded in @cpc and of
 * the task PMU context currently attached to it (cpc->task_epc, may be
 * NULL).
 *
 * Returns false without taking any lock when neither context has
 * rotate_necessary set; returns true after a rotation pass otherwise.
 */
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
        struct perf_event *cpu_event = NULL, *task_event = NULL;
        int cpu_rotate, task_rotate;
        struct pmu *pmu;

        /*
         * Since we run this from IRQ context, nobody can install new
         * events, thus the event count values are stable.
         */

        cpu_epc = &cpc->epc;
        pmu = cpu_epc->pmu;
        task_epc = cpc->task_epc;

        cpu_rotate = cpu_epc->rotate_necessary;
        task_rotate = task_epc ? task_epc->rotate_necessary : 0;

        if (!(cpu_rotate || task_rotate))
                return false;

        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(pmu);

        /* choose the events to rotate; this also clears rotate_necessary */
        if (task_rotate)
                task_event = ctx_event_to_rotate(task_epc);
        if (cpu_rotate)
                cpu_event = ctx_event_to_rotate(cpu_epc);

        /*
         * As per the order given at ctx_resched() first 'pop' task flexible
         * and then, if needed CPU flexible.
         */
        if (task_event || (task_epc && cpu_event)) {
                update_context_time(task_epc->ctx);
                __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
        }

        if (cpu_event) {
                update_context_time(&cpuctx->ctx);
                __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
                rotate_ctx(&cpuctx->ctx, cpu_event);
                __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
        }

        if (task_event)
                rotate_ctx(task_epc->ctx, task_event);

        /* task flexible events go back in whenever they were taken out above */
        if (task_event || (task_epc && cpu_event))
                __pmu_ctx_sched_in(task_epc->ctx, pmu);

        perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);

        return true;
}

void perf_event_task_tick(void)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *ctx;
        int throttled;

        lockdep_assert_irqs_disabled();

        __this_cpu_inc(perf_throttled_seq);
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
        tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);

        perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);

        rcu_read_lock();
        ctx = rcu_dereference(current->perf_event_ctxp);
        if (ctx)
                perf_adjust_freq_unthr_context(ctx, !!throttled);
        rcu_read_unlock();
}

/*
 * Handle the enable-on-exec attribute of a single @event.
 *
 * The attribute is consumed (cleared) on first use. Returns 1 if the event
 * was below PERF_EVENT_STATE_INACTIVE and has been moved to INACTIVE,
 * 0 otherwise. @ctx is not used here.
 */
static int event_enable_on_exec(struct perf_event *event,
                                struct perf_event_context *ctx)
{
        int enabled = 0;

        if (event->attr.enable_on_exec) {
                event->attr.enable_on_exec = 0;

                if (event->state < PERF_EVENT_STATE_INACTIVE) {
                        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
                        enabled = 1;
                }
        }

        return enabled;
}

/*
 * Enable all of a task's events that have been marked enable-on-exec.
 * This expects task == current.
 *
 * Runs with interrupts disabled and the context locked through
 * perf_ctx_lock(). If at least one event was enabled, the context is
 * uncloned and rescheduled; otherwise only EVENT_TIME is scheduled back in.
 */
static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
        struct perf_event_context *clone_ctx = NULL;
        enum event_type_t event_type = 0;
        struct perf_cpu_context *cpuctx;
        struct perf_event *event;
        unsigned long flags;
        int enabled = 0;

        local_irq_save(flags);
        /* @ctx must be the current task's context */
        if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
                goto out;

        /* nothing to do for an empty context */
        if (!ctx->nr_events)
                goto out;

        cpuctx = this_cpu_ptr(&perf_cpu_context);
        perf_ctx_lock(cpuctx, ctx);
        ctx_sched_out(ctx, EVENT_TIME);

        /* collect whether anything was enabled and which event types exist */
        list_for_each_entry(event, &ctx->event_list, event_entry) {
                enabled |= event_enable_on_exec(event, ctx);
                event_type |= get_event_type(event);
        }

        /*
         * Unclone and reschedule this context if we enabled any event.
         */
        if (enabled) {
                clone_ctx = unclone_ctx(ctx);
                ctx_resched(cpuctx, ctx, event_type);
        } else {
                ctx_sched_in(ctx, EVENT_TIME);
        }
        perf_ctx_unlock(cpuctx, ctx);

out:
        local_irq_restore(flags);

        /* drop the reference returned by unclone_ctx() outside the locks */
        if (clone_ctx)
                put_ctx(clone_ctx);
}

static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
                                  struct perf_event_context *ctx);

/*
 * Removes all events from the current task that have been marked
 * remove-on-exec, and feeds their values back to parent events.
 *
 * Holds ctx->mutex for the whole walk. If any event was removed, the
 * context is uncloned under ctx->lock and the returned clone reference is
 * dropped after the mutex is released.
 */
static void perf_event_remove_on_exec(struct perf_event_context *ctx)
{
        struct perf_event_context *clone_ctx = NULL;
        struct perf_event *event, *next;
        unsigned long flags;
        bool modified = false;

        mutex_lock(&ctx->mutex);

        /* only valid on the current task's own context */
        if (WARN_ON_ONCE(ctx->task != current))
                goto unlock;

        /* _safe iteration: the loop body hands each event to the exit path */
        list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
                if (!event->attr.remove_on_exec)
                        continue;

                if (!is_kernel_event(event))
                        perf_remove_from_owner(event);

                modified = true;

                perf_event_exit_event(event, ctx);
        }

        raw_spin_lock_irqsave(&ctx->lock, flags);
        if (modified)
                clone_ctx = unclone_ctx(ctx);
        raw_spin_unlock_irqrestore(&ctx->lock, flags);

unlock:
        mutex_unlock(&ctx->mutex);

        if (clone_ctx)
                put_ctx(clone_ctx);
}

/*
 * Argument block handed from perf_event_read() to the cross-CPU callback
 * __perf_event_read().
 */
struct perf_read_data {
        /* event to read */
        struct perf_event *event;
        /* also update and read the event's siblings */
        bool group;
        /* result: 0, or the value returned by pmu->commit_txn() for group reads */
        int ret;
};

/*
 * Decide which CPU should be used to read @event.
 *
 * @event_cpu is returned unchanged if it is not a valid CPU number, or if
 * the event lacks PERF_EV_CAP_READ_ACTIVE_PKG. With that capability, the
 * current CPU is returned instead whenever it shares its physical package
 * id with @event_cpu.
 */
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
        u16 pkg_of_event, pkg_of_this;
        int this_cpu;

        /* negative or out-of-range values are passed straight through */
        if ((unsigned)event_cpu >= nr_cpu_ids)
                return event_cpu;

        if (!(event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG))
                return event_cpu;

        this_cpu = smp_processor_id();
        pkg_of_event = topology_physical_package_id(event_cpu);
        pkg_of_this = topology_physical_package_id(this_cpu);

        return pkg_of_event == pkg_of_this ? this_cpu : event_cpu;
}

/*
 * Cross CPU call to read the hardware event
 *
 * @info points to a struct perf_read_data. On return data->ret is 0 for a
 * single-event read, or the result of pmu->commit_txn() for a group read.
 * data->ret is left untouched when the function bails out early.
 */
static void __perf_event_read(void *info)
{
        struct perf_read_data *data = info;
        struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct pmu *pmu = event->pmu;

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu.  If not it has been
         * scheduled out before the smp call arrived.  In that case
         * event->count would have been updated to a recent sample
         * when the event was scheduled out.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;

        raw_spin_lock(&ctx->lock);
        /* refresh context (and cgroup) time only while time is running */
        if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }

        perf_event_update_time(event);
        if (data->group)
                perf_event_update_sibling_time(event);

        /* only ACTIVE events have a hardware counter to read */
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                goto unlock;

        /* single event: plain read, no transaction */
        if (!data->group) {
                pmu->read(event);
                data->ret = 0;
                goto unlock;
        }

        /* group read: leader and active siblings inside one read transaction */
        pmu->start_txn(pmu, PERF_PMU_TXN_READ);

        pmu->read(event);

        for_each_sibling_event(sub, event) {
                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
                        /*
                         * Use sibling's PMU rather than @event's since
                         * sibling could be on different (eg: software) PMU.
                         */
                        sub->pmu->read(sub);
                }
        }

        data->ret = pmu->commit_txn(pmu);

unlock:
        raw_spin_unlock(&ctx->lock);
}

/*
 * Total count of @event: its own count plus the accumulated child_count.
 */
static inline u64 perf_event_count(struct perf_event *event)
{
        u64 own = local64_read(&event->count);
        u64 children = atomic64_read(&event->child_count);

        return own + children;
}

/*
 * Sample perf_clock() into @now and derive the @enabled and @running times
 * of @event for that instant.
 */
static void calc_timer_values(struct perf_event *event,
                                u64 *now,
                                u64 *enabled,
                                u64 *running)
{
        u64 stamp = perf_clock();

        *now = stamp;
        __perf_update_times(event, perf_event_time_now(event, stamp),
                            enabled, running);
}

/*
 * NMI-safe method to read a local event, that is an event that
 * is:
 *   - either for the current task, or for this CPU
 *   - does not have inherit set, for inherited task events
 *     will not be local and we cannot read them atomically
 *   - must not have a pmu::count method
 *
 * On success 0 is returned, *@value holds the event count and, when the
 * pointers are non-NULL, *@enabled and *@running hold the event times.
 *
 * Errors:
 *   -EOPNOTSUPP  the event has attr.inherit set
 *   -EINVAL      per-task event not targeting current, or per-CPU event
 *                that is not for this CPU
 *   -EBUSY       pinned event that is not running on this CPU
 */
int perf_event_read_local(struct perf_event *event, u64 *value,
                          u64 *enabled, u64 *running)
{
        unsigned long flags;
        int event_oncpu;
        int event_cpu;
        int ret = 0;

        /*
         * Disabling interrupts avoids all counter scheduling (context
         * switches, timer based rotation and IPIs).
         */
        local_irq_save(flags);

        /*
         * It must not be an event with inherit set, we cannot read
         * all child counters from atomic context.
         */
        if (event->attr.inherit) {
                ret = -EOPNOTSUPP;
                goto out;
        }

        /* If this is a per-task event, it must be for current */
        if ((event->attach_state & PERF_ATTACH_TASK) &&
            event->hw.target != current) {
                ret = -EINVAL;
                goto out;
        }

        /*
         * Get the event CPU numbers, and adjust them to local if the event is
         * a per-package event that can be read locally
         */
        event_oncpu = __perf_event_read_cpu(event, event->oncpu);
        event_cpu = __perf_event_read_cpu(event, event->cpu);

        /* If this is a per-CPU event, it must be for this CPU */
        if (!(event->attach_state & PERF_ATTACH_TASK) &&
            event_cpu != smp_processor_id()) {
                ret = -EINVAL;
                goto out;
        }

        /* If this is a pinned event it must be running on this CPU */
        if (event->attr.pinned && event_oncpu != smp_processor_id()) {
                ret = -EBUSY;
                goto out;
        }

        /*
         * If the event is currently on this CPU, its either a per-task event,
         * or local to this CPU. Furthermore it means its ACTIVE (otherwise
         * oncpu == -1).
         */
        if (event_oncpu == smp_processor_id())
                event->pmu->read(event);

        *value = local64_read(&event->count);
        /* times are only computed when the caller asked for at least one */
        if (enabled || running) {
                u64 __enabled, __running, __now;

                calc_timer_values(event, &__now, &__enabled, &__running);
                if (enabled)
                        *enabled = __enabled;
                if (running)
                        *running = __running;
        }
out:
        local_irq_restore(flags);

        return ret;
}

/*
 * Bring the count and times stored in @event up to date.
 *
 * ACTIVE events are read through a synchronous cross-CPU call to
 * __perf_event_read() on the CPU they run on (or on a CPU chosen by
 * __perf_event_read_cpu()). INACTIVE events only get their times refreshed
 * under ctx->lock. Other states need no work.
 *
 * With @group set, the siblings of @event are handled as well.
 *
 * Returns the ret value filled in by __perf_event_read() for the ACTIVE
 * case, 0 otherwise.
 */
static int perf_event_read(struct perf_event *event, bool group)
{
        enum perf_event_state state = READ_ONCE(event->state);
        int event_cpu, ret = 0;

        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
again:
        if (state == PERF_EVENT_STATE_ACTIVE) {
                struct perf_read_data data;

                /*
                 * Orders the ->state and ->oncpu loads such that if we see
                 * ACTIVE we must also see the right ->oncpu.
                 *
                 * Matches the smp_wmb() from event_sched_in().
                 */
                smp_rmb();

                event_cpu = READ_ONCE(event->oncpu);
                /* not on a valid CPU (any more): nothing to read remotely */
                if ((unsigned)event_cpu >= nr_cpu_ids)
                        return 0;

                data = (struct perf_read_data){
                        .event = event,
                        .group = group,
                        .ret = 0,
                };

                /* __perf_event_read_cpu() uses smp_processor_id() */
                preempt_disable();
                event_cpu = __perf_event_read_cpu(event, event_cpu);

                /*
                 * Purposely ignore the smp_call_function_single() return
                 * value.
                 *
                 * If event_cpu isn't a valid CPU it means the event got
                 * scheduled out and that will have updated the event count.
                 *
                 * Therefore, either way, we'll have an up-to-date event count
                 * after this.
                 */
                (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
                preempt_enable();
                ret = data.ret;

        } else if (state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;

                raw_spin_lock_irqsave(&ctx->lock, flags);
                /* re-check under the lock; the state may have changed meanwhile */
                state = event->state;
                if (state != PERF_EVENT_STATE_INACTIVE) {
                        raw_spin_unlock_irqrestore(&ctx->lock, flags);
                        goto again;
                }

                /*
                 * May read while context is not active (e.g., thread is
                 * blocked), in that case we cannot update context time
                 */
                if (ctx->is_active & EVENT_TIME) {
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
                }

                perf_event_update_time(event);
                if (group)
                        perf_event_update_sibling_time(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }

        return ret;
}

/*
 * Set up the locks, lists, group trees and reference count of a
 * perf_event_context. The initial reference count is 1.
 */
static void __perf_event_init_context(struct perf_event_context *ctx)
{
        /* locking */
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);

        /* plain lists */
        INIT_LIST_HEAD(&ctx->pmu_ctx_list);
        INIT_LIST_HEAD(&ctx->event_list);

        /* event group containers */
        perf_event_groups_init(&ctx->pinned_groups);
        perf_event_groups_init(&ctx->flexible_groups);

        refcount_set(&ctx->refcount, 1);
}

/*
 * Initialize a PMU context @epc for @pmu: empty lists and a reference
 * count of 1.
 */
static void
__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
{
        /* list linkage and the two active lists start out empty */
        INIT_LIST_HEAD(&epc->pmu_ctx_entry);
        INIT_LIST_HEAD(&epc->pinned_active);
        INIT_LIST_HEAD(&epc->flexible_active);

        epc->pmu = pmu;
        atomic_set(&epc->refcount, 1);
}

/*
 * Allocate and initialize a zeroed perf_event_context. When @task is
 * non-NULL a task reference is taken and stored in ctx->task.
 *
 * Returns NULL if the allocation fails.
 */
static struct perf_event_context *
alloc_perf_context(struct task_struct *task)
{
        struct perf_event_context *ctx;

        ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
        if (ctx) {
                __perf_event_init_context(ctx);
                if (task)
                        ctx->task = get_task_struct(task);
        }

        return ctx;
}

/*
 * Look up the task for @vpid and take a reference on it. A @vpid of 0
 * selects current.
 *
 * Returns the task, or ERR_PTR(-ESRCH) if no such task exists.
 */
static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
        struct task_struct *task;

        /* the lookup and the reference grab both happen under RCU */
        rcu_read_lock();
        task = vpid ? find_task_by_vpid(vpid) : current;
        if (task)
                get_task_struct(task);
        rcu_read_unlock();

        if (task)
                return task;

        return ERR_PTR(-ESRCH);
}

/*
 * Returns a matching context with refcount and pincount.
 *
 * With @task == NULL the per-CPU context of event->cpu is used, after
 * perf_allow_cpu() has approved the request. Otherwise the task's existing
 * context is used, or a new one is allocated and published in
 * task->perf_event_ctxp.
 *
 * Returns the context, or an ERR_PTR(): the perf_allow_cpu() error,
 * -ENOMEM if allocation fails, or -ESRCH if the task is exiting.
 */
static struct perf_event_context *
find_get_context(struct task_struct *task, struct perf_event *event)
{
        struct perf_event_context *ctx, *clone_ctx = NULL;
        struct perf_cpu_context *cpuctx;
        unsigned long flags;
        int err;

        if (!task) {
                /* Must be root to operate on a CPU event: */
                err = perf_allow_cpu(&event->attr);
                if (err)
                        return ERR_PTR(err);

                cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
                ctx = &cpuctx->ctx;
                get_ctx(ctx);
                raw_spin_lock_irqsave(&ctx->lock, flags);
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);

                return ctx;
        }

        err = -EINVAL;
retry:
        ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                /* existing context: unclone and pin it while ctx->lock is held */
                clone_ctx = unclone_ctx(ctx);
                ++ctx->pin_count;

                raw_spin_unlock_irqrestore(&ctx->lock, flags);

                if (clone_ctx)
                        put_ctx(clone_ctx);
        } else {
                ctx = alloc_perf_context(task);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;

                err = 0;
                mutex_lock(&task->perf_event_mutex);
                /*
                 * If the task has already passed perf_event_exit_task(),
                 * we must see PF_EXITING here, since it takes this mutex too.
                 */
                if (task->flags & PF_EXITING)
                        err = -ESRCH;
                else if (task->perf_event_ctxp)
                        err = -EAGAIN;
                else {
                        /* extra reference for the task's pointer to the context */
                        get_ctx(ctx);
                        ++ctx->pin_count;
                        rcu_assign_pointer(task->perf_event_ctxp, ctx);
                }
                mutex_unlock(&task->perf_event_mutex);

                if (unlikely(err)) {
                        /* the new context was not installed; drop it */
                        put_ctx(ctx);

                        /* a context appeared in the meantime: use that one */
                        if (err == -EAGAIN)
                                goto retry;
                        goto errout;
                }
        }

        return ctx;

errout:
        return ERR_PTR(err);
}

/*
 * Find the PMU context for @pmu within @ctx and take a reference on it,
 * creating and linking it if it does not exist yet.
 *
 * For CPU contexts (ctx->task == NULL) the PMU context embedded in the
 * per-CPU perf_cpu_pmu_context of event->cpu is used and this path cannot
 * fail. For task contexts a candidate is allocated up front and freed again
 * if an existing entry for @pmu is found on ctx->pmu_ctx_list.
 *
 * If @event has PERF_ATTACH_TASK_DATA, task context data is allocated and
 * attached to the PMU context unless it already has some.
 *
 * Returns the PMU context, or ERR_PTR(-ENOMEM) on allocation failure.
 */
static struct perf_event_pmu_context *
find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
                     struct perf_event *event)
{
        struct perf_event_pmu_context *new = NULL, *epc;
        void *task_ctx_data = NULL;

        if (!ctx->task) {
                /*
                 * perf_pmu_migrate_context() / __perf_pmu_install_event()
                 * relies on the fact that find_get_pmu_context() cannot fail
                 * for CPU contexts.
                 */
                struct perf_cpu_pmu_context *cpc;

                cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
                epc = &cpc->epc;
                raw_spin_lock_irq(&ctx->lock);
                if (!epc->ctx) {
                        /* first user: link the embedded context into @ctx */
                        atomic_set(&epc->refcount, 1);
                        epc->embedded = 1;
                        list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
                        epc->ctx = ctx;
                } else {
                        WARN_ON_ONCE(epc->ctx != ctx);
                        atomic_inc(&epc->refcount);
                }
                raw_spin_unlock_irq(&ctx->lock);
                return epc;
        }

        /* allocate before taking ctx->lock; freed below if not needed */
        new = kzalloc(sizeof(*epc), GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);

        if (event->attach_state & PERF_ATTACH_TASK_DATA) {
                task_ctx_data = alloc_task_ctx_data(pmu);
                if (!task_ctx_data) {
                        kfree(new);
                        return ERR_PTR(-ENOMEM);
                }
        }

        __perf_init_event_pmu_context(new, pmu);

        /*
         * XXX
         *
         * lockdep_assert_held(&ctx->mutex);
         *
         * can't because perf_event_init_task() doesn't actually hold the
         * child_ctx->mutex.
         */

        raw_spin_lock_irq(&ctx->lock);
        list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                if (epc->pmu == pmu) {
                        WARN_ON_ONCE(epc->ctx != ctx);
                        atomic_inc(&epc->refcount);
                        goto found_epc;
                }
        }

        /* no existing entry: install the new one, ownership moves to the list */
        epc = new;
        new = NULL;

        list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
        epc->ctx = ctx;

found_epc:
        /* hand over the task data only if the PMU context has none yet */
        if (task_ctx_data && !epc->task_ctx_data) {
                epc->task_ctx_data = task_ctx_data;
                task_ctx_data = NULL;
                ctx->nr_task_data++;
        }
        raw_spin_unlock_irq(&ctx->lock);

        /* release whatever was allocated but not consumed above */
        free_task_ctx_data(pmu, task_ctx_data);
        kfree(new);

        return epc;
}

static void get_pmu_ctx(struct perf_event_pmu_context *epc)
{
        WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
}

static void free_epc_rcu(struct rcu_head *head)
{
        struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);

        kfree(epc->task_ctx_data);
        kfree(epc);
}

/*
 * Drop a reference on @epc. When the last reference goes away the PMU
 * context is unlinked from its perf_event_context under ctx->lock and,
 * unless it is an embedded (per-CPU) one, freed after an RCU grace period.
 */
static void put_pmu_ctx(struct perf_event_pmu_context *epc)
{
        struct perf_event_context *ctx = epc->ctx;
        unsigned long flags;

        /*
         * XXX
         *
         * lockdep_assert_held(&ctx->mutex);
         *
         * can't because of the call-site in _free_event()/put_event()
         * which isn't always called under ctx->mutex.
         */
        /* not the last reference: done; otherwise ctx->lock is now held */
        if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))
                return;

        WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));

        list_del_init(&epc->pmu_ctx_entry);
        epc->ctx = NULL;

        /* no event may still be active on a PMU context that is going away */
        WARN_ON_ONCE(!list_empty(&epc->pinned_active));
        WARN_ON_ONCE(!list_empty(&epc->flexible_active));

        raw_spin_unlock_irqrestore(&ctx->lock, flags);

        /* embedded contexts are not separately allocated; never free them */
        if (epc->embedded)
                return;

        call_rcu(&epc->rcu_head, free_epc_rcu);
}

static void perf_event_free_filter(struct perf_event *event);

static void free_event_rcu(struct rcu_head *head)
{
        struct perf_event *event = container_of(head, typeof(*event), rcu_head);

        if (event->ns)
                put_pid_ns(event->ns);
        perf_event_free_filter(event);
        kmem_cache_free(perf_event_cache, event);
}

static void ring_buffer_attach(struct perf_event *event,
                               struct perf_buffer *rb);

static void detach_sb_event(struct perf_event *event)
{
        struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);

        raw_spin_lock(&pel->lock);
        list_del_rcu(&event->sb_list);
        raw_spin_unlock(&pel->lock);
}

static bool is_sb_event(struct perf_event *event)
{
        struct perf_event_attr *attr = &event->attr;

        if (event->parent)
                return false;

        if (event->attach_state & PERF_ATTACH_TASK)
                return false;

        if (attr->mmap || attr->mmap_data || attr->mmap2 ||
            attr->comm || attr->comm_exec ||
            attr->task || attr->ksymbol ||
            attr->context_switch || attr->text_poke ||
            attr->bpf_event)
                return true;
        return false;
}

/*
 * Detach @event from the pmu_sb_events list if is_sb_event() says it
 * belongs there.
 */
static void unaccount_pmu_sb_event(struct perf_event *event)
{
        if (!is_sb_event(event))
                return;

        detach_sb_event(event);
}

#ifdef CONFIG_NO_HZ_FULL
/*
 * Held by unaccount_freq_event_nohz() around the final decrement of
 * nr_freq_events and the clearing of the perf tick dependency.
 * NOTE(review): presumably the accounting side takes it as well; confirm
 * against the caller that increments nr_freq_events.
 */
static DEFINE_SPINLOCK(nr_freq_lock);
#endif

/*
 * Drop one frequency event from nr_freq_events and, if it was the last one,
 * clear the perf tick dependency. Both steps happen under nr_freq_lock.
 * Compiles to nothing without CONFIG_NO_HZ_FULL.
 */
static void unaccount_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
        bool was_last;

        spin_lock(&nr_freq_lock);
        was_last = atomic_dec_and_test(&nr_freq_events);
        if (was_last)
                tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
        spin_unlock(&nr_freq_lock);
#endif
}

/*
 * Drop one frequency event from the accounting. With nohz-full enabled the
 * tick-dependency aware variant is used; otherwise the counter is simply
 * decremented.
 */
static void unaccount_freq_event(void)
{
        if (!tick_nohz_full_enabled()) {
                atomic_dec(&nr_freq_events);
                return;
        }

        unaccount_freq_event_nohz();
}

static void unaccount_event(struct perf_event *event)
{
        bool dec = false;

        if (event->parent)
                return;

        if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
                dec = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_dec(&nr_mmap_events);
        if (event->attr.build_id)
                atomic_dec(&nr_build_id_events);
        if (event->attr.comm)
                atomic_dec(&nr_comm_events);
        if (event->attr.namespaces)
                atomic_dec(&nr_namespaces_events);
        if (event->attr.cgroup)
                atomic_dec(&nr_cgroup_events);
        if (event->attr.task)
                atomic_dec(&nr_task_events);
        if (event->attr.freq)
                unaccount_freq_event();
        if (event->attr.context_switch) {
                dec = true;
                atomic_dec(&nr_switch_events);
        }
        if (is_cgroup_event(event))
                dec = true;
        if (has_branch_stack(event))
                dec = true;
        if (event->attr.ksymbol)
                atomic_dec(&nr_ksymbol_events);
        if (event->attr.bpf_event)
                atomic_dec(&nr_bpf_events);
        if (event->attr.text_poke)
                atomic_dec(&nr_text_poke_events);

        if (dec) {
                if (!atomic_add_unless(&perf_sched_count, -1, 1))
                        schedule_delayed_work(&perf_sched_work, HZ);
        }

        unaccount_pmu_sb_event(event);
}

static void perf_sched_delayed(struct work_struct *work)
{
        mutex_lock(&perf_sched_mutex);
        if (atomic_dec_and_test(&perf_sched_count))
                static_branch_disable(&perf_sched_events);
        mutex_unlock(&perf_sched_mutex);
}

/*
 * The following implement mutual exclusion of events on "exclusive" pmus
 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
 * at a time, so we disallow creating events that might conflict, namely:
 *
 *  1) cpu-wide events in the presence of per-task events,
 *  2) per-task events in the presence of cpu-wide events,
 *  3) two matching events on the same perf_event_context.
 *
 * The former two cases are handled in the allocation path (perf_event_alloc(),
 * _free_event()), the latter -- before the first perf_install_in_context().
 */
static int exclusive_event_init(struct perf_event *event)
{
        struct pmu *pmu = event->pmu;

        if (!is_exclusive_pmu(pmu))
                return 0;

        /*
         * Prevent co-existence of per-task and cpu-wide events on the
         * same exclusive pmu.
         *
         * Negative pmu::exclusive_cnt means there are cpu-wide
         * events on this "exclusive" pmu, positive means there are
         * per-task events.
         *
         * Since this is called in perf_event_alloc() path, event::ctx
         * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
         * to mean "per-task event", because unlike other attach states it
         * never gets cleared.
         */
        if (event->attach_state & PERF_ATTACH_TASK) {
                if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
                        return -EBUSY;
        } else {
                if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
                        return -EBUSY;
        }

        return 0;
}

/*
 * Undo the accounting done by exclusive_event_init(): per-task events
 * counted upwards, cpu-wide events counted downwards, so reverse that.
 */
static void exclusive_event_destroy(struct perf_event *event)
{
        struct pmu *pmu = event->pmu;

        if (is_exclusive_pmu(pmu)) {
                /* see comment in exclusive_event_init() */
                if (event->attach_state & PERF_ATTACH_TASK)
                        atomic_dec(&pmu->exclusive_cnt);
                else
                        atomic_inc(&pmu->exclusive_cnt);
        }
}

/*
 * Two events conflict when they sit on the same pmu and their cpu
 * selections overlap: either the cpus are equal or one of them is -1.
 */
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
{
        if (e1->pmu != e2->pmu)
                return false;

        return e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1;
}

/*
 * Check whether @event may be added to @ctx: for an exclusive pmu this is
 * only allowed when no event already on ctx->event_list matches it (see
 * exclusive_event_match()). Must be called with ctx->mutex held.
 */
static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx)
{
        struct pmu *pmu = event->pmu;
        struct perf_event *other;

        lockdep_assert_held(&ctx->mutex);

        if (is_exclusive_pmu(pmu)) {
                list_for_each_entry(other, &ctx->event_list, event_entry) {
                        if (exclusive_event_match(other, event))
                                return false;
                }
        }

        return true;
}

static void perf_addr_filters_splice(struct perf_event *event,
                                       struct list_head *head);

/*
 * Release everything @event holds and hand the object itself to
 * free_event_rcu() through call_rcu(). Callers in this file (free_event(),
 * put_event()) only get here after the refcount has dropped to zero.
 *
 * The order of the steps below matters; see the inline comments.
 */
static void _free_event(struct perf_event *event)
{
        /* Wait for any pending_irq work before dismantling the event. */
        irq_work_sync(&event->pending_irq);

        unaccount_event(event);

        security_perf_event_free(event);

        if (event->rb) {
                /*
                 * Can happen when we close an event with re-directed output.
                 *
                 * Since we have a 0 refcount, perf_mmap_close() will skip
                 * over us; possibly making our ring_buffer_put() the last.
                 */
                mutex_lock(&event->mmap_mutex);
                ring_buffer_attach(event, NULL);
                mutex_unlock(&event->mmap_mutex);
        }

        if (is_cgroup_event(event))
                perf_detach_cgroup(event);

        /* Only events without a parent drop the callchain buffers here. */
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
                        put_callchain_buffers();
        }

        perf_event_free_bpf_prog(event);
        perf_addr_filters_splice(event, NULL);
        kfree(event->addr_filter_ranges);

        /* Optional destroy callback stored on the event. */
        if (event->destroy)
                event->destroy(event);

        /*
         * Must be after ->destroy(), due to uprobe_perf_close() using
         * hw.target.
         */
        if (event->hw.target)
                put_task_struct(event->hw.target);

        if (event->pmu_ctx)
                put_pmu_ctx(event->pmu_ctx);

        /*
         * perf_event_free_task() relies on put_ctx() being 'last', in particular
         * all task references must be cleaned up.
         */
        if (event->ctx)
                put_ctx(event->ctx);

        exclusive_event_destroy(event);
        module_put(event->pmu->module);

        /* Defer freeing of the event object itself to an RCU callback. */
        call_rcu(&event->rcu_head, free_event_rcu);
}

/*
 * Used to free events which have a known refcount of 1, such as in error paths
 * where the event isn't exposed yet and inherited events.
 */
static void free_event(struct perf_event *event)
{
        /* Expect exactly one reference and atomically take it to zero. */
        long prev = atomic_long_cmpxchg(&event->refcount, 1, 0);

        /*
         * Anything other than 1 means somebody else still holds (or already
         * dropped) a reference: warn and leak to avoid use-after-free.
         */
        if (WARN(prev != 1,
                                "unexpected event refcount: %ld; ptr=%p\n",
                                atomic_long_read(&event->refcount), event))
                return;

        _free_event(event);
}

/*
 * Remove user event from the owner task.
 */
/*
 * Unlink @event from its owner task's list (event->owner_entry) and clear
 * event->owner, unless the owner has already been cleared.
 *
 * A reference on the owner task is taken under rcu_read_lock() so that
 * owner->perf_event_mutex can be acquired afterwards; the reference is
 * dropped again before returning.
 */
static void perf_remove_from_owner(struct perf_event *event)
{
        struct task_struct *owner;

        rcu_read_lock();
        /*
         * Matches the smp_store_release() in perf_event_exit_task(). If we
         * observe !owner it means the list deletion is complete and we can
         * indeed free this event, otherwise we need to serialize on
         * owner->perf_event_mutex.
         */
        owner = READ_ONCE(event->owner);
        if (owner) {
                /*
                 * Since delayed_put_task_struct() also drops the last
                 * task reference we can safely take a new reference
                 * while holding the rcu_read_lock().
                 */
                get_task_struct(owner);
        }
        rcu_read_unlock();

        if (owner) {
                /*
                 * If we're here through perf_event_exit_task() we're already
                 * holding ctx->mutex which would be an inversion wrt. the
                 * normal lock order.
                 *
                 * However we can safely take this lock because it's the child
                 * ctx->mutex.
                 */
                mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);

                /*
                 * We have to re-check the event->owner field, if it is cleared
                 * we raced with perf_event_exit_task(), acquiring the mutex
                 * ensured they're done, and we can proceed with freeing the
                 * event.
                 */
                if (event->owner) {
                        list_del_init(&event->owner_entry);
                        smp_store_release(&event->owner, NULL);
                }
                mutex_unlock(&owner->perf_event_mutex);
                /* Drop the task reference taken under rcu_read_lock() above. */
                put_task_struct(owner);
        }
}

/*
 * Drop one reference on @event; whoever drops the final one tears the
 * event down via _free_event().
 */
static void put_event(struct perf_event *event)
{
        if (atomic_long_dec_and_test(&event->refcount))
                _free_event(event);
}

/*
 * Kill an event dead; while event:refcount will preserve the event
 * object, it will not preserve its functionality. Once the last 'user'
 * gives up the object, we'll destroy the thing.
 */
/*
 * Steps, in order: detach @event from its owner (user events only) and
 * from its context, then repeatedly detach the first child on
 * event->child_list onto a private free_list, free the collected children,
 * and finally drop the caller's reference on @event.
 *
 * Always returns 0.
 */
int perf_event_release_kernel(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *child, *tmp;
        LIST_HEAD(free_list);

        /*
         * If we got here through err_alloc: free_event(event); we will not
         * have attached to a context yet.
         */
        if (!ctx) {
                WARN_ON_ONCE(event->attach_state &
                                (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
                goto no_ctx;
        }

        if (!is_kernel_event(event))
                perf_remove_from_owner(event);

        ctx = perf_event_ctx_lock(event);
        WARN_ON_ONCE(ctx->parent_ctx);

        /*
         * Mark this event as STATE_DEAD, there is no external reference to it
         * anymore.
         *
         * Anybody acquiring event->child_mutex after the below loop _must_
         * also see this, most importantly inherit_event() which will avoid
         * placing more children on the list.
         *
         * Thus this guarantees that we will in fact observe and kill _ALL_
         * child events.
         */
        perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);

        perf_event_ctx_unlock(event, ctx);

again:
        mutex_lock(&event->child_mutex);
        /*
         * Every pass through the body ends in 'goto again', so this loop only
         * ever handles the current first entry of child_list.
         */
        list_for_each_entry(child, &event->child_list, child_list) {
                void *var = NULL;

                /*
                 * Cannot change, child events are not migrated, see the
                 * comment with perf_event_ctx_lock_nested().
                 */
                ctx = READ_ONCE(child->ctx);
                /*
                 * Since child_mutex nests inside ctx::mutex, we must jump
                 * through hoops. We start by grabbing a reference on the ctx.
                 *
                 * Since the event cannot get freed while we hold the
                 * child_mutex, the context must also exist and have a !0
                 * reference count.
                 */
                get_ctx(ctx);

                /*
                 * Now that we have a ctx ref, we can drop child_mutex, and
                 * acquire ctx::mutex without fear of it going away. Then we
                 * can re-acquire child_mutex.
                 */
                mutex_unlock(&event->child_mutex);
                mutex_lock(&ctx->mutex);
                mutex_lock(&event->child_mutex);

                /*
                 * Now that we hold ctx::mutex and child_mutex, revalidate our
                 * state, if child is still the first entry, it didn't get freed
                 * and we can continue doing so.
                 */
                tmp = list_first_entry_or_null(&event->child_list,
                                               struct perf_event, child_list);
                if (tmp == child) {
                        perf_remove_from_context(child, DETACH_GROUP);
                        list_move(&child->child_list, &free_list);
                        /*
                         * This matches the refcount bump in inherit_event();
                         * this can't be the last reference.
                         */
                        put_event(event);
                } else {
                        /* The list changed under us; remember what to wake. */
                        var = &ctx->refcount;
                }

                mutex_unlock(&event->child_mutex);
                mutex_unlock(&ctx->mutex);
                put_ctx(ctx);

                if (var) {
                        /*
                         * If perf_event_free_task() has deleted all events from the
                         * ctx while the child_mutex got released above, make sure to
                         * notify about the preceding put_ctx().
                         */
                        smp_mb(); /* pairs with wait_var_event() */
                        wake_up_var(var);
                }
                /* child_mutex was dropped above: restart from the list head. */
                goto again;
        }
        mutex_unlock(&event->child_mutex);

        /* Free the detached children now that no mutex is held. */
        list_for_each_entry_safe(child, tmp, &free_list, child_list) {
                /* Take the address before free_event() releases the child. */
                void *var = &child->ctx->refcount;

                list_del(&child->child_list);
                free_event(child);

                /*
                 * Wake any perf_event_free_task() waiting for this event to be
                 * freed.
                 */
                smp_mb(); /* pairs with wait_var_event() */
                wake_up_var(var);
        }

no_ctx:
        put_event(event); /* Must be the 'last' reference */
        return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);

/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
{
        struct perf_event *event = file->private_data;

        /* The event hangs off the file; its release result is not propagated. */
        perf_event_release_kernel(event);

        return 0;
}

/*
 * Sum the count of @event and of every event on its child_list, and report
 * the matching enabled/running times through @enabled and @running. The
 * whole walk runs under event->child_mutex.
 *
 * Returns the summed count.
 */
static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
        struct perf_event *kid;
        u64 sum = 0;

        *enabled = 0;
        *running = 0;

        mutex_lock(&event->child_mutex);

        /* The event itself, plus its accumulated child_total_time_* values. */
        (void)perf_event_read(event, false);
        sum += perf_event_count(event);

        *enabled += event->total_time_enabled +
                        atomic64_read(&event->child_total_time_enabled);
        *running += event->total_time_running +
                        atomic64_read(&event->child_total_time_running);

        /* Then every child currently linked on child_list. */
        list_for_each_entry(kid, &event->child_list, child_list) {
                (void)perf_event_read(kid, false);
                sum += perf_event_count(kid);
                *enabled += kid->total_time_enabled;
                *running += kid->total_time_running;
        }

        mutex_unlock(&event->child_mutex);

        return sum;
}

/*
 * Locked wrapper around __perf_event_read_value(): takes the event's
 * context lock for the duration of the read and returns the summed count.
 */
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        u64 total = __perf_event_read_value(event, enabled, running);

        perf_event_ctx_unlock(event, ctx);

        return total;
}
EXPORT_SYMBOL_GPL(perf_event_read_value);

/*
 * Add the values of @leader and its siblings to the @values array, laid out
 * according to @read_format. Slot 0 (@nr) is skipped here; perf_read_group()
 * fills it in.
 *
 * Counts and times are accumulated with '+=' so repeated calls (one per
 * inherited child group) sum up, while the id and lost-sample slots are
 * plainly assigned on every call.
 *
 * Returns 0 on success, the error from perf_event_read(), or -ECHILD when
 * the group layout of @leader no longer matches that of its parent.
 */
static int __perf_read_group_add(struct perf_event *leader,
                                        u64 read_format, u64 *values)
{
        struct perf_event_context *ctx = leader->ctx;
        struct perf_event *sub, *parent;
        unsigned long flags;
        int n = 1; /* skip @nr */
        int ret;

        ret = perf_event_read(leader, true);
        if (ret)
                return ret;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        /*
         * Verify the grouping between the parent and child (inherited)
         * events is still intact.
         *
         * Specifically:
         *  - leader->ctx->lock pins leader->sibling_list
         *  - parent->child_mutex pins parent->child_list
         *  - parent->ctx->mutex pins parent->sibling_list
         *
         * Because parent->ctx != leader->ctx (and child_list nests inside
         * ctx->mutex), group destruction is not atomic between children, also
         * see perf_event_release_kernel(). Additionally, parent can grow the
         * group.
         *
         * Therefore it is possible to have parent and child groups in a
         * different configuration and summing over such a beast makes no sense
         * what so ever.
         *
         * Reject this.
         */
        parent = leader->parent;
        if (parent &&
            (parent->group_generation != leader->group_generation ||
             parent->nr_siblings != leader->nr_siblings)) {
                ret = -ECHILD;
                goto unlock;
        }

        /*
         * Since we co-schedule groups, {enabled,running} times of siblings
         * will be identical to those of the leader, so we only publish one
         * set.
         */
        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
                values[n++] += leader->total_time_enabled +
                        atomic64_read(&leader->child_total_time_enabled);
        }

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
                values[n++] += leader->total_time_running +
                        atomic64_read(&leader->child_total_time_running);
        }

        /*
         * Write {count,id} tuples for every sibling.
         */
        values[n++] += perf_event_count(leader);
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
        if (read_format & PERF_FORMAT_LOST)
                values[n++] = atomic64_read(&leader->lost_samples);

        for_each_sibling_event(sub, leader) {
                values[n++] += perf_event_count(sub);
                if (read_format & PERF_FORMAT_ID)
                        values[n++] = primary_event_id(sub);
                if (read_format & PERF_FORMAT_LOST)
                        values[n++] = atomic64_read(&sub->lost_samples);
        }

unlock:
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
        return ret;
}

/*
 * Read a whole event group into user memory at @buf.
 *
 * A zeroed scratch buffer of event->read_size bytes is filled with the
 * group's values (leader first, then every inherited child group added on
 * top) under leader->child_mutex, and then copied out to @buf.
 *
 * Must be called with the leader's ctx->mutex held.
 *
 * Returns event->read_size on success, -ENOMEM if the scratch buffer cannot
 * be allocated, -EFAULT if the copy to user space fails, or the error
 * reported by __perf_read_group_add().
 */
static int perf_read_group(struct perf_event *event,
                                   u64 read_format, char __user *buf)
{
        struct perf_event *leader = event->group_leader, *child;
        struct perf_event_context *ctx = leader->ctx;
        u64 *values;
        int ret;

        lockdep_assert_held(&ctx->mutex);

        values = kzalloc(event->read_size, GFP_KERNEL);
        if (!values)
                return -ENOMEM;

        /* Slot 0 holds the number of events in the group. */
        values[0] = 1 + leader->nr_siblings;

        mutex_lock(&leader->child_mutex);

        ret = __perf_read_group_add(leader, read_format, values);
        if (!ret) {
                list_for_each_entry(child, &leader->child_list, child_list) {
                        ret = __perf_read_group_add(child, read_format, values);
                        if (ret)
                                break;
                }
        }

        mutex_unlock(&leader->child_mutex);

        /* Only copy out when every group member was read successfully. */
        if (!ret) {
                ret = event->read_size;
                if (copy_to_user(buf, values, event->read_size))
                        ret = -EFAULT;
        }

        kfree(values);
        return ret;
}

/*
 * Read a single (non-group) event into user memory at @buf.
 *
 * The output is the count followed by the optional fields selected in
 * @read_format, in this order: time enabled, time running, id, lost
 * samples. Returns the number of bytes copied or -EFAULT.
 */
static int perf_read_one(struct perf_event *event,
                                 u64 read_format, char __user *buf)
{
        u64 values[5];          /* count + at most four optional fields */
        u64 enabled, running;
        size_t len;
        int n = 0;

        values[n++] = __perf_event_read_value(event, &enabled, &running);

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                values[n++] = enabled;

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                values[n++] = running;

        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(event);

        if (read_format & PERF_FORMAT_LOST)
                values[n++] = atomic64_read(&event->lost_samples);

        /* Only the slots actually filled in are copied out. */
        len = n * sizeof(u64);
        if (copy_to_user(buf, values, len))
                return -EFAULT;

        return len;
}

/*
 * An event counts as hung up when its state is PERF_EVENT_STATE_EXIT or
 * lower and its child_list is empty.
 */
static bool is_event_hup(struct perf_event *event)
{
        bool hup = false;

        if (event->state <= PERF_EVENT_STATE_EXIT) {
                mutex_lock(&event->child_mutex);
                hup = list_empty(&event->child_list);
                mutex_unlock(&event->child_mutex);
        }

        return hup;
}

/*
 * Read the performance event - simple non blocking version for now
 */
static ssize_t
__perf_read(struct perf_event *event, char __user *buf, size_t count)
{
        u64 read_format = event->attr.read_format;

        /*
         * An event in error state (i.e. it was pinned but could not be
         * scheduled on to the CPU at some point) reads as end-of-file.
         */
        if (event->state == PERF_EVENT_STATE_ERROR)
                return 0;

        /* The caller's buffer must hold a complete record. */
        if (count < event->read_size)
                return -ENOSPC;

        WARN_ON_ONCE(event->ctx->parent_ctx);

        if (read_format & PERF_FORMAT_GROUP)
                return perf_read_group(event, read_format, buf);

        return perf_read_one(event, read_format, buf);
}

/*
 * read() entry point for a perf event file: after the security hook
 * approves, perform the read with the event's context locked.
 */
static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        struct perf_event *event = file->private_data;
        struct perf_event_context *ctx;
        int ret = security_perf_event_read(event);

        if (!ret) {
                ctx = perf_event_ctx_lock(event);
                ret = __perf_read(event, buf, count);
                perf_event_ctx_unlock(event, ctx);
        }

        return ret;
}

/*
 * poll() entry point for a perf event file. Reports EPOLLHUP when the
 * event is hung up or has no ring buffer; otherwise returns (and clears)
 * the poll bits stored in the ring buffer.
 */
static __poll_t perf_poll(struct file *file, poll_table *wait)
{
        struct perf_event *event = file->private_data;
        __poll_t events = EPOLLHUP;
        struct perf_buffer *rb;

        poll_wait(file, &event->waitq, wait);

        if (!is_event_hup(event)) {
                /*
                 * Pin the event->rb by taking event->mmap_mutex; otherwise
                 * perf_event_set_output() can swizzle our rb and make us
                 * miss wakeups.
                 */
                mutex_lock(&event->mmap_mutex);
                rb = event->rb;
                if (rb)
                        events = atomic_xchg(&rb->poll, 0);
                mutex_unlock(&event->mmap_mutex);
        }

        return events;
}

/*
 * Reset @event's count to zero: read the event first (result ignored),
 * zero event->count, then refresh the user-visible mmap page.
 */
static void _perf_event_reset(struct perf_event *event)
{
        (void)perf_event_read(event, false);
        local64_set(&event->count, 0);
        perf_event_update_userpage(event);
}

/* Assume it's not an event with inherit set. */
/*
 * Disable @event and return the value of event->count at that point,
 * optionally zeroing the count when @reset is set. Runs with the event's
 * context locked; warns once if the event has attr.inherit set.
 */
u64 perf_event_pause(struct perf_event *event, bool reset)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        u64 snapshot;

        WARN_ON_ONCE(event->attr.inherit);

        _perf_event_disable(event);
        snapshot = local64_read(&event->count);
        if (reset)
                local64_set(&event->count, 0);

        perf_event_ctx_unlock(event, ctx);

        return snapshot;
}
EXPORT_SYMBOL_GPL(perf_event_pause);

/*
 * Holding the top-level event's child_mutex means that any
 * descendant process that has inherited this event will block
 * in perf_event_exit_event() if it goes to exit, thus satisfying the
 * task existence requirements of perf_event_enable/disable.
 */
static void perf_event_for_each_child(struct perf_event *event,
                                        void (*func)(struct perf_event *))
{
        struct perf_event *child;

        WARN_ON_ONCE(event->ctx->parent_ctx);

        mutex_lock(&event->child_mutex);
        func(event);
        list_for_each_entry(child, &event->child_list, child_list)
                func(child);
        mutex_unlock(&event->child_mutex);
}

static void perf_event_for_each(struct perf_event *event,
                                  void (*func)(struct perf_event *))
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *sibling;

        lockdep_assert_held(&ctx->mutex);

        event = event->group_leader;

        perf_event_for_each_child(event, func);
        for_each_sibling_event(sibling, event)
                perf_event_for_each_child(sibling, func);
}

/*
 * Install a new sample period or frequency on @event; @info points to the
 * u64 value. Passed to event_function_call() by _perf_event_period().
 *
 * If the event is currently active it is stopped around the update and
 * restarted with PERF_EF_RELOAD afterwards, with the pmu disabled in
 * between.
 */
static void __perf_event_period(struct perf_event *event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                void *info)
{
        u64 value = *((u64 *)info);
        bool active;

        /* In frequency mode the value is a rate, otherwise a period. */
        if (event->attr.freq) {
                event->attr.sample_freq = value;
        } else {
                event->attr.sample_period = value;
                event->hw.sample_period = value;
        }

        active = (event->state == PERF_EVENT_STATE_ACTIVE);
        if (active) {
                perf_pmu_disable(event->pmu);
                /*
                 * We could be throttled; unthrottle now to avoid the tick
                 * trying to unthrottle while we already re-started the event.
                 */
                if (event->hw.interrupts == MAX_INTERRUPTS) {
                        event->hw.interrupts = 0;
                        perf_log_throttle(event, 1);
                }
                event->pmu->stop(event, PERF_EF_UPDATE);
        }

        /* Clear the remaining period left over from the old setting. */
        local64_set(&event->hw.period_left, 0);

        if (active) {
                event->pmu->start(event, PERF_EF_RELOAD);
                perf_pmu_enable(event->pmu);
        }
}

/* Ask the event's pmu whether @value is acceptable; non-zero means no. */
static int perf_event_check_period(struct perf_event *event, u64 value)
{
        struct pmu *pmu = event->pmu;

        return pmu->check_period(event, value);
}

/*
 * Validate @value as a new sample period (or frequency, when attr.freq is
 * set) for @event and, if acceptable, apply it via __perf_event_period().
 *
 * Returns 0 on success or -EINVAL when any check rejects the value.
 */
static int _perf_event_period(struct perf_event *event, u64 value)
{
        /* Only sampling events have a period, and it must be non-zero. */
        if (!is_sampling_event(event) || !value)
                return -EINVAL;

        /* A frequency may not exceed the system-wide sample rate limit. */
        if (event->attr.freq && value > sysctl_perf_event_sample_rate)
                return -EINVAL;

        /* Give the pmu a chance to veto the value. */
        if (perf_event_check_period(event, value))
                return -EINVAL;

        /* A plain period must not have the top bit set. */
        if (!event->attr.freq && (value & (1ULL << 63)))
                return -EINVAL;

        event_function_call(event, __perf_event_period, &value);

        return 0;
}

/*
 * Locked wrapper around _perf_event_period(): changes the sample period of
 * @event with its context locked and returns that function's result.
 */
int perf_event_period(struct perf_event *event, u64 value)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        int err = _perf_event_period(event, value);

        perf_event_ctx_unlock(event, ctx);

        return err;
}
EXPORT_SYMBOL_GPL(perf_event_period);

static const struct file_operations perf_fops;

/*
 * Look up @fd and make sure it refers to a perf event file (f_op is
 * perf_fops). On success the struct fd is stored in @p and 0 is returned;
 * the caller then owns the reference and must fdput() it. On failure
 * -EBADF is returned and no reference is held.
 */
static inline int perf_fget_light(int fd, struct fd *p)
{
        struct fd f = fdget(fd);

        if (!f.file)
                return -EBADF;

        if (f.file->f_op == &perf_fops) {
                *p = f;
                return 0;
        }

        /* Some other kind of file: drop it again. */
        fdput(f);
        return -EBADF;
}

static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr);

/*
 * Dispatch a perf ioctl @cmd with argument @arg on @event. perf_ioctl()
 * calls this with the event's context locked.
 *
 * ENABLE, DISABLE and RESET only select a per-event function in the switch;
 * it is applied after the switch, either to the whole group (when
 * PERF_IOC_FLAG_GROUP is set in @arg) or to the event and its children.
 * Every other command is handled inside the switch and returns directly.
 *
 * Returns 0 or a command-specific result on success, a negative errno on
 * failure, and -ENOTTY for an unknown command.
 */
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
        void (*func)(struct perf_event *);
        u32 flags = arg;

        switch (cmd) {
        case PERF_EVENT_IOC_ENABLE:
                func = _perf_event_enable;
                break;
        case PERF_EVENT_IOC_DISABLE:
                func = _perf_event_disable;
                break;
        case PERF_EVENT_IOC_RESET:
                func = _perf_event_reset;
                break;

        case PERF_EVENT_IOC_REFRESH:
                return _perf_event_refresh(event, arg);

        case PERF_EVENT_IOC_PERIOD:
        {
                u64 value;

                /* @arg is a user pointer to the new u64 period/frequency. */
                if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
                        return -EFAULT;

                return _perf_event_period(event, value);
        }
        case PERF_EVENT_IOC_ID:
        {
                u64 id = primary_event_id(event);

                if (copy_to_user((void __user *)arg, &id, sizeof(id)))
                        return -EFAULT;
                return 0;
        }

        case PERF_EVENT_IOC_SET_OUTPUT:
        {
                int ret;
                /* @arg is the fd of the target event, or -1 for none (NULL). */
                if (arg != -1) {
                        struct perf_event *output_event;
                        struct fd output;
                        ret = perf_fget_light(arg, &output);
                        if (ret)
                                return ret;
                        output_event = output.file->private_data;
                        ret = perf_event_set_output(event, output_event);
                        fdput(output);
                } else {
                        ret = perf_event_set_output(event, NULL);
                }
                return ret;
        }

        case PERF_EVENT_IOC_SET_FILTER:
                return perf_event_set_filter(event, (void __user *)arg);

        case PERF_EVENT_IOC_SET_BPF:
        {
                struct bpf_prog *prog;
                int err;

                prog = bpf_prog_get(arg);
                if (IS_ERR(prog))
                        return PTR_ERR(prog);

                /* The prog reference is only dropped here if attaching fails. */
                err = perf_event_set_bpf_prog(event, prog, 0);
                if (err) {
                        bpf_prog_put(prog);
                        return err;
                }

                return 0;
        }

        case PERF_EVENT_IOC_PAUSE_OUTPUT: {
                struct perf_buffer *rb;

                rcu_read_lock();
                rb = rcu_dereference(event->rb);
                /* Needs a ring buffer that actually has data pages. */
                if (!rb || !rb->nr_pages) {
                        rcu_read_unlock();
                        return -EINVAL;
                }
                rb_toggle_paused(rb, !!arg);
                rcu_read_unlock();
                return 0;
        }

        case PERF_EVENT_IOC_QUERY_BPF:
                return perf_event_query_prog_array(event, (void __user *)arg);

        case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
                struct perf_event_attr new_attr;
                int err = perf_copy_attr((struct perf_event_attr __user *)arg,
                                         &new_attr);

                if (err)
                        return err;

                return perf_event_modify_attr(event,  &new_attr);
        }
        default:
                return -ENOTTY;
        }

        /* Only ENABLE/DISABLE/RESET reach this point, with func set. */
        if (flags & PERF_IOC_FLAG_GROUP)
                perf_event_for_each(event, func);
        else
                perf_event_for_each_child(event, func);

        return 0;
}

/*
 * ioctl() entry point for a perf event file: after the security hook
 * approves, dispatch the command with the event's context locked.
 */
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct perf_event *event = file->private_data;
        struct perf_event_context *ctx;
        long ret;

        /* Treat ioctl like writes as it is likely a mutating operation. */
        ret = security_perf_event_write(event);
        if (!ret) {
                ctx = perf_event_ctx_lock(event);
                ret = _perf_ioctl(event, cmd, arg);
                perf_event_ctx_unlock(event, ctx);
        }

        return ret;
}

#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point. For the commands listed below, a command whose
 * encoded size equals sizeof(compat_uptr_t) has its size field rewritten to
 * sizeof(void *) so it matches the native command number; everything is
 * then forwarded to perf_ioctl().
 */
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
{
        switch (_IOC_NR(cmd)) {
        case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
        case _IOC_NR(PERF_EVENT_IOC_ID):
        case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
        case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
                /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
                if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
                        cmd &= ~IOCSIZE_MASK;
                        cmd |= sizeof(void *) << IOCSIZE_SHIFT;
                }
                break;
        }
        return perf_ioctl(file, cmd, arg);
}
#else
/* Without CONFIG_COMPAT there is no compat ioctl handler. */
# define perf_compat_ioctl NULL
#endif

/*
 * Enable every event on the current task's perf_event_list, together with
 * the children of each, while holding current->perf_event_mutex.
 * Always returns 0.
 */
int perf_event_task_enable(void)
{
        struct perf_event *event;

        mutex_lock(&current->perf_event_mutex);
        list_for_each_entry(event, &current->perf_event_list, owner_entry) {
                struct perf_event_context *ctx = perf_event_ctx_lock(event);

                perf_event_for_each_child(event, _perf_event_enable);
                perf_event_ctx_unlock(event, ctx);
        }
        mutex_unlock(&current->perf_event_mutex);

        return 0;
}

/*
 * Disable every event on the current task's perf_event_list, together with
 * the children of each, while holding current->perf_event_mutex.
 * Always returns 0.
 */
int perf_event_task_disable(void)
{
        struct perf_event *event;

        mutex_lock(&current->perf_event_mutex);
        list_for_each_entry(event, &current->perf_event_list, owner_entry) {
                struct perf_event_context *ctx = perf_event_ctx_lock(event);

                perf_event_for_each_child(event, _perf_event_disable);
                perf_event_ctx_unlock(event, ctx);
        }
        mutex_unlock(&current->perf_event_mutex);

        return 0;
}

/*
 * Return the pmu's index for @event, or 0 when the event is stopped
 * (PERF_HES_STOPPED) or not in PERF_EVENT_STATE_ACTIVE.
 */
static int perf_event_index(struct perf_event *event)
{
        if ((event->hw.state & PERF_HES_STOPPED) ||
            event->state != PERF_EVENT_STATE_ACTIVE)
                return 0;

        return event->pmu->event_idx(event);
}

/*
 * Fill in the static fields of the mmap'ed user page of @event's ring
 * buffer, if it has one. The buffer is looked up under rcu_read_lock().
 */
static void perf_event_init_userpage(struct perf_event *event)
{
        struct perf_buffer *rb;

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (rb) {
                struct perf_event_mmap_page *userpg = rb->user_page;

                /* Allow new userspace to detect that bit 0 is deprecated */
                userpg->cap_bit0_is_deprecated = 1;
                userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
                userpg->data_offset = PAGE_SIZE;
                userpg->data_size = perf_data_size(rb);
        }
        rcu_read_unlock();
}

/*
 * Weak default that does nothing. perf_event_update_userpage() calls this
 * hook while it is updating @userpg; being __weak, it can be replaced by
 * another definition at link time.
 */
void __weak arch_perf_update_userpage(
        struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
}

/*
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
/*
 * Refresh the mmap'ed user page of @event's ring buffer (index, offset,
 * enabled/running times and the arch-specific fields). Does nothing when
 * the event has no ring buffer.
 */
void perf_event_update_userpage(struct perf_event *event)
{
        struct perf_event_mmap_page *userpg;
        struct perf_buffer *rb;
        u64 enabled, running, now;

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;

        /*
         * compute total_time_enabled, total_time_running
         * based on snapshot values taken when the event
         * was last scheduled in.
         *
         * we cannot simply call update_context_time()
         * because of locking issue as we can be called in
         * NMI context
         */
        calc_timer_values(event, &now, &enabled, &running);

        userpg = rb->user_page;
        /*
         * Disable preemption to guarantee consistent time stamps are stored to
         * the user page.
         */
        preempt_disable();
        /*
         * userpg->lock is bumped once before and once after the update, with
         * compiler barriers in between, so the field changes whenever the
         * page contents may have changed.
         */
        ++userpg->lock;
        barrier();
        userpg->index = perf_event_index(event);
        userpg->offset = perf_event_count(event);
        /* With a non-zero index, subtract hw.prev_count from the offset. */
        if (userpg->index)
                userpg->offset -= local64_read(&event->hw.prev_count);

        userpg->time_enabled = enabled +
                        atomic64_read(&event->child_total_time_enabled);

        userpg->time_running = running +
                        atomic64_read(&event->child_total_time_running);

        arch_perf_update_userpage(event, userpg, now);

        barrier();
        ++userpg->lock;
        preempt_enable();
unlock:
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(perf_event_update_userpage);

static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
{
        struct perf_event *event = vmf->vma->vm_file->private_data;
        struct perf_buffer *rb;
        vm_fault_t ret = VM_FAULT_SIGBUS;

        if (vmf->flags & FAULT_FLAG_MKWRITE) {
                if (vmf->pgoff == 0)
                        ret = 0;
                return ret;
        }

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;

        if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
                goto unlock;

        vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
        if (!vmf->page)
                goto unlock;

        get_page(vmf->page);
        vmf->page->mapping = vmf->vma->vm_file->f_mapping;
        vmf->page->index   = vmf->pgoff;

        ret = 0;
unlock:
        rcu_read_unlock();

        return ret;
}

/*
 * Switch @event from its current ring buffer (if any) to @rb.
 *
 * @rb may be NULL, in which case the event is only detached. The event is
 * removed from the old buffer's event_list, added to the new buffer's
 * event_list, and event->rb is then published with rcu_assign_pointer().
 * The reference on the old buffer is dropped at the end.
 *
 * Not expected to be called on child events (warns if event->parent is set).
 */
static void ring_buffer_attach(struct perf_event *event,
                               struct perf_buffer *rb)
{
        struct perf_buffer *old_rb = NULL;
        unsigned long flags;

        WARN_ON_ONCE(event->parent);

        if (event->rb) {
                /*
                 * Should be impossible, we set this when removing
                 * event->rb_entry and wait/clear when adding event->rb_entry.
                 */
                WARN_ON_ONCE(event->rcu_pending);

                old_rb = event->rb;
                spin_lock_irqsave(&old_rb->event_lock, flags);
                list_del_rcu(&event->rb_entry);
                spin_unlock_irqrestore(&old_rb->event_lock, flags);

                /*
                 * Record the RCU state at removal time so a later re-add of
                 * rb_entry can wait for it via cond_synchronize_rcu().
                 */
                event->rcu_batches = get_state_synchronize_rcu();
                event->rcu_pending = 1;
        }

        if (rb) {
                /* Wait out the removal recorded above before reusing rb_entry. */
                if (event->rcu_pending) {
                        cond_synchronize_rcu(event->rcu_batches);
                        event->rcu_pending = 0;
                }

                spin_lock_irqsave(&rb->event_lock, flags);
                list_add_rcu(&event->rb_entry, &rb->event_list);
                spin_unlock_irqrestore(&rb->event_lock, flags);
        }

        /*
         * Avoid racing with perf_mmap_close(AUX): stop the event
         * before swizzling the event::rb pointer; if it's getting
         * unmapped, its aux_mmap_count will be 0 and it won't
         * restart. See the comment in __perf_pmu_output_stop().
         *
         * Data will inevitably be lost when set_output is done in
         * mid-air, but then again, whoever does it like this is
         * not in for the data anyway.
         */
        if (has_aux(event))
                perf_event_stop(event, 0);

        /* Publish the new buffer (or NULL) to RCU readers of event->rb. */
        rcu_assign_pointer(event->rb, rb);

        if (old_rb) {
                /* Drop the reference that was held on the previous buffer. */
                ring_buffer_put(old_rb);
                /*
                 * Since we detached before setting the new rb, so that we
                 * could attach the new rb, we could have missed a wakeup.
                 * Provide it now.
                 */
                wake_up_all(&event->waitq);
        }
}

/*
 * Wake every waiter on every event attached to the ring buffer used by
 * @event. For a child event the parent's ring buffer is used.
 */
static void ring_buffer_wakeup(struct perf_event *event)
{
        struct perf_event *owner = event->parent ? event->parent : event;
        struct perf_event *iter;
        struct perf_buffer *rb;

        rcu_read_lock();
        rb = rcu_dereference(owner->rb);
        if (rb) {
                /* Walk all events currently on this buffer's list. */
                list_for_each_entry_rcu(iter, &rb->event_list, rb_entry)
                        wake_up_all(&iter->waitq);
        }
        rcu_read_unlock();
}

/*
 * Take a reference on the ring buffer used by @event (the parent's buffer
 * for a child event).
 *
 * Returns the buffer with its refcount raised, or NULL when there is no
 * buffer or its refcount has already dropped to zero. A non-NULL result
 * must be released with ring_buffer_put().
 */
struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
        struct perf_event *owner = event->parent ? event->parent : event;
        struct perf_buffer *rb;

        rcu_read_lock();
        rb = rcu_dereference(owner->rb);
        /* A buffer whose refcount already hit zero must not be revived. */
        if (rb && !refcount_inc_not_zero(&rb->refcount))
                rb = NULL;
        rcu_read_unlock();

        return rb;
}

/*
 * Drop a reference on @rb. When the last reference goes away the buffer is
 * handed to rb_free_rcu() through call_rcu(); at that point no event is
 * expected to be left on its event_list.
 */
void ring_buffer_put(struct perf_buffer *rb)
{
        if (refcount_dec_and_test(&rb->refcount)) {
                WARN_ON_ONCE(!list_empty(&rb->event_list));

                call_rcu(&rb->rcu_head, rb_free_rcu);
        }
}

/*
 * vm_ops->open callback: account one more mapping of the event's buffer.
 *
 * Raises the event's and the buffer's mmap counts, additionally the buffer's
 * AUX mmap count when the VMA has a non-zero page offset, and then notifies
 * the PMU through its optional event_mapped() callback.
 */
static void perf_mmap_open(struct vm_area_struct *vma)
{
        struct perf_event *event = vma->vm_file->private_data;

        atomic_inc(&event->mmap_count);
        atomic_inc(&event->rb->mmap_count);
        if (vma->vm_pgoff != 0)
                atomic_inc(&event->rb->aux_mmap_count);

        /* The PMU hook is optional. */
        if (!event->pmu->event_mapped)
                return;

        event->pmu->event_mapped(event, vma->vm_mm);
}

/* Defined later in this file; needed by perf_mmap_close() below. */
static void perf_pmu_output_stop(struct perf_event *event);

/*
 * vm_ops->close callback for perf buffer mappings.
 *
 * A buffer can be mmap()ed multiple times; either directly through the same
 * event, or through other events by use of perf_event_set_output().
 *
 * In order to undo the VM accounting done by perf_mmap() we need to destroy
 * the buffer here, where we still have a VM context. This means we need
 * to detach all events redirecting to us.
 */
static void perf_mmap_close(struct vm_area_struct *vma)
{
        struct perf_event *event = vma->vm_file->private_data;
        /* Hold our own reference for the duration; dropped at out_put. */
        struct perf_buffer *rb = ring_buffer_get(event);
        /* Snapshot the accounting data up front, before anything is detached. */
        struct user_struct *mmap_user = rb->mmap_user;
        int mmap_locked = rb->mmap_locked;
        unsigned long size = perf_data_size(rb);
        bool detach_rest = false;

        if (event->pmu->event_unmapped)
                event->pmu->event_unmapped(event, vma->vm_mm);

        /*
         * rb->aux_mmap_count will always drop before rb->mmap_count and
         * event->mmap_count, so it is ok to use event->mmap_mutex to
         * serialize with perf_mmap here.
         */
        if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
            atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
                /*
                 * Stop all AUX events that are writing to this buffer,
                 * so that we can free its AUX pages and corresponding PMU
                 * data. Note that after rb::aux_mmap_count dropped to zero,
                 * they won't start any more (see perf_aux_output_begin()).
                 */
                perf_pmu_output_stop(event);

                /* now it's safe to free the pages */
                atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
                atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);

                /* this has to be the last one */
                rb_free_aux(rb);
                WARN_ON_ONCE(refcount_read(&rb->aux_refcount));

                mutex_unlock(&event->mmap_mutex);
        }

        /* Remember whether this was the last mapping of the buffer itself. */
        if (atomic_dec_and_test(&rb->mmap_count))
                detach_rest = true;

        /* Not the last mapping through this event: nothing more to do. */
        if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
                goto out_put;

        /* Last mapping through this event: detach it from the buffer. */
        ring_buffer_attach(event, NULL);
        mutex_unlock(&event->mmap_mutex);

        /* If there's still other mmap()s of this buffer, we're done. */
        if (!detach_rest)
                goto out_put;

        /*
         * No other mmap()s, detach from all other events that might redirect
         * into the now unreachable buffer. Somewhat complicated by the
         * fact that rb::event_lock otherwise nests inside mmap_mutex.
         */
again:
        rcu_read_lock();
        list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
                if (!atomic_long_inc_not_zero(&event->refcount)) {
                        /*
                         * This event is en-route to free_event() which will
                         * detach it and remove it from the list.
                         */
                        continue;
                }
                /* We hold an event reference now; leave RCU so we can sleep. */
                rcu_read_unlock();

                mutex_lock(&event->mmap_mutex);
                /*
                 * Check we didn't race with perf_event_set_output() which can
                 * swizzle the rb from under us while we were waiting to
                 * acquire mmap_mutex.
                 *
                 * If we find a different rb; ignore this event, a next
                 * iteration will no longer find it on the list. We have to
                 * still restart the iteration to make sure we're not now
                 * iterating the wrong list.
                 */
                if (event->rb == rb)
                        ring_buffer_attach(event, NULL);

                mutex_unlock(&event->mmap_mutex);
                put_event(event);

                /*
                 * Restart the iteration; either we're on the wrong list or
                 * destroyed its integrity by doing a deletion.
                 */
                goto again;
        }
        rcu_read_unlock();

        /*
         * It could be there's still a few 0-ref events on the list; they'll
         * get cleaned up by free_event() -- they'll also still have their
         * ref on the rb and will free it whenever they are done with it.
         *
         * Aside from that, this buffer is 'fully' detached and unmapped,
         * undo the VM accounting.
         */

        /* Data pages plus the user page, minus what was charged to pinned_vm. */
        atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
                        &mmap_user->locked_vm);
        atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
        free_uid(mmap_user);

out_put:
        ring_buffer_put(rb); /* could be last */
}

/* VMA callbacks that perf_mmap() installs on perf buffer mappings. */
static const struct vm_operations_struct perf_mmap_vmops = {
        /* Page faults and write-enable notifications share one handler. */
        .fault = perf_mmap_fault,
        .page_mkwrite = perf_mmap_fault,
        /* Mapping lifetime accounting. */
        .open = perf_mmap_open,
        .close = perf_mmap_close, /* non mergeable */
};

/*
 * mmap() handler for perf event files.
 *
 * A mapping at page offset 0 covers the user page followed by the data
 * pages, whose number must be zero or a power of two. A mapping at a
 * non-zero page offset is an AUX area mapping; it requires the event to
 * already have a ring buffer and must match the aux_offset/aux_size the
 * user page advertises.
 *
 * An existing buffer (or AUX area) of matching geometry is re-used and only
 * its mmap counts are raised. Otherwise a new one is allocated and charged:
 * first against user->locked_vm up to the perf mlock allowance, and the
 * remainder against the mm's pinned_vm, subject to RLIMIT_MEMLOCK.
 *
 * Returns 0 on success or a negative errno. On failure the VMA is left
 * untouched and pmu->event_mapped() is not called: no perf_mmap_close() is
 * going to run for a mapping that was never set up, so nothing would balance
 * that callback with event_unmapped().
 */
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct perf_event *event = file->private_data;
        unsigned long user_locked, user_lock_limit;
        struct user_struct *user = current_user();
        struct perf_buffer *rb = NULL;
        unsigned long locked, lock_limit;
        unsigned long vma_size;
        unsigned long nr_pages;
        long user_extra = 0, extra = 0;
        int ret = 0, flags = 0;

        /*
         * Don't allow mmap() of inherited per-task counters. This would
         * create a performance issue due to all children writing to the
         * same rb.
         */
        if (event->cpu == -1 && event->attr.inherit)
                return -EINVAL;

        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;

        ret = security_perf_event_read(event);
        if (ret)
                return ret;

        vma_size = vma->vm_end - vma->vm_start;

        if (vma->vm_pgoff == 0) {
                /* The first page is the user page; the rest are data pages. */
                nr_pages = (vma_size / PAGE_SIZE) - 1;
        } else {
                /*
                 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
                 * mapped, all subsequent mappings should have the same size
                 * and offset. Must be above the normal perf buffer.
                 */
                u64 aux_offset, aux_size;

                if (!event->rb)
                        return -EINVAL;

                nr_pages = vma_size / PAGE_SIZE;

                mutex_lock(&event->mmap_mutex);
                ret = -EINVAL;

                /* Re-check under the mutex; the buffer may have gone away. */
                rb = event->rb;
                if (!rb)
                        goto aux_unlock;

                aux_offset = READ_ONCE(rb->user_page->aux_offset);
                aux_size = READ_ONCE(rb->user_page->aux_size);

                if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
                        goto aux_unlock;

                if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
                        goto aux_unlock;

                /* already mapped with a different offset */
                if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
                        goto aux_unlock;

                if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
                        goto aux_unlock;

                /* already mapped with a different size */
                if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
                        goto aux_unlock;

                if (!is_power_of_2(nr_pages))
                        goto aux_unlock;

                /*
                 * From here on we hold an rb->mmap_count reference; the
                 * failure path at 'unlock' drops it again.
                 */
                if (!atomic_inc_not_zero(&rb->mmap_count))
                        goto aux_unlock;

                if (rb_has_aux(rb)) {
                        atomic_inc(&rb->aux_mmap_count);
                        ret = 0;
                        goto unlock;
                }

                /*
                 * rb->aux_mmap_count is set only once rb_alloc_aux() has
                 * succeeded, so that a failed permission check or allocation
                 * does not leave a stale count behind.
                 */
                user_extra = nr_pages;

                goto accounting;
        }

        /*
         * If we have rb pages ensure they're a power-of-two number, so we
         * can do bitmasks instead of modulo.
         */
        if (nr_pages != 0 && !is_power_of_2(nr_pages))
                return -EINVAL;

        if (vma_size != PAGE_SIZE * (1 + nr_pages))
                return -EINVAL;

        WARN_ON_ONCE(event->ctx->parent_ctx);
again:
        mutex_lock(&event->mmap_mutex);
        if (event->rb) {
                if (data_page_nr(event->rb) != nr_pages) {
                        ret = -EINVAL;
                        goto unlock;
                }

                if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
                        /*
                         * Raced against perf_mmap_close(); remove the
                         * event and try again.
                         */
                        ring_buffer_attach(event, NULL);
                        mutex_unlock(&event->mmap_mutex);
                        goto again;
                }

                /* Re-using the existing buffer: nothing new to charge. */
                goto unlock;
        }

        user_extra = nr_pages + 1;

accounting:
        user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);

        /*
         * Increase the limit linearly with more CPUs:
         */
        user_lock_limit *= num_online_cpus();

        user_locked = atomic_long_read(&user->locked_vm);

        /*
         * sysctl_perf_event_mlock may have changed, so that
         *     user->locked_vm > user_lock_limit
         */
        if (user_locked > user_lock_limit)
                user_locked = user_lock_limit;
        user_locked += user_extra;

        if (user_locked > user_lock_limit) {
                /*
                 * charge locked_vm until it hits user_lock_limit;
                 * charge the rest from pinned_vm
                 */
                extra = user_locked - user_lock_limit;
                user_extra -= extra;
        }

        lock_limit = rlimit(RLIMIT_MEMLOCK);
        lock_limit >>= PAGE_SHIFT;
        locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;

        if ((locked > lock_limit) && perf_is_paranoid() &&
                !capable(CAP_IPC_LOCK)) {
                ret = -EPERM;
                goto unlock;
        }

        WARN_ON(!rb && event->rb);

        if (vma->vm_flags & VM_WRITE)
                flags |= RING_BUFFER_WRITABLE;

        if (!rb) {
                rb = rb_alloc(nr_pages,
                              event->attr.watermark ? event->attr.wakeup_watermark : 0,
                              event->cpu, flags);

                if (!rb) {
                        ret = -ENOMEM;
                        goto unlock;
                }

                atomic_set(&rb->mmap_count, 1);
                rb->mmap_user = get_current_user();
                rb->mmap_locked = extra;

                ring_buffer_attach(event, rb);

                perf_event_update_time(event);
                perf_event_init_userpage(event);
                perf_event_update_userpage(event);
        } else {
                ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
                                   event->attr.aux_watermark, flags);
                if (!ret) {
                        /* First AUX mapping of this buffer. */
                        atomic_set(&rb->aux_mmap_count, 1);
                        rb->aux_mmap_locked = extra;
                }
        }

unlock:
        if (!ret) {
                atomic_long_add(user_extra, &user->locked_vm);
                atomic64_add(extra, &vma->vm_mm->pinned_vm);

                atomic_inc(&event->mmap_count);
        } else if (rb) {
                /* Undo the rb->mmap_count reference taken on the AUX path. */
                atomic_dec(&rb->mmap_count);
        }
aux_unlock:
        mutex_unlock(&event->mmap_mutex);

        /*
         * Nothing was mapped: do not install vm_ops or notify the PMU, as
         * there will be no perf_mmap_close() to undo either.
         */
        if (ret)
                return ret;

        /*
         * Since pinned accounting is per vm we cannot allow fork() to copy our
         * vma.
         */
        vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
        vma->vm_ops = &perf_mmap_vmops;

        if (event->pmu->event_mapped)
                event->pmu->event_mapped(event, vma->vm_mm);

        return ret;
}

/*
 * fasync file operation: update the event's fasync list through
 * fasync_helper() while holding the inode lock.
 *
 * Returns 0 on success or the negative error from fasync_helper(); positive
 * helper results are reported as 0.
 */
static int perf_fasync(int fd, struct file *filp, int on)
{
        struct perf_event *event = filp->private_data;
        struct inode *inode = file_inode(filp);
        int err;

        inode_lock(inode);
        err = fasync_helper(fd, filp, on, &event->fasync);
        inode_unlock(inode);

        return err < 0 ? err : 0;
}

/* File operations backing a perf event file descriptor. */
static const struct file_operations perf_fops = {
        /* Data access. */
        .read = perf_read,
        .poll = perf_poll,
        .mmap = perf_mmap,
        .llseek = no_llseek,
        /* Control. */
        .unlocked_ioctl = perf_ioctl,
        .compat_ioctl = perf_compat_ioctl,
        .fasync = perf_fasync,
        /* Teardown. */
        .release = perf_release,
};

/*
 * Perf event wakeup
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 *
 * Wakes all waiters on the event's ring buffer and, when a fasync
 * notification is pending (event->pending_kill), delivers SIGIO and clears
 * the pending state.
 */

void perf_event_wakeup(struct perf_event *event)
{
        ring_buffer_wakeup(event);

        if (!event->pending_kill)
                return;

        kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
        event->pending_kill = 0;
}

/*
 * Send the perf signal for @event to the current task, using the event's
 * pending address, original type and attr.sig_data. Nothing is sent when
 * the event's context task is not current or when current is exiting.
 */
static void perf_sigtrap(struct perf_event *event)
{
        /*
         * A mismatch is only expected when the irq_work was delayed and
         * either ctx->task or current changed meanwhile, e.g. on
         * architectures without arch_irq_work_raise().
         */
        if (WARN_ON_ONCE(event->ctx->task != current))
                return;

        /*
         * perf_pending_task() and perf_pending_irq() can both race with the
         * task exiting; skip the signal in that case.
         */
        if (!(current->flags & PF_EXITING))
                send_sig_perf((void __user *)event->pending_addr,
                              event->orig_type, event->attr.sig_data);
}

/*
 * Deliver the pending work in-event-context or follow the context.
 *
 * When the event is running on this CPU, the pending sigtrap and pending
 * disable are handled here. When it is running on another CPU, the irq_work
 * is re-queued on that CPU instead. When it is not running at all, nothing
 * is done.
 */
static void __perf_pending_irq(struct perf_event *event)
{
        /* Single snapshot of the CPU the event is on; negative means none. */
        int cpu = READ_ONCE(event->oncpu);

        /*
         * If the event isn't running; we done. event_sched_out() will have
         * taken care of things.
         */
        if (cpu < 0)
                return;

        /*
         * Yay, we hit home and are in the context of the event.
         */
        if (cpu == smp_processor_id()) {
                if (event->pending_sigtrap) {
                        /* Clear the flag before delivering the signal. */
                        event->pending_sigtrap = 0;
                        perf_sigtrap(event);
                        local_dec(&event->ctx->nr_pending);
                }
                if (event->pending_disable) {
                        event->pending_disable = 0;
                        perf_event_disable_local(event);
                }
                return;
        }

        /*
         *  CPU-A                        CPU-B
         *
         *  perf_event_disable_inatomic()
         *    @pending_disable = CPU-A;
         *    irq_work_queue();
         *
         *  sched-out
         *    @pending_disable = -1;
         *
         *                                sched-in
         *                                perf_event_disable_inatomic()
         *                                  @pending_disable = CPU-B;
         *                                  irq_work_queue(); // FAILS
         *
         *  irq_work_run()
         *    perf_pending_irq()
         *
         * But the event runs on CPU-B and wants disabling there.
         */
        irq_work_queue_on(&event->pending_irq, cpu);
}

/*
 * irq_work callback for event->pending_irq: performs a pending wakeup and
 * then hands the remaining pending work to __perf_pending_irq(), all inside
 * a software-event recursion context when one could be obtained.
 */
static void perf_pending_irq(struct irq_work *entry)
{
        struct perf_event *event;
        int rctx;

        event = container_of(entry, struct perf_event, pending_irq);

        /*
         * Failing to get a recursion context is fine: recursion is then
         * already disabled and we won't recurse 'further'.
         */
        rctx = perf_swevent_get_recursion_context();

        /*
         * The wakeup is not tied to the context of the event; it can be
         * done wherever we happen to run.
         */
        if (event->pending_wakeup) {
                event->pending_wakeup = 0;
                perf_event_wakeup(event);
        }

        __perf_pending_irq(event);

        /* Only release a context that was actually acquired. */
        if (rctx >= 0)
                perf_swevent_put_recursion_context(rctx);
}

/*
 * Callback for event->pending_task: delivers a pending sigtrap for the
 * event with preemption disabled, then drops an event reference via
 * put_event().
 */
static void perf_pending_task(struct callback_head *head)
{
        struct perf_event *event;
        int rctx;

        event = container_of(head, struct perf_event, pending_task);

        /*
         * Failing to get a recursion context is fine: recursion is then
         * already disabled and we won't recurse 'further'.
         */
        preempt_disable_notrace();
        rctx = perf_swevent_get_recursion_context();

        if (event->pending_work) {
                /* Clear the flag before delivering the signal. */
                event->pending_work = 0;
                perf_sigtrap(event);
                local_dec(&event->ctx->nr_pending);
        }

        /* Only release a context that was actually acquired. */
        if (rctx >= 0)
                perf_swevent_put_recursion_context(rctx);
        preempt_enable_notrace();

        put_event(event);
}

#ifdef CONFIG_GUEST_PERF_EVENTS
/*
 * The currently registered guest info callbacks. Set by
 * perf_register_guest_info_callbacks() and cleared back to NULL by
 * perf_unregister_guest_info_callbacks().
 */
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;

/*
 * One static call per guest callback. Registration points them at the
 * handlers supplied in the callbacks structure; unregistration points them
 * back at __static_call_return0.
 */
DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);

/*
 * Install @cbs as the guest info callbacks and point the guest static calls
 * at its handlers. Only one set of callbacks can be registered at a time:
 * if one is already present this warns once and changes nothing.
 */
void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
        if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
                return;

        /* Publish the callbacks pointer, then retarget the static calls. */
        rcu_assign_pointer(perf_guest_cbs, cbs);
        static_call_update(__perf_guest_state, cbs->state);
        static_call_update(__perf_guest_get_ip, cbs->get_ip);

        /* Implementing ->handle_intel_pt_intr is optional. */
        if (cbs->handle_intel_pt_intr)
                static_call_update(__perf_guest_handle_intel_pt_intr,
                                   cbs->handle_intel_pt_intr);
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);

/*
 * Remove @cbs as the guest info callbacks. @cbs must be the set that is
 * currently registered; otherwise this warns once and changes nothing.
 * Returns only after an RCU grace period has elapsed.
 */
void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
        if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
                return;

        rcu_assign_pointer(perf_guest_cbs, NULL);
        /* Point every static call back at the return-0 stub. */
        static_call_update(__perf_guest_state, (void *)&__static_call_return0);
        static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
        static_call_update(__perf_guest_handle_intel_pt_intr,
                           (void *)&__static_call_return0);
        /* Wait for RCU readers that may still be using the old pointer. */
        synchronize_rcu();
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
#endif

/*
 * Write the value of every register selected in @mask to the output handle,
 * one u64 per set bit, from the lowest bit upwards.
 */
static void
perf_output_sample_regs(struct perf_output_handle *handle,
                        struct pt_regs *regs, u64 mask)
{
        DECLARE_BITMAP(selected, 64);
        int idx;

        bitmap_from_u64(selected, mask);
        for_each_set_bit(idx, selected, sizeof(mask) * BITS_PER_BYTE) {
                u64 val = perf_reg_value(regs, idx);

                perf_output_put(handle, val);
        }
}

/*
 * Fill @regs_user with the user-space register state to sample.
 *
 * - @regs already in user mode: use them directly.
 * - kernel thread (PF_KTHREAD): no user registers, ABI is set to NONE.
 * - otherwise: let perf_get_regs_user() provide them.
 */
static void perf_sample_regs_user(struct perf_regs *regs_user,
                                  struct pt_regs *regs)
{
        if (user_mode(regs)) {
                regs_user->abi = perf_reg_abi(current);
                regs_user->regs = regs;
                return;
        }

        if (current->flags & PF_KTHREAD) {
                regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
                regs_user->regs = NULL;
                return;
        }

        perf_get_regs_user(regs_user, regs);
}

/*
 * Fill @regs_intr with the given registers and the register ABI of the
 * current task.
 */
static void perf_sample_regs_intr(struct perf_regs *regs_intr,
                                  struct pt_regs *regs)
{
        regs_intr->abi = perf_reg_abi(current);
        regs_intr->regs = regs;
}


/*
 * Get remaining task size from user stack pointer.
 *
 * It'd be better to take stack vma map and limit this more
 * precisely, but there's no way to get it safely under interrupt,
 * so using TASK_SIZE as limit.
 *
 * Returns the distance from the user stack pointer up to TASK_SIZE, or 0
 * when the stack pointer is NULL or not below TASK_SIZE.
 */
static u64 perf_ustack_task_size(struct pt_regs *regs)
{
        unsigned long sp = perf_user_stack_pointer(regs);

        if (sp && sp < TASK_SIZE)
                return TASK_SIZE - sp;

        return 0;
}

/*
 * Work out how many bytes of user stack can be dumped into a sample.
 *
 * @stack_size:  size requested for the dump
 * @header_size: size of the sample so far
 * @regs:        user registers, or NULL when there are none
 *
 * The requested size is first capped by what is left between the stack
 * pointer and TASK_SIZE. If the sample would then no longer fit into a u16
 * sized record, the dump is shrunk to fit. Returns 0 without @regs.
 */
static u16
perf_sample_ustack_size(u16 stack_size, u16 header_size,
                        struct pt_regs *regs)
{
        u64 avail;
        u16 total;

        /* No regs, no stack pointer, no dump. */
        if (!regs)
                return 0;

        /* Never dump beyond TASK_SIZE. */
        avail      = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
        stack_size = min(stack_size, (u16) avail);

        /* Account for the static and the dynamic size fields. */
        header_size += 2 * sizeof(u64);

        /* The u16 sum wraps exactly when header plus dump do not fit. */
        total = header_size + stack_size;
        if (total >= header_size)
                return stack_size;

        /*
         * The maximum sample size would be exceeded; shrink the dump so
         * that it fits into what is left.
         */
        stack_size = USHRT_MAX - header_size - sizeof(u64);
        stack_size = round_up(stack_size, sizeof(u64));

        return stack_size;
}

/*
 * Emit the user stack dump part of a sample.
 *
 * Without @regs only a zero size is written. Otherwise the layout is:
 *
 *   static size  - @dump_size, the size requested by the user or the best
 *                  one that fits into the sample
 *   data         - @dump_size bytes of user stack; the part that could not
 *                  be copied is skipped in the output
 *   dynamic size - the number of bytes that were actually copied
 */
static void
perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
                          struct pt_regs *regs)
{
        unsigned long sp;
        unsigned int rem;
        u64 dyn_size;

        /* Case of a kernel thread, nothing to dump */
        if (!regs) {
                u64 size = 0;

                perf_output_put(handle, size);
                return;
        }

        /* Static size. */
        perf_output_put(handle, dump_size);

        /* Data: copy from the user stack; rem is what was left uncopied. */
        sp = perf_user_stack_pointer(regs);
        rem = __output_copy_user(handle, (void *) sp, dump_size);
        dyn_size = dump_size - rem;

        perf_output_skip(handle, rem);

        /* Dynamic size. */
        perf_output_put(handle, dyn_size);
}

/*
 * Decide how many bytes of AUX data will be attached to a sample of @event
 * and store the result in data->aux_size.
 *
 * The size is @size capped to the AUX buffer size and aligned up to u64.
 * It is 0 when the event has no aux_event, when that sampler is not active
 * on this CPU, when it has no ring buffer, or when AUX sampling is already
 * in progress on that buffer.
 *
 * Returns data->aux_size.
 */
static unsigned long perf_prepare_sample_aux(struct perf_event *event,
                                          struct perf_sample_data *data,
                                          size_t size)
{
        struct perf_event *sampler = event->aux_event;
        struct perf_buffer *rb;

        data->aux_size = 0;

        if (!sampler)
                return data->aux_size;

        if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
                return data->aux_size;

        if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
                return data->aux_size;

        rb = ring_buffer_get(sampler);
        if (!rb)
                return data->aux_size;

        /*
         * If this is an NMI hit inside sampling code, don't take
         * the sample (aux_size stays 0). See also perf_aux_sample_output().
         */
        if (!READ_ONCE(rb->aux_in_sampling)) {
                size = min_t(size_t, size, perf_aux_size(rb));
                data->aux_size = ALIGN(size, sizeof(u64));
        }
        ring_buffer_put(rb);

        return data->aux_size;
}

/*
 * Call the PMU's snapshot_aux() callback for @event with interrupts
 * disabled and rb->aux_in_sampling set for the duration of the call.
 *
 * Returns whatever pmu->snapshot_aux() returned.
 */
static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
                                 struct perf_event *event,
                                 struct perf_output_handle *handle,
                                 unsigned long size)
{
        unsigned long flags;
        long ret;

        /*
         * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
         * paths. If we start calling them in NMI context, they may race with
         * the IRQ ones, that is, for example, re-starting an event that's just
         * been stopped, which is why we're using a separate callback that
         * doesn't change the event state.
         *
         * IRQs need to be disabled to prevent IPIs from racing with us.
         */
        local_irq_save(flags);
        /*
         * Guard against NMI hits inside the critical section;
         * see also perf_prepare_sample_aux().
         */
        WRITE_ONCE(rb->aux_in_sampling, 1);
        /* Compiler barrier: the flag is set before the callback runs. */
        barrier();

        ret = event->pmu->snapshot_aux(event, handle, size);

        /* Compiler barrier: the callback is done before the flag is cleared. */
        barrier();
        WRITE_ONCE(rb->aux_in_sampling, 0);
        local_irq_restore(flags);

        return ret;
}

/*
 * Write the AUX data of a sample: snapshot data->aux_size bytes from the
 * event's aux_event into the output handle and zero-pad whatever the
 * snapshot fell short of data->aux_size.
 */
static void perf_aux_sample_output(struct perf_event *event,
                                   struct perf_output_handle *handle,
                                   struct perf_sample_data *data)
{
        struct perf_event *sampler = event->aux_event;
        struct perf_buffer *rb;
        unsigned long pad;
        long size;

        if (WARN_ON_ONCE(!sampler || !data->aux_size))
                return;

        rb = ring_buffer_get(sampler);
        if (!rb)
                return;

        size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);

        /*
         * A negative size means that perf_output_copy() failed (returned a
         * non-zero surplus that it didn't copy), which in its current
         * enlightened implementation is not possible. If that changes, we'd
         * like to know.
         */
        if (!WARN_ON_ONCE(size < 0)) {
                /*
                 * The pad comes from ALIGN()ing data->aux_size up to u64 in
                 * perf_prepare_sample_aux(), so should not be more than that.
                 */
                pad = data->aux_size - size;
                if (WARN_ON_ONCE(pad >= sizeof(u64)))
                        pad = 8;

                if (pad) {
                        u64 zero = 0;

                        perf_output_copy(handle, &zero, pad);
                }
        }

        ring_buffer_put(rb);
}

/*
 * A set of common sample data types saved even for non-sample records
 * when event->attr.sample_id_all is set.
 *
 * __perf_event_header__init_id() masks the event's sample_type with this
 * set when it records which fields of the sample data it has filled in.
 */
#define PERF_SAMPLE_ID_ALL  (PERF_SAMPLE_TID | PERF_SAMPLE_TIME |        \
                             PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID |        \
                             PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)

/*
 * Fill in the identification fields of @data for @event.
 *
 * data->type is taken from the event's own sample_type, and the ID fields
 * present in it are marked in data->sample_flags. Which fields are actually
 * filled in (tid, time, id, stream id, cpu) is selected by @sample_type.
 */
static void __perf_event_header__init_id(struct perf_sample_data *data,
                                         struct perf_event *event,
                                         u64 sample_type)
{
        const u64 wants_id = PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER;

        data->type = event->attr.sample_type;
        data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;

        if (sample_type & PERF_SAMPLE_TID) {
                struct task_struct *task = current;

                /* namespace issues */
                data->tid_entry.pid = perf_event_pid(event, task);
                data->tid_entry.tid = perf_event_tid(event, task);
        }

        if (sample_type & PERF_SAMPLE_TIME) {
                data->time = perf_event_clock(event);
        }

        if (sample_type & wants_id) {
                data->id = primary_event_id(event);
        }

        if (sample_type & PERF_SAMPLE_STREAM_ID) {
                data->stream_id = event->id;
        }

        if (sample_type & PERF_SAMPLE_CPU) {
                data->cpu_entry.cpu         = raw_smp_processor_id();
                data->cpu_entry.reserved = 0;
        }
}

/*
 * For events with attr.sample_id_all set, grow header->size by the event's
 * id_header_size and fill the id fields of @data. Otherwise do nothing.
 */
void perf_event_header__init_id(struct perf_event_header *header,
                                struct perf_sample_data *data,
                                struct perf_event *event)
{
        if (!event->attr.sample_id_all)
                return;

        header->size += event->id_header_size;
        __perf_event_header__init_id(data, event, event->attr.sample_type);
}

/*
 * Append the id fields of @data to @handle. Which fields are written is
 * decided by data->type; the order is TID, TIME, ID, STREAM_ID, CPU and
 * finally IDENTIFIER (which re-emits data->id).
 */
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
                                           struct perf_sample_data *data)
{
        const u64 type = data->type;

        if (type & PERF_SAMPLE_TID) {
                perf_output_put(handle, data->tid_entry);
        }

        if (type & PERF_SAMPLE_TIME) {
                perf_output_put(handle, data->time);
        }

        if (type & PERF_SAMPLE_ID) {
                perf_output_put(handle, data->id);
        }

        if (type & PERF_SAMPLE_STREAM_ID) {
                perf_output_put(handle, data->stream_id);
        }

        if (type & PERF_SAMPLE_CPU) {
                perf_output_put(handle, data->cpu_entry);
        }

        if (type & PERF_SAMPLE_IDENTIFIER) {
                perf_output_put(handle, data->id);
        }
}

/*
 * Emit the id sample trailer for @event, but only when the event was opened
 * with attr.sample_id_all.
 */
void perf_event__output_id_sample(struct perf_event *event,
                                  struct perf_output_handle *handle,
                                  struct perf_sample_data *sample)
{
        if (!event->attr.sample_id_all)
                return;

        __perf_event__output_id_sample(handle, sample);
}

/*
 * Emit the read values of a single event: the count, then (as selected by
 * attr.read_format) total time enabled, total time running, the primary
 * event id and the lost-sample count. The time values are @enabled/@running
 * plus the corresponding child totals accumulated on the event.
 */
static void perf_output_read_one(struct perf_output_handle *handle,
                                 struct perf_event *event,
                                 u64 enabled, u64 running)
{
        const u64 read_format = event->attr.read_format;
        u64 buf[5];
        int cnt = 0;

        buf[cnt++] = perf_event_count(event);

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                buf[cnt++] = enabled +
                        atomic64_read(&event->child_total_time_enabled);

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                buf[cnt++] = running +
                        atomic64_read(&event->child_total_time_running);

        if (read_format & PERF_FORMAT_ID)
                buf[cnt++] = primary_event_id(event);

        if (read_format & PERF_FORMAT_LOST)
                buf[cnt++] = atomic64_read(&event->lost_samples);

        __output_copy(handle, buf, cnt * sizeof(u64));
}

/*
 * Emit the read values for a whole group.
 *
 * The first chunk holds the number of events in the group (leader plus
 * siblings), the optional enabled/running times, and then the leader's
 * count with its optional id and lost-sample count. One further chunk of
 * count / optional id / optional lost is written per sibling.
 *
 * Active events other than @event itself are refreshed through pmu->read()
 * before their count is taken.
 */
static void perf_output_read_group(struct perf_output_handle *handle,
                            struct perf_event *event,
                            u64 enabled, u64 running)
{
        struct perf_event *leader = event->group_leader;
        const u64 read_format = event->attr.read_format;
        const bool want_id = read_format & PERF_FORMAT_ID;
        const bool want_lost = read_format & PERF_FORMAT_LOST;
        struct perf_event *sib;
        unsigned long irq_flags;
        u64 buf[6];
        int cnt = 0;

        /*
         * Disabling interrupts avoids all counter scheduling
         * (context switches, timer based rotation and IPIs).
         */
        local_irq_save(irq_flags);

        buf[cnt++] = 1 + leader->nr_siblings;

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                buf[cnt++] = enabled;

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                buf[cnt++] = running;

        if (leader != event && leader->state == PERF_EVENT_STATE_ACTIVE)
                leader->pmu->read(leader);

        buf[cnt++] = perf_event_count(leader);
        if (want_id)
                buf[cnt++] = primary_event_id(leader);
        if (want_lost)
                buf[cnt++] = atomic64_read(&leader->lost_samples);

        __output_copy(handle, buf, cnt * sizeof(u64));

        for_each_sibling_event(sib, leader) {
                /* The scratch buffer is reused for every sibling. */
                cnt = 0;

                if (sib != event && sib->state == PERF_EVENT_STATE_ACTIVE)
                        sib->pmu->read(sib);

                buf[cnt++] = perf_event_count(sib);
                if (want_id)
                        buf[cnt++] = primary_event_id(sib);
                if (want_lost)
                        buf[cnt++] = atomic64_read(&sib->lost_samples);

                __output_copy(handle, buf, cnt * sizeof(u64));
        }

        local_irq_restore(irq_flags);
}

/*
 * Both time-related read_format bits; perf_output_read() only computes the
 * enabled/running timer values when at least one of them is requested.
 */
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
                                 PERF_FORMAT_TOTAL_TIME_RUNNING)

/*
 * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
 *
 * The problem is that it's both hard and excessively expensive to iterate the
 * child list, not to mention that it's impossible to IPI the children running
 * on another CPU, from interrupt/NMI context.
 */
/*
 * Emit the PERF_SAMPLE_READ payload for @event, using the group layout when
 * attr.read_format has PERF_FORMAT_GROUP and the single-event layout
 * otherwise.
 */
static void perf_output_read(struct perf_output_handle *handle,
                             struct perf_event *event)
{
        const u64 read_format = event->attr.read_format;
        u64 enabled = 0, running = 0, now;

        /*
         * Compute total_time_enabled and total_time_running based on
         * snapshot values taken when the event was last scheduled in.
         *
         * We cannot simply call update_context_time() because of
         * locking issues, as we are called in NMI context.
         */
        if (read_format & PERF_FORMAT_TOTAL_TIMES)
                calc_timer_values(event, &now, &enabled, &running);

        if (!(read_format & PERF_FORMAT_GROUP)) {
                perf_output_read_one(handle, event, enabled, running);
                return;
        }

        perf_output_read_group(handle, event, enabled, running);
}

/*
 * Write one sample record for @event into @handle.
 *
 * *header is written first, followed by every field selected by data->type,
 * in exactly the order of the checks below. After the payload, if
 * attr.watermark is clear and attr.wakeup_events is non-zero, the buffer's
 * event counter is bumped; once it reaches wakeup_events the counter is
 * reduced by that amount and rb->wakeup is incremented.
 *
 * The values themselves are taken from @data as-is; this function does not
 * compute them.
 */
void perf_output_sample(struct perf_output_handle *handle,
                        struct perf_event_header *header,
                        struct perf_sample_data *data,
                        struct perf_event *event)
{
        u64 sample_type = data->type;

        perf_output_put(handle, *header);

        /* IDENTIFIER emits data->id up front; PERF_SAMPLE_ID emits it again below. */
        if (sample_type & PERF_SAMPLE_IDENTIFIER)
                perf_output_put(handle, data->id);

        if (sample_type & PERF_SAMPLE_IP)
                perf_output_put(handle, data->ip);

        if (sample_type & PERF_SAMPLE_TID)
                perf_output_put(handle, data->tid_entry);

        if (sample_type & PERF_SAMPLE_TIME)
                perf_output_put(handle, data->time);

        if (sample_type & PERF_SAMPLE_ADDR)
                perf_output_put(handle, data->addr);

        if (sample_type & PERF_SAMPLE_ID)
                perf_output_put(handle, data->id);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                perf_output_put(handle, data->stream_id);

        if (sample_type & PERF_SAMPLE_CPU)
                perf_output_put(handle, data->cpu_entry);

        if (sample_type & PERF_SAMPLE_PERIOD)
                perf_output_put(handle, data->period);

        if (sample_type & PERF_SAMPLE_READ)
                perf_output_read(handle, event);

        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                /* (1 + nr) u64 words, copied starting at the callchain struct itself. */
                int size = 1;

                size += data->callchain->nr;
                size *= sizeof(u64);
                __output_copy(handle, data->callchain, size);
        }

        if (sample_type & PERF_SAMPLE_RAW) {
                struct perf_raw_record *raw = data->raw;

                if (raw) {
                        struct perf_raw_frag *frag = &raw->frag;

                        /* Total size first, then each fragment in list order. */
                        perf_output_put(handle, raw->size);
                        do {
                                /* A fragment may supply its own copy routine. */
                                if (frag->copy) {
                                        __output_custom(handle, frag->copy,
                                                        frag->data, frag->size);
                                } else {
                                        __output_copy(handle, frag->data,
                                                      frag->size);
                                }
                                if (perf_raw_frag_last(frag))
                                        break;
                                frag = frag->next;
                        } while (1);
                        /* Only the last fragment's pad is skipped over. */
                        if (frag->pad)
                                __output_skip(handle, NULL, frag->pad);
                } else {
                        /* No raw data: emit a u32 size of sizeof(u32) and one zero u32. */
                        struct {
                                u32        size;
                                u32        data;
                        } raw = {
                                .size = sizeof(u32),
                                .data = 0,
                        };
                        perf_output_put(handle, raw);
                }
        }

        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
                if (data->br_stack) {
                        size_t size;

                        size = data->br_stack->nr
                             * sizeof(struct perf_branch_entry);

                        /* nr, optional hw_idx, then the nr entries. */
                        perf_output_put(handle, data->br_stack->nr);
                        if (branch_sample_hw_index(event))
                                perf_output_put(handle, data->br_stack->hw_idx);
                        perf_output_copy(handle, data->br_stack->entries, size);
                        /*
                         * Add the extension space which is appended
                         * right after the struct perf_branch_stack.
                         */
                        if (data->br_stack_cntr) {
                                size = data->br_stack->nr * sizeof(u64);
                                perf_output_copy(handle, data->br_stack_cntr, size);
                        }
                } else {
                        /*
                         * we always store at least the value of nr
                         */
                        u64 nr = 0;
                        perf_output_put(handle, nr);
                }
        }

        if (sample_type & PERF_SAMPLE_REGS_USER) {
                u64 abi = data->regs_user.abi;

                /*
                 * If there are no regs to dump, notice it through
                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
                 */
                perf_output_put(handle, abi);

                if (abi) {
                        u64 mask = event->attr.sample_regs_user;
                        perf_output_sample_regs(handle,
                                                data->regs_user.regs,
                                                mask);
                }
        }

        if (sample_type & PERF_SAMPLE_STACK_USER) {
                perf_output_sample_ustack(handle,
                                          data->stack_user_size,
                                          data->regs_user.regs);
        }

        if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
                perf_output_put(handle, data->weight.full);

        if (sample_type & PERF_SAMPLE_DATA_SRC)
                perf_output_put(handle, data->data_src.val);

        if (sample_type & PERF_SAMPLE_TRANSACTION)
                perf_output_put(handle, data->txn);

        if (sample_type & PERF_SAMPLE_REGS_INTR) {
                u64 abi = data->regs_intr.abi;
                /*
                 * If there are no regs to dump, notice it through
                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
                 */
                perf_output_put(handle, abi);

                if (abi) {
                        u64 mask = event->attr.sample_regs_intr;

                        perf_output_sample_regs(handle,
                                                data->regs_intr.regs,
                                                mask);
                }
        }

        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                perf_output_put(handle, data->phys_addr);

        if (sample_type & PERF_SAMPLE_CGROUP)
                perf_output_put(handle, data->cgroup);

        if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
                perf_output_put(handle, data->data_page_size);

        if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
                perf_output_put(handle, data->code_page_size);

        if (sample_type & PERF_SAMPLE_AUX) {
                /* The size word is always written; AUX data follows only if non-zero. */
                perf_output_put(handle, data->aux_size);

                if (data->aux_size)
                        perf_aux_sample_output(event, handle, data);
        }

        /* Event-count based wakeups; skipped entirely when attr.watermark is set. */
        if (!event->attr.watermark) {
                int wakeup_events = event->attr.wakeup_events;

                if (wakeup_events) {
                        struct perf_buffer *rb = handle->rb;
                        int events = local_inc_return(&rb->events);

                        if (events >= wakeup_events) {
                                local_sub(wakeup_events, &rb->events);
                                local_inc(&rb->wakeup);
                        }
                }
        }
}

/*
 * Translate @virt to a physical address, or return 0 when no translation
 * is available (NULL address, invalid or vmalloc kernel address, no mm, or
 * the fast user page lookup failing).
 */
static u64 perf_virt_to_phys(u64 virt)
{
        struct page *p;
        u64 phys_addr;

        if (!virt)
                return 0;

        if (virt >= TASK_SIZE) {
                if (!virt_addr_valid((void *)(uintptr_t)virt))
                        return 0;

                /* If it's vmalloc()d memory, report 0 */
                if (virt >= VMALLOC_START && virt < VMALLOC_END)
                        return 0;

                return (u64)virt_to_phys((void *)(uintptr_t)virt);
        }

        /*
         * Walking the pages tables for user address.
         * Interrupts are disabled, so it prevents any tear down
         * of the page tables.
         * Try IRQ-safe get_user_page_fast_only first.
         * If failed, report 0.
         */
        if (current->mm == NULL)
                return 0;

        phys_addr = 0;

        pagefault_disable();
        if (get_user_page_fast_only(virt, 0, &p)) {
                phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
                put_page(p);
        }
        pagefault_enable();

        return phys_addr;
}

/*
 * Return the pagetable size of a given virtual address.
 *
 * Walks @mm's page tables for @addr from the pgd level downwards and returns
 * the leaf size reported at the first level that maps a leaf. Returns 0 if
 * an entry on the way is none / not present, or if CONFIG_HAVE_GUP_FAST is
 * not enabled (in which case no walk is attempted at all).
 *
 * The walk uses lockless accessors only; the caller visible in this file,
 * perf_get_page_size(), runs it with IRQs disabled.
 */
static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
{
        u64 size = 0;

#ifdef CONFIG_HAVE_GUP_FAST
        pgd_t *pgdp, pgd;
        p4d_t *p4dp, p4d;
        pud_t *pudp, pud;
        pmd_t *pmdp, pmd;
        pte_t *ptep, pte;

        /* Each level: snapshot the entry once, then test only the snapshot. */
        pgdp = pgd_offset(mm, addr);
        pgd = READ_ONCE(*pgdp);
        if (pgd_none(pgd))
                return 0;

        if (pgd_leaf(pgd))
                return pgd_leaf_size(pgd);

        p4dp = p4d_offset_lockless(pgdp, pgd, addr);
        p4d = READ_ONCE(*p4dp);
        if (!p4d_present(p4d))
                return 0;

        if (p4d_leaf(p4d))
                return p4d_leaf_size(p4d);

        pudp = pud_offset_lockless(p4dp, p4d, addr);
        pud = READ_ONCE(*pudp);
        if (!pud_present(pud))
                return 0;

        if (pud_leaf(pud))
                return pud_leaf_size(pud);

        pmdp = pmd_offset_lockless(pudp, pud, addr);
again:
        pmd = pmdp_get_lockless(pmdp);
        if (!pmd_present(pmd))
                return 0;

        if (pmd_leaf(pmd))
                return pmd_leaf_size(pmd);

        /*
         * Map the pte through the local pmd snapshot. If that fails, re-read
         * the pmd and retry - presumably the pmd changed under us; confirm
         * against pte_offset_map()'s contract.
         */
        ptep = pte_offset_map(&pmd, addr);
        if (!ptep)
                goto again;

        pte = ptep_get_lockless(ptep);
        if (pte_present(pte))
                size = pte_leaf_size(pte);
        pte_unmap(ptep);
#endif /* CONFIG_HAVE_GUP_FAST */

        return size;
}

/*
 * Return the size of the page mapping @addr in the current task's address
 * space (init_mm for tasks without an mm), or 0 for a NULL address.
 */
static u64 perf_get_page_size(unsigned long addr)
{
        struct mm_struct *mm;
        unsigned long irq_flags;
        u64 page_size;

        if (!addr)
                return 0;

        /*
         * Software page-table walkers must disable IRQs,
         * which prevents any tear down of the page tables.
         */
        local_irq_save(irq_flags);

        /*
         * For kernel threads and the like, use init_mm so that
         * we can find kernel memory.
         */
        mm = current->mm ?: &init_mm;

        page_size = perf_get_pgtable_size(mm, addr);

        local_irq_restore(irq_flags);

        return page_size;
}

/* Zero-entry callchain handed out by perf_callchain() when it has nothing else to return. */
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };

/*
 * Collect a callchain for @event at @regs. Never returns NULL: when both
 * kernel and user chains are excluded, or when get_perf_callchain() yields
 * nothing, the shared empty callchain is returned instead.
 */
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
        const u32 max_stack = event->attr.sample_max_stack;
        /* Disallow cross-task user callchains. */
        bool crosstask = event->ctx->task && event->ctx->task != current;
        bool user   = !event->attr.exclude_callchain_user;
        bool kernel = !event->attr.exclude_callchain_kernel;
        struct perf_callchain_entry *entry;

        if (kernel || user) {
                entry = get_perf_callchain(regs, 0, kernel, user,
                                           max_stack, crosstask, true);
                if (entry)
                        return entry;
        }

        return &__empty_callchain;
}

/* Yield @d when any bit of @s is set in @flags, and 0 otherwise. */
static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
{
        return (flags & s) ? d : 0;
}

/*
 * Fill in @data for a sample of @event taken at @regs.
 *
 * Every field requested by attr.sample_type (plus the fields other requested
 * fields depend on) that is not yet marked in data->sample_flags is computed
 * or given a default here, and its flag is then set. data->dyn_size is grown
 * for the variable-sized parts (raw, branch stack, regs, user stack, aux).
 *
 * If nothing is left to do, only data->type is set before returning.
 */
void perf_prepare_sample(struct perf_sample_data *data,
                         struct perf_event *event,
                         struct pt_regs *regs)
{
        u64 sample_type = event->attr.sample_type;
        u64 filtered_sample_type;

        /*
         * Add the sample flags that are dependent to others.  And clear the
         * sample flags that have already been done by the PMU driver.
         */
        filtered_sample_type = sample_type;
        /* CODE_PAGE_SIZE is derived from data->ip, so it pulls in IP. */
        filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
                                           PERF_SAMPLE_IP);
        /* DATA_PAGE_SIZE and PHYS_ADDR are derived from data->addr. */
        filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
                                           PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
        /* STACK_USER uses data->regs_user.regs. */
        filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
                                           PERF_SAMPLE_REGS_USER);
        filtered_sample_type &= ~data->sample_flags;

        if (filtered_sample_type == 0) {
                /* Make sure it has the correct data->type for output */
                data->type = event->attr.sample_type;
                return;
        }

        __perf_event_header__init_id(data, event, filtered_sample_type);

        if (filtered_sample_type & PERF_SAMPLE_IP) {
                data->ip = perf_instruction_pointer(regs);
                data->sample_flags |= PERF_SAMPLE_IP;
        }

        if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
                perf_sample_save_callchain(data, event, regs);

        /* Nothing supplied raw data: reserve one u64 for the empty record. */
        if (filtered_sample_type & PERF_SAMPLE_RAW) {
                data->raw = NULL;
                data->dyn_size += sizeof(u64);
                data->sample_flags |= PERF_SAMPLE_RAW;
        }

        /* Nothing supplied a branch stack: reserve one u64 for the zero nr. */
        if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
                data->br_stack = NULL;
                data->dyn_size += sizeof(u64);
                data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
        }

        if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
                perf_sample_regs_user(&data->regs_user, regs);

        /*
         * It cannot use the filtered_sample_type here as REGS_USER can be set
         * by STACK_USER (using __cond_set() above) and we don't want to update
         * the dyn_size if it's not requested by users.
         */
        if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
                /* regs dump ABI info */
                int size = sizeof(u64);

                if (data->regs_user.regs) {
                        /* One u64 per bit set in the requested register mask. */
                        u64 mask = event->attr.sample_regs_user;
                        size += hweight64(mask) * sizeof(u64);
                }

                data->dyn_size += size;
                data->sample_flags |= PERF_SAMPLE_REGS_USER;
        }

        if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
                /*
                 * Either we need PERF_SAMPLE_STACK_USER bit to be always
                 * processed as the last one or have additional check added
                 * in case new sample type is added, because we could eat
                 * up the rest of the sample size.
                 */
                u16 stack_size = event->attr.sample_stack_user;
                u16 header_size = perf_sample_data_size(data, event);
                u16 size = sizeof(u64);

                stack_size = perf_sample_ustack_size(stack_size, header_size,
                                                     data->regs_user.regs);

                /*
                 * If there is something to dump, add space for the dump
                 * itself and for the field that tells the dynamic size,
                 * which is how many have been actually dumped.
                 */
                if (stack_size)
                        size += sizeof(u64) + stack_size;

                data->stack_user_size = stack_size;
                data->dyn_size += size;
                data->sample_flags |= PERF_SAMPLE_STACK_USER;
        }

        /* Defaults for fields that were not filled in before this call. */
        if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
                data->weight.full = 0;
                data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
        }

        if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
                data->data_src.val = PERF_MEM_NA;
                data->sample_flags |= PERF_SAMPLE_DATA_SRC;
        }

        if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
                data->txn = 0;
                data->sample_flags |= PERF_SAMPLE_TRANSACTION;
        }

        if (filtered_sample_type & PERF_SAMPLE_ADDR) {
                data->addr = 0;
                data->sample_flags |= PERF_SAMPLE_ADDR;
        }

        if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
                /* regs dump ABI info */
                int size = sizeof(u64);

                perf_sample_regs_intr(&data->regs_intr, regs);

                if (data->regs_intr.regs) {
                        u64 mask = event->attr.sample_regs_intr;

                        size += hweight64(mask) * sizeof(u64);
                }

                data->dyn_size += size;
                data->sample_flags |= PERF_SAMPLE_REGS_INTR;
        }

        /* Must run after the ADDR handling above: it reads data->addr. */
        if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
                data->phys_addr = perf_virt_to_phys(data->addr);
                data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
        }

#ifdef CONFIG_CGROUP_PERF
        if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
                struct cgroup *cgrp;

                /* protected by RCU */
                cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
                data->cgroup = cgroup_id(cgrp);
                data->sample_flags |= PERF_SAMPLE_CGROUP;
        }
#endif

        /*
         * PERF_SAMPLE_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user
         * doesn't request PERF_SAMPLE_ADDR, the kernel implicitly retrieves
         * data->addr, but the value will not be dumped to userspace.
         */
        if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
                data->data_page_size = perf_get_page_size(data->addr);
                data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
        }

        /* Likewise derived from data->ip, which the IP handling above set. */
        if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
                data->code_page_size = perf_get_page_size(data->ip);
                data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
        }

        if (filtered_sample_type & PERF_SAMPLE_AUX) {
                u64 size;
                u16 header_size = perf_sample_data_size(data, event);

                header_size += sizeof(u64); /* size */

                /*
                 * Given the 16bit nature of header::size, an AUX sample can
                 * easily overflow it, what with all the preceding sample bits.
                 * Make sure this doesn't happen by using up to U16_MAX bytes
                 * per sample in total (rounded down to 8 byte boundary).
                 */
                size = min_t(size_t, U16_MAX - header_size,
                             event->attr.aux_sample_size);
                size = rounddown(size, 8);
                size = perf_prepare_sample_aux(event, data, size);

                WARN_ON_ONCE(size + header_size > U16_MAX);
                data->dyn_size += size + sizeof(u64); /* size above */
                data->sample_flags |= PERF_SAMPLE_AUX;
        }
}

/*
 * Initialise @header for a PERF_RECORD_SAMPLE record: the size comes from
 * perf_sample_data_size() and the misc flags from perf_misc_flags(@regs).
 * Warns once if the resulting size is not a multiple of 8.
 */
void perf_prepare_header(struct perf_event_header *header,
                         struct perf_sample_data *data,
                         struct perf_event *event,
                         struct pt_regs *regs)
{
        header->misc = perf_misc_flags(regs);
        header->size = perf_sample_data_size(data, event);
        header->type = PERF_RECORD_SAMPLE;

        /*
         * Anyone adding further sample types will probably have to deal
         * with header::size overflowing, e.g. by repurposing the lowest 3
         * bits of size, which should always be zero at the moment. That in
         * turn raises the more important question of whether 512k sized
         * samples are really needed and why, so bring good arguments for
         * whatever is done here next.
         */
        WARN_ON_ONCE(header->size & 7);
}

/*
 * Common body of the perf_event_output*() variants: prepare the sample and
 * its header, reserve space through @output_begin, and on success write the
 * sample and close the output. Returns whatever @output_begin returned.
 */
static __always_inline int
__perf_event_output(struct perf_event *event,
                    struct perf_sample_data *data,
                    struct pt_regs *regs,
                    int (*output_begin)(struct perf_output_handle *,
                                        struct perf_sample_data *,
                                        struct perf_event *,
                                        unsigned int))
{
        struct perf_event_header header;
        struct perf_output_handle handle;
        int err;

        /* protect the callchain buffers */
        rcu_read_lock();

        perf_prepare_sample(data, event, regs);
        perf_prepare_header(&header, data, event, regs);

        err = output_begin(&handle, data, event, header.size);
        if (!err) {
                perf_output_sample(&handle, &header, data, event);
                perf_output_end(&handle);
        }

        rcu_read_unlock();
        return err;
}

/* Output a sample using perf_output_begin_forward(); the result is discarded. */
void
perf_event_output_forward(struct perf_event *event,
                         struct perf_sample_data *data,
                         struct pt_regs *regs)
{
        (void)__perf_event_output(event, data, regs,
                                  perf_output_begin_forward);
}

/* Output a sample using perf_output_begin_backward(); the result is discarded. */
void
perf_event_output_backward(struct perf_event *event,
                           struct perf_sample_data *data,
                           struct pt_regs *regs)
{
        (void)__perf_event_output(event, data, regs,
                                  perf_output_begin_backward);
}

/*
 * Output a sample using perf_output_begin() and hand its status back to the
 * caller (non-zero means the sample was not written).
 */
int
perf_event_output(struct perf_event *event,
                  struct perf_sample_data *data,
                  struct pt_regs *regs)
{
        int err = __perf_event_output(event, data, regs, perf_output_begin);

        return err;
}

/*
 * read event_id
 */

/*
 * Fixed leading part of the record written by perf_event_read_event(); the
 * read values and the optional id sample are appended after it.
 */
struct perf_read_event {
        struct perf_event_header        header;

        /* Filled from perf_event_pid() / perf_event_tid() for the task. */
        u32                                pid;
        u32                                tid;
};

/*
 * Write a PERF_RECORD_READ record for @event on behalf of @task: the fixed
 * perf_read_event part, the read values, and the id sample if enabled.
 * Silently gives up when no output space can be reserved.
 */
static void
perf_event_read_event(struct perf_event *event,
                        struct task_struct *task)
{
        struct perf_sample_data sample;
        struct perf_output_handle handle;
        struct perf_read_event read_event = {
                .header = {
                        .type = PERF_RECORD_READ,
                        .misc = 0,
                        .size = sizeof(read_event) + event->read_size,
                },
                .pid = perf_event_pid(event, task),
                .tid = perf_event_tid(event, task),
        };

        /* May grow header.size to make room for the id sample. */
        perf_event_header__init_id(&read_event.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, read_event.header.size))
                return;

        perf_output_put(&handle, read_event);
        perf_output_read(&handle, event);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/* Per-event callback used by the perf_iterate_*() helpers; @data is passed through untouched. */
typedef void (perf_iterate_f)(struct perf_event *event, void *data);

/*
 * Call @output(event, @data) for the events on @ctx's event_list.
 *
 * With @all set every event is visited. Otherwise events whose state is
 * below PERF_EVENT_STATE_INACTIVE, or which fail event_filter_match(), are
 * skipped.
 */
static void
perf_iterate_ctx(struct perf_event_context *ctx,
                   perf_iterate_f output,
                   void *data, bool all)
{
        struct perf_event *event;

        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (all ||
                    (event->state >= PERF_EVENT_STATE_INACTIVE &&
                     event_filter_match(event)))
                        output(event, data);
        }
}

/*
 * Call @output(event, @data) for each eligible event on this CPU's
 * pmu_sb_events list.
 */
static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
{
        struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
        struct perf_event *event;

        list_for_each_entry_rcu(event, &pel->list, sb_list) {
                /*
                 * Skip events that are not fully formed yet; ensure that
                 * if we observe event->ctx, both event and ctx will be
                 * complete enough. See perf_install_in_context().
                 */
                if (!smp_load_acquire(&event->ctx))
                        continue;

                if (event->state >= PERF_EVENT_STATE_INACTIVE &&
                    event_filter_match(event))
                        output(event, data);
        }
}

/*
 * Iterate all events that need to receive side-band events.
 *
 * For new callers; ensure that account_pmu_sb_event() includes
 * your event, otherwise it might not get delivered.
 */
static void
perf_iterate_sb(perf_iterate_f output, void *data,
               struct perf_event_context *task_ctx)
{
        struct perf_event_context *ctx;

        rcu_read_lock();
        preempt_disable();

        if (task_ctx) {
                /*
                 * If we have task_ctx != NULL we only notify the task
                 * context itself. The task_ctx is set only for EXIT events
                 * before releasing task context.
                 */
                perf_iterate_ctx(task_ctx, output, data, false);
        } else {
                /* This CPU's side-band list first, then current's context. */
                perf_iterate_sb_cpu(output, data);

                ctx = rcu_dereference(current->perf_event_ctxp);
                if (ctx)
                        perf_iterate_ctx(ctx, output, data, false);
        }

        preempt_enable();
        rcu_read_unlock();
}

/*
 * Clear all file-based filters at exec, they'll have to be
 * re-instated when/if these objects are mmapped again.
 */
static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct perf_addr_filter *filter;
        unsigned int restart = 0, count = 0;
        unsigned long flags;

        if (!has_addr_filter(event))
                return;

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                if (filter->path.dentry) {
                        event->addr_filter_ranges[count].start = 0;
                        event->addr_filter_ranges[count].size = 0;
                        restart++;
                }

                count++;
        }

        if (restart)
                event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        if (restart)
                perf_event_stop(event, 1);
}

/*
 * exec-time handling for the current task's perf context: run the
 * enable-on-exec and remove-on-exec passes, then reset file-based address
 * filters on every event of the context. Does nothing if no context could
 * be pinned.
 */
void perf_event_exec(void)
{
        struct perf_event_context *ctx = perf_pin_task_context(current);

        if (ctx) {
                perf_event_enable_on_exec(ctx);
                perf_event_remove_on_exec(ctx);
                perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);

                perf_unpin_context(ctx);
                put_ctx(ctx);
        }
}

/* Argument block passed to __perf_event_output_stop() via perf_iterate_ctx(). */
struct remote_output {
        /* Ring buffer whose users should be stopped. */
        struct perf_buffer        *rb;
        /* Result of the most recent __perf_event_stop() call made for @rb. */
        int                        err;
};

static void __perf_event_output_stop(struct perf_event *event, void *data)
{
        struct perf_event *parent = event->parent;
        struct remote_output *ro = data;
        struct perf_buffer *rb = ro->rb;
        struct stop_event_data sd = {
                .event        = event,
        };

        if (!has_aux(event))
                return;

        if (!parent)
                parent = event;

        /*
         * In case of inheritance, it will be the parent that links to the
         * ring-buffer, but it will be the child that's actually using it.
         *
         * We are using event::rb to determine if the event should be stopped,
         * however this may race with ring_buffer_attach() (through set_output),
         * which will make us skip the event that actually needs to be stopped.
         * So ring_buffer_attach() has to stop an aux event before re-assigning
         * its rb pointer.
         */
        if (rcu_dereference(parent->rb) == rb)
                ro->err = __perf_event_stop(&sd);
}

static int __perf_pmu_output_stop(void *info)
{
        struct perf_event *event = info;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct remote_output ro = {
                .rb        = event->rb,
        };

        rcu_read_lock();
        perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
        if (cpuctx->task_ctx)
                perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
                                   &ro, false);
        rcu_read_unlock();

        return ro.err;
}

/*
 * Stop all users of @event's ring buffer.
 *
 * Walks the buffer's event_list under rcu_read_lock() and, for each listed
 * event that resolves to a CPU, runs __perf_pmu_output_stop() on that CPU via
 * cpu_function_call().  If a call returns -EAGAIN the RCU section is dropped
 * and the whole walk starts over from the head of the list.
 */
static void perf_pmu_output_stop(struct perf_event *event)
{
        struct perf_event *iter;
        int err, cpu;

restart:
        rcu_read_lock();
        list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
                /*
                 * For per-CPU events, we need to make sure that neither they
                 * nor their children are running; for cpu==-1 events it's
                 * sufficient to stop the event itself if it's active, since
                 * it can't have children.
                 */
                cpu = iter->cpu;
                if (cpu == -1)
                        cpu = READ_ONCE(iter->oncpu);

                /* Still -1: the event is not bound to or running on any CPU; nothing to stop. */
                if (cpu == -1)
                        continue;

                /* Note: the original @event (not @iter) is what gets passed to the remote CPU. */
                err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
                if (err == -EAGAIN) {
                        /* Leave the read-side section before retrying the walk from scratch. */
                        rcu_read_unlock();
                        goto restart;
                }
        }
        rcu_read_unlock();
}

/*
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
 */

/*
 * State shared by perf_event_task() and perf_event_task_output() while a
 * fork/exit record is delivered to every interested event.
 */
struct perf_task_event {
        /* Task the record is about. */
        struct task_struct                *task;
        /* Optional task context forwarded to perf_iterate_sb(); may be NULL. */
        struct perf_event_context        *task_ctx;

        /*
         * Record body; copied to the output buffer as-is with
         * perf_output_put(), so field order and sizes define the emitted
         * layout.
         */
        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                ppid;
                u32                                tid;
                u32                                ptid;
                u64                                time;
        } event_id;
};

/*
 * Returns non-zero when @event asked for any of the attributes that enable
 * task records: attr.comm, attr.mmap, attr.mmap2, attr.mmap_data or
 * attr.task.
 */
static int perf_event_task_match(struct perf_event *event)
{
        if (event->attr.comm || event->attr.mmap)
                return 1;

        if (event->attr.mmap2 || event->attr.mmap_data)
                return 1;

        return event->attr.task ? 1 : 0;
}

/*
 * Side-band callback: write one fork/exit record for @event.
 *
 * @data points at the struct perf_task_event shared by all events of the
 * current iteration.  header.size is remembered on entry and written back
 * on the way out, so per-event changes to it do not carry over to the next
 * event.
 */
static void perf_event_task_output(struct perf_event *event,
                                   void *data)
{
        struct perf_task_event *te = data;
        struct task_struct *task = te->task;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int saved_size = te->event_id.header.size;

        if (!perf_event_task_match(event))
                return;

        perf_event_header__init_id(&te->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              te->event_id.header.size))
                goto restore;

        te->event_id.pid = perf_event_pid(event, task);
        te->event_id.tid = perf_event_tid(event, task);

        if (te->event_id.header.type != PERF_RECORD_EXIT) {
                /* PERF_RECORD_FORK: the parent is the task doing the fork. */
                te->event_id.ppid = perf_event_pid(event, current);
                te->event_id.ptid = perf_event_tid(event, current);
        } else {
                /*
                 * NOTE(review): both ppid and ptid are filled from
                 * perf_event_pid() of real_parent on the exit path; kept
                 * exactly as-is since this is emitted record content.
                 */
                te->event_id.ppid = perf_event_pid(event,
                                                   task->real_parent);
                te->event_id.ptid = perf_event_pid(event,
                                                   task->real_parent);
        }

        te->event_id.time = perf_event_clock(event);

        perf_output_put(&handle, te->event_id);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);

restore:
        te->event_id.header.size = saved_size;
}

/*
 * Build a fork (@new != 0) or exit (@new == 0) record for @task and hand it
 * to every matching event through perf_iterate_sb().  @task_ctx is forwarded
 * unchanged to the iterator.  Does nothing when no comm, mmap or task events
 * are registered.
 */
static void perf_event_task(struct task_struct *task,
                              struct perf_event_context *task_ctx,
                              int new)
{
        struct perf_task_event task_event;

        if (!(atomic_read(&nr_comm_events) ||
              atomic_read(&nr_mmap_events) ||
              atomic_read(&nr_task_events)))
                return;

        /* pid/ppid/tid/ptid/time start out zero; the output callback fills them. */
        task_event = (struct perf_task_event){
                .task = task,
                .task_ctx = task_ctx,
                .event_id = {
                        .header = {
                                .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
                                .misc = 0,
                                .size = sizeof(task_event.event_id),
                        },
                },
        };

        perf_iterate_sb(perf_event_task_output, &task_event, task_ctx);
}

/*
 * Fork hook: emit a task record for @task with the "new" flag set (which
 * perf_event_task() maps to PERF_RECORD_FORK), followed by a namespaces
 * record for the same task.
 */
void perf_event_fork(struct task_struct *task)
{
        perf_event_task(task, NULL, 1);
        perf_event_namespaces(task);
}

/*
 * comm tracking
 */

/*
 * State shared between perf_event_comm(), perf_event_comm_event() and
 * perf_event_comm_output() while a comm record is delivered.
 */
struct perf_comm_event {
        /* Task whose comm is being reported. */
        struct task_struct        *task;
        /* Name buffer appended after event_id; set by perf_event_comm_event(). */
        char                        *comm;
        /* Number of bytes of @comm to emit (already rounded up to a u64 multiple). */
        int                        comm_size;

        /* Fixed part of the record, copied out as-is with perf_output_put(). */
        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
        } event_id;
};

/* Returns non-zero when @event has attr.comm set, i.e. wants comm records. */
static int perf_event_comm_match(struct perf_event *event)
{
        return event->attr.comm;
}

/*
 * Side-band callback: write one comm record for @event.
 *
 * The record is the fixed event_id part followed by comm_size bytes of the
 * name buffer.  header.size is saved on entry and restored on exit because
 * the same struct perf_comm_event is reused for every event in the
 * iteration.
 */
static void perf_event_comm_output(struct perf_event *event,
                                   void *data)
{
        struct perf_comm_event *ce = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int saved_size = ce->event_id.header.size;

        if (!perf_event_comm_match(event))
                return;

        perf_event_header__init_id(&ce->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              ce->event_id.header.size))
                goto restore;

        ce->event_id.pid = perf_event_pid(event, ce->task);
        ce->event_id.tid = perf_event_tid(event, ce->task);

        perf_output_put(&handle, ce->event_id);
        __output_copy(&handle, ce->comm, ce->comm_size);

        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);

restore:
        ce->event_id.header.size = saved_size;
}

/*
 * Snapshot the task's comm into a zero-filled stack buffer, compute the
 * u64-aligned length to emit, finish the header size and deliver the record
 * to all matching events.
 *
 * The buffer lives on this function's stack; comm_event->comm is only valid
 * for the duration of the perf_iterate_sb() call below.
 */
static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
        /* Fully zeroed so the alignment padding after the name is all NULs. */
        char name[TASK_COMM_LEN] = { 0 };
        unsigned int padded;

        strscpy(name, comm_event->task->comm, sizeof(name));
        padded = ALIGN(strlen(name) + 1, sizeof(u64));

        comm_event->comm = name;
        comm_event->comm_size = padded;
        comm_event->event_id.header.size = sizeof(comm_event->event_id) + padded;

        perf_iterate_sb(perf_event_comm_output, comm_event, NULL);
}

/*
 * Report a comm change for @task.  @exec selects whether the record carries
 * PERF_RECORD_MISC_COMM_EXEC in header.misc.  Returns immediately when no
 * comm events are registered.
 */
void perf_event_comm(struct task_struct *task, bool exec)
{
        struct perf_comm_event comm_event;

        if (atomic_read(&nr_comm_events) == 0)
                return;

        /*
         * comm, comm_size and header.size are filled in by
         * perf_event_comm_event(); pid and tid by the output callback.
         */
        comm_event = (struct perf_comm_event){
                .task = task,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_COMM,
                                .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
                        },
                },
        };

        perf_event_comm_event(&comm_event);
}

/*
 * namespaces tracking
 */

/*
 * State shared between perf_event_namespaces() and
 * perf_event_namespaces_output() while a namespaces record is delivered.
 */
struct perf_namespaces_event {
        /* Task whose namespaces are being reported. */
        struct task_struct                *task;

        /* Complete record body, copied out as-is with perf_output_put(). */
        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
                /* Number of entries in link_info[]; set to NR_NAMESPACES by the producer. */
                u64                                nr_namespaces;
                /* One dev/ino pair per namespace type, indexed by the *_NS_INDEX constants. */
                struct perf_ns_link_info        link_info[NR_NAMESPACES];
        } event_id;
};

/* Returns non-zero when @event has attr.namespaces set. */
static int perf_event_namespaces_match(struct perf_event *event)
{
        return event->attr.namespaces;
}

/*
 * Side-band callback: write one namespaces record for @event.
 *
 * pid/tid are resolved per event; the rest of the record was prepared by
 * perf_event_namespaces().  header.size is saved and restored around the
 * write since the record is shared across all events in the iteration.
 */
static void perf_event_namespaces_output(struct perf_event *event,
                                         void *data)
{
        struct perf_namespaces_event *ns_ev = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        u16 saved_size = ns_ev->event_id.header.size;

        if (!perf_event_namespaces_match(event))
                return;

        perf_event_header__init_id(&ns_ev->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              ns_ev->event_id.header.size))
                goto restore;

        ns_ev->event_id.pid = perf_event_pid(event, ns_ev->task);
        ns_ev->event_id.tid = perf_event_tid(event, ns_ev->task);

        perf_output_put(&handle, ns_ev->event_id);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);

restore:
        ns_ev->event_id.header.size = saved_size;
}

static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
                                   struct task_struct *task,
                                   const struct proc_ns_operations *ns_ops)
{
        struct path ns_path;
        struct inode *ns_inode;
        int error;

        error = ns_get_path(&ns_path, task, ns_ops);
        if (!error) {
                ns_inode = ns_path.dentry->d_inode;
                ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
                ns_link_info->ino = ns_inode->i_ino;
                path_put(&ns_path);
        }
}

/*
 * Emit a namespaces record for @task: collect dev/ino for each namespace
 * type that is compiled in, then deliver the record to all matching events.
 * Entries for namespace types that are configured out stay zero.  Returns
 * immediately when no namespaces events are registered.
 */
void perf_event_namespaces(struct task_struct *task)
{
        struct perf_namespaces_event namespaces_event;
        struct perf_ns_link_info *links;

        if (atomic_read(&nr_namespaces_events) == 0)
                return;

        /* pid/tid are filled per event by the output callback; link_info[] below. */
        namespaces_event = (struct perf_namespaces_event){
                .task = task,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_NAMESPACES,
                                .misc = 0,
                                .size = sizeof(namespaces_event.event_id),
                        },
                        .nr_namespaces = NR_NAMESPACES,
                },
        };

        links = namespaces_event.event_id.link_info;

        perf_fill_ns_link_info(&links[MNT_NS_INDEX], task, &mntns_operations);

#ifdef CONFIG_USER_NS
        perf_fill_ns_link_info(&links[USER_NS_INDEX], task, &userns_operations);
#endif
#ifdef CONFIG_NET_NS
        perf_fill_ns_link_info(&links[NET_NS_INDEX], task, &netns_operations);
#endif
#ifdef CONFIG_UTS_NS
        perf_fill_ns_link_info(&links[UTS_NS_INDEX], task, &utsns_operations);
#endif
#ifdef CONFIG_IPC_NS
        perf_fill_ns_link_info(&links[IPC_NS_INDEX], task, &ipcns_operations);
#endif
#ifdef CONFIG_PID_NS
        perf_fill_ns_link_info(&links[PID_NS_INDEX], task, &pidns_operations);
#endif
#ifdef CONFIG_CGROUPS
        perf_fill_ns_link_info(&links[CGROUP_NS_INDEX], task, &cgroupns_operations);
#endif

        perf_iterate_sb(perf_event_namespaces_output, &namespaces_event, NULL);
}

/*
 * cgroup tracking
 */
#ifdef CONFIG_CGROUP_PERF

/*
 * State shared between perf_event_cgroup() and perf_event_cgroup_output()
 * while a cgroup record is delivered.
 */
struct perf_cgroup_event {
        /* Path string emitted after event_id (heap buffer or the "//enomem" fallback). */
        char                                *path;
        /* Bytes of @path to emit, NUL padding included (a multiple of sizeof(u64)). */
        int                                path_size;
        /* Fixed part of the record, copied out as-is with perf_output_put(). */
        struct {
                struct perf_event_header        header;
                u64                                id;
                /* Zero-length here; the path bytes are written separately from @path. */
                char                                path[];
        } event_id;
};

/* Returns non-zero when @event has attr.cgroup set. */
static int perf_event_cgroup_match(struct perf_event *event)
{
        return event->attr.cgroup;
}

/*
 * Side-band callback: write one cgroup record for @event, consisting of the
 * fixed event_id part followed by path_size bytes of the path string.
 * header.size is saved and restored because the record is shared across all
 * events in the iteration.
 */
static void perf_event_cgroup_output(struct perf_event *event, void *data)
{
        struct perf_cgroup_event *cg_ev = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        u16 saved_size = cg_ev->event_id.header.size;

        if (!perf_event_cgroup_match(event))
                return;

        perf_event_header__init_id(&cg_ev->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              cg_ev->event_id.header.size))
                goto restore;

        perf_output_put(&handle, cg_ev->event_id);
        __output_copy(&handle, cg_ev->path, cg_ev->path_size);

        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);

restore:
        cg_ev->event_id.header.size = saved_size;
}

/*
 * Emit a cgroup record for @cgrp carrying its id and path.  When the path
 * buffer cannot be allocated the fixed string "//enomem" is reported
 * instead.  Returns immediately when no cgroup events are registered.
 */
static void perf_event_cgroup(struct cgroup *cgrp)
{
        struct perf_cgroup_event cgroup_event;
        char fallback[16] = "//enomem";
        char *buf;
        size_t len;

        if (atomic_read(&nr_cgroup_events) == 0)
                return;

        cgroup_event = (struct perf_cgroup_event){
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_CGROUP,
                                .misc = 0,
                                .size = sizeof(cgroup_event.event_id),
                        },
                        .id = cgroup_id(cgrp),
                },
        };

        buf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (buf != NULL) {
                /* Leave sizeof(u64) bytes spare so the padding below always fits. */
                cgroup_path(cgrp, buf, PATH_MAX - sizeof(u64));
                cgroup_event.path = buf;
        } else {
                cgroup_event.path = fallback;
        }

        /*
         * The output buffer works in 8 byte units, so round the string size
         * (terminator included) up to a multiple of 8, writing explicit NULs
         * into the tail to avoid leaking random bits to userspace.
         */
        for (len = strlen(cgroup_event.path) + 1;
             !IS_ALIGNED(len, sizeof(u64)); len++)
                cgroup_event.path[len] = '\0';

        cgroup_event.path_size = len;
        cgroup_event.event_id.header.size += len;

        perf_iterate_sb(perf_event_cgroup_output, &cgroup_event, NULL);

        /* buf is NULL on the fallback path; kfree() is still called unconditionally. */
        kfree(buf);
}

#endif

/*
 * mmap tracking
 */

/*
 * State shared between perf_event_mmap(), perf_event_mmap_event() and
 * perf_event_mmap_output() while an mmap record is delivered.
 */
struct perf_mmap_event {
        /* Mapping being reported. */
        struct vm_area_struct        *vma;

        /* Name string emitted at the end of the record, and its padded length. */
        const char                *file_name;
        int                        file_size;
        /* The following six values are only emitted for attr.mmap2 events. */
        int                        maj, min;
        u64                        ino;
        u64                        ino_generation;
        u32                        prot, flags;
        /* Build ID bytes and length; emitted in place of maj/min/ino/generation when used. */
        u8                        build_id[BUILD_ID_SIZE_MAX];
        u32                        build_id_size;

        /* Fixed part of the record, copied out as-is with perf_output_put(). */
        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
                u64                                start;
                u64                                len;
                u64                                pgoff;
        } event_id;
};

/*
 * Decide whether @event wants the mapping described by @data (a struct
 * perf_mmap_event): executable mappings match attr.mmap or attr.mmap2,
 * non-executable mappings match attr.mmap_data.
 */
static int perf_event_mmap_match(struct perf_event *event,
                                 void *data)
{
        struct perf_mmap_event *me = data;

        if (me->vma->vm_flags & VM_EXEC)
                return event->attr.mmap || event->attr.mmap2;

        return event->attr.mmap_data ? 1 : 0;
}

static void perf_event_mmap_output(struct perf_event *event,
                                   void *data)
{
        struct perf_mmap_event *mmap_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int size = mmap_event->event_id.header.size;
        u32 type = mmap_event->event_id.header.type;
        bool use_build_id;
        int ret;

        if (!perf_event_mmap_match(event, data))
                return;

        if (event->attr.mmap2) {
                mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
                mmap_event->event_id.header.size += sizeof(mmap_event->maj);
                mmap_event->event_id.header.size += sizeof(mmap_event->min);
                mmap_event->event_id.header.size += sizeof(mmap_event->ino);
                mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
                mmap_event->event_id.header.size += sizeof(mmap_event->prot);
                mmap_event->event_id.header.size += sizeof(mmap_event->flags);
        }

        perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
        ret = perf_output_begin(&handle, &sample, event,
                                mmap_event->event_id.header.size);
        if (ret)
                goto out;

        mmap_event->event_id.pid = perf_event_pid(event, current);
        mmap_event->event_id.tid = perf_event_tid(event, current);

        use_build_id = event->attr.build_id && mmap_event->build_id_size;

        if (event->attr.mmap2 && use_build_id)
                mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;

        perf_output_put(&handle, mmap_event->event_id);

        if (event->attr.mmap2) {
                if (use_build_id) {
                        u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };

                        __output_copy(&handle, size, 4);
                        __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
                } else {
                        perf_output_put(&handle, mmap_event->maj);
                        perf_output_put(&handle, mmap_event->min);
                        perf_output_put(&handle, mmap_event->ino);
                        perf_output_put(&handle, mmap_event->ino_generation);
                }
                perf_output_put(&handle, mmap_event->prot);
                perf_output_put(&handle, mmap_event->flags);
        }

        __output_copy(&handle, mmap_event->file_name,
                                   mmap_event->file_size);

        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
out:
        mmap_event->event_id.header.size = size;
        mmap_event->event_id.header.type = type;
}

/*
 * Fill in the parts of @mmap_event that are derived from the vma (prot,
 * flags, device/inode identity and the name string), finish the header and
 * deliver the record to all matching events.
 *
 * Name selection:
 *  - file-backed vma: the file's path, or "//enomem" / "//toolong" when the
 *    path buffer cannot be allocated or the path lookup fails;
 *  - otherwise: vm_ops->name(), then arch_vma_name(), then "[heap]",
 *    "[stack]" or "//anon".
 *
 * Every name except a successfully resolved file path is first copied into
 * the local tmp[] array so that it can be padded in place.
 */
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
        struct vm_area_struct *vma = mmap_event->vma;
        struct file *file = vma->vm_file;
        int maj = 0, min = 0;
        u64 ino = 0, gen = 0;
        u32 prot = 0, flags = 0;
        unsigned int size;
        char tmp[16];
        char *buf = NULL;
        char *name = NULL;

        /* Translate the vma's VM_* bits into PROT_* and MAP_* values. */
        if (vma->vm_flags & VM_READ)
                prot |= PROT_READ;
        if (vma->vm_flags & VM_WRITE)
                prot |= PROT_WRITE;
        if (vma->vm_flags & VM_EXEC)
                prot |= PROT_EXEC;

        if (vma->vm_flags & VM_MAYSHARE)
                flags = MAP_SHARED;
        else
                flags = MAP_PRIVATE;

        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;
        if (is_vm_hugetlb_page(vma))
                flags |= MAP_HUGETLB;

        if (file) {
                struct inode *inode;
                dev_t dev;

                buf = kmalloc(PATH_MAX, GFP_KERNEL);
                if (!buf) {
                        name = "//enomem";
                        goto cpy_name;
                }
                /*
                 * d_path() works from the end of the rb backwards, so we
                 * need to add enough zero bytes after the string to handle
                 * the 64bit alignment we do later.
                 */
                name = file_path(file, buf, PATH_MAX - sizeof(u64));
                if (IS_ERR(name)) {
                        name = "//toolong";
                        goto cpy_name;
                }
                /* Device/inode identity is only recorded when the path was resolved. */
                inode = file_inode(vma->vm_file);
                dev = inode->i_sb->s_dev;
                ino = inode->i_ino;
                gen = inode->i_generation;
                maj = MAJOR(dev);
                min = MINOR(dev);

                /* name points into buf, which is writable: skip the copy into tmp. */
                goto got_name;
        } else {
                if (vma->vm_ops && vma->vm_ops->name)
                        name = (char *) vma->vm_ops->name(vma);
                if (!name)
                        name = (char *)arch_vma_name(vma);
                if (!name) {
                        if (vma_is_initial_heap(vma))
                                name = "[heap]";
                        else if (vma_is_initial_stack(vma))
                                name = "[stack]";
                        else
                                name = "//anon";
                }
        }

cpy_name:
        /* Bounded copy into a local buffer so the padding loop below can write to it. */
        strscpy(tmp, name, sizeof(tmp));
        name = tmp;
got_name:
        /*
         * Since our buffer works in 8 byte units we need to align our string
         * size to a multiple of 8. However, we must guarantee the tail end is
         * zero'd out to avoid leaking random bits to userspace.
         */
        size = strlen(name)+1;
        while (!IS_ALIGNED(size, sizeof(u64)))
                name[size++] = '\0';

        mmap_event->file_name = name;
        mmap_event->file_size = size;
        mmap_event->maj = maj;
        mmap_event->min = min;
        mmap_event->ino = ino;
        mmap_event->ino_generation = gen;
        mmap_event->prot = prot;
        mmap_event->flags = flags;

        if (!(vma->vm_flags & VM_EXEC))
                mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;

        /* Base record size; the output callback adds the MMAP2 extras per event. */
        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;

        /* Only attempt build ID parsing when at least one build-ID event is registered. */
        if (atomic_read(&nr_build_id_events))
                build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);

        perf_iterate_sb(perf_event_mmap_output,
                       mmap_event,
                       NULL);

        /* buf is NULL for non-file mappings and on allocation failure. */
        kfree(buf);
}

/*
 * Check whether inode and address range match filter criteria.
 *
 * Returns true when @filter is bound to the same inode as @file and the
 * filter's [offset, offset + size] span is not entirely before or entirely
 * after the [@offset, @offset + @size] span.
 */
static bool perf_addr_filter_match(struct perf_addr_filter *filter,
                                     struct file *file, unsigned long offset,
                                     unsigned long size)
{
        /* d_inode(NULL) won't be equal to any mapped user-space file */
        if (!filter->path.dentry ||
            d_inode(filter->path.dentry) != file_inode(file))
                return false;

        /* Reject when the filter starts past the end or ends before the start. */
        return !(filter->offset > offset + size) &&
               !(filter->offset + filter->size < offset);
}

/*
 * Translate a file-offset based @filter into an address range within @vma.
 *
 * Returns false (leaving @fr untouched) when the filter does not apply to
 * the vma's file and offset span.  Otherwise stores in @fr the start address
 * and size of the part of the filter that falls inside the mapping, and
 * returns true.
 */
static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
                                        struct vm_area_struct *vma,
                                        struct perf_addr_filter_range *fr)
{
        unsigned long map_len = vma->vm_end - vma->vm_start;
        unsigned long map_off = vma->vm_pgoff << PAGE_SHIFT;
        unsigned long start, len;

        if (!perf_addr_filter_match(filter, vma->vm_file, map_off, map_len))
                return false;

        if (filter->offset >= map_off) {
                /* Filter begins inside the mapping: skip ahead to it. */
                start = vma->vm_start + filter->offset - map_off;
                len = min(vma->vm_end - start, filter->size);
        } else {
                /* Filter begins before the mapping: clip its leading part. */
                start = vma->vm_start;
                len = min(map_len, filter->size - (map_off - filter->offset));
        }

        fr->start = start;
        fr->size = len;

        return true;
}

static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct vm_area_struct *vma = data;
        struct perf_addr_filter *filter;
        unsigned int restart = 0, count = 0;
        unsigned long flags;

        if (!has_addr_filter(event))
                return;

        if (!vma->vm_file)
                return;

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                if (perf_addr_filter_vma_adjust(filter, vma,
                                                &event->addr_filter_ranges[count]))
                        restart++;

                count++;
        }

        if (restart)
                event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        if (restart)
                perf_event_stop(event, 1);
}

/*
 * Adjust all task's events' filters to the new vma
 */
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
{
        struct perf_event_context *ctx;

        /*
         * Data tracing isn't supported yet and as such there is no need
         * to keep track of anything that isn't related to executable code:
         */
        if (!(vma->vm_flags & VM_EXEC))
                return;

        rcu_read_lock();
        ctx = rcu_dereference(current->perf_event_ctxp);
        if (ctx)
                perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
        rcu_read_unlock();
}

/*
 * Report a new mapping @vma: adjust the current task's address filters and
 * emit an mmap record.  Returns immediately, doing neither, when no mmap
 * events are registered.
 */
void perf_event_mmap(struct vm_area_struct *vma)
{
        struct perf_mmap_event mmap_event;

        if (atomic_read(&nr_mmap_events) == 0)
                return;

        /*
         * Only the fields known at this point are set here.  file_name,
         * file_size, header.size and the mmap2-only fields (maj, min, ino,
         * ino_generation, prot, flags) are filled in by
         * perf_event_mmap_event(); pid and tid by the output callback.
         */
        mmap_event = (struct perf_mmap_event){
                .vma = vma,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_MMAP,
                                .misc = PERF_RECORD_MISC_USER,
                        },
                        .start = vma->vm_start,
                        .len = vma->vm_end - vma->vm_start,
                        .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
                },
        };

        perf_addr_filters_adjust(vma);
        perf_event_mmap_event(&mmap_event);
}

/*
 * Write a PERF_RECORD_AUX record for @event carrying @head (as the record's
 * offset field), @size and @flags.  If output space cannot be reserved the
 * record is silently dropped.
 */
void perf_event_aux_event(struct perf_event *event, unsigned long head,
                          unsigned long size, u64 flags)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_aux_event {
                struct perf_event_header        header;
                u64                             offset;
                u64                             size;
                u64                             flags;
        } rec = {
                .header = {
                        .type = PERF_RECORD_AUX,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .offset = head,
                .size = size,
                .flags = flags,
        };

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * Lost/dropped samples logging
 *
 * Write a PERF_RECORD_LOST_SAMPLES record for @event reporting @lost.  If
 * output space cannot be reserved the record is silently dropped.
 */
void perf_log_lost_samples(struct perf_event *event, u64 lost)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct {
                struct perf_event_header        header;
                u64                             lost;
        } rec = {
                .header = {
                        .type = PERF_RECORD_LOST_SAMPLES,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .lost = lost,
        };

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * context_switch tracking
 */

/*
 * State shared between perf_event_switch() and perf_event_switch_output()
 * while a context-switch record is delivered.
 */
struct perf_switch_event {
        /* Task the switch record is generated for. */
        struct task_struct        *task;
        /* The other task involved in the switch; its pid/tid go into the CPU-wide record. */
        struct task_struct        *next_prev;

        /*
         * Record body.  Per-task events emit only the header; CPU-wide
         * events emit the whole structure.
         */
        struct {
                struct perf_event_header        header;
                u32                                next_prev_pid;
                u32                                next_prev_tid;
        } event_id;
};

/* Returns non-zero when @event has attr.context_switch set. */
static int perf_event_switch_match(struct perf_event *event)
{
        return event->attr.context_switch;
}

/*
 * Callback that emits one context-switch record for @event.
 *
 * @data points to a struct perf_switch_event. Its header type/size (and,
 * for CPU-wide events, the next/prev pid/tid) are rewritten here for each
 * receiving event before the record is copied out.
 */
static void perf_event_switch_output(struct perf_event *event, void *data)
{
        struct perf_switch_event *sw = data;
        struct perf_sample_data sample;
        struct perf_output_handle handle;

        if (!perf_event_switch_match(event))
                return;

        /* Only CPU-wide events are allowed to see next/prev pid/tid */
        if (!event->ctx->task) {
                sw->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
                sw->event_id.header.size = sizeof(sw->event_id);
                sw->event_id.next_prev_pid =
                                perf_event_pid(event, sw->next_prev);
                sw->event_id.next_prev_tid =
                                perf_event_tid(event, sw->next_prev);
        } else {
                sw->event_id.header.type = PERF_RECORD_SWITCH;
                sw->event_id.header.size = sizeof(sw->event_id.header);
        }

        perf_event_header__init_id(&sw->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              sw->event_id.header.size))
                return;

        /* Copy out exactly the part that header.size was computed from. */
        if (!event->ctx->task)
                perf_output_put(&handle, sw->event_id);
        else
                perf_output_put(&handle, sw->event_id.header);

        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * Build a context-switch record for @task / @next_prev and pass it to
 * perf_iterate_sb() with perf_event_switch_output() as the callback.
 *
 * A switch-out (@sched_in false) is flagged PERF_RECORD_MISC_SWITCH_OUT,
 * and additionally PERF_RECORD_MISC_SWITCH_OUT_PREEMPT when @task->on_rq
 * is non-zero.
 */
static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in)
{
        /*
         * header.type, header.size and the pid/tid fields start out zero;
         * perf_event_switch_output() fills them in per receiving event.
         */
        struct perf_switch_event switch_event = {
                .task                = task,
                .next_prev        = next_prev,
                .event_id.header.misc =
                        sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
        };

        /* N.B. caller checks nr_switch_events != 0 */

        if (!sched_in && task->on_rq)
                switch_event.event_id.header.misc |=
                                PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;

        perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
}

/*
 * IRQ throttle logging
 */

/*
 * Write a PERF_RECORD_THROTTLE (@enable == 0) or PERF_RECORD_UNTHROTTLE
 * (@enable != 0) record for @event, carrying the event clock, the primary
 * event id and the event's own id as stream id.
 */
static void perf_log_throttle(struct perf_event *event, int enable)
{
        struct perf_sample_data sample;
        struct perf_output_handle handle;
        struct {
                struct perf_event_header        header;
                u64                                time;
                u64                                id;
                u64                                stream_id;
        } rec = {
                .header = {
                        .type = enable ? PERF_RECORD_UNTHROTTLE
                                       : PERF_RECORD_THROTTLE,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .time                = perf_event_clock(event),
                .id                = primary_event_id(event),
                .stream_id        = event->id,
        };

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * ksymbol register/unregister tracking
 */

/*
 * State built by perf_event_ksymbol() and consumed by
 * perf_event_ksymbol_output().
 */
struct perf_ksymbol_event {
        /* Symbol name buffer; name_len bytes of it follow event_id in the record. */
        const char        *name;
        /* Length of the name including its NUL padding to a u64 multiple. */
        int                name_len;
        /* Fixed-size part of the record as copied to the output buffer. */
        struct {
                struct perf_event_header        header;
                u64                                addr;
                u32                                len;
                u16                                ksym_type;
                u16                                flags;
        } event_id;
};

/* Return non-zero when @event has attr.ksymbol set. */
static int perf_event_ksymbol_match(struct perf_event *event)
{
        return event->attr.ksymbol;
}

/*
 * Callback that emits one ksymbol record for @event.
 *
 * @data points to a struct perf_ksymbol_event. Events without
 * attr.ksymbol are skipped.
 */
static void perf_event_ksymbol_output(struct perf_event *event, void *data)
{
        struct perf_ksymbol_event *ksym = data;
        struct perf_sample_data sample;
        struct perf_output_handle handle;

        if (!perf_event_ksymbol_match(event))
                return;

        perf_event_header__init_id(&ksym->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              ksym->event_id.header.size))
                return;

        /* Fixed part first, then name_len bytes of the name. */
        perf_output_put(&handle, ksym->event_id);
        __output_copy(&handle, ksym->name, ksym->name_len);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * Emit a PERF_RECORD_KSYMBOL record describing a kernel symbol.
 *
 * @ksym_type:  symbol type; UNKNOWN or anything >= MAX triggers a
 *              one-time warning and nothing is emitted
 * @addr:       symbol address
 * @len:        symbol length
 * @unregister: true sets PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER
 * @sym:        symbol name, copied (bounded by KSYM_NAME_LEN) and
 *              NUL-padded to a multiple of sizeof(u64)
 *
 * Does nothing when nr_ksymbol_events reads as zero.
 */
void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
                        const char *sym)
{
        struct perf_ksymbol_event ksymbol_event;
        char name[KSYM_NAME_LEN];
        int name_len;

        if (!atomic_read(&nr_ksymbol_events))
                return;

        if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
            ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) {
                WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
                return;
        }

        /* The padding loop below relies on the buffer size being u64-aligned. */
        BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));

        strscpy(name, sym, KSYM_NAME_LEN);
        for (name_len = strlen(name) + 1;
             !IS_ALIGNED(name_len, sizeof(u64));
             name_len++)
                name[name_len] = '\0';

        ksymbol_event = (struct perf_ksymbol_event){
                .name = name,
                .name_len = name_len,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_KSYMBOL,
                                .size = sizeof(ksymbol_event.event_id) +
                                        name_len,
                        },
                        .addr = addr,
                        .len = len,
                        .ksym_type = ksym_type,
                        .flags = unregister ?
                                 PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER : 0,
                },
        };

        perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
}

/*
 * bpf program load/unload tracking
 */

/*
 * State built by perf_event_bpf_event() and consumed by
 * perf_event_bpf_output().
 */
struct perf_bpf_event {
        /* Program the record is about. */
        struct bpf_prog        *prog;
        /* Record image copied to the output buffer. */
        struct {
                struct perf_event_header        header;
                u16                                type;
                u16                                flags;
                u32                                id;
                u8                                tag[BPF_TAG_SIZE];
        } event_id;
};

/* Return non-zero when @event has attr.bpf_event set. */
static int perf_event_bpf_match(struct perf_event *event)
{
        return event->attr.bpf_event;
}

/*
 * Callback that emits one BPF record for @event.
 *
 * @data points to a struct perf_bpf_event. Events without attr.bpf_event
 * are skipped.
 */
static void perf_event_bpf_output(struct perf_event *event, void *data)
{
        struct perf_bpf_event *bpf = data;
        struct perf_sample_data sample;
        struct perf_output_handle handle;

        if (!perf_event_bpf_match(event))
                return;

        perf_event_header__init_id(&bpf->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              bpf->event_id.header.size))
                return;

        perf_output_put(&handle, bpf->event_id);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
                                         enum perf_bpf_event_type type)
{
        bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
        int i;

        if (prog->aux->func_cnt == 0) {
                perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
                                   (u64)(unsigned long)prog->bpf_func,
                                   prog->jited_len, unregister,
                                   prog->aux->ksym.name);
        } else {
                for (i = 0; i < prog->aux->func_cnt; i++) {
                        struct bpf_prog *subprog = prog->aux->func[i];

                        perf_event_ksymbol(
                                PERF_RECORD_KSYMBOL_TYPE_BPF,
                                (u64)(unsigned long)subprog->bpf_func,
                                subprog->jited_len, unregister,
                                subprog->aux->ksym.name);
                }
        }
}

/*
 * Report a BPF program load/unload.
 *
 * Any @type other than PERF_BPF_EVENT_PROG_LOAD / _UNLOAD is ignored.
 * Ksymbol records are emitted first when nr_ksymbol_events is non-zero;
 * the PERF_RECORD_BPF_EVENT record itself is emitted only when
 * nr_bpf_events is non-zero.
 */
void perf_event_bpf_event(struct bpf_prog *prog,
                          enum perf_bpf_event_type type,
                          u16 flags)
{
        struct perf_bpf_event bpf_event;

        if (type != PERF_BPF_EVENT_PROG_LOAD &&
            type != PERF_BPF_EVENT_PROG_UNLOAD)
                return;

        if (atomic_read(&nr_ksymbol_events))
                perf_event_bpf_emit_ksymbols(prog, type);

        if (!atomic_read(&nr_bpf_events))
                return;

        /* The tag is the tail of the record and must keep it u64-sized. */
        BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));

        bpf_event = (struct perf_bpf_event){
                .prog = prog,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_BPF_EVENT,
                                .size = sizeof(bpf_event.event_id),
                        },
                        .type = type,
                        .flags = flags,
                        .id = prog->aux->id,
                },
        };
        memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);

        perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
}

/*
 * State built by perf_event_text_poke() and consumed by
 * perf_event_text_poke_output().
 */
struct perf_text_poke_event {
        /* Source buffers for the old and new bytes written after the lengths. */
        const void                *old_bytes;
        const void                *new_bytes;
        /* Number of zero bytes appended so the variable part is u64-aligned. */
        size_t                        pad;
        /* Lengths as written into the record (16-bit each). */
        u16                        old_len;
        u16                        new_len;

        /* Fixed-size leading part of the record. */
        struct {
                struct perf_event_header        header;

                u64                                addr;
        } event_id;
};

/* Return non-zero when @event has attr.text_poke set. */
static int perf_event_text_poke_match(struct perf_event *event)
{
        return event->attr.text_poke;
}

/*
 * Callback that emits one text-poke record for @event.
 *
 * Layout written: event_id, old_len, new_len, old bytes, new bytes, then
 * up to sizeof(u64) bytes of zero padding, followed by the id sample.
 * @data points to a struct perf_text_poke_event.
 */
static void perf_event_text_poke_output(struct perf_event *event, void *data)
{
        struct perf_text_poke_event *poke = data;
        struct perf_sample_data sample;
        struct perf_output_handle handle;
        u64 zeros = 0;

        if (!perf_event_text_poke_match(event))
                return;

        perf_event_header__init_id(&poke->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              poke->event_id.header.size))
                return;

        perf_output_put(&handle, poke->event_id);
        perf_output_put(&handle, poke->old_len);
        perf_output_put(&handle, poke->new_len);

        __output_copy(&handle, poke->old_bytes, poke->old_len);
        __output_copy(&handle, poke->new_bytes, poke->new_len);

        if (poke->pad)
                __output_copy(&handle, &zeros, poke->pad);

        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * Emit a PERF_RECORD_TEXT_POKE record for a code modification at @addr.
 *
 * @old_bytes/@old_len and @new_bytes/@new_len describe the bytes before
 * and after the change. The lengths are stored in 16-bit fields. The
 * variable part is padded to a multiple of sizeof(u64).
 *
 * Does nothing when nr_text_poke_events reads as zero.
 */
void perf_event_text_poke(const void *addr, const void *old_bytes,
                          size_t old_len, const void *new_bytes, size_t new_len)
{
        struct perf_text_poke_event text_poke_event;
        size_t payload, pad;

        if (!atomic_read(&nr_text_poke_events))
                return;

        /* Two length fields plus both byte strings. */
        payload = sizeof(text_poke_event.old_len) + old_len +
                  sizeof(text_poke_event.new_len) + new_len;
        pad = ALIGN(payload, sizeof(u64)) - payload;

        text_poke_event = (struct perf_text_poke_event){
                .old_bytes    = old_bytes,
                .new_bytes    = new_bytes,
                .pad          = pad,
                .old_len      = old_len,
                .new_len      = new_len,
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_TEXT_POKE,
                                .misc = PERF_RECORD_MISC_KERNEL,
                                .size = sizeof(text_poke_event.event_id) +
                                        payload + pad,
                        },
                        .addr = (unsigned long)addr,
                },
        };

        perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
}

/*
 * Set PERF_ATTACH_ITRACE in @event's attach_state. perf_log_itrace_start()
 * returns early once this bit is set.
 */
void perf_event_itrace_started(struct perf_event *event)
{
        event->attach_state |= PERF_ATTACH_ITRACE;
}

/*
 * Write a PERF_RECORD_ITRACE_START record with the current task's pid/tid.
 *
 * The record is attributed to the parent event when there is one. Nothing
 * is written unless the PMU advertises PERF_PMU_CAP_ITRACE, and nothing is
 * written once PERF_ATTACH_ITRACE is already set on the event.
 */
static void perf_log_itrace_start(struct perf_event *event)
{
        struct perf_sample_data sample;
        struct perf_output_handle handle;
        struct perf_aux_event {
                struct perf_event_header        header;
                u32                                pid;
                u32                                tid;
        } rec;

        if (event->parent)
                event = event->parent;

        if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE))
                return;
        if (event->attach_state & PERF_ATTACH_ITRACE)
                return;

        rec.header.type = PERF_RECORD_ITRACE_START;
        rec.header.misc = 0;
        rec.header.size = sizeof(rec);
        rec.pid = perf_event_pid(event, current);
        rec.tid = perf_event_tid(event, current);

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * Write a PERF_RECORD_AUX_OUTPUT_HW_ID record carrying @hw_id.
 *
 * The record is attributed to the parent event when @event has one.
 */
void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
{
        struct perf_sample_data sample;
        struct perf_output_handle handle;
        struct perf_aux_event {
                struct perf_event_header        header;
                u64                                hw_id;
        } rec;

        if (event->parent)
                event = event->parent;

        rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID;
        rec.header.misc = 0;
        rec.header.size = sizeof(rec);
        rec.hw_id = hw_id;

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}
EXPORT_SYMBOL_GPL(perf_report_aux_output_id);

/*
 * Account one interrupt for @event and, when @throttle is non-zero,
 * throttle the event once it exceeds max_samples_per_tick.
 *
 * Returns 1 when the event was throttled by this call, 0 otherwise.
 */
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
        struct hw_perf_event *hwc = &event->hw;
        int ret = 0;
        u64 seq;

        /*
         * A changed per-CPU sequence number restarts the interrupt count
         * at 1; otherwise keep counting within the current sequence.
         */
        seq = __this_cpu_read(perf_throttled_seq);
        if (seq != hwc->interrupts_seq) {
                hwc->interrupts_seq = seq;
                hwc->interrupts = 1;
        } else {
                hwc->interrupts++;
                if (unlikely(throttle &&
                             hwc->interrupts > max_samples_per_tick)) {
                        __this_cpu_inc(perf_throttled_count);
                        tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
                        /* MAX_INTERRUPTS marks the event as throttled. */
                        hwc->interrupts = MAX_INTERRUPTS;
                        perf_log_throttle(event, 0);
                        ret = 1;
                }
        }

        if (event->attr.freq) {
                u64 now = perf_clock();
                s64 delta = now - hwc->freq_time_stamp;

                hwc->freq_time_stamp = now;

                /* Only re-tune the period for deltas inside (0, 2 ticks). */
                if (delta > 0 && delta < 2*TICK_NSEC)
                        perf_adjust_period(event, delta, hwc->last_period, true);
        }

        return ret;
}

/*
 * Account one interrupt for @event with throttling enabled.
 * Returns the result of __perf_event_account_interrupt().
 */
int perf_event_account_interrupt(struct perf_event *event)
{
        return __perf_event_account_interrupt(event, 1);
}

/*
 * Tell whether a sample taken with @regs may be reported for @event:
 * false only for an exclude_kernel event whose @regs are not user mode.
 */
static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
{
        /*
         * Due to interrupt latency (AKA "skid"), we may enter the
         * kernel before taking an overflow, even if the PMU is only
         * counting user events.
         */
        return !event->attr.exclude_kernel || user_mode(regs);
}

#ifdef CONFIG_BPF_SYSCALL
/*
 * Run the BPF program attached to @event, if any, for this overflow.
 *
 * Returns the program's return value. Returns 0 when no program is
 * attached, and also when the per-CPU bpf_prog_active counter was already
 * non-zero before this call, in which case the program is not run.
 */
static int bpf_overflow_handler(struct perf_event *event,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        struct bpf_perf_event_data_kern ctx = {
                .data = data,
                .event = event,
        };
        struct bpf_prog *prog;
        int ret = 0;

        ctx.regs = perf_arch_bpf_user_pt_regs(regs);
        /* The increment is always undone at "out", even on this early exit. */
        if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
                goto out;
        rcu_read_lock();
        prog = READ_ONCE(event->prog);
        if (prog) {
                /* Fill in sample data before the program can look at it. */
                perf_prepare_sample(data, event, regs);
                ret = bpf_prog_run(prog, &ctx);
        }
        rcu_read_unlock();
out:
        __this_cpu_dec(bpf_prog_active);

        return ret;
}

/*
 * Attach @prog (with @bpf_cookie) as the overflow BPF program of @event.
 *
 * Returns 0 on success, or:
 *   -EINVAL  the event has an overflow_handler_context, or @prog is not
 *            of type BPF_PROG_TYPE_PERF_EVENT
 *   -EEXIST  a program is already attached
 *   -EPROTO  @prog calls bpf_get_[stack|stackid] on a precise_ip event
 *            that does not sample a full callchain
 */
static inline int perf_event_set_bpf_handler(struct perf_event *event,
                                             struct bpf_prog *prog,
                                             u64 bpf_cookie)
{
        /* hw breakpoint or kernel counter */
        if (event->overflow_handler_context)
                return -EINVAL;

        if (event->prog)
                return -EEXIST;

        if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
                return -EINVAL;

        if (event->attr.precise_ip && prog->call_get_stack) {
                /*
                 * On perf_event with precise_ip, calling bpf_get_stack()
                 * may trigger unwinder warnings and occasional crashes.
                 * bpf_get_[stack|stackid] works around this issue by using
                 * callchain attached to perf_sample_data. If the
                 * perf_event does not have a full (kernel and user)
                 * callchain attached to perf_sample_data, do not allow
                 * attaching a BPF program that calls
                 * bpf_get_[stack|stackid].
                 */
                if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
                    event->attr.exclude_callchain_kernel ||
                    event->attr.exclude_callchain_user)
                        return -EPROTO;
        }

        event->prog = prog;
        event->bpf_cookie = bpf_cookie;
        return 0;
}

/*
 * Detach the BPF program from @event, if one is attached, and drop the
 * reference with bpf_prog_put(). The pointer is cleared before the put.
 */
static inline void perf_event_free_bpf_handler(struct perf_event *event)
{
        struct bpf_prog *attached = event->prog;

        if (attached) {
                event->prog = NULL;
                bpf_prog_put(attached);
        }
}
#else
/* Stub for builds without CONFIG_BPF_SYSCALL: always returns 1. */
static inline int bpf_overflow_handler(struct perf_event *event,
                                       struct perf_sample_data *data,
                                       struct pt_regs *regs)
{
        return 1;
}

/* Stub for builds without CONFIG_BPF_SYSCALL: always fails with -EOPNOTSUPP. */
static inline int perf_event_set_bpf_handler(struct perf_event *event,
                                             struct bpf_prog *prog,
                                             u64 bpf_cookie)
{
        return -EOPNOTSUPP;
}

/* Stub for builds without CONFIG_BPF_SYSCALL: nothing to free. */
static inline void perf_event_free_bpf_handler(struct perf_event *event)
{
}
#endif

/*
 * Generic event overflow handling, sampling.
 */

/*
 * Handle one overflow of @event.
 *
 * @throttle is forwarded to __perf_event_account_interrupt(). @data and
 * @regs describe the sample and are handed to the BPF program (if any)
 * and to the event's overflow handler.
 *
 * Returns 0 for non-sampling events. Otherwise returns the accounting
 * result, or 1 when this overflow exhausted event_limit and the event
 * was disabled.
 */
static int __perf_event_overflow(struct perf_event *event,
                                 int throttle, struct perf_sample_data *data,
                                 struct pt_regs *regs)
{
        int events = atomic_read(&event->event_limit);
        int ret = 0;

        /*
         * Non-sampling counters might still use the PMI to fold short
         * hardware counters, ignore those.
         */
        if (unlikely(!is_sampling_event(event)))
                return 0;

        ret = __perf_event_account_interrupt(event, throttle);

        /* A BPF program returning 0 suppresses all further handling. */
        if (event->prog && !bpf_overflow_handler(event, data, regs))
                return ret;

        /*
         * XXX event_limit might not quite work as expected on inherited
         * events
         */

        event->pending_kill = POLL_IN;
        if (events && atomic_dec_and_test(&event->event_limit)) {
                ret = 1;
                event->pending_kill = POLL_HUP;
                perf_event_disable_inatomic(event);
        }

        if (event->attr.sigtrap) {
                /*
                 * The desired behaviour of sigtrap vs invalid samples is a bit
                 * tricky; on the one hand, one should not lose the SIGTRAP if
                 * it is the first event, on the other hand, we should also not
                 * trigger the WARN or override the data address.
                 */
                bool valid_sample = sample_is_allowed(event, regs);
                unsigned int pending_id = 1;

                /* "?: 1" keeps pending_id non-zero so it can double as a flag. */
                if (regs)
                        pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
                if (!event->pending_sigtrap) {
                        event->pending_sigtrap = pending_id;
                        local_inc(&event->ctx->nr_pending);
                } else if (event->attr.exclude_kernel && valid_sample) {
                        /*
                         * Should not be able to return to user space without
                         * consuming pending_sigtrap; with exceptions:
                         *
                         *  1. Where !exclude_kernel, events can overflow again
                         *     in the kernel without returning to user space.
                         *
                         *  2. Events that can overflow again before the IRQ-
                         *     work without user space progress (e.g. hrtimer).
                         *     To approximate progress (with false negatives),
                         *     check 32-bit hash of the current IP.
                         */
                        WARN_ON_ONCE(event->pending_sigtrap != pending_id);
                }

                event->pending_addr = 0;
                if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
                        event->pending_addr = data->addr;
                irq_work_queue(&event->pending_irq);
        }

        READ_ONCE(event->overflow_handler)(event, data, regs);

        /* Defer the fasync wakeup to the event's pending irq_work. */
        if (*perf_event_fasync(event) && event->pending_kill) {
                event->pending_wakeup = 1;
                irq_work_queue(&event->pending_irq);
        }

        return ret;
}

/*
 * Handle an overflow of @event with throttling enabled.
 * Returns the result of __perf_event_overflow().
 */
int perf_event_overflow(struct perf_event *event,
                        struct perf_sample_data *data,
                        struct pt_regs *regs)
{
        return __perf_event_overflow(event, 1, data, regs);
}

/*
 * Generic software event infrastructure
 */

/* Per-CPU bookkeeping for the software-event hash list. */
struct swevent_htable {
        /* RCU-managed hash list; NULL while not allocated. */
        struct swevent_hlist                *swevent_hlist;
        /* Held around hlist_refcount updates and hlist allocation/release. */
        struct mutex                        hlist_mutex;
        /* The hlist is released when this drops to zero. */
        int                                hlist_refcount;

        /* Recursion avoidance in each context */
        int                                recursion[PERF_NR_CONTEXTS];
};

/* One swevent_htable instance per CPU. */
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);

/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. That second value
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */

/*
 * Rewind hwc->period_left by a whole number of periods and make
 * sample_period the new last_period.
 *
 * Returns the number of periods subtracted, or 0 when period_left was
 * already negative and nothing was changed.
 */
u64 perf_swevent_set_period(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        /* The arithmetic below uses the period in effect before the update. */
        u64 period = hwc->last_period;
        u64 nr, offset;
        s64 old, val;

        hwc->last_period = hwc->sample_period;

        old = local64_read(&hwc->period_left);
        do {
                val = old;
                if (val < 0)
                        return 0;

                /* nr periods fit into the non-negative remainder. */
                nr = div64_u64(period + val, period);
                offset = nr * period;
                val -= offset;
                /* Retry with the refreshed "old" if period_left changed meanwhile. */
        } while (!local64_try_cmpxchg(&hwc->period_left, &old, val));

        return nr;
}

/*
 * Deliver @overflow overflows of a software event, stopping early if one
 * of them asks to stop.
 *
 * An @overflow of 0 means "compute it": the count then comes from
 * perf_swevent_set_period(). The first delivery is made with throttling
 * off, every later one with throttling on.
 */
static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
{
        int throttle = 0;

        if (!overflow)
                overflow = perf_swevent_set_period(event);

        if (event->hw.interrupts == MAX_INTERRUPTS)
                return;

        while (overflow) {
                /*
                 * We inhibit the overflow from happening when
                 * hwc->interrupts == MAX_INTERRUPTS.
                 */
                if (__perf_event_overflow(event, throttle, data, regs))
                        break;

                throttle = 1;
                overflow--;
        }
}

/*
 * Add @nr to @event's count and, for sampling events with @regs, decide
 * whether that triggers an overflow.
 */
static void perf_swevent_event(struct perf_event *event, u64 nr,
                               struct perf_sample_data *data,
                               struct pt_regs *regs)
{
        struct hw_perf_event *hwc = &event->hw;

        local64_add(nr, &event->count);

        if (!regs || !is_sampling_event(event))
                return;

        /* Non-freq events sampling the period: one overflow with period @nr. */
        if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
                data->period = nr;
                perf_swevent_overflow(event, 1, data, regs);
                return;
        }

        data->period = event->hw.last_period;

        /* Period of one and a single increment: skip the period_left bookkeeping. */
        if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) {
                perf_swevent_overflow(event, 1, data, regs);
                return;
        }

        /* period_left turning non-negative signals that a period elapsed. */
        if (!local64_add_negative(nr, &hwc->period_left))
                perf_swevent_overflow(event, 0, data, regs);
}

/*
 * Return 1 when @event must not count this occurrence: it is stopped, or
 * @regs is given and its mode is excluded by exclude_user/exclude_kernel.
 * Return 0 otherwise.
 */
static int perf_exclude_event(struct perf_event *event,
                              struct pt_regs *regs)
{
        if (event->hw.state & PERF_HES_STOPPED)
                return 1;

        /* Without registers there is no mode to filter on. */
        if (!regs)
                return 0;

        if (event->attr.exclude_user && user_mode(regs))
                return 1;

        if (event->attr.exclude_kernel && !user_mode(regs))
                return 1;

        return 0;
}

/*
 * Return 1 when @event has the given @type and @event_id and is not
 * excluded by perf_exclude_event() for @regs; 0 otherwise.
 */
static int perf_swevent_match(struct perf_event *event,
                                enum perf_type_id type,
                                u32 event_id,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        return event->attr.type == type &&
               event->attr.config == event_id &&
               !perf_exclude_event(event, regs);
}

/*
 * Hash a (type, event_id) pair to SWEVENT_HLIST_BITS bits: @type goes in
 * the upper 32 bits of the key and @event_id in the lower 32.
 */
static inline u64 swevent_hash(u64 type, u32 event_id)
{
        return hash_64((type << 32) | event_id, SWEVENT_HLIST_BITS);
}

/* Return the bucket of @hlist that (@type, @event_id) hashes to. */
static inline struct hlist_head *
__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
{
        return &hlist->heads[swevent_hash(type, event_id)];
}

/*
 * For the read side: events when they trigger.
 *
 * Returns the bucket for (@type, @event_id), or NULL when no hlist is
 * currently installed in @swhash.
 */
static inline struct hlist_head *
find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
{
        struct swevent_hlist *hlist = rcu_dereference(swhash->swevent_hlist);

        return hlist ? __find_swevent_head(hlist, type, event_id) : NULL;
}

/* For the event head insertion and removal in the hlist */
static inline struct hlist_head *
find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
{
        struct swevent_hlist *hlist;
        u32 event_id = event->attr.config;
        u64 type = event->attr.type;

        /*
         * Event scheduling is always serialized against hlist allocation
         * and release. Which makes the protected version suitable here.
         * The context lock guarantees that.
         */
        hlist = rcu_dereference_protected(swhash->swevent_hlist,
                                          lockdep_is_held(&event->ctx->lock));
        if (!hlist)
                return NULL;

        return __find_swevent_head(hlist, type, event_id);
}

/*
 * Deliver @nr occurrences of software event (@type, @event_id) to every
 * matching event hashed on the current CPU. The lookup and the walk run
 * inside an RCU read-side section.
 */
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
                                    u64 nr,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
{
        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
        struct hlist_head *bucket;
        struct perf_event *ev;

        rcu_read_lock();

        bucket = find_swevent_head_rcu(swhash, type, event_id);
        if (bucket) {
                hlist_for_each_entry_rcu(ev, bucket, hlist_entry) {
                        if (!perf_swevent_match(ev, type, event_id, data, regs))
                                continue;

                        perf_swevent_event(ev, nr, data, regs);
                }
        }

        rcu_read_unlock();
}

/*
 * Per-CPU array of four pt_regs slots.
 * NOTE(review): presumably one slot per recursion context - confirm against users.
 */
DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);

/*
 * Claim a recursion slot in this CPU's swevent_htable.
 *
 * Returns whatever get_recursion_context() returns; __perf_sw_event()
 * treats a negative value as "already in use, skip the event".
 */
int perf_swevent_get_recursion_context(void)
{
        return get_recursion_context(this_cpu_ptr(&swevent_htable)->recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);

/*
 * Release recursion slot @rctx, as obtained from
 * perf_swevent_get_recursion_context(), in this CPU's swevent_htable.
 */
void perf_swevent_put_recursion_context(int rctx)
{
        put_recursion_context(this_cpu_ptr(&swevent_htable)->recursion, rctx);
}

/*
 * Deliver a PERF_TYPE_SOFTWARE event without touching the recursion
 * context. Sample data is initialised with @addr and a period of 0.
 *
 * A NULL @regs triggers a one-time warning and the event is dropped.
 */
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        struct perf_sample_data data;

        if (!WARN_ON_ONCE(!regs)) {
                perf_sample_data_init(&data, addr, 0);
                do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
        }
}

/*
 * Deliver a software event with preemption disabled and a recursion slot
 * held. The event is silently dropped when no slot can be obtained.
 */
void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        int rctx;

        preempt_disable_notrace();

        rctx = perf_swevent_get_recursion_context();
        if (!unlikely(rctx < 0)) {
                ___perf_sw_event(event_id, nr, regs, addr);
                perf_swevent_put_recursion_context(rctx);
        }

        preempt_enable_notrace();
}

/* Intentionally empty: perf_swevent_event() adds to event->count directly. */
static void perf_swevent_read(struct perf_event *event)
{
}

/*
 * Hash @event into this CPU's software-event hlist.
 *
 * The event starts out stopped unless PERF_EF_START is set in @flags.
 * Returns 0 on success, or -EINVAL (with a one-time warning) when no
 * hlist is installed on this CPU.
 */
static int perf_swevent_add(struct perf_event *event, int flags)
{
        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
        struct hlist_head *bucket;

        if (is_sampling_event(event)) {
                event->hw.last_period = event->hw.sample_period;
                perf_swevent_set_period(event);
        }

        /* 0 when starting right away, 1 otherwise. */
        event->hw.state = !(flags & PERF_EF_START);

        bucket = find_swevent_head(swhash, event);
        if (WARN_ON_ONCE(!bucket))
                return -EINVAL;

        hlist_add_head_rcu(&event->hlist_entry, bucket);
        perf_event_update_userpage(event);

        return 0;
}

/* Unhash @event from the software-event hlist; @flags is unused. */
static void perf_swevent_del(struct perf_event *event, int flags)
{
        hlist_del_rcu(&event->hlist_entry);
}

/* Clear hw.state so perf_exclude_event() no longer sees PERF_HES_STOPPED; @flags is unused. */
static void perf_swevent_start(struct perf_event *event, int flags)
{
        event->hw.state = 0;
}

/* Mark @event PERF_HES_STOPPED so perf_exclude_event() filters it out; @flags is unused. */
static void perf_swevent_stop(struct perf_event *event, int flags)
{
        event->hw.state = PERF_HES_STOPPED;
}

/*
 * Deref the hlist from the update side.
 *
 * The lockdep expression documents that callers hold swhash->hlist_mutex.
 */
static inline struct swevent_hlist *
swevent_hlist_deref(struct swevent_htable *swhash)
{
        return rcu_dereference_protected(swhash->swevent_hlist,
                                         lockdep_is_held(&swhash->hlist_mutex));
}

/*
 * Detach the hash list from @swhash, if there is one, and free it after an
 * RCU grace period. The only caller in this file, swevent_hlist_put_cpu(),
 * invokes this with @swhash->hlist_mutex held.
 */
static void swevent_hlist_release(struct swevent_htable *swhash)
{
        struct swevent_hlist *hlist = swevent_hlist_deref(swhash);

        if (hlist) {
                /* Unpublish first, then defer the free past current readers. */
                RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
                kfree_rcu(hlist, rcu_head);
        }
}

/*
 * Drop one reference on @cpu's software event hash list and release the
 * list once the reference count reaches zero.
 */
static void swevent_hlist_put_cpu(int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

        mutex_lock(&swhash->hlist_mutex);

        swhash->hlist_refcount--;
        if (swhash->hlist_refcount == 0)
                swevent_hlist_release(swhash);

        mutex_unlock(&swhash->hlist_mutex);
}

/*
 * Drop one hash list reference on every possible CPU. This is the
 * counterpart of a successful swevent_hlist_get().
 */
static void swevent_hlist_put(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                swevent_hlist_put_cpu(cpu);
}

/*
 * Take one reference on @cpu's software event hash list.
 *
 * The list itself is allocated lazily: only when none is installed yet and
 * @cpu is set in perf_online_mask. The reference count is bumped in every
 * non-failing case, including when no list was allocated.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails, in which case
 * the reference count is left untouched.
 */
static int swevent_hlist_get_cpu(int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
        struct swevent_hlist *new_hlist;
        int ret = 0;

        mutex_lock(&swhash->hlist_mutex);

        if (!swevent_hlist_deref(swhash) &&
            cpumask_test_cpu(cpu, perf_online_mask)) {
                new_hlist = kzalloc(sizeof(*new_hlist), GFP_KERNEL);
                if (new_hlist)
                        rcu_assign_pointer(swhash->swevent_hlist, new_hlist);
                else
                        ret = -ENOMEM;
        }

        if (!ret)
                swhash->hlist_refcount++;

        mutex_unlock(&swhash->hlist_mutex);

        return ret;
}

/*
 * Take one hash list reference on every possible CPU, under pmus_lock.
 *
 * If any CPU fails, the references already taken on the CPUs that precede
 * it in iteration order are dropped again and the error is returned.
 *
 * Returns 0 on success or the error from swevent_hlist_get_cpu().
 */
static int swevent_hlist_get(void)
{
        int err = 0, cpu, failed_cpu = -1;

        mutex_lock(&pmus_lock);

        for_each_possible_cpu(cpu) {
                err = swevent_hlist_get_cpu(cpu);
                if (err) {
                        failed_cpu = cpu;
                        break;
                }
        }

        if (err) {
                /* Roll back every CPU visited before the failing one. */
                for_each_possible_cpu(cpu) {
                        if (cpu == failed_cpu)
                                break;
                        swevent_hlist_put_cpu(cpu);
                }
        }

        mutex_unlock(&pmus_lock);

        return err;
}

/*
 * One static key per software event id. perf_swevent_init() increments the
 * key of the event being created and sw_perf_event_destroy() decrements it.
 */
struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

/*
 * ->destroy() hook installed by perf_swevent_init(): undo its static key
 * increment and hash list references. It is only installed on events
 * without a parent, hence the warning if @event has one.
 */
static void sw_perf_event_destroy(struct perf_event *event)
{
        u64 id = event->attr.config;

        WARN_ON(event->parent);

        static_key_slow_dec(&perf_swevent_enabled[id]);

        swevent_hlist_put();
}

/*
 * Forward declarations of the two clock PMUs; perf_swevent_init() below
 * needs their ->type before their definitions appear.
 */
static struct pmu perf_cpu_clock; /* fwd declaration */
static struct pmu perf_task_clock;

/*
 * ->event_init() callback of the software PMU.
 *
 * Returns:
 *   -ENOENT      @event is not PERF_TYPE_SOFTWARE, its config is out of
 *                range, or it is one of the two clock events (see below)
 *   -EOPNOTSUPP  branch stack sampling was requested
 *   other < 0    error from swevent_hlist_get()
 *   0            success
 *
 * Only events without a parent take hash list references, bump the static
 * key and get a ->destroy() hook.
 */
static int perf_swevent_init(struct perf_event *event)
{
        u64 event_id = event->attr.config;
        int err;

        if (event->attr.type != PERF_TYPE_SOFTWARE)
                return -ENOENT;

        /* Software events cannot do branch sampling. */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        /*
         * The clock events are not handled here: attr.type is rewritten to
         * the matching clock PMU's type and -ENOENT is returned.
         * NOTE(review): presumably the caller then retries the lookup with
         * the new type -- confirm against the PMU lookup code.
         */
        if (event_id == PERF_COUNT_SW_CPU_CLOCK) {
                event->attr.type = perf_cpu_clock.type;
                return -ENOENT;
        }
        if (event_id == PERF_COUNT_SW_TASK_CLOCK) {
                event->attr.type = perf_task_clock.type;
                return -ENOENT;
        }

        if (event_id >= PERF_COUNT_SW_MAX)
                return -ENOENT;

        if (event->parent)
                return 0;

        err = swevent_hlist_get();
        if (err)
                return err;

        static_key_slow_inc(&perf_swevent_enabled[event_id]);
        event->destroy = sw_perf_event_destroy;

        return 0;
}

/*
 * PMU descriptor for PERF_TYPE_SOFTWARE events (perf_swevent_init() rejects
 * every other attr.type with -ENOENT).
 */
static struct pmu perf_swevent = {
        .task_ctx_nr        = perf_sw_context,

        .capabilities        = PERF_PMU_CAP_NO_NMI,

        .event_init        = perf_swevent_init,
        .add                = perf_swevent_add,
        .del                = perf_swevent_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
};

#ifdef CONFIG_EVENT_TRACING

/*
 * ->destroy() hook installed by perf_tp_event_init(); forwards to
 * perf_trace_destroy().
 */
static void tp_perf_event_destroy(struct perf_event *event)
{
        perf_trace_destroy(event);
}

/*
 * ->event_init() callback of the tracepoint PMU.
 *
 * Returns -ENOENT if @event is not PERF_TYPE_TRACEPOINT, -EOPNOTSUPP if
 * branch stack sampling was requested, the error from perf_trace_init() if
 * that fails, and 0 on success. The ->destroy() hook is installed only on
 * success.
 */
static int perf_tp_event_init(struct perf_event *event)
{
        int err;

        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                return -ENOENT;

        /* Tracepoint events cannot do branch sampling. */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        err = perf_trace_init(event);
        if (!err)
                event->destroy = tp_perf_event_destroy;

        return err;
}

/*
 * PMU descriptor for PERF_TYPE_TRACEPOINT events. add/del go through the
 * perf_trace_*() helpers; start/stop/read are shared with the software PMU.
 */
static struct pmu perf_tracepoint = {
        .task_ctx_nr        = perf_sw_context,

        .event_init        = perf_tp_event_init,
        .add                = perf_trace_add,
        .del                = perf_trace_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
};

/*
 * Check the raw tracepoint record carried in @data against the filter of
 * @event. Filters live on the top level event only, so an inherited event
 * is checked against its parent's filter.
 *
 * Returns 1 if there is no filter or the record passes it, 0 otherwise.
 */
static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_sample_data *data)
{
        void *record = data->raw->frag.data;
        struct perf_event *owner = event->parent ? event->parent : event;

        if (likely(!owner->filter))
                return 1;

        return filter_match_preds(owner->filter, record) ? 1 : 0;
}

/*
 * Decide whether a tracepoint hit should be delivered to @event.
 *
 * Returns 0 if the event is stopped, if it excludes the kernel and @regs is
 * not user mode, or if the record fails the event's filter; 1 otherwise.
 */
static int perf_tp_event_match(struct perf_event *event,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        if (event->hw.state & PERF_HES_STOPPED)
                return 0;

        /*
         * With exclude_kernel set, only user-space tracepoints (uprobes)
         * are traced.
         */
        if (event->attr.exclude_kernel && !user_mode(regs))
                return 0;

        return perf_tp_filter_match(event, data) ? 1 : 0;
}

/*
 * Run the BPF programs attached to @call, if any, and then submit the raw
 * record to perf.
 *
 * When programs are attached, @regs is stored in the first pointer-sized
 * slot of @raw_data before they run. If trace_call_bpf() returns zero, or
 * @head is empty, the record is not submitted and the recursion context
 * @rctx is released here. Otherwise perf_tp_event() takes over, including
 * the release of @rctx.
 */
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
                               struct trace_event_call *call, u64 count,
                               struct pt_regs *regs, struct hlist_head *head,
                               struct task_struct *task)
{
        int submit = 1;

        if (bpf_prog_array_valid(call)) {
                *(struct pt_regs **)raw_data = regs;

                if (!trace_call_bpf(call, raw_data))
                        submit = 0;
                else if (hlist_empty(head))
                        submit = 0;
        }

        if (!submit) {
                perf_swevent_put_recursion_context(rctx);
                return;
        }

        perf_tp_event(call->event.type, count, raw_data, size, regs, head,
                      rctx, task);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);

/*
 * Deliver one tracepoint hit to @event, which belongs to a target task
 * other than current. @record starts with a struct trace_entry whose type
 * must equal the event's config.
 *
 * Events with attr.sigtrap set are skipped, because a synchronous signal
 * cannot be delivered to another task.
 */
static void __perf_tp_event_target_task(u64 count, void *record,
                                        struct pt_regs *regs,
                                        struct perf_sample_data *data,
                                        struct perf_event *event)
{
        struct trace_entry *entry = record;

        if (event->attr.config == entry->type &&
            !event->attr.sigtrap &&
            perf_tp_event_match(event, data, regs))
                perf_swevent_event(event, count, data, regs);
}

/*
 * Offer a tracepoint hit to every tracepoint-PMU event of @ctx that is
 * selected for the current CPU: first the pinned groups, then the flexible
 * ones; within a group the leader first, then each sibling.
 *
 * The only caller in this file, perf_tp_event(), holds ctx->lock.
 */
static void perf_tp_event_target_task(u64 count, void *record,
                                      struct pt_regs *regs,
                                      struct perf_sample_data *data,
                                      struct perf_event_context *ctx)
{
        struct perf_event *leader, *member;
        struct pmu *pmu = &perf_tracepoint;
        unsigned int cpu = smp_processor_id();

        perf_event_groups_for_cpu_pmu(leader, &ctx->pinned_groups, cpu, pmu) {
                __perf_tp_event_target_task(count, record, regs, data, leader);
                for_each_sibling_event(member, leader) {
                        __perf_tp_event_target_task(count, record, regs, data,
                                                    member);
                }
        }

        perf_event_groups_for_cpu_pmu(leader, &ctx->flexible_groups, cpu, pmu) {
                __perf_tp_event_target_task(count, record, regs, data, leader);
                for_each_sibling_event(member, leader) {
                        __perf_tp_event_target_task(count, record, regs, data,
                                                    member);
                }
        }
}

/*
 * Deliver a tracepoint record to perf.
 *
 * @event_type: trace event type, written into the record header through
 *              perf_trace_buf_update()
 * @count:      value passed on to perf_swevent_event()
 * @record:     raw trace record of @entry_size bytes
 * @regs:       register state passed along with the sample
 * @head:       list of events attached to this tracepoint
 * @rctx:       recursion context; always released before returning
 * @task:       optional target task; when set and not current, that task's
 *              perf context is offered the record as well
 */
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
                   struct pt_regs *regs, struct hlist_head *head, int rctx,
                   struct task_struct *task)
{
        struct perf_raw_record raw = {
                .frag = {
                        .data = record,
                        .size = entry_size,
                },
        };
        struct perf_sample_data data;
        struct perf_event *event;

        perf_sample_data_init(&data, 0, 0);
        perf_sample_save_raw_data(&data, &raw);

        perf_trace_buf_update(record, event_type);

        hlist_for_each_entry_rcu(event, head, hlist_entry) {
                if (!perf_tp_event_match(event, &data, regs))
                        continue;

                perf_swevent_event(event, count, &data, regs);

                /*
                 * The same on-stack perf_sample_data is reused for every
                 * event, but some of its members are event-specific and
                 * must be recomputed for each swevent. Re-initialize it so
                 * that the next event does not skip preparing data because
                 * data->sample_flags is still set.
                 */
                perf_sample_data_init(&data, 0, 0);
                perf_sample_save_raw_data(&data, &raw);
        }

        /*
         * If a target task other than current was specified, iterate its
         * context as well and deliver the event there too.
         */
        if (task && task != current) {
                struct perf_event_context *ctx;

                rcu_read_lock();
                ctx = rcu_dereference(task->perf_event_ctxp);
                if (ctx) {
                        raw_spin_lock(&ctx->lock);
                        perf_tp_event_target_task(count, record, regs, &data,
                                                  ctx);
                        raw_spin_unlock(&ctx->lock);
                }
                rcu_read_unlock();
        }

        perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);

#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
 * Flags in config, used by the dynamic kprobe and uprobe PMUs.
 * The flags must match the PMU_FORMAT_ATTR() definitions that follow.
 *
 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
 *                               if not set, create kprobe/uprobe
 *
 * The following values specify a reference counter (or semaphore in the
 * terminology of tools like dtrace, systemtap, etc.) for Userspace
 * Statically Defined Tracepoints (USDT). Currently, we use 32 bits for the
 * offset.
 *
 * PERF_UPROBE_REF_CTR_OFFSET_BITS        # of bits in config used for the offset
 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT        # of bits the offset is shifted left
 *                                        within config
 */
enum perf_probe_config {
        PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
        /* Width of the reference counter offset field in config. */
        PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
        /* The offset occupies the top OFFSET_BITS bits of the 64-bit config. */
        PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
};

/* Bit 0 of config; corresponds to PERF_PROBE_CONFIG_IS_RETPROBE. */
PMU_FORMAT_ATTR(retprobe, "config:0");
#endif

#ifdef CONFIG_KPROBE_EVENTS
/* Format attributes of the kprobe PMU: only the retprobe flag. */
static struct attribute *kprobe_attrs[] = {
        &format_attr_retprobe.attr,
        NULL,
};

/* The "format" attribute group of the kprobe PMU. */
static struct attribute_group kprobe_format_group = {
        .name = "format",
        .attrs = kprobe_attrs,
};

/* NULL-terminated group list referenced by perf_kprobe.attr_groups. */
static const struct attribute_group *kprobe_attr_groups[] = {
        &kprobe_format_group,
        NULL,
};

/* Declared ahead of its definition so perf_kprobe can reference it. */
static int perf_kprobe_event_init(struct perf_event *event);
/*
 * PMU descriptor for kprobe events; perf_tp_register() registers it under
 * the name "kprobe". add/del go through the perf_trace_*() helpers and
 * start/stop/read are shared with the software PMU.
 */
static struct pmu perf_kprobe = {
        .task_ctx_nr        = perf_sw_context,
        .event_init        = perf_kprobe_event_init,
        .add                = perf_trace_add,
        .del                = perf_trace_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
        .attr_groups        = kprobe_attr_groups,
};

/*
 * ->event_init() callback of the kprobe PMU.
 *
 * Returns -ENOENT if @event does not belong to this PMU, -EACCES if the
 * caller is not perfmon_capable(), -EOPNOTSUPP if branch stack sampling
 * was requested, the error from perf_kprobe_init() if that fails, and 0 on
 * success. The ->destroy() hook is installed only on success.
 */
static int perf_kprobe_event_init(struct perf_event *event)
{
        bool is_retprobe;
        int err;

        if (event->attr.type != perf_kprobe.type)
                return -ENOENT;

        if (!perfmon_capable())
                return -EACCES;

        /* Probe events cannot do branch sampling. */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;

        err = perf_kprobe_init(event, is_retprobe);
        if (!err)
                event->destroy = perf_kprobe_destroy;

        return err;
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
/*
 * Bits 32-63 of config carry the reference counter offset; this matches
 * PERF_UPROBE_REF_CTR_OFFSET_BITS / PERF_UPROBE_REF_CTR_OFFSET_SHIFT.
 */
PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");

/* Format attributes of the uprobe PMU: retprobe flag and ref_ctr_offset. */
static struct attribute *uprobe_attrs[] = {
        &format_attr_retprobe.attr,
        &format_attr_ref_ctr_offset.attr,
        NULL,
};

/* The "format" attribute group of the uprobe PMU. */
static struct attribute_group uprobe_format_group = {
        .name = "format",
        .attrs = uprobe_attrs,
};

/* NULL-terminated group list referenced by perf_uprobe.attr_groups. */
static const struct attribute_group *uprobe_attr_groups[] = {
        &uprobe_format_group,
        NULL,
};

/* Declared ahead of its definition so perf_uprobe can reference it. */
static int perf_uprobe_event_init(struct perf_event *event);
/*
 * PMU descriptor for uprobe events; perf_tp_register() registers it under
 * the name "uprobe". add/del go through the perf_trace_*() helpers and
 * start/stop/read are shared with the software PMU.
 */
static struct pmu perf_uprobe = {
        .task_ctx_nr        = perf_sw_context,
        .event_init        = perf_uprobe_event_init,
        .add                = perf_trace_add,
        .del                = perf_trace_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
        .attr_groups        = uprobe_attr_groups,
};

/*
 * ->event_init() callback of the uprobe PMU.
 *
 * The retprobe flag is bit 0 of attr.config and the reference counter
 * offset is taken from the bits above PERF_UPROBE_REF_CTR_OFFSET_SHIFT.
 *
 * Returns -ENOENT if @event does not belong to this PMU, -EACCES if the
 * caller is not perfmon_capable(), -EOPNOTSUPP if branch stack sampling
 * was requested, the error from perf_uprobe_init() if that fails, and 0 on
 * success. The ->destroy() hook is installed only on success.
 */
static int perf_uprobe_event_init(struct perf_event *event)
{
        unsigned long ref_ctr_offset;
        bool is_retprobe;
        int err;

        if (event->attr.type != perf_uprobe.type)
                return -ENOENT;

        if (!perfmon_capable())
                return -EACCES;

        /* Probe events cannot do branch sampling. */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
        ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;

        err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
        if (!err)
                event->destroy = perf_uprobe_destroy;

        return err;
}
#endif /* CONFIG_UPROBE_EVENTS */

/*
 * Register the tracepoint PMU and, when configured in, the kprobe and
 * uprobe PMUs. The tracepoint PMU gets the fixed type PERF_TYPE_TRACEPOINT;
 * the probe PMUs pass -1 as type. The return values of perf_pmu_register()
 * are not checked here.
 */
static inline void perf_tp_register(void)
{
        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
#ifdef CONFIG_KPROBE_EVENTS
        perf_pmu_register(&perf_kprobe, "kprobe", -1);
#endif
#ifdef CONFIG_UPROBE_EVENTS
        perf_pmu_register(&perf_uprobe, "uprobe", -1);
#endif
}

/*
 * Free the tracing filter attached to @event by forwarding to
 * ftrace_profile_free_filter(). (CONFIG_EVENT_TRACING variant.)
 */
static void perf_event_free_filter(struct perf_event *event)
{
        ftrace_profile_free_filter(event);
}

/*
 * Returns true if the event is a tracepoint, or a kprobe/uprobe created
 * with perf_event_open(), i.e. if it belongs to one of the tracing PMUs
 * defined in this file.
 */
static inline bool perf_event_is_tracing(struct perf_event *event)
{
        bool tracing = event->pmu == &perf_tracepoint;

#ifdef CONFIG_KPROBE_EVENTS
        tracing = tracing || event->pmu == &perf_kprobe;
#endif
#ifdef CONFIG_UPROBE_EVENTS
        tracing = tracing || event->pmu == &perf_uprobe;
#endif

        return tracing;
}

/*
 * Attach BPF program @prog (with @bpf_cookie) to @event.
 *
 * Non-tracing events are handed to perf_event_set_bpf_handler(). For
 * tracing events the program type has to fit the kind of trace event:
 *
 *   kprobe / uprobe            -> BPF_PROG_TYPE_KPROBE
 *   tracepoint / syscall event -> BPF_PROG_TYPE_TRACEPOINT
 *
 * Returns -EINVAL for an unsupported combination, -EACCES if the program
 * accesses context beyond what the trace event provides, or otherwise the
 * result of perf_event_attach_bpf_prog().
 */
int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
                            u64 bpf_cookie)
{
        bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;

        if (!perf_event_is_tracing(event))
                return perf_event_set_bpf_handler(event, prog, bpf_cookie);

        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
        is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
        is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
        is_syscall_tp = is_syscall_trace_event(event->tp_event);

        /* bpf programs can only be attached to u/kprobe or tracepoint */
        if (!(is_kprobe || is_uprobe || is_tracepoint || is_syscall_tp))
                return -EINVAL;

        /* Probes need a KPROBE program ... */
        if ((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE)
                return -EINVAL;

        /* ... tracepoints and syscall events a TRACEPOINT program. */
        if ((is_tracepoint || is_syscall_tp) &&
            prog->type != BPF_PROG_TYPE_TRACEPOINT)
                return -EINVAL;

        /* only uprobe programs are allowed to be sleepable */
        if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe)
                return -EINVAL;

        /* Kprobe override only works for kprobes, not uprobes. */
        if (prog->kprobe_override && !is_kprobe)
                return -EINVAL;

        if (is_tracepoint || is_syscall_tp) {
                int off = trace_event_get_offsets(event->tp_event);

                /* The program must not read past the event's fields. */
                if (prog->aux->max_ctx_offset > off)
                        return -EACCES;
        }

        return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
}

/*
 * Undo perf_event_set_bpf_prog(): tracing events detach their program,
 * all other events free their BPF handler.
 */
void perf_event_free_bpf_prog(struct perf_event *event)
{
        if (perf_event_is_tracing(event))
                perf_event_detach_bpf_prog(event);
        else
                perf_event_free_bpf_handler(event);
}

#else

/* !CONFIG_EVENT_TRACING: there are no tracing PMUs to register. */
static inline void perf_tp_register(void)
{
}

/* !CONFIG_EVENT_TRACING: events never carry a tracing filter. */
static void perf_event_free_filter(struct perf_event *event)
{
}

/* !CONFIG_EVENT_TRACING: attaching a BPF program always fails with -ENOENT. */
int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
                            u64 bpf_cookie)
{
        return -ENOENT;
}

/* !CONFIG_EVENT_TRACING: nothing can have been attached, nothing to free. */
void perf_event_free_bpf_prog(struct perf_event *event)
{
}
#endif /* CONFIG_EVENT_TRACING */

#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
 * Report a hit of hardware breakpoint event @bp. @data is the register
 * state (struct pt_regs *) at the time of the hit. The sample address is
 * the breakpoint address. Nothing is reported when the event's hw.state is
 * non-zero or when perf_exclude_event() rejects it for @data.
 */
void perf_bp_event(struct perf_event *bp, void *data)
{
        struct pt_regs *regs = data;
        struct perf_sample_data sample;

        perf_sample_data_init(&sample, bp->attr.bp_addr, 0);

        if (bp->hw.state)
                return;

        if (perf_exclude_event(bp, regs))
                return;

        perf_swevent_event(bp, 1, &sample, regs);
}
#endif

/*
 * Allocate a zeroed address filter and append it to @filters.
 *
 * The memory comes from the NUMA node of @event's CPU; for events not
 * bound to a CPU (cpu == -1) the node of CPU 0 is used.
 *
 * Returns the new filter, or NULL if the allocation failed.
 */
static struct perf_addr_filter *
perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
{
        struct perf_addr_filter *filter;
        int cpu = event->cpu;
        int node;

        if (cpu == -1)
                cpu = 0;
        node = cpu_to_node(cpu);

        filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
        if (filter) {
                INIT_LIST_HEAD(&filter->entry);
                list_add_tail(&filter->entry, filters);
        }

        return filter;
}

/*
 * Free every address filter on @filters: drop its path reference, unlink
 * it and release its memory. The list head itself is left in place.
 */
static void free_filters_list(struct list_head *filters)
{
        struct perf_addr_filter *cur, *next;

        /* The _safe iterator is required: entries are removed as we go. */
        list_for_each_entry_safe(cur, next, filters, entry) {
                path_put(&cur->path);
                list_del(&cur->entry);
                kfree(cur);
        }
}

/*
 * Free existing address filters and optionally install new ones
 */
static void perf_addr_filters_splice(struct perf_event *event,
                                     struct list_head *head)
{
        unsigned long flags;
        LIST_HEAD(list);

        if (!has_addr_filter(event))
                return;

        /* don't bother with children, they don't have their own filters */
        if (event->parent)
                return;

        raw_spin_lock_irqsave(&event->addr_filters.lock, flags);

        list_splice_init(&event->addr_filters.list, &list);
        if (head)
                list_splice(head, &event->addr_filters.list);

        raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);

        free_filters_list(&list);
}

/*
 * Scan through mm's vmas and see if one of them matches the
 * @filter; if so, adjust filter's address range in @fr.
 * Only file-backed vmas are considered, and the scan stops at the first
 * vma for which perf_addr_filter_vma_adjust() reports a match.
 * Called with mm::mmap_lock down for reading.
 */
static void perf_addr_filter_apply(struct perf_addr_filter *filter,
                                   struct mm_struct *mm,
                                   struct perf_addr_filter_range *fr)
{
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, 0);

        for_each_vma(vmi, vma) {
                if (vma->vm_file &&
                    perf_addr_filter_vma_adjust(filter, vma, fr))
                        break;
        }
}

/*
 * Update event's address range filters based on the
 * task's existing mappings, if any.
 *
 * Each filter on the event's filter list gets one slot, in list order, in
 * event->addr_filter_ranges[]: file-based filters are resolved against the
 * task's mappings, all others use the filter's offset and size directly.
 * Afterwards the filter generation is bumped and perf_event_stop() is
 * called with restart set.
 */
static void perf_event_addr_filters_apply(struct perf_event *event)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct task_struct *task = READ_ONCE(event->ctx->task);
        struct perf_addr_filter *filter;
        struct mm_struct *mm = NULL;
        unsigned int idx = 0;
        unsigned long flags;

        /*
         * We may observe TASK_TOMBSTONE, which means that the event tear-down
         * will stop on the parent's child_mutex that our caller is also holding
         */
        if (task == TASK_TOMBSTONE)
                return;

        if (ifh->nr_file_filters) {
                mm = get_task_mm(task);
                if (!mm) {
                        /* No mm to scan: leave the ranges alone, just restart. */
                        perf_event_stop(event, 1);
                        return;
                }

                mmap_read_lock(mm);
        }

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                if (!filter->path.dentry) {
                        event->addr_filter_ranges[idx].start = filter->offset;
                        event->addr_filter_ranges[idx].size  = filter->size;
                } else {
                        /*
                         * The filter is associated with a binary that needs
                         * to be mapped: start from an empty range and let
                         * the vma scan fill it in.
                         */
                        event->addr_filter_ranges[idx].start = 0;
                        event->addr_filter_ranges[idx].size = 0;

                        perf_addr_filter_apply(filter, mm,
                                               &event->addr_filter_ranges[idx]);
                }

                idx++;
        }

        event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        if (ifh->nr_file_filters) {
                mmap_read_unlock(mm);

                mmput(mm);
        }

        perf_event_stop(event, 1);
}

/*
 * Address range filtering: limiting the data to certain
 * instruction address ranges. Filters are ioctl()ed to us from
 * userspace as ascii strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of the
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address/region;
 * RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 *
 * if <size> is not specified or is zero, the range is treated as a single
 * address; not valid for ACTION=="filter".
 */
/*
 * Token ids returned by match_token() for if_tokens[]. The IF_ACT_* values
 * also index the actions[] table in perf_event_parse_addr_filter(), so
 * they must stay 0-based and contiguous.
 */
enum {
        IF_ACT_NONE = -1,
        IF_ACT_FILTER,
        IF_ACT_START,
        IF_ACT_STOP,
        IF_SRC_FILE,
        IF_SRC_KERNEL,
        IF_SRC_FILEADDR,
        IF_SRC_KERNELADDR,
};

/*
 * Parser states of perf_event_parse_addr_filter(): expecting an ACTION
 * word, expecting a RANGE_SPEC, and "one definition completely parsed".
 */
enum {
        IF_STATE_ACTION = 0,
        IF_STATE_SOURCE,
        IF_STATE_END,
};

/*
 * match_token() patterns for the address filter string: the three ACTION
 * keywords followed by the four RANGE_SPEC shapes (address with or without
 * size, with or without "@<file>"). The table ends with a NULL pattern.
 */
static const match_table_t if_tokens = {
        { IF_ACT_FILTER,        "filter" },
        { IF_ACT_START,                "start" },
        { IF_ACT_STOP,                "stop" },
        { IF_SRC_FILE,                "%u/%u@%s" },
        { IF_SRC_KERNEL,        "%u/%u" },
        { IF_SRC_FILEADDR,        "%u@%s" },
        { IF_SRC_KERNELADDR,        "%u" },
        { IF_ACT_NONE,                NULL },
};

/*
 * Address filter string parser
 *
 * Parses @fstr, a sequence of "ACTION RANGE_SPEC" definitions separated by
 * spaces, commas or newlines, and appends one perf_addr_filter per
 * definition to @filters. @fstr itself is not modified; a private copy is
 * tokenized.
 *
 * For every file-based filter, event->addr_filters.nr_file_filters is
 * incremented as a side effect.
 *
 * Returns 0 on success. On failure every entry on @filters is freed and a
 * negative errno is returned:
 *   -ENOMEM      an allocation failed
 *   -EINVAL      malformed string or self-contradicting definition
 *   -EOPNOTSUPP  file-based filter on an event that has no task context
 *   other        error from kstrtoul() or kern_path()
 */
static int
perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                             struct list_head *filters)
{
        struct perf_addr_filter *filter = NULL;
        char *start, *orig, *filename = NULL;
        substring_t args[MAX_OPT_ARGS];
        int state = IF_STATE_ACTION, token;
        unsigned int kernel = 0;
        int ret = -EINVAL;

        /* strsep() writes into the string, so work on a copy. */
        orig = fstr = kstrdup(fstr, GFP_KERNEL);
        if (!fstr)
                return -ENOMEM;

        while ((start = strsep(&fstr, " ,\n")) != NULL) {
                static const enum perf_addr_filter_action_t actions[] = {
                        [IF_ACT_FILTER]        = PERF_ADDR_FILTER_ACTION_FILTER,
                        [IF_ACT_START]        = PERF_ADDR_FILTER_ACTION_START,
                        [IF_ACT_STOP]        = PERF_ADDR_FILTER_ACTION_STOP,
                };
                ret = -EINVAL;

                /* Skip empty tokens produced by consecutive separators. */
                if (!*start)
                        continue;

                /* filter definition begins */
                if (state == IF_STATE_ACTION) {
                        filter = perf_addr_filter_new(event, filters);
                        if (!filter) {
                                /*
                                 * perf_addr_filter_new() only fails when its
                                 * allocation fails; report that as such
                                 * rather than as a malformed string.
                                 */
                                ret = -ENOMEM;
                                goto fail;
                        }
                }

                token = match_token(start, if_tokens, args);
                switch (token) {
                case IF_ACT_FILTER:
                case IF_ACT_START:
                case IF_ACT_STOP:
                        if (state != IF_STATE_ACTION)
                                goto fail;

                        filter->action = actions[token];
                        state = IF_STATE_SOURCE;
                        break;

                case IF_SRC_KERNELADDR:
                case IF_SRC_KERNEL:
                        kernel = 1;
                        fallthrough;

                case IF_SRC_FILEADDR:
                case IF_SRC_FILE:
                        if (state != IF_STATE_SOURCE)
                                goto fail;

                        /* NUL-terminate the matched substring in place. */
                        *args[0].to = 0;
                        ret = kstrtoul(args[0].from, 0, &filter->offset);
                        if (ret)
                                goto fail;

                        /* Only the "<start>/<size>" forms carry a size. */
                        if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
                                *args[1].to = 0;
                                ret = kstrtoul(args[1].from, 0, &filter->size);
                                if (ret)
                                        goto fail;
                        }

                        if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
                                /* The file name follows the numeric fields. */
                                int fpos = token == IF_SRC_FILE ? 2 : 1;

                                kfree(filename);
                                filename = match_strdup(&args[fpos]);
                                if (!filename) {
                                        ret = -ENOMEM;
                                        goto fail;
                                }
                        }

                        state = IF_STATE_END;
                        break;

                default:
                        goto fail;
                }

                /*
                 * Filter definition is fully parsed, validate and install it.
                 * Make sure that it doesn't contradict itself or the event's
                 * attribute.
                 */
                if (state == IF_STATE_END) {
                        ret = -EINVAL;

                        /*
                         * ACTION "filter" must have a non-zero length region
                         * specified.
                         */
                        if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
                            !filter->size)
                                goto fail;

                        if (!kernel) {
                                if (!filename)
                                        goto fail;

                                /*
                                 * For now, we only support file-based filters
                                 * in per-task events; doing so for CPU-wide
                                 * events requires additional context switching
                                 * trickery, since same object code will be
                                 * mapped at different virtual addresses in
                                 * different processes.
                                 */
                                ret = -EOPNOTSUPP;
                                if (!event->ctx->task)
                                        goto fail;

                                /* look up the path and grab its inode */
                                ret = kern_path(filename, LOOKUP_FOLLOW,
                                                &filter->path);
                                if (ret)
                                        goto fail;

                                /* Only regular files can be filter targets. */
                                ret = -EINVAL;
                                if (!filter->path.dentry ||
                                    !S_ISREG(d_inode(filter->path.dentry)
                                             ->i_mode))
                                        goto fail;

                                event->addr_filters.nr_file_filters++;
                        }

                        /* ready to consume more filters */
                        kfree(filename);
                        filename = NULL;
                        state = IF_STATE_ACTION;
                        filter = NULL;
                        kernel = 0;
                }
        }

        /* The string ended in the middle of a definition. */
        if (state != IF_STATE_ACTION)
                goto fail;

        kfree(filename);
        kfree(orig);

        return 0;

fail:
        kfree(filename);
        free_filters_list(filters);
        kfree(orig);

        return ret;
}

/*
 * Parse @filter_str, have the PMU validate the result, and install it as
 * @event's new set of address filters, replacing any existing ones. The
 * new filters are then applied to @event and its children.
 *
 * On any failure the previously installed filters stay in place, the
 * event's nr_file_filters counter is reset to 0 and the error is returned.
 *
 * Returns 0 on success, -EINVAL (with a one-time warning) for a child
 * event, or the error from parsing or validation.
 */
static int
perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
{
        LIST_HEAD(filters);
        int ret;

        /*
         * Since this is called in perf_ioctl() path, we're already holding
         * ctx::mutex.
         */
        lockdep_assert_held(&event->ctx->mutex);

        if (WARN_ON_ONCE(event->parent))
                return -EINVAL;

        ret = perf_event_parse_addr_filter(event, filter_str, &filters);
        if (ret) {
                /* The parser has already freed whatever it had built. */
                event->addr_filters.nr_file_filters = 0;
                return ret;
        }

        ret = event->pmu->addr_filters_validate(&filters);
        if (ret) {
                free_filters_list(&filters);
                event->addr_filters.nr_file_filters = 0;
                return ret;
        }

        /* remove existing filters, if any */
        perf_addr_filters_splice(event, &filters);

        /* install new filters */
        perf_event_for_each_child(event, perf_event_addr_filters_apply);

        return 0;
}

/*
 * Set a filter on @event from the user-space string at @arg (at most
 * PAGE_SIZE bytes are copied).
 *
 * Tracing events (tracepoint/kprobe/uprobe) get an ftrace filter; events
 * with address filter support get an address range filter. Any other
 * event fails with -EINVAL.
 *
 * Returns 0 on success or a negative errno, including the error from
 * strndup_user() if the string cannot be copied in.
 */
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
        int ret = -EINVAL;
        char *filter_str;

        filter_str = strndup_user(arg, PAGE_SIZE);
        if (IS_ERR(filter_str))
                return PTR_ERR(filter_str);

#ifdef CONFIG_EVENT_TRACING
        if (perf_event_is_tracing(event)) {
                struct perf_event_context *ctx = event->ctx;

                /*
                 * Beware, here be dragons!!
                 *
                 * the tracepoint muck will deadlock against ctx->mutex, but
                 * the tracepoint stuff does not actually need it. So
                 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
                 * already have a reference on ctx.
                 *
                 * This can result in event getting moved to a different ctx,
                 * but that does not affect the tracepoint state.
                 */
                mutex_unlock(&ctx->mutex);
                ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
                mutex_lock(&ctx->mutex);
        } else
#endif
        /*
         * With CONFIG_EVENT_TRACING this 'if' is the body of the 'else'
         * above; without it, it is a plain statement.
         */
        if (has_addr_filter(event))
                ret = perf_event_set_addr_filter(event, filter_str);

        /* The string was only needed for parsing; free our copy of it. */
        kfree(filter_str);
        return ret;
}

/*
 * hrtimer based swevent callback
 */

static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
        enum hrtimer_restart ret = HRTIMER_RESTART;
        struct perf_sample_data data;
        struct pt_regs *regs;
        struct perf_event *event;
        u64 period;

        event = container_of(hrtimer, struct perf_event, hw.hrtimer);

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return HRTIMER_NORESTART;

        event->pmu->read(event);

        perf_sample_data_init(&data, 0, event->hw.last_period);
        regs = get_irq_regs();

        if (regs && !perf_exclude_event(event, regs)) {
                if (!(event->attr.exclude_idle && is_idle_task(current)))
                        if (__perf_event_overflow(event, 1, &data, regs))
                                ret = HRTIMER_NORESTART;
        }

        period = max_t(u64, 10000, event->hw.sample_period);
        hrtimer_forward_now(hrtimer, ns_to_ktime(period));

        return ret;
}

/*
 * Arm the sampling hrtimer of @event; does nothing for non-sampling events.
 *
 * A non-zero hwc->period_left is consumed as the first expiry (a negative
 * value is replaced by 10000 ns) and then cleared.  Otherwise the timer is
 * armed for one sample period, with a floor of 10000 ns.
 */
static void perf_swevent_start_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        s64 delay;

        if (!is_sampling_event(event))
                return;

        delay = local64_read(&hwc->period_left);
        if (!delay) {
                delay = max_t(u64, 10000, hwc->sample_period);
        } else {
                local64_set(&hwc->period_left, 0);
                if (delay < 0)
                        delay = 10000;
        }

        hrtimer_start(&hwc->hrtimer, ns_to_ktime(delay),
                      HRTIMER_MODE_REL_PINNED_HARD);
}

/*
 * Stop the sampling hrtimer of @event; does nothing for non-sampling events.
 *
 * The time still remaining on the timer is stored in hwc->period_left before
 * the timer is cancelled; perf_swevent_start_hrtimer() reads that value.
 */
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        ktime_t left;

        if (!is_sampling_event(event))
                return;

        left = hrtimer_get_remaining(&hwc->hrtimer);
        local64_set(&hwc->period_left, ktime_to_ns(left));

        hrtimer_cancel(&hwc->hrtimer);
}

/*
 * Prepare the sampling hrtimer of @event; does nothing for non-sampling
 * events.
 *
 * For frequency-mode events the requested frequency is converted once into a
 * fixed period and the event is switched to period mode (attr.freq cleared).
 */
static void perf_swevent_init_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        long hz;

        if (!is_sampling_event(event))
                return;

        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        hwc->hrtimer.function = perf_swevent_hrtimer;

        if (!event->attr.freq)
                return;

        /*
         * hrtimers tick at a fixed rate, so a static freq->period mapping is
         * enough and the period-adjust feedback loop can be skipped.
         */
        hz = event->attr.sample_freq;

        event->attr.sample_period = NSEC_PER_SEC / hz;
        hwc->sample_period = event->attr.sample_period;
        local64_set(&hwc->period_left, hwc->sample_period);
        hwc->last_period = hwc->sample_period;
        event->attr.freq = 0;
}

/*
 * Software event: cpu wall time clock
 */

/*
 * Add the local_clock() time elapsed since the previous update to the event
 * count, and record the current timestamp as the new reference point.
 */
static void cpu_clock_event_update(struct perf_event *event)
{
        u64 stamp = local_clock();
        s64 last = local64_xchg(&event->hw.prev_count, stamp);

        local64_add(stamp - last, &event->count);
}

static void cpu_clock_event_start(struct perf_event *event, int flags)
{
        local64_set(&event->hw.prev_count, local_clock());
        perf_swevent_start_hrtimer(event);
}

/*
 * pmu::stop for the cpu-clock event: cancel the sampling timer first, then
 * fold the time elapsed so far into the count.  @flags is unused.
 */
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
        perf_swevent_cancel_hrtimer(event);

        cpu_clock_event_update(event);
}

/*
 * pmu::add for the cpu-clock event: start counting when PERF_EF_START is set
 * in @flags and refresh the user page.  Always returns 0.
 */
static int cpu_clock_event_add(struct perf_event *event, int flags)
{
        if ((flags & PERF_EF_START) != 0)
                cpu_clock_event_start(event, flags);

        perf_event_update_userpage(event);
        return 0;
}

/* pmu::del for the cpu-clock event: removal is simply a stop with @flags. */
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
        cpu_clock_event_stop(event, flags);
        return;
}

/* pmu::read for the cpu-clock event: bring the count up to date. */
static void cpu_clock_event_read(struct perf_event *event)
{
        cpu_clock_event_update(event);
        return;
}

static int cpu_clock_event_init(struct perf_event *event)
{
        if (event->attr.type != perf_cpu_clock.type)
                return -ENOENT;

        if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
                return -ENOENT;

        /*
         * no branch sampling for software events
         */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        perf_swevent_init_hrtimer(event);

        return 0;
}

/* PMU descriptor for the cpu-clock software event. */
static struct pmu perf_cpu_clock = {
        .task_ctx_nr    = perf_sw_context,
        .capabilities   = PERF_PMU_CAP_NO_NMI,
        .dev            = PMU_NULL_DEV,

        /* event life-cycle callbacks */
        .event_init     = cpu_clock_event_init,
        .add            = cpu_clock_event_add,
        .del            = cpu_clock_event_del,
        .start          = cpu_clock_event_start,
        .stop           = cpu_clock_event_stop,
        .read           = cpu_clock_event_read,
};

/*
 * Software event: task time clock
 */

/*
 * Add the time between the previously recorded timestamp and @now to the
 * event count, and record @now as the new reference point.
 */
static void task_clock_event_update(struct perf_event *event, u64 now)
{
        u64 last = local64_xchg(&event->hw.prev_count, now);
        s64 elapsed = now - last;

        local64_add(elapsed, &event->count);
}

static void task_clock_event_start(struct perf_event *event, int flags)
{
        local64_set(&event->hw.prev_count, event->ctx->time);
        perf_swevent_start_hrtimer(event);
}

/*
 * pmu::stop for the task-clock event: cancel the sampling timer first, then
 * account the time up to the current context time.  @flags is unused.
 */
static void task_clock_event_stop(struct perf_event *event, int flags)
{
        perf_swevent_cancel_hrtimer(event);

        task_clock_event_update(event, event->ctx->time);
}

/*
 * pmu::add for the task-clock event: start counting when PERF_EF_START is
 * set in @flags and refresh the user page.  Always returns 0.
 */
static int task_clock_event_add(struct perf_event *event, int flags)
{
        if ((flags & PERF_EF_START) != 0)
                task_clock_event_start(event, flags);

        perf_event_update_userpage(event);
        return 0;
}

/*
 * pmu::del for the task-clock event: stop the event.  PERF_EF_UPDATE is
 * passed to the stop routine instead of the caller's @flags.
 */
static void task_clock_event_del(struct perf_event *event, int flags)
{
        task_clock_event_stop(event, PERF_EF_UPDATE);
        return;
}

/*
 * pmu::read for the task-clock event: extrapolate the context time by the
 * perf_clock() time that has passed since the context timestamp, and account
 * up to that point.
 */
static void task_clock_event_read(struct perf_event *event)
{
        u64 clock_now = perf_clock();
        u64 since_stamp = clock_now - event->ctx->timestamp;
        u64 ctx_now = event->ctx->time + since_stamp;

        task_clock_event_update(event, ctx_now);
}

static int task_clock_event_init(struct perf_event *event)
{
        if (event->attr.type != perf_task_clock.type)
                return -ENOENT;

        if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
                return -ENOENT;

        /*
         * no branch sampling for software events
         */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        perf_swevent_init_hrtimer(event);

        return 0;
}

/* PMU descriptor for the task-clock software event. */
static struct pmu perf_task_clock = {
        .task_ctx_nr    = perf_sw_context,
        .capabilities   = PERF_PMU_CAP_NO_NMI,
        .dev            = PMU_NULL_DEV,

        /* event life-cycle callbacks */
        .event_init     = task_clock_event_init,
        .add            = task_clock_event_add,
        .del            = task_clock_event_del,
        .start          = task_clock_event_start,
        .stop           = task_clock_event_stop,
        .read           = task_clock_event_read,
};

/* No-op stub for PMU callbacks that take only the PMU and return nothing. */
static void perf_pmu_nop_void(struct pmu *pmu)
{
        (void)pmu;
}

/* No-op stub with the start_txn signature; both arguments are ignored. */
static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
{
        (void)pmu;
        (void)flags;
}

/* No-op stub for PMU callbacks that report status: always succeeds (0). */
static int perf_pmu_nop_int(struct pmu *pmu)
{
        (void)pmu;

        return 0;
}

/* No-op stub taking an event and a value; accepts anything and returns 0. */
static int perf_event_nop_int(struct perf_event *event, u64 value)
{
        (void)event;
        (void)value;

        return 0;
}

/*
 * Per-CPU copy of the flags passed to perf_pmu_start_txn(); read back and
 * cleared by perf_pmu_commit_txn() and perf_pmu_cancel_txn().
 */
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);

/*
 * Generic start_txn stub: remember @flags for this CPU and disable the PMU,
 * unless the transaction carries any flag other than PERF_PMU_TXN_ADD.
 */
static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
{
        __this_cpu_write(nop_txn_flags, flags);

        if (!(flags & ~PERF_PMU_TXN_ADD))
                perf_pmu_disable(pmu);
}

/*
 * Generic commit_txn stub: fetch and clear the flags saved by
 * perf_pmu_start_txn(), and re-enable the PMU unless the transaction carried
 * any flag other than PERF_PMU_TXN_ADD.  Always returns 0.
 */
static int perf_pmu_commit_txn(struct pmu *pmu)
{
        unsigned int saved = __this_cpu_read(nop_txn_flags);

        __this_cpu_write(nop_txn_flags, 0);

        if (!(saved & ~PERF_PMU_TXN_ADD))
                perf_pmu_enable(pmu);

        return 0;
}

/*
 * Generic cancel_txn stub: fetch and clear the flags saved by
 * perf_pmu_start_txn(), and re-enable the PMU unless the transaction carried
 * any flag other than PERF_PMU_TXN_ADD.
 */
static void perf_pmu_cancel_txn(struct pmu *pmu)
{
        unsigned int saved = __this_cpu_read(nop_txn_flags);

        __this_cpu_write(nop_txn_flags, 0);

        if (!(saved & ~PERF_PMU_TXN_ADD))
                perf_pmu_enable(pmu);
}

/* Default pmu::event_idx implementation: every event reports index 0. */
static int perf_event_idx_default(struct perf_event *event)
{
        (void)event;

        return 0;
}

/* Release the per-CPU context storage that belongs to @pmu. */
static void free_pmu_context(struct pmu *pmu)
{
        free_percpu(pmu->cpu_pmu_context);
        return;
}

/*
 * Let userspace know that this PMU supports address range filtering:
 */
static ssize_t nr_addr_filters_show(struct device *dev,
                                    struct device_attribute *attr,
                                    char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);

        return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);

/*
 * IDR keyed by PMU type id: ids are allocated in perf_pmu_register(), removed
 * in perf_pmu_unregister() and looked up in perf_init_event().
 */
static struct idr pmu_idr;

static ssize_t
type_show(struct device *dev, struct device_attribute *attr, char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);

        return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);

static ssize_t
perf_event_mux_interval_ms_show(struct device *dev,
                                struct device_attribute *attr,
                                char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);

        return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms);
}

/* Held by perf_event_mux_interval_ms_store() while it applies a new interval. */
static DEFINE_MUTEX(mux_interval_mutex);

/*
 * sysfs store handler for the PMU's hrtimer interval, in milliseconds.
 *
 * Parses @buf as an integer (base auto-detected by kstrtoint) and rejects
 * values below 1 with -EINVAL.  A value equal to the current one is accepted
 * without further work.  Otherwise the new interval is stored in the PMU and
 * in the per-CPU context of every online CPU, and each of those CPUs is asked
 * to run perf_mux_hrtimer_restart_ipi().  Returns @count on success.
 */
static ssize_t
perf_event_mux_interval_ms_store(struct device *dev,
                                 struct device_attribute *attr,
                                 const char *buf, size_t count)
{
        struct pmu *pmu = dev_get_drvdata(dev);
        int interval_ms, cpu;
        int err;

        err = kstrtoint(buf, 0, &interval_ms);
        if (err)
                return err;
        if (interval_ms < 1)
                return -EINVAL;

        /* Same value as before: nothing to do. */
        if (pmu->hrtimer_interval_ms == interval_ms)
                return count;

        mutex_lock(&mux_interval_mutex);
        pmu->hrtimer_interval_ms = interval_ms;

        /* Propagate to the per-CPU context of every online CPU. */
        cpus_read_lock();
        for_each_online_cpu(cpu) {
                struct perf_cpu_pmu_context *cpc =
                        per_cpu_ptr(pmu->cpu_pmu_context, cpu);

                cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval_ms);
                cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
        }
        cpus_read_unlock();
        mutex_unlock(&mux_interval_mutex);

        return count;
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);

/*
 * Attributes of every PMU device.  The order matters: pmu_dev_is_visible()
 * identifies the nr_addr_filters entry by its index (2) in this array.
 */
static struct attribute *pmu_dev_attrs[] = {
        &dev_attr_type.attr,
        &dev_attr_perf_event_mux_interval_ms.attr,
        &dev_attr_nr_addr_filters.attr,
        NULL,
};

static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
{
        struct device *dev = kobj_to_dev(kobj);
        struct pmu *pmu = dev_get_drvdata(dev);

        if (n == 2 && !pmu->nr_addr_filters)
                return 0;

        return a->mode;
}

/* Attribute group tying pmu_dev_attrs to its visibility filter. */
static struct attribute_group pmu_dev_attr_group = {
        .attrs          = pmu_dev_attrs,
        .is_visible     = pmu_dev_is_visible,
};

/* NULL-terminated list of attribute groups installed as pmu_bus.dev_groups. */
static const struct attribute_group *pmu_dev_groups[] = {
        &pmu_dev_attr_group,
        NULL,
};

/*
 * Non-zero once PMU devices may be created on pmu_bus; tested by
 * perf_pmu_register() and perf_pmu_unregister().  Where it gets set is not
 * visible in this part of the file.
 */
static int pmu_bus_running;
/* Bus ("event_source") on which each PMU's device is registered. */
static struct bus_type pmu_bus = {
        .name                = "event_source",
        .dev_groups        = pmu_dev_groups,
};

/*
 * Device release callback installed by pmu_dev_alloc(): frees the struct
 * device that was allocated there.
 */
static void pmu_dev_release(struct device *dev)
{
        kfree(dev);
        return;
}

static int pmu_dev_alloc(struct pmu *pmu)
{
        int ret = -ENOMEM;

        pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
        if (!pmu->dev)
                goto out;

        pmu->dev->groups = pmu->attr_groups;
        device_initialize(pmu->dev);

        dev_set_drvdata(pmu->dev, pmu);
        pmu->dev->bus = &pmu_bus;
        pmu->dev->parent = pmu->parent;
        pmu->dev->release = pmu_dev_release;

        ret = dev_set_name(pmu->dev, "%s", pmu->name);
        if (ret)
                goto free_dev;

        ret = device_add(pmu->dev);
        if (ret)
                goto free_dev;

        if (pmu->attr_update) {
                ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
                if (ret)
                        goto del_dev;
        }

out:
        return ret;

del_dev:
        device_del(pmu->dev);

free_dev:
        put_device(pmu->dev);
        goto out;
}

/*
 * Lockdep class keys; presumably for the mutex and the lock of the per-CPU
 * contexts -- their users are not visible in this part of the file, TODO
 * confirm.
 */
static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;

int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
        int cpu, ret, max = PERF_TYPE_MAX;

        mutex_lock(&pmus_lock);
        ret = -ENOMEM;
        pmu->pmu_disable_count = alloc_percpu(int);
        if (!pmu->pmu_disable_count)
                goto unlock;

        pmu->type = -1;
        if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
                ret = -EINVAL;
                goto free_pdc;
        }

        pmu->name = name;

        if (type >= 0)
                max = type;

        ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
        if (ret < 0)
                goto free_pdc;

        WARN_ON(type >= 0 && ret != type);

        type = ret;
        pmu->type = type;

        if (pmu_bus_running && !pmu->dev) {
                ret = pmu_dev_alloc(pmu);
                if (ret)
                        goto free_idr;
        }

        ret = -ENOMEM;
        pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
        if (!pmu->cpu_pmu_context)
                goto free_dev;

        for_each_possible_cpu(cpu) {
                struct perf_cpu_pmu_context *cpc;

                cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
                __perf_init_event_pmu_context(&cpc->epc, pmu);
                __perf_mux_hrtimer_init(cpc, cpu);
        }

        if (!pmu->start_txn) {
                if (pmu->pmu_enable) {
                        /*
                         * If we have pmu_enable/pmu_disable calls, install
                         * transaction stubs that use that to try and batch
                         * hardware accesses.
                         */
                        pmu->start_txn  = perf_pmu_start_txn;
                        pmu->commit_txn = perf_pmu_commit_txn;
                        pmu->cancel_txn = perf_pmu_cancel_txn;
                } else {
                        pmu->start_txn  = perf_pmu_nop_txn;
                        pmu->commit_txn = perf_pmu_nop_int;
                        pmu->cancel_txn = perf_pmu_nop_void;
                }
        }

        if (!pmu->pmu_enable) {
                pmu->pmu_enable  = perf_pmu_nop_void;
                pmu->pmu_disable = perf_pmu_nop_void;
        }

        if (!pmu->check_period)
                pmu->check_period = perf_event_nop_int;

        if (!pmu->event_idx)
                pmu->event_idx = perf_event_idx_default;

        list_add_rcu(&pmu->entry, &pmus);
        atomic_set(&pmu->exclusive_cnt, 0);
        ret = 0;
unlock:
        mutex_unlock(&pmus_lock);

        return ret;

free_dev:
        if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
                device_del(pmu->dev);
                put_device(pmu->dev);
        }

free_idr:
        idr_remove(&pmu_idr, pmu->type);

free_pdc:
        free_percpu(pmu->pmu_disable_count);
        goto unlock;
}
EXPORT_SYMBOL_GPL(perf_pmu_register);

/*
 * Remove @pmu from the perf core.
 *
 * The PMU is unlinked from the pmus list, both SRCU and RCU readers are
 * waited for, and then its per-CPU disable count, type id, sysfs device
 * (if one exists while the bus is running) and per-CPU contexts are
 * released.  Runs under pmus_lock.
 */
void perf_pmu_unregister(struct pmu *pmu)
{
        struct device *dev;

        mutex_lock(&pmus_lock);
        list_del_rcu(&pmu->entry);

        /*
         * The pmu list is dereferenced under SRCU as well as regular RCU, so
         * wait for readers of both flavours.
         */
        synchronize_srcu(&pmus_srcu);
        synchronize_rcu();

        free_percpu(pmu->pmu_disable_count);
        idr_remove(&pmu_idr, pmu->type);

        dev = pmu->dev;
        if (pmu_bus_running && dev && dev != PMU_NULL_DEV) {
                if (pmu->nr_addr_filters)
                        device_remove_file(dev, &dev_attr_nr_addr_filters);
                device_del(dev);
                put_device(dev);
        }

        free_pmu_context(pmu);
        mutex_unlock(&pmus_lock);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);

static inline bool has_extended_regs(struct perf_event *event)
{
        return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
               (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
}

/*
 * Try to initialise @event on @pmu.
 *
 * Takes a reference on the PMU's module, sets event->pmu and calls the PMU's
 * event_init() hook.  When the hook succeeds, two capability checks follow:
 * extended registers require PERF_PMU_CAP_EXTENDED_REGS (-EOPNOTSUPP
 * otherwise), and a PMU with PERF_PMU_CAP_NO_EXCLUDE rejects any exclude
 * flag (-EINVAL).  If a check fails, the event's destroy callback is invoked
 * and then cleared, so that a later attempt on another PMU (see
 * perf_init_event()) or the final teardown cannot run a stale callback a
 * second time.
 *
 * Returns 0 on success (the module reference is kept), -ENODEV when the
 * module reference cannot be taken, or the error from event_init() or the
 * capability checks (the module reference is dropped).
 */
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
        struct perf_event_context *ctx = NULL;
        int ret;

        if (!try_module_get(pmu->module))
                return -ENODEV;

        /*
         * A number of pmu->event_init() methods iterate the sibling_list to,
         * for example, validate if the group fits on the PMU. Therefore,
         * if this is a sibling event, acquire the ctx->mutex to protect
         * the sibling_list.
         */
        if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
                /*
                 * This ctx->mutex can nest when we're called through
                 * inheritance. See the perf_event_ctx_lock_nested() comment.
                 */
                ctx = perf_event_ctx_lock_nested(event->group_leader,
                                                 SINGLE_DEPTH_NESTING);
                BUG_ON(!ctx);
        }

        event->pmu = pmu;
        ret = pmu->event_init(event);

        if (ctx)
                perf_event_ctx_unlock(event->group_leader, ctx);

        if (!ret) {
                if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
                    has_extended_regs(event))
                        ret = -EOPNOTSUPP;

                /* Checked last, so -EINVAL wins when both checks fail. */
                if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
                    event_has_any_exclude_flag(event))
                        ret = -EINVAL;

                if (ret && event->destroy) {
                        /* Undo event_init() and forget the callback. */
                        event->destroy(event);
                        event->destroy = NULL;
                }
        }

        if (ret)
                module_put(pmu->module);

        return ret;
}

/*
 * Find the PMU that will service @event and initialise the event on it.
 *
 * Candidates are tried in this order: the parent event's PMU (if any), the
 * PMU found in pmu_idr under the event's (possibly remapped) type, and
 * finally every PMU on the pmus list.  Returns the PMU that accepted the
 * event, or an ERR_PTR(): the error of the failed attempt, or -ENOENT when
 * no PMU claimed the event.  The search runs inside a pmus_srcu read-side
 * section.
 */
static struct pmu *perf_init_event(struct perf_event *event)
{
        bool extended_type = false;
        int idx, type, ret;
        struct pmu *pmu;

        idx = srcu_read_lock(&pmus_srcu);

        /*
         * Save original type before calling pmu->event_init() since certain
         * pmus overwrites event->attr.type to forward event to another pmu.
         */
        event->orig_type = event->attr.type;

        /* Try parent's PMU first: */
        if (event->parent && event->parent->pmu) {
                pmu = event->parent->pmu;
                ret = perf_try_init_event(pmu, event);
                if (!ret)
                        goto unlock;
                /* Any failure here falls through to the regular lookup. */
        }

        /*
         * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
         * are often aliases for PERF_TYPE_RAW.
         */
        type = event->attr.type;
        if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
                /* The upper config bits may name a specific PMU type. */
                type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
                if (!type) {
                        type = PERF_TYPE_RAW;
                } else {
                        extended_type = true;
                        event->attr.config &= PERF_HW_EVENT_MASK;
                }
        }

again:
        rcu_read_lock();
        pmu = idr_find(&pmu_idr, type);
        rcu_read_unlock();
        if (pmu) {
                /*
                 * The lookup type differs from the event's own type and is
                 * not the RAW alias: only PMUs advertising
                 * PERF_PMU_CAP_EXTENDED_HW_TYPE may take the event.
                 */
                if (event->attr.type != type && type != PERF_TYPE_RAW &&
                    !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
                        goto fail;

                ret = perf_try_init_event(pmu, event);
                /*
                 * The PMU declined and attr.type no longer matches the type
                 * just looked up: retry the lookup with the event's type.
                 */
                if (ret == -ENOENT && event->attr.type != type && !extended_type) {
                        type = event->attr.type;
                        goto again;
                }

                if (ret)
                        pmu = ERR_PTR(ret);

                goto unlock;
        }

        /* No PMU under that id: offer the event to every registered PMU. */
        list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
                ret = perf_try_init_event(pmu, event);
                if (!ret)
                        goto unlock;

                /* -ENOENT means "not mine"; any other error ends the search. */
                if (ret != -ENOENT) {
                        pmu = ERR_PTR(ret);
                        goto unlock;
                }
        }
fail:
        pmu = ERR_PTR(-ENOENT);
unlock:
        srcu_read_unlock(&pmus_srcu, idx);

        return pmu;
}

/*
 * Link @event into the side-band event list of the CPU it is bound to
 * (event->cpu), under that list's lock.
 */
static void attach_sb_event(struct perf_event *event)
{
        struct pmu_event_list *sb_list;

        sb_list = per_cpu_ptr(&pmu_sb_events, event->cpu);

        raw_spin_lock(&sb_list->lock);
        list_add_rcu(&event->sb_list, &sb_list->list);
        raw_spin_unlock(&sb_list->lock);
}

/*
 * All !task (and therefore per-cpu) events that need to receive side-band
 * records are kept on a dedicated list, so that delivering such records does
 * not require scanning every PMU's per-cpu contexts.
 *
 * Put @event on that list if it is a side-band event.
 */
static void account_pmu_sb_event(struct perf_event *event)
{
        if (!is_sb_event(event))
                return;

        attach_sb_event(event);
}

/*
 * Freq events need the tick to stay alive (see perf_event_task_tick).
 *
 * Count one more freq event and, when it is the first one, set the
 * perf-events tick dependency.  Compiles to nothing without
 * CONFIG_NO_HZ_FULL.
 */
static void account_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
        /* The lock keeps a concurrent unaccount from racing with us. */
        spin_lock(&nr_freq_lock);

        if (atomic_inc_return(&nr_freq_events) == 1)
                tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);

        spin_unlock(&nr_freq_lock);
#endif
}

/*
 * Count one more freq event.  With nohz_full enabled the accounting also
 * manages the tick dependency; otherwise a plain counter increment suffices.
 */
static void account_freq_event(void)
{
        if (!tick_nohz_full_enabled()) {
                atomic_inc(&nr_freq_events);
                return;
        }

        account_freq_event_nohz();
}


/*
 * Account a newly created event in the global bookkeeping.
 *
 * Bumps the counters matching the features requested in event->attr, makes
 * sure the perf_sched_events static branch is enabled when the event needs
 * the scheduling hooks, and registers side-band events.  Events with a
 * parent are not accounted here.
 */
static void account_event(struct perf_event *event)
{
        /* Set when the event needs the scheduler hooks (perf_sched_events). */
        bool inc = false;

        if (event->parent)
                return;

        if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
                inc = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_inc(&nr_mmap_events);
        if (event->attr.build_id)
                atomic_inc(&nr_build_id_events);
        if (event->attr.comm)
                atomic_inc(&nr_comm_events);
        if (event->attr.namespaces)
                atomic_inc(&nr_namespaces_events);
        if (event->attr.cgroup)
                atomic_inc(&nr_cgroup_events);
        if (event->attr.task)
                atomic_inc(&nr_task_events);
        if (event->attr.freq)
                account_freq_event();
        if (event->attr.context_switch) {
                atomic_inc(&nr_switch_events);
                inc = true;
        }
        if (has_branch_stack(event))
                inc = true;
        if (is_cgroup_event(event))
                inc = true;
        if (event->attr.ksymbol)
                atomic_inc(&nr_ksymbol_events);
        if (event->attr.bpf_event)
                atomic_inc(&nr_bpf_events);
        if (event->attr.text_poke)
                atomic_inc(&nr_text_poke_events);

        if (inc) {
                /*
                 * We need the mutex here because static_branch_enable()
                 * must complete *before* the perf_sched_count increment
                 * becomes visible.
                 */
                /* Fast path: the count is already non-zero, just add to it. */
                if (atomic_inc_not_zero(&perf_sched_count))
                        goto enabled;

                mutex_lock(&perf_sched_mutex);
                if (!atomic_read(&perf_sched_count)) {
                        static_branch_enable(&perf_sched_events);
                        /*
                         * Guarantee that all CPUs observe the key change and
                         * call the perf scheduling hooks before proceeding to
                         * install events that need them.
                         */
                        synchronize_rcu();
                }
                /*
                 * Now that we have waited for the synchronize_rcu(), allow
                 * further increments to bypass the mutex.
                 */
                atomic_inc(&perf_sched_count);
                mutex_unlock(&perf_sched_mutex);
        }
enabled:

        account_pmu_sb_event(event);
}

/*
 * Allocate and initialize an event structure
 *
 * @attr:             attributes for the new event; copied into the event
 * @cpu:              CPU to bind to; -1 is accepted only together with @task
 * @task:             target task, or NULL
 * @group_leader:     leader of the group to join, or NULL to lead itself
 * @parent_event:     event this one is inherited from, or NULL
 * @overflow_handler: overflow callback; when NULL the parent's handler or a
 *                    default output handler is used
 * @context:          opaque cookie stored alongside @overflow_handler
 * @cgroup_fd:        cgroup file descriptor, or -1 for none
 *
 * Returns the new event, or an ERR_PTR() after undoing every step that had
 * already been taken.
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
                 struct task_struct *task,
                 struct perf_event *group_leader,
                 struct perf_event *parent_event,
                 perf_overflow_handler_t overflow_handler,
                 void *context, int cgroup_fd)
{
        struct pmu *pmu;
        struct perf_event *event;
        struct hw_perf_event *hwc;
        long err = -EINVAL;
        int node;

        /* An out-of-range cpu is only acceptable as "-1 with a task". */
        if ((unsigned)cpu >= nr_cpu_ids) {
                if (!task || cpu != -1)
                        return ERR_PTR(-EINVAL);
        }
        if (attr->sigtrap && !task) {
                /* Requires a task: avoid signalling random tasks. */
                return ERR_PTR(-EINVAL);
        }

        /* Allocate on the node of the target CPU when one is given. */
        node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
        event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
                                      node);
        if (!event)
                return ERR_PTR(-ENOMEM);

        /*
         * Single events are their own group leaders, with an
         * empty sibling list:
         */
        if (!group_leader)
                group_leader = event;

        mutex_init(&event->child_mutex);
        INIT_LIST_HEAD(&event->child_list);

        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
        INIT_LIST_HEAD(&event->active_list);
        init_event_group(event);
        INIT_LIST_HEAD(&event->rb_entry);
        INIT_LIST_HEAD(&event->active_entry);
        INIT_LIST_HEAD(&event->addr_filters.list);
        INIT_HLIST_NODE(&event->hlist_entry);


        init_waitqueue_head(&event->waitq);
        init_irq_work(&event->pending_irq, perf_pending_irq);
        init_task_work(&event->pending_task, perf_pending_task);

        mutex_init(&event->mmap_mutex);
        raw_spin_lock_init(&event->addr_filters.lock);

        atomic_long_set(&event->refcount, 1);
        event->cpu                = cpu;
        event->attr                = *attr;
        event->group_leader        = group_leader;
        event->pmu                = NULL;
        event->oncpu                = -1;

        event->parent                = parent_event;

        event->ns                = get_pid_ns(task_active_pid_ns(current));
        event->id                = atomic64_inc_return(&perf_event_id);

        event->state                = PERF_EVENT_STATE_INACTIVE;

        if (parent_event)
                event->event_caps = parent_event->event_caps;

        if (task) {
                event->attach_state = PERF_ATTACH_TASK;
                /*
                 * XXX pmu::event_init needs to know what task to account to
                 * and we cannot use the ctx information because we need the
                 * pmu before we get a ctx.
                 */
                event->hw.target = get_task_struct(task);
        }

        /* Default clock; inherited events use the parent's clock instead. */
        event->clock = &local_clock;
        if (parent_event)
                event->clock = parent_event->clock;

        /* Without an explicit handler, inherit handler, context and prog. */
        if (!overflow_handler && parent_event) {
                overflow_handler = parent_event->overflow_handler;
                context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
                if (parent_event->prog) {
                        struct bpf_prog *prog = parent_event->prog;

                        bpf_prog_inc(prog);
                        event->prog = prog;
                }
#endif
        }

        /* Otherwise pick the default output handler by write direction. */
        if (overflow_handler) {
                event->overflow_handler        = overflow_handler;
                event->overflow_handler_context = context;
        } else if (is_write_backward(event)){
                event->overflow_handler = perf_event_output_backward;
                event->overflow_handler_context = NULL;
        } else {
                event->overflow_handler = perf_event_output_forward;
                event->overflow_handler_context = NULL;
        }

        perf_event__state_init(event);

        pmu = NULL;

        hwc = &event->hw;
        hwc->sample_period = attr->sample_period;
        /* Frequency mode starts out with a period of 1. */
        if (attr->freq && attr->sample_freq)
                hwc->sample_period = 1;
        hwc->last_period = hwc->sample_period;

        local64_set(&hwc->period_left, hwc->sample_period);

        /*
         * We currently do not support PERF_SAMPLE_READ on inherited events.
         * See perf_output_read().
         */
        if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
                goto err_ns;

        if (!has_branch_stack(event))
                event->attr.branch_sample_type = 0;

        pmu = perf_init_event(event);
        if (IS_ERR(pmu)) {
                err = PTR_ERR(pmu);
                goto err_ns;
        }

        /*
         * Disallow uncore-task events. Similarly, disallow uncore-cgroup
         * events (they don't make sense as the cgroup will be different
         * on other CPUs in the uncore mask).
         */
        if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
                err = -EINVAL;
                goto err_pmu;
        }

        if (event->attr.aux_output &&
            !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
                err = -EOPNOTSUPP;
                goto err_pmu;
        }

        if (cgroup_fd != -1) {
                err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
                if (err)
                        goto err_pmu;
        }

        err = exclusive_event_init(event);
        if (err)
                goto err_pmu;

        if (has_addr_filter(event)) {
                event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
                                                    sizeof(struct perf_addr_filter_range),
                                                    GFP_KERNEL);
                if (!event->addr_filter_ranges) {
                        err = -ENOMEM;
                        goto err_per_task;
                }

                /*
                 * Clone the parent's vma offsets: they are valid until exec()
                 * even if the mm is not shared with the parent.
                 */
                if (event->parent) {
                        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);

                        raw_spin_lock_irq(&ifh->lock);
                        memcpy(event->addr_filter_ranges,
                               event->parent->addr_filter_ranges,
                               pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
                        raw_spin_unlock_irq(&ifh->lock);
                }

                /* force hw sync on the address filters */
                event->addr_filters_gen = 1;
        }

        /* Callchain buffers are only acquired for events without a parent. */
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
                        err = get_callchain_buffers(attr->sample_max_stack);
                        if (err)
                                goto err_addr_filters;
                }
        }

        err = security_perf_event_alloc(event);
        if (err)
                goto err_callchain_buffer;

        /* symmetric to unaccount_event() in _free_event() */
        account_event(event);

        return event;

        /* Error unwinding: each label undoes one step and falls through. */
err_callchain_buffer:
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
                        put_callchain_buffers();
        }
err_addr_filters:
        kfree(event->addr_filter_ranges);

err_per_task:
        exclusive_event_destroy(event);

err_pmu:
        if (is_cgroup_event(event))
                perf_detach_cgroup(event);
        if (event->destroy)
                event->destroy(event);
        module_put(pmu->module);
err_ns:
        if (event->hw.target)
                put_task_struct(event->hw.target);
        /* The event itself is freed after an RCU grace period. */
        call_rcu(&event->rcu_head, free_event_rcu);

        return ERR_PTR(err);
}

/*
 * perf_copy_attr - copy a perf_event_attr from user space and sanity-check it
 * @uattr: user-space pointer to the attribute structure
 * @attr:  kernel buffer that receives the (zero-extended) copy
 *
 * Reads the size field from user space first, copies that many bytes into
 * @attr and then validates reserved fields, sample/read/branch masks,
 * register masks, user stack size and a few mutually dependent flags.
 * May modify @attr: ->size, ->branch_sample_type and ->sample_max_stack
 * are filled in / adjusted below.
 *
 * Return: 0 on success. -E2BIG when the size is out of range or the copy
 * reports -E2BIG (in that case sizeof(*attr) is written back to
 * @uattr->size); -EINVAL or -ENOSYS for rejected contents; otherwise the
 * error returned by get_user(), copy_struct_from_user(),
 * perf_allow_kernel() or perf_reg_validate().
 */
static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr)
{
        u32 size;
        int ret;

        /* Zero the full structure, so that a short copy will be nice. */
        memset(attr, 0, sizeof(*attr));

        ret = get_user(size, &uattr->size);
        if (ret)
                return ret;

        /* ABI compatibility quirk: */
        if (!size)
                size = PERF_ATTR_SIZE_VER0;
        if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
                goto err_size;

        ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
        if (ret) {
                /* -E2BIG takes the err_size path so the size is reported back. */
                if (ret == -E2BIG)
                        goto err_size;
                return ret;
        }

        /* Record the size that was actually used (covers the size == 0 quirk). */
        attr->size = size;

        if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
                return -EINVAL;

        /* Reject any bit at or above the respective *_MAX marker. */
        if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
                return -EINVAL;

        if (attr->read_format & ~(PERF_FORMAT_MAX-1))
                return -EINVAL;

        if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
                u64 mask = attr->branch_sample_type;

                /* only using defined bits */
                if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
                        return -EINVAL;

                /* at least one branch bit must be set */
                if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
                        return -EINVAL;

                /* propagate priv level, when not set for branch */
                if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {

                        /* exclude_kernel checked on syscall entry */
                        if (!attr->exclude_kernel)
                                mask |= PERF_SAMPLE_BRANCH_KERNEL;

                        if (!attr->exclude_user)
                                mask |= PERF_SAMPLE_BRANCH_USER;

                        if (!attr->exclude_hv)
                                mask |= PERF_SAMPLE_BRANCH_HV;
                        /*
                         * adjust user setting (for HW filter setup)
                         */
                        attr->branch_sample_type = mask;
                }
                /* privileged levels capture (kernel, hv): check permissions */
                if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
                        ret = perf_allow_kernel(attr);
                        if (ret)
                                return ret;
                }
        }

        if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
                ret = perf_reg_validate(attr->sample_regs_user);
                if (ret)
                        return ret;
        }

        if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
                if (!arch_perf_have_user_stack_dump())
                        return -ENOSYS;

                /*
                 * We have __u32 type for the size, but so far
                 * we can only use __u16 as maximum due to the
                 * __u16 sample size limit.
                 */
                if (attr->sample_stack_user >= USHRT_MAX)
                        return -EINVAL;
                else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
                        return -EINVAL;
        }

        /* A zero sample_max_stack selects the sysctl-provided default. */
        if (!attr->sample_max_stack)
                attr->sample_max_stack = sysctl_perf_event_max_stack;

        /*
         * Note: unlike the REGS_USER case above, the result is not returned
         * right away. It stays in 'ret' and is returned at 'out' unless one
         * of the checks below returns -EINVAL first.
         */
        if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
                ret = perf_reg_validate(attr->sample_regs_intr);

#ifndef CONFIG_CGROUP_PERF
        if (attr->sample_type & PERF_SAMPLE_CGROUP)
                return -EINVAL;
#endif
        /* PERF_SAMPLE_WEIGHT and PERF_SAMPLE_WEIGHT_STRUCT are mutually exclusive. */
        if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
            (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
                return -EINVAL;

        /* inherit_thread is only accepted together with inherit. */
        if (!attr->inherit && attr->inherit_thread)
                return -EINVAL;

        if (attr->remove_on_exec && attr->enable_on_exec)
                return -EINVAL;

        /* sigtrap is only accepted together with remove_on_exec. */
        if (attr->sigtrap && !attr->remove_on_exec)
                return -EINVAL;

out:
        return ret;

err_size:
        /*
         * Report the structure size this kernel uses back to user space.
         * The put_user() result is not checked; -E2BIG is returned either way.
         */
        put_user(sizeof(*attr), &uattr->size);
        ret = -E2BIG;
        goto out;
}

/*
 * Acquire two mutexes in a fixed order determined by their addresses: the
 * lower-addressed mutex is locked first, the other one is then taken with
 * the SINGLE_DEPTH_NESTING subclass.
 */
static void mutex_lock_double(struct mutex *a, struct mutex *b)
{
        struct mutex *first = a;
        struct mutex *second = b;

        if (b < a) {
                first = b;
                second = a;
        }

        mutex_lock(first);
        mutex_lock_nested(second, SINGLE_DEPTH_NESTING);
}

/*
 * Redirect the output of @event into the ring buffer currently used by
 * @output_event. When @output_event is NULL, 'rb' stays NULL and
 * ring_buffer_attach() is called with a NULL buffer.
 *
 * Returns 0 on success and -EINVAL when the two events are incompatible,
 * when @event has an active mmap(), or when @output_event has no usable
 * (still mmap()ed) buffer.
 */
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
        struct perf_buffer *rb = NULL;
        int ret = -EINVAL;

        /* No target: only event->mmap_mutex is needed. */
        if (!output_event) {
                mutex_lock(&event->mmap_mutex);
                goto set;
        }

        /* don't allow circular references */
        if (event == output_event)
                goto out;

        /*
         * Don't allow cross-cpu buffers
         */
        if (output_event->cpu != event->cpu)
                goto out;

        /*
         * If it's not a per-cpu rb, it must be the same task.
         */
        if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
                goto out;

        /*
         * Mixing clocks in the same buffer is trouble you don't need.
         */
        if (output_event->clock != event->clock)
                goto out;

        /*
         * Either writing ring buffer from beginning or from end.
         * Mixing is not allowed.
         */
        if (is_write_backward(output_event) != is_write_backward(event))
                goto out;

        /*
         * If both events generate aux data, they must be on the same PMU
         */
        if (has_aux(event) && has_aux(output_event) &&
            event->pmu != output_event->pmu)
                goto out;

        /*
         * Hold both mmap_mutex to serialize against perf_mmap_close().  Since
         * output_event is already on rb->event_list, and the list iteration
         * restarts after every removal, it is guaranteed this new event is
         * observed *OR* if output_event is already removed, it's guaranteed we
         * observe !rb->mmap_count.
         */
        mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
        /* Can't redirect output if we've got an active mmap() */
        if (atomic_read(&event->mmap_count))
                goto unlock;

        if (output_event) {
                /* get the rb we want to redirect to */
                rb = ring_buffer_get(output_event);
                if (!rb)
                        goto unlock;

                /* did we race against perf_mmap_close() */
                if (!atomic_read(&rb->mmap_count)) {
                        ring_buffer_put(rb);
                        goto unlock;
                }
        }

        ring_buffer_attach(event, rb);

        ret = 0;
unlock:
        /* output_event->mmap_mutex was only taken on the two-mutex path. */
        mutex_unlock(&event->mmap_mutex);
        if (output_event)
                mutex_unlock(&output_event->mmap_mutex);

out:
        return ret;
}

/*
 * Select the time source of @event according to @clk_id.
 *
 * CLOCK_MONOTONIC and CLOCK_MONOTONIC_RAW are always accepted. The other
 * supported clocks are only accepted when the event's PMU advertises
 * PERF_PMU_CAP_NO_NMI. Note that event->clock has already been written
 * when that capability check fails.
 *
 * Returns 0 on success, -EINVAL for an unsupported clock id or a
 * clock/PMU combination that is not allowed.
 */
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
        bool requires_no_nmi_pmu;

        switch (clk_id) {
        case CLOCK_MONOTONIC:
                event->clock = &ktime_get_mono_fast_ns;
                requires_no_nmi_pmu = false;
                break;

        case CLOCK_MONOTONIC_RAW:
                event->clock = &ktime_get_raw_fast_ns;
                requires_no_nmi_pmu = false;
                break;

        case CLOCK_REALTIME:
                event->clock = &ktime_get_real_ns;
                requires_no_nmi_pmu = true;
                break;

        case CLOCK_BOOTTIME:
                event->clock = &ktime_get_boottime_ns;
                requires_no_nmi_pmu = true;
                break;

        case CLOCK_TAI:
                event->clock = &ktime_get_clocktai_ns;
                requires_no_nmi_pmu = true;
                break;

        default:
                return -EINVAL;
        }

        if (!requires_no_nmi_pmu)
                return 0;

        if (event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)
                return 0;

        return -EINVAL;
}

static bool
perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
{
        unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
        bool is_capable = perfmon_capable();

        if (attr->sigtrap) {
                /*
                 * perf_event_attr::sigtrap sends signals to the other task.
                 * Require the current task to also have CAP_KILL.
                 */
                rcu_read_lock();
                is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
                rcu_read_unlock();

                /*
                 * If the required capabilities aren't available, checks for
                 * ptrace permissions: upgrade to ATTACH, since sending signals
                 * can effectively change the target task.
                 */
                ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
        }

        /*
         * Preserve ptrace permission check for backwards compatibility. The
         * ptrace check also includes checks that the current task and other
         * task have matching uids, and is therefore not done here explicitly.
         */
        return is_capable || ptrace_may_access(task, ptrace_mode);
}

/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:  event_id type attributes for monitoring/sampling
 * @pid:        target pid (or a cgroup fd when PERF_FLAG_PID_CGROUP is set)
 * @cpu:        target cpu
 * @group_fd:   group leader event fd (-1 for none)
 * @flags:      perf event open flags
 *
 * Return: the new event file descriptor on success, a negative errno
 * on failure.
 */
SYSCALL_DEFINE5(perf_event_open,
                struct perf_event_attr __user *, attr_uptr,
                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
        struct perf_event *group_leader = NULL, *output_event = NULL;
        struct perf_event_pmu_context *pmu_ctx;
        struct perf_event *event, *sibling;
        struct perf_event_attr attr;
        struct perf_event_context *ctx;
        struct file *event_file = NULL;
        struct fd group = {NULL, 0};
        struct task_struct *task = NULL;
        struct pmu *pmu;
        int event_fd;
        int move_group = 0;
        int err;
        int f_flags = O_RDWR;
        int cgroup_fd = -1;

        /* for future expandability... */
        if (flags & ~PERF_FLAG_ALL)
                return -EINVAL;

        err = perf_copy_attr(attr_uptr, &attr);
        if (err)
                return err;

        /* Do we allow access to perf_event_open(2) ? */
        err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
        if (err)
                return err;

        if (!attr.exclude_kernel) {
                err = perf_allow_kernel(&attr);
                if (err)
                        return err;
        }

        if (attr.namespaces) {
                if (!perfmon_capable())
                        return -EACCES;
        }

        /*
         * In frequency mode the requested rate is capped by the sysctl;
         * in period mode the top bit of the period must be clear.
         */
        if (attr.freq) {
                if (attr.sample_freq > sysctl_perf_event_sample_rate)
                        return -EINVAL;
        } else {
                if (attr.sample_period & (1ULL << 63))
                        return -EINVAL;
        }

        /* Only privileged users can get physical addresses */
        if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
                err = perf_allow_kernel(&attr);
                if (err)
                        return err;
        }

        /* REGS_INTR can leak data, lockdown must prevent this */
        if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
                err = security_locked_down(LOCKDOWN_PERF);
                if (err)
                        return err;
        }

        /*
         * In cgroup mode, the pid argument is used to pass the fd
         * opened to the cgroup directory in cgroupfs. The cpu argument
         * designates the cpu on which to monitor threads from that
         * cgroup.
         */
        if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
                return -EINVAL;

        if (flags & PERF_FLAG_FD_CLOEXEC)
                f_flags |= O_CLOEXEC;

        /*
         * Reserve the fd number now; it is only installed at the very end
         * and released via put_unused_fd() on every error path.
         */
        event_fd = get_unused_fd_flags(f_flags);
        if (event_fd < 0)
                return event_fd;

        if (group_fd != -1) {
                err = perf_fget_light(group_fd, &group);
                if (err)
                        goto err_fd;
                group_leader = group.file->private_data;
                /*
                 * The event behind group_fd can serve as output target,
                 * as group leader, or both, depending on the flags.
                 */
                if (flags & PERF_FLAG_FD_OUTPUT)
                        output_event = group_leader;
                if (flags & PERF_FLAG_FD_NO_GROUP)
                        group_leader = NULL;
        }

        if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
                task = find_lively_task_by_vpid(pid);
                if (IS_ERR(task)) {
                        err = PTR_ERR(task);
                        goto err_group_fd;
                }
        }

        /* A per-task event must agree with its group leader on 'inherit'. */
        if (task && group_leader &&
            group_leader->attr.inherit != attr.inherit) {
                err = -EINVAL;
                goto err_task;
        }

        if (flags & PERF_FLAG_PID_CGROUP)
                cgroup_fd = pid;

        event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
                                 NULL, NULL, cgroup_fd);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err_task;
        }

        /* Sampling is refused on PMUs flagged PERF_PMU_CAP_NO_INTERRUPT. */
        if (is_sampling_event(event)) {
                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
                        err = -EOPNOTSUPP;
                        goto err_alloc;
                }
        }

        /*
         * Special case software events and allow them to be part of
         * any hardware group.
         */
        pmu = event->pmu;

        if (attr.use_clockid) {
                err = perf_event_set_clock(event, attr.clockid);
                if (err)
                        goto err_alloc;
        }

        if (pmu->task_ctx_nr == perf_sw_context)
                event->event_caps |= PERF_EV_CAP_SOFTWARE;

        if (task) {
                err = down_read_interruptible(&task->signal->exec_update_lock);
                if (err)
                        goto err_alloc;

                /*
                 * We must hold exec_update_lock across this and any potential
                 * perf_install_in_context() call for this new event to
                 * serialize against exec() altering our credentials (and the
                 * perf_event_exit_task() that could imply).
                 */
                err = -EACCES;
                if (!perf_check_permission(&attr, task))
                        goto err_cred;
        }

        /*
         * Get the target context (task or percpu):
         */
        ctx = find_get_context(task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_cred;
        }

        mutex_lock(&ctx->mutex);

        if (ctx->task == TASK_TOMBSTONE) {
                err = -ESRCH;
                goto err_locked;
        }

        if (!task) {
                /*
                 * Check if the @cpu we're creating an event for is online.
                 *
                 * We use the perf_cpu_context::ctx::mutex to serialize against
                 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
                 */
                struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);

                if (!cpuctx->online) {
                        err = -ENODEV;
                        goto err_locked;
                }
        }

        if (group_leader) {
                /* Every check in this block fails with -EINVAL. */
                err = -EINVAL;

                /*
                 * Do not allow a recursive hierarchy (this new sibling
                 * becoming part of another group-sibling):
                 */
                if (group_leader->group_leader != group_leader)
                        goto err_locked;

                /* All events in a group should have the same clock */
                if (group_leader->clock != event->clock)
                        goto err_locked;

                /*
                 * Make sure we're both events for the same CPU;
                 * grouping events for different CPUs is broken; since
                 * you can never concurrently schedule them anyhow.
                 */
                if (group_leader->cpu != event->cpu)
                        goto err_locked;

                /*
                 * Make sure we're both on the same context; either task or cpu.
                 */
                if (group_leader->ctx != ctx)
                        goto err_locked;

                /*
                 * Only a group leader can be exclusive or pinned
                 */
                if (attr.exclusive || attr.pinned)
                        goto err_locked;

                if (is_software_event(event) &&
                    !in_software_context(group_leader)) {
                        /*
                         * If the event is a sw event, but the group_leader
                         * is on hw context.
                         *
                         * Allow the addition of software events to hw
                         * groups, this is safe because software events
                         * never fail to schedule.
                         *
                         * Note the comment that goes with struct
                         * perf_event_pmu_context.
                         */
                        pmu = group_leader->pmu_ctx->pmu;
                } else if (!is_software_event(event)) {
                        if (is_software_event(group_leader) &&
                            (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
                                /*
                                 * In case the group is a pure software group, and we
                                 * try to add a hardware event, move the whole group to
                                 * the hardware context.
                                 */
                                move_group = 1;
                        }

                        /* Don't allow group of multiple hw events from different pmus */
                        if (!in_software_context(group_leader) &&
                            group_leader->pmu_ctx->pmu != pmu)
                                goto err_locked;
                }
        }

        /*
         * Now that we're certain of the pmu; find the pmu_ctx.
         */
        pmu_ctx = find_get_pmu_context(pmu, ctx, event);
        if (IS_ERR(pmu_ctx)) {
                err = PTR_ERR(pmu_ctx);
                goto err_locked;
        }
        event->pmu_ctx = pmu_ctx;

        if (output_event) {
                err = perf_event_set_output(event, output_event);
                if (err)
                        goto err_context;
        }

        if (!perf_event_validate_size(event)) {
                err = -E2BIG;
                goto err_context;
        }

        if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
                err = -EINVAL;
                goto err_context;
        }

        /*
         * Must be under the same ctx::mutex as perf_install_in_context(),
         * because we need to serialize with concurrent event creation.
         */
        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
                goto err_context;
        }

        WARN_ON_ONCE(ctx->parent_ctx);

        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
        if (IS_ERR(event_file)) {
                err = PTR_ERR(event_file);
                event_file = NULL;
                goto err_context;
        }

        /*
         * This is the point of no return; we cannot fail hereafter. This is
         * where we start modifying current state.
         */

        if (move_group) {
                /*
                 * Take the leader and all its siblings out of their context
                 * and drop their pmu_ctx references; they are re-homed on
                 * the new pmu_ctx below.
                 */
                perf_remove_from_context(group_leader, 0);
                put_pmu_ctx(group_leader->pmu_ctx);

                for_each_sibling_event(sibling, group_leader) {
                        perf_remove_from_context(sibling, 0);
                        put_pmu_ctx(sibling->pmu_ctx);
                }

                /*
                 * Install the group siblings before the group leader.
                 *
                 * Because a group leader will try and install the entire group
                 * (through the sibling list, which is still intact), we can
                 * end up with siblings installed in the wrong context.
                 *
                 * By installing siblings first we NO-OP because they're not
                 * reachable through the group lists.
                 */
                for_each_sibling_event(sibling, group_leader) {
                        sibling->pmu_ctx = pmu_ctx;
                        get_pmu_ctx(pmu_ctx);
                        perf_event__state_init(sibling);
                        perf_install_in_context(ctx, sibling, sibling->cpu);
                }

                /*
                 * Removing from the context ends up with a disabled
                 * event. What we want here is the event in the initial
                 * startup state, ready to be added into the new context.
                 */
                group_leader->pmu_ctx = pmu_ctx;
                get_pmu_ctx(pmu_ctx);
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
        }

        /*
         * Precalculate sample_data sizes; do while holding ctx::mutex such
         * that we're serialized against further additions and before
         * perf_install_in_context() which is the point the event is active and
         * can use these values.
         */
        perf_event__header_size(event);
        perf_event__id_header_size(event);

        event->owner = current;

        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);

        mutex_unlock(&ctx->mutex);

        if (task) {
                up_read(&task->signal->exec_update_lock);
                put_task_struct(task);
        }

        /* Track the new event on the opening task's list of owned events. */
        mutex_lock(&current->perf_event_mutex);
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);

        /*
         * Drop the reference on the group_event after placing the
         * new event on the sibling_list. This ensures destruction
         * of the group leader will find the pointer to itself in
         * perf_group_detach().
         */
        fdput(group);
        fd_install(event_fd, event_file);
        return event_fd;

        /*
         * Error unwinding: the labels are ordered so that each failure point
         * jumps to the label that releases everything acquired before it.
         */
err_context:
        put_pmu_ctx(event->pmu_ctx);
        event->pmu_ctx = NULL; /* _free_event() */
err_locked:
        mutex_unlock(&ctx->mutex);
        perf_unpin_context(ctx);
        put_ctx(ctx);
err_cred:
        if (task)
                up_read(&task->signal->exec_update_lock);
err_alloc:
        free_event(event);
err_task:
        if (task)
                put_task_struct(task);
err_group_fd:
        fdput(group);
err_fd:
        put_unused_fd(event_fd);
        return err;
}

/**
 * perf_event_create_kernel_counter - create and install an in-kernel event
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu in which the counter is bound
 * @task: task to profile (NULL for percpu)
 * @overflow_handler: callback to trigger when we hit the event
 * @context: context data could be used in overflow_handler callback
 *
 * Return: the new event on success, an ERR_PTR() encoded errno on failure
 * (-EINVAL when @attr requests aux_output, -ESRCH when the target context
 * is already dead, -ENODEV when the per-cpu context is offline, -EBUSY
 * when the exclusive-event check fails, or the error reported by
 * perf_event_alloc(), find_get_context() or find_get_pmu_context()).
 */
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
                                 struct task_struct *task,
                                 perf_overflow_handler_t overflow_handler,
                                 void *context)
{
        struct perf_event_pmu_context *pmu_ctx;
        struct perf_event_context *ctx;
        struct perf_event *event;
        struct pmu *pmu;
        int err;

        /*
         * Grouping is not supported for kernel events, neither is 'AUX',
         * make sure the caller's intentions are adjusted.
         */
        if (attr->aux_output)
                return ERR_PTR(-EINVAL);

        /* No group leader, no parent and no cgroup fd (-1) for kernel events. */
        event = perf_event_alloc(attr, cpu, task, NULL, NULL,
                                 overflow_handler, context, -1);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err;
        }

        /* Mark owner so we could distinguish it from user events. */
        event->owner = TASK_TOMBSTONE;
        pmu = event->pmu;

        if (pmu->task_ctx_nr == perf_sw_context)
                event->event_caps |= PERF_EV_CAP_SOFTWARE;

        /*
         * Get the target context (task or percpu):
         */
        ctx = find_get_context(task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_alloc;
        }

        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        if (ctx->task == TASK_TOMBSTONE) {
                err = -ESRCH;
                goto err_unlock;
        }

        pmu_ctx = find_get_pmu_context(pmu, ctx, event);
        if (IS_ERR(pmu_ctx)) {
                err = PTR_ERR(pmu_ctx);
                goto err_unlock;
        }
        event->pmu_ctx = pmu_ctx;

        if (!task) {
                /*
                 * Check if the @cpu we're creating an event for is online.
                 *
                 * We use the perf_cpu_context::ctx::mutex to serialize against
                 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
                 */
                struct perf_cpu_context *cpuctx =
                        container_of(ctx, struct perf_cpu_context, ctx);
                if (!cpuctx->online) {
                        err = -ENODEV;
                        goto err_pmu_ctx;
                }
        }

        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
                goto err_pmu_ctx;
        }

        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);

        return event;

        /*
         * Error unwinding: each label releases what was acquired before the
         * corresponding failure point, in reverse order of acquisition.
         */
err_pmu_ctx:
        put_pmu_ctx(pmu_ctx);
        event->pmu_ctx = NULL; /* _free_event() */
err_unlock:
        mutex_unlock(&ctx->mutex);
        perf_unpin_context(ctx);
        put_ctx(ctx);
err_alloc:
        free_event(event);
err:
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);

static void __perf_pmu_remove(struct perf_event_context *ctx,
                              int cpu, struct pmu *pmu,
                              struct perf_event_groups *groups,
                              struct list_head *events)
{
        struct perf_event *event, *sibling;

        perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
                perf_remove_from_context(event, 0);
                put_pmu_ctx(event->pmu_ctx);
                list_add(&event->migrate_entry, events);

                for_each_sibling_event(sibling, event) {
                        perf_remove_from_context(sibling, 0);
                        put_pmu_ctx(sibling->pmu_ctx);
                        list_add(&sibling->migrate_entry, events);
                }
        }
}

/*
 * Re-home one migrated event: bind it to @cpu, look up its pmu context in
 * @ctx, install it there and finally release the context it came from.
 */
static void __perf_pmu_install_event(struct pmu *pmu,
                                     struct perf_event_context *ctx,
                                     int cpu, struct perf_event *event)
{
        struct perf_event_context *prev_ctx = event->ctx;

        /* This reference would normally come from find_get_context(). */
        get_ctx(ctx);

        event->cpu = cpu;
        event->pmu_ctx = find_get_pmu_context(pmu, ctx, event);

        /* Events in state OFF or above are installed as INACTIVE. */
        if (event->state >= PERF_EVENT_STATE_OFF)
                event->state = PERF_EVENT_STATE_INACTIVE;

        perf_install_in_context(ctx, event, cpu);

        /*
         * event->ctx has been updated and is visible by now, so the
         * previous context can be put.
         */
        put_ctx(prev_ctx);
}

/*
 * Install all events queued on @events into @ctx on @cpu, in two passes:
 * siblings first, group leaders second.
 */
static void __perf_pmu_install(struct perf_event_context *ctx,
                               int cpu, struct pmu *pmu, struct list_head *events)
{
        struct perf_event *cur, *nxt;

        /*
         * First pass: siblings only, group leaders stay queued.
         *
         * A sibling is never enabled without its leader, whereas a leader
         * enables its siblings even when those still sit on the old
         * context - hence the leaders have to wait.
         */
        list_for_each_entry_safe(cur, nxt, events, migrate_entry) {
                if (cur->group_leader != cur) {
                        list_del(&cur->migrate_entry);
                        __perf_pmu_install_event(pmu, ctx, cpu, cur);
                }
        }

        /*
         * Second pass: what is left on the list are the group leaders;
         * with all siblings in place, installing them makes the groups go.
         */
        list_for_each_entry_safe(cur, nxt, events, migrate_entry) {
                list_del(&cur->migrate_entry);
                __perf_pmu_install_event(pmu, ctx, cpu, cur);
        }
}

void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
{
        struct perf_event_context *src_ctx, *dst_ctx;
        LIST_HEAD(events);

        /*
         * Since per-cpu context is persistent, no need to grab an extra
         * reference.
         */
        src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
        dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;

        /*
         * See perf_event_ctx_lock() for comments on the details
         * of swizzling perf_event::ctx.
         */
        mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);

        __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
        __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);

        if (!list_empty(&events)) {
                /*
                 * Wait for the events to quiesce before re-instating them.
                 */
                synchronize_rcu();

                __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
        }

        mutex_unlock(&dst_ctx->mutex);
        mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);

/*
 * Fold the count and the enabled/running times of @child_event into the
 * child_* accumulators of its parent event.
 */
static void sync_child_event(struct perf_event *child_event)
{
        struct perf_event *parent = child_event->parent;
        struct task_struct *owner;
        u64 count;

        if (child_event->attr.inherit_stat) {
                owner = child_event->ctx->task;

                /* Only emit the read event for a real, live task pointer. */
                if (owner && owner != TASK_TOMBSTONE)
                        perf_event_read_event(child_event, owner);
        }

        count = perf_event_count(child_event);

        /* Hand the child's numbers back to the parent. */
        atomic64_add(count, &parent->child_count);
        atomic64_add(child_event->total_time_enabled,
                     &parent->child_total_time_enabled);
        atomic64_add(child_event->total_time_running,
                     &parent->child_total_time_running);
}

static void
perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_event *parent_event = event->parent;
        unsigned long detach_flags = 0;

        if (parent_event) {
                /*
                 * Do not destroy the 'original' grouping; because of the
                 * context switch optimization the original events could've
                 * ended up in a random child task.
                 *
                 * If we were to destroy the original group, all group related
                 * operations would cease to function properly after this
                 * random child dies.
                 *
                 * Do destroy all inherited groups, we don't care about those
                 * and being thorough is better.
                 */
                detach_flags = DETACH_GROUP | DETACH_CHILD;
                mutex_lock(&parent_event->child_mutex);
        }

        perf_remove_from_context(event, detach_flags);

        raw_spin_lock_irq(&ctx->lock);
        if (event->state > PERF_EVENT_STATE_EXIT)
                perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
        raw_spin_unlock_irq(&ctx->lock);

        /*
         * Child events can be freed.
         */
        if (parent_event) {
                mutex_unlock(&parent_event->child_mutex);
                /*
                 * Kick perf_poll() for is_event_hup();
                 */
                perf_event_wakeup(parent_event);
                free_event(event);
                put_event(parent_event);
                return;
        }

        /*
         * Parent events are governed by their filedesc, retain them.
         */
        perf_event_wakeup(event);
}

/*
 * Tear down the perf event context of an exiting task.
 *
 * Expected to be called by the exiting task itself: @child != current
 * triggers a one-time warning.  If the task has a context, it is
 * de-scheduled, detached from the task and marked dead in a single
 * ctx::lock section; afterwards every event on the context's event_list is
 * handed to perf_event_exit_event().  Does nothing when the task has no
 * context.
 */
static void perf_event_exit_task_context(struct task_struct *child)
{
        struct perf_event_context *child_ctx, *clone_ctx = NULL;
        struct perf_event *child_event, *next;

        WARN_ON_ONCE(child != current);

        child_ctx = perf_pin_task_context(child);
        if (!child_ctx)
                return;

        /*
         * In order to reduce the amount of tricky in ctx tear-down, we hold
         * ctx::mutex over the entire thing. This serializes against almost
         * everything that wants to access the ctx.
         *
         * The exception is sys_perf_event_open() /
         * perf_event_create_kernel_count() which does find_get_context()
         * without ctx::mutex (it cannot because of the move_group double mutex
         * lock thing). See the comments in perf_install_in_context().
         */
        mutex_lock(&child_ctx->mutex);

        /*
         * In a single ctx::lock section, de-schedule the events and detach the
         * context from the task such that we cannot ever get it scheduled back
         * in.
         */
        raw_spin_lock_irq(&child_ctx->lock);
        task_ctx_sched_out(child_ctx, EVENT_ALL);

        /*
         * Now that the context is inactive, destroy the task <-> ctx relation
         * and mark the context dead.
         */
        RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
        put_ctx(child_ctx); /* cannot be last */
        WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
        put_task_struct(current); /* cannot be last */

        /* Any non-NULL result is released below, after dropping ctx::lock. */
        clone_ctx = unclone_ctx(child_ctx);
        raw_spin_unlock_irq(&child_ctx->lock);

        if (clone_ctx)
                put_ctx(clone_ctx);

        /*
         * Report the task dead after unscheduling the events so that we
         * won't get any samples after PERF_RECORD_EXIT. We can however still
         * get a few PERF_RECORD_READ events.
         */
        perf_event_task(child, child_ctx, 0);

        /* The _safe iterator is used because each event is taken off the list. */
        list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
                perf_event_exit_event(child_event, child_ctx);

        mutex_unlock(&child_ctx->mutex);

        /* presumably drops the reference obtained by perf_pin_task_context() -- confirm */
        put_ctx(child_ctx);
}

/*
 * When a child task exits, feed back event values to parent events.
 *
 * First disowns every event on @child's perf_event_list (unlinking it and
 * clearing event->owner) under child->perf_event_mutex, then tears down the
 * task's event context, and finally calls perf_event_task() once more
 * without a task context.
 *
 * Can be called with exec_update_lock held when called from
 * setup_new_exec().
 */
void perf_event_exit_task(struct task_struct *child)
{
        struct perf_event *event, *tmp;

        mutex_lock(&child->perf_event_mutex);
        list_for_each_entry_safe(event, tmp, &child->perf_event_list,
                                 owner_entry) {
                list_del_init(&event->owner_entry);

                /*
                 * Ensure the list deletion is visible before we clear
                 * the owner, closes a race against perf_release() where
                 * we need to serialize on the owner->perf_event_mutex.
                 */
                smp_store_release(&event->owner, NULL);
        }
        mutex_unlock(&child->perf_event_mutex);

        perf_event_exit_task_context(child);

        /*
         * The perf_event_exit_task_context calls perf_event_task
         * with child's task_ctx, which generates EXIT events for
         * child contexts and sets child->perf_event_ctxp to NULL.
         * At this point we need to send EXIT events to cpu contexts.
         */
        perf_event_task(child, NULL, 0);
}

/*
 * Free an inherited @event that sits in @ctx.
 *
 * Only valid for events with a parent; an event without one triggers a
 * one-time warning and is left untouched.  The event is unlinked from the
 * parent's child_list under the parent's child_mutex, a reference on the
 * parent is dropped, and the event is detached from its group and from @ctx
 * under ctx->lock before being freed.
 */
static void perf_free_event(struct perf_event *event,
                            struct perf_event_context *ctx)
{
        struct perf_event *parent = event->parent;

        if (WARN_ON_ONCE(!parent))
                return;

        mutex_lock(&parent->child_mutex);
        list_del_init(&event->child_list);
        mutex_unlock(&parent->child_mutex);

        put_event(parent);

        raw_spin_lock_irq(&ctx->lock);
        perf_group_detach(event);
        list_del_event(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
        free_event(event);
}

/*
 * Free a context as created by inheritance by perf_event_init_task() below,
 * used by fork() in case of fail.
 *
 * Even though the task has never lived, the context and events have been
 * exposed through the child_list, so we must take care tearing it all down.
 *
 * Does nothing when @task has no context.  Otherwise the context is detached
 * from the task and marked dead, every event on it is released through
 * perf_free_event(), and the function then waits until the context refcount
 * has dropped to 1 before putting that last reference.
 */
void perf_event_free_task(struct task_struct *task)
{
        struct perf_event_context *ctx;
        struct perf_event *event, *tmp;

        ctx = rcu_access_pointer(task->perf_event_ctxp);
        if (!ctx)
                return;

        mutex_lock(&ctx->mutex);
        raw_spin_lock_irq(&ctx->lock);
        /*
         * Destroy the task <-> ctx relation and mark the context dead.
         *
         * This is important because even though the task hasn't been
         * exposed yet the context has been (through child_list).
         */
        RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
        WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
        put_task_struct(task); /* cannot be last */
        raw_spin_unlock_irq(&ctx->lock);


        /* The _safe iterator is used because perf_free_event() unlinks each event. */
        list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
                perf_free_event(event, ctx);

        mutex_unlock(&ctx->mutex);

        /*
         * perf_event_release_kernel() could've stolen some of our
         * child events and still have them on its free_list. In that
         * case we must wait for these events to have been freed (in
         * particular all their references to this task must've been
         * dropped).
         *
         * Without this copy_process() will unconditionally free this
         * task (irrespective of its reference count) and
         * _free_event()'s put_task_struct(event->hw.target) will be a
         * use-after-free.
         *
         * Wait for all events to drop their context reference.
         */
        wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
        put_ctx(ctx); /* must be last */
}

/*
 * Sanity check: warn (once) if @task still has a perf event context
 * attached.  Performs no cleanup itself.
 */
void perf_event_delayed_put(struct task_struct *task)
{
        WARN_ON_ONCE(task->perf_event_ctxp);
}

/*
 * Look up @fd and return its struct file, but only if it is a perf event
 * file (its f_op is perf_fops).
 *
 * On success the caller owns the reference taken by fget().  Returns
 * ERR_PTR(-EBADF) if @fd is not open or is not a perf event file; in the
 * latter case the reference is dropped again before returning.
 */
struct file *perf_event_get(unsigned int fd)
{
        struct file *filp;

        filp = fget(fd);
        if (filp && filp->f_op == &perf_fops)
                return filp;

        /* Open, but not one of ours: give the reference back. */
        if (filp)
                fput(filp);

        return ERR_PTR(-EBADF);
}

/*
 * Return the perf event stored in @file's private_data, or
 * ERR_PTR(-EINVAL) when @file is not a perf event file.
 */
const struct perf_event *perf_get_event(struct file *file)
{
        const struct perf_event *event = ERR_PTR(-EINVAL);

        if (file->f_op == &perf_fops)
                event = file->private_data;

        return event;
}

/*
 * Return a pointer to @event's attributes, or ERR_PTR(-EINVAL) when
 * @event is NULL.
 */
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
        if (event)
                return &event->attr;

        return ERR_PTR(-EINVAL);
}

/*
 * Inherit an event from parent task to child task.
 *
 * Allocates a new event for @child modelled on @parent_event, attaches it to
 * @child_ctx (optionally as a sibling of @group_leader) and links it on the
 * child_list of the original, top-level parent event.
 *
 * Returns:
 *  - valid pointer on success
 *  - NULL for orphaned events
 *  - IS_ERR() on error
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
              struct task_struct *parent,
              struct perf_event_context *parent_ctx,
              struct task_struct *child,
              struct perf_event *group_leader,
              struct perf_event_context *child_ctx)
{
        /* Sampled before @parent_event is redirected to its own parent below. */
        enum perf_event_state parent_state = parent_event->state;
        struct perf_event_pmu_context *pmu_ctx;
        struct perf_event *child_event;
        unsigned long flags;

        /*
         * Instead of creating recursive hierarchies of events,
         * we link inherited events back to the original parent,
         * which has a filp for sure, which we use as the reference
         * count:
         */
        if (parent_event->parent)
                parent_event = parent_event->parent;

        child_event = perf_event_alloc(&parent_event->attr,
                                           parent_event->cpu,
                                           child,
                                           group_leader, parent_event,
                                           NULL, NULL, -1);
        if (IS_ERR(child_event))
                return child_event;

        pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
        if (IS_ERR(pmu_ctx)) {
                free_event(child_event);
                return ERR_CAST(pmu_ctx);
        }
        child_event->pmu_ctx = pmu_ctx;

        /*
         * is_orphaned_event() and list_add_tail(&parent_event->child_list)
         * must be under the same lock in order to serialize against
         * perf_event_release_kernel(), such that either we must observe
         * is_orphaned_event() or they will observe us on the child_list.
         */
        mutex_lock(&parent_event->child_mutex);
        if (is_orphaned_event(parent_event) ||
            !atomic_long_inc_not_zero(&parent_event->refcount)) {
                mutex_unlock(&parent_event->child_mutex);
                /* task_ctx_data is freed with child_ctx */
                free_event(child_event);
                return NULL;
        }

        get_ctx(child_ctx);

        /*
         * Make the child state follow the state of the parent event,
         * not its attr.disabled bit.  We hold the parent's mutex,
         * so we won't race with perf_event_{en, dis}able_family.
         */
        if (parent_state >= PERF_EVENT_STATE_INACTIVE)
                child_event->state = PERF_EVENT_STATE_INACTIVE;
        else
                child_event->state = PERF_EVENT_STATE_OFF;

        /* For frequency-based events, start the child at the parent's current period. */
        if (parent_event->attr.freq) {
                u64 sample_period = parent_event->hw.sample_period;
                struct hw_perf_event *hwc = &child_event->hw;

                hwc->sample_period = sample_period;
                hwc->last_period   = sample_period;

                local64_set(&hwc->period_left, sample_period);
        }

        child_event->ctx = child_ctx;
        child_event->overflow_handler = parent_event->overflow_handler;
        child_event->overflow_handler_context
                = parent_event->overflow_handler_context;

        /*
         * Precalculate sample_data sizes
         */
        perf_event__header_size(child_event);
        perf_event__id_header_size(child_event);

        /*
         * Link it up in the child's context:
         */
        raw_spin_lock_irqsave(&child_ctx->lock, flags);
        add_event_to_ctx(child_event, child_ctx);
        child_event->attach_state |= PERF_ATTACH_CHILD;
        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

        /*
         * Link this into the parent event's child list
         */
        list_add_tail(&child_event->child_list, &parent_event->child_list);
        mutex_unlock(&parent_event->child_mutex);

        return child_event;
}

/*
 * Inherits an event group.
 *
 * The group leader @parent_event is inherited first, then each of its
 * siblings with the new leader as their group leader.
 *
 * This will quietly suppress orphaned events; !inherit_event() is not an error.
 * This matches with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int inherit_group(struct perf_event *parent_event,
              struct task_struct *parent,
              struct perf_event_context *parent_ctx,
              struct task_struct *child,
              struct perf_event_context *child_ctx)
{
        struct perf_event *leader;
        struct perf_event *sub;
        struct perf_event *child_ctr;

        leader = inherit_event(parent_event, parent, parent_ctx,
                                 child, NULL, child_ctx);
        if (IS_ERR(leader))
                return PTR_ERR(leader);
        /*
         * @leader can be NULL here because of is_orphaned_event(). In this
         * case inherit_event() will create individual events, similar to what
         * perf_group_detach() would do anyway.
         */
        for_each_sibling_event(sub, parent_event) {
                child_ctr = inherit_event(sub, parent, parent_ctx,
                                            child, leader, child_ctx);
                if (IS_ERR(child_ctr))
                        return PTR_ERR(child_ctr);

                /*
                 * If the sibling used the parent leader as its aux event,
                 * the inherited sibling must be able to use the inherited
                 * leader the same way; otherwise fail with -EINVAL.
                 */
                if (sub->aux_event == parent_event && child_ctr &&
                    !perf_get_aux_event(child_ctr, leader))
                        return -EINVAL;
        }
        /* Carry the parent's group_generation over to the inherited leader. */
        if (leader)
                leader->group_generation = parent_event->group_generation;
        return 0;
}

/*
 * Inherit one event group of the parent into @child, creating the child's
 * task context on first use.
 *
 * The group is skipped, and *@inherited_all cleared, when the event
 *  - is not marked attr.inherit, or
 *  - is attr.inherit_thread while the clone is not CLONE_THREAD, or
 *  - is attr.sigtrap while the clone uses CLONE_CLEAR_SIGHAND.
 * *@inherited_all is also cleared when inherit_group() fails.  It is left
 * untouched when an orphaned event is quietly not inherited, consistent
 * with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success (including a skipped group)
 *  - <0 on error
 */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
                   struct perf_event_context *parent_ctx,
                   struct task_struct *child,
                   u64 clone_flags, int *inherited_all)
{
        struct perf_event_context *ctx;
        bool wanted = event->attr.inherit;
        int err;

        if (wanted && event->attr.inherit_thread)
                wanted = !!(clone_flags & CLONE_THREAD);

        /* Do not inherit if sigtrap and signal handlers were cleared. */
        if (wanted && event->attr.sigtrap)
                wanted = !(clone_flags & CLONE_CLEAR_SIGHAND);

        if (!wanted) {
                *inherited_all = 0;
                return 0;
        }

        ctx = child->perf_event_ctxp;
        if (!ctx) {
                /*
                 * We run in the parent task's context and are about to
                 * inherit events marked for cloning, so the child needs a
                 * context of its own: allocate and install it first.
                 */
                ctx = alloc_perf_context(child);
                if (!ctx)
                        return -ENOMEM;

                child->perf_event_ctxp = ctx;
        }

        err = inherit_group(event, parent, parent_ctx, child, ctx);
        if (err)
                *inherited_all = 0;

        return err;
}

/*
 * Initialize the perf_event context in task_struct
 *
 * Inherits the events of the forking task (current) into @child.  The
 * pinned and then the flexible groups of the parent context are walked under
 * parent_ctx->mutex and handed to inherit_task_group().  If a child context
 * was created and every group was inherited, the child context is also
 * marked as a clone of the parent context (or of whatever the parent context
 * is itself a clone of).
 *
 * Returns 0 on success, including when the parent has no context, or the
 * negative error returned by inherit_task_group().
 */
static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
        struct perf_event_context *child_ctx, *parent_ctx;
        struct perf_event_context *cloned_ctx;
        struct perf_event *event;
        struct task_struct *parent = current;
        int inherited_all = 1;
        unsigned long flags;
        int ret = 0;

        if (likely(!parent->perf_event_ctxp))
                return 0;

        /*
         * If the parent's context is a clone, pin it so it won't get
         * swapped under us.
         */
        parent_ctx = perf_pin_task_context(parent);
        if (!parent_ctx)
                return 0;

        /*
         * No need to check if parent_ctx != NULL here; since we saw
         * it non-NULL earlier, the only reason for it to become NULL
         * is if we exit, and since we're currently in the middle of
         * a fork we can't be exiting at the same time.
         */

        /*
         * Lock the parent list. No need to lock the child - not PID
         * hashed yet and not running, so nobody can access it.
         */
        mutex_lock(&parent_ctx->mutex);

        /*
         * We don't have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
        perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, clone_flags, &inherited_all);
                if (ret)
                        goto out_unlock;
        }

        /*
         * We can't hold ctx->lock when iterating the ->flexible_group list due
         * to allocations, but we need to prevent rotation because
         * rotate_ctx() will change the list from interrupt context.
         */
        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 1;
        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

        perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, clone_flags, &inherited_all);
                if (ret) {
                        /*
                         * Rotation was disabled above for the duration of
                         * this walk only.  Re-enable it before bailing out,
                         * otherwise a failed fork would leave rotation of
                         * the parent's context disabled for good.
                         */
                        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
                        parent_ctx->rotate_disable = 0;
                        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
                        goto out_unlock;
                }
        }

        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 0;

        child_ctx = child->perf_event_ctxp;

        if (child_ctx && inherited_all) {
                /*
                 * Mark the child context as a clone of the parent
                 * context, or of whatever the parent is a clone of.
                 *
                 * Note that if the parent is a clone, the holding of
                 * parent_ctx->lock avoids it from being uncloned.
                 */
                cloned_ctx = parent_ctx->parent_ctx;
                if (cloned_ctx) {
                        child_ctx->parent_ctx = cloned_ctx;
                        child_ctx->parent_gen = parent_ctx->parent_gen;
                } else {
                        child_ctx->parent_ctx = parent_ctx;
                        child_ctx->parent_gen = parent_ctx->generation;
                }
                get_ctx(child_ctx->parent_ctx);
        }

        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
out_unlock:
        mutex_unlock(&parent_ctx->mutex);

        perf_unpin_context(parent_ctx);
        put_ctx(parent_ctx);

        return ret;
}

/*
 * Set up the perf event state of a newly forked task.
 *
 * Resets @child's context pointer, owner mutex and owned-event list, then
 * inherits events from the forking task.  If inheritance fails, whatever
 * was set up so far is torn down again via perf_event_free_task().
 *
 * Returns 0 on success or the negative error from
 * perf_event_init_context().
 */
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
        int err;

        child->perf_event_ctxp = NULL;
        mutex_init(&child->perf_event_mutex);
        INIT_LIST_HEAD(&child->perf_event_list);

        err = perf_event_init_context(child, clone_flags);
        if (!err)
                return 0;

        perf_event_free_task(child);
        return err;
}

/*
 * Boot-time initialisation of the per-CPU perf state.
 *
 * Allocates the (zeroed) perf_online_mask and then, for every possible CPU,
 * initialises the software-event hash mutex, the pmu_sb_events list and
 * lock, the sched_cb_list and the CPU context.
 */
static void __init perf_event_init_all_cpus(void)
{
        struct swevent_htable *swhash;
        struct perf_cpu_context *cpuctx;
        int cpu;

        zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);

        for_each_possible_cpu(cpu) {
                swhash = &per_cpu(swevent_htable, cpu);
                mutex_init(&swhash->hlist_mutex);

                INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
                raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

                INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));

                cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
                __perf_event_init_context(&cpuctx->ctx);
                /* CPU contexts get their own lockdep classes for mutex and lock. */
                lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
                /* Start out with the heap storage embedded in the cpu context. */
                cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
                cpuctx->heap = cpuctx->heap_default;
        }
}

/*
 * Make sure @cpu has a software-event hash list if one is needed.
 *
 * Under the per-CPU hlist_mutex: when the hash is referenced
 * (hlist_refcount > 0) but no list is installed yet, allocate a zeroed one
 * on @cpu's memory node and publish it.  An allocation failure only
 * triggers a WARN; the resulting NULL pointer is still published.
 */
static void perf_swevent_init_cpu(unsigned int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

        mutex_lock(&swhash->hlist_mutex);
        if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
                struct swevent_hlist *hlist;

                hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
                WARN_ON(!hlist);
                rcu_assign_pointer(swhash->swevent_hlist, hlist);
        }
        mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
/*
 * Cross-call callback used by perf_event_exit_cpu_context(); @__info is the
 * perf_event_context to empty.
 *
 * Under ctx->lock, calls ctx_sched_out() with EVENT_TIME and then removes
 * every event on the context's event_list with DETACH_GROUP, using the
 * executing CPU's perf_cpu_context.
 */
static void __perf_event_exit_context(void *__info)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *ctx = __info;
        struct perf_event *event;

        raw_spin_lock(&ctx->lock);
        ctx_sched_out(ctx, EVENT_TIME);
        /*
         * NOTE(review): the non-_safe iterator is used while entries are
         * being removed; presumably the removal leaves the entry's next
         * pointer usable -- confirm against list_del_event().
         */
        list_for_each_entry(event, &ctx->event_list, event_entry)
                __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
        raw_spin_unlock(&ctx->lock);
}

/*
 * Take @cpu's perf context offline.
 *
 * With pmus_lock and the CPU context's mutex held, runs
 * __perf_event_exit_context() on @cpu (waiting for it to finish) and clears
 * cpuctx->online; @cpu is then cleared from perf_online_mask.
 */
static void perf_event_exit_cpu_context(int cpu)
{
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;

        // XXX simplify cpuctx->online
        mutex_lock(&pmus_lock);
        cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
        ctx = &cpuctx->ctx;

        mutex_lock(&ctx->mutex);
        /* Last argument 1: wait until the callback has completed on @cpu. */
        smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
        cpuctx->online = 0;
        mutex_unlock(&ctx->mutex);
        cpumask_clear_cpu(cpu, perf_online_mask);
        mutex_unlock(&pmus_lock);
}
#else

/* No-op when neither CONFIG_HOTPLUG_CPU nor CONFIG_KEXEC_CORE is defined. */
static void perf_event_exit_cpu_context(int cpu) { }

#endif

/*
 * Bring @cpu's perf state online.
 *
 * Sets up the software-event hash for @cpu if needed, then, under pmus_lock,
 * marks @cpu in perf_online_mask and sets cpuctx->online under the CPU
 * context's mutex.  Always returns 0.
 */
int perf_event_init_cpu(unsigned int cpu)
{
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;

        perf_swevent_init_cpu(cpu);

        mutex_lock(&pmus_lock);
        cpumask_set_cpu(cpu, perf_online_mask);
        cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
        ctx = &cpuctx->ctx;

        mutex_lock(&ctx->mutex);
        cpuctx->online = 1;
        mutex_unlock(&ctx->mutex);
        mutex_unlock(&pmus_lock);

        return 0;
}

/*
 * Take @cpu's perf state offline via perf_event_exit_cpu_context().
 * Always returns 0.
 */
int perf_event_exit_cpu(unsigned int cpu)
{
        perf_event_exit_cpu_context(cpu);
        return 0;
}

static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
        int cpu;

        for_each_online_cpu(cpu)
                perf_event_exit_cpu(cpu);

        return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible.
 *
 * This is why the priority is INT_MIN, the lowest value available.
 */
static struct notifier_block perf_reboot_notifier = {
        .notifier_call = perf_reboot,
        .priority = INT_MIN,
};

/*
 * Boot-time initialisation of the perf events subsystem.
 *
 * Sets up the PMU idr and the per-CPU state, registers the built-in
 * software, cpu_clock and task_clock PMUs and the tracepoint support,
 * brings the boot CPU online, installs the reboot notifier, initialises
 * hardware breakpoints (a failure only triggers a WARN) and creates the
 * perf_event slab cache.
 */
void __init perf_event_init(void)
{
        int ret;

        idr_init(&pmu_idr);

        perf_event_init_all_cpus();
        init_srcu_struct(&pmus_srcu);
        perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
        perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
        perf_pmu_register(&perf_task_clock, "task_clock", -1);
        perf_tp_register();
        perf_event_init_cpu(smp_processor_id());
        register_reboot_notifier(&perf_reboot_notifier);

        ret = init_hw_breakpoint();
        WARN(ret, "hw_breakpoint initialization failed with: %d", ret);

        perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);

        /*
         * Build time assertion that we keep the data_head at the intended
         * location.  IOW, validation we got the __reserved[] size right.
         */
        BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
                     != 1024);
}

/*
 * sysfs show callback for PMU event attributes.
 *
 * @attr must be embedded in a struct perf_pmu_events_attr.  Writes the
 * attribute's event string followed by a newline into @page and returns the
 * number of characters written, or returns 0 without touching @page when no
 * event string is set.
 */
ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
                              char *page)
{
        struct perf_pmu_events_attr *pmu_attr;

        pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
        if (!pmu_attr->event_str)
                return 0;

        return sprintf(page, "%s\n", pmu_attr->event_str);
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);

/*
 * Register the PMU bus and create devices for the PMUs registered so far.
 *
 * Runs as a device_initcall under pmus_lock.  PMUs that already have a
 * device are skipped.  A failure to create a device for an individual PMU
 * only triggers a WARN and is not propagated; only a bus_register() failure
 * is returned.  On success pmu_bus_running is set.
 *
 * Returns 0 on success or the error from bus_register().
 */
static int __init perf_event_sysfs_init(void)
{
        struct pmu *pmu;
        int ret;

        mutex_lock(&pmus_lock);

        ret = bus_register(&pmu_bus);
        if (ret)
                goto unlock;

        list_for_each_entry(pmu, &pmus, entry) {
                if (pmu->dev)
                        continue;

                ret = pmu_dev_alloc(pmu);
                WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
        }
        pmu_bus_running = 1;
        /* Per-PMU failures were only warned about above; report success. */
        ret = 0;

unlock:
        mutex_unlock(&pmus_lock);

        return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
/*
 * Allocate a perf cgroup together with its per-CPU info area.
 *
 * @parent_css is not used.  Returns the embedded css on success or
 * ERR_PTR(-ENOMEM) if either allocation fails; nothing is leaked on
 * failure.
 */
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct perf_cgroup *cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);

        if (cgrp) {
                cgrp->info = alloc_percpu(struct perf_cgroup_info);
                if (cgrp->info)
                        return &cgrp->css;

                /* Per-CPU area unavailable: undo the first allocation. */
                kfree(cgrp);
        }

        return ERR_PTR(-ENOMEM);
}

/*
 * Release a perf cgroup: its per-CPU info area first, then the perf_cgroup
 * that embeds @css.
 */
static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
        struct perf_cgroup *cgrp;

        cgrp = container_of(css, struct perf_cgroup, css);

        free_percpu(cgrp->info);
        kfree(cgrp);
}

/*
 * css_online callback: hands the cgroup behind @css to perf_event_cgroup().
 * Always returns 0.
 */
static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
{
        perf_event_cgroup(css->cgroup);
        return 0;
}

/*
 * Callback for task_function_call(); @info is the task_struct being moved.
 *
 * Calls perf_cgroup_switch() for that task with preemption disabled.
 * Always returns 0.
 */
static int __perf_cgroup_move(void *info)
{
        struct task_struct *task = info;

        preempt_disable();
        perf_cgroup_switch(task);
        preempt_enable();

        return 0;
}

/*
 * attach callback: for every task in @tset, run __perf_cgroup_move() on
 * that task through task_function_call().  The return value of the call is
 * not checked.
 */
static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        /* Only needed as the second cursor of cgroup_taskset_for_each(). */
        struct cgroup_subsys_state *css;

        cgroup_taskset_for_each(task, css, tset)
                task_function_call(task, __perf_cgroup_move, task);
}

/* Callbacks and flags describing the perf_event cgroup controller. */
struct cgroup_subsys perf_event_cgrp_subsys = {
        .css_alloc        = perf_cgroup_css_alloc,
        .css_free        = perf_cgroup_css_free,
        .css_online        = perf_cgroup_css_online,
        .attach                = perf_cgroup_attach,
        /*
         * Implicitly enable on dfl hierarchy so that perf events can
         * always be filtered by cgroup2 path as long as perf_event
         * controller is not mounted on a legacy hierarchy.
         */
        .implicit_on_dfl = true,
        .threaded        = true,
};
#endif /* CONFIG_CGROUP_PERF */

/*
 * Static call for snapshotting the branch stack.
 * NOTE(review): per the static_call API the _RET0 variant presumably makes
 * the call return 0 until a real target is installed -- confirm.
 */
DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);






























































































































































































































































































































    3 


















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_SCSI_CMND_H
#define _SCSI_SCSI_CMND_H

#include <linux/dma-mapping.h>
#include <linux/blkdev.h>
#include <linux/t10-pi.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/scatterlist.h>
#include <scsi/scsi_device.h>

struct Scsi_Host;

/*
 * MAX_COMMAND_SIZE is:
 * The longest fixed-length SCSI CDB as per the SCSI standard.
 * Fixed-length means: commands whose size can be determined
 * by their opcode, where the CDB does not carry a length specifier (unlike
 * the VARIABLE_LENGTH_CMD(0x7f) command). This is actually not exactly
 * true, as the SCSI standard also defines extended commands and
 * vendor specific commands that can be bigger than 16 bytes. The kernel
 * will support these using the same infrastructure used for VARLEN CDBs.
 * So in effect MAX_COMMAND_SIZE means the maximum size of a command that
 * scsi-ml supports without a cmd_len being specified by ULDs.
 */
#define MAX_COMMAND_SIZE 16

/* A data buffer attached to a SCSI command. */
struct scsi_data_buffer {
        struct sg_table table;        /* scatter/gather table holding the data */
        unsigned length;        /* buffer length -- presumably in bytes, confirm */
};

/*
 * embedded in scsi_cmnd
 *
 * NOTE(review): no scsi_pointer member is visible in struct scsi_cmnd in
 * this header; confirm where this structure is actually embedded.
 */
struct scsi_pointer {
        char *ptr;                /* data pointer */
        int this_residual;        /* left in this buffer */
        struct scatterlist *buffer;        /* which buffer */
        int buffers_residual;        /* how many buffers left */

        dma_addr_t dma_handle;

        /* The fields below are volatile-qualified; their meaning is not documented here. */
        volatile int Status;
        volatile int Message;
        volatile int have_data_in;
        volatile int sent_command;
        volatile int phase;
};

/* for scmd->flags: single-bit values that may be OR-ed together */
#define SCMD_TAGGED                (1 << 0)
#define SCMD_INITIALIZED        (1 << 1)
#define SCMD_LAST                (1 << 2)
/*
 * libata uses SCSI EH to fetch sense data for successful commands.
 * SCSI EH should not overwrite scmd->result when SCMD_FORCE_EH_SUCCESS is set.
 */
#define SCMD_FORCE_EH_SUCCESS        (1 << 3)
#define SCMD_FAIL_IF_RECOVERING        (1 << 4)
/* flags preserved across unprep / reprep */
#define SCMD_PRESERVED_FLAGS        (SCMD_INITIALIZED | SCMD_FAIL_IF_RECOVERING)

/*
 * for scmd->state
 * NOTE(review): these look like bit numbers rather than masks (0 and 1,
 * not 1 << n) -- confirm against the users of scmd->state.
 */
#define SCMD_STATE_COMPLETE        0
#define SCMD_STATE_INFLIGHT        1

/*
 * Identifies who submitted a command; stored in scsi_cmnd::submitter.
 * Declared __packed, presumably so it occupies a single byte next to the
 * prot_* fields -- confirm.
 */
enum scsi_cmnd_submitter {
        SUBMITTED_BY_BLOCK_LAYER = 0,
        SUBMITTED_BY_SCSI_ERROR_HANDLER = 1,
        SUBMITTED_BY_SCSI_RESET_IOCTL = 2,
} __packed;

/*
 * Per-command state of the SCSI mid-layer.
 *
 * Fields above the "can be modified by the LLD" marker further down must
 * not be modified by low-level drivers; the fields below it may be.
 */
struct scsi_cmnd {
        struct scsi_device *device;
        struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */
        struct delayed_work abort_work;

        struct rcu_head rcu;

        int eh_eflags;                /* Used by error handler */

        int budget_token;

        /*
         * This is set to jiffies as it was when the command was first
         * allocated.  It is used to time how long the command has
         * been outstanding
         */
        unsigned long jiffies_at_alloc;

        int retries;
        int allowed;

        unsigned char prot_op;
        unsigned char prot_type;
        unsigned char prot_flags;
        enum scsi_cmnd_submitter submitter;

        unsigned short cmd_len;
        enum dma_data_direction sc_data_direction;

        unsigned char cmnd[32]; /* SCSI CDB */

        /* These elements define the operation we ultimately want to perform */
        struct scsi_data_buffer sdb;
        struct scsi_data_buffer *prot_sdb;

        unsigned underflow;        /* Return error if less than
                                   this amount is transferred */

        unsigned transfersize;        /* How much we are guaranteed to
                                   transfer with each SCSI transfer
                                   (i.e., between disconnect /
                                   reconnects).  Probably == sector
                                   size */
        unsigned resid_len;        /* residual count */
        unsigned sense_len;
        unsigned char *sense_buffer;
                                /* obtained by REQUEST SENSE when
                                 * CHECK CONDITION is received on original
                                 * command (auto-sense). Length must be
                                 * SCSI_SENSE_BUFFERSIZE bytes. */

        int flags;                /* Command flags (SCMD_* bits) */
        unsigned long state;        /* Command completion state (SCMD_STATE_*) */

        unsigned int extra_len;        /* length of alignment and padding */

        /*
         * The fields below can be modified by the LLD but the fields above
         * must not be modified.
         */

        /*
         * NOTE(review): scsi_malloc/scsi_free are not declared anywhere in
         * this header; confirm the comment below is still current.
         */
        unsigned char *host_scribble;        /* The host adapter is allowed to
                                         * call scsi_malloc and get some memory
                                         * and hang it here.  The host adapter
                                         * is also expected to call scsi_free
                                         * to release this memory.  (The memory
                                         * obtained by scsi_malloc is guaranteed
                                         * to be at an address < 16Mb). */

        int result;                /* Status code from lower level driver */
};

/*
 * Type-checked wrapper around blk_mq_rq_from_pdu(): accepts only a
 * struct scsi_cmnd pointer and returns the request it maps to.
 */
static inline struct request *scsi_cmd_to_rq(struct scsi_cmnd *scmd)
{
        struct request *rq = blk_mq_rq_from_pdu(scmd);

        return rq;
}

/*
 * Locate the driver-private allocation of a command: it is the memory that
 * starts directly after the struct scsi_cmnd itself.
 * Only meaningful if cmd_size is set in the host template.
 */
static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd)
{
        return &cmd[1];
}

/* Function prototypes; "extern" is implicit on function declarations. */
void scsi_done(struct scsi_cmnd *cmd);
void scsi_done_direct(struct scsi_cmnd *cmd);

void scsi_finish_command(struct scsi_cmnd *cmd);

void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
                          size_t *offset, size_t *len);
void scsi_kunmap_atomic_sg(void *virt);

blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd);
void scsi_free_sgtables(struct scsi_cmnd *cmd);

/*
 * DMA mapping helpers.  With CONFIG_SCSI_DMA they are only declared here;
 * without it, stubs are provided: mapping fails with -ENOSYS and unmapping
 * does nothing.
 */
#ifdef CONFIG_SCSI_DMA
int scsi_dma_map(struct scsi_cmnd *cmd);
void scsi_dma_unmap(struct scsi_cmnd *cmd);
#else /* !CONFIG_SCSI_DMA */
static inline int scsi_dma_map(struct scsi_cmnd *cmd)
{
        return -ENOSYS;
}

static inline void scsi_dma_unmap(struct scsi_cmnd *cmd)
{
}
#endif /* !CONFIG_SCSI_DMA */

static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd)
{
        return cmd->sdb.table.nents;
}

/* Scatterlist (table.sgl) of the data buffer attached to @cmd. */
static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd)
{
        struct scatterlist *head = cmd->sdb.table.sgl;

        return head;
}

static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd)
{
        return cmd->sdb.length;
}

/* Record @resid as the residual count of @cmd (stored in cmd->resid_len). */
static inline void scsi_set_resid(struct scsi_cmnd *cmd, unsigned int resid)
{
        cmd->resid_len = resid;
}

/* Read back the residual count (cmd->resid_len) of @cmd. */
static inline unsigned int scsi_get_resid(struct scsi_cmnd *cmd)
{
        unsigned int residual = cmd->resid_len;

        return residual;
}

/*
 * Iterate over the data scatterlist of @cmd: expands to for_each_sg() with
 * scsi_sglist(cmd) as the list; @sg, @nseg and @__i are passed through
 * unchanged.
 */
#define scsi_for_each_sg(cmd, sg, nseg, __i)                        \
        for_each_sg(scsi_sglist(cmd), sg, nseg, __i)

/*
 * Hand @buf / @buflen to sg_copy_from_buffer() together with the data
 * scatterlist and entry count of @cmd, and return whatever it returns.
 */
static inline int scsi_sg_copy_from_buffer(struct scsi_cmnd *cmd,
                                           const void *buf, int buflen)
{
        struct scatterlist *sgl = scsi_sglist(cmd);
        unsigned int nents = scsi_sg_count(cmd);

        return sg_copy_from_buffer(sgl, nents, buf, buflen);
}

/*
 * Hand @buf / @buflen to sg_copy_to_buffer() together with the data
 * scatterlist and entry count of @cmd, and return whatever it returns.
 */
static inline int scsi_sg_copy_to_buffer(struct scsi_cmnd *cmd,
                                         void *buf, int buflen)
{
        struct scatterlist *sgl = scsi_sglist(cmd);
        unsigned int nents = scsi_sg_count(cmd);

        return sg_copy_to_buffer(sgl, nents, buf, buflen);
}

/* Position, as reported by blk_rq_pos(), of the request behind @scmd. */
static inline sector_t scsi_get_sector(struct scsi_cmnd *scmd)
{
        struct request *rq = scsi_cmd_to_rq(scmd);

        return blk_rq_pos(rq);
}

/*
 * Request position of @scmd scaled down to units of the device's
 * sector_size: the blk_rq_pos() value is shifted right by
 * ilog2(sector_size) - SECTOR_SHIFT.
 */
static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd)
{
        struct request *rq = scsi_cmd_to_rq(scmd);
        unsigned int block_shift;

        block_shift = ilog2(scmd->device->sector_size) - SECTOR_SHIFT;

        return blk_rq_pos(rq) >> block_shift;
}

/*
 * Return the size of the request behind @scmd in logical blocks.
 *
 * blk_rq_bytes() reports a byte count, so it has to be shifted by the full
 * log2 of the logical block size.  Subtracting SECTOR_SHIFT is only correct
 * for values expressed in sectors (as done for blk_rq_pos() in
 * scsi_get_lba()); applying it to a byte count would overstate the block
 * count by a factor of 1 << SECTOR_SHIFT.
 */
static inline unsigned int scsi_logical_block_count(struct scsi_cmnd *scmd)
{
        unsigned int shift = ilog2(scmd->device->sector_size);

        return blk_rq_bytes(scsi_cmd_to_rq(scmd)) >> shift;
}

/*
 * Hints telling the controller driver how an I/O carrying DIF (or a
 * similar kind of protection information) is to be handled.  Each group
 * states where the data is protected: between OS and HBA, and between HBA
 * and target.
 */
enum scsi_prot_operations {
        /* Normal I/O */
        SCSI_PROT_NORMAL        = 0,

        /* OS-HBA: Protected, HBA-Target: Unprotected */
        SCSI_PROT_READ_INSERT   = 1,
        SCSI_PROT_WRITE_STRIP   = 2,

        /* OS-HBA: Unprotected, HBA-Target: Protected */
        SCSI_PROT_READ_STRIP    = 3,
        SCSI_PROT_WRITE_INSERT  = 4,

        /* OS-HBA: Protected, HBA-Target: Protected */
        SCSI_PROT_READ_PASS     = 5,
        SCSI_PROT_WRITE_PASS    = 6,
};

/*
 * Store @op in scmd->prot_op.
 * NOTE(review): presumably @op is one of enum scsi_prot_operations - confirm
 * against callers.
 */
static inline void scsi_set_prot_op(struct scsi_cmnd *scmd, unsigned char op)
{
        scmd->prot_op = op;
}

/* Read back the protection operation stored in scmd->prot_op. */
static inline unsigned char scsi_get_prot_op(struct scsi_cmnd *scmd)
{
        unsigned char op = scmd->prot_op;

        return op;
}

/* Single-bit flags; each constant occupies its own bit so they can be OR-ed. */
enum scsi_prot_flags {
        /* Tested by scsi_transfer_length() to add 8 bytes per interval. */
        SCSI_PROT_TRANSFER_PI           = 0x01,
        SCSI_PROT_GUARD_CHECK           = 0x02,
        SCSI_PROT_REF_CHECK             = 0x04,
        SCSI_PROT_REF_INCREMENT         = 0x08,
        SCSI_PROT_IP_CHECKSUM           = 0x10,
};

/*
 * The controller usually does not know anything about the target it
 * is communicating with.  However, when DIX is enabled the controller
 * must know the target type so it can verify the protection
 * information passed along with the I/O.
 */
enum scsi_prot_target_type {
        /* The numeric value equals the DIF type number in the name. */
        SCSI_PROT_DIF_TYPE0     = 0,
        SCSI_PROT_DIF_TYPE1     = 1,
        SCSI_PROT_DIF_TYPE2     = 2,
        SCSI_PROT_DIF_TYPE3     = 3,
};

/*
 * Store @type in scmd->prot_type.
 * NOTE(review): presumably @type is one of enum scsi_prot_target_type -
 * confirm against callers.
 */
static inline void scsi_set_prot_type(struct scsi_cmnd *scmd, unsigned char type)
{
        scmd->prot_type = type;
}

/* Read back the protection type stored in scmd->prot_type. */
static inline unsigned char scsi_get_prot_type(struct scsi_cmnd *scmd)
{
        unsigned char type = scmd->prot_type;

        return type;
}

/*
 * Reference tag for @scmd: the value t10_pi_ref_tag() yields for the
 * request that blk_mq_rq_from_pdu() returns for this command.
 */
static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd)
{
        return t10_pi_ref_tag(blk_mq_rq_from_pdu(scmd));
}

/* The protection interval is the sector_size of the command's device. */
static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd)
{
        unsigned int interval = scmd->device->sector_size;

        return interval;
}

/*
 * Number of entries in the protection scatter table of @cmd, or 0 when the
 * command has no protection buffer (prot_sdb is NULL).
 */
static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd)
{
        if (!cmd->prot_sdb)
                return 0;

        return cmd->prot_sdb->table.nents;
}

/*
 * Scatterlist of the protection buffer of @cmd, or NULL when the command
 * has no protection buffer (prot_sdb is NULL).
 */
static inline struct scatterlist *scsi_prot_sglist(struct scsi_cmnd *cmd)
{
        if (!cmd->prot_sdb)
                return NULL;

        return cmd->prot_sdb->table.sgl;
}

/* Protection data buffer of @cmd; the prot_sdb pointer is returned as is. */
static inline struct scsi_data_buffer *scsi_prot(struct scsi_cmnd *cmd)
{
        struct scsi_data_buffer *prot = cmd->prot_sdb;

        return prot;
}

/*
 * Iterate over the protection scatterlist of @cmd: expands to for_each_sg()
 * with scsi_prot_sglist(cmd) as the list; @sg, @nseg and @__i are passed
 * through unchanged.  Note that scsi_prot_sglist() yields NULL when the
 * command has no protection buffer.
 */
#define scsi_for_each_prot_sg(cmd, sg, nseg, __i)                \
        for_each_sg(scsi_prot_sglist(cmd), sg, nseg, __i)

static inline void set_status_byte(struct scsi_cmnd *cmd, char status)
{
        cmd->result = (cmd->result & 0xffffff00) | status;
}

/* Extract the status byte, i.e. bits 0-7 of cmd->result. */
static inline u8 get_status_byte(struct scsi_cmnd *cmd)
{
        const int status_mask = 0xff;

        return cmd->result & status_mask;
}

static inline void set_host_byte(struct scsi_cmnd *cmd, char status)
{
        cmd->result = (cmd->result & 0xff00ffff) | (status << 16);
}

/* Extract the host byte, i.e. bits 16-23 of cmd->result. */
static inline u8 get_host_byte(struct scsi_cmnd *cmd)
{
        int shifted = cmd->result >> 16;

        return shifted & 0xff;
}

/**
 * scsi_msg_to_host_byte() - translate message byte
 * @cmd: the SCSI command
 * @msg: the SCSI parallel message byte to translate
 *
 * Translate the SCSI parallel message byte into a matching host byte
 * setting. A message of COMMAND_COMPLETE indicates a successful command
 * execution and leaves the host byte alone; any other message indicates
 * an error. As the messages themselves only have a meaning for the SCSI
 * parallel protocol, this function translates them into a matching host
 * byte value for SCSI EH.
 */
static inline void scsi_msg_to_host_byte(struct scsi_cmnd *cmd, u8 msg)
{
        /* Success: nothing to record. */
        if (msg == COMMAND_COMPLETE)
                return;

        if (msg == ABORT_TASK_SET)
                set_host_byte(cmd, DID_ABORT);
        else if (msg == TARGET_RESET)
                set_host_byte(cmd, DID_RESET);
        else
                set_host_byte(cmd, DID_ERROR);
}

/*
 * Transfer length of @scmd: the data buffer length, plus 8 bytes for every
 * complete protection interval when SCSI_PROT_TRANSFER_PI is set in
 * prot_flags.
 */
static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd)
{
        unsigned int data_len = scmd->sdb.length;
        unsigned int interval = scsi_prot_interval(scmd);
        unsigned int nr_intervals;

        if (!(scmd->prot_flags & SCSI_PROT_TRANSFER_PI))
                return data_len;

        nr_intervals = data_len >> ilog2(interval);

        return data_len + nr_intervals * 8;
}

/* Function prototypes; "extern" is implicit on function declarations. */
void scsi_build_sense(struct scsi_cmnd *scmd, int desc,
                      u8 key, u8 asc, u8 ascq);

struct request *scsi_alloc_request(struct request_queue *q, blk_opf_t opf,
                                   blk_mq_req_flags_t flags);

#endif /* _SCSI_SCSI_CMND_H */


















































































































































































    2 


    2 



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 
















    2 

































































    2 











    2 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 














    2 















    2 


































































































































































































































































































































































































    2 





























    1 











    2 























    2 

















































































































































































    2 














    1 














    2 









    2 





























    1 











    1 















    2 

    2 

    2 













































































    2 




    1 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 









































    1 







    1 


















    1 

    1 




















































































































    1 
    1 
















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "uuid-tree.h"
#include "ioctl.h"
#include "relocation.h"
#include "scrub.h"
#include "super.h"
#include "raid-stripe-tree.h"

/*
 * Union of the RAID0, RAID10 and RAID5/6 (BTRFS_BLOCK_GROUP_RAID56_MASK)
 * block group profile bits.
 * NOTE(review): going by the name, these are the profiles whose data is
 * striped across devices - confirm against the users of this mask.
 */
#define BTRFS_BLOCK_GROUP_STRIPE_MASK        (BTRFS_BLOCK_GROUP_RAID0 | \
                                         BTRFS_BLOCK_GROUP_RAID10 | \
                                         BTRFS_BLOCK_GROUP_RAID56_MASK)

/*
 * Bundle of stripe/mirror parameters describing one I/O mapping request.
 *
 * NOTE(review): nothing in this part of the file reads or writes these
 * fields; their meaning is inferred from the field names only and should be
 * confirmed against the mapping code that fills the structure in.
 */
struct btrfs_io_geometry {
        u32 stripe_index;
        u32 stripe_nr;
        int mirror_num;
        int num_stripes;
        u64 stripe_offset;
        u64 raid56_full_stripe_start;
        int max_errors;
        enum btrfs_map_op op;
};

/*
 * Static attributes of every RAID profile, indexed by enum btrfs_raid_types
 * (the BTRFS_RAID_* values); btrfs_bg_flags_to_raid_index() converts block
 * group flags into such an index.
 *
 * Fields consumed by the code in this file's vicinity:
 *  - raid_name: returned by btrfs_bg_type_to_raid_name() and printed by
 *    btrfs_describe_block_groups()
 *  - bg_flag:   the BTRFS_BLOCK_GROUP_* profile bit of the entry, 0 for the
 *    "single" profile
 *  - nparity:   returned by btrfs_nr_parity_stripes()
 *
 * A devs_max of 0 means "as many devices as possible".
 */
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes        = 2,
                .dev_stripes        = 1,
                .devs_max        = 0,        /* 0 == as many as possible */
                .devs_min        = 2,
                .tolerated_failures = 1,
                .devs_increment        = 2,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name        = "raid10",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
                .mindev_error        = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes        = 1,
                .dev_stripes        = 1,
                .devs_max        = 2,
                .devs_min        = 2,
                .tolerated_failures = 1,
                .devs_increment        = 2,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name        = "raid1",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
                .mindev_error        = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1C3] = {
                .sub_stripes        = 1,
                .dev_stripes        = 1,
                .devs_max        = 3,
                .devs_min        = 3,
                .tolerated_failures = 2,
                .devs_increment        = 3,
                .ncopies        = 3,
                .nparity        = 0,
                .raid_name        = "raid1c3",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C3,
                .mindev_error        = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1C4] = {
                .sub_stripes        = 1,
                .dev_stripes        = 1,
                .devs_max        = 4,
                .devs_min        = 4,
                .tolerated_failures = 3,
                .devs_increment        = 4,
                .ncopies        = 4,
                .nparity        = 0,
                .raid_name        = "raid1c4",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C4,
                .mindev_error        = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes        = 1,
                .dev_stripes        = 2,
                .devs_max        = 1,
                .devs_min        = 1,
                .tolerated_failures = 0,
                .devs_increment        = 1,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name        = "dup",
                .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
                .mindev_error        = 0,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes        = 1,
                .dev_stripes        = 1,
                .devs_max        = 0,
                .devs_min        = 1,
                .tolerated_failures = 0,
                .devs_increment        = 1,
                .ncopies        = 1,
                .nparity        = 0,
                .raid_name        = "raid0",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
                .mindev_error        = 0,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes        = 1,
                .dev_stripes        = 1,
                .devs_max        = 1,
                .devs_min        = 1,
                .tolerated_failures = 0,
                .devs_increment        = 1,
                .ncopies        = 1,
                .nparity        = 0,
                .raid_name        = "single",
                .bg_flag        = 0,
                .mindev_error        = 0,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes        = 1,
                .dev_stripes        = 1,
                .devs_max        = 0,
                .devs_min        = 2,
                .tolerated_failures = 1,
                .devs_increment        = 1,
                .ncopies        = 1,
                .nparity        = 1,
                .raid_name        = "raid5",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
                .mindev_error        = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes        = 1,
                .dev_stripes        = 1,
                .devs_max        = 0,
                .devs_min        = 3,
                .tolerated_failures = 2,
                .devs_increment        = 1,
                .ncopies        = 1,
                .nparity        = 2,
                .raid_name        = "raid6",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
                .mindev_error        = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
        },
};

/*
 * Map block group flags (BTRFS_BLOCK_GROUP_*) to the corresponding
 * enum btrfs_raid_types value, which is suitable as an index into
 * btrfs_raid_array[].
 *
 * Only the bits covered by BTRFS_BLOCK_GROUP_PROFILE_MASK are looked at; when
 * none of them is set the result is BTRFS_RAID_SINGLE.
 */
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
        const u64 profile_bits = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;

        if (profile_bits)
                return BTRFS_BG_FLAG_TO_INDEX(profile_bits);

        /* No profile bit at all means the "single" profile. */
        return BTRFS_RAID_SINGLE;
}

/*
 * Return the raid_name string of the profile encoded in @flags, or NULL when
 * the computed index does not fall inside btrfs_raid_array[].
 */
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
        const int raid_idx = btrfs_bg_flags_to_raid_index(flags);

        if (raid_idx < BTRFS_NR_RAID_TYPES)
                return btrfs_raid_array[raid_idx].raid_name;

        return NULL;
}

/*
 * Return the nparity value recorded in btrfs_raid_array[] for the profile
 * encoded in the block group flags @type.
 */
int btrfs_nr_parity_stripes(u64 type)
{
        const struct btrfs_raid_attr *attr;

        attr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)];

        return attr->nparity;
}

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 *
 * Known type and profile bits are written as names separated by '|'; any
 * bits left over afterwards are appended as a single hexadecimal value. A
 * zero @bg_flags is described as "NONE".
 *
 * Every write is bounded by the space remaining in @buf. If the buffer is
 * too small the description is cut short, so callers must pass a buffer that
 * is large enough for the flags they expect.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
        int i;
        int ret;
        char *bp = buf;
        u64 flags = bg_flags;
        u32 size_bp = size_buf;

        if (!flags) {
                /* Bounded copy, a too small @buf must not be overrun */
                snprintf(bp, size_bp, "%s", "NONE");
                return;
        }

        /*
         * Append "<desc>|" if @flag is set and clear it from the pending
         * flags. Bail out as soon as the output would not fit, so that
         * size_bp never exceeds the space that is really left.
         */
#define DESCRIBE_FLAG(flag, desc)                                        \
        do {                                                                \
                if (flags & (flag)) {                                        \
                        ret = snprintf(bp, size_bp, "%s|", (desc));        \
                        if (ret < 0 || ret >= size_bp)                        \
                                goto out_overflow;                        \
                        size_bp -= ret;                                        \
                        bp += ret;                                        \
                        flags &= ~(flag);                                \
                }                                                        \
        } while (0)

        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

        DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
                DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
                              btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

        if (flags) {
                /* Whatever is still set has no name, print it as a number */
                ret = snprintf(bp, size_bp, "0x%llx|", flags);
                /* Same check as above, keeps the unsigned size_bp from wrapping */
                if (ret < 0 || ret >= size_bp)
                        goto out_overflow;
                size_bp -= ret;
        }

        if (size_bp < size_buf)
                buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

        /*
         * The text is trimmed, it's up to the caller to provide sufficiently
         * large buffer
         */
out_overflow:;
}

/* Forward declarations of static helpers defined elsewhere in this file. */
static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general,
 * but in mount context it could be used to exclude list modifications by e.g.
 * the scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */

/* Global lock, see the "Device locking" comment above for what it covers. */
DEFINE_MUTEX(uuid_mutex);
/* Global list of struct btrfs_fs_devices, linked through their fs_list. */
static LIST_HEAD(fs_uuids);
/* Return the head of the file-local fs_uuids list. */
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}

/*
 * Allocate new btrfs_fs_devices structure identified by a fsid.
 *
 * @fsid:    if not NULL, copy the UUID to fs_devices::fsid and to
 *           fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->fs_list);
        INIT_LIST_HEAD(&fs_devs->seed_list);

        if (fsid) {
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
                memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
        }

        return fs_devs;
}

/*
 * Release everything hanging off @device (name, alloc_state tree, zone info)
 * and free the structure itself.
 *
 * The device is not unlinked from any list here. A device that is still on a
 * post_commit_list triggers a warning.
 */
static void btrfs_free_device(struct btrfs_device *device)
{
        WARN_ON(!list_empty(&device->post_commit_list));
        rcu_string_free(device->name);
        extent_io_tree_release(&device->alloc_state);
        btrfs_destroy_dev_zone_info(device);
        kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_free_device(device);
        }
        kfree(fs_devices);
}

/*
 * Module exit helper: unlink every btrfs_fs_devices from the global fs_uuids
 * list and free it together with its devices.
 */
void __exit btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *cur, *next;

        list_for_each_entry_safe(cur, next, &fs_uuids, fs_list) {
                list_del(&cur->fs_list);
                free_fs_devices(cur);
        }
}

/*
 * Check whether @fs_devices is identified by @fsid and, if @metadata_fsid is
 * not NULL, also by that metadata UUID.
 */
static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices,
                                  const u8 *fsid, const u8 *metadata_fsid)
{
        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE))
                return false;

        /* Without a metadata UUID to compare, the fsid match alone decides. */
        return !metadata_fsid ||
               memcmp(metadata_fsid, fs_devices->metadata_uuid,
                      BTRFS_FSID_SIZE) == 0;
}

static noinline struct btrfs_fs_devices *find_fsid(
                const u8 *fsid, const u8 *metadata_fsid)
{
        struct btrfs_fs_devices *fs_devices;

        ASSERT(fsid);

        /* Handle non-split brain cases */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid))
                        return fs_devices;
        }
        return NULL;
}

/*
 * Open the block device at @device_path and read its btrfs super block.
 *
 * @flush:     when non-zero the block device is synced after opening
 * @holder:    passed to the open call; when not NULL the block size is also
 *             set to BTRFS_BDEV_BLOCKSIZE
 * @bdev_file: returns the opened file, NULL on failure
 * @disk_super: returns the super block, NULL on failure
 *
 * Return 0 on success. On failure a negative errno is returned and the opened
 * file, if any, is released again.
 */
static int
btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
                      int flush, struct file **bdev_file,
                      struct btrfs_super_block **disk_super)
{
        struct block_device *bdev;
        int ret;

        *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
        if (IS_ERR(*bdev_file)) {
                ret = PTR_ERR(*bdev_file);
                goto error;
        }

        bdev = file_bdev(*bdev_file);
        if (flush)
                sync_blockdev(bdev);

        if (holder) {
                ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
                if (ret)
                        goto error_put;
        }

        invalidate_bdev(bdev);

        *disk_super = btrfs_read_dev_super(bdev);
        if (IS_ERR(*disk_super)) {
                ret = PTR_ERR(*disk_super);
                goto error_put;
        }

        return 0;

error_put:
        /* The device was opened successfully, drop our reference again. */
        fput(*bdev_file);
error:
        *disk_super = NULL;
        *bdev_file = NULL;
        return ret;
}

/*
 *  Search and remove all stale devices (which are not mounted).  When both
 *  inputs are NULL, it will search and release all stale devices.
 *
 *  Must be called with uuid_mutex held.
 *
 *  @devt:         Optional. When provided will it release all unmounted devices
 *                 matching this devt only.
 *  @skip_device:  Optional. Will skip this device when searching for the stale
 *                 devices.
 *
 *  Return:        0 for success, if @devt is 0, or if @devt does not match any
 *                 device in the list.
 *                -EBUSY if @devt is a mounted device and no device was freed.
 */
static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
        struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
        struct btrfs_device *device, *tmp_device;
        int ret;
        bool freed = false;

        lockdep_assert_held(&uuid_mutex);

        /* Return good status if there is no instance of devt. */
        ret = 0;
        list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

                mutex_lock(&fs_devices->device_list_mutex);
                list_for_each_entry_safe(device, tmp_device,
                                         &fs_devices->devices, dev_list) {
                        if (skip_device && skip_device == device)
                                continue;
                        if (devt && devt != device->devt)
                                continue;
                        /*
                         * Devices of an opened fs_devices are never freed,
                         * stop looking at the rest of this fs_devices.
                         */
                        if (fs_devices->opened) {
                                if (devt)
                                        ret = -EBUSY;
                                break;
                        }

                        /* delete the stale device */
                        fs_devices->num_devices--;
                        list_del(&device->dev_list);
                        btrfs_free_device(device);

                        freed = true;
                }
                mutex_unlock(&fs_devices->device_list_mutex);

                /* No device left, the fs_devices itself can go as well. */
                if (fs_devices->num_devices == 0) {
                        btrfs_sysfs_remove_fsid(fs_devices);
                        list_del(&fs_devices->fs_list);
                        free_fs_devices(fs_devices);
                }
        }

        /* If there is at least one freed device return 0. */
        if (freed)
                return 0;

        return ret;
}

/*
 * Find the fs_devices a scanned device belongs to, taking the temp_fsid
 * feature of single device filesystems into account.
 *
 * @disk_super:         super block read from the device
 * @devt:               device number of the scanned device
 * @same_fsid_diff_dev: set to true when the fsid of @disk_super is already in
 *                      use and @devt is not registered in any fs_devices; not
 *                      written otherwise
 *
 * Return the matching fs_devices, or NULL when there is none to use.
 */
static struct btrfs_fs_devices *find_fsid_by_device(
                                        struct btrfs_super_block *disk_super,
                                        dev_t devt, bool *same_fsid_diff_dev)
{
        struct btrfs_fs_devices *by_fsid;
        struct btrfs_fs_devices *by_devt = NULL;
        struct btrfs_fs_devices *cur;
        const u8 *metadata_uuid = NULL;

        if (btrfs_super_incompat_flags(disk_super) &
            BTRFS_FEATURE_INCOMPAT_METADATA_UUID)
                metadata_uuid = disk_super->metadata_uuid;

        /* Find the fs_device by the usual method, if found use it. */
        by_fsid = find_fsid(disk_super->fsid, metadata_uuid);

        /* The temp_fsid feature is supported only with single device filesystem. */
        if (btrfs_super_num_devices(disk_super) != 1)
                return by_fsid;

        /*
         * A seed device is an integral component of the sprout device, which
         * functions as a multi-device filesystem. So, temp-fsid feature is
         * not supported.
         */
        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)
                return by_fsid;

        /* Try to find a fs_devices by matching devt. */
        list_for_each_entry(cur, &fs_uuids, fs_list) {
                struct btrfs_device *device;

                list_for_each_entry(device, &cur->devices, dev_list) {
                        if (device->devt == devt) {
                                by_devt = cur;
                                break;
                        }
                }
                if (by_devt)
                        break;
        }

        if (!by_devt) {
                /*
                 * New device. If sb::fsid is already used, the caller has to
                 * create a new temp_fsid.
                 */
                if (by_fsid)
                        *same_fsid_diff_dev = true;
                return NULL;
        }

        /* Existing device that is stale: unknown fsid and not opened. */
        if (!by_fsid && !by_devt->opened)
                return NULL;

        /* Regular or temp_fsid device mounting a subvol. */
        return by_devt;
}

/*
 * Open the block device behind @device and account it in @fs_devices.
 *
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 *
 * Return 0 on success, the error from opening the device or reading its super
 * block, or -EINVAL when the device is already open, has no name, or its
 * super block does not match what is recorded in @device.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
                        struct btrfs_device *device, blk_mode_t flags,
                        void *holder)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct file *bdev_file;
        int ret;

        if (device->bdev || !device->name)
                return -EINVAL;

        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                    &bdev_file, &disk_super);
        if (ret)
                return ret;

        bdev = file_bdev(bdev_file);

        /* The super block has to describe the very device we expect. */
        if (btrfs_stack_device_id(&disk_super->dev_item) != device->devid)
                goto error_free_page;

        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE) != 0)
                goto error_free_page;

        device->generation = btrfs_super_generation(disk_super);

        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                if (btrfs_super_incompat_flags(disk_super) &
                    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
                        pr_err(
                "BTRFS: Invalid seeding and uuid-changed device detected\n");
                        goto error_free_page;
                }

                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                fs_devices->seeding = true;
        } else if (bdev_read_only(bdev)) {
                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        } else {
                set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }

        if (!bdev_nonrot(bdev))
                fs_devices->rotating = true;

        if (bdev_max_discard_sectors(bdev))
                fs_devices->discardable = true;

        device->bdev_file = bdev_file;
        device->bdev = bdev;
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);

        /* Keep the recorded device number in sync with what was opened. */
        if (device->devt != bdev->bd_dev) {
                btrfs_warn(NULL,
                           "device %s maj:min changed from %d:%d to %d:%d",
                           device->name->str, MAJOR(device->devt),
                           MINOR(device->devt), MAJOR(bdev->bd_dev),
                           MINOR(bdev->bd_dev));

                device->devt = bdev->bd_dev;
        }

        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                fs_devices->rw_devices++;
                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
        }
        btrfs_release_disk_super(disk_super);

        return 0;

error_free_page:
        btrfs_release_disk_super(disk_super);
        fput(bdev_file);

        return -EINVAL;
}

/*
 * Return a pointer to sb->metadata_uuid if the super block has the
 * METADATA_UUID incompat flag set, and to sb->fsid otherwise.
 */
u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
{
        if (btrfs_super_incompat_flags(sb) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID)
                return sb->metadata_uuid;

        return sb->fsid;
}

/*
 * Add new device to list of registered devices
 *
 * @path:             path of the scanned block device
 * @disk_super:       super block read from that device
 * @new_device_added: set to true when a new btrfs_device was allocated and
 *                    linked into its fs_devices; left untouched otherwise
 *
 * The fs_devices the device belongs to is looked up and allocated if there is
 * none yet. A device that is already registered gets its path, devt and
 * generation updated instead of being added again.
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           bool *new_device_added)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices = NULL;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);
        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
        dev_t path_devt;
        int error;
        bool same_fsid_diff_dev = false;
        bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
                BTRFS_FEATURE_INCOMPAT_METADATA_UUID);

        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
                btrfs_err(NULL,
"device %s has incomplete metadata_uuid change, please use btrfstune to complete",
                          path);
                return ERR_PTR(-EAGAIN);
        }

        error = lookup_bdev(path, &path_devt);
        if (error) {
                btrfs_err(NULL, "failed to lookup block device for path %s: %d",
                          path, error);
                return ERR_PTR(error);
        }

        fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);

        if (!fs_devices) {
                /* First device of this filesystem, set up a new fs_devices. */
                fs_devices = alloc_fs_devices(disk_super->fsid);
                if (IS_ERR(fs_devices))
                        return ERR_CAST(fs_devices);

                if (has_metadata_uuid)
                        memcpy(fs_devices->metadata_uuid,
                               disk_super->metadata_uuid, BTRFS_FSID_SIZE);

                /* The fsid is already taken, use a random one instead. */
                if (same_fsid_diff_dev) {
                        generate_random_uuid(fs_devices->fsid);
                        fs_devices->temp_fsid = true;
                pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n",
                                path, MAJOR(path_devt), MINOR(path_devt),
                                fs_devices->fsid);
                }

                mutex_lock(&fs_devices->device_list_mutex);
                list_add(&fs_devices->fs_list, &fs_uuids);

                device = NULL;
        } else {
                struct btrfs_dev_lookup_args args = {
                        .devid = devid,
                        .uuid = disk_super->dev_item.uuid,
                };

                mutex_lock(&fs_devices->device_list_mutex);
                device = btrfs_find_device(fs_devices, &args);

                /* A newer super block was found, take over its UUIDs. */
                if (found_transid > fs_devices->latest_generation) {
                        memcpy(fs_devices->fsid, disk_super->fsid,
                                        BTRFS_FSID_SIZE);
                        memcpy(fs_devices->metadata_uuid,
                               btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
                }
        }

        /* From here on fs_devices->device_list_mutex is held. */
        if (!device) {
                unsigned int nofs_flag;

                if (fs_devices->opened) {
                        btrfs_err(NULL,
"device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
                                  path, MAJOR(path_devt), MINOR(path_devt),
                                  fs_devices->fsid, current->comm,
                                  task_pid_nr(current));
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-EBUSY);
                }

                nofs_flag = memalloc_nofs_save();
                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid, path);
                memalloc_nofs_restore(nofs_flag);
                if (IS_ERR(device)) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        /* we can safely leave the fs_devices entry around */
                        return device;
                }

                device->devt = path_devt;

                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;

                device->fs_devices = fs_devices;
                *new_device_added = true;

                if (disk_super->label[0])
                        pr_info(
"BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
                                disk_super->label, devid, found_transid, path,
                                MAJOR(path_devt), MINOR(path_devt),
                                current->comm, task_pid_nr(current));
                else
                        pr_info(
"BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
                                disk_super->fsid, devid, found_transid, path,
                                MAJOR(path_devt), MINOR(path_devt),
                                current->comm, task_pid_nr(current));

        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When FS is already mounted.
                 * 1. If you are here and if the device->name is NULL that
                 *    means this device was missing at time of FS mount.
                 * 2. If you are here and if the device->name is different
                 *    from 'path' that means either
                 *      a. The same device disappeared and reappeared with
                 *         different name. or
                 *      b. The missing-disk-which-was-replaced, has
                 *         reappeared now.
                 *
                 * We must allow 1 and 2a above. But 2b would be a spurious
                 * and unintentional.
                 *
                 * Further in case of 1 and 2a above, the disk at 'path'
                 * would have missed some transaction when it was away and
                 * in case of 2a the stale bdev has to be updated as well.
                 * 2b must not be allowed at all time.
                 */

                /*
                 * For now, we do allow update to btrfs_fs_device through the
                 * btrfs dev scan cli after FS has been mounted.  We're still
                 * tracking a problem where systems fail mount by subvolume id
                 * when we reject replacement on a mounted FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is if the FS is _not_ mounted and if you
                         * are here, that means there is more than one
                         * disk with same uuid and devid.We keep the one
                         * with larger generation number or the last-in if
                         * generation are equal.
                         */
                        mutex_unlock(&fs_devices->device_list_mutex);
                        btrfs_err(NULL,
"device %s already registered with a higher generation, found %llu expect %llu",
                                  path, found_transid, device->generation);
                        return ERR_PTR(-EEXIST);
                }

                /*
                 * We are going to replace the device path for a given devid,
                 * make sure it's the same device if the device is mounted
                 *
                 * NOTE: the device->fs_info may not be reliable here so pass
                 * in a NULL to message helpers instead. This avoids a possible
                 * use-after-free when the fs_info and fs_info->sb are already
                 * torn down.
                 */
                if (device->bdev) {
                        if (device->devt != path_devt) {
                                mutex_unlock(&fs_devices->device_list_mutex);
                                btrfs_warn_in_rcu(NULL,
        "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
                                                  path, devid, found_transid,
                                                  current->comm,
                                                  task_pid_nr(current));
                                return ERR_PTR(-EEXIST);
                        }
                        btrfs_info_in_rcu(NULL,
        "devid %llu device path %s changed to %s scanned by %s (%d)",
                                          devid, btrfs_dev_name(device),
                                          path, current->comm,
                                          task_pid_nr(current));
                }

                /* Swap in the new path; the name is read under RCU. */
                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                /* The device showed up, so it is not missing any more. */
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                        fs_devices->missing_devices--;
                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }
                device->devt = path_devt;
        }

        /*
         * Unmount does not free the btrfs_device struct but would zero
         * generation along with most of the other members. So just update
         * it back. We need it to pick the disk with largest generation
         * (as above).
         */
        if (!fs_devices->opened) {
                device->generation = found_transid;
                fs_devices->latest_generation = max_t(u64, found_transid,
                                                fs_devices->latest_generation);
        }

        fs_devices->total_devices = btrfs_super_num_devices(disk_super);

        mutex_unlock(&fs_devices->device_list_mutex);
        return device;
}

/*
 * Create a copy of @orig: a new fs_devices with the same fsid and
 * total_devices, holding a freshly allocated device (same devid, uuid, path
 * and, if present, a clone of the zone info) for every device of @orig.
 *
 * Must be called with uuid_mutex held.
 *
 * Return the new fs_devices or an ERR_PTR(); on failure everything allocated
 * so far is freed again.
 */
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *clone;
        struct btrfs_device *src_dev;
        int ret = 0;

        lockdep_assert_held(&uuid_mutex);

        clone = alloc_fs_devices(orig->fsid);
        if (IS_ERR(clone))
                return clone;

        clone->total_devices = orig->total_devices;

        list_for_each_entry(src_dev, &orig->devices, dev_list) {
                struct btrfs_device *new_dev;
                const char *dev_path;

                /*
                 * This is ok to do without RCU read locked because we hold the
                 * uuid mutex so nothing we touch in here is going to disappear.
                 */
                dev_path = src_dev->name ? src_dev->name->str : NULL;

                new_dev = btrfs_alloc_device(NULL, &src_dev->devid,
                                             src_dev->uuid, dev_path);
                if (IS_ERR(new_dev)) {
                        ret = PTR_ERR(new_dev);
                        goto error;
                }

                if (src_dev->zone_info) {
                        struct btrfs_zoned_device_info *cloned_zi;

                        cloned_zi = btrfs_clone_dev_zone_info(src_dev);
                        if (!cloned_zi) {
                                /* Not on the list yet, free it by hand. */
                                btrfs_free_device(new_dev);
                                ret = -ENOMEM;
                                goto error;
                        }
                        new_dev->zone_info = cloned_zi;
                }

                list_add(&new_dev->dev_list, &clone->devices);
                new_dev->fs_devices = clone;
                clone->num_devices++;
        }

        return clone;

error:
        free_fs_devices(clone);
        return ERR_PTR(ret);
}

/*
 * Drop all devices of @fs_devices that are not flagged as being part of the
 * filesystem metadata, except for the one with BTRFS_DEV_REPLACE_DEVID.
 *
 * @latest_dev: in/out; updated to the kept device with the highest generation
 *              seen so far, ignoring replace targets and missing devices
 */
static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
                                      struct btrfs_device **latest_dev)
{
        struct btrfs_device *dev, *tmp;

        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(dev, tmp, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state)) {
                        const bool candidate =
                                !test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                          &dev->dev_state) &&
                                !test_bit(BTRFS_DEV_STATE_MISSING,
                                          &dev->dev_state);

                        if (candidate &&
                            (!*latest_dev ||
                             dev->generation > (*latest_dev)->generation))
                                *latest_dev = dev;
                        continue;
                }

                /*
                 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
                 * in btrfs_init_dev_replace() so just continue.
                 */
                if (dev->devid == BTRFS_DEV_REPLACE_DEVID)
                        continue;

                /* Close the block device if it was opened. */
                if (dev->bdev_file) {
                        fput(dev->bdev_file);
                        dev->bdev = NULL;
                        dev->bdev_file = NULL;
                        fs_devices->open_devices--;
                }

                /* Take it off the allocation list. */
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
                        list_del_init(&dev->dev_alloc_list);
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state);
                        fs_devices->rw_devices--;
                }

                list_del_init(&dev->dev_list);
                fs_devices->num_devices--;
                btrfs_free_device(dev);
        }
}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove the devices which do not belong there. This covers
 * @fs_devices and every fs_devices on its seed_list.
 *
 * As a side effect fs_devices->latest_dev is set to the device picked by
 * __btrfs_free_extra_devids(), or NULL when there is none.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed;
        struct btrfs_device *newest = NULL;

        mutex_lock(&uuid_mutex);

        __btrfs_free_extra_devids(fs_devices, &newest);
        list_for_each_entry(seed, &fs_devices->seed_list, seed_list)
                __btrfs_free_extra_devids(seed, &newest);

        fs_devices->latest_dev = newest;

        mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
        if (!device->bdev)
                return;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                sync_blockdev(device->bdev);
                invalidate_bdev(device->bdev);
        }

        fput(device->bdev_file);
}

/*
 * Close @device and reset its in-memory state so the structure can be opened
 * again later.
 *
 * A writeable device that is not the replace target is taken off the
 * allocation list, the REPLACE_TGT / MISSING / WRITEABLE state bits are
 * cleared, the block device file is released and the rw_devices,
 * missing_devices and open_devices counters of the owning fs_devices are
 * adjusted. The device structure itself is not freed here.
 */
static void btrfs_close_one_device(struct btrfs_device *device)
{
        struct btrfs_fs_devices *fs_devices = device->fs_devices;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                list_del_init(&device->dev_alloc_list);
                fs_devices->rw_devices--;
        }

        if (device->devid == BTRFS_DEV_REPLACE_DEVID)
                clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                fs_devices->missing_devices--;
        }

        btrfs_close_bdev(device);
        if (device->bdev) {
                fs_devices->open_devices--;
                device->bdev = NULL;
                /*
                 * btrfs_close_bdev() has put the reference on bdev_file. This
                 * structure is not freed by the close, so the pointer must be
                 * cleared as well: __btrfs_free_extra_devids() tests
                 * device->bdev_file and would otherwise fput() the stale file
                 * again if a later open of this device fails.
                 */
                device->bdev_file = NULL;
        }
        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        btrfs_destroy_dev_zone_info(device);

        device->fs_info = NULL;
        atomic_set(&device->dev_stats_ccnt, 0);
        extent_io_tree_release(&device->alloc_state);

        /*
         * Reset the flush error record. We might have a transient flush error
         * in this mount, and if so we aborted the current transaction and set
         * the fs to an error state, guaranteeing no super blocks can be further
         * committed. However that error might be transient and if we unmount the
         * filesystem and mount it again, we should allow the mount to succeed
         * (btrfs_check_rw_degradable() should not fail) - if after mounting the
         * filesystem again we still get flush errors, then we will again abort
         * any transaction and set the error state, guaranteeing no commits of
         * unsafe super blocks.
         */
        device->last_flush_error = 0;

        /* Verify the device is back in a pristine state  */
        WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
        WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
        WARN_ON(!list_empty(&device->dev_alloc_list));
        WARN_ON(!list_empty(&device->post_commit_list));
}

/*
 * Drop one open reference on @fs_devices. When the last reference goes away,
 * close every device on the list and reset the opened/seeding/fs_info state.
 *
 * The caller must hold uuid_mutex.
 */
static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *dev, *next;

        lockdep_assert_held(&uuid_mutex);

        fs_devices->opened--;
        if (fs_devices->opened > 0)
                return;

        list_for_each_entry_safe(dev, next, &fs_devices->devices, dev_list)
                btrfs_close_one_device(dev);

        /* Closing every device must have brought both counters to zero. */
        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);

        fs_devices->opened = 0;
        fs_devices->seeding = false;
        fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        LIST_HEAD(list);
        struct btrfs_fs_devices *tmp;

        mutex_lock(&uuid_mutex);
        close_fs_devices(fs_devices);
        if (!fs_devices->opened) {
                list_splice_init(&fs_devices->seed_list, &list);

                /*
                 * If the struct btrfs_fs_devices is not assembled with any
                 * other device, it can be re-initialized during the next mount
                 * without the needing device-scan step. Therefore, it can be
                 * fully freed.
                 */
                if (fs_devices->num_devices == 1) {
                        list_del(&fs_devices->fs_list);
                        free_fs_devices(fs_devices);
                }
        }


        list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
                close_fs_devices(fs_devices);
                list_del(&fs_devices->seed_list);
                free_fs_devices(fs_devices);
        }
        mutex_unlock(&uuid_mutex);
}

/*
 * Try to open every device on @fs_devices->devices.
 *
 * A device whose open reports -ENODATA is removed from the list and freed.
 * Among the devices that opened successfully, the one with the highest
 * generation is recorded as fs_devices->latest_dev.
 *
 * Returns 0 as long as at least one device is open. Otherwise returns the
 * first error reported by btrfs_open_one_device(), or -EINVAL if there was
 * none.
 */
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                                blk_mode_t flags, void *holder)
{
        struct btrfs_device *dev, *next;
        struct btrfs_device *newest = NULL;
        int first_err = 0;

        list_for_each_entry_safe(dev, next, &fs_devices->devices, dev_list) {
                int err = btrfs_open_one_device(fs_devices, dev, flags, holder);

                if (err == 0) {
                        if (!newest || dev->generation > newest->generation)
                                newest = dev;
                        continue;
                }

                if (err == -ENODATA) {
                        fs_devices->num_devices--;
                        list_del(&dev->dev_list);
                        btrfs_free_device(dev);
                }

                /* Only the first failure is remembered. */
                if (first_err == 0)
                        first_err = err;
        }

        if (fs_devices->open_devices == 0)
                return first_err ? first_err : -EINVAL;

        fs_devices->opened = 1;
        fs_devices->latest_dev = newest;
        fs_devices->total_rw_bytes = 0;
        fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
        fs_devices->read_policy = BTRFS_READ_POLICY_PID;

        return 0;
}

/*
 * list_sort() comparison callback: order two btrfs_device entries (linked via
 * dev_list) by ascending devid. @priv is unused.
 */
static int devid_cmp(void *priv, const struct list_head *a,
                     const struct list_head *b)
{
        const struct btrfs_device *lhs = list_entry(a, struct btrfs_device,
                                                    dev_list);
        const struct btrfs_device *rhs = list_entry(b, struct btrfs_device,
                                                    dev_list);

        if (lhs->devid == rhs->devid)
                return 0;

        return lhs->devid < rhs->devid ? -1 : 1;
}

/*
 * Open the devices of @fs_devices, or just take another open reference when
 * they are already opened. On a first open the device list is sorted by devid
 * before the devices are opened.
 *
 * The caller must hold uuid_mutex. Returns 0 on success or the error from
 * open_fs_devices().
 */
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       blk_mode_t flags, void *holder)
{
        lockdep_assert_held(&uuid_mutex);

        /*
         * The device_list_mutex cannot be taken here in case opening the
         * underlying device takes further locks like open_mutex.
         *
         * The lock is not needed either: this is called during mount and
         * exclusion is provided by uuid_mutex.
         */
        if (fs_devices->opened) {
                fs_devices->opened++;
                return 0;
        }

        list_sort(NULL, &fs_devices->devices, devid_cmp);

        return open_fs_devices(fs_devices, flags, holder);
}

/*
 * Drop the reference on the page that backs @super, as located through
 * virt_to_page().
 */
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
        put_page(virt_to_page(super));
}

/*
 * Read the super block located at @bytenr on @bdev through the page cache.
 *
 * @bdev:        block device to read from
 * @bytenr:      byte offset on the device at which the super block is read
 * @bytenr_orig: value the bytenr field stored inside the super block must have
 *
 * Returns a pointer to the super block inside the cached page, or an ERR_PTR():
 * -EINVAL when the super block does not fit the device or a single page, or
 * when its bytenr/magic fields do not match; otherwise the error reported by
 * read_cache_page_gfp(). A successful result holds a page reference that is
 * dropped with btrfs_release_disk_super().
 */
static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
                                                       u64 bytenr, u64 bytenr_orig)
{
        const pgoff_t index = bytenr >> PAGE_SHIFT;
        struct btrfs_super_block *sb;
        struct page *page;
        void *kaddr;

        /*
         * The super block has to fit in the device, fit in one page, and must
         * not straddle two pages on disk.
         */
        if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev) ||
            sizeof(*sb) > PAGE_SIZE ||
            (bytenr + sizeof(*sb) - 1) >> PAGE_SHIFT != index)
                return ERR_PTR(-EINVAL);

        /* Bring in the page that holds the super block. */
        page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL);
        if (IS_ERR(page))
                return ERR_CAST(page);

        /* The super block sits at the in-page offset of bytenr. */
        kaddr = page_address(page);
        sb = kaddr + offset_in_page(bytenr);

        if (btrfs_super_bytenr(sb) != bytenr_orig ||
            btrfs_super_magic(sb) != BTRFS_MAGIC) {
                btrfs_release_disk_super(kaddr);
                return ERR_PTR(-EINVAL);
        }

        /* Force termination of a non-empty label that fills the whole field. */
        if (sb->label[0] && sb->label[BTRFS_LABEL_SIZE - 1])
                sb->label[BTRFS_LABEL_SIZE - 1] = 0;

        return sb;
}

/*
 * Run btrfs_free_stale_devices() for @devt while holding uuid_mutex and hand
 * its return value back to the caller.
 */
int btrfs_forget_devices(dev_t devt)
{
        int err;

        mutex_lock(&uuid_mutex);
        err = btrfs_free_stale_devices(devt, NULL);
        mutex_unlock(&uuid_mutex);

        return err;
}

/*
 * Decide whether a scanned device should be left unregistered.
 *
 * @disk_super:    super block read from the device
 * @path:          path the device was scanned under
 * @devt:          device number of the scanned block device
 * @mount_arg_dev: true when the scan happens on behalf of a mount
 *
 * Returns true only for a scan outside of mount of a single-device, non-seed
 * filesystem, and only if no opened filesystem already holds a device with the
 * same @devt under a different path.
 */
static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
                                    const char *path, dev_t devt,
                                    bool mount_arg_dev)
{
        struct btrfs_fs_devices *fs_devices;

        /*
         * Do not skip device registration for mounted devices with matching
         * maj:min but different paths. Booting without initrd relies on
         * /dev/root initially, later replaced with the actual root device.
         * A successful scan ensures grub2-probe selects the correct device.
         */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                struct btrfs_device *device;
                bool path_changed = false;

                mutex_lock(&fs_devices->device_list_mutex);
                if (fs_devices->opened) {
                        list_for_each_entry(device, &fs_devices->devices,
                                            dev_list) {
                                if (!device->bdev ||
                                    device->bdev->bd_dev != devt)
                                        continue;
                                if (strcmp(device->name->str, path) != 0) {
                                        path_changed = true;
                                        break;
                                }
                        }
                }
                mutex_unlock(&fs_devices->device_list_mutex);

                /* Do not skip registration. */
                if (path_changed)
                        return false;
        }

        if (mount_arg_dev)
                return false;
        if (btrfs_super_num_devices(disk_super) != 1)
                return false;

        return !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING);
}

/*
 * Look for a btrfs signature on the device at @path and register the device.
 * This may be called outside of the mount path, where set_blocksize must not
 * be called, so the superblock is read via the page cache.
 *
 * With @mount_arg_dev the scan happens at mount time and always registers the
 * device or returns an error. Multi-device and seeding devices are registered
 * in both cases.
 *
 * The caller must hold uuid_mutex.
 *
 * Returns the registered device, NULL when registration was skipped, or an
 * ERR_PTR() on failure.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
                                           bool mount_arg_dev)
{
        struct btrfs_super_block *disk_super;
        struct btrfs_device *device;
        struct block_device *bdev;
        struct file *bdev_file;
        bool new_device_added = false;
        u64 bytenr_orig;
        u64 bytenr;
        dev_t devt;
        int ret;

        lockdep_assert_held(&uuid_mutex);

        /*
         * we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */

        /*
         * Avoid an exclusive open here, as the systemd-udev may initiate the
         * device scan which may race with the user's mount or mkfs command,
         * resulting in failure.
         * Since the device scan is solely for reading purposes, there is no
         * need for an exclusive open. Additionally, the devices are read again
         * during the mount process. It is ok to get some inconsistent
         * values temporarily, as the device paths of the fsid are the only
         * required information for assembling the volume.
         */
        bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
        if (IS_ERR(bdev_file))
                return ERR_CAST(bdev_file);
        bdev = file_bdev(bdev_file);

        bytenr_orig = btrfs_sb_offset(0);
        ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
        if (ret) {
                device = ERR_PTR(ret);
                goto out_fput;
        }

        disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
        if (IS_ERR(disk_super)) {
                device = ERR_CAST(disk_super);
                goto out_fput;
        }

        devt = bdev->bd_dev;
        if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
                pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
                          path, MAJOR(devt), MINOR(devt));

                btrfs_free_stale_devices(devt, NULL);
                device = NULL;
        } else {
                device = device_list_add(path, disk_super, &new_device_added);
                if (!IS_ERR(device) && new_device_added)
                        btrfs_free_stale_devices(device->devt, device);
        }

        btrfs_release_disk_super(disk_super);

out_fput:
        fput(bdev_file);

        return device;
}

/*
 * Check whether a CHUNK_ALLOCATED range recorded in device->alloc_state
 * intersects the range of @len bytes beginning at *@start.
 *
 * When such a range is found, *@start is moved to the byte right after its end
 * and true is returned. Otherwise *@start is left alone and false is returned.
 *
 * The caller must hold fs_info->chunk_mutex.
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
                                    u64 len)
{
        u64 found_start, found_end;

        lockdep_assert_held(&device->fs_info->chunk_mutex);

        if (!find_first_extent_bit(&device->alloc_state, *start,
                                   &found_start, &found_end,
                                   CHUNK_ALLOCATED, NULL))
                return false;

        /* Neither range begins inside the other one: no overlap. */
        if (!in_range(found_start, *start, len) &&
            !in_range(*start, found_start, found_end + 1 - found_start))
                return false;

        *start = found_end + 1;
        return true;
}

/*
 * Return the device offset at which the search for free device extents begins,
 * depending on the chunk allocation policy of the owning fs_devices. Any
 * policy other than REGULAR or ZONED hits BUG().
 */
static u64 dev_extent_search_start(struct btrfs_device *device)
{
        /*
         * We don't care about the starting region like regular
         * allocator, because we anyway use/reserve the first two zones
         * for superblock logging.
         */
        if (device->fs_devices->chunk_alloc_policy == BTRFS_CHUNK_ALLOC_ZONED)
                return 0;

        if (device->fs_devices->chunk_alloc_policy == BTRFS_CHUNK_ALLOC_REGULAR)
                return BTRFS_DEVICE_RANGE_RESERVED;

        BUG();
}

/*
 * Zoned-device part of the hole check: adjust the hole so that it starts at a
 * position from which @num_bytes can be allocated.
 *
 * @device:     the device the hole is on; device->zone_info is dereferenced
 * @hole_start: in/out, start of the hole; asserted to be zone aligned on entry
 * @hole_size:  in/out, size of the hole; may be reduced down to 0
 * @num_bytes:  the size of the free space that is needed
 *
 * Returns true if @hole_start or @hole_size was modified, false otherwise.
 */
static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
                                        u64 *hole_start, u64 *hole_size,
                                        u64 num_bytes)
{
        u64 zone_size = device->zone_info->zone_size;
        u64 pos;
        int ret;
        bool changed = false;

        ASSERT(IS_ALIGNED(*hole_start, zone_size));

        while (*hole_size > 0) {
                /*
                 * NOTE(review): presumably returns the first position within
                 * [*hole_start, *hole_start + *hole_size) usable for
                 * num_bytes - confirm against btrfs_find_allocatable_zones().
                 */
                pos = btrfs_find_allocatable_zones(device, *hole_start,
                                                   *hole_start + *hole_size,
                                                   num_bytes);
                if (pos != *hole_start) {
                        /* Move the hole start to pos, keeping the hole end. */
                        *hole_size = *hole_start + *hole_size - pos;
                        *hole_start = pos;
                        changed = true;
                        /* What is left of the hole is too small, give up. */
                        if (*hole_size < num_bytes)
                                break;
                }

                ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

                /* Range is ensured to be empty */
                if (!ret)
                        return changed;

                /* Given hole range was invalid (outside of device) */
                if (ret == -ERANGE) {
                        *hole_start += *hole_size;
                        *hole_size = 0;
                        return true;
                }

                /* Any other result: skip one zone and try again. */
                *hole_start += zone_size;
                *hole_size -= zone_size;
                changed = true;
        }

        return changed;
}

/*
 * Check if specified hole is suitable for allocation.
 *
 * @device:     the device which we have the hole
 * @hole_start: starting position of the hole
 * @hole_size:  the size of the hole
 * @num_bytes:  the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if the hole position was updated,
 * false otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
                                  u64 *hole_size, u64 num_bytes)
{
        const u64 hole_end = *hole_start + *hole_size;
        bool changed = false;
        bool recheck;

        do {
                recheck = false;

                /*
                 * Skip past a pending extent first, so that the caller never
                 * gets back an offset that is already allocated. The hole end
                 * stays fixed, so the size shrinks accordingly.
                 */
                if (contains_pending_extent(device, hole_start, *hole_size)) {
                        *hole_size = (hole_end >= *hole_start) ?
                                     hole_end - *hole_start : 0;
                        changed = true;
                }

                switch (device->fs_devices->chunk_alloc_policy) {
                case BTRFS_CHUNK_ALLOC_REGULAR:
                        /* No extra check */
                        break;
                case BTRFS_CHUNK_ALLOC_ZONED:
                        /*
                         * The changed hole can contain pending extent.
                         * Loop again to check that.
                         */
                        if (dev_extent_hole_check_zoned(device, hole_start,
                                                        hole_size, num_bytes)) {
                                changed = true;
                                recheck = true;
                        }
                        break;
                default:
                        BUG();
                }
        } while (recheck);

        return changed;
}

/*
 * Find free space in the specified device.
 *
 * @device:       the device which we search the free space in
 * @num_bytes:    the size of the free space that we need
 * @start:        store the start of the free space.
 * @len:          the size of the free space that we find, or the size
 *                of the max free space if we don't find suitable free space.
 *                May be NULL when the caller does not need the size.
 *
 * The search begins at the offset returned by dev_extent_search_start() and
 * ends at device->total_bytes.
 *
 * This does a pretty simple search, the expectation is that it is called very
 * infrequently and that a given device has a small number of extents.
 *
 * @start is used to store the start of the free space if we find. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 *
 * Returns 0 if a hole of at least @num_bytes was found, -ENOSPC if not (also
 * when the device is a replace target), -ENOMEM if no path could be allocated,
 * or a negative error from the tree search. @start and @len are written on
 * every return path.
 */
static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                                u64 *start, u64 *len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 search_start;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size = 0;
        u64 extent_end;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        search_start = dev_extent_search_start(device);
        max_hole_start = search_start;

        WARN_ON(device->zone_info &&
                !IS_ALIGNED(num_bytes, device->zone_info->zone_size));

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }
again:
        if (search_start >= search_end ||
                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                ret = -ENOSPC;
                goto out;
        }

        /* Walk the commit root forward, without taking tree locks. */
        path->reada = READA_FORWARD;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_backwards(root, &key, path);
        if (ret < 0)
                goto out;

        while (search_start < search_end) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        /* No more leaves, check the trailing hole below. */
                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_end)
                        break;

                if (key.offset > search_start) {
                        /* Gap between search_start and this dev extent. */
                        hole_size = key.offset - search_start;
                        /*
                         * The return value is not needed here, search_start
                         * and hole_size are adjusted in place.
                         */
                        dev_extent_hole_check(device, &search_start, &hole_size,
                                              num_bytes);

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than which we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the start
                         * of this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                /* Continue the search past the end of this dev extent. */
                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start) {
                hole_size = search_end - search_start;
                /*
                 * If the check moved the hole, restart the tree search from
                 * the updated search_start.
                 */
                if (dev_extent_hole_check(device, &search_start, &hole_size,
                                          num_bytes)) {
                        btrfs_release_path(path);
                        goto again;
                }

                if (hole_size > max_hole_size) {
                        max_hole_start = search_start;
                        max_hole_size = hole_size;
                }
        }

        /* See above. */
        if (max_hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

        ASSERT(max_hole_start + max_hole_size <= search_end);
out:
        btrfs_free_path(path);
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}

/*
 * Delete the dev extent item of @device that covers offset @start from the
 * device tree (fs_info->dev_root).
 *
 * @trans:          the running transaction handle
 * @device:         the device that owns the dev extent
 * @start:          a device offset inside the dev extent to delete
 * @dev_extent_len: on success, receives the length of the deleted dev extent
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the non-zero
 * result of the failing tree operation. On successful deletion the
 * BTRFS_TRANS_HAVE_FREE_BGS flag is set on the transaction.
 */
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start, u64 *dev_extent_len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_root *root = fs_info->dev_root;
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf = NULL;
        struct btrfs_dev_extent *extent = NULL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
again:
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                /*
                 * No item starts exactly at @start. Step back to the previous
                 * dev extent of this device, which has to cover @start, and
                 * repeat the search with that item's exact key.
                 */
                ret = btrfs_previous_item(root, path, key.objectid,
                                          BTRFS_DEV_EXTENT_KEY);
                if (ret)
                        goto out;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
                BUG_ON(found_key.offset > start || found_key.offset +
                       btrfs_dev_extent_length(leaf, extent) < start);
                key = found_key;
                btrfs_release_path(path);
                goto again;
        } else if (ret == 0) {
                /* Exact match, the path points at the item to delete. */
                leaf = path->nodes[0];
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        } else {
                goto out;
        }

        /* Read the length before the item is gone. */
        *dev_extent_len = btrfs_dev_extent_length(leaf, extent);

        ret = btrfs_del_item(trans, root, path);
        if (ret == 0)
                set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Return the logical address right after the last chunk map in
 * fs_info->mapping_tree (its start plus chunk_len), or 0 when the tree is
 * empty. The tree is read under mapping_tree_lock.
 */
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
        struct btrfs_chunk_map *last;
        struct rb_node *node;
        u64 next = 0;

        read_lock(&fs_info->mapping_tree_lock);
        node = rb_last(&fs_info->mapping_tree.rb_root);
        if (node) {
                last = rb_entry(node, struct btrfs_chunk_map, rb_node);
                next = last->start + last->chunk_len;
        }
        read_unlock(&fs_info->mapping_tree_lock);

        return next;
}

/*
 * Compute the next unused devid from the chunk tree.
 *
 * @fs_info:   the filesystem
 * @devid_ret: receives the offset of the last dev item key plus one, or 1 when
 *             no previous dev item is found
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -EUCLEAN if a
 * dev item with devid (u64)-1 exists, or a negative error from the search.
 */
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
                                    u64 *devid_ret)
{
        struct btrfs_path *path;
        struct btrfs_key last_key;
        struct btrfs_key key;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Aim past every possible dev item, then step back by one item. */
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        if (ret == 0) {
                /* Corruption */
                btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
                ret = -EUCLEAN;
                goto out;
        }

        if (btrfs_previous_item(fs_info->chunk_root, path,
                                BTRFS_DEV_ITEMS_OBJECTID,
                                BTRFS_DEV_ITEM_KEY)) {
                *devid_ret = 1;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &last_key,
                                      path->slots[0]);
                *devid_ret = last_key.offset + 1;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
                            struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_dev_item *dev_item;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        btrfs_reserve_chunk_metadata(trans, true);
        ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
                                      &key, sizeof(*dev_item));
        btrfs_trans_release_chunk_metadata(trans);
        if (ret)
                goto out;

        leaf = path->nodes[0];
        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(leaf, dev_item, device->devid);
        btrfs_set_device_generation(leaf, dev_item, 0);
        btrfs_set_device_type(leaf, dev_item, device->type);
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
        btrfs_set_device_total_bytes(leaf, dev_item,
                                     btrfs_device_get_disk_total_bytes(device));
        btrfs_set_device_bytes_used(leaf, dev_item,
                                    btrfs_device_get_bytes_used(device));
        btrfs_set_device_group(leaf, dev_item, 0);
        btrfs_set_device_seek_speed(leaf, dev_item, 0);
        btrfs_set_device_bandwidth(leaf, dev_item, 0);
        btrfs_set_device_start_offset(leaf, dev_item, 0);

        ptr = btrfs_device_uuid(dev_item);
        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
        ptr = btrfs_device_fsid(dev_item);
        write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
                            ptr, BTRFS_FSID_SIZE);
        btrfs_mark_buffer_dirty(trans, leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Bump mtime/ctime (and the inode version) of the inode behind @device_path.
 * Mainly useful for ctime/mtime based probing such as libblkid.
 *
 * This is purely a courtesy to userspace, so a failed path lookup is silently
 * ignored.
 */
static void update_dev_time(const char *device_path)
{
        struct path path;

        if (kern_path(device_path, LOOKUP_FOLLOW, &path) != 0)
                return;

        inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION);
        path_put(&path);
}

static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
                             struct btrfs_device *device)
{
        struct btrfs_root *root = device->fs_info->chunk_root;
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        btrfs_reserve_chunk_metadata(trans, false);
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        btrfs_trans_release_chunk_metadata(trans);
        if (ret) {
                if (ret > 0)
                        ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, root, path);
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Check that @num_devices is enough for every RAID profile currently in use
 * by the filesystem (data, system and metadata).
 *
 * The caller is responsible for adjusting @num_devices beforehand, e.g. to
 * account for a running device replace.
 *
 * Return: 0 if all profiles are satisfied, otherwise the mindev_error of the
 * first profile in btrfs_raid_array[] that needs more devices.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
                u64 num_devices)
{
        u64 profiles;
        unsigned seq;
        int index;

        /* Take a consistent snapshot of the profile bits. */
        do {
                seq = read_seqbegin(&fs_info->profiles_lock);

                profiles = fs_info->avail_data_alloc_bits;
                profiles |= fs_info->avail_system_alloc_bits;
                profiles |= fs_info->avail_metadata_alloc_bits;
        } while (read_seqretry(&fs_info->profiles_lock, seq));

        for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
                if ((profiles & btrfs_raid_array[index].bg_flag) &&
                    num_devices < btrfs_raid_array[index].devs_min)
                        return btrfs_raid_array[index].mindev_error;
        }

        return 0;
}

/*
 * Find a device in @fs_devs other than @device that is not flagged missing
 * and has a block device attached.
 *
 * Return: the first such device on the devices list, or NULL if there is none.
 */
static struct btrfs_device * btrfs_find_next_active_device(
                struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
{
        struct btrfs_device *candidate;

        list_for_each_entry(candidate, &fs_devs->devices, dev_list) {
                if (candidate == device)
                        continue;
                if (test_bit(BTRFS_DEV_STATE_MISSING, &candidate->dev_state))
                        continue;
                if (!candidate->bdev)
                        continue;

                return candidate;
        }

        return NULL;
}

/*
 * If @device is the one behind sb->s_bdev and/or fs_devices->latest_dev,
 * switch those references over to @next_device.
 *
 * When @next_device is NULL the next active device is looked up instead.  In
 * every context this helper is called from there should always be another
 * active device available, which is asserted.
 */
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
                                            struct btrfs_device *next_device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

        if (!next_device)
                next_device = btrfs_find_next_active_device(fs_devices, device);
        ASSERT(next_device);

        if (fs_info->sb->s_bdev && fs_info->sb->s_bdev == device->bdev)
                fs_info->sb->s_bdev = next_device->bdev;

        if (fs_devices->latest_dev->bdev == device->bdev)
                fs_devices->latest_dev = next_device;
}

/*
 * Number of devices in the filesystem (btrfs_fs_devices::num_devices), not
 * counting the extra device that exists while a device replace is ongoing.
 */
static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
{
        u64 count = fs_info->fs_devices->num_devices;

        down_read(&fs_info->dev_replace.rwsem);
        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
                ASSERT(count > 1);
                count -= 1;
        }
        up_read(&fs_info->dev_replace.rwsem);

        return count;
}

/*
 * Wipe the magic of superblock copy @copy_num on @bdev.
 *
 * The superblock is read, its magic field is zeroed, the backing folio is
 * marked dirty and the byte range of the magic is synced.  If the superblock
 * cannot be read nothing is done; a failing sync only produces a warning.
 */
static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
                                     struct block_device *bdev, int copy_num)
{
        const u64 sb_offset = btrfs_sb_offset(copy_num);
        struct btrfs_super_block *sb;
        size_t magic_len;
        int err;

        sb = btrfs_read_disk_super(bdev, sb_offset, sb_offset);
        if (IS_ERR(sb))
                return;

        magic_len = sizeof(sb->magic);
        memset(&sb->magic, 0, magic_len);
        folio_mark_dirty(virt_to_folio(sb));
        btrfs_release_disk_super(sb);

        err = sync_blockdev_range(bdev, sb_offset, sb_offset + magic_len - 1);
        if (!err)
                return;

        btrfs_warn(fs_info, "error clearing superblock number %d (%d)",
                   copy_num, err);
}

/*
 * Invalidate every superblock copy on @device and tell userspace about it.
 *
 * Zoned block devices get their superblock log zones reset, all others get
 * the superblock magic scratched.  Does nothing if the device has no block
 * device attached.
 */
void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device)
{
        struct block_device *bdev = device->bdev;
        int mirror;

        if (!bdev)
                return;

        for (mirror = 0; mirror < BTRFS_SUPER_MIRROR_MAX; mirror++) {
                if (!bdev_is_zoned(bdev)) {
                        btrfs_scratch_superblock(fs_info, bdev, mirror);
                        continue;
                }
                btrfs_reset_sb_log_zones(bdev, mirror);
        }

        /* Let udev know that the device has changed. */
        btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

        /* libblkid looks at ctime/mtime of the device path. */
        update_dev_time(device->name->str);
}

/*
 * Remove a device from a mounted filesystem.
 *
 * @fs_info:   the filesystem
 * @args:      lookup arguments identifying the device to remove
 * @bdev_file: on success, set to the removed device's bdev_file so that the
 *             caller can do the final release of the block device
 *
 * The device is shrunk to zero, its dev item is deleted from the chunk root,
 * it is unlinked from the in-memory device list and counters, its superblocks
 * are scratched (writeable devices only) and the transaction is committed.
 *
 * Return: 0 on success, a negative errno, or one of the BTRFS_ERROR_DEV_*
 * codes.  @bdev_file is written only on the path that reaches the final
 * transaction commit.
 */
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
                    struct btrfs_dev_lookup_args *args,
                    struct file **bdev_file)
{
        struct btrfs_trans_handle *trans;
        struct btrfs_device *device;
        struct btrfs_fs_devices *cur_devices;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        u64 num_devices;
        int ret = 0;

        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
                return -EINVAL;
        }

        /*
         * The device list in fs_devices is accessed without locks (neither
         * uuid_mutex nor device_list_mutex) as it won't change on a mounted
         * filesystem and another device rm cannot run.
         */
        num_devices = btrfs_num_devices(fs_info);

        /* Would the remaining devices still satisfy all RAID profiles? */
        ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
        if (ret)
                return ret;

        device = btrfs_find_device(fs_info->fs_devices, args);
        if (!device) {
                if (args->missing)
                        ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
                else
                        ret = -ENOENT;
                return ret;
        }

        if (btrfs_pinned_by_swapfile(fs_info, device)) {
                btrfs_warn_in_rcu(fs_info,
                  "cannot remove device %s (devid %llu) due to active swapfile",
                                  btrfs_dev_name(device), device->devid);
                return -ETXTBSY;
        }

        if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
                return BTRFS_ERROR_DEV_TGT_REPLACE;

        /* Refuse to remove the last writeable device. */
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            fs_info->fs_devices->rw_devices == 1)
                return BTRFS_ERROR_DEV_ONLY_WRITABLE;

        /*
         * Take the device off the allocation list before shrinking it.  The
         * error_undo label puts it back if shrinking or starting the
         * transaction fails.
         */
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                mutex_lock(&fs_info->chunk_mutex);
                list_del_init(&device->dev_alloc_list);
                device->fs_devices->rw_devices--;
                mutex_unlock(&fs_info->chunk_mutex);
        }

        ret = btrfs_shrink_device(device, 0);
        if (ret)
                goto error_undo;

        trans = btrfs_start_transaction(fs_info->chunk_root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto error_undo;
        }

        ret = btrfs_rm_dev_item(trans, device);
        if (ret) {
                /*
                 * Any error in dev item removal is critical.  The transaction
                 * is aborted and the allocation list change is not undone.
                 */
                btrfs_crit(fs_info,
                           "failed to remove device item for devid %llu: %d",
                           device->devid, ret);
                btrfs_abort_transaction(trans, ret);
                btrfs_end_transaction(trans);
                return ret;
        }

        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        btrfs_scrub_cancel_dev(device);

        /*
         * the device list mutex makes sure that we don't change
         * the device list while someone else is writing out all
         * the device supers. Whoever is writing all supers, should
         * lock the device list mutex before getting the number of
         * devices in the super block (super_copy). Conversely,
         * whoever updates the number of devices in the super block
         * (super_copy) should hold the device list mutex.
         */

        /*
         * In normal cases the cur_devices == fs_devices. But in case
         * of deleting a seed device, the cur_devices should point to
         * its own fs_devices listed under the fs_devices->seed_list.
         */
        cur_devices = device->fs_devices;
        mutex_lock(&fs_devices->device_list_mutex);
        list_del_rcu(&device->dev_list);

        cur_devices->num_devices--;
        cur_devices->total_devices--;
        /* Update total_devices of the parent fs_devices if it's seed */
        if (cur_devices != fs_devices)
                fs_devices->total_devices--;

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                cur_devices->missing_devices--;

        /* Make sure s_bdev / latest_dev no longer reference this device. */
        btrfs_assign_next_active_device(device, NULL);

        if (device->bdev_file) {
                cur_devices->open_devices--;
                /* remove sysfs entry */
                btrfs_sysfs_remove_device(device);
        }

        num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
        btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
        mutex_unlock(&fs_devices->device_list_mutex);

        /*
         * At this point, the device is zero sized and detached from the
         * devices list.  All that's left is to zero out the old supers and
         * free the device.
         *
         * We cannot call btrfs_close_bdev() here because we're holding the sb
         * write lock, and fput() on the block device will pull in the
         * ->open_mutex on the block device and its dependencies.  Instead
         * just flush the device and let the caller do the final bdev_release.
         */
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                btrfs_scratch_superblocks(fs_info, device);
                if (device->bdev) {
                        sync_blockdev(device->bdev);
                        invalidate_bdev(device->bdev);
                }
        }

        /* Hand the open file to the caller before the device is freed. */
        *bdev_file = device->bdev_file;
        synchronize_rcu();
        btrfs_free_device(device);

        /*
         * This can happen if cur_devices is the private seed devices list.  We
         * cannot call close_fs_devices() here because it expects the uuid_mutex
         * to be held, but in fact we don't need that for the private
         * seed_devices, we can simply decrement cur_devices->opened and then
         * remove it from our list and free the fs_devices.
         */
        if (cur_devices->num_devices == 0) {
                list_del_init(&cur_devices->seed_list);
                ASSERT(cur_devices->opened == 1);
                cur_devices->opened--;
                free_fs_devices(cur_devices);
        }

        ret = btrfs_commit_transaction(trans);

        return ret;

error_undo:
        /* Put a writeable device back on the allocation list. */
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                mutex_lock(&fs_info->chunk_mutex);
                list_add(&device->dev_alloc_list,
                         &fs_devices->alloc_list);
                device->fs_devices->rw_devices++;
                mutex_unlock(&fs_info->chunk_mutex);
        }
        return ret;
}

/*
 * Unlink the source device of a device replace from its fs_devices and fix
 * up the device counters.
 *
 * The caller must hold the device_list_mutex of the filesystem's fs_devices.
 */
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
{
        /*
         * Without a seed, srcdev->fs_devices is the fs_devices of fs_info.
         * When a seed device is being replaced it is the seed's own
         * fs_devices.  Either way it is the right one to update.
         */
        struct btrfs_fs_devices *owner = srcdev->fs_devices;

        lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);

        list_del_rcu(&srcdev->dev_list);
        list_del(&srcdev->dev_alloc_list);

        owner->num_devices--;
        if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
                owner->missing_devices--;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
                owner->rw_devices--;
        if (srcdev->bdev)
                owner->open_devices--;
}

/*
 * Close and free the source device of a device replace.  If its fs_devices
 * has no devices left afterwards, that fs_devices is closed and freed too.
 *
 * Takes uuid_mutex for the duration of the call.
 */
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
        struct btrfs_fs_devices *owner = srcdev->fs_devices;

        mutex_lock(&uuid_mutex);

        btrfs_close_bdev(srcdev);
        synchronize_rcu();
        btrfs_free_device(srcdev);

        /* Devices remain: keep the fs_devices around. */
        if (owner->num_devices)
                goto unlock;

        /*
         * On a mounted FS, num_devices can't be zero unless it's a seed.  When
         * a seed device is replaced the replace target is added to the sprout
         * FS, so no device is left under the seed FS.
         */
        ASSERT(owner->seeding);

        list_del_init(&owner->seed_list);
        close_fs_devices(owner);
        free_fs_devices(owner);
unlock:
        mutex_unlock(&uuid_mutex);
}

void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
{
        struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;

        mutex_lock(&fs_devices->device_list_mutex);

        btrfs_sysfs_remove_device(tgtdev);

        if (tgtdev->bdev)
                fs_devices->open_devices--;

        fs_devices->num_devices--;

        btrfs_assign_next_active_device(tgtdev, NULL);

        list_del_rcu(&tgtdev->dev_list);

        mutex_unlock(&fs_devices->device_list_mutex);

        btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev);

        btrfs_close_bdev(tgtdev);
        synchronize_rcu();
        btrfs_free_device(tgtdev);
}

/*
 * Populate args from device at path.
 *
 * @fs_info:        the filesystem
 * @args:        the args to populate
 * @path:        the path to the device
 *
 * This will read the super block of the device at @path and populate @args with
 * the devid, fsid, and uuid.  This is meant to be used for ioctls that need to
 * lookup a device to operate on, but need to do it before we take any locks.
 * This properly handles the special case of "missing" that a user may pass in,
 * and does some basic sanity checks.  The caller must make sure that @path is
 * properly NUL terminated before calling in, and must call
 * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
 * uuid buffers.
 *
 * Return: 0 for success, -errno for failure
 */
int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
                                 struct btrfs_dev_lookup_args *args,
                                 const char *path)
{
        struct btrfs_super_block *disk_super;
        struct file *bdev_file;
        int ret;

        if (!path || !path[0])
                return -EINVAL;
        if (!strcmp(path, "missing")) {
                args->missing = true;
                return 0;
        }

        args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
        args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
        if (!args->uuid || !args->fsid) {
                btrfs_put_dev_args_from_path(args);
                return -ENOMEM;
        }

        ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
                                    &bdev_file, &disk_super);
        if (ret) {
                btrfs_put_dev_args_from_path(args);
                return ret;
        }

        args->devid = btrfs_stack_device_id(&disk_super->dev_item);
        memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
        if (btrfs_fs_incompat(fs_info, METADATA_UUID))
                memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
        else
                memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
        btrfs_release_disk_super(disk_super);
        fput(bdev_file);
        return 0;
}

/*
 * Free the ->uuid and ->fsid buffers of @args and reset both pointers.
 *
 * Pair this only with btrfs_get_dev_args_from_path(), which allocates those
 * buffers.  All other users point them at local variables that must not be
 * freed.
 */
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
{
        kfree(args->uuid);
        args->uuid = NULL;

        kfree(args->fsid);
        args->fsid = NULL;
}

/*
 * Look up a device of @fs_info either by @devid or by @device_path.
 *
 * A non-zero @devid takes precedence and @device_path is then ignored.
 * Otherwise the lookup arguments are read from the device at @device_path.
 *
 * Return: the device, ERR_PTR(-ENOENT) if no device matches, or the ERR_PTR()
 * of the error hit while reading the lookup arguments from @device_path.
 */
struct btrfs_device *btrfs_find_device_by_devspec(
                struct btrfs_fs_info *fs_info, u64 devid,
                const char *device_path)
{
        BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_device *device;
        int ret;

        if (devid) {
                args.devid = devid;
                device = btrfs_find_device(fs_info->fs_devices, &args);
        } else {
                ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
                if (ret)
                        return ERR_PTR(ret);
                device = btrfs_find_device(fs_info->fs_devices, &args);
                btrfs_put_dev_args_from_path(&args);
        }

        if (!device)
                return ERR_PTR(-ENOENT);
        return device;
}

/*
 * Prepare the fs_devices needed to sprout a seed filesystem.
 *
 * Must be called with uuid_mutex held.  A clone of the current fs_devices is
 * added to fs_uuids, and a newly allocated fs_devices is filled with a byte
 * copy of the current one, with ->opened set to 1, empty device and alloc
 * lists and a freshly initialized device_list_mutex.
 *
 * Return: the private seed fs_devices, or an ERR_PTR(): -EINVAL when the
 * filesystem is not seeding, otherwise the allocation or clone error.
 */
static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *cur = fs_info->fs_devices;
        struct btrfs_fs_devices *seed_copy;
        struct btrfs_fs_devices *registered;

        lockdep_assert_held(&uuid_mutex);
        if (!cur->seeding)
                return ERR_PTR(-EINVAL);

        /*
         * Private copy of the seed devices, to be anchored at
         * fs_info->fs_devices->seed_list.
         */
        seed_copy = alloc_fs_devices(NULL);
        if (IS_ERR(seed_copy))
                return seed_copy;

        /*
         * A copy of the original seed fs_devices has to stay in fs_uuids so
         * that filesystems which have been seeded can still reference the
         * seed device from open_seed_devices.  This also supports multiple
         * fs seed.
         */
        registered = clone_fs_devices(cur);
        if (IS_ERR(registered)) {
                kfree(seed_copy);
                return registered;
        }
        list_add(&registered->fs_list, &fs_uuids);

        memcpy(seed_copy, cur, sizeof(*seed_copy));
        seed_copy->opened = 1;
        INIT_LIST_HEAD(&seed_copy->devices);
        INIT_LIST_HEAD(&seed_copy->alloc_list);
        mutex_init(&seed_copy->device_list_mutex);

        return seed_copy;
}

/*
 * Splice seed devices into the sprout fs_devices.
 * Generate a new fsid for the sprouted read-write filesystem.
 */
static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
                               struct btrfs_fs_devices *seed_devices)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_super_block *disk_super = fs_info->super_copy;
        struct btrfs_device *device;
        u64 super_flags;

        /*
         * We are updating the fsid, the thread leading to device_list_add()
         * could race, so uuid_mutex is needed.
         */
        lockdep_assert_held(&uuid_mutex);

        /*
         * The threads listed below may traverse dev_list but can do that without
         * device_list_mutex:
         * - All device ops and balance - as we are in btrfs_exclop_start.
         * - Various dev_list readers - are using RCU.
         * - btrfs_ioctl_fitrim() - is using RCU.
         *
         * For-read threads as below are using device_list_mutex:
         * - Readonly scrub btrfs_scrub_dev()
         * - Readonly scrub btrfs_scrub_progress()
         * - btrfs_get_dev_stats()
         */
        lockdep_assert_held(&fs_devices->device_list_mutex);

        list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
                              synchronize_rcu);
        list_for_each_entry(device, &seed_devices->devices, dev_list)
                device->fs_devices = seed_devices;

        fs_devices->seeding = false;
        fs_devices->num_devices = 0;
        fs_devices->open_devices = 0;
        fs_devices->missing_devices = 0;
        fs_devices->rotating = false;
        list_add(&seed_devices->seed_list, &fs_devices->seed_list);

        generate_random_uuid(fs_devices->fsid);
        memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);

        super_flags = btrfs_super_flags(disk_super) &
                      ~BTRFS_SUPER_FLAG_SEEDING;
        btrfs_set_super_flags(disk_super, super_flags);
}

/*
 * Store the expected generation for seed devices in device items.
 *
 * Walks every dev item in the chunk root, starting from the key
 * (BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY, 0).  For each item the
 * matching in-memory device is looked up by devid, device uuid and fsid.  If
 * that device belongs to a seeding fs_devices, its generation is written into
 * the item.
 *
 * Return: 0 on success, -ENOMEM if no path could be allocated, or the negative
 * error returned by the tree search or by moving to the next leaf.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
        BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root = fs_info->chunk_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_dev_item *dev_item;
        struct btrfs_device *device;
        struct btrfs_key key;
        u8 fs_uuid[BTRFS_FSID_SIZE];
        u8 dev_uuid[BTRFS_UUID_SIZE];
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.offset = 0;
        key.type = BTRFS_DEV_ITEM_KEY;

        while (1) {
                /* (Re)start the search at the current key. */
                btrfs_reserve_chunk_metadata(trans, false);
                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
                btrfs_trans_release_chunk_metadata(trans);
                if (ret < 0)
                        goto error;

                leaf = path->nodes[0];
next_slot:
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        /*
                         * Past the last item of this leaf: step to the next
                         * leaf, remember the key found there, drop the path
                         * and restart the search from that key.
                         */
                        ret = btrfs_next_leaf(root, path);
                        if (ret > 0)
                                break;
                        if (ret < 0)
                                goto error;
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                        btrfs_release_path(path);
                        continue;
                }

                /* Stop as soon as the item is no longer a dev item. */
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
                    key.type != BTRFS_DEV_ITEM_KEY)
                        break;

                dev_item = btrfs_item_ptr(leaf, path->slots[0],
                                          struct btrfs_dev_item);
                args.devid = btrfs_device_id(leaf, dev_item);
                read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
                                   BTRFS_UUID_SIZE);
                read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
                                   BTRFS_FSID_SIZE);
                args.uuid = dev_uuid;
                args.fsid = fs_uuid;
                device = btrfs_find_device(fs_info->fs_devices, &args);
                BUG_ON(!device); /* Logic error */

                /* Only items of seed devices get their generation updated. */
                if (device->fs_devices->seeding) {
                        btrfs_set_device_generation(leaf, dev_item,
                                                    device->generation);
                        btrfs_mark_buffer_dirty(trans, leaf);
                }

                /* Continue with the next item of the same leaf. */
                path->slots[0]++;
                goto next_slot;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

/*
 * Add the block device at @device_path to a mounted filesystem.
 *
 * @fs_info:     the filesystem
 * @device_path: path of the block device to add
 *
 * The device is opened for writing, a new btrfs_device is set up for it and
 * linked into fs_devices, the super block copy is updated (total bytes and
 * number of devices) and a dev item is inserted, all within one transaction.
 *
 * If the filesystem is a seed filesystem, it is sprouted: the existing devices
 * are moved to a private seed fs_devices, a new fsid is generated, the super
 * block is switched from read-only to read-write and, after the commit, the
 * system chunks are relocated.
 *
 * Return: 0 on success or a negative errno, e.g. -EROFS for a read-only
 * filesystem that is not seeding, -EINVAL if the zone type check fails, and
 * -EEXIST if the block device is already part of the filesystem.
 */
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_trans_handle *trans;
        struct btrfs_device *device;
        struct file *bdev_file;
        struct super_block *sb = fs_info->sb;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_fs_devices *seed_devices = NULL;
        u64 orig_super_total_bytes;
        u64 orig_super_num_devices;
        int ret = 0;
        bool seeding_dev = false;
        bool locked = false;

        if (sb_rdonly(sb) && !fs_devices->seeding)
                return -EROFS;

        bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
                                        fs_info->bdev_holder, NULL);
        if (IS_ERR(bdev_file))
                return PTR_ERR(bdev_file);

        if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) {
                ret = -EINVAL;
                goto error;
        }

        /*
         * Sprouting a seed filesystem: hold s_umount and uuid_mutex until
         * after the transaction commit.  @locked tells the error path whether
         * they still need to be dropped.
         */
        if (fs_devices->seeding) {
                seeding_dev = true;
                down_write(&sb->s_umount);
                mutex_lock(&uuid_mutex);
                locked = true;
        }

        sync_blockdev(file_bdev(bdev_file));

        /* Refuse a block device that is already part of this filesystem. */
        rcu_read_lock();
        list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
                if (device->bdev == file_bdev(bdev_file)) {
                        ret = -EEXIST;
                        rcu_read_unlock();
                        goto error;
                }
        }
        rcu_read_unlock();

        device = btrfs_alloc_device(fs_info, NULL, NULL, device_path);
        if (IS_ERR(device)) {
                /* we can safely leave the fs_devices entry around */
                ret = PTR_ERR(device);
                goto error;
        }

        device->fs_info = fs_info;
        device->bdev_file = bdev_file;
        device->bdev = file_bdev(bdev_file);
        ret = lookup_bdev(device_path, &device->devt);
        if (ret)
                goto error_free_device;

        ret = btrfs_get_dev_zone_info(device, false);
        if (ret)
                goto error_free_device;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto error_free_zone;
        }

        /* Initialize the in-memory state of the new device. */
        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        device->generation = trans->transid;
        device->io_width = fs_info->sectorsize;
        device->io_align = fs_info->sectorsize;
        device->sector_size = fs_info->sectorsize;
        device->total_bytes =
                round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize);
        device->disk_total_bytes = device->total_bytes;
        device->commit_total_bytes = device->total_bytes;
        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
        device->dev_stats_valid = 1;
        set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);

        if (seeding_dev) {
                btrfs_clear_sb_rdonly(sb);

                /* GFP_KERNEL allocation must not be under device_list_mutex */
                seed_devices = btrfs_init_sprout(fs_info);
                if (IS_ERR(seed_devices)) {
                        ret = PTR_ERR(seed_devices);
                        btrfs_abort_transaction(trans, ret);
                        goto error_trans;
                }
        }

        mutex_lock(&fs_devices->device_list_mutex);
        if (seeding_dev) {
                btrfs_setup_sprout(fs_info, seed_devices);
                btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
                                                device);
        }

        device->fs_devices = fs_devices;

        /* Link the device in and account for it in fs_devices and the super. */
        mutex_lock(&fs_info->chunk_mutex);
        list_add_rcu(&device->dev_list, &fs_devices->devices);
        list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
        fs_devices->num_devices++;
        fs_devices->open_devices++;
        fs_devices->rw_devices++;
        fs_devices->total_devices++;
        fs_devices->total_rw_bytes += device->total_bytes;

        atomic64_add(device->total_bytes, &fs_info->free_chunk_space);

        if (!bdev_nonrot(device->bdev))
                fs_devices->rotating = true;

        /* The original super values are kept for the error_sysfs rollback. */
        orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
        btrfs_set_super_total_bytes(fs_info->super_copy,
                round_down(orig_super_total_bytes + device->total_bytes,
                           fs_info->sectorsize));

        orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
        btrfs_set_super_num_devices(fs_info->super_copy,
                                    orig_super_num_devices + 1);

        /*
         * we've got more storage, clear any full flags on the space
         * infos
         */
        btrfs_clear_space_info_full(fs_info);

        mutex_unlock(&fs_info->chunk_mutex);

        /* Add sysfs device entry */
        btrfs_sysfs_add_device(device);

        mutex_unlock(&fs_devices->device_list_mutex);

        if (seeding_dev) {
                mutex_lock(&fs_info->chunk_mutex);
                ret = init_first_rw_device(trans);
                mutex_unlock(&fs_info->chunk_mutex);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto error_sysfs;
                }
        }

        ret = btrfs_add_dev_item(trans, device);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto error_sysfs;
        }

        if (seeding_dev) {
                ret = btrfs_finish_sprout(trans);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto error_sysfs;
                }

                /*
                 * fs_devices now represents the newly sprouted filesystem and
                 * its fsid has been changed by btrfs_setup_sprout().
                 */
                btrfs_sysfs_update_sprout_fsid(fs_devices);
        }

        ret = btrfs_commit_transaction(trans);

        if (seeding_dev) {
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);
                locked = false;

                if (ret) /* transaction commit */
                        return ret;

                ret = btrfs_relocate_sys_chunks(fs_info);
                if (ret < 0)
                        btrfs_handle_fs_error(fs_info, ret,
                                    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
                /* Commit the running transaction, if there is one. */
                trans = btrfs_attach_transaction(root);
                if (IS_ERR(trans)) {
                        if (PTR_ERR(trans) == -ENOENT)
                                return 0;
                        ret = PTR_ERR(trans);
                        trans = NULL;
                        goto error_sysfs;
                }
                ret = btrfs_commit_transaction(trans);
        }

        /*
         * Now that we have written a new super block to this device, check all
         * other fs_devices list if device_path alienates any other scanned
         * device.
         * We can ignore the return value as it typically returns -EINVAL and
         * only succeeds if the device was an alien.
         */
        btrfs_forget_devices(device->devt);

        /* Update ctime/mtime for blkid or udev */
        update_dev_time(device_path);

        return ret;

error_sysfs:
        /* Undo the list insertion and all the accounting done above. */
        btrfs_sysfs_remove_device(device);
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        mutex_lock(&fs_info->chunk_mutex);
        list_del_rcu(&device->dev_list);
        list_del(&device->dev_alloc_list);
        fs_info->fs_devices->num_devices--;
        fs_info->fs_devices->open_devices--;
        fs_info->fs_devices->rw_devices--;
        fs_info->fs_devices->total_devices--;
        fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
        atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
        btrfs_set_super_total_bytes(fs_info->super_copy,
                                    orig_super_total_bytes);
        btrfs_set_super_num_devices(fs_info->super_copy,
                                    orig_super_num_devices);
        mutex_unlock(&fs_info->chunk_mutex);
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
        /* A seed filesystem goes back to read-only. */
        if (seeding_dev)
                btrfs_set_sb_rdonly(sb);
        if (trans)
                btrfs_end_transaction(trans);
error_free_zone:
        btrfs_destroy_dev_zone_info(device);
error_free_device:
        btrfs_free_device(device);
error:
        fput(bdev_file);
        /* Drop the sprout locks if they are still held. */
        if (locked) {
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);
        }
        return ret;
}

/*
 * Copy the in-memory fields of @device into its dev item in the chunk tree
 * and mark the leaf dirty.
 *
 * Return: 0 on success, -ENOMEM if no path could be allocated, -ENOENT if the
 * search did not land on the dev item, or the negative value returned by the
 * tree search.
 */
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
                                        struct btrfs_device *device)
{
        struct btrfs_root *chunk_root = device->fs_info->chunk_root;
        struct btrfs_dev_item *item;
        struct extent_buffer *eb;
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_search_slot(trans, chunk_root, &key, path, 0, 1);
        if (ret) {
                /* A positive result is reported to the caller as -ENOENT. */
                if (ret > 0)
                        ret = -ENOENT;
                goto out;
        }

        eb = path->nodes[0];
        item = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(eb, item, device->devid);
        btrfs_set_device_type(eb, item, device->type);
        btrfs_set_device_io_align(eb, item, device->io_align);
        btrfs_set_device_io_width(eb, item, device->io_width);
        btrfs_set_device_sector_size(eb, item, device->sector_size);
        btrfs_set_device_total_bytes(eb, item,
                                     btrfs_device_get_disk_total_bytes(device));
        btrfs_set_device_bytes_used(eb, item,
                                    btrfs_device_get_bytes_used(device));
        btrfs_mark_buffer_dirty(trans, eb);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Grow @device to @new_size (rounded down to the sector size) and account the
 * added space in the super block copy, the fs_devices totals and the free
 * chunk space counter, then write the updated dev item.
 *
 * Return: -EACCES if the device is not writeable, -EINVAL if the rounded size
 * is not larger than the current one or the device is a replace target,
 * otherwise the result of updating the dev item.
 */
int btrfs_grow_device(struct btrfs_trans_handle *trans,
                      struct btrfs_device *device, u64 new_size)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_super_block *sb_copy = fs_info->super_copy;
        u64 super_total;
        u64 delta;
        int ret;

        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
                return -EACCES;

        new_size = round_down(new_size, fs_info->sectorsize);

        mutex_lock(&fs_info->chunk_mutex);

        /* Only a real increase on a non replace-target device is accepted. */
        if (new_size <= device->total_bytes ||
            test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                mutex_unlock(&fs_info->chunk_mutex);
                return -EINVAL;
        }

        super_total = btrfs_super_total_bytes(sb_copy);
        delta = round_down(new_size - device->total_bytes, fs_info->sectorsize);

        btrfs_set_super_total_bytes(sb_copy,
                        round_down(super_total + delta, fs_info->sectorsize));
        device->fs_devices->total_rw_bytes += delta;
        atomic64_add(delta, &fs_info->free_chunk_space);

        btrfs_device_set_total_bytes(device, new_size);
        btrfs_device_set_disk_total_bytes(device, new_size);
        btrfs_clear_space_info_full(fs_info);

        /* Queue the device on the transaction's update list only once. */
        if (list_empty(&device->post_commit_list))
                list_add_tail(&device->post_commit_list,
                              &trans->transaction->dev_update_list);

        mutex_unlock(&fs_info->chunk_mutex);

        btrfs_reserve_chunk_metadata(trans, false);
        ret = btrfs_update_device(trans, device);
        btrfs_trans_release_chunk_metadata(trans);

        return ret;
}

/*
 * Delete the chunk item keyed by @chunk_offset from the chunk tree.
 *
 * A missing item or a failed deletion is reported through
 * btrfs_handle_fs_error() in addition to being returned.
 *
 * Return: 0 on success, -ENOMEM, -ENOENT if the item was not found, or
 * another negative errno from the search or deletion.
 */
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *chunk_root = fs_info->chunk_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.type = BTRFS_CHUNK_ITEM_KEY;
        key.offset = chunk_offset;

        ret = btrfs_search_slot(trans, chunk_root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                /* Logic error or corruption */
                btrfs_handle_fs_error(fs_info, -ENOENT,
                                      "Failed lookup while freeing chunk.");
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, chunk_root, path);
        if (ret < 0)
                btrfs_handle_fs_error(fs_info, ret,
                                      "Failed to delete chunk item.");
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Remove every entry for the chunk at @chunk_offset from the system chunk
 * array kept in the super block copy, shrinking the recorded array size.
 *
 * Must be called with fs_info->chunk_mutex held.
 *
 * Return: 0 on success, -EIO if an entry that is not a chunk item key is
 * encountered while walking the array.
 */
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
        struct btrfs_super_block *super_copy = fs_info->super_copy;
        u32 array_size;
        u32 cur = 0;
        u8 *ptr;

        lockdep_assert_held(&fs_info->chunk_mutex);

        array_size = btrfs_super_sys_array_size(super_copy);
        ptr = super_copy->sys_chunk_array;

        while (cur < array_size) {
                struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)ptr;
                struct btrfs_chunk *chunk;
                struct btrfs_key key;
                u32 num_stripes;
                u32 entry_len;

                btrfs_disk_key_to_cpu(&key, disk_key);
                if (key.type != BTRFS_CHUNK_ITEM_KEY)
                        return -EIO;

                /* Each entry is a disk key followed by the chunk item. */
                entry_len = sizeof(*disk_key);
                chunk = (struct btrfs_chunk *)(ptr + entry_len);
                num_stripes = btrfs_stack_chunk_num_stripes(chunk);
                entry_len += btrfs_chunk_item_size(num_stripes);

                if (key.objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID ||
                    key.offset != chunk_offset) {
                        /* Not the chunk being removed, step over it. */
                        ptr += entry_len;
                        cur += entry_len;
                        continue;
                }

                /*
                 * Close the gap left by the entry. The cursor stays put so
                 * that the entry moved into this position is examined next.
                 */
                memmove(ptr, ptr + entry_len, array_size - (cur + entry_len));
                array_size -= entry_len;
                btrfs_set_super_sys_array_size(super_copy, array_size);
        }

        return 0;
}

/*
 * Search the chunk mapping tree for a map covering, or at least intersecting,
 * the range starting at @logical and spanning @length bytes.
 *
 * This function takes no lock itself; btrfs_find_chunk_map() wraps it with
 * fs_info->mapping_tree_lock held for reading.
 *
 * Return: The matching chunk map with its reference count incremented, or
 * NULL if the tree is empty or no map intersects the range.
 */
struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
                                                    u64 logical, u64 length)
{
        struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node;
        struct rb_node *prev = NULL;
        struct rb_node *orig_prev;
        struct btrfs_chunk_map *map;
        struct btrfs_chunk_map *prev_map = NULL;

        /* Descend the tree looking for a map that contains @logical. */
        while (node) {
                map = rb_entry(node, struct btrfs_chunk_map, rb_node);
                prev = node;
                prev_map = map;

                if (logical < map->start) {
                        node = node->rb_left;
                } else if (logical >= map->start + map->chunk_len) {
                        node = node->rb_right;
                } else {
                        refcount_inc(&map->refs);
                        return map;
                }
        }

        /* Empty tree. */
        if (!prev)
                return NULL;

        /*
         * No exact hit. Starting from the last node visited, move forward
         * past every map that ends at or before @logical.
         */
        orig_prev = prev;
        while (prev && logical >= prev_map->start + prev_map->chunk_len) {
                prev = rb_next(prev);
                prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
        }

        /*
         * Ran off the end going forward: restart from the last node visited
         * and move backward past every map that starts after @logical.
         */
        if (!prev) {
                prev = orig_prev;
                prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
                while (prev && logical < prev_map->start) {
                        prev = rb_prev(prev);
                        prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
                }
        }

        if (prev) {
                u64 end = logical + length;

                /*
                 * Caller can pass a U64_MAX length when it wants to get any
                 * chunk starting at an offset of 'logical' or higher, so deal
                 * with overflow by resetting the end offset to U64_MAX.
                 */
                if (end < logical)
                        end = U64_MAX;

                /* Accept the neighbour only if it intersects the range. */
                if (end > prev_map->start &&
                    logical < prev_map->start + prev_map->chunk_len) {
                        refcount_inc(&prev_map->refs);
                        return prev_map;
                }
        }

        return NULL;
}

/*
 * Locked variant of btrfs_find_chunk_map_nolock(): performs the lookup with
 * fs_info->mapping_tree_lock held for reading and hands back its result.
 */
struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
                                             u64 logical, u64 length)
{
        struct btrfs_chunk_map *found;

        read_lock(&fs_info->mapping_tree_lock);
        found = btrfs_find_chunk_map_nolock(fs_info, logical, length);
        read_unlock(&fs_info->mapping_tree_lock);

        return found;
}

/*
 * Look up the chunk map that contains a logical address.
 *
 * @fs_info: Filesystem whose chunk maps are searched.
 * @logical: Logical block offset in bytes.
 * @length:  Length of extent in bytes.
 *
 * Return: The chunk map, holding a reference the caller must drop, or
 * ERR_PTR(-EINVAL) when no map is found or the map found does not contain
 * @logical.
 */
struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
                                            u64 logical, u64 length)
{
        struct btrfs_chunk_map *map = btrfs_find_chunk_map(fs_info, logical,
                                                           length);
        u64 map_end;

        if (unlikely(!map)) {
                btrfs_crit(fs_info,
                           "unable to find chunk map for logical %llu length %llu",
                           logical, length);
                return ERR_PTR(-EINVAL);
        }

        /*
         * The lookup may return a map that merely intersects the requested
         * range; insist that it actually contains @logical.
         */
        map_end = map->start + map->chunk_len;
        if (unlikely(map->start > logical || map_end <= logical)) {
                btrfs_crit(fs_info,
                           "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
                           logical, logical + length, map->start, map_end);
                btrfs_free_chunk_map(map);
                return ERR_PTR(-EINVAL);
        }

        /* Callers are responsible for dropping the reference. */
        return map;
}

/*
 * Update the dev item of every device backing a stripe of @map, then delete
 * the chunk item at @chunk_offset.
 *
 * Return: 0 on success, or the first error from a device update or from the
 * chunk item deletion.
 */
static int remove_chunk_item(struct btrfs_trans_handle *trans,
                             struct btrfs_chunk_map *map, u64 chunk_offset)
{
        int stripe = 0;

        /*
         * Removing chunk items and updating the device items in the chunks btree
         * requires holding the chunk_mutex.
         * See the comment at btrfs_chunk_alloc() for the details.
         */
        lockdep_assert_held(&trans->fs_info->chunk_mutex);

        while (stripe < map->num_stripes) {
                int err = btrfs_update_device(trans, map->stripes[stripe].dev);

                if (err)
                        return err;
                stripe++;
        }

        return btrfs_free_chunk(trans, chunk_offset);
}

/*
 * Remove the chunk at @chunk_offset: free its device extents, delete its
 * chunk item (plus its entry in the super block's system chunk array for
 * SYSTEM chunks) and finally remove its block group.
 *
 * Every failure after the chunk map lookup aborts the transaction.
 *
 * Return: 0 on success or a negative errno.
 */
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_chunk_map *map;
        u64 dev_extent_len = 0;
        int i, ret = 0;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

        map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
        if (IS_ERR(map)) {
                /*
                 * This is a logic error, but we don't want to just rely on the
                 * user having built with ASSERT enabled, so if ASSERT doesn't
                 * do anything we still error out.
                 */
                ASSERT(0);
                return PTR_ERR(map);
        }

        /*
         * First delete the device extent items from the devices btree.
         * We take the device_list_mutex to avoid racing with the finishing phase
         * of a device replace operation. See the comment below before acquiring
         * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
         * because that can result in a deadlock when deleting the device extent
         * items from the devices btree - COWing an extent buffer from the btree
         * may result in allocating a new metadata chunk, which would attempt to
         * lock again fs_info->chunk_mutex.
         */
        mutex_lock(&fs_devices->device_list_mutex);
        for (i = 0; i < map->num_stripes; i++) {
                struct btrfs_device *device = map->stripes[i].dev;
                ret = btrfs_free_dev_extent(trans, device,
                                            map->stripes[i].physical,
                                            &dev_extent_len);
                if (ret) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }

                /*
                 * Give the freed extent back to the device and to the free
                 * chunk space counter; the chunk_mutex is only held around
                 * this accounting update.
                 */
                if (device->bytes_used > 0) {
                        mutex_lock(&fs_info->chunk_mutex);
                        btrfs_device_set_bytes_used(device,
                                        device->bytes_used - dev_extent_len);
                        atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
                        btrfs_clear_space_info_full(fs_info);
                        mutex_unlock(&fs_info->chunk_mutex);
                }
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        /*
         * We acquire fs_info->chunk_mutex for 2 reasons:
         *
         * 1) Just like with the first phase of the chunk allocation, we must
         *    reserve system space, do all chunk btree updates and deletions, and
         *    update the system chunk array in the superblock while holding this
         *    mutex. This is for similar reasons as explained on the comment at
         *    the top of btrfs_chunk_alloc();
         *
         * 2) Prevent races with the final phase of a device replace operation
         *    that replaces the device object associated with the map's stripes,
         *    because the device object's id can change at any time during that
         *    final phase of the device replace operation
         *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
         *    replaced device and then see it with an ID of
         *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
         *    the device item, which does not exist on the chunk btree.
         *    The finishing phase of device replace acquires both the
         *    device_list_mutex and the chunk_mutex, in that order, so we are
         *    safe by just acquiring the chunk_mutex.
         */
        /*
         * removing_chunk also tells the 'out' label below that the
         * chunk_mutex is held and must be released there.
         */
        trans->removing_chunk = true;
        mutex_lock(&fs_info->chunk_mutex);

        check_system_chunk(trans, map->type);

        ret = remove_chunk_item(trans, map, chunk_offset);
        /*
         * Normally we should not get -ENOSPC since we reserved space before
         * through the call to check_system_chunk().
         *
         * Despite our system space_info having enough free space, we may not
         * be able to allocate extents from its block groups, because all have
         * an incompatible profile, which will force us to allocate a new system
         * block group with the right profile, or right after we called
         * check_system_chunk() above, a scrub turned the only system block group
         * with enough free space into RO mode.
         * This is explained with more detail at do_chunk_alloc().
         *
         * So if we get -ENOSPC, allocate a new system chunk and retry once.
         */
        if (ret == -ENOSPC) {
                const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
                struct btrfs_block_group *sys_bg;

                sys_bg = btrfs_create_chunk(trans, sys_flags);
                if (IS_ERR(sys_bg)) {
                        ret = PTR_ERR(sys_bg);
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }

                ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }

                ret = remove_chunk_item(trans, map, chunk_offset);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
        } else if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len);

        /* SYSTEM chunks are also listed in the super block's sys chunk array. */
        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
                ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
        }

        mutex_unlock(&fs_info->chunk_mutex);
        trans->removing_chunk = false;

        /*
         * We are done with chunk btree updates and deletions, so release the
         * system space we previously reserved (with check_system_chunk()).
         */
        btrfs_trans_release_chunk_metadata(trans);

        ret = btrfs_remove_block_group(trans, map);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

out:
        /* Error paths taken while the chunk_mutex was held land here. */
        if (trans->removing_chunk) {
                mutex_unlock(&fs_info->chunk_mutex);
                trans->removing_chunk = false;
        }
        /* once for us */
        btrfs_free_chunk_map(map);
        return ret;
}

/*
 * Relocate all extents of the chunk at @chunk_offset and then remove the
 * chunk itself.
 *
 * The caller must hold fs_info->reclaim_bgs_lock.
 *
 * Return: 0 on success, -EINVAL on an extent tree v2 filesystem, -ENOENT if
 * the block group cannot be found after relocation, or a negative errno from
 * the relocation, from starting the transaction or from removing the chunk.
 */
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
        struct btrfs_root *root = fs_info->chunk_root;
        struct btrfs_trans_handle *trans;
        struct btrfs_block_group *bg;
        u64 bg_len;
        int ret;

        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                btrfs_err(fs_info,
                          "relocate: not supported on extent tree v2 yet");
                return -EINVAL;
        }

        /*
         * Prevent races with automatic removal of unused block groups.
         * After we relocate and before we remove the chunk with offset
         * chunk_offset, automatic removal of the block group can kick in,
         * resulting in a failure when calling btrfs_remove_chunk() below.
         *
         * Make sure to acquire this mutex before doing a tree search (dev
         * or chunk trees) to find chunks. Otherwise the cleaner kthread might
         * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
         * we release the path used to search the chunk/dev tree and before
         * the current task acquires this mutex and calls us.
         */
        lockdep_assert_held(&fs_info->reclaim_bgs_lock);

        /* Step one: move every extent out of this chunk, with scrub paused. */
        btrfs_scrub_pause(fs_info);
        ret = btrfs_relocate_block_group(fs_info, chunk_offset);
        btrfs_scrub_continue(fs_info);
        if (ret) {
                /*
                 * If we had a transaction abort, stop all running scrubs.
                 * See transaction.c:cleanup_transaction() why we do it here.
                 */
                if (BTRFS_FS_ERROR(fs_info))
                        btrfs_scrub_cancel(fs_info);
                return ret;
        }

        bg = btrfs_lookup_block_group(fs_info, chunk_offset);
        if (!bg)
                return -ENOENT;

        btrfs_discard_cancel_work(&fs_info->discard_ctl, bg);
        bg_len = bg->length;
        btrfs_put_block_group(bg);

        /*
         * On a zoned file system, discard the whole block group, this will
         * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
         * resetting the zone fails, don't treat it as a fatal problem from the
         * filesystem's point of view.
         */
        if (btrfs_is_zoned(fs_info)) {
                ret = btrfs_discard_extent(fs_info, chunk_offset, bg_len, NULL);
                if (ret)
                        btrfs_info(fs_info,
                                "failed to reset zone %llu after relocation",
                                chunk_offset);
        }

        trans = btrfs_start_trans_remove_block_group(root->fs_info,
                                                     chunk_offset);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                btrfs_handle_fs_error(root->fs_info, ret, NULL);
                return ret;
        }

        /* Step two: delete the device extents and the chunk tree entries. */
        ret = btrfs_remove_chunk(trans, chunk_offset);
        btrfs_end_transaction(trans);

        return ret;
}

/*
 * Relocate every SYSTEM chunk found in the chunk tree.
 *
 * The chunk tree is walked from the highest chunk offset downwards. Each
 * search, and the relocation of the chunk it finds, runs with
 * fs_info->reclaim_bgs_lock held; the lock is dropped between iterations.
 * Chunks whose relocation fails with -ENOSPC are counted and the whole walk
 * is repeated once.
 *
 * Return: 0 on success, -ENOMEM if no path could be allocated, -EUCLEAN if
 * the search unexpectedly finds an exact match, -ENOSPC if chunks still
 * could not be relocated after the retry, or any other negative errno from
 * the tree search or from relocating a chunk.
 */
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *chunk_root = fs_info->chunk_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_chunk *chunk;
        struct btrfs_key key;
        struct btrfs_key found_key;
        u64 chunk_type;
        bool retried = false;
        int failed = 0;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

again:
        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;

        while (1) {
                mutex_lock(&fs_info->reclaim_bgs_lock);
                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
                if (ret < 0) {
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        goto error;
                }
                if (ret == 0) {
                        /*
                         * On the first search we would find chunk tree with
                         * offset -1, which is not possible. On subsequent
                         * loops this would find an existing item on an invalid
                         * offset (one less than the previous one, wrong
                         * alignment and size).
                         */
                        ret = -EUCLEAN;
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        goto error;
                }

                ret = btrfs_previous_item(chunk_root, path, key.objectid,
                                          key.type);
                /* Both exits below leave the loop, so drop the lock first. */
                if (ret)
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                if (ret < 0)
                        goto error;
                if (ret > 0)
                        break;

                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

                chunk = btrfs_item_ptr(leaf, path->slots[0],
                                       struct btrfs_chunk);
                chunk_type = btrfs_chunk_type(leaf, chunk);
                btrfs_release_path(path);

                if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
                        ret = btrfs_relocate_chunk(fs_info, found_key.offset);
                        if (ret == -ENOSPC) {
                                failed++;
                        } else if (ret) {
                                /*
                                 * Any other failure is handed back to the
                                 * caller through the normal error path rather
                                 * than crashing the kernel with BUG_ON().
                                 */
                                mutex_unlock(&fs_info->reclaim_bgs_lock);
                                goto error;
                        }
                }
                mutex_unlock(&fs_info->reclaim_bgs_lock);

                if (found_key.offset == 0)
                        break;
                /* Continue with the chunk just below the one handled. */
                key.offset = found_key.offset - 1;
        }
        ret = 0;
        if (failed && !retried) {
                /* Give the chunks that hit -ENOSPC one more chance. */
                failed = 0;
                retried = true;
                goto again;
        } else if (WARN_ON(failed && retried)) {
                ret = -ENOSPC;
        }
error:
        btrfs_free_path(path);
        return ret;
}

/*
 * Force allocation of a data chunk when the block group at @chunk_offset is
 * a data block group and the data space info reports no bytes in use.
 *
 * return 1 : allocate a data chunk successfully,
 * return <0: errors during allocating a data chunk,
 * return 0 : no need to allocate a data chunk.
 */
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
                                      u64 chunk_offset)
{
        struct btrfs_trans_handle *trans;
        struct btrfs_block_group *cache;
        u64 data_bytes_used;
        u64 bg_flags;
        int err;

        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
        ASSERT(cache);
        bg_flags = cache->flags;
        btrfs_put_block_group(cache);

        if (!(bg_flags & BTRFS_BLOCK_GROUP_DATA))
                return 0;

        spin_lock(&fs_info->data_sinfo->lock);
        data_bytes_used = fs_info->data_sinfo->bytes_used;
        spin_unlock(&fs_info->data_sinfo->lock);

        /* Data space is still in use, nothing to allocate. */
        if (data_bytes_used)
                return 0;

        trans = btrfs_join_transaction(fs_info->tree_root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        err = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
        btrfs_end_transaction(trans);

        return err < 0 ? err : 1;
}

/*
 * Convert on-disk (little-endian) balance arguments in @disk to CPU byte
 * order in @cpu. @cpu is zeroed first, so fields not listed here end up 0.
 */
static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
                                           const struct btrfs_disk_balance_args *disk)
{
        memset(cpu, 0, sizeof(*cpu));

        /* Control fields. */
        cpu->flags = le64_to_cpu(disk->flags);
        cpu->target = le64_to_cpu(disk->target);

        /* Filter values. */
        cpu->profiles = le64_to_cpu(disk->profiles);
        cpu->usage = le64_to_cpu(disk->usage);
        cpu->devid = le64_to_cpu(disk->devid);
        cpu->limit = le64_to_cpu(disk->limit);

        /* Range bounds. */
        cpu->pstart = le64_to_cpu(disk->pstart);
        cpu->pend = le64_to_cpu(disk->pend);
        cpu->vstart = le64_to_cpu(disk->vstart);
        cpu->vend = le64_to_cpu(disk->vend);

        /* Stripe count bounds are the only 32-bit fields. */
        cpu->stripes_min = le32_to_cpu(disk->stripes_min);
        cpu->stripes_max = le32_to_cpu(disk->stripes_max);
}

/*
 * Convert CPU byte order balance arguments in @cpu to their on-disk
 * (little-endian) form in @disk. @disk is zeroed first, so fields not listed
 * here end up 0.
 */
static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
                                           const struct btrfs_balance_args *cpu)
{
        memset(disk, 0, sizeof(*disk));

        /* Control fields. */
        disk->flags = cpu_to_le64(cpu->flags);
        disk->target = cpu_to_le64(cpu->target);

        /* Filter values. */
        disk->profiles = cpu_to_le64(cpu->profiles);
        disk->usage = cpu_to_le64(cpu->usage);
        disk->devid = cpu_to_le64(cpu->devid);
        disk->limit = cpu_to_le64(cpu->limit);

        /* Range bounds. */
        disk->pstart = cpu_to_le64(cpu->pstart);
        disk->pend = cpu_to_le64(cpu->pend);
        disk->vstart = cpu_to_le64(cpu->vstart);
        disk->vend = cpu_to_le64(cpu->vend);

        /* Stripe count bounds are the only 32-bit fields. */
        disk->stripes_min = cpu_to_le32(cpu->stripes_min);
        disk->stripes_max = cpu_to_le32(cpu->stripes_max);
}

static int insert_balance_item(struct btrfs_fs_info *fs_info,
                               struct btrfs_balance_control *bctl)
{
        struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_trans_handle *trans;
        struct btrfs_balance_item *item;
        struct btrfs_disk_balance_args disk_bargs;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        int ret, err;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }

        key.objectid = BTRFS_BALANCE_OBJECTID;
        key.type = BTRFS_TEMPORARY_ITEM_KEY;
        key.offset = 0;

        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*item));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

        memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));

        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
        btrfs_set_balance_data(leaf, item, &disk_bargs);
        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
        btrfs_set_balance_meta(leaf, item, &disk_bargs);
        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
        btrfs_set_balance_sys(leaf, item, &disk_bargs);

        btrfs_set_balance_flags(leaf, item, bctl->flags);

        btrfs_mark_buffer_dirty(trans, leaf);
out:
        btrfs_free_path(path);
        err = btrfs_commit_transaction(trans);
        if (err && !ret)
                ret = err;
        return ret;
}

/*
 * Delete the balance item from the tree root, inside its own transaction.
 *
 * The transaction is committed whether or not the item could be deleted.
 *
 * Return: 0 on success, -ENOMEM, the error from starting the transaction,
 * -ENOENT if the item does not exist, the error from the search or deletion,
 * or otherwise the commit result.
 */
static int del_balance_item(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_trans_handle *trans;
        struct btrfs_path *path;
        struct btrfs_key key;
        int commit_ret;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_start_transaction_fallback_global_rsv(tree_root, 0);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }

        key.objectid = BTRFS_BALANCE_OBJECTID;
        key.type = BTRFS_TEMPORARY_ITEM_KEY;
        key.offset = 0;

        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
        else if (ret == 0)
                ret = btrfs_del_item(trans, tree_root, path);

        btrfs_free_path(path);
        commit_ret = btrfs_commit_transaction(trans);

        /* A search/delete error takes precedence over the commit result. */
        return ret ? ret : commit_ret;
}

/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
        /*
         * Turn on soft mode for chunk types that were being converted.
         */
        if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
                bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
                bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
        if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
                bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

        /*
         * Turn on usage filter if is not already used.  The idea is
         * that chunks that we have already balanced should be
         * reasonably full.  Don't do it for chunks that are being
         * converted - that will keep us from relocating unconverted
         * (albeit full) chunks.
         */
        if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
            !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
            !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
                bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
                bctl->data.usage = 90;
        }
        if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
                bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
                bctl->sys.usage = 90;
        }
        if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
                bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
                bctl->meta.usage = 90;
        }
}

/*
 * Clear the balance status in fs_info and delete the balance item from disk.
 *
 * The balance control is detached under fs_info->balance_lock and freed
 * before the on-disk item is deleted; a failed deletion is reported through
 * btrfs_handle_fs_error().
 */
static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
        struct btrfs_balance_control *stale_ctl = fs_info->balance_ctl;
        int err;

        ASSERT(fs_info->balance_ctl);

        spin_lock(&fs_info->balance_lock);
        fs_info->balance_ctl = NULL;
        spin_unlock(&fs_info->balance_lock);

        kfree(stale_ctl);

        err = del_balance_item(fs_info);
        if (err)
                btrfs_handle_fs_error(fs_info, err, NULL);
}

/*
 * Balance filters.  Return 1 if chunk should be filtered out
 * (should not be balanced).
 */

/*
 * Profiles filter: the chunk passes (0) when the extended form of its
 * profile is among the profiles selected in @bargs.
 */
static int chunk_profiles_filter(u64 chunk_type,
                                 struct btrfs_balance_args *bargs)
{
        const u64 extended = chunk_to_extended(chunk_type) &
                             BTRFS_EXTENDED_PROFILE_MASK;

        return (bargs->profiles & extended) ? 0 : 1;
}

/*
 * Usage range filter.
 *
 * The block group at @chunk_offset passes (0) when its used bytes lie in
 * [low, high), where both bounds are derived from bargs->usage_min and
 * bargs->usage_max taken as percentages of the block group length:
 *  - usage_min == 0 gives a lower bound of 0
 *  - usage_max == 0 gives an upper bound of 1 byte
 *  - usage_max > 100 gives an upper bound equal to the full length
 *
 * Return 1 if the chunk should be filtered out, 0 otherwise.
 */
static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
                              struct btrfs_balance_args *bargs)
{
        struct btrfs_block_group *bg;
        u64 used;
        u64 low = 0;
        u64 high;
        int filtered;

        bg = btrfs_lookup_block_group(fs_info, chunk_offset);
        used = bg->used;

        if (bargs->usage_min != 0)
                low = mult_perc(bg->length, bargs->usage_min);

        if (bargs->usage_max == 0)
                high = 1;
        else if (bargs->usage_max > 100)
                high = bg->length;
        else
                high = mult_perc(bg->length, bargs->usage_max);

        filtered = (used < low || used >= high) ? 1 : 0;

        btrfs_put_block_group(bg);
        return filtered;
}

/*
 * Single-value usage filter.
 *
 * @fs_info:      filesystem the chunk belongs to
 * @chunk_offset: logical offset used to look up the block group
 * @bargs:        balance arguments; only bargs->usage is consulted
 *
 * The block group passes (0) when its used bytes are below a threshold
 * derived from bargs->usage taken as a percentage of the block group length:
 *  - usage == 0 gives a threshold of 1 byte (only completely unused block
 *    groups pass)
 *  - usage > 100 gives a threshold equal to the full length
 *
 * Return 1 if the chunk should be filtered out, 0 otherwise.
 */
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
                u64 chunk_offset, struct btrfs_balance_args *bargs)
{
        struct btrfs_block_group *cache;
        u64 chunk_used, user_thresh;
        int ret = 1;

        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
        chunk_used = cache->used;

        /*
         * Test the single-value field that this filter is defined on, the
         * same one the other two branches use.
         * NOTE(review): presumably usage and usage_min/usage_max overlay the
         * same storage; if so, testing usage_min here would only agree with
         * usage on little-endian hosts - confirm against the struct layout.
         */
        if (bargs->usage == 0)
                user_thresh = 1;
        else if (bargs->usage > 100)
                user_thresh = cache->length;
        else
                user_thresh = mult_perc(cache->length, bargs->usage);

        if (chunk_used < user_thresh)
                ret = 0;

        btrfs_put_block_group(cache);
        return ret;
}

/*
 * Device id filter: the chunk passes (0) as soon as one of its stripes is
 * found on device bargs->devid; it is filtered out (1) if none is.
 */
static int chunk_devid_filter(struct extent_buffer *leaf,
                              struct btrfs_chunk *chunk,
                              struct btrfs_balance_args *bargs)
{
        const int nr_stripes = btrfs_chunk_num_stripes(leaf, chunk);
        int idx = 0;

        while (idx < nr_stripes) {
                struct btrfs_stripe *cur = btrfs_stripe_nr(chunk, idx);

                if (btrfs_stripe_devid(leaf, cur) == bargs->devid)
                        return 0;
                idx++;
        }

        return 1;
}

/*
 * Compute (num_stripes - nparity) / ncopies using the nparity and ncopies
 * values of the btrfs_raid_array entry selected by the block group @type.
 */
static u64 calc_data_stripes(u64 type, int num_stripes)
{
        const int raid_idx = btrfs_bg_flags_to_raid_index(type);
        int without_parity = num_stripes - btrfs_raid_array[raid_idx].nparity;

        return without_parity / btrfs_raid_array[raid_idx].ncopies;
}

/*
 * Device range filter over [pstart, pend).
 *
 * Without the DEVID flag the filter is a no-op and the chunk passes (0).
 * Otherwise the chunk passes when any stripe located on bargs->devid
 * overlaps the range; the per-stripe length is the chunk length divided by
 * the value returned by calc_data_stripes().
 *
 * Return 1 if the chunk should be filtered out, 0 otherwise.
 */
static int chunk_drange_filter(struct extent_buffer *leaf,
                               struct btrfs_chunk *chunk,
                               struct btrfs_balance_args *bargs)
{
        const int nr_stripes = btrfs_chunk_num_stripes(leaf, chunk);
        int data_stripes;
        int idx;

        if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
                return 0;

        data_stripes = calc_data_stripes(btrfs_chunk_type(leaf, chunk),
                                         nr_stripes);

        for (idx = 0; idx < nr_stripes; idx++) {
                struct btrfs_stripe *cur = btrfs_stripe_nr(chunk, idx);
                u64 start;
                u64 len;

                if (btrfs_stripe_devid(leaf, cur) != bargs->devid)
                        continue;

                start = btrfs_stripe_offset(leaf, cur);
                len = div_u64(btrfs_chunk_length(leaf, chunk), data_stripes);

                if (start >= bargs->pend)
                        continue;
                if (start + len > bargs->pstart)
                        return 0;
        }

        return 1;
}

/*
 * Logical range filter over [vstart, vend): the chunk passes (0) when at
 * least part of [chunk_offset, chunk_offset + chunk length) lies inside the
 * range, and is filtered out (1) otherwise.
 */
static int chunk_vrange_filter(struct extent_buffer *leaf,
                               struct btrfs_chunk *chunk,
                               u64 chunk_offset,
                               struct btrfs_balance_args *bargs)
{
        u64 chunk_end;

        if (chunk_offset >= bargs->vend)
                return 1;

        chunk_end = chunk_offset + btrfs_chunk_length(leaf, chunk);

        return (chunk_end > bargs->vstart) ? 0 : 1;
}

/*
 * Stripe count filter: the chunk passes (0) when its number of stripes is
 * within [stripes_min, stripes_max] (both inclusive), else it is filtered
 * out (1).
 */
static int chunk_stripes_range_filter(struct extent_buffer *leaf,
                               struct btrfs_chunk *chunk,
                               struct btrfs_balance_args *bargs)
{
        const int nr_stripes = btrfs_chunk_num_stripes(leaf, chunk);

        if (nr_stripes < bargs->stripes_min ||
            nr_stripes > bargs->stripes_max)
                return 1;

        return 0;
}

/*
 * Soft convert filter: when converting, filter out (1) chunks whose extended
 * profile already equals bargs->target.  Without the CONVERT flag every
 * chunk passes (0).
 */
static int chunk_soft_convert_filter(u64 chunk_type,
                                     struct btrfs_balance_args *bargs)
{
        u64 profile;

        if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
                return 0;

        profile = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK;

        return (bargs->target == profile) ? 1 : 0;
}

/*
 * Decide whether the chunk item @chunk (stored in @leaf, at logical offset
 * @chunk_offset) should be relocated by the current balance.
 *
 * The per-type arguments (data/sys/meta) are picked from
 * fs_info->balance_ctl according to the chunk type, then every enabled
 * filter is applied in turn; the first filter that rejects the chunk ends
 * the evaluation.
 *
 * Return 1 if the chunk should be balanced, 0 otherwise.  As a side effect,
 * the limit filters decrement bargs->limit / bargs->limit_max for each chunk
 * that got past all the other filters.
 */
static int should_balance_chunk(struct extent_buffer *leaf,
                                struct btrfs_chunk *chunk, u64 chunk_offset)
{
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
        struct btrfs_balance_args *bargs = NULL;
        u64 chunk_type = btrfs_chunk_type(leaf, chunk);

        /* type filter: the chunk type must be one selected in bctl->flags */
        if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
              (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
                return 0;
        }

        /* Select the argument set matching the chunk type (data first). */
        if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
                bargs = &bctl->data;
        else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
                bargs = &bctl->sys;
        else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
                bargs = &bctl->meta;

        /* profiles filter */
        if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
            chunk_profiles_filter(chunk_type, bargs)) {
                return 0;
        }

        /*
         * usage filter: single value first; the range variant is evaluated
         * whenever the single-value condition did not reject the chunk
         */
        if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
            chunk_usage_filter(fs_info, chunk_offset, bargs)) {
                return 0;
        } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
            chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
                return 0;
        }

        /* devid filter */
        if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
            chunk_devid_filter(leaf, chunk, bargs)) {
                return 0;
        }

        /* drange filter, makes sense only with devid filter */
        if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
            chunk_drange_filter(leaf, chunk, bargs)) {
                return 0;
        }

        /* vrange filter */
        if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
            chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
                return 0;
        }

        /* stripes filter */
        if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
            chunk_stripes_range_filter(leaf, chunk, bargs)) {
                return 0;
        }

        /* soft profile changing mode */
        if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
            chunk_soft_convert_filter(chunk_type, bargs)) {
                return 0;
        }

        /*
         * limited by count, must be the last filter: it consumes one unit of
         * the remaining budget for every chunk that reaches this point
         */
        if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
                if (bargs->limit == 0)
                        return 0;
                else
                        bargs->limit--;
        } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
                /*
                 * Same logic as the 'limit' filter; the minimum cannot be
                 * determined here because we do not have the global information
                 * about the count of all chunks that satisfy the filters.
                 */
                if (bargs->limit_max == 0)
                        return 0;
                else
                        bargs->limit_max--;
        }

        return 1;
}

/*
 * Walk the chunk tree from the highest chunk offset downwards and relocate
 * every chunk accepted by should_balance_chunk().
 *
 * The walk is done twice.  The first ("counting") pass only tallies the
 * matching chunks in bctl->stat.expected and the per-type counters; the
 * second pass does the actual relocation and updates stat.considered and
 * stat.completed.
 *
 * Errors: -ENOMEM if the path cannot be allocated, -ECANCELED when a cancel
 * request (or, during the relocation pass, a pause request) is seen; errors
 * from btrfs_search_slot(), btrfs_may_alloc_data_chunk() and
 * btrfs_relocate_chunk() are propagated, except that -ENOSPC and -ETXTBSY
 * from relocation do not stop the walk.  If any relocation hit -ENOSPC and
 * ret is otherwise 0, -ENOSPC is returned.
 */
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
        struct btrfs_root *chunk_root = fs_info->chunk_root;
        u64 chunk_type;
        struct btrfs_chunk *chunk;
        struct btrfs_path *path = NULL;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf;
        int slot;
        int ret;
        int enospc_errors = 0;
        bool counting = true;
        /*
         * The single value limit and min/max limits use the same bytes in the
         * balance args.  Remember the original values here: the counting pass
         * decrements them through should_balance_chunk() and they are put
         * back before the relocation pass.
         */
        u64 limit_data = bctl->data.limit;
        u64 limit_meta = bctl->meta.limit;
        u64 limit_sys = bctl->sys.limit;
        u32 count_data = 0;
        u32 count_meta = 0;
        u32 count_sys = 0;
        int chunk_reserved = 0;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto error;
        }

        /* zero out stat counters */
        spin_lock(&fs_info->balance_lock);
        memset(&bctl->stat, 0, sizeof(bctl->stat));
        spin_unlock(&fs_info->balance_lock);
again:
        if (!counting) {
                /*
                 * The single value limit and min/max limits use the same bytes
                 * in the balance args, so restoring 'limit' restores both
                 * after the counting pass consumed them.
                 */
                bctl->data.limit = limit_data;
                bctl->meta.limit = limit_meta;
                bctl->sys.limit = limit_sys;
        }
        /* Start the search past the last possible chunk item. */
        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;

        while (1) {
                /* A pause request is only honoured in the relocation pass. */
                if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
                    atomic_read(&fs_info->balance_cancel_req)) {
                        ret = -ECANCELED;
                        goto error;
                }

                /*
                 * reclaim_bgs_lock is held from here until the chunk has been
                 * either skipped or relocated; every exit path below unlocks
                 * it exactly once.
                 */
                mutex_lock(&fs_info->reclaim_bgs_lock);
                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
                if (ret < 0) {
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        goto error;
                }

                /*
                 * this shouldn't happen, it means the last relocate
                 * failed
                 */
                if (ret == 0)
                        BUG(); /* FIXME break ? */

                ret = btrfs_previous_item(chunk_root, path, 0,
                                          BTRFS_CHUNK_ITEM_KEY);
                if (ret) {
                        /* No previous chunk item: this pass is finished. */
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        ret = 0;
                        break;
                }

                leaf = path->nodes[0];
                slot = path->slots[0];
                btrfs_item_key_to_cpu(leaf, &found_key, slot);

                if (found_key.objectid != key.objectid) {
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        break;
                }

                chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
                chunk_type = btrfs_chunk_type(leaf, chunk);

                if (!counting) {
                        spin_lock(&fs_info->balance_lock);
                        bctl->stat.considered++;
                        spin_unlock(&fs_info->balance_lock);
                }

                ret = should_balance_chunk(leaf, chunk, found_key.offset);

                /* leaf and chunk are not used past this point */
                btrfs_release_path(path);
                if (!ret) {
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        goto loop;
                }

                if (counting) {
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        spin_lock(&fs_info->balance_lock);
                        bctl->stat.expected++;
                        spin_unlock(&fs_info->balance_lock);

                        if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
                                count_data++;
                        else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
                                count_sys++;
                        else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
                                count_meta++;

                        goto loop;
                }

                /*
                 * Apply limit_min filter, no need to check if the LIMITS
                 * filter is used, limit_min is 0 by default
                 */
                if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
                                        count_data < bctl->data.limit_min)
                                || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
                                        count_meta < bctl->meta.limit_min)
                                || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
                                        count_sys < bctl->sys.limit_min)) {
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        goto loop;
                }

                if (!chunk_reserved) {
                        /*
                         * We may be relocating the only data chunk we have,
                         * which could potentially end up with losing data's
                         * raid profile, so lets allocate an empty one in
                         * advance.
                         */
                        ret = btrfs_may_alloc_data_chunk(fs_info,
                                                         found_key.offset);
                        if (ret < 0) {
                                mutex_unlock(&fs_info->reclaim_bgs_lock);
                                goto error;
                        } else if (ret == 1) {
                                chunk_reserved = 1;
                        }
                }

                ret = btrfs_relocate_chunk(fs_info, found_key.offset);
                mutex_unlock(&fs_info->reclaim_bgs_lock);
                if (ret == -ENOSPC) {
                        /* Remember the failure but keep walking. */
                        enospc_errors++;
                } else if (ret == -ETXTBSY) {
                        btrfs_info(fs_info,
           "skipping relocation of block group %llu due to active swapfile",
                                   found_key.offset);
                        ret = 0;
                } else if (ret) {
                        goto error;
                } else {
                        spin_lock(&fs_info->balance_lock);
                        bctl->stat.completed++;
                        spin_unlock(&fs_info->balance_lock);
                }
loop:
                /* Continue with the chunk item just below the current one. */
                if (found_key.offset == 0)
                        break;
                key.offset = found_key.offset - 1;
        }

        if (counting) {
                /* Counting is done, run the walk again to relocate. */
                btrfs_release_path(path);
                counting = false;
                goto again;
        }
error:
        btrfs_free_path(path);
        if (enospc_errors) {
                btrfs_info(fs_info, "%d enospc errors during balance",
                           enospc_errors);
                if (!ret)
                        ret = -ENOSPC;
        }

        return ret;
}

/*
 * Check that a profile is valid and reduced.
 *
 * @flags:     profile to validate; block group type bits are ignored
 * @extended:  if true @flags is treated as an extended profile
 *
 * Return 0 if bits outside the applicable profile mask are set.  An empty
 * profile is valid only for the non-extended form.  Otherwise the result is
 * whatever has_single_bit_set() reports for the profile bits.
 */
static int alloc_profile_is_valid(u64 flags, int extended)
{
        u64 mask;

        if (extended)
                mask = BTRFS_EXTENDED_PROFILE_MASK;
        else
                mask = BTRFS_BLOCK_GROUP_PROFILE_MASK;

        flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;

        /* 1) every bit outside the mask has to be clear */
        if (flags & ~mask)
                return 0;

        /* 2) "0" is valid for usual profiles only */
        if (!flags)
                return !extended;

        /* 3) a reduced profile has a single bit set */
        return has_single_bit_set(flags);
}

/*
 * Check the convert target of @bargs against the @allowed profile set.
 *
 * Return true when no conversion is requested, or when the target is a valid
 * extended profile with no bits outside of @allowed.  Otherwise log an error
 * naming @type and the target profile, and return false.
 */
static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
                const struct btrfs_balance_args *bargs,
                u64 allowed, const char *type)
{
        if (bargs->flags & BTRFS_BALANCE_ARGS_CONVERT) {
                if (!alloc_profile_is_valid(bargs->target, 1) ||
                    (bargs->target & ~allowed) != 0) {
                        btrfs_err(fs_info,
                                  "balance: invalid convert %s profile %s",
                                  type,
                                  btrfs_bg_type_to_raid_name(bargs->target));
                        return false;
                }
        }

        return true;
}

/*
 * Fill @buf with textual description of balance filter flags @bargs, up to
 * @size_buf including the terminating null. The output may be trimmed if it
 * does not fit into the provided buffer.
 *
 * @bargs:    balance arguments whose flags and values are described
 * @buf:      output buffer
 * @size_buf: size of @buf in bytes
 *
 * Each enabled filter is appended as "name=value," and the trailing comma is
 * removed at the end.  When no flag is set the result is an empty string, so
 * a caller that reuses one buffer for several calls never sees text left
 * over from a previous call.
 */
static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
                                 u32 size_buf)
{
        int ret;
        u32 size_bp = size_buf;
        char *bp = buf;
        u64 flags = bargs->flags;
        char tmp_buf[128] = {'\0'};

        if (!flags) {
                /* Nothing to describe: return an empty string, not stale data. */
                if (size_buf)
                        buf[0] = '\0';
                return;
        }

/*
 * Append helpers: format at the current position and bail out to
 * out_overflow when snprintf() fails or the text does not fit in the
 * remaining space; otherwise advance the position.
 */
#define CHECK_APPEND_NOARG(a)                                                \
        do {                                                                \
                ret = snprintf(bp, size_bp, (a));                        \
                if (ret < 0 || ret >= size_bp)                                \
                        goto out_overflow;                                \
                size_bp -= ret;                                                \
                bp += ret;                                                \
        } while (0)

#define CHECK_APPEND_1ARG(a, v1)                                        \
        do {                                                                \
                ret = snprintf(bp, size_bp, (a), (v1));                        \
                if (ret < 0 || ret >= size_bp)                                \
                        goto out_overflow;                                \
                size_bp -= ret;                                                \
                bp += ret;                                                \
        } while (0)

#define CHECK_APPEND_2ARG(a, v1, v2)                                        \
        do {                                                                \
                ret = snprintf(bp, size_bp, (a), (v1), (v2));                \
                if (ret < 0 || ret >= size_bp)                                \
                        goto out_overflow;                                \
                size_bp -= ret;                                                \
                bp += ret;                                                \
        } while (0)

        if (flags & BTRFS_BALANCE_ARGS_CONVERT)
                CHECK_APPEND_1ARG("convert=%s,",
                                  btrfs_bg_type_to_raid_name(bargs->target));

        if (flags & BTRFS_BALANCE_ARGS_SOFT)
                CHECK_APPEND_NOARG("soft,");

        if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
                btrfs_describe_block_groups(bargs->profiles, tmp_buf,
                                            sizeof(tmp_buf));
                CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
        }

        if (flags & BTRFS_BALANCE_ARGS_USAGE)
                CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);

        if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
                CHECK_APPEND_2ARG("usage=%u..%u,",
                                  bargs->usage_min, bargs->usage_max);

        if (flags & BTRFS_BALANCE_ARGS_DEVID)
                CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);

        if (flags & BTRFS_BALANCE_ARGS_DRANGE)
                CHECK_APPEND_2ARG("drange=%llu..%llu,",
                                  bargs->pstart, bargs->pend);

        if (flags & BTRFS_BALANCE_ARGS_VRANGE)
                CHECK_APPEND_2ARG("vrange=%llu..%llu,",
                                  bargs->vstart, bargs->vend);

        if (flags & BTRFS_BALANCE_ARGS_LIMIT)
                CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);

        if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
                CHECK_APPEND_2ARG("limit=%u..%u,",
                                bargs->limit_min, bargs->limit_max);

        if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
                CHECK_APPEND_2ARG("stripes=%u..%u,",
                                  bargs->stripes_min, bargs->stripes_max);

#undef CHECK_APPEND_2ARG
#undef CHECK_APPEND_1ARG
#undef CHECK_APPEND_NOARG

out_overflow:

        /*
         * Terminate over the last character of what was appended; if nothing
         * was appended make the buffer an empty string.
         */
        if (size_bp < size_buf)
                buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
        else
                buf[0] = '\0';
}

/*
 * Log a "balance: start ..." or "balance: resume ..." message describing the
 * flags and the per-type filters of fs_info->balance_ctl.
 *
 * The text is assembled in a temporary 1024-byte heap buffer; if that
 * allocation fails the function returns without logging anything.
 */
static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
        u32 size_buf = 1024;
        char tmp_buf[192] = {'\0'};
        char *buf;
        char *bp;
        u32 size_bp = size_buf;
        int ret;
        struct btrfs_balance_control *bctl = fs_info->balance_ctl;

        buf = kzalloc(size_buf, GFP_KERNEL);
        if (!buf)
                return;

        bp = buf;

/*
 * Append one formatted item at the current position; jump to out_overflow
 * when snprintf() fails or the item does not fit in the remaining space.
 */
#define CHECK_APPEND_1ARG(a, v1)                                        \
        do {                                                                \
                ret = snprintf(bp, size_bp, (a), (v1));                        \
                if (ret < 0 || ret >= size_bp)                                \
                        goto out_overflow;                                \
                size_bp -= ret;                                                \
                bp += ret;                                                \
        } while (0)

        if (bctl->flags & BTRFS_BALANCE_FORCE)
                CHECK_APPEND_1ARG("%s", "-f ");

        /* One "-d", "-m" or "-s" item per selected block group type. */
        if (bctl->flags & BTRFS_BALANCE_DATA) {
                describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
                CHECK_APPEND_1ARG("-d%s ", tmp_buf);
        }

        if (bctl->flags & BTRFS_BALANCE_METADATA) {
                describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
                CHECK_APPEND_1ARG("-m%s ", tmp_buf);
        }

        if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
                describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
                CHECK_APPEND_1ARG("-s%s ", tmp_buf);
        }

#undef CHECK_APPEND_1ARG

out_overflow:

        /* Whatever was assembled so far is still logged after an overflow. */
        if (size_bp < size_buf)
                buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
        btrfs_info(fs_info, "balance: %s %s",
                   (bctl->flags & BTRFS_BALANCE_RESUME) ?
                   "resume" : "start", buf);

        kfree(buf);
}

/*
 * Validate the balance request in @bctl, persist it and run the balance.
 *
 * Should be called with balance mutex held.  The mutex is dropped while
 * __btrfs_balance() runs and taken again afterwards.
 *
 * @fs_info: filesystem to balance
 * @bctl:    balance control; if the balance cannot be started it is released
 *           here (kfree(), or reset_balance_state() for a resumed balance)
 * @bargs:   optional ioctl arguments; when non-NULL they are zeroed and
 *           refilled by btrfs_update_ioctl_balance_args() after the run
 *
 * Return the status of __btrfs_balance(), or a negative errno (-EINVAL or
 * the insert_balance_item() error) if the balance could not be started.
 */
int btrfs_balance(struct btrfs_fs_info *fs_info,
                  struct btrfs_balance_control *bctl,
                  struct btrfs_ioctl_balance_args *bargs)
{
        u64 meta_target, data_target;
        u64 allowed;
        int mixed = 0;
        int ret;
        u64 num_devices;
        unsigned seq;
        bool reducing_redundancy;
        bool paused = false;
        int i;

        /* Refuse to start while closing or with a pause/cancel pending. */
        if (btrfs_fs_closing(fs_info) ||
            atomic_read(&fs_info->balance_pause_req) ||
            btrfs_should_cancel_balance(fs_info)) {
                ret = -EINVAL;
                goto out;
        }

        allowed = btrfs_super_incompat_flags(fs_info->super_copy);
        if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
                mixed = 1;

        /*
         * In case of mixed groups both data and meta should be picked,
         * and identical options should be given for both of them.
         */
        allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
        if (mixed && (bctl->flags & allowed)) {
                if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
                    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
                    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
                        btrfs_err(fs_info,
          "balance: mixed groups data and metadata options must be the same");
                        ret = -EINVAL;
                        goto out;
                }
        }

        /*
         * rw_devices will not change at the moment, device add/delete/replace
         * are exclusive
         */
        num_devices = fs_info->fs_devices->rw_devices;

        /*
         * SINGLE profile on-disk has no profile bit, but in-memory we have a
         * special bit for it, to make it easier to distinguish.  Thus we need
         * to set it manually, or balance would refuse the profile.
         */
        allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
        /* Add every profile whose minimum device count is satisfied. */
        for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
                if (num_devices >= btrfs_raid_array[i].devs_min)
                        allowed |= btrfs_raid_array[i].bg_flag;

        if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
            !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
            !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
                ret = -EINVAL;
                goto out;
        }

        /*
         * Allow to reduce metadata or system integrity only if force set for
         * profiles with redundancy (copies, parity)
         */
        allowed = 0;
        for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
                if (btrfs_raid_array[i].ncopies >= 2 ||
                    btrfs_raid_array[i].tolerated_failures >= 1)
                        allowed |= btrfs_raid_array[i].bg_flag;
        }
        /* Sample the avail_*_alloc_bits consistently under profiles_lock. */
        do {
                seq = read_seqbegin(&fs_info->profiles_lock);

                if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
                     (fs_info->avail_system_alloc_bits & allowed) &&
                     !(bctl->sys.target & allowed)) ||
                    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
                     (fs_info->avail_metadata_alloc_bits & allowed) &&
                     !(bctl->meta.target & allowed)))
                        reducing_redundancy = true;
                else
                        reducing_redundancy = false;

                /* if we're not converting, the target field is uninitialized */
                meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
                        bctl->meta.target : fs_info->avail_metadata_alloc_bits;
                data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
                        bctl->data.target : fs_info->avail_data_alloc_bits;
        } while (read_seqretry(&fs_info->profiles_lock, seq));

        if (reducing_redundancy) {
                if (bctl->flags & BTRFS_BALANCE_FORCE) {
                        btrfs_info(fs_info,
                           "balance: force reducing metadata redundancy");
                } else {
                        btrfs_err(fs_info,
        "balance: reduces metadata redundancy, use --force if you want this");
                        ret = -EINVAL;
                        goto out;
                }
        }

        /* Only a warning: the balance proceeds regardless. */
        if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
                btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
                btrfs_warn(fs_info,
        "balance: metadata profile %s has lower redundancy than data profile %s",
                                btrfs_bg_type_to_raid_name(meta_target),
                                btrfs_bg_type_to_raid_name(data_target));
        }

        /* -EEXIST is expected (and required) only when resuming, see below. */
        ret = insert_balance_item(fs_info, bctl);
        if (ret && ret != -EEXIST)
                goto out;

        if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
                /* Fresh balance: publish @bctl as the active control. */
                BUG_ON(ret == -EEXIST);
                BUG_ON(fs_info->balance_ctl);
                spin_lock(&fs_info->balance_lock);
                fs_info->balance_ctl = bctl;
                spin_unlock(&fs_info->balance_lock);
        } else {
                BUG_ON(ret != -EEXIST);
                spin_lock(&fs_info->balance_lock);
                update_balance_args(bctl);
                spin_unlock(&fs_info->balance_lock);
        }

        ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
        set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
        describe_balance_start_or_resume(fs_info);
        /* The balance itself runs without balance_mutex held. */
        mutex_unlock(&fs_info->balance_mutex);

        ret = __btrfs_balance(fs_info);

        mutex_lock(&fs_info->balance_mutex);
        if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
                btrfs_info(fs_info, "balance: paused");
                btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
                paused = true;
        }
        /*
         * Balance can be canceled by:
         *
         * - Regular cancel request
         *   Then ret == -ECANCELED and balance_cancel_req > 0
         *
         * - Fatal signal to "btrfs" process
         *   Either the signal caught by wait_reserve_ticket() and callers
         *   got -EINTR, or caught by btrfs_should_cancel_balance() and
         *   got -ECANCELED.
         *   Either way, in this case balance_cancel_req = 0, and
         *   ret == -EINTR or ret == -ECANCELED.
         *
         * So here we only check the return value to catch canceled balance.
         */
        else if (ret == -ECANCELED || ret == -EINTR)
                btrfs_info(fs_info, "balance: canceled");
        else
                btrfs_info(fs_info, "balance: ended with status: %d", ret);

        clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);

        /* Report the final state back through the ioctl arguments, if any. */
        if (bargs) {
                memset(bargs, 0, sizeof(*bargs));
                btrfs_update_ioctl_balance_args(fs_info, bargs);
        }

        /* We didn't pause, we can clean everything up. */
        if (!paused) {
                reset_balance_state(fs_info);
                btrfs_exclop_finish(fs_info);
        }

        wake_up(&fs_info->balance_wait_q);

        return ret;
out:
        /*
         * The balance was never started: release @bctl here.  A resumed
         * balance is torn down through reset_balance_state(), a fresh one is
         * simply freed.
         */
        if (bctl->flags & BTRFS_BALANCE_RESUME)
                reset_balance_state(fs_info);
        else
                kfree(bctl);
        btrfs_exclop_finish(fs_info);

        return ret;
}

/*
 * Thread function used to resume a balance: @data is the btrfs_fs_info.
 *
 * Runs btrfs_balance() on fs_info->balance_ctl, if one is set, with
 * balance_mutex held and inside an sb_start_write()/sb_end_write() section.
 * Return the btrfs_balance() result, or 0 when there was nothing to run.
 */
static int balance_kthread(void *data)
{
        struct btrfs_fs_info *fs_info = data;
        struct btrfs_balance_control *ctl;
        int status = 0;

        sb_start_write(fs_info->sb);
        mutex_lock(&fs_info->balance_mutex);

        ctl = fs_info->balance_ctl;
        if (ctl)
                status = btrfs_balance(fs_info, ctl, NULL);

        mutex_unlock(&fs_info->balance_mutex);
        sb_end_write(fs_info->sb);

        return status;
}

int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
        struct task_struct *tsk;

        mutex_lock(&fs_info->balance_mutex);
        if (!fs_info->balance_ctl) {
                mutex_unlock(&fs_info->balance_mutex);
                return 0;
        }
        mutex_unlock(&fs_info->balance_mutex);

        if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
                btrfs_info(fs_info, "balance: resume skipped");
                return 0;
        }

        spin_lock(&fs_info->super_lock);
        ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
        fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
        spin_unlock(&fs_info->super_lock);
        /*
         * A ro->rw remount sequence should continue with the paused balance
         * regardless of who pauses it, system or the user as of now, so set
         * the resume flag.
         */
        spin_lock(&fs_info->balance_lock);
        fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
        spin_unlock(&fs_info->balance_lock);

        tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
        return PTR_ERR_OR_ZERO(tsk);
}

/*
 * Look up the balance item in the tree root and, if present, rebuild the
 * in-memory balance control from it and install it in fs_info->balance_ctl.
 *
 * Returns 0 when no balance item exists or when the state was recovered,
 * -ENOMEM on allocation failure, or the negative error of the tree search.
 */
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
        struct btrfs_disk_balance_args disk_bargs;
        struct btrfs_balance_control *bctl;
        struct btrfs_balance_item *bi;
        struct extent_buffer *eb;
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_BALANCE_OBJECTID;
        key.type = BTRFS_TEMPORARY_ITEM_KEY;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
        if (ret) {
                /* A positive result means "no such item", which is fine. */
                if (ret > 0)
                        ret = 0;
                goto out;
        }

        bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
        if (!bctl) {
                ret = -ENOMEM;
                goto out;
        }

        eb = path->nodes[0];
        bi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_balance_item);

        /* Whatever was stored, a recovered balance is one to be resumed. */
        bctl->flags = btrfs_balance_flags(eb, bi) | BTRFS_BALANCE_RESUME;

        /* Convert the three on-disk argument sets to their CPU form. */
        btrfs_balance_data(eb, bi, &disk_bargs);
        btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
        btrfs_balance_meta(eb, bi, &disk_bargs);
        btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
        btrfs_balance_sys(eb, bi, &disk_bargs);
        btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

        /*
         * Claim the exclusive operation for the paused balance; it stays
         * paused until the user intervenes (cancel or umount). Failing to
         * claim it is not expected, since recovery happens at mount time
         * before any other exclusive op can compete. Should it happen anyway,
         * only warn: the balance is paused and fs_info::balance_ctl must
         * still be set up properly below.
         */
        if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
                btrfs_warn(fs_info,
        "balance: cannot set exclusive op status, resume manually");

        btrfs_release_path(path);

        mutex_lock(&fs_info->balance_mutex);
        BUG_ON(fs_info->balance_ctl);
        spin_lock(&fs_info->balance_lock);
        fs_info->balance_ctl = bctl;
        spin_unlock(&fs_info->balance_lock);
        mutex_unlock(&fs_info->balance_mutex);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Request a running balance to pause and wait until it has stopped running.
 *
 * Returns 0 once BTRFS_FS_BALANCE_RUNNING is clear again, or -ENOTCONN when
 * there is no balance control or the balance is not currently running.
 */
int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
        mutex_lock(&fs_info->balance_mutex);

        if (!fs_info->balance_ctl ||
            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
                mutex_unlock(&fs_info->balance_mutex);
                return -ENOTCONN;
        }

        /* Post the request, then sleep without holding the mutex. */
        atomic_inc(&fs_info->balance_pause_req);
        mutex_unlock(&fs_info->balance_mutex);

        wait_event(fs_info->balance_wait_q,
                   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));

        mutex_lock(&fs_info->balance_mutex);
        /* balance_ctl may have been torn down meanwhile; that is fine. */
        BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
        atomic_dec(&fs_info->balance_pause_req);
        mutex_unlock(&fs_info->balance_mutex);

        return 0;
}

/*
 * Cancel the current balance, whether it is running or only paused.
 *
 * A running balance is asked to stop through balance_cancel_req and waited
 * for; a balance that is not running has its state reset directly here.
 *
 * Returns -ENOTCONN if no balance control is set, -EROFS if the superblock
 * is read-only, and 0 otherwise.
 */
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
        mutex_lock(&fs_info->balance_mutex);
        if (!fs_info->balance_ctl) {
                mutex_unlock(&fs_info->balance_mutex);
                return -ENOTCONN;
        }

        /*
         * A paused balance with the item stored on disk can be resumed at
         * mount time if the mount is read-write. Otherwise it's still paused
         * and we must not allow cancelling as it deletes the item.
         */
        if (sb_rdonly(fs_info->sb)) {
                mutex_unlock(&fs_info->balance_mutex);
                return -EROFS;
        }

        /* Raised for the duration of the cancel; dropped before returning. */
        atomic_inc(&fs_info->balance_cancel_req);
        /*
         * if we are running just wait and return, balance item is
         * deleted in btrfs_balance in this case
         */
        if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
                mutex_unlock(&fs_info->balance_mutex);
                wait_event(fs_info->balance_wait_q,
                           !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
                mutex_lock(&fs_info->balance_mutex);
        } else {
                mutex_unlock(&fs_info->balance_mutex);
                /*
                 * Lock released to allow other waiters to continue, we'll
                 * reexamine the status again.
                 */
                mutex_lock(&fs_info->balance_mutex);

                /* balance_ctl may have gone away while the mutex was dropped. */
                if (fs_info->balance_ctl) {
                        reset_balance_state(fs_info);
                        btrfs_exclop_finish(fs_info);
                        btrfs_info(fs_info, "balance: canceled");
                }
        }

        ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
        atomic_dec(&fs_info->balance_cancel_req);
        mutex_unlock(&fs_info->balance_mutex);
        return 0;
}

/*
 * Kthread body that scans the tree root for root items and adds UUID tree
 * entries for every referenced root that carries a non-empty uuid and/or
 * received_uuid.
 *
 * @data is the struct btrfs_fs_info of the filesystem.
 *
 * Always returns 0; a failure is only reported through btrfs_warn(). When the
 * scan finishes without error and was not cut short by the filesystem
 * closing, BTRFS_FS_UPDATE_UUID_TREE_GEN is set. uuid_tree_rescan_sem is
 * released on every exit path.
 */
int btrfs_uuid_scan_kthread(void *data)
{
        struct btrfs_fs_info *fs_info = data;
        struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_key key;
        struct btrfs_path *path = NULL;
        int ret = 0;
        struct extent_buffer *eb;
        int slot;
        struct btrfs_root_item root_item;
        u32 item_size;
        struct btrfs_trans_handle *trans = NULL;
        bool closing = false;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        /* Start from the smallest possible root item key. */
        key.objectid = 0;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = 0;

        while (1) {
                if (btrfs_fs_closing(fs_info)) {
                        closing = true;
                        break;
                }
                ret = btrfs_search_forward(root, &key, path,
                                BTRFS_OLDEST_GENERATION);
                if (ret) {
                        /* A positive result ends the scan without an error. */
                        if (ret > 0)
                                ret = 0;
                        break;
                }

                /*
                 * Only root items in the free objectid range, plus the
                 * default FS tree, are of interest.
                 */
                if (key.type != BTRFS_ROOT_ITEM_KEY ||
                    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
                     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
                    key.objectid > BTRFS_LAST_FREE_OBJECTID)
                        goto skip;

                eb = path->nodes[0];
                slot = path->slots[0];
                item_size = btrfs_item_size(eb, slot);
                /* Items smaller than the current root item struct are skipped. */
                if (item_size < sizeof(root_item))
                        goto skip;

                read_extent_buffer(eb, &root_item,
                                   btrfs_item_ptr_offset(eb, slot),
                                   (int)sizeof(root_item));
                if (btrfs_root_refs(&root_item) == 0)
                        goto skip;

                if (!btrfs_is_empty_uuid(root_item.uuid) ||
                    !btrfs_is_empty_uuid(root_item.received_uuid)) {
                        if (trans)
                                goto update_tree;

                        btrfs_release_path(path);
                        /*
                         * 1 - subvol uuid item
                         * 1 - received_subvol uuid item
                         */
                        trans = btrfs_start_transaction(fs_info->uuid_root, 2);
                        if (IS_ERR(trans)) {
                                ret = PTR_ERR(trans);
                                break;
                        }
                        /*
                         * The path was released to start the transaction, so
                         * search for the same key again; the next pass takes
                         * the update_tree branch above.
                         */
                        continue;
                } else {
                        goto skip;
                }
update_tree:
                btrfs_release_path(path);
                if (!btrfs_is_empty_uuid(root_item.uuid)) {
                        ret = btrfs_uuid_tree_add(trans, root_item.uuid,
                                                  BTRFS_UUID_KEY_SUBVOL,
                                                  key.objectid);
                        if (ret < 0) {
                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
                                        ret);
                                break;
                        }
                }

                if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
                        ret = btrfs_uuid_tree_add(trans,
                                                  root_item.received_uuid,
                                                 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                                  key.objectid);
                        if (ret < 0) {
                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
                                        ret);
                                break;
                        }
                }

skip:
                btrfs_release_path(path);
                /* One transaction per processed root; end it before moving on. */
                if (trans) {
                        ret = btrfs_end_transaction(trans);
                        trans = NULL;
                        if (ret)
                                break;
                }

                /*
                 * Advance the search key to the next possible key, carrying
                 * over from offset to type to objectid; stop once the key
                 * space is exhausted.
                 */
                if (key.offset < (u64)-1) {
                        key.offset++;
                } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
                        key.offset = 0;
                        key.type = BTRFS_ROOT_ITEM_KEY;
                } else if (key.objectid < (u64)-1) {
                        key.offset = 0;
                        key.type = BTRFS_ROOT_ITEM_KEY;
                        key.objectid++;
                } else {
                        break;
                }
                cond_resched();
        }

out:
        btrfs_free_path(path);
        /* trans may be NULL, a live handle, or an error pointer at this point. */
        if (trans && !IS_ERR(trans))
                btrfs_end_transaction(trans);
        if (ret)
                btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
        else if (!closing)
                set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
        up(&fs_info->uuid_tree_rescan_sem);
        return 0;
}

/*
 * Create the UUID tree, commit it, and start the kthread that populates it.
 *
 * uuid_tree_rescan_sem is taken before the "btrfs-uuid" kthread is started;
 * it is released here only if the kthread could not be started.
 *
 * Returns 0 on success or a negative error from starting/committing the
 * transaction, creating the tree, or starting the kthread.
 */
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
        struct btrfs_trans_handle *trans;
        struct btrfs_root *new_root;
        struct task_struct *scanner;
        int err;

        /* Two units are reserved: one for the root node, one for the root item. */
        trans = btrfs_start_transaction(fs_info->tree_root, 2);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        new_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
        if (IS_ERR(new_root)) {
                err = PTR_ERR(new_root);
                btrfs_abort_transaction(trans, err);
                btrfs_end_transaction(trans);
                return err;
        }
        fs_info->uuid_root = new_root;

        err = btrfs_commit_transaction(trans);
        if (err)
                return err;

        down(&fs_info->uuid_tree_rescan_sem);
        scanner = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
        if (!IS_ERR(scanner))
                return 0;

        /* fs_info->update_uuid_tree_gen remains 0 in every error case. */
        btrfs_warn(fs_info, "failed to start uuid_scan task");
        up(&fs_info->uuid_tree_rescan_sem);

        return PTR_ERR(scanner);
}

/*
 * Shrink @device to @new_size (rounded down to the sector size).
 *
 * Shrinking a device means finding all of the device extents past the new
 * size, and then following the back refs to the chunks. The chunk relocation
 * code actually frees the device extent.
 *
 * The in-memory device size is lowered first; if anything fails afterwards,
 * the in-memory size and the free space accounting are restored to their
 * previous values before returning.
 *
 * Returns 0 on success, -EINVAL if @device is a replace target, -ENOMEM if
 * the path cannot be allocated, -ENOSPC if chunk relocation still fails with
 * -ENOSPC after one retry of the whole scan, or another negative error from
 * the transaction, search or relocation calls.
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_trans_handle *trans;
        struct btrfs_dev_extent *dev_extent = NULL;
        struct btrfs_path *path;
        u64 length;
        u64 chunk_offset;
        int ret;
        int slot;
        int failed = 0;
        bool retried = false;
        struct extent_buffer *l;
        struct btrfs_key key;
        struct btrfs_super_block *super_copy = fs_info->super_copy;
        u64 old_total = btrfs_super_total_bytes(super_copy);
        u64 old_size = btrfs_device_get_total_bytes(device);
        u64 diff;
        u64 start;
        u64 free_diff = 0;

        new_size = round_down(new_size, fs_info->sectorsize);
        start = new_size;
        diff = round_down(old_size - new_size, fs_info->sectorsize);

        if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
                return -EINVAL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        path->reada = READA_BACK;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }

        mutex_lock(&fs_info->chunk_mutex);

        /* Lower the in-memory size up front; undone at "done" on failure. */
        btrfs_device_set_total_bytes(device, new_size);
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                device->fs_devices->total_rw_bytes -= diff;

                /*
                 * The new free_chunk_space is new_size - used, so we have to
                 * subtract the delta of the old free_chunk_space which included
                 * old_size - used.  If used > new_size then just subtract this
                 * entire device's free space.
                 */
                if (device->bytes_used < new_size)
                        free_diff = (old_size - device->bytes_used) -
                                    (new_size - device->bytes_used);
                else
                        free_diff = old_size - device->bytes_used;
                atomic64_sub(free_diff, &fs_info->free_chunk_space);
        }

        /*
         * Once the device's size has been set to the new size, ensure all
         * in-memory chunks are synced to disk so that the loop below sees them
         * and relocates them accordingly.
         */
        if (contains_pending_extent(device, &start, diff)) {
                mutex_unlock(&fs_info->chunk_mutex);
                ret = btrfs_commit_transaction(trans);
                if (ret)
                        goto done;
        } else {
                mutex_unlock(&fs_info->chunk_mutex);
                btrfs_end_transaction(trans);
        }

again:
        /* Walk this device's extents backwards, starting from the highest offset. */
        key.objectid = device->devid;
        key.offset = (u64)-1;
        key.type = BTRFS_DEV_EXTENT_KEY;

        do {
                mutex_lock(&fs_info->reclaim_bgs_lock);
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0) {
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        goto done;
                }

                ret = btrfs_previous_item(root, path, 0, key.type);
                if (ret) {
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        if (ret < 0)
                                goto done;
                        /* No previous dev extent item: nothing left to relocate. */
                        ret = 0;
                        btrfs_release_path(path);
                        break;
                }

                l = path->nodes[0];
                slot = path->slots[0];
                btrfs_item_key_to_cpu(l, &key, path->slots[0]);

                /* The item found belongs to another device: we are done. */
                if (key.objectid != device->devid) {
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        btrfs_release_path(path);
                        break;
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                length = btrfs_dev_extent_length(l, dev_extent);

                /* This extent ends within the new size, so the walk can stop. */
                if (key.offset + length <= new_size) {
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        btrfs_release_path(path);
                        break;
                }

                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
                btrfs_release_path(path);

                /*
                 * We may be relocating the only data chunk we have,
                 * which could potentially end up with losing data's
                 * raid profile, so lets allocate an empty one in
                 * advance.
                 */
                ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
                if (ret < 0) {
                        mutex_unlock(&fs_info->reclaim_bgs_lock);
                        goto done;
                }

                ret = btrfs_relocate_chunk(fs_info, chunk_offset);
                mutex_unlock(&fs_info->reclaim_bgs_lock);
                if (ret == -ENOSPC) {
                        /* Remember the failure and keep going; see the retry below. */
                        failed++;
                } else if (ret) {
                        if (ret == -ETXTBSY) {
                                btrfs_warn(fs_info,
                   "could not shrink block group %llu due to active swapfile",
                                           chunk_offset);
                        }
                        goto done;
                }
        } while (key.offset-- > 0);

        /* Chunks that hit -ENOSPC get one more full pass before giving up. */
        if (failed && !retried) {
                failed = 0;
                retried = true;
                goto again;
        } else if (failed && retried) {
                ret = -ENOSPC;
                goto done;
        }

        /* Shrinking succeeded, else we would be at "done". */
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto done;
        }

        mutex_lock(&fs_info->chunk_mutex);
        /* Clear all state bits beyond the shrunk device size */
        clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
                          CHUNK_STATE_MASK);

        btrfs_device_set_disk_total_bytes(device, new_size);
        /* Queue the device on the transaction's update list if not already queued. */
        if (list_empty(&device->post_commit_list))
                list_add_tail(&device->post_commit_list,
                              &trans->transaction->dev_update_list);

        WARN_ON(diff > old_total);
        btrfs_set_super_total_bytes(super_copy,
                        round_down(old_total - diff, fs_info->sectorsize));
        mutex_unlock(&fs_info->chunk_mutex);

        btrfs_reserve_chunk_metadata(trans, false);
        /* Now btrfs_update_device() will change the on-disk size. */
        ret = btrfs_update_device(trans, device);
        btrfs_trans_release_chunk_metadata(trans);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                btrfs_end_transaction(trans);
        } else {
                ret = btrfs_commit_transaction(trans);
        }
done:
        btrfs_free_path(path);
        if (ret) {
                /* Undo the in-memory size and free space changes made at the top. */
                mutex_lock(&fs_info->chunk_mutex);
                btrfs_device_set_total_bytes(device, old_size);
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        device->fs_devices->total_rw_bytes += diff;
                        atomic64_add(free_diff, &fs_info->free_chunk_space);
                }
                mutex_unlock(&fs_info->chunk_mutex);
        }
        return ret;
}

/*
 * Append a (disk key, chunk item) pair to the system chunk array kept in the
 * in-memory superblock copy. The caller must hold fs_info->chunk_mutex.
 *
 * @key:       key of the chunk, stored in its on-disk form
 * @chunk:     chunk item to copy
 * @item_size: size of @chunk in bytes
 *
 * Returns 0 on success or -EFBIG if the pair does not fit into
 * BTRFS_SYSTEM_CHUNK_ARRAY_SIZE.
 */
static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
                           struct btrfs_key *key,
                           struct btrfs_chunk *chunk, int item_size)
{
        struct btrfs_super_block *sb = fs_info->super_copy;
        struct btrfs_disk_key disk_key;
        u32 used;
        u8 *dst;

        lockdep_assert_held(&fs_info->chunk_mutex);

        used = btrfs_super_sys_array_size(sb);
        if (used + item_size + sizeof(disk_key)
                        > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
                return -EFBIG;

        btrfs_cpu_key_to_disk(&disk_key, key);

        /* The key goes first, immediately followed by the chunk item. */
        dst = sb->sys_chunk_array + used;
        memcpy(dst, &disk_key, sizeof(disk_key));
        dst += sizeof(disk_key);
        memcpy(dst, chunk, item_size);

        /* Account for both the key and the item in the new array size. */
        item_size += sizeof(disk_key);
        btrfs_set_super_sys_array_size(sb, used + item_size);

        return 0;
}

/*
 * Comparison callback that orders struct btrfs_device_info entries in
 * descending order, by max_avail first and by total_avail as tie-breaker.
 *
 * Returns -1 if @a sorts before @b, 1 if it sorts after, 0 if both are equal.
 */
static int btrfs_cmp_device_info(const void *a, const void *b)
{
        const struct btrfs_device_info *lhs = a;
        const struct btrfs_device_info *rhs = b;

        if (lhs->max_avail != rhs->max_avail)
                return lhs->max_avail > rhs->max_avail ? -1 : 1;

        if (lhs->total_avail != rhs->total_avail)
                return lhs->total_avail > rhs->total_avail ? -1 : 1;

        return 0;
}

/*
 * Set the RAID56 incompat feature flag when @type contains any of the
 * BTRFS_BLOCK_GROUP_RAID56_MASK bits; otherwise do nothing.
 */
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
        if (type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                btrfs_set_fs_incompat(info, RAID56);
}

/*
 * Set the RAID1C34 incompat feature flag when @type contains the RAID1C3 or
 * the RAID1C4 block group bit; otherwise do nothing.
 */
static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
        const u64 raid1c34 = BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4;

        if (type & raid1c34)
                btrfs_set_fs_incompat(info, RAID1C34);
}

/*
 * Structure used internally for btrfs_create_chunk() function.
 * Wraps needed parameters.
 *
 * The profile-derived fields are filled in by init_alloc_chunk_ctl(), ndevs
 * by gather_device_info(), and the final sizes by decide_stripe_size().
 */
struct alloc_chunk_ctl {
        /* NOTE(review): presumably the logical start of the chunk - confirm */
        u64 start;
        /* Block group type and profile bits (BTRFS_BLOCK_GROUP_*) */
        u64 type;
        /* Total number of stripes to allocate */
        int num_stripes;
        /* sub_stripes info for map */
        int sub_stripes;
        /* Stripes per device */
        int dev_stripes;
        /* Maximum number of devices to use */
        int devs_max;
        /* Minimum number of devices to use */
        int devs_min;
        /* ndevs has to be a multiple of this */
        int devs_increment;
        /* Number of copies */
        int ncopies;
        /* Number of stripes worth of bytes to store parity information */
        int nparity;
        /* Upper bound for the size of a single stripe */
        u64 max_stripe_size;
        /* Upper bound for chunk_size */
        u64 max_chunk_size;
        /* Devices offering less free space than this are not considered */
        u64 dev_extent_min;
        /* Size chosen for each stripe */
        u64 stripe_size;
        /* stripe_size multiplied by the number of data stripes */
        u64 chunk_size;
        /* Number of devices to allocate from */
        int ndevs;
};

static void init_alloc_chunk_ctl_policy_regular(
                                struct btrfs_fs_devices *fs_devices,
                                struct alloc_chunk_ctl *ctl)
{
        struct btrfs_space_info *space_info;

        space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
        ASSERT(space_info);

        ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
        ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G);

        if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
                ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);

        /* We don't want a chunk larger than 10% of writable space */
        ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
                                  ctl->max_chunk_size);
        ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes);
}

/*
 * Fill in the size limits of @ctl for the zoned chunk allocation policy.
 *
 * A stripe is exactly one zone. The chunk size limit depends on the block
 * group type and is then capped at 10% of the writable space (rounded down
 * to the zone size), but never below the size needed for the minimum number
 * of data stripes of the profile.
 */
static void init_alloc_chunk_ctl_policy_zoned(
                                      struct btrfs_fs_devices *fs_devices,
                                      struct alloc_chunk_ctl *ctl)
{
        u64 zone_size = fs_devices->fs_info->zone_size;
        u64 type = ctl->type;
        int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
        int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
        u64 min_chunk_size = min_data_stripes * zone_size;
        u64 rw_cap;
        u64 limit;

        ctl->max_stripe_size = zone_size;

        if (type & BTRFS_BLOCK_GROUP_DATA) {
                ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
                                                 zone_size);
        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
                ctl->max_chunk_size = ctl->max_stripe_size;
        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
                ctl->max_chunk_size = 2 * ctl->max_stripe_size;
                ctl->devs_max = min_t(int, ctl->devs_max,
                                      BTRFS_MAX_DEVS_SYS_CHUNK);
        } else {
                /* Every chunk must be data, metadata or system. */
                BUG();
        }

        /* A chunk must not be larger than 10% of the writable space... */
        rw_cap = round_down(mult_perc(fs_devices->total_rw_bytes, 10),
                            zone_size);
        /* ...unless that would not even fit the minimum stripe count. */
        limit = max(rw_cap, min_chunk_size);
        if (limit < ctl->max_chunk_size)
                ctl->max_chunk_size = limit;

        ctl->dev_extent_min = zone_size * ctl->dev_stripes;
}

/*
 * Initialize @ctl from the RAID profile selected by ctl->type, then let the
 * chunk allocation policy of @fs_devices fill in the size limits.
 *
 * A devs_max of zero in the profile table is replaced by
 * BTRFS_MAX_DEVS(). An unknown allocation policy triggers BUG().
 */
static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
                                 struct alloc_chunk_ctl *ctl)
{
        int index = btrfs_bg_flags_to_raid_index(ctl->type);
        int devs_max = btrfs_raid_array[index].devs_max;

        if (devs_max == 0)
                devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);

        ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
        ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
        ctl->devs_max = devs_max;
        ctl->devs_min = btrfs_raid_array[index].devs_min;
        ctl->devs_increment = btrfs_raid_array[index].devs_increment;
        ctl->ncopies = btrfs_raid_array[index].ncopies;
        ctl->nparity = btrfs_raid_array[index].nparity;
        ctl->ndevs = 0;

        switch (fs_devices->chunk_alloc_policy) {
        case BTRFS_CHUNK_ALLOC_ZONED:
                init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
                break;
        case BTRFS_CHUNK_ALLOC_REGULAR:
                init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
                break;
        default:
                BUG();
        }
}

/*
 * Collect the devices of @fs_devices that can contribute to a new chunk.
 *
 * For every usable device on the alloc list, one entry is stored in
 * @devices_info with the offset and size of the free extent found on it.
 * The entries are then sorted with btrfs_cmp_device_info() and their count is
 * stored in ctl->ndevs.
 *
 * Returns 0 on success, or the error from find_free_dev_extent() if it fails
 * with anything other than -ENOSPC.
 */
static int gather_device_info(struct btrfs_fs_devices *fs_devices,
                              struct alloc_chunk_ctl *ctl,
                              struct btrfs_device_info *devices_info)
{
        struct btrfs_fs_info *info = fs_devices->fs_info;
        struct btrfs_device *device;
        u64 total_avail;
        /* The size of the free extent we would ideally get from each device. */
        u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
        int ret;
        int ndevs = 0;
        u64 max_avail;
        u64 dev_offset;

        /*
         * in the first pass through the devices list, we gather information
         * about the available holes on each device.
         */
        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
                if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        WARN(1, KERN_ERR
                               "BTRFS: read-only device in alloc_list\n");
                        continue;
                }

                /* Skip devices not in the fs metadata and replace targets. */
                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                        &device->dev_state) ||
                    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
                        continue;

                if (device->total_bytes > device->bytes_used)
                        total_avail = device->total_bytes - device->bytes_used;
                else
                        total_avail = 0;

                /* If there is no space on this device, skip it. */
                if (total_avail < ctl->dev_extent_min)
                        continue;

                ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
                                           &max_avail);
                if (ret && ret != -ENOSPC)
                        return ret;

                /* On success, record exactly the size that was asked for. */
                if (ret == 0)
                        max_avail = dev_extent_want;

                if (max_avail < ctl->dev_extent_min) {
                        if (btrfs_test_opt(info, ENOSPC_DEBUG))
                                btrfs_debug(info,
                        "%s: devid %llu has no free space, have=%llu want=%llu",
                                            __func__, device->devid, max_avail,
                                            ctl->dev_extent_min);
                        continue;
                }

                /* Never record more entries than there are rw devices. */
                if (ndevs == fs_devices->rw_devices) {
                        WARN(1, "%s: found more than %llu devices\n",
                             __func__, fs_devices->rw_devices);
                        break;
                }
                devices_info[ndevs].dev_offset = dev_offset;
                devices_info[ndevs].max_avail = max_avail;
                devices_info[ndevs].total_avail = total_avail;
                devices_info[ndevs].dev = device;
                ++ndevs;
        }
        ctl->ndevs = ndevs;

        /*
         * now sort the devices by hole size / available space
         */
        sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
             btrfs_cmp_device_info, NULL);

        return 0;
}

static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
                                      struct btrfs_device_info *devices_info)
{
        /* Number of stripes that count for block group size */
        int data_stripes;

        /*
         * The primary goal is to maximize the number of stripes, so use as
         * many devices as possible, even if the stripes are not maximum sized.
         *
         * The DUP profile stores more than one stripe per device, the
         * max_avail is the total size so we have to adjust.
         */
        ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
                                   ctl->dev_stripes);
        ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;

        /* This will have to be fixed for RAID1 and RAID10 over more drives */
        data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

        /*
         * Use the number of data stripes to figure out how big this chunk is
         * really going to be in terms of logical address space, and compare
         * that answer with the max chunk size. If it's higher, we try to
         * reduce stripe_size.
         */
        if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
                /*
                 * Reduce stripe_size, round it up to a 16MB boundary again and
                 * then use it, unless it ends up being even bigger than the
                 * previous value we had already.
                 */
                ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
                                                        data_stripes), SZ_16M),
                                       ctl->stripe_size);
        }

        /* Stripe size should not go beyond 1G. */
        ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);

        /* Align to BTRFS_STRIPE_LEN */
        ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
        ctl->chunk_size = ctl->stripe_size * data_stripes;

        return 0;
}

/*
 * Pick stripe_size, num_stripes and chunk_size for the zoned allocation
 * policy. The stripe size is always the zone size of the first device in
 * @devices_info; the number of devices is lowered instead when the chunk
 * would exceed ctl->max_chunk_size.
 *
 * Always returns 0.
 */
static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
                                    struct btrfs_device_info *devices_info)
{
        u64 zone_size = devices_info[0].dev->zone_info->zone_size;
        /* Number of stripes that count for block group size */
        int data_stripes;

        /*
         * Expected to hold since
         *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
         */
        ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);

        ctl->stripe_size = zone_size;
        ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
        data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

        /* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
        if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
                u64 max_stripes;

                max_stripes = div_u64(ctl->max_chunk_size * ctl->ncopies,
                                      ctl->stripe_size) + ctl->nparity;
                ctl->ndevs = div_u64(max_stripes, ctl->dev_stripes);
                ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
                data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
                ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
        }

        ctl->chunk_size = ctl->stripe_size * data_stripes;

        return 0;
}

/*
 * Settle the number of devices to use and dispatch to the policy specific
 * stripe size calculation.
 *
 * Returns -ENOSPC when fewer than ctl->devs_min devices are usable, otherwise
 * the result of the regular or zoned helper.
 */
static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
                              struct alloc_chunk_ctl *ctl,
                              struct btrfs_device_info *devices_info)
{
        struct btrfs_fs_info *fs_info = fs_devices->fs_info;

        /*
         * Trim ndevs to a multiple of devs_increment. That increment may be
         * any number, so the power-of-2-only round_down() cannot be used
         * here; rounddown() has no such restriction.
         */
        ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);

        if (ctl->ndevs < ctl->devs_min) {
                if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                        btrfs_debug(fs_info,
        "%s: not enough devices with free space: have=%d minimum required=%d",
                                    __func__, ctl->ndevs, ctl->devs_min);
                return -ENOSPC;
        }

        /* Never use more devices than the profile allows. */
        if (ctl->ndevs > ctl->devs_max)
                ctl->ndevs = ctl->devs_max;

        switch (fs_devices->chunk_alloc_policy) {
        case BTRFS_CHUNK_ALLOC_ZONED:
                return decide_stripe_size_zoned(ctl, devices_info);
        case BTRFS_CHUNK_ALLOC_REGULAR:
                return decide_stripe_size_regular(ctl, devices_info);
        default:
                BUG();
        }
}

/*
 * Set @bits (plus EXTENT_NOWAIT) in the alloc_state tree of every device the
 * chunk map has a stripe on, over the physical range of that stripe.
 */
static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits)
{
        int nr;

        for (nr = 0; nr < map->num_stripes; nr++) {
                struct btrfs_io_stripe *cur = &map->stripes[nr];

                /* The range end passed down is inclusive, hence the "- 1". */
                set_extent_bit(&cur->dev->alloc_state, cur->physical,
                               cur->physical + map->stripe_size - 1,
                               bits | EXTENT_NOWAIT, NULL);
        }
}

/*
 * Clear @bits (passed together with EXTENT_NOWAIT) in the alloc_state tree of
 * every device the chunk map has a stripe on, over the physical range of that
 * stripe.
 */
static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
{
        int nr;

        for (nr = 0; nr < map->num_stripes; nr++) {
                struct btrfs_io_stripe *cur = &map->stripes[nr];

                /* The range end passed down is inclusive, hence the "- 1". */
                __clear_extent_bit(&cur->dev->alloc_state, cur->physical,
                                   cur->physical + map->stripe_size - 1,
                                   bits | EXTENT_NOWAIT,
                                   NULL, NULL);
        }
}

/*
 * Unlink @map from the filesystem's chunk mapping tree.
 *
 * Under the mapping tree write lock the map's node is erased from the cached
 * rbtree and marked as unlinked, and the CHUNK_ALLOCATED bits covering the
 * map's stripes are cleared on their devices. After the lock is released the
 * reference held on behalf of the tree is dropped.
 */
void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
{
        write_lock(&fs_info->mapping_tree_lock);
        rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
        /* Mark the node as no longer linked into any tree. */
        RB_CLEAR_NODE(&map->rb_node);
        chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
        write_unlock(&fs_info->mapping_tree_lock);

        /* Once for the tree reference. */
        btrfs_free_chunk_map(map);
}

/*
 * Insert @map into the filesystem's chunk mapping tree, keyed by map->start.
 *
 * On success the stripes of the map are marked CHUNK_ALLOCATED on their
 * devices and any CHUNK_TRIMMED bits over the same ranges are cleared.
 *
 * Returns 0 on success, -EEXIST if a map with the same start already exists
 * (in which case nothing is modified).
 */
EXPORT_FOR_TESTS
int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
{
        struct rb_node **link;
        struct rb_node *parent = NULL;
        bool is_leftmost = true;
        int ret = 0;

        write_lock(&fs_info->mapping_tree_lock);

        link = &fs_info->mapping_tree.rb_root.rb_node;
        while (*link) {
                struct btrfs_chunk_map *cur;

                parent = *link;
                cur = rb_entry(parent, struct btrfs_chunk_map, rb_node);

                if (map->start == cur->start) {
                        ret = -EEXIST;
                        goto unlock;
                }

                if (map->start < cur->start) {
                        link = &parent->rb_left;
                } else {
                        /* Went right at least once: cannot be the leftmost node. */
                        link = &parent->rb_right;
                        is_leftmost = false;
                }
        }

        rb_link_node(&map->rb_node, parent, link);
        rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, is_leftmost);
        chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
        chunk_map_device_clear_bits(map, CHUNK_TRIMMED);

unlock:
        write_unlock(&fs_info->mapping_tree_lock);

        return ret;
}

/*
 * Allocate a chunk map with room for @num_stripes stripes.
 *
 * Only the reference count (set to 1) and the rbtree node (marked as not
 * linked) are initialized here; the memory comes from kmalloc(), so all other
 * fields must be filled in by the caller.
 *
 * Returns the new map, or NULL if the allocation failed.
 */
EXPORT_FOR_TESTS
struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp)
{
        struct btrfs_chunk_map *map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp);

        if (map) {
                refcount_set(&map->refs, 1);
                RB_CLEAR_NODE(&map->rb_node);
        }

        return map;
}

/*
 * Build the in-memory chunk map and the block group for a new chunk.
 *
 * @trans:        transaction handle the allocation belongs to
 * @ctl:          allocation parameters already decided by the caller (start,
 *                type, stripe and chunk sizes, device and stripe counts)
 * @devices_info: candidate devices; the first ctl->ndevs entries are used,
 *                each contributing ctl->dev_stripes stripes
 *
 * Returns the new block group, or an ERR_PTR() if the map allocation, the map
 * insertion or the block group creation failed.
 */
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
                        struct alloc_chunk_ctl *ctl,
                        struct btrfs_device_info *devices_info)
{
        struct btrfs_fs_info *info = trans->fs_info;
        struct btrfs_chunk_map *map;
        struct btrfs_block_group *block_group;
        u64 start = ctl->start;
        u64 type = ctl->type;
        int ret;
        int i;
        int j;

        map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
        if (!map)
                return ERR_PTR(-ENOMEM);

        map->start = start;
        map->chunk_len = ctl->chunk_size;
        map->stripe_size = ctl->stripe_size;
        map->type = type;
        map->io_align = BTRFS_STRIPE_LEN;
        map->io_width = BTRFS_STRIPE_LEN;
        map->sub_stripes = ctl->sub_stripes;
        map->num_stripes = ctl->num_stripes;

        /* Lay out dev_stripes consecutive stripes on each selected device. */
        for (i = 0; i < ctl->ndevs; ++i) {
                for (j = 0; j < ctl->dev_stripes; ++j) {
                        int s = i * ctl->dev_stripes + j;

                        map->stripes[s].dev = devices_info[i].dev;
                        map->stripes[s].physical = devices_info[i].dev_offset +
                                                   j * ctl->stripe_size;
                }
        }

        trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);

        ret = btrfs_add_chunk_map(info, map);
        if (ret) {
                /* The map never made it into the tree, drop our reference. */
                btrfs_free_chunk_map(map);
                return ERR_PTR(ret);
        }

        block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
        if (IS_ERR(block_group)) {
                /* Unlinks the map again and drops the tree's reference. */
                btrfs_remove_chunk_map(info, map);
                return block_group;
        }

        /*
         * Charge every stripe to its device and queue the device for the
         * transaction's device update list if it is not queued yet.
         *
         * The function-scope index is reused here; the loop previously
         * declared its own 'i', shadowing the outer one.
         */
        for (i = 0; i < map->num_stripes; i++) {
                struct btrfs_device *dev = map->stripes[i].dev;

                btrfs_device_set_bytes_used(dev,
                                            dev->bytes_used + ctl->stripe_size);
                if (list_empty(&dev->post_commit_list))
                        list_add_tail(&dev->post_commit_list,
                                      &trans->transaction->dev_update_list);
        }

        /* The space taken by all stripes is no longer free chunk space. */
        atomic64_sub(ctl->stripe_size * map->num_stripes,
                     &info->free_chunk_space);

        check_raid56_incompat_flag(info, type);
        check_raid1c34_incompat_flag(info, type);

        return block_group;
}

/*
 * Allocate a new chunk of the given @type.
 *
 * The caller must hold fs_info->chunk_mutex. The candidate devices are
 * gathered, the stripe size is decided and then the chunk is created.
 *
 * Returns the new block group or an ERR_PTR(): -EINVAL for an invalid profile
 * or a type without any block group type bit, -ENOSPC when there is no
 * writable device, -ENOMEM on allocation failure, or whatever error the
 * helpers report.
 */
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
                                            u64 type)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device_info *dev_infos;
        struct btrfs_block_group *bg;
        struct alloc_chunk_ctl ctl;
        int ret;

        lockdep_assert_held(&fs_info->chunk_mutex);

        if (!alloc_profile_is_valid(type, 0)) {
                ASSERT(0);
                return ERR_PTR(-EINVAL);
        }

        if (list_empty(&fs_devices->alloc_list)) {
                if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                        btrfs_debug(fs_info, "%s: no writable device", __func__);
                return ERR_PTR(-ENOSPC);
        }

        if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
                btrfs_err(fs_info, "invalid chunk type 0x%llx requested", type);
                ASSERT(0);
                return ERR_PTR(-EINVAL);
        }

        ctl.start = find_next_chunk(fs_info);
        ctl.type = type;
        init_alloc_chunk_ctl(fs_devices, &ctl);

        /* One slot per read-write device. */
        dev_infos = kcalloc(fs_devices->rw_devices, sizeof(*dev_infos),
                            GFP_NOFS);
        if (!dev_infos)
                return ERR_PTR(-ENOMEM);

        ret = gather_device_info(fs_devices, &ctl, dev_infos);
        if (ret >= 0)
                ret = decide_stripe_size(fs_devices, &ctl, dev_infos);

        if (ret < 0)
                bg = ERR_PTR(ret);
        else
                bg = create_chunk(trans, &ctl, dev_infos);

        kfree(dev_infos);
        return bg;
}

/*
 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
 * chunks.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
                                     struct btrfs_block_group *bg)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *chunk_root = fs_info->chunk_root;
        struct btrfs_key key;
        struct btrfs_chunk *chunk;
        struct btrfs_stripe *stripe;
        struct btrfs_chunk_map *map;
        size_t item_size;
        int i;
        int ret;

        /*
         * We take the chunk_mutex for 2 reasons:
         *
         * 1) Updates and insertions in the chunk btree must be done while holding
         *    the chunk_mutex, as well as updating the system chunk array in the
         *    superblock. See the comment on top of btrfs_chunk_alloc() for the
         *    details;
         *
         * 2) To prevent races with the final phase of a device replace operation
         *    that replaces the device object associated with the map's stripes,
         *    because the device object's id can change at any time during that
         *    final phase of the device replace operation
         *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
         *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
         *    which would cause a failure when updating the device item, which does
         *    not exists, or persisting a stripe of the chunk item with such ID.
         *    Here we can't use the device_list_mutex because our caller already
         *    has locked the chunk_mutex, and the final phase of device replace
         *    acquires both mutexes - first the device_list_mutex and then the
         *    chunk_mutex. Using any of those two mutexes protects us from a
         *    concurrent device replace.
         */
        lockdep_assert_held(&fs_info->chunk_mutex);

        map = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
        if (IS_ERR(map)) {
                ret = PTR_ERR(map);
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        item_size = btrfs_chunk_item_size(map->num_stripes);

        chunk = kzalloc(item_size, GFP_NOFS);
        if (!chunk) {
                ret = -ENOMEM;
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        for (i = 0; i < map->num_stripes; i++) {
                struct btrfs_device *device = map->stripes[i].dev;

                ret = btrfs_update_device(trans, device);
                if (ret)
                        goto out;
        }

        stripe = &chunk->stripe;
        for (i = 0; i < map->num_stripes; i++) {
                struct btrfs_device *device = map->stripes[i].dev;
                const u64 dev_offset = map->stripes[i].physical;

                btrfs_set_stack_stripe_devid(stripe, device->devid);
                btrfs_set_stack_stripe_offset(stripe, dev_offset);
                memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
                stripe++;
        }

        btrfs_set_stack_chunk_length(chunk, bg->length);
        btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
        btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
        btrfs_set_stack_chunk_type(chunk, map->type);
        btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
        btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN);
        btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN);
        btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
        btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.type = BTRFS_CHUNK_ITEM_KEY;
        key.offset = bg->start;

        ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
        if (ret)
                goto out;

        set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);

        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
                ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
                if (ret)
                        goto out;
        }

out:
        kfree(chunk);
        btrfs_free_chunk_map(map);
        return ret;
}

/*
 * Create the first metadata chunk and the first system chunk for a freshly
 * added sprout device.
 *
 * Returns 0 on success or the error of the chunk creation that failed.
 */
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group *bg;

        /*
         * A seed device is read-only, so when a new device is added for
         * sprouting a metadata chunk and a system chunk have to be allocated
         * first. Before the block group items can go into the extent, device
         * and chunk btrees, two things must happen:
         *
         * 1) Both chunks are created without touching the btrees at all,
         *    because the block groups of the seed device are read-only and
         *    any btree change would end in -ENOSPC;
         *
         * 2) The device item of the new sprout device is added. Finishing the
         *    setup of a new block group updates the device item in the chunk
         *    btree, so that item has to exist by then. Thanks to the previous
         *    step this cannot fail with -ENOSPC.
         *
         * Only afterwards are the block group items added to their btrees:
         * the existing device item in the chunk btree is updated, a block
         * group item is added to the extent btree, a chunk item is added to
         * the chunk btree and finally the new device extent items are added
         * to the devices btree.
         */

        /* Metadata chunk first ... */
        bg = btrfs_create_chunk(trans, btrfs_metadata_alloc_profile(fs_info));
        if (IS_ERR(bg))
                return PTR_ERR(bg);

        /* ... then the system chunk. */
        bg = btrfs_create_chunk(trans, btrfs_system_alloc_profile(fs_info));
        if (IS_ERR(bg))
                return PTR_ERR(bg);

        return 0;
}

/* Number of device failures tolerated by the RAID profile of @map. */
static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map)
{
        return btrfs_raid_array[btrfs_bg_flags_to_raid_index(map->type)].tolerated_failures;
}

/*
 * Check whether the chunk containing @chunk_offset can be written to.
 *
 * Returns false if no chunk map is found, if any non-missing stripe device is
 * not writeable, or if more stripe devices are missing than the chunk's
 * profile tolerates. Returns true otherwise.
 */
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
        struct btrfs_chunk_map *map;
        int nr_missing = 0;
        bool writeable = true;

        map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
        if (IS_ERR(map))
                return false;

        for (int i = 0; i < map->num_stripes; i++) {
                struct btrfs_device *dev = map->stripes[i].dev;

                if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
                        /* Missing devices are only counted, not checked further. */
                        nr_missing++;
                } else if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
                        writeable = false;
                        break;
                }
        }

        /*
         * With more missing devices than the profile tolerates, data can not
         * be written to this chunk successfully.
         */
        if (writeable && nr_missing > btrfs_chunk_max_errors(map))
                writeable = false;

        btrfs_free_chunk_map(map);
        return writeable;
}

/*
 * Tear down the whole chunk mapping tree.
 *
 * With the mapping tree write lock held, repeatedly takes the first map of
 * the tree, unlinks it, clears the CHUNK_ALLOCATED bits of its stripes on
 * their devices and drops the reference held on behalf of the tree, until the
 * tree is empty.
 */
void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
{
        write_lock(&fs_info->mapping_tree_lock);
        while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) {
                struct btrfs_chunk_map *map;
                struct rb_node *node;

                node = rb_first_cached(&fs_info->mapping_tree);
                map = rb_entry(node, struct btrfs_chunk_map, rb_node);
                rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
                RB_CLEAR_NODE(&map->rb_node);
                chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
                /* Once for the tree ref. */
                btrfs_free_chunk_map(map);
                /*
                 * NOTE(review): presumably a rescheduling point that may drop
                 * and retake the write lock between iterations - confirm. The
                 * loop re-reads the tree root on every pass either way.
                 */
                cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
        }
        write_unlock(&fs_info->mapping_tree_lock);
}

/*
 * Return the number of copies (mirrors) that can be tried when reading the
 * range [@logical, @logical + @len).
 *
 * Returns 1 when no chunk map can be found for the range.
 */
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
        struct btrfs_chunk_map *map;
        enum btrfs_raid_types index;
        int copies = 1;

        map = btrfs_get_chunk_map(fs_info, logical, len);
        if (IS_ERR(map)) {
                /*
                 * Returning an error here would be possible, but it would get
                 * ugly and callers would most likely just stop and exit
                 * anyway. Report a single copy so that they do not try to
                 * use any other one.
                 */
                return 1;
        }

        index = btrfs_bg_flags_to_raid_index(map->type);

        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                if (map->type & BTRFS_BLOCK_GROUP_RAID5) {
                        copies = 2;
                } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
                        /*
                         * Two data stripes may be corrupted, so rebuilding
                         * the correct data needs a retry loop.
                         *
                         * Each retry fails one stripe at a time, except for
                         * the stripe under reconstruction.
                         */
                        copies = map->num_stripes;
                }
        } else {
                /* Non-RAID56 profiles take ncopies from btrfs_raid_array. */
                copies = btrfs_raid_array[index].ncopies;
        }

        btrfs_free_chunk_map(map);
        return copies;
}

/*
 * Return the full stripe length of the chunk containing @logical.
 *
 * For a RAID56 chunk this is the length covered by all of its data stripes.
 * In every other case (no RAID56 incompat flag, chunk map lookup failure,
 * non-RAID56 chunk) the sector size is returned.
 */
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
                                    u64 logical)
{
        struct btrfs_chunk_map *map;
        unsigned long len = fs_info->sectorsize;

        if (!btrfs_fs_incompat(fs_info, RAID56))
                return len;

        map = btrfs_get_chunk_map(fs_info, logical, len);
        if (WARN_ON(IS_ERR(map)))
                return len;

        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));

        btrfs_free_chunk_map(map);
        return len;
}

/*
 * Return 1 if the chunk containing the given logical range uses a RAID56
 * profile, 0 otherwise (including when the filesystem has no RAID56 incompat
 * flag or the chunk map lookup fails).
 */
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
        struct btrfs_chunk_map *map;
        int is_raid56;

        if (!btrfs_fs_incompat(fs_info, RAID56))
                return 0;

        map = btrfs_get_chunk_map(fs_info, logical, len);
        if (WARN_ON(IS_ERR(map)))
                return 0;

        is_raid56 = (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ? 1 : 0;

        btrfs_free_chunk_map(map);
        return is_raid56;
}

static int find_live_mirror(struct btrfs_fs_info *fs_info,
                            struct btrfs_chunk_map *map, int first,
                            int dev_replace_is_ongoing)
{
        const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy);
        int i;
        int num_stripes;
        int preferred_mirror;
        int tolerance;
        struct btrfs_device *srcdev;

        ASSERT((map->type &
                 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));

        if (map->type & BTRFS_BLOCK_GROUP_RAID10)
                num_stripes = map->sub_stripes;
        else
                num_stripes = map->num_stripes;

        switch (policy) {
        default:
                /* Shouldn't happen, just warn and use pid instead of failing */
                btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid",
                              policy);
                WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID);
                fallthrough;
        case BTRFS_READ_POLICY_PID:
                preferred_mirror = first + (current->pid % num_stripes);
                break;
        }

        if (dev_replace_is_ongoing &&
            fs_info->dev_replace.cont_reading_from_srcdev_mode ==
             BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
                srcdev = fs_info->dev_replace.srcdev;
        else
                srcdev = NULL;

        /*
         * try to avoid the drive that is the source drive for a
         * dev-replace procedure, only choose it if no other non-missing
         * mirror is available
         */
        for (tolerance = 0; tolerance < 2; tolerance++) {
                if (map->stripes[preferred_mirror].dev->bdev &&
                    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
                        return preferred_mirror;
                for (i = first; i < first + num_stripes; i++) {
                        if (map->stripes[i].dev->bdev &&
                            (tolerance || map->stripes[i].dev != srcdev))
                                return i;
                }
        }

        /* we couldn't find one that doesn't fail.  Just return something
         * and the io error handling code will clean up eventually
         */
        return preferred_mirror;
}

/*
 * Allocate a zeroed io context with room for @total_stripes stripes.
 *
 * The context starts with a single reference, no replace source stripe (-1),
 * an unset full stripe logical address ((u64)-1) and the given @logical.
 *
 * Returns NULL if the allocation fails.
 */
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
                                                       u64 logical,
                                                       u16 total_stripes)
{
        /* Fixed part of the context plus the trailing stripe array. */
        const size_t alloc_size = sizeof(struct btrfs_io_context) +
                                  sizeof(struct btrfs_io_stripe) * (total_stripes);
        struct btrfs_io_context *bioc;

        bioc = kzalloc(alloc_size, GFP_NOFS);
        if (!bioc)
                return NULL;

        refcount_set(&bioc->refs, 1);

        bioc->fs_info = fs_info;
        bioc->logical = logical;
        bioc->replace_stripe_src = -1;
        bioc->full_stripe_logical = (u64)-1;

        return bioc;
}

/* Take an additional reference on @bioc; warns if its count is already zero. */
void btrfs_get_bioc(struct btrfs_io_context *bioc)
{
        WARN_ON(refcount_read(&bioc->refs) == 0);
        refcount_inc(&bioc->refs);
}

/*
 * Drop a reference on @bioc and free it once the last reference is gone.
 * A NULL @bioc is accepted and ignored.
 */
void btrfs_put_bioc(struct btrfs_io_context *bioc)
{
        if (bioc && refcount_dec_and_test(&bioc->refs))
                kfree(bioc);
}

/*
 * Map a logical range to the per-device physical ranges to discard.
 *
 * @fs_info:     filesystem the range belongs to
 * @logical:     logical start address of the range
 * @length_ret:  in: requested length; out: length actually covered, clamped
 *               to the end of the chunk that contains @logical
 * @num_stripes: out: number of entries in the returned array
 *
 * Returns an array of *@num_stripes discard stripes allocated with kcalloc()
 * (presumably to be freed by the caller - confirm), or an ERR_PTR(): the chunk
 * map lookup error, -EOPNOTSUPP for RAID56 chunks, or -ENOMEM.
 *
 * Please note that, discard won't be sent to target device of device
 * replace.
 */
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
                                               u64 logical, u64 *length_ret,
                                               u32 *num_stripes)
{
        struct btrfs_chunk_map *map;
        struct btrfs_discard_stripe *stripes;
        u64 length = *length_ret;
        u64 offset;
        u32 stripe_nr;
        u32 stripe_nr_end;
        u32 stripe_cnt;
        u64 stripe_end_offset;
        u64 stripe_offset;
        u32 stripe_index;
        u32 factor = 0;
        u32 sub_stripes = 0;
        u32 stripes_per_dev = 0;
        u32 remaining_stripes = 0;
        u32 last_stripe = 0;
        int ret;
        int i;

        map = btrfs_get_chunk_map(fs_info, logical, length);
        if (IS_ERR(map))
                return ERR_CAST(map);

        /* we don't discard raid56 yet */
        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                ret = -EOPNOTSUPP;
                goto out_free_map;
        }

        /* Work with chunk relative offsets and never go past the chunk end. */
        offset = logical - map->start;
        length = min_t(u64, map->start + map->chunk_len - logical, length);
        *length_ret = length;

        /*
         * stripe_nr counts the total number of stripes we have to stride
         * to get to this block
         */
        stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;

        /* stripe_offset is the offset of this block in its stripe */
        stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr);

        /*
         * stripe_nr_end is the stripe number just past the end of the range,
         * stripe_cnt the number of stripes the range touches and
         * stripe_end_offset the unused tail of the last touched stripe.
         */
        stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
                        BTRFS_STRIPE_LEN_SHIFT;
        stripe_cnt = stripe_nr_end - stripe_nr;
        stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) -
                            (offset + length);
        /*
         * after this, stripe_nr is the number of stripes on this
         * device we have to walk to find the data, and stripe_index is
         * the number of our device in the stripe array
         */
        *num_stripes = 1;
        stripe_index = 0;
        if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                         BTRFS_BLOCK_GROUP_RAID10)) {
                if (map->type & BTRFS_BLOCK_GROUP_RAID0)
                        sub_stripes = 1;
                else
                        sub_stripes = map->sub_stripes;

                /* factor is the number of stripe groups of sub_stripes each. */
                factor = map->num_stripes / sub_stripes;
                *num_stripes = min_t(u64, map->num_stripes,
                                    sub_stripes * stripe_cnt);
                stripe_index = stripe_nr % factor;
                stripe_nr /= factor;
                stripe_index *= sub_stripes;

                /*
                 * Every group gets stripes_per_dev full stripes; the first
                 * remaining_stripes groups visited get one more.
                 */
                remaining_stripes = stripe_cnt % factor;
                stripes_per_dev = stripe_cnt / factor;
                last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes;
        } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
                                BTRFS_BLOCK_GROUP_DUP)) {
                /* Mirrored profiles: every stripe receives the whole range. */
                *num_stripes = map->num_stripes;
        } else {
                stripe_index = stripe_nr % map->num_stripes;
                stripe_nr /= map->num_stripes;
        }

        stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
        if (!stripes) {
                ret = -ENOMEM;
                goto out_free_map;
        }

        for (i = 0; i < *num_stripes; i++) {
                stripes[i].physical =
                        map->stripes[stripe_index].physical +
                        stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
                stripes[i].dev = map->stripes[stripe_index].dev;

                if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                                 BTRFS_BLOCK_GROUP_RAID10)) {
                        stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev);

                        if (i / sub_stripes < remaining_stripes)
                                stripes[i].length += BTRFS_STRIPE_LEN;

                        /*
                         * Special for the first stripe and
                         * the last stripe:
                         *
                         * |-------|...|-------|
                         *     |----------|
                         *    off     end_off
                         */
                        if (i < sub_stripes)
                                stripes[i].length -= stripe_offset;

                        if (stripe_index >= last_stripe &&
                            stripe_index <= (last_stripe +
                                             sub_stripes - 1))
                                stripes[i].length -= stripe_end_offset;

                        /* Only the first stripe group starts inside a stripe. */
                        if (i == sub_stripes - 1)
                                stripe_offset = 0;
                } else {
                        stripes[i].length = length;
                }

                /* Move to the next stripe, wrapping around to the next row. */
                stripe_index++;
                if (stripe_index == map->num_stripes) {
                        stripe_index = 0;
                        stripe_nr++;
                }
        }

        btrfs_free_chunk_map(map);
        return stripes;
out_free_map:
        btrfs_free_chunk_map(map);
        return ERR_PTR(ret);
}

/*
 * Tell whether the block group containing @logical has the "to_copy" runtime
 * flag set.
 *
 * Returns false on non-zoned filesystems, which do not use the flag, and
 * also when no block group is found for @logical.
 */
static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
{
        struct btrfs_block_group *cache;
        bool ret;

        /* Non zoned filesystem does not use "to_copy" flag */
        if (!btrfs_is_zoned(fs_info))
                return false;

        cache = btrfs_lookup_block_group(fs_info, logical);
        /*
         * Do not dereference a failed lookup. Without a block group there is
         * no "to_copy" flag to honor, so report it as not set.
         */
        if (!cache)
                return false;

        ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);

        /* Drop the reference taken by the lookup. */
        btrfs_put_block_group(cache);
        return ret;
}

/*
 * Extend @bioc with extra stripes that point at the dev-replace target device.
 *
 * @op:              the mapping operation being performed
 * @bioc:            io context whose first *@num_stripes_ret stripes are
 *                   already set up; extra stripes are appended after them
 * @dev_replace:     the running dev-replace state (source and target device)
 * @logical:         logical address being mapped
 * @num_stripes_ret: in: number of real stripes; out: increased by the number
 *                   of extra stripes
 * @max_errors_ret:  in/out: increased by the number of extra stripes
 *
 * For every stripe located on the replace source device a copy with the same
 * physical offset on the target device is appended. Nothing is changed when
 * the block group containing @logical has the "to_copy" flag set.
 */
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
                                      struct btrfs_io_context *bioc,
                                      struct btrfs_dev_replace *dev_replace,
                                      u64 logical,
                                      int *num_stripes_ret, int *max_errors_ret)
{
        u64 srcdev_devid = dev_replace->srcdev->devid;
        /*
         * At this stage, num_stripes is still the real number of stripes,
         * excluding the duplicated stripes.
         */
        int num_stripes = *num_stripes_ret;
        int nr_extra_stripes = 0;
        int max_errors = *max_errors_ret;
        int i;

        /*
         * A block group which has "to_copy" set will eventually be copied by
         * the dev-replace process. We can avoid cloning IO here.
         */
        if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
                return;

        /*
         * Duplicate the write operations while the dev-replace procedure is
         * running. Since the copying of the old disk to the new disk takes
         * place at run time while the filesystem is mounted writable, the
         * regular write operations to the old disk have to be duplicated to go
         * to the new disk as well.
         *
         * Note that device->missing is handled by the caller, and that the
         * write to the old disk is already set up in the stripes array.
         */
        for (i = 0; i < num_stripes; i++) {
                struct btrfs_io_stripe *old = &bioc->stripes[i];
                /* Next free slot after the real stripes and earlier extras. */
                struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];

                if (old->dev->devid != srcdev_devid)
                        continue;

                new->physical = old->physical;
                new->dev = dev_replace->tgtdev;
                /* For RAID56 remember which stripe is being replaced. */
                if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                        bioc->replace_stripe_src = i;
                nr_extra_stripes++;
        }

        /* We can only have at most 2 extra nr_stripes (for DUP). */
        ASSERT(nr_extra_stripes <= 2);
        /*
         * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
         * replace.
         * If we have 2 extra stripes, only choose the one with smaller physical.
         */
        if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
                struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
                struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];

                /* Only DUP can have two extra stripes. */
                ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);

                /*
                 * Swap the last stripe stripes and reduce @nr_extra_stripes.
                 * The extra stripe would still be there, but won't be accessed.
                 *
                 * NOTE(review): nr_extra_stripes is only decremented when the
                 * two extra stripes had to be swapped. If first->physical <=
                 * second->physical both extra stripes stay counted, which
                 * looks at odds with the "at most 1 extra stripe" statement
                 * above - confirm whether the decrement should be
                 * unconditional.
                 */
                if (first->physical > second->physical) {
                        swap(second->physical, first->physical);
                        swap(second->dev, first->dev);
                        nr_extra_stripes--;
                }
        }

        *num_stripes_ret = num_stripes + nr_extra_stripes;
        *max_errors_ret = max_errors + nr_extra_stripes;
        bioc->replace_nr_stripes = nr_extra_stripes;
}

/*
 * Compute the stripe geometry of @offset (relative to the chunk start) and
 * return the maximum I/O length allowed from that position.
 *
 * Fills io_geom->stripe_nr and io_geom->stripe_offset, and for RAID56 chunks
 * also io_geom->raid56_full_stripe_start.
 *
 * Return: bytes up to the end of the full stripe for RAID56 writes, bytes up
 * to the end of the current stripe for other striped mappings (including
 * RAID56 reads), U64_MAX when the profile imposes no stripe limit.
 */
static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
                            struct btrfs_io_geometry *io_geom)
{
        /*
         * stripe_nr is the stripe this block falls into, stripe_offset is the
         * position of the block inside that stripe.
         */
        io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
        io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
        ASSERT(io_geom->stripe_offset < U32_MAX);

        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                unsigned long full_stripe_len;
                u64 full_stripe_start;

                full_stripe_len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));

                /*
                 * The full stripe start is derived from the already computed
                 * @stripe_nr: align it down to nr_data_stripes and convert
                 * back to a byte offset.  This avoids any u64 division.
                 *
                 * rounddown() is required rather than round_down() because
                 * nr_data_stripes is not guaranteed to be a power of 2.
                 */
                full_stripe_start = btrfs_stripe_nr_to_offset(
                        rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
                io_geom->raid56_full_stripe_start = full_stripe_start;

                ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset);
                ASSERT(io_geom->raid56_full_stripe_start <= offset);

                /*
                 * RAID56 writes may cover a whole stripe set, but must not
                 * straddle two of them.
                 */
                if (io_geom->op == BTRFS_MAP_WRITE)
                        return full_stripe_len - (offset - io_geom->raid56_full_stripe_start);
        }

        /* Profiles without striping have no per-stripe limit. */
        if (!(map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK))
                return U64_MAX;

        /*
         * Every other striped case, RAID56 reads included, is limited to a
         * single stripe on a single disk.
         */
        return BTRFS_STRIPE_LEN - io_geom->stripe_offset;
}

/*
 * Fill @dst with the device and physical address of the stripe selected by
 * io_geom->stripe_index.
 *
 * For reads on block groups that need the stripe tree, the physical address
 * is resolved through btrfs_get_raid_extent_offset() and its return value is
 * passed on.  Otherwise the address is computed from the chunk map and 0 is
 * returned.
 */
static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
                         u64 *length, struct btrfs_io_stripe *dst,
                         struct btrfs_chunk_map *map,
                         struct btrfs_io_geometry *io_geom)
{
        const struct btrfs_io_stripe *src = &map->stripes[io_geom->stripe_index];

        dst->dev = src->dev;

        if (io_geom->op != BTRFS_MAP_READ ||
            !btrfs_need_stripe_tree_update(fs_info, map->type)) {
                /* Stripe base + offset inside the stripe + whole stripes walked. */
                dst->physical = src->physical + io_geom->stripe_offset +
                                btrfs_stripe_nr_to_offset(io_geom->stripe_nr);
                return 0;
        }

        return btrfs_get_raid_extent_offset(fs_info, logical, length, map->type,
                                            io_geom->stripe_index, dst);
}

/*
 * Tell whether the mapping can be returned through @smap alone, without
 * allocating a btrfs_io_context.
 *
 * That requires a caller supplied @smap, exactly one stripe to allocate, no
 * stripe tree update for a non-read operation, and no RAID56 access with
 * @mirror_num above 1.
 */
static bool is_single_device_io(struct btrfs_fs_info *fs_info,
                                const struct btrfs_io_stripe *smap,
                                const struct btrfs_chunk_map *map,
                                int num_alloc_stripes,
                                enum btrfs_map_op op, int mirror_num)
{
        if (!smap || num_alloc_stripes != 1)
                return false;

        if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ)
                return false;

        return !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1);
}

/*
 * RAID0: split the stripe number into the index of the stripe inside the
 * chunk map and the stripe number on that device.  Reads always report
 * mirror number 1.
 */
static void map_blocks_raid0(const struct btrfs_chunk_map *map,
                             struct btrfs_io_geometry *io_geom)
{
        if (io_geom->op == BTRFS_MAP_READ)
                io_geom->mirror_num = 1;

        io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
        io_geom->stripe_nr = io_geom->stripe_nr / map->num_stripes;
}

/*
 * RAID1/RAID1C3/RAID1C4: non-read operations address every stripe of the
 * chunk.  Reads use the requested mirror, or pick a live one through
 * find_live_mirror() when no mirror number was given and report it back in
 * io_geom->mirror_num.
 */
static void map_blocks_raid1(struct btrfs_fs_info *fs_info,
                             struct btrfs_chunk_map *map,
                             struct btrfs_io_geometry *io_geom,
                             bool dev_replace_is_ongoing)
{
        if (io_geom->op != BTRFS_MAP_READ) {
                io_geom->num_stripes = map->num_stripes;
        } else if (io_geom->mirror_num != 0) {
                /* Mirror numbers are 1 based, stripe indexes 0 based. */
                io_geom->stripe_index = io_geom->mirror_num - 1;
        } else {
                io_geom->stripe_index = find_live_mirror(fs_info, map, 0,
                                                         dev_replace_is_ongoing);
                io_geom->mirror_num = io_geom->stripe_index + 1;
        }
}

/*
 * DUP: non-read operations address every stripe of the chunk.  Reads use the
 * requested mirror; without one, the stripe index is left untouched and
 * mirror number 1 is reported.
 */
static void map_blocks_dup(const struct btrfs_chunk_map *map,
                           struct btrfs_io_geometry *io_geom)
{
        if (io_geom->op != BTRFS_MAP_READ) {
                io_geom->num_stripes = map->num_stripes;
        } else if (io_geom->mirror_num != 0) {
                /* Mirror numbers are 1 based, stripe indexes 0 based. */
                io_geom->stripe_index = io_geom->mirror_num - 1;
        } else {
                io_geom->mirror_num = 1;
        }
}

/*
 * RAID10: locate the group of sub stripes holding the block, then either
 * address the whole group (non-read operations), the requested mirror inside
 * the group, or a live mirror chosen by find_live_mirror().
 */
static void map_blocks_raid10(struct btrfs_fs_info *fs_info,
                              struct btrfs_chunk_map *map,
                              struct btrfs_io_geometry *io_geom,
                              bool dev_replace_is_ongoing)
{
        /* Number of sub stripe groups in the chunk. */
        u32 nr_groups = map->num_stripes / map->sub_stripes;
        int group_start;

        /* Index of the first stripe of the group, then stripe nr inside it. */
        io_geom->stripe_index = (io_geom->stripe_nr % nr_groups) * map->sub_stripes;
        io_geom->stripe_nr = io_geom->stripe_nr / nr_groups;

        if (io_geom->op != BTRFS_MAP_READ) {
                io_geom->num_stripes = map->sub_stripes;
        } else if (io_geom->mirror_num != 0) {
                io_geom->stripe_index += io_geom->mirror_num - 1;
        } else {
                group_start = io_geom->stripe_index;
                io_geom->stripe_index = find_live_mirror(fs_info, map,
                                                         io_geom->stripe_index,
                                                         dev_replace_is_ongoing);
                /* Report the mirror number relative to the group start. */
                io_geom->mirror_num = io_geom->stripe_index - group_start + 1;
        }
}

/*
 * RAID56 write or recovery: map the whole full stripe.
 *
 * All stripes of the chunk are returned, and *@length is trimmed so that the
 * range does not extend past the end of the full stripe.
 */
static void map_blocks_raid56_write(struct btrfs_chunk_map *map,
                                    struct btrfs_io_geometry *io_geom,
                                    u64 logical, u64 *length)
{
        int data_stripes = nr_data_stripes(map);
        u64 full_stripe_end;
        u64 range_end;

        /*
         * A full stripe mapping is needed, so turn @stripe_nr into the full
         * stripe number.
         *
         * This used to be raid56_full_stripe_start / full_stripe_len, which
         * can be expensive; dividing @stripe_nr by @data_stripes gives the
         * same number.
         */
        io_geom->stripe_nr = io_geom->stripe_nr / data_stripes;

        /* Return all stripes. */
        io_geom->num_stripes = map->num_stripes;
        io_geom->max_errors = btrfs_chunk_max_errors(map);

        /* Clamp the length at the end of the full stripe. */
        range_end = logical + *length;
        full_stripe_end = io_geom->raid56_full_stripe_start + map->start +
                          btrfs_stripe_nr_to_offset(data_stripes);
        *length = min(range_end, full_stripe_end) - logical;

        io_geom->stripe_index = 0;
        io_geom->stripe_offset = 0;
}

/*
 * RAID56 read without recovery (mirror_num <= 1): address the data stripe
 * directly, accounting for the rotation of the stripes.
 */
static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
                                   struct btrfs_io_geometry *io_geom)
{
        const int nr_data = nr_data_stripes(map);

        ASSERT(io_geom->mirror_num <= 1);

        /* Position among the data stripes, then the full stripe number. */
        io_geom->stripe_index = io_geom->stripe_nr % nr_data;
        io_geom->stripe_nr = io_geom->stripe_nr / nr_data;

        /* Parity blocks are distributed across stripes, so rotate the index. */
        io_geom->stripe_index =
                (io_geom->stripe_index + io_geom->stripe_nr) % map->num_stripes;

        if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1)
                io_geom->mirror_num = 1;
}

/*
 * Fallback for profiles not handled elsewhere: derive the stripe index and
 * the stripe number on that device from @stripe_nr, and report the mirror
 * number as stripe index + 1.
 */
static void map_blocks_single(const struct btrfs_chunk_map *map,
                              struct btrfs_io_geometry *io_geom)
{
        io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
        io_geom->mirror_num = io_geom->stripe_index + 1;
        io_geom->stripe_nr = io_geom->stripe_nr / map->num_stripes;
}

/*
 * Map one logical range to one or more physical ranges.
 *
 * @fs_info:        the filesystem the range belongs to.
 *
 * @op:             the kind of mapping requested (read, write, ...).
 *
 * @logical:        logical start address of the range.
 *
 * @length:         (Mandatory) mapped length of this run.
 *                  One logical range can be split into different segments
 *                  due to factors like zones and RAID0/5/6/10 stripe
 *                  boundaries.  On input it is the requested length, it is
 *                  updated to the length that was actually mapped.
 *
 * @bioc_ret:       (Mandatory) returned btrfs_io_context structure, which
 *                  has one or more physical ranges (btrfs_io_stripe)
 *                  recorded inside.
 *                  Caller should call btrfs_put_bioc() to free it after use.
 *
 * @smap:           (Optional) single physical range optimization.
 *                  If the map request can be fulfilled by one single
 *                  physical range, and this parameter is not NULL,
 *                  then @bioc_ret would be NULL, and @smap would be
 *                  updated.
 *
 * @mirror_num_ret: (Optional) mirror number to use; a NULL pointer is
 *                  treated like mirror number 0.  When the request is
 *                  fulfilled through @smap, the mirror number that was
 *                  used is stored back.
 *
 *                  Mirror number 0 means to choose any live mirrors.
 *
 *                  For non-RAID56 profiles, non-zero mirror_num means
 *                  the Nth mirror. (e.g. mirror_num 1 means the first
 *                  copy).
 *
 *                  For RAID56 profiles, mirror 1 means read the data stripe
 *                  directly.  Mirror 2 means rebuild from P and the
 *                  remaining data stripes; for any mirror > 1 all stripes of
 *                  the full stripe are returned.
 *
 *                  For RAID6 profile, mirror > 2 means mark another
 *                  data/P stripe error and rebuild from the remaining
 *                  stripes.
 *
 * Return: 0 on success, -EINVAL if the mirror number exceeds the number of
 * copies or the computed stripe index is out of range, -ENOMEM if the
 * btrfs_io_context cannot be allocated, or the error returned by the chunk
 * map lookup or by set_io_stripe().
 */
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                    u64 logical, u64 *length,
                    struct btrfs_io_context **bioc_ret,
                    struct btrfs_io_stripe *smap, int *mirror_num_ret)
{
        struct btrfs_chunk_map *map;
        struct btrfs_io_geometry io_geom = { 0 };
        u64 map_offset;
        int ret = 0;
        int num_copies;
        struct btrfs_io_context *bioc = NULL;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        int dev_replace_is_ongoing = 0;
        u16 num_alloc_stripes;
        u64 max_len;

        ASSERT(bioc_ret);

        io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
        io_geom.num_stripes = 1;
        io_geom.stripe_index = 0;
        io_geom.op = op;

        num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
        if (io_geom.mirror_num > num_copies)
                return -EINVAL;

        map = btrfs_get_chunk_map(fs_info, logical, *length);
        if (IS_ERR(map))
                return PTR_ERR(map);

        /* Clamp the length to the chunk end and to the per-profile limit. */
        map_offset = logical - map->start;
        io_geom.raid56_full_stripe_start = (u64)-1;
        max_len = btrfs_max_io_len(map, map_offset, &io_geom);
        *length = min_t(u64, map->chunk_len - map_offset, max_len);

        down_read(&dev_replace->rwsem);
        dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
        /*
         * Hold the semaphore for read during the whole operation, write is
         * requested at commit time but must wait.
         */
        if (!dev_replace_is_ongoing)
                up_read(&dev_replace->rwsem);

        switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
        case BTRFS_BLOCK_GROUP_RAID0:
                map_blocks_raid0(map, &io_geom);
                break;
        case BTRFS_BLOCK_GROUP_RAID1:
        case BTRFS_BLOCK_GROUP_RAID1C3:
        case BTRFS_BLOCK_GROUP_RAID1C4:
                map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing);
                break;
        case BTRFS_BLOCK_GROUP_DUP:
                map_blocks_dup(map, &io_geom);
                break;
        case BTRFS_BLOCK_GROUP_RAID10:
                map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing);
                break;
        case BTRFS_BLOCK_GROUP_RAID5:
        case BTRFS_BLOCK_GROUP_RAID6:
                /* Writes and recovery reads (mirror > 1) need the full stripe. */
                if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)
                        map_blocks_raid56_write(map, &io_geom, logical, length);
                else
                        map_blocks_raid56_read(map, &io_geom);
                break;
        default:
                /*
                 * After this, stripe_nr is the number of stripes on this
                 * device we have to walk to find the data, and stripe_index is
                 * the number of our device in the stripe array
                 */
                map_blocks_single(map, &io_geom);
                break;
        }
        if (io_geom.stripe_index >= map->num_stripes) {
                btrfs_crit(fs_info,
                           "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
                           io_geom.stripe_index, map->num_stripes);
                ret = -EINVAL;
                goto out;
        }

        num_alloc_stripes = io_geom.num_stripes;
        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
            op != BTRFS_MAP_READ)
                /*
                 * For replace case, we need to add extra stripes for extra
                 * duplicated stripes.
                 *
                 * For both WRITE and GET_READ_MIRRORS, we may have at most
                 * 2 more stripes (DUP types, otherwise 1).
                 */
                num_alloc_stripes += 2;

        /*
         * If this I/O maps to a single device, try to return the device and
         * physical block information on the stack instead of allocating an
         * I/O context structure.
         */
        if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
                                io_geom.mirror_num)) {
                ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
                if (mirror_num_ret)
                        *mirror_num_ret = io_geom.mirror_num;
                *bioc_ret = NULL;
                goto out;
        }

        bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
        if (!bioc) {
                ret = -ENOMEM;
                goto out;
        }
        bioc->map_type = map->type;

        /*
         * For RAID56 full map, we need to make sure the stripes[] follows the
         * rule that data stripes are all ordered, then followed with P and Q
         * (if we have).
         *
         * It's still mostly the same as other profiles, just with extra rotation.
         */
        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
            (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) {
                /*
                 * For RAID56 @stripe_nr is already the number of full stripes
                 * before us, which is also the rotation value (needs to modulo
                 * with num_stripes).
                 *
                 * In this case, we just add @stripe_nr with @i, then do the
                 * modulo, to reduce one modulo call.
                 */
                bioc->full_stripe_logical = map->start +
                        btrfs_stripe_nr_to_offset(io_geom.stripe_nr *
                                                  nr_data_stripes(map));
                for (int i = 0; i < io_geom.num_stripes; i++) {
                        struct btrfs_io_stripe *dst = &bioc->stripes[i];
                        u32 stripe_index;

                        stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes;
                        dst->dev = map->stripes[stripe_index].dev;
                        dst->physical =
                                map->stripes[stripe_index].physical +
                                io_geom.stripe_offset +
                                btrfs_stripe_nr_to_offset(io_geom.stripe_nr);
                }
        } else {
                /*
                 * For all other non-RAID56 profiles, just copy the target
                 * stripe into the bioc.
                 */
                for (int i = 0; i < io_geom.num_stripes; i++) {
                        ret = set_io_stripe(fs_info, logical, length,
                                            &bioc->stripes[i], map, &io_geom);
                        if (ret < 0)
                                break;
                        io_geom.stripe_index++;
                }
        }

        if (ret) {
                *bioc_ret = NULL;
                btrfs_put_bioc(bioc);
                goto out;
        }

        if (op != BTRFS_MAP_READ)
                io_geom.max_errors = btrfs_chunk_max_errors(map);

        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
            op != BTRFS_MAP_READ) {
                handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
                                          &io_geom.num_stripes, &io_geom.max_errors);
        }

        *bioc_ret = bioc;
        bioc->num_stripes = io_geom.num_stripes;
        bioc->max_errors = io_geom.max_errors;
        bioc->mirror_num = io_geom.mirror_num;

out:
        if (dev_replace_is_ongoing) {
                lockdep_assert_held(&dev_replace->rwsem);
                /* Unlock and let waiting writers proceed */
                up_read(&dev_replace->rwsem);
        }
        btrfs_free_chunk_map(map);
        return ret;
}

/*
 * Check whether @fs_devices is a candidate for the lookup described by @args:
 * either no fsid was requested, or it equals the metadata_uuid of
 * @fs_devices.
 */
static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
                                      const struct btrfs_fs_devices *fs_devices)
{
        return !args->fsid ||
               memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0;
}

/*
 * Check whether @device satisfies the lookup described by @args.
 *
 * A lookup for a missing device matches any device that is recorded in the
 * filesystem metadata but has no block device attached.  Otherwise the devid
 * must match, and so must the uuid when one was given.
 */
static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
                                  const struct btrfs_device *device)
{
        if (args->missing)
                return test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
                       !device->bdev;

        if (device->devid != args->devid)
                return false;

        return !args->uuid ||
               memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) == 0;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
                                       const struct btrfs_dev_lookup_args *args)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *seed_devs;

        if (dev_args_match_fs_devices(args, fs_devices)) {
                list_for_each_entry(device, &fs_devices->devices, dev_list) {
                        if (dev_args_match_device(args, device))
                                return device;
                }
        }

        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
                if (!dev_args_match_fs_devices(args, seed_devs))
                        continue;
                list_for_each_entry(device, &seed_devs->devices, dev_list) {
                        if (dev_args_match_device(args, device))
                                return device;
                }
        }

        return NULL;
}

/*
 * Allocate a placeholder device for @devid / @dev_uuid, link it into
 * @fs_devices and flag it as missing.
 *
 * Return: the new device, or the ERR_PTR() from btrfs_alloc_device().
 */
static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
                                            u64 devid, u8 *dev_uuid)
{
        struct btrfs_device *device;
        unsigned int nofs_flag;

        /*
         * This runs under the chunk_mutex, so the allocation has to be NOFS.
         * btrfs_alloc_device() itself is left alone because it is used in
         * plenty of places where GFP_KERNEL is safe, hence the scoped NOFS
         * section around the call.
         */
        nofs_flag = memalloc_nofs_save();
        device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL);
        memalloc_nofs_restore(nofs_flag);

        if (!IS_ERR(device)) {
                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;

                set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                fs_devices->missing_devices++;
        }

        return device;
}

/*
 * Allocate new device struct, set up devid and UUID.
 *
 * @fs_info:        used only for generating a new devid, can be NULL if
 *                devid is provided (i.e. @devid != NULL).
 * @devid:        a pointer to devid for this device.  If NULL a new devid
 *                is generated.
 * @uuid:        a pointer to UUID for this device.  If NULL a new UUID
 *                is generated.
 * @path:        a pointer to device path if available, NULL otherwise.
 *
 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
 * on error.  Returned struct is not linked onto any lists and must be
 * destroyed with btrfs_free_device.
 */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
                                        const u64 *devid, const u8 *uuid,
                                        const char *path)
{
        struct btrfs_device *dev;
        struct rcu_string *name;
        u64 tmp;

        /* Without a devid there must be an fs_info to generate one from. */
        if (WARN_ON(!devid && !fs_info))
                return ERR_PTR(-EINVAL);

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->post_commit_list);

        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev);
        extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);

        if (!devid) {
                int ret = find_next_devid(fs_info, &tmp);

                if (ret) {
                        btrfs_free_device(dev);
                        return ERR_PTR(ret);
                }
        } else {
                tmp = *devid;
        }
        dev->devid = tmp;

        if (!uuid)
                generate_random_uuid(dev->uuid);
        else
                memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);

        if (!path)
                return dev;

        /* Keep a private copy of the device path. */
        name = rcu_string_strdup(path, GFP_KERNEL);
        if (!name) {
                btrfs_free_device(dev);
                return ERR_PTR(-ENOMEM);
        }
        rcu_assign_pointer(dev->name, name);

        return dev;
}

/*
 * Log that device @devid / @uuid is missing, through btrfs_err_rl() when
 * @error is set and through btrfs_warn_rl() otherwise.
 */
static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
                                        u64 devid, u8 *uuid, bool error)
{
        if (!error) {
                btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
                              devid, uuid);
                return;
        }

        btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
                      devid, uuid);
}

/*
 * Return the chunk length divided by the number of data stripes of @map, as
 * given by calc_data_stripes() for the chunk's type and stripe count.
 */
u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map)
{
        return div_u64(map->chunk_len,
                       calc_data_stripes(map->type, map->num_stripes));
}

#if BITS_PER_LONG == 32
/*
 * Due to the page cache limit, metadata reaching MAX_LFS_FILESIZE or beyond
 * can't be accessed on 32bit systems.
 *
 * This is the mount time check rejecting a filesystem that already has a
 * metadata chunk ending at or beyond that limit.
 *
 * Return: 0 if the chunk is fine or not metadata, -EOVERFLOW otherwise.
 */
static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
                                  u64 logical, u64 length, u64 type)
{
        if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
            logical + length >= MAX_LFS_FILESIZE) {
                btrfs_err_32bit_limit(fs_info);
                return -EOVERFLOW;
        }

        return 0;
}

/*
 * Early warning for any metadata chunk reaching
 * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
 * The metadata is still accessible at that point, but won't be any more once
 * the limit is reached.
 */
static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
                                  u64 logical, u64 length, u64 type)
{
        if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
            logical + length >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
                btrfs_warn_32bit_limit(fs_info);
}
#endif

/*
 * Deal with a chunk stripe whose device could not be found.
 *
 * Without the DEGRADED mount option this is an error (-ENOENT).  With it, a
 * placeholder device is added through add_missing_dev() and the situation is
 * only reported as a warning.
 *
 * Return: the placeholder device or an ERR_PTR().
 */
static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
                                                  u64 devid, u8 *uuid)
{
        struct btrfs_device *dev;

        if (!btrfs_test_opt(fs_info, DEGRADED)) {
                btrfs_report_missing_device(fs_info, devid, uuid, true);
                return ERR_PTR(-ENOENT);
        }

        dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
        if (IS_ERR(dev))
                btrfs_err(fs_info, "failed to init missing device %llu: %ld",
                          devid, PTR_ERR(dev));
        else
                btrfs_report_missing_device(fs_info, devid, uuid, false);

        return dev;
}

/*
 * Build a chunk map from an on-disk chunk item and add it to the filesystem.
 *
 * @key:        key of the chunk item, its offset is the logical start address.
 * @leaf:        extent buffer holding the chunk item.
 * @chunk:        the chunk item inside @leaf.
 *
 * Stripes whose device cannot be found are handed to handle_missing_device().
 *
 * Return: 0 on success or when the logical address is already mapped,
 * -ENOMEM if the chunk map cannot be allocated, otherwise the error from the
 * chunk checks, from handle_missing_device() or from btrfs_add_chunk_map().
 */
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
                          struct btrfs_chunk *chunk)
{
        BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        struct btrfs_chunk_map *map;
        u64 logical;
        u64 length;
        u64 devid;
        u64 type;
        u8 uuid[BTRFS_UUID_SIZE];
        int index;
        int num_stripes;
        int ret;
        int i;

        logical = key->offset;
        length = btrfs_chunk_length(leaf, chunk);
        type = btrfs_chunk_type(leaf, chunk);
        index = btrfs_bg_flags_to_raid_index(type);
        num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

#if BITS_PER_LONG == 32
        ret = check_32bit_meta_chunk(fs_info, logical, length, type);
        if (ret < 0)
                return ret;
        warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif

        /*
         * Only need to verify chunk item if we're reading from sys chunk array,
         * as chunk item in tree block is already verified by tree-checker.
         */
        if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
                ret = btrfs_check_chunk_valid(leaf, chunk, logical);
                if (ret)
                        return ret;
        }

        map = btrfs_find_chunk_map(fs_info, logical, 1);

        /* already mapped? */
        if (map && map->start <= logical && map->start + map->chunk_len > logical) {
                btrfs_free_chunk_map(map);
                return 0;
        } else if (map) {
                /* The lookup returned a map not covering @logical, drop it. */
                btrfs_free_chunk_map(map);
        }

        map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS);
        if (!map)
                return -ENOMEM;

        map->start = logical;
        map->chunk_len = length;
        map->num_stripes = num_stripes;
        map->io_width = btrfs_chunk_io_width(leaf, chunk);
        map->io_align = btrfs_chunk_io_align(leaf, chunk);
        map->type = type;
        /*
         * We can't use the sub_stripes value, as for profiles other than
         * RAID10, they may have 0 as sub_stripes for filesystems created by
         * older mkfs (<v5.4).
         * In that case, it can cause divide-by-zero errors later.
         * Since currently sub_stripes is fixed for each profile, let's
         * use the trusted value instead.
         */
        map->sub_stripes = btrfs_raid_array[index].sub_stripes;
        map->verified_stripes = 0;
        map->stripe_size = btrfs_calc_stripe_length(map);
        for (i = 0; i < num_stripes; i++) {
                map->stripes[i].physical =
                        btrfs_stripe_offset_nr(leaf, chunk, i);
                devid = btrfs_stripe_devid_nr(leaf, chunk, i);
                args.devid = devid;
                read_extent_buffer(leaf, uuid, (unsigned long)
                                   btrfs_stripe_dev_uuid_nr(chunk, i),
                                   BTRFS_UUID_SIZE);
                args.uuid = uuid;
                map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
                if (!map->stripes[i].dev) {
                        map->stripes[i].dev = handle_missing_device(fs_info,
                                                                    devid, uuid);
                        if (IS_ERR(map->stripes[i].dev)) {
                                ret = PTR_ERR(map->stripes[i].dev);
                                btrfs_free_chunk_map(map);
                                return ret;
                        }
                }

                set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                &(map->stripes[i].dev->dev_state));
        }

        ret = btrfs_add_chunk_map(fs_info, map);
        if (ret < 0) {
                btrfs_err(fs_info,
                          "failed to add chunk map, start=%llu len=%llu: %d",
                          map->start, map->chunk_len, ret);
                /*
                 * The map was not added, so we still hold the only reference.
                 * Drop it (after the message above, which reads from the map)
                 * to avoid leaking it.
                 */
                btrfs_free_chunk_map(map);
        }

        return ret;
}

/*
 * Populate the in-memory @device from the on-disk @dev_item stored in @leaf.
 *
 * The total and committed sizes both start out as the on-disk values, and the
 * BTRFS_DEV_STATE_REPLACE_TGT bit is cleared.
 */
static void fill_device_from_item(struct extent_buffer *leaf,
                                 struct btrfs_dev_item *dev_item,
                                 struct btrfs_device *device)
{
        device->devid = btrfs_device_id(leaf, dev_item);
        device->type = btrfs_device_type(leaf, dev_item);

        /* Size accounting. */
        device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
        device->total_bytes = device->disk_total_bytes;
        device->commit_total_bytes = device->disk_total_bytes;
        device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
        device->commit_bytes_used = device->bytes_used;

        /* I/O geometry. */
        device->io_align = btrfs_device_io_align(leaf, dev_item);
        device->io_width = btrfs_device_io_width(leaf, dev_item);
        device->sector_size = btrfs_device_sector_size(leaf, dev_item);

        /* A device item is not expected to carry the replace target devid. */
        WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

        read_extent_buffer(leaf, device->uuid, btrfs_device_uuid(dev_item),
                           BTRFS_UUID_SIZE);
}

/*
 * Get the fs_devices of the seed filesystem identified by @fsid.
 *
 * An entry already on the seed list of the mounted filesystem is returned
 * as is.  Otherwise the registered fs_devices for @fsid is cloned, opened
 * read-only and added to the seed list.  If no fs_devices is registered for
 * @fsid, an empty seeding fs_devices is returned for DEGRADED mounts.
 *
 * Must be called with uuid_mutex held.
 *
 * Return: the fs_devices, or an ERR_PTR(): -ENOENT if the seed is unknown and
 * the mount is not degraded, -EINVAL if the opened devices do not belong to a
 * seeding filesystem, or the error of the allocation, clone or open.
 */
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
                                                  u8 *fsid)
{
        struct btrfs_fs_devices *cur;
        struct btrfs_fs_devices *seed;
        int ret;

        lockdep_assert_held(&uuid_mutex);
        ASSERT(fsid);

        /* This will match only for multi-device seed fs */
        list_for_each_entry(cur, &fs_info->fs_devices->seed_list, seed_list) {
                if (memcmp(cur->fsid, fsid, BTRFS_FSID_SIZE) == 0)
                        return cur;
        }

        cur = find_fsid(fsid, NULL);
        if (!cur) {
                if (!btrfs_test_opt(fs_info, DEGRADED))
                        return ERR_PTR(-ENOENT);

                seed = alloc_fs_devices(fsid);
                if (!IS_ERR(seed)) {
                        seed->seeding = true;
                        seed->opened = 1;
                }
                return seed;
        }

        /*
         * First call for this seed fsid: create a private copy of the
         * respective fs_devices, to be anchored at
         * fs_info->fs_devices->seed_list.
         */
        seed = clone_fs_devices(cur);
        if (IS_ERR(seed))
                return seed;

        ret = open_fs_devices(seed, BLK_OPEN_READ, fs_info->bdev_holder);
        if (ret) {
                free_fs_devices(seed);
                return ERR_PTR(ret);
        }

        if (!seed->seeding) {
                close_fs_devices(seed);
                free_fs_devices(seed);
                return ERR_PTR(-EINVAL);
        }

        list_add(&seed->seed_list, &fs_info->fs_devices->seed_list);

        return seed;
}

/*
 * Process one device item found in @leaf: locate (or, for degraded mounts,
 * create) the matching in-memory btrfs_device and fill it from @dev_item.
 *
 * Returns 0 on success, or a negative errno:
 *   -ENOENT  the device is not found or has no bdev, and the DEGRADED mount
 *            option is not set
 *   -EINVAL  generation mismatch for a device that does not belong to
 *            fs_info->fs_devices, or the item's total_bytes exceeds the size
 *            of the block device
 *   or the error returned by open_seed_devices() / add_missing_dev().
 */
static int read_one_dev(struct extent_buffer *leaf,
                        struct btrfs_dev_item *dev_item)
{
        BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
        u64 devid;
        int ret;
        u8 fs_uuid[BTRFS_FSID_SIZE];
        u8 dev_uuid[BTRFS_UUID_SIZE];

        /* Build the lookup key (devid, device uuid, fsid) from the item. */
        devid = btrfs_device_id(leaf, dev_item);
        args.devid = devid;
        read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
                           BTRFS_UUID_SIZE);
        read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
                           BTRFS_FSID_SIZE);
        args.uuid = dev_uuid;
        args.fsid = fs_uuid;

        /*
         * The item's fsid differs from our metadata_uuid: the device is
         * handled through the seed fs_devices for that fsid.
         */
        if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
                fs_devices = open_seed_devices(fs_info, fs_uuid);
                if (IS_ERR(fs_devices))
                        return PTR_ERR(fs_devices);
        }

        /* The lookup always starts from the mounted fs_devices. */
        device = btrfs_find_device(fs_info->fs_devices, &args);
        if (!device) {
                if (!btrfs_test_opt(fs_info, DEGRADED)) {
                        btrfs_report_missing_device(fs_info, devid,
                                                        dev_uuid, true);
                        return -ENOENT;
                }

                /* Degraded mount: add a placeholder for the missing device. */
                device = add_missing_dev(fs_devices, devid, dev_uuid);
                if (IS_ERR(device)) {
                        btrfs_err(fs_info,
                                "failed to add missing dev %llu: %ld",
                                devid, PTR_ERR(device));
                        return PTR_ERR(device);
                }
                btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
        } else {
                /* Known device without a block device: degraded mounts only. */
                if (!device->bdev) {
                        if (!btrfs_test_opt(fs_info, DEGRADED)) {
                                btrfs_report_missing_device(fs_info,
                                                devid, dev_uuid, true);
                                return -ENOENT;
                        }
                        btrfs_report_missing_device(fs_info, devid,
                                                        dev_uuid, false);
                }

                if (!device->bdev &&
                    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                        /*
                         * This happens when a device that was properly set up
                         * in the device info lists suddenly goes bad.
                         * device->bdev is NULL, so flag the device with
                         * BTRFS_DEV_STATE_MISSING and account for it here.
                         */
                        device->fs_devices->missing_devices++;
                        set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }

                /* Move the device to its own fs_devices */
                if (device->fs_devices != fs_devices) {
                        ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
                                                        &device->dev_state));

                        /* Transfer the device and both counters together. */
                        list_move(&device->dev_list, &fs_devices->devices);
                        device->fs_devices->num_devices--;
                        fs_devices->num_devices++;

                        device->fs_devices->missing_devices--;
                        fs_devices->missing_devices++;

                        device->fs_devices = fs_devices;
                }
        }

        /*
         * A device owned by another fs_devices must not be writeable, and its
         * generation must match the one recorded in the item.
         */
        if (device->fs_devices != fs_info->fs_devices) {
                BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
                if (device->generation !=
                    btrfs_device_generation(leaf, dev_item))
                        return -EINVAL;
        }

        fill_device_from_item(leaf, dev_item, device);
        if (device->bdev) {
                u64 max_total_bytes = bdev_nr_bytes(device->bdev);

                /* The item must not claim more bytes than the bdev has. */
                if (device->total_bytes > max_total_bytes) {
                        btrfs_err(fs_info,
                        "device total_bytes should be at most %llu but found %llu",
                                  max_total_bytes, device->total_bytes);
                        return -EINVAL;
                }
        }
        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        /* Only writeable devices that are not a replace target are counted. */
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
           !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
                device->fs_devices->total_rw_bytes += device->total_bytes;
                atomic64_add(device->total_bytes - device->bytes_used,
                                &fs_info->free_chunk_space);
        }
        ret = 0;
        return ret;
}

/*
 * Walk the system chunk array stored in the super block copy and hand every
 * chunk item found there to read_one_chunk().
 *
 * The array is a sequence of (btrfs_disk_key, btrfs_chunk) pairs. Each entry
 * is bounds-checked against the array size before it is read.
 *
 * Returns 0 on success, -ENOMEM if the dummy extent buffer cannot be
 * allocated, -EIO for a malformed or truncated array, or the error returned
 * by read_one_chunk().
 */
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
        struct btrfs_super_block *super_copy = fs_info->super_copy;
        struct extent_buffer *sb;
        struct btrfs_disk_key *disk_key;
        struct btrfs_chunk *chunk;
        u8 *array_ptr;
        unsigned long sb_array_offset;
        int ret = 0;
        u32 num_stripes;
        u32 array_size;
        u32 len = 0;
        u32 cur_offset;
        u64 type;
        struct btrfs_key key;

        ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);

        /*
         * We allocated a dummy extent, just to use extent buffer accessors.
         * There will be unused space after BTRFS_SUPER_INFO_SIZE, but
         * that's fine, we will not go beyond system chunk array anyway.
         */
        sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
        if (!sb)
                return -ENOMEM;
        set_extent_buffer_uptodate(sb);

        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
        array_size = btrfs_super_sys_array_size(super_copy);

        /*
         * Three cursors advance in lockstep:
         *   array_ptr       - pointer into the in-memory super block copy
         *   sb_array_offset - offset of the same position inside @sb
         *   cur_offset      - offset relative to the start of the array
         */
        array_ptr = super_copy->sys_chunk_array;
        sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
        cur_offset = 0;

        while (cur_offset < array_size) {
                /* The key is read directly from the super block copy. */
                disk_key = (struct btrfs_disk_key *)array_ptr;
                len = sizeof(*disk_key);
                if (cur_offset + len > array_size)
                        goto out_short_read;

                btrfs_disk_key_to_cpu(&key, disk_key);

                array_ptr += len;
                sb_array_offset += len;
                cur_offset += len;

                if (key.type != BTRFS_CHUNK_ITEM_KEY) {
                        btrfs_err(fs_info,
                            "unexpected item type %u in sys_array at offset %u",
                                  (u32)key.type, cur_offset);
                        ret = -EIO;
                        break;
                }

                /*
                 * Not a real pointer: the extent buffer accessors take the
                 * offset inside @sb disguised as a struct pointer.
                 */
                chunk = (struct btrfs_chunk *)sb_array_offset;
                /*
                 * At least one btrfs_chunk with one stripe must be present,
                 * exact stripe count check comes afterwards
                 */
                len = btrfs_chunk_item_size(1);
                if (cur_offset + len > array_size)
                        goto out_short_read;

                num_stripes = btrfs_chunk_num_stripes(sb, chunk);
                if (!num_stripes) {
                        btrfs_err(fs_info,
                        "invalid number of stripes %u in sys_array at offset %u",
                                  num_stripes, cur_offset);
                        ret = -EIO;
                        break;
                }

                /* Only SYSTEM chunks are accepted in the array. */
                type = btrfs_chunk_type(sb, chunk);
                if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
                        btrfs_err(fs_info,
                        "invalid chunk type %llu in sys_array at offset %u",
                                  type, cur_offset);
                        ret = -EIO;
                        break;
                }

                /* Now that the stripe count is known, check the full size. */
                len = btrfs_chunk_item_size(num_stripes);
                if (cur_offset + len > array_size)
                        goto out_short_read;

                ret = read_one_chunk(&key, sb, chunk);
                if (ret)
                        break;

                array_ptr += len;
                sb_array_offset += len;
                cur_offset += len;
        }
        /* Reached both on success and after a "break" with ret set. */
        clear_extent_buffer_uptodate(sb);
        free_extent_buffer_stale(sb);
        return ret;

out_short_read:
        /* len and cur_offset describe the read that did not fit. */
        btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
                        len, cur_offset);
        clear_extent_buffer_uptodate(sb);
        free_extent_buffer_stale(sb);
        return -EIO;
}

/*
 * Decide whether every chunk of the filesystem still tolerates a read-write
 * degraded mount.
 *
 * A stripe counts as missing when it has no device, the device has no bdev,
 * the device is flagged BTRFS_DEV_STATE_MISSING, the device recorded a flush
 * error, or the device is @failing_dev (when one is given).
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't, or if there is no chunk at all.
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
                                        struct btrfs_device *failing_dev)
{
        struct btrfs_chunk_map *map;

        map = btrfs_find_chunk_map(fs_info, 0, U64_MAX);
        /* No chunk at all? Return false anyway */
        if (!map)
                return false;

        do {
                const int max_tolerated =
                        btrfs_get_num_tolerated_disk_barrier_failures(map->type);
                int missing = 0;
                u64 next_start;
                int idx;

                for (idx = 0; idx < map->num_stripes; idx++) {
                        struct btrfs_device *dev = map->stripes[idx].dev;

                        if (!dev || !dev->bdev ||
                            test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
                            dev->last_flush_error ||
                            (failing_dev && failing_dev == dev))
                                missing++;
                }

                if (missing > max_tolerated) {
                        /* Stay quiet when probing with a hypothetical failure. */
                        if (!failing_dev)
                                btrfs_warn(fs_info,
        "chunk %llu missing %d devices, max tolerance is %d for writable mount",
                                   map->start, missing, max_tolerated);
                        btrfs_free_chunk_map(map);
                        return false;
                }

                /* Continue the walk right after the chunk just checked. */
                next_start = map->start + map->chunk_len;
                btrfs_free_chunk_map(map);

                map = btrfs_find_chunk_map(fs_info, next_start,
                                           U64_MAX - next_start);
        } while (map);

        return true;
}

/* Issue readahead for every child referenced by @node. */
static void readahead_tree_node_children(struct extent_buffer *node)
{
        const int nr_items = btrfs_header_nritems(node);
        int slot = 0;

        while (slot < nr_items) {
                btrfs_readahead_node_child(node, slot);
                slot++;
        }
}

/*
 * Read the whole chunk tree: feed every device item to read_one_dev() and
 * every chunk item to read_one_chunk(), then cross-check the result against
 * the super block.
 *
 * Returns 0 on success, -ENOMEM if no path can be allocated, -EINVAL if the
 * super block total_bytes is smaller than the accumulated total_rw_bytes, or
 * the error reported by the iteration, read_one_dev() or read_one_chunk().
 */
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *root = fs_info->chunk_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        int ret;
        int slot;
        int iter_ret = 0;
        u64 total_dev = 0;
        u64 last_ra_node = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * uuid_mutex is needed only if we are mounting a sprout FS
         * otherwise we don't need it.
         */
        mutex_lock(&uuid_mutex);

        /*
         * It is possible for mount and umount to race in such a way that
         * we execute this code path, but open_fs_devices failed to clear
         * total_rw_bytes. We certainly want it cleared before reading the
         * device items, so clear it here.
         */
        fs_info->fs_devices->total_rw_bytes = 0;

        /*
         * Lockdep complains about possible circular locking dependency between
         * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
         * used for freeze protection of a fs (struct super_block.s_writers),
         * which we take when starting a transaction, and extent buffers of the
         * chunk tree if we call read_one_dev() while holding a lock on an
         * extent buffer of the chunk tree. Since we are mounting the filesystem
         * and at this point there can't be any concurrent task modifying the
         * chunk tree, to keep it simple, just skip locking on the chunk tree.
         */
        ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
        path->skip_locking = 1;

        /*
         * Read all device items, and then all the chunk items. All
         * device items are found before any chunk item (their object id
         * is smaller than the lowest possible object id for a chunk
         * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
         */
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.offset = 0;
        key.type = 0;
        btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
                struct extent_buffer *node = path->nodes[1];

                leaf = path->nodes[0];
                slot = path->slots[0];

                /*
                 * Trigger readahead once per level-1 node: last_ra_node
                 * remembers the start of the node already handled.
                 */
                if (node) {
                        if (last_ra_node != node->start) {
                                readahead_tree_node_children(node);
                                last_ra_node = node->start;
                        }
                }
                if (found_key.type == BTRFS_DEV_ITEM_KEY) {
                        struct btrfs_dev_item *dev_item;
                        dev_item = btrfs_item_ptr(leaf, slot,
                                                  struct btrfs_dev_item);
                        ret = read_one_dev(leaf, dev_item);
                        if (ret)
                                goto error;
                        /* Counted for the num_devices check below. */
                        total_dev++;
                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
                        struct btrfs_chunk *chunk;

                        /*
                         * We are only called at mount time, so no need to take
                         * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
                         * we always lock first fs_info->chunk_mutex before
                         * acquiring any locks on the chunk tree. This is a
                         * requirement for chunk allocation, see the comment on
                         * top of btrfs_chunk_alloc() for details.
                         */
                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
                        ret = read_one_chunk(&found_key, leaf, chunk);
                        if (ret)
                                goto error;
                }
        }
        /* Catch error found during iteration */
        if (iter_ret < 0) {
                ret = iter_ret;
                goto error;
        }

        /*
         * After loading chunk tree, we've got all device information,
         * do another round of validation checks.
         */
        if (total_dev != fs_info->fs_devices->total_devices) {
                /* Not fatal: adopt the counted value in memory. */
                btrfs_warn(fs_info,
"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
                          btrfs_super_num_devices(fs_info->super_copy),
                          total_dev);
                fs_info->fs_devices->total_devices = total_dev;
                btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
        }
        /* Fatal: the super block claims fewer bytes than the devices hold. */
        if (btrfs_super_total_bytes(fs_info->super_copy) <
            fs_info->fs_devices->total_rw_bytes) {
                btrfs_err(fs_info,
        "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
                          btrfs_super_total_bytes(fs_info->super_copy),
                          fs_info->fs_devices->total_rw_bytes);
                ret = -EINVAL;
                goto error;
        }
        ret = 0;
error:
        /* Common exit for success and failure. */
        mutex_unlock(&uuid_mutex);

        btrfs_free_path(path);
        return ret;
}

/*
 * Attach @fs_info to the fs_devices, to each seed fs_devices on its seed_list
 * and to all of their devices, and load the zone info of every seed device.
 *
 * Runs under fs_devices->device_list_mutex.
 *
 * Returns 0 on success, or the first error reported by
 * btrfs_get_dev_zone_info(). Processing stops at that first error.
 */
int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
        struct btrfs_device *device;
        int ret = 0;

        fs_devices->fs_info = fs_info;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list)
                device->fs_info = fs_info;

        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
                list_for_each_entry(device, &seed_devs->devices, dev_list) {
                        device->fs_info = fs_info;
                        ret = btrfs_get_dev_zone_info(device, false);
                        if (ret)
                                break;
                }

                seed_devs->fs_info = fs_info;

                /*
                 * The break above only leaves the inner loop. Stop the outer
                 * walk as well, otherwise the next seed fs_devices would
                 * overwrite ret and the failure would be silently dropped.
                 */
                if (ret)
                        break;
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        return ret;
}

static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
                                 const struct btrfs_dev_stats_item *ptr,
                                 int index)
{
        u64 val;

        read_extent_buffer(eb, &val,
                           offsetof(struct btrfs_dev_stats_item, values) +
                            ((unsigned long)ptr) + (index * sizeof(u64)),
                           sizeof(val));
        return val;
}

/*
 * Store @val as the counter at position @index of the dev stats item located
 * at @ptr inside extent buffer @eb.
 */
static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
                                      struct btrfs_dev_stats_item *ptr,
                                      int index, u64 val)
{
        /* @ptr is used as a plain offset inside @eb. */
        const unsigned long pos =
                offsetof(struct btrfs_dev_stats_item, values) +
                ((unsigned long)ptr) + (index * sizeof(u64));

        write_extent_buffer(eb, &val, pos, sizeof(val));
}

/*
 * Initialize the in-memory statistics of @device from its dev stats item in
 * the device tree, using @path for the lookup. The path is released before
 * returning.
 *
 * Counters that the item does not hold are set to zero. When there is no item
 * at all, or the lookup fails, all counters are zeroed. In every case that
 * reaches the lookup, the stats are marked valid.
 *
 * Returns 0 on success (including "no dev_root" and "item not found"), or the
 * negative errno from btrfs_search_slot().
 */
static int btrfs_device_init_dev_stats(struct btrfs_device *device,
                                       struct btrfs_path *path)
{
        struct btrfs_dev_stats_item *item;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        int item_size;
        int idx, ret;

        if (!device->fs_info->dev_root)
                return 0;

        key.objectid = BTRFS_DEV_STATS_OBJECTID;
        key.type = BTRFS_PERSISTENT_ITEM_KEY;
        key.offset = device->devid;
        ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
        if (ret != 0) {
                /* Error or not found: start from all-zero counters. */
                for (idx = 0; idx < BTRFS_DEV_STAT_VALUES_MAX; idx++)
                        btrfs_dev_stat_set(device, idx, 0);
                device->dev_stats_valid = 1;
                btrfs_release_path(path);
                if (ret < 0)
                        return ret;
                return 0;
        }

        leaf = path->nodes[0];
        item_size = btrfs_item_size(leaf, path->slots[0]);
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_stats_item);

        for (idx = 0; idx < BTRFS_DEV_STAT_VALUES_MAX; idx++) {
                /* A short item only provides its leading counters. */
                if (item_size >= (1 + idx) * sizeof(__le64))
                        btrfs_dev_stat_set(device, idx,
                                           btrfs_dev_stats_value(leaf, item, idx));
                else
                        btrfs_dev_stat_set(device, idx, 0);
        }

        device->dev_stats_valid = 1;
        btrfs_dev_stat_print_on_load(device);
        btrfs_release_path(path);

        return 0;
}

/*
 * Load the device statistics of every device of the filesystem, including
 * the devices of all seed fs_devices, under device_list_mutex.
 *
 * Returns 0 on success, -ENOMEM if no path can be allocated, or the first
 * error returned by btrfs_device_init_dev_stats().
 */
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_fs_devices *seed_devs;
        struct btrfs_device *device;
        struct btrfs_path *path;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        mutex_lock(&fs_devices->device_list_mutex);

        /* Devices of the mounted filesystem first. */
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                ret = btrfs_device_init_dev_stats(device, path);
                if (ret)
                        goto unlock;
        }

        /* Then the devices of every seed filesystem. */
        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
                list_for_each_entry(device, &seed_devs->devices, dev_list) {
                        ret = btrfs_device_init_dev_stats(device, path);
                        if (ret)
                                goto unlock;
                }
        }

unlock:
        mutex_unlock(&fs_devices->device_list_mutex);

        btrfs_free_path(path);
        return ret;
}

/*
 * Write the in-memory statistics of @device into its dev stats item in the
 * device tree, creating the item if it does not exist and replacing it if the
 * existing one is too small.
 *
 * Returns 0 on success, -ENOMEM if no path can be allocated, or the error
 * from the tree search, deletion or insertion.
 */
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
                                struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *dev_root = fs_info->dev_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *eb;
        struct btrfs_dev_stats_item *ptr;
        int ret;
        int i;

        key.objectid = BTRFS_DEV_STATS_OBJECTID;
        key.type = BTRFS_PERSISTENT_ITEM_KEY;
        key.offset = device->devid;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        /* From here on ret is: <0 error, 0 item found, 1 item not found. */
        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
        if (ret < 0) {
                btrfs_warn_in_rcu(fs_info,
                        "error %d while searching for dev_stats item for device %s",
                                  ret, btrfs_dev_name(device));
                goto out;
        }

        if (ret == 0 &&
            btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
                /* need to delete old one and insert a new one */
                ret = btrfs_del_item(trans, dev_root, path);
                if (ret != 0) {
                        btrfs_warn_in_rcu(fs_info,
                                "delete too small dev_stats item for device %s failed %d",
                                          btrfs_dev_name(device), ret);
                        goto out;
                }
                /* Pretend the item was not found so it gets re-inserted. */
                ret = 1;
        }

        if (ret == 1) {
                /* need to insert a new item */
                btrfs_release_path(path);
                ret = btrfs_insert_empty_item(trans, dev_root, path,
                                              &key, sizeof(*ptr));
                if (ret < 0) {
                        btrfs_warn_in_rcu(fs_info,
                                "insert dev_stats item for device %s failed %d",
                                btrfs_dev_name(device), ret);
                        goto out;
                }
        }

        /* The path now points at an item large enough for all counters. */
        eb = path->nodes[0];
        ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                btrfs_set_dev_stats_value(eb, ptr, i,
                                          btrfs_dev_stat_read(device, i));
        btrfs_mark_buffer_dirty(trans, eb);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 *
 * Devices whose stats are not valid yet, or whose change counter is zero, are
 * skipped. After a successful write the change counter is reduced by the
 * value sampled before the write, so that changes made in between are kept
 * for the next run.
 *
 * Returns 0 on success, or the error of the first device whose stats item
 * could not be updated. Processing stops at that device.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
        int stats_cnt;
        int ret = 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                stats_cnt = atomic_read(&device->dev_stats_ccnt);
                if (!device->dev_stats_valid || stats_cnt == 0)
                        continue;

                /*
                 * There is a LOAD-LOAD control dependency between the value of
                 * dev_stats_ccnt and updating the on-disk values which requires
                 * reading the in-memory counters. Such control dependencies
                 * require explicit read memory barriers.
                 *
                 * This memory barrier pairs with smp_mb__before_atomic in
                 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
                 * barrier implied by atomic_xchg in
                 * btrfs_dev_stats_read_and_reset
                 */
                smp_rmb();

                ret = update_dev_stat_item(trans, device);
                /*
                 * Bail out on the first failure. Carrying on would let a later
                 * device overwrite ret with 0 and hide the error from the
                 * caller.
                 */
                if (ret)
                        break;
                atomic_sub(stats_cnt, &device->dev_stats_ccnt);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        return ret;
}

/*
 * Increment the statistics counter @index of @dev and, if the device stats
 * are valid, log all five error counters (rate limited).
 */
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
        btrfs_dev_stat_inc(dev, index);

        if (dev->dev_stats_valid) {
                btrfs_err_rl_in_rcu(dev->fs_info,
                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
                           btrfs_dev_name(dev),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
        }
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
        int i;

        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                if (btrfs_dev_stat_read(dev, i) != 0)
                        break;
        if (i == BTRFS_DEV_STAT_VALUES_MAX)
                return; /* all values == 0, suppress message */

        btrfs_info_in_rcu(dev->fs_info,
                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
               btrfs_dev_name(dev),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

/*
 * Copy the statistics of the device selected by stats->devid into
 * stats->values, filling at most stats->nr_items entries.
 *
 * With BTRFS_DEV_STATS_RESET set in stats->flags, every counter is reset:
 * those that fit in the request are read and reset, the rest are set to zero.
 * On return stats->nr_items is clamped to BTRFS_DEV_STAT_VALUES_MAX.
 *
 * Returns 0 on success, or -ENODEV if the device is not found or its stats
 * are not valid yet.
 */
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
                        struct btrfs_ioctl_get_dev_stats *stats)
{
        BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *dev;
        int idx;

        mutex_lock(&fs_devices->device_list_mutex);
        args.devid = stats->devid;
        dev = btrfs_find_device(fs_info->fs_devices, &args);
        mutex_unlock(&fs_devices->device_list_mutex);

        if (!dev) {
                btrfs_warn(fs_info, "get dev_stats failed, device not found");
                return -ENODEV;
        }

        if (!dev->dev_stats_valid) {
                btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
                return -ENODEV;
        }

        if (stats->flags & BTRFS_DEV_STATS_RESET) {
                for (idx = 0; idx < BTRFS_DEV_STAT_VALUES_MAX; idx++) {
                        if (stats->nr_items > idx)
                                stats->values[idx] =
                                        btrfs_dev_stat_read_and_reset(dev, idx);
                        else
                                btrfs_dev_stat_set(dev, idx, 0);
                }
                btrfs_info(fs_info, "device stats zeroed by %s (%d)",
                           current->comm, task_pid_nr(current));
        } else {
                for (idx = 0; idx < BTRFS_DEV_STAT_VALUES_MAX; idx++) {
                        if (stats->nr_items > idx)
                                stats->values[idx] = btrfs_dev_stat_read(dev, idx);
                }
        }

        /* Report back how many values can actually be provided. */
        if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
                stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;

        return 0;
}

/*
 * Publish the new size and bytes used of every device queued on the
 * transaction's dev_update_list, and take each device off that list.  This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit (TRANS_STATE_COMMIT_DOING).
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
        struct btrfs_device *dev;
        struct btrfs_device *tmp;

        ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

        /* Nothing queued: do not even take the mutex. */
        if (list_empty(&trans->dev_update_list))
                return;

        /*
         * The device_list_mutex is not needed here: the list belongs to the
         * transaction, and the transaction must complete before a device can
         * be released.
         */
        mutex_lock(&trans->fs_info->chunk_mutex);
        list_for_each_entry_safe(dev, tmp, &trans->dev_update_list,
                                 post_commit_list) {
                list_del_init(&dev->post_commit_list);
                dev->commit_total_bytes = dev->disk_total_bytes;
                dev->commit_bytes_used = dev->bytes_used;
        }
        mutex_unlock(&trans->fs_info->chunk_mutex);
}

/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 *
 * Returns the ncopies value of the btrfs_raid_array entry selected by the
 * block group @flags.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
        return btrfs_raid_array[btrfs_bg_flags_to_raid_index(flags)].ncopies;
}



/*
 * Verify a single dev extent against the chunk map it claims to belong to.
 *
 * @chunk_offset:    logical start of the chunk referenced by the dev extent
 * @devid:           id of the device holding the dev extent
 * @physical_offset: start of the dev extent on that device
 * @physical_len:    length of the dev extent
 *
 * On a match, the chunk map's verified_stripes counter is incremented.
 *
 * Returns 0 if the dev extent is consistent, -EUCLEAN otherwise.
 */
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
                                 u64 chunk_offset, u64 devid,
                                 u64 physical_offset, u64 physical_len)
{
        struct btrfs_dev_lookup_args args = { .devid = devid };
        struct btrfs_chunk_map *map;
        struct btrfs_device *dev;
        u64 stripe_len;
        bool found = false;
        int ret = 0;
        int i;

        map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
        if (!map) {
                btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
                          physical_offset, devid);
                ret = -EUCLEAN;
                goto out;
        }

        /* The dev extent must be exactly one stripe of the chunk long. */
        stripe_len = btrfs_calc_stripe_length(map);
        if (physical_len != stripe_len) {
                btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
                          physical_offset, devid, map->start, physical_len,
                          stripe_len);
                ret = -EUCLEAN;
                goto out;
        }

        /*
         * Very old mkfs.btrfs (before v4.1) will not respect the reserved
         * space. Although kernel can handle it without problem, better to warn
         * the users.
         */
        if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
                btrfs_warn(fs_info,
                "devid %llu physical %llu len %llu inside the reserved space",
                           devid, physical_offset, physical_len);

        /* Look for the stripe of the chunk that this dev extent backs. */
        for (i = 0; i < map->num_stripes; i++) {
                if (map->stripes[i].dev->devid == devid &&
                    map->stripes[i].physical == physical_offset) {
                        found = true;
                        /* More dev extents than the chunk has stripes. */
                        if (map->verified_stripes >= map->num_stripes) {
                                btrfs_err(fs_info,
                                "too many dev extents for chunk %llu found",
                                          map->start);
                                ret = -EUCLEAN;
                                goto out;
                        }
                        map->verified_stripes++;
                        break;
                }
        }
        /*
         * No jump to "out" here: ret is recorded and the remaining checks
         * below still run and may log further problems.
         */
        if (!found) {
                btrfs_err(fs_info,
        "dev extent physical offset %llu devid %llu has no corresponding chunk",
                        physical_offset, devid);
                ret = -EUCLEAN;
        }

        /* Make sure no dev extent is beyond device boundary */
        dev = btrfs_find_device(fs_info->fs_devices, &args);
        if (!dev) {
                btrfs_err(fs_info, "failed to find devid %llu", devid);
                ret = -EUCLEAN;
                goto out;
        }

        if (physical_offset + physical_len > dev->disk_total_bytes) {
                btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
                          devid, physical_offset, physical_len,
                          dev->disk_total_bytes);
                ret = -EUCLEAN;
                goto out;
        }

        /* With zone info present, start and length must be zone aligned. */
        if (dev->zone_info) {
                u64 zone_size = dev->zone_info->zone_size;

                if (!IS_ALIGNED(physical_offset, zone_size) ||
                    !IS_ALIGNED(physical_len, zone_size)) {
                        btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
                                  devid, physical_offset, physical_len);
                        ret = -EUCLEAN;
                        goto out;
                }
        }

out:
        /* Also reached with map == NULL when the chunk lookup failed. */
        btrfs_free_chunk_map(map);
        return ret;
}

/*
 * Walk every chunk map in the mapping tree (under the mapping tree read lock)
 * and compare the number of stripes verified for it against its stripe count.
 *
 * Returns 0 if the counts match for every chunk, or -EUCLEAN for the first
 * chunk where they differ.
 */
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
        struct rb_node *cur;
        int err = 0;

        read_lock(&fs_info->mapping_tree_lock);
        cur = rb_first_cached(&fs_info->mapping_tree);
        while (cur) {
                struct btrfs_chunk_map *chunk =
                        rb_entry(cur, struct btrfs_chunk_map, rb_node);

                if (chunk->verified_stripes != chunk->num_stripes) {
                        btrfs_err(fs_info,
                        "chunk %llu has missing dev extent, have %d expect %d",
                                  chunk->start, chunk->verified_stripes,
                                  chunk->num_stripes);
                        err = -EUCLEAN;
                        break;
                }
                cur = rb_next(cur);
        }
        read_unlock(&fs_info->mapping_tree_lock);

        return err;
}

/*
 * Ensure that all dev extents are mapped to correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be of
 * the same size level as the chunk tree.  This slightly increases mount time.
 *
 * Returns 0 on success (or when the check is skipped because of
 * IGNOREBADROOTS), -ENOMEM if no path could be allocated, -EUCLEAN if the
 * tree has no item at or after the search key, if two dev extents on the
 * same device overlap, or if per-extent/per-chunk verification fails, and
 * any negative error returned by the tree search/iteration helpers.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
        struct btrfs_path *path;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        /* End of the previously visited dev extent, used for overlap checks */
        u64 prev_devid = 0;
        u64 prev_dev_ext_end = 0;
        int ret = 0;

        /*
         * We don't have a dev_root because we mounted with ignorebadroots and
         * failed to load the root, so we want to skip the verification in this
         * case for sure.
         *
         * However if the dev root is fine, but the tree itself is corrupted
         * we'd still fail to mount.  This verification is only to make sure
         * writes can happen safely, so instead just bypass this check
         * completely in the case of IGNOREBADROOTS.
         */
        if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
                return 0;

        /* Start the walk at the dev extent key (objectid 1, offset 0) */
        key.objectid = 1;
        key.type = BTRFS_DEV_EXTENT_KEY;
        key.offset = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        path->reada = READA_FORWARD;
        /* Pure lookup: no transaction handle is passed */
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        /* The slot is past the last item of this leaf, move to the next leaf */
        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                ret = btrfs_next_leaf(root, path);
                if (ret < 0)
                        goto out;
                /* No dev extents at all? Not good */
                if (ret > 0) {
                        ret = -EUCLEAN;
                        goto out;
                }
        }
        while (1) {
                struct extent_buffer *leaf = path->nodes[0];
                struct btrfs_dev_extent *dext;
                int slot = path->slots[0];
                u64 chunk_offset;
                u64 physical_offset;
                u64 physical_len;
                u64 devid;

                /* Stop at the first item that is not a dev extent */
                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        break;
                /* The key encodes the device id and the physical start */
                devid = key.objectid;
                physical_offset = key.offset;

                /* The item body carries the owning chunk and the length */
                dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
                chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
                physical_len = btrfs_dev_extent_length(leaf, dext);

                /* Check if this dev extent overlaps with the previous one */
                if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
                        btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
                                  devid, physical_offset, prev_dev_ext_end);
                        ret = -EUCLEAN;
                        goto out;
                }

                ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
                                            physical_offset, physical_len);
                if (ret < 0)
                        goto out;
                /* Remember this extent for the overlap check of the next one */
                prev_devid = devid;
                prev_dev_ext_end = physical_offset + physical_len;

                ret = btrfs_next_item(root, path);
                if (ret < 0)
                        goto out;
                /* A positive return ends the walk without being an error */
                if (ret > 0) {
                        ret = 0;
                        break;
                }
        }

        /* Ensure all chunks have corresponding dev extents */
        ret = verify_chunk_dev_extent_mapping(fs_info);
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
        struct btrfs_swapfile_pin *sp;
        struct rb_node *node;

        spin_lock(&fs_info->swapfile_pins_lock);
        node = fs_info->swapfile_pins.rb_node;
        while (node) {
                sp = rb_entry(node, struct btrfs_swapfile_pin, node);
                if (ptr < sp->ptr)
                        node = node->rb_left;
                else if (ptr > sp->ptr)
                        node = node->rb_right;
                else
                        break;
        }
        spin_unlock(&fs_info->swapfile_pins_lock);
        return node != NULL;
}

/*
 * Thread function that relocates one block group to repair an IO failure.
 *
 * @data: the block group to relocate.  Only its start offset is recorded;
 *        the reference handed in by the creator is dropped right away and
 *        the block group is looked up again once the locks are held.
 *
 * Returns 0 if nothing had to be done or the relocation succeeded, -EBUSY if
 * the balance exclusive operation could not be started, or the negative error
 * from btrfs_may_alloc_data_chunk() / btrfs_relocate_chunk().
 */
static int relocating_repair_kthread(void *data)
{
        struct btrfs_block_group *cache = data;
        struct btrfs_fs_info *fs_info = cache->fs_info;
        u64 target;
        int ret = 0;

        /* Keep only the logical start; release the creator's reference */
        target = cache->start;
        btrfs_put_block_group(cache);

        sb_start_write(fs_info->sb);
        /* Relocation runs as a balance exclusive operation */
        if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
                btrfs_info(fs_info,
                           "zoned: skip relocating block group %llu to repair: EBUSY",
                           target);
                sb_end_write(fs_info->sb);
                return -EBUSY;
        }

        mutex_lock(&fs_info->reclaim_bgs_lock);

        /* Ensure block group still exists */
        cache = btrfs_lookup_block_group(fs_info, target);
        if (!cache)
                goto out;

        /* Nothing to do unless the block group is still flagged for repair */
        if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
                goto out;

        ret = btrfs_may_alloc_data_chunk(fs_info, target);
        if (ret < 0)
                goto out;

        btrfs_info(fs_info,
                   "zoned: relocating block group %llu to repair IO failure",
                   target);
        ret = btrfs_relocate_chunk(fs_info, target);

out:
        /* cache is NULL here when the lookup above failed */
        if (cache)
                btrfs_put_block_group(cache);
        mutex_unlock(&fs_info->reclaim_bgs_lock);
        btrfs_exclop_finish(fs_info);
        sb_end_write(fs_info->sb);

        return ret;
}

/*
 * Start an asynchronous relocation of the block group that contains @logical
 * so that a zoned filesystem can repair it after an IO failure.
 *
 * @fs_info: filesystem the logical address belongs to
 * @logical: logical address inside the block group to relocate
 *
 * Returns false if the filesystem is not zoned.  Returns true otherwise, also
 * when no repair was started because the filesystem is mounted degraded, no
 * block group covers @logical, a repair is already flagged for that block
 * group, or the worker thread could not be created.
 */
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
        struct btrfs_block_group *cache;
        struct task_struct *worker;

        if (!btrfs_is_zoned(fs_info))
                return false;

        /* Do not attempt to repair in degraded state */
        if (btrfs_test_opt(fs_info, DEGRADED))
                return true;

        cache = btrfs_lookup_block_group(fs_info, logical);
        if (!cache)
                return true;

        /* Only one repair per block group: bail out if it is already flagged */
        if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
                btrfs_put_block_group(cache);
                return true;
        }

        /* On success the thread owns the block group reference taken above */
        worker = kthread_run(relocating_repair_kthread, cache,
                             "btrfs-relocating-repair");
        if (IS_ERR(worker)) {
                /*
                 * The thread never ran, so nobody else will drop our
                 * reference.  Clear the flag so a later failure can retry
                 * the repair, then release the block group ourselves.
                 */
                clear_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags);
                btrfs_put_block_group(cache);
        }

        return true;
}

/*
 * Point @smap at the data stripe of @bioc's full stripe that contains
 * @logical.
 *
 * The data stripes are scanned in order; the one whose BTRFS_STRIPE_LEN sized
 * window covers @logical supplies the device, and the physical address is the
 * stripe's physical start plus the offset of @logical inside the stripe.
 */
static void map_raid56_repair_block(struct btrfs_io_context *bioc,
                                    struct btrfs_io_stripe *smap,
                                    u64 logical)
{
        int nr_data = nr_bioc_data_stripes(bioc);
        int idx = 0;
        u64 in_stripe;

        while (idx < nr_data) {
                u64 start = bioc->full_stripe_logical +
                            btrfs_stripe_nr_to_offset(idx);

                if (start <= logical && logical < start + BTRFS_STRIPE_LEN)
                        break;
                idx++;
        }
        /* @logical must fall inside one of the data stripes */
        ASSERT(idx < nr_data);

        in_stripe = (logical - bioc->full_stripe_logical) &
                    BTRFS_STRIPE_LEN_MASK;
        smap->dev = bioc->stripes[idx].dev;
        smap->physical = bioc->stripes[idx].physical + in_stripe;
}

/*
 * Map a repair write into a single device.
 *
 * A repair write is triggered by read time repair or scrub and only updates
 * the contents of a single device.  It neither updates any other mirror nor
 * goes through the RMW path.
 *
 * Callers should ensure:
 *
 * - Call btrfs_bio_counter_inc_blocked() first
 * - The range does not cross stripe boundary
 * - Has a valid @mirror_num passed in.
 *
 * Returns 0 with @smap filled in, or the negative error from
 * btrfs_map_block().
 */
int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
                           struct btrfs_io_stripe *smap, u64 logical,
                           u32 length, int mirror_num)
{
        struct btrfs_io_context *bioc = NULL;
        u64 mapped_len = length;
        int mirror = mirror_num;
        int ret;

        ASSERT(mirror_num > 0);

        ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &mapped_len,
                              &bioc, smap, &mirror);
        if (ret < 0)
                return ret;

        /* The map range should not cross stripe boundary. */
        ASSERT(mapped_len >= length);

        /* Without a bioc the write is already mapped to a single stripe. */
        if (bioc) {
                if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                        /* Map the RAID56 multi-stripe writes to a single one. */
                        map_raid56_repair_block(bioc, smap, logical);
                } else {
                        /* Mirrors are numbered from 1, stripes from 0. */
                        const int idx = mirror_num - 1;

                        ASSERT(mirror_num <= bioc->num_stripes);
                        smap->dev = bioc->stripes[idx].dev;
                        smap->physical = bioc->stripes[idx].physical;
                }
        }

        btrfs_put_bioc(bioc);
        ASSERT(smap->dev);
        return 0;
}

















   12 








   13 




   12 








   18 




   17 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/kernel.h>
#include <linux/nospec.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/wordpart.h>

/* out-of-line parts */

#ifndef INLINE_COPY_FROM_USER
/*
 * Out-of-line copy from user space.
 *
 * Copies @n bytes from @from to @to unless fault injection asks for a failure
 * or access_ok() rejects the range.  Whatever part of @to was not copied is
 * zero-filled.
 *
 * Returns the number of bytes that could not be copied (0 on full success).
 */
unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n)
{
        unsigned long left = n;

        might_fault();
        if (should_fail_usercopy() || unlikely(!access_ok(from, n)))
                goto zero_tail;

        /*
         * Ensure that bad access_ok() speculation will not
         * lead to nasty side effects *after* the copy is
         * finished:
         */
        barrier_nospec();
        instrument_copy_from_user_before(to, from, n);
        left = raw_copy_from_user(to, from, n);
        instrument_copy_from_user_after(to, from, n, left);

zero_tail:
        /* Clear the uncopied tail of the destination buffer */
        if (unlikely(left))
                memset(to + (n - left), 0, left);
        return left;
}
EXPORT_SYMBOL(_copy_from_user);
#endif

#ifndef INLINE_COPY_TO_USER
/*
 * Out-of-line copy to user space.
 *
 * Returns the number of bytes that could not be copied: @n when fault
 * injection asks for a failure or access_ok() rejects the range, otherwise
 * the result of raw_copy_to_user().
 */
unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();
        if (!should_fail_usercopy() && likely(access_ok(to, n))) {
                instrument_copy_to_user(to, from, n);
                return raw_copy_to_user(to, from, n);
        }
        return n;
}
EXPORT_SYMBOL(_copy_to_user);
#endif

/**
 * check_zeroed_user: check if a userspace buffer only contains zero bytes
 * @from: Source address, in userspace.
 * @size: Size of buffer.
 *
 * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for
 * userspace addresses (and is more efficient because we don't care where the
 * first non-zero byte is).
 *
 * Returns:
 *  * 0: There were non-zero bytes present in the buffer.
 *  * 1: The buffer was full of zero bytes.
 *  * -EFAULT: access to userspace failed.
 */
int check_zeroed_user(const void __user *from, size_t size)
{
        unsigned long val;
        /* Distance of @from past the previous unsigned-long boundary */
        uintptr_t align = (uintptr_t) from % sizeof(unsigned long);

        /* An empty buffer counts as all zeroes */
        if (unlikely(size == 0))
                return 1;

        /*
         * Round the start down to a word boundary and grow the size by the
         * same amount, so that every read below is a whole aligned word.
         */
        from -= align;
        size += align;

        if (!user_read_access_begin(from, size))
                return -EFAULT;

        /* On a fault unsafe_get_user() jumps to err_fault */
        unsafe_get_user(val, (unsigned long __user *) from, err_fault);
        /* Ignore the leading bytes that lie before the caller's buffer */
        if (align)
                val &= ~aligned_byte_mask(align);

        /* Scan word by word; the last (possibly partial) word is left in val */
        while (size > sizeof(unsigned long)) {
                if (unlikely(val))
                        goto done;

                from += sizeof(unsigned long);
                size -= sizeof(unsigned long);

                unsafe_get_user(val, (unsigned long __user *) from, err_fault);
        }

        /* Keep only the bytes of the final word that belong to the buffer */
        if (size < sizeof(unsigned long))
                val &= aligned_byte_mask(size);

done:
        user_read_access_end();
        return (val == 0);
err_fault:
        user_read_access_end();
        return -EFAULT;
}
EXPORT_SYMBOL(check_zeroed_user);





















    3 





















































































































































































    6 

    6 











    6 





























































































































































































































































































   10 

   11 









    8 
    2 





    6 
    2 



    7 















   11 
   10 
    9 
   10 














































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/srcu.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

/*
 * Clear all of the marks on an inode when it is being evicted from core.
 *
 * Thin wrapper that hands @inode to fsnotify_clear_marks_by_inode().
 */
void __fsnotify_inode_delete(struct inode *inode)
{
        fsnotify_clear_marks_by_inode(inode);
}
EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);

/*
 * Clear all of the marks attached to @mnt.
 *
 * Thin wrapper that hands @mnt to fsnotify_clear_marks_by_mount().
 * NOTE(review): presumably called when the mount is being torn down, by
 * analogy with __fsnotify_inode_delete() - confirm against the callers.
 */
void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
        fsnotify_clear_marks_by_mount(mnt);
}

/**
 * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
 * @sb: superblock being unmounted.
 *
 * Called during unmount with no locks held, so needs to be safe against
 * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
 */
static void fsnotify_unmount_inodes(struct super_block *sb)
{
        /*
         * iput_inode is the inode handled in the previous iteration.  Its
         * reference is only dropped after the list lock has been released.
         */
        struct inode *inode, *iput_inode = NULL;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                /*
                 * We cannot __iget() an inode in state I_FREEING,
                 * I_WILL_FREE, or I_NEW which is fine because by that point
                 * the inode cannot have any associated watches.
                 */
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /*
                 * If i_count is zero, the inode cannot have any watches and
                 * doing an __iget/iput with SB_ACTIVE clear would actually
                 * evict all inodes with zero i_count from icache which is
                 * unnecessarily violent and may in fact be illegal to do.
                 * However, we should have been called /after/ evict_inodes
                 * removed all zero refcount inodes, in any case.  Test to
                 * be sure.
                 */
                if (!atomic_read(&inode->i_count)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /* Take a reference, then drop both locks for the work below */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&sb->s_inode_list_lock);

                /* Release the previous inode now that no spinlock is held */
                iput(iput_inode);

                /* for each watch, send FS_UNMOUNT and then remove it */
                fsnotify_inode(inode, FS_UNMOUNT);

                fsnotify_inode_delete(inode);

                /* Keep this inode referenced until the next iteration */
                iput_inode = inode;

                cond_resched();
                spin_lock(&sb->s_inode_list_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);

        /* Drop the reference on the last inode that was processed, if any */
        iput(iput_inode);
}

/*
 * Tear down fsnotify state for @sb: notify and clear the watched inodes,
 * clear the marks on the superblock itself, then wait until the count of
 * watched objects drops to zero.
 *
 * Does nothing if @sb has no fsnotify info.
 */
void fsnotify_sb_delete(struct super_block *sb)
{
        /* No sb info means no marks were ever added to objects on this sb */
        if (!fsnotify_sb_info(sb))
                return;

        fsnotify_unmount_inodes(sb);
        fsnotify_clear_marks_by_sb(sb);

        /* Wait for outstanding object references from connectors */
        wait_var_event(fsnotify_sb_watched_objects(sb),
                       !atomic_long_read(fsnotify_sb_watched_objects(sb)));

        /* No content or pre-content priority watchers are expected by now */
        WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT));
        WARN_ON(fsnotify_sb_has_priority_watchers(sb,
                                                  FSNOTIFY_PRIO_PRE_CONTENT));
}

/* Free the fsnotify info attached to @sb (sb->s_fsnotify_info). */
void fsnotify_sb_free(struct super_block *sb)
{
        kfree(sb->s_fsnotify_info);
}

/*
 * Propagate a directory's interest in child events to its children.
 *
 * Inotify and dnotify both tell parents about events on their children.  For
 * a directory @inode, every child dentry with an inode gets
 * DCACHE_FSNOTIFY_PARENT_WATCHED set when the directory watches its children
 * and cleared when it does not, so that an event on a child can cheaply tell
 * whether the parent has to be looked up and notified.
 *
 * Non-directories are ignored.
 */
void __fsnotify_update_child_dentry_flags(struct inode *inode)
{
        struct dentry *dir_dentry;
        int want_events;

        if (!S_ISDIR(inode->i_mode))
                return;

        /* determine if the children should tell inode about their events */
        want_events = fsnotify_inode_watches_children(inode);

        spin_lock(&inode->i_lock);
        /*
         * Run all of the dentries associated with this inode.  Since this is
         * a directory, there should only be one item on this list.
         */
        hlist_for_each_entry(dir_dentry, &inode->i_dentry, d_u.d_alias) {
                struct dentry *kid;

                /*
                 * Visit every child of the directory and fix its d_flags to
                 * reflect the parent's interest.
                 */
                spin_lock(&dir_dentry->d_lock);
                hlist_for_each_entry(kid, &dir_dentry->d_children, d_sib) {
                        /* Children without an inode are skipped */
                        if (kid->d_inode) {
                                spin_lock_nested(&kid->d_lock, DENTRY_D_LOCK_NESTED);
                                if (!want_events)
                                        kid->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
                                else
                                        kid->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
                                spin_unlock(&kid->d_lock);
                        }
                }
                spin_unlock(&dir_dentry->d_lock);
        }
        spin_unlock(&inode->i_lock);
}

/*
 * Are inode/sb/mount interested in parent and name info with this event?
 *
 * Returns true if @mask intersects the events for which the inode, its
 * superblock or the mount (@mnt_mask) asked for parent/name info.  Events on
 * directories never qualify.
 */
static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
                                        __u32 mask)
{
        __u32 interested;

        /* We only send parent/name to inode/sb/mount for events on non-dir */
        if (mask & FS_ISDIR)
                return false;

        /*
         * All events that are possible on a child may also be reported with
         * parent/name info to inode/sb/mount.  Otherwise, a watching parent
         * could result in events reported with unexpected name info to
         * sb/mount.
         */
        BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT);

        /* Did either inode/sb/mount subscribe for events with parent/name? */
        interested = fsnotify_parent_needed_mask(inode->i_fsnotify_mask);
        interested |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask);
        interested |= fsnotify_parent_needed_mask(mnt_mask);

        /* Did they subscribe for this event with parent/name info? */
        return mask & interested;
}

/*
 * Are there any inode/mount/sb objects that are interested in this event?
 *
 * Returns true if @mask shares an event bit (within ALL_FSNOTIFY_EVENTS) with
 * the combined masks of the inode, its superblock and the mount.
 */
static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
                                           __u32 mask)
{
        __u32 watched = mnt_mask;

        watched |= inode->i_fsnotify_mask;
        watched |= inode->i_sb->s_fsnotify_mask;
        watched &= ALL_FSNOTIFY_EVENTS;

        return mask & watched;
}

/*
 * Notify this dentry's parent about a child's events with child name info
 * if parent is watching or if inode/sb/mount are interested in events with
 * parent and name info.
 *
 * Notify only the child without name info if parent is not watching and
 * inode/sb/mount are not interested in events with parent and name info.
 */
int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
                      int data_type)
{
        const struct path *path = fsnotify_data_path(data, data_type);
        /* A mount mask is only available when the event data carries a path */
        __u32 mnt_mask = path ? real_mount(path->mnt)->mnt_fsnotify_mask : 0;
        struct inode *inode = d_inode(dentry);
        struct dentry *parent;
        bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED;
        bool parent_needed, parent_interested;
        __u32 p_mask;
        struct inode *p_inode = NULL;
        struct name_snapshot name;
        /* Stays NULL unless a name snapshot is taken below */
        struct qstr *file_name = NULL;
        int ret = 0;

        /* Optimize the likely case of nobody watching this path */
        if (likely(!parent_watched &&
                   !fsnotify_object_watched(inode, mnt_mask, mask)))
                return 0;

        parent = NULL;
        parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask);
        /* No reason to look at the parent: notify the child only */
        if (!parent_watched && !parent_needed)
                goto notify;

        /* Does parent inode care about events on children? */
        parent = dget_parent(dentry);
        p_inode = parent->d_inode;
        p_mask = fsnotify_inode_watches_children(p_inode);
        /*
         * The dentry flag claims the parent is watching but the parent inode
         * does not watch children: refresh the child dentry flags.
         */
        if (unlikely(parent_watched && !p_mask))
                __fsnotify_update_child_dentry_flags(p_inode);

        /*
         * Include parent/name in notification either if some notification
         * groups require parent info or the parent is interested in this event.
         */
        parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS;
        if (parent_needed || parent_interested) {
                /* When notifying parent, child should be passed as data */
                WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));

                /* Notify both parent and child with child name info */
                take_dentry_name_snapshot(&name, dentry);
                file_name = &name.name;
                if (parent_interested)
                        mask |= FS_EVENT_ON_CHILD;
        }

notify:
        /* When jumping here directly, parent, p_inode and file_name are NULL */
        ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0);

        /* file_name is only set when a name snapshot was taken */
        if (file_name)
                release_dentry_name_snapshot(&name);
        dput(parent);

        return ret;
}
EXPORT_SYMBOL_GPL(__fsnotify_parent);

/*
 * Deliver one event to @inode_mark through the group's handle_inode_event()
 * callback.
 *
 * Returns 0 without calling the backend when the callback is missing, when
 * there is neither an inode nor a dir, when the mark excludes unlinked
 * objects and the event's path is unlinked, or when the mark is not
 * interested in @mask.  Otherwise returns the callback's result.
 */
static int fsnotify_handle_inode_event(struct fsnotify_group *group,
                                       struct fsnotify_mark *inode_mark,
                                       u32 mask, const void *data, int data_type,
                                       struct inode *dir, const struct qstr *name,
                                       u32 cookie)
{
        const struct fsnotify_ops *ops = group->ops;
        const struct path *path = fsnotify_data_path(data, data_type);
        struct inode *inode = fsnotify_data_inode(data, data_type);
        bool excl_unlink;

        if (WARN_ON_ONCE(!ops->handle_inode_event))
                return 0;

        if (WARN_ON_ONCE(!(inode || dir)))
                return 0;

        excl_unlink = inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK;
        if (excl_unlink && path && d_unlinked(path->dentry))
                return 0;

        /* Check interest of this mark in case event was sent with two marks */
        if (mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)
                return ops->handle_inode_event(inode_mark, mask, inode, dir,
                                               name, cookie);

        return 0;
}

/*
 * Default event handler for groups without their own handle_event() callback.
 *
 * Reports the event to the parent mark first (if any) and then to the inode
 * mark (if any).  Superblock and mount marks are not expected here.
 *
 * Returns the first non-zero result from the parent mark, otherwise the
 * result for the inode mark, or 0 if the event is not delivered.
 */
static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
                                 const void *data, int data_type,
                                 struct inode *dir, const struct qstr *name,
                                 u32 cookie, struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_mark *child_mark = fsnotify_iter_inode_mark(iter_info);
        struct fsnotify_mark *dir_mark = fsnotify_iter_parent_mark(iter_info);
        int err;

        if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)))
                return 0;
        if (WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info)))
                return 0;

        /*
         * For FS_RENAME, 'dir' is old dir and 'data' is new dentry.
         * The only ->handle_inode_event() backend that supports FS_RENAME is
         * dnotify, where it means file was renamed within same parent.
         */
        if (mask & FS_RENAME) {
                struct dentry *moved = fsnotify_data_dentry(data, data_type);

                if (moved->d_parent->d_inode != dir)
                        return 0;
        }

        if (dir_mark) {
                err = fsnotify_handle_inode_event(group, dir_mark, mask,
                                                  data, data_type, dir, name, 0);
                if (err)
                        return err;
        }

        if (!child_mark)
                return 0;

        /*
         * Some events can be sent on both parent dir and child marks
         * (e.g. FS_ATTRIB).  If both parent dir and child are watching,
         * report the event once to parent dir with name (if interested) and
         * once to child without name (if interested).  The child watcher is
         * expecting an event without a file name and without the
         * FS_EVENT_ON_CHILD flag.
         */
        if (mask & FS_EVENT_ON_CHILD) {
                mask &= ~FS_EVENT_ON_CHILD;
                name = NULL;
                dir = NULL;
        }

        return fsnotify_handle_inode_event(group, child_mark, mask, data,
                                           data_type, dir, name, cookie);
}

/*
 * Send one event to the group whose marks are currently selected in
 * @iter_info.
 *
 * The masks and ignore masks of the selected marks are combined; the event is
 * passed on only if at least one of its event bits is wanted and not ignored.
 * Delivery uses the group's handle_event() callback when it has one and
 * fsnotify_handle_event() otherwise.
 *
 * Returns 0 when nothing is reported, otherwise the handler's result.
 */
static int send_to_group(__u32 mask, const void *data, int data_type,
                         struct inode *dir, const struct qstr *file_name,
                         u32 cookie, struct fsnotify_iter_info *iter_info)
{
        __u32 event_bits = (mask & ALL_FSNOTIFY_EVENTS);
        bool is_dir = mask & FS_ISDIR;
        struct fsnotify_group *group = NULL;
        struct fsnotify_mark *mark;
        __u32 wanted = 0;
        __u32 ignored = 0;
        int type;

        if (!iter_info->report_mask)
                return 0;

        /* clear ignored on inode modification */
        if (mask & FS_MODIFY) {
                fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
                        if (!(mark->flags &
                              FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
                                mark->ignore_mask = 0;
                }
        }

        /* Are any of the group marks interested in this event? */
        fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
                group = mark->group;
                wanted |= mark->mask;
                ignored |= fsnotify_effective_ignore_mask(mark, is_dir, type);
        }

        pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
                 __func__, group, mask, wanted, ignored,
                 data, data_type, dir, cookie);

        if (!(event_bits & wanted & ~ignored))
                return 0;

        /* Fall back to the generic handler for groups without handle_event() */
        if (!group->ops->handle_event)
                return fsnotify_handle_event(group, mask, data, data_type, dir,
                                             file_name, cookie, iter_info);

        return group->ops->handle_event(group, mask, data, data_type, dir,
                                        file_name, cookie, iter_info);
}

/*
 * Return the first mark on the connector that @connp points to, or NULL if
 * there is no connector or its mark list is empty.  Both pointers are read
 * with srcu_dereference() against fsnotify_mark_srcu.
 */
static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp)
{
        struct fsnotify_mark_connector *conn =
                srcu_dereference(*connp, &fsnotify_mark_srcu);
        struct hlist_node *first;

        if (!conn)
                return NULL;

        first = srcu_dereference(conn->list.first, &fsnotify_mark_srcu);

        return hlist_entry_safe(first, struct fsnotify_mark, obj_list);
}

/*
 * Return the mark that follows @mark on its object list, or NULL when @mark
 * is NULL or is the last entry.
 */
static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
{
        struct hlist_node *next;

        if (!mark)
                return NULL;

        next = srcu_dereference(mark->obj_list.next, &fsnotify_mark_srcu);

        return hlist_entry_safe(next, struct fsnotify_mark, obj_list);
}

/*
 * iter_info is a multi head priority queue of marks: one head per iter type.
 *
 * Step 1: among the groups of all non-NULL heads, pick the one that
 *         fsnotify_compare_groups() ranks first and store it in
 *         iter_info->current_group.
 * Step 2: rebuild iter_info->report_mask so it names the heads whose mark
 *         belongs to that group.  A head of type PARENT is left out when
 *         neither its mask nor its ignore mask has FS_EVENT_ON_CHILD.
 *
 * Returns false if every head is NULL, i.e. there are no more groups to
 * iterate; true otherwise.
 */
static bool fsnotify_iter_select_report_types(
                struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_group *best = NULL;
        struct fsnotify_mark *head;
        int type;

        /* Step 1: find the max prio group among all queue heads */
        fsnotify_foreach_iter_type(type) {
                head = iter_info->marks[type];
                if (!head)
                        continue;
                if (fsnotify_compare_groups(best, head->group) > 0)
                        best = head->group;
        }

        if (!best)
                return false;

        /* Step 2: report only the heads owned by the selected group */
        iter_info->current_group = best;
        iter_info->report_mask = 0;
        fsnotify_foreach_iter_type(type) {
                head = iter_info->marks[type];
                if (!head || head->group != iter_info->current_group)
                        continue;

                /*
                 * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode
                 * is watching children and interested in this event,
                 * which is an event possible on child.
                 * But is *this mark* watching children?
                 */
                if (type == FSNOTIFY_ITER_TYPE_PARENT &&
                    !(head->mask & FS_EVENT_ON_CHILD) &&
                    !(fsnotify_ignore_mask(head) & FS_EVENT_ON_CHILD))
                        continue;

                fsnotify_iter_set_report_type(iter_info, type);
        }

        return true;
}

/*
 * Advance every queue head of iter_info whose mark belongs to
 * iter_info->current_group to the next mark on its list, i.e. pop the marks
 * of the group handled in the current iteration step.
 */
static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_mark *head;
        int type;

        /*
         * Walk all iter types rather than fsnotify_foreach_iter_mark_type():
         * a mark of type X that belongs to current_group must be advanced
         * even when it was not selected for reporting.
         */
        fsnotify_foreach_iter_type(type) {
                head = iter_info->marks[type];
                if (!head || head->group != iter_info->current_group)
                        continue;

                iter_info->marks[type] = fsnotify_next_mark(head);
        }
}

/*
 * fsnotify - This is the main call to fsnotify.
 *
 * The VFS calls into hook specific functions in linux/fsnotify.h.
 * Those functions then in turn call here.  Here will call out to all of the
 * registered fsnotify_group.  Those groups can then use the notification event
 * in whatever means they feel necessary.
 *
 * @mask:        event type and flags
 * @data:        object that event happened on
 * @data_type:        type of object for fanotify_data_XXX() accessors
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to
 * @file_name:        optional file name associated with event
 * @inode:        optional inode associated with event -
 *                If @dir and @inode are both non-NULL, event may be
 *                reported to both.
 * @cookie:        inotify rename cookie
 *
 * Returns 0, except when @mask contains an ALL_FSNOTIFY_PERM_EVENTS bit: then
 * the first non-zero result of send_to_group() stops the iteration and is
 * returned to the caller.
 */
int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
             const struct qstr *file_name, struct inode *inode, u32 cookie)
{
        const struct path *path = fsnotify_data_path(data, data_type);
        struct super_block *sb = fsnotify_data_sb(data, data_type);
        struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
        struct fsnotify_iter_info iter_info = {};
        struct mount *mnt = NULL;
        struct inode *inode2 = NULL;
        struct dentry *moved;
        /* Only assigned on the paths that also set inode2; read only if inode2 */
        int inode2_type;
        int ret = 0;
        __u32 test_mask, marks_mask;

        /* Mount marks are considered only when the event data yields a path */
        if (path)
                mnt = real_mount(path->mnt);

        if (!inode) {
                /* Dirent event - report on TYPE_INODE to dir */
                inode = dir;
                /* For FS_RENAME, inode is old_dir and inode2 is new_dir */
                if (mask & FS_RENAME) {
                        moved = fsnotify_data_dentry(data, data_type);
                        inode2 = moved->d_parent->d_inode;
                        inode2_type = FSNOTIFY_ITER_TYPE_INODE2;
                }
        } else if (mask & FS_EVENT_ON_CHILD) {
                /*
                 * Event on child - report on TYPE_PARENT to dir if it is
                 * watching children and on TYPE_INODE to child.
                 */
                inode2 = dir;
                inode2_type = FSNOTIFY_ITER_TYPE_PARENT;
        }

        /*
         * Optimization: srcu_read_lock() has a memory barrier which can
         * be expensive.  It protects walking the *_fsnotify_marks lists.
         * However, if we do not walk the lists, we do not have to do
         * SRCU because we have no references to any objects and do not
         * need SRCU to keep them "alive".
         */
        if ((!sbinfo || !sbinfo->sb_marks) &&
            (!mnt || !mnt->mnt_fsnotify_marks) &&
            (!inode || !inode->i_fsnotify_marks) &&
            (!inode2 || !inode2->i_fsnotify_marks))
                return 0;

        /* Union of the interest masks of every object this event touches */
        marks_mask = sb->s_fsnotify_mask;
        if (mnt)
                marks_mask |= mnt->mnt_fsnotify_mask;
        if (inode)
                marks_mask |= inode->i_fsnotify_mask;
        if (inode2)
                marks_mask |= inode2->i_fsnotify_mask;


        /*
         * If this is a modify event we may need to clear some ignore masks.
         * In that case, the object with ignore masks will have the FS_MODIFY
         * event in its mask.
         * Otherwise, return if none of the marks care about this type of event.
         */
        test_mask = (mask & ALL_FSNOTIFY_EVENTS);
        if (!(test_mask & marks_mask))
                return 0;

        /* From here on the mark lists are walked under SRCU; exit via "out" */
        iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);

        /* Seed one queue head per object type with that object's first mark */
        if (sbinfo) {
                iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
                        fsnotify_first_mark(&sbinfo->sb_marks);
        }
        if (mnt) {
                iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] =
                        fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
        }
        if (inode) {
                iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] =
                        fsnotify_first_mark(&inode->i_fsnotify_marks);
        }
        if (inode2) {
                iter_info.marks[inode2_type] =
                        fsnotify_first_mark(&inode2->i_fsnotify_marks);
        }

        /*
         * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
         * ignore masks are properly reflected for mount/sb mark notifications.
         * That's why this traversal is so complicated...
         */
        while (fsnotify_iter_select_report_types(&iter_info)) {
                ret = send_to_group(mask, data, data_type, dir, file_name,
                                    cookie, &iter_info);

                /* Only permission events propagate a group's non-zero result */
                if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
                        goto out;

                fsnotify_iter_next(&iter_info);
        }
        /* All groups visited: any per-group result is discarded */
        ret = 0;
out:
        srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx);

        return ret;
}
EXPORT_SYMBOL_GPL(fsnotify);

/*
 * Boot-time setup, registered via core_initcall(): initialise the SRCU domain
 * that protects the mark lists and create the slab cache for mark connectors.
 * Always returns 0; failure of either step panics.
 */
static __init int fsnotify_init(void)
{
        /* Build-time guard: ALL_FSNOTIFY_BITS must have exactly 23 bits set */
        BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);

        if (init_srcu_struct(&fsnotify_mark_srcu))
                panic("initializing fsnotify_mark_srcu");

        fsnotify_mark_connector_cachep =
                KMEM_CACHE(fsnotify_mark_connector, SLAB_PANIC);

        return 0;
}
core_initcall(fsnotify_init);






























   28 





   27 









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LOCAL_LOCK_H
# error "Do not include directly, include linux/local_lock.h"
#endif

#include <linux/percpu-defs.h>
#include <linux/lockdep.h>

#ifndef CONFIG_PREEMPT_RT

/*
 * Non-PREEMPT_RT local lock type.  It carries state only when
 * CONFIG_DEBUG_LOCK_ALLOC is enabled (a lockdep map plus the owning task);
 * otherwise the structure has no members.
 */
typedef struct {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
        struct task_struct        *owner;
#endif
} local_lock_t;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Designated initialisers for the debug members of local_lock_t: the lockdep
 * map is named after @lockname and typed as a per-CPU lock, and no owner is
 * recorded.  Expands to a trailing-comma list for use inside a brace
 * initialiser (see INIT_LOCAL_LOCK()).
 */
# define LOCAL_LOCK_DEBUG_INIT(lockname)                \
        .dep_map = {                                        \
                .name = #lockname,                        \
                .wait_type_inner = LD_WAIT_CONFIG,        \
                .lock_type = LD_LOCK_PERCPU,                \
        },                                                \
        .owner = NULL,

/*
 * Debug acquire hook: report the acquisition to lockdep, warn through
 * DEBUG_LOCKS_WARN_ON() if an owner is already recorded, then record
 * current as the owner.
 */
static inline void local_lock_acquire(local_lock_t *l)
{
        lock_map_acquire(&l->dep_map);
        DEBUG_LOCKS_WARN_ON(l->owner);
        l->owner = current;
}

/*
 * Debug release hook: warn through DEBUG_LOCKS_WARN_ON() unless current is
 * the recorded owner, clear the owner, then report the release to lockdep.
 */
static inline void local_lock_release(local_lock_t *l)
{
        DEBUG_LOCKS_WARN_ON(l->owner != current);
        l->owner = NULL;
        lock_map_release(&l->dep_map);
}

/* Debug init hook: start with no recorded owner. */
static inline void local_lock_debug_init(local_lock_t *l)
{
        l->owner = NULL;
}
#else /* CONFIG_DEBUG_LOCK_ALLOC */
/*
 * Without CONFIG_DEBUG_LOCK_ALLOC there is no debug state to maintain: the
 * initialiser expands to nothing and the hooks are empty.
 */
# define LOCAL_LOCK_DEBUG_INIT(lockname)
static inline void local_lock_acquire(local_lock_t *l) { }
static inline void local_lock_release(local_lock_t *l) { }
static inline void local_lock_debug_init(local_lock_t *l) { }
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */

/* Static initialiser for a local_lock_t named @lockname. */
#define INIT_LOCAL_LOCK(lockname)        { LOCAL_LOCK_DEBUG_INIT(lockname) }

/*
 * Runtime initialiser: each expansion site gets its own static
 * lock_class_key, which is used to set up the lock's lockdep map as a per-CPU
 * lock before the debug owner field is reset.
 */
#define __local_lock_init(lock)                                        \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        debug_check_no_locks_freed((void *)lock, sizeof(*lock));\
        lockdep_init_map_type(&(lock)->dep_map, #lock, &__key,  \
                              0, LD_WAIT_CONFIG, LD_WAIT_INV,        \
                              LD_LOCK_PERCPU);                        \
        local_lock_debug_init(lock);                                \
} while (0)

/*
 * Lock side (non-PREEMPT_RT).  Each variant first establishes its protection
 * (preemption off, interrupts off, or interrupts saved and off) and then runs
 * the acquire hook on this CPU's instance of @lock.
 */
#define __local_lock(lock)                                        \
        do {                                                        \
                preempt_disable();                                \
                local_lock_acquire(this_cpu_ptr(lock));                \
        } while (0)

#define __local_lock_irq(lock)                                        \
        do {                                                        \
                local_irq_disable();                                \
                local_lock_acquire(this_cpu_ptr(lock));                \
        } while (0)

#define __local_lock_irqsave(lock, flags)                        \
        do {                                                        \
                local_irq_save(flags);                                \
                local_lock_acquire(this_cpu_ptr(lock));                \
        } while (0)

/*
 * Unlock side (non-PREEMPT_RT).  Mirror image of the lock variants: the
 * release hook runs first, then the matching protection is dropped.
 */
#define __local_unlock(lock)                                        \
        do {                                                        \
                local_lock_release(this_cpu_ptr(lock));                \
                preempt_enable();                                \
        } while (0)

#define __local_unlock_irq(lock)                                \
        do {                                                        \
                local_lock_release(this_cpu_ptr(lock));                \
                local_irq_enable();                                \
        } while (0)

#define __local_unlock_irqrestore(lock, flags)                        \
        do {                                                        \
                local_lock_release(this_cpu_ptr(lock));                \
                local_irq_restore(flags);                        \
        } while (0)

#else /* !CONFIG_PREEMPT_RT */

/*
 * On PREEMPT_RT local_lock maps to a per CPU spinlock, which protects the
 * critical section while staying preemptible.
 */
typedef spinlock_t local_lock_t;

#define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))

#define __local_lock_init(l)                                        \
        do {                                                        \
                local_spin_lock_init((l));                        \
        } while (0)

/* Disable migration, then take this CPU's spinlock instance. */
#define __local_lock(__lock)                                        \
        do {                                                        \
                migrate_disable();                                \
                spin_lock(this_cpu_ptr((__lock)));                \
        } while (0)

/* The _irq variant is the plain lock here; it does not touch interrupts. */
#define __local_lock_irq(lock)                        __local_lock(lock)

/*
 * The _irqsave variant type-checks @flags and sets it to 0, then takes the
 * plain lock; no interrupt state is saved.
 */
#define __local_lock_irqsave(lock, flags)                        \
        do {                                                        \
                typecheck(unsigned long, flags);                \
                flags = 0;                                        \
                __local_lock(lock);                                \
        } while (0)

/* Drop this CPU's spinlock instance, then re-enable migration. */
#define __local_unlock(__lock)                                        \
        do {                                                        \
                spin_unlock(this_cpu_ptr((__lock)));                \
                migrate_enable();                                \
        } while (0)

/* Both interrupt-flavoured unlocks are the plain unlock; @flags is unused. */
#define __local_unlock_irq(lock)                __local_unlock(lock)

#define __local_unlock_irqrestore(lock, flags)        __local_unlock(lock)

#endif /* CONFIG_PREEMPT_RT */























































































































    1 







    1 







    1 
    1 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
// SPDX-License-Identifier: GPL-2.0
#include "cgroup-internal.h"

#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/nsproxy.h>
#include <linux/proc_ns.h>


/* cgroup namespaces */

/*
 * Charge one cgroup namespace (UCOUNT_CGROUP_NAMESPACES) to the current
 * effective uid in user namespace @ns.  Returns the result of inc_ucount();
 * the caller in this file treats NULL as failure.
 */
static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
{
        return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
}

/* Undo inc_cgroup_namespaces(): uncharge one cgroup namespace from @ucounts. */
static void dec_cgroup_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
}

/*
 * Allocate a zeroed cgroup namespace, give it an inode number, set its
 * reference count to 1 and attach cgroupns_operations.
 *
 * Returns the new namespace, ERR_PTR(-ENOMEM) if the allocation fails, or
 * ERR_PTR() of the ns_alloc_inum() error.
 */
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
        struct cgroup_namespace *cgns;
        int err;

        cgns = kzalloc(sizeof(*cgns), GFP_KERNEL_ACCOUNT);
        if (!cgns)
                return ERR_PTR(-ENOMEM);

        err = ns_alloc_inum(&cgns->ns);
        if (err)
                goto free_ns;

        refcount_set(&cgns->ns.count, 1);
        cgns->ns.ops = &cgroupns_operations;

        return cgns;

free_ns:
        kfree(cgns);
        return ERR_PTR(err);
}

/*
 * Release everything a cgroup namespace holds and free it: the root css_set
 * reference, the ucount charge, the user namespace reference and the inode
 * number.  These are the resources copy_cgroup_ns() and alloc_cgroup_ns()
 * attach to a new namespace.
 */
void free_cgroup_ns(struct cgroup_namespace *ns)
{
        put_css_set(ns->root_cset);
        dec_cgroup_namespaces(ns->ucounts);
        put_user_ns(ns->user_ns);
        ns_free_inum(&ns->ns);
        kfree(ns);
}
EXPORT_SYMBOL(free_cgroup_ns);

/*
 * Produce the cgroup namespace for a new task or an unshare request.
 *
 * @flags:   clone flags; only CLONE_NEWCGROUP is examined
 * @user_ns: user namespace that will own a newly created namespace
 * @old_ns:  the caller's current cgroup namespace (must not be NULL)
 *
 * Without CLONE_NEWCGROUP, takes a reference on @old_ns and returns it.
 * Otherwise returns a new namespace rooted at the current task's css_set, or
 * ERR_PTR(-EPERM) without CAP_SYS_ADMIN in @user_ns, ERR_PTR(-ENOSPC) when
 * the ucount charge fails, or the error from alloc_cgroup_ns().
 */
struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
                                        struct user_namespace *user_ns,
                                        struct cgroup_namespace *old_ns)
{
        struct cgroup_namespace *ns;
        struct ucounts *charge;
        struct css_set *root;

        BUG_ON(!old_ns);

        /* No new namespace requested: share the existing one */
        if (!(flags & CLONE_NEWCGROUP)) {
                get_cgroup_ns(old_ns);
                return old_ns;
        }

        /* Allow only sysadmin to create cgroup namespace. */
        if (!ns_capable(user_ns, CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);

        charge = inc_cgroup_namespaces(user_ns);
        if (!charge)
                return ERR_PTR(-ENOSPC);

        /* It is not safe to take cgroup_mutex here */
        spin_lock_irq(&css_set_lock);
        root = task_css_set(current);
        get_css_set(root);
        spin_unlock_irq(&css_set_lock);

        ns = alloc_cgroup_ns();
        if (!IS_ERR(ns)) {
                ns->user_ns = get_user_ns(user_ns);
                ns->ucounts = charge;
                ns->root_cset = root;
                return ns;
        }

        /* Allocation failed: drop the css_set reference and the charge */
        put_css_set(root);
        dec_cgroup_namespaces(charge);

        return ns;
}

/* Map an embedded ns_common back to its containing cgroup_namespace. */
static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
{
        return container_of(ns, struct cgroup_namespace, ns);
}

/*
 * ->install() handler: switch @nsset's nsproxy to the cgroup namespace behind
 * @ns.
 *
 * Requires CAP_SYS_ADMIN both in the user namespace of @nsset's credentials
 * and in the user namespace owning the target; returns -EPERM otherwise.
 * Returns 0 on success, including when the target is already installed.
 */
static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct nsproxy *proxy = nsset->nsproxy;
        struct cgroup_namespace *target = to_cg_ns(ns);

        if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
                return -EPERM;
        if (!ns_capable(target->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        /* Don't need to do anything if we are attaching to our own cgroupns. */
        if (proxy->cgroup_ns != target) {
                get_cgroup_ns(target);
                put_cgroup_ns(proxy->cgroup_ns);
                proxy->cgroup_ns = target;
        }

        return 0;
}

/*
 * ->get() handler: under task_lock(), take a reference on @task's cgroup
 * namespace.  Returns its ns_common, or NULL when @task has no nsproxy.
 */
static struct ns_common *cgroupns_get(struct task_struct *task)
{
        struct cgroup_namespace *cgns = NULL;
        struct nsproxy *proxy;

        task_lock(task);
        proxy = task->nsproxy;
        if (proxy) {
                cgns = proxy->cgroup_ns;
                get_cgroup_ns(cgns);
        }
        task_unlock(task);

        if (!cgns)
                return NULL;

        return &cgns->ns;
}

/* ->put() handler: drop one reference on the cgroup namespace behind @ns. */
static void cgroupns_put(struct ns_common *ns)
{
        put_cgroup_ns(to_cg_ns(ns));
}

/*
 * ->owner() handler: return the user namespace recorded in the cgroup
 * namespace behind @ns.  No reference is taken here.
 */
static struct user_namespace *cgroupns_owner(struct ns_common *ns)
{
        return to_cg_ns(ns)->user_ns;
}

/*
 * Namespace operations table for cgroup namespaces; alloc_cgroup_ns() stores
 * a pointer to it in every new namespace's ns.ops.
 */
const struct proc_ns_operations cgroupns_operations = {
        .name                = "cgroup",
        .type                = CLONE_NEWCGROUP,
        .get                = cgroupns_get,
        .put                = cgroupns_put,
        .install        = cgroupns_install,
        .owner                = cgroupns_owner,
};






















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __SHMEM_FS_H
#define __SHMEM_FS_H

#include <linux/file.h>
#include <linux/swap.h>
#include <linux/mempolicy.h>
#include <linux/pagemap.h>
#include <linux/percpu_counter.h>
#include <linux/xattr.h>
#include <linux/fs_parser.h>
#include <linux/userfaultfd_k.h>

/* inode in-kernel data */

#ifdef CONFIG_TMPFS_QUOTA
#define SHMEM_MAXQUOTAS 2
#endif

/*
 * Per-inode shmem state.  The VFS inode is embedded as the vfs_inode member,
 * so SHMEM_I() can recover this structure from a struct inode pointer.
 */
struct shmem_inode_info {
        spinlock_t                lock;
        unsigned int                seals;                /* shmem seals */
        unsigned long                flags;
        unsigned long                alloced;        /* data pages alloced to file */
        unsigned long                swapped;        /* subtotal assigned to swap */
        /* dir_offsets shares storage with the shrinklist/swaplist pair */
        union {
            struct offset_ctx        dir_offsets;        /* stable directory offsets */
            struct {
                struct list_head shrinklist;        /* shrinkable hpage inodes */
                struct list_head swaplist;        /* chain of maybes on swap */
            };
        };
        struct timespec64        i_crtime;        /* file creation time */
        struct shared_policy        policy;                /* NUMA memory alloc policy */
        struct simple_xattrs        xattrs;                /* list of xattrs */
        pgoff_t                        fallocend;        /* highest fallocate endindex */
        unsigned int                fsflags;        /* for FS_IOC_[SG]ETFLAGS */
        atomic_t                stop_eviction;        /* hold when working on inode */
#ifdef CONFIG_TMPFS_QUOTA
        /*
         * NOTE(review): sized by MAXQUOTAS, not by the SHMEM_MAXQUOTAS
         * constant defined earlier in this header - confirm that is intended.
         */
        struct dquot __rcu        *i_dquot[MAXQUOTAS];
#endif
        struct inode                vfs_inode;
};

/*
 * Inode flag sets, expressed with the generic FS_*_FL bits.
 * NOTE(review): presumably applied to shmem_inode_info.fsflags (the field
 * "for FS_IOC_[SG]ETFLAGS") - confirm against the users in mm/shmem.c.
 */
#define SHMEM_FL_USER_VISIBLE                FS_FL_USER_VISIBLE
#define SHMEM_FL_USER_MODIFIABLE \
        (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL)
#define SHMEM_FL_INHERITED                (FS_NODUMP_FL | FS_NOATIME_FL)

/*
 * Default quota hard limits, one block limit and one inode limit each for
 * user and group quota.  Embedded in struct shmem_sb_info as qlimits.
 */
struct shmem_quota_limits {
        qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */
        qsize_t usrquota_ihardlimit; /* Default user quota inode hard limit */
        qsize_t grpquota_bhardlimit; /* Default group quota block hard limit */
        qsize_t grpquota_ihardlimit; /* Default group quota inode hard limit */
};

/* Per-superblock shmem state: limits, usage counters and mount options. */
struct shmem_sb_info {
        unsigned long max_blocks;   /* How many blocks are allowed */
        struct percpu_counter used_blocks;  /* How many are allocated */
        unsigned long max_inodes;   /* How many inodes are allowed */
        unsigned long free_ispace;  /* How much ispace left for allocation */
        raw_spinlock_t stat_lock;   /* Serialize shmem_sb_info changes */
        umode_t mode;                    /* Mount mode for root directory */
        unsigned char huge;            /* Whether to try for hugepages */
        kuid_t uid;                    /* Mount uid for root directory */
        kgid_t gid;                    /* Mount gid for root directory */
        bool full_inums;            /* If i_ino should be uint or ino_t */
        bool noswap;                    /* ignores VM reclaim / swap requests */
        ino_t next_ino;                    /* The next per-sb inode number to use */
        ino_t __percpu *ino_batch;  /* The next per-cpu inode number to use */
        struct mempolicy *mpol;     /* default memory policy for mappings */
        spinlock_t shrinklist_lock;   /* Protects shrinklist */
        struct list_head shrinklist;  /* List of shrinkable inodes */
        unsigned long shrinklist_len; /* Length of shrinklist */
        struct shmem_quota_limits qlimits; /* Default quota limits */
};

/*
 * Return the shmem_inode_info that embeds @inode as its vfs_inode member.
 * Only meaningful when @inode really is embedded in a shmem_inode_info; this
 * helper performs no check.
 */
static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
{
        return container_of(inode, struct shmem_inode_info, vfs_inode);
}

/*
 * Functions in mm/shmem.c called directly from elsewhere:
 */
extern const struct fs_parameter_spec shmem_fs_parameters[];
extern void shmem_init(void);
extern int shmem_init_fs_context(struct fs_context *fc);
extern struct file *shmem_file_setup(const char *name,
                                        loff_t size, unsigned long flags);
extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
                                            unsigned long flags);
extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
                const char *name, loff_t size, unsigned long flags);
extern int shmem_zero_setup(struct vm_area_struct *);
extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags);
extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
#ifdef CONFIG_SHMEM
bool shmem_mapping(struct address_space *mapping);
#else
/* !CONFIG_SHMEM stub: no mapping is ever a shmem mapping. */
static inline bool shmem_mapping(struct address_space *mapping)
{
        return false;
}
#endif /* CONFIG_SHMEM */
extern void shmem_unlock_mapping(struct address_space *mapping);
extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
                                        pgoff_t index, gfp_t gfp_mask);
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
int shmem_unuse(unsigned int type);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
                          struct mm_struct *mm, unsigned long vm_flags);
#else
/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always reports false. */
static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
                                          struct mm_struct *mm, unsigned long vm_flags)
{
        return false;
}
#endif

#ifdef CONFIG_SHMEM
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
#else
/* !CONFIG_SHMEM stub: always reports a swap usage of 0. */
static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
        return 0;
}
#endif
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
                                                pgoff_t start, pgoff_t end);

/*
 * Flag allocation requirements to shmem_get_folio: passed as its sgp
 * argument to say whether it may go beyond i_size and whether it may
 * allocate.
 */
enum sgp_type {
        SGP_READ,        /* don't exceed i_size, don't allocate page */
        SGP_NOALLOC,        /* similar, but fail on hole or use fallocated page */
        SGP_CACHE,        /* don't exceed i_size, may allocate page */
        SGP_WRITE,        /* may exceed i_size, may allocate !Uptodate page */
        SGP_FALLOC,        /* like SGP_WRITE, but make existing page Uptodate */
};

int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
                enum sgp_type sgp);
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
                pgoff_t index, gfp_t gfp);

/*
 * Convenience wrapper around shmem_read_folio_gfp() that uses the mapping's
 * own gfp mask, as returned by mapping_gfp_mask().
 */
static inline struct folio *shmem_read_folio(struct address_space *mapping,
                pgoff_t index)
{
        return shmem_read_folio_gfp(mapping, index, mapping_gfp_mask(mapping));
}

/*
 * Convenience wrapper around shmem_read_mapping_page_gfp() that uses the
 * mapping's own gfp mask, as returned by mapping_gfp_mask().
 */
static inline struct page *shmem_read_mapping_page(
                                struct address_space *mapping, pgoff_t index)
{
        return shmem_read_mapping_page_gfp(mapping, index,
                                        mapping_gfp_mask(mapping));
}

/*
 * Tell whether @file is backed by a shmem mapping.  Returns false when
 * CONFIG_SHMEM is disabled, when @file is NULL or when it has no f_mapping;
 * otherwise returns what shmem_mapping() reports for file->f_mapping.
 */
static inline bool shmem_file(struct file *file)
{
        if (IS_ENABLED(CONFIG_SHMEM) && file && file->f_mapping)
                return shmem_mapping(file->f_mapping);

        return false;
}

/*
 * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages
 * beyond i_size's notion of EOF, which fallocate has committed to reserving:
 * which split_huge_page() must therefore not delete.  This use of a single
 * "fallocend" per inode errs on the side of not deleting a reservation when
 * in doubt: there are plenty of cases when it preserves unreserved pages.
 *
 * Returns the larger of @eof and the inode's recorded fallocend.
 */
static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof)
{
        return max(eof, SHMEM_I(inode)->fallocend);
}

extern bool shmem_charge(struct inode *inode, long pages);
extern void shmem_uncharge(struct inode *inode, long pages);

#ifdef CONFIG_USERFAULTFD
#ifdef CONFIG_SHMEM
extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
                                  struct vm_area_struct *dst_vma,
                                  unsigned long dst_addr,
                                  unsigned long src_addr,
                                  uffd_flags_t flags,
                                  struct folio **foliop);
#else /* !CONFIG_SHMEM */
#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
                               src_addr, flags, foliop) ({ BUG(); 0; })
#endif /* CONFIG_SHMEM */
#endif /* CONFIG_USERFAULTFD */

/*
 * Used space is stored as unsigned 64-bit value in bytes but
 * quota core supports only signed 64-bit values so use that
 * as a limit
 */
#define SHMEM_QUOTA_MAX_SPC_LIMIT 0x7fffffffffffffffLL /* 2^63-1 */
#define SHMEM_QUOTA_MAX_INO_LIMIT 0x7fffffffffffffffLL

#ifdef CONFIG_TMPFS_QUOTA
extern const struct dquot_operations shmem_quota_operations;
extern struct quota_format_type shmem_quota_format;
#endif /* CONFIG_TMPFS_QUOTA */

#endif


































































































































































    1 

























    1 



















    1 






































































































































































































    1 


    1 












































































    1 

























































































    1 































    1 






    1 

    1 

    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"

/*
 * Upper bound on the number of entries accepted by a single
 * IORING_REGISTER_RESTRICTIONS call (checked in io_register_restrictions()).
 */
#define IORING_MAX_RESTRICTIONS        (IORING_RESTRICTION_LAST + \
                                 IORING_REGISTER_LAST + IORING_OP_LAST)

/*
 * Attach an eventfd to @ctx.
 *
 * @ctx:           ring context; the caller holds ctx->uring_lock (asserted via
 *                 lockdep in the rcu_dereference_protected() below).
 * @arg:           user pointer to a single __s32 holding the eventfd descriptor.
 * @eventfd_async: stored verbatim in the new io_ev_fd.
 *
 * Returns 0 on success, -EBUSY if an eventfd is already attached, -EFAULT if
 * the descriptor cannot be read from user space, -ENOMEM on allocation
 * failure, or the error returned by eventfd_ctx_fdget().
 */
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
                               unsigned int eventfd_async)
{
        struct io_ev_fd *ev_fd;
        __s32 __user *fds = arg;
        int fd;

        ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
                                        lockdep_is_held(&ctx->uring_lock));
        if (ev_fd)
                return -EBUSY;

        if (copy_from_user(&fd, fds, sizeof(*fds)))
                return -EFAULT;

        ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
        if (!ev_fd)
                return -ENOMEM;

        ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
        if (IS_ERR(ev_fd->cq_ev_fd)) {
                int ret = PTR_ERR(ev_fd->cq_ev_fd);
                kfree(ev_fd);
                return ret;
        }

        /* Snapshot the current CQ tail under the completion lock. */
        spin_lock(&ctx->completion_lock);
        ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
        spin_unlock(&ctx->completion_lock);

        /*
         * The object comes from kmalloc() and is therefore not zeroed.
         * Initialise every field *before* rcu_assign_pointer() publishes it:
         * once the pointer is visible, RCU readers of ctx->io_ev_fd may
         * inspect ->refs and ->ops and must never observe garbage there.
         */
        ev_fd->eventfd_async = eventfd_async;
        atomic_set(&ev_fd->refs, 1);
        atomic_set(&ev_fd->ops, 0);
        ctx->has_evfd = true;
        rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
        return 0;
}

/*
 * Detach the eventfd currently attached to @ctx, if any.
 *
 * The caller holds ctx->uring_lock. Returns 0 when an eventfd was detached
 * and -ENXIO when none was registered.
 */
int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
        struct io_ev_fd *cur;

        cur = rcu_dereference_protected(ctx->io_ev_fd,
                                        lockdep_is_held(&ctx->uring_lock));
        if (!cur)
                return -ENXIO;

        ctx->has_evfd = false;
        rcu_assign_pointer(ctx->io_ev_fd, NULL);

        /* Queue the RCU callback only if no op bit was set beforehand. */
        if (atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &cur->ops) == 0)
                call_rcu(&cur->rcu, io_eventfd_ops);

        return 0;
}

/*
 * Fill a user-supplied struct io_uring_probe with the opcodes this kernel
 * knows about.
 *
 * @arg must point to a zero-filled probe structure with room for @nr_args
 * ops entries; anything non-zero is rejected with -EINVAL. At most
 * IORING_OP_LAST entries are filled in. Returns 0 on success, -EOVERFLOW if
 * the size computation saturates, -ENOMEM, -EFAULT or -EINVAL otherwise.
 */
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
                           unsigned nr_args)
{
        struct io_uring_probe *probe;
        size_t bytes;
        int op, err;

        bytes = struct_size(probe, ops, nr_args);
        if (bytes == SIZE_MAX)
                return -EOVERFLOW;

        probe = kzalloc(bytes, GFP_KERNEL);
        if (!probe)
                return -ENOMEM;

        if (copy_from_user(probe, arg, bytes)) {
                err = -EFAULT;
                goto free_probe;
        }
        /* The caller must hand in an all-zero structure. */
        if (memchr_inv(probe, 0, bytes)) {
                err = -EINVAL;
                goto free_probe;
        }

        probe->last_op = IORING_OP_LAST - 1;
        if (nr_args > IORING_OP_LAST)
                nr_args = IORING_OP_LAST;

        for (op = 0; op < nr_args; op++) {
                probe->ops[op].op = op;
                if (io_issue_defs[op].not_supported)
                        continue;
                probe->ops[op].flags = IO_URING_OP_SUPPORTED;
        }
        probe->ops_len = op;

        err = copy_to_user(arg, probe, bytes) ? -EFAULT : 0;
free_probe:
        kfree(probe);
        return err;
}

/*
 * Remove personality @id from @ctx and drop the credential reference that
 * was stored for it. Returns 0 on success, -EINVAL if @id was not present.
 */
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
        const struct cred *old = xa_erase(&ctx->personalities, id);

        if (!old)
                return -EINVAL;

        put_cred(old);
        return 0;
}


/*
 * Store a reference to the current task's credentials in @ctx and return the
 * id (0..USHRT_MAX) allocated for it, or a negative errno from
 * xa_alloc_cyclic() on failure.
 */
static int io_register_personality(struct io_ring_ctx *ctx)
{
        const struct cred *creds = get_current_cred();
        u32 id;
        int err;

        err = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
                        XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
        if (err >= 0)
                return id;

        /* Allocation failed: give back the reference taken above. */
        put_cred(creds);
        return err;
}

/*
 * Install the restriction set described by the @nr_args entries at @arg.
 *
 * Only permitted while the ring is still disabled (IORING_SETUP_R_DISABLED)
 * and only once per ring. Any invalid entry wipes ctx->restrictions again
 * and fails the whole call with -EINVAL.
 */
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
                                           void __user *arg, unsigned int nr_args)
{
        struct io_uring_restriction *res;
        size_t size;
        int i;

        /* Restrictions allowed only if rings started disabled */
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EBADFD;

        /* We allow only a single restrictions registration */
        if (ctx->restrictions.registered)
                return -EBUSY;

        if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
                return -EINVAL;

        size = array_size(nr_args, sizeof(*res));
        if (size == SIZE_MAX)
                return -EOVERFLOW;

        res = memdup_user(arg, size);
        if (IS_ERR(res))
                return PTR_ERR(res);

        for (i = 0; i < nr_args; i++) {
                const struct io_uring_restriction *entry = &res[i];

                switch (entry->opcode) {
                case IORING_RESTRICTION_REGISTER_OP:
                        if (entry->register_op >= IORING_REGISTER_LAST)
                                goto invalid;
                        __set_bit(entry->register_op,
                                  ctx->restrictions.register_op);
                        break;
                case IORING_RESTRICTION_SQE_OP:
                        if (entry->sqe_op >= IORING_OP_LAST)
                                goto invalid;
                        __set_bit(entry->sqe_op, ctx->restrictions.sqe_op);
                        break;
                case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
                        ctx->restrictions.sqe_flags_allowed = entry->sqe_flags;
                        break;
                case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
                        ctx->restrictions.sqe_flags_required = entry->sqe_flags;
                        break;
                default:
                        goto invalid;
                }
        }

        ctx->restrictions.registered = true;
        kfree(res);
        return 0;

invalid:
        /* Reset all restrictions if an error happened */
        memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
        kfree(res);
        return -EINVAL;
}

/*
 * Enable a ring that was created with IORING_SETUP_R_DISABLED.
 *
 * Returns -EBADFD if the ring is not currently disabled, 0 otherwise.
 */
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
                return -EBADFD;

        /* For single-issuer rings the enabling task becomes the submitter. */
        if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
                WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
                /*
                 * Lazy activation attempts would fail if it was polled before
                 * submitter_task is set.
                 */
                if (wq_has_sleeper(&ctx->poll_wq))
                        io_activate_pollwq(ctx);
        }

        /* Restrictions registered while disabled take effect from now on. */
        if (ctx->restrictions.registered)
                ctx->restricted = 1;

        ctx->flags &= ~IORING_SETUP_R_DISABLED;
        /* Wake anything sleeping on the SQ data wait queue, if there is one. */
        if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
        return 0;
}

/*
 * Apply @new_mask as the io-wq CPU affinity for @ctx.
 *
 * For SQPOLL rings the request is routed through
 * io_sqpoll_wq_cpu_affinity() with ctx->uring_lock temporarily dropped;
 * otherwise the current task's io_uring context is updated directly.
 * Returns whatever the selected helper returns.
 */
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
                                         cpumask_var_t new_mask)
{
        int ret;

        if (ctx->flags & IORING_SETUP_SQPOLL) {
                mutex_unlock(&ctx->uring_lock);
                ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
                mutex_lock(&ctx->uring_lock);
                return ret;
        }

        return io_wq_cpu_affinity(current->io_uring, new_mask);
}

/*
 * Copy a CPU mask of @len bytes from user address @arg and apply it as the
 * io-wq affinity of @ctx.
 *
 * Returns -ENOMEM if the temporary mask cannot be allocated, -EFAULT if the
 * user copy fails, otherwise the result of __io_register_iowq_aff().
 */
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
                                       void __user *arg, unsigned len)
{
        cpumask_var_t new_mask;
        int ret;

        if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
                return -ENOMEM;

        /* Start from an empty mask and never read more than a full cpumask. */
        cpumask_clear(new_mask);
        if (len > cpumask_size())
                len = cpumask_size();

#ifdef CONFIG_COMPAT
        /* Compat callers pass the bitmap as compat_ulong_t words. */
        if (in_compat_syscall())
                ret = compat_get_bitmap(cpumask_bits(new_mask),
                                        (const compat_ulong_t __user *)arg,
                                        len * 8 /* CHAR_BIT */);
        else
#endif
                ret = copy_from_user(new_mask, arg, len);

        if (ret) {
                free_cpumask_var(new_mask);
                return -EFAULT;
        }

        ret = __io_register_iowq_aff(ctx, new_mask);
        free_cpumask_var(new_mask);
        return ret;
}

/*
 * Drop a previously registered io-wq affinity by passing a NULL mask to
 * __io_register_iowq_aff().
 */
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
        return __io_register_iowq_aff(ctx, NULL);
}

/*
 * Update the per-ring io-wq worker limits.
 *
 * @arg points to two __u32 values. A zero entry leaves the corresponding
 * ctx->iowq_limits slot untouched; values above INT_MAX are rejected with
 * -EINVAL. The contents of new_count after the io_wq_max_workers() call (or
 * zeros when no io_wq is available) are copied back to @arg.
 *
 * Called and returns with ctx->uring_lock held; the lock is dropped and
 * re-taken internally on the SQPOLL path.
 */
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
                                               void __user *arg)
        __must_hold(&ctx->uring_lock)
{
        struct io_tctx_node *node;
        struct io_uring_task *tctx = NULL;
        struct io_sq_data *sqd = NULL;
        __u32 new_count[2];
        int i, ret;

        if (copy_from_user(new_count, arg, sizeof(new_count)))
                return -EFAULT;
        for (i = 0; i < ARRAY_SIZE(new_count); i++)
                if (new_count[i] > INT_MAX)
                        return -EINVAL;

        if (ctx->flags & IORING_SETUP_SQPOLL) {
                sqd = ctx->sq_data;
                if (sqd) {
                        /*
                         * Observe the correct sqd->lock -> ctx->uring_lock
                         * ordering. Fine to drop uring_lock here, we hold
                         * a ref to the ctx.
                         */
                        refcount_inc(&sqd->refs);
                        mutex_unlock(&ctx->uring_lock);
                        mutex_lock(&sqd->lock);
                        mutex_lock(&ctx->uring_lock);
                        if (sqd->thread)
                                tctx = sqd->thread->io_uring;
                }
        } else {
                tctx = current->io_uring;
        }

        BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

        /* Record the non-zero limits on the ring itself. */
        for (i = 0; i < ARRAY_SIZE(new_count); i++)
                if (new_count[i])
                        ctx->iowq_limits[i] = new_count[i];
        ctx->iowq_limits_set = true;

        if (tctx && tctx->io_wq) {
                ret = io_wq_max_workers(tctx->io_wq, new_count);
                if (ret)
                        goto err;
        } else {
                /* No io_wq to query: report zeros back to user space. */
                memset(new_count, 0, sizeof(new_count));
        }

        if (sqd) {
                /* Release in the reverse of the acquisition order above. */
                mutex_unlock(&ctx->uring_lock);
                mutex_unlock(&sqd->lock);
                io_put_sq_data(sqd);
                mutex_lock(&ctx->uring_lock);
        }

        if (copy_to_user(arg, new_count, sizeof(new_count)))
                return -EFAULT;

        /* that's it for SQPOLL, only the SQPOLL task creates requests */
        if (sqd)
                return 0;

        /* now propagate the restriction to all registered users */
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                tctx = node->task->io_uring;
                if (WARN_ON_ONCE(!tctx->io_wq))
                        continue;

                for (i = 0; i < ARRAY_SIZE(new_count); i++)
                        new_count[i] = ctx->iowq_limits[i];
                /* ignore errors, it always returns zero anyway */
                (void)io_wq_max_workers(tctx->io_wq, new_count);
        }
        return 0;
err:
        /* Same unlock sequence as the success path before returning. */
        if (sqd) {
                mutex_unlock(&ctx->uring_lock);
                mutex_unlock(&sqd->lock);
                io_put_sq_data(sqd);
                mutex_lock(&ctx->uring_lock);
        }
        return ret;
}

/*
 * Dispatch one io_uring_register() opcode for @ctx.
 *
 * Called with ctx->uring_lock held. Before dispatching it rejects rings whose
 * refs are dying (-ENXIO), callers other than ctx->submitter_task when one is
 * set (-EEXIST), and opcodes not whitelisted on a restricted ring (-EACCES).
 * Each case validates the @arg / @nr_args combination for its opcode
 * and returns -EINVAL (or -EFAULT for the legacy buffer/file registration)
 * when it does not match; unknown opcodes return -EINVAL.
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                               void __user *arg, unsigned nr_args)
        __releases(ctx->uring_lock)
        __acquires(ctx->uring_lock)
{
        int ret;

        /*
         * We don't quiesce the refs for register anymore and so it can't be
         * dying as we're holding a file ref here.
         */
        if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
                return -ENXIO;

        if (ctx->submitter_task && ctx->submitter_task != current)
                return -EEXIST;

        if (ctx->restricted) {
                /* Clamp under speculation before indexing the bitmap. */
                opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
                if (!test_bit(opcode, ctx->restrictions.register_op))
                        return -EACCES;
        }

        switch (opcode) {
        case IORING_REGISTER_BUFFERS:
                ret = -EFAULT;
                if (!arg)
                        break;
                ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
                break;
        case IORING_UNREGISTER_BUFFERS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_buffers_unregister(ctx);
                break;
        case IORING_REGISTER_FILES:
                ret = -EFAULT;
                if (!arg)
                        break;
                ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
                break;
        case IORING_UNREGISTER_FILES:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_files_unregister(ctx);
                break;
        case IORING_REGISTER_FILES_UPDATE:
                ret = io_register_files_update(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_EVENTFD:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_eventfd_register(ctx, arg, 0);
                break;
        case IORING_REGISTER_EVENTFD_ASYNC:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_eventfd_register(ctx, arg, 1);
                break;
        case IORING_UNREGISTER_EVENTFD:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_eventfd_unregister(ctx);
                break;
        case IORING_REGISTER_PROBE:
                ret = -EINVAL;
                if (!arg || nr_args > 256)
                        break;
                ret = io_probe(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_personality(ctx);
                break;
        case IORING_UNREGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg)
                        break;
                /* For this opcode nr_args carries the personality id. */
                ret = io_unregister_personality(ctx, nr_args);
                break;
        case IORING_REGISTER_ENABLE_RINGS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_enable_rings(ctx);
                break;
        case IORING_REGISTER_RESTRICTIONS:
                ret = io_register_restrictions(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_FILES2:
                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
                break;
        case IORING_REGISTER_FILES_UPDATE2:
                ret = io_register_rsrc_update(ctx, arg, nr_args,
                                              IORING_RSRC_FILE);
                break;
        case IORING_REGISTER_BUFFERS2:
                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
                break;
        case IORING_REGISTER_BUFFERS_UPDATE:
                ret = io_register_rsrc_update(ctx, arg, nr_args,
                                              IORING_RSRC_BUFFER);
                break;
        case IORING_REGISTER_IOWQ_AFF:
                ret = -EINVAL;
                if (!arg || !nr_args)
                        break;
                /* For this opcode nr_args is the mask length in bytes. */
                ret = io_register_iowq_aff(ctx, arg, nr_args);
                break;
        case IORING_UNREGISTER_IOWQ_AFF:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_unregister_iowq_aff(ctx);
                break;
        case IORING_REGISTER_IOWQ_MAX_WORKERS:
                ret = -EINVAL;
                if (!arg || nr_args != 2)
                        break;
                ret = io_register_iowq_max_workers(ctx, arg);
                break;
        case IORING_REGISTER_RING_FDS:
                ret = io_ringfd_register(ctx, arg, nr_args);
                break;
        case IORING_UNREGISTER_RING_FDS:
                ret = io_ringfd_unregister(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_PBUF_RING:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_pbuf_ring(ctx, arg);
                break;
        case IORING_UNREGISTER_PBUF_RING:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_unregister_pbuf_ring(ctx, arg);
                break;
        case IORING_REGISTER_SYNC_CANCEL:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_sync_cancel(ctx, arg);
                break;
        case IORING_REGISTER_FILE_ALLOC_RANGE:
                ret = -EINVAL;
                if (!arg || nr_args)
                        break;
                ret = io_register_file_alloc_range(ctx, arg);
                break;
        case IORING_REGISTER_PBUF_STATUS:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_pbuf_status(ctx, arg);
                break;
        case IORING_REGISTER_NAPI:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
                        break;
                ret = io_register_napi(ctx, arg);
                break;
        case IORING_UNREGISTER_NAPI:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_unregister_napi(ctx, arg);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

/*
 * io_uring_register() system call entry point.
 *
 * @fd is either a regular file descriptor or, when
 * IORING_REGISTER_USE_REGISTERED_RING is set in @opcode, an index into the
 * calling task's registered-ring table. The resolved ring's uring_lock is
 * held around the call to __io_uring_register().
 *
 * Returns the result of __io_uring_register(), or -EINVAL for an out-of-range
 * opcode or ring index, -EBADF when no file is found, and -EOPNOTSUPP when
 * @fd does not refer to an io_uring file.
 */
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
                void __user *, arg, unsigned int, nr_args)
{
        struct io_ring_ctx *ctx;
        long ret = -EBADF;
        struct file *file;
        bool use_registered_ring;

        /* Split the modifier flag off the opcode proper. */
        use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
        opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

        if (opcode >= IORING_REGISTER_LAST)
                return -EINVAL;

        if (use_registered_ring) {
                /*
                 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
                 * need only dereference our task private array to find it.
                 */
                struct io_uring_task *tctx = current->io_uring;

                if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
                        return -EINVAL;
                fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
                file = tctx->registered_rings[fd];
                if (unlikely(!file))
                        return -EBADF;
        } else {
                file = fget(fd);
                if (unlikely(!file))
                        return -EBADF;
                ret = -EOPNOTSUPP;
                if (!io_is_uring_fops(file))
                        goto out_fput;
        }

        ctx = file->private_data;

        mutex_lock(&ctx->uring_lock);
        ret = __io_uring_register(ctx, opcode, arg, nr_args);
        mutex_unlock(&ctx->uring_lock);
        trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
        /* Only the fget() path took a file reference that must be dropped. */
        if (!use_registered_ring)
                fput(file);
        return ret;
}





























































    4 



















    2 







    5 













    3 

    2 

    2 



    2 
    2 




    4 

    4 





    5 










































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
 *
 * Provides a framework for enqueueing and running callbacks from hardirq
 * context. The enqueueing is NMI-safe.
 */

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
#include <linux/sched.h>
#include <linux/tick.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/smpboot.h>
#include <asm/processor.h>
#include <linux/kasan.h>

#include <trace/events/ipi.h>

/* Per-CPU list of work that is not queued on lazy_list (see __irq_work_queue_local()). */
static DEFINE_PER_CPU(struct llist_head, raised_list);
/* Per-CPU list of IRQ_WORK_LAZY work and, on PREEMPT_RT, of non-HARD work. */
static DEFINE_PER_CPU(struct llist_head, lazy_list);
/* Per-CPU thread slot filled in through irqwork_threads.store. */
static DEFINE_PER_CPU(struct task_struct *, irq_workd);

/*
 * Wake this CPU's irq_workd thread, provided one is recorded and the local
 * lazy list has entries.
 */
static void wake_irq_workd(void)
{
        struct task_struct *worker = __this_cpu_read(irq_workd);

        if (llist_empty(this_cpu_ptr(&lazy_list)))
                return;

        if (worker)
                wake_up_process(worker);
}

#ifdef CONFIG_SMP
/* irq_work callback that simply wakes the local irq_workd thread. */
static void irq_work_wake(struct irq_work *entry)
{
        wake_irq_workd();
}

/*
 * Per-CPU HARD irq_work that irq_work_queue_on() queues on a remote CPU to
 * wake that CPU's irq_workd thread.
 */
static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) =
        IRQ_WORK_INIT_HARD(irq_work_wake);
#endif

/*
 * smpboot thread_should_run callback: non-zero when this CPU's lazy list has
 * entries. @cpu is unused; the current CPU's list is inspected.
 */
static int irq_workd_should_run(unsigned int cpu)
{
        return !llist_empty(this_cpu_ptr(&lazy_list));
}

/*
 * Claim the entry so that no one else will poke at it.
 */
static bool irq_work_claim(struct irq_work *work)
{
        int prev;

        /* Atomically mark the work claimed and fetch the previous flags. */
        prev = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);

        /*
         * A work item that was already pending needs no new IPI. The
         * smp_mb() in irq_work_single() pairs with this and guarantees that
         * everything done before the claim is visible to the callback.
         */
        return !(prev & IRQ_WORK_PENDING);
}

/*
 * Weak default implementation that does nothing; an architecture may provide
 * its own definition to override it.
 */
void __weak arch_irq_work_raise(void)
{
        /*
         * Lame architectures will get the timer tick callback
         */
}

/*
 * Emit the ipi_send_cpu tracepoint (when enabled and the architecture has an
 * irq_work interrupt) and then call arch_irq_work_raise().
 */
static __always_inline void irq_work_raise(struct irq_work *work)
{
        if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt())
                trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);

        arch_irq_work_raise();
}

/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
        int flags = atomic_read(&work->node.a_flags);
        bool lazy = !!(flags & IRQ_WORK_LAZY);
        /* On PREEMPT_RT everything not marked HARD also goes to the lazy list */
        bool rt_lazy = !lazy && IS_ENABLED(CONFIG_PREEMPT_RT) &&
                       !(flags & IRQ_WORK_HARD_IRQ);
        struct llist_head *target;

        target = (lazy || rt_lazy) ? this_cpu_ptr(&lazy_list)
                                   : this_cpu_ptr(&raised_list);

        /* Nothing more to do unless the list was empty before this add */
        if (!llist_add(&work->node.llist, target))
                return;

        /* Truly lazy work waits for the next tick, unless the tick is stopped */
        if (!lazy || tick_nohz_tick_stopped())
                irq_work_raise(work);
}

/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
{
        /* A work item that is already pending is not queued again */
        bool claimed = irq_work_claim(work);

        if (claimed) {
                /* Stay on this CPU while queueing and raising */
                preempt_disable();
                __irq_work_queue_local(work);
                preempt_enable();
        }

        return claimed;
}
EXPORT_SYMBOL_GPL(irq_work_queue);

/*
 * Enqueue the irq_work @work on @cpu unless it's already pending
 * somewhere.
 *
 * Can be re-enqueued while the callback is still in progress.
 */
/*
 * Returns false if @work was already pending, true otherwise. Without
 * CONFIG_SMP this is just irq_work_queue().
 */
bool irq_work_queue_on(struct irq_work *work, int cpu)
{
#ifndef CONFIG_SMP
        return irq_work_queue(work);

#else /* CONFIG_SMP: */
        /* All work should have been flushed before going offline */
        WARN_ON_ONCE(cpu_is_offline(cpu));

        /* Only queue if not already pending */
        if (!irq_work_claim(work))
                return false;

        kasan_record_aux_stack_noalloc(work);

        preempt_disable();
        if (cpu != smp_processor_id()) {
                /* Arch remote IPI send/receive backend aren't NMI safe */
                WARN_ON_ONCE(in_nmi());

                /*
                 * On PREEMPT_RT the items which are not marked as
                 * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work
                 * item is used on the remote CPU to wake the thread.
                 */
                if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
                    !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) {

                        /* Remote lazy list was not empty before: nothing more to send. */
                        if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu)))
                                goto out;

                        /* From here on @work refers to the remote CPU's wakeup item. */
                        work = &per_cpu(irq_work_wakeup, cpu);
                        if (!irq_work_claim(work))
                                goto out;
                }

                __smp_call_single_queue(cpu, &work->node.llist);
        } else {
                __irq_work_queue_local(work);
        }
out:
        preempt_enable();

        return true;
#endif /* CONFIG_SMP */
}

/*
 * Report whether this CPU still has irq_work to process.
 *
 * Entries on the raised list only count when the architecture has no
 * irq_work interrupt; entries on the lazy list always count.
 */
bool irq_work_needs_cpu(void)
{
        struct llist_head *raised_head = this_cpu_ptr(&raised_list);
        struct llist_head *lazy_head = this_cpu_ptr(&lazy_list);

        if ((llist_empty(raised_head) || arch_irq_work_has_interrupt()) &&
            llist_empty(lazy_head))
                return false;

        /* All work should have been flushed before going offline */
        WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));

        return true;
}

/*
 * Run a single irq_work item: clear its PENDING bit, invoke the callback,
 * then try to clear BUSY and wake a waiter in irq_work_sync() where needed.
 * @arg is the struct irq_work to run.
 */
void irq_work_single(void *arg)
{
        struct irq_work *work = arg;
        int flags;

        /*
         * Clear the PENDING bit, after this point the @work can be re-used.
         * The PENDING bit acts as a lock, and we own it, so we can clear it
         * without atomic ops.
         */
        flags = atomic_read(&work->node.a_flags);
        flags &= ~IRQ_WORK_PENDING;
        atomic_set(&work->node.a_flags, flags);

        /*
         * See irq_work_claim().
         */
        smp_mb();

        lockdep_irq_work_enter(flags);
        work->func(work);
        lockdep_irq_work_exit(flags);

        /*
         * Clear the BUSY bit, if set, and return to the free state if no-one
         * else claimed it meanwhile.
         */
        (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);

        /* Same condition under which irq_work_sync() sleeps on ->irqwait. */
        if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
            !arch_irq_work_has_interrupt())
                rcuwait_wake_up(&work->irqwait);
}

/*
 * Detach every entry currently on @list and run each one through
 * irq_work_single().
 */
static void irq_work_run_list(struct llist_head *list)
{
        struct irq_work *work, *tmp;
        struct llist_node *llnode;

        /*
         * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed
         * in a per-CPU thread in preemptible context. Only the items which are
         * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context.
         */
        BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT));

        if (llist_empty(list))
                return;

        /* Take the whole list at once, then walk the detached chain. */
        llnode = llist_del_all(list);
        llist_for_each_entry_safe(work, tmp, llnode, node.llist)
                irq_work_single(work);
}

/*
 * hotplug calls this through:
 *  hotplug_cfd() -> flush_smp_call_function_queue()
 */
void irq_work_run(void)
{
        irq_work_run_list(this_cpu_ptr(&raised_list));

        /* On PREEMPT_RT the lazy list is left to the irq_workd thread */
        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                wake_irq_workd();
                return;
        }

        irq_work_run_list(this_cpu_ptr(&lazy_list));
}
EXPORT_SYMBOL_GPL(irq_work_run);

/*
 * Process this CPU's irq_work lists. The raised list is only run here when
 * the architecture has no irq_work interrupt; the lazy list is run directly,
 * or handed to the irq_workd thread on PREEMPT_RT.
 */
void irq_work_tick(void)
{
        struct llist_head *pending = this_cpu_ptr(&raised_list);

        if (!llist_empty(pending)) {
                if (!arch_irq_work_has_interrupt())
                        irq_work_run_list(pending);
        }

        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                wake_irq_workd();
                return;
        }

        irq_work_run_list(this_cpu_ptr(&lazy_list));
}

/*
 * Synchronize against the irq_work @work: wait until it is no longer busy.
 *
 * Must be called with interrupts enabled from a context that may sleep.
 */
void irq_work_sync(struct irq_work *work)
{
        lockdep_assert_irqs_enabled();
        might_sleep();

        /*
         * Sleep on ->irqwait in exactly the cases where irq_work_single()
         * issues the matching rcuwait_wake_up(); otherwise spin.
         */
        if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
            !arch_irq_work_has_interrupt()) {
                rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work),
                                   TASK_UNINTERRUPTIBLE);
                return;
        }

        while (irq_work_is_busy(work))
                cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);

/* smpboot thread_fn callback: run this CPU's lazy list. @cpu is unused. */
static void run_irq_workd(unsigned int cpu)
{
        irq_work_run_list(this_cpu_ptr(&lazy_list));
}

/* smpboot setup callback: calls sched_set_fifo_low() on the thread itself. */
static void irq_workd_setup(unsigned int cpu)
{
        sched_set_fifo_low(current);
}

/*
 * Descriptor for the per-CPU "irq_work/%u" threads; registered from
 * irq_work_init_threads() only when CONFIG_PREEMPT_RT is enabled.
 */
static struct smp_hotplug_thread irqwork_threads = {
        .store                  = &irq_workd,
        .setup                        = irq_workd_setup,
        .thread_should_run      = irq_workd_should_run,
        .thread_fn              = run_irq_workd,
        .thread_comm            = "irq_work/%u",
};

/*
 * Early initcall: on PREEMPT_RT register the per-CPU irq_work threads and
 * BUG if that fails. Always returns 0.
 */
static __init int irq_work_init_threads(void)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                return 0;

        BUG_ON(smpboot_register_percpu_thread(&irqwork_threads));
        return 0;
}
early_initcall(irq_work_init_threads);




















































































































    5 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NAMEI_H
#define _LINUX_NAMEI_H

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/path.h>
#include <linux/fcntl.h>
#include <linux/errno.h>

enum { MAX_NESTED_LINKS = 8 };

#define MAXSYMLINKS 40

/*
 * Type of the last component on LOOKUP_PARENT
 */
enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};

/* pathwalk mode */
#define LOOKUP_FOLLOW                0x0001        /* follow links at the end */
#define LOOKUP_DIRECTORY        0x0002        /* require a directory */
#define LOOKUP_AUTOMOUNT        0x0004  /* force terminal automount */
#define LOOKUP_EMPTY                0x4000        /* accept empty path [user_... only] */
#define LOOKUP_DOWN                0x8000        /* follow mounts in the starting point */
#define LOOKUP_MOUNTPOINT        0x0080        /* follow mounts in the end */

#define LOOKUP_REVAL                0x0020        /* tell ->d_revalidate() to trust no cache */
#define LOOKUP_RCU                0x0040        /* RCU pathwalk mode; semi-internal */

/* These tell filesystem methods that we are dealing with the final component... */
#define LOOKUP_OPEN                0x0100        /* ... in open */
#define LOOKUP_CREATE                0x0200        /* ... in object creation */
#define LOOKUP_EXCL                0x0400        /* ... in exclusive creation */
#define LOOKUP_RENAME_TARGET        0x0800        /* ... in destination of rename() */

/* internal use only */
#define LOOKUP_PARENT                0x0010

/* Scoping flags for lookup. */
#define LOOKUP_NO_SYMLINKS        0x010000 /* No symlink crossing. */
#define LOOKUP_NO_MAGICLINKS        0x020000 /* No nd_jump_link() crossing. */
#define LOOKUP_NO_XDEV                0x040000 /* No mountpoint crossing. */
#define LOOKUP_BENEATH                0x080000 /* No escaping from starting point. */
#define LOOKUP_IN_ROOT                0x100000 /* Treat dirfd as fs root. */
#define LOOKUP_CACHED                0x200000 /* Only do cached lookup */
#define LOOKUP_LINKAT_EMPTY        0x400000 /* Linkat request with empty path. */
/* LOOKUP_* flags which do scope-related checks based on the dirfd. */
#define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)

extern int path_pts(struct path *path);

extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);

/*
 * Convenience wrapper around user_path_at_empty() for callers that do not
 * need the @empty result: passes NULL for it and returns the callee's
 * return value unchanged.
 */
static inline int user_path_at(int dfd, const char __user *name, unsigned flags,
                 struct path *path)
{
        return user_path_at_empty(dfd, name, flags, path, NULL);
}

struct dentry *lookup_one_qstr_excl(const struct qstr *name,
                                    struct dentry *base,
                                    unsigned int flags);
extern int kern_path(const char *, unsigned, struct path *);

extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int);
extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int);
extern void done_path_create(struct path *, struct dentry *);
extern struct dentry *kern_path_locked(const char *, struct path *);
extern struct dentry *user_path_locked_at(int , const char __user *, struct path *);
int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
                           struct path *parent, struct qstr *last, int *type,
                           const struct path *root);
int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *,
                    unsigned int, struct path *);

extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int);
struct dentry *lookup_one(struct mnt_idmap *, const char *, struct dentry *, int);
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
                                   const char *name, struct dentry *base,
                                   int len);
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
                                            const char *name,
                                            struct dentry *base, int len);

extern int follow_down_one(struct path *);
extern int follow_down(struct path *path, unsigned int flags);
extern int follow_up(struct path *);

extern struct dentry *lock_rename(struct dentry *, struct dentry *);
extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
extern void unlock_rename(struct dentry *, struct dentry *);

/**
 * mode_strip_umask - handle vfs umask stripping
 * @dir:        parent directory of the new inode
 * @mode:        mode of the new inode to be created in @dir
 *
 * Whether the VFS strips the umask usually depends on POSIX ACL support in
 * the filesystem.  Without POSIX ACL support the umask is stripped right
 * here.  With POSIX ACL support the stripping is left to the filesystem,
 * which does it when it calls posix_acl_create().
 *
 * Filesystems such as NFSv4 want the VFS to leave the umask alone although
 * they do not support POSIX ACLs.  They can set SB_I_NOUMASK to get that
 * behaviour without claiming POSIX ACL support.
 *
 * Returns: mode
 */
static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode)
{
        /* Stripping is deferred to, or declined by, the filesystem. */
        if (IS_POSIXACL(dir) || (dir->i_sb->s_iflags & SB_I_NOUMASK))
                return mode;

        return mode & ~current_umask();
}

extern int __must_check nd_jump_link(const struct path *path);

/*
 * NUL-terminate the buffer @name at offset @len, clamped to @maxlen.
 */
static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
{
        char *str = name;
        size_t end = min(len, maxlen);

        str[end] = '\0';
}

/**
 * retry_estale - determine whether the caller should retry an operation
 * @error: the error that would currently be returned
 * @flags: flags being used for next lookup attempt
 *
 * A retry is worthwhile only when the error is -ESTALE and "flags" does
 * not have LOOKUP_REVAL set yet.
 *
 * Returns true if the caller should try the operation again.
 */
static inline bool
retry_estale(const long error, const unsigned int flags)
{
        const bool stale = (error == -ESTALE);
        const bool revalidating = (flags & LOOKUP_REVAL) != 0;

        return unlikely(stale && !revalidating);
}

#endif /* _LINUX_NAMEI_H */

































































































































    9 








   10 













   11 











    2 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
// SPDX-License-Identifier: GPL-2.0
/*
 * Functions related to generic timeout handling of requests.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/fault-inject.h>

#include "blk.h"
#include "blk-mq.h"

#ifdef CONFIG_FAIL_IO_TIMEOUT

static DECLARE_FAULT_ATTR(fail_io_timeout);

/*
 * Handler for the "fail_io_timeout=" boot parameter: passes the option
 * string @str to setup_fault_attr() for the fail_io_timeout attribute and
 * returns its result.
 */
static int __init setup_fail_io_timeout(char *str)
{
        return setup_fault_attr(&fail_io_timeout, str);
}
__setup("fail_io_timeout=", setup_fail_io_timeout);

/*
 * Ask the fail_io_timeout fault attribute whether to inject a failure
 * (should_fail() with size 1).  @q is not consulted here.
 */
bool __blk_should_fake_timeout(struct request_queue *q)
{
        return should_fail(&fail_io_timeout, 1);
}
EXPORT_SYMBOL_GPL(__blk_should_fake_timeout);

/*
 * Create the "fail_io_timeout" debugfs entry for the fault attribute (with
 * a NULL parent).  Returns 0 on success or the error encoded in the
 * returned dentry pointer.
 */
static int __init fail_io_timeout_debugfs(void)
{
        struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout",
                                                NULL, &fail_io_timeout);

        return PTR_ERR_OR_ZERO(dir);
}

late_initcall(fail_io_timeout_debugfs);

ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr,
                          char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);
        int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags);

        return sprintf(buf, "%d\n", set != 0);
}

/*
 * sysfs store: parse a leading decimal number from @buf and set
 * QUEUE_FLAG_FAIL_IO on the disk's queue when it is non-zero, clear the
 * flag otherwise.  An empty write changes nothing.  Always returns @count.
 */
ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count)
{
        struct gendisk *disk = dev_to_disk(dev);
        int val;

        if (!count)
                return count;

        /* The end-of-number position is not needed, so no endp is passed. */
        val = simple_strtoul(buf, NULL, 10);
        if (val)
                blk_queue_flag_set(QUEUE_FLAG_FAIL_IO, disk->queue);
        else
                blk_queue_flag_clear(QUEUE_FLAG_FAIL_IO, disk->queue);

        return count;
}

#endif /* CONFIG_FAIL_IO_TIMEOUT */

/**
 * blk_abort_request - Request recovery for the specified command
 * @req:        pointer to the request of interest
 *
 * This function requests that the block layer start recovery for the
 * request: it sets the request's deadline to the current jiffies and
 * schedules the queue's timeout work so that the timeout scan runs.
 * LLDDs who implement their own error recovery MAY ignore the timeout
 * event if they generated blk_abort_request.
 */
void blk_abort_request(struct request *req)
{
        /*
         * All we need to ensure is that timeout scan takes place
         * immediately and that scan sees the new timeout value.
         * No need for fancy synchronizations.
         */
        WRITE_ONCE(req->deadline, jiffies);
        kblockd_schedule_work(&req->q->timeout_work);
}
EXPORT_SYMBOL_GPL(blk_abort_request);

/* Set once at init time; consumed by blk_round_jiffies(). */
static unsigned long blk_timeout_mask __read_mostly;

/*
 * Initialise blk_timeout_mask to one less than HZ rounded up to a power
 * of two.  Always returns 0.
 */
static int __init blk_timeout_init(void)
{
        blk_timeout_mask = roundup_pow_of_two(HZ) - 1;
        return 0;
}

late_initcall(blk_timeout_init);

/*
 * Just a rough estimate, we don't care about specific values for timeouts.
 *
 * Returns @j advanced by blk_timeout_mask + 1 jiffies.
 * NOTE(review): despite the name, the result is not masked, so it is not
 * aligned to any boundary - confirm this is intended.
 */
static inline unsigned long blk_round_jiffies(unsigned long j)
{
        return (j + blk_timeout_mask) + 1;
}

/*
 * Clamp the expiry @timeout (in jiffies) so that it lies no later than
 * blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT).
 */
unsigned long blk_rq_timeout(unsigned long timeout)
{
        unsigned long latest = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT);

        return time_after(timeout, latest) ? latest : timeout;
}

/**
 * blk_add_timer - Start timeout timer for a single request
 * @req:        request that is about to start running.
 *
 * Notes:
 *    Records the request's deadline and, when needed, (re)arms the
 *    queue-wide timer q->timeout so that it fires no later than that
 *    deadline (after rounding and capping).
 */
void blk_add_timer(struct request *req)
{
        struct request_queue *q = req->q;
        unsigned long expiry;

        /*
         * Some LLDs, like scsi, peek at the timeout to prevent a
         * command from being retried forever.
         */
        if (!req->timeout)
                req->timeout = q->rq_timeout;

        /* A fresh timer starts with the timed-out marker cleared. */
        req->rq_flags &= ~RQF_TIMED_OUT;

        expiry = jiffies + req->timeout;
        WRITE_ONCE(req->deadline, expiry);

        /*
         * If the timer isn't already pending or this timeout is earlier
         * than an existing one, modify the timer. The expiry is passed
         * through blk_round_jiffies() and capped by blk_rq_timeout().
         */
        expiry = blk_rq_timeout(blk_round_jiffies(expiry));

        if (!timer_pending(&q->timeout) ||
            time_before(expiry, q->timeout.expires)) {
                /* Only meaningful when the timer is pending; see below. */
                unsigned long diff = q->timeout.expires - expiry;

                /*
                 * Due to added timer slack to group timers, the timer
                 * will often be a little in front of what we asked for.
                 * So apply some tolerance here too, otherwise we keep
                 * modifying the timer because expires for value X
                 * will be X + something.
                 */
                if (!timer_pending(&q->timeout) || (diff >= HZ / 2))
                        mod_timer(&q->timeout, expiry);
        }

}




































    5 

















    4 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/* SPDX-License-Identifier: GPL-2.0 */
/* Freezer declarations */

#ifndef FREEZER_H_INCLUDED
#define FREEZER_H_INCLUDED

#include <linux/debug_locks.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>

#ifdef CONFIG_FREEZER
DECLARE_STATIC_KEY_FALSE(freezer_active);

extern bool pm_freezing;                /* PM freezing in effect */
extern bool pm_nosig_freezing;                /* PM nosig freezing in effect */

/*
 * Timeout for stopping processes
 */
extern unsigned int freeze_timeout_msecs;

/*
 * Check if a process has been frozen
 */
extern bool frozen(struct task_struct *p);

extern bool freezing_slow_path(struct task_struct *p);

/*
 * Check if there is a request to freeze a process.  The slow path is only
 * consulted while the freezer_active static key is enabled.
 */
static inline bool freezing(struct task_struct *p)
{
        if (!static_branch_unlikely(&freezer_active))
                return false;

        return freezing_slow_path(p);
}

/* Takes and releases task alloc lock using task_lock() */
extern void __thaw_task(struct task_struct *t);

extern bool __refrigerator(bool check_kthr_stop);
extern int freeze_processes(void);
extern int freeze_kernel_threads(void);
extern void thaw_processes(void);
extern void thaw_kernel_threads(void);

/*
 * Enter the refrigerator if a freeze of the current task is requested.
 * May sleep.  Returns false when no freeze is requested, otherwise the
 * result of __refrigerator(false).
 */
static inline bool try_to_freeze(void)
{
        might_sleep();
        if (likely(!freezing(current)))
                return false;
        /* Tasks flagged PF_NOFREEZE skip the held-locks debug check. */
        if (!(current->flags & PF_NOFREEZE))
                debug_check_no_locks_held();
        return __refrigerator(false);
}

extern bool freeze_task(struct task_struct *p);
extern bool set_freezable(void);

#ifdef CONFIG_CGROUP_FREEZER
extern bool cgroup_freezing(struct task_struct *task);
#else /* !CONFIG_CGROUP_FREEZER */
/* Stub for kernels built without CONFIG_CGROUP_FREEZER: never freezing. */
static inline bool cgroup_freezing(struct task_struct *task)
{
        return false;
}
#endif /* !CONFIG_CGROUP_FREEZER */

#else /* !CONFIG_FREEZER */
/*
 * Stubs for kernels built without CONFIG_FREEZER: the queries report
 * false, the freeze requests fail with -ENOSYS and the thaw helpers do
 * nothing.
 */
static inline bool frozen(struct task_struct *p) { return false; }
static inline bool freezing(struct task_struct *p) { return false; }
static inline void __thaw_task(struct task_struct *t) {}

static inline bool __refrigerator(bool check_kthr_stop) { return false; }
static inline int freeze_processes(void) { return -ENOSYS; }
static inline int freeze_kernel_threads(void) { return -ENOSYS; }
static inline void thaw_processes(void) {}
static inline void thaw_kernel_threads(void) {}

static inline bool try_to_freeze(void) { return false; }

/*
 * Stub for kernels built without CONFIG_FREEZER.  It returns bool so that
 * its type matches the CONFIG_FREEZER declaration and callers that use
 * the result build in both configurations; like the try_to_freeze() stub
 * it reports false.
 */
static inline bool set_freezable(void) { return false; }

#endif /* !CONFIG_FREEZER */

#endif        /* FREEZER_H_INCLUDED */






































































    5 



































    1 










    5 





















    1 


    1 








































































































    1 















    1 






















    1 
    1 


    1 



    1 




    1 




    1 







    1 













































    4 











































































































    1 








































































    1 
    2 











































































































    9 


    1 
































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
 */
#ifndef __LINUX_BIO_H
#define __LINUX_BIO_H

#include <linux/mempool.h>
/* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
#include <linux/blk_types.h>
#include <linux/uio.h>

#define BIO_MAX_VECS                256U

struct queue_limits;

/* Clamp a requested segment count to at most BIO_MAX_VECS. */
static inline unsigned int bio_max_segs(unsigned int nr_segs)
{
        if (nr_segs > BIO_MAX_VECS)
                return BIO_MAX_VECS;

        return nr_segs;
}

#define bio_prio(bio)                        (bio)->bi_ioprio
#define bio_set_prio(bio, prio)                ((bio)->bi_ioprio = prio)

#define bio_iter_iovec(bio, iter)                                \
        bvec_iter_bvec((bio)->bi_io_vec, (iter))

#define bio_iter_page(bio, iter)                                \
        bvec_iter_page((bio)->bi_io_vec, (iter))
#define bio_iter_len(bio, iter)                                        \
        bvec_iter_len((bio)->bi_io_vec, (iter))
#define bio_iter_offset(bio, iter)                                \
        bvec_iter_offset((bio)->bi_io_vec, (iter))

#define bio_page(bio)                bio_iter_page((bio), (bio)->bi_iter)
#define bio_offset(bio)                bio_iter_offset((bio), (bio)->bi_iter)
#define bio_iovec(bio)                bio_iter_iovec((bio), (bio)->bi_iter)

#define bvec_iter_sectors(iter)        ((iter).bi_size >> 9)
#define bvec_iter_end_sector(iter) ((iter).bi_sector + bvec_iter_sectors((iter)))

#define bio_sectors(bio)        bvec_iter_sectors((bio)->bi_iter)
#define bio_end_sector(bio)        bvec_iter_end_sector((bio)->bi_iter)

/*
 * Return the data direction, READ or WRITE.
 */
#define bio_data_dir(bio) \
        (op_is_write(bio_op(bio)) ? WRITE : READ)

/*
 * Tell whether @bio carries data.  A NULL bio, a bio with zero bi_size and
 * discard / secure erase / write zeroes bios all count as carrying none.
 */
static inline bool bio_has_data(struct bio *bio)
{
        if (!bio || !bio->bi_iter.bi_size)
                return false;

        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
        case REQ_OP_WRITE_ZEROES:
                return false;
        default:
                return true;
        }
}

/*
 * True for discard, secure erase and write zeroes bios.  The iterator
 * advance helpers only decrement bi_size for these instead of stepping
 * through the bvec array.
 */
static inline bool bio_no_advance_iter(const struct bio *bio)
{
        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
        case REQ_OP_WRITE_ZEROES:
                return true;
        default:
                return false;
        }
}

/*
 * Return the address of the data at the bio's current iterator position,
 * or NULL when the bio has no data (see bio_has_data()).
 */
static inline void *bio_data(struct bio *bio)
{
        if (!bio_has_data(bio))
                return NULL;

        return page_address(bio_page(bio)) + bio_offset(bio);
}

/*
 * Step @iter to the next segment of @bio.  Returns false, without touching
 * @iter, once iter->idx has reached bi_vcnt.
 */
static inline bool bio_next_segment(const struct bio *bio,
                                    struct bvec_iter_all *iter)
{
        if (iter->idx < bio->bi_vcnt) {
                bvec_advance(&bio->bi_io_vec[iter->idx], iter);
                return true;
        }

        return false;
}

/*
 * drivers should _never_ use the all version - the bio may have been split
 * before it got to the driver and the driver won't own all of it
 */
#define bio_for_each_segment_all(bvl, bio, iter) \
        for (bvl = bvec_init_iter_all(&iter); bio_next_segment((bio), &iter); )

/*
 * Advance @iter by @bytes: bi_sector moves forward by bytes >> 9 and the
 * remaining size shrinks, either directly (bios without per-bvec data) or
 * by walking the bvec array.
 */
static inline void bio_advance_iter(const struct bio *bio,
                                    struct bvec_iter *iter, unsigned int bytes)
{
        iter->bi_sector += bytes >> 9;

        if (!bio_no_advance_iter(bio)) {
                bvec_iter_advance(bio->bi_io_vec, iter, bytes);
                /* TODO: It is reasonable to complete bio with error here. */
                return;
        }

        iter->bi_size -= bytes;
}

/*
 * Single-bvec variant of bio_advance_iter().
 * @bytes should be less or equal to bvec[i->bi_idx].bv_len
 */
static inline void bio_advance_iter_single(const struct bio *bio,
                                           struct bvec_iter *iter,
                                           unsigned int bytes)
{
        iter->bi_sector += bytes >> 9;

        if (!bio_no_advance_iter(bio)) {
                bvec_iter_advance_single(bio->bi_io_vec, iter, bytes);
                return;
        }

        iter->bi_size -= bytes;
}

void __bio_advance(struct bio *, unsigned bytes);

/**
 * bio_advance - increment/complete a bio by some number of bytes
 * @bio:        bio to advance
 * @nbytes:        number of bytes to complete
 *
 * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
 * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
 * be updated on the last bvec as well.
 *
 * Completing exactly the remaining bi_size is handled inline by zeroing
 * bi_size; anything else is delegated to __bio_advance().
 *
 * @bio will then represent the remaining, uncompleted portion of the io.
 */
static inline void bio_advance(struct bio *bio, unsigned int nbytes)
{
        if (nbytes != bio->bi_iter.bi_size) {
                __bio_advance(bio, nbytes);
                return;
        }

        bio->bi_iter.bi_size = 0;
}

#define __bio_for_each_segment(bvl, bio, iter, start)                        \
        for (iter = (start);                                                \
             (iter).bi_size &&                                                \
                ((bvl = bio_iter_iovec((bio), (iter))), 1);                \
             bio_advance_iter_single((bio), &(iter), (bvl).bv_len))

#define bio_for_each_segment(bvl, bio, iter)                                \
        __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter)

#define __bio_for_each_bvec(bvl, bio, iter, start)                \
        for (iter = (start);                                                \
             (iter).bi_size &&                                                \
                ((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \
             bio_advance_iter_single((bio), &(iter), (bvl).bv_len))

/* iterate over multi-page bvec */
#define bio_for_each_bvec(bvl, bio, iter)                        \
        __bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter)

/*
 * Iterate over all multi-page bvecs. Drivers shouldn't use this version for the
 * same reasons as bio_for_each_segment_all().
 */
#define bio_for_each_bvec_all(bvl, bio, i)                \
        for (i = 0, bvl = bio_first_bvec_all(bio);        \
             i < (bio)->bi_vcnt; i++, bvl++)

#define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)

static inline unsigned bio_segments(struct bio *bio)
{
        unsigned segs = 0;
        struct bio_vec bv;
        struct bvec_iter iter;

        /*
         * We special case discard/write same/write zeroes, because they
         * interpret bi_size differently:
         */

        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
        case REQ_OP_WRITE_ZEROES:
                return 0;
        default:
                break;
        }

        bio_for_each_segment(bv, bio, iter)
                segs++;

        return segs;
}

/*
 * get a reference to a bio, so it won't disappear. the intended use is
 * something like:
 *
 * bio_get(bio);
 * submit_bio(rw, bio);
 * if (bio->bi_flags ...)
 *        do_something
 * bio_put(bio);
 *
 * without the bio_get(), it could potentially complete I/O before submit_bio
 * returns. and then bio would be freed memory when if (bio->bi_flags ...)
 * runs
 */
static inline void bio_get(struct bio *bio)
{
        /* Set BIO_REFFED before incrementing __bi_cnt. */
        bio->bi_flags |= (1 << BIO_REFFED);
        smp_mb__before_atomic();
        atomic_inc(&bio->__bi_cnt);
}

/*
 * Set the bio's reference count to @count.  For any count other than 1 the
 * bio is first flagged BIO_REFFED, followed by a full memory barrier,
 * before the count is stored.
 */
static inline void bio_cnt_set(struct bio *bio, unsigned int count)
{
        if (count != 1) {
                bio->bi_flags |= (1 << BIO_REFFED);
                smp_mb();
        }
        atomic_set(&bio->__bi_cnt, count);
}

/* Test whether flag number @bit is set in bio->bi_flags. */
static inline bool bio_flagged(struct bio *bio, unsigned int bit)
{
        const unsigned int mask = 1U << bit;

        return (bio->bi_flags & mask) != 0;
}

/* Set flag number @bit in bio->bi_flags (plain, non-atomic update). */
static inline void bio_set_flag(struct bio *bio, unsigned int bit)
{
        const unsigned int mask = 1U << bit;

        bio->bi_flags |= mask;
}

/* Clear flag number @bit in bio->bi_flags (plain, non-atomic update). */
static inline void bio_clear_flag(struct bio *bio, unsigned int bit)
{
        const unsigned int mask = 1U << bit;

        bio->bi_flags &= ~mask;
}

/*
 * Return the start of the bio's bvec array.  Warns once if the bio is
 * flagged BIO_CLONED.
 */
static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
{
        WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
        return bio->bi_io_vec;
}

static inline struct page *bio_first_page_all(struct bio *bio)
{
        return bio_first_bvec_all(bio)->bv_page;
}

/* Return the folio containing the first page of @bio. */
static inline struct folio *bio_first_folio_all(struct bio *bio)
{
        struct page *first = bio_first_page_all(bio);

        return page_folio(first);
}

/*
 * Return the last used entry (index bi_vcnt - 1) of the bio's bvec array.
 * Warns once if the bio is flagged BIO_CLONED.  No check is made here for
 * bi_vcnt being zero.
 */
static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
{
        WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
        return &bio->bi_io_vec[bio->bi_vcnt - 1];
}

/**
 * struct folio_iter - State for iterating all folios in a bio.
 * @folio: The current folio we're iterating.  NULL after the last folio.
 * @offset: The byte offset within the current folio.
 * @length: The number of bytes in this iteration (will not cross folio
 *        boundary).
 */
struct folio_iter {
        struct folio *folio;
        size_t offset;
        size_t length;
        /* private: for use by the iterator */
        struct folio *_next;        /* folio_next() of the current folio */
        size_t _seg_count;        /* bytes of the current bvec not yet consumed */
        int _i;                        /* index of the current bvec */
};

/*
 * Point @fi at the first folio of bvec number @i of @bio.  When @i is at
 * or beyond bi_vcnt, fi->folio is set to NULL to mark the end of the
 * iteration and the other fields are left untouched.
 */
static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio,
                                   int i)
{
        /* Only computed here; it is dereferenced after the bounds check. */
        struct bio_vec *bvec = bio_first_bvec_all(bio) + i;

        if (unlikely(i >= bio->bi_vcnt)) {
                fi->folio = NULL;
                return;
        }

        fi->folio = page_folio(bvec->bv_page);
        /*
         * bv_offset is relative to bv_page; add the distance of that page
         * from the folio's first page to make the offset folio-relative.
         */
        fi->offset = bvec->bv_offset +
                        PAGE_SIZE * (bvec->bv_page - &fi->folio->page);
        fi->_seg_count = bvec->bv_len;
        /* Never report more than what is left of this folio. */
        fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count);
        fi->_next = folio_next(fi->folio);
        fi->_i = i;
}

/*
 * Advance @fi to the next folio: either the following folio covered by the
 * current bvec, or the first folio of the next bvec once the current bvec
 * has been consumed.
 */
static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio)
{
        fi->_seg_count -= fi->length;

        if (!fi->_seg_count) {
                /* Current bvec is exhausted; move on to the next one. */
                bio_first_folio(fi, bio, fi->_i + 1);
                return;
        }

        fi->folio = fi->_next;
        fi->offset = 0;
        fi->length = min(folio_size(fi->folio), fi->_seg_count);
        fi->_next = folio_next(fi->folio);
}

/**
 * bio_for_each_folio_all - Iterate over each folio in a bio.
 * @fi: struct folio_iter which is updated for each folio.
 * @bio: struct bio to iterate over.
 */
#define bio_for_each_folio_all(fi, bio)                                \
        for (bio_first_folio(&fi, bio, 0); fi.folio; bio_next_folio(&fi, bio))

/*
 * Bit flags stored in bio_integrity_payload.bip_flags; tested with
 * bio_integrity_flagged().
 */
enum bip_flags {
        BIP_BLOCK_INTEGRITY        = 1 << 0, /* block layer owns integrity data */
        BIP_MAPPED_INTEGRITY        = 1 << 1, /* ref tag has been remapped */
        BIP_CTRL_NOCHECK        = 1 << 2, /* disable HBA integrity checking */
        BIP_DISK_NOCHECK        = 1 << 3, /* disable disk integrity checking */
        BIP_IP_CHECKSUM                = 1 << 4, /* IP checksum */
        BIP_INTEGRITY_USER        = 1 << 5, /* Integrity payload is user address */
        BIP_COPY_USER                = 1 << 6, /* Kernel bounce buffer in use */
};

/*
 * bio integrity payload
 */
struct bio_integrity_payload {
        struct bio                *bip_bio;        /* parent bio */

        /* bip_iter.bi_sector holds the seed, see bip_get_seed()/bip_set_seed() */
        struct bvec_iter        bip_iter;

        unsigned short                bip_vcnt;        /* # of integrity bio_vecs */
        unsigned short                bip_max_vcnt;        /* integrity bio_vec slots */
        unsigned short                bip_flags;        /* control flags, see enum bip_flags */

        struct bvec_iter        bio_iter;        /* for rewinding parent bio */

        struct work_struct        bip_work;        /* I/O completion */

        /* NOTE(review): presumably the bio_vec array in use - confirm */
        struct bio_vec                *bip_vec;
        struct bio_vec                bip_inline_vecs[];/* embedded bvec array */
};

#if defined(CONFIG_BLK_DEV_INTEGRITY)

/*
 * Return the integrity payload of @bio, or NULL when REQ_INTEGRITY is not
 * set in bi_opf.
 */
static inline struct bio_integrity_payload *bio_integrity(struct bio *bio)
{
        return (bio->bi_opf & REQ_INTEGRITY) ? bio->bi_integrity : NULL;
}

/*
 * Test @flag in the integrity payload of @bio.  A bio without an integrity
 * payload never has the flag.
 */
static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag)
{
        struct bio_integrity_payload *payload = bio_integrity(bio);

        if (!payload)
                return false;

        return payload->bip_flags & flag;
}

/* Read the seed, which is kept in bip_iter.bi_sector. */
static inline sector_t bip_get_seed(struct bio_integrity_payload *bip)
{
        return bip->bip_iter.bi_sector;
}

/* Store @seed in bip_iter.bi_sector, where bip_get_seed() reads it. */
static inline void bip_set_seed(struct bio_integrity_payload *bip,
                                sector_t seed)
{
        bip->bip_iter.bi_sector = seed;
}

#endif /* CONFIG_BLK_DEV_INTEGRITY */

void bio_trim(struct bio *bio, sector_t offset, sector_t size);
extern struct bio *bio_split(struct bio *bio, int sectors,
                             gfp_t gfp, struct bio_set *bs);
struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
                unsigned *segs, struct bio_set *bs, unsigned max_bytes);

/**
 * bio_next_split - get next @sectors from a bio, splitting if necessary
 * @bio:        bio to split
 * @sectors:        number of sectors to split from the front of @bio
 * @gfp:        gfp mask
 * @bs:                bio set to allocate from
 *
 * Return: a bio representing the next @sectors of @bio.  When @bio holds
 * no more than @sectors sectors, no split is done and the original bio is
 * returned unchanged.
 */
static inline struct bio *bio_next_split(struct bio *bio, int sectors,
                                         gfp_t gfp, struct bio_set *bs)
{
        if (sectors < bio_sectors(bio))
                return bio_split(bio, sectors, gfp, bs);

        return bio;
}

/*
 * Single-bit flag values (bits 0, 1 and 2), so they can be OR-ed together.
 * NOTE(review): presumably these are the values accepted by the @flags
 * argument of bioset_init() -- confirm against its definition.
 */
enum {
        BIOSET_NEED_BVECS = BIT(0),
        BIOSET_NEED_RESCUER = BIT(1),
        BIOSET_PERCPU_CACHE = BIT(2),
};
/* bio_set setup/teardown and bvec pool setup; defined outside this header. */
extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags);
extern void bioset_exit(struct bio_set *);
extern int biovec_init_pool(mempool_t *pool, int pool_entries);

/*
 * bio allocation and release prototypes. bio_alloc_bioset() is the function
 * the inline bio_alloc() wrapper forwards to.
 */
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
                             blk_opf_t opf, gfp_t gfp_mask,
                             struct bio_set *bs);
struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask);
extern void bio_put(struct bio *);

/* Clone prototypes: one returns a new bio, one fills a caller-supplied bio. */
struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
                gfp_t gfp, struct bio_set *bs);
int bio_init_clone(struct block_device *bdev, struct bio *bio,
                struct bio *bio_src, gfp_t gfp);

/* The bio_set that bio_alloc() allocates from. */
extern struct bio_set fs_bio_set;

/*
 * Allocate a bio from the global fs_bio_set. All arguments are forwarded
 * unchanged to bio_alloc_bioset(), whose result is returned as-is.
 */
static inline struct bio *bio_alloc(struct block_device *bdev,
                unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp_mask)
{
        return bio_alloc_bioset(bdev, nr_vecs, opf, gfp_mask, &fs_bio_set);
}

/* Defined outside this header; also used by the !CONFIG_BLK_CGROUP stub. */
void submit_bio(struct bio *bio);

/* Called by the inline error helpers in this header after they set bi_status. */
extern void bio_endio(struct bio *);

/*
 * Fail @bio with an I/O error: record BLK_STS_IOERR in bi_status, then hand
 * the bio to bio_endio(). The status is written before bio_endio() runs.
 */
static inline void bio_io_error(struct bio *bio)
{
        bio->bi_status = BLK_STS_IOERR;
        bio_endio(bio);
}

/*
 * Fail @bio with BLK_STS_AGAIN: set the BIO_QUIET flag on the bio, record
 * the status, then hand the bio to bio_endio().
 */
static inline void bio_wouldblock_error(struct bio *bio)
{
        bio_set_flag(bio, BIO_QUIET);
        bio->bi_status = BLK_STS_AGAIN;
        bio_endio(bio);
}

/*
 * Number of bvec segments to allocate for the data described by @iter.
 * A bvec-backed @iter needs none (its bvec is reused instead of allocating
 * a new one); any other iterator needs iov_iter_npages(@iter, @max_segs).
 */
static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
{
        return iov_iter_is_bvec(iter) ? 0 : iov_iter_npages(iter, max_segs);
}

/* Forward declaration: only pointers to it are used in the prototypes below. */
struct request_queue;

/* bio setup, reset and chaining prototypes; bodies are not in this header. */
extern int submit_bio_wait(struct bio *bio);
void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
              unsigned short max_vecs, blk_opf_t opf);
extern void bio_uninit(struct bio *);
void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf);
void bio_chain(struct bio *, struct bio *);

/*
 * Page/folio attachment prototypes. bio_add_page() and bio_add_folio() are
 * marked __must_check, so callers may not ignore their result.
 */
int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len,
                              unsigned off);
bool __must_check bio_add_folio(struct bio *bio, struct folio *folio,
                                size_t len, size_t off);
extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
                           unsigned int, unsigned int);
int bio_add_zone_append_page(struct bio *bio, struct page *page,
                             unsigned int len, unsigned int offset);
void __bio_add_page(struct bio *bio, struct page *page,
                unsigned int len, unsigned int off);
void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
                          size_t off);
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter);
/* Called by the inline bio_release_pages() when BIO_PAGE_PINNED is set. */
void __bio_release_pages(struct bio *bio, bool mark_dirty);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);

/* Data copy, page freeing and zero-fill prototypes. */
extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
                               struct bio *src, struct bvec_iter *src_iter);
extern void bio_copy_data(struct bio *dst, struct bio *src);
extern void bio_free_pages(struct bio *bio);
void guard_bio_eod(struct bio *bio);
/* Takes the iterator by value; zero_fill_bio() passes bio->bi_iter. */
void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter);

/*
 * Call zero_fill_bio_iter() on @bio with a copy of the bio's own iterator
 * (bi_iter is passed by value).
 */
static inline void zero_fill_bio(struct bio *bio)
{
        zero_fill_bio_iter(bio, bio->bi_iter);
}

/*
 * Release the pages of @bio via __bio_release_pages(), forwarding
 * @mark_dirty, but only when the bio carries the BIO_PAGE_PINNED flag.
 * Without that flag this is a no-op.
 */
static inline void bio_release_pages(struct bio *bio, bool mark_dirty)
{
        if (!bio_flagged(bio, BIO_PAGE_PINNED))
                return;

        __bio_release_pages(bio, mark_dirty);
}

/*
 * Evaluate disk_devt() on the disk behind @bio's block device
 * (bio->bi_bdev->bd_disk). bi_bdev is dereferenced unconditionally.
 */
#define bio_dev(bio) \
        disk_devt((bio)->bi_bdev->bd_disk)

#ifdef CONFIG_BLK_CGROUP
void bio_associate_blkg(struct bio *bio);
void bio_associate_blkg_from_css(struct bio *bio,
                                 struct cgroup_subsys_state *css);
void bio_clone_blkg_association(struct bio *dst, struct bio *src);
void blkcg_punt_bio_submit(struct bio *bio);
#else        /* CONFIG_BLK_CGROUP */
static inline void bio_associate_blkg(struct bio *bio) { }
static inline void bio_associate_blkg_from_css(struct bio *bio,
                                               struct cgroup_subsys_state *css)
{ }
static inline void bio_clone_blkg_association(struct bio *dst,
                                              struct bio *src) { }
static inline void blkcg_punt_bio_submit(struct bio *bio)
{
        submit_bio(bio);
}
#endif        /* CONFIG_BLK_CGROUP */

/*
 * Point @bio at block device @bdev.
 *
 * BIO_REMAPPED is always cleared. BIO_BPS_THROTTLED is cleared only when
 * the device actually changes, which is why the comparison must run before
 * bi_bdev is overwritten. Finally bio_associate_blkg() is called on the bio
 * (an empty stub when CONFIG_BLK_CGROUP is not set).
 */
static inline void bio_set_dev(struct bio *bio, struct block_device *bdev)
{
        bio_clear_flag(bio, BIO_REMAPPED);
        if (bio->bi_bdev != bdev)
                bio_clear_flag(bio, BIO_BPS_THROTTLED);
        bio->bi_bdev = bdev;
        bio_associate_blkg(bio);
}

/*
 * BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
 *
 * A bio_list anchors a singly-linked list of bios chained through the bi_next
 * member of the bio.  The bio_list also caches the last list member to allow
 * fast access to the tail.
 *
 * An empty list has both members NULL (see bio_list_init()); the helpers
 * only test one of the two, so they must always be kept in sync.
 */
struct bio_list {
        struct bio *head;        /* first bio, NULL when the list is empty */
        struct bio *tail;        /* last bio, NULL when the list is empty */
};

/* Return 1 when @bl has no head bio, 0 otherwise. */
static inline int bio_list_empty(const struct bio_list *bl)
{
        return !bl->head;
}

/* Reset @bl to the empty state: both head and tail become NULL. */
static inline void bio_list_init(struct bio_list *bl)
{
        bl->tail = NULL;
        bl->head = NULL;
}

/*
 * Static initializer for an empty struct bio_list. The initializer is
 * positional, so it relies on the struct having exactly head then tail.
 */
#define BIO_EMPTY_LIST        { NULL, NULL }

/*
 * Walk every bio on list @bl from head to tail by following bi_next.
 *
 * @bio: cursor; any assignable expression of type struct bio *. It is NULL
 *       once the loop has run to completion.
 * @bl:  pointer to the struct bio_list to walk.
 *
 * The cursor's bi_next is read after the loop body has run. Both arguments
 * are parenthesized on every use so that cursor expressions such as *pp
 * expand correctly.
 */
#define bio_list_for_each(bio, bl) \
        for ((bio) = (bl)->head; (bio); (bio) = (bio)->bi_next)

static inline unsigned bio_list_size(const struct bio_list *bl)
{
        unsigned sz = 0;
        struct bio *bio;

        bio_list_for_each(bio, bl)
                sz++;

        return sz;
}

/*
 * Append @bio to the tail of @bl. The bio's bi_next is cleared; on an empty
 * list (no tail) the bio also becomes the head.
 */
static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
{
        struct bio *last = bl->tail;

        bio->bi_next = NULL;

        if (!last)
                bl->head = bio;
        else
                last->bi_next = bio;

        bl->tail = bio;
}

/*
 * Insert @bio at the front of @bl. If the list had no tail (it was empty)
 * the bio becomes the tail as well.
 */
static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio)
{
        bio->bi_next = bl->head;

        if (!bl->tail)
                bl->tail = bio;

        bl->head = bio;
}

/*
 * Append all bios of @bl2 to the end of @bl. Does nothing when @bl2 is
 * empty. @bl2 itself is not modified, so afterwards it still points into
 * the merged chain (bio_list_merge_init() also resets it).
 */
static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
{
        struct bio *first = bl2->head;

        if (!first)
                return;

        if (!bl->tail)
                bl->head = first;
        else
                bl->tail->bi_next = first;

        bl->tail = bl2->tail;
}

/*
 * Append all bios of @bl2 to @bl with bio_list_merge(), then reset @bl2 to
 * the empty state with bio_list_init() so it no longer references them.
 */
static inline void bio_list_merge_init(struct bio_list *bl,
                struct bio_list *bl2)
{
        bio_list_merge(bl, bl2);
        bio_list_init(bl2);
}

/*
 * Put all bios of @bl2 in front of the bios of @bl. Does nothing when @bl2
 * is empty. When @bl was empty it takes over @bl2's tail. @bl2's head and
 * tail pointers are left as they were.
 */
static inline void bio_list_merge_head(struct bio_list *bl,
                                       struct bio_list *bl2)
{
        struct bio *old_head = bl->head;

        if (!bl2->head)
                return;

        if (!old_head)
                bl->tail = bl2->tail;
        else
                bl2->tail->bi_next = old_head;

        bl->head = bl2->head;
}

/* Return the first bio of @bl without removing it; NULL when @bl is empty. */
static inline struct bio *bio_list_peek(struct bio_list *bl)
{
        return bl->head;
}

/*
 * Detach and return the first bio of @bl, or return NULL when the list is
 * empty. The returned bio has bi_next cleared; if it was the only element
 * the list's tail is reset to NULL as well.
 */
static inline struct bio *bio_list_pop(struct bio_list *bl)
{
        struct bio *first = bl->head;

        if (!first)
                return NULL;

        bl->head = first->bi_next;
        if (!bl->head)
                bl->tail = NULL;

        first->bi_next = NULL;

        return first;
}

/*
 * Take the whole chain off @bl: return the old head (NULL if the list was
 * empty) and leave @bl empty. The bios stay linked through bi_next.
 */
static inline struct bio *bio_list_get(struct bio_list *bl)
{
        struct bio *chain = bl->head;

        bl->tail = NULL;
        bl->head = NULL;

        return chain;
}

/*
 * Increment chain count for the bio. Make sure the CHAIN flag update
 * is visible before the raised count.
 *
 * The ordering is enforced by the smp_mb__before_atomic() placed between
 * setting BIO_CHAIN and the atomic_inc() of __bi_remaining; do not reorder
 * these three statements.
 */
static inline void bio_inc_remaining(struct bio *bio)
{
        bio_set_flag(bio, BIO_CHAIN);
        smp_mb__before_atomic();
        atomic_inc(&bio->__bi_remaining);
}

/*
 * bio_set is used to allow other portions of the IO system to
 * allocate their own private memory pools for bio and iovec structures.
 * These memory pools in turn all allocate from the bio_slab
 * and the bvec_slabs[].
 */
/* NOTE(review): presumably the default mempool reserve size -- confirm at the users. */
#define BIO_POOL_SIZE 2

struct bio_set {
        /* Slab cache; non-NULL is what bioset_initialized() tests for. */
        struct kmem_cache *bio_slab;
        unsigned int front_pad;

        /*
         * per-cpu bio alloc cache
         */
        struct bio_alloc_cache __percpu *cache;

        /* Memory pools for bios and for bvec arrays. */
        mempool_t bio_pool;
        mempool_t bvec_pool;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
        /* Integrity counterparts; only present with CONFIG_BLK_DEV_INTEGRITY. */
        mempool_t bio_integrity_pool;
        mempool_t bvec_integrity_pool;
#endif

        unsigned int back_pad;
        /*
         * Deadlock avoidance for stacking block drivers: see comments in
         * bio_alloc_bioset() for details
         */
        spinlock_t                rescue_lock;
        struct bio_list                rescue_list;
        struct work_struct        rescue_work;
        struct workqueue_struct        *rescue_workqueue;

        /*
         * Hot un-plug notifier for the per-cpu cache, if used
         */
        struct hlist_node cpuhp_dead;
};

/* A bio_set counts as initialized once its bio_slab pointer is non-NULL. */
static inline bool bioset_initialized(struct bio_set *bs)
{
        return !!bs->bio_slab;
}

#if defined(CONFIG_BLK_DEV_INTEGRITY)

/*
 * Iterate over the integrity bio_vecs of payload @bip: expands to
 * for_each_bvec() over bip->bip_vec, starting from bip->bip_iter.
 */
#define bip_for_each_vec(bvl, bip, iter)                                \
        for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter)

/*
 * Nested iteration: for_each_bio() over @_bio and, for each bio, every
 * integrity bio_vec of its bi_integrity payload. bi_integrity is read
 * directly (not through bio_integrity()), so it is dereferenced for every
 * bio visited. @_bio is parenthesized before the member access so that a
 * non-trivial cursor expression expands correctly.
 */
#define bio_for_each_integrity_vec(_bvl, _bio, _iter)                        \
        for_each_bio(_bio)                                                \
                bip_for_each_vec(_bvl, (_bio)->bi_integrity, _iter)

/*
 * Integrity API available when CONFIG_BLK_DEV_INTEGRITY is enabled; the
 * definitions are outside this header. The #else branch of this conditional
 * provides inline stubs for the same names.
 */
int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed);
void bio_integrity_unmap_free_user(struct bio *bio);
extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
extern bool bio_integrity_prep(struct bio *);
extern void bio_integrity_advance(struct bio *, unsigned int);
extern void bio_integrity_trim(struct bio *);
extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t);
extern int bioset_integrity_create(struct bio_set *, int);
extern void bioset_integrity_free(struct bio_set *);
extern void bio_integrity_init(void);

#else /* CONFIG_BLK_DEV_INTEGRITY */

/*
 * Stand-ins used when CONFIG_BLK_DEV_INTEGRITY is not set. Each stub has a
 * fixed result: no payload, "success" for setup/prep/clone/add_page, and
 * an error for the operations that would have to create or map a payload.
 */

/* No bio ever has an integrity payload. */
static inline void *bio_integrity(struct bio *bio)
{
        return NULL;
}

/* Nothing to create; always reports success (0). */
static inline int bioset_integrity_create(struct bio_set *bs, int pool_size)
{
        return 0;
}

/* Nothing to free. */
static inline void bioset_integrity_free(struct bio_set *bs)
{
}

/* Always true. */
static inline bool bio_integrity_prep(struct bio *bio)
{
        return true;
}

/* Nothing to clone; always reports success (0). */
static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
                                      gfp_t gfp_mask)
{
        return 0;
}

/* No payload to advance. */
static inline void bio_integrity_advance(struct bio *bio,
                                         unsigned int bytes_done)
{
}

/* No payload to trim. */
static inline void bio_integrity_trim(struct bio *bio)
{
}

/* No setup needed. */
static inline void bio_integrity_init(void)
{
}

/* Without a payload no flag is ever set. */
static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag)
{
        return false;
}

/* Allocation is not supported: always ERR_PTR(-EINVAL), never NULL. */
static inline void *bio_integrity_alloc(struct bio *bio, gfp_t gfp,
                                        unsigned int nr)
{
        return ERR_PTR(-EINVAL);
}

/* Always returns 0. */
static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
                                         unsigned int len, unsigned int offset)
{
        return 0;
}

/* Mapping user memory is not supported: always -EINVAL. */
static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
                                         ssize_t len, u32 seed)
{
        return -EINVAL;
}

/* Nothing was mapped, so nothing to unmap. */
static inline void bio_integrity_unmap_free_user(struct bio *bio)
{
}

#endif /* CONFIG_BLK_DEV_INTEGRITY */

/*
 * Mark a bio as polled by setting REQ_POLLED; REQ_NOWAIT is set as well when
 * the kiocb has IOCB_NOWAIT. For async polled IO the caller must expect
 * -EWOULDBLOCK if a request (or other resource) cannot be allocated: we
 * cannot block waiting for requests on polled IO because those completions
 * must be found by the caller. IRQ driven IO differs here, as it is safe to
 * wait for that IO to complete.
 */
static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
{
        if (kiocb->ki_flags & IOCB_NOWAIT)
                bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
        else
                bio->bi_opf |= REQ_POLLED;
}

/*
 * Clear REQ_POLLED from the bio's operation flags. Other bits, including a
 * REQ_NOWAIT set by bio_set_polled(), are left untouched.
 */
static inline void bio_clear_polled(struct bio *bio)
{
        bio->bi_opf &= ~REQ_POLLED;
}

/* bio chaining helpers; only the prototypes are visible in this header. */
struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
                unsigned int nr_pages, blk_opf_t opf, gfp_t gfp);
struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new);

/*
 * @sector and @nr_sects are passed by pointer.
 * NOTE(review): presumably both are updated to reflect what the returned
 * bio covers -- confirm against the definition.
 */
struct bio *blk_alloc_discard_bio(struct block_device *bdev,
                sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask);

#endif /* __LINUX_BIO_H */














    1 










    1 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
// SPDX-License-Identifier: GPL-2.0
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/pid_namespace.h>
#include "internal.h"

/*
 * /proc/self:
 */
static const char *proc_self_get_link(struct dentry *dentry,
                                      struct inode *inode,
                                      struct delayed_call *done)
{
        struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
        pid_t tgid = task_tgid_nr_ns(current, ns);
        char *name;

        if (!tgid)
                return ERR_PTR(-ENOENT);
        /* max length of unsigned int in decimal + NULL term */
        name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
        if (unlikely(!name))
                return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
        sprintf(name, "%u", tgid);
        set_delayed_call(done, kfree_link, name);
        return name;
}

/* Inode operations of the "self" symlink: only ->get_link is provided. */
static const struct inode_operations proc_self_inode_operations = {
        .get_link        = proc_self_get_link,
};

/*
 * Inode number given to every "self" inode created by proc_setup_self().
 * Filled in once by proc_self_init() and marked __ro_after_init.
 */
static unsigned self_inum __ro_after_init;

/*
 * Create the "self" symlink under the root of proc superblock @s and record
 * its dentry in the superblock's proc_fs_info.
 *
 * The dentry and inode are set up while holding the root inode's lock. If
 * the inode cannot be allocated, the freshly allocated dentry is dropped
 * again.
 *
 * Return: 0 on success, -ENOMEM (after logging an error) when either the
 * dentry or the inode could not be allocated.
 */
int proc_setup_self(struct super_block *s)
{
        struct inode *root_inode = d_inode(s->s_root);
        struct proc_fs_info *fs_info = proc_sb_info(s);
        struct inode *inode;
        struct dentry *self;
        int ret = -ENOMEM;

        inode_lock(root_inode);

        self = d_alloc_name(s->s_root, "self");
        if (!self)
                goto unlock;

        inode = new_inode(s);
        if (!inode) {
                dput(self);
                goto unlock;
        }

        inode->i_ino = self_inum;
        simple_inode_init_ts(inode);
        inode->i_mode = S_IFLNK | S_IRWXUGO;
        inode->i_uid = GLOBAL_ROOT_UID;
        inode->i_gid = GLOBAL_ROOT_GID;
        inode->i_op = &proc_self_inode_operations;
        d_add(self, inode);
        ret = 0;

unlock:
        inode_unlock(root_inode);

        if (ret) {
                pr_err("proc_fill_super: can't allocate /proc/self\n");
                return ret;
        }

        fs_info->proc_self = self;
        return ret;
}

/*
 * Init-time setup: obtain the inode number stored in self_inum through
 * proc_alloc_inum(). The return value of proc_alloc_inum() is not checked.
 */
void __init proc_self_init(void)
{
        proc_alloc_inum(&self_inum);
}









































































































































































































































































































   12 








   11 







   12 
   12 




   12 


    9 

   12 








   11 










   12 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/block_validity.c
 *
 * Copyright (C) 2009
 * Theodore Ts'o (tytso@mit.edu)
 *
 * Track which blocks in the filesystem are metadata blocks that
 * should never be used as data blocks by files or directories.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "ext4.h"

/*
 * One contiguous run of protected blocks, kept in an rbtree that is ordered
 * by start_blk (see add_system_zone()).
 */
struct ext4_system_zone {
        struct rb_node        node;                /* linkage in the system_blks rbtree */
        ext4_fsblk_t        start_blk;        /* first block of the run */
        unsigned int        count;                /* number of blocks in the run */
        u32                ino;                /* owning inode number; 0 for group metadata */
};

/* Slab cache all struct ext4_system_zone objects are allocated from. */
static struct kmem_cache *ext4_system_zone_cachep;

/*
 * Create the slab cache for struct ext4_system_zone.
 *
 * Return: 0 on success, -ENOMEM when the cache could not be created.
 */
int __init ext4_init_system_zone(void)
{
        ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0);

        return ext4_system_zone_cachep ? 0 : -ENOMEM;
}

/*
 * Destroy the system zone slab cache. rcu_barrier() runs first; zones are
 * freed from an RCU callback (ext4_destroy_system_zone() is queued with
 * call_rcu()), so pending callbacks must finish before the cache goes away.
 */
void ext4_exit_system_zone(void)
{
        rcu_barrier();
        kmem_cache_destroy(ext4_system_zone_cachep);
}

/*
 * Return 1 when @entry2 starts exactly where @entry1 ends and both zones
 * belong to the same inode number, 0 otherwise. The argument order matters:
 * @entry1 must be the lower of the two zones.
 */
static inline int can_merge(struct ext4_system_zone *entry1,
                     struct ext4_system_zone *entry2)
{
        return entry1->start_blk + entry1->count == entry2->start_blk &&
               entry1->ino == entry2->ino;
}

/*
 * Free every zone in the rbtree of @system_blks back to the slab cache.
 * The nodes are freed in post-order without being erased from the tree, so
 * the tree root is left dangling and must not be used afterwards.
 */
static void release_system_zone(struct ext4_system_blocks *system_blks)
{
        struct ext4_system_zone *zone;
        struct ext4_system_zone *next;

        rbtree_postorder_for_each_entry_safe(zone, next,
                                             &system_blks->root, node)
                kmem_cache_free(ext4_system_zone_cachep, zone);
}

/*
 * Mark a range of blocks as belonging to the "system zone" --- that
 * is, filesystem metadata blocks which should never be used by
 * inodes.
 *
 * @system_blks: tree to insert into
 * @start_blk:   first block of the range
 * @count:       number of blocks in the range
 * @ino:         inode number the range belongs to (callers pass 0 for
 *               group metadata)
 *
 * The new zone is inserted ordered by @start_blk and then merged with its
 * immediate neighbours when can_merge() allows it.
 *
 * Return: 0 on success, -EFSCORRUPTED when @start_blk lies inside an
 * existing zone, -ENOMEM when the zone cannot be allocated.
 */
static int add_system_zone(struct ext4_system_blocks *system_blks,
                           ext4_fsblk_t start_blk,
                           unsigned int count, u32 ino)
{
        struct ext4_system_zone *new_entry, *entry;
        struct rb_node **n = &system_blks->root.rb_node, *node;
        struct rb_node *parent = NULL, *new_node = NULL;

        /*
         * Find the insertion slot. Only @start_blk is compared against the
         * existing zones.
         * NOTE(review): a range that starts below an existing zone but
         * extends into it is not rejected by this walk -- confirm callers
         * never produce one.
         */
        while (*n) {
                parent = *n;
                entry = rb_entry(parent, struct ext4_system_zone, node);
                if (start_blk < entry->start_blk)
                        n = &(*n)->rb_left;
                else if (start_blk >= (entry->start_blk + entry->count))
                        n = &(*n)->rb_right;
                else        /* Unexpected overlap of system zones. */
                        return -EFSCORRUPTED;
        }

        new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
                                     GFP_KERNEL);
        if (!new_entry)
                return -ENOMEM;
        new_entry->start_blk = start_blk;
        new_entry->count = count;
        new_entry->ino = ino;
        new_node = &new_entry->node;

        rb_link_node(new_node, parent, n);
        rb_insert_color(new_node, &system_blks->root);

        /* Can we merge to the left? */
        node = rb_prev(new_node);
        if (node) {
                entry = rb_entry(node, struct ext4_system_zone, node);
                if (can_merge(entry, new_entry)) {
                        /* The new zone absorbs its predecessor, which is freed. */
                        new_entry->start_blk = entry->start_blk;
                        new_entry->count += entry->count;
                        rb_erase(node, &system_blks->root);
                        kmem_cache_free(ext4_system_zone_cachep, entry);
                }
        }

        /* Can we merge to the right? */
        node = rb_next(new_node);
        if (node) {
                entry = rb_entry(node, struct ext4_system_zone, node);
                if (can_merge(new_entry, entry)) {
                        /* The new zone absorbs its successor, which is freed. */
                        new_entry->count += entry->count;
                        rb_erase(node, &system_blks->root);
                        kmem_cache_free(ext4_system_zone_cachep, entry);
                }
        }
        return 0;
}

/*
 * Print all system zones of @sbi on one kernel log line as a comma
 * separated list of "first-last" block ranges, in ascending tree order.
 * The tree is read under rcu_read_lock(); s_system_blks is dereferenced
 * without a NULL check, so it must be set when this is called.
 */
static void debug_print_tree(struct ext4_sb_info *sbi)
{
        struct ext4_system_blocks *system_blks;
        struct ext4_system_zone *zone;
        struct rb_node *pos;
        const char *sep = "";

        printk(KERN_INFO "System zones: ");
        rcu_read_lock();
        system_blks = rcu_dereference(sbi->s_system_blks);
        for (pos = rb_first(&system_blks->root); pos; pos = rb_next(pos)) {
                zone = rb_entry(pos, struct ext4_system_zone, node);
                printk(KERN_CONT "%s%llu-%llu", sep,
                       zone->start_blk, zone->start_blk + zone->count - 1);
                /* every range after the first is preceded by a separator */
                sep = ", ";
        }
        rcu_read_unlock();
        printk(KERN_CONT "\n");
}

/*
 * Add every mapped block of reserved inode @ino to @system_blks, tagged with
 * that inode number.
 *
 * The inode is looked up with EXT4_IGET_SPECIAL and its logical blocks, up
 * to i_size rounded up to a whole block, are mapped extent by extent.
 * Unmapped logical blocks are skipped one at a time.
 *
 * Return: 0 on success; -EINVAL when @ino is below EXT4_ROOT_INO or above
 * the superblock's inode count; otherwise the error from ext4_iget(),
 * ext4_map_blocks() or add_system_zone(). An -EFSCORRUPTED from
 * add_system_zone() is additionally reported against the inode.
 */
static int ext4_protect_reserved_inode(struct super_block *sb,
                                       struct ext4_system_blocks *system_blks,
                                       u32 ino)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_map_blocks map;
        struct inode *inode;
        u32 lblk = 0, nr_blocks;
        int mapped, err = 0;

        if (ino < EXT4_ROOT_INO ||
            ino > le32_to_cpu(sbi->s_es->s_inodes_count))
                return -EINVAL;

        inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        nr_blocks = (inode->i_size + sb->s_blocksize - 1) >>
                    sb->s_blocksize_bits;
        while (lblk < nr_blocks) {
                cond_resched();
                map.m_lblk = lblk;
                map.m_len = nr_blocks - lblk;
                mapped = ext4_map_blocks(NULL, inode, &map, 0);
                if (mapped < 0) {
                        err = mapped;
                        break;
                }
                if (mapped == 0) {
                        /* nothing mapped at this logical block: step over it */
                        lblk++;
                        continue;
                }

                err = add_system_zone(system_blks, map.m_pblk, mapped, ino);
                if (err < 0) {
                        if (err == -EFSCORRUPTED) {
                                EXT4_ERROR_INODE_ERR(inode, -err,
                                        "blocks %llu-%llu from inode overlap system zone",
                                        map.m_pblk,
                                        map.m_pblk + map.m_len - 1);
                        }
                        break;
                }
                lblk += mapped;
        }
        iput(inode);
        return err;
}

/*
 * RCU callback (queued by ext4_release_system_zone()): recover the
 * ext4_system_blocks that embeds @rcu, free all of its zones and then the
 * container itself.
 */
static void ext4_destroy_system_zone(struct rcu_head *rcu)
{
        struct ext4_system_blocks *system_blks =
                container_of(rcu, struct ext4_system_blocks, rcu);

        release_system_zone(system_blks);
        kfree(system_blks);
}

/*
 * Build system zone rbtree which is used for block validity checking.
 *
 * The update of system_blks pointer in this function is protected by
 * sb->s_umount semaphore. However we have to be careful as we can be
 * racing with ext4_inode_block_valid() calls reading system_blks rbtree
 * protected only by RCU. That's why we first build the rbtree and then
 * swap it in place.
 *
 * Return: 0 on success, -ENOMEM when the container cannot be allocated, or
 * the first error returned by add_system_zone() /
 * ext4_protect_reserved_inode(). On error the partially built tree is freed
 * and sbi->s_system_blks is left untouched.
 */
int ext4_setup_system_zone(struct super_block *sb)
{
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_system_blocks *system_blks;
        struct ext4_group_desc *gdp;
        ext4_group_t i;
        int ret;

        system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
        if (!system_blks)
                return -ENOMEM;

        /*
         * Per group: the base metadata blocks at the start of the group (if
         * any), the block bitmap, the inode bitmap and the inode table. All
         * are recorded with inode number 0.
         */
        for (i=0; i < ngroups; i++) {
                unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i);

                cond_resched();
                if (meta_blks != 0) {
                        ret = add_system_zone(system_blks,
                                        ext4_group_first_block_no(sb, i),
                                        meta_blks, 0);
                        if (ret)
                                goto err;
                }
                /*
                 * NOTE(review): gdp is used without a NULL check -- confirm
                 * ext4_get_group_desc() cannot fail for i < ngroups here.
                 */
                gdp = ext4_get_group_desc(sb, i, NULL);
                ret = add_system_zone(system_blks,
                                ext4_block_bitmap(sb, gdp), 1, 0);
                if (ret)
                        goto err;
                ret = add_system_zone(system_blks,
                                ext4_inode_bitmap(sb, gdp), 1, 0);
                if (ret)
                        goto err;
                ret = add_system_zone(system_blks,
                                ext4_inode_table(sb, gdp),
                                sbi->s_itb_per_group, 0);
                if (ret)
                        goto err;
        }
        /* Blocks of the journal inode, when the superblock names one. */
        if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) {
                ret = ext4_protect_reserved_inode(sb, system_blks,
                                le32_to_cpu(sbi->s_es->s_journal_inum));
                if (ret)
                        goto err;
        }

        /*
         * System blks rbtree complete, announce it once to prevent racing
         * with ext4_inode_block_valid() accessing the rbtree at the same
         * time.
         */
        rcu_assign_pointer(sbi->s_system_blks, system_blks);

        if (test_opt(sb, DEBUG))
                debug_print_tree(sbi);
        return 0;
err:
        /* The tree was never published, so it can be freed immediately. */
        release_system_zone(system_blks);
        kfree(system_blks);
        return ret;
}

/*
 * Called when the filesystem is unmounted or when remounting it with
 * noblock_validity specified.
 *
 * Updating the system_blks pointer here is serialized by the sb->s_umount
 * semaphore. Readers in ext4_inode_block_valid() are only protected by RCU,
 * though, so the pointer is cleared first and the rbtree is freed from an
 * RCU callback once a grace period has passed. Nothing is queued when no
 * tree was installed.
 */
void ext4_release_system_zone(struct super_block *sb)
{
        struct ext4_system_blocks *old_blks;

        old_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks,
                                        lockdep_is_held(&sb->s_umount));
        rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL);

        if (!old_blks)
                return;

        call_rcu(&old_blks->rcu, ext4_destroy_system_zone);
}

/*
 * Check whether the @count blocks starting at @start_blk may be used.
 *
 * Return: 0 when the range starts at or below the first data block, wraps
 * around, or ends beyond the filesystem's block count. Otherwise the system
 * zone tree is searched under rcu_read_lock(): with no tree installed, or
 * no zone overlapping the range, the result is 1. For the first overlapping
 * zone found, the result is 1 only if @inode is non-NULL and its inode
 * number matches the zone's owner; otherwise 0.
 */
int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
                                ext4_fsblk_t start_blk, unsigned int count)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_system_blocks *system_blks;
        struct ext4_system_zone *zone;
        struct rb_node *cur;
        int valid = 1;

        if (start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block))
                return 0;
        if (start_blk + count < start_blk)
                return 0;
        if (start_blk + count > ext4_blocks_count(sbi->s_es))
                return 0;

        /*
         * Hold the RCU read lock so the system zone cannot be released
         * underneath us by a remount that flips the "[no]block_validity"
         * mount option.
         */
        rcu_read_lock();
        system_blks = rcu_dereference(sbi->s_system_blks);
        cur = system_blks ? system_blks->root.rb_node : NULL;

        while (cur) {
                zone = rb_entry(cur, struct ext4_system_zone, node);
                if (start_blk + count - 1 < zone->start_blk) {
                        cur = cur->rb_left;
                        continue;
                }
                if (start_blk >= (zone->start_blk + zone->count)) {
                        cur = cur->rb_right;
                        continue;
                }

                /* Overlap: only the zone's own inode may use these blocks. */
                valid = inode ? (zone->ino == inode->i_ino) : 0;
                break;
        }

        rcu_read_unlock();
        return valid;
}

/*
 * Returns 1 if the passed-in block region [start_blk, start_blk + count)
 * is valid for @inode; 0 if it lies outside the range accepted by
 * ext4_sb_block_valid() or overlaps filesystem metadata blocks recorded for
 * a different inode number.
 *
 * Thin wrapper: forwards to ext4_sb_block_valid() with the inode's own
 * superblock.
 */
int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
                          unsigned int count)
{
        return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count);
}

/*
 * Validate an array of @max little-endian block references starting at @p
 * on behalf of @inode.
 *
 * The journal inode (when the journal feature is enabled) is exempt and
 * always passes. Zero entries are skipped. The first non-zero entry that
 * ext4_inode_block_valid() rejects is reported through ext4_error_inode()
 * using the caller supplied @function and @line.
 *
 * Return: 0 when all references are acceptable, -EFSCORRUPTED otherwise.
 */
int ext4_check_blockref(const char *function, unsigned int line,
                        struct inode *inode, __le32 *p, unsigned int max)
{
        unsigned int idx;
        unsigned int blk;

        if (ext4_has_feature_journal(inode->i_sb) &&
            inode->i_ino ==
            le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))
                return 0;

        for (idx = 0; idx < max; idx++) {
                blk = le32_to_cpu(p[idx]);
                if (!blk)
                        continue;
                if (unlikely(!ext4_inode_block_valid(inode, blk, 1))) {
                        ext4_error_inode(inode, function, line, blk,
                                         "invalid block");
                        return -EFSCORRUPTED;
                }
        }
        return 0;
}


















































































    3 













































    3 























































    4 























































































































































































































































































































































































































   15 
    4 




















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_H
#define _LINUX_RCULIST_H

#ifdef __KERNEL__

/*
 * RCU-protected list version
 */
#include <linux/list.h>
#include <linux/rcupdate.h>

/*
 * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers
 * @list: list to be initialized
 *
 * You should instead use INIT_LIST_HEAD() for normal initialization and
 * cleanup tasks, when readers have no access to the list being initialized.
 * However, if the list being initialized is visible to readers, you
 * need to keep the compiler from being too mischievous.
 */
static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
{
        /*
         * Point both links back at the head itself, i.e. the empty-list
         * state.  Each store goes through WRITE_ONCE() because readers may
         * be looking at this list while it is being (re)initialized.
         */
        WRITE_ONCE(list->next, list);
        WRITE_ONCE(list->prev, list);
}

/*
 * list_next_rcu - access the ->next pointer of a list_head in an RCU-safe way
 * @list: pointer to the list_head whose ->next pointer is wanted
 *
 * Expands to an lvalue of type "struct list_head __rcu *" that aliases
 * (list)->next, so that it can be handed to the RCU accessors (for example
 * rcu_assign_pointer()).  RCU users must not access ->next directly.
 */
#define list_next_rcu(list)        (*((struct list_head __rcu **)(&(list)->next)))

/**
 * list_tail_rcu - returns the prev pointer of the head of the list
 * @head: the head of the list
 *
 * Expands to an lvalue of type "struct list_head __rcu *" that aliases
 * (head)->prev.
 *
 * Note: This should only be used with the list header, and even then
 * only if list_del() and similar primitives are not also used on the
 * list header.
 */
#define list_tail_rcu(head)        (*((struct list_head __rcu **)(&(head)->prev)))

/*
 * Check during list traversal that we are within an RCU reader
 */

/*
 * check_arg_count_one() takes exactly one parameter and expands to nothing.
 * __list_check_rcu() passes it every argument that follows @cond, so an
 * iterator invoked with more than one optional lockdep expression fails
 * during preprocessing instead of silently ignoring the extras.
 */
#define check_arg_count_one(dummy)

#ifdef CONFIG_PROVE_RCU_LIST
/*
 * Warn unless @cond is true or rcu_read_lock_any_held() reports that we are
 * inside a read-side critical section.  The leading @dummy argument is not
 * used; it only gives the callers' ", ## cond" something to attach to.
 */
#define __list_check_rcu(dummy, cond, extra...)                                \
        ({                                                                \
        check_arg_count_one(extra);                                        \
        RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(),                \
                         "RCU-list traversed in non-reader section!");        \
        })

/* Warn unless the caller-supplied lockdep expression @cond is true. */
#define __list_check_srcu(cond)                                         \
        ({                                                                 \
        RCU_LOCKDEP_WARN(!(cond),                                         \
                "RCU-list traversed without holding the required lock!");\
        })
#else
/* Without CONFIG_PROVE_RCU_LIST only the argument-count check remains. */
#define __list_check_rcu(dummy, cond, extra...)                                \
        ({ check_arg_count_one(extra); })

/* Without CONFIG_PROVE_RCU_LIST @cond is not evaluated at all. */
#define __list_check_srcu(cond) ({ })
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_add_rcu(struct list_head *new,
                struct list_head *prev, struct list_head *next)
{
        /* Leave the list untouched if __list_add_valid() rejects the insert. */
        if (!__list_add_valid(new, prev, next))
                return;

        /* Fill in both links of @new before it is published. */
        new->next = next;
        new->prev = prev;
        /* Publish @new: from here on forward traversals can reach it. */
        rcu_assign_pointer(list_next_rcu(prev), new);
        /* The backward link is fixed up last, with a plain store. */
        next->prev = new;
}

/**
 * list_add_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_add_rcu()
 * or list_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 */
static inline void list_add_rcu(struct list_head *new, struct list_head *head)
{
        /* Insert between @head and the entry that currently follows it. */
        __list_add_rcu(new, head, head->next);
}

/**
 * list_add_tail_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_add_tail_rcu()
 * or list_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 */
static inline void list_add_tail_rcu(struct list_head *new,
                                        struct list_head *head)
{
        /* Insert between the entry that currently precedes @head and @head. */
        __list_add_rcu(new, head->prev, head);
}

/**
 * list_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * Note: list_empty() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_del_rcu()
 * or list_add_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 *
 * Note that the caller is not permitted to immediately free
 * the newly deleted entry.  Instead, either synchronize_rcu()
 * or call_rcu() must be used to defer freeing until an RCU
 * grace period has elapsed.
 */
static inline void list_del_rcu(struct list_head *entry)
{
        /* Unlink @entry from its neighbours. */
        __list_del_entry(entry);
        /*
         * Poison only the backward link.  entry->next is deliberately left
         * alone because it may still be used for walking the list.
         */
        entry->prev = LIST_POISON2;
}

/**
 * hlist_del_init_rcu - deletes entry from hash list with re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on the node returns true after this.  That makes
 * this primitive useful for RCU based lockfree read-side traversal when the
 * writer side must be able to tell whether the entry is still hashed or has
 * already been unhashed.
 *
 * As a consequence the forward pointer cannot be poisoned, since it may
 * still be used for walking the hash list; only the pprev pointer is
 * zeroed, which is what makes hlist_unhashed() return true afterwards.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another
 * list-mutation primitive, such as hlist_add_head_rcu() or
 * hlist_del_rcu(), running on this same list.  However, it is
 * perfectly legal to run concurrently with the _rcu list-traversal
 * primitives, such as hlist_for_each_entry_rcu().
 */
static inline void hlist_del_init_rcu(struct hlist_node *n)
{
        /* A node that is not hashed has nothing to unlink. */
        if (hlist_unhashed(n))
                return;

        __hlist_del(n);
        /* Clear only ->pprev; ->next is left in place. */
        WRITE_ONCE(n->pprev, NULL);
}

/**
 * list_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically.
 * Note: @old should not be empty.
 */
static inline void list_replace_rcu(struct list_head *old,
                                struct list_head *new)
{
        /* Give @new the same neighbours as @old before publishing it. */
        new->next = old->next;
        new->prev = old->prev;
        /* Publish: the predecessor's ->next now leads to @new, not @old. */
        rcu_assign_pointer(list_next_rcu(new->prev), new);
        new->next->prev = new;
        /* old->next is left intact; only the backward link is poisoned. */
        old->prev = LIST_POISON2;
}

/**
 * __list_splice_init_rcu - join an RCU-protected list into an existing list.
 * @list:        the RCU-protected list to splice
 * @prev:        points to the last element of the existing list
 * @next:        points to the first element of the existing list
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * The list pointed to by @prev and @next can be RCU-read traversed
 * concurrently with this function.
 *
 * Note that this function blocks.
 *
 * Important note: the caller must take whatever action is necessary to prevent
 * any other updates to the existing list.  In principle, it is possible to
 * modify the list as soon as sync() begins execution. If this sort of thing
 * becomes necessary, an alternative version based on call_rcu() could be
 * created.  But only if -really- needed -- there is no shortage of RCU API
 * members.
 */
static inline void __list_splice_init_rcu(struct list_head *list,
                                          struct list_head *prev,
                                          struct list_head *next,
                                          void (*sync)(void))
{
        struct list_head *first = list->next;
        struct list_head *last = list->prev;

        /*
         * "first" and "last" now track the body of the source list, so the
         * source list head itself can be reinitialized.  RCU readers
         * have access to this list, so we must use INIT_LIST_HEAD_RCU()
         * instead of INIT_LIST_HEAD().
         */

        INIT_LIST_HEAD_RCU(list);

        /*
         * At this point, the list body still points to the source list.
         * Wait for any readers to finish using the list before splicing
         * the list body into the new list.  Any new readers will see
         * an empty list.
         */

        sync();
        /*
         * NOTE(review): presumably these annotate that no other context may
         * access the first and last entries from here on -- confirm against
         * the definition of ASSERT_EXCLUSIVE_ACCESS().
         */
        ASSERT_EXCLUSIVE_ACCESS(*first);
        ASSERT_EXCLUSIVE_ACCESS(*last);

        /*
         * Readers are finished with the source list, so perform splice.
         * The order is important if the new list is global and accessible
         * to concurrent RCU readers.  Note that RCU readers are not
         * permitted to traverse the prev pointers without excluding
         * this function.
         */

        last->next = next;
        rcu_assign_pointer(list_next_rcu(prev), first);
        first->prev = prev;
        next->prev = last;
}

/**
 * list_splice_init_rcu - splice an RCU-protected list into an existing list,
 *                        designed for stacks.
 * @list:        the RCU-protected list to splice
 * @head:        the place in the existing list to splice the first list into
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * The body of @list is inserted right after @head.  An empty @list is left
 * alone and @sync is not called.
 */
static inline void list_splice_init_rcu(struct list_head *list,
                                        struct list_head *head,
                                        void (*sync)(void))
{
        /* Nothing to move. */
        if (list_empty(list))
                return;

        __list_splice_init_rcu(list, head, head->next, sync);
}

/**
 * list_splice_tail_init_rcu - splice an RCU-protected list into an existing
 *                             list, designed for queues.
 * @list:        the RCU-protected list to splice
 * @head:        the place in the existing list to splice the first list into
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * The body of @list is inserted right before @head.  An empty @list is left
 * alone and @sync is not called.
 */
static inline void list_splice_tail_init_rcu(struct list_head *list,
                                             struct list_head *head,
                                             void (*sync)(void))
{
        /* Nothing to move. */
        if (list_empty(list))
                return;

        __list_splice_init_rcu(list, head->prev, head, sync);
}

/**
 * list_entry_rcu - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 *
 * @ptr is loaded with READ_ONCE() and the loaded value is converted to a
 * pointer to the enclosing @type with container_of().
 */
#define list_entry_rcu(ptr, type, member) \
        container_of(READ_ONCE(ptr), type, member)

/*
 * Where are list_empty_rcu() and list_first_entry_rcu()?
 *
 * They do not exist because they would lead to subtle race conditions:
 *
 * if (!list_empty_rcu(mylist)) {
 *        struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member);
 *        do_something(bar);
 * }
 *
 * The list might be non-empty when list_empty_rcu() checks it, but it
 * might have become empty by the time that list_first_entry_rcu() rereads
 * the ->next pointer, which would result in a SEGV.
 *
 * When not using RCU, it is OK for list_first_entry() to re-read that
 * pointer because both functions should be protected by some lock that
 * blocks writers.
 *
 * When using RCU, list_empty() uses READ_ONCE() to fetch the
 * RCU-protected ->next pointer and then compares it to the address of the
 * list head.  However, it neither dereferences this pointer nor provides
 * this pointer to its caller.  Thus, READ_ONCE() suffices (that is,
 * rcu_dereference() is not needed), which means that list_empty() can be
 * used anywhere you would want to use list_empty_rcu().  Just don't
 * expect anything useful to happen if you do a subsequent lockless
 * call to list_first_entry_rcu()!!!
 *
 * See list_first_or_null_rcu for an alternative.
 */

/**
 * list_first_or_null_rcu - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * @ptr is evaluated once and its ->next pointer is read once with
 * READ_ONCE(); both the emptiness test and the returned entry are derived
 * from that single value.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_first_or_null_rcu(ptr, type, member) \
({ \
        struct list_head *__ptr = (ptr); \
        struct list_head *__next = READ_ONCE(__ptr->next); \
        likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
})

/**
 * list_next_or_null_rcu - get the next element from a list
 * @head:        the head for the list.
 * @ptr:        the list head to take the next element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the ptr is at the end of the list, NULL is returned.
 *
 * @head and @ptr are each evaluated once, and @ptr's ->next pointer is read
 * once with READ_ONCE(); both the end-of-list test and the returned entry
 * are derived from that single value.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_next_or_null_rcu(head, ptr, type, member) \
({ \
        struct list_head *__head = (head); \
        struct list_head *__ptr = (ptr); \
        struct list_head *__next = READ_ONCE(__ptr->next); \
        likely(__next != __head) ? list_entry_rcu(__next, type, \
                                                  member) : NULL; \
})

/**
 * list_for_each_entry_rcu        -        iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 * @cond:        optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * Every ->next pointer is fetched through list_entry_rcu(); the loop ends
 * when the cursor's @member is @head itself.  @pos must be an lvalue and is
 * evaluated several times per iteration.
 */
#define list_for_each_entry_rcu(pos, head, member, cond...)                \
        for (__list_check_rcu(dummy, ## cond, 0),                        \
             pos = list_entry_rcu((head)->next, typeof(*(pos)), member);        \
                &(pos)->member != (head);                                \
                pos = list_entry_rcu((pos)->member.next, typeof(*(pos)), member))

/**
 * list_for_each_entry_srcu        -        iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 * @cond:        lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * Every ->next pointer is fetched through list_entry_rcu(); the loop ends
 * when the cursor's @member is @head itself.  @pos must be an lvalue and is
 * evaluated several times per iteration.
 */
#define list_for_each_entry_srcu(pos, head, member, cond)                \
        for (__list_check_srcu(cond),                                        \
             pos = list_entry_rcu((head)->next, typeof(*(pos)), member);        \
                &(pos)->member != (head);                                \
                pos = list_entry_rcu((pos)->member.next, typeof(*(pos)), member))

/**
 * list_entry_lockless - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.  One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked.  Another example is when items are added to the list,
 * but never deleted.
 *
 * @ptr is loaded with READ_ONCE(), the loaded value is cast back to the
 * type of @ptr, and the result is converted to a pointer to the enclosing
 * @type with container_of().
 */
#define list_entry_lockless(ptr, type, member) \
        container_of((typeof(ptr))READ_ONCE(ptr), type, member)

/**
 * list_for_each_entry_lockless - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.  One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked.  Another example is when items are added to the list,
 * but never deleted.
 *
 * Every ->next pointer is fetched through list_entry_lockless(); the loop
 * ends when the cursor's @member is @head itself.  @pos must be an lvalue
 * and is evaluated several times per iteration.
 */
#define list_for_each_entry_lockless(pos, head, member) \
        for (pos = list_entry_lockless((head)->next, typeof(*(pos)), member); \
             &(pos)->member != (head); \
             pos = list_entry_lockless((pos)->member.next, typeof(*(pos)), member))

/**
 * list_for_each_entry_continue_rcu - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position which must have been in the list when the RCU read
 * lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * This iterator is similar to list_for_each_entry_from_rcu() except
 * this starts after the given position and that one starts at the given
 * position.
 *
 * @pos must be an lvalue and is evaluated several times per iteration.
 */
#define list_for_each_entry_continue_rcu(pos, head, member)                 \
        for (pos = list_entry_rcu((pos)->member.next, typeof(*(pos)), member); \
             &(pos)->member != (head);        \
             pos = list_entry_rcu((pos)->member.next, typeof(*(pos)), member))

/**
 * list_for_each_entry_from_rcu - iterate over a list from current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over the tail of a list starting from a given position,
 * which must have been in the list when the RCU read lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * This iterator is similar to list_for_each_entry_continue_rcu() except
 * this starts from the given position and that one starts from the position
 * after the given position.
 *
 * @pos must be an lvalue and is evaluated several times per iteration.
 */
#define list_for_each_entry_from_rcu(pos, head, member)                        \
        for (; &(pos)->member != (head);                                        \
                pos = list_entry_rcu((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu().
 */
static inline void hlist_del_rcu(struct hlist_node *n)
{
        __hlist_del(n);
        /* Poison only ->pprev; n->next is left alone (see above). */
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically.
 */
static inline void hlist_replace_rcu(struct hlist_node *old,
                                        struct hlist_node *new)
{
        struct hlist_node *next = old->next;

        /* Give @new the same successor and back-pointer as @old. */
        new->next = next;
        WRITE_ONCE(new->pprev, old->pprev);
        /*
         * Publish: new->pprev is the slot that pointed at @old (either the
         * list head's ->first or the previous node's ->next); make it point
         * at @new.
         */
        rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
        /* If @old had a successor, its back-pointer now refers to @new. */
        if (next)
                WRITE_ONCE(new->next->pprev, &new->next);
        /* old->next is left intact; only ->pprev is poisoned. */
        WRITE_ONCE(old->pprev, LIST_POISON2);
}

/**
 * hlists_swap_heads_rcu - swap the lists the hlist heads point to
 * @left:  The hlist head on the left
 * @right: The hlist head on the right
 *
 * The lists start out as [@left  ][node1 ... ] and
 *                        [@right ][node2 ... ]
 * The lists end up as    [@left  ][node2 ... ]
 *                        [@right ][node1 ... ]
 *
 * Either list may be empty.  An empty list has no first node whose ->pprev
 * would need to be redirected, so that fix-up is skipped for it.
 */
static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right)
{
        struct hlist_node *node1 = left->first;
        struct hlist_node *node2 = right->first;

        /* Publish the exchanged first pointers. */
        rcu_assign_pointer(left->first, node2);
        rcu_assign_pointer(right->first, node1);
        /* Redirect each moved first node's back-pointer to its new head. */
        if (node2)
                WRITE_ONCE(node2->pprev, &left->first);
        if (node1)
                WRITE_ONCE(node1->pprev, &right->first);
}

/*
 * Access the link pointers of an RCU protected hlist as __rcu pointers.
 *
 * Each macro expands to an lvalue of type "struct hlist_node __rcu *":
 *   hlist_first_rcu(head) aliases (head)->first,
 *   hlist_next_rcu(node)  aliases (node)->next,
 *   hlist_pprev_rcu(node) aliases the slot that (node)->pprev points to.
 */
#define hlist_first_rcu(head)        (*((struct hlist_node __rcu **)(&(head)->first)))
#define hlist_next_rcu(node)        (*((struct hlist_node __rcu **)(&(node)->next)))
#define hlist_pprev_rcu(node)        (*((struct hlist_node __rcu **)((node)->pprev)))

/**
 * hlist_add_head_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_head_rcu(struct hlist_node *n,
                                        struct hlist_head *h)
{
        struct hlist_node *first = h->first;

        /* Fill in both links of @n before it is published. */
        n->next = first;
        WRITE_ONCE(n->pprev, &h->first);
        /* Publish @n as the new first node of @h. */
        rcu_assign_pointer(hlist_first_rcu(h), n);
        /* The old first node, if any, is now reached through n->next. */
        if (first)
                WRITE_ONCE(first->pprev, &n->next);
}

/**
 * hlist_add_tail_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_tail_rcu(struct hlist_node *n,
                                      struct hlist_head *h)
{
        struct hlist_node *tail = h->first;

        /* Empty list: appending is the same as adding at the head. */
        if (!tail) {
                hlist_add_head_rcu(n, h);
                return;
        }

        /* This is update-side code, so the walk uses plain loads. */
        while (tail->next)
                tail = tail->next;

        /* Link @n behind the current last node, publishing it last. */
        n->next = tail->next;
        WRITE_ONCE(n->pprev, &tail->next);
        rcu_assign_pointer(hlist_next_rcu(tail), n);
}

/**
 * hlist_add_before_rcu
 * @n: the new element to add to the hash list.
 * @next: the existing element to add the new element before.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * before the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_before_rcu(struct hlist_node *n,
                                        struct hlist_node *next)
{
        /* @n takes over @next's back-pointer and gets @next as successor. */
        WRITE_ONCE(n->pprev, next->pprev);
        n->next = next;
        /*
         * Publish: hlist_pprev_rcu(n) is the slot n->pprev points to, i.e.
         * the one that referred to @next so far; make it refer to @n.
         */
        rcu_assign_pointer(hlist_pprev_rcu(n), n);
        /* @next is now reached through n->next. */
        WRITE_ONCE(next->pprev, &n->next);
}

/**
 * hlist_add_behind_rcu
 * @n: the new element to add to the hash list.
 * @prev: the existing element to add the new element after.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * after the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_behind_rcu(struct hlist_node *n,
                                        struct hlist_node *prev)
{
        /* Fill in both links of @n before it is published. */
        n->next = prev->next;
        WRITE_ONCE(n->pprev, &prev->next);
        /* Publish @n as the successor of @prev. */
        rcu_assign_pointer(hlist_next_rcu(prev), n);
        /* The old successor of @prev, if any, is now reached through n->next. */
        if (n->next)
                WRITE_ONCE(n->next->pprev, &n->next);
}

/*
 * __hlist_for_each_rcu - iterate over the nodes of an RCU protected hlist
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Both the first pointer and every next pointer are fetched with
 * rcu_dereference(); the loop ends when a NULL pointer is reached.
 */
#define __hlist_for_each_rcu(pos, head)                                \
        for (pos = rcu_dereference(hlist_first_rcu(head));        \
             pos;                                                \
             pos = rcu_dereference(hlist_next_rcu(pos)))

/**
 * hlist_for_each_entry_rcu - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 * @cond:        optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * The reader check is done once, by __list_check_rcu(), before the first
 * element is fetched; the link pointers themselves are loaded with
 * rcu_dereference_raw().  The loop ends when hlist_entry_safe() yields a
 * NULL cursor.
 */
#define hlist_for_each_entry_rcu(pos, head, member, cond...)                \
        for (__list_check_rcu(dummy, ## cond, 0),                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_srcu - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 * @cond:        lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * @cond is checked once, by __list_check_srcu(), before the first element
 * is fetched; the link pointers themselves are loaded with
 * rcu_dereference_raw().  The loop ends when hlist_entry_safe() yields a
 * NULL cursor.
 */
#define hlist_for_each_entry_srcu(pos, head, member, cond)                \
        for (__list_check_srcu(cond),                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing)
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * This is the same as hlist_for_each_entry_rcu() except that it does
 * not do any RCU debugging or tracing.
 *
 * Unlike hlist_for_each_entry_rcu() there is no __list_check_rcu() call and
 * no optional lockdep argument, and the link pointers are loaded with
 * rcu_dereference_raw_check() instead of rcu_dereference_raw().
 */
#define hlist_for_each_entry_rcu_notrace(pos, head, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock_bh(): the
 * first-element and next-element pointers are fetched with
 * rcu_dereference_bh().  The loop ends when @pos becomes NULL.
 */
#define hlist_for_each_entry_rcu_bh(pos, head, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * Iteration starts at the element that follows @pos; @pos itself is not
 * visited.  The initializer evaluates &(pos)->member, so @pos must point
 * to a valid entry on entry to the loop.  The loop ends when @pos
 * becomes NULL.
 */
#define hlist_for_each_entry_continue_rcu(pos, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
                        &(pos)->member)), typeof(*(pos)), member);        \
             pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * Same shape as hlist_for_each_entry_continue_rcu(), except that the
 * next-element pointer is fetched with rcu_dereference_bh().  Iteration
 * starts at the element that follows @pos; the initializer evaluates
 * &(pos)->member, so @pos must point to a valid entry on entry to the
 * loop.  The loop ends when @pos becomes NULL.
 */
#define hlist_for_each_entry_continue_rcu_bh(pos, member)                \
        for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(  \
                        &(pos)->member)), typeof(*(pos)), member);        \
             pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The loop has no initializer: the first element visited is @pos itself,
 * and the body is skipped entirely when @pos is NULL on entry.
 */
#define hlist_for_each_entry_from_rcu(pos, member)                        \
        for (; pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

#endif        /* __KERNEL__ */
#endif































































































































































































































































































    2 
    2 
























    2 





    2 
    2 


    2 
    2 



    2 



    2 
























































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "swap.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 *
 * These are the address_space operations that init_swap_address_space()
 * installs on every swap address_space.  The migrate_folio hook is only
 * built in when CONFIG_MIGRATION is enabled.
 */
static const struct address_space_operations swap_aops = {
        .writepage        = swap_writepage,
        .dirty_folio        = noop_dirty_folio,
#ifdef CONFIG_MIGRATION
        .migrate_folio        = migrate_folio,
#endif
};

/*
 * Per swap type: the array of address_spaces allocated by
 * init_swap_address_space(), and the number of entries in that array.
 */
struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
/* Read by swap_use_vma_readahead(); VMA based readahead is on by default. */
static bool enable_vma_readahead __read_mostly = true;

/*
 * A readahead value packs three fields into one unsigned long:
 *   - the bits selected by PAGE_MASK hold a page address,
 *   - the low SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) bits hold the hit count,
 *   - the remaining bits in between hold the readahead window.
 */
#define SWAP_RA_WIN_SHIFT        (PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK        ((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX        SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK        (~PAGE_MASK & ~SWAP_RA_HITS_MASK)

/* Field extractors for a packed readahead value. */
#define SWAP_RA_HITS(v)                ((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)                (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)                ((v) & PAGE_MASK)

/* Pack address, window and hits; each field is masked to its own bits. */
#define SWAP_RA_VAL(addr, win, hits)                                \
        (((addr) & PAGE_MASK) |                                        \
         (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |        \
         ((hits) & SWAP_RA_HITS_MASK))

/*
 * Initial readahead hits is 4 to start up with a small window.
 * Evaluates to the VMA's stored swap_readahead_info, or to 4 when the
 * stored value reads as zero.
 */
#define GET_SWAP_RA_VAL(vma)                                        \
        (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

/*
 * Global hit counter for the non-VMA readahead path: incremented in
 * swap_cache_get_folio() and fetched-and-reset in swapin_nr_pages().
 */
static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

/*
 * Print three lines of swap statistics: the number of pages in the swap
 * cache, the free swap and the total swap (both in kB).
 */
void show_swap_cache_info(void)
{
        unsigned long nr_cached = total_swapcache_pages();

        printk("%lu pages in swap cache\n", nr_cached);
        printk("Free swap  = %ldkB\n", K(get_nr_swap_pages()));
        printk("Total swap = %lukB\n", K(total_swap_pages));
}

/*
 * Load whatever is stored in the swap cache at @entry and return it only
 * if it is an XArray value entry (a shadow); otherwise return NULL.
 */
void *get_shadow_from_swap_cache(swp_entry_t entry)
{
        struct address_space *mapping = swap_address_space(entry);
        void *slot = xa_load(&mapping->i_pages, swp_offset(entry));

        return xa_is_value(slot) ? slot : NULL;
}

/*
 * add_to_swap_cache resembles filemap_add_folio on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 *
 * @folio:   locked, swap-backed folio that is not yet in the swap cache
 *           (all three conditions are asserted below).
 * @entry:   swap entry of the folio's first page; the folio occupies
 *           folio_nr_pages() consecutive slots starting there.
 * @gfp:     allocation flags handed to xas_nomem() for XArray nodes.
 * @shadowp: if non-NULL, receives a value (shadow) entry found in one of
 *           the slots being overwritten; left untouched if none is found.
 *
 * Returns 0 on success.  On failure the SwapCache flag and the extra
 * references are undone and the XArray error code is returned.
 */
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
                        gfp_t gfp, void **shadowp)
{
        struct address_space *address_space = swap_address_space(entry);
        pgoff_t idx = swp_offset(entry);
        XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
        unsigned long i, nr = folio_nr_pages(folio);
        void *old;

        xas_set_update(&xas, workingset_update_node);

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);

        /* One reference per slot; dropped again on the error path below. */
        folio_ref_add(folio, nr);
        folio_set_swapcache(folio);
        folio->swap = entry;

        /* Retry for as long as xas_nomem() reports that it is worth it. */
        do {
                xas_lock_irq(&xas);
                xas_create_range(&xas);
                if (xas_error(&xas))
                        goto unlock;
                for (i = 0; i < nr; i++) {
                        VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
                        if (shadowp) {
                                /* Report a shadow entry before overwriting it. */
                                old = xas_load(&xas);
                                if (xa_is_value(old))
                                        *shadowp = old;
                        }
                        xas_store(&xas, folio);
                        xas_next(&xas);
                }
                /* Accounting is updated under the same lock as the stores. */
                address_space->nrpages += nr;
                __node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
                __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
unlock:
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));

        if (!xas_error(&xas))
                return 0;

        /* Insertion failed: roll back the flag and the references taken above. */
        folio_clear_swapcache(folio);
        folio_ref_sub(folio, nr);
        return xas_error(&xas);
}

/*
 * Replace every swap cache slot occupied by @folio with @shadow and
 * update the swap cache accounting.
 *
 * This must be called only on folios that have been verified to be in
 * the swap cache.  The folio must be locked and not under writeback
 * (asserted below).  No lock is taken here; delete_from_swap_cache()
 * calls this with the i_pages lock held.
 */
void __delete_from_swap_cache(struct folio *folio,
                        swp_entry_t entry, void *shadow)
{
        struct address_space *mapping = swap_address_space(entry);
        long nr_pages = folio_nr_pages(folio);
        XA_STATE(xas, &mapping->i_pages, swp_offset(entry));
        int slot;

        xas_set_update(&xas, workingset_update_node);

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);

        for (slot = 0; slot < nr_pages; slot++) {
                /* Every slot in the range is expected to hold this folio. */
                void *old = xas_store(&xas, shadow);

                VM_BUG_ON_PAGE(old != folio, old);
                xas_next(&xas);
        }

        folio->swap.val = 0;
        folio_clear_swapcache(folio);
        mapping->nrpages -= nr_pages;
        __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
        __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
}

/**
 * add_to_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: Whether the folio was added to the swap cache.
 */
bool add_to_swap(struct folio *folio)
{
        swp_entry_t entry;
        int err;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

        entry = folio_alloc_swap(folio);
        if (!entry.val)
                return false;

        /*
         * XArray node allocations from PF_MEMALLOC contexts could
         * completely exhaust the page allocator. __GFP_NOMEMALLOC
         * stops emergency reserves from being allocated.
         *
         * TODO: this could cause a theoretical memory reclaim
         * deadlock in the swap out path.
         */
        /*
         * Add it to the swap cache.
         */
        err = add_to_swap_cache(folio, entry,
                        __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
        if (err)
                /*
                 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
                 * clear SWAP_HAS_CACHE flag.
                 */
                goto fail;
        /*
         * Normally the folio will be dirtied in unmap because its
         * pte should be dirty. A special case is MADV_FREE page. The
         * page's pte could have dirty bit cleared but the folio's
         * SwapBacked flag is still set because clearing the dirty bit
         * and SwapBacked flag has no lock protected. For such folio,
         * unmap will not set dirty bit for it, so folio reclaim will
         * not write the folio out. This can cause data corruption when
         * the folio is swapped in later. Always setting the dirty flag
         * for the folio solves the problem.
         */
        folio_mark_dirty(folio);

        return true;

fail:
        put_swap_folio(folio, entry);
        return false;
}

/*
 * Remove a folio from the swap cache, release its swap entry and drop
 * the references the swap cache held on it.
 *
 * This must be called only on folios that have been verified to be in
 * the swap cache and locked.  It will never put the folio into the free
 * list, the caller has a reference on the folio.
 */
void delete_from_swap_cache(struct folio *folio)
{
        swp_entry_t entry = folio->swap;
        struct address_space *mapping = swap_address_space(entry);

        xa_lock_irq(&mapping->i_pages);
        __delete_from_swap_cache(folio, entry, NULL);
        xa_unlock_irq(&mapping->i_pages);

        put_swap_folio(folio, entry);
        folio_ref_sub(folio, folio_nr_pages(folio));
}

/*
 * Erase every value (shadow) entry stored for swap type @type at offsets
 * @begin through @end.  The range is processed one swap address_space at
 * a time, each under its own i_pages lock.  Non-value entries are left
 * alone.
 */
void clear_shadow_from_swap_cache(int type, unsigned long begin,
                                unsigned long end)
{
        unsigned long curr = begin;
        void *old;

        do {
                swp_entry_t entry = swp_entry(type, curr);
                struct address_space *mapping = swap_address_space(entry);
                XA_STATE(xas, &mapping->i_pages, curr);

                xas_set_update(&xas, workingset_update_node);

                xa_lock_irq(&mapping->i_pages);
                xas_for_each(&xas, old, end) {
                        if (xa_is_value(old))
                                xas_store(&xas, NULL);
                }
                xa_unlock_irq(&mapping->i_pages);

                /* search the next swapcache until we meet end */
                curr = ((curr >> SWAP_ADDRESS_SPACE_SHIFT) + 1)
                                << SWAP_ADDRESS_SPACE_SHIFT;
        } while (curr <= end);
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * Nothing is done unless the folio is in the swap cache, is not mapped
 * and can be locked without blocking.
 *
 * Its ok to check the swapcache flag without the folio lock
 * here because we are going to recheck again inside
 * folio_free_swap() _with_ the lock.
 *                                         - Marcelo
 */
void free_swap_cache(struct folio *folio)
{
        if (!folio_test_swapcache(folio))
                return;
        if (folio_mapped(folio))
                return;
        if (!folio_trylock(folio))
                return;

        folio_free_swap(folio);
        folio_unlock(folio);
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.  The reference is not
 * dropped when the page belongs to the huge zero folio.
 */
void free_page_and_swap_cache(struct page *page)
{
        struct folio *folio = page_folio(page);

        free_swap_cache(folio);
        if (is_huge_zero_folio(folio))
                return;
        folio_put(folio);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 *
 * One reference is dropped per page, unless the encoded page carries
 * ENCODED_PAGE_BIT_NR_PAGES_NEXT: then the following array element is
 * consumed as the number of references to drop for that page.
 */
void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
        struct folio_batch folios;
        unsigned int refs[PAGEVEC_SIZE];

        lru_add_drain();
        folio_batch_init(&folios);
        for (int i = 0; i < nr; i++) {
                struct encoded_page *enc = pages[i];
                struct folio *folio = page_folio(encoded_page_ptr(enc));
                unsigned int nr_refs = 1;

                free_swap_cache(folio);
                if (unlikely(encoded_page_flags(enc) &
                             ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                        nr_refs = encoded_nr_pages(pages[++i]);
                refs[folios.nr] = nr_refs;

                /* Flush the batch as soon as it has no room left. */
                if (!folio_batch_add(&folios, folio))
                        folios_put_refs(&folios, refs);
        }
        if (folios.nr)
                folios_put_refs(&folios, refs);
}

static inline bool swap_use_vma_readahead(void)
{
        return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Lookup a swap entry in the swap cache. A found folio will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the folio
 * lock before returning.  NULL is returned when the lookup fails.
 *
 * For small folios the readahead flag is tested and cleared, and the
 * readahead statistics (per VMA when @vma is given and VMA readahead is
 * in use, global otherwise) are updated.
 *
 * Caller must lock the swap device or hold a reference to keep it valid.
 */
struct folio *swap_cache_get_folio(swp_entry_t entry,
                struct vm_area_struct *vma, unsigned long addr)
{
        struct folio *folio;
        bool use_vma_ra, ra_hit;

        folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
        if (IS_ERR(folio))
                return NULL;

        use_vma_ra = swap_use_vma_readahead();

        /*
         * At the moment, we don't support PG_readahead for anon THP
         * so let's bail out rather than confusing the readahead stat.
         */
        if (unlikely(folio_test_large(folio)))
                return folio;

        ra_hit = folio_test_clear_readahead(folio);
        if (!vma)
                use_vma_ra = false;

        if (use_vma_ra) {
                unsigned long ra_val = GET_SWAP_RA_VAL(vma);
                int win = SWAP_RA_WIN(ra_val);
                int hits = SWAP_RA_HITS(ra_val);

                if (ra_hit)
                        hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
                atomic_long_set(&vma->swap_readahead_info,
                                SWAP_RA_VAL(addr, win, hits));
        }

        if (ra_hit) {
                count_vm_event(SWAP_RA_HIT);
                if (!use_vma_ra)
                        atomic_inc(&swapin_readahead_hits);
        }

        return folio;
}

/**
 * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
 * @mapping: The address_space to search.
 * @index: The page cache index.
 *
 * This differs from filemap_get_folio() in that it will also look for the
 * folio in the swap cache.
 *
 * Return: The found folio, or ERR_PTR(-ENOENT) when nothing is stored at
 * @index, when a value entry is found in a non-shmem mapping, when the
 * value is a non-swap entry, or when the swap device cannot be pinned.
 * Otherwise the result of the swap cache lookup with filemap_get_folio()
 * is returned as is.  This function never returns %NULL itself.
 */
struct folio *filemap_get_incore_folio(struct address_space *mapping,
                pgoff_t index)
{
        swp_entry_t swp;
        struct swap_info_struct *si;
        struct folio *folio = filemap_get_entry(mapping, index);

        if (!folio)
                return ERR_PTR(-ENOENT);
        /* A real folio was found in the page cache. */
        if (!xa_is_value(folio))
                return folio;
        /* Only shmem mappings have their value entries decoded as swap entries here. */
        if (!shmem_mapping(mapping))
                return ERR_PTR(-ENOENT);

        swp = radix_to_swp_entry(folio);
        /* There might be swapin error entries in shmem mapping. */
        if (non_swap_entry(swp))
                return ERR_PTR(-ENOENT);
        /* Prevent swapoff from happening to us */
        si = get_swap_device(swp);
        if (!si)
                return ERR_PTR(-ENOENT);
        index = swp_offset(swp);
        folio = filemap_get_folio(swap_address_space(swp), index);
        put_swap_device(si);
        return folio;
}

/*
 * Find the folio for @entry in the swap cache, or allocate a new one and
 * insert it.
 *
 * @entry:              swap entry to look up.
 * @gfp_mask:           allocation flags for the new folio; the
 *                      GFP_RECLAIM_MASK subset is used for the swap cache
 *                      insertion.
 * @mpol, @ilx:         memory policy and interleave index passed to
 *                      alloc_pages_mpol().
 * @new_page_allocated: set to true only when a new folio was allocated
 *                      and added to the swap cache; false otherwise.
 * @skip_if_exists:     give up instead of waiting when swapcache_prepare()
 *                      returns -EEXIST.
 *
 * A reference on the swap device is held for the duration of the call.
 * A newly allocated folio is returned locked and the caller is expected
 * to start the read into it.
 *
 * Returns the folio, or NULL on failure.
 */
struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
                bool skip_if_exists)
{
        struct swap_info_struct *si;
        struct folio *folio;
        void *shadow = NULL;

        *new_page_allocated = false;
        si = get_swap_device(entry);
        if (!si)
                return NULL;

        for (;;) {
                int err;
                /*
                 * First check the swap cache.  Since this is normally
                 * called after swap_cache_get_folio() failed, re-calling
                 * that would confuse statistics.
                 */
                folio = filemap_get_folio(swap_address_space(entry),
                                                swp_offset(entry));
                if (!IS_ERR(folio))
                        goto got_folio;

                /*
                 * Just skip read ahead for unused swap slot.
                 * During swap_off when swap_slot_cache is disabled,
                 * we have to handle the race between putting
                 * swap entry in swap cache and marking swap slot
                 * as SWAP_HAS_CACHE.  That's done in later part of code or
                 * else swap_off will be aborted if we return NULL.
                 */
                if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
                        goto fail_put_swap;

                /*
                 * Get a new folio to read into from swap.  Allocate it now,
                 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
                 * cause any racers to loop around until we add it to cache.
                 */
                folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0,
                                                mpol, ilx, numa_node_id());
                if (!folio)
                        goto fail_put_swap;

                /*
                 * Swap entry may have been freed since our caller observed it.
                 */
                err = swapcache_prepare(entry);
                if (!err)
                        break;

                /* Preparation failed: the folio allocated above is not needed. */
                folio_put(folio);
                if (err != -EEXIST)
                        goto fail_put_swap;

                /*
                 * Protect against a recursive call to __read_swap_cache_async()
                 * on the same entry waiting forever here because SWAP_HAS_CACHE
                 * is set but the folio is not the swap cache yet. This can
                 * happen today if mem_cgroup_swapin_charge_folio() below
                 * triggers reclaim through zswap, which may call
                 * __read_swap_cache_async() in the writeback path.
                 */
                if (skip_if_exists)
                        goto fail_put_swap;

                /*
                 * We might race against __delete_from_swap_cache(), and
                 * stumble across a swap_map entry whose SWAP_HAS_CACHE
                 * has not yet been cleared.  Or race against another
                 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
                 * in swap_map, but not yet added its folio to swap cache.
                 */
                schedule_timeout_uninterruptible(1);
        }

        /*
         * The swap entry is ours to swap in. Prepare the new folio.
         */

        __folio_set_locked(folio);
        __folio_set_swapbacked(folio);

        if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
                goto fail_unlock;

        /* May fail (-ENOMEM) if XArray node allocation failed. */
        if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
                goto fail_unlock;

        mem_cgroup_swapin_uncharge_swap(entry);

        /* A shadow entry was replaced by the insertion: report the refault. */
        if (shadow)
                workingset_refault(folio, shadow);

        /* Caller will initiate read into locked folio */
        folio_add_lru(folio);
        *new_page_allocated = true;
got_folio:
        put_swap_device(si);
        return folio;

fail_unlock:
        /* Undo swapcache_prepare(), then release the folio we allocated. */
        put_swap_folio(folio, entry);
        folio_unlock(folio);
        folio_put(folio);
fail_put_swap:
        put_swap_device(si);
        return NULL;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 *
 * The memory policy is looked up from @vma and @addr.  A read is queued
 * (through @plug) only when a new folio had to be allocated.
 *
 * get/put_swap_device() aren't needed to call this function, because
 * __read_swap_cache_async() call them and swap_read_folio() holds the
 * swap cache folio lock.
 */
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                struct vm_area_struct *vma, unsigned long addr,
                struct swap_iocb **plug)
{
        struct mempolicy *mpol;
        struct folio *folio;
        bool newly_allocated;
        pgoff_t ilx;

        mpol = get_vma_policy(vma, addr, 0, &ilx);
        folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
                                        &newly_allocated, false);
        mpol_cond_put(mpol);

        if (!newly_allocated)
                return folio;

        swap_read_folio(folio, false, plug);
        return folio;
}

/*
 * Compute how many pages to read ahead.
 *
 * @prev_offset: swap offset recorded from an earlier call.
 * @offset:      swap offset being faulted now.
 * @hits:        readahead hits counted since the previous call.
 * @max_pages:   upper bound applied to the computed window.
 * @prev_win:    previous window size; the result never drops below
 *               half of it.
 *
 * With hits, the window is hits + 2 rounded up to a power of two (at
 * least 4).  Without hits it is 2 when @offset is adjacent to
 * @prev_offset and 1 otherwise.  The @max_pages cap is applied before the
 * @prev_win / 2 floor, so the floor wins when the two conflict.
 */
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
                                      unsigned long offset,
                                      int hits,
                                      int max_pages,
                                      int prev_win)
{
        /*
         * This heuristic has been found to work well on both sequential and
         * random loads, swapping to hard disk or to SSD: please don't ask
         * what the "+ 2" means, it just happens to work well, that's all.
         */
        unsigned int want = hits + 2;
        /* Don't shrink readahead too fast */
        unsigned int min_keep = prev_win / 2;

        if (want != 2) {
                unsigned int pow2;

                for (pow2 = 4; pow2 < want; pow2 <<= 1)
                        ;
                want = pow2;
        } else if (offset != prev_offset + 1 && offset != prev_offset - 1) {
                /*
                 * We can have no readahead hits to judge by: but must not get
                 * stuck here forever, so check for an adjacent offset instead
                 * (and don't even bother to check whether swap type is same).
                 */
                want = 1;
        }

        if (want > max_pages)
                want = max_pages;
        if (want < min_keep)
                want = min_keep;

        return want;
}

/*
 * Size the readahead window for the non-VMA readahead path.
 *
 * Returns 1 when page_cluster allows no readahead.  Otherwise the global
 * hit counter is fetched and reset, the window is computed by
 * __swapin_nr_pages(), and the result is remembered for the next call.
 * The previous offset is only updated when there were no hits.
 */
static unsigned long swapin_nr_pages(unsigned long offset)
{
        static unsigned long prev_offset;
        static atomic_t last_readahead_pages;
        unsigned int max_pages = 1 << READ_ONCE(page_cluster);
        unsigned int hits, pages;

        if (max_pages <= 1)
                return 1;

        hits = atomic_xchg(&swapin_readahead_hits, 0);
        pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
                                  max_pages,
                                  atomic_read(&last_readahead_pages));
        if (hits == 0)
                WRITE_ONCE(prev_offset, offset);
        atomic_set(&last_readahead_pages, pages);

        return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 *
 * Returns the struct folio for @entry, after queueing swapin, or NULL if
 * the final lookup/allocation for @entry fails.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * swapin_nr_pages() entries (at most 1 << page_cluster) in the swap
 * area. This method is chosen because it doesn't cost us any seek time.
 * We also make sure to queue the 'original' request together with the
 * readahead ones...
 *
 * Note: it is intentional that the same NUMA policy and interleave index
 * are used for every page of the readahead: neighbouring pages on swap
 * are fairly likely to have been swapped out from the same node.
 */
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
                                    struct mempolicy *mpol, pgoff_t ilx)
{
        struct folio *folio;
        unsigned long entry_offset = swp_offset(entry);
        unsigned long offset = entry_offset;
        unsigned long start_offset, end_offset;
        unsigned long mask;
        struct swap_info_struct *si = swp_swap_info(entry);
        struct blk_plug plug;
        struct swap_iocb *splug = NULL;
        bool page_allocated;

        /* A window of one page means there is nothing to read ahead. */
        mask = swapin_nr_pages(offset) - 1;
        if (!mask)
                goto skip;

        /* Read a page_cluster sized and aligned cluster around offset. */
        start_offset = offset & ~mask;
        end_offset = offset | mask;
        if (!start_offset)        /* First page is swap header. */
                start_offset++;
        if (end_offset >= si->max)
                end_offset = si->max - 1;

        blk_start_plug(&plug);
        for (offset = start_offset; offset <= end_offset ; offset++) {
                /* Ok, do the async read-ahead now */
                folio = __read_swap_cache_async(
                                swp_entry(swp_type(entry), offset),
                                gfp_mask, mpol, ilx, &page_allocated, false);
                if (!folio)
                        continue;
                if (page_allocated) {
                        swap_read_folio(folio, false, &splug);
                        /* Only pages other than the faulting one count as readahead. */
                        if (offset != entry_offset) {
                                folio_set_readahead(folio);
                                count_vm_event(SWAP_RA);
                        }
                }
                folio_put(folio);
        }
        blk_finish_plug(&plug);
        swap_read_unplug(splug);
        lru_add_drain();        /* Push any new pages onto the LRU now */
skip:
        /* The page was likely read above, so no need for plugging here */
        folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
                                        &page_allocated, false);
        if (unlikely(page_allocated)) {
                zswap_folio_swapin(folio);
                swap_read_folio(folio, false, NULL);
        }
        return folio;
}

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
        struct address_space *spaces, *space;
        unsigned int i, nr;

        nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
        spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
        if (!spaces)
                return -ENOMEM;
        for (i = 0; i < nr; i++) {
                space = spaces + i;
                xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
                atomic_set(&space->i_mmap_writable, 0);
                space->a_ops = &swap_aops;
                /* swap cache doesn't use writeback related tags */
                mapping_set_no_writeback_tags(space);
        }
        nr_swapper_spaces[type] = nr;
        swapper_spaces[type] = spaces;

        return 0;
}

/*
 * Tear down the swap-cache address spaces of swap area @type.
 *
 * Warns (once) for any mapping that is not empty, frees the array and
 * clears the swapper_spaces[] / nr_swapper_spaces[] slots.
 */
void exit_swap_address_space(unsigned int type)
{
        struct address_space *spaces = swapper_spaces[type];
        int idx = 0;

        while (idx < nr_swapper_spaces[type]) {
                VM_WARN_ON_ONCE(!mapping_empty(&spaces[idx]));
                idx++;
        }

        kvfree(spaces);
        nr_swapper_spaces[type] = 0;
        swapper_spaces[type] = NULL;
}

/*
 * Cap on the order of the VMA readahead window: swap_ra_info() computes
 * the maximum window as 1 << min(page_cluster, SWAP_RA_ORDER_CEILING),
 * i.e. at most 1 << 5 = 32 pages.
 */
#define SWAP_RA_ORDER_CEILING        5

/*
 * Readahead window description filled in by swap_ra_info() and consumed
 * by swap_vma_readahead().
 */
struct vma_swap_readahead {
        /* Window size in pages; 1 makes swap_vma_readahead() skip readahead. */
        unsigned short win;
        /* Index of the faulting page inside the window (fault pfn - start pfn). */
        unsigned short offset;
        /* Number of PTEs in the window after clamping (end pfn - start pfn). */
        unsigned short nr_pte;
};

/*
 * Compute the VMA-based readahead window for the fault described by @vmf
 * and store it in @ra_info.
 *
 * The window size comes from __swapin_nr_pages() and is written back,
 * together with the fault address and a zero hit count, into
 * vma->swap_readahead_info. When the window is a single page only
 * @ra_info->win is set. Otherwise the window is placed relative to the
 * fault pfn and clamped to both the vma and the PMD containing the fault.
 */
static void swap_ra_info(struct vm_fault *vmf,
                         struct vma_swap_readahead *ra_info)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long fault_addr, fault_pfn, last_pfn, packed;
        unsigned long lo_pfn, hi_pfn;
        unsigned long first, limit;
        unsigned int cap, nr_hits, last_win, cur_win;

        cap = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
                         SWAP_RA_ORDER_CEILING);
        if (cap == 1) {
                ra_info->win = 1;
                return;
        }

        fault_addr = vmf->address;
        fault_pfn = PFN_DOWN(fault_addr);

        /* Unpack the address, window and hit count kept for this vma. */
        packed = GET_SWAP_RA_VAL(vma);
        last_pfn = PFN_DOWN(SWAP_RA_ADDR(packed));
        last_win = SWAP_RA_WIN(packed);
        nr_hits = SWAP_RA_HITS(packed);

        cur_win = __swapin_nr_pages(last_pfn, fault_pfn, nr_hits,
                                    cap, last_win);
        ra_info->win = cur_win;
        atomic_long_set(&vma->swap_readahead_info,
                        SWAP_RA_VAL(fault_addr, cur_win, 0));
        if (cur_win == 1)
                return;

        if (fault_pfn == last_pfn + 1) {
                /* Fault is one page above the recorded one: window goes up. */
                lo_pfn = fault_pfn;
                hi_pfn = fault_pfn + cur_win;
        } else if (last_pfn == fault_pfn + 1) {
                /* Fault is one page below the recorded one: window goes down. */
                lo_pfn = fault_pfn - cur_win + 1;
                hi_pfn = fault_pfn + 1;
        } else {
                /* Otherwise spread the window around the faulting page. */
                unsigned int before = (cur_win - 1) / 2;

                lo_pfn = fault_pfn - before;
                hi_pfn = fault_pfn + cur_win - before;
        }

        /* Keep the window inside the vma and inside the fault's PMD. */
        first = max3(lo_pfn, PFN_DOWN(vma->vm_start),
                     PFN_DOWN(fault_addr & PMD_MASK));
        limit = min3(hi_pfn, PFN_DOWN(vma->vm_end),
                     PFN_DOWN((fault_addr & PMD_MASK) + PMD_SIZE));

        ra_info->offset = fault_pfn - first;
        ra_info->nr_pte = limit - first;
}

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @targ_entry: swap entry of the targeted memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 * @vmf: fault information
 *
 * Returns the struct folio for entry and addr, after queueing swapin.
 * The value is whatever the final __read_swap_cache_async() call on
 * @targ_entry returns.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 * The window is computed by swap_ra_info(); a window of one page skips
 * the readahead loop and only reads @targ_entry.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 *
 */
static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
                struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
{
        struct blk_plug plug;
        struct swap_iocb *splug = NULL;
        struct folio *folio;
        pte_t *pte = NULL, pentry;
        unsigned long addr;
        swp_entry_t entry;
        pgoff_t ilx;
        unsigned int i;
        bool page_allocated;
        struct vma_swap_readahead ra_info = {
                .win = 1,
        };

        swap_ra_info(vmf, &ra_info);
        if (ra_info.win == 1)
                goto skip;

        /* Rewind address and interleave index to the first page of the window. */
        addr = vmf->address - (ra_info.offset * PAGE_SIZE);
        ilx = targ_ilx - ra_info.offset;

        blk_start_plug(&plug);
        for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) {
                /*
                 * pte is NULL on the first pass and after every unmap below,
                 * in which case it is (re)mapped for the current addr;
                 * otherwise the post-increment steps to the next entry.
                 */
                if (!pte++) {
                        pte = pte_offset_map(vmf->pmd, addr);
                        if (!pte)
                                break;
                }
                pentry = ptep_get_lockless(pte);
                /* Only genuine swap entries are candidates for readahead. */
                if (!is_swap_pte(pentry))
                        continue;
                entry = pte_to_swp_entry(pentry);
                if (unlikely(non_swap_entry(entry)))
                        continue;
                /*
                 * Drop the mapping before issuing the read.
                 * NOTE(review): presumably because the call below may
                 * sleep - confirm.
                 */
                pte_unmap(pte);
                pte = NULL;
                folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
                                                &page_allocated, false);
                if (!folio)
                        continue;
                if (page_allocated) {
                        swap_read_folio(folio, false, &splug);
                        /* The faulting page itself is not counted as readahead. */
                        if (i != ra_info.offset) {
                                folio_set_readahead(folio);
                                count_vm_event(SWAP_RA);
                        }
                }
                folio_put(folio);
        }
        /* The loop may end with a mapping still held (last pass hit "continue"). */
        if (pte)
                pte_unmap(pte);
        blk_finish_plug(&plug);
        swap_read_unplug(splug);
        lru_add_drain();
skip:
        /* The folio was likely read above, so no need for plugging here */
        folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
                                        &page_allocated, false);
        if (unlikely(page_allocated)) {
                zswap_folio_swapin(folio);
                swap_read_folio(folio, false, NULL);
        }
        return folio;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Main entry point for swap readahead. Looks up the memory policy for the
 * faulting address, then dispatches to VMA based readahead (around the
 * faulting virtual address) when swap_use_vma_readahead() says so, or to
 * cluster based readahead (around the swap offset) otherwise.
 *
 * Returns the struct page for @entry within the folio obtained after
 * queueing swapin, or NULL if no folio was obtained.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                                struct vm_fault *vmf)
{
        struct folio *folio;
        struct mempolicy *mpol;
        pgoff_t ilx;

        mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
        if (swap_use_vma_readahead())
                folio = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
        else
                folio = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
        mpol_cond_put(mpol);

        return folio ? folio_file_page(folio, swp_offset(entry)) : NULL;
}

#ifdef CONFIG_SYSFS
/* sysfs show: emit "true" or "false" according to enable_vma_readahead. */
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
                                     struct kobj_attribute *attr, char *buf)
{
        const char *state = "false";

        if (enable_vma_readahead)
                state = "true";

        return sysfs_emit(buf, "%s\n", state);
}
/*
 * sysfs store: parse @buf as a boolean into enable_vma_readahead.
 * Returns @count on success or the kstrtobool() error code.
 */
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
                                      struct kobj_attribute *attr,
                                      const char *buf, size_t count)
{
        ssize_t err = kstrtobool(buf, &enable_vma_readahead);

        if (!err)
                return count;

        return err;
}
/* Read/write attribute named "vma_ra_enabled". */
static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);

/* NULL-terminated list of the attributes in the swap group. */
static struct attribute *swap_attrs[] = {
        &vma_ra_enabled_attr.attr,
        NULL,
};

/* Group registered on the "swap" kobject by swap_init_sysfs(). */
static const struct attribute_group swap_attr_group = {
        .attrs = swap_attrs,
};

/*
 * Create the "swap" kobject under mm_kobj and attach swap_attr_group to it.
 *
 * Return: 0 on success, -ENOMEM if the kobject cannot be created, or the
 * sysfs_create_group() error (after dropping the kobject) otherwise.
 */
static int __init swap_init_sysfs(void)
{
        struct kobject *kobj = kobject_create_and_add("swap", mm_kobj);
        int rc;

        if (!kobj) {
                pr_err("failed to create swap kobject\n");
                return -ENOMEM;
        }

        rc = sysfs_create_group(kobj, &swap_attr_group);
        if (!rc)
                return 0;

        pr_err("failed to register swap group\n");
        kobject_put(kobj);
        return rc;
}
subsys_initcall(swap_init_sysfs);
#endif































































































































































































    1 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 
























    1 





    1 





    1 














    1 





    1 








    1 





    1 






































    1 




    1 



































































































    1 
















    1 


    1 





    1 


    1 



























    1 

    1 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 */

#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/nls.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"

// clang-format off
/*
 * Names of NTFS metadata files as struct cpu_str initializers.
 * In every entry the first value equals the number of characters listed
 * in the braces; the second value is always 0.
 * NOTE(review): struct cpu_str is declared outside this chunk; the first
 * field is presumably the name length - confirm against its definition.
 */
const struct cpu_str NAME_MFT = {
        4, 0, { '$', 'M', 'F', 'T' },
};
/* "$MFTMirr" */
const struct cpu_str NAME_MIRROR = {
        8, 0, { '$', 'M', 'F', 'T', 'M', 'i', 'r', 'r' },
};
const struct cpu_str NAME_LOGFILE = {
        8, 0, { '$', 'L', 'o', 'g', 'F', 'i', 'l', 'e' },
};
const struct cpu_str NAME_VOLUME = {
        7, 0, { '$', 'V', 'o', 'l', 'u', 'm', 'e' },
};
const struct cpu_str NAME_ATTRDEF = {
        8, 0, { '$', 'A', 't', 't', 'r', 'D', 'e', 'f' },
};
/* The root name is the single character ".". */
const struct cpu_str NAME_ROOT = {
        1, 0, { '.' },
};
const struct cpu_str NAME_BITMAP = {
        7, 0, { '$', 'B', 'i', 't', 'm', 'a', 'p' },
};
const struct cpu_str NAME_BOOT = {
        5, 0, { '$', 'B', 'o', 'o', 't' },
};
const struct cpu_str NAME_BADCLUS = {
        8, 0, { '$', 'B', 'a', 'd', 'C', 'l', 'u', 's' },
};
const struct cpu_str NAME_QUOTA = {
        6, 0, { '$', 'Q', 'u', 'o', 't', 'a' },
};
const struct cpu_str NAME_SECURE = {
        7, 0, { '$', 'S', 'e', 'c', 'u', 'r', 'e' },
};
const struct cpu_str NAME_UPCASE = {
        7, 0, { '$', 'U', 'p', 'C', 'a', 's', 'e' },
};
const struct cpu_str NAME_EXTEND = {
        7, 0, { '$', 'E', 'x', 't', 'e', 'n', 'd' },
};
const struct cpu_str NAME_OBJID = {
        6, 0, { '$', 'O', 'b', 'j', 'I', 'd' },
};
const struct cpu_str NAME_REPARSE = {
        8, 0, { '$', 'R', 'e', 'p', 'a', 'r', 's', 'e' },
};
const struct cpu_str NAME_USNJRNL = {
        8, 0, { '$', 'U', 's', 'n', 'J', 'r', 'n', 'l' },
};
/*
 * Short names spelled as little-endian 16-bit units (one cpu_to_le16()
 * per character). Each array is exactly as long as its name, so there is
 * no terminating NUL.
 */
/* "$Bad" */
const __le16 BAD_NAME[4] = {
        cpu_to_le16('$'), cpu_to_le16('B'), cpu_to_le16('a'), cpu_to_le16('d'),
};
/* "$I30" */
const __le16 I30_NAME[4] = {
        cpu_to_le16('$'), cpu_to_le16('I'), cpu_to_le16('3'), cpu_to_le16('0'),
};
/* "$SII" */
const __le16 SII_NAME[4] = {
        cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('I'), cpu_to_le16('I'),
};
/* "$SDH" */
const __le16 SDH_NAME[4] = {
        cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('D'), cpu_to_le16('H'),
};
/* "$SDS" */
const __le16 SDS_NAME[4] = {
        cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('D'), cpu_to_le16('S'),
};
/* "$O" */
const __le16 SO_NAME[2] = {
        cpu_to_le16('$'), cpu_to_le16('O'),
};
/* "$Q" */
const __le16 SQ_NAME[2] = {
        cpu_to_le16('$'), cpu_to_le16('Q'),
};
/* "$R" */
const __le16 SR_NAME[2] = {
        cpu_to_le16('$'), cpu_to_le16('R'),
};

#ifdef CONFIG_NTFS3_LZX_XPRESS
/*
 * "WofCompressedData" as 17 little-endian 16-bit units, no terminating
 * NUL. Only built when CONFIG_NTFS3_LZX_XPRESS is enabled.
 */
const __le16 WOF_NAME[17] = {
        cpu_to_le16('W'), cpu_to_le16('o'), cpu_to_le16('f'), cpu_to_le16('C'),
        cpu_to_le16('o'), cpu_to_le16('m'), cpu_to_le16('p'), cpu_to_le16('r'),
        cpu_to_le16('e'), cpu_to_le16('s'), cpu_to_le16('s'), cpu_to_le16('e'),
        cpu_to_le16('d'), cpu_to_le16('D'), cpu_to_le16('a'), cpu_to_le16('t'),
        cpu_to_le16('a'),
};
#endif

/*
 * File-local three-character names CON, NUL, AUX, PRN, COM and LPT as
 * little-endian 16-bit units.
 * NOTE(review): these spell the reserved DOS/Windows device names and are
 * presumably used for file-name validation; the users are outside this
 * chunk - confirm.
 */
static const __le16 CON_NAME[3] = {
        cpu_to_le16('C'), cpu_to_le16('O'), cpu_to_le16('N'),
};

static const __le16 NUL_NAME[3] = {
        cpu_to_le16('N'), cpu_to_le16('U'), cpu_to_le16('L'),
};

static const __le16 AUX_NAME[3] = {
        cpu_to_le16('A'), cpu_to_le16('U'), cpu_to_le16('X'),
};

static const __le16 PRN_NAME[3] = {
        cpu_to_le16('P'), cpu_to_le16('R'), cpu_to_le16('N'),
};

static const __le16 COM_NAME[3] = {
        cpu_to_le16('C'), cpu_to_le16('O'), cpu_to_le16('M'),
};

static const __le16 LPT_NAME[3] = {
        cpu_to_le16('L'), cpu_to_le16('P'), cpu_to_le16('T'),
};

// clang-format on

/*
 * ntfs_fix_pre_write - Insert fixups into @rhdr before writing to disk.
 *
 * The first entry of the fixup array (at offset fix_off) holds a sequence
 * value. It is incremented, restarting at 1 once it has reached 0x7FFF.
 * For each of the remaining fix_num - 1 entries, the last 16-bit word of
 * the corresponding sector is saved into the entry and replaced by the
 * sequence value.
 *
 * Return: false if fix_off is odd, the fixup array does not fit within
 * the first SECTOR_SIZE bytes, fix_num is zero, or the covered sectors
 * exceed @bytes; true otherwise.
 */
bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes)
{
        u16 fo = le16_to_cpu(rhdr->fix_off);
        u16 fn = le16_to_cpu(rhdr->fix_num);
        u16 *fixup, *tail;
        u16 seq;

        if (fo & 1)
                return false;
        if (fo + fn * sizeof(short) > SECTOR_SIZE)
                return false;

        /* Entry 0 is the sequence value; only the rest map to sectors. */
        if (!fn)
                return false;
        fn -= 1;
        if (fn * SECTOR_SIZE > bytes)
                return false;

        /* Get fixup pointer. */
        fixup = Add2Ptr(rhdr, fo);

        seq = *fixup >= 0x7FFF ? 1 : *fixup + 1;
        *fixup = seq;

        /* Start at the last 16-bit word of the first sector. */
        tail = Add2Ptr(rhdr, SECTOR_SIZE - sizeof(short));

        for (; fn; fn--) {
                fixup++;
                *fixup = *tail;
                *tail = seq;
                tail += SECTOR_SIZE / sizeof(short);
        }
        return true;
}

/*
 * ntfs_fix_post_read - Remove fixups after reading from disk.
 *
 * When @simple is true the number of fixup entries is derived from @bytes
 * ((bytes >> SECTOR_SHIFT) + 1) instead of being read from the header.
 * The last 16-bit word of every covered sector is compared with the
 * sequence value and then restored from the fixup array. A mismatch does
 * not stop the loop; the remaining sectors are still restored.
 *
 * Return: 0 if ok, -E_NTFS_CORRUPT if the fixup header fails validation,
 * -E_NTFS_FIXUP if at least one sector did not carry the sequence value.
 */
int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes,
                       bool simple)
{
        u16 *fixup, *tail;
        u16 seq, fo, fn;
        int ret = 0;

        fo = le16_to_cpu(rhdr->fix_off);
        if (simple)
                fn = (bytes >> SECTOR_SHIFT) + 1;
        else
                fn = le16_to_cpu(rhdr->fix_num);

        /* Check errors. */
        if (fo & 1)
                return -E_NTFS_CORRUPT;
        if (fo + fn * sizeof(short) > SECTOR_SIZE)
                return -E_NTFS_CORRUPT;
        if (!fn)
                return -E_NTFS_CORRUPT;
        fn -= 1;
        if (fn * SECTOR_SIZE > bytes)
                return -E_NTFS_CORRUPT;

        /* Get fixup pointer. */
        fixup = Add2Ptr(rhdr, fo);
        seq = *fixup;
        tail = Add2Ptr(rhdr, SECTOR_SIZE - sizeof(short));

        for (; fn; fn--) {
                /* Fixup does not match! Is it serious error? */
                if (*tail != seq)
                        ret = -E_NTFS_FIXUP;

                /* Put the saved word back in place of the fixup. */
                fixup++;
                *tail = *fixup;
                tail += SECTOR_SIZE / sizeof(short);
        }

        return ret;
}

/*
 * ntfs_extend_init - Load $Extend file.
 *
 * Skipped (returning 0) when the volume major version is below 3.
 * Otherwise opens the $Extend directory and looks up $ObjId, $Quota,
 * $Reparse and $UsnJrnl in it. Entries that are not found are ignored.
 *
 * Return: 0 on success, the ntfs_iget5() error if $Extend cannot be
 * loaded, or -EINVAL if $Extend is not a directory.
 */
int ntfs_extend_init(struct ntfs_sb_info *sbi)
{
        struct super_block *sb = sbi->sb;
        struct inode *ext_dir, *child;
        struct MFT_REF ref;
        int err = 0;

        if (sbi->volume.major_ver < 3) {
                ntfs_notice(sb, "Skip $Extend 'cause NTFS version");
                return 0;
        }

        ref.low = cpu_to_le32(MFT_REC_EXTEND);
        ref.high = 0;
        ref.seq = cpu_to_le16(MFT_REC_EXTEND);

        ext_dir = ntfs_iget5(sb, &ref, &NAME_EXTEND);
        if (IS_ERR(ext_dir)) {
                err = PTR_ERR(ext_dir);
                ntfs_err(sb, "Failed to load $Extend (%d).", err);
                ext_dir = NULL;
                goto out;
        }

        /* If ntfs_iget5() reads from disk it never returns bad inode. */
        if (!S_ISDIR(ext_dir->i_mode)) {
                err = -EINVAL;
                goto out;
        }

        /* $ObjId: keep the inode (no iput) unless it is a bad inode. */
        child = dir_search_u(ext_dir, &NAME_OBJID, NULL);
        if (child && !IS_ERR(child)) {
                if (!is_bad_inode(child)) {
                        sbi->objid.ni = ntfs_i(child);
                        sbi->objid_no = child->i_ino;
                } else {
                        iput(child);
                }
        }

        /* $Quota: only the inode number is recorded. */
        child = dir_search_u(ext_dir, &NAME_QUOTA, NULL);
        if (child && !IS_ERR(child)) {
                sbi->quota_no = child->i_ino;
                iput(child);
        }

        /*
         * $Reparse: keep the inode (no iput).
         * NOTE(review): unlike $ObjId there is no is_bad_inode() check
         * here - confirm this is intended.
         */
        child = dir_search_u(ext_dir, &NAME_REPARSE, NULL);
        if (child && !IS_ERR(child)) {
                sbi->reparse.ni = ntfs_i(child);
                sbi->reparse_no = child->i_ino;
        }

        /* $UsnJrnl: only the inode number is recorded. */
        child = dir_search_u(ext_dir, &NAME_USNJRNL, NULL);
        if (child && !IS_ERR(child)) {
                sbi->usn_jrnl_no = child->i_ino;
                iput(child);
        }

out:
        iput(ext_dir);
        return err;
}

/*
 * ntfs_loadlog_and_replay - Load $MFT and replay $LogFile.
 *
 * @ni:  inode of $LogFile.
 * @sbi: volume info.
 *
 * NTFS_FLAGS_LOG_REPLAYING is set in sbi->flags while the replay runs
 * and is always cleared before returning. If $MFT cannot be loaded from
 * sbi->mft.lbo, a second attempt is made with sbi->mft.lbo2.
 *
 * Return: 0 on success, negative errno otherwise.
 */
int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi)
{
        struct super_block *sb = sbi->sb;
        struct MFT_REF ref;
        struct inode *mft;
        bool initialized = false;
        int err;

        /* Check for 4GB. */
        if (ni->vfs_inode.i_size >= 0x100000000ull) {
                ntfs_err(sb, "\x24LogFile is large than 4G.");
                err = -EINVAL;
                goto done;
        }

        sbi->flags |= NTFS_FLAGS_LOG_REPLAYING;

        ref.low = cpu_to_le32(MFT_REC_MFT);
        ref.high = 0;
        ref.seq = cpu_to_le16(1);

        mft = ntfs_iget5(sb, &ref, NULL);
        if (!mft || IS_ERR(mft)) {
                /* Try to use MFT copy; the original offset is restored after. */
                const u64 primary_lbo = sbi->mft.lbo;

                sbi->mft.lbo = sbi->mft.lbo2;
                mft = ntfs_iget5(sb, &ref, NULL);
                sbi->mft.lbo = primary_lbo;

                if (!mft || IS_ERR(mft)) {
                        err = -EINVAL;
                        ntfs_err(sb, "Failed to load $MFT.");
                        goto done;
                }
        }

        /* $MFT is published in sbi only for the duration of the replay. */
        sbi->mft.ni = ntfs_i(mft);

        /* LogFile should not contain attribute list. */
        err = ni_load_all_mi(sbi->mft.ni);
        if (err == 0)
                err = log_replay(ni, &initialized);

        iput(mft);
        sbi->mft.ni = NULL;

        sync_blockdev(sb->s_bdev);
        invalidate_bdev(sb->s_bdev);

        if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) {
                /* A pending replay is not reported as an error here. */
                err = 0;
        } else if (!sb_rdonly(sb) && initialized) {
                /* Fill LogFile by '-1' if it is initialized. */
                err = ntfs_bio_fill_1(sbi, &ni->file.run);
        }

done:
        sbi->flags &= ~NTFS_FLAGS_LOG_REPLAYING;
        return err;
}

/*
 * ntfs_look_for_free_space - Look for a free space in bitmap.
 *
 * @sbi:     volume info; sbi->used.bitmap is the bitmap searched.
 * @lcn:     hint where to start; 0 means "use sbi->used.next_free_lcn".
 *           Ignored when ALLOCATE_MFT is set.
 * @len:     number of clusters requested.
 * @new_lcn: out: first allocated cluster.
 * @new_len: out: number of clusters actually allocated.
 * @opt:     ALLOCATE_MFT takes the clusters from the head of the MFT zone.
 *
 * The bitmap rw_lock is held for write for the whole call.
 *
 * Return: 0 on success, -ENOSPC if nothing suitable was found, or the
 * error returned by ntfs_refresh_zone() / wnd_set_used().
 */
int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
                             CLST *new_lcn, CLST *new_len,
                             enum ALLOCATE_OPT opt)
{
        int err;
        CLST alen;
        struct super_block *sb = sbi->sb;
        size_t alcn, zlen, zeroes, zlcn, zlen2, ztrim, new_zlen;
        struct wnd_bitmap *wnd = &sbi->used.bitmap;

        down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
        if (opt & ALLOCATE_MFT) {
                zlen = wnd_zone_len(wnd);

                /* Empty zone: try to rebuild it once before giving up. */
                if (!zlen) {
                        err = ntfs_refresh_zone(sbi);
                        if (err)
                                goto up_write;

                        zlen = wnd_zone_len(wnd);
                }

                if (!zlen) {
                        ntfs_err(sbi->sb, "no free space to extend mft");
                        err = -ENOSPC;
                        goto up_write;
                }

                /* Take up to 'len' clusters from the head of the zone. */
                lcn = wnd_zone_bit(wnd);
                alen = min_t(CLST, len, zlen);

                /*
                 * The zone is shrunk before the clusters are marked as
                 * used; it is not restored if wnd_set_used() fails.
                 */
                wnd_zone_set(wnd, lcn + alen, zlen - alen);

                err = wnd_set_used(wnd, lcn, alen);
                if (err)
                        goto up_write;

                alcn = lcn;
                goto space_found;
        }
        /*
         * 'Cause cluster 0 is always used this value means that we should use
         * cached value of 'next_free_lcn' to improve performance.
         */
        if (!lcn)
                lcn = sbi->used.next_free_lcn;

        /* A hint beyond the end of the bitmap restarts from the beginning. */
        if (lcn >= wnd->nbits)
                lcn = 0;

        alen = wnd_find(wnd, len, lcn, BITMAP_FIND_MARK_AS_USED, &alcn);
        if (alen)
                goto space_found;

        /* Try to use clusters from MftZone. */
        zlen = wnd_zone_len(wnd);
        zeroes = wnd_zeroes(wnd);

        /*
         * Check too big request. The zone is also left alone when it is
         * not larger than NTFS_MIN_MFT_ZONE.
         */
        if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) {
                err = -ENOSPC;
                goto up_write;
        }

        /*
         * How many clusters to cut from the zone: at least half of it, at
         * most all of it, but never leave less than NTFS_MIN_MFT_ZONE.
         */
        zlcn = wnd_zone_bit(wnd);
        zlen2 = zlen >> 1;
        ztrim = clamp_val(len, zlen2, zlen);
        new_zlen = max_t(size_t, zlen - ztrim, NTFS_MIN_MFT_ZONE);

        wnd_zone_set(wnd, zlcn, new_zlen);

        /* Allocate continuous clusters, searching from the start. */
        alen = wnd_find(wnd, len, 0,
                        BITMAP_FIND_MARK_AS_USED | BITMAP_FIND_FULL, &alcn);
        if (!alen) {
                err = -ENOSPC;
                goto up_write;
        }

space_found:
        err = 0;
        *new_len = alen;
        *new_lcn = alcn;

        ntfs_unmap_meta(sb, alcn, alen);

        /* Set hint for next requests. */
        if (!(opt & ALLOCATE_MFT))
                sbi->used.next_free_lcn = alcn + alen;
up_write:
        up_write(&wnd->rw_lock);
        return err;
}

/*
 * ntfs_check_for_free_space
 *
 * Check if it is possible to allocate 'clen' clusters and 'mlen' Mft records.
 *
 * Both bitmaps are sampled under their read locks, one after the other;
 * the locks are not held at the same time.
 */
bool ntfs_check_for_free_space(struct ntfs_sb_info *sbi, CLST clen, CLST mlen)
{
        struct wnd_bitmap *clusters = &sbi->used.bitmap;
        struct wnd_bitmap *records = &sbi->mft.bitmap;
        size_t nfree, reserved, spare;

        /* Cluster bitmap: up to NTFS_MIN_MFT_ZONE of the zone is reserved. */
        down_read_nested(&clusters->rw_lock, BITMAP_MUTEX_CLUSTERS);
        nfree = wnd_zeroes(clusters);
        reserved = min_t(size_t, NTFS_MIN_MFT_ZONE, wnd_zone_len(clusters));
        up_read(&clusters->rw_lock);

        if (nfree < reserved + clen)
                return false;

        /* Clusters left over once 'clen' and the reserve are accounted for. */
        spare = nfree - (reserved + clen);

        /* MFT bitmap: are there enough free records outside of its zone? */
        down_read_nested(&records->rw_lock, BITMAP_MUTEX_MFT);
        nfree = wnd_zeroes(records);
        reserved = wnd_zone_len(records);
        up_read(&records->rw_lock);

        if (nfree < reserved + mlen) {
                /* Not enough records: the spare clusters must hold them. */
                return spare >= bytes_to_cluster(sbi, mlen << sbi->record_bits);
        }

        return true;
}

/*
 * ntfs_extend_mft - Allocate additional MFT records.
 *
 * sbi->mft.bitmap is locked for write.
 *
 * NOTE: recursive:
 *        ntfs_look_free_mft ->
 *        ntfs_extend_mft ->
 *        attr_set_size ->
 *        ni_insert_nonresident ->
 *        ni_insert_attr ->
 *        ni_ins_attr_ext ->
 *        ntfs_look_free_mft ->
 *        ntfs_extend_mft
 *
 * To avoid recursion always allocate space for two new MFT records,
 * see attrib.c: "at least two MFT to avoid recursive loop".
 *
 * Return: 0 on success, otherwise the error from attr_set_size(),
 * wnd_extend() or _ni_write_inode().
 */
static int ntfs_extend_mft(struct ntfs_sb_info *sbi)
{
        int err;
        struct ntfs_inode *ni = sbi->mft.ni;
        size_t new_mft_total;
        u64 new_mft_bytes, new_bitmap_bytes;
        struct ATTRIB *attr;
        struct wnd_bitmap *wnd = &sbi->mft.bitmap;

        /* Grow by NTFS_MFT_INCREASE_STEP records, rounded up to 128. */
        new_mft_total = ALIGN(wnd->nbits + NTFS_MFT_INCREASE_STEP, 128);
        new_mft_bytes = (u64)new_mft_total << sbi->record_bits;

        /* Step 1: Resize $MFT::DATA. */
        down_write(&ni->file.run_lock);
        err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run,
                            new_mft_bytes, NULL, false, &attr);

        if (err) {
                up_write(&ni->file.run_lock);
                goto out;
        }

        attr->nres.valid_size = attr->nres.data_size;
        /* Use the size that was really allocated, not the requested one. */
        new_mft_total = le64_to_cpu(attr->nres.alloc_size) >> sbi->record_bits;
        ni->mi.dirty = true;

        /* Step 2: Resize $MFT::BITMAP. */
        new_bitmap_bytes = ntfs3_bitmap_size(new_mft_total);

        /* 'err' of this call is checked only after the locks are dropped. */
        err = attr_set_size(ni, ATTR_BITMAP, NULL, 0, &sbi->mft.bitmap.run,
                            new_bitmap_bytes, &new_bitmap_bytes, true, NULL);

        /* Refresh MFT Zone if necessary. */
        down_write_nested(&sbi->used.bitmap.rw_lock, BITMAP_MUTEX_CLUSTERS);

        /* The result of ntfs_refresh_zone() is ignored here. */
        ntfs_refresh_zone(sbi);

        up_write(&sbi->used.bitmap.rw_lock);
        up_write(&ni->file.run_lock);

        if (err)
                goto out;

        /* Step 3: Extend the in-memory MFT bitmap to the new record count. */
        err = wnd_extend(wnd, new_mft_total);

        if (err)
                goto out;

        /*
         * Format the new records. The result of ntfs_clear_mft_tail() is
         * ignored here.
         */
        ntfs_clear_mft_tail(sbi, sbi->mft.used, new_mft_total);

        err = _ni_write_inode(&ni->vfs_inode, 0);
out:
        return err;
}

/*
 * ntfs_look_free_mft - Look for a free MFT record.
 *
 * @sbi: volume info.
 * @rno: out: number of the record that was found.
 * @mft: true if the record is requested for the MFT itself. In that case
 *       the MFT bitmap lock is not taken here (presumably the caller
 *       already holds it - confirm against callers).
 * @ni:  optional inode; if set, the record is attached to it with
 *       ni_add_subrecord().
 * @mi:  passed through to ni_add_subrecord().
 *
 * Records for the MFT itself come from the MFT zone of the bitmap. When the
 * zone is empty it is rebuilt from the first free record at or after
 * MFT_REC_FREE, by extending the MFT or, as a last resort for @mft
 * requests, from the reserved records [MFT_REC_RESERVED, MFT_REC_FREE).
 *
 * Return: 0 on success, -ENOMEM if ni_add_subrecord() fails, or the error
 * returned by ntfs_extend_mft().
 */
int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft,
                       struct ntfs_inode *ni, struct mft_inode **mi)
{
        int err = 0;
        size_t zbit, zlen, from, to, fr;
        size_t mft_total;
        struct MFT_REF ref;
        struct super_block *sb = sbi->sb;
        struct wnd_bitmap *wnd = &sbi->mft.bitmap;
        u32 ir;

        /* The reserved bitmap must have a bit for every reserved record. */
        static_assert(sizeof(sbi->mft.reserved_bitmap) * 8 >=
                      MFT_REC_FREE - MFT_REC_RESERVED);

        if (!mft)
                down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT);

        zlen = wnd_zone_len(wnd);

        /* Always reserve space for MFT. */
        if (zlen) {
                if (mft) {
                        /* Hand out the first record of the zone. */
                        zbit = wnd_zone_bit(wnd);
                        *rno = zbit;
                        wnd_zone_set(wnd, zbit + 1, zlen - 1);
                }
                goto found;
        }

        /* No MFT zone. Find the nearest to '0' free MFT. */
        if (!wnd_find(wnd, 1, MFT_REC_FREE, 0, &zbit)) {
                /* Resize MFT */
                mft_total = wnd->nbits;

                err = ntfs_extend_mft(sbi);
                if (!err) {
                        /* The first new record follows the old last one. */
                        zbit = mft_total;
                        goto reserve_mft;
                }

                /*
                 * Only requests for the MFT itself may fall back to the
                 * reserved records, and only while some are left.
                 */
                if (!mft || MFT_REC_FREE == sbi->mft.next_reserved)
                        goto out;

                err = 0;

                /*
                 * Look for free record reserved area [11-16) ==
                 * [MFT_REC_RESERVED, MFT_REC_FREE ) MFT bitmap always
                 * marks it as used.
                 *
                 * NOTE(review): the check below tests 'reserved_bitmap'
                 * itself while the code at the end of this function tests
                 * 'reserved_bitmap_inited', which is not set anywhere in
                 * this function - confirm which one is intended.
                 */
                if (!sbi->mft.reserved_bitmap) {
                        /* Once per session create internal bitmap for 5 bits. */
                        sbi->mft.reserved_bitmap = 0xFF;

                        ref.high = 0;
                        for (ir = MFT_REC_RESERVED; ir < MFT_REC_FREE; ir++) {
                                struct inode *i;
                                /* Shadows the 'ni' parameter inside the loop. */
                                struct ntfs_inode *ni;
                                struct MFT_REC *mrec;

                                ref.low = cpu_to_le32(ir);
                                ref.seq = cpu_to_le16(ir);

                                /*
                                 * NOTE(review): apart from the bad-inode
                                 * case, no iput() is visible for 'i' in
                                 * this loop - confirm whether the
                                 * reference is dropped elsewhere.
                                 */
                                i = ntfs_iget5(sb, &ref, NULL);
                                if (IS_ERR(i)) {
next:
                                        /* Shared "record is not usable" exit. */
                                        ntfs_notice(
                                                sb,
                                                "Invalid reserved record %x",
                                                ref.low);
                                        continue;
                                }
                                if (is_bad_inode(i)) {
                                        iput(i);
                                        goto next;
                                }

                                ni = ntfs_i(i);

                                mrec = ni->mi.mrec;

                                /*
                                 * A record is usable only if it is a base
                                 * record without hard links, has standard
                                 * information and has no name attribute.
                                 */
                                if (!is_rec_base(mrec))
                                        goto next;

                                if (mrec->hard_links)
                                        goto next;

                                if (!ni_std(ni))
                                        goto next;

                                if (ni_find_attr(ni, NULL, NULL, ATTR_NAME,
                                                 NULL, 0, NULL, NULL))
                                        goto next;

                                __clear_bit(ir - MFT_REC_RESERVED,
                                            &sbi->mft.reserved_bitmap);
                        }
                }

                /*
                 * Scan 5 bits for zero. Bit 0 == MFT_REC_RESERVED
                 *
                 * NOTE(review): bits are cleared above at index
                 * (ir - MFT_REC_RESERVED) but the scan below starts at bit
                 * index MFT_REC_RESERVED - confirm the intended indexing.
                 */
                zbit = find_next_zero_bit(&sbi->mft.reserved_bitmap,
                                          MFT_REC_FREE, MFT_REC_RESERVED);
                if (zbit >= MFT_REC_FREE) {
                        /*
                         * No reserved record left.
                         * NOTE(review): returns with err == 0 and '*rno'
                         * untouched - confirm callers handle this.
                         */
                        sbi->mft.next_reserved = MFT_REC_FREE;
                        goto out;
                }

                zlen = 1;
                sbi->mft.next_reserved = zbit;
        } else {
reserve_mft:
                /* Also entered by goto after a successful MFT extension. */
                zlen = zbit == MFT_REC_FREE ? (MFT_REC_USER - MFT_REC_FREE) : 4;
                if (zbit + zlen > wnd->nbits)
                        zlen = wnd->nbits - zbit;

                /* Shrink the zone until the whole range is free. */
                while (zlen > 1 && !wnd_is_free(wnd, zbit, zlen))
                        zlen -= 1;

                /* [zbit, zbit + zlen) will be used for MFT itself. */
                from = sbi->mft.used;
                if (from < zbit)
                        from = zbit;
                to = zbit + zlen;
                if (from < to) {
                        /* Format the records that were never used before. */
                        ntfs_clear_mft_tail(sbi, from, to);
                        sbi->mft.used = to;
                }
        }

        if (mft) {
                /* Take the first record of the new zone for the caller. */
                *rno = zbit;
                zbit += 1;
                zlen -= 1;
        }

        wnd_zone_set(wnd, zbit, zlen);

found:
        if (!mft) {
                /* The request to get record for general purpose. */
                if (sbi->mft.next_free < MFT_REC_USER)
                        sbi->mft.next_free = MFT_REC_USER;

                /* Search, and extend the MFT until a free record shows up. */
                for (;;) {
                        if (sbi->mft.next_free >= sbi->mft.bitmap.nbits) {
                                /* Known to be full: go straight to extending. */
                        } else if (!wnd_find(wnd, 1, MFT_REC_USER, 0, &fr)) {
                                sbi->mft.next_free = sbi->mft.bitmap.nbits;
                        } else {
                                *rno = fr;
                                sbi->mft.next_free = *rno + 1;
                                break;
                        }

                        err = ntfs_extend_mft(sbi);
                        if (err)
                                goto out;
                }
        }

        if (ni && !ni_add_subrecord(ni, *rno, mi)) {
                err = -ENOMEM;
                goto out;
        }

        /* We have found a record that is not reserved for next MFT. */
        if (*rno >= MFT_REC_FREE)
                wnd_set_used(wnd, *rno, 1);
        else if (*rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited)
                __set_bit(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);

out:
        if (!mft)
                up_write(&wnd->rw_lock);

        return err;
}

/*
 * ntfs_mark_rec_free - Mark record as free.
 *
 * @rno:    record number; values at or past the end of the MFT bitmap
 *          are ignored.
 * @is_mft: true if we are changing MFT. In that case the MFT bitmap lock
 *          is not taken here.
 */
void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft)
{
        struct wnd_bitmap *wnd = &sbi->mft.bitmap;
        const bool take_lock = !is_mft;

        if (take_lock)
                down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT);

        if (rno < wnd->nbits) {
                if (rno >= MFT_REC_FREE) {
                        /* Freeing a record that is not in use is an error. */
                        if (wnd_is_used(wnd, rno, 1))
                                wnd_set_free(wnd, rno, 1);
                        else
                                ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
                } else if (rno >= MFT_REC_RESERVED &&
                           sbi->mft.reserved_bitmap_inited) {
                        __clear_bit(rno - MFT_REC_RESERVED,
                                    &sbi->mft.reserved_bitmap);
                }

                /* Pull the zone or the search hint back to the freed record. */
                if (rno < wnd_zone_bit(wnd))
                        wnd_zone_set(wnd, rno, 1);
                else if (rno >= MFT_REC_USER && rno < sbi->mft.next_free)
                        sbi->mft.next_free = rno;
        }

        if (take_lock)
                up_write(&wnd->rw_lock);
}

/*
 * ntfs_clear_mft_tail - Format empty records [from, to).
 *
 * sbi->mft.bitmap is locked for write.
 *
 * Every record in the range is overwritten with sbi->new_rec. On return
 * sbi->mft.used is the first record that was not written ('to' when
 * everything succeeded).
 *
 * Return: 0 on success or for an empty range, otherwise the error from
 * ntfs_get_bh() / ntfs_write_bh().
 */
int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to)
{
        struct ntfs_inode *mft_ni;
        struct runs_tree *run;
        u32 rec_size;
        u64 vbo;
        int err;

        if (from >= to)
                return 0;

        mft_ni = sbi->mft.ni;
        run = &mft_ni->file.run;
        rec_size = sbi->record_size;

        down_read(&mft_ni->file.run_lock);

        /* The range is not empty here, so the body runs at least once. */
        vbo = (u64)from * rec_size;
        do {
                struct ntfs_buffers nb;

                err = ntfs_get_bh(sbi, run, vbo, rec_size, &nb);
                if (err)
                        break;

                err = ntfs_write_bh(sbi, &sbi->new_rec->rhdr, &nb, 0);
                nb_put(&nb);
                if (err)
                        break;

                vbo += rec_size;
        } while (++from < to);

        sbi->mft.used = from;
        up_read(&mft_ni->file.run_lock);
        return err;
}

/*
 * ntfs_refresh_zone - Refresh MFT zone.
 *
 * sbi->used.bitmap is locked for rw.
 * sbi->mft.bitmap is locked for write.
 * sbi->mft.ni->file.run_lock for write.
 *
 * Return: 0 on success (including when nothing had to be done),
 * -EINVAL if the last cluster of the MFT cannot be found.
 */
int ntfs_refresh_zone(struct ntfs_sb_info *sbi)
{
        struct wnd_bitmap *wnd = &sbi->used.bitmap;
        struct ntfs_inode *ni = sbi->mft.ni;
        CLST last_lcn, mft_clusters, run_len;
        size_t zone_start, zone_len;

        /* Nothing to do while the current MFT zone is not empty. */
        if (wnd_zone_len(wnd))
                return 0;

        /* Number of clusters covered by the records of the MFT bitmap. */
        mft_clusters = bytes_to_cluster(
                sbi, (u64)sbi->mft.bitmap.nbits << sbi->record_bits);

        /* We should always find Last Lcn for MFT. */
        if (!run_lookup_entry(&ni->file.run, mft_clusters - 1, &last_lcn,
                              &run_len, NULL) ||
            last_lcn == SPARSE_LCN)
                return -EINVAL;

        /* Try to allocate clusters after last MFT run. */
        zone_start = last_lcn + 1;
        zone_len = wnd_find(wnd, sbi->zone_max, zone_start, 0, &zone_start);
        wnd_zone_set(wnd, zone_start, zone_len);

        return 0;
}

/*
 * ntfs_update_mftmirr - Update $MFTMirr data.
 *
 * Copies (sbi->mft.recs_mirr << sbi->record_bits) bytes, block by block,
 * from sbi->mft.lbo to sbi->mft.lbo2. NTFS_FLAGS_MFTMIRR is cleared only
 * if every block was copied; on any failure the function returns early
 * and leaves the flag set.
 *
 * @wait: if not zero each copied block is synced with sync_dirty_buffer().
 */
void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
{
        struct super_block *sb = sbi->sb;
        sector_t src_blk, dst_blk;
        u32 blocksize, left;

        /*
         * sb can be NULL here. In this case sbi->flags should be 0 too.
         */
        if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR) ||
            unlikely(ntfs3_forced_shutdown(sb)))
                return;

        blocksize = sb->s_blocksize;
        left = sbi->mft.recs_mirr << sbi->record_bits;
        src_blk = sbi->mft.lbo >> sb->s_blocksize_bits;
        dst_blk = sbi->mft.lbo2 >> sb->s_blocksize_bits;

        while (left >= blocksize) {
                struct buffer_head *src, *dst;
                int err = 0;

                src = sb_bread(sb, src_blk);
                if (!src)
                        return;
                src_blk += 1;

                /* The destination is fully overwritten: no need to read it. */
                dst = sb_getblk(sb, dst_blk);
                if (!dst) {
                        put_bh(src);
                        return;
                }
                dst_blk += 1;

                if (buffer_locked(dst))
                        __wait_on_buffer(dst);

                lock_buffer(dst);
                memcpy(dst->b_data, src->b_data, blocksize);
                set_buffer_uptodate(dst);
                mark_buffer_dirty(dst);
                unlock_buffer(dst);

                put_bh(src);

                if (wait)
                        err = sync_dirty_buffer(dst);

                put_bh(dst);
                if (err)
                        return;

                left -= blocksize;
        }

        sbi->flags &= ~NTFS_FLAGS_MFTMIRR;
}

/*
 * ntfs_bad_inode
 *
 * Logs 'hint' for the inode, marks the inode as bad and marks fs as 'dirty'
 * (NTFS_DIRTY_ERROR).
 */
void ntfs_bad_inode(struct inode *inode, const char *hint)
{
        ntfs_inode_err(inode, "%s", hint);
        make_bad_inode(inode);
        ntfs_set_state(inode->i_sb->s_fs_info, NTFS_DIRTY_ERROR);
}

/*
 * ntfs_set_state
 *
 * Mount: ntfs_set_state(NTFS_DIRTY_DIRTY)
 * Umount: ntfs_set_state(NTFS_DIRTY_CLEAR)
 * NTFS error: ntfs_set_state(NTFS_DIRTY_ERROR)
 *
 * Updates VOLUME_FLAG_DIRTY in the volume information attribute of
 * sbi->volume.ni and writes the record out.
 *
 * Return: 0 on success or when nothing has to be changed, -EINVAL if the
 * volume inode or its volume information is missing, otherwise the result
 * of _ni_write_inode().
 */
int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty)
{
        struct ntfs_inode *vol_ni;
        struct mft_inode *mi;
        struct VOLUME_INFO *vinfo;
        struct ATTRIB *attr;
        __le16 old_flags;

        /*
         * Do not change state if fs was real_dirty.
         * Do not change anything if mounted read only.
         */
        if (sbi->volume.real_dirty || sb_rdonly(sbi->sb))
                return 0;

        /* Check cached value: do not change state if already as requested. */
        if (dirty == NTFS_DIRTY_CLEAR) {
                if ((sbi->volume.flags & VOLUME_FLAG_DIRTY) == 0)
                        return 0;
        } else if ((sbi->volume.flags & VOLUME_FLAG_DIRTY) ==
                   VOLUME_FLAG_DIRTY) {
                return 0;
        }

        vol_ni = sbi->volume.ni;
        if (!vol_ni)
                return -EINVAL;

        mutex_lock_nested(&vol_ni->ni_lock, NTFS_INODE_MUTEX_DIRTY);

        attr = ni_find_attr(vol_ni, NULL, NULL, ATTR_VOL_INFO, NULL, 0, NULL,
                            &mi);
        vinfo = attr ? resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO) :
                       NULL;
        if (!vinfo) {
                ni_unlock(vol_ni);
                return -EINVAL;
        }

        old_flags = vinfo->flags;

        switch (dirty) {
        case NTFS_DIRTY_ERROR:
                ntfs_notice(sbi->sb, "Mark volume as dirty due to NTFS errors");
                /* From now on the state is not changed any more. */
                sbi->volume.real_dirty = true;
                fallthrough;
        case NTFS_DIRTY_DIRTY:
                vinfo->flags |= VOLUME_FLAG_DIRTY;
                break;
        case NTFS_DIRTY_CLEAR:
                vinfo->flags &= ~VOLUME_FLAG_DIRTY;
                break;
        }

        /* Cache current volume flags. */
        if (old_flags != vinfo->flags) {
                sbi->volume.flags = vinfo->flags;
                mi->dirty = true;
        }

        ni_unlock(vol_ni);

        mark_inode_dirty_sync(&vol_ni->vfs_inode);

        /* Write mft record on disk. */
        return _ni_write_inode(&vol_ni->vfs_inode, 1);
}

/*
 * security_hash - Calculates a hash of security descriptor.
 *
 * The descriptor is processed as little-endian 32-bit words; up to three
 * trailing bytes that do not form a whole word are not hashed.
 */
static inline __le32 security_hash(const void *sd, size_t bytes)
{
        const __le32 *word = sd;
        const __le32 *end = word + (bytes >> 2);
        u32 acc = 0;

        /* For every word: rotate the hash left by 3 bits, then add the word. */
        for (; word != end; word++)
                acc = ((acc << 3) | (acc >> 0x1D)) + le32_to_cpu(*word);

        return cpu_to_le32(acc);
}

/*
 * Simple wrapper for sb_bread_unmovable.
 *
 * Blocks at or beyond sbi->volume.blocks are rejected without touching
 * the device.
 *
 * Return: the buffer head, or NULL on failure (an error is logged).
 */
struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block)
{
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        struct buffer_head *bh;

        if (unlikely(block >= sbi->volume.blocks)) {
                /* prevent generic message "attempt to access beyond end of device" */
                ntfs_err(sb, "try to read out of volume at offset 0x%llx",
                         (u64)block << sb->s_blocksize_bits);
                return NULL;
        }

        bh = sb_bread_unmovable(sb, block);
        if (!bh)
                ntfs_err(sb, "failed to read volume at offset 0x%llx",
                         (u64)block << sb->s_blocksize_bits);

        return bh;
}

/*
 * ntfs_sb_read - Read 'bytes' bytes at byte offset 'lbo' of the device.
 *
 * The range does not have to be aligned to the block size.
 *
 * Return: 0 on success, -EIO if a block cannot be read.
 */
int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer)
{
        struct block_device *bdev = sb->s_bdev;
        const u32 blocksize = sb->s_blocksize;
        u64 block = lbo >> sb->s_blocksize_bits;
        u32 off = lbo & (blocksize - 1);

        while (bytes) {
                struct buffer_head *bh;
                /* Only the first block may start in the middle. */
                u32 chunk = blocksize - off;

                bh = __bread(bdev, block, blocksize);
                if (!bh)
                        return -EIO;

                if (chunk > bytes)
                        chunk = bytes;

                memcpy(buffer, bh->b_data + off, chunk);
                put_bh(bh);

                buffer = Add2Ptr(buffer, chunk);
                bytes -= chunk;

                block += 1;
                off = 0;
        }

        return 0;
}

/*
 * ntfs_sb_write - Write 'bytes' bytes at byte offset 'lbo' of the device.
 *
 * @buf:  data to write; if NULL the range is filled with 0xFF bytes.
 * @wait: if not zero every block is synced with sync_dirty_buffer().
 *        Syncing is also forced when the superblock has SB_SYNCHRONOUS.
 *
 * Return: 0 on success, -EIO / -ENOMEM if a block cannot be obtained, or
 * the error returned by sync_dirty_buffer().
 */
int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes,
                  const void *buf, int wait)
{
        struct block_device *bdev = sb->s_bdev;
        const u32 blocksize = sb->s_blocksize;
        sector_t block = lbo >> sb->s_blocksize_bits;
        u32 off = lbo & (blocksize - 1);
        const bool do_sync = wait || (sb->s_flags & SB_SYNCHRONOUS);

        while (bytes) {
                struct buffer_head *bh;
                /* Only the first block may start in the middle. */
                u32 chunk = blocksize - off;

                if (chunk > bytes)
                        chunk = bytes;

                if (chunk >= blocksize) {
                        /* Whole block is overwritten: no need to read it. */
                        bh = __getblk(bdev, block, blocksize);
                        if (!bh)
                                return -ENOMEM;
                } else {
                        /* Partial block: the rest of it must be preserved. */
                        bh = __bread(bdev, block, blocksize);
                        if (!bh) {
                                ntfs_err(sb, "failed to read block %llx",
                                         (u64)block);
                                return -EIO;
                        }
                }

                if (buffer_locked(bh))
                        __wait_on_buffer(bh);

                lock_buffer(bh);
                if (!buf) {
                        memset(bh->b_data + off, -1, chunk);
                } else {
                        memcpy(bh->b_data + off, buf, chunk);
                        buf = Add2Ptr(buf, chunk);
                }

                set_buffer_uptodate(bh);
                mark_buffer_dirty(bh);
                unlock_buffer(bh);

                if (do_sync) {
                        int err = sync_dirty_buffer(bh);

                        if (err) {
                                ntfs_err(
                                        sb,
                                        "failed to sync buffer at block %llx, error %d",
                                        (u64)block, err);
                                put_bh(bh);
                                return err;
                        }
                }

                put_bh(bh);

                bytes -= chunk;
                block += 1;
                off = 0;
        }

        return 0;
}

/*
 * ntfs_sb_write_run - Write 'bytes' bytes at virtual offset 'vbo' of a run.
 *
 * The range is translated run by run into device offsets and written with
 * ntfs_sb_write(); 'buf' and 'sync' are passed through to it.
 *
 * Return: 0 on success, -ENOENT if the range is not (contiguously) mapped
 * by 'run', -EINVAL if it hits a sparse fragment, or the error returned
 * by ntfs_sb_write().
 */
int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run,
                      u64 vbo, const void *buf, size_t bytes, int sync)
{
        struct super_block *sb = sbi->sb;
        const u8 cbits = sbi->cluster_bits;
        const u32 in_cluster = vbo & sbi->cluster_mask;
        CLST vcn = vbo >> cbits;
        CLST lcn, clen, expected_vcn;
        u64 lbo, len;
        size_t idx;

        if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx))
                return -ENOENT;

        if (lcn == SPARSE_LCN)
                return -EINVAL;

        /* The first fragment may be entered in the middle of a cluster. */
        lbo = ((u64)lcn << cbits) + in_cluster;
        len = ((u64)clen << cbits) - in_cluster;

        for (;;) {
                const u32 chunk = min_t(u64, len, bytes);
                int err = ntfs_sb_write(sb, lbo, chunk, buf, sync);

                if (err)
                        return err;

                bytes -= chunk;
                if (!bytes)
                        return 0;

                /* The next fragment must continue where this one ended. */
                expected_vcn = vcn + clen;
                idx += 1;
                if (!run_get_entry(run, idx, &vcn, &lcn, &clen))
                        return -ENOENT;

                if (vcn != expected_vcn)
                        return -ENOENT;

                if (lcn == SPARSE_LCN)
                        return -EINVAL;

                if (buf)
                        buf = Add2Ptr(buf, chunk);

                lbo = (u64)lcn << cbits;
                len = (u64)clen << cbits;
        }
}

/*
 * ntfs_bread_run - Read the block that holds virtual offset 'vbo' of a run.
 *
 * Return: ERR_PTR(-ENOENT) if 'vbo' is not mapped by 'run', otherwise
 * the result of ntfs_bread() for the translated block.
 */
struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi,
                                   const struct runs_tree *run, u64 vbo)
{
        struct super_block *sb = sbi->sb;
        const u8 cbits = sbi->cluster_bits;
        const u64 in_cluster = vbo & sbi->cluster_mask;
        CLST lcn;

        if (!run_lookup_entry(run, vbo >> cbits, &lcn, NULL, NULL))
                return ERR_PTR(-ENOENT);

        /* Device byte offset -> device block number. */
        return ntfs_bread(sb, (((u64)lcn << cbits) + in_cluster) >>
                                      sb->s_blocksize_bits);
}

/*
 * ntfs_read_run_nb - Read 'bytes' bytes at virtual offset 'vbo' of a run.
 *
 * @run:   mapping to use. If NULL, the data is read relative to
 *         sbi->mft.lbo; this is allowed only for offsets up to
 *         MFT_REC_VOL * sbi->record_size and for a single record.
 * @buf:   optional destination for the data.
 * @bytes: number of bytes to read.
 * @nb:    optional; if set, the buffer heads that were read are stored
 *         (with their references) in nb->bh[] instead of being released.
 *
 * On failure every buffer head already stored in 'nb' is released and
 * nb->nbufs is reset to 0.
 *
 * Return: 0 on success, -ENOENT if the range is not (contiguously) mapped,
 * -EINVAL for a sparse fragment or if nb->bh[] is too small, -EIO if a
 * block cannot be read.
 */
int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run,
                     u64 vbo, void *buf, u32 bytes, struct ntfs_buffers *nb)
{
        int err;
        struct super_block *sb = sbi->sb;
        u32 blocksize = sb->s_blocksize;
        u8 cluster_bits = sbi->cluster_bits;
        u32 off = vbo & sbi->cluster_mask;
        u32 nbh = 0;
        CLST vcn_next, vcn = vbo >> cluster_bits;
        CLST lcn, clen;
        u64 lbo, len;
        size_t idx;
        struct buffer_head *bh;

        if (!run) {
                /* First reading of $Volume + $MFTMirr + $LogFile goes here. */
                if (vbo > MFT_REC_VOL * sbi->record_size) {
                        err = -ENOENT;
                        goto out;
                }

                /* Use absolute boot's 'MFTCluster' to read record. */
                lbo = vbo + sbi->mft.lbo;
                len = sbi->record_size;
        } else if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) {
                err = -ENOENT;
                goto out;
        } else {
                if (lcn == SPARSE_LCN) {
                        err = -EINVAL;
                        goto out;
                }

                lbo = ((u64)lcn << cluster_bits) + off;
                len = ((u64)clen << cluster_bits) - off;
        }

        off = lbo & (blocksize - 1);
        if (nb) {
                nb->off = off;
                nb->bytes = bytes;
        }

        for (;;) {
                u32 len32 = len >= bytes ? bytes : len;
                sector_t block = lbo >> sb->s_blocksize_bits;

                /* Read the current fragment block by block. */
                do {
                        u32 op = blocksize - off;

                        if (op > len32)
                                op = len32;

                        bh = ntfs_bread(sb, block);
                        if (!bh) {
                                err = -EIO;
                                goto out;
                        }

                        if (buf) {
                                memcpy(buf, bh->b_data + off, op);
                                buf = Add2Ptr(buf, op);
                        }

                        if (!nb) {
                                put_bh(bh);
                        } else if (nbh >= ARRAY_SIZE(nb->bh)) {
                                /*
                                 * 'bh' is not stored in 'nb', so the cleanup
                                 * at 'out' does not see it: release it here.
                                 */
                                put_bh(bh);
                                err = -EINVAL;
                                goto out;
                        } else {
                                nb->bh[nbh++] = bh;
                                nb->nbufs = nbh;
                        }

                        bytes -= op;
                        if (!bytes)
                                return 0;
                        len32 -= op;
                        block += 1;
                        off = 0;

                } while (len32);

                /*
                 * Without a run there is no next fragment to look up
                 * ('vcn', 'clen' and 'idx' were never initialized).
                 */
                if (!run) {
                        err = -ENOENT;
                        goto out;
                }

                /* The next fragment must continue where this one ended. */
                vcn_next = vcn + clen;
                if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) ||
                    vcn != vcn_next) {
                        err = -ENOENT;
                        goto out;
                }

                if (lcn == SPARSE_LCN) {
                        err = -EINVAL;
                        goto out;
                }

                lbo = ((u64)lcn << cluster_bits);
                len = ((u64)clen << cluster_bits);
        }

out:
        /* 'nbh' can be non-zero only if 'nb' is set. */
        if (!nbh)
                return err;

        while (nbh) {
                put_bh(nb->bh[--nbh]);
                nb->bh[nbh] = NULL;
        }

        nb->nbufs = 0;
        return err;
}

/*
 * ntfs_read_bh
 *
 * Reads a record with ntfs_read_run_nb() and then applies
 * ntfs_fix_post_read() to it. 'nb' is dereferenced and must not be NULL.
 *
 * Return: < 0 if error, 0 if ok, -E_NTFS_FIXUP if need to update fixups.
 */
int ntfs_read_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo,
                 struct NTFS_RECORD_HEADER *rhdr, u32 bytes,
                 struct ntfs_buffers *nb)
{
        int err;

        err = ntfs_read_run_nb(sbi, run, vbo, rhdr, bytes, nb);
        if (!err)
                err = ntfs_fix_post_read(rhdr, nb->bytes, true);

        return err;
}

/*
 * ntfs_get_bh - Collect the buffer heads that back a byte range of a run.
 *
 * Walks @run starting at virtual byte offset @vbo and stores one buffer head
 * per device block in @nb->bh until @bytes bytes are covered. A block that is
 * covered completely is obtained with sb_getblk() and marked uptodate without
 * going through ntfs_bread(); a partially covered block is read with
 * ntfs_bread().
 *
 * On success @nb->nbufs holds the number of collected buffers, @nb->off the
 * offset inside the first block and @nb->bytes the requested size.
 * On failure every buffer collected so far is released and @nb->nbufs is 0.
 *
 * Return: 0 if ok, -ENOENT if @vbo is not mapped or the next run entry does
 * not continue the previous one, -EINVAL if more buffers are needed than
 * @nb->bh can hold, -ENOMEM if sb_getblk() fails, -EIO if ntfs_bread() fails.
 */
int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo,
                u32 bytes, struct ntfs_buffers *nb)
{
        int err = 0;
        struct super_block *sb = sbi->sb;
        u32 blocksize = sb->s_blocksize;
        u8 cluster_bits = sbi->cluster_bits;
        CLST vcn_next, vcn = vbo >> cluster_bits;
        u32 off;
        u32 nbh = 0;
        CLST lcn, clen;
        u64 lbo, len;
        size_t idx;

        nb->bytes = bytes;

        if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) {
                err = -ENOENT;
                goto out;
        }

        /* Byte position on disk and bytes left in the first run fragment. */
        off = vbo & sbi->cluster_mask;
        lbo = ((u64)lcn << cluster_bits) + off;
        len = ((u64)clen << cluster_bits) - off;

        /* From here on 'off' is the offset inside the current block. */
        nb->off = off = lbo & (blocksize - 1);

        for (;;) {
                /* Bytes to take from the current run fragment. */
                u32 len32 = min_t(u64, len, bytes);
                sector_t block = lbo >> sb->s_blocksize_bits;

                do {
                        u32 op;
                        struct buffer_head *bh;

                        /* No free slot left in nb->bh. */
                        if (nbh >= ARRAY_SIZE(nb->bh)) {
                                err = -EINVAL;
                                goto out;
                        }

                        /* Bytes of this block that belong to the request. */
                        op = blocksize - off;
                        if (op > len32)
                                op = len32;

                        if (op == blocksize) {
                                /* Whole block is requested: no read is issued. */
                                bh = sb_getblk(sb, block);
                                if (!bh) {
                                        err = -ENOMEM;
                                        goto out;
                                }
                                if (buffer_locked(bh))
                                        __wait_on_buffer(bh);
                                set_buffer_uptodate(bh);
                        } else {
                                /* Partial block: read its current content. */
                                bh = ntfs_bread(sb, block);
                                if (!bh) {
                                        err = -EIO;
                                        goto out;
                                }
                        }

                        nb->bh[nbh++] = bh;
                        bytes -= op;
                        if (!bytes) {
                                /* Range is fully covered. */
                                nb->nbufs = nbh;
                                return 0;
                        }

                        block += 1;
                        len32 -= op;
                        off = 0;
                } while (len32);

                /* Move to the next run entry; it must start where this one ended. */
                vcn_next = vcn + clen;
                if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) ||
                    vcn != vcn_next) {
                        err = -ENOENT;
                        goto out;
                }

                lbo = ((u64)lcn << cluster_bits);
                len = ((u64)clen << cluster_bits);
        }

out:
        /* Error path: drop every buffer collected so far. */
        while (nbh) {
                put_bh(nb->bh[--nbh]);
                nb->bh[nbh] = NULL;
        }

        nb->nbufs = 0;

        return err;
}

/*
 * ntfs_write_bh - Copy a record into its buffers, applying fixups.
 *
 * @rhdr is copied block by block into the buffers held in @nb. In the first
 * block the 16-bit value stored at offset fix_off is advanced by one (it
 * restarts at 1 once it has reached 0x7FFF) and the new value is also stored
 * back into @rhdr. Then, for every SECTOR_SIZE chunk, the last 16-bit word is
 * saved into the next slot after that value and replaced by the value itself.
 * Every buffer is marked uptodate and dirty; if @sync is non-zero it is also
 * written with sync_dirty_buffer().
 *
 * Return: 0 if ok, -EINVAL if fix_off / fix_num of @rhdr are inconsistent,
 * otherwise the first error returned by sync_dirty_buffer().
 */
int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr,
                  struct ntfs_buffers *nb, int sync)
{
        int err = 0;
        struct super_block *sb = sbi->sb;
        u32 block_size = sb->s_blocksize;
        u32 bytes = nb->bytes;
        u32 off = nb->off;
        u16 fo = le16_to_cpu(rhdr->fix_off);
        u16 fn = le16_to_cpu(rhdr->fix_num);
        u32 idx;
        __le16 *fixup;
        __le16 sample;

        /*
         * The fixup area must start at an even offset and fit in the first
         * sector. '!fn--' rejects fix_num == 0 and then decrements fn, so the
         * last clause compares (fix_num - 1) sectors against the record size.
         */
        if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- ||
            fn * SECTOR_SIZE > bytes) {
                return -EINVAL;
        }

        /* 'off' is non-zero only for the first buffer. */
        for (idx = 0; bytes && idx < nb->nbufs; idx += 1, off = 0) {
                u32 op = block_size - off;
                char *bh_data;
                struct buffer_head *bh = nb->bh[idx];
                __le16 *ptr, *end_data;

                if (op > bytes)
                        op = bytes;

                if (buffer_locked(bh))
                        __wait_on_buffer(bh);

                lock_buffer(bh);

                bh_data = bh->b_data + off;
                end_data = Add2Ptr(bh_data, op);
                memcpy(bh_data, rhdr, op);

                if (!idx) {
                        u16 t16;

                        /*
                         * First buffer: pick the next sample value. 'fixup'
                         * keeps pointing into this first buffer for all later
                         * iterations as well.
                         */
                        fixup = Add2Ptr(bh_data, fo);
                        sample = *fixup;
                        t16 = le16_to_cpu(sample);
                        if (t16 >= 0x7FFF) {
                                sample = *fixup = cpu_to_le16(1);
                        } else {
                                sample = cpu_to_le16(t16 + 1);
                                *fixup = sample;
                        }

                        /* Keep the in-memory record header in step. */
                        *(__le16 *)Add2Ptr(rhdr, fo) = sample;
                }

                /* Last 16-bit word of the first sector in this buffer. */
                ptr = Add2Ptr(bh_data, SECTOR_SIZE - sizeof(short));

                do {
                        /* Save the real word, then stamp the sector with 'sample'. */
                        *++fixup = *ptr;
                        *ptr = sample;
                        ptr += SECTOR_SIZE / sizeof(short);
                } while (ptr < end_data);

                set_buffer_uptodate(bh);
                mark_buffer_dirty(bh);
                unlock_buffer(bh);

                if (sync) {
                        int err2 = sync_dirty_buffer(bh);

                        /* Remember only the first failure; keep writing the rest. */
                        if (!err && err2)
                                err = err2;
                }

                bytes -= op;
                rhdr = Add2Ptr(rhdr, op);
        }

        return err;
}

/*
 * ntfs_bio_pages - Read/write pages from/to disk.
 *
 * Transfers the byte range [@vbo, @vbo + @bytes) of @run between the block
 * device and @pages using bios with operation @op. The range is first widened
 * to 512-byte boundaries. One bio is started per run fragment; when
 * bio_add_page() cannot take more data, a new bio is allocated, chained to the
 * previous one and the previous one is submitted.
 *
 * Return: 0 if ok (also when @bytes is 0), -ENOENT if the range is not mapped
 * or the next run entry does not continue the previous one, -EINVAL if more
 * pages are needed than @nr_pages, otherwise the result of submit_bio_wait()
 * on the last bio.
 */
int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run,
                   struct page **pages, u32 nr_pages, u64 vbo, u32 bytes,
                   enum req_op op)
{
        int err = 0;
        struct bio *new, *bio = NULL;
        struct super_block *sb = sbi->sb;
        struct block_device *bdev = sb->s_bdev;
        struct page *page;
        u8 cluster_bits = sbi->cluster_bits;
        CLST lcn, clen, vcn, vcn_next;
        u32 add, off, page_idx;
        u64 lbo, len;
        size_t run_idx;
        struct blk_plug plug;

        if (!bytes)
                return 0;

        blk_start_plug(&plug);

        /* Align vbo and bytes to be 512 bytes aligned. */
        lbo = (vbo + bytes + 511) & ~511ull;
        vbo = vbo & ~511ull;
        bytes = lbo - vbo;

        vcn = vbo >> cluster_bits;
        if (!run_lookup_entry(run, vcn, &lcn, &clen, &run_idx)) {
                err = -ENOENT;
                goto out;
        }
        /* Offset inside the first cluster; reset to 0 for later fragments. */
        off = vbo & sbi->cluster_mask;
        page_idx = 0;
        page = pages[0];

        for (;;) {
                /* Disk position and length of the current run fragment. */
                lbo = ((u64)lcn << cluster_bits) + off;
                len = ((u64)clen << cluster_bits) - off;
new_bio:
                new = bio_alloc(bdev, nr_pages - page_idx, op, GFP_NOFS);
                if (bio) {
                        /* Chain the previous bio to the new one and send it off. */
                        bio_chain(bio, new);
                        submit_bio(bio);
                }
                bio = new;
                bio->bi_iter.bi_sector = lbo >> 9;

                while (len) {
                        /* Here 'off' becomes the offset inside the current page. */
                        off = vbo & (PAGE_SIZE - 1);
                        add = off + len > PAGE_SIZE ? (PAGE_SIZE - off) : len;

                        /* Could not add the whole piece: continue in a fresh bio. */
                        if (bio_add_page(bio, page, add, off) < add)
                                goto new_bio;

                        /* Whole request is queued. */
                        if (bytes <= add)
                                goto out;
                        bytes -= add;
                        vbo += add;

                        if (add + off == PAGE_SIZE) {
                                /* Page is used up, step to the next one. */
                                page_idx += 1;
                                if (WARN_ON(page_idx >= nr_pages)) {
                                        err = -EINVAL;
                                        goto out;
                                }
                                page = pages[page_idx];
                        }

                        if (len <= add)
                                break;
                        len -= add;
                        lbo += add;
                }

                /* Next run entry must start where the current one ended. */
                vcn_next = vcn + clen;
                if (!run_get_entry(run, ++run_idx, &vcn, &lcn, &clen) ||
                    vcn != vcn_next) {
                        err = -ENOENT;
                        goto out;
                }
                off = 0;
        }
out:
        if (bio) {
                /* The last bio is submitted only if no error was recorded. */
                if (!err)
                        err = submit_bio_wait(bio);
                bio_put(bio);
        }
        blk_finish_plug(&plug);

        return err;
}

/*
 * ntfs_bio_fill_1 - Helper for ntfs_loadlog_and_replay().
 *
 * Fill the on-disk logfile range described by @run with 0xFF bytes (-1);
 * this means an empty logfile.
 *
 * A single page filled with 0xFF is allocated and added repeatedly to write
 * bios that cover every entry of @run, starting at vcn 0.
 *
 * Return: 0 if ok, -ENOMEM if the fill page cannot be allocated, -ENOENT if
 * @run has no entry for vcn 0, otherwise the result of submit_bio_wait() on
 * the last bio.
 */
int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run)
{
        int err = 0;
        struct super_block *sb = sbi->sb;
        struct block_device *bdev = sb->s_bdev;
        u8 cluster_bits = sbi->cluster_bits;
        struct bio *new, *bio = NULL;
        CLST lcn, clen;
        u64 lbo, len;
        size_t run_idx;
        struct page *fill;
        void *kaddr;
        struct blk_plug plug;

        fill = alloc_page(GFP_KERNEL);
        if (!fill)
                return -ENOMEM;

        /* Prepare the pattern page: every byte is 0xFF. */
        kaddr = kmap_atomic(fill);
        memset(kaddr, -1, PAGE_SIZE);
        kunmap_atomic(kaddr);
        flush_dcache_page(fill);
        lock_page(fill);

        if (!run_lookup_entry(run, 0, &lcn, &clen, &run_idx)) {
                err = -ENOENT;
                goto out;
        }

        /*
         * TODO: Try blkdev_issue_write_same.
         */
        blk_start_plug(&plug);
        do {
                /* Disk position and length of the current run entry. */
                lbo = (u64)lcn << cluster_bits;
                len = (u64)clen << cluster_bits;
new_bio:
                new = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS);
                if (bio) {
                        /* Chain the previous bio to the new one and send it off. */
                        bio_chain(bio, new);
                        submit_bio(bio);
                }
                bio = new;
                bio->bi_iter.bi_sector = lbo >> 9;

                for (;;) {
                        u32 add = len > PAGE_SIZE ? PAGE_SIZE : len;

                        /* Bio is full: continue at the current lbo in a new one. */
                        if (bio_add_page(bio, fill, add, 0) < add)
                                goto new_bio;

                        lbo += add;
                        if (len <= add)
                                break;
                        len -= add;
                }
        } while (run_get_entry(run, ++run_idx, NULL, &lcn, &clen));

        /* err is still 0 on this path; wait for the last bio of the chain. */
        if (!err)
                err = submit_bio_wait(bio);
        bio_put(bio);

        blk_finish_plug(&plug);
out:
        unlock_page(fill);
        put_page(fill);

        return err;
}

/*
 * ntfs_vbo_to_lbo - Map a virtual byte offset of a run to a disk byte offset.
 *
 * On success *@lbo receives the byte position that corresponds to @vbo, or -1
 * if the run entry is sparse, and *@bytes receives the number of bytes from
 * @vbo up to the end of that run entry.
 *
 * Return: 0 if ok, -ENOENT if @vbo is not covered by @run.
 */
int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run,
                    u64 vbo, u64 *lbo, u64 *bytes)
{
        const u8 shift = sbi->cluster_bits;
        CLST first_lcn, run_len;
        u32 in_cluster;

        if (!run_lookup_entry(run, vbo >> shift, &first_lcn, &run_len, NULL))
                return -ENOENT;

        /* Offset of vbo inside its cluster. */
        in_cluster = vbo & sbi->cluster_mask;

        if (first_lcn == SPARSE_LCN)
                *lbo = -1;
        else
                *lbo = ((u64)first_lcn << shift) + in_cluster;

        *bytes = ((u64)run_len << shift) - in_cluster;

        return 0;
}

/*
 * ntfs_new_inode - Allocate a VFS inode and format a new MFT record for it.
 *
 * The record @rno is formatted with mi_format_new() using @flag, the inode
 * number is set to @rno and the inode is inserted with insert_inode_locked().
 *
 * Return: the new ntfs inode, ERR_PTR(-ENOMEM) if no inode can be allocated,
 * ERR_PTR() of the mi_format_new() error, or ERR_PTR(-EIO) if
 * insert_inode_locked() fails. On failure the inode is marked bad and put.
 */
struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno,
                                  enum RECORD_FLAG flag)
{
        struct inode *vfs_inode = new_inode(sbi->sb);
        struct ntfs_inode *new_ni;
        int ret;

        if (!vfs_inode)
                return ERR_PTR(-ENOMEM);

        new_ni = ntfs_i(vfs_inode);

        ret = mi_format_new(&new_ni->mi, sbi, rno, flag, false);
        if (!ret) {
                vfs_inode->i_ino = rno;
                if (insert_inode_locked(vfs_inode) < 0)
                        ret = -EIO;
        }

        if (!ret)
                return new_ni;

        /* Failure: get rid of the half-initialized inode. */
        make_bad_inode(vfs_inode);
        iput(vfs_inode);

        return ERR_PTR(ret);
}

/*
 * Default security descriptor, 0x50 bytes, in self-relative form.
 *
 * O:BAG:BAD:(A;OICI;FA;;;WD)
 * Owner S-1-5-32-544 (Administrators)
 * Group S-1-5-32-544 (Administrators)
 * ACE: allow S-1-1-0 (Everyone) with FILE_ALL_ACCESS
 *
 * Byte layout, little endian (field names assume the usual
 * SECURITY_DESCRIPTOR_RELATIVE / ACL / ACE / SID layout - confirm against
 * the structure definitions):
 * 0x00: Revision 1, Sbz1 0, Control 0x8004
 * 0x04: Owner offset 0x30, Group offset 0x40, Sacl offset 0, Dacl offset 0x14
 * 0x14: ACL: revision 2, size 0x1C, one ACE
 * 0x1C: ACE: type 0, flags 0x03, size 0x14, mask 0x001F01FF, SID S-1-1-0
 * 0x30: Owner SID S-1-5-32-544
 * 0x40: Group SID S-1-5-32-544
 */
const u8 s_default_security[] __aligned(8) = {
        0x01, 0x00, 0x04, 0x80, 0x30, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x1C, 0x00,
        0x01, 0x00, 0x00, 0x00, 0x00, 0x03, 0x14, 0x00, 0xFF, 0x01, 0x1F, 0x00,
        0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
        0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x20, 0x00, 0x00, 0x00,
        0x20, 0x02, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05,
        0x20, 0x00, 0x00, 0x00, 0x20, 0x02, 0x00, 0x00,
};

/* Guard against accidental edits of the table above. */
static_assert(sizeof(s_default_security) == 0x50);

static inline u32 sid_length(const struct SID *sid)
{
        return struct_size(sid, SubAuthority, sid->SubAuthorityCount);
}

/*
 * is_acl_valid - Sanity check an ACL that has @len bytes available.
 *
 * The caller must guarantee that at least sizeof(struct ACL) bytes are
 * readable at @acl. The ACL is accepted when its revision is ACL_REVISION or
 * ACL_REVISION_DS, both Sbz fields are zero, AclSize does not exceed @len and
 * AceCount ACEs can be walked without leaving the @len bytes.
 *
 * Thanks Mark Harmstone for idea.
 */
static bool is_acl_valid(const struct ACL *acl, u32 len)
{
        const struct ACE_HEADER *cur;
        u32 remaining;
        u16 todo;

        /*
         * The revision should be ACL_REVISION, unless the ACL contains an
         * object-specific ACE, in which case it must be ACL_REVISION_DS.
         * All ACEs in an ACL must be at the same revision level.
         */
        if (!(acl->AclRevision == ACL_REVISION ||
              acl->AclRevision == ACL_REVISION_DS))
                return false;

        /* Reserved fields must be zero. */
        if (acl->Sbz1 || acl->Sbz2)
                return false;

        if (le16_to_cpu(acl->AclSize) > len)
                return false;

        /* Walk the ACEs that follow the ACL header. */
        remaining = len - sizeof(struct ACL);
        cur = (const struct ACE_HEADER *)(acl + 1);

        for (todo = le16_to_cpu(acl->AceCount); todo; todo--) {
                u16 sz;

                if (remaining < sizeof(struct ACE_HEADER))
                        return false;

                sz = le16_to_cpu(cur->AceSize);
                if (remaining < sz)
                        return false;

                remaining -= sz;
                cur = Add2Ptr(cur, sz);
        }

        return true;
}

/*
 * sd_span_fits - Check that @need bytes starting at offset @off fit into a
 * buffer of @len bytes.
 *
 * Written with a subtraction instead of "off + need > len" so that a huge
 * offset taken from the descriptor cannot wrap around and pass the check.
 */
static inline bool sd_span_fits(u32 off, u32 need, u32 len)
{
        return off <= len && need <= len - off;
}

/*
 * sd_sid_fits - Validate the SID stored @off bytes into @sd.
 *
 * The fixed SID header must be inside the buffer before it is read; then the
 * revision must be 1 and the whole SID, including its sub-authorities, must
 * fit as well.
 */
static bool sd_sid_fits(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 off,
                        u32 len)
{
        const struct SID *sid = Add2Ptr(sd, off);

        if (!sd_span_fits(off, offsetof(struct SID, SubAuthority), len))
                return false;

        if (sid->Revision != 1)
                return false;

        return sd_span_fits(off, sid_length(sid), len);
}

/*
 * sd_acl_fits - Validate the ACL stored @off bytes into @sd.
 *
 * The ACL header must be inside the buffer before is_acl_valid() reads it.
 */
static bool sd_acl_fits(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 off,
                        u32 len)
{
        if (!sd_span_fits(off, sizeof(struct ACL), len))
                return false;

        return is_acl_valid(Add2Ptr(sd, off), len - off);
}

/*
 * is_sd_valid - Sanity check a self-relative security descriptor.
 *
 * @sd:  descriptor to check.
 * @len: number of bytes available at @sd.
 *
 * The descriptor is accepted when it is at least as large as its fixed
 * header, has Revision 1, a zero Sbz1, the SE_SELF_RELATIVE control bit, and
 * every non-zero Owner / Group / Sacl / Dacl offset points at a SID or ACL
 * that lies completely inside the @len bytes. A zero offset means that the
 * component is absent and is not checked.
 *
 * Return: true if the descriptor passed all checks.
 */
bool is_sd_valid(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 len)
{
        u32 sd_owner, sd_group, sd_sacl, sd_dacl;

        if (len < sizeof(struct SECURITY_DESCRIPTOR_RELATIVE))
                return false;

        if (sd->Revision != 1)
                return false;

        if (sd->Sbz1)
                return false;

        if (!(sd->Control & SE_SELF_RELATIVE))
                return false;

        sd_owner = le32_to_cpu(sd->Owner);
        if (sd_owner && !sd_sid_fits(sd, sd_owner, len))
                return false;

        sd_group = le32_to_cpu(sd->Group);
        if (sd_group && !sd_sid_fits(sd, sd_group, len))
                return false;

        sd_sacl = le32_to_cpu(sd->Sacl);
        if (sd_sacl && !sd_acl_fits(sd, sd_sacl, len))
                return false;

        sd_dacl = le32_to_cpu(sd->Dacl);
        if (sd_dacl && !sd_acl_fits(sd, sd_dacl, len))
                return false;

        return true;
}

/*
 * ntfs_security_init - Load and parse $Secure.
 *
 * Loads the $Secure inode, validates and initializes its $SDH and $SII index
 * roots, then scans all $SII entries to find the highest security id in use.
 * On success the inode is kept in sbi->security.ni, and
 * sbi->security.next_id / next_off describe where the next descriptor goes.
 *
 * Return: 0 if ok, -EINVAL if an index root is missing or inconsistent,
 * -ENOMEM if no find context can be allocated, or the error returned by
 * ntfs_iget5() / indx_init() / indx_find_raw().
 */
int ntfs_security_init(struct ntfs_sb_info *sbi)
{
        int err;
        struct super_block *sb = sbi->sb;
        struct inode *inode;
        struct ntfs_inode *ni;
        struct MFT_REF ref;
        struct ATTRIB *attr;
        struct ATTR_LIST_ENTRY *le;
        u64 sds_size;
        size_t off;
        struct NTFS_DE *ne;
        struct NTFS_DE_SII *sii_e;
        struct ntfs_fnd *fnd_sii = NULL;
        const struct INDEX_ROOT *root_sii;
        const struct INDEX_ROOT *root_sdh;
        struct ntfs_index *indx_sdh = &sbi->security.index_sdh;
        struct ntfs_index *indx_sii = &sbi->security.index_sii;

        /* Reference to the $Secure record; the sequence number is set to the same value. */
        ref.low = cpu_to_le32(MFT_REC_SECURE);
        ref.high = 0;
        ref.seq = cpu_to_le16(MFT_REC_SECURE);

        inode = ntfs_iget5(sb, &ref, &NAME_SECURE);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                ntfs_err(sb, "Failed to load $Secure (%d).", err);
                /* Nothing to put at 'out'. */
                inode = NULL;
                goto out;
        }

        ni = ntfs_i(inode);

        le = NULL;

        /*
         * $SDH root: must be resident, large enough for an INDEX_ROOT, use the
         * security hash collation, and its used size must fit in the attribute.
         */
        attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SDH_NAME,
                            ARRAY_SIZE(SDH_NAME), NULL, NULL);
        if (!attr ||
            !(root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT))) ||
            root_sdh->type != ATTR_ZERO ||
            root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH ||
            offsetof(struct INDEX_ROOT, ihdr) +
                            le32_to_cpu(root_sdh->ihdr.used) >
                    le32_to_cpu(attr->res.data_size)) {
                ntfs_err(sb, "$Secure::$SDH is corrupted.");
                err = -EINVAL;
                goto out;
        }

        err = indx_init(indx_sdh, sbi, attr, INDEX_MUTEX_SDH);
        if (err) {
                ntfs_err(sb, "Failed to initialize $Secure::$SDH (%d).", err);
                goto out;
        }

        /* $SII root: same checks, but with the unsigned integer collation. */
        attr = ni_find_attr(ni, attr, &le, ATTR_ROOT, SII_NAME,
                            ARRAY_SIZE(SII_NAME), NULL, NULL);
        if (!attr ||
            !(root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT))) ||
            root_sii->type != ATTR_ZERO ||
            root_sii->rule != NTFS_COLLATION_TYPE_UINT ||
            offsetof(struct INDEX_ROOT, ihdr) +
                            le32_to_cpu(root_sii->ihdr.used) >
                    le32_to_cpu(attr->res.data_size)) {
                ntfs_err(sb, "$Secure::$SII is corrupted.");
                err = -EINVAL;
                goto out;
        }

        err = indx_init(indx_sii, sbi, attr, INDEX_MUTEX_SII);
        if (err) {
                ntfs_err(sb, "Failed to initialize $Secure::$SII (%d).", err);
                goto out;
        }

        fnd_sii = fnd_get();
        if (!fnd_sii) {
                err = -ENOMEM;
                goto out;
        }

        sds_size = inode->i_size;

        /* Find the last valid Id. */
        sbi->security.next_id = SECURITY_ID_FIRST;
        /*
         * Always write new security at the end of bucket.
         * NOTE(review): sds_size is u64, so this subtraction wraps if i_size
         * is smaller than SecurityDescriptorsBlockSize - confirm that cannot
         * happen for a valid volume.
         */
        sbi->security.next_off =
                ALIGN(sds_size - SecurityDescriptorsBlockSize, 16);

        off = 0;
        ne = NULL;

        /* Enumerate every $SII entry and track the highest sec_id + 1. */
        for (;;) {
                u32 next_id;

                err = indx_find_raw(indx_sii, ni, root_sii, &ne, &off, fnd_sii);
                if (err || !ne)
                        break;

                sii_e = (struct NTFS_DE_SII *)ne;
                /* Skip entries whose data is too small to hold a security header. */
                if (le16_to_cpu(ne->view.data_size) < sizeof(sii_e->sec_hdr))
                        continue;

                next_id = le32_to_cpu(sii_e->sec_id) + 1;
                if (next_id >= sbi->security.next_id)
                        sbi->security.next_id = next_id;
        }

        /*
         * Reached with or without an error from the loop: the inode is handed
         * over to sbi, so it must not be put below.
         */
        sbi->security.ni = ni;
        inode = NULL;
out:
        iput(inode);
        fnd_put(fnd_sii);

        return err;
}

/*
 * ntfs_get_security_by_id - Read security descriptor by id.
 *
 * Looks @security_id up in the $SII index, reads the security header and the
 * descriptor that follows it from the inode's data run, and returns the
 * descriptor in a buffer allocated with kmalloc().
 *
 * On success with a matching entry, *@sd points to the descriptor (the caller
 * owns the buffer) and *@size holds its size without the header. If the id is
 * not present in the index, 0 is returned and *@sd stays NULL.
 *
 * Return: 0 if ok, -ENOMEM on allocation failure, -EINVAL if the index root
 * is missing, the stored size is smaller than a header, or the header on disk
 * differs from the index copy, -EFBIG if the descriptor looks too big,
 * otherwise the error of indx_find() / ntfs_read_run_nb().
 */
int ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id,
                            struct SECURITY_DESCRIPTOR_RELATIVE **sd,
                            size_t *size)
{
        int err;
        int diff;
        struct ntfs_inode *ni = sbi->security.ni;
        struct ntfs_index *indx = &sbi->security.index_sii;
        void *p = NULL;
        struct NTFS_DE_SII *sii_e;
        struct ntfs_fnd *fnd_sii;
        struct SECURITY_HDR d_security;
        const struct INDEX_ROOT *root_sii;
        u32 t32;

        *sd = NULL;

        mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_SECURITY);

        fnd_sii = fnd_get();
        if (!fnd_sii) {
                err = -ENOMEM;
                goto out;
        }

        root_sii = indx_get_root(indx, ni, NULL, NULL);
        if (!root_sii) {
                err = -EINVAL;
                goto out;
        }

        /* Try to find this SECURITY descriptor in SII indexes. */
        err = indx_find(indx, ni, root_sii, &security_id, sizeof(security_id),
                        NULL, &diff, (struct NTFS_DE **)&sii_e, fnd_sii);
        if (err)
                goto out;

        /* No exact match: leave with err == 0 and *sd == NULL. */
        if (diff)
                goto out;

        /* Total size stored in the index entry: header plus descriptor. */
        t32 = le32_to_cpu(sii_e->sec_hdr.size);
        if (t32 < sizeof(struct SECURITY_HDR)) {
                err = -EINVAL;
                goto out;
        }

        if (t32 > sizeof(struct SECURITY_HDR) + 0x10000) {
                /* Looks like too big security. 0x10000 - is arbitrary big number. */
                err = -EFBIG;
                goto out;
        }

        *size = t32 - sizeof(struct SECURITY_HDR);

        p = kmalloc(*size, GFP_NOFS);
        if (!p) {
                err = -ENOMEM;
                goto out;
        }

        /* Read the header stored in the data stream ... */
        err = ntfs_read_run_nb(sbi, &ni->file.run,
                               le64_to_cpu(sii_e->sec_hdr.off), &d_security,
                               sizeof(d_security), NULL);
        if (err)
                goto out;

        /* ... and make sure it is identical to the copy kept in the index. */
        if (memcmp(&d_security, &sii_e->sec_hdr, sizeof(d_security))) {
                err = -EINVAL;
                goto out;
        }

        /* The descriptor itself follows the header. */
        err = ntfs_read_run_nb(sbi, &ni->file.run,
                               le64_to_cpu(sii_e->sec_hdr.off) +
                                       sizeof(struct SECURITY_HDR),
                               p, *size, NULL);
        if (err)
                goto out;

        /* Hand the buffer over to the caller; nothing left to free here. */
        *sd = p;
        p = NULL;

out:
        kfree(p);
        fnd_put(fnd_sii);
        ni_unlock(ni);

        return err;
}

/*
 * ntfs_insert_security - Insert security descriptor into $Secure::SDS.
 *
 * SECURITY Descriptor Stream data is organized into chunks of 256K bytes
 * and it contains a mirror copy of each security descriptor.  When writing
 * to a security descriptor at location X, another copy will be written at
 * location (X+256K).
 * When writing a security descriptor that will cross the 256K boundary,
 * the pointer will be advanced by 256K to skip
 * over the mirror portion.
 *
 * @sd / @size_sd describe the descriptor to store. If an identical descriptor
 * is already present (same hash, size and bytes), its id is returned in
 * *@security_id and nothing is written. Otherwise the descriptor is written
 * twice (main copy and mirror), entries are added to the $SII and $SDH
 * indexes, *@security_id receives the new id and *@inserted (if not NULL) is
 * set to true.
 *
 * Return: 0 if ok, -ENOMEM on allocation failure, -EINVAL if an index root
 * cannot be obtained, otherwise the error of the failing helper. On any
 * error *@security_id is SECURITY_ID_INVALID.
 */
int ntfs_insert_security(struct ntfs_sb_info *sbi,
                         const struct SECURITY_DESCRIPTOR_RELATIVE *sd,
                         u32 size_sd, __le32 *security_id, bool *inserted)
{
        int err, diff;
        struct ntfs_inode *ni = sbi->security.ni;
        struct ntfs_index *indx_sdh = &sbi->security.index_sdh;
        struct ntfs_index *indx_sii = &sbi->security.index_sii;
        struct NTFS_DE_SDH *e;
        struct NTFS_DE_SDH sdh_e;
        struct NTFS_DE_SII sii_e;
        struct SECURITY_HDR *d_security;
        u32 new_sec_size = size_sd + sizeof(struct SECURITY_HDR);
        u32 aligned_sec_size = ALIGN(new_sec_size, 16);
        struct SECURITY_KEY hash_key;
        struct ntfs_fnd *fnd_sdh = NULL;
        const struct INDEX_ROOT *root_sdh;
        const struct INDEX_ROOT *root_sii;
        u64 mirr_off, new_sds_size;
        u32 next, left;

        static_assert((1 << Log2OfSecurityDescriptorsBlockSize) ==
                      SecurityDescriptorsBlockSize);

        hash_key.hash = security_hash(sd, size_sd);
        hash_key.sec_id = SECURITY_ID_INVALID;

        if (inserted)
                *inserted = false;
        *security_id = SECURITY_ID_INVALID;

        /*
         * Allocate a temporary buffer: header + descriptor, zero padded to
         * a multiple of 16 bytes.
         */
        d_security = kzalloc(aligned_sec_size, GFP_NOFS);
        if (!d_security)
                return -ENOMEM;

        mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_SECURITY);

        fnd_sdh = fnd_get();
        if (!fnd_sdh) {
                err = -ENOMEM;
                goto out;
        }

        root_sdh = indx_get_root(indx_sdh, ni, NULL, NULL);
        if (!root_sdh) {
                err = -EINVAL;
                goto out;
        }

        root_sii = indx_get_root(indx_sii, ni, NULL, NULL);
        if (!root_sii) {
                err = -EINVAL;
                goto out;
        }

        /*
         * Check if such security already exists.
         * Use "SDH" and hash -> to get the offset in "SDS".
         */
        err = indx_find(indx_sdh, ni, root_sdh, &hash_key, sizeof(hash_key),
                        &d_security->key.sec_id, &diff, (struct NTFS_DE **)&e,
                        fnd_sdh);
        if (err)
                goto out;

        /* Walk all SDH entries that carry the same hash. */
        while (e) {
                if (le32_to_cpu(e->sec_hdr.size) == new_sec_size) {
                        /* Same size: read the stored copy and compare byte by byte. */
                        err = ntfs_read_run_nb(sbi, &ni->file.run,
                                               le64_to_cpu(e->sec_hdr.off),
                                               d_security, new_sec_size, NULL);
                        if (err)
                                goto out;

                        if (le32_to_cpu(d_security->size) == new_sec_size &&
                            d_security->key.hash == hash_key.hash &&
                            !memcmp(d_security + 1, sd, size_sd)) {
                                /* Such security already exists. */
                                *security_id = d_security->key.sec_id;
                                err = 0;
                                goto out;
                        }
                }

                err = indx_find_sort(indx_sdh, ni, root_sdh,
                                     (struct NTFS_DE **)&e, fnd_sdh);
                if (err)
                        goto out;

                if (!e || e->key.hash != hash_key.hash)
                        break;
        }

        /* Position inside the current 256K block and room left in it. */
        next = sbi->security.next_off & (SecurityDescriptorsBlockSize - 1);
        left = SecurityDescriptorsBlockSize - next;

        /*
         * Not enough room before the block boundary: skip the rest of this
         * block and the mirror block that follows it. Note that nothing is
         * zeroed here, only next_off is advanced.
         */
        if (left < new_sec_size) {
                /* Zero "left" bytes from sbi->security.next_off. */
                sbi->security.next_off += SecurityDescriptorsBlockSize + left;
        }

        /* Zero tail of previous security. (Not implemented, see below.) */
        //used = ni->vfs_inode.i_size & (SecurityDescriptorsBlockSize - 1);

        /*
         * Example:
         * 0x40438 == ni->vfs_inode.i_size
         * 0x00440 == sbi->security.next_off
         * need to zero [0x438-0x440)
         * if (next > used) {
         *  u32 tozero = next - used;
         *  zero "tozero" bytes from sbi->security.next_off - tozero
         */

        /* Format new security descriptor. */
        d_security->key.hash = hash_key.hash;
        d_security->key.sec_id = cpu_to_le32(sbi->security.next_id);
        d_security->off = cpu_to_le64(sbi->security.next_off);
        d_security->size = cpu_to_le32(new_sec_size);
        memcpy(d_security + 1, sd, size_sd);

        /* Write main SDS bucket. */
        err = ntfs_sb_write_run(sbi, &ni->file.run, sbi->security.next_off,
                                d_security, aligned_sec_size, 0);

        if (err)
                goto out;

        /* The mirror copy lives exactly one block (256K) further. */
        mirr_off = sbi->security.next_off + SecurityDescriptorsBlockSize;
        new_sds_size = mirr_off + aligned_sec_size;

        /* Grow the SDS stream if the mirror copy ends beyond its current size. */
        if (new_sds_size > ni->vfs_inode.i_size) {
                err = attr_set_size(ni, ATTR_DATA, SDS_NAME,
                                    ARRAY_SIZE(SDS_NAME), &ni->file.run,
                                    new_sds_size, &new_sds_size, false, NULL);
                if (err)
                        goto out;
        }

        /* Write copy SDS bucket. */
        err = ntfs_sb_write_run(sbi, &ni->file.run, mirr_off, d_security,
                                aligned_sec_size, 0);
        if (err)
                goto out;

        /* Fill SII entry. */
        sii_e.de.view.data_off =
                cpu_to_le16(offsetof(struct NTFS_DE_SII, sec_hdr));
        sii_e.de.view.data_size = cpu_to_le16(sizeof(struct SECURITY_HDR));
        sii_e.de.view.res = 0;
        sii_e.de.size = cpu_to_le16(sizeof(struct NTFS_DE_SII));
        sii_e.de.key_size = cpu_to_le16(sizeof(d_security->key.sec_id));
        sii_e.de.flags = 0;
        sii_e.de.res = 0;
        sii_e.sec_id = d_security->key.sec_id;
        memcpy(&sii_e.sec_hdr, d_security, sizeof(struct SECURITY_HDR));

        err = indx_insert_entry(indx_sii, ni, &sii_e.de, NULL, NULL, 0);
        if (err)
                goto out;

        /* Fill SDH entry. */
        sdh_e.de.view.data_off =
                cpu_to_le16(offsetof(struct NTFS_DE_SDH, sec_hdr));
        sdh_e.de.view.data_size = cpu_to_le16(sizeof(struct SECURITY_HDR));
        sdh_e.de.view.res = 0;
        sdh_e.de.size = cpu_to_le16(SIZEOF_SDH_DIRENTRY);
        sdh_e.de.key_size = cpu_to_le16(sizeof(sdh_e.key));
        sdh_e.de.flags = 0;
        sdh_e.de.res = 0;
        sdh_e.key.hash = d_security->key.hash;
        sdh_e.key.sec_id = d_security->key.sec_id;
        memcpy(&sdh_e.sec_hdr, d_security, sizeof(struct SECURITY_HDR));
        sdh_e.magic[0] = cpu_to_le16('I');
        sdh_e.magic[1] = cpu_to_le16('I');

        /* Reset the find context used for the lookup above before reusing it. */
        fnd_clear(fnd_sdh);
        err = indx_insert_entry(indx_sdh, ni, &sdh_e.de, (void *)(size_t)1,
                                fnd_sdh, 0);
        if (err)
                goto out;

        *security_id = d_security->key.sec_id;
        if (inserted)
                *inserted = true;

        /* Update Id and offset for next descriptor. */
        sbi->security.next_id += 1;
        sbi->security.next_off += aligned_sec_size;

out:
        /* Common exit, also taken on errors and when the descriptor already existed. */
        fnd_put(fnd_sdh);
        mark_inode_dirty(&ni->vfs_inode);
        ni_unlock(ni);
        kfree(d_security);

        return err;
}

/*
 * ntfs_reparse_init - Load and parse $Extend/$Reparse.
 */
int ntfs_reparse_init(struct ntfs_sb_info *sbi)
{
        int err;
        struct ntfs_inode *ni = sbi->reparse.ni;
        struct ntfs_index *indx = &sbi->reparse.index_r;
        struct ATTRIB *attr;
        struct ATTR_LIST_ENTRY *le;
        const struct INDEX_ROOT *root_r;

        if (!ni)
                return 0;

        le = NULL;
        attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SR_NAME,
                            ARRAY_SIZE(SR_NAME), NULL, NULL);
        if (!attr) {
                err = -EINVAL;
                goto out;
        }

        root_r = resident_data(attr);
        if (root_r->type != ATTR_ZERO ||
            root_r->rule != NTFS_COLLATION_TYPE_UINTS) {
                err = -EINVAL;
                goto out;
        }

        err = indx_init(indx, sbi, attr, INDEX_MUTEX_SR);
        if (err)
                goto out;

out:
        return err;
}

/*
 * ntfs_objid_init - Load and parse $Extend/$ObjId.
 */
int ntfs_objid_init(struct ntfs_sb_info *sbi)
{
        int err;
        struct ntfs_inode *ni = sbi->objid.ni;
        struct ntfs_index *indx = &sbi->objid.index_o;
        struct ATTRIB *attr;
        struct ATTR_LIST_ENTRY *le;
        const struct INDEX_ROOT *root;

        if (!ni)
                return 0;

        le = NULL;
        attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SO_NAME,
                            ARRAY_SIZE(SO_NAME), NULL, NULL);
        if (!attr) {
                err = -EINVAL;
                goto out;
        }

        root = resident_data(attr);
        if (root->type != ATTR_ZERO ||
            root->rule != NTFS_COLLATION_TYPE_UINTS) {
                err = -EINVAL;
                goto out;
        }

        err = indx_init(indx, sbi, attr, INDEX_MUTEX_SO);
        if (err)
                goto out;

out:
        return err;
}

/*
 * ntfs_objid_remove - Delete the entry keyed by @guid from the object-id index.
 *
 * Return: -EINVAL if sbi has no object-id inode, otherwise the result of
 * indx_delete_entry(). The inode is marked dirty in either case.
 */
int ntfs_objid_remove(struct ntfs_sb_info *sbi, struct GUID *guid)
{
        struct ntfs_inode *ni = sbi->objid.ni;
        int err;

        if (!ni)
                return -EINVAL;

        mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_OBJID);
        err = indx_delete_entry(&sbi->objid.index_o, ni, guid, sizeof(*guid),
                                NULL);
        mark_inode_dirty(&ni->vfs_inode);
        ni_unlock(ni);

        return err;
}

/*
 * ntfs_insert_reparse - Add a (rtag, ref) key to the reparse index.
 *
 * Return: -EINVAL if sbi has no reparse inode, otherwise the result of
 * indx_insert_entry(). The inode is marked dirty in either case.
 */
int ntfs_insert_reparse(struct ntfs_sb_info *sbi, __le32 rtag,
                        const struct MFT_REF *ref)
{
        struct ntfs_inode *ni = sbi->reparse.ni;
        struct NTFS_DE_R re;
        int err;

        if (!ni)
                return -EINVAL;

        /* Start from an all-zero entry, then fill in the header and the key. */
        memset(&re, 0, sizeof(re));
        re.de.view.data_off = cpu_to_le16(offsetof(struct NTFS_DE_R, zero));
        re.de.size = cpu_to_le16(sizeof(re));
        re.de.key_size = cpu_to_le16(sizeof(re.key));
        re.key.ReparseTag = rtag;
        re.key.ref = *ref;

        mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_REPARSE);
        err = indx_insert_entry(&sbi->reparse.index_r, ni, &re.de, NULL, NULL,
                                0);
        mark_inode_dirty(&ni->vfs_inode);
        ni_unlock(ni);

        return err;
}

/*
 * ntfs_remove_reparse - Delete a (rtag, ref) key from the reparse index.
 *
 * When @rtag is non-zero the key is deleted directly. When @rtag is zero the
 * index is searched with the tag ignored, and the key that was found (with
 * its stored tag) is the one deleted.
 *
 * Return: -EINVAL if sbi has no reparse inode or the index root cannot be
 * obtained, -ENOMEM if no search context could be allocated, otherwise the
 * result of indx_find()/indx_delete_entry(). Note that the "reference does
 * not match" path below leaves with err == 0 and nothing deleted.
 */
int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag,
                        const struct MFT_REF *ref)
{
        int err, diff;
        struct ntfs_inode *ni = sbi->reparse.ni;
        struct ntfs_index *indx = &sbi->reparse.index_r;
        struct ntfs_fnd *fnd = NULL;
        struct REPARSE_KEY rkey;
        struct NTFS_DE_R *re;
        struct INDEX_ROOT *root_r;

        if (!ni)
                return -EINVAL;

        rkey.ReparseTag = rtag;
        rkey.ref = *ref;

        mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_REPARSE);

        /* Full key known: delete it without searching first. */
        if (rtag) {
                err = indx_delete_entry(indx, ni, &rkey, sizeof(rkey), NULL);
                goto out1;
        }

        fnd = fnd_get();
        if (!fnd) {
                err = -ENOMEM;
                goto out1;
        }

        root_r = indx_get_root(indx, ni, NULL, NULL);
        if (!root_r) {
                err = -EINVAL;
                goto out;
        }

        /* 1 - forces to ignore rkey.ReparseTag when comparing keys. */
        err = indx_find(indx, ni, root_r, &rkey, sizeof(rkey), (void *)1, &diff,
                        (struct NTFS_DE **)&re, fnd);
        if (err)
                goto out;

        if (memcmp(&re->key.ref, ref, sizeof(*ref))) {
                /*
                 * Impossible. Looks like volume corrupt?
                 * err is still 0 here, so the caller sees success.
                 */
                goto out;
        }

        /* Take the complete key (including the stored tag) from the entry. */
        memcpy(&rkey, &re->key, sizeof(rkey));

        /*
         * Release the search context before deleting; resetting fnd to NULL
         * means the fnd_put() at 'out' is then called with NULL.
         */
        fnd_put(fnd);
        fnd = NULL;

        err = indx_delete_entry(indx, ni, &rkey, sizeof(rkey), NULL);
        if (err)
                goto out;

out:
        fnd_put(fnd);

out1:
        /* The inode is marked dirty on every path, including failures. */
        mark_inode_dirty(&ni->vfs_inode);
        ni_unlock(ni);

        return err;
}

/*
 * ntfs_unmap_and_discard - Call ntfs_unmap_meta() and then ntfs_discard()
 * for the cluster run starting at @lcn and spanning @len clusters.
 */
static inline void ntfs_unmap_and_discard(struct ntfs_sb_info *sbi, CLST lcn,
                                          CLST len)
{
        ntfs_unmap_meta(sbi->sb, lcn, len);
        ntfs_discard(sbi, lcn, len);
}

/*
 * mark_as_free_ex - Mark the clusters [lcn, lcn + len) as free in the
 * cluster bitmap (sbi->used.bitmap) and try to extend the MFT zone.
 *
 * @sbi:  filesystem instance.
 * @lcn:  first cluster of the run.
 * @len:  number of clusters in the run.
 * @trim: if true, each freed run is also passed to ntfs_unmap_and_discard().
 *
 * If wnd_is_used() does not report the whole run as used, only the sub-runs
 * that are individually reported as used get freed, and the volume state is
 * set to NTFS_DIRTY_ERROR after the bitmap lock has been dropped.
 */
void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim)
{
        CLST end, i, zone_len, zlen;
        struct wnd_bitmap *wnd = &sbi->used.bitmap;
        bool dirty = false;

        down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
        if (!wnd_is_used(wnd, lcn, len)) {
                /* mark volume as dirty out of wnd->rw_lock */
                dirty = true;

                /*
                 * Walk the range cluster by cluster. From here on lcn/len are
                 * reused to describe the current run of used clusters.
                 */
                end = lcn + len;
                len = 0;
                for (i = lcn; i < end; i++) {
                        if (wnd_is_used(wnd, i, 1)) {
                                /* Start a new run or extend the current one. */
                                if (!len)
                                        lcn = i;
                                len += 1;
                                continue;
                        }

                        /* Cluster not in use and no run pending: skip it. */
                        if (!len)
                                continue;

                        /* A run just ended: release it. */
                        if (trim)
                                ntfs_unmap_and_discard(sbi, lcn, len);

                        wnd_set_free(wnd, lcn, len);
                        len = 0;
                }

                /* No run reaches the end of the range: nothing left to free. */
                if (!len)
                        goto out;

                /* Otherwise the trailing run is released by the code below. */
        }

        if (trim)
                ntfs_unmap_and_discard(sbi, lcn, len);
        wnd_set_free(wnd, lcn, len);

        /* append to MFT zone, if possible. */
        zone_len = wnd_zone_len(wnd);
        zlen = min(zone_len + len, sbi->zone_max);

        if (zlen == zone_len) {
                /* MFT zone already has maximum size. */
        } else if (!zone_len) {
                /* Create MFT zone only if 'zlen' is large enough. */
                if (zlen == sbi->zone_max)
                        wnd_zone_set(wnd, lcn, zlen);
        } else {
                CLST zone_lcn = wnd_zone_bit(wnd);

                if (lcn + len == zone_lcn) {
                        /* Append into head MFT zone. */
                        wnd_zone_set(wnd, lcn, zlen);
                } else if (zone_lcn + zone_len == lcn) {
                        /* Append into tail MFT zone. */
                        wnd_zone_set(wnd, zone_lcn, zlen);
                }
                /* A freed run not adjacent to the zone leaves it unchanged. */
        }

out:
        up_write(&wnd->rw_lock);
        if (dirty)
                ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
}

/*
 * run_deallocate - Deallocate clusters.
 */
int run_deallocate(struct ntfs_sb_info *sbi, const struct runs_tree *run,
                   bool trim)
{
        CLST lcn, len;
        size_t idx;

        /* Visit every entry of @run; sparse entries are skipped. */
        for (idx = 0; run_get_entry(run, idx, NULL, &lcn, &len); idx++) {
                if (lcn != SPARSE_LCN)
                        mark_as_free_ex(sbi, lcn, len, trim);
        }

        /* Always reports success. */
        return 0;
}

/*
 * name_has_forbidden_chars - Check a name for characters Windows rejects.
 *
 * Return: true if @fname contains a control character (below 0x20), one of
 * \ / : * ? < > | ", or ends with a space or a dot; false otherwise.
 */
static inline bool name_has_forbidden_chars(const struct le_str *fname)
{
        int pos, last;

        for (pos = 0; pos < fname->len; pos++) {
                int c = le16_to_cpu(fname->name[pos]);

                /* control chars */
                if (c < 0x20)
                        return true;

                /* disallowed by Windows */
                if (c == '\\' || c == '/' || c == ':' || c == '*' ||
                    c == '?' || c == '<' || c == '>' || c == '|' ||
                    c == '\"')
                        return true;
        }

        /* An empty name has no trailing character to inspect. */
        if (!fname->len)
                return false;

        /* file names cannot end with space or . */
        last = le16_to_cpu(fname->name[fname->len - 1]);
        return last == ' ' || last == '.';
}

/*
 * is_reserved_name - Check a name against the reserved device names.
 *
 * Return: true if @fname, alone or followed by ".<anything>", matches
 * CON_NAME, NUL_NAME, AUX_NAME or PRN_NAME, or matches COM_NAME or LPT_NAME
 * followed by a digit 1..9. Names are compared through ntfs_cmp_names()
 * with sbi->upcase.
 */
static inline bool is_reserved_name(const struct ntfs_sb_info *sbi,
                                    const struct le_str *fname)
{
        const u16 *upcase = sbi->upcase;
        const __le16 *name = fname->name;
        int len = fname->len;
        int port_digit;

        /* 3 chars reserved names, bare or with any extension. */
        if ((len == 3 || (len > 3 && le16_to_cpu(name[3]) == '.')) &&
            (!ntfs_cmp_names(name, 3, CON_NAME, 3, upcase, false) ||
             !ntfs_cmp_names(name, 3, NUL_NAME, 3, upcase, false) ||
             !ntfs_cmp_names(name, 3, AUX_NAME, 3, upcase, false) ||
             !ntfs_cmp_names(name, 3, PRN_NAME, 3, upcase, false)))
                return true;

        /* 4 chars reserved names need exactly 4 chars before end or '.'. */
        if (len < 4 || (len > 4 && le16_to_cpu(name[4]) != '.'))
                return false;

        /* Port name must be followed by a digit 1..9. */
        port_digit = le16_to_cpu(name[3]);
        if (port_digit < '1' || port_digit > '9')
                return false;

        return !ntfs_cmp_names(name, 3, COM_NAME, 3, upcase, false) ||
               !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase, false);
}

/*
 * valid_windows_name - Check if a file name is valid in Windows.
 */
bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *fname)
{
        return !name_has_forbidden_chars(fname) &&
               !is_reserved_name(sbi, fname);
}

/*
 * ntfs_set_label - updates current ntfs label.
 */
int ntfs_set_label(struct ntfs_sb_info *sbi, u8 *label, int len)
{
        /* TODO: use attrdef to get maximum length */
        const u8 max_ulen = 0x80;
        struct ntfs_inode *ni = sbi->volume.ni;
        struct ATTRIB *attr;
        struct cpu_str *uni;
        int err;

        /* Allocate PATH_MAX bytes. */
        uni = __getname();
        if (!uni)
                return -ENOMEM;

        /* Convert the caller's label into UTF-16 inside 'uni'. */
        err = ntfs_nls_to_utf16(sbi, label, len, uni, (PATH_MAX - 2) / 2,
                                UTF16_LITTLE_ENDIAN);
        if (err < 0)
                goto free_name;

        if (uni->len > max_ulen) {
                ntfs_warn(sbi->sb, "new label is too long");
                err = -EFBIG;
                goto free_name;
        }

        ni_lock(ni);

        /* Drop the current label attribute; any error is ignored. */
        ni_remove_attr(ni, ATTR_LABEL, NULL, 0, false, NULL);

        err = ni_insert_resident(ni, uni->len * sizeof(u16), ATTR_LABEL, NULL,
                                 0, &attr, NULL, NULL);
        if (err >= 0) {
                /* write new label in on-disk struct. */
                memcpy(resident_data(attr), uni->name,
                       uni->len * sizeof(u16));

                /* update cached value of current label (truncated to fit). */
                if (len >= ARRAY_SIZE(sbi->volume.label))
                        len = ARRAY_SIZE(sbi->volume.label) - 1;
                memcpy(sbi->volume.label, label, len);
                sbi->volume.label[len] = 0;
                mark_inode_dirty_sync(&ni->vfs_inode);
        }

        ni_unlock(ni);

        /* Write the inode only once the lock is dropped and all went well. */
        if (!err)
                err = _ni_write_inode(&ni->vfs_inode, 0);

free_name:
        __putname(uni);
        return err;
}
















































    6 














    6 


    6 
















    1 
    1 

    1 




















    1 
















    6 


    7 









    7 





























    6 








    6 






















    1 


































    5 










    2 








    4 



    4 




    4 







    4 









    1 










    1 










































































































    4 





































    2 





    2 




    1 













    1 












    1 


    1 


    1 







    1 

















    2 


    2 












    1 





    1 


































































































    1 

























































    1 













    1 








    1 






























































































    1 



    7 







    7 

    7 









    5 









    5 

    5 






    4 











    1 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/stat.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/highuid.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/compat.h>
#include <linux/iversion.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>

#include "internal.h"
#include "mount.h"

/**
 * generic_fillattr - Fill in the basic attributes from the inode struct
 * @idmap:                idmap of the mount the inode was found from
 * @request_mask:        statx request_mask
 * @inode:                Inode to use as the source
 * @stat:                Where to fill in the attributes
 *
 * Fill in the basic attributes in the kstat structure from data that's to be
 * found on the VFS inode structure.  This is the default if no getattr inode
 * operation is supplied.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before filling in the
 * uid and gid fields. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 */
void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
                      struct inode *inode, struct kstat *stat)
{
        /* Ownership goes through @idmap before being reported. */
        vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);
        vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);

        stat->uid = vfsuid_into_kuid(vfsuid);
        stat->gid = vfsgid_into_kgid(vfsgid);

        /* Identity and type. */
        stat->dev = inode->i_sb->s_dev;
        stat->rdev = inode->i_rdev;
        stat->ino = inode->i_ino;
        stat->mode = inode->i_mode;
        stat->nlink = inode->i_nlink;

        /* Size information. */
        stat->size = i_size_read(inode);
        stat->blksize = i_blocksize(inode);
        stat->blocks = inode->i_blocks;

        /* Timestamps. */
        stat->atime = inode_get_atime(inode);
        stat->mtime = inode_get_mtime(inode);
        stat->ctime = inode_get_ctime(inode);

        /* The change cookie is reported only when asked for and available. */
        if (!(request_mask & STATX_CHANGE_COOKIE) || !IS_I_VERSION(inode))
                return;

        stat->result_mask |= STATX_CHANGE_COOKIE;
        stat->change_cookie = inode_query_iversion(inode);
}
EXPORT_SYMBOL(generic_fillattr);

/**
 * generic_fill_statx_attr - Fill in the statx attributes from the inode flags
 * @inode:        Inode to use as the source
 * @stat:        Where to fill in the attribute flags
 *
 * Fill in the STATX_ATTR_* flags in the kstat structure for properties of the
 * inode that are published on i_flags and enforced by the VFS.
 */
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat)
{
        if (inode->i_flags & S_IMMUTABLE)
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        if (inode->i_flags & S_APPEND)
                stat->attributes |= STATX_ATTR_APPEND;
        stat->attributes_mask |= KSTAT_ATTR_VFS_FLAGS;
}
EXPORT_SYMBOL(generic_fill_statx_attr);

/**
 * vfs_getattr_nosec - getattr without security checks
 * @path: file to get attributes from
 * @stat: structure to return attributes in
 * @request_mask: STATX_xxx flags indicating what the caller wants
 * @query_flags: Query mode (AT_STATX_SYNC_TYPE)
 *
 * Get attributes without calling security_inode_getattr.
 *
 * Currently the only caller other than vfs_getattr is internal to the
 * filehandle lookup code, which uses only the inode number and returns no
 * attributes to any user.  Any other code probably wants vfs_getattr.
 */
int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
                      u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_backing_inode(path->dentry);
        struct mnt_idmap *idmap;

        /* Start from a clean kstat that claims the basic stats. */
        memset(stat, 0, sizeof(*stat));
        stat->result_mask = STATX_BASIC_STATS;

        /* Only the sync-type bits of the query are kept. */
        query_flags &= AT_STATX_SYNC_TYPE;

        /* allow the fs to override these if it really wants to */
        /* SB_NOATIME means filesystem supplies dummy atime value */
        if (inode->i_sb->s_flags & SB_NOATIME)
                stat->result_mask &= ~STATX_ATIME;

        /*
         * Note: If you add another clause to set an attribute flag, please
         * update attributes_mask below.
         */
        if (IS_AUTOMOUNT(inode))
                stat->attributes |= STATX_ATTR_AUTOMOUNT;
        if (IS_DAX(inode))
                stat->attributes |= STATX_ATTR_DAX;

        stat->attributes_mask |= STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX;

        idmap = mnt_idmap(path->mnt);

        /* Without a getattr inode operation, use the generic fill. */
        if (!inode->i_op->getattr) {
                generic_fillattr(idmap, request_mask, inode, stat);
                return 0;
        }

        return inode->i_op->getattr(idmap, path, stat, request_mask,
                                    query_flags | AT_GETATTR_NOSEC);
}
EXPORT_SYMBOL(vfs_getattr_nosec);

/*
 * vfs_getattr - Get the enhanced basic attributes of a file
 * @path: The file of interest
 * @stat: Where to return the statistics
 * @request_mask: STATX_xxx flags indicating what the caller wants
 * @query_flags: Query mode (AT_STATX_SYNC_TYPE)
 *
 * Ask the filesystem for a file's attributes.  The caller must indicate in
 * request_mask and query_flags what they want.
 *
 * If the file is remote, the filesystem can be forced to update the attributes
 * from the backing store by passing AT_STATX_FORCE_SYNC in query_flags or can
 * suppress the update by passing AT_STATX_DONT_SYNC.
 *
 * Bits must have been set in request_mask to indicate which attributes the
 * caller wants retrieving.  Any such attribute not requested may be returned
 * anyway, but the value may be approximate, and, if remote, may not have been
 * synchronised with the server.
 *
 * 0 will be returned on success, and a -ve error code if unsuccessful.
 */
int vfs_getattr(const struct path *path, struct kstat *stat,
                u32 request_mask, unsigned int query_flags)
{
        int retval;

        /* AT_GETATTR_NOSEC must not be passed in by callers of this function. */
        if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC))
                return -EPERM;

        /* Security hook first; attributes are fetched only if it allows. */
        retval = security_inode_getattr(path);

        return retval ? retval :
               vfs_getattr_nosec(path, stat, request_mask, query_flags);
}
EXPORT_SYMBOL(vfs_getattr);

/**
 * vfs_fstat - Get the basic attributes by file descriptor
 * @fd: The file descriptor referring to the file of interest
 * @stat: The result structure to fill in.
 *
 * This function is a wrapper around vfs_getattr().  The main difference is
 * that it uses a file descriptor to determine the file location.
 *
 * 0 will be returned on success, and a -ve error code if unsuccessful.
 */
int vfs_fstat(int fd, struct kstat *stat)
{
        struct fd f;
        int error;

        f = fdget_raw(fd);
        if (!f.file)
                return -EBADF;
        error = vfs_getattr(&f.file->f_path, stat, STATX_BASIC_STATS, 0);
        fdput(f);
        return error;
}

/*
 * getname_statx_lookup_flags - Translate AT_* flags into LOOKUP_* flags.
 *
 * LOOKUP_FOLLOW and LOOKUP_AUTOMOUNT are set unless AT_SYMLINK_NOFOLLOW
 * and AT_NO_AUTOMOUNT, respectively, are given; LOOKUP_EMPTY is set when
 * AT_EMPTY_PATH is given.
 */
int getname_statx_lookup_flags(int flags)
{
        int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;

        if ((flags & AT_SYMLINK_NOFOLLOW) == 0)
                lookup_flags |= LOOKUP_FOLLOW;
        if ((flags & AT_NO_AUTOMOUNT) == 0)
                lookup_flags |= LOOKUP_AUTOMOUNT;

        return lookup_flags;
}

/**
 * vfs_statx - Get basic and extra attributes by filename
 * @dfd: A file descriptor representing the base dir for a relative filename
 * @filename: The name of the file of interest
 * @flags: Flags to control the query
 * @stat: The result structure to fill in.
 * @request_mask: STATX_xxx flags indicating what the caller wants
 *
 * This function is a wrapper around vfs_getattr().  The main difference is
 * that it uses a filename and base directory to determine the file location.
 * Additionally, the use of AT_SYMLINK_NOFOLLOW in flags will prevent a symlink
 * at the given name from being referenced.
 *
 * 0 will be returned on success, and a -ve error code if unsuccessful.
 */
static int vfs_statx(int dfd, struct filename *filename, int flags,
              struct kstat *stat, u32 request_mask)
{
        struct path path;
        unsigned int lookup_flags = getname_statx_lookup_flags(flags);
        int error;

        /* Reject any flag this function does not understand. */
        if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH |
                      AT_STATX_SYNC_TYPE))
                return -EINVAL;

retry:
        error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
        if (error)
                goto out;

        error = vfs_getattr(&path, stat, request_mask, flags);
        /*
         * On failure *stat holds nothing meaningful (it may not even have
         * been zeroed if the security hook refused the request), so do not
         * decorate it any further; just drop the path and let the ESTALE
         * retry logic below decide what to do.
         */
        if (error)
                goto put;

        /* Report either the unique mount ID or the classic one. */
        if (request_mask & STATX_MNT_ID_UNIQUE) {
                stat->mnt_id = real_mount(path.mnt)->mnt_id_unique;
                stat->result_mask |= STATX_MNT_ID_UNIQUE;
        } else {
                stat->mnt_id = real_mount(path.mnt)->mnt_id;
                stat->result_mask |= STATX_MNT_ID;
        }

        /* Flag the object if it is the root of its mount. */
        if (path.mnt->mnt_root == path.dentry)
                stat->attributes |= STATX_ATTR_MOUNT_ROOT;
        stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;

        /* Handle STATX_DIOALIGN for block devices. */
        if (request_mask & STATX_DIOALIGN) {
                struct inode *inode = d_backing_inode(path.dentry);

                if (S_ISBLK(inode->i_mode))
                        bdev_statx_dioalign(inode, stat);
        }

put:
        path_put(&path);
        /* A stale lookup is retried once more with LOOKUP_REVAL set. */
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out:
        return error;
}

/*
 * vfs_fstatat - Get the basic attributes of @filename relative to @dfd.
 *
 * AT_NO_AUTOMOUNT is always added to @flags before the lookup. Returns 0 on
 * success or a negative error code.
 */
int vfs_fstatat(int dfd, const char __user *filename,
                              struct kstat *stat, int flags)
{
        const int statx_flags = flags | AT_NO_AUTOMOUNT;
        struct filename *name;
        int ret;

        /*
         * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH)
         *
         * If AT_EMPTY_PATH is set, we expect the common case to be that
         * empty path, and avoid doing all the extra pathname work.
         */
        if (dfd >= 0 && flags == AT_EMPTY_PATH) {
                char first;

                /* Peek at the first byte of the user-supplied path only. */
                ret = get_user(first, filename);
                if (unlikely(ret))
                        return ret;

                if (likely(first == 0))
                        return vfs_fstat(dfd, stat);
        }

        /* General case: resolve the name and stat through vfs_statx(). */
        name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL);
        ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS);
        putname(name);

        return ret;
}

#ifdef __ARCH_WANT_OLD_STAT

/*
 * For backward compatibility?  Maybe this should be moved
 * into arch/i386 instead?
 */
static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf)
{
        static int warncount = 5;
        struct __old_kernel_stat tmp;

        /* Warn a limited number of times about use of the old call. */
        if (warncount > 0) {
                warncount--;
                printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n",
                        current->comm);
        } else if (warncount < 0) {
                /* it's laughable, but... */
                warncount = 0;
        }

        memset(&tmp, 0, sizeof(tmp));

        tmp.st_dev = old_encode_dev(stat->dev);

        /* Inode number must survive the narrowing store. */
        tmp.st_ino = stat->ino;
        if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
                return -EOVERFLOW;

        tmp.st_mode = stat->mode;

        /* Link count must survive the narrowing store as well. */
        tmp.st_nlink = stat->nlink;
        if (tmp.st_nlink != stat->nlink)
                return -EOVERFLOW;

        SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
        SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
        tmp.st_rdev = old_encode_dev(stat->rdev);
#if BITS_PER_LONG == 32
        if (stat->size > MAX_NON_LFS)
                return -EOVERFLOW;
#endif
        tmp.st_size = stat->size;

        /* Only whole seconds are reported in this layout. */
        tmp.st_atime = stat->atime.tv_sec;
        tmp.st_mtime = stat->mtime.tv_sec;
        tmp.st_ctime = stat->ctime.tv_sec;

        if (copy_to_user(statbuf, &tmp, sizeof(tmp)))
                return -EFAULT;

        return 0;
}

/* stat: vfs_stat() on @filename, result copied out in the old layout. */
SYSCALL_DEFINE2(stat, const char __user *, filename,
                struct __old_kernel_stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_stat(filename, &stat);

        if (!error)
                error = cp_old_stat(&stat, statbuf);

        return error;
}

/* lstat: vfs_lstat() on @filename, result copied out in the old layout. */
SYSCALL_DEFINE2(lstat, const char __user *, filename,
                struct __old_kernel_stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_lstat(filename, &stat);

        if (!error)
                error = cp_old_stat(&stat, statbuf);

        return error;
}

/* fstat: vfs_fstat() on @fd, result copied out in the old layout. */
SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_fstat(fd, &stat);
        if (error)
                return error;

        return cp_old_stat(&stat, statbuf);
}

#endif /* __ARCH_WANT_OLD_STAT */

#ifdef __ARCH_WANT_NEW_STAT

/*
 * Fallback used when nothing earlier defined INIT_STRUCT_STAT_PADDING
 * (presumably an architecture header may do so): zero the whole structure,
 * padding included, before its fields are filled in.
 */
#ifndef INIT_STRUCT_STAT_PADDING
#  define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
#endif

/*
 * cp_new_stat - Convert @stat to a struct stat and copy it to @statbuf.
 *
 * Return: 0 on success, -EOVERFLOW if a value does not fit the user-visible
 * field, -EFAULT if the copy to user space fails.
 */
static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
{
        struct stat tmp;

        /* Device numbers must be encodable when the fields are narrow. */
        if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
                return -EOVERFLOW;
        if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
                return -EOVERFLOW;
#if BITS_PER_LONG == 32
        if (stat->size > MAX_NON_LFS)
                return -EOVERFLOW;
#endif

        INIT_STRUCT_STAT_PADDING(tmp);

        tmp.st_dev = new_encode_dev(stat->dev);
        tmp.st_rdev = new_encode_dev(stat->rdev);

        /* Inode number must survive the narrowing store. */
        tmp.st_ino = stat->ino;
        if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
                return -EOVERFLOW;

        tmp.st_mode = stat->mode;

        /* Link count must survive the narrowing store as well. */
        tmp.st_nlink = stat->nlink;
        if (tmp.st_nlink != stat->nlink)
                return -EOVERFLOW;

        SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
        SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));

        tmp.st_size = stat->size;
        tmp.st_blocks = stat->blocks;
        tmp.st_blksize = stat->blksize;

        tmp.st_atime = stat->atime.tv_sec;
        tmp.st_mtime = stat->mtime.tv_sec;
        tmp.st_ctime = stat->ctime.tv_sec;
#ifdef STAT_HAVE_NSEC
        tmp.st_atime_nsec = stat->atime.tv_nsec;
        tmp.st_mtime_nsec = stat->mtime.tv_nsec;
        tmp.st_ctime_nsec = stat->ctime.tv_nsec;
#endif

        if (copy_to_user(statbuf, &tmp, sizeof(tmp)))
                return -EFAULT;

        return 0;
}

/* newstat: vfs_stat() on @filename, result copied out as struct stat. */
SYSCALL_DEFINE2(newstat, const char __user *, filename,
                struct stat __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_stat(filename, &stat);
        if (!error)
                error = cp_new_stat(&stat, statbuf);

        return error;
}

/* newlstat: vfs_lstat() on @filename, result copied out as struct stat. */
SYSCALL_DEFINE2(newlstat, const char __user *, filename,
                struct stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_lstat(filename, &stat);

        if (!error)
                error = cp_new_stat(&stat, statbuf);

        return error;
}

#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
/* newfstatat: vfs_fstatat() relative to @dfd, copied out as struct stat. */
SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
                struct stat __user *, statbuf, int, flag)
{
        struct kstat stat;
        int error = vfs_fstatat(dfd, filename, &stat, flag);

        if (!error)
                error = cp_new_stat(&stat, statbuf);

        return error;
}
#endif

/* newfstat: vfs_fstat() on @fd, result copied out as struct stat. */
SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_fstat(fd, &stat);
        if (error)
                return error;

        return cp_new_stat(&stat, statbuf);
}
#endif

/*
 * do_readlinkat - Common implementation of readlink(2) and readlinkat(2).
 *
 * Return: the result of vfs_readlink() on success; -EINVAL if @bufsiz is not
 * positive or the object cannot be read as a link (-ENOENT instead when the
 * supplied path was empty); otherwise the error from the lookup or from the
 * security hook.
 */
static int do_readlinkat(int dfd, const char __user *pathname,
                         char __user *buf, int bufsiz)
{
        unsigned int lookup_flags = LOOKUP_EMPTY;
        struct inode *inode;
        struct path path;
        int empty = 0;
        int error;

        if (bufsiz <= 0)
                return -EINVAL;

retry:
        error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);
        if (error)
                return error;

        inode = d_backing_inode(path.dentry);

        /*
         * AFS mountpoints allow readlink(2) but are not symlinks
         */
        if (d_is_symlink(path.dentry) || inode->i_op->readlink) {
                error = security_inode_readlink(path.dentry);
                if (!error) {
                        touch_atime(&path);
                        error = vfs_readlink(path.dentry, buf, bufsiz);
                }
        } else {
                /* Not readable as a link. */
                error = empty ? -ENOENT : -EINVAL;
        }

        path_put(&path);

        /* A stale lookup is retried with LOOKUP_REVAL set. */
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }

        return error;
}

/* readlinkat: forwards all four arguments unchanged to do_readlinkat(). */
SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
                char __user *, buf, int, bufsiz)
{
        return do_readlinkat(dfd, pathname, buf, bufsiz);
}

/* readlink: do_readlinkat() with AT_FDCWD as the directory descriptor. */
SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf,
                int, bufsiz)
{
        return do_readlinkat(AT_FDCWD, path, buf, bufsiz);
}


/* ---------- LFS-64 ----------- */
#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)

/*
 * Fallback used by cp_new_stat64() when INIT_STRUCT_STAT64_PADDING has not
 * already been defined: zero the whole structure with memset().
 */
#ifndef INIT_STRUCT_STAT64_PADDING
#  define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st))
#endif

/*
 * Convert a kernel kstat into a struct stat64 and copy it to @statbuf.
 *
 * Returns 0 on success, -EOVERFLOW when the inode number does not fit in
 * st_ino, and -EFAULT when copy_to_user() fails.
 */
static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
{
        struct stat64 tmp;

        /* Must run before any field is filled in. */
        INIT_STRUCT_STAT64_PADDING(tmp);

        /* Inode number, with a truncation check for narrow st_ino. */
        tmp.st_ino = stat->ino;
        if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
                return -EOVERFLOW;
#ifdef STAT64_HAS_BROKEN_ST_INO
        tmp.__st_ino = stat->ino;
#endif

        /* Device numbers. */
#ifdef CONFIG_MIPS
        /* mips has weird padding, so we don't get 64 bits there */
        tmp.st_dev = new_encode_dev(stat->dev);
        tmp.st_rdev = new_encode_dev(stat->rdev);
#else
        tmp.st_dev = huge_encode_dev(stat->dev);
        tmp.st_rdev = huge_encode_dev(stat->rdev);
#endif

        /* Mode, link count and ownership as seen from the caller's userns. */
        tmp.st_mode = stat->mode;
        tmp.st_nlink = stat->nlink;
        tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid);
        tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid);

        /* Sizes. */
        tmp.st_size = stat->size;
        tmp.st_blocks = stat->blocks;
        tmp.st_blksize = stat->blksize;

        /* Timestamps. */
        tmp.st_atime = stat->atime.tv_sec;
        tmp.st_atime_nsec = stat->atime.tv_nsec;
        tmp.st_mtime = stat->mtime.tv_sec;
        tmp.st_mtime_nsec = stat->mtime.tv_nsec;
        tmp.st_ctime = stat->ctime.tv_sec;
        tmp.st_ctime_nsec = stat->ctime.tv_nsec;

        if (copy_to_user(statbuf, &tmp, sizeof(tmp)))
                return -EFAULT;
        return 0;
}

/* stat64: vfs_stat() on @filename, then cp_new_stat64() into @statbuf. */
SYSCALL_DEFINE2(stat64, const char __user *, filename,
                struct stat64 __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_stat(filename, &stat);
        if (error)
                return error;

        error = cp_new_stat64(&stat, statbuf);
        return error;
}

/* lstat64: vfs_lstat() on @filename, then cp_new_stat64() into @statbuf. */
SYSCALL_DEFINE2(lstat64, const char __user *, filename,
                struct stat64 __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_lstat(filename, &stat);
        if (error)
                return error;

        error = cp_new_stat64(&stat, statbuf);
        return error;
}

/* fstat64: vfs_fstat() on @fd, then cp_new_stat64() into @statbuf. */
SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_fstat(fd, &stat);
        if (error)
                return error;

        error = cp_new_stat64(&stat, statbuf);
        return error;
}

/*
 * fstatat64: vfs_fstatat() on @filename relative to @dfd with @flag, then
 * cp_new_stat64() into @statbuf.
 */
SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
                struct stat64 __user *, statbuf, int, flag)
{
        struct kstat stat;
        int error = vfs_fstatat(dfd, filename, &stat, flag);

        if (error)
                return error;

        return cp_new_stat64(&stat, statbuf);
}
#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */

/*
 * Translate a kernel kstat into a struct statx and copy it to @buffer.
 *
 * The temporary is zeroed with memset() before the fields are filled in,
 * so everything not assigned below reaches userspace as zero.
 * Returns 0 on success or -EFAULT when copy_to_user() fails.
 */
static noinline_for_stack int
cp_statx(const struct kstat *stat, struct statx __user *buffer)
{
        struct statx tmp;

        memset(&tmp, 0, sizeof(tmp));

        /* Masks and attributes; the kernel-only bits are stripped. */
        /* STATX_CHANGE_COOKIE is kernel-only for now */
        tmp.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE;
        /* STATX_ATTR_CHANGE_MONOTONIC is kernel-only for now */
        tmp.stx_attributes = stat->attributes & ~STATX_ATTR_CHANGE_MONOTONIC;
        tmp.stx_attributes_mask = stat->attributes_mask;

        /* Identity and ownership. */
        tmp.stx_ino = stat->ino;
        tmp.stx_mode = stat->mode;
        tmp.stx_nlink = stat->nlink;
        tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid);
        tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid);

        /* Sizes. */
        tmp.stx_size = stat->size;
        tmp.stx_blocks = stat->blocks;
        tmp.stx_blksize = stat->blksize;

        /* Timestamps. */
        tmp.stx_atime.tv_sec = stat->atime.tv_sec;
        tmp.stx_atime.tv_nsec = stat->atime.tv_nsec;
        tmp.stx_btime.tv_sec = stat->btime.tv_sec;
        tmp.stx_btime.tv_nsec = stat->btime.tv_nsec;
        tmp.stx_ctime.tv_sec = stat->ctime.tv_sec;
        tmp.stx_ctime.tv_nsec = stat->ctime.tv_nsec;
        tmp.stx_mtime.tv_sec = stat->mtime.tv_sec;
        tmp.stx_mtime.tv_nsec = stat->mtime.tv_nsec;

        /* Device numbers, split into major/minor. */
        tmp.stx_dev_major = MAJOR(stat->dev);
        tmp.stx_dev_minor = MINOR(stat->dev);
        tmp.stx_rdev_major = MAJOR(stat->rdev);
        tmp.stx_rdev_minor = MINOR(stat->rdev);

        /* Mount id, direct-I/O alignment and subvolume. */
        tmp.stx_mnt_id = stat->mnt_id;
        tmp.stx_dio_mem_align = stat->dio_mem_align;
        tmp.stx_dio_offset_align = stat->dio_offset_align;
        tmp.stx_subvol = stat->subvol;

        if (copy_to_user(buffer, &tmp, sizeof(tmp)))
                return -EFAULT;
        return 0;
}

/*
 * Validate a statx request, run vfs_statx() and copy the result out.
 *
 * Returns -EINVAL when @mask has any STATX__RESERVED bit set or when every
 * bit of AT_STATX_SYNC_TYPE is set in @flags; otherwise the error from
 * vfs_statx() or the result of cp_statx().
 */
int do_statx(int dfd, struct filename *filename, unsigned int flags,
             unsigned int mask, struct statx __user *buffer)
{
        struct kstat stat;
        int error;

        if ((mask & STATX__RESERVED) ||
            (flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
                return -EINVAL;

        /* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests
         * from userland.
         */
        error = vfs_statx(dfd, filename, flags, &stat,
                          mask & ~STATX_CHANGE_COOKIE);
        if (!error)
                error = cp_statx(&stat, buffer);

        return error;
}

/**
 * sys_statx - System call to get enhanced stats
 * @dfd: Base directory to pathwalk from *or* fd to stat.
 * @filename: File to stat or "" with AT_EMPTY_PATH
 * @flags: AT_* flags to control pathwalk.
 * @mask: Parts of statx struct actually required.
 * @buffer: Result buffer.
 *
 * Note that fstat() can be emulated by setting dfd to the fd of interest,
 * supplying "" as the filename and setting AT_EMPTY_PATH in the flags.
 *
 * Return: whatever do_statx() returns for the looked-up name.
 */
SYSCALL_DEFINE5(statx,
                int, dfd, const char __user *, filename, unsigned, flags,
                unsigned int, mask,
                struct statx __user *, buffer)
{
        int ret;
        struct filename *name;

        /* The result of getname_flags() is passed to do_statx() unchecked. */
        name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL);
        ret = do_statx(dfd, name, flags, mask, buffer);
        /* Release the name on every path, whatever do_statx() returned. */
        putname(name);

        return ret;
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT)
/*
 * Convert a kernel kstat into a struct compat_stat and copy it to @ubuf.
 *
 * Returns -EOVERFLOW when a device number, the inode number, the link count
 * or the file size cannot be represented in the compat layout, -EFAULT when
 * copy_to_user() fails, and 0 on success.
 */
static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
{
        struct compat_stat tmp;

        /* Range checks that do not depend on tmp's contents. */
        if ((sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev)) ||
            (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev)))
                return -EOVERFLOW;
        if ((u64) stat->size > MAX_NON_LFS)
                return -EOVERFLOW;

        memset(&tmp, 0, sizeof(tmp));

        /* Fields that are verified by reading back the stored value. */
        tmp.st_ino = stat->ino;
        if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
                return -EOVERFLOW;
        tmp.st_nlink = stat->nlink;
        if (tmp.st_nlink != stat->nlink)
                return -EOVERFLOW;

        tmp.st_dev = new_encode_dev(stat->dev);
        tmp.st_rdev = new_encode_dev(stat->rdev);
        tmp.st_mode = stat->mode;
        SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
        SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));

        tmp.st_size = stat->size;
        tmp.st_blocks = stat->blocks;
        tmp.st_blksize = stat->blksize;

        tmp.st_atime = stat->atime.tv_sec;
        tmp.st_atime_nsec = stat->atime.tv_nsec;
        tmp.st_mtime = stat->mtime.tv_sec;
        tmp.st_mtime_nsec = stat->mtime.tv_nsec;
        tmp.st_ctime = stat->ctime.tv_sec;
        tmp.st_ctime_nsec = stat->ctime.tv_nsec;

        if (copy_to_user(ubuf, &tmp, sizeof(tmp)))
                return -EFAULT;
        return 0;
}

/* Compat newstat: vfs_stat() on @filename, then cp_compat_stat(). */
COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
                       struct compat_stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_stat(filename, &stat);

        if (!error)
                error = cp_compat_stat(&stat, statbuf);

        return error;
}

/* Compat newlstat: vfs_lstat() on @filename, then cp_compat_stat(). */
COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
                       struct compat_stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_lstat(filename, &stat);

        if (!error)
                error = cp_compat_stat(&stat, statbuf);

        return error;
}

#ifndef __ARCH_WANT_STAT64
/*
 * Compat newfstatat: vfs_fstatat() on @filename relative to @dfd with
 * @flag, then cp_compat_stat().
 */
COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
                       const char __user *, filename,
                       struct compat_stat __user *, statbuf, int, flag)
{
        struct kstat stat;
        int error = vfs_fstatat(dfd, filename, &stat, flag);

        if (!error)
                error = cp_compat_stat(&stat, statbuf);

        return error;
}
#endif

/* Compat newfstat: vfs_fstat() on @fd, then cp_compat_stat(). */
COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
                       struct compat_stat __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_fstat(fd, &stat);
        if (error)
                return error;

        return cp_compat_stat(&stat, statbuf);
}
#endif

/*
 * Add @bytes to the inode's usage, kept as whole 512-byte units in
 * ->i_blocks plus a remainder in ->i_bytes.
 *
 * Caller is here responsible for sufficient locking (ie. inode->i_lock)
 */
void __inode_add_bytes(struct inode *inode, loff_t bytes)
{
        loff_t tail = bytes & 511;

        inode->i_blocks += bytes >> 9;
        inode->i_bytes += tail;
        if (inode->i_bytes < 512)
                return;

        /* The remainder reached a full unit: carry it into i_blocks. */
        inode->i_bytes -= 512;
        inode->i_blocks++;
}
EXPORT_SYMBOL(__inode_add_bytes);

/* Locked wrapper: runs __inode_add_bytes() with inode->i_lock held. */
void inode_add_bytes(struct inode *inode, loff_t bytes)
{
        spin_lock(&inode->i_lock);
        __inode_add_bytes(inode, bytes);
        spin_unlock(&inode->i_lock);
}

EXPORT_SYMBOL(inode_add_bytes);

/*
 * Subtract @bytes from the inode's usage (->i_blocks whole 512-byte units,
 * ->i_bytes remainder).  No locking is taken here; inode_sub_bytes() calls
 * this with inode->i_lock held.
 */
void __inode_sub_bytes(struct inode *inode, loff_t bytes)
{
        loff_t tail = bytes & 511;

        inode->i_blocks -= bytes >> 9;
        if (inode->i_bytes < tail) {
                /* Not enough remainder: borrow one unit from i_blocks. */
                inode->i_bytes += 512;
                inode->i_blocks--;
        }
        inode->i_bytes -= tail;
}

EXPORT_SYMBOL(__inode_sub_bytes);

/* Locked wrapper: runs __inode_sub_bytes() with inode->i_lock held. */
void inode_sub_bytes(struct inode *inode, loff_t bytes)
{
        spin_lock(&inode->i_lock);
        __inode_sub_bytes(inode, bytes);
        spin_unlock(&inode->i_lock);
}

EXPORT_SYMBOL(inode_sub_bytes);

/* Returns __inode_get_bytes(inode), sampled with inode->i_lock held. */
loff_t inode_get_bytes(struct inode *inode)
{
        loff_t ret;

        spin_lock(&inode->i_lock);
        ret = __inode_get_bytes(inode);
        spin_unlock(&inode->i_lock);
        return ret;
}

EXPORT_SYMBOL(inode_get_bytes);

/*
 * Overwrite the inode's usage with @bytes: ->i_blocks receives bytes / 512
 * and ->i_bytes the low nine bits.  No lock is taken here.
 */
void inode_set_bytes(struct inode *inode, loff_t bytes)
{
        /* Caller is here responsible for sufficient locking
         * (ie. inode->i_lock) */
        inode->i_blocks = bytes >> 9;
        inode->i_bytes = bytes & 511;
}

EXPORT_SYMBOL(inode_set_bytes);






















































    1 





    1 




























































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/proc/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/cache.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/kernel.h>
#include <linux/pid_namespace.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/completion.h>
#include <linux/poll.h>
#include <linux/printk.h>
#include <linux/file.h>
#include <linux/limits.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/bug.h>

#include "internal.h"

/*
 * ->evict_inode for procfs: drop the page cache, clear the inode, then
 * detach it from the pid and sysctl objects it was linked to, if any.
 */
static void proc_evict_inode(struct inode *inode)
{
        struct proc_inode *ei = PROC_I(inode);
        struct ctl_table_header *head;

        truncate_inode_pages_final(&inode->i_data);
        clear_inode(inode);

        /* Stop tracking associated processes */
        if (ei->pid)
                proc_pid_evict_inode(ei);

        head = ei->sysctl;
        if (!head)
                return;

        /* Clear the pointer first, then hand the old header to sysctl code. */
        RCU_INIT_POINTER(ei->sysctl, NULL);
        proc_sys_evict_inode(inode, head);
}

static struct kmem_cache *proc_inode_cachep __ro_after_init;
static struct kmem_cache *pde_opener_cache __ro_after_init;

/*
 * ->alloc_inode for procfs: take a proc_inode from proc_inode_cachep and
 * reset its proc-specific fields.  Returns the embedded VFS inode, or NULL
 * when the allocation fails.
 */
static struct inode *proc_alloc_inode(struct super_block *sb)
{
        struct proc_inode *ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL);

        if (!ei)
                return NULL;

        /* Nothing is attached to a freshly allocated inode. */
        ei->pde = NULL;
        ei->pid = NULL;
        ei->fd = 0;
        ei->op.proc_get_link = NULL;
        ei->ns_ops = NULL;
        ei->sysctl = NULL;
        ei->sysctl_entry = NULL;
        INIT_HLIST_NODE(&ei->sibling_inodes);

        return &ei->vfs_inode;
}

/*
 * ->free_inode for procfs: drop the pid and proc_dir_entry references held
 * by the inode, if any, and return the proc_inode to its slab cache.
 */
static void proc_free_inode(struct inode *inode)
{
        struct proc_inode *ei = PROC_I(inode);
        struct proc_inode *victim = PROC_I(inode);

        if (ei->pid)
                put_pid(ei->pid);

        /* Let go of any associated proc directory entry */
        if (ei->pde)
                pde_put(ei->pde);

        kmem_cache_free(proc_inode_cachep, victim);
}

/*
 * Constructor passed to kmem_cache_create() for proc_inode_cache: performs
 * inode_init_once() on the VFS inode embedded in the object.
 */
static void init_once(void *foo)
{
        struct proc_inode *ei = foo;

        inode_init_once(&ei->vfs_inode);
}

/*
 * Create the slab caches used by procfs: proc_inode_cache, pde_opener and
 * proc_dir_entry.  All three are created with SLAB_PANIC.
 */
void __init proc_init_kmemcache(void)
{
        /* SIZEOF_PDE must be strictly larger than the bare structure. */
        BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);

        proc_inode_cachep =
                kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode),
                                  0,
                                  (SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT|SLAB_PANIC),
                                  init_once);

        pde_opener_cache =
                kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0,
                                  SLAB_ACCOUNT|SLAB_PANIC, NULL);

        /* Only the inline_name region is whitelisted for usercopy. */
        proc_dir_entry_cache =
                kmem_cache_create_usercopy("proc_dir_entry", SIZEOF_PDE, 0,
                                           SLAB_PANIC,
                                           offsetof(struct proc_dir_entry,
                                                    inline_name),
                                           SIZEOF_PDE_INLINE_NAME, NULL);
}

/*
 * Empty the hlist @inodes of proc_inode entries and invalidate the dentries
 * that alias each inode.
 *
 * @inodes: list head linking proc_inode::sibling_inodes.
 * @lock:   spinlock held around each removal from @inodes.
 *
 * The list is walked under rcu_read_lock(); the RCU read section is left
 * before the dcache calls and re-entered before the next iteration.  One
 * ->s_active reference is held on the most recently seen superblock
 * (old_sb) and dropped with deactivate_super() when a different superblock
 * shows up or when the walk ends.
 */
void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
{
        struct hlist_node *node;
        struct super_block *old_sb = NULL;

        rcu_read_lock();
        while ((node = hlist_first_rcu(inodes))) {
                struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes);
                struct super_block *sb;
                struct inode *inode;

                /* Unlink first, so the loop always makes progress. */
                spin_lock(lock);
                hlist_del_init_rcu(&ei->sibling_inodes);
                spin_unlock(lock);

                inode = &ei->vfs_inode;
                sb = inode->i_sb;
                /* New superblock whose s_active is already zero: skip it. */
                if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
                        continue;
                inode = igrab(inode);
                rcu_read_unlock();
                /* Swap the held superblock reference if sb changed. */
                if (sb != old_sb) {
                        if (old_sb)
                                deactivate_super(old_sb);
                        old_sb = sb;
                }
                /* igrab() failed: nothing to invalidate for this inode. */
                if (unlikely(!inode)) {
                        rcu_read_lock();
                        continue;
                }

                if (S_ISDIR(inode->i_mode)) {
                        /* Directories: invalidate a single alias. */
                        struct dentry *dir = d_find_any_alias(inode);
                        if (dir) {
                                d_invalidate(dir);
                                dput(dir);
                        }
                } else {
                        /* Non-directories: repeat until d_find_alias() is NULL. */
                        struct dentry *dentry;
                        while ((dentry = d_find_alias(inode))) {
                                d_invalidate(dentry);
                                dput(dentry);
                        }
                }
                iput(inode);

                rcu_read_lock();
        }
        rcu_read_unlock();
        if (old_sb)
                deactivate_super(old_sb);
}

/*
 * Map a hidepid setting to the string printed for the "hidepid=" mount
 * option.  A value outside the enum triggers a one-time warning and yields
 * "unknown".
 */
static inline const char *hidepid2str(enum proc_hidepid v)
{
        switch (v) {
        case HIDEPID_OFF:
                return "off";
        case HIDEPID_NO_ACCESS:
                return "noaccess";
        case HIDEPID_INVISIBLE:
                return "invisible";
        case HIDEPID_NOT_PTRACEABLE:
                return "ptraceable";
        }

        WARN_ONCE(1, "bad hide_pid value: %d\n", v);
        return "unknown";
}

/*
 * ->show_options for procfs: append the non-default mount options of the
 * instance owning @root to @seq.
 *
 *   ",gid=<n>"       when pid_gid is not the global root gid
 *   ",hidepid=<str>" when hide_pid is not HIDEPID_OFF
 *   ",subset=pid"    when pidonly is not PROC_PIDONLY_OFF
 *
 * Always returns 0.
 */
static int proc_show_options(struct seq_file *seq, struct dentry *root)
{
        struct proc_fs_info *fs_info = proc_sb_info(root->d_sb);

        if (!gid_eq(fs_info->pid_gid, GLOBAL_ROOT_GID))
                seq_printf(seq, ",gid=%u",
                           from_kgid_munged(&init_user_ns, fs_info->pid_gid));
        if (fs_info->hide_pid != HIDEPID_OFF)
                seq_printf(seq, ",hidepid=%s", hidepid2str(fs_info->hide_pid));
        /* Constant text: seq_puts() avoids a pointless format-string pass. */
        if (fs_info->pidonly != PROC_PIDONLY_OFF)
                seq_puts(seq, ",subset=pid");

        return 0;
}

/*
 * Superblock operations for procfs.  Inode allocation, freeing and eviction
 * use the proc_* helpers in this file; drop_inode, statfs use generic ones.
 */
const struct super_operations proc_sops = {
        .alloc_inode        = proc_alloc_inode,
        .free_inode        = proc_free_inode,
        .drop_inode        = generic_delete_inode,
        .evict_inode        = proc_evict_inode,
        .statfs                = simple_statfs,
        .show_options        = proc_show_options,
};

/*
 * Value added to proc_dir_entry::in_use by proc_entry_rundown(); unuse_pde()
 * compares the decremented counter against it to detect the last user.
 */
enum {BIAS = -1U<<31};

/*
 * Try to take a use reference on @pde: increments ->in_use unless it is
 * negative.  Callers in this file treat a zero return as "entry is gone".
 */
static inline int use_pde(struct proc_dir_entry *pde)
{
        return likely(atomic_inc_unless_negative(&pde->in_use));
}

/*
 * Drop a use reference on @pde.  When the decremented counter equals BIAS,
 * signal ->pde_unload_completion (set up by proc_entry_rundown()).
 */
static void unuse_pde(struct proc_dir_entry *pde)
{
        if (unlikely(atomic_dec_return(&pde->in_use) == BIAS))
                complete(pde->pde_unload_completion);
}

/*
 * At most 2 contexts can enter this function: the one doing the last
 * close on the descriptor and whoever is deleting PDE itself.
 *
 * First to enter calls ->proc_release hook and signals its completion
 * to the second one which waits and then does nothing.
 *
 * PDE is locked on entry, unlocked on exit.
 *
 * @pde:  entry whose ->pde_unload_lock is held by the caller.
 * @pdeo: opener record to close; freed by the context that runs the hook.
 */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
        __releases(&pde->pde_unload_lock)
{
        /*
         * close() (proc_reg_release()) can't delete an entry and proceed:
         * ->release hook needs to be available at the right moment.
         *
         * rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
         * "struct file" needs to be available at the right moment.
         */
        if (pdeo->closing) {
                /* somebody else is doing that, just wait */
                DECLARE_COMPLETION_ONSTACK(c);
                /* Published under the lock so the closer will see it. */
                pdeo->c = &c;
                spin_unlock(&pde->pde_unload_lock);
                wait_for_completion(&c);
        } else {
                struct file *file;
                struct completion *c;

                /* Claim the close before dropping the lock. */
                pdeo->closing = true;
                spin_unlock(&pde->pde_unload_lock);

                file = pdeo->file;
                pde->proc_ops->proc_release(file_inode(file), file);

                spin_lock(&pde->pde_unload_lock);
                /* Strictly after ->proc_release, see above. */
                list_del(&pdeo->lh);
                /* Pick up a waiter, if one registered meanwhile. */
                c = pdeo->c;
                spin_unlock(&pde->pde_unload_lock);
                if (unlikely(c))
                        complete(c);
                kmem_cache_free(pde_opener_cache, pdeo);
        }
}

/*
 * Run down @de: add BIAS to ->in_use, wait for the users that were already
 * inside if there were any, then close every opener still on ->pde_openers.
 */
void proc_entry_rundown(struct proc_dir_entry *de)
{
        DECLARE_COMPLETION_ONSTACK(c);
        /* Wait until all existing callers into module are done. */
        de->pde_unload_completion = &c;
        /* Result == BIAS means the count was zero: nobody to wait for. */
        if (atomic_add_return(BIAS, &de->in_use) != BIAS)
                wait_for_completion(&c);

        /* ->pde_openers list can't grow from now on. */

        spin_lock(&de->pde_unload_lock);
        while (!list_empty(&de->pde_openers)) {
                struct pde_opener *pdeo;
                pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
                /* close_pdeo() returns with the lock dropped; retake it. */
                close_pdeo(de, pdeo);
                spin_lock(&de->pde_unload_lock);
        }
        spin_unlock(&de->pde_unload_lock);
}

/*
 * ->llseek for regular proc files: call the entry's ->proc_lseek.  For a
 * non-permanent entry the call is bracketed by use_pde()/unuse_pde(), and
 * -EINVAL is returned when use_pde() fails.
 */
static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        loff_t rv;

        if (pde_is_permanent(pde))
                return pde->proc_ops->proc_lseek(file, offset, whence);

        if (!use_pde(pde))
                return -EINVAL;

        rv = pde->proc_ops->proc_lseek(file, offset, whence);
        unuse_pde(pde);
        return rv;
}

/*
 * ->read_iter for regular proc files: call the entry's ->proc_read_iter.
 * For a non-permanent entry the call is bracketed by use_pde()/unuse_pde(),
 * and -EIO is returned when use_pde() fails.
 */
static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct proc_dir_entry *pde = PDE(file_inode(iocb->ki_filp));
        ssize_t ret = -EIO;

        if (pde_is_permanent(pde)) {
                return pde->proc_ops->proc_read_iter(iocb, iter);
        } else if (use_pde(pde)) {
                ret = pde->proc_ops->proc_read_iter(iocb, iter);
                unuse_pde(pde);
        }
        return ret;
}

/* Call the entry's ->proc_read hook; -EIO when the hook is not set. */
static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        typeof_member(struct proc_ops, proc_read) read = pde->proc_ops->proc_read;

        if (!read)
                return -EIO;

        return read(file, buf, count, ppos);
}

/*
 * ->read for regular proc files: pde_read(), bracketed by
 * use_pde()/unuse_pde() unless the entry is permanent.  Returns -EIO when
 * use_pde() fails.
 */
static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        ssize_t rv;

        if (pde_is_permanent(pde))
                return pde_read(pde, file, buf, count, ppos);

        if (!use_pde(pde))
                return -EIO;

        rv = pde_read(pde, file, buf, count, ppos);
        unuse_pde(pde);
        return rv;
}

/* Call the entry's ->proc_write hook; -EIO when the hook is not set. */
static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
        typeof_member(struct proc_ops, proc_write) write = pde->proc_ops->proc_write;

        if (!write)
                return -EIO;

        return write(file, buf, count, ppos);
}

/*
 * ->write for regular proc files: pde_write(), bracketed by
 * use_pde()/unuse_pde() unless the entry is permanent.  Returns -EIO when
 * use_pde() fails.
 */
static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        ssize_t rv;

        if (pde_is_permanent(pde))
                return pde_write(pde, file, buf, count, ppos);

        if (!use_pde(pde))
                return -EIO;

        rv = pde_write(pde, file, buf, count, ppos);
        unuse_pde(pde);
        return rv;
}

/* Call the entry's ->proc_poll hook; DEFAULT_POLLMASK when it is not set. */
static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
{
        typeof_member(struct proc_ops, proc_poll) poll = pde->proc_ops->proc_poll;

        if (!poll)
                return DEFAULT_POLLMASK;

        return poll(file, pts);
}

/*
 * ->poll for regular proc files: pde_poll(), bracketed by
 * use_pde()/unuse_pde() unless the entry is permanent.  Returns
 * DEFAULT_POLLMASK when use_pde() fails.
 */
static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        __poll_t rv;

        if (pde_is_permanent(pde))
                return pde_poll(pde, file, pts);

        if (!use_pde(pde))
                return DEFAULT_POLLMASK;

        rv = pde_poll(pde, file, pts);
        unuse_pde(pde);
        return rv;
}

/* Call the entry's ->proc_ioctl hook; -ENOTTY when the hook is not set. */
static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
        typeof_member(struct proc_ops, proc_ioctl) ioctl = pde->proc_ops->proc_ioctl;

        if (!ioctl)
                return -ENOTTY;

        return ioctl(file, cmd, arg);
}

/*
 * ->unlocked_ioctl for regular proc files: pde_ioctl(), bracketed by
 * use_pde()/unuse_pde() unless the entry is permanent.  Returns -ENOTTY
 * when use_pde() fails.
 */
static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        long rv;

        if (pde_is_permanent(pde))
                return pde_ioctl(pde, file, cmd, arg);

        if (!use_pde(pde))
                return -ENOTTY;

        rv = pde_ioctl(pde, file, cmd, arg);
        unuse_pde(pde);
        return rv;
}

#ifdef CONFIG_COMPAT
/* Call the entry's ->proc_compat_ioctl hook; -ENOTTY when it is not set. */
static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
        typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl = pde->proc_ops->proc_compat_ioctl;

        if (!compat_ioctl)
                return -ENOTTY;

        return compat_ioctl(file, cmd, arg);
}

/*
 * ->compat_ioctl for regular proc files: pde_compat_ioctl(), bracketed by
 * use_pde()/unuse_pde() unless the entry is permanent.  Returns -ENOTTY
 * when use_pde() fails.
 */
static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        long rv;

        if (pde_is_permanent(pde))
                return pde_compat_ioctl(pde, file, cmd, arg);

        if (!use_pde(pde))
                return -ENOTTY;

        rv = pde_compat_ioctl(pde, file, cmd, arg);
        unuse_pde(pde);
        return rv;
}
#endif

/* Call the entry's ->proc_mmap hook; -EIO when the hook is not set. */
static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
{
        typeof_member(struct proc_ops, proc_mmap) mmap = pde->proc_ops->proc_mmap;

        if (!mmap)
                return -EIO;

        return mmap(file, vma);
}

/*
 * ->mmap for regular proc files: pde_mmap(), bracketed by
 * use_pde()/unuse_pde() unless the entry is permanent.  Returns -EIO when
 * use_pde() fails.
 */
static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        int rv;

        if (pde_is_permanent(pde))
                return pde_mmap(pde, file, vma);

        if (!use_pde(pde))
                return -EIO;

        rv = pde_mmap(pde, file, vma);
        unuse_pde(pde);
        return rv;
}

/*
 * Use the entry's ->proc_get_unmapped_area hook when it is set.  Without a
 * hook, fall back to mm_get_unmapped_area() on CONFIG_MMU builds and to
 * @orig_addr otherwise.
 */
static unsigned long
pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr,
                           unsigned long len, unsigned long pgoff,
                           unsigned long flags)
{
        if (pde->proc_ops->proc_get_unmapped_area)
                return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags);

#ifdef CONFIG_MMU
        return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags);
#else
        return orig_addr;
#endif
}

/*
 * ->get_unmapped_area for regular proc files: pde_get_unmapped_area(),
 * bracketed by use_pde()/unuse_pde() unless the entry is permanent.
 * Returns -EIO (as unsigned long) when use_pde() fails.
 */
static unsigned long
proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
                           unsigned long len, unsigned long pgoff,
                           unsigned long flags)
{
        struct proc_dir_entry *pde = PDE(file_inode(file));
        unsigned long rv;

        if (pde_is_permanent(pde))
                return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);

        if (!use_pde(pde))
                return -EIO;

        rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags);
        unuse_pde(pde);
        return rv;
}

/*
 * ->open for regular proc files.
 *
 * Clears FMODE_LSEEK when the entry has no ->proc_lseek.  Permanent entries
 * just get their ->proc_open hook called.  For other entries the open runs
 * between use_pde()/unuse_pde(), and when the entry has a ->proc_release
 * hook a pde_opener record is queued on ->pde_openers after a successful
 * open.
 *
 * Returns the ->proc_open result (0 when there is no hook), -ENOENT when
 * use_pde() fails, or -ENOMEM when the opener record cannot be allocated.
 */
static int proc_reg_open(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *pde = PDE(inode);
        int rv = 0;
        typeof_member(struct proc_ops, proc_open) open;
        typeof_member(struct proc_ops, proc_release) release;
        struct pde_opener *pdeo;

        if (!pde->proc_ops->proc_lseek)
                file->f_mode &= ~FMODE_LSEEK;

        if (pde_is_permanent(pde)) {
                open = pde->proc_ops->proc_open;
                if (open)
                        rv = open(inode, file);
                return rv;
        }

        /*
         * Ensure that
         * 1) PDE's ->release hook will be called no matter what
         *    either normally by close()/->release, or forcefully by
         *    rmmod/remove_proc_entry.
         *
         * 2) rmmod isn't blocked by opening file in /proc and sitting on
         *    the descriptor (including "rmmod foo </proc/foo" scenario).
         *
         * Save every "struct file" with custom ->release hook.
         */
        if (!use_pde(pde))
                return -ENOENT;

        /* Allocate the record before ->proc_open runs. */
        release = pde->proc_ops->proc_release;
        if (release) {
                pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
                if (!pdeo) {
                        rv = -ENOMEM;
                        goto out_unuse;
                }
        }

        open = pde->proc_ops->proc_open;
        if (open)
                rv = open(inode, file);

        /* pdeo is only valid (and only touched) when release is set. */
        if (release) {
                if (rv == 0) {
                        /* To know what to release. */
                        pdeo->file = file;
                        pdeo->closing = false;
                        pdeo->c = NULL;
                        spin_lock(&pde->pde_unload_lock);
                        list_add(&pdeo->lh, &pde->pde_openers);
                        spin_unlock(&pde->pde_unload_lock);
                } else
                        kmem_cache_free(pde_opener_cache, pdeo);
        }

out_unuse:
        unuse_pde(pde);
        return rv;
}

/*
 * ->release for regular proc files.
 *
 * Permanent entries get their ->proc_release hook called directly and its
 * result returned (0 when there is no hook).  For other entries the opener
 * record matching @file is looked up on ->pde_openers and handed to
 * close_pdeo(); 0 is returned whether or not a record was found.
 */
static int proc_reg_release(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *pde = PDE(inode);
        struct pde_opener *pdeo;

        if (pde_is_permanent(pde)) {
                typeof_member(struct proc_ops, proc_release) release;

                release = pde->proc_ops->proc_release;
                if (release) {
                        return release(inode, file);
                }
                return 0;
        }

        spin_lock(&pde->pde_unload_lock);
        list_for_each_entry(pdeo, &pde->pde_openers, lh) {
                if (pdeo->file == file) {
                        /* close_pdeo() drops pde_unload_lock itself. */
                        close_pdeo(pde, pdeo);
                        return 0;
                }
        }
        spin_unlock(&pde->pde_unload_lock);
        return 0;
}

/*
 * File operations installed by proc_get_inode() on regular files whose
 * proc_ops has no proc_read_iter (and, under CONFIG_COMPAT, no
 * proc_compat_ioctl).
 */
static const struct file_operations proc_reg_file_ops = {
        /* lifetime */
        .open = proc_reg_open,
        .release = proc_reg_release,
        /* data path */
        .llseek = proc_reg_llseek,
        .read = proc_reg_read,
        .write = proc_reg_write,
        .poll = proc_reg_poll,
        /* control and mapping */
        .unlocked_ioctl = proc_reg_unlocked_ioctl,
        .mmap = proc_reg_mmap,
        .get_unmapped_area = proc_reg_get_unmapped_area,
};

/*
 * File operations installed by proc_get_inode() on regular files whose
 * proc_ops provides proc_read_iter (and, under CONFIG_COMPAT, no
 * proc_compat_ioctl). Reads go through ->read_iter / ->splice_read.
 */
static const struct file_operations proc_iter_file_ops = {
        /* lifetime */
        .open = proc_reg_open,
        .release = proc_reg_release,
        /* data path */
        .llseek = proc_reg_llseek,
        .read_iter = proc_reg_read_iter,
        .splice_read = copy_splice_read,
        .write = proc_reg_write,
        .poll = proc_reg_poll,
        /* control and mapping */
        .unlocked_ioctl = proc_reg_unlocked_ioctl,
        .mmap = proc_reg_mmap,
        .get_unmapped_area = proc_reg_get_unmapped_area,
};

#ifdef CONFIG_COMPAT
/*
 * Same as proc_reg_file_ops plus ->compat_ioctl; chosen by
 * proc_get_inode() when the entry's proc_ops has proc_compat_ioctl but no
 * proc_read_iter.
 */
static const struct file_operations proc_reg_file_ops_compat = {
        /* lifetime */
        .open = proc_reg_open,
        .release = proc_reg_release,
        /* data path */
        .llseek = proc_reg_llseek,
        .read = proc_reg_read,
        .write = proc_reg_write,
        .poll = proc_reg_poll,
        /* control and mapping */
        .unlocked_ioctl = proc_reg_unlocked_ioctl,
        .compat_ioctl = proc_reg_compat_ioctl,
        .mmap = proc_reg_mmap,
        .get_unmapped_area = proc_reg_get_unmapped_area,
};

/*
 * Same as proc_iter_file_ops plus ->compat_ioctl; chosen by
 * proc_get_inode() when the entry's proc_ops has both proc_compat_ioctl
 * and proc_read_iter.
 */
static const struct file_operations proc_iter_file_ops_compat = {
        /* lifetime */
        .open = proc_reg_open,
        .release = proc_reg_release,
        /* data path */
        .llseek = proc_reg_llseek,
        .read_iter = proc_reg_read_iter,
        .splice_read = copy_splice_read,
        .write = proc_reg_write,
        .poll = proc_reg_poll,
        /* control and mapping */
        .unlocked_ioctl = proc_reg_unlocked_ioctl,
        .compat_ioctl = proc_reg_compat_ioctl,
        .mmap = proc_reg_mmap,
        .get_unmapped_area = proc_reg_get_unmapped_area,
};
#endif

/*
 * Delayed-call callback registered by proc_get_link(): @p is the
 * proc_dir_entry on which use_pde() succeeded; release that use.
 */
static void proc_put_link(void *p)
{
        struct proc_dir_entry *pde = p;

        unuse_pde(pde);
}

/*
 * ->get_link: return the entry's ->data as the link target.
 *
 * The entry is pinned with use_pde() and @done is armed with
 * proc_put_link() so the pin is dropped once the caller runs the delayed
 * call. Returns ERR_PTR(-EINVAL) when use_pde() fails.
 */
static const char *proc_get_link(struct dentry *dentry,
                                 struct inode *inode,
                                 struct delayed_call *done)
{
        struct proc_dir_entry *entry = PDE(inode);

        if (use_pde(entry)) {
                set_delayed_call(done, proc_put_link, entry);
                return entry->data;
        }

        return ERR_PTR(-EINVAL);
}

/* Inode operations table that provides only ->get_link (proc_get_link). */
const struct inode_operations proc_link_inode_operations = {
        .get_link        = proc_get_link,
};

struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
        struct inode *inode = new_inode(sb);

        if (!inode) {
                pde_put(de);
                return NULL;
        }

        inode->i_private = de->data;
        inode->i_ino = de->low_ino;
        simple_inode_init_ts(inode);
        PROC_I(inode)->pde = de;
        if (is_empty_pde(de)) {
                make_empty_dir_inode(inode);
                return inode;
        }

        if (de->mode) {
                inode->i_mode = de->mode;
                inode->i_uid = de->uid;
                inode->i_gid = de->gid;
        }
        if (de->size)
                inode->i_size = de->size;
        if (de->nlink)
                set_nlink(inode, de->nlink);

        if (S_ISREG(inode->i_mode)) {
                inode->i_op = de->proc_iops;
                if (de->proc_ops->proc_read_iter)
                        inode->i_fop = &proc_iter_file_ops;
                else
                        inode->i_fop = &proc_reg_file_ops;
#ifdef CONFIG_COMPAT
                if (de->proc_ops->proc_compat_ioctl) {
                        if (de->proc_ops->proc_read_iter)
                                inode->i_fop = &proc_iter_file_ops_compat;
                        else
                                inode->i_fop = &proc_reg_file_ops_compat;
                }
#endif
        } else if (S_ISDIR(inode->i_mode)) {
                inode->i_op = de->proc_iops;
                inode->i_fop = de->proc_dir_ops;
        } else if (S_ISLNK(inode->i_mode)) {
                inode->i_op = de->proc_iops;
                inode->i_fop = NULL;
        } else {
                BUG();
        }
        return inode;
}















































    2 
   34 
































    4 
    4 
    8 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Wrapper functions for accessing the file_struct fd array.
 */

#ifndef __LINUX_FILE_H
#define __LINUX_FILE_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/posix_types.h>
#include <linux/errno.h>
#include <linux/cleanup.h>

struct file;

extern void fput(struct file *);

struct file_operations;
struct task_struct;
struct vfsmount;
struct dentry;
struct inode;
struct path;
extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *,
        const char *, int flags, const struct file_operations *);
extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount *,
        const char *, int flags, const struct file_operations *);
extern struct file *alloc_file_clone(struct file *, int flags,
        const struct file_operations *);

/* Call fput() on @file only when @fput_needed is non-zero. */
static inline void fput_light(struct file *file, int fput_needed)
{
        if (!fput_needed)
                return;

        fput(file);
}

/*
 * A file pointer paired with FDPUT_* flag bits that tell the put side
 * (fdput() / fdput_pos()) what has to be undone.
 */
struct fd {
        struct file *file;
        unsigned int flags;
};
/* Set in ->flags when fdput() must call fput() on ->file. */
#define FDPUT_FPUT       1
/* Set in ->flags when fdput_pos() must call __f_unlock_pos() on ->file. */
#define FDPUT_POS_UNLOCK 2

/* Release @fd: call fput() on the file only if FDPUT_FPUT is set. */
static inline void fdput(struct fd fd)
{
        if (!(fd.flags & FDPUT_FPUT))
                return;

        fput(fd.file);
}

extern struct file *fget(unsigned int fd);
extern struct file *fget_raw(unsigned int fd);
extern struct file *fget_task(struct task_struct *task, unsigned int fd);
extern unsigned long __fdget(unsigned int fd);
extern unsigned long __fdget_raw(unsigned int fd);
extern unsigned long __fdget_pos(unsigned int fd);
extern void __f_unlock_pos(struct file *);

static inline struct fd __to_fd(unsigned long v)
{
        return (struct fd){(struct file *)(v & ~3),v & 3};
}

/* Look up @fd via __fdget() and unpack the result into a struct fd. */
static inline struct fd fdget(unsigned int fd)
{
        unsigned long packed = __fdget(fd);

        return __to_fd(packed);
}

/* Look up @fd via __fdget_raw() and unpack the result into a struct fd. */
static inline struct fd fdget_raw(unsigned int fd)
{
        unsigned long packed = __fdget_raw(fd);

        return __to_fd(packed);
}

/*
 * Look up @fd via __fdget_pos() and unpack the result into a struct fd.
 * Pair with fdput_pos().
 */
static inline struct fd fdget_pos(int fd)
{
        unsigned long packed = __fdget_pos(fd);

        return __to_fd(packed);
}

/*
 * Release @f: call __f_unlock_pos() first when FDPUT_POS_UNLOCK is set,
 * then do the regular fdput().
 */
static inline void fdput_pos(struct fd f)
{
        unsigned int need_unlock = f.flags & FDPUT_POS_UNLOCK;

        if (need_unlock)
                __f_unlock_pos(f.file);

        fdput(f);
}

/*
 * Scope-based cleanup classes (DEFINE_CLASS from <linux/cleanup.h>) over
 * struct fd:
 *   fd     - initialised with fdget(fd),     released with fdput()
 *   fd_raw - initialised with fdget_raw(fd), released with fdput()
 */
DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd)
DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd)

extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
extern void set_close_on_exec(unsigned int fd, int flag);
extern bool get_close_on_exec(unsigned int fd);
extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile);
extern int get_unused_fd_flags(unsigned flags);
extern void put_unused_fd(unsigned int fd);

/*
 * Scope-based cleanup class over an int descriptor number: initialised
 * with get_unused_fd_flags(flags); on release put_unused_fd() is called
 * only when the value is >= 0.
 */
DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
             get_unused_fd_flags(flags), unsigned flags)

extern void fd_install(unsigned int fd, struct file *file);

int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags);

int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);

extern void flush_delayed_fput(void);
extern void __fput_sync(struct file *);

extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;

#endif /* __LINUX_FILE_H */


























































































































































































































































































































































































































































































































































































































    2 







































    2 





































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#ifndef BTRFS_VOLUMES_H
#define BTRFS_VOLUMES_H

#include <linux/blk_types.h>
#include <linux/sizes.h>
#include <linux/atomic.h>
#include <linux/sort.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/log2.h>
#include <linux/kobject.h>
#include <linux/refcount.h>
#include <linux/completion.h>
#include <linux/rbtree.h>
#include <uapi/linux/btrfs.h>
#include "messages.h"
#include "rcu-string.h"

struct block_device;
struct bdev_handle;
struct btrfs_fs_info;
struct btrfs_block_group;
struct btrfs_trans_handle;
struct btrfs_zoned_device_info;

#define BTRFS_MAX_DATA_CHUNK_SIZE        (10ULL * SZ_1G)

extern struct mutex uuid_mutex;

#define BTRFS_STRIPE_LEN                SZ_64K
#define BTRFS_STRIPE_LEN_SHIFT                (16)
#define BTRFS_STRIPE_LEN_MASK                (BTRFS_STRIPE_LEN - 1)

static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);

/*
 * Used by sanity check for btrfs_raid_types.
 *
 * 1-based position of the lowest set bit of @n: the trailing-zero count
 * from __builtin_ctzll() plus one. @n must be non-zero, since the builtin's
 * result is undefined for 0.
 */
#define const_ffs(n) (__builtin_ctzll(n) + 1)

/*
 * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires
 * RAID0 always to be the lowest profile bit.
 * Although it's part of on-disk format and should never change, do extra
 * compile-time sanity checks.
 */
static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
              const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
              ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));

/*
 * Map a single BTRFS_BLOCK_GROUP_* profile bit to its btrfs_raid_types
 * index. The profile is shifted right so that the RAID0 bit lands on bit 1,
 * hence RAID0 maps to index 1 and each higher profile bit to the next index.
 *
 * ilog2() can handle both constants and variables
 */
#define BTRFS_BG_FLAG_TO_INDEX(profile)                                        \
        ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1))

/*
 * RAID profile indexes. Every value except SINGLE is derived from the
 * matching BTRFS_BLOCK_GROUP_* bit through BTRFS_BG_FLAG_TO_INDEX(), so the
 * numbering follows the bit positions rather than the order listed here.
 * BTRFS_NR_RAID_TYPES sizes btrfs_raid_array[].
 */
enum btrfs_raid_types {
        /* SINGLE is the special one as it doesn't have on-disk bit. */
        BTRFS_RAID_SINGLE  = 0,

        BTRFS_RAID_RAID0   = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0),
        BTRFS_RAID_RAID1   = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1),
        BTRFS_RAID_DUP           = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP),
        BTRFS_RAID_RAID10  = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10),
        BTRFS_RAID_RAID5   = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5),
        BTRFS_RAID_RAID6   = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6),
        BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3),
        BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4),

        /* One past the last index listed above. */
        BTRFS_NR_RAID_TYPES
};

/*
 * Use sequence counter to get consistent device stat data on
 * 32-bit processors.
 *
 * On 32-bit SMP builds, __BTRFS_NEED_DEVICE_DATA_ORDERED is defined and
 * btrfs_device_data_ordered_init() initialises device->data_seqcount.
 * On every other configuration the init macro expands to an empty
 * statement.
 */
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __BTRFS_NEED_DEVICE_DATA_ORDERED
#define btrfs_device_data_ordered_init(device)        \
        seqcount_init(&device->data_seqcount)
#else
#define btrfs_device_data_ordered_init(device) do { } while (0)
#endif

/*
 * Device state identifiers.
 * NOTE(review): presumably bit numbers within btrfs_device::dev_state -
 * confirm against the users of these constants.
 */
#define BTRFS_DEV_STATE_WRITEABLE        (0)
#define BTRFS_DEV_STATE_IN_FS_METADATA        (1)
#define BTRFS_DEV_STATE_MISSING                (2)
#define BTRFS_DEV_STATE_REPLACE_TGT        (3)
#define BTRFS_DEV_STATE_FLUSH_SENT        (4)
#define BTRFS_DEV_STATE_NO_READA        (5)

/* Special value encoding failure to write primary super block. */
#define BTRFS_SUPER_PRIMARY_WRITE_ERROR                (INT_MAX / 2)

struct btrfs_fs_devices;

/*
 * In-memory state of a single device. It points back to its owning
 * btrfs_fs_devices and carries three list hooks (dev_list, dev_alloc_list,
 * post_commit_list) for the lists it can be linked on.
 */
struct btrfs_device {
        struct list_head dev_list; /* device_list_mutex */
        struct list_head dev_alloc_list; /* chunk mutex */
        struct list_head post_commit_list; /* chunk mutex */
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_fs_info *fs_info;

        /* RCU-managed pointer (see the __rcu annotation). */
        struct rcu_string __rcu *name;

        u64 generation;

        struct file *bdev_file;
        struct block_device *bdev;

        struct btrfs_zoned_device_info *zone_info;

        /*
         * Device's major-minor number. Must be set even if the device is not
         * opened (bdev == NULL), unless the device is missing.
         */
        dev_t devt;
        /* NOTE(review): presumably holds BTRFS_DEV_STATE_* bits - confirm. */
        unsigned long dev_state;
        blk_status_t last_flush_error;

#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
        /* Used by the btrfs_device_get_/set_*() helpers on 32-bit SMP. */
        seqcount_t data_seqcount;
#endif

        /* the internal btrfs device id */
        u64 devid;

        /* size of the device in memory */
        u64 total_bytes;

        /* size of the device on disk */
        u64 disk_total_bytes;

        /* bytes used */
        u64 bytes_used;

        /* optimal io alignment for this device */
        u32 io_align;

        /* optimal io width for this device */
        u32 io_width;
        /* type and info about this device */
        u64 type;

        /*
         * Counter of super block write errors, values larger than
         * BTRFS_SUPER_PRIMARY_WRITE_ERROR encode primary super block write failure.
         */
        atomic_t sb_write_errors;

        /* minimal io size for this device */
        u32 sector_size;

        /* physical drive uuid (or lvm uuid) */
        u8 uuid[BTRFS_UUID_SIZE];

        /*
         * Size of the device in the current transaction.
         *
         * This value is updated when committing the transaction, and is
         * protected by the chunk mutex.
         */
        u64 commit_total_bytes;

        /* bytes used on the current transaction */
        u64 commit_bytes_used;

        /* Bio used for flushing device barriers */
        struct bio flush_bio;
        struct completion flush_wait;

        /* per-device scrub information */
        struct scrub_ctx *scrub_ctx;

        /*
         * Disk I/O failure stats. For a detailed description refer to
         * enum btrfs_dev_stat_values in ioctl.h.
         */
        int dev_stats_valid;

        /* Counter to record the change of device stats */
        atomic_t dev_stats_ccnt;
        atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];

        struct extent_io_tree alloc_state;

        struct completion kobj_unregister;
        /* For sysfs/FSID/devinfo/devid/ */
        struct kobject devid_kobj;

        /* Bandwidth limit for scrub, in bytes */
        u64 scrub_speed_max;
};

/*
 * Block group or device which contains an active swapfile. Used for preventing
 * unsafe operations while a swapfile is active.
 *
 * These are sorted on (ptr, inode) (note that a block group or device can
 * contain more than one swapfile). We compare the pointer values because we
 * don't actually care what the object is, we just need a quick check whether
 * the object exists in the rbtree.
 */
struct btrfs_swapfile_pin {
        /* Linkage into the rbtree of pins. */
        struct rb_node node;
        /* The pinned object; first part of the sort key. */
        void *ptr;
        /* Second part of the sort key. */
        struct inode *inode;
        /*
         * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
         * points to a struct btrfs_device.
         */
        bool is_block_group;
        /*
         * Only used when 'is_block_group' is true and it is the number of
         * extents used by a swapfile for this block group ('ptr' field).
         */
        int bg_extent_count;
};

/*
 * BTRFS_DEVICE_GETSET_FUNCS(name) generates btrfs_device_get_<name>() and
 * btrfs_device_set_<name>() for the u64 field btrfs_device::<name>.
 *
 * If these fields are read while holding the lock that protects them, the
 * helpers below are unnecessary; reading the fields directly is safe.
 *
 * Three implementations are selected at compile time:
 *  - 32-bit SMP: readers retry on dev->data_seqcount; writers update inside
 *    write_seqcount_begin()/write_seqcount_end() with preemption disabled.
 *  - 32-bit with CONFIG_PREEMPTION (non-SMP): both sides access the field
 *    with preemption disabled.
 *  - everything else: plain load and store.
 */
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#define BTRFS_DEVICE_GETSET_FUNCS(name)                                        \
static inline u64                                                        \
btrfs_device_get_##name(const struct btrfs_device *dev)                        \
{                                                                        \
        u64 size;                                                        \
        unsigned int seq;                                                \
                                                                        \
        do {                                                                \
                seq = read_seqcount_begin(&dev->data_seqcount);                \
                size = dev->name;                                        \
        } while (read_seqcount_retry(&dev->data_seqcount, seq));        \
        return size;                                                        \
}                                                                        \
                                                                        \
static inline void                                                        \
btrfs_device_set_##name(struct btrfs_device *dev, u64 size)                \
{                                                                        \
        preempt_disable();                                                \
        write_seqcount_begin(&dev->data_seqcount);                        \
        dev->name = size;                                                \
        write_seqcount_end(&dev->data_seqcount);                        \
        preempt_enable();                                                \
}
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
#define BTRFS_DEVICE_GETSET_FUNCS(name)                                        \
static inline u64                                                        \
btrfs_device_get_##name(const struct btrfs_device *dev)                        \
{                                                                        \
        u64 size;                                                        \
                                                                        \
        preempt_disable();                                                \
        size = dev->name;                                                \
        preempt_enable();                                                \
        return size;                                                        \
}                                                                        \
                                                                        \
static inline void                                                        \
btrfs_device_set_##name(struct btrfs_device *dev, u64 size)                \
{                                                                        \
        preempt_disable();                                                \
        dev->name = size;                                                \
        preempt_enable();                                                \
}
#else
#define BTRFS_DEVICE_GETSET_FUNCS(name)                                        \
static inline u64                                                        \
btrfs_device_get_##name(const struct btrfs_device *dev)                        \
{                                                                        \
        return dev->name;                                                \
}                                                                        \
                                                                        \
static inline void                                                        \
btrfs_device_set_##name(struct btrfs_device *dev, u64 size)                \
{                                                                        \
        dev->name = size;                                                \
}
#endif

/*
 * Instantiate btrfs_device_get_*() / btrfs_device_set_*() for the
 * total_bytes, disk_total_bytes and bytes_used fields of struct btrfs_device.
 */
BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
BTRFS_DEVICE_GETSET_FUNCS(bytes_used);

/*
 * Chunk allocation policy stored in btrfs_fs_devices::chunk_alloc_policy.
 * NOTE(review): ZONED is presumably selected for zoned devices - confirm
 * where the policy is assigned.
 */
enum btrfs_chunk_allocation_policy {
        BTRFS_CHUNK_ALLOC_REGULAR,
        BTRFS_CHUNK_ALLOC_ZONED,
};

/*
 * Read policies for mirrored block group profiles; a read picks the stripe
 * based on the policy stored in btrfs_fs_devices::read_policy.
 */
enum btrfs_read_policy {
        /* Use process PID to choose the stripe */
        BTRFS_READ_POLICY_PID,
        /* Number of policies defined above; not a policy itself. */
        BTRFS_NR_READ_POLICY,
};

#ifdef CONFIG_BTRFS_DEBUG
/*
 * Checksum mode - offload it to workqueues or do it synchronously in
 * btrfs_submit_chunk().
 *
 * Only available when CONFIG_BTRFS_DEBUG is enabled; stored in
 * btrfs_fs_devices::offload_csum_mode.
 */
enum btrfs_offload_csum_mode {
        /*
         * Choose offloading checksum or do it synchronously automatically.
         * Do it synchronously if the checksum is fast, or offload to workqueues
         * otherwise.
         */
        BTRFS_OFFLOAD_CSUM_AUTO,
        /* Always offload checksum to workqueues. */
        BTRFS_OFFLOAD_CSUM_FORCE_ON,
        /* Never offload checksum to workqueues. */
        BTRFS_OFFLOAD_CSUM_FORCE_OFF,
};
#endif

/*
 * The set of devices that make up one filesystem (one fsid), together with
 * device counters, the locks protecting the device lists, sysfs kobjects
 * and per-filesystem allocation/read policies.
 */
struct btrfs_fs_devices {
        u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */

        /*
         * UUID written into the btree blocks:
         *
         * - If metadata_uuid != fsid then super block must have
         *   BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag set.
         *
         * - Following shall be true at all times:
         *   - metadata_uuid == btrfs_header::fsid
         *   - metadata_uuid == btrfs_dev_item::fsid
         *
         * - Relations between fsid and metadata_uuid in sb and fs_devices:
         *   - Normal:
         *       fs_devices->fsid == fs_devices->metadata_uuid == sb->fsid
         *       sb->metadata_uuid == 0
         *
         *   - When the BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag is set:
         *       fs_devices->fsid == sb->fsid
         *       fs_devices->metadata_uuid == sb->metadata_uuid
         *
         *   - When in-memory fs_devices->temp_fsid is true
         *         fs_devices->fsid = random
         *         fs_devices->metadata_uuid == sb->fsid
         */
        u8 metadata_uuid[BTRFS_FSID_SIZE];

        struct list_head fs_list;

        /*
         * Number of devices under this fsid including missing and
         * replace-target device and excludes seed devices.
         */
        u64 num_devices;

        /*
         * The number of devices that successfully opened, including
         * replace-target, excludes seed devices.
         */
        u64 open_devices;

        /* The number of devices that are under the chunk allocation list. */
        u64 rw_devices;

        /* Count of missing devices under this fsid excluding seed device. */
        u64 missing_devices;
        u64 total_rw_bytes;

        /*
         * Count of devices from btrfs_super_block::num_devices for this fsid,
         * which includes the seed device, excludes the transient replace-target
         * device.
         */
        u64 total_devices;

        /* Highest generation number of seen devices */
        u64 latest_generation;

        /*
         * The mount device or a device with highest generation after removal
         * or replace.
         */
        struct btrfs_device *latest_dev;

        /*
         * All of the devices in the filesystem, protected by a mutex so we can
         * safely walk it to write out the super blocks without worrying about
         * adding/removing by the multi-device code. Scrubbing super block can
         * kick off supers writing by holding this mutex lock.
         */
        struct mutex device_list_mutex;

        /* List of all devices, protected by device_list_mutex */
        struct list_head devices;

        /* Devices which can satisfy space allocation. Protected by chunk_mutex. */
        struct list_head alloc_list;

        struct list_head seed_list;

        /* Count fs-devices opened. */
        int opened;

        /* Set when we find or add a device that doesn't have the nonrot flag set. */
        bool rotating;
        /* Devices support TRIM/discard commands. */
        bool discardable;
        /* The filesystem is a seed filesystem. */
        bool seeding;
        /* The mount needs to use a randomly generated fsid. */
        bool temp_fsid;

        struct btrfs_fs_info *fs_info;
        /* sysfs kobjects */
        struct kobject fsid_kobj;
        struct kobject *devices_kobj;
        struct kobject *devinfo_kobj;
        struct completion kobj_unregister;

        enum btrfs_chunk_allocation_policy chunk_alloc_policy;

        /* Policy used to read the mirrored stripes. */
        enum btrfs_read_policy read_policy;

#ifdef CONFIG_BTRFS_DEBUG
        /* Checksum mode - offload it or do it synchronously. */
        enum btrfs_offload_csum_mode offload_csum_mode;
#endif
};

/*
 * Number of struct btrfs_stripe entries that fit in the largest item after
 * one struct btrfs_chunk, plus one.
 * NOTE(review): the "+ 1" presumably accounts for a stripe already embedded
 * in struct btrfs_chunk - confirm against its definition.
 */
#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info)        \
                        - sizeof(struct btrfs_chunk))                \
                        / sizeof(struct btrfs_stripe) + 1)

/*
 * Same computation against the system chunk array: the space left after two
 * disk keys and two struct btrfs_chunk headers, divided by the stripe size,
 * plus one.
 * NOTE(review): the factor of two presumably reserves room for a second
 * system chunk entry - confirm the intent with the callers.
 */
#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE        \
                                - 2 * sizeof(struct btrfs_disk_key)        \
                                - 2 * sizeof(struct btrfs_chunk))        \
                                / sizeof(struct btrfs_stripe) + 1)

/*
 * One stripe of a mapping: the device plus the physical range on it.
 * Used as the element type of btrfs_io_context::stripes[] and
 * btrfs_chunk_map::stripes[].
 */
struct btrfs_io_stripe {
        struct btrfs_device *dev;
        /* Block mapping. */
        u64 physical;
        u64 length;
        bool is_scrub;
        /* For the endio handler. */
        struct btrfs_io_context *bioc;
};

/*
 * A device plus a physical range on it: the dev/physical/length fields of
 * btrfs_io_stripe without the I/O-specific ones.
 * NOTE(review): presumably describes a range to discard - confirm with users.
 */
struct btrfs_discard_stripe {
        struct btrfs_device *dev;
        u64 physical;
        u64 length;
};

/*
 * Context for IO submission for device stripe.
 *
 * - Track the unfinished mirrors for mirror based profiles
 *   Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
 *
 * - Contain the logical -> physical mapping info
 *   Used by submit_stripe_bio() for mapping logical bio
 *   into physical device address.
 *
 * - Contain device replace info
 *   Used by handle_ops_on_dev_replace() to copy logical bios
 *   into the new device.
 *
 * - Contain RAID56 full stripe logical bytenrs
 */
struct btrfs_io_context {
        /* Reference count of this context. */
        refcount_t refs;
        struct btrfs_fs_info *fs_info;
        /* Taken from struct btrfs_chunk_map::type. */
        u64 map_type;
        struct bio *orig_bio;
        atomic_t error;
        u16 max_errors;

        u64 logical;
        u64 size;
        /* Raid stripe tree ordered entry. */
        struct list_head rst_ordered_entry;

        /*
         * The total number of stripes, including the extra duplicated
         * stripe for replace.
         */
        u16 num_stripes;

        /*
         * The mirror_num of this bioc.
         *
         * This is for reads which use 0 as mirror_num, thus we should return a
         * valid mirror_num (>0) for the reader.
         */
        u16 mirror_num;

        /*
         * The following two members are for dev-replace case only.
         *
         * @replace_nr_stripes:        Number of duplicated stripes which need to be
         *                        written to replace target.
         *                        Should be <= 2 (2 for DUP, otherwise <= 1).
         * @replace_stripe_src:        The array indicates where the duplicated stripes
         *                        are from.
         *
         * The @replace_stripe_src[] array is mostly for RAID56 cases.
         * As non-RAID56 stripes share the same contents of the mapped range,
         * thus no need to bother where the duplicated ones are from.
         *
         * But for RAID56 case, all stripes contain different contents, thus
         * we need a way to know the mapping.
         *
         * There is an example for the two members, using a RAID5 write:
         *
         *   num_stripes:        4 (3 + 1 duplicated write)
         *   stripes[0]:        dev = devid 1, physical = X
         *   stripes[1]:        dev = devid 2, physical = Y
         *   stripes[2]:        dev = devid 3, physical = Z
         *   stripes[3]:        dev = devid 0, physical = Y
         *
         * replace_nr_stripes = 1
         * replace_stripe_src = 1        <- Means stripes[1] is involved in replace.
         *                                   The duplicated stripe index would be
         *                                   (@num_stripes - 1).
         *
         * Note, that we can still have cases replace_nr_stripes = 2 for DUP.
         * In that case, all stripes share the same content, thus we don't
         * need to bother @replace_stripe_src value at all.
         */
        u16 replace_nr_stripes;
        s16 replace_stripe_src;
        /*
         * Logical bytenr of the full stripe start, only for RAID56 cases.
         *
         * When this value is set to other than (u64)-1, the stripes[] should
         * follow this pattern:
         *
         * (real_stripes = num_stripes - replace_nr_stripes)
         * (data_stripes = (is_raid6) ? (real_stripes - 2) : (real_stripes - 1))
         *
         * stripes[0]:                        The first data stripe
         * stripes[1]:                        The second data stripe
         * ...
         * stripes[data_stripes - 1]:        The last data stripe
         * stripes[data_stripes]:        The P stripe
         * stripes[data_stripes + 1]:        The Q stripe (only for RAID6).
         */
        u64 full_stripe_logical;
        /* Flexible array; NOTE(review): presumably num_stripes entries - confirm. */
        struct btrfs_io_stripe stripes[];
};

/*
 * Pairs one device with three u64 figures describing space on it.
 *
 * NOTE(review): the per-field notes below are inferred from the field names
 * only; confirm them against the code that fills in this struct.
 */
struct btrfs_device_info {
        struct btrfs_device *dev;
        u64 dev_offset;         /* presumably a byte offset on @dev -- TODO confirm */
        u64 max_avail;          /* presumably the largest usable extent -- TODO confirm */
        u64 total_avail;        /* presumably the total free space -- TODO confirm */
};

/*
 * Static attributes of one RAID profile.  The table of these,
 * btrfs_raid_array[], has BTRFS_NR_RAID_TYPES entries.
 */
struct btrfs_raid_attr {
        u8 sub_stripes;                /* sub_stripes info for map */
        u8 dev_stripes;                /* stripes per dev */
        u8 devs_max;                /* max devs to use */
        u8 devs_min;                /* min devs needed */
        u8 tolerated_failures;        /* max number of failed devs tolerated */
        u8 devs_increment;        /* ndevs has to be a multiple of this */
        u8 ncopies;                /* how many copies of the data are kept */
        u8 nparity;                /* number of stripes worth of bytes used to
                                 * store parity information */
        u8 mindev_error;        /* error code if the devs_min requirement is unmet */
        const char raid_name[8]; /* name of the raid profile (at most 8 bytes) */
        u64 bg_flag;                /* block group flag of the raid profile */
};

extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];

/*
 * Reference-counted chunk mapping record with a trailing array of stripes.
 *
 * The allocation size for @n stripes is given by btrfs_chunk_map_size(n);
 * the last reference is dropped through btrfs_free_chunk_map().
 */
struct btrfs_chunk_map {
        /* Tree linkage; must be empty by the time the map is freed. */
        struct rb_node rb_node;
        /* For mount time dev extent verification. */
        int verified_stripes;
        /* Reference count; the map is kfree()d when it drops to zero. */
        refcount_t refs;
        u64 start;
        u64 chunk_len;
        u64 stripe_size;
        u64 type;
        int io_align;
        int io_width;
        /* NOTE(review): presumably the number of entries in stripes[] -- confirm. */
        int num_stripes;
        int sub_stripes;
        /* Flexible array member; must stay the last field. */
        struct btrfs_io_stripe stripes[];
};

/*
 * Number of bytes needed for a struct btrfs_chunk_map whose trailing
 * stripes[] array holds @n entries.
 */
#define btrfs_chunk_map_size(n)                                         \
        (sizeof(struct btrfs_chunk_map) +                               \
         ((n) * sizeof(struct btrfs_io_stripe)))

/*
 * Drop one reference on @map and free it once the last reference is gone.
 *
 * A NULL @map is accepted and ignored.  By the time the final reference is
 * dropped the map is expected to be unlinked from its rb-tree (asserted).
 */
static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map)
{
        if (!map)
                return;

        /* Somebody else still holds a reference: nothing more to do. */
        if (!refcount_dec_and_test(&map->refs))
                return;

        ASSERT(RB_EMPTY_NODE(&map->rb_node));
        kfree(map);
}

/*
 * State of a balance operation: one btrfs_balance_args set each for the
 * data, meta and sys members, a flags word and a progress record.
 *
 * NOTE(review): that the three argument sets correspond to data, metadata
 * and system block groups is inferred from the member names -- confirm.
 */
struct btrfs_balance_control {
        struct btrfs_balance_args data;
        struct btrfs_balance_args meta;
        struct btrfs_balance_args sys;

        u64 flags;

        struct btrfs_balance_progress stat;
};

/*
 * Search criteria for looking up a device.
 *
 * Declare instances with BTRFS_DEV_LOOKUP_ARGS() (or initialise them with
 * BTRFS_DEV_LOOKUP_ARGS_INIT) so that @devid starts out as (u64)-1 rather
 * than 0; 0 is a real devid (BTRFS_DEV_REPLACE_DEVID).
 *
 * NOTE(review): exactly which fields count as "set" and how they are
 * combined is decided by the lookup code, not visible here -- confirm.
 */
struct btrfs_dev_lookup_args {
        u64 devid;
        u8 *uuid;
        u8 *fsid;
        bool missing;
};

/*
 * Initializer for struct btrfs_dev_lookup_args.
 *
 * devid has to be initialized to (u64)-1 because 0 is a real devid
 * (BTRFS_DEV_REPLACE_DEVID).  All other members are zero-initialized by the
 * designated initializer.
 */
#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 }

/* Declare a struct btrfs_dev_lookup_args called @name, initialized as above. */
#define BTRFS_DEV_LOOKUP_ARGS(name) \
        struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT

/*
 * Kind of mapping operation requested from btrfs_map_block().
 *
 * btrfs_op() only ever derives BTRFS_MAP_READ or BTRFS_MAP_WRITE from a bio;
 * BTRFS_MAP_GET_READ_MIRRORS has to be requested explicitly by a caller.
 */
enum btrfs_map_op {
        BTRFS_MAP_READ,
        BTRFS_MAP_WRITE,
        BTRFS_MAP_GET_READ_MIRRORS,
};

/*
 * Translate the request operation of @bio into a btrfs mapping operation.
 *
 * Writes and zone appends map to BTRFS_MAP_WRITE, reads to BTRFS_MAP_READ.
 * Any other operation is unexpected: it triggers a one-time warning and is
 * then treated as a read.
 */
static inline enum btrfs_map_op btrfs_op(struct bio *bio)
{
        switch (bio_op(bio)) {
        case REQ_OP_READ:
                return BTRFS_MAP_READ;
        case REQ_OP_WRITE:
        case REQ_OP_ZONE_APPEND:
                return BTRFS_MAP_WRITE;
        default:
                break;
        }

        /* Unknown operation: complain once, then fall back to a read. */
        WARN_ON_ONCE(1);
        return BTRFS_MAP_READ;
}

/*
 * Size in bytes of a chunk item describing @num_stripes stripes.
 *
 * The size of struct btrfs_chunk already accounts for one stripe, so only
 * (@num_stripes - 1) additional struct btrfs_stripe entries are added.
 * @num_stripes must be non-zero (asserted).
 */
static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
        ASSERT(num_stripes);

        return sizeof(struct btrfs_stripe) * (num_stripes - 1) +
               sizeof(struct btrfs_chunk);
}

/*
 * Convert a stripe number into a byte offset inside the chunk, type safely.
 *
 * @stripe_nr is a u32, so shifting it left directly can overflow 32 bits for
 * chunks larger than 4G.  Widen it to u64 first, then shift by
 * BTRFS_STRIPE_LEN_SHIFT.
 */
static inline u64 btrfs_stripe_nr_to_offset(u32 stripe_nr)
{
        const u64 wide_nr = stripe_nr;

        return wide_nr << BTRFS_STRIPE_LEN_SHIFT;
}

void btrfs_get_bioc(struct btrfs_io_context *bioc);
void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                    u64 logical, u64 *length,
                    struct btrfs_io_context **bioc_ret,
                    struct btrfs_io_stripe *smap, int *mirror_num_ret);
int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
                           struct btrfs_io_stripe *smap, u64 logical,
                           u32 length, int mirror_num);
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
                                               u64 logical, u64 *length_ret,
                                               u32 *num_stripes);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
                                            u64 type);
void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       blk_mode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
                                           bool mount_arg_dev);
int btrfs_forget_devices(dev_t devt);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
void btrfs_assign_next_active_device(struct btrfs_device *device,
                                     struct btrfs_device *this_dev);
struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
                                                  u64 devid,
                                                  const char *devpath);
int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
                                 struct btrfs_dev_lookup_args *args,
                                 const char *path);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
                                        const u64 *devid, const u8 *uuid,
                                        const char *path);
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
                    struct btrfs_dev_lookup_args *args,
                    struct file **bdev_file);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
                      struct btrfs_device *device, u64 new_size);
struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
                                       const struct btrfs_dev_lookup_args *args);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
                  struct btrfs_balance_control *bctl,
                  struct btrfs_ioctl_balance_args *bargs);
void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_uuid_scan_kthread(void *data);
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
                        struct btrfs_ioctl_get_dev_stats *stats);
int btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
                           u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
                                    u64 logical);
u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
int btrfs_nr_parity_stripes(u64 type);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
                                     struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp);
int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
#endif

struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
                                             u64 logical, u64 length);
struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
                                                    u64 logical, u64 length);
struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
                                            u64 logical, u64 length);
void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
void btrfs_release_disk_super(struct btrfs_super_block *super);

/*
 * Increment the statistics counter @index of @dev and bump the device's
 * dev_stats_ccnt change counter.
 *
 * @index selects an entry of dev->dev_stat_values; it is not range-checked
 * here.
 */
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
                                      int index)
{
        atomic_inc(dev->dev_stat_values + index);
        /*
         * This memory barrier orders stores updating statistics before stores
         * updating dev_stats_ccnt.
         *
         * It pairs with smp_rmb() in btrfs_run_dev_stats().
         */
        smp_mb__before_atomic();
        atomic_inc(&dev->dev_stats_ccnt);
}

/*
 * Return the current value of the statistics counter @index of @dev.
 *
 * This is a plain atomic read; it neither resets the counter nor touches
 * dev_stats_ccnt.
 */
static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
                                      int index)
{
        return atomic_read(&dev->dev_stat_values[index]);
}

/*
 * Atomically fetch the statistics counter @index of @dev and reset it to 0.
 *
 * Returns the value the counter held before the reset.  dev_stats_ccnt is
 * bumped afterwards to record the change.
 */
static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
                                                int index)
{
        int ret;

        ret = atomic_xchg(dev->dev_stat_values + index, 0);
        /*
         * atomic_xchg() implies a full memory barrier, as per atomic_t.txt:
         * - RMW operations that have a return value are fully ordered;
         *
         * This implicit memory barrier pairs with the smp_rmb() in
         * btrfs_run_dev_stats().
         */
        atomic_inc(&dev->dev_stats_ccnt);
        return ret;
}

/*
 * Set the statistics counter @index of @dev to @val and bump the device's
 * dev_stats_ccnt change counter.
 *
 * NOTE(review): @val is an unsigned long but is handed to atomic_set();
 * if the counters are plain atomic_t, values that do not fit in an int would
 * be truncated -- confirm callers never pass such values.
 */
static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
                                      int index, unsigned long val)
{
        atomic_set(dev->dev_stat_values + index, val);
        /*
         * This memory barrier orders stores updating statistics before stores
         * updating dev_stats_ccnt.
         *
         * It pairs with smp_rmb() in btrfs_run_dev_stats().
         */
        smp_mb__before_atomic();
        atomic_inc(&dev->dev_stats_ccnt);
}

/*
 * Return a printable name for @device.
 *
 * A NULL @device, or one flagged BTRFS_DEV_STATE_MISSING, yields the fixed
 * string "<missing disk>"; otherwise the device's RCU-managed name string is
 * dereferenced and returned.
 */
static inline const char *btrfs_dev_name(const struct btrfs_device *device)
{
        if (device && !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                return rcu_str_deref(device->name);

        return "<missing disk>";
}

void btrfs_commit_device_sizes(struct btrfs_transaction *trans);

struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
                                        struct btrfs_device *failing_dev);
void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device);

enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags);
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);

bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb);

#endif


















































    4 













































    4 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM signal

#if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SIGNAL_H

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/tracepoint.h>

/*
 * Store the errno and code of @info into @__entry.
 *
 * @info may be one of the sentinels SEND_SIG_NOINFO or SEND_SIG_PRIV, which
 * are not real pointers and must not be dereferenced; they are recorded as
 * errno 0 with code SI_USER and SI_KERNEL respectively.  Any other value is
 * dereferenced for si_errno and si_code.
 *
 * Both arguments are parenthesized on every use so that expressions such as
 * "&entry" or "cond ? a : b" expand correctly.  Note that @info is still
 * evaluated more than once, so it must not have side effects.
 */
#define TP_STORE_SIGINFO(__entry, info)                                \
        do {                                                        \
                if ((info) == SEND_SIG_NOINFO) {                \
                        (__entry)->errno        = 0;                \
                        (__entry)->code                = SI_USER;        \
                } else if ((info) == SEND_SIG_PRIV) {                \
                        (__entry)->errno        = 0;                \
                        (__entry)->code                = SI_KERNEL;        \
                } else {                                        \
                        (__entry)->errno        = (info)->si_errno;        \
                        (__entry)->code                = (info)->si_code;        \
                }                                                \
        } while (0)

/*
 * Values for the 'result' argument of the signal_generate tracepoint
 * (documented there as TRACE_SIGNAL_*).
 *
 * The enum is only emitted on the first read of this header: the header may
 * be re-read with TRACE_HEADER_MULTI_READ defined, and the enum must not be
 * defined twice.
 */
#ifndef TRACE_HEADER_MULTI_READ
enum {
        TRACE_SIGNAL_DELIVERED,
        TRACE_SIGNAL_IGNORED,
        TRACE_SIGNAL_ALREADY_PENDING,
        TRACE_SIGNAL_OVERFLOW_FAIL,
        TRACE_SIGNAL_LOSE_INFO,
};
#endif

/**
 * signal_generate - called when a signal is generated
 * @sig: signal number
 * @info: pointer to struct siginfo
 * @task: pointer to struct task_struct
 * @group: shared or private
 * @result: TRACE_SIGNAL_*
 *
 * Current process sends a 'sig' signal to 'task' process with
 * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV,
 * 'info' is not a pointer and you can't access its fields. Instead,
 * SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV
 * means that si_code is SI_KERNEL.
 */
TRACE_EVENT(signal_generate,

        TP_PROTO(int sig, struct kernel_siginfo *info, struct task_struct *task,
                        int group, int result),

        TP_ARGS(sig, info, task, group, result),

        /* Recorded fields; comm is copied by value (TASK_COMM_LEN bytes). */
        TP_STRUCT__entry(
                __field(        int,        sig                        )
                __field(        int,        errno                        )
                __field(        int,        code                        )
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        group                        )
                __field(        int,        result                        )
        ),

        TP_fast_assign(
                __entry->sig        = sig;
                /* info may be a SEND_SIG_* sentinel; the macro handles that. */
                TP_STORE_SIGINFO(__entry, info);
                memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
                __entry->pid        = task->pid;
                __entry->group        = group;
                __entry->result        = result;
        ),

        TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d grp=%d res=%d",
                  __entry->sig, __entry->errno, __entry->code,
                  __entry->comm, __entry->pid, __entry->group,
                  __entry->result)
);

/**
 * signal_deliver - called when a signal is delivered
 * @sig: signal number
 * @info: pointer to struct siginfo
 * @ka: pointer to struct k_sigaction
 *
 * A 'sig' signal is delivered to the current process with 'info' siginfo,
 * and it will be handled by 'ka'. ka->sa.sa_handler can be SIG_IGN or
 * SIG_DFL.
 * Note that some signals reported by the signal_generate tracepoint can be
 * lost, ignored or modified (by a debugger) before hitting this tracepoint.
 * This means that this tracepoint shows which signals are actually
 * delivered, but matching generated signals with delivered signals may not
 * be accurate.
 */
TRACE_EVENT(signal_deliver,

        TP_PROTO(int sig, struct kernel_siginfo *info, struct k_sigaction *ka),

        TP_ARGS(sig, info, ka),

        TP_STRUCT__entry(
                __field(        int,                sig                )
                __field(        int,                errno                )
                __field(        int,                code                )
                __field(        unsigned long,        sa_handler        )
                __field(        unsigned long,        sa_flags        )
        ),

        TP_fast_assign(
                __entry->sig        = sig;
                /* info may be a SEND_SIG_* sentinel; the macro handles that. */
                TP_STORE_SIGINFO(__entry, info);
                /* The handler is recorded as a raw address, printed in hex. */
                __entry->sa_handler        = (unsigned long)ka->sa.sa_handler;
                __entry->sa_flags        = ka->sa.sa_flags;
        ),

        TP_printk("sig=%d errno=%d code=%d sa_handler=%lx sa_flags=%lx",
                  __entry->sig, __entry->errno, __entry->code,
                  __entry->sa_handler, __entry->sa_flags)
);

#endif /* _TRACE_SIGNAL_H */

/* This part must be outside protection */
#include <trace/define_trace.h>








































































































































































































    1 





    1 





















    1 


    1 


    1 
















































































    1 











    1 








    1 







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/fat/misc.c
 *
 *  Written 1992,1993 by Werner Almesberger
 *  22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980
 *                 and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru)
 */

#include "fat.h"
#include <linux/iversion.h>

/*
 * __fat_fs_error() reports a file system problem that might indicate data
 * corruption or inconsistency.
 *
 * If @report is non-zero, the printf-style message @fmt is logged first at
 * KERN_ERR level.  What happens next depends on the 'errors' mount option:
 * panic() is called, or nothing further is done, or the filesystem is
 * switched to read-only (default behavior) unless it already is read-only.
 * A file system that was switched to read-only can be made writable again
 * by remounting it.
 */
void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
{
        struct fat_mount_options *opts = &MSDOS_SB(sb)->options;

        if (report) {
                struct va_format vaf;
                va_list args;

                va_start(args, fmt);
                vaf.fmt = fmt;
                vaf.va = &args;
                fat_msg(sb, KERN_ERR, "error, %pV", &vaf);
                va_end(args);
        }

        switch (opts->errors) {
        case FAT_ERRORS_PANIC:
                panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
                break;
        case FAT_ERRORS_RO:
                /* Only flip (and announce) the flag on the first error. */
                if (!sb_rdonly(sb)) {
                        sb->s_flags |= SB_RDONLY;
                        fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
                }
                break;
        default:
                break;
        }
}
EXPORT_SYMBOL_GPL(__fat_fs_error);

/**
 * _fat_msg() - Print a preformatted FAT message based on a superblock.
 * @sb: A pointer to a &struct super_block
 * @level: A Kernel printk level constant
 * @fmt: The printf-style format string to print.
 *
 * Everything that is not fat_fs_error() should be fat_msg().
 *
 * fat_msg() wraps _fat_msg() for printk indexing.
 */
void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        _printk(FAT_PRINTK_PREFIX "%pV\n", level, sb->s_id, &vaf);
        va_end(args);
}

/*
 * Flush the cached free-cluster information to the FSINFO sector (FAT32
 * only; a no-op returning 0 on other FAT variants).
 *
 * XXX: Need to write one per FSINFO block.  Currently only writes 1.
 *
 * Returns 0 on success, including the case where the FSINFO signature is
 * invalid (which is only logged), or -EIO if the sector cannot be read.
 */
int fat_clusters_flush(struct super_block *sb)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct fat_boot_fsinfo *fsinfo;
        struct buffer_head *bh;

        if (!is_fat32(sbi))
                return 0;

        bh = sb_bread(sb, sbi->fsinfo_sector);
        if (!bh) {
                fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush");
                return -EIO;
        }

        fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
        if (IS_FSINFO(fsinfo)) {
                /* -1 marks a value that is not known; leave those on disk. */
                if (sbi->free_clusters != -1)
                        fsinfo->free_clusters = cpu_to_le32(sbi->free_clusters);
                if (sbi->prev_free != -1)
                        fsinfo->next_cluster = cpu_to_le32(sbi->prev_free);
                mark_buffer_dirty(bh);
        } else {
                /* Sanity check failed: do not touch the sector. */
                fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: "
                       "0x%08x, 0x%08x (sector = %lu)",
                       le32_to_cpu(fsinfo->signature1),
                       le32_to_cpu(fsinfo->signature2),
                       sbi->fsinfo_sector);
        }
        brelse(bh);

        return 0;
}

/*
 * fat_chain_add() appends the cluster chain starting at @new_dclus to the
 * chain of clusters represented by @inode.
 *
 * @inode:      file or directory being extended
 * @new_dclus:  disk cluster number of the first cluster to link in
 * @nr_cluster: number of clusters being added; only used to grow i_blocks
 *
 * Returns 0 on success or a negative error code from the FAT access or
 * inode sync helpers.
 */
int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
{
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        int ret, new_fclus, last;

        /*
         * We must locate the last cluster of the file to add this new
         * one (new_dclus) to the end of the link list (the FAT).
         */
        last = new_fclus = 0;
        if (MSDOS_I(inode)->i_start) {
                int fclus, dclus;

                /* Walk to the end of the chain: fclus/dclus = last cluster. */
                ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus);
                if (ret < 0)
                        return ret;
                new_fclus = fclus + 1;
                last = dclus;
        }

        /* add new one to the last of the cluster chain */
        if (last) {
                struct fat_entry fatent;

                fatent_init(&fatent);
                ret = fat_ent_read(inode, &fatent, last);
                if (ret >= 0) {
                        int wait = inode_needs_sync(inode);
                        ret = fat_ent_write(inode, &fatent, new_dclus, wait);
                        fatent_brelse(&fatent);
                }
                if (ret < 0)
                        return ret;
                /*
                 * FIXME:Although we can add this cache, fat_cache_add() is
                 * assuming to be called after linear search with fat_cache_id.
                 */
//                fat_cache_add(inode, new_fclus, new_dclus);
        } else {
                /* The file had no clusters yet: the new one becomes its start. */
                MSDOS_I(inode)->i_start = new_dclus;
                MSDOS_I(inode)->i_logstart = new_dclus;
                /*
                 * Since generic_write_sync() synchronizes regular files later,
                 * we sync here only directories.
                 */
                if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
                        ret = fat_sync_inode(inode);
                        if (ret)
                                return ret;
                } else
                        mark_inode_dirty(inode);
        }
        /*
         * Cross-check the file cluster index computed from the chain walk
         * against the one implied by i_blocks; on mismatch report an error
         * and drop the inode's cluster cache.
         * NOTE(review): the "- 9" presumably converts between clusters and
         * 512-byte i_blocks units -- confirm.
         */
        if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
                fat_fs_error(sb, "clusters badly computed (%d != %llu)",
                             new_fclus,
                             (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
                fat_cache_inval_inode(inode);
        }
        /* Account for the added clusters even if the check above failed. */
        inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9);

        return 0;
}

/*
 * FAT timestamps count from 1980 and are packed into two 16-bit words:
 *
 *        bits    field   range
 * date:   0 -  4 day     1 - 31
 * date:   5 -  8 month   1 - 12
 * date:   9 - 15 year    0 - 127 (years since 1980)
 * time:   0 -  4 sec     0 - 29  (units of 2 seconds)
 * time:   5 - 10 min     0 - 59
 * time:  11 - 15 hour    0 - 23
 */
#define SECS_PER_MIN        60
#define SECS_PER_HOUR        (SECS_PER_MIN * 60)
#define SECS_PER_DAY        (24 * SECS_PER_HOUR)
/* Days from 1970-01-01 to 1980-01-01: ten years including two leap days. */
#define DAYS_DELTA        (10 * 365 + 2)
/* FAT year 120 is 2100 (2100 - 1980), which is not a leap year. */
#define YEAR_2100        120
/* @y is a FAT year, i.e. an offset from 1980. */
#define IS_LEAP_YEAR(y)        (((y) & 3) == 0 && (y) != YEAR_2100)

/*
 * Linear day numbers of the respective 1sts in non-leap years, indexed by
 * the 4-bit on-disk month field.  Index 0 and indices 13-15 are not valid
 * months; they map to 0 so that any 4-bit value is a safe index.
 *
 * The table is only ever read, so it is const.
 */
static const long days_in_year[] = {
        /* Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
        0,   0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
};

/*
 * Offset in seconds that fat_time_fat2unix() adds to a FAT (local) time.
 *
 * If the mount options carry an explicit time offset (tz_set), its negation
 * is used; otherwise the system-wide sys_tz.tz_minuteswest applies.  Both
 * are in minutes and are scaled to seconds here.
 */
static inline int fat_tz_offset(const struct msdos_sb_info *sbi)
{
        const int minutes_west = sbi->options.tz_set ?
                                 -sbi->options.time_offset :
                                 sys_tz.tz_minuteswest;

        return minutes_west * SECS_PER_MIN;
}

/*
 * Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70).
 *
 * @sbi:     superblock info, used for the timezone offset
 * @ts:      output timestamp
 * @__time:  little-endian on-disk FAT time word
 * @__date:  little-endian on-disk FAT date word
 * @time_cs: optional extra resolution in units of 10 ms (0 means none)
 *
 * Out-of-range month and day fields of 0 are clamped to 1 rather than
 * rejected.
 */
void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
                       __le16 __time, __le16 __date, u8 time_cs)
{
        u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date);
        time64_t second;
        long day, leap_day, month, year;

        /* Unpack the date word; year is relative to 1980, day is 0-based. */
        year  = date >> 9;
        month = max(1, (date >> 5) & 0xf);
        day   = max(1, date & 0x1f) - 1;

        /* Leap days in the years from 1980 up to, but excluding, this one. */
        leap_day = (year + 3) / 4;
        if (year > YEAR_2100)                /* 2100 isn't leap year */
                leap_day--;
        /* Add this year's own leap day once February is over. */
        if (IS_LEAP_YEAR(year) && month > 2)
                leap_day++;

        /* The seconds field counts in units of two seconds. */
        second =  (time & 0x1f) << 1;
        second += ((time >> 5) & 0x3f) * SECS_PER_MIN;
        second += (time >> 11) * SECS_PER_HOUR;
        /* month is at most 15; days_in_year[] has an entry for each value. */
        second += (time64_t)(year * 365 + leap_day
                   + days_in_year[month] + day
                   + DAYS_DELTA) * SECS_PER_DAY;

        second += fat_tz_offset(sbi);

        if (time_cs) {
                /* time_cs is in 10 ms units: split into seconds and ns. */
                ts->tv_sec = second + (time_cs / 100);
                ts->tv_nsec = (time_cs % 100) * 10000000;
        } else {
                ts->tv_sec = second;
                ts->tv_nsec = 0;
        }
}

/* Export fat_time_fat2unix() for the fat_test KUnit tests. */
EXPORT_SYMBOL_GPL(fat_time_fat2unix);

/*
 * Convert a linear UNIX date to a FAT time/date pair.
 *
 * @sbi:     superblock info, used for the timezone offset
 * @ts:      timestamp to convert
 * @time:    output, little-endian FAT time word
 * @date:    output, little-endian FAT date word
 * @time_cs: optional output (may be NULL), extra resolution in 10 ms units
 *
 * FAT can only represent years 1980 through 2107; timestamps outside that
 * range are clamped to the first or last representable value.
 */
void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
                       __le16 *time, __le16 *date, u8 *time_cs)
{
        struct tm tm;

        time64_to_tm(ts->tv_sec, -fat_tz_offset(sbi), &tm);

        /* Before 1980: clamp to 1980-01-01 00:00:00. */
        if (tm.tm_year < 1980 - 1900) {
                *time = 0;
                *date = cpu_to_le16((0 << 9) | (1 << 5) | 1);
                if (time_cs)
                        *time_cs = 0;
                return;
        }

        /* After 2107: clamp to 2107-12-31 23:59:59.99. */
        if (tm.tm_year > 2107 - 1900) {
                *time = cpu_to_le16((23 << 11) | (59 << 5) | 29);
                *date = cpu_to_le16((127 << 9) | (12 << 5) | 31);
                if (time_cs)
                        *time_cs = 199;
                return;
        }

        /*
         * Field adjustments done while packing:
         *   year:    counted from 1900 -> counted from 1980
         *   month:   0~11 -> 1~12
         *   seconds: 0~59 -> 0~29 (2sec counts)
         */
        *time = cpu_to_le16((tm.tm_hour << 11) | (tm.tm_min << 5) |
                            (tm.tm_sec >> 1));
        *date = cpu_to_le16(((tm.tm_year - 80) << 9) |
                            ((tm.tm_mon + 1) << 5) | tm.tm_mday);

        /* The odd second dropped above is carried in time_cs, if requested. */
        if (time_cs)
                *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000;
}
EXPORT_SYMBOL_GPL(fat_time_unix2fat);

static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts)
{
        return (struct timespec64){ ts.tv_sec & ~1ULL, 0 };
}

/*
 * truncate atime to 24 hour granularity (00:00:00 in local timezone)
 */
struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi,
                                     const struct timespec64 *ts)
{
        /* to localtime */
        time64_t seconds = ts->tv_sec - fat_tz_offset(sbi);
        s32 remainder;

        div_s64_rem(seconds, SECS_PER_DAY, &remainder);
        /* to day boundary, and back to unix time */
        seconds = seconds + fat_tz_offset(sbi) - remainder;

        return (struct timespec64){ seconds, 0 };
}

/*
 * truncate mtime to 2 second granularity
 */
struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi,
                                     const struct timespec64 *ts)
{
        return fat_timespec64_trunc_2secs(*ts);
}

/*
 * truncate the various times with appropriate granularity:
 *   all times in root node are always 0
 */
int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
{
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        struct timespec64 ts;

        if (inode->i_ino == MSDOS_ROOT_INO)
                return 0;

        if (now == NULL) {
                now = &ts;
                ts = current_time(inode);
        }

        if (flags & S_ATIME)
                inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, now));
        /*
         * ctime and mtime share the same on-disk field, and should be
         * identical in memory. all mtime updates will be applied to ctime,
         * but ctime updates are ignored.
         */
        if (flags & S_MTIME)
                inode_set_mtime_to_ts(inode,
                                      inode_set_ctime_to_ts(inode, fat_truncate_mtime(sbi, now)));

        return 0;
}
EXPORT_SYMBOL_GPL(fat_truncate_time);

/*
 * Update the timestamps of @inode selected by @flags and mark it dirty.
 *
 * The root inode is never touched.  When any of S_ATIME, S_CTIME or S_MTIME
 * is requested, the times are refreshed via fat_truncate_time() and the
 * inode is dirtied with I_DIRTY_TIME on lazytime mounts or I_DIRTY_SYNC
 * otherwise.  __mark_inode_dirty() is called in every non-root case, with
 * flags 0 when no time flag was requested.  Always returns 0.
 */
int fat_update_time(struct inode *inode, int flags)
{
        int dirty_flags = 0;

        if (inode->i_ino == MSDOS_ROOT_INO)
                return 0;

        if (flags & (S_ATIME | S_CTIME | S_MTIME)) {
                fat_truncate_time(inode, NULL, flags);
                dirty_flags |= (inode->i_sb->s_flags & SB_LAZYTIME) ?
                               I_DIRTY_TIME : I_DIRTY_SYNC;
        }

        __mark_inode_dirty(inode, dirty_flags);
        return 0;
}
EXPORT_SYMBOL_GPL(fat_update_time);

/*
 * Call write_dirty_buffer() on each of the 'nr_bhs' buffers in 'bhs', then
 * wait on every one of them.
 *
 * Returns 0, or -EIO if at least one buffer is not uptodate after its wait.
 * All buffers are waited on even after an error has been recorded.
 */
int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
{
        int idx;
        int err = 0;

        /* Issue all writes first so the waits below can overlap. */
        for (idx = 0; idx < nr_bhs; idx++)
                write_dirty_buffer(bhs[idx], 0);

        for (idx = 0; idx < nr_bhs; idx++) {
                struct buffer_head *bh = bhs[idx];

                wait_on_buffer(bh);
                if (err)
                        continue;        /* keep the first error */
                if (!buffer_uptodate(bh))
                        err = -EIO;
        }

        return err;
}
















































































































































































































    1 











































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2005,2006,2007,2008 IBM Corporation
 *
 * Authors:
 * Reiner Sailer <sailer@watson.ibm.com>
 * Mimi Zohar <zohar@us.ibm.com>
 *
 * File: ima.h
 *        internal Integrity Measurement Architecture (IMA) definitions
 */

#ifndef __LINUX_IMA_H
#define __LINUX_IMA_H

#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/hash.h>
#include <linux/tpm.h>
#include <linux/audit.h>
#include <crypto/hash_info.h>

#include "../integrity.h"

/* Output formats passed to a template field's field_show() callback. */
enum ima_show_type {
        IMA_SHOW_BINARY,
        IMA_SHOW_BINARY_NO_FIELD_LEN,
        IMA_SHOW_BINARY_OLD_STRING_FMT,
        IMA_SHOW_ASCII
};
/* Named TPM PCR indices. */
enum tpm_pcrs {
        TPM_PCR0 = 0,
        TPM_PCR8 = 8,
        TPM_PCR10 = 10
};

/* digest size for IMA, fits SHA1 or MD5 */
#define IMA_DIGEST_SIZE                SHA1_DIGEST_SIZE
#define IMA_EVENT_NAME_LEN_MAX        255

#define IMA_HASH_BITS 10
#define IMA_MEASURE_HTABLE_SIZE (1 << IMA_HASH_BITS)

#define IMA_TEMPLATE_FIELD_ID_MAX_LEN        16
#define IMA_TEMPLATE_NUM_FIELDS_MAX        15

#define IMA_TEMPLATE_IMA_NAME "ima"
#define IMA_TEMPLATE_IMA_FMT "d|n"

/*
 * Yields (chip)->nr_allocated_banks, or 0 when 'chip' is NULL.
 *
 * The argument is parenthesized so that expressions such as 'p + 1' or a
 * conditional expand correctly.  It is still evaluated twice, so pass an
 * expression without side effects.
 */
#define NR_BANKS(chip) (((chip) != NULL) ? (chip)->nr_allocated_banks : 0)

/* current content of the policy */
extern int ima_policy_flag;

/* bitset of digests algorithms allowed in the setxattr hook */
extern atomic_t ima_setxattr_allowed_hash_algorithms;

/*
 * IMA hash algorithm description: a hash algorithm identifier together
 * with a crypto_shash handle (presumably the transform for that algorithm).
 */
struct ima_algo_desc {
        struct crypto_shash *tfm;
        enum hash_algo algo;
};

/* set during initialization */
extern int ima_hash_algo __ro_after_init;
extern int ima_sha1_idx __ro_after_init;
extern int ima_hash_algo_idx __ro_after_init;
extern int ima_extra_slots __ro_after_init;
extern struct ima_algo_desc *ima_algo_array __ro_after_init;

extern int ima_appraise;
extern struct tpm_chip *ima_tpm_chip;
extern const char boot_aggregate_name[];

/*
 * IMA event related data.
 *
 * This is the input handed to a template field's field_init() callback
 * (see struct ima_template_field).
 */
struct ima_event_data {
        struct ima_iint_cache *iint;
        struct file *file;
        const unsigned char *filename;
        struct evm_ima_xattr_data *xattr_value;
        int xattr_len;        /* presumably the length of xattr_value - confirm */
        const struct modsig *modsig;
        const char *violation;
        const void *buf;
        int buf_len;        /* presumably the length of buf - confirm */
};

/*
 * IMA template field data definition: a byte pointer plus a 32-bit length
 * (presumably the number of bytes at 'data' - confirm).
 */
struct ima_field_data {
        u8 *data;
        u32 len;
};

/*
 * IMA template field definition.
 *
 * field_id:   identifier, stored inline in a buffer of
 *             IMA_TEMPLATE_FIELD_ID_MAX_LEN bytes.
 * field_init: callback taking the event data and a field_data to work on;
 *             returns an int (presumably 0 or a negative errno - confirm).
 * field_show: callback that receives a seq_file, the requested
 *             enum ima_show_type and the field data.
 */
struct ima_template_field {
        const char field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN];
        int (*field_init)(struct ima_event_data *event_data,
                          struct ima_field_data *field_data);
        void (*field_show)(struct seq_file *m, enum ima_show_type show,
                           struct ima_field_data *field_data);
};

/*
 * IMA template descriptor definition.
 *
 * Descriptors can be linked through 'list'.  'fields' is an array of
 * pointers to field definitions; 'num_fields' is presumably its length
 * (both are the outputs of template_desc_init_fields()).
 */
struct ima_template_desc {
        struct list_head list;
        char *name;
        char *fmt;
        int num_fields;
        const struct ima_template_field **fields;
};

/*
 * One measurement entry.  'template_data' is a flexible array member, so
 * the structure must be allocated with room for the field data that
 * follows it.
 */
struct ima_template_entry {
        int pcr;
        struct tpm_digest *digests;
        struct ima_template_desc *template_desc; /* template descriptor */
        u32 template_data_len;
        struct ima_field_data template_data[];        /* template related data */
};

/*
 * Links one ima_template_entry into two containers at once: a hash
 * collision chain (hnext) and the ima_measurements list (later).
 */
struct ima_queue_entry {
        struct hlist_node hnext;        /* place in hash collision list */
        struct list_head later;                /* place in ima_measurements list */
        struct ima_template_entry *entry;
};
extern struct list_head ima_measurements;        /* list of all measurements */

/*
 * Some details preceding the binary serialized measurement list.
 *
 * The two reserved members pad 'version' out to 8 bytes, so both u64
 * members sit at 8-byte-aligned offsets (8 and 16).
 */
struct ima_kexec_hdr {
        u16 version;
        u16 _reserved0;
        u32 _reserved1;
        u64 buffer_size;
        u64 count;
};

/*
 * IMA iint action cache flags
 *
 * The flags come in pairs: an action bit and, one bit position higher, its
 * past-tense counterpart (e.g. IMA_MEASURE 0x1 / IMA_MEASURED 0x2).  Judging
 * by IMA_DO_MASK / IMA_DONE_MASK, the first requests the action and the
 * second records that it was done.
 */
#define IMA_MEASURE                0x00000001
#define IMA_MEASURED                0x00000002
#define IMA_APPRAISE                0x00000004
#define IMA_APPRAISED                0x00000008
/*#define IMA_COLLECT                0x00000010  do not use this flag */
#define IMA_COLLECTED                0x00000020
#define IMA_AUDIT                0x00000040
#define IMA_AUDITED                0x00000080
#define IMA_HASH                0x00000100
#define IMA_HASHED                0x00000200

/*
 * IMA iint policy rule cache flags
 *
 * All of these live in the top byte, which IMA_NONACTION_FLAGS masks as a
 * whole.  Bit 0x08000000 is not assigned here.
 */
#define IMA_NONACTION_FLAGS        0xff000000
#define IMA_DIGSIG_REQUIRED        0x01000000
#define IMA_PERMIT_DIRECTIO        0x02000000
#define IMA_NEW_FILE                0x04000000
#define IMA_FAIL_UNVERIFIABLE_SIGS        0x10000000
#define IMA_MODSIG_ALLOWED        0x20000000
#define IMA_CHECK_BLACKLIST        0x40000000
#define IMA_VERITY_REQUIRED        0x80000000

/*
 * IMA_DO_MASK gathers the action bits, IMA_DONE_MASK their past-tense
 * counterparts plus IMA_COLLECTED.  The *_SUBMASK macros used here are
 * defined further down; that works because macros are expanded where they
 * are used, not where they are defined.
 */
#define IMA_DO_MASK                (IMA_MEASURE | IMA_APPRAISE | IMA_AUDIT | \
                                 IMA_HASH | IMA_APPRAISE_SUBMASK)
#define IMA_DONE_MASK                (IMA_MEASURED | IMA_APPRAISED | IMA_AUDITED | \
                                 IMA_HASHED | IMA_COLLECTED | \
                                 IMA_APPRAISED_SUBMASK)

/*
 * IMA iint subaction appraise cache flags
 *
 * One APPRAISE / APPRAISED bit pair per sub-action (FILE, MMAP, BPRM, READ,
 * CREDS); the *_APPRAISED bit is always the *_APPRAISE bit shifted left by
 * one.  The two submasks below OR together all bits of each kind.
 */
#define IMA_FILE_APPRAISE        0x00001000
#define IMA_FILE_APPRAISED        0x00002000
#define IMA_MMAP_APPRAISE        0x00004000
#define IMA_MMAP_APPRAISED        0x00008000
#define IMA_BPRM_APPRAISE        0x00010000
#define IMA_BPRM_APPRAISED        0x00020000
#define IMA_READ_APPRAISE        0x00040000
#define IMA_READ_APPRAISED        0x00080000
#define IMA_CREDS_APPRAISE        0x00100000
#define IMA_CREDS_APPRAISED        0x00200000
#define IMA_APPRAISE_SUBMASK        (IMA_FILE_APPRAISE | IMA_MMAP_APPRAISE | \
                                 IMA_BPRM_APPRAISE | IMA_READ_APPRAISE | \
                                 IMA_CREDS_APPRAISE)
#define IMA_APPRAISED_SUBMASK        (IMA_FILE_APPRAISED | IMA_MMAP_APPRAISED | \
                                 IMA_BPRM_APPRAISED | IMA_READ_APPRAISED | \
                                 IMA_CREDS_APPRAISED)

/*
 * IMA iint cache atomic_flags
 *
 * NOTE(review): unlike the masks above these are consecutive small
 * integers, so they look like bit numbers within
 * ima_iint_cache.atomic_flags rather than bit masks - confirm against the
 * users.
 */
#define IMA_CHANGE_XATTR        0
#define IMA_UPDATE_XATTR        1
#define IMA_CHANGE_ATTR                2
#define IMA_DIGSIG                3
#define IMA_MUST_MEASURE        4

/*
 * IMA integrity metadata associated with an inode.
 *
 * The five ima_*_status members are 4-bit bitfields, one per appraise
 * sub-action named in the IMA_*_APPRAISE flags (file, mmap, bprm, read,
 * creds).
 */
struct ima_iint_cache {
        struct mutex mutex;        /* protects: version, flags, digest */
        struct integrity_inode_attributes real_inode;
        unsigned long flags;        /* presumably the IMA_* cache flag masks */
        unsigned long measured_pcrs;
        unsigned long atomic_flags;        /* see "IMA iint cache atomic_flags" */
        enum integrity_status ima_file_status:4;
        enum integrity_status ima_mmap_status:4;
        enum integrity_status ima_bprm_status:4;
        enum integrity_status ima_read_status:4;
        enum integrity_status ima_creds_status:4;
        struct ima_digest_data *ima_hash;
};

extern struct lsm_blob_sizes ima_blob_sizes;

/*
 * Read the ima_iint_cache pointer kept in the inode's security blob at
 * offset ima_blob_sizes.lbs_inode.
 *
 * Returns NULL when the inode has no security blob, otherwise whatever
 * pointer the slot currently holds.
 */
static inline struct ima_iint_cache *
ima_inode_get_iint(const struct inode *inode)
{
        void *blob = inode->i_security;

        if (unlikely(!blob))
                return NULL;

        return *(struct ima_iint_cache **)(blob + ima_blob_sizes.lbs_inode);
}

/*
 * Store 'iint' in the inode's security blob at offset
 * ima_blob_sizes.lbs_inode.  Silently does nothing when the inode has no
 * security blob.
 */
static inline void ima_inode_set_iint(const struct inode *inode,
                                      struct ima_iint_cache *iint)
{
        void *blob = inode->i_security;
        struct ima_iint_cache **slot;

        if (unlikely(!blob))
                return;

        slot = blob + ima_blob_sizes.lbs_inode;
        *slot = iint;
}

struct ima_iint_cache *ima_iint_find(struct inode *inode);
struct ima_iint_cache *ima_inode_get(struct inode *inode);
void ima_inode_free(struct inode *inode);
void __init ima_iintcache_init(void);

extern const int read_idmap[];

/*
 * Without CONFIG_HAVE_IMA_KEXEC, ima_load_kexec_buffer() is an empty
 * inline, so callers need no #ifdef of their own.
 */
#ifdef CONFIG_HAVE_IMA_KEXEC
void ima_load_kexec_buffer(void);
#else
static inline void ima_load_kexec_buffer(void) {}
#endif /* CONFIG_HAVE_IMA_KEXEC */

#ifdef CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS
void ima_post_key_create_or_update(struct key *keyring, struct key *key,
                                   const void *payload, size_t plen,
                                   unsigned long flags, bool create);
#endif

/*
 * The default binary_runtime_measurements list format is defined as the
 * platform native format.  The canonical format is defined as little-endian.
 */
extern bool ima_canonical_fmt;

/* Internal IMA function definitions */
int ima_init(void);
int ima_fs_init(void);
int ima_add_template_entry(struct ima_template_entry *entry, int violation,
                           const char *op, struct inode *inode,
                           const unsigned char *filename);
int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash);
int ima_calc_buffer_hash(const void *buf, loff_t len,
                         struct ima_digest_data *hash);
int ima_calc_field_array_hash(struct ima_field_data *field_data,
                              struct ima_template_entry *entry);
int ima_calc_boot_aggregate(struct ima_digest_data *hash);
void ima_add_violation(struct file *file, const unsigned char *filename,
                       struct ima_iint_cache *iint, const char *op,
                       const char *cause);
int ima_init_crypto(void);
void ima_putc(struct seq_file *m, void *data, int datalen);
void ima_print_digest(struct seq_file *m, u8 *digest, u32 size);
int template_desc_init_fields(const char *template_fmt,
                              const struct ima_template_field ***fields,
                              int *num_fields);
struct ima_template_desc *ima_template_desc_current(void);
struct ima_template_desc *ima_template_desc_buf(void);
struct ima_template_desc *lookup_template_desc(const char *name);
bool ima_template_has_modsig(const struct ima_template_desc *ima_template);
int ima_restore_measurement_entry(struct ima_template_entry *entry);
int ima_restore_measurement_list(loff_t bufsize, void *buf);
int ima_measurements_show(struct seq_file *m, void *v);
unsigned long ima_get_binary_runtime_size(void);
int ima_init_template(void);
void ima_init_template_list(void);
int __init ima_init_digests(void);
int ima_lsm_policy_change(struct notifier_block *nb, unsigned long event,
                          void *lsm_data);

/*
 * used to protect h_table and sha_table
 */
extern spinlock_t ima_queue_lock;

/*
 * Measurement hash table: two atomic counters and
 * IMA_MEASURE_HTABLE_SIZE hash list heads.  ima_hash_key() returns a value
 * below IMA_MEASURE_HTABLE_SIZE, i.e. a valid index into queue[].
 */
struct ima_h_table {
        atomic_long_t len;        /* number of stored measurements in the list */
        atomic_long_t violations;
        struct hlist_head queue[IMA_MEASURE_HTABLE_SIZE];
};
extern struct ima_h_table ima_htable;

/*
 * Derive a hash table index from a digest: the first two digest bytes are
 * combined little-endian into a 16-bit value, which is then reduced modulo
 * IMA_MEASURE_HTABLE_SIZE.  'digest' must point to at least two bytes.
 */
static inline unsigned int ima_hash_key(u8 *digest)
{
        /* there is no point in taking a hash of part of a digest */
        unsigned int low16 = digest[0] | (digest[1] << 8);

        return low16 % IMA_MEASURE_HTABLE_SIZE;
}

/*
 * X-macro listing every IMA hook as an (ENUM, tag) pair.  Expanding it with
 * different 'hook' macros generates parallel tables that stay in sync:
 * below it produces enum ima_hooks and the "measuring_<tag>" string table.
 * MAX_CHECK is the terminating entry and reuses the 'none' tag.
 */
#define __ima_hooks(hook) \
        hook(NONE, none) \
        hook(FILE_CHECK, file) \
        hook(MMAP_CHECK, mmap) \
        hook(MMAP_CHECK_REQPROT, mmap_reqprot) \
        hook(BPRM_CHECK, bprm) \
        hook(CREDS_CHECK, creds) \
        hook(POST_SETATTR, post_setattr) \
        hook(MODULE_CHECK, module) \
        hook(FIRMWARE_CHECK, firmware) \
        hook(KEXEC_KERNEL_CHECK, kexec_kernel) \
        hook(KEXEC_INITRAMFS_CHECK, kexec_initramfs) \
        hook(POLICY_CHECK, policy) \
        hook(KEXEC_CMDLINE, kexec_cmdline) \
        hook(KEY_CHECK, key) \
        hook(CRITICAL_DATA, critical_data) \
        hook(SETXATTR_CHECK, setxattr_check) \
        hook(MAX_CHECK, none)

/* Expanders: enum constant, plain stringification, "measuring_<tag>". */
#define __ima_hook_enumify(ENUM, str) ENUM,
#define __ima_stringify(arg) (#arg)
#define __ima_hook_measuring_stringify(ENUM, str) \
        (__ima_stringify(measuring_ ##str)),

/* One constant per hook, numbered from NONE == 0 in list order. */
enum ima_hooks {
        __ima_hooks(__ima_hook_enumify)
};

/* "measuring_<tag>" strings, indexed by enum ima_hooks. */
static const char * const ima_hooks_measure_str[] = {
        __ima_hooks(__ima_hook_measuring_stringify)
};

/*
 * Map a hook to its "measuring_<tag>" string.  Values at or above
 * MAX_CHECK fall back to the string for NONE.
 */
static inline const char *func_measure_str(enum ima_hooks func)
{
        enum ima_hooks idx = (func < MAX_CHECK) ? func : NONE;

        return ima_hooks_measure_str[idx];
}

extern const char *const func_tokens[];

struct modsig;

#ifdef CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS
/*
 * To track keys that need to be measured.
 *
 * Each entry is a list node carrying a payload pointer with its length and
 * a keyring name.
 */
struct ima_key_entry {
        struct list_head list;
        void *payload;
        size_t payload_len;
        char *keyring_name;
};
void ima_init_key_queue(void);
bool ima_should_queue_key(void);
bool ima_queue_key(struct key *keyring, const void *payload,
                   size_t payload_len);
void ima_process_queued_keys(void);
#else
/*
 * Stubs for !CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS: nothing is ever queued, so
 * the predicates return false and the other calls are empty.
 */
static inline void ima_init_key_queue(void) {}
static inline bool ima_should_queue_key(void) { return false; }
static inline bool ima_queue_key(struct key *keyring,
                                 const void *payload,
                                 size_t payload_len) { return false; }
static inline void ima_process_queued_keys(void) {}
#endif /* CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS */

/* LIM API function definitions */
int ima_get_action(struct mnt_idmap *idmap, struct inode *inode,
                   const struct cred *cred, u32 secid, int mask,
                   enum ima_hooks func, int *pcr,
                   struct ima_template_desc **template_desc,
                   const char *func_data, unsigned int *allowed_algos);
int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func);
int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file,
                            void *buf, loff_t size, enum hash_algo algo,
                            struct modsig *modsig);
void ima_store_measurement(struct ima_iint_cache *iint, struct file *file,
                           const unsigned char *filename,
                           struct evm_ima_xattr_data *xattr_value,
                           int xattr_len, const struct modsig *modsig, int pcr,
                           struct ima_template_desc *template_desc);
int process_buffer_measurement(struct mnt_idmap *idmap,
                               struct inode *inode, const void *buf, int size,
                               const char *eventname, enum ima_hooks func,
                               int pcr, const char *func_data,
                               bool buf_hash, u8 *digest, size_t digest_len);
void ima_audit_measurement(struct ima_iint_cache *iint,
                           const unsigned char *filename);
int ima_alloc_init_template(struct ima_event_data *event_data,
                            struct ima_template_entry **entry,
                            struct ima_template_desc *template_desc);
int ima_store_template(struct ima_template_entry *entry, int violation,
                       struct inode *inode,
                       const unsigned char *filename, int pcr);
void ima_free_template_entry(struct ima_template_entry *entry);
const char *ima_d_path(const struct path *path, char **pathbuf, char *filename);

/* IMA policy related functions */
int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode,
                     const struct cred *cred, u32 secid, enum ima_hooks func,
                     int mask, int flags, int *pcr,
                     struct ima_template_desc **template_desc,
                     const char *func_data, unsigned int *allowed_algos);
void ima_init_policy(void);
void ima_update_policy(void);
void ima_update_policy_flags(void);
ssize_t ima_parse_add_rule(char *);
void ima_delete_rules(void);
int ima_check_policy(void);
void *ima_policy_start(struct seq_file *m, loff_t *pos);
void *ima_policy_next(struct seq_file *m, void *v, loff_t *pos);
void ima_policy_stop(struct seq_file *m, void *v);
int ima_policy_show(struct seq_file *m, void *v);

/* Appraise integrity measurements */
#define IMA_APPRAISE_ENFORCE        0x01
#define IMA_APPRAISE_FIX        0x02
#define IMA_APPRAISE_LOG        0x04
#define IMA_APPRAISE_MODULES        0x08
#define IMA_APPRAISE_FIRMWARE        0x10
#define IMA_APPRAISE_POLICY        0x20
#define IMA_APPRAISE_KEXEC        0x40

#ifdef CONFIG_IMA_APPRAISE
int ima_check_blacklist(struct ima_iint_cache *iint,
                        const struct modsig *modsig, int pcr);
int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint,
                             struct file *file, const unsigned char *filename,
                             struct evm_ima_xattr_data *xattr_value,
                             int xattr_len, const struct modsig *modsig);
int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode,
                      int mask, enum ima_hooks func);
void ima_update_xattr(struct ima_iint_cache *iint, struct file *file);
enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint,
                                           enum ima_hooks func);
enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value,
                                 int xattr_len);
int ima_read_xattr(struct dentry *dentry,
                   struct evm_ima_xattr_data **xattr_value, int xattr_len);
void __init init_ima_appraise_lsm(const struct lsm_id *lsmid);

#else
/* !CONFIG_IMA_APPRAISE stub: ignores its arguments and returns 0. */
static inline int ima_check_blacklist(struct ima_iint_cache *iint,
                                      const struct modsig *modsig, int pcr)
{
        return 0;
}

/* !CONFIG_IMA_APPRAISE stub: always returns INTEGRITY_UNKNOWN. */
static inline int ima_appraise_measurement(enum ima_hooks func,
                                           struct ima_iint_cache *iint,
                                           struct file *file,
                                           const unsigned char *filename,
                                           struct evm_ima_xattr_data *xattr_value,
                                           int xattr_len,
                                           const struct modsig *modsig)
{
        return INTEGRITY_UNKNOWN;
}

/* !CONFIG_IMA_APPRAISE stub: always returns 0. */
static inline int ima_must_appraise(struct mnt_idmap *idmap,
                                    struct inode *inode, int mask,
                                    enum ima_hooks func)
{
        return 0;
}

/* !CONFIG_IMA_APPRAISE stub: does nothing. */
static inline void ima_update_xattr(struct ima_iint_cache *iint,
                                    struct file *file)
{
}

/* !CONFIG_IMA_APPRAISE stub: always returns INTEGRITY_UNKNOWN. */
static inline enum integrity_status
ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func)
{
        return INTEGRITY_UNKNOWN;
}

/*
 * !CONFIG_IMA_APPRAISE stub: the xattr is not inspected and the global
 * ima_hash_algo is returned.
 *
 * 'xattr_value' is const-qualified to match the CONFIG_IMA_APPRAISE
 * prototype, so a caller holding a const pointer builds in both
 * configurations.
 */
static inline enum hash_algo
ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len)
{
        return ima_hash_algo;
}

/*
 * !CONFIG_IMA_APPRAISE stub: returns 0 and leaves *xattr_value untouched.
 */
static inline int ima_read_xattr(struct dentry *dentry,
                                 struct evm_ima_xattr_data **xattr_value,
                                 int xattr_len)
{
        return 0;
}

/* !CONFIG_IMA_APPRAISE stub: does nothing. */
static inline void __init init_ima_appraise_lsm(const struct lsm_id *lsmid)
{
}

#endif /* CONFIG_IMA_APPRAISE */

#ifdef CONFIG_IMA_APPRAISE_MODSIG
int ima_read_modsig(enum ima_hooks func, const void *buf, loff_t buf_len,
                    struct modsig **modsig);
void ima_collect_modsig(struct modsig *modsig, const void *buf, loff_t size);
int ima_get_modsig_digest(const struct modsig *modsig, enum hash_algo *algo,
                          const u8 **digest, u32 *digest_size);
int ima_get_raw_modsig(const struct modsig *modsig, const void **data,
                       u32 *data_len);
void ima_free_modsig(struct modsig *modsig);
#else
/*
 * Stubs for !CONFIG_IMA_APPRAISE_MODSIG.  The int-returning ones fail with
 * -EOPNOTSUPP and do not write to their output parameters; the void ones
 * do nothing.
 */
static inline int ima_read_modsig(enum ima_hooks func, const void *buf,
                                  loff_t buf_len, struct modsig **modsig)
{
        return -EOPNOTSUPP;
}

static inline void ima_collect_modsig(struct modsig *modsig, const void *buf,
                                      loff_t size)
{
}

static inline int ima_get_modsig_digest(const struct modsig *modsig,
                                        enum hash_algo *algo, const u8 **digest,
                                        u32 *digest_size)
{
        return -EOPNOTSUPP;
}

static inline int ima_get_raw_modsig(const struct modsig *modsig,
                                     const void **data, u32 *data_len)
{
        return -EOPNOTSUPP;
}

static inline void ima_free_modsig(struct modsig *modsig)
{
}
#endif /* CONFIG_IMA_APPRAISE_MODSIG */

/* LSM based policy rules require audit */
#ifdef CONFIG_IMA_LSM_RULES

#define ima_filter_rule_init security_audit_rule_init
#define ima_filter_rule_free security_audit_rule_free
#define ima_filter_rule_match security_audit_rule_match

#else

/*
 * Stubs for !CONFIG_IMA_LSM_RULES, replacing the security_audit_rule_*
 * aliases used otherwise: init and match fail with -EINVAL (init does not
 * write *lsmrule), free does nothing.
 */
static inline int ima_filter_rule_init(u32 field, u32 op, char *rulestr,
                                       void **lsmrule, gfp_t gfp)
{
        return -EINVAL;
}

static inline void ima_filter_rule_free(void *lsmrule)
{
}

static inline int ima_filter_rule_match(u32 secid, u32 field, u32 op,
                                        void *lsmrule)
{
        return -EINVAL;
}
#endif /* CONFIG_IMA_LSM_RULES */

/*
 * POLICY_FILE_FLAGS is owner-write only, plus owner-read when
 * CONFIG_IMA_READ_POLICY is set.
 * NOTE(review): presumably the mode of the policy file - confirm at the
 * point of use.
 */
#ifdef        CONFIG_IMA_READ_POLICY
#define        POLICY_FILE_FLAGS        (S_IWUSR | S_IRUSR)
#else
#define        POLICY_FILE_FLAGS        S_IWUSR
#endif /* CONFIG_IMA_READ_POLICY */

#endif /* __LINUX_IMA_H */

















































































































    4 












    4 
    4 

    4 































    4 
    4 


























    4 



















    4 

















    3 



    4 










    4 


    4 













































































































































































































    4 





















































    3 
    4 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
// SPDX-License-Identifier: GPL-2.0
/*
 * FPU signal frame handling routines.
 */

#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/pagemap.h>

#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/xstate.h>

#include <asm/sigframe.h>
#include <asm/trapnr.h>
#include <asm/trace/fpu.h>

#include "context.h"
#include "internal.h"
#include "legacy.h"
#include "xstate.h"

/*
 * Check for the presence of extended state information in the user
 * fpstate pointer in the sigcontext.
 *
 * The software-reserved bytes of the user fxsave image are copied into
 * *fx_sw and validated.  Returns false only when reading from the user
 * buffer fails.  Otherwise returns true; if the frame does not carry a
 * valid extended layout, *fx_sw is rewritten to describe an fx-only
 * (FP/SSE) state.
 */
static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
                                            struct _fpx_sw_bytes *fx_sw)
{
        int min_xstate_size = sizeof(struct fxregs_state) +
                              sizeof(struct xstate_header);
        void __user *fpstate = fxbuf;
        unsigned int magic2;
        bool hdr_ok;

        if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw)))
                return false;

        /* First magic field must match and the sizes must be consistent. */
        hdr_ok = fx_sw->magic1 == FP_XSTATE_MAGIC1 &&
                 fx_sw->xstate_size >= min_xstate_size &&
                 fx_sw->xstate_size <= current->thread.fpu.fpstate->user_size &&
                 fx_sw->xstate_size <= fx_sw->extended_size;

        if (hdr_ok) {
                /*
                 * Look for the second magic word at the end of the memory
                 * layout.  Its absence means the user copied only the
                 * legacy fpstate layout without copying the extended state
                 * information.
                 */
                if (__get_user(magic2,
                               (__u32 __user *)(fpstate + fx_sw->xstate_size)))
                        return false;

                if (likely(magic2 == FP_XSTATE_MAGIC2))
                        return true;
        }

        trace_x86_fpu_xstate_check_failed(&current->thread.fpu);

        /* Fall back to the parameters for fx only state */
        fx_sw->magic1 = 0;
        fx_sw->xstate_size = sizeof(struct fxregs_state);
        fx_sw->xfeatures = XFEATURE_MASK_FPSSE;
        return true;
}

/*
 * Signal frame handlers.
 */
/*
 * Fill in the fsave header at the user address 'buf'.
 *
 * Without FXSR only the status word is mirrored: it is read back from the
 * user frame's swd field and written to its status field.  With FXSR the
 * task's fxsave image is refreshed from the registers (unless
 * TIF_NEED_FPU_LOAD is set), converted with convert_from_fxsr() and copied
 * out, followed by the status word and X86_FXSR_MAGIC.
 *
 * Returns true on success, false if any user access fails.
 */
static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf)
{
        struct user_i387_ia32_struct env;
        struct _fpstate_32 __user *fp32;
        struct xregs_state *xsave;

        if (!use_fxsr()) {
                struct fregs_state __user *fregs = buf;
                u32 swd;

                if (__get_user(swd, &fregs->swd))
                        return false;
                if (__put_user(swd, &fregs->status))
                        return false;
                return true;
        }

        xsave = &tsk->thread.fpu.fpstate->regs.xsave;
        fp32 = buf;

        fpregs_lock();
        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                fxsave(&tsk->thread.fpu.fpstate->regs.fxsave);
        fpregs_unlock();

        convert_from_fxsr(&env, tsk);

        if (__copy_to_user(buf, &env, sizeof(env)))
                return false;
        if (__put_user(xsave->i387.swd, &fp32->status))
                return false;
        if (__put_user(X86_FXSR_MAGIC, &fp32->magic))
                return false;

        return true;
}

/*
 * Prepare the SW reserved portion of the fxsave memory layout, which
 * indicates the presence of the extended state information in the memory
 * layout pointed to by the fpstate pointer in the sigcontext.  This is
 * saved whenever the FP and extended state context is saved on the user
 * stack during signal delivery.
 *
 * extended_size is user_size plus the trailing magic word, plus the size
 * of an fregs_state when this is an ia32 frame.
 */
static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame,
                                 struct fpstate *fpstate)
{
        u32 total = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE;

        if (ia32_frame)
                total += sizeof(struct fregs_state);

        sw_bytes->magic1 = FP_XSTATE_MAGIC1;
        sw_bytes->extended_size = total;
        sw_bytes->xfeatures = fpstate->user_xfeatures;
        sw_bytes->xstate_size = fpstate->user_size;
}

/*
 * Complete the state image at the user address 'buf': write the SW
 * reserved bytes and, when XSAVE is in use, the trailing FP_XSTATE_MAGIC2
 * word and an xfeatures header value with the FP/SSE bits forced on.
 *
 * Every user access is attempted even if an earlier one failed; the
 * results are OR-ed together.  Returns true if all of them succeeded.
 */
static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
                                      struct fpstate *fpstate)
{
        struct xregs_state __user *frame = buf;
        __u32 __user *hdr_xfeatures = (__u32 __user *)&frame->header.xfeatures;
        __u32 __user *magic2_slot = (__u32 __user *)(buf + fpstate->user_size);
        struct _fpx_sw_bytes sw_bytes = {};
        u32 xfeatures;
        int err;

        /* Setup the bytes not touched by the [f]xsave and reserved for SW. */
        save_sw_bytes(&sw_bytes, ia32_frame, fpstate);
        err = __copy_to_user(&frame->i387.sw_reserved, &sw_bytes,
                             sizeof(sw_bytes));

        if (!use_xsave())
                return !err;

        err |= __put_user(FP_XSTATE_MAGIC2, magic2_slot);

        /*
         * Read back the xfeatures which were copied (directly from the cpu
         * or from the state in task struct) to the user buffer.
         */
        err |= __get_user(xfeatures, hdr_xfeatures);

        /*
         * For legacy compatibility the FP/SSE bits are always set in the
         * bit vector saved to the user context.  That way any change made
         * to the FP/SSE state by a legacy application, which does not touch
         * xfeatures in the xsave header, is picked up during sigreturn.
         *
         * xsave aware apps can change the xfeatures in the xsave header as
         * well as any contents in the memory layout; xrestore as part of
         * sigreturn will capture all the changes.
         */
        xfeatures |= XFEATURE_MASK_FPSSE;

        err |= __put_user(xfeatures, hdr_xfeatures);

        return !err;
}

/*
 * Save the FPU registers to the user signal frame at @buf, using the
 * save helper that matches the enabled CPU feature: xsave, then fxsave,
 * then fnsave as the fallback.
 *
 * Returns whatever the selected helper returns.
 */
static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
{
        int ret;

        if (use_xsave())
                ret = xsave_to_user_sigframe(buf);
        else if (use_fxsr())
                ret = fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
        else
                ret = fnsave_to_user_sigframe((struct fregs_state __user *) buf);

        return ret;
}

/*
 * Save the fpu, extended register state to the user signal frame.
 *
 * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
 *  state is copied.
 *  'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
 *
 *        buf == buf_fx for 64-bit frames and 32-bit fsave frame.
 *        buf != buf_fx for 32-bit frames with fxstate.
 *
 * 'size' is the number of bytes starting at 'buf' which is validated with
 * access_ok() before anything is written.
 *
 * Save it directly to the user frame with disabled page fault handler. If
 * that faults, try to clear the frame which handles the page fault.
 *
 * If this is a 32-bit frame with fxstate, put a fsave header before
 * the aligned state at 'buf_fx'.
 *
 * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
 * indicating the absence/presence of the extended state to the user.
 *
 * Returns true on success and false if any of the user space accesses
 * failed.
 */
bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
{
        struct task_struct *tsk = current;
        struct fpstate *fpstate = tsk->thread.fpu.fpstate;
        bool ia32_fxstate = (buf != buf_fx);
        int ret;

        /*
         * A separate fsave header is only considered when 32-bit frames
         * can exist in this kernel configuration.
         */
        ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
                         IS_ENABLED(CONFIG_IA32_EMULATION));

        /*
         * No FPU feature: fetch the state via fpregs_soft_get() into a
         * local buffer and copy that to 'buf'.
         */
        if (!static_cpu_has(X86_FEATURE_FPU)) {
                struct user_i387_ia32_struct fp;

                fpregs_soft_get(current, NULL, (struct membuf){.p = &fp,
                                                .left = sizeof(fp)});
                return !copy_to_user(buf, &fp, sizeof(fp));
        }

        if (!access_ok(buf, size))
                return false;

        if (use_xsave()) {
                struct xregs_state __user *xbuf = buf_fx;

                /*
                 * Clear the xsave header first, so that reserved fields are
                 * initialized to zero.
                 */
                if (__clear_user(&xbuf->header, sizeof(xbuf->header)))
                        return false;
        }
retry:
        /*
         * Load the FPU registers if they are not valid for the current task.
         * With a valid FPU state we can attempt to save the state directly to
         * userland's stack frame which will likely succeed. If it does not,
         * resolve the fault in the user memory and try again.
         */
        fpregs_lock();
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
                fpregs_restore_userregs();

        pagefault_disable();
        ret = copy_fpregs_to_sigframe(buf_fx);
        pagefault_enable();
        fpregs_unlock();

        /*
         * The direct save failed. Clear the target area with page faults
         * enabled again; if that succeeds retry the save, otherwise give up.
         */
        if (ret) {
                if (!__clear_user(buf_fx, fpstate->user_size))
                        goto retry;
                return false;
        }

        /* Save the fsave header for the 32-bit frames. */
        if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf))
                return false;

        /* [f]xsave frames additionally get the SW reserved bytes filled in. */
        if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate))
                return false;

        return true;
}

/*
 * Restore the FPU registers from the user buffer @buf with the restore
 * helper matching the enabled CPU feature.
 *
 * @ufeatures: user features enabled for the task
 * @xrestore:  features to restore from @buf (XSAVE case only)
 * @fx_only:   with XSAVE enabled, restore via fxrstor instead of xrstor
 *
 * In the XSAVE case, features which are set in @ufeatures but not in
 * @xrestore are loaded from init_fpstate after a successful restore.
 *
 * Returns the result of the restore helper; 0 means success.
 */
static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures,
                                      u64 xrestore, bool fx_only)
{
        u64 init_bv;
        int ret;

        /* Legacy paths: nothing but the plain restore is needed. */
        if (!use_xsave()) {
                if (use_fxsr())
                        return fxrstor_from_user_sigframe(buf);
                return frstor_from_user_sigframe(buf);
        }

        if (unlikely(fx_only))
                ret = fxrstor_from_user_sigframe(buf);
        else
                ret = xrstor_from_user_sigframe(buf, xrestore);
        if (ret)
                return ret;

        /* Bring the enabled but not restored features into init state. */
        init_bv = ufeatures & ~xrestore;
        if (unlikely(init_bv))
                os_xrstor(&init_fpstate, init_bv);

        return 0;
}

/*
 * Attempt to restore the FPU registers directly from user memory.
 * Pagefaults are handled and any errors returned are fatal.
 *
 * @buf:      user buffer holding the state to restore
 * @xrestore: features requested for restore; masked below with the
 *            features enabled in current's fpstate
 * @fx_only:  passed through to __restore_fpregs_from_user()
 *
 * Returns true if the registers were restored, false otherwise.
 */
static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only)
{
        struct fpu *fpu = &current->thread.fpu;
        int ret;

        /* Restore enabled features only. */
        xrestore &= fpu->fpstate->user_xfeatures;
retry:
        fpregs_lock();
        /* Ensure that XFD is up to date */
        xfd_update_state(fpu->fpstate);
        /* The restore itself runs with page faults disabled. */
        pagefault_disable();
        ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures,
                                         xrestore, fx_only);
        pagefault_enable();

        if (unlikely(ret)) {
                /*
                 * The above did an FPU restore operation, restricted to
                 * the user portion of the registers, and failed, but the
                 * microcode might have modified the FPU registers
                 * nevertheless.
                 *
                 * If the FPU registers do not belong to current, then
                 * invalidate the FPU register state otherwise the task
                 * might preempt current and return to user space with
                 * corrupted FPU registers.
                 */
                if (test_thread_flag(TIF_NEED_FPU_LOAD))
                        __cpu_invalidate_fpregs_state();
                fpregs_unlock();

                /* Try to handle #PF, but anything else is fatal. */
                if (ret != X86_TRAP_PF)
                        return false;

                /*
                 * A zero return from fault_in_readable() leads to another
                 * attempt; any other return fails the restore.
                 */
                if (!fault_in_readable(buf, fpu->fpstate->user_size))
                        goto retry;
                return false;
        }

        /*
         * Restore supervisor states: previous context switch etc has done
         * XSAVES and saved the supervisor states in the kernel buffer from
         * which they can be restored now.
         *
         * It would be optimal to handle this with a single XRSTORS, but
         * this does not work because the rest of the FPU registers have
         * been restored from a user buffer directly.
         */
        if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor())
                os_xrstor_supervisor(fpu->fpstate);

        fpregs_mark_activate();
        fpregs_unlock();
        return true;
}

/*
 * Restore current's FPU state from a signal frame.
 *
 * @buf:          start of the frame in user memory
 * @buf_fx:       location of the [f|fx|x]save state within the frame
 * @ia32_fxstate: true when the frame has a legacy header at @buf in front
 *                of the state at @buf_fx
 *
 * Without @ia32_fxstate the registers are restored directly from user
 * memory via restore_fpregs_from_user(). Otherwise the legacy header and
 * the state at @buf_fx are copied into the kernel, merged with
 * convert_to_fxsr() and then restored from current's fpstate.
 *
 * Returns true on success, false on any failure.
 */
static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
                              bool ia32_fxstate)
{
        struct task_struct *tsk = current;
        struct fpu *fpu = &tsk->thread.fpu;
        struct user_i387_ia32_struct env;
        bool success, fx_only = false;
        union fpregs_state *fpregs;
        u64 user_xfeatures = 0;

        if (use_xsave()) {
                struct _fpx_sw_bytes fx_sw_user;

                if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user))
                        return false;

                /* A zero magic1 in the SW bytes selects the FX-only handling. */
                fx_only = !fx_sw_user.magic1;
                user_xfeatures = fx_sw_user.xfeatures;
        } else {
                user_xfeatures = XFEATURE_MASK_FPSSE;
        }

        if (likely(!ia32_fxstate)) {
                /* Restore the FPU registers directly from user memory. */
                return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only);
        }

        /*
         * Copy the legacy state because the FP portion of the FX frame has
         * to be ignored for histerical raisins. The legacy state is folded
         * in once the larger state has been copied.
         */
        if (__copy_from_user(&env, buf, sizeof(env)))
                return false;

        /*
         * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is
         * not modified on context switch and that the xstate is considered
         * to be loaded again on return to userland (overriding last_cpu avoids
         * the optimisation).
         */
        fpregs_lock();
        if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
                /*
                 * If supervisor states are available then save the
                 * hardware state in current's fpstate so that the
                 * supervisor state is preserved. Save the full state for
                 * simplicity. There is no point in optimizing this by only
                 * saving the supervisor states and then shuffle them to
                 * the right place in memory. It's ia32 mode. Shrug.
                 */
                if (xfeatures_mask_supervisor())
                        os_xsave(fpu->fpstate);
                set_thread_flag(TIF_NEED_FPU_LOAD);
        }
        __fpu_invalidate_fpregs_state(fpu);
        __cpu_invalidate_fpregs_state();
        fpregs_unlock();

        /* Copy the user supplied state into current's fpstate buffer. */
        fpregs = &fpu->fpstate->regs;
        if (use_xsave() && !fx_only) {
                if (copy_sigframe_from_user_to_xstate(tsk, buf_fx))
                        return false;
        } else {
                if (__copy_from_user(&fpregs->fxsave, buf_fx,
                                     sizeof(fpregs->fxsave)))
                        return false;

                if (IS_ENABLED(CONFIG_X86_64)) {
                        /* Reject invalid MXCSR values. */
                        if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask)
                                return false;
                } else {
                        /* Mask invalid bits out for historical reasons (broken hardware). */
                        fpregs->fxsave.mxcsr &= mxcsr_feature_mask;
                }

                /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */
                if (use_xsave())
                        fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
        }

        /* Fold the legacy FP storage */
        convert_to_fxsr(&fpregs->fxsave, &env);

        /* Load the assembled state from the kernel buffer into the registers. */
        fpregs_lock();
        if (use_xsave()) {
                /*
                 * Remove all UABI feature bits not set in user_xfeatures
                 * from the memory xstate header which makes the full
                 * restore below bring them into init state. This works for
                 * fx_only mode as well because that has only FP and SSE
                 * set in user_xfeatures.
                 *
                 * Preserve supervisor states!
                 */
                u64 mask = user_xfeatures | xfeatures_mask_supervisor();

                fpregs->xsave.header.xfeatures &= mask;
                success = !os_xrstor_safe(fpu->fpstate,
                                          fpu_kernel_cfg.max_features);
        } else {
                success = !fxrstor_safe(&fpregs->fxsave);
        }

        /* Only a successful restore marks the registers as active. */
        if (likely(success))
                fpregs_mark_activate();

        fpregs_unlock();
        return success;
}

/*
 * Size of the FPU state area in a signal frame for @fpstate: the user
 * state size, plus room for the trailing FP_XSTATE_MAGIC2 when XSAVE is
 * in use.
 */
static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate)
{
        unsigned int size = fpstate->user_size;

        if (use_xsave())
                size += FP_XSTATE_MAGIC2_SIZE;

        return size;
}

/*
 * Restore FPU state from a sigframe.
 *
 * @buf:        user pointer to the frame; NULL resets the user states
 * @ia32_frame: non-zero for a 32-bit frame
 *
 * Returns true on success. On any failure the user states are cleared
 * and false is returned.
 */
bool fpu__restore_sig(void __user *buf, int ia32_frame)
{
        struct fpu *fpu = &current->thread.fpu;
        void __user *buf_fx = buf;
        bool ia32_fxstate = false;
        bool success = false;
        unsigned int size;

        /* No frame supplied: reset the user states and report success. */
        if (unlikely(!buf)) {
                fpu__clear_user_states(fpu);
                return true;
        }

        size = xstate_sigframe_size(fpu->fpstate);

        ia32_frame &= (IS_ENABLED(CONFIG_X86_32) ||
                       IS_ENABLED(CONFIG_IA32_EMULATION));

        /*
         * The FX state quirk is only required on FXSR enabled systems.
         * FRSTOR has no need for it and takes the fast path.
         */
        if (ia32_frame && use_fxsr()) {
                buf_fx = buf + sizeof(struct fregs_state);
                size += sizeof(struct fregs_state);
                ia32_fxstate = true;
        }

        /* An inaccessible frame leaves 'success' false. */
        if (access_ok(buf, size)) {
                if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_FPU))
                        success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate);
                else
                        success = !fpregs_soft_set(current, NULL, 0,
                                                   sizeof(struct user_i387_ia32_struct),
                                                   NULL, buf);
        }

        if (unlikely(!success))
                fpu__clear_user_states(fpu);

        return success;
}

/*
 * Carve the FPU state area for a signal frame out of the stack below @sp.
 *
 * @sp:         current stack pointer value
 * @ia32_frame: non-zero for a 32-bit frame
 * @buf_fx:     set to the 64-byte aligned address of the state area
 * @size:       set to the total size of the frame
 *
 * With a 32-bit frame on an FXSR system, room for a fregs_state header is
 * added below the aligned state area and included in @size.
 *
 * Returns the lowest address of the frame.
 */
unsigned long
fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
                     unsigned long *buf_fx, unsigned long *size)
{
        unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate);
        unsigned long fx_start = round_down(sp - frame_size, 64);
        unsigned long frame_start = fx_start;

        if (ia32_frame && use_fxsr()) {
                frame_size += sizeof(struct fregs_state);
                frame_start -= sizeof(struct fregs_state);
        }

        *buf_fx = fx_start;
        *size = frame_size;

        return frame_start;
}

/*
 * Upper bound for the size of the FPU part of a signal frame:
 * fpu_user_cfg.max_size, plus the trailing magic when XSAVE is in use,
 * plus a fregs_state header when 32-bit frames are possible on an FXSR
 * system.
 */
unsigned long __init fpu__get_fpstate_size(void)
{
        unsigned long size = fpu_user_cfg.max_size;
        bool ia32_possible = IS_ENABLED(CONFIG_IA32_EMULATION) ||
                             IS_ENABLED(CONFIG_X86_32);

        if (use_xsave())
                size += FP_XSTATE_MAGIC2_SIZE;

        /*
         * The extra space is required on (most) 32-bit kernels and for
         * 32-bit applications running on a 64-bit kernel. Rather than
         * distinguishing the cases, assume the worst one and always
         * reserve room for 'fregs_state', even for 64-bit applications on
         * 64-bit kernels. A little space is wasted in exchange for simpler
         * code.
         */
        if (ia32_possible && use_fxsr())
                size += sizeof(struct fregs_state);

        return size;
}



























    1 




    1 



    1 












    1 
    1 
    1 
    1 

    1 
    1 



















    1 


    1 
    1 


    1 



    1 
    1 







    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016-2021 Christoph Hellwig.
 */
#include <linux/fs.h>
#include <linux/iomap.h>
#include "trace.h"

/*
 * Account for the previous mapping (if any) and prepare for the next one.
 *
 * A mapping flagged IOMAP_F_STALE was not fully processed: the operation
 * was aborted because the extent it spanned may have changed underneath
 * it. The unprocessed remainder of the iter then has to be remapped, which
 * can be necessary even when no progress was made at all
 * (iter->processed == 0). That is why "processed == 0" only means "done"
 * when the mapping is not stale; "processed == 0 && stale" means the whole
 * remaining range must be mapped again.
 *
 * Returns a negative value on error, 0 when iteration is finished and 1
 * when the caller should map the next range.
 */
static inline int iomap_iter_advance(struct iomap_iter *iter)
{
        /* No previous mapping to account for. */
        if (!iter->iomap.length)
                goto next_mapping;

        if (iter->processed < 0)
                return iter->processed;
        if (!iter->processed && !(iter->iomap.flags & IOMAP_F_STALE))
                return 0;
        if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
                return -EIO;

        /* Consume what the previous iteration processed. */
        iter->pos += iter->processed;
        iter->len -= iter->processed;
        if (!iter->len)
                return 0;

next_mapping:
        /* Start the next iteration from a clean slate. */
        memset(&iter->srcmap, 0, sizeof(iter->srcmap));
        memset(&iter->iomap, 0, sizeof(iter->iomap));
        iter->processed = 0;
        return 1;
}

/*
 * Sanity check the mapping stored in @iter after ->iomap_begin returned
 * and emit the mapping tracepoints.
 *
 * Warns (once per check) when the mapping starts beyond iter->pos, has a
 * zero length, ends at or before iter->pos, or is flagged IOMAP_F_STALE.
 */
static inline void iomap_iter_done(struct iomap_iter *iter)
{
        WARN_ON_ONCE(iter->iomap.offset > iter->pos);
        WARN_ON_ONCE(iter->iomap.length == 0);
        WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
        WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);

        trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
        /* The source map is only traced when it is not a hole. */
        if (iter->srcmap.type != IOMAP_HOLE)
                trace_iomap_iter_srcmap(iter->inode, &iter->srcmap);
}

/**
 * iomap_iter - iterate over ranges in a file
 * @iter: iteration structure
 * @ops: iomap ops provided by the file system
 *
 * Iterate over filesystem-provided space mappings for the provided file range.
 *
 * This function handles cleanup of resources acquired for iteration when the
 * filesystem indicates there are no more space mappings, which means that this
 * function must be called in a loop that continues as long as it returns a
 * positive value.  If 0 or a negative value is returned, the caller must not
 * return to the loop body.  Within a loop body, there are two ways to break out
 * of the loop body:  leave @iter.processed unchanged, or set it to a negative
 * errno.
 */
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops)
{
        int ret;

        /*
         * Finish the previous mapping, if there was one and the filesystem
         * provides ->iomap_end. A negative @iter.processed is reported to
         * ->iomap_end as 0. An ->iomap_end error is only returned here when
         * @iter.processed is zero.
         */
        if (iter->iomap.length && ops->iomap_end) {
                ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter),
                                iter->processed > 0 ? iter->processed : 0,
                                iter->flags, &iter->iomap);
                if (ret < 0 && !iter->processed)
                        return ret;
        }

        trace_iomap_iter(iter, ops, _RET_IP_);
        /* Account for the previous iteration; stop on error or completion. */
        ret = iomap_iter_advance(iter);
        if (ret <= 0)
                return ret;

        /* Ask the filesystem for the mapping of the remaining range. */
        ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags,
                               &iter->iomap, &iter->srcmap);
        if (ret < 0)
                return ret;
        iomap_iter_done(iter);
        return 1;
}





































































































































































































































































































































































































































    3 





    3 





















    3 





















































































































































































































































































































































































    4 
    4 







    9 


    7 












    6 


    6 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   55 


   52 






























   52 



   52 
   54 














   52 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2002 Richard Henderson
 * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
 * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
 */

#define INCLUDE_VERMAGIC

#include <linux/export.h>
#include <linux/extable.h>
#include <linux/moduleloader.h>
#include <linux/module_signature.h>
#include <linux/trace_events.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
#include <linux/buildid.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/kstrtox.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/elf.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/rcupdate.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/moduleparam.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/vermagic.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/device.h>
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <linux/set_memory.h>
#include <asm/mmu_context.h>
#include <linux/license.h>
#include <asm/sections.h>
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
#include <linux/livepatch.h>
#include <linux/async.h>
#include <linux/percpu.h>
#include <linux/kmemleak.h>
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
#include <linux/dynamic_debug.h>
#include <linux/audit.h>
#include <linux/cfi.h>
#include <linux/codetag.h>
#include <linux/debugfs.h>
#include <linux/execmem.h>
#include <uapi/linux/module.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/module.h>

/*
 * Mutex protects:
 * 1) List of modules (also safely readable with preempt_disable),
 * 2) module_use links,
 * 3) mod_tree.addr_min/mod_tree.addr_max.
 * (delete and add uses RCU list operations).
 */
DEFINE_MUTEX(module_mutex);
LIST_HEAD(modules);

/* Work queue for freeing init sections in success case */
static void do_free_init(struct work_struct *w);
static DECLARE_WORK(init_free_wq, do_free_init);
static LLIST_HEAD(init_free_list);

/*
 * Global bookkeeping for module memory; among other things it holds the
 * address bounds that __mod_update_bounds() maintains.  addr_min starts at
 * the largest unsigned long (-1UL) so that the first region recorded always
 * lowers it.
 */
struct mod_tree_root mod_tree __cacheline_aligned = {
        .addr_min = -1UL,
};

/* One exported-symbol table to be searched by find_symbol(). */
struct symsearch {
        /* The symbol array is [start, stop); it is searched by name with bsearch(). */
        const struct kernel_symbol *start, *stop;
        /* CRC array indexed like the symbol array (consumed via symversion()). */
        const s32 *crcs;
        /* License class of the table (NOT_GPL_ONLY or GPL_ONLY in this file). */
        enum mod_license license;
};

/*
 * Widen the cached address window of @tree so that it also covers the region
 * [@base, @base + @size).  With CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC,
 * core-data regions are tracked in the separate data_addr_min/data_addr_max
 * pair; every other region goes into addr_min/addr_max.
 *
 * These bounds speed up __module_address() and are protected by module_mutex.
 */
static void __mod_update_bounds(enum mod_mem_type type __maybe_unused, void *base,
                                unsigned int size, struct mod_tree_root *tree)
{
        const unsigned long start = (unsigned long)base;
        const unsigned long end = start + size;
        unsigned long *lowest = &tree->addr_min;
        unsigned long *highest = &tree->addr_max;

#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
        /* Core data has its own pair of bounds. */
        if (mod_mem_type_is_core_data(type)) {
                lowest = &tree->data_addr_min;
                highest = &tree->data_addr_max;
        }
#endif
        /* Only write when the window actually grows. */
        if (start < *lowest)
                *lowest = start;
        if (end > *highest)
                *highest = end;
}

/* Fold every non-empty memory region of @mod into the global mod_tree bounds. */
static void mod_update_bounds(struct module *mod)
{
        for_each_mod_mem_type(type) {
                struct module_memory *region = &mod->mem[type];

                /* Empty regions contribute nothing. */
                if (!region->size)
                        continue;

                __mod_update_bounds(type, region->base, region->size, &mod_tree);
        }
}

/* Block module loading/unloading? */
int modules_disabled;
core_param(nomodule, modules_disabled, bint, 0);

/* Waiting for a module to finish initializing? */
static DECLARE_WAIT_QUEUE_HEAD(module_wq);

static BLOCKING_NOTIFIER_HEAD(module_notify_list);

/*
 * Add @nb to the module notifier chain.  The result is whatever
 * blocking_notifier_chain_register() reports.
 */
int register_module_notifier(struct notifier_block *nb)
{
        int ret;

        ret = blocking_notifier_chain_register(&module_notify_list, nb);
        return ret;
}
EXPORT_SYMBOL(register_module_notifier);

/*
 * Remove @nb from the module notifier chain.  The result is whatever
 * blocking_notifier_chain_unregister() reports.
 */
int unregister_module_notifier(struct notifier_block *nb)
{
        int ret;

        ret = blocking_notifier_chain_unregister(&module_notify_list, nb);
        return ret;
}
EXPORT_SYMBOL(unregister_module_notifier);

/*
 * Stricter variant of try_module_get() with errno-style results:
 *   0        try_module_get() succeeded,
 *   -EBUSY   @mod is still in MODULE_STATE_COMING,
 *   -ENOENT  try_module_get() refused.
 * Calling this on a MODULE_STATE_UNFORMED module is a bug.
 */
static inline int strong_try_module_get(struct module *mod)
{
        if (mod) {
                BUG_ON(mod->state == MODULE_STATE_UNFORMED);

                /* Initialization has not finished yet. */
                if (mod->state == MODULE_STATE_COMING)
                        return -EBUSY;
        }

        return try_module_get(mod) ? 0 : -ENOENT;
}

/*
 * Taint the kernel with @flag (forwarding @lockdep_ok to add_taint()) and
 * record the same flag bit in @mod->taints.
 */
static inline void add_taint_module(struct module *mod, unsigned flag,
                                    enum lockdep_ok lockdep_ok)
{
        add_taint(flag, lockdep_ok);
        set_bit(flag, &mod->taints);
}

/*
 * A thread that wants to hold a reference to a module only while it
 * is running can call this to safely exit: the reference on @mod is
 * dropped and the thread then exits through kthread_exit(@code).
 * This function does not return.
 */
void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
{
        module_put(mod);
        kthread_exit(code);
}
EXPORT_SYMBOL(__module_put_and_kthread_exit);

/*
 * Look up the allocated (SHF_ALLOC) section called @name.
 * Returns its section index, or 0 if there is no such section.
 */
static unsigned int find_sec(const struct load_info *info, const char *name)
{
        unsigned int idx;

        /* Index 0 is never a candidate; it doubles as "not found". */
        for (idx = 1; idx < info->hdr->e_shnum; idx++) {
                Elf_Shdr *sec = &info->sechdrs[idx];

                /* Alloc bit cleared means "ignore it." */
                if (!(sec->sh_flags & SHF_ALLOC))
                        continue;

                if (!strcmp(info->secstrings + sec->sh_name, name))
                        return idx;
        }

        return 0;
}

/* Address of the allocated section called @name, or NULL if it is absent. */
static void *section_addr(const struct load_info *info, const char *name)
{
        const unsigned int idx = find_sec(info, name);

        /* A miss yields index 0, and section 0 has sh_addr 0. */
        return (void *)info->sechdrs[idx].sh_addr;
}

/* Find a module section, or NULL.  Fill in number of "objects" in section. */
static void *section_objs(const struct load_info *info,
                          const char *name,
                          size_t object_size,
                          unsigned int *num)
{
        unsigned int sec = find_sec(info, name);

        /* Section 0 has sh_addr 0 and sh_size 0. */
        *num = info->sechdrs[sec].sh_size / object_size;
        return (void *)info->sechdrs[sec].sh_addr;
}

/*
 * Look up the section called @name regardless of its SHF_ALLOC flag.
 * Returns its section index, or 0 if there is no such section.
 */
static unsigned int find_any_sec(const struct load_info *info, const char *name)
{
        unsigned int idx;

        for (idx = 1; idx < info->hdr->e_shnum; idx++) {
                const char *secname = info->secstrings + info->sechdrs[idx].sh_name;

                if (!strcmp(secname, name))
                        return idx;
        }

        return 0;
}

/*
 * Find a module section, or NULL. Fill in number of "objects" in section.
 * Ignores SHF_ALLOC flag.
 */
static __maybe_unused void *any_section_objs(const struct load_info *info,
                                             const char *name,
                                             size_t object_size,
                                             unsigned int *num)
{
        unsigned int sec = find_any_sec(info, name);

        /* Section 0 has sh_addr 0 and sh_size 0. */
        *num = info->sechdrs[sec].sh_size / object_size;
        return (void *)info->sechdrs[sec].sh_addr;
}

/*
 * symversion(base, idx): pointer to the CRC of symbol @idx in CRC table @base.
 *
 * Without CONFIG_MODVERSIONS the macro is always NULL and its arguments are
 * not evaluated.  With CONFIG_MODVERSIONS it yields @base + @idx, or NULL
 * when @base itself is NULL.  Every use of an argument is parenthesized so
 * that expression arguments expand with the intended precedence.
 */
#ifndef CONFIG_MODVERSIONS
#define symversion(base, idx) NULL
#else
#define symversion(base, idx) (((base) != NULL) ? ((base) + (idx)) : NULL)
#endif

/*
 * Name string of the exported symbol @sym.  With
 * CONFIG_HAVE_ARCH_PREL32_RELOCATIONS the entry carries name_offset, which
 * offset_to_ptr() turns into a pointer; otherwise the name pointer is stored
 * directly.
 */
static const char *kernel_symbol_name(const struct kernel_symbol *sym)
{
#ifndef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
        return sym->name;
#else
        return offset_to_ptr(&sym->name_offset);
#endif
}

/*
 * Namespace string of the exported symbol @sym.  With
 * CONFIG_HAVE_ARCH_PREL32_RELOCATIONS a zero namespace_offset yields NULL;
 * otherwise the stored namespace pointer is returned as is.
 */
static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
{
#ifndef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
        return sym->namespace;
#else
        if (sym->namespace_offset)
                return offset_to_ptr(&sym->namespace_offset);

        return NULL;
#endif
}

/*
 * bsearch() comparator: @name is the key string, @sym a struct kernel_symbol.
 * Returns the strcmp() ordering of the key against the symbol's name.
 */
int cmp_name(const void *name, const void *sym)
{
        const char *key = name;
        const struct kernel_symbol *ksym = sym;

        return strcmp(key, kernel_symbol_name(ksym));
}

/*
 * Search one symbol table (@syms) for fsa->name.
 *
 * GPL-only tables are skipped unless fsa->gplok is set.  On a hit the
 * owner, crc, sym and license fields of @fsa are filled in (with @owner as
 * the owning module) and true is returned; otherwise @fsa is left untouched
 * and false is returned.
 */
static bool find_exported_symbol_in_section(const struct symsearch *syms,
                                            struct module *owner,
                                            struct find_symbol_arg *fsa)
{
        struct kernel_symbol *hit;

        if (syms->license == GPL_ONLY && !fsa->gplok)
                return false;

        /* The table is searched by name with a binary search. */
        hit = bsearch(fsa->name, syms->start, syms->stop - syms->start,
                      sizeof(*hit), cmp_name);
        if (hit == NULL)
                return false;

        fsa->sym = hit;
        fsa->owner = owner;
        fsa->license = syms->license;
        fsa->crc = symversion(syms->crcs, hit - syms->start);

        return true;
}

/*
 * Resolve the exported symbol named fsa->name.
 *
 * The built-in tables are searched first, then the tables of every module
 * on the list that is past MODULE_STATE_UNFORMED.  On success the result
 * fields of @fsa are filled in (symbol, optional crc, owning module or NULL
 * for built-in symbols) and true is returned.
 *
 * Needs preempt disabled or module_mutex.
 */
bool find_symbol(struct find_symbol_arg *fsa)
{
        static const struct symsearch builtin[] = {
                { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
                  NOT_GPL_ONLY },
                { __start___ksymtab_gpl, __stop___ksymtab_gpl,
                  __start___kcrctab_gpl,
                  GPL_ONLY },
        };
        struct module *mod;
        unsigned int i;

        module_assert_mutex_or_preempt();

        for (i = 0; i < ARRAY_SIZE(builtin); i++) {
                if (find_exported_symbol_in_section(&builtin[i], NULL, fsa))
                        return true;
        }

        list_for_each_entry_rcu(mod, &modules, list,
                                lockdep_is_held(&module_mutex)) {
                struct symsearch exported[] = {
                        {
                                .start = mod->syms,
                                .stop = mod->syms + mod->num_syms,
                                .crcs = mod->crcs,
                                .license = NOT_GPL_ONLY,
                        },
                        {
                                .start = mod->gpl_syms,
                                .stop = mod->gpl_syms + mod->num_gpl_syms,
                                .crcs = mod->gpl_crcs,
                                .license = GPL_ONLY,
                        },
                };

                /* Unformed modules are skipped. */
                if (mod->state == MODULE_STATE_UNFORMED)
                        continue;

                for (i = 0; i < ARRAY_SIZE(exported); i++) {
                        if (find_exported_symbol_in_section(&exported[i], mod, fsa))
                                return true;
                }
        }

        pr_debug("Failed to find symbol %s\n", fsa->name);
        return false;
}

/*
 * Find the module whose name is exactly the first @len bytes of @name.
 * Modules in MODULE_STATE_UNFORMED are only considered when @even_unformed
 * is true.  Returns the module, or NULL if none matches.
 *
 * Must hold module_mutex (or have preemption disabled for read-only access).
 */
struct module *find_module_all(const char *name, size_t len,
                               bool even_unformed)
{
        struct module *mod;

        module_assert_mutex_or_preempt();

        list_for_each_entry_rcu(mod, &modules, list,
                                lockdep_is_held(&module_mutex)) {
                if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
                        continue;

                /* Lengths must agree before the bytes are compared. */
                if (strlen(mod->name) != len)
                        continue;

                if (memcmp(mod->name, name, len) == 0)
                        return mod;
        }

        return NULL;
}

/*
 * Find a module by its NUL-terminated @name, ignoring modules that are
 * still MODULE_STATE_UNFORMED.  Returns the module or NULL.
 */
struct module *find_module(const char *name)
{
        const size_t len = strlen(name);

        return find_module_all(name, len, false);
}

#ifdef CONFIG_SMP

/* Return the per-cpu base pointer recorded for @mod (mod->percpu). */
static inline void __percpu *mod_percpu(struct module *mod)
{
        return mod->percpu;
}

/*
 * Reserve per-cpu storage for the per-cpu section of @mod
 * (section index info->index.pcpu).
 *
 * An empty section needs nothing.  Alignment requests above PAGE_SIZE are
 * warned about and clamped to PAGE_SIZE.  Returns 0 on success, or -ENOMEM
 * when __alloc_reserved_percpu() fails.
 */
static int percpu_modalloc(struct module *mod, struct load_info *info)
{
        Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
        unsigned long align;

        if (pcpusec->sh_size == 0)
                return 0;

        align = pcpusec->sh_addralign;
        if (align > PAGE_SIZE) {
                pr_warn("%s: per-cpu alignment %li > %li\n",
                        mod->name, align, PAGE_SIZE);
                align = PAGE_SIZE;
        }

        mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
        if (mod->percpu) {
                /* Remember the size for later address-range checks. */
                mod->percpu_size = pcpusec->sh_size;
                return 0;
        }

        pr_warn("%s: Could not allocate %lu bytes percpu data\n",
                mod->name, (unsigned long)pcpusec->sh_size);
        return -ENOMEM;
}

/* Release the per-cpu area of @mod by handing mod->percpu to free_percpu(). */
static void percpu_modfree(struct module *mod)
{
        free_percpu(mod->percpu);
}

static unsigned int find_pcpusec(struct load_info *info)
{
        return find_sec(info, ".data..percpu");
}

static void percpu_modcopy(struct module *mod,
                           const void *from, unsigned long size)
{
        int cpu;

        for_each_possible_cpu(cpu)
                memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
}

/*
 * Test whether @addr lies inside the static per-cpu area of any module,
 * on any possible CPU.  Unformed modules and modules without per-cpu data
 * are skipped.
 *
 * When a match is found and @can_addr is non-NULL, *@can_addr receives the
 * address at the same offset within the boot CPU's copy of that area.
 * The module list is walked with preemption disabled.
 */
bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
{
        void *va = (void *)addr;
        struct module *mod;
        unsigned int cpu;
        bool found = false;

        preempt_disable();

        list_for_each_entry_rcu(mod, &modules, list) {
                if (mod->state == MODULE_STATE_UNFORMED)
                        continue;
                if (!mod->percpu_size)
                        continue;

                for_each_possible_cpu(cpu) {
                        void *start = per_cpu_ptr(mod->percpu, cpu);

                        if (va < start || va >= start + mod->percpu_size)
                                continue;

                        if (can_addr) {
                                unsigned long offset = (unsigned long) (va - start);
                                void *boot_base = per_cpu_ptr(mod->percpu,
                                                              get_boot_cpu_id());

                                *can_addr = offset + (unsigned long)boot_base;
                        }
                        found = true;
                        goto out;
                }
        }

out:
        preempt_enable();
        return found;
}

/**
 * is_module_percpu_address() - test whether address is from module static percpu
 * @addr: address to test
 *
 * Test whether @addr belongs to module static percpu area.  This is
 * __is_module_percpu_address() called without a canonical-address output.
 *
 * Return: %true if @addr is from module static percpu area
 */
bool is_module_percpu_address(unsigned long addr)
{
        return __is_module_percpu_address(addr, NULL);
}

#else /* ... !CONFIG_SMP */

/* !CONFIG_SMP: no per-cpu base is reported for modules; always NULL. */
static inline void __percpu *mod_percpu(struct module *mod)
{
        return NULL;
}
/*
 * !CONFIG_SMP: nothing is allocated.  A non-empty per-cpu section is
 * rejected with -ENOMEM; an empty one is accepted with 0.
 */
static int percpu_modalloc(struct module *mod, struct load_info *info)
{
        /* UP modules shouldn't have this section: ENOMEM isn't quite right */
        return info->sechdrs[info->index.pcpu].sh_size != 0 ? -ENOMEM : 0;
}
/* !CONFIG_SMP: nothing was allocated, so there is nothing to free. */
static inline void percpu_modfree(struct module *mod)
{
}
/* !CONFIG_SMP: no per-cpu section is looked up; the index is always 0. */
static unsigned int find_pcpusec(struct load_info *info)
{
        return 0;
}
/* !CONFIG_SMP: nothing is copied; a non-zero @size is treated as a bug. */
static inline void percpu_modcopy(struct module *mod,
                                  const void *from, unsigned long size)
{
        /* pcpusec should be 0, and size of that section should be 0. */
        BUG_ON(size != 0);
}
/* !CONFIG_SMP: no address is reported as module per-cpu; always false. */
bool is_module_percpu_address(unsigned long addr)
{
        return false;
}

/* !CONFIG_SMP: always false; *@can_addr is never written. */
bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
{
        return false;
}

#endif /* CONFIG_SMP */

/*
 * MODINFO_ATTR(field) - generate a read-only (0444) module attribute named
 * after @field, backed by the string member mod->field.
 *
 * The expansion defines:
 *   setup_modinfo_<field>()    stores a kstrdup(GFP_KERNEL) copy of the string,
 *   show_modinfo_<field>()     prints the string plus a newline, bounded by
 *                              PAGE_SIZE,
 *   modinfo_<field>_exists()   reports whether mod->field is non-NULL,
 *   free_modinfo_<field>()     kfree()s the copy and resets the member to NULL,
 *   modinfo_<field>            the struct module_attribute tying them together.
 *
 * NOTE: the kstrdup() result is not checked in setup; a NULL member is simply
 * what modinfo_<field>_exists() tests for.
 */
#define MODINFO_ATTR(field)        \
static void setup_modinfo_##field(struct module *mod, const char *s)  \
{                                                                     \
        mod->field = kstrdup(s, GFP_KERNEL);                          \
}                                                                     \
static ssize_t show_modinfo_##field(struct module_attribute *mattr,   \
                        struct module_kobject *mk, char *buffer)      \
{                                                                     \
        return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field);  \
}                                                                     \
static int modinfo_##field##_exists(struct module *mod)               \
{                                                                     \
        return mod->field != NULL;                                    \
}                                                                     \
static void free_modinfo_##field(struct module *mod)                  \
{                                                                     \
        kfree(mod->field);                                            \
        mod->field = NULL;                                            \
}                                                                     \
static struct module_attribute modinfo_##field = {                    \
        .attr = { .name = __stringify(field), .mode = 0444 },         \
        .show = show_modinfo_##field,                                 \
        .setup = setup_modinfo_##field,                               \
        .test = modinfo_##field##_exists,                             \
        .free = free_modinfo_##field,                                 \
};

MODINFO_ATTR(version);
MODINFO_ATTR(srcversion);

/*
 * Name and taint-flag string of the most recently unloaded module.  The
 * delete_module() syscall fills this in just before freeing the module,
 * for diagnostic purposes.
 */
static struct {
        char name[MODULE_NAME_LEN + 1];
        char taints[MODULE_FLAGS_BUF_SIZE];
} last_unloaded_module;

#ifdef CONFIG_MODULE_UNLOAD

EXPORT_TRACEPOINT_SYMBOL(module_get);

/* MODULE_REF_BASE is the base reference count set by the module loader. */
#define MODULE_REF_BASE        1

/*
 * Set up the unload bookkeeping of @mod.
 *
 * Both module-use lists start out empty.  The reference count starts at
 * MODULE_REF_BASE plus one extra reference that is held during
 * initialization; refcnt == 0 means the module is going.  Always returns 0.
 */
static int module_unload_init(struct module *mod)
{
        INIT_LIST_HEAD(&mod->target_list);
        INIT_LIST_HEAD(&mod->source_list);

        /* Base reference ... */
        atomic_set(&mod->refcnt, MODULE_REF_BASE);
        /* ... plus the one held while the module initializes. */
        atomic_inc(&mod->refcnt);

        return 0;
}

/*
 * Does module @a already use module @b?  Returns 1 if a module_use link
 * with source @a is found on @b's source list, 0 otherwise.
 */
static int already_uses(struct module *a, struct module *b)
{
        struct module_use *use;
        int found = 0;

        list_for_each_entry(use, &b->source_list, source_list) {
                if (use->source == a) {
                        found = 1;
                        break;
                }
        }

        if (!found)
                pr_debug("%s does not use %s!\n", a->name, b->name);

        return found;
}

/*
 * Record that module @a uses module @b.
 *
 * A new module_use link is allocated with @a as its source and @b as its
 * target.  The link is put on @b's source list (so @b can see who uses it)
 * and on @a's target list (so @a can see what it uses).
 *
 * Returns 0 on success or -ENOMEM if the link cannot be allocated.
 */
static int add_module_usage(struct module *a, struct module *b)
{
        struct module_use *link;

        pr_debug("Allocating new usage for %s.\n", a->name);

        link = kmalloc(sizeof(*link), GFP_ATOMIC);
        if (link == NULL)
                return -ENOMEM;

        link->target = b;
        link->source = a;
        list_add(&link->source_list, &b->source_list);
        list_add(&link->target_list, &a->target_list);

        return 0;
}

/*
 * Make module @a use module @b: take a reference on @b and record the usage
 * link.  Nothing is done when @b is NULL or @a already uses @b.
 *
 * Returns 0 on success, or the error from strong_try_module_get() or
 * add_module_usage(); in the latter case the reference on @b is dropped
 * again.  The caller must hold module_mutex.
 */
static int ref_module(struct module *a, struct module *b)
{
        int err;

        if (!b)
                return 0;
        if (already_uses(a, b))
                return 0;

        /* If module isn't available, we fail. */
        err = strong_try_module_get(b);
        if (!err) {
                err = add_module_usage(a, b);
                if (err)
                        module_put(b);
        }

        return err;
}

/*
 * Drop every usage link that has @mod as its source: for each module @mod
 * uses, the reference is put, the link is removed from both lists and then
 * freed.  module_mutex is taken for the duration of the walk.
 */
static void module_unload_free(struct module *mod)
{
        struct module_use *link, *next;

        mutex_lock(&module_mutex);

        list_for_each_entry_safe(link, next, &mod->target_list, target_list) {
                struct module *target = link->target;

                pr_debug("%s unusing %s\n", mod->name, target->name);
                module_put(target);

                list_del(&link->source_list);
                list_del(&link->target_list);
                kfree(link);
        }

        mutex_unlock(&module_mutex);
}

#ifdef CONFIG_MODULE_FORCE_UNLOAD
/*
 * Decide whether a forced unload was requested.  O_TRUNC in @flags requests
 * it; in that case the kernel is tainted with TAINT_FORCED_RMMOD.
 * Returns non-zero (the O_TRUNC bit) when forcing, 0 otherwise.
 */
static inline int try_force_unload(unsigned int flags)
{
        if (!(flags & O_TRUNC))
                return 0;

        add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
        return flags & O_TRUNC;
}
#else
/* Without CONFIG_MODULE_FORCE_UNLOAD a forced unload is never granted. */
static inline int try_force_unload(unsigned int flags)
{
        return 0;
}
#endif /* CONFIG_MODULE_FORCE_UNLOAD */

/*
 * Try to drop the base reference (MODULE_REF_BASE) of @mod.
 *
 * Returns 0 when the count is zero afterwards, i.e. the module is unused.
 * If references remain, the base reference is put back with
 * atomic_add_unless(..., 0) and its result is returned: non-zero means the
 * module is still in use, while 0 means the count reached zero in the
 * meantime, so the release succeeded after all.
 */
static int try_release_module_ref(struct module *mod)
{
        /* Try to decrement refcnt which we set at loading */
        int remaining = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);

        BUG_ON(remaining < 0);
        if (remaining == 0)
                return 0;

        /* Someone can put this right now, recover with checking */
        return atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
}

/*
 * Move @mod to MODULE_STATE_GOING if it may be unloaded.
 *
 * If the module is still referenced, try_force_unload(@flags) decides; its
 * result is stored in *@forced, and -EWOULDBLOCK is returned when the unload
 * is not forced.  Returns 0 once the module has been marked as going.
 */
static int try_stop_module(struct module *mod, int flags, int *forced)
{
        const int in_use = try_release_module_ref(mod);

        /* If it's not unused, quit unless we're forcing. */
        if (in_use) {
                const int force = try_force_unload(flags);

                *forced = force;
                if (!force)
                        return -EWOULDBLOCK;
        }

        /* Mark it as dying. */
        mod->state = MODULE_STATE_GOING;
        return 0;
}

/**
 * module_refcount() - return the refcount or -1 if unloading
 * @mod:        the module we're checking
 *
 * The raw counter includes MODULE_REF_BASE, which is subtracted here.
 *
 * Return:
 *        -1 if the module is in the process of unloading
 *        otherwise the number of references in the kernel to the module
 */
int module_refcount(struct module *mod)
{
        const int raw = atomic_read(&mod->refcnt);

        return raw - MODULE_REF_BASE;
}
EXPORT_SYMBOL(module_refcount);

/* This exists whether we can unload or not */
static void free_module(struct module *mod);

/*
 * delete_module(2): unload the module named by @name_user.
 *
 * Fails with:
 *   -EPERM        the caller lacks CAP_SYS_MODULE or modules_disabled is set,
 *   -EFAULT       the name could not be copied from user space,
 *   -EINTR        interrupted while waiting for module_mutex,
 *   -ENOENT       no module of that name was found,
 *   -EWOULDBLOCK  other modules still use it, or references remain and the
 *                 unload was not forced,
 *   -EBUSY        the module is not MODULE_STATE_LIVE, or it has an init
 *                 function but no exit function and the unload was not forced.
 * Returns 0 once the module has been freed.
 */
SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
                unsigned int, flags)
{
        struct module *mod;
        char name[MODULE_NAME_LEN];
        char buf[MODULE_FLAGS_BUF_SIZE];
        int ret, forced = 0;

        if (!capable(CAP_SYS_MODULE) || modules_disabled)
                return -EPERM;

        /*
         * At most MODULE_NAME_LEN-1 bytes are copied; the last byte of the
         * buffer is then set to NUL unconditionally.
         */
        if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
                return -EFAULT;
        name[MODULE_NAME_LEN-1] = '\0';

        audit_log_kern_module(name);

        if (mutex_lock_interruptible(&module_mutex) != 0)
                return -EINTR;

        mod = find_module(name);
        if (!mod) {
                ret = -ENOENT;
                goto out;
        }

        if (!list_empty(&mod->source_list)) {
                /* Other modules depend on us: get rid of them first. */
                ret = -EWOULDBLOCK;
                goto out;
        }

        /* Doing init or already dying? */
        if (mod->state != MODULE_STATE_LIVE) {
                /* FIXME: if (force), slam module count damn the torpedoes */
                pr_debug("%s already dying\n", mod->name);
                ret = -EBUSY;
                goto out;
        }

        /* If it has an init func, it must have an exit func to unload */
        if (mod->init && !mod->exit) {
                forced = try_force_unload(flags);
                if (!forced) {
                        /* This module can't be removed */
                        ret = -EBUSY;
                        goto out;
                }
        }

        /* On success the module is in MODULE_STATE_GOING from here on. */
        ret = try_stop_module(mod, flags, &forced);
        if (ret != 0)
                goto out;

        /* The rest of the teardown runs without module_mutex held. */
        mutex_unlock(&module_mutex);
        /* Final destruction now no one is using it. */
        if (mod->exit != NULL)
                mod->exit();
        blocking_notifier_call_chain(&module_notify_list,
                                     MODULE_STATE_GOING, mod);
        klp_module_going(mod);
        ftrace_release_mod(mod);

        async_synchronize_full();

        /* Store the name and taints of the last unloaded module for diagnostic purposes */
        strscpy(last_unloaded_module.name, mod->name, sizeof(last_unloaded_module.name));
        strscpy(last_unloaded_module.taints, module_flags(mod, buf, false), sizeof(last_unloaded_module.taints));

        free_module(mod);
        /* someone could wait for the module in add_unformed_module() */
        wake_up_all(&module_wq);
        return 0;
out:
        /* Error exit: every path reaching this label still holds module_mutex. */
        mutex_unlock(&module_mutex);
        return ret;
}

/*
 * Drop the module reference associated with the exported symbol named
 * @symbol.  The symbol is looked up (GPL-only symbols included) with
 * preemption disabled; failing to find it is a bug.
 */
void __symbol_put(const char *symbol)
{
        struct find_symbol_arg fsa = {
                .name        = symbol,
                .gplok        = true,
        };
        bool found;

        preempt_disable();
        found = find_symbol(&fsa);
        BUG_ON(!found);
        module_put(fsa.owner);
        preempt_enable();
}
EXPORT_SYMBOL(__symbol_put);

/*
 * Drop the reference on the module that contains the function at @addr.
 * Addresses in core kernel text need no reference and are ignored; an
 * address that belongs to no module's text is a bug.
 *
 * Note this assumes addr is a function, which it currently always is.
 */
void symbol_put_addr(void *addr)
{
        const unsigned long text = (unsigned long)dereference_function_descriptor(addr);
        struct module *owner;

        if (core_kernel_text(text))
                return;

        /*
         * Even though we hold a reference on the module; we still need to
         * disable preemption in order to safely traverse the data structure.
         */
        preempt_disable();
        owner = __module_text_address(text);
        BUG_ON(owner == NULL);
        module_put(owner);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(symbol_put_addr);

/* Attribute show handler: print module_refcount() of the module as a decimal line. */
static ssize_t show_refcnt(struct module_attribute *mattr,
                           struct module_kobject *mk, char *buffer)
{
        const int refs = module_refcount(mk->mod);

        return sprintf(buffer, "%i\n", refs);
}

/* Read-only (0444) "refcnt" attribute served by show_refcnt(); no store handler. */
static struct module_attribute modinfo_refcnt =
        __ATTR(refcnt, 0444, show_refcnt, NULL);

/*
 * Unconditionally take a reference on @module and emit the module_get
 * tracepoint.  A NULL @module is ignored.
 */
void __module_get(struct module *module)
{
        if (!module)
                return;

        atomic_inc(&module->refcnt);
        trace_module_get(module, _RET_IP_);
}
EXPORT_SYMBOL(__module_get);

/*
 * Try to take a reference on @module.
 *
 * Returns true for a NULL @module, and true when the module is live and its
 * reference count could be raised from a non-zero value (the module_get
 * tracepoint is emitted in that case).  Returns false otherwise.
 */
bool try_module_get(struct module *module)
{
        if (!module)
                return true;

        /* Note: here, we can fail to get a reference */
        if (unlikely(!module_is_live(module) ||
                     atomic_inc_not_zero(&module->refcnt) == 0))
                return false;

        trace_module_get(module, _RET_IP_);
        return true;
}
EXPORT_SYMBOL(try_module_get);

/*
 * Drop one reference on @module and emit the module_put tracepoint.
 * A NULL @module is ignored.  A warning is raised when
 * atomic_dec_if_positive() returns a negative value.
 */
void module_put(struct module *module)
{
        int ret;

        if (!module)
                return;

        ret = atomic_dec_if_positive(&module->refcnt);
        WARN_ON(ret < 0);        /* Failed to put refcount */
        trace_module_put(module, _RET_IP_);
}
EXPORT_SYMBOL(module_put);

#else /* !CONFIG_MODULE_UNLOAD */
/* Without CONFIG_MODULE_UNLOAD there is no usage bookkeeping to tear down. */
static inline void module_unload_free(struct module *mod)
{
}

/*
 * Without CONFIG_MODULE_UNLOAD no usage link is recorded: @a is unused and
 * the result is simply that of strong_try_module_get(@b).
 */
static int ref_module(struct module *a, struct module *b)
{
        return strong_try_module_get(b);
}

/* Without CONFIG_MODULE_UNLOAD there is nothing to initialize; always 0. */
static inline int module_unload_init(struct module *mod)
{
        return 0;
}
#endif /* CONFIG_MODULE_UNLOAD */

/*
 * Write one character into @buf for every bit set in @taints whose
 * taint_flags[] entry is marked as module-relevant; the character is that
 * entry's c_true.  Returns the number of characters written.  No NUL
 * terminator is appended here.
 */
size_t module_flags_taint(unsigned long taints, char *buf)
{
        size_t len = 0;
        int bit;

        for (bit = 0; bit < TAINT_FLAGS_COUNT; bit++) {
                if (!taint_flags[bit].module)
                        continue;
                if (!test_bit(bit, &taints))
                        continue;

                buf[len++] = taint_flags[bit].c_true;
        }

        return len;
}

/*
 * Show handler for the "initstate" attribute: prints "live", "coming" or
 * "going" followed by a newline. Any other module state hits BUG().
 */
static ssize_t show_initstate(struct module_attribute *mattr,
                              struct module_kobject *mk, char *buffer)
{
        const char *label;

        switch (mk->mod->state) {
        case MODULE_STATE_LIVE:
                label = "live";
                break;
        case MODULE_STATE_COMING:
                label = "coming";
                break;
        case MODULE_STATE_GOING:
                label = "going";
                break;
        default:
                label = "unknown";
                BUG();
        }

        return sprintf(buffer, "%s\n", label);
}

/* "initstate" attribute: mode 0444, read via show_initstate(), no store handler. */
static struct module_attribute modinfo_initstate =
        __ATTR(initstate, 0444, show_initstate, NULL);

/*
 * Store handler for the "uevent" attribute: forwards the written data to
 * kobject_synth_uevent() for the module's kobject.
 *
 * Returns @count on success, otherwise the non-zero result of
 * kobject_synth_uevent().
 */
static ssize_t store_uevent(struct module_attribute *mattr,
                            struct module_kobject *mk,
                            const char *buffer, size_t count)
{
        int err = kobject_synth_uevent(&mk->kobj, buffer, count);

        if (err)
                return err;

        return count;
}

/* "uevent" attribute: mode 0200, written via store_uevent(), no show handler. */
struct module_attribute module_uevent =
        __ATTR(uevent, 0200, NULL, store_uevent);

static ssize_t show_coresize(struct module_attribute *mattr,
                             struct module_kobject *mk, char *buffer)
{
        unsigned int size = mk->mod->mem[MOD_TEXT].size;

        if (!IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC)) {
                for_class_mod_mem_type(type, core_data)
                        size += mk->mod->mem[type].size;
        }
        return sprintf(buffer, "%u\n", size);
}

/* "coresize" attribute: mode 0444, read via show_coresize(), no store handler. */
static struct module_attribute modinfo_coresize =
        __ATTR(coresize, 0444, show_coresize, NULL);

#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
static ssize_t show_datasize(struct module_attribute *mattr,
                             struct module_kobject *mk, char *buffer)
{
        unsigned int size = 0;

        for_class_mod_mem_type(type, core_data)
                size += mk->mod->mem[type].size;
        return sprintf(buffer, "%u\n", size);
}

/* "datasize" attribute: mode 0444, read via show_datasize(), no store handler. */
static struct module_attribute modinfo_datasize =
        __ATTR(datasize, 0444, show_datasize, NULL);
#endif

static ssize_t show_initsize(struct module_attribute *mattr,
                             struct module_kobject *mk, char *buffer)
{
        unsigned int size = 0;

        for_class_mod_mem_type(type, init)
                size += mk->mod->mem[type].size;
        return sprintf(buffer, "%u\n", size);
}

/* "initsize" attribute: mode 0444, read via show_initsize(), no store handler. */
static struct module_attribute modinfo_initsize =
        __ATTR(initsize, 0444, show_initsize, NULL);

/*
 * Show handler for the "taint" attribute: emits the module's taint flag
 * characters (see module_flags_taint()) followed by a newline and returns
 * the total number of bytes written.
 */
static ssize_t show_taint(struct module_attribute *mattr,
                          struct module_kobject *mk, char *buffer)
{
        size_t len = module_flags_taint(mk->mod->taints, buffer);

        buffer[len] = '\n';
        return len + 1;
}

/* "taint" attribute: mode 0444, read via show_taint(), no store handler. */
static struct module_attribute modinfo_taint =
        __ATTR(taint, 0444, show_taint, NULL);

/*
 * NULL-terminated table of the per-module attributes. setup_modinfo() and
 * free_modinfo() walk it until they hit the NULL sentinel. The datasize
 * and refcnt entries are only present under their respective config
 * options.
 */
struct module_attribute *modinfo_attrs[] = {
        &module_uevent,
        &modinfo_version,
        &modinfo_srcversion,
        &modinfo_initstate,
        &modinfo_coresize,
#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
        &modinfo_datasize,
#endif
        &modinfo_initsize,
        &modinfo_taint,
#ifdef CONFIG_MODULE_UNLOAD
        &modinfo_refcnt,
#endif
        NULL,
};

/* Number of slots in modinfo_attrs[], including the terminating NULL entry. */
size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs);

/*
 * Copy of VERMAGIC_STRING for this build.
 * NOTE(review): presumably compared against a module's own vermagic at load
 * time; the user of this array is outside this part of the file.
 */
static const char vermagic[] = VERMAGIC_STRING;

/*
 * Decide whether a module that failed a check may be loaded anyway.
 *
 * With CONFIG_MODULE_FORCE_LOAD: taints the module with
 * TAINT_FORCED_MODULE (warning first if that taint was not yet set) and
 * returns 0. Without it: returns -ENOEXEC.
 */
int try_to_force_load(struct module *mod, const char *reason)
{
#ifndef CONFIG_MODULE_FORCE_LOAD
        return -ENOEXEC;
#else
        if (!test_taint(TAINT_FORCED_MODULE))
                pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);

        add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);

        return 0;
#endif
}

/*
 * Parse tag=value strings from .modinfo section.
 *
 * @string:  start of the current entry
 * @secsize: number of bytes left counted from @string; decremented by one
 *           for every byte stepped over
 *
 * Returns the start of the next non-empty entry, or NULL once the
 * remaining size is used up.
 */
char *module_next_tag_pair(char *string, unsigned long *secsize)
{
        char *cursor = string;

        /* Step over the characters of the current entry. */
        while (*cursor != '\0') {
                unsigned long left = *secsize;

                cursor++;
                *secsize = left - 1;
                if (left <= 1)
                        return NULL;
        }

        /* Step over the terminator and any zero padding that follows. */
        while (*cursor == '\0') {
                unsigned long left = *secsize;

                cursor++;
                *secsize = left - 1;
                if (left <= 1)
                        return NULL;
        }

        return cursor;
}

/*
 * Find the next "tag=value" entry for @tag in the .modinfo section.
 *
 * @prev is either NULL (search from the start of the section) or a pointer
 * into the section from which the search resumes with the following entry.
 *
 * Returns a pointer to the value (just past "tag="), or NULL if no further
 * matching entry exists.
 */
static char *get_next_modinfo(const struct load_info *info, const char *tag,
                              char *prev)
{
        Elf_Shdr *infosec = &info->sechdrs[info->index.info];
        unsigned long remaining = infosec->sh_size;
        unsigned int taglen = strlen(tag);
        /*
         * get_modinfo() calls made before rewrite_section_headers()
         * must use sh_offset, as sh_addr isn't set!
         */
        char *entry = (char *)info->hdr + infosec->sh_offset;

        if (prev) {
                /* Account for the bytes already consumed up to @prev. */
                remaining -= prev - entry;
                entry = module_next_tag_pair(prev, &remaining);
        }

        while (entry) {
                if (strncmp(entry, tag, taglen) == 0 && entry[taglen] == '=')
                        return entry + taglen + 1;
                entry = module_next_tag_pair(entry, &remaining);
        }

        return NULL;
}

/*
 * Return the value of the first .modinfo entry matching @tag, or NULL if
 * there is none (get_next_modinfo() with no previous entry).
 */
static char *get_modinfo(const struct load_info *info, const char *tag)
{
        return get_next_modinfo(info, tag, NULL);
}

/*
 * Check that @mod imports the namespace @sym is exported in.
 *
 * Symbols without a (non-empty) namespace always pass. Otherwise the
 * module's "import_ns" modinfo entries are searched for a match. A missing
 * import is an error (-EINVAL) unless
 * CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is set, in which case it is
 * only warned about and 0 is returned.
 */
static int verify_namespace_is_imported(const struct load_info *info,
                                        const struct kernel_symbol *sym,
                                        struct module *mod)
{
        const char *ns = kernel_symbol_namespace(sym);
        char *imported;

        if (!ns || !ns[0])
                return 0;

        for_each_modinfo_entry(imported, info, "import_ns") {
                if (!strcmp(ns, imported))
                        return 0;
        }

#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
        pr_warn("%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
                mod->name, kernel_symbol_name(sym), ns);
        return 0;
#else
        pr_err("%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
               mod->name, kernel_symbol_name(sym), ns);
        return -EINVAL;
#endif
}

/*
 * Propagate the proprietary taint from @owner to @mod when @mod uses one
 * of @owner's symbols (@name).
 *
 * Returns false only when @owner is proprietary-tainted and @mod already
 * uses GPL-only symbols; in every other case returns true, tainting @mod
 * first if needed.
 */
static bool inherit_taint(struct module *mod, struct module *owner, const char *name)
{
        bool owner_proprietary = owner &&
                test_bit(TAINT_PROPRIETARY_MODULE, &owner->taints);

        if (!owner_proprietary)
                return true;

        if (mod->using_gplonly_symbols) {
                pr_err("%s: module using GPL-only symbols uses symbols %s from proprietary module %s.\n",
                        mod->name, name, owner->name);
                return false;
        }

        if (test_bit(TAINT_PROPRIETARY_MODULE, &mod->taints))
                return true;

        pr_warn("%s: module uses symbols %s from proprietary module %s, inheriting taint.\n",
                mod->name, name, owner->name);
        set_bit(TAINT_PROPRIETARY_MODULE, &mod->taints);
        return true;
}

/*
 * Resolve a symbol for this module.  I.e. if we find one, record usage.
 *
 * @mod:       module being loaded
 * @info:      load information handed to the version and namespace checks
 * @name:      name of the symbol to look up
 * @ownername: caller buffer of at least MODULE_NAME_LEN bytes. Receives the
 *             NUL-terminated name of the symbol's owner whenever the lookup
 *             itself succeeded; left untouched if find_symbol() fails.
 *
 * Returns the symbol on success, NULL if it was not found or taint
 * inheritance was refused, or an ERR_PTR() when the version check, the
 * namespace check or ref_module() fails.
 */
static const struct kernel_symbol *resolve_symbol(struct module *mod,
                                                  const struct load_info *info,
                                                  const char *name,
                                                  char ownername[])
{
        struct find_symbol_arg fsa = {
                .name        = name,
                .gplok        = !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)),
                .warn        = true,
        };
        int err;

        /*
         * The module_mutex should not be a heavily contended lock;
         * if we get the occasional sleep here, we'll go an extra iteration
         * in the wait_event_interruptible(), which is harmless.
         */
        sched_annotate_sleep();
        mutex_lock(&module_mutex);
        if (!find_symbol(&fsa))
                goto unlock;

        if (fsa.license == GPL_ONLY)
                mod->using_gplonly_symbols = true;

        if (!inherit_taint(mod, fsa.owner, name)) {
                fsa.sym = NULL;
                goto getname;
        }

        if (!check_version(info, name, mod, fsa.crc)) {
                fsa.sym = ERR_PTR(-EINVAL);
                goto getname;
        }

        err = verify_namespace_is_imported(info, fsa.sym, mod);
        if (err) {
                fsa.sym = ERR_PTR(err);
                goto getname;
        }

        err = ref_module(mod, fsa.owner);
        if (err) {
                fsa.sym = ERR_PTR(err);
                goto getname;
        }

getname:
        /*
         * We must make copy under the lock if we failed to get ref.
         * strscpy() always NUL-terminates the destination, which strncpy()
         * does not guarantee when the source fills the whole buffer.
         */
        strscpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN);
unlock:
        mutex_unlock(&module_mutex);
        return fsa.sym;
}

/*
 * Resolve @name for @mod, retrying on module_wq while resolve_symbol()
 * keeps returning ERR_PTR(-EBUSY).
 *
 * The wait condition re-runs resolve_symbol() on every evaluation and ends
 * as soon as the result is a non-error pointer or an error other than
 * -EBUSY. The wait is bounded by 30 * HZ; if it returns <= 0 a warning
 * naming the owning module is printed.
 *
 * Returns whatever the last resolve_symbol() call produced (a symbol, NULL
 * or an ERR_PTR()).
 */
static const struct kernel_symbol *
resolve_symbol_wait(struct module *mod,
                    const struct load_info *info,
                    const char *name)
{
        const struct kernel_symbol *ksym;
        /* Filled in by resolve_symbol() whenever the symbol lookup succeeds. */
        char owner[MODULE_NAME_LEN];

        if (wait_event_interruptible_timeout(module_wq,
                        !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
                        || PTR_ERR(ksym) != -EBUSY,
                                             30 * HZ) <= 0) {
                pr_warn("%s: gave up waiting for init of module %s.\n",
                        mod->name, owner);
        }
        return ksym;
}

/*
 * Weak default for the arch-specific cleanup step invoked from
 * free_module(); does nothing.
 */
void __weak module_arch_cleanup(struct module *mod)
{
}

/*
 * Weak default for the arch hook invoked from free_module() before the
 * module's memory is released; does nothing.
 */
void __weak module_arch_freeing_init(struct module *mod)
{
}

static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
{
        unsigned int size = PAGE_ALIGN(mod->mem[type].size);
        enum execmem_type execmem_type;
        void *ptr;

        mod->mem[type].size = size;

        if (mod_mem_type_is_data(type))
                execmem_type = EXECMEM_MODULE_DATA;
        else
                execmem_type = EXECMEM_MODULE_TEXT;

        ptr = execmem_alloc(execmem_type, size);
        if (!ptr)
                return -ENOMEM;

        /*
         * The pointer to these blocks of memory are stored on the module
         * structure and we keep that around so long as the module is
         * around. We only free that memory when we unload the module.
         * Just mark them as not being a leak then. The .init* ELF
         * sections *do* get freed after boot so we *could* treat them
         * slightly differently with kmemleak_ignore() and only grey
         * them out as they work as typical memory allocations which
         * *do* eventually get freed, but let's just keep things simple
         * and avoid *any* false positives.
         */
        kmemleak_not_leak(ptr);

        memset(ptr, 0, size);
        mod->mem[type].base = ptr;

        return 0;
}

/*
 * Release the memory region of @mod for memory type @type with
 * execmem_free(). Core data regions are kept when @unload_codetags is
 * false.
 */
static void module_memory_free(struct module *mod, enum mod_mem_type type,
                               bool unload_codetags)
{
        if (mod_mem_type_is_core_data(type) && !unload_codetags)
                return;

        execmem_free(mod->mem[type].base);
}

static void free_mod_mem(struct module *mod, bool unload_codetags)
{
        for_each_mod_mem_type(type) {
                struct module_memory *mod_mem = &mod->mem[type];

                if (type == MOD_DATA)
                        continue;

                /* Free lock-classes; relies on the preceding sync_rcu(). */
                lockdep_free_key_range(mod_mem->base, mod_mem->size);
                if (mod_mem->size)
                        module_memory_free(mod, type, unload_codetags);
        }

        /* MOD_DATA hosts mod, so free it at last */
        lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
        module_memory_free(mod, MOD_DATA, unload_codetags);
}

/*
 * Free a module, remove from lists, etc.
 *
 * The teardown order matters: the module is first marked
 * MODULE_STATE_UNFORMED while still on the list, then its resources are
 * released, then it is unlinked under module_mutex followed by
 * synchronize_rcu(), and only at the very end is its memory freed.
 */
static void free_module(struct module *mod)
{
        bool unload_codetags;

        trace_module_free(mod);

        /*
         * The result is handed to free_mod_mem() at the end, where a false
         * value keeps the core data regions allocated.
         */
        unload_codetags = codetag_unload_module(mod);
        if (!unload_codetags)
                pr_warn("%s: memory allocation(s) from the module still alive, cannot unload cleanly\n",
                        mod->name);

        mod_sysfs_teardown(mod);

        /*
         * We leave it in list to prevent duplicate loads, but make sure
         * that no one uses it while it's being deconstructed.
         */
        mutex_lock(&module_mutex);
        mod->state = MODULE_STATE_UNFORMED;
        mutex_unlock(&module_mutex);

        /* Arch-specific cleanup. */
        module_arch_cleanup(mod);

        /* Module unload stuff */
        module_unload_free(mod);

        /* Free any allocated parameters. */
        destroy_params(mod->kp, mod->num_kp);

        if (is_livepatch_module(mod))
                free_module_elf(mod);

        /* Now we can delete it from the lists */
        mutex_lock(&module_mutex);
        /* Unlink carefully: kallsyms could be walking list. */
        list_del_rcu(&mod->list);
        mod_tree_remove(mod);
        /* Remove this module from bug list, this uses list_del_rcu */
        module_bug_cleanup(mod);
        /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
        synchronize_rcu();
        /* A failure here is only reported; teardown continues regardless. */
        if (try_add_tainted_module(mod))
                pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n",
                       mod->name);
        mutex_unlock(&module_mutex);

        /* This may be empty, but that's OK */
        module_arch_freeing_init(mod);
        kfree(mod->args);
        percpu_modfree(mod);

        /* Must come last: free_mod_mem() notes that MOD_DATA hosts @mod itself. */
        free_mod_mem(mod, unload_codetags);
}

/*
 * Look up the exported symbol @symbol and pin its owning module.
 *
 * Returns the symbol's address, or NULL when the symbol is not found, is
 * not a GPL_ONLY export (a warning is printed), or a reference on its
 * owner could not be taken. The lookup and the reference grab run with
 * preemption disabled.
 */
void *__symbol_get(const char *symbol)
{
        struct find_symbol_arg fsa = {
                .name        = symbol,
                .gplok        = true,
                .warn        = true,
        };
        bool pinned = false;

        preempt_disable();
        if (find_symbol(&fsa)) {
                if (fsa.license != GPL_ONLY)
                        pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n",
                                symbol);
                else if (!strong_try_module_get(fsa.owner))
                        pinned = true;        /* zero means the reference was taken */
        }
        preempt_enable();

        if (!pinned)
                return NULL;

        return (void *)kernel_symbol_value(fsa.sym);
}
EXPORT_SYMBOL_GPL(__symbol_get);

/*
 * Make sure none of the symbols exported by @mod (plain and GPL tables,
 * global namespace) is already exported by the kernel or another module.
 *
 * You must hold the module_mutex.
 *
 * Returns 0 if there is no clash, -ENOEXEC on the first duplicate found.
 */
static int verify_exported_symbols(struct module *mod)
{
        struct {
                const struct kernel_symbol *sym;
                unsigned int num;
        } tables[] = {
                { mod->syms, mod->num_syms },
                { mod->gpl_syms, mod->num_gpl_syms },
        };
        unsigned int t, n;

        for (t = 0; t < ARRAY_SIZE(tables); t++) {
                for (n = 0; n < tables[t].num; n++) {
                        const struct kernel_symbol *exported = &tables[t].sym[n];
                        struct find_symbol_arg fsa = {
                                .name        = kernel_symbol_name(exported),
                                .gplok        = true,
                        };

                        if (!find_symbol(&fsa))
                                continue;

                        pr_err("%s: exports duplicate symbol %s"
                               " (owned by %s)\n",
                               mod->name, kernel_symbol_name(exported),
                               module_name(fsa.owner));
                        return -ENOEXEC;
                }
        }

        return 0;
}

/*
 * Tell whether an unresolved symbol @name may be silently ignored for ELF
 * machine @emachine. Only "_GLOBAL_OFFSET_TABLE_" on EM_386 / EM_X86_64
 * qualifies.
 */
static bool ignore_undef_symbol(Elf_Half emachine, const char *name)
{
        /*
         * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as
         * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64.
         * i386 has a similar problem but may not deserve a fix.
         *
         * If we ever have to ignore many symbols, consider refactoring the code to
         * only warn if referenced by a relocation.
         */
        switch (emachine) {
        case EM_386:
        case EM_X86_64:
                return strcmp(name, "_GLOBAL_OFFSET_TABLE_") == 0;
        default:
                return false;
        }
}

/*
 * Change all symbols so that st_value encodes the pointer directly.
 *
 * Walks the module's symbol table (entry 0 is skipped) and handles each
 * symbol according to its section index:
 *   - SHN_COMMON:    rejected with -ENOEXEC, except "__gnu_lto*" names
 *   - SHN_ABS:       left untouched
 *   - SHN_LIVEPATCH: left untouched here
 *   - SHN_UNDEF:     resolved through resolve_symbol_wait()
 *   - anything else: st_value is offset by the section's base address
 *
 * An error does not stop the walk, so every problem gets reported; the
 * value returned is 0 or the errno of the last failure seen.
 */
static int simplify_symbols(struct module *mod, const struct load_info *info)
{
        Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
        Elf_Sym *sym = (void *)symsec->sh_addr;
        unsigned long secbase;
        unsigned int i;
        int ret = 0;
        const struct kernel_symbol *ksym;

        for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
                const char *name = info->strtab + sym[i].st_name;

                switch (sym[i].st_shndx) {
                case SHN_COMMON:
                        /* Ignore common symbols whose name starts with "__gnu_lto" */
                        if (!strncmp(name, "__gnu_lto", 9))
                                break;

                        /*
                         * We compiled with -fno-common.  These are not
                         * supposed to happen.
                         */
                        pr_debug("Common symbol: %s\n", name);
                        pr_warn("%s: please compile with -fno-common\n",
                               mod->name);
                        ret = -ENOEXEC;
                        break;

                case SHN_ABS:
                        /* Don't need to do anything */
                        pr_debug("Absolute symbol: 0x%08lx %s\n",
                                 (long)sym[i].st_value, name);
                        break;

                case SHN_LIVEPATCH:
                        /* Livepatch symbols are resolved by livepatch */
                        break;

                case SHN_UNDEF:
                        ksym = resolve_symbol_wait(mod, info, name);
                        /* Ok if resolved.  */
                        if (ksym && !IS_ERR(ksym)) {
                                sym[i].st_value = kernel_symbol_value(ksym);
                                break;
                        }

                        /* Ok if weak or ignored.  */
                        if (!ksym &&
                            (ELF_ST_BIND(sym[i].st_info) == STB_WEAK ||
                             ignore_undef_symbol(info->hdr->e_machine, name)))
                                break;

                        /* NULL (not found) maps to -ENOENT, an ERR_PTR to its errno. */
                        ret = PTR_ERR(ksym) ?: -ENOENT;
                        pr_warn("%s: Unknown symbol %s (err %d)\n",
                                mod->name, name, ret);
                        break;

                default:
                        /* Divert to percpu allocation if a percpu var. */
                        if (sym[i].st_shndx == info->index.pcpu)
                                secbase = (unsigned long)mod_percpu(mod);
                        else
                                secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
                        sym[i].st_value += secbase;
                        break;
                }
        }

        return ret;
}

static int apply_relocations(struct module *mod, const struct load_info *info)
{
        unsigned int i;
        int err = 0;

        /* Now do relocations. */
        for (i = 1; i < info->hdr->e_shnum; i++) {
                unsigned int infosec = info->sechdrs[i].sh_info;

                /* Not a valid relocation section? */
                if (infosec >= info->hdr->e_shnum)
                        continue;

                /* Don't bother with non-allocated sections */
                if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
                        continue;

                if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
                        err = klp_apply_section_relocs(mod, info->sechdrs,
                                                       info->secstrings,
                                                       info->strtab,
                                                       info->index.sym, i,
                                                       NULL);
                else if (info->sechdrs[i].sh_type == SHT_REL)
                        err = apply_relocate(info->sechdrs, info->strtab,
                                             info->index.sym, i, mod);
                else if (info->sechdrs[i].sh_type == SHT_RELA)
                        err = apply_relocate_add(info->sechdrs, info->strtab,
                                                 info->index.sym, i, mod);
                if (err < 0)
                        break;
        }
        return err;
}

/*
 * Additional bytes needed by arch in front of individual sections.
 *
 * Weak default; module_get_offset_and_type() adds the returned value to
 * the running size of the memory type before placing @section.
 */
unsigned int __weak arch_mod_section_prepend(struct module *mod,
                                             unsigned int section)
{
        /* default implementation just returns zero */
        return 0;
}

/*
 * Reserve room for section @section (header @sechdr) in the @type memory
 * region of @mod.
 *
 * The region's running size is first grown by the arch prepend amount and
 * aligned to sh_addralign (1 if that is 0); the section is placed at that
 * offset and the running size advanced past it.
 *
 * Returns the offset OR-ed with the memory type encoded via
 * SH_ENTSIZE_TYPE_MASK / SH_ENTSIZE_TYPE_SHIFT. Warns once if the offset
 * overlaps the type bits.
 */
long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
                                Elf_Shdr *sechdr, unsigned int section)
{
        struct module_memory *region = &mod->mem[type];
        long type_bits = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK) << SH_ENTSIZE_TYPE_SHIFT;
        long start;

        region->size += arch_mod_section_prepend(mod, section);
        start = ALIGN(region->size, sechdr->sh_addralign ?: 1);
        region->size = start + sechdr->sh_size;

        WARN_ON_ONCE(start & type_bits);
        return start | type_bits;
}

/*
 * Tell whether section @sname is laid out with the init sections. Without
 * CONFIG_MODULE_UNLOAD, exit sections count as init sections too.
 */
bool module_init_layout_section(const char *sname)
{
        bool exit_as_init = false;

#ifndef CONFIG_MODULE_UNLOAD
        exit_as_init = module_exit_section(sname);
#endif
        return exit_as_init || module_init_section(sname);
}

/*
 * Assign offsets to either the core (@is_init == false) or the init
 * (@is_init == true) sections of the module.
 *
 * For each flag class in masks[] (in order), every section is visited and
 * placed via module_get_offset_and_type() when:
 *   - all flags in masks[m][0] are set,
 *   - none of the flags in masks[m][1] are set,
 *   - sh_entsize is still ~0UL, i.e. the section has not been placed yet
 *     (layout_sections() initialises every section to ~0UL), and
 *   - the section's init-ness matches @is_init.
 * The result is stored in the section's sh_entsize.
 */
static void __layout_sections(struct module *mod, struct load_info *info, bool is_init)
{
        unsigned int m, i;

        /* { flags that must all be set, flags that must all be clear } */
        static const unsigned long masks[][2] = {
                /*
                 * NOTE: all executable code must be the first section
                 * in this array; otherwise modify the text_size
                 * finder in the two loops below
                 *
                 * NOTE(review): no text_size finder is visible in this
                 * function; the remark above may be stale - confirm.
                 */
                { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
                { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
                { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
                { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
                { ARCH_SHF_SMALL | SHF_ALLOC, 0 }
        };
        /* Memory type used for masks[m] when laying out core sections. */
        static const int core_m_to_mem_type[] = {
                MOD_TEXT,
                MOD_RODATA,
                MOD_RO_AFTER_INIT,
                MOD_DATA,
                MOD_DATA,
        };
        /* Memory type used for masks[m] when laying out init sections. */
        static const int init_m_to_mem_type[] = {
                MOD_INIT_TEXT,
                MOD_INIT_RODATA,
                MOD_INVALID,
                MOD_INIT_DATA,
                MOD_INIT_DATA,
        };

        for (m = 0; m < ARRAY_SIZE(masks); ++m) {
                enum mod_mem_type type = is_init ? init_m_to_mem_type[m] : core_m_to_mem_type[m];

                for (i = 0; i < info->hdr->e_shnum; ++i) {
                        Elf_Shdr *s = &info->sechdrs[i];
                        const char *sname = info->secstrings + s->sh_name;

                        if ((s->sh_flags & masks[m][0]) != masks[m][0]
                            || (s->sh_flags & masks[m][1])
                            || s->sh_entsize != ~0UL
                            || is_init != module_init_layout_section(sname))
                                continue;

                        /* A matching section in a class with no valid memory type is skipped with a one-time warning. */
                        if (WARN_ON_ONCE(type == MOD_INVALID))
                                continue;

                        s->sh_entsize = module_get_offset_and_type(mod, type, s, i);
                        pr_debug("\t%s\n", sname);
                }
        }
}

/*
 * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
 * might -- code, read-only data, read-write data, small data.  Tally
 * sizes, and place the offsets into sh_entsize fields, combined with the
 * memory type as returned by module_get_offset_and_type().
 *
 * Every sh_entsize is first reset to ~0UL, which marks a section as "not
 * placed yet"; core sections are then laid out before init sections.
 */
static void layout_sections(struct module *mod, struct load_info *info)
{
        unsigned int idx = 0;

        while (idx < info->hdr->e_shnum)
                info->sechdrs[idx++].sh_entsize = ~0UL;

        pr_debug("Core section allocation order for %s:\n", mod->name);
        __layout_sections(mod, info, false);

        pr_debug("Init section allocation order for %s:\n", mod->name);
        __layout_sections(mod, info, true);
}

/*
 * Taint @mod (and the kernel) with TAINT_PROPRIETARY_MODULE when @license
 * is not GPL compatible. A NULL @license is treated as "unspecified". The
 * warning is printed only if that taint was not already set.
 */
static void module_license_taint_check(struct module *mod, const char *license)
{
        const char *effective = license ? license : "unspecified";

        if (license_is_gpl_compatible(effective))
                return;

        if (!test_taint(TAINT_PROPRIETARY_MODULE))
                pr_warn("%s: module license '%s' taints kernel.\n",
                        mod->name, effective);

        add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
                         LOCKDEP_NOW_UNRELIABLE);
}

/*
 * Run the setup() hook of every attribute in modinfo_attrs[] that has one,
 * passing it the .modinfo value found under the attribute's name (NULL if
 * the module carries no such entry).
 */
static void setup_modinfo(struct module *mod, struct load_info *info)
{
        struct module_attribute **slot;

        for (slot = modinfo_attrs; *slot; slot++) {
                struct module_attribute *attr = *slot;

                if (!attr->setup)
                        continue;

                attr->setup(mod, get_modinfo(info, attr->attr.name));
        }
}

/* Run the free() hook of every attribute in modinfo_attrs[] that has one. */
static void free_modinfo(struct module *mod)
{
        struct module_attribute **slot;

        for (slot = modinfo_attrs; *slot; slot++) {
                struct module_attribute *attr = *slot;

                if (!attr->free)
                        continue;

                attr->free(mod);
        }
}

/* Weak default: a section is an init section if its name starts with ".init". */
bool __weak module_init_section(const char *name)
{
        return strstarts(name, ".init");
}

/* Weak default: a section is an exit section if its name starts with ".exit". */
bool __weak module_exit_section(const char *name)
{
        return strstarts(name, ".exit");
}

/*
 * Verify that the file range [sh_offset, sh_offset + sh_size) of @shdr
 * neither wraps around nor extends past the end of the module image.
 *
 * Returns 0 if the range is acceptable, -ENOEXEC otherwise.
 */
static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
{
#if defined(CONFIG_64BIT)
        unsigned long long end;
#else
        unsigned long end;
#endif

        end = shdr->sh_offset + shdr->sh_size;

        /* A sum below sh_offset means the addition overflowed. */
        if (end >= shdr->sh_offset && end <= info->len)
                return 0;

        return -ENOEXEC;
}

/*
 * Check userspace passed ELF module against our expectations, and cache
 * useful variables for further processing as we go.
 *
 * This does basic validity checks against section offsets and sizes, the
 * section name string table, and the indices used for it (sh_name).
 *
 * As a last step, since we're already checking the ELF sections we cache
 * useful variables which will be used later for our convenience:
 *
 *         o pointers to section headers
 *         o cache the modinfo symbol section
 *         o cache the string symbol section
 *         o cache the module section
 *
 * As a last step we set info->mod to the temporary copy of the module in
 * info->hdr. The final one will be allocated in move_module(). Any
 * modifications we make to our copy of the module will be carried over
 * to the final minted module.
 */
static int elf_validity_cache_copy(struct load_info *info, int flags)
{
        unsigned int i;
        Elf_Shdr *shdr, *strhdr;
        int err;
        unsigned int num_mod_secs = 0, mod_idx;
        unsigned int num_info_secs = 0, info_idx;
        unsigned int num_sym_secs = 0, sym_idx;

        if (info->len < sizeof(*(info->hdr))) {
                pr_err("Invalid ELF header len %lu\n", info->len);
                goto no_exec;
        }

        if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) {
                pr_err("Invalid ELF header magic: != %s\n", ELFMAG);
                goto no_exec;
        }
        if (info->hdr->e_type != ET_REL) {
                pr_err("Invalid ELF header type: %u != %u\n",
                       info->hdr->e_type, ET_REL);
                goto no_exec;
        }
        if (!elf_check_arch(info->hdr)) {
                pr_err("Invalid architecture in ELF header: %u\n",
                       info->hdr->e_machine);
                goto no_exec;
        }
        if (!module_elf_check_arch(info->hdr)) {
                pr_err("Invalid module architecture in ELF header: %u\n",
                       info->hdr->e_machine);
                goto no_exec;
        }
        if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
                pr_err("Invalid ELF section header size\n");
                goto no_exec;
        }

        /*
         * e_shnum is 16 bits, and sizeof(Elf_Shdr) is
         * known and small. So e_shnum * sizeof(Elf_Shdr)
         * will not overflow unsigned long on any platform.
         */
        if (info->hdr->e_shoff >= info->len
            || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
                info->len - info->hdr->e_shoff)) {
                pr_err("Invalid ELF section header overflow\n");
                goto no_exec;
        }

        info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;

        /*
         * Verify if the section name table index is valid.
         */
        if (info->hdr->e_shstrndx == SHN_UNDEF
            || info->hdr->e_shstrndx >= info->hdr->e_shnum) {
                pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n",
                       info->hdr->e_shstrndx, info->hdr->e_shstrndx,
                       info->hdr->e_shnum);
                goto no_exec;
        }

        strhdr = &info->sechdrs[info->hdr->e_shstrndx];
        err = validate_section_offset(info, strhdr);
        if (err < 0) {
                pr_err("Invalid ELF section hdr(type %u)\n", strhdr->sh_type);
                return err;
        }

        /*
         * The section name table must be NUL-terminated, as required
         * by the spec. This makes strcmp and pr_* calls that access
         * strings in the section safe.
         */
        info->secstrings = (void *)info->hdr + strhdr->sh_offset;
        if (strhdr->sh_size == 0) {
                pr_err("empty section name table\n");
                goto no_exec;
        }
        if (info->secstrings[strhdr->sh_size - 1] != '\0') {
                pr_err("ELF Spec violation: section name table isn't null terminated\n");
                goto no_exec;
        }

        /*
         * The code assumes that section 0 has a length of zero and
         * an addr of zero, so check for it.
         */
        if (info->sechdrs[0].sh_type != SHT_NULL
            || info->sechdrs[0].sh_size != 0
            || info->sechdrs[0].sh_addr != 0) {
                pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n",
                       info->sechdrs[0].sh_type);
                goto no_exec;
        }

        for (i = 1; i < info->hdr->e_shnum; i++) {
                shdr = &info->sechdrs[i];
                switch (shdr->sh_type) {
                case SHT_NULL:
                case SHT_NOBITS:
                        continue;
                case SHT_SYMTAB:
                        if (shdr->sh_link == SHN_UNDEF
                            || shdr->sh_link >= info->hdr->e_shnum) {
                                pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n",
                                       shdr->sh_link, shdr->sh_link,
                                       info->hdr->e_shnum);
                                goto no_exec;
                        }
                        num_sym_secs++;
                        sym_idx = i;
                        fallthrough;
                default:
                        err = validate_section_offset(info, shdr);
                        if (err < 0) {
                                pr_err("Invalid ELF section in module (section %u type %u)\n",
                                        i, shdr->sh_type);
                                return err;
                        }
                        if (strcmp(info->secstrings + shdr->sh_name,
                                   ".gnu.linkonce.this_module") == 0) {
                                num_mod_secs++;
                                mod_idx = i;
                        } else if (strcmp(info->secstrings + shdr->sh_name,
                                   ".modinfo") == 0) {
                                num_info_secs++;
                                info_idx = i;
                        }

                        if (shdr->sh_flags & SHF_ALLOC) {
                                if (shdr->sh_name >= strhdr->sh_size) {
                                        pr_err("Invalid ELF section name in module (section %u type %u)\n",
                                               i, shdr->sh_type);
                                        return -ENOEXEC;
                                }
                        }
                        break;
                }
        }

        if (num_info_secs > 1) {
                pr_err("Only one .modinfo section must exist.\n");
                goto no_exec;
        } else if (num_info_secs == 1) {
                /* Try to find a name early so we can log errors with a module name */
                info->index.info = info_idx;
                info->name = get_modinfo(info, "name");
        }

        if (num_sym_secs != 1) {
                pr_warn("%s: module has no symbols (stripped?)\n",
                        info->name ?: "(missing .modinfo section or name field)");
                goto no_exec;
        }

        /* Sets internal symbols and strings. */
        info->index.sym = sym_idx;
        shdr = &info->sechdrs[sym_idx];
        info->index.str = shdr->sh_link;
        info->strtab = (char *)info->hdr + info->sechdrs[info->index.str].sh_offset;

        /*
         * The ".gnu.linkonce.this_module" ELF section is special. It is
         * what modpost uses to refer to __this_module, and we rely on
         * THIS_MODULE pointing to &__this_module properly. The kernel's
         * modpost declares it in each module's *.mod.c file. If the struct
         * module of the kernel changes, a full kernel rebuild is required.
         *
         * We have a few expectations for this special section, and the
         * following code validates all of them for us:
         *
         *   o Exactly one such section must exist
         *   o We expect the kernel to always have to allocate it: SHF_ALLOC
         *   o The section size must match the size of the running kernel's
         *     struct module
         */
        if (num_mod_secs != 1) {
                pr_err("module %s: Only one .gnu.linkonce.this_module section must exist.\n",
                       info->name ?: "(missing .modinfo section or name field)");
                goto no_exec;
        }

        shdr = &info->sechdrs[mod_idx];

        /*
         * This is already implied on the switch above, however let's be
         * pedantic about it.
         */
        if (shdr->sh_type == SHT_NOBITS) {
                pr_err("module %s: .gnu.linkonce.this_module section must have a size set\n",
                       info->name ?: "(missing .modinfo section or name field)");
                goto no_exec;
        }

        if (!(shdr->sh_flags & SHF_ALLOC)) {
                pr_err("module %s: .gnu.linkonce.this_module must occupy memory during process execution\n",
                       info->name ?: "(missing .modinfo section or name field)");
                goto no_exec;
        }

        if (shdr->sh_size != sizeof(struct module)) {
                pr_err("module %s: .gnu.linkonce.this_module section size must match the kernel's built struct module size at run time\n",
                       info->name ?: "(missing .modinfo section or name field)");
                goto no_exec;
        }

        info->index.mod = mod_idx;

        /* This is temporary: point mod into copy of data. */
        info->mod = (void *)info->hdr + shdr->sh_offset;

        /*
         * If we didn't load the .modinfo 'name' field earlier, fall back to
         * on-disk struct mod 'name' field.
         */
        if (!info->name)
                info->name = info->mod->name;

        if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
                info->index.vers = 0; /* Pretend no __versions section! */
        else
                info->index.vers = find_sec(info, "__versions");

        info->index.pcpu = find_pcpusec(info);

        return 0;

no_exec:
        return -ENOEXEC;
}

#define COPY_CHUNK_SIZE (16*PAGE_SIZE)

/*
 * Copy @len bytes from the user buffer @usrc into @dst in pieces of at most
 * COPY_CHUNK_SIZE bytes, calling cond_resched() after every piece.
 *
 * At least one copy_from_user() call is always made, even when @len is 0.
 * Returns 0 on success or -EFAULT as soon as one piece cannot be copied.
 */
static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len)
{
        unsigned long chunk;

        for (;;) {
                chunk = min(len, COPY_CHUNK_SIZE);

                if (copy_from_user(dst, usrc, chunk))
                        return -EFAULT;
                cond_resched();

                len -= chunk;
                if (!len)
                        return 0;

                dst += chunk;
                usrc += chunk;
        }
}

/*
 * Handle the "livepatch" modinfo tag.
 *
 * Returns 0 when the module carries no "livepatch" tag or when
 * set_livepatch_module() reports success (non-zero), and -ENOEXEC when the
 * module is tagged as a livepatch but could not be marked as one.
 */
static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
{
        if (!get_modinfo(info, "livepatch"))
                /* Nothing more to do */
                return 0;

        if (set_livepatch_module(mod))
                return 0;

        /* Terminate the message with a newline like every other log line here. */
        pr_err("%s: module is marked as livepatch module, but livepatch support is disabled\n",
               mod->name);
        return -ENOEXEC;
}

/*
 * Emit a warning when retpoline_module_ok() rejects the module's
 * "retpoline" modinfo tag (the tag may be absent).
 */
static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
{
        const char *retpoline_tag = get_modinfo(info, "retpoline");

        if (!retpoline_module_ok(retpoline_tag))
                pr_warn("%s: loading module not compiled with retpoline compiler.\n",
                        mod->name);
}

/*
 * Pull a module image of @len bytes from user memory at @umod into a freshly
 * allocated kernel buffer.
 *
 * Sets info->len and, on success, info->hdr. Returns 0 on success, -ENOEXEC
 * if the image is smaller than an ELF header, -ENOMEM if the buffer cannot be
 * allocated, -EFAULT if the user copy fails, or whatever error the security
 * hooks report. The buffer is freed again on any failure after allocation.
 */
static int copy_module_from_user(const void __user *umod, unsigned long len,
                                  struct load_info *info)
{
        int err;

        info->len = len;
        if (info->len < sizeof(*(info->hdr)))
                return -ENOEXEC;

        err = security_kernel_load_data(LOADING_MODULE, true);
        if (err)
                return err;

        /* The whole file is read in one go: most of it will be needed. */
        info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN);
        if (!info->hdr)
                return -ENOMEM;

        if (copy_chunked_from_user(info->hdr, umod, info->len))
                err = -EFAULT;
        else
                err = security_kernel_post_load_data((char *)info->hdr,
                                                     info->len,
                                                     LOADING_MODULE,
                                                     "init_module");

        if (err)
                vfree(info->hdr);

        return err;
}

/*
 * Release the temporary module image referenced by @info: a plain image is
 * vfree()d, one loaded with MODULE_INIT_COMPRESSED_FILE is handed to
 * module_decompress_cleanup() instead.
 */
static void free_copy(struct load_info *info, int flags)
{
        if (!(flags & MODULE_INIT_COMPRESSED_FILE)) {
                vfree(info->hdr);
                return;
        }

        module_decompress_cleanup(info);
}

/*
 * Point every section header's sh_addr at the section's data inside the
 * temporary image and drop SHF_ALLOC from the version and modinfo sections.
 *
 * @flags is not used. Always returns 0.
 */
static int rewrite_section_headers(struct load_info *info, int flags)
{
        const unsigned long drop_alloc = ~(unsigned long)SHF_ALLOC;
        Elf_Shdr *sec;
        unsigned int idx;

        /* Section 0 is expected to have a zero address already; enforce it. */
        info->sechdrs[0].sh_addr = 0;

        /* Every other section gets its address within the temporary image. */
        for (idx = 1; idx < info->hdr->e_shnum; idx++) {
                sec = &info->sechdrs[idx];
                sec->sh_addr = (size_t)info->hdr + sec->sh_offset;
        }

        /* The modinfo and version sections are tracked but not kept. */
        info->sechdrs[info->index.vers].sh_flags &= drop_alloc;
        info->sechdrs[info->index.info].sh_flags &= drop_alloc;

        return 0;
}

/*
 * Apply the kernel taints implied by the module being loaded, based on its
 * modinfo tags ("intree", "staging", "license", "test"), its livepatch and
 * signature state, and a few hard-coded module names.
 */
static void module_augment_kernel_taints(struct module *mod, struct load_info *info)
{
        /* Remember the proprietary taint state so the warning below fires only on a change. */
        int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);

        /* No "intree" tag: treat the module as out-of-tree. */
        if (!get_modinfo(info, "intree")) {
                if (!test_taint(TAINT_OOT_MODULE))
                        pr_warn("%s: loading out-of-tree module taints kernel.\n",
                                mod->name);
                add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
        }

        check_modinfo_retpoline(mod, info);

        if (get_modinfo(info, "staging")) {
                add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
                pr_warn("%s: module is from the staging directory, the quality "
                        "is unknown, you have been warned.\n", mod->name);
        }

        if (is_livepatch_module(mod)) {
                add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
                pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
                                mod->name);
        }

        module_license_taint_check(mod, get_modinfo(info, "license"));

        if (get_modinfo(info, "test")) {
                if (!test_taint(TAINT_TEST))
                        pr_warn("%s: loading test module taints kernel.\n",
                                mod->name);
                add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK);
        }
#ifdef CONFIG_MODULE_SIG
        /* Carry the signature verdict over from the load info to the module. */
        mod->sig_ok = info->sig_ok;
        if (!mod->sig_ok) {
                pr_notice_once("%s: module verification failed: signature "
                               "and/or required key missing - tainting "
                               "kernel\n", mod->name);
                add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
        }
#endif

        /*
         * ndiswrapper is under GPL by itself, but loads proprietary modules.
         * Don't use add_taint_module(), as it would prevent ndiswrapper from
         * using GPL-only symbols it needs.
         */
        if (strcmp(mod->name, "ndiswrapper") == 0)
                add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);

        /* driverloader was caught wrongly pretending to be under GPL */
        if (strcmp(mod->name, "driverloader") == 0)
                add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
                                 LOCKDEP_NOW_UNRELIABLE);

        /* lve claims to be GPL but upstream won't provide source */
        if (strcmp(mod->name, "lve") == 0)
                add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
                                 LOCKDEP_NOW_UNRELIABLE);

        /* Warn only when this module is the one that introduced the proprietary taint. */
        if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
                pr_warn("%s: module license taints kernel.\n", mod->name);

}

/*
 * Validate the module's "vermagic" modinfo tag and its livepatch marking.
 *
 * A missing vermagic (or one ignored via MODULE_INIT_IGNORE_VERMAGIC) is only
 * accepted if try_to_force_load() allows it; a mismatching one yields
 * -ENOEXEC. Otherwise the result of check_modinfo_livepatch() is returned.
 */
static int check_modinfo(struct module *mod, struct load_info *info, int flags)
{
        const char *modmagic = get_modinfo(info, "vermagic");
        int err;

        if (flags & MODULE_INIT_IGNORE_VERMAGIC)
                modmagic = NULL;

        if (modmagic) {
                if (!same_magic(modmagic, vermagic, info->index.vers)) {
                        pr_err("%s: version magic '%s' should be '%s'\n",
                               info->name, modmagic, vermagic);
                        return -ENOEXEC;
                }
        } else {
                /* This is allowed: modprobe --force will invalidate it. */
                err = try_to_force_load(mod, "bad vermagic");
                if (err)
                        return err;
        }

        return check_modinfo_livepatch(mod, info);
}

/*
 * Look up the well-known sections of the module image by name and record
 * their location (and, where applicable, their element count or size) in the
 * corresponding fields of @mod. Most lookups are compiled in only when the
 * matching kernel configuration option is enabled.
 *
 * Returns 0 on success, or -EINVAL when CONFIG_CONSTRUCTORS is enabled and the
 * module contains both a .ctors and an .init_array section.
 */
static int find_module_sections(struct module *mod, struct load_info *info)
{
        /* Module parameters and exported symbol tables with their CRC tables. */
        mod->kp = section_objs(info, "__param",
                               sizeof(*mod->kp), &mod->num_kp);
        mod->syms = section_objs(info, "__ksymtab",
                                 sizeof(*mod->syms), &mod->num_syms);
        mod->crcs = section_addr(info, "__kcrctab");
        mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
                                     sizeof(*mod->gpl_syms),
                                     &mod->num_gpl_syms);
        mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");

#ifdef CONFIG_CONSTRUCTORS
        /* Prefer .ctors; fall back to .init_array only when .ctors is absent. */
        mod->ctors = section_objs(info, ".ctors",
                                  sizeof(*mod->ctors), &mod->num_ctors);
        if (!mod->ctors)
                mod->ctors = section_objs(info, ".init_array",
                                sizeof(*mod->ctors), &mod->num_ctors);
        else if (find_sec(info, ".init_array")) {
                /*
                 * This shouldn't happen with same compiler and binutils
                 * building all parts of the module.
                 */
                pr_warn("%s: has both .ctors and .init_array.\n",
                       mod->name);
                return -EINVAL;
        }
#endif

        /* Element size 1: the count returned is the section size in bytes. */
        mod->noinstr_text_start = section_objs(info, ".noinstr.text", 1,
                                                &mod->noinstr_text_size);

#ifdef CONFIG_TRACEPOINTS
        mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
                                             sizeof(*mod->tracepoints_ptrs),
                                             &mod->num_tracepoints);
#endif
#ifdef CONFIG_TREE_SRCU
        mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
                                             sizeof(*mod->srcu_struct_ptrs),
                                             &mod->num_srcu_structs);
#endif
#ifdef CONFIG_BPF_EVENTS
        mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
                                           sizeof(*mod->bpf_raw_events),
                                           &mod->num_bpf_raw_events);
#endif
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
        /* Uses any_section_objs() rather than section_objs(), unlike the other lookups. */
        mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size);
#endif
#ifdef CONFIG_JUMP_LABEL
        mod->jump_entries = section_objs(info, "__jump_table",
                                        sizeof(*mod->jump_entries),
                                        &mod->num_jump_entries);
#endif
#ifdef CONFIG_EVENT_TRACING
        mod->trace_events = section_objs(info, "_ftrace_events",
                                         sizeof(*mod->trace_events),
                                         &mod->num_trace_events);
        mod->trace_evals = section_objs(info, "_ftrace_eval_map",
                                        sizeof(*mod->trace_evals),
                                        &mod->num_trace_evals);
#endif
#ifdef CONFIG_TRACING
        mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
                                         sizeof(*mod->trace_bprintk_fmt_start),
                                         &mod->num_trace_bprintk_fmt);
#endif
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
        /* sechdrs[0].sh_size is always zero */
        mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
                                             sizeof(*mod->ftrace_callsites),
                                             &mod->num_ftrace_callsites);
#endif
#ifdef CONFIG_FUNCTION_ERROR_INJECTION
        mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
                                            sizeof(*mod->ei_funcs),
                                            &mod->num_ei_funcs);
#endif
#ifdef CONFIG_KPROBES
        mod->kprobes_text_start = section_objs(info, ".kprobes.text", 1,
                                                &mod->kprobes_text_size);
        mod->kprobe_blacklist = section_objs(info, "_kprobe_blacklist",
                                                sizeof(unsigned long),
                                                &mod->num_kprobe_blacklist);
#endif
#ifdef CONFIG_PRINTK_INDEX
        mod->printk_index_start = section_objs(info, ".printk_index",
                                               sizeof(*mod->printk_index_start),
                                               &mod->printk_index_size);
#endif
#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
        mod->static_call_sites = section_objs(info, ".static_call_sites",
                                              sizeof(*mod->static_call_sites),
                                              &mod->num_static_call_sites);
#endif
#if IS_ENABLED(CONFIG_KUNIT)
        mod->kunit_suites = section_objs(info, ".kunit_test_suites",
                                              sizeof(*mod->kunit_suites),
                                              &mod->num_kunit_suites);
        mod->kunit_init_suites = section_objs(info, ".kunit_init_test_suites",
                                              sizeof(*mod->kunit_init_suites),
                                              &mod->num_kunit_init_suites);
#endif

        /* Exception table; sorted later in post_relocation(). */
        mod->extable = section_objs(info, "__ex_table",
                                    sizeof(*mod->extable), &mod->num_exentries);

        /* An __obsparm section is only reported, never processed. */
        if (section_addr(info, "__obsparm"))
                pr_warn("%s: Ignoring obsolete parameters\n", mod->name);

#ifdef CONFIG_DYNAMIC_DEBUG_CORE
        mod->dyndbg_info.descs = section_objs(info, "__dyndbg",
                                              sizeof(*mod->dyndbg_info.descs),
                                              &mod->dyndbg_info.num_descs);
        mod->dyndbg_info.classes = section_objs(info, "__dyndbg_classes",
                                                sizeof(*mod->dyndbg_info.classes),
                                                &mod->dyndbg_info.num_classes);
#endif

        return 0;
}

static int move_module(struct module *mod, struct load_info *info)
{
        int i;
        enum mod_mem_type t = 0;
        int ret = -ENOMEM;

        for_each_mod_mem_type(type) {
                if (!mod->mem[type].size) {
                        mod->mem[type].base = NULL;
                        continue;
                }

                ret = module_memory_alloc(mod, type);
                if (ret) {
                        t = type;
                        goto out_enomem;
                }
        }

        /* Transfer each section which specifies SHF_ALLOC */
        pr_debug("Final section addresses for %s:\n", mod->name);
        for (i = 0; i < info->hdr->e_shnum; i++) {
                void *dest;
                Elf_Shdr *shdr = &info->sechdrs[i];
                enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;

                if (!(shdr->sh_flags & SHF_ALLOC))
                        continue;

                dest = mod->mem[type].base + (shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK);

                if (shdr->sh_type != SHT_NOBITS) {
                        /*
                         * Our ELF checker already validated this, but let's
                         * be pedantic and make the goal clearer. We actually
                         * end up copying over all modifications made to the
                         * userspace copy of the entire struct module.
                         */
                        if (i == info->index.mod &&
                           (WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) {
                                ret = -ENOEXEC;
                                goto out_enomem;
                        }
                        memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
                }
                /*
                 * Update the userspace copy's ELF section address to point to
                 * our newly allocated memory as a pure convenience so that
                 * users of info can keep taking advantage and using the newly
                 * minted official memory area.
                 */
                shdr->sh_addr = (unsigned long)dest;
                pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
                         (long)shdr->sh_size, info->secstrings + shdr->sh_name);
        }

        return 0;
out_enomem:
        for (t--; t >= 0; t--)
                module_memory_free(mod, t, true);
        return ret;
}

/*
 * With CONFIG_MODVERSIONS, a module that exports symbols (plain or GPL-only)
 * without the matching CRC table may only be loaded if try_to_force_load()
 * permits it; its result is returned in that case. Otherwise returns 0.
 */
static int check_export_symbol_versions(struct module *mod)
{
#ifdef CONFIG_MODVERSIONS
        bool syms_lack_crcs = mod->num_syms && !mod->crcs;
        bool gpl_syms_lack_crcs = mod->num_gpl_syms && !mod->gpl_crcs;

        if (syms_lack_crcs || gpl_syms_lack_crcs)
                return try_to_force_load(mod,
                                         "no versions for exported symbols");
#endif
        return 0;
}

/*
 * Flush the instruction cache over every non-empty memory region of @mod.
 *
 * The module text has just been written, so this has to happen before module
 * parameters are processed: a module may supply its own parameter accessor
 * functions.
 */
static void flush_module_icache(const struct module *mod)
{
        for_each_mod_mem_type(type) {
                unsigned long start;

                if (!mod->mem[type].size)
                        continue;

                start = (unsigned long)mod->mem[type].base;
                flush_icache_range(start, start + mod->mem[type].size);
        }
}

/*
 * Weak default for the ELF header check: accepts every header.
 * NOTE(review): presumably meant to be overridden by architecture code
 * that needs to reject some headers - confirm against the arch trees.
 */
bool __weak module_elf_check_arch(Elf_Ehdr *hdr)
{
        return true;
}

/*
 * Weak default for the section-adjustment hook called from
 * layout_and_allocate(): changes nothing and reports success (0).
 * A negative return value from an overriding implementation aborts the load.
 */
int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
                                     Elf_Shdr *sechdrs,
                                     char *secstrings,
                                     struct module *mod)
{
        return 0;
}

/* Comma-separated list of module names; NULL when no list has been set. */
static char *module_blacklist;

/*
 * Report whether @module_name exactly matches one of the comma-separated
 * entries of module_blacklist. Empty entries are compared like any other,
 * so they match the empty name.
 */
static bool blacklisted(const char *module_name)
{
        const char *entry = module_blacklist;

        if (!entry)
                return false;

        while (*entry) {
                size_t entry_len = strcspn(entry, ",");

                if (strlen(module_name) == entry_len &&
                    memcmp(module_name, entry, entry_len) == 0)
                        return true;

                /* Step over the entry and the separator following it, if any. */
                entry += entry_len;
                if (*entry == ',')
                        entry++;
        }

        return false;
}
core_param(module_blacklist, module_blacklist, charp, 0400);

/*
 * Lay out the module's sections, allocate its final memory and move the
 * module there.
 *
 * Runs the arch section hook and the RWX section check, adjusts section
 * flags, computes the layout of sections and symbol table, and finally calls
 * move_module().
 *
 * Returns a pointer to the struct module at its final location, or an
 * ERR_PTR() carrying the error of the step that failed.
 */
static struct module *layout_and_allocate(struct load_info *info, int flags)
{
        struct module *mod;
        unsigned int ndx;
        int err;

        /* Allow arches to frob section contents and sizes.  */
        err = module_frob_arch_sections(info->hdr, info->sechdrs,
                                        info->secstrings, info->mod);
        if (err < 0)
                return ERR_PTR(err);

        err = module_enforce_rwx_sections(info->hdr, info->sechdrs,
                                          info->secstrings, info->mod);
        if (err < 0)
                return ERR_PTR(err);

        /* We will do a special allocation for per-cpu sections later. */
        info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;

        /*
         * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
         * layout_sections() can put it in the right place.
         * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
         */
        ndx = find_sec(info, ".data..ro_after_init");
        if (ndx)
                info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
        /*
         * Mark the __jump_table section as ro_after_init as well: these data
         * structures are never modified, with the exception of entries that
         * refer to code in the __init section, which are annotated as such
         * at module load time.
         */
        ndx = find_sec(info, "__jump_table");
        if (ndx)
                info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;

        /*
         * Determine total sizes, and put offsets in sh_entsize.  For now
         * this is done generically; there doesn't appear to be any
         * special cases for the architectures.
         */
        layout_sections(info->mod, info);
        layout_symtab(info->mod, info);

        /* Allocate and move to the final place */
        err = move_module(info->mod, info);
        if (err)
                return ERR_PTR(err);

        /*
         * Module has been copied to its final place now: return it.
         * move_module() rewrote sh_addr, so this is the final copy rather
         * than the one inside the temporary image.
         */
        mod = (void *)info->sechdrs[info->index.mod].sh_addr;
        kmemleak_load_module(mod, info);
        return mod;
}

/*
 * Release the per-cpu area, let the architecture drop its init-time state,
 * and free the module's memory regions. @info is not used.
 *
 * mod is no longer valid after this!
 */
static void module_deallocate(struct module *mod, struct load_info *info)
{
        percpu_modfree(mod);
        module_arch_freeing_init(mod);

        free_mod_mem(mod, true);
}

/*
 * Weak default for the finalization hook called at the end of
 * post_relocation(): does nothing and reports success (0).
 */
int __weak module_finalize(const Elf_Ehdr *hdr,
                           const Elf_Shdr *sechdrs,
                           struct module *me)
{
        return 0;
}

static int post_relocation(struct module *mod, const struct load_info *info)
{
        /* Sort exception table now relocations are done. */
        sort_extable(mod->extable, mod->extable + mod->num_exentries);

        /* Copy relocated percpu area over. */
        percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
                       info->sechdrs[info->index.pcpu].sh_size);

        /* Setup kallsyms-specific fields. */
        add_kallsyms(mod, info);

        /* Arch-specific module finalizing. */
        return module_finalize(info->hdr, info->sechdrs, mod);
}

/*
 * Invoke the module's constructors in table order. Compiles to nothing
 * unless CONFIG_CONSTRUCTORS is enabled.
 */
static void do_mod_ctors(struct module *mod)
{
#ifdef CONFIG_CONSTRUCTORS
        unsigned long idx = 0;

        while (idx < mod->num_ctors) {
                mod->ctors[idx]();
                idx++;
        }
#endif
}

/*
 * Deferred-free record for a module's init memory after a successful init.
 * The memory is freed from a work item (see do_free_init()) because kallsyms
 * may still be traversing it.
 */
struct mod_initfree {
        struct llist_node node;         /* link in init_free_list */
        void *init_text;                /* base of the MOD_INIT_TEXT region */
        void *init_data;                /* base of the MOD_INIT_DATA region */
        void *init_rodata;              /* base of the MOD_INIT_RODATA region */
};

static void do_free_init(struct work_struct *w)
{
        struct llist_node *pos, *n, *list;
        struct mod_initfree *initfree;

        list = llist_del_all(&init_free_list);

        synchronize_rcu();

        llist_for_each_safe(pos, n, list) {
                initfree = container_of(pos, struct mod_initfree, node);
                execmem_free(initfree->init_text);
                execmem_free(initfree->init_data);
                execmem_free(initfree->init_rodata);
                kfree(initfree);
        }
}

/*
 * Flush init_free_wq, the work item that do_init_module() schedules to free
 * module init memory.
 */
void flush_module_init_free_work(void)
{
        flush_work(&init_free_wq);
}

/*
 * Switch the parameter prefix to "module." for the parameter declared below.
 * NOTE(review): presumably this makes it appear as "module.async_probe" -
 * confirm against the module_param() definition.
 */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "module."
/* Default value for module->async_probe_requested */
static bool async_probe;
module_param(async_probe, bool, 0644);

/*
 * This is where the real work happens.
 *
 * Runs the module's constructors and init function, switches the module to
 * MODULE_STATE_LIVE, notifies listeners, and then discards the init memory:
 * the regions are detached from @mod here and actually freed later from a
 * work item (see do_free_init()).
 *
 * Returns 0 on success. On failure the module is torn down with
 * free_module() and a negative error is returned: -ENOMEM if the
 * deferred-free record cannot be allocated, the init function's negative
 * return value, or the error from module_enable_rodata_ro().
 *
 * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
 * helper command 'lx-symbols'.
 */
static noinline int do_init_module(struct module *mod)
{
        int ret = 0;
        struct mod_initfree *freeinit;
#if defined(CONFIG_MODULE_STATS)
        /* Sum up region sizes for the statistics updated on success below. */
        unsigned int text_size = 0, total_size = 0;

        for_each_mod_mem_type(type) {
                const struct module_memory *mod_mem = &mod->mem[type];
                if (mod_mem->size) {
                        total_size += mod_mem->size;
                        if (type == MOD_TEXT || type == MOD_INIT_TEXT)
                                text_size += mod_mem->size;
                }
        }
#endif

        /*
         * Allocate the deferred-free record up front and remember the init
         * region bases now: mod->mem[] is cleared for them further down.
         */
        freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
        if (!freeinit) {
                ret = -ENOMEM;
                goto fail;
        }
        freeinit->init_text = mod->mem[MOD_INIT_TEXT].base;
        freeinit->init_data = mod->mem[MOD_INIT_DATA].base;
        freeinit->init_rodata = mod->mem[MOD_INIT_RODATA].base;

        do_mod_ctors(mod);
        /* Start the module */
        if (mod->init != NULL)
                ret = do_one_initcall(mod->init);
        if (ret < 0) {
                goto fail_free_freeinit;
        }
        /* A positive return value is reported but does not fail the load. */
        if (ret > 0) {
                pr_warn("%s: '%s'->init suspiciously returned %d, it should "
                        "follow 0/-E convention\n"
                        "%s: loading module anyway...\n",
                        __func__, mod->name, ret, __func__);
                dump_stack();
        }

        /* Now it's a first class citizen! */
        mod->state = MODULE_STATE_LIVE;
        blocking_notifier_call_chain(&module_notify_list,
                                     MODULE_STATE_LIVE, mod);

        /* Delay uevent until module has finished its init routine */
        kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);

        /*
         * We need to finish all async code before the module init sequence
         * is done. This has potential to deadlock if synchronous module
         * loading is requested from async (which is not allowed!).
         *
         * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
         * request_module() from async workers") for more details.
         */
        if (!mod->async_probe_requested)
                async_synchronize_full();

        ftrace_free_mem(mod, mod->mem[MOD_INIT_TEXT].base,
                        mod->mem[MOD_INIT_TEXT].base + mod->mem[MOD_INIT_TEXT].size);
        mutex_lock(&module_mutex);
        /* Drop initial reference. */
        module_put(mod);
        trim_init_extable(mod);
#ifdef CONFIG_KALLSYMS
        /* Switch to core kallsyms now init is done: kallsyms may be walking! */
        rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
        ret = module_enable_rodata_ro(mod, true);
        if (ret)
                goto fail_mutex_unlock;
        mod_tree_remove_init(mod);
        module_arch_freeing_init(mod);
        /* Detach the init regions; their bases live on in freeinit. */
        for_class_mod_mem_type(type, init) {
                mod->mem[type].base = NULL;
                mod->mem[type].size = 0;
        }

#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
        /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
        mod->btf_data = NULL;
#endif
        /*
         * We want to free module_init, but be aware that kallsyms may be
         * walking this with preempt disabled.  In all the failure paths, we
         * call synchronize_rcu(), but we don't want to slow down the success
         * path. execmem_free() cannot be called in an interrupt, so do the
         * work and call synchronize_rcu() in a work queue.
         *
         * Note that execmem_alloc() on most architectures creates W+X page
         * mappings which won't be cleaned up until do_free_init() runs.  Any
         * code such as mark_rodata_ro() which depends on those mappings to
         * be cleaned up needs to sync with the queued work by invoking
         * flush_module_init_free_work().
         */
        if (llist_add(&freeinit->node, &init_free_list))
                schedule_work(&init_free_wq);

        mutex_unlock(&module_mutex);
        wake_up_all(&module_wq);

        /*
         * NOTE(review): text_size and total_size are declared only under
         * CONFIG_MODULE_STATS; presumably these helpers expand to nothing
         * without it - confirm.
         */
        mod_stat_add_long(text_size, &total_text_size);
        mod_stat_add_long(total_size, &total_mod_size);

        mod_stat_inc(&modcount);

        return 0;

        /*
         * NOTE(review): this label is reached after module_put(),
         * trim_init_extable() and the kallsyms switch above have already
         * run, and the fail path below calls module_put() a second time -
         * confirm that this unwinding is intended.
         */
fail_mutex_unlock:
        mutex_unlock(&module_mutex);
fail_free_freeinit:
        kfree(freeinit);
fail:
        /* Try to protect us from buggy refcounters. */
        mod->state = MODULE_STATE_GOING;
        synchronize_rcu();
        module_put(mod);
        blocking_notifier_call_chain(&module_notify_list,
                                     MODULE_STATE_GOING, mod);
        klp_module_going(mod);
        ftrace_release_mod(mod);
        free_module(mod);
        wake_up_all(&module_wq);

        return ret;
}

/*
 * Permission check for loading modules: returns 0 when the caller has
 * CAP_SYS_MODULE and modules_disabled is not set, -EPERM otherwise.
 */
static int may_init_module(void)
{
        if (capable(CAP_SYS_MODULE) && !modules_disabled)
                return 0;

        return -EPERM;
}

/*
 * Has the module called @name finished loading? True when no such module is
 * found, or when the one found is in MODULE_STATE_LIVE or
 * MODULE_STATE_GOING. Takes module_mutex itself; call with no locks held.
 */
static bool finished_loading(const char *name)
{
        struct module *mod;
        bool done = true;

        /*
         * module_mutex is not expected to be heavily contended. Should we
         * sleep on it once in a while, the caller's
         * wait_event_interruptible() just goes round one more time, which
         * is harmless.
         */
        sched_annotate_sleep();
        mutex_lock(&module_mutex);
        mod = find_module_all(name, strlen(name), true);
        if (mod)
                done = mod->state == MODULE_STATE_LIVE ||
                       mod->state == MODULE_STATE_GOING;
        mutex_unlock(&module_mutex);

        return done;
}

/*
 * Check whether a module called @name is already known, first letting an
 * in-flight load of the same name settle.
 *
 * Must be called with module_mutex held.  The mutex is dropped while
 * sleeping on module_wq and re-acquired before returning.
 *
 * Returns 0 if no such module exists, the non-zero result of
 * wait_event_interruptible() if the wait did not complete normally,
 * -EEXIST if the other instance is live, and -EBUSY otherwise.  The
 * -EEXIST/-EBUSY outcomes are recorded via try_add_failed_module() with
 * @reason.
 */
static int module_patient_check_exists(const char *name,
                                       enum fail_dup_mod_reason reason)
{
        struct module *existing;
        bool still_loading;
        int err;

        existing = find_module_all(name, strlen(name), true);
        if (!existing)
                return 0;

        still_loading = existing->state == MODULE_STATE_COMING ||
                        existing->state == MODULE_STATE_UNFORMED;
        if (still_loading) {
                /* The other load may still fail, so wait for its outcome. */
                mutex_unlock(&module_mutex);
                err = wait_event_interruptible(module_wq,
                                               finished_loading(name));
                mutex_lock(&module_mutex);
                if (err)
                        return err;

                /* It may have disappeared while we slept; look again. */
                existing = find_module_all(name, strlen(name), true);
        }

        if (try_add_failed_module(name, reason))
                pr_warn("Could not add fail-tracking for module: %s\n", name);

        /*
         * Reaching this point means the same module was being loaded by
         * someone else.  Do not retry the load now: serialized failing
         * loads can add up to long delays, e.g. when several devices of
         * one type all request the same module.
         */
        return (existing && existing->state == MODULE_STATE_LIVE) ?
                -EEXIST : -EBUSY;
}

/*
 * Put @mod on the module list in the UNFORMED state as early as possible,
 * so that a duplicate name is detected before many resources (notably
 * temporary percpu memory) have been committed to it.
 *
 * Returns 0 once the module is linked in, or the error reported by
 * module_patient_check_exists(), in which case nothing is linked.
 */
static int add_unformed_module(struct module *mod)
{
        int ret;

        mod->state = MODULE_STATE_UNFORMED;

        mutex_lock(&module_mutex);
        ret = module_patient_check_exists(mod->name, FAIL_DUP_MOD_LOAD);
        if (!ret) {
                mod_update_bounds(mod);
                list_add_rcu(&mod->list, &modules);
                mod_tree_insert(mod);
        }
        mutex_unlock(&module_mutex);

        return ret;
}

/*
 * Last step of forming a module, done entirely under module_mutex:
 * verify its exported symbols, run the bug/CFI finalizers, apply the
 * memory protections and move the module to MODULE_STATE_COMING.
 *
 * Returns 0 on success.  On a protection failure module_bug_cleanup() is
 * called before the error is returned; the state is left untouched on
 * any failure.
 */
static int complete_formation(struct module *mod, struct load_info *info)
{
        int ret;

        mutex_lock(&module_mutex);

        /* Duplicate-symbol detection has to run under the lock. */
        ret = verify_exported_symbols(mod);
        if (ret < 0)
                goto unlock;

        /* Both finalizers depend on module_mutex for list integrity. */
        module_bug_finalize(info->hdr, info->sechdrs, mod);
        module_cfi_finalize(info->hdr, info->sechdrs, mod);

        /* Apply the protections in order, stopping at the first failure. */
        ret = module_enable_rodata_ro(mod, false);
        if (!ret)
                ret = module_enable_data_nx(mod);
        if (!ret)
                ret = module_enable_text_rox(mod);
        if (ret) {
                module_bug_cleanup(mod);
                goto unlock;
        }

        /*
         * COMING keeps strong_try_module_get() away from us while still
         * letting kallsyms and friends see the module.
         */
        mod->state = MODULE_STATE_COMING;

unlock:
        mutex_unlock(&module_mutex);
        return ret;
}

/*
 * Announce a module that has reached the COMING state: enable ftrace for
 * it, tell livepatch, then run the module notifier chain.
 *
 * Returns 0 on success or a negative errno.  If the notifier chain fails,
 * the livepatch notification is undone with klp_module_going().
 */
static int prepare_coming_module(struct module *mod)
{
        int ret;

        ftrace_module_enable(mod);

        ret = klp_module_coming(mod);
        if (ret)
                return ret;

        ret = notifier_to_errno(
                blocking_notifier_call_chain_robust(&module_notify_list,
                                                    MODULE_STATE_COMING,
                                                    MODULE_STATE_GOING, mod));
        if (ret)
                klp_module_going(mod);

        return ret;
}

/*
 * parse_args() callback for parameters the module itself does not declare.
 *
 * @arg is the struct module being loaded.  "async_probe" is consumed
 * here; everything else is offered to the dyndbg handler, and a warning
 * is printed if that handler does not accept it.  Always returns 0, so an
 * unknown parameter never fails the load.
 */
static int unknown_module_param_cb(char *param, char *val, const char *modname,
                                   void *arg)
{
        struct module *mod = arg;

        if (!strcmp(param, "async_probe")) {
                /* A value that does not parse as a bool counts as "yes". */
                if (kstrtobool(val, &mod->async_probe_requested))
                        mod->async_probe_requested = true;
                return 0;
        }

        /* Check for the magic 'dyndbg' argument. */
        if (ddebug_dyndbg_module_param_cb(param, val, modname))
                pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);

        return 0;
}

/*
 * Early checks on a module that still lives in the temporary copy; no
 * allocation is done here.
 *
 * Returns -EPERM for a blacklisted name, -ENOEXEC for a module struct
 * version mismatch, otherwise the first error from
 * rewrite_section_headers(), check_modinfo() or
 * module_patient_check_exists(), or 0 if all of them pass.
 */
static int early_mod_check(struct load_info *info, int flags)
{
        int ret;

        /* The module name is trustworthy by now, so test the blacklist. */
        if (blacklisted(info->name)) {
                pr_err("Module %s is blacklisted\n", info->name);
                return -EPERM;
        }

        ret = rewrite_section_headers(info, flags);
        /* Verify the module struct version before the struct is used. */
        if (!ret && !check_modstruct_version(info, info->mod))
                ret = -ENOEXEC;
        if (!ret)
                ret = check_modinfo(info->mod, info, flags);
        if (ret)
                return ret;

        mutex_lock(&module_mutex);
        ret = module_patient_check_exists(info->mod->name,
                                          FAIL_DUP_MOD_BECOMING);
        mutex_unlock(&module_mutex);

        return ret;
}

/*
 * Allocate and load the module: note that size of section 0 is always
 * zero, and we rely on this for optional sections.
 *
 * @info:  load context describing the temporary copy of the module image
 * @uargs: user-space pointer to the module parameter string
 * @flags: load flags, handed on to the checking and layout helpers
 *
 * On success the temporary copy is released with free_copy() and the
 * result of do_init_module() is returned.  On failure the function
 * unwinds through the cleanup labels at the bottom, in reverse order of
 * setup, releases the temporary copy as well, and returns the error.
 *
 * module_allocated records whether layout_and_allocate() succeeded; the
 * failure path uses it to pick between mod_stat_bump_becoming() (failed
 * before allocation) and mod_stat_bump_invalid() (failed afterwards).
 */
static int load_module(struct load_info *info, const char __user *uargs,
                       int flags)
{
        struct module *mod;
        bool module_allocated = false;
        /* Declared long (it also stores PTR_ERR() results); returned as int. */
        long err = 0;
        char *after_dashes;

        /*
         * Do the signature check (if any) first. All that
         * the signature check needs is info->len, it does
         * not need any of the section info. That can be
         * set up later. This will minimize the chances
         * of a corrupt module causing problems before
         * we even get to the signature check.
         *
         * The check will also adjust info->len by stripping
         * off the sig length at the end of the module, making
         * checks against info->len more correct.
         */
        err = module_sig_check(info, flags);
        if (err)
                goto free_copy;

        /*
         * Do basic sanity checks against the ELF header and
         * sections. Cache useful sections and set the
         * info->mod to the userspace passed struct module.
         */
        err = elf_validity_cache_copy(info, flags);
        if (err)
                goto free_copy;

        err = early_mod_check(info, flags);
        if (err)
                goto free_copy;

        /* Figure out module layout, and allocate all the memory. */
        mod = layout_and_allocate(info, flags);
        if (IS_ERR(mod)) {
                err = PTR_ERR(mod);
                goto free_copy;
        }

        /* From here on a failure is accounted as an invalid module. */
        module_allocated = true;

        audit_log_kern_module(mod->name);

        /* Reserve our place in the list. */
        err = add_unformed_module(mod);
        if (err)
                goto free_module;

        /*
         * We are tainting your kernel if your module gets into
         * the modules linked list somehow.
         */
        module_augment_kernel_taints(mod, info);

        /* To avoid stressing percpu allocator, do this once we're unique. */
        err = percpu_modalloc(mod, info);
        if (err)
                goto unlink_mod;

        /* Now module is in final location, initialize linked lists, etc. */
        err = module_unload_init(mod);
        if (err)
                goto unlink_mod;

        init_param_lock(mod);

        /*
         * Now we've got everything in the final locations, we can
         * find optional sections.
         */
        err = find_module_sections(mod, info);
        if (err)
                goto free_unload;

        err = check_export_symbol_versions(mod);
        if (err)
                goto free_unload;

        /* Set up MODINFO_ATTR fields */
        setup_modinfo(mod, info);

        /* Fix up syms, so that st_value is a pointer to location. */
        err = simplify_symbols(mod, info);
        if (err < 0)
                goto free_modinfo;

        err = apply_relocations(mod, info);
        if (err < 0)
                goto free_modinfo;

        err = post_relocation(mod, info);
        if (err < 0)
                goto free_modinfo;

        flush_module_icache(mod);

        /* Now copy in args */
        mod->args = strndup_user(uargs, ~0UL >> 1);
        if (IS_ERR(mod->args)) {
                err = PTR_ERR(mod->args);
                goto free_arch_cleanup;
        }

        init_build_id(mod, info);

        /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
        ftrace_module_init(mod);

        /* Finally it's fully formed, ready to start executing. */
        err = complete_formation(mod, info);
        if (err)
                goto ddebug_cleanup;

        err = prepare_coming_module(mod);
        if (err)
                goto bug_cleanup;

        /* Default taken from async_probe; parse_args() below may override it. */
        mod->async_probe_requested = async_probe;

        /* Module is ready to execute: parsing args may do that. */
        after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
                                  -32768, 32767, mod,
                                  unknown_module_param_cb);
        if (IS_ERR(after_dashes)) {
                err = PTR_ERR(after_dashes);
                goto coming_cleanup;
        } else if (after_dashes) {
                pr_warn("%s: parameters '%s' after `--' ignored\n",
                       mod->name, after_dashes);
        }

        /* Link in to sysfs. */
        err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
        if (err < 0)
                goto coming_cleanup;

        if (is_livepatch_module(mod)) {
                err = copy_module_elf(mod, info);
                if (err < 0)
                        goto sysfs_cleanup;
        }

        /* Get rid of temporary copy. */
        free_copy(info, flags);

        codetag_load_module(mod);

        /* Done! */
        trace_module_load(mod);

        return do_init_module(mod);

        /*
         * Failure unwinding.  Each label undoes one setup step and falls
         * through to the labels for the steps that preceded it.
         */
 sysfs_cleanup:
        mod_sysfs_teardown(mod);
 coming_cleanup:
        mod->state = MODULE_STATE_GOING;
        destroy_params(mod->kp, mod->num_kp);
        blocking_notifier_call_chain(&module_notify_list,
                                     MODULE_STATE_GOING, mod);
        klp_module_going(mod);
 bug_cleanup:
        mod->state = MODULE_STATE_GOING;
        /* module_bug_cleanup needs module_mutex protection */
        mutex_lock(&module_mutex);
        module_bug_cleanup(mod);
        mutex_unlock(&module_mutex);

 ddebug_cleanup:
        ftrace_release_mod(mod);
        synchronize_rcu();
        kfree(mod->args);
 free_arch_cleanup:
        module_arch_cleanup(mod);
 free_modinfo:
        free_modinfo(mod);
 free_unload:
        module_unload_free(mod);
 unlink_mod:
        mutex_lock(&module_mutex);
        /* Unlink carefully: kallsyms could be walking list. */
        list_del_rcu(&mod->list);
        mod_tree_remove(mod);
        /* Wake tasks sleeping on module_wq for this load to finish. */
        wake_up_all(&module_wq);
        /* Wait for RCU-sched synchronizing before releasing mod->list. */
        synchronize_rcu();
        mutex_unlock(&module_mutex);
 free_module:
        mod_stat_bump_invalid(info, flags);
        /* Free lock-classes; relies on the preceding sync_rcu() */
        for_class_mod_mem_type(type, core_data) {
                lockdep_free_key_range(mod->mem[type].base,
                                       mod->mem[type].size);
        }

        module_deallocate(mod, info);
 free_copy:
        /*
         * The info->len is always set. We distinguish between
         * failures once the proper module was allocated and
         * before that.
         */
        if (!module_allocated)
                mod_stat_bump_becoming(info, flags);
        free_copy(info, flags);
        return err;
}

/*
 * init_module syscall: load a module from an image of @len bytes at the
 * user address @umod, with the parameter string at @uargs.
 *
 * Returns the may_init_module() or copy_module_from_user() error if
 * either fails (a failed copy is also counted in the module statistics),
 * otherwise the result of load_module() with no flags set.
 */
SYSCALL_DEFINE3(init_module, void __user *, umod,
                unsigned long, len, const char __user *, uargs)
{
        struct load_info info = { };
        int ret;

        ret = may_init_module();
        if (ret)
                return ret;

        pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
               umod, len, uargs);

        ret = copy_module_from_user(umod, len, &info);
        if (!ret)
                return load_module(&info, uargs, 0);

        /* The image never made it into the kernel; account for it. */
        mod_stat_inc(&failed_kreads);
        mod_stat_add_long(len, &invalid_kread_bytes);
        return ret;
}

/*
 * One in-flight request tracked by the idempotent-load machinery below.
 * Requests that share a cookie are treated as the same operation.
 */
struct idempotent {
        /* Identity of the operation; compared by pointer equality. */
        const void *cookie;
        /* Linkage in the idem_hash bucket selected by hashing the cookie. */
        struct hlist_node entry;
        /* Signalled by idempotent_complete() once a result is available. */
        struct completion complete;
        /* Result of the operation, filled in by idempotent_complete(). */
        int ret;
};

/* Hash table of in-flight requests; all accesses are under idem_lock. */
#define IDEM_HASH_BITS 8
static struct hlist_head idem_hash[1 << IDEM_HASH_BITS];
static DEFINE_SPINLOCK(idem_lock);

static bool idempotent(struct idempotent *u, const void *cookie)
{
        int hash = hash_ptr(cookie, IDEM_HASH_BITS);
        struct hlist_head *head = idem_hash + hash;
        struct idempotent *existing;
        bool first;

        u->ret = 0;
        u->cookie = cookie;
        init_completion(&u->complete);

        spin_lock(&idem_lock);
        first = true;
        hlist_for_each_entry(existing, head, entry) {
                if (existing->cookie != cookie)
                        continue;
                first = false;
                break;
        }
        hlist_add_head(&u->entry, idem_hash + hash);
        spin_unlock(&idem_lock);

        return !first;
}

/*
 * We were the first one with 'cookie' on the list, and we ended
 * up completing the operation. We now need to walk the list,
 * remove everybody - which includes ourselves - fill in the return
 * value, and then complete the operation.
 */
static int idempotent_complete(struct idempotent *u, int ret)
{
        const void *cookie = u->cookie;
        int hash = hash_ptr(cookie, IDEM_HASH_BITS);
        struct hlist_head *head = idem_hash + hash;
        struct hlist_node *next;
        struct idempotent *pos;

        spin_lock(&idem_lock);
        hlist_for_each_entry_safe(pos, next, head, entry) {
                if (pos->cookie != cookie)
                        continue;
                hlist_del(&pos->entry);
                pos->ret = ret;
                complete(&pos->complete);
        }
        spin_unlock(&idem_lock);
        return ret;
}

static int init_module_from_file(struct file *f, const char __user * uargs, int flags)
{
        struct load_info info = { };
        void *buf = NULL;
        int len;

        len = kernel_read_file(f, 0, &buf, INT_MAX, NULL, READING_MODULE);
        if (len < 0) {
                mod_stat_inc(&failed_kreads);
                return len;
        }

        if (flags & MODULE_INIT_COMPRESSED_FILE) {
                int err = module_decompress(&info, buf, len);
                vfree(buf); /* compressed data is no longer needed */
                if (err) {
                        mod_stat_inc(&failed_decompress);
                        mod_stat_add_long(len, &invalid_decompress_bytes);
                        return err;
                }
        } else {
                info.hdr = buf;
                info.len = len;
        }

        return load_module(&info, uargs, flags);
}

static int idempotent_init_module(struct file *f, const char __user * uargs, int flags)
{
        struct idempotent idem;

        if (!f || !(f->f_mode & FMODE_READ))
                return -EBADF;

        /* See if somebody else is doing the operation? */
        if (idempotent(&idem, file_inode(f))) {
                wait_for_completion(&idem.complete);
                return idem.ret;
        }

        /* Otherwise, we'll do it and complete others */
        return idempotent_complete(&idem,
                init_module_from_file(f, uargs, flags));
}

/*
 * finit_module syscall: load a module from the open file descriptor @fd,
 * with the parameter string at @uargs.
 *
 * Returns the may_init_module() error if that check fails, -EINVAL if
 * @flags has any bit set outside the three MODULE_INIT_* flags accepted
 * below, otherwise the result of idempotent_init_module().
 */
SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
{
        const int supported = MODULE_INIT_IGNORE_MODVERSIONS |
                              MODULE_INIT_IGNORE_VERMAGIC |
                              MODULE_INIT_COMPRESSED_FILE;
        struct fd f;
        int ret;

        ret = may_init_module();
        if (ret)
                return ret;

        pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);

        if (flags & ~supported)
                return -EINVAL;

        f = fdget(fd);
        ret = idempotent_init_module(f.file, uargs, flags);
        fdput(f);

        return ret;
}

/*
 * Write the taint flags of @mod into @buf as a parenthesised,
 * NUL-terminated string and return @buf.  With @show_state set, a '-'
 * (going) or '+' (coming) marker is added inside the parentheses.  The
 * result is the empty string when there is nothing to show.
 *
 * Must not be called for a module in MODULE_STATE_UNFORMED.
 *
 * Keep in sync with MODULE_FLAGS_BUF_SIZE !!!
 */
char *module_flags(struct module *mod, char *buf, bool show_state)
{
        int len = 0;

        BUG_ON(mod->state == MODULE_STATE_UNFORMED);

        if (!mod->taints && !show_state) {
                buf[0] = '\0';
                return buf;
        }

        if (mod->taints ||
            mod->state == MODULE_STATE_GOING ||
            mod->state == MODULE_STATE_COMING) {
                buf[len++] = '(';
                len += module_flags_taint(mod->taints, buf + len);
                if (show_state) {
                        /* '-' marks a module that is being unloaded */
                        if (mod->state == MODULE_STATE_GOING)
                                buf[len++] = '-';
                        /* '+' marks a module that is being loaded */
                        if (mod->state == MODULE_STATE_COMING)
                                buf[len++] = '+';
                }
                buf[len++] = ')';
        }

        buf[len] = '\0';
        return buf;
}

/*
 * Look @addr up in the exception table of the module that contains it.
 *
 * Returns the matching entry, or NULL if no module contains @addr, the
 * module has no exception entries, or the search finds nothing.
 */
const struct exception_table_entry *search_module_extables(unsigned long addr)
{
        const struct exception_table_entry *entry = NULL;
        struct module *mod;

        preempt_disable();
        mod = __module_address(addr);
        if (mod && mod->num_exentries)
                entry = search_extable(mod->extable, mod->num_exentries,
                                       addr);
        preempt_enable();

        /*
         * A hit means we are executing inside that module right now, so
         * it cannot be unloaded and no reference needs to be taken.
         */
        return entry;
}

/**
 * is_module_address() - is this address inside a module?
 * @addr: the address to check.
 *
 * The lookup runs with preemption disabled.  Use is_module_text_address()
 * when only code (not data) addresses should match.
 *
 * Return: true if __module_address() finds a module for @addr.
 */
bool is_module_address(unsigned long addr)
{
        struct module *mod;

        preempt_disable();
        mod = __module_address(addr);
        preempt_enable();

        return mod != NULL;
}

/**
 * __module_address() - get the module which contains an address.
 * @addr: the address.
 *
 * Must be called with preempt disabled or module mutex held so that
 * module doesn't get freed during this.
 *
 * Return: the containing module, or NULL if @addr lies outside the
 * address ranges recorded in mod_tree, no module is found there, or the
 * module found is still in MODULE_STATE_UNFORMED.
 */
struct module *__module_address(unsigned long addr)
{
        struct module *mod;
        bool in_range;

        /* Cheap range test first; it avoids the tree lookup for most misses. */
        in_range = addr >= mod_tree.addr_min && addr <= mod_tree.addr_max;
#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
        if (!in_range)
                in_range = addr >= mod_tree.data_addr_min &&
                           addr <= mod_tree.data_addr_max;
#endif
        if (!in_range)
                return NULL;

        module_assert_mutex_or_preempt();

        mod = mod_find(addr, &mod_tree);
        if (!mod)
                return NULL;

        BUG_ON(!within_module(addr, mod));

        return mod->state == MODULE_STATE_UNFORMED ? NULL : mod;
}

/**
 * is_module_text_address() - is this address inside module code?
 * @addr: the address to check.
 *
 * The lookup runs with preemption disabled.  Use is_module_address() to
 * match any module address, and kernel_text_address() to test for kernel
 * or module code.
 *
 * Return: true if __module_text_address() finds a module for @addr.
 */
bool is_module_text_address(unsigned long addr)
{
        struct module *mod;

        preempt_disable();
        mod = __module_text_address(addr);
        preempt_enable();

        return mod != NULL;
}

/**
 * __module_text_address() - get the module whose code contains an address.
 * @addr: the address.
 *
 * Must be called with preempt disabled or module mutex held so that
 * module doesn't get freed during this.
 *
 * Return: the module if @addr falls in its MOD_TEXT or MOD_INIT_TEXT
 * memory, NULL otherwise.
 */
struct module *__module_text_address(unsigned long addr)
{
        struct module *mod = __module_address(addr);

        if (!mod)
                return NULL;

        /* Only the two text regions count as code. */
        if (within_module_mem_type(addr, mod, MOD_TEXT) ||
            within_module_mem_type(addr, mod, MOD_INIT_TEXT))
                return mod;

        return NULL;
}

/*
 * Print the list of linked-in modules with their flags, followed by the
 * unloaded tainted modules and the last unloaded module, if any.
 *
 * No lock is taken because this runs while oopsing; the list walk is
 * done with preemption disabled instead.
 */
void print_modules(void)
{
        char flags[MODULE_FLAGS_BUF_SIZE];
        struct module *mod;

        printk(KERN_DEFAULT "Modules linked in:");

        /* Callers usually have preemption off already; make sure of it. */
        preempt_disable();
        list_for_each_entry_rcu(mod, &modules, list) {
                if (mod->state != MODULE_STATE_UNFORMED) {
                        pr_cont(" %s%s", mod->name,
                                module_flags(mod, flags, true));
                }
        }

        print_unloaded_tainted_modules();
        preempt_enable();

        if (last_unloaded_module.name[0]) {
                pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name,
                        last_unloaded_module.taints);
        }
        pr_cont("\n");
}

#ifdef CONFIG_MODULE_DEBUGFS
struct dentry *mod_debugfs_root;

static int module_debugfs_init(void)
{
        mod_debugfs_root = debugfs_create_dir("modules", NULL);
        return 0;
}
module_init(module_debugfs_init);
#endif


































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUSET_H
#define _LINUX_CPUSET_H
/*
 *  cpuset interface
 *
 *  Copyright (C) 2003 BULL SA
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/sched/topology.h>
#include <linux/sched/task.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/jump_label.h>

#ifdef CONFIG_CPUSETS

/*
 * Static branch rewrites can happen in an arbitrary order for a given
 * key. In code paths where we need to loop with read_mems_allowed_begin() and
 * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need
 * to ensure that begin() always gets rewritten before retry() in the
 * disabled -> enabled transition. If not, then if local irqs are disabled
 * around the loop, we can deadlock since retry() would always be
 * comparing the latest value of the mems_allowed seqcount against 0 as
 * begin() still would see cpusets_enabled() as false. The enabled -> disabled
 * transition should happen in reverse order for the same reasons (want to stop
 * looking at real value of mems_allowed.sequence in retry() first).
 */
extern struct static_key_false cpusets_pre_enable_key;
extern struct static_key_false cpusets_enabled_key;
extern struct static_key_false cpusets_insane_config_key;

/* True while the cpusets_enabled_key static branch is switched on. */
static inline bool cpusets_enabled(void)
{
        if (static_branch_unlikely(&cpusets_enabled_key))
                return true;

        return false;
}

/*
 * Take one reference on both cpuset static keys.
 *
 * The order matters: cpusets_pre_enable_key (tested by
 * read_mems_allowed_begin()) goes up before cpusets_enabled_key (tested
 * by read_mems_allowed_retry()), so begin() is never still disabled
 * while retry() is already live.  The _cpuslocked variants are used, so
 * the caller is presumably expected to hold the CPU hotplug lock.
 */
static inline void cpuset_inc(void)
{
        static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
        static_branch_inc_cpuslocked(&cpusets_enabled_key);
}

/*
 * Drop one reference on both cpuset static keys, in the reverse order of
 * cpuset_inc(): cpusets_enabled_key first, so read_mems_allowed_retry()
 * stops looking at the real sequence count before
 * read_mems_allowed_begin() stops sampling it.
 */
static inline void cpuset_dec(void)
{
        static_branch_dec_cpuslocked(&cpusets_enabled_key);
        static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
}

/*
 * True once a cpuset configuration has been judged unsupportable in
 * general, for example a movable-only node that cannot satisfy any
 * non-movable allocation (see update_nodemask).  The page allocator has
 * to do extra checks for such configurations; this static branch guards
 * those checks so that sane configurations pay nothing for them.
 */
static inline bool cpusets_insane_config(void)
{
        if (static_branch_unlikely(&cpusets_insane_config_key))
                return true;

        return false;
}

/*
 * Entry points available when CONFIG_CPUSETS is set.  Only the
 * declarations are visible here; the definitions live outside this
 * header.
 */
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_force_rebuild(void);
extern void cpuset_update_active_cpus(void);
extern void inc_dl_tasks_cs(struct task_struct *task);
extern void dec_dl_tasks_cs(struct task_struct *task);
extern void cpuset_lock(void);
extern void cpuset_unlock(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
extern bool cpuset_cpu_is_isolated(int cpu);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
/* The current task's own mems_allowed nodemask. */
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);

extern bool cpuset_node_allowed(int node, gfp_t gfp_mask);

/*
 * Ask cpuset_node_allowed() about the node that zone @z belongs to.
 * Unlike cpuset_zone_allowed(), this does not test cpusets_enabled().
 */
static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        int nid = zone_to_nid(z);

        return cpuset_node_allowed(nid, gfp_mask);
}

/*
 * May an allocation with @gfp_mask use zone @z?  Always true while
 * cpusets are not enabled; otherwise decided by __cpuset_zone_allowed().
 */
static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        if (!cpusets_enabled())
                return true;

        return __cpuset_zone_allowed(z, gfp_mask);
}

extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                          const struct task_struct *tsk2);

/*
 * Call __cpuset_memory_pressure_bump() only while
 * cpuset_memory_pressure_enabled is non-zero.  Written as a macro so the
 * flag test is expanded at the call site; the do/while(0) wrapper makes
 * it usable as a single statement.
 */
#define cpuset_memory_pressure_bump()                                 \
        do {                                                        \
                if (cpuset_memory_pressure_enabled)                \
                        __cpuset_memory_pressure_bump();        \
        } while (0)
extern int cpuset_memory_pressure_enabled;
extern void __cpuset_memory_pressure_bump(void);

extern void cpuset_task_status_allowed(struct seq_file *m,
                                        struct task_struct *task);
extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
                            struct pid *pid, struct task_struct *tsk);

extern int cpuset_mem_spread_node(void);
extern int cpuset_slab_spread_node(void);

/* Report task_spread_page() for the current task. */
static inline int cpuset_do_page_mem_spread(void)
{
        int spread = task_spread_page(current);

        return spread;
}

extern bool current_cpuset_is_being_rebound(void);

extern void rebuild_sched_domains(void);

extern void cpuset_print_current_mems_allowed(void);

/*
 * Open a read-side section on current->mems_allowed.
 *
 * Decisions based on mems_allowed, such as page allocation, can fail
 * spuriously when mems_allowed is updated in parallel, and that may in
 * turn fail the process.  Wrapping the operation in a
 * read_mems_allowed_begin() / read_mems_allowed_retry() loop avoids such
 * artificial failures.
 *
 * Returns the sequence value to pass to read_mems_allowed_retry(), or 0
 * while cpusets_pre_enable_key is off.
 */
static inline unsigned int read_mems_allowed_begin(void)
{
        if (static_branch_unlikely(&cpusets_pre_enable_key))
                return read_seqcount_begin(&current->mems_allowed_seq);

        return 0;
}

/*
 * Close a read-side section opened with read_mems_allowed_begin().
 *
 * A true result means mems_allowed may have been updated concurrently,
 * so whatever ran since read_mems_allowed_begin() may have failed
 * artificially; the caller decides whether to retry.  Always false while
 * cpusets_enabled_key is off.
 */
static inline bool read_mems_allowed_retry(unsigned int seq)
{
        if (static_branch_unlikely(&cpusets_enabled_key))
                return read_seqcount_retry(&current->mems_allowed_seq, seq);

        return false;
}

/*
 * Replace the current task's mems_allowed with @nodemask.
 *
 * The store is bracketed by write_seqcount_begin()/write_seqcount_end()
 * on current->mems_allowed_seq, which is the sequence count sampled by
 * read_mems_allowed_begin() and read_mems_allowed_retry().  The update
 * runs under task_lock(current) with local interrupts disabled, and
 * everything is released in the reverse order it was taken.
 */
static inline void set_mems_allowed(nodemask_t nodemask)
{
        unsigned long flags;

        task_lock(current);
        local_irq_save(flags);
        write_seqcount_begin(&current->mems_allowed_seq);
        current->mems_allowed = nodemask;
        write_seqcount_end(&current->mems_allowed_seq);
        local_irq_restore(flags);
        task_unlock(current);
}

#else /* !CONFIG_CPUSETS */

/*
 * Stubs used when CONFIG_CPUSETS is not set: the predicates report
 * "disabled" and the setup and locking hooks do nothing.
 */
static inline bool cpusets_enabled(void) { return false; }

static inline bool cpusets_insane_config(void) { return false; }

static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}

static inline void cpuset_force_rebuild(void) { }

/*
 * Without cpusets this goes straight to partition_sched_domains(), which
 * presumably resets to a single default domain (confirm in the scheduler
 * topology code).
 */
static inline void cpuset_update_active_cpus(void)
{
        partition_sched_domains(1, NULL, NULL);
}

static inline void inc_dl_tasks_cs(struct task_struct *task) { }
static inline void dec_dl_tasks_cs(struct task_struct *task) { }
static inline void cpuset_lock(void) { }
static inline void cpuset_unlock(void) { }

/* !CONFIG_CPUSETS: @mask is set to task_cpu_possible_mask(@p). */
static inline void cpuset_cpus_allowed(struct task_struct *p,
                                       struct cpumask *mask)
{
        cpumask_copy(mask, task_cpu_possible_mask(p));
}

/* !CONFIG_CPUSETS: always returns false. */
static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
{
        return false;
}

/* !CONFIG_CPUSETS: no CPU is ever reported as isolated. */
static inline bool cpuset_cpu_is_isolated(int cpu)
{
        return false;
}

/* !CONFIG_CPUSETS: returns node_possible_map for every task. */
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
{
        return node_possible_map;
}

/* !CONFIG_CPUSETS: expands to node_states[N_MEMORY], not a per-task mask. */
#define cpuset_current_mems_allowed (node_states[N_MEMORY])
static inline void cpuset_init_current_mems_allowed(void) {}

/* !CONFIG_CPUSETS: every nodemask is reported as valid. */
static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
        return 1;
}

/* !CONFIG_CPUSETS: every zone is allowed. */
static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        return true;
}

/* !CONFIG_CPUSETS: every zone is allowed. */
static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        return true;
}

/* !CONFIG_CPUSETS: any two tasks are reported as intersecting. */
static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                                 const struct task_struct *tsk2)
{
        return 1;
}

/*
 * !CONFIG_CPUSETS: an empty inline function here, whereas the
 * CONFIG_CPUSETS branch defines this name as a macro.
 */
static inline void cpuset_memory_pressure_bump(void) {}

/* !CONFIG_CPUSETS: writes nothing to @m. */
static inline void cpuset_task_status_allowed(struct seq_file *m,
                                                struct task_struct *task)
{
}

/* !CONFIG_CPUSETS: always returns 0. */
static inline int cpuset_mem_spread_node(void)
{
        return 0;
}

/* !CONFIG_CPUSETS: always returns 0. */
static inline int cpuset_slab_spread_node(void)
{
        return 0;
}

/* !CONFIG_CPUSETS: page spreading is never requested. */
static inline int cpuset_do_page_mem_spread(void)
{
        return 0;
}

/* !CONFIG_CPUSETS: always returns false. */
static inline bool current_cpuset_is_being_rebound(void)
{
        return false;
}

/* !CONFIG_CPUSETS: same call as the cpuset_update_active_cpus() stub. */
static inline void rebuild_sched_domains(void)
{
        partition_sched_domains(1, NULL, NULL);
}

/* !CONFIG_CPUSETS: prints nothing. */
static inline void cpuset_print_current_mems_allowed(void)
{
}

/* !CONFIG_CPUSETS: @nodemask is ignored. */
static inline void set_mems_allowed(nodemask_t nodemask)
{
}

/* !CONFIG_CPUSETS: always 0; pairs with the retry stub below. */
static inline unsigned int read_mems_allowed_begin(void)
{
        return 0;
}

/* !CONFIG_CPUSETS: never asks the caller to retry. */
static inline bool read_mems_allowed_retry(unsigned int seq)
{
        return false;
}

#endif /* !CONFIG_CPUSETS */

#endif /* _LINUX_CPUSET_H */











































































































   23 







































    4 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC_H
#define _ASM_X86_ATOMIC_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>

/*
 * Atomic operations that C can't guarantee us.  Useful for
 * resource counting etc..
 */

/*
 * Read the current value of @v->counter through __READ_ONCE() and return it.
 */
static __always_inline int arch_atomic_read(const atomic_t *v)
{
        /*
         * Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here,
         * it's non-inlined function that increases binary size and stack usage.
         */
        return __READ_ONCE((v)->counter);
}

/* Store @i into @v->counter through __WRITE_ONCE(). */
static __always_inline void arch_atomic_set(atomic_t *v, int i)
{
        __WRITE_ONCE(v->counter, i);
}

/*
 * Add @i to @v->counter with a LOCK_PREFIX'ed "addl"; nothing is returned.
 *
 * Operands: %0 is the counter as a read-write memory operand ("+m"), %1 is
 * @i as an immediate or register ("ir"). The "memory" clobber also makes the
 * statement a compiler-level barrier.
 */
static __always_inline void arch_atomic_add(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "addl %1,%0"
                     : "+m" (v->counter)
                     : "ir" (i) : "memory");
}

/*
 * Subtract @i from @v->counter with a LOCK_PREFIX'ed "subl"; nothing is
 * returned.
 *
 * Operands: %0 is the counter as a read-write memory operand ("+m"), %1 is
 * @i as an immediate or register ("ir"). The "memory" clobber also makes the
 * statement a compiler-level barrier.
 */
static __always_inline void arch_atomic_sub(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "subl %1,%0"
                     : "+m" (v->counter)
                     : "ir" (i) : "memory");
}

/*
 * Subtract @i from @v->counter with a LOCK_PREFIX'ed "subl" and return the
 * state of the "e" condition code produced by that instruction.
 * NOTE(review): "e" presumably means the result was zero; the exact expansion
 * lives in GEN_BINARY_RMWcc (asm/rmwcc.h) - confirm there.
 */
static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i);
}
/*
 * Self-referential define. NOTE(review): presumably lets generic atomic
 * headers detect that the architecture supplies this operation - confirm.
 */
#define arch_atomic_sub_and_test arch_atomic_sub_and_test

/*
 * Increment @v->counter by one with a LOCK_PREFIX'ed "incl"; nothing is
 * returned. The counter is the only operand ("+m", read-write memory) and
 * the "memory" clobber makes the statement a compiler-level barrier.
 */
static __always_inline void arch_atomic_inc(atomic_t *v)
{
        asm volatile(LOCK_PREFIX "incl %0"
                     : "+m" (v->counter) :: "memory");
}
#define arch_atomic_inc arch_atomic_inc

/*
 * Decrement @v->counter by one with a LOCK_PREFIX'ed "decl"; nothing is
 * returned. The counter is the only operand ("+m", read-write memory) and
 * the "memory" clobber makes the statement a compiler-level barrier.
 */
static __always_inline void arch_atomic_dec(atomic_t *v)
{
        asm volatile(LOCK_PREFIX "decl %0"
                     : "+m" (v->counter) :: "memory");
}
#define arch_atomic_dec arch_atomic_dec

/*
 * Decrement @v->counter with a LOCK_PREFIX'ed "decl" and return the state of
 * the "e" condition code produced by that instruction.
 * NOTE(review): "e" presumably means the counter reached zero; see
 * GEN_UNARY_RMWcc in asm/rmwcc.h to confirm.
 */
static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e);
}
#define arch_atomic_dec_and_test arch_atomic_dec_and_test

/*
 * Increment @v->counter with a LOCK_PREFIX'ed "incl" and return the state of
 * the "e" condition code produced by that instruction.
 * NOTE(review): "e" presumably means the counter became zero; see
 * GEN_UNARY_RMWcc in asm/rmwcc.h to confirm.
 */
static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e);
}
#define arch_atomic_inc_and_test arch_atomic_inc_and_test

/*
 * Add @i to @v->counter with a LOCK_PREFIX'ed "addl" and return the state of
 * the "s" condition code produced by that instruction.
 * NOTE(review): "s" presumably means the result is negative; see
 * GEN_BINARY_RMWcc in asm/rmwcc.h to confirm.
 */
static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i);
}
#define arch_atomic_add_negative arch_atomic_add_negative

/*
 * Add @i to @v->counter via xadd() and return the sum of @i and whatever
 * xadd() handed back.
 * NOTE(review): xadd() presumably yields the pre-add counter value, making
 * the result the post-add value - confirm in asm/cmpxchg.h.
 */
static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
{
        int fetched = xadd(&v->counter, i);

        return fetched + i;
}
#define arch_atomic_add_return arch_atomic_add_return

/* Subtract-and-return, expressed as arch_atomic_add_return() of the negated @i. */
#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v)

/*
 * Add @i to @v->counter via xadd() and return xadd()'s result unchanged.
 * NOTE(review): presumably that is the counter value from before the add -
 * confirm in asm/cmpxchg.h.
 */
static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
        return xadd(&v->counter, i);
}
#define arch_atomic_fetch_add arch_atomic_fetch_add

/* Fetch-and-subtract, expressed as arch_atomic_fetch_add() of the negated @i. */
#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v)

/*
 * Compare-and-exchange on @v->counter: thin wrapper that forwards @old and
 * @new to arch_cmpxchg() and returns its result.
 */
static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
        return arch_cmpxchg(&v->counter, old, new);
}
#define arch_atomic_cmpxchg arch_atomic_cmpxchg

/*
 * Boolean compare-and-exchange on @v->counter: thin wrapper that forwards
 * the @old pointer and @new to arch_try_cmpxchg() and returns its result.
 * NOTE(review): by the usual try_cmpxchg contract *@old is updated with the
 * observed value on failure - confirm in asm/cmpxchg.h.
 */
static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        return arch_try_cmpxchg(&v->counter, old, new);
}
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg

/*
 * Exchange on @v->counter: thin wrapper that forwards @new to arch_xchg()
 * and returns its result.
 */
static __always_inline int arch_atomic_xchg(atomic_t *v, int new)
{
        return arch_xchg(&v->counter, new);
}
#define arch_atomic_xchg arch_atomic_xchg

/*
 * Bitwise-AND @i into @v->counter with a LOCK_PREFIX'ed "andl"; nothing is
 * returned.
 *
 * Operands: %0 is the counter as a read-write memory operand ("+m"), %1 is
 * @i as an immediate or register ("ir"). The "memory" clobber also makes the
 * statement a compiler-level barrier.
 */
static __always_inline void arch_atomic_and(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "andl %1,%0"
                        : "+m" (v->counter)
                        : "ir" (i)
                        : "memory");
}

/*
 * Bitwise-AND @i into @v and return the value the loop last compared
 * against, i.e. the one the successful compare-exchange replaced.
 *
 * Implemented as a compare-exchange retry loop seeded by arch_atomic_read().
 * NOTE(review): relies on arch_atomic_try_cmpxchg() refreshing the expected
 * value on failure - confirm in asm/cmpxchg.h.
 */
static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v)
{
        int expected = arch_atomic_read(v);

        /* The new value is recomputed from 'expected' on every attempt. */
        while (!arch_atomic_try_cmpxchg(v, &expected, expected & i))
                ;

        return expected;
}
#define arch_atomic_fetch_and arch_atomic_fetch_and

/*
 * Bitwise-OR @i into @v->counter with a LOCK_PREFIX'ed "orl"; nothing is
 * returned.
 *
 * Operands: %0 is the counter as a read-write memory operand ("+m"), %1 is
 * @i as an immediate or register ("ir"). The "memory" clobber also makes the
 * statement a compiler-level barrier.
 */
static __always_inline void arch_atomic_or(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "orl %1,%0"
                        : "+m" (v->counter)
                        : "ir" (i)
                        : "memory");
}

/*
 * Bitwise-OR @i into @v and return the value the loop last compared
 * against, i.e. the one the successful compare-exchange replaced.
 *
 * Implemented as a compare-exchange retry loop seeded by arch_atomic_read().
 * NOTE(review): relies on arch_atomic_try_cmpxchg() refreshing the expected
 * value on failure - confirm in asm/cmpxchg.h.
 */
static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v)
{
        int expected = arch_atomic_read(v);

        /* The new value is recomputed from 'expected' on every attempt. */
        while (!arch_atomic_try_cmpxchg(v, &expected, expected | i))
                ;

        return expected;
}
#define arch_atomic_fetch_or arch_atomic_fetch_or

/*
 * Bitwise-XOR @i into @v->counter with a LOCK_PREFIX'ed "xorl"; nothing is
 * returned.
 *
 * Operands: %0 is the counter as a read-write memory operand ("+m"), %1 is
 * @i as an immediate or register ("ir"). The "memory" clobber also makes the
 * statement a compiler-level barrier.
 */
static __always_inline void arch_atomic_xor(int i, atomic_t *v)
{
        asm volatile(LOCK_PREFIX "xorl %1,%0"
                        : "+m" (v->counter)
                        : "ir" (i)
                        : "memory");
}

/*
 * Bitwise-XOR @i into @v and return the value the loop last compared
 * against, i.e. the one the successful compare-exchange replaced.
 *
 * Implemented as a compare-exchange retry loop seeded by arch_atomic_read().
 * NOTE(review): relies on arch_atomic_try_cmpxchg() refreshing the expected
 * value on failure - confirm in asm/cmpxchg.h.
 */
static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v)
{
        int expected = arch_atomic_read(v);

        /* The new value is recomputed from 'expected' on every attempt. */
        while (!arch_atomic_try_cmpxchg(v, &expected, expected ^ i))
                ;

        return expected;
}
#define arch_atomic_fetch_xor arch_atomic_fetch_xor

#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
# include <asm/atomic64_64.h>
#endif

#endif /* _ASM_X86_ATOMIC_H */












































































































































    3 
    3 
    3 












































































    2 
    1 





    3 
    1 





















    3 


    4 












    3 
















    3 





    3 









































































    2 

    1 















    2 






















    2 





    2 


















    2 





















































    3 



    1 







    3 











































    3 






















































    3 





































































    4 





























    1 
    3 

    3 



























    4 






    4 


















    4 







    3 





























    4 



































    3 



















































    1 




    1 








































































































































    4 




    4 




























    3 















    3 


    3 

    3 
    3 

    3 

    3 







    3 

    3 


    3 












    3 








































































































































































































    1 
















    1 





































    1 


    1 



    1 

    1 
























































































    1 

















    1 






















































































































































































































































































































































































































































    1 




    1 
























































































































































































    1 






    1 












    1 


    1 


















    1 

























    1 












    1 





















































































    1 
















    1 

    1 














    1 









    1 







    1 




















    1 
































































    1 










































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
#include <linux/timekeeping.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"
#include "volumes.h"
#include "dev-replace.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "dir-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
#include "relocation.h"
#include "scrub.h"

static struct kmem_cache *btrfs_trans_handle_cachep;

/*
 * Transaction states and transitions
 *
 * No running transaction (fs tree blocks are not modified)
 * |
 * | To next stage:
 * |  Call start_transaction() variants. Except btrfs_join_transaction_nostart().
 * V
 * Transaction N [[TRANS_STATE_RUNNING]]
 * |
 * | New trans handles can be attached to transaction N by calling all
 * | start_transaction() variants.
 * |
 * | To next stage:
 * |  Call btrfs_commit_transaction() on any trans handle attached to
 * |  transaction N
 * V
 * Transaction N [[TRANS_STATE_COMMIT_PREP]]
 * |
 * | If there are simultaneous calls to btrfs_commit_transaction() one will win
 * | the race and the rest will wait for the winner to commit the transaction.
 * |
 * | The winner will wait for previous running transaction to completely finish
 * | if there is one.
 * |
 * Transaction N [[TRANS_STATE_COMMIT_START]]
 * |
 * | Then one of the following happens:
 * | - Wait for all other trans handle holders to release.
 * |   The btrfs_commit_transaction() caller will do the commit work.
 * | - Wait for current transaction to be committed by others.
 * |   Other btrfs_commit_transaction() caller will do the commit work.
 * |
 * | At this stage, only btrfs_join_transaction*() variants can attach
 * | to this running transaction.
 * | All other variants will wait for current one to finish and attach to
 * | transaction N+1.
 * |
 * | To next stage:
 * |  Caller is chosen to commit transaction N, and all other trans handle
 * |  have been released.
 * V
 * Transaction N [[TRANS_STATE_COMMIT_DOING]]
 * |
 * | The heavy lifting transaction work is started.
 * | From running delayed refs (modifying extent tree) to creating pending
 * | snapshots, running qgroups.
 * | In short, modify supporting trees to reflect modifications of subvolume
 * | trees.
 * |
 * | At this stage, all start_transaction() calls will wait for this
 * | transaction to finish and attach to transaction N+1.
 * |
 * | To next stage:
 * |  Until all supporting trees are updated.
 * V
 * Transaction N [[TRANS_STATE_UNBLOCKED]]
 * |                                                    Transaction N+1
 * | All needed trees are modified, thus we only    [[TRANS_STATE_RUNNING]]
 * | need to write them back to disk and update            |
 * | super blocks.                                    |
 * |                                                    |
 * | At this stage, new transaction is allowed to   |
 * | start.                                            |
 * | All new start_transaction() calls will be            |
 * | attached to transid N+1.                            |
 * |                                                    |
 * | To next stage:                                    |
 * |  Until all tree blocks and super blocks are    |
 * |  written to block devices                            |
 * V                                                    |
 * Transaction N [[TRANS_STATE_COMPLETED]]            V
 *   All tree blocks and super blocks are written.  Transaction N+1
 *   This transaction is finished and all its            [[TRANS_STATE_COMMIT_START]]
 *   data structures will be cleaned up.            | Life goes on
 */
/*
 * Indexed by transaction state: bitmask of __TRANS_* handle types that are
 * blocked while the running transaction is in that state. join_transaction()
 * tests the requested type against the entry for the current state and
 * returns -EBUSY on a match.
 */
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
        /* Nothing is blocked while running or preparing to commit. */
        [TRANS_STATE_RUNNING]                = 0U,
        [TRANS_STATE_COMMIT_PREP]        = 0U,
        [TRANS_STATE_COMMIT_START]        = (__TRANS_START | __TRANS_ATTACH),
        /* Unlike the later states, __TRANS_JOIN_NOLOCK is not blocked here. */
        [TRANS_STATE_COMMIT_DOING]        = (__TRANS_START |
                                           __TRANS_ATTACH |
                                           __TRANS_JOIN |
                                           __TRANS_JOIN_NOSTART),
        /* From here on all five handle types listed are blocked. */
        [TRANS_STATE_UNBLOCKED]                = (__TRANS_START |
                                           __TRANS_ATTACH |
                                           __TRANS_JOIN |
                                           __TRANS_JOIN_NOLOCK |
                                           __TRANS_JOIN_NOSTART),
        [TRANS_STATE_SUPER_COMMITTED]        = (__TRANS_START |
                                           __TRANS_ATTACH |
                                           __TRANS_JOIN |
                                           __TRANS_JOIN_NOLOCK |
                                           __TRANS_JOIN_NOSTART),
        [TRANS_STATE_COMPLETED]                = (__TRANS_START |
                                           __TRANS_ATTACH |
                                           __TRANS_JOIN |
                                           __TRANS_JOIN_NOLOCK |
                                           __TRANS_JOIN_NOSTART),
};

/*
 * Drop one reference on @transaction.
 *
 * Warns if the use count is already zero. When the last reference goes away
 * the transaction is sanity-checked (it must be off its list, with empty
 * delayed-ref trees and no pending device updates), any block groups still
 * on ->deleted_bgs are unfrozen and released, and the structure is freed.
 */
void btrfs_put_transaction(struct btrfs_transaction *transaction)
{
        struct btrfs_block_group *bg;

        WARN_ON(refcount_read(&transaction->use_count) == 0);

        /* Other holders remain: nothing more to do. */
        if (!refcount_dec_and_test(&transaction->use_count))
                return;

        BUG_ON(!list_empty(&transaction->list));
        WARN_ON(!RB_EMPTY_ROOT(
                        &transaction->delayed_refs.href_root.rb_root));
        WARN_ON(!RB_EMPTY_ROOT(
                        &transaction->delayed_refs.dirty_extent_root));

        if (transaction->delayed_refs.pending_csums)
                btrfs_err(transaction->fs_info,
                          "pending csums is %llu",
                          transaction->delayed_refs.pending_csums);

        /*
         * Block groups still queued on ->deleted_bgs mean the transaction
         * was aborted before a commit happened (failure occurred before the
         * new superblock was written and btrfs_finish_extent_commit() ran),
         * so their physical locations must not be discarded.
         */
        while (!list_empty(&transaction->deleted_bgs)) {
                bg = list_first_entry(&transaction->deleted_bgs,
                                      struct btrfs_block_group, bg_list);
                list_del_init(&bg->bg_list);
                btrfs_unfreeze_block_group(bg);
                btrfs_put_block_group(bg);
        }

        WARN_ON(!list_empty(&transaction->dev_update_list));
        kfree(transaction);
}

/*
 * Make the current root node of every root queued on the transaction's
 * ->switch_commits list its new commit root, then free the roots queued on
 * ->dropped_roots. All of it runs with fs_info->commit_root_sem held for
 * write, and the transaction must be in TRANS_STATE_COMMIT_DOING.
 */
static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
{
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root, *tmp;

        /*
         * At this point no one can be using this transaction to modify any tree
         * and no one can start another transaction to modify any tree either.
         */
        ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);

        down_write(&fs_info->commit_root_sem);

        /* While relocation is running, record this transid as the latest one. */
        if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
                fs_info->last_reloc_trans = trans->transid;

        /*
         * For each queued root: unlink it, drop the reference on the old
         * commit root and install the current root node in its place, then
         * release its dirty log pages tree and qgroup swapped blocks.
         */
        list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
                                 dirty_list) {
                list_del_init(&root->dirty_list);
                free_extent_buffer(root->commit_root);
                root->commit_root = btrfs_root_node(root);
                extent_io_tree_release(&root->dirty_log_pages);
                btrfs_qgroup_clean_swapped_blocks(root);
        }

        /* We can free old roots now. */
        spin_lock(&cur_trans->dropped_roots_lock);
        while (!list_empty(&cur_trans->dropped_roots)) {
                root = list_first_entry(&cur_trans->dropped_roots,
                                        struct btrfs_root, root_list);
                list_del_init(&root->root_list);
                /*
                 * The spinlock is dropped around the two free calls and
                 * retaken before the list is examined again.
                 * NOTE(review): presumably because they may sleep - confirm.
                 */
                spin_unlock(&cur_trans->dropped_roots_lock);
                btrfs_free_log(trans, root);
                btrfs_drop_and_free_fs_root(fs_info, root);
                spin_lock(&cur_trans->dropped_roots_lock);
        }
        spin_unlock(&cur_trans->dropped_roots_lock);

        up_write(&fs_info->commit_root_sem);
}

/*
 * Bump @trans->num_extwriters, but only when @type has any of the
 * TRANS_EXTWRITERS bits set; other handle types leave the counter alone.
 */
static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
                                         unsigned int type)
{
        if (!(type & TRANS_EXTWRITERS))
                return;

        atomic_inc(&trans->num_extwriters);
}

/*
 * Decrement @trans->num_extwriters, but only when @type has any of the
 * TRANS_EXTWRITERS bits set; other handle types leave the counter alone.
 */
static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
                                         unsigned int type)
{
        if (!(type & TRANS_EXTWRITERS))
                return;

        atomic_dec(&trans->num_extwriters);
}

/*
 * Initialise @trans->num_extwriters: 1 when @type has any of the
 * TRANS_EXTWRITERS bits set, 0 otherwise.
 */
static inline void extwriter_counter_init(struct btrfs_transaction *trans,
                                          unsigned int type)
{
        int initial = 0;

        if (type & TRANS_EXTWRITERS)
                initial = 1;

        atomic_set(&trans->num_extwriters, initial);
}

/* Return the current value of @trans->num_extwriters. */
static inline int extwriter_counter_read(struct btrfs_transaction *trans)
{
        return atomic_read(&trans->num_extwriters);
}

/*
 * Give back to fs_info->chunk_block_rsv whatever @trans still has recorded
 * in ->chunk_bytes_reserved, then reset that field to zero. Does nothing
 * when no chunk bytes are reserved.
 *
 * Meant to be called:
 *  - after the chunk btree updates that immediately follow allocating a new
 *    chunk (after btrfs_chunk_alloc_add_chunk_item() is called);
 *  - when removing a chunk, after all chunk btree updates;
 *  - after finishing the second phase of chunk allocation
 *    (btrfs_create_pending_block_groups()), in case some block group had its
 *    chunk item insertion delayed to that phase.
 */
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;

        if (trans->chunk_bytes_reserved) {
                btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
                                        trans->chunk_bytes_reserved, NULL);
                trans->chunk_bytes_reserved = 0;
        }
}

/*
 * Either allocate a new transaction or hop into the existing one.
 *
 * @fs_info: filesystem whose running transaction is joined or created
 * @type:    TRANS_* join type; checked against btrfs_blocked_trans_types[]
 *           for the running transaction's state
 *
 * On success the caller holds one use_count reference on
 * fs_info->running_transaction and is counted in its num_writers (and in
 * num_extwriters when @type has a TRANS_EXTWRITERS bit set).
 *
 * Returns 0 on success, -EROFS if the filesystem is in an error state, the
 * running transaction's ->aborted value if it was aborted, -EBUSY if the
 * running transaction's state blocks @type, -ENOENT if no transaction is
 * running and @type is TRANS_ATTACH or TRANS_JOIN_NOSTART, or -ENOMEM if a
 * new transaction could not be allocated.
 */
static noinline int join_transaction(struct btrfs_fs_info *fs_info,
                                     unsigned int type)
{
        struct btrfs_transaction *cur_trans;

        spin_lock(&fs_info->trans_lock);
        /* Every jump to 'loop' arrives with trans_lock held. */
loop:
        /* The file system has been taken offline. No new transactions. */
        if (BTRFS_FS_ERROR(fs_info)) {
                spin_unlock(&fs_info->trans_lock);
                return -EROFS;
        }

        cur_trans = fs_info->running_transaction;
        if (cur_trans) {
                if (TRANS_ABORTED(cur_trans)) {
                        spin_unlock(&fs_info->trans_lock);
                        return cur_trans->aborted;
                }
                /* The current state of the transaction does not admit @type. */
                if (btrfs_blocked_trans_types[cur_trans->state] & type) {
                        spin_unlock(&fs_info->trans_lock);
                        return -EBUSY;
                }
                /* Join: take a reference and register as a writer. */
                refcount_inc(&cur_trans->use_count);
                atomic_inc(&cur_trans->num_writers);
                extwriter_counter_inc(cur_trans, type);
                spin_unlock(&fs_info->trans_lock);
                btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
                btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
                return 0;
        }
        spin_unlock(&fs_info->trans_lock);

        /*
         * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the
         * current transaction, and commit it. If there is no transaction, just
         * return ENOENT.
         */
        if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART)
                return -ENOENT;

        /*
         * JOIN_NOLOCK only happens during the transaction commit, so
         * it is impossible that ->running_transaction is NULL
         */
        BUG_ON(type == TRANS_JOIN_NOLOCK);

        /*
         * The allocation is done without holding trans_lock, so
         * ->running_transaction has to be rechecked once the lock is retaken.
         */
        cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
        if (!cur_trans)
                return -ENOMEM;

        /*
         * Lockdep annotations for the new writer; both bail-out paths below
         * release them again in reverse order.
         */
        btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
        btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);

        spin_lock(&fs_info->trans_lock);
        if (fs_info->running_transaction) {
                /*
                 * someone started a transaction after we unlocked.  Make sure
                 * to redo the checks above
                 */
                btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
                btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
                kfree(cur_trans);
                goto loop;
        } else if (BTRFS_FS_ERROR(fs_info)) {
                spin_unlock(&fs_info->trans_lock);
                btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
                btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
                kfree(cur_trans);
                return -EROFS;
        }

        /*
         * From here on we are creating the new running transaction, with
         * trans_lock held until it is published.  kmalloc() does not zero the
         * memory, so the fields are initialized explicitly below.
         */
        cur_trans->fs_info = fs_info;
        atomic_set(&cur_trans->pending_ordered, 0);
        init_waitqueue_head(&cur_trans->pending_wait);
        /* The caller is the first (and so far only) writer. */
        atomic_set(&cur_trans->num_writers, 1);
        extwriter_counter_init(cur_trans, type);
        init_waitqueue_head(&cur_trans->writer_wait);
        init_waitqueue_head(&cur_trans->commit_wait);
        cur_trans->state = TRANS_STATE_RUNNING;
        /*
         * One for this trans handle, one so it will live on until we
         * commit the transaction.
         */
        refcount_set(&cur_trans->use_count, 2);
        cur_trans->flags = 0;
        cur_trans->start_time = ktime_get_seconds();

        memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));

        cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
        cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
        atomic_set(&cur_trans->delayed_refs.num_entries, 0);

        /*
         * although the tree mod log is per file system and not per transaction,
         * the log must never go across transaction boundaries.
         */
        smp_mb();
        if (!list_empty(&fs_info->tree_mod_seq_list))
                WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n");
        if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
                WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n");
        /* The tree mod sequence restarts from 0 with every new transaction. */
        atomic64_set(&fs_info->tree_mod_seq, 0);

        spin_lock_init(&cur_trans->delayed_refs.lock);

        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        INIT_LIST_HEAD(&cur_trans->dev_update_list);
        INIT_LIST_HEAD(&cur_trans->switch_commits);
        INIT_LIST_HEAD(&cur_trans->dirty_bgs);
        INIT_LIST_HEAD(&cur_trans->io_bgs);
        INIT_LIST_HEAD(&cur_trans->dropped_roots);
        mutex_init(&cur_trans->cache_write_mutex);
        spin_lock_init(&cur_trans->dirty_bgs_lock);
        INIT_LIST_HEAD(&cur_trans->deleted_bgs);
        spin_lock_init(&cur_trans->dropped_roots_lock);
        /* Newest transaction goes to the tail of fs_info->trans_list. */
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
                        IO_TREE_TRANS_DIRTY_PAGES);
        extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
                        IO_TREE_FS_PINNED_EXTENTS);
        /* The new transaction's id is the bumped filesystem generation. */
        btrfs_set_fs_generation(fs_info, fs_info->generation + 1);
        cur_trans->transid = fs_info->generation;
        fs_info->running_transaction = cur_trans;
        cur_trans->aborted = 0;
        spin_unlock(&fs_info->trans_lock);

        return 0;
}

/*
 * This does all the record keeping required to make sure that a shareable root
 * is properly recorded in a given transaction.  This is required to make sure
 * the old root from before we joined the transaction is deleted when the
 * transaction commits.
 *
 * @trans: transaction handle the root is being recorded in
 * @root:  root to record
 * @force: when non-zero, do the record keeping even if the root is not
 *         shareable or root->last_trans is not older than trans->transid
 *
 * Returns 0 if there was nothing to do, otherwise the return value of
 * btrfs_init_reloc_root().
 */
static int record_root_in_trans(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               int force)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret = 0;

        if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
            root->last_trans < trans->transid) || force) {
                WARN_ON(!force && root->commit_root != root->node);

                /*
                 * see below for IN_TRANS_SETUP usage rules
                 * we have the reloc mutex held now, so there
                 * is only one writer in this function
                 */
                set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);

                /* make sure readers find IN_TRANS_SETUP before
                 * they find our root->last_trans update
                 */
                smp_wmb();

                spin_lock(&fs_info->fs_roots_radix_lock);
                /*
                 * Recheck under the radix lock in case the root got recorded
                 * in this transaction in the meantime.
                 *
                 * NOTE(review): this early return leaves
                 * BTRFS_ROOT_IN_TRANS_SETUP set; confirm the path is
                 * unreachable or that a stale bit is harmless.
                 */
                if (root->last_trans == trans->transid && !force) {
                        spin_unlock(&fs_info->fs_roots_radix_lock);
                        return 0;
                }
                /* Tag the root in the fs_roots radix tree for this transaction. */
                radix_tree_tag_set(&fs_info->fs_roots_radix,
                                   (unsigned long)btrfs_root_id(root),
                                   BTRFS_ROOT_TRANS_TAG);
                spin_unlock(&fs_info->fs_roots_radix_lock);
                root->last_trans = trans->transid;

                /* this is pretty tricky.  We don't want to
                 * take the relocation lock in btrfs_record_root_in_trans
                 * unless we're really doing the first setup for this root in
                 * this transaction.
                 *
                 * Normally we'd use root->last_trans as a flag to decide
                 * if we want to take the expensive mutex.
                 *
                 * But, we have to set root->last_trans before we
                 * init the relocation root, otherwise, we trip over warnings
                 * in ctree.c.  The solution used here is to flag ourselves
                 * with root IN_TRANS_SETUP.  When this is 1, we're still
                 * fixing up the reloc trees and everyone must wait.
                 *
                 * When this is zero, they can trust root->last_trans and fly
                 * through btrfs_record_root_in_trans without having to take the
                 * lock.  smp_wmb() makes sure that all the writes above are
                 * done before we pop in the zero below
                 */
                ret = btrfs_init_reloc_root(trans, root);
                /* Order the writes above before the bit is cleared. */
                smp_mb__before_atomic();
                clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
        }
        return ret;
}


/*
 * Queue @root on the dropped_roots list of the transaction that @trans
 * belongs to, then clear the root's BTRFS_ROOT_TRANS_TAG in the fs_roots radix
 * tree.
 */
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root)
{
        struct btrfs_transaction *transaction = trans->transaction;
        struct btrfs_fs_info *info = root->fs_info;

        /* Step 1: put the root on the transaction's dropped list. */
        spin_lock(&transaction->dropped_roots_lock);
        list_add_tail(&root->root_list, &transaction->dropped_roots);
        spin_unlock(&transaction->dropped_roots_lock);

        /*
         * Step 2: with the tag gone, commit time will not try to update
         * this root.
         */
        spin_lock(&info->fs_roots_radix_lock);
        radix_tree_tag_clear(&info->fs_roots_radix,
                             (unsigned long)btrfs_root_id(root),
                             BTRFS_ROOT_TRANS_TAG);
        spin_unlock(&info->fs_roots_radix_lock);
}

/*
 * Record @root in the transaction of @trans, taking fs_info->reloc_mutex only
 * when the lockless fast path cannot prove the root is already recorded.
 *
 * Returns 0 if the root is not shareable or is already recorded for
 * trans->transid, otherwise the return value of record_root_in_trans().
 */
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;

        /* Only shareable roots are recorded. */
        if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
                return 0;

        /*
         * see record_root_in_trans for comments about IN_TRANS_SETUP usage
         * and barriers
         */
        smp_rmb();
        /*
         * Fast path: last_trans already matches and nobody is in the middle
         * of setting the root up, so there is nothing left to do.
         */
        if (root->last_trans == trans->transid &&
            !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
                return 0;

        /* Slow path: serialize against other setups with the reloc mutex. */
        mutex_lock(&fs_info->reloc_mutex);
        ret = record_root_in_trans(trans, root, 0);
        mutex_unlock(&fs_info->reloc_mutex);

        return ret;
}

/*
 * Return 1 when @trans is in a state from TRANS_STATE_COMMIT_START up to (but
 * not including) TRANS_STATE_UNBLOCKED and has not been aborted, 0 otherwise.
 */
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
        /* Commit has not started yet. */
        if (trans->state < TRANS_STATE_COMMIT_START)
                return 0;

        /* Commit already got past the blocking phase. */
        if (trans->state >= TRANS_STATE_UNBLOCKED)
                return 0;

        return !TRANS_ABORTED(trans);
}

/*
 * Wait for a commit against the current transaction to become unblocked.
 * When this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 *
 * Returns immediately when there is no running transaction or the running
 * transaction is not blocked.
 */
static void wait_current_trans(struct btrfs_fs_info *fs_info)
{
        struct btrfs_transaction *running;

        spin_lock(&fs_info->trans_lock);
        running = fs_info->running_transaction;
        if (!running || !is_transaction_blocked(running)) {
                spin_unlock(&fs_info->trans_lock);
                return;
        }

        /* Hold a reference across the wait; it is dropped once we wake up. */
        refcount_inc(&running->use_count);
        spin_unlock(&fs_info->trans_lock);

        btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
        wait_event(fs_info->transaction_wait,
                   running->state >= TRANS_STATE_UNBLOCKED ||
                   TRANS_ABORTED(running));
        btrfs_put_transaction(running);
}

/*
 * Return 1 if a caller of the given @type may wait on the current
 * transaction: only TRANS_START does, and never while BTRFS_FS_LOG_RECOVERING
 * is set in fs_info->flags. Return 0 otherwise.
 */
static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
{
        if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
                return 0;

        return type == TRANS_START;
}

/*
 * Tell whether extra space must be reserved for creating a relocation root
 * for @root.
 *
 * That is the case only when fs_info->reloc_ctl is set, @root is shareable,
 * @root is not the tree-reloc root itself and it has no reloc_root yet.
 */
static inline bool need_reserve_reloc_root(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;

        if (!fs_info->reloc_ctl)
                return false;

        if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
                return false;

        if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
                return false;

        return !root->reloc_root;
}

/*
 * Reserve metadata space for a transaction start from the space_info of
 * fs_info->trans_block_rsv.
 *
 * @fs_info:            filesystem to reserve from
 * @flush:              flushing mode for the reservation
 * @num_bytes:          bytes needed for the transaction itself
 * @delayed_refs_bytes: in: bytes wanted for delayed refs; out: set to 0 when
 *                      the fallback attempt without them was made
 *
 * Returns the result of the last btrfs_reserve_metadata_bytes() call.
 */
static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
                                        enum btrfs_reserve_flush_enum flush,
                                        u64 num_bytes,
                                        u64 *delayed_refs_bytes)
{
        struct btrfs_space_info *space_info = fs_info->trans_block_rsv.space_info;
        int ret;

        /*
         * Ask for everything in one go, so that there is only a single
         * enospc flushing cycle per transaction start.
         */
        ret = btrfs_reserve_metadata_bytes(fs_info, space_info,
                                           num_bytes + *delayed_refs_bytes,
                                           flush);
        if (!ret || flush != BTRFS_RESERVE_FLUSH_ALL_STEAL)
                return ret;

        /*
         * An emergency flush can steal from the global block reserve, so
         * retry without the delayed refs part: space for them will be
         * consumed from the global block reserve instead.
         */
        *delayed_refs_bytes = 0;
        return btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes,
                                            flush);
}

/*
 * Common implementation behind the btrfs_start_transaction*(),
 * btrfs_join_transaction*() and btrfs_attach_transaction*() entry points:
 * reserve metadata space (if requested), join or create the running
 * transaction and return a handle for it.
 *
 * @root:            root the handle is for; it is recorded in the transaction
 *                   via btrfs_record_root_in_trans()
 * @num_items:       number of items to reserve metadata space for, 0 for no
 *                   reservation
 * @type:            TRANS_* type passed on to join_transaction()
 * @flush:           flushing mode used for the space reservation
 * @enforce_qgroups: passed on to btrfs_qgroup_reserve_meta_prealloc()
 *
 * If the task already has a handle in current->journal_info, that handle is
 * reused with its use_count bumped instead of creating a new one.
 *
 * Returns the transaction handle, or an ERR_PTR() on failure.
 */
static struct btrfs_trans_handle *
start_transaction(struct btrfs_root *root, unsigned int num_items,
                  unsigned int type, enum btrfs_reserve_flush_enum flush,
                  bool enforce_qgroups)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
        struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
        u64 num_bytes = 0;
        u64 qgroup_reserved = 0;
        u64 delayed_refs_bytes = 0;
        bool reloc_reserved = false;
        bool do_chunk_alloc = false;
        int ret;

        if (BTRFS_FS_ERROR(fs_info))
                return ERR_PTR(-EROFS);

        /*
         * Nested call: reuse the handle the task already holds.  The current
         * block_rsv is stashed in orig_rsv; __btrfs_end_transaction() puts it
         * back when the nested reference is dropped.
         */
        if (current->journal_info) {
                WARN_ON(type & TRANS_EXTWRITERS);
                h = current->journal_info;
                refcount_inc(&h->use_count);
                WARN_ON(refcount_read(&h->use_count) > 2);
                h->orig_rsv = h->block_rsv;
                h->block_rsv = NULL;
                goto got_it;
        }

        /*
         * Do the reservation before we join the transaction so we can do all
         * the appropriate flushing if need be.
         */
        if (num_items && root != fs_info->chunk_root) {
                qgroup_reserved = num_items * fs_info->nodesize;
                /*
                 * Use prealloc for now, as there might be a currently running
                 * transaction that could free this reserved space prematurely
                 * by committing.
                 */
                ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved,
                                                         enforce_qgroups, false);
                if (ret)
                        return ERR_PTR(ret);

                num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
                /*
                 * If we plan to insert/update/delete "num_items" from a btree,
                 * we will also generate delayed refs for extent buffers in the
                 * respective btree paths, so reserve space for the delayed refs
                 * that will be generated by the caller as it modifies btrees.
                 * Try to reserve them to avoid excessive use of the global
                 * block reserve.
                 */
                delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items);

                /*
                 * Do the reservation for the relocation root creation
                 */
                if (need_reserve_reloc_root(root)) {
                        num_bytes += fs_info->nodesize;
                        reloc_reserved = true;
                }

                /*
                 * May zero delayed_refs_bytes if the reservation only
                 * succeeded without the delayed refs part.
                 */
                ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes,
                                                   &delayed_refs_bytes);
                if (ret)
                        goto reserve_fail;

                btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true);

                if (trans_rsv->space_info->force_alloc)
                        do_chunk_alloc = true;
        } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
                   !btrfs_block_rsv_full(delayed_refs_rsv)) {
                /*
                 * Some people call with btrfs_start_transaction(root, 0)
                 * because they can be throttled, but have some other mechanism
                 * for reserving space.  We still want these guys to refill the
                 * delayed block_rsv so just add 1 items worth of reservation
                 * here.
                 */
                ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);
                if (ret)
                        goto reserve_fail;
        }
        /*
         * Retry point: reached again after we had to commit the transaction we
         * just joined (see below).  The reservations made above are kept.
         */
again:
        h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
        if (!h) {
                ret = -ENOMEM;
                goto alloc_fail;
        }

        /*
         * If we are JOIN_NOLOCK we're already committing a transaction and
         * waiting on this guy, so we don't need to do the sb_start_intwrite
         * because we're already holding a ref.  We need this because we could
         * have raced in and did an fsync() on a file which can kick a commit
         * and then we deadlock with somebody doing a freeze.
         *
         * If we are ATTACH, it means we just want to catch the current
         * transaction and commit it, so we needn't do sb_start_intwrite().
         */
        if (type & __TRANS_FREEZABLE)
                sb_start_intwrite(fs_info->sb);

        if (may_wait_transaction(fs_info, type))
                wait_current_trans(fs_info);

        /*
         * Keep trying while the running transaction blocks our type.  ATTACH
         * and JOIN_NOSTART do not retry: for them -EBUSY is turned into
         * -ENOENT after waiting once.
         */
        do {
                ret = join_transaction(fs_info, type);
                if (ret == -EBUSY) {
                        wait_current_trans(fs_info);
                        if (unlikely(type == TRANS_ATTACH ||
                                     type == TRANS_JOIN_NOSTART))
                                ret = -ENOENT;
                }
        } while (ret == -EBUSY);

        if (ret < 0)
                goto join_fail;

        cur_trans = fs_info->running_transaction;

        h->transid = cur_trans->transid;
        h->transaction = cur_trans;
        refcount_set(&h->use_count, 1);
        h->fs_info = root->fs_info;

        h->type = type;
        INIT_LIST_HEAD(&h->new_bgs);
        btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS);

        smp_mb();
        /*
         * The transaction we joined is already being committed and our type is
         * allowed to wait: commit it through our handle and start over.
         * NOTE(review): presumably btrfs_commit_transaction() consumes h, as a
         * fresh handle is allocated at 'again' - confirm.
         */
        if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
            may_wait_transaction(fs_info, type)) {
                current->journal_info = h;
                btrfs_commit_transaction(h);
                goto again;
        }

        /* Attach the space reserved above to the handle. */
        if (num_bytes) {
                trace_btrfs_space_reservation(fs_info, "transaction",
                                              h->transid, num_bytes, 1);
                h->block_rsv = trans_rsv;
                h->bytes_reserved = num_bytes;
                if (delayed_refs_bytes > 0) {
                        trace_btrfs_space_reservation(fs_info,
                                                      "local_delayed_refs_rsv",
                                                      h->transid,
                                                      delayed_refs_bytes, 1);
                        h->delayed_refs_bytes_reserved = delayed_refs_bytes;
                        btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true);
                        /* Now accounted in the handle's delayed_rsv. */
                        delayed_refs_bytes = 0;
                }
                h->reloc_reserved = reloc_reserved;
        }

got_it:
        if (!current->journal_info)
                current->journal_info = h;

        /*
         * If the space_info is marked ALLOC_FORCE then we'll get upgraded to
         * ALLOC_FORCE the first run through, and then we won't allocate for
         * anybody else who races in later.  We don't care about the return
         * value here.
         */
        if (do_chunk_alloc && num_bytes) {
                u64 flags = h->block_rsv->space_info->flags;

                btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
                                  CHUNK_ALLOC_NO_FORCE);
        }

        /*
         * btrfs_record_root_in_trans() needs to alloc new extents, and may
         * call btrfs_join_transaction() while we're also starting a
         * transaction.
         *
         * Thus it need to be called after current->journal_info initialized,
         * or we can deadlock.
         */
        ret = btrfs_record_root_in_trans(h, root);
        if (ret) {
                /*
                 * The transaction handle is fully initialized and linked with
                 * other structures so it needs to be ended in case of errors,
                 * not just freed.
                 */
                btrfs_end_transaction(h);
                goto reserve_fail;
        }
        /*
         * Now that we have found a transaction to be a part of, convert the
         * qgroup reservation from prealloc to pertrans. A different transaction
         * can't race in and free our pertrans out from under us.
         */
        if (qgroup_reserved)
                btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);

        return h;

        /*
         * Error unwinding.  Each label undoes one more step and falls through
         * to the next: join_fail drops the intwrite reference and the handle,
         * alloc_fail gives back the reserved space, reserve_fail frees the
         * qgroup prealloc reservation.
         */
join_fail:
        if (type & __TRANS_FREEZABLE)
                sb_end_intwrite(fs_info->sb);
        kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
        if (num_bytes)
                btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
        if (delayed_refs_bytes)
                btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
                                                    delayed_refs_bytes);
reserve_fail:
        btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
        return ERR_PTR(ret);
}

/*
 * Start a transaction handle on @root with metadata space reserved for
 * @num_items items.
 *
 * Uses type TRANS_START with BTRFS_RESERVE_FLUSH_ALL flushing and qgroup
 * enforcement enabled. Returns whatever start_transaction() returns: the
 * handle or an ERR_PTR().
 */
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   unsigned int num_items)
{
        struct btrfs_trans_handle *handle;

        handle = start_transaction(root, num_items, TRANS_START,
                                   BTRFS_RESERVE_FLUSH_ALL, true);
        return handle;
}

/*
 * Like btrfs_start_transaction(), but reserves with
 * BTRFS_RESERVE_FLUSH_ALL_STEAL flushing and with qgroup enforcement
 * disabled.
 *
 * Returns whatever start_transaction() returns: the handle or an ERR_PTR().
 */
struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
                                        struct btrfs_root *root,
                                        unsigned int num_items)
{
        struct btrfs_trans_handle *handle;

        handle = start_transaction(root, num_items, TRANS_START,
                                   BTRFS_RESERVE_FLUSH_ALL_STEAL, false);
        return handle;
}

/*
 * Get a handle of type TRANS_JOIN on @root without reserving any items
 * (num_items is 0) and with BTRFS_RESERVE_NO_FLUSH.
 *
 * Returns whatever start_transaction() returns: the handle or an ERR_PTR().
 */
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
        struct btrfs_trans_handle *handle;

        handle = start_transaction(root, 0, TRANS_JOIN,
                                   BTRFS_RESERVE_NO_FLUSH, true);
        return handle;
}

/*
 * Get a handle of type TRANS_JOIN_NOLOCK on @root without reserving any items
 * (num_items is 0) and with BTRFS_RESERVE_NO_FLUSH.
 *
 * Returns whatever start_transaction() returns: the handle or an ERR_PTR().
 */
struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root)
{
        struct btrfs_trans_handle *handle;

        handle = start_transaction(root, 0, TRANS_JOIN_NOLOCK,
                                   BTRFS_RESERVE_NO_FLUSH, true);
        return handle;
}

/*
 * Similar to regular join but it never starts a transaction when none is
 * running or when there's a running one at a state >= TRANS_STATE_UNBLOCKED.
 * This is similar to btrfs_attach_transaction() but it allows the join to
 * happen if the transaction commit already started but it's not yet in the
 * "doing" phase (the state is < TRANS_STATE_COMMIT_DOING).
 */
struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root)
{
        struct btrfs_trans_handle *handle;

        /* No items reserved, no flushing, type TRANS_JOIN_NOSTART. */
        handle = start_transaction(root, 0, TRANS_JOIN_NOSTART,
                                   BTRFS_RESERVE_NO_FLUSH, true);
        return handle;
}

/*
 * Catch the running transaction.
 *
 * It is used when we want to commit the current transaction, but
 * don't want to start a new one.
 *
 * Note: If this function returns -ENOENT, it just means there is no
 * running transaction. But it is possible that the inactive transaction
 * is still in memory, not fully on disk. If you need there to be no
 * inactive transaction in the fs when -ENOENT is returned, you should
 * invoke
 *     btrfs_attach_transaction_barrier()
 */
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
{
        struct btrfs_trans_handle *handle;

        /* No items reserved, no flushing, type TRANS_ATTACH. */
        handle = start_transaction(root, 0, TRANS_ATTACH,
                                   BTRFS_RESERVE_NO_FLUSH, true);
        return handle;
}

/*
 * Catch the running transaction.
 *
 * It is similar to the above function, the difference is this one
 * will wait for all the inactive transactions until they fully
 * complete.
 */
struct btrfs_trans_handle *
btrfs_attach_transaction_barrier(struct btrfs_root *root)
{
        struct btrfs_trans_handle *trans;
        int ret;

        trans = start_transaction(root, 0, TRANS_ATTACH,
                                  BTRFS_RESERVE_NO_FLUSH, true);
        /* Anything but -ENOENT (a handle or another error) goes back as is. */
        if (trans != ERR_PTR(-ENOENT))
                return trans;

        /*
         * -ENOENT from the attach: wait via btrfs_wait_for_commit() with
         * transid 0, and report a failure of that wait instead of -ENOENT.
         */
        ret = btrfs_wait_for_commit(root->fs_info, 0);
        if (ret)
                return ERR_PTR(ret);

        return trans;
}

/*
 * Wait for a transaction commit to reach at least the given state.
 *
 * @commit:    transaction to wait on; the caller must keep it alive
 * @min_state: state to wait for
 *
 * When waiting for TRANS_STATE_COMPLETED (or later), this additionally waits
 * on the transactions at the head of fs_info->trans_list for as long as their
 * transid is not newer than that of @commit.
 */
static noinline void wait_for_commit(struct btrfs_transaction *commit,
                                     const enum btrfs_trans_state min_state)
{
        struct btrfs_fs_info *fs_info = commit->fs_info;
        const u64 target_transid = commit->transid;
        bool took_ref = false;

        /*
         * At the moment the only values passed for min_state are
         * TRANS_STATE_SUPER_COMMITTED and TRANS_STATE_COMPLETED.
         */
        if (min_state != TRANS_STATE_COMPLETED)
                btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
        else
                btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);

        for (;;) {
                struct btrfs_transaction *oldest;

                wait_event(commit->commit_wait, commit->state >= min_state);
                /* Only transactions picked from the list below carry our ref. */
                if (took_ref)
                        btrfs_put_transaction(commit);

                if (min_state < TRANS_STATE_COMPLETED)
                        return;

                /*
                 * A transaction isn't really completed until all of the
                 * previous transactions are completed, but with fsync we can
                 * end up with SUPER_COMMITTED transactions before a COMPLETED
                 * transaction. Wait for those.
                 */
                spin_lock(&fs_info->trans_lock);
                oldest = list_first_entry_or_null(&fs_info->trans_list,
                                                  struct btrfs_transaction,
                                                  list);
                if (oldest && oldest->transid <= target_transid) {
                        refcount_inc(&oldest->use_count);
                        spin_unlock(&fs_info->trans_lock);
                        commit = oldest;
                        took_ref = true;
                        continue;
                }
                spin_unlock(&fs_info->trans_lock);
                return;
        }
}

/*
 * Wait until a transaction reaches TRANS_STATE_COMPLETED.
 *
 * @fs_info: filesystem to look at
 * @transid: id of the transaction to wait for, or 0 to wait for the newest
 *           transaction that has started committing and is not completed yet
 *
 * Returns 0 if there was nothing to wait for, -EINVAL if @transid is newer
 * than the last committed transaction but is not on fs_info->trans_list,
 * otherwise the ->aborted value of the transaction that was waited for.
 */
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
{
        struct btrfs_transaction *target = NULL;
        struct btrfs_transaction *iter;
        int ret;

        if (transid) {
                /* Already committed, nothing to wait for. */
                if (transid <= btrfs_get_last_trans_committed(fs_info))
                        return 0;

                /* Look the requested transaction up, stopping once past it. */
                spin_lock(&fs_info->trans_lock);
                list_for_each_entry(iter, &fs_info->trans_list, list) {
                        if (iter->transid > transid)
                                break;
                        if (iter->transid == transid) {
                                target = iter;
                                refcount_inc(&target->use_count);
                                break;
                        }
                }
                spin_unlock(&fs_info->trans_lock);

                /*
                 * Not found: either the transaction doesn't exist, or we
                 * raced with btrfs_commit_transaction and it got committed
                 * in the meantime.
                 */
                if (!target) {
                        if (transid > btrfs_get_last_trans_committed(fs_info))
                                return -EINVAL;
                        return 0;
                }
        } else {
                /*
                 * Walk from the newest transaction back to the first one
                 * that has started committing; take it unless it is already
                 * completed.
                 */
                spin_lock(&fs_info->trans_lock);
                list_for_each_entry_reverse(iter, &fs_info->trans_list,
                                            list) {
                        if (iter->state < TRANS_STATE_COMMIT_START)
                                continue;
                        if (iter->state != TRANS_STATE_COMPLETED) {
                                target = iter;
                                refcount_inc(&target->use_count);
                        }
                        break;
                }
                spin_unlock(&fs_info->trans_lock);

                /* Nothing committing or committed. */
                if (!target)
                        return 0;
        }

        wait_for_commit(target, TRANS_STATE_COMPLETED);
        ret = target->aborted;
        btrfs_put_transaction(target);
        return ret;
}

/*
 * Throttle the caller: if the running transaction is currently blocked, wait
 * in wait_current_trans() until it is unblocked or aborted.
 */
void btrfs_throttle(struct btrfs_fs_info *fs_info)
{
        wait_current_trans(fs_info);
}

/*
 * Tell the holder of @trans whether it should end its handle.
 *
 * Returns true when the transaction has reached TRANS_STATE_COMMIT_START or
 * later, when BTRFS_DELAYED_REFS_FLUSHING is set on its delayed refs, when
 * btrfs_check_space_for_delayed_refs() says so, or when
 * btrfs_block_rsv_check() on the global block reserve (with 50 as its second
 * argument) returns non-zero.
 */
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
        struct btrfs_transaction *transaction = trans->transaction;

        if (transaction->state >= TRANS_STATE_COMMIT_START)
                return true;

        if (test_bit(BTRFS_DELAYED_REFS_FLUSHING, &transaction->delayed_refs.flags))
                return true;

        if (btrfs_check_space_for_delayed_refs(trans->fs_info))
                return true;

        return btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50) != 0;
}

/*
 * Release the metadata space still reserved by @trans: first
 * trans->bytes_reserved from trans->block_rsv, then
 * trans->delayed_refs_bytes_reserved from trans->delayed_rsv. Both counters
 * are reset to zero once released.
 */
static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;

        /*
         * Without a block reserve, or without bytes reserved from it, there
         * must not be any delayed refs bytes reserved either.
         */
        if (!trans->block_rsv || !trans->bytes_reserved) {
                if (!trans->block_rsv) {
                        ASSERT(!trans->bytes_reserved);
                }
                ASSERT(!trans->delayed_refs_bytes_reserved);
                return;
        }

        ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
        trace_btrfs_space_reservation(fs_info, "transaction",
                                      trans->transid, trans->bytes_reserved, 0);
        btrfs_block_rsv_release(fs_info, trans->block_rsv,
                                trans->bytes_reserved, NULL);
        trans->bytes_reserved = 0;

        if (trans->delayed_refs_bytes_reserved) {
                trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv",
                                              trans->transid,
                                              trans->delayed_refs_bytes_reserved, 0);
                btrfs_block_rsv_release(fs_info, &trans->delayed_rsv,
                                        trans->delayed_refs_bytes_reserved, NULL);
                trans->delayed_refs_bytes_reserved = 0;
        }
}

/*
 * Drop one reference on the handle @trans; when it was the last one, tear
 * the handle down and leave the transaction.
 *
 * @trans:    handle to end
 * @throttle: when non-zero, also run btrfs_run_delayed_iputs()
 *
 * Returns 0 on success (including when only a nested reference was dropped),
 * trans->aborted if the handle was aborted, or -EROFS if the filesystem is in
 * an error state.
 */
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                                   int throttle)
{
        struct btrfs_transaction *transaction = trans->transaction;
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int err = 0;

        /*
         * Nested use of the handle: just drop the extra reference and put
         * back the block reserve that was stashed in orig_rsv.
         */
        if (refcount_read(&trans->use_count) > 1) {
                refcount_dec(&trans->use_count);
                trans->block_rsv = trans->orig_rsv;
                return 0;
        }

        /* Give back the reservations held by the handle. */
        btrfs_trans_release_metadata(trans);
        trans->block_rsv = NULL;

        btrfs_create_pending_block_groups(trans);

        btrfs_trans_release_chunk_metadata(trans);

        if (trans->type & __TRANS_FREEZABLE)
                sb_end_intwrite(fs_info->sb);

        /* Stop being a writer of the transaction and wake up any waiters. */
        WARN_ON(transaction != fs_info->running_transaction);
        WARN_ON(atomic_read(&transaction->num_writers) < 1);
        atomic_dec(&transaction->num_writers);
        extwriter_counter_dec(transaction, trans->type);

        cond_wake_up(&transaction->writer_wait);

        btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
        btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);

        btrfs_put_transaction(transaction);

        if (current->journal_info == trans)
                current->journal_info = NULL;

        if (throttle)
                btrfs_run_delayed_iputs(fs_info);

        /* On abort or filesystem error, kick the transaction kthread. */
        if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(fs_info)) {
                wake_up_process(fs_info->transaction_kthread);
                err = TRANS_ABORTED(trans) ? trans->aborted : -EROFS;
        }

        kmem_cache_free(btrfs_trans_handle_cachep, trans);
        return err;
}

/*
 * End a transaction handle without asking for delayed iputs to be run.
 *
 * Returns the result of __btrfs_end_transaction() for @trans.
 */
int btrfs_end_transaction(struct btrfs_trans_handle *trans)
{
        const int throttle = 0;

        return __btrfs_end_transaction(trans, throttle);
}

/*
 * End a transaction handle with the throttle flag set, which makes
 * __btrfs_end_transaction() run delayed iputs before returning.
 *
 * Returns the result of __btrfs_end_transaction() for @trans.
 */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
{
        const int throttle = 1;

        return __btrfs_end_transaction(trans, throttle);
}

/*
 * Start writeback for every range of @dirty_pages that has @mark set.
 *
 * When btree blocks are allocated they get bits set in one of two extent_io
 * trees.  This walks the ranges carrying @mark, flags them with
 * EXTENT_NEED_WAIT via convert_extent_bit() and submits them through the
 * btree inode's mapping.  It normally does not wait for the writeback to
 * complete; the only exception is the -ENOMEM case described below.
 *
 * Returns 0 on success or the first error hit.
 */
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
                               struct extent_io_tree *dirty_pages, int mark)
{
        struct address_space *mapping = fs_info->btree_inode->i_mapping;
        struct extent_state *cached = NULL;
        u64 start = 0;
        u64 end;
        int ret = 0;

        while (find_first_extent_bit(dirty_pages, start, &start, &end,
                                     mark, &cached)) {
                bool must_wait = false;

                ret = convert_extent_bit(dirty_pages, start, end,
                                         EXTENT_NEED_WAIT, mark, &cached);
                if (ret == -ENOMEM) {
                        /*
                         * -ENOMEM from convert_extent_bit() is most of the
                         * time a temporary condition, so it is not treated as
                         * a failure.  But since EXTENT_NEED_WAIT could not be
                         * set for this range, __btrfs_wait_marked_extents()
                         * would not know writeback was started and would not
                         * wait for it.  We must not commit a superblock that
                         * points to btree nodes/leaves whose writeback has not
                         * finished (and finished without errors), so wait for
                         * this range right here instead.  Entries left behind
                         * in the io tree are cleaned up at transaction commit
                         * (through extent_io_tree_release()).
                         */
                        must_wait = true;
                        ret = 0;
                }

                if (ret == 0) {
                        ret = filemap_fdatawrite_range(mapping, start, end);
                        if (ret == 0 && must_wait)
                                ret = filemap_fdatawait_range(mapping, start,
                                                              end);
                }

                /* The cached state is dropped on every pass, error or not. */
                free_extent_state(cached);
                cached = NULL;
                if (ret != 0)
                        break;

                cond_resched();
                start = end + 1;
        }

        return ret;
}

/*
 * Wait for writeback of every range of @dirty_pages flagged with
 * EXTENT_NEED_WAIT, clearing that bit as we go.
 *
 * When btree blocks are allocated they get bits set in one of two extent_io
 * trees.  This is the waiting half used for transaction or log commit: each
 * flagged range is cleared from the tree and then waited on through the btree
 * inode's mapping.
 *
 * Returns 0 on success or the first error hit.
 */
static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
                                       struct extent_io_tree *dirty_pages)
{
        struct address_space *mapping = fs_info->btree_inode->i_mapping;
        struct extent_state *cached = NULL;
        u64 start = 0;
        u64 end;
        int ret = 0;

        while (find_first_extent_bit(dirty_pages, start, &start, &end,
                                     EXTENT_NEED_WAIT, &cached)) {
                ret = clear_extent_bit(dirty_pages, start, end,
                                       EXTENT_NEED_WAIT, &cached);
                /*
                 * -ENOMEM from clear_extent_bit() is deliberately ignored.
                 * Entries left in the io tree are removed when the
                 * transaction is committed.  For a log commit they are not
                 * removed right after committing the log because the tree
                 * can be accessed concurrently - that only happens at
                 * transaction commit time, when it is safe (through
                 * extent_io_tree_release()).
                 */
                if (ret == 0 || ret == -ENOMEM)
                        ret = filemap_fdatawait_range(mapping, start, end);

                /* The cached state is dropped on every pass, error or not. */
                free_extent_state(cached);
                cached = NULL;
                if (ret != 0)
                        break;

                cond_resched();
                start = end + 1;
        }

        return ret;
}

/*
 * Wait for the marked extents of @dirty_pages and fold in any btree write
 * error recorded in fs_info->flags.
 *
 * BTRFS_FS_BTREE_ERR is always tested and cleared, even when the wait itself
 * already failed.  Returns the wait error if there was one, otherwise -EIO if
 * the flag was set, otherwise 0.
 */
static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
                       struct extent_io_tree *dirty_pages)
{
        int ret = __btrfs_wait_marked_extents(fs_info, dirty_pages);
        const bool btree_err = test_and_clear_bit(BTRFS_FS_BTREE_ERR,
                                                  &fs_info->flags);

        if (ret)
                return ret;

        return btree_err ? -EIO : 0;
}

/*
 * Wait for the marked extents of a log root's dirty_log_pages tree and fold
 * in the log write error flag(s) selected by @mark.
 *
 * @log_root: must be a tree-log root (asserted)
 * @mark:     EXTENT_DIRTY selects BTRFS_FS_LOG1_ERR, EXTENT_NEW selects
 *            BTRFS_FS_LOG2_ERR; only the selected flags are tested/cleared
 *
 * Returns the wait error if there was one, otherwise -EIO if any selected
 * flag was set, otherwise 0.
 */
int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
{
        struct btrfs_fs_info *fs_info = log_root->fs_info;
        bool io_failed = false;
        int ret;

        ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID);

        ret = __btrfs_wait_marked_extents(fs_info, &log_root->dirty_log_pages);

        if (mark & EXTENT_DIRTY) {
                if (test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
                        io_failed = true;
        }

        if (mark & EXTENT_NEW) {
                if (test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
                        io_failed = true;
        }

        if (ret == 0 && io_failed)
                ret = -EIO;

        return ret;
}

/*
 * Write out and wait for all extents marked EXTENT_DIRTY in the dirty_pages
 * tree of the transaction behind @trans, then release that tree.
 *
 * The wait and the tree release are performed even if starting the writes
 * failed.
 *
 * @trans: transaction handle whose dirty pages we'd like to write
 *
 * Returns the write error if there was one, otherwise the wait error,
 * otherwise 0.
 */
static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages;
        struct blk_plug plug;
        int write_ret;
        int wait_ret;

        /* Submit all the writes under one block plug. */
        blk_start_plug(&plug);
        write_ret = btrfs_write_marked_extents(fs_info, dirty_pages,
                                               EXTENT_DIRTY);
        blk_finish_plug(&plug);

        wait_ret = btrfs_wait_extents(fs_info, dirty_pages);

        extent_io_tree_release(dirty_pages);

        /* A write error takes precedence over a wait error. */
        return write_ret ? write_ret : wait_ret;
}

/*
 * Update the root item of @root in the tree of tree roots.
 *
 * In the case of the extent allocation tree, updating the root pointer may
 * allocate blocks, which may in turn change the root of the extent allocation
 * tree.  So keep rewriting the root item until neither the root node's start
 * nor the root's used bytes changed while the item was being updated.
 *
 * Returns 0 on success or the error from btrfs_update_root().
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
{
        struct btrfs_root *tree_root = root->fs_info->tree_root;
        u64 prev_used = btrfs_root_used(&root->root_item);

        for (;;) {
                const u64 item_bytenr = btrfs_root_bytenr(&root->root_item);
                int ret;

                /* Stable: the item already describes the current root. */
                if (item_bytenr == root->node->start &&
                    prev_used == btrfs_root_used(&root->root_item))
                        return 0;

                btrfs_set_root_node(&root->root_item, root->node);
                ret = btrfs_update_root(trans, tree_root, &root->root_key,
                                        &root->root_item);
                if (ret)
                        return ret;

                prev_used = btrfs_root_used(&root->root_item);
        }
}

/*
 * Update all the cowonly tree roots on disk.
 *
 * Sequence: COW the root node of the tree root, flush device stats,
 * dev-replace state, qgroups and the space cache setup, then repeatedly
 * update every dirty cowonly root, run delayed refs and write dirty block
 * groups until no dirty cowonly root is left.
 *
 * The error handling in this function may not be obvious. Any of the
 * failures will cause the file system to go offline. We still need
 * to clean up the delayed refs.
 *
 * Returns 0 on success or the first error returned by one of the steps.
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
        struct list_head *io_bgs = &trans->transaction->io_bgs;
        struct list_head *next;
        struct extent_buffer *eb;
        int ret;

        /*
         * At this point no one can be using this transaction to modify any tree
         * and no one can start another transaction to modify any tree either.
         */
        ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);

        /*
         * COW the root node of the tree root.  The buffer is unlocked and
         * released whether or not the COW succeeded; ret is checked after.
         */
        eb = btrfs_lock_root_node(fs_info->tree_root);
        ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
                              0, &eb, BTRFS_NESTING_COW);
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);

        if (ret)
                return ret;

        ret = btrfs_run_dev_stats(trans);
        if (ret)
                return ret;
        ret = btrfs_run_dev_replace(trans);
        if (ret)
                return ret;
        ret = btrfs_run_qgroups(trans);
        if (ret)
                return ret;

        ret = btrfs_setup_space_cache(trans);
        if (ret)
                return ret;

again:
        /*
         * Take each root off the dirty cowonly list, clear its dirty bit,
         * queue it on the transaction's switch_commits list and write its
         * root item.
         */
        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                struct btrfs_root *root;
                next = fs_info->dirty_cowonly_roots.next;
                list_del_init(next);
                root = list_entry(next, struct btrfs_root, dirty_list);
                clear_bit(BTRFS_ROOT_DIRTY, &root->state);

                list_add_tail(&root->dirty_list,
                              &trans->transaction->switch_commits);
                ret = update_cowonly_root(trans, root);
                if (ret)
                        return ret;
        }

        /* Now flush any delayed refs generated by updating all of the roots */
        ret = btrfs_run_delayed_refs(trans, U64_MAX);
        if (ret)
                return ret;

        while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
                ret = btrfs_write_dirty_block_groups(trans);
                if (ret)
                        return ret;

                /*
                 * We're writing the dirty block groups, which could generate
                 * delayed refs, which could generate more dirty block groups,
                 * so we want to keep this flushing in this loop to make sure
                 * everything gets run.
                 */
                ret = btrfs_run_delayed_refs(trans, U64_MAX);
                if (ret)
                        return ret;
        }

        /*
         * The work above may have put roots back on the dirty cowonly list;
         * start over until the list stays empty.
         */
        if (!list_empty(&fs_info->dirty_cowonly_roots))
                goto again;

        /* Update dev-replace pointer once everything is committed */
        fs_info->dev_replace.committed_cursor_left =
                fs_info->dev_replace.cursor_left_last_write_of_item;

        return 0;
}

/*
 * If we had a pending drop we need to see if there are any others left in our
 * dead roots list, and if not clear our bit and wake any waiters.
 */
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
        /*
         * We put the drop in progress roots at the front of the list, so if the
         * first entry doesn't have UNFINISHED_DROP set we can wake everybody
         * up.
         */
        spin_lock(&fs_info->trans_lock);
        if (!list_empty(&fs_info->dead_roots)) {
                struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
                                                           struct btrfs_root,
                                                           root_list);
                if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
                        spin_unlock(&fs_info->trans_lock);
                        return;
                }
        }
        spin_unlock(&fs_info->trans_lock);

        btrfs_wake_unfinished_drop(fs_info);
}

/*
 * Dead roots are old snapshots that need to be deleted.  Queue @root on
 * fs_info->dead_roots, taking a reference with btrfs_grab_root(), unless it
 * is already linked on some list through root_list.
 *
 * Roots with BTRFS_ROOT_UNFINISHED_DROP set go to the head of the list,
 * all others to the tail.
 */
void btrfs_add_dead_root(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;

        spin_lock(&fs_info->trans_lock);

        /* Already queued: nothing to do. */
        if (!list_empty(&root->root_list))
                goto unlock;

        btrfs_grab_root(root);

        /* We want to process the partially complete drops first. */
        if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
                list_add(&root->root_list, &fs_info->dead_roots);
        else
                list_add_tail(&root->root_list, &fs_info->dead_roots);

unlock:
        spin_unlock(&fs_info->trans_lock);
}

/*
 * Update each subvolume root and its relocation root, if it exists, in the tree
 * of tree roots. Also free log roots if they exist.
 *
 * Roots are found in batches by looking up entries tagged with
 * BTRFS_ROOT_TRANS_TAG in fs_roots_radix; the tag is cleared for each root as
 * it is processed, so the loop ends once no tagged root remains.
 *
 * Returns 0 on success or the first error from btrfs_update_reloc_root() or
 * btrfs_update_root().
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *gang[8];
        int i;
        int ret;

        /*
         * At this point no one can be using this transaction to modify any tree
         * and no one can start another transaction to modify any tree either.
         */
        ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);

        spin_lock(&fs_info->fs_roots_radix_lock);
        while (1) {
                /* Always look up from index 0: processed roots lose the tag. */
                ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
                                                 (void **)gang, 0,
                                                 ARRAY_SIZE(gang),
                                                 BTRFS_ROOT_TRANS_TAG);
                if (ret == 0)
                        break;
                for (i = 0; i < ret; i++) {
                        struct btrfs_root *root = gang[i];
                        int ret2;

                        /*
                         * At this point we can neither have tasks logging inodes
                         * from a root nor trying to commit a log tree.
                         */
                        ASSERT(atomic_read(&root->log_writers) == 0);
                        ASSERT(atomic_read(&root->log_commit[0]) == 0);
                        ASSERT(atomic_read(&root->log_commit[1]) == 0);

                        radix_tree_tag_clear(&fs_info->fs_roots_radix,
                                        (unsigned long)btrfs_root_id(root),
                                        BTRFS_ROOT_TRANS_TAG);
                        btrfs_qgroup_free_meta_all_pertrans(root);
                        /*
                         * The radix lock is dropped for the tree updates
                         * below and re-taken at the end of the iteration, so
                         * the error returns in between leave with the lock
                         * already released.
                         */
                        spin_unlock(&fs_info->fs_roots_radix_lock);

                        btrfs_free_log(trans, root);
                        ret2 = btrfs_update_reloc_root(trans, root);
                        if (ret2)
                                return ret2;

                        /* see comments in should_cow_block() */
                        clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
                        smp_mb__after_atomic();

                        /*
                         * The root node changed since the last commit: queue
                         * the root on switch_commits and record the new node
                         * in its root item.
                         */
                        if (root->commit_root != root->node) {
                                list_add_tail(&root->dirty_list,
                                        &trans->transaction->switch_commits);
                                btrfs_set_root_node(&root->root_item,
                                                    root->node);
                        }

                        ret2 = btrfs_update_root(trans, fs_info->tree_root,
                                                &root->root_key,
                                                &root->root_item);
                        if (ret2)
                                return ret2;
                        spin_lock(&fs_info->fs_roots_radix_lock);
                }
        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
        return 0;
}

/*
 * Do all the special snapshot-related qgroup dirty hacks.
 *
 * Performs the needed qgroup inherit plus a simplified commit (switching
 * commit roots inside one transaction and writing all btree blocks to disk)
 * so that qgroup accounting works for the new snapshot.
 *
 * @trans:        transaction handle
 * @src:          root being snapshotted
 * @parent:       root that will hold the new snapshot's directory entry
 * @inherit:      qgroup inherit information passed to btrfs_qgroup_inherit()
 * @dst_objectid: root id of the new snapshot
 *
 * Returns 0 when full qgroup accounting is not active or on success, and an
 * error code otherwise.
 */
static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *src,
                                   struct btrfs_root *parent,
                                   struct btrfs_qgroup_inherit *inherit,
                                   u64 dst_objectid)
{
        struct btrfs_fs_info *fs_info = src->fs_info;
        int ret;

        /*
         * Save some performance in the case that qgroups are not enabled. If
         * this check races with the ioctl, rescan will kick in anyway.
         */
        if (!btrfs_qgroup_full_accounting(fs_info))
                return 0;

        /*
         * Ensure dirty @src will be committed.  Otherwise, after the upcoming
         * commit_fs_roots() and switch_commit_roots(), any dirty but not
         * recorded root will never be updated again, causing an outdated root
         * item.
         */
        ret = record_root_in_trans(trans, src, 1);
        if (ret)
                return ret;

        /*
         * btrfs_qgroup_inherit relies on a consistent view of the usage for the
         * src root, so we must run the delayed refs here.
         *
         * However this isn't particularly fool proof, because there's no
         * synchronization keeping us from changing the tree after this point
         * before we do the qgroup_inherit, or even from making changes while
         * we're doing the qgroup_inherit.  But that's a problem for the future,
         * for now flush the delayed refs to narrow the race window where the
         * qgroup counters could end up wrong.
         */
        ret = btrfs_run_delayed_refs(trans, U64_MAX);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        ret = commit_fs_roots(trans);
        if (ret)
                goto out;
        ret = btrfs_qgroup_account_extents(trans);
        if (ret < 0)
                goto out;

        /* Now qgroup are all updated, we can inherit it to new qgroups */
        ret = btrfs_qgroup_inherit(trans, btrfs_root_id(src), dst_objectid,
                                   btrfs_root_id(parent), inherit);
        if (ret < 0)
                goto out;

        /*
         * Now we do a simplified commit transaction, which will:
         * 1) commit all subvolume and extent tree
         *    To ensure all subvolume and extent tree have a valid
         *    commit_root to accounting later insert_dir_item()
         * 2) write all btree blocks onto disk
         *    This is to make sure later btree modification will be cowed
         *    Or commit_root can be populated and cause wrong qgroup numbers
         * In this simplified commit, we don't really care about other trees
         * like chunk and root tree, as they won't affect qgroup.
         * And we don't write super to avoid half committed status.
         */
        ret = commit_cowonly_roots(trans);
        if (ret)
                goto out;
        switch_commit_roots(trans);
        ret = btrfs_write_and_wait_transaction(trans);
        /* A write-out failure is reported as a filesystem error; ret stays set. */
        if (ret)
                btrfs_handle_fs_error(fs_info, ret,
                        "Error while writing out transaction for qgroup");

out:
        /*
         * Force parent root to be updated, as we recorded it before so its
         * last_trans == cur_transid.
         * Otherwise it won't be committed again onto disk after the later
         * insert_dir_item().
         * This is only done when everything above succeeded.
         */
        if (!ret)
                ret = record_root_in_trans(trans, parent, 1);
        return ret;
}

/*
 * New snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation of one pending snapshot.
 *
 * The path and root item preallocated in @pending are consumed here: both are
 * freed and their pointers in @pending cleared before returning, on every
 * path.
 *
 * Note:
 * If an error happens that may affect the commit of the current transaction,
 * the error number is returned (and also stored in pending->error). If the
 * error only affects the creation of this pending snapshot, it is stored in
 * pending->error and 0 is returned.
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                   struct btrfs_pending_snapshot *pending)
{

        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_key key;
        struct btrfs_root_item *new_root_item;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root *root = pending->root;
        struct btrfs_root *parent_root;
        struct btrfs_block_rsv *rsv;
        struct inode *parent_inode = pending->dir;
        struct btrfs_path *path;
        struct btrfs_dir_item *dir_item;
        struct extent_buffer *tmp;
        struct extent_buffer *old;
        struct timespec64 cur_time;
        int ret = 0;
        u64 to_reserve = 0;
        u64 index = 0;
        u64 objectid;
        u64 root_flags;
        unsigned int nofs_flags;
        struct fscrypt_name fname;

        ASSERT(pending->path);
        path = pending->path;

        ASSERT(pending->root_item);
        new_root_item = pending->root_item;

        /*
         * We're inside a transaction and must make sure that any potential
         * allocations with GFP_KERNEL in fscrypt won't recurse back to
         * filesystem.
         */
        nofs_flags = memalloc_nofs_save();
        pending->error = fscrypt_setup_filename(parent_inode,
                                                &pending->dentry->d_name, 0,
                                                &fname);
        memalloc_nofs_restore(nofs_flags);
        /* Snapshot-only failure: ret is still 0, so the commit goes on. */
        if (pending->error)
                goto free_pending;

        pending->error = btrfs_get_free_objectid(tree_root, &objectid);
        if (pending->error)
                goto free_fname;

        /*
         * Make qgroup to skip current new snapshot's qgroupid, as it is
         * accounted by later btrfs_qgroup_inherit().
         */
        btrfs_set_skip_qgroup(trans, objectid);

        btrfs_reloc_pre_snapshot(pending, &to_reserve);

        if (to_reserve > 0) {
                pending->error = btrfs_block_rsv_add(fs_info,
                                                     &pending->block_rsv,
                                                     to_reserve,
                                                     BTRFS_RESERVE_NO_FLUSH);
                if (pending->error)
                        goto clear_skip_qgroup;
        }

        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_ROOT_ITEM_KEY;

        /*
         * Switch the handle over to the snapshot's own block reserve for the
         * rest of this function; the previous one is saved in rsv and put
         * back at the dir_item_existed label.
         */
        rsv = trans->block_rsv;
        trans->block_rsv = &pending->block_rsv;
        trans->bytes_reserved = trans->block_rsv->reserved;
        trace_btrfs_space_reservation(fs_info, "transaction",
                                      trans->transid,
                                      trans->bytes_reserved, 1);
        parent_root = BTRFS_I(parent_inode)->root;
        ret = record_root_in_trans(trans, parent_root, 0);
        if (ret)
                goto fail;
        cur_time = current_time(parent_inode);

        /*
         * insert the directory item
         */
        ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }

        /*
         * Check if there is a file/dir which has the same name.  A name
         * collision only fails this snapshot (-EEXIST in pending->error, ret
         * stays 0); a lookup error aborts the transaction.
         */
        dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
                                         btrfs_ino(BTRFS_I(parent_inode)),
                                         &fname.disk_name, 0);
        if (dir_item != NULL && !IS_ERR(dir_item)) {
                pending->error = -EEXIST;
                goto dir_item_existed;
        } else if (IS_ERR(dir_item)) {
                ret = PTR_ERR(dir_item);
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }
        btrfs_release_path(path);

        /* An already existing qgroup for the new root id is not an error. */
        ret = btrfs_create_qgroup(trans, objectid);
        if (ret && ret != -EEXIST) {
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }

        /*
         * pull in the delayed directory update
         * and the delayed inode item
         * otherwise we corrupt the FS during
         * snapshot
         */
        ret = btrfs_run_delayed_items(trans);
        if (ret) {        /* Transaction aborted */
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }

        ret = record_root_in_trans(trans, root, 0);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }
        /*
         * Build the new root item as a copy of the source root's item, then
         * adjust flags, uuids, generation and timestamps for the snapshot.
         */
        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
        btrfs_check_and_init_root_item(new_root_item);

        root_flags = btrfs_root_flags(new_root_item);
        if (pending->readonly)
                root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
        else
                root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
        btrfs_set_root_flags(new_root_item, root_flags);

        btrfs_set_root_generation_v2(new_root_item,
                        trans->transid);
        generate_random_guid(new_root_item->uuid);
        memcpy(new_root_item->parent_uuid, root->root_item.uuid,
                        BTRFS_UUID_SIZE);
        /* A writable snapshot does not keep the source's send/receive info. */
        if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
                memset(new_root_item->received_uuid, 0,
                       sizeof(new_root_item->received_uuid));
                memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
                memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
                btrfs_set_root_stransid(new_root_item, 0);
                btrfs_set_root_rtransid(new_root_item, 0);
        }
        btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
        btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
        btrfs_set_root_otransid(new_root_item, trans->transid);

        /* COW the source root node, then copy it as the snapshot's root node. */
        old = btrfs_lock_root_node(root);
        ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
                              BTRFS_NESTING_COW);
        if (ret) {
                btrfs_tree_unlock(old);
                free_extent_buffer(old);
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }

        ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
        /* clean up in any case */
        btrfs_tree_unlock(old);
        free_extent_buffer(old);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }
        /* see comments in should_cow_block() */
        set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
        smp_wmb();

        btrfs_set_root_node(new_root_item, tmp);
        /* record when the snapshot was created in key.offset */
        key.offset = trans->transid;
        ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
        /* tmp is unlocked and released whether or not the insert succeeded. */
        btrfs_tree_unlock(tmp);
        free_extent_buffer(tmp);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }

        /*
         * insert root back/forward references
         */
        ret = btrfs_add_root_ref(trans, objectid,
                                 btrfs_root_id(parent_root),
                                 btrfs_ino(BTRFS_I(parent_inode)), index,
                                 &fname.disk_name);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }

        key.offset = (u64)-1;
        pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev);
        if (IS_ERR(pending->snap)) {
                ret = PTR_ERR(pending->snap);
                pending->snap = NULL;
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }

        ret = btrfs_reloc_post_snapshot(trans, pending);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }

        /*
         * Do special qgroup accounting for snapshot, as we do some qgroup
         * snapshot hack to do fast snapshot.
         * To co-operate with that hack, we do hack again.
         * Or snapshot will be greatly slowed down by a subtree qgroup rescan
         */
        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL)
                ret = qgroup_account_snapshot(trans, root, parent_root,
                                              pending->inherit, objectid);
        else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
                ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid,
                                           btrfs_root_id(parent_root), pending->inherit);
        if (ret < 0)
                goto fail;

        ret = btrfs_insert_dir_item(trans, &fname.disk_name,
                                    BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
                                    index);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }

        /* Account the new entry in the parent directory and update its times. */
        btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
                                                  fname.disk_name.len * 2);
        inode_set_mtime_to_ts(parent_inode,
                              inode_set_ctime_current(parent_inode));
        ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode));
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }
        ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
                                  BTRFS_UUID_KEY_SUBVOL,
                                  objectid);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto fail;
        }
        if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
                ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
                                          BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                          objectid);
                /*
                 * NOTE(review): -EEXIST does not abort here, but ret is left
                 * as -EEXIST and is returned below - confirm that is intended.
                 */
                if (ret && ret != -EEXIST) {
                        btrfs_abort_transaction(trans, ret);
                        goto fail;
                }
        }

        /*
         * The labels below are also reached by falling through on success.
         * Each one undoes a step taken above, in reverse order.
         */
fail:
        pending->error = ret;
dir_item_existed:
        /* Put back the block reserve the handle had on entry. */
        trans->block_rsv = rsv;
        trans->bytes_reserved = 0;
clear_skip_qgroup:
        btrfs_clear_skip_qgroup(trans);
free_fname:
        fscrypt_free_filename(&fname);
free_pending:
        kfree(new_root_item);
        pending->root_item = NULL;
        btrfs_free_path(path);
        pending->path = NULL;

        return ret;
}

/*
 * Create all the snapshots scheduled on the transaction's pending_snapshots
 * list.
 *
 * Each entry is unlinked from the list before it is processed.  Processing
 * stops at the first entry for which create_pending_snapshot() returns an
 * error, leaving the remaining entries on the list.
 *
 * Returns 0 on success or that first error.
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans)
{
        struct list_head *head = &trans->transaction->pending_snapshots;
        struct btrfs_pending_snapshot *snap;
        struct btrfs_pending_snapshot *tmp;

        list_for_each_entry_safe(snap, tmp, head, list) {
                int ret;

                list_del(&snap->list);
                ret = create_pending_snapshot(trans, snap);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * Copy the location, generation and level of the chunk root and the tree
 * root from their root items into fs_info->super_copy.
 *
 * The fields are assigned directly from the root items.  The cache and uuid
 * tree generations are only touched when the corresponding option or flag is
 * set.
 */
static void update_super_roots(struct btrfs_fs_info *fs_info)
{
        struct btrfs_super_block *sb = fs_info->super_copy;
        struct btrfs_root_item *chunk_item = &fs_info->chunk_root->root_item;
        struct btrfs_root_item *tree_item = &fs_info->tree_root->root_item;

        /* Chunk root. */
        sb->chunk_root = chunk_item->bytenr;
        sb->chunk_root_generation = chunk_item->generation;
        sb->chunk_root_level = chunk_item->level;

        /* Tree root. */
        sb->root = tree_item->bytenr;
        sb->generation = tree_item->generation;
        sb->root_level = tree_item->level;

        if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
                sb->cache_generation = tree_item->generation;
        } else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags)) {
                sb->cache_generation = 0;
        }

        if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
                sb->uuid_tree_generation = tree_item->generation;
}

/*
 * Report whether the currently running transaction is blocked.
 *
 * Returns the result of is_transaction_blocked() for the running transaction,
 * sampled under fs_info->trans_lock, or 0 when no transaction is running.
 */
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
        struct btrfs_transaction *running;
        int blocked = 0;

        spin_lock(&info->trans_lock);
        running = info->running_transaction;
        if (running)
                blocked = is_transaction_blocked(running);
        spin_unlock(&info->trans_lock);

        return blocked;
}

/*
 * Request a commit of the transaction @trans belongs to without performing
 * the commit in this task.
 *
 * The transaction kthread is woken up to do the commit, the handle is ended,
 * and the caller then sleeps only until the commit has reached
 * TRANS_STATE_COMMIT_START or the transaction was aborted.
 */
void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_transaction *target;

        /* Flag the commit request and wake the transaction kthread. */
        set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
        wake_up_process(fs_info->transaction_kthread);

        /*
         * Hold our own reference on the transaction: it is still inspected
         * below, after the handle has been ended.
         */
        target = trans->transaction;
        refcount_inc(&target->use_count);

        btrfs_end_transaction(trans);

        /*
         * Sleep until the commit has started and blocks subsequent joins,
         * or until the transaction got aborted.
         */
        btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
        wait_event(fs_info->transaction_blocked_wait,
                   target->state >= TRANS_STATE_COMMIT_START ||
                   TRANS_ABORTED(target));
        btrfs_put_transaction(target);
}

/*
 * Tear down a transaction whose commit failed with @err.
 *
 * Aborts the transaction, waits until the caller is the only remaining writer
 * (when the transaction is still the running one), unlinks the transaction
 * from the list of transactions, runs btrfs_cleanup_one_transaction() on it,
 * drops two references on the transaction and finally frees the handle
 * @trans. @trans must not be used after this returns.
 */
static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_transaction *cur_trans = trans->transaction;

        WARN_ON(refcount_read(&trans->use_count) > 1);

        btrfs_abort_transaction(trans, err);

        spin_lock(&fs_info->trans_lock);

        /*
         * If the transaction is removed from the list, it means this
         * transaction has been committed successfully, so it is impossible
         * to call the cleanup function.
         */
        BUG_ON(list_empty(&cur_trans->list));

        if (cur_trans == fs_info->running_transaction) {
                cur_trans->state = TRANS_STATE_COMMIT_DOING;
                /* Drop the spinlock: wait_event() below may sleep. */
                spin_unlock(&fs_info->trans_lock);

                /*
                 * The thread has already released the lockdep map as a
                 * reader in btrfs_commit_transaction().
                 */
                btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
                wait_event(cur_trans->writer_wait,
                           atomic_read(&cur_trans->num_writers) == 1);

                spin_lock(&fs_info->trans_lock);
        }

        /*
         * Now that we know no one else is still using the transaction we can
         * remove the transaction from the list of transactions. This avoids
         * the transaction kthread from cleaning up the transaction while some
         * other task is still using it, which could result in a use-after-free
         * on things like log trees, as it forces the transaction kthread to
         * wait for this transaction to be cleaned up by us.
         */
        list_del_init(&cur_trans->list);

        spin_unlock(&fs_info->trans_lock);

        btrfs_cleanup_one_transaction(trans->transaction, fs_info);

        spin_lock(&fs_info->trans_lock);
        if (cur_trans == fs_info->running_transaction)
                fs_info->running_transaction = NULL;
        spin_unlock(&fs_info->trans_lock);

        if (trans->type & __TRANS_FREEZABLE)
                sb_end_intwrite(fs_info->sb);
        /*
         * NOTE(review): two references are dropped, mirroring the double put
         * at the end of a successful btrfs_commit_transaction(); which
         * holders they correspond to is not visible here - confirm.
         */
        btrfs_put_transaction(cur_trans);
        btrfs_put_transaction(cur_trans);

        trace_btrfs_transaction_commit(fs_info);

        if (current->journal_info == trans)
                current->journal_info = NULL;

        /*
         * If relocation is running, we can't cancel scrub because that will
         * result in a deadlock. Before relocating a block group, relocation
         * pauses scrub, then starts and commits a transaction before unpausing
         * scrub. If the transaction commit is being done by the relocation
         * task or triggered by another task and the relocation task is waiting
         * for the commit, and we end up here due to an error in the commit
         * path, then calling btrfs_scrub_cancel() will deadlock, as we are
         * asking for scrub to stop while having it asked to be paused higher
         * above in relocation code.
         */
        if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
                btrfs_scrub_cancel(fs_info);

        /* The handle is gone after this point. */
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
}

/*
 * Walk the handle's list of new block groups (trans->new_bgs): for each one,
 * give back its reserved delayed refs space and unlink it from the list.
 */
static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
{
        struct btrfs_block_group *bg, *next;

        list_for_each_entry_safe(bg, next, &trans->new_bgs, bg_list) {
                btrfs_dec_delayed_refs_rsv_bg_inserts(trans->fs_info);
                list_del_init(&bg->bg_list);
        }
}

/*
 * Kick off writeback of dirty inodes when the FLUSHONCOMMIT mount option is
 * set; otherwise do nothing. Always returns 0.
 */
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
        if (!btrfs_test_opt(fs_info, FLUSHONCOMMIT))
                return 0;

        /*
         * try_to_writeback_inodes_sb() is used rather than
         * btrfs_start_delalloc_roots because the latter would deadlock with
         * fs freeze: we currently hold the fs freeze lock, and an async
         * flush would call btrfs_join_transaction() and then block waiting
         * for that same lock. Flushing directly benefits from already being
         * in a transaction, so our join_transaction does not have to re-take
         * the fs freeze lock.
         *
         * try_to_writeback_inodes_sb() only triggers writeback if it can
         * read lock sb->s_umount. That always succeeds except while the
         * filesystem is being unmounted or frozen, and in those cases
         * sync_filesystem() is called, which ends up in
         * writeback_inodes_sb() with sb->s_umount write locked.
         * writeback_inodes_sb() is not called directly here because it
         * emits a warning if sb->s_umount is not locked.
         */
        try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);

        return 0;
}

/*
 * Counterpart of btrfs_start_delalloc_flush(): when FLUSHONCOMMIT is set,
 * wait for the ordered extents of all roots; otherwise do nothing.
 */
static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
        if (!btrfs_test_opt(fs_info, FLUSHONCOMMIT))
                return;

        btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}

/*
 * Add a pending snapshot associated with the given transaction handle to the
 * respective handle. This must be called after the transaction commit started
 * and while holding fs_info->trans_lock.
 * This serves to guarantee a caller of btrfs_commit_transaction() that it can
 * safely free the pending snapshot pointer in case btrfs_commit_transaction()
 * returns an error.
 */
static void add_pending_snapshot(struct btrfs_trans_handle *trans)
{
        struct btrfs_transaction *cur_trans = trans->transaction;

        if (!trans->pending_snapshot)
                return;

        lockdep_assert_held(&trans->fs_info->trans_lock);
        ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP);

        list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
}

/*
 * Fold the duration of one finished commit into fs_info->commit_stats:
 * bump the commit count, remember the duration as the most recent one, add it
 * to the running total and raise the recorded maximum if it was exceeded.
 * Durations are compared as u64.
 */
static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
{
        fs_info->commit_stats.commit_count++;
        fs_info->commit_stats.total_commit_dur += interval;
        fs_info->commit_stats.last_commit_dur = interval;

        if ((u64)fs_info->commit_stats.max_commit_dur < (u64)interval)
                fs_info->commit_stats.max_commit_dur = (u64)interval;
}

/*
 * Commit the transaction that @trans is attached to.
 *
 * If the transaction is already aborted, or an early flushing step fails, the
 * handle is simply ended and the error returned. If another task is already
 * committing this transaction (state >= TRANS_STATE_COMMIT_PREP), the handle
 * is ended and this task only waits for that commit to reach the wanted
 * state. Otherwise this task drives the commit itself, moving the transaction
 * through COMMIT_PREP, COMMIT_START, COMMIT_DOING, UNBLOCKED, SUPER_COMMITTED
 * and COMPLETED.
 *
 * @trans is consumed on every path: it is passed to btrfs_end_transaction(),
 * freed at the end of a successful commit, or freed by cleanup_transaction()
 * on failure. The caller must not use it afterwards.
 *
 * Returns 0 on success, otherwise the error of the step that failed or the
 * abort error recorded in the transaction.
 */
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_transaction *prev_trans = NULL;
        int ret;
        ktime_t start_time;
        ktime_t interval;

        ASSERT(refcount_read(&trans->use_count) == 1);
        btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);

        clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);

        /* Stop the commit early if ->aborted is set */
        if (TRANS_ABORTED(cur_trans)) {
                ret = cur_trans->aborted;
                goto lockdep_trans_commit_start_release;
        }

        btrfs_trans_release_metadata(trans);
        trans->block_rsv = NULL;

        /*
         * We only want one transaction commit doing the flushing so we do not
         * waste a bunch of time on lock contention on the extent root node.
         */
        if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING,
                              &cur_trans->delayed_refs.flags)) {
                /*
                 * Make a pass through all the delayed refs we have so far.
                 * Any running threads may add more while we are here.
                 */
                ret = btrfs_run_delayed_refs(trans, 0);
                if (ret)
                        goto lockdep_trans_commit_start_release;
        }

        btrfs_create_pending_block_groups(trans);

        if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
                int run_it = 0;

                /*
                 * This mutex is also taken before trying to set
                 * block groups readonly.  We need to make sure
                 * that nobody has set a block group readonly
                 * after extents from that block group have been
                 * allocated for cache files.  btrfs_set_block_group_ro
                 * will wait for the transaction to commit if it
                 * finds BTRFS_TRANS_DIRTY_BG_RUN set.
                 *
                 * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
                 * only one process starts all the block group IO.  It wouldn't
                 * hurt to have more than one go through, but there's no
                 * real advantage to it either.
                 */
                mutex_lock(&fs_info->ro_block_group_mutex);
                if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
                                      &cur_trans->flags))
                        run_it = 1;
                mutex_unlock(&fs_info->ro_block_group_mutex);

                if (run_it) {
                        ret = btrfs_start_dirty_block_groups(trans);
                        if (ret)
                                goto lockdep_trans_commit_start_release;
                }
        }

        spin_lock(&fs_info->trans_lock);
        /*
         * Someone else is already committing this transaction: hand over our
         * pending snapshot (if any), end our handle and wait for that commit
         * to reach the state we need.
         */
        if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) {
                enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;

                add_pending_snapshot(trans);

                spin_unlock(&fs_info->trans_lock);
                refcount_inc(&cur_trans->use_count);

                if (trans->in_fsync)
                        want_state = TRANS_STATE_SUPER_COMMITTED;

                btrfs_trans_state_lockdep_release(fs_info,
                                                  BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
                ret = btrfs_end_transaction(trans);
                wait_for_commit(cur_trans, want_state);

                if (TRANS_ABORTED(cur_trans))
                        ret = cur_trans->aborted;

                btrfs_put_transaction(cur_trans);

                return ret;
        }

        /* From here on this task is the one performing the commit. */
        cur_trans->state = TRANS_STATE_COMMIT_PREP;
        wake_up(&fs_info->transaction_blocked_wait);
        btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);

        /*
         * If there is a transaction before ours on the list, wait for it to
         * reach the wanted state before continuing.
         */
        if (cur_trans->list.prev != &fs_info->trans_list) {
                enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;

                if (trans->in_fsync)
                        want_state = TRANS_STATE_SUPER_COMMITTED;

                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
                if (prev_trans->state < want_state) {
                        refcount_inc(&prev_trans->use_count);
                        spin_unlock(&fs_info->trans_lock);

                        wait_for_commit(prev_trans, want_state);

                        ret = READ_ONCE(prev_trans->aborted);

                        btrfs_put_transaction(prev_trans);
                        if (ret)
                                goto lockdep_release;
                        spin_lock(&fs_info->trans_lock);
                }
        } else {
                /*
                 * The previous transaction was aborted and was already removed
                 * from the list of transactions at fs_info->trans_list. So we
                 * abort to prevent writing a new superblock that reflects a
                 * corrupt state (pointing to trees with unwritten nodes/leafs).
                 */
                if (BTRFS_FS_ERROR(fs_info)) {
                        spin_unlock(&fs_info->trans_lock);
                        ret = -EROFS;
                        goto lockdep_release;
                }
        }

        cur_trans->state = TRANS_STATE_COMMIT_START;
        wake_up(&fs_info->transaction_blocked_wait);
        spin_unlock(&fs_info->trans_lock);

        /*
         * Get the time spent on the work done by the commit thread and not
         * the time spent waiting on a previous commit
         */
        start_time = ktime_get_ns();

        extwriter_counter_dec(cur_trans, trans->type);

        ret = btrfs_start_delalloc_flush(fs_info);
        if (ret)
                goto lockdep_release;

        ret = btrfs_run_delayed_items(trans);
        if (ret)
                goto lockdep_release;

        /*
         * The thread has started/joined the transaction thus it holds the
         * lockdep map as a reader. It has to release it before acquiring the
         * lockdep map as a writer.
         */
        btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
        btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters);
        wait_event(cur_trans->writer_wait,
                   extwriter_counter_read(cur_trans) == 0);

        /* Some pending items might have been added after the previous flush. */
        ret = btrfs_run_delayed_items(trans);
        if (ret) {
                /*
                 * The extwriters map was already released above, so only the
                 * writers map is released here before the common cleanup.
                 */
                btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
                goto cleanup_transaction;
        }

        btrfs_wait_delalloc_flush(fs_info);

        /*
         * Wait for all ordered extents started by a fast fsync that joined this
         * transaction. Otherwise if this transaction commits before the ordered
         * extents complete we lose logged data after a power failure.
         */
        btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered);
        wait_event(cur_trans->pending_wait,
                   atomic_read(&cur_trans->pending_ordered) == 0);

        btrfs_scrub_pause(fs_info);
        /*
         * Ok now we need to make sure to block out any other joins while we
         * commit the transaction.  We could have started a join before setting
         * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
         */
        spin_lock(&fs_info->trans_lock);
        add_pending_snapshot(trans);
        cur_trans->state = TRANS_STATE_COMMIT_DOING;
        spin_unlock(&fs_info->trans_lock);

        /*
         * The thread has started/joined the transaction thus it holds the
         * lockdep map as a reader. It has to release it before acquiring the
         * lockdep map as a writer.
         */
        btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
        btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
        wait_event(cur_trans->writer_wait,
                   atomic_read(&cur_trans->num_writers) == 1);

        /*
         * Make lockdep happy by acquiring the state locks after
         * btrfs_trans_num_writers is released. If we acquired the state locks
         * before releasing the btrfs_trans_num_writers lock then lockdep would
         * complain because we did not follow the reverse order unlocking rule.
         */
        btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
        btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
        btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);

        /*
         * We've started the commit, clear the flag in case we were triggered to
         * do an async commit but somebody else started before the transaction
         * kthread could do the work.
         */
        clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);

        if (TRANS_ABORTED(cur_trans)) {
                ret = cur_trans->aborted;
                btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
                goto scrub_continue;
        }
        /*
         * The reloc mutex makes sure that we stop
         * the balancing code from coming in and moving
         * extents around in the middle of the commit.
         */
        mutex_lock(&fs_info->reloc_mutex);

        /*
         * We needn't worry about the delayed items because we will
         * deal with them in create_pending_snapshot(), which is the
         * core function of the snapshot creation.
         */
        ret = create_pending_snapshots(trans);
        if (ret)
                goto unlock_reloc;

        /*
         * We insert the dir indexes of the snapshots and update the inode
         * of the snapshots' parents after the snapshot creation, so there
         * are some delayed items which are not dealt with. Now deal with
         * them.
         *
         * We needn't worry that this operation will corrupt the snapshots,
         * because all the trees which are snapshotted will be forced to COW
         * the nodes and leaves.
         */
        ret = btrfs_run_delayed_items(trans);
        if (ret)
                goto unlock_reloc;

        ret = btrfs_run_delayed_refs(trans, U64_MAX);
        if (ret)
                goto unlock_reloc;

        /*
         * Make sure none of the code above managed to slip in a
         * delayed item.
         */
        btrfs_assert_delayed_root_empty(fs_info);

        WARN_ON(cur_trans != trans->transaction);

        ret = commit_fs_roots(trans);
        if (ret)
                goto unlock_reloc;

        /*
         * commit_fs_roots gets rid of all the tree log roots, it is now
         * safe to free the root of tree log roots.
         */
        btrfs_free_log_root_tree(trans, fs_info);

        /*
         * Since fs roots are all committed, we can get a quite accurate
         * new_roots. So let's do quota accounting.
         */
        ret = btrfs_qgroup_account_extents(trans);
        if (ret < 0)
                goto unlock_reloc;

        ret = commit_cowonly_roots(trans);
        if (ret)
                goto unlock_reloc;

        /*
         * The tasks which save the space cache and inode cache may also
         * update ->aborted, check it.
         */
        if (TRANS_ABORTED(cur_trans)) {
                ret = cur_trans->aborted;
                goto unlock_reloc;
        }

        cur_trans = fs_info->running_transaction;

        /*
         * Record the current root nodes in the root items and queue the
         * roots on the transaction's switch_commits list.
         */
        btrfs_set_root_node(&fs_info->tree_root->root_item,
                            fs_info->tree_root->node);
        list_add_tail(&fs_info->tree_root->dirty_list,
                      &cur_trans->switch_commits);

        btrfs_set_root_node(&fs_info->chunk_root->root_item,
                            fs_info->chunk_root->node);
        list_add_tail(&fs_info->chunk_root->dirty_list,
                      &cur_trans->switch_commits);

        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                btrfs_set_root_node(&fs_info->block_group_root->root_item,
                                    fs_info->block_group_root->node);
                list_add_tail(&fs_info->block_group_root->dirty_list,
                              &cur_trans->switch_commits);
        }

        switch_commit_roots(trans);

        ASSERT(list_empty(&cur_trans->dirty_bgs));
        ASSERT(list_empty(&cur_trans->io_bgs));
        update_super_roots(fs_info);

        /*
         * Clear the log root in the super copy, then snapshot the super copy
         * into super_for_commit.
         */
        btrfs_set_super_log_root(fs_info->super_copy, 0);
        btrfs_set_super_log_root_level(fs_info->super_copy, 0);
        memcpy(fs_info->super_for_commit, fs_info->super_copy,
               sizeof(*fs_info->super_copy));

        btrfs_commit_device_sizes(cur_trans);

        clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
        clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);

        btrfs_trans_release_chunk_metadata(trans);

        /*
         * Before changing the transaction state to TRANS_STATE_UNBLOCKED and
         * setting fs_info->running_transaction to NULL, lock tree_log_mutex to
         * make sure that before we commit our superblock, no other task can
         * start a new transaction and commit a log tree before we commit our
         * superblock. Anyone trying to commit a log tree locks this mutex before
         * writing its superblock.
         */
        mutex_lock(&fs_info->tree_log_mutex);

        spin_lock(&fs_info->trans_lock);
        cur_trans->state = TRANS_STATE_UNBLOCKED;
        fs_info->running_transaction = NULL;
        spin_unlock(&fs_info->trans_lock);
        mutex_unlock(&fs_info->reloc_mutex);

        wake_up(&fs_info->transaction_wait);
        btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);

        /* If we have features changed, wake up the cleaner to update sysfs. */
        if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) &&
            fs_info->cleaner_kthread)
                wake_up_process(fs_info->cleaner_kthread);

        ret = btrfs_write_and_wait_transaction(trans);
        if (ret) {
                btrfs_handle_fs_error(fs_info, ret,
                                      "Error while writing out transaction");
                mutex_unlock(&fs_info->tree_log_mutex);
                goto scrub_continue;
        }

        ret = write_all_supers(fs_info, 0);
        /*
         * The super is written, we can safely allow the tree-loggers
         * to go about their business.
         */
        mutex_unlock(&fs_info->tree_log_mutex);
        if (ret)
                goto scrub_continue;

        /*
         * We needn't acquire the lock here because there is no other task
         * which can change it.
         */
        cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
        wake_up(&cur_trans->commit_wait);
        btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);

        btrfs_finish_extent_commit(trans);

        if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
                btrfs_clear_space_info_full(fs_info);

        btrfs_set_last_trans_committed(fs_info, cur_trans->transid);
        /*
         * We needn't acquire the lock here because there is no other task
         * which can change it.
         */
        cur_trans->state = TRANS_STATE_COMPLETED;
        wake_up(&cur_trans->commit_wait);
        btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);

        spin_lock(&fs_info->trans_lock);
        list_del_init(&cur_trans->list);
        spin_unlock(&fs_info->trans_lock);

        btrfs_put_transaction(cur_trans);
        btrfs_put_transaction(cur_trans);

        if (trans->type & __TRANS_FREEZABLE)
                sb_end_intwrite(fs_info->sb);

        trace_btrfs_transaction_commit(fs_info);

        interval = ktime_get_ns() - start_time;

        btrfs_scrub_continue(fs_info);

        if (current->journal_info == trans)
                current->journal_info = NULL;

        kmem_cache_free(btrfs_trans_handle_cachep, trans);

        update_commit_stats(fs_info, interval);

        return ret;

        /*
         * Failure while reloc_mutex and all three state lockdep maps are
         * held: drop the mutex and the UNBLOCKED map, then fall through.
         */
unlock_reloc:
        mutex_unlock(&fs_info->reloc_mutex);
        btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
        /*
         * Release the remaining state lockdep maps and resume scrub, which
         * was paused before switching to COMMIT_DOING.
         */
scrub_continue:
        btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
        btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
        btrfs_scrub_continue(fs_info);
        /*
         * Common failure tail: give back the handle's reservations and
         * pending block groups, then let cleanup_transaction() tear down the
         * transaction and free the handle.
         */
cleanup_transaction:
        btrfs_trans_release_metadata(trans);
        btrfs_cleanup_pending_block_groups(trans);
        btrfs_trans_release_chunk_metadata(trans);
        trans->block_rsv = NULL;
        btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
        if (current->journal_info == trans)
                current->journal_info = NULL;
        cleanup_transaction(trans, ret);

        return ret;

        /*
         * Failure while both writer lockdep maps are still held as a reader:
         * release them before the common cleanup.
         */
lockdep_release:
        btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
        btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
        goto cleanup_transaction;

        /*
         * Failure before this task took over the commit: release the
         * COMMIT_PREP state lockdep map and just end the handle.
         */
lockdep_trans_commit_start_release:
        btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
        btrfs_end_transaction(trans);
        return ret;
}

/*
 * Take the first root off fs_info->dead_roots and drop it.
 *
 * Returns:
 * 0 if there were no dead roots at the time of the call, or if dropping the
 *   snapshot failed (the error itself is not propagated)
 * 1 if a root was dropped; there may be more to process, call again
 *
 * A dead root that shows up while one is being processed may be missed by a
 * 0 return. We don't mind, because btrfs_commit_super will poke the cleaner
 * thread and it will process it a few seconds later.
 */
int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *root;
        int mixed_backrefs;
        int ret;

        spin_lock(&fs_info->trans_lock);
        if (list_empty(&fs_info->dead_roots)) {
                spin_unlock(&fs_info->trans_lock);
                return 0;
        }
        root = list_first_entry(&fs_info->dead_roots,
                                struct btrfs_root, root_list);
        list_del_init(&root->root_list);
        spin_unlock(&fs_info->trans_lock);

        btrfs_debug(fs_info, "cleaner removing %llu", btrfs_root_id(root));

        btrfs_kill_all_delayed_nodes(root);

        /* The second argument is 1 only for roots using mixed backrefs. */
        mixed_backrefs = btrfs_header_backref_rev(root->node) >=
                         BTRFS_MIXED_BACKREF_REV;
        ret = btrfs_drop_snapshot(root, mixed_backrefs, 0);

        btrfs_put_root(root);

        /* Failures are reported as 0, same as "nothing left to do". */
        return ret >= 0;
}

/*
 * Mark the transaction as aborted; the file system is then set read-only.
 * This prevents new transactions from starting or trying to join this one.
 *
 * Error recovery at the call site is therefore limited to freeing any local
 * memory allocations and passing the error code up without further cleanup.
 * The transaction should complete as it normally would in the call path but
 * will return -EIO. The remaining cleanup happens in btrfs_end_transaction
 * and btrfs_commit_transaction.
 *
 * @error is recorded in both the handle and its transaction. Space info is
 * dumped when this is the first hit of an -ENOSPC abort. @function and @line
 * identify the call site and are passed on to __btrfs_handle_fs_error().
 */
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
                                      const char *function,
                                      unsigned int line, int error, bool first_hit)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;

        /* Publish the error on the handle and on the transaction itself. */
        WRITE_ONCE(trans->aborted, error);
        WRITE_ONCE(trans->transaction->aborted, error);

        if (error == -ENOSPC && first_hit)
                btrfs_dump_space_info_for_trans_abort(fs_info);

        /* Let everybody sleeping on this transaction notice the abort. */
        wake_up(&fs_info->transaction_wait);
        wake_up(&fs_info->transaction_blocked_wait);

        __btrfs_handle_fs_error(fs_info, function, line, error, NULL);
}

/*
 * Create the slab cache that transaction handles are allocated from.
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
int __init btrfs_transaction_init(void)
{
        btrfs_trans_handle_cachep = KMEM_CACHE(btrfs_trans_handle, SLAB_TEMPORARY);

        return btrfs_trans_handle_cachep ? 0 : -ENOMEM;
}

/* Destroy the transaction handle slab cache set up by btrfs_transaction_init(). */
void __cold btrfs_transaction_exit(void)
{
        kmem_cache_destroy(btrfs_trans_handle_cachep);
}

































































































































































































   51 
   53 













   52 




   53 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 */
#include <linux/sched/debug.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kdebug.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/kexec.h>
#include <linux/sysfs.h>
#include <linux/bug.h>
#include <linux/nmi.h>

#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>

/*
 * Printable names of the exception stacks, indexed by ESTACK_* value.
 * stack_type_name() indexes this table with (type - STACK_TYPE_EXCEPTION).
 */
static const char * const exception_stack_names[] = {
                [ ESTACK_DF        ]        = "#DF",
                [ ESTACK_NMI        ]        = "NMI",
                [ ESTACK_DB        ]        = "#DB",
                [ ESTACK_MCE        ]        = "#MC",
                [ ESTACK_VC        ]        = "#VC",
                [ ESTACK_VC2        ]        = "#VC2",
};

/*
 * Map a stack type to a printable name.
 *
 * Returns a constant string for the task, IRQ, softirq, entry and exception
 * stack types, or NULL for any other value of @type.
 */
const char *stack_type_name(enum stack_type type)
{
        BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);

        switch (type) {
        case STACK_TYPE_TASK:
                return "TASK";
        case STACK_TYPE_IRQ:
                return "IRQ";
        case STACK_TYPE_SOFTIRQ:
                return "SOFTIRQ";
        case STACK_TYPE_ENTRY:
                /*
                 * On 64-bit there is one generic entry stack shared by all
                 * the kernel entry points, including SYSENTER.
                 */
                return "ENTRY_TRAMPOLINE";
        default:
                break;
        }

        /* Exception stacks take their name from the table. */
        if (type < STACK_TYPE_EXCEPTION || type > STACK_TYPE_EXCEPTION_LAST)
                return NULL;

        return exception_stack_names[type - STACK_TYPE_EXCEPTION];
}

/**
 * struct estack_pages - Page descriptor for exception stacks
 * @offs:        Offset from the start of the exception stack area
 * @size:        Size of the exception stack; zero marks a page that belongs
 *                to no stack (guard page)
 * @type:        Type to store in the stack_info struct
 *                (STACK_TYPE_EXCEPTION + ESTACK_* index)
 */
struct estack_pages {
        u32        offs;
        u16        size;
        u16        type;
};

/*
 * Designated range initializer for the descriptor array: every page index
 * from the first to the last page covered by exception stack @st gets the
 * same offset, size and type.
 */
#define EPAGERANGE(st)                                                        \
        [PFN_DOWN(CEA_ESTACK_OFFS(st)) ...                                \
         PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = {        \
                .offs        = CEA_ESTACK_OFFS(st),                                \
                .size        = CEA_ESTACK_SIZE(st),                                \
                .type        = STACK_TYPE_EXCEPTION + ESTACK_ ##st, }

/*
 * Array of exception stack page descriptors, indexed by page number within
 * the exception stack area. If the stack is larger than PAGE_SIZE, all pages
 * covering a particular stack will have the same info. The guard pages,
 * including the unmapped DB2 stack, are left zeroed.
 */
static const
struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
        EPAGERANGE(DF),
        EPAGERANGE(NMI),
        EPAGERANGE(DB),
        EPAGERANGE(MCE),
        EPAGERANGE(VC),
        EPAGERANGE(VC2),
};

/*
 * Check whether @stack points into one of this CPU's exception stacks and,
 * if it does, describe that stack in @info.
 *
 * Returns false when the per-CPU exception stack area has not been set up
 * yet, when @stack lies outside that area, or when it hits a guard page.
 */
static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info)
{
        unsigned long area, lo, hi, addr = (unsigned long)stack;
        const struct estack_pages *desc;
        struct pt_regs *regs;
        unsigned int page;

        BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);

        /*
         * A zero base means the stack trace is being collected _before_
         * cea_exception_stacks had been initialized.
         */
        area = (unsigned long)__this_cpu_read(cea_exception_stacks);
        if (!area)
                return false;

        /* Bail if @stack is outside the exception stack area. */
        if (addr < area || addr >= area + sizeof(struct cea_exception_stacks))
                return false;

        /* The page offset from the start of the area selects the descriptor. */
        page = (addr - area) >> PAGE_SHIFT;
        desc = &estack_pages[page];

        /* Zero-sized descriptors denote guard pages. */
        if (!desc->size)
                return false;

        lo = area + (unsigned long)desc->offs;
        hi = lo + (unsigned long)desc->size;

        /* The next stack pointer is read from the pt_regs at the very top. */
        regs = (struct pt_regs *)hi - 1;

        info->type        = desc->type;
        info->begin        = (unsigned long *)lo;
        info->end        = (unsigned long *)hi;
        info->next_sp        = (unsigned long *)regs->sp;
        return true;
}

/*
 * Check whether @stack points into this CPU's hardirq stack and, if it
 * does, describe that stack in @info.
 */
static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
        /*
         * The per-CPU pointer refers directly to the top most stack entry
         * to avoid a -8 adjustment in the stack switch hotpath. Step one
         * slot up to obtain the exclusive end of the stack.
         */
        unsigned long *top = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr) + 1;
        unsigned long *bottom = top - (IRQ_STACK_SIZE / sizeof(long));

        /*
         * Due to the switching logic RSP can never be == @top because the
         * final operation is 'popq %rsp' which means after that RSP points
         * to the original stack and not to @top.
         */
        if (stack < bottom || stack >= top)
                return false;

        info->type        = STACK_TYPE_IRQ;
        info->begin        = bottom;
        info->end        = top;

        /*
         * The next stack pointer is stored at the top of the irq stack
         * before switching to the irq stack. Actual stack entries are all
         * below that.
         */
        info->next_sp = (unsigned long *)top[-1];

        return true;
}

/*
 * Identify the stack that @stack points into and describe it in @info.
 *
 * The task stack of @task is checked first. The exception, irq and entry
 * stacks are only consulted when @task is the current task. Returns true
 * as soon as one of the checks matches, false if none does.
 */
bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
                                    struct stack_info *info)
{
        if (in_task_stack(stack, task, info))
                return true;

        /* The remaining stacks are only looked up for the running task. */
        if (task != current)
                return false;

        return in_exception_stack(stack, info) ||
               in_irq_stack(stack, info) ||
               in_entry_stack(stack, info);
}

/*
 * Look up the stack that @stack points into on behalf of @task (the
 * current task when @task is NULL) and describe it in @info.
 *
 * When @visit_mask is non-NULL it records, one bit per stack type, which
 * stacks have already been reported; seeing a type twice is treated as
 * an error.
 *
 * Returns 0 on success. On failure info->type is set to
 * STACK_TYPE_UNKNOWN and -EINVAL is returned.
 */
int get_stack_info(unsigned long *stack, struct task_struct *task,
                   struct stack_info *info, unsigned long *visit_mask)
{
        unsigned long type_bit;

        if (!task)
                task = current;

        if (!stack || !get_stack_info_noinstr(stack, task, info))
                goto unknown;

        /* Without a visit mask there is nothing left to verify. */
        if (!visit_mask)
                return 0;

        /*
         * Make sure we don't iterate through any given stack more than once.
         * If it comes up a second time then there's something wrong going on:
         * just break out and report an unknown stack type.
         */
        type_bit = 1UL << info->type;
        if (!(*visit_mask & type_bit)) {
                *visit_mask |= type_bit;
                return 0;
        }

        if (task == current)
                printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);

unknown:
        info->type = STACK_TYPE_UNKNOWN;
        return -EINVAL;
}

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
























































































































































































































    3 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
#include <linux/srcu.h>
#include <linux/rw_hint.h>

struct blk_mq_tags;
struct blk_flush_queue;

/*
 * NOTE(review): presumably the minimum and the default number of requests
 * per request queue -- confirm against the users of these constants.
 */
#define BLKDEV_MIN_RQ        4
#define BLKDEV_DEFAULT_RQ        128

/*
 * Return value of an rq_end_io_fn completion callback.
 * NOTE(review): presumably RQ_END_IO_FREE asks the caller to free the
 * request while RQ_END_IO_NONE requests no further action -- confirm
 * against the callers in the block core.
 */
enum rq_end_io_ret {
        RQ_END_IO_NONE,
        RQ_END_IO_FREE,
};

/*
 * Completion callback type; pointers to it are stored in request.end_io
 * and request.flush.saved_end_io.
 */
typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);

/*
 * Request flags: bit values stored in request.rq_flags.
 */
typedef __u32 __bitwise req_flags_t;

/* drive already may have started this one */
#define RQF_STARTED                ((__force req_flags_t)(1 << 1))
/* request for flush sequence */
#define RQF_FLUSH_SEQ                ((__force req_flags_t)(1 << 4))
/* merge of different types, fail separately */
#define RQF_MIXED_MERGE                ((__force req_flags_t)(1 << 5))
/* don't call prep for this one */
#define RQF_DONTPREP                ((__force req_flags_t)(1 << 7))
/* use hctx->sched_tags */
#define RQF_SCHED_TAGS                ((__force req_flags_t)(1 << 8))
/* use an I/O scheduler for this request */
#define RQF_USE_SCHED                ((__force req_flags_t)(1 << 9))
/* vaguely specified driver internal error.  Ignored by the block layer */
#define RQF_FAILED                ((__force req_flags_t)(1 << 10))
/* don't warn about errors */
#define RQF_QUIET                ((__force req_flags_t)(1 << 11))
/* account into disk and partition IO statistics */
#define RQF_IO_STAT                ((__force req_flags_t)(1 << 13))
/* runtime pm request */
#define RQF_PM                        ((__force req_flags_t)(1 << 15))
/* on IO scheduler merge hash */
#define RQF_HASHED                ((__force req_flags_t)(1 << 16))
/* track IO completion time */
#define RQF_STATS                ((__force req_flags_t)(1 << 17))
/*
 * Look at ->special_vec for the actual data payload instead of the
 * bio chain.
 */
#define RQF_SPECIAL_PAYLOAD        ((__force req_flags_t)(1 << 18))
/* The request completion needs to be signaled to zone write plugging. */
#define RQF_ZONE_WRITE_PLUGGING        ((__force req_flags_t)(1 << 20))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT                ((__force req_flags_t)(1 << 21))
/* NOTE(review): presumably marks a request allocated from reserved tags -- confirm */
#define RQF_RESV                ((__force req_flags_t)(1 << 23))

/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
        (RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)

/* Life-cycle state of a request; this is the type of request.state. */
enum mq_rq_state {
        MQ_RQ_IDLE                = 0,
        MQ_RQ_IN_FLIGHT                = 1,
        MQ_RQ_COMPLETE                = 2,
};

/*
 * Try to put the fields that are referenced together in the same cacheline.
 *
 * If you modify this structure, make sure to update blk_rq_init() and
 * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
struct request {
        struct request_queue *q;
        struct blk_mq_ctx *mq_ctx;
        struct blk_mq_hw_ctx *mq_hctx;

        blk_opf_t cmd_flags;                /* op and common flags */
        req_flags_t rq_flags;                /* RQF_* flags */

        int tag;
        int internal_tag;

        unsigned int timeout;

        /* the following two fields are internal, NEVER access directly */
        unsigned int __data_len;        /* total data len */
        sector_t __sector;                /* sector cursor */

        struct bio *bio;
        struct bio *biotail;

        /*
         * List linkage. ->rq_next is the link followed by the rq_list_*()
         * helpers for singly linked request lists.
         */
        union {
                struct list_head queuelist;
                struct request *rq_next;
        };

        struct block_device *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
        /* Time that the first bio started allocating this request. */
        u64 alloc_time_ns;
#endif
        /* Time that this request was allocated for this IO. */
        u64 start_time_ns;
        /* Time that I/O was submitted to the device. */
        u64 io_start_time_ns;

#ifdef CONFIG_BLK_WBT
        unsigned short wbt_flags;
#endif
        /*
         * rq sectors used for blk stats. It has the same value
         * as blk_rq_sectors(rq), except that it is never zeroed
         * by completion.
         */
        unsigned short stats_sectors;

        /*
         * Number of scatter-gather DMA addr+len pairs after
         * physical address coalescing is performed.
         */
        unsigned short nr_phys_segments;

#ifdef CONFIG_BLK_DEV_INTEGRITY
        unsigned short nr_integrity_segments;
#endif

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        struct bio_crypt_ctx *crypt_ctx;
        struct blk_crypto_keyslot *crypt_keyslot;
#endif

        enum rw_hint write_hint;
        unsigned short ioprio;                /* returned by req_get_ioprio() */

        enum mq_rq_state state;
        atomic_t ref;

        unsigned long deadline;

        /*
         * The hash is used inside the scheduler, and killed once the
         * request reaches the dispatch list. The ipi_list is only used
         * to queue the request for softirq completion, which is long
         * after the request has been unhashed (and even removed from
         * the dispatch list).
         */
        union {
                struct hlist_node hash;        /* merge hash */
                struct llist_node ipi_list;
        };

        /*
         * The rb_node is only used inside the io scheduler, requests
         * are pruned when moved to the dispatch queue. special_vec must
         * only be used if RQF_SPECIAL_PAYLOAD is set, and those requests
         * cannot be inserted into an IO scheduler.
         */
        union {
                struct rb_node rb_node;        /* sort/lookup */
                struct bio_vec special_vec;
        };

        /*
         * Three pointers are available for the IO schedulers, if they need
         * more they have to dynamically allocate it.
         */
        struct {
                struct io_cq                *icq;
                void                        *priv[2];
        } elv;

        struct {
                unsigned int                seq;
                rq_end_io_fn                *saved_end_io;
        } flush;

        u64 fifo_time;

        /*
         * completion callback.
         */
        rq_end_io_fn *end_io;
        void *end_io_data;
};

static inline enum req_op req_op(const struct request *req)
{
        return req->cmd_flags & REQ_OP_MASK;
}

static inline bool blk_rq_is_passthrough(struct request *rq)
{
        return blk_op_is_passthrough(rq->cmd_flags);
}

static inline unsigned short req_get_ioprio(struct request *req)
{
        return req->ioprio;
}

/* Data direction of @rq: WRITE if op_is_write() holds for its operation, else READ. */
#define rq_data_dir(rq)                (op_is_write(req_op(rq)) ? WRITE : READ)

/* DMA direction of @rq: DMA_TO_DEVICE for write operations, else DMA_FROM_DEVICE. */
#define rq_dma_dir(rq) \
        (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)

/*
 * Push @rq onto the front of the singly linked request list whose head
 * pointer is *@listptr. Both arguments are evaluated twice, so they must
 * be free of side effects.
 */
#define rq_list_add(listptr, rq)        do {                \
        (rq)->rq_next = *(listptr);                        \
        *(listptr) = (rq);                                \
} while (0)

/*
 * Append @rq to a singly linked request list. *@lastpptr must point at the
 * link that currently terminates the list (the head pointer for an empty
 * list); it is advanced to @rq's own ->rq_next link so that the next call
 * appends after @rq.
 *
 * Every use of @rq is parenthesized so that expressions such as
 * "base + i" expand correctly. The arguments are evaluated more than once
 * and must be free of side effects.
 */
#define rq_list_add_tail(lastpptr, rq)        do {                \
        (rq)->rq_next = NULL;                                \
        **(lastpptr) = (rq);                                \
        *(lastpptr) = &(rq)->rq_next;                        \
} while (0)

/*
 * Detach and return the first request of the list headed by *@listptr.
 * Evaluates to NULL, leaving the list untouched, when @listptr is NULL or
 * the list is empty.
 */
#define rq_list_pop(listptr)                                \
({                                                        \
        struct request *__req = (listptr) ? *(listptr) : NULL;        \
        if (__req)                                        \
                *(listptr) = __req->rq_next;                \
        __req;                                                \
})

/*
 * Evaluate to the first request of the list headed by *@listptr without
 * modifying the list, or to NULL when @listptr is NULL or the list is
 * empty.
 */
#define rq_list_peek(listptr)                                \
({                                                        \
        struct request *__req = NULL;                        \
        if ((listptr) && *(listptr))                        \
                __req = *(listptr);                        \
        __req;                                                \
})

/*
 * Walk the list headed by *@listptr with @pos as cursor. The successor is
 * read from @pos after the loop body ran, so the body must not unlink @pos.
 */
#define rq_list_for_each(listptr, pos)                        \
        for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))

/*
 * Like rq_list_for_each(), but the successor is cached in @nxt before the
 * loop body runs, so the body may unlink or re-link @pos. @nxt is only
 * loaded from a non-NULL @pos, which keeps an empty list from
 * dereferencing a NULL pointer in the loop initializer.
 */
#define rq_list_for_each_safe(listptr, pos, nxt)                        \
        for (pos = rq_list_peek((listptr)),                                \
             nxt = pos ? rq_list_next(pos) : NULL;                        \
             pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL)

/* Successor of @rq on a singly linked request list; @rq must not be NULL. */
#define rq_list_next(rq)        ((rq)->rq_next)
/* True when the list head pointer @list is NULL, i.e. the list is empty. */
#define rq_list_empty(list)        ((list) == (struct request *) NULL)

/**
 * rq_list_move() - move a struct request from one list to another
 * @src: The source list @rq is currently in
 * @dst: The destination list; @rq is added with rq_list_add() and so
 *       becomes its new head
 * @rq: The request to move
 * @prev: The request preceding @rq in @src (NULL if @rq is the head)
 */
static inline void rq_list_move(struct request **src, struct request **dst,
                                struct request *rq, struct request *prev)
{
        /* The link currently pointing at @rq: @prev's, or the head of @src. */
        struct request **link = prev ? &prev->rq_next : src;

        *link = rq->rq_next;
        rq_list_add(dst, rq);
}

/**
 * enum blk_eh_timer_return - How the timeout handler should proceed
 * @BLK_EH_DONE: The block driver completed the command or will complete it at
 *        a later time.
 * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the
 *        request to complete.
 *
 * This is the return type of the blk_mq_ops.timeout callback.
 */
enum blk_eh_timer_return {
        BLK_EH_DONE,
        BLK_EH_RESET_TIMER,
};

/*
 * Tag allocation policies.
 * NOTE(review): presumably the values carried in the allocation policy
 * field of the BLK_MQ_F_* flags (BLK_ALLOC_POLICY_TO_MQ_FLAG()) -- confirm.
 */
#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
        struct {
                /** @lock: Protects the dispatch list. */
                spinlock_t                lock;
                /**
                 * @dispatch: Used for requests that are ready to be
                 * dispatched to the hardware but for some reason (e.g. lack of
                 * resources) could not be sent to the hardware. As soon as the
                 * driver can send new requests, requests on this list will
                 * be sent first for a fairer dispatch.
                 */
                struct list_head        dispatch;
                /**
                 * @state: BLK_MQ_S_* flags. Defines the state of the hw
                 * queue (active, scheduled to restart, stopped).
                 */
                unsigned long                state;
        } ____cacheline_aligned_in_smp;

        /**
         * @run_work: Used for scheduling a hardware queue run at a later time.
         */
        struct delayed_work        run_work;
        /** @cpumask: Map of available CPUs where this hctx can run. */
        cpumask_var_t                cpumask;
        /**
         * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
         * selection from @cpumask.
         */
        int                        next_cpu;
        /**
         * @next_cpu_batch: Counter of how many works are left in the batch
         * before changing to the next CPU.
         */
        int                        next_cpu_batch;

        /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
        unsigned long                flags;

        /**
         * @sched_data: Pointer owned by the IO scheduler attached to a request
         * queue. It's up to the IO scheduler how to use this pointer.
         */
        void                        *sched_data;
        /**
         * @queue: Pointer to the request queue that owns this hardware context.
         */
        struct request_queue        *queue;
        /** @fq: Queue of requests that need to perform a flush operation. */
        struct blk_flush_queue        *fq;

        /**
         * @driver_data: Pointer to data owned by the block driver that created
         * this hctx
         */
        void                        *driver_data;

        /**
         * @ctx_map: Bitmap for each software queue. If bit is on, there is a
         * pending request in that software queue.
         */
        struct sbitmap                ctx_map;

        /**
         * @dispatch_from: Software queue to be used when no scheduler was
         * selected.
         */
        struct blk_mq_ctx        *dispatch_from;
        /**
         * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
         * decide if the hw_queue is busy using Exponential Weighted Moving
         * Average algorithm.
         */
        unsigned int                dispatch_busy;

        /** @type: HCTX_TYPE_* flags. Type of hardware queue. */
        unsigned short                type;
        /** @nr_ctx: Number of software queues. */
        unsigned short                nr_ctx;
        /** @ctxs: Array of software queues. */
        struct blk_mq_ctx        **ctxs;

        /** @dispatch_wait_lock: Lock for dispatch_wait queue. */
        spinlock_t                dispatch_wait_lock;
        /**
         * @dispatch_wait: Waitqueue to put requests when there is no tag
         * available at the moment, to wait for another try in the future.
         */
        wait_queue_entry_t        dispatch_wait;

        /**
         * @wait_index: Index of next available dispatch_wait queue to insert
         * requests.
         */
        atomic_t                wait_index;

        /**
         * @tags: Tags owned by the block driver. A tag from this set is only
         * assigned when a request is dispatched from a hardware queue.
         */
        struct blk_mq_tags        *tags;
        /**
         * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
         * scheduler associated with a request queue, a tag is assigned when
         * that request is allocated. Else, this member is not used.
         */
        struct blk_mq_tags        *sched_tags;

        /** @numa_node: NUMA node the storage adapter has been connected to. */
        unsigned int                numa_node;
        /** @queue_num: Index of this hardware queue. */
        unsigned int                queue_num;

        /**
         * @nr_active: Number of active requests. Only used when a tag set is
         * shared across request queues.
         */
        atomic_t                nr_active;

        /** @cpuhp_online: List to store requests if a CPU is going to die. */
        struct hlist_node        cpuhp_online;
        /** @cpuhp_dead: List to store requests if some CPU dies. */
        struct hlist_node        cpuhp_dead;
        /** @kobj: Kernel object for sysfs. */
        struct kobject                kobj;

#ifdef CONFIG_BLK_DEBUG_FS
        /**
         * @debugfs_dir: debugfs directory for this hardware queue. Named
         * as cpu<cpu_number>.
         */
        struct dentry                *debugfs_dir;
        /** @sched_debugfs_dir:        debugfs directory for the scheduler. */
        struct dentry                *sched_debugfs_dir;
#endif

        /**
         * @hctx_list: if this hctx is not in use, this is an entry in
         * q->unused_hctx_list.
         */
        struct list_head        hctx_list;
};

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map:       CPU ID to hardware queue index map. This is an array
 *        with nr_cpu_ids elements. Each element has a value in the range
 *        [@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues:    Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *        driver to map each hardware queue type (enum hctx_type) onto a distinct
 *        set of hardware queues.
 *
 * struct blk_mq_tag_set embeds one of these per hardware queue type in its
 * @map array.
 */
struct blk_mq_queue_map {
        unsigned int *mq_map;
        unsigned int nr_queues;
        unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:        All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:        Just for READ I/O.
 * @HCTX_TYPE_POLL:        Polled I/O of any kind.
 * @HCTX_MAX_TYPES:        Number of types of hctx.
 *
 * @HCTX_MAX_TYPES is also the size of the blk_mq_tag_set.map array.
 */
enum hctx_type {
        HCTX_TYPE_DEFAULT,
        HCTX_TYPE_READ,
        HCTX_TYPE_POLL,

        HCTX_MAX_TYPES,
};

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @ops:           Pointers to functions that implement block driver behavior.
 * @map:           One or more ctx -> hctx mappings. One map exists for each
 *                   hardware queue type (enum hctx_type) that the driver wishes
 *                   to support. There are no restrictions on maps being of the
 *                   same size, and it's perfectly legal to share maps between
 *                   types.
 * @nr_maps:           Number of elements in the @map array. A number in the range
 *                   [1, HCTX_MAX_TYPES].
 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 *                   owns this data structure.
 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *                   allocations.
 * @cmd_size:           Number of additional bytes to allocate per request. The block
 *                   driver owns these additional bytes.
 * @numa_node:           NUMA node the storage adapter has been connected to.
 * @timeout:           Request processing timeout in jiffies.
 * @flags:           Zero or more BLK_MQ_F_* flags.
 * @driver_data:   Pointer to data owned by the block driver that created this
 *                   tag set.
 * @tags:           Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *                   elements.
 * @shared_tags:
 *                   Shared set of tags. Has @nr_hw_queues elements. If set,
 *                   shared by all @tags.
 *                   NOTE(review): the member is a single pointer, so the
 *                   "@nr_hw_queues elements" wording looks copied from
 *                   @tags -- confirm.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list:           List of the request queues that use this tag set. See also
 *                   request_queue.tag_set_list.
 * @srcu:           Use as lock when type of the request queue is blocking
 *                   (BLK_MQ_F_BLOCKING).
 */
struct blk_mq_tag_set {
        const struct blk_mq_ops        *ops;
        struct blk_mq_queue_map        map[HCTX_MAX_TYPES];
        unsigned int                nr_maps;
        unsigned int                nr_hw_queues;
        unsigned int                queue_depth;
        unsigned int                reserved_tags;
        unsigned int                cmd_size;
        int                        numa_node;
        unsigned int                timeout;
        unsigned int                flags;
        void                        *driver_data;

        struct blk_mq_tags        **tags;

        struct blk_mq_tags        *shared_tags;

        struct mutex                tag_list_lock;
        struct list_head        tag_list;
        struct srcu_struct        *srcu;
};

/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq:   Request pointer.
 * @last: If it is the last request in the queue.
 *
 * A pointer to this structure is the second argument of the
 * blk_mq_ops.queue_rq callback.
 */
struct blk_mq_queue_data {
        struct request *rq;
        bool last;
};

/*
 * Callback type taking a request and an opaque pointer.
 * NOTE(review): presumably invoked once per busy tag by the tag iterators,
 * with the bool return deciding whether iteration continues -- confirm.
 */
typedef bool (busy_tag_iter_fn)(struct request *, void *);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
        /**
         * @queue_rq: Queue a new request from block IO.
         */
        blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
                                 const struct blk_mq_queue_data *);

        /**
         * @commit_rqs: If a driver uses bd->last to judge when to submit
         * requests to hardware, it must define this function. In case of errors
         * that make us stop issuing further requests, this hook serves the
         * purpose of kicking the hardware (which the last request otherwise
         * would have done).
         */
        void (*commit_rqs)(struct blk_mq_hw_ctx *);

        /**
         * @queue_rqs: Queue a list of new requests. Driver is guaranteed
         * that each request belongs to the same queue. If the driver doesn't
         * empty the @rqlist completely, then the rest will be queued
         * individually by the block layer upon return.
         */
        void (*queue_rqs)(struct request **rqlist);

        /**
         * @get_budget: Reserve budget before queueing a request. Once
         * .queue_rq is run, it is the driver's responsibility to release the
         * reserved budget. Also we have to handle the failure case
         * of .get_budget to avoid I/O deadlock.
         */
        int (*get_budget)(struct request_queue *);

        /**
         * @put_budget: Release the reserved budget.
         */
        void (*put_budget)(struct request_queue *, int);

        /**
         * @set_rq_budget_token: store rq's budget token
         */
        void (*set_rq_budget_token)(struct request *, int);
        /**
         * @get_rq_budget_token: retrieve rq's budget token
         */
        int (*get_rq_budget_token)(struct request *);

        /**
         * @timeout: Called on request timeout.
         */
        enum blk_eh_timer_return (*timeout)(struct request *);

        /**
         * @poll: Called to poll for completion of a specific tag.
         */
        int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);

        /**
         * @complete: Mark the request as complete.
         */
        void (*complete)(struct request *);

        /**
         * @init_hctx: Called when the block layer side of a hardware queue has
         * been set up, allowing the driver to allocate/init matching
         * structures.
         */
        int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
        /**
         * @exit_hctx: Ditto for exit/teardown.
         */
        void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

        /**
         * @init_request: Called for every command allocated by the block layer
         * to allow the driver to set up driver specific data.
         *
         * Tag greater than or equal to queue_depth is for setting up
         * flush request.
         */
        int (*init_request)(struct blk_mq_tag_set *set, struct request *,
                            unsigned int, unsigned int);
        /**
         * @exit_request: Ditto for exit/teardown.
         */
        void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
                             unsigned int);

        /**
         * @cleanup_rq: Called before freeing one request which isn't completed
         * yet, and usually for freeing the driver private data.
         */
        void (*cleanup_rq)(struct request *);

        /**
         * @busy: If set, returns whether or not this queue currently is busy.
         */
        bool (*busy)(struct request_queue *);

        /**
         * @map_queues: This allows drivers to specify their own queue mapping
         * by overriding the setup-time function that builds the mq_map.
         */
        void (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
        /**
         * @show_rq: Used by the debugfs implementation to show driver-specific
         * information about a request.
         */
        void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};

/*
 * blk-mq constants, grouped by prefix:
 *
 *   BLK_MQ_F_*: flag bit masks (bit 4 is not assigned in this list), plus
 *               the position and width of the allocation-policy field used
 *               by BLK_MQ_FLAG_TO_ALLOC_POLICY() and
 *               BLK_ALLOC_POLICY_TO_MQ_FLAG().
 *   BLK_MQ_S_*: small consecutive integers rather than masks; presumably
 *               bit numbers in a hardware queue state word -- TODO confirm
 *               against the users.
 *   BLK_MQ_MAX_DEPTH, BLK_MQ_CPU_WORK_BATCH: plain numeric constants.
 */
enum {
        BLK_MQ_F_SHOULD_MERGE        = 1 << 0,
        BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
        /*
         * Set when this device requires underlying blk-mq device for
         * completing IO:
         */
        BLK_MQ_F_STACKING        = 1 << 2,
        BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
        BLK_MQ_F_BLOCKING        = 1 << 5,
        /* Do not allow an I/O scheduler to be configured. */
        BLK_MQ_F_NO_SCHED        = 1 << 6,
        /*
         * Select 'none' during queue registration in case of a single hwq
         * or shared hwqs instead of 'mq-deadline'.
         */
        BLK_MQ_F_NO_SCHED_BY_DEFAULT        = 1 << 7,
        /* First bit and width (in bits) of the allocation-policy field. */
        BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
        BLK_MQ_F_ALLOC_POLICY_BITS = 1,

        BLK_MQ_S_STOPPED        = 0,
        BLK_MQ_S_TAG_ACTIVE        = 1,
        BLK_MQ_S_SCHED_RESTART        = 2,

        /* hw queue is inactive after all its CPUs become offline */
        BLK_MQ_S_INACTIVE        = 3,

        BLK_MQ_MAX_DEPTH        = 10240,

        BLK_MQ_CPU_WORK_BATCH        = 8,
};
/*
 * Extract the allocation policy stored in @flags: shift the field down from
 * BLK_MQ_F_ALLOC_POLICY_START_BIT and mask it to BLK_MQ_F_ALLOC_POLICY_BITS
 * bits.  @flags is parenthesized so that a compound argument such as
 * "a | b" is shifted as a whole.
 */
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
        (((flags) >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
                ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
/*
 * Inverse of BLK_MQ_FLAG_TO_ALLOC_POLICY(): mask @policy to the field width
 * and shift it up to the field position.  @policy is parenthesized so that
 * a compound argument is masked as a whole.
 */
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
        (((policy) & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
                << BLK_MQ_F_ALLOC_POLICY_START_BIT)

/*
 * All-ones unsigned int.  Presumably a sentinel meaning "no hardware
 * context index" -- TODO confirm against the users.
 */
#define BLK_MQ_NO_HCTX_IDX        (-1U)

struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
                struct queue_limits *lim, void *queuedata,
                struct lock_class_key *lkclass);
/*
 * blk_mq_alloc_disk - wrapper around __blk_mq_alloc_disk()
 *
 * Declares a static struct lock_class_key inside the statement expression,
 * so every expansion site gets its own key, and passes its address as the
 * lkclass argument.  The value of the expression is whatever
 * __blk_mq_alloc_disk() returns.
 */
#define blk_mq_alloc_disk(set, lim, queuedata)                                \
({                                                                        \
        static struct lock_class_key __key;                                \
                                                                        \
        __blk_mq_alloc_disk(set, lim, queuedata, &__key);                \
})
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
                struct lock_class_key *lkclass);
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
                struct queue_limits *lim, void *queuedata);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                struct request_queue *q);
void blk_mq_destroy_queue(struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
                const struct blk_mq_ops *ops, unsigned int queue_depth,
                unsigned int set_flags);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

void blk_mq_free_request(struct request *rq);
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
                unsigned int poll_flags);

bool blk_mq_queue_inflight(struct request_queue *q);

/*
 * Request allocation flags, each cast to blk_mq_req_flags_t.  Presumably
 * these are the values accepted by the flags argument of
 * blk_mq_alloc_request() and blk_mq_alloc_request_hctx() -- confirm against
 * the implementations.
 */
enum {
        /* return when out of requests */
        BLK_MQ_REQ_NOWAIT        = (__force blk_mq_req_flags_t)(1 << 0),
        /* allocate from reserved pool */
        BLK_MQ_REQ_RESERVED        = (__force blk_mq_req_flags_t)(1 << 1),
        /* set RQF_PM */
        BLK_MQ_REQ_PM                = (__force blk_mq_req_flags_t)(1 << 2),
};

struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
                blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
                blk_opf_t opf, blk_mq_req_flags_t flags,
                unsigned int hctx_idx);

/*
 * Tag address space map.
 *
 * Only the use of nr_tags and rqs is visible in this header (see
 * blk_mq_tag_to_rq()); the other field notes are inferred from the names
 * and should be confirmed against the tag allocator.
 */
struct blk_mq_tags {
        /* exclusive upper bound checked before indexing rqs[] by tag */
        unsigned int nr_tags;
        /* presumably the number of tags set aside as reserved -- TODO confirm */
        unsigned int nr_reserved_tags;
        unsigned int active_queues;

        /* presumably the allocators for normal and reserved tags -- TODO confirm */
        struct sbitmap_queue bitmap_tags;
        struct sbitmap_queue breserved_tags;

        /* request pointers, indexed by tag */
        struct request **rqs;
        struct request **static_rqs;
        struct list_head page_list;

        /*
         * used to clear request reference in rqs[] before freeing one
         * request pool
         */
        spinlock_t lock;
};

/*
 * Look up the request stored for @tag in @tags->rqs[].
 *
 * Returns NULL when @tag is not below @tags->nr_tags; otherwise issues a
 * prefetch for the request and returns it.
 */
static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
                                               unsigned int tag)
{
        /* Out-of-range tags have no slot in rqs[]. */
        if (tag >= tags->nr_tags)
                return NULL;

        prefetch(tags->rqs[tag]);
        return tags->rqs[tag];
}

/*
 * Layout of the 32-bit value handled by blk_mq_unique_tag_to_hwq() and
 * blk_mq_unique_tag_to_tag(): the low BLK_MQ_UNIQUE_TAG_BITS bits are
 * selected by BLK_MQ_UNIQUE_TAG_MASK, the bits above them hold the hwq part.
 */
enum {
        BLK_MQ_UNIQUE_TAG_BITS = 16,
        BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

/* Return the hwq part of @unique_tag: the bits above BLK_MQ_UNIQUE_TAG_BITS. */
static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
        u32 hwq = unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;

        return hwq;
}

/* Return the tag part of @unique_tag: the bits under BLK_MQ_UNIQUE_TAG_MASK. */
static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
        u32 tag = unique_tag & BLK_MQ_UNIQUE_TAG_MASK;

        return tag;
}

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
        return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
        return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
        return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

/*
 * Set the state to complete when completing a request from inside ->queue_rq.
 * This is used by drivers that want to ensure special complete actions that
 * need access to the request are called on failure, e.g. by nvme for
 * multipathing.
 *
 * The state is stored with WRITE_ONCE(); nothing else is done to @rq here.
 */
static inline void blk_mq_set_request_complete(struct request *rq)
{
        WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
}

/*
 * Complete the request directly instead of deferring it to softirq or
 * completing it on another CPU.  Useful in preemptible context instead of
 * an interrupt.
 *
 * Marks @rq as MQ_RQ_COMPLETE and then calls @complete on it in the
 * caller's context.
 */
static inline void blk_mq_complete_request_direct(struct request *rq,
                   void (*complete)(struct request *rq))
{
        blk_mq_set_request_complete(rq);
        complete(rq);
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
void blk_mq_end_request_batch(struct io_comp_batch *ib);

/*
 * Only need start/end time stamping if we have iostat or
 * blk stats enabled, or using an IO scheduler.
 *
 * Passthrough requests never need it: passthrough io doesn't use iostat
 * accounting, cgroup stats and io scheduler functionalities.  For every
 * other request the answer is whether any of RQF_IO_STAT, RQF_STATS or
 * RQF_USE_SCHED is set in rq->rq_flags.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
        return !blk_rq_is_passthrough(rq) &&
               (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
}

/* Return true when RQF_RESV is set in @rq->rq_flags. */
static inline bool blk_mq_is_reserved_rq(struct request *rq)
{
        return (rq->rq_flags & RQF_RESV) != 0;
}

/*
 * Try to queue @req on the completion batch @iob.
 *
 * Returns false, leaving @iob untouched, when
 *   - @iob is NULL,
 *   - @req has RQF_SCHED_TAGS set (blk_mq_end_request_batch() can't end
 *     requests allocated from sched tags),
 *   - @ioerror is non-zero,
 *   - @req has an ->end_io handler and is not a passthrough request, or
 *   - @iob already carries a completion callback other than @complete.
 *
 * Otherwise @complete is recorded in @iob if none is set yet, the request's
 * time-stamp requirement is OR-ed into iob->need_ts, @req is added to
 * iob->req_list and true is returned.
 */
static inline bool blk_mq_add_to_batch(struct request *req,
                                       struct io_comp_batch *iob, int ioerror,
                                       void (*complete)(struct io_comp_batch *))
{
        if (!iob)
                return false;
        if (req->rq_flags & RQF_SCHED_TAGS)
                return false;
        if (ioerror)
                return false;
        if (req->end_io && !blk_rq_is_passthrough(req))
                return false;

        if (iob->complete) {
                /* A batch can only be completed by a single callback. */
                if (iob->complete != complete)
                        return false;
        } else {
                iob->complete = complete;
        }

        iob->need_ts |= blk_mq_need_time_stamp(req);
        rq_list_add(&iob->req_list, req);
        return true;
}

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set);
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
                                     unsigned long timeout);

void blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);
/*
 * Decide whether a timeout should be faked for @q.
 *
 * Always false unless CONFIG_FAIL_IO_TIMEOUT is enabled and
 * QUEUE_FLAG_FAIL_IO is set in q->queue_flags; in that case the decision is
 * left to __blk_should_fake_timeout().
 */
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
        if (!IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) ||
            !test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
                return false;

        return __blk_should_fake_timeout(q);
}

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be casted
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract request
 * size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
        return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - convert a request to its PDU pointer
 * @rq: the request to convert
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request, so the PDU starts
 * at the address one struct request past @rq.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
        return &rq[1];
}

/*
 * Iterate over (q)->hctx_table with xa_for_each(), passing @i as the index
 * variable and @hctx as the entry variable.
 */
#define queue_for_each_hw_ctx(q, hctx, i)                                \
        xa_for_each(&(q)->hctx_table, (i), (hctx))

/*
 * Iterate over the entries of (hctx)->ctxs[]: @i runs from 0 up to
 * (hctx)->nr_ctx - 1 and @ctx is assigned (hctx)->ctxs[i] before each pass
 * of the loop body.  @ctx is left unassigned when nr_ctx is 0.  Every macro
 * argument, including the assignment target, is parenthesized.
 */
#define hctx_for_each_ctx(hctx, ctx, i)                                        \
        for ((i) = 0; (i) < (hctx)->nr_ctx &&                                \
             ({ (ctx) = (hctx)->ctxs[(i)]; 1; }); (i)++)

/* Call the queue's optional ->cleanup_rq hook on @rq, if the driver set one. */
static inline void blk_mq_cleanup_rq(struct request *rq)
{
        void (*cleanup)(struct request *) = rq->q->mq_ops->cleanup_rq;

        if (cleanup)
                cleanup(rq);
}

/*
 * Attach @bio to @rq as its only bio: both rq->bio and rq->biotail point at
 * it, the data length is taken from bio->bi_iter.bi_size, the segment count
 * from @nr_segs and the I/O priority from bio_prio(@bio).
 */
static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
                unsigned int nr_segs)
{
        rq->biotail = bio;
        rq->bio = bio;
        rq->__data_len = bio->bi_iter.bi_size;
        rq->nr_phys_segments = nr_segs;
        rq->ioprio = bio_prio(bio);
}

void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
                struct lock_class_key *key);

/* Return what op_is_sync() reports for the command flags of @rq. */
static inline bool rq_is_sync(struct request *rq)
{
        bool sync = op_is_sync(rq->cmd_flags);

        return sync;
}

void blk_rq_init(struct request_queue *q, struct request *rq);
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
                struct bio_set *bs, gfp_t gfp_mask,
                int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
void blk_rq_unprep_clone(struct request *rq);
blk_status_t blk_insert_cloned_request(struct request *rq);

/*
 * Optional mapping description accepted by the blk_rq_map_user*()
 * prototypes declared below.  The meaning of the individual fields is not
 * visible in this header; presumably they describe a caller-supplied page
 * array -- TODO confirm against the mapping code.
 */
struct rq_map_data {
        struct page **pages;
        unsigned long offset;
        unsigned short page_order;
        unsigned short nr_entries;
        bool null_mapped;
        bool from_user;
};

int blk_rq_map_user(struct request_queue *, struct request *,
                struct rq_map_data *, void __user *, unsigned long, gfp_t);
int blk_rq_map_user_io(struct request *, struct rq_map_data *,
                void __user *, unsigned long, gfp_t, bool, int, bool, int);
int blk_rq_map_user_iov(struct request_queue *, struct request *,
                struct rq_map_data *, const struct iov_iter *, gfp_t);
int blk_rq_unmap_user(struct bio *);
int blk_rq_map_kern(struct request_queue *, struct request *, void *,
                unsigned int, gfp_t);
int blk_rq_append_bio(struct request *rq, struct bio *bio);
void blk_execute_rq_nowait(struct request *rq, bool at_head);
blk_status_t blk_execute_rq(struct request *rq, bool at_head);
bool blk_rq_is_poll(struct request *rq);

/*
 * Cursor used by rq_for_each_segment() and rq_for_each_bvec(): @bio is the
 * bio currently being walked and @iter the position inside that bio.
 */
struct req_iterator {
        struct bvec_iter iter;
        struct bio *bio;
};

/*
 * Walk the bio chain of @rq: @_bio starts at (rq)->bio and follows
 * ->bi_next until it is NULL.  Nothing is executed when (rq)->bio is NULL.
 *
 * Both arguments are parenthesized so that expressions such as "&req" can be
 * passed.  Note that the expansion is an "if" followed by a "for", so an
 * "else" placed right after the loop body would bind to this "if".
 */
#define __rq_for_each_bio(_bio, rq)        \
        if (((rq)->bio))                        \
                for ((_bio) = (rq)->bio; (_bio); (_bio) = (_bio)->bi_next)

/*
 * For every bio of @_rq (via __rq_for_each_bio()), run
 * bio_for_each_segment() with @bvl, using (_iter).bio as the bio cursor and
 * (_iter).iter as the position inside it.  @_iter is parenthesized before
 * member access so any lvalue expression can be passed.
 */
#define rq_for_each_segment(bvl, _rq, _iter)                        \
        __rq_for_each_bio((_iter).bio, _rq)                        \
                bio_for_each_segment(bvl, (_iter).bio, (_iter).iter)

/* Same walk as rq_for_each_segment(), but using bio_for_each_bvec(). */
#define rq_for_each_bvec(bvl, _rq, _iter)                        \
        __rq_for_each_bio((_iter).bio, _rq)                        \
                bio_for_each_bvec(bvl, (_iter).bio, (_iter).iter)

/*
 * True when @_iter is on the last bio of the chain (no ->bi_next) and
 * bio_iter_last() reports @bvec as the last one of that bio.
 */
#define rq_iter_last(bvec, _iter)                                \
                ((_iter).bio->bi_next == NULL &&                        \
                 bio_iter_last(bvec, (_iter).iter))

/*
 * blk_rq_pos()                        : the current sector
 * blk_rq_bytes()                : bytes left in the entire request
 * blk_rq_cur_bytes()                : bytes left in the current segment
 * blk_rq_sectors()                : sectors left in the entire request
 * blk_rq_cur_sectors()                : sectors left in the current segment
 * blk_rq_stats_sectors()        : sectors of the entire request used for stats
 */
/* Return the current sector of @rq (rq->__sector). */
static inline sector_t blk_rq_pos(const struct request *rq)
{
        sector_t pos = rq->__sector;

        return pos;
}

/* Return the bytes left in the entire request (rq->__data_len). */
static inline unsigned int blk_rq_bytes(const struct request *rq)
{
        unsigned int bytes = rq->__data_len;

        return bytes;
}

static inline int blk_rq_cur_bytes(const struct request *rq)
{
        if (!rq->bio)
                return 0;
        if (!bio_has_data(rq->bio))        /* dataless requests such as discard */
                return rq->bio->bi_iter.bi_size;
        return bio_iovec(rq->bio).bv_len;
}

/* Return blk_rq_bytes() of @rq converted to sectors (>> SECTOR_SHIFT). */
static inline unsigned int blk_rq_sectors(const struct request *rq)
{
        unsigned int bytes = blk_rq_bytes(rq);

        return bytes >> SECTOR_SHIFT;
}

/* Return blk_rq_cur_bytes() of @rq converted to sectors (>> SECTOR_SHIFT). */
static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
{
        int bytes = blk_rq_cur_bytes(rq);

        return bytes >> SECTOR_SHIFT;
}

/* Return the sector count of @rq used for stats (rq->stats_sectors). */
static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
{
        unsigned int sectors = rq->stats_sectors;

        return sectors;
}

/*
 * Some commands like WRITE SAME have a payload or data transfer size which
 * is different from the size of the request.  Any driver that supports such
 * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
 * calculate the data transfer size.
 *
 * Returns the length of rq->special_vec when RQF_SPECIAL_PAYLOAD is set and
 * blk_rq_bytes() otherwise.
 */
static inline unsigned int blk_rq_payload_bytes(struct request *rq)
{
        if (!(rq->rq_flags & RQF_SPECIAL_PAYLOAD))
                return blk_rq_bytes(rq);

        return rq->special_vec.bv_len;
}

/*
 * Return the first full biovec in the request.  The caller needs to check
 * that there are any bvecs before calling this helper.
 *
 * With RQF_SPECIAL_PAYLOAD set this is rq->special_vec; otherwise it is the
 * bvec of the first bio at that bio's current iterator position.
 */
static inline struct bio_vec req_bvec(struct request *rq)
{
        if (!(rq->rq_flags & RQF_SPECIAL_PAYLOAD))
                return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);

        return rq->special_vec;
}

static inline unsigned int blk_rq_count_bios(struct request *rq)
{
        unsigned int nr_bios = 0;
        struct bio *bio;

        __rq_for_each_bio(bio, rq)
                nr_bios++;

        return nr_bios;
}

void blk_steal_bios(struct bio_list *list, struct request *rq);

/*
 * Request completion related functions.
 *
 * blk_update_request() completes given number of bytes and updates
 * the request without completing it.
 */
bool blk_update_request(struct request *rq, blk_status_t error,
                               unsigned int nr_bytes);
void blk_abort_request(struct request *);

/*
 * Number of physical segments as sent to the device.
 *
 * Normally this is the number of discontiguous data segments sent by the
 * submitter.  But for data-less commands like discard we might have no
 * actual data segments submitted, while the driver might have to add its
 * own special payload.  In that case we still return 1 here so that this
 * special payload will be mapped.
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
        return (rq->rq_flags & RQF_SPECIAL_PAYLOAD) ? 1 : rq->nr_phys_segments;
}

/*
 * Number of discard segments (or ranges) the driver needs to fill in.
 * Each discard bio merged into a request is counted as one segment.
 */
static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
{
        return max_t(unsigned short, rq->nr_phys_segments, 1);
}

int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
                struct scatterlist *sglist, struct scatterlist **last_sg);
/*
 * Convenience wrapper around __blk_rq_map_sg() for callers that do not need
 * the last-entry pointer: a local NULL-initialised pointer is passed in its
 * place and discarded.  Returns whatever __blk_rq_map_sg() returns.
 */
static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
                struct scatterlist *sglist)
{
        struct scatterlist *tail = NULL;

        return __blk_rq_map_sg(q, rq, sglist, &tail);
}
void blk_dump_rq_flags(struct request *, char *);

#endif /* BLK_MQ_H */




















    1 








    1 




























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *   Copyright (C) International Business Machines Corp., 2000-2002
 *   Portions Copyright (C) Christoph Hellwig, 2001-2002
 */

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/posix_acl.h>
#include <linux/quotaops.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_dmap.h"
#include "jfs_txnmgr.h"
#include "jfs_xattr.h"
#include "jfs_acl.h"
#include "jfs_debug.h"

/*
 * jfs_fsync - fsync handler for JFS files
 * @file:     file being synced
 * @start:    first byte of the range to write back
 * @end:      last byte of the range to write back
 * @datasync: when non-zero, the inode is only committed if it has
 *            I_DIRTY_DATASYNC set
 *
 * Writes back and waits on the page range first; an error from that step is
 * returned unchanged.  Then, under the inode lock, either commits the inode
 * or, when nothing relevant is dirty, just flushes the journal.
 *
 * Returns 0 on success, or -EIO when jfs_commit_inode() reports a failure.
 */
int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
        struct inode *inode = file->f_mapping->host;
        bool need_commit;
        int rc;

        rc = file_write_and_wait_range(file, start, end);
        if (rc)
                return rc;

        inode_lock(inode);
        need_commit = (inode->i_state & I_DIRTY_ALL) &&
                      (!datasync || (inode->i_state & I_DIRTY_DATASYNC));
        if (need_commit) {
                rc = jfs_commit_inode(inode, 1);
        } else {
                /* Make sure committed changes hit the disk */
                jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
        }
        inode_unlock(inode);

        return rc ? -EIO : 0;
}

/*
 * jfs_open - open handler for JFS files
 *
 * Runs dquot_file_open() and returns its error, if any.  For an empty
 * regular file opened for writing, the inode is additionally registered as
 * the "active" file of its aggregate group, unless it already is active.
 * Always returns 0 once dquot_file_open() succeeded.
 */
static int jfs_open(struct inode *inode, struct file *file)
{
        struct jfs_inode_info *ji;
        int rc = dquot_file_open(inode, file);

        if (rc)
                return rc;

        /*
         * We attempt to allow only one "active" file open per aggregate
         * group.  Otherwise, appending to files in parallel can cause
         * fragmentation within the files.
         *
         * If the file is empty, it was probably just created and going
         * to be written to.  If it has a size, we'll hold off until the
         * file is actually grown.
         */
        if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE) ||
            inode->i_size != 0)
                return 0;

        ji = JFS_IP(inode);
        spin_lock_irq(&ji->ag_lock);
        if (ji->active_ag == -1) {
                struct jfs_sb_info *sbi = JFS_SBI(inode->i_sb);

                /* Bind to the group holding the inode extent and count it. */
                ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), sbi);
                atomic_inc(&sbi->bmap->db_active[ji->active_ag]);
        }
        spin_unlock_irq(&ji->ag_lock);

        return 0;
}
/*
 * jfs_release - release handler for JFS files
 *
 * If the inode is currently counted as active in an aggregate group
 * (active_ag != -1), drop that group's db_active count and reset active_ag
 * to -1, all under ag_lock.  Always returns 0.
 */
static int jfs_release(struct inode *inode, struct file *file)
{
        struct jfs_inode_info *jfs_ip = JFS_IP(inode);

        spin_lock_irq(&jfs_ip->ag_lock);
        if (jfs_ip->active_ag != -1) {
                struct bmap *map = JFS_SBI(inode->i_sb)->bmap;

                atomic_dec(&map->db_active[jfs_ip->active_ag]);
                jfs_ip->active_ag = -1;
        }
        spin_unlock_irq(&jfs_ip->ag_lock);

        return 0;
}

int jfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                struct iattr *iattr)
{
        struct inode *inode = d_inode(dentry);
        int rc;

        rc = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
        if (rc)
                return rc;

        if (is_quota_modification(&nop_mnt_idmap, inode, iattr)) {
                rc = dquot_initialize(inode);
                if (rc)
                        return rc;
        }
        if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
            (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
                rc = dquot_transfer(&nop_mnt_idmap, inode, iattr);
                if (rc)
                        return rc;
        }

        if ((iattr->ia_valid & ATTR_SIZE) &&
            iattr->ia_size != i_size_read(inode)) {
                inode_dio_wait(inode);

                rc = inode_newsize_ok(inode, iattr->ia_size);
                if (rc)
                        return rc;

                truncate_setsize(inode, iattr->ia_size);
                jfs_truncate(inode);
        }

        setattr_copy(&nop_mnt_idmap, inode, iattr);
        mark_inode_dirty(inode);

        if (iattr->ia_valid & ATTR_MODE)
                rc = posix_acl_chmod(&nop_mnt_idmap, dentry, inode->i_mode);
        return rc;
}

/*
 * Inode operations table that uses jfs_setattr() from this file.  The ACL
 * callbacks are only wired up when CONFIG_JFS_POSIX_ACL is enabled.
 */
const struct inode_operations jfs_file_inode_operations = {
        .listxattr        = jfs_listxattr,
        .setattr        = jfs_setattr,
        .fileattr_get        = jfs_fileattr_get,
        .fileattr_set        = jfs_fileattr_set,
#ifdef CONFIG_JFS_POSIX_ACL
        .get_inode_acl        = jfs_get_acl,
        .set_acl        = jfs_set_acl,
#endif
};

/*
 * File operations table.  open, release and fsync use the JFS handlers
 * defined in this file and ioctl goes to jfs_ioctl(); the read, write,
 * seek, mmap and splice entries point at generic helpers.
 */
const struct file_operations jfs_file_operations = {
        .open                = jfs_open,
        .llseek                = generic_file_llseek,
        .read_iter        = generic_file_read_iter,
        .write_iter        = generic_file_write_iter,
        .mmap                = generic_file_mmap,
        .splice_read        = filemap_splice_read,
        .splice_write        = iter_file_splice_write,
        .fsync                = jfs_fsync,
        .release        = jfs_release,
        .unlocked_ioctl = jfs_ioctl,
        .compat_ioctl        = compat_ptr_ioctl,
};














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 


    1 






    1 








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/parser.h>
#include <linux/ctype.h>
#include <linux/namei.h>
#include <linux/miscdevice.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/btrfs.h>
#include <linux/security.h>
#include <linux/fs_parser.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "props.h"
#include "xattr.h"
#include "bio.h"
#include "export.h"
#include "compression.h"
#include "dev-replace.h"
#include "free-space-cache.h"
#include "backref.h"
#include "space-info.h"
#include "sysfs.h"
#include "zoned.h"
#include "tests/btrfs-tests.h"
#include "block-group.h"
#include "discard.h"
#include "qgroup.h"
#include "raid56.h"
#include "fs.h"
#include "accessors.h"
#include "defrag.h"
#include "dir-item.h"
#include "ioctl.h"
#include "scrub.h"
#include "verity.h"
#include "super.h"
#include "extent-tree.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>

static const struct super_operations btrfs_super_ops;
static struct file_system_type btrfs_fs_type;

/*
 * Superblock teardown hook: log the final unmount together with the
 * filesystem's fsid, then hand the fs_info over to close_ctree().
 */
static void btrfs_put_super(struct super_block *sb)
{
        struct btrfs_fs_info *info;

        info = btrfs_sb(sb);
        btrfs_info(info, "last unmount of filesystem %pU",
                   info->fs_devices->fsid);
        close_ctree(info);
}

/*
 * Store the mount options related information.
 *
 * Filled in one parameter at a time by btrfs_parse_param().
 */
struct btrfs_fs_context {
        /* Copy of the "subvol=" value, allocated with kstrdup(). */
        char *subvol_name;
        /* Value of "subvolid="; 0 is mapped to BTRFS_FS_TREE_OBJECTID. */
        u64 subvol_objectid;
        /* Value of "max_inline=", as returned by memparse(). */
        u64 max_inline;
        /* Value of "commit="; 0 is replaced by BTRFS_DEFAULT_COMMIT_INTERVAL. */
        u32 commit_interval;
        /* Value of "metadata_ratio=". */
        u32 metadata_ratio;
        /* Value of "thread_pool="; the parser rejects 0. */
        u32 thread_pool_size;
        /* Option bits, manipulated with btrfs_set_opt()/btrfs_clear_opt(). */
        unsigned long mount_opt;
        /* A BTRFS_COMPRESS_* value, or 0 when compression is turned off. */
        unsigned long compress_type:4;
        /* Compression level selected together with compress_type. */
        unsigned int compress_level;
        /* Reference count. NOTE(review): its users are outside this excerpt. */
        refcount_t refs;
};

/*
 * Identifiers for the mount options known to btrfs.  They are used as the
 * option ids in btrfs_fs_parameters[] and as the case labels of the switch in
 * btrfs_parse_param().  Opt_err is kept as the last enumerator.
 */
enum {
        Opt_acl,
        Opt_clear_cache,
        Opt_commit_interval,
        Opt_compress,
        Opt_compress_force,
        Opt_compress_force_type,
        Opt_compress_type,
        Opt_degraded,
        Opt_device,
        Opt_fatal_errors,
        Opt_flushoncommit,
        Opt_max_inline,
        Opt_barrier,
        Opt_datacow,
        Opt_datasum,
        Opt_defrag,
        Opt_discard,
        Opt_discard_mode,
        Opt_ratio,
        Opt_rescan_uuid_tree,
        Opt_skip_balance,
        Opt_space_cache,
        Opt_space_cache_version,
        Opt_ssd,
        Opt_ssd_spread,
        Opt_subvol,
        Opt_subvol_empty,
        Opt_subvolid,
        Opt_thread_pool,
        Opt_treelog,
        Opt_user_subvol_rm_allowed,
        Opt_norecovery,

        /* Rescue options */
        Opt_rescue,
        Opt_usebackuproot,
        Opt_nologreplay,
        Opt_ignorebadroots,
        Opt_ignoredatacsums,
        Opt_rescue_all,

        /* Debugging options */
        Opt_enospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
        Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
        Opt_ref_verify,
#endif
        Opt_err,
};

/* Value ids for the "fatal_errors=" option. */
enum {
        Opt_fatal_errors_panic,
        Opt_fatal_errors_bug,
};

/* Maps the strings accepted by "fatal_errors=" to the ids above. */
static const struct constant_table btrfs_parameter_fatal_errors[] = {
        { "panic", Opt_fatal_errors_panic },
        { "bug", Opt_fatal_errors_bug },
        {}
};

/* Value ids for the "discard=" option. */
enum {
        Opt_discard_sync,
        Opt_discard_async,
};

/* Maps the strings accepted by "discard=" to the ids above. */
static const struct constant_table btrfs_parameter_discard[] = {
        { "sync", Opt_discard_sync },
        { "async", Opt_discard_async },
        {}
};

/* Value ids for the "space_cache=" option. */
enum {
        Opt_space_cache_v1,
        Opt_space_cache_v2,
};

/* Maps the strings accepted by "space_cache=" to the ids above. */
static const struct constant_table btrfs_parameter_space_cache[] = {
        { "v1", Opt_space_cache_v1 },
        { "v2", Opt_space_cache_v2 },
        {}
};

/* Value ids for the "rescue=" option. */
enum {
        Opt_rescue_usebackuproot,
        Opt_rescue_nologreplay,
        Opt_rescue_ignorebadroots,
        Opt_rescue_ignoredatacsums,
        Opt_rescue_parameter_all,
};

/*
 * Maps the strings accepted by "rescue=" to the ids above.  The short forms
 * "ibadroots" and "idatacsums" map to the same ids as their long spellings.
 */
static const struct constant_table btrfs_parameter_rescue[] = {
        { "usebackuproot", Opt_rescue_usebackuproot },
        { "nologreplay", Opt_rescue_nologreplay },
        { "ignorebadroots", Opt_rescue_ignorebadroots },
        { "ibadroots", Opt_rescue_ignorebadroots },
        { "ignoredatacsums", Opt_rescue_ignoredatacsums },
        { "idatacsums", Opt_rescue_ignoredatacsums },
        { "all", Opt_rescue_parameter_all },
        {}
};

#ifdef CONFIG_BTRFS_DEBUG
enum {
        Opt_fragment_parameter_data,
        Opt_fragment_parameter_metadata,
        Opt_fragment_parameter_all,
};

static const struct constant_table btrfs_parameter_fragment[] = {
        { "data", Opt_fragment_parameter_data },
        { "metadata", Opt_fragment_parameter_metadata },
        { "all", Opt_fragment_parameter_all },
        {}
};
#endif

/*
 * Parameter specification table passed to fs_parse() by btrfs_parse_param().
 *
 * Some keys are listed twice, once as a flag and once with a value (for
 * example "compress" and "discard"), each form with its own option id so the
 * parser can tell the bare form from the valued one.
 */
static const struct fs_parameter_spec btrfs_fs_parameters[] = {
        fsparam_flag_no("acl", Opt_acl),
        fsparam_flag_no("autodefrag", Opt_defrag),
        fsparam_flag_no("barrier", Opt_barrier),
        fsparam_flag("clear_cache", Opt_clear_cache),
        fsparam_u32("commit", Opt_commit_interval),
        fsparam_flag("compress", Opt_compress),
        fsparam_string("compress", Opt_compress_type),
        fsparam_flag("compress-force", Opt_compress_force),
        fsparam_string("compress-force", Opt_compress_force_type),
        fsparam_flag_no("datacow", Opt_datacow),
        fsparam_flag_no("datasum", Opt_datasum),
        fsparam_flag("degraded", Opt_degraded),
        fsparam_string("device", Opt_device),
        fsparam_flag_no("discard", Opt_discard),
        fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard),
        fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors),
        fsparam_flag_no("flushoncommit", Opt_flushoncommit),
        fsparam_string("max_inline", Opt_max_inline),
        fsparam_u32("metadata_ratio", Opt_ratio),
        fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree),
        fsparam_flag("skip_balance", Opt_skip_balance),
        fsparam_flag_no("space_cache", Opt_space_cache),
        fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache),
        fsparam_flag_no("ssd", Opt_ssd),
        fsparam_flag_no("ssd_spread", Opt_ssd_spread),
        fsparam_string("subvol", Opt_subvol),
        /* Bare "subvol=" with no value; accepted and ignored by the parser. */
        fsparam_flag("subvol=", Opt_subvol_empty),
        fsparam_u64("subvolid", Opt_subvolid),
        fsparam_u32("thread_pool", Opt_thread_pool),
        fsparam_flag_no("treelog", Opt_treelog),
        fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed),

        /* Rescue options. */
        fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue),
        /* Deprecated, with alias rescue=nologreplay */
        __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL),
        /* Deprecated, with alias rescue=usebackuproot */
        __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL),
        /* For compatibility only, alias for "rescue=nologreplay". */
        fsparam_flag("norecovery", Opt_norecovery),

        /* Debugging options. */
        fsparam_flag_no("enospc_debug", Opt_enospc_debug),
#ifdef CONFIG_BTRFS_DEBUG
        fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment),
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
        fsparam_flag("ref_verify", Opt_ref_verify),
#endif
        {}
};

/*
 * Derive the block device open mode from the superblock flags of @fc.
 * Restricting writes to btrfs devices is not supported yet, so that bit is
 * always stripped from the result.
 */
static inline blk_mode_t btrfs_open_mode(struct fs_context *fc)
{
        blk_mode_t mode = sb_open_mode(fc->sb_flags);

        mode &= ~BLK_OPEN_RESTRICT_WRITES;
        return mode;
}

/*
 * Parse one mount parameter and record its effect in the btrfs mount context.
 *
 * @fc:    filesystem context.  fc->fs_private is the struct btrfs_fs_context
 *         being filled in; fc->sb_flags and fc->sb_flags_mask are updated for
 *         the "acl"/"noacl" option.
 * @param: the parameter (key plus optional value) to parse.
 *
 * Return: 0 on success; the negative value returned by fs_parse() if it does
 * not accept the parameter; -ENOMEM if the subvolume name cannot be
 * duplicated; the error from btrfs_scan_one_device() for "device="; -EINVAL
 * for a value that is rejected here.
 */
static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct btrfs_fs_context *ctx = fc->fs_private;
        struct fs_parse_result result;
        int opt;

        opt = fs_parse(fc, btrfs_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        switch (opt) {
        case Opt_degraded:
                btrfs_set_opt(ctx->mount_opt, DEGRADED);
                break;
        case Opt_subvol_empty:
                /*
                 * This exists because we used to allow it on accident, so we're
                 * keeping it to maintain ABI.  See 37becec95ac3 ("Btrfs: allow
                 * empty subvol= again").
                 */
                break;
        case Opt_subvol:
                /* A later subvol= replaces an earlier one. */
                kfree(ctx->subvol_name);
                ctx->subvol_name = kstrdup(param->string, GFP_KERNEL);
                if (!ctx->subvol_name)
                        return -ENOMEM;
                break;
        case Opt_subvolid:
                ctx->subvol_objectid = result.uint_64;

                /* subvolid=0 means give me the original fs_tree. */
                if (!ctx->subvol_objectid)
                        ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID;
                break;
        case Opt_device: {
                struct btrfs_device *device;
                blk_mode_t mode = btrfs_open_mode(fc);

                /* Only the success or failure of the scan is used here. */
                mutex_lock(&uuid_mutex);
                device = btrfs_scan_one_device(param->string, mode, false);
                mutex_unlock(&uuid_mutex);
                if (IS_ERR(device))
                        return PTR_ERR(device);
                break;
        }
        case Opt_datasum:
                if (result.negated) {
                        btrfs_set_opt(ctx->mount_opt, NODATASUM);
                } else {
                        /* Enabling datasum also re-enables datacow. */
                        btrfs_clear_opt(ctx->mount_opt, NODATACOW);
                        btrfs_clear_opt(ctx->mount_opt, NODATASUM);
                }
                break;
        case Opt_datacow:
                if (result.negated) {
                        /* nodatacow turns off compression and data checksums. */
                        btrfs_clear_opt(ctx->mount_opt, COMPRESS);
                        btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
                        btrfs_set_opt(ctx->mount_opt, NODATACOW);
                        btrfs_set_opt(ctx->mount_opt, NODATASUM);
                } else {
                        btrfs_clear_opt(ctx->mount_opt, NODATACOW);
                }
                break;
        case Opt_compress_force:
        case Opt_compress_force_type:
                btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS);
                fallthrough;
        case Opt_compress:
        case Opt_compress_type:
                /*
                 * The bare flag forms select zlib at its default level.  The
                 * valued forms are matched by prefix only, so trailing
                 * characters after the algorithm name are not rejected here;
                 * for zlib and zstd they are handed to
                 * btrfs_compress_str2level().
                 */
                if (opt == Opt_compress || opt == Opt_compress_force) {
                        ctx->compress_type = BTRFS_COMPRESS_ZLIB;
                        ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
                        btrfs_set_opt(ctx->mount_opt, COMPRESS);
                        btrfs_clear_opt(ctx->mount_opt, NODATACOW);
                        btrfs_clear_opt(ctx->mount_opt, NODATASUM);
                } else if (strncmp(param->string, "zlib", 4) == 0) {
                        ctx->compress_type = BTRFS_COMPRESS_ZLIB;
                        ctx->compress_level =
                                btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
                                                         param->string + 4);
                        btrfs_set_opt(ctx->mount_opt, COMPRESS);
                        btrfs_clear_opt(ctx->mount_opt, NODATACOW);
                        btrfs_clear_opt(ctx->mount_opt, NODATASUM);
                } else if (strncmp(param->string, "lzo", 3) == 0) {
                        ctx->compress_type = BTRFS_COMPRESS_LZO;
                        ctx->compress_level = 0;
                        btrfs_set_opt(ctx->mount_opt, COMPRESS);
                        btrfs_clear_opt(ctx->mount_opt, NODATACOW);
                        btrfs_clear_opt(ctx->mount_opt, NODATASUM);
                } else if (strncmp(param->string, "zstd", 4) == 0) {
                        ctx->compress_type = BTRFS_COMPRESS_ZSTD;
                        ctx->compress_level =
                                btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
                                                         param->string + 4);
                        btrfs_set_opt(ctx->mount_opt, COMPRESS);
                        btrfs_clear_opt(ctx->mount_opt, NODATACOW);
                        btrfs_clear_opt(ctx->mount_opt, NODATASUM);
                } else if (strncmp(param->string, "no", 2) == 0) {
                        /* Also undoes the FORCE_COMPRESS set by the fallthrough. */
                        ctx->compress_level = 0;
                        ctx->compress_type = 0;
                        btrfs_clear_opt(ctx->mount_opt, COMPRESS);
                        btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
                } else {
                        btrfs_err(NULL, "unrecognized compression value %s",
                                  param->string);
                        return -EINVAL;
                }
                break;
        case Opt_ssd:
                if (result.negated) {
                        btrfs_set_opt(ctx->mount_opt, NOSSD);
                        btrfs_clear_opt(ctx->mount_opt, SSD);
                        btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
                } else {
                        btrfs_set_opt(ctx->mount_opt, SSD);
                        btrfs_clear_opt(ctx->mount_opt, NOSSD);
                }
                break;
        case Opt_ssd_spread:
                if (result.negated) {
                        btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
                } else {
                        /* ssd_spread implies ssd. */
                        btrfs_set_opt(ctx->mount_opt, SSD);
                        btrfs_set_opt(ctx->mount_opt, SSD_SPREAD);
                        btrfs_clear_opt(ctx->mount_opt, NOSSD);
                }
                break;
        case Opt_barrier:
                if (result.negated)
                        btrfs_set_opt(ctx->mount_opt, NOBARRIER);
                else
                        btrfs_clear_opt(ctx->mount_opt, NOBARRIER);
                break;
        case Opt_thread_pool:
                if (result.uint_32 == 0) {
                        btrfs_err(NULL, "invalid value 0 for thread_pool");
                        return -EINVAL;
                }
                ctx->thread_pool_size = result.uint_32;
                break;
        case Opt_max_inline:
                ctx->max_inline = memparse(param->string, NULL);
                break;
        case Opt_acl:
                if (result.negated) {
                        fc->sb_flags &= ~SB_POSIXACL;
                } else {
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
                        fc->sb_flags |= SB_POSIXACL;
#else
                        btrfs_err(NULL, "support for ACL not compiled in");
                        return -EINVAL;
#endif
                }
                /*
                 * VFS limits the ability to toggle ACL on and off via remount,
                 * despite every file system allowing this.  This seems to be
                 * an oversight since we all do, but it'll fail if we're
                 * remounting.  So don't set the mask here, we'll check it in
                 * btrfs_reconfigure and do the toggling ourselves.
                 */
                if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE)
                        fc->sb_flags_mask |= SB_POSIXACL;
                break;
        case Opt_treelog:
                if (result.negated)
                        btrfs_set_opt(ctx->mount_opt, NOTREELOG);
                else
                        btrfs_clear_opt(ctx->mount_opt, NOTREELOG);
                break;
        case Opt_nologreplay:
                btrfs_warn(NULL,
                           "'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
                btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
                break;
        case Opt_norecovery:
                btrfs_info(NULL,
"'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'");
                btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
                break;
        case Opt_flushoncommit:
                if (result.negated)
                        btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT);
                else
                        btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT);
                break;
        case Opt_ratio:
                ctx->metadata_ratio = result.uint_32;
                break;
        case Opt_discard:
                if (result.negated) {
                        btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
                        btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
                        btrfs_set_opt(ctx->mount_opt, NODISCARD);
                } else {
                        /* Plain "discard" selects the synchronous mode. */
                        btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
                        btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
                }
                break;
        case Opt_discard_mode:
                switch (result.uint_32) {
                case Opt_discard_sync:
                        btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
                        btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
                        break;
                case Opt_discard_async:
                        btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
                        btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC);
                        break;
                default:
                        /* Report the offending value, not the option name. */
                        btrfs_err(NULL, "unrecognized discard mode value %s",
                                  param->string);
                        return -EINVAL;
                }
                btrfs_clear_opt(ctx->mount_opt, NODISCARD);
                break;
        case Opt_space_cache:
                if (result.negated) {
                        btrfs_set_opt(ctx->mount_opt, NOSPACECACHE);
                        btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
                        btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
                } else {
                        /* Plain "space_cache" selects the v1 cache. */
                        btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
                        btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
                }
                break;
        case Opt_space_cache_version:
                switch (result.uint_32) {
                case Opt_space_cache_v1:
                        btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
                        btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
                        break;
                case Opt_space_cache_v2:
                        btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
                        btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE);
                        break;
                default:
                        btrfs_err(NULL, "unrecognized space_cache value %s",
                                  param->string);
                        return -EINVAL;
                }
                break;
        case Opt_rescan_uuid_tree:
                btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE);
                break;
        case Opt_clear_cache:
                btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
                break;
        case Opt_user_subvol_rm_allowed:
                btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED);
                break;
        case Opt_enospc_debug:
                if (result.negated)
                        btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG);
                else
                        btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG);
                break;
        case Opt_defrag:
                if (result.negated)
                        btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG);
                else
                        btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG);
                break;
        case Opt_usebackuproot:
                btrfs_warn(NULL,
                           "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead");
                btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);

                /* If we're loading the backup roots we can't trust the space cache. */
                btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
                break;
        case Opt_skip_balance:
                btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE);
                break;
        case Opt_fatal_errors:
                switch (result.uint_32) {
                case Opt_fatal_errors_panic:
                        btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
                        break;
                case Opt_fatal_errors_bug:
                        btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
                        break;
                default:
                        btrfs_err(NULL, "unrecognized fatal_errors value %s",
                                  param->string);
                        return -EINVAL;
                }
                break;
        case Opt_commit_interval:
                ctx->commit_interval = result.uint_32;
                /* commit=0 selects the default interval. */
                if (ctx->commit_interval == 0)
                        ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
                break;
        case Opt_rescue:
                switch (result.uint_32) {
                case Opt_rescue_usebackuproot:
                        /*
                         * NOTE(review): unlike the deprecated bare
                         * "usebackuproot" flag handled in this function, this
                         * path does not set CLEAR_CACHE - confirm whether
                         * that is intentional.
                         */
                        btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
                        break;
                case Opt_rescue_nologreplay:
                        btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
                        break;
                case Opt_rescue_ignorebadroots:
                        btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
                        break;
                case Opt_rescue_ignoredatacsums:
                        btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
                        break;
                case Opt_rescue_parameter_all:
                        /* "all" does not include usebackuproot. */
                        btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
                        btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
                        btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
                        break;
                default:
                        btrfs_info(NULL, "unrecognized rescue option '%s'",
                                   param->string);
                        return -EINVAL;
                }
                break;
#ifdef CONFIG_BTRFS_DEBUG
        case Opt_fragment:
                switch (result.uint_32) {
                case Opt_fragment_parameter_all:
                        btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
                        btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
                        break;
                case Opt_fragment_parameter_metadata:
                        btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
                        break;
                case Opt_fragment_parameter_data:
                        btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
                        break;
                default:
                        btrfs_info(NULL, "unrecognized fragment option '%s'",
                                   param->string);
                        return -EINVAL;
                }
                break;
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
        case Opt_ref_verify:
                btrfs_set_opt(ctx->mount_opt, REF_VERIFY);
                break;
#endif
        default:
                btrfs_err(NULL, "unrecognized mount option '%s'", param->key);
                return -EINVAL;
        }

        return 0;
}

/*
 * Drop the options that only matter while a mount is being set up.  They must
 * neither survive into a later remount nor be displayed, so the mount and
 * remount paths call this once they are done.
 */
static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
{
        btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE);
        btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
        btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
}

/*
 * Test whether @opt is set in @mount_opt.  If it is, log an error naming
 * @opt_name as an option that requires a read-only mount and return true;
 * otherwise return false without logging.
 */
static bool check_ro_option(struct btrfs_fs_info *fs_info,
                            unsigned long mount_opt, unsigned long opt,
                            const char *opt_name)
{
        if (!(mount_opt & opt))
                return false;

        btrfs_err(fs_info, "%s must be used with ro mount option", opt_name);
        return true;
}

/*
 * Validate the mount options in @mount_opt against the filesystem @info and
 * the superblock @flags.
 *
 * Problems are reported through btrfs_err(); among the options that require
 * a read-only mount only the first offending one is reported.  Unless a
 * remount is in progress, the active space cache flavour is logged as well.
 *
 * Return: true if all checks passed, false otherwise.
 */
bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
                         unsigned long flags)
{
        bool valid = true;

        /* These options are only accepted together with a read-only mount. */
        if (!(flags & SB_RDONLY)) {
                if (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
                    check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
                    check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
                        valid = false;
        }

        if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) {
                if (!btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) &&
                    !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) {
                        btrfs_err(info, "cannot disable free-space-tree");
                        valid = false;
                }
        }

        if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
                if (!btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) {
                        btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature");
                        valid = false;
                }
        }

        if (btrfs_check_mountopts_zoned(info, mount_opt))
                valid = false;

        /* The informational messages are skipped while remounting. */
        if (test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state))
                return valid;

        if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE))
                btrfs_info(info, "disk space caching is enabled");
        if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
                btrfs_info(info, "using free-space-tree");

        return valid;
}

/*
 * This is subtle, we only call this during open_ctree().  We need to pre-load
 * the mount options with the on-disk settings.  Before the new mount API took
 * effect we would do this on mount and remount.  With the new mount API we'll
 * only do this on the initial mount.
 *
 * This isn't a change in behavior, because we're using the current state of the
 * file system to set the current mount options.  If you mounted with special
 * options to disable these features and then remounted we wouldn't revert the
 * settings, because mounting without these features cleared the on-disk
 * settings, so this being called on re-mount is not needed.
 */
void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info)
{
        /*
         * With a sector size below the page size the v1 space cache option is
         * dropped and the free space tree is switched on if it was not
         * already requested.
         */
        if (fs_info->sectorsize < PAGE_SIZE) {
                btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
                if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
                        btrfs_info(fs_info,
                                   "forcing free space tree for sector size %u with page size %lu",
                                   fs_info->sectorsize, PAGE_SIZE);
                        btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
                }
        }

        /*
         * The mount options are already populated here, so only fill in a
         * default when nothing has been chosen yet.
         */
        if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
                return;

        /* Zoned mode: invalidate an active v1 cache via the super generation. */
        if (btrfs_is_zoned(fs_info) && btrfs_free_space_cache_v1_active(fs_info)) {
                btrfs_info(fs_info, "zoned: clearing existing space cache");
                btrfs_set_super_cache_generation(fs_info->super_copy, 0);
                return;
        }

        /* An explicit space_cache / nospace_cache choice is left untouched. */
        if (btrfs_test_opt(fs_info, SPACE_CACHE) ||
            btrfs_test_opt(fs_info, NOSPACECACHE))
                return;

        /*
         * No explicit option was given: derive the setting from the current
         * state of the file system.
         */
        if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
                btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
                return;
        }

        if (btrfs_free_space_cache_v1_active(fs_info))
                btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
}

/*
 * Turn on mount options that depend on the properties of the underlying
 * devices: SSD mode for non-rotating devices and async discard for
 * discardable, non-zoned devices.
 */
static void set_device_specific_options(struct btrfs_fs_info *fs_info)
{
        if (!btrfs_test_opt(fs_info, NOSSD) && !fs_info->fs_devices->rotating)
                btrfs_set_opt(fs_info->mount_opt, SSD);

        /*
         * For devices supporting discard turn on discard=async automatically,
         * unless a discard mode is already set or discard is disabled.  This
         * could be turned off by nodiscard for the same mount.
         */
        if (btrfs_test_opt(fs_info, DISCARD_SYNC))
                return;
        if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
                return;
        if (btrfs_test_opt(fs_info, NODISCARD))
                return;
        if (!fs_info->fs_devices->discardable)
                return;

        /*
         * The zoned mode piggy backs on the discard functionality for
         * resetting a zone.  There is no reason to delay the zone reset as it
         * is fast enough, so async discard is not enabled for zoned mode.
         */
        if (btrfs_is_zoned(fs_info))
                return;

        btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC);
}

/*
 * Build the path of the subvolume @subvol_objectid by walking root backrefs
 * and inode refs upwards until BTRFS_FS_TREE_OBJECTID is reached.
 *
 * The path is assembled back to front in a PATH_MAX sized buffer: every
 * component found is prepended together with a leading '/'.
 *
 * Returns a kmalloc()ed, NUL terminated string that the caller has to kfree(),
 * or an ERR_PTR(): -ENOMEM on allocation failure, -ENOENT when a backref or
 * inode ref is not found, -ENAMETOOLONG when the path does not fit in
 * PATH_MAX, or the error returned by the tree search / btrfs_get_fs_root().
 * When @subvol_objectid already is BTRFS_FS_TREE_OBJECTID the result is "/".
 */
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
                                          u64 subvol_objectid)
{
        struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_root *fs_root = NULL;
        struct btrfs_root_ref *root_ref;
        struct btrfs_inode_ref *inode_ref;
        struct btrfs_key key;
        struct btrfs_path *path = NULL;
        char *name = NULL, *ptr;
        u64 dirid;
        int len;
        int ret;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto err;
        }

        name = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!name) {
                ret = -ENOMEM;
                goto err;
        }
        /* Start at the last byte of the buffer, it holds the terminating NUL. */
        ptr = name + PATH_MAX - 1;
        ptr[0] = '\0';

        /*
         * Walk up the subvolume trees in the tree of tree roots by root
         * backrefs until we hit the top-level subvolume.
         */
        while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
                key.objectid = subvol_objectid;
                key.type = BTRFS_ROOT_BACKREF_KEY;
                key.offset = (u64)-1;

                ret = btrfs_search_backwards(root, &key, path);
                if (ret < 0) {
                        goto err;
                } else if (ret > 0) {
                        ret = -ENOENT;
                        goto err;
                }

                /*
                 * NOTE(review): presumably btrfs_search_backwards() updated
                 * key to the item it found, so the offset is the id of the
                 * parent subvolume - confirm against its definition.
                 */
                subvol_objectid = key.offset;

                root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                          struct btrfs_root_ref);
                len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
                /* Make room for the name plus the leading '/'. */
                ptr -= len + 1;
                if (ptr < name) {
                        ret = -ENAMETOOLONG;
                        goto err;
                }
                /* The name bytes are read from right behind the root_ref. */
                read_extent_buffer(path->nodes[0], ptr + 1,
                                   (unsigned long)(root_ref + 1), len);
                ptr[0] = '/';
                dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
                btrfs_release_path(path);

                fs_root = btrfs_get_fs_root(fs_info, subvol_objectid, true);
                if (IS_ERR(fs_root)) {
                        ret = PTR_ERR(fs_root);
                        /* Do not hand the error pointer to btrfs_put_root(). */
                        fs_root = NULL;
                        goto err;
                }

                /*
                 * Walk up the filesystem tree by inode refs until we hit the
                 * root directory.
                 */
                while (dirid != BTRFS_FIRST_FREE_OBJECTID) {
                        key.objectid = dirid;
                        key.type = BTRFS_INODE_REF_KEY;
                        key.offset = (u64)-1;

                        ret = btrfs_search_backwards(fs_root, &key, path);
                        if (ret < 0) {
                                goto err;
                        } else if (ret > 0) {
                                ret = -ENOENT;
                                goto err;
                        }

                        /* Continue with the directory taken from the key offset. */
                        dirid = key.offset;

                        inode_ref = btrfs_item_ptr(path->nodes[0],
                                                   path->slots[0],
                                                   struct btrfs_inode_ref);
                        len = btrfs_inode_ref_name_len(path->nodes[0],
                                                       inode_ref);
                        /* Same prepend scheme as for the root ref name above. */
                        ptr -= len + 1;
                        if (ptr < name) {
                                ret = -ENAMETOOLONG;
                                goto err;
                        }
                        read_extent_buffer(path->nodes[0], ptr + 1,
                                           (unsigned long)(inode_ref + 1), len);
                        ptr[0] = '/';
                        btrfs_release_path(path);
                }
                btrfs_put_root(fs_root);
                fs_root = NULL;
        }

        btrfs_free_path(path);
        if (ptr == name + PATH_MAX - 1) {
                /* Nothing was prepended: this is the top-level subvolume. */
                name[0] = '/';
                name[1] = '\0';
        } else {
                /* Move the path, including its NUL, to the start of the buffer. */
                memmove(name, ptr, name + PATH_MAX - ptr);
        }
        return name;

err:
        /* fs_root, path and name may each still be NULL at this point. */
        btrfs_put_root(fs_root);
        btrfs_free_path(path);
        kfree(name);
        return ERR_PTR(ret);
}

/*
 * Look up the id of the subvolume that is mounted when no subvolume was
 * requested explicitly.
 *
 * On success 0 is returned and the id is stored in @objectid.  Returns -ENOMEM
 * if no path could be allocated, or the error of the dir item lookup.
 */
static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid)
{
        struct fscrypt_str name = FSTR_INIT("default", 7);
        struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_key location;
        struct btrfs_dir_item *di;
        struct btrfs_path *path;
        u64 dir_id;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * Find the "default" dir item which points to the root item that we
         * will mount by default if we haven't been given a specific subvolume
         * to mount.
         */
        dir_id = btrfs_super_root_dir(fs_info->super_copy);
        di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0);
        if (IS_ERR(di)) {
                ret = PTR_ERR(di);
        } else if (!di) {
                /*
                 * Ok the default dir item isn't there.  This is weird since
                 * it's always been there, but don't freak out, just try and
                 * mount the top-level subvolume.
                 */
                *objectid = BTRFS_FS_TREE_OBJECTID;
        } else {
                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
                *objectid = location.objectid;
        }

        btrfs_free_path(path);
        return ret;
}

/*
 * Initialize the super block @sb, open the filesystem trees and set up the
 * root dentry.
 *
 * Returns 0 on success or a negative errno.  Once open_ctree() has succeeded,
 * every later failure closes the trees again with close_ctree().
 */
static int btrfs_fill_super(struct super_block *sb,
                            struct btrfs_fs_devices *fs_devices,
                            void *data)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct inode *root_inode;
        int ret;

        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_magic = BTRFS_SUPER_MAGIC;
        sb->s_op = &btrfs_super_ops;
        sb->s_d_op = &btrfs_dentry_operations;
        sb->s_export_op = &btrfs_export_ops;
#ifdef CONFIG_FS_VERITY
        sb->s_vop = &btrfs_verityops;
#endif
        sb->s_xattr = btrfs_xattr_handlers;
        sb->s_time_gran = 1;
        sb->s_iflags |= SB_I_CGROUPWB;

        ret = super_setup_bdi(sb);
        if (ret) {
                btrfs_err(fs_info, "super_setup_bdi failed");
                return ret;
        }

        ret = open_ctree(sb, fs_devices, (char *)data);
        if (ret) {
                btrfs_err(fs_info, "open_ctree failed");
                return ret;
        }

        /* From here on the trees are open and must be closed on failure. */
        root_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
        if (IS_ERR(root_inode)) {
                ret = PTR_ERR(root_inode);
                btrfs_handle_fs_error(fs_info, ret, NULL);
                close_ctree(fs_info);
                return ret;
        }

        sb->s_root = d_make_root(root_inode);
        if (!sb->s_root) {
                close_ctree(fs_info);
                return -ENOMEM;
        }

        sb->s_flags |= SB_ACTIVE;
        return 0;
}

/*
 * Sync the filesystem.
 *
 * Without @wait only the btree inode mapping is flushed.  With @wait the
 * ordered extents are waited for and the current transaction is committed;
 * a new transaction is only started when there are pending changes that need
 * a commit and the filesystem is not frozen.
 *
 * Returns 0 or a negative errno.
 */
int btrfs_sync_fs(struct super_block *sb, int wait)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_trans_handle *trans;

        trace_btrfs_sync_fs(fs_info, wait);

        if (!wait) {
                filemap_flush(fs_info->btree_inode->i_mapping);
                return 0;
        }

        btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);

        trans = btrfs_attach_transaction_barrier(root);
        if (!IS_ERR(trans))
                return btrfs_commit_transaction(trans);

        /* Anything but "no transaction" is a real error. */
        if (PTR_ERR(trans) != -ENOENT)
                return PTR_ERR(trans);

        /*
         * No transaction, don't bother: exit unless we have some pending
         * changes that need to go through commit.
         */
        if (!test_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags))
                return 0;

        /*
         * A non-blocking test if the fs is frozen. We must not start a new
         * transaction here otherwise a deadlock happens. The pending
         * operations are delayed to the next commit after thawing.
         */
        if (!sb_start_write_trylock(sb))
                return 0;
        sb_end_write(sb);

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        return btrfs_commit_transaction(trans);
}

/*
 * Emit one rescue= sub-option.  The first one printed is introduced with
 * ",rescue=", every following one is separated by ':'.  @printed tracks
 * whether a rescue option has been emitted already and is set to true here.
 */
static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed)
{
        const char *prefix;

        if (*printed)
                prefix = ":";
        else
                prefix = ",rescue=";

        seq_printf(seq, "%s%s", prefix, s);
        *printed = true;
}

static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
        struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
        const char *compress_type;
        const char *subvol_name;
        bool printed = false;

        if (btrfs_test_opt(info, DEGRADED))
                seq_puts(seq, ",degraded");
        if (btrfs_test_opt(info, NODATASUM))
                seq_puts(seq, ",nodatasum");
        if (btrfs_test_opt(info, NODATACOW))
                seq_puts(seq, ",nodatacow");
        if (btrfs_test_opt(info, NOBARRIER))
                seq_puts(seq, ",nobarrier");
        if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
                seq_printf(seq, ",max_inline=%llu", info->max_inline);
        if (info->thread_pool_size !=  min_t(unsigned long,
                                             num_online_cpus() + 2, 8))
                seq_printf(seq, ",thread_pool=%u", info->thread_pool_size);
        if (btrfs_test_opt(info, COMPRESS)) {
                compress_type = btrfs_compress_type2str(info->compress_type);
                if (btrfs_test_opt(info, FORCE_COMPRESS))
                        seq_printf(seq, ",compress-force=%s", compress_type);
                else
                        seq_printf(seq, ",compress=%s", compress_type);
                if (info->compress_level)
                        seq_printf(seq, ":%d", info->compress_level);
        }
        if (btrfs_test_opt(info, NOSSD))
                seq_puts(seq, ",nossd");
        if (btrfs_test_opt(info, SSD_SPREAD))
                seq_puts(seq, ",ssd_spread");
        else if (btrfs_test_opt(info, SSD))
                seq_puts(seq, ",ssd");
        if (btrfs_test_opt(info, NOTREELOG))
                seq_puts(seq, ",notreelog");
        if (btrfs_test_opt(info, NOLOGREPLAY))
                print_rescue_option(seq, "nologreplay", &printed);
        if (btrfs_test_opt(info, USEBACKUPROOT))
                print_rescue_option(seq, "usebackuproot", &printed);
        if (btrfs_test_opt(info, IGNOREBADROOTS))
                print_rescue_option(seq, "ignorebadroots", &printed);
        if (btrfs_test_opt(info, IGNOREDATACSUMS))
                print_rescue_option(seq, "ignoredatacsums", &printed);
        if (btrfs_test_opt(info, FLUSHONCOMMIT))
                seq_puts(seq, ",flushoncommit");
        if (btrfs_test_opt(info, DISCARD_SYNC))
                seq_puts(seq, ",discard");
        if (btrfs_test_opt(info, DISCARD_ASYNC))
                seq_puts(seq, ",discard=async");
        if (!(info->sb->s_flags & SB_POSIXACL))
                seq_puts(seq, ",noacl");
        if (btrfs_free_space_cache_v1_active(info))
                seq_puts(seq, ",space_cache");
        else if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
                seq_puts(seq, ",space_cache=v2");
        else
                seq_puts(seq, ",nospace_cache");
        if (btrfs_test_opt(info, RESCAN_UUID_TREE))
                seq_puts(seq, ",rescan_uuid_tree");
        if (btrfs_test_opt(info, CLEAR_CACHE))
                seq_puts(seq, ",clear_cache");
        if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED))
                seq_puts(seq, ",user_subvol_rm_allowed");
        if (btrfs_test_opt(info, ENOSPC_DEBUG))
                seq_puts(seq, ",enospc_debug");
        if (btrfs_test_opt(info, AUTO_DEFRAG))
                seq_puts(seq, ",autodefrag");
        if (btrfs_test_opt(info, SKIP_BALANCE))
                seq_puts(seq, ",skip_balance");
        if (info->metadata_ratio)
                seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
        if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
                seq_puts(seq, ",fatal_errors=panic");
        if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
                seq_printf(seq, ",commit=%u", info->commit_interval);
#ifdef CONFIG_BTRFS_DEBUG
        if (btrfs_test_opt(info, FRAGMENT_DATA))
                seq_puts(seq, ",fragment=data");
        if (btrfs_test_opt(info, FRAGMENT_METADATA))
                seq_puts(seq, ",fragment=metadata");
#endif
        if (btrfs_test_opt(info, REF_VERIFY))
                seq_puts(seq, ",ref_verify");
        seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
        subvol_name = btrfs_get_subvol_name_from_objectid(info,
                        btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
        if (!IS_ERR(subvol_name)) {
                seq_puts(seq, ",subvol=");
                seq_escape(seq, subvol_name, " \t\n\\");
                kfree(subvol_name);
        }
        return 0;
}

/*
 * subvolumes are identified by ino 256
 */
static inline int is_subvolume_inode(struct inode *inode)
{
        /* A NULL inode is never a subvolume root; yields 1 or 0. */
        return inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID;
}

/*
 * Resolve the subvolume to mount and return its root dentry.
 *
 * If @subvol_name is NULL the path is looked up from @subvol_objectid; if that
 * is 0 as well, the default subvolume id is looked up first.  After
 * mount_subtree() the resulting root is checked to be a subvolume root inode
 * and, when @subvol_objectid is non-zero, to belong to that subvolume.
 *
 * @subvol_name is always kfree()d before returning, whether it was passed in
 * by the caller or looked up here.  The reference on @mnt is dropped either by
 * mount_subtree() or, on the early error paths, by mntput().
 *
 * Returns the root dentry or an ERR_PTR().
 */
static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
                                   struct vfsmount *mnt)
{
        struct dentry *root;
        int ret;

        if (!subvol_name) {
                if (!subvol_objectid) {
                        ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
                                                          &subvol_objectid);
                        if (ret) {
                                root = ERR_PTR(ret);
                                goto out;
                        }
                }
                subvol_name = btrfs_get_subvol_name_from_objectid(
                                        btrfs_sb(mnt->mnt_sb), subvol_objectid);
                if (IS_ERR(subvol_name)) {
                        root = ERR_CAST(subvol_name);
                        /* Keep the kfree() at the out label from seeing an error pointer. */
                        subvol_name = NULL;
                        goto out;
                }

        }

        root = mount_subtree(mnt, subvol_name);
        /* mount_subtree() drops our reference on the vfsmount. */
        mnt = NULL;

        if (!IS_ERR(root)) {
                struct super_block *s = root->d_sb;
                struct btrfs_fs_info *fs_info = btrfs_sb(s);
                struct inode *root_inode = d_inode(root);
                u64 root_objectid = btrfs_root_id(BTRFS_I(root_inode)->root);

                ret = 0;
                /* Both checks are run so that each mismatch gets its own message. */
                if (!is_subvolume_inode(root_inode)) {
                        btrfs_err(fs_info, "'%s' is not a valid subvolume",
                               subvol_name);
                        ret = -EINVAL;
                }
                if (subvol_objectid && root_objectid != subvol_objectid) {
                        /*
                         * This will also catch a race condition where a
                         * subvolume which was passed by ID is renamed and
                         * another subvolume is renamed over the old location.
                         */
                        btrfs_err(fs_info,
                                  "subvol '%s' does not match subvolid %llu",
                                  subvol_name, subvol_objectid);
                        ret = -EINVAL;
                }
                if (ret) {
                        /* Undo the successful mount_subtree() before failing. */
                        dput(root);
                        root = ERR_PTR(ret);
                        deactivate_locked_super(s);
                }
        }

out:
        /* mnt is NULL here once mount_subtree() has been called. */
        mntput(mnt);
        kfree(subvol_name);
        return root;
}

/*
 * Apply a new thread pool size to the worker queues of @fs_info.
 *
 * Nothing is done when @new_pool_size equals @old_pool_size.  Otherwise the
 * new size is stored in fs_info->thread_pool_size, the change is logged and
 * the limit of each resizable workqueue is updated.
 */
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
                                     u32 new_pool_size, u32 old_pool_size)
{
        if (new_pool_size == old_pool_size)
                return;

        fs_info->thread_pool_size = new_pool_size;

        /* Both sizes are u32, so they have to be printed as unsigned. */
        btrfs_info(fs_info, "resize thread pool %u -> %u",
               old_pool_size, new_pool_size);

        btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
        btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
        btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
        workqueue_set_max_active(fs_info->endio_workers, new_pool_size);
        workqueue_set_max_active(fs_info->endio_meta_workers, new_pool_size);
        btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
        btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
        btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
}

static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
                                       unsigned long old_opts, int flags)
{
        if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
            (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
             (flags & SB_RDONLY))) {
                /* wait for any defraggers to finish */
                wait_event(fs_info->transaction_wait,
                           (atomic_read(&fs_info->defrag_running) == 0));
                if (flags & SB_RDONLY)
                        sync_filesystem(fs_info->sb);
        }
}

/*
 * Finish a remount by bringing the runtime state in line with the new mount
 * options: drop the defrag inodes, resume or clean up async discard and update
 * the v1 free space cache state, each only if the setting actually changed
 * compared to @old_opts.
 */
static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
                                         unsigned long old_opts)
{
        const bool want_v1_cache = btrfs_test_opt(fs_info, SPACE_CACHE);
        bool had_async_discard;
        bool has_async_discard;

        /*
         * We need to clean up all defraggable inodes if autodefrag got
         * disabled or the filesystem is read-only.
         */
        if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG)) {
                if (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
                    sb_rdonly(fs_info->sb))
                        btrfs_cleanup_defrag_inodes(fs_info);
        }

        /* If we toggled discard async */
        had_async_discard = btrfs_raw_test_opt(old_opts, DISCARD_ASYNC);
        has_async_discard = btrfs_test_opt(fs_info, DISCARD_ASYNC);
        if (!had_async_discard && has_async_discard)
                btrfs_discard_resume(fs_info);
        else if (had_async_discard && !has_async_discard)
                btrfs_discard_cleanup(fs_info);

        /* If we toggled space cache */
        if (want_v1_cache != btrfs_free_space_cache_v1_active(fs_info))
                btrfs_set_free_space_cache_v1_active(fs_info, want_v1_cache);
}

/*
 * Switch the filesystem to read-write on remount.
 *
 * Returns 0 on success.  Returns -EINVAL if the filesystem is in an error
 * state or has a tree-log that needs to be replayed, -EACCES if there are no
 * writable devices or too many devices are missing, or the error returned by
 * btrfs_start_pre_rw_mount().
 */
static int btrfs_remount_rw(struct btrfs_fs_info *fs_info)
{
        int ret;

        if (BTRFS_FS_ERROR(fs_info)) {
                btrfs_err(fs_info,
                          "remounting read-write after error is not allowed");
                return -EINVAL;
        }

        if (fs_info->fs_devices->rw_devices == 0)
                return -EACCES;

        if (!btrfs_check_rw_degradable(fs_info, NULL)) {
                btrfs_warn(fs_info,
                           "too many missing devices, writable remount is not allowed");
                return -EACCES;
        }

        /* A non-zero log root in the super block means a tree-log exists. */
        if (btrfs_super_log_root(fs_info->super_copy) != 0) {
                btrfs_warn(fs_info,
                           "mount required to replay tree-log, cannot remount read-write");
                return -EINVAL;
        }

        /*
         * NOTE: when remounting with a change that does writes, don't put it
         * anywhere above this point, as we are not sure to be safe to write
         * until we pass the above checks.
         */
        ret = btrfs_start_pre_rw_mount(fs_info);
        if (ret)
                return ret;

        /* Only drop the read-only state once the pre-rw work has succeeded. */
        btrfs_clear_sb_rdonly(fs_info->sb);

        set_bit(BTRFS_FS_OPEN, &fs_info->flags);

        /*
         * If we've gone from readonly -> read-write, we need to get our
         * sync/async discard lists in the right state.
         */
        btrfs_discard_resume(fs_info);

        return 0;
}

/*
 * Switch the filesystem to read-only on remount.
 *
 * Background work is cancelled, paused or waited for, the super block is
 * marked read-only and finally the super block is committed.  The order of
 * the steps matters, see the comments on the individual steps.
 *
 * Returns the result of btrfs_commit_super().
 */
static int btrfs_remount_ro(struct btrfs_fs_info *fs_info)
{
        /*
         * This also happens on 'umount -rf' or on shutdown, when the
         * filesystem is busy.
         */
        cancel_work_sync(&fs_info->async_reclaim_work);
        cancel_work_sync(&fs_info->async_data_reclaim_work);

        btrfs_discard_cleanup(fs_info);

        /* Wait for the uuid_scan task to finish */
        down(&fs_info->uuid_tree_rescan_sem);
        /* Avoid complains from lockdep et al. */
        up(&fs_info->uuid_tree_rescan_sem);

        btrfs_set_sb_rdonly(fs_info->sb);

        /*
         * Setting SB_RDONLY will put the cleaner thread to sleep at the next
         * loop if it's already active.  If it's already asleep, we'll leave
         * unused block groups on disk until we're mounted read-write again
         * unless we clean them up here.
         */
        btrfs_delete_unused_bgs(fs_info);

        /*
         * The cleaner task could be already running before we set the flag
         * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).  We must make
         * sure that after we finish the remount, i.e. after we call
         * btrfs_commit_super(), the cleaner can no longer start a transaction
         * - either because it was dropping a dead root, running delayed iputs
         *   or deleting an unused block group (the cleaner picked a block
         *   group from the list of unused block groups before we were able to
         *   in the previous call to btrfs_delete_unused_bgs()).
         */
        wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE);

        /*
         * We've set the superblock to RO mode, so we might have made the
         * cleaner task sleep without running all pending delayed iputs. Go
         * through all the delayed iputs here, so that if an unmount happens
         * without remounting RW we don't end up at finishing close_ctree()
         * with a non-empty list of delayed iputs.
         */
        btrfs_run_delayed_iputs(fs_info);

        /* Stop or pause the long running operations: replace, scrub, balance. */
        btrfs_dev_replace_suspend_for_unmount(fs_info);
        btrfs_scrub_cancel(fs_info);
        btrfs_pause_balance(fs_info);

        /*
         * Pause the qgroup rescan worker if it is running. We don't want it to
         * be still running after we are in RO mode, as after that, by the time
         * we unmount, it might have left a transaction open, so we would leak
         * the transaction and/or crash.
         */
        btrfs_qgroup_wait_for_completion(fs_info, false);

        return btrfs_commit_super(fs_info);
}

/*
 * Copy the mount settings held in @ctx into @fs_info.  This is the inverse of
 * btrfs_info_to_ctx() and covers the same set of fields.
 */
static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
{
        /* Option bitmask. */
        fs_info->mount_opt = ctx->mount_opt;

        /* Compression settings. */
        fs_info->compress_type = ctx->compress_type;
        fs_info->compress_level = ctx->compress_level;

        /* Numeric tunables. */
        fs_info->thread_pool_size = ctx->thread_pool_size;
        fs_info->metadata_ratio = ctx->metadata_ratio;
        fs_info->commit_interval = ctx->commit_interval;
        fs_info->max_inline = ctx->max_inline;
}

/*
 * Save the current mount settings of @fs_info into @ctx.  This is the inverse
 * of btrfs_ctx_to_info() and covers the same set of fields.
 */
static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
{
        /* Option bitmask. */
        ctx->mount_opt = fs_info->mount_opt;

        /* Compression settings. */
        ctx->compress_type = fs_info->compress_type;
        ctx->compress_level = fs_info->compress_level;

        /* Numeric tunables. */
        ctx->thread_pool_size = fs_info->thread_pool_size;
        ctx->metadata_ratio = fs_info->metadata_ratio;
        ctx->commit_interval = fs_info->commit_interval;
        ctx->max_inline = fs_info->max_inline;
}

/*
 * Log @fmt at info level if mount option @opt is set in fs_info->mount_opt and
 * was not set before, i.e. @old_ctx is NULL or @opt is clear in
 * old_ctx->mount_opt.
 *
 * The arguments are expanded without parentheses, so pass plain identifiers.
 */
#define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...)                        \
do {                                                                                \
        if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) &&        \
            btrfs_raw_test_opt(fs_info->mount_opt, opt))                        \
                btrfs_info(fs_info, fmt, ##args);                                \
} while (0)

/*
 * Log @fmt at info level if mount option @opt was set in old_ctx->mount_opt
 * and is no longer set in fs_info->mount_opt.  Nothing is logged when
 * @old_ctx is NULL.
 *
 * The arguments are expanded without parentheses, so pass plain identifiers.
 */
#define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...)        \
do {                                                                        \
        if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) &&        \
            !btrfs_raw_test_opt(fs_info->mount_opt, opt))                \
                btrfs_info(fs_info, fmt, ##args);                        \
} while (0)

/*
 * Log the mount options that changed.
 *
 * @info: filesystem whose current options are reported
 * @old:  the previous settings to compare against, or NULL.  With a NULL @old
 *        every option that is set is reported and no "unset" message is
 *        emitted.
 *
 * One info message is printed for each option that became set or unset, for a
 * change of the compression settings, and for a max_inline value that differs
 * from the default.
 */
static void btrfs_emit_options(struct btrfs_fs_info *info,
                               struct btrfs_fs_context *old)
{
        btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
        btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
        /* Report nodatacow here; nodatasum is already covered above. */
        btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
        btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
        btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
        btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
        btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log");
        btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time");
        btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit");
        btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard");
        btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard");
        btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree");
        btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching");
        btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache");
        btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag");
        btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data");
        btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata");
        btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification");
        btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time");
        btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots");
        btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums");

        btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
        btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
        btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
        /* Clearing NOBARRIER means barriers are enabled again. */
        btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
        btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
        btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
        btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
        btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag");
        btrfs_info_if_unset(info, old, COMPRESS, "use no compression");

        /*
         * Did the compression settings change?  The !old test comes first so
         * that old is never dereferenced when it is NULL.
         */
        if (btrfs_test_opt(info, COMPRESS) &&
            (!old ||
             old->compress_type != info->compress_type ||
             old->compress_level != info->compress_level ||
             (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) &&
              btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) {
                const char *compress_type = btrfs_compress_type2str(info->compress_type);

                btrfs_info(info, "%s %s compression, level %d",
                           btrfs_test_opt(info, FORCE_COMPRESS) ? "force" : "use",
                           compress_type, info->compress_level);
        }

        if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
                btrfs_info(info, "max_inline set to %llu", info->max_inline);
}

/*
 * Apply the mount options and ro/rw state carried by @fc to the mounted
 * filesystem behind fc->root.
 *
 * When fc->s_fs_info is set the call comes from the mount path ("bind mount"
 * trick): the existing mount options are kept and only the ro/rw state may
 * change.
 *
 * BTRFS_FS_STATE_REMOUNTING is set for the duration of the operation and is
 * cleared on every exit path, including validation failures.
 *
 * Returns 0 on success, -EINVAL if the option check fails, or the error
 * returned by the feature check or by the ro/rw transition.
 */
static int btrfs_reconfigure(struct fs_context *fc)
{
        struct super_block *sb = fc->root->d_sb;
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_fs_context *ctx = fc->fs_private;
        struct btrfs_fs_context old_ctx;
        int ret = 0;
        bool mount_reconfigure = (fc->s_fs_info != NULL);

        /* Snapshot the current settings so they can be restored on failure. */
        btrfs_info_to_ctx(fs_info, &old_ctx);

        /*
         * This is our "bind mount" trick, we don't want to allow the user to do
         * anything other than mount a different ro/rw and a different subvol,
         * all of the mount options should be maintained.
         */
        if (mount_reconfigure)
                ctx->mount_opt = old_ctx.mount_opt;

        sync_filesystem(sb);
        set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);

        /*
         * Nothing has been applied to fs_info yet, so on failure here only the
         * REMOUNTING state bit needs to be dropped again.
         */
        if (!mount_reconfigure &&
            !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) {
                ret = -EINVAL;
                goto out_clear;
        }

        ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY));
        if (ret < 0)
                goto out_clear;

        btrfs_ctx_to_info(fs_info, ctx);
        btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags);
        btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size,
                                 old_ctx.thread_pool_size);

        if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
            (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
            (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) {
                btrfs_warn(fs_info,
                "remount supports changing free space tree only from RO to RW");
                /* Make sure free space cache options match the state on disk. */
                if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
                        btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
                        btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
                }
                if (btrfs_free_space_cache_v1_active(fs_info)) {
                        btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
                        btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
                }
        }

        /* Perform the ro <-> rw transition only if the state actually changes. */
        ret = 0;
        if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY))
                ret = btrfs_remount_ro(fs_info);
        else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY))
                ret = btrfs_remount_rw(fs_info);
        if (ret)
                goto restore;

        /*
         * If we set the mask during the parameter parsing VFS would reject the
         * remount.  Here we can set the mask and the value will be updated
         * appropriately.
         */
        if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL))
                fc->sb_flags_mask |= SB_POSIXACL;

        btrfs_emit_options(fs_info, &old_ctx);
        wake_up_process(fs_info->transaction_kthread);
        btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
        btrfs_clear_oneshot_options(fs_info);
        clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);

        return 0;
restore:
        /* Roll fs_info back to the settings captured at entry. */
        btrfs_ctx_to_info(fs_info, &old_ctx);
        btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
out_clear:
        clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
        return ret;
}

/*
 * sort() comparator ordering btrfs_device_info entries by max_avail, largest
 * first (descending).
 */
static int btrfs_cmp_device_free_bytes(const void *a, const void *b)
{
        const struct btrfs_device_info *lhs = a;
        const struct btrfs_device_info *rhs = b;

        if (lhs->max_avail == rhs->max_avail)
                return 0;

        /* A larger max_avail sorts earlier. */
        return (lhs->max_avail > rhs->max_avail) ? -1 : 1;
}

/*
 * Sort @devices in place by max_avail (the max free extent size stored for
 * each device), in descending order.
 */
static inline void btrfs_descending_sort_devices(
                                        struct btrfs_device_info *devices,
                                        size_t nr_devices)
{
        sort(devices, nr_devices, sizeof(*devices),
             btrfs_cmp_device_free_bytes, NULL);
}

/*
 * Helper to calculate the free space on the devices that can be used to store
 * file data.
 *
 * The result is stored in @free_bytes.  Returns 0 on success or -ENOMEM if
 * the temporary per-device array cannot be allocated.
 */
static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
                                              u64 *free_bytes)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        const struct btrfs_raid_attr *rattr;
        struct btrfs_device_info *candidates;
        struct btrfs_device *device;
        u64 profile;
        u64 space;
        u64 min_stripe_size;
        int num_stripes = 1;
        int nr_devices;
        int idx = 0;

        /*
         * The device list lock is not held, so the count is racy-ish, but it
         * is good enough for this estimate.
         */
        nr_devices = fs_info->fs_devices->open_devices;
        if (!nr_devices) {
                smp_mb();
                nr_devices = fs_info->fs_devices->open_devices;
                ASSERT(nr_devices);
                if (!nr_devices) {
                        *free_bytes = 0;
                        return 0;
                }
        }

        candidates = kmalloc_array(nr_devices, sizeof(*candidates), GFP_KERNEL);
        if (!candidates)
                return -ENOMEM;

        /* Work out the minimum stripe count for data allocations. */
        profile = btrfs_data_alloc_profile(fs_info);
        rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(profile)];

        if (profile & BTRFS_BLOCK_GROUP_RAID0)
                num_stripes = nr_devices;
        else if (profile & BTRFS_BLOCK_GROUP_RAID1_MASK)
                num_stripes = rattr->ncopies;
        else if (profile & BTRFS_BLOCK_GROUP_RAID10)
                num_stripes = 4;

        /* Account for profiles placing more than one stripe per device. */
        min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN;

        rcu_read_lock();
        list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
                const bool usable =
                        test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                 &device->dev_state) &&
                        device->bdev &&
                        !test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                  &device->dev_state);

                if (!usable)
                        continue;

                /* Never write past the array sized from the racy count. */
                if (idx >= nr_devices)
                        break;

                space = device->total_bytes - device->bytes_used;

                /* Align down to the stripe length. */
                space = rounddown(space, BTRFS_STRIPE_LEN);

                /*
                 * Only keep devices that have at least min_stripe_size on top
                 * of the reserved range.
                 */
                if (space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size)
                        continue;

                candidates[idx].dev = device;
                candidates[idx].max_avail = space - BTRFS_DEVICE_RANGE_RESERVED;
                idx++;
        }
        rcu_read_unlock();

        nr_devices = idx;

        btrfs_descending_sort_devices(candidates, nr_devices);

        /*
         * Walk the sorted array from its last (smallest) entry.  Each round
         * takes that entry's remaining space from the num_stripes entries
         * ending at it, then drops it from further consideration.
         */
        space = 0;
        for (idx = nr_devices - 1; nr_devices >= rattr->devs_min;
             idx--, nr_devices--) {
                u64 alloc_size;
                int k;

                num_stripes = min(num_stripes, nr_devices);

                if (candidates[idx].max_avail < min_stripe_size)
                        continue;

                alloc_size = candidates[idx].max_avail;
                space += alloc_size * num_stripes;
                for (k = idx + 1 - num_stripes; k <= idx; k++)
                        candidates[k].max_avail -= alloc_size;
        }

        kfree(candidates);
        *free_bytes = space;
        return 0;
}

/*
 * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
 *
 * If there's a redundant raid level at DATA block groups, use the respective
 * multiplier to scale the sizes.
 *
 * Unused device space usage is based on simulating the chunk allocator
 * algorithm that respects the device sizes and order of allocations.  This is
 * a close approximation of the actual use but there are other factors that may
 * change the result (like a new metadata chunk).
 *
 * If metadata is exhausted, f_bavail will be 0.
 */
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
        struct btrfs_super_block *disk_super = fs_info->super_copy;
        struct btrfs_space_info *found;
        u64 total_used = 0;
        u64 total_free_data = 0;
        u64 total_free_meta = 0;
        u32 bits = fs_info->sectorsize_bits;
        __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
        unsigned factor = 1;
        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
        int ret;
        u64 thresh = 0;
        int mixed = 0;

        list_for_each_entry(found, &fs_info->space_info, list) {
                if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
                        int i;

                        total_free_data += found->disk_total - found->disk_used;
                        total_free_data -=
                                btrfs_account_ro_block_groups_free_space(found);

                        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
                                if (!list_empty(&found->block_groups[i]))
                                        factor = btrfs_bg_type_to_factor(
                                                btrfs_raid_array[i].bg_flag);
                        }
                }

                /*
                 * Metadata in mixed block group profiles are accounted in data
                 */
                if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
                        if (found->flags & BTRFS_BLOCK_GROUP_DATA)
                                mixed = 1;
                        else
                                total_free_meta += found->disk_total -
                                        found->disk_used;
                }

                total_used += found->disk_used;
        }

        buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
        buf->f_blocks >>= bits;
        buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);

        /* Account global block reserve as used, it's in logical size already */
        spin_lock(&block_rsv->lock);
        /* Mixed block groups accounting is not byte-accurate, avoid overflow */
        if (buf->f_bfree >= block_rsv->size >> bits)
                buf->f_bfree -= block_rsv->size >> bits;
        else
                buf->f_bfree = 0;
        spin_unlock(&block_rsv->lock);

        buf->f_bavail = div_u64(total_free_data, factor);
        ret = btrfs_calc_avail_data_space(fs_info, &total_free_data);
        if (ret)
                return ret;
        buf->f_bavail += div_u64(total_free_data, factor);
        buf->f_bavail = buf->f_bavail >> bits;

        /*
         * We calculate the remaining metadata space minus global reserve. If
         * this is (supposedly) smaller than zero, there's no space. But this
         * does not hold in practice, the exhausted state happens where's still
         * some positive delta. So we apply some guesswork and compare the
         * delta to a 4M threshold.  (Practically observed delta was ~2M.)
         *
         * We probably cannot calculate the exact threshold value because this
         * depends on the internal reservations requested by various
         * operations, so some operations that consume a few metadata will
         * succeed even if the Avail is zero. But this is better than the other
         * way around.
         */
        thresh = SZ_4M;

        /*
         * We only want to claim there's no available space if we can no longer
         * allocate chunks for our metadata profile and our global reserve will
         * not fit in the free metadata space.  If we aren't ->full then we
         * still can allocate chunks and thus are fine using the currently
         * calculated f_bavail.
         */
        if (!mixed && block_rsv->space_info->full &&
            (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size))
                buf->f_bavail = 0;

        buf->f_type = BTRFS_SUPER_MAGIC;
        buf->f_bsize = fs_info->sectorsize;
        buf->f_namelen = BTRFS_NAME_LEN;

        /* We treat it as constant endianness (it doesn't matter _which_)
           because we want the fsid to come out the same whether mounted
           on a big-endian or little-endian host */
        buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
        buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
        /* Mask in the root object ID too, to disambiguate subvols */
        buf->f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32;
        buf->f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root);

        return 0;
}

static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc)
{
        struct btrfs_fs_info *p = fc->s_fs_info;
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);

        return fs_info->fs_devices == p->fs_devices;
}

/*
 * Second stage of mounting: scan and open the devices named by fc->source,
 * obtain a super block via sget_fc() and, if it is a new one, fill it.
 *
 * On success fc->root holds a reference to the super block's root dentry and
 * 0 is returned.  If an already set up super block is found (sb->s_root is
 * set) whose SB_RDONLY state differs from the requested one, -EBUSY is
 * returned.  Other failures return the error of the step that failed.
 */
static int btrfs_get_tree_super(struct fs_context *fc)
{
        struct btrfs_fs_info *fs_info = fc->s_fs_info;
        struct btrfs_fs_context *ctx = fc->fs_private;
        struct btrfs_fs_devices *fs_devices = NULL;
        struct block_device *bdev;
        struct btrfs_device *device;
        struct super_block *sb;
        blk_mode_t mode = btrfs_open_mode(fc);
        int ret;

        /* Copy the parsed mount options into the fs_info. */
        btrfs_ctx_to_info(fs_info, ctx);
        mutex_lock(&uuid_mutex);

        /*
         * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
         * either a valid device or an error.
         */
        device = btrfs_scan_one_device(fc->source, mode, true);
        ASSERT(device != NULL);
        if (IS_ERR(device)) {
                mutex_unlock(&uuid_mutex);
                return PTR_ERR(device);
        }

        fs_devices = device->fs_devices;
        fs_info->fs_devices = fs_devices;

        ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type);
        mutex_unlock(&uuid_mutex);
        if (ret)
                return ret;

        /* A read-write mount needs at least one writeable device. */
        if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
                ret = -EACCES;
                goto error;
        }

        bdev = fs_devices->latest_dev->bdev;

        /*
         * From now on the error handling is not straightforward.
         *
         * If successful, this will transfer the fs_info into the super block,
         * and fc->s_fs_info will be NULL.  However if there's an existing
         * super, we'll still have fc->s_fs_info populated.  If we error
         * completely out it'll be cleaned up when we drop the fs_context,
         * otherwise it's tied to the lifetime of the super_block.
         */
        sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
        if (IS_ERR(sb)) {
                ret = PTR_ERR(sb);
                goto error;
        }

        set_device_specific_options(fs_info);

        if (sb->s_root) {
                /*
                 * Existing, already set up super block: drop the device
                 * references taken above and only check that the requested
                 * ro/rw state matches.  ret is still 0 here unless the states
                 * differ.
                 */
                btrfs_close_devices(fs_devices);
                if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)
                        ret = -EBUSY;
        } else {
                /* New super block: name it after the device and fill it. */
                snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
                shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
                btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
                ret = btrfs_fill_super(sb, fs_devices, NULL);
        }

        if (ret) {
                deactivate_locked_super(sb);
                return ret;
        }

        btrfs_clear_oneshot_options(fs_info);

        fc->root = dget(sb->s_root);
        return 0;

error:
        /* Only reached before a super block was obtained. */
        btrfs_close_devices(fs_devices);
        return ret;
}

/*
 * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes
 * with different ro/rw options") the following works:
 *
 *        (i) mount /dev/sda3 -o subvol=foo,ro /mnt/foo
 *       (ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar
 *
 * which looks nice and innocent but is actually pretty intricate and deserves
 * a long comment.
 *
 * On another filesystem a subvolume mount is close to something like:
 *
 *        (iii) # create rw superblock + initial mount
 *              mount -t xfs /dev/sdb /opt/
 *
 *              # create ro bind mount
 *              mount --bind -o ro /opt/foo /mnt/foo
 *
 *              # unmount initial mount
 *              umount /opt
 *
 * Of course, there's some special subvolume sauce and there's the fact that the
 * sb->s_root dentry is really swapped after mount_subtree(). But conceptually
 * it's very close and will help us understand the issue.
 *
 * The old mount API didn't cleanly distinguish between a mount being made ro
 * and a superblock being made ro.  The only way to change the ro state of
 * either object was by passing MS_RDONLY. If a new mount was created via
 * mount(2) such as:
 *
 *      mount("/dev/sdb", "/mnt", "xfs", MS_RDONLY, NULL);
 *
 * the MS_RDONLY flag being specified had two effects:
 *
 * (1) MNT_READONLY was raised -> the resulting mount got
 *     @mnt->mnt_flags |= MNT_READONLY raised.
 *
 * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystems
 *     made the superblock ro. Note, how SB_RDONLY has the same value as
 *     MS_RDONLY and is raised whenever MS_RDONLY is passed through mount(2).
 *
 * Creating a subtree mount via (iii) ends up leaving a rw superblock with a
 * subtree mounted ro.
 *
 * But consider the effect on the old mount API on btrfs subvolume mounting
 * which combines the distinct step in (iii) into a single step.
 *
 * By issuing (i) both the mount and the superblock are turned ro. Now when (ii)
 * is issued the superblock is ro and thus even if the mount created for (ii) is
 * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro
 * to rw for (ii) which it did using an internal remount call.
 *
 * IOW, subvolume mounting was inherently complicated due to the ambiguity of
 * MS_RDONLY in mount(2). Note, this ambiguity has mount(8) always translate
 * "ro" to MS_RDONLY. IOW, in both (i) and (ii) "ro" becomes MS_RDONLY when
 * passed by mount(8) to mount(2).
 *
 * Enter the new mount API. The new mount API disambiguates making a mount ro
 * and making a superblock ro.
 *
 * (3) To turn a mount ro the MOUNT_ATTR_RDONLY flag can be used with either
 *     fsmount() or mount_setattr(). This is a pure VFS level change for a
 *     specific mount or mount tree that is never seen by the filesystem itself.
 *
 * (4) To turn a superblock ro the "ro" flag must be used with
 *     fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem
 *     in fc->sb_flags.
 *
 * This disambiguation has rather positive consequences.  Mounting a subvolume
 * ro will not also turn the superblock ro. Only the mount for the subvolume
 * will become ro.
 *
 * So, if the superblock creation request comes from the new mount API the
 * caller must have explicitly done:
 *
 *      fsconfig(FSCONFIG_SET_FLAG, "ro")
 *      fsmount/mount_setattr(MOUNT_ATTR_RDONLY)
 *
 * IOW, at some point the caller must have explicitly turned the whole
 * superblock ro and we shouldn't just undo it like we did for the old mount
 * API. In any case, it lets us avoid the hack in the new mount API.
 *
 * Consequently, the remounting hack must only be used for requests originating
 * from the old mount API and should be marked for full deprecation so it can be
 * turned off in a couple of years.
 *
 * The new mount API has no reason to support this hack.
 */
static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc)
{
        /* Sampled before the flag is flipped: true if rw was requested. */
        const bool ro2rw = !(fc->sb_flags & SB_RDONLY);
        struct vfsmount *mnt;
        int ret;

        /*
         * We got an EBUSY because our SB_RDONLY flag didn't match the existing
         * super block, so invert our setting here and retry the mount so we
         * can get our vfsmount.
         */
        fc->sb_flags ^= SB_RDONLY;

        mnt = fc_mount(fc);
        if (IS_ERR(mnt))
                return mnt;

        /*
         * Only a rw request coming through the old mount API gets the
         * super block converted to rw via reconfigure.
         */
        if (fc->oldapi && ro2rw) {
                fc->sb_flags &= ~SB_RDONLY;
                down_write(&mnt->mnt_sb->s_umount);
                ret = btrfs_reconfigure(fc);
                up_write(&mnt->mnt_sb->s_umount);
                if (ret) {
                        mntput(mnt);
                        mnt = ERR_PTR(ret);
                }
        }

        return mnt;
}

/*
 * First stage of mounting: allocate a fresh fs_info, mount the filesystem
 * through a duplicated fs_context (which re-enters btrfs_get_tree() and ends
 * up in btrfs_get_tree_super()), then look up the requested subvolume and
 * install its dentry as fc->root.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error from
 * duplicating the context, mounting, or looking up the subvolume.
 */
static int btrfs_get_tree_subvol(struct fs_context *fc)
{
        struct btrfs_fs_info *fs_info = NULL;
        struct btrfs_fs_context *ctx = fc->fs_private;
        struct fs_context *dup_fc;
        struct dentry *dentry;
        struct vfsmount *mnt;

        /*
         * Set up a dummy fs_info for test/set super.  We don't actually fill
         * this out until open_ctree(), which will properly initialize the
         * file system specific settings later.  btrfs_init_fs_info
         * initializes the static elements of the fs_info (locks and such) to
         * make cleanup easier if we find a superblock with our given
         * fs_devices later on at sget() time.
         */
        fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
        if (!fs_info)
                return -ENOMEM;

        fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
        fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
        if (!fs_info->super_copy || !fs_info->super_for_commit) {
                btrfs_free_fs_info(fs_info);
                return -ENOMEM;
        }
        btrfs_init_fs_info(fs_info);

        dup_fc = vfs_dup_fs_context(fc);
        if (IS_ERR(dup_fc)) {
                btrfs_free_fs_info(fs_info);
                return PTR_ERR(dup_fc);
        }

        /*
         * When we do the sget_fc this gets transferred to the sb, so we only
         * need to set it on the dup_fc as this is what creates the super block.
         */
        dup_fc->s_fs_info = fs_info;

        /*
         * We'll do the security settings in our btrfs_get_tree_super() mount
         * loop, they were duplicated into dup_fc, we can drop the originals
         * here.
         */
        security_free_mnt_opts(&fc->security);
        fc->security = NULL;

        /*
         * -EBUSY means an existing super block has a different ro/rw state;
         * retry through the reconfigure path.
         */
        mnt = fc_mount(dup_fc);
        if (PTR_ERR_OR_ZERO(mnt) == -EBUSY)
                mnt = btrfs_reconfigure_for_mount(dup_fc);
        put_fs_context(dup_fc);
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);

        /*
         * This frees ->subvol_name, because if it isn't set we have to
         * allocate a buffer to hold the subvol_name, so we just drop our
         * reference to it here.
         */
        dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt);
        ctx->subvol_name = NULL;
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);

        fc->root = dentry;
        return 0;
}

/*
 * ->get_tree entry point, dispatching between the two mount stages.
 *
 * Since we use mount_subtree to mount the default/specified subvol, mounts
 * are done in two steps:
 *
 *  1) The first pass has no fc->s_fs_info and goes to
 *     btrfs_get_tree_subvol(), which is a wrapper around fc_mount() that
 *     calls back into here.
 *  2) The nested pass has fc->s_fs_info set and goes to
 *     btrfs_get_tree_super(), which does the open_ctree() and everything
 *     else needed to open the devices and file system.
 *
 * Back in btrfs_get_tree_subvol() we then have a fully constructed vfsmount
 * and can do the mount_subvol() call, which looks up whichever subvol we're
 * mounting and sets up this fc with the appropriate dentry.
 */
static int btrfs_get_tree(struct fs_context *fc)
{
        if (!fc->s_fs_info)
                return btrfs_get_tree_subvol(fc);

        return btrfs_get_tree_super(fc);
}

/* ->kill_sb: kill the anonymous super block, then free its fs_info. */
static void btrfs_kill_super(struct super_block *sb)
{
        /* Look the fs_info up before the super block is killed. */
        struct btrfs_fs_info *info = btrfs_sb(sb);

        kill_anon_super(sb);
        btrfs_free_fs_info(info);
}

/*
 * ->free: release what the fs_context still owns.  The fs_info is freed if it
 * is still attached to @fc; the btrfs context is freed once its last
 * reference is dropped.
 */
static void btrfs_free_fs_context(struct fs_context *fc)
{
        struct btrfs_fs_info *fs_info = fc->s_fs_info;
        struct btrfs_fs_context *ctx = fc->fs_private;

        if (fs_info)
                btrfs_free_fs_info(fs_info);

        /* Nothing more to do unless we hold the final reference. */
        if (!ctx || !refcount_dec_and_test(&ctx->refs))
                return;

        kfree(ctx->subvol_name);
        kfree(ctx);
}

/*
 * ->dup: share the btrfs context of @src_fc with @fc and move ->source over.
 * Always returns 0.
 */
static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc)
{
        struct btrfs_fs_context *shared = src_fc->fs_private;

        /*
         * The dup gets its own reference to the context; the original fc
         * keeps its reference so it still has the subvolume name or objectid.
         */
        refcount_inc(&shared->refs);
        fc->fs_private = shared;

        /*
         * The dup needs ->source for mounting and frees it when the dup is
         * freed, so hand it over and make sure only one fc points to it.
         */
        fc->source = src_fc->source;
        src_fc->source = NULL;

        return 0;
}

/* fs_context callbacks installed by btrfs_init_fs_context(). */
static const struct fs_context_operations btrfs_fs_context_ops = {
        .dup                = btrfs_dup_fs_context,
        .free               = btrfs_free_fs_context,
        .parse_param        = btrfs_parse_param,
        .get_tree           = btrfs_get_tree,
        .reconfigure        = btrfs_reconfigure,
};

/*
 * ->init_fs_context: allocate the btrfs context for @fc and seed it.
 *
 * For a reconfigure request the context starts from the settings of the
 * mounted filesystem, otherwise from the defaults.  Returns 0 on success or
 * -ENOMEM.
 */
static int btrfs_init_fs_context(struct fs_context *fc)
{
        struct btrfs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return -ENOMEM;

        refcount_set(&ctx->refs, 1);
        fc->fs_private = ctx;
        fc->ops = &btrfs_fs_context_ops;

        if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) {
                /* Fresh mount: start from the defaults. */
                ctx->thread_pool_size =
                        min_t(unsigned long, num_online_cpus() + 2, 8);
                ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE;
                ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
        } else {
                /* Remount: start from what is currently in effect. */
                btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx);
        }

#ifdef CONFIG_BTRFS_FS_POSIX_ACL
        fc->sb_flags |= SB_POSIXACL;
#endif
        fc->sb_flags |= SB_I_VERSION;

        return 0;
}

/* File system type description for "btrfs". */
static struct file_system_type btrfs_fs_type = {
        .name                = "btrfs",
        .owner               = THIS_MODULE,
        .fs_flags            = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA |
                               FS_ALLOW_IDMAP,
        .parameters          = btrfs_fs_parameters,
        .init_fs_context     = btrfs_init_fs_context,
        .kill_sb             = btrfs_kill_super,
};

MODULE_ALIAS_FS("btrfs");

/* open() handler for the control file: reset private_data; always returns 0. */
static int btrfs_control_open(struct inode *inode, struct file *file)
{
        /*
         * The control file's private_data is used to hold the
         * transaction when it is started and is used to keep
         * track of whether a transaction is already in progress.
         */
        file->private_data = NULL;
        return 0;
}

/*
 * Used by /dev/btrfs-control for devices ioctls.
 *
 * Requires CAP_SYS_ADMIN.  The argument is copied in as a
 * struct btrfs_ioctl_vol_args and its path is validated before dispatch.
 *
 * Returns -EPERM without the capability, the copy-in or path validation
 * error, -ENOTTY for a command that is not handled here, otherwise the result
 * of the individual command.  For BTRFS_IOC_DEVICES_READY the result is 0
 * when num_devices equals total_devices and 1 otherwise.
 */
static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
{
        struct btrfs_ioctl_vol_args *vol;
        struct btrfs_device *device = NULL;
        dev_t devt = 0;
        int ret;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        vol = memdup_user((void __user *)arg, sizeof(*vol));
        if (IS_ERR(vol))
                return PTR_ERR(vol);
        ret = btrfs_check_ioctl_vol_args_path(vol);
        if (ret < 0)
                goto out;

        switch (cmd) {
        case BTRFS_IOC_SCAN_DEV:
                mutex_lock(&uuid_mutex);
                /*
                 * Scanning outside of mount can return NULL which would turn
                 * into 0 error code.
                 */
                device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
                ret = PTR_ERR_OR_ZERO(device);
                mutex_unlock(&uuid_mutex);
                break;
        case BTRFS_IOC_FORGET_DEV:
                /* An empty name leaves devt at 0. */
                if (vol->name[0] != 0) {
                        ret = lookup_bdev(vol->name, &devt);
                        if (ret)
                                break;
                }
                ret = btrfs_forget_devices(devt);
                break;
        case BTRFS_IOC_DEVICES_READY:
                mutex_lock(&uuid_mutex);
                /*
                 * Scanning outside of mount can return NULL which would turn
                 * into 0 error code.
                 */
                device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
                if (IS_ERR_OR_NULL(device)) {
                        mutex_unlock(&uuid_mutex);
                        ret = PTR_ERR(device);
                        break;
                }
                ret = !(device->fs_devices->num_devices ==
                        device->fs_devices->total_devices);
                mutex_unlock(&uuid_mutex);
                break;
        case BTRFS_IOC_GET_SUPPORTED_FEATURES:
                ret = btrfs_ioctl_get_supported_features((void __user*)arg);
                break;
        default:
                /*
                 * ret holds 0 from the path check at this point; an unknown
                 * command must not be reported as success.
                 */
                ret = -ENOTTY;
                break;
        }

out:
        kfree(vol);
        return ret;
}

/*
 * Freeze handler: mark the filesystem frozen and commit the running
 * transaction, if there is one.
 *
 * Returns 0 when no transaction exists, otherwise the result of the commit
 * or the error from attaching to the transaction.
 */
static int btrfs_freeze(struct super_block *sb)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_trans_handle *trans;

        set_bit(BTRFS_FS_FROZEN, &fs_info->flags);

        /*
         * We don't need a barrier here, we'll wait for any transaction that
         * could be in progress on other threads (and do delayed iputs that
         * we want to avoid on a frozen filesystem), or do the commit
         * ourselves.
         */
        trans = btrfs_attach_transaction_barrier(root);
        if (!IS_ERR(trans))
                return btrfs_commit_transaction(trans);

        /* -ENOENT means there is no transaction, so nothing to commit. */
        return (PTR_ERR(trans) == -ENOENT) ? 0 : PTR_ERR(trans);
}

/*
 * Re-read the primary super block of @dev and verify it still matches the
 * in-memory state: same csum type, valid checksum, passes
 * btrfs_validate_super() and has the last committed generation.
 *
 * Returns 0 for a missing device, -EUCLEAN on a mismatch, the read or
 * validation error, or otherwise the result of btrfs_validate_super().
 */
static int check_dev_super(struct btrfs_device *dev)
{
        struct btrfs_fs_info *fs_info = dev->fs_info;
        struct btrfs_super_block *sb;
        u64 last_trans;
        u16 csum_type;
        int ret;

        /* This should be called with fs still frozen. */
        ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));

        /* Missing dev, no need to check. */
        if (!dev->bdev)
                return 0;

        /* Only need to check the primary super block. */
        sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
        if (IS_ERR(sb))
                return PTR_ERR(sb);

        csum_type = btrfs_super_csum_type(sb);
        if (csum_type != btrfs_super_csum_type(fs_info->super_copy)) {
                btrfs_err(fs_info, "csum type changed, has %u expect %u",
                          csum_type, btrfs_super_csum_type(fs_info->super_copy));
                ret = -EUCLEAN;
        } else if (btrfs_check_super_csum(fs_info, sb)) {
                btrfs_err(fs_info, "csum for on-disk super block no longer matches");
                ret = -EUCLEAN;
        } else {
                /* btrfs_validate_super() includes fsid check against super->fsid. */
                ret = btrfs_validate_super(fs_info, sb, 0);
                if (ret >= 0) {
                        last_trans = btrfs_get_last_trans_committed(fs_info);
                        if (btrfs_super_generation(sb) != last_trans) {
                                btrfs_err(fs_info,
                                          "transid mismatch, has %llu expect %llu",
                                          btrfs_super_generation(sb), last_trans);
                                ret = -EUCLEAN;
                        }
                }
        }

        btrfs_release_disk_super(sb);
        return ret;
}

/*
 * ->unfreeze_fs handler.
 *
 * Before clearing BTRFS_FS_FROZEN, verify the super block of every device with
 * check_dev_super() so that an fs changed behind our back (like hibernation
 * then modified by another OS) is noticed.  The first failing device puts the
 * fs into error state via btrfs_handle_fs_error() and ends the scan.
 *
 * Since the fs is frozen nobody can modify it yet, so device_list_mutex is
 * not taken for the walk.
 *
 * Return: always 0, so that the VFS layer can unfreeze the fs even when the
 * checks failed; the fs is then either fine or flagged with an error.
 */
static int btrfs_unfreeze(struct super_block *sb)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_device *dev;

        list_for_each_entry(dev, &fs_info->fs_devices->devices, dev_list) {
                const int err = check_dev_super(dev);

                if (err >= 0)
                        continue;

                btrfs_handle_fs_error(fs_info, err,
                        "super block on devid %llu got modified unexpectedly",
                        dev->devid);
                break;
        }
        clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);

        return 0;
}

/*
 * ->show_devname handler: print the name of fs_devices->latest_dev into @m,
 * escaping space, tab, newline and backslash.
 *
 * Return: always 0.
 */
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
        const char *name;

        /*
         * latest_dev should always hold a valid pointer.  It may be stale for
         * a short moment while the device is being deleted, but it stays
         * valid until the end of the RCU grace period, hence the read-side
         * section around both the lookup and the use of the name.
         */
        rcu_read_lock();
        name = btrfs_dev_name(fs_info->fs_devices->latest_dev);
        seq_escape(m, name, " \t\n\\");
        rcu_read_unlock();

        return 0;
}

/*
 * ->nr_cached_objects handler: report the non-negative sum of the
 * evictable_extent_maps per-cpu counter and emit the matching trace event.
 * @sc is not used.
 */
static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_control *sc)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        s64 count;

        count = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
        trace_btrfs_extent_map_shrinker_count(fs_info, count);

        return count;
}

/*
 * ->free_cached_objects handler: ask btrfs_free_extent_maps() to scan up to
 * sc->nr_to_scan objects and return its result.
 */
static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        long nr_to_scan;

        /* Cap the request at LONG_MAX so it fits the signed local. */
        nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);

        return btrfs_free_extent_maps(fs_info, nr_to_scan);
}

/* Super block operations table of btrfs. */
static const struct super_operations btrfs_super_ops = {
        /* Inode callbacks. */
        .alloc_inode            = btrfs_alloc_inode,
        .free_inode             = btrfs_free_inode,
        .destroy_inode          = btrfs_destroy_inode,
        .drop_inode             = btrfs_drop_inode,
        .evict_inode            = btrfs_evict_inode,

        /* Whole-filesystem callbacks. */
        .put_super              = btrfs_put_super,
        .sync_fs                = btrfs_sync_fs,
        .statfs                 = btrfs_statfs,
        .freeze_fs              = btrfs_freeze,
        .unfreeze_fs            = btrfs_unfreeze,

        /* Mount information output. */
        .show_options           = btrfs_show_options,
        .show_devname           = btrfs_show_devname,

        /* Cached object accounting and reclaim. */
        .nr_cached_objects      = btrfs_nr_cached_objects,
        .free_cached_objects    = btrfs_free_cached_objects,
};

/* File operations backing the btrfs_misc control device. */
static const struct file_operations btrfs_ctl_fops = {
        .owner                  = THIS_MODULE,
        .open                   = btrfs_control_open,
        .llseek                 = noop_llseek,
        .unlocked_ioctl         = btrfs_control_ioctl,
        .compat_ioctl           = compat_ptr_ioctl,
};

/* The "btrfs-control" misc device, served by btrfs_ctl_fops. */
static struct miscdevice btrfs_misc = {
        .name   = "btrfs-control",
        .minor  = BTRFS_MINOR,
        .fops   = &btrfs_ctl_fops,
};

/* Module aliases for the control device: by misc minor and by device name. */
MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
MODULE_ALIAS("devname:btrfs-control");

/*
 * Init step: register the btrfs_misc device with misc_register() and hand its
 * return value back to the caller.
 */
static int __init btrfs_interface_init(void)
{
        return misc_register(&btrfs_misc);
}

/* Exit step paired with btrfs_interface_init(): deregister btrfs_misc. */
static __cold void btrfs_interface_exit(void)
{
        misc_deregister(&btrfs_misc);
}

/*
 * Init step: log one "Btrfs loaded" line followed by the compile-time feature
 * summary.
 *
 * The summary is assembled by the preprocessor from adjacent string literals:
 * debug, assert and ref-verify appear only when enabled, while zoned and
 * fsverity are always listed as yes or no.
 *
 * Return: always 0.
 */
static int __init btrfs_print_mod_info(void)
{
        static const char features[] = ""
#ifdef CONFIG_BTRFS_DEBUG
                        ", debug=on"
#endif
#ifdef CONFIG_BTRFS_ASSERT
                        ", assert=on"
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
                        ", ref-verify=on"
#endif
#ifdef CONFIG_BLK_DEV_ZONED
                        ", zoned=yes"
#else
                        ", zoned=no"
#endif
#ifdef CONFIG_FS_VERITY
                        ", fsverity=yes"
#else
                        ", fsverity=no"
#endif
                        ;

        pr_info("Btrfs loaded%s\n", features);

        return 0;
}

/*
 * Init step: register btrfs_fs_type with register_filesystem() and return its
 * result.  Wrapped in a helper so it matches the init_func signature.
 */
static int register_btrfs(void)
{
        return register_filesystem(&btrfs_fs_type);
}

/* Exit step paired with register_btrfs(): unregister btrfs_fs_type. */
static void unregister_btrfs(void)
{
        unregister_filesystem(&btrfs_fs_type);
}

/*
 * Helper structure for long init/exit functions.
 *
 * Each entry pairs one initialization step with the function that undoes it.
 */
struct init_sequence {
        /* Initialization step; init_btrfs_fs() treats a negative return as failure. */
        int (*init_func)(void);
        /* Can be NULL if the init_func doesn't need cleanup. */
        void (*exit_func)(void);
};

static const struct init_sequence mod_init_seq[] = {
        {
                .init_func = btrfs_props_init,
                .exit_func = NULL,
        }, {
                .init_func = btrfs_init_sysfs,
                .exit_func = btrfs_exit_sysfs,
        }, {
                .init_func = btrfs_init_compress,
                .exit_func = btrfs_exit_compress,
        }, {
                .init_func = btrfs_init_cachep,
                .exit_func = btrfs_destroy_cachep,
        }, {
                .init_func = btrfs_transaction_init,
                .exit_func = btrfs_transaction_exit,
        }, {
                .init_func = btrfs_ctree_init,
                .exit_func = btrfs_ctree_exit,
        }, {
                .init_func = btrfs_free_space_init,
                .exit_func = btrfs_free_space_exit,
        }, {
                .init_func = extent_state_init_cachep,
                .exit_func = extent_state_free_cachep,
        }, {
                .init_func = extent_buffer_init_cachep,
                .exit_func = extent_buffer_free_cachep,
        }, {
                .init_func = btrfs_bioset_init,
                .exit_func = btrfs_bioset_exit,
        }, {
                .init_func = extent_map_init,
                .exit_func = extent_map_exit,
        }, {
                .init_func = ordered_data_init,
                .exit_func = ordered_data_exit,
        }, {
                .init_func = btrfs_delayed_inode_init,
                .exit_func = btrfs_delayed_inode_exit,
        }, {
                .init_func = btrfs_auto_defrag_init,
                .exit_func = btrfs_auto_defrag_exit,
        }, {
                .init_func = btrfs_delayed_ref_init,
                .exit_func = btrfs_delayed_ref_exit,
        }, {
                .init_func = btrfs_prelim_ref_init,
                .exit_func = btrfs_prelim_ref_exit,
        }, {
                .init_func = btrfs_interface_init,
                .exit_func = btrfs_interface_exit,
        }, {
                .init_func = btrfs_print_mod_info,
                .exit_func = NULL,
        }, {
                .init_func = btrfs_run_sanity_tests,
                .exit_func = NULL,
        }, {
                .init_func = register_btrfs,
                .exit_func = unregister_btrfs,
        }
};

/* mod_init_result[i] is true while step i of mod_init_seq is initialized. */
static bool mod_init_result[ARRAY_SIZE(mod_init_seq)];

/*
 * Undo every initialized step of mod_init_seq, last to first.
 *
 * Steps that never ran (or were already undone) are skipped; steps without an
 * exit_func only get their flag cleared.  Used both for module exit and for
 * rolling back a partially completed init.
 */
static __always_inline void btrfs_exit_btrfs_fs(void)
{
        int i = ARRAY_SIZE(mod_init_seq);

        while (--i >= 0) {
                if (mod_init_result[i]) {
                        if (mod_init_seq[i].exit_func)
                                mod_init_seq[i].exit_func();
                        mod_init_result[i] = false;
                }
        }
}

/*
 * Module exit: unwind every initialized step of mod_init_seq through
 * btrfs_exit_btrfs_fs(), then call btrfs_cleanup_fs_uuids().
 */
static void __exit exit_btrfs_fs(void)
{
        btrfs_exit_btrfs_fs();
        btrfs_cleanup_fs_uuids();
}

/*
 * Module init: run the steps of mod_init_seq in table order.
 *
 * A step that returns a negative value aborts the sequence; everything that
 * was initialized so far is rolled back with btrfs_exit_btrfs_fs() and the
 * error is returned.
 *
 * Return: 0 on success, the negative value of the failing step otherwise.
 */
static int __init init_btrfs_fs(void)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) {
                int ret;

                /* No step may be flagged as done before it has run. */
                ASSERT(!mod_init_result[i]);

                ret = mod_init_seq[i].init_func();
                if (ret < 0) {
                        btrfs_exit_btrfs_fs();
                        return ret;
                }
                mod_init_result[i] = true;
        }

        return 0;
}

late_initcall(init_btrfs_fs);
module_exit(exit_btrfs_fs)

MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
MODULE_SOFTDEP("pre: xxhash64");
MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: blake2b-256");



































































































































































    1 







































































































































    1 

















    5 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * fs-verity: read-only file-based authenticity protection
 *
 * This header declares the interface between the fs/verity/ support layer and
 * filesystems that support fs-verity.
 *
 * Copyright 2019 Google LLC
 */

#ifndef _LINUX_FSVERITY_H
#define _LINUX_FSVERITY_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <crypto/hash_info.h>
#include <crypto/sha2.h>
#include <uapi/linux/fsverity.h>

/*
 * Largest digest size among all hash algorithms supported by fs-verity.
 * Currently assumed to be <= size of fsverity_descriptor::root_hash.
 */
#define FS_VERITY_MAX_DIGEST_SIZE        SHA512_DIGEST_SIZE

/* Arbitrary limit to bound the kmalloc() size.  Can be changed. */
#define FS_VERITY_MAX_DESCRIPTOR_SIZE        16384

/*
 * Verity operations for filesystems.
 *
 * Callbacks that a filesystem supporting fs-verity implements; they form the
 * filesystem side of the interface to the fs/verity/ support layer.
 */
struct fsverity_operations {

        /**
         * Begin enabling verity on the given file.
         *
         * @filp: a readonly file descriptor for the file
         *
         * The filesystem must do any needed filesystem-specific preparations
         * for enabling verity, e.g. evicting inline data.  It also must return
         * -EBUSY if verity is already being enabled on the given file.
         *
         * i_rwsem is held for write.
         *
         * Return: 0 on success, -errno on failure
         */
        int (*begin_enable_verity)(struct file *filp);

        /**
         * End enabling verity on the given file.
         *
         * @filp: a readonly file descriptor for the file
         * @desc: the verity descriptor to write, or NULL on failure
         * @desc_size: size of verity descriptor, or 0 on failure
         * @merkle_tree_size: total bytes the Merkle tree took up
         *
         * If desc == NULL, then enabling verity failed and the filesystem only
         * must do any necessary cleanups.  Else, it must also store the given
         * verity descriptor to a fs-specific location associated with the inode
         * and do any fs-specific actions needed to mark the inode as a verity
         * inode, e.g. setting a bit in the on-disk inode.  The filesystem is
         * also responsible for setting the S_VERITY flag in the VFS inode.
         *
         * i_rwsem is held for write, but it may have been dropped between
         * ->begin_enable_verity() and ->end_enable_verity().
         *
         * Return: 0 on success, -errno on failure
         */
        int (*end_enable_verity)(struct file *filp, const void *desc,
                                 size_t desc_size, u64 merkle_tree_size);

        /**
         * Get the verity descriptor of the given inode.
         *
         * @inode: an inode with the S_VERITY flag set
         * @buf: buffer in which to place the verity descriptor
         * @bufsize: size of @buf, or 0 to retrieve the size only
         *
         * If bufsize == 0, then the size of the verity descriptor is returned.
         * Otherwise the verity descriptor is written to 'buf' and its actual
         * size is returned; -ERANGE is returned if it's too large.  This may be
         * called by multiple processes concurrently on the same inode.
         *
         * Return: the size on success, -errno on failure
         */
        int (*get_verity_descriptor)(struct inode *inode, void *buf,
                                     size_t bufsize);

        /**
         * Read a Merkle tree page of the given inode.
         *
         * @inode: the inode
         * @index: 0-based index of the page within the Merkle tree
         * @num_ra_pages: The number of Merkle tree pages that should be
         *                prefetched starting at @index if the page at @index
         *                isn't already cached.  Implementations may ignore this
         *                argument; it's only a performance optimization.
         *
         * This can be called at any time on an open verity file.  It may be
         * called by multiple processes concurrently, even with the same page.
         *
         * Note that this must retrieve a *page*, not necessarily a *block*.
         *
         * Return: the page on success, ERR_PTR() on failure
         */
        struct page *(*read_merkle_tree_page)(struct inode *inode,
                                              pgoff_t index,
                                              unsigned long num_ra_pages);

        /**
         * Write a Merkle tree block to the given inode.
         *
         * @inode: the inode for which the Merkle tree is being built
         * @buf: the Merkle tree block to write
         * @pos: the position of the block in the Merkle tree (in bytes)
         * @size: the Merkle tree block size (in bytes)
         *
         * This is only called between ->begin_enable_verity() and
         * ->end_enable_verity().
         *
         * Return: 0 on success, -errno on failure
         */
        int (*write_merkle_tree_block)(struct inode *inode, const void *buf,
                                       u64 pos, unsigned int size);
};

#ifdef CONFIG_FS_VERITY

/*
 * Load the inode's ->i_verity_info pointer with acquire semantics.
 * The result is whatever the field currently holds.
 */
static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
{
        struct fsverity_info *vi;

        /*
         * Another task may publish ->i_verity_info concurrently through the
         * cmpxchg_release() in fsverity_set_info(), i.e. with a RELEASE
         * barrier.  The smp_load_acquire() here is the matching ACQUIRE that
         * makes the memory published by that task safe to read.
         */
        vi = smp_load_acquire(&inode->i_verity_info);

        return vi;
}

/* enable.c */

int fsverity_ioctl_enable(struct file *filp, const void __user *arg);

/* measure.c */

int fsverity_ioctl_measure(struct file *filp, void __user *arg);
/*
 * NOTE(review): judging by the !CONFIG_FS_VERITY stub in this header, a return
 * value of 0 stands for "no fs-verity on this file" (digest size 0) — confirm
 * against the implementation in measure.c.
 */
int fsverity_get_digest(struct inode *inode,
                        u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE],
                        u8 *alg, enum hash_algo *halg);

/* open.c */

/* Called by the fsverity_file_open() wrapper in this header when IS_VERITY(inode). */
int __fsverity_file_open(struct inode *inode, struct file *filp);
/* Called by the fsverity_prepare_setattr() wrapper in this header for IS_VERITY() inodes. */
int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr);
/* Called by the fsverity_cleanup_inode() wrapper in this header when ->i_verity_info is set. */
void __fsverity_cleanup_inode(struct inode *inode);

/**
 * fsverity_cleanup_inode() - release an inode's verity info, if it has any
 * @inode: the inode that is being evicted
 *
 * Does nothing when ->i_verity_info is unset; otherwise the inode is handed to
 * __fsverity_cleanup_inode().  Filesystems must call this on inode eviction so
 * that ->i_verity_info gets freed.
 */
static inline void fsverity_cleanup_inode(struct inode *inode)
{
        if (!inode->i_verity_info)
                return;

        __fsverity_cleanup_inode(inode);
}

/* read_metadata.c */

int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg);

/* verify.c */

/*
 * Also the common backend of the fsverity_verify_folio() and
 * fsverity_verify_page() wrappers defined later in this header.
 */
bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset);
void fsverity_verify_bio(struct bio *bio);
void fsverity_enqueue_verify_work(struct work_struct *work);

#else /* !CONFIG_FS_VERITY */

/*
 * Stubs used when fs-verity is compiled out.  They mirror the declarations of
 * the CONFIG_FS_VERITY branch so callers build unchanged.
 */

/* Without fs-verity support no inode ever has verity info. */
static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
{
        return NULL;
}

/* enable.c */

/* The enable ioctl is not supported without fs-verity. */
static inline int fsverity_ioctl_enable(struct file *filp,
                                        const void __user *arg)
{
        return -EOPNOTSUPP;
}

/* measure.c */

/* The measure ioctl is not supported without fs-verity. */
static inline int fsverity_ioctl_measure(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* Leaves @raw_digest, @alg and @halg untouched and reports digest size 0. */
static inline int fsverity_get_digest(struct inode *inode,
                                      u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE],
                                      u8 *alg, enum hash_algo *halg)
{
        /*
         * fsverity is not enabled in the kernel configuration, so always report
         * that the file doesn't have fsverity enabled (digest size 0).
         */
        return 0;
}

/* open.c */

/* Reached through fsverity_file_open() only when IS_VERITY(inode) is true. */
static inline int __fsverity_file_open(struct inode *inode, struct file *filp)
{
        return -EOPNOTSUPP;
}

/* Reached through fsverity_prepare_setattr() only for IS_VERITY() inodes. */
static inline int __fsverity_prepare_setattr(struct dentry *dentry,
                                             struct iattr *attr)
{
        return -EOPNOTSUPP;
}

/* Nothing to free: the stub fsverity_get_info() never reports verity info. */
static inline void fsverity_cleanup_inode(struct inode *inode)
{
}

/* read_metadata.c */

/* The read-metadata ioctl is not supported without fs-verity. */
static inline int fsverity_ioctl_read_metadata(struct file *filp,
                                               const void __user *uarg)
{
        return -EOPNOTSUPP;
}

/* verify.c */

/*
 * The verification entry points below warn once when called and, where there
 * is a result, report failure.
 */

static inline bool fsverity_verify_blocks(struct folio *folio, size_t len,
                                          size_t offset)
{
        WARN_ON_ONCE(1);
        return false;
}

static inline void fsverity_verify_bio(struct bio *bio)
{
        WARN_ON_ONCE(1);
}

static inline void fsverity_enqueue_verify_work(struct work_struct *work)
{
        WARN_ON_ONCE(1);
}

#endif        /* !CONFIG_FS_VERITY */

static inline bool fsverity_verify_folio(struct folio *folio)
{
        return fsverity_verify_blocks(folio, folio_size(folio), 0);
}

/*
 * Verify a single page: passes PAGE_SIZE bytes at offset 0 of the page's folio
 * to fsverity_verify_blocks() and returns its result.
 */
static inline bool fsverity_verify_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        return fsverity_verify_blocks(folio, PAGE_SIZE, 0);
}

/**
 * fsverity_active() - do reads from the inode need to go through fs-verity?
 * @inode: inode to check
 *
 * This checks whether ->i_verity_info has been set.
 *
 * Filesystems call this from ->readahead() to check whether the pages need to
 * be verified or not.  Don't use IS_VERITY() for this purpose; it's subject to
 * a race condition where the file is being read concurrently with
 * FS_IOC_ENABLE_VERITY completing.  (S_VERITY is set before ->i_verity_info.)
 *
 * Return: true if reads need to go through fs-verity, otherwise false
 */
static inline bool fsverity_active(const struct inode *inode)
{
        return fsverity_get_info(inode) != NULL;
}

/**
 * fsverity_file_open() - open-time hook for verity files
 * @inode: the inode being opened
 * @filp: the struct file being set up
 *
 * Non-verity inodes pass straight through.  For a verity file the open is
 * denied if it is for writing; otherwise the inode's ->i_verity_info is set up
 * if that has not been done already.
 *
 * When combined with fscrypt, this must be called after fscrypt_file_open().
 * Otherwise, the key needed to decrypt the verity metadata is not set up yet.
 *
 * Return: 0 on success, -errno on failure
 */
static inline int fsverity_file_open(struct inode *inode, struct file *filp)
{
        if (!IS_VERITY(inode))
                return 0;

        return __fsverity_file_open(inode, filp);
}

/**
 * fsverity_prepare_setattr() - setattr-time hook for verity inodes
 * @dentry: dentry through which the inode is being changed
 * @attr: attributes to change
 *
 * Non-verity inodes pass straight through.  Verity files are immutable, so
 * truncates are denied; the open-time check cannot cover this because
 * sys_truncate() takes a path, not a file descriptor.
 *
 * Return: 0 on success, -errno on failure
 */
static inline int fsverity_prepare_setattr(struct dentry *dentry,
                                           struct iattr *attr)
{
        if (!IS_VERITY(d_inode(dentry)))
                return 0;

        return __fsverity_prepare_setattr(dentry, attr);
}

#endif        /* _LINUX_FSVERITY_H */































































































   14 

   15 



































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
// SPDX-License-Identifier: GPL-2.0
/*
 * SafeSetID Linux Security Module
 *
 * Author: Micah Morton <mortonm@chromium.org>
 *
 * Copyright (C) 2018 The Chromium OS Authors.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2, as
 * published by the Free Software Foundation.
 *
 */

#define pr_fmt(fmt) "SafeSetID: " fmt

#include <linux/lsm_hooks.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <uapi/linux/lsm.h>
#include "lsm.h"

/* Flag indicating whether initialization completed */
int safesetid_initialized __initdata;

struct setid_ruleset __rcu *safesetid_setuid_rules;
struct setid_ruleset __rcu *safesetid_setgid_rules;


/*
 * Compute a decision for a transition from @src to @dst under @policy.
 *
 * Only rules hashed under @src whose source ID equals @src are considered:
 *  - a rule whose destination equals @dst yields SIDPOL_ALLOWED at once;
 *  - otherwise, the existence of any rule for @src yields SIDPOL_CONSTRAINED;
 *  - with no rule for @src at all the result is SIDPOL_DEFAULT.
 *
 * policy->type selects whether the .uid or the .gid members are compared.
 */
enum sid_policy_type _setid_policy_lookup(struct setid_ruleset *policy,
                kid_t src, kid_t dst)
{
        enum sid_policy_type result = SIDPOL_DEFAULT;
        struct setid_rule *rule;

        switch (policy->type) {
        case UID:
                hash_for_each_possible(policy->rules, rule, next, __kuid_val(src.uid)) {
                        if (uid_eq(rule->src_id.uid, src.uid)) {
                                if (uid_eq(rule->dst_id.uid, dst.uid))
                                        return SIDPOL_ALLOWED;
                                result = SIDPOL_CONSTRAINED;
                        }
                }
                break;
        case GID:
                hash_for_each_possible(policy->rules, rule, next, __kgid_val(src.gid)) {
                        if (gid_eq(rule->src_id.gid, src.gid)) {
                                if (gid_eq(rule->dst_id.gid, dst.gid))
                                        return SIDPOL_ALLOWED;
                                result = SIDPOL_CONSTRAINED;
                        }
                }
                break;
        default:
                /* Should not be reached; report the ID as constrained. */
                result = SIDPOL_CONSTRAINED;
                break;
        }

        return result;
}

/*
 * Compute a decision for a transition from @src to @dst under the active
 * policy of kind @new_type.
 *
 * Return: SIDPOL_CONSTRAINED for an unknown @new_type, SIDPOL_DEFAULT when no
 * ruleset of that kind is installed, otherwise the verdict of
 * _setid_policy_lookup().
 */
static enum sid_policy_type setid_policy_lookup(kid_t src, kid_t dst, enum setid_type new_type)
{
        struct setid_ruleset *pol = NULL;
        enum sid_policy_type result = SIDPOL_DEFAULT;

        /* The ruleset pointers are only stable inside the read-side section. */
        rcu_read_lock();

        switch (new_type) {
        case UID:
                pol = rcu_dereference(safesetid_setuid_rules);
                break;
        case GID:
                pol = rcu_dereference(safesetid_setgid_rules);
                break;
        default:
                /* Should not be reached. */
                result = SIDPOL_CONSTRAINED;
                break;
        }

        if (pol) {
                /*
                 * NOTE(review): this stores to the shared ruleset from a
                 * reader; presumably the type already has this value —
                 * confirm where the ruleset is created.
                 */
                pol->type = new_type;
                result = _setid_policy_lookup(pol, src, dst);
        }

        rcu_read_unlock();

        return result;
}

/*
 * "capable" hook: restrict the use of CAP_SETUID and CAP_SETGID for anything
 * other than set*id()/setgroups() calls by tasks that a policy applies to.
 *
 * Return: 0 to raise no objection, -EPERM to deny the capability.
 */
static int safesetid_security_capable(const struct cred *cred,
                                      struct user_namespace *ns,
                                      int cap,
                                      unsigned int opts)
{
        /*
         * When CAP_SET{U/G}ID is being used for a setid or setgroups syscall,
         * let it through here; the real security check happens later, in the
         * task_fix_set{u/g}id or task_fix_setgroups hooks.
         */
        if (opts & CAP_OPT_INSETID)
                return 0;

        if (cap == CAP_SETUID) {
                /*
                 * If no policy applies to this task, allow the use of
                 * CAP_SETUID for other purposes.
                 */
                if (setid_policy_lookup((kid_t){.uid = cred->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT)
                        return 0;

                /*
                 * Reject use of CAP_SETUID for functionality other than
                 * calling set*uid() (e.g. setting up userns uid mappings).
                 */
                pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n",
                        __kuid_val(cred->uid));
                return -EPERM;
        }

        if (cap == CAP_SETGID) {
                /*
                 * If no policy applies to this task, allow the use of
                 * CAP_SETGID for other purposes.
                 */
                if (setid_policy_lookup((kid_t){.gid = cred->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT)
                        return 0;

                /*
                 * Reject use of CAP_SETGID for functionality other than
                 * calling set*gid() (e.g. setting up userns gid mappings).
                 */
                pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n",
                        __kgid_val(cred->gid));
                return -EPERM;
        }

        /* We're only interested in CAP_SETUID and CAP_SETGID. */
        return 0;
}

/*
 * Check whether a caller with old credentials @old is allowed to switch to
 * credentials that contain @new_id.
 *
 * @old:      credentials the task holds before the change
 * @new_id:   candidate ID; .uid is read when @new_type is UID, .gid when GID
 * @new_type: UID or GID; any other value is rejected
 *
 * Return: true if @new_id already is one of @old's real, effective or saved
 * IDs of the same kind, or if the active policy does not constrain the
 * transition from the old real ID to @new_id; false otherwise.  A blocked
 * transition is logged.
 */
static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum setid_type new_type)
{
        kid_t old_rid;
        bool permitted;

        /* If our old creds already had this ID in it, it's fine. */
        if (new_type == UID) {
                if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) ||
                        uid_eq(new_id.uid, old->suid))
                        return true;
                old_rid = (kid_t){.uid = old->uid};
        } else if (new_type == GID) {
                if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) ||
                        gid_eq(new_id.gid, old->sgid))
                        return true;
                old_rid = (kid_t){.gid = old->gid};
        } else {
                /* Error, new_type is an invalid type */
                return false;
        }

        /*
         * Transitions to new IDs require a check against the policy of the old
         * real ID of the same kind: the old RUID for UID transitions and the
         * old RGID for GID transitions.  The source must not be taken from
         * old->uid for GID lookups, otherwise GID rules would be matched
         * against the numeric value of the caller's UID.
         */
        permitted =
            setid_policy_lookup(old_rid, new_id, new_type) != SIDPOL_CONSTRAINED;

        if (!permitted) {
                if (new_type == UID) {
                        pr_warn("UID transition ((%d,%d,%d) -> %d) blocked\n",
                                __kuid_val(old->uid), __kuid_val(old->euid),
                                __kuid_val(old->suid), __kuid_val(new_id.uid));
                } else {
                        /* new_type can only be GID here. */
                        pr_warn("GID transition ((%d,%d,%d) -> %d) blocked\n",
                                __kgid_val(old->gid), __kgid_val(old->egid),
                                __kgid_val(old->sgid), __kgid_val(new_id.gid));
                }
        }
        return permitted;
}

/*
 * "task_fix_setuid" hook.
 *
 * If a setuid policy exists for the old RUID, every UID in @new (real,
 * effective, saved and filesystem) must pass id_permitted_for_cred().
 * @flags is not used.
 *
 * Return: 0 if there is no policy for the old RUID or all new UIDs are
 * permitted; otherwise the task is sent SIGKILL and -EACCES is returned.
 */
static int safesetid_task_fix_setuid(struct cred *new,
                                     const struct cred *old,
                                     int flags)
{
        /* Do nothing if there are no setuid restrictions for our old RUID. */
        if (setid_policy_lookup((kid_t){.uid = old->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT)
                return 0;

        if (!id_permitted_for_cred(old, (kid_t){.uid = new->uid}, UID) ||
            !id_permitted_for_cred(old, (kid_t){.uid = new->euid}, UID) ||
            !id_permitted_for_cred(old, (kid_t){.uid = new->suid}, UID) ||
            !id_permitted_for_cred(old, (kid_t){.uid = new->fsuid}, UID)) {
                /*
                 * Kill this process to avoid potential security
                 * vulnerabilities that could arise from a missing allowlist
                 * entry preventing a privileged process from dropping to a
                 * lesser-privileged one.
                 */
                force_sig(SIGKILL);
                return -EACCES;
        }

        return 0;
}

/*
 * "task_fix_setgid" hook.
 *
 * If a setgid policy exists for the old RGID, every GID in @new (real,
 * effective, saved and filesystem) must pass id_permitted_for_cred().
 * @flags is not used.
 *
 * Return: 0 if there is no policy for the old RGID or all new GIDs are
 * permitted; otherwise the task is sent SIGKILL and -EACCES is returned.
 */
static int safesetid_task_fix_setgid(struct cred *new,
                                     const struct cred *old,
                                     int flags)
{
        /* Do nothing if there are no setgid restrictions for our old RGID. */
        if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT)
                return 0;

        if (!id_permitted_for_cred(old, (kid_t){.gid = new->gid}, GID) ||
            !id_permitted_for_cred(old, (kid_t){.gid = new->egid}, GID) ||
            !id_permitted_for_cred(old, (kid_t){.gid = new->sgid}, GID) ||
            !id_permitted_for_cred(old, (kid_t){.gid = new->fsgid}, GID)) {
                /*
                 * Kill this process to avoid potential security
                 * vulnerabilities that could arise from a missing allowlist
                 * entry preventing a privileged process from dropping to a
                 * lesser-privileged one.
                 */
                force_sig(SIGKILL);
                return -EACCES;
        }

        return 0;
}

static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old)
{
        int i;

        /* Do nothing if there are no setgid restrictions for our old RGID. */
        if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT)
                return 0;

        get_group_info(new->group_info);
        for (i = 0; i < new->group_info->ngroups; i++) {
                if (!id_permitted_for_cred(old, (kid_t){.gid = new->group_info->gid[i]}, GID)) {
                        put_group_info(new->group_info);
                        /*
                         * Kill this process to avoid potential security vulnerabilities
                         * that could arise from a missing allowlist entry preventing a
                         * privileged process from dropping to a lesser-privileged one.
                         */
                        force_sig(SIGKILL);
                        return -EACCES;
                }
        }

        put_group_info(new->group_info);
        return 0;
}

/*
 * Identity of this LSM (name string and LSM_ID_SAFESETID), passed to
 * security_add_hooks() when the hooks are registered.
 */
static const struct lsm_id safesetid_lsmid = {
        .name = "safesetid",
        .id = LSM_ID_SAFESETID,
};

/*
 * Hook table registered by safesetid_security_init(): each entry pairs an
 * LSM hook name with the SafeSetID handler for it.
 */
static struct security_hook_list safesetid_security_hooks[] = {
        LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid),
        LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid),
        LSM_HOOK_INIT(task_fix_setgroups, safesetid_task_fix_setgroups),
        LSM_HOOK_INIT(capable, safesetid_security_capable)
};

/*
 * LSM init callback: register the SafeSetID hook table and record that
 * initialization completed.  Always returns 0.
 */
static int __init safesetid_security_init(void)
{
        /* Hand the whole hook table to the LSM framework in one call. */
        security_add_hooks(safesetid_security_hooks,
                           ARRAY_SIZE(safesetid_security_hooks), &safesetid_lsmid);

        /* Record that SafeSetID finished initializing. */
        safesetid_initialized = 1;
        return 0;
}

/* LSM descriptor: names the module and points at its init callback. */
DEFINE_LSM(safesetid_security_init) = {
        .init = safesetid_security_init,
        .name = "safesetid",
};







































































































































































































































































































































































































































































































































































































































































































    2 













    2 














































    2 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <linux/rmap.h>
#include <asm/pgalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

/*
 * We choose 4096 entries - same as per-zone page wait tables.
 * DAX_WAIT_TABLE_BITS is the hash width used to pick a wait queue;
 * DAX_WAIT_TABLE_ENTRIES is the resulting number of queues.
 */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/*
 * The 'colour' (ie low bits) within a PMD of a page offset.
 * PG_PMD_NR is the number of PAGE_SIZE pages covered by one PMD;
 * PG_PMD_COLOUR is the mask of page-offset bits below that boundary.
 */
#define PG_PMD_COLOUR        ((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR        (PMD_SIZE >> PAGE_SHIFT)

/* Hashed wait queues on which tasks sleep while a DAX entry is locked. */
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

/*
 * Initialise every wait queue head in wait_table.  Registered as an
 * fs_initcall; always returns 0.
 */
static int __init init_dax_wait_table(void)
{
        int i = 0;

        while (i < DAX_WAIT_TABLE_ENTRIES) {
                init_waitqueue_head(wait_table + i);
                i++;
        }

        return 0;
}
fs_initcall(init_dax_wait_table);

/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages.  We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking.  In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 *
 * The pfn is stored above the flag bits: dax_make_entry() shifts it left by
 * DAX_SHIFT and dax_to_pfn() shifts it back down.
 */
#define DAX_SHIFT        (4)        /* number of low bits reserved for flags */
#define DAX_LOCKED        (1UL << 0)        /* entry is locked */
#define DAX_PMD                (1UL << 1)        /* entry covers a PMD, not a PTE */
#define DAX_ZERO_PAGE        (1UL << 2)        /* entry is a zero page */
#define DAX_EMPTY        (1UL << 3)        /* empty entry, used only for locking */

/* Extract the pfn from a DAX entry by discarding the low flag bits. */
static unsigned long dax_to_pfn(void *entry)
{
        unsigned long value = xa_to_value(entry);

        return value >> DAX_SHIFT;
}

/* Build a DAX value entry: @pfn in the high bits, @flags in the low bits. */
static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
        unsigned long pfn_bits = pfn_t_to_pfn(pfn) << DAX_SHIFT;

        return xa_mk_value(flags | pfn_bits);
}

/* True when the DAX_LOCKED bit is set in @entry. */
static bool dax_is_locked(void *entry)
{
        return (xa_to_value(entry) & DAX_LOCKED) != 0;
}

/* Order of @entry: PMD_ORDER when the DAX_PMD bit is set, otherwise 0. */
static unsigned int dax_entry_order(void *entry)
{
        return (xa_to_value(entry) & DAX_PMD) ? PMD_ORDER : 0;
}

/* Non-zero (the DAX_PMD bit itself) when @entry is a PMD-sized entry. */
static unsigned long dax_is_pmd_entry(void *entry)
{
        unsigned long value = xa_to_value(entry);

        return value & DAX_PMD;
}

/* True when @entry does not have the DAX_PMD bit set. */
static bool dax_is_pte_entry(void *entry)
{
        return (xa_to_value(entry) & DAX_PMD) == 0;
}

/* Non-zero (the DAX_ZERO_PAGE bit itself) when @entry is a zero-page entry. */
static int dax_is_zero_entry(void *entry)
{
        unsigned long value = xa_to_value(entry);

        return value & DAX_ZERO_PAGE;
}

/* Non-zero (the DAX_EMPTY bit itself) when @entry is an empty entry. */
static int dax_is_empty_entry(void *entry)
{
        unsigned long value = xa_to_value(entry);

        return value & DAX_EMPTY;
}

/*
 * A lookup reports a conflict (by returning XA_RETRY_ENTRY) when the entry
 * it found is of a smaller order than the one that was asked for.  This
 * helper tests for that marker.
 */
static bool dax_is_conflict(void *entry)
{
        bool conflict = (entry == XA_RETRY_ENTRY);

        return conflict;
}

/*
 * DAX page cache entry locking
 */
/*
 * Identifies the entry a waiter sleeps on, or a waker wakes: the xarray it
 * lives in plus its starting index.  Filled in by dax_entry_waitqueue() and
 * compared field by field in wake_exceptional_entry_func().
 */
struct exceptional_entry_key {
        struct xarray *xa;        /* xarray holding the entry */
        pgoff_t entry_start;        /* index of the entry (PMD-aligned for PMD entries) */
};

/*
 * Wait queue entry plus the key of the DAX entry being waited for.
 * wake_exceptional_entry_func() recovers this structure from @wait with
 * container_of() to decide whether a wake-up is meant for this waiter.
 */
struct wait_exceptional_entry_queue {
        wait_queue_entry_t wait;        /* linkage on one of the wait_table queues */
        struct exceptional_entry_key key;        /* entry this waiter is sleeping on */
};

/**
 * enum dax_wake_mode - waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 *
 * dax_wake_entry() maps WAKE_ALL to an exclusive-waiter count of 0 and
 * WAKE_NEXT to a count of 1 when it calls __wake_up().
 */
enum dax_wake_mode {
        WAKE_ALL,
        WAKE_NEXT,
};

/*
 * Pick the wait queue for the entry at xas->xa_index and fill in @key so
 * that wakers and waiters can be matched.  Returns a pointer into
 * wait_table.
 */
static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
                void *entry, struct exceptional_entry_key *key)
{
        unsigned long start = xas->xa_index;
        unsigned long bucket;

        /*
         * For a PMD entry, round the index down to the start of the PMD so
         * that every offset inside the PMD's range lands on the same queue
         * with the same key.
         */
        if (dax_is_pmd_entry(entry))
                start &= ~PG_PMD_COLOUR;

        key->xa = xas->xa;
        key->entry_start = start;

        bucket = hash_long((unsigned long)xas->xa ^ start, DAX_WAIT_TABLE_BITS);
        return &wait_table[bucket];
}

/*
 * Wake callback installed on DAX waiters.  Several entries can hash to the
 * same queue, so only a waiter whose key matches the one passed in @keyp is
 * woken (via autoremove_wake_function()); any other waiter yields 0.
 */
static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
                unsigned int mode, int sync, void *keyp)
{
        struct wait_exceptional_entry_queue *waiter =
                container_of(wait, struct wait_exceptional_entry_queue, wait);
        struct exceptional_entry_key *woken = keyp;

        if (woken->xa == waiter->key.xa &&
            woken->entry_start == waiter->key.entry_start)
                return autoremove_wake_function(wait, mode, sync, NULL);

        return 0;
}

/*
 * Wake waiters sleeping on the entry at xas->xa_index.
 *
 * @entry need not still be what is stored at that index; it is only used
 * to tell whether the slot held a PMD entry, which decides the wait queue
 * and key.  @mode selects between waking every waiter and waking one.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry,
                           enum dax_wake_mode mode)
{
        struct exceptional_entry_key key;
        wait_queue_head_t *wq = dax_entry_waitqueue(xas, entry, &key);
        int nr_exclusive = (mode == WAKE_ALL) ? 0 : 1;

        /*
         * Waiters test for a locked entry and call
         * prepare_to_wait_exclusive() under the i_pages lock, and our
         * callers handle the entry under that lock too.  Every task that
         * could have seen the entry locked is therefore already queued, so
         * an inactive queue means there is nobody to wake.
         */
        if (!waitqueue_active(wq))
                return;

        __wake_up(wq, TASK_NORMAL, nr_exclusive, &key);
}

/*
 * Look up entry in page cache, wait for it to become unlocked if it
 * is a DAX entry and return it.  The caller must subsequently call
 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 * if it did.  The entry returned may have a larger order than @order.
 * If @order is larger than the order of the entry found in i_pages, this
 * function returns a dax_is_conflict entry.
 *
 * Must be called with the i_pages lock held.  The lock is dropped while
 * sleeping and re-taken before the lookup is retried, so it is held again
 * on return.
 */
static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
{
        void *entry;
        struct wait_exceptional_entry_queue ewait;
        wait_queue_head_t *wq;

        /* Use the key-matching wake callback instead of the default one. */
        init_wait(&ewait.wait);
        ewait.wait.func = wake_exceptional_entry_func;

        for (;;) {
                entry = xas_find_conflict(xas);
                /* Nothing there, or (with a one-time warning) not a value entry. */
                if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
                        return entry;
                /* Found entry is smaller than requested: report a conflict. */
                if (dax_entry_order(entry) < order)
                        return XA_RETRY_ENTRY;
                if (!dax_is_locked(entry))
                        return entry;

                /*
                 * Entry is locked: queue ourselves as an exclusive waiter
                 * before dropping the i_pages lock, then sleep until woken
                 * and retry the lookup from scratch.
                 */
                wq = dax_entry_waitqueue(xas, entry, &ewait.key);
                prepare_to_wait_exclusive(wq, &ewait.wait,
                                          TASK_UNINTERRUPTIBLE);
                xas_unlock_irq(xas);
                xas_reset(xas);
                schedule();
                finish_wait(wq, &ewait.wait);
                xas_lock_irq(xas);
        }
}

/*
 * Sleep once until a wake-up arrives for @entry, dropping the i_pages lock
 * first.  Called with the i_pages lock held; returns with it released and
 * does not re-check the entry, so callers must redo their lookup.
 *
 * The only thing keeping the address space around is the i_pages lock
 * (it's cycled in clear_inode() after removing the entries from i_pages)
 * After we call xas_unlock_irq(), we cannot touch xas->xa.
 */
static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{
        struct wait_exceptional_entry_queue ewait;
        wait_queue_head_t *wq;

        /* Use the key-matching wake callback instead of the default one. */
        init_wait(&ewait.wait);
        ewait.wait.func = wake_exceptional_entry_func;

        /* Resolve the wait queue while xas->xa may still be dereferenced. */
        wq = dax_entry_waitqueue(xas, entry, &ewait.key);
        /*
         * Unlike get_unlocked_entry() there is no guarantee that this
         * path ever successfully retrieves an unlocked entry before an
         * inode dies. Perform a non-exclusive wait in case this path
         * never successfully performs its own wake up.
         */
        prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
        xas_unlock_irq(xas);
        schedule();
        finish_wait(wq, &ewait.wait);
}

/*
 * Counterpart of get_unlocked_entry() for callers that did not lock the
 * entry: pass the wake-up on to other waiters.  A NULL or conflict entry
 * has no waiters to notify, so nothing is done for those.
 */
static void put_unlocked_entry(struct xa_state *xas, void *entry,
                               enum dax_wake_mode mode)
{
        if (!entry || dax_is_conflict(entry))
                return;

        dax_wake_entry(xas, entry, mode);
}

/*
 * Unlock a DAX entry: store the unlocked @entry back at the index described
 * by @xas and wake the next waiter.
 *
 * We used the xa_state to get the entry, but then we locked the entry and
 * dropped the xa_lock, so we know the xa_state is stale and must be reset
 * before use.
 */
static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
        void *old;

        /* The value being stored must not carry the lock bit... */
        BUG_ON(dax_is_locked(entry));
        xas_reset(xas);
        xas_lock_irq(xas);
        old = xas_store(xas, entry);
        xas_unlock_irq(xas);
        /* ...and the value it replaced must have been locked. */
        BUG_ON(!dax_is_locked(old));
        dax_wake_entry(xas, entry, WAKE_NEXT);
}

/*
 * Store @entry back at the position described by @xas with the DAX_LOCKED
 * bit added.
 *
 * Return: The entry stored at this location before it was locked.
 */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
        void *locked = xa_mk_value(xa_to_value(entry) | DAX_LOCKED);

        return xas_store(xas, locked);
}

/*
 * Number of bytes of storage mapped by @entry: 0 for zero-page and empty
 * entries, PMD_SIZE for PMD entries, PAGE_SIZE otherwise.
 */
static unsigned long dax_entry_size(void *entry)
{
        /* Zero-page and empty entries map no storage. */
        if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
                return 0;

        return dax_is_pmd_entry(entry) ? PMD_SIZE : PAGE_SIZE;
}

/* First pfn past the range mapped by @entry (equals the start pfn if none). */
static unsigned long dax_end_pfn(void *entry)
{
        unsigned long nr_pages = dax_entry_size(entry) / PAGE_SIZE;

        return dax_to_pfn(entry) + nr_pages;
}

/*
 * Iterate through all mapped pfns represented by an entry, i.e. skip
 * 'empty' and 'zero' entries (their size is 0, so the loop body never runs).
 *
 * @entry: DAX entry to walk; evaluated on every iteration.
 * @pfn:   unsigned long lvalue that receives each pfn in turn.  It is
 *         parenthesized in the expansion so that lvalue expressions such
 *         as *p work as expected.
 */
#define for_each_mapped_pfn(entry, pfn) \
        for ((pfn) = dax_to_pfn(entry); \
                        (pfn) < dax_end_pfn(entry); (pfn)++)

/* True when @page's mapping field holds the PAGE_MAPPING_DAX_SHARED marker. */
static inline bool dax_page_is_shared(struct page *page)
{
        bool shared = (page->mapping == PAGE_MAPPING_DAX_SHARED);

        return shared;
}

/*
 * Take one share reference on @page, marking page->mapping with
 * PAGE_MAPPING_DAX_SHARED first if it is not marked already.
 */
static inline void dax_page_share_get(struct page *page)
{
        /* Already shared: just count the extra user. */
        if (page->mapping == PAGE_MAPPING_DAX_SHARED) {
                page->share++;
                return;
        }

        /*
         * The page was mapped regularly before: reset the counter to 1
         * before switching the page over to shared mode.
         */
        if (page->mapping)
                page->share = 1;

        page->mapping = PAGE_MAPPING_DAX_SHARED;
        page->share++;
}

/* Drop one share reference on @page and return the remaining count. */
static inline unsigned long dax_page_share_put(struct page *page)
{
        page->share -= 1;

        return page->share;
}

/*
 * When called from dax_insert_entry(), @shared indicates whether this entry
 * is shared by multiple files.  If so, set page->mapping to
 * PAGE_MAPPING_DAX_SHARED and use page->share as the refcount.
 */
static void dax_associate_entry(void *entry, struct address_space *mapping,
                struct vm_area_struct *vma, unsigned long address, bool shared)
{
        unsigned long entry_size, first_index, pfn;
        int nr = 0;

        /* Nothing is tracked per page when CONFIG_FS_DAX_LIMITED is enabled. */
        if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
                return;

        /* File index of the first page, with @address aligned to the entry. */
        entry_size = dax_entry_size(entry);
        first_index = linear_page_index(vma, address & ~(entry_size - 1));

        for_each_mapped_pfn(entry, pfn) {
                struct page *page = pfn_to_page(pfn);

                if (shared) {
                        dax_page_share_get(page);
                        continue;
                }

                /* Exclusive owner: the page must not belong to a mapping yet. */
                WARN_ON_ONCE(page->mapping);
                page->mapping = mapping;
                page->index = first_index + nr;
                nr++;
        }
}

/*
 * Undo dax_associate_entry() for every page mapped by @entry: drop a share
 * reference on shared pages, and clear page->mapping and page->index once
 * the page has no remaining user.  With @trunc set, a page that still has
 * extra references triggers a one-time warning.
 */
static void dax_disassociate_entry(void *entry, struct address_space *mapping,
                bool trunc)
{
        unsigned long pfn;

        /* Nothing is tracked per page when CONFIG_FS_DAX_LIMITED is enabled. */
        if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
                return;

        for_each_mapped_pfn(entry, pfn) {
                struct page *page = pfn_to_page(pfn);

                WARN_ON_ONCE(trunc && page_ref_count(page) > 1);

                if (!dax_page_is_shared(page)) {
                        WARN_ON_ONCE(page->mapping && page->mapping != mapping);
                } else if (dax_page_share_put(page) > 0) {
                        /* keep the shared flag if this page is still shared */
                        continue;
                }

                page->mapping = NULL;
                page->index = 0;
        }
}

static struct page *dax_busy_page(void *entry)
{
        unsigned long pfn;

        for_each_mapped_pfn(entry, pfn) {
                struct page *page = pfn_to_page(pfn);

                if (page_ref_count(page) > 1)
                        return page;
        }
        return NULL;
}

/**
 * dax_lock_folio - Lock the DAX entry corresponding to a folio
 * @folio: The folio whose entry we want to lock
 *
 * Loops until the entry at folio->index in folio->mapping has been locked,
 * sleeping whenever it is found locked by someone else.  For a mapping whose
 * host inode is a character device no locking is done and the special
 * cookie ~0UL is returned.
 *
 * Context: Process context.
 * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
 * not be locked.
 */
dax_entry_t dax_lock_folio(struct folio *folio)
{
        XA_STATE(xas, NULL, 0);
        void *entry;

        /* Ensure folio->mapping isn't freed while we look at it */
        rcu_read_lock();
        for (;;) {
                struct address_space *mapping = READ_ONCE(folio->mapping);

                /* No mapping, or not a DAX mapping: fail with a 0 cookie. */
                entry = NULL;
                if (!mapping || !dax_mapping(mapping))
                        break;

                /*
                 * In the device-dax case there's no need to lock, a
                 * struct dev_pagemap pin is sufficient to keep the
                 * inode alive, and we assume we have dev_pagemap pin
                 * otherwise we would not have a valid pfn_to_page()
                 * translation.
                 */
                entry = (void *)~0UL;
                if (S_ISCHR(mapping->host->i_mode))
                        break;

                xas.xa = &mapping->i_pages;
                xas_lock_irq(&xas);
                /* folio->mapping changed before we got the lock: start over. */
                if (mapping != folio->mapping) {
                        xas_unlock_irq(&xas);
                        continue;
                }
                xas_set(&xas, folio->index);
                entry = xas_load(&xas);
                if (dax_is_locked(entry)) {
                        /*
                         * wait_entry_unlocked() drops the i_pages lock and
                         * sleeps; leave the RCU read section around it and
                         * retry from the top afterwards.
                         */
                        rcu_read_unlock();
                        wait_entry_unlocked(&xas, entry);
                        rcu_read_lock();
                        continue;
                }
                dax_lock_entry(&xas, entry);
                xas_unlock_irq(&xas);
                break;
        }
        rcu_read_unlock();
        return (dax_entry_t)entry;
}

/*
 * Release an entry locked by dax_lock_folio().  @cookie is the value that
 * call returned.  Nothing was locked for a mapping whose host inode is a
 * character device, so there is nothing to unlock in that case.
 */
void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
        struct address_space *mapping = folio->mapping;
        XA_STATE(xas, &mapping->i_pages, folio->index);

        if (!S_ISCHR(mapping->host->i_mode))
                dax_unlock_entry(&xas, (void *)cookie);
}

/*
 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
 * @mapping: the file's mapping whose entry we want to lock
 * @index: the offset within this file
 * @page: output the dax page corresponding to this dax entry
 *
 * @page is written only when a real entry was found and locked.  When there
 * is no entry at @index, or it is a zero/empty entry, nothing is locked and
 * the special cookie ~0UL is returned instead.
 *
 * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
 * could not be locked.
 */
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
                struct page **page)
{
        XA_STATE(xas, NULL, 0);
        void *entry;

        rcu_read_lock();
        for (;;) {
                /* Not a DAX mapping: fail with a 0 cookie. */
                entry = NULL;
                if (!dax_mapping(mapping))
                        break;

                xas.xa = &mapping->i_pages;
                xas_lock_irq(&xas);
                xas_set(&xas, index);
                entry = xas_load(&xas);
                if (dax_is_locked(entry)) {
                        /*
                         * wait_entry_unlocked() drops the i_pages lock and
                         * sleeps; leave the RCU read section around it and
                         * retry from the top afterwards.
                         */
                        rcu_read_unlock();
                        wait_entry_unlocked(&xas, entry);
                        rcu_read_lock();
                        continue;
                }
                if (!entry ||
                    dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
                        /*
                         * We look the entry up by the file's mapping and
                         * index, so it may not have been inserted yet, or
                         * it may be a zero/empty entry.  This is not
                         * treated as an error: return a special value and
                         * do not output @page.
                         */
                        entry = (void *)~0UL;
                } else {
                        *page = pfn_to_page(dax_to_pfn(entry));
                        dax_lock_entry(&xas, entry);
                }
                xas_unlock_irq(&xas);
                break;
        }
        rcu_read_unlock();
        return (dax_entry_t)entry;
}

/*
 * Release an entry locked by dax_lock_mapping_entry().  @cookie is the
 * value that call returned; the special cookie ~0UL means nothing was
 * locked, so there is nothing to undo.
 */
void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
                dax_entry_t cookie)
{
        XA_STATE(xas, &mapping->i_pages, index);

        if (cookie != ~0UL)
                dax_unlock_entry(&xas, (void *)cookie);
}

/*
 * Find page cache entry at given index. If it is a DAX entry, return it
 * with the entry locked. If the page cache doesn't contain an entry at
 * that index, add a locked empty entry.
 *
 * When requesting a PMD-sized entry (@order > 0), grab_mapping_entry() will
 * either return that locked entry or will return VM_FAULT_FALLBACK.
 * This will happen if there are any PTE entries within the PMD range
 * that we are requesting.
 *
 * We always favor PTE entries over PMD entries. There isn't a flow where we
 * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
 * insertion will fail if it finds any PTE entries already in the tree, and a
 * PTE insertion will cause an existing PMD entry to be unmapped and
 * downgraded to PTE entries.  This happens for both PMD zero pages as
 * well as PMD empty entries.
 *
 * The exception to this downgrade path is for PMD entries that have
 * real storage backing them.  We will leave these real PMD entries in
 * the tree, and PTE writes will simply dirty the entire PMD entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 *
 * On error, this function does not return an ERR_PTR.  Instead it returns
 * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
 * overlap with xarray value entries.  The codes produced here are
 * VM_FAULT_FALLBACK (conflicting entry), VM_FAULT_OOM (-ENOMEM recorded in
 * the xa_state) and VM_FAULT_SIGBUS (any other xa_state error).
 */
static void *grab_mapping_entry(struct xa_state *xas,
                struct address_space *mapping, unsigned int order)
{
        /* Remember the requested index; it is restored after a downgrade. */
        unsigned long index = xas->xa_index;
        bool pmd_downgrade;        /* splitting PMD entry into PTE entries? */
        void *entry;

retry:
        pmd_downgrade = false;
        xas_lock_irq(xas);
        entry = get_unlocked_entry(xas, order);

        if (entry) {
                if (dax_is_conflict(entry))
                        goto fallback;
                /* Anything other than a value entry is unexpected: -EIO. */
                if (!xa_is_value(entry)) {
                        xas_set_err(xas, -EIO);
                        goto out_unlock;
                }

                /*
                 * A PTE-sized request that lands on a PMD zero or PMD empty
                 * entry splits that PMD entry.
                 */
                if (order == 0) {
                        if (dax_is_pmd_entry(entry) &&
                            (dax_is_zero_entry(entry) ||
                             dax_is_empty_entry(entry))) {
                                pmd_downgrade = true;
                        }
                }
        }

        if (pmd_downgrade) {
                /*
                 * Make sure 'entry' remains valid while we drop
                 * the i_pages lock.
                 */
                dax_lock_entry(xas, entry);

                /*
                 * Besides huge zero pages the only other thing that gets
                 * downgraded are empty entries which don't need to be
                 * unmapped.
                 */
                if (dax_is_zero_entry(entry)) {
                        xas_unlock_irq(xas);
                        unmap_mapping_pages(mapping,
                                        xas->xa_index & ~PG_PMD_COLOUR,
                                        PG_PMD_NR, false);
                        xas_reset(xas);
                        xas_lock_irq(xas);
                }

                dax_disassociate_entry(entry, mapping, false);
                xas_store(xas, NULL);        /* undo the PMD join */
                dax_wake_entry(xas, entry, WAKE_ALL);
                mapping->nrpages -= PG_PMD_NR;
                /* Continue below as if no entry had been found at @index. */
                entry = NULL;
                xas_set(xas, index);
        }

        if (entry) {
                dax_lock_entry(xas, entry);
        } else {
                /* No entry: install a locked empty one of the requested size. */
                unsigned long flags = DAX_EMPTY;

                if (order > 0)
                        flags |= DAX_PMD;
                entry = dax_make_entry(pfn_to_pfn_t(0), flags);
                dax_lock_entry(xas, entry);
                /* Only account the new entry if no error was recorded. */
                if (xas_error(xas))
                        goto out_unlock;
                mapping->nrpages += 1UL << order;
        }

out_unlock:
        xas_unlock_irq(xas);
        /* xas_nomem() returning true means: try the whole lookup again. */
        if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
                goto retry;
        if (xas->xa_node == XA_ERROR(-ENOMEM))
                return xa_mk_internal(VM_FAULT_OOM);
        if (xas_error(xas))
                return xa_mk_internal(VM_FAULT_SIGBUS);
        return entry;
fallback:
        xas_unlock_irq(xas);
        return xa_mk_internal(VM_FAULT_FALLBACK);
}

/**
 * dax_layout_busy_page_range - find first pinned page in @mapping
 * @mapping: address space to scan for a page with ref count > 1
 * @start: Starting offset. Page containing 'start' is included.
 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
 *       pages from 'start' till the end of file are included.
 *
 * DAX requires ZONE_DEVICE mapped pages. These pages are never
 * 'onlined' to the page allocator so they are considered idle when
 * page->count == 1. A filesystem uses this interface to determine if
 * any page in the mapping is busy, i.e. for DMA, or other
 * get_user_pages() usages.
 *
 * It is expected that the filesystem is holding locks to block the
 * establishment of new mappings in this address_space. I.e. it expects
 * to be able to run unmap_mapping_range() and subsequently not race
 * mapping_mapped() becoming true.
 *
 * Return: the first page reported by dax_busy_page() within the range, or
 * NULL if none is found, if CONFIG_FS_DAX_LIMITED is enabled, or if
 * @mapping is not a mapped DAX mapping.
 */
struct page *dax_layout_busy_page_range(struct address_space *mapping,
                                        loff_t start, loff_t end)
{
        void *entry;
        unsigned int scanned = 0;
        struct page *page = NULL;
        pgoff_t start_idx = start >> PAGE_SHIFT;
        pgoff_t end_idx;
        XA_STATE(xas, &mapping->i_pages, start_idx);

        /*
         * In the 'limited' case get_user_pages() for dax is disabled.
         */
        if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
                return NULL;

        if (!dax_mapping(mapping) || !mapping_mapped(mapping))
                return NULL;

        /* If end == LLONG_MAX, scan all pages from start to the end of file */
        if (end == LLONG_MAX)
                end_idx = ULONG_MAX;
        else
                end_idx = end >> PAGE_SHIFT;
        /*
         * If we race get_user_pages_fast() here either we'll see the
         * elevated page count in the iteration and wait, or
         * get_user_pages_fast() will see that the page it took a reference
         * against is no longer mapped in the page tables and bail to the
         * get_user_pages() slow path.  The slow path is protected by
         * pte_lock() and pmd_lock(). New references are not taken without
         * holding those locks, and unmap_mapping_pages() will not zero the
         * pte or pmd without holding the respective lock, so we are
         * guaranteed to either see new references or prevent new
         * references from being established.
         */
        unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);

        xas_lock_irq(&xas);
        xas_for_each(&xas, entry, end_idx) {
                /* Only value entries are expected; warn once and skip others. */
                if (WARN_ON_ONCE(!xa_is_value(entry)))
                        continue;
                /*
                 * For a locked entry, re-fetch it through
                 * get_unlocked_entry(); the result may be NULL, hence the
                 * check below.
                 */
                if (unlikely(dax_is_locked(entry)))
                        entry = get_unlocked_entry(&xas, 0);
                if (entry)
                        page = dax_busy_page(entry);
                put_unlocked_entry(&xas, entry, WAKE_NEXT);
                if (page)
                        break;
                if (++scanned % XA_CHECK_SCHED)
                        continue;

                /*
                 * Every XA_CHECK_SCHED entries, pause the walk and drop the
                 * lock so that we can reschedule.
                 */
                xas_pause(&xas);
                xas_unlock_irq(&xas);
                cond_resched();
                xas_lock_irq(&xas);
        }
        xas_unlock_irq(&xas);
        return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);

struct page *dax_layout_busy_page(struct address_space *mapping)
{
        return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);

/*
 * Remove the DAX entry at @index from @mapping's i_pages.
 *
 * When @trunc is false, an entry that still carries PAGECACHE_TAG_DIRTY or
 * PAGECACHE_TAG_TOWRITE is left in place.
 *
 * Returns 1 if an entry was removed and 0 otherwise.
 */
static int __dax_invalidate_entry(struct address_space *mapping,
                                          pgoff_t index, bool trunc)
{
        XA_STATE(xas, &mapping->i_pages, index);
        int removed = 0;
        void *entry;

        xas_lock_irq(&xas);
        entry = get_unlocked_entry(&xas, 0);

        /* Nothing here, or something that is not a value entry. */
        if (!entry)
                goto out;
        if (WARN_ON_ONCE(!xa_is_value(entry)))
                goto out;

        /* Outside of truncation, tagged entries must survive. */
        if (!trunc) {
                if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
                        goto out;
                if (xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))
                        goto out;
        }

        dax_disassociate_entry(entry, mapping, trunc);
        xas_store(&xas, NULL);
        mapping->nrpages -= 1UL << dax_entry_order(entry);
        removed = 1;
out:
        put_unlocked_entry(&xas, entry, WAKE_ALL);
        xas_unlock_irq(&xas);
        return removed;
}

/*
 * Clear PAGECACHE_TAG_DIRTY and PAGECACHE_TAG_TOWRITE on every entry of
 * @mapping found in the index range [@start, @end].  Always returns 0.
 */
static int __dax_clear_dirty_range(struct address_space *mapping,
                pgoff_t start, pgoff_t end)
{
        XA_STATE(xas, &mapping->i_pages, start);
        unsigned int nr_seen = 0;
        void *entry;

        xas_lock_irq(&xas);
        xas_for_each(&xas, entry, end) {
                entry = get_unlocked_entry(&xas, 0);
                xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
                xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
                put_unlocked_entry(&xas, entry, WAKE_NEXT);

                nr_seen++;
                if (nr_seen % XA_CHECK_SCHED == 0) {
                        /* Drop the lock now and then so we can reschedule. */
                        xas_pause(&xas);
                        xas_unlock_irq(&xas);
                        cond_resched();
                        xas_lock_irq(&xas);
                }
        }
        xas_unlock_irq(&xas);

        return 0;
}

/*
 * Delete the DAX entry at @index from @mapping, waiting for it to be
 * unlocked before it is removed.
 *
 * This is called from the truncate / punch_hole path, so the caller must
 * hold locks protecting against concurrent modifications of the page
 * cache (usually fs-private i_mmap_sem for writing).  The caller has
 * already seen a DAX entry at this index, so failing to find one here is
 * unexpected and triggers a one-time warning.
 *
 * Returns the result of __dax_invalidate_entry() in truncate mode.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
        int deleted;

        deleted = __dax_invalidate_entry(mapping, index, true);
        WARN_ON_ONCE(!deleted);

        return deleted;
}

/*
 * Invalidate the DAX entry at @index if it is clean, i.e. run
 * __dax_invalidate_entry() in non-truncate mode and return its result.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
                                      pgoff_t index)
{
        const bool truncating = false;

        return __dax_invalidate_entry(mapping, index, truncating);
}

/*
 * Convert file position @pos into a page offset for dax_direct_access():
 * take the page-aligned part of @pos, rebase it from iomap->offset onto
 * iomap->addr, and express the result as a page frame number.
 */
static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
{
        u64 rebased = iomap->addr + (pos & PAGE_MASK) - iomap->offset;

        return PHYS_PFN(rebased);
}

/*
 * Copy the single page of DAX memory that backs iter->pos into
 * vmf->cow_page.
 *
 * Returns 0 on success, or the negative value returned by
 * dax_direct_access() on failure.
 */
static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
{
        pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
        void *src, *dst;
        int err = 0;
        long nr;
        int id;

        id = dax_read_lock();
        nr = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
                                &src, NULL);
        if (nr < 0) {
                err = nr;
                goto out_unlock;
        }

        dst = kmap_atomic(vmf->cow_page);
        copy_user_page(dst, src, vmf->address, vmf->cow_page);
        kunmap_atomic(dst);

out_unlock:
        dax_read_unlock(id);
        return err;
}

/*
 * MAP_SYNC on a dax mapping guarantees dirty metadata is
 * flushed on write-faults (non-cow), but not read-faults.
 */
static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
                struct vm_area_struct *vma)
{
        return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
                (iter->iomap.flags & IOMAP_F_DIRTY);
}

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 *
 * Returns the entry that is in effect afterwards: the newly made entry if
 * it replaced @entry, otherwise @entry itself.
 */
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
                const struct iomap_iter *iter, void *entry, pfn_t pfn,
                unsigned long flags)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        void *new_entry = dax_make_entry(pfn, flags);
        bool write = iter->flags & IOMAP_WRITE;
        /* Synchronous write faults are not treated as dirtying here. */
        bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
        bool shared = iter->iomap.flags & IOMAP_F_SHARED;

        if (dirty)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

        /* Done before the i_pages lock is taken below. */
        if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
                unsigned long index = xas->xa_index;
                /* we are replacing a zero page with block mapping */
                if (dax_is_pmd_entry(entry))
                        unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
                                        PG_PMD_NR, false);
                else /* pte entry */
                        unmap_mapping_pages(mapping, index, 1, false);
        }

        xas_reset(xas);
        xas_lock_irq(xas);
        if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
                void *old;

                dax_disassociate_entry(entry, mapping, false);
                dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
                                shared);
                /*
                 * Only swap our new entry into the page cache if the current
                 * entry is a zero page or an empty entry.  If a normal PTE or
                 * PMD entry is already in the cache, we leave it alone.  This
                 * means that if we are trying to insert a PTE and the
                 * existing entry is a PMD, we will just leave the PMD in the
                 * tree and dirty it if necessary.
                 */
                old = dax_lock_entry(xas, new_entry);
                /* The slot is expected to have held the locked form of @entry. */
                WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
                                        DAX_LOCKED));
                entry = new_entry;
        } else {
                xas_load(xas);        /* Walk the xa_state */
        }

        if (dirty)
                xas_set_mark(xas, PAGECACHE_TAG_DIRTY);

        /* Writes to shared extents are additionally tagged TOWRITE. */
        if (write && shared)
                xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);

        xas_unlock_irq(xas);
        return entry;
}

/*
 * Write back a single DAX entry: write-protect the mappings of its index
 * range, flush its pfn range with dax_flush(), and clear its
 * PAGECACHE_TAG_TOWRITE and PAGECACHE_TAG_DIRTY tags.
 *
 * dax_writeback_mapping_range() calls this with the i_pages lock taken via
 * xas_lock_irq().  This function releases that lock before the flush and
 * takes it again afterwards, so the lock calls visible here leave it held
 * on return.
 *
 * Returns 0 on success or when there is nothing to do, and -EIO if @entry
 * is not a value entry or turns out to be an empty or zero entry.
 */
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
                struct address_space *mapping, void *entry)
{
        unsigned long pfn, index, count, end;
        long ret = 0;
        struct vm_area_struct *vma;

        /*
         * A page got tagged dirty in DAX mapping? Something is seriously
         * wrong.
         */
        if (WARN_ON(!xa_is_value(entry)))
                return -EIO;

        if (unlikely(dax_is_locked(entry))) {
                /* Keep the entry as first seen, for the pfn comparison below. */
                void *old_entry = entry;

                entry = get_unlocked_entry(xas, 0);

                /* Entry got punched out / reallocated? */
                if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
                        goto put_unlocked;
                /*
                 * Entry got reallocated elsewhere? No need to writeback.
                 * We have to compare pfns as we must not bail out due to
                 * difference in lockbit or entry type.
                 */
                if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
                        goto put_unlocked;
                if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
                                        dax_is_zero_entry(entry))) {
                        ret = -EIO;
                        goto put_unlocked;
                }

                /* Another fsync thread may have already done this entry */
                if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
                        goto put_unlocked;
        }

        /* Lock the entry to serialize with page faults */
        dax_lock_entry(xas, entry);

        /*
         * We can clear the tag now but we have to be careful so that concurrent
         * dax_writeback_one() calls for the same index cannot finish before we
         * actually flush the caches. This is achieved as the calls will look
         * at the entry only under the i_pages lock and once they do that
         * they will see the entry locked and wait for it to unlock.
         */
        xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
        xas_unlock_irq(xas);

        /*
         * If dax_writeback_mapping_range() was given a wbc->range_start
         * in the middle of a PMD, the 'index' we use needs to be
         * aligned to the start of the PMD.
         * This allows us to flush for PMD_SIZE and not have to worry about
         * partial PMD writebacks.
         */
        pfn = dax_to_pfn(entry);
        count = 1UL << dax_entry_order(entry);
        index = xas->xa_index & ~(count - 1);
        end = index + count - 1;

        /* Walk all mappings of a given index of a file and writeprotect them */
        i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
                pfn_mkclean_range(pfn, count, index, vma);
                cond_resched();
        }
        i_mmap_unlock_read(mapping);

        dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
        /*
         * After we have flushed the cache, we can clear the dirty tag. There
         * cannot be new dirty data in the pfn after the flush has completed as
         * the pfn mappings are writeprotected and fault waits for mapping
         * entry lock.
         */
        xas_reset(xas);
        xas_lock_irq(xas);
        /* Store the entry back; it was replaced by dax_lock_entry() above. */
        xas_store(xas, entry);
        xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
        dax_wake_entry(xas, entry, WAKE_NEXT);

        trace_dax_writeback_one(mapping->host, index, count);
        return ret;

 put_unlocked:
        /* Nothing was locked by us on this path; just hand the entry back. */
        put_unlocked_entry(xas, entry, WAKE_NEXT);
        return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range
 * [wbc->range_start, wbc->range_end].  This is required by data integrity
 * operations to ensure file data is on persistent storage prior to
 * completion of the operation.
 *
 * Nothing is done unless the mapping is non-empty and the writeback mode is
 * WB_SYNC_ALL.  Returns -EIO if the inode's block size is not PAGE_SIZE,
 * otherwise the result of the last dax_writeback_one() call (0 if none
 * ran); a negative result is also recorded on the mapping and ends the walk.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
                struct dax_device *dax_dev, struct writeback_control *wbc)
{
        XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
        pgoff_t last_index = wbc->range_end >> PAGE_SHIFT;
        struct inode *inode = mapping->host;
        unsigned int nr_done = 0;
        int err = 0;
        void *entry;

        if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
                return -EIO;

        if (mapping_empty(mapping))
                return 0;
        if (wbc->sync_mode != WB_SYNC_ALL)
                return 0;

        trace_dax_writeback_range(inode, xas.xa_index, last_index);

        tag_pages_for_writeback(mapping, xas.xa_index, last_index);

        xas_lock_irq(&xas);
        xas_for_each_marked(&xas, entry, last_index, PAGECACHE_TAG_TOWRITE) {
                err = dax_writeback_one(&xas, dax_dev, mapping, entry);
                if (err < 0) {
                        mapping_set_error(mapping, err);
                        break;
                }

                nr_done++;
                if (nr_done % XA_CHECK_SCHED == 0) {
                        /* Give the scheduler a chance between batches. */
                        xas_pause(&xas);
                        xas_unlock_irq(&xas);
                        cond_resched();
                        xas_lock_irq(&xas);
                }
        }
        xas_unlock_irq(&xas);

        trace_dax_writeback_range_done(inode, xas.xa_index, last_index);
        return err;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

/*
 * Look up the DAX memory backing @size bytes at file position @pos.
 *
 * @kaddr and @pfnp are passed straight to dax_direct_access() and may each
 * be NULL.  When @pfnp is given, the result is additionally validated: it
 * must cover at least @size bytes, the pfn must be aligned to the number of
 * pages in @size, and a multi-page result must be a devmap pfn.
 *
 * Returns 0 on success, the negative error from dax_direct_access(),
 * -EINVAL if the pfn validation fails, or -EFAULT if @kaddr was requested
 * but came back NULL.
 */
static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
                size_t size, void **kaddr, pfn_t *pfnp)
{
        pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
        long nr_avail;
        int err = 0;
        int id;

        id = dax_read_lock();
        nr_avail = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
                                     DAX_ACCESS, kaddr, pfnp);
        if (nr_avail < 0) {
                err = nr_avail;
                goto unlock;
        }

        if (pfnp) {
                /* For larger pages we need devmap */
                if (PFN_PHYS(nr_avail) < size ||
                    (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) ||
                    (nr_avail > 1 && !pfn_t_devmap(*pfnp))) {
                        err = -EINVAL;
                        goto unlock;
                }
        }

        if (kaddr && !*kaddr)
                err = -EFAULT;

unlock:
        dax_read_unlock(id);
        return err;
}

/**
 * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
 * by copying the data before and after the range to be written.
 * @pos:        address to do copy from.
 * @length:        size of copy operation.
 * @align_size:        aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
 * @srcmap:        iomap srcmap
 * @daddr:        destination address to copy to.
 *
 * This can be called from two places. Either during DAX write fault (page
 * aligned), to copy the length size data to daddr. Or, while doing normal DAX
 * write operation, dax_iomap_iter() might call this to do the copy of either
 * start or end unaligned address. In the latter case the rest of the copy of
 * aligned ranges is taken care by dax_iomap_iter() itself.
 * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
 * area to make sure no old data remains.
 *
 * Return: 0 on success, -EIO if copy_mc_to_kernel() reports a failure, or
 * the dax_mem2blk_err()-translated error from dax_iomap_direct_access().
 */
static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
                const struct iomap *srcmap, void *daddr)
{
        /* Offset of @pos inside its @align_size-aligned block. */
        loff_t head_off = pos & (align_size - 1);
        /* Whole aligned span that contains [pos, pos + length). */
        size_t size = ALIGN(head_off + length, align_size);
        loff_t end = pos + length;
        loff_t pg_end = round_up(end, align_size);
        /* copy_all is usually in page fault case */
        bool copy_all = head_off == 0 && end == pg_end;
        /*
         * zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN
         *
         * NOTE(review): the condition tests IOMAP_F_SHARED rather than an
         * IOMAP_HOLE type; confirm this matches the intent stated above.
         */
        bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
                         srcmap->type == IOMAP_UNWRITTEN;
        void *saddr = NULL;
        int ret = 0;

        /* A source address is only needed when we copy rather than zero. */
        if (!zero_edge) {
                ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
                if (ret)
                        return dax_mem2blk_err(ret);
        }

        /* Fully aligned request: handle the whole span in one go. */
        if (copy_all) {
                if (zero_edge)
                        memset(daddr, 0, size);
                else
                        ret = copy_mc_to_kernel(daddr, saddr, length);
                goto out;
        }

        /* Copy the head part of the range */
        if (head_off) {
                if (zero_edge)
                        memset(daddr, 0, head_off);
                else {
                        ret = copy_mc_to_kernel(daddr, saddr, head_off);
                        if (ret)
                                return -EIO;
                }
        }

        /* Copy the tail part of the range */
        if (end < pg_end) {
                loff_t tail_off = head_off + length;
                loff_t tail_len = pg_end - end;

                if (zero_edge)
                        memset(daddr + tail_off, 0, tail_len);
                else {
                        ret = copy_mc_to_kernel(daddr + tail_off,
                                                saddr + tail_off, tail_len);
                        if (ret)
                                return -EIO;
                }
        }
out:
        /* Only the zeroing path flushes here; the copy path does not. */
        if (zero_edge)
                dax_flush(srcmap->dax_dev, daddr, size);
        return ret ? -EIO : 0;
}

/*
 * The user has performed a load from a hole in the file.  Allocating a new
 * page in the file would cause excessive storage usage for workloads with
 * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
 * If this page is ever written to we will re-fault and change the mapping to
 * point to real DAX storage instead.
 *
 * *@entry is replaced with the entry returned by dax_insert_entry(); the
 * return value is that of vmf_insert_mixed().
 */
static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
                const struct iomap_iter *iter, void **entry)
{
        unsigned long address = vmf->address;
        pfn_t zero_pfn = pfn_to_pfn_t(my_zero_pfn(address));
        vm_fault_t result;

        *entry = dax_insert_entry(xas, vmf, iter, *entry, zero_pfn,
                                  DAX_ZERO_PAGE);
        result = vmf_insert_mixed(vmf->vma, address, zero_pfn);

        trace_dax_load_hole(iter->inode, vmf, result);
        return result;
}

#ifdef CONFIG_FS_DAX_PMD
/*
 * PMD-sized counterpart of dax_load_hole(): record a
 * DAX_PMD | DAX_ZERO_PAGE entry and install a PMD that points at the huge
 * zero folio.
 *
 * Returns VM_FAULT_NOPAGE once the PMD has been set, VM_FAULT_FALLBACK if no
 * huge zero folio is available or the PMD slot is already populated, and
 * VM_FAULT_OOM if the page table wanted for deposit cannot be allocated.
 */
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
                const struct iomap_iter *iter, void **entry)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        unsigned long pmd_addr = vmf->address & PMD_MASK;
        struct vm_area_struct *vma = vmf->vma;
        struct inode *inode = mapping->host;
        pgtable_t pgtable = NULL;
        struct folio *zero_folio;
        spinlock_t *ptl;
        pmd_t pmd_entry;
        pfn_t pfn;

        zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);

        if (unlikely(!zero_folio))
                goto fallback;

        pfn = page_to_pfn_t(&zero_folio->page);
        *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
                                  DAX_PMD | DAX_ZERO_PAGE);

        /* Allocate the page table up front, before taking the PMD lock. */
        if (arch_needs_pgtable_deposit()) {
                pgtable = pte_alloc_one(vma->vm_mm);
                if (!pgtable)
                        return VM_FAULT_OOM;
        }

        ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
        /* Something is already installed in this PMD slot: give up. */
        if (!pmd_none(*(vmf->pmd))) {
                spin_unlock(ptl);
                goto fallback;
        }

        if (pgtable) {
                pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
                mm_inc_nr_ptes(vma->vm_mm);
        }
        pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot);
        pmd_entry = pmd_mkhuge(pmd_entry);
        set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
        spin_unlock(ptl);
        trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
        return VM_FAULT_NOPAGE;

fallback:
        /* Release the page table if it was allocated but never deposited. */
        if (pgtable)
                pte_free(vma->vm_mm, pgtable);
        trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
        return VM_FAULT_FALLBACK;
}
#else
/* Without CONFIG_FS_DAX_PMD there is no PMD path: always fall back. */
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
                const struct iomap_iter *iter, void **entry)
{
        return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/*
 * Process one iomap_iter() step for dax_file_unshare().
 *
 * Extents that are not flagged IOMAP_F_SHARED are skipped and reported as
 * fully processed.  Otherwise the destination range is either zeroed and
 * flushed (when srcmap is flagged IOMAP_F_SHARED or is IOMAP_UNWRITTEN) or
 * filled by copying from the source mapping.
 *
 * Returns the number of bytes handled, or a negative error passed through
 * dax_mem2blk_err().
 */
static s64 dax_unshare_iter(struct iomap_iter *iter)
{
        struct iomap *iomap = &iter->iomap;
        const struct iomap *srcmap = iomap_iter_srcmap(iter);
        loff_t pos = iter->pos;
        loff_t length = iomap_length(iter);
        void *dst = NULL, *src = NULL;
        s64 ret = 0;
        int id = 0;

        /* don't bother with blocks that are not shared to start with */
        if (!(iomap->flags & IOMAP_F_SHARED))
                return length;

        id = dax_read_lock();

        ret = dax_iomap_direct_access(iomap, pos, length, &dst, NULL);
        if (ret < 0)
                goto out_unlock;

        /* Nothing valid to copy from: zero the destination instead. */
        if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) {
                memset(dst, 0, length);
                dax_flush(iomap->dax_dev, dst, length);
                ret = length;
                goto out_unlock;
        }

        ret = dax_iomap_direct_access(srcmap, pos, length, &src, NULL);
        if (ret < 0)
                goto out_unlock;

        if (copy_mc_to_kernel(dst, src, length) != 0)
                ret = -EIO;
        else
                ret = length;

out_unlock:
        dax_read_unlock(id);
        return dax_mem2blk_err(ret);
}

/*
 * Run dax_unshare_iter() over the byte range [@pos, @pos + @len) of @inode,
 * using @ops to map each step.  Returns the final iomap_iter() result,
 * which is zero or negative once iteration stops.
 */
int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
                const struct iomap_ops *ops)
{
        struct iomap_iter iter = {
                .inode                = inode,
                .pos                = pos,
                .len                = len,
                .flags                = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
        };
        int ret;

        for (;;) {
                ret = iomap_iter(&iter, ops);
                if (ret <= 0)
                        break;
                iter.processed = dax_unshare_iter(&iter);
        }

        return ret;
}
EXPORT_SYMBOL_GPL(dax_file_unshare);

/*
 * Zero @size bytes at file position @pos within a single page of DAX
 * memory.
 *
 * For an IOMAP_F_SHARED extent the rest of the page is then filled in by
 * dax_iomap_copy_around() and its result is returned.  Otherwise the zeroed
 * bytes are flushed and the (non-negative) dax_direct_access() result is
 * returned.  A dax_direct_access() failure is returned through
 * dax_mem2blk_err().
 */
static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
        const struct iomap *iomap = &iter->iomap;
        const struct iomap *srcmap = iomap_iter_srcmap(iter);
        unsigned in_page = offset_in_page(pos);
        pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
        void *page_va;
        long rc;

        rc = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &page_va,
                                NULL);
        if (rc < 0)
                return dax_mem2blk_err(rc);

        memset(page_va + in_page, 0, size);

        if (!(iomap->flags & IOMAP_F_SHARED)) {
                dax_flush(iomap->dax_dev, page_va + in_page, size);
                return rc;
        }

        return dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap, page_va);
}

/*
 * Process one iomap_iter() step for dax_zero_range(): zero the bytes of
 * the current mapping one page (or partial page) at a time.
 *
 * Returns the number of bytes handled or a negative error.  *@did_zero is
 * set to true, if the pointer is non-NULL, once zeroing was actually
 * carried out.
 */
static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
        const struct iomap *iomap = &iter->iomap;
        const struct iomap *srcmap = iomap_iter_srcmap(iter);
        loff_t pos = iter->pos;
        u64 remaining = iomap_length(iter);
        s64 zeroed = 0;

        /* already zeroed?  we're done. */
        switch (srcmap->type) {
        case IOMAP_HOLE:
        case IOMAP_UNWRITTEN:
                return remaining;
        default:
                break;
        }

        /*
         * invalidate the pages whose sharing state is to be changed
         * because of CoW.
         */
        if (iomap->flags & IOMAP_F_SHARED)
                invalidate_inode_pages2_range(iter->inode->i_mapping,
                                              pos >> PAGE_SHIFT,
                                              (pos + remaining - 1) >> PAGE_SHIFT);

        do {
                unsigned in_page = offset_in_page(pos);
                unsigned chunk = min_t(u64, PAGE_SIZE - in_page, remaining);
                pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
                bool whole_page = IS_ALIGNED(pos, PAGE_SIZE) &&
                                  chunk == PAGE_SIZE;
                long err;
                int id;

                id = dax_read_lock();
                if (whole_page)
                        err = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
                else
                        err = dax_memzero(iter, pos, chunk);
                dax_read_unlock(id);

                if (err < 0)
                        return err;

                pos += chunk;
                zeroed += chunk;
                remaining -= chunk;
        } while (remaining > 0);

        if (did_zero)
                *did_zero = true;
        return zeroed;
}

/*
 * Zero the byte range [@pos, @pos + @len) of @inode by running
 * dax_zero_iter() over it, using @ops to map each step.  @did_zero is
 * handed through to dax_zero_iter().  Returns the final iomap_iter()
 * result, which is zero or negative once iteration stops.
 */
int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
                const struct iomap_ops *ops)
{
        struct iomap_iter iter = {
                .inode                = inode,
                .pos                = pos,
                .len                = len,
                .flags                = IOMAP_DAX | IOMAP_ZERO,
        };
        int ret;

        for (;;) {
                ret = iomap_iter(&iter, ops);
                if (ret <= 0)
                        break;
                iter.processed = dax_zero_iter(&iter, did_zero);
        }

        return ret;
}
EXPORT_SYMBOL_GPL(dax_zero_range);

/*
 * Zero from @pos up to the end of the filesystem block that contains it.
 * Returns 0 without doing anything when @pos already sits on a block
 * boundary, otherwise the result of dax_zero_range().
 */
int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
                const struct iomap_ops *ops)
{
        unsigned int bsize = i_blocksize(inode);
        unsigned int in_block = pos & (bsize - 1);

        if (in_block)
                return dax_zero_range(inode, pos, bsize - in_block, did_zero,
                                      ops);

        /* Block boundary? Nothing to do */
        return 0;
}
EXPORT_SYMBOL_GPL(dax_truncate_page);

/*
 * dax_iomap_iter - move data between @iter and DAX memory for one mapping
 * @iomi:        iomap iterator describing the current mapping and position
 * @iter:        user/kernel buffer iterator; its direction selects read or write
 *
 * Reads are clamped to i_size and holes/unwritten extents are served by
 * zero-filling @iter.  Writes to a new or shared (CoW) mapping first drop any
 * existing page cache/DAX entries for the range.
 *
 * Returns the number of bytes transferred if any were, otherwise 0 or a
 * negative errno describing why nothing was transferred.
 */
static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
                struct iov_iter *iter)
{
        const struct iomap *iomap = &iomi->iomap;
        const struct iomap *srcmap = iomap_iter_srcmap(iomi);
        loff_t length = iomap_length(iomi);
        loff_t pos = iomi->pos;
        struct dax_device *dax_dev = iomap->dax_dev;
        loff_t end = pos + length, done = 0;
        bool write = iov_iter_rw(iter) == WRITE;
        bool cow = write && iomap->flags & IOMAP_F_SHARED;
        ssize_t ret = 0;
        size_t xfer;
        int id;

        if (!write) {
                /* Never read past the current end of file. */
                end = min(end, i_size_read(iomi->inode));
                if (pos >= end)
                        return 0;

                /* Nothing is stored for holes/unwritten extents: return zeroes. */
                if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
                        return iov_iter_zero(min(length, end - pos), iter);
        }

        /*
         * In DAX mode, enforce either pure overwrites of written extents, or
         * writes to unwritten extents as part of a copy-on-write operation.
         */
        if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
                        !(iomap->flags & IOMAP_F_SHARED)))
                return -EIO;

        /*
         * Write can allocate block for an area which has a hole page mapped
         * into page tables. We have to tear down these mappings so that data
         * written by write(2) is visible in mmap.
         */
        if (iomap->flags & IOMAP_F_NEW || cow) {
                /*
                 * Filesystem allows CoW on non-shared extents. The src extents
                 * may have been mmapped with dirty mark before. To be able to
                 * invalidate its dax entries, we need to clear the dirty mark
                 * in advance.
                 */
                if (cow)
                        __dax_clear_dirty_range(iomi->inode->i_mapping,
                                                pos >> PAGE_SHIFT,
                                                (end - 1) >> PAGE_SHIFT);
                invalidate_inode_pages2_range(iomi->inode->i_mapping,
                                              pos >> PAGE_SHIFT,
                                              (end - 1) >> PAGE_SHIFT);
        }

        id = dax_read_lock();
        while (pos < end) {
                /* Byte offset of pos within its page. */
                unsigned offset = pos & (PAGE_SIZE - 1);
                /* Bytes to request, rounded up to whole pages. */
                const size_t size = ALIGN(length + offset, PAGE_SIZE);
                pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
                ssize_t map_len;
                bool recovery = false;
                void *kaddr;

                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }

                map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
                                DAX_ACCESS, &kaddr, NULL);
                /* On a poisoned range, a write may retry in recovery mode. */
                if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
                        map_len = dax_direct_access(dax_dev, pgoff,
                                        PHYS_PFN(size), DAX_RECOVERY_WRITE,
                                        &kaddr, NULL);
                        if (map_len > 0)
                                recovery = true;
                }
                if (map_len < 0) {
                        ret = dax_mem2blk_err(map_len);
                        break;
                }

                if (cow) {
                        ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
                                                    srcmap, kaddr);
                        if (ret)
                                break;
                }

                /*
                 * Convert the page count returned above into bytes, skip the
                 * leading in-page offset, and never go past the end of the
                 * requested range.
                 */
                map_len = PFN_PHYS(map_len);
                kaddr += offset;
                map_len -= offset;
                if (map_len > end - pos)
                        map_len = end - pos;

                if (recovery)
                        xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
                                        map_len, iter);
                else if (write)
                        xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
                                        map_len, iter);
                else
                        xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
                                        map_len, iter);

                pos += xfer;
                length -= xfer;
                done += xfer;

                /* No progress at all is reported as a fault... */
                if (xfer == 0)
                        ret = -EFAULT;
                /* ...and any short transfer ends the loop. */
                if (xfer < map_len)
                        break;
        }
        dax_read_unlock(id);

        /* Partial progress takes precedence over the error code. */
        return done ? done : ret;
}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:        The control block for this I/O
 * @iter:        The addresses to do I/O from or to
 * @ops:        iomap ops passed from the file system
 *
 * Reads from or writes to directly mapped persistent memory, depending on the
 * direction of @iter.  The caller is responsible for read/write exclusion
 * and for evicting any page cache pages in the region under I/O.
 *
 * On return @iocb->ki_pos has been advanced to where the iteration stopped.
 *
 * Return: the number of bytes transferred if non-zero, otherwise the final
 * iomap_iter() result (0 or a negative errno).
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops)
{
        struct iomap_iter iomi = {
                .inode                = iocb->ki_filp->f_mapping->host,
                .pos                = iocb->ki_pos,
                .len                = iov_iter_count(iter),
                .flags                = IOMAP_DAX,
        };
        loff_t transferred;
        int status;

        /* Zero-length I/O is a no-op. */
        if (!iomi.len)
                return 0;

        if (iov_iter_rw(iter) != WRITE) {
                lockdep_assert_held(&iomi.inode->i_rwsem);
        } else {
                lockdep_assert_held_write(&iomi.inode->i_rwsem);
                iomi.flags |= IOMAP_WRITE;
        }

        if (iocb->ki_flags & IOCB_NOWAIT)
                iomi.flags |= IOMAP_NOWAIT;

        for (;;) {
                status = iomap_iter(&iomi, ops);
                if (status <= 0)
                        break;
                iomi.processed = dax_iomap_iter(&iomi, iter);
        }

        transferred = iomi.pos - iocb->ki_pos;
        iocb->ki_pos = iomi.pos;
        if (transferred)
                return transferred;
        return status;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);

static vm_fault_t dax_fault_return(int error)
{
        if (error == 0)
                return VM_FAULT_NOPAGE;
        return vmf_error(error);
}

/*
 * For a synchronous page fault on an inode that still needs an fsync, the
 * PTE/PMD may only be inserted into the page tables after that fsync has
 * happened.  Skip the insertion here and hand the pfn back through @pfnp so
 * the caller can insert it once the fsync is done.
 *
 * A NULL @pfnp triggers a one-time warning and yields VM_FAULT_SIGBUS.
 */
static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
{
        if (!WARN_ON_ONCE(!pfnp)) {
                *pfnp = pfn;
                return VM_FAULT_NEEDDSYNC;
        }

        return VM_FAULT_SIGBUS;
}

/*
 * Fill vmf->cow_page for a fault that already has a private copy page.
 *
 * Holes and unwritten extents produce a cleared page, mapped extents are
 * copied with copy_cow_page_dax(), and any other mapping type warns once and
 * fails with -EIO.  On success the page is marked up to date and
 * finish_fault() is called; a zero result from it is reported as
 * VM_FAULT_DONE_COW.
 */
static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
                const struct iomap_iter *iter)
{
        vm_fault_t status;
        int err;

        switch (iter->iomap.type) {
        case IOMAP_MAPPED:
                err = copy_cow_page_dax(vmf, iter);
                if (err)
                        return dax_fault_return(err);
                break;
        case IOMAP_HOLE:
        case IOMAP_UNWRITTEN:
                clear_user_highpage(vmf->cow_page, vmf->address);
                break;
        default:
                WARN_ON_ONCE(1);
                return dax_fault_return(-EIO);
        }

        __SetPageUptodate(vmf->cow_page);
        status = finish_fault(vmf);

        return status ? status : VM_FAULT_DONE_COW;
}

/**
 * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
 * @vmf:        vm fault instance
 * @iter:        iomap iter
 * @pfnp:        pfn to be returned
 * @xas:        the dax mapping tree of a file
 * @entry:        in/out DAX entry for this index; replaced with the value
 *                returned by dax_insert_entry()
 * @pmd:        distinguish whether it is a pmd fault
 *
 * Return: a vm_fault_t code.  For PMD faults, an unsupported mapping type or
 * a direct-access failure is reported as VM_FAULT_FALLBACK; for PTE faults
 * the same conditions yield VM_FAULT_SIGBUS or a converted errno.
 */
static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
                const struct iomap_iter *iter, pfn_t *pfnp,
                struct xa_state *xas, void **entry, bool pmd)
{
        const struct iomap *iomap = &iter->iomap;
        const struct iomap *srcmap = iomap_iter_srcmap(iter);
        size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
        loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
        bool write = iter->flags & IOMAP_WRITE;
        unsigned long entry_flags = pmd ? DAX_PMD : 0;
        int err = 0;
        pfn_t pfn;
        void *kaddr;

        /* PTE fault with a private copy page: fill that page instead. */
        if (!pmd && vmf->cow_page)
                return dax_fault_cow_page(vmf, iter);

        /* if we are reading UNWRITTEN and HOLE, return a hole. */
        if (!write &&
            (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
                if (!pmd)
                        return dax_load_hole(xas, vmf, iter, entry);
                return dax_pmd_load_hole(xas, vmf, iter, entry);
        }

        /* Only mapped extents, or shared (CoW) ones, can be inserted. */
        if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
                WARN_ON_ONCE(1);
                return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
        }

        err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
        if (err)
                return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);

        *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);

        /* Write fault on a shared extent: run the CoW copy helper first. */
        if (write && iomap->flags & IOMAP_F_SHARED) {
                err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
                if (err)
                        return dax_fault_return(err);
        }

        /* Synchronous faults only report the pfn; the caller inserts it. */
        if (dax_fault_is_synchronous(iter, vmf->vma))
                return dax_fault_synchronous_pfnp(pfnp, pfn);

        /* insert PMD pfn */
        if (pmd)
                return vmf_insert_pfn_pmd(vmf, pfn, write);

        /* insert PTE pfn */
        if (write)
                return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
        return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
}

/*
 * dax_iomap_pte_fault - handle a PTE-sized fault on a DAX file
 * @vmf:        fault description
 * @pfnp:        passed through to dax_fault_iter() for synchronous faults
 * @iomap_errp:        if non-NULL, receives the last iomap_iter() return value
 * @ops:        iomap ops of the filesystem
 *
 * Returns a vm_fault_t code; VM_FAULT_SIGBUS when the offset is at or beyond
 * i_size.
 */
static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
                               int *iomap_errp, const struct iomap_ops *ops)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
        struct iomap_iter iter = {
                .inode                = mapping->host,
                .pos                = (loff_t)vmf->pgoff << PAGE_SHIFT,
                .len                = PAGE_SIZE,
                .flags                = IOMAP_DAX | IOMAP_FAULT,
        };
        vm_fault_t ret = 0;
        void *entry;
        int error;

        trace_dax_pte_fault(iter.inode, vmf, ret);
        /*
         * Check whether offset isn't beyond end of file now. Caller is supposed
         * to hold locks serializing us with truncate / punch hole so this is
         * a reliable test.
         */
        if (iter.pos >= i_size_read(iter.inode)) {
                ret = VM_FAULT_SIGBUS;
                goto out;
        }

        /* A write fault with a cow_page is not mapped as a write. */
        if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
                iter.flags |= IOMAP_WRITE;

        entry = grab_mapping_entry(&xas, mapping, 0);
        /* An internal xarray value here carries a fault code, not an entry. */
        if (xa_is_internal(entry)) {
                ret = xa_to_internal(entry);
                goto out;
        }

        /*
         * It is possible, particularly with mixed reads & writes to private
         * mappings, that we have raced with a PMD fault that overlaps with
         * the PTE we need to set up.  If so just return and the fault will be
         * retried.
         */
        if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
                ret = VM_FAULT_NOPAGE;
                goto unlock_entry;
        }

        while ((error = iomap_iter(&iter, ops)) > 0) {
                if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
                        iter.processed = -EIO;        /* fs corruption? */
                        continue;
                }

                ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
                /* A fault that hit a newly allocated mapping counts as major. */
                if (ret != VM_FAULT_SIGBUS &&
                    (iter.iomap.flags & IOMAP_F_NEW)) {
                        count_vm_event(PGMAJFAULT);
                        count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
                        ret |= VM_FAULT_MAJOR;
                }

                /* Only report the page as processed when no error bit is set. */
                if (!(ret & VM_FAULT_ERROR))
                        iter.processed = PAGE_SIZE;
        }

        if (iomap_errp)
                *iomap_errp = error;
        /* The iterator failed but no fault code was produced: convert errno. */
        if (!ret && error)
                ret = dax_fault_return(error);

unlock_entry:
        dax_unlock_entry(&xas, entry);
out:
        trace_dax_pte_fault_done(iter.inode, vmf, ret);
        return ret;
}

#ifdef CONFIG_FS_DAX_PMD
/*
 * Decide whether a PMD fault must fall back to PTEs.
 *
 * Returns true when any of the following holds: the PMD colour of the
 * faulting address differs from that of the file offset, the fault is a write
 * to a non-shared mapping, the PMD range would not fit inside the VMA, or the
 * PMD range would reach @max_pgoff.
 */
static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
                pgoff_t max_pgoff)
{
        const struct vm_area_struct *vma = vmf->vma;
        const unsigned long pmd_start = vmf->address & PMD_MASK;
        const unsigned long pmd_end = pmd_start + PMD_SIZE;
        const unsigned long addr_colour =
                (vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR;

        /*
         * The faulting address's PMD offset (colour) has to match the PMD
         * offset from the start of the file, so that a PMD range in the page
         * table overlaps exactly with a PMD range in the page cache.
         */
        if ((vmf->pgoff & PG_PMD_COLOUR) != addr_colour)
                return true;

        /* A write to a private mapping is going to COW: use PTEs */
        if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
                return true;

        /* The PMD must lie entirely within the VMA */
        if (pmd_start < vma->vm_start || pmd_end > vma->vm_end)
                return true;

        /* The PMD must not extend beyond the file size */
        return (xas->xa_index | PG_PMD_COLOUR) >= max_pgoff;
}

/*
 * dax_iomap_pmd_fault - handle a PMD-sized fault on a DAX file
 * @vmf:        fault description
 * @pfnp:        passed through to dax_fault_iter() for synchronous faults
 * @ops:        iomap ops of the filesystem
 *
 * Starts out assuming VM_FAULT_FALLBACK and only replaces it when a later
 * step produces a different result.  When the final result is still
 * VM_FAULT_FALLBACK the huge PMD is split and THP_FAULT_FALLBACK is counted.
 */
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                               const struct iomap_ops *ops)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
        struct iomap_iter iter = {
                .inode                = mapping->host,
                .len                = PMD_SIZE,
                .flags                = IOMAP_DAX | IOMAP_FAULT,
        };
        vm_fault_t ret = VM_FAULT_FALLBACK;
        pgoff_t max_pgoff;
        void *entry;

        if (vmf->flags & FAULT_FLAG_WRITE)
                iter.flags |= IOMAP_WRITE;

        /*
         * Check whether offset isn't beyond end of file now. Caller is
         * supposed to hold locks serializing us with truncate / punch hole so
         * this is a reliable test.
         */
        max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);

        trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);

        if (xas.xa_index >= max_pgoff) {
                ret = VM_FAULT_SIGBUS;
                goto out;
        }

        if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
                goto fallback;

        /*
         * grab_mapping_entry() will make sure we get an empty PMD entry,
         * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
         * entry is already in the array, for instance), it will return
         * VM_FAULT_FALLBACK.
         */
        entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
        if (xa_is_internal(entry)) {
                ret = xa_to_internal(entry);
                goto fallback;
        }

        /*
         * It is possible, particularly with mixed reads & writes to private
         * mappings, that we have raced with a PTE fault that overlaps with
         * the PMD we need to set up.  If so just return and the fault will be
         * retried.
         */
        if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
                        !pmd_devmap(*vmf->pmd)) {
                ret = 0;
                goto unlock_entry;
        }

        /* File position of the PMD-aligned index held in xas. */
        iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
        while (iomap_iter(&iter, ops) > 0) {
                if (iomap_length(&iter) < PMD_SIZE)
                        continue; /* actually breaks out of the loop */

                ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
                /* Anything other than a fallback counts as a handled PMD. */
                if (ret != VM_FAULT_FALLBACK)
                        iter.processed = PMD_SIZE;
        }

unlock_entry:
        dax_unlock_entry(&xas, entry);
fallback:
        if (ret == VM_FAULT_FALLBACK) {
                split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
                count_vm_event(THP_FAULT_FALLBACK);
        }
out:
        trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
        return ret;
}
#else
/*
 * Variant built when CONFIG_FS_DAX_PMD is not set: PMD faults are never
 * handled here and always report VM_FAULT_FALLBACK.
 */
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                               const struct iomap_ops *ops)
{
        return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @order: Order of the page to fault in
 * @pfnp: PFN to insert for synchronous faults if fsync is required
 * @iomap_errp: Storage for detailed error code in case of error
 * @ops: Iomap ops passed from the file system
 *
 * Filesystems may call this helper from their fault handler for DAX files
 * when a page fault occurs.  It assumes the caller has already done all the
 * locking necessary for the page fault to proceed successfully.
 *
 * Order 0 is dispatched to the PTE handler and PMD_ORDER to the PMD handler;
 * every other order returns VM_FAULT_FALLBACK.
 */
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
                    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
        if (!order)
                return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);

        if (order != PMD_ORDER)
                return VM_FAULT_FALLBACK;

        return dax_iomap_pmd_fault(vmf, pfnp, ops);
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);

/*
 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
 * @vmf: The description of the fault
 * @pfn: PFN to insert
 * @order: Order of entry to insert.
 *
 * This function inserts a writeable PTE or PMD entry into the page tables
 * for an mmaped DAX file.  It also marks the page cache entry as dirty.
 *
 * Returns VM_FAULT_NOPAGE when no usable entry is found at the index,
 * VM_FAULT_FALLBACK for an unsupported @order, and otherwise the result of
 * the page-table insertion helper.
 */
static vm_fault_t
dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
        void *entry;
        vm_fault_t ret;

        xas_lock_irq(&xas);
        entry = get_unlocked_entry(&xas, order);
        /* Did we race with someone splitting entry or so? */
        if (!entry || dax_is_conflict(entry) ||
            (order == 0 && !dax_is_pte_entry(entry))) {
                put_unlocked_entry(&xas, entry, WAKE_NEXT);
                xas_unlock_irq(&xas);
                trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
                                                      VM_FAULT_NOPAGE);
                return VM_FAULT_NOPAGE;
        }
        /* Tag the entry dirty and lock it before dropping the xarray lock. */
        xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
        dax_lock_entry(&xas, entry);
        xas_unlock_irq(&xas);
        if (order == 0)
                ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
#ifdef CONFIG_FS_DAX_PMD
        else if (order == PMD_ORDER)
                ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
#endif
        else
                ret = VM_FAULT_FALLBACK;
        dax_unlock_entry(&xas, entry);
        trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
        return ret;
}

/**
 * dax_finish_sync_fault - finish synchronous page fault
 * @vmf: The description of the fault
 * @order: Order of entry to be inserted
 * @pfn: PFN to insert
 *
 * Syncs the file range covered by the fault with vfs_fsync_range() and then
 * inserts the appropriate page table entry.
 *
 * Return: VM_FAULT_SIGBUS if the sync fails, otherwise the result of
 * dax_insert_pfn_mkwrite().
 */
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
                pfn_t pfn)
{
        const loff_t first = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
        const size_t nbytes = PAGE_SIZE << order;
        const loff_t last = first + nbytes - 1;

        if (vfs_fsync_range(vmf->vma->vm_file, first, last, 1))
                return VM_FAULT_SIGBUS;

        return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);

/*
 * Compare the current mappings of two iterators over at most @len bytes.
 *
 * The length actually compared is limited to the shorter of the two mappings.
 * Two holes compare equal; a hole against anything else compares unequal.
 * Otherwise both ranges are accessed directly and compared with memcmp().
 *
 * *@same is set whenever a comparison result is available.  Returns the
 * number of bytes found equal (0 when the ranges differ), or -EIO if either
 * range could not be accessed.
 */
static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
                struct iomap_iter *it_dest, u64 len, bool *same)
{
        const struct iomap *smap = &it_src->iomap;
        const struct iomap *dmap = &it_dest->iomap;
        const bool src_hole = smap->type == IOMAP_HOLE;
        const bool dst_hole = dmap->type == IOMAP_HOLE;
        loff_t src_pos = it_src->pos, dst_pos = it_dest->pos;
        loff_t result = -EIO;
        void *src_addr, *dst_addr;
        int id;

        len = min(len, min(smap->length, dmap->length));

        if (src_hole || dst_hole) {
                /* Equal only when both sides are holes */
                *same = src_hole && dst_hole;
                return *same ? len : 0;
        }

        id = dax_read_lock();
        if (dax_iomap_direct_access(smap, src_pos,
                                    ALIGN(src_pos + len, PAGE_SIZE),
                                    &src_addr, NULL) < 0)
                goto out_unlock;

        if (dax_iomap_direct_access(dmap, dst_pos,
                                    ALIGN(dst_pos + len, PAGE_SIZE),
                                    &dst_addr, NULL) < 0)
                goto out_unlock;

        *same = !memcmp(src_addr, dst_addr, len);
        result = *same ? len : 0;

out_unlock:
        dax_read_unlock(id);
        return result;
}

/*
 * dax_dedupe_file_range_compare - compare two DAX file ranges
 * @src:        source inode
 * @srcoff:        byte offset of the range in @src
 * @dst:        destination inode
 * @dstoff:        byte offset of the range in @dst
 * @len:        number of bytes to compare
 * @same:        set by dax_range_compare_iter() for each chunk it compares
 * @ops:        iomap ops used to map both files
 *
 * Both ranges are walked in lock step with iomap_iter(); every pair of
 * mappings is compared by dax_range_compare_iter() and the number of bytes it
 * reports is recorded as processed on both iterators.
 *
 * Returns the iomap_iter() result that ended the walk (<= 0), or the negative
 * errno from dax_range_compare_iter() when a chunk could not be compared.
 */
int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
                struct inode *dst, loff_t dstoff, loff_t len, bool *same,
                const struct iomap_ops *ops)
{
        struct iomap_iter src_iter = {
                .inode                = src,
                .pos                = srcoff,
                .len                = len,
                .flags                = IOMAP_DAX,
        };
        struct iomap_iter dst_iter = {
                .inode                = dst,
                .pos                = dstoff,
                .len                = len,
                .flags                = IOMAP_DAX,
        };
        /* loff_t, not int: the helper returns a byte count as loff_t. */
        loff_t compared;
        int ret;

        while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
               (ret = iomap_iter(&dst_iter, ops)) > 0) {
                compared = dax_range_compare_iter(&src_iter, &dst_iter,
                                min(src_iter.len, dst_iter.len), same);
                /*
                 * ret is positive inside the loop, so the failure has to be
                 * reported through the helper's own error code.
                 */
                if (compared < 0)
                        return compared;
                src_iter.processed = dst_iter.processed = compared;
        }
        return ret;
}

/*
 * Thin wrapper: forwards all arguments, including the iomap @ops, to
 * __generic_remap_file_range_prep() and returns its result.
 */
int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                              struct file *file_out, loff_t pos_out,
                              loff_t *len, unsigned int remap_flags,
                              const struct iomap_ops *ops)
{
        return __generic_remap_file_range_prep(file_in, pos_in, file_out,
                                               pos_out, len, remap_flags, ops);
}
EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);

































































    2 


















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_64_H
#define _ASM_X86_PGTABLE_64_H

#include <linux/const.h>
#include <asm/pgtable_64_types.h>

#ifndef __ASSEMBLY__

/*
 * This file contains the functions and defines necessary to modify and use
 * the x86-64 page table tree.
 */
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
#include <asm/fixmap.h>

extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM];
extern pgd_t init_top_pgt[];

#define swapper_pg_dir init_top_pgt

extern void paging_init(void);
static inline void sync_initial_page_table(void) { }

/*
 * Bad page-table entry reporters, one per level.  Each prints the source
 * file and line, the address of the entry and its raw value via pr_err().
 */
#define pte_ERROR(e)                                        \
        pr_err("%s:%d: bad pte %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pte_val(e))
#define pmd_ERROR(e)                                        \
        pr_err("%s:%d: bad pmd %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pmd_val(e))
#define pud_ERROR(e)                                        \
        pr_err("%s:%d: bad pud %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pud_val(e))

/* p4d_ERROR is only provided when built with at least 5 page-table levels. */
#if CONFIG_PGTABLE_LEVELS >= 5
#define p4d_ERROR(e)                                        \
        pr_err("%s:%d: bad p4d %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), p4d_val(e))
#endif

#define pgd_ERROR(e)                                        \
        pr_err("%s:%d: bad pgd %p(%016lx)\n",                \
               __FILE__, __LINE__, &(e), pgd_val(e))

struct mm_struct;

/*
 * Reports whether the p4d level is folded: true whenever 5-level paging is
 * not enabled.  @mm is not consulted.
 */
#define mm_p4d_folded mm_p4d_folded
static inline bool mm_p4d_folded(struct mm_struct *mm)
{
        return !pgtable_l5_enabled();
}

void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);

/* Store @pte into the slot at @ptep using WRITE_ONCE(). */
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
        WRITE_ONCE(*ptep, pte);
}

/*
 * Clear the PTE at @ptep by storing an all-zero entry.  @mm and @addr are
 * accepted but not used here.
 */
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
                                    pte_t *ptep)
{
        native_set_pte(ptep, native_make_pte(0));
}

/* Same operation as native_set_pte(): simply forwards to it. */
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
        native_set_pte(ptep, pte);
}

/* Store @pmd into the slot at @pmdp using WRITE_ONCE(). */
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
        WRITE_ONCE(*pmdp, pmd);
}

/* Clear the PMD at @pmd by storing an all-zero entry. */
static inline void native_pmd_clear(pmd_t *pmd)
{
        native_set_pmd(pmd, native_make_pmd(0));
}

/*
 * Return the PTE stored at @xp and leave a zero entry in its place.
 * With CONFIG_SMP the swap is a single xchg(); otherwise the entry is read
 * and then cleared as two separate steps.
 */
static inline pte_t native_ptep_get_and_clear(pte_t *xp)
{
#ifdef CONFIG_SMP
        return native_make_pte(xchg(&xp->pte, 0));
#else
        /*
         * Same as native_local_ptep_get_and_clear, but duplicated here
         * because of a cyclic dependency.
         */
        pte_t ret = *xp;
        native_pte_clear(NULL, 0, xp);
        return ret;
#endif
}

/*
 * Return the PMD stored at @xp and leave a zero entry in its place.
 * With CONFIG_SMP the swap is a single xchg(); otherwise the entry is read
 * and then cleared as two separate steps.
 */
static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
#ifdef CONFIG_SMP
        return native_make_pmd(xchg(&xp->pmd, 0));
#else
        /*
         * Same as native_local_pmdp_get_and_clear, but duplicated here
         * because of a cyclic dependency.
         */
        pmd_t ret = *xp;
        native_pmd_clear(xp);
        return ret;
#endif
}

/* Store @pud into the slot at @pudp using WRITE_ONCE(). */
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
        WRITE_ONCE(*pudp, pud);
}

/* Clear the PUD at @pud by storing an all-zero entry. */
static inline void native_pud_clear(pud_t *pud)
{
        native_set_pud(pud, native_make_pud(0));
}

/*
 * Return the PUD stored at @xp and leave a zero entry in its place.
 * With CONFIG_SMP the swap is a single xchg(); otherwise the entry is read
 * and then cleared as two separate steps.
 */
static inline pud_t native_pudp_get_and_clear(pud_t *xp)
{
#ifdef CONFIG_SMP
        return native_make_pud(xchg(&xp->pud, 0));
#else
        /*
         * Same as native_local_pudp_get_and_clear, but duplicated here
         * because of a cyclic dependency.
         */
        pud_t ret = *xp;

        native_pud_clear(xp);
        return ret;
#endif
}

/*
 * Store @p4d into the slot at @p4dp.
 *
 * When 5-level paging is disabled and page table isolation is built in, the
 * value is first converted to a pgd, passed through pti_set_user_pgtbl() and
 * converted back before being written.  In every other configuration the
 * value is written unchanged.
 */
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
        pgd_t as_pgd;

        if (!pgtable_l5_enabled() &&
            IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) {
                as_pgd = native_make_pgd(native_p4d_val(p4d));
                as_pgd = pti_set_user_pgtbl((pgd_t *)p4dp, as_pgd);
                WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(as_pgd)));
                return;
        }

        WRITE_ONCE(*p4dp, p4d);
}

/* Clear the P4D at @p4d by storing an all-zero entry via native_set_p4d(). */
static inline void native_p4d_clear(p4d_t *p4d)
{
        native_set_p4d(p4d, native_make_p4d(0));
}

/*
 * Store a PGD entry at @pgdp.  The value written is whatever
 * pti_set_user_pgtbl() returns for (@pgdp, @pgd), not necessarily @pgd itself.
 */
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
        WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd));
}

/* Clear the PGD at @pgd by passing an all-zero entry to native_set_pgd(). */
static inline void native_pgd_clear(pgd_t *pgd)
{
        native_set_pgd(pgd, native_make_pgd(0));
}

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 */

/* PGD - Level 4 access */

/* PUD - Level 3 access */

/* PMD - Level 2 access */

/* PTE - Level 1 access */

/*
 * Encode and de-code a swap entry
 *
 * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2| 1|0| <- bit number
 * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
 * | TYPE (59-63) | ~OFFSET (9-58)  |0|0|X|X| X| E|F|SD|0| <- swp entry
 *
 * G (8) is aliased and used as a PROT_NONE indicator for
 * !present ptes.  We need to start storing swap entries above
 * there.  We also need to avoid using A and D because of an
 * erratum where they can be incorrectly set by hardware on
 * non-present PTEs.
 *
 * Bits 1-4 are not used in the non-present format and are available for
 * the special uses described below:
 *
 * SD (1) in swp entry is used to store soft dirty bit, which helps us
 * remember soft dirty over page migration
 *
 * F (2) in swp entry is used to record when a pagetable is
 * writeprotected by userfaultfd WP support.
 *
 * E (3) in swp entry is used to remember PG_anon_exclusive.
 *
 * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
 * but also L and G.
 *
 * The offset is inverted by a binary not operation to make the high
 * physical bits set.
 */
/* Width of the swap type field, stored in the topmost bits of the entry */
#define SWP_TYPE_BITS                5

/* The offset field starts at the first bit above the PROT_NONE bit */
#define SWP_OFFSET_FIRST_BIT        (_PAGE_BIT_PROTNONE + 1)

/* We always extract/encode the offset by shifting it all the way up, and then down again */
#define SWP_OFFSET_SHIFT        (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)

/* Build-time check that the generic swapfile count fits in the type field */
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)

/* Extract the high bits for type */
#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))

/* Shift up (to get rid of type), then down to get value */
#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)

/*
 * Shift the offset up "too far" by TYPE bits, then down again
 * The offset is inverted by a binary not operation to make the high
 * physical bits set.
 */
#define __swp_entry(type, offset) ((swp_entry_t) { \
        (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
        | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })

/*
 * Conversions between swap entries and page-table entries: the raw value is
 * carried over unchanged in both directions.
 */
#define __pte_to_swp_entry(pte)                ((swp_entry_t) { pte_val((pte)) })
#define __pmd_to_swp_entry(pmd)                ((swp_entry_t) { pmd_val((pmd)) })
#define __swp_entry_to_pte(x)                (__pte((x).val))
#define __swp_entry_to_pmd(x)                (__pmd((x).val))

extern void cleanup_highmap(void);

#define HAVE_ARCH_UNMAPPED_AREA
#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
#define HAVE_ARCH_UNMAPPED_AREA_VMFLAGS

#define PAGE_AGP    PAGE_KERNEL_NOCACHE
#define HAVE_PAGE_AGP 1

/*
 * fs/proc/kcore.c
 *
 * kc_vaddr_to_offset() keeps only the bits of @v covered by __VIRTUAL_MASK;
 * kc_offset_to_vaddr() sets every bit of @o outside __VIRTUAL_MASK.
 */
#define        kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
#define        kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)

#define __HAVE_ARCH_PTE_SAME

#define vmemmap ((struct page *)VMEMMAP_START)

extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);

#define gup_fast_permitted gup_fast_permitted
/*
 * Return true unless @end has any bit set at or above
 * __VIRTUAL_MASK_SHIFT.  @start is not examined.
 */
static inline bool gup_fast_permitted(unsigned long start, unsigned long end)
{
        return !(end >> __VIRTUAL_MASK_SHIFT);
}

#include <asm/pgtable-invert.h>

#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */













    2 



    2 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BSEARCH_H
#define _LINUX_BSEARCH_H

#include <linux/types.h>

/*
 * Binary search over an array of @num elements of @size bytes each,
 * starting at @base.  @cmp is invoked as cmp(key, element); a zero result
 * means "found", a positive result continues in the upper half and a
 * negative result in the lower half.  For meaningful results the array
 * must be ordered consistently with @cmp.
 *
 * Returns a pointer to a matching element, or NULL when none matches.
 */
static __always_inline
void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
        const char *lo = base;
        size_t remaining;

        for (remaining = num; remaining != 0; remaining >>= 1) {
                const char *mid = lo + (remaining >> 1) * size;
                int order = cmp(key, mid);

                if (!order)
                        return (void *)mid;

                if (order > 0) {
                        /* Continue to the right of mid; mid itself is excluded. */
                        lo = mid + size;
                        remaining--;
                }
        }

        return NULL;
}

extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp);

#endif /* _LINUX_BSEARCH_H */














































































































































































































































































































































































































































































































































































































































































































    1 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * fscrypt_private.h
 *
 * Copyright (C) 2015, Google, Inc.
 *
 * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar.
 * Heavily modified since then.
 */

#ifndef _FSCRYPT_PRIVATE_H
#define _FSCRYPT_PRIVATE_H

#include <linux/fscrypt.h>
#include <linux/siphash.h>
#include <crypto/hash.h>
#include <linux/blk-crypto.h>

/*
 * Length of @str excluding its terminating NUL.  Relies on sizeof(), so it
 * is only meaningful for string literals / char arrays, not for pointers.
 */
#define CONST_STRLEN(str)        (sizeof(str) - 1)

/* Size in bytes of the nonce[] field carried in each fscrypt_context */
#define FSCRYPT_FILE_NONCE_SIZE        16

/*
 * Minimum size of an fscrypt master key.  Note: a longer key will be required
 * if ciphers with a 256-bit security strength are used.  This is just the
 * absolute minimum, which applies when only 128-bit encryption is used.
 */
#define FSCRYPT_MIN_KEY_SIZE        16

/* Values of the 'version' byte that begins every fscrypt_context */
#define FSCRYPT_CONTEXT_V1        1
#define FSCRYPT_CONTEXT_V2        2

/* Keep this in sync with include/uapi/linux/fscrypt.h */
#define FSCRYPT_MODE_MAX        FSCRYPT_MODE_AES_256_HCTR2

/*
 * Version 1 encryption context.  fscrypt_context_size() asserts at build
 * time that this struct is exactly 28 bytes.
 */
struct fscrypt_context_v1 {
        u8 version; /* FSCRYPT_CONTEXT_V1 */
        u8 contents_encryption_mode;
        u8 filenames_encryption_mode;
        u8 flags;
        u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
        u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; /* returned by fscrypt_context_nonce() */
};

/*
 * Version 2 encryption context.  fscrypt_context_size() asserts at build
 * time that this struct is exactly 40 bytes.
 */
struct fscrypt_context_v2 {
        u8 version; /* FSCRYPT_CONTEXT_V2 */
        u8 contents_encryption_mode;
        u8 filenames_encryption_mode;
        u8 flags;
        u8 log2_data_unit_size;
        u8 __reserved[3];
        u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
        u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; /* returned by fscrypt_context_nonce() */
};

/*
 * fscrypt_context - the encryption context of an inode
 *
 * This is the on-disk equivalent of an fscrypt_policy, stored alongside each
 * encrypted file usually in a hidden extended attribute.  It contains the
 * fields from the fscrypt_policy, in order to identify the encryption algorithm
 * and key with which the file is encrypted.  It also contains a nonce that was
 * randomly generated by fscrypt itself; this is used as KDF input or as a tweak
 * to cause different files to be encrypted differently.
 *
 * Both versioned layouts begin with a u8 version field, so ->version can be
 * read first to decide which of ->v1 or ->v2 applies.
 */
union fscrypt_context {
        u8 version;
        struct fscrypt_context_v1 v1;
        struct fscrypt_context_v2 v2;
};

/*
 * Map a context's version byte to the size in bytes of that version's
 * context struct.  Yields 0 when the version is not a recognized one.
 */
static inline int fscrypt_context_size(const union fscrypt_context *ctx)
{
        /* Compile-time guards on the sizes of both context layouts. */
        BUILD_BUG_ON(sizeof(ctx->v1) != 28);
        BUILD_BUG_ON(sizeof(ctx->v2) != 40);

        if (ctx->version == FSCRYPT_CONTEXT_V1)
                return sizeof(ctx->v1);
        if (ctx->version == FSCRYPT_CONTEXT_V2)
                return sizeof(ctx->v2);
        return 0;
}

/*
 * A context is valid when it is at least one byte long (so the version byte
 * can be read) and its length equals the size expected for its version.
 */
static inline bool fscrypt_context_is_valid(const union fscrypt_context *ctx,
                                            int ctx_size)
{
        if (ctx_size < 1)
                return false;
        return fscrypt_context_size(ctx) == ctx_size;
}

/*
 * Return a pointer to the nonce stored in an already-validated context.
 * An unrecognized version triggers a one-time warning and yields NULL.
 */
static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx)
{
        if (ctx->version == FSCRYPT_CONTEXT_V1)
                return ctx->v1.nonce;
        if (ctx->version == FSCRYPT_CONTEXT_V2)
                return ctx->v2.nonce;

        WARN_ON_ONCE(1);
        return NULL;
}

/*
 * An encryption policy of either supported version.  The accessors in this
 * header read ->version to decide whether ->v1 or ->v2 is the active member.
 */
union fscrypt_policy {
        u8 version;
        struct fscrypt_policy_v1 v1;
        struct fscrypt_policy_v2 v2;
};

/*
 * Map a policy's version byte to the size in bytes of that version's policy
 * struct.  Yields 0 when the version is not a recognized one.
 */
static inline int fscrypt_policy_size(const union fscrypt_policy *policy)
{
        if (policy->version == FSCRYPT_POLICY_V1)
                return sizeof(policy->v1);
        if (policy->version == FSCRYPT_POLICY_V2)
                return sizeof(policy->v2);
        return 0;
}

/*
 * Contents encryption mode of a valid policy.  Reaching the end with an
 * unrecognized version is treated as a kernel bug.
 */
static inline u8
fscrypt_policy_contents_mode(const union fscrypt_policy *policy)
{
        if (policy->version == FSCRYPT_POLICY_V1)
                return policy->v1.contents_encryption_mode;
        if (policy->version == FSCRYPT_POLICY_V2)
                return policy->v2.contents_encryption_mode;
        BUG();
}

/*
 * Filenames encryption mode of a valid policy.  Reaching the end with an
 * unrecognized version is treated as a kernel bug.
 */
static inline u8
fscrypt_policy_fnames_mode(const union fscrypt_policy *policy)
{
        if (policy->version == FSCRYPT_POLICY_V1)
                return policy->v1.filenames_encryption_mode;
        if (policy->version == FSCRYPT_POLICY_V2)
                return policy->v2.filenames_encryption_mode;
        BUG();
}

/*
 * Flags byte (FSCRYPT_POLICY_FLAG*) of a valid policy.  Reaching the end
 * with an unrecognized version is treated as a kernel bug.
 */
static inline u8
fscrypt_policy_flags(const union fscrypt_policy *policy)
{
        if (policy->version == FSCRYPT_POLICY_V1)
                return policy->v1.flags;
        if (policy->version == FSCRYPT_POLICY_V2)
                return policy->v2.flags;
        BUG();
}

/*
 * log2 of the data unit size for a v2 policy: the policy's own
 * log2_data_unit_size when it is nonzero, otherwise the inode's i_blkbits.
 */
static inline int
fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy,
                          const struct inode *inode)
{
        if (policy->log2_data_unit_size)
                return policy->log2_data_unit_size;
        return inode->i_blkbits;
}

/*
 * log2 of the data unit size for a policy of either version.  v1 policies
 * always use the inode's i_blkbits; v2 policies defer to
 * fscrypt_policy_v2_du_bits().  An unrecognized version is a kernel bug.
 */
static inline int
fscrypt_policy_du_bits(const union fscrypt_policy *policy,
                       const struct inode *inode)
{
        if (policy->version == FSCRYPT_POLICY_V1)
                return inode->i_blkbits;
        if (policy->version == FSCRYPT_POLICY_V2)
                return fscrypt_policy_v2_du_bits(&policy->v2, inode);
        BUG();
}

/*
 * For encrypted symlinks, the ciphertext length is stored at the beginning
 * of the string in little-endian format.
 *
 * The struct is __packed and ends in a flexible array member, so the
 * ciphertext bytes start immediately after the 16-bit length.
 */
struct fscrypt_symlink_data {
        __le16 len;
        char encrypted_path[];
} __packed;

/**
 * struct fscrypt_prepared_key - a key prepared for actual encryption/decryption
 * @tfm: crypto API transform object
 * @blk_key: key for blk-crypto; this member only exists when
 *           CONFIG_FS_ENCRYPTION_INLINE_CRYPT is enabled
 *
 * Normally only one of the fields will be non-NULL.
 */
struct fscrypt_prepared_key {
        struct crypto_skcipher *tfm;
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
        struct blk_crypto_key *blk_key;
#endif
};

/*
 * fscrypt_inode_info - the "encryption key" for an inode
 *
 * When an encrypted file's key is made available, an instance of this struct is
 * allocated and stored in ->i_crypt_info.  Once created, it remains until the
 * inode is evicted.
 *
 * The boolean-like members below are single-bit u8 bitfields.
 */
struct fscrypt_inode_info {

        /* The key in a form prepared for actual encryption/decryption */
        struct fscrypt_prepared_key ci_enc_key;

        /* True if ci_enc_key should be freed when this struct is freed */
        u8 ci_owns_key : 1;

#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
        /*
         * True if this inode will use inline encryption (blk-crypto) instead of
         * the traditional filesystem-layer encryption.
         * This is the value returned by fscrypt_using_inline_encryption().
         */
        u8 ci_inlinecrypt : 1;
#endif

        /* True if ci_dirhash_key is initialized */
        u8 ci_dirhash_key_initialized : 1;

        /*
         * log2 of the data unit size (granularity of contents encryption) of
         * this file.  This is computable from ci_policy and ci_inode but is
         * cached here for efficiency.  Only used for regular files.
         */
        u8 ci_data_unit_bits;

        /* Cached value: log2 of number of data units per FS block */
        u8 ci_data_units_per_block_bits;

        /* Hashed inode number.  Only set for IV_INO_LBLK_32 */
        u32 ci_hashed_ino;

        /*
         * Encryption mode used for this inode.  It corresponds to either the
         * contents or filenames encryption mode, depending on the inode type.
         */
        struct fscrypt_mode *ci_mode;

        /* Back-pointer to the inode */
        struct inode *ci_inode;

        /*
         * The master key with which this inode was unlocked (decrypted).  This
         * will be NULL if the master key was found in a process-subscribed
         * keyring rather than in the filesystem-level keyring.
         */
        struct fscrypt_master_key *ci_master_key;

        /*
         * Link in list of inodes that were unlocked with the master key.
         * Only used when ->ci_master_key is set.
         */
        struct list_head ci_master_key_link;

        /*
         * If non-NULL, then encryption is done using the master key directly
         * and ci_enc_key will equal ci_direct_key->dk_key.
         */
        struct fscrypt_direct_key *ci_direct_key;

        /*
         * This inode's hash key for filenames.  This is a 128-bit SipHash-2-4
         * key.  This is only set for directories that use a keyed dirhash over
         * the plaintext filenames -- currently just casefolded directories.
         * Validity is tracked by ci_dirhash_key_initialized.
         */
        siphash_key_t ci_dirhash_key;

        /* The encryption policy used by this inode */
        union fscrypt_policy ci_policy;

        /* This inode's nonce, copied from the fscrypt_context */
        u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE];
};

/* Direction of a crypto operation: decrypt is 0, encrypt is 1. */
typedef enum {
        FS_DECRYPT = 0,
        FS_ENCRYPT = 1,
} fscrypt_direction_t;

/* crypto.c */
extern struct kmem_cache *fscrypt_inode_info_cachep;
int fscrypt_initialize(struct super_block *sb);
int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
                            fscrypt_direction_t rw, u64 index,
                            struct page *src_page, struct page *dest_page,
                            unsigned int len, unsigned int offs,
                            gfp_t gfp_flags);
struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);

void __printf(3, 4) __cold
fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...);

/*
 * Convenience wrappers around fscrypt_msg() that fix the log level to
 * KERN_WARNING and KERN_ERR respectively.  @fmt and the variadic arguments
 * are passed through unchanged.
 */
#define fscrypt_warn(inode, fmt, ...)                \
        fscrypt_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__)
#define fscrypt_err(inode, fmt, ...)                \
        fscrypt_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__)

/* Size in bytes of union fscrypt_iv's raw[] view */
#define FSCRYPT_MAX_IV_SIZE        32

/*
 * An IV buffer with three overlapping views of the same
 * FSCRYPT_MAX_IV_SIZE bytes.
 */
union fscrypt_iv {
        /* Structured view: 8-byte little-endian index followed by the nonce */
        struct {
                /* zero-based index of data unit within the file */
                __le64 index;

                /* per-file nonce; only set in DIRECT_KEY mode */
                u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
        };
        /* Byte-wise view of the whole IV */
        u8 raw[FSCRYPT_MAX_IV_SIZE];
        /*
         * The same bytes viewed as little-endian 64-bit words.
         * NOTE(review): "dun" presumably means data unit number, as used by
         * blk-crypto -- confirm against the users of this field.
         */
        __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)];
};

void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
                         const struct fscrypt_inode_info *ci);

/*
 * Number of bits needed for the largest file data unit index possible on
 * @sb, given a data unit size of 2^@du_bits bytes.  Computed as the bit
 * length of the largest byte offset (s_maxbytes - 1) minus @du_bits.
 */
static inline int
fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits)
{
        const int max_offset_bits = fls64(sb->s_maxbytes - 1);

        return max_offset_bits - du_bits;
}

/* fname.c */
bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
                                    u32 orig_len, u32 max_len,
                                    u32 *encrypted_len_ret);

/* hkdf.c */
/*
 * HKDF state used by fscrypt: a single crypto_shash transform handle.
 * It is initialized by fscrypt_init_hkdf() and released by
 * fscrypt_destroy_hkdf().
 */
struct fscrypt_hkdf {
        struct crypto_shash *hmac_tfm;
};

int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
                      unsigned int master_key_size);

/*
 * The list of contexts in which fscrypt uses HKDF.  These values are used as
 * the first byte of the HKDF application-specific info string to guarantee that
 * info strings are never repeated between contexts.  This ensures that all HKDF
 * outputs are unique and cryptographically isolated, i.e. knowledge of one
 * output doesn't reveal another.
 *
 * Each value fits in the u8 'context' argument of fscrypt_hkdf_expand().  The
 * trailing comment on each line gives the rest of the info string used with
 * that context.
 */
#define HKDF_CONTEXT_KEY_IDENTIFIER        1 /* info=<empty>                */
#define HKDF_CONTEXT_PER_FILE_ENC_KEY        2 /* info=file_nonce                */
#define HKDF_CONTEXT_DIRECT_KEY                3 /* info=mode_num                */
#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY        4 /* info=mode_num||fs_uuid        */
#define HKDF_CONTEXT_DIRHASH_KEY        5 /* info=file_nonce                */
#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY        6 /* info=mode_num||fs_uuid        */
#define HKDF_CONTEXT_INODE_HASH_KEY        7 /* info=<empty>                */

int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
                        const u8 *info, unsigned int infolen,
                        u8 *okm, unsigned int okmlen);

void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);

/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci);

/* Report whether @ci has its ci_inlinecrypt bit set. */
static inline bool
fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
{
        return ci->ci_inlinecrypt != 0;
}

int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
                                     const u8 *raw_key,
                                     const struct fscrypt_inode_info *ci);

void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
                                      struct fscrypt_prepared_key *prep_key);

/*
 * Report whether @prep_key already holds the object needed by the
 * encryption implementation that @ci uses: the blk-crypto key when inline
 * encryption is in use, otherwise the crypto API transform.
 */
static inline bool
fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
                        const struct fscrypt_inode_info *ci)
{
        /*
         * Both loads below are smp_load_acquire()'s pairing with the
         * smp_store_release()'s in fscrypt_prepare_inline_crypt_key() and
         * fscrypt_prepare_key().  When this prep_key is a per-mode encryption
         * key, another task may publish blk_key or tfm concurrently with a
         * RELEASE barrier; the ACQUIRE here makes it safe to use the memory
         * that task published.
         */
        if (!fscrypt_using_inline_encryption(ci))
                return smp_load_acquire(&prep_key->tfm) != NULL;

        return smp_load_acquire(&prep_key->blk_key) != NULL;
}

#else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */

/*
 * Stub used when CONFIG_FS_ENCRYPTION_INLINE_CRYPT is disabled: there is
 * nothing to select, so this always returns 0 and leaves @ci untouched.
 */
static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
{
        return 0;
}

/*
 * Stub used when CONFIG_FS_ENCRYPTION_INLINE_CRYPT is disabled: always
 * returns false.
 */
static inline bool
fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
{
        return false;
}

/*
 * Stub used when CONFIG_FS_ENCRYPTION_INLINE_CRYPT is disabled.  Calling it
 * emits a one-time warning and fails with -EOPNOTSUPP; none of the
 * arguments are used.
 */
static inline int
fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
                                 const u8 *raw_key,
                                 const struct fscrypt_inode_info *ci)
{
        WARN_ON_ONCE(1);
        return -EOPNOTSUPP;
}

/*
 * Stub used when CONFIG_FS_ENCRYPTION_INLINE_CRYPT is disabled: does
 * nothing.
 */
static inline void
fscrypt_destroy_inline_crypt_key(struct super_block *sb,
                                 struct fscrypt_prepared_key *prep_key)
{
}

/*
 * Variant used when CONFIG_FS_ENCRYPTION_INLINE_CRYPT is disabled: only the
 * crypto API transform can exist, so report whether ->tfm has been set.
 * The load uses ACQUIRE ordering; @ci is not examined.
 */
static inline bool
fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
                        const struct fscrypt_inode_info *ci)
{
        struct crypto_skcipher *tfm = smp_load_acquire(&prep_key->tfm);

        return tfm != NULL;
}
#endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */

/* keyring.c */

/*
 * fscrypt_master_key_secret - secret key material of an in-use master key
 */
struct fscrypt_master_key_secret {

        /*
         * For v2 policy keys: HKDF context keyed by this master key.
         * For v1 policy keys: not set (hkdf.hmac_tfm == NULL).
         */
        struct fscrypt_hkdf        hkdf;

        /*
         * Size of the raw key in bytes.  This remains set even if ->raw was
         * zeroized due to no longer being needed.  I.e. we still remember the
         * size of the key even if we don't need to remember the key itself.
         */
        u32                        size;

        /*
         * For v1 policy keys: the raw key.  Wiped for v2 policy keys.
         * The buffer holds up to FSCRYPT_MAX_KEY_SIZE bytes.
         */
        u8                        raw[FSCRYPT_MAX_KEY_SIZE];

} __randomize_layout;

/*
 * fscrypt_master_key - an in-use master key
 *
 * This represents a master encryption key which has been added to the
 * filesystem.  There are three high-level states that a key can be in:
 *
 * FSCRYPT_KEY_STATUS_PRESENT
 *        Key is fully usable; it can be used to unlock inodes that are encrypted
 *        with it (this includes being able to create new inodes).  ->mk_present
 *        indicates whether the key is in this state.  ->mk_secret exists, the key
 *        is in the keyring, and ->mk_active_refs > 0 due to ->mk_present.
 *
 * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED
 *        Removal of this key has been initiated, but some inodes that were
 *        unlocked with it are still in-use.  Like ABSENT, ->mk_secret is wiped,
 *        and the key can no longer be used to unlock inodes.  Unlike ABSENT, the
 *        key is still in the keyring; ->mk_decrypted_inodes is nonempty; and
 *        ->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes.
 *
 *        This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty,
 *        or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key.
 *
 * FSCRYPT_KEY_STATUS_ABSENT
 *        Key is fully removed.  The key is no longer in the keyring,
 *        ->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is
 *        wiped, and the key can no longer be used to unlock inodes.
 */
struct fscrypt_master_key {

        /*
         * Link in ->s_master_keys->key_hashtable.
         * Only valid if ->mk_active_refs > 0.
         */
        struct hlist_node                        mk_node;

        /* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */
        struct rw_semaphore                        mk_sem;

        /*
         * Active and structural reference counts.  An active ref guarantees
         * that the struct continues to exist, continues to be in the keyring
         * ->s_master_keys, and that any embedded subkeys (e.g.
         * ->mk_direct_keys) that have been prepared continue to exist.
         * A structural ref only guarantees that the struct continues to exist.
         *
         * There is one active ref associated with ->mk_present being true, and
         * one active ref for each inode in ->mk_decrypted_inodes.
         *
         * There is one structural ref associated with the active refcount being
         * nonzero.  Finding a key in the keyring also takes a structural ref,
         * which is then held temporarily while the key is operated on.
         */
        refcount_t                                mk_active_refs;
        refcount_t                                mk_struct_refs;

        /*
         * NOTE(review): presumably used to defer freeing of this struct
         * through RCU -- confirm against keyring.c.
         */
        struct rcu_head                                mk_rcu_head;

        /*
         * The secret key material.  Wiped as soon as it is no longer needed;
         * for details, see the fscrypt_master_key struct comment.
         *
         * Locking: protected by ->mk_sem.
         */
        struct fscrypt_master_key_secret        mk_secret;

        /*
         * For v1 policy keys: an arbitrary key descriptor which was assigned by
         * userspace (->descriptor).
         *
         * For v2 policy keys: a cryptographic hash of this key (->identifier).
         */
        struct fscrypt_key_specifier                mk_spec;

        /*
         * Keyring which contains a key of type 'key_type_fscrypt_user' for each
         * user who has added this key.  Normally each key will be added by just
         * one user, but it's possible that multiple users share a key, and in
         * that case we need to keep track of those users so that one user can't
         * remove the key before the others want it removed too.
         *
         * This is NULL for v1 policy keys; those can only be added by root.
         *
         * Locking: protected by ->mk_sem.  (We don't just rely on the keyrings
         * subsystem semaphore ->mk_users->sem, as we need support for atomic
         * search+insert along with proper synchronization with other fields.)
         */
        struct key                *mk_users;

        /*
         * List of inodes that were unlocked using this key.  This allows the
         * inodes to be evicted efficiently if the key is removed.
         *
         * NOTE(review): ->mk_decrypted_inodes_lock presumably protects this
         * list -- confirm against the code that adds and removes entries.
         */
        struct list_head        mk_decrypted_inodes;
        spinlock_t                mk_decrypted_inodes_lock;

        /*
         * Per-mode encryption keys for the various types of encryption policies
         * that use them.  Allocated and derived on-demand.
         * Each array has FSCRYPT_MODE_MAX + 1 slots.
         */
        struct fscrypt_prepared_key mk_direct_keys[FSCRYPT_MODE_MAX + 1];
        struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[FSCRYPT_MODE_MAX + 1];
        struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[FSCRYPT_MODE_MAX + 1];

        /* Hash key for inode numbers.  Initialized only when needed. */
        siphash_key_t                mk_ino_hash_key;
        bool                        mk_ino_hash_key_initialized;

        /*
         * Whether this key is in the "present" state, i.e. fully usable.  For
         * details, see the fscrypt_master_key struct comment.
         *
         * Locking: protected by ->mk_sem, but can be read locklessly using
         * READ_ONCE().  Writers must use WRITE_ONCE() when concurrent readers
         * are possible.
         */
        bool                        mk_present;

} __randomize_layout;

/*
 * Human-readable name for the type of a key specifier: "descriptor",
 * "identifier", or "[unknown]" for any other type value.
 */
static inline const char *master_key_spec_type(
                                const struct fscrypt_key_specifier *spec)
{
        if (spec->type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR)
                return "descriptor";
        if (spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER)
                return "identifier";
        return "[unknown]";
}

/*
 * Length in bytes of the descriptor or identifier carried by a key
 * specifier, or 0 when the specifier's type is not recognized.
 */
static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec)
{
        if (spec->type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR)
                return FSCRYPT_KEY_DESCRIPTOR_SIZE;
        if (spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER)
                return FSCRYPT_KEY_IDENTIFIER_SIZE;
        return 0;
}

void fscrypt_put_master_key(struct fscrypt_master_key *mk);

void fscrypt_put_master_key_activeref(struct super_block *sb,
                                      struct fscrypt_master_key *mk);

struct fscrypt_master_key *
fscrypt_find_master_key(struct super_block *sb,
                        const struct fscrypt_key_specifier *mk_spec);

int fscrypt_get_test_dummy_key_identifier(
                          u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);

int fscrypt_add_test_dummy_key(struct super_block *sb,
                               struct fscrypt_key_specifier *key_spec);

int fscrypt_verify_key_added(struct super_block *sb,
                             const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);

int __init fscrypt_init_keyring(void);

/* keysetup.c */

/*
 * Description of one encryption mode.  Instances live in the
 * fscrypt_modes[] array declared below.
 */
struct fscrypt_mode {
        const char *friendly_name;
        const char *cipher_str;
        int keysize;                /* key size in bytes */
        int security_strength;        /* security strength in bytes */
        int ivsize;                /* IV size in bytes */
        /*
         * NOTE(review): the three logged_* fields presumably record whether a
         * message about the respective implementation has already been
         * logged for this mode -- confirm against their users.
         */
        int logged_cryptoapi_impl;
        int logged_blk_crypto_native;
        int logged_blk_crypto_fallback;
        enum blk_crypto_mode_num blk_crypto_mode;
};

extern struct fscrypt_mode fscrypt_modes[];

int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
                        const u8 *raw_key, const struct fscrypt_inode_info *ci);

void fscrypt_destroy_prepared_key(struct super_block *sb,
                                  struct fscrypt_prepared_key *prep_key);

int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
                                 const u8 *raw_key);

int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
                               const struct fscrypt_master_key *mk);

void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
                               const struct fscrypt_master_key *mk);

int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported);

/**
 * fscrypt_require_key() - require an inode's encryption key
 * @inode: the inode we need the key for
 *
 * Unencrypted inodes pass immediately.  For an encrypted inode, the
 * encryption info is set up if that has not been done yet, and the key is
 * then required to be present.
 *
 * No locks are needed, and the key will live as long as the struct inode --- so
 * it won't go away from under you.
 *
 * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
 * if a problem occurred while setting up the encryption key.
 */
static inline int fscrypt_require_key(struct inode *inode)
{
        int err;

        if (!IS_ENCRYPTED(inode))
                return 0;

        err = fscrypt_get_encryption_info(inode, false);
        if (err)
                return err;

        return fscrypt_has_encryption_key(inode) ? 0 : -ENOKEY;
}

/* keysetup_v1.c */

void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);

int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
                              const u8 *raw_master_key);

int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
                                struct fscrypt_inode_info *ci);

/* policy.c */

bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
                            const union fscrypt_policy *policy2);
int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy,
                               struct fscrypt_key_specifier *key_spec);
const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb);
bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
                              const struct inode *inode);
int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
                                const union fscrypt_context *ctx_u,
                                int ctx_size);
const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir);

#endif /* _FSCRYPT_PRIVATE_H */

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    1 













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
#include "scrub.h"
#include "raid-stripe-tree.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies their checksums. If a bad checksum
 * is found or an extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - If an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - Track and record media errors, throw out bad devices
 *  - Add a mode to also read unallocated space
 */

struct scrub_ctx;

/*
 * The following value only influences the performance.
 *
 * This determines how many stripes would be submitted in one go,
 * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
 */
#define SCRUB_STRIPES_PER_GROUP                8

/*
 * How many groups we have for each sctx.
 *
 * This would be 8M per device, the same value as the old scrub in-flight bios
 * size limit.
 */
#define SCRUB_GROUPS_PER_SCTX                16

/*
 * Total number of stripes embedded in one scrub_ctx (16 groups * 8 stripes =
 * 128).  This is the size of the scrub_ctx::stripes array.
 */
#define SCRUB_TOTAL_STRIPES                (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP)

/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 *
 * NOTE(review): the value is computed with SZ_4K rather than PAGE_SIZE, so
 * it counts 4K units per maximum-sized metadata block -- confirm the wording
 * above against the users of this macro.
 */
#define SCRUB_MAX_SECTORS_PER_BLOCK        (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)

/*
 * Represent one sector and its needed info to verify the content.
 *
 * One array of these, with scrub_stripe::nr_sectors entries, is allocated per
 * stripe in init_scrub_stripe().
 */
struct scrub_sector_verification {
        /*
         * Whether this sector belongs to metadata.  Presumably this selects
         * which member of the union below is meaningful -- verify against
         * the code that fills and reads these entries.
         */
        bool is_metadata;

        union {
                /*
                 * Csum pointer for data csum verification.  Should point to a
                 * sector csum inside scrub_stripe::csums.
                 *
                 * NULL if this data sector has no csum.
                 */
                u8 *csum;

                /*
                 * Extra info for metadata verification.  All sectors inside a
                 * tree block share the same generation.
                 */
                u64 generation;
        };
};

/*
 * State flags of a stripe.  Per the comment on scrub_stripe::state, these
 * enumerators define the bits of that field.
 */
enum scrub_stripe_flags {
        /* Set when @mirror_num, @dev, @physical and @logical are set. */
        SCRUB_STRIPE_FLAG_INITIALIZED,

        /* Set when the read-repair is finished. */
        SCRUB_STRIPE_FLAG_REPAIR_DONE,

        /*
         * Set for data stripes if it's triggered from P/Q stripe.
         * During such scrub, we should not report errors in data stripes, nor
         * update the accounting.
         */
        SCRUB_STRIPE_FLAG_NO_REPORT,
};

/* Number of pages needed to back one BTRFS_STRIPE_LEN sized stripe. */
#define SCRUB_STRIPE_PAGES                (BTRFS_STRIPE_LEN / PAGE_SIZE)

/*
 * Represent one contiguous range with a length of BTRFS_STRIPE_LEN.
 *
 * Set up by init_scrub_stripe() and torn down by release_scrub_stripe().
 */
struct scrub_stripe {
        /* Owning scrub context; assigned in scrub_setup_ctx(). */
        struct scrub_ctx *sctx;
        struct btrfs_block_group *bg;

        /* Pages backing the stripe, allocated by init_scrub_stripe(). */
        struct page *pages[SCRUB_STRIPE_PAGES];
        /* Array of @nr_sectors per-sector verification records. */
        struct scrub_sector_verification *sectors;

        struct btrfs_device *dev;
        u64 logical;
        u64 physical;

        u16 mirror_num;

        /* Should be BTRFS_STRIPE_LEN / sectorsize. */
        u16 nr_sectors;

        /*
         * How many data/meta extents are in this stripe.  Only for scrub status
         * reporting purposes.
         */
        u16 nr_data_extents;
        u16 nr_meta_extents;

        /*
         * Outstanding IO counter; wait_scrub_stripe_io() sleeps on @io_wait
         * until it reaches zero.
         */
        atomic_t pending_io;
        wait_queue_head_t io_wait;
        wait_queue_head_t repair_wait;

        /*
         * Indicate the states of the stripe.  Bits are defined in
         * scrub_stripe_flags enum.
         */
        unsigned long state;

        /* Indicate which sectors are covered by extent items. */
        unsigned long extent_sector_bitmap;

        /*
         * The errors hit during the initial read of the stripe.
         *
         * Would be utilized for error reporting and repair.
         *
         * The remaining init_nr_* records the number of errors hit, only used
         * by error reporting.
         */
        unsigned long init_error_bitmap;
        unsigned int init_nr_io_errors;
        unsigned int init_nr_csum_errors;
        unsigned int init_nr_meta_errors;

        /*
         * The following error bitmaps are all for the current status.
         * Every time we submit a new read, these bitmaps may be updated.
         *
         * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
         *
         * IO and csum errors can happen for both metadata and data.
         */
        unsigned long error_bitmap;
        unsigned long io_error_bitmap;
        unsigned long csum_error_bitmap;
        unsigned long meta_error_bitmap;

        /* For writeback (repair or replace) error reporting. */
        unsigned long write_error_bitmap;

        /* Writeback can be concurrent, thus we need to protect the bitmap. */
        spinlock_t write_error_lock;

        /*
         * Checksum for the whole stripe if this stripe is inside a data block
         * group.
         *
         * Allocated in init_scrub_stripe() with one csum_size slot per sector.
         */
        u8 *csums;

        struct work_struct work;
};

/*
 * Context of one scrub (or device replace) run.
 *
 * Allocated and initialized by scrub_setup_ctx(); freed by scrub_free_ctx()
 * when scrub_put_ctx() drops the last reference.
 */
struct scrub_ctx {
        /* Embedded stripes, each set up by init_scrub_stripe(). */
        struct scrub_stripe        stripes[SCRUB_TOTAL_STRIPES];
        struct scrub_stripe        *raid56_data_stripes;
        struct btrfs_fs_info        *fs_info;
        /*
         * Both paths are configured in scrub_setup_ctx() to search the commit
         * root and to skip locking.
         */
        struct btrfs_path        extent_path;
        struct btrfs_path        csum_path;
        int                        first_free;
        int                        cur_stripe;
        atomic_t                cancel_req;
        int                        readonly;

        /* State of IO submission throttling affecting the associated device */
        ktime_t                        throttle_deadline;
        u64                        throttle_sent;

        /* Non-zero when this context was created for a device replace. */
        int                        is_dev_replace;
        u64                        write_pointer;

        struct mutex            wr_lock;
        /* Set to fs_info->dev_replace.tgtdev when @is_dev_replace is set. */
        struct btrfs_device     *wr_tgtdev;

        /*
         * statistics
         */
        struct btrfs_scrub_progress stat;
        spinlock_t                stat_lock;

        /*
         * Use a ref counter to avoid use-after-free issues. Scrub workers
         * decrement bios_in_flight and workers_pending and then do a wakeup
         * on the list_wait wait queue. We must ensure the main scrub task
         * doesn't free the scrub context before or while the workers are
         * doing the wakeup() call.
         *
         * NOTE(review): bios_in_flight, workers_pending and list_wait are not
         * members of this structure as defined here, so the description above
         * looks outdated.  What the code shows: the count starts at 1 in
         * scrub_setup_ctx() and the context is freed when scrub_put_ctx()
         * drops it to zero.
         */
        refcount_t              refs;
};

/*
 * Information carried while printing a scrub warning.  Filled in by
 * scrub_print_common_warning() and read by scrub_print_warning_inode().
 */
struct scrub_warning {
        /* Path used for the tree searches done while resolving the warning. */
        struct btrfs_path        *path;
        /* Taken from the offset of the extent item key that was found. */
        u64                        extent_item_size;
        /* Human readable error description, printed first in the message. */
        const char                *errstr;
        /* Physical and logical address of the problem being reported. */
        u64                        physical;
        u64                        logical;
        /* Device the error was found on. */
        struct btrfs_device        *dev;
};

/*
 * Release every resource owned by @stripe: the backing pages, the per-sector
 * verification array and the csum buffer.
 *
 * All freed pointers are reset to NULL and the state bits are cleared, so
 * calling this on an already released (or zero-filled) stripe is harmless.
 * A NULL @stripe is ignored.
 */
static void release_scrub_stripe(struct scrub_stripe *stripe)
{
        int nr;

        if (!stripe)
                return;

        for (nr = 0; nr < SCRUB_STRIPE_PAGES; nr++) {
                struct page *page = stripe->pages[nr];

                stripe->pages[nr] = NULL;
                if (page)
                        __free_page(page);
        }

        kfree(stripe->sectors);
        stripe->sectors = NULL;

        kfree(stripe->csums);
        stripe->csums = NULL;

        stripe->sctx = NULL;
        stripe->state = 0;
}

/*
 * Prepare @stripe for use: zero it, initialize its wait queues, IO counter
 * and lock, then allocate the backing pages, the per-sector verification
 * array and the csum buffer.
 *
 * Returns 0 on success.  If any allocation fails, everything allocated so far
 * is released again and -ENOMEM is returned.
 */
static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
                             struct scrub_stripe *stripe)
{
        /* Clears every member, including the state bits. */
        memset(stripe, 0, sizeof(*stripe));

        stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;

        init_waitqueue_head(&stripe->io_wait);
        init_waitqueue_head(&stripe->repair_wait);
        atomic_set(&stripe->pending_io, 0);
        spin_lock_init(&stripe->write_error_lock);

        if (btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0) < 0)
                goto free_stripe;

        /* One verification record per sector. */
        stripe->sectors = kcalloc(stripe->nr_sectors, sizeof(*stripe->sectors),
                                  GFP_KERNEL);
        if (!stripe->sectors)
                goto free_stripe;

        /* One checksum slot per sector. */
        stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits,
                                fs_info->csum_size, GFP_KERNEL);
        if (!stripe->csums)
                goto free_stripe;

        return 0;

free_stripe:
        release_scrub_stripe(stripe);
        return -ENOMEM;
}

/*
 * Sleep on the stripe's io_wait queue until its pending_io counter has
 * dropped to zero.
 */
static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
{
        wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
}

static void scrub_put_ctx(struct scrub_ctx *sctx);

/*
 * Block for as long as a scrub pause is requested.
 *
 * Must be entered with fs_info->scrub_lock held: the lock is dropped while
 * sleeping on scrub_pause_wait and taken again before the pause request is
 * re-checked, so it is held again on return.
 */
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
        while (atomic_read(&fs_info->scrub_pause_req)) {
                mutex_unlock(&fs_info->scrub_lock);
                wait_event(fs_info->scrub_pause_wait,
                   atomic_read(&fs_info->scrub_pause_req) == 0);
                mutex_lock(&fs_info->scrub_lock);
        }
}

/*
 * Account this scrub as paused: bump fs_info->scrubs_paused and wake up
 * anyone sleeping on scrub_pause_wait so they can see the new count.
 */
static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
        atomic_inc(&fs_info->scrubs_paused);
        wake_up(&fs_info->scrub_pause_wait);
}

/*
 * Leave the paused state entered with scrub_pause_on().
 *
 * Takes fs_info->scrub_lock, waits until no pause is requested any more,
 * then drops the scrubs_paused count and wakes up waiters on
 * scrub_pause_wait.
 */
static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
        mutex_lock(&fs_info->scrub_lock);
        __scrub_blocked_if_needed(fs_info);
        atomic_dec(&fs_info->scrubs_paused);
        mutex_unlock(&fs_info->scrub_lock);

        /* The wakeup is issued after scrub_lock has been released. */
        wake_up(&fs_info->scrub_pause_wait);
}

/*
 * Pause point for a running scrub: report ourselves as paused, then block in
 * scrub_pause_off() for as long as a pause is requested.
 */
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
        scrub_pause_on(fs_info);
        scrub_pause_off(fs_info);
}

/*
 * Destroy a scrub context: release all embedded stripes and then free the
 * context itself.  A NULL @sctx is accepted and ignored.
 */
static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
        if (!sctx)
                return;

        for (int nr = 0; nr < SCRUB_TOTAL_STRIPES; nr++) {
                struct scrub_stripe *stripe = &sctx->stripes[nr];

                release_scrub_stripe(stripe);
        }

        kvfree(sctx);
}

/*
 * Drop one reference on @sctx.  The context is freed when the last reference
 * goes away.
 */
static void scrub_put_ctx(struct scrub_ctx *sctx)
{
        /* Someone else still holds a reference, nothing more to do. */
        if (!refcount_dec_and_test(&sctx->refs))
                return;

        scrub_free_ctx(sctx);
}

static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
                struct btrfs_fs_info *fs_info, int is_dev_replace)
{
        struct scrub_ctx *sctx;
        int                i;

        /* Since sctx has inline 128 stripes, it can go beyond 64K easily.  Use
         * kvzalloc().
         */
        sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL);
        if (!sctx)
                goto nomem;
        refcount_set(&sctx->refs, 1);
        sctx->is_dev_replace = is_dev_replace;
        sctx->fs_info = fs_info;
        sctx->extent_path.search_commit_root = 1;
        sctx->extent_path.skip_locking = 1;
        sctx->csum_path.search_commit_root = 1;
        sctx->csum_path.skip_locking = 1;
        for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) {
                int ret;

                ret = init_scrub_stripe(fs_info, &sctx->stripes[i]);
                if (ret < 0)
                        goto nomem;
                sctx->stripes[i].sctx = sctx;
        }
        sctx->first_free = 0;
        atomic_set(&sctx->cancel_req, 0);

        spin_lock_init(&sctx->stat_lock);
        sctx->throttle_deadline = 0;

        mutex_init(&sctx->wr_lock);
        if (is_dev_replace) {
                WARN_ON(!fs_info->dev_replace.tgtdev);
                sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
        }

        return sctx;

nomem:
        scrub_free_ctx(sctx);
        return ERR_PTR(-ENOMEM);
}

/*
 * Print a scrub warning for one inode that references the damaged range.
 *
 * @inum:      inode number to resolve
 * @offset:    file offset, only used in the printed message
 * @num_bytes: unused here
 * @root:      objectid of the tree the inode lives in
 * @warn_ctx:  the struct scrub_warning describing the error
 *
 * The inode item is looked up to get the link count, then all paths of the
 * inode are resolved and one warning is printed per path.  If any step fails,
 * a single warning without a path but with the failing return value is
 * printed instead.
 *
 * Always returns 0, even if resolving the paths failed.
 */
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
                                     u64 root, void *warn_ctx)
{
        u32 nlink;
        int ret;
        int i;
        unsigned int nofs_flag;
        struct extent_buffer *eb;
        struct btrfs_inode_item *inode_item;
        struct scrub_warning *swarn = warn_ctx;
        struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
        struct inode_fs_paths *ipath = NULL;
        struct btrfs_root *local_root;
        struct btrfs_key key;

        local_root = btrfs_get_fs_root(fs_info, root, true);
        if (IS_ERR(local_root)) {
                /* No root reference was taken, skip the put below. */
                ret = PTR_ERR(local_root);
                goto err;
        }

        /*
         * this makes the path point to (inum INODE_ITEM ioff)
         */
        key.objectid = inum;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
        if (ret) {
                btrfs_release_path(swarn->path);
                goto err_put_root;
        }

        eb = swarn->path->nodes[0];
        inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
                                        struct btrfs_inode_item);
        nlink = btrfs_inode_nlink(eb, inode_item);
        btrfs_release_path(swarn->path);

        /*
         * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
         * uses GFP_NOFS in this context, so we keep it consistent but it does
         * not seem to be strictly necessary.
         */
        nofs_flag = memalloc_nofs_save();
        ipath = init_ipath(4096, local_root, swarn->path);
        memalloc_nofs_restore(nofs_flag);
        if (IS_ERR(ipath)) {
                ret = PTR_ERR(ipath);
                /* Do not hand the error pointer to free_ipath() below. */
                ipath = NULL;
                goto err_put_root;
        }

        /*
         * A failure here must drop the root reference as well, otherwise the
         * reference taken by btrfs_get_fs_root() above is leaked.
         */
        ret = paths_from_inode(inum, ipath);
        if (ret < 0)
                goto err_put_root;

        /*
         * we deliberately ignore the bit ipath might have been too small to
         * hold all of the paths here
         */
        for (i = 0; i < ipath->fspath->elem_cnt; ++i)
                btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
                                  swarn->errstr, swarn->logical,
                                  btrfs_dev_name(swarn->dev),
                                  swarn->physical,
                                  root, inum, offset,
                                  fs_info->sectorsize, nlink,
                                  (char *)(unsigned long)ipath->fspath->val[i]);

        btrfs_put_root(local_root);
        free_ipath(ipath);
        return 0;

err_put_root:
        btrfs_put_root(local_root);
err:
        btrfs_warn_in_rcu(fs_info,
                          "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
                          swarn->errstr, swarn->logical,
                          btrfs_dev_name(swarn->dev),
                          swarn->physical,
                          root, inum, offset, ret);

        free_ipath(ipath);
        return 0;
}

/*
 * Emit a warning describing who references the extent at @logical.
 *
 * @errstr:   short error description, used as the message prefix
 * @dev:      device the error was hit on
 * @is_super: the error is in a super block; only @dev and @physical are
 *            printed and no tree is searched
 * @logical:  logical bytenr of the error
 * @physical: physical offset of the error
 *
 * For a tree block extent every tree backref is printed in turn.  For any
 * other extent the work is handed to iterate_extent_inodes() with
 * scrub_print_warning_inode() as the per-inode callback.
 *
 * Best effort only: if the path cannot be allocated or the extent lookup
 * fails, nothing is printed.
 */
static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
                                       bool is_super, u64 logical, u64 physical)
{
        struct btrfs_fs_info *fs_info = dev->fs_info;
        struct scrub_warning swarn;
        struct btrfs_key found_key;
        struct btrfs_extent_item *ei;
        struct extent_buffer *eb;
        struct btrfs_path *path;
        unsigned long ptr = 0;
        u64 flags = 0;
        u64 ref_root;
        u32 item_size;
        u8 ref_level;
        int ret;

        /* Super block error, no need to search extent tree. */
        if (is_super) {
                btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
                                  errstr, btrfs_dev_name(dev), physical);
                return;
        }

        path = btrfs_alloc_path();
        if (!path)
                return;

        swarn.errstr = errstr;
        swarn.logical = logical;
        swarn.physical = physical;
        swarn.dev = NULL;

        ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
                                  &flags);
        if (ret < 0)
                goto out;

        swarn.extent_item_size = found_key.offset;

        eb = path->nodes[0];
        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
        item_size = btrfs_item_size(eb, path->slots[0]);

        if (!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
                /* Not a tree block: let the backref walk report each inode. */
                struct btrfs_backref_walk_ctx ctx = { 0 };

                /* The path is reused by the callback, so drop it first. */
                btrfs_release_path(path);

                ctx.fs_info = fs_info;
                ctx.bytenr = found_key.objectid;
                ctx.extent_item_pos = swarn.logical - found_key.objectid;

                swarn.dev = dev;
                swarn.path = path;

                iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
                goto out;
        }

        /* Tree block: print one line per backref until we run out or fail. */
        for (;;) {
                ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
                                              item_size, &ref_root,
                                              &ref_level);
                if (ret > 0)
                        break;
                if (ret < 0) {
                        btrfs_warn(fs_info,
                        "failed to resolve tree backref for logical %llu: %d",
                                   swarn.logical, ret);
                        break;
                }
                btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
                        errstr, swarn.logical, btrfs_dev_name(dev),
                        swarn.physical, (ref_level ? "node" : "leaf"),
                        ref_level, ref_root);
        }
        btrfs_release_path(path);

out:
        btrfs_free_path(path);
}

/*
 * Zero out the gap between the tracked write pointer and @physical.
 *
 * Only does anything on a zoned filesystem, when @physical lies in a
 * sequential zone of the write target device and the tracked write pointer
 * is still behind @physical.  On success the write pointer is advanced to
 * @physical.
 *
 * Return 0 if nothing had to be done or the zeroout succeeded, otherwise the
 * error returned by btrfs_zoned_issue_zeroout().
 */
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
        u64 gap_len;
        int err;

        if (!btrfs_is_zoned(sctx->fs_info))
                return 0;

        if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
                return 0;

        /* Write pointer already at or beyond the target, no gap to fill. */
        if (sctx->write_pointer >= physical)
                return 0;

        gap_len = physical - sctx->write_pointer;
        err = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, sctx->write_pointer,
                                        gap_len);
        if (err)
                return err;

        sctx->write_pointer = physical;
        return 0;
}

/* Return the page of @stripe that holds sector @sector_nr. */
static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
{
        /* Byte offset of the sector from the start of the stripe. */
        const int byte_offset = sector_nr << stripe->bg->fs_info->sectorsize_bits;

        return stripe->pages[byte_offset >> PAGE_SHIFT];
}

/* Return the offset of sector @sector_nr inside the page that holds it. */
static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
                                                 int sector_nr)
{
        /* Byte offset of the sector from the start of the stripe. */
        const int byte_offset = sector_nr << stripe->bg->fs_info->sectorsize_bits;

        return offset_in_page(byte_offset);
}

/*
 * Verify the tree block that starts at sector @sector_nr of @stripe.
 *
 * The checks run in this order and stop at the first failure:
 *
 * - header bytenr matches the expected logical address
 * - header fsid matches fs_devices->metadata_uuid
 * - header chunk tree uuid matches fs_info->chunk_tree_uuid
 * - checksum over the whole tree block (minus the csum field) matches
 * - header generation matches the generation recorded for the sector
 *
 * On failure all sectors of the tree block are set in error_bitmap plus one
 * of csum_error_bitmap / meta_error_bitmap, and a rate limited warning is
 * printed.  On success the sectors are cleared from all three bitmaps.
 *
 * The caller must make sure the whole tree block fits in the stripe.
 *
 * NOTE(review): a bytenr mismatch is accounted in csum_error_bitmap while a
 * real checksum mismatch is accounted in meta_error_bitmap.  That looks
 * swapped but is kept as is, since it feeds the scrub statistics - confirm
 * the intended classification before changing it.
 */
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
{
        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
        const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
        const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
        const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
        const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
        u8 on_disk_csum[BTRFS_CSUM_SIZE];
        u8 calculated_csum[BTRFS_CSUM_SIZE];
        struct btrfs_header *header;

        /*
         * Here we don't have a good way to attach the pages (and subpages)
         * to a dummy extent buffer, thus we have to directly grab the members
         * from pages.
         */
        header = (struct btrfs_header *)(page_address(first_page) + first_off);
        memcpy(on_disk_csum, header->csum, fs_info->csum_size);

        if (logical != btrfs_stack_header_bytenr(header)) {
                bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
                bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
                btrfs_warn_rl(fs_info,
                "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
                              logical, stripe->mirror_num,
                              btrfs_stack_header_bytenr(header), logical);
                return;
        }
        if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
                   BTRFS_FSID_SIZE) != 0) {
                bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
                bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
                /*
                 * Report the value we actually compared against.  With a
                 * changed metadata_uuid, fs_devices->fsid differs from it and
                 * would make the "want" part of the message misleading.
                 */
                btrfs_warn_rl(fs_info,
                "tree block %llu mirror %u has bad fsid, has %pU want %pU",
                              logical, stripe->mirror_num,
                              header->fsid, fs_info->fs_devices->metadata_uuid);
                return;
        }
        if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
                   BTRFS_UUID_SIZE) != 0) {
                bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
                bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
                btrfs_warn_rl(fs_info,
                "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
                              logical, stripe->mirror_num,
                              header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
                return;
        }

        /* Now check tree block csum. */
        shash->tfm = fs_info->csum_shash;
        crypto_shash_init(shash);
        /* The first sector is hashed without the leading csum field. */
        crypto_shash_update(shash, page_address(first_page) + first_off +
                            BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);

        /* The remaining sectors of the tree block are hashed in full. */
        for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
                struct page *page = scrub_stripe_get_page(stripe, i);
                unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);

                crypto_shash_update(shash, page_address(page) + page_off,
                                    fs_info->sectorsize);
        }

        crypto_shash_final(shash, calculated_csum);
        if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
                bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
                bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
                btrfs_warn_rl(fs_info,
                "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
                              logical, stripe->mirror_num,
                              CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
                              CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
                return;
        }
        if (stripe->sectors[sector_nr].generation !=
            btrfs_stack_header_generation(header)) {
                bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
                bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
                btrfs_warn_rl(fs_info,
                "tree block %llu mirror %u has bad generation, has %llu want %llu",
                              logical, stripe->mirror_num,
                              btrfs_stack_header_generation(header),
                              stripe->sectors[sector_nr].generation);
                return;
        }

        /* Everything checked out, the whole tree block is good. */
        bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
        bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
        bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
}

/*
 * Verify one sector of @stripe and update the error bitmaps accordingly.
 *
 * Sectors not covered by any extent and sectors that already hit an IO error
 * are left alone.  A metadata sector triggers verification of the whole tree
 * block starting at @sector_nr.  A data sector is checked against its csum,
 * and trusted when it has none.
 */
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
{
        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
        struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
        const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
        u8 csum_buf[BTRFS_CSUM_SIZE];
        struct page *page;
        unsigned int pgoff;

        ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);

        /* Sector not utilized, skip it. */
        if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
                return;

        /* IO error, no need to check. */
        if (test_bit(sector_nr, &stripe->io_error_bitmap))
                return;

        /* Metadata, verify the full tree block. */
        if (sector->is_metadata) {
                /*
                 * A tree block crossing the stripe boundary cannot be
                 * verified here, all we can do is warn about it.
                 *
                 * This can only happen on a very old filesystem where chunks
                 * are not ensured to be stripe aligned.
                 */
                if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
                        btrfs_warn_rl(fs_info,
                        "tree block at %llu crosses stripe boundary %llu",
                                      stripe->logical +
                                      (sector_nr << fs_info->sectorsize_bits),
                                      stripe->logical);
                        return;
                }
                scrub_verify_one_metadata(stripe, sector_nr);
                return;
        }

        /* Data without a csum: nothing to verify against, trust it. */
        if (!sector->csum) {
                clear_bit(sector_nr, &stripe->error_bitmap);
                return;
        }

        /* Data with a csum: the csum check decides. */
        page = scrub_stripe_get_page(stripe, sector_nr);
        pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
        if (btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf,
                                    sector->csum) < 0) {
                set_bit(sector_nr, &stripe->csum_error_bitmap);
                set_bit(sector_nr, &stripe->error_bitmap);
                return;
        }

        clear_bit(sector_nr, &stripe->csum_error_bitmap);
        clear_bit(sector_nr, &stripe->error_bitmap);
}

/* Verify specified sectors of a stripe. */
static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap)
{
        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
        const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
        int sector_nr;

        for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
                scrub_verify_one_sector(stripe, sector_nr);
                if (stripe->sectors[sector_nr].is_metadata)
                        sector_nr += sectors_per_tree - 1;
        }
}

/*
 * Find the sector of @stripe whose page and in-page offset match @first_bvec.
 *
 * Return the sector number.  If no sector matches, stripe->nr_sectors is
 * returned after tripping the assertion.
 */
static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec)
{
        int nr;

        for (nr = 0; nr < stripe->nr_sectors; nr++) {
                if (scrub_stripe_get_page(stripe, nr) != first_bvec->bv_page)
                        continue;
                if (scrub_stripe_get_page_offset(stripe, nr) != first_bvec->bv_offset)
                        continue;
                /* Both page and offset match, this is the sector. */
                break;
        }
        ASSERT(nr < stripe->nr_sectors);
        return nr;
}

/*
 * Repair read is different to the regular read:
 *
 * - Only reads the failed sectors
 * - May have extra blocksize limits
 */
static void scrub_repair_read_endio(struct btrfs_bio *bbio)
{
        struct scrub_stripe *stripe = bbio->private;
        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
        struct bio_vec *bvec;
        int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
        u32 bio_size = 0;
        int i;

        ASSERT(sector_nr < stripe->nr_sectors);

        bio_for_each_bvec_all(bvec, &bbio->bio, i)
                bio_size += bvec->bv_len;

        if (bbio->bio.bi_status) {
                bitmap_set(&stripe->io_error_bitmap, sector_nr,
                           bio_size >> fs_info->sectorsize_bits);
                bitmap_set(&stripe->error_bitmap, sector_nr,
                           bio_size >> fs_info->sectorsize_bits);
        } else {
                bitmap_clear(&stripe->io_error_bitmap, sector_nr,
                             bio_size >> fs_info->sectorsize_bits);
        }
        bio_put(&bbio->bio);
        if (atomic_dec_and_test(&stripe->pending_io))
                wake_up(&stripe->io_wait);
}

/*
 * Return the mirror number following @mirror, wrapping around to 1 after
 * @num_copies.
 */
static int calc_next_mirror(int mirror, int num_copies)
{
        ASSERT(mirror <= num_copies);

        if (mirror >= num_copies)
                return 1;
        return mirror + 1;
}

/*
 * Account one repair read bio of @stripe as pending and submit it to
 * @mirror.  If @wait is set, wait for the stripe IO before returning.
 */
static void scrub_stripe_flush_repair_read_bio(struct scrub_stripe *stripe,
                                               struct btrfs_bio *bbio,
                                               int mirror, bool wait)
{
        ASSERT(bbio->bio.bi_iter.bi_size);
        atomic_inc(&stripe->pending_io);
        btrfs_submit_bio(bbio, mirror);
        if (wait)
                wait_scrub_stripe_io(stripe);
}

/*
 * Read all sectors currently marked in the error bitmap of @stripe again,
 * this time from @mirror.
 *
 * @stripe:    stripe to repair, must have no IO in flight
 * @mirror:    mirror number to read from
 * @blocksize: once a bio has reached this size, it is submitted before the
 *             next sector is added
 * @wait:      wait for each bio right after submitting it
 *
 * Neighbouring bad sectors are merged into one bio; a hole in the error
 * bitmap or the @blocksize limit ends the current bio.
 */
static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
                                            int mirror, int blocksize, bool wait)
{
        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
        /* Snapshot to iterate over, taken before any bio is submitted. */
        const unsigned long old_error_bitmap = stripe->error_bitmap;
        struct btrfs_bio *cur_bbio = NULL;
        int sector_nr;

        ASSERT(stripe->mirror_num >= 1);
        ASSERT(atomic_read(&stripe->pending_io) == 0);

        for_each_set_bit(sector_nr, &old_error_bitmap, stripe->nr_sectors) {
                struct page *page = scrub_stripe_get_page(stripe, sector_nr);
                int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
                int added;

                if (cur_bbio) {
                        /* Previous sector is good, so we are not contiguous. */
                        bool must_submit = sector_nr > 0 &&
                                !test_bit(sector_nr - 1, &stripe->error_bitmap);

                        /* Or the bio has already reached the size limit. */
                        if (!must_submit)
                                must_submit = cur_bbio->bio.bi_iter.bi_size >= blocksize;

                        /* The current sector cannot be merged, submit the bio. */
                        if (must_submit) {
                                scrub_stripe_flush_repair_read_bio(stripe, cur_bbio,
                                                                   mirror, wait);
                                cur_bbio = NULL;
                        }
                }

                /* Start a new bio at the logical address of this sector. */
                if (!cur_bbio) {
                        cur_bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
                                fs_info, scrub_repair_read_endio, stripe);
                        cur_bbio->bio.bi_iter.bi_sector = (stripe->logical +
                                (sector_nr << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
                }

                added = bio_add_page(&cur_bbio->bio, page, fs_info->sectorsize, pgoff);
                ASSERT(added == fs_info->sectorsize);
        }

        /* Whatever is left in the last bio still needs to go out. */
        if (cur_bbio)
                scrub_stripe_flush_repair_read_bio(stripe, cur_bbio, mirror, wait);
}

/*
 * Report the result of scrubbing @stripe and account it in the scrub stats.
 *
 * For every sector that was bad after the initial read, one rate limited
 * message tells whether it got repaired or not.  For unrepaired sectors a
 * more detailed warning per error type (i/o, checksum, header) follows if the
 * device could be resolved and the rate limit allows it.
 *
 * All messages carry the logical and physical address of the affected
 * sector, not the start of the stripe, so that different bad sectors of the
 * same stripe can be told apart.
 *
 * Nothing is reported or accounted if SCRUB_STRIPE_FLAG_NO_REPORT is set.
 */
static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
                                       struct scrub_stripe *stripe)
{
        static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        struct btrfs_device *dev = NULL;
        /* Physical address of the first sector of the stripe on @dev. */
        u64 stripe_physical = 0;
        int nr_data_sectors = 0;
        int nr_meta_sectors = 0;
        int nr_nodatacsum_sectors = 0;
        int nr_repaired_sectors = 0;
        int sector_nr;

        if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
                return;

        /*
         * Init needed infos for error reporting.
         *
         * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
         * thus no need for dev/physical, error reporting still needs dev and physical.
         */
        if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
                u64 mapped_len = fs_info->sectorsize;
                struct btrfs_io_context *bioc = NULL;
                int stripe_index = stripe->mirror_num - 1;
                int ret;

                /* For scrub, our mirror_num should always start at 1. */
                ASSERT(stripe->mirror_num >= 1);
                ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
                                      stripe->logical, &mapped_len, &bioc,
                                      NULL, NULL);
                /*
                 * If we failed, dev will be NULL, and later detailed reports
                 * will just be skipped.
                 */
                if (ret < 0)
                        goto skip;
                stripe_physical = bioc->stripes[stripe_index].physical;
                dev = bioc->stripes[stripe_index].dev;
                btrfs_put_bioc(bioc);
        }

skip:
        for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
                /* Addresses of this very sector, used by all messages below. */
                const u64 sector_offset = (u64)sector_nr << fs_info->sectorsize_bits;
                const u64 logical = stripe->logical + sector_offset;
                const u64 physical = stripe_physical + sector_offset;
                bool repaired = false;

                if (stripe->sectors[sector_nr].is_metadata) {
                        nr_meta_sectors++;
                } else {
                        nr_data_sectors++;
                        if (!stripe->sectors[sector_nr].csum)
                                nr_nodatacsum_sectors++;
                }

                /* Bad after the initial read but good now means repaired. */
                if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
                    !test_bit(sector_nr, &stripe->error_bitmap)) {
                        nr_repaired_sectors++;
                        repaired = true;
                }

                /* Good sector from the beginning, nothing need to be done. */
                if (!test_bit(sector_nr, &stripe->init_error_bitmap))
                        continue;

                /*
                 * Report error for the corrupted sectors.  If repaired, just
                 * output the message of repaired message.
                 */
                if (repaired) {
                        if (dev) {
                                btrfs_err_rl_in_rcu(fs_info,
                        "fixed up error at logical %llu on dev %s physical %llu",
                                            logical, btrfs_dev_name(dev),
                                            physical);
                        } else {
                                btrfs_err_rl_in_rcu(fs_info,
                        "fixed up error at logical %llu on mirror %u",
                                            logical, stripe->mirror_num);
                        }
                        continue;
                }

                /* The remaining are all for unrepaired. */
                if (dev) {
                        btrfs_err_rl_in_rcu(fs_info,
        "unable to fixup (regular) error at logical %llu on dev %s physical %llu",
                                            logical, btrfs_dev_name(dev),
                                            physical);
                } else {
                        btrfs_err_rl_in_rcu(fs_info,
        "unable to fixup (regular) error at logical %llu on mirror %u",
                                            logical, stripe->mirror_num);
                }

                /* Detailed reports need the device, and are rate limited. */
                if (test_bit(sector_nr, &stripe->io_error_bitmap))
                        if (__ratelimit(&rs) && dev)
                                scrub_print_common_warning("i/o error", dev, false,
                                                     logical, physical);
                if (test_bit(sector_nr, &stripe->csum_error_bitmap))
                        if (__ratelimit(&rs) && dev)
                                scrub_print_common_warning("checksum error", dev, false,
                                                     logical, physical);
                if (test_bit(sector_nr, &stripe->meta_error_bitmap))
                        if (__ratelimit(&rs) && dev)
                                scrub_print_common_warning("header error", dev, false,
                                                     logical, physical);
        }

        spin_lock(&sctx->stat_lock);
        sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
        sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
        sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
        sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
        sctx->stat.no_csum += nr_nodatacsum_sectors;
        sctx->stat.read_errors += stripe->init_nr_io_errors;
        sctx->stat.csum_errors += stripe->init_nr_csum_errors;
        sctx->stat.verify_errors += stripe->init_nr_meta_errors;
        /* Whatever is still set in error_bitmap could not be repaired. */
        sctx->stat.uncorrectable_errors +=
                bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
        sctx->stat.corrected_errors += nr_repaired_sectors;
        spin_unlock(&sctx->stat_lock);
}

static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
                                unsigned long write_bitmap, bool dev_replace);

/*
 * The main entrance for all read related scrub work, including:
 *
 * - Wait for the initial read to finish
 * - Verify and locate any bad sectors
 * - Go through the remaining mirrors and try to read as large blocksize as
 *   possible
 * - Go through all mirrors (including the failed mirror) sector-by-sector
 * - Submit writeback for repaired sectors
 *
 * Writeback for dev-replace does not happen here, it needs extra
 * synchronization for zoned devices.
 *
 * This is the work function queued by scrub_read_endio() once the last read
 * bio of the stripe has finished.  When done, the errors are reported,
 * SCRUB_STRIPE_FLAG_REPAIR_DONE is set and waiters on repair_wait are woken.
 */
static void scrub_stripe_read_repair_worker(struct work_struct *work)
{
        struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
        struct scrub_ctx *sctx = stripe->sctx;
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
                                          stripe->bg->length);
        /* Sectors that were bad initially but are good after the repair. */
        unsigned long repaired;
        int mirror;
        int i;

        ASSERT(stripe->mirror_num > 0);

        wait_scrub_stripe_io(stripe);
        scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
        /* Save the initial failed bitmap for later repair and report usage. */
        stripe->init_error_bitmap = stripe->error_bitmap;
        stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap,
                                                  stripe->nr_sectors);
        stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap,
                                                    stripe->nr_sectors);
        stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap,
                                                    stripe->nr_sectors);

        /* Nothing went wrong, skip straight to reporting. */
        if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
                goto out;

        /*
         * Try all remaining mirrors.
         *
         * Here we still try to read as large block as possible, as this is
         * faster and we have extra safety nets to rely on.
         */
        for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
             mirror != stripe->mirror_num;
             mirror = calc_next_mirror(mirror, num_copies)) {
                /* Only the sectors that were bad before this round are re-verified. */
                const unsigned long old_error_bitmap = stripe->error_bitmap;

                scrub_stripe_submit_repair_read(stripe, mirror,
                                                BTRFS_STRIPE_LEN, false);
                wait_scrub_stripe_io(stripe);
                scrub_verify_one_stripe(stripe, old_error_bitmap);
                if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
                        goto out;
        }

        /*
         * Last safety net, try re-checking all mirrors, including the failed
         * one, sector-by-sector.
         *
         * As if one sector failed the drive's internal csum, the whole read
         * containing the offending sector would be marked as error.
         * Thus here we do sector-by-sector read.
         *
         * This can be slow, thus we only try it as the last resort.
         */

        for (i = 0, mirror = stripe->mirror_num;
             i < num_copies;
             i++, mirror = calc_next_mirror(mirror, num_copies)) {
                /* Only the sectors that were bad before this round are re-verified. */
                const unsigned long old_error_bitmap = stripe->error_bitmap;

                scrub_stripe_submit_repair_read(stripe, mirror,
                                                fs_info->sectorsize, true);
                wait_scrub_stripe_io(stripe);
                scrub_verify_one_stripe(stripe, old_error_bitmap);
                if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
                        goto out;
        }
out:
        /*
         * Submit the repaired sectors.  For zoned case, we cannot do repair
         * in-place, but queue the bg to be relocated.
         */
        bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap,
                      stripe->nr_sectors);
        /* A read-only scrub never writes anything back. */
        if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) {
                if (btrfs_is_zoned(fs_info)) {
                        /*
                         * NOTE(review): this uses the bg of sctx->stripes[0]
                         * rather than stripe->bg - presumably they are the
                         * same block group here; confirm.
                         */
                        btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start);
                } else {
                        scrub_write_sectors(sctx, stripe, repaired, false);
                        wait_scrub_stripe_io(stripe);
                }
        }

        scrub_stripe_report_errors(sctx, stripe);
        set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
        wake_up(&stripe->repair_wait);
}

static void scrub_read_endio(struct btrfs_bio *bbio)
{
        struct scrub_stripe *stripe = bbio->private;
        struct bio_vec *bvec;
        int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
        int num_sectors;
        u32 bio_size = 0;
        int i;

        ASSERT(sector_nr < stripe->nr_sectors);
        bio_for_each_bvec_all(bvec, &bbio->bio, i)
                bio_size += bvec->bv_len;
        num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;

        if (bbio->bio.bi_status) {
                bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
                bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
        } else {
                bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
        }
        bio_put(&bbio->bio);
        if (atomic_dec_and_test(&stripe->pending_io)) {
                wake_up(&stripe->io_wait);
                INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
                queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
        }
}

static void scrub_write_endio(struct btrfs_bio *bbio)
{
        struct scrub_stripe *stripe = bbio->private;
        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
        struct bio_vec *bvec;
        int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
        u32 bio_size = 0;
        int i;

        bio_for_each_bvec_all(bvec, &bbio->bio, i)
                bio_size += bvec->bv_len;

        if (bbio->bio.bi_status) {
                unsigned long flags;

                spin_lock_irqsave(&stripe->write_error_lock, flags);
                bitmap_set(&stripe->write_error_bitmap, sector_nr,
                           bio_size >> fs_info->sectorsize_bits);
                spin_unlock_irqrestore(&stripe->write_error_lock, flags);
        }
        bio_put(&bbio->bio);

        if (atomic_dec_and_test(&stripe->pending_io))
                wake_up(&stripe->io_wait);
}

/*
 * Submit one write bio of @stripe through btrfs_submit_repair_write().
 *
 * Before submitting, any gap up to the target physical address is handed to
 * fill_writer_pointer_gap(); its result is not checked.  On a zoned
 * filesystem the write is waited for, and the tracked write pointer is
 * advanced by the bio length unless the first sector of the bio is marked in
 * write_error_bitmap.
 */
static void scrub_submit_write_bio(struct scrub_ctx *sctx,
                                   struct scrub_stripe *stripe,
                                   struct btrfs_bio *bbio, bool dev_replace)
{
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        /* Grab length and offset now, before the bio is handed over. */
        const u32 len = bbio->bio.bi_iter.bi_size;
        const u32 offset_in_stripe = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT) -
                                     stripe->logical;

        fill_writer_pointer_gap(sctx, stripe->physical + offset_in_stripe);
        atomic_inc(&stripe->pending_io);
        btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace);

        if (btrfs_is_zoned(fs_info)) {
                /*
                 * For zoned writeback, queue depth must be 1, thus we must
                 * wait for the write to finish before the next write.
                 */
                wait_scrub_stripe_io(stripe);

                /* Only a successful write moves the write pointer forward. */
                if (!test_bit(offset_in_stripe >> fs_info->sectorsize_bits,
                              &stripe->write_error_bitmap))
                        sctx->write_pointer += len;
        }
}

/*
 * Write back the sectors of @stripe selected by @write_bitmap.
 *
 * Contiguous sectors are merged into a single bio, each bio goes out through
 * scrub_submit_write_bio().  The write path is based on
 * btrfs_submit_repair_write(), which brings some benefits:
 *
 * - Only needs logical bytenr and mirror_num
 *   Just like the scrub read path
 *
 * - Would only result in writes to the specified mirror
 *   Unlike the regular writeback path, which would write back to all stripes
 *
 * - Handle dev-replace and read-repair writeback differently
 */
static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe,
                                unsigned long write_bitmap, bool dev_replace)
{
        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
        struct btrfs_bio *cur_bbio = NULL;
        int sector_nr;

        for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
                struct page *page = scrub_stripe_get_page(stripe, sector_nr);
                unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
                int added;

                /* We should only writeback sectors covered by an extent. */
                ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap));

                /* Cannot merge with previous sector, submit the current one. */
                if (cur_bbio && sector_nr != 0 &&
                    !test_bit(sector_nr - 1, &write_bitmap)) {
                        scrub_submit_write_bio(sctx, stripe, cur_bbio, dev_replace);
                        cur_bbio = NULL;
                }

                /* Start a new bio at the logical address of this sector. */
                if (!cur_bbio) {
                        const u64 logical = stripe->logical +
                                (sector_nr << fs_info->sectorsize_bits);

                        cur_bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE,
                                                   fs_info, scrub_write_endio, stripe);
                        cur_bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
                }

                added = bio_add_page(&cur_bbio->bio, page, fs_info->sectorsize, pgoff);
                ASSERT(added == fs_info->sectorsize);
        }

        /* Whatever is left in the last bio still needs to go out. */
        if (cur_bbio)
                scrub_submit_write_bio(sctx, stripe, cur_bbio, dev_replace);
}

/*
 * Bandwidth based throttling of scrub IO submission.
 *
 * The limit is read from device->scrub_speed_max (settable via
 * /sys/fs/UUID/devinfo/devid/scrub_speed_max), zero means no throttling.
 * The time slice is 1 second, split into up to 64 intervals.  Once the bytes
 * accounted in the current interval exceed its share of the limit, sleep for
 * the rest of the interval.
 */
static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device,
                                  unsigned int bio_size)
{
        const int time_slice = 1000;
        const u64 bwlimit = READ_ONCE(device->scrub_speed_max);
        s64 delta = 0;
        ktime_t now;
        u32 div;

        if (bwlimit == 0)
                return;

        /*
         * Slice is divided into intervals when the IO is submitted, adjust by
         * bwlimit and maximum of 64 intervals.
         */
        div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
        div = min_t(u32, 64, div);

        /* Start new epoch, set deadline */
        now = ktime_get();
        if (sctx->throttle_deadline == 0) {
                sctx->throttle_sent = 0;
                sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
        }

        /*
         * Still inside the interval: account the bio, and we are done if
         * that keeps us within the limit.  A request arriving after the
         * deadline leaves delta at 0 and just starts a new epoch below.
         */
        if (ktime_before(now, sctx->throttle_deadline)) {
                sctx->throttle_sent += bio_size;
                if (sctx->throttle_sent <= div_u64(bwlimit, div))
                        return;

                /* We're over the limit, sleep until the rest of the slice */
                delta = ktime_ms_delta(sctx->throttle_deadline, now);
        }

        /* delta is in milliseconds, the timeout is in jiffies. */
        if (delta)
                schedule_timeout_interruptible(div_u64(delta * HZ, 1000));

        /* Next call will start the deadline period */
        sctx->throttle_deadline = 0;
}

/*
 * Translate a physical address on stripe @num of a RAID56 chunk into its
 * logical offset, stored in @offset.
 *
 * If @physical lands on a parity stripe, @offset is instead set to the
 * offset advanced by the number of data stripes located on devices with a
 * lower index than @num.
 *
 * If @stripe_start is not NULL it receives the offset of the start of the
 * full stripe.
 *
 * Return 0 if @physical is on a data stripe, 1 if it is on a parity stripe.
 */
static int get_raid56_logic_offset(u64 physical, int num,
                                   struct btrfs_chunk_map *map, u64 *offset,
                                   u64 *stripe_start)
{
        const int data_stripes = nr_data_stripes(map);
        const u64 full_stripe_off =
                (physical - map->stripes[num].physical) * data_stripes;
        int nr_before = 0;

        if (stripe_start)
                *stripe_start = full_stripe_off;

        *offset = full_stripe_off;
        for (int i = 0; i < data_stripes; i++) {
                u32 stripe_nr;
                u32 stripe_index;

                *offset = full_stripe_off + btrfs_stripe_nr_to_offset(i);

                stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes;

                /*
                 * Apply the per-stripe-set device rotation, then step to
                 * the i-th data stripe to find which device holds it.
                 */
                stripe_index = (stripe_nr % map->num_stripes + i) %
                               map->num_stripes;

                /* Data stripe i lives on our device: @offset is final. */
                if (stripe_index == num)
                        return 0;

                /* Count the data stripes placed on lower-indexed devices. */
                if (stripe_index < num)
                        nr_before++;
        }

        /* No data stripe matched, so @num holds parity for this stripe set. */
        *offset = full_stripe_off + btrfs_stripe_nr_to_offset(nr_before);
        return 1;
}

/*
 * Compare the extent item @path points to against the range
 * [@search_start, @search_start + @search_len).
 *
 * The item length is the nodesize for METADATA_ITEM keys and the key offset
 * for EXTENT_ITEM keys.
 *
 * Return 0 if the extent item covers any byte of the range.
 * Return <0 if the extent item ends at or before @search_start.
 * Return >0 if the extent item starts at or after @search_start + @search_len.
 */
static int compare_extent_item_range(struct btrfs_path *path,
                                     u64 search_start, u64 search_len)
{
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_key key;
        u64 item_len;

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
               key.type == BTRFS_METADATA_ITEM_KEY);

        item_len = (key.type == BTRFS_METADATA_ITEM_KEY) ?
                   leaf->fs_info->nodesize : key.offset;

        /* Entirely in front of the range. */
        if (key.objectid + item_len <= search_start)
                return -1;
        /* Entirely behind the range. */
        if (key.objectid >= search_start + search_len)
                return 1;
        return 0;
}

/*
 * Locate one extent item which covers any byte in range
 * [@search_start, @search_start + @search_len)
 *
 * If the path is not initialized, we will initialize the search by doing
 * a btrfs_search_slot().
 * If the path is already initialized, we will use the path as the initial
 * slot, to avoid duplicated btrfs_search_slot() calls.
 *
 * NOTE: If an extent item starts before @search_start, we will still
 * return the extent item. This is for data extent crossing stripe boundary.
 *
 * Return 0 if we found such extent item, and @path will point to the extent item.
 * Return >0 if no such extent item can be found, and @path will be released.
 * Return <0 if hit fatal error, and @path will be released.
 */
static int find_first_extent_item(struct btrfs_root *extent_root,
                                  struct btrfs_path *path,
                                  u64 search_start, u64 search_len)
{
        struct btrfs_fs_info *fs_info = extent_root->fs_info;
        struct btrfs_key key;
        int ret;

        /* Continue using the existing path */
        if (path->nodes[0])
                goto search_forward;

        /*
         * Search for the largest possible key at @search_start, so that the
         * returned slot is right behind any item with that objectid.
         */
        if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
                key.type = BTRFS_METADATA_ITEM_KEY;
        else
                key.type = BTRFS_EXTENT_ITEM_KEY;
        key.objectid = search_start;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0) {
                /* Honour the documented contract: @path is released on error. */
                btrfs_release_path(path);
                return ret;
        }
        if (ret == 0) {
                /*
                 * Key with offset -1 found, there would have to exist an extent
                 * item with such offset, but this is out of the valid range.
                 */
                btrfs_release_path(path);
                return -EUCLEAN;
        }

        /*
         * Here we intentionally pass 0 as @min_objectid, as there could be
         * an extent item starting before @search_start.
         */
        ret = btrfs_previous_extent_item(extent_root, path, 0);
        if (ret < 0) {
                /* Honour the documented contract: @path is released on error. */
                btrfs_release_path(path);
                return ret;
        }
        /*
         * No matter whether we have found an extent item, the next loop will
         * properly do every check on the key.
         */
search_forward:
        while (true) {
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                /* Walked past the end of the search range, nothing found. */
                if (key.objectid >= search_start + search_len)
                        break;
                /* Skip everything that is not an extent item. */
                if (key.type != BTRFS_METADATA_ITEM_KEY &&
                    key.type != BTRFS_EXTENT_ITEM_KEY)
                        goto next;

                ret = compare_extent_item_range(path, search_start, search_len);
                if (ret == 0)
                        return ret;
                if (ret > 0)
                        break;
                /* ret < 0: the item ends before the range, keep going. */
next:
                ret = btrfs_next_item(extent_root, path);
                if (ret) {
                        /* Either no more items or a fatal error. */
                        btrfs_release_path(path);
                        return ret;
                }
        }
        btrfs_release_path(path);
        return 1;
}

/*
 * Read the extent item @path currently points to and hand back its start
 * bytenr, size, flags and generation through the *_ret pointers.
 *
 * The size is the nodesize for METADATA_ITEM keys, the key offset otherwise.
 */
static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
                            u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
{
        struct extent_buffer *leaf = path->nodes[0];
        const int slot = path->slots[0];
        struct btrfs_extent_item *item;
        struct btrfs_key key;

        btrfs_item_key_to_cpu(leaf, &key, slot);
        ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
               key.type == BTRFS_EXTENT_ITEM_KEY);
        item = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);

        *extent_start_ret = key.objectid;
        *size_ret = (key.type == BTRFS_METADATA_ITEM_KEY) ?
                    leaf->fs_info->nodesize : key.offset;
        *flags_ret = btrfs_extent_flags(leaf, item);
        *generation_ret = btrfs_extent_generation(leaf, item);
}

/*
 * On zoned filesystems, bring the target device's write pointer in sync if
 * the scrub context's write pointer is still below @physical_end, then clear
 * the zone-empty state at @physical.  Does nothing on non-zoned filesystems.
 *
 * Return 0 on success or on non-zoned filesystems, otherwise the error from
 * btrfs_sync_zone_write_pointer().
 */
static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
                                        u64 physical, u64 physical_end)
{
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        int ret = 0;

        if (!btrfs_is_zoned(fs_info))
                return 0;

        mutex_lock(&sctx->wr_lock);
        if (sctx->write_pointer < physical_end)
                ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
                                                    physical,
                                                    sctx->write_pointer);
        /* ret can only be non-zero if the sync above ran and failed. */
        if (ret)
                btrfs_err(fs_info,
                          "zoned: failed to recover write pointer");
        mutex_unlock(&sctx->wr_lock);

        btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);

        return ret;
}

/*
 * Record one extent in @stripe: for every sector of the extent that falls
 * inside [stripe->logical, stripe->logical + BTRFS_STRIPE_LEN), set its bit
 * in extent_sector_bitmap.  For tree block extents the sector is also marked
 * as metadata and tagged with the extent generation.
 */
static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
                                 struct scrub_stripe *stripe,
                                 u64 extent_start, u64 extent_len,
                                 u64 extent_flags, u64 extent_gen)
{
        /* Intersection of the extent with the stripe. */
        const u64 range_end = min(stripe->logical + BTRFS_STRIPE_LEN,
                                  extent_start + extent_len);
        u64 cur = max(stripe->logical, extent_start);

        while (cur < range_end) {
                const int nr = (cur - stripe->logical) >>
                               fs_info->sectorsize_bits;

                set_bit(nr, &stripe->extent_sector_bitmap);
                if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                        stripe->sectors[nr].is_metadata = true;
                        stripe->sectors[nr].generation = extent_gen;
                }
                cur += fs_info->sectorsize;
        }
}

/* Clear all per-stripe sector bitmaps and the initial error counters. */
static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
        /* Which sectors are covered by extents. */
        stripe->extent_sector_bitmap = 0;

        /* Error state recorded for the initial read. */
        stripe->init_error_bitmap = 0;
        stripe->init_nr_io_errors = 0;
        stripe->init_nr_csum_errors = 0;
        stripe->init_nr_meta_errors = 0;

        /* Current error state, overall and per error class. */
        stripe->error_bitmap = 0;
        stripe->io_error_bitmap = 0;
        stripe->csum_error_bitmap = 0;
        stripe->meta_error_bitmap = 0;
}

/*
 * Locate one stripe which has at least one extent in its range.
 *
 * Searches [@logical_start, @logical_start + @logical_len) inside @bg for the
 * first extent item, rounds its start down to a BTRFS_STRIPE_LEN boundary
 * (relative to bg->start) and records every extent overlapping that stripe
 * in @stripe's sectors[] array and extent_sector_bitmap.  For data block
 * groups the checksums of the stripe are looked up as well.
 *
 * @extent_path and @csum_path are handed to the extent and csum lookups.
 * @dev and @mirror_num are stored into @stripe as is; @physical is the
 * physical offset matching @logical_start and is adjusted to the start of
 * the found stripe.
 *
 * The extent counters of @stripe are only incremented here, they are not
 * reset (scrub_reset_stripe() does that).
 *
 * Return 0 if found such stripe, and store its info into @stripe.
 * Return >0 if there is no such stripe in the specified range.
 * Return <0 for error.
 */
static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
                                        struct btrfs_path *extent_path,
                                        struct btrfs_path *csum_path,
                                        struct btrfs_device *dev, u64 physical,
                                        int mirror_num, u64 logical_start,
                                        u32 logical_len,
                                        struct scrub_stripe *stripe)
{
        struct btrfs_fs_info *fs_info = bg->fs_info;
        struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start);
        struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start);
        const u64 logical_end = logical_start + logical_len;
        u64 cur_logical = logical_start;
        u64 stripe_end;
        u64 extent_start;
        u64 extent_len;
        u64 extent_flags;
        u64 extent_gen;
        int ret;

        /* Start from a clean per-sector state and clean bitmaps. */
        memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
                                   stripe->nr_sectors);
        scrub_stripe_reset_bitmaps(stripe);

        /* The range must be inside the bg. */
        ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);

        ret = find_first_extent_item(extent_root, extent_path, logical_start,
                                     logical_len);
        /* Either error or not found. */
        if (ret)
                goto out;
        get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags,
                        &extent_gen);
        if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                stripe->nr_meta_extents++;
        if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
                stripe->nr_data_extents++;
        /* The extent may start before the search range, never go backwards. */
        cur_logical = max(extent_start, cur_logical);

        /*
         * Round down to stripe boundary.
         *
         * The extra calculation against bg->start is to handle block groups
         * whose logical bytenr is not BTRFS_STRIPE_LEN aligned.
         */
        stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) +
                          bg->start;
        stripe->physical = physical + stripe->logical - logical_start;
        stripe->dev = dev;
        stripe->bg = bg;
        stripe->mirror_num = mirror_num;
        /* Inclusive end of the stripe. */
        stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1;

        /* Fill the first extent info into stripe->sectors[] array. */
        fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
                             extent_flags, extent_gen);
        cur_logical = extent_start + extent_len;

        /* Fill the extent info for the remaining sectors. */
        while (cur_logical <= stripe_end) {
                /* @extent_path is reused here as the starting slot. */
                ret = find_first_extent_item(extent_root, extent_path, cur_logical,
                                             stripe_end - cur_logical + 1);
                if (ret < 0)
                        goto out;
                /* No more extents in this stripe, which is not an error. */
                if (ret > 0) {
                        ret = 0;
                        break;
                }
                get_extent_info(extent_path, &extent_start, &extent_len,
                                &extent_flags, &extent_gen);
                if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                        stripe->nr_meta_extents++;
                if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
                        stripe->nr_data_extents++;
                fill_one_extent_info(fs_info, stripe, extent_start, extent_len,
                                     extent_flags, extent_gen);
                cur_logical = extent_start + extent_len;
        }

        /* Now fill the data csum. */
        if (bg->flags & BTRFS_BLOCK_GROUP_DATA) {
                int sector_nr;
                unsigned long csum_bitmap = 0;

                /* Csum space should have already been allocated. */
                ASSERT(stripe->csums);

                /*
                 * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN
                 * should contain at most 16 sectors.
                 */
                ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);

                ret = btrfs_lookup_csums_bitmap(csum_root, csum_path,
                                                stripe->logical, stripe_end,
                                                stripe->csums, &csum_bitmap);
                if (ret < 0)
                        goto out;
                /* A positive return is not treated as an error here. */
                if (ret > 0)
                        ret = 0;

                /* Point every sector with a bit set at its slot in csums. */
                for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) {
                        stripe->sectors[sector_nr].csum = stripe->csums +
                                sector_nr * fs_info->csum_size;
                }
        }
        /* Only a fully populated stripe is flagged as initialized. */
        set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
out:
        return ret;
}

static void scrub_reset_stripe(struct scrub_stripe *stripe)
{
        scrub_stripe_reset_bitmaps(stripe);

        stripe->nr_meta_extents = 0;
        stripe->nr_data_extents = 0;
        stripe->state = 0;

        for (int i = 0; i < stripe->nr_sectors; i++) {
                stripe->sectors[i].is_metadata = false;
                stripe->sectors[i].csum = NULL;
                stripe->sectors[i].generation = 0;
        }
}

/*
 * Submit reads only for the sectors of @stripe which are covered by an
 * extent (bits set in extent_sector_bitmap).
 *
 * Consecutive sectors are merged into one bio.  A bio is submitted once the
 * next sector is not adjacent to the previous one, or once its size reaches
 * the length reported by btrfs_map_block().  Sectors at or beyond the block
 * group end are not read.
 *
 * stripe->pending_io is raised once up front and once per submitted bio.
 * When the up front count is dropped at the end and nothing is in flight
 * anymore, the read-repair worker is queued from here.
 *
 * @sctx is not used by this function.
 */
static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
                                            struct scrub_stripe *stripe)
{
        struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
        struct btrfs_bio *bbio = NULL;
        /* Number of sectors of this stripe which are inside the block group. */
        unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
                                      stripe->bg->length - stripe->logical) >>
                                  fs_info->sectorsize_bits;
        u64 stripe_len = BTRFS_STRIPE_LEN;
        int mirror = stripe->mirror_num;
        int i;

        /* Hold one count for the duration of the submission loop. */
        atomic_inc(&stripe->pending_io);

        for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
                struct page *page = scrub_stripe_get_page(stripe, i);
                unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);

                /* We're beyond the chunk boundary, no need to read anymore. */
                if (i >= nr_sectors)
                        break;

                /* The current sector cannot be merged, submit the bio. */
                if (bbio &&
                    ((i > 0 &&
                      !test_bit(i - 1, &stripe->extent_sector_bitmap)) ||
                     bbio->bio.bi_iter.bi_size >= stripe_len)) {
                        ASSERT(bbio->bio.bi_iter.bi_size);
                        atomic_inc(&stripe->pending_io);
                        btrfs_submit_bio(bbio, mirror);
                        bbio = NULL;
                }

                if (!bbio) {
                        struct btrfs_io_stripe io_stripe = {};
                        struct btrfs_io_context *bioc = NULL;
                        const u64 logical = stripe->logical +
                                            (i << fs_info->sectorsize_bits);
                        int err;

                        bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ,
                                               fs_info, scrub_read_endio, stripe);
                        bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;

                        /*
                         * Map the block up front; stripe_len and mirror are
                         * passed by reference and may be updated by the call.
                         */
                        io_stripe.is_scrub = true;
                        err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
                                              &stripe_len, &bioc, &io_stripe,
                                              &mirror);
                        btrfs_put_bioc(bioc);
                        if (err) {
                                /*
                                 * NOTE(review): this return skips the final
                                 * pending_io drop below, and this bbio never
                                 * got its own pending_io count; presumably
                                 * the end_io handler balances that - confirm.
                                 */
                                btrfs_bio_end_io(bbio,
                                                 errno_to_blk_status(err));
                                return;
                        }
                }

                __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
        }

        /* Submit whatever is left in the last bio. */
        if (bbio) {
                ASSERT(bbio->bio.bi_iter.bi_size);
                atomic_inc(&stripe->pending_io);
                btrfs_submit_bio(bbio, mirror);
        }

        /* Drop the count taken at the top; if it was the last one, move on. */
        if (atomic_dec_and_test(&stripe->pending_io)) {
                wake_up(&stripe->io_wait);
                INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker);
                queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work);
        }
}

static void scrub_submit_initial_read(struct scrub_ctx *sctx,
                                      struct scrub_stripe *stripe)
{
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        struct btrfs_bio *bbio;
        unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
                                      stripe->bg->length - stripe->logical) >>
                                  fs_info->sectorsize_bits;
        int mirror = stripe->mirror_num;

        ASSERT(stripe->bg);
        ASSERT(stripe->mirror_num > 0);
        ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));

        if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
                scrub_submit_extent_sector_read(sctx, stripe);
                return;
        }

        bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
                               scrub_read_endio, stripe);

        bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
        /* Read the whole range inside the chunk boundary. */
        for (unsigned int cur = 0; cur < nr_sectors; cur++) {
                struct page *page = scrub_stripe_get_page(stripe, cur);
                unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
                int ret;

                ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
                /* We should have allocated enough bio vectors. */
                ASSERT(ret == fs_info->sectorsize);
        }
        atomic_inc(&stripe->pending_io);

        /*
         * For dev-replace, either user asks to avoid the source dev, or
         * the device is missing, we try the next mirror instead.
         */
        if (sctx->is_dev_replace &&
            (fs_info->dev_replace.cont_reading_from_srcdev_mode ==
             BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID ||
             !stripe->dev->bdev)) {
                int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
                                                  stripe->bg->length);

                mirror = calc_next_mirror(mirror, num_copies);
        }
        btrfs_submit_bio(bbio, mirror);
}

/*
 * Check whether any sector flagged in @stripe's error_bitmap is metadata.
 * The first such sector is reported through btrfs_err().
 *
 * Return true if one was found, false otherwise.
 */
static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
{
        int sector_nr;

        for_each_set_bit(sector_nr, &stripe->error_bitmap, stripe->nr_sectors) {
                struct btrfs_fs_info *fs_info;

                /* Errors in data sectors are not of interest here. */
                if (!stripe->sectors[sector_nr].is_metadata)
                        continue;

                fs_info = stripe->bg->fs_info;
                btrfs_err(fs_info,
                "stripe %llu has unrepaired metadata sector at %llu",
                          stripe->logical,
                          stripe->logical + (sector_nr << fs_info->sectorsize_bits));
                return true;
        }
        return false;
}

/*
 * Submit the initial reads for @nr_stripes stripes of @sctx, starting at
 * slot @first_slot.  The submission is throttled against the device of the
 * first stripe slot and wrapped in a block plug.
 */
static void submit_initial_group_read(struct scrub_ctx *sctx,
                                      unsigned int first_slot,
                                      unsigned int nr_stripes)
{
        struct blk_plug plug;

        ASSERT(first_slot < SCRUB_TOTAL_STRIPES);
        ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES);

        /* Account the whole group against the bandwidth limit at once. */
        scrub_throttle_dev_io(sctx, sctx->stripes[0].dev,
                              btrfs_stripe_nr_to_offset(nr_stripes));

        blk_start_plug(&plug);
        for (unsigned int nr = 0; nr < nr_stripes; nr++) {
                struct scrub_stripe *cur = &sctx->stripes[first_slot + nr];

                /* Every stripe of the group must have been filled already. */
                ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &cur->state));
                scrub_submit_initial_read(sctx, cur);
        }
        blk_finish_plug(&plug);
}

/*
 * Finish all stripes queued in @sctx: submit the not yet submitted partial
 * group, wait for every stripe to be repaired, for dev-replace write the
 * good sectors to the target, then wait for the IO and reset the stripes.
 *
 * sctx->cur_stripe is 0 afterwards.
 *
 * Return 0 on success, -EIO if dev-replace hit an unrepaired metadata sector.
 */
static int flush_scrub_stripes(struct scrub_ctx *sctx)
{
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        const int nr_stripes = sctx->cur_stripe;
        int leftover;

        /* Nothing queued. */
        if (nr_stripes == 0)
                return 0;

        ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state));

        /* The last group may be only partially filled and thus unsubmitted. */
        leftover = nr_stripes % SCRUB_STRIPES_PER_GROUP;
        if (leftover)
                submit_initial_group_read(sctx, nr_stripes - leftover, leftover);

        /* Wait until read and repair finished for every stripe. */
        for (int i = 0; i < nr_stripes; i++)
                wait_event(sctx->stripes[i].repair_wait,
                           test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE,
                                    &sctx->stripes[i].state));

        if (sctx->is_dev_replace) {
                /*
                 * Broken metadata must not be copied to the target device,
                 * give up on the first unrepaired metadata sector.
                 */
                for (int i = 0; i < nr_stripes; i++) {
                        if (stripe_has_metadata_error(&sctx->stripes[i])) {
                                sctx->cur_stripe = 0;
                                return -EIO;
                        }
                }

                /* Write every sector which has an extent and no error. */
                for (int i = 0; i < nr_stripes; i++) {
                        struct scrub_stripe *cur = &sctx->stripes[i];
                        unsigned long good;

                        ASSERT(cur->dev == fs_info->dev_replace.srcdev);

                        bitmap_andnot(&good, &cur->extent_sector_bitmap,
                                      &cur->error_bitmap, cur->nr_sectors);
                        scrub_write_sectors(sctx, cur, good, true);
                }
        }

        /* Let the writebacks finish, then recycle the stripes. */
        for (int i = 0; i < nr_stripes; i++) {
                struct scrub_stripe *cur = &sctx->stripes[i];

                wait_scrub_stripe_io(cur);
                scrub_reset_stripe(cur);
        }

        sctx->cur_stripe = 0;
        return 0;
}

/* Bio end_io handler: signal the completion stored in bio->bi_private. */
static void raid56_scrub_wait_endio(struct bio *bio)
{
        struct completion *done = bio->bi_private;

        complete(done);
}

/*
 * Fill the next free stripe slot of @sctx with the first stripe that has an
 * extent in [@logical, @logical + @length) and queue it for scrubbing.
 *
 * A full group of SCRUB_STRIPES_PER_GROUP stripes is submitted for read as
 * soon as it is complete; once all SCRUB_TOTAL_STRIPES slots are in use
 * everything is flushed.
 *
 * On success the logical start of the found stripe is stored in
 * @found_logical_ret, which must not be NULL.
 *
 * Return 0 on success, >0 if there is no more extent in the range, <0 on
 * error (including errors from flushing).
 */
static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg,
                              struct btrfs_device *dev, int mirror_num,
                              u64 logical, u32 length, u64 physical,
                              u64 *found_logical_ret)
{
        struct scrub_stripe *slot;
        int ret;

        /*
         * A free slot always exists: whoever fills the last one flushes
         * all of them before returning.
         */
        ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES);

        /* The caller has to provide @found_logical_ret. */
        ASSERT(found_logical_ret);

        slot = &sctx->stripes[sctx->cur_stripe];
        scrub_reset_stripe(slot);
        ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path,
                                           &sctx->csum_path, dev, physical,
                                           mirror_num, logical, length, slot);
        /* Positive means no more extents, negative is an error. */
        if (ret != 0)
                return ret;

        *found_logical_ret = slot->logical;
        sctx->cur_stripe++;

        /* This stripe completed a group, get its reads going. */
        if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0)
                submit_initial_group_read(sctx,
                                sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP,
                                SCRUB_STRIPES_PER_GROUP);

        /* Still room for more stripes. */
        if (sctx->cur_stripe != SCRUB_TOTAL_STRIPES)
                return 0;

        /* That was the last slot, flush everything. */
        return flush_scrub_stripes(sctx);
}

/*
 * Scrub the parity of the RAID56 full stripe starting at @full_stripe_start.
 *
 * All data stripes of the full stripe are read and verified first, using
 * sctx->raid56_data_stripes.  If none of them contains an extent there is
 * nothing to do.  If any sector covered by an extent is still in error after
 * repair, the scrub is aborted to not corrupt P/Q any further.  Otherwise a
 * scrub rbio is built, fed with the verified data pages and submitted, and
 * its completion is waited for.
 *
 * The extent/csum paths and the bio used here are local to this function and
 * are released on every exit path.
 *
 * Return 0 on success or if the full stripe is empty, -EIO if unrepaired
 * sectors were found, -ENOMEM if the rbio could not be allocated, or another
 * negative errno from the lookups, the block mapping or the parity scrub IO.
 */
static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
                                      struct btrfs_device *scrub_dev,
                                      struct btrfs_block_group *bg,
                                      struct btrfs_chunk_map *map,
                                      u64 full_stripe_start)
{
        DECLARE_COMPLETION_ONSTACK(io_done);
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        struct btrfs_raid_bio *rbio;
        struct btrfs_io_context *bioc = NULL;
        struct btrfs_path extent_path = { 0 };
        struct btrfs_path csum_path = { 0 };
        struct bio *bio;
        struct scrub_stripe *stripe;
        bool all_empty = true;
        const int data_stripes = nr_data_stripes(map);
        unsigned long extent_bitmap = 0;
        u64 length = btrfs_stripe_nr_to_offset(data_stripes);
        int ret;

        ASSERT(sctx->raid56_data_stripes);

        /*
         * For data stripe search, we cannot re-use the same extent/csum paths,
         * as the data stripe bytenr may be smaller than previous extent.  Thus
         * we have to use our own extent/csum paths.
         */
        extent_path.search_commit_root = 1;
        extent_path.skip_locking = 1;
        csum_path.search_commit_root = 1;
        csum_path.skip_locking = 1;

        for (int i = 0; i < data_stripes; i++) {
                int stripe_index;
                int rot;
                u64 physical;

                stripe = &sctx->raid56_data_stripes[i];
                rot = div_u64(full_stripe_start - bg->start,
                              data_stripes) >> BTRFS_STRIPE_LEN_SHIFT;
                stripe_index = (i + rot) % map->num_stripes;
                physical = map->stripes[stripe_index].physical +
                           btrfs_stripe_nr_to_offset(rot);

                scrub_reset_stripe(stripe);
                set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state);
                ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path,
                                map->stripes[stripe_index].dev, physical, 1,
                                full_stripe_start + btrfs_stripe_nr_to_offset(i),
                                BTRFS_STRIPE_LEN, stripe);
                if (ret < 0)
                        goto out;
                /*
                 * No extent in this data stripe, need to manually mark them
                 * initialized to make later read submission happy.
                 */
                if (ret > 0) {
                        stripe->logical = full_stripe_start +
                                          btrfs_stripe_nr_to_offset(i);
                        stripe->dev = map->stripes[stripe_index].dev;
                        stripe->mirror_num = 1;
                        set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state);
                }
        }

        /* Check if all data stripes are empty. */
        for (int i = 0; i < data_stripes; i++) {
                stripe = &sctx->raid56_data_stripes[i];
                if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) {
                        all_empty = false;
                        break;
                }
        }
        if (all_empty) {
                ret = 0;
                goto out;
        }

        /* Read all data stripes and wait for their verification and repair. */
        for (int i = 0; i < data_stripes; i++) {
                stripe = &sctx->raid56_data_stripes[i];
                scrub_submit_initial_read(sctx, stripe);
        }
        for (int i = 0; i < data_stripes; i++) {
                stripe = &sctx->raid56_data_stripes[i];

                wait_event(stripe->repair_wait,
                           test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state));
        }
        /* For now, no zoned support for RAID56. */
        ASSERT(!btrfs_is_zoned(sctx->fs_info));

        /*
         * Now all data stripes are properly verified. Check if we have any
         * unrepaired, if so abort immediately or we could further corrupt the
         * P/Q stripes.
         *
         * During the loop, also populate extent_bitmap.
         */
        for (int i = 0; i < data_stripes; i++) {
                unsigned long error;

                stripe = &sctx->raid56_data_stripes[i];

                /*
                 * We should only check the errors where there is an extent.
                 * As we may hit an empty data stripe while it's missing.
                 */
                bitmap_and(&error, &stripe->error_bitmap,
                           &stripe->extent_sector_bitmap, stripe->nr_sectors);
                if (!bitmap_empty(&error, stripe->nr_sectors)) {
                        btrfs_err(fs_info,
"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
                                  full_stripe_start, i, stripe->nr_sectors,
                                  &error);
                        ret = -EIO;
                        goto out;
                }
                bitmap_or(&extent_bitmap, &extent_bitmap,
                          &stripe->extent_sector_bitmap, stripe->nr_sectors);
        }

        /* Now we can check and regenerate the P/Q stripe. */
        bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS);
        bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT;
        bio->bi_private = &io_done;
        bio->bi_end_io = raid56_scrub_wait_endio;

        btrfs_bio_counter_inc_blocked(fs_info);
        ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
                              &length, &bioc, NULL, NULL);
        if (ret < 0) {
                btrfs_put_bioc(bioc);
                goto out_put_bio;
        }
        rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap,
                                BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits);
        btrfs_put_bioc(bioc);
        if (!rbio) {
                ret = -ENOMEM;
                goto out_put_bio;
        }
        /* Use the recovered stripes as cache to avoid read them from disk again. */
        for (int i = 0; i < data_stripes; i++) {
                stripe = &sctx->raid56_data_stripes[i];

                raid56_parity_cache_data_pages(rbio, stripe->pages,
                                full_stripe_start + btrfs_stripe_nr_to_offset(i));
        }
        raid56_parity_submit_scrub_rbio(rbio);
        wait_for_completion_io(&io_done);
        ret = blk_status_to_errno(bio->bi_status);

out_put_bio:
        /* The bio belongs to us, whether or not it was ever submitted. */
        bio_put(bio);
        btrfs_bio_counter_dec(fs_info);
out:
        /*
         * The paths may still reference extent buffers from the last
         * successful lookup, so they must be released on every exit path.
         */
        btrfs_release_path(&extent_path);
        btrfs_release_path(&csum_path);
        return ret;
}

/*
 * Scrub one range which can only have a simple mirror based profile.
 * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
 *  RAID0/RAID10).
 *
 * Since we may need to handle a subset of a block group, we need the
 * @logical_start and @logical_length parameters.
 *
 * @sctx:           scrub context
 * @bg:             block group the range belongs to
 * @map:            chunk map of @bg (not referenced by this function)
 * @logical_start:  first logical bytenr to scrub, must be inside @bg
 * @logical_length: length of the logical range, must end inside @bg
 * @device:         device being scrubbed
 * @physical:       physical bytenr on @device matching @logical_start
 * @mirror_num:     mirror number passed down to queue_scrub_stripe()
 *
 * Return 0 when the range was fully queued, when no more extents were found,
 * or when the block group got removed in the meantime.  Return -ECANCELED if
 * a cancel was requested, or the negative error from queue_scrub_stripe().
 */
static int scrub_simple_mirror(struct scrub_ctx *sctx,
                               struct btrfs_block_group *bg,
                               struct btrfs_chunk_map *map,
                               u64 logical_start, u64 logical_length,
                               struct btrfs_device *device,
                               u64 physical, int mirror_num)
{
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        const u64 logical_end = logical_start + logical_length;
        u64 cur_logical = logical_start;
        int ret = 0;

        /* The range must be inside the bg */
        ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);

        /* Go through each extent item inside the logical range */
        while (cur_logical < logical_end) {
                u64 found_logical = U64_MAX;
                u64 cur_physical = physical + cur_logical - logical_start;

                /* Canceled? */
                if (atomic_read(&fs_info->scrub_cancel_req) ||
                    atomic_read(&sctx->cancel_req)) {
                        ret = -ECANCELED;
                        break;
                }
                /* Paused? */
                if (atomic_read(&fs_info->scrub_pause_req)) {
                        /* Push queued extents */
                        scrub_blocked_if_needed(fs_info);
                }
                /* Block group removed? */
                spin_lock(&bg->lock);
                if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
                        spin_unlock(&bg->lock);
                        ret = 0;
                        break;
                }
                spin_unlock(&bg->lock);

                ret = queue_scrub_stripe(sctx, bg, device, mirror_num,
                                         cur_logical, logical_end - cur_logical,
                                         cur_physical, &found_logical);
                if (ret > 0) {
                        /*
                         * No more extent, just update the accounting.
                         *
                         * The statistics are protected by @stat_lock
                         * everywhere else (e.g. in scrub_stripe()), so take
                         * it here too to avoid racing with other updaters
                         * and readers of the progress.
                         */
                        spin_lock(&sctx->stat_lock);
                        sctx->stat.last_physical = physical + logical_length;
                        spin_unlock(&sctx->stat_lock);
                        ret = 0;
                        break;
                }
                if (ret < 0)
                        break;

                /* queue_scrub_stripe() returned 0, @found_logical must be updated. */
                ASSERT(found_logical != U64_MAX);
                cur_logical = found_logical + BTRFS_STRIPE_LEN;

                /* Don't hold the CPU for too long */
                cond_resched();
        }
        return ret;
}

/*
 * Full stripe length for the simple stripe based profiles (RAID0/RAID10).
 *
 * The number of data stripes is num_stripes / sub_stripes, which is then
 * converted with btrfs_stripe_nr_to_offset().
 */
static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
{
        int nr_data;

        ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                            BTRFS_BLOCK_GROUP_RAID10));

        nr_data = map->num_stripes / map->sub_stripes;

        return btrfs_stripe_nr_to_offset(nr_data);
}

/*
 * Logical bytenr at which the stripe @stripe_index of a RAID0/RAID10 chunk
 * starts inside block group @bg.
 */
static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
                                     struct btrfs_block_group *bg,
                                     int stripe_index)
{
        int skipped_data_stripes;

        ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                            BTRFS_BLOCK_GROUP_RAID10));
        ASSERT(stripe_index < map->num_stripes);

        /*
         * Stripes sharing the same (stripe_index / sub_stripes) value are
         * mirrors of each other, so that quotient is the number of data
         * stripes located before ours.
         */
        skipped_data_stripes = stripe_index / map->sub_stripes;

        return bg->start + btrfs_stripe_nr_to_offset(skipped_data_stripes);
}

/*
 * Mirror number (starting at 1) of the stripe @stripe_index of a
 * RAID0/RAID10 chunk.
 */
static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
{
        int index_in_mirror_group;

        ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                            BTRFS_BLOCK_GROUP_RAID10));
        ASSERT(stripe_index < map->num_stripes);

        /* Always 0 for RAID0; alternates 0,1,0,1... for RAID10. */
        index_in_mirror_group = stripe_index % map->sub_stripes;

        return index_in_mirror_group + 1;
}

/*
 * Scrub every BTRFS_STRIPE_LEN sized piece of a RAID0/RAID10 block group
 * that is stored in the stripe @stripe_index of @map.
 *
 * Return 0 on success, or the first non-zero value returned by
 * scrub_simple_mirror().
 */
static int scrub_simple_stripe(struct scrub_ctx *sctx,
                               struct btrfs_block_group *bg,
                               struct btrfs_chunk_map *map,
                               struct btrfs_device *device,
                               int stripe_index)
{
        const u64 full_stripe_len = simple_stripe_full_stripe_len(map);
        const u64 first_logical = simple_stripe_get_logical(map, bg, stripe_index);
        const u64 first_physical = map->stripes[stripe_index].physical;
        const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
        u64 logical;
        u64 phys;

        /*
         * The logical address jumps by a whole full stripe to reach the next
         * piece belonging to the target device, while on the device itself
         * the pieces are laid out back to back.
         */
        for (logical = first_logical, phys = first_physical;
             logical < bg->start + bg->length;
             logical += full_stripe_len, phys += BTRFS_STRIPE_LEN) {
                int ret;

                /*
                 * Within a single stripe RAID0 behaves like SINGLE and RAID10
                 * behaves like RAID1, hence scrub_simple_mirror() can do the
                 * actual work.
                 */
                ret = scrub_simple_mirror(sctx, bg, map, logical,
                                          BTRFS_STRIPE_LEN, device, phys,
                                          mirror_num);
                if (ret)
                        return ret;
        }
        return 0;
}

/*
 * Scrub the part of block group @bg that is stored in stripe @stripe_index
 * of @map on device @scrub_dev.
 *
 * Simple mirror profiles (SINGLE/DUP/RAID1/RAID1C*) and simple stripe
 * profiles (RAID0/RAID10) are dispatched to dedicated helpers, only RAID56
 * is iterated here by physical offset.
 *
 * Before returning, the queued stripes are flushed, the cached paths are
 * released, the RAID56 data stripes are freed and, for dev-replace, the
 * write pointer is synced via sync_write_pointer_for_zoned().
 *
 * Return 0 on success or a negative errno.
 */
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
                                           struct btrfs_block_group *bg,
                                           struct btrfs_chunk_map *map,
                                           struct btrfs_device *scrub_dev,
                                           int stripe_index)
{
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
        const u64 chunk_logical = bg->start;
        int ret;
        int ret2;
        u64 physical = map->stripes[stripe_index].physical;
        const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
        const u64 physical_end = physical + dev_stripe_len;
        u64 logical;
        u64 logic_end;
        /* The logical increment after finishing one stripe */
        u64 increment;
        /*
         * Offset inside the chunk.  Only consumed at the out: label when
         * ret >= 0, but keep it initialized for the early error exits.
         */
        u64 offset = 0;
        u64 stripe_logical;

        /* Extent_path should be released by now. */
        ASSERT(sctx->extent_path.nodes[0] == NULL);

        scrub_blocked_if_needed(fs_info);

        if (sctx->is_dev_replace &&
            btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
                mutex_lock(&sctx->wr_lock);
                sctx->write_pointer = physical;
                mutex_unlock(&sctx->wr_lock);
        }

        /* Prepare the extra data stripes used by RAID56. */
        if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                ASSERT(sctx->raid56_data_stripes == NULL);

                sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map),
                                                    sizeof(struct scrub_stripe),
                                                    GFP_KERNEL);
                if (!sctx->raid56_data_stripes) {
                        ret = -ENOMEM;
                        goto out;
                }
                for (int i = 0; i < nr_data_stripes(map); i++) {
                        ret = init_scrub_stripe(fs_info,
                                                &sctx->raid56_data_stripes[i]);
                        if (ret < 0)
                                goto out;
                        sctx->raid56_data_stripes[i].bg = bg;
                        sctx->raid56_data_stripes[i].sctx = sctx;
                }
        }
        /*
         * There used to be a big double loop to handle all profiles using the
         * same routine, which grew larger and more gross over time.
         *
         * So here we handle each profile differently, so simpler profiles
         * have simpler scrubbing functions.
         */
        if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
                         BTRFS_BLOCK_GROUP_RAID56_MASK))) {
                /*
                 * The above check rules out all complex profiles, the
                 * remaining profiles are SINGLE|DUP|RAID1|RAID1C*, which are
                 * simple mirrored duplication without striping.
                 *
                 * Only @physical and @mirror_num need to be calculated using
                 * @stripe_index.
                 */
                ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
                                scrub_dev, map->stripes[stripe_index].physical,
                                stripe_index + 1);
                offset = 0;
                goto out;
        }
        if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
                ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index);
                offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes);
                goto out;
        }

        /* Only RAID56 goes through the old code */
        ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
        ret = 0;

        /* Calculate the logical end of the stripe */
        get_raid56_logic_offset(physical_end, stripe_index,
                                map, &logic_end, NULL);
        logic_end += chunk_logical;

        /* Initialize @offset in case we need to go to out: label */
        get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
        increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map));

        /*
         * Due to the rotation, for RAID56 it's better to iterate each stripe
         * using their physical offset.
         */
        while (physical < physical_end) {
                ret = get_raid56_logic_offset(physical, stripe_index, map,
                                              &logical, &stripe_logical);
                logical += chunk_logical;
                if (ret) {
                        /* It is a parity stripe */
                        stripe_logical += chunk_logical;
                        ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg,
                                                         map, stripe_logical);
                        if (ret)
                                goto out;
                        goto next;
                }

                /*
                 * Now we're at a data stripe, scrub each extent in the range.
                 *
                 * At this stage, if we ignore the repair part, inside each data
                 * stripe it is no different than SINGLE profile.
                 * We can reuse scrub_simple_mirror() here, as the repair part
                 * is still based on @mirror_num.
                 */
                ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
                                          scrub_dev, physical, 1);
                if (ret < 0)
                        goto out;
next:
                logical += increment;
                physical += BTRFS_STRIPE_LEN;
                /* Record the progress, the statistics are under @stat_lock. */
                spin_lock(&sctx->stat_lock);
                sctx->stat.last_physical = physical;
                spin_unlock(&sctx->stat_lock);
        }
out:
        /* Keep the first error, a flush failure only matters if we had none. */
        ret2 = flush_scrub_stripes(sctx);
        if (!ret)
                ret = ret2;
        btrfs_release_path(&sctx->extent_path);
        btrfs_release_path(&sctx->csum_path);

        if (sctx->raid56_data_stripes) {
                for (int i = 0; i < nr_data_stripes(map); i++)
                        release_scrub_stripe(&sctx->raid56_data_stripes[i]);
                kfree(sctx->raid56_data_stripes);
                sctx->raid56_data_stripes = NULL;
        }

        if (sctx->is_dev_replace && ret >= 0) {
                ret2 = sync_write_pointer_for_zoned(sctx,
                                chunk_logical + offset,
                                map->stripes[stripe_index].physical,
                                physical_end);
                if (ret2)
                        ret = ret2;
        }

        /* Positive values are internal only, callers just see 0 or -errno. */
        return ret < 0 ? ret : 0;
}

/*
 * Scrub the device extent of @scrub_dev starting at @dev_offset, which backs
 * block group @bg.
 *
 * Every stripe of the chunk map which lives on @scrub_dev at @dev_offset is
 * handed to scrub_stripe().
 *
 * Return 0 on success (including the case where the chunk map does not match
 * the block group or the block group was already removed), -EINVAL if no
 * chunk map exists for a block group that is not removed, or the error
 * returned by scrub_stripe().
 */
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
                                          struct btrfs_block_group *bg,
                                          struct btrfs_device *scrub_dev,
                                          u64 dev_offset,
                                          u64 dev_extent_len)
{
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        struct btrfs_chunk_map *map;
        int ret = 0;

        map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
        if (!map) {
                bool removed;

                /*
                 * The block group may have been unused and deleted by the
                 * cleaner kthread or by relocation, that is not an error.
                 */
                spin_lock(&bg->lock);
                removed = test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags);
                spin_unlock(&bg->lock);

                return removed ? 0 : -EINVAL;
        }

        /* Only scrub when the map really describes this device extent. */
        if (map->start == bg->start && map->chunk_len >= dev_extent_len) {
                for (int nr = 0; nr < map->num_stripes; nr++) {
                        if (map->stripes[nr].dev->bdev != scrub_dev->bdev)
                                continue;
                        if (map->stripes[nr].physical != dev_offset)
                                continue;

                        ret = scrub_stripe(sctx, bg, map, scrub_dev, nr);
                        if (ret)
                                break;
                }
        }

        btrfs_free_chunk_map(map);

        return ret;
}

/*
 * On a zoned filesystem, wait for the pending reservations, nocow writers and
 * ordered extents of block group @cache, then commit a transaction joined on
 * @root.  Does nothing on a non-zoned filesystem.
 *
 * Return 0 on success or when not zoned, otherwise the error from joining or
 * committing the transaction.
 */
static int finish_extent_writes_for_zoned(struct btrfs_root *root,
                                          struct btrfs_block_group *cache)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_trans_handle *trans;
        int ret = 0;

        if (btrfs_is_zoned(fs_info)) {
                btrfs_wait_block_group_reservations(cache);
                btrfs_wait_nocow_writers(cache);
                btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
                                         cache->length);

                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans))
                        ret = PTR_ERR(trans);
                else
                        ret = btrfs_commit_transaction(trans);
        }

        return ret;
}

/*
 * Walk the device extent items of @scrub_dev in the commit root of the device
 * tree and scrub every block group whose device extent overlaps the physical
 * range [@start, @end).
 *
 * Each block group is frozen and, when possible, marked read-only while it is
 * being scrubbed by scrub_chunk().
 *
 * Return 0 on success or a negative errno.  Besides errors from the callees,
 * -EIO is returned when a dev-replace hit write errors and -ENOMEM when the
 * scrub statistics recorded allocation failures.
 */
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                           struct btrfs_device *scrub_dev, u64 start, u64 end)
{
        struct btrfs_dev_extent *dev_extent = NULL;
        struct btrfs_path *path;
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        struct btrfs_root *root = fs_info->dev_root;
        u64 chunk_offset;
        int ret = 0;
        int ro_set;
        int slot;
        struct extent_buffer *l;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_block_group *cache;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        path->reada = READA_FORWARD;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        key.objectid = scrub_dev->devid;
        key.offset = 0ull;
        key.type = BTRFS_DEV_EXTENT_KEY;

        while (1) {
                u64 dev_extent_len;

                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
                        break;
                if (ret > 0) {
                        if (path->slots[0] >=
                            btrfs_header_nritems(path->nodes[0])) {
                                ret = btrfs_next_leaf(root, path);
                                if (ret < 0)
                                        break;
                                if (ret > 0) {
                                        ret = 0;
                                        break;
                                }
                        } else {
                                ret = 0;
                        }
                }

                l = path->nodes[0];
                slot = path->slots[0];

                btrfs_item_key_to_cpu(l, &found_key, slot);

                if (found_key.objectid != scrub_dev->devid)
                        break;

                if (found_key.type != BTRFS_DEV_EXTENT_KEY)
                        break;

                if (found_key.offset >= end)
                        break;

                if (found_key.offset < key.offset)
                        break;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                dev_extent_len = btrfs_dev_extent_length(l, dev_extent);

                if (found_key.offset + dev_extent_len <= start)
                        goto skip;

                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

                /*
                 * Get a reference on the corresponding block group to prevent
                 * the chunk from going away while we scrub it.
                 */
                cache = btrfs_lookup_block_group(fs_info, chunk_offset);

                /*
                 * Some chunks are removed but not committed to disk yet,
                 * continue scrubbing.
                 */
                if (!cache)
                        goto skip;

                ASSERT(cache->start <= chunk_offset);
                /*
                 * We are using the commit root to search for device extents, so
                 * that means we could have found a device extent item from a
                 * block group that was deleted in the current transaction. The
                 * logical start offset of the deleted block group, stored at
                 * @chunk_offset, might be part of the logical address range of
                 * a new block group (which uses different physical extents).
                 * In this case btrfs_lookup_block_group() has returned the new
                 * block group, and its start address is less than @chunk_offset.
                 *
                 * We skip such new block groups, because it's pointless to
                 * process them, as we won't find their extents because we search
                 * for them using the commit root of the extent tree. For a device
                 * replace it's also fine to skip it, we won't miss copying them
                 * to the target device because we have the write duplication
                 * setup through the regular write path (by btrfs_map_block()),
                 * and we have committed a transaction when we started the device
                 * replace, right after setting up the device replace state.
                 */
                if (cache->start < chunk_offset) {
                        btrfs_put_block_group(cache);
                        goto skip;
                }

                if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
                        if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
                                btrfs_put_block_group(cache);
                                goto skip;
                        }
                }

                /*
                 * Make sure that while we are scrubbing the corresponding block
                 * group doesn't get its logical address and its device extents
                 * reused for another block group, which can possibly be of a
                 * different type and different profile. We do this to prevent
                 * false error detections and crashes due to bogus attempts to
                 * repair extents.
                 */
                spin_lock(&cache->lock);
                if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
                        spin_unlock(&cache->lock);
                        btrfs_put_block_group(cache);
                        goto skip;
                }
                btrfs_freeze_block_group(cache);
                spin_unlock(&cache->lock);

                /*
                 * We need to call btrfs_inc_block_group_ro() with scrubs
                 * paused, to avoid a deadlock caused by:
                 * btrfs_inc_block_group_ro()
                 * -> btrfs_wait_for_commit()
                 * -> btrfs_commit_transaction()
                 * -> btrfs_scrub_pause()
                 */
                scrub_pause_on(fs_info);

                /*
                 * Don't do chunk preallocation for scrub.
                 *
                 * This is especially important for SYSTEM bgs, or we can hit
                 * -EFBIG from btrfs_finish_chunk_alloc() like:
                 * 1. The only SYSTEM bg is marked RO.
                 *    Since SYSTEM bg is small, that's pretty common.
                 * 2. New SYSTEM bg will be allocated
                 *    Due to regular version will allocate new chunk.
                 * 3. New SYSTEM bg is empty and will get cleaned up
                 *    Before cleanup really happens, it's marked RO again.
                 * 4. Empty SYSTEM bg get scrubbed
                 *    We go back to 2.
                 *
                 * This can easily boost the amount of SYSTEM chunks if cleaner
                 * thread can't be triggered fast enough, and use up all space
                 * of btrfs_super_block::sys_chunk_array
                 *
                 * While for dev replace, we need to try our best to mark block
                 * group RO, to prevent race between:
                 * - Write duplication
                 *   Contains latest data
                 * - Scrub copy
                 *   Contains data from commit tree
                 *
                 * If target block group is not marked RO, nocow writes can
                 * be overwritten by scrub copy, causing data corruption.
                 * So for dev-replace, it's not allowed to continue if a block
                 * group is not RO.
                 */
                ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
                if (!ret && sctx->is_dev_replace) {
                        ret = finish_extent_writes_for_zoned(root, cache);
                        if (ret) {
                                btrfs_dec_block_group_ro(cache);
                                scrub_pause_off(fs_info);
                                /*
                                 * The block group was frozen above, drop the
                                 * freeze before dropping our reference, like
                                 * every other exit path taken after
                                 * btrfs_freeze_block_group().
                                 */
                                btrfs_unfreeze_block_group(cache);
                                btrfs_put_block_group(cache);
                                break;
                        }
                }

                if (ret == 0) {
                        ro_set = 1;
                } else if (ret == -ENOSPC && !sctx->is_dev_replace &&
                           !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
                        /*
                         * btrfs_inc_block_group_ro return -ENOSPC when it
                         * failed in creating new chunk for metadata.
                         * It is not a problem for scrub, because
                         * metadata are always cowed, and our scrub paused
                         * commit_transactions.
                         *
                         * For RAID56 chunks, we have to mark them read-only
                         * for scrub, as later we would use our own cache
                         * out of RAID56 realm.
                         * Thus we want the RAID56 bg to be marked RO to
                         * prevent RMW from screwing up our cache.
                         */
                        ro_set = 0;
                } else if (ret == -ETXTBSY) {
                        btrfs_warn(fs_info,
                   "skipping scrub of block group %llu due to active swapfile",
                                   cache->start);
                        scrub_pause_off(fs_info);
                        ret = 0;
                        goto skip_unfreeze;
                } else {
                        btrfs_warn(fs_info,
                                   "failed setting block group ro: %d", ret);
                        btrfs_unfreeze_block_group(cache);
                        btrfs_put_block_group(cache);
                        scrub_pause_off(fs_info);
                        break;
                }

                /*
                 * Now the target block is marked RO, wait for nocow writes to
                 * finish before dev-replace.
                 * COW is fine, as COW never overwrites extents in commit tree.
                 */
                if (sctx->is_dev_replace) {
                        btrfs_wait_nocow_writers(cache);
                        btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
                                        cache->length);
                }

                scrub_pause_off(fs_info);
                down_write(&dev_replace->rwsem);
                dev_replace->cursor_right = found_key.offset + dev_extent_len;
                dev_replace->cursor_left = found_key.offset;
                dev_replace->item_needs_writeback = 1;
                up_write(&dev_replace->rwsem);

                ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
                                  dev_extent_len);
                if (sctx->is_dev_replace &&
                    !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
                                                      cache, found_key.offset))
                        ro_set = 0;

                down_write(&dev_replace->rwsem);
                dev_replace->cursor_left = dev_replace->cursor_right;
                dev_replace->item_needs_writeback = 1;
                up_write(&dev_replace->rwsem);

                if (ro_set)
                        btrfs_dec_block_group_ro(cache);

                /*
                 * We might have prevented the cleaner kthread from deleting
                 * this block group if it was already unused because we raced
                 * and set it to RO mode first. So add it back to the unused
                 * list, otherwise it might not ever be deleted unless a manual
                 * balance is triggered or it becomes used and unused again.
                 */
                spin_lock(&cache->lock);
                if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
                    !cache->ro && cache->reserved == 0 && cache->used == 0) {
                        spin_unlock(&cache->lock);
                        if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
                                btrfs_discard_queue_work(&fs_info->discard_ctl,
                                                         cache);
                        else
                                btrfs_mark_bg_unused(cache);
                } else {
                        spin_unlock(&cache->lock);
                }
skip_unfreeze:
                btrfs_unfreeze_block_group(cache);
                btrfs_put_block_group(cache);
                if (ret)
                        break;
                if (sctx->is_dev_replace &&
                    atomic64_read(&dev_replace->num_write_errors) > 0) {
                        ret = -EIO;
                        break;
                }
                if (sctx->stat.malloc_errors > 0) {
                        ret = -ENOMEM;
                        break;
                }
skip:
                /* Continue the search right after the current device extent. */
                key.offset = found_key.offset + dev_extent_len;
                btrfs_release_path(path);
        }

        btrfs_free_path(path);

        return ret;
}

/*
 * Read one super block copy of @dev located at @physical into @page and
 * verify it.
 *
 * The copy must pass the checksum check, carry the expected @generation and
 * pass btrfs_validate_super().
 *
 * Return 0 if the copy is fine, the read error if the bio failed, -EIO on a
 * bad checksum, -EUCLEAN on a generation mismatch, or the result of
 * btrfs_validate_super().
 */
static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
                           struct page *page, u64 physical, u64 generation)
{
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        struct btrfs_super_block *sb = page_address(page);
        struct bio_vec bvec;
        struct bio bio;
        u64 found_gen;
        int ret;

        /* Synchronously read the super block using an on-stack bio. */
        bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
        bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
        __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
        ret = submit_bio_wait(&bio);
        bio_uninit(&bio);
        if (ret < 0)
                return ret;

        if (btrfs_check_super_csum(fs_info, sb) != 0) {
                btrfs_err_rl(fs_info,
                        "super block at physical %llu devid %llu has bad csum",
                        physical, dev->devid);
                return -EIO;
        }

        found_gen = btrfs_super_generation(sb);
        if (found_gen != generation) {
                btrfs_err_rl(fs_info,
"super block at physical %llu devid %llu has bad generation %llu expect %llu",
                             physical, dev->devid, found_gen, generation);
                return -EUCLEAN;
        }

        return btrfs_validate_super(fs_info, sb, -1);
}

/*
 * Verify all super block copies of @scrub_dev.
 *
 * Every copy that cannot be located or fails scrub_one_super() is accounted
 * in the super_errors statistic; such failures do not fail the function.
 *
 * Return -EROFS if the filesystem is in an error state, -ENOMEM if the
 * temporary page cannot be allocated, 0 otherwise.
 */
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
                                           struct btrfs_device *scrub_dev)
{
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        struct page *page;
        u64 gen;

        if (BTRFS_FS_ERROR(fs_info))
                return -EROFS;

        page = alloc_page(GFP_KERNEL);
        if (!page) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.malloc_errors++;
                spin_unlock(&sctx->stat_lock);
                return -ENOMEM;
        }

        /* Seed devices of a new filesystem have their own generation. */
        if (scrub_dev->fs_devices == fs_info->fs_devices)
                gen = btrfs_get_last_trans_committed(fs_info);
        else
                gen = scrub_dev->generation;

        for (int mirror = 0; mirror < BTRFS_SUPER_MIRROR_MAX; mirror++) {
                u64 bytenr;
                int ret;

                ret = btrfs_sb_log_location(scrub_dev, mirror, 0, &bytenr);
                if (ret == -ENOENT)
                        break;

                if (!ret) {
                        /* Stop at the first copy beyond the device size. */
                        if (bytenr + BTRFS_SUPER_INFO_SIZE >
                            scrub_dev->commit_total_bytes)
                                break;
                        if (!btrfs_check_super_location(scrub_dev, bytenr))
                                continue;

                        ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen);
                }

                /* Either locating or verifying this copy failed. */
                if (ret) {
                        spin_lock(&sctx->stat_lock);
                        sctx->stat.super_errors++;
                        spin_unlock(&sctx->stat_lock);
                }
        }
        __free_page(page);
        return 0;
}

/*
 * Drop one reference on fs_info->scrub_workers.
 *
 * When the last reference goes away the workqueue pointer is cleared under
 * scrub_lock and the workqueue is destroyed after the lock is released.
 */
static void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
        struct workqueue_struct *wq;

        /* Returns with scrub_lock held only if the refcount dropped to 0. */
        if (!refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
                                         &fs_info->scrub_lock))
                return;

        wq = fs_info->scrub_workers;
        fs_info->scrub_workers = NULL;
        mutex_unlock(&fs_info->scrub_lock);

        if (wq)
                destroy_workqueue(wq);
}

/*
 * Get a reference on fs_info->scrub_workers, creating the workqueue if it
 * does not exist yet.
 *
 * Return 0 on success, -ENOMEM if the workqueue cannot be allocated.
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info)
{
        const unsigned int wq_flags = WQ_FREEZABLE | WQ_UNBOUND;
        const int max_active = fs_info->thread_pool_size;
        struct workqueue_struct *new_wq;

        /* Fast path, the workers already exist. */
        if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
                return 0;

        new_wq = alloc_workqueue("btrfs-scrub", wq_flags, max_active);
        if (!new_wq)
                return -ENOMEM;

        mutex_lock(&fs_info->scrub_lock);
        if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
                /* We are the first user, install our workqueue. */
                ASSERT(fs_info->scrub_workers == NULL);
                fs_info->scrub_workers = new_wq;
                refcount_set(&fs_info->scrub_workers_refcnt, 1);
                new_wq = NULL;
        } else {
                /* Another thread raced in and created the workers for us. */
                refcount_inc(&fs_info->scrub_workers_refcnt);
        }
        mutex_unlock(&fs_info->scrub_lock);

        /* Our own workqueue is not needed if we lost the race. */
        if (new_wq)
                destroy_workqueue(new_wq);

        return 0;
}

/*
 * Run a scrub on the device identified by @devid.
 *
 * @fs_info:        filesystem whose fs_devices list is searched for @devid
 * @devid:          id of the device to scrub
 * @start:          passed through to scrub_enumerate_chunks()
 * @end:            passed through to scrub_enumerate_chunks()
 * @progress:       if non-NULL, receives a copy of sctx->stat once the run
 *                  has ended
 * @readonly:       stored in sctx->readonly; when zero (and not a device
 *                  replace) the device must be writeable
 * @is_dev_replace: forwarded to scrub_setup_ctx(); when nonzero a missing
 *                  device is accepted, the super block check is skipped and
 *                  the start/finish info messages are not printed
 *                  (presumably set by the device-replace code; verify against
 *                  the caller)
 *
 * Returns 0 on success or a negative errno, among them:
 *   -EAGAIN       the filesystem is closing
 *   -ENODEV       device not found, or missing while !@is_dev_replace
 *   -EROFS        read-write scrub requested on a non-writeable device
 *   -EIO          device is not in the fs metadata or is a replace target
 *   -EINPROGRESS  the device already has a scrub context, or a device
 *                 replace is ongoing while !@is_dev_replace
 * plus whatever scrub_setup_ctx(), scrub_workers_get(), scrub_supers(),
 * scrub_enumerate_chunks() or the final transaction start/commit return.
 */
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                    u64 end, struct btrfs_scrub_progress *progress,
                    int readonly, int is_dev_replace)
{
        struct btrfs_dev_lookup_args args = { .devid = devid };
        struct scrub_ctx *sctx;
        int ret;
        struct btrfs_device *dev;
        unsigned int nofs_flag;
        bool need_commit = false;

        if (btrfs_fs_closing(fs_info))
                return -EAGAIN;

        /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
        ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);

        /*
         * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
         * value (max nodesize / min sectorsize), thus nodesize should always
         * be fine.
         */
        ASSERT(fs_info->nodesize <=
               SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);

        /* Allocate outside of device_list_mutex */
        sctx = scrub_setup_ctx(fs_info, is_dev_replace);
        if (IS_ERR(sctx))
                return PTR_ERR(sctx);

        /* Pin (or create) the scrub workqueue; dropped again on every exit. */
        ret = scrub_workers_get(fs_info);
        if (ret)
                goto out_free_ctx;

        /*
         * Look up and validate the device. Every failure below releases the
         * locks it holds before jumping to the common cleanup.
         */
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        dev = btrfs_find_device(fs_info->fs_devices, &args);
        if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
                     !is_dev_replace)) {
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                ret = -ENODEV;
                goto out;
        }

        /* A scrub that is allowed to repair needs a writeable device. */
        if (!is_dev_replace && !readonly &&
            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                btrfs_err_in_rcu(fs_info,
                        "scrub on devid %llu: filesystem on %s is not writable",
                                 devid, btrfs_dev_name(dev));
                ret = -EROFS;
                goto out;
        }

        /* scrub_lock nests inside device_list_mutex here. */
        mutex_lock(&fs_info->scrub_lock);
        if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
            test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
                mutex_unlock(&fs_info->scrub_lock);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                ret = -EIO;
                goto out;
        }

        /*
         * Only one scrub context per device, and no plain scrub while a
         * device replace is ongoing.
         */
        down_read(&fs_info->dev_replace.rwsem);
        if (dev->scrub_ctx ||
            (!is_dev_replace &&
             btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
                up_read(&fs_info->dev_replace.rwsem);
                mutex_unlock(&fs_info->scrub_lock);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                ret = -EINPROGRESS;
                goto out;
        }
        up_read(&fs_info->dev_replace.rwsem);

        /*
         * Attach the context to the device while both device_list_mutex and
         * scrub_lock are held; scrub_lock stays held past this point.
         */
        sctx->readonly = readonly;
        dev->scrub_ctx = sctx;
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);

        /*
         * checking @scrub_pause_req here, we can avoid
         * race between committing transaction and scrubbing.
         */
        __scrub_blocked_if_needed(fs_info);
        atomic_inc(&fs_info->scrubs_running);
        mutex_unlock(&fs_info->scrub_lock);

        /*
         * In order to avoid deadlock with reclaim when there is a transaction
         * trying to pause scrub, make sure we use GFP_NOFS for all the
         * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
         * invoked by our callees. The pausing request is done when the
         * transaction commit starts, and it blocks the transaction until scrub
         * is paused (done at specific points at scrub_stripe() or right above
         * before incrementing fs_info->scrubs_running).
         *
         * NOTE(review): the function names mentioned above are not visible in
         * this part of the file and may be stale - confirm.
         */
        nofs_flag = memalloc_nofs_save();
        if (!is_dev_replace) {
                u64 old_super_errors;

                /* Snapshot the counter so new super block errors can be detected. */
                spin_lock(&sctx->stat_lock);
                old_super_errors = sctx->stat.super_errors;
                spin_unlock(&sctx->stat_lock);

                btrfs_info(fs_info, "scrub: started on devid %llu", devid);
                /*
                 * by holding device list mutex, we can
                 * kick off writing super in log tree sync.
                 *
                 * NOTE(review): "kick off" presumably means "keep out" here,
                 * i.e. the mutex excludes concurrent super block writes -
                 * confirm.
                 */
                mutex_lock(&fs_info->fs_devices->device_list_mutex);
                ret = scrub_supers(sctx, dev);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);

                spin_lock(&sctx->stat_lock);
                /*
                 * Super block errors found, but we can not commit transaction
                 * at current context, since btrfs_commit_transaction() needs
                 * to pause the current running scrub (held by ourselves).
                 */
                if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
                        need_commit = true;
                spin_unlock(&sctx->stat_lock);
        }

        /*
         * ret is still uninitialized for a device replace, but the
         * !is_dev_replace branch above is then skipped, so the short-circuit
         * below is only reached with ret set by scrub_supers().
         * NOTE(review): for is_dev_replace, ret is read here without having
         * been assigned since scrub_workers_get() returned 0 - it is 0 from
         * that call, so the chunks are always enumerated.
         */
        if (!ret)
                ret = scrub_enumerate_chunks(sctx, dev, start, end);
        memalloc_nofs_restore(nofs_flag);

        /* This scrub no longer counts as running; wake pause/cancel waiters. */
        atomic_dec(&fs_info->scrubs_running);
        wake_up(&fs_info->scrub_pause_wait);

        /* Hand the final statistics back to the caller, if requested. */
        if (progress)
                memcpy(progress, &sctx->stat, sizeof(*progress));

        if (!is_dev_replace)
                btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
                        ret ? "not finished" : "finished", devid, ret);

        /* Detach the context from the device under scrub_lock. */
        mutex_lock(&fs_info->scrub_lock);
        dev->scrub_ctx = NULL;
        mutex_unlock(&fs_info->scrub_lock);

        scrub_workers_put(fs_info);
        scrub_put_ctx(sctx);

        /*
         * We found some super block errors before, now try to force a
         * transaction commit, as scrub has finished.
         *
         * NOTE(review): on this path the result of the scrub itself held in
         * ret is overwritten by the transaction start/commit result - confirm
         * that this is intended.
         */
        if (need_commit) {
                struct btrfs_trans_handle *trans;

                trans = btrfs_start_transaction(fs_info->tree_root, 0);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        btrfs_err(fs_info,
        "scrub: failed to start transaction to fix super block errors: %d", ret);
                        return ret;
                }
                ret = btrfs_commit_transaction(trans);
                if (ret < 0)
                        btrfs_err(fs_info,
        "scrub: failed to commit transaction to fix super block errors: %d", ret);
        }
        return ret;
out:
        /* Setup failed after the workers were pinned: unpin them. */
        scrub_workers_put(fs_info);
out_free_ctx:
        /* The context was never attached to a device; free it directly. */
        scrub_free_ctx(sctx);

        return ret;
}

/*
 * Raise a pause request and block until the number of paused scrubs equals
 * the number of running scrubs.
 *
 * fs_info->scrub_pause_req is incremented under scrub_lock. The lock is
 * dropped while sleeping on fs_info->scrub_pause_wait and the counters are
 * compared again once it has been re-taken.
 */
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
        mutex_lock(&fs_info->scrub_lock);
        atomic_inc(&fs_info->scrub_pause_req);

        for (;;) {
                /* Re-check with scrub_lock held. */
                if (atomic_read(&fs_info->scrubs_paused) ==
                    atomic_read(&fs_info->scrubs_running))
                        break;

                /* Never sleep with scrub_lock held. */
                mutex_unlock(&fs_info->scrub_lock);
                wait_event(fs_info->scrub_pause_wait,
                           atomic_read(&fs_info->scrubs_paused) ==
                           atomic_read(&fs_info->scrubs_running));
                mutex_lock(&fs_info->scrub_lock);
        }

        mutex_unlock(&fs_info->scrub_lock);
}

/*
 * Withdraw one pause request: decrement fs_info->scrub_pause_req and wake
 * everything sleeping on fs_info->scrub_pause_wait so the waiters re-evaluate
 * their conditions.
 */
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
        /* Drop the request first so woken waiters see the updated count. */
        atomic_dec(&fs_info->scrub_pause_req);
        wake_up(&fs_info->scrub_pause_wait);
}

/*
 * Cancel every running scrub on the filesystem and wait for them to stop.
 *
 * fs_info->scrub_cancel_req is raised for as long as we wait for
 * fs_info->scrubs_running to reach zero, then lowered again.
 *
 * Returns 0 once no scrub is running any more, or -ENOTCONN when no scrub
 * was running in the first place.
 */
int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
        int result = -ENOTCONN;

        mutex_lock(&fs_info->scrub_lock);
        if (atomic_read(&fs_info->scrubs_running)) {
                atomic_inc(&fs_info->scrub_cancel_req);
                while (atomic_read(&fs_info->scrubs_running)) {
                        /* Sleep without scrub_lock, re-check with it held. */
                        mutex_unlock(&fs_info->scrub_lock);
                        wait_event(fs_info->scrub_pause_wait,
                                   atomic_read(&fs_info->scrubs_running) == 0);
                        mutex_lock(&fs_info->scrub_lock);
                }
                atomic_dec(&fs_info->scrub_cancel_req);
                result = 0;
        }
        mutex_unlock(&fs_info->scrub_lock);

        return result;
}

/*
 * Cancel the scrub running on @dev and wait until its context has been
 * detached from the device.
 *
 * The cancel request is recorded in the scrub context's cancel_req counter;
 * it is only ever incremented here.
 *
 * Returns 0 once dev->scrub_ctx is NULL, or -ENOTCONN when the device had no
 * scrub context to begin with.
 */
int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
{
        struct btrfs_fs_info *fs_info = dev->fs_info;
        struct scrub_ctx *ctx;
        int result = -ENOTCONN;

        mutex_lock(&fs_info->scrub_lock);
        ctx = dev->scrub_ctx;
        if (ctx) {
                atomic_inc(&ctx->cancel_req);
                while (dev->scrub_ctx) {
                        /* Sleep without scrub_lock, re-check with it held. */
                        mutex_unlock(&fs_info->scrub_lock);
                        wait_event(fs_info->scrub_pause_wait,
                                   dev->scrub_ctx == NULL);
                        mutex_lock(&fs_info->scrub_lock);
                }
                result = 0;
        }
        mutex_unlock(&fs_info->scrub_lock);

        return result;
}

/*
 * Copy the current statistics of the scrub running on device @devid into
 * @progress.
 *
 * The device lookup and the copy are done under device_list_mutex.
 * NOTE(review): elsewhere in this file dev->scrub_ctx is cleared under
 * scrub_lock, which is not taken here - confirm the context cannot go away
 * during the copy.
 *
 * Returns 0 when @progress was filled in, -ENODEV when no device with @devid
 * exists, or -ENOTCONN when the device has no scrub context attached.
 */
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
                         struct btrfs_scrub_progress *progress)
{
        struct btrfs_dev_lookup_args args = { .devid = devid };
        struct btrfs_device *dev;
        struct scrub_ctx *ctx = NULL;
        int result = -ENODEV;

        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        dev = btrfs_find_device(fs_info->fs_devices, &args);
        if (dev) {
                ctx = dev->scrub_ctx;
                if (ctx) {
                        memcpy(progress, &ctx->stat, sizeof(*progress));
                        result = 0;
                } else {
                        result = -ENOTCONN;
                }
        }
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);

        return result;
}











































































































































































































































































































    3 













    3 








    2 

    3 



    3 










    2 

    3 










    3 

















    3 






















































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 
























































































    3 


    3 



    2 






























































    2 








    3 























    3 




























    2 

































































































































































































































































































































































































    3 

















































    2 













    2 


























    3 
































































































    3 









































    3 











    3 


























    3 
    2 










    1 













    3 





    1 
    3 







    3 

    2 




    3 








    3 














    3 










    3 
















    3 

    2 
    2 

    3 


    2 







    1 


























    3 








    3 




    1 

    2 



    3 






    2 












    3 







    3 



















    3 

    3 












    3 





    3 











































































































































































































































































































































    3 



    3 



















    3 















    3 








    3 







































































    3 





















    3 






    3 







    3 






























    3 








    2 










    2 














    2 
    3 











    2 
    3 






































    3 












    3 


    3 



    3 





    2 



    3 
    3 

    3 




    2 





    3 
    3 

    3 



















    3 






















    2 
    3 







    2 








    3 

















































































































































































































































































































































































































































































































































































































































































































































































































































































    2 















    1 


    3 

    3 

















































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/printk.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 * Modified to make sys_syslog() more flexible: added commands to
 * return the last 4k of kernel messages, regardless of whether
 * they've been read or not.  Added option to suppress kernel printk's
 * to the console.  Added hook for sending the console messages
 * elsewhere, in preparation for a serial line console (someday).
 * Ted Ts'o, 2/11/93.
 * Modified for sysctl support, 1/8/97, Chris Horn.
 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
 *     manfred@colorfullife.com
 * Rewrote bits to get rid of console_lock
 *        01Mar01 Andrew Morton
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/console.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/vmcore_info.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
#include <linux/cpu.h>
#include <linux/rculist.h>
#include <linux/poll.h>
#include <linux/irq_work.h>
#include <linux/ctype.h>
#include <linux/uio.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>

#include <linux/uaccess.h>
#include <asm/sections.h>

#include <trace/events/initcall.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>

#include "printk_ringbuffer.h"
#include "console_cmdline.h"
#include "braille.h"
#include "internal.h"

/*
 * The four console log level settings, stored as one array. The trailing
 * comment on each initializer names the setting held in that slot.
 * Exported to GPL modules.
 */
int console_printk[4] = {
        CONSOLE_LOGLEVEL_DEFAULT,        /* console_loglevel */
        MESSAGE_LOGLEVEL_DEFAULT,        /* default_message_loglevel */
        CONSOLE_LOGLEVEL_MIN,                /* minimum_console_loglevel */
        CONSOLE_LOGLEVEL_DEFAULT,        /* default_console_loglevel */
};
EXPORT_SYMBOL_GPL(console_printk);

/*
 * Atomic counter, initially zero.
 * NOTE(review): the name suggests a non-zero value silences console-lock
 * warnings; its users are outside this part of the file - confirm there.
 */
atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
EXPORT_SYMBOL(ignore_console_lock_warning);

/* Make the "console" tracepoint available to GPL modules. */
EXPORT_TRACEPOINT_SYMBOL_GPL(console);

/*
 * Low level drivers may need that to know if they can schedule in
 * their unblank() callback or not. So let's export it.
 */
int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);

/*
 * console_mutex protects console_list updates and console->flags updates.
 * The flags are synchronized only for consoles that are registered, i.e.
 * accessible via the console list.
 *
 * It is taken and released through console_list_lock() and
 * console_list_unlock().
 */
static DEFINE_MUTEX(console_mutex);

/*
 * console_sem protects updates to console->seq
 * and also provides serialization for console printing.
 */
static DEFINE_SEMAPHORE(console_sem, 1);
/* Head of the console list; exported to GPL modules. */
HLIST_HEAD(console_list);
EXPORT_SYMBOL_GPL(console_list);
/*
 * SRCU domain entered and left by console_srcu_read_lock() and
 * console_srcu_read_unlock().
 */
DEFINE_STATIC_SRCU(console_srcu);

/*
 * System may need to suppress printk message under certain
 * circumstances, like after kernel panic happens.
 */
int __read_mostly suppress_printk;

#ifdef CONFIG_LOCKDEP
/*
 * Lockdep map named "console_lock". The console_sem helpers in this file
 * report their acquire/release operations against it.
 */
static struct lockdep_map console_lock_dep_map = {
        .name = "console_lock"
};

/*
 * Lockdep assertion that console_mutex (the console list lock) is held.
 * Only built when CONFIG_LOCKDEP is enabled.
 */
void lockdep_assert_console_list_lock_held(void)
{
        lockdep_assert_held(&console_mutex);
}
EXPORT_SYMBOL(lockdep_assert_console_list_lock_held);
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Report whether the caller is inside a console_srcu read-side critical
 * section, as determined by srcu_read_lock_held(). Only built when
 * CONFIG_DEBUG_LOCK_ALLOC is enabled.
 */
bool console_srcu_read_lock_is_held(void)
{
        return srcu_read_lock_held(&console_srcu);
}
EXPORT_SYMBOL(console_srcu_read_lock_is_held);
#endif

/* Bit positions of the /dev/kmsg logging policy flags kept in devkmsg_log. */
enum devkmsg_log_bits {
        __DEVKMSG_LOG_BIT_ON = 0,
        __DEVKMSG_LOG_BIT_OFF,
        __DEVKMSG_LOG_BIT_LOCK,
};

/*
 * Masks built from the bit positions above:
 *   ON   - devkmsg_write() skips its ratelimit check
 *   OFF  - devkmsg_open() fails and devkmsg_write() discards the data
 *   LOCK - the sysctl handler refuses to change the setting
 */
enum devkmsg_log_masks {
        DEVKMSG_LOG_MASK_ON             = BIT(__DEVKMSG_LOG_BIT_ON),
        DEVKMSG_LOG_MASK_OFF            = BIT(__DEVKMSG_LOG_BIT_OFF),
        DEVKMSG_LOG_MASK_LOCK           = BIT(__DEVKMSG_LOG_BIT_LOCK),
};

/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */
#define DEVKMSG_LOG_MASK_DEFAULT        0

/* Current /dev/kmsg logging policy: a combination of the masks above. */
static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;

static int __control_devkmsg(char *str)
{
        size_t len;

        if (!str)
                return -EINVAL;

        len = str_has_prefix(str, "on");
        if (len) {
                devkmsg_log = DEVKMSG_LOG_MASK_ON;
                return len;
        }

        len = str_has_prefix(str, "off");
        if (len) {
                devkmsg_log = DEVKMSG_LOG_MASK_OFF;
                return len;
        }

        len = str_has_prefix(str, "ratelimit");
        if (len) {
                devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
                return len;
        }

        return -EINVAL;
}

/*
 * Handler for the "printk.devkmsg=" kernel command line parameter.
 *
 * Applies the requested mode, mirrors it into devkmsg_log_str and then locks
 * the setting. Always returns 1; a bad option string is only warned about.
 */
static int __init control_devkmsg(char *str)
{
        if (__control_devkmsg(str) < 0) {
                pr_warn("printk.devkmsg: bad option string '%s'\n", str);
                return 1;
        }

        /* Keep the string shown through sysctl in step with the mode. */
        switch (devkmsg_log) {
        case DEVKMSG_LOG_MASK_ON:
                strscpy(devkmsg_log_str, "on");
                break;
        case DEVKMSG_LOG_MASK_OFF:
                strscpy(devkmsg_log_str, "off");
                break;
        default:
                /* "ratelimit" is what the string is initialised to. */
                break;
        }

        /*
         * A mode chosen on the kernel command line is meant to be permanent
         * for the whole runtime of the system. Setting the lock bit makes
         * the sysctl handler reject later writes, as a precaution against
         * userspace trying to change the setting behind our back.
         */
        devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;

        return 1;
}
__setup("printk.devkmsg=", control_devkmsg);

/*
 * Textual form of the devkmsg_log mode. Starts out as "ratelimit" and is
 * rewritten by the command line handler and the sysctl handler.
 */
char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
/*
 * sysctl handler for the /dev/kmsg logging mode string.
 *
 * Reads are passed straight to proc_dostring(). Writes are refused with
 * -EINVAL once the mode has been locked; otherwise the new string is stored,
 * parsed, and rolled back to the previous mode if it is not exactly one of
 * the known keywords.
 *
 * Return: 0 on success, -EINVAL for a locked or invalid setting, or the
 * error reported by proc_dostring().
 */
int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
                              void *buffer, size_t *lenp, loff_t *ppos)
{
        char saved_str[DEVKMSG_STR_MAX_SIZE];
        unsigned int saved_mode;
        int err;

        if (!write)
                return proc_dostring(table, write, buffer, lenp, ppos);

        if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK)
                return -EINVAL;

        /* Remember the current setting so that a bad write can be undone. */
        saved_mode = devkmsg_log;
        strscpy(saved_str, devkmsg_log_str);

        err = proc_dostring(table, write, buffer, lenp, ppos);
        if (err)
                return err;

        /*
         * Accept the new string only if it is a known keyword with nothing
         * trailing it: the matched length plus one must equal *lenp.
         */
        err = __control_devkmsg(devkmsg_log_str);
        if (err >= 0 && err + 1 == *lenp)
                return 0;

        /* Unknown keyword or trailing characters: restore the old setting. */
        devkmsg_log = saved_mode;
        strscpy(devkmsg_log_str, saved_str);

        return -EINVAL;
}
#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */

/**
 * console_list_lock - Lock the console list
 *
 * For console list or console->flags updates. Acquires console_mutex and
 * must be paired with console_list_unlock().
 */
void console_list_lock(void)
{
        /*
         * In unregister_console() and console_force_preferred_locked(),
         * synchronize_srcu() is called with the console_list_lock held.
         * Therefore it is not allowed that the console_list_lock is taken
         * with the srcu_lock held.
         *
         * Detecting if this context is really in the read-side critical
         * section is only possible if the appropriate debug options are
         * enabled.
         */
        WARN_ON_ONCE(debug_lockdep_rcu_enabled() &&
                     srcu_read_lock_held(&console_srcu));

        mutex_lock(&console_mutex);
}
EXPORT_SYMBOL(console_list_lock);

/**
 * console_list_unlock - Unlock the console list
 *
 * Counterpart to console_list_lock(). Releases console_mutex.
 */
void console_list_unlock(void)
{
        mutex_unlock(&console_mutex);
}
EXPORT_SYMBOL(console_list_unlock);

/**
 * console_srcu_read_lock - Register a new reader for the
 *        SRCU-protected console list
 *
 * Use for_each_console_srcu() to iterate the console list
 *
 * Enters the console_srcu read side using the NMI-safe SRCU variant.
 *
 * Context: Any context.
 * Return: A cookie to pass to console_srcu_read_unlock().
 */
int console_srcu_read_lock(void)
{
        return srcu_read_lock_nmisafe(&console_srcu);
}
EXPORT_SYMBOL(console_srcu_read_lock);

/**
 * console_srcu_read_unlock - Unregister an old reader from
 *        the SRCU-protected console list
 * @cookie: cookie returned from console_srcu_read_lock()
 *
 * Counterpart to console_srcu_read_lock(). Leaves the console_srcu read
 * side using the NMI-safe SRCU variant.
 */
void console_srcu_read_unlock(int cookie)
{
        srcu_read_unlock_nmisafe(&console_srcu, cookie);
}
EXPORT_SYMBOL(console_srcu_read_unlock);

/*
 * Helper macros to handle lockdep when locking/unlocking console_sem. We use
 * macros instead of functions so that _RET_IP_ contains useful information.
 *
 * down_console_sem() takes console_sem with down() and then reports the
 * acquisition to lockdep against console_lock_dep_map.
 */
#define down_console_sem() do { \
        down(&console_sem);\
        mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\
} while (0)

/*
 * Try to take console_sem without sleeping.
 *
 * @ip: caller address handed to lockdep when the semaphore is obtained.
 *
 * Return: 0 if the semaphore was taken, 1 if it was not available.
 */
static int __down_trylock_console_sem(unsigned long ip)
{
        unsigned long irqflags;
        int busy;

        /*
         * The trylock must run in printk safe mode, as must the up() in
         * __up_console_sem(): spindump/WARN/etc from under console ->lock
         * will deadlock in printk()->down_trylock_console_sem() otherwise.
         */
        printk_safe_enter_irqsave(irqflags);
        busy = down_trylock(&console_sem);
        printk_safe_exit_irqrestore(irqflags);

        /* Lockdep only hears about acquisitions that actually happened. */
        if (!busy)
                mutex_acquire(&console_lock_dep_map, 0, 1, ip);

        return busy ? 1 : 0;
}
#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)

/*
 * Release console_sem.
 *
 * @ip: caller address handed to lockdep for the release.
 *
 * Lockdep is told about the release first; the up() itself runs in printk
 * safe mode with interrupts disabled, for the reason given in
 * __down_trylock_console_sem().
 */
static void __up_console_sem(unsigned long ip)
{
        unsigned long flags;

        mutex_release(&console_lock_dep_map, ip);

        printk_safe_enter_irqsave(flags);
        up(&console_sem);
        printk_safe_exit_irqrestore(flags);
}
#define up_console_sem() __up_console_sem(_RET_IP_)

/* Return true if panic_cpu has been claimed, i.e. some CPU is in panic. */
static bool panic_in_progress(void)
{
        int owner = atomic_read(&panic_cpu);

        return unlikely(owner != PANIC_CPU_INVALID);
}

/* Return true if the CPU executing this code is the one that is in panic. */
bool this_cpu_in_panic(void)
{
        int owner = atomic_read(&panic_cpu);

        /*
         * raw_smp_processor_id() is good enough here: a task cannot be
         * migrated onto the panic_cpu, nor away from it. Once panic_cpu is
         * set and we are not running on that CPU, we never will be.
         */
        return unlikely(owner == raw_smp_processor_id());
}

/*
 * Return true if a panic is in progress on a remote CPU.
 *
 * On true, the local CPU should immediately release any printing resources
 * that may be needed by the panic CPU.
 */
bool other_cpu_in_panic(void)
{
        return (panic_in_progress() && !this_cpu_in_panic());
}

/*
 * This is used for debugging the mess that is the VT code by
 * keeping track if we have the console semaphore held. It's
 * definitely not the perfect debug tool (we don't know if _WE_
 * hold it and are racing, but it helps tracking those weird code
 * paths in the console code where we end up in places I want
 * locked without the console semaphore held).
 */
static int console_locked;

/*
 *        Array of consoles built from command line options (console=)
 */
static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];

/*
 * Starts out as -1.
 * NOTE(review): presumably an index into console_cmdline[] with -1 meaning
 * "none selected yet" - confirm against the code that assigns it.
 */
static int preferred_console = -1;
/*
 * NOTE(review): presumably set once a console= option has been seen on the
 * command line - confirm against the code that assigns it.
 */
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);

/* Flag: console code may call schedule() */
static int console_may_schedule;

/* Bit flags describing the console message format. */
enum con_msg_format_flags {
        MSG_FORMAT_DEFAULT        = 0,
        MSG_FORMAT_SYSLOG        = (1 << 0),
};

/* Selected message format; holds con_msg_format_flags values. */
static int console_msg_format = MSG_FORMAT_DEFAULT;

/*
 * The printk log buffer consists of a sequenced collection of records, each
 * containing variable length message text. Every record also contains its
 * own meta-data (@info).
 *
 * Every record meta-data carries the timestamp in microseconds, as well as
 * the standard userspace syslog level and syslog facility. The usual kernel
 * messages use LOG_KERN; userspace-injected messages always carry a matching
 * syslog facility, by default LOG_USER. The origin of every message can be
 * reliably determined that way.
 *
 * The human readable log message of a record is available in @text, the
 * length of the message text in @text_len. The stored message is not
 * terminated.
 *
 * Optionally, a record can carry a dictionary of properties (key/value
 * pairs), to provide userspace with a machine-readable message context.
 *
 * Examples for well-defined, commonly used property names are:
 *   DEVICE=b12:8               device identifier
 *                                b12:8         block dev_t
 *                                c127:3        char dev_t
 *                                n8            netdev ifindex
 *                                +sound:card0  subsystem:devname
 *   SUBSYSTEM=pci              driver-core subsystem name
 *
 * Valid characters in property names are [a-zA-Z0-9.-_]. Property names
 * and values are terminated by a '\0' character.
 *
 * Example of record values:
 *   record.text_buf                = "it's a line" (unterminated)
 *   record.info.seq                = 56
 *   record.info.ts_nsec            = 36863
 *   record.info.text_len           = 11
 *   record.info.facility           = 0 (LOG_KERN)
 *   record.info.flags              = 0
 *   record.info.level              = 3 (LOG_ERR)
 *   record.info.caller_id          = 299 (task 299)
 *   record.info.dev_info.subsystem = "pci" (terminated)
 *   record.info.dev_info.device    = "+pci:0000:00:01.0" (terminated)
 *
 * The 'struct printk_info' buffer must never be directly exported to
 * userspace, it is a kernel-private implementation detail that might
 * need to be changed in the future, when the requirements change.
 *
 * /dev/kmsg exports the structured data in the following line format:
 *   "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n"
 *
 * Users of the export format should ignore possible additional values
 * separated by ',', and find the message after the ';' character.
 *
 * The optional key/value pairs are attached as continuation lines starting
 * with a space character and terminated by a newline. All possible
 * non-printable characters are escaped in the "\xff" notation.
 */

/* syslog_lock protects syslog_* variables and write access to clear_seq. */
static DEFINE_MUTEX(syslog_lock);

#ifdef CONFIG_PRINTK
/*
 * Wait queue for log readers: devkmsg_read() sleeps on it and
 * devkmsg_poll() registers with it.
 */
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
/*
 * NOTE(review): syslog_partial and syslog_time are only declared here; their
 * users are further down the file - see those for the exact meaning.
 */
static size_t syslog_partial;
static bool syslog_time;

/*
 * A 64-bit sequence number that can be read without taking a lock.
 *
 * @latch: seqcount latch; its low bit selects which copy a reader uses
 * @val:   the two copies of the value
 *
 * Written by latched_seq_write(), read by latched_seq_read_nolock().
 */
struct latched_seq {
        seqcount_latch_t        latch;
        u64                        val[2];
};

/*
 * The next printk record to read after the last 'clear' command. There are
 * two copies (updated with seqcount_latch) so that reads can locklessly
 * access a valid value. Writers are synchronized by @syslog_lock.
 */
static struct latched_seq clear_seq = {
        .latch                = SEQCNT_LATCH_ZERO(clear_seq.latch),
        .val[0]                = 0,
        .val[1]                = 0,
};

/*
 * Split a syslog priority value: the low 3 bits are the log level, the
 * following 8 bits the facility.
 */
#define LOG_LEVEL(v)                ((v) & 0x07)
#define LOG_FACILITY(v)                (((v) >> 3) & 0xff)

/* record buffer */
#define LOG_ALIGN __alignof__(unsigned long)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
/*
 * Upper bound for the log buffer length: 2^31 bytes. The one is converted to
 * u32 before shifting because "1 << 31" shifts a signed int into its sign
 * bit, which is undefined behaviour in ISO C. The outer parentheses make the
 * macro expand as a single operand.
 */
#define LOG_BUF_LEN_MAX ((u32)1 << 31)
/* Statically allocated log buffer of __LOG_BUF_LEN bytes, LOG_ALIGN aligned. */
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
/*
 * Address and length of the log buffer, as returned by log_buf_addr_get()
 * and log_buf_len_get(). Both start out describing the static buffer.
 */
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;

/*
 * Define the average message size. This only affects the number of
 * descriptors that will be available. Underestimating is better than
 * overestimating (too many available descriptors is better than not enough).
 */
#define PRB_AVGBITS 5        /* 32 character average length */

/* The descriptor count below is derived from the difference of the two. */
#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS
#error CONFIG_LOG_BUF_SHIFT value too small.
#endif
/* Static ring buffer that uses __log_buf for its text data. */
_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
                 PRB_AVGBITS, &__log_buf[0]);

/*
 * NOTE(review): presumably takes over when a larger buffer is set up at
 * boot; that code is outside this part of the file - confirm there.
 */
static struct printk_ringbuffer printk_rb_dynamic;

/* The ring buffer in use; initially the static one. */
struct printk_ringbuffer *prb = &printk_rb_static;

/*
 * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
 * per_cpu_areas are initialised. This variable is set to true when
 * it's safe to access per-CPU data.
 */
static bool __printk_percpu_data_ready __ro_after_init;

/* Return true once per-CPU data may be accessed (see the flag above). */
bool printk_percpu_data_ready(void)
{
        return __printk_percpu_data_ready;
}

/*
 * Store @val into both copies of @ls.
 *
 * The latch is advanced before each copy is written. Readers choose their
 * copy from the low bit of the latch and retry if the latch has moved, see
 * latched_seq_read_nolock(). The statement order here must not be changed.
 *
 * Must be called under syslog_lock.
 */
static void latched_seq_write(struct latched_seq *ls, u64 val)
{
        raw_write_seqcount_latch(&ls->latch);
        ls->val[0] = val;
        raw_write_seqcount_latch(&ls->latch);
        ls->val[1] = val;
}

/*
 * Read the value of @ls without taking any lock.
 *
 * Can be called from any context.
 */
static u64 latched_seq_read_nolock(struct latched_seq *ls)
{
        unsigned int seq;
        unsigned int idx;
        u64 val;

        do {
                seq = raw_read_seqcount_latch(&ls->latch);
                /* The low bit of the latch selects which copy to read. */
                idx = seq & 0x1;
                val = ls->val[idx];
                /* Try again if the latch changed while the copy was read. */
        } while (raw_read_seqcount_latch_retry(&ls->latch, seq));

        return val;
}

/* Return log buffer address, i.e. the current value of log_buf. */
char *log_buf_addr_get(void)
{
        return log_buf;
}

/* Return log buffer size in bytes, i.e. the current value of log_buf_len. */
u32 log_buf_len_get(void)
{
        return log_buf_len;
}

/*
 * Upper limit on how much of the log buffer a single message may occupy,
 * expressed as a divisor of the buffer length. The value must be greater
 * than two. Note that only half of the buffer is available when the index
 * points to the middle.
 */
#define MAX_LOG_TAKE_PART 4
static const char trunc_msg[] = "<truncated>";

/*
 * Cut an over-long message down and make room for the truncation marker.
 *
 * @text_len:      in/out, length of the message text
 * @trunc_msg_len: out, length of the marker to append, or 0 if the remaining
 *                 text is too short to give up that many bytes
 */
static void truncate_msg(u16 *text_len, u16 *trunc_msg_len)
{
        /*
         * One message must not fill the whole buffer, otherwise it might
         * get removed too soon.
         */
        const u32 limit = log_buf_len / MAX_LOG_TAKE_PART;

        if (*text_len > limit)
                *text_len = limit;

        /* Announce the truncation, but only if the text can spare the room. */
        *trunc_msg_len = strlen(trunc_msg);
        if (*text_len < *trunc_msg_len) {
                *trunc_msg_len = 0;
                return;
        }

        *text_len -= *trunc_msg_len;
}

/* Non-zero when reading the kernel log is restricted; build-time default. */
int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);

/*
 * Tell whether syslog action @type is subject to the permission check in
 * check_syslog_permissions().
 *
 * Return: 1 if restricted, 0 if not.
 */
static int syslog_action_restricted(int type)
{
        if (dmesg_restrict)
                return 1;

        /*
         * Unless restricted, "read all" and "get buffer size" are open to
         * everybody; every other action is restricted.
         */
        switch (type) {
        case SYSLOG_ACTION_READ_ALL:
        case SYSLOG_ACTION_SIZE_BUFFER:
                return 0;
        default:
                return 1;
        }
}

/*
 * Check whether the caller may perform syslog action @type.
 *
 * @type:   SYSLOG_ACTION_* being requested
 * @source: where the request comes from (SYSLOG_FROM_*)
 *
 * Return: -EPERM if the action is restricted and the caller lacks
 * CAP_SYSLOG, otherwise the result of security_syslog().
 */
static int check_syslog_permissions(int type, int source)
{
        /*
         * Requests on an already opened /proc/kmsg skip the capability
         * check: it was done when the file was opened.
         */
        if (source != SYSLOG_FROM_PROC || type == SYSLOG_ACTION_OPEN) {
                if (syslog_action_restricted(type) && !capable(CAP_SYSLOG))
                        return -EPERM;
        }

        return security_syslog(type);
}

/*
 * Store @c at the write position *@pp and advance it, unless the position
 * has already reached the end pointer @e, in which case nothing happens.
 */
static void append_char(char **pp, char *e, char c)
{
        char *cursor = *pp;

        if (cursor >= e)
                return;

        *cursor = c;
        *pp = cursor + 1;
}

/*
 * Format the header of an extended (/dev/kmsg style) record into @buf:
 * "<facility and level>,<seq>,<timestamp in usec>,<cont flag>[,caller=...];"
 *
 * Return: what scnprintf() returns for the formatted header.
 */
static ssize_t info_print_ext_header(char *buf, size_t size,
                                     struct printk_info *info)
{
        char cont_flag = (info->flags & LOG_CONT) ? 'c' : '-';
        u64 ts_usec = info->ts_nsec;
        char caller[20];

#ifdef CONFIG_PRINTK_CALLER
        /* The top bit of caller_id selects the 'C' or 'T' prefix. */
        snprintf(caller, sizeof(caller), ",caller=%c%u",
                 info->caller_id & 0x80000000 ? 'C' : 'T',
                 info->caller_id & ~0x80000000);
#else
        caller[0] = '\0';
#endif

        /* The record stores nanoseconds, the header reports microseconds. */
        do_div(ts_usec, 1000);

        return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
                         (info->facility << 3) | info->level, info->seq,
                         ts_usec, cont_flag, caller);
}

/*
 * Copy @text_len bytes of @text into @buf, followed by @endc.
 *
 * Bytes below ' ', bytes from 127 upwards and the backslash are written as a
 * "\xNN" escape; everything else is copied as is. Output stops at the end of
 * @buf.
 *
 * Return: number of bytes placed in @buf.
 */
static ssize_t msg_add_ext_text(char *buf, size_t size,
                                const char *text, size_t text_len,
                                unsigned char endc)
{
        char *out = buf;
        char *limit = buf + size;
        size_t pos;

        for (pos = 0; pos < text_len; pos++) {
                unsigned char c = text[pos];

                if (c >= ' ' && c < 127 && c != '\\') {
                        append_char(&out, limit, c);
                        continue;
                }

                /* escape non-printable characters */
                out += scnprintf(out, limit - out, "\\x%02x", c);
        }
        append_char(&out, limit, endc);

        return out - buf;
}

/*
 * Append one " KEY=value\n" dictionary line to @buf.
 *
 * Nothing is written when @val is the empty string.
 *
 * Return: number of bytes placed in @buf.
 */
static ssize_t msg_add_dict_text(char *buf, size_t size,
                                 const char *key, const char *val)
{
        ssize_t used;

        if (val[0] == '\0')
                return 0;

        /* A dictionary line starts with a single space. */
        used = msg_add_ext_text(buf, size, "", 0, ' ');
        used += msg_add_ext_text(buf + used, size - used,
                                 key, strlen(key), '=');
        used += msg_add_ext_text(buf + used, size - used,
                                 val, strlen(val), '\n');

        return used;
}

/*
 * Format the body of an extended record into @buf: the escaped message text
 * terminated by a newline and, when @dev_info is given, the SUBSYSTEM and
 * DEVICE dictionary lines.
 *
 * Return: number of bytes placed in @buf.
 */
static ssize_t msg_print_ext_body(char *buf, size_t size,
                                  char *text, size_t text_len,
                                  struct dev_printk_info *dev_info)
{
        ssize_t len = msg_add_ext_text(buf, size, text, text_len, '\n');

        if (dev_info) {
                len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM",
                                         dev_info->subsystem);
                len += msg_add_dict_text(buf + len, size - len, "DEVICE",
                                         dev_info->device);
        }

        return len;
}

/*
 * /dev/kmsg - userspace message inject/listen interface
 *
 * Per-open state, allocated in devkmsg_open() and stored in
 * file->private_data:
 *
 * @seq:   sequence number of the next record this reader wants
 * @rs:    ratelimit state applied to writes through this file
 * @lock:  taken by devkmsg_read() for the duration of a read
 * @pbufs: buffers used to format a record; devkmsg_read() copies its
 *         outbuf to user space
 */
struct devkmsg_user {
        atomic64_t seq;
        struct ratelimit_state rs;
        struct mutex lock;
        struct printk_buffers pbufs;
};

/*
 * printf-style front end to vprintk_emit() for messages injected through
 * /dev/kmsg. The given @facility and @level are passed through together
 * with a NULL device info pointer.
 *
 * Return: the value returned by vprintk_emit().
 */
static __printf(3, 4) __cold
int devkmsg_emit(int facility, int level, const char *fmt, ...)
{
        va_list args;
        int r;

        va_start(args, fmt);
        r = vprintk_emit(facility, level, NULL, fmt, args);
        va_end(args);

        return r;
}

/*
 * Inject one message written to /dev/kmsg into the kernel log.
 *
 * Return: the number of bytes consumed (also when the message was dropped
 * because user logging is off or ratelimited), -EINVAL if the write is
 * larger than a record may be, -ENOMEM or -EFAULT on copy failures.
 */
static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct devkmsg_user *user = iocb->ki_filp->private_data;
        size_t len = iov_iter_count(from);
        int level = default_message_loglevel;
        int facility = 1;        /* LOG_USER */
        char *buf, *line;

        if (len > PRINTKRB_RECORD_MAX)
                return -EINVAL;

        /* Ignore when user logging is disabled. */
        if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
                return len;

        /* Ratelimit when not explicitly enabled. */
        if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON) &&
            !___ratelimit(&user->rs, current->comm))
                return len;

        /* One extra byte so that the copied text is always terminated. */
        buf = kmalloc(len + 1, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        buf[len] = '\0';
        if (!copy_from_iter_full(buf, len, from)) {
                kfree(buf);
                return -EFAULT;
        }

        /*
         * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace
         * the decimal value represents 32bit, the lower 3 bit are the log
         * level, the rest are the log facility.
         *
         * Without a prefix, or with a facility of zero, LOG_USER is
         * enforced so that kernel-generated messages can be reliably told
         * apart from userspace-injected ones.
         */
        line = buf;
        if (*line == '<') {
                char *endp = NULL;
                unsigned int prefix = simple_strtoul(line + 1, &endp, 10);

                if (endp && *endp == '>') {
                        unsigned int fac = LOG_FACILITY(prefix);

                        level = LOG_LEVEL(prefix);
                        if (fac != 0)
                                facility = fac;
                        line = endp + 1;
                }
        }

        devkmsg_emit(facility, level, "%s", line);
        kfree(buf);
        return len;
}

/*
 * Read one formatted record from the log into the user buffer @buf.
 *
 * If no record is available at the reader's position, the call sleeps
 * interruptibly on log_wait, or fails with -EAGAIN for O_NONBLOCK files.
 *
 * Return: number of bytes copied, -EPIPE if the reader's position had to be
 * reset because messages were dropped, -EINVAL if @count is smaller than the
 * formatted record, -EFAULT if the copy to user space fails, or the error
 * from an interrupted lock or wait.
 */
static ssize_t devkmsg_read(struct file *file, char __user *buf,
                            size_t count, loff_t *ppos)
{
        struct devkmsg_user *user = file->private_data;
        char *outbuf = &user->pbufs.outbuf[0];
        struct printk_message pmsg = {
                .pbufs = &user->pbufs,
        };
        ssize_t ret;

        /* user->lock is held for the whole read. */
        ret = mutex_lock_interruptible(&user->lock);
        if (ret)
                return ret;

        if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) {
                if (file->f_flags & O_NONBLOCK) {
                        ret = -EAGAIN;
                        goto out;
                }

                /*
                 * Guarantee this task is visible on the waitqueue before
                 * checking the wake condition.
                 *
                 * The full memory barrier within set_current_state() of
                 * prepare_to_wait_event() pairs with the full memory barrier
                 * within wq_has_sleeper().
                 *
                 * This pairs with __wake_up_klogd:A.
                 */
                ret = wait_event_interruptible(log_wait,
                                printk_get_next_message(&pmsg, atomic64_read(&user->seq), true,
                                                        false)); /* LMM(devkmsg_read:A) */
                if (ret)
                        goto out;
        }

        if (pmsg.dropped) {
                /* our last seen message is gone, return error and reset */
                atomic64_set(&user->seq, pmsg.seq);
                ret = -EPIPE;
                goto out;
        }

        /*
         * The position moves past this record before the size check and the
         * copy below, so it stays advanced even if one of them fails.
         */
        atomic64_set(&user->seq, pmsg.seq + 1);

        if (pmsg.outbuf_len > count) {
                ret = -EINVAL;
                goto out;
        }

        if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) {
                ret = -EFAULT;
                goto out;
        }
        ret = pmsg.outbuf_len;
out:
        mutex_unlock(&user->lock);
        return ret;
}

/*
 * Be careful when modifying this function!!!
 *
 * Only few operations are supported because the device works only with the
 * entire variable length messages (records). Non-standard values are
 * returned in the other cases and has been this way for quite some time.
 * User space applications might depend on this behavior.
 *
 * Return: 0 after repositioning the reader, -ESPIPE for any non-zero
 * @offset, -EINVAL for a @whence value not handled below.
 */
static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
{
        struct devkmsg_user *user = file->private_data;
        loff_t ret = 0;

        /* Only whole-record positions exist, so byte offsets are refused. */
        if (offset)
                return -ESPIPE;

        switch (whence) {
        case SEEK_SET:
                /* the first record */
                atomic64_set(&user->seq, prb_first_valid_seq(prb));
                break;
        case SEEK_DATA:
                /*
                 * The first record after the last SYSLOG_ACTION_CLEAR,
                 * like issued by 'dmesg -c'. Reading /dev/kmsg itself
                 * changes no global state, and does not clear anything.
                 */
                atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq));
                break;
        case SEEK_END:
                /* after the last record */
                atomic64_set(&user->seq, prb_next_seq(prb));
                break;
        default:
                ret = -EINVAL;
        }
        return ret;
}

/*
 * Poll handler for /dev/kmsg. Registers on log_wait and reports whether a
 * record is available at (or after) the reader's current sequence number.
 */
static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
{
        struct devkmsg_user *user = file->private_data;
        struct printk_info info;

        poll_wait(file, &log_wait, wait);

        /* No valid record at or after our position: nothing to report. */
        if (!prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL))
                return 0;

        /*
         * The record found is not the one we asked for, i.e. data has
         * vanished underneath us: flag the error along with readability.
         */
        if (info.seq != atomic64_read(&user->seq))
                return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;

        return EPOLLIN|EPOLLRDNORM;
}

/*
 * Open handler for /dev/kmsg. Allocates the per-open devkmsg_user context,
 * initializes its ratelimit state and lock, and points the reader at the
 * first valid record.
 *
 * Returns 0 on success, -EPERM when the device is switched off via
 * devkmsg_log, the check_syslog_permissions() error for a refused reader,
 * or -ENOMEM when the context cannot be allocated.
 */
static int devkmsg_open(struct inode *inode, struct file *file)
{
        struct devkmsg_user *user;
        int rc;

        if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
                return -EPERM;

        /* only opens that can read are subject to the syslog read check */
        if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
                rc = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
                                              SYSLOG_FROM_READER);
                if (rc)
                        return rc;
        }

        user = kvmalloc(sizeof(*user), GFP_KERNEL);
        if (!user)
                return -ENOMEM;

        ratelimit_default_init(&user->rs);
        ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE);

        mutex_init(&user->lock);

        /* a fresh reader starts at the oldest record still available */
        atomic64_set(&user->seq, prb_first_valid_seq(prb));

        file->private_data = user;
        return 0;
}

/*
 * Release handler for /dev/kmsg: tears down the ratelimit state and the
 * lock of the per-open context, then frees the context itself.
 */
static int devkmsg_release(struct inode *inode, struct file *file)
{
        struct devkmsg_user *ctx = file->private_data;

        ratelimit_state_exit(&ctx->rs);
        mutex_destroy(&ctx->lock);

        kvfree(ctx);

        return 0;
}

/* File operations backing the kmsg device, grouped by lifetime and I/O. */
const struct file_operations kmsg_fops = {
        /* lifetime */
        .open           = devkmsg_open,
        .release        = devkmsg_release,
        /* data path */
        .read           = devkmsg_read,
        .write_iter     = devkmsg_write,
        /* positioning and readiness */
        .llseek         = devkmsg_llseek,
        .poll           = devkmsg_poll,
};

#ifdef CONFIG_VMCORE_INFO
/*
 * This appends the listed symbols to /proc/vmcore
 *
 * /proc/vmcore is used by various utilities, like crash and makedumpfile to
 * obtain access to symbols that are otherwise very difficult to locate.  These
 * symbols are specifically used so that utilities can access and extract the
 * dmesg log from a vmcore file after a crash.
 *
 * Besides the symbols, the sizes and field offsets of the ring buffer
 * structures are exported, so that the tools do not have to hard-code the
 * layout.
 */
void log_buf_vmcoreinfo_setup(void)
{
        /* Only used as a sizeof() operand below; never dereferenced. */
        struct dev_printk_info *dev_info = NULL;

        /* The ring buffer pointer, the static ring buffer and the clear mark. */
        VMCOREINFO_SYMBOL(prb);
        VMCOREINFO_SYMBOL(printk_rb_static);
        VMCOREINFO_SYMBOL(clear_seq);

        /*
         * Export struct size and field offsets. User space tools can
         * parse it and detect any changes to structure down the line.
         */

        /* Top-level ring buffer. */
        VMCOREINFO_STRUCT_SIZE(printk_ringbuffer);
        VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring);
        VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring);
        VMCOREINFO_OFFSET(printk_ringbuffer, fail);

        /* Descriptor ring. */
        VMCOREINFO_STRUCT_SIZE(prb_desc_ring);
        VMCOREINFO_OFFSET(prb_desc_ring, count_bits);
        VMCOREINFO_OFFSET(prb_desc_ring, descs);
        VMCOREINFO_OFFSET(prb_desc_ring, infos);
        VMCOREINFO_OFFSET(prb_desc_ring, head_id);
        VMCOREINFO_OFFSET(prb_desc_ring, tail_id);

        /* A single descriptor. */
        VMCOREINFO_STRUCT_SIZE(prb_desc);
        VMCOREINFO_OFFSET(prb_desc, state_var);
        VMCOREINFO_OFFSET(prb_desc, text_blk_lpos);

        /* Begin/next positions of a data block. */
        VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos);
        VMCOREINFO_OFFSET(prb_data_blk_lpos, begin);
        VMCOREINFO_OFFSET(prb_data_blk_lpos, next);

        /* Per-record meta data. */
        VMCOREINFO_STRUCT_SIZE(printk_info);
        VMCOREINFO_OFFSET(printk_info, seq);
        VMCOREINFO_OFFSET(printk_info, ts_nsec);
        VMCOREINFO_OFFSET(printk_info, text_len);
        VMCOREINFO_OFFSET(printk_info, caller_id);
        VMCOREINFO_OFFSET(printk_info, dev_info);

        /* Device info: offsets plus the sizes of the two fields. */
        VMCOREINFO_STRUCT_SIZE(dev_printk_info);
        VMCOREINFO_OFFSET(dev_printk_info, subsystem);
        VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem));
        VMCOREINFO_OFFSET(dev_printk_info, device);
        VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device));

        /* Data ring. */
        VMCOREINFO_STRUCT_SIZE(prb_data_ring);
        VMCOREINFO_OFFSET(prb_data_ring, size_bits);
        VMCOREINFO_OFFSET(prb_data_ring, data);
        VMCOREINFO_OFFSET(prb_data_ring, head_lpos);
        VMCOREINFO_OFFSET(prb_data_ring, tail_lpos);

        /* Layout of atomic_long_t. */
        VMCOREINFO_SIZE(atomic_long_t);
        VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);

        /* Layout of the latched sequence counter (the type of clear_seq). */
        VMCOREINFO_STRUCT_SIZE(latched_seq);
        VMCOREINFO_OFFSET(latched_seq, val);
}
#endif

/*
 * Requested log_buf_len, e.g. from the kernel cmdline. Zero means that no
 * resize is pending.
 */
static unsigned long __initdata new_log_buf_len;

/*
 * Record @size as the pending ring buffer size. The ring buffer is scaled
 * by powers of 2, so the request is capped at LOG_BUF_LEN_MAX, rounded up to
 * a power of two, and only taken if it is larger than the current
 * log_buf_len.
 */
static void __init log_buf_len_update(u64 size)
{
        u64 wanted = size;

        if (wanted > (u64)LOG_BUF_LEN_MAX) {
                pr_err("log_buf over 2G is not supported.\n");
                wanted = (u64)LOG_BUF_LEN_MAX;
        }

        if (wanted != 0)
                wanted = roundup_pow_of_two(wanted);

        /* Never shrink below what is already in use. */
        if (wanted <= log_buf_len)
                return;

        new_log_buf_len = (unsigned long)wanted;
}

/*
 * Handler of the "log_buf_len" early parameter. It is too early to resize
 * the buffer at this point, so the request is only recorded.
 *
 * Returns -EINVAL when no value was given, 0 otherwise.
 */
static int __init log_buf_len_setup(char *str)
{
        if (!str)
                return -EINVAL;

        /* anything left over after the parsed size is ignored */
        log_buf_len_update(memparse(str, &str));

        return 0;
}
early_param("log_buf_len", log_buf_len_setup);

#ifdef CONFIG_SMP
#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)

/*
 * Grow the requested log buffer size on systems with many possible CPUs.
 *
 * Every possible CPU beyond the first one contributes
 * __LOG_CPU_MAX_BUF_LEN bytes. The request is only made when the sum of
 * these contributions exceeds half of the default buffer (__LOG_BUF_LEN);
 * the new size is then the default size plus the contributions, subject to
 * the rounding and limits applied by log_buf_len_update().
 */
static void __init log_buf_add_cpu(void)
{
        unsigned int cpu_extra;

        /*
         * archs should set up cpu_possible_bits properly with
         * set_cpu_possible() after setup_arch() but just in
         * case lets ensure this is valid.
         */
        if (num_possible_cpus() == 1)
                return;

        cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;

        /* by default this will only continue through for large > 64 CPUs */
        if (cpu_extra <= __LOG_BUF_LEN / 2)
                return;

        pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
                __LOG_CPU_MAX_BUF_LEN);
        /* cpu_extra is unsigned: print it as such so large values stay positive */
        pr_info("log_buf_len total cpu_extra contributions: %u bytes\n",
                cpu_extra);
        pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);

        log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
}
#else /* !CONFIG_SMP */
static inline void log_buf_add_cpu(void) {}
#endif /* CONFIG_SMP */

/*
 * Set the __printk_percpu_data_ready flag. Called from setup_log_buf() on
 * its non-early invocation.
 */
static void __init set_percpu_data_ready(void)
{
        __printk_percpu_data_ready = true;
}

/*
 * Copy the record @r (text and meta data) into the ring buffer @rb.
 *
 * Returns 0 when no space could be reserved in @rb, otherwise the value
 * reported by prb_record_text_space() for the new entry.
 */
static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
                                     struct printk_record *r)
{
        struct prb_reserved_entry entry;
        struct printk_record copy;

        prb_rec_init_wr(&copy, r->info->text_len);

        /* Reservation failed: nothing has been written. */
        if (!prb_reserve(&entry, rb, &copy))
                return 0;

        /* The message text first ... */
        memcpy(copy.text_buf, r->text_buf, r->info->text_len);

        /* ... then the meta data, field by field. */
        copy.info->text_len = r->info->text_len;
        copy.info->facility = r->info->facility;
        copy.info->level = r->info->level;
        copy.info->flags = r->info->flags;
        copy.info->ts_nsec = r->info->ts_nsec;
        copy.info->caller_id = r->info->caller_id;
        memcpy(&copy.info->dev_info, &r->info->dev_info,
               sizeof(copy.info->dev_info));

        prb_final_commit(&entry);

        return prb_record_text_space(&entry);
}

/* Text buffer used by setup_log_buf() to read records out of the static ring buffer. */
static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata;

/*
 * setup_log_buf - replace the static log buffer by a larger dynamic one
 * @early: non-zero when called before the percpu areas are set up
 *
 * When a bigger buffer has been requested (new_log_buf_len), allocate the
 * text, descriptor and info arrays from memblock, initialize
 * printk_rb_dynamic with them, copy all records over from printk_rb_static
 * and finally switch @prb to the new ring buffer. Nothing happens when the
 * switch was already done or when no new size was requested.
 */
void __init setup_log_buf(int early)
{
        struct printk_info *new_infos;
        unsigned int new_descs_count;
        struct prb_desc *new_descs;
        struct printk_info info;
        struct printk_record r;
        unsigned int text_size;
        size_t new_descs_size;
        size_t new_infos_size;
        unsigned long flags;
        char *new_log_buf;
        unsigned int free;
        u64 seq;

        /*
         * Some archs call setup_log_buf() multiple times - first is very
         * early, e.g. from setup_arch(), and second - when percpu_areas
         * are initialised.
         */
        if (!early)
                set_percpu_data_ready();

        /* The buffer has already been replaced by an earlier call. */
        if (log_buf != __log_buf)
                return;

        /* No explicit request: let the number of possible CPUs decide. */
        if (!early && !new_log_buf_len)
                log_buf_add_cpu();

        if (!new_log_buf_len)
                return;

        /* Number of descriptors derived from the text size. */
        new_descs_count = new_log_buf_len >> PRB_AVGBITS;
        if (new_descs_count == 0) {
                pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len);
                return;
        }

        new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
        if (unlikely(!new_log_buf)) {
                pr_err("log_buf_len: %lu text bytes not available\n",
                       new_log_buf_len);
                return;
        }

        new_descs_size = new_descs_count * sizeof(struct prb_desc);
        new_descs = memblock_alloc(new_descs_size, LOG_ALIGN);
        if (unlikely(!new_descs)) {
                pr_err("log_buf_len: %zu desc bytes not available\n",
                       new_descs_size);
                goto err_free_log_buf;
        }

        new_infos_size = new_descs_count * sizeof(struct printk_info);
        new_infos = memblock_alloc(new_infos_size, LOG_ALIGN);
        if (unlikely(!new_infos)) {
                pr_err("log_buf_len: %zu info bytes not available\n",
                       new_infos_size);
                goto err_free_descs;
        }

        /* Reader record used to walk the static ring buffer below. */
        prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf));

        prb_init(&printk_rb_dynamic,
                 new_log_buf, ilog2(new_log_buf_len),
                 new_descs, ilog2(new_descs_count),
                 new_infos);

        /* Copy and switch with local interrupts disabled. */
        local_irq_save(flags);

        log_buf_len = new_log_buf_len;
        log_buf = new_log_buf;
        new_log_buf_len = 0;

        /*
         * @free starts at the size of the static buffer and is reduced by
         * the text space of every copied record (clamped at zero). It is
         * only used for the statistics printed at the end.
         */
        free = __LOG_BUF_LEN;
        prb_for_each_record(0, &printk_rb_static, seq, &r) {
                text_size = add_to_rb(&printk_rb_dynamic, &r);
                if (text_size > free)
                        free = 0;
                else
                        free -= text_size;
        }

        /* From now on new records go to the dynamic ring buffer. */
        prb = &printk_rb_dynamic;

        local_irq_restore(flags);

        /*
         * Copy any remaining messages that might have appeared from
         * NMI context after copying but before switching to the
         * dynamic buffer.
         */
        prb_for_each_record(seq, &printk_rb_static, seq, &r) {
                text_size = add_to_rb(&printk_rb_dynamic, &r);
                if (text_size > free)
                        free = 0;
                else
                        free -= text_size;
        }

        /* Records left in the static buffer beyond @seq were not copied. */
        if (seq != prb_next_seq(&printk_rb_static)) {
                pr_err("dropped %llu messages\n",
                       prb_next_seq(&printk_rb_static) - seq);
        }

        pr_info("log_buf_len: %u bytes\n", log_buf_len);
        pr_info("early log buf free: %u(%u%%)\n",
                free, (free * 100) / __LOG_BUF_LEN);
        return;

        /* Unwind the allocations in reverse order. */
err_free_descs:
        memblock_free(new_descs, new_descs_size);
err_free_log_buf:
        memblock_free(new_log_buf, new_log_buf_len);
}

/* When set, suppress_message_printing() never suppresses a message. */
static bool __read_mostly ignore_loglevel;

/*
 * Handler of the "ignore_loglevel" early parameter: sets the flag and logs
 * a notice. The parameter value in @str is not looked at. Always returns 0.
 */
static int __init ignore_loglevel_setup(char *str)
{
        ignore_loglevel = true;
        pr_info("debug: ignoring loglevel setting.\n");

        return 0;
}

early_param("ignore_loglevel", ignore_loglevel_setup);
/* Also exposed as a module parameter (S_IRUGO | S_IWUSR). */
module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(ignore_loglevel,
                 "ignore loglevel setting (prints all kernel messages to the console)");

static bool suppress_message_printing(int level)
{
        return (level >= console_loglevel && !ignore_loglevel);
}

#ifdef CONFIG_BOOT_PRINTK_DELAY

static int boot_delay; /* msecs delay after each printk during bootup */
/* busy-loop iterations per msec, computed from lpj and HZ at setup time */
static unsigned long long loops_per_msec;

/*
 * Handler of the "boot_delay" early parameter. Computes loops_per_msec and
 * parses the delay from @str; a delay above 10 seconds is reset to 0.
 * Always returns 0.
 */
static int __init boot_delay_setup(char *str)
{
        unsigned long lpj = preset_lpj;

        if (!lpj)
                lpj = 1000000;        /* some guess */
        loops_per_msec = (unsigned long long)lpj / 1000 * HZ;

        get_option(&str, &boot_delay);
        if (boot_delay > 10 * 1000)
                boot_delay = 0;

        pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
                "HZ: %d, loops_per_msec: %llu\n",
                boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
        return 0;
}
early_param("boot_delay", boot_delay_setup);

/*
 * Busy-wait for boot_delay milliseconds. Nothing is done when no delay is
 * configured, when system_state has reached SYSTEM_RUNNING, or when a
 * message of @level would be suppressed anyway.
 */
static void boot_delay_msec(int level)
{
        unsigned long long loops_left;
        unsigned long deadline;

        if (boot_delay == 0)
                return;
        if (system_state >= SYSTEM_RUNNING)
                return;
        if (suppress_message_printing(level))
                return;

        loops_left = (unsigned long long)loops_per_msec * boot_delay;
        deadline = jiffies + msecs_to_jiffies(boot_delay);

        for (; loops_left; loops_left--) {
                cpu_relax();
                /*
                 * use (volatile) jiffies to prevent
                 * compiler reduction; loop termination via jiffies
                 * is secondary and may or may not happen.
                 */
                if (time_after(jiffies, deadline))
                        break;
                touch_nmi_watchdog();
        }
}
#else
/* Variant used when CONFIG_BOOT_PRINTK_DELAY is not set: does nothing. */
static inline void boot_delay_msec(int level)
{
}
#endif

/* Whether to print the timestamp prefix; defaults to CONFIG_PRINTK_TIME. */
static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
/* Exposed under the parameter name "time" (S_IRUGO | S_IWUSR). */
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);

/*
 * Write the syslog priority prefix "<level>" into @buf (terminated).
 * Returns the number of characters written, not counting the terminator.
 */
static size_t print_syslog(unsigned int level, char *buf)
{
        int written = sprintf(buf, "<%u>", level);

        return written;
}

/*
 * Write the timestamp @ts (nanoseconds) into @buf as "[seconds.microseconds]".
 * Returns the number of characters written, not counting the terminator.
 */
static size_t print_time(u64 ts, char *buf)
{
        unsigned long usec;
        unsigned long sec;

        /* do_div() turns @ts into whole seconds and hands back the rest */
        usec = do_div(ts, 1000000000);
        usec /= 1000;
        sec = (unsigned long)ts;

        return sprintf(buf, "[%5lu.%06lu]", sec, usec);
}

#ifdef CONFIG_PRINTK_CALLER
/*
 * Write the caller prefix "[<tag><number>]" into @buf. The tag is 'C' when
 * the top bit of @id is set and 'T' otherwise; the number is @id without
 * that bit. Returns the number of characters written.
 */
static size_t print_caller(u32 id, char *buf)
{
        char caller[12];
        char tag = (id & 0x80000000) ? 'C' : 'T';
        u32 number = id & ~0x80000000;

        snprintf(caller, sizeof(caller), "%c%u", tag, number);

        return sprintf(buf, "[%6s]", caller);
}
#else
/* No caller prefix without CONFIG_PRINTK_CALLER: contributes zero bytes. */
#define print_caller(id, buf) 0
#endif

/*
 * Build the per-line prefix for a record into @buf.
 *
 * @info:   meta data of the record
 * @syslog: prepend the "<prio>" syslog prefix
 * @time:   add the timestamp
 * @buf:    destination buffer
 *
 * The caller prefix is added via print_caller(). When a timestamp or a
 * caller prefix is part of the output, a separating space and a string
 * terminator are appended.
 *
 * Returns the prefix length (including the space, excluding the terminator).
 */
static size_t info_print_prefix(const struct printk_info *info, bool syslog,
                                bool time, char *buf)
{
        char *pos = buf;

        if (syslog)
                pos += print_syslog((info->facility << 3) | info->level, pos);

        if (time)
                pos += print_time(info->ts_nsec, pos);

        pos += print_caller(info->caller_id, pos);

        if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) {
                *pos++ = ' ';
                *pos = '\0';
        }

        return pos - buf;
}

/*
 * Prepare the record for printing. The text is shifted within the given
 * buffer to avoid a need for another one. The following operations are
 * done:
 *
 *   - Add prefix for each line.
 *   - Drop truncated lines that no longer fit into the buffer.
 *   - Add the trailing newline that has been removed in vprintk_store().
 *   - Add a string terminator.
 *
 * @r:      record whose text buffer is rewritten in place
 * @syslog: include the syslog "<prio>" prefix on each line
 * @time:   include the timestamp on each line
 *
 * Since the produced string is always terminated, the maximum possible
 * return value is @r->text_buf_size - 1;
 *
 * Return: The length of the updated/prepared text, including the added
 * prefixes and the newline. The terminator is not counted. The dropped
 * line(s) are not counted.
 */
static size_t record_print_text(struct printk_record *r, bool syslog,
                                bool time)
{
        size_t text_len = r->info->text_len;
        size_t buf_size = r->text_buf_size;
        char *text = r->text_buf;
        char prefix[PRINTK_PREFIX_MAX];
        bool truncated = false;
        size_t prefix_len;
        size_t line_len;
        size_t len = 0;
        char *next;

        /*
         * If the message was truncated because the buffer was not large
         * enough, treat the available text as if it were the full text.
         */
        if (text_len > buf_size)
                text_len = buf_size;

        /* The same prefix is used for every line of this record. */
        prefix_len = info_print_prefix(r->info, syslog, time, prefix);

        /*
         * @text_len: bytes of unprocessed text
         * @line_len: bytes of current line _without_ newline
         * @text:     pointer to beginning of current line
         * @len:      number of bytes prepared in r->text_buf
         */
        for (;;) {
                next = memchr(text, '\n', text_len);
                if (next) {
                        line_len = next - text;
                } else {
                        /* Drop truncated line(s). */
                        if (truncated)
                                break;
                        line_len = text_len;
                }

                /*
                 * Truncate the text if there is not enough space to add the
                 * prefix and a trailing newline and a terminator.
                 */
                if (len + prefix_len + text_len + 1 + 1 > buf_size) {
                        /* Drop even the current line if no space. */
                        if (len + prefix_len + line_len + 1 + 1 > buf_size)
                                break;

                        text_len = buf_size - len - prefix_len - 1 - 1;
                        truncated = true;
                }

                /*
                 * Shift all remaining text to make room for the prefix
                 * (the regions overlap, hence memmove()), then put the
                 * prefix in front of the current line.
                 */
                memmove(text + prefix_len, text, text_len);
                memcpy(text, prefix, prefix_len);

                /*
                 * Increment the prepared length to include the text and
                 * prefix that were just moved+copied. Also increment for the
                 * newline at the end of this line. If this is the last line,
                 * there is no newline, but it will be added immediately below.
                 */
                len += prefix_len + line_len + 1;
                if (text_len == line_len) {
                        /*
                         * This is the last line. Add the trailing newline
                         * removed in vprintk_store().
                         */
                        text[prefix_len + line_len] = '\n';
                        break;
                }

                /*
                 * Advance beyond the added prefix and the related line with
                 * its newline.
                 */
                text += prefix_len + line_len + 1;

                /*
                 * The remaining text has only decreased by the line with its
                 * newline.
                 *
                 * Note that @text_len can become zero. It happens when @text
                 * ended with a newline (either due to truncation or the
                 * original string ending with "\n\n"). The loop is correctly
                 * repeated and (if not truncated) an empty line with a prefix
                 * will be prepared.
                 */
                text_len -= line_len + 1;
        }

        /*
         * If a buffer was provided, it will be terminated. Space for the
         * string terminator is guaranteed to be available. The terminator is
         * not counted in the return value.
         */
        if (buf_size > 0)
                r->text_buf[len] = 0;

        return len;
}

/*
 * Compute the size of a record's printed form without producing it.
 *
 * @info:       meta data of the record
 * @line_count: number of lines in the record text
 * @syslog:     account for the syslog prefix
 * @time:       account for the timestamp
 *
 * Returns the text length plus one prefix per line plus the final newline.
 */
static size_t get_record_print_text_size(struct printk_info *info,
                                         unsigned int line_count,
                                         bool syslog, bool time)
{
        char scratch[PRINTK_PREFIX_MAX];
        size_t per_line;
        size_t total;

        /* The prefix is generated only to learn its length. */
        per_line = info_print_prefix(info, syslog, time, scratch);

        /*
         * Every line gets its own prefix. Newlines between the lines are
         * already part of the text; only the trailing one is added.
         */
        total = per_line * line_count;
        total += info->text_len;
        total += 1;

        return total;
}

/*
 * Starting at @start_seq, find the first record such that it and all the
 * records following it, up to (but not including) @max_seq, fit into @size
 * bytes when printed.
 *
 * @max_seq is only an upper bound; the record does not need to exist. Pass
 * -1 when no upper bound is wanted. @syslog and @time select the prefixes
 * that are accounted for.
 */
static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size,
                                  bool syslog, bool time)
{
        struct printk_info info;
        unsigned int line_count;
        size_t total = 0;
        u64 seq;

        /* First pass: sum up the printed size of all records below @max_seq. */
        prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
                if (info.seq >= max_seq)
                        break;
                total += get_record_print_text_size(&info, line_count, syslog, time);
        }

        /*
         * Lower the upper bound to where the first pass stopped, so that
         * the second pass never subtracts a length that was not added.
         */
        if (max_seq > seq)
                max_seq = seq;

        /*
         * Second pass: drop records from the front until the rest fits.
         * Newer messages that were not counted above are ignored. Messages
         * might appear and get lost in the meantime. This is a best effort
         * that prevents an infinite loop that could occur with a retry.
         */
        prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
                if (total <= size)
                        break;
                if (info.seq >= max_seq)
                        break;
                total -= get_record_print_text_size(&info, line_count, syslog, time);
        }

        return seq;
}

/*
 * Copy unread records, starting at @syslog_seq, to the user buffer @buf of
 * @size bytes. Blocks (interruptibly) until the first record is available.
 * A record that does not fit as the first one is delivered partially and
 * the position within it is remembered in @syslog_partial.
 *
 * The caller is responsible for making sure @size is greater than 0.
 *
 * Return: number of bytes copied, -ENOMEM if the text buffer cannot be
 * allocated, the non-zero result of wait_event_interruptible(), or -EFAULT
 * if copying fails before anything was copied.
 */
static int syslog_print(char __user *buf, int size)
{
        struct printk_info info;
        struct printk_record r;
        char *text;
        int len = 0;
        u64 seq;

        text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
        if (!text)
                return -ENOMEM;

        prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);

        mutex_lock(&syslog_lock);

        /*
         * Wait for the @syslog_seq record to be available. @syslog_seq may
         * change while waiting.
         */
        do {
                seq = syslog_seq;

                /* The lock is not held while sleeping. */
                mutex_unlock(&syslog_lock);
                /*
                 * Guarantee this task is visible on the waitqueue before
                 * checking the wake condition.
                 *
                 * The full memory barrier within set_current_state() of
                 * prepare_to_wait_event() pairs with the full memory barrier
                 * within wq_has_sleeper().
                 *
                 * This pairs with __wake_up_klogd:A.
                 */
                len = wait_event_interruptible(log_wait,
                                prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */
                mutex_lock(&syslog_lock);

                /* A non-zero result of the wait is returned to the caller. */
                if (len)
                        goto out;
        } while (syslog_seq != seq);

        /*
         * Copy records that fit into the buffer. The above cycle makes sure
         * that the first record is always available.
         */
        do {
                size_t n;
                size_t skip;
                int err;

                if (!prb_read_valid(prb, syslog_seq, &r))
                        break;

                if (r.info->seq != syslog_seq) {
                        /* message is gone, move to next valid one */
                        syslog_seq = r.info->seq;
                        syslog_partial = 0;
                }

                /*
                 * To keep reading/counting partial line consistent,
                 * use printk_time value as of the beginning of a line.
                 */
                if (!syslog_partial)
                        syslog_time = printk_time;

                /* Bytes of this record that an earlier call already delivered. */
                skip = syslog_partial;
                n = record_print_text(&r, true, syslog_time);
                if (n - syslog_partial <= size) {
                        /* message fits into buffer, move forward */
                        syslog_seq = r.info->seq + 1;
                        n -= syslog_partial;
                        syslog_partial = 0;
                } else if (!len){
                        /* partial read(), remember position */
                        n = size;
                        syslog_partial += n;
                } else
                        /* Does not fit and data was already copied: stop. */
                        n = 0;

                if (!n)
                        break;

                /* The lock is dropped around the copy to user space. */
                mutex_unlock(&syslog_lock);
                err = copy_to_user(buf, text + skip, n);
                mutex_lock(&syslog_lock);

                if (err) {
                        /* Report the fault only if nothing was copied so far. */
                        if (!len)
                                len = -EFAULT;
                        break;
                }

                len += n;
                size -= n;
                buf += n;
        } while (size);
out:
        mutex_unlock(&syslog_lock);
        kfree(text);
        return len;
}

/*
 * Copy the newest records since the last clear that fit into the user
 * buffer @buf of @size bytes. When @clear is set, the clear mark is moved
 * to where the dump stopped.
 *
 * Returns the number of bytes copied, -ENOMEM when the text buffer cannot
 * be allocated, or -EFAULT when copying to user space fails.
 */
static int syslog_print_all(char __user *buf, int size, bool clear)
{
        struct printk_info info;
        struct printk_record r;
        char *text;
        int copied = 0;
        u64 seq;
        bool time;

        text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
        if (!text)
                return -ENOMEM;

        /* One consistent timestamp setting for the whole dump. */
        time = printk_time;

        /*
         * Find first record that fits, including all following records,
         * into the user-provided buffer for this dump.
         */
        seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
                                     size, true, time);

        prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);

        prb_for_each_record(seq, prb, seq, &r) {
                int textlen = record_print_text(&r, true, time);

                /* Out of room: step back so this record is not skipped. */
                if (copied + textlen > size) {
                        seq--;
                        break;
                }

                if (copy_to_user(buf + copied, text, textlen)) {
                        copied = -EFAULT;
                        break;
                }

                copied += textlen;
        }

        if (clear) {
                mutex_lock(&syslog_lock);
                latched_seq_write(&clear_seq, seq);
                mutex_unlock(&syslog_lock);
        }

        kfree(text);
        return copied;
}

/*
 * Move the clear mark (@clear_seq) behind the newest record. The update is
 * done under @syslog_lock.
 */
static void syslog_clear(void)
{
        mutex_lock(&syslog_lock);
        latched_seq_write(&clear_seq, prb_next_seq(prb));
        mutex_unlock(&syslog_lock);
}

/*
 * do_syslog - perform a syslog action
 * @type:   one of the SYSLOG_ACTION_* commands
 * @buf:    user buffer for the read actions
 * @len:    size of @buf for the read actions; the new console loglevel for
 *          SYSLOG_ACTION_CONSOLE_LEVEL
 * @source: origin of the request, passed to check_syslog_permissions() and
 *          compared against SYSLOG_FROM_PROC for SYSLOG_ACTION_SIZE_UNREAD
 *
 * Return: a negative error code, or an action specific non-negative value
 * (bytes copied, unread size or count, buffer size, or 0).
 */
int do_syslog(int type, char __user *buf, int len, int source)
{
        struct printk_info info;
        bool clear = false;
        /* Loglevel to restore on CONSOLE_ON; LOGLEVEL_DEFAULT means "none saved". */
        static int saved_console_loglevel = LOGLEVEL_DEFAULT;
        int error;

        error = check_syslog_permissions(type, source);
        if (error)
                return error;

        /* @error is 0 from here on unless an action sets it. */
        switch (type) {
        case SYSLOG_ACTION_CLOSE:        /* Close log */
                break;
        case SYSLOG_ACTION_OPEN:        /* Open log */
                break;
        case SYSLOG_ACTION_READ:        /* Read from log */
                if (!buf || len < 0)
                        return -EINVAL;
                if (!len)
                        return 0;
                if (!access_ok(buf, len))
                        return -EFAULT;
                error = syslog_print(buf, len);
                break;
        /* Read/clear last kernel messages */
        case SYSLOG_ACTION_READ_CLEAR:
                clear = true;
                fallthrough;
        /* Read last kernel messages */
        case SYSLOG_ACTION_READ_ALL:
                if (!buf || len < 0)
                        return -EINVAL;
                if (!len)
                        return 0;
                if (!access_ok(buf, len))
                        return -EFAULT;
                error = syslog_print_all(buf, len, clear);
                break;
        /* Clear ring buffer */
        case SYSLOG_ACTION_CLEAR:
                syslog_clear();
                break;
        /* Disable logging to console */
        case SYSLOG_ACTION_CONSOLE_OFF:
                /* Remember the current level only once, for CONSOLE_ON. */
                if (saved_console_loglevel == LOGLEVEL_DEFAULT)
                        saved_console_loglevel = console_loglevel;
                console_loglevel = minimum_console_loglevel;
                break;
        /* Enable logging to console */
        case SYSLOG_ACTION_CONSOLE_ON:
                if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
                        console_loglevel = saved_console_loglevel;
                        saved_console_loglevel = LOGLEVEL_DEFAULT;
                }
                break;
        /* Set level of messages printed to console */
        case SYSLOG_ACTION_CONSOLE_LEVEL:
                if (len < 1 || len > 8)
                        return -EINVAL;
                /* Never go below the configured minimum. */
                if (len < minimum_console_loglevel)
                        len = minimum_console_loglevel;
                console_loglevel = len;
                /* Implicitly re-enable logging to console */
                saved_console_loglevel = LOGLEVEL_DEFAULT;
                break;
        /* Number of chars in the log buffer */
        case SYSLOG_ACTION_SIZE_UNREAD:
                mutex_lock(&syslog_lock);
                if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
                        /* No unread messages. */
                        mutex_unlock(&syslog_lock);
                        return 0;
                }
                if (info.seq != syslog_seq) {
                        /* messages are gone, move to first one */
                        syslog_seq = info.seq;
                        syslog_partial = 0;
                }
                if (source == SYSLOG_FROM_PROC) {
                        /*
                         * Short-cut for poll("/proc/kmsg") which simply checks
                         * for pending data, not the size; return the count of
                         * records, not the length.
                         */
                        error = prb_next_seq(prb) - syslog_seq;
                } else {
                        /*
                         * A partially read first record is counted with the
                         * timestamp setting it was started with; all later
                         * records use the current setting.
                         */
                        bool time = syslog_partial ? syslog_time : printk_time;
                        unsigned int line_count;
                        u64 seq;

                        /* @error is 0 here and serves as the accumulator. */
                        prb_for_each_info(syslog_seq, prb, seq, &info,
                                          &line_count) {
                                error += get_record_print_text_size(&info, line_count,
                                                                    true, time);
                                time = printk_time;
                        }
                        /* The already delivered part of the first record is not unread. */
                        error -= syslog_partial;
                }
                mutex_unlock(&syslog_lock);
                break;
        /* Size of the log buffer */
        case SYSLOG_ACTION_SIZE_BUFFER:
                error = log_buf_len;
                break;
        default:
                error = -EINVAL;
                break;
        }

        return error;
}

/* The syslog system call: forwards to do_syslog() with SYSLOG_FROM_READER as the source. */
SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
{
        return do_syslog(type, buf, len, SYSLOG_FROM_READER);
}

/*
 * Special console_lock variants that help to reduce the risk of soft-lockups.
 * They allow passing console_lock to another printk() call using a busy wait.
 */

#ifdef CONFIG_LOCKDEP
/* Lockdep map used to annotate the section where a waiter may spin. */
static struct lockdep_map console_owner_dep_map = {
        .name = "console_owner"
};
#endif

/* Taken by console_lock_spinning_enable() while it sets @console_owner. */
static DEFINE_RAW_SPINLOCK(console_owner_lock);
/* Set to the current task by console_lock_spinning_enable(). */
static struct task_struct *console_owner;
/* NOTE(review): presumably set while another context busy-waits for the lock - confirm against the users. */
static bool console_waiter;

/**
 * console_lock_spinning_enable - mark beginning of code where another
 *        thread might safely busy wait
 *
 * This basically converts console_lock into a spinlock. This marks
 * the section where the console_lock owner can not sleep, because
 * there may be a waiter spinning (like a spinlock). Also it must be
 * ready to hand over the lock at the end of the section.
 */
static void console_lock_spinning_enable(void)
{
        /*
         * Do not use spinning in panic(). The panic CPU wants to keep the lock.
         * Non-panic CPUs abandon the flush anyway.
         *
         * Only the lockdep annotation below is kept in that case. The
         * panic-CPU should avoid taking console_owner_lock because it might
         * cause a deadlock. This looks like the easiest way to prevent
         * false lockdep reports without handling races in a lockless way.
         */
        if (!panic_in_progress()) {
                raw_spin_lock(&console_owner_lock);
                console_owner = current;
                raw_spin_unlock(&console_owner_lock);
        }

        /* The waiter may spin on us after setting console_owner */
        spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
}

/**
 * console_lock_spinning_disable_and_check - mark end of code where another
 *        thread was able to busy wait and check if there is a waiter
 * @cookie: cookie returned from console_srcu_read_lock()
 *
 * This is called at the end of the section where spinning is allowed.
 * It has two functions. First, it is a signal that it is no longer
 * safe to start busy waiting for the lock. Second, it checks if
 * there is a busy waiter and passes the lock rights to it.
 *
 * Important: Callers lose both the console_lock and the SRCU read lock if
 *        there was a busy waiter. They must not touch items synchronized by
 *        console_lock or SRCU read lock in this case.
 *
 * Return: 1 if the lock rights were passed, 0 otherwise.
 */
static int console_lock_spinning_disable_and_check(int cookie)
{
        int waiter;

        /*
         * Ignore spinning waiters during panic() because they might get stopped
         * or blocked at any time.
         *
         * It is safe because nobody is allowed to start spinning during panic
         * in the first place. If there has been a waiter then non panic CPUs
         * might stay spinning. They would get stopped anyway. The panic context
         * will never start spinning and an interrupted spin on panic CPU will
         * never continue.
         */
        if (panic_in_progress()) {
                /* Keep lockdep happy. */
                spin_release(&console_owner_dep_map, _THIS_IP_);
                return 0;
        }

        /*
         * Sample the waiter flag and clear the owner in one critical section
         * so that no new waiter can register against this owner afterwards.
         */
        raw_spin_lock(&console_owner_lock);
        waiter = READ_ONCE(console_waiter);
        console_owner = NULL;
        raw_spin_unlock(&console_owner_lock);

        /* Nobody is spinning: the caller keeps console_lock and SRCU lock. */
        if (!waiter) {
                spin_release(&console_owner_dep_map, _THIS_IP_);
                return 0;
        }

        /* The waiter is now free to continue */
        WRITE_ONCE(console_waiter, false);

        spin_release(&console_owner_dep_map, _THIS_IP_);

        /*
         * Preserve lockdep lock ordering. Release the SRCU read lock before
         * releasing the console_lock.
         */
        console_srcu_read_unlock(cookie);

        /*
         * Hand off console_lock to waiter. The waiter will perform
         * the up(). After this, the waiter is the console_lock owner.
         * Only the lockdep annotation is dropped here; no up() is done.
         */
        mutex_release(&console_lock_dep_map, _THIS_IP_);
        return 1;
}

/**
 * console_trylock_spinning - try to get console_lock by busy waiting
 *
 * This allows to busy wait for the console_lock when the current
 * owner is running in specially marked sections. It means that
 * the current owner is running and cannot reschedule until it
 * is ready to lose the lock.
 *
 * Return: 1 if we got the lock, 0 otherwise
 */
static int console_trylock_spinning(void)
{
        struct task_struct *owner = NULL;
        bool waiter;
        bool spin = false;
        unsigned long flags;

        /* Fast path: the lock was free, no handover is needed. */
        if (console_trylock())
                return 1;

        /*
         * It's unsafe to spin once a panic has begun. If we are the
         * panic CPU, we may have already halted the owner of the
         * console_sem. If we are not the panic CPU, then we should
         * avoid taking console_sem, so the panic CPU has a better
         * chance of cleanly acquiring it later.
         */
        if (panic_in_progress())
                return 0;

        printk_safe_enter_irqsave(flags);

        /*
         * Register as the single waiter, but only if there is an owner in a
         * spinnable section, nobody else is already waiting, and the owner
         * is not the current task.
         */
        raw_spin_lock(&console_owner_lock);
        owner = READ_ONCE(console_owner);
        waiter = READ_ONCE(console_waiter);
        if (!waiter && owner && owner != current) {
                WRITE_ONCE(console_waiter, true);
                spin = true;
        }
        raw_spin_unlock(&console_owner_lock);

        /*
         * If there is an active printk() writing to the
         * consoles, instead of having it write our data too,
         * see if we can offload that load from the active
         * printer, and do some printing ourselves.
         * Go into a spin only if there isn't already a waiter
         * spinning, and there is an active printer, and
         * that active printer isn't us (recursive printk?).
         */
        if (!spin) {
                printk_safe_exit_irqrestore(flags);
                return 0;
        }

        /* We spin waiting for the owner to release us */
        spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
        /* Owner will clear console_waiter on hand off */
        while (READ_ONCE(console_waiter))
                cpu_relax();
        spin_release(&console_owner_dep_map, _THIS_IP_);

        printk_safe_exit_irqrestore(flags);
        /*
         * The owner passed the console lock to us.
         * Since we did not spin on console lock, annotate
         * this as a trylock. Otherwise lockdep will
         * complain.
         */
        mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);

        /*
         * Update @console_may_schedule for trylock because the previous
         * owner may have been schedulable.
         */
        console_may_schedule = 0;

        return 1;
}

/*
 * Recursion is tracked separately on each CPU. If NMIs are supported, an
 * additional NMI context per CPU is also separately tracked. Until per-CPU
 * is available, a separate "early tracking" is performed.
 */
/* Per-CPU recursion depth for non-NMI context. */
static DEFINE_PER_CPU(u8, printk_count);
/* Recursion depth used while per-CPU data is not ready yet. */
static u8 printk_count_early;
#ifdef CONFIG_HAVE_NMI
/* Per-CPU recursion depth for NMI context. */
static DEFINE_PER_CPU(u8, printk_count_nmi);
/* NMI recursion depth used while per-CPU data is not ready yet. */
static u8 printk_count_nmi_early;
#endif

/*
 * Recursion is limited to keep the output sane. printk() should not require
 * more than 1 level of recursion (allowing, for example, printk() to trigger
 * a WARN), but a higher value is used in case some printk-internal errors
 * exist, such as the ringbuffer validation checks failing.
 */
#define PRINTK_MAX_RECURSION 3

/*
 * Select the recursion counter that belongs to the caller: the NMI counter
 * when running in NMI context (if NMIs are supported), the regular counter
 * otherwise. The per-CPU variant is used once per-CPU data is ready; until
 * then the shared "early" counter is returned.
 */
static u8 *__printk_recursion_counter(void)
{
#ifdef CONFIG_HAVE_NMI
        if (in_nmi())
                return printk_percpu_data_ready() ?
                        this_cpu_ptr(&printk_count_nmi) :
                        &printk_count_nmi_early;
#endif
        return printk_percpu_data_ready() ?
                this_cpu_ptr(&printk_count) :
                &printk_count_early;
}

/*
 * Enter recursion tracking. Interrupts are disabled to simplify tracking.
 * The caller must check the boolean return value to see if the recursion is
 * allowed. On failure, interrupts are not disabled.
 *
 * @recursion_ptr must be a variable of type (u8 *) and is the same variable
 * that is passed to printk_exit_irqrestore().
 *
 * @flags receives the saved interrupt state from local_irq_save().
 *
 * The macro evaluates to true and increments the selected counter unless the
 * counter already exceeds PRINTK_MAX_RECURSION. In that case interrupts are
 * restored, the counter is left untouched and the macro evaluates to false.
 */
#define printk_enter_irqsave(recursion_ptr, flags)        \
({                                                        \
        bool success = true;                                \
                                                        \
        typecheck(u8 *, recursion_ptr);                        \
        local_irq_save(flags);                                \
        (recursion_ptr) = __printk_recursion_counter();        \
        if (*(recursion_ptr) > PRINTK_MAX_RECURSION) {        \
                local_irq_restore(flags);                \
                success = false;                        \
        } else {                                        \
                (*(recursion_ptr))++;                        \
        }                                                \
        success;                                        \
})

/*
 * Exit recursion tracking, restoring interrupts.
 *
 * Decrements the counter selected by a successful printk_enter_irqsave() and
 * restores the interrupt state saved in @flags. @recursion_ptr must be the
 * same (u8 *) variable that was passed to printk_enter_irqsave().
 */
#define printk_exit_irqrestore(recursion_ptr, flags)        \
        do {                                                \
                typecheck(u8 *, recursion_ptr);                \
                (*(recursion_ptr))--;                        \
                local_irq_restore(flags);                \
        } while (0)

/* Artificial delay, in milliseconds, applied by printk_delay(); 0 disables it. */
int printk_delay_msec __read_mostly;

/*
 * Apply the boot delay for @level and then, if printk_delay_msec is set,
 * busy-wait that many milliseconds. The NMI watchdog is touched after every
 * millisecond so that the delay itself does not trigger it.
 */
static inline void printk_delay(int level)
{
        boot_delay_msec(level);

        if (unlikely(printk_delay_msec)) {
                int remaining;

                for (remaining = printk_delay_msec; remaining; remaining--) {
                        mdelay(1);
                        touch_nmi_watchdog();
                }
        }
}

/*
 * Identify the context emitting a message: the PID of the current task when
 * called in task context, otherwise the CPU number with bit 31 set.
 */
static inline u32 printk_caller_id(void)
{
        if (in_task())
                return task_pid_nr(current);

        return 0x80000000 + smp_processor_id();
}

/**
 * printk_parse_prefix - Parse level and control flags.
 *
 * @text:     The terminated text message.
 * @level:    A pointer to the current level value, will be updated.
 * @flags:    A pointer to the current printk_info flags, will be updated.
 *
 * Consumes every leading two-byte prefix recognized by printk_get_level().
 *
 * @level may be NULL if the caller is not interested in the parsed value.
 * Otherwise it is only updated while it still holds LOGLEVEL_DEFAULT, so the
 * first parsed level wins.
 *
 * @flags may be NULL if the caller is not interested in the parsed value.
 * Otherwise LOG_CONT is OR'd into it when a KERN_CONT prefix is found.
 *
 * Return: The length of the parsed level and control flags.
 */
u16 printk_parse_prefix(const char *text, int *level,
                        enum printk_info_flags *flags)
{
        u16 consumed = 0;
        int code;

        /* Every recognized prefix occupies exactly two bytes. */
        for (; *text; text += 2, consumed += 2) {
                code = printk_get_level(text);
                if (!code)
                        break;

                if (code >= '0' && code <= '7') {
                        if (level && *level == LOGLEVEL_DEFAULT)
                                *level = code - '0';
                } else if (code == 'c') {
                        /* KERN_CONT */
                        if (flags)
                                *flags |= LOG_CONT;
                }
        }

        return consumed;
}

/*
 * Format a message into @text (at most @size bytes) and post-process it:
 * a trailing newline is dropped from the returned length and recorded as
 * LOG_NEWLINE in @flags, and for facility 0 the leading level/control prefix
 * is removed by shifting the text down. The result is passed to
 * trace_console().
 *
 * @flags must not be NULL.
 *
 * Return: The length of the remaining text.
 */
__printf(5, 0)
static u16 printk_sprint(char *text, u16 size, int facility,
                         enum printk_info_flags *flags, const char *fmt,
                         va_list args)
{
        u16 len = vscnprintf(text, size, fmt, args);

        /* Mark and strip a trailing newline. */
        if (len > 0 && text[len - 1] == '\n') {
                *flags |= LOG_NEWLINE;
                len--;
        }

        /* Strip log level and control flags. */
        if (facility == 0) {
                u16 skip = printk_parse_prefix(text, NULL, NULL);

                if (skip) {
                        len -= skip;
                        memmove(text, text + skip, len);
                }
        }

        trace_console(text, len);

        return len;
}

/*
 * Format a message and store it as a record in the printk ringbuffer.
 *
 * @facility: Stored in the record. A value of 0 additionally enables parsing
 *            and stripping of the level/control prefix in the message text.
 * @level:    Requested log level; LOGLEVEL_DEFAULT is replaced by a parsed
 *            level (facility 0 only) or by default_message_loglevel.
 * @dev_info: Optional device information copied into a new record. When
 *            non-NULL the record is always finalized (LOG_NEWLINE is forced).
 * @fmt:      printf-style format string.
 * @args:     Arguments for @fmt.
 *
 * A message flagged LOG_CONT is first tried as an extension of the last
 * record from the same caller; if that fails, a new record is reserved.
 *
 * Return: The number of text bytes stored (including trunc_msg_len when the
 * message had to be truncated), or 0 if the recursion limit was hit or no
 * space could be reserved.
 */
__printf(4, 0)
int vprintk_store(int facility, int level,
                  const struct dev_printk_info *dev_info,
                  const char *fmt, va_list args)
{
        struct prb_reserved_entry e;
        enum printk_info_flags flags = 0;
        struct printk_record r;
        unsigned long irqflags;
        u16 trunc_msg_len = 0;
        char prefix_buf[8];
        u8 *recursion_ptr;
        u16 reserve_size;
        va_list args2;
        u32 caller_id;
        u16 text_len;
        int ret = 0;
        u64 ts_nsec;

        /* Too deep in printk recursion: drop the message. */
        if (!printk_enter_irqsave(recursion_ptr, irqflags))
                return 0;

        /*
         * Since the duration of printk() can vary depending on the message
         * and state of the ringbuffer, grab the timestamp now so that it is
         * close to the call of printk(). This provides a more deterministic
         * timestamp with respect to the caller.
         */
        ts_nsec = local_clock();

        caller_id = printk_caller_id();

        /*
         * The sprintf needs to come first since the syslog prefix might be
         * passed in as a parameter. An extra byte must be reserved so that
         * later the vscnprintf() into the reserved buffer has room for the
         * terminating '\0', which is not counted by vsnprintf().
         *
         * A copy of @args is used here because @args itself is consumed
         * later by printk_sprint(). Only the start of the message lands in
         * the small @prefix_buf, which is what the prefix parsing below
         * looks at.
         */
        va_copy(args2, args);
        reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;
        va_end(args2);

        if (reserve_size > PRINTKRB_RECORD_MAX)
                reserve_size = PRINTKRB_RECORD_MAX;

        /* Extract log level or control flags. */
        if (facility == 0)
                printk_parse_prefix(&prefix_buf[0], &level, &flags);

        if (level == LOGLEVEL_DEFAULT)
                level = default_message_loglevel;

        if (dev_info)
                flags |= LOG_NEWLINE;

        /* Continuation: try to append to the last record of this caller. */
        if (flags & LOG_CONT) {
                prb_rec_init_wr(&r, reserve_size);
                if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) {
                        text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
                                                 facility, &flags, fmt, args);
                        r.info->text_len += text_len;

                        if (flags & LOG_NEWLINE) {
                                r.info->flags |= LOG_NEWLINE;
                                prb_final_commit(&e);
                        } else {
                                prb_commit(&e);
                        }

                        ret = text_len;
                        goto out;
                }
        }

        /*
         * Explicitly initialize the record before every prb_reserve() call.
         * prb_reserve_in_last() and prb_reserve() purposely invalidate the
         * structure when they fail.
         */
        prb_rec_init_wr(&r, reserve_size);
        if (!prb_reserve(&e, prb, &r)) {
                /* truncate the message if it is too long for empty buffer */
                truncate_msg(&reserve_size, &trunc_msg_len);

                prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
                if (!prb_reserve(&e, prb, &r))
                        goto out;
        }

        /* fill message */
        text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args);
        if (trunc_msg_len)
                memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
        r.info->text_len = text_len + trunc_msg_len;
        r.info->facility = facility;
        /* NOTE(review): masks presumably match the field widths - confirm. */
        r.info->level = level & 7;
        r.info->flags = flags & 0x1f;
        r.info->ts_nsec = ts_nsec;
        r.info->caller_id = caller_id;
        if (dev_info)
                memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));

        /* A message without a trailing newline can be continued. */
        if (!(flags & LOG_NEWLINE))
                prb_commit(&e);
        else
                prb_final_commit(&e);

        ret = text_len + trunc_msg_len;
out:
        printk_exit_irqrestore(recursion_ptr, irqflags);
        return ret;
}

/*
 * Store a message in the ringbuffer and get it out to the consoles.
 *
 * Messages are dropped (return 0) when printk is suppressed or when another
 * CPU is in panic. A @level of LOGLEVEL_SCHED is stored as LOGLEVEL_DEFAULT
 * and its console output is deferred instead of being printed directly.
 *
 * Return: The value returned by vprintk_store(), or 0 if the message was
 * dropped.
 */
asmlinkage int vprintk_emit(int facility, int level,
                            const struct dev_printk_info *dev_info,
                            const char *fmt, va_list args)
{
        bool in_sched;
        int printed_len;

        /* Suppress unimportant messages after panic happens */
        if (unlikely(suppress_printk))
                return 0;

        /*
         * The messages on the panic CPU are the most important. If
         * non-panic CPUs are generating any messages, they will be
         * silently dropped.
         */
        if (other_cpu_in_panic())
                return 0;

        in_sched = (level == LOGLEVEL_SCHED);
        if (in_sched)
                level = LOGLEVEL_DEFAULT;

        printk_delay(level);

        printed_len = vprintk_store(facility, level, dev_info, fmt, args);

        /* If called from the scheduler, we can not call up(). */
        if (in_sched) {
                defer_console_output();
                return printed_len;
        }

        /*
         * The caller may be holding system-critical or timing-sensitive
         * locks. Disable preemption during printing of all remaining
         * records to all consoles so that this context can return as soon
         * as possible. Hopefully another printk() caller will take over the
         * printing.
         */
        preempt_disable();
        /*
         * Try to acquire and then immediately release the console
         * semaphore. The release will print out buffers. With the
         * spinning variant, this context tries to take over the
         * printing from another printing context.
         */
        if (console_trylock_spinning())
                console_unlock();
        preempt_enable();

        wake_up_klogd();

        return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);

/*
 * Emit a message with facility 0, LOGLEVEL_DEFAULT and no device
 * information.
 *
 * Return: The value returned by vprintk_emit().
 */
int vprintk_default(const char *fmt, va_list args)
{
        return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);

/*
 * Variadic front end: collects the arguments into a va_list and hands them
 * to vprintk().
 *
 * Return: The value returned by vprintk().
 */
asmlinkage __visible int _printk(const char *fmt, ...)
{
        va_list args;
        int r;

        va_start(args, fmt);
        r = vprintk(fmt, args);
        va_end(args);

        return r;
}
EXPORT_SYMBOL(_printk);

/* Forward declarations; the CONFIG_PRINTK implementations are defined later. */
static bool pr_flush(int timeout_ms, bool reset_on_progress);
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);

#else /* CONFIG_PRINTK */

/* Without CONFIG_PRINTK, timestamps are never printed. */
#define printk_time                false

/* Ringbuffer stubs: there is never a record to read. */
#define prb_read_valid(rb, seq, r)        false
#define prb_first_valid_seq(rb)                0
#define prb_next_seq(rb)                0

static u64 syslog_seq;

/* Flush stubs: report success unconditionally. */
static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }

#endif /* CONFIG_PRINTK */

#ifdef CONFIG_EARLY_PRINTK
/* Console used by early_printk(); NULL while none is set. */
struct console *early_console;

/*
 * Format a message into an on-stack buffer and write it directly through
 * @early_console. Output longer than the buffer is truncated. Nothing
 * happens when @early_console is not set.
 */
asmlinkage __visible void early_printk(const char *fmt, ...)
{
        char msg[512];
        va_list args;
        int msg_len;

        if (!early_console)
                return;

        va_start(args, fmt);
        msg_len = vscnprintf(msg, sizeof(msg), fmt, args);
        va_end(args);

        early_console->write(early_console, msg, msg_len);
}
#endif

/*
 * Record that console @c was defined by the user on the command line.
 * The mark is only ever set, never cleared, so it survives the same console
 * being added again by SPCR or the device tree.
 */
static void set_user_specified(struct console_cmdline *c, bool user_specified)
{
        if (user_specified) {
                c->user_specified = true;
                /* At least one console defined by the user on the command line. */
                console_set_on_cmdline = 1;
        }
}

/*
 * Add a console to the console_cmdline table, or refresh the entry that
 * already matches @name and @idx. Unless braille options are given, the
 * affected entry becomes the preferred console.
 *
 * Return: 0 on success, -EINVAL for a negative @idx, -E2BIG when the table
 * is full.
 */
static int __add_preferred_console(const char *name, const short idx, char *options,
                                   char *brl_options, bool user_specified)
{
        struct console_cmdline *c;
        int i;

        /*
         * We use a signed short index for struct console for device drivers to
         * indicate a not yet assigned index or port. However, a negative index
         * value is not valid for preferred console.
         */
        if (idx < 0)
                return -EINVAL;

        /* Look for an existing entry; stop at the first unused slot. */
        for (i = 0; i < MAX_CMDLINECONSOLES; i++) {
                c = &console_cmdline[i];

                if (!c->name[0])
                        break;

                if (strcmp(c->name, name) != 0 || c->index != idx)
                        continue;

                if (!brl_options)
                        preferred_console = i;
                set_user_specified(c, user_specified);
                return 0;
        }

        if (i == MAX_CMDLINECONSOLES)
                return -E2BIG;

        /* Not registered yet: fill in the free slot. */
        c = &console_cmdline[i];
        if (!brl_options)
                preferred_console = i;
        strscpy(c->name, name, sizeof(c->name));
        c->options = options;
        set_user_specified(c, user_specified);
        braille_set_options(c, brl_options);

        c->index = idx;
        return 0;
}

/*
 * Handle the "console_msg_format=" boot parameter. Only "syslog" and
 * "default" are recognized; any other value leaves the format unchanged.
 */
static int __init console_msg_format_setup(char *str)
{
        if (strcmp(str, "syslog") == 0)
                console_msg_format = MSG_FORMAT_SYSLOG;
        else if (strcmp(str, "default") == 0)
                console_msg_format = MSG_FORMAT_DEFAULT;

        return 1;
}
__setup("console_msg_format=", console_msg_format_setup);

/*
 * Set up a console.  Called via do_early_param() in init/main.c
 * for each "console=" parameter in the boot command line.
 *
 * @str is the parameter value, "name[index][,options]". It is modified in
 * place: the options are split off by overwriting the ',' with '\0'.
 *
 * Always returns 1.
 */
static int __init console_setup(char *str)
{
        char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
        char *s, *options, *brl_options = NULL;
        int idx;

        /*
         * console="" or console=null have been suggested as a way to
         * disable console output. Use ttynull that has been created
         * for exactly this purpose.
         */
        if (str[0] == 0 || strcmp(str, "null") == 0) {
                __add_preferred_console("ttynull", 0, NULL, NULL, true);
                return 1;
        }

        if (_braille_console_setup(&str, &brl_options))
                return 1;

        /* Save the console for driver subsystem use */
        if (console_opt_save(str, brl_options))
                return 1;

        /* Flag register_console() to not call try_enable_default_console() */
        console_set_on_cmdline = 1;

        /* Don't attempt to parse a DEVNAME:0.0 style console */
        if (strchr(str, ':'))
                return 1;

        /*
         * Decode str into name, index, options.
         *
         * A value starting with a digit is shorthand for a "ttyS" console.
         * The copy is taken before the options are split off, so @buf may
         * still carry the ",options" part; the scan below stops at the ','.
         */
        if (isdigit(str[0]))
                scnprintf(buf, sizeof(buf), "ttyS%s", str);
        else
                strscpy(buf, str);

        /* Terminate the name in @str and point @options past the ','. */
        options = strchr(str, ',');
        if (options)
                *(options++) = 0;

#ifdef __sparc__
        if (!strcmp(str, "ttya"))
                strscpy(buf, "ttyS0");
        if (!strcmp(str, "ttyb"))
                strscpy(buf, "ttyS1");
#endif

        /* Find where the name ends: at the first digit or at the ','. */
        for (s = buf; *s; s++)
                if (isdigit(*s) || *s == ',')
                        break;
        /* Parse the index, then cut @buf down to the bare name. */
        idx = simple_strtoul(s, NULL, 10);
        *s = 0;

        __add_preferred_console(buf, idx, options, brl_options, true);
        return 1;
}
__setup("console=", console_setup);

/*
 * Only called from add_preferred_console_match()
 *
 * Adds the console as a user-specified preferred console.
 *
 * Return: The value returned by __add_preferred_console().
 */
int console_opt_add_preferred_console(const char *name, const short idx,
                                      char *options, char *brl_options)
{
        return __add_preferred_console(name, idx, options, brl_options, true);
}

/**
 * add_preferred_console - add a device to the list of preferred consoles.
 * @name: device name
 * @idx: device index
 * @options: options for this console
 *
 * The last preferred console added will be used for kernel messages
 * and stdin/out/err for init.  Normally this is used by console_setup
 * above to handle user-supplied console arguments; however it can also
 * be used by arch-specific code either to override the user or more
 * commonly to provide a default console (ie from PROM variables) when
 * the user has not supplied one.
 *
 * The console is added without braille options and is not marked as
 * user-specified.
 *
 * Return: 0 on success, -EINVAL if @idx is negative, -E2BIG if there is no
 * free slot left.
 */
int add_preferred_console(const char *name, const short idx, char *options)
{
        return __add_preferred_console(name, idx, options, NULL, false);
}

/*
 * When false, suspend_console() and resume_console() do nothing. Cleared by
 * the "no_console_suspend" boot parameter and also exposed as the
 * "console_suspend" module parameter.
 */
bool console_suspend_enabled = true;
EXPORT_SYMBOL(console_suspend_enabled);

/* Handle the "no_console_suspend" boot parameter; @str is ignored. */
static int __init console_suspend_disable(char *str)
{
        console_suspend_enabled = false;
        return 1;
}
__setup("no_console_suspend", console_suspend_disable);
module_param_named(console_suspend, console_suspend_enabled,
                bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
        " and hibernate operations");

/* When set, console_verbose() leaves the console loglevel alone. */
static bool printk_console_no_auto_verbose;

/*
 * Raise the console loglevel to CONSOLE_LOGLEVEL_MOTORMOUTH. Nothing is
 * changed when the console loglevel is 0 or when the automatic raise has
 * been disabled through the console_no_auto_verbose parameter.
 */
void console_verbose(void)
{
        if (!console_loglevel || printk_console_no_auto_verbose)
                return;

        console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
}
EXPORT_SYMBOL_GPL(console_verbose);

module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644);
MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc");

/**
 * suspend_console - suspend the console subsystem
 *
 * This disables printk() while we go into suspend states
 *
 * Does nothing when console suspending has been disabled
 * (see @console_suspend_enabled).
 */
void suspend_console(void)
{
        struct console *con;

        if (!console_suspend_enabled)
                return;
        pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
        /* Push out pending records, including the notice above, first. */
        pr_flush(1000, true);

        /* Mark every registered console as suspended. */
        console_list_lock();
        for_each_console(con)
                console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
        console_list_unlock();

        /*
         * Ensure that all SRCU list walks have completed. All printing
         * contexts must be able to see that they are suspended so that it
         * is guaranteed that all printing has stopped when this function
         * completes.
         */
        synchronize_srcu(&console_srcu);
}

/**
 * resume_console - resume the console subsystem
 *
 * Clears CON_SUSPENDED on every registered console and then flushes the
 * pending records. Does nothing when console suspending has been disabled
 * (see @console_suspend_enabled).
 */
void resume_console(void)
{
        struct console *con;

        if (!console_suspend_enabled)
                return;

        console_list_lock();
        for_each_console(con)
                console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
        console_list_unlock();

        /*
         * Ensure that all SRCU list walks have completed. All printing
         * contexts must be able to see they are no longer suspended so
         * that they are guaranteed to wake up and resume printing.
         */
        synchronize_srcu(&console_srcu);

        pr_flush(1000, true);
}

/**
 * console_cpu_notify - print deferred console messages after CPU hotplug
 * @cpu: unused
 *
 * If printk() is called from a CPU that is not online yet, the messages
 * will be printed on the console only if there are CON_ANYTIME consoles.
 * This function is called when a new CPU comes online (or fails to come
 * up) or goes offline.
 *
 * Nothing is done while cpuhp_tasks_frozen is set.
 *
 * Return: Always 0.
 */
static int console_cpu_notify(unsigned int cpu)
{
        /* If trylock fails, someone else is doing the printing */
        if (!cpuhp_tasks_frozen && console_trylock())
                console_unlock();

        return 0;
}

/**
 * console_lock - block the console subsystem from printing
 *
 * Acquires a lock which guarantees that no consoles will
 * be in or enter their write() callback.
 *
 * Can sleep, returns nothing.
 */
void console_lock(void)
{
        might_sleep();

        /*
         * On panic, the console_lock must be left to the panic cpu.
         * Sleep in one second steps for as long as another CPU is in panic.
         */
        while (other_cpu_in_panic())
                msleep(1000);

        down_console_sem();
        console_locked = 1;
        /* Taken from a context that can sleep, so the owner may reschedule. */
        console_may_schedule = 1;
}
EXPORT_SYMBOL(console_lock);

/**
 * console_trylock - try to block the console subsystem from printing
 *
 * Non-blocking attempt to take the lock which guarantees that no consoles
 * will be in or enter their write() callback. The attempt always fails
 * while another CPU is in panic.
 *
 * returns 1 on success, and 0 on failure to acquire the lock.
 */
int console_trylock(void)
{
        /* On panic, the console_lock must be left to the panic cpu. */
        if (other_cpu_in_panic() || down_trylock_console_sem())
                return 0;

        console_locked = 1;
        console_may_schedule = 0;
        return 1;
}
EXPORT_SYMBOL(console_trylock);

/*
 * Report whether the console lock is currently marked as held.
 *
 * Return: The current value of @console_locked (non-zero when held).
 */
int is_console_locked(void)
{
        return console_locked;
}
EXPORT_SYMBOL(is_console_locked);

/*
 * Tell whether @con can print records right now: it must be enabled, not
 * suspended, and provide a write() callback. On a CPU that is not online
 * yet, only CON_ANYTIME consoles qualify.
 *
 * Requires the console_srcu_read_lock.
 */
static inline bool console_is_usable(struct console *con)
{
        short flags = console_srcu_read_flags(con);

        if (!(flags & CON_ENABLED) || (flags & CON_SUSPENDED))
                return false;

        if (!con->write)
                return false;

        /*
         * Console drivers may assume that per-cpu resources have been
         * allocated. So unless they're explicitly marked as being able to
         * cope (CON_ANYTIME) don't call them until this CPU is officially up.
         */
        return cpu_online(raw_smp_processor_id()) || (flags & CON_ANYTIME);
}

/*
 * Release the console lock without doing any printing: clear the
 * @console_locked marker and then release the console semaphore.
 */
static void __console_unlock(void)
{
        console_locked = 0;
        up_console_sem();
}

#ifdef CONFIG_PRINTK

/*
 * Insert a "dropped messages" notice in front of the message held in
 * @pmsg->pbufs->outbuf. The notice is built in the scratch buffer, the
 * existing message is shifted towards the end of outbuf and the notice is
 * copied into the gap.
 *
 * @pmsg is the printk message to prepend.
 *
 * @dropped is the dropped count to report in the notice.
 *
 * If outbuf cannot hold both the notice and the message, the message text is
 * truncated just enough to fit (and stays terminated).
 *
 * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
 */
void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
{
        struct printk_buffers *pbufs = pmsg->pbufs;
        char *notice = &pbufs->scratchbuf[0];
        char *msg = &pbufs->outbuf[0];
        const size_t notice_max = sizeof(pbufs->scratchbuf);
        const size_t msg_max = sizeof(pbufs->outbuf);
        size_t notice_len;

        notice_len = scnprintf(notice, notice_max,
                               "** %lu printk messages dropped **\n", dropped);

        /*
         * Make sure outbuf is sufficiently large before prepending.
         * Keep at least the prefix when the message must be truncated.
         * It is a rather theoretical problem when someone tries to
         * use a minimalist buffer.
         */
        if (WARN_ON_ONCE(notice_len + PRINTK_PREFIX_MAX >= msg_max))
                return;

        /* Truncate the message, but keep it terminated. */
        if (pmsg->outbuf_len + notice_len >= msg_max) {
                pmsg->outbuf_len = msg_max - (notice_len + 1);
                msg[pmsg->outbuf_len] = 0;
        }

        /* Shift the message (including its terminator), then add the notice. */
        memmove(msg + notice_len, msg, pmsg->outbuf_len + 1);
        memcpy(msg, notice, notice_len);
        pmsg->outbuf_len += notice_len;
}

/*
 * Read and format the specified record (or a later record if the specified
 * record is not available).
 *
 * @pmsg will contain the formatted result. @pmsg->pbufs must point to a
 * struct printk_buffers.
 *
 * @seq is the record to read and format. If it is not available, the next
 * valid record is read.
 *
 * @is_extended specifies if the message should be formatted for extended
 * console output.
 *
 * @may_supress specifies if records may be skipped based on loglevel.
 *
 * Returns false if no record is available. Otherwise true and all fields
 * of @pmsg are valid. (See the documentation of struct printk_message
 * for information about the @pmsg fields.)
 */
bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
                             bool is_extended, bool may_suppress)
{
        struct printk_buffers *bufs = pmsg->pbufs;
        char *const out = bufs->outbuf;
        const size_t out_sz = sizeof(bufs->outbuf);
        struct printk_info info;
        struct printk_record rec;
        size_t out_len = 0;

        /*
         * Extended formatting reads from one buffer and writes to another,
         * so the raw text goes to the scratch buffer in that case. Normal
         * formatting works in place, so the raw text is read straight into
         * the output buffer.
         */
        if (is_extended)
                prb_rec_init_rd(&rec, &info, bufs->scratchbuf,
                                sizeof(bufs->scratchbuf));
        else
                prb_rec_init_rd(&rec, &info, out, out_sz);

        if (!prb_read_valid(prb, seq, &rec))
                return false;

        /* The record actually read may be later than the one asked for. */
        pmsg->seq = rec.info->seq;
        pmsg->dropped = rec.info->seq - seq;

        if (may_suppress && suppress_message_printing(rec.info->level)) {
                /*
                 * Level is above the console loglevel: report the record
                 * with a formatted length of 0.
                 */
        } else if (is_extended) {
                out_len = info_print_ext_header(out, out_sz, rec.info);
                out_len += msg_print_ext_body(out + out_len, out_sz - out_len,
                                              &rec.text_buf[0], rec.info->text_len,
                                              &rec.info->dev_info);
        } else {
                out_len = record_print_text(&rec,
                                            console_msg_format & MSG_FORMAT_SYSLOG,
                                            printk_time);
        }

        pmsg->outbuf_len = out_len;
        return true;
}

/*
 * Used as the printk buffers for non-panic, serialized console printing.
 * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles.
 * Its usage requires the console_lock held.
 *
 * console_emit_next_record() formats each record into these buffers and
 * passes the output buffer to the console's write() callback.
 */
struct printk_buffers printk_shared_pbufs;

/*
 * Print one record for the given console. The record printed is whatever
 * record is the next available record for the given console.
 *
 * @handover will be set to true if a printk waiter has taken over the
 * console_lock, in which case the caller is no longer holding both the
 * console_lock and the SRCU read lock. Otherwise it is set to false.
 *
 * @cookie is the cookie from the SRCU read lock.
 *
 * Returns false if the given console has no next record to print, otherwise
 * true.
 *
 * Requires the console_lock and the SRCU read lock.
 */
static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
{
        bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
        /* Same buffer that printk_get_next_message() formats into via @pmsg. */
        char *outbuf = &printk_shared_pbufs.outbuf[0];
        struct printk_message pmsg = {
                .pbufs = &printk_shared_pbufs,
        };
        unsigned long flags;

        *handover = false;

        /* Records may be suppressed by loglevel here (may_suppress == true). */
        if (!printk_get_next_message(&pmsg, con->seq, is_extended, true))
                return false;

        /* Accumulate records that were lost before the one just read. */
        con->dropped += pmsg.dropped;

        /* Skip messages of formatted length 0. */
        if (pmsg.outbuf_len == 0) {
                /* Still advance past the record so it is not read again. */
                con->seq = pmsg.seq + 1;
                goto skip;
        }

        /*
         * The dropped notice is only prepended for non-extended consoles.
         * For extended consoles @con->dropped is left accumulating.
         */
        if (con->dropped && !is_extended) {
                console_prepend_dropped(&pmsg, con->dropped);
                con->dropped = 0;
        }

        /*
         * While actively printing out messages, if another printk()
         * were to occur on another CPU, it may wait for this one to
         * finish. This task can not be preempted if there is a
         * waiter waiting to take over.
         *
         * Interrupts are disabled because the hand over to a waiter
         * must not be interrupted until the hand over is completed
         * (@console_waiter is cleared).
         */
        printk_safe_enter_irqsave(flags);
        console_lock_spinning_enable();

        /* Do not trace print latency. */
        stop_critical_timings();

        /* Write everything out to the hardware. */
        con->write(con, outbuf, pmsg.outbuf_len);

        start_critical_timings();

        /* The record has been written; the next call starts after it. */
        con->seq = pmsg.seq + 1;

        /* Report to the caller whether a waiter took over the console_lock. */
        *handover = console_lock_spinning_disable_and_check(cookie);
        printk_safe_exit_irqrestore(flags);
skip:
        return true;
}

#else

/*
 * Variant built when CONFIG_PRINTK is not set: it never emits a record,
 * so it reports no handover and no progress.
 */
static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
{
        *handover = false;
        return false;
}

#endif /* CONFIG_PRINTK */

/*
 * Print out all remaining records to all consoles.
 *
 * @do_cond_resched is set by the caller. It can be true only in schedulable
 * context.
 *
 * @next_seq is set to the sequence number after the last available record.
 * The value is valid only when this function returns true. It means that all
 * usable consoles are completely flushed.
 *
 * @handover will be set to true if a printk waiter has taken over the
 * console_lock, in which case the caller is no longer holding the
 * console_lock. Otherwise it is set to false.
 *
 * Returns true when there was at least one usable console and all messages
 * were flushed to all usable consoles. A returned false informs the caller
 * that everything was not flushed (either there were no usable consoles or
 * another context has taken over printing or it is a panic situation and this
 * is not the panic CPU). Regardless the reason, the caller should assume it
 * is not useful to immediately try again.
 *
 * Requires the console_lock.
 */
static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
{
        bool any_usable = false;
        struct console *con;
        bool any_progress;
        int cookie;

        /* Start from 0 so the maximum over all visited consoles can be tracked. */
        *next_seq = 0;
        *handover = false;

        /*
         * Each pass emits at most one record per usable console. Passes
         * repeat until a full pass emits nothing on any console.
         */
        do {
                any_progress = false;

                cookie = console_srcu_read_lock();
                for_each_console_srcu(con) {
                        bool progress;

                        if (!console_is_usable(con))
                                continue;
                        any_usable = true;

                        progress = console_emit_next_record(con, handover, cookie);

                        /*
                         * If a handover has occurred, the SRCU read lock
                         * is already released.
                         */
                        if (*handover)
                                return false;

                        /* Track the next of the highest seq flushed. */
                        if (con->seq > *next_seq)
                                *next_seq = con->seq;

                        if (!progress)
                                continue;
                        any_progress = true;

                        /* Allow panic_cpu to take over the consoles safely. */
                        if (other_cpu_in_panic())
                                goto abandon;

                        if (do_cond_resched)
                                cond_resched();
                }
                console_srcu_read_unlock(cookie);
        } while (any_progress);

        return any_usable;

abandon:
        /* Left the list walk early: the SRCU read lock is still held here. */
        console_srcu_read_unlock(cookie);
        return false;
}

/**
 * console_unlock - unblock the console subsystem from printing
 *
 * Releases the console_lock which the caller holds to block printing of
 * the console subsystem.
 *
 * While the console_lock was held, console output may have been buffered
 * by printk().  If this is the case, console_unlock(); emits
 * the output prior to releasing the lock.
 *
 * console_unlock(); may be called from any context.
 */
void console_unlock(void)
{
        bool do_cond_resched;
        bool handover;
        bool flushed;
        u64 next_seq;

        /*
         * Console drivers are called with interrupts disabled, so
         * @console_may_schedule should be cleared before; however, we may
         * end up dumping a lot of lines, for example, if called from
         * console registration path, and should invoke cond_resched()
         * between lines if allowable.  Not doing so can cause a very long
         * scheduling stall on a slow console leading to RCU stall and
         * softlockup warnings which exacerbate the issue with more
         * messages practically incapacitating the system. Therefore, create
         * a local to use for the printing loop.
         */
        do_cond_resched = console_may_schedule;

        do {
                console_may_schedule = 0;

                flushed = console_flush_all(do_cond_resched, &next_seq, &handover);
                /* After a handover the lock already belongs to the waiter. */
                if (!handover)
                        __console_unlock();

                /*
                 * Abort if there was a failure to flush all messages to all
                 * usable consoles. Either it is not possible to flush (in
                 * which case it would be an infinite loop of retrying) or
                 * another context has taken over printing.
                 */
                if (!flushed)
                        break;

                /*
                 * Some context may have added new records after
                 * console_flush_all() but before unlocking the console.
                 * Re-check if there is a new record to flush. If the trylock
                 * fails, another context is already handling the printing.
                 */
        } while (prb_read_valid(prb, next_seq, NULL) && console_trylock());
}
EXPORT_SYMBOL(console_unlock);

/**
 * console_conditional_schedule - yield the CPU if required
 *
 * If the console code is currently allowed to sleep, and
 * if this CPU should yield the CPU to another task, do
 * so here.
 *
 * Must be called within console_lock();.
 */
void __sched console_conditional_schedule(void)
{
        /* Nothing to do unless the console code is currently allowed to sleep. */
        if (!console_may_schedule)
                return;

        cond_resched();
}
EXPORT_SYMBOL(console_conditional_schedule);

/*
 * Call the unblank() callback of every enabled console that provides one,
 * then flush pending output.
 */
void console_unblank(void)
{
        bool have_unblank = false;
        struct console *con;
        int cookie;

        /*
         * Look for at least one enabled console with an unblank() callback
         * before touching the console lock. Taking the lock can be
         * dangerous, in particular when @oops_in_progress is set, so avoid
         * it when there is nothing to call.
         */
        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                if (!(console_srcu_read_flags(con) & CON_ENABLED))
                        continue;
                if (!con->unblank)
                        continue;

                have_unblank = true;
                break;
        }
        console_srcu_read_unlock(cookie);

        if (!have_unblank)
                return;

        /*
         * Printing must be stopped because unblank() may assume that the
         * console is not inside its write() callback.
         */
        if (!oops_in_progress) {
                console_lock();
        } else {
                /*
                 * This may be an atomic context, so only a best-effort
                 * trylock is attempted. Semaphores are not NMI-safe.
                 */
                if (in_nmi())
                        return;

                /*
                 * The trylock can deadlock if another CPU was stopped
                 * while modifying the semaphore. "Hope and pray" that this
                 * is not the current situation.
                 */
                if (down_trylock_console_sem() != 0)
                        return;
        }

        console_locked = 1;
        console_may_schedule = 0;

        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                if (!(console_srcu_read_flags(con) & CON_ENABLED))
                        continue;
                if (con->unblank)
                        con->unblank();
        }
        console_srcu_read_unlock(cookie);

        console_unlock();

        if (!oops_in_progress)
                pr_flush(1000, true);
}

/*
 * Rewind all consoles to the oldest available record.
 *
 * IMPORTANT: The function is safe only when called under
 *            console_lock(). It is not enforced because
 *            it is used as a best effort in panic().
 */
static void __console_rewind_all(void)
{
        const u64 oldest = prb_first_valid_seq(prb);
        struct console *con;
        int cookie;

        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                if (!(console_srcu_read_flags(con) & CON_NBCON)) {
                        /*
                         * A plain assignment is only safe under
                         * console_lock(). On panic, legacy consoles are
                         * handled as best effort.
                         */
                        con->seq = oldest;
                        continue;
                }

                nbcon_seq_force(con, oldest);
        }
        console_srcu_read_unlock(cookie);
}

/**
 * console_flush_on_panic - flush console content on panic
 * @mode: flush all messages in buffer or just the pending ones
 *
 * Immediately output all pending messages no matter what.
 */
void console_flush_on_panic(enum con_flush_mode mode)
{
        /* Outputs of console_flush_all() that are not needed here. */
        bool ignored_handover;
        u64 ignored_seq;

        /*
         * The console lock is deliberately not taken. A trylock would not
         * help because:
         *
         *   - a contended lock would have to be ignored anyway
         *   - console_lock() blocks and console_trylock() fails in panic
         *     for non-panic CPUs
         *   - semaphores are not NMI-safe
         */

        /*
         * Another context holding the console lock may have left
         * @console_may_schedule set. Clear it so that flushing from this
         * context never calls cond_resched().
         */
        console_may_schedule = 0;

        /* Optionally replay everything from the oldest available record. */
        if (mode == CONSOLE_REPLAY_ALL)
                __console_rewind_all();

        console_flush_all(false, &ignored_seq, &ignored_handover);
}

/*
 * Return the console tty driver structure and its associated index
 */
struct tty_driver *console_device(int *index)
{
        struct tty_driver *drv = NULL;
        struct console *con;
        int cookie;

        /*
         * The console_lock serializes the device() callback with other
         * console operations. For example, fg_console is modified under
         * console_lock when switching vt.
         */
        console_lock();

        /* Take the first console whose device() callback yields a driver. */
        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                if (con->device)
                        drv = con->device(con, index);
                if (drv)
                        break;
        }
        console_srcu_read_unlock(cookie);

        console_unlock();

        return drv;
}

/*
 * Prevent further output on the passed console device so that (for example)
 * serial drivers can disable console output before suspending a port, and can
 * re-enable output afterwards.
 */
void console_stop(struct console *console)
{
        short stopped_flags;

        /* Let pending output for this console drain before disabling it. */
        __pr_flush(console, 1000, true);

        console_list_lock();
        stopped_flags = console->flags & ~CON_ENABLED;
        console_srcu_write_flags(console, stopped_flags);
        console_list_unlock();

        /*
         * Wait for every SRCU list walk to finish. After this, all contexts
         * see the console as disabled, so (for example) the caller can
         * suspend the port without another context still using it.
         */
        synchronize_srcu(&console_srcu);
}
EXPORT_SYMBOL(console_stop);

/* Re-enable output on @console and flush what is pending for it. */
void console_start(struct console *console)
{
        short started_flags;

        console_list_lock();
        started_flags = console->flags | CON_ENABLED;
        console_srcu_write_flags(console, started_flags);
        console_list_unlock();

        __pr_flush(console, 1000, true);
}
EXPORT_SYMBOL(console_start);

/*
 * Non-zero when the "keep_bootcon" early parameter was given. The checks
 * in console_init_seq() and register_console() then skip the automatic
 * handling of boot consoles that is tied to registering a real console.
 */
static int __read_mostly keep_bootcon;

/* Early-parameter handler for "keep_bootcon"; @str is not used. */
static int __init keep_bootcon_setup(char *str)
{
        keep_bootcon = 1;
        pr_info("debug: skip boot console de-registration.\n");

        return 0;
}

early_param("keep_bootcon", keep_bootcon_setup);

/*
 * Run the optional setup() callback of @newcon with @options.
 *
 * Returns 0 when there is no callback, otherwise the callback's result.
 */
static int console_call_setup(struct console *newcon, char *options)
{
        int ret = 0;

        if (newcon->setup) {
                /* Synchronize with possible boot console. */
                console_lock();
                ret = newcon->setup(newcon, options);
                console_unlock();
        }

        return ret;
}

/*
 * This is called by register_console() to try to match
 * the newly registered console with any of the ones selected
 * by either the command line or add_preferred_console() and
 * setup/enable it.
 *
 * Care needs to be taken with consoles that are statically
 * enabled, such as netconsole.
 */
/*
 * @newcon: console being registered.
 * @user_specified: selects which console_cmdline[] entries are considered;
 *                  only entries whose user_specified field equals this value
 *                  are matched.
 *
 * Returns 0 when the console was enabled, was accepted as pre-enabled, or
 * was taken over by the braille code; the error from setup() when that
 * failed; -ENOENT when nothing matched.
 */
static int try_enable_preferred_console(struct console *newcon,
                                        bool user_specified)
{
        struct console_cmdline *c;
        bool slot_user_specified;
        int i, err;

        for (i = 0, c = console_cmdline;
             i < MAX_CMDLINECONSOLES && c->name[0];
             i++, c++) {
                if (c->user_specified != user_specified)
                        continue;
                if (!newcon->match ||
                    newcon->match(newcon, c->name, c->index, c->options) != 0) {
                        /* default matching */
                        BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
                        if (strcmp(c->name, newcon->name) != 0)
                                continue;
                        if (newcon->index >= 0 &&
                            newcon->index != c->index)
                                continue;
                        if (newcon->index < 0)
                                newcon->index = c->index;

                        if (_braille_register_console(newcon, c))
                                return 0;

                        err = console_call_setup(newcon, c->options);
                        if (err)
                                return err;
                }
                newcon->flags |= CON_ENABLED;
                if (i == preferred_console)
                        newcon->flags |= CON_CONSDEV;
                return 0;
        }

        /*
         * Some consoles, such as pstore and netconsole, can be enabled even
         * without matching. Accept the pre-enabled consoles only when match()
         * and setup() had a chance to be called.
         *
         * The loop leaves @c at the first unused slot. When every slot of
         * console_cmdline[] is in use, @c points one past the end of the
         * array and must not be dereferenced. Treat that case as "not user
         * specified", which is what a zero-initialized unused slot reads as.
         */
        slot_user_specified = i < MAX_CMDLINECONSOLES && c->user_specified;
        if (newcon->flags & CON_ENABLED && slot_user_specified == user_specified)
                return 0;

        return -ENOENT;
}

/* Try to enable the console unconditionally */
static void try_enable_default_console(struct console *newcon)
{
        /* A console without a fixed index gets index 0. */
        if (newcon->index < 0)
                newcon->index = 0;

        /* Leave the console disabled when its setup() fails. */
        if (console_call_setup(newcon, NULL))
                return;

        newcon->flags |= CON_ENABLED;

        /* Only a console with a device() callback is marked CON_CONSDEV. */
        if (!newcon->device)
                return;

        newcon->flags |= CON_CONSDEV;
}

/*
 * Choose the first sequence number that @newcon will print.
 *
 * @bootcon_registered tells whether a boot console is currently registered.
 */
static void console_init_seq(struct console *newcon, bool bootcon_registered)
{
        struct console *con;
        bool handover;

        if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) {
                /* Get a consistent copy of @syslog_seq. */
                mutex_lock(&syslog_lock);
                newcon->seq = syslog_seq;
                mutex_unlock(&syslog_lock);
                return;
        }

        /* Begin with next message added to ringbuffer. */
        newcon->seq = prb_next_seq(prb);

        /*
         * Enabled boot consoles that are about to be unregistered may not
         * be caught up and may be the same device as @newcon. It is not
         * known which one is the same device, so flush all consoles and, if
         * necessary, start with the message of the enabled boot console
         * that is furthest behind. Nothing to do when no boot console is
         * registered or when boot consoles are kept.
         */
        if (!bootcon_registered || keep_bootcon)
                return;

        /*
         * The console_lock stops console printing and guarantees safe
         * access to console->seq.
         */
        console_lock();

        /*
         * Flush everything and let the flush report the next unprinted
         * sequence number directly into @newcon->seq.
         */
        if (!console_flush_all(true, &newcon->seq, &handover)) {
                /*
                 * Flushing failed. After a handover this context no longer
                 * holds the console_lock, so take it again first.
                 */
                if (handover)
                        console_lock();

                /* Fall back to the lowest seq of the enabled boot consoles. */
                newcon->seq = prb_next_seq(prb);
                for_each_console(con) {
                        if (!(con->flags & CON_BOOT))
                                continue;
                        if (!(con->flags & CON_ENABLED))
                                continue;
                        if (con->seq < newcon->seq)
                                newcon->seq = con->seq;
                }
        }

        console_unlock();
}

/*
 * Head entry of console_list as a struct console pointer. The callers in
 * this file only use it after checking that the list is not empty or that
 * some console is registered.
 */
#define console_first()                                \
        hlist_entry(console_list.first, struct console, node)

/* Defined further below; register_console() uses it to drop boot consoles. */
static int unregister_console_locked(struct console *console);

/*
 * The console driver calls this routine during kernel initialization
 * to register the console printing procedure with printk() and to
 * print any messages that were printed by the kernel before the
 * console driver was initialized.
 *
 * This can happen pretty early during the boot process (because of
 * early_printk) - sometimes before setup_arch() completes - be careful
 * of what kernel features are used - they may not be initialised yet.
 *
 * There are two types of consoles - bootconsoles (early_printk) and
 * "real" consoles (everything which is not a bootconsole) which are
 * handled differently.
 *  - Any number of bootconsoles can be registered at any time.
 *  - As soon as a "real" console is registered, all bootconsoles
 *    will be unregistered automatically.
 *  - Once a "real" console is registered, any attempt to register a
 *    bootconsole will be rejected.
 */
void register_console(struct console *newcon)
{
        struct console *con;
        bool bootcon_registered = false;
        bool realcon_registered = false;
        int err;

        /* Held for the whole registration; every exit goes through "unlock". */
        console_list_lock();

        /* Reject double registration and note which console kinds exist. */
        for_each_console(con) {
                if (WARN(con == newcon, "console '%s%d' already registered\n",
                                         con->name, con->index)) {
                        goto unlock;
                }

                if (con->flags & CON_BOOT)
                        bootcon_registered = true;
                else
                        realcon_registered = true;
        }

        /* Do not register boot consoles when there already is a real one. */
        if ((newcon->flags & CON_BOOT) && realcon_registered) {
                pr_info("Too late to register bootconsole %s%d\n",
                        newcon->name, newcon->index);
                goto unlock;
        }

        if (newcon->flags & CON_NBCON) {
                /*
                 * Ensure the nbcon console buffers can be allocated
                 * before modifying any global data.
                 */
                if (!nbcon_alloc(newcon))
                        goto unlock;
        }

        /*
         * See if we want to enable this console driver by default.
         *
         * Nope when a console is preferred by the command line, device
         * tree, or SPCR.
         *
         * The first real console with tty binding (driver) wins. More
         * consoles might get enabled before the right one is found.
         *
         * Note that a console with tty binding will have CON_CONSDEV
         * flag set and will be first in the list.
         */
        if (preferred_console < 0 && !console_set_on_cmdline) {
                if (hlist_empty(&console_list) || !console_first()->device ||
                    console_first()->flags & CON_BOOT) {
                        try_enable_default_console(newcon);
                }
        }

        /* See if this console matches one we selected on the command line */
        err = try_enable_preferred_console(newcon, true);

        /* If not, try to match against the platform default(s) */
        if (err == -ENOENT)
                err = try_enable_preferred_console(newcon, false);

        /* printk() messages are not printed to the Braille console. */
        if (err || newcon->flags & CON_BRL) {
                /* Undo the nbcon_alloc() done above before bailing out. */
                if (newcon->flags & CON_NBCON)
                        nbcon_free(newcon);
                goto unlock;
        }

        /*
         * If we have a bootconsole, and are switching to a real console,
         * don't print everything out again, since when the boot console, and
         * the real console are the same physical device, it's annoying to
         * see the beginning boot messages twice
         */
        if (bootcon_registered &&
            ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
                newcon->flags &= ~CON_PRINTBUFFER;
        }

        /* Start with a clean dropped counter and pick the first record to print. */
        newcon->dropped = 0;
        console_init_seq(newcon, bootcon_registered);

        if (newcon->flags & CON_NBCON)
                nbcon_init(newcon);

        /*
         * Put this console in the list - keep the
         * preferred driver at the head of the list.
         */
        if (hlist_empty(&console_list)) {
                /* Ensure CON_CONSDEV is always set for the head. */
                newcon->flags |= CON_CONSDEV;
                hlist_add_head_rcu(&newcon->node, &console_list);

        } else if (newcon->flags & CON_CONSDEV) {
                /* Only the new head can have CON_CONSDEV set. */
                console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV);
                hlist_add_head_rcu(&newcon->node, &console_list);

        } else {
                /* Not preferred: insert right behind the current head. */
                hlist_add_behind_rcu(&newcon->node, console_list.first);
        }

        /*
         * No need to synchronize SRCU here! The caller does not rely
         * on all contexts being able to see the new console before
         * register_console() completes.
         */

        console_sysfs_notify();

        /*
         * By unregistering the bootconsoles after we enable the real console
         * we get the "console xxx enabled" message on all the consoles -
         * boot consoles, real consoles, etc - this is to ensure that end
         * users know there might be something in the kernel's log buffer that
         * went to the bootconsole (that they do not see on the real console)
         */
        con_printk(KERN_INFO, newcon, "enabled\n");
        if (bootcon_registered &&
            ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
            !keep_bootcon) {
                struct hlist_node *tmp;

                /* The _safe walk is needed because entries are removed while iterating. */
                hlist_for_each_entry_safe(con, tmp, &console_list, node) {
                        if (con->flags & CON_BOOT)
                                unregister_console_locked(con);
                }
        }
unlock:
        console_list_unlock();
}
EXPORT_SYMBOL(register_console);

/* Must be called under console_list_lock(). */
/*
 * Remove @console from the console list and release what was set up for it.
 *
 * Returns a negative value from _braille_unregister_console() when that
 * fails, 0 when the braille code handled the console, -ENODEV when the
 * console is not registered, otherwise 0 or the result of the console's
 * exit() callback when it has one.
 */
static int unregister_console_locked(struct console *console)
{
        int res;

        lockdep_assert_console_list_lock_held();

        /* Note: printed before it is known whether the console is registered. */
        con_printk(KERN_INFO, console, "disabled\n");

        res = _braille_unregister_console(console);
        if (res < 0)
                return res;
        if (res > 0)
                return 0;

        /* Disable it unconditionally */
        console_srcu_write_flags(console, console->flags & ~CON_ENABLED);

        if (!console_is_registered_locked(console))
                return -ENODEV;

        hlist_del_init_rcu(&console->node);

        /*
         * <HISTORICAL>
         * If this isn't the last console and it has CON_CONSDEV set, we
         * need to set it on the next preferred console.
         * </HISTORICAL>
         *
         * The above makes no sense as there is no guarantee that the next
         * console has any device attached. Oh well....
         */
        if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV)
                console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV);

        /*
         * Ensure that all SRCU list walks have completed. All contexts
         * must not be able to see this console in the list so that any
         * exit/cleanup routines can be performed safely.
         */
        synchronize_srcu(&console_srcu);

        if (console->flags & CON_NBCON)
                nbcon_free(console);

        console_sysfs_notify();

        /* At this point res is 0; it only changes if exit() is provided. */
        if (console->exit)
                res = console->exit(console);

        return res;
}

/*
 * Locked wrapper: takes console_list_lock around
 * unregister_console_locked() and passes its result through.
 */
int unregister_console(struct console *console)
{
        int ret;

        console_list_lock();
        ret = unregister_console_locked(console);
        console_list_unlock();

        return ret;
}
EXPORT_SYMBOL(unregister_console);

/**
 * console_force_preferred_locked - force a registered console preferred
 * @con: The registered console to force preferred.
 *
 * Must be called under console_list_lock().
 */
void console_force_preferred_locked(struct console *con)
{
        struct console *old_head;

        /* Only a registered console can become the preferred one. */
        if (!console_is_registered_locked(con))
                return;

        /* Nothing to do when @con already is the head of the list. */
        old_head = console_first();
        if (old_head == con)
                return;

        /*
         * Unlink without re-initializing the node. That way the console
         * keeps looking registered to hlist_unhashed_lockless() style
         * checks during the short time it is off the list.
         */
        hlist_del_rcu(&con->node);

        /*
         * Wait until all SRCU list walks are done. Only then may the node
         * be put at the front of the list and its forward pointer be
         * rewritten.
         */
        synchronize_srcu(&console_srcu);

        con->flags |= CON_CONSDEV;
        WARN_ON(!con->device);

        /* Only the new head can have CON_CONSDEV set. */
        console_srcu_write_flags(old_head, old_head->flags & ~CON_CONSDEV);
        hlist_add_head_rcu(&con->node, &console_list);
}
EXPORT_SYMBOL(console_force_preferred_locked);

/*
 * Initialize the console device. This is called *early*, so
 * we can't necessarily depend on lots of kernel help here.
 * Just do some early initializations, and do the complex setup
 * later.
 */
void __init console_init(void)
{
        initcall_entry_t *entry;
        initcall_t fn;
        int rc;

        /* Setup the default TTY line discipline. */
        n_tty_init();

        /*
         * Run every console initcall so that later boot stages already
         * have a console to report problems on.
         */
        trace_initcall_level("console");
        for (entry = __con_initcall_start; entry < __con_initcall_end; entry++) {
                fn = initcall_from_entry(entry);
                trace_initcall_start(fn);
                rc = fn();
                trace_initcall_finish(fn, rc);
        }
}

/*
 * Late printk setup: drop boot consoles that live in init memory and
 * register the CPU hotplug callbacks.
 *
 * Some boot consoles access data in the init section, which is discarded
 * once the initcalls have run. Unregistering such consoles from a late
 * initcall guarantees that nothing touches that data afterwards.
 *
 * The real console may not be registered yet at this point (deferred probe,
 * driver built as a loadable module, ...). In that case there is a brief
 * interval in which no messages reach the console, which makes problems
 * occurring in that interval hard to diagnose.
 *
 * To keep that interval as rare as possible, only boot consoles whose memory
 * intersects with the init section are unregistered here. All other boot
 * consoles get unregistered when the real preferred console is registered.
 */
static int __init printk_late_init(void)
{
        struct hlist_node *next;
        struct console *bcon;
        bool uses_init_mem;
        int err;

        console_list_lock();
        hlist_for_each_entry_safe(bcon, next, &console_list, node) {
                /* Only boot consoles are candidates. */
                if (!(bcon->flags & CON_BOOT))
                        continue;

                /* Check addresses that might be used for enabled consoles. */
                uses_init_mem = init_section_intersects(bcon, sizeof(*bcon)) ||
                                init_section_contains(bcon->write, 0) ||
                                init_section_contains(bcon->read, 0) ||
                                init_section_contains(bcon->device, 0) ||
                                init_section_contains(bcon->unblank, 0) ||
                                init_section_contains(bcon->data, 0);
                if (!uses_init_mem)
                        continue;

                /*
                 * Please, consider moving the reported consoles out
                 * of the init section.
                 */
                pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
                        bcon->name, bcon->index);
                unregister_console_locked(bcon);
        }
        console_list_unlock();

        /* Both hotplug states use console_cpu_notify; failures only warn. */
        err = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
                                        console_cpu_notify);
        WARN_ON(err < 0);

        err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online",
                                        console_cpu_notify, NULL);
        WARN_ON(err < 0);

        printk_sysctl_init();

        return 0;
}
late_initcall(printk_late_init);

#if defined CONFIG_PRINTK
/*
 * Wait until the consoles have printed every record below the sequence
 * number sampled at entry.
 *
 * If @con is specified, only wait for that console. Otherwise wait for all.
 *
 * @timeout_ms is converted to jiffies and bounds the total time slept. When
 * @reset_on_progress is true, the remaining time is refilled whenever the
 * summed backlog differs from the one seen in the previous poll.
 *
 * Returns true if no usable console that was waited for is behind (which is
 * also the case when there is no usable console at all), false otherwise.
 */
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
{
        unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms);
        unsigned long remaining_jiffies = timeout_jiffies;
        struct console *c;
        u64 last_diff = 0;
        u64 printk_seq;
        short flags;
        int cookie;
        u64 diff;
        u64 seq;

        might_sleep();

        /* Target: consoles are considered behind while their seq is below @seq. */
        seq = prb_next_reserve_seq(prb);

        /* Flush the consoles so that records up to @seq are printed. */
        console_lock();
        console_unlock();

        for (;;) {
                unsigned long begin_jiffies;
                unsigned long slept_jiffies;

                /* Summed backlog (in records) over all consoles checked in this poll. */
                diff = 0;

                /*
                 * Hold the console_lock to guarantee safe access to
                 * console->seq. Releasing console_lock flushes more
                 * records in case @seq is still not printed on all
                 * usable consoles.
                 */
                console_lock();

                cookie = console_srcu_read_lock();
                for_each_console_srcu(c) {
                        /* When a specific console was requested, ignore all others. */
                        if (con && con != c)
                                continue;

                        flags = console_srcu_read_flags(c);

                        /*
                         * If consoles are not usable, it cannot be expected
                         * that they make forward progress, so only increment
                         * @diff for usable consoles.
                         */
                        if (!console_is_usable(c))
                                continue;

                        /* CON_NBCON consoles expose their position via nbcon_seq_read(). */
                        if (flags & CON_NBCON) {
                                printk_seq = nbcon_seq_read(c);
                        } else {
                                printk_seq = c->seq;
                        }

                        if (printk_seq < seq)
                                diff += seq - printk_seq;
                }
                console_srcu_read_unlock(cookie);

                /* A changed backlog counts as progress: restart the timeout. */
                if (diff != last_diff && reset_on_progress)
                        remaining_jiffies = timeout_jiffies;

                console_unlock();

                /* Note: @diff is 0 if there are no usable consoles. */
                if (diff == 0 || remaining_jiffies == 0)
                        break;

                /* msleep(1) might sleep much longer. Check time by jiffies. */
                begin_jiffies = jiffies;
                msleep(1);
                slept_jiffies = jiffies - begin_jiffies;

                /* Clamp so the unsigned remaining time cannot wrap below zero. */
                remaining_jiffies -= min(slept_jiffies, remaining_jiffies);

                last_diff = diff;
        }

        return (diff == 0);
}

/**
 * pr_flush() - Wait for printing threads to catch up.
 *
 * @timeout_ms:        The maximum time (in ms) to wait.
 * @reset_on_progress: Reset the timeout if forward progress is seen.
 *
 * Waits on all consoles rather than on a specific one.
 *
 * Passing 0 as @timeout_ms means no waiting will occur, passing -1
 * represents infinite waiting.
 *
 * With @reset_on_progress set, the timeout starts over whenever any
 * printer has been seen to make some forward progress.
 *
 * Context: Process context. May sleep while acquiring console lock.
 * Return: true if all usable printers are caught up.
 */
static bool pr_flush(int timeout_ms, bool reset_on_progress)
{
        /* A NULL console selects "every console" in __pr_flush(). */
        struct console *every_console = NULL;

        return __pr_flush(every_console, timeout_ms, reset_on_progress);
}

/*
 * Delayed printk version, for scheduler-internal messages:
 *
 * Work is requested by setting PRINTK_PENDING_* bits in the per-CPU
 * printk_pending mask; wake_up_klogd_work_func() later consumes them.
 */
/* Wake up tasks sleeping on log_wait. */
#define PRINTK_PENDING_WAKEUP        0x01
/* Try to take the console lock and release it again to flush the consoles. */
#define PRINTK_PENDING_OUTPUT        0x02

/* Per-CPU mask of PRINTK_PENDING_* bits not yet handled by the irq_work. */
static DEFINE_PER_CPU(int, printk_pending);

/*
 * irq_work callback: atomically fetch-and-clear this CPU's pending bits and
 * perform the console flush and/or the log_wait wake-up they request.
 */
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
        const int todo = this_cpu_xchg(printk_pending, 0);

        /* If trylock fails, someone else is doing the printing */
        if ((todo & PRINTK_PENDING_OUTPUT) && console_trylock())
                console_unlock();

        if (todo & PRINTK_PENDING_WAKEUP)
                wake_up_interruptible(&log_wait);
}

/*
 * Per-CPU irq_work that runs wake_up_klogd_work_func(); queued from
 * __wake_up_klogd(). NOTE(review): initialized with IRQ_WORK_INIT_LAZY,
 * see linux/irq_work.h for the exact semantics of lazy work.
 */
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
        IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);

/*
 * Record @val (a mask of PRINTK_PENDING_* bits) in this CPU's printk_pending
 * and queue this CPU's wake_up_klogd_work to act on it.
 *
 * Nothing happens if printk's per-CPU data is not ready yet. Otherwise the
 * work is only queued when there is a sleeper on log_wait or when @val
 * requests console output.
 */
static void __wake_up_klogd(int val)
{
        if (!printk_percpu_data_ready())
                return;

        /* Keep the this_cpu_*() accesses below on a single CPU. */
        preempt_disable();
        /*
         * Guarantee any new records can be seen by tasks preparing to wait
         * before this context checks if the wait queue is empty.
         *
         * The full memory barrier within wq_has_sleeper() pairs with the full
         * memory barrier within set_current_state() of
         * prepare_to_wait_event(), which is called after ___wait_event() adds
         * the waiter but before it has checked the wait condition.
         *
         * This pairs with devkmsg_read:A and syslog_print:A.
         */
        if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
            (val & PRINTK_PENDING_OUTPUT)) {
                this_cpu_or(printk_pending, val);
                irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
        }
        preempt_enable();
}

/**
 * wake_up_klogd - Wake kernel logging daemon
 *
 * Use this function when new records have been added to the ringbuffer
 * and the console printing of those records has already occurred or is
 * known to be handled by some other context. This function will only
 * wake the logging daemon.
 *
 * Context: Any context.
 */
void wake_up_klogd(void)
{
        /* Wake-up only: PRINTK_PENDING_OUTPUT is deliberately not requested. */
        __wake_up_klogd(PRINTK_PENDING_WAKEUP);
}

/**
 * defer_console_output - Wake kernel logging daemon and trigger
 *        console printing in a deferred context
 *
 * Use this function when new records have been added to the ringbuffer,
 * this context is responsible for console printing those records, but
 * the current context is not allowed to perform the console printing.
 * Trigger an irq_work context to perform the console printing. This
 * function also wakes the logging daemon.
 *
 * Context: Any context.
 */
void defer_console_output(void)
{
        /*
         * New messages may have been added directly to the ringbuffer
         * using vprintk_store(), so wake any waiters as well
         * (PRINTK_PENDING_WAKEUP) in addition to requesting the deferred
         * console flush (PRINTK_PENDING_OUTPUT).
         */
        __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
}

/*
 * printk_trigger_flush - request a deferred flush of the consoles
 *
 * Simply calls defer_console_output(), so the printing is performed later
 * from the irq_work and waiters on the log are woken as well.
 */
void printk_trigger_flush(void)
{
        defer_console_output();
}

/*
 * vprintk_deferred - va_list variant of the deferred printk
 * @fmt:  format string
 * @args: arguments for @fmt
 *
 * Passes the message to vprintk_emit() with LOGLEVEL_SCHED as the level and
 * returns whatever vprintk_emit() returns.
 * NOTE(review): LOGLEVEL_SCHED presumably makes vprintk_emit() defer the
 * console output -- confirm against vprintk_emit().
 */
int vprintk_deferred(const char *fmt, va_list args)
{
        return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
}

/*
 * _printk_deferred - variadic front end of vprintk_deferred()
 * @fmt: format string, followed by its arguments
 *
 * Collects the variable arguments and hands them to vprintk_deferred().
 * Returns the value returned by vprintk_deferred().
 */
int _printk_deferred(const char *fmt, ...)
{
        va_list ap;
        int printed;

        va_start(ap, fmt);
        printed = vprintk_deferred(fmt, ap);
        va_end(ap);

        return printed;
}

/*
 * printk rate limiting, lifted from the networking subsystem.
 *
 * This enforces a rate limit: not more than 10 kernel messages
 * every 5s to make a denial-of-service attack impossible.
 */
/* Shared state for __printk_ratelimit(): interval 5 * HZ, burst 10. */
DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);

/*
 * __printk_ratelimit - check the global printk rate limit
 * @func: passed through to ___ratelimit()
 *
 * Returns the result of ___ratelimit() on printk_ratelimit_state.
 */
int __printk_ratelimit(const char *func)
{
        return ___ratelimit(&printk_ratelimit_state, func);
}
EXPORT_SYMBOL(__printk_ratelimit);

/**
 * printk_timed_ratelimit - caller-controlled printk ratelimiting
 * @caller_jiffies: pointer to caller's state
 * @interval_msecs: minimum interval between prints
 *
 * The caller owns the timestamp in @caller_jiffies. A value of zero means
 * "never printed", in which case the call always succeeds.
 *
 * Return: true if more than @interval_msecs milliseconds have elapsed since
 * the last time printk_timed_ratelimit() returned true; the stored timestamp
 * is then updated to the current jiffies. Otherwise false.
 */
bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                        unsigned int interval_msecs)
{
        const unsigned long last_print = *caller_jiffies;
        const unsigned long since_last = jiffies - last_print;

        /* Still inside the interval of a previous successful call. */
        if (last_print != 0 && since_last <= msecs_to_jiffies(interval_msecs))
                return false;

        *caller_jiffies = jiffies;
        return true;
}
EXPORT_SYMBOL(printk_timed_ratelimit);

/*
 * Registered kmsg dumpers. Additions and removals are done with
 * dump_list_lock held and use the RCU list primitives; kmsg_dump() walks
 * the list under rcu_read_lock().
 */
static DEFINE_SPINLOCK(dump_list_lock);
static LIST_HEAD(dump_list);

/**
 * kmsg_dump_register - register a kernel log dumper.
 * @dumper: pointer to the kmsg_dumper structure
 *
 * Adds a kernel log dumper to the system. Its dump callback, which must be
 * set, will be called when the kernel oopses or panics.
 *
 * Return: zero on success, %-EINVAL if @dumper has no dump callback and
 * %-EBUSY if @dumper is already registered.
 */
int kmsg_dump_register(struct kmsg_dumper *dumper)
{
        unsigned long irqflags;
        int rc;

        /* The dump callback needs to be set */
        if (!dumper->dump)
                return -EINVAL;

        spin_lock_irqsave(&dump_list_lock, irqflags);
        if (dumper->registered) {
                /* Don't allow registering multiple times */
                rc = -EBUSY;
        } else {
                dumper->registered = 1;
                list_add_tail_rcu(&dumper->list, &dump_list);
                rc = 0;
        }
        spin_unlock_irqrestore(&dump_list_lock, irqflags);

        return rc;
}
EXPORT_SYMBOL_GPL(kmsg_dump_register);

/**
 * kmsg_dump_unregister - unregister a kmsg dumper.
 * @dumper: pointer to the kmsg_dumper structure
 *
 * Removes a dump device from the system. An RCU grace period is waited for
 * before returning, whether or not @dumper was registered.
 *
 * Return: zero on success, %-EINVAL if @dumper was not registered.
 */
int kmsg_dump_unregister(struct kmsg_dumper *dumper)
{
        unsigned long irqflags;
        int rc;

        spin_lock_irqsave(&dump_list_lock, irqflags);
        if (!dumper->registered) {
                rc = -EINVAL;
        } else {
                dumper->registered = 0;
                list_del_rcu(&dumper->list);
                rc = 0;
        }
        spin_unlock_irqrestore(&dump_list_lock, irqflags);

        /* Let RCU readers that may still see @dumper on the list finish. */
        synchronize_rcu();

        return rc;
}
EXPORT_SYMBOL_GPL(kmsg_dump_unregister);

/*
 * Module parameter: when set, dumpers that did not specify a max_reason
 * (KMSG_DUMP_UNDEF) are invoked for every reason up to KMSG_DUMP_MAX
 * instead of only up to KMSG_DUMP_OOPS (see kmsg_dump()).
 */
static bool always_kmsg_dump;
module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);

/*
 * kmsg_dump_reason_str - human readable name of a dump reason
 * @reason: the dump reason to describe
 *
 * Returns a constant string naming @reason, or "Unknown" for any value
 * without a dedicated name.
 */
const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason)
{
        const char *name = "Unknown";

        switch (reason) {
        case KMSG_DUMP_PANIC:
                name = "Panic";
                break;
        case KMSG_DUMP_OOPS:
                name = "Oops";
                break;
        case KMSG_DUMP_EMERG:
                name = "Emergency";
                break;
        case KMSG_DUMP_SHUTDOWN:
                name = "Shutdown";
                break;
        default:
                break;
        }

        return name;
}
EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);

/**
 * kmsg_dump - dump kernel log to kernel message dumpers.
 * @reason: the reason (oops, panic etc) for dumping
 *
 * Walk the registered dumpers and call the dump() callback of each one
 * whose max_reason covers @reason. The callback can retrieve the kmsg
 * records with kmsg_dump_get_line() or kmsg_dump_get_buffer().
 */
void kmsg_dump(enum kmsg_dump_reason reason)
{
        struct kmsg_dumper *dumper;

        rcu_read_lock();
        list_for_each_entry_rcu(dumper, &dump_list, list) {
                enum kmsg_dump_reason limit = dumper->max_reason;

                /*
                 * If client has not provided a specific max_reason, default
                 * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set.
                 */
                if (limit == KMSG_DUMP_UNDEF) {
                        if (always_kmsg_dump)
                                limit = KMSG_DUMP_MAX;
                        else
                                limit = KMSG_DUMP_OOPS;
                }

                /* invoke dumper which will iterate over records */
                if (reason <= limit)
                        dumper->dump(dumper, reason);
        }
        rcu_read_unlock();
}

/**
 * kmsg_dump_get_line - retrieve one kmsg log line
 * @iter: kmsg dump iterator
 * @syslog: include the "<4>" prefixes
 * @line: buffer to copy the line to
 * @size: maximum size of the buffer
 * @len: length of line placed into buffer
 *
 * Start at the beginning of the kmsg buffer, with the oldest kmsg
 * record, and copy one record into the provided buffer. When @line is
 * NULL nothing is copied and only the size of the formatted record is
 * reported through @len.
 *
 * Consecutive calls will return the next available record moving
 * towards the end of the buffer with the youngest messages.
 *
 * A return value of FALSE indicates that there are no more records to
 * read; @len, if provided, is then set to 0.
 */
bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
                        char *line, size_t size, size_t *len)
{
        u64 oldest_seq = latched_seq_read_nolock(&clear_seq);
        struct printk_info info;
        struct printk_record r;
        unsigned int line_count;
        size_t text_len = 0;
        bool have_record;

        /* Never hand out records older than the clear mark. */
        if (iter->cur_seq < oldest_seq)
                iter->cur_seq = oldest_seq;

        prb_rec_init_rd(&r, &info, line, size);

        /* Read text or count text lines? */
        if (line)
                have_record = prb_read_valid(prb, iter->cur_seq, &r);
        else
                have_record = prb_read_valid_info(prb, iter->cur_seq,
                                                  &info, &line_count);

        if (have_record) {
                if (line)
                        text_len = record_print_text(&r, syslog, printk_time);
                else
                        text_len = get_record_print_text_size(&info, line_count,
                                                              syslog, printk_time);

                /* Continue with the following record on the next call. */
                iter->cur_seq = r.info->seq + 1;
        }

        if (len)
                *len = text_len;

        return have_record;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_line);

/**
 * kmsg_dump_get_buffer - copy kmsg log lines
 * @iter: kmsg dump iterator
 * @syslog: include the "<4>" prefixes
 * @buf: buffer to copy the line to
 * @size: maximum size of the buffer
 * @len_out: length of line placed into buffer
 *
 * Start at the end of the kmsg buffer and fill the provided buffer
 * with as many of the *youngest* kmsg records that fit into it.
 * If the buffer is large enough, all available kmsg records will be
 * copied with a single call.
 *
 * Consecutive calls will fill the buffer with the next block of
 * available older records, not including the earlier retrieved ones.
 *
 * A return value of FALSE indicates that there are no more records to
 * read.
 */
bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
                          char *buf, size_t size, size_t *len_out)
{
        u64 min_seq = latched_seq_read_nolock(&clear_seq);
        struct printk_info info;
        struct printk_record r;
        u64 seq;
        u64 next_seq;
        size_t len = 0;
        bool ret = false;
        /* Sample printk_time once so sizing and formatting below agree. */
        bool time = printk_time;

        /* Without a usable buffer nothing can be copied: report length 0. */
        if (!buf || !size)
                goto out;

        /* Never hand out records older than the clear mark. */
        if (iter->cur_seq < min_seq)
                iter->cur_seq = min_seq;

        if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
                if (info.seq != iter->cur_seq) {
                        /* messages are gone, move to first available one */
                        iter->cur_seq = info.seq;
                }
        }

        /* last entry */
        if (iter->cur_seq >= iter->next_seq)
                goto out;

        /*
         * Find first record that fits, including all following records,
         * into the user-provided buffer for this dump. Pass in size-1
         * because this function (by way of record_print_text()) will
         * not write more than size-1 bytes of text into @buf.
         */
        seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq,
                                     size - 1, syslog, time);

        /*
         * Next kmsg_dump_get_buffer() invocation will dump block of
         * older records stored right before this one.
         */
        next_seq = seq;

        prb_rec_init_rd(&r, &info, buf, size);

        /* Format the records from @seq up to (excluding) iter->next_seq. */
        prb_for_each_record(seq, prb, seq, &r) {
                if (r.info->seq >= iter->next_seq)
                        break;

                len += record_print_text(&r, syslog, time);

                /* Adjust record to store to remaining buffer space. */
                prb_rec_init_rd(&r, &info, buf + len, size - len);
        }

        /* The block just copied becomes the upper bound of the next call. */
        iter->next_seq = next_seq;
        ret = true;
out:
        if (len_out)
                *len_out = len;
        return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);

/**
 * kmsg_dump_rewind - reset the iterator
 * @iter: kmsg dump iterator
 *
 * Reset the dumper's iterator so that kmsg_dump_get_line() and
 * kmsg_dump_get_buffer() can be called again and used multiple
 * times within the same dumper.dump() callback.
 */
void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
{
        /* Lower bound: the clear mark. */
        iter->cur_seq = latched_seq_read_nolock(&clear_seq);
        /* Upper bound: the ringbuffer's current next sequence number. */
        iter->next_seq = prb_next_seq(prb);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);

/**
 * console_replay_all - replay kernel log on consoles
 *
 * Try to take the console lock and, if that succeeds, rewind all consoles
 * so that the available records in the printk buffer are printed again.
 * Returns without doing anything when the lock cannot be obtained.
 *
 * Context: Any context.
 */
void console_replay_all(void)
{
        if (!console_trylock())
                return;

        __console_rewind_all();

        /* Consoles are flushed as part of console_unlock(). */
        console_unlock();
}
#endif

#ifdef CONFIG_SMP
/* CPU id of the current printk cpu-sync lock owner, or -1 if unowned. */
static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1);
/* Number of nested (re-entrant) acquisitions made by the owning CPU. */
static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0);

/**
 * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant
 *                            spinning lock is not owned by any CPU.
 *
 * Relaxes the CPU at least once before the owner is first checked.
 *
 * Context: Any context.
 */
void __printk_cpu_sync_wait(void)
{
        for (;;) {
                cpu_relax();

                /* An owner of -1 means the lock is free. */
                if (atomic_read(&printk_cpu_sync_owner) == -1)
                        break;
        }
}
EXPORT_SYMBOL(__printk_cpu_sync_wait);

/**
 * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant
 *                               spinning lock.
 *
 * If no processor has the lock, the calling processor takes the lock and
 * becomes the owner. If the calling processor is already the owner of the
 * lock, this function succeeds immediately.
 *
 * Context: Any context. Expects interrupts to be disabled.
 * Return: 1 on success, otherwise 0.
 */
int __printk_cpu_sync_try_get(void)
{
        int cpu;
        int old;

        cpu = smp_processor_id();

        /*
         * Guarantee loads and stores from this CPU when it is the lock owner
         * are _not_ visible to the previous lock owner. This pairs with
         * __printk_cpu_sync_put:B.
         *
         * Memory barrier involvement:
         *
         * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
         * then __printk_cpu_sync_put:A can never read from
         * __printk_cpu_sync_try_get:B.
         *
         * Relies on:
         *
         * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
         * of the previous CPU
         *    matching
         * ACQUIRE from __printk_cpu_sync_try_get:A to
         * __printk_cpu_sync_try_get:B of this CPU
         */
        /* Install this CPU as owner only if the lock is currently free (-1). */
        old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1,
                                     cpu); /* LMM(__printk_cpu_sync_try_get:A) */
        if (old == -1) {
                /*
                 * This CPU is now the owner and begins loading/storing
                 * data: LMM(__printk_cpu_sync_try_get:B)
                 */
                return 1;

        } else if (old == cpu) {
                /*
                 * This CPU is already the owner. Count the nested
                 * acquisition so that the matching put only drops
                 * one nesting level instead of releasing the lock.
                 */
                atomic_inc(&printk_cpu_sync_nested);
                return 1;
        }

        /* Another CPU owns the lock. */
        return 0;
}
EXPORT_SYMBOL(__printk_cpu_sync_try_get);

/**
 * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock.
 *
 * The calling processor must be the owner of the lock.
 *
 * Context: Any context. Expects interrupts to be disabled.
 */
void __printk_cpu_sync_put(void)
{
        /* Nested acquisition: drop one nesting level and keep ownership. */
        if (atomic_read(&printk_cpu_sync_nested)) {
                atomic_dec(&printk_cpu_sync_nested);
                return;
        }

        /*
         * This CPU is finished loading/storing data:
         * LMM(__printk_cpu_sync_put:A)
         */

        /*
         * Guarantee loads and stores from this CPU when it was the
         * lock owner are visible to the next lock owner. This pairs
         * with __printk_cpu_sync_try_get:A.
         *
         * Memory barrier involvement:
         *
         * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
         * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A.
         *
         * Relies on:
         *
         * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
         * of this CPU
         *    matching
         * ACQUIRE from __printk_cpu_sync_try_get:A to
         * __printk_cpu_sync_try_get:B of the next CPU
         */
        /* Outermost release: mark the lock as unowned (-1). */
        atomic_set_release(&printk_cpu_sync_owner,
                           -1); /* LMM(__printk_cpu_sync_put:B) */
}
EXPORT_SYMBOL(__printk_cpu_sync_put);
#endif /* CONFIG_SMP */







































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Because linux/module.h has tracepoints in the header, and ftrace.h
 * used to include this file, define_trace.h includes linux/module.h.
 * But we do not want module.h to override the TRACE_SYSTEM macro
 * variable that define_trace.h is processing, so we only set it
 * when module events are being processed, which happens when
 * CREATE_TRACE_POINTS is defined.
 */
#ifdef CREATE_TRACE_POINTS
#undef TRACE_SYSTEM
#define TRACE_SYSTEM module
#endif

#if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MODULE_H

#include <linux/tracepoint.h>

#ifdef CONFIG_MODULES

struct module;

/*
 * Render a module taint bitmask as single-letter flags via __print_flags():
 * P (proprietary), O (out-of-tree), F (forced), C (crap), E (unsigned).
 * The "" argument is presumably the delimiter placed between the letters.
 */
#define show_module_flags(flags) __print_flags(flags, "",        \
        { (1UL << TAINT_PROPRIETARY_MODULE),        "P" },                \
        { (1UL << TAINT_OOT_MODULE),                "O" },                \
        { (1UL << TAINT_FORCED_MODULE),                "F" },                \
        { (1UL << TAINT_CRAP),                        "C" },                \
        { (1UL << TAINT_UNSIGNED_MODULE),        "E" })

/*
 * module_load: records the name and the taint mask of @mod.
 * Printed as "<name> <taint letters>" (see show_module_flags()).
 */
TRACE_EVENT(module_load,

        TP_PROTO(struct module *mod),

        TP_ARGS(mod),

        TP_STRUCT__entry(
                __field(        unsigned int,        taints                )
                __string(        name,                mod->name        )
        ),

        TP_fast_assign(
                __entry->taints = mod->taints;
                __assign_str(name);
        ),

        TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints))
);

/* module_free: records only the name of @mod and prints it as "<name>". */
TRACE_EVENT(module_free,

        TP_PROTO(struct module *mod),

        TP_ARGS(mod),

        TP_STRUCT__entry(
                __string(        name,                mod->name        )
        ),

        TP_fast_assign(
                __assign_str(name);
        ),

        TP_printk("%s", __get_str(name))
);

#ifdef CONFIG_MODULE_UNLOAD
/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */

/*
 * module_refcnt: event class shared by module_get and module_put.
 * Records the module name, the caller-supplied @ip (printed as call_site
 * with %ps) and the value of mod->refcnt read when the event fires.
 */
DECLARE_EVENT_CLASS(module_refcnt,

        TP_PROTO(struct module *mod, unsigned long ip),

        TP_ARGS(mod, ip),

        TP_STRUCT__entry(
                __field(        unsigned long,        ip                )
                __field(        int,                refcnt                )
                __string(        name,                mod->name        )
        ),

        TP_fast_assign(
                __entry->ip        = ip;
                __entry->refcnt        = atomic_read(&mod->refcnt);
                __assign_str(name);
        ),

        TP_printk("%s call_site=%ps refcnt=%d",
                  __get_str(name), (void *)__entry->ip, __entry->refcnt)
);

/* module_get: instance of the module_refcnt class (same fields and format). */
DEFINE_EVENT(module_refcnt, module_get,

        TP_PROTO(struct module *mod, unsigned long ip),

        TP_ARGS(mod, ip)
);

/* module_put: instance of the module_refcnt class (same fields and format). */
DEFINE_EVENT(module_refcnt, module_put,

        TP_PROTO(struct module *mod, unsigned long ip),

        TP_ARGS(mod, ip)
);
#endif /* CONFIG_MODULE_UNLOAD */

/*
 * module_request: records the requested module @name, the @wait flag and
 * the caller-supplied @ip. Printed as "<name> wait=<0|1> call_site=<symbol>".
 */
TRACE_EVENT(module_request,

        TP_PROTO(char *name, bool wait, unsigned long ip),

        TP_ARGS(name, wait, ip),

        TP_STRUCT__entry(
                __field(        unsigned long,        ip                )
                __field(        bool,                wait                )
                __string(        name,                name                )
        ),

        TP_fast_assign(
                __entry->ip        = ip;
                __entry->wait        = wait;
                __assign_str(name);
        ),

        TP_printk("%s wait=%d call_site=%ps",
                  __get_str(name), (int)__entry->wait, (void *)__entry->ip)
);

#endif /* CONFIG_MODULES */

#endif /* _TRACE_MODULE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>










































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
// SPDX-License-Identifier: GPL-2.0-only
/*
 * vhost transport for vsock
 *
 * Copyright (C) 2013-2015 Red Hat, Inc.
 * Author: Asias He <asias@redhat.com>
 *         Stefan Hajnoczi <stefanha@redhat.com>
 */
#include <linux/miscdevice.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <net/sock.h>
#include <linux/virtio_vsock.h>
#include <linux/vhost.h>
#include <linux/hashtable.h>

#include <net/af_vsock.h>
#include "vhost.h"

/* CID reported for the host side by vhost_transport_get_local_cid(). */
#define VHOST_VSOCK_DEFAULT_HOST_CID        2
/*
 * Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others.
 */
#define VHOST_VSOCK_WEIGHT 0x80000
/*
 * Max number of packets transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others with
 * small pkts.
 */
#define VHOST_VSOCK_PKT_WEIGHT 256

/* Feature bits reported by VHOST_GET_FEATURES; VHOST_SET_FEATURES rejects
 * any bit outside this mask with -EOPNOTSUPP.
 */
enum {
        VHOST_VSOCK_FEATURES = VHOST_FEATURES |
                               (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
                               (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
};

/* Backend feature bits reported by VHOST_GET_BACKEND_FEATURES;
 * VHOST_SET_BACKEND_FEATURES rejects any bit outside this mask.
 */
enum {
        VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
};

/* Used to track all the vhost_vsock instances on the system.
 * vhost_vsock_hash is keyed by guest CID; additions and removals are done
 * with vhost_vsock_mutex held, lookups go through vhost_vsock_get().
 */
static DEFINE_MUTEX(vhost_vsock_mutex);
static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8);

/* Per-open-file state of one vhost-vsock device. */
struct vhost_vsock {
        struct vhost_dev dev;
        /* Indexed by VSOCK_VQ_TX and VSOCK_VQ_RX */
        struct vhost_virtqueue vqs[2];

        /* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */
        struct hlist_node hash;

        /* Work item running vhost_transport_send_pkt_work() */
        struct vhost_work send_pkt_work;
        struct sk_buff_head send_pkt_queue; /* host->guest pending packets */

        /* Incremented for each reply skb queued on send_pkt_queue and
         * decremented once such an skb is fully sent or purged; compared
         * against the tx virtqueue size to throttle tx processing.
         */
        atomic_t queued_replies;

        /* Guest CID; 0 means no CID has been assigned yet */
        u32 guest_cid;
        /* Set when VIRTIO_VSOCK_F_SEQPACKET is acked via VHOST_SET_FEATURES */
        bool seqpacket_allow;
};

/* Transport callback: the local CID is always VHOST_VSOCK_DEFAULT_HOST_CID. */
static u32 vhost_transport_get_local_cid(void)
{
        return VHOST_VSOCK_DEFAULT_HOST_CID;
}

/* Look up the vhost_vsock instance registered under @guest_cid.
 *
 * Returns the matching instance, or NULL if none is found.  Callers that
 * dereference the return value must hold vhost_vsock_mutex or the RCU read
 * lock.
 */
static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
{
        struct vhost_vsock *entry;

        hash_for_each_possible_rcu(vhost_vsock_hash, entry, hash, guest_cid) {
                u32 cid = entry->guest_cid;

                /* A CID of 0 marks an instance that has no CID yet */
                if (cid != 0 && cid == guest_cid)
                        return entry;
        }

        return NULL;
}

/* Drain vsock->send_pkt_queue into buffers taken from @vq.
 *
 * For each queued skb a descriptor chain is fetched from @vq, the packet
 * header plus as much payload as fits are copied into it, and the chain is
 * added to the used ring.  An skb whose payload does not fit entirely is put
 * back at the head of the queue with its offset advanced, so the remainder
 * goes out with the next buffer.  The loop ends when the queue is empty, no
 * buffer is available, an error occurs, or vhost_exceeds_weight() reports
 * that the packet/byte limits were exceeded.
 *
 * vq->mutex is held for the whole loop.  When completing a reply skb takes
 * queued_replies from tx_vq->num down to tx_vq->num - 1, the tx virtqueue
 * poll is re-queued after the mutex has been dropped.
 */
static void
vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
                            struct vhost_virtqueue *vq)
{
        struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
        int pkts = 0, total_len = 0;
        bool added = false;
        bool restart_tx = false;

        mutex_lock(&vq->mutex);

        /* Nothing to do while the virtqueue has no backend attached */
        if (!vhost_vq_get_backend(vq))
                goto out;

        if (!vq_meta_prefetch(vq))
                goto out;

        /* Avoid further vmexits, we're already processing the virtqueue */
        vhost_disable_notify(&vsock->dev, vq);

        do {
                struct virtio_vsock_hdr *hdr;
                size_t iov_len, payload_len;
                struct iov_iter iov_iter;
                u32 flags_to_restore = 0;
                struct sk_buff *skb;
                unsigned out, in;
                size_t nbytes;
                u32 offset;
                int head;

                skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue);

                /* Queue drained: re-enable notifications and stop */
                if (!skb) {
                        vhost_enable_notify(&vsock->dev, vq);
                        break;
                }

                head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
                                         &out, &in, NULL, NULL);
                /* Failed to fetch a descriptor: keep the skb queued */
                if (head < 0) {
                        virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
                        break;
                }

                /* head == vq->num: no buffer available right now */
                if (head == vq->num) {
                        virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
                        /* We cannot finish yet if more buffers snuck in while
                         * re-enabling notify.
                         */
                        if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
                                vhost_disable_notify(&vsock->dev, vq);
                                continue;
                        }
                        break;
                }

                /* The chain must consist of input descriptors only */
                if (out) {
                        kfree_skb(skb);
                        vq_err(vq, "Expected 0 output buffers, got %u\n", out);
                        break;
                }

                /* The buffer must at least hold the packet header */
                iov_len = iov_length(&vq->iov[out], in);
                if (iov_len < sizeof(*hdr)) {
                        kfree_skb(skb);
                        vq_err(vq, "Buffer len [%zu] too small\n", iov_len);
                        break;
                }

                iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len);
                /* offset is how much of this skb was sent by earlier passes */
                offset = VIRTIO_VSOCK_SKB_CB(skb)->offset;
                payload_len = skb->len - offset;
                hdr = virtio_vsock_hdr(skb);

                /* If the packet is greater than the space available in the
                 * buffer, we split it using multiple buffers.
                 */
                if (payload_len > iov_len - sizeof(*hdr)) {
                        payload_len = iov_len - sizeof(*hdr);

                        /* As we are copying pieces of large packet's buffer to
                         * small rx buffers, headers of packets in rx queue are
                         * created dynamically and are initialized with header
                         * of current packet(except length). But in case of
                         * SOCK_SEQPACKET, we also must clear message delimiter
                         * bit (VIRTIO_VSOCK_SEQ_EOM) and MSG_EOR bit
                         * (VIRTIO_VSOCK_SEQ_EOR) if set. Otherwise,
                         * there will be sequence of packets with these
                         * bits set. After initialized header will be copied to
                         * rx buffer, these required bits will be restored.
                         */
                        if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
                                hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
                                flags_to_restore |= VIRTIO_VSOCK_SEQ_EOM;

                                if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) {
                                        hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
                                        flags_to_restore |= VIRTIO_VSOCK_SEQ_EOR;
                                }
                        }
                }

                /* Set the correct length in the header */
                hdr->len = cpu_to_le32(payload_len);

                nbytes = copy_to_iter(hdr, sizeof(*hdr), &iov_iter);
                if (nbytes != sizeof(*hdr)) {
                        kfree_skb(skb);
                        vq_err(vq, "Faulted on copying pkt hdr\n");
                        break;
                }

                if (skb_copy_datagram_iter(skb,
                                           offset,
                                           &iov_iter,
                                           payload_len)) {
                        kfree_skb(skb);
                        vq_err(vq, "Faulted on copying pkt buf\n");
                        break;
                }

                /* Deliver to monitoring devices all packets that we
                 * will transmit.
                 */
                virtio_transport_deliver_tap_pkt(skb);

                vhost_add_used(vq, head, sizeof(*hdr) + payload_len);
                added = true;

                VIRTIO_VSOCK_SKB_CB(skb)->offset += payload_len;
                total_len += payload_len;

                /* If we didn't send all the payload we can requeue the packet
                 * to send it with the next available buffer.
                 */
                if (VIRTIO_VSOCK_SKB_CB(skb)->offset < skb->len) {
                        hdr->flags |= cpu_to_le32(flags_to_restore);

                        /* We are queueing the same skb to handle
                         * the remaining bytes, and we want to deliver it
                         * to monitoring devices in the next iteration.
                         */
                        virtio_vsock_skb_clear_tap_delivered(skb);
                        virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
                } else {
                        if (virtio_vsock_skb_reply(skb)) {
                                int val;

                                val = atomic_dec_return(&vsock->queued_replies);

                                /* Do we have resources to resume tx
                                 * processing?
                                 */
                                if (val + 1 == tx_vq->num)
                                        restart_tx = true;
                        }

                        consume_skb(skb);
                }
        } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
        /* Signal the guest once if at least one buffer was used */
        if (added)
                vhost_signal(&vsock->dev, vq);

out:
        mutex_unlock(&vq->mutex);

        if (restart_tx)
                vhost_poll_queue(&tx_vq->poll);
}

/* Work callback for send_pkt_work: push queued packets into the
 * VSOCK_VQ_RX virtqueue of the owning device.
 */
static void vhost_transport_send_pkt_work(struct vhost_work *work)
{
        struct vhost_vsock *vsock = container_of(work, struct vhost_vsock,
                                                 send_pkt_work);

        vhost_transport_do_send_pkt(vsock, &vsock->vqs[VSOCK_VQ_RX]);
}

/* Queue @skb for the device whose guest CID matches the packet's dst_cid
 * and schedule the send worker.
 *
 * Returns the skb length on success.  If no device owns that CID the skb is
 * freed and -ENODEV is returned.
 */
static int
vhost_transport_send_pkt(struct sk_buff *skb)
{
        struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
        /* Read the length up front, before the skb is handed to the queue */
        int queued_len = skb->len;
        struct vhost_vsock *target;

        rcu_read_lock();

        /* Find the vhost_vsock according to guest context id  */
        target = vhost_vsock_get(le64_to_cpu(hdr->dst_cid));
        if (!target) {
                rcu_read_unlock();
                kfree_skb(skb);
                return -ENODEV;
        }

        /* Replies are counted so tx processing can be throttled */
        if (virtio_vsock_skb_reply(skb))
                atomic_inc(&target->queued_replies);

        virtio_vsock_skb_queue_tail(&target->send_pkt_queue, skb);
        vhost_vq_work_queue(&target->vqs[VSOCK_VQ_RX], &target->send_pkt_work);

        rcu_read_unlock();

        return queued_len;
}

/* Transport callback: purge the packets of @vsk from the send queue of the
 * device addressed by vsk->remote_addr.svm_cid, using
 * virtio_transport_purge_skbs().
 *
 * The count returned by the purge is subtracted from queued_replies; if that
 * takes the counter from at or above the tx virtqueue size to below it, tx
 * processing is re-queued.
 *
 * Returns 0 on success or -ENODEV if no device owns that CID.
 */
static int
vhost_transport_cancel_pkt(struct vsock_sock *vsk)
{
        struct vhost_vsock *vsock;
        int ret = -ENODEV;

        rcu_read_lock();

        /* Find the vhost_vsock according to guest context id  */
        vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
        if (vsock) {
                int purged = virtio_transport_purge_skbs(vsk,
                                                         &vsock->send_pkt_queue);

                if (purged) {
                        struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
                        int remaining;

                        remaining = atomic_sub_return(purged,
                                                      &vsock->queued_replies);
                        if (remaining + purged >= tx_vq->num &&
                            remaining < tx_vq->num)
                                vhost_poll_queue(&tx_vq->poll);
                }

                ret = 0;
        }

        rcu_read_unlock();
        return ret;
}

/* Build an skb from the descriptor chain currently described by vq->iov.
 *
 * @vq:  virtqueue whose iov array holds the chain
 * @out: number of output descriptors in the chain
 * @in:  number of input descriptors in the chain; must be 0
 *
 * The chain is expected to hold a struct virtio_vsock_hdr followed by
 * hdr->len bytes of payload.  Returns the new skb, or NULL if the chain is
 * malformed, too small, too large, or the allocation or a copy fails.
 */
static struct sk_buff *
vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
                      unsigned int out, unsigned int in)
{
        struct virtio_vsock_hdr *hdr;
        struct iov_iter iov_iter;
        struct sk_buff *skb;
        size_t payload_len;
        size_t nbytes;
        size_t len;

        if (in != 0) {
                vq_err(vq, "Expected 0 input buffers, got %u\n", in);
                return NULL;
        }

        len = iov_length(vq->iov, out);

        /* len comes from guest-provided descriptors.  Bound it before
         * allocating so the guest cannot request an arbitrarily large skb:
         * a valid chain holds one header plus at most
         * VIRTIO_VSOCK_MAX_PKT_BUF_SIZE bytes of payload.
         */
        if (len < sizeof(*hdr) ||
            len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + sizeof(*hdr))
                return NULL;

        /* len contains both payload and hdr */
        skb = virtio_vsock_alloc_skb(len, GFP_KERNEL);
        if (!skb)
                return NULL;

        iov_iter_init(&iov_iter, ITER_SOURCE, vq->iov, out, len);

        hdr = virtio_vsock_hdr(skb);
        nbytes = copy_from_iter(hdr, sizeof(*hdr), &iov_iter);
        if (nbytes != sizeof(*hdr)) {
                vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n",
                       sizeof(*hdr), nbytes);
                kfree_skb(skb);
                return NULL;
        }

        payload_len = le32_to_cpu(hdr->len);

        /* No payload */
        if (!payload_len)
                return skb;

        /* The pkt is too big or the length in the header is invalid */
        if (payload_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
            payload_len + sizeof(*hdr) > len) {
                kfree_skb(skb);
                return NULL;
        }

        virtio_vsock_skb_rx_put(skb);

        nbytes = copy_from_iter(skb->data, payload_len, &iov_iter);
        if (nbytes != payload_len) {
                vq_err(vq, "Expected %zu byte payload, got %zu bytes\n",
                       payload_len, nbytes);
                kfree_skb(skb);
                return NULL;
        }

        return skb;
}

/* Is there space left for replies to rx packets?
 *
 * True while queued_replies is below the size of the tx virtqueue.
 */
static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
{
        struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];

        smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */

        return atomic_read(&vsock->queued_replies) < tx_vq->num;
}

/* Transport callback (.msgzerocopy_allow): always reports true. */
static bool vhost_transport_msgzerocopy_allow(void)
{
        return true;
}

/* Forward declaration: the definition follows the ops table that uses it. */
static bool vhost_transport_seqpacket_allow(u32 remote_cid);

/* Transport operations registered with the vsock core in vhost_vsock_init().
 * get_local_cid, cancel_pkt, seqpacket_allow, msgzerocopy_allow and send_pkt
 * are implemented in this file; every other callback is a virtio_transport_*
 * helper.
 */
static struct virtio_transport vhost_transport = {
        .transport = {
                .module                   = THIS_MODULE,

                .get_local_cid            = vhost_transport_get_local_cid,

                .init                     = virtio_transport_do_socket_init,
                .destruct                 = virtio_transport_destruct,
                .release                  = virtio_transport_release,
                .connect                  = virtio_transport_connect,
                .shutdown                 = virtio_transport_shutdown,
                .cancel_pkt               = vhost_transport_cancel_pkt,

                .dgram_enqueue            = virtio_transport_dgram_enqueue,
                .dgram_dequeue            = virtio_transport_dgram_dequeue,
                .dgram_bind               = virtio_transport_dgram_bind,
                .dgram_allow              = virtio_transport_dgram_allow,

                .stream_enqueue           = virtio_transport_stream_enqueue,
                .stream_dequeue           = virtio_transport_stream_dequeue,
                .stream_has_data          = virtio_transport_stream_has_data,
                .stream_has_space         = virtio_transport_stream_has_space,
                .stream_rcvhiwat          = virtio_transport_stream_rcvhiwat,
                .stream_is_active         = virtio_transport_stream_is_active,
                .stream_allow             = virtio_transport_stream_allow,

                .seqpacket_dequeue        = virtio_transport_seqpacket_dequeue,
                .seqpacket_enqueue        = virtio_transport_seqpacket_enqueue,
                .seqpacket_allow          = vhost_transport_seqpacket_allow,
                .seqpacket_has_data       = virtio_transport_seqpacket_has_data,

                .msgzerocopy_allow        = vhost_transport_msgzerocopy_allow,

                .notify_poll_in           = virtio_transport_notify_poll_in,
                .notify_poll_out          = virtio_transport_notify_poll_out,
                .notify_recv_init         = virtio_transport_notify_recv_init,
                .notify_recv_pre_block    = virtio_transport_notify_recv_pre_block,
                .notify_recv_pre_dequeue  = virtio_transport_notify_recv_pre_dequeue,
                .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
                .notify_send_init         = virtio_transport_notify_send_init,
                .notify_send_pre_block    = virtio_transport_notify_send_pre_block,
                .notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
                .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
                .notify_buffer_size       = virtio_transport_notify_buffer_size,
                .notify_set_rcvlowat      = virtio_transport_notify_set_rcvlowat,

                .read_skb = virtio_transport_read_skb,
        },

        .send_pkt = vhost_transport_send_pkt,
};

/* Transport callback: report the seqpacket_allow flag of the device that
 * owns @remote_cid.  Returns false when no device owns that CID.
 */
static bool vhost_transport_seqpacket_allow(u32 remote_cid)
{
        struct vhost_vsock *peer;
        bool allowed;

        rcu_read_lock();
        peer = vhost_vsock_get(remote_cid);
        allowed = peer ? peer->seqpacket_allow : false;
        rcu_read_unlock();

        return allowed;
}

/* Kick handler installed on the VSOCK_VQ_TX virtqueue.
 *
 * Each descriptor chain is turned into an skb by vhost_vsock_alloc_skb(),
 * delivered to tap devices, and passed to virtio_transport_recv_pkt() when
 * its src_cid equals this device's guest CID and its dst_cid equals the
 * local CID; otherwise the skb is freed.  Processing stops, leaving
 * notifications disabled, as soon as vhost_vsock_more_replies() reports
 * that no room for replies is left.
 */
static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
{
        struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
                                                  poll.work);
        struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
                                                 dev);
        int head, pkts = 0, total_len = 0;
        unsigned int out, in;
        struct sk_buff *skb;
        bool added = false;

        mutex_lock(&vq->mutex);

        /* Nothing to do while the virtqueue has no backend attached */
        if (!vhost_vq_get_backend(vq))
                goto out;

        if (!vq_meta_prefetch(vq))
                goto out;

        vhost_disable_notify(&vsock->dev, vq);
        do {
                struct virtio_vsock_hdr *hdr;

                if (!vhost_vsock_more_replies(vsock)) {
                        /* Stop tx until the device processes already
                         * pending replies.  Leave tx virtqueue
                         * callbacks disabled.
                         */
                        goto no_more_replies;
                }

                head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
                                         &out, &in, NULL, NULL);
                if (head < 0)
                        break;

                /* head == vq->num: no buffer available; retry only if more
                 * showed up while notifications were being re-enabled.
                 */
                if (head == vq->num) {
                        if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
                                vhost_disable_notify(&vsock->dev, vq);
                                continue;
                        }
                        break;
                }

                skb = vhost_vsock_alloc_skb(vq, out, in);
                if (!skb) {
                        /* NOTE(review): this path skips vhost_add_used() for
                         * the fetched descriptor - confirm that is intended.
                         */
                        vq_err(vq, "Faulted on pkt\n");
                        continue;
                }

                total_len += sizeof(*hdr) + skb->len;

                /* Deliver to monitoring devices all received packets */
                virtio_transport_deliver_tap_pkt(skb);

                hdr = virtio_vsock_hdr(skb);

                /* Only accept correctly addressed packets */
                if (le64_to_cpu(hdr->src_cid) == vsock->guest_cid &&
                    le64_to_cpu(hdr->dst_cid) ==
                    vhost_transport_get_local_cid())
                        virtio_transport_recv_pkt(&vhost_transport, skb);
                else
                        kfree_skb(skb);

                vhost_add_used(vq, head, 0);
                added = true;
        } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));

no_more_replies:
        /* Signal the guest once if at least one buffer was used */
        if (added)
                vhost_signal(&vsock->dev, vq);

out:
        mutex_unlock(&vq->mutex);
}

/* Kick handler installed on the VSOCK_VQ_RX virtqueue: try to send the
 * packets pending on the owning device's send queue.
 */
static void vhost_vsock_handle_rx_kick(struct vhost_work *work)
{
        struct vhost_virtqueue *rx_vq;
        struct vhost_vsock *owner;

        rx_vq = container_of(work, struct vhost_virtqueue, poll.work);
        owner = container_of(rx_vq->dev, struct vhost_vsock, dev);

        vhost_transport_do_send_pkt(owner, rx_vq);
}

/* Start the device (VHOST_VSOCK_SET_RUNNING with a non-zero argument).
 *
 * Attaches @vsock as backend of every virtqueue that has none yet, then
 * queues the send worker.  Returns 0 on success, the error from
 * vhost_dev_check_owner() or vhost_vq_init_access(), or -EFAULT if a
 * virtqueue fails vhost_vq_access_ok().  If a virtqueue fails, the backend
 * of every virtqueue is reset to NULL before returning.
 */
static int vhost_vsock_start(struct vhost_vsock *vsock)
{
        struct vhost_virtqueue *vq;
        size_t i;
        int ret;

        mutex_lock(&vsock->dev.mutex);

        ret = vhost_dev_check_owner(&vsock->dev);
        if (ret)
                goto err;

        for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
                vq = &vsock->vqs[i];

                mutex_lock(&vq->mutex);

                if (!vhost_vq_access_ok(vq)) {
                        ret = -EFAULT;
                        goto err_vq;
                }

                /* Virtqueues that already have a backend are left alone */
                if (!vhost_vq_get_backend(vq)) {
                        vhost_vq_set_backend(vq, vsock);
                        ret = vhost_vq_init_access(vq);
                        if (ret)
                                goto err_vq;
                }

                mutex_unlock(&vq->mutex);
        }

        /* Some packets may have been queued before the device was started,
         * let's kick the send worker to send them.
         */
        vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);

        mutex_unlock(&vsock->dev.mutex);
        return 0;

err_vq:
        /* Entered with the failing virtqueue's mutex still held */
        vhost_vq_set_backend(vq, NULL);
        mutex_unlock(&vq->mutex);

        /* Detach the backend from every virtqueue */
        for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
                vq = &vsock->vqs[i];

                mutex_lock(&vq->mutex);
                vhost_vq_set_backend(vq, NULL);
                mutex_unlock(&vq->mutex);
        }
err:
        mutex_unlock(&vsock->dev.mutex);
        return ret;
}

/* Stop the device by resetting the backend of every virtqueue to NULL.
 *
 * When @check_owner is true the caller must pass vhost_dev_check_owner();
 * its error is returned and nothing is changed if the check fails.
 * Returns 0 otherwise.
 */
static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
{
        struct vhost_virtqueue *vq;
        int ret = 0;
        size_t n;

        mutex_lock(&vsock->dev.mutex);

        if (check_owner)
                ret = vhost_dev_check_owner(&vsock->dev);

        if (!ret) {
                for (n = 0; n < ARRAY_SIZE(vsock->vqs); n++) {
                        vq = &vsock->vqs[n];

                        mutex_lock(&vq->mutex);
                        vhost_vq_set_backend(vq, NULL);
                        mutex_unlock(&vq->mutex);
                }
        }

        mutex_unlock(&vsock->dev.mutex);
        return ret;
}

/* Free a vhost_vsock; kvfree() matches the kvmalloc() in
 * vhost_vsock_dev_open().
 */
static void vhost_vsock_free(struct vhost_vsock *vsock)
{
        kvfree(vsock);
}

static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
{
        struct vhost_virtqueue **vqs;
        struct vhost_vsock *vsock;
        int ret;

        /* This struct is large and allocation could fail, fall back to vmalloc
         * if there is no other way.
         */
        vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
        if (!vsock)
                return -ENOMEM;

        vqs = kmalloc_array(ARRAY_SIZE(vsock->vqs), sizeof(*vqs), GFP_KERNEL);
        if (!vqs) {
                ret = -ENOMEM;
                goto out;
        }

        vsock->guest_cid = 0; /* no CID assigned yet */

        atomic_set(&vsock->queued_replies, 0);

        vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX];
        vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX];
        vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick;
        vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick;

        vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
                       UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT,
                       VHOST_VSOCK_WEIGHT, true, NULL);

        file->private_data = vsock;
        skb_queue_head_init(&vsock->send_pkt_queue);
        vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work);
        return 0;

out:
        vhost_vsock_free(vsock);
        return ret;
}

/* Flush the device via vhost_dev_flush(). */
static void vhost_vsock_flush(struct vhost_vsock *vsock)
{
        vhost_dev_flush(&vsock->dev);
}

/* Per-socket callback used by vhost_vsock_dev_release(): mark @sk as reset
 * (ECONNRESET) when no device owns its remote CID any more.
 */
static void vhost_vsock_reset_orphans(struct sock *sk)
{
        struct vsock_sock *vsk = vsock_sk(sk);

        /* vmci_transport.c doesn't take sk_lock here either.  At least we're
         * under vsock_table_lock so the sock cannot disappear while we're
         * executing.
         */

        /* Nothing to do if the peer is still valid.  Likewise, if the close
         * timeout is pending, let it expire: this avoids races with the
         * timeout callback.
         */
        if (vhost_vsock_get(vsk->remote_addr.svm_cid) ||
            vsk->close_work_scheduled)
                return;

        sock_set_flag(sk, SOCK_DONE);
        vsk->peer_shutdown = SHUTDOWN_MASK;
        sk->sk_state = SS_UNCONNECTED;
        sk->sk_err = ECONNRESET;
        sk_error_report(sk);
}

/* release() handler: tear down the vhost_vsock stored in file->private_data.
 *
 * The instance is first removed from vhost_vsock_hash (if it had a CID) and
 * an RCU grace period is awaited, then orphaned sockets are reset, the
 * device is stopped and flushed, pending packets are purged and all memory
 * is freed.  Always returns 0.
 */
static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
{
        struct vhost_vsock *vsock = file->private_data;

        /* Only instances with a CID were ever added to the hash table */
        mutex_lock(&vhost_vsock_mutex);
        if (vsock->guest_cid)
                hash_del_rcu(&vsock->hash);
        mutex_unlock(&vhost_vsock_mutex);

        /* Wait for other CPUs to finish using vsock */
        synchronize_rcu();

        /* Iterating over all connections for all CIDs to find orphans is
         * inefficient.  Room for improvement here. */
        vsock_for_each_connected_socket(&vhost_transport.transport,
                                        vhost_vsock_reset_orphans);

        /* Don't check the owner, because we are in the release path, so we
         * need to stop the vsock device in any case.
         * vhost_vsock_stop() can not fail in this case, so we don't need to
         * check the return code.
         */
        vhost_vsock_stop(vsock, false);
        vhost_vsock_flush(vsock);
        vhost_dev_stop(&vsock->dev);

        virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);

        vhost_dev_cleanup(&vsock->dev);
        /* Frees the vqs pointer array allocated in vhost_vsock_dev_open() */
        kfree(vsock->dev.vqs);
        vhost_vsock_free(vsock);
        return 0;
}

/* Assign @guest_cid to @vsock and (re)insert it into vhost_vsock_hash.
 *
 * Returns 0 on success, -EINVAL for a reserved or out-of-range CID, or
 * -EADDRINUSE if the CID is already taken.
 */
static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid)
{
        struct vhost_vsock *owner;
        int ret = 0;

        /* Refuse reserved CIDs (anything up to VMADDR_CID_HOST, and U32_MAX)
         * as well as values above U32_MAX: 64-bit CIDs are not yet supported.
         */
        if (guest_cid <= VMADDR_CID_HOST || guest_cid >= U32_MAX)
                return -EINVAL;

        /* Refuse if CID is assigned to the guest->host transport (i.e. nested
         * VM), to make the loopback work.
         */
        if (vsock_find_cid(guest_cid))
                return -EADDRINUSE;

        mutex_lock(&vhost_vsock_mutex);

        /* Refuse if CID is already in use by another instance */
        owner = vhost_vsock_get(guest_cid);
        if (owner && owner != vsock) {
                ret = -EADDRINUSE;
                goto unlock;
        }

        /* Drop the entry hashed under the previous CID, if any */
        if (vsock->guest_cid)
                hash_del_rcu(&vsock->hash);

        vsock->guest_cid = guest_cid;
        hash_add_rcu(vhost_vsock_hash, &vsock->hash, vsock->guest_cid);

unlock:
        mutex_unlock(&vhost_vsock_mutex);
        return ret;
}

/* Handle VHOST_SET_FEATURES: record the feature bits acked by userspace.
 *
 * @features is stored in every virtqueue's acked_features, and
 * seqpacket_allow follows the VIRTIO_VSOCK_F_SEQPACKET bit.
 *
 * Returns 0 on success, -EOPNOTSUPP if @features contains a bit outside
 * VHOST_VSOCK_FEATURES, or -EFAULT if the log access check or the device
 * IOTLB initialisation fails.
 */
static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
{
        struct vhost_virtqueue *vq;
        size_t i;

        if (features & ~VHOST_VSOCK_FEATURES)
                return -EOPNOTSUPP;

        mutex_lock(&vsock->dev.mutex);
        if ((features & (1ULL << VHOST_F_LOG_ALL)) &&
            !vhost_log_access_ok(&vsock->dev)) {
                goto err;
        }

        if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
                if (vhost_init_device_iotlb(&vsock->dev))
                        goto err;
        }

        /* Assign unconditionally so that clearing the feature on a later
         * VHOST_SET_FEATURES call also clears the flag.
         */
        vsock->seqpacket_allow =
                !!(features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET));

        for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
                vq = &vsock->vqs[i];
                mutex_lock(&vq->mutex);
                vq->acked_features = features;
                mutex_unlock(&vq->mutex);
        }
        mutex_unlock(&vsock->dev.mutex);
        return 0;

err:
        mutex_unlock(&vsock->dev.mutex);
        return -EFAULT;
}

/* ioctl handler for the vhost-vsock device.
 *
 * Handles the vsock-specific commands (set guest CID, start/stop) and the
 * feature / backend-feature get and set commands directly; anything else is
 * forwarded to the generic vhost device and vring ioctl handlers.
 *
 * Returns 0 or a command-specific result on success, -EFAULT when copying
 * to or from userspace fails, or the error of the invoked handler.
 */
static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
                                  unsigned long arg)
{
        struct vhost_vsock *vsock = f->private_data;
        void __user *argp = (void __user *)arg;
        u64 guest_cid;
        u64 features;
        int start;
        int r;

        switch (ioctl) {
        case VHOST_VSOCK_SET_GUEST_CID:
                if (copy_from_user(&guest_cid, argp, sizeof(guest_cid)))
                        return -EFAULT;
                return vhost_vsock_set_cid(vsock, guest_cid);
        case VHOST_VSOCK_SET_RUNNING:
                /* Non-zero starts the device, zero stops it */
                if (copy_from_user(&start, argp, sizeof(start)))
                        return -EFAULT;
                if (start)
                        return vhost_vsock_start(vsock);
                else
                        return vhost_vsock_stop(vsock, true);
        case VHOST_GET_FEATURES:
                features = VHOST_VSOCK_FEATURES;
                if (copy_to_user(argp, &features, sizeof(features)))
                        return -EFAULT;
                return 0;
        case VHOST_SET_FEATURES:
                if (copy_from_user(&features, argp, sizeof(features)))
                        return -EFAULT;
                return vhost_vsock_set_features(vsock, features);
        case VHOST_GET_BACKEND_FEATURES:
                features = VHOST_VSOCK_BACKEND_FEATURES;
                if (copy_to_user(argp, &features, sizeof(features)))
                        return -EFAULT;
                return 0;
        case VHOST_SET_BACKEND_FEATURES:
                if (copy_from_user(&features, argp, sizeof(features)))
                        return -EFAULT;
                if (features & ~VHOST_VSOCK_BACKEND_FEATURES)
                        return -EOPNOTSUPP;
                vhost_set_backend_features(&vsock->dev, features);
                return 0;
        default:
                /* Try the device-level handler first.  Fall back to the
                 * vring handler if it does not know the command; otherwise
                 * flush the device.
                 */
                mutex_lock(&vsock->dev.mutex);
                r = vhost_dev_ioctl(&vsock->dev, ioctl, argp);
                if (r == -ENOIOCTLCMD)
                        r = vhost_vring_ioctl(&vsock->dev, ioctl, argp);
                else
                        vhost_vsock_flush(vsock);
                mutex_unlock(&vsock->dev.mutex);
                return r;
        }
}

/* read_iter handler: forward to vhost_chr_read_iter() for this device,
 * passing along whether the file was opened with O_NONBLOCK.
 */
static ssize_t vhost_vsock_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *filp = iocb->ki_filp;
        struct vhost_vsock *vsock = filp->private_data;

        return vhost_chr_read_iter(&vsock->dev, to,
                                   filp->f_flags & O_NONBLOCK);
}

/* write_iter handler: forward to vhost_chr_write_iter() for this device. */
static ssize_t vhost_vsock_chr_write_iter(struct kiocb *iocb,
                                        struct iov_iter *from)
{
        struct vhost_vsock *vsock = iocb->ki_filp->private_data;

        return vhost_chr_write_iter(&vsock->dev, from);
}

/* poll handler: forward to vhost_chr_poll() for this device. */
static __poll_t vhost_vsock_chr_poll(struct file *file, poll_table *wait)
{
        struct vhost_vsock *vsock = file->private_data;

        return vhost_chr_poll(file, &vsock->dev, wait);
}

/* File operations of the vhost-vsock misc device. */
static const struct file_operations vhost_vsock_fops = {
        .owner          = THIS_MODULE,
        .open           = vhost_vsock_dev_open,
        .release        = vhost_vsock_dev_release,
        .llseek                = noop_llseek,
        .unlocked_ioctl = vhost_vsock_dev_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
        .read_iter      = vhost_vsock_chr_read_iter,
        .write_iter     = vhost_vsock_chr_write_iter,
        .poll           = vhost_vsock_chr_poll,
};

/* Misc device "vhost-vsock", registered in vhost_vsock_init(). */
static struct miscdevice vhost_vsock_misc = {
        .minor = VHOST_VSOCK_MINOR,
        .name = "vhost-vsock",
        .fops = &vhost_vsock_fops,
};

/* Module init: register the transport with the vsock core (flag
 * VSOCK_TRANSPORT_F_H2G), then register the vhost-vsock misc device.  The
 * transport is unregistered again if the misc device cannot be registered.
 *
 * Returns 0 on success or the error code of the failing step.
 */
static int __init vhost_vsock_init(void)
{
        int ret;

        ret = vsock_core_register(&vhost_transport.transport,
                                  VSOCK_TRANSPORT_F_H2G);
        if (ret < 0)
                return ret;

        ret = misc_register(&vhost_vsock_misc);
        if (ret) {
                vsock_core_unregister(&vhost_transport.transport);
                return ret;
        }

        return 0;
}

/*
 * Module exit: undo vhost_vsock_init() in reverse order - remove the misc
 * device first, then unregister the transport from the vsock core.
 */
static void __exit vhost_vsock_exit(void)
{
        misc_deregister(&vhost_vsock_misc);
        vsock_core_unregister(&vhost_transport.transport);
}

/* Module entry/exit points and metadata. */
module_init(vhost_vsock_init);
module_exit(vhost_vsock_exit);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Asias He");
MODULE_DESCRIPTION("vhost transport for vsock ");
/* Aliases keyed on the misc minor and on the device node name. */
MODULE_ALIAS_MISCDEV(VHOST_VSOCK_MINOR);
MODULE_ALIAS("devname:vhost-vsock");












































































    1 



























































































    3 




    3 
    1 
    3 




    3 



    3 











    3 




    3 







    2 











    3 




    3 















    3 

    3 





    3 



    2 



    1 

    1 










    3 















    3 















    3 








    3 














    3 









    3 













    3 





















































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
// SPDX-License-Identifier: GPL-2.0
/*
 *  Block device elevator/IO-scheduler.
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@kernel.dk> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 *  when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - loose bi_dev comparisons, partition handling is right now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>

#include <trace/events/block.h>

#include "elevator.h"
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-wbt.h"
#include "blk-cgroup.h"

/* Serializes every access to elv_list in this file. */
static DEFINE_SPINLOCK(elv_list_lock);
/* All elevator types added by elv_register(), linked through ->list. */
static LIST_HEAD(elv_list);

/*
 * Merge hash stuff.
 *
 * A request is hashed under its position plus its sector count, so that
 * looking up a bio's starting sector (see elv_merge()) yields candidates
 * for a back merge.
 */
#define rq_hash_key(rq)                (blk_rq_pos(rq) + blk_rq_sectors(rq))

/*
 * Query io scheduler to see if the current process issuing bio may be
 * merged with rq.
 */
static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;

        if (e->type->ops.allow_merge)
                return e->type->ops.allow_merge(q, rq, bio);

        return true;
}

/*
 * Can @bio safely be merged with @rq? Both blk_rq_merge_ok() and the io
 * scheduler have to agree; the scheduler is only consulted when
 * blk_rq_merge_ok() has already said yes.
 */
bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
{
        return blk_rq_merge_ok(rq, bio) &&
               elv_iosched_allow_bio_merge(rq, bio);
}
EXPORT_SYMBOL(elv_bio_merge_ok);

/**
 * elevator_match - Test whether a scheduler is known under a given name
 * @e: Scheduler to test
 * @name: Name to compare against
 *
 * Return true if @name equals @e's elevator_name, or equals @e's
 * elevator_alias when an alias is set; false otherwise.
 */
static bool elevator_match(const struct elevator_type *e, const char *name)
{
        if (strcmp(e->elevator_name, name) == 0)
                return true;

        return e->elevator_alias && strcmp(e->elevator_alias, name) == 0;
}

static struct elevator_type *__elevator_find(const char *name)
{
        struct elevator_type *e;

        list_for_each_entry(e, &elv_list, list)
                if (elevator_match(e, name))
                        return e;
        return NULL;
}

/*
 * Look up the elevator type called @name and take a reference on it, all
 * under elv_list_lock. Returns NULL when no such type is registered or
 * when elevator_tryget() fails. @q is not used by the lookup.
 */
static struct elevator_type *elevator_find_get(struct request_queue *q,
                const char *name)
{
        struct elevator_type *found;

        spin_lock(&elv_list_lock);
        found = __elevator_find(name);
        if (found && !elevator_tryget(found))
                found = NULL;
        spin_unlock(&elv_list_lock);

        return found;
}

/* Forward declaration: elevator_alloc() needs it before the definition below. */
static const struct kobj_type elv_ktype;

/*
 * Allocate and initialise an elevator_queue of type @e for @q.
 *
 * The structure is zero-allocated on the queue's node. On success a
 * reference is taken on @e via __elevator_get(), and the kobject, the sysfs
 * mutex and the request hash are initialised. Returns NULL if the
 * allocation fails, in which case @e is left untouched.
 */
struct elevator_queue *elevator_alloc(struct request_queue *q,
                                      struct elevator_type *e)
{
        struct elevator_queue *eq = kzalloc_node(sizeof(*eq), GFP_KERNEL,
                                                 q->node);

        if (unlikely(!eq))
                return NULL;

        __elevator_get(e);
        eq->type = e;
        kobject_init(&eq->kobj, &elv_ktype);
        mutex_init(&eq->sysfs_lock);
        hash_init(eq->hash);

        return eq;
}
EXPORT_SYMBOL(elevator_alloc);

/*
 * kobject release callback (see elv_ktype): drop the reference on the
 * elevator type and free the elevator_queue that embeds @kobj.
 */
static void elevator_release(struct kobject *kobj)
{
        struct elevator_queue *eq =
                container_of(kobj, struct elevator_queue, kobj);

        elevator_put(eq->type);
        kfree(eq);
}

/*
 * Tear down the elevator currently attached to @q.
 *
 * Calls ioc_clear_queue() and blk_mq_sched_free_rqs() first, then runs
 * blk_mq_exit_sched() with the elevator's sysfs_lock held, and finally drops
 * a reference on the elevator kobject; elevator_release() frees the
 * elevator_queue once the last reference is gone.
 *
 * q->elevator is dereferenced unconditionally, so the caller must only call
 * this while an elevator is attached.
 */
void elevator_exit(struct request_queue *q)
{
        struct elevator_queue *e = q->elevator;

        ioc_clear_queue(q);
        blk_mq_sched_free_rqs(q);

        /* Same mutex that elv_attr_show()/elv_attr_store() take. */
        mutex_lock(&e->sysfs_lock);
        blk_mq_exit_sched(q, e);
        mutex_unlock(&e->sysfs_lock);

        kobject_put(&e->kobj);
}

/*
 * Unconditionally unhash @rq from the merge hash and clear RQF_HASHED.
 * Callers must know that @rq is currently hashed.
 */
static inline void __elv_rqhash_del(struct request *rq)
{
        struct hlist_node *node = &rq->hash;

        hash_del(node);
        rq->rq_flags &= ~RQF_HASHED;
}

/*
 * Remove @rq from the merge hash if it is on it; a request that is not
 * hashed is left alone. @q is unused.
 */
void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
        if (!ELV_ON_HASH(rq))
                return;

        __elv_rqhash_del(rq);
}
EXPORT_SYMBOL_GPL(elv_rqhash_del);

/*
 * Insert @rq into the merge hash of @q's elevator, keyed by rq_hash_key(),
 * and mark it RQF_HASHED. Adding a request that is already hashed is a bug.
 */
void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
        struct elevator_queue *eq = q->elevator;

        BUG_ON(ELV_ON_HASH(rq));

        hash_add(eq->hash, &rq->hash, rq_hash_key(rq));
        rq->rq_flags |= RQF_HASHED;
}
EXPORT_SYMBOL_GPL(elv_rqhash_add);

/*
 * Re-hash @rq after its hash key has changed: take it off the hash, then
 * add it back under the key computed from its current position and size.
 * The removal is unconditional, so @rq must currently be hashed.
 */
void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
        /* Remove under the old bucket first, then re-insert under the new key. */
        __elv_rqhash_del(rq);
        elv_rqhash_add(q, rq);
}

/*
 * Find a hashed request whose hash key equals @offset.
 *
 * Requests found in the bucket that are no longer mergeable are dropped
 * from the hash on the way (hence the _safe iterator). Returns the first
 * mergeable request whose key matches, or NULL.
 */
struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
        struct elevator_queue *eq = q->elevator;
        struct hlist_node *tmp;
        struct request *rq;

        hash_for_each_possible_safe(eq->hash, rq, tmp, hash, offset) {
                BUG_ON(!ELV_ON_HASH(rq));

                if (likely(rq_mergeable(rq))) {
                        if (rq_hash_key(rq) == offset)
                                return rq;
                        continue;
                }

                /* Stale entry: prune it and keep scanning the bucket. */
                __elv_rqhash_del(rq);
        }

        return NULL;
}

/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
void elv_rb_add(struct rb_root *root, struct request *rq)
{
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;

        /*
         * Descend to the leaf slot for @rq, ordering by blk_rq_pos().
         * A request whose position equals an existing one goes to the right.
         */
        while (*link) {
                struct request *cur;

                parent = *link;
                cur = rb_entry(parent, struct request, rb_node);

                if (blk_rq_pos(rq) < blk_rq_pos(cur))
                        link = &parent->rb_left;
                else
                        link = &parent->rb_right;
        }

        /* Hook the node in at the slot found, then rebalance. */
        rb_link_node(&rq->rb_node, parent, link);
        rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

/*
 * Remove @rq from the tree rooted at @root and mark its rb_node as empty
 * again. Calling this for a request whose rb_node is already empty is a bug.
 */
void elv_rb_del(struct rb_root *root, struct request *rq)
{
        struct rb_node *node = &rq->rb_node;

        BUG_ON(RB_EMPTY_NODE(node));
        rb_erase(node, root);
        RB_CLEAR_NODE(node);
}
EXPORT_SYMBOL(elv_rb_del);

/*
 * Binary-search the tree rooted at @root for a request whose blk_rq_pos()
 * equals @sector. Returns that request, or NULL if there is none.
 */
struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
        struct rb_node *node;

        for (node = root->rb_node; node; ) {
                struct request *rq = rb_entry(node, struct request, rb_node);
                sector_t pos = blk_rq_pos(rq);

                if (sector == pos)
                        return rq;

                node = sector < pos ? node->rb_left : node->rb_right;
        }

        return NULL;
}
EXPORT_SYMBOL(elv_rb_find);

enum elv_merge elv_merge(struct request_queue *q, struct request **req,
                struct bio *bio)
{
        struct elevator_queue *e = q->elevator;
        struct request *__rq;

        /*
         * Levels of merges:
         *         nomerges:  No merges at all attempted
         *         noxmerges: Only simple one-hit cache try
         *         merges:           All merge tries attempted
         */
        if (blk_queue_nomerges(q) || !bio_mergeable(bio))
                return ELEVATOR_NO_MERGE;

        /*
         * First try one-hit cache.
         */
        if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
                enum elv_merge ret = blk_try_merge(q->last_merge, bio);

                if (ret != ELEVATOR_NO_MERGE) {
                        *req = q->last_merge;
                        return ret;
                }
        }

        if (blk_queue_noxmerges(q))
                return ELEVATOR_NO_MERGE;

        /*
         * See if our hash lookup can find a potential backmerge.
         */
        __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
        if (__rq && elv_bio_merge_ok(__rq, bio)) {
                *req = __rq;

                if (blk_discard_mergable(__rq))
                        return ELEVATOR_DISCARD_MERGE;
                return ELEVATOR_BACK_MERGE;
        }

        if (e->type->ops.request_merge)
                return e->type->ops.request_merge(q, req, bio);

        return ELEVATOR_NO_MERGE;
}

/*
 * Try to back-merge a request that is about to be inserted. Only the case
 * where 'rq' can be appended to a request already known to the elevator is
 * handled, so that 'rq' itself can be discarded afterwards.
 *
 * Returns true if a merge happened, false otherwise. Every request that
 * became redundant through merging is added to 'free' for the caller to
 * release.
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq,
                              struct list_head *free)
{
        bool merged = false;

        if (blk_queue_nomerges(q))
                return false;

        /* One-hit cache first. */
        if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) {
                list_add(&rq->queuelist, free);
                return true;
        }

        if (blk_queue_noxmerges(q))
                return false;

        /*
         * Hash lookup for a request ending where 'rq' starts. After a
         * successful merge the grown request may itself be mergeable into
         * yet another one, so continue with it as the new 'rq'.
         */
        for (;;) {
                struct request *target = elv_rqhash_find(q, blk_rq_pos(rq));

                if (!target || !blk_attempt_req_merge(q, target, rq))
                        break;

                list_add(&rq->queuelist, free);
                merged = true;
                rq = target;
        }

        return merged;
}

void elv_merged_request(struct request_queue *q, struct request *rq,
                enum elv_merge type)
{
        struct elevator_queue *e = q->elevator;

        if (e->type->ops.request_merged)
                e->type->ops.request_merged(q, rq, type);

        if (type == ELEVATOR_BACK_MERGE)
                elv_rqhash_reposition(q, rq);

        q->last_merge = rq;
}

void elv_merge_requests(struct request_queue *q, struct request *rq,
                             struct request *next)
{
        struct elevator_queue *e = q->elevator;

        if (e->type->ops.requests_merged)
                e->type->ops.requests_merged(q, rq, next);

        elv_rqhash_reposition(q, rq);
        q->last_merge = rq;
}

struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
        struct elevator_queue *e = q->elevator;

        if (e->type->ops.next_request)
                return e->type->ops.next_request(q, rq);

        return NULL;
}

struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
        struct elevator_queue *e = q->elevator;

        if (e->type->ops.former_request)
                return e->type->ops.former_request(q, rq);

        return NULL;
}

/* Map a generic sysfs attribute back to the elv_fs_entry that embeds it. */
#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

/*
 * sysfs ->show for elevator attributes.
 *
 * Returns -EIO if the attribute has no show method. Otherwise the method is
 * invoked with the elevator's sysfs_lock held; -ENOENT is returned instead
 * when the elevator has no type set.
 */
static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
        struct elv_fs_entry *entry = to_elv(attr);
        struct elevator_queue *eq;
        ssize_t ret;

        if (!entry->show)
                return -EIO;

        eq = container_of(kobj, struct elevator_queue, kobj);

        mutex_lock(&eq->sysfs_lock);
        if (eq->type)
                ret = entry->show(eq, page);
        else
                ret = -ENOENT;
        mutex_unlock(&eq->sysfs_lock);

        return ret;
}

/*
 * sysfs ->store for elevator attributes.
 *
 * Returns -EIO if the attribute has no store method. Otherwise the method
 * is invoked with the elevator's sysfs_lock held; -ENOENT is returned
 * instead when the elevator has no type set.
 */
static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
               const char *page, size_t length)
{
        struct elv_fs_entry *entry = to_elv(attr);
        struct elevator_queue *eq;
        ssize_t ret;

        if (!entry->store)
                return -EIO;

        eq = container_of(kobj, struct elevator_queue, kobj);

        mutex_lock(&eq->sysfs_lock);
        if (eq->type)
                ret = entry->store(eq, page, length);
        else
                ret = -ENOENT;
        mutex_unlock(&eq->sysfs_lock);

        return ret;
}

/* sysfs read/write entry points shared by all elevator attributes. */
static const struct sysfs_ops elv_sysfs_ops = {
        .show        = elv_attr_show,
        .store        = elv_attr_store,
};

/*
 * kobject type of an elevator_queue's embedded kobject (set up in
 * elevator_alloc()); elevator_release() runs when the last reference to
 * that kobject is dropped.
 */
static const struct kobj_type elv_ktype = {
        .sysfs_ops        = &elv_sysfs_ops,
        .release        = elevator_release,
};

/*
 * Expose @q's elevator in sysfs as the "iosched" directory below the
 * disk's queue kobject, and create one file per scheduler attribute.
 *
 * Attribute creation is best effort: it stops at the first file that cannot
 * be created, but that does not fail the registration. A KOBJ_ADD uevent
 * is sent when @uevent is true. The caller must hold q->sysfs_lock.
 *
 * Returns 0 on success or the error from kobject_add().
 */
int elv_register_queue(struct request_queue *q, bool uevent)
{
        struct elevator_queue *eq = q->elevator;
        struct elv_fs_entry *attr;
        int error;

        lockdep_assert_held(&q->sysfs_lock);

        error = kobject_add(&eq->kobj, &q->disk->queue_kobj, "iosched");
        if (error)
                return error;

        /* The attribute table, if any, ends at the entry with a NULL name. */
        attr = eq->type->elevator_attrs;
        if (attr) {
                for (; attr->attr.name; attr++) {
                        if (sysfs_create_file(&eq->kobj, &attr->attr))
                                break;
                }
        }

        if (uevent)
                kobject_uevent(&eq->kobj, KOBJ_ADD);

        set_bit(ELEVATOR_FLAG_REGISTERED, &eq->flags);
        return 0;
}

/*
 * Remove @q's elevator from sysfs, if there is an elevator and it is
 * currently registered: a KOBJ_REMOVE uevent is sent and the kobject is
 * deleted. The REGISTERED flag is cleared atomically with the test, so a
 * second call does nothing. The caller must hold q->sysfs_lock.
 */
void elv_unregister_queue(struct request_queue *q)
{
        struct elevator_queue *eq = q->elevator;

        lockdep_assert_held(&q->sysfs_lock);

        if (!eq || !test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &eq->flags))
                return;

        kobject_uevent(&eq->kobj, KOBJ_REMOVE);
        kobject_del(&eq->kobj);
}

int elv_register(struct elevator_type *e)
{
        /* finish request is mandatory */
        if (WARN_ON_ONCE(!e->ops.finish_request))
                return -EINVAL;
        /* insert_requests and dispatch_request are mandatory */
        if (WARN_ON_ONCE(!e->ops.insert_requests || !e->ops.dispatch_request))
                return -EINVAL;

        /* create icq_cache if requested */
        if (e->icq_size) {
                if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
                    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
                        return -EINVAL;

                snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
                         "%s_io_cq", e->elevator_name);
                e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
                                                 e->icq_align, 0, NULL);
                if (!e->icq_cache)
                        return -ENOMEM;
        }

        /* register, don't allow duplicate names */
        spin_lock(&elv_list_lock);
        if (__elevator_find(e->elevator_name)) {
                spin_unlock(&elv_list_lock);
                kmem_cache_destroy(e->icq_cache);
                return -EBUSY;
        }
        list_add_tail(&e->list, &elv_list);
        spin_unlock(&elv_list_lock);

        printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name);

        return 0;
}
EXPORT_SYMBOL_GPL(elv_register);

/*
 * Remove io scheduler type @e from the list of registered schedulers and
 * release its icq cache, if it has one.
 */
void elv_unregister(struct elevator_type *e)
{
        spin_lock(&elv_list_lock);
        list_del_init(&e->list);
        spin_unlock(&elv_list_lock);

        if (!e->icq_cache)
                return;

        /*
         * icq's are RCU managed, so wait for all outstanding RCU callbacks
         * to finish before the cache they belong to is destroyed.
         */
        rcu_barrier();
        kmem_cache_destroy(e->icq_cache);
        e->icq_cache = NULL;
}
EXPORT_SYMBOL_GPL(elv_unregister);

/*
 * An io scheduler can only be used on a blk-mq queue whose tag set (if
 * any) does not carry BLK_MQ_F_NO_SCHED.
 */
static inline bool elv_support_iosched(struct request_queue *q)
{
        if (!queue_is_mq(q))
                return false;

        if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
                return false;

        return true;
}

/*
 * Pick the default scheduler for @q: mq-deadline for queues with a single
 * hardware queue or with shared tags, unless the tag set asks for no
 * scheduler by default. Returns NULL, meaning "none", in every other case
 * and also when mq-deadline is not available.
 *
 * NOTE(review): the first test guards against a NULL q->tag_set but the
 * shared-tags test dereferences it unconditionally - presumably tag_set is
 * never NULL here whenever nr_hw_queues != 1; confirm against callers.
 */
static struct elevator_type *elevator_get_default(struct request_queue *q)
{
        if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
                return NULL;

        if (q->nr_hw_queues == 1 || blk_mq_is_shared_tags(q->tag_set->flags))
                return elevator_find_get(q, "mq-deadline");

        return NULL;
}

/*
 * Attach the default elevator (see elevator_get_default()) to @q.
 *
 * Does nothing if the queue cannot use an io scheduler, already has an
 * elevator, or has no default. If initializing the chosen elevator fails,
 * a warning is printed and the queue falls back to the "none" elevator
 * (no elevator); no error is reported to the caller.
 */
void elevator_init_mq(struct request_queue *q)
{
        struct elevator_type *e;
        int err;

        if (!elv_support_iosched(q))
                return;

        /* This is expected to run before the queue is registered. */
        WARN_ON_ONCE(blk_queue_registered(q));

        if (unlikely(q->elevator))
                return;

        e = elevator_get_default(q);
        if (!e)
                return;

        /*
         * We are called before adding disk, when there isn't any FS I/O,
         * so freezing queue plus canceling dispatch work is enough to
         * drain any dispatch activities originated from passthrough
         * requests, then no need to quiesce queue which may add long boot
         * latency, especially when lots of disks are involved.
         */
        blk_mq_freeze_queue(q);
        blk_mq_cancel_work_sync(q);

        err = blk_mq_init_sched(q, e);

        blk_mq_unfreeze_queue(q);

        if (err) {
                pr_warn("\"%s\" elevator initialization failed, "
                        "falling back to \"none\"\n", e->elevator_name);
        }

        /* Drop the reference taken by elevator_get_default(). */
        elevator_put(e);
}

/*
 * Switch to new_e io scheduler.
 *
 * The queue is frozen and quiesced for the whole switch. Any current
 * elevator is unregistered from sysfs and torn down before the new one is
 * initialized and registered. The caller must hold q->sysfs_lock.
 *
 * If switching fails, we are most likely running out of memory and not able
 * to restore the old io scheduler, so leaving the io scheduler being none.
 *
 * Returns 0 on success, or the error from blk_mq_init_sched() or
 * elv_register_queue().
 */
int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
        int ret;

        lockdep_assert_held(&q->sysfs_lock);

        blk_mq_freeze_queue(q);
        blk_mq_quiesce_queue(q);

        /* The old elevator is gone from here on, whatever happens next. */
        if (q->elevator) {
                elv_unregister_queue(q);
                elevator_exit(q);
        }

        ret = blk_mq_init_sched(q, new_e);
        if (ret)
                goto out_unfreeze;

        ret = elv_register_queue(q, true);
        if (ret) {
                /* sysfs registration failed: tear the new elevator down again */
                elevator_exit(q);
                goto out_unfreeze;
        }
        blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);

out_unfreeze:
        /* Undo quiesce and freeze in the reverse order they were applied. */
        blk_mq_unquiesce_queue(q);
        blk_mq_unfreeze_queue(q);

        if (ret) {
                pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
                        new_e->elevator_name);
        }

        return ret;
}

/*
 * Detach the current elevator from @q, i.e. switch the queue to "none".
 *
 * The queue is frozen and quiesced while the elevator is unregistered from
 * sysfs and torn down. The caller must hold q->sysfs_lock and must make
 * sure an elevator is attached: elevator_exit() dereferences q->elevator
 * unconditionally.
 */
void elevator_disable(struct request_queue *q)
{
        lockdep_assert_held(&q->sysfs_lock);

        blk_mq_freeze_queue(q);
        blk_mq_quiesce_queue(q);

        elv_unregister_queue(q);
        elevator_exit(q);
        blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
        q->elevator = NULL;
        /* Without a scheduler, nr_requests is reset to the tag set's queue depth. */
        q->nr_requests = q->tag_set->queue_depth;
        blk_add_trace_msg(q, "elv switch: none");

        blk_mq_unquiesce_queue(q);
        blk_mq_unfreeze_queue(q);
}

/*
 * Switch this queue to the given IO scheduler.
 *
 * @elevator_name must already be stripped of surrounding whitespace (as
 * elv_iosched_store() does). The exact name "none" detaches the current
 * scheduler; it is compared in full so that a name which merely starts
 * with "none" is looked up like any other scheduler name instead of
 * silently disabling the scheduler.
 *
 * Returns 0 on success (including when the requested scheduler is already
 * active), -ENOENT if the queue is not registered, -EINVAL if no scheduler
 * of that name can be found even after trying to load its module, or the
 * error from elevator_switch().
 */
static int elevator_change(struct request_queue *q, const char *elevator_name)
{
        struct elevator_type *e;
        int ret;

        /* Make sure queue is not in the middle of being removed */
        if (!blk_queue_registered(q))
                return -ENOENT;

        if (!strcmp(elevator_name, "none")) {
                if (q->elevator)
                        elevator_disable(q);
                return 0;
        }

        /* Nothing to do if the requested scheduler is already in use. */
        if (q->elevator && elevator_match(q->elevator->type, elevator_name))
                return 0;

        e = elevator_find_get(q, elevator_name);
        if (!e) {
                /* Not registered yet: try loading "<name>-iosched" and retry. */
                request_module("%s-iosched", elevator_name);
                e = elevator_find_get(q, elevator_name);
                if (!e)
                        return -EINVAL;
        }
        ret = elevator_switch(q, e);
        /* Drop the lookup reference taken by elevator_find_get(). */
        elevator_put(e);
        return ret;
}

/*
 * Handle a write of a scheduler name to the queue's scheduler attribute.
 *
 * The name is copied into a local buffer of ELV_NAME_MAX bytes, stripped
 * of surrounding whitespace and passed to elevator_change(). Queues that
 * cannot use an io scheduler accept and ignore the write.
 *
 * Returns @count on success or the error from elevator_change().
 */
ssize_t elv_iosched_store(struct request_queue *q, const char *buf,
                          size_t count)
{
        char elevator_name[ELV_NAME_MAX];
        int ret;

        if (!elv_support_iosched(q))
                return count;

        strscpy(elevator_name, buf, sizeof(elevator_name));

        ret = elevator_change(q, strstrip(elevator_name));
        if (ret)
                return ret;

        return count;
}

/*
 * Format the list of available schedulers into @name, with the active one
 * in square brackets: "none" first, then every registered elevator type in
 * list order, then a newline. Queues that cannot use an io scheduler just
 * report "none".
 *
 * Returns the number of characters written.
 */
ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
        struct elevator_queue *eq = q->elevator;
        struct elevator_type *cur = NULL, *e;
        int len = 0;

        if (!elv_support_iosched(q))
                return sprintf(name, "none\n");

        if (q->elevator) {
                len += sprintf(name + len, "none ");
                cur = eq->type;
        } else {
                len += sprintf(name + len, "[none] ");
        }

        /* elv_list may only be walked with elv_list_lock held. */
        spin_lock(&elv_list_lock);
        list_for_each_entry(e, &elv_list, list) {
                if (e != cur)
                        len += sprintf(name + len, "%s ", e->elevator_name);
                else
                        len += sprintf(name + len, "[%s] ", e->elevator_name);
        }
        spin_unlock(&elv_list_lock);

        len += sprintf(name + len, "\n");
        return len;
}

/*
 * Return the request that precedes @rq in the rbtree it is linked into, or
 * NULL if @rq is the first one. @q is unused.
 */
struct request *elv_rb_former_request(struct request_queue *q,
                                      struct request *rq)
{
        struct rb_node *prev = rb_prev(&rq->rb_node);

        return prev ? rb_entry_rq(prev) : NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

/*
 * Return the request that follows @rq in the rbtree it is linked into, or
 * NULL if @rq is the last one. @q is unused.
 */
struct request *elv_rb_latter_request(struct request_queue *q,
                                      struct request *rq)
{
        struct rb_node *next = rb_next(&rq->rb_node);

        return next ? rb_entry_rq(next) : NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);

/*
 * Handler for the "elevator=" kernel command line parameter. The value in
 * @str is ignored; the handler only warns that the parameter no longer has
 * any effect and points to sysfs instead.
 */
static int __init elevator_setup(char *str)
{
        pr_warn("Kernel parameter elevator= does not have any effect anymore.\n"
                "Please use sysfs to set IO scheduler for individual devices.\n");
        return 1;
}

__setup("elevator=", elevator_setup);









































































































































































































































































































































































    3 








    3 





    3 


































    3 



























    2 
    3 
    3 







    3 
































    3 


    1 
    3 
    3 
    1 

















































    3 


































































































    3 




































    3 

























































































































































































































    3 

























    2 





















































    3 






    2 




































    3 













    2 





















    3 

    3 


























    3 


    3 











































    1 



    1 


    1 







    1 








    1 

    1 






























    3 

    3 


    2 


    2 






















    3 



    3 
















    3 

    2 

























    3 





























    1 








































































    1 






    1 


    1 







    1 
















    1 























    1 






    1 





    1 


    1 


























































    3 
















































    3 














    2 

    3 
































    3 






    3 








    3 

    3 






    3 




























    3 
    3 










    3 









    3 













    3 

    3 



























    2 





    3 



















    1 











    1 



















    3 






























































    3 






































    3 












    3 
















    3 










    3 










    3 


    3 





    3 
    3 



    3 















    3 

















































































































































    2 



    3 
































    3 


































    3 












































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
// SPDX-License-Identifier: GPL-2.0

#include <linux/kernel.h>
#include <linux/irqflags.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/bug.h>
#include "printk_ringbuffer.h"
#include "internal.h"

/**
 * DOC: printk_ringbuffer overview
 *
 * Data Structure
 * --------------
 * The printk_ringbuffer is made up of 2 internal ringbuffers:
 *
 *   desc_ring
 *     A ring of descriptors and their meta data (such as sequence number,
 *     timestamp, loglevel, etc.) as well as internal state information about
 *     the record and logical positions specifying where in the other
 *     ringbuffer the text strings are located.
 *
 *   text_data_ring
 *     A ring of data blocks. A data block consists of an unsigned long
 *     integer (ID) that maps to a desc_ring index followed by the text
 *     string of the record.
 *
 * The internal state information of a descriptor is the key element to allow
 * readers and writers to locklessly synchronize access to the data.
 *
 * Implementation
 * --------------
 *
 * Descriptor Ring
 * ~~~~~~~~~~~~~~~
 * The descriptor ring is an array of descriptors. A descriptor contains
 * essential meta data to track the data of a printk record using
 * blk_lpos structs pointing to associated text data blocks (see
 * "Data Ring" below). Each descriptor is assigned an ID that maps
 * directly to index values of the descriptor array and has a state. The ID
 * and the state are bitwise combined into a single descriptor field named
 * @state_var, allowing ID and state to be synchronously and atomically
 * updated.
 *
 * Descriptors have four states:
 *
 *   reserved
 *     A writer is modifying the record.
 *
 *   committed
 *     The record and all its data are written. A writer can reopen the
 *     descriptor (transitioning it back to reserved), but in the committed
 *     state the data is consistent.
 *
 *   finalized
 *     The record and all its data are complete and available for reading. A
 *     writer cannot reopen the descriptor.
 *
 *   reusable
 *     The record exists, but its text and/or meta data may no longer be
 *     available.
 *
 * Querying the @state_var of a record requires providing the ID of the
 * descriptor to query. This can yield a possible fifth (pseudo) state:
 *
 *   miss
 *     The descriptor being queried has an unexpected ID.
 *
 * The descriptor ring has a @tail_id that contains the ID of the oldest
 * descriptor and @head_id that contains the ID of the newest descriptor.
 *
 * When a new descriptor should be created (and the ring is full), the tail
 * descriptor is invalidated by first transitioning to the reusable state and
 * then invalidating all tail data blocks up to and including the data blocks
 * associated with the tail descriptor (for the text ring). Then
 * @tail_id is advanced, followed by advancing @head_id. And finally the
 * @state_var of the new descriptor is initialized to the new ID and reserved
 * state.
 *
 * The @tail_id can only be advanced if the new @tail_id would be in the
 * committed or reusable queried state. This makes it possible that a valid
 * sequence number of the tail is always available.
 *
 * Descriptor Finalization
 * ~~~~~~~~~~~~~~~~~~~~~~~
 * When a writer calls the commit function prb_commit(), record data is
 * fully stored and is consistent within the ringbuffer. However, a writer can
 * reopen that record, claiming exclusive access (as with prb_reserve()), and
 * modify that record. When finished, the writer must again commit the record.
 *
 * In order for a record to be made available to readers (and also become
 * recyclable for writers), it must be finalized. A finalized record cannot be
 * reopened and can never become "unfinalized". Record finalization can occur
 * in three different scenarios:
 *
 *   1) A writer can simultaneously commit and finalize its record by calling
 *      prb_final_commit() instead of prb_commit().
 *
 *   2) When a new record is reserved and the previous record has been
 *      committed via prb_commit(), that previous record is automatically
 *      finalized.
 *
 *   3) When a record is committed via prb_commit() and a newer record
 *      already exists, the record being committed is automatically finalized.
 *
 * Data Ring
 * ~~~~~~~~~
 * The text data ring is a byte array composed of data blocks. Data blocks are
 * referenced by blk_lpos structs that point to the logical position of the
 * beginning of a data block and the beginning of the next adjacent data
 * block. Logical positions are mapped directly to index values of the byte
 * array ringbuffer.
 *
 * Each data block consists of an ID followed by the writer data. The ID is
 * the identifier of a descriptor that is associated with the data block. A
 * given data block is considered valid if all of the following conditions
 * are met:
 *
 *   1) The descriptor associated with the data block is in the committed
 *      or finalized queried state.
 *
 *   2) The blk_lpos struct within the descriptor associated with the data
 *      block references back to the same data block.
 *
 *   3) The data block is within the head/tail logical position range.
 *
 * If the writer data of a data block would extend beyond the end of the
 * byte array, only the ID of the data block is stored at the logical
 * position and the full data block (ID and writer data) is stored at the
 * beginning of the byte array. The referencing blk_lpos will point to the
 * ID before the wrap and the next data block will be at the logical
 * position adjacent to the full data block after the wrap.
 *
 * Data rings have a @tail_lpos that points to the beginning of the oldest
 * data block and a @head_lpos that points to the logical position of the
 * next (not yet existing) data block.
 *
 * When a new data block should be created (and the ring is full), tail data
 * blocks will first be invalidated by putting their associated descriptors
 * into the reusable state and then pushing the @tail_lpos forward beyond
 * them. Then the @head_lpos is pushed forward and is associated with a new
 * descriptor. If a data block is not valid, the @tail_lpos cannot be
 * advanced beyond it.
 *
 * Info Array
 * ~~~~~~~~~~
 * The general meta data of printk records are stored in printk_info structs,
 * stored in an array with the same number of elements as the descriptor ring.
 * Each info corresponds to the descriptor of the same index in the
 * descriptor ring. Info validity is confirmed by evaluating the corresponding
 * descriptor before and after loading the info.
 *
 * Usage
 * -----
 * Here are some simple examples demonstrating writers and readers. For the
 * examples a global ringbuffer (test_rb) is available (which is not the
 * actual ringbuffer used by printk)::
 *
 *        DEFINE_PRINTKRB(test_rb, 15, 5);
 *
 * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of
 * 1 MiB (2 ^ (15 + 5)) for text data.
 *
 * Sample writer code::
 *
 *        const char *textstr = "message text";
 *        struct prb_reserved_entry e;
 *        struct printk_record r;
 *
 *        // specify how much to allocate
 *        prb_rec_init_wr(&r, strlen(textstr) + 1);
 *
 *        if (prb_reserve(&e, &test_rb, &r)) {
 *                snprintf(r.text_buf, r.text_buf_size, "%s", textstr);
 *
 *                r.info->text_len = strlen(textstr);
 *                r.info->ts_nsec = local_clock();
 *                r.info->caller_id = printk_caller_id();
 *
 *                // commit and finalize the record
 *                prb_final_commit(&e);
 *        }
 *
 * Note that additional writer functions are available to extend a record
 * after it has been committed but not yet finalized. This can be done as
 * long as no new records have been reserved and the caller is the same.
 *
 * Sample writer code (record extending)::
 *
 *                // alternate rest of previous example
 *
 *                r.info->text_len = strlen(textstr);
 *                r.info->ts_nsec = local_clock();
 *                r.info->caller_id = printk_caller_id();
 *
 *                // commit the record (but do not finalize yet)
 *                prb_commit(&e);
 *        }
 *
 *        ...
 *
 *        // specify additional 5 bytes text space to extend
 *        prb_rec_init_wr(&r, 5);
 *
 *        // try to extend, but only if it does not exceed 32 bytes
 *        if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) {
 *                snprintf(&r.text_buf[r.info->text_len],
 *                         r.text_buf_size - r.info->text_len, "hello");
 *
 *                r.info->text_len += 5;
 *
 *                // commit and finalize the record
 *                prb_final_commit(&e);
 *        }
 *
 * Sample reader code::
 *
 *        struct printk_info info;
 *        struct printk_record r;
 *        char text_buf[32];
 *        u64 seq;
 *
 *        prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf));
 *
 *        prb_for_each_record(0, &test_rb, &seq, &r) {
 *                if (info.seq != seq)
 *                        pr_warn("lost %llu records\n", info.seq - seq);
 *
 *                if (info.text_len > r.text_buf_size) {
 *                        pr_warn("record %llu text truncated\n", info.seq);
 *                        text_buf[r.text_buf_size - 1] = 0;
 *                }
 *
 *                pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec,
 *                        &text_buf[0]);
 *        }
 *
 * Note that additional less convenient reader functions are available to
 * allow complex record access.
 *
 * ABA Issues
 * ~~~~~~~~~~
 * To help avoid ABA issues, descriptors are referenced by IDs (array index
 * values combined with tagged bits counting array wraps) and data blocks are
 * referenced by logical positions (array index values combined with tagged
 * bits counting array wraps). However, on 32-bit systems the number of
 * tagged bits is relatively small such that an ABA incident is (at least
 * theoretically) possible. For example, if 4 million maximally sized (1KiB)
 * printk messages were to occur in NMI context on a 32-bit system, the
 * interrupted context would not be able to recognize that the 32-bit integer
 * completely wrapped and thus represents a different data block than the one
 * the interrupted context expects.
 *
 * To help combat this possibility, additional state checking is performed
 * (such as using cmpxchg() even though set() would suffice). These extra
 * checks are commented as such and will hopefully catch any ABA issue that
 * a 32-bit system might experience.
 *
 * Memory Barriers
 * ~~~~~~~~~~~~~~~
 * Multiple memory barriers are used. To simplify proving correctness and
 * generating litmus tests, lines of code related to memory barriers
 * (loads, stores, and the associated memory barriers) are labeled::
 *
 *        LMM(function:letter)
 *
 * Comments reference the labels using only the "function:letter" part.
 *
 * The memory barrier pairs and their ordering are:
 *
 *   desc_reserve:D / desc_reserve:B
 *     push descriptor tail (id), then push descriptor head (id)
 *
 *   desc_reserve:D / data_push_tail:B
 *     push data tail (lpos), then set new descriptor reserved (state)
 *
 *   desc_reserve:D / desc_push_tail:C
 *     push descriptor tail (id), then set new descriptor reserved (state)
 *
 *   desc_reserve:D / prb_first_seq:C
 *     push descriptor tail (id), then set new descriptor reserved (state)
 *
 *   desc_reserve:F / desc_read:D
 *     set new descriptor id and reserved (state), then allow writer changes
 *
 *   data_alloc:A (or data_realloc:A) / desc_read:D
 *     set old descriptor reusable (state), then modify new data block area
 *
 *   data_alloc:A (or data_realloc:A) / data_push_tail:B
 *     push data tail (lpos), then modify new data block area
 *
 *   _prb_commit:B / desc_read:B
 *     store writer changes, then set new descriptor committed (state)
 *
 *   desc_reopen_last:A / _prb_commit:B
 *     set descriptor reserved (state), then read descriptor data
 *
 *   _prb_commit:B / desc_reserve:D
 *     set new descriptor committed (state), then check descriptor head (id)
 *
 *   data_push_tail:D / data_push_tail:A
 *     set descriptor reusable (state), then push data tail (lpos)
 *
 *   desc_push_tail:B / desc_reserve:D
 *     set descriptor reusable (state), then push descriptor tail (id)
 *
 *   desc_update_last_finalized:A / desc_last_finalized_seq:A
 *     store finalized record, then set new highest finalized sequence number
 */

/*
 * Size of the data array and the mask used to reduce a logical position to
 * an array index. The size is derived from @size_bits via _DATA_SIZE(),
 * which is defined outside this file (presumably a power of 2, since the
 * mask is formed as size - 1 -- confirm against the header).
 */
#define DATA_SIZE(data_ring)                _DATA_SIZE((data_ring)->size_bits)
#define DATA_SIZE_MASK(data_ring)        (DATA_SIZE(data_ring) - 1)

/*
 * Number of descriptors in the descriptor array and the mask used to reduce
 * an ID or sequence number to an array index. The count is derived from
 * @count_bits via _DESCS_COUNT(), which is defined outside this file.
 */
#define DESCS_COUNT(desc_ring)                _DESCS_COUNT((desc_ring)->count_bits)
#define DESCS_COUNT_MASK(desc_ring)        (DESCS_COUNT(desc_ring) - 1)

/* Determine the data array index from a logical position. */
#define DATA_INDEX(data_ring, lpos)        ((lpos) & DATA_SIZE_MASK(data_ring))

/* Determine the desc array index from an ID or sequence number. */
#define DESC_INDEX(desc_ring, n)        ((n) & DESCS_COUNT_MASK(desc_ring))

/* Determine how many times the data array has wrapped. */
#define DATA_WRAPS(data_ring, lpos)        ((lpos) >> (data_ring)->size_bits)

/*
 * Determine if a logical position refers to a data-less block. A logical
 * position is treated as data-less when its lowest bit is set. A blk_lpos
 * is data-less only if both its @begin and @next positions are data-less.
 */
#define LPOS_DATALESS(lpos)                ((lpos) & 1UL)
#define BLK_DATALESS(blk)                (LPOS_DATALESS((blk)->begin) && \
                                         LPOS_DATALESS((blk)->next))

/*
 * Get the logical position at index 0 of the current wrap, i.e. @lpos with
 * its array index bits cleared.
 */
#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \
((lpos) & ~DATA_SIZE_MASK(data_ring))

/*
 * Get the ID for the same index of the previous wrap as the given ID, i.e.
 * the ID that is one full descriptor array length behind @id.
 */
#define DESC_ID_PREV_WRAP(desc_ring, id) \
DESC_ID((id) - DESCS_COUNT(desc_ring))

/*
 * A data block: mapped directly to the beginning of the data block area
 * specified as a logical position within the data ring.
 *
 * @id:   the ID of the associated descriptor
 * @data: the writer data
 *
 * Note that the size of a data block is only known by its associated
 * descriptor. The struct itself only accounts for @id; @data is a flexible
 * array member and contributes nothing to sizeof(struct prb_data_block).
 */
struct prb_data_block {
        unsigned long        id;        /* stored at the start of the block */
        char                data[];        /* writer data follows @id directly */
};

/*
 * Look up the descriptor slot that @n maps to. @n may be a descriptor ID or
 * a sequence number; in both cases only the index bits selected by
 * DESC_INDEX() are used to pick the slot.
 */
static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n)
{
        u64 slot = DESC_INDEX(desc_ring, n);

        return desc_ring->descs + slot;
}

/*
 * Look up the printk_info slot that @n maps to. @n may be a descriptor ID
 * or a sequence number. The info array is indexed exactly like the
 * descriptor array, so an info and its descriptor share the same index.
 */
static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n)
{
        u64 slot = DESC_INDEX(desc_ring, n);

        return desc_ring->infos + slot;
}

/*
 * Return the data block located at logical position @begin_lpos. The
 * logical position is reduced to an index into the data array and the
 * bytes at that index are interpreted as a struct prb_data_block.
 */
static struct prb_data_block *to_block(struct prb_data_ring *data_ring,
                                       unsigned long begin_lpos)
{
        void *blk_start = &data_ring->data[DATA_INDEX(data_ring, begin_lpos)];

        return blk_start;
}

/*
 * Convert a writer data size into the size of the data block needed to
 * hold it: the block meta data (the ID) is added and the total is rounded
 * up to a multiple of the ID size so that the adjacent data block starts
 * on an ID-aligned position.
 */
static unsigned int to_blk_size(unsigned int size)
{
        struct prb_data_block *blk = NULL;
        unsigned int blk_size = size;

        /* @blk is never dereferenced; it only provides types for sizeof(). */
        blk_size += sizeof(*blk);

        return ALIGN(blk_size, sizeof(blk->id));
}

/*
 * Sanity check for a reserve size. The rest of the ringbuffer code assumes
 * that no data block is larger than what could ever fit in the data array;
 * this check is what makes that assumption safe.
 *
 * Returns true if @size is acceptable, otherwise false.
 */
static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size)
{
        struct prb_data_block *blk = NULL;

        /* A size of zero is always accepted. */
        if (!size)
                return true;

        /*
         * The padded block must fit in the data array while still leaving
         * room for at least the ID of the next block.
         */
        return to_blk_size(size) <= DATA_SIZE(data_ring) - sizeof(blk->id);
}

/*
 * Query the state encoded in @state_val on behalf of descriptor @id. If
 * @state_val carries a different ID than the one asked for, the pseudo
 * state desc_miss is reported instead of the encoded state.
 */
static enum desc_state get_desc_state(unsigned long id,
                                      unsigned long state_val)
{
        if (DESC_ID(state_val) == id)
                return DESC_STATE(state_val);

        return desc_miss;
}

/*
 * Get a copy of a specified descriptor and return its queried state. If the
 * descriptor is in an inconsistent state (miss or reserved), the caller can
 * only expect the descriptor's @state_var field to be valid.
 *
 * The sequence number and caller_id can be optionally retrieved. Like all
 * non-state_var data, they are only valid if the descriptor is in a
 * consistent state.
 *
 * @desc_ring:     the descriptor ring (descriptor and info arrays) to read
 * @id:            the ID of the descriptor to query
 * @desc_out:      optional (may be NULL); receives a copy of @text_blk_lpos
 *                 and the last loaded @state_var value
 * @seq_out:       optional (may be NULL); receives the info's sequence number
 * @caller_id_out: optional (may be NULL); receives the info's caller_id
 *
 * The state is loaded twice: once before and once after copying the
 * content. The returned state is the one from the second load, unless the
 * first load already yielded miss or reserved, in which case nothing but
 * the state value is copied out.
 */
static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
                                 unsigned long id, struct prb_desc *desc_out,
                                 u64 *seq_out, u32 *caller_id_out)
{
        struct printk_info *info = to_info(desc_ring, id);
        struct prb_desc *desc = to_desc(desc_ring, id);
        atomic_long_t *state_var = &desc->state_var;
        enum desc_state d_state;
        unsigned long state_val;

        /* Check the descriptor state. */
        state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */
        d_state = get_desc_state(id, state_val);
        if (d_state == desc_miss || d_state == desc_reserved) {
                /*
                 * The descriptor is in an inconsistent state. Set at least
                 * @state_var so that the caller can see the details of
                 * the inconsistent state.
                 */
                goto out;
        }

        /*
         * Guarantee the state is loaded before copying the descriptor
         * content. This avoids copying obsolete descriptor content that might
         * not apply to the descriptor state. This pairs with _prb_commit:B.
         *
         * Memory barrier involvement:
         *
         * If desc_read:A reads from _prb_commit:B, then desc_read:C reads
         * from _prb_commit:A.
         *
         * Relies on:
         *
         * WMB from _prb_commit:A to _prb_commit:B
         *    matching
         * RMB from desc_read:A to desc_read:C
         */
        smp_rmb(); /* LMM(desc_read:B) */

        /*
         * Copy the descriptor data. The data is not valid until the
         * state has been re-checked. A memcpy() for all of @desc
         * cannot be used because of the atomic_long_t @state_var field.
         */
        if (desc_out) {
                memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
                       sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
        }
        if (seq_out)
                *seq_out = info->seq; /* also part of desc_read:C */
        if (caller_id_out)
                *caller_id_out = info->caller_id; /* also part of desc_read:C */

        /*
         * 1. Guarantee the descriptor content is loaded before re-checking
         *    the state. This avoids reading an obsolete descriptor state
         *    that may not apply to the copied content. This pairs with
         *    desc_reserve:F.
         *
         *    Memory barrier involvement:
         *
         *    If desc_read:C reads from desc_reserve:G, then desc_read:E
         *    reads from desc_reserve:F.
         *
         *    Relies on:
         *
         *    WMB from desc_reserve:F to desc_reserve:G
         *       matching
         *    RMB from desc_read:C to desc_read:E
         *
         * 2. Guarantee the record data is loaded before re-checking the
         *    state. This avoids reading an obsolete descriptor state that may
         *    not apply to the copied data. This pairs with data_alloc:A and
         *    data_realloc:A.
         *
         *    Memory barrier involvement:
         *
         *    If copy_data:A reads from data_alloc:B, then desc_read:E
         *    reads from desc_make_reusable:A.
         *
         *    Relies on:
         *
         *    MB from desc_make_reusable:A to data_alloc:B
         *       matching
         *    RMB from desc_read:C to desc_read:E
         *
         *    Note: desc_make_reusable:A and data_alloc:B can be different
         *          CPUs. However, the data_alloc:B CPU (which performs the
         *          full memory barrier) must have previously seen
         *          desc_make_reusable:A.
         */
        smp_rmb(); /* LMM(desc_read:D) */

        /*
         * The data has been copied. Return the current descriptor state,
         * which may have changed since the load above.
         */
        state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
        d_state = get_desc_state(id, state_val);
out:
        /*
         * Hand the most recently loaded state value to the caller. On the
         * early-exit path this is the only field of @desc_out that is set.
         */
        if (desc_out)
                atomic_long_set(&desc_out->state_var, state_val);
        return d_state;
}

/*
 * Take a specified descriptor out of the finalized state by attempting
 * the transition from finalized to reusable. Either this context or some
 * other context will have been successful.
 */
static void desc_make_reusable(struct prb_desc_ring *desc_ring,
                               unsigned long id)
{
        unsigned long val_finalized = DESC_SV(id, desc_finalized);
        unsigned long val_reusable = DESC_SV(id, desc_reusable);
        struct prb_desc *desc = to_desc(desc_ring, id);
        atomic_long_t *state_var = &desc->state_var;

        atomic_long_cmpxchg_relaxed(state_var, val_finalized,
                                    val_reusable); /* LMM(desc_make_reusable:A) */
}

/*
 * Given the text data ring, put the associated descriptor of each
 * data block from @lpos_begin until @lpos_end into the reusable state.
 *
 * If there is any problem making the associated descriptor reusable, either
 * the descriptor has not yet been finalized or another writer context has
 * already pushed the tail lpos past the problematic data block. Regardless,
 * on error the caller can re-load the tail lpos to determine the situation.
 */
static bool data_make_reusable(struct printk_ringbuffer *rb,
                               unsigned long lpos_begin,
                               unsigned long lpos_end,
                               unsigned long *lpos_out)
{

        struct prb_data_ring *data_ring = &rb->text_data_ring;
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        struct prb_data_block *blk;
        enum desc_state d_state;
        struct prb_desc desc;
        struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos;
        unsigned long id;

        /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */
        while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) {
                blk = to_block(data_ring, lpos_begin);

                /*
                 * Load the block ID from the data block. This is a data race
                 * against a writer that may have newly reserved this data
                 * area. If the loaded value matches a valid descriptor ID,
                 * the blk_lpos of that descriptor will be checked to make
                 * sure it points back to this data block. If the check fails,
                 * the data area has been recycled by another writer.
                 */
                id = blk->id; /* LMM(data_make_reusable:A) */

                d_state = desc_read(desc_ring, id, &desc,
                                    NULL, NULL); /* LMM(data_make_reusable:B) */

                switch (d_state) {
                case desc_miss:
                case desc_reserved:
                case desc_committed:
                        return false;
                case desc_finalized:
                        /*
                         * This data block is invalid if the descriptor
                         * does not point back to it.
                         */
                        if (blk_lpos->begin != lpos_begin)
                                return false;
                        desc_make_reusable(desc_ring, id);
                        break;
                case desc_reusable:
                        /*
                         * This data block is invalid if the descriptor
                         * does not point back to it.
                         */
                        if (blk_lpos->begin != lpos_begin)
                                return false;
                        break;
                }

                /* Advance @lpos_begin to the next data block. */
                lpos_begin = blk_lpos->next;
        }

        *lpos_out = lpos_begin;
        return true;
}

/*
 * Advance the data ring tail to at least @lpos. This function puts
 * descriptors into the reusable state if the tail is pushed beyond
 * their associated data block.
 *
 * @rb:   The ringbuffer whose text data ring tail is advanced.
 * @lpos: The logical position the tail must reach. A data-less @lpos
 *        is accepted and treated as "nothing to do".
 *
 * Return: true if @lpos is data-less, if the loop condition below is no
 *         longer satisfied, or if this CPU successfully stored a new tail.
 *         false if data_make_reusable() failed and the tail lpos was not
 *         moved by another CPU in the meantime.
 */
static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos)
{
        struct prb_data_ring *data_ring = &rb->text_data_ring;
        unsigned long tail_lpos_new;
        unsigned long tail_lpos;
        unsigned long next_lpos;

        /* If @lpos is from a data-less block, there is nothing to do. */
        if (LPOS_DATALESS(lpos))
                return true;

        /*
         * Any descriptor states that have transitioned to reusable due to the
         * data tail being pushed to this loaded value will be visible to this
         * CPU. This pairs with data_push_tail:D.
         *
         * Memory barrier involvement:
         *
         * If data_push_tail:A reads from data_push_tail:D, then this CPU can
         * see desc_make_reusable:A.
         *
         * Relies on:
         *
         * MB from desc_make_reusable:A to data_push_tail:D
         *    matches
         * READFROM from data_push_tail:D to data_push_tail:A
         *    thus
         * READFROM from desc_make_reusable:A to this CPU
         */
        tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */

        /*
         * Loop until the tail lpos is at or beyond @lpos. This condition
         * may already be satisfied, resulting in no full memory barrier
         * from data_push_tail:D being performed. However, since this CPU
         * sees the new tail lpos, any descriptor states that transitioned to
         * the reusable state must already be visible.
         *
         * The comparison uses unsigned wrap-around arithmetic: it is true
         * only while (@lpos - @tail_lpos) is in the range 1..DATA_SIZE().
         * When the tail equals @lpos, the "- 1" wraps to the maximum
         * unsigned long value and the loop is not entered.
         */
        while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) {
                /*
                 * Make all descriptors reusable that are associated with
                 * data blocks before @lpos.
                 */
                if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) {
                        /*
                         * 1. Guarantee the block ID loaded in
                         *    data_make_reusable() is performed before
                         *    reloading the tail lpos. The failed
                         *    data_make_reusable() may be due to a newly
                         *    recycled data area causing the tail lpos to
                         *    have been previously pushed. This pairs with
                         *    data_alloc:A and data_realloc:A.
                         *
                         *    Memory barrier involvement:
                         *
                         *    If data_make_reusable:A reads from data_alloc:B,
                         *    then data_push_tail:C reads from
                         *    data_push_tail:D.
                         *
                         *    Relies on:
                         *
                         *    MB from data_push_tail:D to data_alloc:B
                         *       matching
                         *    RMB from data_make_reusable:A to
                         *    data_push_tail:C
                         *
                         *    Note: data_push_tail:D and data_alloc:B can be
                         *          different CPUs. However, the data_alloc:B
                         *          CPU (which performs the full memory
                         *          barrier) must have previously seen
                         *          data_push_tail:D.
                         *
                         * 2. Guarantee the descriptor state loaded in
                         *    data_make_reusable() is performed before
                         *    reloading the tail lpos. The failed
                         *    data_make_reusable() may be due to a newly
                         *    recycled descriptor causing the tail lpos to
                         *    have been previously pushed. This pairs with
                         *    desc_reserve:D.
                         *
                         *    Memory barrier involvement:
                         *
                         *    If data_make_reusable:B reads from
                         *    desc_reserve:F, then data_push_tail:C reads
                         *    from data_push_tail:D.
                         *
                         *    Relies on:
                         *
                         *    MB from data_push_tail:D to desc_reserve:F
                         *       matching
                         *    RMB from data_make_reusable:B to
                         *    data_push_tail:C
                         *
                         *    Note: data_push_tail:D and desc_reserve:F can
                         *          be different CPUs. However, the
                         *          desc_reserve:F CPU (which performs the
                         *          full memory barrier) must have previously
                         *          seen data_push_tail:D.
                         */
                        smp_rmb(); /* LMM(data_push_tail:B) */

                        tail_lpos_new = atomic_long_read(&data_ring->tail_lpos
                                                        ); /* LMM(data_push_tail:C) */
                        /*
                         * An unchanged tail means the failure was not caused
                         * by a concurrent tail push; give up.
                         */
                        if (tail_lpos_new == tail_lpos)
                                return false;

                        /* Another CPU pushed the tail. Try again. */
                        tail_lpos = tail_lpos_new;
                        continue;
                }

                /*
                 * Guarantee any descriptor states that have transitioned to
                 * reusable are stored before pushing the tail lpos. A full
                 * memory barrier is needed since other CPUs may have made
                 * the descriptor states reusable. This pairs with
                 * data_push_tail:A.
                 *
                 * On failure, atomic_long_try_cmpxchg() stores the current
                 * tail value into @tail_lpos, so the loop condition is
                 * re-evaluated against the tail set by the other CPU.
                 */
                if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos,
                                            next_lpos)) { /* LMM(data_push_tail:D) */
                        break;
                }
        }

        return true;
}

/*
 * Advance the desc ring tail. This function advances the tail by one
 * descriptor, thus invalidating the oldest descriptor. Before advancing
 * the tail, the tail descriptor is made reusable and all data blocks up to
 * and including the descriptor's data block are invalidated (i.e. the data
 * ring tail is pushed past the data block of the descriptor being made
 * reusable).
 *
 * @rb:      The ringbuffer whose descriptor ring tail is advanced.
 * @tail_id: The descriptor ID the caller expects to be the current tail.
 *
 * Return: false if the tail descriptor is reserved or committed (or is
 *         treated as reserved, see desc_miss below), if the data ring tail
 *         could not be pushed, or if the following descriptor is not in an
 *         allowed tail state while the tail ID is still @tail_id.
 *         Otherwise true.
 */
static bool desc_push_tail(struct printk_ringbuffer *rb,
                           unsigned long tail_id)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        enum desc_state d_state;
        struct prb_desc desc;

        d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL);

        switch (d_state) {
        case desc_miss:
                /*
                 * If the ID is exactly 1 wrap behind the expected, it is
                 * in the process of being reserved by another writer and
                 * must be considered reserved.
                 *
                 * NOTE(review): this relies on desc_read() having filled
                 * @desc.state_var even for a desc_miss result - confirm
                 * against desc_read().
                 */
                if (DESC_ID(atomic_long_read(&desc.state_var)) ==
                    DESC_ID_PREV_WRAP(desc_ring, tail_id)) {
                        return false;
                }

                /*
                 * The ID has changed. Another writer must have pushed the
                 * tail and recycled the descriptor already. Success is
                 * returned because the caller is only interested in the
                 * specified tail being pushed, which it was.
                 */
                return true;
        case desc_reserved:
        case desc_committed:
                /* A writer still owns this descriptor; it cannot be recycled. */
                return false;
        case desc_finalized:
                desc_make_reusable(desc_ring, tail_id);
                break;
        case desc_reusable:
                /* Already reusable; only the data tail remains to be pushed. */
                break;
        }

        /*
         * Data blocks must be invalidated before their associated
         * descriptor can be made available for recycling. Invalidating
         * them later is not possible because there is no way to trust
         * data blocks once their associated descriptor is gone.
         */

        if (!data_push_tail(rb, desc.text_blk_lpos.next))
                return false;

        /*
         * Check the next descriptor after @tail_id before pushing the tail
         * to it because the tail must always be in a finalized or reusable
         * state. The implementation of prb_first_seq() relies on this.
         *
         * A successful read implies that the next descriptor is less than or
         * equal to @head_id so there is no risk of pushing the tail past the
         * head.
         *
         * Note that @desc is overwritten here; the copy of the tail
         * descriptor is not used after this point.
         */
        d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc,
                            NULL, NULL); /* LMM(desc_push_tail:A) */

        if (d_state == desc_finalized || d_state == desc_reusable) {
                /*
                 * Guarantee any descriptor states that have transitioned to
                 * reusable are stored before pushing the tail ID. This allows
                 * verifying the recycled descriptor state. A full memory
                 * barrier is needed since other CPUs may have made the
                 * descriptor states reusable. This pairs with desc_reserve:D.
                 *
                 * The cmpxchg result is intentionally not checked here.
                 * NOTE(review): presumably a failed cmpxchg means another
                 * CPU already moved the tail past @tail_id, which satisfies
                 * the caller as well - confirm.
                 */
                atomic_long_cmpxchg(&desc_ring->tail_id, tail_id,
                                    DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */
        } else {
                /*
                 * Guarantee the last state load from desc_read() is before
                 * reloading @tail_id in order to see a new tail ID in the
                 * case that the descriptor has been recycled. This pairs
                 * with desc_reserve:D.
                 *
                 * Memory barrier involvement:
                 *
                 * If desc_push_tail:A reads from desc_reserve:F, then
                 * desc_push_tail:D reads from desc_push_tail:B.
                 *
                 * Relies on:
                 *
                 * MB from desc_push_tail:B to desc_reserve:F
                 *    matching
                 * RMB from desc_push_tail:A to desc_push_tail:D
                 *
                 * Note: desc_push_tail:B and desc_reserve:F can be different
                 *       CPUs. However, the desc_reserve:F CPU (which performs
                 *       the full memory barrier) must have previously seen
                 *       desc_push_tail:B.
                 */
                smp_rmb(); /* LMM(desc_push_tail:C) */

                /*
                 * Re-check the tail ID. The descriptor following @tail_id is
                 * not in an allowed tail state. But if the tail has since
                 * been moved by another CPU, then it does not matter.
                 */
                if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */
                        return false;
        }

        return true;
}

/*
 * Reserve a new descriptor, invalidating the oldest if necessary.
 *
 * @rb:     The ringbuffer to reserve a descriptor in.
 * @id_out: Set to the ID of the newly reserved descriptor on success.
 *          Not written on failure.
 *
 * Return: true if a descriptor was reserved. false if the tail could not
 *         be pushed to make room, or if the verification of the recycled
 *         descriptor's previous state failed (a WARN_ON_ONCE() is triggered
 *         in the latter case).
 */
static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        unsigned long prev_state_val;
        unsigned long id_prev_wrap;
        struct prb_desc *desc;
        unsigned long head_id;
        unsigned long id;

        head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */

        /*
         * Each iteration computes the candidate ID from the current
         * @head_id. When the cmpxchg at the bottom fails, @head_id has been
         * updated with the current head ID by atomic_long_try_cmpxchg() and
         * the candidate is recomputed.
         */
        do {
                id = DESC_ID(head_id + 1);
                id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id);

                /*
                 * Guarantee the head ID is read before reading the tail ID.
                 * Since the tail ID is updated before the head ID, this
                 * guarantees that @id_prev_wrap is never ahead of the tail
                 * ID. This pairs with desc_reserve:D.
                 *
                 * Memory barrier involvement:
                 *
                 * If desc_reserve:A reads from desc_reserve:D, then
                 * desc_reserve:C reads from desc_push_tail:B.
                 *
                 * Relies on:
                 *
                 * MB from desc_push_tail:B to desc_reserve:D
                 *    matching
                 * RMB from desc_reserve:A to desc_reserve:C
                 *
                 * Note: desc_push_tail:B and desc_reserve:D can be different
                 *       CPUs. However, the desc_reserve:D CPU (which performs
                 *       the full memory barrier) must have previously seen
                 *       desc_push_tail:B.
                 */
                smp_rmb(); /* LMM(desc_reserve:B) */

                if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id
                                                    )) { /* LMM(desc_reserve:C) */
                        /*
                         * Make space for the new descriptor by
                         * advancing the tail.
                         */
                        if (!desc_push_tail(rb, id_prev_wrap))
                                return false;
                }

                /*
                 * 1. Guarantee the tail ID is read before validating the
                 *    recycled descriptor state. A read memory barrier is
                 *    sufficient for this. This pairs with desc_push_tail:B.
                 *
                 *    Memory barrier involvement:
                 *
                 *    If desc_reserve:C reads from desc_push_tail:B, then
                 *    desc_reserve:E reads from desc_make_reusable:A.
                 *
                 *    Relies on:
                 *
                 *    MB from desc_make_reusable:A to desc_push_tail:B
                 *       matching
                 *    RMB from desc_reserve:C to desc_reserve:E
                 *
                 *    Note: desc_make_reusable:A and desc_push_tail:B can be
                 *          different CPUs. However, the desc_push_tail:B CPU
                 *          (which performs the full memory barrier) must have
                 *          previously seen desc_make_reusable:A.
                 *
                 * 2. Guarantee the tail ID is stored before storing the head
                 *    ID. This pairs with desc_reserve:B.
                 *
                 * 3. Guarantee any data ring tail changes are stored before
                 *    recycling the descriptor. Data ring tail changes can
                 *    happen via desc_push_tail()->data_push_tail(). A full
                 *    memory barrier is needed since another CPU may have
                 *    pushed the data ring tails. This pairs with
                 *    data_push_tail:B.
                 *
                 * 4. Guarantee a new tail ID is stored before recycling the
                 *    descriptor. A full memory barrier is needed since
                 *    another CPU may have pushed the tail ID. This pairs
                 *    with desc_push_tail:C and this also pairs with
                 *    prb_first_seq:C.
                 *
                 * 5. Guarantee the head ID is stored before trying to
                 *    finalize the previous descriptor. This pairs with
                 *    _prb_commit:B.
                 */
        } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id,
                                          id)); /* LMM(desc_reserve:D) */

        desc = to_desc(desc_ring, id);

        /*
         * If the descriptor has been recycled, verify the old state val.
         * See "ABA Issues" about why this verification is performed.
         *
         * A state value of 0 skips the verification.
         * NOTE(review): presumably 0 marks a descriptor that has never
         * been used - confirm against the ringbuffer initializer.
         */
        prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */
        if (prev_state_val &&
            get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) {
                WARN_ON_ONCE(1);
                return false;
        }

        /*
         * Assign the descriptor a new ID and set its state to reserved.
         * See "ABA Issues" about why cmpxchg() instead of set() is used.
         *
         * Guarantee the new descriptor ID and state is stored before making
         * any other changes. A write memory barrier is sufficient for this.
         * This pairs with desc_read:D.
         */
        if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val,
                        DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */
                WARN_ON_ONCE(1);
                return false;
        }

        /* Now data in @desc can be modified: LMM(desc_reserve:G) */

        *id_out = id;
        return true;
}

/*
 * Compute the logical position that follows a data block of @size bytes
 * beginning at @lpos.
 *
 * If the block would cross into the next wrap of the data ring, its data
 * is placed at the start of that wrap, so the returned position is the
 * start of that wrap plus @size.
 */
static unsigned long get_next_lpos(struct prb_data_ring *data_ring,
                                   unsigned long lpos, unsigned int size)
{
        unsigned long end_lpos = lpos + size;

        if (DATA_WRAPS(data_ring, lpos) != DATA_WRAPS(data_ring, end_lpos)) {
                /* Wrapping data blocks store their data at the beginning. */
                return (DATA_THIS_WRAP_START_LPOS(data_ring, end_lpos) + size);
        }

        /* Begin and end are within the same wrap: the block is contiguous. */
        return end_lpos;
}

/*
 * Allocate a new data block, invalidating the oldest data block(s)
 * if necessary. This function also associates the data block with
 * a specified descriptor.
 *
 * @rb:       The ringbuffer to allocate text data in.
 * @size:     The requested data size. It is converted with to_blk_size()
 *            before allocating.
 * @blk_lpos: Filled with the begin/next logical positions of the new block,
 *            or with a data-less marker (EMPTY_LINE_LPOS for @size 0,
 *            FAILED_LPOS if the tail could not be pushed).
 * @id:       The descriptor ID stored in the data block.
 *
 * Return: a pointer to the data area of the new block, or NULL if no data
 *         block was created (@size is 0 or the allocation failed).
 */
static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size,
                        struct prb_data_blk_lpos *blk_lpos, unsigned long id)
{
        struct prb_data_ring *data_ring = &rb->text_data_ring;
        struct prb_data_block *blk;
        unsigned long begin_lpos;
        unsigned long next_lpos;

        if (size == 0) {
                /*
                 * Data blocks are not created for empty lines. Instead, the
                 * reader will recognize these special lpos values and handle
                 * it appropriately.
                 */
                blk_lpos->begin = EMPTY_LINE_LPOS;
                blk_lpos->next = EMPTY_LINE_LPOS;
                return NULL;
        }

        /*
         * NOTE(review): to_blk_size() presumably adds the space for the
         * block ID and alignment padding - confirm against its definition.
         */
        size = to_blk_size(size);

        begin_lpos = atomic_long_read(&data_ring->head_lpos);

        /*
         * Retry until this CPU moves the head from @begin_lpos to
         * @next_lpos. On a failed cmpxchg, @begin_lpos is updated with the
         * current head and @next_lpos is recomputed.
         */
        do {
                next_lpos = get_next_lpos(data_ring, begin_lpos, size);

                /*
                 * The tail must be pushed far enough that the span from
                 * the tail to @next_lpos is no larger than the data ring.
                 */
                if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) {
                        /* Failed to allocate, specify a data-less block. */
                        blk_lpos->begin = FAILED_LPOS;
                        blk_lpos->next = FAILED_LPOS;
                        return NULL;
                }

                /*
                 * 1. Guarantee any descriptor states that have transitioned
                 *    to reusable are stored before modifying the newly
                 *    allocated data area. A full memory barrier is needed
                 *    since other CPUs may have made the descriptor states
                 *    reusable. See data_push_tail:A about why the reusable
                 *    states are visible. This pairs with desc_read:D.
                 *
                 * 2. Guarantee any updated tail lpos is stored before
                 *    modifying the newly allocated data area. Another CPU may
                 *    be in data_make_reusable() and is reading a block ID
                 *    from this area. data_make_reusable() can handle reading
                 *    a garbage block ID value, but then it must be able to
                 *    load a new tail lpos. A full memory barrier is needed
                 *    since other CPUs may have updated the tail lpos. This
                 *    pairs with data_push_tail:B.
                 */
        } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos,
                                          next_lpos)); /* LMM(data_alloc:A) */

        /* The ID is always stored at the block's begin position. */
        blk = to_block(data_ring, begin_lpos);
        blk->id = id; /* LMM(data_alloc:B) */

        if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) {
                /* Wrapping data blocks store their data at the beginning. */
                blk = to_block(data_ring, 0);

                /*
                 * Store the ID on the wrapped block for consistency.
                 * The printk_ringbuffer does not actually use it.
                 */
                blk->id = id;
        }

        blk_lpos->begin = begin_lpos;
        blk_lpos->next = next_lpos;

        return &blk->data[0];
}

/*
 * Try to resize an existing data block associated with the descriptor
 * specified by @id. If the resized data block should become wrapped, it
 * copies the old data to the new data block. If @size yields a data block
 * with the same or less size, the data block is left as is.
 *
 * Fail if this is not the last allocated data block or if there is not
 * enough space or it is not possible to make enough space.
 *
 * @rb:       The ringbuffer containing the data block.
 * @size:     The requested new data size. It is converted with
 *            to_blk_size() before use.
 * @blk_lpos: The logical positions of the existing data block. Only
 *            @blk_lpos->next is updated, and only if the block was grown.
 * @id:       The descriptor ID, stored on a newly wrapped block.
 *
 * Return a pointer to the beginning of the entire data buffer or NULL on
 * failure.
 */
static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size,
                          struct prb_data_blk_lpos *blk_lpos, unsigned long id)
{
        struct prb_data_ring *data_ring = &rb->text_data_ring;
        struct prb_data_block *blk;
        unsigned long head_lpos;
        unsigned long next_lpos;
        bool wrapped;

        /* Reallocation only works if @blk_lpos is the newest data block. */
        head_lpos = atomic_long_read(&data_ring->head_lpos);
        if (head_lpos != blk_lpos->next)
                return NULL;

        /* Keep track if @blk_lpos was a wrapping data block. */
        wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next));

        size = to_blk_size(size);

        next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);

        /*
         * If the data block does not increase, there is nothing to do.
         *
         * The unsigned subtraction yields a value below the data ring size
         * only when @next_lpos is at or (less than one ring size) before
         * the current head.
         */
        if (head_lpos - next_lpos < DATA_SIZE(data_ring)) {
                if (wrapped)
                        blk = to_block(data_ring, 0);
                else
                        blk = to_block(data_ring, blk_lpos->begin);
                return &blk->data[0];
        }

        if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring)))
                return NULL;

        /*
         * The memory barrier involvement is the same as data_alloc:A.
         *
         * Unlike data_alloc(), there is no retry: if the head moved, this
         * block is no longer the newest one and cannot be extended.
         */
        if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos,
                                     next_lpos)) { /* LMM(data_realloc:A) */
                return NULL;
        }

        blk = to_block(data_ring, blk_lpos->begin);

        if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) {
                struct prb_data_block *old_blk = blk;

                /* Wrapping data blocks store their data at the beginning. */
                blk = to_block(data_ring, 0);

                /*
                 * Store the ID on the wrapped block for consistency.
                 * The printk_ringbuffer does not actually use it.
                 */
                blk->id = id;

                if (!wrapped) {
                        /*
                         * Since the allocated space is now in the newly
                         * created wrapping data block, copy the content
                         * from the old data block.
                         *
                         * @blk_lpos->next still holds the old end position
                         * here, so the length is the old block size minus
                         * the space taken by the block ID.
                         */
                        memcpy(&blk->data[0], &old_blk->data[0],
                               (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id));
                }
        }

        blk_lpos->next = next_lpos;

        return &blk->data[0];
}

/*
 * Report how many bytes of the data ring the block described by @blk_lpos
 * occupies. Data-less blocks occupy nothing. For a block that wraps, the
 * unused space at the end of the previous wrap is counted as well.
 */
static unsigned int space_used(struct prb_data_ring *data_ring,
                               struct prb_data_blk_lpos *blk_lpos)
{
        unsigned long begin_idx;
        unsigned long next_idx;

        /* Data-less blocks take no space. */
        if (BLK_DATALESS(blk_lpos))
                return 0;

        begin_idx = DATA_INDEX(data_ring, blk_lpos->begin);
        next_idx = DATA_INDEX(data_ring, blk_lpos->next);

        if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)) {
                /*
                 * Wrapping data block: the trailing (wasted) space of the
                 * previous wrap is included in the result.
                 */
                return (next_idx + DATA_SIZE(data_ring) - begin_idx);
        }

        /* Data block does not wrap. */
        return (next_idx - begin_idx);
}

/*
 * Translate @blk_lpos into a pointer to the writer data of the data block
 * and store the size of that data in @data_size. NULL is returned if the
 * block is data-less (other than an empty line) or if @blk_lpos holds
 * values that could never be legal.
 *
 * Readers use this function. The lpos values are validated strictly in
 * order to catch bugs in the writer code: a WARN_ON_ONCE() fires whenever
 * an internal inconsistency is found.
 */
static const char *get_data(struct prb_data_ring *data_ring,
                            struct prb_data_blk_lpos *blk_lpos,
                            unsigned int *data_size)
{
        unsigned long begin = blk_lpos->begin;
        unsigned long next = blk_lpos->next;
        struct prb_data_block *db;
        unsigned int blk_size;

        if (BLK_DATALESS(blk_lpos)) {
                /* Data lost, invalid, or otherwise unavailable. */
                if (begin != EMPTY_LINE_LPOS || next != EMPTY_LINE_LPOS)
                        return NULL;

                /*
                 * An empty line is a valid record without a data block.
                 * Report success by handing out an empty string.
                 */
                *data_size = 0;
                return "";
        }

        if (DATA_WRAPS(data_ring, begin) == DATA_WRAPS(data_ring, next) &&
            begin < next) {
                /* Regular data block: @begin less than @next and in same wrap. */
                db = to_block(data_ring, begin);
                blk_size = next - begin;
        } else if (DATA_WRAPS(data_ring, begin + DATA_SIZE(data_ring)) ==
                   DATA_WRAPS(data_ring, next)) {
                /* Wrapping data block: @begin is one wrap behind @next. */
                db = to_block(data_ring, 0);
                blk_size = DATA_INDEX(data_ring, next);
        } else {
                /* Illegal block description. */
                WARN_ON_ONCE(1);
                return NULL;
        }

        /* The full block size is reported even if a check below fails. */
        *data_size = blk_size;

        /* A valid data block will always be aligned to the ID size. */
        if (WARN_ON_ONCE(begin != ALIGN(begin, sizeof(db->id))))
                return NULL;
        if (WARN_ON_ONCE(next != ALIGN(next, sizeof(db->id))))
                return NULL;

        /* A valid data block will always have at least an ID. */
        if (WARN_ON_ONCE(blk_size < sizeof(db->id)))
                return NULL;

        /* Subtract block ID space from size to reflect data size. */
        *data_size = blk_size - sizeof(db->id);

        return &db->data[0];
}

/*
 * Attempt to transition the newest descriptor from committed back to reserved
 * so that the record can be modified by a writer again. This is only possible
 * if the descriptor is not yet finalized and the provided @caller_id matches.
 *
 * @desc_ring: The descriptor ring containing the newest descriptor.
 * @caller_id: The caller ID that the newest record must have.
 * @id_out:    Set to the ID of the reopened descriptor on success. Not
 *             written on failure.
 *
 * Return: a pointer to the reopened descriptor, or NULL if the newest
 *         descriptor is not in the committed state, has a different caller
 *         ID, or changed state before it could be reopened.
 */
static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring,
                                         u32 caller_id, unsigned long *id_out)
{
        unsigned long prev_state_val;
        enum desc_state d_state;
        struct prb_desc desc;
        struct prb_desc *d;
        unsigned long id;
        u32 cid;

        /* The newest descriptor is the one at the head of the ring. */
        id = atomic_long_read(&desc_ring->head_id);

        /*
         * To reduce unnecessarily reopening, first check if the descriptor
         * state and caller ID are correct.
         */
        d_state = desc_read(desc_ring, id, &desc, NULL, &cid);
        if (d_state != desc_committed || cid != caller_id)
                return NULL;

        d = to_desc(desc_ring, id);

        /* The exact state value the descriptor must still have to be reopened. */
        prev_state_val = DESC_SV(id, desc_committed);

        /*
         * Guarantee the reserved state is stored before reading any
         * record data. A full memory barrier is needed because @state_var
         * modification is followed by reading. This pairs with _prb_commit:B.
         *
         * Memory barrier involvement:
         *
         * If desc_reopen_last:A reads from _prb_commit:B, then
         * prb_reserve_in_last:A reads from _prb_commit:A.
         *
         * Relies on:
         *
         * WMB from _prb_commit:A to _prb_commit:B
         *    matching
         * MB from desc_reopen_last:A to prb_reserve_in_last:A
         */
        if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
                        DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */
                return NULL;
        }

        *id_out = id;
        return d;
}

/**
 * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer
 *                         used by the newest record.
 *
 * @e:         The entry structure to set up.
 * @rb:        The ringbuffer to re-reserve and extend data in.
 * @r:         The record structure to allocate buffers for.
 * @caller_id: The caller ID of the caller (reserving writer).
 * @max_size:  Fail if the extended size would be greater than this.
 *
 * This is the public function available to writers to re-reserve and extend
 * data.
 *
 * The writer specifies the text size to extend (not the new total size) by
 * setting the @text_buf_size field of @r. To ensure proper initialization
 * of @r, prb_rec_init_wr() should be used.
 *
 * This function will fail if @caller_id does not match the caller ID of the
 * newest record. In that case the caller must reserve new data using
 * prb_reserve().
 *
 * Context: Any context. Disables local interrupts on success.
 * Return: true if text data could be extended, otherwise false.
 *
 * On success:
 *
 *   - @r->text_buf points to the beginning of the entire text buffer.
 *
 *   - @r->text_buf_size is set to the new total size of the buffer.
 *
 *   - @r->info is not touched so that @r->info->text_len could be used
 *     to append the text.
 *
 *   - prb_record_text_space() can be used on @e to query the new
 *     actually used space.
 *
 * Important: All @r->info fields will already be set with the current values
 *            for the record. I.e. @r->info->text_len will be less than
 *            @text_buf_size. Writers can use @r->info->text_len to know
 *            where concatenation begins and writers should update
 *            @r->info->text_len after concatenating.
 */
bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
                         struct printk_record *r, u32 caller_id, unsigned int max_size)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        struct printk_info *info;
        unsigned int data_size;
        struct prb_desc *d;
        unsigned long id;

        /*
         * On success, interrupts stay disabled until the writer commits.
         * Every failure path below restores them before returning.
         */
        local_irq_save(e->irqflags);

        /* Transition the newest descriptor back to the reserved state. */
        d = desc_reopen_last(desc_ring, caller_id, &id);
        if (!d) {
                /*
                 * Nothing was reopened, so there is nothing to commit.
                 * Restore interrupts directly and bypass prb_commit().
                 */
                local_irq_restore(e->irqflags);
                goto fail_reopen;
        }

        /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */

        info = to_info(desc_ring, id);

        /*
         * Set the @e fields here so that prb_commit() can be used if
         * anything fails from now on.
         */
        e->rb = rb;
        e->id = id;

        /*
         * desc_reopen_last() checked the caller_id, but there was no
         * exclusive access at that point. The descriptor may have
         * changed since then.
         */
        if (caller_id != info->caller_id)
                goto fail;

        if (BLK_DATALESS(&d->text_blk_lpos)) {
                /*
                 * There is no existing text data block to extend, so the
                 * recorded text length is expected to be 0 and a new block
                 * of the requested size is allocated.
                 */
                if (WARN_ON_ONCE(info->text_len != 0)) {
                        pr_warn_once("wrong text_len value (%hu, expecting 0)\n",
                                     info->text_len);
                        info->text_len = 0;
                }

                if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
                        goto fail;

                /* Enforce the caller-provided limit on the total size. */
                if (r->text_buf_size > max_size)
                        goto fail;

                r->text_buf = data_alloc(rb, r->text_buf_size,
                                         &d->text_blk_lpos, id);
        } else {
                /* The size of the existing block is needed to sanity check @text_len. */
                if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size))
                        goto fail;

                /*
                 * Increase the buffer size to include the original size. If
                 * the meta data (@text_len) is not sane, use the full data
                 * block size.
                 */
                if (WARN_ON_ONCE(info->text_len > data_size)) {
                        pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n",
                                     info->text_len, data_size);
                        info->text_len = data_size;
                }
                r->text_buf_size += info->text_len;

                if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
                        goto fail;

                /* Enforce the caller-provided limit on the new total size. */
                if (r->text_buf_size > max_size)
                        goto fail;

                r->text_buf = data_realloc(rb, r->text_buf_size,
                                           &d->text_blk_lpos, id);
        }
        /* A NULL buffer is only a failure if a non-zero size was requested. */
        if (r->text_buf_size && !r->text_buf)
                goto fail;

        r->info = info;

        /* Record the full text space now used by the (extended) record. */
        e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);

        return true;
fail:
        /* The descriptor was reopened, so it must be committed again. */
        prb_commit(e);
        /* prb_commit() re-enabled interrupts. */
fail_reopen:
        /* Make it clear to the caller that the re-reserve failed. */
        memset(r, 0, sizeof(*r));
        return false;
}

/*
 * @last_finalized_seq value guarantees that all records up to and including
 * this sequence number are finalized and can be read. The only exceptions
 * are records that are too old and have already been overwritten.
 *
 * It is also guaranteed that @last_finalized_seq only increases.
 *
 * Be aware that finalized records following non-finalized records are not
 * reported because they are not yet available to the reader. For example,
 * a new record stored via printk() will not be available to a printer if
 * it follows a record that has not been finalized yet. However, once that
 * non-finalized record becomes finalized, @last_finalized_seq will be
 * appropriately updated and the full set of finalized records will be
 * available to the printer. And since each printk() caller will either
 * directly print or trigger deferred printing of all available unprinted
 * records, all printk() messages will get printed.
 */
static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb)
{
        struct prb_desc_ring *ring = &rb->desc_ring;
        unsigned long raw_seq;

        /*
         * The acquire load orders the read of the sequence number before
         * any later load of the associated record, which guarantees that
         * the record can be seen by this CPU. This pairs with
         * desc_update_last_finalized:A.
         */
        raw_seq = atomic_long_read_acquire(&ring->last_finalized_seq); /* LMM(desc_last_finalized_seq:A) */

        /* Hand the value back as the u64 sequence number callers expect. */
        return __ulseq_to_u64seq(rb, raw_seq);
}

/*
 * Forward declaration: the definition is further below, but
 * desc_update_last_finalized() already needs to call it.
 */
static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
                            struct printk_record *r, unsigned int *line_count);

/*
 * Check if there are records directly following @last_finalized_seq that are
 * finalized. If so, update @last_finalized_seq to the latest of these
 * records. It is not allowed to skip over records that are not yet finalized.
 */
static void desc_update_last_finalized(struct printk_ringbuffer *rb)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        u64 old_seq = desc_last_finalized_seq(rb);
        unsigned long oldval;
        unsigned long newval;
        u64 finalized_seq;
        u64 try_seq;

        /*
         * Restart point: reached initially and again whenever the cmpxchg
         * below loses a race. @old_seq always holds the most recently
         * observed @last_finalized_seq.
         */
try_again:
        finalized_seq = old_seq;
        try_seq = finalized_seq + 1;

        /*
         * Try to find later finalized records.
         *
         * _prb_read_valid() may move @try_seq forward on its own (for
         * example to skip records with lost data), so the value it settled
         * on is taken as the finalized sequence number.
         */
        while (_prb_read_valid(rb, &try_seq, NULL, NULL)) {
                finalized_seq = try_seq;
                try_seq++;
        }

        /* No update needed if no later finalized record was found. */
        if (finalized_seq == old_seq)
                return;

        /* @last_finalized_seq is stored as unsigned long, not u64. */
        oldval = __u64seq_to_ulseq(old_seq);
        newval = __u64seq_to_ulseq(finalized_seq);

        /*
         * Set the sequence number of a later finalized record that has been
         * seen.
         *
         * Guarantee the record data is visible to other CPUs before storing
         * its sequence number. This pairs with desc_last_finalized_seq:A.
         *
         * Memory barrier involvement:
         *
         * If desc_last_finalized_seq:A reads from
         * desc_update_last_finalized:A, then desc_read:A reads from
         * _prb_commit:B.
         *
         * Relies on:
         *
         * RELEASE from _prb_commit:B to desc_update_last_finalized:A
         *    matching
         * ACQUIRE from desc_last_finalized_seq:A to desc_read:A
         *
         * Note: _prb_commit:B and desc_update_last_finalized:A can be
         *       different CPUs. However, the desc_update_last_finalized:A
         *       CPU (which performs the release) must have previously seen
         *       _prb_commit:B.
         */
        if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq,
                                &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */
                /*
                 * Another context changed @last_finalized_seq in the
                 * meantime. On failure, @oldval holds the value that was
                 * found there; search again starting from it.
                 */
                old_seq = __ulseq_to_u64seq(rb, oldval);
                goto try_again;
        }
}

/*
 * Attempt to finalize a specified descriptor. If this fails, the descriptor
 * is either already final or it will finalize itself when the writer commits.
 */
static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        unsigned long prev_state_val = DESC_SV(id, desc_committed);
        struct prb_desc *d = to_desc(desc_ring, id);

        if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val,
                        DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */
                desc_update_last_finalized(rb);
        }
}

/**
 * prb_reserve() - Reserve space in the ringbuffer.
 *
 * @e:  The entry structure to setup.
 * @rb: The ringbuffer to reserve data in.
 * @r:  The record structure to allocate buffers for.
 *
 * This is the public function available to writers to reserve data.
 *
 * The writer specifies the text size to reserve by setting the
 * @text_buf_size field of @r. To ensure proper initialization of @r,
 * prb_rec_init_wr() should be used.
 *
 * Context: Any context. Disables local interrupts on success.
 * Return: true if at least text data could be allocated, otherwise false.
 *
 * On success, the fields @info and @text_buf of @r will be set by this
 * function and should be filled in by the writer before committing. Also
 * on success, prb_record_text_space() can be used on @e to query the actual
 * space used for the text data block.
 *
 * Important: @info->text_len needs to be set correctly by the writer in
 *            order for data to be readable and/or extended. Its value
 *            is initialized to 0.
 */
bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
                 struct printk_record *r)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        struct printk_info *info;
        struct prb_desc *d;
        unsigned long id;
        u64 seq;

        /*
         * Reject unsupported sizes up front. Interrupts are not yet
         * disabled here, so the plain fail path is sufficient.
         */
        if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
                goto fail;

        /*
         * Descriptors in the reserved state act as blockers to all further
         * reservations once the desc_ring has fully wrapped. Disable
         * interrupts during the reserve/commit window in order to minimize
         * the likelihood of this happening.
         */
        local_irq_save(e->irqflags);

        if (!desc_reserve(rb, &id)) {
                /* Descriptor reservation failures are tracked. */
                atomic_long_inc(&rb->fail);
                /* No descriptor is held, so there is nothing to commit. */
                local_irq_restore(e->irqflags);
                goto fail;
        }

        d = to_desc(desc_ring, id);
        info = to_info(desc_ring, id);

        /*
         * All @info fields (except @seq) are cleared and must be filled in
         * by the writer. Save @seq before clearing because it is used to
         * determine the new sequence number.
         */
        seq = info->seq;
        memset(info, 0, sizeof(*info));

        /*
         * Set the @e fields here so that prb_commit() can be used if
         * text data allocation fails.
         */
        e->rb = rb;
        e->id = id;

        /*
         * Initialize the sequence number if it has "never been set".
         * Otherwise just increment it by a full wrap.
         *
         * @seq is considered "never been set" if it has a value of 0,
         * _except_ for @infos[0], which was specially setup by the ringbuffer
         * initializer and therefore is always considered as set.
         *
         * See the "Bootstrap" comment block in printk_ringbuffer.h for
         * details about how the initializer bootstraps the descriptors.
         */
        if (seq == 0 && DESC_INDEX(desc_ring, id) != 0)
                info->seq = DESC_INDEX(desc_ring, id);
        else
                info->seq = seq + DESCS_COUNT(desc_ring);

        /*
         * New data is about to be reserved. Once that happens, previous
         * descriptors are no longer able to be extended. Finalize the
         * previous descriptor now so that it can be made available to
         * readers. (For seq==0 there is no previous descriptor.)
         */
        if (info->seq > 0)
                desc_make_final(rb, DESC_ID(id - 1));

        r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id);
        /* If text data allocation fails, a data-less record is committed. */
        if (r->text_buf_size && !r->text_buf) {
                prb_commit(e);
                /* prb_commit() re-enabled interrupts. */
                goto fail;
        }

        r->info = info;

        /* Record full text space used by record. */
        e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);

        /* Success: interrupts remain disabled until the writer commits. */
        return true;
fail:
        /* Make it clear to the caller that the reserve failed. */
        memset(r, 0, sizeof(*r));
        return false;
}

/*
 * Commit the data (possibly finalizing it) and restore interrupts.
 *
 * @e:         The entry that was set up when the data was reserved.
 * @state_val: The descriptor state to transition to. The callers in this
 *             file pass desc_committed or desc_finalized.
 */
static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val)
{
        struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
        struct prb_desc *d = to_desc(desc_ring, e->id);
        unsigned long prev_state_val = DESC_SV(e->id, desc_reserved);

        /* Now the writer has finished all writing: LMM(_prb_commit:A) */

        /*
         * Set the descriptor as committed. See "ABA Issues" about why
         * cmpxchg() instead of set() is used.
         *
         * 1. Guarantee all record data is stored before the descriptor state
         *    is stored as committed. A write memory barrier is sufficient
         *    for this. This pairs with desc_read:B and desc_reopen_last:A.
         *
         * 2. Guarantee the descriptor state is stored as committed before
         *    re-checking the head ID in order to possibly finalize this
         *    descriptor. This pairs with desc_reserve:D.
         *
         *    Memory barrier involvement:
         *
         *    If prb_commit:A reads from desc_reserve:D, then
         *    desc_make_final:A reads from _prb_commit:B.
         *
         *    Relies on:
         *
         *    MB _prb_commit:B to prb_commit:A
         *       matching
         *    MB desc_reserve:D to desc_make_final:A
         */
        if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
                        DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */
                /*
                 * The descriptor was not in the reserved state with the
                 * expected ID. This is not expected to happen.
                 */
                WARN_ON_ONCE(1);
        }

        /* Restore interrupts, the reserve/commit window is finished. */
        local_irq_restore(e->irqflags);
}

/**
 * prb_commit() - Commit (previously reserved) data to the ringbuffer.
 *
 * @e: The entry containing the reserved data information.
 *
 * This is the public function available to writers to commit data.
 *
 * Note that the data is not yet available to readers until it is finalized.
 * Finalizing happens automatically when space for the next record is
 * reserved.
 *
 * See prb_final_commit() for a version of this function that finalizes
 * immediately.
 *
 * Context: Any context. Enables local interrupts.
 */
void prb_commit(struct prb_reserved_entry *e)
{
        struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
        unsigned long head_id;

        /*
         * Commit without finalizing. This also restores interrupts. The
         * state change must happen before @head_id is loaded below (see
         * the memory barrier notes in _prb_commit()).
         */
        _prb_commit(e, desc_committed);

        /*
         * If this descriptor is no longer the head (i.e. a new record has
         * been allocated), extending the data for this record is no longer
         * allowed and therefore it must be finalized.
         */
        head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */
        if (head_id != e->id)
                desc_make_final(e->rb, e->id);
}

/**
 * prb_final_commit() - Commit and finalize (previously reserved) data to
 *                      the ringbuffer.
 *
 * @e: The entry containing the reserved data information.
 *
 * This is the public function available to writers to commit+finalize data.
 *
 * By finalizing, the data is made immediately available to readers.
 *
 * This function should only be used if there are no intentions of extending
 * this data using prb_reserve_in_last().
 *
 * Context: Any context. Enables local interrupts.
 */
void prb_final_commit(struct prb_reserved_entry *e)
{
        struct printk_ringbuffer *rb = e->rb;

        /* Go directly to the finalized state; this restores interrupts. */
        _prb_commit(e, desc_finalized);

        /* A record was just finalized: let @last_finalized_seq catch up. */
        desc_update_last_finalized(rb);
}

/*
 * Return the number of lines in the first @text_size bytes of @text. The
 * count starts at 1 (even for @text_size == 0) and every '\n' found adds
 * one more line.
 */
static unsigned int count_lines(const char *text, unsigned int text_size)
{
        unsigned int remaining = text_size;
        const char *cursor = text;
        unsigned int lines = 1;

        while (remaining) {
                const char *newline = memchr(cursor, '\n', remaining);

                if (!newline)
                        break;

                /* Count the newline and resume the search right behind it. */
                lines++;
                remaining -= (newline + 1) - cursor;
                cursor = newline + 1;
        }

        return lines;
}

/*
 * Copy the data described by @blk_lpos, which is expected to be @len bytes
 * long, into @buf (at most @buf_size bytes). If @line_count is provided,
 * also store the number of lines in the data.
 *
 * This function is used by readers. It performs strict validation on the
 * data size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
 * triggered if an internal error is detected.
 *
 * Returns true on success (also if the caller requested nothing), false if
 * the data could not be retrieved or is smaller than @len.
 */
static bool copy_data(struct prb_data_ring *data_ring,
                      struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf,
                      unsigned int buf_size, unsigned int *line_count)
{
        const bool want_text = buf && buf_size;
        unsigned int blk_size;
        const char *src;

        /* Neither text nor a line count requested: nothing to do. */
        if (!want_text && !line_count)
                return true;

        /*
         * The actual size cannot be less than the expected size. It can be
         * more because of the trailing alignment padding.
         *
         * Note that invalid @len values can occur because the caller loads
         * the value during an allowed data race.
         */
        src = get_data(data_ring, blk_lpos, &blk_size);
        if (!src || blk_size < (unsigned int)len)
                return false;

        if (line_count)
                *line_count = count_lines(src, len);

        /* Copy no more than what fits into the caller's buffer. */
        if (want_text)
                memcpy(buf, src, min_t(unsigned int, buf_size, len)); /* LMM(copy_data:A) */

        return true;
}

/*
 * Extended variant of desc_read(): besides copying the specified descriptor
 * to @desc_out, verify that the record is finalized and carries the
 * sequence number @seq. Returns 0 on success.
 *
 * Error return values:
 * -EINVAL: A finalized record with sequence number @seq does not exist.
 * -ENOENT: A finalized record with sequence number @seq exists, but its data
 *          is not available. This is a valid record, so readers should
 *          continue with the next record.
 */
static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring,
                                   unsigned long id, u64 seq,
                                   struct prb_desc *desc_out)
{
        struct prb_data_blk_lpos *lpos = &desc_out->text_blk_lpos;
        enum desc_state state;
        u64 desc_seq;

        state = desc_read(desc_ring, id, desc_out, &desc_seq, NULL);

        switch (state) {
        case desc_miss:
                /* Unexpected @id: the record does not exist. */
        case desc_reserved:
        case desc_committed:
                /* Not finalized: the record does not yet exist for readers. */
                return -EINVAL;
        default:
                break;
        }

        /* A different sequence number also means the record does not exist. */
        if (desc_seq != seq)
                return -EINVAL;

        /*
         * A reusable descriptor may no longer have its data available:
         * report it as existing but with lost data.
         */
        if (state == desc_reusable)
                return -ENOENT;

        /* The record may actually be a record with lost data. */
        if (lpos->begin == FAILED_LPOS && lpos->next == FAILED_LPOS)
                return -ENOENT;

        return 0;
}

/*
 * Copy the ringbuffer data from the record with @seq to the provided
 * @r buffer. On success, 0 is returned.
 *
 * @r may be NULL, in which case only the availability of the record is
 * checked. If @line_count is not NULL, it is handed to copy_data() so that
 * the number of lines in the record text can be stored.
 *
 * See desc_read_finalized_seq() for error return values.
 */
static int prb_read(struct printk_ringbuffer *rb, u64 seq,
                    struct printk_record *r, unsigned int *line_count)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        /*
         * NOTE(review): to_info()/to_desc() are called with @seq rather than
         * a descriptor ID; presumably both select the same ring slot.
         * Confirm against their definitions.
         */
        struct printk_info *info = to_info(desc_ring, seq);
        struct prb_desc *rdesc = to_desc(desc_ring, seq);
        atomic_long_t *state_var = &rdesc->state_var;
        struct prb_desc desc;
        unsigned long id;
        int err;

        /* Extract the ID, used to specify the descriptor to read. */
        id = DESC_ID(atomic_long_read(state_var));

        /* Get a local copy of the correct descriptor (if available). */
        err = desc_read_finalized_seq(desc_ring, id, seq, &desc);

        /*
         * If @r is NULL, the caller is only interested in the availability
         * of the record.
         */
        if (err || !r)
                return err;

        /* If requested, copy meta data. */
        if (r->info)
                memcpy(r->info, info, sizeof(*(r->info)));

        /* Copy text data. If it fails, this is a data-less record. */
        if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len,
                       r->text_buf, r->text_buf_size, line_count)) {
                return -ENOENT;
        }

        /*
         * Ensure the record is still finalized and has the same @seq. The
         * copies above are only trusted if this second check also passes.
         */
        return desc_read_finalized_seq(desc_ring, id, seq, &desc);
}

/*
 * Get the sequence number of the tail descriptor.
 *
 * The tail ID is re-read until the descriptor it names is seen in the
 * finalized or reusable state; the sequence number read from that
 * descriptor is returned.
 */
u64 prb_first_seq(struct printk_ringbuffer *rb)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        enum desc_state d_state;
        struct prb_desc desc;
        unsigned long id;
        u64 seq;

        for (;;) {
                id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */

                d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */

                /*
                 * This loop will not be infinite because the tail is
                 * _always_ in the finalized or reusable state.
                 */
                if (d_state == desc_finalized || d_state == desc_reusable)
                        break;

                /*
                 * Any other state means the descriptor was recycled after
                 * @tail_id was loaded, so the loaded ID is stale.
                 *
                 * Guarantee the last state load from desc_read() is before
                 * reloading @tail_id in order to see a new tail in the case
                 * that the descriptor has been recycled. This pairs with
                 * desc_reserve:D.
                 *
                 * Memory barrier involvement:
                 *
                 * If prb_first_seq:B reads from desc_reserve:F, then
                 * prb_first_seq:A reads from desc_push_tail:B.
                 *
                 * Relies on:
                 *
                 * MB from desc_push_tail:B to desc_reserve:F
                 *    matching
                 * RMB prb_first_seq:B to prb_first_seq:A
                 */
                smp_rmb(); /* LMM(prb_first_seq:C) */
        }

        return seq;
}

/**
 * prb_next_reserve_seq() - Get the sequence number after the most recently
 *                  reserved record.
 *
 * @rb:  The ringbuffer to get the sequence number from.
 *
 * This is the public function available to readers to see what sequence
 * number will be assigned to the next reserved record.
 *
 * Note that depending on the situation, this value can be equal to or
 * higher than the sequence number returned by prb_next_seq().
 *
 * Context: Any context.
 * Return: The sequence number that will be assigned to the next record
 *         reserved.
 */
u64 prb_next_reserve_seq(struct printk_ringbuffer *rb)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        unsigned long last_finalized_id;
        atomic_long_t *state_var;
        u64 last_finalized_seq;
        unsigned long head_id;
        struct prb_desc desc;
        unsigned long diff;
        struct prb_desc *d;
        int err;

        /*
         * It may not be possible to read a sequence number for @head_id.
         * So the ID of @last_finalized_seq is used to calculate what the
         * sequence number of @head_id will be.
         */

try_again:
        last_finalized_seq = desc_last_finalized_seq(rb);

        /*
         * @head_id is loaded after @last_finalized_seq to ensure that
         * it points to the record with @last_finalized_seq or newer.
         *
         * Memory barrier involvement:
         *
         * If desc_last_finalized_seq:A reads from
         * desc_update_last_finalized:A, then
         * prb_next_reserve_seq:A reads from desc_reserve:D.
         *
         * Relies on:
         *
         * RELEASE from desc_reserve:D to desc_update_last_finalized:A
         *    matching
         * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A
         *
         * Note: desc_reserve:D and desc_update_last_finalized:A can be
         *       different CPUs. However, the desc_update_last_finalized:A CPU
         *       (which performs the release) must have previously seen
         *       desc_read:C, which implies desc_reserve:D can be seen.
         */
        head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */

        /* Locate the descriptor slot used by @last_finalized_seq. */
        d = to_desc(desc_ring, last_finalized_seq);
        state_var = &d->state_var;

        /* Extract the ID, used to specify the descriptor to read. */
        last_finalized_id = DESC_ID(atomic_long_read(state_var));

        /* Ensure @last_finalized_id is correct. */
        err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc);

        /*
         * Only -EINVAL needs handling. With -ENOENT the record exists (only
         * its data is unavailable), so @last_finalized_id is still usable.
         */
        if (err == -EINVAL) {
                if (last_finalized_seq == 0) {
                        /*
                         * No record has been finalized or even reserved yet.
                         *
                         * The @head_id is initialized such that the first
                         * increment will yield the first record (seq=0).
                         * Handle it separately to avoid a negative @diff
                         * below.
                         */
                        if (head_id == DESC0_ID(desc_ring->count_bits))
                                return 0;

                        /*
                         * One or more descriptors are already reserved. Use
                         * the descriptor ID of the first one (@seq=0) for
                         * the @diff below.
                         */
                        last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1;
                } else {
                        /* Record must have been overwritten. Try again. */
                        goto try_again;
                }
        }

        /* Diff of known descriptor IDs to compute related sequence numbers. */
        diff = head_id - last_finalized_id;

        /*
         * @head_id points to the most recently reserved record, but this
         * function returns the sequence number that will be assigned to the
         * next (not yet reserved) record. Thus +1 is needed.
         */
        return (last_finalized_seq + diff + 1);
}

/*
 * Non-blocking read of a record.
 *
 * On success @seq is updated to the record that was read and (if provided)
 * @r and @line_count will contain the read/calculated data.
 *
 * On failure @seq is updated to a record that is not yet available to the
 * reader, but it will be the next record available to the reader.
 *
 * Note: When the current CPU is in panic, this function will skip over any
 *       non-existent/non-finalized records in order to allow the panic CPU
 *       to print any and all records that have been finalized.
 */
static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
                            struct printk_record *r, unsigned int *line_count)
{
        u64 oldest_seq;
        int ret;

        for (;;) {
                ret = prb_read(rb, *seq, r, line_count);
                if (!ret)
                        return true;

                oldest_seq = prb_first_seq(rb);

                /*
                 * Behind the tail: catch up to the oldest record and try
                 * again. This can happen for -ENOENT and -EINVAL cases.
                 */
                if (*seq < oldest_seq) {
                        *seq = oldest_seq;
                        continue;
                }

                /* The record exists, but its data was lost: skip it. */
                if (ret == -ENOENT) {
                        (*seq)++;
                        continue;
                }

                /*
                 * Non-existent/non-finalized record. Normally the reader
                 * must stop here.
                 *
                 * In a panic it cannot be expected that non-finalized
                 * records will ever become finalized, yet there may be
                 * finalized records beyond this one that still need to be
                 * printed. So the panic CPU skips over the
                 * non-existent/non-finalized record, unless the record is
                 * at or beyond the head, in which case it is not possible
                 * to continue.
                 *
                 * Note that new messages printed on the panic CPU are
                 * already finalized at this point. The only exception
                 * might be the last message without trailing newline, but
                 * that one would have the sequence number returned by
                 * "prb_next_reserve_seq() - 1".
                 */
                if (!this_cpu_in_panic() || (*seq + 1) >= prb_next_reserve_seq(rb))
                        return false;

                (*seq)++;
        }
}

/**
 * prb_read_valid() - Non-blocking read of a requested record or (if gone)
 *                    the next available record.
 *
 * @rb:  The ringbuffer to read from.
 * @seq: The sequence number of the record to read.
 * @r:   A record data buffer to store the read record to.
 *
 * This is the public function available to readers to read a record.
 *
 * The reader provides the @info and @text_buf buffers of @r to be
 * filled in. Any of the buffer pointers can be set to NULL if the reader
 * is not interested in that data. To ensure proper initialization of @r,
 * prb_rec_init_rd() should be used.
 *
 * Context: Any context.
 * Return: true if a record was read, otherwise false.
 *
 * On success, the reader must check r->info.seq to see which record was
 * actually read. This allows the reader to detect dropped records.
 *
 * Failure means @seq refers to a record not yet available to the reader.
 */
bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
                    struct printk_record *r)
{
        bool found;

        /*
         * @seq is a local copy here. The helper may advance it, but the
         * caller learns which record was read from @r's info.
         */
        found = _prb_read_valid(rb, &seq, r, NULL);

        return found;
}

/**
 * prb_read_valid_info() - Non-blocking read of the meta data only, for a
 *                         requested record or (if gone) the next available
 *                         record.
 *
 * @rb:         The ringbuffer to read from.
 * @seq:        The sequence number of the record to read.
 * @info:       Where to store the meta data of the record that was read.
 * @line_count: Where to store the number of lines in the record text.
 *
 * Public reader interface for callers that want a record's meta data but
 * not its text.
 *
 * @info and @line_count are supplied by the reader. Either one may be NULL
 * if the reader does not care about that piece of data.
 *
 * Context: Any context.
 * Return: true if a record's meta data was read, otherwise false.
 *
 * On success, info->seq tells which record's meta data was actually read;
 * the reader must check it in order to detect dropped records.
 *
 * Failure means @seq refers to a record not yet available to the reader.
 */
bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
                         struct printk_info *info, unsigned int *line_count)
{
        struct printk_record meta_only;

        /* Attach @info only: no text buffer and a text buffer size of 0. */
        prb_rec_init_rd(&meta_only, info, NULL, 0);

        return _prb_read_valid(rb, &seq, &meta_only, line_count);
}

/**
 * prb_first_valid_seq() - Report the sequence number of the oldest record
 *                         that is still available.
 *
 * @rb: The ringbuffer to get the sequence number from.
 *
 * Public reader interface for finding the first/oldest valid sequence
 * number, i.e. the point from which a reader can start iterating the
 * ringbuffer.
 *
 * Context: Any context.
 * Return: The sequence number of the first/oldest record or, if the
 *         ringbuffer is empty, 0.
 */
u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
{
        u64 oldest = 0;
        bool found;

        /* Search from 0 without supplying a record or line-count buffer. */
        found = _prb_read_valid(rb, &oldest, NULL, NULL);

        return found ? oldest : 0;
}

/**
 * prb_next_seq() - Report the sequence number that follows the last
 *                  available record.
 *
 * @rb:  The ringbuffer to get the sequence number from.
 *
 * Public reader interface for finding out what the next newest sequence
 * number available to readers will be.
 *
 * A reader that wants to skip everything currently available can jump to
 * the returned value. All records previous to the returned value are
 * guaranteed to have been finalized and to be (or to have been) available
 * to the reader.
 *
 * Context: Any context.
 * Return: The sequence number of the next newest (not yet available) record
 *         for readers.
 */
u64 prb_next_seq(struct printk_ringbuffer *rb)
{
        u64 next = desc_last_finalized_seq(rb);

        /*
         * Normally the search starts right after the last finalized record.
         *
         * A value of 0 is the exception: because of hack#2 of the
         * bootstrapping phase it is not known whether a record at index 0
         * exists, so the search has to start at 0 itself.
         */
        if (next)
                next++;

        /*
         * The last finalized sequence number may be out of date. Walk
         * forward for as long as records can be read to find the current
         * value.
         */
        for (; _prb_read_valid(rb, &next, NULL, NULL); next++)
                ;

        return next;
}

/**
 * prb_init() - Set up a ringbuffer on top of caller-provided buffers.
 *
 * @rb:       The ringbuffer to initialize.
 * @text_buf: The data buffer for text data.
 * @textbits: The size of @text_buf as a power-of-2 value.
 * @descs:    The descriptor buffer for ringbuffer records.
 * @descbits: The count of @descs items as a power-of-2 value.
 * @infos:    The printk_info buffer for ringbuffer records.
 *
 * Public writer interface for setting up a ringbuffer at runtime from
 * externally provided buffers.
 *
 * The resulting state must match the initialization of DEFINE_PRINTKRB().
 *
 * Context: Any context.
 */
void prb_init(struct printk_ringbuffer *rb,
              char *text_buf, unsigned int textbits,
              struct prb_desc *descs, unsigned int descbits,
              struct printk_info *infos)
{
        struct prb_desc *last_desc;
        struct printk_info *last_info;

        /* Start from fully zeroed descriptor and info arrays. */
        memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(*descs));
        memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(*infos));

        /* The final slot of each array gets special initial values below. */
        last_desc = &descs[_DESCS_COUNT(descbits) - 1];
        last_info = &infos[_DESCS_COUNT(descbits) - 1];

        /* Descriptor ring: head and tail both start at the DESC0 ID. */
        rb->desc_ring.count_bits = descbits;
        rb->desc_ring.descs = descs;
        rb->desc_ring.infos = infos;
        atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
        atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
        atomic_long_set(&rb->desc_ring.last_finalized_seq, 0);

        /* Text data ring: head and tail both start at the BLK0 position. */
        rb->text_data_ring.size_bits = textbits;
        rb->text_data_ring.data = text_buf;
        atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits));
        atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits));

        atomic_long_set(&rb->fail, 0);

        /* The last descriptor carries the DESC0 state and no text block. */
        atomic_long_set(&last_desc->state_var, DESC0_SV(descbits));
        last_desc->text_blk_lpos.begin = FAILED_LPOS;
        last_desc->text_blk_lpos.next = FAILED_LPOS;

        /*
         * Seed the sequence numbers of the first and the last info slot.
         * The order of the two stores is kept as-is.
         */
        infos[0].seq = -(u64)_DESCS_COUNT(descbits);
        last_info->seq = 0;
}

/**
 * prb_record_text_space() - Report how much ringbuffer space the text data
 *                           of a reserved entry really occupies.
 *
 * @e: The successfully reserved entry to query.
 *
 * Public writer interface for finding out how much actual space in the
 * ringbuffer is used to store the text data of the specified entry.
 *
 * Only valid if @e has been successfully reserved using prb_reserve().
 *
 * Context: Any context.
 * Return: The size in bytes used by the text data of the associated record.
 */
unsigned int prb_record_text_space(struct prb_reserved_entry *e)
{
        unsigned int used = e->text_space;

        return used;
}













































































































































































































































































































































































































































































    3 




    3 




















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_DEFS_H
#define _ASM_X86_PGTABLE_DEFS_H

#include <linux/const.h>
#include <linux/mem_encrypt.h>

#include <asm/page_types.h>

/*
 * Bit numbers of the individual flags inside a page-table entry value.
 * The _PAGE_* masks defined further down are built by shifting 1 left by
 * these numbers.
 *
 * Bit 7 has two names: _PAGE_BIT_PSE and _PAGE_BIT_PAT. For large pages the
 * PAT bit is found at _PAGE_BIT_PAT_LARGE (bit 12) instead.
 */
#define _PAGE_BIT_PRESENT        0        /* is present */
#define _PAGE_BIT_RW                1        /* writeable */
#define _PAGE_BIT_USER                2        /* userspace addressable */
#define _PAGE_BIT_PWT                3        /* page write through */
#define _PAGE_BIT_PCD                4        /* page cache disabled */
#define _PAGE_BIT_ACCESSED        5        /* was accessed (raised by CPU) */
#define _PAGE_BIT_DIRTY                6        /* was written to (raised by CPU) */
#define _PAGE_BIT_PSE                7        /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT                7        /* PAT bit on 4KB pages (same number as PSE) */
#define _PAGE_BIT_GLOBAL        8        /* Global TLB entry PPro+ */
#define _PAGE_BIT_SOFTW1        9        /* available for programmer */
#define _PAGE_BIT_SOFTW2        10        /* available for programmer */
#define _PAGE_BIT_SOFTW3        11        /* available for programmer */
#define _PAGE_BIT_PAT_LARGE        12        /* PAT bit on 2MB or 1GB pages */
#define _PAGE_BIT_SOFTW4        57        /* available for programmer */
#define _PAGE_BIT_SOFTW5        58        /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0        59        /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1        60        /* Protection Keys, bit 2/4 */
#define _PAGE_BIT_PKEY_BIT2        61        /* Protection Keys, bit 3/4 */
#define _PAGE_BIT_PKEY_BIT3        62        /* Protection Keys, bit 4/4 */
#define _PAGE_BIT_NX                63        /* No execute: only valid after cpuid check */

/*
 * Kernel uses of the software-available bits. Note that SPECIAL and
 * CPA_TEST both map to SOFTW1.
 */
#define _PAGE_BIT_SPECIAL        _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST        _PAGE_BIT_SOFTW1
#define _PAGE_BIT_UFFD_WP        _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
#define _PAGE_BIT_SOFT_DIRTY        _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP        _PAGE_BIT_SOFTW4

#ifdef CONFIG_X86_64
#define _PAGE_BIT_SAVED_DIRTY        _PAGE_BIT_SOFTW5 /* Saved Dirty bit */
#else
/* Shared with _PAGE_BIT_UFFD_WP, which is not supported on 32 bit */
#define _PAGE_BIT_SAVED_DIRTY        _PAGE_BIT_SOFTW2 /* Saved Dirty bit */
#endif

/*
 * Used when _PAGE_BIT_PRESENT is clear: set if the user mapped the page
 * with PROT_NONE, in which case pte_present still gives true. The bit
 * number is shared with _PAGE_BIT_GLOBAL.
 */
#define _PAGE_BIT_PROTNONE        _PAGE_BIT_GLOBAL

/*
 * Single-bit masks for the flags above: the value 1, given type pteval_t
 * through _AT(), shifted left by the corresponding _PAGE_BIT_* number.
 */
#define _PAGE_PRESENT        (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
#define _PAGE_RW        (_AT(pteval_t, 1) << _PAGE_BIT_RW)
#define _PAGE_USER        (_AT(pteval_t, 1) << _PAGE_BIT_USER)
#define _PAGE_PWT        (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
#define _PAGE_PCD        (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
#define _PAGE_ACCESSED        (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
#define _PAGE_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE        (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL        (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_SOFTW2        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_SOFTW3        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3)
#define _PAGE_PAT        (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL        (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST        (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
/*
 * Protection-key bits: real masks only when memory protection keys are
 * configured in, otherwise 0 so that code using them needs no #ifdefs.
 */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
#define _PAGE_PKEY_BIT0        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
#define _PAGE_PKEY_BIT1        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
#define _PAGE_PKEY_BIT2        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
#define _PAGE_PKEY_BIT3        (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
#else
#define _PAGE_PKEY_BIT0        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT1        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT2        (_AT(pteval_t, 0))
#define _PAGE_PKEY_BIT3        (_AT(pteval_t, 0))
#endif

/* All four protection-key bits combined (0 when pkeys are configured out). */
#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
                         _PAGE_PKEY_BIT1 | \
                         _PAGE_PKEY_BIT2 | \
                         _PAGE_PKEY_BIT3)

/*
 * Dirty and Accessed on 64-bit and PAE builds, 0 otherwise.
 * NOTE(review): the erratum this mask refers to is not described in this
 * file — see its users for details.
 */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
#else
#define _PAGE_KNL_ERRATUM_MASK 0
#endif

/* Soft-dirty mask; 0 when CONFIG_MEM_SOFT_DIRTY is not set. */
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SOFT_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else
#define _PAGE_SOFT_DIRTY        (_AT(pteval_t, 0))
#endif

/*
 * Tracking the soft-dirty bit when a page goes to swap is tricky.
 * We need a bit that can be stored in the pte _and_ does not conflict
 * with the swap entry format. On x86, bits 1-4 are *not* involved in
 * swap entry computation, but bit 7 is used for thp migration,
 * so we borrow bit 1 (the _PAGE_RW position) for soft-dirty tracking.
 *
 * Please note that this bit must be treated as the swap soft-dirty
 * mark if and only if the PTE/PMD has the present bit clear!
 */
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SWP_SOFT_DIRTY        _PAGE_RW
#else
#define _PAGE_SWP_SOFT_DIRTY        (_AT(pteval_t, 0))
#endif

/*
 * userfaultfd write-protect masks: _PAGE_UFFD_WP is its own software bit,
 * _PAGE_SWP_UFFD_WP reuses the _PAGE_USER position. Both are 0 when
 * CONFIG_HAVE_ARCH_USERFAULTFD_WP is not set.
 */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define _PAGE_UFFD_WP                (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP)
#define _PAGE_SWP_UFFD_WP        _PAGE_USER
#else
#define _PAGE_UFFD_WP                (_AT(pteval_t, 0))
#define _PAGE_SWP_UFFD_WP        (_AT(pteval_t, 0))
#endif

/*
 * Masks for bits 57 and above. They are only real masks on 64-bit and PAE
 * builds and are 0 otherwise.
 * NOTE(review): _PAGE_DEVMAP is built with u64 while its neighbours use
 * pteval_t — presumably equivalent on these configurations; confirm.
 */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX        (_AT(pteval_t, 1) << _PAGE_BIT_NX)
#define _PAGE_DEVMAP        (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
#define _PAGE_SOFTW4        (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4)
#else
#define _PAGE_NX        (_AT(pteval_t, 0))
#define _PAGE_DEVMAP        (_AT(pteval_t, 0))
#define _PAGE_SOFTW4        (_AT(pteval_t, 0))
#endif

/*
 * The hardware requires shadow stack memory to be Write=0,Dirty=1. However,
 * there are valid cases where the kernel might create read-only PTEs that
 * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In
 * these cases the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit
 * to avoid creating PTEs that wrongly look like shadow stack. Such PTEs have
 * (Write=0,SavedDirty=1,Dirty=0) set.
 */
#define _PAGE_SAVED_DIRTY        (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY)

/* Both ways a PTE can record dirtiness: the HW bit and the saved bit. */
#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY)

/* Mask form of _PAGE_BIT_PROTNONE. */
#define _PAGE_PROTNONE        (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)

/*
 * Set of bits not changed in pte_modify(). The pte's protection key is
 * treated like _PAGE_RW, for instance: it is *not* included in this mask
 * because pte_modify() does modify it.
 *
 * _COMMON_PAGE_CHG_MASK holds the part shared by both variants;
 * _PAGE_CHG_MASK adds the 4KB-page PAT bit, while _HPAGE_CHG_MASK adds PSE
 * and the large-page PAT bit.
 */
#define _COMMON_PAGE_CHG_MASK        (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |        \
                                 _PAGE_SPECIAL | _PAGE_ACCESSED |        \
                                 _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY |        \
                                 _PAGE_DEVMAP | _PAGE_CC | _PAGE_UFFD_WP)
#define _PAGE_CHG_MASK        (_COMMON_PAGE_CHG_MASK | _PAGE_PAT)
#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE)

/*
 * The cache modes defined here are used to translate between pure SW usage
 * and the HW defined cache mode bits and/or PAT entries.
 *
 * The resulting bits for PWT, PCD and PAT should be chosen so that the WB
 * mode ends up at index 0 (all bits clear). This is the default right now
 * and changing it would likely break too much.
 *
 * _PAGE_CACHE_MODE_NUM is 8, so values 6 and 7 have no named mode here.
 * The enum is hidden from assembly sources.
 */
#ifndef __ASSEMBLY__
enum page_cache_mode {
        _PAGE_CACHE_MODE_WB       = 0,
        _PAGE_CACHE_MODE_WC       = 1,
        _PAGE_CACHE_MODE_UC_MINUS = 2,
        _PAGE_CACHE_MODE_UC       = 3,
        _PAGE_CACHE_MODE_WT       = 4,
        _PAGE_CACHE_MODE_WP       = 5,

        _PAGE_CACHE_MODE_NUM      = 8
};
#endif

/*
 * Masks taken from the identifiers cc_mask and sme_me_mask, which are
 * declared elsewhere, given type pteval_t through _AT().
 */
#define _PAGE_CC                (_AT(pteval_t, cc_mask))
#define _PAGE_ENC                (_AT(pteval_t, sme_me_mask))

/* The three cache-control bits, for 4KB entries and for large entries. */
#define _PAGE_CACHE_MASK        (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
#define _PAGE_LARGE_CACHE_MASK        (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)

/*
 * Protection bits for the UC and WP cache modes. These call
 * cachemode2protval(), so they are not compile-time constants.
 */
#define _PAGE_NOCACHE                (cachemode2protval(_PAGE_CACHE_MODE_UC))
#define _PAGE_CACHE_WP                (cachemode2protval(_PAGE_CACHE_MODE_WP))

/*
 * Four-character shorthands that keep the protection tables below aligned
 * in columns.
 */
#define __PP _PAGE_PRESENT
#define __RW _PAGE_RW
#define _USR _PAGE_USER
#define ___A _PAGE_ACCESSED
#define ___D _PAGE_DIRTY
#define ___G _PAGE_GLOBAL
#define __NX _PAGE_NX

#define _ENC _PAGE_ENC
#define __WP _PAGE_CACHE_WP
#define __NC _PAGE_NOCACHE
#define _PSE _PAGE_PSE

/*
 * pgprot_val() reads the raw value out of a pgprot_t; __pgprot() wraps a
 * raw value into a pgprot_t compound literal; __pg() is a short alias for
 * __pgprot() used by the tables below.
 */
#define pgprot_val(x)                ((x).pgprot)
#define __pgprot(x)                ((pgprot_t) { (x) } )
#define __pg(x)                        __pgprot(x)

/*
 * Protection value tables.
 *
 * Every row ORs together one term per column; a 0 means "flag not set".
 * The columns are, from left to right:
 *
 *   Present | RW | User | Accessed | NX | Dirty | PSE | Global
 *
 * followed, for some rows, by an extra term (_ENC, __NC or __WP).
 *
 * The PAGE_* rows are complete pgprot_t values (wrapped by __pg()); the
 * __PAGE_* and _*_TABLE rows are raw flag values.
 *
 * Each macro is defined exactly once here so that the rows cannot drift
 * apart from a second copy.
 */
#define PAGE_NONE                __pg(   0|   0|   0|___A|   0|   0|   0|___G)
#define PAGE_SHARED              __pg(__PP|__RW|_USR|___A|__NX|   0|   0|   0)
#define PAGE_SHARED_EXEC         __pg(__PP|__RW|_USR|___A|   0|   0|   0|   0)
#define PAGE_COPY_NOEXEC         __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_COPY_EXEC           __pg(__PP|   0|_USR|___A|   0|   0|   0|   0)
#define PAGE_COPY                __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_READONLY            __pg(__PP|   0|_USR|___A|__NX|   0|   0|   0)
#define PAGE_READONLY_EXEC       __pg(__PP|   0|_USR|___A|   0|   0|   0|   0)

#define __PAGE_KERNEL                (__PP|__RW|   0|___A|__NX|___D|   0|___G)
#define __PAGE_KERNEL_EXEC           (__PP|__RW|   0|___A|   0|___D|   0|___G)

/*
 * Page tables needs to have Write=1 in order for any lower PTEs to be
 * writable. This includes shadow stack memory (Write=0, Dirty=1)
 */
#define _KERNPG_TABLE_NOENC          (__PP|__RW|   0|___A|   0|___D|   0|   0)
#define _KERNPG_TABLE                (__PP|__RW|   0|___A|   0|___D|   0|   0| _ENC)
#define _PAGE_TABLE_NOENC            (__PP|__RW|_USR|___A|   0|___D|   0|   0)
#define _PAGE_TABLE                  (__PP|__RW|_USR|___A|   0|___D|   0|   0| _ENC)

#define __PAGE_KERNEL_RO             (__PP|   0|   0|___A|__NX|   0|   0|___G)
#define __PAGE_KERNEL_ROX            (__PP|   0|   0|___A|   0|   0|   0|___G)
#define __PAGE_KERNEL_NOCACHE        (__PP|__RW|   0|___A|__NX|___D|   0|___G| __NC)
#define __PAGE_KERNEL_VVAR           (__PP|   0|_USR|___A|__NX|   0|   0|___G)
#define __PAGE_KERNEL_LARGE          (__PP|__RW|   0|___A|__NX|___D|_PSE|___G)
#define __PAGE_KERNEL_LARGE_EXEC     (__PP|__RW|   0|___A|   0|___D|_PSE|___G)
#define __PAGE_KERNEL_WP             (__PP|__RW|   0|___A|__NX|___D|   0|___G| __WP)


/* The I/O variants are plain aliases of the corresponding kernel values. */
#define __PAGE_KERNEL_IO                __PAGE_KERNEL
#define __PAGE_KERNEL_IO_NOCACHE        __PAGE_KERNEL_NOCACHE


/*
 * pgprot_t constants for kernel mappings (C code only).
 *
 * The *_ENC / *_NOENC values are raw flag values with or without _ENC
 * ORed in. The PAGE_KERNEL* values go through __pgprot_mask(), which ANDs
 * the flags with __default_kernel_pte_mask before wrapping them into a
 * pgprot_t; since that mask is a variable, they are not compile-time
 * constants.
 */
#ifndef __ASSEMBLY__

#define __PAGE_KERNEL_ENC        (__PAGE_KERNEL    | _ENC)
#define __PAGE_KERNEL_ENC_WP        (__PAGE_KERNEL_WP | _ENC)
#define __PAGE_KERNEL_NOENC        (__PAGE_KERNEL    |    0)
#define __PAGE_KERNEL_NOENC_WP        (__PAGE_KERNEL_WP |    0)

#define __pgprot_mask(x)        __pgprot((x) & __default_kernel_pte_mask)

#define PAGE_KERNEL                __pgprot_mask(__PAGE_KERNEL            | _ENC)
#define PAGE_KERNEL_NOENC        __pgprot_mask(__PAGE_KERNEL            |    0)
#define PAGE_KERNEL_RO                __pgprot_mask(__PAGE_KERNEL_RO         | _ENC)
#define PAGE_KERNEL_EXEC        __pgprot_mask(__PAGE_KERNEL_EXEC       | _ENC)
#define PAGE_KERNEL_EXEC_NOENC        __pgprot_mask(__PAGE_KERNEL_EXEC       |    0)
#define PAGE_KERNEL_ROX                __pgprot_mask(__PAGE_KERNEL_ROX        | _ENC)
#define PAGE_KERNEL_NOCACHE        __pgprot_mask(__PAGE_KERNEL_NOCACHE    | _ENC)
#define PAGE_KERNEL_LARGE        __pgprot_mask(__PAGE_KERNEL_LARGE      | _ENC)
#define PAGE_KERNEL_LARGE_EXEC        __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
#define PAGE_KERNEL_VVAR        __pgprot_mask(__PAGE_KERNEL_VVAR       | _ENC)

/* The I/O variants do not OR in _ENC. */
#define PAGE_KERNEL_IO                __pgprot_mask(__PAGE_KERNEL_IO)
#define PAGE_KERNEL_IO_NOCACHE        __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE)

#endif        /* __ASSEMBLY__ */

/*
 * Early identity mapping pte attribute macros.
 *
 * 64-bit builds reuse __PAGE_KERNEL_LARGE_EXEC; other builds use literal
 * values for the PTE, PDE and PGD levels.
 */
#ifdef CONFIG_X86_64
#define __PAGE_KERNEL_IDENT_LARGE_EXEC        __PAGE_KERNEL_LARGE_EXEC
#else
#define PTE_IDENT_ATTR         0x003                /* PRESENT+RW */
#define PDE_IDENT_ATTR         0x063                /* PRESENT+RW+DIRTY+ACCESSED */
#define PGD_IDENT_ATTR         0x001                /* PRESENT (no other attributes) */
#endif

#ifdef CONFIG_X86_32
# include <asm/pgtable_32_types.h>
#else
# include <asm/pgtable_64_types.h>
#endif

#ifndef __ASSEMBLY__

#include <linux/types.h>

/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
#define PTE_PFN_MASK                ((pteval_t)PHYSICAL_PAGE_MASK)

/*
 * Extracts the flags from a (pte|pmd|pud|pgd)val_t: every bit that is not
 * part of PTE_PFN_MASK. This includes the protection key value.
 */
#define PTE_FLAGS_MASK                (~PTE_PFN_MASK)

/* Type-safe wrapper around a raw protection value. */
typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;

/* Type-safe wrapper around a raw top-level (pgd) entry value. */
typedef struct { pgdval_t pgd; } pgd_t;

/* Return a copy of @prot that additionally has _PAGE_NX set. */
static inline pgprot_t pgprot_nx(pgprot_t prot)
{
        pgprot_t nx_prot = __pgprot(pgprot_val(prot) | _PAGE_NX);

        return nx_prot;
}
/*
 * Same-name define; presumably lets other code detect this definition
 * with #ifdef — confirm against the users.
 */
#define pgprot_nx pgprot_nx

/*
 * PGD_ALLOWED_BITS: the bits that may be set in a PGD entry. It is applied
 * by native_make_pgd() and native_pgd_val().
 */
#ifdef CONFIG_X86_PAE

/*
 * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we
 * can't use it here. The physical mask is instead built from
 * __PHYSICAL_MASK_SHIFT and PAGE_MASK.
 */

#define PGD_PAE_PAGE_MASK        ((signed long)PAGE_MASK)
#define PGD_PAE_PHYS_MASK        (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK)

/*
 * PAE allows the Base Address, P, PWT, PCD and AVL bits to be set in PGD
 * entries. All other bits are Reserved MBZ. The three _PAGE_SOFTW* masks
 * below are presumably the AVL bits.
 */
#define PGD_ALLOWED_BITS        (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
                                 _PAGE_PWT | _PAGE_PCD | \
                                 _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)

#else
/* No need to mask any bits for !PAE */
#define PGD_ALLOWED_BITS        (~0ULL)
#endif

/* Wrap @val into a pgd_t, keeping only the bits in PGD_ALLOWED_BITS. */
static inline pgd_t native_make_pgd(pgdval_t val)
{
        pgd_t entry = { .pgd = val & PGD_ALLOWED_BITS };

        return entry;
}

/* Return the raw value of @pgd, restricted to PGD_ALLOWED_BITS. */
static inline pgdval_t native_pgd_val(pgd_t pgd)
{
        pgdval_t raw = pgd.pgd;

        return raw & PGD_ALLOWED_BITS;
}

/* Return only the flag bits (PTE_FLAGS_MASK) of @pgd. */
static inline pgdval_t pgd_flags(pgd_t pgd)
{
        pgdval_t raw = native_pgd_val(pgd);

        return raw & PTE_FLAGS_MASK;
}

/*
 * p4d level accessors.
 *
 * With more than four page-table levels, p4d_t wraps its own entry value.
 * Otherwise p4d_t comes from <asm-generic/pgtable-nop4d.h> and its .pgd
 * member is used, so the accessors forward to the pgd ones.
 *
 * NOTE(review): native_make_p4d() takes a pudval_t rather than a p4dval_t;
 * presumably both are the same underlying type — confirm.
 */
#if CONFIG_PGTABLE_LEVELS > 4
typedef struct { p4dval_t p4d; } p4d_t;

/* Wrap @val into a p4d_t without masking. */
static inline p4d_t native_make_p4d(pudval_t val)
{
        p4d_t entry = { .p4d = val };

        return entry;
}

/* Return the raw value stored in @p4d. */
static inline p4dval_t native_p4d_val(p4d_t p4d)
{
        p4dval_t raw = p4d.p4d;

        return raw;
}
#else
#include <asm-generic/pgtable-nop4d.h>

/* Build the p4d_t from a pgd entry made out of @val. */
static inline p4d_t native_make_p4d(pudval_t val)
{
        p4d_t entry = { .pgd = native_make_pgd((pgdval_t)val) };

        return entry;
}

/* Return the value of the pgd entry held in @p4d. */
static inline p4dval_t native_p4d_val(p4d_t p4d)
{
        p4dval_t raw = native_pgd_val(p4d.pgd);

        return raw;
}
#endif

/*
 * pud level accessors.
 *
 * With more than three page-table levels, pud_t wraps its own entry value.
 * Otherwise pud_t comes from <asm-generic/pgtable-nopud.h> and its
 * .p4d.pgd member is used, so the accessors forward to the pgd ones.
 *
 * NOTE(review): in the first branch native_make_pud() takes a pmdval_t
 * rather than a pudval_t; presumably both are the same underlying type —
 * confirm.
 */
#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;

/* Wrap @val into a pud_t without masking. */
static inline pud_t native_make_pud(pmdval_t val)
{
        pud_t entry = { .pud = val };

        return entry;
}

/* Return the raw value stored in @pud. */
static inline pudval_t native_pud_val(pud_t pud)
{
        pudval_t raw = pud.pud;

        return raw;
}
#else
#include <asm-generic/pgtable-nopud.h>

/* Build the pud_t from a pgd entry made out of @val. */
static inline pud_t native_make_pud(pudval_t val)
{
        pud_t entry = { .p4d.pgd = native_make_pgd(val) };

        return entry;
}

/* Return the value of the pgd entry held in @pud. */
static inline pudval_t native_pud_val(pud_t pud)
{
        pudval_t raw = native_pgd_val(pud.p4d.pgd);

        return raw;
}
#endif

/*
 * pmd level accessors.
 *
 * With more than two page-table levels, the .pmd member of pmd_t holds the
 * entry value. Otherwise <asm-generic/pgtable-nopmd.h> is included and the
 * .pud.p4d.pgd member is used, so the accessors forward to the pgd ones.
 */
#if CONFIG_PGTABLE_LEVELS > 2
/* Wrap @val into a pmd_t without masking. */
static inline pmd_t native_make_pmd(pmdval_t val)
{
        pmd_t entry = { .pmd = val };

        return entry;
}

/* Return the raw value stored in @pmd. */
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
        pmdval_t raw = pmd.pmd;

        return raw;
}
#else
#include <asm-generic/pgtable-nopmd.h>

/* Build the pmd_t from a pgd entry made out of @val. */
static inline pmd_t native_make_pmd(pmdval_t val)
{
        pmd_t entry = { .pud.p4d.pgd = native_make_pgd(val) };

        return entry;
}

/* Return the value of the pgd entry held in @pmd. */
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
        pmdval_t raw = native_pgd_val(pmd.pud.p4d.pgd);

        return raw;
}
#endif

/* PFN mask for a p4d entry; @p4d itself is not inspected. */
static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
{
        /* No 512 GiB huge pages yet, so the 4KB-page mask always applies. */
        p4dval_t mask = PTE_PFN_MASK;

        return mask;
}

/* Flag mask for a p4d entry: every bit outside p4d_pfn_mask(). */
static inline p4dval_t p4d_flags_mask(p4d_t p4d)
{
        p4dval_t pfn_bits = p4d_pfn_mask(p4d);

        return ~pfn_bits;
}

/* Return only the flag bits of @p4d. */
static inline p4dval_t p4d_flags(p4d_t p4d)
{
        p4dval_t raw = native_p4d_val(p4d);

        return raw & p4d_flags_mask(p4d);
}

/*
 * PFN mask for a pud entry: PHYSICAL_PUD_PAGE_MASK when the entry has
 * _PAGE_PSE set, otherwise the 4KB-page mask.
 */
static inline pudval_t pud_pfn_mask(pud_t pud)
{
        if (native_pud_val(pud) & _PAGE_PSE)
                return PHYSICAL_PUD_PAGE_MASK;

        return PTE_PFN_MASK;
}

/* Flag mask for a pud entry: every bit outside pud_pfn_mask(). */
static inline pudval_t pud_flags_mask(pud_t pud)
{
        pudval_t pfn_bits = pud_pfn_mask(pud);

        return ~pfn_bits;
}

/* Return only the flag bits of @pud. */
static inline pudval_t pud_flags(pud_t pud)
{
        pudval_t raw = native_pud_val(pud);

        return raw & pud_flags_mask(pud);
}

/*
 * PFN mask for a pmd entry: PHYSICAL_PMD_PAGE_MASK when the entry has
 * _PAGE_PSE set, otherwise the 4KB-page mask.
 */
static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
{
        if (native_pmd_val(pmd) & _PAGE_PSE)
                return PHYSICAL_PMD_PAGE_MASK;

        return PTE_PFN_MASK;
}

/* Flag mask for a pmd entry: every bit outside pmd_pfn_mask(). */
static inline pmdval_t pmd_flags_mask(pmd_t pmd)
{
        pmdval_t pfn_bits = pmd_pfn_mask(pmd);

        return ~pfn_bits;
}

/* Return only the flag bits of @pmd. */
static inline pmdval_t pmd_flags(pmd_t pmd)
{
        pmdval_t raw = native_pmd_val(pmd);

        return raw & pmd_flags_mask(pmd);
}

/* Wrap @val into a pte_t without masking. */
static inline pte_t native_make_pte(pteval_t val)
{
        pte_t entry = { .pte = val };

        return entry;
}

/* Return the raw value stored in @pte. */
static inline pteval_t native_pte_val(pte_t pte)
{
        pteval_t raw = pte.pte;

        return raw;
}

/* Return only the flag bits (PTE_FLAGS_MASK) of @pte. */
static inline pteval_t pte_flags(pte_t pte)
{
        pteval_t raw = native_pte_val(pte);

        return raw & PTE_FLAGS_MASK;
}

/*
 * __pte2cm_idx() packs the three cache-control bits of the 4KB-format
 * value @cb into a 3-bit index: PAT goes to index bit 2, PCD to index
 * bit 1 and PWT to index bit 0.
 *
 * Both macros evaluate their argument more than once, so do not pass
 * expressions with side effects.
 */
#define __pte2cm_idx(cb)                                \
        ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) |                \
         (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) |                \
         (((cb) >> _PAGE_BIT_PWT) & 1))
/*
 * __cm_idx2pte() is the inverse: it spreads the 3-bit index @i back out to
 * the PAT, PCD and PWT bit positions.
 */
#define __cm_idx2pte(i)                                        \
        ((((i) & 4) << (_PAGE_BIT_PAT - 2)) |                \
         (((i) & 2) << (_PAGE_BIT_PCD - 1)) |                \
         (((i) & 1) << _PAGE_BIT_PWT))

/*
 * Defined elsewhere. Presumably translates a cache mode into the matching
 * protection bits — see the _PAGE_NOCACHE and _PAGE_CACHE_WP users.
 */
unsigned long cachemode2protval(enum page_cache_mode pcm);

/*
 * Convert a 4KB-format protection value to the large-page format: the PAT
 * bit is moved from _PAGE_BIT_PAT up to _PAGE_BIT_PAT_LARGE, and both
 * positions are cleared in the rest of the value first.
 */
static inline pgprotval_t protval_4k_2_large(pgprotval_t val)
{
        pgprotval_t others = val & ~(_PAGE_PAT | _PAGE_PAT_LARGE);
        pgprotval_t pat = val & _PAGE_PAT;

        return others | (pat << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
}
/* pgprot_t flavour of protval_4k_2_large(). */
static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
{
        pgprotval_t small = pgprot_val(pgprot);

        return __pgprot(protval_4k_2_large(small));
}
/*
 * Convert a large-page-format protection value to the 4KB format: the PAT
 * bit is moved from _PAGE_BIT_PAT_LARGE down to _PAGE_BIT_PAT, and both
 * positions are cleared in the rest of the value first.
 */
static inline pgprotval_t protval_large_2_4k(pgprotval_t val)
{
        pgprotval_t others = val & ~(_PAGE_PAT | _PAGE_PAT_LARGE);
        pgprotval_t pat_large = val & _PAGE_PAT_LARGE;

        return others | (pat_large >> (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
}
/* pgprot_t flavour of protval_large_2_4k(). */
static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
{
        pgprotval_t large = pgprot_val(pgprot);

        return __pgprot(protval_large_2_4k(large));
}


/* A pgtable_t is a pointer to struct page. */
typedef struct page *pgtable_t;

/* Variables and helpers defined elsewhere. */
extern pteval_t __supported_pte_mask;
extern pteval_t __default_kernel_pte_mask;        /* applied by __pgprot_mask() */
extern void set_nx(void);
extern int nx_enabled;

/*
 * Each function below is paired with a same-name define; presumably this
 * lets other code detect the definition with #ifdef — confirm.
 */
#define pgprot_writecombine        pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);

#define pgprot_writethrough        pgprot_writethrough
extern pgprot_t pgprot_writethrough(pgprot_t prot);

/* Indicate that x86 has its own track and untrack pfn vma functions */
#define __HAVE_PFNMAP_TRACKING

/* Indicate that x86 provides its own phys_mem_access_prot(). */
#define __HAVE_PHYS_MEM_ACCESS_PROT
struct file;
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                              unsigned long size, pgprot_t vma_prot);

/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);

/*
 * 32-bit builds have a dedicated native_pagetable_init(); otherwise the
 * name is an alias for paging_init.
 */
#ifdef CONFIG_X86_32
extern void native_pagetable_init(void);
#else
#define native_pagetable_init        paging_init
#endif

/*
 * Mapping levels, named after the page size mapped at each level.
 * PG_LEVEL_NONE is 0 and PG_LEVEL_NUM, being last, is the number of
 * enumerators before it.
 */
enum pg_level {
        PG_LEVEL_NONE,
        PG_LEVEL_4K,
        PG_LEVEL_2M,
        PG_LEVEL_1G,
        PG_LEVEL_512G,
        PG_LEVEL_NUM
};

/*
 * update_page_count() is implemented elsewhere when CONFIG_PROC_FS is set;
 * without it, the function is an empty inline stub.
 */
#ifdef CONFIG_PROC_FS
extern void update_page_count(int level, unsigned long pages);
#else
static inline void update_page_count(int level, unsigned long pages) { }
#endif

/*
 * Helper functions that return the kernel pagetable entry controlling
 * the virtual address 'address'. NULL means no pagetable entry is present.
 * NOTE: the return type is pte_t, but if the pmd is PSE then it is
 * returned as a pte too.
 *
 * The *_in_pgd variants take the pgd to search as an explicit argument.
 * NOTE(review): the @nx and @rw outputs of lookup_address_in_pgd_attr()
 * are not described in this header — see its definition.
 */
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
                                    unsigned int *level);
pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
                                  unsigned int *level, bool *nx, bool *rw);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
/* Init-time (__init) helpers operating on a caller-supplied pgd. */
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
                                          unsigned long address,
                                          unsigned numpages,
                                          unsigned long page_flags);
extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
                                            unsigned long numpages);
#endif        /* !__ASSEMBLY__ */

#endif /* _ASM_X86_PGTABLE_DEFS_H */























































































    1 



























    1 







    1 



































































































































































































































































































































































































































































































































































































































































































































































    1 




















    1 





    1 








    3 



    3 
    2 








    1 








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 







    1 


    1 











    1 







    1 


    1 








    2 
    2 

    2 






    1 

    2 


    2 





    1 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 *
 * The initial value below is a bitmask built at compile time: the "always"
 * or "madvise" enable bit is only included when the corresponding
 * CONFIG_TRANSPARENT_HUGEPAGE_* option is set, while the madvise-defrag,
 * khugepaged-defrag and use-zero-page bits are always included. The sysfs
 * handlers in this file change individual bits at runtime with
 * set_bit()/clear_bit().
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
        (1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
        (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

/* Shrinker handle and forward declarations of its count/scan callbacks. */
static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
                                          struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
                                         struct shrink_control *sc);

/*
 * Reference count for the shared huge zero folio. get_huge_zero_page()
 * sets it to 2 when it installs a new folio (one reference for the caller,
 * one that is left for the shrinker); shrink_huge_zero_page_scan() frees
 * the folio by moving the count from 1 to 0.
 */
static atomic_t huge_zero_refcount;
/* The shared huge zero folio; NULL while none is installed. */
struct folio *huge_zero_folio __read_mostly;
/* PFN of huge_zero_folio; ~0UL while no folio is installed. */
unsigned long huge_zero_pfn __read_mostly = ~0UL;
/*
 * Bitmasks indexed by order (thpsize_enabled_show() uses
 * test_bit(order, &huge_anon_orders_always)).
 * NOTE(review): presumably the madvise and inherit masks use the same
 * bit-per-order layout - confirm against their users.
 */
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;

/*
 * Filter a set of candidate THP orders down to those allowed for @vma.
 *
 * @vma:       the mapping being examined.
 * @vm_flags:  flags tested here for VM_NOHUGEPAGE, VM_NO_KHUGEPAGED and
 *             VM_HUGEPAGE, and forwarded to shmem_is_huge().
 * @tva_flags: caller context bits: TVA_SMAPS, TVA_IN_PF and
 *             TVA_ENFORCE_SYSFS.
 * @orders:    bitmask of orders the caller is interested in.
 *
 * Returns the subset of @orders that passed every check, or 0 when THP
 * is not allowed for this VMA in the given context. The checks run in a
 * fixed sequence and several of them return early, so their relative
 * order is significant (see the individual comments below).
 */
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
                                         unsigned long vm_flags,
                                         unsigned long tva_flags,
                                         unsigned long orders)
{
        bool smaps = tva_flags & TVA_SMAPS;
        bool in_pf = tva_flags & TVA_IN_PF;
        bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
        /* Check the intersection of requested and supported orders. */
        orders &= vma_is_anonymous(vma) ?
                        THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
        if (!orders)
                return 0;

        if (!vma->vm_mm)                /* vdso */
                return 0;

        /*
         * Explicitly disabled through madvise or prctl, or some
         * architectures may disable THP for some mappings, for
         * example, s390 kvm.
         * */
        if ((vm_flags & VM_NOHUGEPAGE) ||
            test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
                return 0;
        /*
         * If the hardware/firmware marked hugepage support disabled.
         */
        if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
                return 0;

        /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
        if (vma_is_dax(vma))
                return in_pf ? orders : 0;

        /*
         * khugepaged special VMA and hugetlb VMA.
         * Must be checked after dax since some dax mappings may have
         * VM_MIXEDMAP set.
         */
        if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
                return 0;

        /*
         * Check alignment for file vma and size for both file and anon vma by
         * filtering out the unsuitable orders.
         *
         * Skip the check for page fault. Huge fault does the check in fault
         * handlers.
         */
        if (!in_pf) {
                int order = highest_order(orders);
                unsigned long addr;

                /*
                 * Try each remaining order, starting from the highest, on
                 * the range of that size ending at vm_end; stop at the first
                 * order that thp_vma_suitable_order() accepts.
                 * NOTE(review): next_order() presumably drops the rejected
                 * order from the mask and returns the next one - confirm.
                 */
                while (orders) {
                        addr = vma->vm_end - (PAGE_SIZE << order);
                        if (thp_vma_suitable_order(vma, addr, order))
                                break;
                        order = next_order(&orders, order);
                }

                if (!orders)
                        return 0;
        }

        /*
         * Enabled via shmem mount options or sysfs settings.
         * Must be done before hugepage flags check since shmem has its
         * own flags.
         */
        if (!in_pf && shmem_file(vma->vm_file))
                return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
                                     !enforce_sysfs, vma->vm_mm, vm_flags)
                        ? orders : 0;

        if (!vma_is_anonymous(vma)) {
                /*
                 * Enforce sysfs THP requirements as necessary. Anonymous vmas
                 * were already handled in thp_vma_allowable_orders().
                 */
                if (enforce_sysfs &&
                    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
                                                    !hugepage_global_always())))
                        return 0;

                /*
                 * Trust that ->huge_fault() handlers know what they are doing
                 * in fault path.
                 *
                 * NOTE(review): vm_ops is dereferenced without a NULL check;
                 * this presumably relies on non-anonymous VMAs always having
                 * vm_ops set - confirm against vma_is_anonymous().
                 */
                if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
                        return orders;
                /* Only regular file is valid in collapse path */
                if (((!in_pf || smaps)) && file_thp_enabled(vma))
                        return orders;
                return 0;
        }

        /* Everything below applies to anonymous VMAs only. */
        if (vma_is_temporary_stack(vma))
                return 0;

        /*
         * THPeligible bit of smaps should show 1 for proper VMAs even
         * though anon_vma is not initialized yet.
         *
         * Allow page fault since anon_vma may be not initialized until
         * the first page fault.
         */
        if (!vma->anon_vma)
                return (smaps || in_pf) ? orders : 0;

        return orders;
}

/*
 * Take a reference on the shared huge zero folio, allocating and
 * installing it first if it does not currently exist.
 *
 * Returns true when a reference was taken, false when the folio had to be
 * allocated and the allocation failed (THP_ZERO_PAGE_ALLOC_FAILED is
 * counted in that case).
 */
static bool get_huge_zero_page(void)
{
        struct folio *zero_folio;
retry:
        /* Fast path: the folio exists (count is non-zero), just add a ref. */
        if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
                return true;

        /* Zeroed, PMD-order allocation with __GFP_MOVABLE masked out. */
        zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
                        HPAGE_PMD_ORDER);
        if (!zero_folio) {
                count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
                return false;
        }
        preempt_disable();
        /*
         * Install our folio only if the global pointer is still NULL. A
         * non-NULL result means someone else installed one first: drop ours
         * and start over so the fast path can take a reference on theirs.
         */
        if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
                preempt_enable();
                folio_put(zero_folio);
                goto retry;
        }
        WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));

        /* We take additional reference here. It will be put back by shrinker */
        atomic_set(&huge_zero_refcount, 2);
        preempt_enable();
        count_vm_event(THP_ZERO_PAGE_ALLOC);
        return true;
}

/*
 * Drop one reference on the shared huge zero folio.
 *
 * The count must stay above zero after the decrement: the final reference
 * is left for the shrinker to release, so hitting zero here is a bug.
 */
static void put_huge_zero_page(void)
{
        int dropped_last = atomic_dec_and_test(&huge_zero_refcount);

        BUG_ON(dropped_last);
}

/*
 * Return the shared huge zero folio for use by @mm, taking a reference on
 * behalf of the mm the first time it asks.
 *
 * MMF_HUGE_ZERO_PAGE in mm->flags records that this mm already holds its
 * reference. Returns NULL if the folio could not be obtained.
 */
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
        if (!test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) {
                if (!get_huge_zero_page())
                        return NULL;

                /*
                 * If the flag was already set by the time we try to set it,
                 * the mm holds a reference from elsewhere; give back the
                 * extra one we just took.
                 */
                if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
                        put_huge_zero_page();
        }

        return READ_ONCE(huge_zero_folio);
}

/*
 * Release the huge zero folio reference held by @mm, if it holds one
 * (i.e. MMF_HUGE_ZERO_PAGE is set in mm->flags). The flag itself is left
 * untouched.
 */
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
        if (!test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
                return;

        put_huge_zero_page();
}

/*
 * Shrinker "count" callback for the huge zero folio.
 *
 * Reports HPAGE_PMD_NR freeable pages when exactly one reference remains
 * on the folio, and 0 otherwise.
 */
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
                                        struct shrink_control *sc)
{
        /* Anything other than a single remaining reference: nothing to free. */
        if (atomic_read(&huge_zero_refcount) != 1)
                return 0;

        return HPAGE_PMD_NR;
}

/*
 * Shrinker "scan" callback for the huge zero folio.
 *
 * If the reference count can be moved atomically from 1 to 0, the folio is
 * detached from the global pointer, the cached PFN is reset to ~0UL and the
 * folio is released. Returns HPAGE_PMD_NR when the folio was freed, 0 when
 * the count was not 1.
 */
static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
                                       struct shrink_control *sc)
{
        struct folio *zero_folio;

        if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) != 1)
                return 0;

        zero_folio = xchg(&huge_zero_folio, NULL);
        BUG_ON(!zero_folio);
        WRITE_ONCE(huge_zero_pfn, ~0UL);
        folio_put(zero_folio);

        return HPAGE_PMD_NR;
}

/* Shrinker handle for the huge zero folio. */
static struct shrinker *huge_zero_page_shrinker;

#ifdef CONFIG_SYSFS
/*
 * sysfs show handler for the global "enabled" knob.
 *
 * Emits the three choices with the active one in square brackets:
 * "always" if TRANSPARENT_HUGEPAGE_FLAG is set, else "madvise" if
 * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG is set, else "never".
 */
static ssize_t enabled_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
{
        if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
                return sysfs_emit(buf, "%s\n", "[always] madvise never");

        if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                     &transparent_hugepage_flags))
                return sysfs_emit(buf, "%s\n", "always [madvise] never");

        return sysfs_emit(buf, "%s\n", "always madvise [never]");
}

/*
 * sysfs store handler for the global "enabled" knob.
 *
 * Accepts "always", "madvise" or "never" (matched with sysfs_streq()) and
 * updates the two enable bits in transparent_hugepage_flags accordingly.
 * Any other input returns -EINVAL without touching the flags.
 *
 * After a successful update start_stop_khugepaged() is called; if it
 * reports an error, that error is returned instead of @count.
 */
static ssize_t enabled_store(struct kobject *kobj,
                             struct kobj_attribute *attr,
                             const char *buf, size_t count)
{
        ssize_t written = count;
        int err;

        if (sysfs_streq(buf, "always")) {
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
        } else if (sysfs_streq(buf, "madvise")) {
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
        } else if (sysfs_streq(buf, "never")) {
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
        } else {
                return -EINVAL;
        }

        /* Only kick khugepaged when a positive length was accepted. */
        if (written <= 0)
                return written;

        err = start_stop_khugepaged();
        if (err)
                return err;

        return written;
}

static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

/*
 * Generic sysfs show helper for a single bit of transparent_hugepage_flags.
 *
 * Emits "1\n" if @flag is set and "0\n" otherwise; returns whatever
 * sysfs_emit() returns.
 */
ssize_t single_hugepage_flag_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf,
                                  enum transparent_hugepage_flag flag)
{
        int is_set = !!test_bit(flag, &transparent_hugepage_flags);

        return sysfs_emit(buf, "%d\n", is_set);
}

/*
 * Generic sysfs store helper for a single bit of transparent_hugepage_flags.
 *
 * Parses @buf as a base-10 unsigned long. 0 clears @flag, 1 sets it, and
 * anything larger is rejected with -EINVAL. A kstrtoul() failure is
 * returned as-is. On success returns @count.
 */
ssize_t single_hugepage_flag_store(struct kobject *kobj,
                                 struct kobj_attribute *attr,
                                 const char *buf, size_t count,
                                 enum transparent_hugepage_flag flag)
{
        unsigned long value;
        int ret = kstrtoul(buf, 10, &value);

        if (ret < 0)
                return ret;

        switch (value) {
        case 0:
                clear_bit(flag, &transparent_hugepage_flags);
                break;
        case 1:
                set_bit(flag, &transparent_hugepage_flags);
                break;
        default:
                return -EINVAL;
        }

        return count;
}

/*
 * sysfs show handler for the "defrag" knob.
 *
 * Emits the five choices with the active one in square brackets. The
 * defrag bits are tested in a fixed priority order (direct, kswapd,
 * kswapd-or-madvise, madvise); if none is set, "never" is reported.
 */
static ssize_t defrag_show(struct kobject *kobj,
                           struct kobj_attribute *attr, char *buf)
{
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
                     &transparent_hugepage_flags))
                return sysfs_emit(buf, "%s\n",
                                  "[always] defer defer+madvise madvise never");

        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
                     &transparent_hugepage_flags))
                return sysfs_emit(buf, "%s\n",
                                  "always [defer] defer+madvise madvise never");

        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
                     &transparent_hugepage_flags))
                return sysfs_emit(buf, "%s\n",
                                  "always defer [defer+madvise] madvise never");

        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
                     &transparent_hugepage_flags))
                return sysfs_emit(buf, "%s\n",
                                  "always defer defer+madvise [madvise] never");

        return sysfs_emit(buf, "%s\n",
                          "always defer defer+madvise madvise [never]");
}

static ssize_t defrag_store(struct kobject *kobj,
                            struct kobj_attribute *attr,
                            const char *buf, size_t count)
{
        if (sysfs_streq(buf, "always")) {
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
        } else if (sysfs_streq(buf, "defer+madvise")) {
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
        } else if (sysfs_streq(buf, "defer")) {
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
        } else if (sysfs_streq(buf, "madvise")) {
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
        } else if (sysfs_streq(buf, "never")) {
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
        } else
                return -EINVAL;

        return count;
}
/*
 * Read/write sysfs attribute for "defrag"; __ATTR_RW() derives the
 * defrag_show()/defrag_store() handler names from the attribute name.
 */
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);

/*
 * sysfs read handler for "use_zero_page": reports the state of
 * TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG through the shared single-flag
 * helper and returns whatever that helper returns.
 */
static ssize_t use_zero_page_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = single_hugepage_flag_show(kobj, attr, buf,
                                        TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
        return len;
}
/*
 * sysfs write handler for "use_zero_page": hands the user buffer to the
 * shared single-flag helper for TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG and
 * returns that helper's result unchanged.
 */
static ssize_t use_zero_page_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        ssize_t consumed;

        consumed = single_hugepage_flag_store(kobj, attr, buf, count,
                                              TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
        return consumed;
}
/* Read/write sysfs attribute backed by use_zero_page_show()/use_zero_page_store(). */
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

/*
 * sysfs read handler for "hpage_pmd_size": emits HPAGE_PMD_SIZE as an
 * unsigned decimal followed by a newline.
 */
static ssize_t hpage_pmd_size_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf)
{
        const unsigned long pmd_bytes = HPAGE_PMD_SIZE;

        return sysfs_emit(buf, "%lu\n", pmd_bytes);
}
/* Read-only sysfs attribute backed by hpage_pmd_size_show(). */
static struct kobj_attribute hpage_pmd_size_attr =
        __ATTR_RO(hpage_pmd_size);

/* NULL-terminated list of the attributes exposed by hugepage_attr_group. */
static struct attribute *hugepage_attr[] = {
        &enabled_attr.attr,
        &defrag_attr.attr,
        &use_zero_page_attr.attr,
        &hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
        /* Only listed when CONFIG_SHMEM is set. */
        &shmem_enabled_attr.attr,
#endif
        NULL,
};

/*
 * Unnamed attribute group holding hugepage_attr; hugepage_init_sysfs()
 * registers it on the top-level kobject and hugepage_exit_sysfs() removes it.
 */
static const struct attribute_group hugepage_attr_group = {
        .attrs = hugepage_attr,
};

/* Forward declarations: both are referenced before their definitions below. */
static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
/* Held by thpsize_enabled_store() around its huge_anon_orders_* bit updates. */
static DEFINE_SPINLOCK(huge_anon_orders_lock);
/* Every thpsize created by hugepage_init_sysfs(), linked through ->node. */
static LIST_HEAD(thpsize_list);

/* Per-order sysfs object; one is created for each order by thpsize_create(). */
struct thpsize {
        struct kobject kobj;    /* embedded kobject; to_thpsize() maps back from it */
        struct list_head node;  /* link in thpsize_list */
        int order;              /* page order this object describes */
};

/*
 * Map a pointer to the embedded kobject back to its enclosing struct thpsize.
 *
 * The macro parameter must not be spelled like the member name: if it were
 * called "kobj", the preprocessor would also substitute the member token, and
 * the macro would only compile when the argument is literally named "kobj".
 */
#define to_thpsize(_kobj) container_of(_kobj, struct thpsize, kobj)

/*
 * sysfs read handler for a per-size "enabled" file.
 *
 * Prints the four possible modes with the active one in brackets. The
 * bitmaps are checked in the order always, inherit, madvise; if the order's
 * bit is set in none of them, "never" is reported.
 */
static ssize_t thpsize_enabled_show(struct kobject *kobj,
                                    struct kobj_attribute *attr, char *buf)
{
        const int order = to_thpsize(kobj)->order;
        const char *state = "always inherit madvise [never]";

        if (test_bit(order, &huge_anon_orders_always))
                state = "[always] inherit madvise never";
        else if (test_bit(order, &huge_anon_orders_inherit))
                state = "always [inherit] madvise never";
        else if (test_bit(order, &huge_anon_orders_madvise))
                state = "always inherit [madvise] never";

        return sysfs_emit(buf, "%s\n", state);
}

/*
 * sysfs write handler for a per-size "enabled" file.
 *
 * Accepts "always", "inherit", "madvise" or "never" (compared with
 * sysfs_streq()). Under huge_anon_orders_lock the order's bit is cleared in
 * every bitmap except the selected one and then set in the selected one;
 * "never" clears all three.
 *
 * Returns @count on success, -EINVAL for any other input.
 */
static ssize_t thpsize_enabled_store(struct kobject *kobj,
                                     struct kobj_attribute *attr,
                                     const char *buf, size_t count)
{
        enum { SET_NEVER, SET_ALWAYS, SET_INHERIT, SET_MADVISE } mode;
        const int order = to_thpsize(kobj)->order;

        if (sysfs_streq(buf, "always"))
                mode = SET_ALWAYS;
        else if (sysfs_streq(buf, "inherit"))
                mode = SET_INHERIT;
        else if (sysfs_streq(buf, "madvise"))
                mode = SET_MADVISE;
        else if (sysfs_streq(buf, "never"))
                mode = SET_NEVER;
        else
                return -EINVAL;

        spin_lock(&huge_anon_orders_lock);

        /* The bitmap being selected is never cleared first, only set. */
        if (mode != SET_ALWAYS)
                clear_bit(order, &huge_anon_orders_always);
        if (mode != SET_INHERIT)
                clear_bit(order, &huge_anon_orders_inherit);
        if (mode != SET_MADVISE)
                clear_bit(order, &huge_anon_orders_madvise);

        switch (mode) {
        case SET_ALWAYS:
                set_bit(order, &huge_anon_orders_always);
                break;
        case SET_INHERIT:
                set_bit(order, &huge_anon_orders_inherit);
                break;
        case SET_MADVISE:
                set_bit(order, &huge_anon_orders_madvise);
                break;
        default:
                break;
        }

        spin_unlock(&huge_anon_orders_lock);

        return count;
}

/*
 * Per-size "enabled" attribute, mode 0644, served by thpsize_enabled_show()
 * and thpsize_enabled_store().
 */
static struct kobj_attribute thpsize_enabled_attr =
        __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);

/* NULL-terminated attribute list for thpsize_attr_group. */
static struct attribute *thpsize_attrs[] = {
        &thpsize_enabled_attr.attr,
        NULL,
};

/* Unnamed group that thpsize_create() registers on each thpsize kobject. */
static const struct attribute_group thpsize_attr_group = {
        .attrs = thpsize_attrs,
};

/*
 * kobject type for struct thpsize: thpsize_release() frees the containing
 * object, and attribute access goes through kobj_sysfs_ops.
 */
static const struct kobj_type thpsize_ktype = {
        .release = &thpsize_release,
        .sysfs_ops = &kobj_sysfs_ops,
};

/* Per-CPU mTHP counters, zero-initialised; sum_mthp_stat() adds them up. */
DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
        unsigned long sum = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct mthp_stat *this = &per_cpu(mthp_stats, cpu);

                sum += this->stats[order][item];
        }

        return sum;
}

/*
 * Generate a read-only sysfs attribute for one mTHP counter.
 *
 * Expands to <_name>_show(), which prints sum_mthp_stat() of counter @_index
 * for the order of the thpsize owning the kobject, and to the matching
 * <_name>_attr definition. The use site supplies the trailing semicolon.
 */
#define DEFINE_MTHP_STAT_ATTR(_name, _index)                                \
static ssize_t _name##_show(struct kobject *kobj,                        \
                        struct kobj_attribute *attr, char *buf)                \
{                                                                        \
        int order = to_thpsize(kobj)->order;                                \
                                                                        \
        return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));        \
}                                                                        \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

/* One read-only attribute (and its _show handler) per exported mTHP counter. */
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);

/* NULL-terminated attribute list for the per-size "stats" group. */
static struct attribute *stats_attrs[] = {
        &anon_fault_alloc_attr.attr,
        &anon_fault_fallback_attr.attr,
        &anon_fault_fallback_charge_attr.attr,
        &swpout_attr.attr,
        &swpout_fallback_attr.attr,
        NULL,
};

static struct attribute_group stats_attr_group = {
        .name = "stats",
        .attrs = stats_attrs,
};

/*
 * Allocate a thpsize object for @order and expose it in sysfs under @parent
 * as "hugepages-<size>kB", with the "enabled" attribute group and the
 * "stats" group attached.
 *
 * Returns the new object, or an ERR_PTR() on failure. On failure nothing is
 * left registered and the object has been released.
 */
static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
        unsigned long size = (PAGE_SIZE << order) / SZ_1K;
        struct thpsize *thpsize;
        int ret;

        thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
        if (!thpsize)
                return ERR_PTR(-ENOMEM);

        /*
         * Record the order before any sysfs file exists: the show/store
         * handlers read it through to_thpsize(kobj)->order.
         */
        thpsize->order = order;

        ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
                                   "hugepages-%lukB", size);
        if (ret)
                goto err_put;

        ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
        if (ret)
                goto err_put;

        ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
        if (ret)
                goto err_put;

        return thpsize;

err_put:
        /*
         * Once kobject_init_and_add() has run, even unsuccessfully, the
         * object must be dropped with kobject_put(): that releases the
         * kobject's name and frees the structure via thpsize_release().
         * A bare kfree() would leak the name.
         */
        kobject_put(&thpsize->kobj);
        return ERR_PTR(ret);
}

/* kobject release callback (see thpsize_ktype): free the enclosing thpsize. */
static void thpsize_release(struct kobject *kobj)
{
        struct thpsize *owner = to_thpsize(kobj);

        kfree(owner);
}

/*
 * Create the "transparent_hugepage" sysfs directory under mm_kobj, register
 * the hugepage and khugepaged attribute groups on it, and create one thpsize
 * child per order in THP_ORDERS_ALL_ANON.
 *
 * On success *@hugepage_kobj holds the new top-level kobject and 0 is
 * returned. On failure everything registered so far is undone and a negative
 * errno is returned.
 */
static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
        int err;
        struct thpsize *thpsize;
        unsigned long orders;
        int order;

        /*
         * Default to setting PMD-sized THP to inherit the global setting and
         * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
         * constant so we have to do this here.
         */
        huge_anon_orders_inherit = BIT(PMD_ORDER);

        *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
        if (unlikely(!*hugepage_kobj)) {
                pr_err("failed to create transparent hugepage kobject\n");
                return -ENOMEM;
        }

        err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
        if (err) {
                pr_err("failed to register transparent hugepage group\n");
                goto delete_obj;
        }

        err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
        if (err) {
                pr_err("failed to register transparent hugepage group\n");
                goto remove_hp_group;
        }

        /* Walk the order bitmap from the highest set order downwards. */
        orders = THP_ORDERS_ALL_ANON;
        order = highest_order(orders);
        while (orders) {
                thpsize = thpsize_create(order, *hugepage_kobj);
                if (IS_ERR(thpsize)) {
                        pr_err("failed to create thpsize for order %d\n", order);
                        err = PTR_ERR(thpsize);
                        goto remove_all;
                }
                list_add(&thpsize->node, &thpsize_list);
                order = next_order(&orders, order);
        }

        return 0;

remove_all:
        /*
         * Both groups and possibly some thpsize objects exist at this point;
         * hugepage_exit_sysfs() tears all of them down, including the
         * top-level kobject.
         */
        hugepage_exit_sysfs(*hugepage_kobj);
        return err;
remove_hp_group:
        sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
        kobject_put(*hugepage_kobj);
        return err;
}

/*
 * Undo hugepage_init_sysfs(): unlink and drop every thpsize on thpsize_list,
 * remove the khugepaged and hugepage attribute groups, then drop the
 * reference on the top-level kobject.
 */
static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
        /* Pop entries from the head until the list is empty. */
        while (!list_empty(&thpsize_list)) {
                struct thpsize *entry;

                entry = list_first_entry(&thpsize_list, struct thpsize, node);
                list_del(&entry->node);
                kobject_put(&entry->kobj);
        }

        sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
        sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
        kobject_put(hugepage_kobj);
}
#else
/* Stub for builds without CONFIG_SYSFS: nothing to register, always succeeds. */
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
        return 0;
}

/* Stub for builds without CONFIG_SYSFS: nothing was registered, nothing to undo. */
static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

/*
 * Allocate and register the two THP shrinkers: "thp-zero" and
 * "thp-deferred_split".
 *
 * Both are allocated before either is registered. Returns 0 on success or
 * -ENOMEM if an allocation fails, in which case no shrinker is left allocated.
 */
static int __init thp_shrinker_init(void)
{
        huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
        if (!huge_zero_page_shrinker)
                goto nomem;

        deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
                                                 SHRINKER_MEMCG_AWARE |
                                                 SHRINKER_NONSLAB,
                                                 "thp-deferred_split");
        if (!deferred_split_shrinker)
                goto free_zero;

        huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
        huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
        shrinker_register(huge_zero_page_shrinker);

        deferred_split_shrinker->count_objects = deferred_split_count;
        deferred_split_shrinker->scan_objects = deferred_split_scan;
        shrinker_register(deferred_split_shrinker);

        return 0;

free_zero:
        shrinker_free(huge_zero_page_shrinker);
nomem:
        return -ENOMEM;
}

/* Free both shrinkers set up by thp_shrinker_init(). */
static void __init thp_shrinker_exit(void)
{
        shrinker_free(huge_zero_page_shrinker);
        shrinker_free(deferred_split_shrinker);
}

/*
 * Subsystem initcall for transparent hugepages.
 *
 * Sets up sysfs, khugepaged and the shrinkers in that order, then starts
 * khugepaged unless the machine has less than 512MB of RAM, in which case
 * all THP flags are cleared instead. Returns 0 on success or a negative
 * errno, having undone the steps that already succeeded.
 */
static int __init hugepage_init(void)
{
        int err;
        struct kobject *hugepage_kobj;

        if (!has_transparent_hugepage()) {
                /* Record that THP is unsupported; no other flag stays set. */
                transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
                return -EINVAL;
        }

        /*
         * hugepages can't be allocated by the buddy allocator
         */
        MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);

        err = hugepage_init_sysfs(&hugepage_kobj);
        if (err)
                goto err_sysfs;

        err = khugepaged_init();
        if (err)
                goto err_slab;

        err = thp_shrinker_init();
        if (err)
                goto err_shrinker;

        /*
         * By default disable transparent hugepages on smaller systems,
         * where the extra memory used could hurt more than TLB overhead
         * is likely to save.  The admin can still enable it through /sys.
         */
        if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
                transparent_hugepage_flags = 0;
                return 0;
        }

        err = start_stop_khugepaged();
        if (err)
                goto err_khugepaged;

        return 0;
        /* Unwind in reverse order of setup; each label falls through to the next. */
err_khugepaged:
        thp_shrinker_exit();
err_shrinker:
        khugepaged_destroy();
err_slab:
        hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
        return err;
}
subsys_initcall(hugepage_init);

/*
 * Parse the "transparent_hugepage=" boot parameter.
 *
 * "always", "madvise" and "never" select the state of
 * TRANSPARENT_HUGEPAGE_FLAG and TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG. Any other
 * value, or a missing one, logs a warning and leaves the flags untouched.
 *
 * Returns 1 if the value was recognised, 0 otherwise.
 */
static int __init setup_transparent_hugepage(char *str)
{
        bool want_always, want_madvise;

        if (!str)
                goto unparsed;

        if (!strcmp(str, "always")) {
                want_always = true;
                want_madvise = false;
        } else if (!strcmp(str, "madvise")) {
                want_always = false;
                want_madvise = true;
        } else if (!strcmp(str, "never")) {
                want_always = false;
                want_madvise = false;
        } else {
                goto unparsed;
        }

        /* The "always" flag is updated first, then the madvise flag. */
        if (want_always)
                set_bit(TRANSPARENT_HUGEPAGE_FLAG,
                        &transparent_hugepage_flags);
        else
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
                          &transparent_hugepage_flags);

        if (want_madvise)
                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                        &transparent_hugepage_flags);
        else
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                          &transparent_hugepage_flags);

        return 1;

unparsed:
        pr_warn("transparent_hugepage= cannot parse, ignored\n");
        return 0;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

/*
 * Return @pmd made writable via pmd_mkwrite() if @vma has VM_WRITE set,
 * otherwise return @pmd unchanged.
 */
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        if (unlikely(!(vma->vm_flags & VM_WRITE)))
                return pmd;

        return pmd_mkwrite(pmd, vma);
}

#ifdef CONFIG_MEMCG
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
        struct mem_cgroup *memcg = folio_memcg(folio);
        struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

        if (memcg)
                return &memcg->deferred_split_queue;
        else
                return &pgdat->deferred_split_queue;
}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
        struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

        return &pgdat->deferred_split_queue;
}
#endif

static inline bool is_transparent_hugepage(const struct folio *folio)
{
        if (!folio_test_large(folio))
                return false;

        return is_huge_zero_folio(folio) ||
                folio_test_large_rmappable(folio);
}

/*
 * Try to pick an unmapped address that is congruent to the file offset @off
 * modulo @size, by searching for a range padded by @size and then shifting
 * the result inside that range.
 *
 * Returns the chosen address, or 0 if no aligned placement is attempted or
 * the padded search fails. thp_get_unmapped_area_vmflags() treats 0 as a
 * request to retry without padding.
 */
static unsigned long __thp_get_unmapped_area(struct file *filp,
                unsigned long addr, unsigned long len,
                loff_t off, unsigned long flags, unsigned long size,
                vm_flags_t vm_flags)
{
        loff_t off_end = off + len;
        loff_t off_align = round_up(off, size);
        unsigned long len_pad, ret, off_sub;

        /* Not attempted on 32-bit kernels or for compat syscalls. */
        if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
                return 0;

        /* The range must hold at least @size bytes past the aligned offset. */
        if (off_end <= off_align || (off_end - off_align) < size)
                return 0;

        /* Pad by @size and reject wrap-around of the length or the offset. */
        len_pad = len + size;
        if (len_pad < len || (off + len_pad) < off)
                return 0;

        ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
                                           off >> PAGE_SHIFT, flags, vm_flags);

        /*
         * The failure might be due to length padding. The caller will retry
         * without the padding.
         */
        if (IS_ERR_VALUE(ret))
                return 0;

        /*
         * Do not try to align to THP boundary if allocation at the address
         * hint succeeds.
         */
        if (ret == addr)
                return addr;

        /* Distance to add so that the result is congruent to @off mod @size. */
        off_sub = (off - ret) & (size - 1);

        /* Already congruent on a top-down mm: use the address @size higher. */
        if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
                return ret + size;

        ret += off_sub;
        return ret;
}

/*
 * Find an unmapped area, preferring a PMD_SIZE-aligned placement.
 *
 * The aligned search is tried first; if it yields 0 the request falls back
 * to the ordinary mm_get_unmapped_area_vmflags() search with the original
 * length.
 */
unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags,
                vm_flags_t vm_flags)
{
        const loff_t off = (loff_t)pgoff << PAGE_SHIFT;
        unsigned long aligned;

        aligned = __thp_get_unmapped_area(filp, addr, len, off, flags,
                                          PMD_SIZE, vm_flags);
        if (!aligned)
                return mm_get_unmapped_area_vmflags(current->mm, filp, addr,
                                                    len, pgoff, flags,
                                                    vm_flags);

        return aligned;
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

/*
 * Map the freshly allocated huge @page at the PMD covering vmf->address.
 *
 * Charges the folio to the memcg, allocates a page table to deposit, zeroes
 * the page and installs the PMD under the PMD lock if it is still empty.
 * The folio reference is dropped on every path that does not map it.
 *
 * Returns 0 when the page was mapped or the PMD was already populated,
 * VM_FAULT_FALLBACK if the memcg charge fails, VM_FAULT_OOM if the page
 * table cannot be allocated, or the result of check_stable_address_space()
 * or handle_userfault().
 */
static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                        struct page *page, gfp_t gfp)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = page_folio(page);
        pgtable_t pgtable;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        vm_fault_t ret = 0;

        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

        if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
                /* Charge failed: drop the folio and account the fallback. */
                folio_put(folio);
                count_vm_event(THP_FAULT_FALLBACK);
                count_vm_event(THP_FAULT_FALLBACK_CHARGE);
                count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
                count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
                return VM_FAULT_FALLBACK;
        }
        folio_throttle_swaprate(folio, gfp);

        pgtable = pte_alloc_one(vma->vm_mm);
        if (unlikely(!pgtable)) {
                ret = VM_FAULT_OOM;
                goto release;
        }

        clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * clear_huge_page writes become visible before the set_pmd_at()
         * write.
         */
        __folio_mark_uptodate(folio);

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_none(*vmf->pmd))) {
                /* The PMD got populated meanwhile; ret is still 0 here. */
                goto unlock_release;
        } else {
                pmd_t entry;

                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        goto unlock_release;

                /* Deliver the page fault to userland */
                if (userfaultfd_missing(vma)) {
                        /* Release everything before handing off to userfaultfd. */
                        spin_unlock(vmf->ptl);
                        folio_put(folio);
                        pte_free(vma->vm_mm, pgtable);
                        ret = handle_userfault(vmf, VM_UFFD_MISSING);
                        VM_BUG_ON(ret & VM_FAULT_FALLBACK);
                        return ret;
                }

                /* Build a dirty entry, writable only if the VMA allows it. */
                entry = mk_huge_pmd(page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                folio_add_new_anon_rmap(folio, vma, haddr);
                folio_add_lru_vma(folio, vma);
                pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
                set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
                update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
                mm_inc_nr_ptes(vma->vm_mm);
                spin_unlock(vmf->ptl);
                count_vm_event(THP_FAULT_ALLOC);
                count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
                count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
        }

        return 0;
unlock_release:
        spin_unlock(vmf->ptl);
release:
        /* pgtable is NULL when arriving here from the allocation failure. */
        if (pgtable)
                pte_free(vma->vm_mm, pgtable);
        folio_put(folio);
        return ret;

}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *                  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *            available
 * never: never stall for any thp allocation
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
        const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

        /* Always do synchronous compaction */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

        /* Kick kcompactd and fail quickly */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

        /* Synchronous compaction if madvised, otherwise kick kcompactd */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE_LIGHT |
                        (vma_madvised ? __GFP_DIRECT_RECLAIM :
                                        __GFP_KSWAPD_RECLAIM);

        /* Only do synchronous compaction if madvised */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE_LIGHT |
                       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

        return GFP_TRANSHUGE_LIGHT;
}

/*
 * Map @zero_folio as a huge PMD at @haddr if the PMD is still empty, and
 * deposit @pgtable for it. Does nothing if the PMD is already populated.
 *
 * Caller must hold page table lock.
 */
static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
                struct folio *zero_folio)
{
        pmd_t zero_pmd;

        if (!pmd_none(*pmd))
                return;

        zero_pmd = pmd_mkhuge(mk_pmd(&zero_folio->page, vma->vm_page_prot));

        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, haddr, pmd, zero_pmd);
        mm_inc_nr_ptes(mm);
}

/*
 * Handle an anonymous page fault that may be served at PMD granularity.
 *
 * Read faults are served with the huge zero folio when the mm permits the
 * zero page and transparent_hugepage_use_zero_page() is true. Every other
 * fault allocates a PMD-order folio and maps it through
 * __do_huge_pmd_anonymous_page().
 *
 * Returns VM_FAULT_FALLBACK if the VMA is unsuitable for PMD_ORDER or the
 * huge (zero) folio cannot be obtained, VM_FAULT_OOM if the page table
 * allocation fails, a non-zero vmf_anon_prepare() result as is, or the
 * result of the mapping / userfault path.
 */
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        gfp_t gfp;
        struct folio *folio;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        vm_fault_t ret;

        if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
                return VM_FAULT_FALLBACK;
        ret = vmf_anon_prepare(vmf);
        if (ret)
                return ret;
        khugepaged_enter_vma(vma, vma->vm_flags);

        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm) &&
                        transparent_hugepage_use_zero_page()) {
                pgtable_t pgtable;
                struct folio *zero_folio;

                /*
                 * The function-level 'ret' is reused here; a second
                 * declaration in this scope would only shadow it.
                 */
                pgtable = pte_alloc_one(vma->vm_mm);
                if (unlikely(!pgtable))
                        return VM_FAULT_OOM;
                zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
                if (unlikely(!zero_folio)) {
                        pte_free(vma->vm_mm, pgtable);
                        count_vm_event(THP_FAULT_FALLBACK);
                        return VM_FAULT_FALLBACK;
                }
                vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
                ret = 0;
                if (pmd_none(*vmf->pmd)) {
                        ret = check_stable_address_space(vma->vm_mm);
                        if (ret) {
                                spin_unlock(vmf->ptl);
                                pte_free(vma->vm_mm, pgtable);
                        } else if (userfaultfd_missing(vma)) {
                                /* Let userfaultfd resolve the missing page. */
                                spin_unlock(vmf->ptl);
                                pte_free(vma->vm_mm, pgtable);
                                ret = handle_userfault(vmf, VM_UFFD_MISSING);
                                VM_BUG_ON(ret & VM_FAULT_FALLBACK);
                        } else {
                                /* pgtable is deposited by the helper. */
                                set_huge_zero_folio(pgtable, vma->vm_mm, vma,
                                                   haddr, vmf->pmd, zero_folio);
                                update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                                spin_unlock(vmf->ptl);
                        }
                } else {
                        /* PMD populated meanwhile: nothing to do, ret stays 0. */
                        spin_unlock(vmf->ptl);
                        pte_free(vma->vm_mm, pgtable);
                }
                return ret;
        }
        gfp = vma_thp_gfp_mask(vma);
        folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
        if (unlikely(!folio)) {
                count_vm_event(THP_FAULT_FALLBACK);
                count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
        }
        return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
}

/*
 * Install a huge PMD mapping @pfn at @addr, under the PMD lock.
 *
 * If the PMD is already populated, nothing new is installed; for a write
 * request the existing entry is made young, dirty and (if the VMA allows)
 * writable, provided it maps the same pfn. @pgtable, if non-NULL, is
 * deposited when a new entry is installed and freed otherwise.
 */
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
                pgtable_t pgtable)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t entry;
        spinlock_t *ptl;

        ptl = pmd_lock(mm, pmd);
        if (!pmd_none(*pmd)) {
                if (write) {
                        /* A different pfn is only expected for the huge zero pmd. */
                        if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
                                WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
                                goto out_unlock;
                        }
                        entry = pmd_mkyoung(*pmd);
                        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                        if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
                                update_mmu_cache_pmd(vma, addr, pmd);
                }

                goto out_unlock;
        }

        entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
        if (pfn_t_devmap(pfn))
                entry = pmd_mkdevmap(entry);
        if (write) {
                entry = pmd_mkyoung(pmd_mkdirty(entry));
                entry = maybe_pmd_mkwrite(entry, vma);
        }

        if (pgtable) {
                pgtable_trans_huge_deposit(mm, pmd, pgtable);
                mm_inc_nr_ptes(mm);
                /* Deposited: clear the pointer so it is not freed below. */
                pgtable = NULL;
        }

        set_pmd_at(mm, addr, pmd, entry);
        update_mmu_cache_pmd(vma, addr, pmd);

out_unlock:
        spin_unlock(ptl);
        /* Still non-NULL only if the table was not deposited. */
        if (pgtable)
                pte_free(mm, pgtable);
}

/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: VM_FAULT_SIGBUS if the PMD-aligned fault address lies outside the
 * VMA, VM_FAULT_OOM if a required page table cannot be allocated, otherwise
 * VM_FAULT_NOPAGE.
 */
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
        unsigned long addr = vmf->address & PMD_MASK;
        struct vm_area_struct *vma = vmf->vma;
        pgprot_t pgprot = vma->vm_page_prot;
        pgtable_t pgtable = NULL;

        /*
         * If we had pmd_special, we could avoid all these restrictions,
         * but we need to be consistent with PTEs and architectures that
         * can't support a 'special' bit.
         */
        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
                        !pfn_t_devmap(pfn));
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        /* Pre-allocate a page table only where the architecture deposits one. */
        if (arch_needs_pgtable_deposit()) {
                pgtable = pte_alloc_one(vma->vm_mm);
                if (!pgtable)
                        return VM_FAULT_OOM;
        }

        track_pfn_insert(vma, &pgprot, pfn);

        /* insert_pfn_pmd() deposits or frees pgtable. */
        insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Return @pud made writable via pud_mkwrite() if @vma has VM_WRITE set,
 * otherwise return @pud unchanged.
 */
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
        if (unlikely(!(vma->vm_flags & VM_WRITE)))
                return pud;

        return pud_mkwrite(pud);
}

/*
 * Install a huge PUD mapping @pfn at @addr, under the PUD lock, using the
 * VMA's page protection.
 *
 * If the PUD is already populated, nothing new is installed; for a write
 * request the existing entry is made young, dirty and (if the VMA allows)
 * writable, provided it maps the same pfn.
 */
static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
                pud_t *pud, pfn_t pfn, bool write)
{
        struct mm_struct *mm = vma->vm_mm;
        pgprot_t prot = vma->vm_page_prot;
        pud_t entry;
        spinlock_t *ptl;

        ptl = pud_lock(mm, pud);
        if (!pud_none(*pud)) {
                if (write) {
                        /* A different pfn is only expected for the huge zero pud. */
                        if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
                                WARN_ON_ONCE(!is_huge_zero_pud(*pud));
                                goto out_unlock;
                        }
                        entry = pud_mkyoung(*pud);
                        entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
                        if (pudp_set_access_flags(vma, addr, pud, entry, 1))
                                update_mmu_cache_pud(vma, addr, pud);
                }
                goto out_unlock;
        }

        entry = pud_mkhuge(pfn_t_pud(pfn, prot));
        if (pfn_t_devmap(pfn))
                entry = pud_mkdevmap(entry);
        if (write) {
                entry = pud_mkyoung(pud_mkdirty(entry));
                entry = maybe_pud_mkwrite(entry, vma);
        }
        set_pud_at(mm, addr, pud, entry);
        update_mmu_cache_pud(vma, addr, pud);

out_unlock:
        spin_unlock(ptl);
}

/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: VM_FAULT_SIGBUS if the PUD-aligned fault address lies outside the
 * VMA, otherwise VM_FAULT_NOPAGE.
 */
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
        unsigned long addr = vmf->address & PUD_MASK;
        struct vm_area_struct *vma = vmf->vma;
        pgprot_t pgprot = vma->vm_page_prot;

        /*
         * If we had pud_special, we could avoid all these restrictions,
         * but we need to be consistent with PTEs and architectures that
         * can't support a 'special' bit.
         */
        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
                        !pfn_t_devmap(pfn));
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        track_pfn_insert(vma, &pgprot, pfn);

        /* NOTE(review): insert_pfn_pud() takes no prot argument and reads vma->vm_page_prot itself, so the local pgprot passed to track_pfn_insert() is not forwarded — confirm intended. */
        insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

/*
 * Mark the huge PMD young, and also dirty when @write is set. The MMU cache
 * is updated only if pmdp_set_access_flags() reports a change.
 */
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
               pmd_t *pmd, bool write)
{
        pmd_t touched = pmd_mkyoung(*pmd);

        if (write)
                touched = pmd_mkdirty(touched);

        if (!pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
                                   pmd, touched, write))
                return;

        update_mmu_cache_pmd(vma, addr, pmd);
}

/*
 * Look up the page backing @addr inside a devmap huge PMD and take a
 * reference on it according to @flags. The PMD lock must be held.
 *
 * Returns NULL if write access is requested on a non-writable PMD or the PMD
 * is not a present devmap entry, ERR_PTR(-EEXIST) if neither FOLL_GET nor
 * FOLL_PIN is set, ERR_PTR(-EFAULT) if no dev_pagemap covers the pfn, an
 * ERR_PTR() carrying the try_grab_page() error, or the page on success.
 * *@pgmap is updated with the result of get_dev_pagemap().
 */
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
        unsigned long pfn = pmd_pfn(*pmd);
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
        int err;

        assert_spin_locked(pmd_lockptr(mm, pmd));

        if ((flags & FOLL_WRITE) && !pmd_write(*pmd))
                return NULL;

        if (!pmd_present(*pmd) || !pmd_devmap(*pmd))
                return NULL;

        if (flags & FOLL_TOUCH)
                touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);

        /*
         * device mapped pages can only be returned if the
         * caller will manage the page reference count.
         */
        if (!(flags & (FOLL_GET | FOLL_PIN)))
                return ERR_PTR(-EEXIST);

        /* Advance to the base page that @addr falls on within the PMD. */
        pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
        *pgmap = get_dev_pagemap(pfn, *pgmap);
        if (!*pgmap)
                return ERR_PTR(-EFAULT);

        page = pfn_to_page(pfn);
        err = try_grab_page(page, flags);
        if (err)
                return ERR_PTR(err);

        return page;
}

/*
 * Copy the huge PMD entry at @src_pmd (in @src_mm) to @dst_pmd (in @dst_mm)
 * for address @addr.
 *
 * Destination VMAs that are not anonymous are skipped: nothing is copied and
 * 0 is returned. Otherwise a page table is allocated up front, both PMD
 * locks are taken (destination first, source nested), and the source entry
 * is handled by type:
 *  - swap PMD (CONFIG_ARCH_ENABLE_THP_MIGRATION only): expected to be a
 *    migration entry; it is made readable in the source if it was not, and
 *    copied to the destination;
 *  - not a trans-huge PMD: the preallocated page table is freed and
 *    -EAGAIN is returned;
 *  - huge zero PMD: a reference is taken via mm_get_huge_zero_folio();
 *  - otherwise: the folio is referenced and its anon rmap duplicated; if
 *    that fails the source PMD is split and -EAGAIN is returned.
 * In the last two cases the source is write-protected and the destination
 * gets a write-protected, old copy of the entry.
 *
 * Returns 0 on success, -ENOMEM if the page table allocation failed, or
 * -EAGAIN as described above.
 */
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        spinlock_t *dst_ptl, *src_ptl;
        struct page *src_page;
        struct folio *src_folio;
        pmd_t pmd;
        pgtable_t pgtable = NULL;
        int ret = -ENOMEM;

        /* Skip if can be re-fill on fault */
        if (!vma_is_anonymous(dst_vma))
                return 0;

        /* Allocated before taking the PMD locks; deposited or freed below. */
        pgtable = pte_alloc_one(dst_mm);
        if (unlikely(!pgtable))
                goto out;

        /* Destination lock first, then the source lock as a nested lock. */
        dst_ptl = pmd_lock(dst_mm, dst_pmd);
        src_ptl = pmd_lockptr(src_mm, src_pmd);
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

        /* From here on, bailing out means "retry" unless ret is reset to 0. */
        ret = -EAGAIN;
        pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        if (unlikely(is_swap_pmd(pmd))) {
                swp_entry_t entry = pmd_to_swp_entry(pmd);

                VM_BUG_ON(!is_pmd_migration_entry(pmd));
                if (!is_readable_migration_entry(entry)) {
                        /*
                         * Downgrade the source entry to a readable migration
                         * entry, carrying over its soft-dirty and uffd-wp
                         * bits.
                         */
                        entry = make_readable_migration_entry(
                                                        swp_offset(entry));
                        pmd = swp_entry_to_pmd(entry);
                        if (pmd_swp_soft_dirty(*src_pmd))
                                pmd = pmd_swp_mksoft_dirty(pmd);
                        if (pmd_swp_uffd_wp(*src_pmd))
                                pmd = pmd_swp_mkuffd_wp(pmd);
                        set_pmd_at(src_mm, addr, src_pmd, pmd);
                }
                add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
                mm_inc_nr_ptes(dst_mm);
                pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
                /* The uffd-wp bit is only kept if dst_vma has uffd-wp. */
                if (!userfaultfd_wp(dst_vma))
                        pmd = pmd_swp_clear_uffd_wp(pmd);
                set_pmd_at(dst_mm, addr, dst_pmd, pmd);
                ret = 0;
                goto out_unlock;
        }
#endif

        if (unlikely(!pmd_trans_huge(pmd))) {
                /* Not a huge PMD: drop the unused page table, ret is -EAGAIN. */
                pte_free(dst_mm, pgtable);
                goto out_unlock;
        }
        /*
         * When page table lock is held, the huge zero pmd should not be
         * under splitting since we don't split the page itself, only pmd to
         * a page table.
         */
        if (is_huge_zero_pmd(pmd)) {
                /*
                 * mm_get_huge_zero_folio() will never allocate a new
                 * folio here, since we already have a zero page to
                 * copy. It just takes a reference.
                 */
                mm_get_huge_zero_folio(dst_mm);
                goto out_zero_page;
        }

        src_page = pmd_page(pmd);
        VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
        src_folio = page_folio(src_page);

        folio_get(src_folio);
        if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
                /* Page maybe pinned: split and retry the fault on PTEs. */
                folio_put(src_folio);
                pte_free(dst_mm, pgtable);
                /* Both locks are dropped here, before splitting the source. */
                spin_unlock(src_ptl);
                spin_unlock(dst_ptl);
                __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
                return -EAGAIN;
        }
        add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
        /* Shared tail for the huge zero PMD and the regular folio case. */
        mm_inc_nr_ptes(dst_mm);
        pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
        if (!userfaultfd_wp(dst_vma))
                pmd = pmd_clear_uffd_wp(pmd);
        pmd = pmd_mkold(pmd_wrprotect(pmd));
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);

        ret = 0;
out_unlock:
        spin_unlock(src_ptl);
        spin_unlock(dst_ptl);
out:
        return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Mark the huge PUD at @pud young and, if @write is set, dirty as well.
 * update_mmu_cache_pud() is only called when pudp_set_access_flags()
 * returns non-zero for the huge-page-aligned address.
 */
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
               pud_t *pud, bool write)
{
        pud_t entry = pud_mkyoung(*pud);
        unsigned long haddr = addr & HPAGE_PUD_MASK;

        if (write)
                entry = pud_mkdirty(entry);

        if (!pudp_set_access_flags(vma, haddr, pud, entry, write))
                return;

        update_mmu_cache_pud(vma, addr, pud);
}

/*
 * Copy the huge PUD entry at @src_pud (in @src_mm) to @dst_pud (in @dst_mm)
 * for address @addr.
 *
 * Both PUD locks are taken (destination first, source nested). The source
 * entry is write-protected and the destination receives a write-protected,
 * old copy of it.
 *
 * Returns 0 on success, or -EAGAIN if the source entry is neither a
 * trans-huge nor a devmap PUD.
 */
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
                  struct vm_area_struct *vma)
{
        spinlock_t *dst_ptl, *src_ptl;
        pud_t entry;
        int ret = -EAGAIN;

        dst_ptl = pud_lock(dst_mm, dst_pud);
        src_ptl = pud_lockptr(src_mm, src_pud);
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

        entry = *src_pud;
        if (unlikely(!pud_trans_huge(entry) && !pud_devmap(entry)))
                goto out_unlock;

        /*
         * With the page table lock held the huge zero pud cannot be under
         * splitting: only the pud is split into a page table, never the
         * page itself.
         */
        if (is_huge_zero_pud(entry)) {
                /* No huge zero pud yet */
        }

        /*
         * TODO: once we support anonymous pages, use
         * folio_try_dup_anon_rmap_*() and split if duplicating fails.
         */
        pudp_set_wrprotect(src_mm, addr, src_pud);
        entry = pud_wrprotect(entry);
        entry = pud_mkold(entry);
        set_pud_at(dst_mm, addr, dst_pud, entry);

        ret = 0;
out_unlock:
        spin_unlock(src_ptl);
        spin_unlock(dst_ptl);
        return ret;
}

/*
 * Under the PUD lock, touch the huge PUD the fault hit, provided it still
 * equals @orig_pud. The lock taken is stored in vmf->ptl and released
 * before returning.
 */
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
        struct vm_area_struct *vma = vmf->vma;
        bool is_write = vmf->flags & FAULT_FLAG_WRITE;

        vmf->ptl = pud_lock(vma->vm_mm, vmf->pud);

        /* Leave the entry alone if it changed since the fault was taken. */
        if (likely(pud_same(*vmf->pud, orig_pud)))
                touch_pud(vma, vmf->address, vmf->pud, is_write);

        spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void huge_pmd_set_accessed(struct vm_fault *vmf)
{
        bool write = vmf->flags & FAULT_FLAG_WRITE;

        vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
                goto unlock;

        touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);

unlock:
        spin_unlock(vmf->ptl);
}

/*
 * Handle a write-protect (or unshare, FAULT_FLAG_UNSHARE) fault on a huge
 * PMD. The VMA is expected to have an anon_vma (checked with VM_BUG_ON_VMA).
 *
 * If the page is, or can be made, exclusive to this mapping, the PMD is
 * made young/dirty and possibly writable in place ("reuse"); for an unshare
 * fault nothing more is done once exclusivity is established. Otherwise,
 * and always for the huge zero PMD, the PMD is split and the caller is told
 * to fall back.
 *
 * Returns 0 if the fault was handled here or the PMD changed under us, and
 * VM_FAULT_FALLBACK after splitting the PMD.
 */
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        struct page *page;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t orig_pmd = vmf->orig_pmd;

        /* Only the lock pointer is looked up here; it is taken further down. */
        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
        VM_BUG_ON_VMA(!vma->anon_vma, vma);

        /* The huge zero PMD is never reused: split it (no lock held yet). */
        if (is_huge_zero_pmd(orig_pmd))
                goto fallback;

        spin_lock(vmf->ptl);

        /* The PMD changed since the fault was taken: nothing to do here. */
        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                spin_unlock(vmf->ptl);
                return 0;
        }

        page = pmd_page(orig_pmd);
        folio = page_folio(page);
        VM_BUG_ON_PAGE(!PageHead(page), page);

        /* Early check when only holding the PT lock. */
        if (PageAnonExclusive(page))
                goto reuse;

        if (!folio_trylock(folio)) {
                /*
                 * Could not get the folio lock without sleeping: pin the
                 * folio, drop the PT lock, take the folio lock, then retake
                 * the PT lock and revalidate the PMD.
                 */
                folio_get(folio);
                spin_unlock(vmf->ptl);
                folio_lock(folio);
                spin_lock(vmf->ptl);
                if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                        spin_unlock(vmf->ptl);
                        folio_unlock(folio);
                        folio_put(folio);
                        return 0;
                }
                folio_put(folio);
        }

        /* Recheck after temporarily dropping the PT lock. */
        if (PageAnonExclusive(page)) {
                folio_unlock(folio);
                goto reuse;
        }

        /*
         * See do_wp_page(): we can only reuse the folio exclusively if
         * there are no additional references. Note that we always drain
         * the LRU cache immediately after adding a THP.
         */
        if (folio_ref_count(folio) >
                        1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
                goto unlock_fallback;
        if (folio_test_swapcache(folio))
                folio_free_swap(folio);
        if (folio_ref_count(folio) == 1) {
                pmd_t entry;

                folio_move_anon_rmap(folio, vma);
                SetPageAnonExclusive(page);
                folio_unlock(folio);
                /*
                 * "reuse" is also reached by goto from outside this block;
                 * the folio lock is not held at this point on any path, and
                 * 'entry' is assigned below before it is read.
                 */
reuse:
                if (unlikely(unshare)) {
                        /* Unshare only needs exclusivity, not write access. */
                        spin_unlock(vmf->ptl);
                        return 0;
                }
                entry = pmd_mkyoung(orig_pmd);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
                        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                spin_unlock(vmf->ptl);
                return 0;
        }

unlock_fallback:
        folio_unlock(folio);
        spin_unlock(vmf->ptl);
fallback:
        /* No locks are held here; split the PMD and let the caller fall back. */
        __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
        return VM_FAULT_FALLBACK;
}

/*
 * Decide whether the huge PMD value @pmd mapped at @addr in @vma may be
 * made writable directly, without going through a write fault.
 *
 * Returns false (with a one-time warning) if the VMA lacks VM_WRITE.
 */
static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
                                           unsigned long addr, pmd_t pmd)
{
        struct page *page;

        if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
                return false;

        /* Entries that are not even readable (NUMA hinting) stay untouched. */
        if (pmd_protnone(pmd))
                return false;

        /* Soft-dirty tracking still needs the write fault. */
        if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
                return false;

        /* uffd-wp tracking still needs the write fault. */
        if (userfaultfd_huge_pmd_wp(vma, pmd))
                return false;

        /* Shared mapping: see can_change_pte_writable(). */
        if (vma->vm_flags & VM_SHARED)
                return pmd_dirty(pmd);

        /* Private mapping: see can_change_pte_writable(). */
        page = vm_normal_page_pmd(vma, addr, pmd);
        if (!page)
                return false;
        if (!PageAnon(page))
                return false;
        return PageAnonExclusive(page);
}

/*
 * NUMA hinting page fault entry point for trans huge pmds.
 *
 * Under the PMD lock, checks that the PMD still equals vmf->orig_pmd, then
 * either restores the PMD with the VMA's page protection (no folio, or
 * numa_migrate_prep() returned NUMA_NO_NODE) or drops the lock and tries to
 * migrate the folio to the target node. If migration fails the PMD is
 * revalidated under the lock and restored.
 *
 * task_numa_fault() is only called by the path that actually resolved the
 * fault (migrated the folio or restored the PMD). If the PMD is found to
 * have changed, whoever changed it is responsible for the accounting, so
 * this function returns immediately to avoid counting the fault twice.
 *
 * Always returns 0.
 */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        pmd_t oldpmd = vmf->orig_pmd;
        pmd_t pmd;
        struct folio *folio;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        int nid = NUMA_NO_NODE;
        int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
        bool migrated = false, writable = false;
        int flags = 0;

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
                /* PMD changed under us: nothing to fix up or account. */
                spin_unlock(vmf->ptl);
                return 0;
        }

        pmd = pmd_modify(oldpmd, vma->vm_page_prot);

        /*
         * Detect now whether the PMD could be writable; this information
         * is only valid while holding the PT lock.
         */
        writable = pmd_write(pmd);
        if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
            can_change_pmd_writable(vma, vmf->address, pmd))
                writable = true;

        folio = vm_normal_folio_pmd(vma, haddr, pmd);
        if (!folio)
                goto out_map;

        /* See similar comment in do_numa_page for explanation */
        if (!writable)
                flags |= TNF_NO_GROUP;

        nid = folio_nid(folio);
        /*
         * For memory tiering mode, cpupid of slow memory page is used
         * to record page access time.  So use default value.
         */
        if (node_is_toptier(nid))
                last_cpupid = folio_last_cpupid(folio);
        target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
        if (target_nid == NUMA_NO_NODE) {
                folio_put(folio);
                goto out_map;
        }

        /* Migration runs without the PT lock; 'writable' is stale after this. */
        spin_unlock(vmf->ptl);
        writable = false;

        migrated = migrate_misplaced_folio(folio, vma, target_nid);
        if (migrated) {
                flags |= TNF_MIGRATED;
                nid = target_nid;
                task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
                return 0;
        }

        flags |= TNF_MIGRATE_FAIL;
        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
                /*
                 * Someone else changed the PMD while the lock was dropped.
                 * Do not call task_numa_fault() here: that would account
                 * the same fault a second time.
                 */
                spin_unlock(vmf->ptl);
                return 0;
        }

out_map:
        /* Restore the PMD */
        pmd = pmd_modify(oldpmd, vma->vm_page_prot);
        pmd = pmd_mkyoung(pmd);
        if (writable)
                pmd = pmd_mkwrite(pmd, vma);
        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
        spin_unlock(vmf->ptl);

        /* nid stays NUMA_NO_NODE when no folio was found above. */
        if (nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);

        return 0;
}

/*
 * Apply MADV_FREE to the huge PMD at @pmd covering [@addr, @next).
 *
 * Return true if we do MADV_FREE successfully on entire pmd page.
 * Otherwise, return false: the PMD could not be locked as a huge PMD, it
 * is the huge zero PMD or not present, the folio is likely mapped shared or
 * could not be locked without sleeping, or only part of the huge page was
 * requested (in which case the folio is split instead).
 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                pmd_t *pmd, unsigned long addr, unsigned long next)
{
        spinlock_t *ptl;
        pmd_t orig_pmd;
        struct folio *folio;
        struct mm_struct *mm = tlb->mm;
        bool ret = false;

        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

        ptl = pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                goto out_unlocked;

        orig_pmd = *pmd;
        if (is_huge_zero_pmd(orig_pmd))
                goto out;

        if (unlikely(!pmd_present(orig_pmd))) {
                /* A non-present huge PMD is expected to be a migration entry. */
                VM_BUG_ON(thp_migration_supported() &&
                                  !is_pmd_migration_entry(orig_pmd));
                goto out;
        }

        folio = pmd_folio(orig_pmd);
        /*
         * If other processes are mapping this folio, we couldn't discard
         * the folio unless they all do MADV_FREE so let's skip the folio.
         */
        if (folio_likely_mapped_shared(folio))
                goto out;

        /* The PT lock is held, so only a non-sleeping lock attempt is made. */
        if (!folio_trylock(folio))
                goto out;

        /*
         * If user want to discard part-pages of THP, split it so MADV_FREE
         * will deactivate only them.
         */
        if (next - addr != HPAGE_PMD_SIZE) {
                /*
                 * Take a reference before dropping the PT lock so the folio
                 * stays around for split_folio(); ret remains false.
                 */
                folio_get(folio);
                spin_unlock(ptl);
                split_folio(folio);
                folio_unlock(folio);
                folio_put(folio);
                goto out_unlocked;
        }

        if (folio_test_dirty(folio))
                folio_clear_dirty(folio);
        folio_unlock(folio);

        if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
                /* Rewrite the entry as old and clean, and queue a TLB flush. */
                pmdp_invalidate(vma, addr, pmd);
                orig_pmd = pmd_mkold(orig_pmd);
                orig_pmd = pmd_mkclean(orig_pmd);

                set_pmd_at(mm, addr, pmd, orig_pmd);
                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
        }

        folio_mark_lazyfree(folio);
        ret = true;
out:
        spin_unlock(ptl);
out_unlocked:
        return ret;
}

/*
 * Withdraw the page table deposited for @pmd, free it, and decrement the
 * mm's page-table count.
 */
static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
        pgtable_t deposited = pgtable_trans_huge_withdraw(mm, pmd);

        pte_free(mm, deposited);
        mm_dec_nr_ptes(mm);
}

/*
 * Tear down the huge PMD at @pmd mapping @addr in @vma.
 *
 * Returns 0 if __pmd_trans_huge_lock() did not return a lock (nothing is
 * done), and 1 once the entry has been cleared. The PMD lock is released
 * before returning on every path.
 */
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
{
        pmd_t orig_pmd;
        spinlock_t *ptl;

        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

        ptl = __pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                return 0;
        /*
         * For architectures like ppc64 we look at deposited pgtable
         * when calling pmdp_huge_get_and_clear. So do the
         * pgtable_trans_huge_withdraw after finishing pmdp related
         * operations.
         */
        orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
                                                tlb->fullmm);
        arch_check_zapped_pmd(vma, orig_pmd);
        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
        if (vma_is_special_huge(vma)) {
                /* Special huge mapping: only a deposited table may need freeing. */
                if (arch_needs_pgtable_deposit())
                        zap_deposited_table(tlb->mm, pmd);
                spin_unlock(ptl);
        } else if (is_huge_zero_pmd(orig_pmd)) {
                zap_deposited_table(tlb->mm, pmd);
                spin_unlock(ptl);
        } else {
                struct folio *folio = NULL;
                int flush_needed = 1;

                if (pmd_present(orig_pmd)) {
                        struct page *page = pmd_page(orig_pmd);

                        folio = page_folio(page);
                        folio_remove_rmap_pmd(folio, page, vma);
                        WARN_ON_ONCE(folio_mapcount(folio) < 0);
                        VM_BUG_ON_PAGE(!PageHead(page), page);
                } else if (thp_migration_supported()) {
                        swp_entry_t entry;

                        /* Non-present: expected to be a migration entry. */
                        VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
                        entry = pmd_to_swp_entry(orig_pmd);
                        folio = pfn_swap_entry_folio(entry);
                        flush_needed = 0;
                } else
                        WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");

                /*
                 * NOTE(review): folio is still NULL here if the WARN_ONCE
                 * branch above was taken, and is then passed to
                 * folio_test_anon() - confirm that branch is unreachable in
                 * practice.
                 */
                if (folio_test_anon(folio)) {
                        zap_deposited_table(tlb->mm, pmd);
                        add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
                } else {
                        if (arch_needs_pgtable_deposit())
                                zap_deposited_table(tlb->mm, pmd);
                        add_mm_counter(tlb->mm, mm_counter_file(folio),
                                       -HPAGE_PMD_NR);
                }

                spin_unlock(ptl);
                /* Only a present PMD hands its page to the mmu_gather. */
                if (flush_needed)
                        tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
        }
        return 1;
}

#ifndef pmd_move_must_withdraw
/*
 * Tell move_huge_pmd() whether the deposited page table has to travel with
 * the huge PMD. Returns non-zero only if the two PMD locks differ and @vma
 * is anonymous.
 */
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
                                         spinlock_t *old_pmd_ptl,
                                         struct vm_area_struct *vma)
{
        /*
         * With split pmd lock the preallocated PTE page table must also
         * move when new_pmd sits on a different PMD page table.
         */
        if (new_pmd_ptl == old_pmd_ptl)
                return 0;

        /* Tables are not deposited or withdrawn for file pages. */
        return vma_is_anonymous(vma);
}
#endif

/*
 * Return @pmd with its soft-dirty bit set, using the swap variant for
 * migration entries. Without CONFIG_MEM_SOFT_DIRTY, or for a PMD that is
 * neither a migration entry nor present, @pmd is returned unchanged.
 */
static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
        if (unlikely(is_pmd_migration_entry(pmd)))
                return pmd_swp_mksoft_dirty(pmd);
        if (pmd_present(pmd))
                return pmd_mksoft_dirty(pmd);
#endif
        return pmd;
}

/*
 * Move the huge PMD entry at @old_pmd / @old_addr to @new_pmd / @new_addr
 * within @vma's mm.
 *
 * Returns true if the entry was moved, false if the destination PMD is not
 * empty or the source could not be locked as a huge PMD.
 */
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *src_ptl, *dst_ptl;
        bool need_flush = false;
        pmd_t entry;

        /*
         * The destination pmd shouldn't be established, free_pgtables()
         * should have released it; but move_page_tables() might have already
         * inserted a page table, if racing against shmem/file collapse.
         */
        if (!pmd_none(*new_pmd)) {
                VM_BUG_ON(pmd_trans_huge(*new_pmd));
                return false;
        }

        /*
         * We don't have to worry about the ordering of src and dst
         * ptlocks because exclusive mmap_lock prevents deadlock.
         */
        src_ptl = __pmd_trans_huge_lock(old_pmd, vma);
        if (!src_ptl)
                return false;

        dst_ptl = pmd_lockptr(mm, new_pmd);
        if (dst_ptl != src_ptl)
                spin_lock_nested(dst_ptl, SINGLE_DEPTH_NESTING);

        entry = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
        /* Only a present entry needs the old range flushed afterwards. */
        if (pmd_present(entry))
                need_flush = true;
        VM_BUG_ON(!pmd_none(*new_pmd));

        if (pmd_move_must_withdraw(dst_ptl, src_ptl, vma)) {
                pgtable_t deposited;

                deposited = pgtable_trans_huge_withdraw(mm, old_pmd);
                pgtable_trans_huge_deposit(mm, new_pmd, deposited);
        }

        entry = move_soft_dirty_pmd(entry);
        set_pmd_at(mm, new_addr, new_pmd, entry);
        if (need_flush)
                flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);

        if (dst_ptl != src_ptl)
                spin_unlock(dst_ptl);
        spin_unlock(src_ptl);

        return true;
}

/*
 * Change the protection of the huge PMD at @pmd mapping @addr to @newprot,
 * honouring the MM_CP_* bits in @cp_flags (NUMA hinting, uffd-wp set and
 * resolve, try-change-writable).
 *
 * Returns
 *  - 0 if PMD could not be locked
 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
 *      or if prot_numa but THP migration is not supported
 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
 */
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
                    unsigned long cp_flags)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pmd_t oldpmd, entry;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
        int ret = 1;

        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

        if (prot_numa && !thp_migration_supported())
                return 1;

        ptl = __pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                return 0;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        if (is_swap_pmd(*pmd)) {
                /* Note: this 'entry' shadows the outer pmd_t 'entry'. */
                swp_entry_t entry = pmd_to_swp_entry(*pmd);
                struct folio *folio = pfn_swap_entry_folio(entry);
                pmd_t newpmd;

                VM_BUG_ON(!is_pmd_migration_entry(*pmd));
                if (is_writable_migration_entry(entry)) {
                        /*
                         * A protection check is difficult so
                         * just be safe and disable write
                         */
                        if (folio_test_anon(folio))
                                entry = make_readable_exclusive_migration_entry(swp_offset(entry));
                        else
                                entry = make_readable_migration_entry(swp_offset(entry));
                        newpmd = swp_entry_to_pmd(entry);
                        if (pmd_swp_soft_dirty(*pmd))
                                newpmd = pmd_swp_mksoft_dirty(newpmd);
                } else {
                        newpmd = *pmd;
                }

                if (uffd_wp)
                        newpmd = pmd_swp_mkuffd_wp(newpmd);
                else if (uffd_wp_resolve)
                        newpmd = pmd_swp_clear_uffd_wp(newpmd);
                /* Only write the entry back if something actually changed. */
                if (!pmd_same(*pmd, newpmd))
                        set_pmd_at(mm, addr, pmd, newpmd);
                /* ret is still 1 here: no TLB flush is requested. */
                goto unlock;
        }
#endif

        if (prot_numa) {
                struct folio *folio;
                bool toptier;
                /*
                 * Avoid trapping faults against the zero page. The read-only
                 * data is likely to be read-cached on the local CPU and
                 * local/remote hits to the zero page are not interesting.
                 */
                if (is_huge_zero_pmd(*pmd))
                        goto unlock;

                /* Already a NUMA hinting (protnone) entry: nothing to do. */
                if (pmd_protnone(*pmd))
                        goto unlock;

                folio = pmd_folio(*pmd);
                toptier = node_is_toptier(folio_nid(folio));
                /*
                 * Skip scanning top tier node if normal numa
                 * balancing is disabled
                 */
                if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
                    toptier)
                        goto unlock;

                /* In memory tiering mode, stamp non-toptier folios with "now". */
                if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
                    !toptier)
                        folio_xchg_access_time(folio,
                                               jiffies_to_msecs(jiffies));
        }
        /*
         * In case prot_numa, we are under mmap_read_lock(mm). It's critical
         * to not clear pmd intermittently to avoid race with MADV_DONTNEED
         * which is also under mmap_read_lock(mm):
         *
         *        CPU0:                                CPU1:
         *                                change_huge_pmd(prot_numa=1)
         *                                 pmdp_huge_get_and_clear_notify()
         * madvise_dontneed()
         *  zap_pmd_range()
         *   pmd_trans_huge(*pmd) == 0 (without ptl)
         *   // skip the pmd
         *                                 set_pmd_at();
         *                                 // pmd is re-established
         *
         * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
         * which may break userspace.
         *
         * pmdp_invalidate_ad() is required to make sure we don't miss
         * dirty/young flags set by hardware.
         */
        oldpmd = pmdp_invalidate_ad(vma, addr, pmd);

        entry = pmd_modify(oldpmd, newprot);
        if (uffd_wp)
                entry = pmd_mkuffd_wp(entry);
        else if (uffd_wp_resolve)
                /*
                 * Leave the write bit to be handled by PF interrupt
                 * handler, then things like COW could be properly
                 * handled.
                 */
                entry = pmd_clear_uffd_wp(entry);

        /* See change_pte_range(). */
        if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
            can_change_pmd_writable(vma, addr, entry))
                entry = pmd_mkwrite(entry, vma);

        ret = HPAGE_PMD_NR;
        set_pmd_at(mm, addr, pmd, entry);

        if (huge_pmd_needs_flush(oldpmd, entry))
                tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
unlock:
        spin_unlock(ptl);
        return ret;
}

#ifdef CONFIG_USERFAULTFD
/*
 * Move the huge page mapped by @src_pmd at @src_addr to @dst_pmd at
 * @dst_addr, if possible.
 *
 * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
 * the caller; this function always releases the source PT lock before it
 * returns.
 *
 * Return zero if succeeded in moving the page, -EAGAIN if it needs to be
 * repeated by the caller, or other errors in case of failure (as visible
 * here: -EINVAL for failed sanity checks, -ENOENT if the source is not a
 * huge PMD, -EBUSY if the page is not exclusive or may be pinned).
 */
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
                        struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                        unsigned long dst_addr, unsigned long src_addr)
{
        pmd_t _dst_pmd, src_pmdval;
        struct page *src_page;
        struct folio *src_folio;
        struct anon_vma *src_anon_vma;
        spinlock_t *src_ptl, *dst_ptl;
        pgtable_t src_pgtable;
        struct mmu_notifier_range range;
        int err = 0;

        /* Snapshot of the source entry; revalidated once both locks are held. */
        src_pmdval = *src_pmd;
        src_ptl = pmd_lockptr(mm, src_pmd);

        lockdep_assert_held(src_ptl);
        vma_assert_locked(src_vma);
        vma_assert_locked(dst_vma);

        /* Sanity checks before the operation */
        if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
            WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
                spin_unlock(src_ptl);
                return -EINVAL;
        }

        if (!pmd_trans_huge(src_pmdval)) {
                spin_unlock(src_ptl);
                if (is_pmd_migration_entry(src_pmdval)) {
                        /*
                         * NOTE(review): &src_pmdval is the address of the
                         * local copy, not of the page-table slot src_pmd -
                         * confirm pmd_migration_entry_wait() is fine being
                         * handed a stack address here.
                         */
                        pmd_migration_entry_wait(mm, &src_pmdval);
                        return -EAGAIN;
                }
                return -ENOENT;
        }

        src_page = pmd_page(src_pmdval);

        if (!is_huge_zero_pmd(src_pmdval)) {
                if (unlikely(!PageAnonExclusive(src_page))) {
                        spin_unlock(src_ptl);
                        return -EBUSY;
                }

                /* Hold a reference across the window where the PT lock is dropped. */
                src_folio = page_folio(src_page);
                folio_get(src_folio);
        } else
                /* Huge zero PMD: src_folio == NULL marks this case below. */
                src_folio = NULL;

        spin_unlock(src_ptl);

        flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
                                src_addr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        if (src_folio) {
                folio_lock(src_folio);

                /*
                 * split_huge_page walks the anon_vma chain without the page
                 * lock. Serialize against it with the anon_vma lock, the page
                 * lock is not enough.
                 */
                src_anon_vma = folio_get_anon_vma(src_folio);
                if (!src_anon_vma) {
                        err = -EAGAIN;
                        goto unlock_folio;
                }
                anon_vma_lock_write(src_anon_vma);
        } else
                src_anon_vma = NULL;

        dst_ptl = pmd_lockptr(mm, dst_pmd);
        double_pt_lock(src_ptl, dst_ptl);
        /* Either entry changed while the source lock was dropped: retry. */
        if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
                     !pmd_same(*dst_pmd, dst_pmdval))) {
                err = -EAGAIN;
                goto unlock_ptls;
        }
        if (src_folio) {
                if (folio_maybe_dma_pinned(src_folio) ||
                    !PageAnonExclusive(&src_folio->page)) {
                        err = -EBUSY;
                        goto unlock_ptls;
                }

                if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
                    WARN_ON_ONCE(!folio_test_anon(src_folio))) {
                        err = -EBUSY;
                        goto unlock_ptls;
                }

                src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
                /* Folio got pinned from under us. Put it back and fail the move. */
                if (folio_maybe_dma_pinned(src_folio)) {
                        set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
                        err = -EBUSY;
                        goto unlock_ptls;
                }

                /* Re-home the folio's rmap and index to the destination VMA. */
                folio_move_anon_rmap(src_folio, dst_vma);
                src_folio->index = linear_page_index(dst_vma, dst_addr);

                _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
                /* Follow mremap() behavior and treat the entry dirty after the move */
                _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
        } else {
                /* Huge zero PMD: rebuilt with dst_vma's protection, no dirty/write. */
                src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
                _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
        }
        set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);

        /* The deposited page table moves along with the huge PMD. */
        src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
        pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
unlock_ptls:
        double_pt_unlock(src_ptl, dst_ptl);
        if (src_anon_vma) {
                anon_vma_unlock_write(src_anon_vma);
                put_anon_vma(src_anon_vma);
        }
unlock_folio:
        /* unblock rmap walks */
        if (src_folio)
                folio_unlock(src_folio);
        mmu_notifier_invalidate_range_end(&range);
        /* Drop the reference taken before the source PT lock was released. */
        if (src_folio)
                folio_put(src_folio);
        return err;
}
#endif /* CONFIG_USERFAULTFD */

/*
 * Take the page table lock for @pmd and keep it held only if the entry is a
 * swap pmd, a transparent huge pmd or a devmap pmd.
 *
 * Return: the held lock pointer in that case (the caller must unlock it),
 * NULL otherwise (the lock has already been dropped).
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
        spinlock_t *held = pmd_lock(vma->vm_mm, pmd);
        spinlock_t *ret = NULL;

        if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
                   pmd_devmap(*pmd)))
                ret = held;
        else
                spin_unlock(held);

        return ret;
}

/*
 * Take the page table lock for @pud and keep it held only if the entry is a
 * transparent huge pud or a devmap pud.
 *
 * Return: the held lock pointer in that case (the caller must unlock it),
 * NULL otherwise (the lock has already been dropped).
 */
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
        spinlock_t *held = pud_lock(vma->vm_mm, pud);
        spinlock_t *ret = NULL;

        if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
                ret = held;
        else
                spin_unlock(held);

        return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Clear a huge PUD entry and queue its TLB invalidation in @tlb.
 *
 * Return: 0 if @pud is not a huge/devmap PUD (nothing done, no lock held),
 * 1 after the entry has been cleared.
 */
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pud_t *pud, unsigned long addr)
{
        spinlock_t *pud_ptl = __pud_trans_huge_lock(pud, vma);

        if (!pud_ptl)
                return 0;

        pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
        tlb_remove_pud_tlb_entry(tlb, pud, addr);

        if (!vma_is_special_huge(vma)) {
                /* No support for anonymous PUD pages yet */
                BUG();
        } else {
                /* No zero page support yet */
                spin_unlock(pud_ptl);
        }

        return 1;
}

/*
 * "Split" the huge PUD at @haddr by clearing and flushing the entry; no
 * lower-level table is installed here. The only caller in this file,
 * __split_huge_pud(), holds the PUD lock around this call.
 */
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
                unsigned long haddr)
{
        /* @haddr must be PUD-aligned ... */
        VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
        /* ... and the whole huge range must lie inside @vma. */
        VM_BUG_ON_VMA(haddr < vma->vm_start, vma);
        VM_BUG_ON_VMA(haddr + HPAGE_PUD_SIZE > vma->vm_end, vma);
        /* The entry must still be a huge or devmap PUD. */
        VM_BUG_ON(!(pud_trans_huge(*pud) || pud_devmap(*pud)));

        count_vm_event(THP_SPLIT_PUD);
        pudp_huge_clear_flush(vma, haddr, pud);
}

/*
 * Split the huge PUD that covers @address in @vma, if @pud still is a huge or
 * devmap PUD once the PUD lock is held. The whole PUD-sized range is
 * bracketed by an MMU notifier invalidation.
 */
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
                unsigned long address)
{
        unsigned long haddr = address & HPAGE_PUD_MASK;
        struct mmu_notifier_range range;
        spinlock_t *pud_ptl;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                haddr, haddr + HPAGE_PUD_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        pud_ptl = pud_lock(vma->vm_mm, pud);
        /* Re-check under the lock: only a huge/devmap PUD gets split. */
        if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
                goto unlock;
        __split_huge_pud_locked(vma, pud, range.start);

unlock:
        spin_unlock(pud_ptl);
        mmu_notifier_invalidate_range_end(&range);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

/*
 * Replace a huge zero page PMD at @haddr with a page table whose
 * HPAGE_PMD_NR entries all map the (small) zero page, carrying over the
 * uffd-wp bit of the old PMD.
 */
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
                unsigned long haddr, pmd_t *pmd)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t old_pmd, tmp_pmd;
        pgtable_t pgtable;
        pte_t *ptes;
        int idx;

        /*
         * The pmd stays empty until every pte has been written. Delaying the
         * notification until mmu_notifier_invalidate_range_end() is fine,
         * because a write-protected zero pmd is being replaced by
         * write-protected zero ptes.
         *
         * See Documentation/mm/mmu_notifier.rst
         */
        old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);

        /* Fill the withdrawn table through a scratch pmd, not the live one. */
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &tmp_pmd, pgtable);

        ptes = pte_offset_map(&tmp_pmd, haddr);
        VM_BUG_ON(!ptes);
        for (idx = 0; idx < HPAGE_PMD_NR; idx++) {
                unsigned long addr = haddr + idx * PAGE_SIZE;
                pte_t entry;

                entry = pte_mkspecial(pfn_pte(my_zero_pfn(addr),
                                              vma->vm_page_prot));
                if (pmd_uffd_wp(old_pmd))
                        entry = pte_mkuffd_wp(entry);
                VM_BUG_ON(!pte_none(ptep_get(ptes + idx)));
                set_pte_at(mm, addr, ptes + idx, entry);
        }
        /* Unmap via the last pte that was written. */
        pte_unmap(ptes + HPAGE_PMD_NR - 1);
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
}

/*
 * Split the huge PMD entry at @pmd, covering [@haddr, @haddr + HPAGE_PMD_SIZE)
 * in @vma.
 *
 * For non-anonymous VMAs the PMD is simply cleared (with rmap, references and
 * counters adjusted as needed); no PTEs are installed. For anonymous VMAs the
 * deposited page table is withdrawn, filled with HPAGE_PMD_NR entries --
 * migration entries when @freeze is set or the PMD already was a migration
 * entry, present PTEs otherwise -- and then installed in place of the huge
 * PMD.
 *
 * NOTE(review): the caller presumably has to hold the PMD page table lock, as
 * __split_huge_pmd() does -- confirm for other callers.
 */
static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long haddr, bool freeze)
{
        struct mm_struct *mm = vma->vm_mm;
        struct folio *folio;
        struct page *page;
        pgtable_t pgtable;
        pmd_t old_pmd, _pmd;
        bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
        bool anon_exclusive = false, dirty = false;
        unsigned long addr;
        pte_t *pte;
        int i;

        /* @haddr must be PMD-aligned and the huge range must lie in @vma. */
        VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
        VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
                                && !pmd_devmap(*pmd));

        count_vm_event(THP_SPLIT_PMD);

        if (!vma_is_anonymous(vma)) {
                old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
                /*
                 * We are going to unmap this huge page. So
                 * just go ahead and zap it
                 */
                if (arch_needs_pgtable_deposit())
                        zap_deposited_table(mm, pmd);
                /* Special huge mappings: skip the rmap/counter updates below. */
                if (vma_is_special_huge(vma))
                        return;
                if (unlikely(is_pmd_migration_entry(old_pmd))) {
                        swp_entry_t entry;

                        /*
                         * Migration entry: only look up the folio for the
                         * counter update; no rmap removal or folio_put() is
                         * done on this path.
                         */
                        entry = pmd_to_swp_entry(old_pmd);
                        folio = pfn_swap_entry_folio(entry);
                } else {
                        page = pmd_page(old_pmd);
                        folio = page_folio(page);
                        /* Transfer dirty/young state from the PMD to the folio. */
                        if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
                                folio_mark_dirty(folio);
                        if (!folio_test_referenced(folio) && pmd_young(old_pmd))
                                folio_set_referenced(folio);
                        folio_remove_rmap_pmd(folio, page, vma);
                        folio_put(folio);
                }
                add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
                return;
        }

        if (is_huge_zero_pmd(*pmd)) {
                /*
                 * FIXME: Do we want to invalidate secondary mmu by calling
                 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
                 * inside __split_huge_pmd() ?
                 *
                 * We are going from a zero huge page write protected to zero
                 * small page also write protected so it does not seems useful
                 * to invalidate secondary mmu at this time.
                 */
                return __split_huge_zero_page_pmd(vma, haddr, pmd);
        }

        pmd_migration = is_pmd_migration_entry(*pmd);
        if (unlikely(pmd_migration)) {
                swp_entry_t entry;

                /*
                 * The PMD already is a migration entry: collect the state to
                 * propagate from the swap entry itself. Note that "folio" is
                 * not set on this path; it is only used below when
                 * !pmd_migration.
                 */
                old_pmd = *pmd;
                entry = pmd_to_swp_entry(old_pmd);
                page = pfn_swap_entry_to_page(entry);
                write = is_writable_migration_entry(entry);
                if (PageAnon(page))
                        anon_exclusive = is_readable_exclusive_migration_entry(entry);
                young = is_migration_entry_young(entry);
                dirty = is_migration_entry_dirty(entry);
                soft_dirty = pmd_swp_soft_dirty(old_pmd);
                uffd_wp = pmd_swp_uffd_wp(old_pmd);
        } else {
                /*
                 * Up to this point the pmd is present and huge and userland has
                 * full access to the hugepage during the split (which
                 * happens in place). If we overwrite the pmd with the not-huge
                 * version pointing to the pte here (which of course we could if
                 * all CPUs were bug free), userland could trigger a small page
                 * size TLB miss on the small sized TLB while the hugepage TLB
                 * entry is still established in the huge TLB. Some CPUs don't
                 * like that. See
                 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
                 * 383 on page 105. Intel should be safe but it also warns that
                 * it's only safe if the permission and cache attributes of the
                 * two entries loaded in the two TLBs are identical (which should
                 * be the case here). But it is generally safer to never allow
                 * small and huge TLB entries for the same virtual address to be
                 * loaded simultaneously. So instead of doing "pmd_populate();
                 * flush_pmd_tlb_range();" we first mark the current pmd
                 * notpresent (atomically because here the pmd_trans_huge must
                 * remain set at all times on the pmd until the split is
                 * complete for this pmd), then we flush the SMP TLB and finally
                 * we write the non-huge version of the pmd entry with
                 * pmd_populate.
                 */
                old_pmd = pmdp_invalidate(vma, haddr, pmd);
                page = pmd_page(old_pmd);
                folio = page_folio(page);
                if (pmd_dirty(old_pmd)) {
                        dirty = true;
                        folio_set_dirty(folio);
                }
                write = pmd_write(old_pmd);
                young = pmd_young(old_pmd);
                soft_dirty = pmd_soft_dirty(old_pmd);
                uffd_wp = pmd_uffd_wp(old_pmd);

                VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
                VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

                /*
                 * Without "freeze", we'll simply split the PMD, propagating the
                 * PageAnonExclusive() flag for each PTE by setting it for
                 * each subpage -- no need to (temporarily) clear.
                 *
                 * With "freeze" we want to replace mapped pages by
                 * migration entries right away. This is only possible if we
                 * managed to clear PageAnonExclusive() -- see
                 * set_pmd_migration_entry().
                 *
                 * In case we cannot clear PageAnonExclusive(), split the PMD
                 * only and let try_to_migrate_one() fail later.
                 *
                 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
                 */
                anon_exclusive = PageAnonExclusive(page);
                if (freeze && anon_exclusive &&
                    folio_try_share_anon_rmap_pmd(folio, page))
                        freeze = false;
                if (!freeze) {
                        rmap_t rmap_flags = RMAP_NONE;

                        /*
                         * HPAGE_PMD_NR PTE mappings are about to be created:
                         * take HPAGE_PMD_NR - 1 extra references (presumably
                         * on top of the one the PMD mapping already holds)
                         * and add the PTE-level rmaps.
                         */
                        folio_ref_add(folio, HPAGE_PMD_NR - 1);
                        if (anon_exclusive)
                                rmap_flags |= RMAP_EXCLUSIVE;
                        folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
                                                 vma, haddr, rmap_flags);
                }
        }

        /*
         * Withdraw the table only after we mark the pmd entry invalid.
         * This is critical for some architectures (Power).
         */
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        /* Fill the table through the on-stack _pmd, not the live entry. */
        pmd_populate(mm, &_pmd, pgtable);

        pte = pte_offset_map(&_pmd, haddr);
        VM_BUG_ON(!pte);

        /*
         * Note that NUMA hinting access restrictions are not transferred to
         * avoid any possibility of altering permissions across VMAs.
         */
        if (freeze || pmd_migration) {
                /*
                 * Install one migration entry per subpage, carrying over the
                 * write/exclusive/young/dirty/soft-dirty/uffd-wp state.
                 */
                for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
                        pte_t entry;
                        swp_entry_t swp_entry;

                        if (write)
                                swp_entry = make_writable_migration_entry(
                                                        page_to_pfn(page + i));
                        else if (anon_exclusive)
                                swp_entry = make_readable_exclusive_migration_entry(
                                                        page_to_pfn(page + i));
                        else
                                swp_entry = make_readable_migration_entry(
                                                        page_to_pfn(page + i));
                        if (young)
                                swp_entry = make_migration_entry_young(swp_entry);
                        if (dirty)
                                swp_entry = make_migration_entry_dirty(swp_entry);
                        entry = swp_entry_to_pte(swp_entry);
                        if (soft_dirty)
                                entry = pte_swp_mksoft_dirty(entry);
                        if (uffd_wp)
                                entry = pte_swp_mkuffd_wp(entry);

                        VM_WARN_ON(!pte_none(ptep_get(pte + i)));
                        set_pte_at(mm, addr, pte + i, entry);
                }
        } else {
                pte_t entry;

                /*
                 * Build the first present PTE from the state collected above
                 * and let set_ptes() install all HPAGE_PMD_NR entries.
                 */
                entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
                if (write)
                        entry = pte_mkwrite(entry, vma);
                if (!young)
                        entry = pte_mkold(entry);
                /* NOTE: this may set soft-dirty too on some archs */
                if (dirty)
                        entry = pte_mkdirty(entry);
                if (soft_dirty)
                        entry = pte_mksoft_dirty(entry);
                if (uffd_wp)
                        entry = pte_mkuffd_wp(entry);

                /* The withdrawn table is expected to be completely empty. */
                for (i = 0; i < HPAGE_PMD_NR; i++)
                        VM_WARN_ON(!pte_none(ptep_get(pte + i)));

                set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
        }
        pte_unmap(pte);

        /* A present PMD had a PMD-level rmap: drop it. */
        if (!pmd_migration)
                folio_remove_rmap_pmd(folio, page, vma);
        /*
         * With freeze nothing maps the page any more; drop one reference
         * (presumably the one the removed PMD mapping held -- confirm).
         */
        if (freeze)
                put_page(page);

        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
}

/*
 * Split the huge PMD covering @address in @vma, bracketed by an MMU notifier
 * invalidation of the whole PMD-sized range.
 *
 * @freeze: passed on to __split_huge_pmd_locked(); requires @folio.
 * @folio:  if non-NULL, split only when @pmd maps exactly this folio; it is
 *          expected to be locked.
 */
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct folio *folio)
{
        unsigned long haddr = address & HPAGE_PMD_MASK;
        struct mmu_notifier_range range;
        spinlock_t *pmd_ptl;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                haddr, haddr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);
        pmd_ptl = pmd_lock(vma->vm_mm, pmd);

        /*
         * Setting up migration entries requires a folio to compare the pmd
         * against; otherwise the wrong folio could end up being replaced.
         */
        VM_BUG_ON(freeze && !folio);
        VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));

        /* Nothing to split unless the entry is huge, devmap or migration. */
        if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) &&
            !is_pmd_migration_entry(*pmd))
                goto unlock;

        /*
         * Calling pmd_page is safe when folio is set, because the pmd is
         * then guaranteed to be present.
         */
        if (folio && folio != pmd_folio(*pmd))
                goto unlock;

        __split_huge_pmd_locked(vma, pmd, range.start, freeze);

unlock:
        spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(&range);
}

/*
 * Look up the pmd for @address in @vma's mm and, if one is found, split it
 * via __split_huge_pmd(). @freeze and @folio are passed through unchanged.
 */
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
                bool freeze, struct folio *folio)
{
        pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);

        if (pmd)
                __split_huge_pmd(vma, pmd, address, freeze, folio);
}

/*
 * Split the huge pmd around @address when @address is not hpage aligned and
 * the hpage-sized range surrounding it lies entirely within @vma, i.e. when
 * a huge pmd could previously have spanned @address.
 */
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
{
        unsigned long hstart, hend;

        /* An aligned address cannot fall in the middle of a huge pmd. */
        if (IS_ALIGNED(address, HPAGE_PMD_SIZE))
                return;

        hstart = ALIGN_DOWN(address, HPAGE_PMD_SIZE);
        hend = ALIGN(address, HPAGE_PMD_SIZE);
        if (!range_in_vma(vma, hstart, hend))
                return;

        split_huge_pmd_address(vma, address, false, NULL);
}

/*
 * Split any huge pmd that the new boundaries @start and @end of @vma would
 * cut through. When @adjust_next is positive, do the same for the new start
 * of the vma found at @vma->vm_end.
 */
void vma_adjust_trans_huge(struct vm_area_struct *vma,
                             unsigned long start,
                             unsigned long end,
                             long adjust_next)
{
        struct vm_area_struct *next;

        /* First the new start, then the new end of @vma. */
        split_huge_pmd_if_needed(vma, start);
        split_huge_pmd_if_needed(vma, end);

        /* The next vma's vm_start only moves when adjust_next is positive. */
        if (adjust_next <= 0)
                return;

        next = find_vma(vma->vm_mm, vma->vm_end);
        split_huge_pmd_if_needed(next, next->vm_start + adjust_next);
}

static void unmap_folio(struct folio *folio)
{
        enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
                TTU_BATCH_FLUSH;

        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

        if (folio_test_pmd_mappable(folio))
                ttu_flags |= TTU_SPLIT_HUGE_PMD;

        /*
         * Anon pages need migration entries to preserve them, but file
         * pages can simply be left unmapped, then faulted back on demand.
         * If that is ever changed (perhaps for mlock), update remap_page().
         */
        if (folio_test_anon(folio))
                try_to_migrate(folio, ttu_flags);
        else
                try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);

        try_to_unmap_flush();
}

/*
 * Remove the migration entries again for an anonymous folio and the folios
 * following it, until at least @nr pages have been covered. Non-anonymous
 * folios are left alone.
 */
static void remap_page(struct folio *folio, unsigned long nr)
{
        int done;

        /* If unmap_folio() uses try_to_migrate() on file, remove this check */
        if (!folio_test_anon(folio))
                return;

        remove_migration_ptes(folio, folio, true);
        done = folio_nr_pages(folio);

        while (done < nr) {
                folio = folio_next(folio);
                remove_migration_ptes(folio, folio, true);
                done += folio_nr_pages(folio);
        }
}

/*
 * Queue the freshly split-off page @tail: on @list (with an extra reference)
 * when a list is supplied, otherwise on the LRU right behind @head. The
 * lruvec lock must be held.
 */
static void lru_add_page_tail(struct page *head, struct page *tail,
                struct lruvec *lruvec, struct list_head *list)
{
        VM_BUG_ON_PAGE(!PageHead(head), head);
        VM_BUG_ON_PAGE(PageLRU(tail), head);
        lockdep_assert_held(&lruvec->lru_lock);

        if (!list) {
                /* head is still on lru (and we have it frozen) */
                VM_WARN_ON(!PageLRU(head));
                if (!PageUnevictable(tail))
                        list_add_tail(&tail->lru, &head->lru);
                else
                        tail->mlock_count = 0;
                SetPageLRU(tail);
                return;
        }

        /* page reclaim is reclaiming a huge page */
        VM_WARN_ON(PageLRU(head));
        get_page(tail);
        list_add_tail(&tail->lru, list);
}

/*
 * Turn the page at offset @tail inside @folio into the first page of a new,
 * independent folio of order @new_order: copy selected flags, ->mapping and
 * ->index (plus the swap entry for swapcache folios) from the head page,
 * clear its tail status, unfreeze its refcount and finally hand it to
 * lru_add_page_tail() together with @lruvec and @list.
 */
static void __split_huge_page_tail(struct folio *folio, int tail,
                struct lruvec *lruvec, struct list_head *list,
                unsigned int new_order)
{
        struct page *head = &folio->page;
        struct page *page_tail = head + tail;
        /*
         * Careful: new_folio is not a "real" folio before we cleared PageTail.
         * Don't pass it around before clear_compound_head().
         */
        struct folio *new_folio = (struct folio *)page_tail;

        /* The tail page must not be mapped at this point. */
        VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);

        /*
         * Clone page flags before unfreezing refcount.
         *
         * After successful get_page_unless_zero() might follow flags change,
         * for example lock_page() which set PG_waiters.
         *
         * Note that for mapped sub-pages of an anonymous THP,
         * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
         * the migration entry instead from where remap_page() will restore it.
         * We can still have PG_anon_exclusive set on effectively unmapped and
         * unreferenced sub-pages of an anonymous THP: we can simply drop
         * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
         */
        page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
        page_tail->flags |= (head->flags &
                        ((1L << PG_referenced) |
                         (1L << PG_swapbacked) |
                         (1L << PG_swapcache) |
                         (1L << PG_mlocked) |
                         (1L << PG_uptodate) |
                         (1L << PG_active) |
                         (1L << PG_workingset) |
                         (1L << PG_locked) |
                         (1L << PG_unevictable) |
#ifdef CONFIG_ARCH_USES_PG_ARCH_X
                         (1L << PG_arch_2) |
                         (1L << PG_arch_3) |
#endif
                         (1L << PG_dirty) |
                         LRU_GEN_MASK | LRU_REFS_MASK));

        /* ->mapping in first and second tail page is replaced by other uses */
        VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
                        page_tail);
        page_tail->mapping = head->mapping;
        page_tail->index = head->index + tail;

        /*
         * page->private should not be set in tail pages. Fix up and warn once
         * if private is unexpectedly set.
         */
        if (unlikely(page_tail->private)) {
                VM_WARN_ON_ONCE_PAGE(true, page_tail);
                page_tail->private = 0;
        }
        /* For swapcache folios, offset the head's swap entry value by @tail. */
        if (folio_test_swapcache(folio))
                new_folio->swap.val = folio->swap.val + tail;

        /* Page flags must be visible before we make the page non-compound. */
        smp_wmb();

        /*
         * Clear PageTail before unfreezing page refcount.
         *
         * After successful get_page_unless_zero() might follow put_page()
         * which needs correct compound_head().
         */
        clear_compound_head(page_tail);
        /* A non-zero @new_order makes the new folio a compound page itself. */
        if (new_order) {
                prep_compound_page(page_tail, new_order);
                folio_set_large_rmappable(new_folio);
        }

        /*
         * Finally unfreeze refcount. Additional reference from page cache:
         * one per page of the new folio for file-backed or swapcache folios,
         * none otherwise.
         */
        page_ref_unfreeze(page_tail,
                1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
                             folio_nr_pages(new_folio) : 0));

        /* Carry the young/idle state over from the original folio. */
        if (folio_test_young(folio))
                folio_set_young(new_folio);
        if (folio_test_idle(folio))
                folio_set_idle(new_folio);

        folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));

        /*
         * always add to the tail because some iterators expect new
         * pages to show after the currently processed elements - e.g.
         * migrate_pages
         */
        lru_add_page_tail(head, page_tail, lruvec, list);
}

/*
 * Do the actual split of the folio containing @page into folios of order
 * @new_order.
 *
 * @list is passed through to lru_add_page_tail(). @end is a page cache index:
 * new folios whose index is at or beyond it are removed from the page cache
 * instead of being stored in it.
 *
 * Per the comments below, the folio refcount has been frozen and interrupts
 * disabled by the caller; interrupts are re-enabled here.
 * NOTE(review): for file folios the xa_unlock() of mapping->i_pages has no
 * matching lock in this function -- presumably taken by the caller; confirm.
 */
static void __split_huge_page(struct page *page, struct list_head *list,
                pgoff_t end, unsigned int new_order)
{
        struct folio *folio = page_folio(page);
        struct page *head = &folio->page;
        struct lruvec *lruvec;
        struct address_space *swap_cache = NULL;
        unsigned long offset = 0;
        int i, nr_dropped = 0;
        unsigned int new_nr = 1 << new_order;
        int order = folio_order(folio);
        unsigned int nr = 1 << order;

        /* complete memcg works before add pages to LRU */
        split_page_memcg(head, order, new_order);

        /* Anonymous swapcache folio: lock the swap cache xarray for updates. */
        if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
                offset = swp_offset(folio->swap);
                swap_cache = swap_address_space(folio->swap);
                xa_lock(&swap_cache->i_pages);
        }

        /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
        lruvec = folio_lruvec_lock(folio);

        ClearPageHasHWPoisoned(head);

        /*
         * Walk the new folios from the last one down to the second one; the
         * one starting at index 0 (the head) is handled after the loop.
         */
        for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
                __split_huge_page_tail(folio, i, lruvec, list, new_order);
                /* Some pages can be beyond EOF: drop them from page cache */
                if (head[i].index >= end) {
                        struct folio *tail = page_folio(head + i);

                        if (shmem_mapping(folio->mapping))
                                nr_dropped++;
                        else if (folio_test_clear_dirty(tail))
                                folio_account_cleaned(tail,
                                        inode_to_wb(folio->mapping->host));
                        __filemap_remove_folio(tail, NULL);
                        folio_put(tail);
                } else if (!PageAnon(page)) {
                        /* File folio: store the new folio in the page cache. */
                        __xa_store(&folio->mapping->i_pages, head[i].index,
                                        head + i, 0);
                } else if (swap_cache) {
                        /* Anon swapcache folio: store it in the swap cache. */
                        __xa_store(&swap_cache->i_pages, offset + i,
                                        head + i, 0);
                }
        }

        /* Shrink the head: order-0 page, or a folio of the new order. */
        if (!new_order)
                ClearPageCompound(head);
        else {
                struct folio *new_folio = (struct folio *)head;

                folio_set_order(new_folio, new_order);
        }
        unlock_page_lruvec(lruvec);
        /* Caller disabled irqs, so they are still disabled here */

        split_page_owner(head, order, new_order);
        pgalloc_tag_split(head, 1 << order);

        /* See comment in __split_huge_page_tail() */
        if (folio_test_anon(folio)) {
                /* Additional pin to swap cache */
                if (folio_test_swapcache(folio)) {
                        folio_ref_add(folio, 1 + new_nr);
                        xa_unlock(&swap_cache->i_pages);
                } else {
                        folio_ref_inc(folio);
                }
        } else {
                /* Additional pin to page cache */
                folio_ref_add(folio, 1 + new_nr);
                xa_unlock(&folio->mapping->i_pages);
        }
        local_irq_enable();

        /* Uncharge the shmem pages that were dropped beyond @end above. */
        if (nr_dropped)
                shmem_uncharge(folio->mapping->host, nr_dropped);
        remap_page(folio, nr);

        /*
         * set page to its compound_head when split to non order-0 pages, so
         * we can skip unlocking it below, since PG_locked is transferred to
         * the compound_head of the page and the caller will unlock it.
         */
        if (new_order)
                page = compound_head(page);

        /* Unlock and release every new folio except the one holding @page. */
        for (i = 0; i < nr; i += new_nr) {
                struct page *subpage = head + i;
                struct folio *new_folio = page_folio(subpage);
                if (subpage == page)
                        continue;
                folio_unlock(new_folio);

                /*
                 * Subpages may be freed if there wasn't any mapping
                 * like if add_to_swap() is running on a lru page that
                 * had its mapping zapped. And freeing these pages
                 * requires taking the lru_lock so we do the put_page
                 * of the tail pages after the split is complete.
                 */
                free_page_and_swap_cache(subpage);
        }
}

/*
 * Racy check whether the huge page can be split: true when the only
 * references beyond the mappings are the expected cache pins plus one more
 * reference. If @pextra_pins is non-NULL, the number of expected cache pins
 * is stored there.
 */
bool can_split_folio(struct folio *folio, int *pextra_pins)
{
        int extra_pins = 0;

        /* Additional pins from page cache (file) or swap cache (anon) */
        if (!folio_test_anon(folio) || folio_test_swapcache(folio))
                extra_pins = folio_nr_pages(folio);

        if (pextra_pins)
                *pextra_pins = extra_pins;

        return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
}

/*
 * This function splits a large folio into smaller folios of order @new_order.
 * @page can point to any page of the large folio to split. The split operation
 * does not change the position of @page.
 *
 * Prerequisites:
 *
 * 1) The caller must hold a reference on the @page's owning folio, also known
 *    as the large folio.
 *
 * 2) The large folio must be locked.
 *
 * 3) The folio must not be pinned. Any unexpected folio references, including
 *    GUP pins, will result in the folio not getting split; instead, the caller
 *    will receive an -EAGAIN.
 *
 * 4) @new_order must not be 1 for anonymous (non-file-backed) folios.
 *    Splitting those to order-1 is not supported, because
 *    folio->_deferred_list, which is used by partially mapped folios, is
 *    stored in subpage 2, but an order-1 folio only has subpages 0 and 1.
 *    File-backed order-1 folios are supported, since they do not use
 *    _deferred_list.
 *
 * After splitting, the caller's folio reference will be transferred to @page,
 * resulting in a raised refcount of @page after this call. The other pages may
 * be freed if they are not mapped.
 *
 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
 *
 * Pages in @new_order will inherit the mapping, flags, and so on from the
 * huge page.
 *
 * Returns 0 if the huge page was split successfully.
 *
 * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
 * the folio was concurrently removed from the page cache.
 *
 * Returns -EBUSY when trying to split the huge zeropage, if the folio is
 * under writeback, if fs-specific folio metadata cannot currently be
 * released, or if some unexpected race happened (e.g., anon VMA disappeared,
 * truncation).
 *
 * Returns -EINVAL when trying to split to an order that is incompatible
 * with the folio. Splitting to order 0 is compatible with all folios.
 */
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
                                     unsigned int new_order)
{
        struct folio *folio = page_folio(page);
        struct deferred_split *ds_queue = get_deferred_split_queue(folio);
        /* reset xarray order to new order after split */
        XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
        struct anon_vma *anon_vma = NULL;
        struct address_space *mapping = NULL;
        bool is_thp = folio_test_pmd_mappable(folio);
        int extra_pins, ret;
        pgoff_t end;
        bool is_hzp;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

        /* The target order must be strictly smaller than the current one. */
        if (new_order >= folio_order(folio))
                return -EINVAL;

        if (folio_test_anon(folio)) {
                /* order-1 is not supported for anonymous THP. */
                if (new_order == 1) {
                        VM_WARN_ONCE(1, "Cannot split to order-1 folio");
                        return -EINVAL;
                }
        } else if (new_order) {
                /*
                 * A folio that has just been truncated has no mapping any
                 * more. The checks below dereference folio->mapping, so
                 * report the race here instead of crashing; this is the same
                 * error the "Truncated ?" check further down returns.
                 */
                if (!folio->mapping)
                        return -EBUSY;

                /* Split shmem folio to non-zero order not supported */
                if (shmem_mapping(folio->mapping)) {
                        VM_WARN_ONCE(1,
                                "Cannot split shmem folio to non-0 order");
                        return -EINVAL;
                }
                /*
                 * No split if the file system does not support large folio.
                 * Note that we might still have THPs in such mappings due to
                 * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
                 * does not actually support large folios properly.
                 */
                if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
                    !mapping_large_folio_support(folio->mapping)) {
                        VM_WARN_ONCE(1,
                                "Cannot split file folio to non-0 order");
                        return -EINVAL;
                }
        }

        /* Only swapping a whole PMD-mapped folio is supported */
        if (folio_test_swapcache(folio) && new_order)
                return -EINVAL;

        is_hzp = is_huge_zero_folio(folio);
        if (is_hzp) {
                pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
                return -EBUSY;
        }

        if (folio_test_writeback(folio))
                return -EBUSY;

        if (folio_test_anon(folio)) {
                /*
                 * The caller does not necessarily hold an mmap_lock that would
                 * prevent the anon_vma disappearing, so we first take a
                 * reference to it and then lock the anon_vma for write. This
                 * is similar to folio_lock_anon_vma_read except the write lock
                 * is taken to serialise against parallel split or collapse
                 * operations.
                 */
                anon_vma = folio_get_anon_vma(folio);
                if (!anon_vma) {
                        ret = -EBUSY;
                        goto out;
                }
                end = -1;
                mapping = NULL;
                anon_vma_lock_write(anon_vma);
        } else {
                gfp_t gfp;

                mapping = folio->mapping;

                /* Truncated ? */
                if (!mapping) {
                        ret = -EBUSY;
                        goto out;
                }

                gfp = current_gfp_context(mapping_gfp_mask(mapping) &
                                                        GFP_RECLAIM_MASK);

                /* fs-private folio metadata must be released before splitting */
                if (!filemap_release_folio(folio, gfp)) {
                        ret = -EBUSY;
                        goto out;
                }

                /* Allocate for the xarray split now, before any lock is held */
                xas_split_alloc(&xas, folio, folio_order(folio), gfp);
                if (xas_error(&xas)) {
                        ret = xas_error(&xas);
                        goto out;
                }

                anon_vma = NULL;
                i_mmap_lock_read(mapping);

                /*
                 *__split_huge_page() may need to trim off pages beyond EOF:
                 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
                 * which cannot be nested inside the page tree lock. So note
                 * end now: i_size itself may be changed at any moment, but
                 * folio lock is good enough to serialize the trimming.
                 */
                end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
                if (shmem_mapping(mapping))
                        end = shmem_fallocend(mapping->host, end);
        }

        /*
         * Racy check if we can split the page, before unmap_folio() will
         * split PMDs
         */
        if (!can_split_folio(folio, &extra_pins)) {
                ret = -EAGAIN;
                goto out_unlock;
        }

        unmap_folio(folio);

        /* block interrupt reentry in xa_lock and spinlock */
        local_irq_disable();
        if (mapping) {
                /*
                 * Check if the folio is present in page cache.
                 * We assume all tail are present too, if folio is there.
                 */
                xas_lock(&xas);
                xas_reset(&xas);
                if (xas_load(&xas) != folio)
                        goto fail;
        }

        /* Prevent deferred_split_scan() touching ->_refcount */
        spin_lock(&ds_queue->split_queue_lock);
        if (folio_ref_freeze(folio, 1 + extra_pins)) {
                if (folio_order(folio) > 1 &&
                    !list_empty(&folio->_deferred_list)) {
                        ds_queue->split_queue_len--;
                        /*
                         * Reinitialize page_deferred_list after removing the
                         * page from the split_queue, otherwise a subsequent
                         * split will see list corruption when checking the
                         * page_deferred_list.
                         */
                        list_del_init(&folio->_deferred_list);
                }
                spin_unlock(&ds_queue->split_queue_lock);
                if (mapping) {
                        int nr = folio_nr_pages(folio);

                        xas_split(&xas, folio, folio_order(folio));
                        /*
                         * A PMD-mappable folio that ends up smaller than PMD
                         * order no longer counts as a shmem/file THP.
                         */
                        if (folio_test_pmd_mappable(folio) &&
                            new_order < HPAGE_PMD_ORDER) {
                                if (folio_test_swapbacked(folio)) {
                                        __lruvec_stat_mod_folio(folio,
                                                        NR_SHMEM_THPS, -nr);
                                } else {
                                        __lruvec_stat_mod_folio(folio,
                                                        NR_FILE_THPS, -nr);
                                        filemap_nr_thps_dec(mapping);
                                }
                        }
                }

                __split_huge_page(page, list, end, new_order);
                ret = 0;
        } else {
                /* Unexpected references: undo the unmap and report -EAGAIN. */
                spin_unlock(&ds_queue->split_queue_lock);
fail:
                if (mapping)
                        xas_unlock(&xas);
                local_irq_enable();
                remap_page(folio, folio_nr_pages(folio));
                ret = -EAGAIN;
        }

out_unlock:
        /* Drop whichever of the anon_vma / i_mmap locks was taken above. */
        if (anon_vma) {
                anon_vma_unlock_write(anon_vma);
                put_anon_vma(anon_vma);
        }
        if (mapping)
                i_mmap_unlock_read(mapping);
out:
        xas_destroy(&xas);
        /* Only PMD-mappable folios are accounted in the THP split events. */
        if (is_thp)
                count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
        return ret;
}

/*
 * Unlink @folio from the deferred split queue it sits on, if any, and fix up
 * that queue's length. Folios of order 0 or 1 are ignored.
 */
void folio_undo_large_rmappable(struct folio *folio)
{
        struct deferred_split *queue;
        unsigned long irq_flags;

        /*
         * Nobody is trying to add the folio to a deferred list at this
         * point, so if the list head already looks empty the unlocked peek
         * is sufficient and the split_queue_lock need not be taken.
         */
        if (folio_order(folio) <= 1 ||
            data_race(list_empty(&folio->_deferred_list)))
                return;

        queue = get_deferred_split_queue(folio);
        spin_lock_irqsave(&queue->split_queue_lock, irq_flags);
        /* Look again now that the queue lock is held. */
        if (list_empty(&folio->_deferred_list))
                goto unlock;
        queue->split_queue_len--;
        list_del_init(&folio->_deferred_list);
unlock:
        spin_unlock_irqrestore(&queue->split_queue_lock, irq_flags);
}

/*
 * Put @folio on its deferred split queue, unless it is too small, is in the
 * swap cache, or is queued already.
 */
void deferred_split_folio(struct folio *folio)
{
        struct deferred_split *queue = get_deferred_split_queue(folio);
#ifdef CONFIG_MEMCG
        struct mem_cgroup *cg = folio_memcg(folio);
#endif
        unsigned long irq_flags;

        /*
         * Three reasons not to queue the folio, checked in this order:
         *
         * - Order 1 (and smaller) folios have no space for a deferred list,
         *   and little memory is wasted by leaving them alone.
         *
         * - The try_to_unmap() in the page reclaim path may get here too,
         *   which could race and corrupt the deferred split queue; and if
         *   reclaim is already dealing with the folio, the shrinker need not
         *   handle it again. THP swap adds the folio to the swap cache
         *   before calling try_to_unmap(), so the swapcache flag tells us
         *   reclaim owns it.
         *
         * - The folio is already on a deferred list.
         */
        if (folio_order(folio) <= 1 ||
            folio_test_swapcache(folio) ||
            !list_empty(&folio->_deferred_list))
                return;

        spin_lock_irqsave(&queue->split_queue_lock, irq_flags);
        /* Repeat the "already queued" test under the lock. */
        if (!list_empty(&folio->_deferred_list))
                goto unlock;

        if (folio_test_pmd_mappable(folio))
                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
        list_add_tail(&folio->_deferred_list, &queue->split_queue);
        queue->split_queue_len++;
#ifdef CONFIG_MEMCG
        if (cg)
                set_shrinker_bit(cg, folio_nid(folio),
                                 deferred_split_shrinker->id);
#endif
unlock:
        spin_unlock_irqrestore(&queue->split_queue_lock, irq_flags);
}

/*
 * Shrinker count callback: report the length of the deferred split queue
 * selected by @sc (the memcg's queue when one is given and CONFIG_MEMCG is
 * enabled, otherwise the queue of node sc->nid).
 */
static unsigned long deferred_split_count(struct shrinker *shrink,
                struct shrink_control *sc)
{
#ifdef CONFIG_MEMCG
        if (sc->memcg)
                return READ_ONCE(sc->memcg->deferred_split_queue.split_queue_len);
#endif
        return READ_ONCE(NODE_DATA(sc->nid)->deferred_split_queue.split_queue_len);
}

static unsigned long deferred_split_scan(struct shrinker *shrink,
                struct shrink_control *sc)
{
        struct pglist_data *pgdata = NODE_DATA(sc->nid);
        struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
        unsigned long flags;
        LIST_HEAD(list);
        struct folio *folio, *next;
        int split = 0;

#ifdef CONFIG_MEMCG
        if (sc->memcg)
                ds_queue = &sc->memcg->deferred_split_queue;
#endif

        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        /* Take pin on all head pages to avoid freeing them under us */
        list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
                                                        _deferred_list) {
                if (folio_try_get(folio)) {
                        list_move(&folio->_deferred_list, &list);
                } else {
                        /* We lost race with folio_put() */
                        list_del_init(&folio->_deferred_list);
                        ds_queue->split_queue_len--;
                }
                if (!--sc->nr_to_scan)
                        break;
        }
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

        list_for_each_entry_safe(folio, next, &list, _deferred_list) {
                if (!folio_trylock(folio))
                        goto next;
                /* split_huge_page() removes page from list on success */
                if (!split_folio(folio))
                        split++;
                folio_unlock(folio);
next:
                folio_put(folio);
        }

        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        list_splice_tail(&list, &ds_queue->split_queue);
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

        /*
         * Stop shrinker if we didn't split any page, but the queue is empty.
         * This can happen if pages were freed under us.
         */
        if (!split && list_empty(&ds_queue->split_queue))
                return SHRINK_STOP;
        return split;
}

#ifdef CONFIG_DEBUG_FS
/*
 * Walk every PFN of every managed zone and try to split each large,
 * non-hugetlb folio found on the LRU. The number of candidates and of
 * successful splits is reported through pr_debug().
 */
static void split_huge_pages_all(void)
{
        unsigned long nr_seen = 0, nr_split = 0;
        unsigned long pfn, end_pfn;
        struct folio *folio;
        struct page *page;
        struct zone *zone;

        pr_debug("Split all THPs\n");
        for_each_zone(zone) {
                if (!managed_zone(zone))
                        continue;

                end_pfn = zone_end_pfn(zone);
                for (pfn = zone->zone_start_pfn; pfn < end_pfn; pfn++) {
                        int nr_pages;

                        page = pfn_to_online_page(pfn);
                        if (!page || PageTail(page))
                                continue;

                        folio = page_folio(page);
                        if (!folio_try_get(folio))
                                continue;

                        /*
                         * Only go on if the page still belongs to the folio
                         * we pinned, the folio is in this zone, and it is a
                         * large non-hugetlb folio on the LRU.
                         */
                        if (!unlikely(page_folio(page) != folio) &&
                            zone == folio_zone(folio) &&
                            folio_test_large(folio) &&
                            !folio_test_hugetlb(folio) &&
                            folio_test_lru(folio)) {
                                nr_seen++;
                                folio_lock(folio);
                                nr_pages = folio_nr_pages(folio);
                                if (split_folio(folio) == 0)
                                        nr_split++;
                                /* Jump past the pages this folio covered. */
                                pfn += nr_pages - 1;
                                folio_unlock(folio);
                        }

                        folio_put(folio);
                        cond_resched();
                }
        }

        pr_debug("%lu of %lu THP split\n", nr_split, nr_seen);
}

static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
{
        return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
                    is_vm_hugetlb_page(vma);
}

/*
 * Try to split every THP mapped in [@vaddr_start, @vaddr_end) of the process
 * @pid down to order @new_order. Both addresses are rounded down to a page
 * boundary first.
 *
 * Returns 0 once the range has been walked (however many folios were
 * actually split), -ESRCH if no task with @pid exists, or -EINVAL if the task
 * has no mm.
 */
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
                                unsigned long vaddr_end, unsigned int new_order)
{
        int ret = 0;
        struct task_struct *task;
        struct mm_struct *mm;
        unsigned long total = 0, split = 0;
        unsigned long addr;

        vaddr_start &= PAGE_MASK;
        vaddr_end &= PAGE_MASK;

        /* Find the task_struct from pid */
        rcu_read_lock();
        task = find_task_by_vpid(pid);
        if (!task) {
                rcu_read_unlock();
                ret = -ESRCH;
                goto out;
        }
        get_task_struct(task);
        rcu_read_unlock();

        /* Find the mm_struct */
        mm = get_task_mm(task);
        put_task_struct(task);

        if (!mm) {
                ret = -EINVAL;
                goto out;
        }

        pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
                 pid, vaddr_start, vaddr_end);

        mmap_read_lock(mm);
        /*
         * always increase addr by PAGE_SIZE, since we could have a PTE page
         * table filled with PTE-mapped THPs, each of which is distinct.
         */
        for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
                struct vm_area_struct *vma = vma_lookup(mm, addr);
                struct page *page;
                struct folio *folio;

                /* The walk stops at the first address not covered by a VMA. */
                if (!vma)
                        break;

                /* skip special VMA and hugetlb VMA */
                if (vma_not_suitable_for_thp_split(vma)) {
                        /*
                         * Park on the last page of this VMA: the loop
                         * increment then lands exactly on vm_end. Jumping
                         * straight to vm_end would make the increment skip
                         * the first page of an adjacent VMA.
                         */
                        addr = vma->vm_end - PAGE_SIZE;
                        continue;
                }

                /* FOLL_DUMP to ignore special (like zero) pages */
                page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);

                if (IS_ERR_OR_NULL(page))
                        continue;

                folio = page_folio(page);
                if (!is_transparent_hugepage(folio))
                        goto next;

                if (new_order >= folio_order(folio))
                        goto next;

                total++;
                /*
                 * For folios with private, split_huge_page_to_list_to_order()
                 * will try to drop it before split and then check if the folio
                 * can be split or not. So skip the check here.
                 */
                if (!folio_test_private(folio) &&
                    !can_split_folio(folio, NULL))
                        goto next;

                if (!folio_trylock(folio))
                        goto next;

                if (!split_folio_to_order(folio, new_order))
                        split++;

                folio_unlock(folio);
next:
                /* Drop the reference taken by follow_page(FOLL_GET). */
                folio_put(folio);
                cond_resched();
        }
        mmap_read_unlock(mm);
        mmput(mm);

        pr_debug("%lu of %lu THP split\n", split, total);

out:
        return ret;
}

/*
 * Try to split every large folio cached for page offsets
 * [@off_start, @off_end) of the file at @file_path down to @new_order.
 *
 * Returns 0 once the range has been walked, or -EINVAL if the path cannot be
 * turned into a filename or the file cannot be opened.
 */
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
                                pgoff_t off_end, unsigned int new_order)
{
        unsigned long nr_seen = 0, nr_split = 0;
        struct address_space *mapping;
        struct filename *name;
        struct file *filp;
        int step = 1;
        pgoff_t idx;

        name = getname_kernel(file_path);
        if (IS_ERR(name))
                return -EINVAL;

        filp = file_open_name(name, O_RDONLY, 0);
        if (IS_ERR(filp)) {
                putname(name);
                return -EINVAL;
        }

        pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
                 file_path, off_start, off_end);

        mapping = filp->f_mapping;

        for (idx = off_start; idx < off_end; idx += step) {
                struct folio *folio = filemap_get_folio(mapping, idx);

                /* Advance one page unless a large folio is found below. */
                step = 1;
                if (IS_ERR(folio))
                        continue;

                if (folio_test_large(folio)) {
                        nr_seen++;
                        step = folio_nr_pages(folio);

                        if (new_order < folio_order(folio) &&
                            folio_trylock(folio)) {
                                if (split_folio_to_order(folio, new_order) == 0)
                                        nr_split++;
                                folio_unlock(folio);
                        }
                }

                folio_put(folio);
                cond_resched();
        }

        filp_close(filp, NULL);

        pr_debug("%lu of %lu file-backed THP split\n", nr_split, nr_seen);
        putname(name);
        return 0;
}

#define MAX_INPUT_BUF_SZ 255

/*
 * Write handler of the split_huge_pages debugfs file.
 *
 * Accepted input (at most MAX_INPUT_BUF_SZ - 1 characters are looked at):
 *
 *   1                                            split all THPs
 *   <pid>,0x<vaddr_start>,0x<vaddr_end>[,<order>]  split THPs of a process
 *   /<path>,0x<off_start>,0x<off_end>[,<order>]    split THPs of a file
 *
 * <order> defaults to 0 when omitted.
 *
 * Returns the length of the parsed input string on success; -EFAULT if the
 * user buffer cannot be copied, -EINVAL on malformed input, the error of
 * mutex_lock_interruptible() if taking the mutex fails, or the error returned
 * by the split helper.
 */
static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppops)
{
        static DEFINE_MUTEX(split_debug_mutex);
        ssize_t ret;
        /*
         * hold pid, start_vaddr, end_vaddr, new_order or
         * file_path, off_start, off_end, new_order
         */
        char input_buf[MAX_INPUT_BUF_SZ];
        int pid;
        unsigned long vaddr_start, vaddr_end;
        unsigned int new_order = 0;

        ret = mutex_lock_interruptible(&split_debug_mutex);
        if (ret)
                return ret;

        ret = -EFAULT;

        memset(input_buf, 0, MAX_INPUT_BUF_SZ);
        if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
                goto out;

        /* Guarantee termination even when the full buffer was copied. */
        input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';

        if (input_buf[0] == '/') {
                char *tok;
                char *args = input_buf;
                char file_path[MAX_INPUT_BUF_SZ];
                pgoff_t off_start = 0, off_end = 0;
                /* Measured before strsep() cuts the string at the first ','. */
                size_t input_len = strlen(input_buf);

                tok = strsep(&args, ",");
                /*
                 * strsep() sets args to NULL when the input holds no ',':
                 * then there are no offsets to parse and args must not be
                 * handed to sscanf().
                 */
                if (!tok || !args) {
                        ret = -EINVAL;
                        goto out;
                }
                /* tok lies within input_buf, which is as large as file_path. */
                strcpy(file_path, tok);

                ret = sscanf(args, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
                if (ret != 2 && ret != 3) {
                        ret = -EINVAL;
                        goto out;
                }
                ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
                if (!ret)
                        ret = input_len;

                goto out;
        }

        ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
        if (ret == 1 && pid == 1) {
                /* A lone "1" asks for every THP in the system to be split. */
                split_huge_pages_all();
                ret = strlen(input_buf);
                goto out;
        } else if (ret != 3 && ret != 4) {
                ret = -EINVAL;
                goto out;
        }

        ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
        if (!ret)
                ret = strlen(input_buf);
out:
        mutex_unlock(&split_debug_mutex);
        return ret;
}

/*
 * File operations of the split_huge_pages debugfs file: only a write handler
 * is provided, and llseek is routed to no_llseek.
 */
static const struct file_operations split_huge_pages_fops = {
        .owner         = THIS_MODULE,
        .write         = split_huge_pages_write,
        .llseek  = no_llseek,
};

/*
 * Create the "split_huge_pages" debugfs file (mode 0200, NULL parent and
 * data) backed by split_huge_pages_fops. Always returns 0; the result of
 * debugfs_create_file() is not checked.
 */
static int __init split_huge_pages_debugfs(void)
{
        debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
                            &split_huge_pages_fops);
        return 0;
}
/* Registered as a late initcall. */
late_initcall(split_huge_pages_debugfs);
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/*
 * Replace the PMD mapping of @page described by @pvmw with a PMD migration
 * entry.
 *
 * Returns 0 on success, and also when @pvmw does not describe a PMD-level
 * mapping (nothing is changed then). Returns -EBUSY when the page is
 * anon-exclusive and folio_try_share_anon_rmap_pmd() fails; the original PMD
 * value is written back in that case.
 */
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page)
{
        struct folio *folio = page_folio(page);
        struct vm_area_struct *vma = pvmw->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address = pvmw->address;
        bool anon_exclusive;
        pmd_t pmdval;
        swp_entry_t entry;
        pmd_t pmdswp;

        /* Only act when the walk has a PMD and no PTE. */
        if (!(pvmw->pmd && !pvmw->pte))
                return 0;

        flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
        /* Invalidate the PMD and keep its old value for the checks below. */
        pmdval = pmdp_invalidate(vma, address, pvmw->pmd);

        /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
        anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
        if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
                /* Could not share the page: restore the old PMD and bail. */
                set_pmd_at(mm, address, pvmw->pmd, pmdval);
                return -EBUSY;
        }

        if (pmd_dirty(pmdval))
                folio_mark_dirty(folio);
        /* Pick the migration entry type from the old PMD's permissions. */
        if (pmd_write(pmdval))
                entry = make_writable_migration_entry(page_to_pfn(page));
        else if (anon_exclusive)
                entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
        else
                entry = make_readable_migration_entry(page_to_pfn(page));
        /* Carry the young and dirty bits over into the migration entry. */
        if (pmd_young(pmdval))
                entry = make_migration_entry_young(entry);
        if (pmd_dirty(pmdval))
                entry = make_migration_entry_dirty(entry);
        pmdswp = swp_entry_to_pmd(entry);
        /* Carry the soft-dirty and uffd-wp bits over into the swap PMD. */
        if (pmd_soft_dirty(pmdval))
                pmdswp = pmd_swp_mksoft_dirty(pmdswp);
        if (pmd_uffd_wp(pmdval))
                pmdswp = pmd_swp_mkuffd_wp(pmdswp);
        set_pmd_at(mm, address, pvmw->pmd, pmdswp);
        folio_remove_rmap_pmd(folio, page, vma);
        /* NOTE(review): presumably the reference the PMD mapping held — confirm. */
        folio_put(folio);
        trace_set_migration_pmd(address, pmd_val(pmdswp));

        return 0;
}

/*
 * Replace the PMD migration entry found by @pvmw with a huge PMD mapping of
 * @new, rebuilding the PMD bits from the state recorded in the migration
 * entry. Does nothing when @pvmw does not describe a PMD-level entry.
 */
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
{
        struct folio *folio = page_folio(new);
        struct vm_area_struct *vma = pvmw->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address = pvmw->address;
        unsigned long haddr = address & HPAGE_PMD_MASK;
        pmd_t pmde;
        swp_entry_t entry;

        /* Only act when the walk has a PMD and no PTE. */
        if (!(pvmw->pmd && !pvmw->pte))
                return;

        entry = pmd_to_swp_entry(*pvmw->pmd);
        /* NOTE(review): presumably the reference for the mapping installed below — confirm. */
        folio_get(folio);
        pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
        /* Rebuild soft-dirty, write, uffd-wp and young state from the entry. */
        if (pmd_swp_soft_dirty(*pvmw->pmd))
                pmde = pmd_mksoft_dirty(pmde);
        if (is_writable_migration_entry(entry))
                pmde = pmd_mkwrite(pmde, vma);
        if (pmd_swp_uffd_wp(*pvmw->pmd))
                pmde = pmd_mkuffd_wp(pmde);
        if (!is_migration_entry_young(entry))
                pmde = pmd_mkold(pmde);
        /* NOTE: this may contain setting soft-dirty on some archs */
        if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
                pmde = pmd_mkdirty(pmde);

        if (folio_test_anon(folio)) {
                rmap_t rmap_flags = RMAP_NONE;

                /* Any entry that is not plain readable is mapped as exclusive. */
                if (!is_readable_migration_entry(entry))
                        rmap_flags |= RMAP_EXCLUSIVE;

                folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
        } else {
                folio_add_file_rmap_pmd(folio, new, vma);
        }
        /* A writable anonymous mapping must be of an anon-exclusive page. */
        VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
        set_pmd_at(mm, haddr, pvmw->pmd, pmde);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache_pmd(vma, address, pvmw->pmd);
        trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif
















































































































































   49 






   49 



































    1 




































































































    1 






























































































































































































































































    1 



























   49 

































































































































































































































































































































































































































































































































































































































   49 



















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_CPUMASK_H
#define __LINUX_CPUMASK_H

/*
 * Cpumasks provide a bitmap suitable for representing the
 * set of CPUs in a system, one bit position per CPU number.  In general,
 * only nr_cpu_ids (<= NR_CPUS) bits are valid.
 */
#include <linux/cleanup.h>
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/bitmap.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/gfp_types.h>
#include <linux/numa.h>

/*
 * struct cpumask wraps a bitmap with room for NR_CPUS bits.
 *
 * Don't assign or return these: may not be this big!  Masks obtained
 * through alloc_cpumask_var() can be backed by fewer than NR_CPUS bits,
 * so whole-struct copies may run past the allocation; use cpumask_copy().
 */
typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;

/**
 * cpumask_bits - get the bits in a cpumask
 * @maskp: the struct cpumask *
 *
 * Expands to the 'bits' bitmap member of *@maskp.
 *
 * You should only assume nr_cpu_ids bits of this mask are valid.  This is
 * a macro so it's const-correct: the result is const exactly when @maskp
 * points to a const mask.
 */
#define cpumask_bits(maskp) ((maskp)->bits)

/**
 * cpumask_pr_args - printf args to output a cpumask
 * @maskp: cpumask to be printed
 *
 * Can be used to provide arguments for '%*pb[l]' when printing a cpumask.
 * Note that this expands to TWO comma-separated arguments: the field width
 * (nr_cpu_ids) followed by the bitmap pointer.
 */
#define cpumask_pr_args(maskp)                nr_cpu_ids, cpumask_bits(maskp)

/*
 * nr_cpu_ids is a compile-time constant equal to NR_CPUS on uniprocessor
 * builds or when CONFIG_FORCE_NR_CPUS is set; otherwise it is a variable
 * defined outside this header.
 */
#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
#define nr_cpu_ids ((unsigned int)NR_CPUS)
#else
extern unsigned int nr_cpu_ids;
#endif

/*
 * Record the number of CPU ids in use.  Where nr_cpu_ids is a real
 * variable it is simply overwritten with @nr; where it is the NR_CPUS
 * constant nothing can be stored, so a mismatching @nr only triggers a
 * warning.
 */
static inline void set_nr_cpu_ids(unsigned int nr)
{
#if (NR_CPUS != 1) && !defined(CONFIG_FORCE_NR_CPUS)
        /* Run-time value: just store it. */
        nr_cpu_ids = nr;
#else
        /* Compile-time constant: complain if the caller disagrees. */
        WARN_ON(nr != nr_cpu_ids);
#endif
}

/*
 * We have several different "preferred sizes" for the cpumask
 * operations, depending on operation.
 *
 * For example, the bitmap scanning and operating operations have
 * optimized routines that work for the single-word case, but only when
 * the size is constant. So if NR_CPUS fits in one single word, we are
 * better off using that small constant, in order to trigger the
 * optimized bit finding. That is 'small_cpumask_size'.
 *
 * The clearing and copying operations will similarly perform better
 * with a constant size, but we limit that size arbitrarily to four
 * words. We call this 'large_cpumask_size'.
 *
 * Finally, some operations just want the exact limit, either because
 * they set bits or just don't have any faster fixed-sized versions. We
 * call this just 'nr_cpumask_bits'.
 *
 * Note that these optional constants are always guaranteed to be at
 * least as big as 'nr_cpu_ids' itself is, and all our cpumask
 * allocations are at least that size (see cpumask_size()). The
 * optimization comes from being able to potentially use a compile-time
 * constant instead of a run-time generated exact number of CPUs.
 */
#if NR_CPUS <= BITS_PER_LONG
  #define small_cpumask_bits ((unsigned int)NR_CPUS)
  #define large_cpumask_bits ((unsigned int)NR_CPUS)
#elif NR_CPUS <= 4*BITS_PER_LONG
  #define small_cpumask_bits nr_cpu_ids
  #define large_cpumask_bits ((unsigned int)NR_CPUS)
#else
  #define small_cpumask_bits nr_cpu_ids
  #define large_cpumask_bits nr_cpu_ids
#endif
#define nr_cpumask_bits nr_cpu_ids

/*
 * The following particular system cpumasks and operations manage
 * possible, present, active and online cpus.
 *
 *     cpu_possible_mask- has bit 'cpu' set iff cpu is populatable
 *     cpu_present_mask - has bit 'cpu' set iff cpu is populated
 *     cpu_online_mask  - has bit 'cpu' set iff cpu available to scheduler
 *     cpu_active_mask  - has bit 'cpu' set iff cpu available to migration
 *
 *  If !CONFIG_HOTPLUG_CPU, present == possible, and active == online.
 *
 *  The cpu_possible_mask is fixed at boot time, as the set of CPU IDs
 *  that could possibly be plugged in at any time during the
 *  life of that system boot.  The cpu_present_mask is dynamic(*),
 *  representing which CPUs are currently plugged in.  And
 *  cpu_online_mask is the dynamic subset of cpu_present_mask,
 *  indicating those CPUs available for scheduling.
 *
 *  If HOTPLUG is enabled, then cpu_present_mask varies dynamically,
 *  depending on what ACPI reports as currently plugged in, otherwise
 *  cpu_present_mask is just a copy of cpu_possible_mask.
 *
 *  (*) Well, cpu_present_mask is dynamic in the hotplug case.  If not
 *      hotplug, it's a copy of cpu_possible_mask, hence fixed at boot.
 *
 * Subtleties:
 * 1) UP ARCHes (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
 *    assumption that their single CPU is online.  The UP
 *    cpu_{online,possible,present}_masks are placebos.  Changing them
 *    will have no useful effect on the following num_*_cpus()
 *    and cpu_*() macros in the UP case.  This ugliness is a UP
 *    optimization - don't waste any instructions or memory references
 *    asking if you're online or how many CPUs there are if there is
 *    only one CPU.
 */

/* Writable backing storage for the system masks; defined outside this header. */
extern struct cpumask __cpu_possible_mask;
extern struct cpumask __cpu_online_mask;
extern struct cpumask __cpu_present_mask;
extern struct cpumask __cpu_active_mask;
extern struct cpumask __cpu_dying_mask;
/* Read-only views of the masks above: each is a const pointer to the storage. */
#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
#define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
#define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
#define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)
#define cpu_dying_mask    ((const struct cpumask *)&__cpu_dying_mask)

/* NOTE(review): presumably a cached count of online CPUs - confirm against its users. */
extern atomic_t __num_online_cpus;

/* NOTE(review): by its name, tracks CPUs that have been booted at least once - confirm. */
extern cpumask_t cpus_booted_once_mask;

/*
 * Debug aid: with CONFIG_DEBUG_PER_CPU_MAPS, warn once when @cpu is not
 * below @bits.  Compiles to nothing otherwise.
 */
static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
{
#if defined(CONFIG_DEBUG_PER_CPU_MAPS)
        WARN_ON_ONCE(cpu >= bits);
#endif
}

/*
 * Validate the cpu argument of a cpumask_* operator against
 * small_cpumask_bits (warning only, see cpu_max_bits_warn()) and hand
 * the value back unchanged so the call can be used inline.
 */
static __always_inline unsigned int cpumask_check(unsigned int cpu)
{
        const unsigned int limit = small_cpumask_bits;

        cpu_max_bits_warn(cpu, limit);
        return cpu;
}

/**
 * cpumask_first - find the lowest-numbered cpu set in a cpumask
 * @srcp: the cpumask to search
 *
 * Return: the first set cpu, or a value >= nr_cpu_ids if the mask is empty.
 */
static inline unsigned int cpumask_first(const struct cpumask *srcp)
{
        const unsigned int nbits = small_cpumask_bits;

        return find_first_bit(cpumask_bits(srcp), nbits);
}

/**
 * cpumask_first_zero - find the lowest-numbered cpu NOT set in a cpumask
 * @srcp: the cpumask to search
 *
 * Return: the first clear cpu, or a value >= nr_cpu_ids if all cpus are set.
 */
static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
{
        const unsigned int nbits = small_cpumask_bits;

        return find_first_zero_bit(cpumask_bits(srcp), nbits);
}

/**
 * cpumask_first_and - find the first cpu set in both *srcp1 and *srcp2
 * @srcp1: the first input mask
 * @srcp2: the second input mask
 *
 * Return: the first common cpu, or a value >= nr_cpu_ids if the masks
 * share no cpu.  See also cpumask_next_and().
 */
static inline
unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
{
        const unsigned int nbits = small_cpumask_bits;

        return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
                                  nbits);
}

/**
 * cpumask_first_and_and - find the first cpu set in all three masks
 * @srcp1: the first input mask
 * @srcp2: the second input mask
 * @srcp3: the third input mask
 *
 * Return: the first cpu present in *srcp1 & *srcp2 & *srcp3, or a value
 * >= nr_cpu_ids if there is none.
 */
static inline
unsigned int cpumask_first_and_and(const struct cpumask *srcp1,
                                   const struct cpumask *srcp2,
                                   const struct cpumask *srcp3)
{
        const unsigned int nbits = small_cpumask_bits;

        return find_first_and_and_bit(cpumask_bits(srcp1),
                                      cpumask_bits(srcp2),
                                      cpumask_bits(srcp3),
                                      nbits);
}

/**
 * cpumask_last - find the highest-numbered cpu set in a cpumask
 * @srcp: the cpumask to search
 *
 * Return: the last set cpu, or a value >= nr_cpumask_bits if no CPUs are set.
 */
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
        const unsigned int nbits = small_cpumask_bits;

        return find_last_bit(cpumask_bits(srcp), nbits);
}

/**
 * cpumask_next - find the next cpu set in a cpumask after a given one
 * @n: the cpu just before the search position (the result is always > @n);
 *     -1 starts the search at cpu 0
 * @srcp: the cpumask to search
 *
 * Return: the next set cpu, or a value >= nr_cpu_ids if there is none.
 */
static inline
unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
        const bool from_start = (n == -1);

        /* Only real cpu numbers are range-checked; -1 is a legal argument. */
        if (!from_start)
                cpumask_check(n);

        return find_next_bit(cpumask_bits(srcp), small_cpumask_bits, n + 1);
}

/**
 * cpumask_next_zero - find the next cpu NOT set in a cpumask after a given one
 * @n: the cpu just before the search position (the result is always > @n);
 *     -1 starts the search at cpu 0
 * @srcp: the cpumask to search
 *
 * Return: the next clear cpu, or a value >= nr_cpu_ids if there is none.
 */
static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
{
        const bool from_start = (n == -1);

        /* Only real cpu numbers are range-checked; -1 is a legal argument. */
        if (!from_start)
                cpumask_check(n);

        return find_next_zero_bit(cpumask_bits(srcp), small_cpumask_bits, n + 1);
}

#if NR_CPUS == 1
/*
 * Uniprocessor: there is only one valid CPU, so both @i and @node are
 * ignored and cpu 0 is always returned.
 */
static inline unsigned int cpumask_local_spread(unsigned int i, int node)
{
        return 0;
}

/*
 * Uniprocessor variant: with a single CPU there is nothing to distribute
 * over, so this is just the first cpu common to both masks.
 */
static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
                                                      const struct cpumask *src2p)
{
        unsigned int cpu = cpumask_first_and(src1p, src2p);

        return cpu;
}

/*
 * Uniprocessor variant: with a single CPU there is nothing to distribute
 * over, so this is just the first cpu set in the mask.
 */
static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp)
{
        unsigned int cpu = cpumask_first(srcp);

        return cpu;
}
#else
/* NR_CPUS > 1: these are only declared here; the definitions live outside this header. */
unsigned int cpumask_local_spread(unsigned int i, int node);
unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
                               const struct cpumask *src2p);
unsigned int cpumask_any_distribute(const struct cpumask *srcp);
#endif /* NR_CPUS */

/**
 * cpumask_next_and - find the next cpu set in both *src1p and *src2p
 * @n: the cpu just before the search position (the result is always > @n);
 *     -1 starts the search at cpu 0
 * @src1p: the first cpumask pointer
 * @src2p: the second cpumask pointer
 *
 * Return: the next common cpu, or a value >= nr_cpu_ids if there is none.
 */
static inline
unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
                     const struct cpumask *src2p)
{
        const bool from_start = (n == -1);

        /* Only real cpu numbers are range-checked; -1 is a legal argument. */
        if (!from_start)
                cpumask_check(n);

        return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p),
                                 small_cpumask_bits, n + 1);
}

/**
 * for_each_cpu - iterate over every cpu in a mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 *
 * Thin wrapper around for_each_set_bit() on the mask's bitmap, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu(cpu, mask)                                \
        for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)

#if NR_CPUS == 1
/*
 * Uniprocessor variant of cpumask_next_wrap().
 *
 * @start and (unless it is -1) @n are range-checked.  With a single valid
 * CPU the answer is either "done" (nr_cpumask_bits) when we are asked to
 * wrap after already having visited a cpu (@n >= 0), or otherwise
 * whatever cpumask_first() finds in @mask.
 */
static inline
unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
{
        const bool already_visited = (n >= 0);

        cpumask_check(start);
        if (n != -1)
                cpumask_check(n);

        /* Wrapping past the only cpu ends the iteration. */
        return (wrap && already_visited) ? nr_cpumask_bits : cpumask_first(mask);
}
#else
/* NR_CPUS > 1: only declared here; the definition lives outside this header. */
unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
#endif

/**
 * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 * @start: the start location
 *
 * Thin wrapper around for_each_set_bit_wrap() on the mask's bitmap, bounded
 * by small_cpumask_bits.
 *
 * The implementation does not assume any bit in @mask is set (including @start).
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_wrap(cpu, mask, start)                                \
        for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start)

/**
 * for_each_cpu_and - iterate over every cpu in both masks
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to:
 *        struct cpumask tmp;
 *        cpumask_and(&tmp, &mask1, &mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * Implemented with for_each_and_bit() on the two bitmaps, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_and(cpu, mask1, mask2)                                \
        for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)

/**
 * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding
 *                         those present in another.
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to:
 *        struct cpumask tmp;
 *        cpumask_andnot(&tmp, &mask1, &mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * Implemented with for_each_andnot_bit() on the two bitmaps, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_andnot(cpu, mask1, mask2)                                \
        for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)

/**
 * for_each_cpu_or - iterate over every cpu present in either mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to:
 *        struct cpumask tmp;
 *        cpumask_or(&tmp, &mask1, &mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * Implemented with for_each_or_bit() on the two bitmaps, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_or(cpu, mask1, mask2)                                \
        for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)

/**
 * for_each_cpu_from - iterate over CPUs present in @mask, from @cpu to the end of @mask.
 * @cpu: the (optionally unsigned) integer iterator; its value on entry is
 *       where the iteration starts
 * @mask: the cpumask pointer
 *
 * Implemented with for_each_set_bit_from() on the mask's bitmap, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_from(cpu, mask)                                \
        for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits)

/**
 * cpumask_any_but - return a "random" in a cpumask, but not this one.
 * @mask: the cpumask to search
 * @cpu: the cpu to ignore.
 *
 * Often used to find any cpu but smp_processor_id() in a mask.
 * Return: >= nr_cpu_ids if no cpus set.
 */
static inline
unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
{
        unsigned int i;

        cpumask_check(cpu);
        for_each_cpu(i, mask)
                if (i != cpu)
                        break;
        return i;
}

/**
 * cpumask_any_and_but - pick a "random" cpu from *mask1 & *mask2, but not this one.
 * @mask1: the first input cpumask
 * @mask2: the second input cpumask
 * @cpu: the cpu to ignore
 *
 * Takes the first cpu common to both masks; if that happens to be @cpu,
 * the next common cpu after @cpu is used instead.
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
static inline
unsigned int cpumask_any_and_but(const struct cpumask *mask1,
                                 const struct cpumask *mask2,
                                 unsigned int cpu)
{
        unsigned int first;

        cpumask_check(cpu);

        first = cpumask_first_and(mask1, mask2);

        /* Skip over the excluded cpu if it is the first match. */
        return (first == cpu) ? cpumask_next_and(cpu, mask1, mask2) : first;
}

/**
 * cpumask_nth - get the Nth cpu in a cpumask
 * @cpu: the Nth cpu to find, starting from 0
 * @srcp: the cpumask pointer
 *
 * Return: >= nr_cpu_ids if such cpu doesn't exist.
 */
static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
{
        const unsigned int nth = cpumask_check(cpu);

        return find_nth_bit(cpumask_bits(srcp), small_cpumask_bits, nth);
}

/**
 * cpumask_nth_and - get the Nth cpu set in both of 2 cpumasks
 * @cpu: the Nth cpu to find, starting from 0
 * @srcp1: the first cpumask pointer
 * @srcp2: the second cpumask pointer
 *
 * Return: >= nr_cpu_ids if such cpu doesn't exist.
 */
static inline
unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
                                                        const struct cpumask *srcp2)
{
        const unsigned int nth = cpumask_check(cpu);

        return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
                                small_cpumask_bits, nth);
}

/**
 * cpumask_nth_andnot - get the Nth cpu set in 1st cpumask, and clear in 2nd.
 * @cpu: the Nth cpu to find, starting from 0
 * @srcp1: the cpumask in which the cpu must be set
 * @srcp2: the cpumask in which the cpu must be clear
 *
 * Return: >= nr_cpu_ids if such cpu doesn't exist.
 */
static inline
unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
                                                        const struct cpumask *srcp2)
{
        const unsigned int nth = cpumask_check(cpu);

        return find_nth_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
                                   small_cpumask_bits, nth);
}

/**
 * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd.
 * @cpu: the Nth cpu to find, starting from 0
 * @srcp1: the first cpumask in which the cpu must be set
 * @srcp2: the second cpumask in which the cpu must be set
 * @srcp3: the cpumask in which the cpu must be clear
 *
 * Return: >= nr_cpu_ids if such cpu doesn't exist.
 */
static __always_inline
unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1,
                                                        const struct cpumask *srcp2,
                                                        const struct cpumask *srcp3)
{
        const unsigned int nth = cpumask_check(cpu);

        return find_nth_and_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
                                       cpumask_bits(srcp3),
                                       small_cpumask_bits, nth);
}

/*
 * Array initializer for an NR_CPUS-bit bitmap with every word zero,
 * i.e. no cpu set.
 */
#define CPU_BITS_NONE                                                \
{                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL                        \
}

/*
 * Array initializer for a bitmap with only bit 0 (cpu 0) set; words
 * after the first are left to default initialization.
 */
#define CPU_BITS_CPU0                                                \
{                                                                \
        [0] =  1UL                                                \
}

/**
 * cpumask_set_cpu - set a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask pointer
 *
 * Uses set_bit() on the mask's bitmap after range-checking @cpu.
 */
static __always_inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
        const unsigned int bit = cpumask_check(cpu);

        set_bit(bit, cpumask_bits(dstp));
}

/*
 * Same as cpumask_set_cpu() but built on __set_bit() instead of set_bit()
 * (by kernel bitops convention the double-underscore form is the
 * non-atomic one - see the bitops documentation).
 */
static __always_inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
        const unsigned int bit = cpumask_check(cpu);

        __set_bit(bit, cpumask_bits(dstp));
}


/**
 * cpumask_clear_cpu - clear a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask pointer
 *
 * Uses clear_bit() on the mask's bitmap after range-checking @cpu.
 */
static __always_inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        const unsigned int bit = cpumask_check(cpu);

        clear_bit(bit, cpumask_bits(dstp));
}

/*
 * Same as cpumask_clear_cpu() but built on __clear_bit() instead of
 * clear_bit() (by kernel bitops convention the double-underscore form is
 * the non-atomic one - see the bitops documentation).
 */
static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        const unsigned int bit = cpumask_check(cpu);

        __clear_bit(bit, cpumask_bits(dstp));
}

/**
 * cpumask_assign_cpu - assign a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask pointer
 * @value: the value to assign
 *
 * Uses assign_bit() on the mask's bitmap after range-checking @cpu.
 */
static __always_inline void cpumask_assign_cpu(int cpu, struct cpumask *dstp, bool value)
{
        const unsigned int bit = cpumask_check(cpu);

        assign_bit(bit, cpumask_bits(dstp), value);
}

/*
 * Same as cpumask_assign_cpu() but built on __assign_bit() instead of
 * assign_bit() (by kernel bitops convention the double-underscore form is
 * the non-atomic one - see the bitops documentation).
 */
static __always_inline void __cpumask_assign_cpu(int cpu, struct cpumask *dstp, bool value)
{
        const unsigned int bit = cpumask_check(cpu);

        __assign_bit(bit, cpumask_bits(dstp), value);
}

/**
 * cpumask_test_cpu - test for a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * Return: true if @cpu is set in @cpumask, else returns false
 */
static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
{
        const unsigned int bit = cpumask_check(cpu);

        return test_bit(bit, cpumask_bits(cpumask));
}

/**
 * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * test_and_set_bit wrapper for cpumasks.
 *
 * Return: true if @cpu was already set in @cpumask before the call,
 * else returns false
 */
static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
{
        const unsigned int bit = cpumask_check(cpu);

        return test_and_set_bit(bit, cpumask_bits(cpumask));
}

/**
 * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * test_and_clear_bit wrapper for cpumasks.
 *
 * Return: true if @cpu was set in @cpumask before the call,
 * else returns false
 */
static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
{
        const unsigned int bit = cpumask_check(cpu);

        return test_and_clear_bit(bit, cpumask_bits(cpumask));
}

/**
 * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
 * @dstp: the cpumask pointer
 *
 * When the mask is a compile-time-sized single word, that word is written
 * directly with the bits below nr_cpumask_bits set; otherwise the general
 * bitmap_fill() is used.
 */
static inline void cpumask_setall(struct cpumask *dstp)
{
        if (!small_const_nbits(small_cpumask_bits)) {
                /* General case: fill exactly nr_cpumask_bits bits. */
                bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
        } else {
                /* Single-word fast path. */
                cpumask_bits(dstp)[0] = BITMAP_LAST_WORD_MASK(nr_cpumask_bits);
        }
}

/**
 * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask
 * @dstp: the cpumask pointer
 *
 * Zeroes large_cpumask_bits bits, which may be more than nr_cpu_ids.
 */
static inline void cpumask_clear(struct cpumask *dstp)
{
        const unsigned int nbits = large_cpumask_bits;

        bitmap_zero(cpumask_bits(dstp), nbits);
}

/**
 * cpumask_and - *dstp = *src1p & *src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: false if *@dstp is empty, else returns true
 */
static inline bool cpumask_and(struct cpumask *dstp,
                               const struct cpumask *src1p,
                               const struct cpumask *src2p)
{
        const unsigned int nbits = small_cpumask_bits;

        return bitmap_and(cpumask_bits(dstp),
                          cpumask_bits(src1p), cpumask_bits(src2p), nbits);
}

/**
 * cpumask_or - *dstp = *src1p | *src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 */
static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
                              const struct cpumask *src2p)
{
        const unsigned int nbits = small_cpumask_bits;

        bitmap_or(cpumask_bits(dstp),
                  cpumask_bits(src1p), cpumask_bits(src2p), nbits);
}

/**
 * cpumask_xor - *dstp = *src1p ^ *src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 */
static inline void cpumask_xor(struct cpumask *dstp,
                               const struct cpumask *src1p,
                               const struct cpumask *src2p)
{
        const unsigned int nbits = small_cpumask_bits;

        bitmap_xor(cpumask_bits(dstp),
                   cpumask_bits(src1p), cpumask_bits(src2p), nbits);
}

/**
 * cpumask_andnot - *dstp = *src1p & ~*src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: false if *@dstp is empty, else returns true
 */
static inline bool cpumask_andnot(struct cpumask *dstp,
                                  const struct cpumask *src1p,
                                  const struct cpumask *src2p)
{
        const unsigned int nbits = small_cpumask_bits;

        return bitmap_andnot(cpumask_bits(dstp),
                             cpumask_bits(src1p), cpumask_bits(src2p), nbits);
}

/**
 * cpumask_equal - *src1p == *src2p
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: true if the cpumasks are equal, false if not
 */
static inline bool cpumask_equal(const struct cpumask *src1p,
                                const struct cpumask *src2p)
{
        const unsigned int nbits = small_cpumask_bits;

        return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), nbits);
}

/**
 * cpumask_or_equal - *src1p | *src2p == *src3p
 * @src1p: the first input
 * @src2p: the second input
 * @src3p: the third input
 *
 * Return: true if first cpumask ORed with second cpumask == third cpumask,
 *           otherwise false
 */
static inline bool cpumask_or_equal(const struct cpumask *src1p,
                                    const struct cpumask *src2p,
                                    const struct cpumask *src3p)
{
        const unsigned int nbits = small_cpumask_bits;

        return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p),
                               cpumask_bits(src3p), nbits);
}

/**
 * cpumask_intersects - (*src1p & *src2p) != 0
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: true if first cpumask ANDed with second cpumask is non-empty,
 *           otherwise false
 */
static inline bool cpumask_intersects(const struct cpumask *src1p,
                                     const struct cpumask *src2p)
{
        const unsigned int nbits = small_cpumask_bits;

        return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p),
                                 nbits);
}

/**
 * cpumask_subset - (*src1p & ~*src2p) == 0
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: true if *@src1p is a subset of *@src2p, else returns false
 */
static inline bool cpumask_subset(const struct cpumask *src1p,
                                 const struct cpumask *src2p)
{
        const unsigned int nbits = small_cpumask_bits;

        return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), nbits);
}

/**
 * cpumask_empty - *srcp == 0
 * @srcp: the cpumask to check that all cpus < nr_cpu_ids are clear.
 *
 * Return: true if srcp is empty (has no bits set), else false
 */
static inline bool cpumask_empty(const struct cpumask *srcp)
{
        const unsigned int nbits = small_cpumask_bits;

        return bitmap_empty(cpumask_bits(srcp), nbits);
}

/**
 * cpumask_full - *srcp == 0xFFFFFFFF...
 * @srcp: the cpumask to check that all cpus < nr_cpu_ids are set.
 *
 * Checked against the exact nr_cpumask_bits limit.
 *
 * Return: true if srcp is full (has all bits set), else false
 */
static inline bool cpumask_full(const struct cpumask *srcp)
{
        const unsigned int nbits = nr_cpumask_bits;

        return bitmap_full(cpumask_bits(srcp), nbits);
}

/**
 * cpumask_weight - Count of bits in *srcp
 * @srcp: the cpumask to count bits (< nr_cpu_ids) in.
 *
 * Return: count of bits set in *srcp
 */
static inline unsigned int cpumask_weight(const struct cpumask *srcp)
{
        const unsigned int nbits = small_cpumask_bits;

        return bitmap_weight(cpumask_bits(srcp), nbits);
}

/**
 * cpumask_weight_and - Count of bits in (*srcp1 & *srcp2)
 * @srcp1: the first cpumask to count bits (< nr_cpu_ids) in.
 * @srcp2: the second cpumask to count bits (< nr_cpu_ids) in.
 *
 * Return: count of bits set in both *srcp1 and *srcp2
 */
static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1,
                                                const struct cpumask *srcp2)
{
        const unsigned int nbits = small_cpumask_bits;

        return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2),
                                 nbits);
}

/**
 * cpumask_weight_andnot - Count of bits in (*srcp1 & ~*srcp2)
 * @srcp1: the cpumask whose set bits (< nr_cpu_ids) are counted.
 * @srcp2: the cpumask whose set bits are excluded from the count.
 *
 * Return: count of bits set in *srcp1 and clear in *srcp2
 */
static inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1,
                                                const struct cpumask *srcp2)
{
        const unsigned int nbits = small_cpumask_bits;

        return bitmap_weight_andnot(cpumask_bits(srcp1), cpumask_bits(srcp2),
                                    nbits);
}

/**
 * cpumask_shift_right - *dstp = *srcp >> n
 * @dstp: the cpumask result
 * @srcp: the input to shift
 * @n: the number of bits to shift by
 */
static inline void cpumask_shift_right(struct cpumask *dstp,
                                       const struct cpumask *srcp, int n)
{
        const unsigned int nbits = small_cpumask_bits;

        bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, nbits);
}

/**
 * cpumask_shift_left - *dstp = *srcp << n
 * @dstp: the cpumask result
 * @srcp: the input to shift
 * @n: the number of bits to shift by
 *
 * Operates on exactly nr_cpumask_bits bits.
 */
static inline void cpumask_shift_left(struct cpumask *dstp,
                                      const struct cpumask *srcp, int n)
{
        const unsigned int nbits = nr_cpumask_bits;

        bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, nbits);
}

/**
 * cpumask_copy - *dstp = *srcp
 * @dstp: the result
 * @srcp: the input cpumask
 *
 * Copies large_cpumask_bits bits, which may be more than nr_cpu_ids.
 */
static inline void cpumask_copy(struct cpumask *dstp,
                                const struct cpumask *srcp)
{
        const unsigned int nbits = large_cpumask_bits;

        bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nbits);
}

/**
 * cpumask_any - pick a "random" cpu from *srcp
 * @srcp: the input cpumask
 *
 * Currently an alias for cpumask_first(), so the pick is the lowest set cpu.
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
#define cpumask_any(srcp) cpumask_first(srcp)

/**
 * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2
 * @mask1: the first input cpumask
 * @mask2: the second input cpumask
 *
 * Currently an alias for cpumask_first_and(), so the pick is the lowest
 * cpu set in both masks.
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
#define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2))

/**
 * cpumask_of - the cpumask containing just a given cpu
 * @cpu: the cpu (<= nr_cpu_ids)
 *
 * Forwards to get_cpu_mask(), which is not defined in this part of the file.
 */
#define cpumask_of(cpu) (get_cpu_mask(cpu))

/**
 * cpumask_parse_user - extract a cpumask from a user string
 * @buf: the user-space buffer to extract from
 * @len: the length of the buffer
 * @dstp: the cpumask to set.
 *
 * Parsing is delegated to bitmap_parse_user() with nr_cpumask_bits bits.
 *
 * Return: -errno, or 0 for success.
 */
static inline int cpumask_parse_user(const char __user *buf, int len,
                                     struct cpumask *dstp)
{
        int err = bitmap_parse_user(buf, len, cpumask_bits(dstp),
                                    nr_cpumask_bits);

        return err;
}

/**
 * cpumask_parselist_user - extract a cpumask from a user string
 * @buf: the user-space buffer to extract from
 * @len: the length of the buffer
 * @dstp: the cpumask to set.
 *
 * Parsing is delegated to bitmap_parselist_user() with nr_cpumask_bits bits.
 *
 * Return: -errno, or 0 for success.
 */
static inline int cpumask_parselist_user(const char __user *buf, int len,
                                     struct cpumask *dstp)
{
        int err = bitmap_parselist_user(buf, len, cpumask_bits(dstp),
                                        nr_cpumask_bits);

        return err;
}

/**
 * cpumask_parse - extract a cpumask from a string
 * @buf: the buffer to extract from
 * @dstp: the cpumask to set.
 *
 * Parsing is delegated to bitmap_parse() with UINT_MAX as the buffer
 * length and nr_cpumask_bits bits.
 *
 * Return: -errno, or 0 for success.
 */
static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
{
        int err = bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp),
                               nr_cpumask_bits);

        return err;
}

/**
 * cpulist_parse - extract a cpumask from a user string of ranges
 * @buf: the buffer to extract from
 * @dstp: the cpumask to set.
 *
 * Parsing is delegated to bitmap_parselist() with nr_cpumask_bits bits.
 *
 * Return: -errno, or 0 for success.
 */
static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
{
        int err = bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits);

        return err;
}

/**
 * cpumask_size - calculate size to allocate for a 'struct cpumask' in bytes
 *
 * The size is derived from large_cpumask_bits via bitmap_size().
 *
 * Return: size to allocate for a &struct cpumask in bytes
 */
static inline unsigned int cpumask_size(void)
{
        const unsigned int nbits = large_cpumask_bits;

        return bitmap_size(nbits);
}

/*
 * cpumask_var_t: struct cpumask for stack usage.
 *
 * Oh, the wicked games we play!  In order to make kernel coding a
 * little more difficult, we typedef cpumask_var_t to an array or a
 * pointer: doing &mask on an array is a noop, so it still works.
 *
 * i.e.
 *        cpumask_var_t tmpmask;
 *        if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
 *                return -ENOMEM;
 *
 *          ... use 'tmpmask' like a normal struct cpumask * ...
 *
 *        free_cpumask_var(tmpmask);
 *
 *
 * However, there is one notable exception. alloc_cpumask_var() allocates
 * only nr_cpumask_bits bits (on the other hand, a real cpumask_t always has
 * NR_CPUS bits). Therefore you must not dereference a cpumask_var_t.
 *
 *        cpumask_var_t tmpmask;
 *        if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
 *                return -ENOMEM;
 *
 *        var = *tmpmask;
 *
 * This code performs an NR_CPUS-length memory copy and leads to memory
 * corruption. cpumask_copy() provides safe copy functionality.
 *
 * Note that there is another evil here: If you define a cpumask_var_t
 * as a percpu variable then the way to obtain the address of the cpumask
 * structure differently influences what this_cpu_* operation needs to be
 * used. Please use this_cpu_cpumask_var_t in those cases. The direct use
 * of this_cpu_ptr() or this_cpu_read() will lead to failures when the
 * other type of cpumask_var_t implementation is configured.
 *
 * Please also note that __cpumask_var_read_mostly can be used to declare
 * a cpumask_var_t variable itself (not its content) as read mostly.
 */
#ifdef CONFIG_CPUMASK_OFFSTACK
/* CONFIG_CPUMASK_OFFSTACK: a cpumask_var_t is a pointer to a separately allocated mask. */
typedef struct cpumask *cpumask_var_t;

/* A per-cpu cpumask_var_t is itself the pointer here, so fetch its value. */
#define this_cpu_cpumask_var_ptr(x)        this_cpu_read(x)
/* The pointer variable (not the mask it points to) can be marked read-mostly. */
#define __cpumask_var_read_mostly        __read_mostly

/* Allocator taking a NUMA node; only declared here, defined outside this header. */
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);

/*
 * Like alloc_cpumask_var_node(), but adds __GFP_ZERO to @flags so the
 * allocation is requested zeroed.
 */
static inline
bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        const gfp_t zeroing_flags = flags | __GFP_ZERO;

        return alloc_cpumask_var_node(mask, zeroing_flags, node);
}

/**
 * alloc_cpumask_var - allocate a struct cpumask
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 * @flags: GFP_ flags
 *
 * Forwards to alloc_cpumask_var_node() with NUMA_NO_NODE as the node.
 *
 * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
 * a nop returning a constant 1 (in <linux/cpumask.h>).
 *
 * Return: %true if allocation succeeded, %false if not
 */
static inline
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        const int node = NUMA_NO_NODE;

        return alloc_cpumask_var_node(mask, flags, node);
}

/* Zeroing variant of alloc_cpumask_var(): adds __GFP_ZERO to @flags. */
static inline
bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        flags |= __GFP_ZERO;

        return alloc_cpumask_var(mask, flags);
}

void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
void free_cpumask_var(cpumask_var_t mask);
void free_bootmem_cpumask_var(cpumask_var_t mask);

/*
 * With CONFIG_CPUMASK_OFFSTACK a cpumask_var_t is a pointer, so the mask
 * is usable only when that pointer is non-NULL.
 */
static inline bool cpumask_available(cpumask_var_t mask)
{
        return mask ? true : false;
}

#else
typedef struct cpumask cpumask_var_t[1];

#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
#define __cpumask_var_read_mostly

/*
 * !CONFIG_CPUMASK_OFFSTACK: cpumask_var_t is a one-element array, so
 * this stub performs no allocation and always reports success.
 */
static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        return true;
}

/*
 * !CONFIG_CPUMASK_OFFSTACK stub: @mask, @flags and @node are all unused
 * and success is always reported.
 */
static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags,
                                          int node)
{
        return true;
}

/*
 * !CONFIG_CPUMASK_OFFSTACK: no allocation takes place; the mask held in
 * the caller's variable is cleared and success is always reported.
 * @flags is unused.
 */
static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        struct cpumask *embedded = *mask;

        cpumask_clear(embedded);

        return true;
}

/*
 * !CONFIG_CPUMASK_OFFSTACK: clears the mask held in the caller's variable
 * and always reports success.  @flags and @node are unused.
 */
static inline bool
zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        struct cpumask *embedded = *mask;

        cpumask_clear(embedded);

        return true;
}

/* !CONFIG_CPUMASK_OFFSTACK stub: intentionally does nothing. */
static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
}

/*
 * !CONFIG_CPUMASK_OFFSTACK stub: the matching alloc_cpumask_var() stub
 * allocates nothing, so there is nothing to free here.
 */
static inline void free_cpumask_var(cpumask_var_t mask)
{
}

/* !CONFIG_CPUMASK_OFFSTACK stub: intentionally does nothing. */
static inline void free_bootmem_cpumask_var(cpumask_var_t mask)
{
}

/*
 * !CONFIG_CPUMASK_OFFSTACK: cpumask_var_t is an array rather than a
 * pointer, so the mask is unconditionally reported as available.
 */
static inline bool cpumask_available(cpumask_var_t mask)
{
        return true;
}
#endif /* CONFIG_CPUMASK_OFFSTACK */

DEFINE_FREE(free_cpumask_var, struct cpumask *, if (_T) free_cpumask_var(_T));

/* It's common to want to use cpu_all_mask in struct member initializers,
 * so it has to refer to an address rather than a pointer. */
extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
#define cpu_all_mask to_cpumask(cpu_all_bits)

/* First bits of cpu_bit_bitmap are in fact unset. */
#define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])

#if NR_CPUS == 1
/* Uniprocessor: the possible/online/present masks are always "1" */
#define for_each_possible_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_online_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_present_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)
#else
#define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
#define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
#define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)
#endif

/* Wrappers for arch boot code to manipulate normally-constant masks */
void init_cpu_present(const struct cpumask *src);
void init_cpu_possible(const struct cpumask *src);
void init_cpu_online(const struct cpumask *src);

/* Set (@possible true) or clear (@possible false) @cpu in __cpu_possible_mask. */
static inline void
set_cpu_possible(unsigned int cpu, bool possible)
{
        if (!possible) {
                cpumask_clear_cpu(cpu, &__cpu_possible_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_possible_mask);
}

/* Set (@present true) or clear (@present false) @cpu in __cpu_present_mask. */
static inline void
set_cpu_present(unsigned int cpu, bool present)
{
        if (!present) {
                cpumask_clear_cpu(cpu, &__cpu_present_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_present_mask);
}

void set_cpu_online(unsigned int cpu, bool online);

/* Set (@active true) or clear (@active false) @cpu in __cpu_active_mask. */
static inline void
set_cpu_active(unsigned int cpu, bool active)
{
        if (!active) {
                cpumask_clear_cpu(cpu, &__cpu_active_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_active_mask);
}

/* Set (@dying true) or clear (@dying false) @cpu in __cpu_dying_mask. */
static inline void
set_cpu_dying(unsigned int cpu, bool dying)
{
        if (!dying) {
                cpumask_clear_cpu(cpu, &__cpu_dying_mask);
                return;
        }

        cpumask_set_cpu(cpu, &__cpu_dying_mask);
}

/**
 * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask *
 * @bitmap: the bitmap
 *
 * There are a few places where cpumask_var_t isn't appropriate and
 * static cpumasks must be used (eg. very early boot), yet we don't
 * expose the definition of 'struct cpumask'.
 *
 * This does the conversion, and can be used as a constant initializer.
 *
 * The conditional always selects @bitmap (its condition is the constant 1).
 * The other arm exists only so that the call __check_is_bitmap(@bitmap)
 * is type-checked by the compiler; it sits inside sizeof, so it is never
 * actually executed.
 */
#define to_cpumask(bitmap)                                                \
        ((struct cpumask *)(1 ? (bitmap)                                \
                            : (void *)sizeof(__check_is_bitmap(bitmap))))

/*
 * Type-checking helper for to_cpumask(): its only purpose is to have a
 * 'const unsigned long *' parameter that the macro's argument must be
 * compatible with.  The return value carries no meaning.
 */
static inline int __check_is_bitmap(const unsigned long *bitmap)
{
        return 1;
}

/*
 * Special-case data structure for "single bit set only" constant CPU masks.
 *
 * We pre-generate all the 64 (or 32) possible bit positions, with enough
 * padding to the left and the right, and return the constant pointer
 * appropriately offset.
 */
extern const unsigned long
        cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];

/*
 * Return a constant mask for @cpu built from cpu_bit_bitmap: pick the row
 * for the bit position within a word (row index 1 + cpu % BITS_PER_LONG),
 * then step the pointer back by the word index (cpu / BITS_PER_LONG).
 */
static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
{
        const unsigned int bit = cpu % BITS_PER_LONG;
        const unsigned int word = cpu / BITS_PER_LONG;
        const unsigned long *row = cpu_bit_bitmap[1 + bit];

        return to_cpumask(row - word);
}

#if NR_CPUS > 1
/**
 * num_online_cpus() - Read the number of online CPUs
 *
 * Reads __num_online_cpus with raw_atomic_read().  Even though that
 * counter is an atomic_t, the value is only a momentary snapshot: it is
 * not protected against concurrent CPU hotplug operations unless this is
 * invoked from a cpuhp_lock held region.
 *
 * Return: momentary snapshot of the number of online CPUs
 */
static __always_inline unsigned int num_online_cpus(void)
{
        const int snapshot = raw_atomic_read(&__num_online_cpus);

        return snapshot;
}
#define num_possible_cpus()        cpumask_weight(cpu_possible_mask)
#define num_present_cpus()        cpumask_weight(cpu_present_mask)
#define num_active_cpus()        cpumask_weight(cpu_active_mask)

/* Report whether @cpu is set in cpu_online_mask. */
static inline bool cpu_online(unsigned int cpu)
{
        const struct cpumask *online = cpu_online_mask;

        return cpumask_test_cpu(cpu, online);
}

/* Report whether @cpu is set in cpu_possible_mask. */
static inline bool cpu_possible(unsigned int cpu)
{
        const struct cpumask *possible = cpu_possible_mask;

        return cpumask_test_cpu(cpu, possible);
}

/* Report whether @cpu is set in cpu_present_mask. */
static inline bool cpu_present(unsigned int cpu)
{
        const struct cpumask *present = cpu_present_mask;

        return cpumask_test_cpu(cpu, present);
}

/* Report whether @cpu is set in cpu_active_mask. */
static inline bool cpu_active(unsigned int cpu)
{
        const struct cpumask *active = cpu_active_mask;

        return cpumask_test_cpu(cpu, active);
}

/* Report whether @cpu is set in cpu_dying_mask. */
static inline bool cpu_dying(unsigned int cpu)
{
        const struct cpumask *dying = cpu_dying_mask;

        return cpumask_test_cpu(cpu, dying);
}

#else

#define num_online_cpus()        1U
#define num_possible_cpus()        1U
#define num_present_cpus()        1U
#define num_active_cpus()        1U

/* NR_CPUS == 1 build: only CPU number 0 is reported as online. */
static inline bool cpu_online(unsigned int cpu)
{
        return !cpu;
}

/* NR_CPUS == 1 build: only CPU number 0 is reported as possible. */
static inline bool cpu_possible(unsigned int cpu)
{
        return !cpu;
}

/* NR_CPUS == 1 build: only CPU number 0 is reported as present. */
static inline bool cpu_present(unsigned int cpu)
{
        return !cpu;
}

/* NR_CPUS == 1 build: only CPU number 0 is reported as active. */
static inline bool cpu_active(unsigned int cpu)
{
        return !cpu;
}

/* NR_CPUS == 1 build: no CPU is ever reported as dying; @cpu is ignored. */
static inline bool cpu_dying(unsigned int cpu)
{
        return false;
}

#endif /* NR_CPUS > 1 */

#define cpu_is_offline(cpu)        unlikely(!cpu_online(cpu))

/*
 * CPU_BITS_ALL: brace-enclosed initializer for an array of
 * BITS_TO_LONGS(NR_CPUS) words.  The last word is set to
 * BITMAP_LAST_WORD_MASK(NR_CPUS); when NR_CPUS needs more than one word,
 * every preceding word is set to ~0UL.
 */
#if NR_CPUS <= BITS_PER_LONG
#define CPU_BITS_ALL                                                \
{                                                                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
}

#else /* NR_CPUS > BITS_PER_LONG */

#define CPU_BITS_ALL                                                \
{                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
}
#endif /* NR_CPUS > BITS_PER_LONG */

/**
 * cpumap_print_to_pagebuf  - copies the cpumask into the buffer either
 *        as comma-separated list of cpus or hex values of cpumask
 * @list: indicates whether the cpumap must be list
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 *
 * Hands the first nr_cpu_ids bits of @mask to bitmap_print_to_pagebuf().
 *
 * Return: the length of the (null-terminated) @buf string, zero if
 * nothing is copied.
 */
static inline ssize_t
cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
{
        const unsigned long *bits = cpumask_bits(mask);

        return bitmap_print_to_pagebuf(list, buf, bits, nr_cpu_ids);
}

/**
 * cpumap_print_bitmask_to_buf  - copies the cpumask into the buffer as
 *        hex values of cpumask
 *
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 * @off: in the string from which we are copying, we copy to @buf
 * @count: the maximum number of bytes to print
 *
 * Prints the first nr_cpu_ids bits of @mask as hex values via
 * bitmap_print_bitmask_to_buf(); typically used by bin_attribute to
 * export the cpumask bitmask ABI.  The helper's result is reduced by
 * one before it is returned.
 *
 * Return: the length of how many bytes have been copied, excluding
 * terminating '\0'.
 */
static inline ssize_t
cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
                loff_t off, size_t count)
{
        const unsigned long *bits = cpumask_bits(mask);

        return bitmap_print_bitmask_to_buf(buf, bits, nr_cpu_ids,
                                           off, count) - 1;
}

/**
 * cpumap_print_list_to_buf  - copies the cpumask into the buffer as
 *        comma-separated list of cpus
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 * @off: in the string from which we are copying, we copy to @buf
 * @count: the maximum number of bytes to print
 *
 * Same as cpumap_print_bitmask_to_buf() except for the print format:
 * the work is done by bitmap_print_list_to_buf() and its result is
 * reduced by one before it is returned.
 *
 * Return: the length of how many bytes have been copied, excluding
 * terminating '\0'.
 */
static inline ssize_t
cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
                loff_t off, size_t count)
{
        const unsigned long *bits = cpumask_bits(mask);

        return bitmap_print_list_to_buf(buf, bits, nr_cpu_ids,
                                        off, count) - 1;
}

/*
 * Compound-literal cpumask_t constants:
 *   CPU_MASK_ALL  - last word BITMAP_LAST_WORD_MASK(NR_CPUS), any earlier
 *                   words ~0UL
 *   CPU_MASK_NONE - every word 0UL
 *   CPU_MASK_CPU0 - word 0 set to 1UL, nothing else initialized explicitly
 */
#if NR_CPUS <= BITS_PER_LONG
#define CPU_MASK_ALL                                                        \
(cpumask_t) { {                                                                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
} }
#else
#define CPU_MASK_ALL                                                        \
(cpumask_t) { {                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                        \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
} }
#endif /* NR_CPUS > BITS_PER_LONG */

#define CPU_MASK_NONE                                                        \
(cpumask_t) { {                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-1] =  0UL                                \
} }

#define CPU_MASK_CPU0                                                        \
(cpumask_t) { {                                                                \
        [0] =  1UL                                                        \
} }

/*
 * Provide a valid theoretical max size for cpumap and cpulist sysfs files
 * to avoid breaking userspace which may allocate a buffer based on the size
 * reported by e.g. fstat.
 *
 * for cpumap NR_CPUS * 9/32 - 1 should be an exact length.
 *
 * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up
 * to 2 orders of magnitude larger than 8192. And then we divide by 2 to
 * cover a worst-case of every other cpu being on one of two nodes for a
 * very large NR_CPUS.
 *
 *  Use PAGE_SIZE as a minimum for smaller configurations while avoiding
 *  unsigned comparison to -1.
 */
#define CPUMAP_FILE_MAX_BYTES  (((NR_CPUS * 9)/32 > PAGE_SIZE) \
                                        ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE)
#define CPULIST_FILE_MAX_BYTES  (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE)

#endif /* __LINUX_CPUMASK_H */


































































































    2 













    2 







    2 



    2 


    1 




















    2 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/*
 *  linux/fs/hfs/trans.c
 *
 * Copyright (C) 1995-1997  Paul H. Hargrove
 * This file may be distributed under the terms of the GNU General Public License.
 *
 * This file contains routines for converting between the Macintosh
 * character set and various other encodings.  This includes dealing
 * with ':' vs. '/' as the path-element separator.
 */

#include <linux/types.h>
#include <linux/nls.h>

#include "hfs_fs.h"

/*================ Global functions ================*/

/*
 * hfs_mac2asc()
 *
 * Convert the 'Pascal String' @in (a length byte followed by the
 * characters, in the on-disk character set) into the filename stored at
 * @out, using the 'trivial' name-mangling scheme.  The length of the
 * mangled filename is returned; the output is not NUL-terminated.
 *
 * At most HFS_NAMELEN input bytes are read.  When an I/O charset is
 * configured, each character is translated through the NLS tables and
 * conversion stops once the output would exceed HFS_MAX_NAMELEN bytes;
 * otherwise the bytes are copied as they are.
 *
 * Name mangling: '/', which is illegal in Linux filenames, becomes ':',
 * which never appears in HFS filenames.  Characters that cannot be
 * converted become '?'.  Everything else passes through unchanged.
 */
int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in)
{
        struct nls_table *nls_disk = HFS_SB(sb)->nls_disk;
        struct nls_table *nls_io = HFS_SB(sb)->nls_io;
        const char *src = in->name;
        char *dst = out;
        int srclen = in->len;
        int dstlen = HFS_MAX_NAMELEN;
        int size;

        /* never trust the length byte beyond the on-disk name field */
        if (srclen > HFS_NAMELEN)
                srclen = HFS_NAMELEN;

        if (!nls_io) {
                /* no I/O charset: plain byte copy with '/' -> ':' */
                for (; srclen > 0; srclen--) {
                        char ch = *src++;

                        *dst++ = (ch == '/') ? ':' : ch;
                }
                return dst - out;
        }

        while (srclen > 0) {
                wchar_t ch;

                /* decode one character from the on-disk name */
                if (!nls_disk) {
                        ch = *src++;
                        srclen--;
                } else {
                        size = nls_disk->char2uni(src, srclen, &ch);
                        if (size <= 0) {
                                ch = '?';
                                size = 1;
                        }
                        src += size;
                        srclen -= size;
                }

                if (ch == '/')
                        ch = ':';

                /* encode it in the I/O charset */
                size = nls_io->uni2char(ch, dst, dstlen);
                if (size == -ENAMETOOLONG)
                        break;
                if (size < 0) {
                        *dst = '?';
                        size = 1;
                }
                dst += size;
                dstlen -= size;
        }

        return dst - out;
}

/*
 * hfs_asc2mac()
 *
 * Given a string (not NUL-terminated) and its length in @in, generate
 * the corresponding filename in the Macintosh character set using the
 * 'trivial' name-mangling scheme and store it in @out.
 *
 * At most HFS_NAMELEN bytes are produced.  The resulting length is
 * stored in out->len and the unused tail of out->name is zero-filled;
 * a name of exactly HFS_NAMELEN bytes therefore has no terminator.
 * Nothing is returned.
 *
 * This routine is the inverse of hfs_mac2asc():
 * a ':' is replaced by a '/'.  Characters that cannot be converted
 * become '?'.
 */
void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr *in)
{
        struct nls_table *nls_disk = HFS_SB(sb)->nls_disk;
        struct nls_table *nls_io = HFS_SB(sb)->nls_io;
        const char *src;
        char *dst;
        int srclen, dstlen, size;

        src = in->name;
        srclen = in->len;
        dst = out->name;
        dstlen = HFS_NAMELEN;
        if (nls_io) {
                wchar_t ch;

                while (srclen > 0 && dstlen > 0) {
                        size = nls_io->char2uni(src, srclen, &ch);
                        /*
                         * Treat a zero-length result like an error, as
                         * hfs_mac2asc() does: otherwise no input would be
                         * consumed and 'ch' could be used without having
                         * been set.
                         */
                        if (size <= 0) {
                                ch = '?';
                                size = 1;
                        }
                        src += size;
                        srclen -= size;
                        if (ch == ':')
                                ch = '/';
                        if (nls_disk) {
                                size = nls_disk->uni2char(ch, dst, dstlen);
                                if (size < 0) {
                                        /* no room left for this character */
                                        if (size == -ENAMETOOLONG)
                                                goto out;
                                        *dst = '?';
                                        size = 1;
                                }
                                dst += size;
                                dstlen -= size;
                        } else {
                                /* no disk charset: keep only single-byte values */
                                *dst++ = ch > 0xff ? '?' : ch;
                                dstlen--;
                        }
                }
        } else {
                char ch;

                /* no I/O charset: plain byte copy, truncated to HFS_NAMELEN */
                if (dstlen > srclen)
                        dstlen = srclen;
                while (--dstlen >= 0)
                        *dst++ = (ch = *src++) == ':' ? '/' : ch;
        }
out:
        out->len = dst - (char *)out->name;
        /* zero-fill whatever is left of the fixed-size name field */
        dstlen = HFS_NAMELEN - out->len;
        while (--dstlen >= 0)
                *dst++ = 0;
}






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 










    1 












    1 


    1 
    1 

















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Digital Audio (PCM) abstract layer / OSS compatible
 *  Copyright (c) by Jaroslav Kysela <perex@perex.cz>
 */

/*
 * Compile-time debug switches.  Both are disabled; change the guarding
 * "#if 0" to "#if 1" to define the corresponding macro.
 * NOTE(review): the code consuming PLUGIN_DEBUG / OSS_DEBUG is not in this
 * part of the file - confirm what each one enables before turning it on.
 */
#if 0
#define PLUGIN_DEBUG
#endif
#if 0
#define OSS_DEBUG
#endif

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/math64.h>
#include <linux/string.h>
#include <linux/compat.h>
#include <sound/core.h>
#include <sound/minors.h>
#include <sound/pcm.h>
#include <sound/pcm_params.h>
#include "pcm_plugin.h"
#include <sound/info.h>
#include <linux/soundcard.h>
#include <sound/initval.h>
#include <sound/mixer_oss.h>

/*
 * ioctl request built with _SIOR (group 'M', number 249, int payload).
 * NOTE(review): presumably reports the ALSA emulation version - confirm
 * against the ioctl handler.
 */
#define OSS_ALSAEMULVER                _SIOR ('M', 249, int)

/* per-card PCM device number for the 1st OSS device; zero-initialized */
static int dsp_map[SNDRV_CARDS];
/* per-card PCM device number for the 2nd OSS device; every entry defaults to 1 */
static int adsp_map[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS-1)] = 1};
/* when set (the default), opening a busy PCM device does not block */
static bool nonblock_open = 1;

MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>, Abramo Bagnara <abramo@alsa-project.org>");
MODULE_DESCRIPTION("PCM OSS emulation for ALSA.");
MODULE_LICENSE("GPL");
/* the two maps are exported read-only (0444) */
module_param_array(dsp_map, int, NULL, 0444);
MODULE_PARM_DESC(dsp_map, "PCM device number assigned to 1st OSS device.");
module_param_array(adsp_map, int, NULL, 0444);
MODULE_PARM_DESC(adsp_map, "PCM device number assigned to 2nd OSS device.");
/* nonblock_open is exported with mode 0644 */
module_param(nonblock_open, bool, 0644);
MODULE_PARM_DESC(nonblock_open, "Don't block opening busy PCM devices.");
MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_PCM);
MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_PCM1);

/* forward declarations of getters defined later in this file */
static int snd_pcm_oss_get_rate(struct snd_pcm_oss_file *pcm_oss_file);
static int snd_pcm_oss_get_channels(struct snd_pcm_oss_file *pcm_oss_file);
static int snd_pcm_oss_get_format(struct snd_pcm_oss_file *pcm_oss_file);

/*
 * helper functions to process hw_params
 */
/*
 * Tighten the lower bound of interval @i to @min (an open bound when
 * @openmin is non-zero).
 *
 * Returns 1 when the requested bound narrowed the interval, 0 when it did
 * not, or -EINVAL (after emptying the interval) when nothing is left.
 */
static int snd_interval_refine_min(struct snd_interval *i, unsigned int min, int openmin)
{
        int narrowed;

        if (i->min < min) {
                /* strictly higher bound: take it together with its openness */
                i->min = min;
                i->openmin = openmin;
                narrowed = 1;
        } else if (i->min == min && openmin && !i->openmin) {
                /* same value, but a closed bound turns into an open one */
                i->openmin = 1;
                narrowed = 1;
        } else {
                narrowed = 0;
        }

        /* integer intervals do not keep an open bound: step to the next integer */
        if (i->integer && i->openmin) {
                i->min++;
                i->openmin = 0;
        }

        if (!snd_interval_checkempty(i))
                return narrowed;

        snd_interval_none(i);
        return -EINVAL;
}

/*
 * Tighten the upper bound of interval @i to @max (an open bound when
 * @openmax is non-zero).
 *
 * Returns 1 when the requested bound narrowed the interval, 0 when it did
 * not, or -EINVAL (after emptying the interval) when nothing is left.
 */
static int snd_interval_refine_max(struct snd_interval *i, unsigned int max, int openmax)
{
        int narrowed;

        if (i->max > max) {
                /* strictly lower bound: take it together with its openness */
                i->max = max;
                i->openmax = openmax;
                narrowed = 1;
        } else if (i->max == max && openmax && !i->openmax) {
                /* same value, but a closed bound turns into an open one */
                i->openmax = 1;
                narrowed = 1;
        } else {
                narrowed = 0;
        }

        /* integer intervals do not keep an open bound: step to the previous integer */
        if (i->integer && i->openmax) {
                i->max--;
                i->openmax = 0;
        }

        if (!snd_interval_checkempty(i))
                return narrowed;

        snd_interval_none(i);
        return -EINVAL;
}

static int snd_interval_refine_set(struct snd_interval *i, unsigned int val)
{
        struct snd_interval t;
        t.empty = 0;
        t.min = t.max = val;
        t.openmin = t.openmax = 0;
        t.integer = 1;
        return snd_interval_refine(i, &t);
}

/**
 * snd_pcm_hw_param_value_min - read the current minimum of a parameter
 * @params: the hw_params instance
 * @var: parameter to query
 * @dir: where to store the direction of the bound, or NULL
 *
 * For a mask parameter *@dir is set to 0; for an interval parameter it is
 * set to the interval's openmin flag.
 *
 * Return: the minimum value of @var.  If @var is neither a mask nor an
 * interval, -EINVAL is returned converted to unsigned int.
 */
static unsigned int
snd_pcm_hw_param_value_min(const struct snd_pcm_hw_params *params,
                           snd_pcm_hw_param_t var, int *dir)
{
        const struct snd_interval *ival;

        if (hw_is_mask(var)) {
                if (dir)
                        *dir = 0;
                return snd_mask_min(hw_param_mask_c(params, var));
        }

        if (!hw_is_interval(var))
                return -EINVAL;

        ival = hw_param_interval_c(params, var);
        if (dir)
                *dir = ival->openmin;
        return snd_interval_min(ival);
}

/**
 * snd_pcm_hw_param_value_max - read the current maximum of a parameter
 * @params: the hw_params instance
 * @var: parameter to query
 * @dir: where to store the direction of the bound, or NULL
 *
 * For a mask parameter *@dir is set to 0; for an interval parameter it is
 * set to the negated openmax flag (0 or -1).
 *
 * Return: the maximum value of @var, or -EINVAL if @var is neither a mask
 * nor an interval.
 */
static int
snd_pcm_hw_param_value_max(const struct snd_pcm_hw_params *params,
                           snd_pcm_hw_param_t var, int *dir)
{
        const struct snd_interval *ival;

        if (hw_is_mask(var)) {
                if (dir)
                        *dir = 0;
                return snd_mask_max(hw_param_mask_c(params, var));
        }

        if (!hw_is_interval(var))
                return -EINVAL;

        ival = hw_param_interval_c(params, var);
        if (dir)
                *dir = - (int) ival->openmax;
        return snd_interval_max(ival);
}

/*
 * Refine mask parameter @var of @params with @val.  When the mask changed,
 * the parameter's bit is set in both cmask and rmask.
 * Returns the result of snd_mask_refine().
 */
static int _snd_pcm_hw_param_mask(struct snd_pcm_hw_params *params,
                                  snd_pcm_hw_param_t var,
                                  const struct snd_mask *val)
{
        int res = snd_mask_refine(hw_param_mask(params, var), val);

        if (res <= 0)
                return res;

        params->cmask |= 1 << var;
        params->rmask |= 1 << var;
        return res;
}

/*
 * Restrict mask parameter @var to @val, then re-run snd_pcm_hw_refine()
 * if any parameter is flagged in rmask.
 * Returns 0 on success or a negative error code.
 */
static int snd_pcm_hw_param_mask(struct snd_pcm_substream *pcm,
                                 struct snd_pcm_hw_params *params,
                                 snd_pcm_hw_param_t var,
                                 const struct snd_mask *val)
{
        int err = _snd_pcm_hw_param_mask(params, var, val);

        if (err < 0)
                return err;
        if (!params->rmask)
                return 0;

        err = snd_pcm_hw_refine(pcm, params);
        return err < 0 ? err : 0;
}

/*
 * Apply a lower bound of @val (with direction @dir) to parameter @var
 * without refining the rest of the configuration space.
 *
 * dir > 0 makes the bound open at @val; dir < 0 with a non-zero @val makes
 * it open at @val - 1.  When the parameter changed, its bit is set in both
 * cmask and rmask.
 *
 * Returns the refine result, or -EINVAL if @var is neither a mask nor an
 * interval.
 */
static int _snd_pcm_hw_param_min(struct snd_pcm_hw_params *params,
                                 snd_pcm_hw_param_t var, unsigned int val,
                                 int dir)
{
        int res;
        int is_open = 0;

        if (dir > 0) {
                is_open = 1;
        } else if (dir < 0 && val > 0) {
                is_open = 1;
                val--;
        }

        if (hw_is_mask(var)) {
                /* masks have no open bounds: an open minimum starts one bit higher */
                res = snd_mask_refine_min(hw_param_mask(params, var),
                                          val + is_open);
        } else if (hw_is_interval(var)) {
                res = snd_interval_refine_min(hw_param_interval(params, var),
                                              val, is_open);
        } else {
                return -EINVAL;
        }

        if (res > 0) {
                params->cmask |= 1 << var;
                params->rmask |= 1 << var;
        }
        return res;
}

/**
 * snd_pcm_hw_param_min - set a lower bound on a parameter
 * @pcm: PCM instance
 * @params: the hw_params instance
 * @var: parameter to restrict
 * @val: minimal value
 * @dir: pointer to the direction (-1,0,1) or NULL; updated on return
 *
 * Inside the configuration space defined by @params, remove from @var all
 * values below @val, then refine the configuration space if any parameter
 * is flagged in rmask.
 *
 * Return: the new minimum, or a negative error code (-EINVAL if the
 * configuration space is empty).
 */
static int snd_pcm_hw_param_min(struct snd_pcm_substream *pcm,
                                struct snd_pcm_hw_params *params,
                                snd_pcm_hw_param_t var, unsigned int val,
                                int *dir)
{
        int wanted_dir = dir ? *dir : 0;
        int err;

        err = _snd_pcm_hw_param_min(params, var, val, wanted_dir);
        if (err < 0)
                return err;

        if (params->rmask) {
                err = snd_pcm_hw_refine(pcm, params);
                if (err < 0)
                        return err;
        }

        return snd_pcm_hw_param_value_min(params, var, dir);
}

/*
 * Apply an upper bound of @val (with direction @dir) to parameter @var
 * without refining the rest of the configuration space.
 *
 * dir < 0 makes the bound open at @val; dir > 0 makes it open at @val + 1.
 * When the parameter changed, its bit is set in both cmask and rmask.
 *
 * Returns the refine result, or -EINVAL if @var is neither a mask nor an
 * interval, or if an open bound at 0 is requested for a mask (the mask is
 * emptied in that case).
 */
static int _snd_pcm_hw_param_max(struct snd_pcm_hw_params *params,
                                 snd_pcm_hw_param_t var, unsigned int val,
                                 int dir)
{
        int res;
        int is_open = 0;

        if (dir != 0) {
                is_open = 1;
                if (dir > 0)
                        val++;
        }

        if (hw_is_mask(var)) {
                struct snd_mask *mask = hw_param_mask(params, var);

                if (is_open && val == 0) {
                        /* nothing lies below an open bound at 0 */
                        snd_mask_none(mask);
                        return -EINVAL;
                }
                /* masks have no open bounds: an open maximum ends one bit lower */
                res = snd_mask_refine_max(mask, val - is_open);
        } else if (hw_is_interval(var)) {
                res = snd_interval_refine_max(hw_param_interval(params, var),
                                              val, is_open);
        } else {
                return -EINVAL;
        }

        if (res > 0) {
                params->cmask |= 1 << var;
                params->rmask |= 1 << var;
        }
        return res;
}

/**
 * snd_pcm_hw_param_max - set an upper bound on a parameter
 * @pcm: PCM instance
 * @params: the hw_params instance
 * @var: parameter to restrict
 * @val: maximal value
 * @dir: pointer to the direction (-1,0,1) or NULL; updated on return
 *
 * Inside the configuration space defined by @params, remove from @var all
 * values >= @val + 1, then refine the configuration space if any parameter
 * is flagged in rmask.
 *
 * Return: the new maximum, or a negative error code (-EINVAL if the
 * configuration space is empty).
 */
static int snd_pcm_hw_param_max(struct snd_pcm_substream *pcm,
                                struct snd_pcm_hw_params *params,
                                snd_pcm_hw_param_t var, unsigned int val,
                                int *dir)
{
        int wanted_dir = dir ? *dir : 0;
        int err;

        err = _snd_pcm_hw_param_max(params, var, val, wanted_dir);
        if (err < 0)
                return err;

        if (params->rmask) {
                err = snd_pcm_hw_refine(pcm, params);
                if (err < 0)
                        return err;
        }

        return snd_pcm_hw_param_value_max(params, var, dir);
}

/*
 * Subtract boundary (b, bdir) from boundary (a, adir).
 *
 * Each direction is first reduced to its sign (-1, 0 or 1).  The value
 * difference goes to *c and the direction difference to *cdir; when the
 * direction difference reaches -2 or 2, *c is moved one further step in
 * that direction (*cdir itself is left at -2 / 2).
 * Always returns 0.
 */
static int boundary_sub(int a, int adir,
                        int b, int bdir,
                        int *c, int *cdir)
{
        int asign = (adir > 0) - (adir < 0);
        int bsign = (bdir > 0) - (bdir < 0);

        *c = a - b;
        *cdir = asign - bsign;

        switch (*cdir) {
        case -2:
                --*c;
                break;
        case 2:
                ++*c;
                break;
        default:
                break;
        }
        return 0;
}

/*
 * Return 1 if boundary (a, adir) is strictly less than boundary (b, bdir),
 * 0 otherwise.
 *
 * A negative direction is rewritten as "one value lower, direction up";
 * after that any non-zero direction counts as 1.  Values are compared
 * first, and the directions only break a tie.
 */
static int boundary_lt(unsigned int a, int adir,
                       unsigned int b, int bdir)
{
        if (adir < 0)
                a--;
        if (bdir < 0)
                b--;

        if (a != b)
                return a < b;

        /* equal values: compare the normalized directions (0 or 1) */
        return (adir != 0) < (bdir != 0);
}

/*
 * Return 1 if (min, mindir) is nearer to (best, bestdir) than
 * (max, maxdir) is, 0 otherwise.  The two distances best - min and
 * max - best are computed with boundary_sub() and compared with
 * boundary_lt().
 */
static int boundary_nearer(int min, int mindir,
                           int best, int bestdir,
                           int max, int maxdir)
{
        int below, below_dir;
        int above, above_dir;

        boundary_sub(best, bestdir, min, mindir, &below, &below_dir);
        boundary_sub(max, maxdir, best, bestdir, &above, &above_dir);

        return boundary_lt(below, below_dir, above, above_dir);
}

/**
 * snd_pcm_hw_param_near
 * @pcm: PCM instance
 * @params: the hw_params instance
 * @var: parameter to set
 * @best: value to set
 * @dir: pointer to the direction (-1,0,1) or NULL
 *
 * Inside configuration space defined by PARAMS set PAR to the available value
 * nearest to BEST. Reduce configuration space accordingly.
 * This function cannot be called for SNDRV_PCM_HW_PARAM_ACCESS,
 * SNDRV_PCM_HW_PARAM_FORMAT, SNDRV_PCM_HW_PARAM_SUBFORMAT.
 * Return the value found, or a negative error code (-ENOMEM if a scratch
 * copy of the parameters cannot be allocated).
  */
static int snd_pcm_hw_param_near(struct snd_pcm_substream *pcm,
                                 struct snd_pcm_hw_params *params,
                                 snd_pcm_hw_param_t var, unsigned int best,
                                 int *dir)
{
        /* snapshot of the incoming params; released on scope exit via __free(kfree) */
        struct snd_pcm_hw_params *save __free(kfree) = NULL;
        int v;
        unsigned int saved_min;
        int last = 0;
        int min, max;
        int mindir, maxdir;
        int valdir = dir ? *dir : 0;
        /* FIXME */
        /* min/max below are ints, so clamp the request to what they can hold */
        if (best > INT_MAX)
                best = INT_MAX;
        min = max = best;
        mindir = maxdir = valdir;
        /*
         * Derive the upper-bound candidate that sits just below the requested
         * boundary: (best, +) -> (best, 0), (best, 0) -> (best, -),
         * (best, -) -> (best - 1, +).
         */
        if (maxdir > 0)
                maxdir = 0;
        else if (maxdir == 0)
                maxdir = -1;
        else {
                maxdir = 1;
                max--;
        }
        save = kmalloc(sizeof(*save), GFP_KERNEL);
        if (save == NULL)
                return -ENOMEM;
        *save = *params;
        saved_min = min;
        /* first attempt: nearest value at or above the request, applied to params */
        min = snd_pcm_hw_param_min(pcm, params, var, min, &mindir);
        if (min >= 0) {
                /* scratch copy for the "below" attempt; released when this block is left */
                struct snd_pcm_hw_params *params1 __free(kfree) = NULL;
                /* max went negative in the adjustment above: no lower candidate exists */
                if (max < 0)
                        goto _end;
                /* exact match on value and direction: no need to look below */
                if ((unsigned int)min == saved_min && mindir == valdir)
                        goto _end;
                params1 = kmalloc(sizeof(*params1), GFP_KERNEL);
                if (params1 == NULL)
                        return -ENOMEM;
                /* second attempt starts from the untouched snapshot */
                *params1 = *save;
                max = snd_pcm_hw_param_max(pcm, params1, var, max, &maxdir);
                /* nothing available below: keep the min-restricted params */
                if (max < 0)
                        goto _end;
                /* the value below is nearer to best than the value above: use it */
                if (boundary_nearer(max, maxdir, best, valdir, min, mindir)) {
                        *params = *params1;
                        last = 1;
                }
        } else {
                /* nothing at or above the request: restore and try from below only */
                *params = *save;
                max = snd_pcm_hw_param_max(pcm, params, var, max, &maxdir);
                if (max < 0)
                        return max;
                last = 1;
        }
 _end:
        /* fix the parameter at the end of the remaining range facing the request */
        if (last)
                v = snd_pcm_hw_param_last(pcm, params, var, dir);
        else
                v = snd_pcm_hw_param_first(pcm, params, var, dir);
        return v;
}

/*
 * Restrict parameter @var to the single value described by (@val, @dir)
 * without refining the rest of the configuration space.
 *
 * Masks: dir > 0 selects @val + 1, dir < 0 selects @val - 1.
 * Intervals: dir == 0 selects the integer point @val; otherwise the open
 * range (@val - 1, @val) for dir < 0 or (@val, @val + 1) for dir > 0.
 * A request below zero (@val == 0 with dir < 0) empties the parameter and
 * yields -EINVAL.  When the parameter changed, its bit is set in both
 * cmask and rmask.
 *
 * Returns the refine result, or -EINVAL if @var is neither a mask nor an
 * interval.
 */
static int _snd_pcm_hw_param_set(struct snd_pcm_hw_params *params,
                                 snd_pcm_hw_param_t var, unsigned int val,
                                 int dir)
{
        int res;

        if (hw_is_mask(var)) {
                struct snd_mask *mask = hw_param_mask(params, var);

                if (dir < 0 && val == 0) {
                        snd_mask_none(mask);
                        res = -EINVAL;
                } else {
                        if (dir > 0)
                                val++;
                        else if (dir < 0)
                                val--;
                        res = snd_mask_refine_set(mask, val);
                }
        } else if (hw_is_interval(var)) {
                struct snd_interval *ival = hw_param_interval(params, var);

                if (dir < 0 && val == 0) {
                        snd_interval_none(ival);
                        res = -EINVAL;
                } else if (dir == 0) {
                        res = snd_interval_refine_set(ival, val);
                } else {
                        /* non-integer open range adjacent to val on the side given by dir */
                        struct snd_interval range = {
                                .min = dir < 0 ? val - 1 : val,
                                .max = dir < 0 ? val : val + 1,
                                .openmin = 1,
                                .openmax = 1,
                                .integer = 0,
                                .empty = 0,
                        };

                        res = snd_interval_refine(ival, &range);
                }
        } else {
                return -EINVAL;
        }

        if (res > 0) {
                params->cmask |= 1 << var;
                params->rmask |= 1 << var;
        }
        return res;
}

/**
 * snd_pcm_hw_param_set - fix a parameter to one value
 * @pcm: PCM instance
 * @params: the hw_params instance
 * @var: parameter to set
 * @val: value to set
 * @dir: direction (-1,0,1), passed by value
 *
 * Inside the configuration space defined by @params, remove from @var all
 * values != @val, then refine the configuration space if any parameter is
 * flagged in rmask.
 *
 * Return: the result of snd_pcm_hw_param_value() for @var, or a negative
 * error code (-EINVAL if the configuration space is empty).
 */
static int snd_pcm_hw_param_set(struct snd_pcm_substream *pcm,
                                struct snd_pcm_hw_params *params,
                                snd_pcm_hw_param_t var, unsigned int val,
                                int dir)
{
        int err;

        err = _snd_pcm_hw_param_set(params, var, val, dir);
        if (err < 0)
                return err;

        if (params->rmask) {
                err = snd_pcm_hw_refine(pcm, params);
                if (err < 0)
                        return err;
        }

        return snd_pcm_hw_param_value(params, var, NULL);
}

/*
 * Call snd_interval_setinteger() on interval parameter @var of @params.
 * When the interval changed, the parameter's bit is set in both cmask and
 * rmask.  Returns the result of snd_interval_setinteger().
 */
static int _snd_pcm_hw_param_setinteger(struct snd_pcm_hw_params *params,
                                        snd_pcm_hw_param_t var)
{
        int res = snd_interval_setinteger(hw_param_interval(params, var));

        if (res <= 0)
                return res;

        params->cmask |= 1 << var;
        params->rmask |= 1 << var;
        return res;
}
        
/*
 * plugin
 */

#ifdef CONFIG_SND_PCM_OSS_PLUGINS
/*
 * Free every plugin on the substream's OSS plugin chain and reset the
 * chain to empty.  Always returns 0.
 */
static int snd_pcm_oss_plugin_clear(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        struct snd_pcm_plugin *cur, *following;

        for (cur = runtime->oss.plugin_first; cur; cur = following) {
                /* fetch the link before the node is released */
                following = cur->next;
                snd_pcm_plugin_free(cur);
        }

        runtime->oss.plugin_last = NULL;
        runtime->oss.plugin_first = NULL;
        return 0;
}

/*
 * Link @plugin at the head of its runtime's OSS plugin chain.
 * Always returns 0.
 */
static int snd_pcm_plugin_insert(struct snd_pcm_plugin *plugin)
{
        struct snd_pcm_runtime *runtime = plugin->plug->runtime;
        struct snd_pcm_plugin *old_head = runtime->oss.plugin_first;

        plugin->prev = NULL;
        plugin->next = old_head;

        if (old_head)
                old_head->prev = plugin;
        else
                runtime->oss.plugin_last = plugin;      /* chain was empty */

        runtime->oss.plugin_first = plugin;
        return 0;
}

/*
 * Link @plugin at the tail of its runtime's OSS plugin chain.
 * Always returns 0.
 */
int snd_pcm_plugin_append(struct snd_pcm_plugin *plugin)
{
        struct snd_pcm_runtime *runtime = plugin->plug->runtime;
        struct snd_pcm_plugin *old_tail = runtime->oss.plugin_last;

        plugin->next = NULL;
        plugin->prev = old_tail;

        if (old_tail)
                old_tail->next = plugin;
        else
                runtime->oss.plugin_first = plugin;     /* chain was empty */

        runtime->oss.plugin_last = plugin;
        return 0;
}
#endif /* CONFIG_SND_PCM_OSS_PLUGINS */

static long snd_pcm_oss_bytes(struct snd_pcm_substream *substream, long frames)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        long buffer_size = snd_pcm_lib_buffer_bytes(substream);
        long bytes = frames_to_bytes(runtime, frames);
        if (buffer_size == runtime->oss.buffer_bytes)
                return bytes;
#if BITS_PER_LONG >= 64
        return runtime->oss.buffer_bytes * bytes / buffer_size;
#else
        {
                u64 bsize = (u64)runtime->oss.buffer_bytes * (u64)bytes;
                return div_u64(bsize, buffer_size);
        }
#endif
}

static long snd_pcm_alsa_frames(struct snd_pcm_substream *substream, long bytes)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        long buffer_size = snd_pcm_lib_buffer_bytes(substream);
        if (buffer_size == runtime->oss.buffer_bytes)
                return bytes_to_frames(runtime, bytes);
        return bytes_to_frames(runtime, (buffer_size * bytes) / runtime->oss.buffer_bytes);
}

/* Return the runtime's hw_ptr_interrupt value. */
static inline snd_pcm_uframes_t
get_hw_ptr_period(struct snd_pcm_runtime *runtime)
{
        snd_pcm_uframes_t pos = runtime->hw_ptr_interrupt;

        return pos;
}

/*
 * define extended formats in the recent OSS versions (if any)
 * Each constant below is a distinct single-bit value.
 */
/* linear formats */
#define AFMT_S32_LE      0x00001000
#define AFMT_S32_BE      0x00002000
#define AFMT_S24_LE      0x00008000
#define AFMT_S24_BE      0x00010000
#define AFMT_S24_PACKED  0x00040000

/* other supported formats */
#define AFMT_FLOAT       0x00004000
#define AFMT_SPDIF_RAW   0x00020000

/*
 * unsupported formats: neither snd_pcm_oss_format_from() nor
 * snd_pcm_oss_format_to() has a mapping for these
 */
#define AFMT_AC3         0x00000400
#define AFMT_VORBIS      0x00000800

/*
 * Translate an OSS AFMT_* format code into the matching ALSA PCM format.
 * Codes without a mapping fall back to SNDRV_PCM_FORMAT_U8.
 */
static snd_pcm_format_t snd_pcm_oss_format_from(int format)
{
        static const struct {
                int oss;
                snd_pcm_format_t alsa;
        } map[] = {
                { AFMT_MU_LAW,     SNDRV_PCM_FORMAT_MU_LAW },
                { AFMT_A_LAW,      SNDRV_PCM_FORMAT_A_LAW },
                { AFMT_IMA_ADPCM,  SNDRV_PCM_FORMAT_IMA_ADPCM },
                { AFMT_U8,         SNDRV_PCM_FORMAT_U8 },
                { AFMT_S16_LE,     SNDRV_PCM_FORMAT_S16_LE },
                { AFMT_S16_BE,     SNDRV_PCM_FORMAT_S16_BE },
                { AFMT_S8,         SNDRV_PCM_FORMAT_S8 },
                { AFMT_U16_LE,     SNDRV_PCM_FORMAT_U16_LE },
                { AFMT_U16_BE,     SNDRV_PCM_FORMAT_U16_BE },
                { AFMT_MPEG,       SNDRV_PCM_FORMAT_MPEG },
                { AFMT_S32_LE,     SNDRV_PCM_FORMAT_S32_LE },
                { AFMT_S32_BE,     SNDRV_PCM_FORMAT_S32_BE },
                { AFMT_S24_LE,     SNDRV_PCM_FORMAT_S24_LE },
                { AFMT_S24_BE,     SNDRV_PCM_FORMAT_S24_BE },
                { AFMT_S24_PACKED, SNDRV_PCM_FORMAT_S24_3LE },
                { AFMT_FLOAT,      SNDRV_PCM_FORMAT_FLOAT },
                { AFMT_SPDIF_RAW,  SNDRV_PCM_FORMAT_IEC958_SUBFRAME },
        };
        unsigned int idx;

        for (idx = 0; idx < sizeof(map) / sizeof(map[0]); idx++) {
                if (map[idx].oss == format)
                        return map[idx].alsa;
        }
        return SNDRV_PCM_FORMAT_U8;
}

/*
 * Translate an ALSA PCM format into the matching OSS AFMT_* code.
 * Returns -EINVAL for formats without an OSS equivalent.
 */
static int snd_pcm_oss_format_to(snd_pcm_format_t format)
{
        static const struct {
                snd_pcm_format_t alsa;
                int oss;
        } map[] = {
                { SNDRV_PCM_FORMAT_MU_LAW,          AFMT_MU_LAW },
                { SNDRV_PCM_FORMAT_A_LAW,           AFMT_A_LAW },
                { SNDRV_PCM_FORMAT_IMA_ADPCM,       AFMT_IMA_ADPCM },
                { SNDRV_PCM_FORMAT_U8,              AFMT_U8 },
                { SNDRV_PCM_FORMAT_S16_LE,          AFMT_S16_LE },
                { SNDRV_PCM_FORMAT_S16_BE,          AFMT_S16_BE },
                { SNDRV_PCM_FORMAT_S8,              AFMT_S8 },
                { SNDRV_PCM_FORMAT_U16_LE,          AFMT_U16_LE },
                { SNDRV_PCM_FORMAT_U16_BE,          AFMT_U16_BE },
                { SNDRV_PCM_FORMAT_MPEG,            AFMT_MPEG },
                { SNDRV_PCM_FORMAT_S32_LE,          AFMT_S32_LE },
                { SNDRV_PCM_FORMAT_S32_BE,          AFMT_S32_BE },
                { SNDRV_PCM_FORMAT_S24_LE,          AFMT_S24_LE },
                { SNDRV_PCM_FORMAT_S24_BE,          AFMT_S24_BE },
                { SNDRV_PCM_FORMAT_S24_3LE,         AFMT_S24_PACKED },
                { SNDRV_PCM_FORMAT_FLOAT,           AFMT_FLOAT },
                { SNDRV_PCM_FORMAT_IEC958_SUBFRAME, AFMT_SPDIF_RAW },
        };
        unsigned int idx;

        for (idx = 0; idx < sizeof(map) / sizeof(map[0]); idx++) {
                if (map[idx].alsa == format)
                        return map[idx].oss;
        }
        return -EINVAL;
}

static int snd_pcm_oss_period_size(struct snd_pcm_substream *substream, 
                                   struct snd_pcm_hw_params *oss_params,
                                   struct snd_pcm_hw_params *slave_params)
{
        ssize_t s;
        ssize_t oss_buffer_size;
        ssize_t oss_period_size, oss_periods;
        ssize_t min_period_size, max_period_size;
        struct snd_pcm_runtime *runtime = substream->runtime;
        size_t oss_frame_size;

        oss_frame_size = snd_pcm_format_physical_width(params_format(oss_params)) *
                         params_channels(oss_params) / 8;

        oss_buffer_size = snd_pcm_hw_param_value_max(slave_params,
                                                     SNDRV_PCM_HW_PARAM_BUFFER_SIZE,
                                                     NULL);
        if (oss_buffer_size <= 0)
                return -EINVAL;
        oss_buffer_size = snd_pcm_plug_client_size(substream,
                                                   oss_buffer_size * oss_frame_size);
        if (oss_buffer_size <= 0)
                return -EINVAL;
        oss_buffer_size = rounddown_pow_of_two(oss_buffer_size);
        if (atomic_read(&substream->mmap_count)) {
                if (oss_buffer_size > runtime->oss.mmap_bytes)
                        oss_buffer_size = runtime->oss.mmap_bytes;
        }

        if (substream->oss.setup.period_size > 16)
                oss_period_size = substream->oss.setup.period_size;
        else if (runtime->oss.fragshift) {
                oss_period_size = 1 << runtime->oss.fragshift;
                if (oss_period_size > oss_buffer_size / 2)
                        oss_period_size = oss_buffer_size / 2;
        } else {
                int sd;
                size_t bytes_per_sec = params_rate(oss_params) * snd_pcm_format_physical_width(params_format(oss_params)) * params_channels(oss_params) / 8;

                oss_period_size = oss_buffer_size;
                do {
                        oss_period_size /= 2;
                } while (oss_period_size > bytes_per_sec);
                if (runtime->oss.subdivision == 0) {
                        sd = 4;
                        if (oss_period_size / sd > 4096)
                                sd *= 2;
                        if (oss_period_size / sd < 4096)
                                sd = 1;
                } else
                        sd = runtime->oss.subdivision;
                oss_period_size /= sd;
                if (oss_period_size < 16)
                        oss_period_size = 16;
        }

        min_period_size = snd_pcm_plug_client_size(substream,
                                                   snd_pcm_hw_param_value_min(slave_params, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, NULL));
        if (min_period_size > 0) {
                min_period_size *= oss_frame_size;
                min_period_size = roundup_pow_of_two(min_period_size);
                if (oss_period_size < min_period_size)
                        oss_period_size = min_period_size;
        }

        max_period_size = snd_pcm_plug_client_size(substream,
                                                   snd_pcm_hw_param_value_max(slave_params, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, NULL));
        if (max_period_size > 0) {
                max_period_size *= oss_frame_size;
                max_period_size = rounddown_pow_of_two(max_period_size);
                if (oss_period_size > max_period_size)
                        oss_period_size = max_period_size;
        }

        oss_periods = oss_buffer_size / oss_period_size;

        if (substream->oss.setup.periods > 1)
                oss_periods = substream->oss.setup.periods;

        s = snd_pcm_hw_param_value_max(slave_params, SNDRV_PCM_HW_PARAM_PERIODS, NULL);
        if (s > 0 && runtime->oss.maxfrags && s > runtime->oss.maxfrags)
                s = runtime->oss.maxfrags;
        if (oss_periods > s)
                oss_periods = s;

        s = snd_pcm_hw_param_value_min(slave_params, SNDRV_PCM_HW_PARAM_PERIODS, NULL);
        if (s < 2)
                s = 2;
        if (oss_periods < s)
                oss_periods = s;

        while (oss_period_size * oss_periods > oss_buffer_size)
                oss_period_size /= 2;

        if (oss_period_size < 16)
                return -EINVAL;

        /* don't allocate too large period; 1MB period must be enough */
        if (oss_period_size > 1024 * 1024)
                return -ENOMEM;

        runtime->oss.period_bytes = oss_period_size;
        runtime->oss.period_frames = 1;
        runtime->oss.periods = oss_periods;
        return 0;
}

static int choose_rate(struct snd_pcm_substream *substream,
                       struct snd_pcm_hw_params *params, unsigned int best_rate)
{
        const struct snd_interval *it;
        struct snd_pcm_hw_params *save __free(kfree) = NULL;
        unsigned int rate, prev;

        save = kmalloc(sizeof(*save), GFP_KERNEL);
        if (save == NULL)
                return -ENOMEM;
        *save = *params;
        it = hw_param_interval_c(save, SNDRV_PCM_HW_PARAM_RATE);

        /* try multiples of the best rate */
        rate = best_rate;
        for (;;) {
                if (it->max < rate || (it->max == rate && it->openmax))
                        break;
                if (it->min < rate || (it->min == rate && !it->openmin)) {
                        int ret;
                        ret = snd_pcm_hw_param_set(substream, params,
                                                   SNDRV_PCM_HW_PARAM_RATE,
                                                   rate, 0);
                        if (ret == (int)rate)
                                return rate;
                        *params = *save;
                }
                prev = rate;
                rate += best_rate;
                if (rate <= prev)
                        break;
        }

        /* not found, use the nearest rate */
        return snd_pcm_hw_param_near(substream, params, SNDRV_PCM_HW_PARAM_RATE, best_rate, NULL);
}

/*
 * Take the parameter lock for a parameter change.
 *
 * Returns -ERESTARTSYS if interrupted while waiting for the lock, and
 * -EBUSY (with the lock already dropped again) if a read/write is
 * currently in progress, i.e. oss.rw_ref is non-zero.  Returns 0 with
 * the lock held otherwise.
 */
static int lock_params(struct snd_pcm_runtime *runtime)
{
        if (mutex_lock_interruptible(&runtime->oss.params_lock) != 0)
                return -ERESTARTSYS;

        if (!atomic_read(&runtime->oss.rw_ref))
                return 0;

        /* streaming is active: refuse and give the lock back */
        mutex_unlock(&runtime->oss.params_lock);
        return -EBUSY;
}

/* Release runtime->oss.params_lock; counterpart of a successful lock_params(). */
static void unlock_params(struct snd_pcm_runtime *runtime)
{
        mutex_unlock(&runtime->oss.params_lock);
}

/*
 * Free the OSS period buffer of @substream and reset the pointer to NULL.
 * When plugin support is compiled in, the plugin chain is cleared as well.
 */
static void snd_pcm_oss_release_buffers(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        kvfree(rt->oss.buffer);
        rt->oss.buffer = NULL;
#ifdef CONFIG_SND_PCM_OSS_PLUGINS
        snd_pcm_oss_plugin_clear(substream);
#endif
}

/*
 * Apply the pending OSS parameters (rate, channels, format, period
 * layout) to the underlying PCM substream.
 *
 * Call with params_lock held.
 *
 * Returns 0 immediately when runtime->oss.params is not set.  Otherwise
 * the slave hw_params are negotiated, DROP / HW_PARAMS / SW_PARAMS are
 * issued on the substream, the plugin chain is rebuilt (when plugin
 * support is compiled in and direct mode is off) and the period-sized
 * buffer runtime->oss.buffer is reallocated.  On success
 * runtime->oss.params is cleared and runtime->oss.prepare is set.
 *
 * Returns 0 on success or a negative error code.  On any failure the OSS
 * buffers are released through snd_pcm_oss_release_buffers().
 */
static int snd_pcm_oss_change_params_locked(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        struct snd_pcm_hw_params *params, *sparams;
        struct snd_pcm_sw_params *sw_params;
        ssize_t oss_buffer_size, oss_period_size;
        size_t oss_frame_size;
        int err;
        int direct;
        snd_pcm_format_t format, sformat;
        int n;
        const struct snd_mask *sformat_mask;
        struct snd_mask mask;

        if (!runtime->oss.params)
                return 0;
        /*
         * All three are allocated before checking; kfree(NULL) is a no-op,
         * so a partial failure can use the common exit path.
         */
        sw_params = kzalloc(sizeof(*sw_params), GFP_KERNEL);
        params = kmalloc(sizeof(*params), GFP_KERNEL);
        sparams = kmalloc(sizeof(*sparams), GFP_KERNEL);
        if (!sw_params || !params || !sparams) {
                err = -ENOMEM;
                goto failure;
        }

        /* an mmapped stream is always direct; otherwise follow the setup */
        if (atomic_read(&substream->mmap_count))
                direct = 1;
        else
                direct = substream->oss.setup.direct;

        /* slave side: start from "any", integer PERIODS with a minimum of 2 */
        _snd_pcm_hw_params_any(sparams);
        _snd_pcm_hw_param_setinteger(sparams, SNDRV_PCM_HW_PARAM_PERIODS);
        _snd_pcm_hw_param_min(sparams, SNDRV_PCM_HW_PARAM_PERIODS, 2, 0);
        /*
         * Allowed access types: MMAP_INTERLEAVED when mmapped, else
         * RW_INTERLEAVED plus, in non-direct mode, RW_NONINTERLEAVED.
         */
        snd_mask_none(&mask);
        if (atomic_read(&substream->mmap_count))
                snd_mask_set(&mask, (__force int)SNDRV_PCM_ACCESS_MMAP_INTERLEAVED);
        else {
                snd_mask_set(&mask, (__force int)SNDRV_PCM_ACCESS_RW_INTERLEAVED);
                if (!direct)
                        snd_mask_set(&mask, (__force int)SNDRV_PCM_ACCESS_RW_NONINTERLEAVED);
        }
        err = snd_pcm_hw_param_mask(substream, sparams, SNDRV_PCM_HW_PARAM_ACCESS, &mask);
        if (err < 0) {
                pcm_dbg(substream->pcm, "No usable accesses\n");
                err = -EINVAL;
                goto failure;
        }

        /* slave rate and channel count, as close to the OSS request as possible */
        err = choose_rate(substream, sparams, runtime->oss.rate);
        if (err < 0)
                goto failure;
        err = snd_pcm_hw_param_near(substream, sparams,
                                    SNDRV_PCM_HW_PARAM_CHANNELS,
                                    runtime->oss.channels, NULL);
        if (err < 0)
                goto failure;

        format = snd_pcm_oss_format_from(runtime->oss.format);

        /* slave format: the client format in direct mode, else ask the plug layer */
        sformat_mask = hw_param_mask_c(sparams, SNDRV_PCM_HW_PARAM_FORMAT);
        if (direct)
                sformat = format;
        else
                sformat = snd_pcm_plug_slave_format(format, sformat_mask);

        /*
         * If that format is invalid or not in the slave's mask, fall back to
         * the first format in iteration order that is in the mask and has an
         * OSS equivalent.
         */
        if ((__force int)sformat < 0 ||
            !snd_mask_test_format(sformat_mask, sformat)) {
                pcm_for_each_format(sformat) {
                        if (snd_mask_test_format(sformat_mask, sformat) &&
                            snd_pcm_oss_format_to(sformat) >= 0)
                                goto format_found;
                }
                pcm_dbg(substream->pcm, "Cannot find a format!!!\n");
                err = -EINVAL;
                goto failure;
        }
 format_found:
        err = _snd_pcm_hw_param_set(sparams, SNDRV_PCM_HW_PARAM_FORMAT, (__force int)sformat, 0);
        if (err < 0)
                goto failure;

        if (direct) {
                /* direct mode: client parameters are a copy of the slave's */
                memcpy(params, sparams, sizeof(*params));
        } else {
                /*
                 * Client side: interleaved RW access with exactly the
                 * format, channels and rate requested through OSS.
                 */
                _snd_pcm_hw_params_any(params);
                _snd_pcm_hw_param_set(params, SNDRV_PCM_HW_PARAM_ACCESS,
                                      (__force int)SNDRV_PCM_ACCESS_RW_INTERLEAVED, 0);
                _snd_pcm_hw_param_set(params, SNDRV_PCM_HW_PARAM_FORMAT,
                                      (__force int)snd_pcm_oss_format_from(runtime->oss.format), 0);
                _snd_pcm_hw_param_set(params, SNDRV_PCM_HW_PARAM_CHANNELS,
                                      runtime->oss.channels, 0);
                _snd_pcm_hw_param_set(params, SNDRV_PCM_HW_PARAM_RATE,
                                      runtime->oss.rate, 0);
                pdprintf("client: access = %i, format = %i, channels = %i, rate = %i\n",
                         params_access(params), params_format(params),
                         params_channels(params), params_rate(params));
        }
        pdprintf("slave: access = %i, format = %i, channels = %i, rate = %i\n",
                 params_access(sparams), params_format(sparams),
                 params_channels(sparams), params_rate(sparams));

        /* bytes per client-side frame */
        oss_frame_size = snd_pcm_format_physical_width(params_format(params)) *
                         params_channels(params) / 8;

        /* fills in runtime->oss.period_bytes and runtime->oss.periods */
        err = snd_pcm_oss_period_size(substream, params, sparams);
        if (err < 0)
                goto failure;

        /* translate the OSS period (in client frames) to slave frames */
        n = snd_pcm_plug_slave_size(substream, runtime->oss.period_bytes / oss_frame_size);
        err = snd_pcm_hw_param_near(substream, sparams, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, n, NULL);
        if (err < 0)
                goto failure;

        err = snd_pcm_hw_param_near(substream, sparams, SNDRV_PCM_HW_PARAM_PERIODS,
                                     runtime->oss.periods, NULL);
        if (err < 0)
                goto failure;

        /* stop the stream before reconfiguring; the result is not checked */
        snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL);

        err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_HW_PARAMS, sparams);
        if (err < 0) {
                pcm_dbg(substream->pcm, "HW_PARAMS failed: %i\n", err);
                goto failure;
        }

#ifdef CONFIG_SND_PCM_OSS_PLUGINS
        /* drop the old plugin chain and build a new one for the new params */
        snd_pcm_oss_plugin_clear(substream);
        if (!direct) {
                /* add necessary plugins */
                err = snd_pcm_plug_format_plugins(substream, params, sparams);
                if (err < 0) {
                        pcm_dbg(substream->pcm,
                                "snd_pcm_plug_format_plugins failed: %i\n", err);
                        goto failure;
                }
                /* if any conversion plugin was added, add an I/O plugin too */
                if (runtime->oss.plugin_first) {
                        struct snd_pcm_plugin *plugin;
                        err = snd_pcm_plugin_build_io(substream, sparams, &plugin);
                        if (err < 0) {
                                pcm_dbg(substream->pcm,
                                        "snd_pcm_plugin_build_io failed: %i\n", err);
                                goto failure;
                        }
                        /* appended for playback, inserted for capture */
                        if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) {
                                err = snd_pcm_plugin_append(plugin);
                        } else {
                                err = snd_pcm_plugin_insert(plugin);
                        }
                        if (err < 0)
                                goto failure;
                }
        }
#endif

        /* start threshold: 1 frame when the trigger is enabled, else the boundary */
        if (runtime->oss.trigger) {
                sw_params->start_threshold = 1;
        } else {
                sw_params->start_threshold = runtime->boundary;
        }
        /*
         * stop threshold: the boundary for mmap and capture streams,
         * the buffer size for plain playback
         */
        if (atomic_read(&substream->mmap_count) ||
            substream->stream == SNDRV_PCM_STREAM_CAPTURE)
                sw_params->stop_threshold = runtime->boundary;
        else
                sw_params->stop_threshold = runtime->buffer_size;
        sw_params->tstamp_mode = SNDRV_PCM_TSTAMP_NONE;
        sw_params->period_step = 1;
        sw_params->avail_min = substream->stream == SNDRV_PCM_STREAM_PLAYBACK ?
                1 : runtime->period_size;
        /* silence filling is disabled for mmap or when "nosilence" is set up */
        if (atomic_read(&substream->mmap_count) ||
            substream->oss.setup.nosilence) {
                sw_params->silence_threshold = 0;
                sw_params->silence_size = 0;
        } else {
                /* one period plus 16 frames, capped at the buffer size */
                snd_pcm_uframes_t frames;
                frames = runtime->period_size + 16;
                if (frames > runtime->buffer_size)
                        frames = runtime->buffer_size;
                sw_params->silence_threshold = frames;
                sw_params->silence_size = frames;
        }

        err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_SW_PARAMS, sw_params);
        if (err < 0) {
                pcm_dbg(substream->pcm, "SW_PARAMS failed: %i\n", err);
                goto failure;
        }

        /*
         * Recompute the OSS-side geometry from sparams (presumably the
         * values actually selected by HW_PARAMS -- confirm).
         */
        runtime->oss.periods = params_periods(sparams);
        oss_period_size = snd_pcm_plug_client_size(substream, params_period_size(sparams));
        if (oss_period_size < 0) {
                err = -EINVAL;
                goto failure;
        }
#ifdef CONFIG_SND_PCM_OSS_PLUGINS
        if (runtime->oss.plugin_first) {
                /* oss_period_size is still in frames at this point */
                err = snd_pcm_plug_alloc(substream, oss_period_size);
                if (err < 0)
                        goto failure;
        }
#endif
        /*
         * Convert frames to bytes.  array_size() saturates to SIZE_MAX on
         * overflow, which reads as a negative value in the ssize_t variables
         * and is rejected by the "<= 0" check below.
         */
        oss_period_size = array_size(oss_period_size, oss_frame_size);
        oss_buffer_size = array_size(oss_period_size, runtime->oss.periods);
        if (oss_buffer_size <= 0) {
                err = -EINVAL;
                goto failure;
        }

        runtime->oss.period_bytes = oss_period_size;
        runtime->oss.buffer_bytes = oss_buffer_size;

        pdprintf("oss: period bytes = %i, buffer bytes = %i\n",
                 runtime->oss.period_bytes,
                 runtime->oss.buffer_bytes);
        pdprintf("slave: period_size = %i, buffer_size = %i\n",
                 params_period_size(sparams),
                 params_buffer_size(sparams));

        /* store the client-side format, channels and rate back into the OSS state */
        runtime->oss.format = snd_pcm_oss_format_to(params_format(params));
        runtime->oss.channels = params_channels(params);
        runtime->oss.rate = params_rate(params);

        /* replace the period buffer with a zeroed one of the new size */
        kvfree(runtime->oss.buffer);
        runtime->oss.buffer = kvzalloc(runtime->oss.period_bytes, GFP_KERNEL);
        if (!runtime->oss.buffer) {
                err = -ENOMEM;
                goto failure;
        }

        /* parameters are applied now; a PREPARE is still pending */
        runtime->oss.params = 0;
        runtime->oss.prepare = 1;
        runtime->oss.buffer_used = 0;
        /* fill the whole DMA area with silence for the current format */
        if (runtime->dma_area)
                snd_pcm_format_set_silence(runtime->format, runtime->dma_area, bytes_to_samples(runtime, runtime->dma_bytes));

        runtime->oss.period_frames = snd_pcm_alsa_frames(substream, oss_period_size);

        err = 0;
failure:
        /* common exit path: on error drop the period buffer (and plugins) */
        if (err)
                snd_pcm_oss_release_buffers(substream);
        kfree(sw_params);
        kfree(params);
        kfree(sparams);
        return err;
}

/*
 * Locking wrapper around snd_pcm_oss_change_params_locked().
 *
 * With @trylock set, the params_lock is only tried and -EAGAIN is
 * returned when it is contended.  Otherwise the caller sleeps
 * interruptibly and gets -ERESTARTSYS if interrupted while waiting.
 * In all other cases the result of the locked variant is returned.
 */
static int snd_pcm_oss_change_params(struct snd_pcm_substream *substream,
                                     bool trylock)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        int ret;

        if (!trylock) {
                if (mutex_lock_interruptible(&runtime->oss.params_lock))
                        return -ERESTARTSYS;
        } else if (!mutex_trylock(&runtime->oss.params_lock)) {
                return -EAGAIN;
        }

        ret = snd_pcm_oss_change_params_locked(substream);
        mutex_unlock(&runtime->oss.params_lock);
        return ret;
}

/*
 * Apply pending parameter changes on every open stream of @pcm_oss_file
 * and report the first non-NULL substream through @r_substream (which
 * may be NULL if the caller is not interested).
 *
 * Returns 0 on success, -EIO when the file has no substream at all, or
 * the error from snd_pcm_oss_change_params().
 */
static int snd_pcm_oss_get_active_substream(struct snd_pcm_oss_file *pcm_oss_file, struct snd_pcm_substream **r_substream)
{
        struct snd_pcm_substream *first = NULL;
        int dir;

        for (dir = 0; dir < 2; dir++) {
                struct snd_pcm_substream *cur = pcm_oss_file->streams[dir];
                int ret;

                if (!cur)
                        continue;
                if (!first)
                        first = cur;
                if (!cur->runtime->oss.params)
                        continue;

                ret = snd_pcm_oss_change_params(cur, false);
                if (ret < 0)
                        return ret;
        }

        if (!first)
                return -EIO;
        if (r_substream)
                *r_substream = first;
        return 0;
}

/*
 * Issue SNDRV_PCM_IOCTL_PREPARE on @substream and, on success, reset the
 * OSS-side bookkeeping (prepare flag, period pointers, buffer fill).
 *
 * Call with params_lock held.
 *
 * NOTE: PREPARE is always issued, regardless of whether
 * runtime->oss.prepare is set or not.
 */
static int snd_pcm_oss_prepare(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        int ret = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_PREPARE, NULL);

        if (ret < 0) {
                pcm_dbg(substream->pcm,
                        "snd_pcm_oss_prepare: SNDRV_PCM_IOCTL_PREPARE failed\n");
                return ret;
        }

        rt->oss.buffer_used = 0;
        rt->oss.period_ptr = 0;
        rt->oss.prev_hw_ptr_period = 0;
        rt->oss.prepare = 0;
        return 0;
}

/*
 * Bring @substream into a usable state: apply pending parameter changes
 * and, if a prepare is pending, run it under params_lock.
 *
 * This variant takes the lock itself.  Returns 0 or a negative error
 * code (-ERESTARTSYS if interrupted while waiting for the lock).
 */
static int snd_pcm_oss_make_ready(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        int ret;

        if (rt->oss.params) {
                ret = snd_pcm_oss_change_params(substream, false);
                if (ret < 0)
                        return ret;
        }

        if (!rt->oss.prepare)
                return 0;

        if (mutex_lock_interruptible(&rt->oss.params_lock))
                return -ERESTARTSYS;
        ret = snd_pcm_oss_prepare(substream);
        mutex_unlock(&rt->oss.params_lock);

        return ret < 0 ? ret : 0;
}

/*
 * Same as snd_pcm_oss_make_ready(), but for callers that already hold
 * params_lock: apply pending parameter changes, then a pending prepare.
 *
 * Returns 0 or a negative error code.
 */
static int snd_pcm_oss_make_ready_locked(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        int ret;

        if (rt->oss.params) {
                ret = snd_pcm_oss_change_params_locked(substream);
                if (ret < 0)
                        return ret;
        }

        if (!rt->oss.prepare)
                return 0;

        ret = snd_pcm_oss_prepare(substream);
        return ret < 0 ? ret : 0;
}

/*
 * Query the current delay of @substream into @delay.  While the delay
 * exceeds the buffer size, forward the stream by whole periods and
 * query again.
 *
 * Returns the result of the last ioctl: negative on error, otherwise
 * the value returned by the DELAY ioctl.
 */
static int snd_pcm_oss_capture_position_fixup(struct snd_pcm_substream *substream, snd_pcm_sframes_t *delay)
{
        int ret;

        do {
                struct snd_pcm_runtime *rt;
                snd_pcm_uframes_t skip;

                ret = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DELAY, delay);
                if (ret < 0)
                        return ret;

                rt = substream->runtime;
                if (*delay <= (snd_pcm_sframes_t)rt->buffer_size)
                        return ret;

                /* in case of overrun, skip whole periods like OSS/Linux driver does */
                /* until avail(delay) <= buffer_size */
                skip = (*delay - rt->buffer_size) + rt->period_size - 1;
                skip = (skip / rt->period_size) * rt->period_size;
                ret = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_FORWARD, &skip);
        } while (ret >= 0);

        return ret;
}

/*
 * Transfer @frames frames from @ptr to @substream, re-preparing the
 * stream whenever it is found in XRUN or SUSPENDED state.
 *
 * params_lock is released around the transfer itself and re-acquired
 * afterwards, so the caller must hold it on entry (presumably so that a
 * blocking transfer does not stall parameter changes -- confirm).
 *
 * Returns the transfer result, a negative error from the prepare, or
 * -EAGAIN when the transfer reported -EPIPE/-ESTRPIPE while the stream
 * is in PREPARED state.
 */
snd_pcm_sframes_t snd_pcm_oss_write3(struct snd_pcm_substream *substream, const char *ptr, snd_pcm_uframes_t frames, int in_kernel)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        int rc;

        do {
                if (runtime->state == SNDRV_PCM_STATE_XRUN ||
                    runtime->state == SNDRV_PCM_STATE_SUSPENDED) {
#ifdef OSS_DEBUG
                        pcm_dbg(substream->pcm,
                                "pcm_oss: write: recovering from %s\n",
                                runtime->state == SNDRV_PCM_STATE_XRUN ?
                                "XRUN" : "SUSPEND");
#endif
                        rc = snd_pcm_oss_prepare(substream);
                        if (rc < 0)
                                return rc;
                }

                mutex_unlock(&runtime->oss.params_lock);
                rc = __snd_pcm_lib_xfer(substream, (void *)ptr, true,
                                        frames, in_kernel);
                mutex_lock(&runtime->oss.params_lock);

                /* anything but an xrun/suspend error is final */
                if (rc != -EPIPE && rc != -ESTRPIPE)
                        return rc;

                /* test, if we can't store new data, because the stream */
                /* has not been started */
        } while (runtime->state != SNDRV_PCM_STATE_PREPARED);

        return -EAGAIN;
}

/*
 * Transfer @frames frames from @substream into @ptr.
 *
 * Before every attempt the stream is drained when in XRUN/SUSPENDED
 * state, or prepared when in SETUP state, and the capture position is
 * fixed up.  params_lock is released around the transfer itself and
 * re-acquired afterwards, so the caller must hold it on entry.
 *
 * -EPIPE and -ESTRPIPE from the transfer cause a retry; on -EPIPE a
 * stream found in DRAINING state is dropped first.  Any other result,
 * or an error from one of the helper steps, is returned.
 */
snd_pcm_sframes_t snd_pcm_oss_read3(struct snd_pcm_substream *substream, char *ptr, snd_pcm_uframes_t frames, int in_kernel)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        snd_pcm_sframes_t delay;
        int rc;

        for (;;) {
                if (runtime->state == SNDRV_PCM_STATE_XRUN ||
                    runtime->state == SNDRV_PCM_STATE_SUSPENDED) {
#ifdef OSS_DEBUG
                        pcm_dbg(substream->pcm,
                                "pcm_oss: read: recovering from %s\n",
                                runtime->state == SNDRV_PCM_STATE_XRUN ?
                                "XRUN" : "SUSPEND");
#endif
                        rc = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DRAIN, NULL);
                        if (rc < 0)
                                return rc;
                } else if (runtime->state == SNDRV_PCM_STATE_SETUP) {
                        rc = snd_pcm_oss_prepare(substream);
                        if (rc < 0)
                                return rc;
                }

                rc = snd_pcm_oss_capture_position_fixup(substream, &delay);
                if (rc < 0)
                        return rc;

                mutex_unlock(&runtime->oss.params_lock);
                rc = __snd_pcm_lib_xfer(substream, (void *)ptr, true,
                                        frames, in_kernel);
                mutex_lock(&runtime->oss.params_lock);

                if (rc == -ESTRPIPE)
                        continue;
                if (rc != -EPIPE)
                        return rc;

                /* -EPIPE: drop a draining stream, then try again */
                if (runtime->state == SNDRV_PCM_STATE_DRAINING) {
                        rc = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL);
                        if (rc < 0)
                                return rc;
                }
        }
}

#ifdef CONFIG_SND_PCM_OSS_PLUGINS
/*
 * Vectored counterpart of snd_pcm_oss_write3(): write @frames frames
 * from the buffers in @bufs via snd_pcm_kernel_writev(), re-preparing
 * the stream whenever it is found in XRUN or SUSPENDED state.
 *
 * Returns the write result, a negative error from the prepare, or
 * -EAGAIN when the write reported -EPIPE/-ESTRPIPE while the stream is
 * in PREPARED state.
 */
snd_pcm_sframes_t snd_pcm_oss_writev3(struct snd_pcm_substream *substream, void **bufs, snd_pcm_uframes_t frames)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        int rc;

        do {
                if (runtime->state == SNDRV_PCM_STATE_XRUN ||
                    runtime->state == SNDRV_PCM_STATE_SUSPENDED) {
#ifdef OSS_DEBUG
                        pcm_dbg(substream->pcm,
                                "pcm_oss: writev: recovering from %s\n",
                                runtime->state == SNDRV_PCM_STATE_XRUN ?
                                "XRUN" : "SUSPEND");
#endif
                        rc = snd_pcm_oss_prepare(substream);
                        if (rc < 0)
                                return rc;
                }

                rc = snd_pcm_kernel_writev(substream, bufs, frames);
                if (rc != -EPIPE && rc != -ESTRPIPE)
                        return rc;

                /* test, if we can't store new data, because the stream */
                /* has not been started */
        } while (runtime->state != SNDRV_PCM_STATE_PREPARED);

        return -EAGAIN;
}
        
/*
 * Vectored read of @frames frames into the buffers in @bufs via
 * snd_pcm_kernel_readv().
 *
 * Before every attempt the stream is drained when in XRUN/SUSPENDED
 * state, or prepared when in SETUP state.  The read is retried for as
 * long as it reports -EPIPE or -ESTRPIPE; any other result, or an
 * error from the drain/prepare step, is returned.
 */
snd_pcm_sframes_t snd_pcm_oss_readv3(struct snd_pcm_substream *substream, void **bufs, snd_pcm_uframes_t frames)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        int rc;

        do {
                if (runtime->state == SNDRV_PCM_STATE_XRUN ||
                    runtime->state == SNDRV_PCM_STATE_SUSPENDED) {
#ifdef OSS_DEBUG
                        pcm_dbg(substream->pcm,
                                "pcm_oss: readv: recovering from %s\n",
                                runtime->state == SNDRV_PCM_STATE_XRUN ?
                                "XRUN" : "SUSPEND");
#endif
                        rc = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DRAIN, NULL);
                        if (rc < 0)
                                return rc;
                } else if (runtime->state == SNDRV_PCM_STATE_SETUP) {
                        rc = snd_pcm_oss_prepare(substream);
                        if (rc < 0)
                                return rc;
                }

                rc = snd_pcm_kernel_readv(substream, bufs, frames);
        } while (rc == -EPIPE || rc == -ESTRPIPE);

        return rc;
}
#endif /* CONFIG_SND_PCM_OSS_PLUGINS */

/*
 * Write @bytes bytes from @buf to @substream.
 *
 * With a plugin chain present, the data goes through the plug layer;
 * user-space data (@in_kernel == 0) is first copied into
 * runtime->oss.buffer.  Without plugins the data is handed to
 * snd_pcm_oss_write3() directly.
 *
 * Returns the number of bytes written, 0 or a negative error code.
 */
static ssize_t snd_pcm_oss_write2(struct snd_pcm_substream *substream, const char *buf, size_t bytes, int in_kernel)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        snd_pcm_sframes_t want, done;
#ifdef CONFIG_SND_PCM_OSS_PLUGINS
        if (runtime->oss.plugin_first) {
                struct snd_pcm_plugin_channel *channels;
                /* bytes per frame at the input of the first plugin */
                size_t frame_bytes = (runtime->oss.plugin_first->src_width * runtime->oss.plugin_first->src_format.channels) / 8;

                if (!in_kernel) {
                        if (copy_from_user(runtime->oss.buffer, (const char __force __user *)buf, bytes))
                                return -EFAULT;
                        buf = runtime->oss.buffer;
                }
                want = bytes / frame_bytes;
                done = snd_pcm_plug_client_channels_buf(substream, (char *)buf, want, &channels);
                if (done < 0)
                        return done;
                done = snd_pcm_plug_write_transfer(substream, channels, done);
                if (done <= 0)
                        return done;
                bytes = done * frame_bytes;
                return bytes;
        }
#endif
        want = bytes_to_frames(runtime, bytes);
        done = snd_pcm_oss_write3(substream, buf, want, in_kernel);
        if (done <= 0)
                return done;
        bytes = frames_to_bytes(runtime, done);
        return bytes;
}

/*
 * Write @bytes bytes from the user buffer @buf to a playback substream.
 *
 * Returns -ENXIO when the substream is mmapped.  oss.rw_ref is raised
 * for the duration of the call.  params_lock is taken and released once
 * per loop iteration.
 *
 * Data that is shorter than one period, or that arrives while a partial
 * period is already staged, is collected in runtime->oss.buffer and
 * flushed from there.  Otherwise one full period at a time is passed
 * straight from user space to snd_pcm_oss_write2().
 *
 * Returns the number of bytes consumed if any were consumed, otherwise
 * the last error code (or 0).
 */
static ssize_t snd_pcm_oss_write1(struct snd_pcm_substream *substream, const char __user *buf, size_t bytes)
{
        size_t xfer = 0;
        ssize_t tmp = 0;
        struct snd_pcm_runtime *runtime = substream->runtime;

        if (atomic_read(&substream->mmap_count))
                return -ENXIO;

        atomic_inc(&runtime->oss.rw_ref);
        while (bytes > 0) {
                if (mutex_lock_interruptible(&runtime->oss.params_lock)) {
                        tmp = -ERESTARTSYS;
                        break;
                }
                /* apply pending params / prepare before touching the buffer */
                tmp = snd_pcm_oss_make_ready_locked(substream);
                if (tmp < 0)
                        goto err;
                if (bytes < runtime->oss.period_bytes || runtime->oss.buffer_used > 0) {
                        /* staging path: copy at most up to the end of the period buffer */
                        tmp = bytes;
                        if (tmp + runtime->oss.buffer_used > runtime->oss.period_bytes)
                                tmp = runtime->oss.period_bytes - runtime->oss.buffer_used;
                        if (tmp > 0) {
                                if (copy_from_user(runtime->oss.buffer + runtime->oss.buffer_used, buf, tmp)) {
                                        tmp = -EFAULT;
                                        goto err;
                                }
                        }
                        runtime->oss.buffer_used += tmp;
                        buf += tmp;
                        bytes -= tmp;
                        xfer += tmp;
                        /* flush when the period is complete, or always with "partialfrag" */
                        if (substream->oss.setup.partialfrag ||
                            runtime->oss.buffer_used == runtime->oss.period_bytes) {
                                /* write the part of the buffer not yet handed over (from period_ptr) */
                                tmp = snd_pcm_oss_write2(substream, runtime->oss.buffer + runtime->oss.period_ptr, 
                                                         runtime->oss.buffer_used - runtime->oss.period_ptr, 1);
                                if (tmp <= 0)
                                        goto err;
                                runtime->oss.bytes += tmp;
                                runtime->oss.period_ptr += tmp;
                                runtime->oss.period_ptr %= runtime->oss.period_bytes;
                                /* everything staged has been written: the buffer is empty again */
                                if (runtime->oss.period_ptr == 0 ||
                                    runtime->oss.period_ptr == runtime->oss.buffer_used)
                                        runtime->oss.buffer_used = 0;
                                else if ((substream->f_flags & O_NONBLOCK) != 0) {
                                        /* partially flushed in non-blocking mode: stop here */
                                        tmp = -EAGAIN;
                                        goto err;
                                }
                        }
                } else {
                        /* direct path: one full period straight from user space */
                        tmp = snd_pcm_oss_write2(substream,
                                                 (const char __force *)buf,
                                                 runtime->oss.period_bytes, 0);
                        if (tmp <= 0)
                                goto err;
                        runtime->oss.bytes += tmp;
                        buf += tmp;
                        bytes -= tmp;
                        xfer += tmp;
                        /* short write in non-blocking mode ends the loop */
                        if ((substream->f_flags & O_NONBLOCK) != 0 &&
                            tmp != runtime->oss.period_bytes)
                                tmp = -EAGAIN;
                }
 err:
                /* reached on both success and failure; tmp < 0 ends the loop */
                mutex_unlock(&runtime->oss.params_lock);
                if (tmp < 0)
                        break;
                if (signal_pending(current)) {
                        tmp = -ERESTARTSYS;
                        break;
                }
                tmp = 0;
        }
        atomic_dec(&runtime->oss.rw_ref);
        /* a partial transfer takes precedence over the error code */
        return xfer > 0 ? (snd_pcm_sframes_t)xfer : tmp;
}

/*
 * Read up to @bytes bytes from @substream into @buf.
 *
 * With a plugin chain present, the data is read through the plug layer;
 * for a user-space destination (@in_kernel == 0) it is read into
 * runtime->oss.buffer first and then copied out to @buf.  Without
 * plugins the request is handed to snd_pcm_oss_read3() directly.
 *
 * Returns the number of bytes read, 0 or a negative error code.
 */
static ssize_t snd_pcm_oss_read2(struct snd_pcm_substream *substream, char *buf, size_t bytes, int in_kernel)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        snd_pcm_sframes_t want, done;
#ifdef CONFIG_SND_PCM_OSS_PLUGINS
        /* remember the caller's destination; buf may be redirected below */
        char __user *final_dst = (char __force __user *)buf;
        if (runtime->oss.plugin_first) {
                struct snd_pcm_plugin_channel *channels;
                /* bytes per frame at the output of the last plugin */
                size_t frame_bytes = (runtime->oss.plugin_last->dst_width * runtime->oss.plugin_last->dst_format.channels) / 8;

                if (!in_kernel)
                        buf = runtime->oss.buffer;
                want = bytes / frame_bytes;
                done = snd_pcm_plug_client_channels_buf(substream, buf, want, &channels);
                if (done < 0)
                        return done;
                done = snd_pcm_plug_read_transfer(substream, channels, done);
                if (done <= 0)
                        return done;
                bytes = done * frame_bytes;
                if (!in_kernel && copy_to_user(final_dst, buf, bytes))
                        return -EFAULT;
                return bytes;
        }
#endif
        want = bytes_to_frames(runtime, bytes);
        done = snd_pcm_oss_read3(substream, buf, want, in_kernel);
        if (done <= 0)
                return done;
        bytes = frames_to_bytes(runtime, done);
        return bytes;
}

/*
 * Read up to @bytes bytes from a capture substream into the user buffer
 * @buf.
 *
 * Returns -ENXIO when the substream is mmapped.  oss.rw_ref is raised
 * for the duration of the call.  params_lock is taken and released once
 * per loop iteration.
 *
 * Requests shorter than one period, or arriving while previously read
 * data is still staged, are served from runtime->oss.buffer (refilled
 * with one period when empty).  Otherwise one full period at a time is
 * read straight into user space via snd_pcm_oss_read2().
 *
 * Returns the number of bytes delivered if any were delivered, otherwise
 * the last error code (or 0).
 */
static ssize_t snd_pcm_oss_read1(struct snd_pcm_substream *substream, char __user *buf, size_t bytes)
{
        size_t xfer = 0;
        ssize_t tmp = 0;
        struct snd_pcm_runtime *runtime = substream->runtime;

        if (atomic_read(&substream->mmap_count))
                return -ENXIO;

        atomic_inc(&runtime->oss.rw_ref);
        while (bytes > 0) {
                if (mutex_lock_interruptible(&runtime->oss.params_lock)) {
                        tmp = -ERESTARTSYS;
                        break;
                }
                /* apply pending params / prepare before touching the buffer */
                tmp = snd_pcm_oss_make_ready_locked(substream);
                if (tmp < 0)
                        goto err;
                if (bytes < runtime->oss.period_bytes || runtime->oss.buffer_used > 0) {
                        /* staging path: refill the period buffer when it is empty */
                        if (runtime->oss.buffer_used == 0) {
                                tmp = snd_pcm_oss_read2(substream, runtime->oss.buffer, runtime->oss.period_bytes, 1);
                                if (tmp <= 0)
                                        goto err;
                                runtime->oss.bytes += tmp;
                                /* period_ptr: bytes read into the buffer; buffer_used: bytes not yet delivered */
                                runtime->oss.period_ptr = tmp;
                                runtime->oss.buffer_used = tmp;
                        }
                        /* deliver at most what is still staged */
                        tmp = bytes;
                        if ((size_t) tmp > runtime->oss.buffer_used)
                                tmp = runtime->oss.buffer_used;
                        /* period_ptr - buffer_used is the offset of the first undelivered byte */
                        if (copy_to_user(buf, runtime->oss.buffer + (runtime->oss.period_ptr - runtime->oss.buffer_used), tmp)) {
                                tmp = -EFAULT;
                                goto err;
                        }
                        buf += tmp;
                        bytes -= tmp;
                        xfer += tmp;
                        runtime->oss.buffer_used -= tmp;
                } else {
                        /* direct path: one full period straight into user space */
                        tmp = snd_pcm_oss_read2(substream, (char __force *)buf,
                                                runtime->oss.period_bytes, 0);
                        if (tmp <= 0)
                                goto err;
                        runtime->oss.bytes += tmp;
                        buf += tmp;
                        bytes -= tmp;
                        xfer += tmp;
                }
 err:
                /* reached on both success and failure; tmp < 0 ends the loop */
                mutex_unlock(&runtime->oss.params_lock);
                if (tmp < 0)
                        break;
                if (signal_pending(current)) {
                        tmp = -ERESTARTSYS;
                        break;
                }
                tmp = 0;
        }
        atomic_dec(&runtime->oss.rw_ref);
        /* a partial transfer takes precedence over the error code */
        return xfer > 0 ? (snd_pcm_sframes_t)xfer : tmp;
}

/*
 * Reset every substream that is open on @pcm_oss_file.
 *
 * Each present substream gets SNDRV_PCM_IOCTL_DROP (the ioctl result is
 * not checked) and then, under oss.params_lock, is flagged as needing
 * prepare while its OSS buffer bookkeeping is zeroed.
 *
 * Always returns 0.
 */
static int snd_pcm_oss_reset(struct snd_pcm_oss_file *pcm_oss_file)
{
        int dir;

        for (dir = 0; dir < 2; dir++) {
                struct snd_pcm_substream *substream = pcm_oss_file->streams[dir];
                struct snd_pcm_runtime *runtime;

                if (substream == NULL)
                        continue;       /* this direction is not open */

                runtime = substream->runtime;
                snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL);

                mutex_lock(&runtime->oss.params_lock);
                runtime->oss.prepare = 1;
                runtime->oss.buffer_used = 0;
                runtime->oss.prev_hw_ptr_period = 0;
                runtime->oss.period_ptr = 0;
                mutex_unlock(&runtime->oss.params_lock);
        }

        return 0;
}

/*
 * Make the playback substream ready and ask it to start.
 *
 * Returns 0 when there is no playback substream or after the start
 * request has been issued, or the negative error reported by
 * snd_pcm_oss_make_ready().
 */
static int snd_pcm_oss_post(struct snd_pcm_oss_file *pcm_oss_file)
{
        struct snd_pcm_substream *playback =
                pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK];
        int ret;

        if (playback == NULL)
                return 0;

        ret = snd_pcm_oss_make_ready(playback);
        if (ret < 0)
                return ret;

        /*
         * Errors from the start action are deliberately ignored:
         * OSS applications do not know how to handle them.
         */
        snd_pcm_kernel_ioctl(playback, SNDRV_PCM_IOCTL_START, NULL);
        return 0;
}

/*
 * Push @size bytes from runtime->oss.buffer to the device, sleeping on
 * runtime->sleep while the write cannot make progress.
 *
 * The write is retried for as long as snd_pcm_oss_write2() returns 0 or
 * -EAGAIN and the stream state is SNDRV_PCM_STATE_RUNNING.
 *
 * Returns:
 *   0            the write succeeded (oss.buffer_used is then cleared),
 *                or the stream was found not to be running;
 *   -ERESTARTSYS a signal became pending while waiting;
 *   -EIO         a 10 second wait elapsed without a wakeup;
 *   other < 0    any other error returned by snd_pcm_oss_write2().
 */
static int snd_pcm_oss_sync1(struct snd_pcm_substream *substream, size_t size)
{
        struct snd_pcm_runtime *runtime;
        ssize_t result = 0;
        snd_pcm_state_t state;
        long res;
        wait_queue_entry_t wait;

        runtime = substream->runtime;
        init_waitqueue_entry(&wait, current);
        add_wait_queue(&runtime->sleep, &wait);
#ifdef OSS_DEBUG
        /* size is a size_t; %zu is the matching conversion on all arches */
        pcm_dbg(substream->pcm, "sync1: size = %zu\n", size);
#endif
        while (1) {
                result = snd_pcm_oss_write2(substream, runtime->oss.buffer, size, 1);
                if (result > 0) {
                        /* data accepted: the OSS staging buffer is empty now */
                        runtime->oss.buffer_used = 0;
                        result = 0;
                        break;
                }
                /* only "nothing written" and -EAGAIN are worth retrying */
                if (result != 0 && result != -EAGAIN)
                        break;
                result = 0;
                /* mark ourselves sleeping before sampling the stream state */
                set_current_state(TASK_INTERRUPTIBLE);
                scoped_guard(pcm_stream_lock_irq, substream)
                        state = runtime->state;
                if (state != SNDRV_PCM_STATE_RUNNING) {
                        /* not running: give up waiting, report success */
                        set_current_state(TASK_RUNNING);
                        break;
                }
                res = schedule_timeout(10 * HZ);
                if (signal_pending(current)) {
                        result = -ERESTARTSYS;
                        break;
                }
                if (res == 0) {
                        /* the full timeout elapsed */
                        pcm_err(substream->pcm,
                                "OSS sync error - DMA timeout\n");
                        result = -EIO;
                        break;
                }
        }
        remove_wait_queue(&runtime->sleep, &wait);
        return result;
}

/*
 * Synchronize both directions of an OSS file.
 *
 * Playback (if open):
 *   - unless the substream is mmapped, any partially filled OSS period
 *     is padded with silence and flushed through snd_pcm_oss_sync1(),
 *     and the remainder of the current ALSA period is filled by a write
 *     with a NULL data pointer; this part runs with oss.rw_ref raised
 *     and oss.params_lock held;
 *   - the stream is then drained with O_NONBLOCK temporarily cleared
 *     from substream->f_flags, and oss.prepare is set.
 * Capture (if open):
 *   - the stream is made ready, dropped, and its OSS buffer state reset.
 *
 * Returns 0 on success, -ERESTARTSYS if taking params_lock was
 * interrupted, or the first negative error from the helpers called.
 */
static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file)
{
        int err = 0;
        unsigned int saved_f_flags;
        struct snd_pcm_substream *substream;
        struct snd_pcm_runtime *runtime;
        snd_pcm_format_t format;
        unsigned long width;
        size_t size;

        substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK];
        if (substream != NULL) {
                runtime = substream->runtime;
                /* mmapped streams skip the OSS buffer flush and go straight to drain */
                if (atomic_read(&substream->mmap_count))
                        goto __direct;
                atomic_inc(&runtime->oss.rw_ref);
                if (mutex_lock_interruptible(&runtime->oss.params_lock)) {
                        atomic_dec(&runtime->oss.rw_ref);
                        return -ERESTARTSYS;
                }
                err = snd_pcm_oss_make_ready_locked(substream);
                if (err < 0)
                        goto unlock;
                format = snd_pcm_oss_format_from(runtime->oss.format);
                width = snd_pcm_format_physical_width(format);
                if (runtime->oss.buffer_used > 0) {
#ifdef OSS_DEBUG
                        pcm_dbg(substream->pcm, "sync: buffer_used\n");
#endif
                        /*
                         * Unused tail of the period buffer, converted from
                         * bytes to units of 'width' bits.
                         */
                        size = (8 * (runtime->oss.period_bytes - runtime->oss.buffer_used) + 7) / width;
                        snd_pcm_format_set_silence(format,
                                                   runtime->oss.buffer + runtime->oss.buffer_used,
                                                   size);
                        err = snd_pcm_oss_sync1(substream, runtime->oss.period_bytes);
                        if (err < 0)
                                goto unlock;
                } else if (runtime->oss.period_ptr > 0) {
#ifdef OSS_DEBUG
                        pcm_dbg(substream->pcm, "sync: period_ptr\n");
#endif
                        /* bytes still missing to complete the current OSS period */
                        size = runtime->oss.period_bytes - runtime->oss.period_ptr;
                        snd_pcm_format_set_silence(format,
                                                   runtime->oss.buffer,
                                                   size * 8 / width);
                        err = snd_pcm_oss_sync1(substream, size);
                        if (err < 0)
                                goto unlock;
                }
                /*
                 * The ALSA period might be a bit larger than the OSS one.
                 * Fill the remaining portion of the ALSA period with zeros.
                 */
                size = runtime->control->appl_ptr % runtime->period_size;
                if (size > 0) {
                        size = runtime->period_size - size;
                        if (runtime->access == SNDRV_PCM_ACCESS_RW_INTERLEAVED)
                                snd_pcm_lib_write(substream, NULL, size);
                        else if (runtime->access == SNDRV_PCM_ACCESS_RW_NONINTERLEAVED)
                                snd_pcm_lib_writev(substream, NULL, size);
                }
unlock:
                /* common exit of the locked section, reached on success and on error */
                mutex_unlock(&runtime->oss.params_lock);
                atomic_dec(&runtime->oss.rw_ref);
                if (err < 0)
                        return err;
                /*
                 * finish sync: drain the buffer
                 */
              __direct:
                /* drain with O_NONBLOCK cleared, then restore the caller's flags */
                saved_f_flags = substream->f_flags;
                substream->f_flags &= ~O_NONBLOCK;
                err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DRAIN, NULL);
                substream->f_flags = saved_f_flags;
                if (err < 0)
                        return err;
                mutex_lock(&runtime->oss.params_lock);
                runtime->oss.prepare = 1;
                mutex_unlock(&runtime->oss.params_lock);
        }

        substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE];
        if (substream != NULL) {
                err = snd_pcm_oss_make_ready(substream);
                if (err < 0)
                        return err;
                runtime = substream->runtime;
                err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DROP, NULL);
                if (err < 0)
                        return err;
                /* discard buffered capture data and request a new prepare */
                mutex_lock(&runtime->oss.params_lock);
                runtime->oss.buffer_used = 0;
                runtime->oss.prepare = 1;
                mutex_unlock(&runtime->oss.params_lock);
        }
        return 0;
}

/*
 * Request a new sample rate for every open substream.
 *
 * @rate is clamped to the range 1000..192000.  For each substream
 * (index 1 first, then index 0) the rate is stored under the params
 * lock; oss.params is set only when the stored rate actually changes.
 *
 * Returns the value of snd_pcm_oss_get_rate() afterwards, or the
 * negative error from lock_params().
 */
static int snd_pcm_oss_set_rate(struct snd_pcm_oss_file *pcm_oss_file, int rate)
{
        int dir;

        /* the limits do not depend on the stream, so clamp once up front */
        if (rate < 1000)
                rate = 1000;
        else if (rate > 192000)
                rate = 192000;

        for (dir = 1; dir >= 0; --dir) {
                struct snd_pcm_substream *substream = pcm_oss_file->streams[dir];
                struct snd_pcm_runtime *runtime;
                int ret;

                if (!substream)
                        continue;

                runtime = substream->runtime;
                ret = lock_params(runtime);
                if (ret < 0)
                        return ret;

                if (runtime->oss.rate != rate) {
                        runtime->oss.params = 1;
                        runtime->oss.rate = rate;
                }
                unlock_params(runtime);
        }

        return snd_pcm_oss_get_rate(pcm_oss_file);
}

/*
 * Return the OSS rate recorded in the runtime of the substream chosen by
 * snd_pcm_oss_get_active_substream(), or that helper's negative error.
 */
static int snd_pcm_oss_get_rate(struct snd_pcm_oss_file *pcm_oss_file)
{
        struct snd_pcm_substream *active;
        int ret;

        ret = snd_pcm_oss_get_active_substream(pcm_oss_file, &active);
        if (ret < 0)
                return ret;

        return active->runtime->oss.rate;
}

/*
 * Request a new channel count for every open substream.
 *
 * A request of 0 is treated as 1; more than 128 channels is rejected
 * with -EINVAL.  For each substream (index 1 first, then index 0) the
 * count is stored under the params lock, and oss.params is set only
 * when the stored value changes.
 *
 * Returns the value of snd_pcm_oss_get_channels() afterwards, or a
 * negative error.
 */
static int snd_pcm_oss_set_channels(struct snd_pcm_oss_file *pcm_oss_file, unsigned int channels)
{
        int dir;

        if (channels > 128)
                return -EINVAL;
        if (channels == 0)
                channels = 1;

        for (dir = 1; dir >= 0; --dir) {
                struct snd_pcm_substream *substream = pcm_oss_file->streams[dir];
                struct snd_pcm_runtime *runtime;
                int ret;

                if (!substream)
                        continue;

                runtime = substream->runtime;
                ret = lock_params(runtime);
                if (ret < 0)
                        return ret;

                if (runtime->oss.channels != channels) {
                        runtime->oss.params = 1;
                        runtime->oss.channels = channels;
                }
                unlock_params(runtime);
        }

        return snd_pcm_oss_get_channels(pcm_oss_file);
}

/*
 * Return the OSS channel count recorded in the runtime of the substream
 * chosen by snd_pcm_oss_get_active_substream(), or that helper's
 * negative error.
 */
static int snd_pcm_oss_get_channels(struct snd_pcm_oss_file *pcm_oss_file)
{
        struct snd_pcm_substream *active;
        int ret;

        ret = snd_pcm_oss_get_active_substream(pcm_oss_file, &active);
        if (ret < 0)
                return ret;

        return active->runtime->oss.channels;
}

/*
 * Return oss.period_bytes from the runtime of the substream chosen by
 * snd_pcm_oss_get_active_substream(), or that helper's negative error.
 */
static int snd_pcm_oss_get_block_size(struct snd_pcm_oss_file *pcm_oss_file)
{
        struct snd_pcm_substream *active;
        int ret;

        ret = snd_pcm_oss_get_active_substream(pcm_oss_file, &active);
        if (ret < 0)
                return ret;

        return active->runtime->oss.period_bytes;
}

static int snd_pcm_oss_get_formats(struct snd_pcm_oss_file *pcm_oss_file)
{
        struct snd_pcm_substream *substream;
        int err;
        int direct;
        struct snd_pcm_hw_params *params __free(kfree) = NULL;
        unsigned int formats = 0;
        const struct snd_mask *format_mask;
        int fmt;

        err = snd_pcm_oss_get_active_substream(pcm_oss_file, &substream);
        if (err < 0)
                return err;
        if (atomic_read(&substream->mmap_count))
                direct = 1;
        else
                direct = substream->oss.setup.direct;
        if (!direct)
                return AFMT_MU_LAW | AFMT_U8 |
                       AFMT_S16_LE | AFMT_S16_BE |
                       AFMT_S8 | AFMT_U16_LE |
                       AFMT_U16_BE |
                        AFMT_S32_LE | AFMT_S32_BE |
                        AFMT_S24_LE | AFMT_S24_BE |
                        AFMT_S24_PACKED;
        params = kmalloc(sizeof(*params), GFP_KERNEL);
        if (!params)
                return -ENOMEM;
        _snd_pcm_hw_params_any(params);
        err = snd_pcm_hw_refine(substream, params);
        if (err < 0)
                return err;
        format_mask = hw_param_mask_c(params, SNDRV_PCM_HW_PARAM_FORMAT);
        for (fmt = 0; fmt < 32; ++fmt) {
                if (snd_mask_test(format_mask, fmt)) {
                        int f = snd_pcm_oss_format_to((__force snd_pcm_format_t)fmt);
                        if (f >= 0)
                                formats |= f;
                }
        }

        return formats;
}

/*
 * Request a new OSS sample format, or just query the current one.
 *
 * With AFMT_QUERY nothing is changed.  Otherwise a format that is not
 * in the mask from snd_pcm_oss_get_formats() is replaced by AFMT_U8,
 * and the result is stored for each open substream (index 1 first,
 * then index 0) under the params lock; oss.params is set only when the
 * stored value changes.
 *
 * Returns the value of snd_pcm_oss_get_format() afterwards, or a
 * negative error.
 */
static int snd_pcm_oss_set_format(struct snd_pcm_oss_file *pcm_oss_file, int format)
{
        int supported;
        int dir;

        if (format == AFMT_QUERY)
                return snd_pcm_oss_get_format(pcm_oss_file);

        supported = snd_pcm_oss_get_formats(pcm_oss_file);
        if (supported < 0)
                return supported;
        if ((supported & format) == 0)
                format = AFMT_U8;

        for (dir = 1; dir >= 0; --dir) {
                struct snd_pcm_substream *substream = pcm_oss_file->streams[dir];
                struct snd_pcm_runtime *runtime;
                int ret;

                if (!substream)
                        continue;

                runtime = substream->runtime;
                ret = lock_params(runtime);
                if (ret < 0)
                        return ret;

                if (runtime->oss.format != format) {
                        runtime->oss.params = 1;
                        runtime->oss.format = format;
                }
                unlock_params(runtime);
        }

        return snd_pcm_oss_get_format(pcm_oss_file);
}

/*
 * Return the OSS format recorded in the runtime of the substream chosen
 * by snd_pcm_oss_get_active_substream(), or that helper's negative
 * error.
 */
static int snd_pcm_oss_get_format(struct snd_pcm_oss_file *pcm_oss_file)
{
        struct snd_pcm_substream *active;
        int ret;

        ret = snd_pcm_oss_get_active_substream(pcm_oss_file, &active);
        if (ret < 0)
                return ret;

        return active->runtime->oss.format;
}

/*
 * Query or set the subdivision of one substream.
 *
 * @subdivide == 0 is a query: the stored subdivision is returned, with
 * an unset (zero) value reported as 1.
 *
 * Any other value is a request.  It fails with -EINVAL when a
 * subdivision or fragment shift has already been stored, or when the
 * value is not one of 1, 2, 4, 8, 16.  On success the value is stored,
 * oss.params is set, and the value is returned.
 */
static int snd_pcm_oss_set_subdivide1(struct snd_pcm_substream *substream, int subdivide)
{
        struct snd_pcm_runtime *runtime = substream->runtime;

        if (subdivide == 0) {
                int cur = runtime->oss.subdivision;

                if (cur == 0)
                        cur = 1;
                return cur;
        }

        if (runtime->oss.subdivision || runtime->oss.fragshift)
                return -EINVAL;

        switch (subdivide) {
        case 1:
        case 2:
        case 4:
        case 8:
        case 16:
                break;
        default:
                return -EINVAL;
        }

        runtime->oss.subdivision = subdivide;
        runtime->oss.params = 1;
        return subdivide;
}

/*
 * Apply snd_pcm_oss_set_subdivide1() to every open substream (index 1
 * first, then index 0), holding the params lock around each call.
 *
 * Returns the result for the last substream processed, the first
 * negative error encountered, or -EINVAL when no substream is open.
 */
static int snd_pcm_oss_set_subdivide(struct snd_pcm_oss_file *pcm_oss_file, int subdivide)
{
        int ret = -EINVAL;
        int dir = 2;

        while (dir-- > 0) {
                struct snd_pcm_substream *substream = pcm_oss_file->streams[dir];
                struct snd_pcm_runtime *runtime;

                if (!substream)
                        continue;

                runtime = substream->runtime;
                ret = lock_params(runtime);
                if (ret < 0)
                        return ret;

                ret = snd_pcm_oss_set_subdivide1(substream, subdivide);
                unlock_params(runtime);
                if (ret < 0)
                        return ret;
        }

        return ret;
}

/*
 * Store a fragment request for one substream.
 *
 * @val carries the fragment shift in its low 16 bits and the maximum
 * number of fragments in bits 16..31.  The request fails with -EINVAL
 * when a subdivision or fragment shift has already been stored, or when
 * the shift is 25 or more.  Otherwise the shift is raised to at least 4
 * and the fragment count to at least 2 before both are stored and
 * oss.params is set.
 *
 * Returns 0 on success.
 */
static int snd_pcm_oss_set_fragment1(struct snd_pcm_substream *substream, unsigned int val)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        unsigned int shift = val & 0xffff;
        unsigned int frags = (val >> 16) & 0xffff;

        if (runtime->oss.subdivision || runtime->oss.fragshift)
                return -EINVAL;
        if (shift >= 25) /* should be large enough */
                return -EINVAL;

        if (shift < 4)          /* < 16 */
                shift = 4;
        if (frags < 2)
                frags = 2;

        runtime->oss.fragshift = shift;
        runtime->oss.maxfrags = frags;
        runtime->oss.params = 1;
        return 0;
}

/*
 * Apply snd_pcm_oss_set_fragment1() to every open substream (index 1
 * first, then index 0), holding the params lock around each call.
 *
 * Returns the result for the last substream processed, the first
 * negative error encountered, or -EINVAL when no substream is open.
 */
static int snd_pcm_oss_set_fragment(struct snd_pcm_oss_file *pcm_oss_file, unsigned int val)
{
        int ret = -EINVAL;
        int dir = 2;

        while (dir-- > 0) {
                struct snd_pcm_substream *substream = pcm_oss_file->streams[dir];
                struct snd_pcm_runtime *runtime;

                if (!substream)
                        continue;

                runtime = substream->runtime;
                ret = lock_params(runtime);
                if (ret < 0)
                        return ret;

                ret = snd_pcm_oss_set_fragment1(substream, val);
                unlock_params(runtime);
                if (ret < 0)
                        return ret;
        }

        return ret;
}

static int snd_pcm_oss_nonblock(struct file * file)
{
        spin_lock(&file->f_lock);
        file->f_flags |= O_NONBLOCK;
        spin_unlock(&file->f_lock);
        return 0;
}

/*
 * Adjust the capability mask @res for one direction of an OSS file.
 *
 * A missing substream removes DSP_CAP_DUPLEX.  When DSP_CAP_MULTI is
 * defined, a playback substream whose stream has more than one
 * substream adds DSP_CAP_MULTI.
 *
 * Returns the adjusted mask.
 */
static int snd_pcm_oss_get_caps1(struct snd_pcm_substream *substream, int res)
{
        if (!substream)
                return res & ~DSP_CAP_DUPLEX;

#ifdef DSP_CAP_MULTI
        if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK &&
            substream->pstr->substream_count > 1)
                res |= DSP_CAP_MULTI;
#endif
        /*
         * DSP_CAP_REALTIME is always left set: all ALSA drivers can
         * return the actual pointer in the ring buffer.  The check below
         * is therefore compiled out.
         */
#if defined(DSP_CAP_REALTIME) && 0
        {
                struct snd_pcm_runtime *runtime = substream->runtime;
                if (runtime->info & (SNDRV_PCM_INFO_BLOCK_TRANSFER|SNDRV_PCM_INFO_BATCH))
                        res &= ~DSP_CAP_REALTIME;
        }
#endif
        return res;
}

/*
 * Compute the DSP capability mask for an OSS file.
 *
 * Starts from TRIGGER | MMAP | DUPLEX | REALTIME, lets
 * snd_pcm_oss_get_caps1() adjust it for each of the two stream slots,
 * and ORs in the revision value 0x0001.
 */
static int snd_pcm_oss_get_caps(struct snd_pcm_oss_file *pcm_oss_file)
{
        int caps = DSP_CAP_TRIGGER | DSP_CAP_MMAP | DSP_CAP_DUPLEX | DSP_CAP_REALTIME;
        int dir;

        for (dir = 0; dir < 2; dir++)
                caps = snd_pcm_oss_get_caps1(pcm_oss_file->streams[dir], caps);

        /* revision - same as SB AWE 64 */
        return caps | 0x0001;
}

/*
 * Move the application pointer one full buffer ahead of @hw_ptr,
 * wrapped at runtime->boundary, and store it in
 * runtime->control->appl_ptr.
 */
static void snd_pcm_oss_simulate_fill(struct snd_pcm_substream *substream,
                                      snd_pcm_uframes_t hw_ptr)
{
        struct snd_pcm_runtime *runtime = substream->runtime;

        runtime->control->appl_ptr =
                (hw_ptr + runtime->buffer_size) % runtime->boundary;
}

/*
 * Enable or disable playback and capture according to the
 * PCM_ENABLE_OUTPUT / PCM_ENABLE_INPUT bits of @trigger.
 *
 * Both open substreams are made ready first.  Then, per direction and
 * under oss.params_lock:
 *   - bit set and oss.trigger clear: oss.trigger is set,
 *     start_threshold becomes 1 and a START ioctl is queued (for an
 *     mmapped playback stream the buffer is first marked full via
 *     snd_pcm_oss_simulate_fill());
 *   - bit clear and oss.trigger set: oss.trigger is cleared,
 *     start_threshold becomes runtime->boundary, oss.prepare is set and
 *     a DROP ioctl is queued;
 *   - otherwise nothing changes.
 * The queued ioctl is issued after the lock has been dropped.
 *
 * Returns 0 on success, -ERESTARTSYS if taking the lock was
 * interrupted, or a negative error from make_ready / the ioctl.
 */
static int snd_pcm_oss_set_trigger(struct snd_pcm_oss_file *pcm_oss_file, int trigger)
{
        struct snd_pcm_runtime *runtime;
        struct snd_pcm_substream *psubstream = NULL, *csubstream = NULL;
        int err, cmd;

#ifdef OSS_DEBUG
        pr_debug("pcm_oss: trigger = 0x%x\n", trigger);
#endif
        
        psubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK];
        csubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE];

        if (psubstream) {
                err = snd_pcm_oss_make_ready(psubstream);
                if (err < 0)
                        return err;
        }
        if (csubstream) {
                err = snd_pcm_oss_make_ready(csubstream);
                if (err < 0)
                        return err;
        }
              if (psubstream) {
                      runtime = psubstream->runtime;
                /* cmd stays 0 when the requested state is already in effect */
                cmd = 0;
                if (mutex_lock_interruptible(&runtime->oss.params_lock))
                        return -ERESTARTSYS;
                if (trigger & PCM_ENABLE_OUTPUT) {
                        if (runtime->oss.trigger)
                                goto _skip1;
                        if (atomic_read(&psubstream->mmap_count))
                                snd_pcm_oss_simulate_fill(psubstream,
                                                get_hw_ptr_period(runtime));
                        runtime->oss.trigger = 1;
                        runtime->start_threshold = 1;
                        cmd = SNDRV_PCM_IOCTL_START;
                } else {
                        if (!runtime->oss.trigger)
                                goto _skip1;
                        runtime->oss.trigger = 0;
                        runtime->start_threshold = runtime->boundary;
                        cmd = SNDRV_PCM_IOCTL_DROP;
                        runtime->oss.prepare = 1;
                }
 _skip1:
                mutex_unlock(&runtime->oss.params_lock);
                /* the ioctl is issued only after params_lock is released */
                if (cmd) {
                        err = snd_pcm_kernel_ioctl(psubstream, cmd, NULL);
                        if (err < 0)
                                return err;
                }
        }
        if (csubstream) {
                      runtime = csubstream->runtime;
                /* same state machine as playback, without the mmap fill step */
                cmd = 0;
                if (mutex_lock_interruptible(&runtime->oss.params_lock))
                        return -ERESTARTSYS;
                if (trigger & PCM_ENABLE_INPUT) {
                        if (runtime->oss.trigger)
                                goto _skip2;
                        runtime->oss.trigger = 1;
                        runtime->start_threshold = 1;
                        cmd = SNDRV_PCM_IOCTL_START;
                } else {
                        if (!runtime->oss.trigger)
                                goto _skip2;
                        runtime->oss.trigger = 0;
                        runtime->start_threshold = runtime->boundary;
                        cmd = SNDRV_PCM_IOCTL_DROP;
                        runtime->oss.prepare = 1;
                }
 _skip2:
                mutex_unlock(&runtime->oss.params_lock);
                if (cmd) {
                        err = snd_pcm_kernel_ioctl(csubstream, cmd, NULL);
                        if (err < 0)
                                return err;
                }
        }
        return 0;
}

/*
 * Report which directions currently have oss.trigger set.
 *
 * Returns a mask of PCM_ENABLE_OUTPUT (playback) and PCM_ENABLE_INPUT
 * (capture); a direction with no substream or no runtime contributes
 * nothing.
 */
static int snd_pcm_oss_get_trigger(struct snd_pcm_oss_file *pcm_oss_file)
{
        struct snd_pcm_substream *play =
                pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK];
        struct snd_pcm_substream *capt =
                pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE];
        int mask = 0;

        if (play && play->runtime && play->runtime->oss.trigger)
                mask |= PCM_ENABLE_OUTPUT;
        if (capt && capt->runtime && capt->runtime->oss.trigger)
                mask |= PCM_ENABLE_INPUT;

        return mask;
}

/*
 * Report the playback delay, converted with snd_pcm_oss_bytes().
 *
 * Returns -EINVAL when there is no playback substream, 0 while
 * oss.params or oss.prepare is still pending, and otherwise the
 * converted result of SNDRV_PCM_IOCTL_DELAY.  An -EPIPE from that ioctl
 * is reported as a delay of 0; any other ioctl or make_ready error is
 * returned as is.
 */
static int snd_pcm_oss_get_odelay(struct snd_pcm_oss_file *pcm_oss_file)
{
        struct snd_pcm_substream *playback =
                pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK];
        struct snd_pcm_runtime *runtime;
        snd_pcm_sframes_t frames;
        int ret;

        if (!playback)
                return -EINVAL;

        ret = snd_pcm_oss_make_ready(playback);
        if (ret < 0)
                return ret;

        runtime = playback->runtime;
        if (runtime->oss.params || runtime->oss.prepare)
                return 0;

        ret = snd_pcm_kernel_ioctl(playback, SNDRV_PCM_IOCTL_DELAY, &frames);
        if (ret < 0) {
                if (ret != -EPIPE)
                        return ret;
                frames = 0;     /* hack for broken OSS applications */
        }

        return snd_pcm_oss_bytes(playback, frames);
}

/*
 * Fill a struct count_info for direction @stream and copy it to the
 * user buffer @_info.
 *
 * While oss.params or oss.prepare is pending, an all-zero structure is
 * returned.  Otherwise:
 *   - info.ptr is the hardware pointer within the buffer, in bytes as
 *     converted by snd_pcm_oss_bytes();
 *   - for mmapped streams, info.blocks counts whole periods since the
 *     previous call (oss.prev_hw_ptr_period is updated) and info.bytes
 *     is the converted hardware pointer;
 *   - for other streams, blocks and bytes are derived from the current
 *     delay, oss.buffer_used and oss.bytes.
 * info.bytes is always masked with INT_MAX.
 *
 * Returns 0 on success, -EFAULT for a NULL or unwritable @_info,
 * -EINVAL when the direction is not open, or a negative error from the
 * helpers called.
 */
static int snd_pcm_oss_get_ptr(struct snd_pcm_oss_file *pcm_oss_file, int stream, struct count_info __user * _info)
{        
        struct snd_pcm_substream *substream;
        struct snd_pcm_runtime *runtime;
        snd_pcm_sframes_t delay;
        int fixup;
        struct count_info info;
        int err;

        if (_info == NULL)
                return -EFAULT;
        substream = pcm_oss_file->streams[stream];
        if (substream == NULL)
                return -EINVAL;
        err = snd_pcm_oss_make_ready(substream);
        if (err < 0)
                return err;
        runtime = substream->runtime;
        /* parameters or prepare still pending: report an all-zero position */
        if (runtime->oss.params || runtime->oss.prepare) {
                memset(&info, 0, sizeof(info));
                if (copy_to_user(_info, &info, sizeof(info)))
                        return -EFAULT;
                return 0;
        }
        if (stream == SNDRV_PCM_STREAM_PLAYBACK) {
                err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DELAY, &delay);
                /* -EPIPE, -ESTRPIPE and a negative delay are all treated as "no delay" */
                if (err == -EPIPE || err == -ESTRPIPE || (! err && delay < 0)) {
                        err = 0;
                        delay = 0;
                        fixup = 0;
                } else {
                        fixup = runtime->oss.buffer_used;
                }
        } else {
                err = snd_pcm_oss_capture_position_fixup(substream, &delay);
                fixup = -runtime->oss.buffer_used;
        }
        if (err < 0)
                return err;
        info.ptr = snd_pcm_oss_bytes(substream, runtime->status->hw_ptr % runtime->buffer_size);
        if (atomic_read(&substream->mmap_count)) {
                snd_pcm_sframes_t n;
                /* 'delay' is reused here to hold the current hw pointer period position */
                delay = get_hw_ptr_period(runtime);
                n = delay - runtime->oss.prev_hw_ptr_period;
                /* the pointer wrapped at the boundary since the last call */
                if (n < 0)
                        n += runtime->boundary;
                info.blocks = n / runtime->period_size;
                runtime->oss.prev_hw_ptr_period = delay;
                if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
                        snd_pcm_oss_simulate_fill(substream, delay);
                info.bytes = snd_pcm_oss_bytes(substream, runtime->status->hw_ptr) & INT_MAX;
        } else {
                delay = snd_pcm_oss_bytes(substream, delay);
                if (stream == SNDRV_PCM_STREAM_PLAYBACK) {
                        /* the buggyptr setup option selects the alternative block count */
                        if (substream->oss.setup.buggyptr)
                                info.blocks = (runtime->oss.buffer_bytes - delay - fixup) / runtime->oss.period_bytes;
                        else
                                info.blocks = (delay + fixup) / runtime->oss.period_bytes;
                        info.bytes = (runtime->oss.bytes - delay) & INT_MAX;
                } else {
                        delay += fixup;
                        info.blocks = delay / runtime->oss.period_bytes;
                        info.bytes = (runtime->oss.bytes + delay) & INT_MAX;
                }
        }
        if (copy_to_user(_info, &info, sizeof(info)))
                return -EFAULT;
        return 0;
}

/*
 * Fill a struct audio_buf_info for direction @stream and copy it to the
 * user buffer @_info.
 *
 * Pending parameter changes (oss.params) are applied first.  fragsize
 * is oss.period_bytes and fragstotal is runtime->periods.  While
 * oss.prepare is pending, playback reports the whole OSS buffer as
 * available and capture reports nothing.  Otherwise the available byte
 * count comes from the current delay (playback) or from
 * snd_pcm_oss_capture_position_fixup() (capture), adjusted by
 * oss.buffer_used.
 *
 * Returns 0 on success, -EFAULT for a NULL or unwritable @_info,
 * -EINVAL when the direction is not open, or a negative error from the
 * helpers called.
 */
static int snd_pcm_oss_get_space(struct snd_pcm_oss_file *pcm_oss_file, int stream, struct audio_buf_info __user *_info)
{
        struct snd_pcm_substream *substream;
        struct snd_pcm_runtime *runtime;
        snd_pcm_sframes_t avail;
        int fixup;
        struct audio_buf_info info;
        int err;

        if (_info == NULL)
                return -EFAULT;
        substream = pcm_oss_file->streams[stream];
        if (substream == NULL)
                return -EINVAL;
        runtime = substream->runtime;

        /* apply any pending parameter change before reading sizes */
        if (runtime->oss.params) {
                err = snd_pcm_oss_change_params(substream, false);
                if (err < 0)
                        return err;
        }

        info.fragsize = runtime->oss.period_bytes;
        info.fragstotal = runtime->periods;
        if (runtime->oss.prepare) {
                /* prepare still pending: playback is all free, capture has nothing */
                if (stream == SNDRV_PCM_STREAM_PLAYBACK) {
                        info.bytes = runtime->oss.period_bytes * runtime->oss.periods;
                        info.fragments = runtime->oss.periods;
                } else {
                        info.bytes = 0;
                        info.fragments = 0;
                }
        } else {
                if (stream == SNDRV_PCM_STREAM_PLAYBACK) {
                        err = snd_pcm_kernel_ioctl(substream, SNDRV_PCM_IOCTL_DELAY, &avail);
                        /* -EPIPE, -ESTRPIPE and a negative delay all mean the whole buffer is free */
                        if (err == -EPIPE || err == -ESTRPIPE || (! err && avail < 0)) {
                                avail = runtime->buffer_size;
                                err = 0;
                                fixup = 0;
                        } else {
                                /* free space = buffer size minus what is still queued */
                                avail = runtime->buffer_size - avail;
                                fixup = -runtime->oss.buffer_used;
                        }
                } else {
                        err = snd_pcm_oss_capture_position_fixup(substream, &avail);
                        fixup = runtime->oss.buffer_used;
                }
                if (err < 0)
                        return err;
                info.bytes = snd_pcm_oss_bytes(substream, avail) + fixup;
                info.fragments = info.bytes / runtime->oss.period_bytes;
        }

#ifdef OSS_DEBUG
        pcm_dbg(substream->pcm,
                "pcm_oss: space: bytes = %i, fragments = %i, fragstotal = %i, fragsize = %i\n",
                info.bytes, info.fragments, info.fragstotal, info.fragsize);
#endif
        if (copy_to_user(_info, &info, sizeof(info)))
                return -EFAULT;
        return 0;
}

/*
 * Unimplemented buffer-mapping query: none of the arguments are used
 * and the call always fails with -EINVAL.
 */
static int snd_pcm_oss_get_mapbuf(struct snd_pcm_oss_file *pcm_oss_file, int stream, struct buffmem_desc __user * _info)
{
        // it won't be probably implemented
        // pr_debug("TODO: snd_pcm_oss_get_mapbuf\n");
        return -EINVAL;
}

/*
 * Return a pointer to the character following the last '/' in @path,
 * or NULL when @path contains no '/'.
 *
 * A path ending in '/' yields a pointer to its terminating NUL.
 */
static const char *strip_task_path(const char *path)
{
        const char *last_slash = strrchr(path, '/');

        if (last_slash == NULL)
                return NULL;
        return last_slash + 1;
}

/*
 * Look up the OSS setup entry matching @task_name for @stream of @pcm.
 *
 * The setup list is searched for an exact name match.  If none is
 * found, the leading path component is removed with strip_task_path()
 * and the search is repeated, until a match is found or no '/' remains.
 * On a match the entry is copied into @rsetup; otherwise @rsetup is
 * left untouched.  The whole lookup runs under oss.setup_mutex.
 */
static void snd_pcm_oss_look_for_setup(struct snd_pcm *pcm, int stream,
                                      const char *task_name,
                                      struct snd_pcm_oss_setup *rsetup)
{
        struct snd_pcm_oss_setup *entry;

        guard(mutex)(&pcm->streams[stream].oss.setup_mutex);
        do {
                entry = pcm->streams[stream].oss.setup_list;
                while (entry) {
                        if (strcmp(entry->task_name, task_name) == 0) {
                                *rsetup = *entry;
                                return;
                        }
                        entry = entry->next;
                }
                task_name = strip_task_path(task_name);
        } while (task_name != NULL);
}

/*
 * Release the OSS buffers of @substream via
 * snd_pcm_oss_release_buffers() and clear its oss.oss flag.
 */
static void snd_pcm_oss_release_substream(struct snd_pcm_substream *substream)
{
        snd_pcm_oss_release_buffers(substream);
        substream->oss.oss = 0;
}

/*
 * Initialise the OSS state of a newly opened substream.
 *
 * Marks the substream as OSS, copies @setup into it and applies the
 * setup's nonblock/block preference to f_flags.  The runtime starts
 * with params pending, trigger enabled, 8000 Hz, one channel and no
 * fragment or subdivision request.  The initial format depends on the
 * OSS device encoded in @minor: U8 for PCM_8, S16_LE for PCM_16,
 * mu-law otherwise.  Also installs snd_pcm_oss_release_substream() as
 * the pcm_release hook and initialises params_lock and rw_ref.
 */
static void snd_pcm_oss_init_substream(struct snd_pcm_substream *substream,
                                       struct snd_pcm_oss_setup *setup,
                                       int minor)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        int initial_format;

        substream->oss.oss = 1;
        substream->oss.setup = *setup;
        if (setup->nonblock)
                substream->f_flags |= O_NONBLOCK;
        else if (setup->block)
                substream->f_flags &= ~O_NONBLOCK;

        switch (SNDRV_MINOR_OSS_DEVICE(minor)) {
        case SNDRV_MINOR_OSS_PCM_8:
                initial_format = AFMT_U8;
                break;
        case SNDRV_MINOR_OSS_PCM_16:
                initial_format = AFMT_S16_LE;
                break;
        default:
                initial_format = AFMT_MU_LAW;
                break;
        }

        runtime->oss.params = 1;
        runtime->oss.trigger = 1;
        runtime->oss.rate = 8000;
        mutex_init(&runtime->oss.params_lock);
        runtime->oss.format = initial_format;
        runtime->oss.channels = 1;
        runtime->oss.fragshift = 0;
        runtime->oss.maxfrags = 0;
        runtime->oss.subdivision = 0;
        substream->pcm_release = snd_pcm_oss_release_substream;
        atomic_set(&runtime->oss.rw_ref, 0);
}

static int snd_pcm_oss_release_file(struct snd_pcm_oss_file *pcm_oss_file)
{
        int cidx;
        if (!pcm_oss_file)
                return 0;
        for (cidx = 0; cidx < 2; ++cidx) {
                struct snd_pcm_substream *substream = pcm_oss_file->streams[cidx];
                if (substream)
                        snd_pcm_release_substream(substream);
        }
        kfree(pcm_oss_file);
        return 0;
}

/*
 * Allocate an OSS file object and open the substreams requested by the
 * file mode.
 *
 * @file:          file being opened; private_data is set on success
 * @pcm:           PCM device to open substreams on
 * @rpcm_oss_file: optional out-pointer; NULL on failure, the new object
 *                 on success
 * @minor:         OSS minor number, forwarded to the substream init
 * @setup:         two-element array of per-direction setups
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, the error from
 * snd_pcm_open_substream() if a substream cannot be opened, or -EINVAL
 * when no substream at all ended up being opened.
 */
static int snd_pcm_oss_open_file(struct file *file,
                                 struct snd_pcm *pcm,
                                 struct snd_pcm_oss_file **rpcm_oss_file,
                                 int minor,
                                 struct snd_pcm_oss_setup *setup)
{
        struct snd_pcm_oss_file *ofile;
        struct snd_pcm_substream *substream;
        fmode_t mode = file->f_mode;
        int dir, ret;

        if (rpcm_oss_file)
                *rpcm_oss_file = NULL;

        ofile = kzalloc(sizeof(*ofile), GFP_KERNEL);
        if (!ofile)
                return -ENOMEM;

        /* a read/write open on a half-duplex device is reduced to write only */
        if ((pcm->info_flags & SNDRV_PCM_INFO_HALF_DUPLEX) &&
            (mode & (FMODE_WRITE|FMODE_READ)) == (FMODE_WRITE|FMODE_READ))
                mode = FMODE_WRITE;

        file->f_flags &= ~O_APPEND;
        for (dir = 0; dir < 2; dir++) {
                /* file-mode bit that must be present for this direction */
                fmode_t wanted = (dir == SNDRV_PCM_STREAM_PLAYBACK) ?
                                        FMODE_WRITE : FMODE_READ;

                /*
                 * Skip the direction when it is disabled by the setup, when
                 * the PCM has no matching substream, or when the file mode
                 * does not ask for it.
                 */
                if (setup[dir].disable ||
                    !pcm->streams[dir].substream_count ||
                    !(mode & wanted))
                        continue;

                ret = snd_pcm_open_substream(pcm, dir, file, &substream);
                if (ret < 0)
                        goto __error;

                ofile->streams[dir] = substream;
                snd_pcm_oss_init_substream(substream, &setup[dir], minor);
        }

        /* nothing could be opened at all */
        if (!ofile->streams[0] && !ofile->streams[1]) {
                ret = -EINVAL;
                goto __error;
        }

        file->private_data = ofile;
        if (rpcm_oss_file)
                *rpcm_oss_file = ofile;
        return 0;

      __error:
        snd_pcm_oss_release_file(ofile);
        return ret;
}


/*
 * Copy the comm field of @task into @name as a NUL-terminated string.
 *
 * At most sizeof(task->comm) bytes are copied, and never more than
 * @size - 1, so the terminator always fits.
 *
 * Returns 0 on success or -EINVAL (after snd_BUG_ON) when @task or @name
 * is NULL or @size is smaller than 2.
 */
static int snd_task_name(struct task_struct *task, char *name, size_t size)
{
        unsigned int pos = 0;

        if (snd_BUG_ON(!task || !name || size < 2))
                return -EINVAL;

        while (pos < sizeof(task->comm) && pos + 1 < size) {
                name[pos] = task->comm[pos];
                pos++;
        }
        name[pos] = '\0';
        return 0;
}

/*
 * open() handler for the OSS PCM device files.
 *
 * Looks up the PCM for the opened minor, registers the file with the card,
 * takes a reference on the card's module, collects the per-task setups for
 * the requested directions and then tries to open the substreams under
 * pcm->open_mutex.
 *
 * While snd_pcm_oss_open_file() reports -EAGAIN the caller is put to sleep
 * on pcm->open_wait (interruptibly) and the open is retried, unless the
 * open is non-blocking (O_NONBLOCK on the file or the nonblock_open
 * setting), in which case -EBUSY is returned instead.
 *
 * Returns 0 (or the non-negative result of snd_pcm_oss_open_file()) on
 * success; otherwise a negative errno: the nonseekable_open() error,
 * -ENODEV (no PCM for the minor, or card shut down while waiting),
 * the snd_card_file_add() error, -EFAULT (module reference or task name
 * unavailable), -EBUSY, -ERESTARTSYS (signal while waiting) or the error
 * from snd_pcm_oss_open_file().
 */
static int snd_pcm_oss_open(struct inode *inode, struct file *file)
{
        int err;
        char task_name[32];
        struct snd_pcm *pcm;
        struct snd_pcm_oss_file *pcm_oss_file;
        struct snd_pcm_oss_setup setup[2];
        int nonblock;
        wait_queue_entry_t wait;

        err = nonseekable_open(inode, file);
        if (err < 0)
                return err;

        pcm = snd_lookup_oss_minor_data(iminor(inode),
                                        SNDRV_OSS_DEVICE_TYPE_PCM);
        if (pcm == NULL) {
                err = -ENODEV;
                goto __error1;
        }
        err = snd_card_file_add(pcm->card, file);
        if (err < 0)
                goto __error1;
        if (!try_module_get(pcm->card->module)) {
                err = -EFAULT;
                goto __error2;
        }
        if (snd_task_name(current, task_name, sizeof(task_name)) < 0) {
                err = -EFAULT;
                goto __error;
        }
        /* start from all-zero setups; only requested directions are looked up */
        memset(setup, 0, sizeof(setup));
        if (file->f_mode & FMODE_WRITE)
                snd_pcm_oss_look_for_setup(pcm, SNDRV_PCM_STREAM_PLAYBACK,
                                           task_name, &setup[0]);
        if (file->f_mode & FMODE_READ)
                snd_pcm_oss_look_for_setup(pcm, SNDRV_PCM_STREAM_CAPTURE,
                                           task_name, &setup[1]);

        /* O_NONBLOCK on the file wins; otherwise fall back to nonblock_open */
        nonblock = !!(file->f_flags & O_NONBLOCK);
        if (!nonblock)
                nonblock = nonblock_open;

        /* queue ourselves before trying, so a wake-up between tries is not lost */
        init_waitqueue_entry(&wait, current);
        add_wait_queue(&pcm->open_wait, &wait);
        mutex_lock(&pcm->open_mutex);
        while (1) {
                err = snd_pcm_oss_open_file(file, pcm, &pcm_oss_file,
                                            iminor(inode), setup);
                if (err >= 0)
                        break;
                if (err == -EAGAIN) {
                        if (nonblock) {
                                err = -EBUSY;
                                break;
                        }
                } else
                        break;
                /* -EAGAIN on a blocking open: sleep with the mutex dropped, then retry */
                set_current_state(TASK_INTERRUPTIBLE);
                mutex_unlock(&pcm->open_mutex);
                schedule();
                mutex_lock(&pcm->open_mutex);
                if (pcm->card->shutdown) {
                        err = -ENODEV;
                        break;
                }
                if (signal_pending(current)) {
                        err = -ERESTARTSYS;
                        break;
                }
        }
        remove_wait_queue(&pcm->open_wait, &wait);
        mutex_unlock(&pcm->open_mutex);
        if (err < 0)
                goto __error;
        /*
         * NOTE(review): presumably balances a card reference taken by
         * snd_lookup_oss_minor_data() - confirm against that helper.
         */
        snd_card_unref(pcm->card);
        return err;

        /* error unwind: each label undoes one more acquisition step */
      __error:
             module_put(pcm->card->module);
      __error2:
              snd_card_file_remove(pcm->card, file);
      __error1:
        if (pcm)
                snd_card_unref(pcm->card);
        return err;
}

/*
 * release() handler for the OSS PCM device files.
 *
 * Syncs the streams (unless the card is already shut down), releases the
 * OSS file object under pcm->open_mutex, wakes up tasks waiting in
 * snd_pcm_oss_open(), and drops the module reference and the card file
 * registration taken at open time.
 *
 * Returns 0, or -ENXIO (after snd_BUG_ON) if the file object carries no
 * substream at all.
 */
static int snd_pcm_oss_release(struct inode *inode, struct file *file)
{
        struct snd_pcm *pcm;
        struct snd_pcm_substream *substream;
        struct snd_pcm_oss_file *pcm_oss_file;

        pcm_oss_file = file->private_data;
        /* either substream will do; it is only used to reach the owning pcm */
        substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK];
        if (substream == NULL)
                substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE];
        if (snd_BUG_ON(!substream))
                return -ENXIO;
        pcm = substream->pcm;
        if (!pcm->card->shutdown)
                snd_pcm_oss_sync(pcm_oss_file);
        mutex_lock(&pcm->open_mutex);
        snd_pcm_oss_release_file(pcm_oss_file);
        mutex_unlock(&pcm->open_mutex);
        /* let blocked openers retry now that the substreams are free */
        wake_up(&pcm->open_wait);
        module_put(pcm->card->module);
        snd_card_file_remove(pcm->card, file);
        return 0;
}

/*
 * ioctl() dispatcher for the OSS PCM device files.
 *
 * @file: opened OSS PCM file; private_data holds the snd_pcm_oss_file
 * @cmd:  OSS ioctl command
 * @arg:  user-space argument; treated as an int pointer for most
 *        commands and as a structure pointer for the GETxSPACE, GETxPTR
 *        and MAPxBUF commands
 *
 * OSS_GETVERSION and OSS_ALSAEMULVER are answered directly.  Commands of
 * group 'M' are forwarded to the OSS mixer emulation when it is reachable.
 * Everything else must belong to group 'P', otherwise -EINVAL is returned.
 *
 * For the "set" style commands the value actually applied by the helper is
 * written back to user space.  Returns 0 or a put_user()/helper result on
 * success, -EFAULT when the argument cannot be read, -EIO for the
 * filter commands and for SETDUPLEX on a device without duplex capability,
 * and -EINVAL for unknown commands.
 */
static long snd_pcm_oss_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct snd_pcm_oss_file *pcm_oss_file;
        int __user *p = (int __user *)arg;
        int res;

        pcm_oss_file = file->private_data;
        if (cmd == OSS_GETVERSION)
                return put_user(SNDRV_OSS_VERSION, p);
        if (cmd == OSS_ALSAEMULVER)
                return put_user(1, p);
#if IS_REACHABLE(CONFIG_SND_MIXER_OSS)
        if (((cmd >> 8) & 0xff) == 'M')        {        /* mixer ioctl - for OSS compatibility */
                struct snd_pcm_substream *substream;
                int idx;
                /* pick the first opened substream just to find the card */
                for (idx = 0; idx < 2; ++idx) {
                        substream = pcm_oss_file->streams[idx];
                        if (substream != NULL)
                                break;
                }
                if (snd_BUG_ON(idx >= 2))
                        return -ENXIO;
                return snd_mixer_oss_ioctl_card(substream->pcm->card, cmd, arg);
        }
#endif
        /* only the 'P' (PCM/DSP) command group is handled below */
        if (((cmd >> 8) & 0xff) != 'P')
                return -EINVAL;
#ifdef OSS_DEBUG
        pr_debug("pcm_oss: ioctl = 0x%x\n", cmd);
#endif
        switch (cmd) {
        case SNDCTL_DSP_RESET:
                return snd_pcm_oss_reset(pcm_oss_file);
        case SNDCTL_DSP_SYNC:
                return snd_pcm_oss_sync(pcm_oss_file);
        case SNDCTL_DSP_SPEED:
                if (get_user(res, p))
                        return -EFAULT;
                res = snd_pcm_oss_set_rate(pcm_oss_file, res);
                if (res < 0)
                        return res;
                return put_user(res, p);
        case SOUND_PCM_READ_RATE:
                res = snd_pcm_oss_get_rate(pcm_oss_file);
                if (res < 0)
                        return res;
                return put_user(res, p);
        case SNDCTL_DSP_STEREO:
                if (get_user(res, p))
                        return -EFAULT;
                /* user passes a 0/1 stereo flag; convert to a channel count */
                res = res > 0 ? 2 : 1;
                res = snd_pcm_oss_set_channels(pcm_oss_file, res);
                if (res < 0)
                        return res;
                /* and convert the resulting channel count back to a flag */
                return put_user(--res, p);
        case SNDCTL_DSP_GETBLKSIZE:
                res = snd_pcm_oss_get_block_size(pcm_oss_file);
                if (res < 0)
                        return res;
                return put_user(res, p);
        case SNDCTL_DSP_SETFMT:
                if (get_user(res, p))
                        return -EFAULT;
                res = snd_pcm_oss_set_format(pcm_oss_file, res);
                if (res < 0)
                        return res;
                return put_user(res, p);
        case SOUND_PCM_READ_BITS:
                res = snd_pcm_oss_get_format(pcm_oss_file);
                if (res < 0)
                        return res;
                return put_user(res, p);
        case SNDCTL_DSP_CHANNELS:
                if (get_user(res, p))
                        return -EFAULT;
                res = snd_pcm_oss_set_channels(pcm_oss_file, res);
                if (res < 0)
                        return res;
                return put_user(res, p);
        case SOUND_PCM_READ_CHANNELS:
                res = snd_pcm_oss_get_channels(pcm_oss_file);
                if (res < 0)
                        return res;
                return put_user(res, p);
        case SOUND_PCM_WRITE_FILTER:
        case SOUND_PCM_READ_FILTER:
                return -EIO;
        case SNDCTL_DSP_POST:
                return snd_pcm_oss_post(pcm_oss_file);
        case SNDCTL_DSP_SUBDIVIDE:
                if (get_user(res, p))
                        return -EFAULT;
                res = snd_pcm_oss_set_subdivide(pcm_oss_file, res);
                if (res < 0)
                        return res;
                return put_user(res, p);
        case SNDCTL_DSP_SETFRAGMENT:
                if (get_user(res, p))
                        return -EFAULT;
                return snd_pcm_oss_set_fragment(pcm_oss_file, res);
        case SNDCTL_DSP_GETFMTS:
                res = snd_pcm_oss_get_formats(pcm_oss_file);
                if (res < 0)
                        return res;
                return put_user(res, p);
        case SNDCTL_DSP_GETOSPACE:
        case SNDCTL_DSP_GETISPACE:
                /* one helper serves both directions; cmd selects the stream */
                return snd_pcm_oss_get_space(pcm_oss_file,
                        cmd == SNDCTL_DSP_GETISPACE ?
                                SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK,
                        (struct audio_buf_info __user *) arg);
        case SNDCTL_DSP_NONBLOCK:
                return snd_pcm_oss_nonblock(file);
        case SNDCTL_DSP_GETCAPS:
                res = snd_pcm_oss_get_caps(pcm_oss_file);
                if (res < 0)
                        return res;
                return put_user(res, p);
        case SNDCTL_DSP_GETTRIGGER:
                res = snd_pcm_oss_get_trigger(pcm_oss_file);
                if (res < 0)
                        return res;
                return put_user(res, p);
        case SNDCTL_DSP_SETTRIGGER:
                if (get_user(res, p))
                        return -EFAULT;
                return snd_pcm_oss_set_trigger(pcm_oss_file, res);
        case SNDCTL_DSP_GETIPTR:
        case SNDCTL_DSP_GETOPTR:
                return snd_pcm_oss_get_ptr(pcm_oss_file,
                        cmd == SNDCTL_DSP_GETIPTR ?
                                SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK,
                        (struct count_info __user *) arg);
        case SNDCTL_DSP_MAPINBUF:
        case SNDCTL_DSP_MAPOUTBUF:
                return snd_pcm_oss_get_mapbuf(pcm_oss_file,
                        cmd == SNDCTL_DSP_MAPINBUF ?
                                SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK,
                        (struct buffmem_desc __user *) arg);
        case SNDCTL_DSP_SETSYNCRO:
                /* stop DMA now.. */
                return 0;
        case SNDCTL_DSP_SETDUPLEX:
                if (snd_pcm_oss_get_caps(pcm_oss_file) & DSP_CAP_DUPLEX)
                        return 0;
                return -EIO;
        case SNDCTL_DSP_GETODELAY:
                res = snd_pcm_oss_get_odelay(pcm_oss_file);
                if (res < 0) {
                        /* it's for sure, some broken apps don't check for error codes */
                        put_user(0, p);
                        return res;
                }
                return put_user(res, p);
        case SNDCTL_DSP_PROFILE:
                return 0;        /* silently ignore */
        default:
                pr_debug("pcm_oss: unknown command = 0x%x\n", cmd);
        }
        return -EINVAL;
}

#ifdef CONFIG_COMPAT
/* all compatible */
/*
 * compat_ioctl() handler: converts the 32-bit user pointer with
 * compat_ptr() and forwards the call unchanged to snd_pcm_oss_ioctl().
 */
static long snd_pcm_oss_ioctl_compat(struct file *file, unsigned int cmd,
                                     unsigned long arg)
{
        /*
         * Everything is compatible except SNDCTL_DSP_MAPINBUF/SNDCTL_DSP_MAPOUTBUF,
         * which are not implemented for the native case either
         */
        return snd_pcm_oss_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
}
#else
#define snd_pcm_oss_ioctl_compat        NULL
#endif

/*
 * read() handler: fetch captured data through snd_pcm_oss_read1().
 *
 * The capture substream's f_flags are refreshed from the file on every
 * call so that only the current O_NONBLOCK state of the file is kept.
 *
 * Returns the result of snd_pcm_oss_read1(), or -ENXIO when the file was
 * opened without a capture substream.
 */
static ssize_t snd_pcm_oss_read(struct file *file, char __user *buf, size_t count, loff_t *offset)
{
        struct snd_pcm_oss_file *ofile = file->private_data;
        struct snd_pcm_substream *capture = ofile->streams[SNDRV_PCM_STREAM_CAPTURE];
        ssize_t res;

        if (!capture)
                return -ENXIO;

        capture->f_flags = file->f_flags & O_NONBLOCK;
        res = snd_pcm_oss_read1(capture, buf, count);
#ifdef OSS_DEBUG
        pcm_dbg(capture->pcm,
                "pcm_oss: read %li bytes (returned %li bytes)\n",
                (long)count, (long)res);
#endif
        return res;
}

/*
 * write() handler: push playback data through snd_pcm_oss_write1().
 *
 * The playback substream's f_flags are refreshed from the file on every
 * call so that only the current O_NONBLOCK state of the file is kept.
 *
 * Returns the result of snd_pcm_oss_write1(), or -ENXIO when the file was
 * opened without a playback substream.
 */
static ssize_t snd_pcm_oss_write(struct file *file, const char __user *buf, size_t count, loff_t *offset)
{
        struct snd_pcm_oss_file *ofile = file->private_data;
        struct snd_pcm_substream *playback = ofile->streams[SNDRV_PCM_STREAM_PLAYBACK];
        long written;

        if (!playback)
                return -ENXIO;

        playback->f_flags = file->f_flags & O_NONBLOCK;
        written = snd_pcm_oss_write1(playback, buf, count);
#ifdef OSS_DEBUG
        pcm_dbg(playback->pcm, "pcm_oss: write %li bytes (wrote %li bytes)\n",
               (long)count, (long)written);
#endif
        return written;
}

/*
 * Tell whether the playback substream is ready for poll() purposes.
 *
 * Without an active mmap: at least one OSS period worth of frames is
 * available.  With an active mmap: the hardware pointer period differs
 * from the one last recorded in oss.prev_hw_ptr_period.
 *
 * Returns non-zero when ready, 0 otherwise.
 */
static int snd_pcm_oss_playback_ready(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        if (!atomic_read(&substream->mmap_count))
                return snd_pcm_playback_avail(rt) >= rt->oss.period_frames;

        return rt->oss.prev_hw_ptr_period != get_hw_ptr_period(rt);
}

/*
 * Tell whether the capture substream is ready for poll() purposes.
 *
 * Without an active mmap: at least one OSS period worth of frames is
 * available.  With an active mmap: the hardware pointer period differs
 * from the one last recorded in oss.prev_hw_ptr_period.
 *
 * Returns non-zero when ready, 0 otherwise.
 */
static int snd_pcm_oss_capture_ready(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        if (!atomic_read(&substream->mmap_count))
                return snd_pcm_capture_avail(rt) >= rt->oss.period_frames;

        return rt->oss.prev_hw_ptr_period != get_hw_ptr_period(rt);
}

/*
 * poll() handler for the OSS PCM device files.
 *
 * Registers the caller on the sleep queue of each opened substream and
 * builds the event mask:
 *  - EPOLLOUT | EPOLLWRNORM when the playback stream is not draining and
 *    is either not running or has a period of space available;
 *  - EPOLLIN | EPOLLRDNORM when the capture stream is either not running
 *    or has a period of data available.
 *
 * Side effect: if the capture stream is not running and its oss.trigger
 * flag is still set, the flag is cleared and capture is started through
 * snd_pcm_oss_set_trigger(PCM_ENABLE_INPUT).
 */
static __poll_t snd_pcm_oss_poll(struct file *file, poll_table * wait)
{
        struct snd_pcm_oss_file *pcm_oss_file;
        __poll_t mask;
        struct snd_pcm_substream *psubstream = NULL, *csubstream = NULL;
        
        pcm_oss_file = file->private_data;

        psubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK];
        csubstream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE];

        mask = 0;
        if (psubstream != NULL) {
                struct snd_pcm_runtime *runtime = psubstream->runtime;
                poll_wait(file, &runtime->sleep, wait);
                /* state and readiness are sampled inside the stream-lock guard */
                scoped_guard(pcm_stream_lock_irq, psubstream) {
                        if (runtime->state != SNDRV_PCM_STATE_DRAINING &&
                            (runtime->state != SNDRV_PCM_STATE_RUNNING ||
                             snd_pcm_oss_playback_ready(psubstream)))
                                mask |= EPOLLOUT | EPOLLWRNORM;
                }
        }
        if (csubstream != NULL) {
                struct snd_pcm_runtime *runtime = csubstream->runtime;
                snd_pcm_state_t ostate;
                poll_wait(file, &runtime->sleep, wait);
                scoped_guard(pcm_stream_lock_irq, csubstream) {
                        /* remember the state seen under the guard for the check below */
                        ostate = runtime->state;
                        if (ostate != SNDRV_PCM_STATE_RUNNING ||
                            snd_pcm_oss_capture_ready(csubstream))
                                mask |= EPOLLIN | EPOLLRDNORM;
                }
                if (ostate != SNDRV_PCM_STATE_RUNNING && runtime->oss.trigger) {
                        /* temporary file object exposing only the capture stream */
                        struct snd_pcm_oss_file ofile;
                        memset(&ofile, 0, sizeof(ofile));
                        ofile.streams[SNDRV_PCM_STREAM_CAPTURE] = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE];
                        runtime->oss.trigger = 0;
                        snd_pcm_oss_set_trigger(&ofile, PCM_ENABLE_INPUT);
                }
        }

        return mask;
}

/*
 * mmap() handler for the OSS PCM device files.
 *
 * Selects the substream from the requested protection (read/write mappings
 * prefer playback and fall back to capture, read-only maps capture,
 * write-only maps playback), switches the runtime to interleaved mmap
 * access, applies pending OSS parameters and maps the data buffer with
 * snd_pcm_mmap_data().
 *
 * On success the mapped size is stored in oss.mmap_bytes, silence handling
 * is disabled and stop_threshold is set to the boundary.
 *
 * Returns 0 on success; -EINVAL for a mapping without read or write
 * permission or with a non-zero page offset; -ENXIO when the selected
 * substream is not open; -EIO when the runtime lacks MMAP_VALID or
 * INTERLEAVED info or (with CONFIG_SND_PCM_OSS_PLUGINS) a plugin is in
 * use; or the error from snd_pcm_oss_change_params()/snd_pcm_mmap_data().
 */
static int snd_pcm_oss_mmap(struct file *file, struct vm_area_struct *area)
{
        struct snd_pcm_oss_file *pcm_oss_file;
        struct snd_pcm_substream *substream = NULL;
        struct snd_pcm_runtime *runtime;
        int err;

#ifdef OSS_DEBUG
        pr_debug("pcm_oss: mmap begin\n");
#endif
        pcm_oss_file = file->private_data;
        switch ((area->vm_flags & (VM_READ | VM_WRITE))) {
        case VM_READ | VM_WRITE:
                substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK];
                if (substream)
                        break;
                fallthrough;
        case VM_READ:
                substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_CAPTURE];
                break;
        case VM_WRITE:
                substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK];
                break;
        default:
                return -EINVAL;
        }
        /* set VM_READ access as well to fix memset() routines that do
           reads before writes (to improve performance) */
        vm_flags_set(area, VM_READ);
        if (substream == NULL)
                return -ENXIO;
        runtime = substream->runtime;
        if (!(runtime->info & SNDRV_PCM_INFO_MMAP_VALID))
                return -EIO;
        if (runtime->info & SNDRV_PCM_INFO_INTERLEAVED)
                runtime->access = SNDRV_PCM_ACCESS_MMAP_INTERLEAVED;
        else
                return -EIO;
        
        if (runtime->oss.params) {
                /* use mutex_trylock() for params_lock for avoiding a deadlock
                 * between mmap_lock and params_lock taken by
                 * copy_from/to_user() in snd_pcm_oss_write/read()
                 */
                err = snd_pcm_oss_change_params(substream, true);
                if (err < 0)
                        return err;
        }
#ifdef CONFIG_SND_PCM_OSS_PLUGINS
        /* the mapping is refused while a plugin is attached */
        if (runtime->oss.plugin_first != NULL)
                return -EIO;
#endif

        /* only a mapping starting at offset 0 is supported */
        if (area->vm_pgoff != 0)
                return -EINVAL;

        err = snd_pcm_mmap_data(substream, file, area);
        if (err < 0)
                return err;
        runtime->oss.mmap_bytes = area->vm_end - area->vm_start;
        runtime->silence_threshold = 0;
        runtime->silence_size = 0;
#ifdef OSS_DEBUG
        pr_debug("pcm_oss: mmap ok, bytes = 0x%x\n",
               runtime->oss.mmap_bytes);
#endif
        /* In mmap mode we never stop */
        runtime->stop_threshold = runtime->boundary;

        return 0;
}

#ifdef CONFIG_SND_VERBOSE_PROCFS
/*
 *  /proc interface
 */

/*
 * Text read handler of the per-stream "oss" proc entry.
 *
 * Prints one line per setup entry: task name, periods, period size and
 * the enabled option keywords.
 *
 * The list head must be fetched only after setup_mutex is held: the
 * write handler frees list nodes under that mutex ("clear"/"erase"), so
 * a head pointer sampled before locking may already be stale by the time
 * the list is walked.
 */
static void snd_pcm_oss_proc_read(struct snd_info_entry *entry,
                                  struct snd_info_buffer *buffer)
{
        struct snd_pcm_str *pstr = entry->private_data;
        struct snd_pcm_oss_setup *setup;

        guard(mutex)(&pstr->oss.setup_mutex);
        for (setup = pstr->oss.setup_list; setup; setup = setup->next) {
                snd_iprintf(buffer, "%s %u %u%s%s%s%s%s%s\n",
                            setup->task_name,
                            setup->periods,
                            setup->period_size,
                            setup->disable ? " disable" : "",
                            setup->direct ? " direct" : "",
                            setup->block ? " block" : "",
                            setup->nonblock ? " non-block" : "",
                            setup->partialfrag ? " partial-frag" : "",
                            setup->nosilence ? " no-silence" : "");
        }
}

/*
 * Free every entry of the stream's OSS setup list, including the
 * duplicated task names, and leave the list empty.
 */
static void snd_pcm_oss_proc_free_setup_list(struct snd_pcm_str * pstr)
{
        struct snd_pcm_oss_setup *cur = pstr->oss.setup_list;

        /* detach the whole list first, then free it node by node */
        pstr->oss.setup_list = NULL;
        while (cur) {
                struct snd_pcm_oss_setup *following = cur->next;

                kfree(cur->task_name);
                kfree(cur);
                cur = following;
        }
}

static void snd_pcm_oss_proc_write(struct snd_info_entry *entry,
                                   struct snd_info_buffer *buffer)
{
        struct snd_pcm_str *pstr = entry->private_data;
        char line[128], str[32], task_name[32];
        const char *ptr;
        int idx1;
        struct snd_pcm_oss_setup *setup, *setup1, template;

        while (!snd_info_get_line(buffer, line, sizeof(line))) {
                guard(mutex)(&pstr->oss.setup_mutex);
                memset(&template, 0, sizeof(template));
                ptr = snd_info_get_str(task_name, line, sizeof(task_name));
                if (!strcmp(task_name, "clear") || !strcmp(task_name, "erase")) {
                        snd_pcm_oss_proc_free_setup_list(pstr);
                        continue;
                }
                for (setup = pstr->oss.setup_list; setup; setup = setup->next) {
                        if (!strcmp(setup->task_name, task_name)) {
                                template = *setup;
                                break;
                        }
                }
                ptr = snd_info_get_str(str, ptr, sizeof(str));
                template.periods = simple_strtoul(str, NULL, 10);
                ptr = snd_info_get_str(str, ptr, sizeof(str));
                template.period_size = simple_strtoul(str, NULL, 10);
                for (idx1 = 31; idx1 >= 0; idx1--)
                        if (template.period_size & (1 << idx1))
                                break;
                for (idx1--; idx1 >= 0; idx1--)
                        template.period_size &= ~(1 << idx1);
                do {
                        ptr = snd_info_get_str(str, ptr, sizeof(str));
                        if (!strcmp(str, "disable")) {
                                template.disable = 1;
                        } else if (!strcmp(str, "direct")) {
                                template.direct = 1;
                        } else if (!strcmp(str, "block")) {
                                template.block = 1;
                        } else if (!strcmp(str, "non-block")) {
                                template.nonblock = 1;
                        } else if (!strcmp(str, "partial-frag")) {
                                template.partialfrag = 1;
                        } else if (!strcmp(str, "no-silence")) {
                                template.nosilence = 1;
                        } else if (!strcmp(str, "buggy-ptr")) {
                                template.buggyptr = 1;
                        }
                } while (*str);
                if (setup == NULL) {
                        setup = kmalloc(sizeof(*setup), GFP_KERNEL);
                        if (! setup) {
                                buffer->error = -ENOMEM;
                                return;
                        }
                        if (pstr->oss.setup_list == NULL)
                                pstr->oss.setup_list = setup;
                        else {
                                for (setup1 = pstr->oss.setup_list;
                                     setup1->next; setup1 = setup1->next);
                                setup1->next = setup;
                        }
                        template.task_name = kstrdup(task_name, GFP_KERNEL);
                        if (! template.task_name) {
                                kfree(setup);
                                buffer->error = -ENOMEM;
                                return;
                        }
                }
                *setup = template;
        }
}

/*
 * Create and register the "oss" proc text entry for every stream
 * direction of @pcm that has at least one substream.
 *
 * The resulting entry (or NULL when creation or registration failed) is
 * stored in pstr->oss.proc_entry.
 */
static void snd_pcm_oss_proc_init(struct snd_pcm *pcm)
{
        int dir;

        for (dir = 0; dir < 2; dir++) {
                struct snd_pcm_str *pstr = &pcm->streams[dir];
                struct snd_info_entry *entry;

                if (!pstr->substream_count)
                        continue;

                entry = snd_info_create_card_entry(pcm->card, "oss", pstr->proc_root);
                if (!entry) {
                        pstr->oss.proc_entry = NULL;
                        continue;
                }

                entry->content = SNDRV_INFO_CONTENT_TEXT;
                entry->mode = S_IFREG | 0644;
                entry->c.text.read = snd_pcm_oss_proc_read;
                entry->c.text.write = snd_pcm_oss_proc_write;
                entry->private_data = pstr;

                /* a failed registration leaves no entry behind */
                if (snd_info_register(entry) < 0) {
                        snd_info_free_entry(entry);
                        entry = NULL;
                }
                pstr->oss.proc_entry = entry;
        }
}

/*
 * Undo snd_pcm_oss_proc_init(): for both stream directions free the
 * "oss" proc entry, forget its pointer and drop the setup list.
 */
static void snd_pcm_oss_proc_done(struct snd_pcm *pcm)
{
        int dir;

        for (dir = 0; dir < 2; dir++) {
                struct snd_pcm_str *pstr = &pcm->streams[dir];
                struct snd_info_entry *entry = pstr->oss.proc_entry;

                snd_info_free_entry(entry);
                pstr->oss.proc_entry = NULL;
                snd_pcm_oss_proc_free_setup_list(pstr);
        }
}
#else /* !CONFIG_SND_VERBOSE_PROCFS */
/* No-op variant used when CONFIG_SND_VERBOSE_PROCFS is not enabled. */
static inline void snd_pcm_oss_proc_init(struct snd_pcm *pcm)
{
}
/* No-op variant used when CONFIG_SND_VERBOSE_PROCFS is not enabled. */
static inline void snd_pcm_oss_proc_done(struct snd_pcm *pcm)
{
}
#endif /* CONFIG_SND_VERBOSE_PROCFS */

/*
 *  ENTRY functions
 */

/*
 * File operations of the OSS PCM device files; handed to
 * snd_register_oss_device() by register_oss_dsp().
 */
static const struct file_operations snd_pcm_oss_f_reg =
{
        .owner =        THIS_MODULE,
        .read =                snd_pcm_oss_read,
        .write =        snd_pcm_oss_write,
        .open =                snd_pcm_oss_open,
        .release =        snd_pcm_oss_release,
        .llseek =        no_llseek,
        .poll =                snd_pcm_oss_poll,
        .unlocked_ioctl =        snd_pcm_oss_ioctl,
        .compat_ioctl =        snd_pcm_oss_ioctl_compat,
        .mmap =                snd_pcm_oss_mmap,
};

/*
 * Register one OSS PCM device for @pcm under the given @index.
 * A failure is only logged; nothing is reported to the caller.
 */
static void register_oss_dsp(struct snd_pcm *pcm, int index)
{
        int err;

        err = snd_register_oss_device(SNDRV_OSS_DEVICE_TYPE_PCM,
                                      pcm->card, index, &snd_pcm_oss_f_reg,
                                      pcm);
        if (err < 0)
                pcm_err(pcm, "unable to register OSS PCM device %i:%i\n",
                           pcm->card->number, pcm->device);
}

/*
 * n_register notifier: create the OSS device(s) for a new PCM.
 *
 * If the PCM device number matches dsp_map[] for its card, OSS device
 * index 0 is registered and (when SNDRV_OSS_INFO_DEV_AUDIO is defined) an
 * OSS info string is registered, with " (DUPLEX)" appended for full-duplex
 * capable PCMs.  If it matches adsp_map[], OSS device index 1 is
 * registered.  pcm->oss.reg counts the registrations and
 * pcm->oss.reg_mask records which indices were used (bit 0 / bit 1).
 *
 * The proc entries are created when at least one device was registered.
 *
 * Always returns 0.
 */
static int snd_pcm_oss_register_minor(struct snd_pcm *pcm)
{
        pcm->oss.reg = 0;
        if (dsp_map[pcm->card->number] == (int)pcm->device) {
                char name[128];
                int duplex;
                register_oss_dsp(pcm, 0);
                /* full duplex: both directions present and not half-duplex */
                duplex = (pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream_count > 0 &&
                              pcm->streams[SNDRV_PCM_STREAM_CAPTURE].substream_count &&
                              !(pcm->info_flags & SNDRV_PCM_INFO_HALF_DUPLEX));
                /* bounded formatting: never write past the stack buffer */
                snprintf(name, sizeof(name), "%s%s", pcm->name,
                         duplex ? " (DUPLEX)" : "");
#ifdef SNDRV_OSS_INFO_DEV_AUDIO
                snd_oss_info_register(SNDRV_OSS_INFO_DEV_AUDIO,
                                      pcm->card->number,
                                      name);
#endif
                pcm->oss.reg++;
                pcm->oss.reg_mask |= 1;
        }
        if (adsp_map[pcm->card->number] == (int)pcm->device) {
                register_oss_dsp(pcm, 1);
                pcm->oss.reg++;
                pcm->oss.reg_mask |= 2;
        }

        if (pcm->oss.reg)
                snd_pcm_oss_proc_init(pcm);

        return 0;
}

/*
 * n_disconnect notifier: unregister the OSS devices that
 * snd_pcm_oss_register_minor() registered for @pcm, as recorded in
 * pcm->oss.reg_mask, and (when SNDRV_OSS_INFO_DEV_AUDIO is defined)
 * the OSS info string of the dsp_map[] device.
 *
 * Does nothing when no device is registered.  Always returns 0.
 */
static int snd_pcm_oss_disconnect_minor(struct snd_pcm *pcm)
{
        if (!pcm->oss.reg)
                return 0;

        /* bit 0: OSS device index 0 */
        if (pcm->oss.reg_mask & 1) {
                pcm->oss.reg_mask &= ~1;
                snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_PCM,
                                          pcm->card, 0);
        }
        /* bit 1: OSS device index 1 */
        if (pcm->oss.reg_mask & 2) {
                pcm->oss.reg_mask &= ~2;
                snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_PCM,
                                          pcm->card, 1);
        }
#ifdef SNDRV_OSS_INFO_DEV_AUDIO
        if (dsp_map[pcm->card->number] == (int)pcm->device)
                snd_oss_info_unregister(SNDRV_OSS_INFO_DEV_AUDIO, pcm->card->number);
#endif
        pcm->oss.reg = 0;
        return 0;
}

/*
 * n_unregister notifier: disconnect the OSS devices of @pcm and then
 * tear down its proc entries and setup lists.  Always returns 0.
 */
static int snd_pcm_oss_unregister_minor(struct snd_pcm *pcm)
{
        snd_pcm_oss_disconnect_minor(pcm);
        snd_pcm_oss_proc_done(pcm);
        return 0;
}

/*
 * Notifier callbacks passed to snd_pcm_notify() at module init/exit:
 * register, disconnect and unregister handling for a PCM device.
 */
static struct snd_pcm_notify snd_pcm_oss_notify =
{
        .n_register =        snd_pcm_oss_register_minor,
        .n_disconnect = snd_pcm_oss_disconnect_minor,
        .n_unregister =        snd_pcm_oss_unregister_minor,
};

/*
 * Module init: sanitise the dsp_map[]/adsp_map[] tables and install the
 * PCM notifier.
 *
 * Out-of-range map entries are reported and replaced by their defaults
 * (0 for dsp_map, 1 for adsp_map).
 *
 * Returns 0 on success or the negative error from snd_pcm_notify().
 */
static int __init alsa_pcm_oss_init(void)
{
        int card;
        int ret;

        /* check device map table */
        for (card = 0; card < SNDRV_CARDS; card++) {
                if (dsp_map[card] < 0 || dsp_map[card] >= SNDRV_PCM_DEVICES) {
                        pr_err("ALSA: pcm_oss: invalid dsp_map[%d] = %d\n",
                                   card, dsp_map[card]);
                        dsp_map[card] = 0;
                }
                if (adsp_map[card] < 0 || adsp_map[card] >= SNDRV_PCM_DEVICES) {
                        pr_err("ALSA: pcm_oss: invalid adsp_map[%d] = %d\n",
                                   card, adsp_map[card]);
                        adsp_map[card] = 1;
                }
        }

        ret = snd_pcm_notify(&snd_pcm_oss_notify, 0);
        return ret < 0 ? ret : 0;
}

/*
 * Module exit: calls snd_pcm_notify() on the same notifier with the
 * second argument set to 1 (module init passes 0).
 */
static void __exit alsa_pcm_oss_exit(void)
{
        snd_pcm_notify(&snd_pcm_oss_notify, 1);
}

/* hook the init/exit routines into the module loader */
module_init(alsa_pcm_oss_init)
module_exit(alsa_pcm_oss_exit)
































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
/*
 * Copyright (c) 1982, 1986 Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Robert Elz at The University of Melbourne.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#ifndef _LINUX_QUOTA_
#define _LINUX_QUOTA_

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
#include <linux/percpu_counter.h>

#include <linux/dqblk_xfs.h>
#include <linux/dqblk_v1.h>
#include <linux/dqblk_v2.h>

#include <linux/atomic.h>
#include <linux/uidgid.h>
#include <linux/projid.h>
#include <uapi/linux/quota.h>

/*
 * Drop any USRQUOTA/GRPQUOTA/PRJQUOTA macro definitions that are in effect
 * at this point and provide the same names as a typed enum instead.
 */
#undef USRQUOTA
#undef GRPQUOTA
#undef PRJQUOTA
enum quota_type {
        USRQUOTA = 0,                /* element used for user quotas */
        GRPQUOTA = 1,                /* element used for group quotas */
        PRJQUOTA = 2,                /* element used for project quotas */
};

/*
 * Masks for quota types when used as a bitmask: bit N is set for the
 * quota type whose enum quota_type value is N.
 */
#define QTYPE_MASK_USR (1 << USRQUOTA)
#define QTYPE_MASK_GRP (1 << GRPQUOTA)
#define QTYPE_MASK_PRJ (1 << PRJQUOTA)

typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */
typedef long long qsize_t;        /* Type in which we store sizes */

/*
 * Kernel-internal quota identifier.  @type selects which member of the
 * anonymous union carries the id (see make_kqid() for the mapping).
 */
struct kqid {                        /* Type in which we store the quota identifier */
        union {
                kuid_t uid;
                kgid_t gid;
                kprojid_t projid;
        };
        enum quota_type type;  /* USRQUOTA (uid) or GRPQUOTA (gid) or PRJQUOTA (projid) */
};

/*
 * kqid helpers implemented outside this header.
 * NOTE(review): judging by names and signatures these are equality / ordering
 * predicates, kqid -> qid_t conversions relative to a user namespace, and a
 * validity test -- confirm against the definitions.
 */
extern bool qid_eq(struct kqid left, struct kqid right);
extern bool qid_lt(struct kqid left, struct kqid right);
extern qid_t from_kqid(struct user_namespace *to, struct kqid qid);
extern qid_t from_kqid_munged(struct user_namespace *to, struct kqid qid);
extern bool qid_valid(struct kqid qid);

/**
 *        make_kqid - Build a kqid from a (user namespace, type, qid) tuple.
 *        @from: User namespace that @qid is expressed in
 *        @type: The type of quota
 *        @qid: Quota identifier
 *
 *        Converts @qid with make_kuid(), make_kgid() or make_kprojid(),
 *        depending on @type, and returns the result tagged with @type.
 *        Any other @type value hits BUG().
 *
 *        When there is no mapping defined for the tuple, an invalid kqid
 *        is returned.  Callers are expected to test for that, which can be
 *        done with qid_valid().
 */
static inline struct kqid make_kqid(struct user_namespace *from,
                                    enum quota_type type, qid_t qid)
{
        struct kqid result;

        if (type == USRQUOTA)
                result.uid = make_kuid(from, qid);
        else if (type == GRPQUOTA)
                result.gid = make_kgid(from, qid);
        else if (type == PRJQUOTA)
                result.projid = make_kprojid(from, qid);
        else
                BUG();

        result.type = type;
        return result;
}

/**
 *        make_kqid_invalid - Explicitly make an invalid kqid
 *        @type: The type of quota identifier
 *
 *        Returns an invalid kqid with the specified type.
 */
static inline struct kqid make_kqid_invalid(enum quota_type type)
{
        struct kqid kqid;

        kqid.type = type;
        switch (type) {
        case USRQUOTA:
                kqid.uid = INVALID_UID;
                break;
        case GRPQUOTA:
                kqid.gid = INVALID_GID;
                break;
        case PRJQUOTA:
                kqid.projid = INVALID_PROJID;
                break;
        default:
                BUG();
        }
        return kqid;
}

/**
 *        make_kqid_uid - Wrap a kuid in a kqid
 *        @uid: The kuid to store
 *
 *        Returns a kqid of type USRQUOTA carrying @uid.
 */
static inline struct kqid make_kqid_uid(kuid_t uid)
{
        return (struct kqid) { .uid = uid, .type = USRQUOTA };
}

/**
 *        make_kqid_gid - Wrap a kgid in a kqid
 *        @gid: The kgid to store
 *
 *        Returns a kqid of type GRPQUOTA carrying @gid.
 */
static inline struct kqid make_kqid_gid(kgid_t gid)
{
        return (struct kqid) { .gid = gid, .type = GRPQUOTA };
}

/**
 *        make_kqid_projid - Wrap a kprojid in a kqid
 *        @projid: The kprojid to store
 *
 *        Returns a kqid of type PRJQUOTA carrying @projid.
 */
static inline struct kqid make_kqid_projid(kprojid_t projid)
{
        return (struct kqid) { .projid = projid, .type = PRJQUOTA };
}

/**
 *        qid_has_mapping - Report if a qid maps into a user namespace.
 *        @ns:  The user namespace to see if a value maps into.
 *        @qid: The kernel internal quota identifier to test.
 */
static inline bool qid_has_mapping(struct user_namespace *ns, struct kqid qid)
{
        return from_kqid(ns, qid) != (qid_t) -1;
}


/* Lock named by the [dq_data_lock] annotations on fields in this header. */
extern spinlock_t dq_data_lock;

/*
 * Maximum number of writes for a quota operation (insert/delete/update),
 * taken as the larger of the V1 and V2 format values.
 */
#define DQUOT_INIT_ALLOC max(V1_INIT_ALLOC, V2_INIT_ALLOC)
#define DQUOT_INIT_REWRITE max(V1_INIT_REWRITE, V2_INIT_REWRITE)
#define DQUOT_DEL_ALLOC max(V1_DEL_ALLOC, V2_DEL_ALLOC)
#define DQUOT_DEL_REWRITE max(V1_DEL_REWRITE, V2_DEL_REWRITE)

/*
 * Data for one user/group kept in memory: limits, current usage and grace
 * deadlines, for both space (dqb_b*, dqb_*space) and inodes (dqb_i*,
 * dqb_curinodes).  Embedded in struct dquot as dq_dqb.
 */
struct mem_dqblk {
        qsize_t dqb_bhardlimit;        /* absolute limit on disk blks alloc */
        qsize_t dqb_bsoftlimit;        /* preferred limit on disk blks */
        qsize_t dqb_curspace;        /* current used space */
        qsize_t dqb_rsvspace;   /* current reserved space for delalloc*/
        qsize_t dqb_ihardlimit;        /* absolute limit on allocated inodes */
        qsize_t dqb_isoftlimit;        /* preferred inode limit */
        qsize_t dqb_curinodes;        /* current # allocated inodes */
        time64_t dqb_btime;        /* time limit for excessive disk use */
        time64_t dqb_itime;        /* time limit for excessive inode use */
};

/*
 * Data for one quotafile kept in memory.  struct quota_info holds one of
 * these per quota type.  The bracketed name in a field comment is the lock
 * the original authors associate with that field.
 */
struct quota_format_type;

struct mem_dqinfo {
        struct quota_format_type *dqi_format;
        int dqi_fmt_id;                /* Id of the dqi_format - used when turning
                                 * quotas on after remount RW */
        struct list_head dqi_dirty_list;        /* List of dirty dquots [dq_list_lock] */
        unsigned long dqi_flags;        /* DQF_ flags [dq_data_lock] */
        unsigned int dqi_bgrace;        /* Space grace time [dq_data_lock] */
        unsigned int dqi_igrace;        /* Inode grace time [dq_data_lock] */
        qsize_t dqi_max_spc_limit;        /* Maximum space limit [static] */
        qsize_t dqi_max_ino_limit;        /* Maximum inode limit [static] */
        void *dqi_priv;                /* opaque pointer; not interpreted in this header */
};

/* Only pointers to struct super_block are used below. */
struct super_block;

/* Mask for flags passed to userspace */
#define DQF_GETINFO_MASK (DQF_ROOT_SQUASH | DQF_SYS_FILE)
/* Mask for flags modifiable from userspace */
#define DQF_SETINFO_MASK DQF_ROOT_SQUASH

/*
 * Bit number (not mask) of the "info dirty" flag in mem_dqinfo.dqi_flags;
 * it takes the value of DQF_PRIVATE.
 */
enum {
        DQF_INFO_DIRTY_B = DQF_PRIVATE,
};
#define DQF_INFO_DIRTY (1 << DQF_INFO_DIRTY_B)        /* Is info dirty? */

/* Implemented outside this header. */
extern void mark_info_dirty(struct super_block *sb, int type);
/* Returns non-zero when bit DQF_INFO_DIRTY_B is set in @info->dqi_flags. */
static inline int info_dirty(struct mem_dqinfo *info)
{
        unsigned long *flag_word = &info->dqi_flags;

        return test_bit(DQF_INFO_DIRTY_B, flag_word);
}

/*
 * Indices into the dqstats arrays, one per statistic.  _DQST_DQSTAT_LAST is
 * the number of statistics and sizes both arrays in struct dqstats.
 */
enum {
        DQST_LOOKUPS,
        DQST_DROPS,
        DQST_READS,
        DQST_WRITES,
        DQST_CACHE_HITS,
        DQST_ALLOC_DQUOTS,
        DQST_FREE_DQUOTS,
        DQST_SYNCS,
        _DQST_DQSTAT_LAST
};

/*
 * Quota statistics.  dqstats_inc()/dqstats_dec() update the per-cpu
 * counters; nothing in this header touches stat[].
 */
struct dqstats {
        unsigned long stat[_DQST_DQSTAT_LAST];
        struct percpu_counter counter[_DQST_DQSTAT_LAST];
};

/* Single global instance, defined outside this header. */
extern struct dqstats dqstats;

/* Increment the per-cpu counter for statistic @type (a DQST_* index). */
static inline void dqstats_inc(unsigned int type)
{
        struct percpu_counter *stat_counter = &dqstats.counter[type];

        percpu_counter_inc(stat_counter);
}

/* Decrement the per-cpu counter for statistic @type (a DQST_* index). */
static inline void dqstats_dec(unsigned int type)
{
        struct percpu_counter *stat_counter = &dqstats.counter[type];

        percpu_counter_dec(stat_counter);
}

#define DQ_MOD_B        0        /* dquot modified since read */
#define DQ_BLKS_B        1        /* uid/gid has been warned about blk limit */
#define DQ_INODES_B        2        /* uid/gid has been warned about inode limit */
#define DQ_FAKE_B        3        /* no limits only usage */
#define DQ_READ_B        4        /* dquot was read into memory */
#define DQ_ACTIVE_B        5        /* dquot is active (dquot_release not called) */
#define DQ_RELEASING_B        6        /* dquot is in releasing_dquots list waiting
                                 * to be cleaned up */
#define DQ_LASTSET_B        7        /* Following 6 bits (see QIF_) are reserved\
                                 * for the mask of entries set via SETQUOTA\
                                 * quotactl. They are set under dq_data_lock\
                                 * and the quota format handling dquot can\
                                 * clear them when it sees fit. */

/*
 * In-memory quota structure for one (superblock, id) pair.  The bracketed
 * name in a field comment is the lock the original authors associate with
 * that field.
 */
struct dquot {
        struct hlist_node dq_hash;        /* Hash list in memory [dq_list_lock] */
        struct list_head dq_inuse;        /* List of all quotas [dq_list_lock] */
        struct list_head dq_free;        /* Free list element [dq_list_lock] */
        struct list_head dq_dirty;        /* List of dirty dquots [dq_list_lock] */
        struct mutex dq_lock;                /* dquot IO lock */
        spinlock_t dq_dqb_lock;                /* Lock protecting dq_dqb changes */
        atomic_t dq_count;                /* Use count */
        struct super_block *dq_sb;        /* superblock this applies to */
        struct kqid dq_id;                /* ID this applies to (uid, gid, projid) */
        loff_t dq_off;                        /* Offset of dquot on disk [dq_lock, stable once set] */
        unsigned long dq_flags;                /* See DQ_* */
        struct mem_dqblk dq_dqb;        /* Diskquota usage [dq_dqb_lock] */
};

/*
 * Operations which must be implemented by each quota format.
 * Every callback returns int.
 * NOTE(review): presumably 0 / negative errno; confirm with the callers.
 */
struct quota_format_ops {
        int (*check_quota_file)(struct super_block *sb, int type);        /* Detect whether file is in our format */
        int (*read_file_info)(struct super_block *sb, int type);        /* Read main info about file - called on quotaon() */
        int (*write_file_info)(struct super_block *sb, int type);        /* Write main info about file */
        int (*free_file_info)(struct super_block *sb, int type);        /* Called on quotaoff() */
        int (*read_dqblk)(struct dquot *dquot);                /* Read structure for one user */
        int (*commit_dqblk)(struct dquot *dquot);        /* Write structure for one user */
        int (*release_dqblk)(struct dquot *dquot);        /* Called when last reference to dquot is being dropped */
        int (*get_next_id)(struct super_block *sb, struct kqid *qid);        /* Get next ID with existing structure in the quota file */
};

/*
 * Operations working with dquots: allocation and destruction, acquire and
 * release, writing, plus per-inode queries (reserved space, project id,
 * inode usage).
 */
struct dquot_operations {
        int (*write_dquot) (struct dquot *);                /* Ordinary dquot write */
        struct dquot *(*alloc_dquot)(struct super_block *, int);        /* Allocate memory for new dquot */
        void (*destroy_dquot)(struct dquot *);                /* Free memory for dquot */
        int (*acquire_dquot) (struct dquot *);                /* Quota is going to be created on disk */
        int (*release_dquot) (struct dquot *);                /* Quota is going to be deleted from disk */
        int (*mark_dirty) (struct dquot *);                /* Dquot is marked dirty */
        int (*write_info) (struct super_block *, int);        /* Write of quota "superblock" */
        /* get reserved quota for delayed alloc, value returned is managed by
         * quota code only */
        qsize_t *(*get_reserved_space) (struct inode *);
        int (*get_projid) (struct inode *, kprojid_t *);/* Get project ID */
        /* Get number of inodes that were charged for a given inode */
        int (*get_inode_usage) (struct inode *, qsize_t *);
        /* Get next ID with active quota structure */
        int (*get_next_id) (struct super_block *sb, struct kqid *qid);
};

/* Only pointers to struct path are used below (quotactl_ops.quota_on). */
struct path;

/*
 * Structure for communicating via ->get_dqblk() & ->set_dqblk().
 * d_fieldmask is built from the QC_* field specifiers defined below.
 */
struct qc_dqblk {
        int d_fieldmask;        /* mask of fields to change in ->set_dqblk() */
        u64 d_spc_hardlimit;        /* absolute limit on used space */
        u64 d_spc_softlimit;        /* preferred limit on used space */
        u64 d_ino_hardlimit;        /* maximum # allocated inodes */
        u64 d_ino_softlimit;        /* preferred inode limit */
        u64 d_space;                /* Space owned by the user */
        u64 d_ino_count;        /* # inodes owned by the user */
        s64 d_ino_timer;        /* zero if within inode limits */
                                /* if not, we refuse service */
        s64 d_spc_timer;        /* similar to above; for space */
        int d_ino_warns;        /* # warnings issued wrt num inodes */
        int d_spc_warns;        /* # warnings issued wrt used space */
        u64 d_rt_spc_hardlimit;        /* absolute limit on realtime space */
        u64 d_rt_spc_softlimit;        /* preferred limit on RT space */
        u64 d_rt_space;                /* realtime space owned */
        s64 d_rt_spc_timer;        /* similar to above; for RT space */
        int d_rt_spc_warns;        /* # warnings issued wrt RT space */
};

/*
 * Field specifiers for ->set_dqblk() in struct qc_dqblk and also for
 * ->set_info() in struct qc_info.  Each specifier is a distinct bit; the
 * *_MASK macros group them into limits, timers, warning counts and
 * accounting values.
 */
#define        QC_INO_SOFT        (1<<0)
#define        QC_INO_HARD        (1<<1)
#define        QC_SPC_SOFT        (1<<2)
#define        QC_SPC_HARD        (1<<3)
#define        QC_RT_SPC_SOFT        (1<<4)
#define        QC_RT_SPC_HARD        (1<<5)
#define QC_LIMIT_MASK (QC_INO_SOFT | QC_INO_HARD | QC_SPC_SOFT | QC_SPC_HARD | \
                       QC_RT_SPC_SOFT | QC_RT_SPC_HARD)
#define        QC_SPC_TIMER        (1<<6)
#define        QC_INO_TIMER        (1<<7)
#define        QC_RT_SPC_TIMER        (1<<8)
#define QC_TIMER_MASK (QC_SPC_TIMER | QC_INO_TIMER | QC_RT_SPC_TIMER)
#define        QC_SPC_WARNS        (1<<9)
#define        QC_INO_WARNS        (1<<10)
#define        QC_RT_SPC_WARNS        (1<<11)
#define QC_WARNS_MASK (QC_SPC_WARNS | QC_INO_WARNS | QC_RT_SPC_WARNS)
#define        QC_SPACE        (1<<12)
#define        QC_INO_COUNT        (1<<13)
#define        QC_RT_SPACE        (1<<14)
#define QC_ACCT_MASK (QC_SPACE | QC_INO_COUNT | QC_RT_SPACE)
#define QC_FLAGS        (1<<15)

/* QCI_* flags, carried in qc_type_state.flags and qc_info.i_flags. */
#define QCI_SYSFILE                (1 << 0)        /* Quota file is hidden from userspace */
#define QCI_ROOT_SQUASH                (1 << 1)        /* Root squash turned on */
#define QCI_ACCT_ENABLED        (1 << 2)        /* Quota accounting enabled */
#define QCI_LIMITS_ENFORCED        (1 << 3)        /* Quota limits enforced */

/*
 * Structures for communicating via ->get_state.
 * struct qc_type_state describes one quota type; struct qc_state bundles
 * one of them per quota type (MAXQUOTAS entries) with a global count.
 */
struct qc_type_state {
        unsigned int flags;                /* Flags QCI_* */
        unsigned int spc_timelimit;        /* Time after which space softlimit is
                                         * enforced */
        unsigned int ino_timelimit;        /* Ditto for inode softlimit */
        unsigned int rt_spc_timelimit;        /* Ditto for real-time space */
        unsigned int spc_warnlimit;        /* Limit for number of space warnings */
        unsigned int ino_warnlimit;        /* Ditto for inodes */
        unsigned int rt_spc_warnlimit;        /* Ditto for real-time space */
        unsigned long long ino;                /* Inode number of quota file */
        blkcnt_t blocks;                /* Number of 512-byte blocks in the file */
        blkcnt_t nextents;                /* Number of extents in the file */
};

struct qc_state {
        unsigned int s_incoredqs;        /* Number of dquots in core */
        struct qc_type_state s_state[MAXQUOTAS];  /* Per quota type information */
};

/*
 * Structure for communicating via ->set_info.
 * i_fieldmask uses the same QC_* field specifiers as qc_dqblk.d_fieldmask.
 */
struct qc_info {
        int i_fieldmask;        /* mask of fields to change in ->set_info() */
        unsigned int i_flags;                /* Flags QCI_* */
        unsigned int i_spc_timelimit;        /* Time after which space softlimit is
                                         * enforced */
        unsigned int i_ino_timelimit;        /* Ditto for inode softlimit */
        unsigned int i_rt_spc_timelimit;/* Ditto for real-time space */
        unsigned int i_spc_warnlimit;        /* Limit for number of space warnings */
        unsigned int i_ino_warnlimit;        /* Limit for number of inode warnings */
        unsigned int i_rt_spc_warnlimit;        /* Ditto for real-time space */
};

/*
 * Operations handling requests from userspace.
 * The data-carrying callbacks exchange struct qc_info, struct qc_dqblk and
 * struct qc_state, all defined above.
 * NOTE(review): the bare int / unsigned int parameters are unnamed here;
 * presumably a quota type or a flags word -- confirm with implementations.
 */
struct quotactl_ops {
        int (*quota_on)(struct super_block *, int, int, const struct path *);
        int (*quota_off)(struct super_block *, int);
        int (*quota_enable)(struct super_block *, unsigned int);
        int (*quota_disable)(struct super_block *, unsigned int);
        int (*quota_sync)(struct super_block *, int);
        int (*set_info)(struct super_block *, int, struct qc_info *);
        int (*get_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *);
        int (*get_nextdqblk)(struct super_block *, struct kqid *,
                             struct qc_dqblk *);
        int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *);
        int (*get_state)(struct super_block *, struct qc_state *);
        int (*rm_xquota)(struct super_block *, unsigned int);
};

/*
 * Descriptor of one quota format; this is what register_quota_format() and
 * unregister_quota_format() take.
 */
struct quota_format_type {
        int qf_fmt_id;        /* Quota format id */
        const struct quota_format_ops *qf_ops;        /* Operations of format */
        struct module *qf_owner;                /* Module implementing quota format */
        struct quota_format_type *qf_next;        /* next descriptor (singly linked) */
};

/*
 * Quota state flags - they come in three flavors - for users, groups and projects.
 *
 * Each state occupies MAXQUOTAS consecutive bits, one per quota type; the
 * macros below give the bit of the first type and dquot_state_flag() shifts
 * it to the requested type.  The values in the table correspond to
 * MAXQUOTAS == 3.
 *
 * Actual typed flags layout:
 *                                USRQUOTA        GRPQUOTA        PRJQUOTA
 *  DQUOT_USAGE_ENABLED                0x0001                0x0002                0x0004
 *  DQUOT_LIMITS_ENABLED        0x0008                0x0010                0x0020
 *  DQUOT_SUSPENDED                0x0040                0x0080                0x0100
 *
 * Following bits are used for non-typed flags:
 *  DQUOT_QUOTA_SYS_FILE        0x0200
 *  DQUOT_NEGATIVE_USAGE        0x0400
 *  DQUOT_NOLIST_DIRTY                0x0800
 */
enum {
        _DQUOT_USAGE_ENABLED = 0,                /* Track disk usage for users */
        _DQUOT_LIMITS_ENABLED,                        /* Enforce quota limits for users */
        _DQUOT_SUSPENDED,                        /* User diskquotas are off, but
                                                 * we have necessary info in
                                                 * memory to turn them on */
        _DQUOT_STATE_FLAGS
};
/* '*' binds tighter than '<<': the shift count is (state index * MAXQUOTAS). */
#define DQUOT_USAGE_ENABLED        (1 << _DQUOT_USAGE_ENABLED * MAXQUOTAS)
#define DQUOT_LIMITS_ENABLED        (1 << _DQUOT_LIMITS_ENABLED * MAXQUOTAS)
#define DQUOT_SUSPENDED                (1 << _DQUOT_SUSPENDED * MAXQUOTAS)
#define DQUOT_STATE_FLAGS        (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED | \
                                 DQUOT_SUSPENDED)
/* Other quota flags */
#define DQUOT_STATE_LAST        (_DQUOT_STATE_FLAGS * MAXQUOTAS)
#define DQUOT_QUOTA_SYS_FILE        (1 << DQUOT_STATE_LAST)
                                                /* Quota file is a special
                                                 * system file and user cannot
                                                 * touch it. Filesystem is
                                                 * responsible for setting
                                                 * S_NOQUOTA, S_NOATIME flags
                                                 */
#define DQUOT_NEGATIVE_USAGE        (1 << (DQUOT_STATE_LAST + 1))
                                               /* Allow negative quota usage */
/* Do not track dirty dquots in a list */
#define DQUOT_NOLIST_DIRTY        (1 << (DQUOT_STATE_LAST + 2))

/* Shift generic state flag(s) @flags left by @type bit positions. */
static inline unsigned int dquot_state_flag(unsigned int flags, int type)
{
        unsigned int typed = flags;

        typed <<= type;
        return typed;
}

/*
 * Inverse of dquot_state_flag(): shift @flags right by @type bit positions
 * and keep only the bits in DQUOT_STATE_FLAGS.
 */
static inline unsigned int dquot_generic_flag(unsigned int flags, int type)
{
        unsigned int untyped = flags >> type;

        return untyped & DQUOT_STATE_FLAGS;
}

/*
 * Bitmap of quota types for which @flag is set in @flags.
 *
 * @flag must be a power of two (checked at build time).  Dividing by it
 * moves that state's per-type bits down to bit 0, and the mask then keeps
 * MAXQUOTAS bits.
 */
static __always_inline unsigned dquot_state_types(unsigned flags, unsigned flag)
{
        const unsigned type_bits = (1 << MAXQUOTAS) - 1;

        BUILD_BUG_ON_NOT_POWER_OF_2(flag);
        return type_bits & (flags / flag);
}

/*
 * quota_send_warning() is implemented elsewhere when
 * CONFIG_QUOTA_NETLINK_INTERFACE is set; otherwise the inline stub below
 * makes every call a no-op.
 */
#ifdef CONFIG_QUOTA_NETLINK_INTERFACE
extern void quota_send_warning(struct kqid qid, dev_t dev,
                               const char warntype);
#else
static inline void quota_send_warning(struct kqid qid, dev_t dev,
                                      const char warntype)
{
        return;
}
#endif /* CONFIG_QUOTA_NETLINK_INTERFACE */

/*
 * Quota state for one device.  Every array is indexed by quota type and
 * has MAXQUOTAS entries.
 */
struct quota_info {
        unsigned int flags;                        /* Flags for diskquotas on this device */
        struct rw_semaphore dqio_sem;                /* Lock quota file while I/O in progress */
        struct inode *files[MAXQUOTAS];                /* inodes of quotafiles */
        struct mem_dqinfo info[MAXQUOTAS];        /* Information for each quota type */
        const struct quota_format_ops *ops[MAXQUOTAS];        /* Operations for each type */
};

/* Quota format registration; implemented outside this header. */
int register_quota_format(struct quota_format_type *fmt);
void unregister_quota_format(struct quota_format_type *fmt);

/* Pairs a quota format id with a module name string. */
struct quota_module_name {
        int qm_fmt_id;
        char *qm_mod_name;
};

/*
 * Initializer for an array of struct quota_module_name, terminated by a
 * {0, NULL} entry.  Both QFMT_VFS_V0 and QFMT_VFS_V1 map to "quota_v2".
 */
#define INIT_QUOTA_MODULE_NAMES {\
        {QFMT_VFS_OLD, "quota_v1"},\
        {QFMT_VFS_V0, "quota_v2"},\
        {QFMT_VFS_V1, "quota_v2"},\
        {0, NULL}}

#endif /* _LINUX_QUOTA_ */























































































































































































    1 























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
// SPDX-License-Identifier: GPL-2.0

#include "io_uring.h"
#include "napi.h"

#ifdef CONFIG_NET_RX_BUSY_POLL

/*
 * Timeout for cleanout of stale entries.  Added to jiffies to form
 * io_napi_entry.timeout.
 * NOTE(review): SEC_CONVERSION is defined elsewhere -- confirm that its unit
 * is appropriate for arithmetic on jiffies.
 */
#define NAPI_TIMEOUT                (60 * SEC_CONVERSION)

/* One tracked NAPI id; linked on both ctx->napi_list and ctx->napi_ht. */
struct io_napi_entry {
        unsigned int                napi_id;        /* NAPI id to busy poll */
        struct list_head        list;                /* link in ctx->napi_list */

        unsigned long                timeout;        /* compared to jiffies; stale once passed */
        struct hlist_node        node;                /* link in a ctx->napi_ht bucket */

        struct rcu_head                rcu;                /* for kfree_rcu() */
};

/*
 * Look up @napi_id in one hash bucket.
 *
 * On a hit the entry's timeout is pushed out to jiffies + NAPI_TIMEOUT and
 * the entry is returned; otherwise NULL.  The bucket is walked with the RCU
 * iterator; callers in this file hold either rcu_read_lock() or
 * ctx->napi_lock.
 */
static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
                                               unsigned int napi_id)
{
        struct io_napi_entry *entry;

        hlist_for_each_entry_rcu(entry, hash_list, node) {
                if (entry->napi_id == napi_id) {
                        entry->timeout = jiffies + NAPI_TIMEOUT;
                        return entry;
                }
        }

        return NULL;
}

/*
 * __io_napi_add() - Start tracking the NAPI id of a socket
 * @ctx: pointer to io-uring context structure
 * @sock: socket whose sk->sk_napi_id should be tracked
 *
 * Does nothing if the socket has no struct sock, if its NAPI id is below
 * MIN_NAPI_ID, or if the allocation fails.  If the id is already tracked,
 * only its timeout is refreshed.
 */
void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
{
        struct hlist_head *hash_list;
        unsigned int napi_id;
        struct sock *sk;
        struct io_napi_entry *e;

        sk = sock->sk;
        if (!sk)
                return;

        napi_id = READ_ONCE(sk->sk_napi_id);

        /* Non-NAPI IDs can be rejected. */
        if (napi_id < MIN_NAPI_ID)
                return;

        hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];

        /*
         * Lockless fast path.  io_napi_hash_find() already refreshes the
         * timeout of an entry it finds, so there is nothing more to do on
         * a hit.  Only the pointer value is tested after the unlock.
         */
        rcu_read_lock();
        e = io_napi_hash_find(hash_list, napi_id);
        rcu_read_unlock();
        if (e)
                return;

        /* GFP_NOWAIT: failing to track an id is tolerated, just bail out. */
        e = kmalloc(sizeof(*e), GFP_NOWAIT);
        if (!e)
                return;

        e->napi_id = napi_id;
        e->timeout = jiffies + NAPI_TIMEOUT;

        spin_lock(&ctx->napi_lock);
        /* Re-check under the lock: another task may have added the id. */
        if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
                spin_unlock(&ctx->napi_lock);
                kfree(e);
                return;
        }

        hlist_add_tail_rcu(&e->node, hash_list);
        /*
         * ctx->napi_list is walked with list_for_each_entry_rcu() under
         * rcu_read_lock() only, so the entry has to be published with the
         * RCU variant to order its initialisation before it becomes
         * reachable.
         */
        list_add_tail_rcu(&e->list, &ctx->napi_list);
        spin_unlock(&ctx->napi_lock);
}

/*
 * Drop every tracked entry whose timeout has passed.
 *
 * Runs under ctx->napi_lock.  Entries are unlinked from both the list and
 * the hash table and freed after an RCU grace period via kfree_rcu().
 */
static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
{
        struct io_napi_entry *e;
        unsigned int i;

        spin_lock(&ctx->napi_lock);
        hash_for_each(ctx->napi_ht, i, e, node) {
                if (time_after(jiffies, e->timeout)) {
                        /*
                         * ctx->napi_list is walked by
                         * __io_napi_do_busy_loop() under rcu_read_lock()
                         * only.  Plain list_del() would poison the forward
                         * pointer that such a walker may still follow, so
                         * the RCU variant is required here.
                         */
                        list_del_rcu(&e->list);
                        hash_del_rcu(&e->node);
                        kfree_rcu(e, rcu);
                }
        }
        spin_unlock(&ctx->napi_lock);
}

/* Run the stale-entry sweep only when the caller saw a stale entry. */
static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
{
        if (!is_stale)
                return;

        __io_napi_remove_stale(ctx);
}

/*
 * Tell whether the busy-poll budget is used up.
 *
 * Returns true when @bp_usec is 0, or when busy_loop_current_time() is past
 * @start_time + @bp_usec; false otherwise.
 */
static inline bool io_napi_busy_loop_timeout(unsigned long start_time,
                                             unsigned long bp_usec)
{
        unsigned long deadline;
        unsigned long current_time;

        /* A zero budget counts as already expired. */
        if (!bp_usec)
                return true;

        deadline = start_time + bp_usec;
        current_time = busy_loop_current_time();

        return time_after(current_time, deadline);
}

/*
 * Loop-end predicate for busy polling; @data is the struct io_wait_queue.
 *
 * Polling stops when a signal is pending, when the waiter should wake or the
 * ring has work, or when the busy-poll budget has run out.  The conditions
 * are evaluated in that order.
 */
static bool io_napi_busy_loop_should_end(void *data,
                                         unsigned long start_time)
{
        struct io_wait_queue *wq = data;

        return signal_pending(current) ||
               io_should_wake(wq) || io_has_work(wq->ctx) ||
               io_napi_busy_loop_timeout(start_time, wq->napi_busy_poll_to);
}

static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
                                   void *loop_end_arg)
{
        struct io_napi_entry *e;
        bool (*loop_end)(void *, unsigned long) = NULL;
        bool is_stale = false;

        if (loop_end_arg)
                loop_end = io_napi_busy_loop_should_end;

        list_for_each_entry_rcu(e, &ctx->napi_list, list) {
                napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
                                   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);

                if (time_after(jiffies, e->timeout))
                        is_stale = true;
        }

        return is_stale;
}

/*
 * Busy poll on behalf of a waiter, then sweep stale entries if the last
 * pass saw one.
 */
static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
                                       struct io_wait_queue *iowq)
{
        unsigned long start_time = busy_loop_current_time();
        void *end_arg = NULL;
        bool stale;

        /*
         * With exactly one tracked entry the loop-end check is delegated to
         * the napi loop itself and the outer loop runs a single pass.
         */
        if (list_is_singular(&ctx->napi_list))
                end_arg = iowq;

        rcu_read_lock();
        for (;;) {
                stale = __io_napi_do_busy_loop(ctx, end_arg);
                if (io_napi_busy_loop_should_end(iowq, start_time))
                        break;
                if (end_arg)
                        break;
        }
        rcu_read_unlock();

        io_napi_remove_stale(ctx, stale);
}

/*
 * io_napi_init() - Set up the napi state of a ring
 * @ctx: pointer to io-uring context structure
 *
 * Initialises the napi lock and list and sets the defaults: busy-poll
 * timeout taken from sysctl_net_busy_poll, prefer-busy-poll off.
 */
void io_napi_init(struct io_ring_ctx *ctx)
{
        spin_lock_init(&ctx->napi_lock);
        INIT_LIST_HEAD(&ctx->napi_list);

        ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
        ctx->napi_prefer_busy_poll = false;
}

/*
 * io_napi_free() - Deallocate napi
 * @ctx: pointer to io-uring context structure
 *
 * Unlink every tracked entry from both the napi list and the hash table in
 * the io-uring context and free it after an RCU grace period.
 */
void io_napi_free(struct io_ring_ctx *ctx)
{
        struct io_napi_entry *e;
        unsigned int i;

        spin_lock(&ctx->napi_lock);
        hash_for_each(ctx->napi_ht, i, e, node) {
                /*
                 * Entries are always linked on both structures (see
                 * __io_napi_add()), so take them off the list as well.
                 * Otherwise ctx->napi_list keeps pointing at memory that is
                 * about to be freed.
                 */
                list_del_rcu(&e->list);
                hash_del_rcu(&e->node);
                kfree_rcu(e, rcu);
        }
        spin_unlock(&ctx->napi_lock);
}

/*
 * io_register_napi() - Register napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: user pointer to an io_uring_napi structure
 *
 * Reads the requested busy poll timeout and prefer-busy-poll setting from
 * @arg, writes the settings that were in effect on entry back to @arg, then
 * applies the new settings and enables napi for the context.
 *
 * Return: 0 on success, -EFAULT if @arg cannot be read or written, -EINVAL
 * if any pad or reserved field of the request is non-zero.
 */
int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
        /* Snapshot of the settings in effect before this call. */
        const struct io_uring_napi prev = {
                .prefer_busy_poll = ctx->napi_prefer_busy_poll,
                .busy_poll_to     = ctx->napi_busy_poll_to,
        };
        struct io_uring_napi req;
        bool reserved_set;

        if (copy_from_user(&req, arg, sizeof(req)))
                return -EFAULT;

        reserved_set = req.pad[0] || req.pad[1] || req.pad[2] || req.resv;
        if (reserved_set)
                return -EINVAL;

        /* Report the old values first; nothing is changed if this fails. */
        if (copy_to_user(arg, &prev, sizeof(prev)))
                return -EFAULT;

        WRITE_ONCE(ctx->napi_busy_poll_to, req.busy_poll_to);
        WRITE_ONCE(ctx->napi_prefer_busy_poll, !!req.prefer_busy_poll);
        WRITE_ONCE(ctx->napi_enabled, true);

        return 0;
}

/*
 * io_unregister_napi() - Unregister napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: user pointer to an io_uring_napi structure, or NULL
 *
 * If @arg is non-NULL, the busy poll timeout and prefer-busy-poll setting
 * in effect on entry are copied to it. The context's timeout is then set to
 * zero, prefer-busy-poll is cleared and napi is disabled.
 *
 * Return: 0 on success, -EFAULT if @arg is given but cannot be written (in
 * which case the settings are left unchanged).
 */
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
{
        const struct io_uring_napi prev = {
                .prefer_busy_poll = ctx->napi_prefer_busy_poll,
                .busy_poll_to     = ctx->napi_busy_poll_to,
        };

        if (arg) {
                if (copy_to_user(arg, &prev, sizeof(prev)))
                        return -EFAULT;
        }

        WRITE_ONCE(ctx->napi_busy_poll_to, 0);
        WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
        WRITE_ONCE(ctx->napi_enabled, false);

        return 0;
}

/*
 * __io_napi_adjust_timeout() - adjust busy loop timeout
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 * @ts: pointer to timespec or NULL
 *
 * Adjust the busy loop timeout according to timespec and busy poll timeout.
 * If the specified NAPI timeout is bigger than the wait timeout, then adjust
 * the NAPI timeout accordingly.
 */
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
                              struct timespec64 *ts)
{
        unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to);

        if (ts) {
                struct timespec64 poll_to_ts;

                poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to);
                if (timespec64_compare(ts, &poll_to_ts) < 0) {
                        s64 poll_to_ns = timespec64_to_ns(ts);
                        if (poll_to_ns > 0) {
                                u64 val = poll_to_ns + 999;
                                do_div(val, (s64) 1000);
                                poll_to = val;
                        }
                }
        }

        iowq->napi_busy_poll_to = poll_to;
}

/*
 * __io_napi_busy_loop() - execute busy poll loop
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 *
 * Record the context's prefer-busy-poll setting in @iowq and, for rings
 * without IORING_SETUP_SQPOLL that have napi enabled, run the blocking busy
 * poll loop. SQPOLL rings are skipped here.
 */
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{
        iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);

        if (ctx->flags & IORING_SETUP_SQPOLL)
                return;

        /*
         * Marked read: napi_enabled is updated with WRITE_ONCE() by
         * io_register_napi() and io_unregister_napi().
         */
        if (READ_ONCE(ctx->napi_enabled))
                io_napi_blocking_busy_loop(ctx, iowq);
}

/*
 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
 * @ctx: pointer to io-uring context structure
 *
 * Run one busy-poll pass over the napi list of @ctx and hand the stale
 * indication of that pass to io_napi_remove_stale().
 *
 * Return: 0 if the busy poll timeout is zero or the napi list is empty,
 * 1 if a polling pass was executed.
 */
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
{
        bool is_stale;

        if (!READ_ONCE(ctx->napi_busy_poll_to))
                return 0;
        if (list_empty_careful(&ctx->napi_list))
                return 0;

        /* __io_napi_do_busy_loop() walks the list with RCU iterators. */
        rcu_read_lock();
        is_stale = __io_napi_do_busy_loop(ctx, NULL);
        rcu_read_unlock();

        io_napi_remove_stale(ctx, is_stale);
        return 1;
}

#endif




































   48 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/fault-inject.h>
#include <linux/fault-inject-usercopy.h>

/*
 * Fault-injection attributes consulted by should_fail_usercopy(). They are
 * configured through the "fail_usercopy=" boot parameter and, when
 * CONFIG_FAULT_INJECTION_DEBUG_FS is set, through the "fail_usercopy"
 * debugfs attribute directory.
 */
static struct {
        struct fault_attr attr;
} fail_usercopy = {
        .attr = FAULT_ATTR_INITIALIZER,
};

/*
 * Handler for the "fail_usercopy=" boot parameter: passes the option string
 * to setup_fault_attr() for the fail_usercopy attributes and returns its
 * result.
 */
static int __init setup_fail_usercopy(char *str)
{
        return setup_fault_attr(&fail_usercopy.attr, str);
}
__setup("fail_usercopy=", setup_fail_usercopy);

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/*
 * Create the "fail_usercopy" debugfs attribute directory for the fault
 * attributes. Returns 0 on success, or the error code carried by the
 * pointer that fault_create_debugfs_attr() returned.
 */
static int __init fail_usercopy_debugfs(void)
{
        struct dentry *dentry = fault_create_debugfs_attr("fail_usercopy", NULL,
                                                          &fail_usercopy.attr);

        return IS_ERR(dentry) ? PTR_ERR(dentry) : 0;
}

late_initcall(fail_usercopy_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

/*
 * should_fail_usercopy() - ask fault injection whether a user copy should fail
 *
 * Returns the result of should_fail() evaluated against the fail_usercopy
 * attributes, with 1 passed as its second argument.
 */
bool should_fail_usercopy(void)
{
        return should_fail(&fail_usercopy.attr, 1);
}
EXPORT_SYMBOL_GPL(should_fail_usercopy);










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 


    1 
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * User-space Probes (UProbes) for x86
 *
 * Copyright (C) IBM Corporation, 2008-2011
 * Authors:
 *        Srikar Dronamraju
 *        Jim Keniston
 */
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/uaccess.h>

#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/insn.h>
#include <asm/mmu_context.h>

/* Post-execution fixups (bit flags stored in auprobe->defparam.fixups). */

/* Adjust IP back to vicinity of actual insn */
#define UPROBE_FIX_IP                0x01

/* Adjust the return address of a call insn */
#define UPROBE_FIX_CALL                0x02

/* Instruction will modify TF, don't change it */
#define UPROBE_FIX_SETF                0x04

/*
 * Scratch register picked for a rewritten rip-relative insn: riprel_analyze()
 * ORs in exactly one of these (si, di or bx) and scratch_reg() maps the flag
 * back to the matching pt_regs field.
 */
#define UPROBE_FIX_RIP_SI        0x08
#define UPROBE_FIX_RIP_DI        0x10
#define UPROBE_FIX_RIP_BX        0x20
/* Any of the scratch-register flags: the insn was rewritten by riprel_analyze() */
#define UPROBE_FIX_RIP_MASK        \
        (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX)

#define        UPROBE_TRAP_NR                UINT_MAX

/* Adaptations for mhiramat x86 decoder v14. */
/* First, second and third opcode byte of a decoded insn */
#define OPCODE1(insn)                ((insn)->opcode.bytes[0])
#define OPCODE2(insn)                ((insn)->opcode.bytes[1])
#define OPCODE3(insn)                ((insn)->opcode.bytes[2])
/* The reg field of the decoded insn's modrm byte */
#define MODRM_REG(insn)                X86_MODRM_REG((insn)->modrm.value)

/*
 * Build one 16-opcode row of a 256-bit opcode bitmap. b0..bf are the 0/1
 * flags for opcodes row+0x0 .. row+0xf. The 16 bits are shifted left by
 * (row % 32), so an even row (0x00, 0x20, ...) lands in the low half and an
 * odd row (0x10, 0x30, ...) in the high half of a 32-bit word; the tables
 * below OR each such pair into one array element.
 */
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
        (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
          (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
          (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
          (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
         << (row % 32))

/*
 * Good-instruction tables for 32-bit apps.  This is non-const and volatile
 * to keep gcc from statically optimizing it out, as variable_test_bit makes
 * some versions of gcc to think only *(unsigned long*) is used.
 *
 * Opcodes we'll probably never support:
 * 6c-6f - ins,outs. SEGVs if used in userspace
 * e4-e7 - in,out imm. SEGVs if used in userspace
 * ec-ef - in,out acc. SEGVs if used in userspace
 * cc - int3. SIGTRAP if used in userspace
 * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs
 *        (why we support bound (62) then? it's similar, and similarly unused...)
 * f1 - int1. SIGTRAP if used in userspace
 * f4 - hlt. SEGVs if used in userspace
 * fa - cli. SEGVs if used in userspace
 * fb - sti. SEGVs if used in userspace
 *
 * Opcodes which need some work to be supported:
 * 07,17,1f - pop es/ss/ds
 *        Normally not used in userspace, but would execute if used.
 *        Can cause GP or stack exception if tries to load wrong segment descriptor.
 *        We hesitate to run them under single step since kernel's handling
 *        of userspace single-stepping (TF flag) is fragile.
 *        We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e)
 *        on the same grounds that they are never used.
 * cd - int N.
 *        Used by userspace for "int 80" syscall entry. (Other "int N"
 *        cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
 *        Not supported since kernel's handling of userspace single-stepping
 *        (TF flag) is fragile.
 * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
 */
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
 * Bitmap indexed by the first opcode byte; a set bit marks an opcode that
 * uprobe_init_insn() accepts for 32-bit code. Rows are joined alternately
 * with '|' and ',' on purpose: each pair of 16-bit W() rows forms one u32
 * element.
 */
static volatile u32 good_insns_32[256 / 32] = {
        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
        /*      ----------------------------------------------         */
        W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */
        W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
        W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
        W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
        W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
        W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
        W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
        W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
        W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
        W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
        W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
        W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
        W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
        W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
        W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
        W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
        /*      ----------------------------------------------         */
        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
};
#else
#define good_insns_32        NULL
#endif

/* Good-instruction tables for 64-bit apps.
 *
 * Genuinely invalid opcodes:
 * 06,07 - formerly push/pop es
 * 0e - formerly push cs
 * 16,17 - formerly push/pop ss
 * 1e,1f - formerly push/pop ds
 * 27,2f,37,3f - formerly daa/das/aaa/aas
 * 60,61 - formerly pusha/popa
 * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported)
 * 82 - formerly redundant encoding of Group1
 * 9a - formerly call seg:ofs
 * ce - formerly into
 * d4,d5 - formerly aam/aad
 * d6 - formerly undocumented salc
 * ea - formerly jmp seg:ofs
 *
 * Opcodes we'll probably never support:
 * 6c-6f - ins,outs. SEGVs if used in userspace
 * e4-e7 - in,out imm. SEGVs if used in userspace
 * ec-ef - in,out acc. SEGVs if used in userspace
 * cc - int3. SIGTRAP if used in userspace
 * f1 - int1. SIGTRAP if used in userspace
 * f4 - hlt. SEGVs if used in userspace
 * fa - cli. SEGVs if used in userspace
 * fb - sti. SEGVs if used in userspace
 *
 * Opcodes which need some work to be supported:
 * cd - int N.
 *        Used by userspace for "int 80" syscall entry. (Other "int N"
 *        cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
 *        Not supported since kernel's handling of userspace single-stepping
 *        (TF flag) is fragile.
 * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
 */
#if defined(CONFIG_X86_64)
/*
 * Bitmap indexed by the first opcode byte; a set bit marks an opcode that
 * uprobe_init_insn() accepts for 64-bit code. Rows are joined alternately
 * with '|' and ',' on purpose: each pair of 16-bit W() rows forms one u32
 * element.
 */
static volatile u32 good_insns_64[256 / 32] = {
        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
        /*      ----------------------------------------------         */
        W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */
        W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
        W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */
        W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */
        W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
        W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
        W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
        W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
        W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
        W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */
        W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
        W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
        W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
        W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
        W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */
        W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
        /*      ----------------------------------------------         */
        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
};
#else
#define good_insns_64        NULL
#endif

/* Using this for both 64-bit and 32-bit apps.
 * Opcodes we don't support:
 * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns
 * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group.
 *        Also encodes tons of other system insns if mod=11.
 *        Some are in fact non-system: xend, xtest, rdtscp, maybe more
 * 0f 05 - syscall
 * 0f 06 - clts (CPL0 insn)
 * 0f 07 - sysret
 * 0f 08 - invd (CPL0 insn)
 * 0f 09 - wbinvd (CPL0 insn)
 * 0f 0b - ud2
 * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?)
 * 0f 34 - sysenter
 * 0f 35 - sysexit
 * 0f 37 - getsec
 * 0f 78 - vmread (Intel VMX. CPL0 insn)
 * 0f 79 - vmwrite (Intel VMX. CPL0 insn)
 *        Note: with prefixes, these two opcodes are
 *        extrq/insertq/AVX512 convert vector ops.
 * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt],
 *        {rd,wr}{fs,gs}base,{s,l,m}fence.
 *        Why? They are all user-executable.
 */
/*
 * Bitmap indexed by the second opcode byte (OPCODE2) of a two-byte opcode;
 * uprobe_init_insn() consults it when insn->opcode.nbytes == 2. Rows are
 * joined alternately with '|' and ',' on purpose: each pair of 16-bit W()
 * rows forms one u32 element.
 */
static volatile u32 good_2byte_insns[256 / 32] = {
        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
        /*      ----------------------------------------------         */
        W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */
        W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
        W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
        W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
        W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
        W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
        W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
        W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */
        W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
        W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
        W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
        W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
        W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
        W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
        W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
        W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)   /* f0 */
        /*      ----------------------------------------------         */
        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
};
#undef W

/*
 * opcodes we may need to refine support for:
 *
 *  0f - 2-byte instructions: For many of these instructions, the validity
 *  depends on the prefix and/or the reg field.  On such instructions, we
 *  just consider the opcode combination valid if it corresponds to any
 *  valid instruction.
 *
 *  8f - Group 1 - only reg = 0 is OK
 *  c6-c7 - Group 11 - only reg = 0 is OK
 *  d9-df - fpu insns with some illegal encodings
 *  f2, f3 - repnz, repz prefixes.  These are also the first byte for
 *  certain floating-point instructions, such as addsd.
 *
 *  fe - Group 4 - only reg = 0 or 1 is OK
 *  ff - Group 5 - only reg = 0-6 is OK
 *
 * others -- Do we need to support these?
 *
 *  0f - (floating-point?) prefetch instructions
 *  07, 17, 1f - pop es, pop ss, pop ds
 *  26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
 *        but 64 and 65 (fs: and gs:) seem to be used, so we support them
 *  67 - addr16 prefix
 *  ce - into
 *  f0 - lock prefix
 */

/*
 * TODO:
 * - Where necessary, examine the modrm byte and allow only valid instructions
 * in the different Groups and fpu instructions.
 */

/*
 * Return true if the decoded insn carries an es:, cs:, ds: or ss: segment
 * override or a lock prefix; uprobe_init_insn() rejects such instructions.
 */
static bool is_prefix_bad(struct insn *insn)
{
        insn_byte_t byte;
        int idx;

        for_each_insn_prefix(insn, idx, byte) {
                insn_attr_t attr = inat_get_opcode_attribute(byte);

                if (attr == INAT_MAKE_PREFIX(INAT_PFX_ES) ||
                    attr == INAT_MAKE_PREFIX(INAT_PFX_CS) ||
                    attr == INAT_MAKE_PREFIX(INAT_PFX_DS) ||
                    attr == INAT_MAKE_PREFIX(INAT_PFX_SS) ||
                    attr == INAT_MAKE_PREFIX(INAT_PFX_LOCK))
                        return true;
        }

        return false;
}

/*
 * Decode the instruction bytes stored in @auprobe into @insn and decide
 * whether the instruction may be probed.
 *
 * @x86_64 selects both the decoder mode and the one-byte opcode table.
 *
 * Returns 0 if the instruction is acceptable, -ENOEXEC if it cannot be
 * decoded, and -ENOTSUPP if it has a rejected prefix, is an exception
 * masking instruction, or its opcode is not marked good in the tables.
 *
 * NOTE(review): both one-byte tables mark opcode 0x0f as good, so for
 * two-byte opcodes the first-byte test appears to succeed before
 * good_2byte_insns is ever consulted - confirm whether that is intended.
 */
static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
{
        enum insn_mode mode = x86_64 ? INSN_MODE_64 : INSN_MODE_32;
        u32 volatile *good_insns = x86_64 ? good_insns_64 : good_insns_32;

        if (insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), mode) < 0)
                return -ENOEXEC;

        /*
         * Refuse bad prefixes, and exception masking instructions, which
         * we should not singlestep on.
         */
        if (is_prefix_bad(insn) || insn_masking_exception(insn))
                return -ENOTSUPP;

        if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))
                return 0;

        if (insn->opcode.nbytes == 2 &&
            test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
                return 0;

        return -ENOTSUPP;
}

#ifdef CONFIG_X86_64
/*
 * riprel_analyze() - rewrite a rip-relative insn to use a scratch register
 * @auprobe: probe whose copied instruction bytes (auprobe->insn) are patched
 * @insn: decoded form of that instruction
 *
 * If arch_uprobe->insn doesn't use rip-relative addressing, return
 * immediately.  Otherwise, rewrite the instruction so that it accesses
 * its memory operand indirectly through a scratch register.  Set
 * auprobe->defparam.fixups accordingly. (The contents of the scratch register
 * will be saved before we single-step the modified instruction,
 * and restored afterward).
 *
 * We do this because a rip-relative instruction can access only a
 * relatively small area (+/- 2 GB from the instruction), and the XOL
 * area typically lies beyond that area.  At least for instructions
 * that store to memory, we can't execute the original instruction
 * and "fix things up" later, because the misdirected store could be
 * disastrous.
 *
 * Some useful facts about rip-relative instructions:
 *
 *  - There's always a modrm byte with bit layout "00 reg 101".
 *  - There's never a SIB byte.
 *  - The displacement is always 4 bytes.
 *  - REX.B=1 bit in REX prefix, which normally extends r/m field,
 *    has no effect on rip-relative mode. It doesn't make modrm byte
 *    with r/m=101 refer to register 1101 = R13.
 */
static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
{
        u8 *cursor;
        u8 reg;
        u8 reg2;

        if (!insn_rip_relative(insn))
                return;

        /*
         * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.
         * Clear REX.b bit (extension of MODRM.rm field):
         * we want to encode low numbered reg, not r8+.
         */
        if (insn->rex_prefix.nbytes) {
                cursor = auprobe->insn + insn_offset_rex_prefix(insn);
                /* REX byte has 0100wrxb layout, clearing REX.b bit */
                *cursor &= 0xfe;
        }
        /*
         * Similar treatment for VEX3/EVEX prefix.
         * TODO: add XOP treatment when insn decoder supports them
         */
        if (insn->vex_prefix.nbytes >= 3) {
                /*
                 * vex2:     c5    rvvvvLpp   (has no b bit)
                 * vex3/xop: c4/8f rxbmmmmm wvvvvLpp
                 * evex:     62    rxbR00mm wvvvv1pp zllBVaaa
                 * Setting VEX3.b (setting because it has inverted meaning).
                 * Setting EVEX.x since (in non-SIB encoding) EVEX.x
                 * is the 4th bit of MODRM.rm, and needs the same treatment.
                 * For VEX3-encoded insns, VEX3.x value has no effect in
                 * non-SIB encoding, the change is superfluous but harmless.
                 */
                /* +1 skips the escape byte; 0x60 sets the x and b bits */
                cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
                *cursor |= 0x60;
        }

        /*
         * Convert from rip-relative addressing to register-relative addressing
         * via a scratch register.
         *
         * This is tricky since there are insns with modrm byte
         * which also use registers not encoded in modrm byte:
         * [i]div/[i]mul: implicitly use dx:ax
         * shift ops: implicitly use cx
         * cmpxchg: implicitly uses ax
         * cmpxchg8/16b: implicitly uses dx:ax and bx:cx
         *   Encoding: 0f c7/1 modrm
         *   The code below thinks that reg=1 (cx), chooses si as scratch.
         * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m.
         *   First appeared in Haswell (BMI2 insn). It is vex-encoded.
         *   Example where none of bx,cx,dx can be used as scratch reg:
         *   c4 e2 63 f6 0d disp32   mulx disp32(%rip),%ebx,%ecx
         * [v]pcmpistri: implicitly uses cx, xmm0
         * [v]pcmpistrm: implicitly uses xmm0
         * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0
         * [v]pcmpestrm: implicitly uses ax, dx, xmm0
         *   Evil SSE4.2 string comparison ops from hell.
         * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination.
         *   Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm.
         *   Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi).
         *   AMD says it has no 3-operand form (vex.vvvv must be 1111)
         *   and that it can have only register operands, not mem
         *   (its modrm byte must have mode=11).
         *   If these restrictions will ever be lifted,
         *   we'll need code to prevent selection of di as scratch reg!
         *
         * Summary: I don't know any insns with modrm byte which
         * use SI register implicitly. DI register is used only
         * by one insn (maskmovq) and BX register is used
         * only by one too (cmpxchg8b).
         * BP is stack-segment based (may be a problem?).
         * AX, DX, CX are off-limits (many implicit users).
         * SP is unusable (it's stack pointer - think about "pop mem";
         * also, rsp+disp32 needs sib encoding -> insn length change).
         */

        reg = MODRM_REG(insn);        /* Fetch modrm.reg */
        /*
         * Fetch vex.vvvv. Without a VEX prefix the 0xff default decodes to
         * register 0 below, which never collides with si (6) or di (7).
         */
        reg2 = 0xff;
        if (insn->vex_prefix.nbytes)
                reg2 = insn->vex_prefix.bytes[2];
        /*
         * TODO: add XOP vvvv reading.
         *
         * vex.vvvv field is in bits 6-3, bits are inverted.
         * But in 32-bit mode, high-order bit may be ignored.
         * Therefore, let's consider only 3 low-order bits.
         */
        reg2 = ((reg2 >> 3) & 0x7) ^ 0x7;
        /*
         * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15.
         *
         * Choose scratch reg. Order is important: must not select bx
         * if we can use si (cmpxchg8b case!)
         */
        /* From here on reg2 holds the chosen scratch register number. */
        if (reg != 6 && reg2 != 6) {
                reg2 = 6;
                auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI;
        } else if (reg != 7 && reg2 != 7) {
                reg2 = 7;
                auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI;
                /* TODO (paranoia): force maskmovq to not use di */
        } else {
                reg2 = 3;
                auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX;
        }
        /*
         * Point cursor at the modrm byte.  The next 4 bytes are the
         * displacement.  Beyond the displacement, for some instructions,
         * is the immediate operand.
         */
        cursor = auprobe->insn + insn_offset_modrm(insn);
        /*
         * Change modrm from "00 reg 101" to "10 reg reg2". Example:
         * 89 05 disp32  mov %eax,disp32(%rip) becomes
         * 89 86 disp32  mov %eax,disp32(%rsi)
         */
        *cursor = 0x80 | (reg << 3) | reg2;
}

/*
 * Map the UPROBE_FIX_RIP_* flag recorded in auprobe->defparam.fixups to the
 * matching register slot in @regs: si, then di, with bx as the fallback.
 */
static inline unsigned long *
scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        unsigned long *slot = &regs->bx;

        if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI)
                slot = &regs->si;
        else if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI)
                slot = &regs->di;

        return slot;
}

/*
 * Before single-stepping a rewritten rip-relative instruction: stash the
 * scratch register's current value in the task's uprobe state and load it
 * with the address following the original instruction (probe address plus
 * instruction length). Does nothing for instructions that were not rewritten.
 */
static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        struct uprobe_task *utask;
        unsigned long *sr;

        if (!(auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK))
                return;

        utask = current->utask;
        sr = scratch_reg(auprobe, regs);

        utask->autask.saved_scratch_register = *sr;
        *sr = utask->vaddr + auprobe->defparam.ilen;
}

/*
 * After single-stepping a rewritten rip-relative instruction: put back the
 * scratch register value that riprel_pre_xol() saved. Does nothing for
 * instructions that were not rewritten.
 */
static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        struct uprobe_task *utask;
        unsigned long *sr;

        if (!(auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK))
                return;

        utask = current->utask;
        sr = scratch_reg(auprobe, regs);

        *sr = utask->autask.saved_scratch_register;
}
#else /* 32-bit: */
/*
 * No RIP-relative addressing on 32-bit
 */
/* 32-bit stub: there is nothing to rewrite, so analysis is a no-op. */
static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
{
}
/* 32-bit stub: no scratch register to set up before the out-of-line step. */
static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
/* 32-bit stub: no scratch register to restore after the out-of-line step. */
static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
#endif /* CONFIG_X86_64 */

/*
 * Per-instruction-class callbacks. Callers in this file test each pointer
 * before using it, so any of them may be left NULL.
 */
struct uprobe_xol_ops {
        /*
         * Called via __skip_sstep(). The implementations in this file return
         * true after performing the instruction's effect directly on regs.
         */
        bool        (*emulate)(struct arch_uprobe *, struct pt_regs *);
        /* Called from arch_uprobe_pre_xol(); a non-zero result is returned as-is. */
        int        (*pre_xol)(struct arch_uprobe *, struct pt_regs *);
        /*
         * Called from arch_uprobe_post_xol(). On any non-zero result ->ip is
         * reset to the probed address; -ERESTART is then reported as 0.
         */
        int        (*post_xol)(struct arch_uprobe *, struct pt_regs *);
        /* Called from arch_uprobe_abort_xol() before ip/trap_nr/TF are restored. */
        void        (*abort)(struct arch_uprobe *, struct pt_regs *);
};

/*
 * Size in bytes of a "long" for the user context described by @regs:
 * 8 when the registers indicate 64-bit user mode, 4 otherwise.
 */
static inline int sizeof_long(struct pt_regs *regs)
{
        /*
         * The mode is taken from the saved registers because the
         * in_xxx_syscall() helpers do not apply here.
         */
        if (user_64bit_mode(regs))
                return 8;

        return 4;
}

/*
 * ->pre_xol() hook of default_xol_ops: lets riprel_pre_xol() set up the
 * scratch register when the instruction needs it. Always returns 0.
 */
static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        riprel_pre_xol(auprobe, regs);
        return 0;
}

/*
 * Emulate pushing @val onto the user stack described by @regs.
 *
 * The slot width comes from sizeof_long(). Returns 0 on success; if the
 * store to user memory fails, returns -EFAULT and leaves regs->sp unchanged.
 */
static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
{
        int width = sizeof_long(regs);
        unsigned long slot = regs->sp - width;

        if (copy_to_user((void __user *)slot, &val, width))
                return -EFAULT;

        /* Only commit the new stack pointer once the store has succeeded. */
        regs->sp = slot;
        return 0;
}

/*
 * We have to fix things up as follows:
 *
 * Typically, the new ip is relative to the copied instruction.  We need
 * to make it relative to the original instruction (FIX_IP).  Exceptions
 * are return instructions and absolute or indirect jump or call instructions.
 *
 * If the single-stepped instruction was a call, the return address that
 * is atop the stack is the address following the copied instruction.  We
 * need to make it the address following the original instruction (FIX_CALL).
 *
 * If the original instruction was a rip-relative instruction such as
 * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
 * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)".
 * We need to restore the contents of the scratch register
 * (FIX_RIP_reg).
 */
static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        struct uprobe_task *utask = current->utask;

        /* FIX_RIP_reg: give the scratch register its original contents back. */
        riprel_post_xol(auprobe, regs);

        if (auprobe->defparam.fixups & UPROBE_FIX_IP) {
                /* FIX_IP: rebase ip from the out-of-line copy to the probed address. */
                long delta = utask->vaddr - utask->xol_vaddr;

                regs->ip += delta;
        } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
                unsigned long ret_addr = utask->vaddr + auprobe->defparam.ilen;

                /* FIX_CALL: pop the incorrect return address ... */
                regs->sp += sizeof_long(regs);
                /* ... and push the address following the original instruction. */
                if (emulate_push_stack(regs, ret_addr))
                        return -ERESTART;
        }

        /* popf; tell the caller to not touch TF */
        if (auprobe->defparam.fixups & UPROBE_FIX_SETF)
                utask->autask.saved_tf = true;

        return 0;
}

/*
 * ->abort() hook of default_xol_ops: the out-of-line step is being abandoned,
 * so only the scratch register (if one was borrowed) needs to be restored.
 */
static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        riprel_post_xol(auprobe, regs);
}

/*
 * Ops installed by arch_uprobe_analyze_insn() when neither the branch nor the
 * push setup claims the instruction. There is no ->emulate() hook, so
 * __skip_sstep() returns false for these probes.
 */
static const struct uprobe_xol_ops default_xol_ops = {
        .pre_xol  = default_pre_xol_op,
        .post_xol = default_post_xol_op,
        .abort          = default_abort_op,
};

/*
 * Returns true if the branch recorded in @auprobe is a relative call
 * (opcode 0xe8, see branch_setup_xol_ops()).
 */
static bool branch_is_call(struct arch_uprobe *auprobe)
{
        return auprobe->branch.opc1 == 0xe8;
}

/*
 * Table of conditional-jump opcodes and the flag expression each pair tests.
 * In every COND(op_y, op_n, expr) row, op_y is the opcode whose condition is
 * "expr is true" and op_n the opcode whose condition is "expr is false".
 *
 * A user defines DO() to say what to do with the condition, expands
 * CASE_COND inside a switch, and then #undefs DO() again.
 */
#define CASE_COND                                        \
        COND(70, 71, XF(OF))                                \
        COND(72, 73, XF(CF))                                \
        COND(74, 75, XF(ZF))                                \
        COND(78, 79, XF(SF))                                \
        COND(7a, 7b, XF(PF))                                \
        COND(76, 77, XF(CF) || XF(ZF))                        \
        COND(7c, 7d, XF(SF) != XF(OF))                        \
        COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF))

/* Expands one table row into two "case" labels, one per opcode of the pair. */
#define COND(op_y, op_n, expr)                                \
        case 0x ## op_y: DO((expr) != 0)                \
        case 0x ## op_n: DO((expr) == 0)

/* 1 if the named X86_EFLAGS_* bit is set in the local variable "flags", else 0. */
#define XF(xf)        (!!(flags & X86_EFLAGS_ ## xf))

/* Returns true if @opcode is one of the opcodes listed in CASE_COND. */
static bool is_cond_jmp_opcode(u8 opcode)
{
        switch (opcode) {
        /* The condition itself is irrelevant here; every listed case matches. */
        #define DO(expr)        \
                return true;
        CASE_COND
        #undef        DO

        default:
                return false;
        }
}

/*
 * Evaluate the condition of the branch recorded in branch.opc1 against
 * regs->flags. Returns the value of that condition; opcodes that are not
 * listed in CASE_COND are treated as "condition met".
 */
static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        /* Referenced by name from the XF() macro. */
        unsigned long flags = regs->flags;

        switch (auprobe->branch.opc1) {
        #define DO(expr)        \
                return expr;
        CASE_COND
        #undef        DO

        default:        /* not a conditional jmp */
                return true;
        }
}

/* The table helpers are private to the two functions above. */
#undef        XF
#undef        COND
#undef        CASE_COND

/*
 * ->emulate() hook of branch_xol_ops: carry out the probed relative jmp,
 * call or conditional jmp by updating @regs directly.
 *
 * regs->ip is advanced past the instruction before anything else, also on
 * the failure path. Returns true once emulation is complete; returns false
 * only when the return address of a call could not be pushed.
 */
static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        unsigned long next_ip, disp;

        regs->ip += auprobe->branch.ilen;
        next_ip = regs->ip;
        disp = (long)auprobe->branch.offs;

        if (branch_is_call(auprobe)) {
                /*
                 * If the push fails, this (mangled, see the comment in
                 * branch_clear_offset) insn is executed out-of-line
                 * instead. In the likely case that triggers the trap, and
                 * the probed application either dies or restarts the same
                 * insn after it handles the signal, so
                 * arch_uprobe_post_xol() is not even called.
                 *
                 * There is one corner case, see the comment in
                 * ->post_xol().
                 */
                if (emulate_push_stack(regs, next_ip))
                        return false;
        } else if (!check_jmp_cond(auprobe, regs)) {
                /* Condition not met: continue with the next instruction. */
                disp = 0;
        }

        regs->ip = next_ip + disp;
        return true;
}

/*
 * ->emulate() hook of push_xol_ops: push the register chosen at analysis
 * time (push.reg_offset is its byte offset inside struct pt_regs) and step
 * over the instruction. Returns false, without touching regs->ip, if the
 * push fails.
 */
static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        const char *base = (const char *)regs;
        unsigned long val = *(const unsigned long *)(base + auprobe->push.reg_offset);

        if (emulate_push_stack(regs, val))
                return false;

        regs->ip += auprobe->push.ilen;
        return true;
}

/*
 * ->post_xol() hook of branch_xol_ops. Must only be reached for a relative
 * call (anything else hits the BUG_ON). Undoes the push done by the
 * out-of-line call and always returns -ERESTART.
 */
static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        BUG_ON(!branch_is_call(auprobe));
        /*
         * We can only get here if branch_emulate_op() failed to push the ret
         * address _and_ another thread expanded our stack before the (mangled)
         * "call" insn was executed out-of-line. Just restore ->sp and restart.
         * We could also restore ->ip and try to call branch_emulate_op() again.
         */
        regs->sp += sizeof_long(regs);
        return -ERESTART;
}

/*
 * Zero the immediate (the branch displacement) inside the saved copy of the
 * instruction in auprobe->insn. Used for relative calls only, see
 * branch_setup_xol_ops().
 */
static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
{
        /*
         * Turn this insn into "call 1f; 1:", this is what we will execute
         * out-of-line if ->emulate() fails. We only need this to generate
         * a trap, so that the probed task receives the correct signal with
         * the properly filled siginfo.
         *
         * But see the comment in ->post_xol(), in the unlikely case it can
         * succeed. So we need to ensure that the new ->ip can not fall into
         * the non-canonical area and trigger #GP.
         *
         * We could turn it into (say) "pushf", but then we would need to
         * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte
         * of ->insn[] for set_orig_insn().
         */
        memset(auprobe->insn + insn_offset_immediate(insn),
                0, insn->immediate.nbytes);
}

/*
 * Ops for branches accepted by branch_setup_xol_ops(): they are emulated,
 * and ->post_xol() only matters when emulating a call failed.
 */
static const struct uprobe_xol_ops branch_xol_ops = {
        .emulate  = branch_emulate_op,
        .post_xol = branch_post_xol_op,
};

/* Ops for the register pushes accepted by push_setup_xol_ops(): emulation only. */
static const struct uprobe_xol_ops push_xol_ops = {
        .emulate  = push_emulate_op,
};

/*
 * Decide whether @insn is a branch that branch_xol_ops can emulate and, if
 * so, record its opcode, length and displacement in auprobe->branch.
 *
 * Returns 0 on success, -ENOSYS if branch_xol_ops doesn't handle this insn,
 * or -ENOTSUPP for a branch carrying a 0x66 prefix.
 */
static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
        u8 opc1 = OPCODE1(insn);
        insn_byte_t p;
        int i;

        switch (opc1) {
        case 0xeb:        /* jmp 8 */
        case 0xe9:        /* jmp 32 */
                break;
        case 0x90:        /* prefix* + nop; same as jmp with .offs = 0 */
                /* Skips the 0x66 prefix check below. */
                goto setup;

        case 0xe8:        /* call relative */
                /* Neutralize the displacement in the copy executed out-of-line. */
                branch_clear_offset(auprobe, insn);
                break;

        case 0x0f:
                if (insn->opcode.nbytes != 2)
                        return -ENOSYS;
                /*
                 * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches
                 * OPCODE1() of the "short" jmp which checks the same condition.
                 */
                opc1 = OPCODE2(insn) - 0x10;
                fallthrough;
        default:
                /* Anything that is not a known conditional jmp is not ours. */
                if (!is_cond_jmp_opcode(opc1))
                        return -ENOSYS;
        }

        /*
         * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported.
         * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
         * No one uses these insns, reject any branch insns with such prefix.
         */
        for_each_insn_prefix(insn, i, p) {
                if (p == 0x66)
                        return -ENOTSUPP;
        }

setup:
        /* For a near conditional jmp, opc1 holds the equivalent short-form opcode. */
        auprobe->branch.opc1 = opc1;
        auprobe->branch.ilen = insn->length;
        auprobe->branch.offs = insn->immediate.value;

        auprobe->ops = &branch_xol_ops;
        return 0;
}

/*
 * Set up emulation of a one-register push: opcodes 0x50..0x57, either bare
 * or (64-bit kernels only) preceded by a single 0x41 REX prefix.
 *
 * On success stores the pushed register's offset within struct pt_regs and
 * the instruction length in auprobe->push, installs push_xol_ops and
 * returns 0. Returns -ENOSYS if push_xol_ops doesn't handle this insn.
 */
static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
        /* pt_regs offsets indexed by (opcode - 0x50), no prefix */
        static const u8 plain_regs[] = {
                offsetof(struct pt_regs, ax),
                offsetof(struct pt_regs, cx),
                offsetof(struct pt_regs, dx),
                offsetof(struct pt_regs, bx),
                offsetof(struct pt_regs, sp),
                offsetof(struct pt_regs, bp),
                offsetof(struct pt_regs, si),
                offsetof(struct pt_regs, di),
        };
#ifdef CONFIG_X86_64
        /* pt_regs offsets indexed by (opcode - 0x50), with rex_prefix 0x41 */
        static const u8 rex_regs[] = {
                offsetof(struct pt_regs, r8),
                offsetof(struct pt_regs, r9),
                offsetof(struct pt_regs, r10),
                offsetof(struct pt_regs, r11),
                offsetof(struct pt_regs, r12),
                offsetof(struct pt_regs, r13),
                offsetof(struct pt_regs, r14),
                offsetof(struct pt_regs, r15),
        };
#endif
        const u8 *table = plain_regs;
        u8 opc1 = OPCODE1(insn);

        if (opc1 < 0x50 || opc1 > 0x57)
                return -ENOSYS;

        if (insn->length > 2)
                return -ENOSYS;

        if (insn->length == 2) {
#ifdef CONFIG_X86_64
                /* only support rex_prefix 0x41 (x64 only) */
                if (insn->rex_prefix.nbytes != 1 ||
                    insn->rex_prefix.bytes[0] != 0x41)
                        return -ENOSYS;

                table = rex_regs;
#else
                return -ENOSYS;
#endif
        }

        auprobe->push.reg_offset = table[opc1 - 0x50];
        auprobe->push.ilen = insn->length;
        auprobe->ops = &push_xol_ops;
        return 0;
}

/**
 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
 * @auprobe: the probepoint information.
 * @mm: the probed address space.
 * @addr: virtual address at which to install the probepoint
 * Return 0 on success or a -ve number on error.
 */
int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
{
        struct insn insn;
        /* Unless a case below overrides it, ip gets rebased after the step. */
        u8 fix_ip_or_call = UPROBE_FIX_IP;
        int ret;

        ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
        if (ret)
                return ret;

        /*
         * Try the emulating ops first. Only -ENOSYS means "not mine, keep
         * looking"; success and every other error are returned as they are.
         */
        ret = branch_setup_xol_ops(auprobe, &insn);
        if (ret != -ENOSYS)
                return ret;

        ret = push_setup_xol_ops(auprobe, &insn);
        if (ret != -ENOSYS)
                return ret;

        /*
         * Figure out which fixups default_post_xol_op() will need to perform,
         * and annotate defparam->fixups accordingly.
         */
        switch (OPCODE1(&insn)) {
        case 0x9d:                /* popf */
                auprobe->defparam.fixups |= UPROBE_FIX_SETF;
                break;
        case 0xc3:                /* ret or lret -- ip is correct */
        case 0xcb:
        case 0xc2:
        case 0xca:
        case 0xea:                /* jmp absolute -- ip is correct */
                fix_ip_or_call = 0;
                break;
        case 0x9a:                /* call absolute - Fix return addr, not ip */
                fix_ip_or_call = UPROBE_FIX_CALL;
                break;
        case 0xff:
                /* Other MODRM_REG values keep the default UPROBE_FIX_IP. */
                switch (MODRM_REG(&insn)) {
                case 2: case 3:                        /* call or lcall, indirect */
                        fix_ip_or_call = UPROBE_FIX_CALL;
                        break;
                case 4: case 5:                        /* jmp or ljmp, indirect */
                        fix_ip_or_call = 0;
                        break;
                }
                fallthrough;
        default:
                /* Only reached for 0xff and opcodes without a case above. */
                riprel_analyze(auprobe, &insn);
        }

        auprobe->defparam.ilen = insn.length;
        auprobe->defparam.fixups |= fix_ip_or_call;

        auprobe->ops = &default_xol_ops;
        return 0;
}

/*
 * arch_uprobe_pre_xol - prepare to execute out of line.
 * @auprobe: the probepoint information.
 * @regs: reflects the saved user state of current task.
 */
int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        struct uprobe_task *utask = current->utask;

        /* Let the per-instruction ops prepare first; give up on any error. */
        if (auprobe->ops->pre_xol) {
                int err = auprobe->ops->pre_xol(auprobe, regs);
                if (err)
                        return err;
        }

        /* Continue user execution at the out-of-line slot. */
        regs->ip = utask->xol_vaddr;
        /*
         * Save the task's trap_nr and replace it with the UPROBE_TRAP_NR
         * marker that arch_uprobe_xol_was_trapped() later checks for.
         */
        utask->autask.saved_trap_nr = current->thread.trap_nr;
        current->thread.trap_nr = UPROBE_TRAP_NR;

        /* Remember whether TF was already set, then force it on. */
        utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF);
        regs->flags |= X86_EFLAGS_TF;
        /* Switch block-stepping off for this task if it was enabled. */
        if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
                set_task_blockstep(current, false);

        return 0;
}

/*
 * If xol insn itself traps and generates a signal (say,
 * SIGILL/SIGSEGV/etc.), then detect the case where a singlestepped
 * instruction jumps back to its own address. It is assumed that anything
 * like do_page_fault/do_trap/etc. sets thread.trap_nr != -1.
 *
 * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
 * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
 * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol().
 */
bool arch_uprobe_xol_was_trapped(struct task_struct *t)
{
        /* Any value other than the marker means something overwrote trap_nr. */
        return t->thread.trap_nr != UPROBE_TRAP_NR;
}

/*
 * Called after single-stepping. To avoid the SMP problems that can
 * occur when we temporarily put back the original opcode to
 * single-step, we single-stepped a copy of the instruction.
 *
 * This function prepares to resume execution after the single-step.
 */
int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        struct uprobe_task *utask = current->utask;
        /*
         * Sampled before ->post_xol() runs: that hook may set saved_tf itself
         * (see UPROBE_FIX_SETF in default_post_xol_op()).
         */
        bool send_sigtrap = utask->autask.saved_tf;
        int err = 0;

        /* trap_nr is expected to still hold the marker from arch_uprobe_pre_xol(). */
        WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
        current->thread.trap_nr = utask->autask.saved_trap_nr;

        if (auprobe->ops->post_xol) {
                err = auprobe->ops->post_xol(auprobe, regs);
                if (err) {
                        /*
                         * Restore ->ip for restart or post mortem analysis.
                         * ->post_xol() must not return -ERESTART unless this
                         * is really possible.
                         */
                        regs->ip = utask->vaddr;
                        /* -ERESTART is not reported as a failure to the caller. */
                        if (err == -ERESTART)
                                err = 0;
                        send_sigtrap = false;
                }
        }
        /*
         * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
         * so we can get an extra SIGTRAP if we do not clear TF. We need
         * to examine the opcode to make it right.
         */
        if (send_sigtrap)
                send_sig(SIGTRAP, current, 0);

        /* Clear TF unless saved_tf says it has to stay set. */
        if (!utask->autask.saved_tf)
                regs->flags &= ~X86_EFLAGS_TF;

        return err;
}

/* callback routine for handling exceptions. */
int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
        struct die_args *args = data;
        struct pt_regs *regs = args->regs;
        int ret = NOTIFY_DONE;

        /* We are only interested in userspace traps */
        if (regs && !user_mode(regs))
                return NOTIFY_DONE;

        switch (val) {
        case DIE_INT3:
                if (uprobe_pre_sstep_notifier(regs))
                        ret = NOTIFY_STOP;

                break;

        case DIE_DEBUG:
                if (uprobe_post_sstep_notifier(regs))
                        ret = NOTIFY_STOP;

                break;

        default:
                break;
        }

        return ret;
}

/*
 * This function gets called when XOL instruction either gets trapped or
 * the thread has a fatal signal. Reset the instruction pointer to its
 * probed address for the potential restart or for post mortem analysis.
 */
void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        struct uprobe_task *utask = current->utask;

        if (auprobe->ops->abort)
                auprobe->ops->abort(auprobe, regs);

        current->thread.trap_nr = utask->autask.saved_trap_nr;
        regs->ip = utask->vaddr;
        /* clear TF if it was set by us in arch_uprobe_pre_xol() */
        if (!utask->autask.saved_tf)
                regs->flags &= ~X86_EFLAGS_TF;
}

/*
 * Run the ->emulate() hook when the probe's ops provide one and return its
 * result; without such a hook nothing is emulated and false is returned.
 */
static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        if (!auprobe->ops->emulate)
                return false;

        return auprobe->ops->emulate(auprobe, regs);
}

bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
        bool ret = __skip_sstep(auprobe, regs);
        if (ret && (regs->flags & X86_EFLAGS_TF))
                send_sig(SIGTRAP, current, 0);
        return ret;
}

unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
{
        int rasize = sizeof_long(regs), nleft;
        unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */

        if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
                return -1;

        /* check whether address has been already hijacked */
        if (orig_ret_vaddr == trampoline_vaddr)
                return orig_ret_vaddr;

        nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
        if (likely(!nleft))
                return orig_ret_vaddr;

        if (nleft != rasize) {
                pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
                       current->pid, regs->sp, regs->ip);

                force_sig(SIGSEGV);
        }

        return -1;
}

/*
 * Compare the current stack pointer with the one recorded in @ret. The
 * comparison is strict for RP_CHECK_CALL and inclusive for every other
 * context.
 */
bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
                                struct pt_regs *regs)
{
        /* sp was just decremented by "call" insn */
        if (ctx == RP_CHECK_CALL)
                return regs->sp < ret->stack;

        return regs->sp <= ret->stack;
}











































































































































    1 












    1 





    1 


    1 







    1 
    1 



    1 

    1 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 *
 *  Pentium III FXSR, SSE support
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 */

/*
 * Handle hardware traps and faults.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/context_tracking.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/kmsan.h>
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kgdb.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/kexec.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
#include <linux/iommu.h>

#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/realmode.h>
#include <asm/text-patching.h>
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/fred.h>
#include <asm/fpu/api.h>
#include <asm/cpu.h>
#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
#include <asm/alternative.h>
#include <asm/fpu/xstate.h>
#include <asm/vm86.h>
#include <asm/umip.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/vdso.h>
#include <asm/tdx.h>
#include <asm/cfi.h>

#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
#endif

#include <asm/proto.h>

/* Bitmap of NR_VECTORS bits; presumably one bit per interrupt vector -- TODO confirm against its users. */
DECLARE_BITMAP(system_vectors, NR_VECTORS);

/*
 * Returns non-zero if @addr is not below TASK_SIZE_MAX and the 16-bit value
 * stored at @addr equals INSN_UD2; returns 0 otherwise. Addresses below
 * TASK_SIZE_MAX are never dereferenced.
 */
__always_inline int is_valid_bugaddr(unsigned long addr)
{
        const unsigned short *insn = (const unsigned short *)addr;

        /*
         * We got #UD, if the text isn't readable we'd have gotten
         * a different exception.
         */
        return addr >= TASK_SIZE_MAX && *insn == INSN_UD2;
}

/*
 * Handle the parts of a trap that do not involve sending a signal.
 *
 * Returns 0 when nothing more has to be done (the trap was forwarded to
 * vm86 or fixed up by an exception/vDSO fixup). Returns -1 after recording
 * @error_code and @trapnr in @tsk->thread, in which case the caller goes on
 * to deliver a signal.
 */
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
                  struct pt_regs *regs,        long error_code)
{
        if (v8086_mode(regs)) {
                /*
                 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
                 * On nmi (interrupt 2), do_trap should not be called.
                 */
                if (trapnr < X86_TRAP_UD) {
                        if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
                                                error_code, trapnr))
                                return 0;
                }
        } else if (!user_mode(regs)) {
                /* Kernel-mode trap: try an exception fixup before giving up. */
                if (fixup_exception(regs, trapnr, error_code, 0))
                        return 0;

                /* No fixup: record the fault details and report via die(). */
                tsk->thread.error_code = error_code;
                tsk->thread.trap_nr = trapnr;
                die(str, regs, error_code);
        } else {
                /* User-mode trap: a vDSO fixup may handle it. */
                if (fixup_vdso_exception(regs, trapnr, error_code, 0))
                        return 0;
        }

        /*
         * We want error_code and trap_nr set for userspace faults and
         * kernelspace faults which result in die(), but not
         * kernelspace faults which are fixed up.  die() gives the
         * process no chance to handle the signal and notice the
         * kernel fault information, so that won't result in polluting
         * the information about previously queued, but not yet
         * delivered, faults.  See also exc_general_protection below.
         */
        tsk->thread.error_code = error_code;
        tsk->thread.trap_nr = trapnr;

        return -1;
}

/*
 * Log a rate-limited, one-line report about a signal that @tsk does not
 * handle: task name and pid, @type and @desc, ip, sp and @error_code,
 * followed by the VMA containing the instruction pointer.
 *
 * Nothing is printed unless show_unhandled_signals is set, the signal is
 * unhandled by @tsk, and the printk rate limit allows it.
 */
static void show_signal(struct task_struct *tsk, int signr,
                        const char *type, const char *desc,
                        struct pt_regs *regs, long error_code)
{
        if (!show_unhandled_signals || !unhandled_signal(tsk, signr) ||
            !printk_ratelimit())
                return;

        pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx",
                tsk->comm, task_pid_nr(tsk), type, desc,
                regs->ip, regs->sp, error_code);
        print_vma_addr(KERN_CONT " in ", regs->ip);
        pr_cont("\n");
}

/*
 * Common tail for trap handlers: let do_trap_no_signal() try to consume
 * the trap, and otherwise log it and force @signr on the current task.
 *
 * @trapnr:     trap number, recorded in the task's thread state
 * @signr:      signal to force on the current task
 * @str:        human-readable trap name; it is only read, hence const
 * @regs:       register state at the time of the trap
 * @error_code: error code associated with the trap
 * @sicode:     si_code for the signal; 0 selects a plain force_sig()
 * @addr:       fault address reported through force_sig_fault() when
 *              @sicode is non-zero
 */
static void
do_trap(int trapnr, int signr, const char *str, struct pt_regs *regs,
        long error_code, int sicode, void __user *addr)
{
        struct task_struct *tsk = current;

        /* A zero return means the trap was handled without a signal. */
        if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
                return;

        show_signal(tsk, signr, "trap ", str, regs, error_code);

        /* Without an si_code there is no fault info to attach to the signal. */
        if (!sicode)
                force_sig(signr);
        else
                force_sig_fault(signr, sicode, addr);
}
NOKPROBE_SYMBOL(do_trap);

/*
 * Generic trap handler: run the DIE_TRAP notifier chain and, unless a
 * notifier stops processing, hand the trap to do_trap() between
 * cond_local_irq_enable() and cond_local_irq_disable().
 */
static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
        unsigned long trapnr, int signr, int sicode, void __user *addr)
{
        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");

        /* NOTIFY_STOP means a notifier has taken care of the trap. */
        if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) ==
                        NOTIFY_STOP)
                return;

        cond_local_irq_enable(regs);
        do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
        cond_local_irq_disable(regs);
}

/*
 * POSIX requires that the address of the faulting instruction be provided
 * for SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t.
 *
 * This address is usually regs->ip, but when an uprobe moved the code out
 * of line then regs->ip points to the XOL code which would confuse
 * anything which analyzes the fault address vs. the unmodified binary. If
 * a trap happened in XOL code then uprobe maps regs->ip back to the
 * original instruction address.
 */
/* Return the trap address reported by uprobe_get_trap_addr() as a user pointer. */
static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs)
{
        return (void __user *)uprobe_get_trap_addr(regs);
}

/*
 * Divide error (X86_TRAP_DE) entry: reported through do_error_trap() as
 * SIGFPE with si_code FPE_INTDIV, an error code of 0 and the address
 * returned by error_get_trap_addr().
 */
DEFINE_IDTENTRY(exc_divide_error)
{
        do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE,
                      FPE_INTDIV, error_get_trap_addr(regs));
}

/*
 * Overflow (X86_TRAP_OF) entry: reported through do_error_trap() as
 * SIGSEGV.  The si_code is 0, so do_trap() sends a plain force_sig()
 * without a fault address.
 */
DEFINE_IDTENTRY(exc_overflow)
{
        do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
}

/*
 * Report an invalid opcode (X86_TRAP_UD) as SIGILL/ILL_ILLOPN with the
 * address returned by error_get_trap_addr().
 *
 * The function has external linkage only when CONFIG_X86_F00F_BUG is set.
 * NOTE(review): presumably so that code outside this file can call it in
 * that configuration; confirm against the other user.
 */
#ifdef CONFIG_X86_F00F_BUG
void handle_invalid_op(struct pt_regs *regs)
#else
static inline void handle_invalid_op(struct pt_regs *regs)
#endif
{
        do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL,
                      ILL_ILLOPN, error_get_trap_addr(regs));
}

/*
 * Handle a kernel #UD that may be a BUG()/WARN() site or a CFI failure.
 *
 * Returns true when the event was a warning (report_bug() or
 * handle_cfi_failure() returned BUG_TRAP_TYPE_WARN); in that case
 * regs->ip has been advanced past the UD2 so execution can resume.
 * Returns false when regs->ip is not a valid bug address or the event
 * was not a warning, leaving further handling to the caller.
 */
static noinstr bool handle_bug(struct pt_regs *regs)
{
        bool handled = false;

        /*
         * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
         * is a rare case that uses @regs without passing them to
         * irqentry_enter().
         */
        kmsan_unpoison_entry_regs(regs);
        if (!is_valid_bugaddr(regs->ip))
                return handled;

        /*
         * All lies, just get the WARN/BUG out.
         */
        instrumentation_begin();
        /*
         * Since we're emulating a CALL with exceptions, restore the interrupt
         * state to what it was at the exception site.
         */
        if (regs->flags & X86_EFLAGS_IF)
                raw_local_irq_enable();
        if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
            handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
                /* Skip the UD2 so the warning site continues after it. */
                regs->ip += LEN_UD2;
                handled = true;
        }
        /* Undo the conditional enable above before leaving. */
        if (regs->flags & X86_EFLAGS_IF)
                raw_local_irq_disable();
        instrumentation_end();

        return handled;
}

/*
 * Invalid opcode (#UD) entry.  Kernel-mode BUG/WARN sites are handled by
 * handle_bug() before the regular exception entry work; everything else
 * goes through irqentry_enter()/irqentry_exit() and handle_invalid_op().
 */
DEFINE_IDTENTRY_RAW(exc_invalid_op)
{
        irqentry_state_t state;

        /*
         * We use UD2 as a short encoding for 'CALL __WARN', as such
         * handle it before exception entry to avoid recursive WARN
         * in case exception entry is the one triggering WARNs.
         */
        if (!user_mode(regs) && handle_bug(regs))
                return;

        /* Not a handled warning: do the normal entry work and report it. */
        state = irqentry_enter(regs);
        instrumentation_begin();
        handle_invalid_op(regs);
        instrumentation_end();
        irqentry_exit(regs, state);
}

/*
 * Coprocessor segment overrun (X86_TRAP_OLD_MF) entry: reported through
 * do_error_trap() as SIGFPE with no si_code and no fault address.
 */
DEFINE_IDTENTRY(exc_coproc_segment_overrun)
{
        do_error_trap(regs, 0, "coprocessor segment overrun",
                      X86_TRAP_OLD_MF, SIGFPE, 0, NULL);
}

/*
 * Invalid TSS (X86_TRAP_TS) entry: forwards the error code to
 * do_error_trap(), which reports SIGSEGV with no si_code or fault address.
 */
DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss)
{
        do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV,
                      0, NULL);
}

/*
 * Segment not present (X86_TRAP_NP) entry: forwards the error code to
 * do_error_trap(), which reports SIGBUS with no si_code or fault address.
 */
DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present)
{
        do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP,
                      SIGBUS, 0, NULL);
}

/*
 * Stack segment fault (X86_TRAP_SS) entry: forwards the error code to
 * do_error_trap(), which reports SIGBUS with no si_code or fault address.
 */
DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment)
{
        do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS,
                      0, NULL);
}

/*
 * Alignment check (X86_TRAP_AC) entry.
 *
 * After the DIE_TRAP notifiers had their say, a kernel-mode fault ends in
 * die() with a split lock message.  A user-mode fault is first offered to
 * handle_user_split_lock(); if that declines, SIGBUS/BUS_ADRALN is sent.
 */
DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
{
        char *str = "alignment check";

        if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
                return;

        if (!user_mode(regs))
                die("Split lock detected\n", regs, error_code);

        local_irq_enable();

        /* Only signal the task when the split lock handler did not take it. */
        if (!handle_user_split_lock(regs, error_code))
                do_trap(X86_TRAP_AC, SIGBUS, str, regs,
                        error_code, BUS_ADRALN, NULL);

        local_irq_disable();
}

#ifdef CONFIG_VMAP_STACK
/*
 * Report that a stack guard page was hit and never return: print the
 * stack type, the faulting address and the stack bounds at emergency
 * level, then die() and finally panic().
 */
__visible void __noreturn handle_stack_overflow(struct pt_regs *regs,
                                                unsigned long fault_address,
                                                struct stack_info *info)
{
        const char *stack_name = stack_type_name(info->type);
        void *guard_hit = (void *)fault_address;

        printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n",
               stack_name, guard_hit, info->begin, info->end);

        die("stack guard page", regs, 0);

        /* Should die() come back, make sure we still do not return. */
        panic("%s stack guard hit", stack_name);
}
#endif

/*
 * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
 *
 * On x86_64, this is more or less a normal kernel entry.  Notwithstanding the
 * SDM's warnings about double faults being unrecoverable, returning works as
 * expected.  Presumably what the SDM actually means is that the CPU may get
 * the register state wrong on entry, so returning could be a bad idea.
 *
 * Various CPU engineers have promised that double faults due to an IRET fault
 * while the stack is read-only are, in fact, recoverable.
 *
 * On x86_32, this is entered through a task gate, and regs are synthesized
 * from the TSS.  Returning is, in principle, okay, but changes to regs will
 * be lost.  If, for some reason, we need to return to a context with modified
 * regs, the shim code could be adjusted to synchronize the registers.
 *
 * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs
 * to be read before doing anything else.
 */
DEFINE_IDTENTRY_DF(exc_double_fault)
{
        static const char str[] = "double fault";
        struct task_struct *tsk = current;

#ifdef CONFIG_VMAP_STACK
        /* Read CR2 before anything else; it is used for the guard page check below. */
        unsigned long address = read_cr2();
        struct stack_info info;
#endif

#ifdef CONFIG_X86_ESPFIX64
        extern unsigned char native_irq_return_iret[];

        /*
         * If IRET takes a non-IST fault on the espfix64 stack, then we
         * end up promoting it to a doublefault.  In that case, take
         * advantage of the fact that we're not using the normal (TSS.sp0)
         * stack right now.  We can write a fake #GP(0) frame at TSS.sp0
         * and then modify our own IRET frame so that, when we return,
         * we land directly at the #GP(0) vector with the stack already
         * set up according to its expectations.
         *
         * The net result is that our #GP handler will think that we
         * entered from usermode with the bad user context.
         *
         * No need for nmi_enter() here because we don't use RCU.
         */
        if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
                regs->cs == __KERNEL_CS &&
                regs->ip == (unsigned long)native_irq_return_iret)
        {
                struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
                unsigned long *p = (unsigned long *)regs->sp;

                /*
                 * regs->sp points to the failing IRET frame on the
                 * ESPFIX64 stack.  Copy it to the entry stack.  This fills
                 * in gpregs->ss through gpregs->ip.
                 */
                gpregs->ip        = p[0];
                gpregs->cs        = p[1];
                gpregs->flags        = p[2];
                gpregs->sp        = p[3];
                gpregs->ss        = p[4];
                gpregs->orig_ax = 0;  /* Missing (lost) #GP error code */

                /*
                 * Adjust our frame so that we return straight to the #GP
                 * vector with the expected RSP value.  This is safe because
                 * we won't enable interrupts or schedule before we invoke
                 * general_protection, so nothing will clobber the stack
                 * frame we just set up.
                 *
                 * We will enter general_protection with kernel GSBASE,
                 * which is what the stub expects, given that the faulting
                 * RIP will be the IRET instruction.
                 */
                regs->ip = (unsigned long)asm_exc_general_protection;
                regs->sp = (unsigned long)&gpregs->orig_ax;

                return;
        }
#endif

        /* Not the espfix64 IRET case: this is a real double fault. */
        irqentry_nmi_enter(regs);
        instrumentation_begin();
        notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);

        tsk->thread.error_code = error_code;
        tsk->thread.trap_nr = X86_TRAP_DF;

#ifdef CONFIG_VMAP_STACK
        /*
         * If we overflow the stack into a guard page, the CPU will fail
         * to deliver #PF and will send #DF instead.  Similarly, if we
         * take any non-IST exception while too close to the bottom of
         * the stack, the processor will get a page fault while
         * delivering the exception and will generate a double fault.
         *
         * According to the SDM (footnote in 6.15 under "Interrupt 14 -
         * Page-Fault Exception (#PF)"):
         *
         *   Processors update CR2 whenever a page fault is detected. If a
         *   second page fault occurs while an earlier page fault is being
         *   delivered, the faulting linear address of the second fault will
         *   overwrite the contents of CR2 (replacing the previous
         *   address). These updates to CR2 occur even if the page fault
         *   results in a double fault or occurs during the delivery of a
         *   double fault.
         *
         * The logic below has a small possibility of incorrectly diagnosing
         * some errors as stack overflows.  For example, if the IDT or GDT
         * gets corrupted such that #GP delivery fails due to a bad descriptor
         * causing #GP and we hit this condition while CR2 coincidentally
         * points to the stack guard page, we'll think we overflowed the
         * stack.  Given that we're going to panic one way or another
         * if this happens, this isn't necessarily worth fixing.
         *
         * If necessary, we could improve the test by only diagnosing
         * a stack overflow if the saved RSP points within 47 bytes of
         * the bottom of the stack: if RSP == tsk_stack + 48 and we
         * take an exception, the stack is already aligned and there
         * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
         * possible error code, so a stack overflow would *not* double
         * fault.  With any less space left, exception delivery could
         * fail, and, as a practical matter, we've overflowed the
         * stack even if the actual trigger for the double fault was
         * something else.
         */
        if (get_stack_guard_info((void *)address, &info))
                handle_stack_overflow(regs, address, &info);
#endif

        /* No recovery from here on: report the fault and halt the machine. */
        pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
        die("double fault", regs, error_code);
        panic("Machine halted.");
        instrumentation_end();
}

/*
 * Bounds (X86_TRAP_BR) entry: after the DIE_TRAP notifiers, a kernel-mode
 * fault goes to die() and the trap is then passed to do_trap() as SIGSEGV
 * without an si_code.
 */
DEFINE_IDTENTRY(exc_bounds)
{
        int verdict = notify_die(DIE_TRAP, "bounds", regs, 0,
                                 X86_TRAP_BR, SIGSEGV);

        /* A notifier consumed the trap. */
        if (verdict == NOTIFY_STOP)
                return;

        cond_local_irq_enable(regs);

        if (!user_mode(regs))
                die("bounds", regs, 0);

        do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL);

        cond_local_irq_disable(regs);
}

/* Result of get_kernel_gp_address(): what is known about the address behind a #GP. */
enum kernel_gp_hint {
        /* The instruction could not be read or decoded, or no address was found. */
        GP_NO_HINT,
        /* An address was found and part of the access is non-canonical. */
        GP_NON_CANONICAL,
        /* An address was found and the non-canonical check did not trigger. */
        GP_CANONICAL
};

/*
 * When an uncaught #GP occurs, try to determine the memory address accessed by
 * the instruction and return that address to the caller. Also, try to figure
 * out whether any part of the access to that address was non-canonical.
 */
static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
                                                 unsigned long *addr)
{
        u8 insn_buf[MAX_INSN_SIZE];
        unsigned long target;
        struct insn insn;

        /* Fetch the faulting instruction bytes without risking a fault. */
        if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip,
                        MAX_INSN_SIZE))
                return GP_NO_HINT;

        if (insn_decode_kernel(&insn, insn_buf) < 0)
                return GP_NO_HINT;

        /* The result is stored in @addr even when it turns out to be unusable. */
        target = (unsigned long)insn_get_addr_ref(&insn, regs);
        *addr = target;
        if (target == -1UL)
                return GP_NO_HINT;

#ifdef CONFIG_X86_64
        /*
         * Non-canonical means both of:
         *  - the operand does not start in the kernel half
         *  - the last byte of the operand lies beyond the user canonical half
         */
        if (target < ~__VIRTUAL_MASK &&
            target + insn.opnd_bytes - 1 > __VIRTUAL_MASK)
                return GP_NON_CANONICAL;
#endif

        return GP_CANONICAL;
}

/* Base text for #GP reports; seeds and sizes the description buffer in exc_general_protection(). */
#define GPFSTR "general protection fault"

/*
 * Emulate CLI/STI as a NOP for a user task whose iopl_emul is 3.
 *
 * Returns true when the byte at the effective instruction pointer was
 * 0xfa or 0xfb and regs->ip has been stepped over it; false when the
 * emulation does not apply.  The first such event per thread is logged,
 * subject to the printk rate limit.
 */
static bool fixup_iopl_exception(struct pt_regs *regs)
{
        struct thread_struct *t = &current->thread;
        unsigned char byte;
        unsigned long ip;

        if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3)
                return false;

        /* Locate the faulting instruction and read its first byte. */
        if (insn_get_effective_ip(regs, &ip) ||
            get_user(byte, (const char __user *)ip))
                return false;

        /* Only the two single-byte opcodes named in the message below qualify. */
        switch (byte) {
        case 0xfa:
        case 0xfb:
                break;
        default:
                return false;
        }

        if (!t->iopl_warn && printk_ratelimit()) {
                pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx",
                       current->comm, task_pid_nr(current), ip);
                print_vma_addr(KERN_CONT " in ", ip);
                pr_cont("\n");
                t->iopl_warn = 1;
        }

        /* Step over the one-byte instruction. */
        regs->ip++;
        return true;
}

/*
 * The unprivileged ENQCMD instruction generates #GPs if the
 * IA32_PASID MSR has not been populated.  If possible, populate
 * the MSR from a PASID previously allocated to the mm.
 */
static bool try_fixup_enqcmd_gp(void)
{
#ifndef CONFIG_ARCH_HAS_CPU_PASID
        return false;
#else
        u32 mm_pasid;

        /*
         * MSR_IA32_PASID is managed using XSAVE.  Writing the MSR
         * directly is only possible while fpregs are valid and the
         * fpstate is not, which is guaranteed when handling a
         * userspace exception *before* interrupts are re-enabled.
         */
        lockdep_assert_irqs_disabled();

        /*
         * Nothing to fix up when the hardware lacks ENQCMD or when the
         * mm was never given a PASID.
         */
        if (!cpu_feature_enabled(X86_FEATURE_ENQCMD) ||
            !mm_valid_pasid(current->mm))
                return false;

        mm_pasid = mm_get_enqcmd_pasid(current->mm);

        /*
         * A thread whose PASID is already active cannot be helped:
         * the #GP must have some other cause.
         */
        if (current->pasid_activated)
                return false;

        wrmsrl(MSR_IA32_PASID, mm_pasid | MSR_IA32_PASID_VALID);
        current->pasid_activated = 1;

        return true;
#endif
}

/*
 * Give the exception fixup, a running kprobe and the DIE_GPF notifiers a
 * chance to take care of a kernel-mode fault.
 *
 * Returns true when one of them handled it.  Unless the exception fixup
 * applied, the current thread's error_code and trap_nr are recorded.
 */
static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr,
                                    unsigned long error_code, const char *str,
                                    unsigned long address)
{
        int verdict;

        if (fixup_exception(regs, trapnr, error_code, address))
                return true;

        current->thread.trap_nr = trapnr;
        current->thread.error_code = error_code;

        /*
         * kprobe_running() can only be trusted, and a kprobe fault can only
         * be in progress, while we are non-preemptible.
         */
        if (!preemptible() && kprobe_running() &&
            kprobe_fault_handler(regs, trapnr))
                return true;

        verdict = notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV);

        return verdict == NOTIFY_STOP;
}

static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr,
                                   unsigned long error_code, const char *str)
{
        current->thread.error_code = error_code;
        current->thread.trap_nr = trapnr;
        show_signal(current, SIGSEGV, "", str, regs, error_code);
        force_sig(SIGSEGV);
}

/*
 * General protection fault (#GP) entry.
 *
 * User-mode faults go through the ENQCMD, UMIP, vm86, IOPL and vDSO
 * fixups in turn and end in SIGSEGV when none applies.  Kernel-mode
 * faults are offered to gp_try_fixup_and_notify() and otherwise end in
 * die_addr() with a description that names the suspected address when
 * one could be determined.
 */
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
        char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
        enum kernel_gp_hint hint = GP_NO_HINT;
        unsigned long gp_addr;

        /* Must be tried first, while interrupts are still disabled. */
        if (user_mode(regs) && try_fixup_enqcmd_gp())
                return;

        cond_local_irq_enable(regs);

        if (static_cpu_has(X86_FEATURE_UMIP) && user_mode(regs) &&
            fixup_umip_exception(regs))
                goto exit;

        if (v8086_mode(regs)) {
                local_irq_enable();
                handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
                local_irq_disable();
                return;
        }

        if (user_mode(regs)) {
                /* Signal the task only when neither fixup applies. */
                if (!fixup_iopl_exception(regs) &&
                    !fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
                        gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc);
                goto exit;
        }

        if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0))
                goto exit;

        /* The address is only guessed when the error code is zero. */
        if (!error_code)
                hint = get_kernel_gp_address(regs, &gp_addr);
        else
                snprintf(desc, sizeof(desc), "segment-related " GPFSTR);

        if (hint != GP_NO_HINT)
                snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx",
                         (hint == GP_NON_CANONICAL) ? "probably for non-canonical address"
                                                    : "maybe for address",
                         gp_addr);

        /*
         * KASAN only cares about the non-canonical case; pass 0 in all
         * other cases.
         */
        if (hint != GP_NON_CANONICAL)
                gp_addr = 0;

        die_addr(desc, regs, error_code, gp_addr);

exit:
        cond_local_irq_disable(regs);
}

/*
 * Offer an INT3 to the in-kernel consumers in order: the KGDB low-level
 * trap hook, the kprobes handler, and finally the DIE_INT3 notifiers.
 *
 * Returns true when one of them consumed the breakpoint.
 */
static bool do_int3(struct pt_regs *regs)
{
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
        if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
                         SIGTRAP) == NOTIFY_STOP)
                return true;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */

#ifdef CONFIG_KPROBES
        if (kprobe_int3_handler(regs))
                return true;
#endif

        return notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
                          SIGTRAP) == NOTIFY_STOP;
}
NOKPROBE_SYMBOL(do_int3);

/*
 * User-mode INT3: when do_int3() does not consume the breakpoint, pass it
 * to do_trap() as SIGTRAP without an si_code.
 */
static void do_int3_user(struct pt_regs *regs)
{
        if (!do_int3(regs)) {
                cond_local_irq_enable(regs);
                do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL);
                cond_local_irq_disable(regs);
        }
}

DEFINE_IDTENTRY_RAW(exc_int3)
{
        irqentry_state_t nmi_state;

        /*
         * poke_int3_handler() is completely self-contained; it does not
         * (and must not) call out to anything, or it could hit yet
         * another INT3.
         */
        if (poke_int3_handler(regs))
                return;

        /*
         * irqentry_enter_from_user_mode() uses static_branch_{,un}likely()
         * and can therefore trigger INT3 itself, which is why
         * poke_int3_handler() has to run first.  An INT3 from kernel mode
         * may have been hit in any context, NMI included, so that path
         * uses the NMI-style entry.
         */
        if (!user_mode(regs)) {
                nmi_state = irqentry_nmi_enter(regs);

                instrumentation_begin();
                if (!do_int3(regs))
                        die("int3", regs, 0);
                instrumentation_end();
                irqentry_nmi_exit(regs, nmi_state);
                return;
        }

        irqentry_enter_from_user_mode(regs);
        instrumentation_begin();
        do_int3_user(regs);
        instrumentation_end();
        irqentry_exit_to_user_mode(regs);
}

#ifdef CONFIG_X86_64
/*
 * Helper running on a per-cpu (IST or entry trampoline) stack that
 * switches to the normal thread stack if the interrupted code was in
 * user mode. The actual stack switch is done in entry_64.S.
 */
asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
{
        /* The pt_regs slot directly below the top of the current stack. */
        struct pt_regs *task_regs;

        task_regs = (struct pt_regs *)current_top_of_stack() - 1;

        /* Copy only when the entry regs are not already in that slot. */
        if (task_regs != eregs)
                *task_regs = *eregs;

        return task_regs;
}

#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
 * Pick a safe stack to continue on, copy @regs to its top and return the
 * address of that copy.  Only pt_regs is copied here; the actual stack
 * switch happens in assembly code.
 */
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs)
{
        struct pt_regs *new_regs;
        struct stack_info info;
        unsigned long sp;

        if (ip_within_syscall_gap(regs)) {
                /*
                 * In the SYSCALL entry path the RSP value comes from
                 * user-space - don't trust it and use the current kernel
                 * stack instead.
                 */
                sp = current_top_of_stack();
        } else {
                /*
                 * The RSP value is trusted here.  Check whether entry
                 * happened from a safe stack; the entry stack and unknown
                 * stacks are not safe, so use the fall-back stack then.
                 */
                sp = regs->sp;

                if (!get_stack_info_noinstr((unsigned long *)sp, current, &info) ||
                    info.type == STACK_TYPE_ENTRY ||
                    info.type > STACK_TYPE_EXCEPTION_LAST)
                        sp = __this_cpu_ist_top_va(VC2);
        }

        /*
         * Found a safe stack - place the register copy there as if the
         * entry had not happened via the IST stack.
         */
        sp = ALIGN_DOWN(sp, 8) - sizeof(*new_regs);

        new_regs = (struct pt_regs *)sp;
        *new_regs = *regs;

        return new_regs;
}
#endif

/*
 * Rebuild a pt_regs frame at the top of the entry stack (just below
 * TSS.sp0) for a fault caused by a bad IRET to user mode, and return a
 * pointer to it.  The resulting frame must describe user mode, otherwise
 * this BUG()s.
 */
asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs)
{
        struct pt_regs tmp, *new_stack;

        /*
         * This is called from entry_64.S early in handling a fault
         * caused by a bad iret to user mode.  To handle the fault
         * correctly, we want to move our stack frame to where it would
         * be had we entered directly on the entry stack (rather than
         * just below the IRET frame) and we want to pretend that the
         * exception came from the IRET target.
         */
        new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;

        /*
         * Copy the IRET target to the temporary storage: five 8-byte
         * words starting at tmp.ip, read from where bad_regs->sp points.
         */
        __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8);

        /* Copy the remainder of the stack (everything before ip) from the current stack. */
        __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip));

        /* Update the entry stack */
        __memcpy(new_stack, &tmp, sizeof(tmp));

        BUG_ON(!user_mode(new_stack));
        return new_stack;
}
#endif

/*
 * Tell whether regs->ip lies inside the SYSENTER entry region that can
 * be single-stepped.  Always false when neither CONFIG_X86_32 nor
 * CONFIG_IA32_EMULATION is enabled.
 */
static bool is_sysenter_singlestep(struct pt_regs *regs)
{
        /*
         * Precision is not the goal here.  Anywhere inside the region of
         * the SYSENTER entry path that can be single-stepped is treated as
         * a useless single-step trap caused by SYSENTER being invoked with
         * TF set.  (Which instructions get hit cannot be known in advance
         * because BTF could plausibly be set.)
         */
#ifdef CONFIG_X86_32
        const unsigned long start = (unsigned long)__begin_SYSENTER_singlestep_region;
        const unsigned long end = (unsigned long)__end_SYSENTER_singlestep_region;

        return (regs->ip - start) < end - start;
#elif defined(CONFIG_IA32_EMULATION)
        const unsigned long start = (unsigned long)entry_SYSENTER_compat;
        const unsigned long end = (unsigned long)__end_entry_SYSENTER_compat;

        return (regs->ip - start) < end - start;
#else
        return false;
#endif
}

/*
 * Read DR6, reset the register to DR6_RESERVED and return the value that
 * was read with the DR6_RESERVED bits flipped (positive polarity).
 */
static __always_inline unsigned long debug_read_clear_dr6(void)
{
        unsigned long raw;

        /*
         * The Intel SDM says:
         *
         *   Certain debug exceptions may clear bits 0-3. The remaining
         *   contents of the DR6 register are never cleared by the
         *   processor. To avoid confusion in identifying debug
         *   exceptions, debug handlers should clear the register before
         *   returning to the interrupted task.
         *
         * Keep it simple and reset DR6 right after reading it.
         */
        get_debugreg(raw, 6);
        set_debugreg(DR6_RESERVED, 6);

        /* Flip to positive polarity */
        return raw ^ DR6_RESERVED;
}

/*
 * Our handling of the processor debug registers is non-trivial.
 * We do not clear them on entry and exit from the kernel. Therefore
 * it is possible to get a watchpoint trap here from inside the kernel.
 * However, the code in ./ptrace.c has ensured that the user can
 * only set watchpoints on userspace addresses. Therefore the in-kernel
 * watchpoint trap can only occur in code which is reading/writing
 * from user space. Such code must not hold kernel locks (since it
 * can equally take a page fault), therefore it is safe to call
 * force_sig_info even though that claims and releases locks.
 *
 * Code in ./signal.c ensures that the debug control register
 * is restored before we deliver any signal, and therefore that
 * user code runs with the correct debug control register even though
 * we clear it here.
 *
 * Being careful here means that we don't have to be as careful in a
 * lot of more complicated places (task switching can be a bit lazy
 * about restoring all the debug state, and ptrace doesn't have to
 * find every occurrence of the TF bit that could be saved away even
 * by user code)
 *
 * May run on IST stack.
 */

/*
 * Run the DIE_DEBUG notifier chain, passing the address of @dr6 as the
 * error code argument.  Returns true when a notifier stops processing.
 */
static bool notify_debug(struct pt_regs *regs, unsigned long *dr6)
{
        /*
         * Notifiers clear bits in @dr6 to mark an event as consumed -
         * hw_breakpoint_handler(), single_stop_cont().
         *
         * Notifiers set bits in @virtual_dr6 to request a signal -
         * ptrace_triggered(), kgdb_hw_overflow_handler().
         */
        return notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0,
                          SIGTRAP) == NOTIFY_STOP;
}

/*
 * Handle a #DB taken in kernel mode.  @dr6 is the debug status value to
 * act on; bits may be cleared from the local copy as events are filtered
 * or consumed.  Breakpoints are disabled for the duration of the handler
 * and restored on exit.
 */
static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6)
{
        /*
         * Disable breakpoints during exception handling; recursive exceptions
         * are exceedingly 'fun'.
         *
         * Since this function is NOKPROBE, and that also applies to
         * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
         * HW_BREAKPOINT_W on our stack)
         *
         * Entry text is excluded for HW_BP_X, and cpu_entry_area, which
         * includes the entry stack, is excluded for everything.
         *
         * For FRED, nested #DB should just work fine. But when a watchpoint or
         * breakpoint is set in the code path which is executed by #DB handler,
         * it results in an endless recursion and stack overflow. Thus we stay
         * with the IDT approach, i.e., save DR7 and disable #DB.
         */
        unsigned long dr7 = local_db_save();
        irqentry_state_t irq_state = irqentry_nmi_enter(regs);
        instrumentation_begin();

        /*
         * If something gets miswired and we end up here for a user mode
         * #DB, we will malfunction.
         */
        WARN_ON_ONCE(user_mode(regs));

        if (test_thread_flag(TIF_BLOCKSTEP)) {
                /*
                 * The SDM says "The processor clears the BTF flag when it
                 * generates a debug exception." but PTRACE_BLOCKSTEP requested
                 * it for userspace, but we just took a kernel #DB, so re-set
                 * BTF.
                 */
                unsigned long debugctl;

                rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
                debugctl |= DEBUGCTLMSR_BTF;
                wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
        }

        /*
         * Catch SYSENTER with TF set and clear DR_STEP. If this hit a
         * watchpoint at the same time then that will still be handled.
         */
        if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
            (dr6 & DR_STEP) && is_sysenter_singlestep(regs))
                dr6 &= ~DR_STEP;

        /*
         * The kernel doesn't use INT1
         */
        if (!dr6)
                goto out;

        /* A notifier that returns NOTIFY_STOP has fully handled the event. */
        if (notify_debug(regs, &dr6))
                goto out;

        /*
         * The kernel doesn't use TF single-step outside of:
         *
         *  - Kprobes, consumed through kprobe_debug_handler()
         *  - KGDB, consumed through notify_debug()
         *
         * So if we get here with DR_STEP set, something is wonky.
         *
         * A known way to trigger this is through QEMU's GDB stub,
         * which leaks #DB into the guest and causes IST recursion.
         */
        if (WARN_ON_ONCE(dr6 & DR_STEP))
                regs->flags &= ~X86_EFLAGS_TF;
out:
        instrumentation_end();
        irqentry_nmi_exit(regs, irq_state);

        /* Re-arm the breakpoints that local_db_save() disabled on entry. */
        local_db_restore(dr7);
}

/*
 * Handle a #DB that was raised while the CPU was in user mode.
 *
 * regs: register frame saved at exception entry
 * dr6:  DR6 value handed in by the entry stub
 *
 * notify_debug() gets the first chance to consume the event.  Otherwise
 * vm86 traps are forwarded to handle_vm86_trap(), bus-lock #DB goes to
 * handle_bus_lock(), and send_sigtrap() is called when DR_STEP or any
 * DR_TRAP bit is set or when DR6 was empty on entry.
 */
static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6)
{
        bool icebp;

        /*
         * If something gets miswired and we end up here for a kernel mode
         * #DB, we will malfunction.
         */
        WARN_ON_ONCE(!user_mode(regs));

        /*
         * NB: We can't easily clear DR7 here because
         * irqentry_exit_to_user_mode() can invoke ptrace, schedule, access
         * user memory, etc.  This means that a recursive #DB is possible.  If
         * this happens, that #DB will hit exc_debug_kernel() and clear DR7.
         * Since we're not on the IST stack right now, everything will be
         * fine.
         */

        irqentry_enter_from_user_mode(regs);
        instrumentation_begin();

        /*
         * Start the virtual/ptrace DR6 value with just the DR_STEP mask
         * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits.
         *
         * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6)
         * even if it is not the result of PTRACE_SINGLESTEP.
         */
        current->thread.virtual_dr6 = (dr6 & DR_STEP);

        /*
         * The SDM says "The processor clears the BTF flag when it
         * generates a debug exception."  Clear TIF_BLOCKSTEP to keep
         * TIF_BLOCKSTEP in sync with the hardware BTF flag.
         */
        clear_thread_flag(TIF_BLOCKSTEP);

        /*
         * If dr6 has no reason to give us about the origin of this trap,
         * then it's very likely the result of an icebp/int01 trap.
         * User wants a sigtrap for that.
         *
         * Sampled before notify_debug(), which receives a pointer to dr6.
         */
        icebp = !dr6;

        /*
         * Interrupts have not been enabled yet on this path, so jump past
         * the local_irq_disable() at out_irq.
         */
        if (notify_debug(regs, &dr6))
                goto out;

        /* It's safe to allow irq's after DR6 has been saved */
        local_irq_enable();

        if (v8086_mode(regs)) {
                handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB);
                goto out_irq;
        }

        /* #DB for bus lock can only be triggered from userspace. */
        if (dr6 & DR_BUS_LOCK)
                handle_bus_lock(regs);

        /* Add the virtual_dr6 bits for signals. */
        dr6 |= current->thread.virtual_dr6;
        if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp)
                send_sigtrap(regs, 0, get_si_code(dr6));

out_irq:
        /* Undo the local_irq_enable() above before leaving. */
        local_irq_disable();
out:
        instrumentation_end();
        irqentry_exit_to_user_mode(regs);
}

#ifdef CONFIG_X86_64
/*
 * IST stack entry.
 *
 * Reads DR6 through debug_read_clear_dr6() and passes it, together with
 * the saved registers, to exc_debug_kernel().
 */
DEFINE_IDTENTRY_DEBUG(exc_debug)
{
        exc_debug_kernel(regs, debug_read_clear_dr6());
}

/*
 * User entry, runs on regular task stack.
 *
 * Reads DR6 through debug_read_clear_dr6() and passes it, together with
 * the saved registers, to exc_debug_user().
 */
DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{
        exc_debug_user(regs, debug_read_clear_dr6());
}

#ifdef CONFIG_X86_FRED
/*
 * A #DB has to be handled on a different stack depending on the ring it
 * was raised from: a user #DB runs on the current task stack, a kernel
 * #DB runs on a dedicated stack.
 *
 * FRED event delivery already works this way: a ring 3 event is
 * delivered on the level 0 stack, i.e., the current task stack, and a
 * ring 0 event on the dedicated #DB stack specified in the
 * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception entry
 * stub doesn't do a stack switch; it only has to pick the handler.
 */
DEFINE_FREDENTRY_DEBUG(exc_debug)
{
        /*
         * FRED #DB stores DR6 on the stack in the same format that
         * debug_read_clear_dr6() returns for the IDT entry points.
         */
        unsigned long dr6 = fred_event_data(regs);

        /* Kernel-mode #DB */
        if (!user_mode(regs)) {
                exc_debug_kernel(regs, dr6);
                return;
        }

        /* User-mode #DB */
        exc_debug_user(regs, dr6);
}
#endif /* CONFIG_X86_FRED */

#else
/*
 * 32 bit does not have separate entry points: read DR6 once, then pick
 * the kernel or user handler from the mode recorded in the saved frame.
 */
DEFINE_IDTENTRY_RAW(exc_debug)
{
        unsigned long dr6 = debug_read_clear_dr6();

        if (!user_mode(regs))
                exc_debug_kernel(regs, dr6);
        else
                exc_debug_user(regs, dr6);
}
#endif

/*
 * Common handler for the x87 (#MF) and SIMD (#XF) floating point
 * exceptions.
 *
 * regs:   register frame saved at exception entry
 * trapnr: X86_TRAP_MF selects the "fpu exception" message, any other
 *         value the "simd exception" message
 *
 * Kernel-mode faults are first offered to fixup_exception(); if that
 * fails the die notifiers run and, unless one returns NOTIFY_STOP, die()
 * is called.  User-mode faults get the FPU state synced to memory and
 * are turned into SIGFPE with the si_code computed by
 * fpu__exception_code(); a zero si_code is treated as spurious.
 *
 * Note that we play around with the 'TS' bit in an attempt to get
 * the correct behaviour even in the presence of the asynchronous
 * IRQ13 behaviour
 * NOTE(review): nothing in this function touches CR0.TS; the remark
 * above looks stale - confirm before relying on it.
 */
static void math_error(struct pt_regs *regs, int trapnr)
{
        struct task_struct *task = current;
        struct fpu *fpu = &task->thread.fpu;
        int si_code;
        /* Only ever points at string literals, hence const. */
        const char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
                                                      "simd exception";

        cond_local_irq_enable(regs);

        if (!user_mode(regs)) {
                if (fixup_exception(regs, trapnr, 0, 0))
                        goto exit;

                task->thread.error_code = 0;
                task->thread.trap_nr = trapnr;

                if (notify_die(DIE_TRAP, str, regs, 0, trapnr,
                               SIGFPE) != NOTIFY_STOP)
                        die(str, regs, 0);
                goto exit;
        }

        /*
         * Synchronize the FPU register state to the memory register state
         * if necessary. This allows the exception handler to inspect it.
         */
        fpu_sync_fpstate(fpu);

        task->thread.trap_nr        = trapnr;
        task->thread.error_code = 0;

        si_code = fpu__exception_code(fpu, trapnr);
        /* Retry when we get spurious exceptions: */
        if (!si_code)
                goto exit;

        if (fixup_vdso_exception(regs, trapnr, 0, 0))
                goto exit;

        force_sig_fault(SIGFPE, si_code,
                        (void __user *)uprobe_get_trap_addr(regs));
exit:
        /* Pairs with the cond_local_irq_enable() at the top. */
        cond_local_irq_disable(regs);
}

/* Forwards the exception to math_error() tagged as X86_TRAP_MF. */
DEFINE_IDTENTRY(exc_coprocessor_error)
{
        math_error(regs, X86_TRAP_MF);
}

/*
 * SIMD floating point exception entry.  Normally forwarded to
 * math_error() as X86_TRAP_XF; with CONFIG_X86_INVD_BUG on a CPU
 * without XMM it is handled as a general protection fault instead.
 */
DEFINE_IDTENTRY(exc_simd_coprocessor_error)
{
        /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */
        if (IS_ENABLED(CONFIG_X86_INVD_BUG) &&
            !static_cpu_has(X86_FEATURE_XMM)) {
                __exc_general_protection(regs, 0);
                return;
        }

        math_error(regs, X86_TRAP_XF);
}

DEFINE_IDTENTRY(exc_spurious_interrupt_bug)
{
        /*
         * This addresses a Pentium Pro Erratum:
         *
         * PROBLEM: If the APIC subsystem is configured in mixed mode with
         * Virtual Wire mode implemented through the local APIC, an
         * interrupt vector of 0Fh (Intel reserved encoding) may be
         * generated by the local APIC (Int 15).  This vector may be
         * generated upon receipt of a spurious interrupt (an interrupt
         * which is removed before the system receives the INTA sequence)
         * instead of the programmed 8259 spurious interrupt vector.
         *
         * IMPLICATION: The spurious interrupt vector programmed in the
         * 8259 is normally handled by an operating system's spurious
         * interrupt handler. However, a vector of 0Fh is unknown to some
         * operating systems, which would crash if this erratum occurred.
         *
         * In theory this could be limited to 32bit, but the handler is not
         * hurting and who knows which other CPUs suffer from this.
         *
         * The body is intentionally empty: the handler does nothing
         * beyond what the DEFINE_IDTENTRY() wrapper itself provides.
         */
}

/*
 * Check whether the current fault was caused by XFD and, if so, handle it.
 *
 * Returns false when the kernel is not 64-bit, the CPU has XFD disabled,
 * MSR_IA32_XFD_ERR reads as zero, or the event came from kernel mode (a
 * warning is emitted in that last case).  Returns true once a user-mode
 * XFD event has been processed, whether or not enabling the feature
 * succeeded.
 */
static bool handle_xfd_event(struct pt_regs *regs)
{
        u64 xfd_err;
        int ret;

        if (!IS_ENABLED(CONFIG_X86_64))
                return false;
        if (!cpu_feature_enabled(X86_FEATURE_XFD))
                return false;

        rdmsrl(MSR_IA32_XFD_ERR, xfd_err);
        if (xfd_err == 0)
                return false;

        /* Clear the MSR now that its value has been captured. */
        wrmsrl(MSR_IA32_XFD_ERR, 0);

        /* Warn and report "not handled" if this came from kernel space. */
        if (WARN_ON(!user_mode(regs)))
                return false;

        local_irq_enable();

        ret = xfd_enable_feature(xfd_err);
        if (ret == -EPERM)
                force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs));
        else if (ret == -EFAULT)
                force_sig(SIGSEGV);

        local_irq_disable();
        return true;
}

/*
 * Device-not-available exception entry.
 *
 * XFD events are handled first.  With CONFIG_MATH_EMULATION, a CPU
 * without an FPU and CR0.EM set gets the instruction emulated.  Anything
 * else is unexpected: a set CR0.TS is warned about and cleared, any other
 * cause ends in die().
 */
DEFINE_IDTENTRY(exc_device_not_available)
{
        unsigned long cr0 = read_cr0();

        if (handle_xfd_event(regs))
                return;

#ifdef CONFIG_MATH_EMULATION
        if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
                struct math_emu_info info = { };

                info.regs = regs;

                cond_local_irq_enable(regs);
                math_emulate(&info);
                cond_local_irq_disable(regs);
                return;
        }
#endif

        /* Neither case below should ever happen. */
        if (!WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
                /*
                 * No known cause for this fault.  Going through die() is
                 * preferable to re-faulting on #NM forever.
                 */
                die("unexpected #NM exception", regs, 0);
                return;
        }

        /* CR0.TS was set: clear it and try to carry on. */
        write_cr0(cr0 & ~X86_CR0_TS);
}

#ifdef CONFIG_INTEL_TDX_GUEST

#define VE_FAULT_STR "VE fault"

/*
 * Report a #VE that could not be handled.
 *
 * From user mode the task is signalled via gp_user_force_sig_segv().
 * From kernel mode gp_try_fixup_and_notify() is tried first; if it
 * does not resolve the fault, die_addr() is called with @address.
 */
static void ve_raise_fault(struct pt_regs *regs, long error_code,
                           unsigned long address)
{
        if (!user_mode(regs)) {
                if (!gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code,
                                             VE_FAULT_STR, address))
                        die_addr(VE_FAULT_STR, regs, error_code, address);
                return;
        }

        gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR);
}

/*
 * Virtualization Exceptions (#VE) are delivered to TDX guests due to
 * specific guest actions which may happen in either user space or the
 * kernel:
 *
 *  * Specific instructions (WBINVD, for example)
 *  * Specific MSR accesses
 *  * Specific CPUID leaf accesses
 *  * Access to specific guest physical addresses
 *
 * In the settings that Linux will run in, virtualization exceptions are
 * never generated on accesses to normal, TD-private memory that has been
 * accepted (by BIOS or with tdx_enc_status_changed()).
 *
 * Syscall entry code has a critical window where the kernel stack is not
 * yet set up. Any exception in this window leads to hard to debug issues
 * and can be exploited for privilege escalation. Exceptions in the NMI
 * entry code also cause issues. Returning from the exception handler with
 * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack.
 *
 * For these reasons, the kernel avoids #VEs during the syscall gap and
 * the NMI entry code. Entry code paths do not access TD-shared memory,
 * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves
 * that might generate #VE. VMM can remove memory from TD at any point,
 * but access to unaccepted (or missing) private memory leads to VM
 * termination, not to #VE.
 *
 * Similarly to page faults and breakpoints, #VEs are allowed in NMI
 * handlers once the kernel is ready to deal with nested NMIs.
 *
 * During #VE delivery, all interrupts, including NMIs, are blocked until
 * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads
 * the VE info.
 *
 * If a guest kernel action which would normally cause a #VE occurs in
 * the interrupt-disabled region before TDGETVEINFO, a #DF (double fault
 * exception) is delivered to the guest which will result in an oops.
 *
 * The entry code has been audited carefully for following these expectations.
 * Changes in the entry code have to be audited for correctness vs. this
 * aspect. Similarly to #PF, #VE in these places will expose kernel to
 * privilege escalation or may lead to random crashes.
 */
DEFINE_IDTENTRY(exc_virtualization_exception)
{
        struct ve_info ve;

        /*
         * NMIs/Machine-checks/Interrupts will be in a disabled state
         * till TDGETVEINFO TDCALL is executed. This ensures that VE
         * info cannot be overwritten by a nested #VE.
         *
         * For that reason this call must stay the first statement.
         */
        tdx_get_ve_info(&ve);

        /* Paired with cond_local_irq_disable() at the end. */
        cond_local_irq_enable(regs);

        /*
         * If tdx_handle_virt_exception() could not process
         * it successfully, treat it as #GP(0) and handle it.
         * ve.gla is passed on as the fault address.
         */
        if (!tdx_handle_virt_exception(regs, &ve))
                ve_raise_fault(regs, 0, ve.gla);

        cond_local_irq_disable(regs);
}

#endif

#ifdef CONFIG_X86_32
/*
 * IRET exception entry (32-bit only).  Runs the die notifiers and,
 * unless one of them returns NOTIFY_STOP, hands the event to do_trap()
 * as SIGILL/ILL_BADSTK.  Interrupts are enabled for the duration.
 */
DEFINE_IDTENTRY_SW(iret_error)
{
        int verdict;

        local_irq_enable();

        verdict = notify_die(DIE_TRAP, "iret exception", regs, 0,
                             X86_TRAP_IRET, SIGILL);
        if (verdict != NOTIFY_STOP)
                do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0,
                        ILL_BADSTK, (void __user *)NULL);

        local_irq_disable();
}
#endif

/*
 * Do not enable FRED by default yet.
 *
 * fred_setup() may change this from the "fred=" early parameter;
 * trap_init() clears X86_FEATURE_FRED when it is still false.
 */
static bool enable_fred __ro_after_init = false;

#ifdef CONFIG_X86_FRED
/*
 * Parse the "fred=" early parameter.
 *
 * "on" sets enable_fred, "off" clears it, anything else only logs a
 * warning.  The option is ignored entirely when the CPU does not have
 * X86_FEATURE_FRED enabled.
 *
 * Returns -EINVAL when no value was supplied, 0 otherwise.
 */
static int __init fred_setup(char *str)
{
        bool want_on;

        if (!str)
                return -EINVAL;

        if (!cpu_feature_enabled(X86_FEATURE_FRED))
                return 0;

        want_on = !strcmp(str, "on");
        if (want_on || !strcmp(str, "off"))
                enable_fred = want_on;
        else
                pr_warn("invalid FRED option: 'fred=%s'\n", str);

        return 0;
}
early_param("fred", fred_setup);
#endif

/*
 * Boot-time trap setup.  The order of the calls below matters; the
 * comment on each step states what it has to precede.
 */
void __init trap_init(void)
{
        /* FRED stays off unless enable_fred was set. */
        if (cpu_feature_enabled(X86_FEATURE_FRED) && !enable_fred)
                setup_clear_cpu_cap(X86_FEATURE_FRED);

        /* Init cpu_entry_area before IST entries are set up */
        setup_cpu_entry_areas();

        /* Init GHCB memory pages when running as an SEV-ES guest */
        sev_es_init_vc_handling();

        /* Initialize TSS before setting up traps so ISTs work */
        cpu_init_exception_handling();

        /*
         * Setup traps as cpu_init() might #GP.  Skipped when FRED is
         * still enabled at this point.
         */
        if (!cpu_feature_enabled(X86_FEATURE_FRED))
                idt_setup_traps();

        cpu_init();
}











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "dir-item.h"
#include "file-item.h"
#include "file.h"
#include "orphan.h"
#include "tree-checker.h"

/*
 * NOTE(review): presumably an upper bound on the number of conflicting
 * inodes handled while logging; the users of this constant are outside
 * this chunk - confirm at the use sites.
 */
#define MAX_CONFLICT_INODES 10

/*
 * Magic values for the inode_only argument of btrfs_log_inode():
 *
 * LOG_INODE_ALL means to log everything.
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay.
 */
enum {
        LOG_INODE_ALL,
        LOG_INODE_EXISTS,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 *  2a is actually the more important variant.  Without the extra logging
 *  a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1/foo was fully removed from the FS, but fsync was never
 * called on f1/foo, only on its parent dir f1.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * Stages for the tree walking, in enum order:
 *
 * LOG_WALK_PIN_ONLY (0): only pin down the blocks we find.
 * LOG_WALK_REPLAY_INODES (1): make sure that all the inodes we find in
 * the log are created in the subvolume.
 * LOG_WALK_REPLAY_DIR_INDEX (2): NOTE(review): presumably replays
 * directory index items; its handling is not visible here - confirm.
 * LOG_WALK_REPLAY_ALL (3): the last stage, which deals with directories
 * and links and extents and all the other fun semantics.
 */
enum {
        LOG_WALK_PIN_ONLY,
        LOG_WALK_REPLAY_INODES,
        LOG_WALK_REPLAY_DIR_INDEX,
        LOG_WALK_REPLAY_ALL,
};

/*
 * Forward declarations of file-local helpers.  wait_log_commit() is
 * called from start_log_trans() below; the remaining helpers are
 * presumably also used ahead of their definitions further down the file
 * (not visible here - confirm).
 */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                           struct btrfs_inode *inode,
                           int inode_only,
                           struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct btrfs_root *log,
                                       struct btrfs_path *path,
                                       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree, and once
 * to do all the other items.
 */

/*
 * Start a sub transaction on @root and set up its log tree if it does not
 * exist yet.
 *
 * On success the root's log writer count is incremented, which makes the
 * people syncing the log tree wait for us to finish, and, unless @ctx is
 * logging a new name, @ctx is queued on the context list of the current log
 * transaction and records its id.
 *
 * Returns 0 on success, BTRFS_LOG_FORCE_COMMIT when the caller has to fall
 * back to a full transaction commit, or the error returned by
 * btrfs_init_log_root_tree() or btrfs_add_log_tree().
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_log_ctx *ctx)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_root *tree_root = fs_info->tree_root;
        const bool zoned = btrfs_is_zoned(fs_info);
        bool log_root_tree_created = false;
        int ret = 0;

        /*
         * The log root tree is created, when missing, before the root's
         * log_mutex is taken. That ordering is only there to keep lockdep
         * happy.
         */
        if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
                mutex_lock(&tree_root->log_mutex);
                if (!fs_info->log_root_tree) {
                        ret = btrfs_init_log_root_tree(trans, fs_info);
                        if (ret == 0) {
                                set_bit(BTRFS_ROOT_HAS_LOG_TREE,
                                        &tree_root->state);
                                log_root_tree_created = true;
                        }
                }
                mutex_unlock(&tree_root->log_mutex);
                if (ret)
                        return ret;
        }

        mutex_lock(&root->log_mutex);

        /*
         * While the root has a log tree, bail out if a full commit is needed
         * and, on zoned filesystems, wait for the previous log transaction
         * commit before re-checking everything.
         */
        while (root->log_root) {
                const int prev_index = (root->log_transid + 1) % 2;

                if (btrfs_need_log_full_commit(trans)) {
                        ret = BTRFS_LOG_FORCE_COMMIT;
                        goto unlock;
                }

                if (!zoned || !atomic_read(&root->log_commit[prev_index]))
                        break;

                wait_log_commit(root, root->log_transid - 1);
        }

        if (root->log_root) {
                /* Track whether more than one task writes to this log. */
                if (root->log_start_pid == 0) {
                        clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
                        root->log_start_pid = current->pid;
                } else if (root->log_start_pid != current->pid) {
                        set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
                }
        } else {
                /*
                 * Not having created the log root tree ourselves means it
                 * was already created for some other FS trees. On zoned
                 * filesystems do the full commit instead, so that nodes from
                 * multiple log transactions are not mixed and writing stays
                 * sequential.
                 */
                if (zoned && !log_root_tree_created) {
                        ret = BTRFS_LOG_FORCE_COMMIT;
                        goto unlock;
                }

                ret = btrfs_add_log_tree(trans, root);
                if (ret)
                        goto unlock;

                set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
                clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
                root->log_start_pid = current->pid;
        }

        atomic_inc(&root->log_writers);
        if (!ctx->logging_new_name) {
                const int cur_index = root->log_transid % 2;

                list_add_tail(&ctx->list, &root->log_ctxs[cur_index]);
                ctx->log_transid = root->log_transid;
        }

unlock:
        mutex_unlock(&root->log_mutex);
        return ret;
}

/*
 * Try to join the log transaction currently running on @root.
 *
 * Returns 0 if there was a log transaction running and we were able to join
 * it; in that case root->log_writers has been incremented and the caller is
 * expected to drop it again with btrfs_end_log_trans(). Returns -ENOENT if
 * there is no log transaction in progress, in which case the writer count is
 * left untouched.
 */
static int join_running_log_trans(struct btrfs_root *root)
{
        const bool zoned = btrfs_is_zoned(root->fs_info);
        int ret = -ENOENT;

        if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
                return ret;

        mutex_lock(&root->log_mutex);
        while (root->log_root) {
                const int index = (root->log_transid + 1) % 2;

                /*
                 * On zoned filesystems wait for the previous log transaction
                 * commit and then re-evaluate root->log_root from scratch.
                 * NOTE(review): wait_log_commit() presumably sleeps with
                 * log_mutex released, so the log root may differ afterwards
                 * - confirm.
                 */
                if (zoned && atomic_read(&root->log_commit[index])) {
                        wait_log_commit(root, root->log_transid - 1);
                        continue;
                }

                /*
                 * Only report success together with taking the writer
                 * reference, so that a 0 return always has a matching
                 * increment for btrfs_end_log_trans() to undo.
                 */
                atomic_inc(&root->log_writers);
                ret = 0;
                break;
        }
        mutex_unlock(&root->log_mutex);
        return ret;
}

/*
 * Take a log writer reference on @root by incrementing root->log_writers.
 *
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans().
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
        atomic_inc(&root->log_writers);
}

/*
 * Drop a log writer reference on @root, signalling that we are done making
 * changes to the log tree. When the last writer goes away, anyone waiting on
 * root->log_writer_wait to do a sync is woken up.
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
        /* Other writers are still active, nobody to wake up yet. */
        if (!atomic_dec_and_test(&root->log_writers))
                return;

        /* atomic_dec_and_test implies a barrier */
        cond_wake_up_nomb(&root->log_writer_wait);
}

/*
 * The walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part.
 */
struct walk_control {
        /*
         * Should we free the extent on disk when done?  This is used
         * at transaction commit time while freeing a log tree.
         */
        int free;

        /*
         * Pin only walk: we record which extents on disk belong to the
         * log trees.
         */
        int pin;

        /* What stage of the replay code we're currently in. */
        int stage;

        /*
         * Ignore any items from the inode currently being processed. Needs
         * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
         * the LOG_WALK_REPLAY_INODES stage.
         */
        bool ignore_cur_inode;

        /* The root we are currently replaying. */
        struct btrfs_root *replay_dest;

        /* The trans handle for the current replay. */
        struct btrfs_trans_handle *trans;

        /*
         * The function that gets used to process blocks we find in the
         * tree.  Note the extent_buffer might not be up to date when it is
         * passed in, and it must be checked or read if you need the data
         * inside it.
         */
        int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
                            struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down the extents that belong to a log tree.
 *
 * @log:   the log root being walked (only used to reach fs_info)
 * @eb:    the tree block being processed
 * @wc:    walk state; nothing is pinned unless wc->pin is set
 * @gen:   expected generation of @eb
 * @level: expected level of @eb
 *
 * Returns 0 on success or the error returned by reading, pinning or
 * excluding the logged extents of the block.
 */
static int process_one_buffer(struct btrfs_root *log,
                              struct extent_buffer *eb,
                              struct walk_control *wc, u64 gen, int level)
{
        struct btrfs_fs_info *fs_info = log->fs_info;
        int ret;

        /*
         * If this fs is mixed then we need to be able to process the leaves to
         * pin down any logged extents, so we have to read the block.
         */
        if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
                struct btrfs_tree_parent_check check = {
                        .level = level,
                        .transid = gen
                };

                ret = btrfs_read_extent_buffer(eb, &check);
                if (ret)
                        return ret;
        }

        if (!wc->pin)
                return 0;

        ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
        if (ret)
                return ret;

        /* Only up to date leaves have their logged extents excluded. */
        if (!btrfs_buffer_uptodate(eb, gen, 0) || btrfs_header_level(eb) != 0)
                return 0;

        return btrfs_exclude_logged_extents(eb);
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 *
 * Note that for inode items the source item in eb is modified in place
 * (nbytes, and i_size for directories) before it is copied.
 *
 * Returns 0 on success (including when the destination already holds an
 * identical item and nothing is written), -ENOMEM if the comparison buffers
 * cannot be allocated, or the negative error returned by
 * btrfs_search_slot() or btrfs_insert_empty_item().
 */
static int overwrite_item(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          struct btrfs_path *path,
                          struct extent_buffer *eb, int slot,
                          struct btrfs_key *key)
{
        int ret;
        u32 item_size;
        u64 saved_i_size = 0;
        int save_old_i_size = 0;
        unsigned long src_ptr;
        unsigned long dst_ptr;
        bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

        /*
         * This is only used during log replay, so the root is always from a
         * fs/subvolume tree. In case we ever need to support a log root, then
         * we'll have to clone the leaf in the path, release the path and use
         * the leaf before writing into the log tree. See the comments at
         * copy_items() for more details.
         */
        ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);

        item_size = btrfs_item_size(eb, slot);
        src_ptr = btrfs_item_ptr_offset(eb, slot);

        /* Look for the key in the destination tree. */
        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
        if (ret < 0)
                return ret;

        if (ret == 0) {
                /* The key already exists in the destination tree. */
                char *src_copy;
                char *dst_copy;
                u32 dst_size = btrfs_item_size(path->nodes[0],
                                                  path->slots[0]);
                /*
                 * Different sizes can never have identical contents, go
                 * straight to the insert path, which resizes the existing
                 * item.
                 */
                if (dst_size != item_size)
                        goto insert;

                /* Two empty items are trivially identical. */
                if (item_size == 0) {
                        btrfs_release_path(path);
                        return 0;
                }
                dst_copy = kmalloc(item_size, GFP_NOFS);
                src_copy = kmalloc(item_size, GFP_NOFS);
                if (!dst_copy || !src_copy) {
                        btrfs_release_path(path);
                        kfree(dst_copy);
                        kfree(src_copy);
                        return -ENOMEM;
                }

                read_extent_buffer(eb, src_copy, src_ptr, item_size);

                dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
                read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
                                   item_size);
                ret = memcmp(dst_copy, src_copy, item_size);

                kfree(dst_copy);
                kfree(src_copy);
                /*
                 * They have the same contents, just return. This saves
                 * us from cowing blocks in the destination tree and doing
                 * extra writes that may not have been done by a previous
                 * sync.
                 */
                if (ret == 0) {
                        btrfs_release_path(path);
                        return 0;
                }

                /*
                 * We need to load the old nbytes into the inode so when we
                 * replay the extents we've logged we get the right nbytes.
                 */
                if (inode_item) {
                        struct btrfs_inode_item *item;
                        u64 nbytes;
                        u32 mode;

                        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                              struct btrfs_inode_item);
                        nbytes = btrfs_inode_nbytes(path->nodes[0], item);
                        item = btrfs_item_ptr(eb, slot,
                                              struct btrfs_inode_item);
                        btrfs_set_inode_nbytes(eb, item, nbytes);

                        /*
                         * If this is a directory we need to reset the i_size to
                         * 0 so that we can set it up properly when replaying
                         * the rest of the items in this log.
                         */
                        mode = btrfs_inode_mode(eb, item);
                        if (S_ISDIR(mode))
                                btrfs_set_inode_size(eb, item, 0);
                }
        } else if (inode_item) {
                struct btrfs_inode_item *item;
                u32 mode;

                /*
                 * New inode, set nbytes to 0 so that the nbytes comes out
                 * properly when we replay the extents.
                 */
                item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
                btrfs_set_inode_nbytes(eb, item, 0);

                /*
                 * If this is a directory we need to reset the i_size to 0 so
                 * that we can set it up properly when replaying the rest of
                 * the items in this log.
                 */
                mode = btrfs_inode_mode(eb, item);
                if (S_ISDIR(mode))
                        btrfs_set_inode_size(eb, item, 0);
        }
insert:
        btrfs_release_path(path);
        /*
         * Try to insert the key into the destination tree. The
         * skip_release_on_error flag is set around the call because the
         * -EEXIST/-EOVERFLOW handling below still uses path->nodes[0] and
         * path->slots[0].
         */
        path->skip_release_on_error = 1;
        ret = btrfs_insert_empty_item(trans, root, path,
                                      key, item_size);
        path->skip_release_on_error = 0;

        /* Make sure any existing item is the correct size. */
        if (ret == -EEXIST || ret == -EOVERFLOW) {
                u32 found_size;
                found_size = btrfs_item_size(path->nodes[0],
                                                path->slots[0]);
                if (found_size > item_size)
                        btrfs_truncate_item(trans, path, item_size, 1);
                else if (found_size < item_size)
                        btrfs_extend_item(trans, path, item_size - found_size);
        } else if (ret) {
                return ret;
        }
        dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
                                        path->slots[0]);

        /*
         * Don't overwrite an existing inode if the generation number
         * was logged as zero.  This is done when the tree logging code
         * is just logging an inode to make sure it exists after recovery.
         *
         * Also, don't overwrite i_size on directories during replay.
         * Log replay inserts and removes directory items based on the
         * state of the tree found in the subvolume, and i_size is modified
         * as it goes.
         */
        if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
                struct btrfs_inode_item *src_item;
                struct btrfs_inode_item *dst_item;

                src_item = (struct btrfs_inode_item *)src_ptr;
                dst_item = (struct btrfs_inode_item *)dst_ptr;

                if (btrfs_inode_generation(eb, src_item) == 0) {
                        struct extent_buffer *dst_eb = path->nodes[0];
                        const u64 ino_size = btrfs_inode_size(eb, src_item);

                        /*
                         * For regular files an ino_size == 0 is used only when
                         * logging that an inode exists, as part of a directory
                         * fsync, and the inode wasn't fsynced before. In this
                         * case don't set the size of the inode in the fs/subvol
                         * tree, otherwise we would be throwing valid data away.
                         */
                        if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
                            S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
                            ino_size != 0)
                                btrfs_set_inode_size(dst_eb, dst_item, ino_size);
                        goto no_copy;
                }

                /*
                 * Both are directories: remember the destination's i_size so
                 * it can be restored after the item copy below.
                 */
                if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
                    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
                        save_old_i_size = 1;
                        saved_i_size = btrfs_inode_size(path->nodes[0],
                                                        dst_item);
                }
        }

        /* Copy the whole source item over the destination item. */
        copy_extent_buffer(path->nodes[0], eb, dst_ptr,
                           src_ptr, item_size);

        if (save_old_i_size) {
                struct btrfs_inode_item *dst_item;
                dst_item = (struct btrfs_inode_item *)dst_ptr;
                btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
        }

        /* Make sure the generation is filled in. */
        if (key->type == BTRFS_INODE_ITEM_KEY) {
                struct btrfs_inode_item *dst_item;
                dst_item = (struct btrfs_inode_item *)dst_ptr;
                if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
                        btrfs_set_inode_generation(path->nodes[0], dst_item,
                                                   trans->transid);
                }
        }
no_copy:
        btrfs_mark_buffer_dirty(trans, path->nodes[0]);
        btrfs_release_path(path);
        return 0;
}

static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
                               struct fscrypt_str *name)
{
        char *buf;

        buf = kmalloc(len, GFP_NOFS);
        if (!buf)
                return -ENOMEM;

        read_extent_buffer(eb, buf, (unsigned long)start, len);
        name->name = buf;
        name->len = len;
        return 0;
}

/*
 * Simple helper to read the inode @objectid off the disk from a given root.
 * This can only be called for subvolume roots and not for the log.
 *
 * Any error reported by btrfs_iget() is collapsed into a NULL return, so
 * callers only have to check for NULL.
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
                                             u64 objectid)
{
        struct inode *found = btrfs_iget(root->fs_info->sb, objectid, root);

        return IS_ERR(found) ? NULL : found;
}

/*
 * Replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * Extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 *
 * File extent items whose type is not regular, prealloc or inline are
 * ignored.
 *
 * Returns 0 on success (or when there was nothing to do), -EIO if the inode
 * could not be read, or the negative error returned by one of the helpers
 * called below.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      struct btrfs_path *path,
                                      struct extent_buffer *eb, int slot,
                                      struct btrfs_key *key)
{
        struct btrfs_drop_extents_args drop_args = { 0 };
        struct btrfs_fs_info *fs_info = root->fs_info;
        int found_type;
        u64 extent_end;
        u64 start = key->offset;
        u64 nbytes = 0;
        struct btrfs_file_extent_item *item;
        struct inode *inode = NULL;
        unsigned long size;
        int ret = 0;

        item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
        found_type = btrfs_file_extent_type(eb, item);

        /* Compute the file range covered and the bytes to account. */
        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                nbytes = btrfs_file_extent_num_bytes(eb, item);
                extent_end = start + nbytes;

                /*
                 * We don't add to the inodes nbytes if we are prealloc or a
                 * hole.
                 */
                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
                        nbytes = 0;
        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                size = btrfs_file_extent_ram_bytes(eb, item);
                nbytes = btrfs_file_extent_ram_bytes(eb, item);
                extent_end = ALIGN(start + size,
                                   fs_info->sectorsize);
        } else {
                /* Unknown extent type, nothing to replay. */
                ret = 0;
                goto out;
        }

        inode = read_one_inode(root, key->objectid);
        if (!inode) {
                ret = -EIO;
                goto out;
        }

        /*
         * First check to see if we already have this extent in the
         * file.  This must be done before the btrfs_drop_extents run
         * so we don't try to drop this extent.
         *
         * NOTE(review): a negative return value from
         * btrfs_lookup_file_extent() is not acted upon here; ret is
         * overwritten by the btrfs_drop_extents() call below - confirm this
         * is intended.
         */
        ret = btrfs_lookup_file_extent(trans, root, path,
                        btrfs_ino(BTRFS_I(inode)), start, 0);

        if (ret == 0 &&
            (found_type == BTRFS_FILE_EXTENT_REG ||
             found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
                struct btrfs_file_extent_item cmp1;
                struct btrfs_file_extent_item cmp2;
                struct btrfs_file_extent_item *existing;
                struct extent_buffer *leaf;

                leaf = path->nodes[0];
                existing = btrfs_item_ptr(leaf, path->slots[0],
                                          struct btrfs_file_extent_item);

                read_extent_buffer(eb, &cmp1, (unsigned long)item,
                                   sizeof(cmp1));
                read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
                                   sizeof(cmp2));

                /*
                 * We already have a pointer to this exact extent,
                 * we don't have to do anything.
                 */
                if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
                        btrfs_release_path(path);
                        goto out;
                }
        }
        btrfs_release_path(path);

        /* Drop any overlapping extents. */
        drop_args.start = start;
        drop_args.end = extent_end;
        drop_args.drop_cache = true;
        ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
        if (ret)
                goto out;

        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                u64 offset;
                unsigned long dest_offset;
                struct btrfs_key ins;

                /*
                 * A disk_bytenr of 0 with the NO_HOLES feature enabled: no
                 * file extent item is inserted, only the inode is updated.
                 */
                if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
                    btrfs_fs_incompat(fs_info, NO_HOLES))
                        goto update_inode;

                ret = btrfs_insert_empty_item(trans, root, path, key,
                                              sizeof(*item));
                if (ret)
                        goto out;
                dest_offset = btrfs_item_ptr_offset(path->nodes[0],
                                                    path->slots[0]);
                copy_extent_buffer(path->nodes[0], eb, dest_offset,
                                (unsigned long)item,  sizeof(*item));

                /* Key describing the data extent in the extent tree. */
                ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
                ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
                ins.type = BTRFS_EXTENT_ITEM_KEY;
                offset = key->offset - btrfs_file_extent_offset(eb, item);

                /*
                 * Manually record dirty extent, as here we did a shallow
                 * file extent item copy and skip normal backref update,
                 * but modifying extent tree all by ourselves.
                 * So need to manually record dirty extent for qgroup,
                 * as the owner of the file extent changed from log tree
                 * (doesn't affect qgroup) to fs/file tree (affects qgroup).
                 */
                ret = btrfs_qgroup_trace_extent(trans,
                                btrfs_file_extent_disk_bytenr(eb, item),
                                btrfs_file_extent_disk_num_bytes(eb, item));
                if (ret < 0)
                        goto out;

                if (ins.objectid > 0) {
                        u64 csum_start;
                        u64 csum_end;
                        LIST_HEAD(ordered_sums);

                        /*
                         * Is this extent already allocated in the extent
                         * allocation tree?  If so, just add a reference.
                         */
                        ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
                                                ins.offset);
                        if (ret < 0) {
                                goto out;
                        } else if (ret == 0) {
                                struct btrfs_ref ref = {
                                        .action = BTRFS_ADD_DELAYED_REF,
                                        .bytenr = ins.objectid,
                                        .num_bytes = ins.offset,
                                        .owning_root = btrfs_root_id(root),
                                        .ref_root = btrfs_root_id(root),
                                };
                                btrfs_init_data_ref(&ref, key->objectid, offset,
                                                    0, false);
                                ret = btrfs_inc_extent_ref(trans, &ref);
                                if (ret)
                                        goto out;
                        } else {
                                /*
                                 * Insert the extent pointer in the extent
                                 * allocation tree.
                                 */
                                ret = btrfs_alloc_logged_file_extent(trans,
                                                btrfs_root_id(root),
                                                key->objectid, offset, &ins);
                                if (ret)
                                        goto out;
                        }
                        btrfs_release_path(path);

                        /*
                         * For compressed extents the checksum range is the
                         * whole on-disk extent, otherwise only the part
                         * referenced by this file extent item.
                         */
                        if (btrfs_file_extent_compression(eb, item)) {
                                csum_start = ins.objectid;
                                csum_end = csum_start + ins.offset;
                        } else {
                                csum_start = ins.objectid +
                                        btrfs_file_extent_offset(eb, item);
                                csum_end = csum_start +
                                        btrfs_file_extent_num_bytes(eb, item);
                        }

                        /* Collect the checksums for the range from the log. */
                        ret = btrfs_lookup_csums_list(root->log_root,
                                                csum_start, csum_end - 1,
                                                &ordered_sums, false);
                        if (ret < 0)
                                goto out;
                        ret = 0;
                        /*
                         * Now delete all existing csums in the csum root that
                         * cover our range. We do this because we can have an
                         * extent that is completely referenced by one file
                         * extent item and partially referenced by another
                         * file extent item (like after using the clone or
                         * extent_same ioctls). In this case if we end up doing
                         * the replay of the one that partially references the
                         * extent first, and we do not do the csum deletion
                         * below, we can get 2 csum items in the csum tree that
                         * overlap each other. For example, imagine our log has
                         * the two following file extent items:
                         *
                         * key (257 EXTENT_DATA 409600)
                         *     extent data disk byte 12845056 nr 102400
                         *     extent data offset 20480 nr 20480 ram 102400
                         *
                         * key (257 EXTENT_DATA 819200)
                         *     extent data disk byte 12845056 nr 102400
                         *     extent data offset 0 nr 102400 ram 102400
                         *
                         * Where the second one fully references the 100K extent
                         * that starts at disk byte 12845056, and the log tree
                         * has a single csum item that covers the entire range
                         * of the extent:
                         *
                         * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
                         *
                         * After the first file extent item is replayed, the
                         * csum tree gets the following csum item:
                         *
                         * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
                         *
                         * Which covers the 20K sub-range starting at offset 20K
                         * of our extent. Now when we replay the second file
                         * extent item, if we do not delete existing csum items
                         * that cover any of its blocks, we end up getting two
                         * csum items in our csum tree that overlap each other:
                         *
                         * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
                         * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
                         *
                         * Which is a problem, because after this anyone trying
                         * to look up the checksum of any block of our
                         * extent starting at an offset of 40K or higher, will
                         * end up looking at the second csum item only, which
                         * does not contain the checksum for any block starting
                         * at offset 40K or higher of our extent.
                         *
                         * The loop always drains and frees the whole list;
                         * after the first failure the remaining entries are
                         * only freed and the first error is kept in ret.
                         */
                        while (!list_empty(&ordered_sums)) {
                                struct btrfs_ordered_sum *sums;
                                struct btrfs_root *csum_root;

                                sums = list_entry(ordered_sums.next,
                                                struct btrfs_ordered_sum,
                                                list);
                                csum_root = btrfs_csum_root(fs_info,
                                                            sums->logical);
                                if (!ret)
                                        ret = btrfs_del_csums(trans, csum_root,
                                                              sums->logical,
                                                              sums->len);
                                if (!ret)
                                        ret = btrfs_csum_file_blocks(trans,
                                                                     csum_root,
                                                                     sums);
                                list_del(&sums->list);
                                kfree(sums);
                        }
                        if (ret)
                                goto out;
                } else {
                        btrfs_release_path(path);
                }
        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                /* Inline extents are easy, we just overwrite them. */
                ret = overwrite_item(trans, root, path, eb, slot, key);
                if (ret)
                        goto out;
        }

        ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
                                                extent_end - start);
        if (ret)
                goto out;

update_inode:
        /* Account the replayed bytes against what the drop removed. */
        btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
        ret = btrfs_update_inode(trans, BTRFS_I(inode));
out:
        /* inode is still NULL here when we bailed out before reading it. */
        iput(inode);
        return ret;
}

/*
 * Unlink @name, which refers to @inode, from directory @dir during log replay
 * and then run the delayed items of the transaction.
 *
 * Returns 0 on success, otherwise the error reported by btrfs_unlink_inode()
 * or by btrfs_run_delayed_items().
 */
static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
                                       struct btrfs_inode *dir,
                                       struct btrfs_inode *inode,
                                       const struct fscrypt_str *name)
{
        int err = btrfs_unlink_inode(trans, dir, inode, name);

        /*
         * Name existence checks always look at the fs/subvolume tree, so the
         * delayed items have to be run right after a successful unlink.
         * Otherwise later checks made during log replay could still see a
         * name that no longer exists.
         */
        if (!err)
                err = btrfs_run_delayed_items(trans);

        return err;
}

/*
 * Resolving conflicts between the directory names in the subvolume, the
 * directory names in the log and the names in inode back references may
 * require unlinking inodes from directories.
 *
 * This helper unlinks the single directory item @di (located in the leaf
 * currently held by @path) from directory @dir. The inode the item points to
 * is first recorded through link_to_fixup_dir() and only then unlinked.
 *
 * @path is released before the inode is looked up.
 *
 * Returns 0 on success, -ENOMEM if the name could not be copied out of the
 * leaf, -EIO if the target inode could not be read, or the error of the
 * fixup/unlink step.
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
                                      struct btrfs_path *path,
                                      struct btrfs_inode *dir,
                                      struct btrfs_dir_item *di)
{
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_root *root = dir->root;
        struct btrfs_key location;
        struct fscrypt_str name;
        struct inode *inode;
        int ret;

        btrfs_dir_item_key_to_cpu(leaf, di, &location);

        /* The name bytes are read from right after the dir item header. */
        if (read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name))
                return -ENOMEM;

        btrfs_release_path(path);

        inode = read_one_inode(root, location.objectid);
        if (inode) {
                ret = link_to_fixup_dir(trans, root, path, location.objectid);
                if (!ret)
                        ret = unlink_inode_for_log_replay(trans, dir,
                                                          BTRFS_I(inode), &name);
        } else {
                ret = -EIO;
        }

        kfree(name.name);
        iput(inode);
        return ret;
}

/*
 * Check whether a name and sequence number taken from an inode back reference
 * are already present in directory @dirid and point to inode @objectid.
 *
 * Both the dir index item (looked up by @index and @name) and the dir item
 * (looked up by @name) must exist and reference @objectid for this to count
 * as a match.
 *
 * @path is always released before returning.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 u64 dirid, u64 objectid, u64 index,
                                 struct fscrypt_str *name)
{
        struct btrfs_key location;
        struct btrfs_dir_item *di;
        int ret = 0;

        /* Step 1: the dir index item must exist and target our inode. */
        di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
                                         index, name, 0);
        if (IS_ERR(di)) {
                ret = PTR_ERR(di);
                goto out;
        }
        if (!di)
                goto out;

        btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
        if (location.objectid != objectid)
                goto out;

        /* Step 2: the dir item for the same name must agree as well. */
        btrfs_release_path(path);
        di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
        if (IS_ERR(di)) {
                ret = PTR_ERR(di);
        } else if (di) {
                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
                ret = (location.objectid == objectid);
        }
out:
        btrfs_release_path(path);
        return ret;
}

/*
 * Check whether the log tree @log has an inode back reference item at @key
 * that contains @name. This is used to decide whether a back reference found
 * in the subvolume conflicts with what is in the log.
 *
 * A single inode backref item may hold several refs. Replay handles one
 * reference at a time and must not delete a valid link from the subvolume
 * when that same link is also present in the log.
 *
 * For BTRFS_INODE_EXTREF_KEY keys the name is matched together with
 * @ref_objectid; for any other key type only the name is matched.
 *
 * Returns 1 if the name is in the log, 0 if it is not (including when no item
 * with exactly @key exists), and < 0 on error.
 */
static noinline int backref_in_log(struct btrfs_root *log,
                                   struct btrfs_key *key,
                                   u64 ref_objectid,
                                   const struct fscrypt_str *name)
{
        struct btrfs_path *path = btrfs_alloc_path();
        int ret;

        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
        if (ret == 1) {
                /* No item with this exact key: the name is not logged. */
                ret = 0;
        } else if (ret >= 0) {
                struct extent_buffer *leaf = path->nodes[0];
                int slot = path->slots[0];

                if (key->type == BTRFS_INODE_EXTREF_KEY)
                        ret = btrfs_find_name_in_ext_backref(leaf, slot,
                                                ref_objectid, name) ? 1 : 0;
                else
                        ret = btrfs_find_name_in_backref(leaf, slot,
                                                         name) ? 1 : 0;
        }

        btrfs_free_path(path);
        return ret;
}

/*
 * Remove from the subvolume tree everything that conflicts with the link
 * (@dir, @name, @ref_index) -> @inode that log replay is about to add.
 *
 * The work is done in four steps:
 *  1) Walk every name in the inode ref item keyed by
 *     (@inode_objectid, BTRFS_INODE_REF_KEY, @parent_objectid). Any name that
 *     is not also found in @log_root is unlinked from @dir, after which the
 *     whole search restarts.
 *  2) Do the same for the extended ref item found for @name, considering only
 *     entries whose parent is @parent_objectid.
 *  3) Drop a dir index item in @dir that uses @ref_index.
 *  4) Drop a dir item in @dir that uses @name.
 *
 * Returns 0 on success, 1 if the inode ref item found has a key whose objectid
 * equals its offset (a back ref for the root directory, nothing to do), and
 * < 0 on error. On the early return paths @path may still hold a leaf; the
 * caller in this file releases it.
 */
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  struct btrfs_root *log_root,
                                  struct btrfs_inode *dir,
                                  struct btrfs_inode *inode,
                                  u64 inode_objectid, u64 parent_objectid,
                                  u64 ref_index, struct fscrypt_str *name)
{
        int ret;
        struct extent_buffer *leaf;
        struct btrfs_dir_item *di;
        struct btrfs_key search_key;
        struct btrfs_inode_extref *extref;

again:
        /* Search old style refs */
        search_key.objectid = inode_objectid;
        search_key.type = BTRFS_INODE_REF_KEY;
        search_key.offset = parent_objectid;
        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
        if (ret == 0) {
                struct btrfs_inode_ref *victim_ref;
                unsigned long ptr;
                unsigned long ptr_end;

                leaf = path->nodes[0];

                /*
                 * Are we trying to overwrite a back ref for the root
                 * directory? If so, just jump out, we're done.
                 */
                if (search_key.objectid == search_key.offset)
                        return 1;

                /*
                 * Check all the names in this back reference to see if they
                 * are in the log. If so, we allow them to stay, otherwise
                 * they must be unlinked as a conflict.
                 */
                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
                ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
                while (ptr < ptr_end) {
                        struct fscrypt_str victim_name;

                        victim_ref = (struct btrfs_inode_ref *)ptr;
                        ret = read_alloc_one_name(leaf, (victim_ref + 1),
                                 btrfs_inode_ref_name_len(leaf, victim_ref),
                                 &victim_name);
                        if (ret)
                                return ret;

                        ret = backref_in_log(log_root, &search_key,
                                             parent_objectid, &victim_name);
                        if (ret < 0) {
                                kfree(victim_name.name);
                                return ret;
                        } else if (!ret) {
                                /*
                                 * Bump the link count before unlinking the
                                 * conflicting name, then start over because
                                 * the path was released and the item changed.
                                 */
                                inc_nlink(&inode->vfs_inode);
                                btrfs_release_path(path);

                                ret = unlink_inode_for_log_replay(trans, dir, inode,
                                                &victim_name);
                                kfree(victim_name.name);
                                if (ret)
                                        return ret;
                                goto again;
                        }
                        kfree(victim_name.name);

                        /* Only the buffer was freed; the length is still valid. */
                        ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
                }
        }
        btrfs_release_path(path);

        /* Same search but for extended refs */
        extref = btrfs_lookup_inode_extref(NULL, root, path, name,
                                           inode_objectid, parent_objectid, 0,
                                           0);
        if (IS_ERR(extref)) {
                return PTR_ERR(extref);
        } else if (extref) {
                u32 item_size;
                u32 cur_offset = 0;
                unsigned long base;
                struct inode *victim_parent;

                leaf = path->nodes[0];

                item_size = btrfs_item_size(leaf, path->slots[0]);
                base = btrfs_item_ptr_offset(leaf, path->slots[0]);

                while (cur_offset < item_size) {
                        struct fscrypt_str victim_name;

                        extref = (struct btrfs_inode_extref *)(base + cur_offset);

                        /*
                         * Load the name length before the parent check: the
                         * "next" label needs it to step over this entry even
                         * when the entry is skipped without reading its name.
                         */
                        victim_name.len = btrfs_inode_extref_name_len(leaf, extref);

                        if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
                                goto next;

                        ret = read_alloc_one_name(leaf, &extref->name,
                                                  victim_name.len, &victim_name);
                        if (ret)
                                return ret;

                        search_key.objectid = inode_objectid;
                        search_key.type = BTRFS_INODE_EXTREF_KEY;
                        search_key.offset = btrfs_extref_hash(parent_objectid,
                                                              victim_name.name,
                                                              victim_name.len);
                        ret = backref_in_log(log_root, &search_key,
                                             parent_objectid, &victim_name);
                        if (ret < 0) {
                                kfree(victim_name.name);
                                return ret;
                        } else if (!ret) {
                                /* -ENOENT is returned if the parent cannot be read. */
                                ret = -ENOENT;
                                victim_parent = read_one_inode(root,
                                                parent_objectid);
                                if (victim_parent) {
                                        inc_nlink(&inode->vfs_inode);
                                        btrfs_release_path(path);

                                        ret = unlink_inode_for_log_replay(trans,
                                                        BTRFS_I(victim_parent),
                                                        inode, &victim_name);
                                }
                                iput(victim_parent);
                                kfree(victim_name.name);
                                if (ret)
                                        return ret;
                                goto again;
                        }
                        kfree(victim_name.name);
next:
                        cur_offset += victim_name.len + sizeof(*extref);
                }
        }
        btrfs_release_path(path);

        /* look for a conflicting sequence number */
        di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
                                         ref_index, name, 0);
        if (IS_ERR(di)) {
                return PTR_ERR(di);
        } else if (di) {
                ret = drop_one_dir_item(trans, path, dir, di);
                if (ret)
                        return ret;
        }
        btrfs_release_path(path);

        /* look for a conflicting name */
        di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
        if (IS_ERR(di)) {
                return PTR_ERR(di);
        } else if (di) {
                ret = drop_one_dir_item(trans, path, dir, di);
                if (ret)
                        return ret;
        }
        btrfs_release_path(path);

        return 0;
}

/*
 * Decode the inode extref entry located at @ref_ptr inside @eb.
 *
 * @name receives a copy of the entry's name made by read_alloc_one_name();
 * callers in this file release it with kfree(name->name).
 * @index and @parent_objectid are optional outputs, pass NULL to skip them.
 *
 * Returns 0 on success or the error from read_alloc_one_name(), in which case
 * the optional outputs are left untouched.
 */
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
                             struct fscrypt_str *name, u64 *index,
                             u64 *parent_objectid)
{
        struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ref_ptr;
        int ret;

        ret = read_alloc_one_name(eb, &extref->name,
                                  btrfs_inode_extref_name_len(eb, extref), name);
        if (ret == 0) {
                if (index)
                        *index = btrfs_inode_extref_index(eb, extref);
                if (parent_objectid)
                        *parent_objectid = btrfs_inode_extref_parent(eb, extref);
        }

        return ret;
}

/*
 * Decode the inode ref entry located at @ref_ptr inside @eb.
 *
 * @name receives a copy of the entry's name, which is read from right after
 * the ref header; callers in this file release it with kfree(name->name).
 * @index is an optional output, pass NULL to skip it.
 *
 * Returns 0 on success or the error from read_alloc_one_name().
 */
static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
                          struct fscrypt_str *name, u64 *index)
{
        struct btrfs_inode_ref *ref = (struct btrfs_inode_ref *)ref_ptr;
        int ret;

        ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
                                  name);
        if (ret == 0 && index)
                *index = btrfs_inode_ref_index(eb, ref);

        return ret;
}

/*
 * Take an inode reference item from the log tree and iterate all names from the
 * inode reference item in the subvolume tree with the same key (if it exists).
 * For any name that is not in the inode reference item from the log tree, do a
 * proper unlink of that name (that is, remove its entry from the inode
 * reference item and both dir index keys).
 *
 * @log_eb/@log_slot locate the item in the log tree, @key is its key and is
 * also the key searched for in @root. Both BTRFS_INODE_REF_KEY and
 * BTRFS_INODE_EXTREF_KEY items are handled.
 *
 * @path is released on return. Returns 0 on success (including when @root has
 * no item with @key) and < 0 on error.
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct btrfs_inode *inode,
                                 struct extent_buffer *log_eb,
                                 int log_slot,
                                 struct btrfs_key *key)
{
        int ret;
        unsigned long ref_ptr;
        unsigned long ref_end;
        struct extent_buffer *eb;

again:
        /* Every unlink below releases the path, so the search starts over. */
        btrfs_release_path(path);
        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
        if (ret > 0) {
                /* No item with this key in the subvolume: nothing to unlink. */
                ret = 0;
                goto out;
        }
        if (ret < 0)
                goto out;

        eb = path->nodes[0];
        ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
        ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
        while (ref_ptr < ref_end) {
                struct fscrypt_str name;
                u64 parent_id;

                if (key->type == BTRFS_INODE_EXTREF_KEY) {
                        /* Extrefs carry their parent inside each entry. */
                        ret = extref_get_fields(eb, ref_ptr, &name,
                                                NULL, &parent_id);
                } else {
                        /* For plain refs the parent is the key offset. */
                        parent_id = key->offset;
                        ret = ref_get_fields(eb, ref_ptr, &name, NULL);
                }
                if (ret)
                        goto out;

                /* Is this subvolume name also present in the logged item? */
                if (key->type == BTRFS_INODE_EXTREF_KEY)
                        ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
                                                               parent_id, &name);
                else
                        ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);

                if (!ret) {
                        struct inode *dir;

                        /* Not in the log: unlink it, then rescan the item. */
                        btrfs_release_path(path);
                        dir = read_one_inode(root, parent_id);
                        if (!dir) {
                                ret = -ENOENT;
                                kfree(name.name);
                                goto out;
                        }
                        ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
                                                 inode, &name);
                        kfree(name.name);
                        iput(dir);
                        if (ret)
                                goto out;
                        goto again;
                }

                /*
                 * Only the name buffer is freed; name.len is still valid and
                 * is used to advance to the next entry in the item.
                 */
                kfree(name.name);
                ref_ptr += name.len;
                if (key->type == BTRFS_INODE_EXTREF_KEY)
                        ref_ptr += sizeof(struct btrfs_inode_extref);
                else
                        ref_ptr += sizeof(struct btrfs_inode_ref);
        }
        ret = 0;
 out:
        btrfs_release_path(path);
        return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_root *log,
                                  struct btrfs_path *path,
                                  struct extent_buffer *eb, int slot,
                                  struct btrfs_key *key)
{
        struct inode *dir = NULL;
        struct inode *inode = NULL;
        unsigned long ref_ptr;
        unsigned long ref_end;
        struct fscrypt_str name;
        int ret;
        int log_ref_ver = 0;
        u64 parent_objectid;
        u64 inode_objectid;
        u64 ref_index = 0;
        int ref_struct_size;

        ref_ptr = btrfs_item_ptr_offset(eb, slot);
        ref_end = ref_ptr + btrfs_item_size(eb, slot);

        if (key->type == BTRFS_INODE_EXTREF_KEY) {
                struct btrfs_inode_extref *r;

                ref_struct_size = sizeof(struct btrfs_inode_extref);
                log_ref_ver = 1;
                r = (struct btrfs_inode_extref *)ref_ptr;
                parent_objectid = btrfs_inode_extref_parent(eb, r);
        } else {
                ref_struct_size = sizeof(struct btrfs_inode_ref);
                parent_objectid = key->offset;
        }
        inode_objectid = key->objectid;

        /*
         * it is possible that we didn't log all the parent directories
         * for a given inode.  If we don't find the dir, just don't
         * copy the back ref in.  The link count fixup code will take
         * care of the rest
         */
        dir = read_one_inode(root, parent_objectid);
        if (!dir) {
                ret = -ENOENT;
                goto out;
        }

        inode = read_one_inode(root, inode_objectid);
        if (!inode) {
                ret = -EIO;
                goto out;
        }

        while (ref_ptr < ref_end) {
                if (log_ref_ver) {
                        ret = extref_get_fields(eb, ref_ptr, &name,
                                                &ref_index, &parent_objectid);
                        /*
                         * parent object can change from one array
                         * item to another.
                         */
                        if (!dir)
                                dir = read_one_inode(root, parent_objectid);
                        if (!dir) {
                                ret = -ENOENT;
                                goto out;
                        }
                } else {
                        ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
                }
                if (ret)
                        goto out;

                ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
                                   btrfs_ino(BTRFS_I(inode)), ref_index, &name);
                if (ret < 0) {
                        goto out;
                } else if (ret == 0) {
                        /*
                         * look for a conflicting back reference in the
                         * metadata. if we find one we have to unlink that name
                         * of the file before we add our new link.  Later on, we
                         * overwrite any existing back reference, and we don't
                         * want to create dangling pointers in the directory.
                         */
                        ret = __add_inode_ref(trans, root, path, log,
                                              BTRFS_I(dir), BTRFS_I(inode),
                                              inode_objectid, parent_objectid,
                                              ref_index, &name);
                        if (ret) {
                                if (ret == 1)
                                        ret = 0;
                                goto out;
                        }

                        /* insert our name */
                        ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
                                             &name, 0, ref_index);
                        if (ret)
                                goto out;

                        ret = btrfs_update_inode(trans, BTRFS_I(inode));
                        if (ret)
                                goto out;
                }
                /* Else, ret == 1, we already have a perfect match, we're done. */

                ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
                kfree(name.name);
                name.name = NULL;
                if (log_ref_ver) {
                        iput(dir);
                        dir = NULL;
                }
        }

        /*
         * Before we overwrite the inode reference item in the subvolume tree
         * with the item from the log tree, we must unlink all names from the
         * parent directory that are in the subvolume's tree inode reference
         * item, otherwise we end up with an inconsistent subvolume tree where
         * dir index entries exist for a name but there is no inode reference
         * item with the same name.
         */
        ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
                                    key);
        if (ret)
                goto out;

        /* finally write the back reference in the inode */
        ret = overwrite_item(trans, root, path, eb, slot, key);
out:
        btrfs_release_path(path);
        kfree(name.name);
        iput(dir);
        iput(inode);
        return ret;
}

/*
 * Count the names stored in all extended inode ref items of @inode.
 *
 * Items are fetched one after another with btrfs_find_one_extref(), resuming
 * each time just past the offset of the previous item, and every entry packed
 * inside an item counts as one link.
 *
 * @path is released on return. Returns the number of names found, or a
 * negative error other than -ENOENT (-ENOENT simply ends the iteration).
 */
static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
{
        const u64 ino = btrfs_ino(inode);
        unsigned int count = 0;
        u64 offset = 0;
        int ret;

        for (;;) {
                struct btrfs_inode_extref *extref;
                struct extent_buffer *leaf;
                unsigned long base;
                u32 size;
                u32 pos = 0;

                ret = btrfs_find_one_extref(inode->root, ino, offset,
                                            path, &extref, &offset);
                if (ret)
                        break;

                leaf = path->nodes[0];
                size = btrfs_item_size(leaf, path->slots[0]);
                base = btrfs_item_ptr_offset(leaf, path->slots[0]);

                /* Step over each header + name pair inside this item. */
                while (pos < size) {
                        int name_len;

                        extref = (struct btrfs_inode_extref *)(base + pos);
                        name_len = btrfs_inode_extref_name_len(leaf, extref);
                        pos += name_len + sizeof(*extref);
                        count++;
                }

                /* Continue after the item that was just processed. */
                offset++;
                btrfs_release_path(path);
        }
        btrfs_release_path(path);

        if (ret < 0 && ret != -ENOENT)
                return ret;
        return count;
}

/*
 * Count the names stored in all BTRFS_INODE_REF_KEY items of @inode.
 *
 * The items are visited from the highest key offset downwards: the search
 * starts at offset (u64)-1, steps back to the previous slot, and walks the
 * leaf backwards until the key no longer belongs to this inode's refs.
 *
 * @path is released on return. Returns the number of names found, or the
 * negative error from btrfs_search_slot() so the caller does not act on a
 * partial count.
 */
static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
{
        int ret;
        struct btrfs_key key;
        unsigned int nlink = 0;
        unsigned long ptr;
        unsigned long ptr_end;
        int name_len;
        u64 ino = btrfs_ino(inode);

        key.objectid = ino;
        key.type = BTRFS_INODE_REF_KEY;
        key.offset = (u64)-1;

        while (1) {
                ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
                if (ret < 0)
                        break;
                if (ret > 0) {
                        /* No exact match: look at the item just before. */
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }
process_slot:
                btrfs_item_key_to_cpu(path->nodes[0], &key,
                                      path->slots[0]);
                if (key.objectid != ino ||
                    key.type != BTRFS_INODE_REF_KEY)
                        break;
                ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
                ptr_end = ptr + btrfs_item_size(path->nodes[0],
                                                   path->slots[0]);
                /* Each entry is a ref header followed by its name. */
                while (ptr < ptr_end) {
                        struct btrfs_inode_ref *ref;

                        ref = (struct btrfs_inode_ref *)ptr;
                        name_len = btrfs_inode_ref_name_len(path->nodes[0],
                                                            ref);
                        ptr = (unsigned long)(ref + 1) + name_len;
                        nlink++;
                }

                if (key.offset == 0)
                        break;
                if (path->slots[0] > 0) {
                        /* Stay in this leaf and process the previous slot. */
                        path->slots[0]--;
                        goto process_slot;
                }
                /* First slot of the leaf: search again for a lower offset. */
                key.offset--;
                btrfs_release_path(path);
        }
        btrfs_release_path(path);

        /* Only a failed search leaves a negative value in ret. */
        if (ret < 0)
                return ret;
        return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
                                           struct inode *inode)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_path *path;
        int ret;
        u64 nlink = 0;
        u64 ino = btrfs_ino(BTRFS_I(inode));

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = count_inode_refs(BTRFS_I(inode), path);
        if (ret < 0)
                goto out;

        nlink = ret;

        ret = count_inode_extrefs(BTRFS_I(inode), path);
        if (ret < 0)
                goto out;

        nlink += ret;

        ret = 0;

        if (nlink != inode->i_nlink) {
                set_nlink(inode, nlink);
                ret = btrfs_update_inode(trans, BTRFS_I(inode));
                if (ret)
                        goto out;
        }
        BTRFS_I(inode)->index_cnt = (u64)-1;

        if (inode->i_nlink == 0) {
                if (S_ISDIR(inode->i_mode)) {
                        ret = replay_dir_deletes(trans, root, NULL, path,
                                                 ino, 1);
                        if (ret)
                                goto out;
                }
                ret = btrfs_insert_orphan_item(trans, root, ino);
                if (ret == -EEXIST)
                        ret = 0;
        }

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Process every item recorded under BTRFS_TREE_LOG_FIXUP_OBJECTID in @root:
 * delete the item, read the inode whose number is stored in the key offset,
 * and fix its link count with fixup_inode_link_count().
 *
 * Items are consumed from the highest offset downwards, and the search is
 * restarted from (u64)-1 after each one.
 *
 * @path is released on return. Returns 0 on success, -EIO if an inode cannot
 * be read, or the error of a failing helper.
 */
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            struct btrfs_path *path)
{
        int ret;
        struct btrfs_key key;
        struct inode *inode;

        key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
        key.type = BTRFS_ORPHAN_ITEM_KEY;
        key.offset = (u64)-1;
        while (1) {
                /*
                 * NOTE(review): the -1/1 arguments presumably prepare the
                 * path for the item deletion below - confirm against
                 * btrfs_search_slot().
                 */
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0)
                        break;

                if (ret == 1) {
                        /* No exact match: use the item just before, if any. */
                        ret = 0;
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }

                /* Stop once we have left the range of fixup items. */
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
                    key.type != BTRFS_ORPHAN_ITEM_KEY)
                        break;

                ret = btrfs_del_item(trans, root, path);
                if (ret)
                        break;

                /* key.offset holds the number of the inode to fix up. */
                btrfs_release_path(path);
                inode = read_one_inode(root, key.offset);
                if (!inode) {
                        ret = -EIO;
                        break;
                }

                ret = fixup_inode_link_count(trans, inode);
                iput(inode);
                if (ret)
                        break;

                /*
                 * fixup on a directory may create new entries,
                 * make sure we always look for the highest possible
                 * offset
                 */
                key.offset = (u64)-1;
        }
        btrfs_release_path(path);
        return ret;
}


/*
 * Record inode @objectid in the fixup dir so that its link count is checked
 * once replay is done. The record is an empty item keyed by
 * (BTRFS_TREE_LOG_FIXUP_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, @objectid).
 *
 * When the record is newly inserted the inode's link count is raised (set to
 * 1 if it was 0, incremented otherwise) so the inode won't go away until it
 * is checked. If the record already exists nothing else is done.
 *
 * Returns 0 on success, -EIO if the inode cannot be read, or the error from
 * the item insertion or the inode update.
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      struct btrfs_path *path,
                                      u64 objectid)
{
        struct btrfs_key key = {
                .objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID,
                .type = BTRFS_ORPHAN_ITEM_KEY,
                .offset = objectid,
        };
        struct inode *inode;
        int ret;

        inode = read_one_inode(root, objectid);
        if (!inode)
                return -EIO;

        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
        btrfs_release_path(path);

        switch (ret) {
        case 0:
                /* Newly recorded: pin the inode with an extra link. */
                if (inode->i_nlink)
                        inc_nlink(inode);
                else
                        set_nlink(inode, 1);
                ret = btrfs_update_inode(trans, BTRFS_I(inode));
                break;
        case -EEXIST:
                /* Already recorded earlier during this replay. */
                ret = 0;
                break;
        default:
                break;
        }

        iput(inode);
        return ret;
}

/*
 * When replaying the log for a directory, names are only inserted for inodes
 * that actually exist. In other words, an fsync on a directory does not
 * implicitly fsync all the new files in it.
 *
 * Links @name at @index in directory @dirid to the inode given by
 * @location->objectid.
 *
 * Returns 0 on success, -ENOENT if the target inode cannot be read, -EIO if
 * the directory inode cannot be read, or the error from btrfs_add_link().
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    u64 dirid, u64 index,
                                    const struct fscrypt_str *name,
                                    struct btrfs_key *location)
{
        struct inode *inode = read_one_inode(root, location->objectid);
        struct inode *dir;
        int ret;

        if (!inode)
                return -ENOENT;

        dir = read_one_inode(root, dirid);
        if (dir)
                ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
                                     1, index);
        else
                ret = -EIO;

        /* FIXME, put inode into FIXUP list */

        iput(inode);
        iput(dir);
        return ret;
}

/*
 * Decide what to do with an existing directory entry (@dst_di, located in the
 * leaf at @path->nodes[0]) that has the same name as the entry being replayed
 * from the log.
 *
 * @log_key and @log_flags describe the entry from the log, @exists tells
 * whether the inode the log entry points to exists in the subvolume.
 *
 * Returns 1 if the existing entry already matches the log entry (it is kept),
 * 0 if the existing entry differs but is left in place because the new inode
 * does not exist, otherwise the result of drop_one_dir_item().
 */
static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
                                        struct btrfs_inode *dir,
                                        struct btrfs_path *path,
                                        struct btrfs_dir_item *dst_di,
                                        const struct btrfs_key *log_key,
                                        u8 log_flags,
                                        bool exists)
{
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_key cur_key;
        bool same_entry;

        btrfs_dir_item_key_to_cpu(leaf, dst_di, &cur_key);

        same_entry = cur_key.objectid == log_key->objectid &&
                     cur_key.type == log_key->type &&
                     cur_key.offset == log_key->offset &&
                     btrfs_dir_flags(leaf, dst_di) == log_flags;

        /* The existing dentry points to the same inode, don't delete it. */
        if (same_entry)
                return 1;

        /*
         * Only drop the conflicting directory entry when the inode for the
         * new entry actually exists.
         */
        return exists ? drop_one_dir_item(trans, path, dir, dst_di) : 0;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    struct btrfs_path *path,
                                    struct extent_buffer *eb,
                                    struct btrfs_dir_item *di,
                                    struct btrfs_key *key)
{
        struct fscrypt_str name;
        struct btrfs_dir_item *dir_dst_di;
        struct btrfs_dir_item *index_dst_di;
        bool dir_dst_matches = false;
        bool index_dst_matches = false;
        struct btrfs_key log_key;
        struct btrfs_key search_key;
        struct inode *dir;
        u8 log_flags;
        bool exists;
        int ret;
        bool update_size = true;
        bool name_added = false;

        dir = read_one_inode(root, key->objectid);
        if (!dir)
                return -EIO;

        ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
        if (ret)
                goto out;

        log_flags = btrfs_dir_flags(eb, di);
        btrfs_dir_item_key_to_cpu(eb, di, &log_key);
        ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
        btrfs_release_path(path);
        if (ret < 0)
                goto out;
        exists = (ret == 0);
        ret = 0;

        dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
                                           &name, 1);
        if (IS_ERR(dir_dst_di)) {
                ret = PTR_ERR(dir_dst_di);
                goto out;
        } else if (dir_dst_di) {
                ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
                                                   dir_dst_di, &log_key,
                                                   log_flags, exists);
                if (ret < 0)
                        goto out;
                dir_dst_matches = (ret == 1);
        }

        btrfs_release_path(path);

        index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
                                                   key->objectid, key->offset,
                                                   &name, 1);
        if (IS_ERR(index_dst_di)) {
                ret = PTR_ERR(index_dst_di);
                goto out;
        } else if (index_dst_di) {
                ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
                                                   index_dst_di, &log_key,
                                                   log_flags, exists);
                if (ret < 0)
                        goto out;
                index_dst_matches = (ret == 1);
        }

        btrfs_release_path(path);

        if (dir_dst_matches && index_dst_matches) {
                ret = 0;
                update_size = false;
                goto out;
        }

        /*
         * Check if the inode reference exists in the log for the given name,
         * inode and parent inode
         */
        search_key.objectid = log_key.objectid;
        search_key.type = BTRFS_INODE_REF_KEY;
        search_key.offset = key->objectid;
        ret = backref_in_log(root->log_root, &search_key, 0, &name);
        if (ret < 0) {
                goto out;
        } else if (ret) {
                /* The dentry will be added later. */
                ret = 0;
                update_size = false;
                goto out;
        }

        search_key.objectid = log_key.objectid;
        search_key.type = BTRFS_INODE_EXTREF_KEY;
        search_key.offset = key->objectid;
        ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
        if (ret < 0) {
                goto out;
        } else if (ret) {
                /* The dentry will be added later. */
                ret = 0;
                update_size = false;
                goto out;
        }
        btrfs_release_path(path);
        ret = insert_one_name(trans, root, key->objectid, key->offset,
                              &name, &log_key);
        if (ret && ret != -ENOENT && ret != -EEXIST)
                goto out;
        if (!ret)
                name_added = true;
        update_size = false;
        ret = 0;

out:
        if (!ret && update_size) {
                btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
                ret = btrfs_update_inode(trans, BTRFS_I(dir));
        }
        kfree(name.name);
        iput(dir);
        if (!ret && name_added)
                ret = 1;
        return ret;
}

/*
 * Replay one dir item from a BTRFS_DIR_INDEX_KEY key.
 *
 * @eb and @slot locate the item in the log leaf, @key is its key.
 *
 * Returns 0 on success and < 0 on error. The "name was replayed" indication
 * (1) from replay_one_name() is consumed here and never returned, because
 * replay_one_buffer() stops processing the leaf on any non-zero return value.
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root,
                                        struct btrfs_path *path,
                                        struct extent_buffer *eb, int slot,
                                        struct btrfs_key *key)
{
        int ret;
        struct btrfs_dir_item *di;

        /* We only log dir index keys, which only contain a single dir item. */
        ASSERT(key->type == BTRFS_DIR_INDEX_KEY);

        di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
        ret = replay_one_name(trans, root, path, eb, di, key);
        if (ret < 0)
                return ret;

        /*
         * If this entry refers to a non-directory (directories can not have a
         * link count > 1) and it was added in the transaction that was not
         * committed, make sure we fixup the link count of the inode the entry
         * points to. Otherwise something like the following would result in a
         * directory pointing to an inode with a wrong link that does not account
         * for this dir entry:
         *
         * mkdir testdir
         * touch testdir/foo
         * touch testdir/bar
         * sync
         *
         * ln testdir/bar testdir/bar_link
         * ln testdir/foo testdir/foo_link
         * xfs_io -c "fsync" testdir/bar
         *
         * <power failure>
         *
         * mount fs, log replay happens
         *
         * File foo would remain with a link count of 1 when it has two entries
         * pointing to it in the directory testdir. This would make it impossible
         * to ever delete the parent directory as it would result in stale
         * dentries that can never be deleted.
         */
        if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
                struct btrfs_path *fixup_path;
                struct btrfs_key di_key;

                fixup_path = btrfs_alloc_path();
                if (!fixup_path)
                        return -ENOMEM;

                btrfs_dir_item_key_to_cpu(eb, di, &di_key);
                ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
                btrfs_free_path(fixup_path);
                return ret;
        }

        /*
         * Either the name was not replayed (0) or it was replayed for a
         * directory (1). Both are success, so do not leak the positive value
         * to the caller, which would treat it as a reason to stop.
         */
        return 0;
}

/*
 * Directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log,
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 *
 * This helper searches @root for a BTRFS_DIR_LOG_INDEX_KEY item of directory
 * @dirid whose range contains *@start_ret. If there is none, the item in the
 * slot following the search position is used, provided it is a range item of
 * the same directory.
 *
 * Returns 0 with the range stored in *@start_ret / *@end_ret, a non-zero value
 * if no range was found (1, or whatever btrfs_next_leaf() returned) and < 0 on
 * error. @path is always released before returning.
 */
static noinline int find_dir_range(struct btrfs_root *root,
                                   struct btrfs_path *path,
                                   u64 dirid,
                                   u64 *start_ret, u64 *end_ret)
{
        struct btrfs_dir_log_item *range_item;
        struct btrfs_key key;
        u64 range_last;
        int nritems;
        int ret;

        /* The whole index space has been covered already. */
        if (*start_ret == (u64)-1)
                return 1;

        key.objectid = dirid;
        key.type = BTRFS_DIR_LOG_INDEX_KEY;
        key.offset = *start_ret;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                /*
                 * No exact match. Step back one slot, as the previous item may
                 * be a range that starts before *start_ret and covers it.
                 */
                if (path->slots[0] == 0)
                        goto out;
                path->slots[0]--;
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
        }

        if (key.type == BTRFS_DIR_LOG_INDEX_KEY && key.objectid == dirid) {
                range_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                            struct btrfs_dir_log_item);
                range_last = btrfs_dir_log_end(path->nodes[0], range_item);

                if (key.offset <= *start_ret && *start_ret <= range_last) {
                        *start_ret = key.offset;
                        *end_ret = range_last;
                        ret = 0;
                        goto out;
                }
        }

        /* check the next slot in the tree to see if it is a valid item */
        ret = 1;
        nritems = btrfs_header_nritems(path->nodes[0]);
        path->slots[0]++;
        if (path->slots[0] >= nritems) {
                ret = btrfs_next_leaf(root, path);
                if (ret)
                        goto out;
        }

        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
        if (key.objectid != dirid || key.type != BTRFS_DIR_LOG_INDEX_KEY) {
                ret = 1;
                goto out;
        }

        range_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                    struct btrfs_dir_log_item);
        range_last = btrfs_dir_log_end(path->nodes[0], range_item);
        *start_ret = key.offset;
        *end_ret = range_last;
        ret = 0;
out:
        btrfs_release_path(path);
        return ret;
}

/*
 * this looks for a given directory item in the log.  If the directory
 * item is not in the log, the item is removed and the inode it points
 * to is unlinked
 */
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *log,
                                      struct btrfs_path *path,
                                      struct btrfs_path *log_path,
                                      struct inode *dir,
                                      struct btrfs_key *dir_key)
{
        struct btrfs_root *root = BTRFS_I(dir)->root;
        int ret;
        struct extent_buffer *eb;
        int slot;
        struct btrfs_dir_item *di;
        struct fscrypt_str name;
        struct inode *inode = NULL;
        struct btrfs_key location;

        /*
         * Currently we only log dir index keys. Even if we replay a log created
         * by an older kernel that logged both dir index and dir item keys, all
         * we need to do is process the dir index keys, we (and our caller) can
         * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
         */
        ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);

        eb = path->nodes[0];
        slot = path->slots[0];
        di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
        ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
        if (ret)
                goto out;

        if (log) {
                struct btrfs_dir_item *log_di;

                log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
                                                     dir_key->objectid,
                                                     dir_key->offset, &name, 0);
                if (IS_ERR(log_di)) {
                        ret = PTR_ERR(log_di);
                        goto out;
                } else if (log_di) {
                        /* The dentry exists in the log, we have nothing to do. */
                        ret = 0;
                        goto out;
                }
        }

        btrfs_dir_item_key_to_cpu(eb, di, &location);
        btrfs_release_path(path);
        btrfs_release_path(log_path);
        inode = read_one_inode(root, location.objectid);
        if (!inode) {
                ret = -EIO;
                goto out;
        }

        ret = link_to_fixup_dir(trans, root, path, location.objectid);
        if (ret)
                goto out;

        inc_nlink(inode);
        ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
                                          &name);
        /*
         * Unlike dir item keys, dir index keys can only have one name (entry) in
         * them, as there are no key collisions since each key has a unique offset
         * (an index number), so we're done.
         */
out:
        btrfs_release_path(path);
        btrfs_release_path(log_path);
        kfree(name.name);
        iput(inode);
        return ret;
}

/*
 * Delete from the subvolume tree @root every xattr of inode @ino that is not
 * present in the log tree @log.
 *
 * All BTRFS_XATTR_ITEM_KEY items of @ino in @root are walked. An item can hold
 * several entries packed back to back, each one is looked up by name in @log
 * and deleted from @root when the lookup finds nothing.
 *
 * @path is used for the searches in @root and is released before returning.
 *
 * Returns 0 on success and < 0 on error.
 */
static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct btrfs_root *log,
                              struct btrfs_path *path,
                              const u64 ino)
{
        struct btrfs_key search_key;
        struct btrfs_path *log_path;
        int i;
        int nritems;
        int ret;

        log_path = btrfs_alloc_path();
        if (!log_path)
                return -ENOMEM;

        /* Start at the first possible xattr key of the inode. */
        search_key.objectid = ino;
        search_key.type = BTRFS_XATTR_ITEM_KEY;
        search_key.offset = 0;
again:
        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
        if (ret < 0)
                goto out;
process_leaf:
        nritems = btrfs_header_nritems(path->nodes[0]);
        for (i = path->slots[0]; i < nritems; i++) {
                struct btrfs_key key;
                struct btrfs_dir_item *di;
                struct btrfs_dir_item *log_di;
                u32 total_size;
                u32 cur;

                btrfs_item_key_to_cpu(path->nodes[0], &key, i);
                /* Past the last xattr item of this inode, we are done. */
                if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
                        ret = 0;
                        goto out;
                }

                di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
                total_size = btrfs_item_size(path->nodes[0], i);
                cur = 0;
                /* Walk every entry stored in this item. */
                while (cur < total_size) {
                        u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
                        u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
                        u32 this_len = sizeof(*di) + name_len + data_len;
                        char *name;

                        name = kmalloc(name_len, GFP_NOFS);
                        if (!name) {
                                ret = -ENOMEM;
                                goto out;
                        }
                        /* The name bytes follow the dir item header. */
                        read_extent_buffer(path->nodes[0], name,
                                           (unsigned long)(di + 1), name_len);

                        log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
                                                    name, name_len, 0);
                        btrfs_release_path(log_path);
                        if (!log_di) {
                                /* Doesn't exist in log tree, so delete it. */
                                btrfs_release_path(path);
                                di = btrfs_lookup_xattr(trans, root, path, ino,
                                                        name, name_len, -1);
                                kfree(name);
                                if (IS_ERR(di)) {
                                        ret = PTR_ERR(di);
                                        goto out;
                                }
                                ASSERT(di);
                                ret = btrfs_delete_one_dir_name(trans, root,
                                                                path, di);
                                if (ret)
                                        goto out;
                                /*
                                 * The path was released and the item changed,
                                 * so restart the search from the same key.
                                 */
                                btrfs_release_path(path);
                                search_key = key;
                                goto again;
                        }
                        kfree(name);
                        if (IS_ERR(log_di)) {
                                ret = PTR_ERR(log_di);
                                goto out;
                        }
                        /* Entry is in the log too, move on to the next one. */
                        cur += this_len;
                        di = (struct btrfs_dir_item *)((char *)di + this_len);
                }
        }
        ret = btrfs_next_leaf(root, path);
        /* A positive value is not an error here, it ends the walk. */
        if (ret > 0)
                ret = 0;
        else if (ret == 0)
                goto process_leaf;
out:
        btrfs_free_path(log_path);
        btrfs_release_path(path);
        return ret;
}


/*
 * Deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes.  It
 * scans the log to find ranges of keys that the log is authoritative for,
 * and then scans the directory to find items in those ranges that are
 * not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
 *
 * If @del_all is non-zero no ranges are looked up in @log, the whole index
 * space of directory @dirid is processed as a single range instead.
 *
 * @path is released before returning.
 *
 * Returns 0 on success (also when the directory inode does not exist) and
 * < 0 on error.
 */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct btrfs_root *log,
                                       struct btrfs_path *path,
                                       u64 dirid, int del_all)
{
        u64 range_start;
        u64 range_end;
        int ret = 0;
        struct btrfs_key dir_key;
        struct btrfs_key found_key;
        struct btrfs_path *log_path;
        struct inode *dir;

        dir_key.objectid = dirid;
        dir_key.type = BTRFS_DIR_INDEX_KEY;
        log_path = btrfs_alloc_path();
        if (!log_path)
                return -ENOMEM;

        dir = read_one_inode(root, dirid);
        /*
         * It isn't an error if the inode isn't there, that can happen
         * because we replay the deletes before we copy in the inode item
         * from the log.
         */
        if (!dir) {
                btrfs_free_path(log_path);
                return 0;
        }

        range_start = 0;
        range_end = 0;
        /* Outer loop: one iteration per range to process. */
        while (1) {
                if (del_all)
                        range_end = (u64)-1;
                else {
                        ret = find_dir_range(log, path, dirid,
                                             &range_start, &range_end);
                        if (ret < 0)
                                goto out;
                        else if (ret > 0)
                                break;
                }

                dir_key.offset = range_start;
                /* Inner loop: every dir index item of the range in @root. */
                while (1) {
                        int nritems;
                        ret = btrfs_search_slot(NULL, root, &dir_key, path,
                                                0, 0);
                        if (ret < 0)
                                goto out;

                        nritems = btrfs_header_nritems(path->nodes[0]);
                        if (path->slots[0] >= nritems) {
                                ret = btrfs_next_leaf(root, path);
                                if (ret == 1)
                                        break;
                                else if (ret < 0)
                                        goto out;
                        }
                        btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                              path->slots[0]);
                        /* No more dir index items for this directory. */
                        if (found_key.objectid != dirid ||
                            found_key.type != dir_key.type) {
                                ret = 0;
                                goto out;
                        }

                        if (found_key.offset > range_end)
                                break;

                        ret = check_item_in_log(trans, log, path,
                                                log_path, dir,
                                                &found_key);
                        if (ret)
                                goto out;
                        /* Avoid wrapping around when advancing the offset. */
                        if (found_key.offset == (u64)-1)
                                break;
                        dir_key.offset = found_key.offset + 1;
                }
                btrfs_release_path(path);
                /* The end of the index space was reached. */
                if (range_end == (u64)-1)
                        break;
                range_start = range_end + 1;
        }
        ret = 0;
out:
        btrfs_release_path(path);
        btrfs_free_path(log_path);
        iput(dir);
        return ret;
}

/*
 * The process_func used to replay items from the log tree.  This
 * gets called in two different stages.  The first stage just looks
 * for inodes and makes sure they are all copied into the subvolume.
 *
 * The second stage copies all the other item types from the log into
 * the subvolume.  The two stage approach is slower, but gets rid of
 * lots of complexity around inodes referencing other inodes that exist
 * only in the log (references come from either directory items or inode
 * back refs).
 *
 * @eb is read and verified against @gen and @level first. Only leaves
 * (level 0) are processed, for any other level 0 is returned right away.
 * The current stage is taken from @wc->stage and the destination subvolume
 * from @wc->replay_dest.
 *
 * Returns 0 on success and a non-zero value as soon as one of the replay
 * helpers reports one, in which case the rest of the leaf is not processed.
 */
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                             struct walk_control *wc, u64 gen, int level)
{
        int nritems;
        struct btrfs_tree_parent_check check = {
                .transid = gen,
                .level = level
        };
        struct btrfs_path *path;
        struct btrfs_root *root = wc->replay_dest;
        struct btrfs_key key;
        int i;
        int ret;

        ret = btrfs_read_extent_buffer(eb, &check);
        if (ret)
                return ret;

        level = btrfs_header_level(eb);

        if (level != 0)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        nritems = btrfs_header_nritems(eb);
        for (i = 0; i < nritems; i++) {
                btrfs_item_key_to_cpu(eb, &key, i);

                /* inode keys are done during the first stage */
                if (key.type == BTRFS_INODE_ITEM_KEY &&
                    wc->stage == LOG_WALK_REPLAY_INODES) {
                        struct btrfs_inode_item *inode_item;
                        u32 mode;

                        inode_item = btrfs_item_ptr(eb, i,
                                            struct btrfs_inode_item);
                        /*
                         * If we have a tmpfile (O_TMPFILE) that got fsync'ed
                         * and never got linked before the fsync, skip it, as
                         * replaying it is pointless since it would be deleted
                         * later. We skip logging tmpfiles, but it's always
                         * possible we are replaying a log created with a kernel
                         * that used to log tmpfiles.
                         */
                        if (btrfs_inode_nlink(eb, inode_item) == 0) {
                                wc->ignore_cur_inode = true;
                                continue;
                        } else {
                                wc->ignore_cur_inode = false;
                        }
                        /* Deletions are replayed before the inode item is copied. */
                        ret = replay_xattr_deletes(wc->trans, root, log,
                                                   path, key.objectid);
                        if (ret)
                                break;
                        mode = btrfs_inode_mode(eb, inode_item);
                        if (S_ISDIR(mode)) {
                                ret = replay_dir_deletes(wc->trans,
                                         root, log, path, key.objectid, 0);
                                if (ret)
                                        break;
                        }
                        ret = overwrite_item(wc->trans, root, path,
                                             eb, i, &key);
                        if (ret)
                                break;

                        /*
                         * Before replaying extents, truncate the inode to its
                         * size. We need to do it now and not after log replay
                         * because before an fsync we can have prealloc extents
                         * added beyond the inode's i_size. If we did it after,
                         * through orphan cleanup for example, we would drop
                         * those prealloc extents just after replaying them.
                         */
                        if (S_ISREG(mode)) {
                                struct btrfs_drop_extents_args drop_args = { 0 };
                                struct inode *inode;
                                u64 from;

                                inode = read_one_inode(root, key.objectid);
                                if (!inode) {
                                        ret = -EIO;
                                        break;
                                }
                                /* Drop everything from the sector aligned i_size on. */
                                from = ALIGN(i_size_read(inode),
                                             root->fs_info->sectorsize);
                                drop_args.start = from;
                                drop_args.end = (u64)-1;
                                drop_args.drop_cache = true;
                                ret = btrfs_drop_extents(wc->trans, root,
                                                         BTRFS_I(inode),
                                                         &drop_args);
                                if (!ret) {
                                        inode_sub_bytes(inode,
                                                        drop_args.bytes_found);
                                        /* Update the inode's nbytes. */
                                        ret = btrfs_update_inode(wc->trans,
                                                                 BTRFS_I(inode));
                                }
                                iput(inode);
                                if (ret)
                                        break;
                        }

                        /* Register the inode for link count fixup. */
                        ret = link_to_fixup_dir(wc->trans, root,
                                                path, key.objectid);
                        if (ret)
                                break;
                }

                /* Skip items while the current inode is being ignored. */
                if (wc->ignore_cur_inode)
                        continue;

                if (key.type == BTRFS_DIR_INDEX_KEY &&
                    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
                        ret = replay_one_dir_item(wc->trans, root, path,
                                                  eb, i, &key);
                        if (ret)
                                break;
                }

                /* Everything below only runs from LOG_WALK_REPLAY_ALL on. */
                if (wc->stage < LOG_WALK_REPLAY_ALL)
                        continue;

                /* these keys are simply copied */
                if (key.type == BTRFS_XATTR_ITEM_KEY) {
                        ret = overwrite_item(wc->trans, root, path,
                                             eb, i, &key);
                        if (ret)
                                break;
                } else if (key.type == BTRFS_INODE_REF_KEY ||
                           key.type == BTRFS_INODE_EXTREF_KEY) {
                        ret = add_inode_ref(wc->trans, root, log, path,
                                            eb, i, &key);
                        /* -ENOENT from add_inode_ref() is tolerated. */
                        if (ret && ret != -ENOENT)
                                break;
                        ret = 0;
                } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
                        ret = replay_one_extent(wc->trans, root, path,
                                                eb, i, &key);
                        if (ret)
                                break;
                }
                /*
                 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
                 * BTRFS_DIR_INDEX_KEY items which we use to derive the
                 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
                 * older kernel with such keys, ignore them.
                 */
        }
        btrfs_free_path(path);
        return ret;
}

/*
 * Give back the reserved bytes accounted for a log tree extent buffer.
 *
 * The block group containing @start is looked up and one nodesize is
 * subtracted from both its reserved counter and the bytes_reserved counter of
 * its space info. If no block group is found an error is logged and nothing
 * is changed.
 */
static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
{
        struct btrfs_block_group *bg = btrfs_lookup_block_group(fs_info, start);

        if (bg) {
                /* The space info lock is taken before the block group lock. */
                spin_lock(&bg->space_info->lock);
                spin_lock(&bg->lock);
                bg->reserved -= fs_info->nodesize;
                bg->space_info->bytes_reserved -= fs_info->nodesize;
                spin_unlock(&bg->lock);
                spin_unlock(&bg->space_info->lock);

                btrfs_put_block_group(bg);
                return;
        }

        btrfs_err(fs_info, "unable to find block group for %llu", start);
}

/*
 * Clean up a log tree extent buffer that is no longer needed.
 *
 * With the buffer locked, its dirty state is cleared and any writeback is
 * waited for. Then, if there is a transaction handle, the extent is pinned
 * through it, otherwise its reserved bytes are unaccounted directly.
 *
 * Returns 0 on success or the error from btrfs_pin_reserved_extent().
 */
static int clean_log_buffer(struct btrfs_trans_handle *trans,
                            struct extent_buffer *eb)
{
        btrfs_tree_lock(eb);
        btrfs_clear_buffer_dirty(trans, eb);
        wait_on_extent_buffer_writeback(eb);
        btrfs_tree_unlock(eb);

        if (!trans) {
                unaccount_log_buffer(eb->fs_info, eb->start);
                return 0;
        }

        return btrfs_pin_reserved_extent(trans, eb);
}

/*
 * Descend the log tree starting at path->nodes[*level].
 *
 * For nodes above level 1 the child pointed to by the current slot is read,
 * stored in the path one level down, and the walk continues from its slot 0.
 * At level 1 each child (a leaf) is passed to wc->process_func, the slot is
 * advanced, and when wc->free is set the leaf is read and cleaned with
 * clean_log_buffer(); leaves are never stored in the path.
 *
 * On normal exit the slot of the level we stopped at is set to that node's
 * item count, so that the walk up sees the node as fully visited.
 *
 * Returns 0 on success, or the non-zero result of the step that failed.
 */
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct btrfs_path *path, int *level,
                                   struct walk_control *wc)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret = 0;

        while (*level > 0) {
                struct btrfs_tree_parent_check check = { 0 };
                struct extent_buffer *parent = path->nodes[*level];
                struct extent_buffer *child;
                u64 child_bytenr;
                u64 child_gen;

                WARN_ON(btrfs_header_level(parent) != *level);

                /* Every pointer of this node has been visited already. */
                if (path->slots[*level] >= btrfs_header_nritems(parent))
                        break;

                /* Collect what is needed to locate and verify the child. */
                child_bytenr = btrfs_node_blockptr(parent, path->slots[*level]);
                child_gen = btrfs_node_ptr_generation(parent, path->slots[*level]);
                check.transid = child_gen;
                check.level = *level - 1;
                check.has_first_key = true;
                btrfs_node_key_to_cpu(parent, &check.first_key, path->slots[*level]);

                child = btrfs_find_create_tree_block(fs_info, child_bytenr,
                                                     btrfs_header_owner(parent),
                                                     *level - 1);
                if (IS_ERR(child))
                        return PTR_ERR(child);

                if (*level == 1) {
                        /*
                         * The child is a leaf: process it, move on to the next
                         * slot and optionally clean it. Our reference on the
                         * leaf is dropped no matter how this goes.
                         */
                        ret = wc->process_func(root, child, wc, child_gen,
                                               *level - 1);
                        if (!ret) {
                                path->slots[*level]++;
                                if (wc->free) {
                                        ret = btrfs_read_extent_buffer(child, &check);
                                        if (!ret)
                                                ret = clean_log_buffer(trans, child);
                                }
                        }
                        free_extent_buffer(child);
                        if (ret)
                                return ret;
                        continue;
                }

                ret = btrfs_read_extent_buffer(child, &check);
                if (ret) {
                        free_extent_buffer(child);
                        return ret;
                }

                /* Replace whatever the path held one level down with the child. */
                if (path->nodes[*level-1])
                        free_extent_buffer(path->nodes[*level-1]);
                path->nodes[*level-1] = child;
                *level = btrfs_header_level(child);
                path->slots[*level] = 0;
                cond_resched();
        }
        path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);

        cond_resched();
        return 0;
}

/*
 * Climb the path starting at *level, looking for a node that still has
 * unvisited slots.
 *
 * If such a node is found its slot is advanced, *level is set to its level and
 * 0 is returned so the caller can walk down again. Every node that turns out
 * to be fully visited is passed to wc->process_func, cleaned with
 * clean_log_buffer() when wc->free is set, released and removed from the path.
 *
 * Returns 0 when there is more to walk down, 1 when the climb ran out of path
 * nodes (or reached the level limit), or the non-zero result of a failed
 * process_func/clean_log_buffer() call.
 */
static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path, int *level,
                                 struct walk_control *wc)
{
        int lvl;
        int slot;
        int ret;

        for (lvl = *level; lvl < BTRFS_MAX_LEVEL - 1 && path->nodes[lvl]; lvl++) {
                slot = path->slots[lvl];
                if (slot + 1 < btrfs_header_nritems(path->nodes[lvl])) {
                        /* More pointers left here, resume the descent from it. */
                        path->slots[lvl]++;
                        *level = lvl;
                        WARN_ON(*level == 0);
                        return 0;
                }

                /* Node exhausted: process it, optionally clean it, drop it. */
                ret = wc->process_func(root, path->nodes[*level], wc,
                         btrfs_header_generation(path->nodes[*level]),
                         *level);
                if (ret)
                        return ret;

                if (wc->free) {
                        ret = clean_log_buffer(trans, path->nodes[*level]);
                        if (ret)
                                return ret;
                }
                free_extent_buffer(path->nodes[*level]);
                path->nodes[*level] = NULL;
                *level = lvl + 1;
        }
        return 1;
}

/*
 * Walk a whole log tree, calling wc->process_func on its extent buffers.
 *
 * The walk alternates between walk_down_log_tree() and walk_up_log_tree()
 * until one of them reports that it is finished (> 0) or fails (< 0). If the
 * root node is still in the path afterwards it is processed here. When
 * wc->free is set, processed buffers are also cleaned with clean_log_buffer().
 *
 * Returns 0 on success, -ENOMEM if the path can not be allocated, or the
 * non-zero result of the step that failed.
 */
static int walk_log_tree(struct btrfs_trans_handle *trans,
                         struct btrfs_root *log, struct walk_control *wc)
{
        struct btrfs_path *path;
        int orig_level;
        int level;
        int wret;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Seed the path with the root node, holding an extra reference on it. */
        orig_level = btrfs_header_level(log->node);
        level = orig_level;
        path->nodes[level] = log->node;
        atomic_inc(&log->node->refs);
        path->slots[level] = 0;

        do {
                wret = walk_down_log_tree(trans, log, path, &level, wc);
                if (!wret)
                        wret = walk_up_log_tree(trans, log, path, &level, wc);
        } while (!wret);

        if (wret < 0) {
                ret = wret;
                goto out;
        }

        /* was the root node processed? if not, catch it here */
        if (path->nodes[orig_level]) {
                ret = wc->process_func(log, path->nodes[orig_level], wc,
                         btrfs_header_generation(path->nodes[orig_level]),
                         orig_level);
                if (!ret && wc->free)
                        ret = clean_log_buffer(trans, path->nodes[orig_level]);
        }

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Store the root item of a subvolume's log root in the tree of log roots.
 *
 * The item is inserted when the log's log_transid is 1 (the first sync) and
 * updated in place otherwise.
 *
 * Returns the result of btrfs_insert_root() or btrfs_update_root().
 */
static int update_log_root(struct btrfs_trans_handle *trans,
                           struct btrfs_root *log,
                           struct btrfs_root_item *root_item)
{
        struct btrfs_root *log_root_tree = log->fs_info->log_root_tree;

        /* insert root item on the first sync */
        if (log->log_transid == 1)
                return btrfs_insert_root(trans, log_root_tree,
                                         &log->root_key, root_item);

        return btrfs_update_root(trans, log_root_tree,
                                 &log->root_key, root_item);
}

/*
 * Sleep until the log transaction @transid of @root is no longer being
 * committed.
 *
 * The wait ends once log_transid_committed has reached @transid or the
 * log_commit flag for the slot of @transid is clear. root->log_mutex is
 * dropped around each sleep and re-taken afterwards, so the caller is expected
 * to hold it on entry and holds it again on return.
 */
static void wait_log_commit(struct btrfs_root *root, int transid)
{
        DEFINE_WAIT(wait);
        int index = transid % 2;

        /*
         * Only two log transactions can be pending at a time, so if ours is
         * more than 2 older than the current one we know it is done.
         */
        while (1) {
                prepare_to_wait(&root->log_commit_wait[index],
                                &wait, TASK_UNINTERRUPTIBLE);

                if (root->log_transid_committed >= transid ||
                    !atomic_read(&root->log_commit[index]))
                        break;

                mutex_unlock(&root->log_mutex);
                schedule();
                mutex_lock(&root->log_mutex);
        }
        finish_wait(&root->log_commit_wait[index], &wait);
}

/*
 * Sleep until @root has no log writers left (log_writers reads as zero).
 *
 * root->log_mutex is dropped around each sleep and re-taken afterwards, so the
 * caller is expected to hold it on entry and holds it again on return.
 */
static void wait_for_writer(struct btrfs_root *root)
{
        DEFINE_WAIT(wait);

        while (1) {
                prepare_to_wait(&root->log_writer_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (atomic_read(&root->log_writers) == 0)
                        break;

                mutex_unlock(&root->log_mutex);
                schedule();
                mutex_lock(&root->log_mutex);
        }
        finish_wait(&root->log_writer_wait, &wait);
}

/*
 * Put a log context into its initial state for logging @inode.
 *
 * All result fields, flags and counters are reset, the list heads are
 * initialized as empty and no scratch extent buffer is attached.
 */
void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode)
{
        ctx->inode = inode;

        /* Results and log transaction id. */
        ctx->log_ret = 0;
        ctx->log_transid = 0;

        /* State flags, all start out cleared. */
        ctx->log_new_dentries = false;
        ctx->logging_new_name = false;
        ctx->logging_new_delayed_dentries = false;
        ctx->logged_before = false;
        ctx->logging_conflict_inodes = false;

        /* Lists, all start out empty. */
        INIT_LIST_HEAD(&ctx->list);
        INIT_LIST_HEAD(&ctx->ordered_extents);
        INIT_LIST_HEAD(&ctx->conflict_inodes);
        ctx->num_conflict_inodes = 0;

        ctx->scratch_eb = NULL;
}

/*
 * Try to attach a scratch extent buffer to a log context.
 *
 * A dummy extent buffer is only allocated when the context's inode has the
 * BTRFS_INODE_NEEDS_FULL_SYNC or BTRFS_INODE_COPY_EVERYTHING runtime flag set;
 * otherwise ctx->scratch_eb is left untouched.
 */
void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
{
        struct btrfs_inode *inode = BTRFS_I(ctx->inode);

        if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
            test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) {
                /*
                 * An allocation failure is deliberately ignored. This is only
                 * an optimization, and if it fails here the allocation is
                 * tried again later if needed.
                 */
                ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
        }
}

/*
 * Empty the ordered extents list of a log context.
 *
 * Each ordered extent is unlinked from the list and then released with
 * btrfs_put_ordered_extent(). The context's inode must be locked.
 */
void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
{
        struct btrfs_ordered_extent *oe;
        struct btrfs_ordered_extent *next;

        ASSERT(inode_is_locked(ctx->inode));

        /* The safe iterator is required as entries are removed while walking. */
        list_for_each_entry_safe(oe, next, &ctx->ordered_extents, log_list) {
                list_del_init(&oe->log_list);
                btrfs_put_ordered_extent(oe);
        }
}


/*
 * Unlink a log context from whatever list it is on, under root->log_mutex.
 *
 * list_del_init() is used, so the context's list head is left empty and
 * reinitialized rather than dangling.
 */
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
                                        struct btrfs_log_ctx *ctx)
{
        mutex_lock(&root->log_mutex);
        list_del_init(&ctx->list);
        mutex_unlock(&root->log_mutex);
}

/*
 * Unlink every log context queued on root->log_ctxs[index] and store @error
 * as the log_ret of each of them.
 *
 * Invoked in log mutex context, or be sure there is no other task which
 * can access the list.
 */
static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
                                             int index, int error)
{
        struct btrfs_log_ctx *cur;
        struct btrfs_log_ctx *next;

        list_for_each_entry_safe(cur, next, &root->log_ctxs[index], list) {
                list_del_init(&cur->list);
                cur->log_ret = error;
        }
}

/*
 * Sends a given tree log down to the disk and updates the super blocks to
 * record it.  When this call is done, you know that any inodes previously
 * logged are safely on disk only if it returns 0.
 *
 * Any other return value means you need to call btrfs_commit_transaction.
 * Some of the edge cases for fsyncing directories that have had unlinks
 * or renames done in the past mean that sometimes the only safe
 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
 * that has happened.
 *
 * NOTE(review): the code below returns BTRFS_LOG_FORCE_COMMIT, not a literal
 * -EAGAIN, when it detects that a full commit is needed - confirm whether the
 * -EAGAIN wording above is still accurate.
 *
 * @trans: handle on the current transaction
 * @root:  subvolume root whose log tree (root->log_root) is synced
 * @ctx:   log context of the caller; ctx->log_transid selects the log
 *         transaction to sync and ctx->log_ret is returned when another task
 *         did (or is doing) the commit of that log transaction
 *
 * The work happens in two stages: first the log tree of @root is written under
 * root->log_mutex, then the tree of log roots is updated and written under
 * log_root_tree->log_mutex, and finally the super blocks are written under
 * fs_info->tree_log_mutex.
 */
int btrfs_sync_log(struct btrfs_trans_handle *trans,
                   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
        int index1;
        int index2;
        int mark;
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_root *log = root->log_root;
        struct btrfs_root *log_root_tree = fs_info->log_root_tree;
        struct btrfs_root_item new_root_item;
        int log_transid = 0;
        struct btrfs_log_ctx root_log_ctx;
        struct blk_plug plug;
        u64 log_root_start;
        u64 log_root_level;

        mutex_lock(&root->log_mutex);
        log_transid = ctx->log_transid;
        /* Our log transaction was already committed, just report its result. */
        if (root->log_transid_committed >= log_transid) {
                mutex_unlock(&root->log_mutex);
                return ctx->log_ret;
        }

        index1 = log_transid % 2;
        /* A commit of our log transaction is in progress, wait for it. */
        if (atomic_read(&root->log_commit[index1])) {
                wait_log_commit(root, log_transid);
                mutex_unlock(&root->log_mutex);
                return ctx->log_ret;
        }
        ASSERT(log_transid == root->log_transid);
        /* From here on this task is the one committing this log transaction. */
        atomic_set(&root->log_commit[index1], 1);

        /* wait for previous tree log sync to complete */
        if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
                wait_log_commit(root, log_transid - 1);

        /*
         * Wait for the log writers to finish, and repeat for as long as
         * log_batch changed while we were waiting.
         */
        while (1) {
                int batch = atomic_read(&root->log_batch);
                /* when we're on an ssd, just kick the log commit out */
                if (!btrfs_test_opt(fs_info, SSD) &&
                    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
                        mutex_unlock(&root->log_mutex);
                        schedule_timeout_uninterruptible(1);
                        mutex_lock(&root->log_mutex);
                }
                wait_for_writer(root);
                if (batch == atomic_read(&root->log_batch))
                        break;
        }

        /* bail out if we need to do a full commit */
        if (btrfs_need_log_full_commit(trans)) {
                ret = BTRFS_LOG_FORCE_COMMIT;
                mutex_unlock(&root->log_mutex);
                goto out;
        }

        /* The extent mark alternates with the parity of the log transaction. */
        if (log_transid % 2 == 0)
                mark = EXTENT_DIRTY;
        else
                mark = EXTENT_NEW;

        /*
         * We start IO on all the marked extents here, but we don't actually
         * wait for them until later.
         */
        blk_start_plug(&plug);
        ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
        /*
         * -EAGAIN happens when someone, e.g., a concurrent transaction
         *  commit, writes a dirty extent in this tree-log commit. This
         *  concurrent write will create a hole writing out the extents,
         *  and we cannot proceed on a zoned filesystem, requiring
         *  sequential writing. We could bail out to a full commit here,
         *  but instead we continue, hoping the concurrent writing fills
         *  the hole.
         */
        if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
                ret = 0;
        if (ret) {
                blk_finish_plug(&plug);
                btrfs_set_log_full_commit(trans);
                mutex_unlock(&root->log_mutex);
                goto out;
        }

        /*
         * We _must_ update under the root->log_mutex in order to make sure we
         * have a consistent view of the log root we are trying to commit at
         * this moment.
         *
         * We _must_ copy this into a local copy, because we are not holding the
         * log_root_tree->log_mutex yet.  This is important because when we
         * commit the log_root_tree we must have a consistent view of the
         * log_root_tree when we update the super block to point at the
         * log_root_tree bytenr.  If we update the log_root_tree here we'll race
         * with the commit and possibly point at the new block which we may not
         * have written out.
         */
        btrfs_set_root_node(&log->root_item, log->node);
        memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));

        /* Open the next log transaction for this root. */
        btrfs_set_root_log_transid(root, root->log_transid + 1);
        log->log_transid = root->log_transid;
        root->log_start_pid = 0;
        /*
         * IO has been started, blocks of the log tree have WRITTEN flag set
         * in their headers. new modifications of the log will be written to
         * new positions. so it's safe to allow log writers to go in.
         */
        mutex_unlock(&root->log_mutex);

        /*
         * On a zoned filesystem make sure the tree of log roots has a root
         * node before it is updated below.
         */
        if (btrfs_is_zoned(fs_info)) {
                mutex_lock(&fs_info->tree_root->log_mutex);
                if (!log_root_tree->node) {
                        ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
                        if (ret) {
                                mutex_unlock(&fs_info->tree_root->log_mutex);
                                blk_finish_plug(&plug);
                                goto out;
                        }
                }
                mutex_unlock(&fs_info->tree_root->log_mutex);
        }

        btrfs_init_log_ctx(&root_log_ctx, NULL);

        mutex_lock(&log_root_tree->log_mutex);

        /* Queue our own context on the current log transaction of the tree of log roots. */
        index2 = log_root_tree->log_transid % 2;
        list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
        root_log_ctx.log_transid = log_root_tree->log_transid;

        /*
         * Now we are safe to update the log_root_tree because we're under the
         * log_mutex, and we're a current writer so we're holding the commit
         * open until we drop the log_mutex.
         */
        ret = update_log_root(trans, log, &new_root_item);
        if (ret) {
                list_del_init(&root_log_ctx.list);
                blk_finish_plug(&plug);
                btrfs_set_log_full_commit(trans);
                if (ret != -ENOSPC)
                        btrfs_err(fs_info,
                                  "failed to update log for root %llu ret %d",
                                  btrfs_root_id(root), ret);
                btrfs_wait_tree_log_extents(log, mark);
                mutex_unlock(&log_root_tree->log_mutex);
                goto out;
        }

        /* The tree of log roots was already committed past our transaction. */
        if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
                blk_finish_plug(&plug);
                list_del_init(&root_log_ctx.list);
                mutex_unlock(&log_root_tree->log_mutex);
                ret = root_log_ctx.log_ret;
                goto out;
        }

        /*
         * Another task is committing this log transaction of the tree of log
         * roots: wait for our own log tree extents and then for that commit,
         * and use the result it stored in our context.
         */
        if (atomic_read(&log_root_tree->log_commit[index2])) {
                blk_finish_plug(&plug);
                ret = btrfs_wait_tree_log_extents(log, mark);
                wait_log_commit(log_root_tree,
                                root_log_ctx.log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
                if (!ret)
                        ret = root_log_ctx.log_ret;
                goto out;
        }
        ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
        /* From here on this task commits the tree of log roots as well. */
        atomic_set(&log_root_tree->log_commit[index2], 1);

        /* Wait for the previous commit of the tree of log roots, if any. */
        if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
                wait_log_commit(log_root_tree,
                                root_log_ctx.log_transid - 1);
        }

        /*
         * now that we've moved on to the tree of log tree roots,
         * check the full commit flag again
         */
        if (btrfs_need_log_full_commit(trans)) {
                blk_finish_plug(&plug);
                btrfs_wait_tree_log_extents(log, mark);
                mutex_unlock(&log_root_tree->log_mutex);
                ret = BTRFS_LOG_FORCE_COMMIT;
                goto out_wake_log_root;
        }

        ret = btrfs_write_marked_extents(fs_info,
                                         &log_root_tree->dirty_log_pages,
                                         EXTENT_DIRTY | EXTENT_NEW);
        blk_finish_plug(&plug);
        /*
         * As described above, -EAGAIN indicates a hole in the extents. We
         * cannot wait for these write outs since the waiting would cause a
         * deadlock. Bail out to the full commit instead.
         */
        if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
                btrfs_set_log_full_commit(trans);
                btrfs_wait_tree_log_extents(log, mark);
                mutex_unlock(&log_root_tree->log_mutex);
                goto out_wake_log_root;
        } else if (ret) {
                btrfs_set_log_full_commit(trans);
                mutex_unlock(&log_root_tree->log_mutex);
                goto out_wake_log_root;
        }
        /* Wait for the writes of both the log tree and the tree of log roots. */
        ret = btrfs_wait_tree_log_extents(log, mark);
        if (!ret)
                ret = btrfs_wait_tree_log_extents(log_root_tree,
                                                  EXTENT_NEW | EXTENT_DIRTY);
        if (ret) {
                btrfs_set_log_full_commit(trans);
                mutex_unlock(&log_root_tree->log_mutex);
                goto out_wake_log_root;
        }

        /*
         * Sample the location and level of the root of the tree of log roots
         * while still under its log_mutex, they go into the super block below.
         */
        log_root_start = log_root_tree->node->start;
        log_root_level = btrfs_header_level(log_root_tree->node);
        log_root_tree->log_transid++;
        mutex_unlock(&log_root_tree->log_mutex);

        /*
         * Here we are guaranteed that nobody is going to write the superblock
         * for the current transaction before us and that neither we do write
         * our superblock before the previous transaction finishes its commit
         * and writes its superblock, because:
         *
         * 1) We are holding a handle on the current transaction, so nobody
         *    can commit it until we release the handle;
         *
         * 2) Before writing our superblock we acquire the tree_log_mutex, so
         *    if the previous transaction is still committing, and hasn't yet
         *    written its superblock, we wait for it to do it, because a
         *    transaction commit acquires the tree_log_mutex when the commit
         *    begins and releases it only after writing its superblock.
         */
        mutex_lock(&fs_info->tree_log_mutex);

        /*
         * The previous transaction writeout phase could have failed, and thus
         * marked the fs in an error state.  We must not commit here, as we
         * could have updated our generation in the super_for_commit and
         * writing the super here would result in transid mismatches.  If there
         * is an error here just bail.
         */
        if (BTRFS_FS_ERROR(fs_info)) {
                ret = -EIO;
                btrfs_set_log_full_commit(trans);
                btrfs_abort_transaction(trans, ret);
                mutex_unlock(&fs_info->tree_log_mutex);
                goto out_wake_log_root;
        }

        /* Point the super block at the new log root and write it out. */
        btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
        btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
        ret = write_all_supers(fs_info, 1);
        mutex_unlock(&fs_info->tree_log_mutex);
        if (ret) {
                btrfs_set_log_full_commit(trans);
                btrfs_abort_transaction(trans, ret);
                goto out_wake_log_root;
        }

        /*
         * We know there can only be one task here, since we have not yet set
         * root->log_commit[index1] to 0 and any task attempting to sync the
         * log must wait for the previous log transaction to commit if it's
         * still in progress or wait for the current log transaction commit if
         * someone else already started it. We use <= and not < because the
         * first log transaction has an ID of 0.
         */
        ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
        btrfs_set_root_last_log_commit(root, log_transid);

out_wake_log_root:
        /*
         * Finish the commit of the tree of log roots: hand our result to every
         * context queued on it, mark the log transaction as committed and
         * clear the in-progress flag.
         */
        mutex_lock(&log_root_tree->log_mutex);
        btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);

        log_root_tree->log_transid_committed++;
        atomic_set(&log_root_tree->log_commit[index2], 0);
        mutex_unlock(&log_root_tree->log_mutex);

        /*
         * The barrier before waitqueue_active (in cond_wake_up) is needed so
         * all the updates above are seen by the woken threads. It might not be
         * necessary, but proving that seems to be hard.
         */
        cond_wake_up(&log_root_tree->log_commit_wait[index2]);
out:
        /* Same completion steps as above, this time for the root's own log. */
        mutex_lock(&root->log_mutex);
        btrfs_remove_all_log_ctxs(root, index1, ret);
        root->log_transid_committed++;
        atomic_set(&root->log_commit[index1], 0);
        mutex_unlock(&root->log_mutex);

        /*
         * The barrier before waitqueue_active (in cond_wake_up) is needed so
         * all the updates above are seen by the woken threads. It might not be
         * necessary, but proving that seems to be hard.
         */
        cond_wake_up(&root->log_commit_wait[index1]);
        return ret;
}

/*
 * Tear down a log tree: clean all of its extent buffers, release its extent
 * io trees and drop the reference on the log root.
 *
 * If the tree can not be fully walked, BTRFS_FS_STATE_LOG_CLEANUP_ERROR is
 * set, the remaining dirty log extents are written and waited for, and the
 * error is reported through btrfs_abort_transaction() when @trans is given or
 * btrfs_handle_fs_error() otherwise.
 */
static void free_log_tree(struct btrfs_trans_handle *trans,
                          struct btrfs_root *log)
{
        struct walk_control wc = {
                .process_func = process_one_buffer,
                .free = 1,
        };
        int ret = 0;

        /* A log root without a node has nothing to walk. */
        if (log->node)
                ret = walk_log_tree(trans, log, &wc);

        if (ret) {
                /*
                 * The whole log tree could not be traversed. Typically this
                 * is an -EIO from reading one of its extent buffers, caused
                 * by an earlier writeback failure of that buffer.
                 */
                set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
                        &log->fs_info->fs_state);

                /*
                 * Extent buffers of the log tree may still be dirty and not
                 * yet written back, since a log tree can be updated without
                 * being synced (for example by rename and link operations).
                 * Flush them and wait for the writeback to complete so their
                 * state and pages are cleaned up properly.
                 */
                btrfs_write_marked_extents(log->fs_info,
                                           &log->dirty_log_pages,
                                           EXTENT_DIRTY | EXTENT_NEW);
                btrfs_wait_tree_log_extents(log,
                                            EXTENT_DIRTY | EXTENT_NEW);

                if (trans)
                        btrfs_abort_transaction(trans, ret);
                else
                        btrfs_handle_fs_error(log->fs_info, ret, NULL);
        }

        extent_io_tree_release(&log->dirty_log_pages);
        extent_io_tree_release(&log->log_csum_range);

        btrfs_put_root(log);
}

/*
 * Free all the extents used by the log tree of @root, if it has one.  This
 * should be called at commit time of the full transaction.
 *
 * Afterwards root->log_root is NULL and BTRFS_ROOT_HAS_LOG_TREE is cleared.
 * Always returns 0.
 */
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
{
        if (!root->log_root)
                return 0;

        free_log_tree(trans, root->log_root);
        root->log_root = NULL;
        clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);

        return 0;
}

/*
 * Free the tree of log roots of the filesystem, if there is one.
 *
 * Afterwards fs_info->log_root_tree is NULL and BTRFS_ROOT_HAS_LOG_TREE is
 * cleared on the tree root. Always returns 0.
 */
int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info)
{
        if (!fs_info->log_root_tree)
                return 0;

        free_log_tree(trans, fs_info->log_root_tree);
        fs_info->log_root_tree = NULL;
        clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);

        return 0;
}

/*
 * Tell whether an inode was logged in the current transaction.
 *
 * logged_trans is an in-memory only field (it is not persisted), so it reads
 * as 0 after the inode was evicted and loaded again even if the inode had been
 * logged. That case is resolved by looking for the inode item in the log tree.
 *
 * @path_in is optional: when given it is used for the search and released
 * afterwards, when NULL a temporary path is allocated and freed.
 *
 * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
 * and < 0 on error.
 */
static int inode_logged(const struct btrfs_trans_handle *trans,
                        struct btrfs_inode *inode,
                        struct btrfs_path *path_in)
{
        struct btrfs_path *search_path = path_in;
        struct btrfs_key key;
        int ret;

        if (inode->logged_trans == trans->transid)
                return 1;

        /*
         * Any other non-zero logged_trans means the inode was logged, but not
         * in this transaction, so the answer is known right away.
         */
        if (inode->logged_trans > 0)
                return 0;

        /*
         * Without a log tree created for this root in this transaction the
         * inode can not have been logged in it. Remember that by setting
         * logged_trans to a value greater than 0 and less than the current
         * transaction's ID, which avoids the search below in a future call in
         * case a log tree gets created after this.
         */
        if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
                inode->logged_trans = trans->transid - 1;
                return 0;
        }

        /*
         * There is a log tree and logged_trans is 0, which is not enough to
         * tell whether the inode was logged in this transaction. Pessimistically
         * assuming it was would cause unnecessary logging of the inode during
         * rename and link operations, and further log updates in followup
         * rename and link operations, specially for directories, adding
         * latency visible to applications doing a series of such operations.
         *
         * A logged_trans of 0 here can mean that:
         *
         * 1) The inode was never logged since the filesystem was mounted, and
         *    it may or may not have been evicted and loaded again;
         *
         * 2) The inode was logged in a previous transaction, then evicted and
         *    then loaded again;
         *
         * 3) The inode was logged in the current transaction, then evicted and
         *    then loaded again.
         *
         * Only case 3) must yield true, so search the log root for the inode
         * item to tell it apart from cases 1) and 2).
         */
        if (!search_path) {
                search_path = btrfs_alloc_path();
                if (!search_path)
                        return -ENOMEM;
        }

        key.objectid = btrfs_ino(inode);
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, inode->root->log_root, &key, search_path, 0, 0);

        /* Only the search result is needed, so let go of the path right away. */
        if (!path_in)
                btrfs_free_path(search_path);
        else
                btrfs_release_path(search_path);

        if (ret < 0)
                return ret;

        /*
         * Logging an inode always results in logging its inode item, so an
         * item that was not found means the inode was not logged for sure.
         */
        if (ret > 0) {
                /*
                 * Use a value greater than 0 and less than the current
                 * transaction to avoid doing the search in future calls.
                 */
                inode->logged_trans = trans->transid - 1;
                return 0;
        }

        /*
         * The inode was logged in this transaction and then evicted. Record
         * the current transaction's ID to avoid future tree searches for as
         * long as the inode is not evicted again.
         */
        inode->logged_trans = trans->transid;

        /*
         * For a directory, last_dir_index_offset must be set to the maximum
         * possible value so that the next attempt to log the inode does not
         * skip checking whether dir index keys found in modified subvolume tree
         * leaves were logged before; otherwise it would try to insert duplicate
         * dir index keys in the log tree. This is needed because
         * last_dir_index_offset is an in-memory only field, not persisted in
         * the inode item or any other on-disk structure, so its value is lost
         * once the inode is evicted.
         */
        if (S_ISDIR(inode->vfs_inode.i_mode))
                inode->last_dir_index_offset = (u64)-1;

        return 1;
}

/*
 * Delete a directory entry from the log if it exists.
 *
 * Returns < 0 on error
 *           1 if the entry does not exists
 *           0 if the entry existed and was successfully deleted
 */
static int del_logged_dentry(struct btrfs_trans_handle *trans,
                             struct btrfs_root *log,
                             struct btrfs_path *path,
                             u64 dir_ino,
                             const struct fscrypt_str *name,
                             u64 index)
{
        struct btrfs_dir_item *dentry;

        /*
         * Directories only ever get their dir index items logged, so a
         * lookup by index is enough, there are no dir item keys to search.
         */
        dentry = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
                                             index, name, -1);
        if (!dentry)
                return 1;
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);

        /*
         * The size field of the directory's inode item is left alone here:
         * log replay recomputes it from the entries that exist in the
         * directory (see overwrite_item()).
         */
        return btrfs_delete_one_dir_name(trans, log, path, dentry);
}

/*
 * If both a file and directory are logged, and unlinks or renames are
 * mixed in, we have a few interesting corners:
 *
 * create file X in dir Y
 * link file X to X.link in dir Y
 * fsync file X
 * unlink file X but leave X.link
 * fsync dir Y
 *
 * After a crash we would expect only X.link to exist.  But file X
 * didn't get fsync'd again so the log has back refs for X and X.link.
 *
 * We solve this by removing directory entries and inode backrefs from the
 * log when a file that was logged in the current transaction is
 * unlinked.  Any later fsync will include the updated log entries, and
 * we'll be able to reconstruct the proper directory items from backrefs.
 *
 * This optimization allows us to avoid relogging the entire inode
 * or the entire directory.
 */
void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  const struct fscrypt_str *name,
                                  struct btrfs_inode *dir, u64 index)
{
        struct btrfs_path *path;
        int ret;

        ret = inode_logged(trans, dir, NULL);
        if (ret < 0) {
                /* Could not tell if the directory was logged, play it safe. */
                btrfs_set_log_full_commit(trans);
                return;
        }
        /* Directory not logged, so there is nothing to remove from the log. */
        if (ret == 0)
                return;

        if (join_running_log_trans(root))
                return;

        mutex_lock(&dir->log_mutex);

        path = btrfs_alloc_path();
        if (path) {
                ret = del_logged_dentry(trans, root->log_root, path,
                                        btrfs_ino(dir), name, index);
                btrfs_free_path(path);
        } else {
                ret = -ENOMEM;
        }

        mutex_unlock(&dir->log_mutex);

        /*
         * Only real errors force a full commit, an entry that was not found
         * in the log (positive return value) is fine.
         */
        if (ret < 0)
                btrfs_set_log_full_commit(trans);
        btrfs_end_log_trans(root);
}

/* see comments for btrfs_del_dir_entries_in_log */
void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                const struct fscrypt_str *name,
                                struct btrfs_inode *inode, u64 dirid)
{
        struct btrfs_root *log_root;
        u64 ref_index;
        int ret;

        ret = inode_logged(trans, inode, NULL);
        if (ret < 0) {
                /* Could not tell if the inode was logged, play it safe. */
                btrfs_set_log_full_commit(trans);
                return;
        }
        /* Inode not logged, so the log has no reference to drop. */
        if (ret == 0)
                return;

        if (join_running_log_trans(root))
                return;

        log_root = root->log_root;

        mutex_lock(&inode->log_mutex);
        ret = btrfs_del_inode_ref(trans, log_root, name, btrfs_ino(inode),
                                  dirid, &ref_index);
        mutex_unlock(&inode->log_mutex);

        /* A reference that is missing from the log is not a failure. */
        if (ret == -ENOENT)
                ret = 0;
        if (ret < 0)
                btrfs_set_log_full_commit(trans);
        btrfs_end_log_trans(root);
}

/*
 * creates a range item in the log for 'dirid'.  first_offset and
 * last_offset tell us which parts of the key space the log should
 * be considered authoritative for.
 */
static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *log,
                                       struct btrfs_path *path,
                                       u64 dirid,
                                       u64 first_offset, u64 last_offset)
{
        struct btrfs_key key = {
                .objectid = dirid,
                .type = BTRFS_DIR_LOG_INDEX_KEY,
                .offset = first_offset,
        };
        struct btrfs_dir_log_item *item;
        struct extent_buffer *leaf;
        int ret;

        ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
        /*
         * Finding the key already there (-EEXIST) is tolerated. It may happen
         * now and then while logging a directory if concurrent insertions of
         * items for other inodes in the subvolume's tree push some dir items
         * from one leaf to another, which makes us log the same dir index
         * range key again.
         */
        if (ret != 0 && ret != -EEXIST)
                return ret;

        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_log_item);

        if (ret == -EEXIST) {
                const u64 curr_end = btrfs_dir_log_end(leaf, item);

                /*
                 * Never shrink an existing range: between the first insertion
                 * of this key and this update an unlink may have called
                 * btrfs_del_dir_entries_in_log(), or we may be logging the
                 * deletion of a single entry during a rename. Keep whichever
                 * end offset is the largest.
                 */
                if (curr_end > last_offset)
                        last_offset = curr_end;
        }

        btrfs_set_dir_log_end(leaf, item, last_offset);
        btrfs_mark_buffer_dirty(trans, leaf);
        btrfs_release_path(path);

        return 0;
}

static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
                                 struct btrfs_inode *inode,
                                 struct extent_buffer *src,
                                 struct btrfs_path *dst_path,
                                 int start_slot,
                                 int count)
{
        struct btrfs_root *log = inode->root->log_root;
        char *ins_data = NULL;
        struct btrfs_item_batch batch;
        struct extent_buffer *dst;
        unsigned long src_offset;
        unsigned long dst_offset;
        u64 last_index;
        struct btrfs_key key;
        u32 item_size;
        int ret;
        int i;

        ASSERT(count > 0);
        batch.nr = count;

        if (count == 1) {
                btrfs_item_key_to_cpu(src, &key, start_slot);
                item_size = btrfs_item_size(src, start_slot);
                batch.keys = &key;
                batch.data_sizes = &item_size;
                batch.total_data_size = item_size;
        } else {
                struct btrfs_key *ins_keys;
                u32 *ins_sizes;

                ins_data = kmalloc(count * sizeof(u32) +
                                   count * sizeof(struct btrfs_key), GFP_NOFS);
                if (!ins_data)
                        return -ENOMEM;

                ins_sizes = (u32 *)ins_data;
                ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
                batch.keys = ins_keys;
                batch.data_sizes = ins_sizes;
                batch.total_data_size = 0;

                for (i = 0; i < count; i++) {
                        const int slot = start_slot + i;

                        btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
                        ins_sizes[i] = btrfs_item_size(src, slot);
                        batch.total_data_size += ins_sizes[i];
                }
        }

        ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
        if (ret)
                goto out;

        dst = dst_path->nodes[0];
        /*
         * Copy all the items in bulk, in a single copy operation. Item data is
         * organized such that it's placed at the end of a leaf and from right
         * to left. For example, the data for the second item ends at an offset
         * that matches the offset where the data for the first item starts, the
         * data for the third item ends at an offset that matches the offset
         * where the data of the second items starts, and so on.
         * Therefore our source and destination start offsets for copy match the
         * offsets of the last items (highest slots).
         */
        dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
        src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
        copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
        btrfs_release_path(dst_path);

        last_index = batch.keys[count - 1].offset;
        ASSERT(last_index > inode->last_dir_index_offset);

        /*
         * If for some unexpected reason the last item's index is not greater
         * than the last index we logged, warn and force a transaction commit.
         */
        if (WARN_ON(last_index <= inode->last_dir_index_offset))
                ret = BTRFS_LOG_FORCE_COMMIT;
        else
                inode->last_dir_index_offset = last_index;

        if (btrfs_get_first_dir_index_to_log(inode) == 0)
                btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
out:
        kfree(ins_data);

        return ret;
}

/*
 * Replace the leaf referenced by @path with a private copy kept in
 * ctx->scratch_eb, preserving the current slot. The scratch buffer is created
 * on first use and its contents are overwritten on later calls.
 *
 * Returns 0 on success or -ENOMEM if the scratch buffer can not be created.
 */
static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
{
        struct extent_buffer *leaf = path->nodes[0];
        const int slot = path->slots[0];

        if (!ctx->scratch_eb) {
                ctx->scratch_eb = btrfs_clone_extent_buffer(leaf);
                if (!ctx->scratch_eb)
                        return -ENOMEM;
        } else {
                copy_extent_buffer_full(ctx->scratch_eb, leaf);
        }

        btrfs_release_path(path);

        /*
         * Take an extra reference on the scratch buffer, so that it survives
         * the callers releasing the path and can be reused later if needed.
         */
        atomic_inc(&ctx->scratch_eb->refs);
        path->nodes[0] = ctx->scratch_eb;
        path->slots[0] = slot;

        return 0;
}

/*
 * Process the dir index items of the leaf at @path, starting at the path's
 * current slot: log range items for gaps found between old dir index keys and
 * copy to the log, as one batch, the dir index items created in the current
 * transaction that were not logged before.
 *
 * Returns 1 if a key that is not a dir index key of this directory was found
 * in the leaf, 0 if all the remaining items of the leaf were processed, and
 * < 0 on error.
 */
static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
                                  struct btrfs_inode *inode,
                                  struct btrfs_path *path,
                                  struct btrfs_path *dst_path,
                                  struct btrfs_log_ctx *ctx,
                                  u64 *last_old_dentry_offset)
{
        struct btrfs_root *log = inode->root->log_root;
        const int nritems = btrfs_header_nritems(path->nodes[0]);
        const u64 ino = btrfs_ino(inode);
        struct extent_buffer *src;
        bool last_found = false;
        int batch_start = 0;
        int batch_size = 0;
        int slot;
        int ret;

        /*
         * Work on a clone of the leaf, so that the read lock on the original
         * is dropped before the log tree gets modified. The reason for this
         * is explained in the comment at copy_items().
         */
        ret = clone_leaf(path, ctx);
        if (ret < 0)
                return ret;

        src = path->nodes[0];

        for (slot = path->slots[0]; slot < nritems; slot++) {
                struct btrfs_dir_item *di;
                struct btrfs_key key;

                btrfs_item_key_to_cpu(src, &key, slot);

                /* Past the last dir index key of our directory, stop. */
                if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
                        last_found = true;
                        break;
                }

                di = btrfs_item_ptr(src, slot, struct btrfs_dir_item);

                /*
                 * Dir items created in past transactions are not copied, but
                 * a gap between two of them must be covered by a dir index
                 * range item, so that log replay deletes the index keys that
                 * fall inside that gap.
                 */
                if (btrfs_dir_transid(src, di) < trans->transid) {
                        const u64 gap_start = *last_old_dentry_offset + 1;

                        if (key.offset > gap_start) {
                                ret = insert_dir_log_key(trans, log, dst_path,
                                                         ino, gap_start,
                                                         key.offset - 1);
                                if (ret < 0)
                                        return ret;
                        }

                        *last_old_dentry_offset = key.offset;
                        continue;
                }

                /* Already logged this dir index item, nothing to do. */
                if (key.offset <= inode->last_dir_index_offset)
                        continue;

                /*
                 * A logged directory entry must, after log replay, point to
                 * an inode with a matching link count. Consider:
                 *
                 * touch foo
                 * mkdir mydir
                 * sync
                 * ln foo mydir/bar
                 * xfs_io -c "fsync" mydir
                 * <crash>
                 * <mount fs and log replay>
                 *
                 * Replaying such a log would leave the file inode with a link
                 * count of 1 while two directory entries point to it. Once one
                 * of the names is removed the other could never be removed
                 * (stale file handle errors), and rmdir on the parent would
                 * fail with -ENOTEMPTY as its i_size could never go down to
                 * BTRFS_EMPTY_DIR_SIZE. So flag that new dentries need to be
                 * logged, unless the entry points to a root item.
                 */
                if (!ctx->log_new_dentries) {
                        struct btrfs_key di_key;

                        btrfs_dir_item_key_to_cpu(src, di, &di_key);
                        if (di_key.type != BTRFS_ROOT_ITEM_KEY)
                                ctx->log_new_dentries = true;
                }

                /* Add this slot to the batch, starting one if needed. */
                if (batch_size == 0)
                        batch_start = slot;
                batch_size++;
        }

        if (batch_size > 0) {
                ret = flush_dir_items_batch(trans, inode, src, dst_path,
                                            batch_start, batch_size);
                if (ret < 0)
                        return ret;
        }

        return last_found ? 1 : 0;
}

/*
 * Log all the items included in the current transaction for a given
 * directory. This also creates the range items in the log tree required
 * to replay anything deleted before the fsync.
 *
 * @min_offset is the dir index offset at which the scan of the subvolume tree
 * starts. When 0 is returned, *@last_offset_ret holds the end of the dir index
 * range that was processed: (u64)-1 if there are no more dir index keys to
 * look at, otherwise the offset right before the first dir index key of a leaf
 * that was not changed in the current transaction, in which case the caller
 * (see log_directory_changes()) calls again starting past that offset.
 * Any other return value is an error from one of the tree operations.
 */
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                          struct btrfs_inode *inode,
                          struct btrfs_path *path,
                          struct btrfs_path *dst_path,
                          struct btrfs_log_ctx *ctx,
                          u64 min_offset, u64 *last_offset_ret)
{
        struct btrfs_key min_key;
        struct btrfs_root *root = inode->root;
        struct btrfs_root *log = root->log_root;
        int ret;
        /* Offset of the last dir index key known to be from a past transaction. */
        u64 last_old_dentry_offset = min_offset - 1;
        /* End of the range item logged at 'done', (u64)-1 means "up to the end". */
        u64 last_offset = (u64)-1;
        u64 ino = btrfs_ino(inode);

        min_key.objectid = ino;
        min_key.type = BTRFS_DIR_INDEX_KEY;
        min_key.offset = min_offset;

        ret = btrfs_search_forward(root, &min_key, path, trans->transid);

        /*
         * we didn't find anything from this transaction, see if there
         * is anything at all
         */
        if (ret != 0 || min_key.objectid != ino ||
            min_key.type != BTRFS_DIR_INDEX_KEY) {
                min_key.objectid = ino;
                min_key.type = BTRFS_DIR_INDEX_KEY;
                min_key.offset = (u64)-1;
                btrfs_release_path(path);
                ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
                if (ret < 0) {
                        btrfs_release_path(path);
                        return ret;
                }
                ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);

                /* if ret == 0 there are items for this type,
                 * create a range to tell us the last key of this type.
                 * otherwise, there are no items in this directory after
                 * *min_offset, and we create a range to indicate that.
                 */
                if (ret == 0) {
                        struct btrfs_key tmp;

                        btrfs_item_key_to_cpu(path->nodes[0], &tmp,
                                              path->slots[0]);
                        if (tmp.type == BTRFS_DIR_INDEX_KEY)
                                last_old_dentry_offset = tmp.offset;
                } else if (ret > 0) {
                        /* No previous item found, that is not an error. */
                        ret = 0;
                }

                goto done;
        }

        /* go backward to find any previous key */
        ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
        if (ret == 0) {
                struct btrfs_key tmp;

                btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
                /*
                 * The dir index key before the first one we found that needs to
                 * be logged might be in a previous leaf, and there might be a
                 * gap between these keys, meaning that we had deletions that
                 * happened. So the key range item we log (key type
                 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
                 * previous key's offset plus 1, so that those deletes are replayed.
                 */
                if (tmp.type == BTRFS_DIR_INDEX_KEY)
                        last_old_dentry_offset = tmp.offset;
        } else if (ret < 0) {
                goto done;
        }

        btrfs_release_path(path);

        /*
         * Find the first key from this transaction again or the one we were at
         * in the loop below in case we had to reschedule. We may be logging the
         * directory without holding its VFS lock, which happen when logging new
         * dentries (through log_new_dir_dentries()) or in some cases when we
         * need to log the parent directory of an inode. This means a dir index
         * key might be deleted from the inode's root, and therefore we may not
         * find it anymore. If we can't find it, just move to the next key. We
         * can not bail out and ignore, because if we do that we will simply
         * not log dir index keys that come after the one that was just deleted
         * and we can end up logging a dir index range that ends at (u64)-1
         * (@last_offset is initialized to that), resulting in removing dir
         * entries we should not remove at log replay time.
         */
search:
        ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
        if (ret > 0) {
                ret = btrfs_next_item(root, path);
                if (ret > 0) {
                        /* There are no more keys in the inode's root. */
                        ret = 0;
                        goto done;
                }
        }
        if (ret < 0)
                goto done;

        /*
         * we have a block from this transaction, log every item in it
         * from our directory
         */
        while (1) {
                ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
                                             &last_old_dentry_offset);
                if (ret != 0) {
                        /*
                         * A positive value means the leaf had a key past our
                         * directory's index keys, so we are done without error.
                         */
                        if (ret > 0)
                                ret = 0;
                        goto done;
                }
                /* Point past the last item so that we move to the next leaf. */
                path->slots[0] = btrfs_header_nritems(path->nodes[0]);

                /*
                 * look ahead to the next item and see if it is also
                 * from this directory and from this transaction
                 */
                ret = btrfs_next_leaf(root, path);
                if (ret) {
                        if (ret == 1) {
                                last_offset = (u64)-1;
                                ret = 0;
                        }
                        goto done;
                }
                btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
                if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
                        last_offset = (u64)-1;
                        goto done;
                }
                if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
                        /*
                         * The next leaf was not changed in the current transaction
                         * and has at least one dir index key.
                         * We check for the next key because there might have been
                         * one or more deletions between the last key we logged and
                         * that next key. So the key range item we log (key type
                         * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
                         * offset minus 1, so that those deletes are replayed.
                         */
                        last_offset = min_key.offset - 1;
                        goto done;
                }
                if (need_resched()) {
                        /*
                         * Drop the path before rescheduling and then search
                         * again for min_key, which was set above to the first
                         * key of the leaf we were about to process.
                         */
                        btrfs_release_path(path);
                        cond_resched();
                        goto search;
                }
        }
done:
        btrfs_release_path(path);
        btrfs_release_path(dst_path);

        if (ret == 0) {
                *last_offset_ret = last_offset;
                /*
                 * In case the leaf was changed in the current transaction but
                 * all its dir items are from a past transaction, the last item
                 * in the leaf is a dir item and there's no gap between that last
                 * dir item and the first one on the next leaf (which did not
                 * change in the current transaction), then we don't need to log
                 * a range, last_old_dentry_offset is == to last_offset.
                 */
                ASSERT(last_old_dentry_offset <= last_offset);
                if (last_old_dentry_offset < last_offset)
                        ret = insert_dir_log_key(trans, log, path, ino,
                                                 last_old_dentry_offset + 1,
                                                 last_offset);
        }

        return ret;
}

/*
 * If the inode was logged before and it was evicted, then its
 * last_dir_index_offset is (u64)-1, so we don't know the value of the last
 * index key offset. If that's the case, search for it and update the inode.
 * This is to avoid lookups in the log tree every time we try to insert a dir
 * index key from a leaf changed in the current transaction, and to allow us to
 * always do batch insertions of dir index keys.
 */
static int update_last_dir_index_offset(struct btrfs_inode *inode,
                                        struct btrfs_path *path,
                                        const struct btrfs_log_ctx *ctx)
{
        const u64 ino = btrfs_ino(inode);
        struct btrfs_key key;
        int ret;

        lockdep_assert_held(&inode->log_mutex);

        /* The value is already known, no need to search for it. */
        if (inode->last_dir_index_offset != (u64)-1)
                return 0;

        /*
         * Not logged before: start with the value right before the first
         * valid index value.
         */
        if (!ctx->logged_before) {
                inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
                return 0;
        }

        key.objectid = ino;
        key.type = BTRFS_DIR_INDEX_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
        /*
         * A result <= 0 means either an error or that an index key with an
         * offset of (u64)-1 really exists. In both cases there is nothing
         * else to do besides releasing the path.
         */
        if (ret > 0) {
                ret = 0;
                /*
                 * Default for the case where no dir index items exist: the
                 * value right before the first valid index value.
                 */
                inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;

                /*
                 * The search left us one slot past either the last index key
                 * or the last key of the directory that is not an index key.
                 * If the previous slot has an index key of ours, its offset
                 * is what we want. Otherwise the directory is empty and the
                 * default set above stays.
                 */
                if (path->slots[0] > 0) {
                        btrfs_item_key_to_cpu(path->nodes[0], &key,
                                              path->slots[0] - 1);
                        if (key.objectid == ino &&
                            key.type == BTRFS_DIR_INDEX_KEY)
                                inode->last_dir_index_offset = key.offset;
                }
        }

        btrfs_release_path(path);

        return ret;
}

/*
 * Logging directories is very similar to logging inodes: we find all the items
 * from the current transaction and write them to the log.
 *
 * The recovery code scans the directory in the subvolume, and if it finds a
 * key in the range logged that is not present in the log tree, then it means
 * that dir entry was unlinked during the transaction.
 *
 * In order for that scan to work, we must include one key smaller than
 * the smallest logged by this transaction and one key larger than the largest
 * key logged by this transaction.
 */
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
                          struct btrfs_inode *inode,
                          struct btrfs_path *path,
                          struct btrfs_path *dst_path,
                          struct btrfs_log_ctx *ctx)
{
        u64 next_offset = BTRFS_DIR_START_INDEX;
        u64 logged_end = 0;
        int ret;

        ret = update_last_dir_index_offset(inode, path, ctx);
        if (ret)
                return ret;

        /*
         * Log the directory one index range at a time, each pass resuming
         * right after the end of the range reported by the previous one,
         * until a pass reports a range that ends at (u64)-1.
         */
        for (;;) {
                ret = log_dir_items(trans, inode, path, dst_path, ctx,
                                    next_offset, &logged_end);
                if (ret)
                        return ret;
                if (logged_end == (u64)-1)
                        return 0;
                next_offset = logged_end + 1;
        }
}

/*
 * a helper function to drop items from the log before we relog an
 * inode.  max_key_type indicates the highest item type to remove.
 * This cannot be run for file data extents because it does not
 * free the extents they point to.
 */
static int drop_inode_items(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *log,
                                  struct btrfs_path *path,
                                  struct btrfs_inode *inode,
                                  int max_key_type)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        int start_slot;

        key.objectid = btrfs_ino(inode);
        key.type = max_key_type;
        key.offset = (u64)-1;

        while (1) {
                ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
                if (ret < 0) {
                        break;
                } else if (ret > 0) {
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }

                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);

                if (found_key.objectid != key.objectid)
                        break;

                found_key.offset = 0;
                found_key.type = 0;
                ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
                if (ret < 0)
                        break;

                ret = btrfs_del_items(trans, log, path, start_slot,
                                      path->slots[0] - start_slot + 1);
                /*
                 * If start slot isn't 0 then we don't need to re-search, we've
                 * found the last guy with the objectid in this tree.
                 */
                if (ret || start_slot != 0)
                        break;
                btrfs_release_path(path);
        }
        btrfs_release_path(path);
        if (ret > 0)
                ret = 0;
        return ret;
}

/*
 * Wrapper around btrfs_truncate_inode_items() for the given log root, which
 * fills a truncate control structure for @inode with @new_size and @min_type
 * and with reference updates skipped.
 */
static int truncate_inode_items(struct btrfs_trans_handle *trans,
                                struct btrfs_root *log_root,
                                struct btrfs_inode *inode,
                                u64 new_size, u32 min_type)
{
        struct btrfs_truncate_control tc = {
                .ino = btrfs_ino(inode),
                .new_size = new_size,
                .min_type = min_type,
                .skip_ref_updates = true,
        };

        return btrfs_truncate_inode_items(trans, log_root, &tc);
}

/*
 * Fill the inode item @item, located in @leaf, with the current in-memory
 * state of @inode. If @log_inode_only is set, the generation is written as 0
 * and the size as @logged_isize instead of the inode's own values.
 */
static void fill_inode_item(struct btrfs_trans_handle *trans,
                            struct extent_buffer *leaf,
                            struct btrfs_inode_item *item,
                            struct inode *inode, int log_inode_only,
                            u64 logged_isize)
{
        struct btrfs_inode *binode = BTRFS_I(inode);
        struct btrfs_map_token token;
        u64 generation;
        u64 size;
        u64 flags;

        btrfs_init_map_token(&token, leaf);

        if (log_inode_only) {
                /*
                 * A generation of zero lets the recovery code tell apart a
                 * logging that only says 'this inode exists' from one that
                 * says 'update this inode with these values'.
                 */
                generation = 0;
                size = logged_isize;
        } else {
                generation = binode->generation;
                size = inode->i_size;
        }
        btrfs_set_token_inode_generation(&token, item, generation);
        btrfs_set_token_inode_size(&token, item, size);

        /* Ownership, mode and link count. */
        btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
        btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
        btrfs_set_token_inode_mode(&token, item, inode->i_mode);
        btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);

        /* Access, modification and change timestamps. */
        btrfs_set_token_timespec_sec(&token, &item->atime,
                                     inode_get_atime_sec(inode));
        btrfs_set_token_timespec_nsec(&token, &item->atime,
                                      inode_get_atime_nsec(inode));
        btrfs_set_token_timespec_sec(&token, &item->mtime,
                                     inode_get_mtime_sec(inode));
        btrfs_set_token_timespec_nsec(&token, &item->mtime,
                                      inode_get_mtime_nsec(inode));
        btrfs_set_token_timespec_sec(&token, &item->ctime,
                                     inode_get_ctime_sec(inode));
        btrfs_set_token_timespec_nsec(&token, &item->ctime,
                                      inode_get_ctime_nsec(inode));

        /*
         * The nbytes field is deliberately not set. During a fast fsync its
         * value may not even be correct, as a fast fsync only waits for
         * writeback and not for ordered extent completion, which is where
         * nbytes gets updated. Log replay adjusts the nbytes field of the
         * inode item in the subvolume tree as it replays file extent items
         * (see overwrite_item()).
         */

        btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
        btrfs_set_token_inode_transid(&token, item, trans->transid);
        btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);

        flags = btrfs_inode_combine_flags(binode->flags, binode->ro_flags);
        btrfs_set_token_inode_flags(&token, item, flags);
        btrfs_set_token_inode_block_group(&token, item, 0);
}

/*
 * Write the inode item of @inode into the log tree @log, using the values
 * currently held by the in-memory inode.
 *
 * @inode_item_dropped tells us that the caller removed the item from the log,
 * in which case it has to be inserted again even if the inode was already
 * logged in this transaction.
 *
 * Returns 0 on success, otherwise the error from the search or the insertion
 * (-ENOENT if an item that should already be in the log is not found). @path
 * is released on success.
 */
static int log_inode_item(struct btrfs_trans_handle *trans,
                          struct btrfs_root *log, struct btrfs_path *path,
                          struct btrfs_inode *inode, bool inode_item_dropped)
{
        const bool item_in_log = !inode_item_dropped &&
                                 inode->logged_trans == trans->transid;
        struct btrfs_inode_item *item;
        int ret;

        if (item_in_log) {
                /*
                 * Fast fsync of an inode that was already logged in this
                 * transaction: the item is known to exist in the log tree.
                 * Do a plain btrfs_search_slot() with ins_len == 0, so that
                 * no write lock is ever attempted on the parent of the leaf.
                 * That keeps lock contention low when other inodes of the
                 * same subvolume are being fsynced concurrently, and it also
                 * avoids the needless leaf split that
                 * btrfs_insert_empty_item() could cause for an existing item.
                 */
                ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
                ASSERT(ret <= 0);
                if (ret > 0)
                        ret = -ENOENT;
        } else {
                /*
                 * First fsync in the current transaction, the item is not in
                 * the log yet and must be inserted. -EEXIST is not possible:
                 * we only get here for a fast fsync, and if the inode was
                 * evicted after being logged in this transaction, loading it
                 * again sets BTRFS_INODE_NEEDS_FULL_SYNC in its runtime flags
                 * and resets ->logged_trans to 0.
                 */
                ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
                                              sizeof(*item));
                ASSERT(ret != -EEXIST);
        }

        if (ret != 0)
                return ret;

        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                              struct btrfs_inode_item);
        fill_inode_item(trans, path->nodes[0], item, &inode->vfs_inode, 0, 0);
        btrfs_release_path(path);

        return 0;
}

/*
 * Add the checksums described by @sums to the log tree @log_root.
 *
 * Returns 0 on success or the error reported by the first helper that fails.
 */
static int log_csums(struct btrfs_trans_handle *trans,
                     struct btrfs_inode *inode,
                     struct btrfs_root *log_root,
                     struct btrfs_ordered_sum *sums)
{
        const u64 range_end = sums->logical + sums->len - 1;
        struct extent_state *cached = NULL;
        int ret;

        /*
         * Fast path: the inode was not the target of a reflink that brought
         * in new extents during this transaction, so overlapping checksum
         * items in the log are not a concern.
         */
        if (inode->last_reflink_trans < trans->transid)
                return btrfs_csum_file_blocks(trans, log_root, sums);

        /*
         * Checksum logging has to be serialized. Another task may be logging
         * a different file that references the same extent, and racing with
         * it could leave checksum items with overlapping ranges in the log.
         */
        ret = lock_extent(&log_root->log_csum_range, sums->logical, range_end,
                          &cached);
        if (ret)
                return ret;

        /*
         * Because of extent cloning, a csum item covering only part of a
         * cloned extent may have been logged before, and now we may be about
         * to log one for a larger part, or all, of that same extent. Items
         * covering the same range would break csum searches in the log tree
         * and leave checksums missing in the fs/subvolume tree, so first
         * delete (or trim and adjust) whatever the log already has for this
         * range.
         */
        ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
        if (ret == 0)
                ret = btrfs_csum_file_blocks(trans, log_root, sums);

        unlock_extent(&log_root->log_csum_range, sums->logical, range_end,
                      &cached);

        return ret;
}

/*
 * Copy a run of items from a subvolume tree leaf into the log tree of the
 * inode's root.
 *
 * @trans:        transaction handle; its transid decides which file extent
 *                items are considered old.
 * @inode:        inode whose items are being logged.
 * @dst_path:     path used for the batch insertion into the log tree. It is
 *                released here after a successful copy.
 * @src_path:     path whose leaf (nodes[0]) holds the items to copy;
 *                clone_leaf() is called on it first (see the comment in the
 *                body).
 * @start_slot:   first slot of the source leaf to consider.
 * @nr:           number of consecutive slots to consider.
 * @inode_only:   only compared against LOG_INODE_EXISTS, and only when an
 *                inode item is being filled.
 * @logged_isize: passed through to fill_inode_item() for inode items.
 * @ctx:          log context, handed to clone_leaf().
 *
 * The work is done in two passes over the same slots. The first pass builds
 * the batch of keys and sizes to insert, skipping old file extent items, and
 * logs the checksums of new regular extents. The second pass runs after the
 * empty items were inserted and fills them in, repeating the same skip
 * condition so that source and destination slots stay paired.
 *
 * Returns a negative errno on failure. On success it returns 0, assuming
 * clone_leaf() returns 0 on success (its definition is not visible here).
 */
static noinline int copy_items(struct btrfs_trans_handle *trans,
                               struct btrfs_inode *inode,
                               struct btrfs_path *dst_path,
                               struct btrfs_path *src_path,
                               int start_slot, int nr, int inode_only,
                               u64 logged_isize, struct btrfs_log_ctx *ctx)
{
        struct btrfs_root *log = inode->root->log_root;
        struct btrfs_file_extent_item *extent;
        struct extent_buffer *src;
        int ret;
        struct btrfs_key *ins_keys;
        u32 *ins_sizes;
        struct btrfs_item_batch batch;
        char *ins_data;
        int dst_index;
        const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
        const u64 i_size = i_size_read(&inode->vfs_inode);

        /*
         * To keep lockdep happy and avoid deadlocks, clone the source leaf and
         * use the clone. This is because otherwise we would be changing the log
         * tree, to insert items from the subvolume tree or insert csum items,
         * while holding a read lock on a leaf from the subvolume tree, which
         * creates a nasty lock dependency when COWing log tree nodes/leaves:
         *
         * 1) Modifying the log tree triggers an extent buffer allocation while
         *    holding a write lock on a parent extent buffer from the log tree.
         *    Allocating the pages for an extent buffer, or the extent buffer
         *    struct, can trigger inode eviction and finally the inode eviction
         *    will trigger a release/remove of a delayed node, which requires
         *    taking the delayed node's mutex;
         *
         * 2) Allocating a metadata extent for a log tree can trigger the async
         *    reclaim thread and make us wait for it to release enough space and
         *    unblock our reservation ticket. The reclaim thread can start
         *    flushing delayed items, and that in turn results in the need to
         *    lock delayed node mutexes and in the need to write lock extent
         *    buffers of a subvolume tree - all this while holding a write lock
         *    on the parent extent buffer in the log tree.
         *
         * So one task in scenario 1) running in parallel with another task in
         * scenario 2) could lead to a deadlock, one wanting to lock a delayed
         * node mutex while having a read lock on a leaf from the subvolume,
         * while the other is holding the delayed node's mutex and wants to
         * write lock the same subvolume leaf for flushing delayed items.
         */
        ret = clone_leaf(src_path, ctx);
        if (ret < 0)
                return ret;

        src = src_path->nodes[0];

        /*
         * A single allocation backs both arrays: the @nr item sizes come
         * first, followed by the @nr keys.
         */
        ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
                           nr * sizeof(u32), GFP_NOFS);
        if (!ins_data)
                return -ENOMEM;

        ins_sizes = (u32 *)ins_data;
        ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
        batch.keys = ins_keys;
        batch.data_sizes = ins_sizes;
        batch.total_data_size = 0;
        batch.nr = 0;

        /*
         * First pass: decide which items go into the batch. dst_index only
         * advances for items that are kept, so a skipped slot has its key
         * overwritten by the next candidate and leaves no gap in the arrays.
         */
        dst_index = 0;
        for (int i = 0; i < nr; i++) {
                const int src_slot = start_slot + i;
                struct btrfs_root *csum_root;
                struct btrfs_ordered_sum *sums;
                struct btrfs_ordered_sum *sums_next;
                LIST_HEAD(ordered_sums);
                u64 disk_bytenr;
                u64 disk_num_bytes;
                u64 extent_offset;
                u64 extent_num_bytes;
                bool is_old_extent;

                btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);

                if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
                        goto add_to_batch;

                extent = btrfs_item_ptr(src, src_slot,
                                        struct btrfs_file_extent_item);

                is_old_extent = (btrfs_file_extent_generation(src, extent) <
                                 trans->transid);

                /*
                 * Don't copy extents from past generations. That would make us
                 * log a lot more metadata for common cases like doing only a
                 * few random writes into a file and then fsync it for the first
                 * time or after the full sync flag is set on the inode. We can
                 * get leaves full of extent items, most of which are from past
                 * generations, so we can skip them - as long as the inode has
                 * not been the target of a reflink operation in this transaction,
                 * as in that case it might have had file extent items with old
                 * generations copied into it. We also must always log prealloc
                 * extents that start at or beyond eof, otherwise we would lose
                 * them on log replay.
                 */
                if (is_old_extent &&
                    ins_keys[dst_index].offset < i_size &&
                    inode->last_reflink_trans < trans->transid)
                        continue;

                if (skip_csum)
                        goto add_to_batch;

                /* Only regular extents have checksums. */
                if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
                        goto add_to_batch;

                /*
                 * If it's an extent created in a past transaction, then its
                 * checksums are already accessible from the committed csum tree,
                 * no need to log them.
                 */
                if (is_old_extent)
                        goto add_to_batch;

                disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
                /* If it's an explicit hole, there are no checksums. */
                if (disk_bytenr == 0)
                        goto add_to_batch;

                disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);

                /*
                 * For a compressed extent look up checksums for the whole
                 * on-disk extent, otherwise only for the part of it that this
                 * file extent item references.
                 */
                if (btrfs_file_extent_compression(src, extent)) {
                        extent_offset = 0;
                        extent_num_bytes = disk_num_bytes;
                } else {
                        extent_offset = btrfs_file_extent_offset(src, extent);
                        extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
                }

                csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
                disk_bytenr += extent_offset;
                ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
                                              disk_bytenr + extent_num_bytes - 1,
                                              &ordered_sums, false);
                if (ret < 0)
                        goto out;
                /* Reset ret so the loop below can use it as an error latch. */
                ret = 0;

                /*
                 * Always walk the whole list so every entry gets freed, but
                 * stop logging once one of the log_csums() calls failed.
                 */
                list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
                        if (!ret)
                                ret = log_csums(trans, inode, log, sums);
                        list_del(&sums->list);
                        kfree(sums);
                }
                if (ret)
                        goto out;

add_to_batch:
                ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
                batch.total_data_size += ins_sizes[dst_index];
                batch.nr++;
                dst_index++;
        }

        /*
         * We have a leaf full of old extent items that don't need to be logged,
         * so we don't need to do anything.
         */
        if (batch.nr == 0)
                goto out;

        ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
        if (ret)
                goto out;

        /*
         * Second pass: fill in the items that were just inserted. dst_index
         * is again the position inside the batch, relative to the slot that
         * dst_path points to.
         */
        dst_index = 0;
        for (int i = 0; i < nr; i++) {
                const int src_slot = start_slot + i;
                const int dst_slot = dst_path->slots[0] + dst_index;
                struct btrfs_key key;
                unsigned long src_offset;
                unsigned long dst_offset;

                /*
                 * We're done, all the remaining items in the source leaf
                 * correspond to old file extent items.
                 */
                if (dst_index >= batch.nr)
                        break;

                btrfs_item_key_to_cpu(src, &key, src_slot);

                if (key.type != BTRFS_EXTENT_DATA_KEY)
                        goto copy_item;

                extent = btrfs_item_ptr(src, src_slot,
                                        struct btrfs_file_extent_item);

                /* See the comment in the previous loop, same logic. */
                if (btrfs_file_extent_generation(src, extent) < trans->transid &&
                    key.offset < i_size &&
                    inode->last_reflink_trans < trans->transid)
                        continue;

copy_item:
                dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
                src_offset = btrfs_item_ptr_offset(src, src_slot);

                /*
                 * Inode items are not copied from the source leaf, they are
                 * filled from the in-memory inode. Every other item is copied
                 * as is.
                 */
                if (key.type == BTRFS_INODE_ITEM_KEY) {
                        struct btrfs_inode_item *inode_item;

                        inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
                                                    struct btrfs_inode_item);
                        fill_inode_item(trans, dst_path->nodes[0], inode_item,
                                        &inode->vfs_inode,
                                        inode_only == LOG_INODE_EXISTS,
                                        logged_isize);
                } else {
                        copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
                                           src_offset, ins_sizes[dst_index]);
                }

                dst_index++;
        }

        btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]);
        btrfs_release_path(dst_path);
out:
        kfree(ins_data);

        return ret;
}

/*
 * list_sort() comparison callback: orders extent maps by ascending start
 * offset. @priv is not used.
 */
static int extent_cmp(void *priv, const struct list_head *a,
                      const struct list_head *b)
{
        const struct extent_map *lhs = list_entry(a, struct extent_map, list);
        const struct extent_map *rhs = list_entry(b, struct extent_map, list);

        if (lhs->start == rhs->start)
                return 0;

        return (lhs->start < rhs->start) ? -1 : 1;
}

/*
 * Log the checksums for the file range covered by the extent map @em.
 *
 * Nothing is logged for NODATASUM inodes, prealloc extents or holes.
 * Checksums still attached to the ordered extents in @ctx->ordered_extents
 * are logged first, each ordered extent at most once (tracked with the
 * BTRFS_ORDERED_LOGGED_CSUM flag). Whatever part of the range is left after
 * that is looked up in the csum root and logged from there.
 *
 * Returns 0 on success, or the error returned by log_csums() or
 * btrfs_lookup_csums_list().
 */
static int log_extent_csums(struct btrfs_trans_handle *trans,
                            struct btrfs_inode *inode,
                            struct btrfs_root *log_root,
                            const struct extent_map *em,
                            struct btrfs_log_ctx *ctx)
{
        struct btrfs_ordered_extent *ordered;
        struct btrfs_root *csum_root;
        u64 csum_offset;
        u64 csum_len;
        /* Remaining file range that still needs csums from the csum root. */
        u64 mod_start = em->start;
        u64 mod_len = em->len;
        LIST_HEAD(ordered_sums);
        int ret = 0;

        if (inode->flags & BTRFS_INODE_NODATASUM ||
            (em->flags & EXTENT_FLAG_PREALLOC) ||
            em->block_start == EXTENT_MAP_HOLE)
                return 0;

        list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
                const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
                const u64 mod_end = mod_start + mod_len;
                struct btrfs_ordered_sum *sums;

                /* Earlier ordered extents already covered the whole range. */
                if (mod_len == 0)
                        break;

                /* This ordered extent ends before the remaining range. */
                if (ordered_end <= mod_start)
                        continue;
                /*
                 * This ordered extent starts at or past the end of the
                 * remaining range. NOTE(review): stopping here instead of
                 * moving on presumably relies on the list being sorted by
                 * file offset - confirm against where the list is built.
                 */
                if (mod_end <= ordered->file_offset)
                        break;

                /*
                 * We are going to copy all the csums on this ordered extent, so
                 * go ahead and adjust mod_start and mod_len in case this ordered
                 * extent has already been logged.
                 */
                if (ordered->file_offset > mod_start) {
                        if (ordered_end >= mod_end)
                                mod_len = ordered->file_offset - mod_start;
                        /*
                         * If we have this case
                         *
                         * |--------- logged extent ---------|
                         *       |----- ordered extent ----|
                         *
                         * Just don't mess with mod_start and mod_len, we'll
                         * just end up logging more csums than we need and it
                         * will be ok.
                         */
                } else {
                        if (ordered_end < mod_end) {
                                mod_len = mod_end - ordered_end;
                                mod_start = ordered_end;
                        } else {
                                mod_len = 0;
                        }
                }

                /*
                 * To keep us from looping for the above case of an ordered
                 * extent that falls inside of the logged extent.
                 */
                if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
                        continue;

                list_for_each_entry(sums, &ordered->list, list) {
                        ret = log_csums(trans, inode, log_root, sums);
                        if (ret)
                                return ret;
                }
        }

        /* We're done, found all csums in the ordered extents. */
        if (mod_len == 0)
                return 0;

        /* If we're compressed we have to save the entire range of csums. */
        if (extent_map_is_compressed(em)) {
                csum_offset = 0;
                csum_len = max(em->block_len, em->orig_block_len);
        } else {
                csum_offset = mod_start - em->start;
                csum_len = mod_len;
        }

        /* block start is already adjusted for the file extent offset. */
        csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
        ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset,
                                      em->block_start + csum_offset +
                                      csum_len - 1, &ordered_sums, false);
        if (ret < 0)
                return ret;
        /* Reset ret so the loop below can use it as an error latch. */
        ret = 0;

        /*
         * Drain the list completely so that every entry is freed, but stop
         * logging after the first log_csums() failure.
         */
        while (!list_empty(&ordered_sums)) {
                struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
                                                   struct btrfs_ordered_sum,
                                                   list);
                if (!ret)
                        ret = log_csums(trans, inode, log_root, sums);
                list_del(&sums->list);
                kfree(sums);
        }

        return ret;
}

static int log_one_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_inode *inode,
                          const struct extent_map *em,
                          struct btrfs_path *path,
                          struct btrfs_log_ctx *ctx)
{
        struct btrfs_drop_extents_args drop_args = { 0 };
        struct btrfs_root *log = inode->root->log_root;
        struct btrfs_file_extent_item fi = { 0 };
        struct extent_buffer *leaf;
        struct btrfs_key key;
        enum btrfs_compression_type compress_type;
        u64 extent_offset = em->start - em->orig_start;
        u64 block_len;
        int ret;

        btrfs_set_stack_file_extent_generation(&fi, trans->transid);
        if (em->flags & EXTENT_FLAG_PREALLOC)
                btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
        else
                btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);

        block_len = max(em->block_len, em->orig_block_len);
        compress_type = extent_map_compression(em);
        if (compress_type != BTRFS_COMPRESS_NONE) {
                btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
                btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
        } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
                btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
                                                        extent_offset);
                btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
        }

        btrfs_set_stack_file_extent_offset(&fi, extent_offset);
        btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
        btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
        btrfs_set_stack_file_extent_compression(&fi, compress_type);

        ret = log_extent_csums(trans, inode, log, em, ctx);
        if (ret)
                return ret;

        /*
         * If this is the first time we are logging the inode in the current
         * transaction, we can avoid btrfs_drop_extents(), which is expensive
         * because it does a deletion search, which always acquires write locks
         * for extent buffers at levels 2, 1 and 0. This not only wastes time
         * but also adds significant contention in a log tree, since log trees
         * are small, with a root at level 2 or 3 at most, due to their short
         * life span.
         */
        if (ctx->logged_before) {
                drop_args.path = path;
                drop_args.start = em->start;
                drop_args.end = em->start + em->len;
                drop_args.replace_extent = true;
                drop_args.extent_item_size = sizeof(fi);
                ret = btrfs_drop_extents(trans, log, inode, &drop_args);
                if (ret)
                        return ret;
        }

        if (!drop_args.extent_inserted) {
                key.objectid = btrfs_ino(inode);
                key.type = BTRFS_EXTENT_DATA_KEY;
                key.offset = em->start;

                ret = btrfs_insert_empty_item(trans, log, path, &key,
                                              sizeof(fi));
                if (ret)
                        return ret;
        }
        leaf = path->nodes[0];
        write_extent_buffer(leaf, &fi,
                            btrfs_item_ptr_offset(leaf, path->slots[0]),
                            sizeof(fi));
        btrfs_mark_buffer_dirty(trans, leaf);

        btrfs_release_path(path);

        return ret;
}

/*
 * Log all prealloc extents beyond the inode's i_size to make sure we do not
 * lose them after doing a full/fast fsync and replaying the log. We scan the
 * subvolume's root instead of iterating the inode's extent map tree because
 * otherwise we can log incorrect extent items based on extent map conversion.
 * That can happen due to the fact that extent maps are merged when they
 * are not in the extent map tree's list of modified extents.
 *
 * @trans: transaction handle.
 * @inode: inode being logged; nothing is done unless it has the
 *         BTRFS_INODE_PREALLOC flag set.
 * @path:  path used to walk the subvolume tree; released before returning
 *         once the walk has started.
 * @ctx:   log context, passed on to copy_items().
 *
 * Returns 0 on success or the error returned by the first helper that fails
 * (-ENOMEM if the destination path cannot be allocated).
 */
static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
                                      struct btrfs_inode *inode,
                                      struct btrfs_path *path,
                                      struct btrfs_log_ctx *ctx)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_key key;
        const u64 i_size = i_size_read(&inode->vfs_inode);
        const u64 ino = btrfs_ino(inode);
        struct btrfs_path *dst_path = NULL;
        bool dropped_extents = false;
        u64 truncate_offset = i_size;
        struct extent_buffer *leaf;
        int slot;
        /* Pending run of items in the current leaf: [start_slot, +ins_nr). */
        int ins_nr = 0;
        int start_slot = 0;
        int ret;

        if (!(inode->flags & BTRFS_INODE_PREALLOC))
                return 0;

        key.objectid = ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = i_size;
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        /*
         * We must check if there is a prealloc extent that starts before the
         * i_size and crosses the i_size boundary. This is to ensure later we
         * truncate down to the end of that extent and not to the i_size, as
         * otherwise we end up losing part of the prealloc extent after a log
         * replay and with an implicit hole if there is another prealloc extent
         * that starts at an offset beyond i_size.
         */
        ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
        if (ret < 0)
                goto out;

        if (ret == 0) {
                struct btrfs_file_extent_item *ei;

                leaf = path->nodes[0];
                slot = path->slots[0];
                ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);

                if (btrfs_file_extent_type(leaf, ei) ==
                    BTRFS_FILE_EXTENT_PREALLOC) {
                        u64 extent_end;

                        btrfs_item_key_to_cpu(leaf, &key, slot);
                        extent_end = key.offset +
                                btrfs_file_extent_num_bytes(leaf, ei);

                        if (extent_end > i_size)
                                truncate_offset = extent_end;
                }
        } else {
                /* No previous file extent item was found, not an error. */
                ret = 0;
        }

        while (true) {
                leaf = path->nodes[0];
                slot = path->slots[0];

                if (slot >= btrfs_header_nritems(leaf)) {
                        /*
                         * End of this leaf. start_slot and ins_nr refer to
                         * slots of the current leaf, so copy the pending run
                         * before moving to the next one. The inode_only and
                         * logged_isize arguments (1 and 0) are only used by
                         * copy_items() when it fills an inode item.
                         */
                        if (ins_nr > 0) {
                                ret = copy_items(trans, inode, dst_path, path,
                                                 start_slot, ins_nr, 1, 0, ctx);
                                if (ret < 0)
                                        goto out;
                                ins_nr = 0;
                        }
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                goto out;
                        /* A positive return means there are no more leaves. */
                        if (ret > 0) {
                                ret = 0;
                                break;
                        }
                        continue;
                }

                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.objectid > ino)
                        break;
                /*
                 * Skip items that sort before the file extent items of this
                 * inode and file extent items that start before i_size.
                 */
                if (WARN_ON_ONCE(key.objectid < ino) ||
                    key.type < BTRFS_EXTENT_DATA_KEY ||
                    key.offset < i_size) {
                        path->slots[0]++;
                        continue;
                }
                /*
                 * Avoid overlapping items in the log tree. The first time we
                 * get here, get rid of everything from a past fsync. After
                 * that, if the current extent starts before the end of the last
                 * extent we copied, truncate the last one. This can happen if
                 * an ordered extent completion modifies the subvolume tree
                 * while btrfs_next_leaf() has the tree unlocked.
                 */
                if (!dropped_extents || key.offset < truncate_offset) {
                        ret = truncate_inode_items(trans, root->log_root, inode,
                                                   min(key.offset, truncate_offset),
                                                   BTRFS_EXTENT_DATA_KEY);
                        if (ret)
                                goto out;
                        dropped_extents = true;
                }
                /* Remember where this extent ends for the overlap check. */
                truncate_offset = btrfs_file_extent_end(path);
                if (ins_nr == 0)
                        start_slot = slot;
                ins_nr++;
                path->slots[0]++;
                /* Allocated lazily, once there is at least one item to copy. */
                if (!dst_path) {
                        dst_path = btrfs_alloc_path();
                        if (!dst_path) {
                                ret = -ENOMEM;
                                goto out;
                        }
                }
        }
        /* Copy the run still pending when the loop ended. */
        if (ins_nr > 0)
                ret = copy_items(trans, inode, dst_path, path,
                                 start_slot, ins_nr, 1, 0, ctx);
out:
        btrfs_release_path(path);
        btrfs_free_path(dst_path);
        return ret;
}

/*
 * Log the extent maps that are on the inode's list of modified extents.
 *
 * With the extent map tree lock held for writing, every entry is taken off
 * tree->modified_extents. Entries from the current transaction, other than
 * prealloc extents starting at or beyond i_size, are moved to a private list
 * with an extra reference and EXTENT_FLAG_LOGGING set. The private list is
 * sorted by start offset and each entry is logged with log_one_extent(),
 * with the lock dropped around that call. Prealloc extents beyond i_size are
 * then logged by btrfs_log_prealloc_extents(), and finally the ordered
 * extents in @ctx->ordered_extents are flagged as logged and released.
 *
 * Returns 0 on success, -EFBIG if the number of extent maps exceeds the limit
 * used below, or the error from log_one_extent() or
 * btrfs_log_prealloc_extents().
 */
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
                                     struct btrfs_inode *inode,
                                     struct btrfs_path *path,
                                     struct btrfs_log_ctx *ctx)
{
        struct btrfs_ordered_extent *ordered;
        struct btrfs_ordered_extent *tmp;
        struct extent_map *em, *n;
        LIST_HEAD(extents);
        struct extent_map_tree *tree = &inode->extent_tree;
        int ret = 0;
        int num = 0;

        write_lock(&tree->lock);

        list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
                list_del_init(&em->list);
                /*
                 * Just an arbitrary number, this can be really CPU intensive
                 * once we start getting a lot of extents, and really once we
                 * have a bunch of extents we just want to commit since it will
                 * be faster.
                 *
                 * NOTE(review): num is incremented here and again at the end
                 * of the loop body, so an extent map that gets queued counts
                 * twice towards this limit - confirm this is intended.
                 */
                if (++num > 32768) {
                        /* Reinitialize the list head of the modified list. */
                        list_del_init(&tree->modified_extents);
                        ret = -EFBIG;
                        /*
                         * Skip the sort. With ret set, the loop below only
                         * drops what was queued so far.
                         */
                        goto process;
                }

                if (em->generation < trans->transid)
                        continue;

                /* We log prealloc extents beyond eof later. */
                if ((em->flags & EXTENT_FLAG_PREALLOC) &&
                    em->start >= i_size_read(&inode->vfs_inode))
                        continue;

                /* Need a ref to keep it from getting evicted from cache */
                refcount_inc(&em->refs);
                em->flags |= EXTENT_FLAG_LOGGING;
                list_add_tail(&em->list, &extents);
                num++;
        }

        list_sort(NULL, &extents, extent_cmp);
process:
        while (!list_empty(&extents)) {
                em = list_entry(extents.next, struct extent_map, list);

                list_del_init(&em->list);

                /*
                 * If we had an error we just need to delete everybody from our
                 * private list.
                 */
                if (ret) {
                        clear_em_logging(inode, em);
                        free_extent_map(em);
                        continue;
                }

                /*
                 * The tree lock is not held while logging the extent. It is
                 * taken again before the logging flag is cleared and our
                 * reference is dropped.
                 */
                write_unlock(&tree->lock);

                ret = log_one_extent(trans, inode, em, path, ctx);
                write_lock(&tree->lock);
                clear_em_logging(inode, em);
                free_extent_map(em);
        }
        WARN_ON(!list_empty(&extents));
        write_unlock(&tree->lock);

        if (!ret)
                ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
        if (ret)
                return ret;

        /*
         * We have logged all extents successfully, now make sure the commit of
         * the current transaction waits for the ordered extents to complete
         * before it commits and wipes out the log trees, otherwise we would
         * lose data if an ordered extent completes after the transaction
         * commits and a power failure happens after the transaction commit.
         */
        list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
                list_del_init(&ordered->log_list);
                set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);

                /*
                 * Check without the lock first, then check again with
                 * ordered_tree_lock held before marking the ordered extent
                 * as pending and bumping the transaction's counter.
                 */
                if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
                        spin_lock_irq(&inode->ordered_tree_lock);
                        if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
                                set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
                                atomic_inc(&trans->transaction->pending_ordered);
                        }
                        spin_unlock_irq(&inode->ordered_tree_lock);
                }
                btrfs_put_ordered_extent(ordered);
        }

        return 0;
}

/*
 * Report the size to use for @inode based on its inode item in the @log tree.
 *
 * If the log has no inode item for @inode, *@size_ret is set to 0. Otherwise
 * it is set to the size stored in the logged item, capped at the in-memory
 * i_size.
 *
 * Returns 0 on success, with @path released, or the negative value returned
 * by btrfs_search_slot() (in which case @path is not released here).
 */
static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
                             struct btrfs_path *path, u64 *size_ret)
{
        struct btrfs_inode_item *item;
        struct btrfs_key key;
        u64 logged_size = 0;
        int ret;

        key.objectid = btrfs_ino(inode);
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
        if (ret < 0)
                return ret;

        if (ret == 0) {
                item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                      struct btrfs_inode_item);
                logged_size = btrfs_inode_size(path->nodes[0], item);
                /*
                 * If the in-memory inode's i_size is smaller than the inode
                 * size stored in the btree, use the inode's i_size, so that
                 * we get a correct inode size after replaying the log when,
                 * before a power failure, we had a shrinking truncate
                 * followed by the addition of a new name (rename / new hard
                 * link). Otherwise use the inode size from the btree, to
                 * avoid data loss when replaying a log due to previously
                 * doing a write that expands the inode's size and logging a
                 * new name immediately after.
                 */
                if (logged_size > inode->vfs_inode.i_size)
                        logged_size = inode->vfs_inode.i_size;
        }

        *size_ret = logged_size;
        btrfs_release_path(path);
        return 0;
}

/*
 * Copy every xattr item of @inode from its fs/subvol tree into the log.
 *
 * At the moment we always log all xattrs. This is to figure out at log replay
 * time which xattrs must have their deletion replayed. If a xattr is missing
 * in the log tree and exists in the fs/subvol tree, we delete it. This is
 * because if a xattr is deleted, the inode is fsynced and a power failure
 * happens, causing the log to be replayed the next time the fs is mounted,
 * we want the xattr to not exist anymore (same behaviour as other filesystems
 * with a journal, ext3/4, xfs, f2fs, etc).
 *
 * Consecutive xattr items within a leaf are handed to copy_items() as one
 * batch. If no xattr item is found at all, BTRFS_INODE_NO_XATTRS is set on the
 * inode so that later calls return immediately.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
                                struct btrfs_inode *inode,
                                struct btrfs_path *path,
                                struct btrfs_path *dst_path,
                                struct btrfs_log_ctx *ctx)
{
        struct btrfs_root *root = inode->root;
        const u64 ino = btrfs_ino(inode);
        struct btrfs_key key;
        bool found_xattrs = false;
        int batch_start = 0;
        int batch_len = 0;
        int ret;

        if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
                return 0;

        key.objectid = ino;
        key.type = BTRFS_XATTR_ITEM_KEY;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                return ret;

        for (;;) {
                struct extent_buffer *leaf = path->nodes[0];
                const int slot = path->slots[0];
                const int nritems = btrfs_header_nritems(leaf);

                if (slot < nritems) {
                        btrfs_item_key_to_cpu(leaf, &key, slot);
                        /* Past the last xattr item of our inode, we're done. */
                        if (key.objectid != ino ||
                            key.type != BTRFS_XATTR_ITEM_KEY)
                                break;

                        /* Add this slot to the batch being accumulated. */
                        if (batch_len == 0)
                                batch_start = slot;
                        batch_len++;
                        path->slots[0]++;
                        found_xattrs = true;
                        cond_resched();
                        continue;
                }

                /*
                 * End of the leaf: a batch can not span leaves, so copy what
                 * we have accumulated before moving to the next leaf.
                 */
                if (batch_len > 0) {
                        ret = copy_items(trans, inode, dst_path, path,
                                         batch_start, batch_len, 1, 0, ctx);
                        if (ret < 0)
                                return ret;
                        batch_len = 0;
                }

                ret = btrfs_next_leaf(root, path);
                if (ret < 0)
                        return ret;
                if (ret > 0)
                        break;
        }

        /* Copy the batch that was still pending when the loop stopped. */
        if (batch_len > 0) {
                ret = copy_items(trans, inode, dst_path, path,
                                 batch_start, batch_len, 1, 0, ctx);
                if (ret < 0)
                        return ret;
        }

        if (!found_xattrs)
                set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);

        return 0;
}

/*
 * When using the NO_HOLES feature if we punched a hole that causes the
 * deletion of entire leafs or all the extent items of the first leaf (the one
 * that contains the inode item and references) we may end up not processing
 * any extents, because there are no leafs with a generation matching the
 * current transaction that have extent items for our inode. So we need to find
 * if any holes exist and then log them. We also need to log holes after any
 * truncate operation that changes the inode's size.
 *
 * This walks the BTRFS_EXTENT_DATA_KEY items of @inode in its fs/subvol root
 * and inserts a hole extent into the log root for every gap found between the
 * end of one item and the offset of the next, plus one for any gap between the
 * last item and i_size.
 *
 * Returns 0 on success (including when NO_HOLES is not enabled or i_size is
 * 0) or a negative errno.
 */
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
                           struct btrfs_inode *inode,
                           struct btrfs_path *path)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key key;
        const u64 ino = btrfs_ino(inode);
        const u64 i_size = i_size_read(&inode->vfs_inode);
        /* End of the previously visited extent item, 0 until one is seen. */
        u64 prev_extent_end = 0;
        int ret;

        /* Nothing to log without NO_HOLES or for an empty file. */
        if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
                return 0;

        key.objectid = ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                return ret;

        while (true) {
                struct extent_buffer *leaf = path->nodes[0];

                /* Ran past the last item of this leaf, move to the next one. */
                if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                return ret;
                        if (ret > 0) {
                                ret = 0;
                                break;
                        }
                        leaf = path->nodes[0];
                }

                /* Stop at the first item that is not an extent item of ours. */
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
                        break;

                /* We have a hole, log it. */
                if (prev_extent_end < key.offset) {
                        const u64 hole_len = key.offset - prev_extent_end;

                        /*
                         * Release the path to avoid deadlocks with other code
                         * paths that search the root while holding locks on
                         * leafs from the log root.
                         */
                        btrfs_release_path(path);
                        ret = btrfs_insert_hole_extent(trans, root->log_root,
                                                       ino, prev_extent_end,
                                                       hole_len);
                        if (ret < 0)
                                return ret;

                        /*
                         * Search for the same key again in the root. Since it's
                         * an extent item and we are holding the inode lock, the
                         * key must still exist. If it doesn't just emit warning
                         * and return an error to fall back to a transaction
                         * commit.
                         */
                        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                        if (ret < 0)
                                return ret;
                        if (WARN_ON(ret > 0))
                                return -ENOENT;
                        leaf = path->nodes[0];
                }

                /* Remember where this extent item ends and go to the next slot. */
                prev_extent_end = btrfs_file_extent_end(path);
                path->slots[0]++;
                cond_resched();
        }

        /*
         * The last extent item ends before i_size, so there is a trailing hole.
         * Its length is rounded up to a multiple of the sector size.
         */
        if (prev_extent_end < i_size) {
                u64 hole_len;

                btrfs_release_path(path);
                hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
                ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
                                               prev_extent_end, hole_len);
                if (ret < 0)
                        return ret;
        }

        return 0;
}

/*
 * When we are logging a new inode X, check if it doesn't have a reference that
 * matches the reference from some other inode Y created in a past transaction
 * and that was renamed in the current transaction. If we don't do this, then at
 * log replay time we can lose inode Y (and all its files if it's a directory):
 *
 * mkdir /mnt/x
 * echo "hello world" > /mnt/x/foobar
 * sync
 * mv /mnt/x /mnt/y
 * mkdir /mnt/x                 # or touch /mnt/x
 * xfs_io -c fsync /mnt/x
 * <power fail>
 * mount fs, trigger log replay
 *
 * After the log replay procedure, we would lose the first directory and all its
 * files (file foobar).
 * For the case where inode Y is not a directory we simply end up losing it:
 *
 * echo "123" > /mnt/foo
 * sync
 * mv /mnt/foo /mnt/bar
 * echo "abc" > /mnt/foo
 * xfs_io -c fsync /mnt/foo
 * <power fail>
 *
 * We also need this for cases where a snapshot entry is replaced by some other
 * entry (file or directory) otherwise we end up with an unreplayable log due to
 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
 * if it were a regular entry:
 *
 * mkdir /mnt/x
 * btrfs subvolume snapshot /mnt /mnt/x/snap
 * btrfs subvolume delete /mnt/x/snap
 * rmdir /mnt/x
 * mkdir /mnt/x
 * fsync /mnt/x or fsync some new file inside it
 * <power fail>
 *
 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
 * the same transaction.
 */
/*
 * Walk every name stored in the inode ref/extref item at @slot of @eb and look
 * each one up, in the commit root, in the directory it belongs to.
 *
 * Returns:
 *   0       - no name resolves to a dir item in the commit root, or the first
 *             dir item found points to the same inode as @key;
 *   1       - the first dir item found points to a different inode, whose
 *             number is stored in *@other_ino and whose parent directory is
 *             stored in *@other_parent;
 *   -EAGAIN - the first dir item found does not point to an inode item;
 *   < 0     - -ENOMEM or the error returned by btrfs_lookup_dir_item().
 */
static int btrfs_check_ref_name_override(struct extent_buffer *eb,
                                         const int slot,
                                         const struct btrfs_key *key,
                                         struct btrfs_inode *inode,
                                         u64 *other_ino, u64 *other_parent)
{
        const u32 item_size = btrfs_item_size(eb, slot);
        const unsigned long item_ptr = btrfs_item_ptr_offset(eb, slot);
        struct btrfs_path *search_path;
        char *name_buf = NULL;
        u32 name_buf_len = 0;
        u32 cur_offset = 0;
        int ret = 0;

        search_path = btrfs_alloc_path();
        if (!search_path)
                return -ENOMEM;
        search_path->search_commit_root = 1;
        search_path->skip_locking = 1;

        while (cur_offset < item_size) {
                struct fscrypt_str name_str;
                struct btrfs_dir_item *di;
                struct btrfs_key di_key;
                unsigned long name_ptr;
                u32 this_name_len;
                u32 this_len;
                u64 parent;

                /* Decode the current entry, the layout depends on the key type. */
                if (key->type == BTRFS_INODE_REF_KEY) {
                        struct btrfs_inode_ref *iref;

                        iref = (struct btrfs_inode_ref *)(item_ptr + cur_offset);
                        parent = key->offset;
                        this_name_len = btrfs_inode_ref_name_len(eb, iref);
                        name_ptr = (unsigned long)(iref + 1);
                        this_len = sizeof(*iref) + this_name_len;
                } else {
                        struct btrfs_inode_extref *extref;

                        extref = (struct btrfs_inode_extref *)(item_ptr +
                                                               cur_offset);
                        parent = btrfs_inode_extref_parent(eb, extref);
                        this_name_len = btrfs_inode_extref_name_len(eb, extref);
                        name_ptr = (unsigned long)&extref->name;
                        this_len = sizeof(*extref) + this_name_len;
                }

                /* Grow the scratch buffer only when this name does not fit. */
                if (this_name_len > name_buf_len) {
                        char *grown;

                        grown = krealloc(name_buf, this_name_len, GFP_NOFS);
                        if (!grown) {
                                ret = -ENOMEM;
                                goto out;
                        }
                        name_buf = grown;
                        name_buf_len = this_name_len;
                }

                read_extent_buffer(eb, name_buf, name_ptr, this_name_len);

                name_str.name = name_buf;
                name_str.len = this_name_len;
                di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
                                parent, &name_str, 0);
                if (IS_ERR(di)) {
                        ret = PTR_ERR(di);
                        goto out;
                }

                if (!di) {
                        /* Name not in the commit root, try the next one. */
                        btrfs_release_path(search_path);
                        cur_offset += this_len;
                        continue;
                }

                /* The first name that resolves decides the result. */
                btrfs_dir_item_key_to_cpu(search_path->nodes[0], di, &di_key);
                if (di_key.type != BTRFS_INODE_ITEM_KEY) {
                        ret = -EAGAIN;
                } else if (di_key.objectid != key->objectid) {
                        ret = 1;
                        *other_ino = di_key.objectid;
                        *other_parent = parent;
                } else {
                        ret = 0;
                }
                goto out;
        }
out:
        btrfs_free_path(search_path);
        kfree(name_buf);
        return ret;
}

/*
 * Check if we need to log an inode. This is used in contexts where while
 * logging an inode we need to log another inode (either that it exists or in
 * full mode). This is used instead of btrfs_inode_in_log() because the latter
 * requires the inode to be in the log and have the log transaction committed,
 * while here we do not care if the log transaction was already committed - our
 * caller will commit the log later - and we want to avoid logging an inode
 * multiple times when multiple tasks have joined the same log transaction.
 *
 * Returns true if @inode has to be logged, false if logging can be skipped.
 */
static bool need_log_inode(const struct btrfs_trans_handle *trans,
                           struct btrfs_inode *inode)
{
        const bool is_dir = S_ISDIR(inode->vfs_inode.i_mode);

        /*
         * A directory that was not modified in this transaction (no dentries
         * added or removed) can and should be skipped.
         */
        if (is_dir && inode->last_trans < trans->transid)
                return false;

        /*
         * Not flagged as logged in the current transaction: it must be logged.
         * This includes the case where the inode was logged and then evicted
         * and reloaded, since logged_trans is a transient field that is not
         * persisted and is 0 after a reload.
         */
        if (inode_logged(trans, inode, NULL) != 1)
                return true;

        /*
         * Already logged in the current transaction. It can be skipped unless
         * everything has to be copied again, i.e. there are new/updated/deleted
         * xattrs since the last time it was logged. New/deleted names are
         * updated in the log by link/unlink/rename operations.
         */
        return test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
}

/*
 * Element of the local work list built by log_new_dir_dentries(): one entry
 * per directory whose own new dentries still have to be processed.
 */
struct btrfs_dir_list {
        /* Inode number of the directory. */
        u64 ino;
        /* Links the element into the work list. */
        struct list_head list;
};

/*
 * Log the inodes of the new dentries of a directory.
 * See process_dir_items_leaf() for details about why it is needed.
 * This is a recursive operation - if an existing dentry corresponds to a
 * directory, that directory's new entries are logged too (same behaviour as
 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
 * the dentries point to we do not acquire their VFS lock, otherwise lockdep
 * complains about the following circular lock dependency / possible deadlock:
 *
 *        CPU0                                        CPU1
 *        ----                                        ----
 * lock(&type->i_mutex_dir_key#3/2);
 *                                            lock(sb_internal#2);
 *                                            lock(&type->i_mutex_dir_key#3/2);
 * lock(&sb->s_type->i_mutex_key#14);
 *
 * Where sb_internal is the lock (a counter that works as a lock) acquired by
 * sb_start_intwrite() in btrfs_start_transaction().
 * Not acquiring the VFS lock of the inodes is still safe because:
 *
 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
 *    that while logging the inode new references (names) are added or removed
 *    from the inode, leaving the logged inode item with a link count that does
 *    not match the number of logged inode reference items. This is fine because
 *    at log replay time we compute the real number of links and correct the
 *    link count in the inode item (see replay_one_buffer() and
 *    link_to_fixup_dir());
 *
 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
 *    while logging the inode's items new index items (key type
 *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
 *    has a size that doesn't match the sum of the lengths of all the logged
 *    names - this is ok, not a problem, because at log replay time we set the
 *    directory's i_size to the correct value (see replay_one_name() and
 *    overwrite_item()).
 */
static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
                                struct btrfs_inode *start_inode,
                                struct btrfs_log_ctx *ctx)
{
        struct btrfs_root *root = start_inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        /* Directories found along the way whose new dentries must be logged too. */
        LIST_HEAD(dir_list);
        struct btrfs_dir_list *dir_elem;
        /* Inode number of the directory currently being processed. */
        u64 ino = btrfs_ino(start_inode);
        struct btrfs_inode *curr_inode = start_inode;
        int ret = 0;

        /*
         * If we are logging a new name, as part of a link or rename operation,
         * don't bother logging new dentries, as we just want to log the names
         * of an inode and that any new parents exist.
         */
        if (ctx->logging_new_name)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Pairs with btrfs_add_delayed_iput below. */
        ihold(&curr_inode->vfs_inode);

        /* Each iteration of this loop processes one directory (curr_inode). */
        while (true) {
                struct inode *vfs_inode;
                struct btrfs_key key;
                struct btrfs_key found_key;
                /* Value stored as the first dir index to log for curr_inode. */
                u64 next_index;
                bool continue_curr_inode = true;
                int iter_ret;

                key.objectid = ino;
                key.type = BTRFS_DIR_INDEX_KEY;
                key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
                next_index = key.offset;
again:
                /*
                 * Scan the dir index items of the directory in the log tree.
                 * At most one inode is logged per pass over this loop, because
                 * the path is released before logging; the scan is then
                 * restarted from the key that follows the one just handled.
                 */
                btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
                        struct extent_buffer *leaf = path->nodes[0];
                        struct btrfs_dir_item *di;
                        struct btrfs_key di_key;
                        struct inode *di_inode;
                        int log_mode = LOG_INODE_EXISTS;
                        int type;

                        /* No more dir index items for this directory. */
                        if (found_key.objectid != ino ||
                            found_key.type != BTRFS_DIR_INDEX_KEY) {
                                continue_curr_inode = false;
                                break;
                        }

                        next_index = found_key.offset + 1;

                        di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
                        type = btrfs_dir_ftype(leaf, di);
                        /* Skip dentries that are not from the current transaction. */
                        if (btrfs_dir_transid(leaf, di) < trans->transid)
                                continue;
                        btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
                        /* Skip dentries that point to a root item, not to an inode. */
                        if (di_key.type == BTRFS_ROOT_ITEM_KEY)
                                continue;

                        /*
                         * The path is released here, so from this point on we
                         * must leave the iteration with 'break' or 'goto',
                         * never with 'continue'.
                         */
                        btrfs_release_path(path);
                        di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
                        if (IS_ERR(di_inode)) {
                                ret = PTR_ERR(di_inode);
                                goto out;
                        }

                        if (!need_log_inode(trans, BTRFS_I(di_inode))) {
                                btrfs_add_delayed_iput(BTRFS_I(di_inode));
                                break;
                        }

                        /*
                         * Reset the flag so that, after logging, it tells
                         * whether logging this inode asked for its new dentries
                         * to be logged as well.
                         */
                        ctx->log_new_dentries = false;
                        if (type == BTRFS_FT_DIR)
                                log_mode = LOG_INODE_ALL;
                        ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
                                              log_mode, ctx);
                        btrfs_add_delayed_iput(BTRFS_I(di_inode));
                        if (ret)
                                goto out;
                        /* Queue the inode so its dentries are processed later. */
                        if (ctx->log_new_dentries) {
                                dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
                                if (!dir_elem) {
                                        ret = -ENOMEM;
                                        goto out;
                                }
                                dir_elem->ino = di_key.objectid;
                                list_add_tail(&dir_elem->list, &dir_list);
                        }
                        break;
                }

                btrfs_release_path(path);

                /*
                 * NOTE(review): iter_ret > 0 is taken to mean that the
                 * iteration ran out of items - confirm against the definition
                 * of btrfs_for_each_slot().
                 */
                if (iter_ret < 0) {
                        ret = iter_ret;
                        goto out;
                } else if (iter_ret > 0) {
                        continue_curr_inode = false;
                } else {
                        key = found_key;
                }

                /* Resume right after the last key seen, unless it can't be bumped. */
                if (continue_curr_inode && key.offset < (u64)-1) {
                        key.offset++;
                        goto again;
                }

                /* Done with this directory, record the index to start from next. */
                btrfs_set_first_dir_index_to_log(curr_inode, next_index);

                if (list_empty(&dir_list))
                        break;

                /* Move on to the next queued directory. */
                dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
                ino = dir_elem->ino;
                list_del(&dir_elem->list);
                kfree(dir_elem);

                btrfs_add_delayed_iput(curr_inode);
                /* Cleared so the exit path does not drop the reference twice. */
                curr_inode = NULL;

                vfs_inode = btrfs_iget(fs_info->sb, ino, root);
                if (IS_ERR(vfs_inode)) {
                        ret = PTR_ERR(vfs_inode);
                        break;
                }
                curr_inode = BTRFS_I(vfs_inode);
        }
out:
        btrfs_free_path(path);
        if (curr_inode)
                btrfs_add_delayed_iput(curr_inode);

        /* On error, directories may still be queued: free those elements. */
        if (ret) {
                struct btrfs_dir_list *next;

                list_for_each_entry_safe(dir_elem, next, &dir_list, list)
                        kfree(dir_elem);
        }

        return ret;
}

/*
 * Element of the ctx->conflict_inodes list: an inode that had a dir entry
 * conflicting with a name of an inode being logged.
 */
struct btrfs_ino_list {
        /* Inode number of the conflicting inode. */
        u64 ino;
        /* Inode number of the directory that held the conflicting dentry. */
        u64 parent;
        /* Links the element into ctx->conflict_inodes. */
        struct list_head list;
};

static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
{
        struct btrfs_ino_list *curr;
        struct btrfs_ino_list *next;

        list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
                list_del(&curr->list);
                kfree(curr);
        }
}

/*
 * Look up the inode item of @ino in the commit root of @root and tell whether
 * it describes a directory.
 *
 * @path is temporarily switched to commit root search without locking, and is
 * released and switched back before returning.
 *
 * Returns 1 if the inode is a directory, 0 if it is not, -ENOENT if the inode
 * item was not found, or the negative value returned by btrfs_search_slot().
 */
static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
                                    struct btrfs_path *path)
{
        struct btrfs_key key = {
                .objectid = ino,
                .type = BTRFS_INODE_ITEM_KEY,
                .offset = 0,
        };
        int ret;

        path->search_commit_root = 1;
        path->skip_locking = 1;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret == 0) {
                struct extent_buffer *leaf = path->nodes[0];
                struct btrfs_inode_item *item;

                item = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_inode_item);
                if (S_ISDIR(btrfs_inode_mode(leaf, item)))
                        ret = 1;
        } else if (WARN_ON_ONCE(ret > 0)) {
                /*
                 * We have previously found the inode through the commit root
                 * so this should not happen. If it does, just error out and
                 * fallback to a transaction commit.
                 */
                ret = -ENOENT;
        }

        btrfs_release_path(path);
        path->search_commit_root = 0;
        path->skip_locking = 0;

        return ret;
}

/*
 * Record inode @ino, which had a dir entry in directory @parent that conflicts
 * with a name of the inode being logged, on ctx->conflict_inodes.
 *
 * Returns 0 if the inode was added to the list or does not need to be added,
 * BTRFS_LOG_FORCE_COMMIT if ctx->num_conflict_inodes already reached
 * MAX_CONFLICT_INODES, or a negative errno.
 */
static int add_conflicting_inode(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 u64 ino, u64 parent,
                                 struct btrfs_log_ctx *ctx)
{
        struct btrfs_ino_list *ino_elem;
        struct inode *inode;
        int ret;

        /*
         * It's rare to have a lot of conflicting inodes, in practice it is not
         * common to have more than 1 or 2. We don't want to collect too many,
         * as we could end up logging too many inodes (even if only in
         * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
         * commits.
         */
        if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
                return BTRFS_LOG_FORCE_COMMIT;

        inode = btrfs_iget(root->fs_info->sb, ino, root);
        if (IS_ERR(inode)) {
                /*
                 * If the other inode that had a conflicting dir entry was
                 * deleted in the current transaction then we either:
                 *
                 * 1) Log the parent directory (later after adding it to the
                 *    list) if the inode is a directory. This is because it may
                 *    be a deleted subvolume/snapshot or it may be a regular
                 *    directory that had deleted subvolumes/snapshots (or
                 *    subdirectories that had them), and at the moment we can't
                 *    deal with dropping subvolumes/snapshots during log replay.
                 *    So we just log the parent, which will result in a fallback
                 *    to a transaction commit if we are dealing with those cases
                 *    (last_unlink_trans will match the current transaction);
                 *
                 * 2) Do nothing if it's not a directory. During log replay we
                 *    simply unlink the conflicting dentry from the parent
                 *    directory and then add the dentry for our inode. Like this
                 *    we can avoid logging the parent directory (and maybe
                 *    fallback to a transaction commit in case it has a
                 *    last_unlink_trans == trans->transid, due to moving some
                 *    inode from it to some other directory).
                 */
                ret = PTR_ERR(inode);
                if (ret != -ENOENT)
                        return ret;

                ret = conflicting_inode_is_dir(root, ino, path);
                /* Not a directory or we got an error. */
                if (ret <= 0)
                        return ret;

                /* Conflicting inode is a directory, so we'll log its parent. */
        } else {
                bool must_log;

                /*
                 * If the inode was already logged skip it - otherwise we can
                 * hit an infinite loop. Example:
                 *
                 * From the commit root (previous transaction) we have the
                 * following inodes:
                 *
                 * inode 257 a directory
                 * inode 258 with references "zz" and "zz_link" on inode 257
                 * inode 259 with reference "a" on inode 257
                 *
                 * And in the current (uncommitted) transaction we have:
                 *
                 * inode 257 a directory, unchanged
                 * inode 258 with references "a" and "a2" on inode 257
                 * inode 259 with reference "zz_link" on inode 257
                 * inode 261 with reference "zz" on inode 257
                 *
                 * When logging inode 261 the following infinite loop could
                 * happen if we don't skip already logged inodes:
                 *
                 * - we detect inode 258 as a conflicting inode, with inode 261
                 *   on reference "zz", and log it;
                 *
                 * - we detect inode 259 as a conflicting inode, with inode 258
                 *   on reference "a", and log it;
                 *
                 * - we detect inode 258 as a conflicting inode, with inode 259
                 *   on reference "zz_link", and log it - again! After this we
                 *   repeat the above steps forever.
                 *
                 * Here we can use need_log_inode() because we only need to log
                 * the inode in LOG_INODE_EXISTS mode and rename operations
                 * update the log, so that the log ends up with the new name and
                 * without the old name.
                 */
                must_log = need_log_inode(trans, BTRFS_I(inode));
                /* Only the inode number is kept, so drop the reference now. */
                btrfs_add_delayed_iput(BTRFS_I(inode));
                if (!must_log)
                        return 0;
        }

        ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
        if (!ino_elem)
                return -ENOMEM;
        ino_elem->ino = ino;
        ino_elem->parent = parent;
        list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
        ctx->num_conflict_inodes++;

        return 0;
}

/*
 * Log every inode queued on ctx->conflict_inodes.
 *
 * Elements are removed from the head of the list one at a time. If the inode
 * of an element can be loaded, it is logged in LOG_INODE_EXISTS mode (unless
 * need_log_inode() says it can be skipped). If loading it fails with -ENOENT,
 * its parent directory is logged in LOG_INODE_ALL mode instead.
 *
 * Returns 0 on success, or immediately if a call for the same @ctx is already
 * in progress. On error, the elements still queued are freed and the error is
 * returned.
 */
static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_log_ctx *ctx)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret = 0;

        /*
         * Conflicting inodes are logged by the first call to btrfs_log_inode(),
         * otherwise we could have unbounded recursion of btrfs_log_inode()
         * calls. This check guarantees we can have only 1 level of recursion.
         */
        if (ctx->logging_conflict_inodes)
                return 0;

        ctx->logging_conflict_inodes = true;

        /*
         * New conflicting inodes may be found and added to the list while we
         * are logging a conflicting inode, so keep iterating while the list is
         * not empty.
         */
        while (!list_empty(&ctx->conflict_inodes)) {
                struct btrfs_ino_list *curr;
                struct inode *inode;
                u64 ino;
                u64 parent;

                /* Take the first element off the list, keeping only its values. */
                curr = list_first_entry(&ctx->conflict_inodes,
                                        struct btrfs_ino_list, list);
                ino = curr->ino;
                parent = curr->parent;
                list_del(&curr->list);
                kfree(curr);

                inode = btrfs_iget(fs_info->sb, ino, root);
                /*
                 * If the other inode that had a conflicting dir entry was
                 * deleted in the current transaction, we need to log its parent
                 * directory. See the comment at add_conflicting_inode().
                 */
                if (IS_ERR(inode)) {
                        ret = PTR_ERR(inode);
                        if (ret != -ENOENT)
                                break;

                        inode = btrfs_iget(fs_info->sb, parent, root);
                        if (IS_ERR(inode)) {
                                ret = PTR_ERR(inode);
                                break;
                        }

                        /*
                         * Always log the directory, we cannot make this
                         * conditional on need_log_inode() because the directory
                         * might have been logged in LOG_INODE_EXISTS mode or
                         * the dir index of the conflicting inode is not in a
                         * dir index key range logged for the directory. So we
                         * must make sure the deletion is recorded.
                         */
                        ret = btrfs_log_inode(trans, BTRFS_I(inode),
                                              LOG_INODE_ALL, ctx);
                        btrfs_add_delayed_iput(BTRFS_I(inode));
                        if (ret)
                                break;
                        continue;
                }

                /*
                 * Here we can use need_log_inode() because we only need to log
                 * the inode in LOG_INODE_EXISTS mode and rename operations
                 * update the log, so that the log ends up with the new name and
                 * without the old name.
                 *
                 * We did this check at add_conflicting_inode(), but here we do
                 * it again because if some other task logged the inode after
                 * that, we can avoid doing it again.
                 */
                if (!need_log_inode(trans, BTRFS_I(inode))) {
                        btrfs_add_delayed_iput(BTRFS_I(inode));
                        continue;
                }

                /*
                 * We are safe logging the other inode without acquiring its
                 * lock as long as we log with the LOG_INODE_EXISTS mode. We
                 * are safe against concurrent renames of the other inode as
                 * well because during a rename we pin the log and update the
                 * log with the new name before we unpin it.
                 */
                ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
                btrfs_add_delayed_iput(BTRFS_I(inode));
                if (ret)
                        break;
        }

        /* Allow a later call for this context to process the list again. */
        ctx->logging_conflict_inodes = false;
        /* We stopped early, so drop whatever is still queued. */
        if (ret)
                free_conflicting_inodes(ctx);

        return ret;
}

/*
 * Walk the items of @inode in its root, from @min_key up to @max_key, and hand
 * them in batches of adjacent leaf slots to copy_items().
 *
 * @trans:               transaction handle; trans->transid is passed to
 *                       btrfs_search_forward() and compared against the
 *                       inode's generation.
 * @inode:               inode whose items are walked.
 * @min_key:             search cursor. It is MODIFIED: it is updated to the key
 *                       of every slot visited and bumped past the last key of a
 *                       leaf before the next search.
 * @max_key:             upper bound; the walk stops when the objectid differs
 *                       or the key type is greater than max_key->type.
 * @path:                path used for searching @inode's root.
 * @dst_path:            path handed to copy_items() and
 *                       btrfs_log_prealloc_extents().
 * @logged_isize:        forwarded unchanged to copy_items().
 * @inode_only:          logging mode, forwarded to copy_items(); with
 *                       LOG_INODE_ALL on a regular file the function finishes
 *                       by calling btrfs_log_prealloc_extents().
 * @ctx:                 log context (conflicting inode tracking, ctx->inode).
 * @need_log_inode_item: set to false as soon as an inode item key is visited;
 *                       never set to true here.
 *
 * Xattr items are never added to a batch. Inode ref/extref items for which
 * btrfs_check_ref_name_override() returns > 0 are flushed immediately and the
 * other inode is passed to add_conflicting_inode().
 *
 * Returns 0 on success, or the non-zero value returned by the failing helper.
 */
static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
                                   struct btrfs_inode *inode,
                                   struct btrfs_key *min_key,
                                   const struct btrfs_key *max_key,
                                   struct btrfs_path *path,
                                   struct btrfs_path *dst_path,
                                   const u64 logged_isize,
                                   const int inode_only,
                                   struct btrfs_log_ctx *ctx,
                                   bool *need_log_inode_item)
{
        const u64 i_size = i_size_read(&inode->vfs_inode);
        struct btrfs_root *root = inode->root;
        /* Pending batch: ins_nr adjacent slots starting at ins_start_slot. */
        int ins_start_slot = 0;
        int ins_nr = 0;
        int ret;

        while (1) {
                ret = btrfs_search_forward(root, min_key, path, trans->transid);
                if (ret < 0)
                        return ret;
                /* A positive return ends the walk without an error. */
                if (ret > 0) {
                        ret = 0;
                        break;
                }
again:
                /* Note, ins_nr might be > 0 here, cleanup outside the loop */
                if (min_key->objectid != max_key->objectid)
                        break;
                if (min_key->type > max_key->type)
                        break;

                if (min_key->type == BTRFS_INODE_ITEM_KEY) {
                        *need_log_inode_item = false;
                } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
                           min_key->offset >= i_size) {
                        /*
                         * Extents at and beyond eof are logged with
                         * btrfs_log_prealloc_extents().
                         * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
                         * and no keys greater than that, so bail out.
                         */
                        break;
                } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
                            min_key->type == BTRFS_INODE_EXTREF_KEY) &&
                           (inode->generation == trans->transid ||
                            ctx->logging_conflict_inodes)) {
                        u64 other_ino = 0;
                        u64 other_parent = 0;

                        ret = btrfs_check_ref_name_override(path->nodes[0],
                                        path->slots[0], min_key, inode,
                                        &other_ino, &other_parent);
                        if (ret < 0) {
                                return ret;
                        } else if (ret > 0 &&
                                   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
                                /*
                                 * Include the current slot in the batch and
                                 * flush it now, because the path is released
                                 * right below and the slot numbers would no
                                 * longer be usable.
                                 */
                                if (ins_nr > 0) {
                                        ins_nr++;
                                } else {
                                        ins_nr = 1;
                                        ins_start_slot = path->slots[0];
                                }
                                ret = copy_items(trans, inode, dst_path, path,
                                                 ins_start_slot, ins_nr,
                                                 inode_only, logged_isize, ctx);
                                if (ret < 0)
                                        return ret;
                                ins_nr = 0;

                                btrfs_release_path(path);
                                ret = add_conflicting_inode(trans, root, path,
                                                            other_ino,
                                                            other_parent, ctx);
                                if (ret)
                                        return ret;
                                /* Path was released, so search again. */
                                goto next_key;
                        }
                } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
                        /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
                        if (ins_nr == 0)
                                goto next_slot;
                        /* The xattr breaks slot adjacency: flush the batch. */
                        ret = copy_items(trans, inode, dst_path, path,
                                         ins_start_slot,
                                         ins_nr, inode_only, logged_isize, ctx);
                        if (ret < 0)
                                return ret;
                        ins_nr = 0;
                        goto next_slot;
                }

                /* Extend the batch if this slot is adjacent, or start one. */
                if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
                        ins_nr++;
                        goto next_slot;
                } else if (!ins_nr) {
                        ins_start_slot = path->slots[0];
                        ins_nr = 1;
                        goto next_slot;
                }

                /*
                 * Not adjacent to the pending batch: flush it and start a new
                 * batch at the current slot.
                 */
                ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
                                 ins_nr, inode_only, logged_isize, ctx);
                if (ret < 0)
                        return ret;
                ins_nr = 1;
                ins_start_slot = path->slots[0];
next_slot:
                path->slots[0]++;
                if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
                        btrfs_item_key_to_cpu(path->nodes[0], min_key,
                                              path->slots[0]);
                        goto again;
                }
                /* End of the leaf: flush what is pending before releasing it. */
                if (ins_nr) {
                        ret = copy_items(trans, inode, dst_path, path,
                                         ins_start_slot, ins_nr, inode_only,
                                         logged_isize, ctx);
                        if (ret < 0)
                                return ret;
                        ins_nr = 0;
                }
                btrfs_release_path(path);
next_key:
                /* Advance min_key just past the last key that was processed. */
                if (min_key->offset < (u64)-1) {
                        min_key->offset++;
                } else if (min_key->type < max_key->type) {
                        min_key->type++;
                        min_key->offset = 0;
                } else {
                        break;
                }

                /*
                 * We may process many leaves full of items for our inode, so
                 * avoid monopolizing a cpu for too long by rescheduling while
                 * not holding locks on any tree.
                 */
                cond_resched();
        }
        /* Flush the batch left pending by a 'break' out of the loop. */
        if (ins_nr) {
                ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
                                 ins_nr, inode_only, logged_isize, ctx);
                if (ret)
                        return ret;
        }

        if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
                /*
                 * Release the path because otherwise we might attempt to double
                 * lock the same leaf with btrfs_log_prealloc_extents() below.
                 */
                btrfs_release_path(path);
                ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx);
        }

        return ret;
}

/*
 * Insert one batch of delayed items into the @log tree.
 *
 * All the items described by @batch are created with a single call to
 * btrfs_insert_empty_items(). Their data is then written slot by slot,
 * starting at the slot @path points to, taking it from consecutive delayed
 * items that begin at @first_item and are linked through log_list.
 *
 * The path is released before a successful return. Returns 0 on success or
 * the error returned by btrfs_insert_empty_items().
 */
static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *log,
                                      struct btrfs_path *path,
                                      const struct btrfs_item_batch *batch,
                                      const struct btrfs_delayed_item *first_item)
{
        const struct btrfs_delayed_item *item = first_item;
        int filled = 0;
        int err;

        err = btrfs_insert_empty_items(trans, log, path, batch);
        if (err)
                return err;

        while (filled < batch->nr) {
                char *dst = btrfs_item_ptr(path->nodes[0], path->slots[0], char);

                write_extent_buffer(path->nodes[0], &item->data,
                                    (unsigned long)dst, item->data_len);

                /* Move on to the next delayed item and the next leaf slot. */
                item = list_next_entry(item, log_list);
                path->slots[0]++;
                filled++;
        }

        btrfs_release_path(path);

        return 0;
}

static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
                                       struct btrfs_inode *inode,
                                       struct btrfs_path *path,
                                       const struct list_head *delayed_ins_list,
                                       struct btrfs_log_ctx *ctx)
{
        /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
        const int max_batch_size = 195;
        const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
        const u64 ino = btrfs_ino(inode);
        struct btrfs_root *log = inode->root->log_root;
        struct btrfs_item_batch batch = {
                .nr = 0,
                .total_data_size = 0,
        };
        const struct btrfs_delayed_item *first = NULL;
        const struct btrfs_delayed_item *curr;
        char *ins_data;
        struct btrfs_key *ins_keys;
        u32 *ins_sizes;
        u64 curr_batch_size = 0;
        int batch_idx = 0;
        int ret;

        /* We are adding dir index items to the log tree. */
        lockdep_assert_held(&inode->log_mutex);

        /*
         * We collect delayed items before copying index keys from the subvolume
         * to the log tree. However just after we collected them, they may have
         * been flushed (all of them or just some of them), and therefore we
         * could have copied them from the subvolume tree to the log tree.
         * So find the first delayed item that was not yet logged (they are
         * sorted by index number).
         */
        list_for_each_entry(curr, delayed_ins_list, log_list) {
                if (curr->index > inode->last_dir_index_offset) {
                        first = curr;
                        break;
                }
        }

        /* Empty list or all delayed items were already logged. */
        if (!first)
                return 0;

        ins_data = kmalloc(max_batch_size * sizeof(u32) +
                           max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
        if (!ins_data)
                return -ENOMEM;
        ins_sizes = (u32 *)ins_data;
        batch.data_sizes = ins_sizes;
        ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
        batch.keys = ins_keys;

        curr = first;
        while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
                const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);

                if (curr_batch_size + curr_size > leaf_data_size ||
                    batch.nr == max_batch_size) {
                        ret = insert_delayed_items_batch(trans, log, path,
                                                         &batch, first);
                        if (ret)
                                goto out;
                        batch_idx = 0;
                        batch.nr = 0;
                        batch.total_data_size = 0;
                        curr_batch_size = 0;
                        first = curr;
                }

                ins_sizes[batch_idx] = curr->data_len;
                ins_keys[batch_idx].objectid = ino;
                ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
                ins_keys[batch_idx].offset = curr->index;
                curr_batch_size += curr_size;
                batch.total_data_size += curr->data_len;
                batch.nr++;
                batch_idx++;
                curr = list_next_entry(curr, log_list);
        }

        ASSERT(batch.nr >= 1);
        ret = insert_delayed_items_batch(trans, log, path, &batch, first);

        curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
                               log_list);
        inode->last_dir_index_offset = curr->index;
out:
        kfree(ins_data);

        return ret;
}

/*
 * Log the delayed dir index deletions of @inode by inserting dir range items
 * only, one per run of consecutive index numbers found in @delayed_del_list.
 *
 * The caller (see log_delayed_deletion_items()) only calls this with a non
 * empty list. Returns 0 on success or the error from insert_dir_log_key().
 */
static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
                                      struct btrfs_inode *inode,
                                      struct btrfs_path *path,
                                      const struct list_head *delayed_del_list,
                                      struct btrfs_log_ctx *ctx)
{
        const u64 ino = btrfs_ino(inode);
        const struct btrfs_delayed_item *range_tail;

        range_tail = list_first_entry(delayed_del_list,
                                      struct btrfs_delayed_item, log_list);

        while (!list_entry_is_head(range_tail, delayed_del_list, log_list)) {
                const u64 first_dir_index = range_tail->index;
                const struct btrfs_delayed_item *peek;
                u64 last_dir_index;
                int ret;

                /*
                 * Grow the range while the following item has the next index
                 * number, so that a single dir range item covers a whole run
                 * of contiguous dir index items instead of one range item
                 * being logged per dir index item.
                 */
                for (peek = list_next_entry(range_tail, log_list);
                     !list_entry_is_head(peek, delayed_del_list, log_list) &&
                     peek->index == range_tail->index + 1;
                     peek = list_next_entry(peek, log_list))
                        range_tail = peek;

                last_dir_index = range_tail->index;
                ASSERT(last_dir_index >= first_dir_index);

                ret = insert_dir_log_key(trans, inode->root->log_root, path,
                                         ino, first_dir_index, last_dir_index);
                if (ret)
                        return ret;

                /* The next range starts right after the end of this one. */
                range_tail = list_next_entry(range_tail, log_list);
        }

        return 0;
}

/*
 * Delete from the log tree the dir index item @path points to, together with
 * the items in the following slots of the same leaf for as long as they match
 * the delayed deletion items that follow @first in @delayed_del_list.
 *
 * A following slot is included only while it is below the last slot of the
 * leaf and its key is a dir index key of @inode whose offset equals the index
 * of the next delayed item. For every delayed item matched this way,
 * *@last_ret is updated to point to it; it is left untouched if none matches.
 *
 * Returns the result of btrfs_del_items().
 */
static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
                                        struct btrfs_inode *inode,
                                        struct btrfs_path *path,
                                        struct btrfs_log_ctx *ctx,
                                        const struct list_head *delayed_del_list,
                                        const struct btrfs_delayed_item *first,
                                        const struct btrfs_delayed_item **last_ret)
{
        struct extent_buffer *leaf = path->nodes[0];
        const int last_slot = btrfs_header_nritems(leaf) - 1;
        const int start_slot = path->slots[0];
        const u64 ino = btrfs_ino(inode);
        const struct btrfs_delayed_item *item = list_next_entry(first, log_list);
        int end_slot;

        /* end_slot is one past the last slot selected for deletion. */
        for (end_slot = start_slot + 1; end_slot < last_slot; end_slot++) {
                struct btrfs_key key;

                if (list_entry_is_head(item, delayed_del_list, log_list))
                        break;

                btrfs_item_key_to_cpu(leaf, &key, end_slot);
                if (!(key.objectid == ino &&
                      key.type == BTRFS_DIR_INDEX_KEY &&
                      key.offset == item->index))
                        break;

                *last_ret = item;
                item = list_next_entry(item, log_list);
        }

        return btrfs_del_items(trans, inode->root->log_root, path,
                               start_slot, end_slot - start_slot);
}

static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
                                             struct btrfs_inode *inode,
                                             struct btrfs_path *path,
                                             const struct list_head *delayed_del_list,
                                             struct btrfs_log_ctx *ctx)
{
        struct btrfs_root *log = inode->root->log_root;
        const struct btrfs_delayed_item *curr;
        u64 last_range_start = 0;
        u64 last_range_end = 0;
        struct btrfs_key key;

        key.objectid = btrfs_ino(inode);
        key.type = BTRFS_DIR_INDEX_KEY;
        curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
                                log_list);

        while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
                const struct btrfs_delayed_item *last = curr;
                u64 first_dir_index = curr->index;
                u64 last_dir_index;
                bool deleted_items = false;
                int ret;

                key.offset = curr->index;
                ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
                if (ret < 0) {
                        return ret;
                } else if (ret == 0) {
                        ret = batch_delete_dir_index_items(trans, inode, path, ctx,
                                                           delayed_del_list, curr,
                                                           &last);
                        if (ret)
                                return ret;
                        deleted_items = true;
                }

                btrfs_release_path(path);

                /*
                 * If we deleted items from the leaf, it means we have a range
                 * item logging their range, so no need to add one or update an
                 * existing one. Otherwise we have to log a dir range item.
                 */
                if (deleted_items)
                        goto next_batch;

                last_dir_index = last->index;
                ASSERT(last_dir_index >= first_dir_index);
                /*
                 * If this range starts right after where the previous one ends,
                 * then we want to reuse the previous range item and change its
                 * end offset to the end of this range. This is just to minimize
                 * leaf space usage, by avoiding adding a new range item.
                 */
                if (last_range_end != 0 && first_dir_index == last_range_end + 1)
                        first_dir_index = last_range_start;

                ret = insert_dir_log_key(trans, log, path, key.objectid,
                                         first_dir_index, last_dir_index);
                if (ret)
                        return ret;

                last_range_start = first_dir_index;
                last_range_end = last_dir_index;
next_batch:
                curr = list_next_entry(last, log_list);
        }

        return 0;
}

/*
 * Log the delayed dir index deletions of @inode, picking the strategy based on
 * ctx->logged_before: the incremental variant when it is set, the full variant
 * otherwise.
 *
 * Must be called with inode->log_mutex held. Returns 0 when the list is empty,
 * otherwise whatever the selected variant returns.
 */
static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
                                      struct btrfs_inode *inode,
                                      struct btrfs_path *path,
                                      const struct list_head *delayed_del_list,
                                      struct btrfs_log_ctx *ctx)
{
        /*
         * Dir index items get deleted from the log tree, or range items get
         * added to it.
         */
        lockdep_assert_held(&inode->log_mutex);

        if (list_empty(delayed_del_list))
                return 0;

        if (!ctx->logged_before)
                return log_delayed_deletions_full(trans, inode, path,
                                                  delayed_del_list, ctx);

        return log_delayed_deletions_incremental(trans, inode, path,
                                                 delayed_del_list, ctx);
}

/*
 * Log the inodes referenced by the delayed dir index insertions of @inode.
 *
 * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
 * items in @delayed_ins_list instead of the subvolume tree. Entries whose
 * location key is a root item are skipped. Every other entry has its inode
 * looked up and, if need_log_inode() says so, logged: directories in
 * LOG_INODE_ALL mode, anything else in LOG_INODE_EXISTS mode.
 *
 * ctx->logging_new_delayed_dentries is set for the duration of the call and
 * ctx->log_new_dentries is restored to its original value before returning.
 *
 * Must be called without inode->log_mutex held. Returns 0 on success or the
 * first error found.
 */
static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
                                    struct btrfs_inode *inode,
                                    const struct list_head *delayed_ins_list,
                                    struct btrfs_log_ctx *ctx)
{
        const bool orig_log_new_dentries = ctx->log_new_dentries;
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_item *item;
        int ret = 0;

        /*
         * The log mutex is not needed here, and not holding it avoids potential
         * deadlocks or lockdep annotations due to nesting of delayed inode
         * mutexes and log mutexes.
         */
        lockdep_assert_not_held(&inode->log_mutex);

        ASSERT(!ctx->logging_new_delayed_dentries);
        ctx->logging_new_delayed_dentries = true;

        list_for_each_entry(item, delayed_ins_list, log_list) {
                struct btrfs_dir_item *dir_item;
                struct inode *di_inode;
                struct btrfs_key location;

                dir_item = (struct btrfs_dir_item *)item->data;
                btrfs_disk_key_to_cpu(&location, &dir_item->location);

                if (location.type == BTRFS_ROOT_ITEM_KEY)
                        continue;

                di_inode = btrfs_iget(fs_info->sb, location.objectid,
                                      inode->root);
                if (IS_ERR(di_inode)) {
                        ret = PTR_ERR(di_inode);
                        break;
                }

                if (need_log_inode(trans, BTRFS_I(di_inode))) {
                        int log_mode;

                        if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
                                log_mode = LOG_INODE_ALL;
                        else
                                log_mode = LOG_INODE_EXISTS;

                        /*
                         * Reset the flag so that, after the call, it tells if
                         * logging this inode asked for its dentries to be
                         * logged too.
                         */
                        ctx->log_new_dentries = false;
                        ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
                                              log_mode, ctx);

                        if (!ret && ctx->log_new_dentries)
                                ret = log_new_dir_dentries(trans,
                                                           BTRFS_I(di_inode),
                                                           ctx);
                }

                /* Drop the reference from btrfs_iget() on every path. */
                btrfs_add_delayed_iput(BTRFS_I(di_inode));

                if (ret)
                        break;
        }

        ctx->log_new_dentries = orig_log_new_dentries;
        ctx->logging_new_delayed_dentries = false;

        return ret;
}

/* log a single inode in the tree log.
 * At least one parent directory for this inode must exist in the tree
 * or be logged already.
 *
 * Any items from this inode changed by the current transaction are copied
 * to the log tree.  An extra reference is taken on any extents in this
 * file, allowing us to avoid a whole pile of corner cases around logging
 * blocks that have been removed from the tree.
 *
 * See LOG_INODE_ALL and related defines for a description of what inode_only
 * does.
 *
 * This handles both files and directories.
 */
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                           struct btrfs_inode *inode,
                           int inode_only,
                           struct btrfs_log_ctx *ctx)
{
        struct btrfs_path *path;
        struct btrfs_path *dst_path;
        struct btrfs_key min_key;
        struct btrfs_key max_key;
        struct btrfs_root *log = inode->root->log_root;
        int ret;
        bool fast_search = false;
        u64 ino = btrfs_ino(inode);
        struct extent_map_tree *em_tree = &inode->extent_tree;
        u64 logged_isize = 0;
        bool need_log_inode_item = true;
        bool xattrs_logged = false;
        bool inode_item_dropped = true;
        bool full_dir_logging = false;
        LIST_HEAD(delayed_ins_list);
        LIST_HEAD(delayed_del_list);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        dst_path = btrfs_alloc_path();
        if (!dst_path) {
                btrfs_free_path(path);
                return -ENOMEM;
        }

        min_key.objectid = ino;
        min_key.type = BTRFS_INODE_ITEM_KEY;
        min_key.offset = 0;

        max_key.objectid = ino;


        /* today the code can only do partial logging of directories */
        if (S_ISDIR(inode->vfs_inode.i_mode) ||
            (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                       &inode->runtime_flags) &&
             inode_only >= LOG_INODE_EXISTS))
                max_key.type = BTRFS_XATTR_ITEM_KEY;
        else
                max_key.type = (u8)-1;
        max_key.offset = (u64)-1;

        if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
                full_dir_logging = true;

        /*
         * If we are logging a directory while we are logging dentries of the
         * delayed items of some other inode, then we need to flush the delayed
         * items of this directory and not log the delayed items directly. This
         * is to prevent more than one level of recursion into btrfs_log_inode()
         * by having something like this:
         *
         *     $ mkdir -p a/b/c/d/e/f/g/h/...
         *     $ xfs_io -c "fsync" a
         *
         * Where all directories in the path did not exist before and are
         * created in the current transaction.
         * So in such a case we directly log the delayed items of the main
         * directory ("a") without flushing them first, while for each of its
         * subdirectories we flush their delayed items before logging them.
         * This prevents a potential unbounded recursion like this:
         *
         * btrfs_log_inode()
         *   log_new_delayed_dentries()
         *      btrfs_log_inode()
         *        log_new_delayed_dentries()
         *          btrfs_log_inode()
         *            log_new_delayed_dentries()
         *              (...)
         *
         * We have thresholds for the maximum number of delayed items to have in
         * memory, and once they are hit, the items are flushed asynchronously.
         * However the limit is quite high, so lets prevent deep levels of
         * recursion to happen by limiting the maximum depth to be 1.
         */
        if (full_dir_logging && ctx->logging_new_delayed_dentries) {
                ret = btrfs_commit_inode_delayed_items(trans, inode);
                if (ret)
                        goto out;
        }

        mutex_lock(&inode->log_mutex);

        /*
         * For symlinks, we must always log their content, which is stored in an
         * inline extent, otherwise we could end up with an empty symlink after
         * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
         * one attempts to create an empty symlink).
         * We don't need to worry about flushing delalloc, because the inline
         * extent is created when the symlink is created (we never have delalloc
         * for symlinks).
         */
        if (S_ISLNK(inode->vfs_inode.i_mode))
                inode_only = LOG_INODE_ALL;

        /*
         * Before logging the inode item, cache the value returned by
         * inode_logged(), because after that we have the need to figure out if
         * the inode was previously logged in this transaction.
         */
        ret = inode_logged(trans, inode, path);
        if (ret < 0)
                goto out_unlock;
        ctx->logged_before = (ret == 1);
        ret = 0;

        /*
         * This is for cases where logging a directory could result in losing
         * a file after replaying the log. For example, if we move a file from a
         * directory A to a directory B, then fsync directory A, we have no way
         * to know the file was moved from A to B, so logging just A would
         * result in losing the file after a log replay.
         */
        if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
                ret = BTRFS_LOG_FORCE_COMMIT;
                goto out_unlock;
        }

        /*
         * a brute force approach to making sure we get the most uptodate
         * copies of everything.
         */
        if (S_ISDIR(inode->vfs_inode.i_mode)) {
                clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
                if (ctx->logged_before)
                        ret = drop_inode_items(trans, log, path, inode,
                                               BTRFS_XATTR_ITEM_KEY);
        } else {
                if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
                        /*
                         * Make sure the new inode item we write to the log has
                         * the same isize as the current one (if it exists).
                         * This is necessary to prevent data loss after log
                         * replay, and also to prevent doing a wrong expanding
                         * truncate - for e.g. create file, write 4K into offset
                         * 0, fsync, write 4K into offset 4096, add hard link,
                         * fsync some other file (to sync log), power fail - if
                         * we use the inode's current i_size, after log replay
                         * we get a 8Kb file, with the last 4Kb extent as a hole
                         * (zeroes), as if an expanding truncate happened,
                         * instead of getting a file of 4Kb only.
                         */
                        ret = logged_inode_size(log, inode, path, &logged_isize);
                        if (ret)
                                goto out_unlock;
                }
                if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                             &inode->runtime_flags)) {
                        if (inode_only == LOG_INODE_EXISTS) {
                                max_key.type = BTRFS_XATTR_ITEM_KEY;
                                if (ctx->logged_before)
                                        ret = drop_inode_items(trans, log, path,
                                                               inode, max_key.type);
                        } else {
                                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                                          &inode->runtime_flags);
                                clear_bit(BTRFS_INODE_COPY_EVERYTHING,
                                          &inode->runtime_flags);
                                if (ctx->logged_before)
                                        ret = truncate_inode_items(trans, log,
                                                                   inode, 0, 0);
                        }
                } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
                                              &inode->runtime_flags) ||
                           inode_only == LOG_INODE_EXISTS) {
                        if (inode_only == LOG_INODE_ALL)
                                fast_search = true;
                        max_key.type = BTRFS_XATTR_ITEM_KEY;
                        if (ctx->logged_before)
                                ret = drop_inode_items(trans, log, path, inode,
                                                       max_key.type);
                } else {
                        if (inode_only == LOG_INODE_ALL)
                                fast_search = true;
                        inode_item_dropped = false;
                        goto log_extents;
                }

        }
        if (ret)
                goto out_unlock;

        /*
         * If we are logging a directory in full mode, collect the delayed items
         * before iterating the subvolume tree, so that we don't miss any new
         * dir index items in case they get flushed while or right after we are
         * iterating the subvolume tree.
         */
        if (full_dir_logging && !ctx->logging_new_delayed_dentries)
                btrfs_log_get_delayed_items(inode, &delayed_ins_list,
                                            &delayed_del_list);

        ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
                                      path, dst_path, logged_isize,
                                      inode_only, ctx,
                                      &need_log_inode_item);
        if (ret)
                goto out_unlock;

        btrfs_release_path(path);
        btrfs_release_path(dst_path);
        ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
        if (ret)
                goto out_unlock;
        xattrs_logged = true;
        if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
                btrfs_release_path(path);
                btrfs_release_path(dst_path);
                ret = btrfs_log_holes(trans, inode, path);
                if (ret)
                        goto out_unlock;
        }
log_extents:
        btrfs_release_path(path);
        btrfs_release_path(dst_path);
        if (need_log_inode_item) {
                ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
                if (ret)
                        goto out_unlock;
                /*
                 * If we are doing a fast fsync and the inode was logged before
                 * in this transaction, we don't need to log the xattrs because
                 * they were logged before. If xattrs were added, changed or
                 * deleted since the last time we logged the inode, then we have
                 * already logged them because the inode had the runtime flag
                 * BTRFS_INODE_COPY_EVERYTHING set.
                 */
                if (!xattrs_logged && inode->logged_trans < trans->transid) {
                        ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
                        if (ret)
                                goto out_unlock;
                        btrfs_release_path(path);
                }
        }
        if (fast_search) {
                ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
                if (ret)
                        goto out_unlock;
        } else if (inode_only == LOG_INODE_ALL) {
                struct extent_map *em, *n;

                write_lock(&em_tree->lock);
                list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
                        list_del_init(&em->list);
                write_unlock(&em_tree->lock);
        }

        if (full_dir_logging) {
                ret = log_directory_changes(trans, inode, path, dst_path, ctx);
                if (ret)
                        goto out_unlock;
                ret = log_delayed_insertion_items(trans, inode, path,
                                                  &delayed_ins_list, ctx);
                if (ret)
                        goto out_unlock;
                ret = log_delayed_deletion_items(trans, inode, path,
                                                 &delayed_del_list, ctx);
                if (ret)
                        goto out_unlock;
        }

        spin_lock(&inode->lock);
        inode->logged_trans = trans->transid;
        /*
         * Don't update last_log_commit if we logged that an inode exists.
         * We do this for three reasons:
         *
         * 1) We might have had buffered writes to this inode that were
         *    flushed and had their ordered extents completed in this
         *    transaction, but we did not previously log the inode with
         *    LOG_INODE_ALL. Later the inode was evicted and after that
         *    it was loaded again and this LOG_INODE_EXISTS log operation
         *    happened. We must make sure that if an explicit fsync against
         *    the inode is performed later, it logs the new extents, an
         *    updated inode item, etc, and syncs the log. The same logic
         *    applies to direct IO writes instead of buffered writes.
         *
         * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
         *    is logged with an i_size of 0 or whatever value was logged
         *    before. If later the i_size of the inode is increased by a
         *    truncate operation, the log is synced through an fsync of
         *    some other inode and then finally an explicit fsync against
         *    this inode is made, we must make sure this fsync logs the
         *    inode with the new i_size, the hole between old i_size and
         *    the new i_size, and syncs the log.
         *
         * 3) If we are logging that an ancestor inode exists as part of
         *    logging a new name from a link or rename operation, don't update
         *    its last_log_commit - otherwise if an explicit fsync is made
         *    against an ancestor, the fsync considers the inode in the log
         *    and doesn't sync the log, resulting in the ancestor missing after
         *    a power failure unless the log was synced as part of an fsync
         *    against any other unrelated inode.
         */
        if (inode_only != LOG_INODE_EXISTS)
                inode->last_log_commit = inode->last_sub_trans;
        spin_unlock(&inode->lock);

        /*
         * Reset the last_reflink_trans so that the next fsync does not need to
         * go through the slower path when logging extents and their checksums.
         */
        if (inode_only == LOG_INODE_ALL)
                inode->last_reflink_trans = 0;

out_unlock:
        mutex_unlock(&inode->log_mutex);
out:
        btrfs_free_path(path);
        btrfs_free_path(dst_path);

        if (ret)
                free_conflicting_inodes(ctx);
        else
                ret = log_conflicting_inodes(trans, inode->root, ctx);

        if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
                if (!ret)
                        ret = log_new_delayed_dentries(trans, inode,
                                                       &delayed_ins_list, ctx);

                btrfs_log_put_delayed_items(inode, &delayed_ins_list,
                                            &delayed_del_list);
        }

        return ret;
}

/*
 * Log every directory that the commit root records as a parent of @inode.
 *
 * Iterates the BTRFS_INODE_REF_KEY and BTRFS_INODE_EXTREF_KEY items of @inode
 * in the commit root. Each parent directory found is looked up with
 * btrfs_iget() and, if need_log_inode() says so, logged with LOG_INODE_ALL.
 * When that logging sets ctx->log_new_dentries, the directory's new dentries
 * are logged as well through log_new_dir_dentries().
 *
 * @trans:  Transaction handle.
 * @inode:  Inode whose recorded parent directories must be logged.
 * @ctx:    Log context; its log_new_dentries flag is reset before each parent
 *          is logged.
 *
 * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the first
 * non-zero result of the tree search, the parent inode lookup or the logging
 * calls.
 */
static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
                                 struct btrfs_inode *inode,
                                 struct btrfs_log_ctx *ctx)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_root *root = inode->root;
        const u64 ino = btrfs_ino(inode);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        /* Search the commit root, without taking tree locks. */
        path->skip_locking = 1;
        path->search_commit_root = 1;

        /* Start at the first possible inode ref item of this inode. */
        key.objectid = ino;
        key.type = BTRFS_INODE_REF_KEY;
        key.offset = 0;
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        while (true) {
                struct extent_buffer *leaf = path->nodes[0];
                int slot = path->slots[0];
                u32 cur_offset = 0;
                u32 item_size;
                unsigned long ptr;

                /* Ran off the end of this leaf, move on to the next one. */
                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                goto out;
                        else if (ret > 0)
                                break;
                        continue;
                }

                btrfs_item_key_to_cpu(leaf, &key, slot);
                /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
                if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
                        break;

                item_size = btrfs_item_size(leaf, slot);
                ptr = btrfs_item_ptr_offset(leaf, slot);
                /* Visit each reference stored inside the current item. */
                while (cur_offset < item_size) {
                        struct btrfs_key inode_key;
                        struct inode *dir_inode;

                        inode_key.type = BTRFS_INODE_ITEM_KEY;
                        inode_key.offset = 0;

                        if (key.type == BTRFS_INODE_EXTREF_KEY) {
                                struct btrfs_inode_extref *extref;

                                /*
                                 * Extended refs carry the parent inode number
                                 * in each sub-item; advance past the header
                                 * and the name to reach the next sub-item.
                                 */
                                extref = (struct btrfs_inode_extref *)
                                        (ptr + cur_offset);
                                inode_key.objectid = btrfs_inode_extref_parent(
                                        leaf, extref);
                                cur_offset += sizeof(*extref);
                                cur_offset += btrfs_inode_extref_name_len(leaf,
                                        extref);
                        } else {
                                /*
                                 * For a plain inode ref the parent inode
                                 * number is the key offset, so the whole item
                                 * is handled in a single pass.
                                 */
                                inode_key.objectid = key.offset;
                                cur_offset = item_size;
                        }

                        dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
                                               root);
                        /*
                         * If the parent inode was deleted, return an error to
                         * fallback to a transaction commit. This is to prevent
                         * getting an inode that was moved from one parent A to
                         * a parent B, got its former parent A deleted and then
                         * it got fsync'ed, from existing at both parents after
                         * a log replay (and the old parent still existing).
                         * Example:
                         *
                         * mkdir /mnt/A
                         * mkdir /mnt/B
                         * touch /mnt/B/bar
                         * sync
                         * mv /mnt/B/bar /mnt/A/bar
                         * mv -T /mnt/A /mnt/B
                         * fsync /mnt/B/bar
                         * <power fail>
                         *
                         * If we ignore the old parent B which got deleted,
                         * after a log replay we would have file bar linked
                         * at both parents and the old parent B would still
                         * exist.
                         */
                        if (IS_ERR(dir_inode)) {
                                ret = PTR_ERR(dir_inode);
                                goto out;
                        }

                        /*
                         * Nothing to do for this parent. cur_offset was
                         * already advanced above, so continuing moves on to
                         * the next reference of the item (if any).
                         */
                        if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
                                btrfs_add_delayed_iput(BTRFS_I(dir_inode));
                                continue;
                        }

                        ctx->log_new_dentries = false;
                        ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
                                              LOG_INODE_ALL, ctx);
                        if (!ret && ctx->log_new_dentries)
                                ret = log_new_dir_dentries(trans,
                                                   BTRFS_I(dir_inode), ctx);
                        /* Drop the inode reference before checking for errors. */
                        btrfs_add_delayed_iput(BTRFS_I(dir_inode));
                        if (ret)
                                goto out;
                }
                path->slots[0]++;
        }
        /* A positive value left by the search/next leaf calls is not an error. */
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

static int log_new_ancestors(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path,
                             struct btrfs_log_ctx *ctx)
{
        struct btrfs_key found_key;

        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

        while (true) {
                struct btrfs_fs_info *fs_info = root->fs_info;
                struct extent_buffer *leaf;
                int slot;
                struct btrfs_key search_key;
                struct inode *inode;
                u64 ino;
                int ret = 0;

                btrfs_release_path(path);

                ino = found_key.offset;

                search_key.objectid = found_key.offset;
                search_key.type = BTRFS_INODE_ITEM_KEY;
                search_key.offset = 0;
                inode = btrfs_iget(fs_info->sb, ino, root);
                if (IS_ERR(inode))
                        return PTR_ERR(inode);

                if (BTRFS_I(inode)->generation >= trans->transid &&
                    need_log_inode(trans, BTRFS_I(inode)))
                        ret = btrfs_log_inode(trans, BTRFS_I(inode),
                                              LOG_INODE_EXISTS, ctx);
                btrfs_add_delayed_iput(BTRFS_I(inode));
                if (ret)
                        return ret;

                if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
                        break;

                search_key.type = BTRFS_INODE_REF_KEY;
                ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
                if (ret < 0)
                        return ret;

                leaf = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                return ret;
                        else if (ret > 0)
                                return -ENOENT;
                        leaf = path->nodes[0];
                        slot = path->slots[0];
                }

                btrfs_item_key_to_cpu(leaf, &found_key, slot);
                if (found_key.objectid != search_key.objectid ||
                    found_key.type != BTRFS_INODE_REF_KEY)
                        return -ENOENT;
        }
        return 0;
}

/*
 * Log new ancestors by walking the dentry parent chain instead of searching
 * the fs/subvolume tree.
 *
 * Starting at @parent, every directory that belongs to the same super block
 * and root as @inode, has a generation not older than the current transaction
 * and still needs logging is logged with LOG_INODE_EXISTS. The walk ends at a
 * missing or negative dentry, at a dentry from another super block or root,
 * at the root dentry, or at the first logging failure.
 *
 * Returns 0 or the non-zero result of the failing btrfs_log_inode() call.
 */
static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
                                  struct btrfs_inode *inode,
                                  struct dentry *parent,
                                  struct btrfs_log_ctx *ctx)
{
        struct btrfs_root *root = inode->root;
        struct super_block *sb = inode->vfs_inode.i_sb;
        /* Last dentry obtained through dget_parent(), whose ref we own. */
        struct dentry *held = NULL;
        int ret = 0;

        while (parent && !d_really_is_negative(parent) && parent->d_sb == sb) {
                struct btrfs_inode *dir = BTRFS_I(d_inode(parent));

                if (dir->root != root)
                        break;

                if (dir->generation >= trans->transid &&
                    need_log_inode(trans, dir)) {
                        ret = btrfs_log_inode(trans, dir, LOG_INODE_EXISTS,
                                              ctx);
                        if (ret)
                                break;
                }

                if (IS_ROOT(parent))
                        break;

                /* Take the next parent before dropping the previous one. */
                parent = dget_parent(parent);
                dput(held);
                held = parent;
        }
        dput(held);

        return ret;
}

static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
                                 struct btrfs_inode *inode,
                                 struct dentry *parent,
                                 struct btrfs_log_ctx *ctx)
{
        struct btrfs_root *root = inode->root;
        const u64 ino = btrfs_ino(inode);
        struct btrfs_path *path;
        struct btrfs_key search_key;
        int ret;

        /*
         * For a single hard link case, go through a fast path that does not
         * need to iterate the fs/subvolume tree.
         */
        if (inode->vfs_inode.i_nlink < 2)
                return log_new_ancestors_fast(trans, inode, parent, ctx);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        search_key.objectid = ino;
        search_key.type = BTRFS_INODE_REF_KEY;
        search_key.offset = 0;
again:
        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret == 0)
                path->slots[0]++;

        while (true) {
                struct extent_buffer *leaf = path->nodes[0];
                int slot = path->slots[0];
                struct btrfs_key found_key;

                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                goto out;
                        else if (ret > 0)
                                break;
                        continue;
                }

                btrfs_item_key_to_cpu(leaf, &found_key, slot);
                if (found_key.objectid != ino ||
                    found_key.type > BTRFS_INODE_EXTREF_KEY)
                        break;

                /*
                 * Don't deal with extended references because they are rare
                 * cases and too complex to deal with (we would need to keep
                 * track of which subitem we are processing for each item in
                 * this loop, etc). So just return some error to fallback to
                 * a transaction commit.
                 */
                if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
                        ret = -EMLINK;
                        goto out;
                }

                /*
                 * Logging ancestors needs to do more searches on the fs/subvol
                 * tree, so it releases the path as needed to avoid deadlocks.
                 * Keep track of the last inode ref key and resume from that key
                 * after logging all new ancestors for the current hard link.
                 */
                memcpy(&search_key, &found_key, sizeof(search_key));

                ret = log_new_ancestors(trans, root, path, ctx);
                if (ret)
                        goto out;
                btrfs_release_path(path);
                goto again;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Helper around btrfs_log_inode() that also makes sure the parent directories
 * an inode depends on end up in the log.
 *
 * After logging @inode itself with mode @inode_only, this:
 *  - returns early for a regular file whose generation and last_unlink_trans
 *    are both older than the current transaction;
 *  - logs all current and old parent directories through
 *    btrfs_log_all_parents() if the inode had a name removed in the current
 *    transaction (last_unlink_trans >= trans->transid);
 *  - logs new ancestors through log_all_new_ancestors(), which does a minimal
 *    (LOG_INODE_EXISTS) logging of ancestor directories whose generation is
 *    not older than the current transaction;
 *  - logs new dentries of @inode if it is a directory and logging it set
 *    ctx->log_new_dentries.
 *
 * @trans:       Transaction handle.
 * @inode:       Inode to log.
 * @parent:      Dentry of the inode's parent directory, used for the ancestor
 *               walk.
 * @inode_only:  Logging mode passed to btrfs_log_inode().
 * @ctx:         Log context.
 *
 * Returns 0 on success. Returns BTRFS_LOG_FORCE_COMMIT if the NOTREELOG mount
 * option is set, if the root has no references, or if any step failed with a
 * negative error (in which case a full commit is also flagged on @trans).
 * Returns BTRFS_NO_LOG_SYNC if the inode is already in the log with no
 * pending ordered extents in @ctx, or if it has a link count of 0. Other
 * non-zero values returned by the steps above are passed through unchanged.
 */
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                                  struct btrfs_inode *inode,
                                  struct dentry *parent,
                                  int inode_only,
                                  struct btrfs_log_ctx *ctx)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret = 0;
        bool log_dentries = false;

        if (btrfs_test_opt(fs_info, NOTREELOG)) {
                ret = BTRFS_LOG_FORCE_COMMIT;
                goto end_no_trans;
        }

        if (btrfs_root_refs(&root->root_item) == 0) {
                ret = BTRFS_LOG_FORCE_COMMIT;
                goto end_no_trans;
        }

        /*
         * Skip already logged inodes or inodes corresponding to tmpfiles
         * (since logging them is pointless, a link count of 0 means they
         * will never be accessible).
         */
        if ((btrfs_inode_in_log(inode, trans->transid) &&
             list_empty(&ctx->ordered_extents)) ||
            inode->vfs_inode.i_nlink == 0) {
                ret = BTRFS_NO_LOG_SYNC;
                goto end_no_trans;
        }

        ret = start_log_trans(trans, root, ctx);
        if (ret)
                goto end_no_trans;

        ret = btrfs_log_inode(trans, inode, inode_only, ctx);
        if (ret)
                goto end_trans;

        /*
         * for regular files, if its inode is already on disk, we don't
         * have to worry about the parents at all.  This is because
         * we can use the last_unlink_trans field to record renames
         * and other fun in this file.
         */
        if (S_ISREG(inode->vfs_inode.i_mode) &&
            inode->generation < trans->transid &&
            inode->last_unlink_trans < trans->transid) {
                ret = 0;
                goto end_trans;
        }

        /*
         * Sample the flag now: the logging calls below go through
         * btrfs_log_all_parents(), which resets ctx->log_new_dentries.
         */
        if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
                log_dentries = true;

        /*
         * On unlink we must make sure all our current and old parent directory
         * inodes are fully logged. This is to prevent leaving dangling
         * directory index entries in directories that were our parents but are
         * not anymore. Not doing this results in old parent directory being
         * impossible to delete after log replay (rmdir will always fail with
         * error -ENOTEMPTY).
         *
         * Example 1:
         *
         * mkdir testdir
         * touch testdir/foo
         * ln testdir/foo testdir/bar
         * sync
         * unlink testdir/bar
         * xfs_io -c fsync testdir/foo
         * <power failure>
         * mount fs, triggers log replay
         *
         * If we don't log the parent directory (testdir), after log replay the
         * directory still has an entry pointing to the file inode using the bar
         * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
         * the file inode has a link count of 1.
         *
         * Example 2:
         *
         * mkdir testdir
         * touch foo
         * ln foo testdir/foo2
         * ln foo testdir/foo3
         * sync
         * unlink testdir/foo3
         * xfs_io -c fsync foo
         * <power failure>
         * mount fs, triggers log replay
         *
         * Similar as the first example, after log replay the parent directory
         * testdir still has an entry pointing to the inode file with name foo3
         * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
         * and has a link count of 2.
         */
        if (inode->last_unlink_trans >= trans->transid) {
                ret = btrfs_log_all_parents(trans, inode, ctx);
                if (ret)
                        goto end_trans;
        }

        ret = log_all_new_ancestors(trans, inode, parent, ctx);
        if (ret)
                goto end_trans;

        if (log_dentries)
                ret = log_new_dir_dentries(trans, inode, ctx);
        else
                ret = 0;
end_trans:
        /* Any real error is turned into a request for a full commit. */
        if (ret < 0) {
                btrfs_set_log_full_commit(trans);
                ret = BTRFS_LOG_FORCE_COMMIT;
        }

        /* The log will not be synced for this context, so detach it. */
        if (ret)
                btrfs_remove_log_ctx(root, ctx);
        btrfs_end_log_trans(root);
end_no_trans:
        return ret;
}

/*
 * Log the inode behind @dentry in full (LOG_INODE_ALL), together with
 * whatever parent directories btrfs_log_inode_parent() decides are needed.
 *
 * It is not safe to log a dentry if the chunk root has added new chunks.
 *
 * Returns the result of btrfs_log_inode_parent() unchanged: 0 if the dentry
 * was logged, and a non-zero value otherwise (for instance
 * BTRFS_NO_LOG_SYNC or BTRFS_LOG_FORCE_COMMIT). When a commit is requested,
 * the transaction must be committed to safely get the data on disk.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct dentry *dentry,
                          struct btrfs_log_ctx *ctx)
{
        /* Hold a reference on the parent for the duration of the logging. */
        struct dentry *parent = dget_parent(dentry);
        struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
        const int ret = btrfs_log_inode_parent(trans, inode, parent,
                                               LOG_INODE_ALL, ctx);

        dput(parent);
        return ret;
}

/*
 * Replay all the log trees referenced by @log_root_tree. Should be called
 * during mount.
 *
 * The work is done inside a single transaction, with the
 * BTRFS_FS_LOG_RECOVERING flag set, in several passes over the root items of
 * the log root tree whose objectid is BTRFS_TREE_LOG_OBJECTID:
 *
 *  1) a pinning pass (process_one_buffer, LOG_WALK_PIN_ONLY), which also walks
 *     the log root tree itself;
 *  2) a pass that replays just inodes (replay_one_buffer,
 *     LOG_WALK_REPLAY_INODES);
 *  3) further passes, incrementing the stage each time, until
 *     LOG_WALK_REPLAY_ALL has been run, after which inode link counts are
 *     fixed up and the free objectid of each replayed root is recomputed;
 *  4) a transaction commit.
 *
 * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the error
 * of the step that failed. On failure inside the passes the transaction is
 * aborted (except when starting it failed) and then ended.
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_root *log;
        struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
        struct walk_control wc = {
                .process_func = process_one_buffer,
                .stage = LOG_WALK_PIN_ONLY,
        };

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);

        trans = btrfs_start_transaction(fs_info->tree_root, 0);
        if (IS_ERR(trans)) {
                /* wc.trans is still unset here, so the error path skips it. */
                ret = PTR_ERR(trans);
                goto error;
        }

        wc.trans = trans;
        wc.pin = 1;

        /* Walk the log root tree itself before the per root log trees. */
        ret = walk_log_tree(trans, log_root_tree, &wc);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto error;
        }

again:
        /*
         * Visit the log root items from the highest key offset down to the
         * lowest one. The key offset is the id of the root the log belongs to.
         */
        key.objectid = BTRFS_TREE_LOG_OBJECTID;
        key.offset = (u64)-1;
        key.type = BTRFS_ROOT_ITEM_KEY;

        while (1) {
                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        goto error;
                }
                /* No exact match: step back to the previous item, if any. */
                if (ret > 0) {
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                btrfs_release_path(path);
                if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
                        break;

                log = btrfs_read_tree_root(log_root_tree, &found_key);
                if (IS_ERR(log)) {
                        ret = PTR_ERR(log);
                        btrfs_abort_transaction(trans, ret);
                        goto error;
                }

                wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
                                                   true);
                if (IS_ERR(wc.replay_dest)) {
                        ret = PTR_ERR(wc.replay_dest);

                        /*
                         * We didn't find the subvol, likely because it was
                         * deleted.  This is ok, simply skip this log and go to
                         * the next one.
                         *
                         * We need to exclude the root because we can't have
                         * other log replays overwriting this log as we'll read
                         * it back in a few more times.  This will keep our
                         * block from being modified, and we'll just bail for
                         * each subsequent pass.
                         */
                        if (ret == -ENOENT)
                                ret = btrfs_pin_extent_for_log_replay(trans, log->node);
                        btrfs_put_root(log);

                        if (!ret)
                                goto next;
                        btrfs_abort_transaction(trans, ret);
                        goto error;
                }

                wc.replay_dest->log_root = log;
                ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
                if (ret)
                        /* The loop needs to continue due to the root refs */
                        btrfs_abort_transaction(trans, ret);
                else
                        ret = walk_log_tree(trans, log, &wc);

                if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
                        ret = fixup_inode_link_counts(trans, wc.replay_dest,
                                                      path);
                        if (ret)
                                btrfs_abort_transaction(trans, ret);
                }

                if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
                        struct btrfs_root *root = wc.replay_dest;

                        btrfs_release_path(path);

                        /*
                         * We have just replayed everything, and the highest
                         * objectid of fs roots probably has changed in case
                         * some inode_item's got replayed.
                         *
                         * root->objectid_mutex is not acquired as log replay
                         * could only happen during mount.
                         */
                        ret = btrfs_init_root_free_objectid(root);
                        if (ret)
                                btrfs_abort_transaction(trans, ret);
                }

                /*
                 * Detach the log from the destination root and drop both root
                 * references before acting on any error recorded above.
                 */
                wc.replay_dest->log_root = NULL;
                btrfs_put_root(wc.replay_dest);
                btrfs_put_root(log);

                if (ret)
                        goto error;
next:
                /* Offset 0 is the lowest possible key, nothing left to visit. */
                if (found_key.offset == 0)
                        break;
                key.offset = found_key.offset - 1;
        }
        btrfs_release_path(path);

        /* step one is to pin it all, step two is to replay just inodes */
        if (wc.pin) {
                wc.pin = 0;
                wc.process_func = replay_one_buffer;
                wc.stage = LOG_WALK_REPLAY_INODES;
                goto again;
        }
        /* step three is to replay everything */
        if (wc.stage < LOG_WALK_REPLAY_ALL) {
                wc.stage++;
                goto again;
        }

        btrfs_free_path(path);

        /*
         * step 4: commit the transaction, which also unpins the blocks
         *
         * NOTE(review): if the commit fails we return with
         * BTRFS_FS_LOG_RECOVERING still set and without dropping the
         * log_root_tree reference - presumably the caller's mount failure path
         * deals with that, confirm.
         */
        ret = btrfs_commit_transaction(trans);
        if (ret)
                return ret;

        log_root_tree->log_root = NULL;
        clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
        btrfs_put_root(log_root_tree);

        return 0;
error:
        if (wc.trans)
                btrfs_end_transaction(wc.trans);
        clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
        btrfs_free_path(path);
        return ret;
}

/*
 * There are some corner cases where we want to force a full commit instead of
 * allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and this
 * function updates the parent directory so that a full commit is properly
 * done if it is fsync'd later, after the unlinks are done.
 *
 * Must be called before the unlink operations (updates to the subvolume tree,
 * inodes, etc) are done.
 *
 * @trans:       Transaction handle.
 * @dir:         Directory the name is being removed from.
 * @inode:       Inode losing the name.
 * @for_rename:  True if the unlink is part of a rename; only then may @dir be
 *               updated.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
                             struct btrfs_inode *dir, struct btrfs_inode *inode,
                             bool for_rename)
{
        /*
         * When logging a file that has not been renamed or unlinked and whose
         * inode is fully committed on disk, there is no need to walk up the
         * directory chain to log its parents.
         *
         * Record the current transaction id in the file's last_unlink_trans
         * field so that the logging code can tell, by checking it, that this
         * is no longer the case.
         */
        mutex_lock(&inode->log_mutex);
        inode->last_unlink_trans = trans->transid;
        mutex_unlock(&inode->log_mutex);

        /*
         * The directory only needs to be marked for a rename, and only if
         * neither the directory nor the inode was logged before:
         *
         * - If the directory was already logged, any new names will be logged
         *   with btrfs_log_new_name() and old names will be deleted from the
         *   log tree with btrfs_del_dir_entries_in_log() or with
         *   btrfs_del_inode_ref_in_log().
         *
         * - If the inode was logged before, the log will be properly updated
         *   with the new name by btrfs_log_new_name() and the old name removed
         *   by the same two functions.
         *
         * Otherwise, when renaming files across directories, if the directory
         * we're unlinking from gets fsync'd later on, there's no way to find
         * the destination directory and fsync it properly. So be conservative
         * and force commits, so that the new name gets discovered.
         */
        if (for_rename &&
            inode_logged(trans, dir, NULL) != 1 &&
            inode_logged(trans, inode, NULL) != 1) {
                mutex_lock(&dir->log_mutex);
                dir->last_unlink_trans = trans->transid;
                mutex_unlock(&dir->log_mutex);
        }
}

/*
 * Record in @dir that a snapshot below it is being destroyed in the current
 * transaction, by storing the transaction id in its last_unlink_trans field.
 *
 * The goal is that an attempt to fsync the parent directory of a deleted
 * snapshot ends up triggering a transaction commit. That guarantees that,
 * after replaying the log tree of the parent directory's root, the snapshot
 * is not seen anymore, and that at log replay time there is no log tree for
 * the deleted snapshot's root, which could otherwise be replayed after the
 * log tree of the parent directory (which replays the snapshot deletion).
 *
 * Must be called before the actual snapshot destroy operation (updates to the
 * parent root and tree of tree roots trees, etc) are done.
 */
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
                                   struct btrfs_inode *dir)
{
        struct mutex *dir_log_lock = &dir->log_mutex;

        mutex_lock(dir_log_lock);
        dir->last_unlink_trans = trans->transid;
        mutex_unlock(dir_log_lock);
}

/*
 * Update the log after adding a new name for an inode.
 *
 * @trans:              Transaction handle.
 * @old_dentry:         The dentry associated with the old name and the old
 *                      parent directory.
 * @old_dir:            The inode of the previous parent directory for the case
 *                      of a rename. For a link operation, it must be NULL.
 * @old_dir_index:      The index number associated with the old name, meaningful
 *                      only for rename operations (when @old_dir is not NULL).
 *                      Ignored for link operations.
 * @parent:             The dentry associated with the directory under which the
 *                      new name is located.
 *
 * Call this after adding a new name for an inode, as a result of a link or
 * rename operation, and it will properly update the log to reflect the new name.
 *
 * There is no return value. If any step fails with a negative error, the log
 * is marked for a full commit through btrfs_set_log_full_commit() before the
 * function returns.
 */
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
                        struct dentry *old_dentry, struct btrfs_inode *old_dir,
                        u64 old_dir_index, struct dentry *parent)
{
        struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
        struct btrfs_root *root = inode->root;
        struct btrfs_log_ctx ctx;
        /* Set once join_running_log_trans() succeeded; undone at 'out'. */
        bool log_pinned = false;
        int ret;

        /*
         * this will force the logging code to walk the dentry chain
         * up for the file
         */
        if (!S_ISDIR(inode->vfs_inode.i_mode))
                inode->last_unlink_trans = trans->transid;

        /*
         * if this inode hasn't been logged and directory we're renaming it
         * from hasn't been logged, we don't need to log it
         */
        ret = inode_logged(trans, inode, NULL);
        if (ret < 0) {
                /* Could not tell if the inode was logged: handled at 'out'. */
                goto out;
        } else if (ret == 0) {
                /* Not a rename (no old directory) and inode not logged. */
                if (!old_dir)
                        return;
                /*
                 * If the inode was not logged and we are doing a rename (old_dir is not
                 * NULL), check if old_dir was logged - if it was not we can return and
                 * do nothing.
                 */
                ret = inode_logged(trans, old_dir, NULL);
                if (ret < 0)
                        goto out;
                else if (ret == 0)
                        return;
        }
        /*
         * Drop the positive result of the inode_logged() checks so that from
         * here on ret only carries the outcome of the steps below.
         */
        ret = 0;

        /*
         * If we are doing a rename (old_dir is not NULL) from a directory that
         * was previously logged, make sure that on log replay we get the old
         * dir entry deleted. This is needed because we will also log the new
         * name of the renamed inode, so we need to make sure that after log
         * replay we don't end up with both the new and old dir entries existing.
         */
        if (old_dir && old_dir->logged_trans == trans->transid) {
                struct btrfs_root *log = old_dir->root->log_root;
                struct btrfs_path *path;
                struct fscrypt_name fname;

                ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);

                /*
                 * Once this succeeds, every exit from this scope releases
                 * fname with fscrypt_free_filename().
                 */
                ret = fscrypt_setup_filename(&old_dir->vfs_inode,
                                             &old_dentry->d_name, 0, &fname);
                if (ret)
                        goto out;
                /*
                 * We have two inodes to update in the log, the old directory and
                 * the inode that got renamed, so we must pin the log to prevent
                 * anyone from syncing the log until we have updated both inodes
                 * in the log.
                 */
                ret = join_running_log_trans(root);
                /*
                 * At least one of the inodes was logged before, so this should
                 * not fail, but if it does, it's not serious, just bail out and
                 * mark the log for a full commit.
                 */
                if (WARN_ON_ONCE(ret < 0)) {
                        fscrypt_free_filename(&fname);
                        goto out;
                }

                log_pinned = true;

                path = btrfs_alloc_path();
                if (!path) {
                        ret = -ENOMEM;
                        fscrypt_free_filename(&fname);
                        goto out;
                }

                /*
                 * Other concurrent task might be logging the old directory,
                 * as it can be triggered when logging other inode that had or
                 * still has a dentry in the old directory. We lock the old
                 * directory's log_mutex to ensure the deletion of the old
                 * name is persisted, because during directory logging we
                 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
                 * the old name's dir index item is in the delayed items, so
                 * it could be missed by an in progress directory logging.
                 */
                mutex_lock(&old_dir->log_mutex);
                ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
                                        &fname.disk_name, old_dir_index);
                if (ret > 0) {
                        /*
                         * The dentry does not exist in the log, so record its
                         * deletion.
                         */
                        btrfs_release_path(path);
                        ret = insert_dir_log_key(trans, log, path,
                                                 btrfs_ino(old_dir),
                                                 old_dir_index, old_dir_index);
                }
                mutex_unlock(&old_dir->log_mutex);

                /* Release the per-rename resources before checking ret. */
                btrfs_free_path(path);
                fscrypt_free_filename(&fname);
                if (ret < 0)
                        goto out;
        }

        btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
        ctx.logging_new_name = true;
        btrfs_init_log_ctx_scratch_eb(&ctx);
        /*
         * We don't care about the return value. If we fail to log the new name
         * then we know the next attempt to sync the log will fallback to a full
         * transaction commit (due to a call to btrfs_set_log_full_commit()), so
         * we don't need to worry about getting a log committed that has an
         * inconsistent state after a rename operation.
         */
        btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
        free_extent_buffer(ctx.scratch_eb);
        ASSERT(list_empty(&ctx.conflict_inodes));
out:
        /*
         * If an error happened mark the log for a full commit because it's not
         * consistent and up to date or we couldn't find out if one of the
         * inodes was logged before in this transaction. Do it before unpinning
         * the log, to avoid any races with someone else trying to commit it.
         */
        if (ret < 0)
                btrfs_set_log_full_commit(trans);
        /* Undo the pin taken by join_running_log_trans(), if it was taken. */
        if (log_pinned)
                btrfs_end_log_trans(root);
}


























































    1 
    2 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
// SPDX-License-Identifier: GPL-2.0-only
/*
 *      crc-itu-t.c
 */

#include <linux/types.h>
#include <linux/module.h>
#include <linux/crc-itu-t.h>

/*
 * CRC table for the CRC ITU-T V.41 0x1021 (x^16 + x^12 + x^5 + 1)
 *
 * 256 precomputed 16-bit entries, one per possible byte value. The table is
 * exported so that code outside this file can reference it (presumably the
 * crc_itu_t_byte() helper from <linux/crc-itu-t.h> - confirm against the
 * header).
 */
const u16 crc_itu_t_table[256] = {
        0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
        0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef,
        0x1231, 0x0210, 0x3273, 0x2252, 0x52b5, 0x4294, 0x72f7, 0x62d6,
        0x9339, 0x8318, 0xb37b, 0xa35a, 0xd3bd, 0xc39c, 0xf3ff, 0xe3de,
        0x2462, 0x3443, 0x0420, 0x1401, 0x64e6, 0x74c7, 0x44a4, 0x5485,
        0xa56a, 0xb54b, 0x8528, 0x9509, 0xe5ee, 0xf5cf, 0xc5ac, 0xd58d,
        0x3653, 0x2672, 0x1611, 0x0630, 0x76d7, 0x66f6, 0x5695, 0x46b4,
        0xb75b, 0xa77a, 0x9719, 0x8738, 0xf7df, 0xe7fe, 0xd79d, 0xc7bc,
        0x48c4, 0x58e5, 0x6886, 0x78a7, 0x0840, 0x1861, 0x2802, 0x3823,
        0xc9cc, 0xd9ed, 0xe98e, 0xf9af, 0x8948, 0x9969, 0xa90a, 0xb92b,
        0x5af5, 0x4ad4, 0x7ab7, 0x6a96, 0x1a71, 0x0a50, 0x3a33, 0x2a12,
        0xdbfd, 0xcbdc, 0xfbbf, 0xeb9e, 0x9b79, 0x8b58, 0xbb3b, 0xab1a,
        0x6ca6, 0x7c87, 0x4ce4, 0x5cc5, 0x2c22, 0x3c03, 0x0c60, 0x1c41,
        0xedae, 0xfd8f, 0xcdec, 0xddcd, 0xad2a, 0xbd0b, 0x8d68, 0x9d49,
        0x7e97, 0x6eb6, 0x5ed5, 0x4ef4, 0x3e13, 0x2e32, 0x1e51, 0x0e70,
        0xff9f, 0xefbe, 0xdfdd, 0xcffc, 0xbf1b, 0xaf3a, 0x9f59, 0x8f78,
        0x9188, 0x81a9, 0xb1ca, 0xa1eb, 0xd10c, 0xc12d, 0xf14e, 0xe16f,
        0x1080, 0x00a1, 0x30c2, 0x20e3, 0x5004, 0x4025, 0x7046, 0x6067,
        0x83b9, 0x9398, 0xa3fb, 0xb3da, 0xc33d, 0xd31c, 0xe37f, 0xf35e,
        0x02b1, 0x1290, 0x22f3, 0x32d2, 0x4235, 0x5214, 0x6277, 0x7256,
        0xb5ea, 0xa5cb, 0x95a8, 0x8589, 0xf56e, 0xe54f, 0xd52c, 0xc50d,
        0x34e2, 0x24c3, 0x14a0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
        0xa7db, 0xb7fa, 0x8799, 0x97b8, 0xe75f, 0xf77e, 0xc71d, 0xd73c,
        0x26d3, 0x36f2, 0x0691, 0x16b0, 0x6657, 0x7676, 0x4615, 0x5634,
        0xd94c, 0xc96d, 0xf90e, 0xe92f, 0x99c8, 0x89e9, 0xb98a, 0xa9ab,
        0x5844, 0x4865, 0x7806, 0x6827, 0x18c0, 0x08e1, 0x3882, 0x28a3,
        0xcb7d, 0xdb5c, 0xeb3f, 0xfb1e, 0x8bf9, 0x9bd8, 0xabbb, 0xbb9a,
        0x4a75, 0x5a54, 0x6a37, 0x7a16, 0x0af1, 0x1ad0, 0x2ab3, 0x3a92,
        0xfd2e, 0xed0f, 0xdd6c, 0xcd4d, 0xbdaa, 0xad8b, 0x9de8, 0x8dc9,
        0x7c26, 0x6c07, 0x5c64, 0x4c45, 0x3ca2, 0x2c83, 0x1ce0, 0x0cc1,
        0xef1f, 0xff3e, 0xcf5d, 0xdf7c, 0xaf9b, 0xbfba, 0x8fd9, 0x9ff8,
        0x6e17, 0x7e36, 0x4e55, 0x5e74, 0x2e93, 0x3eb2, 0x0ed1, 0x1ef0
};

EXPORT_SYMBOL(crc_itu_t_table);

/**
 * crc_itu_t - Compute the CRC-ITU-T for the data buffer
 *
 * @crc:     previous CRC value
 * @buffer:  data pointer
 * @len:     number of bytes in the buffer
 *
 * Feeds the @len bytes of @buffer, first to last, through crc_itu_t_byte(),
 * starting from @crc. When @len is zero @buffer is not read and @crc is
 * handed back as is.
 *
 * Returns the updated CRC value
 */
u16 crc_itu_t(u16 crc, const u8 *buffer, size_t len)
{
        size_t pos;

        for (pos = 0; pos < len; pos++)
                crc = crc_itu_t_byte(crc, buffer[pos]);

        return crc;
}
EXPORT_SYMBOL(crc_itu_t);

/* Module metadata: human-readable description and license tag. */
MODULE_DESCRIPTION("CRC ITU-T V.41 calculations");
MODULE_LICENSE("GPL");



























































































































































































































































































































































































    1 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
/*
 * linux/fs/nls/nls_cp864.c
 *
 * Charset cp864 translation tables.
 * Generated automatically from the Unicode and charset
 * tables from the Unicode Organization (www.unicode.org).
 * The Unicode to charset table has only exact mappings.
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/nls.h>
#include <linux/errno.h>

/*
 * cp864 byte -> Unicode code point, indexed by the byte value.
 *
 * An entry of 0x0000 is treated by char2uni() as "no mapping" and rejected
 * with -EINVAL; note that this includes byte 0x00 itself.
 */
static const wchar_t charset2uni[256] = {
        /* 0x00*/
        0x0000, 0x0001, 0x0002, 0x0003,
        0x0004, 0x0005, 0x0006, 0x0007,
        0x0008, 0x0009, 0x000a, 0x000b,
        0x000c, 0x000d, 0x000e, 0x000f,
        /* 0x10*/
        0x0010, 0x0011, 0x0012, 0x0013,
        0x0014, 0x0015, 0x0016, 0x0017,
        0x0018, 0x0019, 0x001a, 0x001b,
        0x001c, 0x001d, 0x001e, 0x001f,
        /* 0x20*/
        0x0020, 0x0021, 0x0022, 0x0023,
        0x0024, 0x066a, 0x0026, 0x0027,
        0x0028, 0x0029, 0x002a, 0x002b,
        0x002c, 0x002d, 0x002e, 0x002f,
        /* 0x30*/
        0x0030, 0x0031, 0x0032, 0x0033,
        0x0034, 0x0035, 0x0036, 0x0037,
        0x0038, 0x0039, 0x003a, 0x003b,
        0x003c, 0x003d, 0x003e, 0x003f,
        /* 0x40*/
        0x0040, 0x0041, 0x0042, 0x0043,
        0x0044, 0x0045, 0x0046, 0x0047,
        0x0048, 0x0049, 0x004a, 0x004b,
        0x004c, 0x004d, 0x004e, 0x004f,
        /* 0x50*/
        0x0050, 0x0051, 0x0052, 0x0053,
        0x0054, 0x0055, 0x0056, 0x0057,
        0x0058, 0x0059, 0x005a, 0x005b,
        0x005c, 0x005d, 0x005e, 0x005f,
        /* 0x60*/
        0x0060, 0x0061, 0x0062, 0x0063,
        0x0064, 0x0065, 0x0066, 0x0067,
        0x0068, 0x0069, 0x006a, 0x006b,
        0x006c, 0x006d, 0x006e, 0x006f,
        /* 0x70*/
        0x0070, 0x0071, 0x0072, 0x0073,
        0x0074, 0x0075, 0x0076, 0x0077,
        0x0078, 0x0079, 0x007a, 0x007b,
        0x007c, 0x007d, 0x007e, 0x007f,
        /* 0x80*/
        0x00b0, 0x00b7, 0x2219, 0x221a,
        0x2592, 0x2500, 0x2502, 0x253c,
        0x2524, 0x252c, 0x251c, 0x2534,
        0x2510, 0x250c, 0x2514, 0x2518,
        /* 0x90*/
        0x03b2, 0x221e, 0x03c6, 0x00b1,
        0x00bd, 0x00bc, 0x2248, 0x00ab,
        0x00bb, 0xfef7, 0xfef8, 0x0000,
        0x0000, 0xfefb, 0xfefc, 0x0000,
        /* 0xa0*/
        0x00a0, 0x00ad, 0xfe82, 0x00a3,
        0x00a4, 0xfe84, 0x0000, 0x0000,
        0xfe8e, 0xfe8f, 0xfe95, 0xfe99,
        0x060c, 0xfe9d, 0xfea1, 0xfea5,
        /* 0xb0*/
        0x0660, 0x0661, 0x0662, 0x0663,
        0x0664, 0x0665, 0x0666, 0x0667,
        0x0668, 0x0669, 0xfed1, 0x061b,
        0xfeb1, 0xfeb5, 0xfeb9, 0x061f,
        /* 0xc0*/
        0x00a2, 0xfe80, 0xfe81, 0xfe83,
        0xfe85, 0xfeca, 0xfe8b, 0xfe8d,
        0xfe91, 0xfe93, 0xfe97, 0xfe9b,
        0xfe9f, 0xfea3, 0xfea7, 0xfea9,
        /* 0xd0*/
        0xfeab, 0xfead, 0xfeaf, 0xfeb3,
        0xfeb7, 0xfebb, 0xfebf, 0xfec1,
        0xfec5, 0xfecb, 0xfecf, 0x00a6,
        0x00ac, 0x00f7, 0x00d7, 0xfec9,
        /* 0xe0*/
        0x0640, 0xfed3, 0xfed7, 0xfedb,
        0xfedf, 0xfee3, 0xfee7, 0xfeeb,
        0xfeed, 0xfeef, 0xfef3, 0xfebd,
        0xfecc, 0xfece, 0xfecd, 0xfee1,
        /* 0xf0*/
        0xfe7d, 0x0651, 0xfee5, 0xfee9,
        0xfeec, 0xfef0, 0xfef2, 0xfed0,
        0xfed5, 0xfef5, 0xfef6, 0xfedd,
        0xfed9, 0xfef1, 0x25a0, 0x0000,
};

/*
 * Unicode page 0x00 (U+0000..U+00FF) -> cp864 byte, indexed by the low byte
 * of the code point. 0x00 means "no mapping" (uni2char() returns -EINVAL).
 * The rows for 0xf8-0xff are not listed, so those entries are implicitly zero.
 */
static const unsigned char page00[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x00, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
        0xa0, 0x00, 0xc0, 0xa3, 0xa4, 0x00, 0xdb, 0x00, /* 0xa0-0xa7 */
        0x00, 0x00, 0x00, 0x97, 0xdc, 0xa1, 0x00, 0x00, /* 0xa8-0xaf */
        0x80, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x81, /* 0xb0-0xb7 */
        0x00, 0x00, 0x00, 0x98, 0x95, 0x94, 0x00, 0x00, /* 0xb8-0xbf */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, /* 0xd0-0xd7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdd, /* 0xf0-0xf7 */
};

/*
 * Unicode page 0x03 (U+0300..U+03FF) -> cp864 byte, indexed by the low byte
 * of the code point. 0x00 means "no mapping". Only U+03B2 and U+03C6 have
 * entries; rows past 0xc7 are not listed and are implicitly zero.
 */
static const unsigned char page03[256] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */

        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
        0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x92, 0x00, /* 0xc0-0xc7 */
};

/*
 * Unicode page 0x06 (U+0600..U+06FF) -> cp864 byte, indexed by the low byte
 * of the code point. 0x00 means "no mapping". Rows past 0x6f are not listed
 * and are implicitly zero.
 */
static const unsigned char page06[256] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
        0x00, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, /* 0x08-0x0f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
        0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0x18-0x1f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
        0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
        0x00, 0xf1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x60-0x67 */
        0xb8, 0xb9, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
};

/*
 * Unicode page 0x22 (U+2200..U+22FF) -> cp864 byte, indexed by the low byte
 * of the code point. 0x00 means "no mapping". Rows past 0x4f are not listed
 * and are implicitly zero.
 */
static const unsigned char page22[256] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
        0x00, 0x82, 0x83, 0x00, 0x00, 0x00, 0x91, 0x00, /* 0x18-0x1f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
        0x96, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
};

/*
 * Unicode page 0x25 (U+2500..U+25FF) -> cp864 byte, indexed by the low byte
 * of the code point. 0x00 means "no mapping". Rows past 0xa7 are not listed
 * and are implicitly zero.
 */
static const unsigned char page25[256] = {
        0x85, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, /* 0x08-0x0f */
        0x8c, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, /* 0x10-0x17 */
        0x8f, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, /* 0x18-0x1f */
        0x00, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, /* 0x20-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, /* 0x28-0x2f */
        0x00, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, /* 0x30-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, /* 0x38-0x3f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */

        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
        0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
        0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
};

/*
 * Unicode page 0xfe (U+FE00..U+FEFF) -> cp864 byte, indexed by the low byte
 * of the code point. 0x00 means "no mapping". All 256 entries are listed.
 */
static const unsigned char pagefe[256] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, /* 0x78-0x7f */

        0xc1, 0xc2, 0xa2, 0xc3, 0xa5, 0xc4, 0x00, 0x00, /* 0x80-0x87 */
        0x00, 0x00, 0x00, 0xc6, 0x00, 0xc7, 0xa8, 0xa9, /* 0x88-0x8f */
        0x00, 0xc8, 0x00, 0xc9, 0x00, 0xaa, 0x00, 0xca, /* 0x90-0x97 */
        0x00, 0xab, 0x00, 0xcb, 0x00, 0xad, 0x00, 0xcc, /* 0x98-0x9f */
        0x00, 0xae, 0x00, 0xcd, 0x00, 0xaf, 0x00, 0xce, /* 0xa0-0xa7 */
        0x00, 0xcf, 0x00, 0xd0, 0x00, 0xd1, 0x00, 0xd2, /* 0xa8-0xaf */
        0x00, 0xbc, 0x00, 0xd3, 0x00, 0xbd, 0x00, 0xd4, /* 0xb0-0xb7 */
        0x00, 0xbe, 0x00, 0xd5, 0x00, 0xeb, 0x00, 0xd6, /* 0xb8-0xbf */
        0x00, 0xd7, 0x00, 0x00, 0x00, 0xd8, 0x00, 0x00, /* 0xc0-0xc7 */
        0x00, 0xdf, 0xc5, 0xd9, 0xec, 0xee, 0xed, 0xda, /* 0xc8-0xcf */
        0xf7, 0xba, 0x00, 0xe1, 0x00, 0xf8, 0x00, 0xe2, /* 0xd0-0xd7 */
        0x00, 0xfc, 0x00, 0xe3, 0x00, 0xfb, 0x00, 0xe4, /* 0xd8-0xdf */
        0x00, 0xef, 0x00, 0xe5, 0x00, 0xf2, 0x00, 0xe6, /* 0xe0-0xe7 */
        0x00, 0xf3, 0x00, 0xe7, 0xf4, 0xe8, 0x00, 0xe9, /* 0xe8-0xef */
        0xf5, 0xfd, 0xf6, 0xea, 0x00, 0xf9, 0xfa, 0x99, /* 0xf0-0xf7 */
        0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, 0x00, 0x00, /* 0xf8-0xff */
};

/*
 * Top-level Unicode -> cp864 lookup, indexed by the high byte of the code
 * point. Each non-NULL entry points at the per-page table that is then indexed
 * by the low byte; NULL means no code point in that page can be encoded.
 * Pages 0x00, 0x03, 0x06, 0x22, 0x25 and 0xfe are populated.
 */
static const unsigned char *const page_uni2charset[256] = {
        page00, NULL,   NULL,   page03, NULL,   NULL,   page06, NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   page22, NULL,   NULL,   page25, NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   pagefe, NULL,   
};

/*
 * Per-byte lowercase table for cp864, indexed by the byte value.
 *
 * Bytes 0x41-0x5a map to 0x61-0x7a. Every other byte maps to itself, except
 * 0x9b, 0x9c, 0x9f, 0xa6, 0xa7 and 0xff, which map to 0x00; those are the same
 * positions that have no Unicode mapping in charset2uni.
 */
static const unsigned char charset2lower[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
        0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */
        0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */
};

/*
 * Per-byte uppercase table for cp864, indexed by the byte value.
 *
 * Bytes 0x61-0x7a map to 0x41-0x5a. Every other byte maps to itself, except
 * 0x90, 0x92, 0x9b, 0x9c, 0x9f, 0xa6, 0xa7 and 0xff, which map to 0x00.
 * NOTE(review): 0x90 and 0x92 are U+03B2 and U+03C6 in charset2uni; they
 * presumably map to 0x00 because the charset has no uppercase form for them -
 * confirm against the table generator.
 */
static const unsigned char charset2upper[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
        0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x00, 0x91, 0x00, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x00, 0x00, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */
        0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0x00, 0x00, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0x00, /* 0xf8-0xff */
};

static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
{
        const unsigned char *uni2charset;
        unsigned char cl = uni & 0x00ff;
        unsigned char ch = (uni & 0xff00) >> 8;

        if (boundlen <= 0)
                return -ENAMETOOLONG;

        uni2charset = page_uni2charset[ch];
        if (uni2charset && uni2charset[cl])
                out[0] = uni2charset[cl];
        else
                return -EINVAL;
        return 1;
}

/*
 * Translate the first byte of @rawstring into a Unicode value.
 *
 * The result of the charset2uni lookup is always stored in @uni, even
 * when it is 0x0000, which marks a byte with no mapping.  @boundlen is
 * not consulted.
 *
 * Returns 1 (the number of input bytes consumed) on success or -EINVAL
 * if the byte has no mapping.
 */
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
{
        wchar_t mapped = charset2uni[*rawstring];

        *uni = mapped;
        return mapped == 0x0000 ? -EINVAL : 1;
}

/*
 * NLS descriptor for the "cp864" charset: wires up the two conversion
 * helpers and the case-folding tables defined in this file.  It is handed
 * to register_nls()/unregister_nls() by the module init/exit functions.
 */
static struct nls_table table = {
        .charset        = "cp864",
        .uni2char        = uni2char,
        .char2uni        = char2uni,
        .charset2lower        = charset2lower,
        .charset2upper        = charset2upper,
};

/* Module entry point: register the cp864 table and pass on the result. */
static int __init init_nls_cp864(void)
{
        return register_nls(&table);
}

/* Module exit point: unregister the cp864 table registered at init. */
static void __exit exit_nls_cp864(void)
{
        unregister_nls(&table);
}

/* Hook the init/exit functions into the module loader. */
module_init(init_nls_cp864)
module_exit(exit_nls_cp864)

MODULE_LICENSE("Dual BSD/GPL");



















































































    6 


   35 






























   36 





























   36 


















































   37 





















   37 













   35 


















   39 


















   36 




































































   40 











   38 









































































































































































































































































































    4 









    3 

    3 
























    4 






    4 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com>
 *
 * Author:
 *      Casey Schaufler <casey@schaufler-ca.com>
 */

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include "smack.h"

/*
 * Built-in labels.  smk_access() compares against the addresses of
 * several of these entries to apply its hardcoded rules.
 */

/* "?" - returned by smack_from_secid() for a secid that is not listed. */
struct smack_known smack_known_huh = {
        .smk_known        = "?",
        .smk_secid        = 2,
};

/* "^" (hat) - as a subject it may read or lock any object. */
struct smack_known smack_known_hat = {
        .smk_known        = "^",
        .smk_secid        = 3,
};

/*
 * "*" (star) - as an object it is accessible to every subject,
 * as a subject it is denied everything.
 */
struct smack_known smack_known_star = {
        .smk_known        = "*",
        .smk_secid        = 4,
};

/* "_" (floor) - as an object it may be read or locked by any subject. */
struct smack_known smack_known_floor = {
        .smk_known        = "_",
        .smk_secid        = 5,
};

/* "@" (web/internet) - access is granted whenever it is subject or object. */
struct smack_known smack_known_web = {
        .smk_known        = "@",
        .smk_secid        = 7,
};

/*
 * All known labels; smk_insert_entry() adds to it and
 * smack_from_secid() walks it under RCU.
 */
LIST_HEAD(smack_known_list);

/*
 * The initial value needs to be bigger than any of the
 * known values above.
 * smk_import_entry() hands out the current value and increments it.
 */
static u32 smack_next_secid = 10;

/*
 * what events do we log
 * can be overwritten at run-time by /smack/logging
 * (a mask of SMACK_AUDIT_DENIED / SMACK_AUDIT_ACCEPT, tested by smack_log())
 */
int log_policy = SMACK_AUDIT_DENIED;

/**
 * smk_access_entry - look up matching access rule
 * @subject_label: a pointer to the subject's Smack label
 * @object_label: a pointer to the object's Smack label
 * @rule_list: the list of rules to search
 *
 * Walks @rule_list for the rule whose subject and object labels are the
 * given ones and returns that rule's access mode.  Returns -ENOENT when
 * no rule matches.
 *
 * NOTE:
 *
 * Labels are compared by address, not by content.  Older versions
 * tolerated labels that were not on the label list so that labels
 * never seen on this host could arrive over the network; that could
 * only succeed for a star labeled receiving socket.  The star labeled
 * socket case is now handled in the networking hooks, so every label
 * reaching this function is on the label list and address comparison
 * is a reliable test.
 *
 * The object is tested before the subject because it is the more
 * likely of the two to differ.
 *
 * A rule granting write access implicitly grants locking as well.
 */
int smk_access_entry(char *subject_label, char *object_label,
                        struct list_head *rule_list)
{
        struct smack_rule *rule;
        int access;

        list_for_each_entry_rcu(rule, rule_list, list) {
                if (rule->smk_object->smk_known != object_label)
                        continue;
                if (rule->smk_subject->smk_known != subject_label)
                        continue;

                access = rule->smk_access;
                /* Being allowed to write includes being allowed to lock. */
                if ((access & MAY_WRITE) == MAY_WRITE)
                        access |= MAY_LOCK;
                return access;
        }

        return -ENOENT;
}

/**
 * smk_access - determine if a subject has a specific access to an object
 * @subject: a pointer to the subject's Smack label entry
 * @object: a pointer to the object's Smack label entry
 * @request: the access requested, in "MAY" format
 * @a : a pointer to the audit data
 *
 * Applies the hardcoded label rules first and then consults the
 * subject's rule list.  Returns 0 if the access is permitted and
 * non zero otherwise (-EACCES on denial; with
 * CONFIG_SECURITY_SMACK_BRINGUP the result may also be one of the
 * SMACK_BRINGUP_ALLOW / SMACK_UNCONFINED_* values).
 *
 * Smack labels are shared on smack_list
 */
int smk_access(struct smack_known *subject, struct smack_known *object,
               int request, struct smk_audit_info *a)
{
        int may = MAY_NOT;
        int rc = 0;

        /*
         * Hardcoded comparisons come first.
         *
         * A star subject is refused access to every object.
         */
        if (subject == &smack_known_star) {
                rc = -EACCES;
                goto out_audit;
        }
        /*
         * Unconditional grants:
         *  - an internet object can be accessed by any subject, and an
         *    internet subject can access any object (tasks cannot be
         *    assigned the internet label);
         *  - a star object can be accessed by any subject;
         *  - a subject can access an object carrying its own label in
         *    any way.
         */
        if (object == &smack_known_web || subject == &smack_known_web ||
            object == &smack_known_star ||
            subject->smk_known == object->smk_known)
                goto out_audit;
        /*
         * For requests made up solely of read-type bits, or solely of
         * the lock bit: a floor object is open to every subject and a
         * hat subject may reach every object.
         */
        if (((request & MAY_ANYREAD) == request ||
             (request & MAY_LOCK) == request) &&
            (object == &smack_known_floor || subject == &smack_known_hat))
                goto out_audit;
        /*
         * Everything else needs an explicit rule.  The request is
         * satisfied when all of its bits are present in the rule's
         * access (e.g. read is included in readwrite).  A negative
         * value from smk_access_entry() means there is no rule for
         * this pair.
         */
        rcu_read_lock();
        may = smk_access_entry(subject->smk_known, object->smk_known,
                               &subject->smk_rules);
        rcu_read_unlock();

        if (may <= 0 || (request & may) != request) {
                rc = -EACCES;
                goto out_audit;
        }
#ifdef CONFIG_SECURITY_SMACK_BRINGUP
        /*
         * In bringup mode report a positive value so that the hooks
         * can tell which checks passed only because of a "b" rule.
         */
        if (may & MAY_BRINGUP)
                rc = SMACK_BRINGUP_ALLOW;
#endif

out_audit:

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
        /* A denial involving the unconfined label is reported specially. */
        if (rc < 0) {
                if (object == smack_unconfined)
                        rc = SMACK_UNCONFINED_OBJECT;
                if (subject == smack_unconfined)
                        rc = SMACK_UNCONFINED_SUBJECT;
        }
#endif

#ifdef CONFIG_AUDIT
        if (a)
                smack_log(subject->smk_known, object->smk_known,
                          request, rc, a);
#endif

        return rc;
}

/**
 * smk_tskacc - determine if a task has a specific access to an object
 * @tsp: a pointer to the subject's task
 * @obj_known: a pointer to the object's label entry
 * @mode: the access requested, in "MAY" format
 * @a : common audit data
 *
 * Checks the task's label against the object label, first with
 * smk_access() and then against the task's own rule list, which can
 * only restrict the result further.  Returns 0 if the access is
 * permitted, non zero otherwise.  A denial is turned into success when
 * the task is privileged with CAP_MAC_OVERRIDE.
 */
int smk_tskacc(struct task_smack *tsp, struct smack_known *obj_known,
               u32 mode, struct smk_audit_info *a)
{
        struct smack_known *sbj_known = smk_of_task(tsp);
        int may;
        int rc;

        /*
         * Start with the global rule list.
         */
        rc = smk_access(sbj_known, obj_known, mode, NULL);
        if (rc >= 0) {
                /*
                 * An entry in the task's private rule list may take
                 * away access that the global rules granted.  No entry
                 * (negative value) leaves the result untouched.
                 */
                may = smk_access_entry(sbj_known->smk_known,
                                       obj_known->smk_known,
                                       &tsp->smk_rules);
                if (may >= 0 && (mode & may) != mode)
                        rc = -EACCES;
        }

        /*
         * Allow the privileged to override policy.  Only denials get
         * here with a negative value; grants skip the capability check.
         */
        if (rc < 0 && smack_privileged(CAP_MAC_OVERRIDE))
                rc = 0;

#ifdef CONFIG_AUDIT
        if (a)
                smack_log(sbj_known->smk_known, obj_known->smk_known,
                          mode, rc, a);
#endif
        return rc;
}

/**
 * smk_curacc - determine if current has a specific access to an object
 * @obj_known: a pointer to the object's Smack label entry
 * @mode: the access requested, in "MAY" format
 * @a : common audit data
 *
 * Convenience wrapper around smk_tskacc() that uses the Smack data of
 * the current credentials as the subject.  Returns 0 if the access is
 * permitted, non zero otherwise; as with smk_tskacc() a capability may
 * override the rules.
 */
int smk_curacc(struct smack_known *obj_known,
               u32 mode, struct smk_audit_info *a)
{
        return smk_tskacc(smack_cred(current_cred()), obj_known, mode, a);
}

#ifdef CONFIG_AUDIT
/**
 * smack_str_from_perm : helper to translate an access mask to a
 * readable string
 * @string : the buffer to fill; needs room for up to six letters
 *           plus the terminating NUL
 * @access : the access mask, in "MAY" format
 *
 * Letters are emitted in the fixed order "rwxatl"; bits that are not
 * set contribute nothing.
 */
static inline void smack_str_from_perm(char *string, int access)
{
        char *out = string;

        if (access & MAY_READ)
                *out++ = 'r';
        if (access & MAY_WRITE)
                *out++ = 'w';
        if (access & MAY_EXEC)
                *out++ = 'x';
        if (access & MAY_APPEND)
                *out++ = 'a';
        if (access & MAY_TRANSMUTE)
                *out++ = 't';
        if (access & MAY_LOCK)
                *out++ = 'l';
        *out = '\0';
}
/**
 * smack_log_callback - SMACK specific information
 * will be called by generic audit code
 * @ab : the audit_buffer
 * @a  : audit_data
 *
 * Appends the function name, the outcome, the subject and object
 * labels and the requested permissions to the audit record.  An empty
 * request string is reported as "labels_differ".
 */
static void smack_log_callback(struct audit_buffer *ab, void *a)
{
        struct common_audit_data *ad = a;
        struct smack_audit_data *sad = ad->smack_audit_data;
        const char *action = sad->result ? "denied" : "granted";

        audit_log_format(ab, "lsm=SMACK fn=%s action=%s",
                         sad->function, action);

        audit_log_format(ab, " subject=");
        audit_log_untrustedstring(ab, sad->subject);

        audit_log_format(ab, " object=");
        audit_log_untrustedstring(ab, sad->object);

        if (sad->request[0] != '\0')
                audit_log_format(ab, " requested=%s", sad->request);
        else
                audit_log_format(ab, " labels_differ");
}

/**
 *  smack_log - Audit the granting or denial of permissions.
 *  @subject_label : smack label of the requester
 *  @object_label  : smack label of the object being accessed
 *  @request: requested permissions
 *  @result: result from smk_access
 *  @ad:  auxiliary audit data
 *
 * Emits an audit record for the access decision unless log_policy says
 * that this kind of outcome (denied / accepted) is not to be logged.
 */
void smack_log(char *subject_label, char *object_label, int request,
               int result, struct smk_audit_info *ad)
{
#ifdef CONFIG_SECURITY_SMACK_BRINGUP
        char request_buffer[SMK_NUM_ACCESS_TYPE + 5];
#else
        char request_buffer[SMK_NUM_ACCESS_TYPE + 1];
#endif
        struct common_audit_data *a = &ad->a;
        struct smack_audit_data *sad;

        /* Drop the event if the policy does not ask for this outcome. */
        if (result < 0) {
                if (!(log_policy & SMACK_AUDIT_DENIED))
                        return;
        } else if (result == 0) {
                if (!(log_policy & SMACK_AUDIT_ACCEPT))
                        return;
        }

        sad = a->smack_audit_data;
        if (sad->function == NULL)
                sad->function = "unknown";

        /* Fill in the remaining audit fields. */
        sad->subject = subject_label;
        sad->object  = object_label;
        smack_str_from_perm(request_buffer, request);
#ifdef CONFIG_SECURITY_SMACK_BRINGUP
        /*
         * In bringup mode the result can be positive: the access was
         * allowed, but not for the normal reasons.  Such events are
         * never filtered by the logging policy above and are recorded
         * as a success, tagged with the reason where there is one.
         */
        if (result == SMACK_UNCONFINED_SUBJECT)
                strcat(request_buffer, "(US)");
        else if (result == SMACK_UNCONFINED_OBJECT)
                strcat(request_buffer, "(UO)");

        if (result > 0)
                result = 0;
#endif
        sad->request = request_buffer;
        sad->result  = result;

        common_lsm_audit(a, smack_log_callback, NULL);
}
#else /* #ifdef CONFIG_AUDIT */
/* Without CONFIG_AUDIT there is nothing to record: smack_log() is a no-op. */
void smack_log(char *subject_label, char *object_label, int request,
               int result, struct smk_audit_info *ad)
{
}
#endif

/* Held while labels are added; see smk_insert_entry()/smk_import_entry(). */
DEFINE_MUTEX(smack_known_lock);

/*
 * Hash table of known labels, indexed by the low bits of the
 * full_name_hash() of the label text.
 */
struct hlist_head smack_known_hash[SMACK_HASH_SLOTS];

/**
 * smk_insert_entry - insert a smack label into a hash map,
 * @skp: smack label
 *
 * Publishes @skp in its smack_known_hash bucket and on
 * smack_known_list, in that order.
 *
 * this function must be called under smack_known_lock
 */
void smk_insert_entry(struct smack_known *skp)
{
        const char *label = skp->smk_known;
        unsigned int slot;

        slot = full_name_hash(NULL, label, strlen(label));
        slot &= SMACK_HASH_SLOTS - 1;

        hlist_add_head_rcu(&skp->smk_hashed, &smack_known_hash[slot]);
        list_add_rcu(&skp->list, &smack_known_list);
}

/**
 * smk_find_entry - find a label on the list, return the list entry
 * @string: a text string that might be a Smack label
 *
 * Returns a pointer to the entry in the label list that
 * matches the passed string or NULL if not found.
 */
struct smack_known *smk_find_entry(const char *string)
{
        unsigned int hash;
        struct hlist_head *head;
        struct smack_known *skp;

        hash = full_name_hash(NULL, string, strlen(string));
        head = &smack_known_hash[hash & (SMACK_HASH_SLOTS - 1)];

        hlist_for_each_entry_rcu(skp, head, smk_hashed)
                if (strcmp(skp->smk_known, string) == 0)
                        return skp;

        return NULL;
}

/**
 * smk_parse_smack - parse smack label from a text string
 * @string: a text string that might contain a Smack label
 * @len: the maximum size, or zero if it is NUL terminated.
 *
 * The label is the leading run of characters in @string that are
 * above ' ', not above '~', and none of '/', '"', '\\' or '\''.
 *
 * Returns a freshly allocated copy of the label, which the caller
 * must kfree(), or an ERR_PTR: -EINVAL if @string starts with '-', the
 * label is empty, or it is SMK_LONGLABEL characters or longer; -ENOMEM
 * if the copy cannot be allocated.
 */
char *smk_parse_smack(const char *string, int len)
{
        char *smack;
        int i;

        if (len <= 0)
                len = strlen(string) + 1;

        /*
         * A leading '-' is reserved: it marks an option rather than
         * a label for interfaces including /smack/cipso and
         * /smack/cipso2
         */
        if (string[0] == '-')
                return ERR_PTR(-EINVAL);

        /* Find the end of the run of acceptable label characters. */
        for (i = 0; i < len; i++) {
                const char c = string[i];

                if (c > '~' || c <= ' ')
                        break;
                if (c == '/' || c == '"' || c == '\\' || c == '\'')
                        break;
        }

        if (i == 0 || i >= SMK_LONGLABEL)
                return ERR_PTR(-EINVAL);

        smack = kstrndup(string, i, GFP_NOFS);
        return smack ? smack : ERR_PTR(-ENOMEM);
}

/**
 * smk_netlbl_mls - convert a catset to netlabel mls categories
 * @level: MLS sensitivity level
 * @catset: the Smack categories, as a bitmap
 * @sap: where to put the netlabel categories
 * @len: number of bytes of @catset to convert
 *
 * Sets the MLS level in @sap and builds attr.mls.cat from @catset.
 * Bits are taken most significant first within each byte and numbered
 * from 1, so bit 0x80 of the first byte is category 1.
 *
 * Returns 0 on success, error code on failure.  On failure the partly
 * built category map is freed and attr.mls.cat is reset to NULL, so
 * that @sap never refers to freed memory.
 */
int smk_netlbl_mls(int level, char *catset, struct netlbl_lsm_secattr *sap,
                        int len)
{
        unsigned char *cp;
        unsigned char m;
        int cat;
        int rc;
        int byte;

        sap->flags |= NETLBL_SECATTR_MLS_CAT;
        sap->attr.mls.lvl = level;
        sap->attr.mls.cat = NULL;

        for (cat = 1, cp = catset, byte = 0; byte < len; cp++, byte++)
                for (m = 0x80; m != 0; m >>= 1, cat++) {
                        if ((m & *cp) == 0)
                                continue;
                        rc = netlbl_catmap_setbit(&sap->attr.mls.cat,
                                                  cat, GFP_NOFS);
                        if (rc < 0) {
                                netlbl_catmap_free(sap->attr.mls.cat);
                                /*
                                 * NETLBL_SECATTR_MLS_CAT is still set in
                                 * @sap; do not leave a dangling pointer
                                 * behind for a later free to trip over.
                                 */
                                sap->attr.mls.cat = NULL;
                                return rc;
                        }
                }

        return 0;
}

/**
 * smack_populate_secattr - fill in the smack_known netlabel information
 * @skp: pointer to the structure to fill
 *
 * Populate the netlabel secattr structure for a Smack label.
 *
 * Returns 0 unless creating the category mapping fails
 */
int smack_populate_secattr(struct smack_known *skp)
{
        int slen;

        skp->smk_netlabel.attr.secid = skp->smk_secid;
        skp->smk_netlabel.domain = skp->smk_known;
        skp->smk_netlabel.cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
        if (skp->smk_netlabel.cache != NULL) {
                skp->smk_netlabel.flags |= NETLBL_SECATTR_CACHE;
                skp->smk_netlabel.cache->free = NULL;
                skp->smk_netlabel.cache->data = skp;
        }
        skp->smk_netlabel.flags |= NETLBL_SECATTR_SECID |
                                   NETLBL_SECATTR_MLS_LVL |
                                   NETLBL_SECATTR_DOMAIN;
        /*
         * If direct labeling works use it.
         * Otherwise use mapped labeling.
         */
        slen = strlen(skp->smk_known);
        if (slen < SMK_CIPSOLEN)
                return smk_netlbl_mls(smack_cipso_direct, skp->smk_known,
                                      &skp->smk_netlabel, slen);

        return smk_netlbl_mls(smack_cipso_mapped, (char *)&skp->smk_secid,
                              &skp->smk_netlabel, sizeof(skp->smk_secid));
}

/**
 * smk_import_entry - import a label, return the list entry
 * @string: a text string that might be a Smack label
 * @len: the maximum size, or zero if it is NUL terminated.
 *
 * Parses a label out of @string and looks it up under
 * smack_known_lock.  If the label is not yet known a new entry is
 * allocated, given the next secid, populated and inserted.
 *
 * Returns a pointer to the entry in the label list that
 * matches the passed string, adding it if necessary,
 * or an error code (as an ERR_PTR).
 */
struct smack_known *smk_import_entry(const char *string, int len)
{
        struct smack_known *skp;
        char *smack;
        int rc;

        smack = smk_parse_smack(string, len);
        if (IS_ERR(smack))
                return ERR_CAST(smack);

        mutex_lock(&smack_known_lock);

        /* Already known: the parsed copy of the label is not needed. */
        skp = smk_find_entry(smack);
        if (skp != NULL)
                goto freeout;

        skp = kzalloc(sizeof(*skp), GFP_NOFS);
        if (skp == NULL) {
                skp = ERR_PTR(-ENOMEM);
                goto freeout;
        }

        /* The new entry takes ownership of the parsed label string. */
        skp->smk_known = smack;
        skp->smk_secid = smack_next_secid++;

        rc = smack_populate_secattr(skp);
        if (rc >= 0) {
                INIT_LIST_HEAD(&skp->smk_rules);
                mutex_init(&skp->smk_rules_lock);
                /*
                 * Make sure that the entry is actually
                 * filled before putting it on the list.
                 */
                smk_insert_entry(skp);
                goto unlockout;
        }
        /*
         * smack_populate_secattr() allocates the netlabel cache before
         * building the category map, so a failure there can leave a
         * cache behind.  Release it, otherwise it leaks with skp.
         */
        if (skp->smk_netlabel.cache != NULL)
                netlbl_secattr_cache_free(skp->smk_netlabel.cache);
        kfree(skp);
        skp = ERR_PTR(rc);
freeout:
        kfree(smack);
unlockout:
        mutex_unlock(&smack_known_lock);

        return skp;
}

/**
 * smack_from_secid - find the Smack label associated with a secid
 * @secid: an integer that might be associated with a Smack label
 *
 * Searches smack_known_list under the RCU read lock.
 *
 * Returns a pointer to the appropriate Smack label entry if there is one,
 * otherwise a pointer to the invalid Smack label (smack_known_huh).
 */
struct smack_known *smack_from_secid(const u32 secid)
{
        /*
         * Default answer for a secid that is not on the list:
         * somebody asked for a translation that does not exist.
         */
        struct smack_known *found = &smack_known_huh;
        struct smack_known *skp;

        rcu_read_lock();
        list_for_each_entry_rcu(skp, &smack_known_list, list) {
                if (skp->smk_secid == secid) {
                        found = skp;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}

/*
 * Unless a process is running with one of these labels
 * even having CAP_MAC_OVERRIDE isn't enough to grant
 * privilege to violate MAC policy. If no labels are
 * designated (the empty list case) capabilities apply to
 * everyone.
 *
 * smack_privileged_cred() reads the list under rcu_read_lock().
 * NOTE(review): smack_onlycap_lock presumably serializes updates to
 * the list; the updater is not in this file - confirm.
 */
LIST_HEAD(smack_onlycap_list);
DEFINE_MUTEX(smack_onlycap_lock);

/**
 * smack_privileged_cred - are all privilege requirements met by cred
 * @cap: The requested capability
 * @cred: the credential to use
 *
 * The credential must hold @cap in the initial user namespace and,
 * if any onlycap labels are configured, its task label must be one
 * of them.
 *
 * Returns true if the task is allowed to be privileged, false if it's not.
 */
bool smack_privileged_cred(int cap, const struct cred *cred)
{
        struct task_smack *tsp = smack_cred(cred);
        struct smack_known *skp = tsp->smk_task;
        struct smack_known_list_elem *sklep;
        bool allowed;

        if (cap_capable(cred, &init_user_ns, cap, CAP_OPT_NONE))
                return false;

        rcu_read_lock();
        /* An empty onlycap list places no restriction on the label. */
        allowed = list_empty(&smack_onlycap_list);
        if (!allowed) {
                list_for_each_entry_rcu(sklep, &smack_onlycap_list, list) {
                        if (sklep->smk_label == skp) {
                                allowed = true;
                                break;
                        }
                }
        }
        rcu_read_unlock();

        return allowed;
}

/**
 * smack_privileged - are all privilege requirements met
 * @cap: The requested capability
 *
 * Same question as smack_privileged_cred(), asked about the current
 * task and its credentials.  Kernel threads are always privileged.
 *
 * Returns true if the task is allowed to be privileged, false if it's not.
 */
bool smack_privileged(int cap)
{
        /*
         * Ordinary tasks are checked against their credentials;
         * kernel threads are privileged without a check.
         */
        if (likely(!(current->flags & PF_KTHREAD)))
                return smack_privileged_cred(cap, current_cred());

        return true;
}

















































    1 











































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO AG
 * written by Arne Jansen <sensille@gmx.net>
 */

#include <linux/slab.h>
#include "messages.h"
#include "ulist.h"

/*
 * ulist is a generic data structure to hold a collection of unique u64
 * values. The only operations it supports are adding to the list and
 * enumerating it.
 * It is possible to store an auxiliary value along with the key.
 *
 * A sample usage for ulists is the enumeration of directed graphs without
 * visiting a node twice. The pseudo-code could look like this:
 *
 * ulist = ulist_alloc();
 * ulist_add(ulist, root);
 * ULIST_ITER_INIT(&uiter);
 *
 * while ((elem = ulist_next(ulist, &uiter))) {
 *         for (all child nodes n in elem)
 *                ulist_add(ulist, n);
 *        do something useful with the node;
 * }
 * ulist_free(ulist);
 *
 * This assumes the graph nodes are addressable by u64. This stems from the
 * usage for tree enumeration in btrfs, where the logical addresses are
 * 64 bit.
 *
 * It is also useful for tree enumeration which could be done elegantly
 * recursively, but is not possible due to kernel stack limitations. The
 * loop would be similar to the above.
 */

/*
 * Put a ulist into its empty state.
 *
 * @ulist:        the ulist to initialize
 *
 * Only meant for a ulist that holds no nodes: nothing is freed here.
 * A ulist that has been used must go through ulist_reinit instead.
 */
void ulist_init(struct ulist *ulist)
{
        ulist->nnodes = 0;
        ulist->root = RB_ROOT;
        INIT_LIST_HEAD(&ulist->nodes);
}

/*
 * Free up additionally allocated memory for the ulist.
 *
 * @ulist:        the ulist from which to free the additional memory
 *
 * Frees every node and leaves the ulist empty: the list head, the
 * rbtree root and the node count are all reset.
 *
 * This is useful in cases where the base 'struct ulist' has been statically
 * allocated.
 */
void ulist_release(struct ulist *ulist)
{
        struct ulist_node *node;
        struct ulist_node *next;

        list_for_each_entry_safe(node, next, &ulist->nodes, list) {
                kfree(node);
        }
        ulist->root = RB_ROOT;
        INIT_LIST_HEAD(&ulist->nodes);
        /* Keep the count in step with the now empty list and tree. */
        ulist->nnodes = 0;
}

/*
 * Prepare a ulist for reuse.
 *
 * @ulist:        ulist to be reused
 *
 * Free up all additional memory allocated for the list elements and reinit
 * the ulist.
 */
void ulist_reinit(struct ulist *ulist)
{
        /* Drop all nodes first, then reset list, tree and node count. */
        ulist_release(ulist);
        ulist_init(ulist);
}

/*
 * Dynamically allocate a ulist.
 *
 * @gfp_mask:        allocation flags to use for the base allocation
 *
 * Returns the new ulist in an initialized (empty) state, or NULL if the
 * allocation fails.
 */
struct ulist *ulist_alloc(gfp_t gfp_mask)
{
        struct ulist *fresh;

        fresh = kmalloc(sizeof(*fresh), gfp_mask);
        if (fresh)
                ulist_init(fresh);

        return fresh;
}

/*
 * Free dynamically allocated ulist.
 *
 * @ulist:        ulist to free, may be NULL
 *
 * The nodes are released here as well, so calling ulist_release
 * beforehand is not necessary.
 */
void ulist_free(struct ulist *ulist)
{
        if (ulist) {
                ulist_release(ulist);
                kfree(ulist);
        }
}

/*
 * Look up @val in the rbtree of @ulist.
 *
 * Returns the node holding @val, or NULL if there is none.
 */
static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
{
        struct rb_node *pos = ulist->root.rb_node;

        while (pos) {
                struct ulist_node *entry;

                entry = rb_entry(pos, struct ulist_node, rb_node);
                if (entry->val == val)
                        return entry;

                /* Smaller keys live to the left, larger ones to the right. */
                pos = (entry->val < val) ? pos->rb_right : pos->rb_left;
        }
        return NULL;
}

/*
 * Remove @node from @ulist and free it.
 *
 * The node is unlinked from both the rbtree and the list before it is
 * freed, and the node count is decremented.  A count that is already
 * zero at that point is treated as a fatal inconsistency.
 */
static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
{
        rb_erase(&node->rb_node, &ulist->root);
        list_del(&node->list);
        kfree(node);
        BUG_ON(ulist->nnodes == 0);
        ulist->nnodes--;
}

/*
 * Link @ins into the rbtree of @ulist, keyed by its val.
 *
 * Returns 0 on success or -EEXIST if a node with the same val is
 * already in the tree, in which case the tree is left unchanged.
 */
static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
{
        struct rb_node **link = &ulist->root.rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct ulist_node *cur;

                parent = *link;
                cur = rb_entry(parent, struct ulist_node, rb_node);
                if (cur->val == ins->val)
                        return -EEXIST;

                if (cur->val < ins->val)
                        link = &parent->rb_right;
                else
                        link = &parent->rb_left;
        }
        rb_link_node(&ins->rb_node, parent, link);
        rb_insert_color(&ins->rb_node, &ulist->root);
        return 0;
}

/*
 * Add an element to the ulist.
 *
 * @ulist:        ulist to add the element to
 * @val:        value to add to ulist
 * @aux:        auxiliary value to store along with val
 * @gfp_mask:        flags to use for allocation
 *
 * Note: locking must be provided by the caller. In case of rwlocks write
 *       locking is needed
 *
 * Add an element to a ulist. The @val will only be added if it doesn't
 * already exist. If it is added, the auxiliary value @aux is stored along with
 * it. In case @val already exists in the ulist, @aux is ignored, even if
 * it differs from the already stored value.
 *
 * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
 * inserted.
 * In case of allocation failure -ENOMEM is returned and the ulist stays
 * unaltered.
 */
int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
{
        /* Plain add: pass NULL so the previously stored aux is not reported. */
        return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
}

/*
 * Add @val to @ulist unless it is already present.
 *
 * @ulist:      ulist to add the element to
 * @val:        value to add
 * @aux:        auxiliary value stored with @val when a new node is created
 * @old_aux:    if not NULL and @val already exists, receives the aux that is
 *              currently stored with it
 * @gfp_mask:   flags to use for allocation
 *
 * Returns 0 if @val was already present (the stored aux is not modified),
 * 1 if a new node was inserted, or -ENOMEM if the node allocation failed.
 */
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
                    u64 *old_aux, gfp_t gfp_mask)
{
        struct ulist_node *existing = ulist_rbtree_search(ulist, val);
        struct ulist_node *fresh;
        int ret;

        if (existing) {
                if (old_aux)
                        *old_aux = existing->aux;
                return 0;
        }

        fresh = kmalloc(sizeof(*fresh), gfp_mask);
        if (!fresh)
                return -ENOMEM;
        fresh->val = val;
        fresh->aux = aux;

        /* The search above found nothing, so the insert is expected to succeed. */
        ret = ulist_rbtree_insert(ulist, fresh);
        ASSERT(!ret);
        list_add_tail(&fresh->list, &ulist->nodes);
        ulist->nnodes++;

        return 1;
}

/*
 * Delete one node from ulist.
 *
 * @ulist:        ulist to remove node from
 * @val:        value to delete
 * @aux:        aux to delete
 *
 * The deletion will only be done when *BOTH* val and aux matches.
 * Return 0 for successful delete.
 * Return > 0 for not found.
 */
int ulist_del(struct ulist *ulist, u64 val, u64 aux)
{
        struct ulist_node *match = ulist_rbtree_search(ulist, val);

        /* Either @val is not stored at all, or it is stored with another aux. */
        if (!match || match->aux != aux)
                return 1;

        /* Both val and aux match: unlink and free the node. */
        ulist_rbtree_erase(ulist, match);
        return 0;
}

/*
 * Iterate ulist.
 *
 * @ulist:        ulist to iterate
 * @uiter:        iterator variable, initialized with ULIST_ITER_INIT(&iterator)
 *
 * Note: locking must be provided by the caller. In case of rwlocks only read
 *       locking is needed
 *
 * This function is used to iterate an ulist.
 * It returns the next element from the ulist or %NULL when the
 * end is reached. No guarantee is made with respect to the order in which
 * the elements are returned. They might neither be returned in order of
 * addition nor in ascending order.
 * It is allowed to call ulist_add during an enumeration. Newly added items
 * are guaranteed to show up in the running enumeration.
 */
struct ulist_node *ulist_next(const struct ulist *ulist, struct ulist_iterator *uiter)
{
        struct list_head *next;

        if (list_empty(&ulist->nodes))
                return NULL;

        /* A fresh iterator starts at the first node, otherwise step forward. */
        if (uiter->cur_list)
                next = uiter->cur_list->next;
        else
                next = ulist->nodes.next;

        /*
         * Reaching the list head means the end; the iterator position is
         * left where it was.
         */
        if (next == &ulist->nodes)
                return NULL;

        uiter->cur_list = next;
        return list_entry(next, struct ulist_node, list);
}

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_BITOPS_LE_H_
#define _ASM_GENERIC_BITOPS_LE_H_

#include <asm/types.h>
#include <asm/byteorder.h>

/*
 * BITOP_LE_SWIZZLE is XORed into the bit number by every *_le helper in this
 * header before the bit number is handed to the native bit operation.
 *
 * With __LITTLE_ENDIAN the mask is 0, so bit numbers pass through unchanged.
 * With __BIG_ENDIAN the mask is (BITS_PER_LONG-1) with the low three bits
 * cleared, i.e. it flips the bits of the bit number that sit above the
 * bit-within-byte position and below BITS_PER_LONG.
 *
 * NOTE(review): if neither macro is defined, BITOP_LE_SWIZZLE stays undefined
 * and the helpers will not compile; presumably <asm/byteorder.h> always
 * defines one of them -- confirm.
 */
#if defined(__LITTLE_ENDIAN)

#define BITOP_LE_SWIZZLE        0

#elif defined(__BIG_ENDIAN)

#define BITOP_LE_SWIZZLE        ((BITS_PER_LONG-1) & ~0x7)

#endif


/* Swizzle @nr with BITOP_LE_SWIZZLE and return test_bit() for that bit of @addr. */
static inline int test_bit_le(int nr, const void *addr)
{
        const int swizzled = nr ^ BITOP_LE_SWIZZLE;

        return test_bit(swizzled, addr);
}

/* Swizzle @nr with BITOP_LE_SWIZZLE and apply set_bit() to that bit of @addr. */
static inline void set_bit_le(int nr, void *addr)
{
        const int swizzled = nr ^ BITOP_LE_SWIZZLE;

        set_bit(swizzled, addr);
}

/* Swizzle @nr with BITOP_LE_SWIZZLE and apply clear_bit() to that bit of @addr. */
static inline void clear_bit_le(int nr, void *addr)
{
        const int swizzled = nr ^ BITOP_LE_SWIZZLE;

        clear_bit(swizzled, addr);
}

/* Swizzle @nr with BITOP_LE_SWIZZLE and apply __set_bit() to that bit of @addr. */
static inline void __set_bit_le(int nr, void *addr)
{
        const int swizzled = nr ^ BITOP_LE_SWIZZLE;

        __set_bit(swizzled, addr);
}

/* Swizzle @nr with BITOP_LE_SWIZZLE and apply __clear_bit() to that bit of @addr. */
static inline void __clear_bit_le(int nr, void *addr)
{
        const int swizzled = nr ^ BITOP_LE_SWIZZLE;

        __clear_bit(swizzled, addr);
}

/*
 * Swizzle @nr with BITOP_LE_SWIZZLE and return test_and_set_bit() for that
 * bit of @addr.
 */
static inline int test_and_set_bit_le(int nr, void *addr)
{
        const int swizzled = nr ^ BITOP_LE_SWIZZLE;

        return test_and_set_bit(swizzled, addr);
}

/*
 * Swizzle @nr with BITOP_LE_SWIZZLE and return test_and_clear_bit() for that
 * bit of @addr.
 */
static inline int test_and_clear_bit_le(int nr, void *addr)
{
        const int swizzled = nr ^ BITOP_LE_SWIZZLE;

        return test_and_clear_bit(swizzled, addr);
}

/*
 * Swizzle @nr with BITOP_LE_SWIZZLE and return __test_and_set_bit() for that
 * bit of @addr.
 */
static inline int __test_and_set_bit_le(int nr, void *addr)
{
        const int swizzled = nr ^ BITOP_LE_SWIZZLE;

        return __test_and_set_bit(swizzled, addr);
}

/*
 * Swizzle @nr with BITOP_LE_SWIZZLE and return __test_and_clear_bit() for
 * that bit of @addr.
 */
static inline int __test_and_clear_bit_le(int nr, void *addr)
{
        const int swizzled = nr ^ BITOP_LE_SWIZZLE;

        return __test_and_clear_bit(swizzled, addr);
}

#endif /* _ASM_GENERIC_BITOPS_LE_H_ */






































    4 

    6 






    8 
    8 



    8 
    7 
    8 





    8 





















    2 
    2 


    2 
    2 





























    8 











    8 





















































    4 







































































































































































































































































































































































































































    2 




























    6 







    1 



    5 




































































































































































































































































































































































































    2 










    2 

    2 
























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org>
 *
 * Fixes from William Schumacher incorporated on 15 March 2001.
 *    (Reported by Charles Bertsch, <CBertsch@microtest.com>).
 */

/*
 *  This file contains generic functions for manipulating
 *  POSIX 1003.1e draft standard 17 ACLs.
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/atomic.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/mnt_idmapping.h>
#include <linux/iversion.h>
#include <linux/security.h>
#include <linux/fsnotify.h>
#include <linux/filelock.h>

#include "internal.h"

/*
 * Return the address of the inode field that caches the ACL of the given
 * @type: i_acl for ACL_TYPE_ACCESS, i_default_acl for ACL_TYPE_DEFAULT.
 * Any other @type triggers BUG().
 */
static struct posix_acl **acl_by_type(struct inode *inode, int type)
{
        if (type == ACL_TYPE_ACCESS)
                return &inode->i_acl;
        if (type == ACL_TYPE_DEFAULT)
                return &inode->i_default_acl;

        /* No other ACL types have a cache slot. */
        BUG();
}

/*
 * Read the cached ACL of the given @type from @inode.
 *
 * Returns the value found in the cache slot: NULL, a value for which
 * is_uncached_acl() is true (returned without touching any refcount), or an
 * ACL whose a_refcount has been incremented on behalf of the caller.
 */
struct posix_acl *get_cached_acl(struct inode *inode, int type)
{
        struct posix_acl **p = acl_by_type(inode, type);
        struct posix_acl *acl;

        for (;;) {
                rcu_read_lock();
                acl = rcu_dereference(*p);
                /*
                 * Done if there is nothing to take a reference on, or if the
                 * reference was taken.  Leaving via break keeps the RCU read
                 * lock held; it is dropped after the loop.
                 */
                if (!acl || is_uncached_acl(acl) ||
                    refcount_inc_not_zero(&acl->a_refcount))
                        break;
                /* Could not take a reference: re-read the slot and retry. */
                rcu_read_unlock();
                cpu_relax();
        }
        rcu_read_unlock();
        return acl;
}
EXPORT_SYMBOL(get_cached_acl);

/*
 * Read the cached ACL of the given @type from @inode without taking a
 * reference.
 *
 * If the slot holds ACL_DONT_CACHE, the filesystem's ->get_inode_acl() is
 * called with LOOKUP_RCU and its result is returned instead, unless that
 * result is an error pointer, in which case ACL_DONT_CACHE is returned.
 */
struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
{
        struct posix_acl *cached = rcu_dereference(*acl_by_type(inode, type));
        struct posix_acl *fetched;

        if (cached != ACL_DONT_CACHE)
                return cached;

        fetched = inode->i_op->get_inode_acl(inode, type, LOOKUP_RCU);
        if (IS_ERR(fetched))
                return cached;
        return fetched;
}
EXPORT_SYMBOL(get_cached_acl_rcu);

/*
 * Install @acl as the cached ACL of the given @type on @inode.
 *
 * The cache slot receives its own reference obtained via posix_acl_dup().
 * The previous slot value is released unless is_uncached_acl() is true for it.
 */
void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
{
        struct posix_acl **slot = acl_by_type(inode, type);
        struct posix_acl *prev = xchg(slot, posix_acl_dup(acl));

        if (is_uncached_acl(prev))
                return;
        posix_acl_release(prev);
}
EXPORT_SYMBOL(set_cached_acl);

/*
 * Reset the cache slot @p to ACL_NOT_CACHED and release the value it held,
 * unless is_uncached_acl() is true for that value.
 */
static void __forget_cached_acl(struct posix_acl **p)
{
        struct posix_acl *prev = xchg(p, ACL_NOT_CACHED);

        if (is_uncached_acl(prev))
                return;
        posix_acl_release(prev);
}

/* Drop the cached ACL of the given @type from @inode. */
void forget_cached_acl(struct inode *inode, int type)
{
        /* acl_by_type() selects the slot (and BUG()s on an unknown type). */
        __forget_cached_acl(acl_by_type(inode, type));
}
EXPORT_SYMBOL(forget_cached_acl);

/* Drop both cached ACLs of @inode: the access ACL and the default ACL. */
void forget_all_cached_acls(struct inode *inode)
{
        __forget_cached_acl(&inode->i_acl);
        __forget_cached_acl(&inode->i_default_acl);
}
EXPORT_SYMBOL(forget_all_cached_acls);

/*
 * Return the ACL of the given @type for @inode, using the cache when possible
 * and asking the filesystem otherwise.
 *
 * @idmap:  passed through to ->get_acl()
 * @dentry: may be NULL; ->get_acl() is only used when a dentry is supplied
 * @inode:  inode whose ACL is wanted
 * @type:   ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * Returns the cached value if the cache holds one, NULL if the inode is not
 * IS_POSIXACL() or the filesystem has neither ->get_acl() nor
 * ->get_inode_acl(), and otherwise whatever the filesystem returned (which
 * may be an error pointer).
 */
static struct posix_acl *__get_acl(struct mnt_idmap *idmap,
                                   struct dentry *dentry, struct inode *inode,
                                   int type)
{
        struct posix_acl *sentinel;
        struct posix_acl **p;
        struct posix_acl *acl;

        /*
         * The sentinel is used to detect when another operation like
         * set_cached_acl() or forget_cached_acl() races with get_inode_acl().
         * It is guaranteed that is_uncached_acl(sentinel) is true.
         */

        /* Fast path: anything other than an "uncached" marker is the answer. */
        acl = get_cached_acl(inode, type);
        if (!is_uncached_acl(acl))
                return acl;

        if (!IS_POSIXACL(inode))
                return NULL;

        sentinel = uncached_acl_sentinel(current);
        p = acl_by_type(inode, type);

        /*
         * If the ACL isn't being read yet, set our sentinel.  Otherwise, the
         * current value of the ACL will not be ACL_NOT_CACHED and so our own
         * sentinel will not be set; another task will update the cache.  We
         * could wait for that other task to complete its job, but it's easier
         * to just call ->get_inode_acl to fetch the ACL ourself.  (This is
         * going to be an unlikely race.)
         */
        cmpxchg(p, ACL_NOT_CACHED, sentinel);

        /*
         * Normally, the ACL returned by ->get{_inode}_acl will be cached.
         * A filesystem can prevent that by calling
         * forget_cached_acl(inode, type) in ->get{_inode}_acl.
         *
         * If the filesystem doesn't have a get{_inode}_acl() function at all,
         * we'll just create the negative cache entry.
         */
        if (dentry && inode->i_op->get_acl) {
                acl = inode->i_op->get_acl(idmap, dentry, type);
        } else if (inode->i_op->get_inode_acl) {
                acl = inode->i_op->get_inode_acl(inode, type, false);
        } else {
                set_cached_acl(inode, type, NULL);
                return NULL;
        }
        if (IS_ERR(acl)) {
                /*
                 * Remove our sentinel so that we don't block future attempts
                 * to cache the ACL.
                 */
                cmpxchg(p, sentinel, ACL_NOT_CACHED);
                return acl;
        }

        /*
         * Cache the result, but only if our sentinel is still in place.
         * The extra reference taken here is for the cache slot; it is
         * dropped again if the slot no longer holds our sentinel.
         */
        posix_acl_dup(acl);
        if (unlikely(!try_cmpxchg(p, &sentinel, acl)))
                posix_acl_release(acl);
        return acl;
}

/*
 * Fetch the ACL of the given @type for @inode when no dentry is available.
 * Calls __get_acl() with a NULL dentry and &nop_mnt_idmap.
 */
struct posix_acl *get_inode_acl(struct inode *inode, int type)
{
        return __get_acl(&nop_mnt_idmap, NULL, inode, type);
}
EXPORT_SYMBOL(get_inode_acl);

/*
 * Initialise a freshly allocated posix_acl: set its reference count to 1 and
 * record @count as its number of entries.  The entries themselves are not
 * touched.
 */
void
posix_acl_init(struct posix_acl *acl, int count)
{
        refcount_set(&acl->a_refcount, 1);
        acl->a_count = count;
}
EXPORT_SYMBOL(posix_acl_init);

/*
 * Allocate a new ACL with room for the specified number of entries.
 *
 * @count: number of struct posix_acl_entry slots to reserve; must not be
 *         negative
 * @flags: allocation flags handed to kmalloc()
 *
 * Returns the new ACL, initialised by posix_acl_init(), or NULL if @count is
 * out of range or the allocation fails.  The entries are left uninitialised.
 */
struct posix_acl *
posix_acl_alloc(int count, gfp_t flags)
{
        struct posix_acl *acl;
        size_t nr;

        /*
         * @count is a signed int.  A negative value would be converted to a
         * huge size_t and wrap the size computation below, producing an
         * allocation smaller than the ACL header.
         */
        if (count < 0)
                return NULL;

        /* Reject counts whose total size does not fit in a size_t. */
        nr = count;
        if (nr > (SIZE_MAX - sizeof(struct posix_acl)) /
                 sizeof(struct posix_acl_entry))
                return NULL;

        acl = kmalloc(sizeof(struct posix_acl) +
                      nr * sizeof(struct posix_acl_entry), flags);
        if (acl)
                posix_acl_init(acl, count);
        return acl;
}
EXPORT_SYMBOL(posix_acl_alloc);

/*
 * Clone an ACL.
 *
 * @acl:   ACL to duplicate; may be NULL
 * @flags: allocation flags handed to kmemdup()
 *
 * Returns a copy of @acl (header and all a_count entries) whose reference
 * count is reset to 1, or NULL if @acl is NULL or the allocation fails.
 */
struct posix_acl *
posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
{
        struct posix_acl *clone;
        size_t size;

        if (!acl)
                return NULL;

        /*
         * Keep the size in a size_t: the expression is evaluated in size_t
         * and storing it in an int could truncate it or make it negative.
         */
        size = sizeof(struct posix_acl) +
               acl->a_count * sizeof(struct posix_acl_entry);
        clone = kmemdup(acl, size, flags);
        if (clone)
                refcount_set(&clone->a_refcount, 1);
        return clone;
}
EXPORT_SYMBOL_GPL(posix_acl_clone);

/*
 * Check if an acl is valid. Returns 0 if it is, or -EINVAL otherwise.
 *
 * @user_ns: namespace in which the ids of ACL_USER / ACL_GROUP entries must
 *           have a mapping
 * @acl:     ACL to check
 *
 * The entries are walked once with a small state machine that enforces the
 * order ACL_USER_OBJ, any ACL_USER, ACL_GROUP_OBJ, any ACL_GROUP, optional
 * ACL_MASK, ACL_OTHER.  ACL_MASK becomes mandatory as soon as one ACL_USER or
 * ACL_GROUP entry has been seen.  Permission bits other than
 * ACL_READ/ACL_WRITE/ACL_EXECUTE are rejected.
 */
int
posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl)
{
        const struct posix_acl_entry *pa, *pe;
        /* Tag (or tag group) that is expected next. */
        int state = ACL_USER_OBJ;
        /* Set once a named user or group entry has been seen. */
        int needs_mask = 0;

        FOREACH_ACL_ENTRY(pa, acl, pe) {
                if (pa->e_perm & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
                        return -EINVAL;
                switch (pa->e_tag) {
                        case ACL_USER_OBJ:
                                if (state == ACL_USER_OBJ) {
                                        state = ACL_USER;
                                        break;
                                }
                                return -EINVAL;

                        case ACL_USER:
                                if (state != ACL_USER)
                                        return -EINVAL;
                                if (!kuid_has_mapping(user_ns, pa->e_uid))
                                        return -EINVAL;
                                needs_mask = 1;
                                break;

                        case ACL_GROUP_OBJ:
                                if (state == ACL_USER) {
                                        state = ACL_GROUP;
                                        break;
                                }
                                return -EINVAL;

                        case ACL_GROUP:
                                if (state != ACL_GROUP)
                                        return -EINVAL;
                                if (!kgid_has_mapping(user_ns, pa->e_gid))
                                        return -EINVAL;
                                needs_mask = 1;
                                break;

                        case ACL_MASK:
                                if (state != ACL_GROUP)
                                        return -EINVAL;
                                state = ACL_OTHER;
                                break;

                        case ACL_OTHER:
                                /* Without a mask, OTHER may only follow the
                                 * group section if no mask was required. */
                                if (state == ACL_OTHER ||
                                    (state == ACL_GROUP && !needs_mask)) {
                                        state = 0;
                                        break;
                                }
                                return -EINVAL;

                        default:
                                return -EINVAL;
                }
        }
        /* state 0 is only reached after a well-placed ACL_OTHER entry. */
        if (state == 0)
                return 0;
        return -EINVAL;
}
EXPORT_SYMBOL(posix_acl_valid);

/*
 * Returns 0 if the acl can be exactly represented in the traditional
 * file mode permission bits, or else 1. Returns -EINVAL if the acl contains
 * an entry with an unknown tag.
 *
 * @acl:    ACL to examine; a NULL acl returns 0 without touching *@mode_p
 * @mode_p: if not NULL, the S_IRWXUGO bits of *@mode_p are replaced by the
 *          permission bits derived from @acl; all other bits are preserved
 */
int
posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
{
        const struct posix_acl_entry *pa, *pe;
        umode_t mode = 0;
        int not_equiv = 0;

        /*
         * A null ACL can always be presented as mode bits.
         */
        if (!acl)
                return 0;

        FOREACH_ACL_ENTRY(pa, acl, pe) {
                switch (pa->e_tag) {
                        case ACL_USER_OBJ:
                                mode |= (pa->e_perm & S_IRWXO) << 6;
                                break;
                        case ACL_GROUP_OBJ:
                                mode |= (pa->e_perm & S_IRWXO) << 3;
                                break;
                        case ACL_OTHER:
                                mode |= pa->e_perm & S_IRWXO;
                                break;
                        case ACL_MASK:
                                /* The mask replaces the group bits gathered
                                 * so far. */
                                mode = (mode & ~S_IRWXG) |
                                       ((pa->e_perm & S_IRWXO) << 3);
                                not_equiv = 1;
                                break;
                        case ACL_USER:
                        case ACL_GROUP:
                                /* Named entries have no mode-bit equivalent. */
                                not_equiv = 1;
                                break;
                        default:
                                return -EINVAL;
                }
        }
        if (mode_p)
                *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
        return not_equiv;
}
EXPORT_SYMBOL(posix_acl_equiv_mode);

/*
 * Build a three-entry ACL (ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_OTHER) whose
 * permissions mirror the user, group and other permission bits of @mode.
 *
 * Returns the new ACL, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
struct posix_acl *
posix_acl_from_mode(umode_t mode, gfp_t flags)
{
        struct posix_acl_entry *entry;
        struct posix_acl *acl;

        acl = posix_acl_alloc(3, flags);
        if (!acl)
                return ERR_PTR(-ENOMEM);

        entry = acl->a_entries;

        entry[0].e_tag  = ACL_USER_OBJ;
        entry[0].e_perm = (mode & S_IRWXU) >> 6;

        entry[1].e_tag  = ACL_GROUP_OBJ;
        entry[1].e_perm = (mode & S_IRWXG) >> 3;

        entry[2].e_tag  = ACL_OTHER;
        entry[2].e_perm = (mode & S_IRWXO);

        return acl;
}
EXPORT_SYMBOL(posix_acl_from_mode);

/*
 * Return 0 if current is granted want access to the inode
 * by the acl. Returns -E... otherwise.
 *
 * @idmap: idmap used to translate inode and ACL ids before comparing them
 *         with the caller's fsuid / groups
 * @inode: inode being accessed
 * @acl:   ACL to evaluate
 * @want:  requested access; only MAY_READ, MAY_WRITE and MAY_EXEC are
 *         considered
 *
 * Returns -EACCES if the matching entry does not grant @want, and -EIO if the
 * ACL contains an unknown tag or ends without an ACL_OTHER entry being
 * reached.
 */
int
posix_acl_permission(struct mnt_idmap *idmap, struct inode *inode,
                     const struct posix_acl *acl, int want)
{
        const struct posix_acl_entry *pa, *pe, *mask_obj;
        struct user_namespace *fs_userns = i_user_ns(inode);
        /* Set when the caller is in at least one group named by the ACL. */
        int found = 0;
        vfsuid_t vfsuid;
        vfsgid_t vfsgid;

        want &= MAY_READ | MAY_WRITE | MAY_EXEC;

        FOREACH_ACL_ENTRY(pa, acl, pe) {
                switch(pa->e_tag) {
                        case ACL_USER_OBJ:
                                /* (May have been checked already) */
                                vfsuid = i_uid_into_vfsuid(idmap, inode);
                                if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
                                        goto check_perm;
                                break;
                        case ACL_USER:
                                vfsuid = make_vfsuid(idmap, fs_userns,
                                                     pa->e_uid);
                                if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
                                        goto mask;
                                break;
                        case ACL_GROUP_OBJ:
                                vfsgid = i_gid_into_vfsgid(idmap, inode);
                                if (vfsgid_in_group_p(vfsgid)) {
                                        found = 1;
                                        /* Keep looking if this group entry
                                         * does not grant everything wanted. */
                                        if ((pa->e_perm & want) == want)
                                                goto mask;
                                }
                                break;
                        case ACL_GROUP:
                                vfsgid = make_vfsgid(idmap, fs_userns,
                                                     pa->e_gid);
                                if (vfsgid_in_group_p(vfsgid)) {
                                        found = 1;
                                        if ((pa->e_perm & want) == want)
                                                goto mask;
                                }
                                break;
                        case ACL_MASK:
                                break;
                        case ACL_OTHER:
                                /* A group matched but none granted access:
                                 * do not fall back to the "other" entry. */
                                if (found)
                                        return -EACCES;
                                else
                                        goto check_perm;
                        default:
                                return -EIO;
                }
        }
        return -EIO;

mask:
        /*
         * Look for an ACL_MASK entry after the matching one; if present it
         * further limits the permissions of the matching entry.  Without a
         * mask, fall through to the plain permission check.
         */
        for (mask_obj = pa+1; mask_obj != pe; mask_obj++) {
                if (mask_obj->e_tag == ACL_MASK) {
                        if ((pa->e_perm & mask_obj->e_perm & want) == want)
                                return 0;
                        return -EACCES;
                }
        }

check_perm:
        if ((pa->e_perm & want) == want)
                return 0;
        return -EACCES;
}

/*
 * Modify acl when creating a new inode. The caller must ensure the acl is
 * only referenced once.
 *
 * mode_p initially must contain the mode parameter to the open() / creat()
 * system calls. All permissions that are not granted by the acl are removed.
 * The permissions in the acl are changed to reflect the mode_p parameter.
 *
 * Returns 1 if the acl contains ACL_USER, ACL_GROUP or ACL_MASK entries,
 * 0 if it does not, and -EIO if it contains an unknown tag or has neither an
 * ACL_MASK nor an ACL_GROUP_OBJ entry.  *mode_p is only written on success.
 */
static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
{
        struct posix_acl_entry *pa, *pe;
        struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
        umode_t mode = *mode_p;
        int not_equiv = 0;

        /* assert(atomic_read(acl->a_refcount) == 1); */

        FOREACH_ACL_ENTRY(pa, acl, pe) {
                switch(pa->e_tag) {
                        case ACL_USER_OBJ:
                                /* Intersect entry and user mode bits, in
                                 * both directions. */
                                pa->e_perm &= (mode >> 6) | ~S_IRWXO;
                                mode &= (pa->e_perm << 6) | ~S_IRWXU;
                                break;

                        case ACL_USER:
                        case ACL_GROUP:
                                not_equiv = 1;
                                break;

                        case ACL_GROUP_OBJ:
                                /* Handled after the loop, once we know
                                 * whether a mask entry exists. */
                                group_obj = pa;
                                break;

                        case ACL_OTHER:
                                pa->e_perm &= mode | ~S_IRWXO;
                                mode &= pa->e_perm | ~S_IRWXO;
                                break;

                        case ACL_MASK:
                                mask_obj = pa;
                                not_equiv = 1;
                                break;

                        default:
                                return -EIO;
                }
        }

        /* The group mode bits pair with the mask if there is one, otherwise
         * with the owning group entry. */
        if (mask_obj) {
                mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO;
                mode &= (mask_obj->e_perm << 3) | ~S_IRWXG;
        } else {
                if (!group_obj)
                        return -EIO;
                group_obj->e_perm &= (mode >> 3) | ~S_IRWXO;
                mode &= (group_obj->e_perm << 3) | ~S_IRWXG;
        }

        *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
        return not_equiv;
}

/*
 * Modify the ACL for the chmod syscall.
 *
 * Overwrites the permissions of the ACL_USER_OBJ and ACL_OTHER entries with
 * the user and other bits of @mode.  The group bits of @mode are written to
 * the ACL_MASK entry if the acl has one, otherwise to the ACL_GROUP_OBJ
 * entry.  ACL_USER and ACL_GROUP entries are left unchanged.
 *
 * Returns 0 on success, or -EIO if the acl contains an unknown tag or has
 * neither an ACL_MASK nor an ACL_GROUP_OBJ entry.
 */
static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
{
        struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
        struct posix_acl_entry *pa, *pe;

        /* assert(atomic_read(acl->a_refcount) == 1); */

        FOREACH_ACL_ENTRY(pa, acl, pe) {
                switch(pa->e_tag) {
                        case ACL_USER_OBJ:
                                pa->e_perm = (mode & S_IRWXU) >> 6;
                                break;

                        case ACL_USER:
                        case ACL_GROUP:
                                break;

                        case ACL_GROUP_OBJ:
                                /* Only remembered here; see below. */
                                group_obj = pa;
                                break;

                        case ACL_MASK:
                                mask_obj = pa;
                                break;

                        case ACL_OTHER:
                                pa->e_perm = (mode & S_IRWXO);
                                break;

                        default:
                                return -EIO;
                }
        }

        if (mask_obj) {
                mask_obj->e_perm = (mode & S_IRWXG) >> 3;
        } else {
                if (!group_obj)
                        return -EIO;
                group_obj->e_perm = (mode & S_IRWXG) >> 3;
        }

        return 0;
}

/*
 * Replace *@acl by a private clone adjusted for inode creation.
 *
 * The clone is passed through posix_acl_create_masq() together with @mode_p.
 * The reference to the original *@acl is always dropped.  On success *@acl
 * points to the clone and the (non-negative) result of
 * posix_acl_create_masq() is returned; on failure *@acl is set to NULL and a
 * negative errno is returned (-ENOMEM if the clone could not be allocated).
 */
int
__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
{
        struct posix_acl *copy = posix_acl_clone(*acl, gfp);
        int err;

        if (!copy) {
                err = -ENOMEM;
        } else {
                err = posix_acl_create_masq(copy, mode_p);
                if (err < 0) {
                        posix_acl_release(copy);
                        copy = NULL;
                }
        }

        posix_acl_release(*acl);
        *acl = copy;
        return err;
}
EXPORT_SYMBOL(__posix_acl_create);

/*
 * Replace *@acl by a private clone updated for a chmod to @mode.
 *
 * The clone is passed through __posix_acl_chmod_masq().  The reference to the
 * original *@acl is always dropped.  On success *@acl points to the clone and
 * 0 is returned; on failure *@acl is set to NULL and the error is returned
 * (-ENOMEM if the clone could not be allocated).
 */
int
__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
{
        struct posix_acl *copy = posix_acl_clone(*acl, gfp);
        int err;

        if (!copy) {
                err = -ENOMEM;
        } else {
                err = __posix_acl_chmod_masq(copy, mode);
                if (err) {
                        posix_acl_release(copy);
                        copy = NULL;
                }
        }

        posix_acl_release(*acl);
        *acl = copy;
        return err;
}
EXPORT_SYMBOL(__posix_acl_chmod);

/**
 * posix_acl_chmod - chmod a posix acl
 *
 * @idmap:        idmap of the mount @inode was found from
 * @dentry:        dentry to check permissions on
 * @mode:        the new mode of @inode
 *
 * If the dentry has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 if the inode is not IS_POSIXACL(), has no access ACL, or reading
 * the ACL yields -EOPNOTSUPP; -EOPNOTSUPP if the inode has no ->set_acl()
 * operation; otherwise the error from reading or updating the ACL, or the
 * result of ->set_acl().
 */
int
 posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry,
                    umode_t mode)
{
        struct inode *inode = d_inode(dentry);
        struct posix_acl *acl;
        int ret = 0;

        if (!IS_POSIXACL(inode))
                return 0;
        if (!inode->i_op->set_acl)
                return -EOPNOTSUPP;

        acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR_OR_NULL(acl)) {
                if (acl == ERR_PTR(-EOPNOTSUPP))
                        return 0;
                /* PTR_ERR(NULL) is 0, so "no ACL" is reported as success. */
                return PTR_ERR(acl);
        }

        /* On failure __posix_acl_chmod() has already dropped our reference. */
        ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
        if (ret)
                return ret;
        ret = inode->i_op->set_acl(idmap, dentry, acl, ACL_TYPE_ACCESS);
        posix_acl_release(acl);
        return ret;
}
EXPORT_SYMBOL(posix_acl_chmod);

/*
 * Compute the ACLs and mode for a new inode created in directory @dir.
 *
 * @dir:         parent directory whose default ACL is consulted
 * @mode:        in: requested mode; out: mode adjusted by the default ACL, or
 *               by the umask when @dir has no default ACL
 * @default_acl: out: default ACL for the new inode (only set when *@mode is a
 *               directory), otherwise NULL
 * @acl:         out: access ACL for the new inode, or NULL when none is
 *               needed
 *
 * Returns 0 on success or a negative errno.  On success the caller owns the
 * references returned through @default_acl and @acl.
 */
int
posix_acl_create(struct inode *dir, umode_t *mode,
                struct posix_acl **default_acl, struct posix_acl **acl)
{
        struct posix_acl *p;
        struct posix_acl *clone;
        int ret;

        *acl = NULL;
        *default_acl = NULL;

        /* Nothing to do for symlinks or when @dir is not IS_POSIXACL(). */
        if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
                return 0;

        p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
        if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
                /* No default ACL: fall back to applying the umask. */
                *mode &= ~current_umask();
                return 0;
        }
        if (IS_ERR(p))
                return PTR_ERR(p);

        ret = -ENOMEM;
        clone = posix_acl_clone(p, GFP_NOFS);
        if (!clone)
                goto err_release;

        ret = posix_acl_create_masq(clone, mode);
        if (ret < 0)
                goto err_release_clone;

        /* A result of 0 means the access ACL is not needed; drop the clone. */
        if (ret == 0)
                posix_acl_release(clone);
        else
                *acl = clone;

        /* Only directories get the default ACL handed on. */
        if (!S_ISDIR(*mode))
                posix_acl_release(p);
        else
                *default_acl = p;

        return 0;

err_release_clone:
        posix_acl_release(clone);
err_release:
        posix_acl_release(p);
        return ret;
}
EXPORT_SYMBOL_GPL(posix_acl_create);

/**
 * posix_acl_update_mode  -  update mode in set_acl
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        target inode
 * @mode_p:        mode (pointer) for update
 * @acl:        acl pointer
 *
 * Update the file mode when setting an ACL: compute the new file permission
 * bits based on the ACL.  In addition, if the ACL is equivalent to the new
 * file mode, set *@acl to NULL to indicate that no ACL should be set.
 *
 * As with chmod, clear the setgid bit if the caller is not in the owning group
 * or capable of CAP_FSETID (see inode_change_ok).
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Called from set_acl inode operations.
 */
int posix_acl_update_mode(struct mnt_idmap *idmap,
                          struct inode *inode, umode_t *mode_p,
                          struct posix_acl **acl)
{
        /* Work on a copy so *mode_p is only written once everything succeeded. */
        umode_t new_mode = inode->i_mode;
        int equiv = posix_acl_equiv_mode(*acl, &new_mode);

        if (equiv < 0)
                return equiv;

        /* A zero result means no ACL should be set; report that via *acl. */
        if (!equiv)
                *acl = NULL;

        /*
         * Strip setgid unless the caller is in the owning group or holds
         * CAP_FSETID with respect to this inode.
         */
        if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode))) {
                if (!capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID))
                        new_mode &= ~S_ISGID;
        }

        *mode_p = new_mode;
        return 0;
}
EXPORT_SYMBOL(posix_acl_update_mode);

/*
 * Validate the header of a POSIX ACL extended attribute and return the number
 * of ACL entries it holds (0 for none), or a negative errno.
 */
static int posix_acl_fix_xattr_common(const void *value, size_t size)
{
        const struct posix_acl_xattr_header *hdr = value;
        int nr_entries;

        /* A missing buffer, or one too short to hold the header, is invalid. */
        if (!hdr || size < sizeof(struct posix_acl_xattr_header))
                return -EINVAL;

        /* Only the current xattr ACL version is understood. */
        if (hdr->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
                return -EOPNOTSUPP;

        /* A negative count is rejected; zero and positive counts pass through. */
        nr_entries = posix_acl_xattr_count(size);
        return nr_entries < 0 ? -EINVAL : nr_entries;
}

/**
 * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format
 * @userns: the filesystem's idmapping
 * @value: the uapi representation of POSIX ACLs
 * @size: the size of @value
 *
 * Filesystems that store POSIX ACLs in the unaltered uapi format should use
 * posix_acl_from_xattr() when reading them from the backing store and
 * converting them into the struct posix_acl VFS format. The helper is
 * specifically intended to be called from the acl inode operation.
 *
 * The posix_acl_from_xattr() function will map the raw {g,u}id values stored
 * in ACL_{GROUP,USER} entries into idmapping in @userns.
 *
 * Note that posix_acl_from_xattr() does not take idmapped mounts into account.
 * If it did, calling it from the get acl inode operation would return POSIX
 * ACLs mapped according to an idmapped mount which would mean that the value
 * couldn't be cached for the filesystem. Idmapped mounts are taken into
 * account on the fly during permission checking or right at the VFS -
 * userspace boundary before reporting them to the user.
 *
 * Return: Allocated struct posix_acl on success, NULL for a valid header but
 *         without actual POSIX ACL entries, or ERR_PTR() encoded error code.
 */
struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns,
                                       const void *value, size_t size)
{
        const struct posix_acl_xattr_header *header = value;
        /* The on-disk entries start immediately after the header. */
        const struct posix_acl_xattr_entry *xe = (const void *)(header + 1);
        struct posix_acl *acl;
        int count, i;

        /* Validate the header and learn how many entries follow it. */
        count = posix_acl_fix_xattr_common(value, size);
        if (count < 0)
                return ERR_PTR(count);
        if (!count)
                return NULL;

        acl = posix_acl_alloc(count, GFP_NOFS);
        if (!acl)
                return ERR_PTR(-ENOMEM);

        for (i = 0; i < count; i++) {
                struct posix_acl_entry *ke = &acl->a_entries[i];

                ke->e_tag = le16_to_cpu(xe[i].e_tag);
                ke->e_perm = le16_to_cpu(xe[i].e_perm);

                switch (ke->e_tag) {
                case ACL_USER_OBJ:
                case ACL_GROUP_OBJ:
                case ACL_MASK:
                case ACL_OTHER:
                        /* These tags carry no id to translate. */
                        break;
                case ACL_USER:
                        ke->e_uid = make_kuid(userns,
                                              le32_to_cpu(xe[i].e_id));
                        if (!uid_valid(ke->e_uid))
                                goto fail;
                        break;
                case ACL_GROUP:
                        ke->e_gid = make_kgid(userns,
                                              le32_to_cpu(xe[i].e_id));
                        if (!gid_valid(ke->e_gid))
                                goto fail;
                        break;
                default:
                        /* Unknown tag: reject the whole ACL. */
                        goto fail;
                }
        }
        return acl;

fail:
        posix_acl_release(acl);
        return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL (posix_acl_from_xattr);

/*
 * Convert from in-memory to extended attribute representation.
 */
int
posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
                   void *buffer, size_t size)
{
        struct posix_acl_xattr_header *hdr = buffer;
        struct posix_acl_xattr_entry *out;
        int needed, i;

        /* With no buffer the caller only wants to know the required size. */
        needed = posix_acl_xattr_size(acl->a_count);
        if (!buffer)
                return needed;
        if (needed > size)
                return -ERANGE;

        hdr->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
        /* Entries are laid out directly behind the header. */
        out = (void *)(hdr + 1);

        for (i = 0; i < acl->a_count; i++) {
                const struct posix_acl_entry *in = &acl->a_entries[i];

                out[i].e_tag = cpu_to_le16(in->e_tag);
                out[i].e_perm = cpu_to_le16(in->e_perm);

                /* Only ACL_USER and ACL_GROUP entries carry a real id. */
                if (in->e_tag == ACL_USER)
                        out[i].e_id = cpu_to_le32(from_kuid(user_ns,
                                                            in->e_uid));
                else if (in->e_tag == ACL_GROUP)
                        out[i].e_id = cpu_to_le32(from_kgid(user_ns,
                                                            in->e_gid));
                else
                        out[i].e_id = cpu_to_le32(ACL_UNDEFINED_ID);
        }
        return needed;
}
EXPORT_SYMBOL (posix_acl_to_xattr);

/**
 * vfs_posix_acl_to_xattr - convert from kernel to userspace representation
 * @idmap: idmap of the mount
 * @inode: inode the posix acls are set on
 * @acl: the posix acls as represented by the vfs
 * @buffer: the buffer into which to convert @acl
 * @size: size of @buffer
 *
 * This converts @acl from the VFS representation in the filesystem idmapping
 * to the uapi form reportable to userspace. And mount and caller idmappings
 * are handled appropriately.
 *
 * Return: On success, the size of the stored uapi posix acls, on error a
 * negative errno.
 */
static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap,
                                      struct inode *inode,
                                      const struct posix_acl *acl, void *buffer,
                                      size_t size)
{
        struct posix_acl_xattr_header *hdr = buffer;
        struct posix_acl_xattr_entry *out;
        struct user_namespace *fs_userns, *caller_userns;
        ssize_t needed, i;

        /* With no buffer the caller only wants to know the required size. */
        needed = posix_acl_xattr_size(acl->a_count);
        if (!buffer)
                return needed;
        if (needed > size)
                return -ERANGE;

        /* Entries are laid out directly behind the header. */
        out = (void *)(hdr + 1);
        hdr->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);

        fs_userns = i_user_ns(inode);
        caller_userns = current_user_ns();

        for (i = 0; i < acl->a_count; i++) {
                const struct posix_acl_entry *in = &acl->a_entries[i];

                out[i].e_tag = cpu_to_le16(in->e_tag);
                out[i].e_perm = cpu_to_le16(in->e_perm);

                /*
                 * Ids go through two steps: first the mount idmapping, then
                 * the caller's user namespace.
                 */
                if (in->e_tag == ACL_USER) {
                        vfsuid_t vfsuid = make_vfsuid(idmap, fs_userns,
                                                      in->e_uid);

                        out[i].e_id = cpu_to_le32(from_kuid(caller_userns,
                                                vfsuid_into_kuid(vfsuid)));
                } else if (in->e_tag == ACL_GROUP) {
                        vfsgid_t vfsgid = make_vfsgid(idmap, fs_userns,
                                                      in->e_gid);

                        out[i].e_id = cpu_to_le32(from_kgid(caller_userns,
                                                vfsgid_into_kgid(vfsgid)));
                } else {
                        out[i].e_id = cpu_to_le32(ACL_UNDEFINED_ID);
                }
        }
        return needed;
}

/*
 * Check that @acl may be set on @dentry's inode and hand it to the
 * filesystem's ->set_acl() inode operation.  A NULL @acl skips validation
 * and is passed through unchanged.
 */
int
set_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry,
              int type, struct posix_acl *acl)
{
        struct inode *inode = d_inode(dentry);
        int err;

        /* The inode must support POSIX ACLs and provide a setter. */
        if (!IS_POSIXACL(inode) || !inode->i_op->set_acl)
                return -EOPNOTSUPP;

        /* Default ACLs on a non-directory: setting fails, clearing is a no-op. */
        if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) {
                if (acl)
                        return -EACCES;
                return 0;
        }

        if (!inode_owner_or_capable(idmap, inode))
                return -EPERM;

        if (acl) {
                err = posix_acl_valid(inode->i_sb->s_user_ns, acl);
                if (err)
                        return err;
        }

        return inode->i_op->set_acl(idmap, dentry, acl, type);
}
EXPORT_SYMBOL(set_posix_acl);

/*
 * Add the POSIX ACL xattr names to a listxattr buffer: one name for each of
 * inode->i_acl and inode->i_default_acl that is non-NULL.  Returns 0 on
 * success or the first error reported by xattr_list_one().
 */
int posix_acl_listxattr(struct inode *inode, char **buffer,
                        ssize_t *remaining_size)
{
        int err = 0;

        if (!IS_POSIXACL(inode))
                return 0;

        if (inode->i_acl)
                err = xattr_list_one(buffer, remaining_size,
                                     XATTR_NAME_POSIX_ACL_ACCESS);

        /* Only look at the default ACL if the access name was listed fine. */
        if (!err && inode->i_default_acl)
                err = xattr_list_one(buffer, remaining_size,
                                     XATTR_NAME_POSIX_ACL_DEFAULT);

        return err;
}

/* ->list() callback: true when the dentry's backing inode has IS_POSIXACL set. */
static bool
posix_acl_xattr_list(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        return IS_POSIXACL(inode);
}

/*
 * nop_posix_acl_access - legacy xattr handler for access POSIX ACLs
 *
 * This is the legacy POSIX ACL access xattr handler. It is used by some
 * filesystems to implement their ->listxattr() inode operation. New code
 * should never use it.
 *
 * Only .name and .list are populated; no other handler callback is set here.
 */
const struct xattr_handler nop_posix_acl_access = {
        .name = XATTR_NAME_POSIX_ACL_ACCESS,
        .list = posix_acl_xattr_list,
};
EXPORT_SYMBOL_GPL(nop_posix_acl_access);

/*
 * nop_posix_acl_default - legacy xattr handler for default POSIX ACLs
 *
 * This is the legacy POSIX ACL default xattr handler. It is used by some
 * filesystems to implement their ->listxattr() inode operation. New code
 * should never use it.
 *
 * Only .name and .list are populated; no other handler callback is set here.
 */
const struct xattr_handler nop_posix_acl_default = {
        .name = XATTR_NAME_POSIX_ACL_DEFAULT,
        .list = posix_acl_xattr_list,
};
EXPORT_SYMBOL_GPL(nop_posix_acl_default);

/*
 * ->set_acl() implementation that stores the ACL only in the inode's ACL
 * cache.  For access ACLs the inode mode is updated first via
 * posix_acl_update_mode(), which may also replace @acl with NULL.
 */
int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                   struct posix_acl *acl, int type)
{
        struct inode *inode = d_inode(dentry);

        if (type == ACL_TYPE_ACCESS) {
                int err = posix_acl_update_mode(idmap, inode,
                                                &inode->i_mode, &acl);

                if (err)
                        return err;
        }

        /* Record the change: bump ctime and, where enabled, i_version. */
        inode_set_ctime_current(inode);
        if (IS_I_VERSION(inode))
                inode_inc_iversion(inode);

        set_cached_acl(inode, type, acl);
        return 0;
}

/*
 * Compute the ACLs a new @inode gets from @dir via posix_acl_create() and
 * install them in the inode's ACL cache.  Returns 0 or the error from
 * posix_acl_create().
 */
int simple_acl_create(struct inode *dir, struct inode *inode)
{
        struct posix_acl *default_acl, *acl;
        int err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);

        if (err)
                return err;

        set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl);
        set_cached_acl(inode, ACL_TYPE_ACCESS, acl);

        /*
         * Drop the references handed back by posix_acl_create().
         * NOTE(review): presumably the cache holds its own references --
         * confirm against set_cached_acl().
         */
        if (default_acl)
                posix_acl_release(default_acl);
        if (acl)
                posix_acl_release(acl);

        return 0;
}

/*
 * Rewrite, in place, the ids of all ACL_USER and ACL_GROUP entries of @acl
 * from mount-specific vfs{u,g}ids to filesystem k{u,g}ids.  Entries with
 * any other tag are left untouched.  Always returns 0.
 */
static int vfs_set_acl_idmapped_mnt(struct mnt_idmap *idmap,
                                    struct user_namespace *fs_userns,
                                    struct posix_acl *acl)
{
        struct posix_acl_entry *pa;
        int i;

        for (i = 0; i < acl->a_count; i++) {
                pa = &acl->a_entries[i];

                if (pa->e_tag == ACL_USER)
                        pa->e_uid = from_vfsuid(idmap, fs_userns,
                                                VFSUIDT_INIT(pa->e_uid));
                else if (pa->e_tag == ACL_GROUP)
                        pa->e_gid = from_vfsgid(idmap, fs_userns,
                                                VFSGIDT_INIT(pa->e_gid));
        }

        return 0;
}

/**
 * vfs_set_acl - set posix acls
 * @idmap: idmap of the mount
 * @dentry: the dentry based on which to set the posix acls
 * @acl_name: the name of the posix acl
 * @kacl: the posix acls in the appropriate VFS format
 *
 * This function sets @kacl. The caller must call posix_acl_release() on @kacl
 * afterwards.
 *
 * Return: On success 0, on error negative errno.
 */
int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                const char *acl_name, struct posix_acl *kacl)
{
        int acl_type;
        int error;
        struct inode *inode = d_inode(dentry);
        struct inode *delegated_inode = NULL;

        /* Map the xattr name to an ACL type; unknown names are rejected. */
        acl_type = posix_acl_type(acl_name);
        if (acl_type < 0)
                return -EINVAL;

        if (kacl) {
                /*
                 * If we're on an idmapped mount translate from mount specific
                 * vfs{g,u}id_t into global filesystem k{g,u}id_t.
                 * Afterwards we can cache the POSIX ACLs filesystem wide and -
                 * if this is a filesystem with a backing store - ultimately
                 * translate them to backing store values.
                 *
                 * This modifies @kacl in place and sits above the retry label,
                 * so the translation is applied only once.
                 */
                error = vfs_set_acl_idmapped_mnt(idmap, i_user_ns(inode), kacl);
                if (error)
                        return error;
        }

retry_deleg:
        /* All checks and the actual update run under the inode lock. */
        inode_lock(inode);

        /*
         * We only care about restrictions the inode struct itself places upon
         * us otherwise POSIX ACLs aren't subject to any VFS restrictions.
         */
        error = may_write_xattr(idmap, inode);
        if (error)
                goto out_inode_unlock;

        error = security_inode_set_acl(idmap, dentry, acl_name, kacl);
        if (error)
                goto out_inode_unlock;

        /* May set @delegated_inode, which is handled after unlocking below. */
        error = try_break_deleg(inode, &delegated_inode);
        if (error)
                goto out_inode_unlock;

        if (likely(!is_bad_inode(inode)))
                error = set_posix_acl(idmap, dentry, acl_type, kacl);
        else
                error = -EIO;
        /* Notifications and the LSM post hook only fire on success. */
        if (!error) {
                fsnotify_xattr(dentry);
                security_inode_post_set_acl(dentry, acl_name, kacl);
        }

out_inode_unlock:
        inode_unlock(inode);

        /*
         * If a delegation was found, wait for it to be broken outside the
         * inode lock and, when the wait succeeds, redo the whole locked
         * section.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }

        return error;
}
EXPORT_SYMBOL_GPL(vfs_set_acl);

/**
 * vfs_get_acl - get posix acls
 * @idmap: idmap of the mount
 * @dentry: the dentry based on which to retrieve the posix acls
 * @acl_name: the name of the posix acl
 *
 * This function retrieves the POSIX ACLs from the filesystem. The caller must
 * call posix_acl_release() on the returned ACLs.
 *
 * Return: On success POSIX ACLs in VFS format, on error negative errno.
 */
struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap,
                              struct dentry *dentry, const char *acl_name)
{
        struct inode *inode = d_inode(dentry);
        struct posix_acl *acl;
        int acl_type, error;

        /* Map the xattr name to an ACL type; unknown names are rejected. */
        acl_type = posix_acl_type(acl_name);
        if (acl_type < 0)
                return ERR_PTR(-EINVAL);

        /*
         * Reading POSIX ACLs is not restricted by the VFS itself, so no
         * xattr_permission()-style check is done; only the LSM hook is
         * consulted.
         */
        error = security_inode_get_acl(idmap, dentry, acl_name);
        if (error)
                return ERR_PTR(error);

        /* Inodes without POSIX ACL support, and symlinks, have no ACLs. */
        if (!IS_POSIXACL(inode) || S_ISLNK(inode->i_mode))
                return ERR_PTR(-EOPNOTSUPP);

        /* Error pointers pass through unchanged; NULL is reported as -ENODATA. */
        acl = __get_acl(idmap, dentry, inode, acl_type);
        return acl ? acl : ERR_PTR(-ENODATA);
}
EXPORT_SYMBOL_GPL(vfs_get_acl);

/**
 * vfs_remove_acl - remove posix acls
 * @idmap: idmap of the mount
 * @dentry: the dentry based on which to remove the posix acls
 * @acl_name: the name of the posix acl
 *
 * This function removes posix acls.
 *
 * Return: On success 0, on error negative errno.
 */
int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                   const char *acl_name)
{
        int acl_type;
        int error;
        struct inode *inode = d_inode(dentry);
        struct inode *delegated_inode = NULL;

        /* Map the xattr name to an ACL type; unknown names are rejected. */
        acl_type = posix_acl_type(acl_name);
        if (acl_type < 0)
                return -EINVAL;

retry_deleg:
        /* All checks and the actual removal run under the inode lock. */
        inode_lock(inode);

        /*
         * We only care about restrictions the inode struct itself places upon
         * us otherwise POSIX ACLs aren't subject to any VFS restrictions.
         */
        error = may_write_xattr(idmap, inode);
        if (error)
                goto out_inode_unlock;

        error = security_inode_remove_acl(idmap, dentry, acl_name);
        if (error)
                goto out_inode_unlock;

        /* May set @delegated_inode, which is handled after unlocking below. */
        error = try_break_deleg(inode, &delegated_inode);
        if (error)
                goto out_inode_unlock;

        /* Removal is expressed as setting a NULL ACL of the given type. */
        if (likely(!is_bad_inode(inode)))
                error = set_posix_acl(idmap, dentry, acl_type, NULL);
        else
                error = -EIO;
        /* Notifications and the LSM post hook only fire on success. */
        if (!error) {
                fsnotify_xattr(dentry);
                security_inode_post_remove_acl(idmap, dentry, acl_name);
        }

out_inode_unlock:
        inode_unlock(inode);

        /*
         * If a delegation was found, wait for it to be broken outside the
         * inode lock and, when the wait succeeds, redo the whole locked
         * section.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }

        return error;
}
EXPORT_SYMBOL_GPL(vfs_remove_acl);

/*
 * Convert the uapi ACL blob in @kvalue (@size bytes) to VFS format and set
 * it through vfs_set_acl().  A zero @size passes a NULL ACL down.
 */
int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
               const char *acl_name, const void *kvalue, size_t size)
{
        struct posix_acl *kacl = NULL;
        int ret;

        if (size) {
                /*
                 * posix_acl_from_xattr() allocates with GFP_NOFS, which is
                 * probably stricter than this call site needs.
                 */
                kacl = posix_acl_from_xattr(current_user_ns(), kvalue, size);
                if (IS_ERR(kacl))
                        return PTR_ERR(kacl);
        }

        ret = vfs_set_acl(idmap, dentry, acl_name, kacl);
        /* The converted ACL is released here whether or not the set worked. */
        posix_acl_release(kacl);
        return ret;
}

/*
 * Fetch the ACL named @acl_name through vfs_get_acl() and serialise it into
 * @kvalue (@size bytes) in uapi format.  Returns what
 * vfs_posix_acl_to_xattr() returns, or the error from vfs_get_acl().
 */
ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                   const char *acl_name, void *kvalue, size_t size)
{
        struct posix_acl *kacl = vfs_get_acl(idmap, dentry, acl_name);
        ssize_t ret;

        if (IS_ERR(kacl))
                return PTR_ERR(kacl);

        ret = vfs_posix_acl_to_xattr(idmap, d_inode(dentry), kacl,
                                     kvalue, size);
        /* The fetched ACL is released once it has been serialised. */
        posix_acl_release(kacl);
        return ret;
}















































































































































































































































































































































    1 


































































    1 








    1 







    1 


































    1 

































    1 













    1 










    1 












    1 



    1 

    1 






    1 
















    1 



    1 








    1 













    1 





    1 





    1 



    1 
    1 















    1 


    1 
































































































































    1 





    1 


















    1 





















































    1 





    1 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

    1 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/fat/inode.c
 *
 *  Written 1992,1993 by Werner Almesberger
 *  VFAT extensions by Gordon Chaffee, merged with msdos fs by Henrik Storner
 *  Rewritten for the constant inumbers support by Al Viro
 *
 *  Fixes:
 *
 *        Max Cohan: Fixed invalid FSINFO offset when info_sector is 0
 */

#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/vfs.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/uio.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <asm/unaligned.h>
#include <linux/random.h>
#include <linux/iversion.h>
#include "fat.h"

#ifndef CONFIG_FAT_DEFAULT_IOCHARSET
/* If the user doesn't select VFAT, this is undefined. */
#define CONFIG_FAT_DEFAULT_IOCHARSET        ""
#endif

/*
 * Number of sectors per KiB; used below to express floppy sizes in sectors.
 * NOTE(review): a value of 2 presumably assumes 512-byte sectors -- confirm.
 */
#define KB_IN_SECTORS 2

/*
 * DOS dates from 1980/1/1 through 2107/12/31
 *
 * The date values are packed as (year offset << 9 | month << 5 | day); going
 * by the range above, the year offset counts from 1980.
 * NOTE(review): FAT_TIME_MAX looks like (hour << 11 | minute << 5 | seconds/2),
 * i.e. 23:59:58 -- confirm against the on-disk format.
 */
#define FAT_DATE_MIN (0<<9 | 1<<5 | 1)
#define FAT_DATE_MAX (127<<9 | 12<<5 | 31)
#define FAT_TIME_MAX (23<<11 | 59<<5 | 29)

/*
 * A deserialized copy of the on-disk structure laid out in struct
 * fat_boot_sector.
 */
struct fat_bios_param_block {
        /*
         * NOTE(review): the fields without a fat16_/fat32_ prefix look like
         * the part of the boot sector shared by every FAT flavour; the code
         * that fills this struct is not visible here - confirm there.
         */
        u16        fat_sector_size;
        u8        fat_sec_per_clus;
        u16        fat_reserved;
        u8        fat_fats;
        u16        fat_dir_entries;
        u16        fat_sectors;
        u16        fat_fat_length;
        u32        fat_total_sect;

        /* presumably mirrors the fat16 part of struct fat_boot_sector */
        u8        fat16_state;
        u32        fat16_vol_id;

        /* presumably mirrors the fat32 part of struct fat_boot_sector */
        u32        fat32_length;
        u32        fat32_root_cluster;
        u16        fat32_info_sector;
        u8        fat32_state;
        u32        fat32_vol_id;
};

/* Built-in default codepage, taken from the kernel configuration. */
static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE;
/*
 * Built-in default iocharset.  fat_reset_iocharset() compares the option
 * pointer against this array's address and only kfree()s the option string
 * when it points somewhere else.
 */
static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET;

/*
 * Table of default filesystem parameters, one entry per floppy capacity
 * (160K, 180K, 320K and 360K).  nr_sectors is the capacity in KB multiplied
 * by KB_IN_SECTORS.
 *
 * NOTE(review): the consumer of this table is outside this section;
 * presumably an entry is selected by matching the device's sector count -
 * confirm at the use site.
 */
static struct fat_floppy_defaults {
        unsigned nr_sectors;
        unsigned sec_per_clus;
        unsigned dir_entries;
        unsigned media;
        unsigned fat_length;
} floppy_defaults[] = {
{
        /* 160K */
        .nr_sectors = 160 * KB_IN_SECTORS,
        .sec_per_clus = 1,
        .dir_entries = 64,
        .media = 0xFE,
        .fat_length = 1,
},
{
        /* 180K */
        .nr_sectors = 180 * KB_IN_SECTORS,
        .sec_per_clus = 1,
        .dir_entries = 64,
        .media = 0xFC,
        .fat_length = 2,
},
{
        /* 320K */
        .nr_sectors = 320 * KB_IN_SECTORS,
        .sec_per_clus = 2,
        .dir_entries = 112,
        .media = 0xFF,
        .fat_length = 1,
},
{
        /* 360K */
        .nr_sectors = 360 * KB_IN_SECTORS,
        .sec_per_clus = 2,
        .dir_entries = 112,
        .media = 0xFD,
        .fat_length = 2,
},
};

/*
 * Allocate one cluster and link it onto @inode's cluster chain.
 *
 * Returns 0 on success, otherwise the error reported by
 * fat_alloc_clusters() or fat_chain_add().  When linking fails the
 * freshly allocated cluster is handed back via fat_free_clusters().
 */
int fat_add_cluster(struct inode *inode)
{
        int new_cluster;
        int ret;

        ret = fat_alloc_clusters(inode, &new_cluster, 1);
        if (ret)
                return ret;

        /*
         * FIXME: this cluster should be added after the data of this
         * cluster is written
         */
        ret = fat_chain_add(inode, new_cluster, 1);
        if (!ret)
                return 0;

        /* Linking failed: do not leak the cluster we just allocated. */
        fat_free_clusters(inode, new_cluster);
        return ret;
}

/*
 * Map file block @iblock of @inode into @bh_result, allocating a new
 * cluster when @create is set and the block is not backed yet.
 *
 * @max_blocks is in/out: on entry the number of blocks the caller can
 * take, on return clamped to the number of blocks actually mapped.
 *
 * Returns 0 on success (with @bh_result left unmapped if the block has no
 * backing and @create is 0), a negative error from fat_bmap() or
 * fat_add_cluster(), or -EIO when an inconsistency is detected.
 */
static inline int __fat_get_block(struct inode *inode, sector_t iblock,
                                  unsigned long *max_blocks,
                                  struct buffer_head *bh_result, int create)
{
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        unsigned long mapped_blocks;
        sector_t phys, last_block;
        int err, offset;

        /* First attempt: a non-zero phys means the block is already mapped. */
        err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
        if (err)
                return err;
        if (phys) {
                map_bh(bh_result, sb, phys);
                *max_blocks = min(mapped_blocks, *max_blocks);
                return 0;
        }
        /* Not mapped and the caller did not ask for allocation. */
        if (!create)
                return 0;

        /* Allocation is only accepted for the block right at mmu_private. */
        if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) {
                fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)",
                        MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
                return -EIO;
        }

        /* i_blocks counts 512-byte units; convert it to filesystem blocks. */
        last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
        /*
         * Index of iblock inside its cluster.
         * NOTE(review): the mask assumes sec_per_clus is a power of two.
         */
        offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
        /*
         * allocate a cluster according to the following.
         * 1) no more available blocks
         * 2) not part of fallocate region
         */
        if (!offset && !(iblock < last_block)) {
                /* TODO: multiple cluster allocation would be desirable. */
                err = fat_add_cluster(inode);
                if (err)
                        return err;
        }
        /* available blocks on this cluster */
        mapped_blocks = sbi->sec_per_clus - offset;

        *max_blocks = min(mapped_blocks, *max_blocks);
        /* Advance mmu_private past the blocks being handed out. */
        MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;

        /* Second attempt: the block must resolve to a sector now. */
        err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
        if (err)
                return err;
        if (!phys) {
                fat_fs_error(sb,
                             "invalid FAT chain (i_pos %lld, last_block %llu)",
                             MSDOS_I(inode)->i_pos,
                             (unsigned long long)last_block);
                return -EIO;
        }

        /* The second lookup must agree with the count computed above. */
        BUG_ON(*max_blocks != mapped_blocks);
        set_buffer_new(bh_result);
        map_bh(bh_result, sb, phys);

        return 0;
}

/*
 * get_block callback: map @iblock of @inode into @bh_result.
 *
 * The request size is taken from bh_result->b_size (in bytes) and, on
 * success, b_size is rewritten to the number of bytes actually mapped.
 * Returns 0 or the error from __fat_get_block().
 */
static int fat_get_block(struct inode *inode, sector_t iblock,
                         struct buffer_head *bh_result, int create)
{
        unsigned long nr_blocks = bh_result->b_size >> inode->i_blkbits;
        int ret;

        ret = __fat_get_block(inode, iblock, &nr_blocks, bh_result, create);
        if (!ret)
                bh_result->b_size = nr_blocks << inode->i_sb->s_blocksize_bits;

        return ret;
}

/*
 * ->writepages: delegate to mpage_writepages(), with fat_get_block() as
 * the block-mapping callback.
 */
static int fat_writepages(struct address_space *mapping,
                          struct writeback_control *wbc)
{
        int ret = mpage_writepages(mapping, wbc, fat_get_block);

        return ret;
}

/*
 * ->read_folio: delegate to mpage_read_folio() with fat_get_block() as the
 * block-mapping callback.  @file is not used.
 */
static int fat_read_folio(struct file *file, struct folio *folio)
{
        int ret = mpage_read_folio(folio, fat_get_block);

        return ret;
}

/*
 * ->readahead: delegate to mpage_readahead() with fat_get_block() as the
 * block-mapping callback.
 */
static void fat_readahead(struct readahead_control *rac)
{
        mpage_readahead(rac, fat_get_block);
}

/*
 * Undo the effects of a failed write that was meant to end at offset @to.
 *
 * Only acts when @to lies beyond the current i_size: the page cache and
 * the on-disk blocks are both truncated back to i_size.
 */
static void fat_write_failed(struct address_space *mapping, loff_t to)
{
        struct inode *inode = mapping->host;

        if (to <= inode->i_size)
                return;

        truncate_pagecache(inode, inode->i_size);
        fat_truncate_blocks(inode, inode->i_size);
}

/*
 * ->write_begin: prepare a buffered write of @len bytes at @pos.
 *
 * The work is done by cont_write_begin(), which is handed the inode's
 * mmu_private field.  On failure anything set up beyond i_size is rolled
 * back through fat_write_failed().
 */
static int fat_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct page **pagep, void **fsdata)
{
        struct msdos_inode_info *ei = MSDOS_I(mapping->host);
        int ret;

        *pagep = NULL;
        ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
                               fat_get_block, &ei->mmu_private);
        if (ret >= 0)
                return ret;

        fat_write_failed(mapping, pos + len);
        return ret;
}

/*
 * ->write_end: finish a buffered write through generic_write_end().
 *
 * If the result compares below @len the write is treated as failed and
 * rolled back.  On a non-negative result, the archive attribute is set
 * (and ctime/mtime updated, inode dirtied) unless it was already set.
 * Returns whatever generic_write_end() returned.
 */
static int fat_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *pagep, void *fsdata)
{
        struct inode *inode = mapping->host;
        int ret = generic_write_end(file, mapping, pos, len, copied, pagep,
                                    fsdata);

        /*
         * NOTE(review): ret is int and len is unsigned, so this comparison
         * is carried out in unsigned arithmetic.
         */
        if (ret < len)
                fat_write_failed(mapping, pos + len);

        if (ret < 0 || (MSDOS_I(inode)->i_attrs & ATTR_ARCH))
                return ret;

        fat_truncate_time(inode, NULL, S_CTIME|S_MTIME);
        MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
        mark_inode_dirty(inode);

        return ret;
}

/*
 * ->direct_IO: perform direct I/O through blockdev_direct_IO().
 *
 * A write that would reach beyond mmu_private is refused by returning 0
 * (see the FIXME below).  A write that fails is rolled back through
 * fat_write_failed().
 */
static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        struct inode *inode = mapping->host;
        size_t count = iov_iter_count(iter);
        loff_t offset = iocb->ki_pos;
        /* Kept as loff_t so the comparison below stays signed. */
        loff_t end = offset + count;
        ssize_t ret;

        /*
         * FIXME: blockdev_direct_IO() doesn't use ->write_begin(),
         * so we need to update the ->mmu_private to block boundary.
         *
         * But we must fill the remaining area or hole by nul for
         * updating ->mmu_private.
         *
         * Return 0, and fallback to normal buffered write.
         */
        if (iov_iter_rw(iter) == WRITE && MSDOS_I(inode)->mmu_private < end)
                return 0;

        /*
         * FAT need to use the DIO_LOCKING for avoiding the race
         * condition of fat_get_block() and ->truncate().
         */
        ret = blockdev_direct_IO(iocb, inode, iter, fat_get_block);
        if (ret >= 0)
                return ret;

        if (iov_iter_rw(iter) == WRITE)
                fat_write_failed(mapping, end);

        return ret;
}

/*
 * get_block callback used for ->bmap lookups only.
 *
 * Never allocates: @create must be 0 (enforced with BUG_ON).  When
 * fat_bmap() reports a sector, @bh_result is mapped to it and the block
 * count is clamped to what is mapped; b_size is always rewritten from the
 * resulting block count.  Returns 0 or the error from fat_bmap().
 */
static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
                struct buffer_head *bh_result, int create)
{
        struct super_block *sb = inode->i_sb;
        unsigned long nr_blocks = bh_result->b_size >> inode->i_blkbits;
        unsigned long mapped_blocks;
        sector_t phys;
        int ret;

        BUG_ON(create != 0);

        ret = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, true);
        if (ret)
                return ret;

        if (phys) {
                map_bh(bh_result, sb, phys);
                if (mapped_blocks < nr_blocks)
                        nr_blocks = mapped_blocks;
        }

        bh_result->b_size = nr_blocks << sb->s_blocksize_bits;

        return 0;
}

/*
 * ->bmap: translate file block @block through generic_block_bmap(),
 * holding the inode's truncate_lock for reading around the lookup.
 */
static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
{
        struct msdos_inode_info *ei = MSDOS_I(mapping->host);
        sector_t result;

        /* fat_get_cluster() assumes the requested blocknr isn't truncated. */
        down_read(&ei->truncate_lock);
        result = generic_block_bmap(mapping, block, fat_get_block_bmap);
        up_read(&ei->truncate_lock);

        return result;
}

/*
 * fat_block_truncate_page() zeroes the mapping from file offset `from'
 * up to the end of the block that contains `from'.
 *
 * Truncate needs this so that the tail of that block is physically
 * zeroed and cannot expose stale data if the file is grown again later.
 * It also avoids fsx failures for "data past EOF".
 *
 * Returns the result of block_truncate_page().
 */
int fat_block_truncate_page(struct inode *inode, loff_t from)
{
        struct address_space *mapping = inode->i_mapping;

        return block_truncate_page(mapping, from, fat_get_block);
}

/*
 * Address-space operations for FAT.  fat_fill_inode() installs this table
 * on inodes that are not directories.  Entries that do not name a fat_*
 * function are generic helpers defined elsewhere in the kernel.
 */
static const struct address_space_operations fat_aops = {
        .dirty_folio        = block_dirty_folio,
        .invalidate_folio = block_invalidate_folio,
        .read_folio        = fat_read_folio,
        .readahead        = fat_readahead,
        .writepages        = fat_writepages,
        .write_begin        = fat_write_begin,
        .write_end        = fat_write_end,
        .direct_IO        = fat_direct_IO,
        .bmap                = _fat_bmap,
        .migrate_folio        = buffer_migrate_folio,
};

/*
 * New FAT inode stuff. We do the following:
 *        a) i_ino is constant and has nothing to do with on-disk location.
 *        b) FAT manages its own cache of directory entries.
 *        c) *This* cache is indexed by on-disk location.
 *        d) inode has an associated directory entry, all right, but
 *                it may be unhashed.
 *        e) currently entries are stored within struct inode. That should
 *                change.
 *        f) we deal with races in the following way:
 *                1. readdir() and lookup() do FAT-dir-cache lookup.
 *                2. rename() unhashes the F-d-c entry and rehashes it in
 *                        a new place.
 *                3. unlink() and rmdir() unhash F-d-c entry.
 *                4. fat_write_inode() checks whether the thing is unhashed.
 *                        If it is we silently return. If it isn't we do bread(),
 *                        check if the location is still valid and retry if it
 *                        isn't. Otherwise we do changes.
 *                5. Spinlock is used to protect hash/unhash/location check/lookup
 *                6. fat_evict_inode() unhashes the F-d-c entry.
 *                7. lookup() and readdir() do igrab() if they find a F-d-c entry
 *                        and consider negative result as cache miss.
 */

/*
 * Initialise the per-superblock inode hash: its spinlock and every one of
 * the FAT_HASH_SIZE bucket heads.
 */
static void fat_hash_init(struct super_block *sb)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        int bucket = 0;

        spin_lock_init(&sbi->inode_hash_lock);
        while (bucket < FAT_HASH_SIZE) {
                INIT_HLIST_HEAD(&sbi->inode_hashtable[bucket]);
                bucket++;
        }
}

/*
 * Bucket index in the inode hash table for directory-entry position
 * @i_pos, computed with hash_32() over FAT_HASH_BITS bits.
 *
 * NOTE(review): hash_32() presumably takes a 32-bit argument, in which
 * case only the low 32 bits of @i_pos take part - confirm.
 */
static inline unsigned long fat_hash(loff_t i_pos)
{
        unsigned long bucket = hash_32(i_pos, FAT_HASH_BITS);

        return bucket;
}

/*
 * Initialise the per-superblock directory hash: its spinlock and every
 * one of the FAT_HASH_SIZE bucket heads.
 */
static void dir_hash_init(struct super_block *sb)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        int bucket = 0;

        spin_lock_init(&sbi->dir_hash_lock);
        while (bucket < FAT_HASH_SIZE) {
                INIT_HLIST_HEAD(&sbi->dir_hashtable[bucket]);
                bucket++;
        }
}

/*
 * Record @i_pos as @inode's directory-entry position and hash the inode.
 *
 * Every inode except the root is inserted into the inode hash (keyed by
 * @i_pos) under inode_hash_lock.  Directories are additionally inserted
 * into the directory hash (keyed by i_logstart) when the nfs option is
 * enabled.
 */
void fat_attach(struct inode *inode, loff_t i_pos)
{
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        struct msdos_inode_info *ei = MSDOS_I(inode);

        if (inode->i_ino != MSDOS_ROOT_INO) {
                struct hlist_head *bucket =
                        &sbi->inode_hashtable[fat_hash(i_pos)];

                spin_lock(&sbi->inode_hash_lock);
                ei->i_pos = i_pos;
                hlist_add_head(&ei->i_fat_hash, bucket);
                spin_unlock(&sbi->inode_hash_lock);
        }

        /* If NFS support is enabled, cache the mapping of start cluster
         * to directory inode. This is used during reconnection of
         * dentries to the filesystem root.
         */
        if (S_ISDIR(inode->i_mode) && sbi->options.nfs) {
                struct hlist_head *dir_bucket =
                        &sbi->dir_hashtable[fat_dir_hash(ei->i_logstart)];

                spin_lock(&sbi->dir_hash_lock);
                hlist_add_head(&ei->i_dir_hash, dir_bucket);
                spin_unlock(&sbi->dir_hash_lock);
        }
}
EXPORT_SYMBOL_GPL(fat_attach);

void fat_detach(struct inode *inode)
{
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        spin_lock(&sbi->inode_hash_lock);
        MSDOS_I(inode)->i_pos = 0;
        hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
        spin_unlock(&sbi->inode_hash_lock);

        if (S_ISDIR(inode->i_mode) && sbi->options.nfs) {
                spin_lock(&sbi->dir_hash_lock);
                hlist_del_init(&MSDOS_I(inode)->i_dir_hash);
                spin_unlock(&sbi->dir_hash_lock);
        }
}
EXPORT_SYMBOL_GPL(fat_detach);

/*
 * Look up the in-core inode whose directory entry lives at @i_pos.
 *
 * Walks the matching hash bucket under inode_hash_lock and returns the
 * first entry with that i_pos for which igrab() succeeds, or NULL when
 * there is none.
 */
struct inode *fat_iget(struct super_block *sb, loff_t i_pos)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct hlist_head *bucket = &sbi->inode_hashtable[fat_hash(i_pos)];
        struct msdos_inode_info *ei;
        struct inode *found = NULL;

        spin_lock(&sbi->inode_hash_lock);
        hlist_for_each_entry(ei, bucket, i_fat_hash) {
                BUG_ON(ei->vfs_inode.i_sb != sb);
                if (ei->i_pos == i_pos) {
                        found = igrab(&ei->vfs_inode);
                        if (found)
                                break;
                }
        }
        spin_unlock(&sbi->inode_hash_lock);

        return found;
}

/*
 * Return 1 if the three characters at @extension are exactly "EXE", "COM"
 * or "BAT" (upper case), 0 otherwise.
 */
static int is_exec(unsigned char *extension)
{
        /* The three known extensions, packed back to back. */
        unsigned char exe_extensions[] = "EXECOMBAT";
        unsigned char *candidate = exe_extensions;

        while (*candidate) {
                if (strncmp((const char *)extension,
                            (const char *)candidate, 3) == 0)
                        return 1;
                candidate += 3;
        }

        return 0;
}

static int fat_calc_dir_size(struct inode *inode)
{
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        int ret, fclus, dclus;

        inode->i_size = 0;
        if (MSDOS_I(inode)->i_start == 0)
                return 0;

        ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus);
        if (ret < 0)
                return ret;
        inode->i_size = (fclus + 1) << sbi->cluster_bits;

        return 0;
}

/*
 * Sanity-check a directory inode that has just been filled in.
 *
 * Fails with -EIO (after reporting a filesystem error) when the link
 * count is below 2, or when i_start is 0 or equals the superblock's
 * root_cluster.  Returns 0 otherwise.
 */
static int fat_validate_dir(struct inode *dir)
{
        struct super_block *sb = dir->i_sb;

        /* Directory should have "."/".." entries at least. */
        if (dir->i_nlink < 2) {
                fat_fs_error(sb, "corrupted directory (invalid entries)");
                return -EIO;
        }

        if (MSDOS_I(dir)->i_start != 0 &&
            MSDOS_I(dir)->i_start != MSDOS_SB(sb)->root_cluster)
                return 0;

        /* Directory should point valid cluster. */
        fat_fs_error(sb, "corrupted directory (invalid i_start)");
        return -EIO;
}

/*
 * Initialise the in-core @inode from the on-disk directory entry @de.
 *
 * Doesn't deal with the root inode.
 *
 * Returns 0 on success, or the negative error from fat_calc_dir_size() /
 * fat_validate_dir() for directories.
 */
int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
{
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        struct timespec64 mtime;
        int error;

        /* i_pos stays 0 here; fat_attach() is what assigns it. */
        MSDOS_I(inode)->i_pos = 0;
        inode->i_uid = sbi->options.fs_uid;
        inode->i_gid = sbi->options.fs_gid;
        inode_inc_iversion(inode);
        inode->i_generation = get_random_u32();

        if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) {
                /* Directories get an even generation number. */
                inode->i_generation &= ~1;
                inode->i_mode = fat_make_mode(sbi, de->attr, S_IRWXUGO);
                inode->i_op = sbi->dir_ops;
                inode->i_fop = &fat_dir_operations;

                MSDOS_I(inode)->i_start = fat_get_start(sbi, de);
                MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start;
                /* Directory size comes from the cluster chain, not from @de. */
                error = fat_calc_dir_size(inode);
                if (error < 0)
                        return error;
                MSDOS_I(inode)->mmu_private = inode->i_size;

                set_nlink(inode, fat_subdirs(inode));

                error = fat_validate_dir(inode);
                if (error < 0)
                        return error;
        } else { /* not a directory */
                /* Non-directories get an odd generation number. */
                inode->i_generation |= 1;
                /*
                 * With the showexec option, execute permission is offered
                 * only when the extension (at name + 8) passes is_exec().
                 */
                inode->i_mode = fat_make_mode(sbi, de->attr,
                        ((sbi->options.showexec && !is_exec(de->name + 8))
                         ? S_IRUGO|S_IWUGO : S_IRWXUGO));
                MSDOS_I(inode)->i_start = fat_get_start(sbi, de);

                MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start;
                inode->i_size = le32_to_cpu(de->size);
                inode->i_op = &fat_file_inode_operations;
                inode->i_fop = &fat_file_operations;
                inode->i_mapping->a_ops = &fat_aops;
                MSDOS_I(inode)->mmu_private = inode->i_size;
        }
        if (de->attr & ATTR_SYS) {
                if (sbi->options.sys_immutable)
                        inode->i_flags |= S_IMMUTABLE;
        }
        fat_save_attrs(inode, de->attr);

        /* i_size rounded up to a whole cluster, in 512-byte units. */
        inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
                           & ~((loff_t)sbi->cluster_size - 1)) >> 9;

        /* mtime and ctime are both taken from the entry's time/date. */
        fat_time_fat2unix(sbi, &mtime, de->time, de->date, 0);
        inode_set_mtime_to_ts(inode, mtime);
        inode_set_ctime_to_ts(inode, mtime);
        if (sbi->options.isvfat) {
                struct timespec64 atime;

                /* Only a date is available for atime (time passed as 0). */
                fat_time_fat2unix(sbi, &atime, 0, de->adate, 0);
                inode_set_atime_to_ts(inode, atime);
                fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime,
                                  de->cdate, de->ctime_cs);
        } else
                inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, &mtime));

        return 0;
}

/*
 * Take nfs_build_inode_lock, but only when the nfs option is
 * FAT_NFS_NOSTALE_RO; a no-op otherwise.
 */
static inline void fat_lock_build_inode(struct msdos_sb_info *sbi)
{
        if (sbi->options.nfs != FAT_NFS_NOSTALE_RO)
                return;

        mutex_lock(&sbi->nfs_build_inode_lock);
}

/*
 * Release nfs_build_inode_lock, but only when the nfs option is
 * FAT_NFS_NOSTALE_RO; a no-op otherwise.
 */
static inline void fat_unlock_build_inode(struct msdos_sb_info *sbi)
{
        if (sbi->options.nfs != FAT_NFS_NOSTALE_RO)
                return;

        mutex_unlock(&sbi->nfs_build_inode_lock);
}

/*
 * Return the inode for the directory entry @de located at @i_pos.
 *
 * An already hashed inode for @i_pos is reused.  Otherwise a new inode is
 * allocated, filled from @de, attached at @i_pos and inserted into the
 * inode hash.  The whole sequence runs between fat_lock_build_inode() and
 * fat_unlock_build_inode().
 *
 * Returns the inode, ERR_PTR(-ENOMEM) if allocation fails, or
 * ERR_PTR(err) with the error from fat_fill_inode().
 */
struct inode *fat_build_inode(struct super_block *sb,
                        struct msdos_dir_entry *de, loff_t i_pos)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct inode *inode;
        int err;

        fat_lock_build_inode(sbi);

        inode = fat_iget(sb, i_pos);
        if (inode)
                goto unlock;

        inode = new_inode(sb);
        if (!inode) {
                inode = ERR_PTR(-ENOMEM);
                goto unlock;
        }

        inode->i_ino = iunique(sb, MSDOS_ROOT_INO);
        inode_set_iversion(inode, 1);

        err = fat_fill_inode(inode, de);
        if (!err) {
                fat_attach(inode, i_pos);
                insert_inode_hash(inode);
        } else {
                /* Drop the half-built inode and report the error. */
                iput(inode);
                inode = ERR_PTR(err);
        }

unlock:
        fat_unlock_build_inode(sbi);
        return inode;
}

EXPORT_SYMBOL_GPL(fat_build_inode);

static int __fat_write_inode(struct inode *inode, int wait);

static void fat_free_eofblocks(struct inode *inode)
{
        /* Release unwritten fallocated blocks on inode eviction. */
        if ((inode->i_blocks << 9) >
                        round_up(MSDOS_I(inode)->mmu_private,
                                MSDOS_SB(inode->i_sb)->cluster_size)) {
                int err;

                fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
                /* Fallocate results in updating the i_start/iogstart
                 * for the zero byte file. So, make it return to
                 * original state during evict and commit it to avoid
                 * any corruption on the next access to the cluster
                 * chain for the file.
                 */
                err = __fat_write_inode(inode, inode_needs_sync(inode));
                if (err) {
                        fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
                                        "update on disk inode for unused "
                                        "fallocated blocks, inode could be "
                                        "corrupted. Please run fsck");
                }

        }
}

/*
 * ->evict_inode: tear down an inode that is leaving the inode cache.
 *
 * An inode with no links left has all of its blocks freed; otherwise only
 * surplus fallocated blocks are released.  Afterwards the inode's
 * buffers are invalidated, the inode is cleared, its FAT cache is
 * invalidated and it is detached from the FAT hashes.
 */
static void fat_evict_inode(struct inode *inode)
{
        truncate_inode_pages_final(&inode->i_data);

        if (inode->i_nlink) {
                fat_free_eofblocks(inode);
        } else {
                inode->i_size = 0;
                fat_truncate_blocks(inode, 0);
        }

        invalidate_inode_buffers(inode);
        clear_inode(inode);
        fat_cache_inval_inode(inode);
        fat_detach(inode);
}

/*
 * Set (@set != 0) or clear (@set == 0) FAT_STATE_DIRTY in the boot
 * sector's state field and write the sector out synchronously.
 *
 * Nothing is done on a read-only mount unless @force is given, nor when
 * sbi->dirty is set; in the latter case a warning is printed if @set is
 * non-zero.  The fat32 state field is used on FAT32, the fat16 one for
 * FAT16 and FAT12.
 */
static void fat_set_state(struct super_block *sb,
                        unsigned int set, unsigned int force)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct fat_boot_sector *boot;
        struct buffer_head *bh;

        /* do not change any thing if mounted read only */
        if (!force && sb_rdonly(sb))
                return;

        /* do not change state if fs was dirty */
        if (sbi->dirty) {
                /* warn only on set (mount). */
                if (set)
                        fat_msg(sb, KERN_WARNING, "Volume was not properly "
                                "unmounted. Some data may be corrupt. "
                                "Please run fsck.");
                return;
        }

        bh = sb_bread(sb, 0);
        if (!bh) {
                fat_msg(sb, KERN_ERR, "unable to read boot sector "
                        "to mark fs as dirty");
                return;
        }

        boot = (struct fat_boot_sector *) bh->b_data;

        if (set) {
                if (is_fat32(sbi))
                        boot->fat32.state |= FAT_STATE_DIRTY;
                else /* fat 16 and 12 */
                        boot->fat16.state |= FAT_STATE_DIRTY;
        } else {
                if (is_fat32(sbi))
                        boot->fat32.state &= ~FAT_STATE_DIRTY;
                else /* fat 16 and 12 */
                        boot->fat16.state &= ~FAT_STATE_DIRTY;
        }

        mark_buffer_dirty(bh);
        sync_dirty_buffer(bh);
        brelse(bh);
}

/*
 * Point opts->iocharset back at the built-in default, freeing the
 * previous string unless it already was the default.
 */
static void fat_reset_iocharset(struct fat_mount_options *opts)
{
        if (opts->iocharset == fat_default_iocharset)
                return;

        /* Note: opts->iocharset can be NULL here */
        kfree(opts->iocharset);
        opts->iocharset = fat_default_iocharset;
}

/*
 * RCU callback queued by fat_put_super(): unload both NLS tables, release
 * the iocharset option string and free the msdos_sb_info that embeds @p.
 */
static void delayed_free(struct rcu_head *p)
{
        struct msdos_sb_info *info;

        info = container_of(p, struct msdos_sb_info, rcu);

        unload_nls(info->nls_disk);
        unload_nls(info->nls_io);
        fat_reset_iocharset(&info->options);
        kfree(info);
}

/*
 * ->put_super: clear the dirty state (subject to the checks inside
 * fat_set_state()), drop the fsinfo and FAT inodes, and schedule the
 * msdos_sb_info to be freed by delayed_free() through call_rcu().
 */
static void fat_put_super(struct super_block *sb)
{
        struct msdos_sb_info *info = MSDOS_SB(sb);

        /* set == 0, force == 0 */
        fat_set_state(sb, 0, 0);

        iput(info->fsinfo_inode);
        iput(info->fat_inode);

        call_rcu(&info->rcu, delayed_free);
}

/*
 * Slab cache for struct msdos_inode_info; created in
 * fat_init_inodecache() and destroyed in fat_destroy_inodecache().
 */
static struct kmem_cache *fat_inode_cachep;

/*
 * ->alloc_inode: allocate a msdos_inode_info from the FAT inode cache and
 * return its embedded VFS inode, or NULL on allocation failure.
 */
static struct inode *fat_alloc_inode(struct super_block *sb)
{
        struct msdos_inode_info *info;

        info = alloc_inode_sb(sb, fat_inode_cachep, GFP_NOFS);
        if (info == NULL)
                return NULL;

        init_rwsem(&info->truncate_lock);

        /* Zeroing to allow iput() even if partial initialized inode. */
        info->i_pos = 0;
        info->i_start = 0;
        info->i_logstart = 0;
        info->i_attrs = 0;
        info->mmu_private = 0;
        info->i_crtime.tv_sec = 0;
        info->i_crtime.tv_nsec = 0;

        return &info->vfs_inode;
}

/*
 * ->free_inode: give the msdos_inode_info containing @inode back to the
 * FAT inode cache.
 */
static void fat_free_inode(struct inode *inode)
{
        struct msdos_inode_info *info = MSDOS_I(inode);

        kmem_cache_free(fat_inode_cachep, info);
}

/*
 * Constructor passed to kmem_cache_create() for the FAT inode cache.
 * @foo points at a struct msdos_inode_info; its cache bookkeeping, hash
 * nodes and embedded VFS inode are initialised here.
 */
static void init_once(void *foo)
{
        struct msdos_inode_info *info = foo;

        /* Cluster cache bookkeeping. */
        spin_lock_init(&info->cache_lru_lock);
        INIT_LIST_HEAD(&info->cache_lru);
        info->nr_caches = 0;
        info->cache_valid_id = FAT_CACHE_VALID + 1;

        /* Hash linkage used by fat_attach()/fat_detach(). */
        INIT_HLIST_NODE(&info->i_fat_hash);
        INIT_HLIST_NODE(&info->i_dir_hash);

        inode_init_once(&info->vfs_inode);
}

/*
 * Create the slab cache for FAT inodes.
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
static int __init fat_init_inodecache(void)
{
        fat_inode_cachep = kmem_cache_create("fat_inode_cache",
                                             sizeof(struct msdos_inode_info),
                                             0,
                                             SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
                                             init_once);

        return fat_inode_cachep ? 0 : -ENOMEM;
}

/*
 * Destroy the FAT inode slab cache, after waiting for pending RCU
 * callbacks.
 */
static void __exit fat_destroy_inodecache(void)
{
        /*
         * Inodes are freed through RCU; make sure every delayed free has
         * been flushed before the cache goes away.
         */
        rcu_barrier();

        kmem_cache_destroy(fat_inode_cachep);
}

/*
 * ->remount_fs: SB_NODIRATIME is always added to *flags, plus SB_NOATIME
 * when the filesystem is not vfat.  After syncing the filesystem, the
 * boot sector state is updated if the read-only status is changing.
 * @data is not used.  Always returns 0.
 */
static int fat_remount(struct super_block *sb, int *flags, char *data)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        bool new_rdonly;

        *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME);

        sync_filesystem(sb);

        /* make sure we update state on remount. */
        new_rdonly = *flags & SB_RDONLY;
        if (new_rdonly == sb_rdonly(sb))
                return 0;

        if (!new_rdonly)
                fat_set_state(sb, 1, 1);        /* going read-write */
        else
                fat_set_state(sb, 0, 0);        /* going read-only */

        return 0;
}

/*
 * ->statfs: fill @buf with filesystem statistics, all in units of
 * clusters.
 *
 * The free-cluster count is recomputed first if it is unknown (-1) or
 * not marked valid; an error from that step is returned as is.
 * Otherwise returns 0.
 */
static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct super_block *sb = dentry->d_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
        int err;

        /* If the count of free cluster is still unknown, counts it here. */
        if (sbi->free_clusters == -1 || !sbi->free_clus_valid) {
                err = fat_count_free_clusters(sb);
                if (err)
                        return err;
        }

        buf->f_type = sb->s_magic;
        buf->f_fsid = u64_to_fsid(id);
        buf->f_bsize = sbi->cluster_size;
        buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
        buf->f_bfree = sbi->free_clusters;
        buf->f_bavail = sbi->free_clusters;
        buf->f_namelen =
                (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE;

        return 0;
}

/*
 * Write @inode's in-core state back into its on-disk directory entry.
 *
 * Nothing is written for the root inode or for an inode whose i_pos reads
 * as 0 (fat_detach() resets i_pos to 0).  When @wait is non-zero the
 * buffer is synced before returning.
 *
 * Returns 0 on success, -EIO if the directory block cannot be read, or
 * the result of sync_dirty_buffer().
 */
static int __fat_write_inode(struct inode *inode, int wait)
{
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct buffer_head *bh;
        struct msdos_dir_entry *raw_entry;
        struct timespec64 mtime;
        loff_t i_pos;
        sector_t blocknr;
        int err, offset;

        if (inode->i_ino == MSDOS_ROOT_INO)
                return 0;

retry:
        i_pos = fat_i_pos_read(sbi, inode);
        if (!i_pos)
                return 0;

        /* Read the block that i_pos refers to, outside the spinlock. */
        fat_get_blknr_offset(sbi, i_pos, &blocknr, &offset);
        bh = sb_bread(sb, blocknr);
        if (!bh) {
                fat_msg(sb, KERN_ERR, "unable to read inode block "
                       "for updating (i_pos %lld)", i_pos);
                return -EIO;
        }
        /*
         * Re-check i_pos under the hash lock: if it changed while the
         * block was being read, drop the buffer and start over.
         */
        spin_lock(&sbi->inode_hash_lock);
        if (i_pos != MSDOS_I(inode)->i_pos) {
                spin_unlock(&sbi->inode_hash_lock);
                brelse(bh);
                goto retry;
        }

        raw_entry = &((struct msdos_dir_entry *) (bh->b_data))[offset];
        /* Directories are stored with size 0 in their entry. */
        if (S_ISDIR(inode->i_mode))
                raw_entry->size = 0;
        else
                raw_entry->size = cpu_to_le32(inode->i_size);
        raw_entry->attr = fat_make_attrs(inode);
        fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart);
        mtime = inode_get_mtime(inode);
        fat_time_unix2fat(sbi, &mtime, &raw_entry->time,
                          &raw_entry->date, NULL);
        if (sbi->options.isvfat) {
                struct timespec64 ts = inode_get_atime(inode);
                /* Scratch output: only the access date is stored on disk. */
                __le16 atime;

                fat_time_unix2fat(sbi, &ts, &atime, &raw_entry->adate, NULL);
                fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime,
                                  &raw_entry->cdate, &raw_entry->ctime_cs);
        }
        spin_unlock(&sbi->inode_hash_lock);
        mark_buffer_dirty(bh);
        err = 0;
        if (wait)
                err = sync_dirty_buffer(bh);
        brelse(bh);
        return err;
}

/*
 * ->write_inode: the FSINFO inode is written by flushing the cluster
 * information under s_lock; every other inode goes through
 * __fat_write_inode(), synchronously when wbc->sync_mode is WB_SYNC_ALL.
 */
static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
{
        struct super_block *sb;
        int ret;

        if (inode->i_ino != MSDOS_FSINFO_INO)
                return __fat_write_inode(inode,
                                         wbc->sync_mode == WB_SYNC_ALL);

        sb = inode->i_sb;

        mutex_lock(&MSDOS_SB(sb)->s_lock);
        ret = fat_clusters_flush(sb);
        mutex_unlock(&MSDOS_SB(sb)->s_lock);

        return ret;
}

/*
 * Write @inode's directory entry and wait for the write to complete.
 * Returns the result of __fat_write_inode().
 */
int fat_sync_inode(struct inode *inode)
{
        int ret = __fat_write_inode(inode, 1);

        return ret;
}

EXPORT_SYMBOL_GPL(fat_sync_inode);

/* Forward declaration: fat_show_options() is defined after the table. */
static int fat_show_options(struct seq_file *m, struct dentry *root);
/*
 * Superblock operations for FAT.
 * NOTE(review): where this table is installed on a superblock is not
 * visible in this section.
 */
static const struct super_operations fat_sops = {
        .alloc_inode        = fat_alloc_inode,
        .free_inode        = fat_free_inode,
        .write_inode        = fat_write_inode,
        .evict_inode        = fat_evict_inode,
        .put_super        = fat_put_super,
        .statfs                = fat_statfs,
        .remount_fs        = fat_remount,

        .show_options        = fat_show_options,
};

/*
 * ->show_options() callback: write the mount options in effect for the
 * filesystem that owns @root into @m, each as a ",name[=value]" fragment.
 *
 * fmask, dmask and the errors= policy are printed unconditionally; the
 * remaining options are printed only when the tests below say they are
 * set.  vfat-only and msdos-only options are selected by opts->isvfat.
 *
 * Always returns 0.
 */
static int fat_show_options(struct seq_file *m, struct dentry *root)
{
        struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb);
        struct fat_mount_options *opts = &sbi->options;
        const int vfat = opts->isvfat;
        const char *sfn;

        /* Ownership and permission masks */
        if (!uid_eq(opts->fs_uid, GLOBAL_ROOT_UID)) {
                seq_printf(m, ",uid=%u",
                           from_kuid_munged(&init_user_ns, opts->fs_uid));
        }
        if (!gid_eq(opts->fs_gid, GLOBAL_ROOT_GID)) {
                seq_printf(m, ",gid=%u",
                           from_kgid_munged(&init_user_ns, opts->fs_gid));
        }
        seq_printf(m, ",fmask=%04o", opts->fs_fmask);
        seq_printf(m, ",dmask=%04o", opts->fs_dmask);
        if (opts->allow_utime)
                seq_printf(m, ",allow_utime=%04o", opts->allow_utime);

        /* Character sets; the first two characters ("cp") are skipped */
        if (sbi->nls_disk)
                seq_printf(m, ",codepage=%s", &sbi->nls_disk->charset[2]);

        if (vfat) {
                if (sbi->nls_io)
                        seq_printf(m, ",iocharset=%s", sbi->nls_io->charset);

                /* Map the display/create flag pair back to its option name */
                switch (opts->shortname) {
                case VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95:
                        sfn = ",shortname=win95";
                        break;
                case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WINNT:
                        sfn = ",shortname=winnt";
                        break;
                case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WIN95:
                        sfn = ",shortname=mixed";
                        break;
                case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95:
                        sfn = ",shortname=lower";
                        break;
                default:
                        sfn = ",shortname=unknown";
                        break;
                }
                seq_puts(m, sfn);
        }

        /* Flags common to both flavours */
        if (opts->name_check != 'n')
                seq_printf(m, ",check=%c", opts->name_check);
        if (opts->usefree)
                seq_puts(m, ",usefree");
        if (opts->quiet)
                seq_puts(m, ",quiet");
        if (opts->showexec)
                seq_puts(m, ",showexec");
        if (opts->sys_immutable)
                seq_puts(m, ",sys_immutable");

        /* Flavour-specific flags */
        if (vfat) {
                if (opts->utf8)
                        seq_puts(m, ",utf8");
                if (opts->unicode_xlate)
                        seq_puts(m, ",uni_xlate");
                if (!opts->numtail)
                        seq_puts(m, ",nonumtail");
                if (opts->rodir)
                        seq_puts(m, ",rodir");
        } else {
                if (opts->dotsOK)
                        seq_puts(m, ",dotsOK=yes");
                if (opts->nocase)
                        seq_puts(m, ",nocase");
        }

        if (opts->flush)
                seq_puts(m, ",flush");

        /* A zero offset with tz_set is reported as tz=UTC */
        if (opts->tz_set && opts->time_offset)
                seq_printf(m, ",time_offset=%d", opts->time_offset);
        else if (opts->tz_set)
                seq_puts(m, ",tz=UTC");

        /* Anything that is not continue/panic is shown as remount-ro */
        switch (opts->errors) {
        case FAT_ERRORS_CONT:
                seq_puts(m, ",errors=continue");
                break;
        case FAT_ERRORS_PANIC:
                seq_puts(m, ",errors=panic");
                break;
        default:
                seq_puts(m, ",errors=remount-ro");
                break;
        }

        if (opts->nfs == FAT_NFS_NOSTALE_RO)
                seq_puts(m, ",nfs=nostale_ro");
        else if (opts->nfs)
                seq_puts(m, ",nfs=stale_rw");

        if (opts->discard)
                seq_puts(m, ",discard");
        if (opts->dos1xfloppy)
                seq_puts(m, ",dos1xfloppy");

        return 0;
}

/*
 * Token identifiers used by the mount option tables below and by the
 * switch in parse_options().  Enumerators are listed one per line in
 * their original order, so every numeric value is unchanged.
 */
enum {
        Opt_check_n,
        Opt_check_r,
        Opt_check_s,
        Opt_uid,
        Opt_gid,
        Opt_umask,
        Opt_dmask,
        Opt_fmask,
        Opt_allow_utime,
        Opt_codepage,
        Opt_usefree,
        Opt_nocase,
        Opt_quiet,
        Opt_showexec,
        Opt_debug,
        Opt_immutable,
        Opt_dots,
        Opt_nodots,
        Opt_charset,
        Opt_shortname_lower,
        Opt_shortname_win95,
        Opt_shortname_winnt,
        Opt_shortname_mixed,
        Opt_utf8_no,
        Opt_utf8_yes,
        Opt_uni_xl_no,
        Opt_uni_xl_yes,
        Opt_nonumtail_no,
        Opt_nonumtail_yes,
        Opt_obsolete,
        Opt_flush,
        Opt_tz_utc,
        Opt_rodir,
        Opt_err_cont,
        Opt_err_panic,
        Opt_err_ro,
        Opt_discard,
        Opt_nfs,
        Opt_time_offset,
        Opt_nfs_stale_rw,
        Opt_nfs_nostale_ro,
        Opt_err,
        Opt_dos1xfloppy,
};

/*
 * Mount options accepted for both the msdos and the vfat flavour.
 * parse_options() tries this table first and only falls back to the
 * flavour-specific table when match_token() yields Opt_err here.
 */
static const match_table_t fat_tokens = {
        /* name checking: long and single-letter spellings share a token */
        {Opt_check_r, "check=relaxed"},
        {Opt_check_s, "check=strict"},
        {Opt_check_n, "check=normal"},
        {Opt_check_r, "check=r"},
        {Opt_check_s, "check=s"},
        {Opt_check_n, "check=n"},
        {Opt_uid, "uid=%u"},
        {Opt_gid, "gid=%u"},
        {Opt_umask, "umask=%o"},
        {Opt_dmask, "dmask=%o"},
        {Opt_fmask, "fmask=%o"},
        {Opt_allow_utime, "allow_utime=%o"},
        {Opt_codepage, "codepage=%u"},
        {Opt_usefree, "usefree"},
        {Opt_nocase, "nocase"},
        {Opt_quiet, "quiet"},
        {Opt_showexec, "showexec"},
        {Opt_debug, "debug"},
        {Opt_immutable, "sys_immutable"},
        {Opt_flush, "flush"},
        {Opt_tz_utc, "tz=UTC"},
        {Opt_time_offset, "time_offset=%d"},
        {Opt_err_cont, "errors=continue"},
        {Opt_err_panic, "errors=panic"},
        {Opt_err_ro, "errors=remount-ro"},
        {Opt_discard, "discard"},
        /* a bare "nfs" maps to the same token as "nfs=stale_rw" */
        {Opt_nfs_stale_rw, "nfs"},
        {Opt_nfs_stale_rw, "nfs=stale_rw"},
        {Opt_nfs_nostale_ro, "nfs=nostale_ro"},
        {Opt_dos1xfloppy, "dos1xfloppy"},
        /*
         * Obsolete options: still recognised so that parse_options() can
         * log a notice for them instead of failing the mount.
         */
        {Opt_obsolete, "conv=binary"},
        {Opt_obsolete, "conv=text"},
        {Opt_obsolete, "conv=auto"},
        {Opt_obsolete, "conv=b"},
        {Opt_obsolete, "conv=t"},
        {Opt_obsolete, "conv=a"},
        {Opt_obsolete, "fat=%u"},
        {Opt_obsolete, "blocksize=%u"},
        {Opt_obsolete, "cvf_format=%20s"},
        {Opt_obsolete, "cvf_options=%100s"},
        {Opt_obsolete, "posix"},
        /* table terminator */
        {Opt_err, NULL},
};
/*
 * Options specific to the msdos flavour.  parse_options() consults this
 * table only when is_vfat is zero and fat_tokens did not match.
 */
static const match_table_t msdos_tokens = {
        {Opt_nodots, "nodots"},
        {Opt_nodots, "dotsOK=no"},
        {Opt_dots, "dots"},
        {Opt_dots, "dotsOK=yes"},
        /* table terminator */
        {Opt_err, NULL}
};
/*
 * Options specific to the vfat flavour.  parse_options() consults this
 * table only when is_vfat is non-zero and fat_tokens did not match.
 * The boolean options accept 0/no/false and 1/yes/true, and the bare
 * option name counts as "yes".
 */
static const match_table_t vfat_tokens = {
        {Opt_charset, "iocharset=%s"},
        {Opt_shortname_lower, "shortname=lower"},
        {Opt_shortname_win95, "shortname=win95"},
        {Opt_shortname_winnt, "shortname=winnt"},
        {Opt_shortname_mixed, "shortname=mixed"},
        {Opt_utf8_no, "utf8=0"},                /* 0 or no or false */
        {Opt_utf8_no, "utf8=no"},
        {Opt_utf8_no, "utf8=false"},
        {Opt_utf8_yes, "utf8=1"},                /* empty or 1 or yes or true */
        {Opt_utf8_yes, "utf8=yes"},
        {Opt_utf8_yes, "utf8=true"},
        {Opt_utf8_yes, "utf8"},
        {Opt_uni_xl_no, "uni_xlate=0"},                /* 0 or no or false */
        {Opt_uni_xl_no, "uni_xlate=no"},
        {Opt_uni_xl_no, "uni_xlate=false"},
        {Opt_uni_xl_yes, "uni_xlate=1"},        /* empty or 1 or yes or true */
        {Opt_uni_xl_yes, "uni_xlate=yes"},
        {Opt_uni_xl_yes, "uni_xlate=true"},
        {Opt_uni_xl_yes, "uni_xlate"},
        {Opt_nonumtail_no, "nonumtail=0"},        /* 0 or no or false */
        {Opt_nonumtail_no, "nonumtail=no"},
        {Opt_nonumtail_no, "nonumtail=false"},
        {Opt_nonumtail_yes, "nonumtail=1"},        /* empty or 1 or yes or true */
        {Opt_nonumtail_yes, "nonumtail=yes"},
        {Opt_nonumtail_yes, "nonumtail=true"},
        {Opt_nonumtail_yes, "nonumtail"},
        {Opt_rodir, "rodir"},
        /* table terminator */
        {Opt_err, NULL}
};

static int parse_options(struct super_block *sb, char *options, int is_vfat,
                         int silent, int *debug, struct fat_mount_options *opts)
{
        char *p;
        substring_t args[MAX_OPT_ARGS];
        int option;
        char *iocharset;

        opts->isvfat = is_vfat;

        opts->fs_uid = current_uid();
        opts->fs_gid = current_gid();
        opts->fs_fmask = opts->fs_dmask = current_umask();
        opts->allow_utime = -1;
        opts->codepage = fat_default_codepage;
        fat_reset_iocharset(opts);
        if (is_vfat) {
                opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
                opts->rodir = 0;
        } else {
                opts->shortname = 0;
                opts->rodir = 1;
        }
        opts->name_check = 'n';
        opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK =  0;
        opts->unicode_xlate = 0;
        opts->numtail = 1;
        opts->usefree = opts->nocase = 0;
        opts->tz_set = 0;
        opts->nfs = 0;
        opts->errors = FAT_ERRORS_RO;
        *debug = 0;

        opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat;

        if (!options)
                goto out;

        while ((p = strsep(&options, ",")) != NULL) {
                int token;
                if (!*p)
                        continue;

                token = match_token(p, fat_tokens, args);
                if (token == Opt_err) {
                        if (is_vfat)
                                token = match_token(p, vfat_tokens, args);
                        else
                                token = match_token(p, msdos_tokens, args);
                }
                switch (token) {
                case Opt_check_s:
                        opts->name_check = 's';
                        break;
                case Opt_check_r:
                        opts->name_check = 'r';
                        break;
                case Opt_check_n:
                        opts->name_check = 'n';
                        break;
                case Opt_usefree:
                        opts->usefree = 1;
                        break;
                case Opt_nocase:
                        if (!is_vfat)
                                opts->nocase = 1;
                        else {
                                /* for backward compatibility */
                                opts->shortname = VFAT_SFN_DISPLAY_WIN95
                                        | VFAT_SFN_CREATE_WIN95;
                        }
                        break;
                case Opt_quiet:
                        opts->quiet = 1;
                        break;
                case Opt_showexec:
                        opts->showexec = 1;
                        break;
                case Opt_debug:
                        *debug = 1;
                        break;
                case Opt_immutable:
                        opts->sys_immutable = 1;
                        break;
                case Opt_uid:
                        if (match_int(&args[0], &option))
                                return -EINVAL;
                        opts->fs_uid = make_kuid(current_user_ns(), option);
                        if (!uid_valid(opts->fs_uid))
                                return -EINVAL;
                        break;
                case Opt_gid:
                        if (match_int(&args[0], &option))
                                return -EINVAL;
                        opts->fs_gid = make_kgid(current_user_ns(), option);
                        if (!gid_valid(opts->fs_gid))
                                return -EINVAL;
                        break;
                case Opt_umask:
                        if (match_octal(&args[0], &option))
                                return -EINVAL;
                        opts->fs_fmask = opts->fs_dmask = option;
                        break;
                case Opt_dmask:
                        if (match_octal(&args[0], &option))
                                return -EINVAL;
                        opts->fs_dmask = option;
                        break;
                case Opt_fmask:
                        if (match_octal(&args[0], &option))
                                return -EINVAL;
                        opts->fs_fmask = option;
                        break;
                case Opt_allow_utime:
                        if (match_octal(&args[0], &option))
                                return -EINVAL;
                        opts->allow_utime = option & (S_IWGRP | S_IWOTH);
                        break;
                case Opt_codepage:
                        if (match_int(&args[0], &option))
                                return -EINVAL;
                        opts->codepage = option;
                        break;
                case Opt_flush:
                        opts->flush = 1;
                        break;
                case Opt_time_offset:
                        if (match_int(&args[0], &option))
                                return -EINVAL;
                        /*
                         * GMT+-12 zones may have DST corrections so at least
                         * 13 hours difference is needed. Make the limit 24
                         * just in case someone invents something unusual.
                         */
                        if (option < -24 * 60 || option > 24 * 60)
                                return -EINVAL;
                        opts->tz_set = 1;
                        opts->time_offset = option;
                        break;
                case Opt_tz_utc:
                        opts->tz_set = 1;
                        opts->time_offset = 0;
                        break;
                case Opt_err_cont:
                        opts->errors = FAT_ERRORS_CONT;
                        break;
                case Opt_err_panic:
                        opts->errors = FAT_ERRORS_PANIC;
                        break;
                case Opt_err_ro:
                        opts->errors = FAT_ERRORS_RO;
                        break;
                case Opt_nfs_stale_rw:
                        opts->nfs = FAT_NFS_STALE_RW;
                        break;
                case Opt_nfs_nostale_ro:
                        opts->nfs = FAT_NFS_NOSTALE_RO;
                        break;
                case Opt_dos1xfloppy:
                        opts->dos1xfloppy = 1;
                        break;

                /* msdos specific */
                case Opt_dots:
                        opts->dotsOK = 1;
                        break;
                case Opt_nodots:
                        opts->dotsOK = 0;
                        break;

                /* vfat specific */
                case Opt_charset:
                        fat_reset_iocharset(opts);
                        iocharset = match_strdup(&args[0]);
                        if (!iocharset)
                                return -ENOMEM;
                        opts->iocharset = iocharset;
                        break;
                case Opt_shortname_lower:
                        opts->shortname = VFAT_SFN_DISPLAY_LOWER
                                        | VFAT_SFN_CREATE_WIN95;
                        break;
                case Opt_shortname_win95:
                        opts->shortname = VFAT_SFN_DISPLAY_WIN95
                                        | VFAT_SFN_CREATE_WIN95;
                        break;
                case Opt_shortname_winnt:
                        opts->shortname = VFAT_SFN_DISPLAY_WINNT
                                        | VFAT_SFN_CREATE_WINNT;
                        break;
                case Opt_shortname_mixed:
                        opts->shortname = VFAT_SFN_DISPLAY_WINNT
                                        | VFAT_SFN_CREATE_WIN95;
                        break;
                case Opt_utf8_no:                /* 0 or no or false */
                        opts->utf8 = 0;
                        break;
                case Opt_utf8_yes:                /* empty or 1 or yes or true */
                        opts->utf8 = 1;
                        break;
                case Opt_uni_xl_no:                /* 0 or no or false */
                        opts->unicode_xlate = 0;
                        break;
                case Opt_uni_xl_yes:                /* empty or 1 or yes or true */
                        opts->unicode_xlate = 1;
                        break;
                case Opt_nonumtail_no:                /* 0 or no or false */
                        opts->numtail = 1;        /* negated option */
                        break;
                case Opt_nonumtail_yes:                /* empty or 1 or yes or true */
                        opts->numtail = 0;        /* negated option */
                        break;
                case Opt_rodir:
                        opts->rodir = 1;
                        break;
                case Opt_discard:
                        opts->discard = 1;
                        break;

                /* obsolete mount options */
                case Opt_obsolete:
                        fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, "
                               "not supported now", p);
                        break;
                /* unknown option */
                default:
                        if (!silent) {
                                fat_msg(sb, KERN_ERR,
                                       "Unrecognized mount option \"%s\" "
                                       "or missing value", p);
                        }
                        return -EINVAL;
                }
        }

out:
        /* UTF-8 doesn't provide FAT semantics */
        if (!strcmp(opts->iocharset, "utf8")) {
                fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset"
                       " for FAT filesystems, filesystem will be "
                       "case sensitive!");
        }

        /* If user doesn't specify allow_utime, it's initialized from dmask. */
        if (opts->allow_utime == (unsigned short)-1)
                opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH);
        if (opts->unicode_xlate)
                opts->utf8 = 0;
        if (opts->nfs == FAT_NFS_NOSTALE_RO) {
                sb->s_flags |= SB_RDONLY;
                sb->s_export_op = &fat_export_ops_nostale;
        }

        return 0;
}

static int fat_read_root(struct inode *inode)
{
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        int error;

        MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO;
        inode->i_uid = sbi->options.fs_uid;
        inode->i_gid = sbi->options.fs_gid;
        inode_inc_iversion(inode);
        inode->i_generation = 0;
        inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO);
        inode->i_op = sbi->dir_ops;
        inode->i_fop = &fat_dir_operations;
        if (is_fat32(sbi)) {
                MSDOS_I(inode)->i_start = sbi->root_cluster;
                error = fat_calc_dir_size(inode);
                if (error < 0)
                        return error;
        } else {
                MSDOS_I(inode)->i_start = 0;
                inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry);
        }
        inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
                           & ~((loff_t)sbi->cluster_size - 1)) >> 9;
        MSDOS_I(inode)->i_logstart = 0;
        MSDOS_I(inode)->mmu_private = inode->i_size;

        fat_save_attrs(inode, ATTR_DIR);
        inode_set_mtime_to_ts(inode,
                              inode_set_atime_to_ts(inode, inode_set_ctime(inode, 0, 0)));
        set_nlink(inode, fat_subdirs(inode)+2);

        return 0;
}

/*
 * Number of FAT entries that fit in one copy of the FAT:
 * fat_length blocks of s_blocksize bytes, fat_bits bits per entry.
 *
 * Outside FAT12 the per-block entry count is computed first so the
 * intermediate product stays small.  FAT12 multiplies before dividing
 * (presumably because its entries do not divide a block evenly).
 */
static unsigned long calc_fat_clusters(struct super_block *sb)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        unsigned long bits_per_block = sb->s_blocksize * 8;

        if (is_fat12(sbi))
                return sbi->fat_length * bits_per_block / sbi->fat_bits;

        /* Divide first to avoid overflow */
        return (bits_per_block / sbi->fat_bits) * sbi->fat_length;
}

/*
 * Return true when every BPB field examined here is zero: sector_size,
 * sec_per_clus, reserved, fats, dir_entries, sectors, media, fat_length,
 * secs_track and heads.  The fields are tested in that order and the
 * test stops at the first non-zero one.
 */
static bool fat_bpb_is_zero(struct fat_boot_sector *b)
{
        return !(get_unaligned_le16(&b->sector_size) ||
                 b->sec_per_clus ||
                 b->reserved ||
                 b->fats ||
                 get_unaligned_le16(&b->dir_entries) ||
                 get_unaligned_le16(&b->sectors) ||
                 b->media ||
                 b->fat_length ||
                 b->secs_track ||
                 b->heads);
}

/*
 * Copy the BIOS parameter block out of boot sector @b into @bpb in CPU
 * byte order, then sanity-check it.
 *
 * @bpb is zeroed and fully populated before any check runs, so it holds
 * the decoded values even when validation fails.  Each failed check logs
 * one message through fat_msg() unless @silent is set.
 *
 * Returns 0 if all checks pass, -EINVAL otherwise.
 */
static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b,
        int silent, struct fat_bios_param_block *bpb)
{
        /* Read in BPB ... */
        memset(bpb, 0, sizeof(*bpb));

        /* fields common to all FAT variants */
        bpb->fat_sector_size = get_unaligned_le16(&b->sector_size);
        bpb->fat_sec_per_clus = b->sec_per_clus;
        bpb->fat_reserved = le16_to_cpu(b->reserved);
        bpb->fat_fats = b->fats;
        bpb->fat_dir_entries = get_unaligned_le16(&b->dir_entries);
        bpb->fat_sectors = get_unaligned_le16(&b->sectors);
        bpb->fat_fat_length = le16_to_cpu(b->fat_length);
        bpb->fat_total_sect = le32_to_cpu(b->total_sect);

        /* fat16 view of the extended area */
        bpb->fat16_state = b->fat16.state;
        bpb->fat16_vol_id = get_unaligned_le32(b->fat16.vol_id);

        /* fat32 view of the extended area */
        bpb->fat32_length = le32_to_cpu(b->fat32.length);
        bpb->fat32_root_cluster = le32_to_cpu(b->fat32.root_cluster);
        bpb->fat32_info_sector = le16_to_cpu(b->fat32.info_sector);
        bpb->fat32_state = b->fat32.state;
        bpb->fat32_vol_id = get_unaligned_le32(b->fat32.vol_id);

        /* Validate this looks like a FAT filesystem BPB */
        if (bpb->fat_reserved == 0) {
                if (!silent)
                        fat_msg(sb, KERN_ERR,
                                "bogus number of reserved sectors");
                return -EINVAL;
        }

        if (bpb->fat_fats == 0) {
                if (!silent)
                        fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
                return -EINVAL;
        }

        /*
         * Earlier we checked here that b->secs_track and b->head are nonzero,
         * but it turns out valid FAT filesystems can have zero there.
         */

        if (!fat_valid_media(b->media)) {
                if (!silent)
                        fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
                                (unsigned)b->media);
                return -EINVAL;
        }

        /* sector size must be a power of two within 512..4096 */
        if (!(is_power_of_2(bpb->fat_sector_size)
              && bpb->fat_sector_size >= 512
              && bpb->fat_sector_size <= 4096)) {
                if (!silent)
                        fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
                               (unsigned)bpb->fat_sector_size);
                return -EINVAL;
        }

        if (!is_power_of_2(bpb->fat_sec_per_clus)) {
                if (!silent)
                        fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
                                (unsigned)bpb->fat_sec_per_clus);
                return -EINVAL;
        }

        /* at least one of the 16-bit and 32-bit FAT lengths must be set */
        if (!bpb->fat_fat_length && !bpb->fat32_length) {
                if (!silent)
                        fat_msg(sb, KERN_ERR, "bogus number of FAT sectors");
                return -EINVAL;
        }

        return 0;
}

static int fat_read_static_bpb(struct super_block *sb,
        struct fat_boot_sector *b, int silent,
        struct fat_bios_param_block *bpb)
{
        static const char *notdos1x = "This doesn't look like a DOS 1.x volume";
        sector_t bd_sects = bdev_nr_sectors(sb->s_bdev);
        struct fat_floppy_defaults *fdefaults = NULL;
        int error = -EINVAL;
        unsigned i;

        /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */
        if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) {
                if (!silent)
                        fat_msg(sb, KERN_ERR,
                                "%s; no bootstrapping code", notdos1x);
                goto out;
        }

        /*
         * If any value in this region is non-zero, it isn't archaic
         * DOS.
         */
        if (!fat_bpb_is_zero(b)) {
                if (!silent)
                        fat_msg(sb, KERN_ERR,
                                "%s; DOS 2.x BPB is non-zero", notdos1x);
                goto out;
        }

        for (i = 0; i < ARRAY_SIZE(floppy_defaults); i++) {
                if (floppy_defaults[i].nr_sectors == bd_sects) {
                        fdefaults = &floppy_defaults[i];
                        break;
                }
        }

        if (fdefaults == NULL) {
                if (!silent)
                        fat_msg(sb, KERN_WARNING,
                                "This looks like a DOS 1.x volume, but isn't a recognized floppy size (%llu sectors)",
                                (u64)bd_sects);
                goto out;
        }

        if (!silent)
                fat_msg(sb, KERN_INFO,
                        "This looks like a DOS 1.x volume; assuming default BPB values");

        memset(bpb, 0, sizeof(*bpb));
        bpb->fat_sector_size = SECTOR_SIZE;
        bpb->fat_sec_per_clus = fdefaults->sec_per_clus;
        bpb->fat_reserved = 1;
        bpb->fat_fats = 2;
        bpb->fat_dir_entries = fdefaults->dir_entries;
        bpb->fat_sectors = fdefaults->nr_sectors;
        bpb->fat_fat_length = fdefaults->fat_length;

        error = 0;

out:
        return error;
}

/*
 * Read the super block of an MS-DOS FS.
 */
int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
                   void (*setup)(struct super_block *))
{
        struct inode *root_inode = NULL, *fat_inode = NULL;
        struct inode *fsinfo_inode = NULL;
        struct buffer_head *bh;
        struct fat_bios_param_block bpb;
        struct msdos_sb_info *sbi;
        u16 logical_sector_size;
        u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors;
        int debug;
        long error;
        char buf[50];
        struct timespec64 ts;

        /*
         * GFP_KERNEL is ok here, because while we do hold the
         * superblock lock, memory pressure can't call back into
         * the filesystem, since we're only just about to mount
         * it and have no inodes etc active!
         */
        sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
        sb->s_fs_info = sbi;

        sb->s_flags |= SB_NODIRATIME;
        sb->s_magic = MSDOS_SUPER_MAGIC;
        sb->s_op = &fat_sops;
        sb->s_export_op = &fat_export_ops;
        /*
         * fat timestamps are complex and truncated by fat itself, so
         * we set 1 here to be fast
         */
        sb->s_time_gran = 1;
        mutex_init(&sbi->nfs_build_inode_lock);
        ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
                             DEFAULT_RATELIMIT_BURST);

        error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options);
        if (error)
                goto out_fail;

        setup(sb); /* flavour-specific stuff that needs options */

        error = -EIO;
        sb_min_blocksize(sb, 512);
        bh = sb_bread(sb, 0);
        if (bh == NULL) {
                fat_msg(sb, KERN_ERR, "unable to read boot sector");
                goto out_fail;
        }

        error = fat_read_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent,
                &bpb);
        if (error == -EINVAL && sbi->options.dos1xfloppy)
                error = fat_read_static_bpb(sb,
                        (struct fat_boot_sector *)bh->b_data, silent, &bpb);
        brelse(bh);

        if (error == -EINVAL)
                goto out_invalid;
        else if (error)
                goto out_fail;

        logical_sector_size = bpb.fat_sector_size;
        sbi->sec_per_clus = bpb.fat_sec_per_clus;

        error = -EIO;
        if (logical_sector_size < sb->s_blocksize) {
                fat_msg(sb, KERN_ERR, "logical sector size too small for device"
                       " (logical sector size = %u)", logical_sector_size);
                goto out_fail;
        }

        if (logical_sector_size > sb->s_blocksize) {
                struct buffer_head *bh_resize;

                if (!sb_set_blocksize(sb, logical_sector_size)) {
                        fat_msg(sb, KERN_ERR, "unable to set blocksize %u",
                               logical_sector_size);
                        goto out_fail;
                }

                /* Verify that the larger boot sector is fully readable */
                bh_resize = sb_bread(sb, 0);
                if (bh_resize == NULL) {
                        fat_msg(sb, KERN_ERR, "unable to read boot sector"
                               " (logical sector size = %lu)",
                               sb->s_blocksize);
                        goto out_fail;
                }
                brelse(bh_resize);
        }

        mutex_init(&sbi->s_lock);
        sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus;
        sbi->cluster_bits = ffs(sbi->cluster_size) - 1;
        sbi->fats = bpb.fat_fats;
        sbi->fat_bits = 0;                /* Don't know yet */
        sbi->fat_start = bpb.fat_reserved;
        sbi->fat_length = bpb.fat_fat_length;
        sbi->root_cluster = 0;
        sbi->free_clusters = -1;        /* Don't know yet */
        sbi->free_clus_valid = 0;
        sbi->prev_free = FAT_START_ENT;
        sb->s_maxbytes = 0xffffffff;
        fat_time_fat2unix(sbi, &ts, 0, cpu_to_le16(FAT_DATE_MIN), 0);
        sb->s_time_min = ts.tv_sec;

        fat_time_fat2unix(sbi, &ts, cpu_to_le16(FAT_TIME_MAX),
                          cpu_to_le16(FAT_DATE_MAX), 0);
        sb->s_time_max = ts.tv_sec;

        if (!sbi->fat_length && bpb.fat32_length) {
                struct fat_boot_fsinfo *fsinfo;
                struct buffer_head *fsinfo_bh;

                /* Must be FAT32 */
                sbi->fat_bits = 32;
                sbi->fat_length = bpb.fat32_length;
                sbi->root_cluster = bpb.fat32_root_cluster;

                /* MC - if info_sector is 0, don't multiply by 0 */
                sbi->fsinfo_sector = bpb.fat32_info_sector;
                if (sbi->fsinfo_sector == 0)
                        sbi->fsinfo_sector = 1;

                fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector);
                if (fsinfo_bh == NULL) {
                        fat_msg(sb, KERN_ERR, "bread failed, FSINFO block"
                               " (sector = %lu)", sbi->fsinfo_sector);
                        goto out_fail;
                }

                fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data;
                if (!IS_FSINFO(fsinfo)) {
                        fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: "
                               "0x%08x, 0x%08x (sector = %lu)",
                               le32_to_cpu(fsinfo->signature1),
                               le32_to_cpu(fsinfo->signature2),
                               sbi->fsinfo_sector);
                } else {
                        if (sbi->options.usefree)
                                sbi->free_clus_valid = 1;
                        sbi->free_clusters = le32_to_cpu(fsinfo->free_clusters);
                        sbi->prev_free = le32_to_cpu(fsinfo->next_cluster);
                }

                brelse(fsinfo_bh);
        }

        /* interpret volume ID as a little endian 32 bit integer */
        if (is_fat32(sbi))
                sbi->vol_id = bpb.fat32_vol_id;
        else /* fat 16 or 12 */
                sbi->vol_id = bpb.fat16_vol_id;

        __le32 vol_id_le = cpu_to_le32(sbi->vol_id);
        super_set_uuid(sb, (void *) &vol_id_le, sizeof(vol_id_le));

        sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
        sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;

        sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length;
        sbi->dir_entries = bpb.fat_dir_entries;
        if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
                if (!silent)
                        fat_msg(sb, KERN_ERR, "bogus number of directory entries"
                               " (%u)", sbi->dir_entries);
                goto out_invalid;
        }

        rootdir_sectors = sbi->dir_entries
                * sizeof(struct msdos_dir_entry) / sb->s_blocksize;
        sbi->data_start = sbi->dir_start + rootdir_sectors;
        total_sectors = bpb.fat_sectors;
        if (total_sectors == 0)
                total_sectors = bpb.fat_total_sect;

        total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus;

        if (!is_fat32(sbi))
                sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12;

        /* some OSes set FAT_STATE_DIRTY and clean it on unmount. */
        if (is_fat32(sbi))
                sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY;
        else /* fat 16 or 12 */
                sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY;

        /* check that FAT table does not overflow */
        fat_clusters = calc_fat_clusters(sb);
        total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
        if (total_clusters > max_fat(sb)) {
                if (!silent)
                        fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
                               total_clusters);
                goto out_invalid;
        }

        sbi->max_cluster = total_clusters + FAT_START_ENT;
        /* check the free_clusters, it's not necessarily correct */
        if (sbi->free_clusters != -1 && sbi->free_clusters > total_clusters)
                sbi->free_clusters = -1;
        /* check the prev_free, it's not necessarily correct */
        sbi->prev_free %= sbi->max_cluster;
        if (sbi->prev_free < FAT_START_ENT)
                sbi->prev_free = FAT_START_ENT;

        /* set up enough so that it can read an inode */
        fat_hash_init(sb);
        dir_hash_init(sb);
        fat_ent_access_init(sb);

        /*
         * The low byte of the first FAT entry must have the same value as
         * the media field of the boot sector. But in real world, too many
         * devices are writing wrong values. So, removed that validity check.
         *
         * The removed check compared the first FAT entry to a value dependent
         * on the media field like this:
         * == (0x0F00 | media), for FAT12
         * == (0XFF00 | media), for FAT16
         * == (0x0FFFFF | media), for FAT32
         */

        error = -EINVAL;
        sprintf(buf, "cp%d", sbi->options.codepage);
        sbi->nls_disk = load_nls(buf);
        if (!sbi->nls_disk) {
                fat_msg(sb, KERN_ERR, "codepage %s not found", buf);
                goto out_fail;
        }

        /* FIXME: utf8 is using iocharset for upper/lower conversion */
        if (sbi->options.isvfat) {
                sbi->nls_io = load_nls(sbi->options.iocharset);
                if (!sbi->nls_io) {
                        fat_msg(sb, KERN_ERR, "IO charset %s not found",
                               sbi->options.iocharset);
                        goto out_fail;
                }
        }

        error = -ENOMEM;
        fat_inode = new_inode(sb);
        if (!fat_inode)
                goto out_fail;
        sbi->fat_inode = fat_inode;

        fsinfo_inode = new_inode(sb);
        if (!fsinfo_inode)
                goto out_fail;
        fsinfo_inode->i_ino = MSDOS_FSINFO_INO;
        sbi->fsinfo_inode = fsinfo_inode;
        insert_inode_hash(fsinfo_inode);

        root_inode = new_inode(sb);
        if (!root_inode)
                goto out_fail;
        root_inode->i_ino = MSDOS_ROOT_INO;
        inode_set_iversion(root_inode, 1);
        error = fat_read_root(root_inode);
        if (error < 0) {
                iput(root_inode);
                goto out_fail;
        }
        error = -ENOMEM;
        insert_inode_hash(root_inode);
        fat_attach(root_inode, 0);
        sb->s_root = d_make_root(root_inode);
        if (!sb->s_root) {
                fat_msg(sb, KERN_ERR, "get root inode failed");
                goto out_fail;
        }

        if (sbi->options.discard && !bdev_max_discard_sectors(sb->s_bdev))
                fat_msg(sb, KERN_WARNING,
                        "mounting with \"discard\" option, but the device does not support discard");

        fat_set_state(sb, 1, 0);
        return 0;

out_invalid:
        error = -EINVAL;
        if (!silent)
                fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem");

out_fail:
        iput(fsinfo_inode);
        iput(fat_inode);
        unload_nls(sbi->nls_io);
        unload_nls(sbi->nls_disk);
        fat_reset_iocharset(&sbi->options);
        sb->s_fs_info = NULL;
        kfree(sbi);
        return error;
}

EXPORT_SYMBOL_GPL(fat_fill_super);

/*
 * Helper for fat_flush_inodes(): kick off writeback of one inode.
 *
 * The inode's metadata is pushed out first and, if that succeeded, its
 * data pages are submitted as well.  Data blocks already in flight before
 * this call are waited for; io started during the call is not.
 *
 * Returns 0 on success, otherwise the error from the step that failed.
 */
static int writeback_inode(struct inode *inode)
{
        /*
         * Passing wait=1 would make sync_inode_metadata() wait for the
         * inode io to complete, so wait=0 is used here and the data
         * blocks are handed to filemap_fdatawrite() below.
         */
        int err = sync_inode_metadata(inode, 0);

        if (err)
                return err;

        return filemap_fdatawrite(inode->i_mapping);
}

/*
 * Start writeback of the data and metadata belonging to i1 and i2, then
 * of the backing block device.  The io is only started; nothing here
 * waits for it to finish.
 *
 * Does nothing and returns 0 unless the "flush" option is set in the
 * superblock's FAT options.  Either inode pointer may be NULL, in which
 * case that inode is skipped.  The first error encountered stops the
 * sequence and is returned.
 *
 * For the block device, a dirty page for a block that is already in
 * flight is not waited for and its io is not started over again.
 * NOTE(review): the earlier comment attributed this to filemap_flush;
 * the call below is sync_blockdev_nowait(), presumably a wrapper around
 * it -- confirm.
 */
int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
{
        int err = 0;

        if (!MSDOS_SB(sb)->options.flush)
                return 0;

        if (i1)
                err = writeback_inode(i1);
        if (err)
                return err;

        if (i2) {
                err = writeback_inode(i2);
                if (err)
                        return err;
        }

        return sync_blockdev_nowait(sb->s_bdev);
}
EXPORT_SYMBOL_GPL(fat_flush_inodes);

/*
 * Module init: calls fat_cache_init() and then fat_init_inodecache().
 * If the second step fails, the first is undone with fat_cache_destroy().
 *
 * Returns 0 on success, otherwise the error code of the failing step.
 */
static int __init init_fat_fs(void)
{
        int rc = fat_cache_init();

        if (rc)
                return rc;

        rc = fat_init_inodecache();
        if (rc)
                fat_cache_destroy();    /* unwind the first step */

        return rc;
}

/*
 * Module exit: releases what init_fat_fs() set up by calling
 * fat_cache_destroy() followed by fat_destroy_inodecache().
 */
static void __exit exit_fat_fs(void)
{
        fat_cache_destroy();
        fat_destroy_inodecache();
}

/* Register init_fat_fs()/exit_fat_fs() as the module's entry and exit hooks. */
module_init(init_fat_fs)
module_exit(exit_fat_fs)

/* Declare the module's license as GPL. */
MODULE_LICENSE("GPL");
















































































   35 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1993  Linus Torvalds
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 *  Numa awareness, Christoph Lameter, SGI, June 2005
 *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
#include <linux/uio.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
#include <linux/page_owner.h>

#define CREATE_TRACE_POINTS
#include <trace/events/vmalloc.h>

#include "internal.h"
#include "pgalloc-track.h"

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;

/*
 * Early-param handler for "nohugeiomap".
 *
 * Lowers ioremap_max_page_shift to PAGE_SHIFT, which is the value
 * vmap_page_range() hands down as the maximum mapping shift. The
 * parameter string @str is not looked at. Always returns 0.
 */
static int __init set_nohugeiomap(char *str)
{
        ioremap_max_page_shift = PAGE_SHIFT;
        return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
static bool __ro_after_init vmap_allow_huge = true;

/*
 * Early-param handler for "nohugevmalloc".
 *
 * Clears vmap_allow_huge. The parameter string @str is not looked at.
 * Always returns 0.
 */
static int __init set_nohugevmalloc(char *str)
{
        vmap_allow_huge = false;
        return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
static const bool vmap_allow_huge = false;
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */

bool is_vmalloc_addr(const void *x)
{
        unsigned long addr = (unsigned long)kasan_reset_tag(x);

        return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);

/*
 * Per-CPU bookkeeping pairing a lock-less list with a work item.
 * NOTE(review): presumably used to defer freeing to a workqueue; the
 * producers and the work handler are outside this part of the file.
 */
struct vfree_deferred {
        struct llist_head list; /* lock-less list of queued entries */
        struct work_struct wq;  /* work item associated with @list */
};
/* One instance per CPU. */
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

/*** Page table manipulation functions ***/
/*
 * Fill the PTE level under @pmd for the virtual range [addr, end) with a
 * physically contiguous mapping that starts at @phys_addr.
 *
 * @mask accumulates which page-table levels were modified; on success
 * PGTBL_PTE_MODIFIED is added to it.
 *
 * Returns 0 on success or -ENOMEM if the PTE table cannot be allocated.
 * Hitting an already populated PTE is fatal (BUG()).
 */
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        pte_t *pte;
        u64 pfn;
        struct page *page;
        /* Stride of the current step; only changes under CONFIG_HUGETLB_PAGE. */
        unsigned long size = PAGE_SIZE;

        pfn = phys_addr >> PAGE_SHIFT;
        pte = pte_alloc_kernel_track(pmd, addr, mask);
        if (!pte)
                return -ENOMEM;
        do {
                /* Remapping over a live entry is a bug: dump what we can, then die. */
                if (!pte_none(ptep_get(pte))) {
                        if (pfn_valid(pfn)) {
                                page = pfn_to_page(pfn);
                                dump_page(page, "remapping already mapped page");
                        }
                        BUG();
                }

#ifdef CONFIG_HUGETLB_PAGE
                /* Let the architecture pick a larger PTE-level mapping size. */
                size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
                if (size != PAGE_SIZE) {
                        pte_t entry = pfn_pte(pfn, prot);

                        entry = arch_make_huge_pte(entry, ilog2(size), 0);
                        set_huge_pte_at(&init_mm, addr, pte, entry, size);
                        pfn += PFN_DOWN(size);
                        /*
                         * 'continue' jumps to the loop condition below, which
                         * advances pte and addr by the enlarged size.
                         */
                        continue;
                }
#endif
                set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
                pfn++;
        } while (pte += PFN_DOWN(size), addr += size, addr != end);
        *mask |= PGTBL_PTE_MODIFIED;
        return 0;
}

/*
 * Attempt to map [addr, end) -> @phys_addr with a single PMD-level huge
 * entry.
 *
 * Returns 0 when a huge entry is not permitted or does not fit, in which
 * case the caller has to fall back to the next lower level. Otherwise
 * returns whatever pmd_set_huge() reports.
 */
static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        /* The caller's shift limit or the architecture rules this size out. */
        if (max_page_shift < PMD_SHIFT || !arch_vmap_pmd_supported(prot))
                return 0;

        /* Need exactly one PMD-sized chunk, aligned both virtually and physically. */
        if ((end - addr) != PMD_SIZE ||
            !IS_ALIGNED(addr, PMD_SIZE) ||
            !IS_ALIGNED(phys_addr, PMD_SIZE))
                return 0;

        /* An entry is already present: it must be freed before being replaced. */
        if (pmd_present(*pmd)) {
                if (!pmd_free_pte_page(pmd, addr))
                        return 0;
        }

        return pmd_set_huge(pmd, phys_addr, prot);
}

/*
 * Map [addr, end) -> @phys_addr at the PMD level under @pud.
 *
 * Each PMD-sized step is first offered to vmap_try_huge_pmd(); if that
 * declines, the step is mapped with PTEs instead. Modified levels are
 * accumulated in @mask.
 *
 * Returns 0 on success, -ENOMEM if a table allocation or the PTE-level
 * mapping fails.
 */
static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long next;
        pmd_t *pmd = pmd_alloc_track(&init_mm, pud, addr, mask);

        if (!pmd)
                return -ENOMEM;

        for (;;) {
                next = pmd_addr_end(addr, end);

                if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
                                      max_page_shift)) {
                        *mask |= PGTBL_PMD_MODIFIED;
                } else if (vmap_pte_range(pmd, addr, next, phys_addr, prot,
                                          max_page_shift, mask)) {
                        return -ENOMEM;
                }

                /* Step the table pointer, physical and virtual cursors together. */
                pmd++;
                phys_addr += (next - addr);
                addr = next;
                if (addr == end)
                        break;
        }

        return 0;
}

/*
 * Attempt to map [addr, end) -> @phys_addr with a single PUD-level huge
 * entry.
 *
 * Returns 0 when a huge entry is not permitted or does not fit, so the
 * caller falls back to the PMD level. Otherwise returns whatever
 * pud_set_huge() reports.
 */
static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        /* Size not allowed by the caller's limit or by the architecture. */
        if (max_page_shift < PUD_SHIFT || !arch_vmap_pud_supported(prot))
                return 0;

        /* The chunk must be exactly PUD_SIZE and aligned on both sides. */
        if ((end - addr) != PUD_SIZE ||
            !IS_ALIGNED(addr, PUD_SIZE) ||
            !IS_ALIGNED(phys_addr, PUD_SIZE))
                return 0;

        /* An entry is already present: free what it references first. */
        if (pud_present(*pud)) {
                if (!pud_free_pmd_page(pud, addr))
                        return 0;
        }

        return pud_set_huge(pud, phys_addr, prot);
}

/*
 * Map [addr, end) -> @phys_addr at the PUD level under @p4d.
 *
 * Every PUD-sized step is tried as a huge mapping first and handed to
 * vmap_pmd_range() otherwise. Modified levels are accumulated in @mask.
 *
 * Returns 0 on success, -ENOMEM if a table allocation or the lower-level
 * mapping fails.
 */
static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long next;
        pud_t *pud = pud_alloc_track(&init_mm, p4d, addr, mask);

        if (!pud)
                return -ENOMEM;

        for (;;) {
                next = pud_addr_end(addr, end);

                if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
                                      max_page_shift)) {
                        *mask |= PGTBL_PUD_MODIFIED;
                } else if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
                                          max_page_shift, mask)) {
                        return -ENOMEM;
                }

                /* Advance table pointer, physical and virtual cursors together. */
                pud++;
                phys_addr += (next - addr);
                addr = next;
                if (addr == end)
                        break;
        }

        return 0;
}

/*
 * Attempt to map [addr, end) -> @phys_addr with a single P4D-level huge
 * entry.
 *
 * Returns 0 when a huge entry is not permitted or does not fit, so the
 * caller falls back to the PUD level. Otherwise returns whatever
 * p4d_set_huge() reports.
 */
static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        /* Size not allowed by the caller's limit or by the architecture. */
        if (max_page_shift < P4D_SHIFT || !arch_vmap_p4d_supported(prot))
                return 0;

        /* The chunk must be exactly P4D_SIZE and aligned on both sides. */
        if ((end - addr) != P4D_SIZE ||
            !IS_ALIGNED(addr, P4D_SIZE) ||
            !IS_ALIGNED(phys_addr, P4D_SIZE))
                return 0;

        /* An entry is already present: free what it references first. */
        if (p4d_present(*p4d)) {
                if (!p4d_free_pud_page(p4d, addr))
                        return 0;
        }

        return p4d_set_huge(p4d, phys_addr, prot);
}

/*
 * Map [addr, end) -> @phys_addr at the P4D level under @pgd.
 *
 * Every P4D-sized step is tried as a huge mapping first and handed to
 * vmap_pud_range() otherwise. Modified levels are accumulated in @mask.
 *
 * Returns 0 on success, -ENOMEM if a table allocation or the lower-level
 * mapping fails.
 */
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long next;
        p4d_t *p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);

        if (!p4d)
                return -ENOMEM;

        for (;;) {
                next = p4d_addr_end(addr, end);

                if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
                                      max_page_shift)) {
                        *mask |= PGTBL_P4D_MODIFIED;
                } else if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
                                          max_page_shift, mask)) {
                        return -ENOMEM;
                }

                /* Advance table pointer, physical and virtual cursors together. */
                p4d++;
                phys_addr += (next - addr);
                addr = next;
                if (addr == end)
                        break;
        }

        return 0;
}

/*
 * Map the physically contiguous range starting at @phys_addr to the kernel
 * virtual range [addr, end) without flushing caches.
 *
 * @max_page_shift bounds the size of huge entries the lower levels may
 * use. May sleep. @addr must be strictly below @end (BUG otherwise).
 *
 * Returns 0 on success or the error reported by vmap_p4d_range().
 */
static int vmap_range_noflush(unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        pgd_t *pgd;
        unsigned long start;
        unsigned long next;
        int err;
        pgtbl_mod_mask mask = 0;

        might_sleep();
        BUG_ON(addr >= end);

        /* addr is advanced by the loop; keep the original for the sync below. */
        start = addr;
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
                err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
                                        max_page_shift, &mask);
                /* 'break' rather than 'return' so the sync below still runs. */
                if (err)
                        break;
        } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);

        /* Sync only if a level the architecture cares about was touched. */
        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);

        return err;
}

/*
 * Map the physical range starting at @phys_addr to [addr, end) with
 * non-executable protection, flush the cache for the range, and then let
 * KMSAN mirror the mapping.
 *
 * The cache flush is issued whether or not the mapping succeeded; the
 * KMSAN hook only runs on success.
 *
 * Returns 0 on success or a negative errno from either step.
 */
int vmap_page_range(unsigned long addr, unsigned long end,
                    phys_addr_t phys_addr, pgprot_t prot)
{
        int ret;

        ret = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
                                 ioremap_max_page_shift);
        flush_cache_vmap(addr, end);
        if (ret)
                return ret;

        return kmsan_ioremap_page_range(addr, end, phys_addr, prot,
                                        ioremap_max_page_shift);
}

/*
 * Map @phys_addr to [addr, end) after checking that the range is exactly
 * one vm_area marked VM_IOREMAP.
 *
 * Returns -EINVAL (with a one-time warning) if no VM_IOREMAP area is found
 * at @addr, -ERANGE (with a one-time warning) if [addr, end) does not match
 * the area's bounds, otherwise the result of vmap_page_range().
 */
int ioremap_page_range(unsigned long addr, unsigned long end,
                phys_addr_t phys_addr, pgprot_t prot)
{
        struct vm_struct *area = find_vm_area((void *)addr);
        unsigned long area_start, area_end;

        if (!area || !(area->flags & VM_IOREMAP)) {
                WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr);
                return -EINVAL;
        }

        area_start = (unsigned long)area->addr;
        area_end = area_start + get_vm_area_size(area);

        /* The request has to cover the whole area, no more and no less. */
        if (addr != area_start || end != area_end) {
                WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
                          addr, end, area_start, area_end);
                return -ERANGE;
        }

        return vmap_page_range(addr, end, phys_addr, prot);
}

/*
 * Clear every PTE under @pmd for [addr, end), one PAGE_SIZE step at a
 * time, and record PGTBL_PTE_MODIFIED in @mask.
 */
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pte_t *pte;

        pte = pte_offset_kernel(pmd, addr);
        do {
                pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
                /* Warn on an entry that was neither empty nor present. */
                WARN_ON(!pte_none(ptent) && !pte_present(ptent));
        } while (pte++, addr += PAGE_SIZE, addr != end);
        *mask |= PGTBL_PTE_MODIFIED;
}

/*
 * Unmap [addr, end) at the PMD level under @pud.
 *
 * For each PMD step: try to clear a huge entry; otherwise, if the entry
 * is usable, descend and clear the PTEs. PGTBL_PMD_MODIFIED is recorded
 * in @mask when a huge entry was cleared or the entry was bad.
 */
static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pmd_t *pmd;
        unsigned long next;
        int cleared;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);

                cleared = pmd_clear_huge(pmd);
                if (cleared || pmd_bad(*pmd))
                        *mask |= PGTBL_PMD_MODIFIED;

                /* A huge entry was cleared: nothing below it to walk. */
                if (cleared)
                        continue;
                /* Empty or bad (and now cleared) entry: skip. */
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                vunmap_pte_range(pmd, addr, next, mask);

                /* Only reached after a PTE table was actually walked. */
                cond_resched();
        } while (pmd++, addr = next, addr != end);
}

/*
 * Unmap [addr, end) at the PUD level under @p4d.
 *
 * For each PUD step: try to clear a huge entry; otherwise, if the entry
 * is usable, descend to the PMD level. PGTBL_PUD_MODIFIED is recorded in
 * @mask when a huge entry was cleared or the entry was bad.
 */
static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pud_t *pud;
        unsigned long next;
        int cleared;

        pud = pud_offset(p4d, addr);
        do {
                next = pud_addr_end(addr, end);

                cleared = pud_clear_huge(pud);
                if (cleared || pud_bad(*pud))
                        *mask |= PGTBL_PUD_MODIFIED;

                /* A huge entry was cleared: nothing below it to walk. */
                if (cleared)
                        continue;
                /* Empty or bad (and now cleared) entry: skip. */
                if (pud_none_or_clear_bad(pud))
                        continue;
                vunmap_pmd_range(pud, addr, next, mask);
        } while (pud++, addr = next, addr != end);
}

/*
 * Unmap [addr, end) at the P4D level under @pgd.
 *
 * Unlike the PMD/PUD variants, the return value of p4d_clear_huge() is
 * not consulted here; PGTBL_P4D_MODIFIED is recorded in @mask only when
 * the entry tests as bad after the call.
 */
static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        p4d_t *p4d;
        unsigned long next;

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);

                p4d_clear_huge(p4d);
                if (p4d_bad(*p4d))
                        *mask |= PGTBL_P4D_MODIFIED;

                /* Empty or bad (and now cleared) entry: skip. */
                if (p4d_none_or_clear_bad(p4d))
                        continue;
                vunmap_pud_range(p4d, addr, next, mask);
        } while (p4d++, addr = next, addr != end);
}

/*
 * vunmap_range_noflush is similar to vunmap_range, but does not
 * flush caches or TLBs.
 *
 * The caller is responsible for calling flush_cache_vunmap() before calling
 * this function, and flush_tlb_kernel_range after it has returned
 * successfully (and before the addresses are expected to cause a page fault
 * or be re-mapped for something else, if TLB flushes are being delayed or
 * coalesced).
 *
 * @start must be strictly below @end (BUG otherwise).
 *
 * This is an internal function only. Do not use outside mm/.
 */
void __vunmap_range_noflush(unsigned long start, unsigned long end)
{
        unsigned long next;
        pgd_t *pgd;
        unsigned long addr = start;
        pgtbl_mod_mask mask = 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
                /* Test for a bad entry before the helper below clears it. */
                if (pgd_bad(*pgd))
                        mask |= PGTBL_PGD_MODIFIED;
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                vunmap_p4d_range(pgd, addr, next, &mask);
        } while (pgd++, addr = next, addr != end);

        /* Sync only if a level the architecture cares about was touched. */
        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);
}

/*
 * Unmap [start, end) without flushing caches or TLBs: first run the KMSAN
 * unmap hook for the range, then clear the page-table entries.
 */
void vunmap_range_noflush(unsigned long start, unsigned long end)
{
        kmsan_vunmap_range_noflush(start, end);
        __vunmap_range_noflush(start, end);
}

/**
 * vunmap_range - unmap kernel virtual addresses
 * @addr: start of the VM area to unmap
 * @end: end of the VM area to unmap (non-inclusive)
 *
 * Clears any present PTEs in the virtual address range, flushes TLBs and
 * caches. Any subsequent access to the address before it has been re-mapped
 * is a kernel bug.
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
        /* Cache flush comes first, while the mappings are still in place. */
        flush_cache_vunmap(addr, end);
        vunmap_range_noflush(addr, end);
        /* TLB flush comes last, once the page-table entries are cleared. */
        flush_tlb_kernel_range(addr, end);
}

/*
 * Install PTEs under @pmd for [addr, end), taking one page per PAGE_SIZE
 * step from @pages starting at index *@nr.
 *
 * Returns 0 on success and adds PGTBL_PTE_MODIFIED to @mask. On failure
 * returns -ENOMEM (PTE table allocation failed or NULL page), -EBUSY (PTE
 * already populated) or -EINVAL (page has no valid pfn); in the WARN
 * cases PTEs set by earlier iterations stay in place and @mask is not
 * updated here.
 */
static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        pte_t *pte;

        /*
         * nr is a running index into the array which helps higher level
         * callers keep track of where we're up to.
         */

        pte = pte_alloc_kernel_track(pmd, addr, mask);
        if (!pte)
                return -ENOMEM;
        do {
                struct page *page = pages[*nr];

                if (WARN_ON(!pte_none(ptep_get(pte))))
                        return -EBUSY;
                if (WARN_ON(!page))
                        return -ENOMEM;
                if (WARN_ON(!pfn_valid(page_to_pfn(page))))
                        return -EINVAL;

                set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
                /* Consume the page only after it has been mapped. */
                (*nr)++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        *mask |= PGTBL_PTE_MODIFIED;
        return 0;
}

/*
 * Map pages from @pages (continuing at index *@nr) into [addr, end) at the
 * PMD level under @pud, delegating each PMD step to
 * vmap_pages_pte_range(). Modified levels are accumulated in @mask.
 *
 * Returns 0 on success, -ENOMEM if the PMD table cannot be allocated, or
 * the error reported by vmap_pages_pte_range() (-ENOMEM, -EBUSY or
 * -EINVAL), so that callers can tell a mapping conflict or a bad page
 * apart from a genuine out-of-memory condition.
 */
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        pmd_t *pmd;
        unsigned long next;
        int err;

        pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
        if (!pmd)
                return -ENOMEM;
        do {
                next = pmd_addr_end(addr, end);
                err = vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask);
                /* Hand the real error up instead of collapsing it to -ENOMEM. */
                if (err)
                        return err;
        } while (pmd++, addr = next, addr != end);
        return 0;
}

/*
 * Map pages from @pages (continuing at index *@nr) into [addr, end) at the
 * PUD level under @p4d, delegating each PUD step to
 * vmap_pages_pmd_range(). Modified levels are accumulated in @mask.
 *
 * Returns 0 on success, -ENOMEM if the PUD table cannot be allocated, or
 * the error reported by vmap_pages_pmd_range(), passed through unchanged.
 */
static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        pud_t *pud;
        unsigned long next;
        int err;

        pud = pud_alloc_track(&init_mm, p4d, addr, mask);
        if (!pud)
                return -ENOMEM;
        do {
                next = pud_addr_end(addr, end);
                err = vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask);
                /* Hand the real error up instead of collapsing it to -ENOMEM. */
                if (err)
                        return err;
        } while (pud++, addr = next, addr != end);
        return 0;
}

/*
 * Map pages from @pages (continuing at index *@nr) into [addr, end) at the
 * P4D level under @pgd, delegating each P4D step to
 * vmap_pages_pud_range(). Modified levels are accumulated in @mask.
 *
 * Returns 0 on success, -ENOMEM if the P4D table cannot be allocated, or
 * the error reported by vmap_pages_pud_range(), passed through unchanged.
 */
static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        p4d_t *p4d;
        unsigned long next;
        int err;

        p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
        if (!p4d)
                return -ENOMEM;
        do {
                next = p4d_addr_end(addr, end);
                err = vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask);
                /* Hand the real error up instead of collapsing it to -ENOMEM. */
                if (err)
                        return err;
        } while (p4d++, addr = next, addr != end);
        return 0;
}

/*
 * Map @pages one PAGE_SIZE page at a time into [addr, end) without
 * flushing caches.
 *
 * @addr must be strictly below @end (BUG otherwise).
 *
 * Returns 0 on success or the error reported by vmap_pages_p4d_range().
 * Kernel mappings are synced even when the walk fails part-way: page
 * tables may already have been allocated or populated by then, which is
 * the same policy vmap_range_noflush() follows.
 */
static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages)
{
        unsigned long start = addr;
        pgd_t *pgd;
        unsigned long next;
        int err = 0;
        int nr = 0;
        pgtbl_mod_mask mask = 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_bad(*pgd))
                        mask |= PGTBL_PGD_MODIFIED;
                err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
                /* Leave the loop but still fall through to the sync below. */
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);

        return err;
}

/*
 * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
 * flush caches.
 *
 * The caller is responsible for calling flush_cache_vmap() after this
 * function returns successfully and before the addresses are accessed.
 *
 * This is an internal function only. Do not use outside mm/.
 */
/*
 * Map @pages into [addr, end) using mappings of up to 1 << @page_shift
 * bytes. Warns if @page_shift is below PAGE_SHIFT.
 *
 * Returns 0 on success or the first error from the mapping helpers.
 */
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        /* nr: number of PAGE_SIZE pages in the range. */
        unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

        WARN_ON(page_shift < PAGE_SHIFT);

        /* No huge vmalloc support, or base pages requested: map page by page. */
        if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
                        page_shift == PAGE_SHIFT)
                return vmap_small_pages_range_noflush(addr, end, prot, pages);

        /*
         * Walk the array in strides of one large mapping. Only pages[i], the
         * first page of each stride, supplies the physical address, so the
         * pages within a stride must be physically contiguous.
         */
        for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
                int err;

                err = vmap_range_noflush(addr, addr + (1UL << page_shift),
                                        page_to_phys(pages[i]), prot,
                                        page_shift);
                if (err)
                        return err;

                addr += 1UL << page_shift;
        }

        return 0;
}

/*
 * KMSAN-aware front end to __vmap_pages_range_noflush(): the KMSAN hook
 * runs first and the real mapping is attempted only if the hook succeeds.
 *
 * Returns 0 on success or the first non-zero result of either step.
 */
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        int err;

        err = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
                                             page_shift);
        if (!err)
                err = __vmap_pages_range_noflush(addr, end, prot, pages,
                                                 page_shift);

        return err;
}

/**
 * vmap_pages_range - map pages to a kernel virtual address
 * @addr: start of the VM area to map
 * @end: end of the VM area to map (non-inclusive)
 * @prot: page protection flags to use
 * @pages: pages to map (always PAGE_SIZE pages)
 * @page_shift: maximum shift that the pages may be mapped with, @pages must
 * be aligned and contiguous up to at least this shift.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
static int vmap_pages_range(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        const int ret = vmap_pages_range_noflush(addr, end, prot, pages,
                                                 page_shift);

        /* The cache flush is issued regardless of the mapping outcome. */
        flush_cache_vmap(addr, end);

        return ret;
}

/*
 * Validate a [start, end) request against a sparse vm_area. May sleep.
 *
 * Returns -EINVAL (with a one-time warning) if @area has
 * VM_FLUSH_RESET_PERMS or VM_NO_GUARD set or lacks VM_SPARSE, -E2BIG if
 * the range spans more pages than the system has RAM pages, -ERANGE if
 * the range is not contained in @area, and 0 otherwise.
 */
static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
                                unsigned long end)
{
        unsigned long nr_pages, area_start, area_end;

        might_sleep();

        /* Each flag check keeps its own one-shot warning. */
        if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
                return -EINVAL;
        if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
                return -EINVAL;
        if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
                return -EINVAL;

        nr_pages = (end - start) >> PAGE_SHIFT;
        if (nr_pages > totalram_pages())
                return -E2BIG;

        area_start = (unsigned long)area->addr;
        area_end = area_start + get_vm_area_size(area);
        if (start < area_start || end > area_end)
                return -ERANGE;

        return 0;
}

/**
 * vm_area_map_pages - map pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 * @pages: pages to map (always PAGE_SIZE pages)
 */
int vm_area_map_pages(struct vm_struct *area, unsigned long start,
                      unsigned long end, struct page **pages)
{
        int ret = check_sparse_vm_area(area, start, end);

        /* Map with base pages and PAGE_KERNEL only once the request is valid. */
        if (!ret)
                ret = vmap_pages_range(start, end, PAGE_KERNEL, pages,
                                       PAGE_SHIFT);

        return ret;
}

/**
 * vm_area_unmap_pages - unmap pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 */
void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
                         unsigned long end)
{
        /* An invalid request is dropped silently; there is no error to return. */
        if (!check_sparse_vm_area(area, start, end))
                vunmap_range(start, end);
}

/*
 * Return non-zero if @x lies in the vmalloc range or, when CONFIG_EXECMEM
 * is enabled and the architecture defines MODULES_VADDR, in
 * [MODULES_VADDR, MODULES_END).
 */
int is_vmalloc_or_module_addr(const void *x)
{
        /*
         * ARM, x86-64 and sparc64 put modules in a special place,
         * and fall back on vmalloc() if that fails. Others
         * just put it in the vmalloc space.
         */
#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR)
        /* Strip any KASAN tag before comparing against the module range. */
        unsigned long addr = (unsigned long)kasan_reset_tag(x);
        if (addr >= MODULES_VADDR && addr < MODULES_END)
                return 1;
#endif
        return is_vmalloc_addr(x);
}
EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);

/*
 * Walk a vmap address to the struct page it maps. Huge vmap mappings will
 * return the tail page that corresponds to the base page address, which
 * matches small vmap mappings.
 */
/*
 * Returns the struct page backing @vmalloc_addr, or NULL if any level of
 * the walk is empty or bad, a PGD-level leaf is found, or the final PTE
 * is not present.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
        unsigned long addr = (unsigned long) vmalloc_addr;
        struct page *page = NULL;
        pgd_t *pgd = pgd_offset_k(addr);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep, pte;

        /*
         * XXX we might need to change this if we add VIRTUAL_BUG_ON for
         * architectures that do not vmalloc module space
         */
        VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

        if (pgd_none(*pgd))
                return NULL;
        if (WARN_ON_ONCE(pgd_leaf(*pgd)))
                return NULL; /* XXX: no allowance for huge pgd */
        if (WARN_ON_ONCE(pgd_bad(*pgd)))
                return NULL;

        p4d = p4d_offset(pgd, addr);
        if (p4d_none(*p4d))
                return NULL;
        /* Leaf entry: add the index of the base page within the huge mapping. */
        if (p4d_leaf(*p4d))
                return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
        if (WARN_ON_ONCE(p4d_bad(*p4d)))
                return NULL;

        pud = pud_offset(p4d, addr);
        if (pud_none(*pud))
                return NULL;
        /* Same leaf handling at the PUD level. */
        if (pud_leaf(*pud))
                return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        if (WARN_ON_ONCE(pud_bad(*pud)))
                return NULL;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return NULL;
        /* Same leaf handling at the PMD level. */
        if (pmd_leaf(*pmd))
                return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        if (WARN_ON_ONCE(pmd_bad(*pmd)))
                return NULL;

        ptep = pte_offset_kernel(pmd, addr);
        pte = ptep_get(ptep);
        /* page stays NULL when the PTE is not present. */
        if (pte_present(pte))
                page = pte_page(pte);

        return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
        /*
         * NOTE(review): vmalloc_to_page() returns NULL for unmapped
         * addresses and that result is passed to page_to_pfn() unchecked,
         * so callers must only pass addresses known to be mapped.
         */
        return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

/*
 * Compile-time debug switches, both disabled (0) here.
 * NOTE(review): presumably they enable self-checks of the augmented
 * free-space tree; confirm at their use sites.
 */
#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


/* NOTE(review): presumably guards the free vmap area tree/list below; confirm. */
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Starts out false; where it is set is outside this part of the file. */
static bool vmap_initialized __read_mostly;

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster. Especially in "no edge" splitting of
 * free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used in pair with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augment red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node contains a maximum available free block
 * of its sub-tree, right or left. Therefore it is possible to
 * find a lowest match of free area.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for "no edge" split case. The
 * aim is to get rid of allocations from the atomic context, thus
 * to use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

/*
 * This structure defines a single, solid model where a list and
 * rb-tree are part of one entity protected by the lock. Nodes are
 * sorted in ascending order, thus for O(1) access to left/right
 * neighbors a list is used as well as for sequential traversal.
 */
struct rb_list {
        /* Tree view of the entries, used for address lookups. */
        struct rb_root root;
        /* List view of the same entries, kept in ascending order. */
        struct list_head head;
        /* Protects both the tree and the list. */
        spinlock_t lock;
};

/*
 * A fast size storage contains VAs up to MAX_VA_SIZE_PAGES pages
 * in size (1M with 4K pages). Each pool is a list of ready to go
 * VAs of one particular size. An index in the pool-array corresponds
 * to the number of pages - 1, see size_to_va_pool().
 */
#define MAX_VA_SIZE_PAGES 256

struct vmap_pool {
        /* VAs parked in this pool, linked through vmap_area::list. */
        struct list_head head;
        /*
         * Number of VAs on @head. Updated with WRITE_ONCE() under the
         * owning node's pool_lock.
         */
        unsigned long len;
};

/*
 * An effective vmap-node logic. Users make use of nodes instead
 * of a global heap. It allows to balance an access and mitigate
 * contention.
 */
static struct vmap_node {
        /*
         * Simple size segregated storage: pool[i] holds VAs that span
         * i + 1 pages, see size_to_va_pool(). pool_lock serializes
         * updates of the pool lists.
         */
        struct vmap_pool pool[MAX_VA_SIZE_PAGES];
        spinlock_t pool_lock;
        /* NOTE(review): not referenced in this part of the file - confirm its meaning at the users. */
        bool skip_populate;

        /*
         * Bookkeeping data of this node. "busy" tracks areas that are
         * currently allocated; free_vmap_area() unlinks from it.
         * NOTE(review): "lazy" presumably tracks lazily freed areas -
         * confirm at the users.
         */
        struct rb_list busy;
        struct rb_list lazy;

        /*
         * Ready-to-free areas.
         * NOTE(review): purge_work/nr_purged look like the deferred
         * purge machinery and its statistics - confirm at the users.
         */
        struct list_head purge_list;
        struct work_struct purge_work;
        unsigned long nr_purged;
} single;       /* The only node in use until the node array is set up. */

/*
 * Initial setup consists of one single node, i.e. a balancing
 * is fully disabled. Later on, after vmap is initialized these
 * parameters are updated based on a system capacity.
 */
/* Node array; points at the built-in "single" node by default. */
static struct vmap_node *vmap_nodes = &single;
/* Number of entries in vmap_nodes; used as the modulus when picking a node. */
static __read_mostly unsigned int nr_vmap_nodes = 1;
/* Divisor applied to an address before it is mapped to a node id. */
static __read_mostly unsigned int vmap_zone_size = 1;

static inline unsigned int
addr_to_node_id(unsigned long addr)
{
        /* Zone number the address falls into, zones being vmap_zone_size wide. */
        unsigned long zone_nr = addr / vmap_zone_size;

        /* Zones are distributed over the nodes round-robin. */
        return zone_nr % nr_vmap_nodes;
}

static inline struct vmap_node *
addr_to_node(unsigned long addr)
{
        /* Translate the address to a node index, then to the node itself. */
        unsigned int id = addr_to_node_id(addr);

        return vmap_nodes + id;
}

static inline struct vmap_node *
id_to_node(unsigned int id)
{
        /* Out-of-range ids wrap around instead of indexing past the array. */
        unsigned int wrapped = id % nr_vmap_nodes;

        return vmap_nodes + wrapped;
}

/*
 * We use the value 0 to represent "no node", that is why
 * an encoded value will be the node-id incremented by 1.
 * It is always greater than 0. A valid node_id which can
 * be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id
 * is not valid 0 is returned.
 */
static unsigned int
encode_vn_id(unsigned int node_id)
{
        /* Reject ids outside of [0:nr_vmap_nodes - 1]: warn, encode nothing. */
        if (node_id >= nr_vmap_nodes) {
                WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id);
                return 0;
        }

        /*
         * Bias by one so that a zero field keeps meaning "no node",
         * then move the value above the low byte.
         */
        return (node_id + 1) << BITS_PER_BYTE;
}

/*
 * Returns a decoded node-id, the valid range is within
 * [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is
 * returned if extracted data is wrong.
 */
static unsigned int
decode_vn_id(unsigned int val)
{
        /* Undo encode_vn_id(): drop the low byte, then the "+ 1" bias. */
        unsigned int node_id = (val >> BITS_PER_BYTE) - 1;

        /* Can store U8_MAX [0:254] nodes. */
        if (node_id < nr_vmap_nodes)
                return node_id;

        /*
         * A zero ("no node") field wraps to UINT_MAX after the
         * subtraction. That is a legal input, so warn only about
         * anything else. node_id is unsigned, hence %u.
         */
        WARN_ONCE(node_id != UINT_MAX,
                "Decode wrong node id (%u)\n", node_id);

        return nr_vmap_nodes;
}

static bool
is_vn_id_valid(unsigned int node_id)
{
        /* Valid ids are exactly the indices of the node array. */
        return node_id < nr_vmap_nodes;
}

static __always_inline unsigned long
va_size(struct vmap_area *va)
{
        /* The area covers [va_start, va_end). */
        unsigned long start = va->va_start;
        unsigned long end = va->va_end;

        return end - start;
}

static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
        struct vmap_area *va = rb_entry_safe(node, struct vmap_area, rb_node);

        /* A missing child contributes no free space. */
        if (!va)
                return 0;

        return va->subtree_max_size;
}

/*
 * Generate the augmented rb-tree callbacks for the free tree under the
 * name free_vmap_area_rb_augment_cb. They maintain
 * vmap_area::subtree_max_size, using va_size() as the per-node value.
 */
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
        struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

/* Forward declarations; the definitions are not in this part of the file. */
static void reclaim_and_purge_vmap_areas(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);

/* Counter reported by vmalloc_nr_pages(). */
static atomic_long_t nr_vmalloc_pages;

unsigned long vmalloc_nr_pages(void)
{
        /* Snapshot of the counter; it may change right after the read. */
        long nr_pages = atomic_long_read(&nr_vmalloc_pages);

        return nr_pages;
}

static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
{
        struct rb_node *cur;

        /* Compare untagged addresses only. */
        addr = (unsigned long)kasan_reset_tag((void *)addr);

        for (cur = root->rb_node; cur; ) {
                struct vmap_area *area = rb_entry(cur, struct vmap_area, rb_node);

                /* Hit: addr lies within [va_start, va_end). */
                if (addr >= area->va_start && addr < area->va_end)
                        return area;

                /* Miss: descend towards the side addr can be on. */
                cur = addr < area->va_start ? cur->rb_left : cur->rb_right;
        }

        return NULL;
}

/* Look up the first VA which satisfies addr < va_end, NULL if none. */
static struct vmap_area *
__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
{
        struct rb_node *cur = root->rb_node;
        struct vmap_area *best = NULL;

        addr = (unsigned long)kasan_reset_tag((void *)addr);

        while (cur) {
                struct vmap_area *area = rb_entry(cur, struct vmap_area, rb_node);

                if (area->va_end <= addr) {
                        /* Ends at or below addr: a match can only be to the right. */
                        cur = cur->rb_right;
                        continue;
                }

                /* A candidate. Stop if it contains addr, it cannot get lower. */
                best = area;
                if (area->va_start <= addr)
                        break;

                /* Otherwise a lower candidate may still exist to the left. */
                cur = cur->rb_left;
        }

        return best;
}

/*
 * Returns a node where a first VA, that satisfies addr < va_end, resides.
 * If success, a node is locked. A user is responsible to unlock it when a
 * VA is no longer needed to be accessed.
 *
 * Returns NULL if nothing found.
 */
static struct vmap_node *
find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
        for (;;) {
                unsigned long lowest = 0;
                struct vmap_node *vn;
                int i;

                /*
                 * First pass: ask every node for its own best candidate
                 * and remember the lowest start address among them. The
                 * node locks are taken one at a time, never together.
                 */
                for (i = 0; i < nr_vmap_nodes; i++) {
                        vn = &vmap_nodes[i];

                        spin_lock(&vn->busy.lock);
                        *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);

                        if (*va && (!lowest || (*va)->va_start < lowest))
                                lowest = (*va)->va_start;
                        spin_unlock(&vn->busy.lock);
                }

                /* No node has anything that ends above addr. */
                if (!lowest)
                        return NULL;

                /*
                 * Second pass: the winner was found without keeping its
                 * lock, so look it up again, this time holding the lock
                 * of the node it belongs to. On success that lock stays
                 * held for the caller.
                 */
                vn = addr_to_node(lowest);

                spin_lock(&vn->busy.lock);
                *va = __find_vmap_area(lowest, &vn->busy.root);
                if (*va)
                        return vn;

                /*
                 * The VA has been removed concurrently, which is a rare
                 * case. Drop the lock and search again from scratch.
                 */
                spin_unlock(&vn->busy.lock);
        }
}

/*
 * This function returns back addresses of parent node
 * and its left or right link for further processing.
 *
 * Otherwise NULL is returned. In that case all further
 * steps regarding inserting of conflicting overlap range
 * have to be declined and actually considered as a bug.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
        struct rb_root *root, struct rb_node *from,
        struct rb_node **parent)
{
        /* Start at the root when one is given, otherwise at "from". */
        struct rb_node **slot = root ? &root->rb_node : &from;
        struct vmap_area *cur;

        /* An empty tree: the new node becomes the root and has no parent. */
        if (root && unlikely(!*slot)) {
                *parent = NULL;
                return slot;
        }

        /*
         * Walk down until an empty slot is reached. The node visited
         * last is the parent and the empty slot is where the new
         * va->rb_node gets attached.
         */
        do {
                cur = rb_entry(*slot, struct vmap_area, rb_node);

                if (va->va_end <= cur->va_start) {
                        /* Entirely below the current node. */
                        slot = &(*slot)->rb_left;
                } else if (va->va_start >= cur->va_end) {
                        /* Entirely above the current node. */
                        slot = &(*slot)->rb_right;
                } else {
                        /*
                         * Partial or full overlap. It is a bug in the
                         * caller: complain and refuse to give a slot.
                         */
                        WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
                                va->va_start, va->va_end, cur->va_start, cur->va_end);

                        return NULL;
                }
        } while (*slot);

        *parent = &cur->rb_node;
        return slot;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
        struct list_head *list;

        if (unlikely(!parent))
                /*
                 * The red-black tree where we try to find VA neighbors
                 * before merging or inserting is empty, i.e. it means
                 * there is no free vmap space. Normally it does not
                 * happen but we handle this case anyway.
                 */
                return NULL;

        list = &rb_entry(parent, struct vmap_area, rb_node)->list;
        return (&parent->rb_right == link ? list->next : list);
}

static __always_inline void
__link_va(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link,
        struct list_head *head, bool augment)
{
        /*
         * List node after which the VA is going to be placed. With no
         * parent the tree is empty and the VA goes right after "head".
         */
        struct list_head *prev = head;

        if (likely(parent)) {
                prev = &rb_entry(parent, struct vmap_area, rb_node)->list;

                /* A left child precedes its parent in the list. */
                if (link != &parent->rb_right)
                        prev = prev->prev;
        }

        /* Attach to the tree first. */
        rb_link_node(&va->rb_node, parent, link);
        if (!augment) {
                rb_insert_color(&va->rb_node, root);
        } else {
                /*
                 * Only a plain insertion is done here. The augmented
                 * value is deliberately left at zero once the node is in
                 * the tree: augment_tree_propagate_from() fills it in
                 * afterwards, walking from this node towards the root.
                 */
                rb_insert_augmented(&va->rb_node,
                        root, &free_vmap_area_rb_augment_cb);
                va->subtree_max_size = 0;
        }

        /* Then to the list, which keeps it sorted by address. */
        list_add(&va->list, prev);
}

static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link,
        struct list_head *head)
{
        __link_va(va, root, parent, link, head, false);
}

static __always_inline void
link_va_augment(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link,
        struct list_head *head)
{
        __link_va(va, root, parent, link, head, true);
}

static __always_inline void
__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
{
        struct rb_node *node = &va->rb_node;

        /* Unlinking a node that is not in a tree is a caller bug. */
        if (WARN_ON(RB_EMPTY_NODE(node)))
                return;

        if (!augment)
                rb_erase(node, root);
        else
                rb_erase_augmented(node,
                        root, &free_vmap_area_rb_augment_cb);

        /* Leave both linkages in a "not linked" state. */
        list_del_init(&va->list);
        RB_CLEAR_NODE(node);
}

static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
        __unlink_va(va, root, false);
}

static __always_inline void
unlink_va_augment(struct vmap_area *va, struct rb_root *root)
{
        __unlink_va(va, root, true);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
 * Recompute what subtree_max_size of the given node should be:
 * the biggest of its own size and the values cached in its
 * left and right children.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
        unsigned long left_max = get_subtree_max_size(va->rb_node.rb_left);
        unsigned long right_max = get_subtree_max_size(va->rb_node.rb_right);

        return max3(va_size(va), left_max, right_max);
}

static void
augment_tree_propagate_check(void)
{
        struct vmap_area *va;

        /* Every free area must cache exactly what a recomputation gives. */
        list_for_each_entry(va, &free_vmap_area_list, list) {
                unsigned long expected = compute_subtree_max_size(va);

                if (expected == va->subtree_max_size)
                        continue;

                pr_emerg("tree is corrupted: %lu, %lu\n",
                        va_size(va), va->subtree_max_size);
        }
}
#endif

/*
 * This function populates subtree_max_size from bottom to upper
 * levels starting from VA point. The propagation must be done
 * when VA size is modified by changing its va_start/va_end. Or
 * in case of newly inserting of VA to the tree.
 *
 * It means that augment_tree_propagate_from() must be called:
 * - After VA has been inserted to the tree(free path);
 * - After VA has been shrunk(allocation path);
 * - After VA has been increased(merging path).
 *
 * Please note that, it does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the time up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example if we modify the node 4, shrinking it to 2, then
 * no modification is required. If we shrink the node 2 to 1
 * only its subtree_max_size is updated, and set to 1. If we shrink
 * the node 8 to 6, then its subtree_max_size is set to 6 and parent
 * node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
        struct rb_node *start = &va->rb_node;

        /*
         * Refresh the cached maximum starting at this node and going
         * up towards the root. The walk ends as soon as a node already
         * holds the value that has just been calculated for it.
         */
        free_vmap_area_rb_augment_cb_propagate(start, NULL);

#if DEBUG_AUGMENT_PROPAGATE_CHECK
        augment_tree_propagate_check();
#endif
}

static void
insert_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        struct rb_node *parent;
        struct rb_node **link = find_va_links(va, root, NULL, &parent);

        /* An overlap has been reported by find_va_links(): do not insert. */
        if (!link)
                return;

        link_va(va, root, parent, link, head);
}

static void
insert_vmap_area_augment(struct vmap_area *va,
        struct rb_node *from, struct rb_root *root,
        struct list_head *head)
{
        struct rb_node *parent;
        struct rb_node **link;

        /* Search from the hinted node if there is one, else from the root. */
        link = from ? find_va_links(va, NULL, from, &parent) :
                      find_va_links(va, root, NULL, &parent);

        /* An overlap has been reported by find_va_links(): do not insert. */
        if (!link)
                return;

        link_va_augment(va, root, parent, link, head);
        augment_tree_propagate_from(va);
}

/*
 * Merge de-allocated chunk of VA memory with previous
 * and next free blocks. If coalesce is not done a new
 * free area is inserted. If VA has been merged, it is
 * freed.
 *
 * @va:      area to give back; it must not be linked anywhere yet.
 * @root:    rb-tree to merge into or to insert to.
 * @head:    head of the address sorted list paired with @root.
 * @augment: true if @root is an augmented tree, so that the augmented
 *           erase/insert variants are used.
 *
 * Returns the area that covers the range of @va afterwards: @va itself
 * if it has been inserted as a new node, otherwise the neighbor it has
 * been folded into.
 *
 * Please note, it can return NULL in case of overlapping
 * ranges, followed by a WARN() report. Even though that is
 * buggy behaviour, the system can stay alive and keep
 * going. In that case @va is neither linked nor freed here.
 */
static __always_inline struct vmap_area *
__merge_or_add_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head, bool augment)
{
        struct vmap_area *sibling;
        struct list_head *next;
        struct rb_node **link;
        struct rb_node *parent;
        bool merged = false;

        /*
         * Find a place in the tree where VA potentially will be
         * inserted, unless it is merged with its sibling/siblings.
         */
        link = find_va_links(va, root, NULL, &parent);
        if (!link)
                return NULL;

        /*
         * Get next node of VA to check if merging can be done.
         * NULL means the tree is empty, so there is nothing to merge with.
         */
        next = get_va_next_sibling(parent, link);
        if (unlikely(next == NULL))
                goto insert;

        /*
         * start            end
         * |                |
         * |<------VA------>|<-----Next----->|
         *                  |                |
         *                  start            end
         */
        if (next != head) {
                sibling = list_entry(next, struct vmap_area, list);
                if (sibling->va_start == va->va_end) {
                        /* Grow "next" downwards so that it covers VA too. */
                        sibling->va_start = va->va_start;

                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);

                        /* Point to the new merged area. */
                        va = sibling;
                        merged = true;
                }
        }

        /*
         * start            end
         * |                |
         * |<-----Prev----->|<------VA------>|
         *                  |                |
         *                  start            end
         *
         * Note that VA may already be the "next" node at this point, if
         * the merge above has been done.
         */
        if (next->prev != head) {
                sibling = list_entry(next->prev, struct vmap_area, list);
                if (sibling->va_end == va->va_start) {
                        /*
                         * If both neighbors are coalesced, it is important
                         * to unlink the "next" node first, followed by merging
                         * with "previous" one. Otherwise the tree might not be
                         * fully populated if a sibling's augmented value is
                         * "normalized" because of rotation operations.
                         */
                        if (merged)
                                __unlink_va(va, root, augment);

                        /* Grow "prev" upwards so that it covers VA too. */
                        sibling->va_end = va->va_end;

                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);

                        /* Point to the new merged area. */
                        va = sibling;
                        merged = true;
                }
        }

insert:
        /* No neighbor has absorbed VA: link it in as a node of its own. */
        if (!merged)
                __link_va(va, root, parent, link, head, augment);

        return va;
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        return __merge_or_add_vmap_area(va, root, head, false);
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        va = __merge_or_add_vmap_area(va, root, head, true);
        if (va)
                augment_tree_propagate_from(va);

        return va;
}

static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
        unsigned long align, unsigned long vstart)
{
        /* The request can start neither below the VA nor below vstart. */
        unsigned long lowest = va->va_start > vstart ? va->va_start : vstart;
        unsigned long nva_start_addr = ALIGN(lowest, align);
        unsigned long nva_end_addr = nva_start_addr + size;

        /* Can be overflowed due to big size or alignment. */
        if (nva_end_addr < nva_start_addr || nva_start_addr < vstart)
                return false;

        return nva_end_addr <= va->va_end;
}

/*
 * Find the first free block(lowest start address) in the tree,
 * that will accomplish the request corresponding to passing
 * parameters. Please note, with an alignment bigger than PAGE_SIZE,
 * a search length is adjusted to account for worst case alignment
 * overhead.
 *
 * @root:               augmented tree of free areas to search.
 * @size:               requested size.
 * @align:              requested alignment of the start address.
 * @vstart:             lowest address the allocation may start at.
 * @adjust_search_size: if true, sub-trees are pruned against
 *                      "size + align - 1" instead of "size".
 *
 * Returns the matching free area or NULL if there is none.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(struct rb_root *root, unsigned long size,
        unsigned long align, unsigned long vstart, bool adjust_search_size)
{
        struct vmap_area *va;
        struct rb_node *node;
        unsigned long length;

        /* Start from the root. */
        node = root->rb_node;

        /* Adjust the search size for alignment overhead. */
        length = adjust_search_size ? size + align - 1 : size;

        while (node) {
                va = rb_entry(node, struct vmap_area, rb_node);

                /*
                 * Prefer the left sub-tree, i.e. lower addresses, as long
                 * as it holds a big enough block and is not ruled out by
                 * the "vstart" restriction.
                 */
                if (get_subtree_max_size(node->rb_left) >= length &&
                                vstart < va->va_start) {
                        node = node->rb_left;
                } else {
                        if (is_within_this_va(va, size, align, vstart))
                                return va;

                        /*
                         * Does not make sense to go deeper towards the right
                         * sub-tree if it does not have a free block that is
                         * equal or bigger to the requested search length.
                         */
                        if (get_subtree_max_size(node->rb_right) >= length) {
                                node = node->rb_right;
                                continue;
                        }

                        /*
                         * OK. We roll back and find the first right sub-tree,
                         * that will satisfy the search criteria. It can happen
                         * due to "vstart" restriction or an alignment overhead
                         * that is bigger than PAGE_SIZE.
                         *
                         * If the root is passed without finding one, "node"
                         * ends up NULL, which terminates the outer loop too.
                         */
                        while ((node = rb_parent(node))) {
                                va = rb_entry(node, struct vmap_area, rb_node);
                                if (is_within_this_va(va, size, align, vstart))
                                        return va;

                                if (get_subtree_max_size(node->rb_right) >= length &&
                                                vstart <= va->va_start) {
                                        /*
                                         * Shift the vstart forward. Please note, we update it with
                                         * parent's start address adding "1" because we do not want
                                         * to enter same sub-tree after it has already been checked
                                         * and no suitable free block found there.
                                         */
                                        vstart = va->va_start + 1;
                                        node = node->rb_right;
                                        break;
                                }
                        }
                }
        }

        return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

static struct vmap_area *
find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
        unsigned long align, unsigned long vstart)
{
        struct vmap_area *va;

        /* The list is address sorted: the first fit is the lowest one. */
        list_for_each_entry(va, head, list) {
                if (is_within_this_va(va, size, align, vstart))
                        return va;
        }

        return NULL;
}

static void
find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
                             unsigned long size, unsigned long align)
{
        struct vmap_area *tree_va, *list_va;
        unsigned long vstart;
        unsigned int rnd;

        /* Pick a random lower bound somewhere above VMALLOC_START. */
        get_random_bytes(&rnd, sizeof(rnd));
        vstart = VMALLOC_START + rnd;

        /* The tree search and the linear scan must agree on the result. */
        tree_va = find_vmap_lowest_match(root, size, align, vstart, false);
        list_va = find_vmap_lowest_linear_match(head, size, align, vstart);
        if (tree_va == list_va)
                return;

        pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
                tree_va, list_va, vstart);
}
#endif

/*
 * How a requested range [nva_start_addr, nva_start_addr + size) is
 * placed inside a free vmap_area, as worked out by
 * classify_va_fit_type(). va_clip() uses it to decide how the free
 * area has to be trimmed or split.
 */
enum fit_type {
        NOTHING_FIT = 0,        /* the range is not within the area */
        FL_FIT_TYPE = 1,        /* full fit */
        LE_FIT_TYPE = 2,        /* left edge fit */
        RE_FIT_TYPE = 3,        /* right edge fit */
        NE_FIT_TYPE = 4                /* no edge fit */
};

static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
        unsigned long nva_start_addr, unsigned long size)
{
        enum fit_type type;

        /* Check if it is within VA. */
        if (nva_start_addr < va->va_start ||
                        nva_start_addr + size > va->va_end)
                return NOTHING_FIT;

        /* Now classify. */
        if (va->va_start == nva_start_addr) {
                if (va->va_end == nva_start_addr + size)
                        type = FL_FIT_TYPE;
                else
                        type = LE_FIT_TYPE;
        } else if (va->va_end == nva_start_addr + size) {
                type = RE_FIT_TYPE;
        } else {
                type = NE_FIT_TYPE;
        }

        return type;
}

static __always_inline int
va_clip(struct rb_root *root, struct list_head *head,
                struct vmap_area *va, unsigned long nva_start_addr,
                unsigned long size)
{
        /* Left remainder, only used by the "no edge" split. */
        struct vmap_area *lva = NULL;

        switch (classify_va_fit_type(va, nva_start_addr, size)) {
        case FL_FIT_TYPE:
                /*
                 * The request takes the whole VA, nothing to split.
                 *
                 * |               |
                 * V      NVA      V
                 * |---------------|
                 *
                 * The VA is gone, so there is nothing left to propagate.
                 */
                unlink_va_augment(va, root);
                kmem_cache_free(vmap_area_cachep, va);
                return 0;

        case LE_FIT_TYPE:
                /*
                 * The request sits on the left edge, R remains.
                 *
                 * |       |
                 * V  NVA  V   R
                 * |-------|-------|
                 */
                va->va_start += size;
                break;

        case RE_FIT_TYPE:
                /*
                 * The request sits on the right edge, L remains.
                 *
                 *         |       |
                 *     L   V  NVA  V
                 * |-------|-------|
                 */
                va->va_end = nva_start_addr;
                break;

        case NE_FIT_TYPE:
                /*
                 * The request is in the middle, both L and R remain.
                 *
                 *     |       |
                 *   L V  NVA  V R
                 * |---|-------|---|
                 *
                 * An extra object is needed to describe L. Take the one
                 * this CPU has been preloaded with, if any.
                 */
                lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
                if (unlikely(!lva)) {
                        /*
                         * Nothing has been preloaded. There are two ways
                         * to get here:
                         *
                         * - The percpu allocator, which does no
                         *   pre-allocation on purpose. Its offsets and
                         *   sizes are aligned to a fixed align request,
                         *   so RE_FIT_TYPE and FL_FIT_TYPE are its main
                         *   cases and NE_FIT_TYPE is most likely never
                         *   hit. A few exceptions exist, for example the
                         *   first allocation during early boot up, when
                         *   "one" big free space has to be split.
                         *
                         * - A regular "vmap" allocation whose current CPU
                         *   was not preloaded, see the comment in
                         *   alloc_vmap_area() for why that can be. That
                         *   is rare and most of the time does not occur.
                         *
                         * Either way GFP_NOWAIT is used to get the extra
                         * object. If that allocation fails, the "overflow"
                         * path is triggered to purge lazily freed areas to
                         * free some memory, then the "retry" path repeats
                         * the attempt one more time. See alloc_vmap_area()
                         * for more details.
                         */
                        lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
                        if (!lva)
                                return -1;
                }

                /* L: from the old start up to the request. */
                lva->va_start = va->va_start;
                lva->va_end = nva_start_addr;

                /* R: the VA itself, shrunk to what follows the request. */
                va->va_start = nva_start_addr + size;
                break;

        default:
                /* NOTHING_FIT: the request is not within this VA. */
                return -1;
        }

        /* The VA has shrunk: refresh the cached maxima above it. */
        augment_tree_propagate_from(va);

        /* type == NE_FIT_TYPE: link L, searching from the VA downwards. */
        if (lva)
                insert_vmap_area_augment(lva, &va->rb_node, root, head);

        return 0;
}

static unsigned long
va_alloc(struct vmap_area *va,
                struct rb_root *root, struct list_head *head,
                unsigned long size, unsigned long align,
                unsigned long vstart, unsigned long vend)
{
        /* Start as low as both the VA and vstart permit, then align up. */
        unsigned long lowest = va->va_start > vstart ? va->va_start : vstart;
        unsigned long nva_start_addr = ALIGN(lowest, align);
        int ret;

        /* The request must end within the "vend" restriction. */
        if (nva_start_addr + size > vend)
                return vend;

        /* Carve the range out of the free vmap_area. */
        ret = va_clip(root, head, va, nva_start_addr, size);
        if (WARN_ON_ONCE(ret))
                return vend;

        return nva_start_addr;
}

/*
 * Returns a start address of the newly allocated area, if success.
 * Otherwise a vend is returned that indicates failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
        unsigned long size, unsigned long align,
        unsigned long vstart, unsigned long vend)
{
        unsigned long nva_start_addr;
        bool adjust_search_size;
        struct vmap_area *va;

        /*
         * The search length is adjusted for alignment overhead only if
         * both of the below hold:
         *   a) align > PAGE_SIZE. For smaller alignments it does not
         *      make any sense, all blocks (their start addresses) are
         *      at least PAGE_SIZE aligned anyway;
         *   b) the requested size is not exactly the [vstart:vend]
         *      interval. For such a short range an allocation would
         *      not succeed with an adjusted search length.
         */
        adjust_search_size = align > PAGE_SIZE && (vend - vstart) != size;

        va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
        if (unlikely(!va))
                return vend;

        /* va_alloc() hands back "vend" on failure, pass that through. */
        nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
        if (nva_start_addr != vend)
                find_vmap_lowest_match_check(root, head, size, align);
#endif

        return nva_start_addr;
}

/*
 * Free a region of KVA allocated by alloc_vmap_area
 */
static void free_vmap_area(struct vmap_area *va)
{
        struct vmap_node *vn = addr_to_node(va->va_start);

        /*
         * Remove from the busy tree/list.
         */
        spin_lock(&vn->busy.lock);
        unlink_va(va, &vn->busy.root);
        spin_unlock(&vn->busy.lock);

        /*
         * Insert/Merge it back to the free tree/list.
         */
        spin_lock(&free_vmap_area_lock);
        merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
        spin_unlock(&free_vmap_area_lock);
}

static inline void
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
{
        struct vmap_area *spare = NULL;

        /*
         * Make sure the CPU doing an allocation has one extra vmap_area
         * object at hand, it is consumed when the fit type of a free
         * area turns out to be NE_FIT_TYPE.
         *
         * The object is allocated before the lock is taken, i.e. in
         * non-atomic context. That permits more permissive allocation
         * masks, which is more stable under low memory condition and
         * high memory pressure.
         */
        if (!this_cpu_read(ne_fit_preload_node))
                spare = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

        spin_lock(lock);

        /* Nothing has been allocated, so there is nothing to install. */
        if (!spare)
                return;

        /*
         * Install the object unless the slot of the current CPU has
         * been filled in the meantime. In that case give it back.
         */
        if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, spare))
                kmem_cache_free(vmap_area_cachep, spare);
}

/*
 * Map an allocation size to the pool of @vn that caches areas of that
 * size. Pools are indexed by (size - 1) / PAGE_SIZE; sizes whose index
 * is not below MAX_VA_SIZE_PAGES have no pool and yield NULL.
 */
static struct vmap_pool *
size_to_va_pool(struct vmap_node *vn, unsigned long size)
{
        unsigned int slot = (size - 1) / PAGE_SIZE;

        if (slot >= MAX_VA_SIZE_PAGES)
                return NULL;

        return &vn->pool[slot];
}

static bool
node_pool_add_va(struct vmap_node *n, struct vmap_area *va)
{
        struct vmap_pool *vp;

        vp = size_to_va_pool(n, va_size(va));
        if (!vp)
                return false;

        spin_lock(&n->pool_lock);
        list_add(&va->list, &vp->head);
        WRITE_ONCE(vp->len, vp->len + 1);
        spin_unlock(&n->pool_lock);

        return true;
}

/*
 * Try to take a cached area of exactly @size bytes out of the pool of
 * node @vn.
 *
 * Only the first entry of the pool is considered. If its start address
 * is not aligned to @align it is rotated to the tail of the pool and
 * NULL is returned. If it is aligned but fails the size/range sanity
 * checks, a one-time warning is emitted, the entry stays in the pool
 * and NULL is returned.
 */
static struct vmap_area *
node_pool_del_va(struct vmap_node *vn, unsigned long size,
                unsigned long align, unsigned long vstart,
                unsigned long vend)
{
        struct vmap_area *first, *va = NULL;
        struct vmap_pool *vp;
        int err;

        vp = size_to_va_pool(vn, size);

        /* Lockless peek; the list is checked again under the lock. */
        if (!vp || list_empty(&vp->head))
                return NULL;

        spin_lock(&vn->pool_lock);
        if (list_empty(&vp->head))
                goto unlock;

        first = list_first_entry(&vp->head, struct vmap_area, list);

        if (!IS_ALIGNED(first->va_start, align)) {
                list_move_tail(&first->list, &vp->head);
                goto unlock;
        }

        /*
         * Sanity check the candidate and warn if it does not
         * match the request.
         */
        err = va_size(first) != size ||
              first->va_start < vstart ||
              first->va_end > vend;

        if (WARN_ON_ONCE(err))
                goto unlock;

        list_del_init(&first->list);
        WRITE_ONCE(vp->len, vp->len - 1);
        va = first;

unlock:
        spin_unlock(&vn->pool_lock);

        return va;
}

/*
 * Try to satisfy a request from the pool of the node selected by the
 * current CPU.
 *
 * On return *addr is the start of the obtained area, or @vend if nothing
 * was obtained. *vn_id is 0 when the node pools are bypassed, otherwise
 * the encoded id of the selected node (set even when the pool had
 * nothing suitable).
 */
static struct vmap_area *
node_alloc(unsigned long size, unsigned long align,
                unsigned long vstart, unsigned long vend,
                unsigned long *addr, unsigned int *vn_id)
{
        struct vmap_area *va;
        unsigned int id;

        *addr = vend;
        *vn_id = 0;

        /*
         * Node pools only serve the vmalloc range and are pointless
         * with a single node: let the caller use the global heap.
         */
        if (nr_vmap_nodes == 1 ||
                        vstart != VMALLOC_START || vend != VMALLOC_END)
                return NULL;

        id = raw_smp_processor_id() % nr_vmap_nodes;
        va = node_pool_del_va(id_to_node(id), size, align, vstart, vend);
        *vn_id = encode_vn_id(id);

        if (!va)
                return NULL;

        *addr = va->va_start;
        return va;
}

/*
 * Bind @vm to @va: fill in the address, size, flags and caller of the
 * vm_struct from the vmap_area and make the vmap_area point back at it.
 */
static inline void setup_vmalloc_vm(struct vm_struct *vm,
        struct vmap_area *va, unsigned long flags, const void *caller)
{
        /* Describe the range covered by the area ... */
        vm->addr = (void *)va->va_start;
        vm->size = va->va_end - va->va_start;

        /* ... record who asked for it and how ... */
        vm->flags = flags;
        vm->caller = caller;

        /* ... and link the area back to its vm_struct. */
        va->vm = vm;
}

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend. If vm is passed in, the two will also be bound.
 *
 * @size must be non-zero and page aligned and @align must be a power of
 * two, otherwise ERR_PTR(-EINVAL) is returned. ERR_PTR(-EBUSY) is returned
 * when the vmap layer is not initialized yet or when no address range was
 * found even after purging; ERR_PTR(-ENOMEM) when no vmap_area object
 * could be allocated. An error from kasan_populate_vmalloc() is passed
 * through after the area has been released again. May sleep.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
                                unsigned long align,
                                unsigned long vstart, unsigned long vend,
                                int node, gfp_t gfp_mask,
                                unsigned long va_flags, struct vm_struct *vm)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        unsigned long freed;
        unsigned long addr;
        unsigned int vn_id;
        int purged = 0;
        int ret;

        if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
                return ERR_PTR(-EINVAL);

        if (unlikely(!vmap_initialized))
                return ERR_PTR(-EBUSY);

        might_sleep();

        /*
         * If a VA is obtained from a global heap(if it fails here)
         * it is anyway marked with this "vn_id" so it is returned
         * to this pool's node later. Such way gives a possibility
         * to populate pools based on users demand.
         *
         * On success a ready to go VA is returned.
         */
        va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
        if (!va) {
                /*
                 * Nothing came from the node pool, so "addr" still equals
                 * "vend". Allocate a fresh descriptor; its address range
                 * is searched for in the global heap below.
                 */
                gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

                va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
                if (unlikely(!va))
                        return ERR_PTR(-ENOMEM);

                /*
                 * Only scan the relevant parts containing pointers to other objects
                 * to avoid false negatives.
                 */
                kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
        }

retry:
        /* addr == vend means "no address yet": search the global free tree. */
        if (addr == vend) {
                preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
                addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
                        size, align, vstart, vend);
                spin_unlock(&free_vmap_area_lock);
        }

        trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);

        /*
         * If an allocation fails, the "vend" address is
         * returned. Therefore trigger the overflow path.
         */
        if (unlikely(addr == vend))
                goto overflow;

        va->va_start = addr;
        va->va_end = addr + size;
        va->vm = NULL;
        /* The node id is kept in the flags next to the caller's flags. */
        va->flags = (va_flags | vn_id);

        if (vm) {
                vm->addr = (void *)va->va_start;
                vm->size = va->va_end - va->va_start;
                va->vm = vm;
        }

        /* Publish the area in the busy tree of the node covering its start. */
        vn = addr_to_node(va->va_start);

        spin_lock(&vn->busy.lock);
        insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
        spin_unlock(&vn->busy.lock);

        BUG_ON(!IS_ALIGNED(va->va_start, align));
        BUG_ON(va->va_start < vstart);
        BUG_ON(va->va_end > vend);

        ret = kasan_populate_vmalloc(addr, size);
        if (ret) {
                /* Undo the insertion: the range goes back to the free tree. */
                free_vmap_area(va);
                return ERR_PTR(ret);
        }

        return va;

overflow:
        /* First attempt to recover: purge lazily freed areas and retry. */
        if (!purged) {
                reclaim_and_purge_vmap_areas();
                purged = 1;
                goto retry;
        }

        /*
         * Second attempt: run the vmap purge notifier chain. If it left a
         * non-zero value in "freed", start over with purging re-armed.
         */
        freed = 0;
        blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);

        if (freed > 0) {
                purged = 0;
                goto retry;
        }

        if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
                pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
                        size);

        /* No address range could be found: release the unused descriptor. */
        kmem_cache_free(vmap_area_cachep, va);
        return ERR_PTR(-EBUSY);
}

/*
 * Add @nb to vmap_notify_list, the blocking notifier chain that
 * alloc_vmap_area() runs when no address range can be found even after
 * purging. Returns the result of blocking_notifier_chain_register().
 */
int register_vmap_purge_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

/*
 * Remove @nb from the vmap_notify_list notifier chain. Returns the
 * result of blocking_notifier_chain_unregister().
 */
int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);

/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
        /* Number of pages that make up 32MB of address space. */
        const unsigned long pages_per_step = 32UL * 1024 * 1024 / PAGE_SIZE;
        /* Log scale: position of the highest set bit of the online CPU count. */
        unsigned int nr_steps = fls(num_online_cpus());

        return nr_steps * pages_per_step;
}

/*
 * Number of pages covered by lazily freed areas that have not been
 * purged yet: increased in free_vmap_area_noflush(), decreased in
 * purge_vmap_node().
 */
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);

/*
 * Serialize vmap purging.  There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
/*
 * Mask of vmap node indices (used as bit numbers, they are not CPU ids)
 * that have lazily freed areas to purge. Rebuilt on every call of
 * __purge_vmap_area_lazy().
 */
static cpumask_t purge_nodes;

/*
 * Move every area on @head into the global free tree/list, inserting or
 * merging each one under free_vmap_area_lock. An empty list is a no-op
 * and does not touch the lock.
 */
static void
reclaim_list_global(struct list_head *head)
{
        struct vmap_area *cur, *next;

        if (!list_empty(head)) {
                spin_lock(&free_vmap_area_lock);
                list_for_each_entry_safe(cur, next, head, list)
                        merge_or_add_vmap_area_augment(cur,
                                &free_vmap_area_root, &free_vmap_area_list);
                spin_unlock(&free_vmap_area_lock);
        }
}

/*
 * Shrink the size-indexed pools of node @vn and give the removed areas
 * back to the global free tree.
 *
 * @full_decay: when true every pool is emptied completely; otherwise
 *              each pool loses a quarter (rounded down) of its entries.
 *
 * Removed areas are first merged in a local tree/list and then handed to
 * reclaim_list_global() in one go.
 */
static void
decay_va_pool_node(struct vmap_node *vn, bool full_decay)
{
        struct vmap_area *va, *nva;
        struct list_head decay_list;
        struct rb_root decay_root;
        unsigned long n_decay;
        int i;

        decay_root = RB_ROOT;
        INIT_LIST_HEAD(&decay_list);

        for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
                struct list_head tmp_list;

                if (list_empty(&vn->pool[i].head))
                        continue;

                INIT_LIST_HEAD(&tmp_list);

                /* Detach the pool, so no-one can access it. */
                spin_lock(&vn->pool_lock);
                list_replace_init(&vn->pool[i].head, &tmp_list);
                spin_unlock(&vn->pool_lock);

                if (full_decay)
                        WRITE_ONCE(vn->pool[i].len, 0);

                /* Decay a pool by ~25% out of left objects. */
                n_decay = vn->pool[i].len >> 2;

                list_for_each_entry_safe(va, nva, &tmp_list, list) {
                        if (!full_decay) {
                                /*
                                 * Check the budget before consuming it. A
                                 * pool with fewer than four entries has a
                                 * budget of zero and must be left alone:
                                 * decrementing first would wrap the
                                 * unsigned counter and drain the whole
                                 * pool.
                                 */
                                if (!n_decay)
                                        break;

                                n_decay--;
                                WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
                        }

                        list_del_init(&va->list);
                        merge_or_add_vmap_area(va, &decay_root, &decay_list);
                }

                /*
                 * Attach the pool back if it has been partly decayed.
                 * Please note, it is supposed that nobody(other contexts)
                 * can populate the pool therefore a simple list replace
                 * operation takes place here.
                 */
                if (!full_decay && !list_empty(&tmp_list)) {
                        spin_lock(&vn->pool_lock);
                        list_replace_init(&tmp_list, &vn->pool[i].head);
                        spin_unlock(&vn->pool_lock);
                }
        }

        reclaim_list_global(&decay_list);
}

/*
 * Purge the areas collected on vn->purge_list of the node embedding
 * @work. Used as a work item handler and also called directly by
 * __purge_vmap_area_lazy().
 *
 * Each area is either cached in a pool of this node or returned to the
 * global free tree. vn->nr_purged is set to the number of processed
 * areas.
 */
static void purge_vmap_node(struct work_struct *work)
{
        struct vmap_node *vn = container_of(work,
                struct vmap_node, purge_work);
        struct vmap_area *va, *n_va;
        LIST_HEAD(local_list);

        vn->nr_purged = 0;

        list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
                unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
                unsigned long orig_start = va->va_start;
                unsigned long orig_end = va->va_end;
                unsigned int vn_id = decode_vn_id(va->flags);

                list_del_init(&va->list);

                if (is_vmalloc_or_module_addr((void *)orig_start))
                        kasan_release_vmalloc(orig_start, orig_end,
                                              va->va_start, va->va_end);

                /* The area is no longer lazy: drop its pages from the count. */
                atomic_long_sub(nr, &vmap_lazy_nr);
                vn->nr_purged++;

                /*
                 * Areas that carry a valid node id are cached in a pool of
                 * this node unless pool population is disabled for this
                 * purge or no pool exists for the size.
                 */
                if (is_vn_id_valid(vn_id) && !vn->skip_populate)
                        if (node_pool_add_va(vn, va))
                                continue;

                /* Go back to global. */
                list_add(&va->list, &local_list);
        }

        reclaim_list_global(&local_list);
}

/*
 * Purges all lazily-freed vmap areas.
 *
 * Must be called with vmap_purge_lock held.
 *
 * @start, @end: address range that the caller already wants flushed; it
 *               is widened to cover the areas found on the lazy lists
 *               before the TLB flush.
 * @full_pool_decay: passed to decay_va_pool_node() for every node; when
 *               true, purged areas are also not put back into node pools.
 *
 * Returns true if at least one area has been purged.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
                bool full_pool_decay)
{
        unsigned long nr_purged_areas = 0;
        unsigned int nr_purge_helpers;
        unsigned int nr_purge_nodes;
        struct vmap_node *vn;
        int i;

        lockdep_assert_held(&vmap_purge_lock);

        /*
         * Use cpumask to mark which node has to be processed.
         */
        purge_nodes = CPU_MASK_NONE;

        for (i = 0; i < nr_vmap_nodes; i++) {
                vn = &vmap_nodes[i];

                INIT_LIST_HEAD(&vn->purge_list);
                vn->skip_populate = full_pool_decay;
                decay_va_pool_node(vn, full_pool_decay);

                if (RB_EMPTY_ROOT(&vn->lazy.root))
                        continue;

                /* Detach the whole lazy tree/list of the node in one step. */
                spin_lock(&vn->lazy.lock);
                WRITE_ONCE(vn->lazy.root.rb_node, NULL);
                list_replace_init(&vn->lazy.head, &vn->purge_list);
                spin_unlock(&vn->lazy.lock);

                /*
                 * Only the first and the last entry are inspected to widen
                 * the flush range, which presumably relies on the list
                 * being sorted by address - TODO confirm.
                 */
                start = min(start, list_first_entry(&vn->purge_list,
                        struct vmap_area, list)->va_start);

                end = max(end, list_last_entry(&vn->purge_list,
                        struct vmap_area, list)->va_end);

                cpumask_set_cpu(i, &purge_nodes);
        }

        nr_purge_nodes = cpumask_weight(&purge_nodes);
        if (nr_purge_nodes > 0) {
                flush_tlb_kernel_range(start, end);

                /* One extra worker is per a lazy_max_pages() full set minus one. */
                nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
                nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1;

                for_each_cpu(i, &purge_nodes) {
                        vn = &vmap_nodes[i];

                        if (nr_purge_helpers > 0) {
                                INIT_WORK(&vn->purge_work, purge_vmap_node);

                                /* Here the node index doubles as a CPU number. */
                                if (cpumask_test_cpu(i, cpu_online_mask))
                                        schedule_work_on(i, &vn->purge_work);
                                else
                                        schedule_work(&vn->purge_work);

                                nr_purge_helpers--;
                        } else {
                                /*
                                 * Purge in this context. A NULL func marks
                                 * the node as "not offloaded" for the wait
                                 * loop below.
                                 */
                                vn->purge_work.func = NULL;
                                purge_vmap_node(&vn->purge_work);
                                nr_purged_areas += vn->nr_purged;
                        }
                }

                /* Wait for the offloaded nodes and collect their counters. */
                for_each_cpu(i, &purge_nodes) {
                        vn = &vmap_nodes[i];

                        if (vn->purge_work.func) {
                                flush_work(&vn->purge_work);
                                nr_purged_areas += vn->nr_purged;
                        }
                }
        }

        trace_purge_vmap_area_lazy(start, end, nr_purged_areas);
        return nr_purged_areas > 0;
}

/*
 * Reclaim vmap areas: purge the fragmented per-CPU vmap blocks of all
 * CPUs, then purge all lazily freed areas with a full decay of the node
 * pools. Both steps run under vmap_purge_lock.
 */
static void reclaim_and_purge_vmap_areas(void)

{
        mutex_lock(&vmap_purge_lock);
        purge_fragmented_blocks_allcpus();
        __purge_vmap_area_lazy(ULONG_MAX, 0, true);
        mutex_unlock(&vmap_purge_lock);
}

/*
 * Purge the lazily freed areas under vmap_purge_lock, with only a
 * partial decay of the node pools. Takes a work_struct argument (unused
 * here); free_vmap_area_noflush() schedules drain_vmap_work, presumably
 * bound to this handler - TODO confirm at the DECLARE_WORK site.
 */
static void drain_vmap_area_work(struct work_struct *work)
{
        mutex_lock(&vmap_purge_lock);
        /* ULONG_MAX/0 is an empty range; it is widened by the purged areas. */
        __purge_vmap_area_lazy(ULONG_MAX, 0, false);
        mutex_unlock(&vmap_purge_lock);
}

/*
 * Free a vmap area, caller ensuring that the area has been unmapped,
 * unlinked and flush_cache_vunmap had been called for the correct
 * range previously.
 *
 * The area is not released right away: it is queued in the lazy tree of
 * a node and accounted in vmap_lazy_nr. Once more than lazy_max_pages()
 * pages are pending, the drain work is scheduled.
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
        unsigned long nr_lazy_max = lazy_max_pages();
        /* Sampled up front: va must not be touched once it is queued. */
        unsigned long va_start = va->va_start;
        unsigned int vn_id = decode_vn_id(va->flags);
        struct vmap_node *vn;
        unsigned long nr_lazy;

        /* The caller must have unlinked the area already. */
        if (WARN_ON_ONCE(!list_empty(&va->list)))
                return;

        nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
                                PAGE_SHIFT, &vmap_lazy_nr);

        /*
         * If it was requested by a certain node we would like to
         * return it to that node, i.e. its pool for later reuse.
         */
        vn = is_vn_id_valid(vn_id) ?
                id_to_node(vn_id):addr_to_node(va->va_start);

        spin_lock(&vn->lazy.lock);
        insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
        spin_unlock(&vn->lazy.lock);

        trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);

        /* After this point, we may free va at any time */
        if (unlikely(nr_lazy > nr_lazy_max))
                schedule_work(&drain_vmap_work);
}

/*
 * Free and unmap a vmap area: flush the cache for the range, remove the
 * mappings without a TLB flush and queue the area for lazy freeing. The
 * TLB is flushed immediately only when debug_pagealloc is enabled.
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
        flush_cache_vunmap(va->va_start, va->va_end);
        vunmap_range_noflush(va->va_start, va->va_end);
        if (debug_pagealloc_enabled_static())
                flush_tlb_kernel_range(va->va_start, va->va_end);

        free_vmap_area_noflush(va);
}

/*
 * Look up the busy vmap area that contains @addr. Returns NULL when no
 * node knows about the address or when the vmap layer is not
 * initialized yet.
 */
struct vmap_area *find_vmap_area(unsigned long addr)
{
        struct vmap_node *node;
        struct vmap_area *found;
        int first, cur;

        if (unlikely(!vmap_initialized))
                return NULL;

        /*
         * addr_to_node_id(addr) converts an address to the index of the
         * node where a VA is located. If a VA spans several zones and
         * the passed addr is not the same as va->va_start, which is not
         * common, extra nodes may have to be scanned. An example:
         *
         *      <----va---->
         * -|-----|-----|-----|-----|-
         *     1     2     0     1
         *
         * The VA resides in node 1 whereas it spans 1, 2 and 0. If the
         * passed addr is within nodes 2 or 0, extra work is needed.
         */
        first = addr_to_node_id(addr);
        cur = first;

        do {
                node = &vmap_nodes[cur];

                spin_lock(&node->busy.lock);
                found = __find_vmap_area(addr, &node->busy.root);
                spin_unlock(&node->busy.lock);

                if (found)
                        return found;

                /* Try the next node, wrapping around after the last one. */
                cur = (cur + 1) % nr_vmap_nodes;
        } while (cur != first);

        return NULL;
}

/*
 * Look up the busy vmap area that contains @addr and, if found, unlink
 * it from the busy tree of its node while the busy lock of that node is
 * still held. Returns the unlinked area or NULL.
 */
static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
{
        struct vmap_node *node;
        struct vmap_area *found;
        int first, cur;

        /*
         * The nodes are walked in the same wrap-around order as in
         * find_vmap_area(), starting at the node that @addr maps to.
         */
        first = addr_to_node_id(addr);
        cur = first;

        do {
                node = &vmap_nodes[cur];

                spin_lock(&node->busy.lock);
                found = __find_vmap_area(addr, &node->busy.root);
                if (found)
                        unlink_va(found, &node->busy.root);
                spin_unlock(&node->busy.lock);

                if (found)
                        return found;

                cur = (cur + 1) % nr_vmap_nodes;
        } while (cur != first);

        return NULL;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE                (VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
/* Rough guess of the vmalloc space size: 128MB on 32-bit, 128GB otherwise. */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE                (128UL*1024*1024)
#else
#define VMALLOC_SPACE                (128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES                (VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC                BITS_PER_LONG        /* in pages; 256K with 4K pages on 64-bit */
#define VMAP_BBMAP_BITS_MAX        1024        /* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN        (VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)                ((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)                ((x) > (y) ? (x) : (y)) /* can't use max() */
/*
 * Number of pages per vmap block: VMALLOC_PAGES divided by the CPU count
 * (rounded up to a power of two) and by 16, raised to at least
 * VMAP_BBMAP_BITS_MIN and then capped at VMAP_BBMAP_BITS_MAX.
 */
#define VMAP_BBMAP_BITS                \
                VMAP_MIN(VMAP_BBMAP_BITS_MAX,        \
                VMAP_MAX(VMAP_BBMAP_BITS_MIN,        \
                        VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

/* Size of one vmap block in bytes. */
#define VMAP_BLOCK_SIZE                (VMAP_BBMAP_BITS * PAGE_SIZE)

/*
 * Purge threshold to prevent overeager purging of fragmented blocks for
 * regular operations: Purge if vb->free is less than 1/4 of the capacity.
 */
#define VMAP_PURGE_THRESHOLD        (VMAP_BBMAP_BITS / 4)

#define VMAP_RAM                0x1 /* indicates vm_map_ram area*/
#define VMAP_BLOCK                0x2 /* mark out the vmap_block sub-type*/
#define VMAP_FLAGS_MASK                0x3 /* VMAP_RAM | VMAP_BLOCK */

/*
 * Per-CPU bookkeeping for vmap blocks: a list of blocks that still have
 * free space plus an xarray used to look a block up by address.
 */
struct vmap_block_queue {
        /* Taken around additions to and removals from the "free" list. */
        spinlock_t lock;
        /* Blocks with free space, linked through vmap_block::free_list. */
        struct list_head free;

        /*
         * An xarray requires an extra memory dynamically to
         * be allocated. If it is an issue, we can use rb-tree
         * instead.
         *
         * Blocks are stored under the index computed by addr_to_vb_idx();
         * which queue's xarray is used is decided by addr_to_vb_xa().
         */
        struct xarray vmap_blocks;
};

/*
 * One vmap block: a VMAP_BLOCK_SIZE region of KVA that is handed out in
 * power-of-two chunks of pages by vb_alloc() and given back by vb_free().
 */
struct vmap_block {
        /* Held while the counters, bitmap and dirty range are updated. */
        spinlock_t lock;
        /* The vmap area backing this block. */
        struct vmap_area *va;
        /*
         * Page counts: "free" pages have never been handed out (they sit
         * at the end of the block), "dirty" pages have been handed out
         * and returned through vb_free().
         */
        unsigned long free, dirty;
        /* One bit per page that is currently handed out. */
        DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
        unsigned long dirty_min, dirty_max; /*< dirty range, in pages, not yet TLB flushed */
        /* Link in vmap_block_queue::free. */
        struct list_head free_list;
        /* Used by kfree_rcu() in free_vmap_block(). */
        struct rcu_head rcu_head;
        /* Link in a local list of blocks that are about to be freed. */
        struct list_head purge;
        /* CPU whose vmap_block_queue the block was queued on. */
        unsigned int cpu;
};

/*
 * Queue of free and dirty vmap blocks, for allocation and flushing purposes.
 * There is one instance per CPU; addr_to_vb_xa() additionally indexes the
 * instances by a value derived from an address rather than by a CPU id.
 */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * To quickly find the "vmap_block" associated with a
 * specific address, we use a hash.
 *
 * A per-cpu vmap_block_queue is used in both ways, to serialize
 * an access to free block chains among CPUs(alloc path) and it
 * also acts as a vmap_block hash(alloc/free paths). It means we
 * overload it, since we already have the per-cpu array which is
 * used as a hash table. When used as a hash a 'cpu' passed to
 * per_cpu() is not actually a CPU but rather a hash index.
 *
 * A hash function is addr_to_vb_xa() which hashes any address
 * to a specific index(in a hash) it belongs to. This then uses a
 * per_cpu() macro to access an array with generated index.
 *
 * An example:
 *
 *  CPU_1  CPU_2  CPU_0
 *    |      |      |
 *    V      V      V
 * 0     10     20     30     40     50     60
 * |------|------|------|------|------|------|...<vmap address space>
 *   CPU0   CPU1   CPU2   CPU0   CPU1   CPU2
 *
 * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
 *   it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
 *
 * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
 *   it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
 *
 * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
 *   it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
 *
 * This technique almost always avoids lock contention on insert/remove,
 * however xarray spinlocks protect against any contention that remains.
 */
static struct xarray *
addr_to_vb_xa(unsigned long addr)
{
        int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();

        return &per_cpu(vmap_block_queue, index).vmap_blocks;
}

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

/*
 * Convert @addr to the index of its vmap block, counted from
 * VMALLOC_START rounded down to a VMAP_BLOCK_SIZE boundary.
 */
static unsigned long addr_to_vb_idx(unsigned long addr)
{
        unsigned long base = VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);

        return (addr - base) / VMAP_BLOCK_SIZE;
}

/*
 * Return the address that lies @pages_off pages into the vmap block
 * starting at @va_start. It is a bug if the result falls into a
 * different block than @va_start.
 */
static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
        unsigned long vaddr = va_start + (pages_off << PAGE_SHIFT);

        BUG_ON(addr_to_vb_idx(vaddr) != addr_to_vb_idx(va_start));

        return (void *)vaddr;
}

/**
 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
 *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
 * @order:    how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * The occupied pages are the first 2^order pages of the block. The block is
 * registered in the address-indexed xarray and queued on the free list of
 * the CPU this function runs on.
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        struct vmap_area *va;
        struct xarray *xa;
        unsigned long vb_idx;
        int node, err;
        void *vaddr;

        node = numa_node_id();

        vb = kmalloc_node(sizeof(struct vmap_block),
                        gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!vb))
                return ERR_PTR(-ENOMEM);

        va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
                                        VMALLOC_START, VMALLOC_END,
                                        node, gfp_mask,
                                        VMAP_RAM|VMAP_BLOCK, NULL);
        if (IS_ERR(va)) {
                kfree(vb);
                return ERR_CAST(va);
        }

        vaddr = vmap_block_vaddr(va->va_start, 0);
        spin_lock_init(&vb->lock);
        vb->va = va;
        /* At least something should be left free */
        BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
        bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
        vb->free = VMAP_BBMAP_BITS - (1UL << order);
        vb->dirty = 0;
        vb->dirty_min = VMAP_BBMAP_BITS;
        vb->dirty_max = 0;
        bitmap_set(vb->used_map, 0, (1UL << order));
        INIT_LIST_HEAD(&vb->free_list);
        /*
         * Record the CPU before the block is published in the xarray.
         * Once xa_insert() succeeds, a concurrent xarray walker such as
         * _vm_unmap_aliases() may pass the block to
         * purge_fragmented_block(), which feeds vb->cpu to per_cpu().
         * An uninitialised value there is an out-of-bounds access.
         */
        vb->cpu = raw_smp_processor_id();

        xa = addr_to_vb_xa(va->va_start);
        vb_idx = addr_to_vb_idx(va->va_start);
        err = xa_insert(xa, vb_idx, vb, gfp_mask);
        if (err) {
                kfree(vb);
                free_vmap_area(va);
                return ERR_PTR(err);
        }
        /*
         * list_add_tail_rcu() may run on a different core than
         * vb->cpu due to task migration. That is safe, because
         * list_add_tail_rcu() together with list_for_each_rcu on
         * the read side keeps the list consistent.
         */
        vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
        spin_lock(&vbq->lock);
        list_add_tail_rcu(&vb->free_list, &vbq->free);
        spin_unlock(&vbq->lock);

        return vaddr;
}

/*
 * Tear down a vmap block: remove it from the address-indexed xarray,
 * unlink its vmap area from the busy tree, queue the area for lazy
 * freeing and free the block itself through kfree_rcu().
 */
static void free_vmap_block(struct vmap_block *vb)
{
        struct vmap_node *vn;
        struct vmap_block *tmp;
        struct xarray *xa;

        xa = addr_to_vb_xa(vb->va->va_start);
        tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
        /* The xarray slot must have held exactly this block. */
        BUG_ON(tmp != vb);

        vn = addr_to_node(vb->va->va_start);
        spin_lock(&vn->busy.lock);
        unlink_va(vb->va, &vn->busy.root);
        spin_unlock(&vn->busy.lock);

        free_vmap_area_noflush(vb->va);
        kfree_rcu(vb, rcu_head);
}

/*
 * Decide whether @vb can be purged and, if so, retire it: mark it as
 * fully dirty with nothing free, take it off the free list of its queue
 * and add it to @purge_list.
 *
 * A block qualifies only if every page is either free or dirty (nothing
 * is in use) and it is not fully dirty already. Unless @force_purge is
 * set, it must also have fewer than VMAP_PURGE_THRESHOLD free pages.
 *
 * Both callers visible in this file hold vb->lock around the call.
 *
 * Returns true if the block was moved to @purge_list.
 */
static bool purge_fragmented_block(struct vmap_block *vb,
                struct list_head *purge_list, bool force_purge)
{
        struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu);

        if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
            vb->dirty == VMAP_BBMAP_BITS)
                return false;

        /* Don't overeagerly purge usable blocks unless requested */
        if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD))
                return false;

        /* prevent further allocs after releasing lock */
        WRITE_ONCE(vb->free, 0);
        /* prevent purging it again */
        WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
        /* The whole block now counts as dirty range. */
        vb->dirty_min = 0;
        vb->dirty_max = VMAP_BBMAP_BITS;
        spin_lock(&vbq->lock);
        list_del_rcu(&vb->free_list);
        spin_unlock(&vbq->lock);
        list_add_tail(&vb->purge, purge_list);
        return true;
}

static void free_purged_blocks(struct list_head *purge_list)
{
        struct vmap_block *vb, *n_vb;

        list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
                list_del(&vb->purge);
                free_vmap_block(vb);
        }
}

/*
 * Force-purge the purgeable blocks on the free list of the queue of
 * @cpu. Candidates are collected under rcu_read_lock() and freed once
 * the walk is over.
 */
static void purge_fragmented_blocks(int cpu)
{
        LIST_HEAD(purge);
        struct vmap_block *vb;
        struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

        rcu_read_lock();
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                unsigned long free = READ_ONCE(vb->free);
                unsigned long dirty = READ_ONCE(vb->dirty);

                /*
                 * Lockless pre-check that skips blocks which cannot be
                 * purged; purge_fragmented_block() repeats the test with
                 * vb->lock held.
                 */
                if (free + dirty != VMAP_BBMAP_BITS ||
                    dirty == VMAP_BBMAP_BITS)
                        continue;

                spin_lock(&vb->lock);
                purge_fragmented_block(vb, &purge, true);
                spin_unlock(&vb->lock);
        }
        rcu_read_unlock();
        free_purged_blocks(&purge);
}

/* Run purge_fragmented_blocks() for every possible CPU. */
static void purge_fragmented_blocks_allcpus(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                purge_fragmented_blocks(cpu);
}

/*
 * Allocate @size bytes of KVA out of a per-CPU vmap block.
 *
 * @size must be page aligned and must not exceed VMAP_MAX_ALLOC pages
 * (both enforced with BUG_ON). The request is rounded up to
 * 2^get_order(size) pages. The blocks on the free list of the local
 * queue are tried first; if none has enough room, a new block is
 * allocated with @gfp_mask.
 *
 * Returns the address, ERR_PTR(-EINVAL) for a zero @size, or the
 * ERR_PTR() produced by new_vmap_block().
 */
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        void *vaddr = NULL;
        unsigned int order;

        BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
        if (WARN_ON(size == 0)) {
                /*
                 * Allocating 0 bytes isn't what caller wants since
                 * get_order(0) returns funny result. Just warn and terminate
                 * early.
                 */
                return ERR_PTR(-EINVAL);
        }
        order = get_order(size);

        rcu_read_lock();
        vbq = raw_cpu_ptr(&vmap_block_queue);
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                unsigned long pages_off;

                /* Cheap lockless check first, repeated under vb->lock below. */
                if (READ_ONCE(vb->free) < (1UL << order))
                        continue;

                spin_lock(&vb->lock);
                if (vb->free < (1UL << order)) {
                        spin_unlock(&vb->lock);
                        continue;
                }

                /* Free pages sit at the end of the block: carve from their start. */
                pages_off = VMAP_BBMAP_BITS - vb->free;
                vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
                WRITE_ONCE(vb->free, vb->free - (1UL << order));
                bitmap_set(vb->used_map, pages_off, (1UL << order));
                /* A block without free pages leaves the free list. */
                if (vb->free == 0) {
                        spin_lock(&vbq->lock);
                        list_del_rcu(&vb->free_list);
                        spin_unlock(&vbq->lock);
                }

                spin_unlock(&vb->lock);
                break;
        }

        rcu_read_unlock();

        /* Allocate new block if nothing was found */
        if (!vaddr)
                vaddr = new_vmap_block(order, gfp_mask);

        return vaddr;
}

/*
 * Give @size bytes at @addr back to the vmap block they belong to.
 *
 * The pages are cleared in the used bitmap, unmapped without a TLB flush
 * (unless debug_pagealloc is enabled) and accounted as dirty. When the
 * whole block has become dirty it is freed.
 *
 * @size must be page aligned and must not exceed VMAP_MAX_ALLOC pages.
 * NOTE(review): the result of xa_load() is used without a NULL check, so
 * @addr is assumed to lie inside a live vmap block - confirm callers.
 */
static void vb_free(unsigned long addr, unsigned long size)
{
        unsigned long offset;
        unsigned int order;
        struct vmap_block *vb;
        struct xarray *xa;

        BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

        flush_cache_vunmap(addr, addr + size);

        order = get_order(size);
        /* Page offset of @addr inside its block. */
        offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;

        xa = addr_to_vb_xa(addr);
        vb = xa_load(xa, addr_to_vb_idx(addr));

        spin_lock(&vb->lock);
        bitmap_clear(vb->used_map, offset, (1UL << order));
        spin_unlock(&vb->lock);

        vunmap_range_noflush(addr, addr + size);

        if (debug_pagealloc_enabled_static())
                flush_tlb_kernel_range(addr, addr + size);

        spin_lock(&vb->lock);

        /* Expand the not yet TLB flushed dirty range */
        vb->dirty_min = min(vb->dirty_min, offset);
        vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

        WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order));
        if (vb->dirty == VMAP_BBMAP_BITS) {
                /* Everything is dirty, so nothing can be free any more. */
                BUG_ON(vb->free);
                spin_unlock(&vb->lock);
                free_vmap_block(vb);
        } else
                spin_unlock(&vb->lock);
}

/*
 * _vm_unmap_aliases - collect dirty vmap-block ranges, purge, and flush
 * @start: lower bound of a range the caller already wants flushed
 * @end:   upper bound of that range
 * @flush: non-zero if [@start, @end) has to be flushed even when no dirty
 *         vmap block is found
 *
 * Does nothing until vmap_initialized is set. Otherwise, under
 * vmap_purge_lock, every possible CPU's vmap_block_queue is walked: blocks
 * accepted by purge_fragmented_block() are gathered on a local list and
 * released with free_purged_blocks(); for the remaining blocks with a
 * pending dirty range, [@start, @end) is widened to cover it and the
 * block's dirty range is reset. If __purge_vmap_area_lazy() returns zero
 * and a flush is wanted, the accumulated range is flushed here with
 * flush_tlb_kernel_range().
 */
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
        LIST_HEAD(purge_list);
        int cpu;

        if (unlikely(!vmap_initialized))
                return;

        mutex_lock(&vmap_purge_lock);

        for_each_possible_cpu(cpu) {
                struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
                struct vmap_block *vb;
                unsigned long idx;

                /* The xarray walk is done under RCU; each block is locked individually */
                rcu_read_lock();
                xa_for_each(&vbq->vmap_blocks, idx, vb) {
                        spin_lock(&vb->lock);

                        /*
                         * Try to purge a fragmented block first. If it's
                         * not purgeable, check whether there is dirty
                         * space to be flushed.
                         */
                        if (!purge_fragmented_block(vb, &purge_list, false) &&
                            vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
                                unsigned long va_start = vb->va->va_start;
                                unsigned long s, e;

                                /* Convert the dirty page indices to addresses */
                                s = va_start + (vb->dirty_min << PAGE_SHIFT);
                                e = va_start + (vb->dirty_max << PAGE_SHIFT);

                                start = min(s, start);
                                end   = max(e, end);

                                /* Prevent that this is flushed again */
                                vb->dirty_min = VMAP_BBMAP_BITS;
                                vb->dirty_max = 0;

                                flush = 1;
                        }
                        spin_unlock(&vb->lock);
                }
                rcu_read_unlock();
        }
        free_purged_blocks(&purge_list);

        /* Flush here only if the lazy purge returned zero */
        if (!__purge_vmap_area_lazy(start, end, false) && flush)
                flush_tlb_kernel_range(start, end);
        mutex_unlock(&vmap_purge_lock);
}

/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 * to amortize TLB flushing overheads. What this means is that any page you
 * have now, may, in a former life, have been mapped into kernel virtual
 * address by the vmap layer and so there might be some CPUs with TLB entries
 * still referencing that page (additional to the regular 1:1 kernel mapping).
 *
 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
 * be sure that none of the pages we have control over will have any aliases
 * from the vmap layer.
 */
void vm_unmap_aliases(void)
{
        /*
         * Start from an empty range (start above end) with no forced flush;
         * the helper widens the range from whatever dirty blocks it finds.
         */
        _vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);

/**
 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 * @mem: the pointer returned by vm_map_ram
 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
        unsigned long addr = (unsigned long)kasan_reset_tag(mem);
        unsigned long size = (unsigned long)count << PAGE_SHIFT;
        struct vmap_area *va;

        might_sleep();

        /* The address must be a page-aligned pointer into vmalloc space. */
        BUG_ON(!addr);
        BUG_ON(addr < VMALLOC_START);
        BUG_ON(addr > VMALLOC_END);
        BUG_ON(!PAGE_ALIGNED(addr));

        kasan_poison_vmalloc(mem, size);

        /* Large mappings own a vmap_area of their own. */
        if (unlikely(count > VMAP_MAX_ALLOC)) {
                va = find_unlink_vmap_area(addr);
                if (WARN_ON_ONCE(!va))
                        return;

                debug_check_no_locks_freed((void *)va->va_start,
                                            (va->va_end - va->va_start));
                free_unmap_vmap_area(va);
                return;
        }

        /* Small mappings were carved out of a per-CPU vmap block. */
        debug_check_no_locks_freed(mem, size);
        vb_free(addr, size);
}
EXPORT_SYMBOL(vm_unmap_ram);

/**
 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 * @pages: an array of pointers to the pages to be mapped
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
 *
 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
 * faster than vmap so it's good.  But if you mix long-life and short-life
 * objects with vm_map_ram(), it could consume lots of address space through
 * fragmentation (especially on a 32bit machine).  You could see failures in
 * the end.  Please use this function for short-lived objects.
 *
 * Returns: a pointer to the address that has been mapped, or %NULL on failure
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
        unsigned long size = (unsigned long)count << PAGE_SHIFT;
        unsigned long addr;
        void *mem;

        if (unlikely(count > VMAP_MAX_ALLOC)) {
                /* Too big for a vmap block: reserve a dedicated area. */
                struct vmap_area *va;

                va = alloc_vmap_area(size, PAGE_SIZE,
                                     VMALLOC_START, VMALLOC_END,
                                     node, GFP_KERNEL, VMAP_RAM,
                                     NULL);
                if (IS_ERR(va))
                        return NULL;

                addr = va->va_start;
                mem = (void *)addr;
        } else {
                /* Small request: take the space from a vmap block. */
                mem = vb_alloc(size, GFP_KERNEL);
                if (IS_ERR(mem))
                        return NULL;

                addr = (unsigned long)mem;
        }

        /* Undo the address reservation if the pages cannot be mapped. */
        if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
                             pages, PAGE_SHIFT) < 0) {
                vm_unmap_ram(mem, count);
                return NULL;
        }

        /*
         * The pages are mapped now, so mark them accessible. With hardware
         * tag-based KASAN, marking is skipped for non-VM_ALLOC mappings,
         * see __kasan_unpoison_vmalloc().
         */
        return kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
}
EXPORT_SYMBOL(vm_map_ram);

/*
 * Head of the singly linked list of vm_structs registered through
 * vm_area_add_early() / vm_area_register_early(), i.e. while
 * vmap_initialized is still false. Lives in init data.
 */
static struct vm_struct *vmlist __initdata;

/*
 * Return the page order @vm was populated with. Without
 * CONFIG_HAVE_ARCH_HUGE_VMALLOC the order is always 0.
 */
static inline unsigned int vm_area_page_order(struct vm_struct *vm)
{
        unsigned int order = 0;

#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
        order = vm->page_order;
#endif
        return order;
}

/*
 * Record the page order used to populate @vm. Without
 * CONFIG_HAVE_ARCH_HUGE_VMALLOC there is nowhere to store it, so any
 * non-zero order is a bug.
 */
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifndef CONFIG_HAVE_ARCH_HUGE_VMALLOC
        BUG_ON(order != 0);
#else
        vm->page_order = order;
#endif
}

/**
 * vm_area_add_early - add vmap area early during boot
 * @vm: vm_struct to add
 *
 * This function is used to add fixed kernel vm area to vmlist before
 * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
 * should contain proper values and the other fields should be zero.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_add_early(struct vm_struct *vm)
{
        struct vm_struct **link = &vmlist;
        struct vm_struct *cur;

        BUG_ON(vmap_initialized);

        /*
         * Walk the address-ordered list up to the first entry at or above
         * @vm, checking along the way that @vm overlaps nothing.
         */
        while ((cur = *link) != NULL) {
                if (cur->addr >= vm->addr) {
                        /* The successor must begin at or after our end. */
                        BUG_ON(cur->addr < vm->addr + vm->size);
                        break;
                }

                /* Each predecessor must end at or before our start. */
                BUG_ON(cur->addr + cur->size > vm->addr);
                link = &cur->next;
        }

        /* Splice @vm in front of *link. */
        vm->next = *link;
        *link = vm;
}

/**
 * vm_area_register_early - register vmap area early during boot
 * @vm: vm_struct to register
 * @align: requested alignment
 *
 * This function is used to register kernel vm area before
 * vmalloc_init() is called.  @vm->size and @vm->flags should contain
 * proper values on entry and other fields should be zero.  On return,
 * vm->addr contains the allocated address.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
        unsigned long addr = ALIGN(VMALLOC_START, align);
        struct vm_struct **link = &vmlist;
        struct vm_struct *cur;

        BUG_ON(vmap_initialized);

        /* First fit: take the first gap of at least vm->size bytes. */
        while ((cur = *link) != NULL) {
                unsigned long cur_start = (unsigned long)cur->addr;

                if (cur_start - addr >= vm->size)
                        break;

                /* No room before this entry; retry right behind it. */
                addr = ALIGN(cur_start + cur->size, align);
                link = &cur->next;
        }

        /* The chosen spot must still end inside the vmalloc range. */
        BUG_ON(addr > VMALLOC_END - vm->size);

        vm->addr = (void *)addr;
        vm->next = *link;
        *link = vm;

        kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}

/*
 * Drop VM_UNINITIALIZED from @vm->flags. The write barrier is issued
 * before the flag is cleared, so it orders the caller's earlier stores
 * ahead of the flag update.
 */
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
        /*
         * Before removing VM_UNINITIALIZED,
         * we should make sure that vm has proper values.
         * Pair with smp_rmb() in show_numa_info().
         */
        smp_wmb();
        vm->flags &= ~VM_UNINITIALIZED;
}

/*
 * Allocate a vm_struct and reserve virtual address space for it.
 *
 * @size is rounded up to a multiple of 1 << @shift; a guard page is added
 * on top unless VM_NO_GUARD is set. For VM_IOREMAP the alignment is derived
 * from the size, clamped to [PAGE_SHIFT, IOREMAP_MAX_ORDER]. Must not be
 * called from interrupt context.
 *
 * Returns the new area, or NULL if the rounded size is zero or either
 * allocation fails.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
                unsigned long align, unsigned long shift, unsigned long flags,
                unsigned long start, unsigned long end, int node,
                gfp_t gfp_mask, const void *caller)
{
        const unsigned long requested_size = size;
        struct vm_struct *area;
        struct vmap_area *va;

        BUG_ON(in_interrupt());

        size = ALIGN(size, 1ul << shift);
        if (unlikely(size == 0))
                return NULL;

        if (flags & VM_IOREMAP) {
                int order = clamp_t(int, get_count_order_long(size),
                                    PAGE_SHIFT, IOREMAP_MAX_ORDER);

                align = 1ul << order;
        }

        area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(area == NULL))
                return NULL;

        area->flags = flags;
        area->caller = caller;

        /* Leave room for the trailing guard page unless told otherwise. */
        if (!(flags & VM_NO_GUARD))
                size += PAGE_SIZE;

        va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
        if (IS_ERR(va)) {
                kfree(area);
                return NULL;
        }

        /*
         * VM_ALLOC mappings are marked accessible later, once their pages
         * have been mapped in __vmalloc_node_range().
         */
        if (flags & VM_ALLOC)
                return area;

        /*
         * Everything else may be mapped outside of vmalloc code, so mark it
         * accessible now on a best-effort basis. With hardware tag-based
         * KASAN, marking is skipped for non-VM_ALLOC mappings, see
         * __kasan_unpoison_vmalloc().
         */
        area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
                                            KASAN_VMALLOC_PROT_NORMAL);
        return area;
}

/*
 * Reserve a page-granular area inside [@start, @end) on behalf of @caller,
 * with no NUMA node preference and GFP_KERNEL allocations.
 */
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
                                       unsigned long start, unsigned long end,
                                       const void *caller)
{
        const unsigned long align = 1;
        const unsigned long shift = PAGE_SHIFT;

        return __get_vm_area_node(size, align, shift, flags, start, end,
                                  NUMA_NO_NODE, GFP_KERNEL, caller);
}

/**
 * get_vm_area - reserve a contiguous kernel virtual area
 * @size:         size of the area
 * @flags:         %VM_IOREMAP for I/O mappings or VM_ALLOC
 *
 * Search for an area of @size in the kernel virtual mapping area,
 * and reserve it for our purposes.  Returns the area descriptor
 * on success or %NULL on failure.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
        /* Record our immediate caller as the owner of the area. */
        const void *caller = __builtin_return_address(0);

        return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
                                  VMALLOC_START, VMALLOC_END,
                                  NUMA_NO_NODE, GFP_KERNEL, caller);
}

/*
 * Same as get_vm_area(), except that the owner recorded for the area is the
 * explicit @caller instead of this function's return address.
 */
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
                                const void *caller)
{
        const unsigned long range_start = VMALLOC_START;
        const unsigned long range_end = VMALLOC_END;

        return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
                                  range_start, range_end,
                                  NUMA_NO_NODE, GFP_KERNEL, caller);
}

/**
 * find_vm_area - find a continuous kernel virtual area
 * @addr:          base address
 *
 * Search for the kernel VM area starting at @addr, and return it.
 * It is up to the caller to do all required locking to keep the returned
 * pointer valid.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *find_vm_area(const void *addr)
{
        struct vmap_area *va = find_vmap_area((unsigned long)addr);

        /* No vmap_area at @addr means there is no vm_struct either. */
        return va ? va->vm : NULL;
}

/**
 * remove_vm_area - find and remove a continuous kernel virtual area
 * @addr:            base address
 *
 * Search for the kernel VM area starting at @addr, and remove it.
 * This function returns the found VM area, but using it is NOT safe
 * on SMP machines, except for its size or flags.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *remove_vm_area(const void *addr)
{
        struct vmap_area *va;
        struct vm_struct *vm;

        might_sleep();

        if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
                        addr))
                return NULL;

        va = find_unlink_vmap_area((unsigned long)addr);
        if (!va)
                return NULL;

        /* A vmap_area without a vm_struct is not ours to hand back. */
        vm = va->vm;
        if (!vm)
                return NULL;

        /* Run the debug checks and KASAN teardown before the unmap. */
        debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm));
        debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm));
        kasan_free_module_shadow(vm);
        kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm));

        free_unmap_vmap_area(va);

        return vm;
}

/*
 * Call @set_direct_map on every page of @area for which page_address()
 * returns non-NULL.
 */
static inline void set_area_direct_map(const struct vm_struct *area,
                                       int (*set_direct_map)(struct page *page))
{
        int idx;

        /* HUGE_VMALLOC passes small pages to set_direct_map */
        for (idx = 0; idx < area->nr_pages; idx++) {
                if (!page_address(area->pages[idx]))
                        continue;

                set_direct_map(area->pages[idx]);
        }
}

/*
 * Flush the vm mapping and reset the direct map.
 */
static void vm_reset_perms(struct vm_struct *area)
{
        unsigned int page_order = vm_area_page_order(area);
        unsigned long lo = ULONG_MAX, hi = 0;
        int need_flush = 0;
        int i;

        /*
         * Work out the span of direct-map addresses backing this area so
         * that the vm_unmap_aliases() flush covers the direct map as well.
         */
        for (i = 0; i < area->nr_pages; i += 1U << page_order) {
                unsigned long addr = (unsigned long)page_address(area->pages[i]);
                unsigned long page_size;

                /* Pages for which page_address() is NULL are skipped. */
                if (!addr)
                        continue;

                page_size = PAGE_SIZE << page_order;
                lo = min(addr, lo);
                hi = max(addr + page_size, hi);
                need_flush = 1;
        }

        /*
         * Make the direct map entries invalid first so that nothing can be
         * cached by accesses after the TLB flush, then flush, and finally
         * restore the default direct map permissions.
         */
        set_area_direct_map(area, set_direct_map_invalid_noflush);
        _vm_unmap_aliases(lo, hi, need_flush);
        set_area_direct_map(area, set_direct_map_default_noflush);
}

/*
 * Work handler for deferred frees: detach everything queued on this
 * vfree_deferred's list and vfree() each entry.
 */
static void delayed_vfree_work(struct work_struct *w)
{
        struct vfree_deferred *deferred =
                container_of(w, struct vfree_deferred, wq);
        struct llist_node *batch = llist_del_all(&deferred->list);
        struct llist_node *node, *next;

        /* The _safe iterator reads the next pointer before vfree() runs. */
        llist_for_each_safe(node, next, batch)
                vfree(node);
}

/**
 * vfree_atomic - release memory allocated by vmalloc()
 * @addr:          memory base address
 *
 * This one is just like vfree() but can be called in any atomic context
 * except NMIs.
 */
void vfree_atomic(const void *addr)
{
        /*
         * raw_cpu_ptr() is used because we may be running preemptibly.
         * Being preempted is harmless: llist_add() is lockless, so adding
         * to another CPU's list works, and schedule_work() should cope
         * with that too.
         */
        struct vfree_deferred *deferred = raw_cpu_ptr(&vfree_deferred);

        BUG_ON(in_nmi());
        kmemleak_free(addr);

        if (!addr)
                return;

        /* Schedule the worker only when llist_add() reports an empty list. */
        if (llist_add((struct llist_node *)addr, &deferred->list))
                schedule_work(&deferred->wq);
}

/**
 * vfree - Release memory allocated by vmalloc()
 * @addr:  Memory base address
 *
 * Free the virtually continuous memory area starting at @addr, as obtained
 * from one of the vmalloc() family of APIs.  This will usually also free the
 * physical memory underlying the virtual allocation, but that memory is
 * reference counted, so it will not be freed until the last user goes away.
 *
 * If @addr is NULL, no operation is performed.
 *
 * Context:
 * May sleep if called *not* from interrupt context.
 * Must not be called in NMI context (strictly speaking, it could be
 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
 * conventions for vfree() arch-dependent would be a really bad idea).
 */
void vfree(const void *addr)
{
        struct vm_struct *vm;
        int i;

        if (unlikely(in_interrupt())) {
                vfree_atomic(addr);
                return;
        }

        BUG_ON(in_nmi());
        kmemleak_free(addr);
        might_sleep();

        if (!addr)
                return;

        vm = remove_vm_area(addr);
        if (unlikely(!vm)) {
                WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
                                addr);
                return;
        }

        if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
                vm_reset_perms(vm);
        for (i = 0; i < vm->nr_pages; i++) {
                struct page *page = vm->pages[i];

                BUG_ON(!page);
                mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
                /*
                 * High-order allocs for huge vmallocs are split, so
                 * can be freed as an array of order-0 allocations
                 */
                __free_page(page);
                cond_resched();
        }
        atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
        kvfree(vm->pages);
        kfree(vm);
}
EXPORT_SYMBOL(vfree);

/**
 * vunmap - release virtual mapping obtained by vmap()
 * @addr:   memory base address
 *
 * Free the virtually contiguous memory area starting at @addr,
 * which was created from the page array passed to vmap().
 *
 * Must not be called in interrupt context.
 */
void vunmap(const void *addr)
{
        struct vm_struct *vm;

        BUG_ON(in_interrupt());
        might_sleep();

        /* A NULL address is silently accepted. */
        if (!addr)
                return;

        vm = remove_vm_area(addr);
        if (likely(vm)) {
                /* Only the descriptor is freed here, not the pages. */
                kfree(vm);
                return;
        }

        WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
                        addr);
}
EXPORT_SYMBOL(vunmap);

/**
 * vmap - map an array of pages into virtually contiguous space
 * @pages: array of page pointers
 * @count: number of pages to map
 * @flags: vm_area->flags
 * @prot: page protection for the mapping
 *
 * Maps @count pages from @pages into contiguous kernel virtual space.
 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
 * (which must be kmalloc or vmalloc memory) and one reference per pages in it
 * are transferred from the caller to vmap(), and will be freed / dropped when
 * vfree() is called on the return value.
 *
 * Return: the address of the area or %NULL on failure
 */
void *vmap(struct page **pages, unsigned int count,
           unsigned long flags, pgprot_t prot)
{
        struct vm_struct *area;
        unsigned long base;
        unsigned long bytes;

        might_sleep();

        /* VM_FLUSH_RESET_PERMS is not accepted here. */
        if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
                return NULL;

        /*
         * Your top guard is someone else's bottom guard. Dropping it would
         * compromise the neighbouring mapping too, so warn and keep it.
         */
        if (WARN_ON_ONCE(flags & VM_NO_GUARD))
                flags &= ~VM_NO_GUARD;

        /* Refuse requests for more pages than the system has. */
        if (count > totalram_pages())
                return NULL;

        bytes = (unsigned long)count << PAGE_SHIFT;

        area = get_vm_area_caller(bytes, flags, __builtin_return_address(0));
        if (!area)
                return NULL;

        base = (unsigned long)area->addr;
        if (vmap_pages_range(base, base + bytes, pgprot_nx(prot),
                             pages, PAGE_SHIFT) < 0) {
                vunmap(area->addr);
                return NULL;
        }

        /* With VM_MAP_PUT_PAGES the area takes over the page array. */
        if (flags & VM_MAP_PUT_PAGES) {
                area->pages = pages;
                area->nr_pages = count;
        }

        return area->addr;
}
EXPORT_SYMBOL(vmap);

#ifdef CONFIG_VMAP_PFN
/* Cursor state handed to vmap_pfn_apply() through apply_to_page_range(). */
struct vmap_pfn_data {
        unsigned long        *pfns;        /* PFN array being mapped */
        pgprot_t        prot;        /* protection used for every PTE */
        unsigned int        idx;        /* index of the next entry of @pfns to map */
};

/*
 * apply_to_page_range() callback: install a special PTE for the next PFN of
 * the vmap_pfn_data cursor in @private and advance the cursor. A PFN for
 * which pfn_valid() is true is rejected with -EINVAL.
 */
static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
{
        struct vmap_pfn_data *cursor = private;
        unsigned long pfn = cursor->pfns[cursor->idx];
        pte_t entry;

        if (WARN_ON_ONCE(pfn_valid(pfn)))
                return -EINVAL;

        entry = pte_mkspecial(pfn_pte(pfn, cursor->prot));
        set_pte_at(&init_mm, addr, pte, entry);
        cursor->idx++;

        return 0;
}

/**
 * vmap_pfn - map an array of PFNs into virtually contiguous space
 * @pfns: array of PFNs
 * @count: number of pages to map
 * @prot: page protection for the mapping
 *
 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
 * the start address of the mapping.
 */
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
{
        struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
        struct vm_struct *area;
        unsigned long base;

        area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
                                  __builtin_return_address(0));
        if (!area)
                return NULL;

        base = (unsigned long)area->addr;

        /* Install one PTE per PFN; give the area back on any failure. */
        if (apply_to_page_range(&init_mm, base, count * PAGE_SIZE,
                                vmap_pfn_apply, &data)) {
                free_vm_area(area);
                return NULL;
        }

        flush_cache_vmap(base, base + count * PAGE_SIZE);

        return area->addr;
}
EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */

/*
 * vm_area_alloc_pages - fill a page array for a vmalloc area
 * @gfp:      allocation flags
 * @nid:      node to allocate from, or NUMA_NO_NODE
 * @order:    page order to allocate with
 * @nr_pages: number of PAGE_SIZE entries wanted in @pages
 * @pages:    array that receives the page pointers
 *
 * Order-0 requests first go through the bulk allocator in chunks of at most
 * 100 pages; whatever is still missing (and every higher-order request) is
 * then allocated one page at a time. Higher-order pages are split, so
 * @pages always holds one entry per PAGE_SIZE page.
 *
 * Return: the number of entries of @pages that were filled, which can be
 * smaller than @nr_pages.
 */
static inline unsigned int
vm_area_alloc_pages(gfp_t gfp, int nid,
                unsigned int order, unsigned int nr_pages, struct page **pages)
{
        unsigned int nr_allocated = 0;
        gfp_t alloc_gfp = gfp;
        bool nofail = gfp & __GFP_NOFAIL;
        struct page *page;
        int i;

        /*
         * For order-0 pages we make use of the bulk allocator. If
         * the page array is partly or not at all populated due
         * to failures, fall back to a single page allocator that is
         * more permissive.
         */
        if (!order) {
                /* bulk allocator doesn't support nofail req. officially */
                gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;

                while (nr_allocated < nr_pages) {
                        unsigned int nr, nr_pages_request;

                        /*
                         * A maximum allowed request is hard-coded and is 100
                         * pages per call. That is done in order to prevent a
                         * long preemption off scenario in the bulk-allocator
                         * so the range is [1:100].
                         */
                        nr_pages_request = min(100U, nr_pages - nr_allocated);

                        /* memory allocation should consider mempolicy, we can't
                         * wrongly use nearest node when nid == NUMA_NO_NODE,
                         * otherwise memory may be allocated in only one node,
                         * but mempolicy wants to alloc memory by interleaving.
                         */
                        if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
                                nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp,
                                                        nr_pages_request,
                                                        pages + nr_allocated);

                        else
                                nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid,
                                                        nr_pages_request,
                                                        pages + nr_allocated);

                        nr_allocated += nr;
                        cond_resched();

                        /*
                         * If zero or pages were obtained partly,
                         * fallback to a single page allocator.
                         */
                        if (nr != nr_pages_request)
                                break;
                }
        } else if (gfp & __GFP_NOFAIL) {
                /*
                 * Higher order nofail allocations are really expensive and
                 * potentially dangerous (premature OOM, disruptive reclaim
                 * and compaction etc.), so drop __GFP_NOFAIL for them.
                 */
                alloc_gfp &= ~__GFP_NOFAIL;
        }

        /* High-order pages or fallback path if "bulk" fails. */
        while (nr_allocated < nr_pages) {
                /* A fatally signalled task gives up unless the request is nofail */
                if (!nofail && fatal_signal_pending(current))
                        break;

                if (nid == NUMA_NO_NODE)
                        page = alloc_pages_noprof(alloc_gfp, order);
                else
                        page = alloc_pages_node_noprof(nid, alloc_gfp, order);
                if (unlikely(!page)) {
                        if (!nofail)
                                break;

                        /* fall back to the zero order allocations */
                        alloc_gfp |= __GFP_NOFAIL;
                        order = 0;
                        continue;
                }

                /*
                 * Higher order allocations must be able to be treated as
                 * independent small pages by callers (as they can with
                 * small-page vmallocs). Some drivers do their own refcounting
                 * on vmalloc_to_page() pages, some use page->mapping,
                 * page->lru, etc.
                 */
                if (order)
                        split_page(page, order);

                /*
                 * Careful, we allocate and map page-order pages, but
                 * tracking is done per PAGE_SIZE page so as to keep the
                 * vm_struct APIs independent of the physical/mapped size.
                 */
                for (i = 0; i < (1U << order); i++)
                        pages[nr_allocated + i] = page + i;

                cond_resched();
                nr_allocated += 1U << order;
        }

        return nr_allocated;
}

/*
 * __vmalloc_area_node - allocate the backing pages of @area and map them
 * @area:       area whose address range is to be populated
 * @gfp_mask:   allocation flags for the pages
 * @prot:       protection for the mapping
 * @page_shift: shift of the mapping granularity; the page order recorded
 *              for @area is @page_shift - PAGE_SHIFT
 * @node:       node to allocate on, or NUMA_NO_NODE
 *
 * Allocates the area->pages array (kmalloc when it fits in one page,
 * otherwise a nested vmalloc), fills it through vm_area_alloc_pages(), adds
 * the pages to nr_vmalloc_pages (and MEMCG_VMALLOC for __GFP_ACCOUNT), and
 * maps them with vmap_pages_range().
 *
 * Return: area->addr on success, NULL on failure. On failure @area has
 * already been released: via free_vm_area() if the page array could not be
 * allocated, via vfree() otherwise.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                                 pgprot_t prot, unsigned int page_shift,
                                 int node)
{
        const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
        bool nofail = gfp_mask & __GFP_NOFAIL;
        unsigned long addr = (unsigned long)area->addr;
        unsigned long size = get_vm_area_size(area);
        unsigned long array_size;
        unsigned int nr_small_pages = size >> PAGE_SHIFT;
        unsigned int page_order;
        unsigned int flags;
        int ret;

        array_size = (unsigned long)nr_small_pages * sizeof(struct page *);

        /* Highmem pages are allowed unless a DMA zone was requested */
        if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
                gfp_mask |= __GFP_HIGHMEM;

        /* Please note that the recursion is strictly bounded. */
        if (array_size > PAGE_SIZE) {
                area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node,
                                        area->caller);
        } else {
                area->pages = kmalloc_node_noprof(array_size, nested_gfp, node);
        }

        if (!area->pages) {
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, failed to allocated page array size %lu",
                        nr_small_pages * PAGE_SIZE, array_size);
                free_vm_area(area);
                return NULL;
        }

        set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
        page_order = vm_area_page_order(area);

        area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
                node, page_order, nr_small_pages, area->pages);

        /* Account what was obtained, even if the request was only partly met */
        atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
        if (gfp_mask & __GFP_ACCOUNT) {
                int i;

                for (i = 0; i < area->nr_pages; i++)
                        mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
        }

        /*
         * If not enough pages were obtained to accomplish an
         * allocation request, free them via vfree() if any.
         */
        if (area->nr_pages != nr_small_pages) {
                /*
                 * vm_area_alloc_pages() can fail due to insufficient memory but
                 * also:-
                 *
                 * - a pending fatal signal
                 * - insufficient huge page-order pages
                 *
                 * Since we always retry allocations at order-0 in the huge page
                 * case a warning for either is spurious.
                 */
                if (!fatal_signal_pending(current) && page_order == 0)
                        warn_alloc(gfp_mask, NULL,
                                "vmalloc error: size %lu, failed to allocate pages",
                                area->nr_pages * PAGE_SIZE);
                goto fail;
        }

        /*
         * page tables allocations ignore external gfp mask, enforce it
         * by the scope API
         *
         * 'flags' is only written here and only read by the matching
         * restore below, which tests the same gfp_mask conditions.
         */
        if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
                flags = memalloc_nofs_save();
        else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
                flags = memalloc_noio_save();

        /* For __GFP_NOFAIL, keep retrying the mapping with a short sleep in between */
        do {
                ret = vmap_pages_range(addr, addr + size, prot, area->pages,
                        page_shift);
                if (nofail && (ret < 0))
                        schedule_timeout_uninterruptible(1);
        } while (nofail && (ret < 0));

        if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
                memalloc_nofs_restore(flags);
        else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
                memalloc_noio_restore(flags);

        if (ret < 0) {
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, failed to map pages",
                        area->nr_pages * PAGE_SIZE);
                goto fail;
        }

        return area->addr;

fail:
        /* vfree() releases the pages, the page array and the area itself */
        vfree(area->addr);
        return NULL;
}

/**
 * __vmalloc_node_range - allocate virtually contiguous memory
 * @size:                  allocation size
 * @align:                  desired alignment
 * @start:                  vm area range start
 * @end:                  vm area range end
 * @gfp_mask:                  flags for the page level allocator
 * @prot:                  protection mask for the allocated pages
 * @vm_flags:                  additional vm area flags (e.g. %VM_NO_GUARD)
 * @node:                  node to use for allocation or NUMA_NO_NODE
 * @caller:                  caller's return address
 *
 * Allocate enough pages to cover @size from the page level
 * allocator with @gfp_mask flags. Please note that the full set of gfp
 * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
 * supported.
 * Zone modifiers are not supported. From the reclaim modifiers
 * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
 * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
 * __GFP_RETRY_MAYFAIL are not supported).
 *
 * __GFP_NOWARN can be used to suppress failures messages.
 *
 * Map them into contiguous kernel virtual space, using a pagetable
 * protection of @prot.
 *
 * Return: the address of the area or %NULL on failure
 */
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
                        unsigned long start, unsigned long end, gfp_t gfp_mask,
                        pgprot_t prot, unsigned long vm_flags, int node,
                        const void *caller)
{
        struct vm_struct *area;
        void *ret;
        kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
        /*
         * Keep the caller's original size and alignment: @size and @align may
         * be rounded up for a huge mapping below and are restored from these
         * copies on the fallback path at the fail label.
         */
        unsigned long real_size = size;
        unsigned long real_align = align;
        /* Mapping granule; PAGE_SHIFT unless a larger one is selected below. */
        unsigned int shift = PAGE_SHIFT;

        /* A zero-sized request is warned about once and rejected. */
        if (WARN_ON_ONCE(!size))
                return NULL;

        /* Reject requests that need more pages than totalram_pages() reports. */
        if ((size >> PAGE_SHIFT) > totalram_pages()) {
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, exceeds total pages",
                        real_size);
                return NULL;
        }

        if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
                unsigned long size_per_node;

                /*
                 * Try huge pages. Only try for PAGE_KERNEL allocations,
                 * others like modules don't yet expect huge pages in
                 * their allocations due to apply_to_page_range not
                 * supporting them.
                 */

                /*
                 * With no explicit node, the size is divided by the number of
                 * online nodes before deciding which mapping size to use.
                 */
                size_per_node = size;
                if (node == NUMA_NO_NODE)
                        size_per_node /= num_online_nodes();
                if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
                        shift = PMD_SHIFT;
                else
                        shift = arch_vmap_pte_supported_shift(size_per_node);

                /* Round alignment and size up to the chosen mapping granule. */
                align = max(real_align, 1UL << shift);
                size = ALIGN(real_size, 1UL << shift);
        }

        /*
         * Retry point: re-entered for __GFP_NOFAIL retries of the vm_struct
         * allocation and for the base-page fallback from the fail label.
         */
again:
        area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
                                  VM_UNINITIALIZED | vm_flags, start, end, node,
                                  gfp_mask, caller);
        if (!area) {
                bool nofail = gfp_mask & __GFP_NOFAIL;
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, vm_struct allocation failed%s",
                        real_size, (nofail) ? ". Retrying." : "");
                /* __GFP_NOFAIL callers sleep briefly and try again. */
                if (nofail) {
                        schedule_timeout_uninterruptible(1);
                        goto again;
                }
                goto fail;
        }

        /*
         * Prepare arguments for __vmalloc_area_node() and
         * kasan_unpoison_vmalloc().
         */
        if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
                if (kasan_hw_tags_enabled()) {
                        /*
                         * Modify protection bits to allow tagging.
                         * This must be done before mapping.
                         */
                        prot = arch_vmap_pgprot_tagged(prot);

                        /*
                         * Skip page_alloc poisoning and zeroing for physical
                         * pages backing VM_ALLOC mapping. Memory is instead
                         * poisoned and zeroed by kasan_unpoison_vmalloc().
                         */
                        gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
                }

                /* Take note that the mapping is PAGE_KERNEL. */
                kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
        }

        /* Allocate physical pages and map them into vmalloc space. */
        ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
        if (!ret)
                goto fail;

        /*
         * Mark the pages as accessible, now that they are mapped.
         * The condition for setting KASAN_VMALLOC_INIT should complement the
         * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
         * to make sure that memory is initialized under the same conditions.
         * Tag-based KASAN modes only assign tags to normal non-executable
         * allocations, see __kasan_unpoison_vmalloc().
         */
        kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
        if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
            (gfp_mask & __GFP_SKIP_ZERO))
                kasan_flags |= KASAN_VMALLOC_INIT;
        /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
        area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);

        /*
         * In this function, newly allocated vm_struct has VM_UNINITIALIZED
         * flag. It means that vm_struct is not fully initialized.
         * Now, it is fully initialized, so remove this flag here.
         */
        clear_vm_uninitialized_flag(area);

        /* Report the page-aligned size to kmemleak unless the caller defers it. */
        size = PAGE_ALIGN(size);
        if (!(vm_flags & VM_DEFER_KMEMLEAK))
                kmemleak_vmalloc(area, size, gfp_mask);

        return area->addr;

fail:
        /*
         * If a mapping granule larger than PAGE_SHIFT was attempted, fall back
         * to base pages with the caller's original alignment and size. Since
         * shift is reset here, this fallback happens at most once.
         */
        if (shift > PAGE_SHIFT) {
                shift = PAGE_SHIFT;
                align = real_align;
                size = real_size;
                goto again;
        }

        return NULL;
}

/**
 * __vmalloc_node - allocate virtually contiguous memory
 * @size:            allocation size
 * @align:            desired alignment
 * @gfp_mask:            flags for the page level allocator
 * @node:            node to use for allocation or NUMA_NO_NODE
 * @caller:            caller's return address
 *
 * Allocate enough pages to cover @size from the page level allocator with
 * @gfp_mask flags.  Map them into contiguous kernel virtual space.
 *
 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
 * and __GFP_NOFAIL are not supported
 *
 * Any use of gfp flags outside of GFP_KERNEL should be consulted
 * with mm people.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *__vmalloc_node_noprof(unsigned long size, unsigned long align,
                            gfp_t gfp_mask, int node, const void *caller)
{
        /* No additional vm area flags are requested by this entry point. */
        unsigned long vm_flags = 0;

        /* Use the whole vmalloc range with the PAGE_KERNEL protection. */
        return __vmalloc_node_range_noprof(size, align,
                                           VMALLOC_START, VMALLOC_END,
                                           gfp_mask, PAGE_KERNEL, vm_flags,
                                           node, caller);
}
/*
 * The export below exists only for performance analysis and stress
 * testing of vmalloc. The vmalloc test module requires it; do not use
 * it for any other purpose.
 */
#ifdef CONFIG_TEST_VMALLOC_MODULE
EXPORT_SYMBOL_GPL(__vmalloc_node_noprof);
#endif

void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
{
        /* Pass this function's return address down as the caller. */
        const void *caller = __builtin_return_address(0);

        /* Alignment of 1 and no node preference. */
        return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(__vmalloc_noprof);

/**
 * vmalloc - allocate virtually contiguous memory
 * @size:    allocation size
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_noprof(unsigned long size)
{
        /* Pass this function's return address down as the caller. */
        const void *caller = __builtin_return_address(0);

        /* GFP_KERNEL allocation, alignment of 1, no node preference. */
        return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(vmalloc_noprof);

/**
 * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
 * @size:      allocation size
 * @gfp_mask:  flags for the page level allocator
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * If @size is greater than or equal to PMD_SIZE, allow using
 * huge pages for the memory
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
{
        /* Pass this function's return address down as the caller. */
        const void *caller = __builtin_return_address(0);

        /*
         * Same as a plain PAGE_KERNEL vmalloc over the whole vmalloc range,
         * except that VM_ALLOW_HUGE_VMAP is set in the vm flags.
         */
        return __vmalloc_node_range_noprof(size, 1,
                                           VMALLOC_START, VMALLOC_END,
                                           gfp_mask, PAGE_KERNEL,
                                           VM_ALLOW_HUGE_VMAP, NUMA_NO_NODE,
                                           caller);
}
EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);

/**
 * vzalloc - allocate virtually contiguous memory with zero fill
 * @size:    allocation size
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The memory allocated is set to zero.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vzalloc_noprof(unsigned long size)
{
        /* Pass this function's return address down as the caller. */
        const void *caller = __builtin_return_address(0);

        /* __GFP_ZERO requests zero-filled pages. */
        return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO,
                                     NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(vzalloc_noprof);

/**
 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
 * @size: allocation size
 *
 * The resulting memory area is zeroed so it can be mapped to userspace
 * without leaking data.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_user_noprof(unsigned long size)
{
        /* Pass this function's return address down as the caller. */
        const void *caller = __builtin_return_address(0);

        /*
         * Zeroed pages, SHMLBA alignment, and VM_USERMAP set in the vm
         * flags (remap_vmalloc_range_partial() checks for that flag).
         */
        return __vmalloc_node_range_noprof(size, SHMLBA,
                                           VMALLOC_START, VMALLOC_END,
                                           GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
                                           VM_USERMAP, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(vmalloc_user_noprof);

/**
 * vmalloc_node - allocate memory on a specific node
 * @size:          allocation size
 * @node:          numa node
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_node_noprof(unsigned long size, int node)
{
        /* Pass this function's return address down as the caller. */
        const void *caller = __builtin_return_address(0);

        /* GFP_KERNEL allocation with alignment of 1 on the requested node. */
        return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, caller);
}
EXPORT_SYMBOL(vmalloc_node_noprof);

/**
 * vzalloc_node - allocate memory on a specific node with zero fill
 * @size:        allocation size
 * @node:        numa node
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The memory allocated is set to zero.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vzalloc_node_noprof(unsigned long size, int node)
{
        /* Pass this function's return address down as the caller. */
        const void *caller = __builtin_return_address(0);

        /* __GFP_ZERO requests zero-filled pages on the requested node. */
        return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node,
                                     caller);
}
EXPORT_SYMBOL(vzalloc_node_noprof);

/*
 * GFP_VMALLOC32: page allocator flags used by the vmalloc_32*() helpers
 * below. GFP_KERNEL is combined with GFP_DMA32 or GFP_DMA depending on
 * which of CONFIG_ZONE_DMA32 / CONFIG_ZONE_DMA is available on 64-bit
 * configurations; every other configuration uses GFP_DMA32.
 */
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
/*
 * 64b systems should always have either DMA or DMA32 zones. For others
 * GFP_DMA32 should do the right thing and use the normal zone.
 */
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif

/**
 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
 * @size:        allocation size
 *
 * Allocate enough 32bit PA addressable pages to cover @size from the
 * page level allocator and map them into contiguous kernel virtual space.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_32_noprof(unsigned long size)
{
        /* Pass this function's return address down as the caller. */
        const void *caller = __builtin_return_address(0);

        /* GFP_VMALLOC32 carries the zone selection for this allocation. */
        return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
                                     caller);
}
EXPORT_SYMBOL(vmalloc_32_noprof);

/**
 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 * @size:             allocation size
 *
 * The resulting memory area is 32bit addressable and zeroed so it can be
 * mapped to userspace without leaking data.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_32_user_noprof(unsigned long size)
{
        /* Pass this function's return address down as the caller. */
        const void *caller = __builtin_return_address(0);

        /*
         * Zeroed GFP_VMALLOC32 pages, SHMLBA alignment, and VM_USERMAP set
         * in the vm flags.
         */
        return __vmalloc_node_range_noprof(size, SHMLBA,
                                           VMALLOC_START, VMALLOC_END,
                                           GFP_VMALLOC32 | __GFP_ZERO,
                                           PAGE_KERNEL, VM_USERMAP,
                                           NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(vmalloc_32_user_noprof);

/*
 * Zero-fill up to @count bytes of @iter by copying from the zero page with
 * copy_page_to_iter_nofault(), at most PAGE_SIZE bytes per step.
 *
 * Stops early when a step copies fewer bytes than requested.
 *
 * Returns the number of zeroed bytes.
 */
static size_t zero_iter(struct iov_iter *iter, size_t count)
{
        size_t zeroed = 0;

        while (zeroed != count) {
                size_t chunk = min_t(size_t, count - zeroed, PAGE_SIZE);
                size_t done;

                done = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, chunk, iter);
                zeroed += done;

                /* Short copy: give up and report what was written so far. */
                if (done < chunk)
                        break;
        }

        return zeroed;
}

/*
 * Small helper: copy @count bytes starting at @addr into @iter, one page
 * (or page remainder) at a time. A page that vmalloc_to_page() cannot
 * resolve is replaced by zeroes.
 *
 * Returns the number of copied bytes.
 */
static size_t aligned_vread_iter(struct iov_iter *iter,
                                 const char *addr, size_t count)
{
        size_t left = count;

        while (left > 0) {
                unsigned long in_page = offset_in_page(addr);
                unsigned long chunk = PAGE_SIZE - in_page;
                struct page *page;
                size_t done;

                /* Never go past the end of the request. */
                if (chunk > left)
                        chunk = left;

                page = vmalloc_to_page(addr);
                /*
                 * Accessing this _mapped_ area safely would need a lock, but
                 * taking one here would add overhead to vmalloc()/vfree()
                 * for the sake of a rarely used _debug_ interface. Instead,
                 * go through copy_page_to_iter_nofault() and accept a small
                 * overhead in this access function.
                 */
                done = page ? copy_page_to_iter_nofault(page, in_page,
                                                        chunk, iter)
                            : zero_iter(iter, chunk);

                addr += done;
                left -= done;

                /* Short copy: stop and report what was transferred. */
                if (done != chunk)
                        break;
        }

        return count - left;
}

/*
 * Read from a vm_map_ram region of memory.
 *
 * @iter:  iterator the data is written to
 * @addr:  address to start reading from
 * @count: number of bytes to read
 * @flags: VMAP_* flags of the area as passed by the caller; VMAP_BLOCK
 *         selects the vmap_block based path below
 *
 * Returns the number of copied bytes.
 */
static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
                                  size_t count, unsigned long flags)
{
        char *start;
        struct vmap_block *vb;
        struct xarray *xa;
        unsigned long offset;
        unsigned int rs, re;
        size_t remains, n;

        /*
         * If it's area created by vm_map_ram() interface directly, but
         * not further subdividing and delegating management to vmap_block,
         * handle it here.
         */
        if (!(flags & VMAP_BLOCK))
                return aligned_vread_iter(iter, addr, count);

        remains = count;

        /*
         * Area is split into regions and tracked with vmap_block, read out
         * each region and zero fill the hole between regions.
         */
        xa = addr_to_vb_xa((unsigned long) addr);
        vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
        /* No vmap_block found for this address: zero-fill the whole request. */
        if (!vb)
                goto finished_zero;

        spin_lock(&vb->lock);
        /* Nothing in use in this block: drop the lock and zero-fill. */
        if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
                spin_unlock(&vb->lock);
                goto finished_zero;
        }

        /* Walk each run of set bits in used_map; vb->lock is held throughout. */
        for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
                size_t copied;

                if (remains == 0)
                        goto finished;

                start = vmap_block_vaddr(vb->va->va_start, rs);

                /* Zero-fill the gap between the current position and this run. */
                if (addr < start) {
                        size_t to_zero = min_t(size_t, start - addr, remains);
                        size_t zeroed = zero_iter(iter, to_zero);

                        addr += zeroed;
                        remains -= zeroed;

                        if (remains == 0 || zeroed != to_zero)
                                goto finished;
                }

                /* It could start reading from the middle of a used region. */
                offset = offset_in_page(addr);
                /*
                 * NOTE(review): the "+ 1" assumes @re is the last set bit of
                 * the run (inclusive) - confirm against the definition of
                 * for_each_set_bitrange().
                 */
                n = ((re - rs + 1) << PAGE_SHIFT) - offset;
                if (n > remains)
                        n = remains;

                copied = aligned_vread_iter(iter, start + offset, n);

                addr += copied;
                remains -= copied;

                /* Short copy: stop and report what was transferred. */
                if (copied != n)
                        goto finished;
        }

        spin_unlock(&vb->lock);

        /* Reached with vb->lock not held. */
finished_zero:
        /* zero-fill the left dirty or free regions */
        return count - remains + zero_iter(iter, remains);
        /* Reached with vb->lock held. */
finished:
        /* We couldn't copy/zero everything */
        spin_unlock(&vb->lock);
        return count - remains;
}

/**
 * vread_iter() - read vmalloc area in a safe way to an iterator.
 * @iter:         the iterator to which data should be written.
 * @addr:         vm address.
 * @count:        number of bytes to be read.
 *
 * This function looks up the vmap areas that intersect
 * [addr...addr+count) and copies their contents to @iter. Memory holes
 * between or around those areas are zero-filled in @iter. IOREMAP and
 * SPARSE areas are treated as memory holes: they are zero-filled and no
 * copy is done.
 *
 * If [addr...addr+count) does not intersect any live vmap area, the
 * whole range is handled as a hole, i.e. @iter is zero-filled.
 *
 * @count is clamped so that addr + count does not wrap around.
 *
 * Note: In usual ops, vread_iter() is never necessary because the caller
 * should know vmalloc() area is valid and can use memcpy().
 * This is for routines which have to access vmalloc area without
 * any information, as /proc/kcore.
 *
 * Return: number of bytes copied or zero-filled into @iter. This is
 * the (clamped) @count unless a copy into @iter came up short.
 */
long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        struct vm_struct *vm;
        char *vaddr;
        size_t n, size, flags, remains;
        unsigned long next;

        addr = kasan_reset_tag(addr);

        /* Don't allow overflow */
        if ((unsigned long) addr + count < count)
                count = -(unsigned long) addr;

        remains = count;

        /*
         * A non-NULL @vn is unlocked via vn->busy.lock on every path below,
         * so the lookup presumably returns with that lock held.
         */
        vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va);
        if (!vn)
                goto finished_zero;

        /* no intersects with alive vmap_area */
        if ((unsigned long)addr + remains <= va->va_start)
                goto finished_zero;

        do {
                size_t copied;

                if (remains == 0)
                        goto finished;

                vm = va->vm;
                flags = va->flags & VMAP_FLAGS_MASK;
                /*
                 * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need
                 * be set together with VMAP_RAM.
                 */
                WARN_ON(flags == VMAP_BLOCK);

                /* Neither a vm_struct nor any VMAP_* flag: skip this area. */
                if (!vm && !flags)
                        goto next_va;

                /* Skip areas whose vm_struct is still being set up. */
                if (vm && (vm->flags & VM_UNINITIALIZED))
                        goto next_va;

                /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
                smp_rmb();

                vaddr = (char *) va->va_start;
                size = vm ? get_vm_area_size(vm) : va_size(va);

                /* The read position is already past this area. */
                if (addr >= vaddr + size)
                        goto next_va;

                /* Zero-fill the hole in front of this area. */
                if (addr < vaddr) {
                        size_t to_zero = min_t(size_t, vaddr - addr, remains);
                        size_t zeroed = zero_iter(iter, to_zero);

                        addr += zeroed;
                        remains -= zeroed;

                        if (remains == 0 || zeroed != to_zero)
                                goto finished;
                }

                /* Bytes of this area still to read, capped by the request. */
                n = vaddr + size - addr;
                if (n > remains)
                        n = remains;

                if (flags & VMAP_RAM)
                        copied = vmap_ram_vread_iter(iter, addr, n, flags);
                else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE))))
                        copied = aligned_vread_iter(iter, addr, n);
                else /* IOREMAP | SPARSE area is treated as memory hole */
                        copied = zero_iter(iter, n);

                addr += copied;
                remains -= copied;

                /* Short copy: stop and report what was transferred. */
                if (copied != n)
                        goto finished;

        next_va:
                /* Drop this node's lock, then look up the next area from va_end. */
                next = va->va_end;
                spin_unlock(&vn->busy.lock);
        } while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));

finished_zero:
        /*
         * @vn is NULL here when a lookup found no further area, and non-NULL
         * when the range did not intersect the first area found.
         */
        if (vn)
                spin_unlock(&vn->busy.lock);

        /* zero-fill memory holes */
        return count - remains + zero_iter(iter, remains);
finished:
        /* Nothing remains, or We couldn't copy/zero everything. */
        if (vn)
                spin_unlock(&vn->busy.lock);

        return count - remains;
}

/**
 * remap_vmalloc_range_partial - map vmalloc pages to userspace
 * @vma:                vma to cover
 * @uaddr:                target user address to start at
 * @kaddr:                virtual address of vmalloc kernel memory
 * @pgoff:                offset from @kaddr to start at
 * @size:                size of map area
 *
 * Returns:        0 for success, -Exxx on failure
 *
 * This function checks that @kaddr is a valid vmalloc'ed area,
 * and that it is big enough to cover the range starting at
 * @uaddr in @vma. It returns failure if these criteria aren't
 * met.
 *
 * Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
                                void *kaddr, unsigned long pgoff,
                                unsigned long size)
{
        struct vm_struct *vm;
        unsigned long byte_off;
        unsigned long map_end;

        /* Turn the page offset into bytes, rejecting values that overflow. */
        if (check_shl_overflow(pgoff, PAGE_SHIFT, &byte_off))
                return -EINVAL;

        size = PAGE_ALIGN(size);

        /* Both the user and the kernel address must be page aligned. */
        if (!(PAGE_ALIGNED(uaddr) && PAGE_ALIGNED(kaddr)))
                return -EINVAL;

        vm = find_vm_area(kaddr);
        if (!vm)
                return -EINVAL;

        /* Only areas flagged VM_USERMAP or VM_DMA_COHERENT may be mapped. */
        if ((vm->flags & (VM_USERMAP | VM_DMA_COHERENT)) == 0)
                return -EINVAL;

        /* The requested window must lie entirely inside the area. */
        if (check_add_overflow(size, byte_off, &map_end))
                return -EINVAL;
        if (map_end > get_vm_area_size(vm))
                return -EINVAL;

        kaddr += byte_off;

        /* Insert one page per iteration; at least one page is always tried. */
        for (;;) {
                struct page *page = vmalloc_to_page(kaddr);
                int err = vm_insert_page(vma, uaddr, page);

                if (err)
                        return err;

                uaddr += PAGE_SIZE;
                kaddr += PAGE_SIZE;
                size -= PAGE_SIZE;
                if (size == 0)
                        break;
        }

        vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);

        return 0;
}

/**
 * remap_vmalloc_range - map vmalloc pages to userspace
 * @vma:                vma to cover (map full range of vma)
 * @addr:                vmalloc memory
 * @pgoff:                number of pages into addr before first page to map
 *
 * Returns:        0 for success, -Exxx on failure
 *
 * This function checks that addr is a valid vmalloc'ed area, and
 * that it is big enough to cover the vma. It returns failure if
 * these criteria aren't met.
 *
 * Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
                                                unsigned long pgoff)
{
        /* Map the full extent of the vma, starting at its first address. */
        unsigned long uaddr = vma->vm_start;
        unsigned long len = vma->vm_end - vma->vm_start;

        return remap_vmalloc_range_partial(vma, uaddr, addr, pgoff, len);
}
EXPORT_SYMBOL(remap_vmalloc_range);

/*
 * Remove the vm area at @area->addr and free @area itself.
 * It is a BUG if the removed vm_struct is not @area.
 */
void free_vm_area(struct vm_struct *area)
{
        struct vm_struct *removed = remove_vm_area(area->addr);

        BUG_ON(removed != area);
        kfree(area);
}
EXPORT_SYMBOL_GPL(free_vm_area);

#ifdef CONFIG_SMP
/* Convert an rb_node to its containing vmap_area via rb_entry_safe(). */
static struct vmap_area *node_to_va(struct rb_node *n)
{
        struct vmap_area *va = rb_entry_safe(n, struct vmap_area, rb_node);

        return va;
}

/**
 * pvm_find_va_enclose_addr - find the free vmap_area @addr belongs to
 * @addr: target address
 *
 * Walks the free_vmap_area_root tree.
 *
 * Returns: the vmap_area with va_start <= @addr <= va_end if the walk
 *   finds one. Otherwise the last area visited whose va_start is not
 *   above @addr, i.e. an area lying entirely below @addr, or NULL if
 *   no visited area starts at or below @addr.
 */
static struct vmap_area *
pvm_find_va_enclose_addr(unsigned long addr)
{
        struct rb_node *node = free_vmap_area_root.rb_node;
        struct vmap_area *best = NULL;

        while (node) {
                struct vmap_area *cur;

                cur = rb_entry(node, struct vmap_area, rb_node);

                /* Area starts above @addr: only the left subtree can match. */
                if (cur->va_start > addr) {
                        node = node->rb_left;
                        continue;
                }

                /* Candidate; keep it in case nothing encloses @addr. */
                best = cur;
                if (cur->va_end >= addr)
                        break;

                node = node->rb_right;
        }

        return best;
}

/**
 * pvm_determine_end_from_reverse - find the highest aligned address
 * of free block below VMALLOC_END
 * @va:
 *   in - the VA we start the search(reverse order);
 *   out - the VA with the highest aligned end address.
 * @align: alignment for required highest address
 *
 * Returns: determined end address within vmap_area
 */
static unsigned long
pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
{
        unsigned long align_mask = ~(align - 1);
        unsigned long top = VMALLOC_END & align_mask;

        if (likely(*va)) {
                /* Walk the free list backwards, starting from *va itself. */
                list_for_each_entry_from_reverse((*va),
                                &free_vmap_area_list, list) {
                        unsigned long aligned_end;

                        /* Aligned-down end of this block, capped at the top. */
                        aligned_end = min((*va)->va_end & align_mask, top);

                        /* Usable only if some room remains below that end. */
                        if ((*va)->va_start < aligned_end)
                                return aligned_end;
                }
        }

        /* No starting area, or no block with room was found. */
        return 0;
}

/**
 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
 * @offsets: array containing offset of each area
 * @sizes: array containing size of each area
 * @nr_vms: the number of areas to allocate
 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
 *
 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
 *            vm_structs on success, %NULL on failure
 *
 * Percpu allocator wants to use congruent vm areas so that it can
 * maintain the offsets among percpu areas.  This function allocates
 * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
 * be scattered pretty far, distance between two areas easily going up
 * to gigabytes.  To avoid interacting with regular vmallocs, these
 * areas are allocated from top.
 *
 * Despite its complicated look, this allocator is rather simple. It
 * does everything top-down and scans free blocks from the end looking
 * for matching base. While scanning, if any of the areas do not fit the
 * base address is pulled down to fit the area. Scanning is repeated till
 * all the areas fit and then all necessary data structures are inserted
 * and the result is returned.
 */
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                                     const size_t *sizes, int nr_vms,
                                     size_t align)
{
        const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
        const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
        struct vmap_area **vas, *va;
        struct vm_struct **vms;
        int area, area2, last_area, term_area;
        unsigned long base, start, size, end, last_end, orig_start, orig_end;
        bool purged = false;

        /* verify parameters and allocate data structures */
        BUG_ON(offset_in_page(align) || !is_power_of_2(align));
        for (last_area = 0, area = 0; area < nr_vms; area++) {
                start = offsets[area];
                end = start + sizes[area];

                /* is everything aligned properly? */
                BUG_ON(!IS_ALIGNED(offsets[area], align));
                BUG_ON(!IS_ALIGNED(sizes[area], align));

                /* detect the area with the highest address */
                if (start > offsets[last_area])
                        last_area = area;

                for (area2 = area + 1; area2 < nr_vms; area2++) {
                        unsigned long start2 = offsets[area2];
                        unsigned long end2 = start2 + sizes[area2];

                        BUG_ON(start2 < end && start < end2);
                }
        }
        last_end = offsets[last_area] + sizes[last_area];

        if (vmalloc_end - vmalloc_start < last_end) {
                WARN_ON(true);
                return NULL;
        }

        vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
        vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
        if (!vas || !vms)
                goto err_free2;

        for (area = 0; area < nr_vms; area++) {
                vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
                vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
                if (!vas[area] || !vms[area])
                        goto err_free;
        }
retry:
        spin_lock(&free_vmap_area_lock);

        /* start scanning - we scan from the top, begin with the last area */
        area = term_area = last_area;
        start = offsets[area];
        end = start + sizes[area];

        va = pvm_find_va_enclose_addr(vmalloc_end);
        base = pvm_determine_end_from_reverse(&va, align) - end;

        while (true) {
                /*
                 * base might have underflowed, add last_end before
                 * comparing.
                 */
                if (base + last_end < vmalloc_start + last_end)
                        goto overflow;

                /*
                 * Fitting base has not been found.
                 */
                if (va == NULL)
                        goto overflow;

                /*
                 * If required width exceeds current VA block, move
                 * base downwards and then recheck.
                 */
                if (base + end > va->va_end) {
                        base = pvm_determine_end_from_reverse(&va, align) - end;
                        term_area = area;
                        continue;
                }

                /*
                 * If this VA does not fit, move base downwards and recheck.
                 */
                if (base + start < va->va_start) {
                        va = node_to_va(rb_prev(&va->rb_node));
                        base = pvm_determine_end_from_reverse(&va, align) - end;
                        term_area = area;
                        continue;
                }

                /*
                 * This area fits, move on to the previous one.  If
                 * the previous one is the terminal one, we're done.
                 */
                area = (area + nr_vms - 1) % nr_vms;
                if (area == term_area)
                        break;

                start = offsets[area];
                end = start + sizes[area];
                va = pvm_find_va_enclose_addr(base + end);
        }

        /* we've found a fitting base, insert all va's */
        for (area = 0; area < nr_vms; area++) {
                int ret;

                start = base + offsets[area];
                size = sizes[area];

                va = pvm_find_va_enclose_addr(start);
                if (WARN_ON_ONCE(va == NULL))
                        /* It is a BUG(), but trigger recovery instead. */
                        goto recovery;

                ret = va_clip(&free_vmap_area_root,
                        &free_vmap_area_list, va, start, size);
                if (WARN_ON_ONCE(unlikely(ret)))
                        /* It is a BUG(), but trigger recovery instead. */
                        goto recovery;

                /* Allocated area. */
                va = vas[area];
                va->va_start = start;
                va->va_end = start + size;
        }

        spin_unlock(&free_vmap_area_lock);

        /* populate the kasan shadow space */
        for (area = 0; area < nr_vms; area++) {
                if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
                        goto err_free_shadow;
        }

        /* insert all vm's */
        for (area = 0; area < nr_vms; area++) {
                struct vmap_node *vn = addr_to_node(vas[area]->va_start);

                spin_lock(&vn->busy.lock);
                insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
                setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
                                 pcpu_get_vm_areas);
                spin_unlock(&vn->busy.lock);
        }

        /*
         * Mark allocated areas as accessible. Do it now as a best-effort
         * approach, as they can be mapped outside of vmalloc code.
         * With hardware tag-based KASAN, marking is skipped for
         * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
         */
        for (area = 0; area < nr_vms; area++)
                vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
                                vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);

        kfree(vas);
        return vms;

recovery:
        /*
         * Remove previously allocated areas. There is no
         * need in removing these areas from the busy tree,
         * because they are inserted only on the final step
         * and when pcpu_get_vm_areas() is success.
         */
        while (area--) {
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
                                &free_vmap_area_list);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end);
                vas[area] = NULL;
        }

overflow:
        spin_unlock(&free_vmap_area_lock);
        if (!purged) {
                reclaim_and_purge_vmap_areas();
                purged = true;

                /* Before "retry", check if we recover. */
                for (area = 0; area < nr_vms; area++) {
                        if (vas[area])
                                continue;

                        vas[area] = kmem_cache_zalloc(
                                vmap_area_cachep, GFP_KERNEL);
                        if (!vas[area])
                                goto err_free;
                }

                goto retry;
        }

err_free:
        for (area = 0; area < nr_vms; area++) {
                if (vas[area])
                        kmem_cache_free(vmap_area_cachep, vas[area]);

                kfree(vms[area]);
        }
err_free2:
        kfree(vas);
        kfree(vms);
        return NULL;

err_free_shadow:
        spin_lock(&free_vmap_area_lock);
        /*
         * We release all the vmalloc shadows, even the ones for regions that
         * hadn't been successfully added. This relies on kasan_release_vmalloc
         * being able to tolerate this case.
         */
        for (area = 0; area < nr_vms; area++) {
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
                                &free_vmap_area_list);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end);
                vas[area] = NULL;
                kfree(vms[area]);
        }
        spin_unlock(&free_vmap_area_lock);
        kfree(vas);
        kfree(vms);
        return NULL;
}

/**
 * pcpu_free_vm_areas - release vmalloc areas obtained for the percpu allocator
 * @vms: array of vm_struct pointers, as returned by pcpu_get_vm_areas()
 * @nr_vms: number of entries in @vms
 *
 * Every vm_struct in @vms is handed to free_vm_area(); afterwards the
 * array itself is released with kfree().
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
        int idx = 0;

        while (idx < nr_vms) {
                free_vm_area(vms[idx]);
                idx++;
        }

        kfree(vms);
}
#endif        /* CONFIG_SMP */

#ifdef CONFIG_PRINTK
bool vmalloc_dump_obj(void *object)
{
        const void *caller;
        struct vm_struct *vm;
        struct vmap_area *va;
        struct vmap_node *vn;
        unsigned long addr;
        unsigned int nr_pages;

        addr = PAGE_ALIGN((unsigned long) object);
        vn = addr_to_node(addr);

        if (!spin_trylock(&vn->busy.lock))
                return false;

        va = __find_vmap_area(addr, &vn->busy.root);
        if (!va || !va->vm) {
                spin_unlock(&vn->busy.lock);
                return false;
        }

        vm = va->vm;
        addr = (unsigned long) vm->addr;
        caller = vm->caller;
        nr_pages = vm->nr_pages;
        spin_unlock(&vn->busy.lock);

        pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
                nr_pages, addr, caller);

        return true;
}
#endif

#ifdef CONFIG_PROC_FS
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
        if (IS_ENABLED(CONFIG_NUMA)) {
                unsigned int nr, *counters = m->private;
                unsigned int step = 1U << vm_area_page_order(v);

                if (!counters)
                        return;

                if (v->flags & VM_UNINITIALIZED)
                        return;
                /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
                smp_rmb();

                memset(counters, 0, nr_node_ids * sizeof(unsigned int));

                for (nr = 0; nr < v->nr_pages; nr += step)
                        counters[page_to_nid(v->pages[nr])] += step;
                for_each_node_state(nr, N_HIGH_MEMORY)
                        if (counters[nr])
                                seq_printf(m, " N%u=%u", nr, counters[nr]);
        }
}

static void show_purge_info(struct seq_file *m)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        int i;

        for (i = 0; i < nr_vmap_nodes; i++) {
                vn = &vmap_nodes[i];

                spin_lock(&vn->lazy.lock);
                list_for_each_entry(va, &vn->lazy.head, list) {
                        seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
                                (void *)va->va_start, (void *)va->va_end,
                                va->va_end - va->va_start);
                }
                spin_unlock(&vn->lazy.lock);
        }
}

static int vmalloc_info_show(struct seq_file *m, void *p)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        struct vm_struct *v;
        int i;

        for (i = 0; i < nr_vmap_nodes; i++) {
                vn = &vmap_nodes[i];

                spin_lock(&vn->busy.lock);
                list_for_each_entry(va, &vn->busy.head, list) {
                        if (!va->vm) {
                                if (va->flags & VMAP_RAM)
                                        seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
                                                (void *)va->va_start, (void *)va->va_end,
                                                va->va_end - va->va_start);

                                continue;
                        }

                        v = va->vm;

                        seq_printf(m, "0x%pK-0x%pK %7ld",
                                v->addr, v->addr + v->size, v->size);

                        if (v->caller)
                                seq_printf(m, " %pS", v->caller);

                        if (v->nr_pages)
                                seq_printf(m, " pages=%d", v->nr_pages);

                        if (v->phys_addr)
                                seq_printf(m, " phys=%pa", &v->phys_addr);

                        if (v->flags & VM_IOREMAP)
                                seq_puts(m, " ioremap");

                        if (v->flags & VM_SPARSE)
                                seq_puts(m, " sparse");

                        if (v->flags & VM_ALLOC)
                                seq_puts(m, " vmalloc");

                        if (v->flags & VM_MAP)
                                seq_puts(m, " vmap");

                        if (v->flags & VM_USERMAP)
                                seq_puts(m, " user");

                        if (v->flags & VM_DMA_COHERENT)
                                seq_puts(m, " dma-coherent");

                        if (is_vmalloc_addr(v->pages))
                                seq_puts(m, " vpages");

                        show_numa_info(m, v);
                        seq_putc(m, '\n');
                }
                spin_unlock(&vn->busy.lock);
        }

        /*
         * As a final step, dump "unpurged" areas.
         */
        show_purge_info(m);
        return 0;
}

/*
 * Create the "vmallocinfo" proc entry (mode 0400), shown through
 * vmalloc_info_show().
 *
 * When CONFIG_NUMA is enabled, an array of nr_node_ids counters is
 * allocated and attached as the entry's private data; show_numa_info()
 * reads it from m->private and skips the per-node output when it is NULL,
 * so an allocation failure here is tolerated.
 *
 * Always returns 0.
 */
static int __init proc_vmalloc_init(void)
{
        void *priv_data = NULL;

        /* kmalloc_array() checks the count * size multiplication for overflow. */
        if (IS_ENABLED(CONFIG_NUMA))
                priv_data = kmalloc_array(nr_node_ids, sizeof(unsigned int),
                                          GFP_KERNEL);

        /* Nobody else references priv_data if the entry was not created. */
        if (!proc_create_single_data("vmallocinfo",
                0400, NULL, vmalloc_info_show, priv_data))
                kfree(priv_data);

        return 0;
}
module_init(proc_vmalloc_init);

#endif

/*
 * Populate the free vmap_area tree/list with every gap left between the
 * areas recorded on vmlist, covering the range [1, ULONG_MAX].
 *
 * vmlist is walked in list order; a free area is inserted whenever the
 * current busy area does not start exactly where the previous one ended,
 * and once more for whatever remains after the last busy area.  A failed
 * vmap_area allocation triggers a one-time warning and that gap is
 * skipped.
 */
static void __init vmap_init_free_space(void)
{
        const unsigned long vmap_end = ULONG_MAX;
        unsigned long vmap_start = 1;
        struct vm_struct *busy = vmlist;
        struct vmap_area *free;

        /*
         *     B     F     B     B     B     F
         * -|-----|.....|-----|-----|-----|.....|-
         *  |           The KVA space           |
         *  |<--------------------------------->|
         */
        while (busy) {
                const unsigned long busy_start = (unsigned long) busy->addr;

                if (busy_start != vmap_start) {
                        free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                        if (!WARN_ON_ONCE(!free)) {
                                free->va_start = vmap_start;
                                free->va_end = busy_start;

                                insert_vmap_area_augment(free, NULL,
                                        &free_vmap_area_root,
                                                &free_vmap_area_list);
                        }
                }

                vmap_start = busy_start + busy->size;
                busy = busy->next;
        }

        /* Trailing gap after the last busy area, if any. */
        if (vmap_end != vmap_start) {
                free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                if (!WARN_ON_ONCE(!free)) {
                        free->va_start = vmap_start;
                        free->va_end = vmap_end;

                        insert_vmap_area_augment(free, NULL,
                                &free_vmap_area_root,
                                        &free_vmap_area_list);
                }
        }
}

/*
 * Size the vmap node array (64-bit builds only) and initialize every node:
 * the busy and lazy trees, lists and locks, plus the per-size pools and
 * the pool lock.
 */
static void vmap_init_nodes(void)
{
        struct vmap_node *vn;
        int slot, n;

#if BITS_PER_LONG == 64
        /*
         * The number of nodes follows the number of possible CPUs but is
         * capped at 128, so the scale factor is 1 on systems whose core
         * count does not exceed that threshold.
         *
         * A note on NUMA awareness: on bigger systems, for example
         * multi-socket NUMA machines that can end up with thousands of
         * cores in total, a "sub-numa-clustering" should be added.
         *
         * In that case a NUMA domain is considered a single entity with
         * dedicated sub-nodes in it, each describing one group or set of
         * cores.  Per-domain purging and per-domain balancing are
         * supposed to be added along with it.
         */
        n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);

        if (n > 1) {
                vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN);
                if (!vn) {
                        pr_err("Failed to allocate an array. Disable a node layer\n");
                } else {
                        /* Node partition is 16 pages. */
                        vmap_zone_size = (1 << 4) * PAGE_SIZE;
                        nr_vmap_nodes = n;
                        vmap_nodes = vn;
                }
        }
#endif

        for (n = 0; n < nr_vmap_nodes; n++) {
                vn = &vmap_nodes[n];

                /* Busy areas. */
                vn->busy.root = RB_ROOT;
                INIT_LIST_HEAD(&vn->busy.head);
                spin_lock_init(&vn->busy.lock);

                /* Lazily freed areas. */
                vn->lazy.root = RB_ROOT;
                INIT_LIST_HEAD(&vn->lazy.head);
                spin_lock_init(&vn->lazy.lock);

                /* Per-size pools start out empty. */
                slot = 0;
                while (slot < MAX_VA_SIZE_PAGES) {
                        INIT_LIST_HEAD(&vn->pool[slot].head);
                        WRITE_ONCE(vn->pool[slot].len, 0);
                        slot++;
                }

                spin_lock_init(&vn->pool_lock);
        }
}

static unsigned long
vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
        unsigned long count;
        struct vmap_node *vn;
        int i, j;

        for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
                vn = &vmap_nodes[i];

                for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
                        count += READ_ONCE(vn->pool[j].len);
        }

        return count ? count : SHRINK_EMPTY;
}

/*
 * Shrinker scan callback: run decay_va_pool_node() with its second
 * argument set to true on every vmap node.  Always returns SHRINK_STOP.
 */
static unsigned long
vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
        int node = 0;

        while (node < nr_vmap_nodes) {
                decay_va_pool_node(&vmap_nodes[node], true);
                node++;
        }

        return SHRINK_STOP;
}

/*
 * Boot-time initialization of vmalloc.  In order: create the vmap_area
 * slab cache, initialize the per-CPU vmap block queues and deferred-vfree
 * state, set up the vmap nodes, import the areas already recorded on
 * vmlist into the busy trees, build the free space, mark vmap as
 * initialized and register the "vmap-node" shrinker.
 */
void __init vmalloc_init(void)
{
        struct shrinker *vmap_node_shrinker;
        struct vmap_area *va;
        struct vmap_node *vn;
        struct vm_struct *tmp;
        int i;

        /*
         * Create the cache for vmap_area objects.
         */
        vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);

        /* Per-CPU state: vmap block queue and deferred vfree list/work. */
        for_each_possible_cpu(i) {
                struct vmap_block_queue *vbq;
                struct vfree_deferred *p;

                vbq = &per_cpu(vmap_block_queue, i);
                spin_lock_init(&vbq->lock);
                INIT_LIST_HEAD(&vbq->free);
                p = &per_cpu(vfree_deferred, i);
                init_llist_head(&p->list);
                INIT_WORK(&p->wq, delayed_vfree_work);
                xa_init(&vbq->vmap_blocks);
        }

        /*
         * Setup nodes before importing vmlist.
         */
        vmap_init_nodes();

        /* Import existing vmlist entries. */
        for (tmp = vmlist; tmp; tmp = tmp->next) {
                va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                /* Warn once and skip this entry if no vmap_area is available. */
                if (WARN_ON_ONCE(!va))
                        continue;

                va->va_start = (unsigned long)tmp->addr;
                va->va_end = va->va_start + tmp->size;
                va->vm = tmp;

                /* The area goes into the busy tree of the node owning its start. */
                vn = addr_to_node(va->va_start);
                insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
        }

        /*
         * Now we can initialize a free vmap space.
         */
        vmap_init_free_space();
        vmap_initialized = true;

        /*
         * vmap_initialized is already set above, so a shrinker allocation
         * failure only means the shrinker is not registered.
         */
        vmap_node_shrinker = shrinker_alloc(0, "vmap-node");
        if (!vmap_node_shrinker) {
                pr_err("Failed to allocate vmap-node shrinker!\n");
                return;
        }

        vmap_node_shrinker->count_objects = vmap_node_shrink_count;
        vmap_node_shrinker->scan_objects = vmap_node_shrink_scan;
        shrinker_register(vmap_node_shrinker);
}









































































































































    1 






    3 
















   12 







   13 














    1 


















   11 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright 2019 Google LLC
 */

#ifndef __LINUX_BLK_CRYPTO_INTERNAL_H
#define __LINUX_BLK_CRYPTO_INTERNAL_H

#include <linux/bio.h>
#include <linux/blk-mq.h>

/*
 * Represents a crypto mode supported by blk-crypto.  An array of these
 * descriptors is declared as blk_crypto_modes[].
 */
struct blk_crypto_mode {
        const char *name; /* name of this mode, shown in sysfs */
        const char *cipher_str; /* crypto API name (for fallback case) */
        unsigned int keysize; /* key size in bytes */
        unsigned int ivsize; /* iv size in bytes */
};

extern const struct blk_crypto_mode blk_crypto_modes[];

#ifdef CONFIG_BLK_INLINE_ENCRYPTION

int blk_crypto_sysfs_register(struct gendisk *disk);

void blk_crypto_sysfs_unregister(struct gendisk *disk);

void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
                             unsigned int inc);

bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio);

bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes,
                             struct bio_crypt_ctx *bc2);

/*
 * Back-merge check: @req supplies the leading crypt context and its byte
 * count (blk_rq_bytes()), @bio supplies the context that would follow.
 * The decision itself is made by bio_crypt_ctx_mergeable().
 */
static inline bool bio_crypt_ctx_back_mergeable(struct request *req,
                                                struct bio *bio)
{
        struct bio_crypt_ctx *rq_ctx = req->crypt_ctx;
        struct bio_crypt_ctx *bio_ctx = bio->bi_crypt_context;

        return bio_crypt_ctx_mergeable(rq_ctx, blk_rq_bytes(req), bio_ctx);
}

/*
 * Front-merge check: @bio supplies the leading crypt context and its size
 * (bi_iter.bi_size), @req supplies the context that would follow.  The
 * decision itself is made by bio_crypt_ctx_mergeable().
 */
static inline bool bio_crypt_ctx_front_mergeable(struct request *req,
                                                 struct bio *bio)
{
        struct bio_crypt_ctx *bio_ctx = bio->bi_crypt_context;
        struct bio_crypt_ctx *rq_ctx = req->crypt_ctx;

        return bio_crypt_ctx_mergeable(bio_ctx, bio->bi_iter.bi_size, rq_ctx);
}

/*
 * Request-to-request merge check: @req supplies the leading crypt context
 * and its byte count (blk_rq_bytes()), @next supplies the context that
 * would follow.  The decision is made by bio_crypt_ctx_mergeable().
 */
static inline bool bio_crypt_ctx_merge_rq(struct request *req,
                                          struct request *next)
{
        struct bio_crypt_ctx *head_ctx = req->crypt_ctx;
        struct bio_crypt_ctx *tail_ctx = next->crypt_ctx;

        return bio_crypt_ctx_mergeable(head_ctx, blk_rq_bytes(req), tail_ctx);
}

/* Reset @rq's crypto state: no keyslot and no crypt context. */
static inline void blk_crypto_rq_set_defaults(struct request *rq)
{
        rq->crypt_keyslot = NULL;
        rq->crypt_ctx = NULL;
}

static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
{
        return rq->crypt_ctx;
}

static inline bool blk_crypto_rq_has_keyslot(struct request *rq)
{
        return rq->crypt_keyslot;
}

blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
                                    const struct blk_crypto_key *key,
                                    struct blk_crypto_keyslot **slot_ptr);

void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot);

int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
                           const struct blk_crypto_key *key);

bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
                                const struct blk_crypto_config *cfg);

#else /* CONFIG_BLK_INLINE_ENCRYPTION */

/*
 * Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: nothing is
 * registered and 0 is returned.
 */
static inline int blk_crypto_sysfs_register(struct gendisk *disk)
{
        return 0;
}

/* Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: does nothing. */
static inline void blk_crypto_sysfs_unregister(struct gendisk *disk)
{
}

/*
 * Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: @rq and @bio are
 * always reported as compatible.
 */
static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
                                               struct bio *bio)
{
        return true;
}

/*
 * Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: the crypto check
 * never prevents a front merge.
 */
static inline bool bio_crypt_ctx_front_mergeable(struct request *req,
                                                 struct bio *bio)
{
        return true;
}

/*
 * Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: the crypto check
 * never prevents a back merge.
 */
static inline bool bio_crypt_ctx_back_mergeable(struct request *req,
                                                struct bio *bio)
{
        return true;
}

/*
 * Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: the crypto check
 * never prevents merging @next into @req.
 */
static inline bool bio_crypt_ctx_merge_rq(struct request *req,
                                          struct request *next)
{
        return true;
}

/* Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: does nothing. */
static inline void blk_crypto_rq_set_defaults(struct request *rq) { }

/*
 * Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: a request is
 * never reported as encrypted.
 */
static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
{
        return false;
}

/*
 * Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION: a request is
 * never reported as holding a keyslot.
 */
static inline bool blk_crypto_rq_has_keyslot(struct request *rq)
{
        return false;
}

#endif /* CONFIG_BLK_INLINE_ENCRYPTION */

void __bio_crypt_advance(struct bio *bio, unsigned int bytes);
/*
 * Forward to __bio_crypt_advance() with @bytes, but only for bios that
 * have a crypt context; other bios are left untouched.
 */
static inline void bio_crypt_advance(struct bio *bio, unsigned int bytes)
{
        if (!bio_has_crypt_ctx(bio))
                return;

        __bio_crypt_advance(bio, bytes);
}

void __bio_crypt_free_ctx(struct bio *bio);
/*
 * Forward to __bio_crypt_free_ctx(), but only for bios that have a crypt
 * context; other bios are left untouched.
 */
static inline void bio_crypt_free_ctx(struct bio *bio)
{
        if (!bio_has_crypt_ctx(bio))
                return;

        __bio_crypt_free_ctx(bio);
}

/*
 * When @bio has a crypt context, overwrite the DUN of @rq's crypt context
 * with the DUN of @bio's context.  Compiles to nothing without
 * CONFIG_BLK_INLINE_ENCRYPTION.
 */
static inline void bio_crypt_do_front_merge(struct request *rq,
                                            struct bio *bio)
{
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        if (!bio_has_crypt_ctx(bio))
                return;

        memcpy(rq->crypt_ctx->bc_dun, bio->bi_crypt_context->bc_dun,
               sizeof(rq->crypt_ctx->bc_dun));
#endif
}

bool __blk_crypto_bio_prep(struct bio **bio_ptr);
/*
 * Returns true right away for a bio without a crypt context; otherwise
 * returns whatever __blk_crypto_bio_prep() reports for @bio_ptr.
 */
static inline bool blk_crypto_bio_prep(struct bio **bio_ptr)
{
        if (!bio_has_crypt_ctx(*bio_ptr))
                return true;

        return __blk_crypto_bio_prep(bio_ptr);
}

blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq);
/*
 * Returns BLK_STS_OK for a request that is not encrypted; otherwise
 * returns the status of __blk_crypto_rq_get_keyslot().
 */
static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq)
{
        if (!blk_crypto_rq_is_encrypted(rq))
                return BLK_STS_OK;

        return __blk_crypto_rq_get_keyslot(rq);
}

void __blk_crypto_rq_put_keyslot(struct request *rq);
/*
 * Call __blk_crypto_rq_put_keyslot() only when @rq currently has a
 * keyslot; otherwise do nothing.
 */
static inline void blk_crypto_rq_put_keyslot(struct request *rq)
{
        if (!blk_crypto_rq_has_keyslot(rq))
                return;

        __blk_crypto_rq_put_keyslot(rq);
}

void __blk_crypto_free_request(struct request *rq);
/*
 * Call __blk_crypto_free_request() only when @rq is encrypted; otherwise
 * do nothing.
 */
static inline void blk_crypto_free_request(struct request *rq)
{
        if (!blk_crypto_rq_is_encrypted(rq))
                return;

        __blk_crypto_free_request(rq);
}

int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
                             gfp_t gfp_mask);
/**
 * blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio
 *                            is inserted
 * @rq: The request to prepare
 * @bio: The first bio being inserted into the request
 * @gfp_mask: Memory allocation flags
 *
 * A bio without a crypt context needs no preparation; all other bios are
 * handed to __blk_crypto_rq_bio_prep().
 *
 * Return: 0 on success, -ENOMEM if out of memory.  -ENOMEM is only possible if
 *           @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM.
 */
static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
                                         gfp_t gfp_mask)
{
        if (!bio_has_crypt_ctx(bio))
                return 0;

        return __blk_crypto_rq_bio_prep(rq, bio, gfp_mask);
}

#ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK

int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num);

bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr);

int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key);

#else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */

/*
 * Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK: warns
 * once that the crypto API fallback is disabled and fails with -ENOPKG.
 */
static inline int
blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
{
        pr_warn_once("crypto API fallback is disabled\n");
        return -ENOPKG;
}

/*
 * Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK: warns
 * once, marks the bio with BLK_STS_NOTSUPP and returns false.
 */
static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
{
        pr_warn_once("crypto API fallback disabled; failing request.\n");
        (*bio_ptr)->bi_status = BLK_STS_NOTSUPP;
        return false;
}

/*
 * Stub for builds without CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK: there is
 * nothing to evict, so 0 is returned.
 */
static inline int
blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
{
        return 0;
}

#endif /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */

#endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */












































































































































































































    6 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MM_PERCPU_INTERNAL_H
#define _MM_PERCPU_INTERNAL_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/memcontrol.h>

/*
 * pcpu_block_md is the metadata block struct.
 * Each chunk's bitmap is split into a number of full blocks.
 * All units are in terms of bits.
 *
 * The scan hint is the largest known contiguous area before the contig hint.
 * It is not necessarily the actual largest contiguous area, though.  The
 * invariant is that scan_hint_start > contig_hint_start iff
 * scan_hint == contig_hint.  This is necessary because, when scanning
 * forward, we don't know whether a new contig hint would be better than
 * the current one.
 */
struct pcpu_block_md {
        int                        scan_hint;        /* scan hint for block */
        int                        scan_hint_start; /* block-relative starting
                                                    position of the scan hint */
        int                     contig_hint;    /* contig hint for block */
        int                     contig_hint_start; /* block-relative starting
                                                      position of the contig hint */
        int                     left_free;      /* size of free space along
                                                   the left side of the block */
        int                     right_free;     /* size of free space along
                                                   the right side of the block */
        int                     first_free;     /* block position of first free */
        int                        nr_bits;        /* total number of bits this
                                                   block is responsible for */
};

/*
 * Per-object extension record.  Which members exist depends on the
 * configuration; with neither option enabled the struct is empty.
 */
struct pcpuobj_ext {
#ifdef CONFIG_MEMCG_KMEM
        struct obj_cgroup        *cgroup;	/* only with CONFIG_MEMCG_KMEM */
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
        union codetag_ref        tag;		/* only with CONFIG_MEM_ALLOC_PROFILING */
#endif
};

#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MEM_ALLOC_PROFILING)
#define NEED_PCPUOBJ_EXT
#endif

/*
 * Per-chunk bookkeeping: slot-list linkage, free-space accounting, the
 * allocation and boundary bitmaps with their block metadata, and the
 * page population state.
 */
struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
        int                        nr_alloc;        /* # of allocations */
        size_t                        max_alloc_size; /* largest allocation size */
#endif

        struct list_head        list;                /* linked to pcpu_slot lists */
        int                        free_bytes;        /* free bytes in the chunk */
        struct pcpu_block_md        chunk_md;	/* chunk-wide block metadata */
        unsigned long                *bound_map;        /* boundary map */

        /*
         * base_addr is the base address of this chunk.
         * To reduce false sharing, the layout is arranged so that
         * base_addr sits in a different cacheline than free_bytes and
         * chunk_md.
         */
        void                        *base_addr ____cacheline_aligned_in_smp;

        unsigned long                *alloc_map;        /* allocation map */
        struct pcpu_block_md        *md_blocks;        /* metadata blocks */

        void                        *data;                /* chunk data */
        bool                        immutable;        /* no [de]population allowed */
        bool                        isolated;        /* isolated from active chunk
                                                   slots */
        int                        start_offset;        /* the overlap with the previous
                                                   region to have a page aligned
                                                   base_addr */
        int                        end_offset;        /* additional area required to
                                                   have the region end page
                                                   aligned */
#ifdef NEED_PCPUOBJ_EXT
        struct pcpuobj_ext        *obj_exts;        /* vector of per-object
                                                   extensions (struct pcpuobj_ext) */
#endif

        int                        nr_pages;        /* # of pages served by this chunk */
        int                        nr_populated;        /* # of populated pages */
        int                     nr_empty_pop_pages; /* # of empty populated pages */
        unsigned long                populated[];        /* populated bitmap */
};

/*
 * True when CONFIG_MEM_ALLOC_PROFILING is enabled or when
 * mem_cgroup_kmem_disabled() reports false.
 */
static inline bool need_pcpuobj_ext(void)
{
        return IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING) ||
               !mem_cgroup_kmem_disabled();
}

extern spinlock_t pcpu_lock;

extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
extern int pcpu_sidelined_slot;
extern int pcpu_to_depopulate_slot;
extern int pcpu_nr_empty_pop_pages;

extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk;

/**
 * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks
 * @chunk: chunk of interest
 *
 * This conversion is from the number of physical pages that the chunk
 * serves to the number of bitmap blocks used.
 *
 * Return: @chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE.
 */
static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
{
        return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
}

/**
 * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap
 * @pages: number of physical pages
 *
 * This conversion is from physical pages to the number of bits
 * required in the bitmap: one bit per PCPU_MIN_ALLOC_SIZE bytes.
 *
 * Return: @pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE.
 */
static inline int pcpu_nr_pages_to_map_bits(int pages)
{
        return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_chunk_map_bits - number of bitmap bits needed for a chunk
 * @chunk: chunk of interest
 *
 * Applies pcpu_nr_pages_to_map_bits() to the number of physical pages
 * that @chunk serves.
 */
static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
{
        const int nr_pages = chunk->nr_pages;

        return pcpu_nr_pages_to_map_bits(nr_pages);
}

/**
 * pcpu_obj_full_size - total size charged for one accounted object
 * @size: size of area to allocate in bytes
 *
 * The object occupies @size bytes on every possible CPU.  In addition,
 * with CONFIG_MEMCG_KMEM and kmemcg not disabled, one obj_cgroup pointer
 * is needed per PCPU_MIN_ALLOC_SIZE bytes of @size; that space is
 * charged as well.
 */
static inline size_t pcpu_obj_full_size(size_t size)
{
        size_t per_cpu_total;
        size_t meta = 0;

#ifdef CONFIG_MEMCG_KMEM
        if (!mem_cgroup_kmem_disabled()) {
                const size_t nr_units = size / PCPU_MIN_ALLOC_SIZE;

                meta = nr_units * sizeof(struct obj_cgroup *);
        }
#endif

        per_cpu_total = size * num_possible_cpus();

        return per_cpu_total + meta;
}

#ifdef CONFIG_PERCPU_STATS

#include <linux/spinlock.h>

/*
 * Allocator-wide statistics, only built with CONFIG_PERCPU_STATS.
 * The fields are updated by the pcpu_stats_*() helpers in this file.
 */
struct percpu_stats {
        u64 nr_alloc;                /* lifetime # of allocations */
        u64 nr_dealloc;                /* lifetime # of deallocations */
        u64 nr_cur_alloc;        /* current # of allocations */
        u64 nr_max_alloc;        /* max # of live allocations */
        u32 nr_chunks;                /* current # of live chunks */
        u32 nr_max_chunks;        /* max # of live chunks */
        size_t min_alloc_size;        /* min allocation size */
        size_t max_alloc_size;        /* max allocation size */
};

extern struct percpu_stats pcpu_stats;
extern struct pcpu_alloc_info pcpu_stats_ai;

/*
 * For debug purposes. We don't care about the flexible array.
 */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
        memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info));

        /* initialize min_alloc_size to unit_size */
        pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
}

/*
 * pcpu_stats_area_alloc - account for a newly allocated area
 * @chunk: the location of the area being allocated
 * @size: size of area to allocate in bytes
 *
 * Bumps the global lifetime/live counters, refreshes the global high-water
 * mark and size range, then updates the per-chunk counters.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
        lockdep_assert_held(&pcpu_lock);

        pcpu_stats.nr_alloc++;
        pcpu_stats.nr_cur_alloc++;

        /* High-water mark of simultaneously live allocations. */
        if (pcpu_stats.nr_cur_alloc > pcpu_stats.nr_max_alloc)
                pcpu_stats.nr_max_alloc = pcpu_stats.nr_cur_alloc;

        /* Widen the observed allocation size range if needed. */
        if (size < pcpu_stats.min_alloc_size)
                pcpu_stats.min_alloc_size = size;
        if (size > pcpu_stats.max_alloc_size)
                pcpu_stats.max_alloc_size = size;

        chunk->nr_alloc++;
        chunk->max_alloc_size = max(chunk->max_alloc_size, size);
}

/*
 * pcpu_stats_area_dealloc - account for a freed area
 * @chunk: the location of the area being deallocated
 *
 * Drops the per-chunk and global live counts and bumps the global
 * lifetime deallocation count.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
        lockdep_assert_held(&pcpu_lock);

        chunk->nr_alloc--;

        pcpu_stats.nr_cur_alloc--;
        pcpu_stats.nr_dealloc++;
}

/*
 * pcpu_stats_chunk_alloc - account for a newly created chunk
 *
 * Takes pcpu_lock itself (irqsave), bumps the live chunk count and
 * refreshes the high-water mark.
 */
static inline void pcpu_stats_chunk_alloc(void)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&pcpu_lock, irq_flags);

        pcpu_stats.nr_chunks++;
        if (pcpu_stats.nr_chunks > pcpu_stats.nr_max_chunks)
                pcpu_stats.nr_max_chunks = pcpu_stats.nr_chunks;

        spin_unlock_irqrestore(&pcpu_lock, irq_flags);
}

/*
 * pcpu_stats_chunk_dealloc - account for a destroyed chunk
 *
 * Takes pcpu_lock itself (irqsave) and drops the live chunk count.
 */
static inline void pcpu_stats_chunk_dealloc(void)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&pcpu_lock, irq_flags);
        pcpu_stats.nr_chunks--;
        spin_unlock_irqrestore(&pcpu_lock, irq_flags);
}

#else

/*
 * CONFIG_PERCPU_STATS is not set: every stats hook is an empty inline
 * function, so call sites need no #ifdef of their own.
 */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
}

static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
}

static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
}

static inline void pcpu_stats_chunk_alloc(void)
{
}

static inline void pcpu_stats_chunk_dealloc(void)
{
}

#endif /* !CONFIG_PERCPU_STATS */

#endif

















































    2 



    2 
























































    1 










    1 
    1 


    1 























































    1 














































































































































































































































    2 




































    1 


    1 























    1 
















    3 



















    3 








    3 



    1 








    1 







    3 
































    1 
    2 

















    1 







    1 

    1 























































    2 




















    2 






    1 



    2 
















    2 








    1 
    1 












    2 












    2 


















    2 








































































































































































































    2 










    2 














































    1 




























    1 





















    1 

    1 



    1 











    1 
























    1 
































    1 




    1 












































    1 






















    1 
    1 































    1 





    1 

















    1 





    1 


    1 








    1 
































    1 









    1 












































    1 
    1 




















    1 







    1 




    1 









    1 

    1 






















































    1 































    1 











































    1 
























    1 













    1 













    1 



    1 



    1 

















    1 

    1 

    1 









    1 






    1 




























    1 






    1 











































































































































































































    1 

























    1 







    1 







    1 













    1 
























    1 











    1 
















    1 

    1 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
// SPDX-License-Identifier: GPL-2.0-only
/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/bvec.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/sched/signal.h>

#include "internal.h"

/*
 * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
 * indicate they support non-blocking reads or writes, we must clear it
 * here if set to avoid blocking other users of this pipe if splice is
 * being done on it.
 */
static noinline void noinline pipe_clear_nowait(struct file *file)
{
        fmode_t fmode = READ_ONCE(file->f_mode);

        do {
                if (!(fmode & FMODE_NOWAIT))
                        break;
        } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
}

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 *
 * @pipe is not used by this implementation.
 *
 * Returns true when the folio was removed from its mapping; on that path
 * PIPE_BUF_FLAG_LRU is set on @buf and the folio is returned still locked
 * (there is no folio_unlock() before the return).  Returns false, with the
 * folio unlocked again, when the folio has no mapping, its private data
 * could not be released, or remove_mapping() failed.
 */
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct folio *folio = page_folio(buf->page);
        struct address_space *mapping;

        folio_lock(folio);

        mapping = folio_mapping(folio);
        if (mapping) {
                WARN_ON(!folio_test_uptodate(folio));

                /*
                 * At least for ext2 with nobh option, we need to wait on
                 * writeback completing on this folio, since we'll remove it
                 * from the pagecache.  Otherwise truncate wont wait on the
                 * folio, allowing the disk blocks to be reused by someone else
                 * before we actually wrote our data to them. fs corruption
                 * ensues.
                 */
                folio_wait_writeback(folio);

                /* Could not drop the folio's private data: give up. */
                if (!filemap_release_folio(folio, GFP_KERNEL))
                        goto out_unlock;

                /*
                 * If we succeeded in removing the mapping, set LRU flag
                 * and return good.
                 */
                if (remove_mapping(mapping, folio)) {
                        buf->flags |= PIPE_BUF_FLAG_LRU;
                        return true;
                }
        }

        /*
         * Raced with truncate or failed to remove folio from current
         * address space, unlock and return failure.
         */
out_unlock:
        folio_unlock(folio);
        return false;
}

/*
 * Release hook: drop the buffer's page reference, then clear
 * PIPE_BUF_FLAG_LRU on the buffer.  @pipe is not used.
 */
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
                                        struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        put_page(page);
        buf->flags = buf->flags & ~PIPE_BUF_FLAG_LRU;
}

/*
 * Confirm hook: decide whether the contents of @buf may be accessed.  The
 * buffer holds a page cache page, so IO may still be in flight.
 *
 * Returns 0 when the folio is uptodate, -ENODATA when it lost its mapping
 * (truncated/unhashed; this causes a 0-byte splice if it is the first
 * page), and -EIO when it is still not uptodate once the folio lock has
 * been obtained (read error from disk).  @pipe is not used.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
                                       struct pipe_buffer *buf)
{
        struct folio *folio = page_folio(buf->page);
        int status = 0;

        /* Fast path: nothing to wait for. */
        if (folio_test_uptodate(folio))
                return 0;

        folio_lock(folio);

        if (!folio->mapping)
                status = -ENODATA;
        else if (!folio_test_uptodate(folio))
                status = -EIO;

        folio_unlock(folio);
        return status;
}

/* Buffer operations for pipe buffers that hold page cache pages. */
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
        .get            = generic_pipe_buf_get,
        .try_steal      = page_cache_pipe_buf_try_steal,
        .release        = page_cache_pipe_buf_release,
        .confirm        = page_cache_pipe_buf_confirm,
};

/*
 * Steal hook for user page buffers.  Only buffers carrying
 * PIPE_BUF_FLAG_GIFT may be stolen; for those PIPE_BUF_FLAG_LRU is set and
 * the decision is left to generic_pipe_buf_try_steal().
 */
static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        if (buf->flags & PIPE_BUF_FLAG_GIFT) {
                buf->flags |= PIPE_BUF_FLAG_LRU;
                return generic_pipe_buf_try_steal(pipe, buf);
        }

        return false;
}

/*
 * Buffer operations built around user_page_pipe_buf_try_steal(); no
 * .confirm hook is provided.
 */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
        .get            = generic_pipe_buf_get,
        .try_steal      = user_page_pipe_buf_try_steal,
        .release        = page_cache_pipe_buf_release,
};

/*
 * Wake tasks sleeping on @pipe's rd_wait queue (only when that queue has
 * waiters) and deliver SIGIO with POLL_IN to the pipe's fasync_readers.
 */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
        /*
         * Full barrier ahead of the lockless waitqueue_active() test.
         * NOTE(review): presumably this orders the caller's earlier pipe
         * updates before the waiter check so that no wakeup is missed -
         * confirm against the waiting side.
         */
        smp_mb();
        if (waitqueue_active(&pipe->rd_wait))
                wake_up_interruptible(&pipe->rd_wait);
        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:        pipe to fill
 * @spd:        data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
                       struct splice_pipe_desc *spd)
{
        unsigned int spd_pages = spd->nr_pages;
        unsigned int tail = pipe->tail;
        unsigned int head = pipe->head;
        unsigned int mask = pipe->ring_size - 1;
        ssize_t ret = 0;
        int page_nr = 0;

        if (!spd_pages)
                return 0;

        if (unlikely(!pipe->readers)) {
                send_sig(SIGPIPE, current, 0);
                ret = -EPIPE;
                goto out;
        }

        while (!pipe_full(head, tail, pipe->max_usage)) {
                struct pipe_buffer *buf = &pipe->bufs[head & mask];

                buf->page = spd->pages[page_nr];
                buf->offset = spd->partial[page_nr].offset;
                buf->len = spd->partial[page_nr].len;
                buf->private = spd->partial[page_nr].private;
                buf->ops = spd->ops;
                buf->flags = 0;

                head++;
                pipe->head = head;
                page_nr++;
                ret += buf->len;

                if (!--spd->nr_pages)
                        break;
        }

        if (!ret)
                ret = -EAGAIN;

out:
        while (page_nr < spd_pages)
                spd->spd_release(spd, page_nr++);

        return ret;
}
EXPORT_SYMBOL_GPL(splice_to_pipe);

/*
 * Copy a single, already set up pipe buffer into the next free slot of
 * @pipe.
 *
 * Returns @buf->len on success.  On failure @buf is released through
 * pipe_buf_release() and an error is returned: -EPIPE (after sending
 * SIGPIPE to the current task) when the pipe has no readers, -EAGAIN when
 * the pipe is full.
 */
ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
        unsigned int head = pipe->head;
        unsigned int tail = pipe->tail;
        unsigned int ring_mask = pipe->ring_size - 1;
        int err;

        if (unlikely(!pipe->readers)) {
                send_sig(SIGPIPE, current, 0);
                err = -EPIPE;
                goto drop;
        }

        if (pipe_full(head, tail, pipe->max_usage)) {
                err = -EAGAIN;
                goto drop;
        }

        pipe->bufs[head & ring_mask] = *buf;
        pipe->head = head + 1;
        return buf->len;

drop:
        pipe_buf_release(pipe, buf);
        return err;
}
EXPORT_SYMBOL(add_to_pipe);

/*
 * Size the page and partial-page arrays of @spd for @pipe.
 *
 * The pipe's max_usage is recorded in @spd->nr_pages_max.  When it does not
 * exceed PIPE_DEF_BUFFERS the arrays already referenced by @spd are kept;
 * otherwise both are replaced by freshly allocated arrays of max_usage
 * entries.
 *
 * Returns 0 on success, or -ENOMEM if either allocation failed (in which
 * case both are freed again).
 */
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
{
        unsigned int nr_bufs = READ_ONCE(pipe->max_usage);

        spd->nr_pages_max = nr_bufs;
        if (nr_bufs <= PIPE_DEF_BUFFERS)
                return 0;

        spd->pages = kmalloc_array(nr_bufs, sizeof(struct page *), GFP_KERNEL);
        spd->partial = kmalloc_array(nr_bufs, sizeof(struct partial_page),
                                     GFP_KERNEL);

        if (!spd->pages || !spd->partial) {
                kfree(spd->pages);
                kfree(spd->partial);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Free @spd's page and partial-page arrays, but only when nr_pages_max
 * exceeds PIPE_DEF_BUFFERS - the case in which splice_grow_spd() allocates
 * them.
 */
void splice_shrink_spd(struct splice_pipe_desc *spd)
{
        if (spd->nr_pages_max > PIPE_DEF_BUFFERS) {
                kfree(spd->pages);
                kfree(spd->partial);
        }
}

/**
 * copy_splice_read -  Copy data from a file and splice the copy into a pipe
 * @in: The file to read from
 * @ppos: Pointer to the file position to read from
 * @pipe: The pipe to splice into
 * @len: The amount to splice
 * @flags: The SPLICE_F_* flags
 *
 * This function allocates a bunch of pages sufficient to hold the requested
 * amount of data (but limited by the remaining pipe capacity), passes it to
 * the file's ->read_iter() to read into and then splices the used pages into
 * the pipe.
 *
 * Return: On success, the number of bytes read will be returned and *@ppos
 * will be updated if appropriate; 0 will be returned if there is no more data
 * to be read; -EAGAIN will be returned if the pipe had no space, and some
 * other negative error code will be returned on error.  A short read may occur
 * if the pipe has insufficient space, we reach the end of the data or we hit a
 * hole.
 */
ssize_t copy_splice_read(struct file *in, loff_t *ppos,
                         struct pipe_inode_info *pipe,
                         size_t len, unsigned int flags)
{
        /* Note: @flags is not consulted anywhere in this function body. */
        struct iov_iter to;
        struct bio_vec *bv;
        struct kiocb kiocb;
        struct page **pages;
        ssize_t ret;
        size_t used, npages, chunk, remain, keep = 0;
        int i;

        /* Work out how much data we can actually add into the pipe */
        used = pipe_occupancy(pipe->head, pipe->tail);
        npages = max_t(ssize_t, pipe->max_usage - used, 0);
        len = min_t(size_t, len, npages * PAGE_SIZE);
        npages = DIV_ROUND_UP(len, PAGE_SIZE);

        /*
         * One zeroed allocation carries both arrays: npages bio_vecs,
         * followed directly by npages page pointers.
         */
        bv = kzalloc(array_size(npages, sizeof(bv[0])) +
                     array_size(npages, sizeof(struct page *)), GFP_KERNEL);
        if (!bv)
                return -ENOMEM;

        /*
         * The page pointer array starts right behind the last bio_vec.
         * The bulk allocator's return value replaces npages, so everything
         * below works with the number of pages actually obtained.
         *
         * NOTE(review): if npages is already 0 here (len == 0 or no free
         * pipe slots) and the bulk allocation then returns 0, this path
         * yields -ENOMEM rather than the -EAGAIN the kernel-doc describes
         * for a pipe without space - presumably callers never get here in
         * that state; confirm.
         */
        pages = (struct page **)(bv + npages);
        npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
        if (!npages) {
                kfree(bv);
                return -ENOMEM;
        }

        /* Clamp the request to what the obtained pages can hold. */
        remain = len = min_t(size_t, len, npages * PAGE_SIZE);

        /* Describe each page as a bio_vec; only the last may be partial. */
        for (i = 0; i < npages; i++) {
                chunk = min_t(size_t, PAGE_SIZE, remain);
                bv[i].bv_page = pages[i];
                bv[i].bv_offset = 0;
                bv[i].bv_len = chunk;
                remain -= chunk;
        }

        /* Do the I/O */
        iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
        init_sync_kiocb(&kiocb, in);
        kiocb.ki_pos = *ppos;
        ret = in->f_op->read_iter(&kiocb, &to);

        /* keep = number of pages that received at least one byte. */
        if (ret > 0) {
                keep = DIV_ROUND_UP(ret, PAGE_SIZE);
                *ppos = kiocb.ki_pos;
        }

        /*
         * Callers of ->splice_read() expect -EAGAIN on "can't put anything in
         * there", rather than -EFAULT.
         */
        if (ret == -EFAULT)
                ret = -EAGAIN;

        /* Free any pages that didn't get touched at all. */
        if (keep < npages)
                release_pages(pages + keep, npages - keep);

        /*
         * Push the remaining pages into the pipe.  When ret <= 0, keep is 0
         * and the loop body never runs.  Slots are filled at the pipe head
         * and the head is advanced directly.
         * NOTE(review): presumably the caller holds the pipe lock - confirm.
         */
        remain = ret;
        for (i = 0; i < keep; i++) {
                struct pipe_buffer *buf = pipe_head_buf(pipe);

                chunk = min_t(size_t, remain, PAGE_SIZE);
                *buf = (struct pipe_buffer) {
                        .ops        = &default_pipe_buf_ops,
                        .page        = bv[i].bv_page,
                        .offset        = 0,
                        .len        = chunk,
                };
                pipe->head++;
                remain -= chunk;
        }

        kfree(bv);
        return ret;
}
EXPORT_SYMBOL(copy_splice_read);

/*
 * Default buffer operations: the generic get, steal and release helpers,
 * with no .confirm hook.
 */
const struct pipe_buf_operations default_pipe_buf_ops = {
        .get            = generic_pipe_buf_get,
        .try_steal      = generic_pipe_buf_try_steal,
        .release        = generic_pipe_buf_release,
};

/*
 * Pipe buffer operations for a socket and similar: generic get and release
 * helpers only, with no .try_steal hook.
 */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
        .get            = generic_pipe_buf_get,
        .release        = generic_pipe_buf_release,
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);

/*
 * Wake tasks sleeping on @pipe's wr_wait queue (only when that queue has
 * waiters) and deliver SIGIO with POLL_OUT to the pipe's fasync_writers.
 */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
        /*
         * Full barrier ahead of the lockless waitqueue_active() test.
         * NOTE(review): presumably this orders the caller's earlier pipe
         * updates before the waiter check so that no wakeup is missed -
         * confirm against the waiting side.
         */
        smp_mb();
        if (waitqueue_active(&pipe->wr_wait))
                wake_up_interruptible(&pipe->wr_wait);
        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}

/**
 * splice_from_pipe_feed - feed available data from a pipe to a file
 * @pipe:        pipe to splice from
 * @sd:                information to @actor
 * @actor:        handler that splices the data
 *
 * Description:
 *    This function loops over the pipe and calls @actor to do the
 *    actual moving of a single struct pipe_buffer to the desired
 *    destination.  It returns when there's no more buffers left in
 *    the pipe or if the requested number of bytes (@sd->total_len)
 *    have been copied.  It returns a positive number (one) if the
 *    pipe needs to be filled with more data, zero if the required
 *    number of bytes have been copied and -errno on error.
 *
 *    This, together with splice_from_pipe_{begin,end,next}, may be
 *    used to implement the functionality of __splice_from_pipe() when
 *    locking is required around copying the pipe buffers to the
 *    destination.
 */
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
                          splice_actor *actor)
{
        /* head and mask are sampled once; only tail moves inside the loop. */
        unsigned int head = pipe->head;
        unsigned int tail = pipe->tail;
        unsigned int mask = pipe->ring_size - 1;
        int ret;

        while (!pipe_empty(head, tail)) {
                struct pipe_buffer *buf = &pipe->bufs[tail & mask];

                /* Offer the actor this buffer, capped at what is still wanted. */
                sd->len = buf->len;
                if (sd->len > sd->total_len)
                        sd->len = sd->total_len;

                /* -ENODATA from the confirm hook is reported as 0, not an error. */
                ret = pipe_buf_confirm(pipe, buf);
                if (unlikely(ret)) {
                        if (ret == -ENODATA)
                                ret = 0;
                        return ret;
                }

                /* A zero or negative actor result ends the feed and is returned as is. */
                ret = actor(pipe, buf, sd);
                if (ret <= 0)
                        return ret;

                /* The actor consumed ret bytes: advance within the buffer... */
                buf->offset += ret;
                buf->len -= ret;

                /* ...and update the progress of the overall operation. */
                sd->num_spliced += ret;
                sd->len -= ret;
                sd->pos += ret;
                sd->total_len -= ret;

                /*
                 * Buffer fully drained: release it and free its slot by
                 * advancing the tail.  A writer wakeup is requested only
                 * when pipe->files is non-zero.
                 */
                if (!buf->len) {
                        pipe_buf_release(pipe, buf);
                        tail++;
                        pipe->tail = tail;
                        if (pipe->files)
                                sd->need_wakeup = true;
                }

                /* Everything that was asked for has been spliced. */
                if (!sd->total_len)
                        return 0;
        }

        /* Ran out of buffers with bytes still wanted: more data is needed. */
        return 1;
}

/*
 * The pipe is known to hold at least one buffer, but the one at the tail
 * may be zero-length.  If so, release it, advance the tail past it and
 * return true; otherwise leave the pipe untouched and return false.
 */
static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
{
        unsigned int tail = pipe->tail;
        unsigned int ring_mask = pipe->ring_size - 1;
        struct pipe_buffer *tail_buf = &pipe->bufs[tail & ring_mask];

        if (likely(tail_buf->len))
                return false;

        pipe_buf_release(pipe, tail_buf);
        pipe->tail = tail + 1;
        return true;
}

/**
 * splice_from_pipe_next - wait until the pipe has data to splice from
 * @pipe:        pipe to splice from
 * @sd:                information about the splice operation
 *
 * Description:
 *    Returns a positive value (one) once a non-empty pipe buffer is
 *    available.  While the pipe is empty it returns zero if there are no
 *    writers or if something has already been spliced, -EAGAIN if
 *    SPLICE_F_NONBLOCK is set, and -ERESTARTSYS if a signal is pending;
 *    otherwise it issues any pending writer wakeup and waits for the pipe
 *    to become readable.  Zero-length buffers found at the tail are
 *    discarded before the check is repeated.
 */
static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
        /*
         * Look for a signal up front so the task stays killable even when
         * buffers are always available.
         */
        if (signal_pending(current))
                return -ERESTARTSYS;

        do {
                while (pipe_empty(pipe->head, pipe->tail)) {
                        if (!pipe->writers || sd->num_spliced)
                                return 0;

                        if (sd->flags & SPLICE_F_NONBLOCK)
                                return -EAGAIN;

                        if (signal_pending(current))
                                return -ERESTARTSYS;

                        if (sd->need_wakeup) {
                                wakeup_pipe_writers(pipe);
                                sd->need_wakeup = false;
                        }

                        pipe_wait_readable(pipe);
                }
        } while (eat_empty_buffer(pipe));

        return 1;
}

/**
 * splice_from_pipe_begin - start splicing from pipe
 * @sd:                information about the splice operation
 *
 * Description:
 *    Resets the per-operation bookkeeping in @sd (the spliced byte count
 *    and the pending-wakeup flag).  Call it once before entering a loop
 *    built from splice_from_pipe_next() and splice_from_pipe_feed().
 */
static void splice_from_pipe_begin(struct splice_desc *sd)
{
        sd->need_wakeup = false;
        sd->num_spliced = 0;
}

/**
 * splice_from_pipe_end - finish splicing from pipe
 * @pipe:        pipe to splice from
 * @sd:                information about the splice operation
 *
 * Description:
 *    Delivers the writer wakeup that was deferred during the splice loop,
 *    if @sd records that one is due.  Call it after a loop built from
 *    splice_from_pipe_next() and splice_from_pipe_feed().
 */
static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
        if (!sd->need_wakeup)
                return;

        wakeup_pipe_writers(pipe);
}

/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:        pipe to splice from
 * @sd:                information to @actor
 * @actor:        handler that splices the data
 *
 * Description:
 *    Repeatedly waits for pipe data and feeds it, one struct pipe_buffer
 *    at a time, to @actor, which performs the actual transfer to the
 *    destination. See pipe_to_file, pipe_to_sendmsg, or pipe_to_user.
 *
 *    Returns the number of bytes spliced if that is non-zero, otherwise
 *    the last status reported by the wait/feed step (zero or -errno).
 *
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
                           splice_actor *actor)
{
        int status;

        splice_from_pipe_begin(sd);
        for (;;) {
                cond_resched();

                status = splice_from_pipe_next(pipe, sd);
                if (status <= 0)
                        break;

                status = splice_from_pipe_feed(pipe, sd, actor);
                if (status <= 0)
                        break;
        }
        splice_from_pipe_end(pipe, sd);

        return sd->num_spliced ? sd->num_spliced : status;
}
EXPORT_SYMBOL(__splice_from_pipe);

/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:        pipe to splice from
 * @out:        file to splice to
 * @ppos:        position in @out
 * @len:        how many bytes to splice
 * @flags:        splice modifier flags
 * @actor:        handler that splices the data
 *
 * Description:
 *    Locked wrapper around __splice_from_pipe(): it builds the splice
 *    descriptor from the arguments, holds the pipe lock for the whole
 *    transfer and returns whatever __splice_from_pipe() returned.
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
                         loff_t *ppos, size_t len, unsigned int flags,
                         splice_actor *actor)
{
        struct splice_desc desc = {
                .u.file = out,
                .pos = *ppos,
                .flags = flags,
                .total_len = len,
        };
        ssize_t spliced;

        pipe_lock(pipe);
        spliced = __splice_from_pipe(pipe, &desc, actor);
        pipe_unlock(pipe);

        return spliced;
}

/**
 * iter_file_splice_write - splice data from a pipe to a file
 * @pipe:        pipe info
 * @out:        file to write to
 * @ppos:        position in @out
 * @len:        number of bytes to splice
 * @flags:        splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *    This one is ->write_iter-based.
 *
 *    Returns the number of bytes written if any were, otherwise zero or a
 *    negative errno (-EINVAL if @out has no ->write_iter, -ENOMEM if the
 *    bio_vec array cannot be allocated, or the status from waiting on the
 *    pipe, confirming a buffer or ->write_iter itself).  A zero @len
 *    yields zero.
 *
 */
ssize_t
iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
                          loff_t *ppos, size_t len, unsigned int flags)
{
        struct splice_desc sd = {
                .total_len = len,
                .flags = flags,
                .pos = *ppos,
                .u.file = out,
        };
        int nbufs = pipe->max_usage;
        struct bio_vec *array;
        /*
         * Start from zero: with @len == 0 the loop below never runs and
         * nothing else would assign a return value.
         */
        ssize_t ret = 0;

        if (!out->f_op->write_iter)
                return -EINVAL;

        array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL);
        if (unlikely(!array))
                return -ENOMEM;

        pipe_lock(pipe);

        splice_from_pipe_begin(&sd);
        while (sd.total_len) {
                struct kiocb kiocb;
                struct iov_iter from;
                unsigned int head, tail, mask;
                size_t left;
                int n;

                ret = splice_from_pipe_next(pipe, &sd);
                if (ret <= 0)
                        break;

                /*
                 * max_usage was sampled before the pipe lock was taken;
                 * grow the array if the pipe is now larger than that.
                 */
                if (unlikely(nbufs < pipe->max_usage)) {
                        kfree(array);
                        nbufs = pipe->max_usage;
                        array = kcalloc(nbufs, sizeof(struct bio_vec),
                                        GFP_KERNEL);
                        if (!array) {
                                ret = -ENOMEM;
                                break;
                        }
                }

                head = pipe->head;
                tail = pipe->tail;
                mask = pipe->ring_size - 1;

                /* build the vector */
                left = sd.total_len;
                for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t this_len = buf->len;

                        /* zero-length bvecs are not supported, skip them */
                        if (!this_len)
                                continue;
                        this_len = min(this_len, left);

                        ret = pipe_buf_confirm(pipe, buf);
                        if (unlikely(ret)) {
                                /* -ENODATA is reported to the caller as 0 */
                                if (ret == -ENODATA)
                                        ret = 0;
                                goto done;
                        }

                        bvec_set_page(&array[n], buf->page, this_len,
                                      buf->offset);
                        left -= this_len;
                        n++;
                }

                /* hand the collected segments to ->write_iter in one call */
                iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
                init_sync_kiocb(&kiocb, out);
                kiocb.ki_pos = sd.pos;
                ret = out->f_op->write_iter(&kiocb, &from);
                sd.pos = kiocb.ki_pos;
                if (ret <= 0)
                        break;

                sd.num_spliced += ret;
                sd.total_len -= ret;
                *ppos = sd.pos;

                /* dismiss the fully eaten buffers, adjust the partial one */
                tail = pipe->tail;
                while (ret) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        if (ret >= buf->len) {
                                ret -= buf->len;
                                buf->len = 0;
                                pipe_buf_release(pipe, buf);
                                tail++;
                                pipe->tail = tail;
                                if (pipe->files)
                                        sd.need_wakeup = true;
                        } else {
                                buf->offset += ret;
                                buf->len -= ret;
                                ret = 0;
                        }
                }
        }
done:
        kfree(array);
        splice_from_pipe_end(pipe, &sd);

        pipe_unlock(pipe);

        /* bytes already written take precedence over a later status */
        if (sd.num_spliced)
                ret = sd.num_spliced;

        return ret;
}

EXPORT_SYMBOL(iter_file_splice_write);

#ifdef CONFIG_NET
/**
 * splice_to_socket - splice data from a pipe to a socket
 * @pipe:        pipe to splice from
 * @out:        socket to write to
 * @ppos:        position in @out (not referenced by this function)
 * @len:        number of bytes to splice
 * @flags:        splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 *    Pipe buffers are gathered into batches of up to ARRAY_SIZE(bvec)
 *    segments and passed to sock_sendmsg() with MSG_SPLICE_PAGES set.
 *    Returns the number of bytes sent if any were, otherwise the last
 *    status (zero or -errno).
 *
 */
ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
                         loff_t *ppos, size_t len, unsigned int flags)
{
        struct socket *sock = sock_from_file(out);
        struct bio_vec bvec[16];
        struct msghdr msg = {};
        ssize_t ret = 0;
        size_t spliced = 0;
        bool need_wakeup = false;

        pipe_lock(pipe);

        while (len > 0) {
                /* bc counts the bvec entries filled for this batch */
                unsigned int head, tail, mask, bc = 0;
                size_t remain = len;

                /*
                 * Check for signal early to make process killable when there
                 * are always buffers available
                 */
                ret = -ERESTARTSYS;
                if (signal_pending(current))
                        break;

                /*
                 * Wait for data.  ret is staged before each test so that a
                 * "goto out" returns the matching status.
                 */
                while (pipe_empty(pipe->head, pipe->tail)) {
                        ret = 0;
                        if (!pipe->writers)
                                goto out;

                        if (spliced)
                                goto out;

                        ret = -EAGAIN;
                        if (flags & SPLICE_F_NONBLOCK)
                                goto out;

                        ret = -ERESTARTSYS;
                        if (signal_pending(current))
                                goto out;

                        if (need_wakeup) {
                                wakeup_pipe_writers(pipe);
                                need_wakeup = false;
                        }

                        pipe_wait_readable(pipe);
                }

                head = pipe->head;
                tail = pipe->tail;
                mask = pipe->ring_size - 1;

                /*
                 * Collect segments using a local tail cursor; pipe->tail is
                 * only advanced after the send, by the amount consumed.
                 */
                while (!pipe_empty(head, tail)) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t seg;

                        /* zero-length buffers contribute no segment */
                        if (!buf->len) {
                                tail++;
                                continue;
                        }

                        seg = min_t(size_t, remain, buf->len);

                        ret = pipe_buf_confirm(pipe, buf);
                        if (unlikely(ret)) {
                                if (ret == -ENODATA)
                                        ret = 0;
                                break;
                        }

                        bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
                        remain -= seg;
                        if (remain == 0 || bc >= ARRAY_SIZE(bvec))
                                break;
                        tail++;
                }

                /*
                 * NOTE(review): if every buffer scanned above was zero-length,
                 * ret still holds the -ERESTARTSYS staged at the top of this
                 * iteration even though no signal is pending - confirm this
                 * is the intended return value for that case.
                 */
                if (!bc)
                        break;

                /*
                 * MSG_MORE is set when the caller asked for it, or when the
                 * request is not yet satisfied and the pipe still holds
                 * buffers from the scan cursor onwards.
                 */
                msg.msg_flags = MSG_SPLICE_PAGES;
                if (flags & SPLICE_F_MORE)
                        msg.msg_flags |= MSG_MORE;
                if (remain && pipe_occupancy(pipe->head, tail) > 0)
                        msg.msg_flags |= MSG_MORE;
                if (out->f_flags & O_NONBLOCK)
                        msg.msg_flags |= MSG_DONTWAIT;

                iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
                              len - remain);
                ret = sock_sendmsg(sock, &msg);
                if (ret <= 0)
                        break;

                spliced += ret;
                len -= ret;
                /*
                 * Consume what was sent: trim each buffer from the real
                 * tail and release those that became empty.
                 */
                tail = pipe->tail;
                while (ret > 0) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t seg = min_t(size_t, ret, buf->len);

                        buf->offset += seg;
                        buf->len -= seg;
                        ret -= seg;

                        if (!buf->len) {
                                pipe_buf_release(pipe, buf);
                                tail++;
                        }
                }

                /* publish the new tail and remember to wake writers */
                if (tail != pipe->tail) {
                        pipe->tail = tail;
                        if (pipe->files)
                                need_wakeup = true;
                }
        }

out:
        pipe_unlock(pipe);
        /* the deferred writer wakeup is issued after dropping the lock */
        if (need_wakeup)
                wakeup_pipe_writers(pipe);
        return spliced ?: ret;
}
#endif

/*
 * Emit a rate-limited debug message saying that splice operation @op is not
 * supported for @file, naming the calling task, and return -EINVAL for the
 * caller to propagate.
 */
static int warn_unsupported(struct file *file, const char *op)
{
        pr_debug_ratelimited("splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
                             op, file, current->pid, current->comm);

        return -EINVAL;
}

/*
 * Start a pipe -> file splice by dispatching to the output file's
 * ->splice_write method.  Files without that method get the
 * "not supported" debug message and -EINVAL.
 */
static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out,
                              loff_t *ppos, size_t len, unsigned int flags)
{
        ssize_t ret;

        if (unlikely(!out->f_op->splice_write))
                ret = warn_unsupported(out, "write");
        else
                ret = out->f_op->splice_write(pipe, out, ppos, len, flags);

        return ret;
}

/*
 * Tell the consumer that the source hit a premature EOF and that the caller
 * had not indicated more data would follow.  Nothing happens when the
 * descriptor carries no ->splice_eof handler.
 */
static void do_splice_eof(struct splice_desc *sd)
{
        if (!sd->splice_eof)
                return;

        sd->splice_eof(sd);
}

/*
 * Read from @in into @pipe without verifying the range: callers have
 * already run rw_verify_area() over the entire range, so sub-ranges need
 * no further check.
 *
 * Returns -EBADF if @in is not open for reading, 0 for a zero-length
 * request, -EINVAL (via warn_unsupported()) if @in has no ->splice_read,
 * and otherwise the result of the read method used.
 */
static ssize_t do_splice_read(struct file *in, loff_t *ppos,
                              struct pipe_inode_info *pipe, size_t len,
                              unsigned int flags)
{
        unsigned int free_slots;

        if (unlikely(!(in->f_mode & FMODE_READ)))
                return -EBADF;
        if (!len)
                return 0;

        /* Don't try to read more than the pipe has space for. */
        free_slots = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
        len = min_t(size_t, len, free_slots << PAGE_SHIFT);

        if (unlikely(len > MAX_RW_COUNT))
                len = MAX_RW_COUNT;

        if (unlikely(!in->f_op->splice_read))
                return warn_unsupported(in, "read");

        /* Ordinary files go through their own ->splice_read method. */
        if (!(in->f_flags & O_DIRECT) && !IS_DAX(in->f_mapping->host))
                return in->f_op->splice_read(in, ppos, pipe, len, flags);

        /*
         * O_DIRECT and DAX don't deal with the pagecache, so we allocate a
         * buffer, copy into it and splice that into the pipe.
         */
        return copy_splice_read(in, ppos, pipe, len, flags);
}

/**
 * vfs_splice_read - Read data from a file and splice it into a pipe
 * @in:                File to splice from
 * @ppos:        Input file offset
 * @pipe:        Pipe to splice to
 * @len:        Number of bytes to splice
 * @flags:        Splice modifier flags (SPLICE_F_*)
 *
 * Verify the requested range with rw_verify_area() and then splice that
 * much data from the input file into the pipe.  The operation is
 * synchronous as the caller must hold the pipe lock across the entire
 * operation.
 *
 * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
 * a hole and a negative error code otherwise.
 */
ssize_t vfs_splice_read(struct file *in, loff_t *ppos,
                        struct pipe_inode_info *pipe, size_t len,
                        unsigned int flags)
{
        ssize_t err = rw_verify_area(READ, in, ppos, len);

        if (unlikely(err < 0))
                return err;

        return do_splice_read(in, ppos, pipe, len, flags);
}
EXPORT_SYMBOL_GPL(vfs_splice_read);

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:                file to splice from
 * @sd:                actor information on where to splice to
 * @actor:        handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 *    @sd is modified: SPLICE_F_NONBLOCK is cleared from and SPLICE_F_MORE
 *    is managed in sd->flags, sd->total_len is set to the size of each
 *    chunk read, and sd->pos tracks the input position consumed so far.
 *
 *    Returns the number of bytes transferred if any were, otherwise the
 *    last read or actor status (zero or -errno).  Returns -EINVAL if @in
 *    is not seekable and -ENOMEM if the internal pipe cannot be allocated.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
                               splice_direct_actor *actor)
{
        struct pipe_inode_info *pipe;
        ssize_t ret, bytes;
        size_t len;
        int i, flags, more;

        /*
         * We require the input to be seekable, as we don't want to randomly
         * drop data for eg socket -> socket splicing. Use the piped splicing
         * for that!
         */
        if (unlikely(!(in->f_mode & FMODE_LSEEK)))
                return -EINVAL;

        /*
         * neither in nor out is a pipe, setup an internal pipe attached to
         * 'out' and transfer the wanted data from 'in' to 'out' through that
         */
        pipe = current->splice_pipe;
        if (unlikely(!pipe)) {
                pipe = alloc_pipe_info();
                if (!pipe)
                        return -ENOMEM;

                /*
                 * We don't have an immediate reader, but we'll read the stuff
                 * out of the pipe right after the splice_to_pipe(). So set
                 * PIPE_READERS appropriately.
                 */
                pipe->readers = 1;

                current->splice_pipe = pipe;
        }

        /*
         * Do the splice.
         */
        bytes = 0;
        len = sd->total_len;

        /*
         * Don't block on output, we have to drain the direct pipe.
         * The caller's original flags (including any SPLICE_F_NONBLOCK)
         * are kept in 'flags' and used for the reads only.
         */
        flags = sd->flags;
        sd->flags &= ~SPLICE_F_NONBLOCK;

        /*
         * We signal MORE until we've read sufficient data to fulfill the
         * request and we keep signalling it if the caller set it.
         */
        more = sd->flags & SPLICE_F_MORE;
        sd->flags |= SPLICE_F_MORE;

        /* the cached pipe is expected to be empty on entry */
        WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));

        while (len) {
                size_t read_len;
                /* work on a copy so sd->pos only moves once data is consumed */
                loff_t pos = sd->pos, prev_pos = pos;

                ret = do_splice_read(in, &pos, pipe, len, flags);
                if (unlikely(ret <= 0))
                        goto read_failure;

                read_len = ret;
                sd->total_len = read_len;

                /*
                 * If we now have sufficient data to fulfill the request then
                 * we clear SPLICE_F_MORE if it was not set initially.
                 */
                if (read_len >= len && !more)
                        sd->flags &= ~SPLICE_F_MORE;

                /*
                 * NOTE: nonblocking mode only applies to the input. We
                 * must not do the output in nonblocking mode as then we
                 * could get stuck data in the internal pipe:
                 */
                ret = actor(pipe, sd);
                if (unlikely(ret <= 0)) {
                        /* nothing was consumed: leave the input position alone */
                        sd->pos = prev_pos;
                        goto out_release;
                }

                bytes += ret;
                len -= ret;
                sd->pos = pos;

                /* short write: only advance by what the actor took */
                if (ret < read_len) {
                        sd->pos = prev_pos + ret;
                        goto out_release;
                }
        }

done:
        /* reset the ring indices so the cached pipe starts out empty next time */
        pipe->tail = pipe->head = 0;
        file_accessed(in);
        return bytes;

read_failure:
        /*
         * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
         * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
         * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
         * least 1 byte *then* we will also do the ->splice_eof() call.
         */
        if (ret == 0 && !more && len > 0 && bytes)
                do_splice_eof(sd);
out_release:
        /*
         * If we did an incomplete transfer we must release
         * the pipe buffers in question:
         */
        for (i = 0; i < pipe->ring_size; i++) {
                struct pipe_buffer *buf = &pipe->bufs[i];

                if (buf->ops)
                        pipe_buf_release(pipe, buf);
        }

        /* with no progress at all, report the failing status instead */
        if (!bytes)
                bytes = ret;

        goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);

/*
 * Actor for do_splice_direct(): drain the internal pipe into the output
 * file recorded in @sd, bracketing the write with file_start_write() and
 * file_end_write().
 */
static int direct_splice_actor(struct pipe_inode_info *pipe,
                               struct splice_desc *sd)
{
        struct file *out = sd->u.file;
        long written;

        file_start_write(out);
        written = do_splice_from(pipe, out, sd->opos, sd->total_len, sd->flags);
        file_end_write(out);

        return written;
}

/*
 * Actor for splice_file_range(): drain the internal pipe into the output
 * file recorded in @sd.  Unlike direct_splice_actor() this does not call
 * file_start_write()/file_end_write() itself.
 */
static int splice_file_range_actor(struct pipe_inode_info *pipe,
                                        struct splice_desc *sd)
{
        return do_splice_from(pipe, sd->u.file, sd->opos, sd->total_len,
                              sd->flags);
}

/*
 * ->splice_eof handler installed by do_splice_direct_actor(): forward the
 * EOF notification to the output file's own ->splice_eof method, if it
 * provides one.
 */
static void direct_file_splice_eof(struct splice_desc *sd)
{
        struct file *out = sd->u.file;

        if (!out->f_op->splice_eof)
                return;

        out->f_op->splice_eof(out);
}

/*
 * Common body of do_splice_direct() and splice_file_range(): set up a
 * splice descriptor targeting @out at @opos and run
 * splice_direct_to_actor() with the supplied @actor.
 *
 * Returns -EBADF if @out is not open for writing and -EINVAL if it is
 * opened with O_APPEND.  Otherwise returns the splice result; *@ppos is
 * updated only when that result is positive.
 */
static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos,
                                      struct file *out, loff_t *opos,
                                      size_t len, unsigned int flags,
                                      splice_direct_actor *actor)
{
        struct splice_desc desc = {
                .u.file                = out,
                .opos                = opos,
                .pos                = *ppos,
                .flags                = flags,
                .len                = len,
                .total_len        = len,
                .splice_eof        = direct_file_splice_eof,
        };
        ssize_t copied;

        if (unlikely(!(out->f_mode & FMODE_WRITE)))
                return -EBADF;

        if (unlikely(out->f_flags & O_APPEND))
                return -EINVAL;

        copied = splice_direct_to_actor(in, &desc, actor);
        if (copied <= 0)
                return copied;

        *ppos = desc.pos;
        return copied;
}
/**
 * do_splice_direct - splices data directly between two files
 * @in:                file to splice from
 * @ppos:        input file offset
 * @out:        file to splice to
 * @opos:        output file offset
 * @len:        number of bytes to splice
 * @flags:        splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). An application could emulate sendfile with
 *    splice, but that would cost an extra system call (splice in + splice
 *    out, as compared to just sendfile()). This helper instead splices
 *    directly through a process-private pipe, using direct_splice_actor()
 *    for the output side.
 *
 * Callers already called rw_verify_area() on the entire range.
 */
ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
                         loff_t *opos, size_t len, unsigned int flags)
{
        ssize_t ret;

        ret = do_splice_direct_actor(in, ppos, out, opos, len, flags,
                                     direct_splice_actor);
        return ret;
}
EXPORT_SYMBOL(do_splice_direct);

/**
 * splice_file_range - splices data between two files for copy_file_range()
 * @in:                file to splice from
 * @ppos:        input file offset
 * @out:        file to splice to
 * @opos:        output file offset
 * @len:        number of bytes to splice
 *
 * Description:
 *    For use by ->copy_file_range() methods.
 *    Like do_splice_direct(), but vfs_copy_file_range() already holds
 *    start_file_write() on @out file.  The length is capped at
 *    MAX_RW_COUNT and no splice flags are passed.
 *
 * Callers already called rw_verify_area() on the entire range.
 */
ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
                          loff_t *opos, size_t len)
{
        size_t capped;

        lockdep_assert(file_write_started(out));

        capped = min_t(size_t, len, MAX_RW_COUNT);
        return do_splice_direct_actor(in, ppos, out, opos, capped, 0,
                                      splice_file_range_actor);
}
EXPORT_SYMBOL(splice_file_range);

/*
 * Wait until @pipe has room for at least one more buffer.
 *
 * Returns 0 once the pipe is not full, -EAGAIN if it is full and
 * SPLICE_F_NONBLOCK is set, -ERESTARTSYS if a signal is pending, and
 * -EPIPE (after sending SIGPIPE to the current task) when the pipe has
 * no readers.
 */
static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
{
        while (pipe->readers) {
                if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                        return 0;

                if (flags & SPLICE_F_NONBLOCK)
                        return -EAGAIN;

                if (signal_pending(current))
                        return -ERESTARTSYS;

                pipe_wait_writable(pipe);
        }

        /* Nobody is left to read what we would add. */
        send_sig(SIGPIPE, current, 0);
        return -EPIPE;
}

/*
 * Forward declaration: do_splice() below calls this for the pipe-to-pipe
 * case; the definition is not in this part of the file.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
                               struct pipe_inode_info *opipe,
                               size_t len, unsigned int flags);

/*
 * Splice up to @len bytes from @in at *@offset into @opipe.
 *
 * Takes the pipe lock, waits for room in the pipe (honouring
 * SPLICE_F_NONBLOCK in @flags) and then reads into it.  Readers of the
 * pipe are woken, after the lock is dropped, if anything was added.
 * Returns the number of bytes spliced, or the zero/negative status from
 * the wait or the read.
 */
ssize_t splice_file_to_pipe(struct file *in,
                            struct pipe_inode_info *opipe,
                            loff_t *offset,
                            size_t len, unsigned int flags)
{
        ssize_t nread;

        pipe_lock(opipe);
        nread = wait_for_space(opipe, flags);
        if (nread == 0)
                nread = do_splice_read(in, offset, opipe, len, flags);
        pipe_unlock(opipe);

        if (nread > 0)
                wakeup_pipe_readers(opipe);

        return nread;
}

/*
 * Determine where to splice to/from.
 *
 * Dispatches on which of @in and @out get_pipe_info() reports as a pipe:
 *   - both:       pipe-to-pipe splice; offsets are rejected with -ESPIPE
 *                 and splicing a pipe to itself with -EINVAL.
 *   - only @in:   pipe to file, written at *@off_out or, without
 *                 @off_out, at out->f_pos.
 *   - only @out:  file to pipe, read from *@off_in or, without @off_in,
 *                 from in->f_pos.
 *   - neither:    -EINVAL.
 *
 * @off_in and @off_out are dereferenced directly here.  Returns -EBADF if
 * @in is not readable or @out is not writable; otherwise the result of
 * the selected splice routine or an earlier validation error.  On a
 * positive result fsnotify modify/access events are generated.
 */
ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out,
                  loff_t *off_out, size_t len, unsigned int flags)
{
        struct pipe_inode_info *ipipe;
        struct pipe_inode_info *opipe;
        loff_t offset;
        ssize_t ret;

        if (unlikely(!(in->f_mode & FMODE_READ) ||
                     !(out->f_mode & FMODE_WRITE)))
                return -EBADF;

        ipipe = get_pipe_info(in, true);
        opipe = get_pipe_info(out, true);

        if (ipipe && opipe) {
                /* pipes have no file position */
                if (off_in || off_out)
                        return -ESPIPE;

                /* Splicing to self would be fun, but... */
                if (ipipe == opipe)
                        return -EINVAL;

                /* O_NONBLOCK on either end makes the whole splice non-blocking */
                if ((in->f_flags | out->f_flags) & O_NONBLOCK)
                        flags |= SPLICE_F_NONBLOCK;

                ret = splice_pipe_to_pipe(ipipe, opipe, len, flags);
        } else if (ipipe) {
                /* pipe -> file */
                if (off_in)
                        return -ESPIPE;
                if (off_out) {
                        /* an explicit offset needs a file that allows pwrite */
                        if (!(out->f_mode & FMODE_PWRITE))
                                return -EINVAL;
                        offset = *off_out;
                } else {
                        offset = out->f_pos;
                }

                if (unlikely(out->f_flags & O_APPEND))
                        return -EINVAL;

                ret = rw_verify_area(WRITE, out, &offset, len);
                if (unlikely(ret < 0))
                        return ret;

                if (in->f_flags & O_NONBLOCK)
                        flags |= SPLICE_F_NONBLOCK;

                file_start_write(out);
                ret = do_splice_from(ipipe, out, &offset, len, flags);
                file_end_write(out);

                /* the position is stored back whatever ret turned out to be */
                if (!off_out)
                        out->f_pos = offset;
                else
                        *off_out = offset;
        } else if (opipe) {
                /* file -> pipe */
                if (off_out)
                        return -ESPIPE;
                if (off_in) {
                        /* an explicit offset needs a file that allows pread */
                        if (!(in->f_mode & FMODE_PREAD))
                                return -EINVAL;
                        offset = *off_in;
                } else {
                        offset = in->f_pos;
                }

                ret = rw_verify_area(READ, in, &offset, len);
                if (unlikely(ret < 0))
                        return ret;

                if (out->f_flags & O_NONBLOCK)
                        flags |= SPLICE_F_NONBLOCK;

                ret = splice_file_to_pipe(in, opipe, &offset, len, flags);

                /* the position is stored back whatever ret turned out to be */
                if (!off_in)
                        in->f_pos = offset;
                else
                        *off_in = offset;
        } else {
                /* at least one side must be a pipe */
                ret = -EINVAL;
        }

        if (ret > 0) {
                /*
                 * Generate modify out before access in:
                 * do_splice_from() may've already sent modify out,
                 * and this ensures the events get merged.
                 */
                fsnotify_modify(out);
                fsnotify_access(in);
        }

        return ret;
}

/*
 * User-pointer front end for do_splice(): copy the optional offsets in
 * from user space, run the splice, and copy the updated offsets back out.
 *
 * An offset supplied for a pipe end is rejected with -ESPIPE.  One kernel
 * loff_t backs both offsets; once the -ESPIPE checks have passed, both
 * being supplied means neither end is a pipe, which do_splice() fails
 * with -EINVAL before any offset is copied back.
 *
 * Returns the do_splice() result, or -EFAULT if a user copy fails.
 */
static ssize_t __do_splice(struct file *in, loff_t __user *off_in,
                           struct file *out, loff_t __user *off_out,
                           size_t len, unsigned int flags)
{
        struct pipe_inode_info *src_pipe = get_pipe_info(in, true);
        struct pipe_inode_info *dst_pipe = get_pipe_info(out, true);
        loff_t *kin = NULL, *kout = NULL;
        loff_t pos;
        ssize_t spliced;

        if (src_pipe) {
                if (off_in)
                        return -ESPIPE;
                pipe_clear_nowait(in);
        }
        if (dst_pipe) {
                if (off_out)
                        return -ESPIPE;
                pipe_clear_nowait(out);
        }

        if (off_out) {
                if (copy_from_user(&pos, off_out, sizeof(loff_t)))
                        return -EFAULT;
                kout = &pos;
        }
        if (off_in) {
                if (copy_from_user(&pos, off_in, sizeof(loff_t)))
                        return -EFAULT;
                kin = &pos;
        }

        spliced = do_splice(in, kin, out, kout, len, flags);
        if (spliced < 0)
                return spliced;

        /* Report the advanced position(s) back to user space. */
        if (kout && copy_to_user(off_out, kout, sizeof(loff_t)))
                return -EFAULT;
        if (kin && copy_to_user(off_in, kin, sizeof(loff_t)))
                return -EFAULT;

        return spliced;
}

/*
 * Add the pages backing the user iterator @from to @pipe as pipe buffers
 * using user_page_pipe_buf_ops; @flags becomes each buffer's flags.
 *
 * Pages are fetched from the iterator in batches of up to 16.  Returns
 * the total number of bytes added if non-zero, otherwise the last status
 * from iov_iter_get_pages2() or add_to_pipe().
 */
static ssize_t iter_to_pipe(struct iov_iter *from,
                            struct pipe_inode_info *pipe,
                            unsigned int flags)
{
        struct pipe_buffer buf = {
                .ops = &user_page_pipe_buf_ops,
                .flags = flags
        };
        size_t total = 0;
        ssize_t ret = 0;

        while (iov_iter_count(from)) {
                struct page *pages[16];
                ssize_t left;
                size_t start;
                int i, n;

                /* left = bytes covered by this batch, start = offset into pages[0] */
                left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
                if (left <= 0) {
                        ret = left;
                        break;
                }

                /* number of pages spanned by the batch */
                n = DIV_ROUND_UP(left + start, PAGE_SIZE);
                for (i = 0; i < n; i++) {
                        int size = min_t(int, left, PAGE_SIZE - start);

                        buf.page = pages[i];
                        buf.offset = start;
                        buf.len = size;
                        ret = add_to_pipe(pipe, &buf);
                        if (unlikely(ret < 0)) {
                                /* rewind the iterator by the bytes not added */
                                iov_iter_revert(from, left);
                                // this one got dropped by add_to_pipe()
                                /* drop the remaining pages of the batch ourselves */
                                while (++i < n)
                                        put_page(pages[i]);
                                goto out;
                        }
                        total += ret;
                        left -= size;
                        /* only the first page of a batch starts mid-page */
                        start = 0;
                }
        }
out:
        return total ? total : ret;
}

/*
 * Splice actor for vmsplice_to_user(): copy sd->len bytes of the pipe
 * buffer's page into the destination iterator held in sd->u.data.
 * Returns the byte count on a complete copy and -EFAULT on a short one.
 */
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
                        struct splice_desc *sd)
{
        int copied = copy_page_to_iter(buf->page, buf->offset, sd->len,
                                       sd->u.data);

        if (copied != sd->len)
                return -EFAULT;

        return copied;
}

/*
 * vmsplice() from a pipe to user memory.  For lack of a better
 * implementation this is a plain copy of the pipe's pages into the user
 * iov described by @iter.
 *
 * Returns -EBADF if @file is not a pipe, 0 for an empty iterator, and
 * otherwise the __splice_from_pipe() result.  An fsnotify access event is
 * generated when data was transferred.
 */
static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
                                unsigned int flags)
{
        struct pipe_inode_info *pipe = get_pipe_info(file, true);
        struct splice_desc sd = {
                .u.data = iter,
                .flags = flags,
                .total_len = iov_iter_count(iter)
        };
        ssize_t copied;

        if (!pipe)
                return -EBADF;

        pipe_clear_nowait(file);

        /* Nothing requested: skip the pipe lock altogether. */
        if (!sd.total_len)
                return 0;

        pipe_lock(pipe);
        copied = __splice_from_pipe(pipe, &sd, pipe_to_user);
        pipe_unlock(pipe);

        if (copied > 0)
                fsnotify_access(file);

        return copied;
}

/*
 * vmsplice() from user memory into a pipe.
 *
 * This is "splice-from-memory": where regular splice moves data between a
 * file and a pipe, this maps a user address range into the pipe. Either
 * way, the destination here is a pipe.
 *
 * Returns the iter_to_pipe() result, a non-zero wait_for_space() result,
 * or -EBADF when @file is not a pipe.
 */
static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
                                unsigned int flags)
{
        unsigned int buf_flag = (flags & SPLICE_F_GIFT) ? PIPE_BUF_FLAG_GIFT : 0;
        struct pipe_inode_info *pipe = get_pipe_info(file, true);
        ssize_t spliced;

        if (!pipe)
                return -EBADF;

        pipe_clear_nowait(file);

        pipe_lock(pipe);
        spliced = wait_for_space(pipe, flags);
        if (spliced == 0)
                spliced = iter_to_pipe(iter, pipe, buf_flag);
        pipe_unlock(pipe);

        if (spliced <= 0)
                return spliced;

        /* Data was queued: wake readers and report the modification. */
        wakeup_pipe_readers(pipe);
        fsnotify_modify(file);
        return spliced;
}

/*
 * Work out the iov_iter direction for a vmsplice() on @f.
 *
 * A file opened for writing yields ITER_SOURCE, otherwise a file opened for
 * reading yields ITER_DEST; the result is stored in *@type and 0 is returned.
 *
 * Returns -EBADF when @f holds no file, or when the file is neither readable
 * nor writable. In the latter case @f is released with fdput() here, so the
 * caller must not put it again after a failure.
 */
static int vmsplice_type(struct fd f, int *type)
{
        if (!f.file)
                return -EBADF;

        if (f.file->f_mode & FMODE_WRITE) {
                *type = ITER_SOURCE;
                return 0;
        }

        if (f.file->f_mode & FMODE_READ) {
                *type = ITER_DEST;
                return 0;
        }

        fdput(f);
        return -EBADF;
}

/*
 * vmsplice only really supports true splicing _from_ user memory to a pipe,
 * not the other way around. Splicing from user memory is simple and needs
 * no funky alignment restrictions or nasty vm tricks: the user memory is
 * mapped in and the pages are filled into the pipe. The reverse direction
 * is harder; the two possible solutions are:
 *
 *        - memcpy() the data internally, at which point we might as well just
 *          do a regular read() on the buffer anyway.
 *        - Lots of nasty vm tricks, that are neither fast nor flexible (they
 *          impose restrictions on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 * Returns the result of the chosen direction's helper, 0 for an empty iovec,
 * -EINVAL for unknown flags, or the error from vmsplice_type()/import_iovec().
 */
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
                unsigned long, nr_segs, unsigned int, flags)
{
        struct iovec fast_iov[UIO_FASTIOV];
        struct iovec *iov = fast_iov;
        struct iov_iter iter;
        struct fd f;
        ssize_t ret;
        int dir;

        if (unlikely(flags & ~SPLICE_F_ALL))
                return -EINVAL;

        f = fdget(fd);
        /* On failure vmsplice_type() has already dealt with @f. */
        ret = vmsplice_type(f, &dir);
        if (ret)
                return ret;

        ret = import_iovec(dir, uiov, nr_segs,
                           ARRAY_SIZE(fast_iov), &iov, &iter);
        if (ret >= 0) {
                if (!iov_iter_count(&iter))
                        ret = 0;
                else if (dir == ITER_SOURCE)
                        ret = vmsplice_to_pipe(f.file, &iter, flags);
                else
                        ret = vmsplice_to_user(f.file, &iter, flags);

                /*
                 * NOTE(review): relies on import_iovec() leaving iov either
                 * NULL or pointing at an allocation - confirm against its
                 * documentation.
                 */
                kfree(iov);
        }

        fdput(f);
        return ret;
}

/*
 * splice(2) entry point: validate the arguments, resolve both descriptors
 * and hand the work to __do_splice().
 *
 * Returns 0 for a zero @len, -EINVAL for unknown flags, -EBADF when either
 * descriptor does not resolve to a file, otherwise the __do_splice() result.
 */
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
                int, fd_out, loff_t __user *, off_out,
                size_t, len, unsigned int, flags)
{
        struct fd src, dst;
        ssize_t ret = -EBADF;

        if (unlikely(!len))
                return 0;

        if (unlikely(flags & ~SPLICE_F_ALL))
                return -EINVAL;

        src = fdget(fd_in);
        if (!src.file)
                return ret;

        dst = fdget(fd_out);
        if (dst.file) {
                ret = __do_splice(src.file, off_in, dst.file, off_out,
                                  len, flags);
                fdput(dst);
        }

        fdput(src);
        return ret;
}

/*
 * Make sure @pipe has data to read, waiting for input when allowed.
 *
 * Returns 0 when there is data, or when the pipe is empty but has no
 * writers left; -ERESTARTSYS when a signal is pending; -EAGAIN when the
 * pipe is empty and SPLICE_F_NONBLOCK is set in @flags.
 */
static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
        int err = 0;

        /*
         * Unlocked peek at the occupancy first. The whole check is only
         * speculative, so a missed update here is harmless.
         */
        if (!pipe_empty(pipe->head, pipe->tail))
                return 0;

        pipe_lock(pipe);

        for (;;) {
                if (!pipe_empty(pipe->head, pipe->tail))
                        break;

                if (signal_pending(current)) {
                        err = -ERESTARTSYS;
                        break;
                }

                /* Empty with no writers left: not an error, report 0. */
                if (!pipe->writers)
                        break;

                if (flags & SPLICE_F_NONBLOCK) {
                        err = -EAGAIN;
                        break;
                }

                pipe_wait_readable(pipe);
        }

        pipe_unlock(pipe);
        return err;
}

/*
 * Make sure @pipe has room to write into, waiting for room when allowed.
 *
 * Returns 0 when there is room; -EPIPE (after sending SIGPIPE to the
 * current task) when the pipe is full and has no readers; -EAGAIN when it
 * is full and SPLICE_F_NONBLOCK is set in @flags; -ERESTARTSYS when a
 * signal is pending.
 */
static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
        int err = 0;

        /*
         * Unlocked peek at the occupancy first. The whole check is only
         * speculative, so a missed update here is harmless.
         */
        if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                return 0;

        pipe_lock(pipe);

        for (;;) {
                if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                        break;

                if (!pipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        err = -EPIPE;
                        break;
                }

                if (flags & SPLICE_F_NONBLOCK) {
                        err = -EAGAIN;
                        break;
                }

                if (signal_pending(current)) {
                        err = -ERESTARTSYS;
                        break;
                }

                pipe_wait_writable(pipe);
        }

        pipe_unlock(pipe);
        return err;
}

/*
 * Splice contents of ipipe to opipe.
 *
 * Moves up to @len bytes from the tail of @ipipe to the head of @opipe.
 * A buffer that fits entirely within the remaining @len is moved as a
 * whole (its slot in @ipipe is consumed); otherwise the output gets a
 * second reference to the same buffer covering only the first @len bytes
 * and the input buffer is shrunk in place.
 *
 * Returns the number of bytes moved when that is non-zero. When nothing
 * was moved, returns: a non-zero ipipe_prep()/opipe_prep() result, -EPIPE
 * when @opipe has no readers (SIGPIPE is sent in that case regardless of
 * progress), -EAGAIN when no progress is possible and SPLICE_F_NONBLOCK is
 * set, -EFAULT when pipe_buf_get() fails, or 0 when @ipipe is empty and
 * has no writers.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
                               struct pipe_inode_info *opipe,
                               size_t len, unsigned int flags)
{
        struct pipe_buffer *ibuf, *obuf;
        unsigned int i_head, o_head;
        unsigned int i_tail, o_tail;
        unsigned int i_mask, o_mask;
        int ret = 0;
        bool input_wakeup = false;


retry:
        /*
         * Each prep helper takes and releases its own pipe lock, so both
         * run before the double lock below is acquired.
         */
        ret = ipipe_prep(ipipe, flags);
        if (ret)
                return ret;

        ret = opipe_prep(opipe, flags);
        if (ret)
                return ret;

        /*
         * Potential ABBA deadlock, work around it by ordering lock
         * grabbing by pipe info address. Otherwise two different processes
         * could deadlock (one doing tee from A -> B, the other from B -> A).
         */
        pipe_double_lock(ipipe, opipe);

        /*
         * Local copies of the positions this function advances (input tail,
         * output head) and the masks used to index the bufs[] rings.
         */
        i_tail = ipipe->tail;
        i_mask = ipipe->ring_size - 1;
        o_head = opipe->head;
        o_mask = opipe->ring_size - 1;

        do {
                size_t o_len;

                if (!opipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        /* Keep a byte count already accumulated in ret. */
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }

                /* Re-read the positions this function does not advance. */
                i_head = ipipe->head;
                o_tail = opipe->tail;

                /* Input drained and nobody left to refill it: stop here. */
                if (pipe_empty(i_head, i_tail) && !ipipe->writers)
                        break;

                /*
                 * Cannot make any progress, because either the input
                 * pipe is empty or the output pipe is full.
                 */
                if (pipe_empty(i_head, i_tail) ||
                    pipe_full(o_head, o_tail, opipe->max_usage)) {
                        /* Already processed some buffers, break */
                        if (ret)
                                break;

                        if (flags & SPLICE_F_NONBLOCK) {
                                ret = -EAGAIN;
                                break;
                        }

                        /*
                         * We raced with another reader/writer and haven't
                         * managed to process any buffers.  A zero return
                         * value means EOF, so retry instead.
                         */
                        pipe_unlock(ipipe);
                        pipe_unlock(opipe);
                        goto retry;
                }

                ibuf = &ipipe->bufs[i_tail & i_mask];
                obuf = &opipe->bufs[o_head & o_mask];

                if (len >= ibuf->len) {
                        /*
                         * Simply move the whole buffer from ipipe to opipe
                         */
                        *obuf = *ibuf;
                        /*
                         * NOTE(review): clearing ops presumably marks the
                         * input slot as no longer owning the buffer now that
                         * obuf holds it - confirm against pipe_buffer users.
                         */
                        ibuf->ops = NULL;
                        i_tail++;
                        ipipe->tail = i_tail;
                        /* A slot was freed in ipipe; wake its writers later. */
                        input_wakeup = true;
                        o_len = obuf->len;
                        o_head++;
                        opipe->head = o_head;
                } else {
                        /*
                         * Get a reference to this pipe buffer,
                         * so we can copy the contents over.
                         */
                        if (!pipe_buf_get(ipipe, ibuf)) {
                                if (ret == 0)
                                        ret = -EFAULT;
                                break;
                        }
                        *obuf = *ibuf;

                        /*
                         * Don't inherit the gift and merge flags, we need to
                         * prevent multiple steals of this page.
                         */
                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
                        obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

                        /*
                         * Output covers the first len bytes; the input
                         * buffer stays queued with those bytes trimmed off
                         * its front (ipipe->tail is not advanced).
                         */
                        obuf->len = len;
                        ibuf->offset += len;
                        ibuf->len -= len;
                        o_len = len;
                        o_head++;
                        opipe->head = o_head;
                }
                ret += o_len;
                len -= o_len;
        } while (len);

        pipe_unlock(ipipe);
        pipe_unlock(opipe);

        /*
         * If we put data in the output pipe, wakeup any potential readers.
         */
        if (ret > 0)
                wakeup_pipe_readers(opipe);

        /* At least one whole buffer left ipipe, so there is room to write. */
        if (input_wakeup)
                wakeup_pipe_writers(ipipe);

        return ret;
}

/*
 * Link contents of ipipe to opipe.
 *
 * Walks the buffers of @ipipe from its tail and places a copy of each
 * buffer descriptor, with an extra reference taken via pipe_buf_get(), at
 * the head of @opipe, until @len bytes are linked, the input is exhausted
 * or the output is full. ipipe->tail is never written here, so @ipipe
 * keeps all of its buffers. @flags is not used in this function.
 *
 * Returns the number of bytes linked when that is non-zero. When nothing
 * was linked, returns -EPIPE when @opipe has no readers (SIGPIPE is sent
 * in that case regardless of progress), -EFAULT when pipe_buf_get() fails,
 * or 0.
 */
static ssize_t link_pipe(struct pipe_inode_info *ipipe,
                         struct pipe_inode_info *opipe,
                         size_t len, unsigned int flags)
{
        struct pipe_buffer *ibuf, *obuf;
        unsigned int i_head, o_head;
        unsigned int i_tail, o_tail;
        unsigned int i_mask, o_mask;
        ssize_t ret = 0;

        /*
         * Potential ABBA deadlock, work around it by ordering lock
         * grabbing by pipe info address. Otherwise two different processes
         * could deadlock (one doing tee from A -> B, the other from B -> A).
         */
        pipe_double_lock(ipipe, opipe);

        /*
         * Local cursors: i_tail walks the input ring without consuming it,
         * o_head tracks where the next output buffer goes.
         */
        i_tail = ipipe->tail;
        i_mask = ipipe->ring_size - 1;
        o_head = opipe->head;
        o_mask = opipe->ring_size - 1;

        do {
                if (!opipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        /* Keep a byte count already accumulated in ret. */
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }

                i_head = ipipe->head;
                o_tail = opipe->tail;

                /*
                 * If we have iterated all input buffers or run out of
                 * output room, break.
                 */
                if (pipe_empty(i_head, i_tail) ||
                    pipe_full(o_head, o_tail, opipe->max_usage))
                        break;

                ibuf = &ipipe->bufs[i_tail & i_mask];
                obuf = &opipe->bufs[o_head & o_mask];

                /*
                 * Get a reference to this pipe buffer,
                 * so we can copy the contents over.
                 */
                if (!pipe_buf_get(ipipe, ibuf)) {
                        if (ret == 0)
                                ret = -EFAULT;
                        break;
                }

                *obuf = *ibuf;

                /*
                 * Don't inherit the gift and merge flag, we need to prevent
                 * multiple steals of this page.
                 */
                obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
                obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

                /* Clamp the last linked buffer to the bytes still wanted. */
                if (obuf->len > len)
                        obuf->len = len;
                ret += obuf->len;
                len -= obuf->len;

                o_head++;
                opipe->head = o_head;
                /* Advance the local cursor only; ipipe->tail is untouched. */
                i_tail++;
        } while (len);

        pipe_unlock(ipipe);
        pipe_unlock(opipe);

        /*
         * If we put data in the output pipe, wakeup any potential readers.
         */
        if (ret > 0)
                wakeup_pipe_readers(opipe);

        return ret;
}

/*
 * tee(1) for pipes: no data is copied, the pages queued on the 'in' pipe
 * are simply referenced from the 'out' pipe as well. 'flags' takes the
 * SPLICE_F_* values; SPLICE_F_NONBLOCK is currently the only one that
 * applies, and it is forced on when either file has O_NONBLOCK set.
 *
 * Returns the link_pipe() result, a non-zero ipipe_prep()/opipe_prep()
 * result, -EBADF when 'in' is not readable or 'out' is not writable, or
 * -EINVAL when either file is not a pipe or both refer to the same pipe.
 */
ssize_t do_tee(struct file *in, struct file *out, size_t len,
               unsigned int flags)
{
        struct pipe_inode_info *ipipe = get_pipe_info(in, true);
        struct pipe_inode_info *opipe = get_pipe_info(out, true);
        ssize_t ret;

        if (unlikely(!(in->f_mode & FMODE_READ) ||
                     !(out->f_mode & FMODE_WRITE)))
                return -EBADF;

        /* Both ends must be pipes, and two distinct ones. */
        if (!ipipe || !opipe || ipipe == opipe)
                return -EINVAL;

        if ((in->f_flags | out->f_flags) & O_NONBLOCK)
                flags |= SPLICE_F_NONBLOCK;

        /*
         * Each step runs only if the previous one returned 0. The order of
         * the two prep calls doesn't really matter.
         */
        ret = ipipe_prep(ipipe, flags);
        if (!ret)
                ret = opipe_prep(opipe, flags);
        if (!ret)
                ret = link_pipe(ipipe, opipe, len, flags);

        if (ret > 0) {
                fsnotify_access(in);
                fsnotify_modify(out);
        }

        return ret;
}

/*
 * tee(2) entry point: validate the arguments, resolve both descriptors and
 * hand the work to do_tee().
 *
 * Returns -EINVAL for unknown flags, 0 for a zero @len, -EBADF when either
 * descriptor does not resolve to a file, otherwise the do_tee() result.
 */
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
        struct fd src, dst;
        ssize_t ret = -EBADF;

        if (unlikely(flags & ~SPLICE_F_ALL))
                return -EINVAL;

        if (unlikely(!len))
                return 0;

        src = fdget(fdin);
        if (!src.file)
                return ret;

        dst = fdget(fdout);
        if (dst.file) {
                ret = do_tee(src.file, dst.file, len, flags);
                fdput(dst);
        }

        fdput(src);
        return ret;
}





































































































































































































































































    1 







    1 





    1 


























    1 




    1 

    1 

    1 

    1 











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 



































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






























    1 


























    1 
    1 





    1 





























































    1 


















    1 





    1 













    1 
























    1 








    1 




    1 





    1 

















    1 







    1 

















    1 

















    1 









    1 

    1 






    1 








    1 



    1 
    1 

















    1 











































    1 


    1 










    1 
















    1 




















    1 



    1 





































    1 







    1 












    1 


    1 


    1 










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *        git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load of data that
 * is shared between the kernel and application. This is done both for
 * ordering purposes and to ensure that once a value is loaded from data
 * that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <asm/shmparam.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io-wq.h"

#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
#include "register.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
#include "rsrc.h"
#include "cancel.h"
#include "net.h"
#include "notif.h"
#include "waitid.h"
#include "futex.h"
#include "napi.h"
#include "uring_cmd.h"
#include "memmap.h"

#include "timeout.h"
#include "poll.h"
#include "rw.h"
#include "alloc_cache.h"

/*
 * NOTE(review): presumably the upper bounds on SQ and CQ ring sizes; the
 * code that enforces them is not in this part of the file.
 */
#define IORING_MAX_ENTRIES        32768
#define IORING_MAX_CQ_ENTRIES        (2 * IORING_MAX_ENTRIES)

/* SQE flag subset that SQE_VALID_FLAGS extends with the remaining bits */
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
                          IOSQE_IO_HARDLINK | IOSQE_ASYNC)

#define SQE_VALID_FLAGS        (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
                        IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)

/* request flags that io_clean_op() acts on and then clears */
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
                                REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
                                REQ_F_ASYNC_DATA)

/* IO_REQ_CLEAN_FLAGS plus the refcount and link flags */
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
                                 IO_REQ_CLEAN_FLAGS)

/* level that io_task_refs_refill() tops tctx->cached_refs back up to */
#define IO_TCTX_REFS_CACHE_NR        (1U << 10)

/* NOTE(review): batch sizes; their users are not in this part of the file */
#define IO_COMPL_BATCH                        32
#define IO_REQ_ALLOC_BATCH                8

/*
 * One entry on ctx->defer_list, walked by io_queue_deferred():
 *   list - linkage into ctx->defer_list
 *   req  - the request that io_queue_deferred() hands to io_req_task_queue()
 *   seq  - sequence value passed to req_need_defer() for this request
 */
struct io_defer_entry {
        struct list_head        list;
        struct io_kiocb                *req;
        u32                        seq;
};

/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
/* either of the two link flags */
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)

/*
 * No waiters. It's larger than any valid value of the tw counter
 * so that tests against ->cq_wait_nr would fail and skip wake_up().
 * This is the value ->cq_wait_nr is initialised to in io_ring_ctx_alloc().
 */
#define IO_CQ_WAKE_INIT                (-1U)
/*
 * Forced wake up if there is a waiter regardless of ->cq_wait_nr.
 * Defined as IO_CQ_WAKE_INIT shifted right by one bit.
 */
#define IO_CQ_WAKE_FORCE        (IO_CQ_WAKE_INIT >> 1)

/* forward declarations for functions defined later in this file */
static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         struct task_struct *task,
                                         bool cancel_all);

static void io_queue_sqe(struct io_kiocb *req);

/* NOTE(review): presumably the slab cache for struct io_kiocb - confirm */
struct kmem_cache *req_cachep;
/* workqueue private to io_uring; read-only once init has finished */
static struct workqueue_struct *iou_wq __ro_after_init;

/* backing storage for the io_uring_disabled / io_uring_group sysctls */
static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = -1;

#ifdef CONFIG_SYSCTL
/*
 * sysctl entries exported by io_uring:
 *   io_uring_disabled - integer, restricted to the range [0, 2] by
 *                       proc_dointvec_minmax
 *   io_uring_group    - integer with no range restriction (the backing
 *                       variable starts out as -1)
 */
static struct ctl_table kernel_io_uring_disabled_table[] = {
        {
                .procname        = "io_uring_disabled",
                .data                = &sysctl_io_uring_disabled,
                .maxlen                = sizeof(sysctl_io_uring_disabled),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
        {
                .procname        = "io_uring_group",
                .data                = &sysctl_io_uring_group,
                .maxlen                = sizeof(gid_t),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
};
#endif

/*
 * Distance between the kernel's cached CQ tail and the CQ head currently
 * visible in the shared ring.
 */
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
        unsigned int cq_head = READ_ONCE(ctx->rings->cq.head);

        return ctx->cached_cq_tail - cq_head;
}

static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
{
        return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
}

static bool io_match_linked(struct io_kiocb *head)
{
        struct io_kiocb *req;

        io_for_each_link(req, head) {
                if (req->flags & REQ_F_INFLIGHT)
                        return true;
        }
        return false;
}

/*
 * As io_match_task() but protected against racing with linked timeouts.
 * User must not hold timeout_lock.
 *
 * Returns false if @task is given and does not own @head, true if
 * @cancel_all is set, and otherwise whether any request in the link chain
 * is tracked as inflight.
 */
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
                        bool cancel_all)
{
        struct io_ring_ctx *ctx;
        bool matched;

        if (task && head->task != task)
                return false;
        if (cancel_all)
                return true;

        /* no linked timeout armed, the chain can be walked without the lock */
        if (!(head->flags & REQ_F_LINK_TIMEOUT))
                return io_match_linked(head);

        /* protect against races with linked timeouts */
        ctx = head->ctx;
        spin_lock_irq(&ctx->timeout_lock);
        matched = io_match_linked(head);
        spin_unlock_irq(&ctx->timeout_lock);

        return matched;
}

/*
 * Flag @req as failed via req_set_fail() and record @res as its result
 * through io_req_set_res(), passing 0 as the last argument.
 */
static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
        req_set_fail(req);
        io_req_set_res(req, res, 0);
}

/* Push @req onto the head of the submit state's free list of @ctx. */
static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
        wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}

/*
 * Release callback for ctx->refs (registered through percpu_ref_init() in
 * io_ring_ctx_alloc()): signals ctx->ref_comp.
 */
static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
        struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

        complete(&ctx->ref_comp);
}

/*
 * Work handler for ctx->fallback_work: runs every task_work item that was
 * queued on ctx->fallback_llist.
 *
 * The whole list is detached in one go, then each request's
 * io_task_work.func is invoked with ->uring_lock held, and pending
 * completions are flushed before the lock is dropped. A reference on
 * ctx->refs is held for the duration.
 */
static __cold void io_fallback_req_func(struct work_struct *work)
{
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
                                                fallback_work.work);
        struct llist_node *node = llist_del_all(&ctx->fallback_llist);
        struct io_kiocb *req, *tmp;
        struct io_tw_state ts = {};

        percpu_ref_get(&ctx->refs);
        mutex_lock(&ctx->uring_lock);
        /* _safe iteration: the next node is fetched before the callback runs */
        llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
                req->io_task_work.func(req, &ts);
        io_submit_flush_completions(ctx);
        mutex_unlock(&ctx->uring_lock);
        percpu_ref_put(&ctx->refs);
}

/*
 * Allocate and initialise the bucket array of @table with 2^@bits buckets.
 *
 * Returns 0 on success or -ENOMEM if the array could not be allocated.
 */
static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
{
        unsigned nr_buckets = 1U << bits;

        table->hbs = kmalloc(nr_buckets * sizeof(table->hbs[0]), GFP_KERNEL);
        if (!table->hbs)
                return -ENOMEM;

        table->hash_bits = bits;
        init_hash_table(table, nr_buckets);

        return 0;
}

static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
        struct io_ring_ctx *ctx;
        int hash_bits;
        bool ret;

        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return NULL;

        xa_init(&ctx->io_bl_xa);

        /*
         * Use 5 bits less than the max cq entries, that should give us around
         * 32 entries per hash list if totally full and uniformly spread, but
         * don't keep too many buckets to not overconsume memory.
         */
        hash_bits = ilog2(p->cq_entries) - 5;
        hash_bits = clamp(hash_bits, 1, 8);
        if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
                goto err;
        if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
                goto err;
        if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
                            0, GFP_KERNEL))
                goto err;

        ctx->flags = p->flags;
        atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
        init_waitqueue_head(&ctx->sqo_sq_wait);
        INIT_LIST_HEAD(&ctx->sqd_list);
        INIT_LIST_HEAD(&ctx->cq_overflow_list);
        INIT_LIST_HEAD(&ctx->io_buffers_cache);
        ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
                            sizeof(struct io_rsrc_node));
        ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
                            sizeof(struct async_poll));
        ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
                            sizeof(struct io_async_msghdr));
        ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
                            sizeof(struct io_async_rw));
        ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
                            sizeof(struct uring_cache));
        ret |= io_futex_cache_init(ctx);
        if (ret)
                goto err;
        init_completion(&ctx->ref_comp);
        xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
        mutex_init(&ctx->uring_lock);
        init_waitqueue_head(&ctx->cq_wait);
        init_waitqueue_head(&ctx->poll_wq);
        init_waitqueue_head(&ctx->rsrc_quiesce_wq);
        spin_lock_init(&ctx->completion_lock);
        spin_lock_init(&ctx->timeout_lock);
        INIT_WQ_LIST(&ctx->iopoll_list);
        INIT_LIST_HEAD(&ctx->io_buffers_comp);
        INIT_LIST_HEAD(&ctx->defer_list);
        INIT_LIST_HEAD(&ctx->timeout_list);
        INIT_LIST_HEAD(&ctx->ltimeout_list);
        INIT_LIST_HEAD(&ctx->rsrc_ref_list);
        init_llist_head(&ctx->work_llist);
        INIT_LIST_HEAD(&ctx->tctx_list);
        ctx->submit_state.free_list.next = NULL;
        INIT_HLIST_HEAD(&ctx->waitid_list);
#ifdef CONFIG_FUTEX
        INIT_HLIST_HEAD(&ctx->futex_list);
#endif
        INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
        INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
        INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
        io_napi_init(ctx);

        return ctx;
err:
        io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
        io_alloc_cache_free(&ctx->apoll_cache, kfree);
        io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
        io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
        io_alloc_cache_free(&ctx->uring_cache, kfree);
        io_futex_cache_free(ctx);
        kfree(ctx->cancel_table.hbs);
        kfree(ctx->cancel_table_locked.hbs);
        xa_destroy(&ctx->io_bl_xa);
        kfree(ctx);
        return NULL;
}

/*
 * Account one dropped completion: bump the cq_overflow counter in the
 * shared ring and decrement ctx->cq_extra.
 */
static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
        struct io_rings *r = ctx->rings;

        /* the counter lives in memory shared with the application */
        WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
        ctx->cq_extra--;
}

/*
 * Decide whether a request must stay deferred. Only requests carrying
 * REQ_F_IO_DRAIN can be deferred; for those, the request has to wait while
 * @seq plus ctx->cq_extra differs from the cached CQ tail.
 */
static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
        struct io_ring_ctx *ctx;

        if (likely(!(req->flags & REQ_F_IO_DRAIN)))
                return false;

        ctx = req->ctx;
        return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
}

static void io_clean_op(struct io_kiocb *req)
{
        if (req->flags & REQ_F_BUFFER_SELECTED) {
                spin_lock(&req->ctx->completion_lock);
                io_kbuf_drop(req);
                spin_unlock(&req->ctx->completion_lock);
        }

        if (req->flags & REQ_F_NEED_CLEANUP) {
                const struct io_cold_def *def = &io_cold_defs[req->opcode];

                if (def->cleanup)
                        def->cleanup(req);
        }
        if ((req->flags & REQ_F_POLLED) && req->apoll) {
                kfree(req->apoll->double_poll);
                kfree(req->apoll);
                req->apoll = NULL;
        }
        if (req->flags & REQ_F_INFLIGHT) {
                struct io_uring_task *tctx = req->task->io_uring;

                atomic_dec(&tctx->inflight_tracked);
        }
        if (req->flags & REQ_F_CREDS)
                put_cred(req->creds);
        if (req->flags & REQ_F_ASYNC_DATA) {
                kfree(req->async_data);
                req->async_data = NULL;
        }
        req->flags &= ~IO_REQ_CLEAN_FLAGS;
}

/*
 * Mark @req as inflight-tracked and bump the owning task's
 * inflight_tracked counter. Does nothing if the request is already marked.
 */
static inline void io_req_track_inflight(struct io_kiocb *req)
{
        if (req->flags & REQ_F_INFLIGHT)
                return;

        req->flags |= REQ_F_INFLIGHT;
        atomic_inc(&req->task->io_uring->inflight_tracked);
}

/*
 * Switch @req from "linked timeout needs arming" (REQ_F_ARM_LTIMEOUT) to
 * "linked timeout set up" (REQ_F_LINK_TIMEOUT) and set up the reference
 * counts of @req and its link.
 *
 * Returns req->link, or NULL (with a one-time warning) if @req has no link.
 */
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
        if (WARN_ON_ONCE(!req->link))
                return NULL;

        req->flags = (req->flags & ~REQ_F_ARM_LTIMEOUT) | REQ_F_LINK_TIMEOUT;

        /* linked timeouts should have two refs once prep'ed */
        io_req_set_refcount(req);
        __io_req_set_refcount(req->link, 2);

        return req->link;
}

/*
 * Fast-path wrapper: only requests flagged REQ_F_ARM_LTIMEOUT go through
 * __io_prep_linked_timeout(); everything else yields NULL.
 */
static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
        if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
                return __io_prep_linked_timeout(req);

        return NULL;
}

/*
 * Slow path of io_arm_ltimeout(): prepare the linked timeout of @req and
 * pass the result straight to io_queue_linked_timeout().
 */
static noinline void __io_arm_ltimeout(struct io_kiocb *req)
{
        io_queue_linked_timeout(__io_prep_linked_timeout(req));
}

/* Arm the linked timeout of @req if REQ_F_ARM_LTIMEOUT is set on it. */
static inline void io_arm_ltimeout(struct io_kiocb *req)
{
        if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
                __io_arm_ltimeout(req);
}

/*
 * Initialise req->work (and a few request flags) before the request is
 * handed to io-wq.
 */
static void io_prep_async_work(struct io_kiocb *req)
{
        const struct io_issue_def *def = &io_issue_defs[req->opcode];
        struct io_ring_ctx *ctx = req->ctx;

        /* requests without credentials pick up those of the current task */
        if (!(req->flags & REQ_F_CREDS)) {
                req->flags |= REQ_F_CREDS;
                req->creds = get_current_cred();
        }

        req->work.list.next = NULL;
        req->work.flags = 0;
        if (req->flags & REQ_F_FORCE_ASYNC)
                req->work.flags |= IO_WQ_WORK_CONCURRENT;

        /* for non-fixed files, merge in the flags from io_file_get_flags() */
        if (req->file && !(req->flags & REQ_F_FIXED_FILE))
                req->flags |= io_file_get_flags(req->file);

        if (req->file && (req->flags & REQ_F_ISREG)) {
                bool should_hash = def->hash_reg_file;

                /* don't serialize this request if the fs doesn't need it */
                if (should_hash && (req->file->f_flags & O_DIRECT) &&
                    (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
                        should_hash = false;
                /* hash the work on the file's inode; IOPOLL rings always do */
                if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
                        io_wq_hash_work(&req->work, file_inode(req->file));
        } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
                /* no file, or a file that is not a block device */
                if (def->unbound_nonreg_file)
                        req->work.flags |= IO_WQ_WORK_UNBOUND;
        }
}

static void io_prep_async_link(struct io_kiocb *req)
{
        struct io_kiocb *cur;

        if (req->flags & REQ_F_LINK_TIMEOUT) {
                struct io_ring_ctx *ctx = req->ctx;

                spin_lock_irq(&ctx->timeout_lock);
                io_for_each_link(cur, req)
                        io_prep_async_work(cur);
                spin_unlock_irq(&ctx->timeout_lock);
        } else {
                io_for_each_link(cur, req)
                        io_prep_async_work(cur);
        }
}

/*
 * Punt @req (together with its link chain) to the io-wq of the task that
 * owns it. A linked timeout, if one needs arming, is prepared up front and
 * queued only after the work item has been enqueued.
 */
static void io_queue_iowq(struct io_kiocb *req)
{
        struct io_kiocb *link = io_prep_linked_timeout(req);
        struct io_uring_task *tctx = req->task->io_uring;

        BUG_ON(!tctx);
        BUG_ON(!tctx->io_wq);

        /* init ->work of the whole link before punting */
        io_prep_async_link(req);

        /*
         * Not expected to happen, but if we do have a bug where this _can_
         * happen, catch it here and ensure the request is marked as
         * canceled. That will make io-wq go through the usual work cancel
         * procedure rather than attempt to run this request (or create a new
         * worker for it).
         */
        if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
                req->work.flags |= IO_WQ_WORK_CANCEL;

        trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
        io_wq_enqueue(tctx->io_wq, &req->work);
        if (link)
                io_queue_linked_timeout(link);
}

/*
 * Release deferred requests from the front of ctx->defer_list, in order,
 * until the list is empty or the first entry still has to wait according
 * to req_need_defer(). Each released request is queued through
 * io_req_task_queue() and its list entry is freed.
 */
static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
        struct io_defer_entry *de;

        while ((de = list_first_entry_or_null(&ctx->defer_list,
                                              struct io_defer_entry, list))) {
                if (req_need_defer(de->req, de->seq))
                        break;

                list_del_init(&de->list);
                io_req_task_queue(de->req);
                kfree(de);
        }
}

/*
 * RCU callback for a struct io_ev_fd. Atomically fetches and clears the
 * pending operation bits; if a signal was requested, the eventfd is
 * signalled. Then one reference is dropped and, if it was the last one,
 * the eventfd context is released and the structure freed.
 */
void io_eventfd_ops(struct rcu_head *rcu)
{
        struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
        int ops = atomic_xchg(&ev_fd->ops, 0);

        if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
                eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);

        /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
         * ordering in a race but if references are 0 we know we have to free
         * it regardless.
         */
        if (atomic_dec_and_test(&ev_fd->refs)) {
                eventfd_ctx_put(ev_fd->cq_ev_fd);
                kfree(ev_fd);
        }
}

/*
 * Signal the eventfd registered with @ctx, if any.
 *
 * Nothing is signalled when no eventfd is registered, when the
 * IORING_CQ_EVENTFD_DISABLED bit is set in the shared cq_flags, or when
 * the eventfd is marked eventfd_async and the caller is not an io-wq
 * worker. If signalling is not allowed from the current context, the
 * signal is deferred to io_eventfd_ops() through call_rcu_hurry().
 */
static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
        struct io_ev_fd *ev_fd = NULL;

        rcu_read_lock();
        /*
         * rcu_dereference ctx->io_ev_fd once and use it for both for checking
         * and eventfd_signal
         */
        ev_fd = rcu_dereference(ctx->io_ev_fd);

        /*
         * ev_fd can be NULL here, e.g. in case an io_eventfd_unregister call
         * completed before rcu_read_lock() was taken.
         * NOTE(review): the original wording referred to an earlier NULL
         * check "at the start of the function"; no such check exists in this
         * function as written, this is the only one.
         */
        if (unlikely(!ev_fd))
                goto out;
        if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
                goto out;
        if (ev_fd->eventfd_async && !io_wq_current_is_worker())
                goto out;

        if (likely(eventfd_signal_allowed())) {
                eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
        } else {
                /*
                 * Take a reference for the deferred callback. If the signal
                 * bit was already pending, a callback is queued already and
                 * the extra reference is dropped again.
                 */
                atomic_inc(&ev_fd->refs);
                if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
                        call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops);
                else
                        atomic_dec(&ev_fd->refs);
        }

out:
        rcu_read_unlock();
}

/*
 * Signal the eventfd only if the cached CQ tail has moved since the last
 * time this function ran; the tail seen now is remembered for next time.
 */
static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
        bool posted;

        /*
         * Eventfd should only get triggered when at least one event has been
         * posted. Some applications rely on the eventfd notification count
         * only changing IFF a new CQE has been added to the CQ ring. There's
         * no dependency on a 1:1 relationship between how many times this
         * function is called (and hence the eventfd count) and number of CQEs
         * posted to the CQ ring.
         */
        spin_lock(&ctx->completion_lock);
        posted = ctx->cached_cq_tail != ctx->evfd_last_cq_tail;
        ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
        spin_unlock(&ctx->completion_lock);

        if (posted)
                io_eventfd_signal(ctx);
}

/*
 * Post-commit work for the CQ ring. Each step runs only if the matching
 * ctx state bit is set: wake the poll waitqueue, flush timeouts, release
 * deferred requests (under ->completion_lock), and signal the eventfd.
 */
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
        if (ctx->poll_activated)
                io_poll_wq_wake(ctx);
        if (ctx->off_timeout_used)
                io_flush_timeouts(ctx);
        if (ctx->drain_active) {
                spin_lock(&ctx->completion_lock);
                io_queue_deferred(ctx);
                spin_unlock(&ctx->completion_lock);
        }
        if (ctx->has_evfd)
                io_eventfd_flush_signal(ctx);
}

/* Take ->completion_lock, except on rings that have lockless_cq set. */
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
{
        if (ctx->lockless_cq)
                return;

        spin_lock(&ctx->completion_lock);
}

/* Unconditionally take ->completion_lock; pairs with io_cq_unlock_post(). */
static inline void io_cq_lock(struct io_ring_ctx *ctx)
        __acquires(ctx->completion_lock)
{
        spin_lock(&ctx->completion_lock);
}

/*
 * Counterpart of __io_cq_lock(): commit the CQ ring, then, unless the ring
 * has task_complete set, drop ->completion_lock (if it was taken, i.e. not
 * lockless_cq) and wake CQ waiters. The commit flush always runs last.
 */
static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
{
        io_commit_cqring(ctx);
        if (!ctx->task_complete) {
                if (!ctx->lockless_cq)
                        spin_unlock(&ctx->completion_lock);
                /* IOPOLL rings only need to wake up if it's also SQPOLL */
                if (!ctx->syscall_iopoll)
                        io_cqring_wake(ctx);
        }
        io_commit_cqring_flush(ctx);
}

/*
 * Counterpart of io_cq_lock(): commit the CQ ring while the lock is still
 * held, release ->completion_lock, then wake CQ waiters and run the commit
 * flush outside the lock.
 */
static void io_cq_unlock_post(struct io_ring_ctx *ctx)
        __releases(ctx->completion_lock)
{
        io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);
        io_cqring_wake(ctx);
        io_commit_cqring_flush(ctx);
}

/*
 * Drain ctx->cq_overflow_list. Must be called with ->uring_lock held.
 *
 * With @dying false, entries are copied into free CQ ring slots in list
 * order until the list is empty or no slot can be obtained; nothing is done
 * if the CQ ring is already full. With @dying true, the entries are simply
 * freed without being posted.
 *
 * Once the list is empty, the overflow check bit and the
 * IORING_SQ_CQ_OVERFLOW flag in the shared sq_flags are cleared.
 */
static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
{
        size_t cqe_size = sizeof(struct io_uring_cqe);

        lockdep_assert_held(&ctx->uring_lock);

        /* don't abort if we're dying, entries must get freed */
        if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
                return;

        /* CQE32 rings copy twice the base CQE size per entry */
        if (ctx->flags & IORING_SETUP_CQE32)
                cqe_size <<= 1;

        io_cq_lock(ctx);
        while (!list_empty(&ctx->cq_overflow_list)) {
                struct io_uring_cqe *cqe;
                struct io_overflow_cqe *ocqe;

                ocqe = list_first_entry(&ctx->cq_overflow_list,
                                        struct io_overflow_cqe, list);

                if (!dying) {
                        /* stop as soon as no CQ slot is available */
                        if (!io_get_cqe_overflow(ctx, &cqe, true))
                                break;
                        memcpy(cqe, &ocqe->cqe, cqe_size);
                }
                list_del(&ocqe->list);
                kfree(ocqe);
        }

        if (list_empty(&ctx->cq_overflow_list)) {
                clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
                atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
        }
        io_cq_unlock_post(ctx);
}

/*
 * Discard all pending overflow entries without posting them. A context
 * that has no rings set up is left alone.
 */
static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
{
        if (!ctx->rings)
                return;

        __io_cqring_overflow_flush(ctx, true);
}

/*
 * Flush overflowed CQEs into the CQ ring (non-dying mode), taking
 * ->uring_lock around the flush.
 */
static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
{
        mutex_lock(&ctx->uring_lock);
        __io_cqring_overflow_flush(ctx, false);
        mutex_unlock(&ctx->uring_lock);
}

/*
 * Can be called by any task.
 *
 * Drops one request reference held against @task: decrements the task's
 * io_uring inflight counter, wakes tctx->wait if in_cancel is non-zero,
 * and finally puts the task_struct reference.
 */
static void io_put_task_remote(struct task_struct *task)
{
        struct io_uring_task *tctx = task->io_uring;

        percpu_counter_sub(&tctx->inflight, 1);
        if (unlikely(atomic_read(&tctx->in_cancel)))
                wake_up(&tctx->wait);
        put_task_struct(task);
}

/*
 * Used by a task to put its own references: the reference is returned to
 * the task's cached_refs pool instead of being dropped.
 */
static void io_put_task_local(struct task_struct *task)
{
        task->io_uring->cached_refs++;
}

/*
 * Must be called somewhat shortly after putting a request.
 *
 * Picks the cheap local path when @task is the current task and the
 * remote path otherwise.
 */
static inline void io_put_task(struct task_struct *task)
{
        if (unlikely(task != current)) {
                io_put_task_remote(task);
                return;
        }

        io_put_task_local(task);
}

/*
 * Top the cached reference pool of @tctx back up to IO_TCTX_REFS_CACHE_NR.
 * The missing amount is added to the inflight counter and to the usage
 * count of the current task before being credited to cached_refs.
 */
void io_task_refs_refill(struct io_uring_task *tctx)
{
        unsigned int missing = IO_TCTX_REFS_CACHE_NR - tctx->cached_refs;

        percpu_counter_add(&tctx->inflight, missing);
        refcount_add(missing, &current->usage);
        tctx->cached_refs += missing;
}

/*
 * Give back every reference cached in @task's io_uring context: the pool
 * is emptied, and the same amount is subtracted from the inflight counter
 * and dropped from the task_struct.
 */
static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
        struct io_uring_task *tctx = task->io_uring;
        unsigned int nr = tctx->cached_refs;

        if (!nr)
                return;

        tctx->cached_refs = 0;
        percpu_counter_sub(&tctx->inflight, nr);
        put_task_struct_many(task, nr);
}

/*
 * Queue a completion on ->cq_overflow_list because it could not be placed in
 * the CQ ring. Must be called with ->completion_lock held.
 *
 * Returns true if the completion was stored on the overflow list, false if
 * no overflow entry could be allocated and the completion was dropped (the
 * drop is accounted and IO_CHECK_CQ_DROPPED_BIT is set).
 */
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
                                     s32 res, u32 cflags, u64 extra1, u64 extra2)
{
        const bool big_cqe = ctx->flags & IORING_SETUP_CQE32;
        size_t alloc_size = sizeof(struct io_overflow_cqe);
        struct io_overflow_cqe *entry;

        lockdep_assert_held(&ctx->completion_lock);

        /* CQE32 rings carry two extra u64s, make room for them */
        if (big_cqe)
                alloc_size += sizeof(struct io_uring_cqe);

        entry = kmalloc(alloc_size, GFP_ATOMIC | __GFP_ACCOUNT);
        trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, entry);
        if (!entry) {
                /* cannot allocate an overflow entry: drop it on the floor */
                io_account_cq_overflow(ctx);
                set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
                return false;
        }

        /* first pending overflow entry: flag the overflow state */
        if (list_empty(&ctx->cq_overflow_list)) {
                set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
                atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
        }

        entry->cqe.user_data = user_data;
        entry->cqe.res = res;
        entry->cqe.flags = cflags;
        if (big_cqe) {
                entry->cqe.big_cqe[0] = extra1;
                entry->cqe.big_cqe[1] = extra2;
        }
        list_add_tail(&entry->list, &ctx->cq_overflow_list);
        return true;
}

/*
 * Push @req's completion (including its big-CQE extras) onto the overflow
 * list, then clear req->big_cqe.
 */
static void io_req_cqe_overflow(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;

        io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res,
                                 req->cqe.flags, req->big_cqe.extra1,
                                 req->big_cqe.extra2);
        memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}

/*
 * Refill the cached window of free CQ ring entries
 * (->cqe_cached .. ->cqe_sentinel). Returns false if no entry can be handed
 * out, true otherwise.
 *
 * writes to the cq entry need to come after reading head; the
 * control dependency is enough as we're using WRITE_ONCE to
 * fill the cq entry
 */
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
{
        struct io_rings *rings = ctx->rings;
        unsigned int idx = ctx->cached_cq_tail & (ctx->cq_entries - 1);
        unsigned int in_ring, avail, span;
        struct io_uring_cqe *first;

        /*
         * Posting into the CQ when there are pending overflowed CQEs may break
         * ordering guarantees, which will affect links, F_MORE users and more.
         * Force overflow the completion.
         */
        if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
                return false;

        /* userspace may cheat modifying the tail, be safe and do min */
        in_ring = min(__io_cqring_events(ctx), ctx->cq_entries);
        avail = ctx->cq_entries - in_ring;
        /* the window must be contiguous, so stop at the end of the array */
        span = min(avail, ctx->cq_entries - idx);
        if (span == 0)
                return false;

        /* with CQE32 every entry occupies two array slots */
        if (ctx->flags & IORING_SETUP_CQE32) {
                idx *= 2;
                span *= 2;
        }

        first = &rings->cqes[idx];
        ctx->cqe_cached = first;
        ctx->cqe_sentinel = first + span;
        return true;
}

/*
 * Write an auxiliary completion straight into the CQ ring. ->cq_extra is
 * bumped unconditionally. Returns false if no CQ entry could be obtained,
 * leaving it to the caller to deal with that; true once the entry is filled.
 */
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
                              u32 cflags)
{
        struct io_uring_cqe *cqe;

        ctx->cq_extra++;

        if (unlikely(!io_get_cqe(ctx, &cqe)))
                return false;

        trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);

        WRITE_ONCE(cqe->user_data, user_data);
        WRITE_ONCE(cqe->res, res);
        WRITE_ONCE(cqe->flags, cflags);

        /* big CQEs have two extra words, zero them */
        if (ctx->flags & IORING_SETUP_CQE32) {
                WRITE_ONCE(cqe->big_cqe[0], 0);
                WRITE_ONCE(cqe->big_cqe[1], 0);
        }
        return true;
}

/*
 * Post an auxiliary CQE under the CQ lock, falling back to the overflow list
 * if the ring has no room. Returns true if the CQE ended up in the ring or on
 * the overflow list, false if it was dropped.
 */
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
        bool posted;

        io_cq_lock(ctx);
        posted = io_fill_cqe_aux(ctx, user_data, res, cflags) ||
                 io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
        io_cq_unlock_post(ctx);

        return posted;
}

/*
 * A helper for multishot requests posting additional CQEs.
 * Should only be used from a task_work including IO_URING_F_MULTISHOT.
 *
 * Requires ->uring_lock and must not run from an io-wq worker. Returns
 * whether the CQE was written to the ring; ->submit_state.cq_flush is set
 * either way.
 */
bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
{
        struct io_ring_ctx *ctx = req->ctx;
        bool filled;

        lockdep_assert(!io_wq_current_is_worker());
        lockdep_assert_held(&ctx->uring_lock);

        __io_cq_lock(ctx);
        filled = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
        ctx->submit_state.cq_flush = true;
        __io_cq_unlock_post(ctx);

        return filled;
}

static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;

        /*
         * All execution paths but io-wq use the deferred completions by
         * passing IO_URING_F_COMPLETE_DEFER and thus should not end up here.
         */
        if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_IOWQ)))
                return;

        /*
         * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
         * the submitter task context, IOPOLL protects with uring_lock.
         */
        if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) {
                req->io_task_work.func = io_req_task_complete;
                io_req_task_work_add(req);
                return;
        }

        io_cq_lock(ctx);
        if (!(req->flags & REQ_F_CQE_SKIP)) {
                if (!io_fill_cqe_req(ctx, req))
                        io_req_cqe_overflow(req);
        }
        io_cq_unlock_post(ctx);

        /*
         * We don't free the request here because we know it's called from
         * io-wq only, which holds a reference, so it cannot be the last put.
         */
        req_ref_put(req);
}

/*
 * Fail @req with result @res and complete it through the deferred completion
 * path. The request is marked failed, its provided buffer (if any) is put,
 * and the opcode's ->fail() hook is invoked when one exists.
 * Caller must hold ->uring_lock.
 */
void io_req_defer_failed(struct io_kiocb *req, s32 res)
        __must_hold(&ctx->uring_lock)
{
        const struct io_cold_def *cold = &io_cold_defs[req->opcode];
        unsigned int cflags;

        lockdep_assert_held(&req->ctx->uring_lock);

        req_set_fail(req);
        cflags = io_put_kbuf(req, IO_URING_F_UNLOCKED);
        io_req_set_res(req, res, cflags);
        if (cold->fail)
                cold->fail(req);
        io_req_complete_defer(req);
}

/*
 * Fields set here are not initialised on every allocation; they are set up
 * once in advance and are expected to stay valid across allocations.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
        /* not necessary, but safer to zero */
        memset(&req->big_cqe, 0, sizeof(req->big_cqe));
        memset(&req->cqe, 0, sizeof(req->cqe));

        req->async_data = NULL;
        req->link = NULL;
        req->ctx = ctx;
}

/*
 * A request might get retired back into the request caches even before opcode
 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 * Because of that, io_alloc_req() should be called only under ->uring_lock
 * and with extra caution to not get a request that is still worked on.
 *
 * Returns true if at least one request was added to the cache, false if even
 * a single allocation failed.
 */
__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
        __must_hold(&ctx->uring_lock)
{
        const gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
        void *batch[IO_REQ_ALLOC_BATCH];
        int nr, i;

        nr = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(batch), batch);

        /*
         * Bulk alloc is all-or-nothing. If we fail to get a batch,
         * retry single alloc to be on the safe side.
         */
        if (unlikely(nr <= 0)) {
                batch[0] = kmem_cache_alloc(req_cachep, gfp);
                if (!batch[0])
                        return false;
                nr = 1;
        }

        /* one ctx reference per request handed to the cache */
        percpu_ref_get_many(&ctx->refs, nr);
        for (i = nr - 1; i >= 0; i--) {
                struct io_kiocb *req = batch[i];

                io_preinit_req(req, ctx);
                io_req_add_to_cache(req, ctx);
        }
        return true;
}

/*
 * Free @req by routing it through io_req_task_complete() in task_work,
 * with CQE posting suppressed.
 */
__cold void io_free_req(struct io_kiocb *req)
{
        req->io_task_work.func = io_req_task_complete;

        /* refs were already put, restore them for io_req_task_complete() */
        req->flags &= ~REQ_F_REFCOUNT;
        /* we only want to free it, don't post CQEs */
        req->flags |= REQ_F_CQE_SKIP;

        io_req_task_work_add(req);
}

static void __io_req_find_next_prep(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;

        spin_lock(&ctx->completion_lock);
        io_disarm_next(req);
        spin_unlock(&ctx->completion_lock);
}

/*
 * Detach and return the request linked after @req (NULL if there is none).
 *
 * If LINK is set, we have dependent requests in this chain. If we
 * didn't fail this request, queue the first one up, moving any other
 * dependencies to the next request. In case of failure, fail the rest
 * of the chain.
 */
static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
        struct io_kiocb *next_req;

        if (unlikely(req->flags & IO_DISARM_MASK))
                __io_req_find_next_prep(req);

        next_req = req->link;
        req->link = NULL;
        return next_req;
}

/*
 * Finish a batch of task_work for @ctx: clear IORING_SQ_TASKRUN if the ring
 * uses the taskrun flag, flush deferred completions, release ->uring_lock
 * and drop one ctx reference. A NULL @ctx is a no-op. @ts is not used here.
 */
static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
{
        if (ctx) {
                if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
                        atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);

                io_submit_flush_completions(ctx);
                mutex_unlock(&ctx->uring_lock);
                percpu_ref_put(&ctx->refs);
        }
}

/*
 * Run queued task_work, incrementing *count for every entry processed.
 * If more entries than max_entries are available, stop processing once this
 * is reached and return the rest of the list (NULL if the list was drained).
 *
 * While a request's callback runs, its ring's ->uring_lock is held and a
 * ctx reference is taken; both are released via ctx_flush_and_put() when the
 * next request belongs to a different ring, before rescheduling, and at the
 * end. @node is dereferenced unconditionally, so it must not be NULL.
 */
struct llist_node *io_handle_tw_list(struct llist_node *node,
                                     unsigned int *count,
                                     unsigned int max_entries)
{
        struct io_ring_ctx *ctx = NULL;
        struct io_tw_state ts = { };

        do {
                /* fetch ->next before the callback gets to run */
                struct llist_node *next = node->next;
                struct io_kiocb *req = container_of(node, struct io_kiocb,
                                                    io_task_work.node);

                /* switching rings: release the previous one, lock the new one */
                if (req->ctx != ctx) {
                        ctx_flush_and_put(ctx, &ts);
                        ctx = req->ctx;
                        mutex_lock(&ctx->uring_lock);
                        percpu_ref_get(&ctx->refs);
                }
                INDIRECT_CALL_2(req->io_task_work.func,
                                io_poll_task_func, io_req_rw_complete,
                                req, &ts);
                node = next;
                (*count)++;
                /* drop the lock and ref before giving up the CPU */
                if (unlikely(need_resched())) {
                        ctx_flush_and_put(ctx, &ts);
                        ctx = NULL;
                        cond_resched();
                }
        } while (node && *count < max_entries);

        ctx_flush_and_put(ctx, &ts);
        return node;
}

/**
 * io_llist_xchg - swap all entries in a lock-less list
 * @head:        the head of the lock-less list whose entries are taken
 * @new:        new entry to install as the head of the list
 *
 * Atomically replaces @head->first with @new. Returns the previous first
 * entry, or NULL if the list was empty. The entries returned are ordered
 * from the newest to the oldest added one.
 */
static inline struct llist_node *io_llist_xchg(struct llist_head *head,
                                               struct llist_node *new)
{
        struct llist_node *old_first = xchg(&head->first, new);

        return old_first;
}

/*
 * Move every request queued on @tctx->task_list to the fallback list of the
 * ring it belongs to, scheduling that ring's fallback work when the list
 * was previously empty.
 *
 * A ctx reference is always held around llist_add()/schedule_delayed_work().
 * Once a request has been published on ->fallback_llist, the fallback worker
 * may consume it at any time, so @req must not be dereferenced again after
 * llist_add(); the pinned @last_ctx pointer is used instead of req->ctx.
 *
 * @sync: if true, wait for each ring's fallback work (flush_delayed_work())
 *        before switching to another ring and before returning. If false,
 *        the work is only scheduled.
 */
static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
{
        struct llist_node *node = llist_del_all(&tctx->task_list);
        struct io_ring_ctx *last_ctx = NULL;
        struct io_kiocb *req;

        while (node) {
                req = container_of(node, struct io_kiocb, io_task_work.node);
                node = node->next;
                if (last_ctx != req->ctx) {
                        if (last_ctx) {
                                if (sync)
                                        flush_delayed_work(&last_ctx->fallback_work);
                                percpu_ref_put(&last_ctx->refs);
                        }
                        last_ctx = req->ctx;
                        percpu_ref_get(&last_ctx->refs);
                }
                /* llist_add() returns true if the list was empty before */
                if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist))
                        schedule_delayed_work(&last_ctx->fallback_work, 1);
        }

        if (last_ctx) {
                if (sync)
                        flush_delayed_work(&last_ctx->fallback_work);
                percpu_ref_put(&last_ctx->refs);
        }
}

/*
 * Run up to @max_entries items of @tctx's pending task_work, adding the
 * number run to *count. Returns whatever part of the list was left
 * unprocessed (NULL if none). If the current task is exiting, everything is
 * handed to the fallback path instead and NULL is returned.
 */
struct llist_node *tctx_task_work_run(struct io_uring_task *tctx,
                                      unsigned int max_entries,
                                      unsigned int *count)
{
        struct llist_node *pending;

        if (unlikely(current->flags & PF_EXITING)) {
                io_fallback_tw(tctx, true);
                return NULL;
        }

        pending = llist_del_all(&tctx->task_list);
        /* the llist is newest-first, flip it before running */
        if (pending)
                pending = io_handle_tw_list(llist_reverse_order(pending),
                                            count, max_entries);

        /* relaxed read is enough as only the task itself sets ->in_cancel */
        if (unlikely(atomic_read(&tctx->in_cancel)))
                io_uring_drop_tctx_refs(current);

        trace_io_uring_task_work_run(tctx, *count);
        return pending;
}

/*
 * task_work callback: run all pending io_uring task_work of the
 * io_uring_task that embeds @cb.
 */
void tctx_task_work(struct callback_head *cb)
{
        struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
                                                  task_work);
        struct llist_node *leftover;
        unsigned int nr_run = 0;

        leftover = tctx_task_work_run(tctx, UINT_MAX, &nr_run);
        /* can't happen: the entry budget is UINT_MAX */
        WARN_ON_ONCE(leftover);
}

/*
 * Push @req onto ctx->work_llist and decide whether the submitter task
 * needs to be woken.
 *
 * Each queued request records in ->nr_tw a running count derived from the
 * previous list head (previous count + 1), or IO_CQ_WAKE_FORCE when
 * IOU_F_TWQ_LAZY_WAKE is not in effect. The submitter is woken only when this
 * count reaches ->cq_wait_nr and the previous head's count had not already
 * done so.
 */
static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        unsigned nr_wait, nr_tw, nr_tw_prev;
        struct llist_node *head;

        /* See comment above IO_CQ_WAKE_INIT */
        BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES);

        /*
         * We don't know how many requests there are in the link and whether
         * they can even be queued lazily, fall back to non-lazy.
         */
        if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
                flags &= ~IOU_F_TWQ_LAZY_WAKE;

        head = READ_ONCE(ctx->work_llist.first);
        do {
                nr_tw_prev = 0;
                if (head) {
                        struct io_kiocb *first_req = container_of(head,
                                                        struct io_kiocb,
                                                        io_task_work.node);
                        /*
                         * Might be executed at any moment, rely on
                         * SLAB_TYPESAFE_BY_RCU to keep it alive.
                         */
                        nr_tw_prev = READ_ONCE(first_req->nr_tw);
                }

                /*
                 * Theoretically, it can overflow, but that's fine as one of
                 * previous adds should've tried to wake the task.
                 */
                nr_tw = nr_tw_prev + 1;
                if (!(flags & IOU_F_TWQ_LAZY_WAKE))
                        nr_tw = IO_CQ_WAKE_FORCE;

                req->nr_tw = nr_tw;
                req->io_task_work.node.next = head;
        } while (!try_cmpxchg(&ctx->work_llist.first, &head,
                              &req->io_task_work.node));

        /*
         * cmpxchg implies a full barrier, which pairs with the barrier
         * in set_current_state() on the io_cqring_wait() side. It's used
         * to ensure that either we see updated ->cq_wait_nr, or waiters
         * going to sleep will observe the work added to the list, which
         * is similar to the wait/wake task state sync.
         */

        /* the list was empty before this add */
        if (!head) {
                if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
                        atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
                if (ctx->has_evfd)
                        io_eventfd_signal(ctx);
        }

        nr_wait = atomic_read(&ctx->cq_wait_nr);
        /* not enough or no one is waiting */
        if (nr_tw < nr_wait)
                return;
        /* the previous add has already woken it up */
        if (nr_tw_prev >= nr_wait)
                return;
        wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
}

static void io_req_normal_work_add(struct io_kiocb *req)
{
        struct io_uring_task *tctx = req->task->io_uring;
        struct io_ring_ctx *ctx = req->ctx;

        /* task_work already pending, we're done */
        if (!llist_add(&req->io_task_work.node, &tctx->task_list))
                return;

        if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
                atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);

        /* SQPOLL doesn't need the task_work added, it'll run it itself */
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                struct io_sq_data *sqd = ctx->sq_data;

                if (wq_has_sleeper(&sqd->wait))
                        wake_up(&sqd->wait);
                return;
        }

        if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
                return;

        io_fallback_tw(tctx, false);
}

/*
 * Queue @req's task_work: on the ring-local list (under rcu_read_lock) for
 * IORING_SETUP_DEFER_TASKRUN rings, on the per-task list otherwise.
 */
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
{
        if (!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
                io_req_normal_work_add(req);
                return;
        }

        rcu_read_lock();
        io_req_local_work_add(req, flags);
        rcu_read_unlock();
}

/*
 * Take everything off the ring-local work list and re-queue each request
 * through the normal per-task task_work path.
 */
static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
{
        struct llist_node *node, *next;

        for (node = llist_del_all(&ctx->work_llist); node; node = next) {
                struct io_kiocb *req = container_of(node, struct io_kiocb,
                                                    io_task_work.node);

                /* read ->next first, re-queueing reuses the node */
                next = node->next;
                io_req_normal_work_add(req);
        }
}

/*
 * Decide whether the local work loop should go around again. Returns true
 * only if work is pending and fewer than @min_events entries have been run.
 * If work is pending but enough has been run, IORING_SQ_TASKRUN is set again
 * (for rings using the taskrun flag) and false is returned.
 */
static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
                                       int min_events)
{
        if (llist_empty(&ctx->work_llist))
                return false;

        if (events >= min_events) {
                if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
                        atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
                return false;
        }
        return true;
}

/*
 * Run the entries queued on ctx->work_llist. Must be called by the ring's
 * submitter task; otherwise it warns once and returns -EEXIST.
 *
 * The list is grabbed and run in batches. Another batch is started while
 * io_run_local_work_continue() says so, checked both before and after
 * flushing deferred completions. Returns the number of entries run.
 */
static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
                               int min_events)
{
        struct llist_node *node;
        unsigned int loops = 0;
        int ret = 0;

        if (WARN_ON_ONCE(ctx->submitter_task != current))
                return -EEXIST;
        if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
                atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
again:
        /*
         * llists are in reverse order, flip it back the right way before
         * running the pending items.
         */
        node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL));
        while (node) {
                /* fetch ->next before the callback gets to run */
                struct llist_node *next = node->next;
                struct io_kiocb *req = container_of(node, struct io_kiocb,
                                                    io_task_work.node);
                INDIRECT_CALL_2(req->io_task_work.func,
                                io_poll_task_func, io_req_rw_complete,
                                req, ts);
                ret++;
                node = next;
        }
        loops++;

        if (io_run_local_work_continue(ctx, ret, min_events))
                goto again;
        io_submit_flush_completions(ctx);
        /* check again after the flush, new work may have been queued */
        if (io_run_local_work_continue(ctx, ret, min_events))
                goto again;

        trace_io_uring_local_work_run(ctx, ret, loops);
        return ret;
}

/*
 * Run ring-local task_work without taking ->uring_lock here; presumably the
 * caller already holds it (going by the name). Returns 0 right away if the
 * local list is empty, otherwise the result of __io_run_local_work().
 */
static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
                                           int min_events)
{
        struct io_tw_state ts = {};
        int nr_run = 0;

        if (!llist_empty(&ctx->work_llist))
                nr_run = __io_run_local_work(ctx, &ts, min_events);
        return nr_run;
}

/*
 * Run ring-local task_work with ->uring_lock taken around the run.
 * Returns the result of __io_run_local_work().
 */
static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
{
        struct mutex *lock = &ctx->uring_lock;
        struct io_tw_state ts = {};
        int nr_run;

        mutex_lock(lock);
        nr_run = __io_run_local_work(ctx, &ts, min_events);
        mutex_unlock(lock);

        return nr_run;
}

/* task_work callback: fail @req with the result already stored in req->cqe.res. */
static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts)
{
        struct io_ring_ctx *ctx = req->ctx;

        io_tw_lock(ctx, ts);
        io_req_defer_failed(req, req->cqe.res);
}

/*
 * task_work callback that issues @req: fail it with -EFAULT if the task is
 * exiting, hand it to io-wq if REQ_F_FORCE_ASYNC is set, otherwise queue it
 * directly.
 */
void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
{
        io_tw_lock(req->ctx, ts);

        /* req->task == current here, checking PF_EXITING is safe */
        if (unlikely(req->task->flags & PF_EXITING)) {
                io_req_defer_failed(req, -EFAULT);
                return;
        }

        if (req->flags & REQ_F_FORCE_ASYNC) {
                io_queue_iowq(req);
                return;
        }

        io_queue_sqe(req);
}

/*
 * Record @ret as @req's result (with zero cflags) and queue task_work that
 * will fail the request via io_req_task_cancel().
 */
void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
        req->io_task_work.func = io_req_task_cancel;
        io_req_set_res(req, ret, 0);

        io_req_task_work_add(req);
}

/* Queue task_work that will issue @req through io_req_task_submit(). */
void io_req_task_queue(struct io_kiocb *req)
{
        req->io_task_work.func = io_req_task_submit;
        io_req_task_work_add(req);
}

/* If a request is linked after @req, detach it and queue it via task_work. */
void io_queue_next(struct io_kiocb *req)
{
        struct io_kiocb *next_req = io_req_find_next(req);

        if (!next_req)
                return;

        io_req_task_queue(next_req);
}

/*
 * Walk the completion list starting at @node and retire every request on it:
 * put its file, rsrc node and task reference, then return the request to the
 * request cache. Requests with any of IO_REQ_CLEAN_SLOW_FLAGS set first go
 * through the slow path (refcount drop, apoll teardown, queueing of the next
 * linked request, opcode cleanup).
 *
 * @node is dereferenced before the loop condition is checked, so it must not
 * be NULL. Caller must hold ->uring_lock.
 */
static void io_free_batch_list(struct io_ring_ctx *ctx,
                               struct io_wq_work_node *node)
        __must_hold(&ctx->uring_lock)
{
        do {
                struct io_kiocb *req = container_of(node, struct io_kiocb,
                                                    comp_list);

                if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
                        if (req->flags & REQ_F_REFCOUNT) {
                                /*
                                 * Advance first: if this was not the last
                                 * reference the request is skipped, and
                                 * 'continue' goes straight to the loop
                                 * condition with the next node.
                                 */
                                node = req->comp_list.next;
                                if (!req_ref_put_and_test(req))
                                        continue;
                        }
                        if ((req->flags & REQ_F_POLLED) && req->apoll) {
                                struct async_poll *apoll = req->apoll;

                                if (apoll->double_poll)
                                        kfree(apoll->double_poll);
                                /* free it ourselves if the cache won't take it */
                                if (!io_alloc_cache_put(&ctx->apoll_cache, apoll))
                                        kfree(apoll);
                                req->flags &= ~REQ_F_POLLED;
                        }
                        if (req->flags & IO_REQ_LINK_FLAGS)
                                io_queue_next(req);
                        if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
                                io_clean_op(req);
                }
                io_put_file(req);
                io_put_rsrc_node(ctx, req->rsrc_node);
                io_put_task(req->task);

                /* read the next node before the request goes back to the cache */
                node = req->comp_list.next;
                io_req_add_to_cache(req, ctx);
        } while (node);
}

/*
 * Post CQEs for all requests batched on ->submit_state.compl_reqs, then free
 * the batch and reset the list. Requests flagged REQ_F_CQE_SKIP get no CQE.
 * A CQE that cannot be placed in the ring is sent to the overflow list.
 * ->submit_state.cq_flush is cleared on the way out.
 * Caller must hold ->uring_lock.
 */
void __io_submit_flush_completions(struct io_ring_ctx *ctx)
        __must_hold(&ctx->uring_lock)
{
        struct io_submit_state *state = &ctx->submit_state;
        struct io_wq_work_node *node;

        __io_cq_lock(ctx);
        __wq_list_for_each(node, &state->compl_reqs) {
                struct io_kiocb *req = container_of(node, struct io_kiocb,
                                            comp_list);

                if (!(req->flags & REQ_F_CQE_SKIP) &&
                    unlikely(!io_fill_cqe_req(ctx, req))) {
                        /*
                         * For ->lockless_cq rings ->completion_lock is taken
                         * explicitly around the overflow insertion.
                         * NOTE(review): presumably __io_cq_lock() does not
                         * take it in that mode -- confirm.
                         */
                        if (ctx->lockless_cq) {
                                spin_lock(&ctx->completion_lock);
                                io_req_cqe_overflow(req);
                                spin_unlock(&ctx->completion_lock);
                        } else {
                                io_req_cqe_overflow(req);
                        }
                }
        }
        __io_cq_unlock_post(ctx);

        if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
                io_free_batch_list(ctx, state->compl_reqs.first);
                INIT_WQ_LIST(&state->compl_reqs);
        }
        ctx->submit_state.cq_flush = false;
}

/*
 * Return the value of __io_cqring_events() for @ctx, read after a read
 * memory barrier.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
        unsigned nr_events;

        /* See comment at the top of this file */
        smp_rmb();
        nr_events = __io_cqring_events(ctx);
        return nr_events;
}

/*
 * We can't just wait for polled events to come to us, we have to actively
 * find and complete them. Only does anything for IORING_SETUP_IOPOLL rings.
 */
static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
        if (!(ctx->flags & IORING_SETUP_IOPOLL))
                return;

        mutex_lock(&ctx->uring_lock);
        for (;;) {
                if (wq_list_empty(&ctx->iopoll_list))
                        break;
                /* let it sleep and repeat later if can't complete a request */
                if (!io_do_iopoll(ctx, true))
                        break;
                if (!need_resched())
                        continue;
                /*
                 * Ensure we allow local-to-the-cpu processing to take place,
                 * in this case we need to ensure that we reap all events.
                 * Also let task_work, etc. to progress by releasing the mutex
                 */
                mutex_unlock(&ctx->uring_lock);
                cond_resched();
                mutex_lock(&ctx->uring_lock);
        }
        mutex_unlock(&ctx->uring_lock);
}

/*
 * Poll for completions on an IOPOLL ring until at least @min events have been
 * reaped or there is a reason to stop early. Caller must hold ->uring_lock;
 * the lock is dropped and retaken internally while running task_work.
 *
 * Returns 0 on success or early stop, -EEXIST if this task is not allowed to
 * run task_work for the ring, -EBADR if CQEs were dropped, -EINTR if a signal
 * is pending, or a negative error from io_do_iopoll().
 */
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
        unsigned int nr_events = 0;
        unsigned long check_cq;

        lockdep_assert_held(&ctx->uring_lock);

        if (!io_allowed_run_tw(ctx))
                return -EEXIST;

        check_cq = READ_ONCE(ctx->check_cq);
        if (unlikely(check_cq)) {
                if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
                        __io_cqring_overflow_flush(ctx, false);
                /*
                 * Similarly do not spin if we have not informed the user of any
                 * dropped CQE.
                 */
                if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
                        return -EBADR;
        }
        /*
         * Don't enter poll loop if we already have events pending.
         * If we do, we can potentially be spinning for commands that
         * already triggered a CQE (eg in error).
         */
        if (io_cqring_events(ctx))
                return 0;

        do {
                int ret = 0;

                /*
                 * If a submit got punted to a workqueue, we can have the
                 * application entering polling for a command before it gets
                 * issued. That app will hold the uring_lock for the duration
                 * of the poll right here, so we need to take a breather every
                 * now and then to ensure that the issue has a chance to add
                 * the poll to the issued list. Otherwise we can spin here
                 * forever, while the workqueue is stuck trying to acquire the
                 * very same mutex.
                 */
                if (wq_list_empty(&ctx->iopoll_list) ||
                    io_task_work_pending(ctx)) {
                        /* snapshot the tail to detect CQEs posted below */
                        u32 tail = ctx->cached_cq_tail;

                        (void) io_run_local_work_locked(ctx, min);

                        if (task_work_pending(current) ||
                            wq_list_empty(&ctx->iopoll_list)) {
                                mutex_unlock(&ctx->uring_lock);
                                io_run_task_work();
                                mutex_lock(&ctx->uring_lock);
                        }
                        /* some requests don't go through iopoll_list */
                        if (tail != ctx->cached_cq_tail ||
                            wq_list_empty(&ctx->iopoll_list))
                                break;
                }
                ret = io_do_iopoll(ctx, !min);
                if (unlikely(ret < 0))
                        return ret;

                if (task_sigpending(current))
                        return -EINTR;
                if (need_resched())
                        break;

                nr_events += ret;
        } while (nr_events < min);

        return 0;
}

/*
 * task_work callback that completes @req through the deferred completion
 * path. @ts is not used here.
 */
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
        io_req_complete_defer(req);
}

/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from a io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 *
 * ->uring_lock is taken here only when @issue_flags has IO_URING_F_UNLOCKED
 * set; otherwise the caller is expected to hold it already.
 */
static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;

        /* workqueue context doesn't hold uring_lock, grab it now */
        if (unlikely(needs_lock))
                mutex_lock(&ctx->uring_lock);

        /*
         * Track whether we have multiple files in our lists. This will impact
         * how we do polling eventually, not spinning if we're on potentially
         * different devices.
         */
        if (wq_list_empty(&ctx->iopoll_list)) {
                ctx->poll_multi_queue = false;
        } else if (!ctx->poll_multi_queue) {
                struct io_kiocb *list_req;

                /* compare against the file of the first request on the list */
                list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
                                        comp_list);
                if (list_req->file != req->file)
                        ctx->poll_multi_queue = true;
        }

        /*
         * For fast devices, IO may have already completed. If it has, add
         * it to the front so we find it first.
         */
        if (READ_ONCE(req->iopoll_completed))
                wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
        else
                wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);

        if (unlikely(needs_lock)) {
                /*
                 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
                 * in sq thread task context or in io worker task context. If
                 * current task context is sq thread, we don't need to check
                 * whether should wake up sq thread.
                 */
                if ((ctx->flags & IORING_SETUP_SQPOLL) &&
                    wq_has_sleeper(&ctx->sq_data->wait))
                        wake_up(&ctx->sq_data->wait);

                mutex_unlock(&ctx->uring_lock);
        }
}

io_req_flags_t io_file_get_flags(struct file *file)
{
        io_req_flags_t res = 0;

        if (S_ISREG(file_inode(file)->i_mode))
                res |= REQ_F_ISREG;
        if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT))
                res |= REQ_F_SUPPORT_NOWAIT;
        return res;
}

/*
 * Allocate ->async_data for @req, sized by the opcode's ->async_size, and
 * set REQ_F_ASYNC_DATA on success.
 *
 * Note the inverted return convention: returns true if the allocation
 * FAILED, false if it succeeded.
 */
bool io_alloc_async_data(struct io_kiocb *req)
{
        const struct io_issue_def *def = &io_issue_defs[req->opcode];

        WARN_ON_ONCE(!def->async_size);

        req->async_data = kmalloc(def->async_size, GFP_KERNEL);
        if (!req->async_data)
                return true;

        req->flags |= REQ_F_ASYNC_DATA;
        return false;
}

/*
 * Compute the sequence number for @req: ->cached_sq_head minus one for
 * every request visited by io_for_each_link() starting at @req.
 */
static u32 io_get_sequence(struct io_kiocb *req)
{
        struct io_kiocb *link;
        u32 seq = req->ctx->cached_sq_head;

        /* need original cached_sq_head, but it was increased for each req */
        io_for_each_link(link, req)
                seq -= 1;

        return seq;
}

/*
 * Slow path for a request submitted while draining is active: either queue
 * @req right away through io_req_task_queue(), or park it on ctx->defer_list
 * together with its sequence number.
 *
 * The defer decision is taken under ctx->completion_lock. The lock is dropped
 * around the GFP_KERNEL allocation of the defer entry and the condition is
 * evaluated again once the lock has been retaken. If the allocation fails,
 * the request is failed with -ENOMEM via io_req_defer_failed().
 */
static __cold void io_drain_req(struct io_kiocb *req)
        __must_hold(&ctx->uring_lock)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_defer_entry *de;
        int ret;
        u32 seq = io_get_sequence(req);

        /* Still need defer if there is pending req in defer list. */
        spin_lock(&ctx->completion_lock);
        if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
                spin_unlock(&ctx->completion_lock);
                /*
                 * Also the target of the "goto queue" below. On both paths
                 * completion_lock has already been released.
                 */
queue:
                ctx->drain_active = false;
                io_req_task_queue(req);
                return;
        }
        spin_unlock(&ctx->completion_lock);

        /* prepare the link for async execution before it gets deferred */
        io_prep_async_link(req);
        de = kmalloc(sizeof(*de), GFP_KERNEL);
        if (!de) {
                ret = -ENOMEM;
                io_req_defer_failed(req, ret);
                return;
        }

        /* same check as above, repeated now that the lock is held again */
        spin_lock(&ctx->completion_lock);
        if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
                spin_unlock(&ctx->completion_lock);
                kfree(de);
                goto queue;
        }

        /* record the request and its sequence at the tail of the defer list */
        trace_io_uring_defer(req);
        de->req = req;
        de->seq = seq;
        list_add_tail(&de->list, &ctx->defer_list);
        spin_unlock(&ctx->completion_lock);
}

/*
 * Make sure @req has its file assigned, if the opcode needs one.
 *
 * Nothing is done when the opcode does not need a file or one is already
 * set. Otherwise req->cqe.fd is looked up either in the fixed file table
 * (REQ_F_FIXED_FILE) or as a normal file descriptor.
 *
 * Returns false only if a lookup was attempted and produced no file.
 */
static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
                           unsigned int issue_flags)
{
        struct file *file;

        if (!def->needs_file || req->file)
                return true;

        if (req->flags & REQ_F_FIXED_FILE)
                file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
        else
                file = io_file_get_normal(req, req->cqe.fd);

        req->file = file;
        return !!file;
}

/*
 * Issue @req through its opcode's ->issue() handler.
 *
 * The file is assigned first (-EBADF if that fails). If the request carries
 * its own credentials that differ from the current ones, they are overridden
 * around the ->issue() call. Audit entry/exit hooks run unless the opcode
 * sets audit_skip.
 *
 * Returns 0 when the request was completed here (IOU_OK) or when ->issue()
 * returned IOU_ISSUE_SKIP_COMPLETE; any other ->issue() result is passed
 * back unchanged.
 */
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
        const struct io_issue_def *def = &io_issue_defs[req->opcode];
        const struct cred *old_creds = NULL;
        int ret;

        if (unlikely(!io_assign_file(req, def, issue_flags)))
                return -EBADF;

        if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
                old_creds = override_creds(req->creds);

        if (!def->audit_skip)
                audit_uring_entry(req->opcode);

        ret = def->issue(req, issue_flags);

        if (!def->audit_skip)
                audit_uring_exit(!ret, ret);

        if (old_creds)
                revert_creds(old_creds);

        if (ret == IOU_OK) {
                /* post the completion now unless the caller asked to defer it */
                if (!(issue_flags & IO_URING_F_COMPLETE_DEFER))
                        io_req_complete_post(req, issue_flags);
                else
                        io_req_complete_defer(req);
                return 0;
        }

        if (ret != IOU_ISSUE_SKIP_COMPLETE)
                return ret;

        io_arm_ltimeout(req);

        /* If the op doesn't have a file, we're not polling for it */
        if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
                io_iopoll_req_issued(req, issue_flags);

        return 0;
}

/*
 * Issue @req with nonblocking, multishot and deferred-completion issue flags,
 * after taking the ring lock through io_tw_lock(). Returns the result of
 * io_issue_sqe().
 */
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
{
        const unsigned int issue_flags = IO_URING_F_NONBLOCK |
                                         IO_URING_F_MULTISHOT |
                                         IO_URING_F_COMPLETE_DEFER;

        io_tw_lock(req->ctx, ts);
        return io_issue_sqe(req, issue_flags);
}

/*
 * Drop one reference on the request embedding @work. If that was the last
 * one, the request is freed; when it carries link flags, the next request of
 * the link is looked up first and its work item is returned.
 *
 * Returns the next work item, or NULL if there is none.
 */
struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
{
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
        struct io_kiocb *next;

        if (!req_ref_put_and_test(req))
                return NULL;

        /* must be resolved before the request goes away */
        next = (req->flags & IO_REQ_LINK_FLAGS) ? io_req_find_next(req) : NULL;
        io_free_req(req);

        return next ? &next->work : NULL;
}

/*
 * Issue the request embedding @work with IO_URING_F_UNLOCKED | IO_URING_F_IOWQ
 * issue flags.
 *
 * Failures are never completed inline: they are handed to
 * io_req_task_queue_fail(). -EAGAIN from the issue attempt is retried,
 * either after arming a poll handler (pollable opcode on a pollable file with
 * REQ_F_FORCE_ASYNC) or, on IOPOLL rings, by looping with cond_resched().
 */
void io_wq_submit_work(struct io_wq_work *work)
{
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
        const struct io_issue_def *def = &io_issue_defs[req->opcode];
        unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ;
        bool needs_poll = false;
        /* err is the code used by the "fail" label; -ECANCELED by default */
        int ret = 0, err = -ECANCELED;

        /* one will be dropped by ->io_wq_free_work() after returning to io-wq */
        if (!(req->flags & REQ_F_REFCOUNT))
                __io_req_set_refcount(req, 2);
        else
                req_ref_get(req);

        io_arm_ltimeout(req);

        /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
        if (work->flags & IO_WQ_WORK_CANCEL) {
                /* shared failure exit; jumpers set err beforehand */
fail:
                io_req_task_queue_fail(req, err);
                return;
        }
        if (!io_assign_file(req, def, issue_flags)) {
                err = -EBADF;
                work->flags |= IO_WQ_WORK_CANCEL;
                goto fail;
        }

        /*
         * If DEFER_TASKRUN is set, it's only allowed to post CQEs from the
         * submitter task context. Final request completions are handed to the
         * right context, however this is not the case of auxiliary CQEs,
         * which is the main mean of operation for multishot requests.
         * Don't allow any multishot execution from io-wq. It's more restrictive
         * than necessary and also cleaner.
         */
        if (req->flags & REQ_F_APOLL_MULTISHOT) {
                err = -EBADFD;
                if (!io_file_can_poll(req))
                        goto fail;
                if (req->file->f_flags & O_NONBLOCK ||
                    req->file->f_mode & FMODE_NOWAIT) {
                        /* nonblocking file: hand the request over to poll */
                        err = -ECANCELED;
                        if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK)
                                goto fail;
                        return;
                } else {
                        /* otherwise continue below without the multishot flag */
                        req->flags &= ~REQ_F_APOLL_MULTISHOT;
                }
        }

        if (req->flags & REQ_F_FORCE_ASYNC) {
                bool opcode_poll = def->pollin || def->pollout;

                /* first attempt is nonblocking, with poll as the fallback */
                if (opcode_poll && io_file_can_poll(req)) {
                        needs_poll = true;
                        issue_flags |= IO_URING_F_NONBLOCK;
                }
        }

        do {
                ret = io_issue_sqe(req, issue_flags);
                if (ret != -EAGAIN)
                        break;

                /*
                 * If REQ_F_NOWAIT is set, then don't wait or retry with
                 * poll. -EAGAIN is final for that case.
                 */
                if (req->flags & REQ_F_NOWAIT)
                        break;

                /*
                 * We can get EAGAIN for iopolled IO even though we're
                 * forcing a sync submission from here, since we can't
                 * wait for request slots on the block side.
                 */
                if (!needs_poll) {
                        if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
                                break;
                        if (io_wq_worker_stopped())
                                break;
                        cond_resched();
                        continue;
                }

                /* poll armed: the request is no longer ours to issue */
                if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
                        return;
                /* aborted or ready, in either case retry blocking */
                needs_poll = false;
                issue_flags &= ~IO_URING_F_NONBLOCK;
        } while (1);

        /* avoid locking problems by failing it from a clean context */
        if (ret < 0)
                io_req_task_queue_fail(req, ret);
}

/*
 * Look up @fd as an index into the ring's fixed file table (ctx->file_table).
 *
 * The lookup runs between io_ring_submit_lock() and io_ring_submit_unlock().
 * For an in-range index the request gets an rsrc node if it has none yet and
 * picks up the flags stored in the slot.
 *
 * Returns what io_slot_file() yields for the slot, or NULL if @fd is not
 * below ctx->nr_user_files.
 */
inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
                                      unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_fixed_file *file_slot;
        struct file *file;

        io_ring_submit_lock(ctx, issue_flags);

        if (unlikely((unsigned int)fd >= ctx->nr_user_files)) {
                io_ring_submit_unlock(ctx, issue_flags);
                return NULL;
        }

        /* clamp the index under speculation before using it */
        fd = array_index_nospec(fd, ctx->nr_user_files);
        file_slot = io_fixed_file_slot(&ctx->file_table, fd);
        if (!req->rsrc_node)
                __io_req_set_rsrc_node(req, ctx);
        req->flags |= io_slot_flags(file_slot);
        file = io_slot_file(file_slot);

        io_ring_submit_unlock(ctx, issue_flags);
        return file;
}

/*
 * Resolve @fd with fget(). If the resulting file is itself an io_uring file,
 * the request is tracked as inflight. Returns the file, or NULL if fget()
 * found none.
 */
struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{
        struct file *f;

        f = fget(fd);
        trace_io_uring_file_get(req, fd);
        if (!f)
                return f;

        /* we don't allow fixed io_uring files */
        if (io_is_uring_fops(f))
                io_req_track_inflight(req);

        return f;
}

/*
 * Deal with a non-zero result @ret from an inline issue attempt of @req.
 *
 * Anything but -EAGAIN, or -EAGAIN on a REQ_F_NOWAIT request, fails the
 * request. Otherwise a poll handler is armed and, depending on the outcome,
 * the request is requeued via task_work (ready), punted to io-wq (aborted)
 * or left alone (poll armed). A prepared linked timeout is queued last.
 */
static void io_queue_async(struct io_kiocb *req, int ret)
        __must_hold(&req->ctx->uring_lock)
{
        struct io_kiocb *ltimeout;
        int apoll;

        if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
                io_req_defer_failed(req, ret);
                return;
        }

        ltimeout = io_prep_linked_timeout(req);

        apoll = io_arm_poll_handler(req, 0);
        if (apoll == IO_APOLL_READY || apoll == IO_APOLL_ABORTED) {
                /* both outcomes recycle the buffer before requeueing */
                io_kbuf_recycle(req, 0);
                if (apoll == IO_APOLL_READY)
                        io_req_task_queue(req);
                else
                        io_queue_iowq(req);
        }
        /* IO_APOLL_OK: nothing further to do here */

        if (ltimeout)
                io_queue_linked_timeout(ltimeout);
}

/*
 * Issue @req inline, nonblocking and with deferred completion. A non-zero
 * result is passed on to io_queue_async().
 */
static inline void io_queue_sqe(struct io_kiocb *req)
        __must_hold(&req->ctx->uring_lock)
{
        int err = io_issue_sqe(req, IO_URING_F_NONBLOCK |
                                    IO_URING_F_COMPLETE_DEFER);

        /*
         * The request is punted to async handling if the file wasn't marked
         * NOWAIT, or if it doesn't support non-blocking read/write attempts.
         */
        if (unlikely(err))
                io_queue_async(req, err);
}

/*
 * Submission path for requests that cannot be issued inline: a request
 * already marked REQ_F_FAIL is failed, otherwise it goes to the drain path
 * when draining is active, or to io-wq.
 */
static void io_queue_sqe_fallback(struct io_kiocb *req)
        __must_hold(&req->ctx->uring_lock)
{
        if (unlikely(req->flags & REQ_F_FAIL)) {
                /*
                 * Nothing gets submitted and the whole link is failed. To
                 * that end hardlinks are turned into normal links; an extra
                 * REQ_F_LINK is tolerated.
                 */
                req->flags = (req->flags & ~REQ_F_HARDLINK) | REQ_F_LINK;
                io_req_defer_failed(req, req->cqe.res);
                return;
        }

        if (unlikely(req->ctx->drain_active)) {
                io_drain_req(req);
                return;
        }

        io_queue_iowq(req);
}

/*
 * Validate an SQE against the ring's restrictions: its opcode bit has to be
 * set in restrictions.sqe_op, all required flags have to be present, and no
 * flag outside of (allowed | required) may be set.
 *
 * Returns 'true' if the SQE is allowed, 'false' otherwise.
 */
static inline bool io_check_restriction(struct io_ring_ctx *ctx,
                                        struct io_kiocb *req,
                                        unsigned int sqe_flags)
{
        if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
                return false;

        /* reject any flag that is neither allowed nor required ... */
        if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
                          ctx->restrictions.sqe_flags_required))
                return false;

        /* ... and insist on every required flag being set */
        return (sqe_flags & ctx->restrictions.sqe_flags_required) ==
               ctx->restrictions.sqe_flags_required;
}

/*
 * Switch the ring into drain mode for @req. If a link is currently being
 * assembled, its head is marked for draining and forced async, and
 * ctx->drain_next is set.
 */
static void io_init_req_drain(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *link_head = ctx->submit_state.link.head;

        ctx->drain_active = true;
        if (!link_head)
                return;

        /*
         * A drain was requested in the middle of a link: drain the head
         * request and the next request/link after the current link.
         * Considering sequential execution of links, REQ_F_IO_DRAIN will be
         * maintained for every request of our link.
         */
        link_head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
        ctx->drain_next = true;
}

/*
 * Failure helper for request init: zero the per-opcode command data, so it
 * is cleared when init fails before prep, and hand @err back to the caller.
 */
static __cold int io_init_fail_req(struct io_kiocb *req, int err)
{
        void *cmd_data = &req->cmd.data;

        memset(cmd_data, 0, sizeof(req->cmd.data));
        return err;
}

/*
 * Initialise @req from the userspace-provided @sqe and pass it on to the
 * opcode's ->prep() handler.
 *
 * Validates the opcode and the SQE flags, applies ring restrictions and drain
 * state, checks ioprio/IOPOLL support of the opcode, latches the fd for
 * opcodes that need a file (possibly starting a block plug), and resolves an
 * optional personality into req->creds.
 *
 * Returns the result of def->prep(), or the error handed to
 * io_init_fail_req() for any failure before prep.
 */
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
                       const struct io_uring_sqe *sqe)
        __must_hold(&ctx->uring_lock)
{
        const struct io_issue_def *def;
        unsigned int sqe_flags;
        int personality;
        u8 opcode;

        /* req is partially pre-initialised, see io_preinit_req() */
        req->opcode = opcode = READ_ONCE(sqe->opcode);
        /* same numerical values with corresponding REQ_F_*, safe to copy */
        sqe_flags = READ_ONCE(sqe->flags);
        req->flags = (io_req_flags_t) sqe_flags;
        req->cqe.user_data = READ_ONCE(sqe->user_data);
        req->file = NULL;
        req->rsrc_node = NULL;
        req->task = current;
        req->cancel_seq_set = false;

        /* out-of-range opcode: reset it to 0 before failing the request */
        if (unlikely(opcode >= IORING_OP_LAST)) {
                req->opcode = 0;
                return io_init_fail_req(req, -EINVAL);
        }
        def = &io_issue_defs[opcode];
        /* slow path: at least one flag outside of SQE_COMMON_FLAGS is set */
        if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
                /* enforce forwards compatibility on users */
                if (sqe_flags & ~SQE_VALID_FLAGS)
                        return io_init_fail_req(req, -EINVAL);
                if (sqe_flags & IOSQE_BUFFER_SELECT) {
                        if (!def->buffer_select)
                                return io_init_fail_req(req, -EOPNOTSUPP);
                        req->buf_index = READ_ONCE(sqe->buf_group);
                }
                /* once CQE skipping is seen, IOSQE_IO_DRAIN is refused below */
                if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
                        ctx->drain_disabled = true;
                if (sqe_flags & IOSQE_IO_DRAIN) {
                        if (ctx->drain_disabled)
                                return io_init_fail_req(req, -EOPNOTSUPP);
                        io_init_req_drain(req);
                }
        }
        if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
                if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
                        return io_init_fail_req(req, -EACCES);
                /* knock it to the slow queue path, will be drained there */
                if (ctx->drain_active)
                        req->flags |= REQ_F_FORCE_ASYNC;
                /* if there is no link, we're at "next" request and need to drain */
                if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
                        ctx->drain_next = false;
                        ctx->drain_active = true;
                        req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
                }
        }

        /*
         * Reject features the opcode does not support.
         * NOTE(review): sqe->ioprio is read without READ_ONCE() here; the
         * value is only tested against zero in this function.
         */
        if (!def->ioprio && sqe->ioprio)
                return io_init_fail_req(req, -EINVAL);
        if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
                return io_init_fail_req(req, -EINVAL);

        if (def->needs_file) {
                struct io_submit_state *state = &ctx->submit_state;

                /* only the fd is latched here; the file is not looked up yet */
                req->cqe.fd = READ_ONCE(sqe->fd);

                /*
                 * Plug now if we have more than 2 IO left after this, and the
                 * target is potentially a read/write to block based storage.
                 */
                if (state->need_plug && def->plug) {
                        state->plug_started = true;
                        state->need_plug = false;
                        blk_start_plug_nr_ios(&state->plug, state->submit_nr);
                }
        }

        personality = READ_ONCE(sqe->personality);
        if (personality) {
                int ret;

                /* unknown personality id: no creds registered under it */
                req->creds = xa_load(&ctx->personalities, personality);
                if (!req->creds)
                        return io_init_fail_req(req, -EINVAL);
                /* take a reference; dropped again right here if the LSM objects */
                get_cred(req->creds);
                ret = security_uring_override_creds(req->creds);
                if (ret) {
                        put_cred(req->creds);
                        return io_init_fail_req(req, ret);
                }
                req->flags |= REQ_F_CREDS;
        }

        return def->prep(req, sqe);
}

/*
 * Handle a request whose initialisation failed with @ret.
 *
 * Links are not broken in the middle, as that renders links with SQPOLL
 * unusable. Instead of failing eagerly, the link keeps being assembled where
 * applicable and its head is marked with REQ_F_FAIL; the link flushing code
 * is expected to find the flag and handle the rest.
 *
 * Returns 0 if @req was added to a link still under construction, @ret if
 * the request (or the link it terminates) was queued for failing.
 */
static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
                                      struct io_kiocb *req, int ret)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_submit_link *link = &ctx->submit_state.link;
        struct io_kiocb *head = link->head;

        trace_io_uring_req_failed(sqe, req, ret);

        req_fail_link_node(req, ret);
        if (head && !(head->flags & REQ_F_FAIL))
                req_fail_link_node(head, -ECANCELED);

        if (req->flags & IO_REQ_LINK_FLAGS) {
                /* the link continues: append, or start a new one with @req */
                if (head)
                        link->last->link = req;
                else
                        link->head = req;
                link->last = req;
                return 0;
        }

        if (!head) {
                /* standalone request, fail it on its own */
                io_queue_sqe_fallback(req);
                return ret;
        }

        /* @req terminates the current link: attach it and flush from the head */
        link->last->link = req;
        link->head = NULL;
        io_queue_sqe_fallback(head);
        return ret;
}

/*
 * Initialise @req from @sqe and either issue it, add it to the link that is
 * currently being assembled, or send it down the fallback path.
 *
 * Returns 0 unless initialisation failed, in which case the result of
 * io_submit_fail_init() is returned.
 */
static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
                         const struct io_uring_sqe *sqe)
        __must_hold(&ctx->uring_lock)
{
        struct io_submit_link *link = &ctx->submit_state.link;
        int ret;

        ret = io_init_req(ctx, req, sqe);
        if (unlikely(ret))
                return io_submit_fail_init(sqe, req, ret);

        trace_io_uring_submit_req(req);

        /*
         * If we already have a head request, queue this one for async
         * submittal once the head completes. If we don't have a head but
         * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
         * submitted sync once the chain is complete. If none of those
         * conditions are true (normal request), then just queue it.
         */
        if (unlikely(link->head)) {
                /* append to the tail of the link under construction */
                trace_io_uring_link(req, link->head);
                link->last->link = req;
                link->last = req;

                if (req->flags & IO_REQ_LINK_FLAGS)
                        return 0;
                /* last request of the link, flush it */
                req = link->head;
                link->head = NULL;
                /* from here on "req" is the link head, not the new request */
                if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
                        goto fallback;

        } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
                                          REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
                if (req->flags & IO_REQ_LINK_FLAGS) {
                        /* first request of a new link */
                        link->head = req;
                        link->last = req;
                } else {
                        /* also reached via goto with a completed link's head */
fallback:
                        io_queue_sqe_fallback(req);
                }
                return 0;
        }

        /* normal request, or a completed link without async/fail flags */
        io_queue_sqe(req);
        return 0;
}

/*
 * End of a submission batch: push out a link that was left unterminated,
 * flush deferred completions and finish the block plug if one was started.
 */
static void io_submit_state_end(struct io_ring_ctx *ctx)
{
        struct io_submit_state *state = &ctx->submit_state;
        struct io_kiocb *pending_link = state->link.head;

        if (unlikely(pending_link))
                io_queue_sqe_fallback(pending_link);

        /* flush only after queuing links as they can generate completions */
        io_submit_flush_completions(ctx);

        if (state->plug_started)
                blk_finish_plug(&state->plug);
}

/*
 * Reset the submission side state for a batch of up to @max_ios requests.
 * Plugging is only requested for batches of more than two requests.
 */
static void io_submit_state_start(struct io_submit_state *state,
                                  unsigned int max_ios)
{
        /* only the head is reset; link.last needs no initialisation up front */
        state->link.head = NULL;

        state->submit_nr = max_ios;
        state->need_plug = max_ios > 2;
        state->plug_started = false;
}

/*
 * Publish the kernel's cached SQ head to the shared ring.
 */
static void io_commit_sqring(struct io_ring_ctx *ctx)
{
        /*
         * Store with release semantics so that all loads from the SQEs are
         * done by this point: once the new head is written, the application
         * could write new data to them.
         */
        smp_store_release(&ctx->rings->sq.head, ctx->cached_sq_head);
}

/*
 * Fetch the next sqe and advance the cached SQ head. Returns true and stores
 * the sqe pointer in *@sqe, or false if the SQ array entry was invalid (the
 * entry is then dropped and accounted in rings->sq_dropped).
 *
 * Note that the returned pointer refers to memory that is mapped by
 * userspace, which cannot be relied upon to behave. Reads have to be stable:
 * if members of the sqe are validated and used later on, those reads must go
 * through READ_ONCE() to prevent a re-load down the line.
 *
 * The cached sq head (or cq tail) serves two purposes:
 *
 * 1) allows us to batch the cost of updating the user visible
 *    head updates.
 * 2) allows the kernel side to track the head on its own, even
 *    though the application is the one updating it.
 */
static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
{
        unsigned idx = ctx->cached_sq_head & (ctx->sq_entries - 1);

        ctx->cached_sq_head++;

        if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) {
                /* with an SQ array, the ring slot holds the actual sqe index */
                idx = READ_ONCE(ctx->sq_array[idx]);
                if (unlikely(idx >= ctx->sq_entries)) {
                        /* drop invalid entries */
                        spin_lock(&ctx->completion_lock);
                        ctx->cq_extra--;
                        spin_unlock(&ctx->completion_lock);
                        WRITE_ONCE(ctx->rings->sq_dropped,
                                   READ_ONCE(ctx->rings->sq_dropped) + 1);
                        return false;
                }
        }

        /* double index for 128-byte SQEs, twice as long */
        if (ctx->flags & IORING_SETUP_SQE128)
                idx <<= 1;

        *sqe = ctx->sq_sqes + idx;
        return true;
}

/*
 * Submit up to @nr requests from the SQ ring.
 *
 * Returns 0 if the SQ ring has no entries, otherwise the number of SQEs
 * accounted as submitted, i.e. min(@nr, entries) minus what was left when the
 * loop stopped. An SQE whose submission failed still counts as submitted.
 * -EAGAIN is returned when nothing was submitted and the request cache is
 * empty.
 */
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
        __must_hold(&ctx->uring_lock)
{
        unsigned int entries = io_sqring_entries(ctx);
        unsigned int left;
        int ret;

        if (unlikely(!entries))
                return 0;
        /*
         * make sure SQ entry isn't read before tail
         * NOTE(review): presumably refers to ordering provided by
         * io_sqring_entries() above - confirm against its definition.
         */
        ret = left = min(nr, entries);
        io_get_task_refs(left);
        io_submit_state_start(&ctx->submit_state, left);

        do {
                const struct io_uring_sqe *sqe;
                struct io_kiocb *req;

                /* out of requests: stop, "left" still includes this SQE */
                if (unlikely(!io_alloc_req(ctx, &req)))
                        break;
                /* no usable sqe: hand the unused request back to the cache */
                if (unlikely(!io_get_sqe(ctx, &sqe))) {
                        io_req_add_to_cache(req, ctx);
                        break;
                }

                /*
                 * Continue submitting even for sqe failure if the
                 * ring was setup with IORING_SETUP_SUBMIT_ALL
                 */
                if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
                    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
                        /* the failed SQE was consumed, account for it */
                        left--;
                        break;
                }
        } while (--left);

        if (unlikely(left)) {
                ret -= left;
                /* try again if it submitted nothing and can't allocate a req */
                if (!ret && io_req_cache_empty(ctx))
                        ret = -EAGAIN;
                /*
                 * presumably gives back the task refs taken by
                 * io_get_task_refs() for the requests that were not submitted
                 */
                current->io_uring->cached_refs += left;
        }

        io_submit_state_end(ctx);
        /* Commit SQ ring head once we've consumed and submitted all SQEs */
        io_commit_sqring(ctx);
        return ret;
}

/*
 * Wake callback for an io_wait_queue entry. The waiter is only woken (and
 * auto-removed) if io_should_wake() says so or the ring has work pending;
 * otherwise -1 is returned.
 */
static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
                            int wake_flags, void *key)
{
        struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);

        if (!io_should_wake(iowq) && !io_has_work(iowq->ctx))
                return -1;

        /*
         * Overflowed CQEs cannot be flushed safely from here. Make sure the
         * task is woken up, the next invocation will do it.
         */
        return autoremove_wake_function(curr, mode, wake_flags, key);
}

/*
 * Run pending local work and task_work for the current task.
 *
 * Returns 0 if any work was run or no signal is pending, -EINTR if nothing
 * was run and the current task has a signal pending.
 */
int io_run_task_work_sig(struct io_ring_ctx *ctx)
{
        if (!llist_empty(&ctx->work_llist)) {
                __set_current_state(TASK_RUNNING);
                if (io_run_local_work(ctx, INT_MAX) > 0)
                        return 0;
        }

        /* signals only matter if no task_work was run */
        if (io_run_task_work() <= 0 && task_sigpending(current))
                return -EINTR;

        return 0;
}

/*
 * True if the current task has an io_uring task context whose inflight
 * counter reads as positive.
 */
static bool current_pending_io(void)
{
        struct io_uring_task *tctx = current->io_uring;

        return tctx && percpu_counter_read_positive(&tctx->inflight);
}

/*
 * Sleep once for io_cqring_wait().
 *
 * Returns > 0 if the caller should retry, 0 after a normal wakeup or if the
 * wake condition already holds, -EINTR on a pending signal and -ETIME when
 * the timeout expired.
 */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
                                          struct io_wait_queue *iowq)
{
        int ret = 0;

        /* CQ checks, local work or a notify signal pending: go around again */
        if (unlikely(READ_ONCE(ctx->check_cq)) ||
            unlikely(!llist_empty(&ctx->work_llist)) ||
            unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
                return 1;
        if (unlikely(task_sigpending(current)))
                return -EINTR;
        if (unlikely(io_should_wake(iowq)))
                return 0;

        /*
         * With requests pending, mark the task as being in io_wait so that
         * cpufreq can take into account that it is waiting for IO - turns
         * out to be important for low QD IO.
         */
        if (current_pending_io())
                current->in_iowait = 1;

        if (iowq->timeout != KTIME_MAX) {
                if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
                        ret = -ETIME;
        } else {
                schedule();
        }

        current->in_iowait = 0;
        return ret;
}

/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 *
 * @ctx:        ring to wait on
 * @min_events: number of completions to wait for
 * @sig/@sigsz: optional user signal mask to install while waiting
 * @uts:        optional user timespec; added to the current time to form the
 *              wait deadline
 *
 * Returns 0 right away if enough events are already there, and also whenever
 * the CQ ring is non-empty on exit (head != tail). Otherwise returns the wait
 * result, e.g. -ETIME, -EINTR or -EBADR. Setup failures return -EEXIST,
 * -EFAULT or the error from installing the signal mask.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
                          const sigset_t __user *sig, size_t sigsz,
                          struct __kernel_timespec __user *uts)
{
        struct io_wait_queue iowq;
        struct io_rings *rings = ctx->rings;
        int ret;

        if (!io_allowed_run_tw(ctx))
                return -EEXIST;
        /* run whatever work is pending before deciding whether to wait */
        if (!llist_empty(&ctx->work_llist))
                io_run_local_work(ctx, min_events);
        io_run_task_work();

        if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
                io_cqring_do_overflow_flush(ctx);
        if (__io_cqring_events_user(ctx) >= min_events)
                return 0;

        init_waitqueue_func_entry(&iowq.wq, io_wake_function);
        iowq.wq.private = current;
        INIT_LIST_HEAD(&iowq.wq.entry);
        iowq.ctx = ctx;
        iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
        /* target CQ tail: the user-visible head plus the requested count */
        iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
        iowq.timeout = KTIME_MAX;

        if (uts) {
                struct timespec64 ts;

                if (get_timespec64(&ts, uts))
                        return -EFAULT;

                iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
                io_napi_adjust_timeout(ctx, &iowq, &ts);
        }

        /* the mask is installed last, after every other early return above */
        if (sig) {
#ifdef CONFIG_COMPAT
                if (in_compat_syscall())
                        ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
                                                      sigsz);
                else
#endif
                        ret = set_user_sigmask(sig, sigsz);

                if (ret)
                        return ret;
        }

        io_napi_busy_loop(ctx, &iowq);

        trace_io_uring_cqring_wait(ctx, min_events);
        do {
                /* completions still missing to reach the target tail */
                int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
                unsigned long check_cq;

                /*
                 * DEFER_TASKRUN rings publish the wait count and set the task
                 * state directly; other rings wait on ctx->cq_wait.
                 */
                if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
                        atomic_set(&ctx->cq_wait_nr, nr_wait);
                        set_current_state(TASK_INTERRUPTIBLE);
                } else {
                        prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
                                                        TASK_INTERRUPTIBLE);
                }

                ret = io_cqring_wait_schedule(ctx, &iowq);
                __set_current_state(TASK_RUNNING);
                atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);

                /*
                 * Run task_work after scheduling and before io_should_wake().
                 * If we got woken because of task_work being processed, run it
                 * now rather than let the caller do another wait loop.
                 */
                io_run_task_work();
                if (!llist_empty(&ctx->work_llist))
                        io_run_local_work(ctx, nr_wait);

                /*
                 * Non-local task_work will be run on exit to userspace, but
                 * if we're using DEFER_TASKRUN, then we could have waited
                 * with a timeout for a number of requests. If the timeout
                 * hits, we could have some requests ready to process. Ensure
                 * this break is _after_ we have run task_work, to avoid
                 * deferring running potentially pending requests until the
                 * next time we wait for events.
                 */
                if (ret < 0)
                        break;

                check_cq = READ_ONCE(ctx->check_cq);
                if (unlikely(check_cq)) {
                        /* let the caller flush overflows, retry */
                        if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
                                io_cqring_do_overflow_flush(ctx);
                        /* dropped completions end the wait with -EBADR */
                        if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
                                ret = -EBADR;
                                break;
                        }
                }

                if (io_should_wake(&iowq)) {
                        ret = 0;
                        break;
                }
                cond_resched();
        } while (1);

        /* only non-DEFER_TASKRUN rings were put on the cq_wait queue above */
        if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
                finish_wait(&ctx->cq_wait, &iowq.wq);
        restore_saved_sigmask_unless(ret == -EINTR);

        /* if there is anything to reap, report success regardless of ret */
        return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}

/*
 * Map @size bytes at user address @uaddr for the rings, recording the pages
 * in ctx->ring_pages / ctx->n_ring_pages. Returns what __io_uaddr_map()
 * returns.
 */
static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
                          size_t size)
{
        void *mapped;

        mapped = __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages,
                                uaddr, size);
        return mapped;
}

/*
 * Map @size bytes at user address @uaddr for the SQE array, recording the
 * pages in ctx->sqe_pages / ctx->n_sqe_pages. Returns what __io_uaddr_map()
 * returns.
 */
static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
                         size_t size)
{
        void *mapped;

        mapped = __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages,
                                uaddr, size);
        return mapped;
}

/*
 * Release the memory backing the rings and the SQE array, then clear the
 * ctx pointers to them. IORING_SETUP_NO_MMAP rings free their page arrays
 * and vunmap() the mappings; all other rings go through io_pages_unmap().
 */
static void io_rings_free(struct io_ring_ctx *ctx)
{
        if (ctx->flags & IORING_SETUP_NO_MMAP) {
                io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
                ctx->n_ring_pages = 0;
                io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
                ctx->n_sqe_pages = 0;
                vunmap(ctx->rings);
                vunmap(ctx->sq_sqes);
        } else {
                io_pages_unmap(ctx->rings, &ctx->ring_pages,
                               &ctx->n_ring_pages, true);
                io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages,
                               &ctx->n_sqe_pages, true);
        }

        ctx->sq_sqes = NULL;
        ctx->rings = NULL;
}

/*
 * Compute the size of the shared ring area: struct io_rings with @cq_entries
 * CQEs (shifted left by one for IORING_SETUP_CQE32), aligned to
 * SMP_CACHE_BYTES on SMP builds, followed by the SQ index array of
 * @sq_entries u32 entries unless IORING_SETUP_NO_SQARRAY is set.
 *
 * *@sq_offset receives the offset of the SQ array, or SIZE_MAX when there is
 * no SQ array. It is not written if an overflow is detected before that
 * point.
 *
 * Returns the total size, or SIZE_MAX on arithmetic overflow.
 */
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
                                unsigned int cq_entries, size_t *sq_offset)
{
        struct io_rings *rings;
        size_t total, sq_array_bytes;

        total = struct_size(rings, cqes, cq_entries);
        if (total == SIZE_MAX)
                return SIZE_MAX;
        if ((ctx->flags & IORING_SETUP_CQE32) &&
            check_shl_overflow(total, 1, &total))
                return SIZE_MAX;

#ifdef CONFIG_SMP
        total = ALIGN(total, SMP_CACHE_BYTES);
        /* aligning up wrapped around */
        if (!total)
                return SIZE_MAX;
#endif

        if (ctx->flags & IORING_SETUP_NO_SQARRAY) {
                *sq_offset = SIZE_MAX;
                return total;
        }

        *sq_offset = total;

        sq_array_bytes = array_size(sizeof(u32), sq_entries);
        if (sq_array_bytes == SIZE_MAX ||
            check_add_overflow(total, sq_array_bytes, &total))
                return SIZE_MAX;

        return total;
}

/*
 * Drain the ctx's request cache under uring_lock, returning every cached
 * request to req_cachep, then drop one ctx reference per request freed.
 */
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
        int freed;

        mutex_lock(&ctx->uring_lock);

        for (freed = 0; !io_req_cache_empty(ctx); freed++)
                kmem_cache_free(req_cachep, io_extract_req(ctx));

        /* put all the references in one go rather than per request */
        if (freed)
                percpu_ref_put_many(&ctx->refs, freed);

        mutex_unlock(&ctx->uring_lock);
}

/*
 * Final teardown of an io_uring context: releases what is still attached to
 * @ctx in the order below and ends with kfree(ctx), so @ctx must not be
 * touched afterwards. Called e.g. from io_ring_exit_work() once its wait
 * loops have finished.
 */
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
        io_sq_thread_finish(ctx);
        /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
        /* note: on this early return nothing below is released */
        if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)))
                return;

        /* registered resources, overflow list, eventfd and caches go under uring_lock */
        mutex_lock(&ctx->uring_lock);
        if (ctx->buf_data)
                __io_sqe_buffers_unregister(ctx);
        if (ctx->file_data)
                __io_sqe_files_unregister(ctx);
        io_cqring_overflow_kill(ctx);
        io_eventfd_unregister(ctx);
        io_alloc_cache_free(&ctx->apoll_cache, kfree);
        io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
        io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
        io_alloc_cache_free(&ctx->uring_cache, kfree);
        io_futex_cache_free(ctx);
        io_destroy_buffers(ctx);
        mutex_unlock(&ctx->uring_lock);
        /* drop the references the ctx holds on the sq creds and submitter task */
        if (ctx->sq_creds)
                put_cred(ctx->sq_creds);
        if (ctx->submitter_task)
                put_task_struct(ctx->submitter_task);

        /* there are no registered resources left, nobody uses it */
        if (ctx->rsrc_node)
                io_rsrc_node_destroy(ctx, ctx->rsrc_node);

        /* sanity checks: both lists are expected to be empty by now */
        WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
        WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));

        io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
        if (ctx->mm_account) {
                mmdrop(ctx->mm_account);
                ctx->mm_account = NULL;
        }
        io_rings_free(ctx);

        percpu_ref_exit(&ctx->refs);
        free_uid(ctx->user);
        /*
         * NOTE(review): io_req_caches_free() does percpu_ref_put_many() on
         * ctx->refs when it finds cached requests; here it runs after
         * percpu_ref_exit(), so the cache is presumably already empty at
         * this point (io_ring_exit_work() drains it) -- confirm.
         */
        io_req_caches_free(ctx);
        if (ctx->hash_map)
                io_wq_put_hash(ctx->hash_map);
        io_napi_free(ctx);
        kfree(ctx->cancel_table.hbs);
        kfree(ctx->cancel_table_locked.hbs);
        xa_destroy(&ctx->io_bl_xa);
        kfree(ctx);
}

/*
 * Task-work callback queued by io_activate_pollwq(): marks the ctx as
 * poll-activated under uring_lock, wakes everyone on poll_wq, and drops the
 * ctx reference that was taken when the work was queued.
 */
static __cold void io_activate_pollwq_cb(struct callback_head *cb)
{
        struct io_ring_ctx *ring;

        ring = container_of(cb, struct io_ring_ctx, poll_wq_task_work);

        mutex_lock(&ring->uring_lock);
        ring->poll_activated = true;
        mutex_unlock(&ring->uring_lock);

        /*
         * Synchronisation between the start of polling and activation is
         * loose, so wake ups for events in that window may have been lost;
         * wake all pollers to make up for it.
         */
        wake_up_all(&ring->poll_wq);
        percpu_ref_put(&ring->refs);
}

/*
 * Request activation of the ctx's poll wait queue by queueing
 * io_activate_pollwq_cb() as task work on the submitter task. Does nothing
 * if activation already happened or is in flight, if ->task_complete is not
 * set (warns once), or if there is no submitter task.
 */
__cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{
        spin_lock(&ctx->completion_lock);

        /*
         * Checked in order: already activated / activation queued, then the
         * task_complete sanity check, then presence of a submitter task.
         */
        if (!ctx->poll_activated && !ctx->poll_wq_task_work.func &&
            !WARN_ON_ONCE(!ctx->task_complete) && ctx->submitter_task) {
                /*
                 * with ->submitter_task only the submitter task completes
                 * requests, so syncing with it by injecting a tw is enough
                 */
                init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
                percpu_ref_get(&ctx->refs);
                /* queueing failed: the callback won't run, undo the ref */
                if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work,
                                  TWA_SIGNAL))
                        percpu_ref_put(&ctx->refs);
        }

        spin_unlock(&ctx->completion_lock);
}

/*
 * ->poll() handler for io_uring files. Reports EPOLLOUT | EPOLLWRNORM while
 * the SQ ring is not full and EPOLLIN | EPOLLRDNORM when there are CQ events
 * for the user or io_has_work() says there is work pending.
 */
static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
        struct io_ring_ctx *ctx = file->private_data;
        __poll_t ready = 0;

        /* poll wakeups not enabled on this ring yet: request activation */
        if (unlikely(!ctx->poll_activated))
                io_activate_pollwq(ctx);

        poll_wait(file, &ctx->poll_wq, wait);

        /*
         * Pairs with the barrier implied by the wq_has_sleeper() call in
         * io_commit_cqring.
         */
        smp_rmb();

        if (!io_sqring_full(ctx))
                ready |= EPOLLOUT | EPOLLWRNORM;

        /*
         * Only do a simple check here and never flush the cqring overflow
         * list, as flushing could lead to an ABBA deadlock:
         *      CPU0                    CPU1
         *      ----                    ----
         * lock(&ctx->uring_lock);
         *                              lock(&ep->mtx);
         *                              lock(&ctx->uring_lock);
         * lock(&ep->mtx);
         *
         * As a result users may be told EPOLLIN while seeing nothing in the
         * cqring yet; that pushes them to do the flush themselves.
         */
        if (__io_cqring_events_user(ctx) || io_has_work(ctx))
                ready |= EPOLLIN | EPOLLRDNORM;

        return ready;
}

/*
 * Work item used by io_ring_exit_work() to have a task run
 * io_tctx_exit_cb() for a ring that is going away.
 */
struct io_tctx_exit {
        /* queued on the target task with task_work_add() */
        struct callback_head                task_work;
        /* completed by io_tctx_exit_cb() once it has run */
        struct completion                completion;
        /* ring whose tctx node the callback removes */
        struct io_ring_ctx                *ctx;
};

/*
 * Task-work callback run in the context of a task attached to an exiting
 * ring: removes the current task's tctx node for that ring (when safe) and
 * signals the waiter in io_ring_exit_work().
 */
static __cold void io_tctx_exit_cb(struct callback_head *cb)
{
        struct io_tctx_exit *exit_work = container_of(cb, struct io_tctx_exit,
                                                      task_work);
        struct io_uring_task *tctx = current->io_uring;

        /*
         * While @in_cancel is set we are in the middle of cancellation and
         * removing the node would be racy; cancellation removes it when it
         * finishes, so skip it here. tctx may also be NULL if queueing this
         * task_work raced with work cancelation off the exec path.
         */
        if (tctx && !atomic_read(&tctx->in_cancel))
                io_uring_del_tctx_node((unsigned long)exit_work->ctx);

        complete(&exit_work->completion);
}

/*
 * io_wq_cancel_cb() match callback: true when the request that embeds @work
 * belongs to the ctx passed as @data.
 */
static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
        return container_of(work, struct io_kiocb, work)->ctx == data;
}

/*
 * Workqueue handler (queued by io_ring_ctx_wait_and_kill()) that finishes
 * tearing down a ring:
 *   1. repeatedly cancel/reap outstanding requests until ctx->ref_comp
 *      completes,
 *   2. ask every task still on ctx->tctx_list to drop its node for this ctx,
 *   3. free the ctx with io_ring_ctx_free().
 */
static __cold void io_ring_exit_work(struct work_struct *work)
{
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
        /* 5 minutes from now: past this point we only warn and poll less often */
        unsigned long timeout = jiffies + HZ * 60 * 5;
        /* re-run the cancel pass every 50ms while waiting for the refs to drop */
        unsigned long interval = HZ / 20;
        struct io_tctx_exit exit;
        struct io_tctx_node *node;
        int ret;

        /*
         * If we're doing polled IO and end up having requests being
         * submitted async (out-of-line), then completions can come in while
         * we're waiting for refs to drop. We need to reap these manually,
         * as nobody else will be looking for them.
         */
        do {
                if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
                        mutex_lock(&ctx->uring_lock);
                        io_cqring_overflow_kill(ctx);
                        mutex_unlock(&ctx->uring_lock);
                }

                if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
                        io_move_task_work_from_local(ctx);

                /* keep cancelling for as long as a pass reports progress */
                while (io_uring_try_cancel_requests(ctx, NULL, true))
                        cond_resched();

                if (ctx->sq_data) {
                        struct io_sq_data *sqd = ctx->sq_data;
                        struct task_struct *tsk;

                        /* sqd->thread is only inspected while the SQ thread is parked */
                        io_sq_thread_park(sqd);
                        tsk = sqd->thread;
                        if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
                                io_wq_cancel_cb(tsk->io_uring->io_wq,
                                                io_cancel_ctx_cb, ctx, true);
                        io_sq_thread_unpark(sqd);
                }

                io_req_caches_free(ctx);

                if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
                        /* there is little hope left, don't run it too often */
                        interval = HZ * 60;
                }
                /*
                 * This is really an uninterruptible wait, as it has to be
                 * complete. But it's also run from a kworker, which doesn't
                 * take signals, so it's fine to make it interruptible. This
                 * avoids scenarios where we knowingly can wait much longer
                 * on completions, for example if someone does a SIGSTOP on
                 * a task that needs to finish task_work to make this loop
                 * complete. That's a synthetic situation that should not
                 * cause a stuck task backtrace, and hence a potential panic
                 * on stuck tasks if that is enabled.
                 */
        } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));

        /* one on-stack work item, reused for each task on tctx_list in turn */
        init_completion(&exit.completion);
        init_task_work(&exit.task_work, io_tctx_exit_cb);
        exit.ctx = ctx;

        mutex_lock(&ctx->uring_lock);
        while (!list_empty(&ctx->tctx_list)) {
                WARN_ON_ONCE(time_after(jiffies, timeout));

                node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
                                        ctx_node);
                /* don't spin on a single task if cancellation failed */
                list_rotate_left(&ctx->tctx_list);
                ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
                if (WARN_ON_ONCE(ret))
                        continue;

                /* drop uring_lock while waiting for io_tctx_exit_cb() to run */
                mutex_unlock(&ctx->uring_lock);
                /*
                 * See comment above for
                 * wait_for_completion_interruptible_timeout() on why this
                 * wait is marked as interruptible.
                 */
                wait_for_completion_interruptible(&exit.completion);
                mutex_lock(&ctx->uring_lock);
        }
        mutex_unlock(&ctx->uring_lock);
        /*
         * NOTE(review): empty lock/unlock pair -- presumably waits for any
         * current completion_lock holder to leave its critical section
         * before the ctx is freed; confirm.
         */
        spin_lock(&ctx->completion_lock);
        spin_unlock(&ctx->completion_lock);

        /* pairs with RCU read section in io_req_local_work_add() */
        if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
                synchronize_rcu();

        io_ring_ctx_free(ctx);
}

/*
 * Start tearing down @ctx: kill its percpu ref and unregister every
 * personality under uring_lock, flush the fallback work, then queue
 * io_ring_exit_work() to do the rest. This function itself does not wait
 * for the exit work to finish.
 */
static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
        unsigned long index;
        struct creds *creds;

        mutex_lock(&ctx->uring_lock);
        percpu_ref_kill(&ctx->refs);
        /* creds is only the iteration cursor; entries are removed by index */
        xa_for_each(&ctx->personalities, index, creds)
                io_unregister_personality(ctx, index);
        mutex_unlock(&ctx->uring_lock);

        flush_delayed_work(&ctx->fallback_work);

        INIT_WORK(&ctx->exit_work, io_ring_exit_work);
        /*
         * Queue the exit work on iou_wq rather than system_wq to avoid
         * spawning tons of event kworkers if we're exiting a ton of rings
         * at the same time. It just adds noise and overhead, there's no
         * discernible change in runtime over using system_wq.
         * NOTE(review): this comment used to name system_unbound_wq; iou_wq
         * is presumably an unbound workqueue as well -- confirm where it is
         * allocated.
         */
        queue_work(iou_wq, &ctx->exit_work);
}

/*
 * ->release() handler for io_uring files: detaches the ctx from the file
 * and kicks off its teardown. Always returns 0.
 */
static int io_uring_release(struct inode *inode, struct file *file)
{
        struct io_ring_ctx *ring = file->private_data;

        /* clear the file's pointer before the ctx starts going away */
        file->private_data = NULL;
        io_ring_ctx_wait_and_kill(ring);

        return 0;
}

/*
 * Match criteria handed to io_cancel_task_cb() through io_wq_cancel_cb();
 * both fields are forwarded to io_match_task_safe().
 */
struct io_task_cancel {
        /* task whose requests should be matched */
        struct task_struct *task;
        /* the caller's cancel_all flag */
        bool all;
};

/*
 * io_wq_cancel_cb() match callback: @data is a struct io_task_cancel; the
 * request embedding @work matches when io_match_task_safe() says so for the
 * given task / all pair.
 */
static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{
        struct io_task_cancel *match = data;
        struct io_kiocb *req;

        req = container_of(work, struct io_kiocb, work);
        return io_match_task_safe(req, match->task, match->all);
}

/*
 * Cancel deferred requests on ctx->defer_list. The list is scanned from the
 * tail for the last entry matching @task / @cancel_all; everything from the
 * head up to and including that entry is cut off and failed with -ECANCELED.
 *
 * Returns true if anything was cancelled, false if no entry matched.
 */
static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
                                         struct task_struct *task,
                                         bool cancel_all)
{
        struct io_defer_entry *entry;
        LIST_HEAD(doomed);

        spin_lock(&ctx->completion_lock);
        list_for_each_entry_reverse(entry, &ctx->defer_list, list) {
                if (!io_match_task_safe(entry->req, task, cancel_all))
                        continue;
                list_cut_position(&doomed, &ctx->defer_list, &entry->list);
                break;
        }
        spin_unlock(&ctx->completion_lock);

        if (list_empty(&doomed))
                return false;

        /* the private list is known to be non-empty here */
        do {
                entry = list_first_entry(&doomed, struct io_defer_entry, list);
                list_del_init(&entry->list);
                io_req_task_queue_fail(entry->req, -ECANCELED);
                kfree(entry);
        } while (!list_empty(&doomed));

        return true;
}

/*
 * Walk every task attached to @ctx and cancel, in each task's io_wq, the
 * work items that belong to @ctx.
 *
 * Returns true if any io_wq reported something other than
 * IO_WQ_CANCEL_NOTFOUND.
 */
static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
        struct io_tctx_node *node;
        bool found = false;

        mutex_lock(&ctx->uring_lock);
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                struct io_uring_task *tctx = node->task->io_uring;
                enum io_wq_cancel cret;

                /*
                 * Holding uring_lock keeps io_wq alive: it is only killed
                 * after the ctx nodes, and removing those needs the lock.
                 */
                if (tctx && tctx->io_wq) {
                        cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb,
                                               ctx, true);
                        if (cret != IO_WQ_CANCEL_NOTFOUND)
                                found = true;
                }
        }
        mutex_unlock(&ctx->uring_lock);

        return found;
}

/*
 * Make one cancellation pass over @ctx.
 *
 * @task:       task handed to the matching helpers; when NULL, io_wq work
 *              is cancelled for every task attached to the ctx instead.
 * @cancel_all: forwarded to the individual cancel helpers.
 *
 * Returns true if any step reported that it found or ran something (callers
 * loop on that), false when the pass did nothing.
 */
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                                struct task_struct *task,
                                                bool cancel_all)
{
        struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
        struct io_uring_task *tctx = task ? task->io_uring : NULL;
        enum io_wq_cancel cret;
        bool ret = false;

        /* set it so io_req_local_work_add() would wake us up */
        if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
                atomic_set(&ctx->cq_wait_nr, 1);
                smp_mb();
        }

        /* failed during ring init, it couldn't have issued any requests */
        if (!ctx->rings)
                return false;

        if (!task) {
                ret |= io_uring_try_cancel_iowq(ctx);
        } else if (tctx && tctx->io_wq) {
                /*
                 * Cancels requests of all rings, not only @ctx, but
                 * it's fine as the task is in exit/exec.
                 */
                cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
                                       &cancel, true);
                ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
        }

        /* SQPOLL thread does its own polling */
        if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
            (ctx->sq_data && ctx->sq_data->thread == current)) {
                /* reap until the iopoll list is empty; any iteration counts as progress */
                while (!wq_list_empty(&ctx->iopoll_list)) {
                        io_iopoll_try_reap_events(ctx);
                        ret = true;
                        cond_resched();
                }
        }

        if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
            io_allowed_defer_tw_run(ctx))
                ret |= io_run_local_work(ctx, INT_MAX) > 0;
        ret |= io_cancel_defer_files(ctx, task, cancel_all);
        /* these four removal helpers are called with uring_lock held */
        mutex_lock(&ctx->uring_lock);
        ret |= io_poll_remove_all(ctx, task, cancel_all);
        ret |= io_waitid_remove_all(ctx, task, cancel_all);
        ret |= io_futex_remove_all(ctx, task, cancel_all);
        ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
        mutex_unlock(&ctx->uring_lock);
        ret |= io_kill_timeouts(ctx, task, cancel_all);
        /* with a task: run its pending task_work; without: flush the ctx fallback work */
        if (task)
                ret |= io_run_task_work() > 0;
        else
                ret |= flush_delayed_work(&ctx->fallback_work);
        return ret;
}

/*
 * Number of in-flight requests for @tctx: the inflight_tracked atomic when
 * @tracked is set, otherwise the sum of the inflight percpu counter.
 */
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{
        return tracked ? atomic_read(&tctx->inflight_tracked) :
                         percpu_counter_sum(&tctx->inflight);
}

/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
 *
 * Loops until tctx_inflight() reports zero for the current task: each
 * iteration runs a cancel pass over the relevant rings and, if nothing made
 * progress, sleeps on tctx->wait until the inflight count changes.
 * @cancel_all selects the untracked (full) inflight counter and, at the end,
 * also frees the task's io_uring state.
 */
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_ring_ctx *ctx;
        struct io_tctx_node *node;
        unsigned long index;
        s64 inflight;
        DEFINE_WAIT(wait);

        WARN_ON_ONCE(sqd && sqd->thread != current);

        /* this task never set up io_uring state, nothing to cancel */
        if (!current->io_uring)
                return;
        if (tctx->io_wq)
                io_wq_exit_start(tctx->io_wq);

        /* io_tctx_exit_cb() checks in_cancel and leaves node removal to us */
        atomic_inc(&tctx->in_cancel);
        do {
                bool loop = false;

                io_uring_drop_tctx_refs(current);
                /* read completions before cancelations */
                inflight = tctx_inflight(tctx, !cancel_all);
                if (!inflight)
                        break;

                if (!sqd) {
                        xa_for_each(&tctx->xa, index, node) {
                                /* sqpoll task will cancel all its requests */
                                if (node->ctx->sq_data)
                                        continue;
                                loop |= io_uring_try_cancel_requests(node->ctx,
                                                        current, cancel_all);
                        }
                } else {
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                loop |= io_uring_try_cancel_requests(ctx,
                                                                     current,
                                                                     cancel_all);
                }

                /* something was cancelled or run: retry straight away */
                if (loop) {
                        cond_resched();
                        continue;
                }

                prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
                io_run_task_work();
                io_uring_drop_tctx_refs(current);
                /* pending local work on any ring: skip the sleep and go around again */
                xa_for_each(&tctx->xa, index, node) {
                        if (!llist_empty(&node->ctx->work_llist)) {
                                WARN_ON_ONCE(node->ctx->submitter_task &&
                                             node->ctx->submitter_task != current);
                                goto end_wait;
                        }
                }
                /*
                 * If we've seen completions, retry without waiting. This
                 * avoids a race where a completion comes in before we did
                 * prepare_to_wait().
                 */
                if (inflight == tctx_inflight(tctx, !cancel_all))
                        schedule();
end_wait:
                finish_wait(&tctx->wait, &wait);
        } while (1);

        io_uring_clean_tctx(tctx);
        if (cancel_all) {
                /*
                 * We shouldn't run task_works after cancel, so just leave
                 * ->in_cancel set for normal exit.
                 */
                atomic_dec(&tctx->in_cancel);
                /* for exec all current's requests should be gone, kill tctx */
                __io_uring_free(current);
        }
}

/*
 * Cancel the current task's io_uring requests: shorthand for
 * io_uring_cancel_generic() with no SQPOLL data (@sqd == NULL).
 */
void __io_uring_cancel(bool cancel_all)
{
        io_uring_cancel_generic(cancel_all, NULL);
}

/*
 * Sanity-check the extended argument without using its contents: when
 * IORING_ENTER_EXT_ARG is set, @argsz must equal the size of
 * struct io_uring_getevents_arg and @argp must be readable.
 *
 * Returns 0 on success (or when EXT_ARG is not set), -EINVAL on a size
 * mismatch, -EFAULT if the copy from user space fails.
 */
static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
{
        struct io_uring_getevents_arg arg;

        if (!(flags & IORING_ENTER_EXT_ARG))
                return 0;

        if (argsz != sizeof(arg))
                return -EINVAL;

        return copy_from_user(&arg, argp, sizeof(arg)) ? -EFAULT : 0;
}

/*
 * Decode the argp/argsz pair of io_uring_enter().
 *
 * Without IORING_ENTER_EXT_ARG, @argp is the sigset_t pointer itself, there
 * is no timespec and *@argsz is left alone. With it, @argp points to a
 * struct io_uring_getevents_arg of exactly *@argsz bytes, from which the
 * sigmask pointer, the sigmask size (stored back into *@argsz) and the
 * timespec pointer are taken.
 *
 * Returns 0 on success, -EINVAL on a size mismatch or non-zero padding,
 * -EFAULT if the copy from user space fails.
 */
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
                          struct __kernel_timespec __user **ts,
                          const sigset_t __user **sig)
{
        struct io_uring_getevents_arg ext;

        if (flags & IORING_ENTER_EXT_ARG) {
                /* both sides must agree on the layout of the extended arg */
                if (*argsz != sizeof(ext))
                        return -EINVAL;
                if (copy_from_user(&ext, argp, sizeof(ext)))
                        return -EFAULT;
                if (ext.pad)
                        return -EINVAL;

                *sig = u64_to_user_ptr(ext.sigmask);
                *argsz = ext.sigmask_sz;
                *ts = u64_to_user_ptr(ext.ts);
                return 0;
        }

        /* plain form: argp is just the signal mask, no timeout */
        *sig = (const sigset_t __user *) argp;
        *ts = NULL;
        return 0;
}

/*
 * io_uring_enter() system call: submit up to @to_submit SQEs on the ring
 * identified by @fd and/or, with IORING_ENTER_GETEVENTS, wait for
 * @min_complete completions.
 *
 * @fd is an index into the task's registered-ring array when
 * IORING_ENTER_REGISTERED_RING is set, otherwise a regular file descriptor.
 * @argp/@argsz are interpreted by io_get_ext_arg() / io_validate_ext_arg()
 * according to IORING_ENTER_EXT_ARG.
 *
 * Returns the submission result when it is non-zero (submit count or
 * error); the result of the wait is only reported when that is zero.
 * Other failures return a negative errno directly.
 */
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                u32, min_complete, u32, flags, const void __user *, argp,
                size_t, argsz)
{
        struct io_ring_ctx *ctx;
        struct file *file;
        long ret;

        /* reject any flag bits we do not know about */
        if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
                               IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
                               IORING_ENTER_REGISTERED_RING)))
                return -EINVAL;

        /*
         * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
         * need only dereference our task private array to find it.
         */
        if (flags & IORING_ENTER_REGISTERED_RING) {
                struct io_uring_task *tctx = current->io_uring;

                if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
                        return -EINVAL;
                /* clamp the index under speculation before using it */
                fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
                file = tctx->registered_rings[fd];
                if (unlikely(!file))
                        return -EBADF;
        } else {
                file = fget(fd);
                if (unlikely(!file))
                        return -EBADF;
                ret = -EOPNOTSUPP;
                if (unlikely(!io_is_uring_fops(file)))
                        goto out;
        }

        ctx = file->private_data;
        ret = -EBADFD;
        if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
                goto out;

        /*
         * For SQ polling, the thread will do all submissions and completions.
         * Just return the requested submit count, and wake the thread if
         * we were asked to.
         */
        ret = 0;
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                if (unlikely(ctx->sq_data->thread == NULL)) {
                        ret = -EOWNERDEAD;
                        goto out;
                }
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sq_data->wait);
                if (flags & IORING_ENTER_SQ_WAIT)
                        io_sqpoll_wait_sq(ctx);

                ret = to_submit;
        } else if (to_submit) {
                ret = io_uring_add_tctx_node(ctx);
                if (unlikely(ret))
                        goto out;

                mutex_lock(&ctx->uring_lock);
                ret = io_submit_sqes(ctx, to_submit);
                /* short or failed submission: report it and skip any waiting */
                if (ret != to_submit) {
                        mutex_unlock(&ctx->uring_lock);
                        goto out;
                }
                if (flags & IORING_ENTER_GETEVENTS) {
                        /* jump into the iopoll branch below with uring_lock still held */
                        if (ctx->syscall_iopoll)
                                goto iopoll_locked;
                        /*
                         * Ignore errors, we'll soon call io_cqring_wait() and
                         * it should handle ownership problems if any.
                         */
                        if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
                                (void)io_run_local_work_locked(ctx, min_complete);
                }
                mutex_unlock(&ctx->uring_lock);
        }

        if (flags & IORING_ENTER_GETEVENTS) {
                int ret2;

                if (ctx->syscall_iopoll) {
                        /*
                         * We disallow the app entering submit/complete with
                         * polling, but we still need to lock the ring to
                         * prevent racing with polled issue that got punted to
                         * a workqueue.
                         */
                        mutex_lock(&ctx->uring_lock);
iopoll_locked:
                        ret2 = io_validate_ext_arg(flags, argp, argsz);
                        if (likely(!ret2)) {
                                /* never wait for more completions than the CQ ring holds */
                                min_complete = min(min_complete,
                                                   ctx->cq_entries);
                                ret2 = io_iopoll_check(ctx, min_complete);
                        }
                        mutex_unlock(&ctx->uring_lock);
                } else {
                        const sigset_t __user *sig;
                        struct __kernel_timespec __user *ts;

                        ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
                        if (likely(!ret2)) {
                                min_complete = min(min_complete,
                                                   ctx->cq_entries);
                                ret2 = io_cqring_wait(ctx, min_complete, sig,
                                                      argsz, ts);
                        }
                }

                /* a non-zero submission result takes precedence over the wait result */
                if (!ret) {
                        ret = ret2;

                        /*
                         * EBADR indicates that one or more CQE were dropped.
                         * Once the user has been informed we can clear the bit
                         * as they are obviously ok with those drops.
                         */
                        if (unlikely(ret2 == -EBADR))
                                clear_bit(IO_CHECK_CQ_DROPPED_BIT,
                                          &ctx->check_cq);
                }
        }
out:
        /* only the fget() path took a file reference that needs dropping */
        if (!(flags & IORING_ENTER_REGISTERED_RING))
                fput(file);
        return ret;
}

/*
 * File operations for the anonymous io_uring file created by
 * io_uring_get_file(). io_is_uring_fops() recognises io_uring files by
 * comparing f_op against this table.
 */
static const struct file_operations io_uring_fops = {
        .release        = io_uring_release,
        .mmap                = io_uring_mmap,
        .get_unmapped_area = io_uring_get_unmapped_area,
        /* only provided on kernels built without an MMU */
#ifndef CONFIG_MMU
        .mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
        .poll                = io_uring_poll,
        /* fdinfo output is only wired up when procfs is configured */
#ifdef CONFIG_PROC_FS
        .show_fdinfo        = io_uring_show_fdinfo,
#endif
};

/* Returns true if @file is an io_uring file, i.e. its f_op is io_uring_fops. */
bool io_is_uring_fops(struct file *file)
{
        const struct file_operations *fops = file->f_op;

        return fops == &io_uring_fops;
}

static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
                                         struct io_uring_params *p)
{
        struct io_rings *rings;
        size_t size, sq_array_offset;
        void *ptr;

        /* make sure these are sane, as we already accounted them */
        ctx->sq_entries = p->sq_entries;
        ctx->cq_entries = p->cq_entries;

        size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
        if (size == SIZE_MAX)
                return -EOVERFLOW;

        if (!(ctx->flags & IORING_SETUP_NO_MMAP))
                rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
        else
                rings = io_rings_map(ctx, p->cq_off.user_addr, size);

        if (IS_ERR(rings))
                return PTR_ERR(rings);

        ctx->rings = rings;
        if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
                ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
        rings->sq_ring_mask = p->sq_entries - 1;
        rings->cq_ring_mask = p->cq_entries - 1;
        rings->sq_ring_entries = p->sq_entries;
        rings->cq_ring_entries = p->cq_entries;

        if (p->flags & IORING_SETUP_SQE128)
                size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
        else
                size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
        if (size == SIZE_MAX) {
                io_rings_free(ctx);
                return -EOVERFLOW;
        }

        if (!(ctx->flags & IORING_SETUP_NO_MMAP))
                ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
        else
                ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);

        if (IS_ERR(ptr)) {
                io_rings_free(ctx);
                return PTR_ERR(ptr);
        }

        ctx->sq_sqes = ptr;
        return 0;
}

/*
 * Reserve an unused O_RDWR | O_CLOEXEC descriptor and install @file in it.
 * Returns the new fd, or the negative error from get_unused_fd_flags(), in
 * which case @file is not installed.
 */
static int io_uring_install_fd(struct file *file)
{
        int fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);

        if (fd >= 0)
                fd_install(fd, file);

        return fd;
}

/*
 * Create the anonymous "[io_uring]" file that is the application-visible
 * backing of an io_uring instance; the application mmaps it to get at the
 * SQ/CQ ring details. @ctx becomes the file's private data. Returns the
 * result of anon_inode_create_getfile() unchanged.
 */
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
        struct file *ring_file;

        /* A fresh inode is created so that the LSM can block the creation. */
        ring_file = anon_inode_create_getfile("[io_uring]", &io_uring_fops,
                                              ctx, O_RDWR | O_CLOEXEC, NULL);
        return ring_file;
}

/*
 * Build a new io_uring instance from the already copied-in parameters @p.
 *
 * Steps, in order:
 *  - validate @entries (non-zero; values above IORING_MAX_ENTRIES are
 *    clamped only when IORING_SETUP_CLAMP is set) and derive the SQ/CQ
 *    sizes, both rounded up to a power of two;
 *  - allocate the context and derive its completion/notification mode
 *    from the setup flags, rejecting inconsistent flag combinations;
 *  - allocate the rings, set up SQ offload and rsrc state;
 *  - fill in the offsets and feature bits in @p and copy @p back to
 *    @params;
 *  - create the backing file, register the ring with the current task's
 *    io_uring context and finally expose it either as a regular fd or as
 *    a registered ring index (IORING_SETUP_REGISTERED_FD_ONLY).
 *
 * Returns the non-negative value produced by the final install step on
 * success, or a negative errno.
 */
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
                                  struct io_uring_params __user *params)
{
        struct io_ring_ctx *ctx;
        struct io_uring_task *tctx;
        struct file *file;
        int ret;

        if (!entries)
                return -EINVAL;
        if (entries > IORING_MAX_ENTRIES) {
                if (!(p->flags & IORING_SETUP_CLAMP))
                        return -EINVAL;
                entries = IORING_MAX_ENTRIES;
        }

        /* REGISTERED_FD_ONLY is only accepted together with NO_MMAP. */
        if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
            && !(p->flags & IORING_SETUP_NO_MMAP))
                return -EINVAL;

        /*
         * Use twice as many entries for the CQ ring. It's possible for the
         * application to drive a higher depth than the size of the SQ ring,
         * since the sqes are only used at submission time. This allows for
         * some flexibility in overcommitting a bit. If the application has
         * set IORING_SETUP_CQSIZE, it will have passed in the desired number
         * of CQ ring entries manually.
         */
        p->sq_entries = roundup_pow_of_two(entries);
        if (p->flags & IORING_SETUP_CQSIZE) {
                /*
                 * If IORING_SETUP_CQSIZE is set, we do the same roundup
                 * to a power-of-two, if it isn't already. We do NOT impose
                 * any cq vs sq ring sizing.
                 */
                if (!p->cq_entries)
                        return -EINVAL;
                if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
                        if (!(p->flags & IORING_SETUP_CLAMP))
                                return -EINVAL;
                        p->cq_entries = IORING_MAX_CQ_ENTRIES;
                }
                p->cq_entries = roundup_pow_of_two(p->cq_entries);
                /* The CQ ring may not end up smaller than the SQ ring. */
                if (p->cq_entries < p->sq_entries)
                        return -EINVAL;
        } else {
                p->cq_entries = 2 * p->sq_entries;
        }

        ctx = io_ring_ctx_alloc(p);
        if (!ctx)
                return -ENOMEM;

        /* task_complete: DEFER_TASKRUN without IOPOLL and without SQPOLL. */
        if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
            !(ctx->flags & IORING_SETUP_IOPOLL) &&
            !(ctx->flags & IORING_SETUP_SQPOLL))
                ctx->task_complete = true;

        if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL))
                ctx->lockless_cq = true;

        /*
         * lazy poll_wq activation relies on ->task_complete for synchronisation
         * purposes, see io_activate_pollwq()
         */
        if (!ctx->task_complete)
                ctx->poll_activated = true;

        /*
         * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
         * space applications don't need to do io completion events
         * polling again, they can rely on io_sq_thread to do polling
         * work, which can reduce cpu usage and uring_lock contention.
         */
        if (ctx->flags & IORING_SETUP_IOPOLL &&
            !(ctx->flags & IORING_SETUP_SQPOLL))
                ctx->syscall_iopoll = 1;

        ctx->compat = in_compat_syscall();
        if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
                ctx->user = get_uid(current_user());

        /*
         * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
         * COOP_TASKRUN is set, then IPIs are never needed by the app.
         */
        /* Error code used by the flag-combination checks that follow. */
        ret = -EINVAL;
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                /* IPI related flags don't make sense with SQPOLL */
                if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
                                  IORING_SETUP_TASKRUN_FLAG |
                                  IORING_SETUP_DEFER_TASKRUN))
                        goto err;
                ctx->notify_method = TWA_SIGNAL_NO_IPI;
        } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
                ctx->notify_method = TWA_SIGNAL_NO_IPI;
        } else {
                /* TASKRUN_FLAG without COOP_TASKRUN requires DEFER_TASKRUN. */
                if (ctx->flags & IORING_SETUP_TASKRUN_FLAG &&
                    !(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
                        goto err;
                ctx->notify_method = TWA_SIGNAL;
        }

        /*
         * For DEFER_TASKRUN we require the completion task to be the same as the
         * submission task. This implies that there is only one submitter, so enforce
         * that.
         */
        if (ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
            !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
                goto err;
        }

        /*
         * This is just grabbed for accounting purposes. When a process exits,
         * the mm is exited and dropped before the files, hence we need to hang
         * on to this mm purely for the purposes of being able to unaccount
         * memory (locked/pinned vm). It's not used for anything else.
         */
        mmgrab(current->mm);
        ctx->mm_account = current->mm;

        ret = io_allocate_scq_urings(ctx, p);
        if (ret)
                goto err;

        ret = io_sq_offload_create(ctx, p);
        if (ret)
                goto err;

        ret = io_rsrc_init(ctx);
        if (ret)
                goto err;

        /* Offsets of the SQ ring fields within struct io_rings. */
        p->sq_off.head = offsetof(struct io_rings, sq.head);
        p->sq_off.tail = offsetof(struct io_rings, sq.tail);
        p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
        p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
        p->sq_off.flags = offsetof(struct io_rings, sq_flags);
        p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
        if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
                p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
        p->sq_off.resv1 = 0;
        /* user_addr is only cleared when the rings are not user-provided. */
        if (!(ctx->flags & IORING_SETUP_NO_MMAP))
                p->sq_off.user_addr = 0;

        /* Offsets of the CQ ring fields within struct io_rings. */
        p->cq_off.head = offsetof(struct io_rings, cq.head);
        p->cq_off.tail = offsetof(struct io_rings, cq.tail);
        p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
        p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
        p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
        p->cq_off.cqes = offsetof(struct io_rings, cqes);
        p->cq_off.flags = offsetof(struct io_rings, cq_flags);
        p->cq_off.resv1 = 0;
        if (!(ctx->flags & IORING_SETUP_NO_MMAP))
                p->cq_off.user_addr = 0;

        p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
                        IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
                        IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
                        IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
                        IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
                        IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
                        IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
                        IORING_FEAT_RECVSEND_BUNDLE;

        /* Report the final sizes, offsets and feature bits to userspace. */
        if (copy_to_user(params, p, sizeof(*p))) {
                ret = -EFAULT;
                goto err;
        }

        /*
         * For a single-issuer ring that is not created disabled, the
         * current task is recorded as the submitter (a task reference is
         * taken via get_task_struct()).
         */
        if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
            && !(ctx->flags & IORING_SETUP_R_DISABLED))
                WRITE_ONCE(ctx->submitter_task, get_task_struct(current));

        file = io_uring_get_file(ctx);
        if (IS_ERR(file)) {
                ret = PTR_ERR(file);
                goto err;
        }

        ret = __io_uring_add_tctx_node(ctx);
        if (ret)
                goto err_fput;
        tctx = current->io_uring;

        /*
         * Install ring fd as the very last thing, so we don't risk someone
         * having closed it before we finish setup
         */
        if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
                ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX);
        else
                ret = io_uring_install_fd(file);
        if (ret < 0)
                goto err_fput;

        trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
        return ret;
err:
        /* No file exists yet on this path: tear the context down directly. */
        io_ring_ctx_wait_and_kill(ctx);
        return ret;
err_fput:
        /*
         * The file already exists here, so only its reference is dropped.
         * NOTE(review): ctx is presumably torn down when the file is
         * released - confirm against io_uring_fops.
         */
        fput(file);
        return ret;
}

/*
 * Set up an io_uring context and return its fd. The application asks for a
 * ring size; the actual sq/cq ring sizes (among other things) are returned
 * in the params structure it passed in.
 *
 * The parameter block is copied in from @params, rejected with -EINVAL if
 * any reserved word is non-zero or an unknown setup flag is present, and
 * then handed to io_uring_create(). A failed copy-in yields -EFAULT.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
        /* Every setup flag this function accepts. */
        const unsigned int supported =
                        IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
                        IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
                        IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
                        IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
                        IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
                        IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
                        IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
                        IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
                        IORING_SETUP_NO_SQARRAY;
        struct io_uring_params p;
        int i = 0;

        if (copy_from_user(&p, params, sizeof(p)))
                return -EFAULT;

        /* All reserved words must be zero. */
        while (i < ARRAY_SIZE(p.resv)) {
                if (p.resv[i])
                        return -EINVAL;
                i++;
        }

        /* Any bit outside the supported set is an error. */
        if (p.flags & ~supported)
                return -EINVAL;

        return io_uring_create(entries, &p, params);
}

static inline bool io_uring_allowed(void)
{
        int disabled = READ_ONCE(sysctl_io_uring_disabled);
        kgid_t io_uring_group;

        if (disabled == 2)
                return false;

        if (disabled == 0 || capable(CAP_SYS_ADMIN))
                return true;

        io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group);
        if (!gid_valid(io_uring_group))
                return false;

        return in_group_p(io_uring_group);
}

/*
 * io_uring_setup system call: gate on io_uring_allowed() and forward to
 * io_uring_setup(); callers that are not allowed get -EPERM.
 */
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
                struct io_uring_params __user *, params)
{
        if (io_uring_allowed())
                return io_uring_setup(entries, params);

        return -EPERM;
}

/*
 * Init-time setup for io_uring (registered through __initcall()).
 *
 * The first part is purely compile-time: BUILD_BUG_ON() checks pin the
 * size of struct io_uring_sqe and the offset/size of each of its members,
 * plus a few related layout and flag-width invariants.
 *
 * The runtime part calls io_uring_optable_init(), creates the "io_kiocb"
 * and io_buffer slab caches, allocates the "iou_exit" workqueue and, with
 * CONFIG_SYSCTL, registers the kernel_io_uring_disabled_table sysctls.
 *
 * Always returns 0. Both slab caches are created with SLAB_PANIC, and a
 * failed workqueue allocation is treated as equally fatal (BUG_ON), since
 * nothing here can recover from a NULL iou_wq.
 */
static int __init io_uring_init(void)
{
/* Assert that member @ename of @stype sits at @eoffset and is @esize bytes. */
#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
        BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
        BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \
} while (0)

/* Same check for struct io_uring_sqe, with the size given as a type ... */
#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
        __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename)
/* ... or as an explicit byte count. */
#define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \
        __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename)
        BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
        BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
        BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
        BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
        BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
        BUILD_BUG_SQE_ELEM(8,  __u64,  off);
        BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
        BUILD_BUG_SQE_ELEM(8,  __u32,  cmd_op);
        BUILD_BUG_SQE_ELEM(12, __u32, __pad1);
        BUILD_BUG_SQE_ELEM(16, __u64,  addr);
        BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
        BUILD_BUG_SQE_ELEM(24, __u32,  len);
        BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
        BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
        BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
        BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
        BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
        BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
        BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  rename_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  unlink_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  hardlink_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  xattr_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  msg_ring_flags);
        BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
        BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
        BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
        BUILD_BUG_SQE_ELEM(42, __u16,  personality);
        BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
        BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
        BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
        BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
        BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
        BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
        BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);

        BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
                     sizeof(struct io_uring_rsrc_update));
        BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
                     sizeof(struct io_uring_rsrc_update2));

        /* ->buf_index is u16 */
        BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
        BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
                     offsetof(struct io_uring_buf_ring, tail));

        /* should fit into one byte */
        BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
        BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
        BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);

        BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof_field(struct io_kiocb, flags));

        BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));

        /* top 8bits are for internal use */
        BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);

        io_uring_optable_init();

        /*
         * Allow user copy in the per-command field, which starts after the
         * file in io_kiocb and until the opcode field. The openat2 handling
         * requires copying in user memory into the io_kiocb object in that
         * range, and HARDENED_USERCOPY will complain if we haven't
         * correctly annotated this range.
         */
        req_cachep = kmem_cache_create_usercopy("io_kiocb",
                                sizeof(struct io_kiocb), 0,
                                SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
                                offsetof(struct io_kiocb, cmd.data),
                                sizeof_field(struct io_kiocb, cmd.data), NULL);
        io_buf_cachep = KMEM_CACHE(io_buffer,
                                          SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);

        iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64);
        /*
         * alloc_workqueue() can fail. Stop here rather than leave a NULL
         * workqueue behind for later users; this matches the SLAB_PANIC
         * policy applied to the caches above.
         */
        BUG_ON(!iou_wq);

#ifdef CONFIG_SYSCTL
        register_sysctl_init("kernel", kernel_io_uring_disabled_table);
#endif

        return 0;
}
__initcall(io_uring_init);




























































   14 


























































































   13 











































































































































   13 













   13 




    1 






















   14 





   15 
    1 






















   13 


























































   15 


























   13 














   14 














    1 








   14 








    4 








   15 


    1 

















   10 





























































































































































































   10 














   14 



    1 










   14 


    5 
   11 
























































































































































































































    2 


















    2 




    2 
















    2 

























































    1 








    2 


































    1 















































    2 









   13 




    1 













   13 







   13 

   14 
















   14 
















   12 




















   14 


    2 





























    2 





















    2 
















    2 















































































































































































   14 






















   14 








   15 





































    3 



    4 










    3 


















    4 







































    4 



























    1 









































































    1 






































































    1 





















   11 









    4 














   15 



   15 




































   13 

























   14 







   15 






























   15 

























    4 






















    1 
    4 



















    4 




    4 


    4 











    1 








    4 



    4 

    4 

    4 














    1 





















































    2 




    2 












    2 
















    2 
































































































































































































































































































































    3 




    4 


















    4 

    4 





    4 

    4 








    4 









    4 



    4 


    3 
    4 



































































































    1 

















    1 




    1 









    1 












    1 








    1 



































    1 




    1 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/transaction.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Generic filesystem transaction handling code; part of the ext2fs
 * journaling system.
 *
 * This file manages transactions (compound commits managed by the
 * journaling code) and handles (individual atomic operations by the
 * filesystem).
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/backing-dev.h>
#include <linux/bug.h>
#include <linux/module.h>
#include <linux/sched/mm.h>

#include <trace/events/jbd2.h>

/* Forward declarations of file-local journal_head helpers. */
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
static void __jbd2_journal_unfile_buffer(struct journal_head *jh);

/*
 * Slab cache from which transaction_t objects are allocated.  It is created
 * by jbd2_journal_init_transaction_cache() and reset to NULL by
 * jbd2_journal_destroy_transaction_cache().
 */
static struct kmem_cache *transaction_cache;
/*
 * Create the slab cache that backs transaction_t allocations.
 *
 * The cache must not exist yet (asserted).  Returns 0 on success or
 * -ENOMEM, after logging an emergency message, if the cache could not be
 * created.
 */
int __init jbd2_journal_init_transaction_cache(void)
{
        J_ASSERT(!transaction_cache);

        transaction_cache = kmem_cache_create("jbd2_transaction_s",
                                              sizeof(transaction_t), 0,
                                              SLAB_HWCACHE_ALIGN | SLAB_TEMPORARY,
                                              NULL);
        if (transaction_cache)
                return 0;

        pr_emerg("JBD2: failed to create transaction cache\n");
        return -ENOMEM;
}

/*
 * Destroy the transaction slab cache and clear the pointer to it, so that
 * the !transaction_cache assertion in jbd2_journal_init_transaction_cache()
 * holds again afterwards.
 */
void jbd2_journal_destroy_transaction_cache(void)
{
        kmem_cache_destroy(transaction_cache);
        transaction_cache = NULL;
}

/*
 * Return a transaction_t to the transaction slab cache.
 *
 * Pointers for which ZERO_OR_NULL_PTR() is true are silently ignored, so
 * callers may pass NULL (start_this_handle() does so when it never
 * allocated a spare transaction).
 */
void jbd2_journal_free_transaction(transaction_t *transaction)
{
        /* Nothing to free; treated as the unlikely case. */
        if (unlikely(ZERO_OR_NULL_PTR(transaction)))
                return;
        kmem_cache_free(transaction_cache, transaction);
}

/*
 * Base amount of descriptor blocks we reserve for each transaction: one
 * commit block plus enough descriptor blocks to hold a tag for each of the
 * journal's j_max_transaction_buffers buffers.
 */
static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
{
        int tags_per_block;
        int tag_space;

        /*
         * Space available for tags in one descriptor block: the block size
         * minus the journal header, the 16-byte UUID and, when v2/v3
         * checksums are in use, the block tail.
         */
        tag_space = journal->j_blocksize - sizeof(journal_header_t);
        tag_space -= 16;
        if (jbd2_journal_has_csum_v2or3(journal))
                tag_space -= sizeof(struct jbd2_journal_block_tail);

        /* Commit code leaves a slack space of 16 bytes at the end of block */
        tag_space -= 16;
        tags_per_block = tag_space / journal_tag_bytes(journal);

        /*
         * Revoke descriptors are accounted separately, so only the commit
         * block (the leading 1) and the normal transaction descriptor
         * blocks need to be reserved here.
         */
        return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
                                tags_per_block);
}

/*
 * jbd2_get_transaction: obtain a new transaction_t object.
 *
 * Simply initialise a new transaction. Initialize it in
 * RUNNING state and add it to the current journal (which should not
 * have an existing running transaction: we only make a new transaction
 * once we have started to commit the old one).
 *
 * The caller supplies the transaction_t storage; this function fills in
 * its fields, arms the journal's commit timer for the transaction's expiry
 * time and installs it as journal->j_running_transaction.
 *
 * Preconditions:
 *        The journal MUST be locked.  We don't perform atomic mallocs on the
 *        new transaction and we can't block without protecting against other
 *        processes trying to touch the journal while it is in transition.
 *
 */

static void jbd2_get_transaction(journal_t *journal,
                                transaction_t *transaction)
{
        transaction->t_journal = journal;
        transaction->t_state = T_RUNNING;
        transaction->t_start_time = ktime_get();
        /* Take the next transaction ID and advance the journal's sequence. */
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        atomic_set(&transaction->t_updates, 0);
        /*
         * Start the credit count at the per-transaction descriptor block
         * reservation plus whatever is currently reserved in the journal.
         */
        atomic_set(&transaction->t_outstanding_credits,
                   jbd2_descriptor_blocks_per_trans(journal) +
                   atomic_read(&journal->j_reserved_credits));
        atomic_set(&transaction->t_outstanding_revokes, 0);
        atomic_set(&transaction->t_handle_count, 0);
        INIT_LIST_HEAD(&transaction->t_inode_list);
        INIT_LIST_HEAD(&transaction->t_private_list);

        /* Set up the commit timer for the new transaction. */
        journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
        add_timer(&journal->j_commit_timer);

        /* There must be no running transaction when we install this one. */
        J_ASSERT(journal->j_running_transaction == NULL);
        journal->j_running_transaction = transaction;
        transaction->t_max_wait = 0;
        /* t_start is the reference point used by update_t_max_wait(). */
        transaction->t_start = jiffies;
        transaction->t_requested = 0;
}

/*
 * Handle management.
 *
 * A handle_t is an object which represents a single atomic update to a
 * filesystem, and which tracks all of the modifications which form part
 * of that one update.
 */

/*
 * Update the transaction's maximum wait time.
 *
 * @ts is the jiffies value the caller sampled before it started trying to
 * join a transaction.  If the transaction was started after @ts, the
 * difference between the two is recorded in t_max_wait when it exceeds the
 * value stored there; otherwise nothing is done.
 *
 * There could be multiple threads trying to do this simultaneously, so
 * t_max_wait is updated with an atomic compare-and-exchange loop instead of
 * taking a lock.  The update is unconditional: it does not depend on jbd2
 * debugging being enabled.
 */
static inline void update_t_max_wait(transaction_t *transaction,
                                     unsigned long ts)
{
        unsigned long waited, seen;

        if (!time_after(transaction->t_start, ts))
                return;

        waited = jbd2_time_diff(ts, transaction->t_start);

        /* cmpxchg() hands back the value it found; stop once it is >= ours. */
        for (seen = READ_ONCE(transaction->t_max_wait); seen < waited; )
                seen = cmpxchg(&transaction->t_max_wait, seen, waited);
}

/*
 * Wait until running transaction passes to T_FLUSH state and new transaction
 * can thus be started. Also starts the commit if needed. The function expects
 * running transaction to exist and releases j_state_lock.
 *
 * Must be entered with j_state_lock held for reading.  The lock is not
 * re-taken before returning, so the caller has to re-acquire it and
 * re-evaluate the journal state.
 */
static void wait_transaction_locked(journal_t *journal)
        __releases(journal->j_state_lock)
{
        DEFINE_WAIT(wait);
        int need_to_start;
        /* Read the tid while j_state_lock is still held. */
        tid_t tid = journal->j_running_transaction->t_tid;

        /* Queue ourselves on the wait queue before dropping the lock. */
        prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
                        TASK_UNINTERRUPTIBLE);
        /* A commit must be kicked only if none was requested for tid yet. */
        need_to_start = !tid_geq(journal->j_commit_request, tid);
        read_unlock(&journal->j_state_lock);
        if (need_to_start)
                jbd2_log_start_commit(journal, tid);
        jbd2_might_wait_for_commit(journal);
        schedule();
        finish_wait(&journal->j_wait_transaction_locked, &wait);
}

/*
 * Wait until running transaction transitions from T_SWITCH to T_FLUSH
 * state and new transaction can thus be started. The function releases
 * j_state_lock.
 *
 * Must be entered with j_state_lock held for reading.  If there is no
 * running transaction, or it is not in T_SWITCH state, a warning is issued
 * and the function returns right after dropping the lock, without sleeping.
 */
static void wait_transaction_switching(journal_t *journal)
        __releases(journal->j_state_lock)
{
        DEFINE_WAIT(wait);

        /* Sanity check: callers should only get here in T_SWITCH state. */
        if (WARN_ON(!journal->j_running_transaction ||
                    journal->j_running_transaction->t_state != T_SWITCH)) {
                read_unlock(&journal->j_state_lock);
                return;
        }
        /* Queue ourselves on the wait queue before dropping the lock. */
        prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
                        TASK_UNINTERRUPTIBLE);
        read_unlock(&journal->j_state_lock);
        /*
         * We don't call jbd2_might_wait_for_commit() here as there's no
         * waiting for outstanding handles happening anymore in T_SWITCH state
         * and handling of reserved handles actually relies on that for
         * correctness.
         */
        schedule();
        finish_wait(&journal->j_wait_transaction_locked, &wait);
}

/*
 * Give @blocks credits back to the journal-wide reserved credit pool
 * (j_reserved_credits) and wake up anybody sleeping on j_wait_reserved so
 * they can re-check whether their reservation now fits.
 */
static void sub_reserved_credits(journal_t *journal, int blocks)
{
        atomic_sub(blocks, &journal->j_reserved_credits);
        wake_up(&journal->j_wait_reserved);
}

/*
 * Wait until we can add credits for handle to the running transaction.  Called
 * with j_state_lock held for reading. Returns 0 if handle joined the running
 * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
 * caller must retry.
 *
 * @blocks is the number of credits the handle itself needs and @rsv_blocks
 * the number of credits to add to journal->j_reserved_credits for its
 * reserved handle (0 if there is none).  Both are charged against the
 * running transaction's t_outstanding_credits; on every path that returns 1
 * after charging them, the charge is undone first.
 *
 * Note: because j_state_lock may be dropped depending on the return
 * value, we need to fake out sparse so it doesn't complain about a
 * locking imbalance.  Callers of add_transaction_credits will need to
 * make a similar accommodation.
 */
static int add_transaction_credits(journal_t *journal, int blocks,
                                   int rsv_blocks)
__must_hold(&journal->j_state_lock)
{
        transaction_t *t = journal->j_running_transaction;
        int needed;
        int total = blocks + rsv_blocks;

        /*
         * If the current transaction is locked down for commit, wait
         * for the lock to be released.
         */
        if (t->t_state != T_RUNNING) {
                WARN_ON_ONCE(t->t_state >= T_FLUSH);
                wait_transaction_locked(journal);
                __acquire(&journal->j_state_lock); /* fake out sparse */
                return 1;
        }

        /*
         * If there is not enough space left in the log to write all
         * potential buffers requested by this operation, we need to
         * stall pending a log checkpoint to free some more log space.
         */
        needed = atomic_add_return(total, &t->t_outstanding_credits);
        if (needed > journal->j_max_transaction_buffers) {
                /*
                 * If the current transaction is already too large,
                 * then start to commit it: we can then go back and
                 * attach this handle to a new transaction.
                 */
                atomic_sub(total, &t->t_outstanding_credits);

                /*
                 * Is the number of reserved credits in the current transaction too
                 * big to fit this handle? Wait until reserved credits are freed.
                 */
                if (atomic_read(&journal->j_reserved_credits) + total >
                    journal->j_max_transaction_buffers) {
                        read_unlock(&journal->j_state_lock);
                        jbd2_might_wait_for_commit(journal);
                        wait_event(journal->j_wait_reserved,
                                   atomic_read(&journal->j_reserved_credits) + total <=
                                   journal->j_max_transaction_buffers);
                        __acquire(&journal->j_state_lock); /* fake out sparse */
                        return 1;
                }

                /* Drops j_state_lock and starts the commit if necessary. */
                wait_transaction_locked(journal);
                __acquire(&journal->j_state_lock); /* fake out sparse */
                return 1;
        }

        /*
         * The commit code assumes that it can get enough log space
         * without forcing a checkpoint.  This is *critical* for
         * correctness: a checkpoint of a buffer which is also
         * associated with a committing transaction creates a deadlock,
         * so commit simply cannot force through checkpoints.
         *
         * We must therefore ensure the necessary space in the journal
         * *before* starting to dirty potentially checkpointed buffers
         * in the new transaction.
         */
        if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
                atomic_sub(total, &t->t_outstanding_credits);
                read_unlock(&journal->j_state_lock);
                jbd2_might_wait_for_commit(journal);
                /*
                 * Re-check under the write lock: the space may have been
                 * freed while we held no lock at all.
                 */
                write_lock(&journal->j_state_lock);
                if (jbd2_log_space_left(journal) <
                                        journal->j_max_transaction_buffers)
                        __jbd2_log_wait_for_space(journal);
                write_unlock(&journal->j_state_lock);
                __acquire(&journal->j_state_lock); /* fake out sparse */
                return 1;
        }

        /* No reservation? We are done... */
        if (!rsv_blocks)
                return 0;

        needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
        /* We allow at most half of a transaction to be reserved */
        if (needed > journal->j_max_transaction_buffers / 2) {
                /* Undo both the reservation and the transaction charge. */
                sub_reserved_credits(journal, rsv_blocks);
                atomic_sub(total, &t->t_outstanding_credits);
                read_unlock(&journal->j_state_lock);
                jbd2_might_wait_for_commit(journal);
                wait_event(journal->j_wait_reserved,
                         atomic_read(&journal->j_reserved_credits) + rsv_blocks
                         <= journal->j_max_transaction_buffers / 2);
                __acquire(&journal->j_state_lock); /* fake out sparse */
                return 1;
        }
        return 0;
}

/*
 * start_this_handle: Given a handle, deal with any locking or stalling
 * needed to make sure that there is enough journal space for the handle
 * to begin.  Attach the handle to a transaction and set up the
 * transaction's buffer credits.
 *
 * @gfp_mask is used to allocate a new transaction_t if the journal has no
 * running transaction.
 *
 * Returns 0 on success; the handle is then attached to the running
 * transaction, current->journal_info points to it and a NOFS allocation
 * scope has been entered (its cookie is kept in handle->saved_alloc_context).
 * Returns -ENOSPC if the handle asks for more credits than are allowed,
 * -ENOMEM if a new transaction could not be allocated, and -EROFS if the
 * journal is aborted or carries an error that has not been acknowledged
 * (JBD2_ACK_ERR).
 */

static int start_this_handle(journal_t *journal, handle_t *handle,
                             gfp_t gfp_mask)
{
        transaction_t        *transaction, *new_transaction = NULL;
        int                blocks = handle->h_total_credits;
        int                rsv_blocks = 0;
        /* Start of the wait, used for the t_max_wait statistic below. */
        unsigned long ts = jiffies;

        if (handle->h_rsv_handle)
                rsv_blocks = handle->h_rsv_handle->h_total_credits;

        /*
         * Limit the number of reserved credits to 1/2 of maximum transaction
         * size and limit the number of total credits to not exceed maximum
         * transaction size per operation.
         */
        if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
            (rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
                printk(KERN_ERR "JBD2: %s wants too many credits "
                       "credits:%d rsv_credits:%d max:%d\n",
                       current->comm, blocks, rsv_blocks,
                       journal->j_max_transaction_buffers);
                WARN_ON(1);
                return -ENOSPC;
        }

alloc_transaction:
        /*
         * This check is racy but it is just an optimization of allocating new
         * transaction early if there are high chances we'll need it. If we
         * guess wrong, we'll retry or free unused transaction.
         */
        if (!data_race(journal->j_running_transaction)) {
                /*
                 * If __GFP_FS is not present, then we may be being called from
                 * inside the fs writeback layer, so we MUST NOT fail.
                 */
                if ((gfp_mask & __GFP_FS) == 0)
                        gfp_mask |= __GFP_NOFAIL;
                new_transaction = kmem_cache_zalloc(transaction_cache,
                                                    gfp_mask);
                if (!new_transaction)
                        return -ENOMEM;
        }

        jbd2_debug(3, "New handle %p going live.\n", handle);

        /*
         * We need to hold j_state_lock until t_updates has been incremented,
         * for proper journal barrier handling
         */
repeat:
        read_lock(&journal->j_state_lock);
        BUG_ON(journal->j_flags & JBD2_UNMOUNT);
        if (is_journal_aborted(journal) ||
            (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
                read_unlock(&journal->j_state_lock);
                jbd2_journal_free_transaction(new_transaction);
                return -EROFS;
        }

        /*
         * Wait on the journal's transaction barrier if necessary. Specifically
         * we allow reserved handles to proceed because otherwise commit could
         * deadlock on page writeback not being able to complete.
         */
        if (!handle->h_reserved && journal->j_barrier_count) {
                read_unlock(&journal->j_state_lock);
                wait_event(journal->j_wait_transaction_locked,
                                journal->j_barrier_count == 0);
                goto repeat;
        }

        if (!journal->j_running_transaction) {
                read_unlock(&journal->j_state_lock);
                /* The early guess was wrong; go and allocate one now. */
                if (!new_transaction)
                        goto alloc_transaction;
                /*
                 * Installing a transaction needs the write lock; re-check
                 * the conditions since the lock was dropped in between.
                 */
                write_lock(&journal->j_state_lock);
                if (!journal->j_running_transaction &&
                    (handle->h_reserved || !journal->j_barrier_count)) {
                        jbd2_get_transaction(journal, new_transaction);
                        /* Now owned by the journal; don't free it below. */
                        new_transaction = NULL;
                }
                write_unlock(&journal->j_state_lock);
                goto repeat;
        }

        transaction = journal->j_running_transaction;

        if (!handle->h_reserved) {
                /* We may have dropped j_state_lock - restart in that case */
                if (add_transaction_credits(journal, blocks, rsv_blocks)) {
                        /*
                         * add_transaction_credits releases
                         * j_state_lock on a non-zero return
                         */
                        __release(&journal->j_state_lock);
                        goto repeat;
                }
        } else {
                /*
                 * We have handle reserved so we are allowed to join T_LOCKED
                 * transaction and we don't have to check for transaction size
                 * and journal space. But we still have to wait while running
                 * transaction is being switched to a committing one as it
                 * won't wait for any handles anymore.
                 */
                if (transaction->t_state == T_SWITCH) {
                        wait_transaction_switching(journal);
                        goto repeat;
                }
                /* The reservation is consumed: the handle is a normal one now. */
                sub_reserved_credits(journal, blocks);
                handle->h_reserved = 0;
        }

        /* OK, account for the buffers that this operation expects to
         * use and add the handle to the running transaction.
         */
        update_t_max_wait(transaction, ts);
        handle->h_transaction = transaction;
        handle->h_requested_credits = blocks;
        handle->h_revoke_credits_requested = handle->h_revoke_credits;
        handle->h_start_jiffies = jiffies;
        atomic_inc(&transaction->t_updates);
        atomic_inc(&transaction->t_handle_count);
        jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
                  handle, blocks,
                  atomic_read(&transaction->t_outstanding_credits),
                  jbd2_log_space_left(journal));
        read_unlock(&journal->j_state_lock);
        current->journal_info = handle;

        rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
        /* Drop the spare transaction if it was not needed (NULL is fine). */
        jbd2_journal_free_transaction(new_transaction);
        /*
         * Ensure that no allocations done while the transaction is open are
         * going to recurse back to the fs layer.
         */
        handle->saved_alloc_context = memalloc_nofs_save();
        return 0;
}

/*
 * Allocate a new handle with GFP_NOFS, give it @nblocks total credits and
 * an initial reference count of one.  Returns NULL if the allocation fails.
 */
static handle_t *new_handle(int nblocks)
{
        handle_t *h = jbd2_alloc_handle(GFP_NOFS);

        if (h) {
                h->h_total_credits = nblocks;
                h->h_ref = 1;
        }

        return h;
}

/*
 * Obtain a handle on @journal.
 *
 * @nblocks:        buffer credits requested for the handle
 * @rsv_blocks:     if non-zero, also allocate a reserved handle with this
 *                  many credits and hang it off the new handle's h_rsv_handle
 * @revoke_records: revoke record credits; the descriptor blocks needed to
 *                  hold them are added to @nblocks
 * @gfp_mask:       allocation flags handed to start_this_handle()
 * @type, @line_no: stored in the handle and passed to the start tracepoint
 *
 * If the current task already has a handle open, that handle is returned
 * with its reference count raised and none of the arguments besides
 * @journal are looked at.
 *
 * Returns the handle, ERR_PTR(-EROFS) when @journal is NULL,
 * ERR_PTR(-ENOMEM) when a handle cannot be allocated, or the error from
 * start_this_handle() wrapped in ERR_PTR().
 */
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
                              int revoke_records, gfp_t gfp_mask,
                              unsigned int type, unsigned int line_no)
{
        handle_t *handle;
        int err;

        if (!journal)
                return ERR_PTR(-EROFS);

        /* Nested start: just take another reference on the open handle. */
        handle = journal_current_handle();
        if (handle) {
                J_ASSERT(handle->h_transaction->t_journal == journal);
                handle->h_ref++;
                return handle;
        }

        /* Revoke records need descriptor blocks of their own. */
        nblocks += DIV_ROUND_UP(revoke_records,
                                journal->j_revoke_records_per_block);

        handle = new_handle(nblocks);
        if (!handle)
                return ERR_PTR(-ENOMEM);

        if (rsv_blocks) {
                handle_t *rsv_handle = new_handle(rsv_blocks);

                if (!rsv_handle) {
                        err = -ENOMEM;
                        goto free_handle;
                }
                rsv_handle->h_reserved = 1;
                rsv_handle->h_journal = journal;
                handle->h_rsv_handle = rsv_handle;
        }
        handle->h_revoke_credits = revoke_records;

        err = start_this_handle(journal, handle, gfp_mask);
        if (err < 0)
                goto free_rsv_handle;

        handle->h_type = type;
        handle->h_line_no = line_no;
        trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
                                handle->h_transaction->t_tid, type,
                                line_no, nblocks);

        return handle;

free_rsv_handle:
        if (handle->h_rsv_handle)
                jbd2_free_handle(handle->h_rsv_handle);
free_handle:
        jbd2_free_handle(handle);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(jbd2__journal_start);


/**
 * jbd2_journal_start() - Obtain a new handle.
 * @journal: Journal to start transaction on.
 * @nblocks: number of block buffer we might modify
 *
 * We make sure that the transaction can guarantee at least nblocks of
 * modified buffers in the log.  We block until the log can guarantee
 * that much space.
 *
 * This is a wrapper around jbd2__journal_start() which asks for no reserved
 * blocks and no revoke records and uses GFP_NOFS. When jbd2__journal_start()
 * is called directly with rsv_blocks > 0, it also creates another
 * handle with rsv_blocks reserved blocks in the journal. This handle is
 * stored in h_rsv_handle. It is not attached to any particular transaction
 * and thus doesn't block transaction commit. If the caller uses this reserved
 * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
 * on the parent handle will dispose the reserved one. Reserved handle has to
 * be converted to a normal handle using jbd2_journal_start_reserved() before
 * it can be used.
 *
 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
 * on failure.
 */
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
        return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);

/*
 * Release the credits held by reserved handle @handle.
 *
 * The credits are always returned to the journal's reserved credit pool.
 * If @t is non-NULL they are also subtracted from that transaction's
 * t_outstanding_credits.  Warns if @handle is not marked as reserved.
 */
static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t)
{
        WARN_ON(!handle->h_reserved);

        sub_reserved_credits(handle->h_journal, handle->h_total_credits);
        if (!t)
                return;

        atomic_sub(handle->h_total_credits, &t->t_outstanding_credits);
}

/*
 * Dispose of a reserved handle that is not going to be started: give its
 * credits back (to the reserved pool and, if there is one, to the running
 * transaction) and free the handle.
 */
void jbd2_journal_free_reserved(handle_t *handle)
{
        journal_t *journal = handle->h_journal;
        transaction_t *running;

        /* Get j_state_lock to pin running transaction if it exists */
        read_lock(&journal->j_state_lock);
        running = journal->j_running_transaction;
        __jbd2_journal_unreserve_handle(handle, running);
        read_unlock(&journal->j_state_lock);

        jbd2_free_handle(handle);
}
EXPORT_SYMBOL(jbd2_journal_free_reserved);

/**
 * jbd2_journal_start_reserved() - start reserved handle
 * @handle: handle to start
 * @type: for handle statistics
 * @line_no: for handle statistics
 *
 * Start handle that has been previously reserved with jbd2_journal_reserve().
 * This attaches @handle to the running transaction (or creates one if there's
 * not transaction running). Unlike jbd2_journal_start() this function cannot
 * block on journal commit, checkpointing, or similar stuff. It can block on
 * memory allocation or frozen journal though.
 *
 * Return 0 on success, non-zero on error - handle is freed in that case.
 */
int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
                                unsigned int line_no)
{
        journal_t *journal = handle->h_journal;
        int err;

        if (WARN_ON(!handle->h_reserved)) {
                /* Someone passed in normal handle? Just stop it. */
                jbd2_journal_stop(handle);
                return -EIO;
        }

        /*
         * Usefulness of mixing of reserved and unreserved handles is
         * questionable. So far nobody seems to need it so just error out.
         */
        if (WARN_ON(current->journal_info)) {
                err = -EIO;
                goto free_reserved;
        }

        handle->h_journal = NULL;
        /*
         * GFP_NOFS is here because callers are likely from writeback or
         * similarly constrained call sites
         */
        err = start_this_handle(journal, handle, GFP_NOFS);
        if (err < 0) {
                /* Put the journal pointer back; the free path reads it. */
                handle->h_journal = journal;
                goto free_reserved;
        }

        handle->h_type = type;
        handle->h_line_no = line_no;
        trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
                                handle->h_transaction->t_tid, type,
                                line_no, handle->h_total_credits);
        return 0;

free_reserved:
        jbd2_journal_free_reserved(handle);
        return err;
}
EXPORT_SYMBOL(jbd2_journal_start_reserved);

/**
 * jbd2_journal_extend() - extend buffer credits.
 * @handle:  handle to 'extend'
 * @nblocks: nr blocks to try to extend by.
 * @revoke_records: number of revoke records to try to extend by.
 *
 * Some transactions, such as large extends and truncates, can be done
 * atomically all at once or in several stages.  The operation requests
 * a credit for a number of buffer modifications in advance, but can
 * extend its credit if it needs more.
 *
 * jbd2_journal_extend tries to give the running handle more buffer credits.
 * It does not guarantee that allocation - this is a best-effort only.
 * The calling process MUST be able to deal cleanly with a failure to
 * extend here.
 *
 * Return 0 on success, non-zero on failure.
 *
 * return code < 0 implies an error
 * return code > 0 implies normal transaction-full status.
 */
int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        int extra_descriptors;
        int total;
        /* Report "transaction full" unless the extension goes through. */
        int result = 1;

        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;

        read_lock(&journal->j_state_lock);

        /* Don't extend a locked-down transaction! */
        if (transaction->t_state != T_RUNNING) {
                jbd2_debug(3, "denied handle %p %d blocks: "
                          "transaction not running\n", handle, nblocks);
                goto unlock;
        }

        /*
         * The extra revoke records may need more descriptor blocks than
         * the handle has requested so far; charge only the difference.
         */
        extra_descriptors =
                DIV_ROUND_UP(handle->h_revoke_credits_requested + revoke_records,
                             journal->j_revoke_records_per_block) -
                DIV_ROUND_UP(handle->h_revoke_credits_requested,
                             journal->j_revoke_records_per_block);
        nblocks += extra_descriptors;

        total = atomic_add_return(nblocks,
                                  &transaction->t_outstanding_credits);
        if (total > journal->j_max_transaction_buffers) {
                jbd2_debug(3, "denied handle %p %d blocks: "
                          "transaction too large\n", handle, nblocks);
                /* Undo the charge; the transaction cannot take it. */
                atomic_sub(nblocks, &transaction->t_outstanding_credits);
                goto unlock;
        }

        trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
                                 transaction->t_tid,
                                 handle->h_type, handle->h_line_no,
                                 handle->h_total_credits,
                                 nblocks);

        handle->h_total_credits += nblocks;
        handle->h_requested_credits += nblocks;
        handle->h_revoke_credits += revoke_records;
        handle->h_revoke_credits_requested += revoke_records;
        result = 0;

        jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks);
unlock:
        read_unlock(&journal->j_state_lock);
        return result;
}

/*
 * Detach @handle from its transaction.
 *
 * The handle must be the current task's handle and the transaction must
 * still have updates outstanding (both asserted).  The handle's remaining
 * credits are returned to the transaction, an attached reserved handle is
 * unreserved, the transaction's update count is dropped (waking waiters on
 * j_wait_updates when it reaches zero) and the allocation context saved in
 * handle->saved_alloc_context is restored.
 */
static void stop_this_handle(handle_t *handle)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
        int used_revokes;

        J_ASSERT(journal_current_handle() == handle);
        J_ASSERT(atomic_read(&transaction->t_updates) > 0);
        current->journal_info = NULL;

        /*
         * Subtract necessary revoke descriptor blocks from handle credits. We
         * take care to account only for revoke descriptor blocks the
         * transaction will really need as large sequences of transactions with
         * small numbers of revokes are relatively common.
         */
        used_revokes = handle->h_revoke_credits_requested -
                       handle->h_revoke_credits;
        if (used_revokes != 0) {
                int per_block = journal->j_revoke_records_per_block;
                int total_after, blocks_after, blocks_before;

                WARN_ON_ONCE(DIV_ROUND_UP(used_revokes, per_block)
                                > handle->h_total_credits);
                total_after = atomic_add_return(used_revokes,
                                &transaction->t_outstanding_revokes);
                /*
                 * Descriptor blocks the transaction needs with and without
                 * this handle's revokes; the handle keeps the difference.
                 */
                blocks_after = DIV_ROUND_UP(total_after, per_block);
                blocks_before = DIV_ROUND_UP(total_after - used_revokes,
                                             per_block);
                handle->h_total_credits -= blocks_after - blocks_before;
        }

        /* Hand whatever credits are left back to the transaction. */
        atomic_sub(handle->h_total_credits,
                   &transaction->t_outstanding_credits);
        if (handle->h_rsv_handle)
                __jbd2_journal_unreserve_handle(handle->h_rsv_handle,
                                                transaction);
        if (atomic_dec_and_test(&transaction->t_updates))
                wake_up(&journal->j_wait_updates);

        rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
        /*
         * Scope of the GFP_NOFS context is over here and so we can restore the
         * original alloc context.
         */
        memalloc_nofs_restore(handle->saved_alloc_context);
}

/**
 * jbd2__journal_restart() - restart a handle .
 * @handle:  handle to restart
 * @nblocks: nr credits requested
 * @revoke_records: number of revoke record credits requested
 * @gfp_mask: memory allocation flags (for start_this_handle)
 *
 * Restart a handle for a multi-transaction filesystem
 * operation.
 *
 * If the jbd2_journal_extend() call above fails to grant new buffer credits
 * to a running handle, a call to jbd2_journal_restart will commit the
 * handle's transaction so far and reattach the handle to a new
 * transaction capable of guaranteeing the requested number of
 * credits. We preserve reserved handle if there's any attached to the
 * passed in handle.
 */
int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
                          gfp_t gfp_mask)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        tid_t old_tid;
        int commit_requested;
        int ret;

        /*
         * If we've had an abort of any type, don't even think about
         * actually doing the restart!
         */
        if (is_handle_aborted(handle))
                return 0;
        journal = transaction->t_journal;
        old_tid = transaction->t_tid;

        /*
         * First unlink the handle from its current transaction, and start the
         * commit on that.
         */
        jbd2_debug(2, "restarting handle %p\n", handle);
        stop_this_handle(handle);
        handle->h_transaction = NULL;

        /*
         * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
         * get rid of pointless j_state_lock traffic like this.
         */
        read_lock(&journal->j_state_lock);
        commit_requested = tid_geq(journal->j_commit_request, old_tid);
        read_unlock(&journal->j_state_lock);
        if (!commit_requested)
                jbd2_log_start_commit(journal, old_tid);

        /* Re-arm the handle with the new request and attach it again. */
        handle->h_total_credits = nblocks +
                DIV_ROUND_UP(revoke_records,
                             journal->j_revoke_records_per_block);
        handle->h_revoke_credits = revoke_records;
        ret = start_this_handle(journal, handle, gfp_mask);

        /* On failure there is no transaction to report; trace tid 0. */
        trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
                                 ret ? 0 : handle->h_transaction->t_tid,
                                 handle->h_type, handle->h_line_no,
                                 handle->h_total_credits);
        return ret;
}
EXPORT_SYMBOL(jbd2__journal_restart);


/*
 * Restart @handle with @nblocks credits: shorthand for
 * jbd2__journal_restart() with no revoke record credits and GFP_NOFS
 * allocations.  Returns whatever jbd2__journal_restart() returns.
 */
int jbd2_journal_restart(handle_t *handle, int nblocks)
{
        return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);
}
EXPORT_SYMBOL(jbd2_journal_restart);

/*
 * Waits for any outstanding t_updates to finish.
 * This is called with write j_state_lock held.
 *
 * The lock is dropped while sleeping and taken again afterwards, so it is
 * held for writing on return as well.  Returns as soon as there is no
 * running transaction or the running transaction's t_updates count is zero.
 */
void jbd2_journal_wait_updates(journal_t *journal)
{
        DEFINE_WAIT(wait);

        while (1) {
                /*
                 * Note that the running transaction can get freed under us if
                 * this transaction is getting committed in
                 * jbd2_journal_commit_transaction() ->
                 * jbd2_journal_free_transaction(). This can only happen when we
                 * release j_state_lock -> schedule() -> acquire j_state_lock.
                 * Hence we should everytime retrieve new j_running_transaction
                 * value (after j_state_lock release acquire cycle), else it may
                 * lead to use-after-free of old freed transaction.
                 */
                transaction_t *transaction = journal->j_running_transaction;

                if (!transaction)
                        break;

                /* Queue ourselves before testing t_updates. */
                prepare_to_wait(&journal->j_wait_updates, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (!atomic_read(&transaction->t_updates)) {
                        /* Nothing outstanding: leave the queue, lock still held. */
                        finish_wait(&journal->j_wait_updates, &wait);
                        break;
                }
                write_unlock(&journal->j_state_lock);
                schedule();
                finish_wait(&journal->j_wait_updates, &wait);
                write_lock(&journal->j_state_lock);
        }
}

/**
 * jbd2_journal_lock_updates() - establish a transaction barrier.
 * @journal:  Journal to establish a barrier on.
 *
 * Bumps j_barrier_count, waits for all reserved credits to drain and for the
 * running transaction's updates to finish, and finally takes the j_barrier
 * mutex.  On return the journal is quiescent and the caller owns j_barrier
 * until jbd2_journal_unlock_updates() is called.
 *
 * The journal lock should not be held on entry.
 */
void jbd2_journal_lock_updates(journal_t *journal)
{
        jbd2_might_wait_for_commit(journal);

        write_lock(&journal->j_state_lock);
        journal->j_barrier_count++;

        /*
         * Reserved handles still exist: sleep until their credits are gone.
         * j_state_lock cannot be held across the sleep, so drop and retake it.
         */
        if (atomic_read(&journal->j_reserved_credits) != 0) {
                write_unlock(&journal->j_state_lock);
                wait_event(journal->j_wait_reserved,
                           !atomic_read(&journal->j_reserved_credits));
                write_lock(&journal->j_state_lock);
        }

        /* Now drain the t_updates of the running transaction, if any. */
        jbd2_journal_wait_updates(journal);
        write_unlock(&journal->j_state_lock);

        /*
         * Ordinary updates are fenced off at this point.  Taking j_barrier
         * additionally serialises us against other callers of
         * jbd2_journal_lock_updates(), so journal-locked operations never
         * run concurrently with each other.
         */
        mutex_lock(&journal->j_barrier);
}

/**
 * jbd2_journal_unlock_updates() - release barrier
 * @journal:  Journal to release the barrier on.
 *
 * Undoes jbd2_journal_lock_updates(): drops the j_barrier mutex, decrements
 * j_barrier_count under j_state_lock and wakes every waiter on
 * j_wait_transaction_locked.
 *
 * Should be called without the journal lock held.
 */
void jbd2_journal_unlock_updates(journal_t *journal)
{
        /* There must be a barrier outstanding to release. */
        J_ASSERT(journal->j_barrier_count != 0);

        mutex_unlock(&journal->j_barrier);

        write_lock(&journal->j_state_lock);
        journal->j_barrier_count--;
        write_unlock(&journal->j_state_lock);

        wake_up_all(&journal->j_wait_transaction_locked);
}

static void warn_dirty_buffer(struct buffer_head *bh)
{
        printk(KERN_WARNING
               "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
               "There's a risk of filesystem corruption in case of system "
               "crash.\n",
               bh->b_bdev, (unsigned long long)bh->b_blocknr);
}

/*
 * Snapshot the buffer's current contents into jh->b_frozen_data, running the
 * frozen trigger on the mapped data first.  jh->b_frozen_data must already
 * point at storage for bh->b_size bytes.
 */
static void jbd2_freeze_jh_data(struct journal_head *jh)
{
        struct buffer_head *bh = jh2bh(jh);
        char *mapped;

        J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");

        mapped = kmap_local_folio(bh->b_folio, bh_offset(bh));
        /* The trigger has to see the data right before it is copied. */
        jbd2_buffer_frozen_trigger(jh, mapped, jh->b_triggers);
        memcpy(jh->b_frozen_data, mapped, bh->b_size);
        kunmap_local(mapped);

        /* Record which triggers go with the copy we have just taken. */
        jh->b_frozen_triggers = jh->b_triggers;
}

/*
 * do_get_write_access - attach a journal head to the handle's transaction
 * @handle: handle whose transaction (handle->h_transaction) wants to modify
 *          the buffer
 * @jh: journal head of the buffer to be modified
 * @force_copy: non-zero to make a frozen copy of the buffer even when it is
 *              not on the BJ_Metadata list (jbd2_journal_get_undo_access()
 *              passes 1)
 *
 * If the buffer is already part of the current transaction, then there
 * is nothing we need to do.  If it is already part of a prior
 * transaction which we are still committing to disk, then we need to
 * make sure that we do not overwrite the old copy: we do copy-out to
 * preserve the copy going to disk.  We also account the buffer against
 * the handle's metadata buffer credits (unless the buffer is already
 * part of the transaction, that is).
 *
 * Takes the buffer lock and jh->b_state_lock internally; both are released
 * before returning.
 *
 * Returns 0 on success, or -EROFS if the handle has been aborted.
 */
static int
do_get_write_access(handle_t *handle, struct journal_head *jh,
                        int force_copy)
{
        struct buffer_head *bh;
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        int error;
        char *frozen_buffer = NULL;
        unsigned long start_lock, time_lock;

        journal = transaction->t_journal;

        jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);

        JBUFFER_TRACE(jh, "entry");
        /*
         * We come back here after sleeping on BH_Shadow or after allocating
         * frozen_buffer.  Both drop b_state_lock, so every check below has to
         * be redone from scratch.
         */
repeat:
        bh = jh2bh(jh);

        /* @@@ Need to check for errors here at some point. */

         start_lock = jiffies;
        lock_buffer(bh);
        spin_lock(&jh->b_state_lock);

        /* If it takes too long to lock the buffer, trace it */
        time_lock = jbd2_time_diff(start_lock, jiffies);
        if (time_lock > HZ/10)
                trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
                        jiffies_to_msecs(time_lock));

        /* We now hold the buffer lock so it is safe to query the buffer
         * state.  Is the buffer dirty?
         *
         * If so, there are two possibilities.  The buffer may be
         * non-journaled, and undergoing a quite legitimate writeback.
         * Otherwise, it is journaled, and we don't expect dirty buffers
         * in that state (the buffers should be marked JBD_Dirty
         * instead.)  So either the IO is being done under our own
         * control and this is a bug, or it's a third party IO such as
         * dump(8) (which may leave the buffer scheduled for read ---
         * ie. locked but not dirty) or tune2fs (which may actually have
         * the buffer dirtied, ugh.)  */

        if (buffer_dirty(bh) && jh->b_transaction) {
                warn_dirty_buffer(bh);
                /*
                 * We need to clean the dirty flag and we must do it under the
                 * buffer lock to be sure we don't race with running write-out.
                 */
                JBUFFER_TRACE(jh, "Journalling dirty buffer");
                clear_buffer_dirty(bh);
                /*
                 * The buffer is going to be added to BJ_Reserved list now and
                 * nothing guarantees jbd2_journal_dirty_metadata() will be
                 * ever called for it. So we need to set jbddirty bit here to
                 * make sure the buffer is dirtied and written out when the
                 * journaling machinery is done with it.
                 */
                set_buffer_jbddirty(bh);
        }

        /*
         * An aborted handle bails out with -EROFS.  This path jumps straight
         * to "out", so the revoke cancellation under "done" is skipped.
         */
        error = -EROFS;
        if (is_handle_aborted(handle)) {
                spin_unlock(&jh->b_state_lock);
                unlock_buffer(bh);
                goto out;
        }
        error = 0;

        /*
         * The buffer is already part of this transaction if b_transaction or
         * b_next_transaction points to it
         */
        if (jh->b_transaction == transaction ||
            jh->b_next_transaction == transaction) {
                unlock_buffer(bh);
                goto done;
        }

        /*
         * this is the first time this transaction is touching this buffer,
         * reset the modified flag
         */
        jh->b_modified = 0;

        /*
         * If the buffer is not journaled right now, we need to make sure it
         * doesn't get written to disk before the caller actually commits the
         * new data
         */
        if (!jh->b_transaction) {
                JBUFFER_TRACE(jh, "no transaction");
                J_ASSERT_JH(jh, !jh->b_next_transaction);
                JBUFFER_TRACE(jh, "file as BJ_Reserved");
                /*
                 * Make sure all stores to jh (b_modified, b_frozen_data) are
                 * visible before attaching it to the running transaction.
                 * Paired with barrier in jbd2_write_access_granted()
                 */
                smp_wmb();
                spin_lock(&journal->j_list_lock);
                if (test_clear_buffer_dirty(bh)) {
                        /*
                         * Execute buffer dirty clearing and jh->b_transaction
                         * assignment under journal->j_list_lock locked to
                         * prevent bh being removed from checkpoint list if
                         * the buffer is in an intermediate state (not dirty
                         * and jh->b_transaction is NULL).
                         */
                        JBUFFER_TRACE(jh, "Journalling dirty buffer");
                        set_buffer_jbddirty(bh);
                }
                __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
                spin_unlock(&journal->j_list_lock);
                unlock_buffer(bh);
                goto done;
        }
        /* From here on only b_state_lock is held; the buffer lock is gone. */
        unlock_buffer(bh);

        /*
         * If there is already a copy-out version of this buffer, then we don't
         * need to make another one
         */
        if (jh->b_frozen_data) {
                JBUFFER_TRACE(jh, "has frozen data");
                J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
                goto attach_next;
        }

        JBUFFER_TRACE(jh, "owned by older transaction");
        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
        J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);

        /*
         * There is one case we have to be very careful about.  If the
         * committing transaction is currently writing this buffer out to disk
         * and has NOT made a copy-out, then we cannot modify the buffer
         * contents at all right now.  The essence of copy-out is that it is
         * the extra copy, not the primary copy, which gets journaled.  If the
         * primary copy is already going to disk then we cannot do copy-out
         * here.
         */
        if (buffer_shadow(bh)) {
                JBUFFER_TRACE(jh, "on shadow: sleep");
                spin_unlock(&jh->b_state_lock);
                wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
                goto repeat;
        }

        /*
         * Only do the copy if the currently-owning transaction still needs it.
         * If buffer isn't on BJ_Metadata list, the committing transaction is
         * past that stage (here we use the fact that BH_Shadow is set under
         * bh_state lock together with refiling to BJ_Shadow list and at this
         * point we know the buffer doesn't have BH_Shadow set).
         *
         * Subtle point, though: if this is a get_undo_access, then we will be
         * relying on the frozen_data to contain the new value of the
         * committed_data record after the transaction, so we HAVE to force the
         * frozen_data copy in that case.
         */
        if (jh->b_jlist == BJ_Metadata || force_copy) {
                JBUFFER_TRACE(jh, "generate frozen data");
                if (!frozen_buffer) {
                        /*
                         * The allocation is done without b_state_lock held,
                         * so drop the lock, allocate, and restart the checks.
                         */
                        JBUFFER_TRACE(jh, "allocate memory for buffer");
                        spin_unlock(&jh->b_state_lock);
                        frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
                                                   GFP_NOFS | __GFP_NOFAIL);
                        goto repeat;
                }
                /* Hand the buffer over to jh; it must not be freed at "out". */
                jh->b_frozen_data = frozen_buffer;
                frozen_buffer = NULL;
                jbd2_freeze_jh_data(jh);
        }
attach_next:
        /*
         * Make sure all stores to jh (b_modified, b_frozen_data) are visible
         * before attaching it to the running transaction. Paired with barrier
         * in jbd2_write_access_granted()
         */
        smp_wmb();
        jh->b_next_transaction = transaction;

done:
        spin_unlock(&jh->b_state_lock);

        /*
         * If we are about to journal a buffer, then any revoke pending on it is
         * no longer valid
         */
        jbd2_journal_cancel_revoke(handle, jh);

out:
        /*
         * frozen_buffer is still set only if it was allocated on an earlier
         * pass and the re-run of the checks ended up not installing it.
         */
        if (unlikely(frozen_buffer))        /* It's usually NULL */
                jbd2_free(frozen_buffer, bh->b_size);

        JBUFFER_TRACE(jh, "exit");
        return error;
}

/*
 * Fast check whether buffer is already attached to the required transaction.
 * @handle: handle whose transaction the buffer should already belong to
 * @bh: buffer to check
 * @undo: true when checking on behalf of undo access, which additionally
 *        requires jh->b_committed_data to be set
 *
 * Runs without taking any lock other than rcu_read_lock().  Returns true only
 * when the buffer is clean, has a journal head whose b_transaction or
 * b_next_transaction equals handle->h_transaction, and that journal head still
 * points back at @bh.  Any other outcome returns false.
 */
static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
                                                        bool undo)
{
        struct journal_head *jh;
        bool ret = false;

        /* Dirty buffers require special handling... */
        if (buffer_dirty(bh))
                return false;

        /*
         * RCU protects us from dereferencing freed pages. So the checks we do
         * are guaranteed not to oops. However the jh slab object can get freed
         * & reallocated while we work with it. So we have to be careful. When
         * we see jh attached to the running transaction, we know it must stay
         * so until the transaction is committed. Thus jh won't be freed and
         * will be attached to the same bh while we run.  However it can
         * happen jh gets freed, reallocated, and attached to the transaction
         * just after we get pointer to it from bh. So we have to be careful
         * and recheck jh still belongs to our bh before we return success.
         */
        rcu_read_lock();
        if (!buffer_jbd(bh))
                goto out;
        /* This should be bh2jh() but that doesn't work with inline functions */
        jh = READ_ONCE(bh->b_private);
        if (!jh)
                goto out;
        /* For undo access buffer must have data copied */
        if (undo && !jh->b_committed_data)
                goto out;
        /* Neither transaction pointer matches: the buffer is not ours yet. */
        if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
            READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
                goto out;
        /*
         * There are two reasons for the barrier here:
         * 1) Make sure to fetch b_bh after we did previous checks so that we
         * detect when jh went through free, realloc, attach to transaction
         * while we were checking. Paired with implicit barrier in that path.
         * 2) So that access to bh done after jbd2_write_access_granted()
         * doesn't get reordered and see inconsistent state of concurrent
         * do_get_write_access().
         */
        smp_mb();
        /* jh was recycled for a different buffer while we were looking. */
        if (unlikely(jh->b_bh != bh))
                goto out;
        ret = true;
out:
        rcu_read_unlock();
        return ret;
}

/**
 * jbd2_journal_get_write_access() - notify intent to modify a buffer
 *                                     for metadata (not data) update.
 * @handle: transaction to add buffer modifications to
 * @bh:     bh to be used for metadata writes
 *
 * In full data journalling mode the buffer may be of type BJ_AsyncData,
 * because we're ``write()ing`` a buffer which is also part of a shared mapping.
 *
 * Returns: 0 on success, -EROFS if @handle is aborted, -EIO if the
 * filesystem device has seen a write error (the journal is aborted in that
 * case), or the error reported by do_get_write_access().
 */
int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
{
        journal_t *journal;
        struct journal_head *jh;
        int err;

        if (is_handle_aborted(handle))
                return -EROFS;

        journal = handle->h_transaction->t_journal;

        /*
         * A writeback error on the fs device means background writeout of
         * metadata buffers may have failed.  Reading the stale on-disk copy
         * and journalling it again could leave the filesystem inconsistent,
         * so abort the journal rather than carry on.
         */
        if (jbd2_check_fs_dev_write_error(journal)) {
                jbd2_journal_abort(journal, -EIO);
                return -EIO;
        }

        /* Lockless fast path: the buffer already belongs to our transaction. */
        if (jbd2_write_access_granted(handle, bh, false))
                return 0;

        /*
         * Slow path.  We do not want to get caught playing with fields which
         * the log thread also manipulates, so do_get_write_access() makes sure
         * the buffer completes any outstanding IO before proceeding.
         */
        jh = jbd2_journal_add_journal_head(bh);
        err = do_get_write_access(handle, jh, 0);
        jbd2_journal_put_journal_head(jh);

        return err;
}


/*
 * When the user wants to journal a newly created buffer_head (i.e. getblk()
 * returned a new buffer which is going to be populated by hand rather than
 * read from disk), the buffer_head has to stay locked until it has been
 * completely filled with new data.  In that situation we can assert that
 * the bh is not already part of an existing transaction.
 *
 * The caller is expected to have locked the buffer already.  This is not a
 * lock ranking violation: the buffer was newly created and unlocked before.
 */

/**
 * jbd2_journal_get_create_access() - notify intent to use newly created bh
 * @handle: transaction to add the new buffer to
 * @bh: new buffer.
 *
 * Call this if you create a new bh.
 *
 * Returns: 0 on success or -EROFS if @handle is aborted.
 */
int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        struct journal_head *jh = jbd2_journal_add_journal_head(bh);
        int err = -EROFS;

        jbd2_debug(5, "journal_head %p\n", jh);
        if (is_handle_aborted(handle))
                goto out;

        err = 0;
        journal = transaction->t_journal;

        JBUFFER_TRACE(jh, "entry");
        /*
         * Pre-zeroing in the filesystem's new_block code may already have put
         * the buffer into this transaction.  It may also still sit on the
         * lists of the previous, committing transaction, but then it HAS to
         * be in Forget state: that transaction must have deleted the buffer
         * for it to be handed out again here.
         */
        spin_lock(&jh->b_state_lock);
        J_ASSERT_JH(jh, (!jh->b_transaction ||
                         jh->b_transaction == transaction ||
                         (jh->b_transaction ==
                                        journal->j_committing_transaction &&
                          jh->b_jlist == BJ_Forget)));
        J_ASSERT_JH(jh, !jh->b_next_transaction);
        J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));

        if (!jh->b_transaction) {
                /*
                 * An earlier jbd2_journal_forget() may have left jbddirty set
                 * because the buffer was being committed; when that commit
                 * finished the buffer was filed for checkpointing and marked
                 * dirty.  Since the buffer is being reallocated now, the
                 * transaction that freed it must have committed, so the dirty
                 * bit can safely be cleared.
                 */
                clear_buffer_dirty(jh2bh(jh));
                /* this transaction has not modified the buffer yet */
                jh->b_modified = 0;

                JBUFFER_TRACE(jh, "file as BJ_Reserved");
                spin_lock(&journal->j_list_lock);
                __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
                spin_unlock(&journal->j_list_lock);
        } else if (jh->b_transaction == journal->j_committing_transaction) {
                /* this transaction has not modified the buffer yet */
                jh->b_modified = 0;

                JBUFFER_TRACE(jh, "set next transaction");
                spin_lock(&journal->j_list_lock);
                jh->b_next_transaction = transaction;
                spin_unlock(&journal->j_list_lock);
        }
        spin_unlock(&jh->b_state_lock);

        /*
         * akpm: ext3_alloc_branch can pick up new indirect blocks which
         * contain freed but then revoked metadata.  The revoke has to be
         * cancelled in case the block gets freed yet again and then
         * reallocated as data - that would cause a second revoke, which hits
         * an assertion error.
         */
        JBUFFER_TRACE(jh, "cancelling revoke");
        jbd2_journal_cancel_revoke(handle, jh);
out:
        jbd2_journal_put_journal_head(jh);
        return err;
}

/**
 * jbd2_journal_get_undo_access() -  Notify intent to modify metadata with
 *     non-rewindable consequences
 * @handle: transaction
 * @bh: buffer to undo
 *
 * Sometimes committed metadata has to be told apart from metadata that has
 * not reached disk yet.  The ext3fs code relies on this when freeing and
 * allocating space: freed space must not be reused before the deallocation
 * has been committed, because overwriting it would make the delete
 * un-rewindable in case of a crash.
 *
 * To support that, jbd2_journal_get_undo_access requests write access to a
 * buffer for parts of non-rewindable operations such as delete operations
 * on the bitmaps.  The journaling code keeps a copy of the buffer's contents
 * from before the undo_access call (jh->b_committed_data) until the buffer
 * is known to have been committed to disk.
 *
 * We never need to know which transaction the committed data is part of:
 * buffers touched here are guaranteed to be dirtied later and so will be
 * committed to a new transaction in due course, at which point the old
 * committed data pointer can be discarded.
 *
 * Returns error number or 0 on success.
 */
int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
{
        struct journal_head *jh;
        char *committed_data = NULL;
        int err;

        if (is_handle_aborted(handle))
                return -EROFS;

        /* Fast path: access already granted and a committed copy exists. */
        if (jbd2_write_access_granted(handle, bh, true))
                return 0;

        jh = jbd2_journal_add_journal_head(bh);
        JBUFFER_TRACE(jh, "entry");

        /*
         * Get write access first --- it can drop the journal lock, and we
         * want obtaining the committed_data to be atomic wrt. completion of
         * any outstanding commits.
         */
        err = do_get_write_access(handle, jh, 1);
        if (err)
                goto out;

        for (;;) {
                /* Allocate before taking the spinlock. */
                if (!jh->b_committed_data)
                        committed_data = jbd2_alloc(jh2bh(jh)->b_size,
                                                    GFP_NOFS|__GFP_NOFAIL);

                spin_lock(&jh->b_state_lock);
                if (jh->b_committed_data)
                        break;

                JBUFFER_TRACE(jh, "generate b_committed data");
                if (committed_data) {
                        /*
                         * Preserve the current buffer contents as the
                         * committed copy.
                         */
                        jh->b_committed_data = committed_data;
                        committed_data = NULL;
                        memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
                        break;
                }

                /*
                 * b_committed_data was set at the unlocked check but is gone
                 * now, and we have nothing allocated: go round again.
                 */
                spin_unlock(&jh->b_state_lock);
        }
        spin_unlock(&jh->b_state_lock);
out:
        jbd2_journal_put_journal_head(jh);
        /* Left over when somebody else installed b_committed_data first. */
        if (unlikely(committed_data))
                jbd2_free(committed_data, bh->b_size);
        return err;
}

/**
 * jbd2_journal_set_triggers() - Add triggers for commit writeout
 * @bh: buffer to trigger on
 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
 *
 * Stores @type in the b_triggers field of the buffer's journal_head.  This
 * is always safe, because triggers for a committing buffer will be saved
 * off, and triggers for a running transaction will match the buffer in that
 * transaction.  Warns once and does nothing if @bh has no journal_head.
 *
 * Call with NULL to clear the triggers.
 */
void jbd2_journal_set_triggers(struct buffer_head *bh,
                               struct jbd2_buffer_trigger_type *type)
{
        struct journal_head *jh;

        jh = jbd2_journal_grab_journal_head(bh);
        if (!WARN_ON_ONCE(!jh)) {
                jh->b_triggers = type;
                /* Drop the reference taken by the grab above. */
                jbd2_journal_put_journal_head(jh);
        }
}

/*
 * Invoke the t_frozen callback of @triggers, if there is one, passing it the
 * buffer behind @jh, the mapped data and the buffer size.  Does nothing when
 * @triggers is NULL or has no t_frozen callback.
 */
void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
                                struct jbd2_buffer_trigger_type *triggers)
{
        struct buffer_head *bh = jh2bh(jh);

        if (triggers && triggers->t_frozen)
                triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
}

/*
 * Invoke the t_abort callback of @triggers, if there is one, on the buffer
 * behind @jh.  Does nothing when @triggers is NULL or has no t_abort callback.
 */
void jbd2_buffer_abort_trigger(struct journal_head *jh,
                               struct jbd2_buffer_trigger_type *triggers)
{
        if (triggers && triggers->t_abort)
                triggers->t_abort(triggers, jh2bh(jh));
}

/**
 * jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
 * @handle: transaction to add buffer to.
 * @bh: buffer to mark
 *
 * mark dirty metadata which needs to be journaled as part of the current
 * transaction.
 *
 * The buffer must have previously had jbd2_journal_get_write_access()
 * called so that it has a valid journal_head attached to the buffer
 * head.
 *
 * The buffer is placed on the transaction's metadata list and is marked
 * as belonging to the transaction.
 *
 * Returns error number or 0 on success.  The errors produced here are:
 * -EUCLEAN if @bh has no journal_head attached, -EROFS if the handle has
 * been aborted, -ENOSPC if the handle has no buffer credits left, and
 * -EINVAL if the journal_head is found attached to an unexpected transaction.
 *
 * Special care needs to be taken if the buffer already belongs to the
 * current committing transaction (in which case we should have frozen
 * data present for that commit).  In that case, we don't relink the
 * buffer: that only gets done when the old transaction finally
 * completes its commit.
 */
int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        struct journal_head *jh;
        int ret = 0;

        if (!buffer_jbd(bh))
                return -EUCLEAN;

        /*
         * We don't grab jh reference here since the buffer must be part
         * of the running transaction.
         */
        jh = bh2jh(bh);
        jbd2_debug(5, "journal_head %p\n", jh);
        JBUFFER_TRACE(jh, "entry");

        /*
         * This and the following assertions are unreliable since we may see jh
         * in inconsistent state unless we grab bh_state lock. But this is
         * crucial to catch bugs so let's do a reliable check until the
         * lockless handling is fully proven.
         */
        if (data_race(jh->b_transaction != transaction &&
            jh->b_next_transaction != transaction)) {
                /* Unlocked check failed: repeat it under the lock and assert. */
                spin_lock(&jh->b_state_lock);
                J_ASSERT_JH(jh, jh->b_transaction == transaction ||
                                jh->b_next_transaction == transaction);
                spin_unlock(&jh->b_state_lock);
        }
        /*
         * b_modified == 1 means a credit has already been charged for this
         * buffer (see below), so after a sanity check we return 0 without
         * touching anything else.
         */
        if (jh->b_modified == 1) {
                /* If it's in our transaction it must be in BJ_Metadata list. */
                if (data_race(jh->b_transaction == transaction &&
                    jh->b_jlist != BJ_Metadata)) {
                        spin_lock(&jh->b_state_lock);
                        if (jh->b_transaction == transaction &&
                            jh->b_jlist != BJ_Metadata)
                                pr_err("JBD2: assertion failure: h_type=%u "
                                       "h_line_no=%u block_no=%llu jlist=%u\n",
                                       handle->h_type, handle->h_line_no,
                                       (unsigned long long) bh->b_blocknr,
                                       jh->b_jlist);
                        J_ASSERT_JH(jh, jh->b_transaction != transaction ||
                                        jh->b_jlist == BJ_Metadata);
                        spin_unlock(&jh->b_state_lock);
                }
                goto out;
        }

        journal = transaction->t_journal;
        spin_lock(&jh->b_state_lock);

        if (is_handle_aborted(handle)) {
                /*
                 * Check journal aborting with @jh->b_state_lock locked,
                 * since 'jh->b_transaction' could be replaced with
                 * 'jh->b_next_transaction' during old transaction
                 * committing if journal aborted, which may fail
                 * assertion on 'jh->b_frozen_data == NULL'.
                 */
                ret = -EROFS;
                goto out_unlock_bh;
        }

        if (jh->b_modified == 0) {
                /*
                 * This buffer's got modified and becoming part
                 * of the transaction. This needs to be done
                 * once a transaction -bzzz
                 */
                if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
                        ret = -ENOSPC;
                        goto out_unlock_bh;
                }
                /* Charge one credit to the handle for this buffer. */
                jh->b_modified = 1;
                handle->h_total_credits--;
        }

        /*
         * fastpath, to avoid expensive locking.  If this buffer is already
         * on the running transaction's metadata list there is nothing to do.
         * Nobody can take it off again because there is a handle open.
         * I _think_ we're OK here with SMP barriers - a mistaken decision will
         * result in this test being false, so we go in and take the locks.
         */
        if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
                JBUFFER_TRACE(jh, "fastpath");
                if (unlikely(jh->b_transaction !=
                             journal->j_running_transaction)) {
                        printk(KERN_ERR "JBD2: %s: "
                               "jh->b_transaction (%llu, %p, %u) != "
                               "journal->j_running_transaction (%p, %u)\n",
                               journal->j_devname,
                               (unsigned long long) bh->b_blocknr,
                               jh->b_transaction,
                               jh->b_transaction ? jh->b_transaction->t_tid : 0,
                               journal->j_running_transaction,
                               journal->j_running_transaction ?
                               journal->j_running_transaction->t_tid : 0);
                        ret = -EINVAL;
                }
                goto out_unlock_bh;
        }

        set_buffer_jbddirty(bh);

        /*
         * Metadata already on the current transaction list doesn't
         * need to be filed.  Metadata on another transaction's list must
         * be committing, and will be refiled once the commit completes:
         * leave it alone for now.
         */
        if (jh->b_transaction != transaction) {
                JBUFFER_TRACE(jh, "already on other transaction");
                /*
                 * The only legitimate state here is "owned by the committing
                 * transaction with us queued as b_next_transaction"; anything
                 * else is reported and turned into -EINVAL.
                 */
                if (unlikely(((jh->b_transaction !=
                               journal->j_committing_transaction)) ||
                             (jh->b_next_transaction != transaction))) {
                        printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
                               "bad jh for block %llu: "
                               "transaction (%p, %u), "
                               "jh->b_transaction (%p, %u), "
                               "jh->b_next_transaction (%p, %u), jlist %u\n",
                               journal->j_devname,
                               (unsigned long long) bh->b_blocknr,
                               transaction, transaction->t_tid,
                               jh->b_transaction,
                               jh->b_transaction ?
                               jh->b_transaction->t_tid : 0,
                               jh->b_next_transaction,
                               jh->b_next_transaction ?
                               jh->b_next_transaction->t_tid : 0,
                               jh->b_jlist);
                        WARN_ON(1);
                        ret = -EINVAL;
                }
                /* And this case is illegal: we can't reuse another
                 * transaction's data buffer, ever. */
                goto out_unlock_bh;
        }

        /* That test should have eliminated the following case: */
        J_ASSERT_JH(jh, jh->b_frozen_data == NULL);

        JBUFFER_TRACE(jh, "file as BJ_Metadata");
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
        spin_unlock(&journal->j_list_lock);
out_unlock_bh:
        spin_unlock(&jh->b_state_lock);
out:
        JBUFFER_TRACE(jh, "exit");
        return ret;
}

/**
 * jbd2_journal_forget() - bforget() for potentially-journaled buffers.
 * @handle: transaction handle
 * @bh:     bh to 'forget'
 *
 * We can only do the bforget if there are no commits pending against the
 * buffer.  If the buffer is dirty in the current running transaction we
 * can safely unlink it.
 *
 * bh may not be a journalled buffer at all - it may be a non-JBD
 * buffer which came off the hashtable.  Check for this.
 *
 * Decrements bh->b_count by one, except when the handle has already
 * aborted: that case returns early and the buffer reference is not
 * dropped on that path.
 *
 * Return: 0 on success, -EROFS if the handle has aborted, or -EIO if the
 * buffer unexpectedly has b_committed_data set.
 */
int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        struct journal_head *jh;
        int drop_reserve = 0;
        int err = 0;
        int was_modified = 0;

        /* Aborted handle: bail out before touching bh or the journal */
        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;

        BUFFER_TRACE(bh, "entry");

        jh = jbd2_journal_grab_journal_head(bh);
        if (!jh) {
                /* No journal_head attached: a plain forget is all we need */
                __bforget(bh);
                return 0;
        }

        spin_lock(&jh->b_state_lock);

        /* Critical error: attempting to delete a bitmap buffer, maybe?
         * Don't do any jbd operations, and return an error. */
        if (!J_EXPECT_JH(jh, !jh->b_committed_data,
                         "inconsistent data on disk")) {
                err = -EIO;
                goto drop;
        }

        /* keep track of whether or not this transaction modified us */
        was_modified = jh->b_modified;

        /*
         * The buffer's going from the transaction, we must drop
         * all references -bzzz
         */
        jh->b_modified = 0;

        if (jh->b_transaction == transaction) {
                J_ASSERT_JH(jh, !jh->b_frozen_data);

                /* If we are forgetting a buffer which is already part
                 * of this transaction, then we can just drop it from
                 * the transaction immediately. */
                clear_buffer_dirty(bh);
                clear_buffer_jbddirty(bh);

                JBUFFER_TRACE(jh, "belongs to current transaction: unfile");

                /*
                 * we only want to drop a reference if this transaction
                 * modified the buffer
                 */
                if (was_modified)
                        drop_reserve = 1;

                /*
                 * We are no longer going to journal this buffer.
                 * However, the commit of this transaction is still
                 * important to the buffer: the delete that we are now
                 * processing might obsolete an old log entry, so by
                 * committing, we can satisfy the buffer's checkpoint.
                 *
                 * So, if we have a checkpoint on the buffer, we should
                 * now refile the buffer on our BJ_Forget list so that
                 * we know to remove the checkpoint after we commit.
                 */

                spin_lock(&journal->j_list_lock);
                if (jh->b_cp_transaction) {
                        __jbd2_journal_temp_unlink_buffer(jh);
                        __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                } else {
                        __jbd2_journal_unfile_buffer(jh);
                        jbd2_journal_put_journal_head(jh);
                }
                spin_unlock(&journal->j_list_lock);
        } else if (jh->b_transaction) {
                J_ASSERT_JH(jh, (jh->b_transaction ==
                                 journal->j_committing_transaction));
                /* However, if the buffer is still owned by a prior
                 * (committing) transaction, we can't drop it yet... */
                JBUFFER_TRACE(jh, "belongs to older transaction");
                /* ... but we CAN drop it from the new transaction through
                 * marking the buffer as freed and set b_next_transaction to
                 * the new transaction, so that not only the commit code
                 * knows it should clear dirty bits when it is done with the
                 * buffer, but also the buffer can be checkpointed only
                 * after the new transaction commits. */

                set_buffer_freed(bh);

                if (!jh->b_next_transaction) {
                        spin_lock(&journal->j_list_lock);
                        jh->b_next_transaction = transaction;
                        spin_unlock(&journal->j_list_lock);
                } else {
                        J_ASSERT(jh->b_next_transaction == transaction);

                        /*
                         * only drop a reference if this transaction modified
                         * the buffer
                         */
                        if (was_modified)
                                drop_reserve = 1;
                }
        } else {
                /*
                 * Finally, if the buffer does not belong to any
                 * transaction, we can just drop it now if it has no
                 * checkpoint.
                 */
                spin_lock(&journal->j_list_lock);
                if (!jh->b_cp_transaction) {
                        JBUFFER_TRACE(jh, "belongs to none transaction");
                        spin_unlock(&journal->j_list_lock);
                        goto drop;
                }

                /*
                 * Otherwise, if the buffer has been written to disk,
                 * it is safe to remove the checkpoint and drop it.
                 */
                if (jbd2_journal_try_remove_checkpoint(jh) >= 0) {
                        spin_unlock(&journal->j_list_lock);
                        goto drop;
                }

                /*
                 * The buffer is still not written to disk, we should
                 * attach this buffer to current transaction so that the
                 * buffer can be checkpointed only after the current
                 * transaction commits.
                 */
                clear_buffer_dirty(bh);
                __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                spin_unlock(&journal->j_list_lock);
        }
drop:
        /*
         * Every path reaching this label holds jh->b_state_lock and has
         * already released j_list_lock.
         */
        __brelse(bh);
        spin_unlock(&jh->b_state_lock);
        /* Pairs with jbd2_journal_grab_journal_head() at the top */
        jbd2_journal_put_journal_head(jh);
        if (drop_reserve) {
                /* no need to reserve log space for this block -bzzz */
                handle->h_total_credits++;
        }
        return err;
}

/**
 * jbd2_journal_stop() - complete a transaction
 * @handle: transaction to complete.
 *
 * All done for a particular handle.
 *
 * There is not much action needed here.  We just return any remaining
 * buffer credits to the transaction and remove the handle.  The only
 * complication is that we need to start a commit operation if the
 * filesystem is marked for synchronous update.
 *
 * jbd2_journal_stop itself will not usually return an error, but it may
 * do so in unusual circumstances.  In particular, expect it to
 * return -EIO if a jbd2_journal_abort has been executed since the
 * transaction began.
 *
 * Return: 0 on success, -EIO if the handle is aborted, or the result of
 * jbd2_log_wait_commit() when the handle was synchronous and the commit
 * was waited for.
 */
int jbd2_journal_stop(handle_t *handle)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        int err = 0, wait_for_commit = 0;
        tid_t tid;
        pid_t pid;

        /*
         * h_ref counts nested users of the handle; only the call that
         * drops it to zero goes on to do the real work and free it.
         */
        if (--handle->h_ref > 0) {
                jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
                                                 handle->h_ref);
                if (is_handle_aborted(handle))
                        return -EIO;
                return 0;
        }
        if (!transaction) {
                /*
                 * Handle is already detached from the transaction so there is
                 * nothing to do other than free the handle.
                 */
                memalloc_nofs_restore(handle->saved_alloc_context);
                goto free_and_exit;
        }
        /* Cache these now: transaction must not be touched after the stop */
        journal = transaction->t_journal;
        tid = transaction->t_tid;

        if (is_handle_aborted(handle))
                err = -EIO;

        jbd2_debug(4, "Handle %p going down\n", handle);
        trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
                                tid, handle->h_type, handle->h_line_no,
                                jiffies - handle->h_start_jiffies,
                                handle->h_sync, handle->h_requested_credits,
                                (handle->h_requested_credits -
                                 handle->h_total_credits));

        /*
         * Implement synchronous transaction batching.  If the handle
         * was synchronous, don't force a commit immediately.  Let's
         * yield and let another thread piggyback onto this
         * transaction.  Keep doing that while new threads continue to
         * arrive.  It doesn't cost much - we're about to run a commit
         * and sleep on IO anyway.  Speeds up many-threaded, many-dir
         * operations by 30x or more...
         *
         * We try and optimize the sleep time against what the
         * underlying disk can do, instead of having a static sleep
         * time.  This is useful for the case where our storage is so
         * fast that it is more optimal to go ahead and force a flush
         * and wait for the transaction to be committed than it is to
         * wait for an arbitrary amount of time for new writers to
         * join the transaction.  We achieve this by measuring how
         * long it takes to commit a transaction, and compare it with
         * how long this transaction has been running, and if run time
         * < commit time then we sleep for the delta and commit.  This
         * greatly helps super fast disks that would see slowdowns as
         * more threads started doing fsyncs.
         *
         * But don't do this if this process was the most recent one
         * to perform a synchronous write.  We do this to detect the
         * case where a single process is doing a stream of sync
         * writes.  No point in waiting for joiners in that case.
         *
         * Setting max_batch_time to 0 disables this completely.
         */
        pid = current->pid;
        if (handle->h_sync && journal->j_last_sync_writer != pid &&
            journal->j_max_batch_time) {
                u64 commit_time, trans_time;

                journal->j_last_sync_writer = pid;

                read_lock(&journal->j_state_lock);
                commit_time = journal->j_average_commit_time;
                read_unlock(&journal->j_state_lock);

                /* How long this transaction has been running, in ns */
                trans_time = ktime_to_ns(ktime_sub(ktime_get(),
                                                   transaction->t_start_time));

                /*
                 * Clamp commit_time to [min, max] batch time.  The batch
                 * times are scaled by 1000 to compare against nanoseconds
                 * -- presumably they are stored in microseconds; confirm
                 * against the journal_t definition.
                 */
                commit_time = max_t(u64, commit_time,
                                    1000*journal->j_min_batch_time);
                commit_time = min_t(u64, commit_time,
                                    1000*journal->j_max_batch_time);

                if (trans_time < commit_time) {
                        ktime_t expires = ktime_add_ns(ktime_get(),
                                                       commit_time);
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
                }
        }

        if (handle->h_sync)
                transaction->t_synchronous_commit = 1;

        /*
         * If the handle is marked SYNC, we need to set another commit
         * going!  We also want to force a commit if the transaction is too
         * old now.
         */
        if (handle->h_sync ||
            time_after_eq(jiffies, transaction->t_expires)) {
                /* Do this even for aborted journals: an abort still
                 * completes the commit thread, it just doesn't write
                 * anything to disk. */

                jbd2_debug(2, "transaction too old, requesting commit for "
                                        "handle %p\n", handle);
                /* This is non-blocking */
                jbd2_log_start_commit(journal, tid);

                /*
                 * Special case: JBD2_SYNC synchronous updates require us
                 * to wait for the commit to complete.
                 */
                if (handle->h_sync && !(current->flags & PF_MEMALLOC))
                        wait_for_commit = 1;
        }

        /*
         * Once stop_this_handle() drops t_updates, the transaction could start
         * committing on us and eventually disappear.  So we must not
         * dereference transaction pointer again after calling
         * stop_this_handle().
         */
        stop_this_handle(handle);

        /* Note: this overwrites any -EIO recorded for an aborted handle */
        if (wait_for_commit)
                err = jbd2_log_wait_commit(journal, tid);

free_and_exit:
        /* Release the attached reserved handle, if any, then the handle */
        if (handle->h_rsv_handle)
                jbd2_free_handle(handle->h_rsv_handle);
        jbd2_free_handle(handle);
        return err;
}

/*
 *
 * List management code snippets: various functions for manipulating the
 * transaction buffer lists.
 *
 */

/*
 * Append a buffer to a transaction list, given the transaction's list head
 * pointer.
 *
 * j_list_lock is held.
 *
 * jh->b_state_lock is held.
 */

static inline void
__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
{
        if (!*list) {
                jh->b_tnext = jh->b_tprev = jh;
                *list = jh;
        } else {
                /* Insert at the tail of the list to preserve order */
                struct journal_head *first = *list, *last = first->b_tprev;
                jh->b_tprev = last;
                jh->b_tnext = first;
                last->b_tnext = first->b_tprev = jh;
        }
}

/*
 * Unlink a buffer from a transaction's circular buffer list.  @list points
 * at the transaction's list head pointer: if the buffer being removed is
 * the head, the head moves on to the next entry, or becomes NULL when the
 * buffer was the only entry.
 *
 * Called with j_list_lock held, and the journal may not be locked.
 *
 * jh->b_state_lock is held.
 */

static inline void
__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
{
        struct journal_head *before = jh->b_tprev;
        struct journal_head *after = jh->b_tnext;

        if (*list == jh)
                *list = (after == jh) ? NULL : after;

        /* Close the ring around the removed entry */
        before->b_tnext = after;
        after->b_tprev = before;
}

/*
 * Take a buffer off whichever per-transaction list it is currently filed
 * on and mark it BJ_None.  jh->b_transaction itself is left untouched.
 *
 * The list head kept in the owning transaction (t_buffers, t_forget,
 * t_shadow_list or t_reserved_list) may be rewritten here, so a caller
 * holding a cached copy of one of those pointers must re-read it from the
 * transaction_t afterwards.
 *
 * Called under j_list_lock.
 */
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
{
        struct buffer_head *bh = jh2bh(jh);
        transaction_t *owner;
        struct journal_head **headp = NULL;

        lockdep_assert_held(&jh->b_state_lock);
        owner = jh->b_transaction;
        if (owner != NULL)
                assert_spin_locked(&owner->t_journal->j_list_lock);

        J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);

        /* Not filed anywhere: nothing to unlink */
        if (jh->b_jlist == BJ_None)
                return;

        /* A filed buffer must have an owning transaction */
        J_ASSERT_JH(jh, owner != NULL);

        switch (jh->b_jlist) {
        case BJ_Metadata:
                owner->t_nr_buffers--;
                J_ASSERT_JH(jh, owner->t_nr_buffers >= 0);
                headp = &owner->t_buffers;
                break;
        case BJ_Forget:
                headp = &owner->t_forget;
                break;
        case BJ_Shadow:
                headp = &owner->t_shadow_list;
                break;
        case BJ_Reserved:
                headp = &owner->t_reserved_list;
                break;
        }

        __blist_del_buffer(headp, jh);
        jh->b_jlist = BJ_None;

        if (owner != NULL && is_journal_aborted(owner->t_journal)) {
                /* Aborted journal: drop jbddirty without redirtying */
                clear_buffer_jbddirty(bh);
        } else if (test_clear_buffer_jbddirty(bh)) {
                mark_buffer_dirty(bh);        /* Expose it to the VM */
        }
}

/*
 * Detach a buffer from its transaction completely: unlink it from the
 * transaction's list and clear jh->b_transaction.  The buffer must belong
 * to a transaction and must not have a b_next_transaction.  Dropping the
 * jh reference that belonged to the transaction is left to the caller.
 *
 * Called with bh_state lock and j_list_lock
 */
static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
{
        J_ASSERT_JH(jh, jh->b_transaction);
        J_ASSERT_JH(jh, !jh->b_next_transaction);

        __jbd2_journal_temp_unlink_buffer(jh);
        jh->b_transaction = NULL;
}

/*
 * Locked wrapper around __jbd2_journal_unfile_buffer(): takes
 * jh->b_state_lock, then journal->j_list_lock, unfiles the buffer, and
 * afterwards drops one journal_head reference.
 */
void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
{
        struct buffer_head *buf = jh2bh(jh);

        /* Pin the buffer_head so it cannot be freed before we unlock it */
        get_bh(buf);

        spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_unfile_buffer(jh);
        spin_unlock(&journal->j_list_lock);
        spin_unlock(&jh->b_state_lock);

        jbd2_journal_put_journal_head(jh);
        __brelse(buf);
}

/**
 * jbd2_journal_try_to_free_buffers() - try to free page buffers.
 * @journal: journal for operation
 * @folio: Folio to detach data from.
 *
 * For all the buffers on this page,
 * if they are fully written out ordered data, move them onto BUF_CLEAN
 * so try_to_free_buffers() can reap them.
 *
 * This function returns non-zero if we wish try_to_free_buffers()
 * to be called. We do this if the page is releasable by try_to_free_buffers().
 * We also do it if the page has locked or dirty buffers and the caller wants
 * us to perform sync or async writeout.
 *
 * This complicates JBD locking somewhat.  We aren't protected by the
 * BKL here.  We wish to remove the buffer from its committing or
 * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
 *
 * This may *change* the value of transaction_t->t_datalist, so anyone
 * who looks at t_datalist needs to lock against this function.
 *
 * Even worse, someone may be doing a jbd2_journal_dirty_data on this
 * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
 * will come out of the lock with the buffer dirty, which makes it
 * ineligible for release here.
 *
 * Who else is affected by this?  hmm...  Really the only contender
 * is do_get_write_access() - it could be looking at the buffer while
 * journal_try_to_free_buffer() is changing its state.  But that
 * cannot happen because we never reallocate freed data as metadata
 * while the data is part of a transaction.  Yes?
 *
 * Return false on failure, true on success
 */
bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
{
        struct buffer_head *head;
        struct buffer_head *bh;

        J_ASSERT(folio_test_locked(folio));

        /* Walk the folio's circular buffer list exactly once */
        head = folio_buffers(folio);
        bh = head;
        do {
                /*
                 * Hold a private reference on the journal_head while we
                 * look at it, so that individual calls to
                 * jbd2_journal_put_journal_head() elsewhere need no extra
                 * locking.
                 */
                struct journal_head *jh = jbd2_journal_grab_journal_head(bh);

                if (jh != NULL) {
                        spin_lock(&jh->b_state_lock);
                        if (!jh->b_transaction && !jh->b_next_transaction) {
                                spin_lock(&journal->j_list_lock);
                                /* Remove written-back checkpointed metadata buffer */
                                if (jh->b_cp_transaction != NULL)
                                        jbd2_journal_try_remove_checkpoint(jh);
                                spin_unlock(&journal->j_list_lock);
                        }
                        spin_unlock(&jh->b_state_lock);
                        jbd2_journal_put_journal_head(jh);

                        /* Still has a journal_head attached: folio is busy */
                        if (buffer_jbd(bh))
                                return false;
                }

                bh = bh->b_this_page;
        } while (bh != head);

        return try_to_free_buffers(folio);
}

/*
 * Get rid of a buffer that is no longer needed.
 *
 * If the buffer sits on a checkpoint list (jh->b_cp_transaction is set) it
 * is refiled onto @transaction's BJ_Forget list, which pins the buffer (and
 * hence its checkpointing transaction) until @transaction commits.
 * Otherwise it is unfiled and a journal_head reference is dropped.
 *
 * Returns non-zero if JBD no longer has an interest in the buffer, zero if
 * it was moved to the forget list.
 *
 * Called under j_list_lock.
 *
 * Called under jh->b_state_lock.
 */
static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
{
        struct buffer_head *bh = jh2bh(jh);

        if (!jh->b_cp_transaction) {
                JBUFFER_TRACE(jh, "on running transaction");
                __jbd2_journal_unfile_buffer(jh);
                jbd2_journal_put_journal_head(jh);
                return 1;
        }

        JBUFFER_TRACE(jh, "on running+cp transaction");
        __jbd2_journal_temp_unlink_buffer(jh);
        /*
         * The buffer will not be written any more; clear the dirty bit so
         * the checks in __jbd2_journal_file_buffer are not confused.
         */
        clear_buffer_dirty(bh);
        __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
        return 0;
}

/*
 * jbd2_journal_invalidate_folio
 *
 * This code is tricky.  It has a number of cases to deal with.
 *
 * There are two invariants which this code relies on:
 *
 * i_size must be updated on disk before we start calling invalidate_folio
 * on the data.
 *
 *  This is done in ext3 by defining an ext3_setattr method which
 *  updates i_size before truncate gets going.  By maintaining this
 *  invariant, we can be sure that it is safe to throw away any buffers
 *  attached to the current transaction: once the transaction commits,
 *  we know that the data will not be needed.
 *
 *  Note however that we can *not* throw away data belonging to the
 *  previous, committing transaction!
 *
 * Any disk blocks which *are* part of the previous, committing
 * transaction (and which therefore cannot be discarded immediately) are
 * not going to be reused in the new running transaction
 *
 *  The bitmap committed_data images guarantee this: any block which is
 *  allocated in one transaction and removed in the next will be marked
 *  as in-use in the committed_data bitmap, so cannot be reused until
 *  the next transaction to delete the block commits.  This means that
 *  leaving committing buffers dirty is quite safe: the disk blocks
 *  cannot be reallocated to a different file and so buffer aliasing is
 *  not possible.
 *
 *
 * The above applies mainly to ordered data mode.  In writeback mode we
 * don't make guarantees about the order in which data hits disk --- in
 * particular we don't guarantee that new dirty data is flushed before
 * transaction commit --- so it is always safe just to discard data
 * immediately in that mode.  --sct
 */

/*
 * The journal_unmap_buffer helper function returns zero if the buffer
 * concerned remains pinned as an anonymous buffer belonging to an older
 * transaction.
 *
 * We're outside-transaction here.  Either or both of j_running_transaction
 * and j_committing_transaction may be NULL.
 *
 * Return values:
 *  - -EBUSY when the buffer is on the committing transaction and
 *    @partial_page is set, unless bh->b_bdev is already NULL, in which
 *    case 0 is returned;
 *  - 0 when the buffer is on the committing transaction and @partial_page
 *    is clear (the buffer is only marked freed);
 *  - otherwise the "may free" result: 1, or whatever __dispose_buffer()
 *    returned, after the buffer_head state bits have been cleared.
 */
static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
                                int partial_page)
{
        transaction_t *transaction;
        struct journal_head *jh;
        int may_free = 1;

        BUFFER_TRACE(bh, "entry");

        /*
         * It is safe to proceed here without the j_list_lock because the
         * buffers cannot be stolen by try_to_free_buffers as long as we are
         * holding the page lock. --sct
         */

        jh = jbd2_journal_grab_journal_head(bh);
        if (!jh)
                goto zap_buffer_unlocked;

        /* OK, we have data buffer in journaled mode */
        /* Lock order: j_state_lock, then b_state_lock, then j_list_lock */
        write_lock(&journal->j_state_lock);
        spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);

        /*
         * We cannot remove the buffer from checkpoint lists until the
         * transaction adding inode to orphan list (let's call it T)
         * is committed.  Otherwise if the transaction changing the
         * buffer would be cleaned from the journal before T is
         * committed, a crash will cause that the correct contents of
         * the buffer will be lost.  On the other hand we have to
         * clear the buffer dirty bit at latest at the moment when the
         * transaction marking the buffer as freed in the filesystem
         * structures is committed because from that moment on the
         * block can be reallocated and used by a different page.
         * Since the block hasn't been freed yet but the inode has
         * already been added to orphan list, it is safe for us to add
         * the buffer to BJ_Forget list of the newest transaction.
         *
         * Also we have to clear buffer_mapped flag of a truncated buffer
         * because the buffer_head may be attached to the page straddling
         * i_size (can happen only when blocksize < pagesize) and thus the
         * buffer_head can be reused when the file is extended again. So we end
         * up keeping around invalidated buffers attached to transactions'
         * BJ_Forget list just to stop checkpointing code from cleaning up
         * the transaction this buffer was modified in.
         */
        transaction = jh->b_transaction;
        if (transaction == NULL) {
                /* First case: not on any transaction.  If it
                 * has no checkpoint link, then we can zap it:
                 * it's a writeback-mode buffer so we don't care
                 * if it hits disk safely. */
                if (!jh->b_cp_transaction) {
                        JBUFFER_TRACE(jh, "not on any transaction: zap");
                        goto zap_buffer;
                }

                if (!buffer_dirty(bh)) {
                        /* bdflush has written it.  We can drop it now */
                        __jbd2_journal_remove_checkpoint(jh);
                        goto zap_buffer;
                }

                /* OK, it must be in the journal but still not
                 * written fully to disk: it's metadata or
                 * journaled data... */

                if (journal->j_running_transaction) {
                        /* ... and once the current transaction has
                         * committed, the buffer won't be needed any
                         * longer. */
                        JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
                        may_free = __dispose_buffer(jh,
                                        journal->j_running_transaction);
                        goto zap_buffer;
                } else {
                        /* There is no currently-running transaction. So the
                         * orphan record which we wrote for this file must have
                         * passed into commit.  We must attach this buffer to
                         * the committing transaction, if it exists. */
                        if (journal->j_committing_transaction) {
                                JBUFFER_TRACE(jh, "give to committing trans");
                                may_free = __dispose_buffer(jh,
                                        journal->j_committing_transaction);
                                goto zap_buffer;
                        } else {
                                /* The orphan record's transaction has
                                 * committed.  We can cleanse this buffer */
                                clear_buffer_jbddirty(bh);
                                __jbd2_journal_remove_checkpoint(jh);
                                goto zap_buffer;
                        }
                }
        } else if (transaction == journal->j_committing_transaction) {
                JBUFFER_TRACE(jh, "on committing transaction");
                /*
                 * The buffer is committing, we simply cannot touch
                 * it. If the page is straddling i_size we have to wait
                 * for commit and try again.
                 */
                if (partial_page) {
                        spin_unlock(&journal->j_list_lock);
                        spin_unlock(&jh->b_state_lock);
                        write_unlock(&journal->j_state_lock);
                        jbd2_journal_put_journal_head(jh);
                        /* Already zapped buffer? Nothing to do... */
                        if (!bh->b_bdev)
                                return 0;
                        return -EBUSY;
                }
                /*
                 * OK, buffer won't be reachable after truncate. We just clear
                 * b_modified to not confuse transaction credit accounting, and
                 * set b_next_transaction to the running transaction (if there
                 * is one) and mark buffer as freed so that commit code knows
                 * it should clear dirty bits when it is done with the buffer.
                 */
                set_buffer_freed(bh);
                if (journal->j_running_transaction && buffer_jbddirty(bh))
                        jh->b_next_transaction = journal->j_running_transaction;
                jh->b_modified = 0;
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&jh->b_state_lock);
                write_unlock(&journal->j_state_lock);
                jbd2_journal_put_journal_head(jh);
                /* The buffer_head state bits are left alone on this path */
                return 0;
        } else {
                /* Good, the buffer belongs to the running transaction.
                 * We are writing our own transaction's data, not any
                 * previous one's, so it is safe to throw it away
                 * (remember that we expect the filesystem to have set
                 * i_size already for this truncate so recovery will not
                 * expose the disk blocks we are discarding here.) */
                J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
                JBUFFER_TRACE(jh, "on running transaction");
                may_free = __dispose_buffer(jh, transaction);
        }

zap_buffer:
        /*
         * This is tricky. Although the buffer is truncated, it may be reused
         * if blocksize < pagesize and it is attached to the page straddling
         * EOF. Since the buffer might have been added to BJ_Forget list of the
         * running transaction, journal_get_write_access() won't clear
         * b_modified and credit accounting gets confused. So clear b_modified
         * here.
         */
        jh->b_modified = 0;
        /* Release the three locks in the reverse order they were taken */
        spin_unlock(&journal->j_list_lock);
        spin_unlock(&jh->b_state_lock);
        write_unlock(&journal->j_state_lock);
        jbd2_journal_put_journal_head(jh);
zap_buffer_unlocked:
        /* Reset the buffer_head state; no locks are held from here on */
        clear_buffer_dirty(bh);
        J_ASSERT_BH(bh, !buffer_jbddirty(bh));
        clear_buffer_mapped(bh);
        clear_buffer_req(bh);
        clear_buffer_new(bh);
        clear_buffer_delay(bh);
        clear_buffer_unwritten(bh);
        bh->b_bdev = NULL;
        return may_free;
}

/**
 * jbd2_journal_invalidate_folio() - detach journal state from part of a folio
 * @journal: journal to use for flush...
 * @folio:   locked folio whose buffers are to be invalidated
 * @offset:  start of the range to invalidate
 * @length:  length of the range to invalidate
 *
 * Reap the folio's buffers that lie within the specified range.  When the
 * whole folio is covered and every buffer could be released, the buffers
 * are also freed from the folio.
 *
 * Return: 0 on success.  Can return -EBUSY if buffers are part of the
 * committing transaction and the page is straddling i_size (only part of
 * the folio is being invalidated).  Caller then has to wait for current
 * commit and try again.
 */
int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio,
                                size_t offset, size_t length)
{
        struct buffer_head *head, *bh, *next;
        unsigned int range_end = offset + length;
        unsigned int bh_start = 0;
        int partial = (offset || length < folio_size(folio));
        int can_free = 1;
        int rc = 0;

        BUG_ON(!folio_test_locked(folio));

        head = folio_buffers(folio);
        if (head == NULL)
                return 0;

        BUG_ON(range_end > folio_size(folio) || range_end < length);

        /* We will potentially be playing with lists other than just the
         * data lists (especially for journaled data mode), so be
         * cautious in our locking. */

        bh = head;
        do {
                unsigned int bh_end = bh_start + bh->b_size;

                /* Pick up the successor before bh is handed to the helper */
                next = bh->b_this_page;

                /* This buffer extends past the range: we are done */
                if (bh_end > range_end)
                        return 0;

                if (bh_start >= offset) {
                        /* Buffer lies entirely inside the invalidated range */
                        lock_buffer(bh);
                        rc = journal_unmap_buffer(journal, bh, partial);
                        unlock_buffer(bh);
                        if (rc < 0)
                                return rc;
                        can_free &= rc;
                }

                bh_start = bh_end;
                bh = next;
        } while (bh != head);

        if (!partial && can_free && try_to_free_buffers(folio))
                J_ASSERT(!folio_buffers(folio));

        return 0;
}

/*
 * File a buffer on the given transaction list.
 *
 * Must be called with jh->b_state_lock and the journal's j_list_lock held.
 * Nothing happens when the buffer is already attached to a transaction and
 * sits on list @jlist.  A buffer not yet attached to any transaction has a
 * journal head reference grabbed for it; a buffer already attached is first
 * unlinked from its current list.
 */
void __jbd2_journal_file_buffer(struct journal_head *jh,
                        transaction_t *transaction, int jlist)
{
        struct buffer_head *bh = jh2bh(jh);
        struct journal_head **target = NULL;
        int redirty = 0;

        lockdep_assert_held(&jh->b_state_lock);
        assert_spin_locked(&transaction->t_journal->j_list_lock);

        J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
        J_ASSERT_JH(jh, jh->b_transaction == transaction ||
                                jh->b_transaction == NULL);

        /* Already filed where the caller wants it. */
        if (jh->b_transaction && jh->b_jlist == jlist)
                return;

        switch (jlist) {
        case BJ_Metadata:
        case BJ_Reserved:
        case BJ_Shadow:
        case BJ_Forget:
                /*
                 * For metadata buffers, the dirty state is tracked in
                 * buffer_jbddirty instead of buffer_dirty. A dirty bit should
                 * not be seen here because do_get_write_access clears it,
                 * but e.g. tune2fs can modify the sb and set the dirty bit
                 * at any time, so handle that gracefully.
                 */
                if (buffer_dirty(bh))
                        warn_dirty_buffer(bh);
                if (test_clear_buffer_dirty(bh) ||
                    test_clear_buffer_jbddirty(bh))
                        redirty = 1;
                break;
        default:
                break;
        }

        if (!jh->b_transaction)
                jbd2_journal_grab_journal_head(bh);
        else
                __jbd2_journal_temp_unlink_buffer(jh);
        jh->b_transaction = transaction;

        /* BJ_None has no list: only sanity-check and leave. */
        if (jlist == BJ_None) {
                J_ASSERT_JH(jh, !jh->b_committed_data);
                J_ASSERT_JH(jh, !jh->b_frozen_data);
                return;
        }

        switch (jlist) {
        case BJ_Metadata:
                transaction->t_nr_buffers++;
                target = &transaction->t_buffers;
                break;
        case BJ_Forget:
                target = &transaction->t_forget;
                break;
        case BJ_Shadow:
                target = &transaction->t_shadow_list;
                break;
        case BJ_Reserved:
                target = &transaction->t_reserved_list;
                break;
        }

        __blist_add_buffer(target, jh);
        jh->b_jlist = jlist;

        /* Carry the dirty state cleared above over as jbddirty. */
        if (redirty)
                set_buffer_jbddirty(bh);
}

/*
 * Locked wrapper around __jbd2_journal_file_buffer(): takes
 * jh->b_state_lock and then the journal's j_list_lock around the call.
 */
void jbd2_journal_file_buffer(struct journal_head *jh,
                                transaction_t *transaction, int jlist)
{
        journal_t *journal = transaction->t_journal;

        spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);

        __jbd2_journal_file_buffer(jh, transaction, jlist);

        spin_unlock(&journal->j_list_lock);
        spin_unlock(&jh->b_state_lock);
}

/*
 * Remove a buffer from its current buffer list in preparation for
 * dropping it from its current transaction entirely.  If the buffer has
 * already started to be used by a subsequent transaction, refile the
 * buffer on that transaction's metadata list.
 *
 * Called under j_list_lock
 * Called under jh->b_state_lock
 *
 * When this function returns true, there's no next transaction to refile to
 * and the caller has to drop jh reference through
 * jbd2_journal_put_journal_head().
 */
bool __jbd2_journal_refile_buffer(struct journal_head *jh)
{
        int was_dirty, jlist;
        struct buffer_head *bh = jh2bh(jh);

        /* Both locks named in the header comment are checked here. */
        lockdep_assert_held(&jh->b_state_lock);
        if (jh->b_transaction)
                assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);

        /* If the buffer is now unused, just drop it. */
        if (jh->b_next_transaction == NULL) {
                __jbd2_journal_unfile_buffer(jh);
                return true;
        }

        /*
         * It has been modified by a later transaction: add it to the new
         * transaction's metadata list.
         */

        /* Remember the jbddirty bit so it can be restored after refiling. */
        was_dirty = test_clear_buffer_jbddirty(bh);
        __jbd2_journal_temp_unlink_buffer(jh);

        /*
         * b_transaction must be set, otherwise the new b_transaction won't
         * be holding jh reference
         */
        J_ASSERT_JH(jh, jh->b_transaction != NULL);

        /*
         * We set b_transaction here because b_next_transaction will inherit
         * our jh reference and thus __jbd2_journal_file_buffer() must not
         * take a new one.
         */
        WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
        WRITE_ONCE(jh->b_next_transaction, NULL);
        /*
         * Choose the destination list: freed buffers go to BJ_Forget,
         * modified ones to BJ_Metadata, everything else to BJ_Reserved.
         */
        if (buffer_freed(bh))
                jlist = BJ_Forget;
        else if (jh->b_modified)
                jlist = BJ_Metadata;
        else
                jlist = BJ_Reserved;
        __jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
        /* The transaction taking over the buffer is expected to be running. */
        J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);

        if (was_dirty)
                set_buffer_jbddirty(bh);
        return false;
}

/*
 * __jbd2_journal_refile_buffer() with the necessary locking added:
 * jh->b_state_lock and journal->j_list_lock are held around the call.
 * When the helper reports that there was no next transaction to refile to,
 * the journal head reference is dropped here after both locks are released.
 *
 * The jh and bh may be freed by this call.
 */
void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
        bool need_put;

        spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);

        need_put = __jbd2_journal_refile_buffer(jh);

        spin_unlock(&jh->b_state_lock);
        spin_unlock(&journal->j_list_lock);

        if (!need_put)
                return;

        jbd2_journal_put_journal_head(jh);
}

/*
 * File inode in the inode list of the handle's transaction.
 *
 * ORs @flags into jinode->i_flags, widens the inode's dirty range so that it
 * covers [@start_byte, @end_byte], and attaches the inode to the handle's
 * transaction unless it is already attached to it.  All of this happens
 * under journal->j_list_lock.
 *
 * Returns 0 on success or -EROFS when the handle has been aborted.
 */
static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode,
                unsigned long flags, loff_t start_byte, loff_t end_byte)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;

        if (is_handle_aborted(handle))
                return -EROFS;
        journal = transaction->t_journal;

        jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
                        transaction->t_tid);

        spin_lock(&journal->j_list_lock);
        jinode->i_flags |= flags;

        /* A zero i_dirty_end means no dirty range has been recorded yet. */
        if (!jinode->i_dirty_end) {
                jinode->i_dirty_start = start_byte;
                jinode->i_dirty_end = end_byte;
        } else {
                jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte);
                jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte);
        }

        /* Only do the attach work if the inode is not where we need it. */
        if (jinode->i_transaction != transaction &&
            jinode->i_next_transaction != transaction) {
                /*
                 * This variable is only ever set to 1, so the test is safe.
                 * Since t_need_data_flush is likely to be set already, test
                 * first to save some cacheline bouncing.
                 */
                if (!transaction->t_need_data_flush)
                        transaction->t_need_data_flush = 1;

                if (jinode->i_transaction) {
                        /*
                         * On some different transaction's list - should be
                         * the committing one.
                         */
                        J_ASSERT(jinode->i_next_transaction == NULL);
                        J_ASSERT(jinode->i_transaction ==
                                                journal->j_committing_transaction);
                        jinode->i_next_transaction = transaction;
                } else {
                        /* Not on any transaction list... */
                        J_ASSERT(!jinode->i_next_transaction);
                        jinode->i_transaction = transaction;
                        list_add(&jinode->i_list, &transaction->t_inode_list);
                }
        }

        spin_unlock(&journal->j_list_lock);

        return 0;
}

/*
 * File @jinode on the handle's transaction with both JI_WRITE_DATA and
 * JI_WAIT_DATA set for the byte range starting at @start_byte and spanning
 * @length bytes.  Returns the result of jbd2_journal_file_inode().
 */
int jbd2_journal_inode_ranged_write(handle_t *handle,
                struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
{
        /* The helper takes an inclusive end offset. */
        loff_t last_byte = start_byte + length - 1;

        return jbd2_journal_file_inode(handle, jinode,
                        JI_WRITE_DATA | JI_WAIT_DATA,
                        start_byte, last_byte);
}

/*
 * File @jinode on the handle's transaction with JI_WAIT_DATA set for the
 * byte range starting at @start_byte and spanning @length bytes.  Returns
 * the result of jbd2_journal_file_inode().
 */
int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode,
                loff_t start_byte, loff_t length)
{
        /* The helper takes an inclusive end offset. */
        loff_t last_byte = start_byte + length - 1;

        return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA,
                        start_byte, last_byte);
}

/*
 * File truncate and transaction commit interact with each other in a
 * non-trivial way.  If a transaction writing data block A is
 * committing, we cannot discard the data by truncate until we have
 * written them.  Otherwise if we crashed after the transaction with
 * write has committed but before the transaction with truncate has
 * committed, we could see stale data in block A.  This function is a
 * helper to solve this problem.  It starts writeout of the truncated
 * part in case it is in the committing transaction.
 *
 * Filesystem code must call this function when inode is journaled in
 * ordered mode before truncation happens and after the inode has been
 * placed on orphan list with the new inode size. The second condition
 * avoids the race that someone writes new data and we start
 * committing the transaction after this function has been called but
 * before a transaction for truncate is started (and furthermore it
 * allows us to optimize the case where the addition to orphan list
 * happens in the same transaction as write --- we don't have to write
 * any data in such case).
 */
int jbd2_journal_begin_ordered_truncate(journal_t *journal,
                                        struct jbd2_inode *jinode,
                                        loff_t new_size)
{
        transaction_t *committing, *owner;
        int err;

        /* Quick unlocked check: inode not on any transaction, nothing to do. */
        if (!jinode->i_transaction)
                return 0;

        /*
         * Locks are here just to force reading of recent values, it is
         * enough that the transaction was not committing before we started
         * a transaction adding the inode to orphan list.
         */
        read_lock(&journal->j_state_lock);
        committing = journal->j_committing_transaction;
        read_unlock(&journal->j_state_lock);

        spin_lock(&journal->j_list_lock);
        owner = jinode->i_transaction;
        spin_unlock(&journal->j_list_lock);

        /* Writeout is only needed when the inode's transaction is committing. */
        if (owner != committing)
                return 0;

        err = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
                                       new_size, LLONG_MAX);
        if (err)
                jbd2_journal_abort(journal, err);

        return err;
}






































































































    1 













    1 




    1 













    1 
    1 






























































    1 









































    1 












    1 































    1 














    1 


















    1 














    1 












    1 





    1 






    1 





    1 


    1 






    1 














    1 
































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 */

#include <linux/fs.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"

/*
 * compare_attr - Order an attribute against a (type, name) key.
 *
 * The type codes are compared first; the names are compared through
 * ntfs_cmp_names() only when the type codes are equal.
 *
 * Return: negative, zero or positive, depending on how @left sorts
 * relative to the key.
 */
static inline int compare_attr(const struct ATTRIB *left, enum ATTR_TYPE type,
                               const __le16 *name, u8 name_len,
                               const u16 *upcase)
{
        int type_diff = le32_to_cpu(left->type) - le32_to_cpu(type);

        /* Different type codes decide the order on their own. */
        return type_diff ? type_diff :
                           ntfs_cmp_names(attr_name(left), left->name_len,
                                          name, name_len, upcase, true);
}

/*
 * mi_new_attt_id
 *
 * Fast path: while the record's next_attr_id is below 0x7FFF, hand it out
 * and advance the counter.  Otherwise scan the attributes of the record for
 * the smallest id that is not in use, and reset next_attr_id to one past
 * the largest id seen during the final pass.
 *
 * Return: Attribute id (little endian) for the new attribute.
 */
static __le16 mi_new_attt_id(struct mft_inode *mi)
{
        struct MFT_REC *rec = mi->mrec;
        struct ATTRIB *attr = NULL;
        __le16 next = rec->next_attr_id;
        u16 candidate = le16_to_cpu(next);
        u16 highest = 0;

        if (candidate < 0x7FFF) {
                rec->next_attr_id = cpu_to_le16(candidate + 1);
                return next;
        }

        /* One record can store up to 1024/24 ~= 42 attributes. */
        candidate = 0;

        while ((attr = mi_enum_attr(mi, attr)) != NULL) {
                u16 cur = le16_to_cpu(attr->id);

                if (cur == candidate) {
                        /* Id is taken: try the next one, rescan from start. */
                        candidate += 1;
                        attr = NULL;
                        continue;
                }

                if (highest < cur)
                        highest = cur;
        }

        /* A full pass found no attribute using 'candidate'. */
        rec->next_attr_id = cpu_to_le16(highest + 1);
        mi->dirty = true;
        return cpu_to_le16(candidate);
}

/*
 * mi_get - Allocate an mft_inode for record @rno and read it.
 *
 * On success the new object is stored in *@mi and 0 is returned.  On
 * failure everything allocated here is released again, *@mi is left
 * untouched and a negative error code is returned.
 */
int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi)
{
        struct mft_inode *node;
        int err;

        node = kzalloc(sizeof(struct mft_inode), GFP_NOFS);
        if (!node)
                return -ENOMEM;

        err = mi_init(node, sbi, rno);
        if (err) {
                /* Record buffer allocation failed; only 'node' is freed. */
                kfree(node);
                return err;
        }

        err = mi_read(node, false);
        if (err) {
                mi_put(node);
                return err;
        }

        *mi = node;
        return 0;
}

/*
 * mi_put - Release an mft_inode that was allocated on the heap
 * (as mi_get() does).
 *
 * Calls mi_clear() on @mi and then frees the structure itself.
 */
void mi_put(struct mft_inode *mi)
{
        mi_clear(mi);
        kfree(mi);
}

/*
 * mi_init - Bind @mi to record @rno of @sbi and allocate its record buffer.
 *
 * The buffer is sbi->record_size bytes and is not zeroed.
 *
 * Return: 0 on success, -ENOMEM when the buffer cannot be allocated.
 */
int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno)
{
        mi->sbi = sbi;
        mi->rno = rno;
        mi->mrec = kmalloc(sbi->record_size, GFP_NOFS);

        return mi->mrec ? 0 : -ENOMEM;
}

/*
 * mi_read - Read MFT data.
 *
 * Reads record mi->rno into mi->mrec.  If the first read fails with
 * -ENOENT, the runs covering the record are loaded and the read is retried
 * once.  A read that ends with -E_NTFS_FIXUP is accepted, but the record is
 * marked dirty.
 *
 * @is_mft: when true, the MFT run lock is not taken around the reads.
 *
 * Return: 0 on success, -EINVAL when the record's 'total' field does not
 * match sbi->record_size or the record is reported corrupted, otherwise the
 * error returned by the read / run loading helpers.
 */
int mi_read(struct mft_inode *mi, bool is_mft)
{
        int err;
        struct MFT_REC *rec = mi->mrec;
        struct ntfs_sb_info *sbi = mi->sbi;
        u32 bpr = sbi->record_size;
        /* Byte offset of the record. */
        u64 vbo = (u64)mi->rno << sbi->record_bits;
        struct ntfs_inode *mft_ni = sbi->mft.ni;
        struct runs_tree *run = mft_ni ? &mft_ni->file.run : NULL;
        struct rw_semaphore *rw_lock = NULL;

        /* The run lock is only used once mounted and for non-MFT callers. */
        if (is_mounted(sbi)) {
                if (!is_mft && mft_ni) {
                        rw_lock = &mft_ni->file.run_lock;
                        down_read(rw_lock);
                }
        }

        err = ntfs_read_bh(sbi, run, vbo, &rec->rhdr, bpr, &mi->nb);
        if (rw_lock)
                up_read(rw_lock);
        if (!err)
                goto ok;

        /* Data was read, but the record must be written back later. */
        if (err == -E_NTFS_FIXUP) {
                mi->dirty = true;
                goto ok;
        }

        if (err != -ENOENT)
                goto out;

        /* -ENOENT: load the runs covering this record, then retry the read. */
        if (rw_lock) {
                ni_lock(mft_ni);
                down_write(rw_lock);
        }
        err = attr_load_runs_vcn(mft_ni, ATTR_DATA, NULL, 0, run,
                                 vbo >> sbi->cluster_bits);
        if (rw_lock) {
                up_write(rw_lock);
                ni_unlock(mft_ni);
        }
        if (err)
                goto out;

        if (rw_lock)
                down_read(rw_lock);
        err = ntfs_read_bh(sbi, run, vbo, &rec->rhdr, bpr, &mi->nb);
        if (rw_lock)
                up_read(rw_lock);

        if (err == -E_NTFS_FIXUP) {
                mi->dirty = true;
                goto ok;
        }
        if (err)
                goto out;

ok:
        /* Check field 'total' only here. */
        if (le32_to_cpu(rec->total) != bpr) {
                err = -EINVAL;
                goto out;
        }

        return 0;

out:
        /* Corruption is logged, recorded in the volume state and mapped to -EINVAL. */
        if (err == -E_NTFS_CORRUPT) {
                ntfs_err(sbi->sb, "mft corrupted");
                ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
                err = -EINVAL;
        }

        return err;
}

/*
 * mi_enum_attr - start/continue attributes enumeration in record.
 *
 * NOTE: mi->mrec - memory of size sbi->record_size
 * here we sure that mi->mrec->total == sbi->record_size (see mi_read)
 */
struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
{
        const struct MFT_REC *rec = mi->mrec;
        u32 used = le32_to_cpu(rec->used);
        u32 t32, off, asize, prev_type;
        u16 t16;
        u64 data_size, alloc_size, tot_size;

        if (!attr) {
                u32 total = le32_to_cpu(rec->total);

                off = le16_to_cpu(rec->attr_off);

                if (used > total)
                        return NULL;

                if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 ||
                    !IS_ALIGNED(off, 4)) {
                        return NULL;
                }

                /* Skip non-resident records. */
                if (!is_rec_inuse(rec))
                        return NULL;

                prev_type = 0;
                attr = Add2Ptr(rec, off);
        } else {
                /* Check if input attr inside record. */
                off = PtrOffset(rec, attr);
                if (off >= used)
                        return NULL;

                asize = le32_to_cpu(attr->size);
                if (asize < SIZEOF_RESIDENT) {
                        /* Impossible 'cause we should not return such attribute. */
                        return NULL;
                }

                /* Overflow check. */
                if (off + asize < off)
                        return NULL;

                prev_type = le32_to_cpu(attr->type);
                attr = Add2Ptr(attr, asize);
                off += asize;
        }

        asize = le32_to_cpu(attr->size);

        /* Can we use the first field (attr->type). */
        if (off + 8 > used) {
                static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8);
                return NULL;
        }

        if (attr->type == ATTR_END) {
                /* End of enumeration. */
                return NULL;
        }

        /* 0x100 is last known attribute for now. */
        t32 = le32_to_cpu(attr->type);
        if (!t32 || (t32 & 0xf) || (t32 > 0x100))
                return NULL;

        /* attributes in record must be ordered by type */
        if (t32 < prev_type)
                return NULL;

        /* Check overflow and boundary. */
        if (off + asize < off || off + asize > used)
                return NULL;

        /* Check size of attribute. */
        if (!attr->non_res) {
                /* Check resident fields. */
                if (asize < SIZEOF_RESIDENT)
                        return NULL;

                t16 = le16_to_cpu(attr->res.data_off);
                if (t16 > asize)
                        return NULL;

                if (le32_to_cpu(attr->res.data_size) > asize - t16)
                        return NULL;

                t32 = sizeof(short) * attr->name_len;
                if (t32 && le16_to_cpu(attr->name_off) + t32 > t16)
                        return NULL;

                return attr;
        }

        /* Check nonresident fields. */
        if (attr->non_res != 1)
                return NULL;

        t16 = le16_to_cpu(attr->nres.run_off);
        if (t16 > asize)
                return NULL;

        t32 = sizeof(short) * attr->name_len;
        if (t32 && le16_to_cpu(attr->name_off) + t32 > t16)
                return NULL;

        /* Check start/end vcn. */
        if (le64_to_cpu(attr->nres.svcn) > le64_to_cpu(attr->nres.evcn) + 1)
                return NULL;

        data_size = le64_to_cpu(attr->nres.data_size);
        if (le64_to_cpu(attr->nres.valid_size) > data_size)
                return NULL;

        alloc_size = le64_to_cpu(attr->nres.alloc_size);
        if (data_size > alloc_size)
                return NULL;

        t32 = mi->sbi->cluster_mask;
        if (alloc_size & t32)
                return NULL;

        if (!attr->nres.svcn && is_attr_ext(attr)) {
                /* First segment of sparse/compressed attribute */
                if (asize + 8 < SIZEOF_NONRESIDENT_EX)
                        return NULL;

                tot_size = le64_to_cpu(attr->nres.total_size);
                if (tot_size & t32)
                        return NULL;

                if (tot_size > alloc_size)
                        return NULL;
        } else {
                if (asize + 8 < SIZEOF_NONRESIDENT)
                        return NULL;

                if (attr->nres.c_unit)
                        return NULL;
        }

        return attr;
}

/*
 * mi_find_attr - Find the attribute by type and name and id.
 *
 * Enumeration starts after @attr (or at the first attribute when @attr is
 * NULL).  The search gives up as soon as an attribute with a type greater
 * than @type is met.  Names are compared byte for byte; @id is only
 * compared when it is not NULL.
 *
 * Return: the matching attribute or NULL.
 */
struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr,
                            enum ATTR_TYPE type, const __le16 *name,
                            u8 name_len, const __le16 *id)
{
        const u32 wanted = le32_to_cpu(type);

        while ((attr = mi_enum_attr(mi, attr)) != NULL) {
                u32 cur = le32_to_cpu(attr->type);

                /* Past the requested type: it is not in this record. */
                if (cur > wanted)
                        return NULL;

                if (cur < wanted)
                        continue;

                if (attr->name_len != name_len)
                        continue;

                if (name_len &&
                    memcmp(attr_name(attr), name, name_len * sizeof(short)))
                        continue;

                if (id && *id != attr->id)
                        continue;

                return attr;
        }

        return NULL;
}

/*
 * mi_write - Write the record back if it is dirty.
 *
 * A clean record is left alone.  After a successful write the dirty flag is
 * cleared, and NTFS_FLAGS_MFTMIRR is raised in sbi->flags when the record
 * number is below sbi->mft.recs_mirr.
 *
 * Return: 0 on success or when nothing had to be written, otherwise the
 * error from ntfs_write_bh().
 */
int mi_write(struct mft_inode *mi, int wait)
{
        struct ntfs_sb_info *sbi = mi->sbi;
        int err;

        if (!mi->dirty)
                return 0;

        err = ntfs_write_bh(sbi, &mi->mrec->rhdr, &mi->nb, wait);
        if (err)
                return err;

        if (mi->rno < sbi->mft.recs_mirr)
                sbi->flags |= NTFS_FLAGS_MFTMIRR;

        mi->dirty = false;
        return 0;
}

/*
 * mi_format_new - Initialize @mi as a freshly formatted record @rno.
 *
 * The record body is copied from sbi->new_rec.  The sequence number is 1,
 * except that records below MFT_REC_FREE (other than MFT_REC_MFT) use their
 * record number, and a reused record that still carries the FILE signature
 * gets its old sequence number plus one (never zero).
 *
 * Return: 0 on success, otherwise the error from mi_init() or
 * ntfs_get_bh().
 */
int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno,
                  __le16 flags, bool is_mft)
{
        struct MFT_REC *rec;
        u64 vbo = (u64)rno << sbi->record_bits;
        u16 seq = 1;
        int err;

        err = mi_init(mi, sbi, rno);
        if (err)
                return err;

        rec = mi->mrec;

        if (rno != MFT_REC_MFT) {
                if (rno < MFT_REC_FREE) {
                        seq = rno;
                } else if (rno < sbi->mft.used && !mi_read(mi, is_mft) &&
                           rec->rhdr.sign == NTFS_FILE_SIGNATURE) {
                        /* Record is reused. Update its sequence number. */
                        seq = le16_to_cpu(rec->seq) + 1;
                        if (!seq)
                                seq = 1;
                }
        }

        memcpy(rec, sbi->new_rec, sbi->record_size);

        rec->seq = cpu_to_le16(seq);
        rec->flags = RECORD_FLAG_IN_USE | flags;
        if (MFTRECORD_FIXUP_OFFSET == MFTRECORD_FIXUP_OFFSET_3)
                rec->mft_record = cpu_to_le32(rno);

        mi->dirty = true;

        /* Buffers are only looked up when none are attached yet. */
        if (!mi->nb.nbufs) {
                struct ntfs_inode *ni = sbi->mft.ni;
                bool locked = is_mounted(sbi) && !is_mft;

                if (locked)
                        down_read(&ni->file.run_lock);

                err = ntfs_get_bh(sbi, &ni->file.run, vbo, sbi->record_size,
                                  &mi->nb);

                if (locked)
                        up_read(&ni->file.run_lock);
        }

        return err;
}

/*
 * mi_insert_attr - Reserve space for new attribute.
 *
 * Finds the position that keeps the attributes ordered (see compare_attr),
 * opens a zero-filled gap of @asize bytes there and fills in type, size,
 * name and a new attribute id.  The remaining fields are left for the
 * caller.
 *
 * Return: Not full constructed attribute or NULL if not possible to create
 * (no room in the record, or an equal non-indexed attribute exists).
 */
struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
                              const __le16 *name, u8 name_len, u32 asize,
                              u16 name_off)
{
        struct ntfs_sb_info *sbi = mi->sbi;
        struct MFT_REC *rec = mi->mrec;
        const u16 *upcase = sbi->upcase;
        u32 used = le32_to_cpu(rec->used);
        struct ATTRIB *pos;
        size_t to_move;
        __le16 new_id;

        /* Can we insert mi attribute? */
        if (used + asize > sbi->record_size)
                return NULL;

        /*
         * Scan through the list of attributes to find the point
         * at which we should insert it.
         */
        for (pos = mi_enum_attr(mi, NULL); pos; pos = mi_enum_attr(mi, pos)) {
                int diff = compare_attr(pos, type, name, name_len, upcase);

                if (diff < 0)
                        continue;

                /* An equal attribute may only coexist when it is indexed. */
                if (!diff && !is_attr_indexed(pos))
                        return NULL;
                break;
        }

        if (pos) {
                /* Insert before 'pos'. */
                to_move = used - PtrOffset(rec, pos);
        } else {
                /* Append: only the last 8 bytes of the used area move. */
                to_move = 8;
                pos = Add2Ptr(rec, used - 8);
        }

        /* Pick the id before the attribute list is modified. */
        new_id = mi_new_attt_id(mi);

        memmove(Add2Ptr(pos, asize), pos, to_move);
        memset(pos, 0, asize);

        pos->type = type;
        pos->size = cpu_to_le32(asize);
        pos->name_len = name_len;
        pos->name_off = cpu_to_le16(name_off);
        pos->id = new_id;

        memmove(Add2Ptr(pos, name_off), name, name_len * sizeof(short));
        rec->used = cpu_to_le32(used + asize);

        mi->dirty = true;

        return pos;
}

/*
 * mi_remove_attr - Remove the attribute from record.
 *
 * When @ni is given and the attribute is an indexed ATTR_NAME, the hard
 * link counter of the inode's base record is decremented too (unless it is
 * already zero).
 *
 * NOTE: The source attr will point to next attribute.
 *
 * Return: false if the attribute does not fit inside the used part of the
 * record, true once it has been removed.
 */
bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi,
                    struct ATTRIB *attr)
{
        struct MFT_REC *rec = mi->mrec;
        u32 attr_off = PtrOffset(rec, attr);
        u32 attr_len = le32_to_cpu(attr->size);
        u32 rec_used = le32_to_cpu(rec->used);

        if (attr_off + attr_len > rec_used)
                return false;

        if (ni && is_attr_indexed(attr) && attr->type == ATTR_NAME) {
                u16 links = le16_to_cpu(ni->mi.mrec->hard_links);

                /* A zero counter is a minor error, not critical: skip it. */
                if (links) {
                        ni->mi.mrec->hard_links = cpu_to_le16(links - 1);
                        ni->mi.dirty = true;
                }
        }

        /* Close the gap by pulling everything behind the attribute forward. */
        rec_used -= attr_len;
        memmove(attr, Add2Ptr(attr, attr_len), rec_used - attr_off);
        rec->used = cpu_to_le32(rec_used);
        mi->dirty = true;

        return true;
}

/*
 * mi_resize_attr - Grow or shrink an attribute inside its record.
 *
 * bytes = "new attribute size" - "old attribute size"
 *
 * The change is rounded up to a multiple of 8 bytes.  The attributes that
 * follow are moved accordingly; when growing, the added bytes are zeroed.
 * For resident attributes res.data_size is adjusted by the same amount.
 *
 * Return: false if the attribute lies outside the used area, the record has
 * no room to grow, or the attribute is smaller than the requested shrink;
 * true otherwise (including @bytes == 0).
 */
bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes)
{
        struct MFT_REC *rec = mi->mrec;
        u32 attr_off = PtrOffset(rec, attr);
        u32 rec_used = le32_to_cpu(rec->used);
        u32 old_size = le32_to_cpu(attr->size);
        u32 res_size = le32_to_cpu(attr->res.data_size);
        u32 new_size;
        int after_len = (int)(rec_used - attr_off - old_size);
        int delta;
        char *after;

        if (after_len < 0 || attr_off >= rec_used)
                return false;

        if (!bytes)
                return true;

        /* First byte behind the attribute. */
        after = Add2Ptr(attr, old_size);

        if (bytes < 0) {
                delta = ALIGN(-bytes, 8);
                if (delta > old_size)
                        return false;
                new_size = old_size - delta;
                memmove(after - delta, after, after_len);
                rec_used -= delta;
                res_size -= delta;
        } else {
                u32 rec_total = le32_to_cpu(rec->total);

                delta = ALIGN(bytes, 8);
                if (rec_used + delta > rec_total)
                        return false;
                new_size = old_size + delta;
                /* Move tail */
                memmove(after + delta, after, after_len);
                memset(after, 0, delta);
                rec_used += delta;
                res_size += delta;
        }

        rec->used = cpu_to_le32(rec_used);
        attr->size = cpu_to_le32(new_size);
        if (!attr->non_res)
                attr->res.data_size = cpu_to_le32(res_size);
        mi->dirty = true;

        return true;
}

/*
 * mi_pack_runs - Pack runs in MFT record.
 *
 * Packs runs from @run, starting at the attribute's svcn and covering up to
 * @len clusters, into the run area of @attr.  All free space of the record
 * is made available to the packer; afterwards the attribute size,
 * attr->nres.evcn and rec->used are updated to match what was packed.
 *
 * If failed record is not changed.
 *
 * Return: 0 on success, negative error from run_pack() otherwise.
 */
int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr,
                 struct runs_tree *run, CLST len)
{
        int err = 0;
        struct ntfs_sb_info *sbi = mi->sbi;
        u32 new_run_size;
        CLST plen;
        struct MFT_REC *rec = mi->mrec;
        CLST svcn = le64_to_cpu(attr->nres.svcn);
        u32 used = le32_to_cpu(rec->used);
        u32 aoff = PtrOffset(rec, attr);
        u32 asize = le32_to_cpu(attr->size);
        /* First byte behind the attribute. */
        char *next = Add2Ptr(attr, asize);
        u16 run_off = le16_to_cpu(attr->nres.run_off);
        /* Current size of the run area inside the attribute. */
        u32 run_size = asize - run_off;
        /* Bytes of the used area that follow the attribute. */
        u32 tail = used - aoff - asize;
        /* Free bytes left in the record. */
        u32 dsize = sbi->record_size - used;

        /* Make a maximum gap in current record. */
        memmove(next + dsize, next, tail);

        /* Pack as much as possible. */
        err = run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size + dsize,
                       &plen);
        if (err < 0) {
                /* Undo the gap so the record layout is as before. */
                memmove(next, next + dsize, tail);
                return err;
        }

        /* Non-negative result is used as the packed size; round it up to 8. */
        new_run_size = ALIGN(err, 8);

        /* Move the tail back so it directly follows the new run area. */
        memmove(next + new_run_size - run_size, next + dsize, tail);

        attr->size = cpu_to_le32(asize + new_run_size - run_size);
        attr->nres.evcn = cpu_to_le64(svcn + plen - 1);
        rec->used = cpu_to_le32(used + new_run_size - run_size);
        mi->dirty = true;

        return 0;
}





























































































































































































    3 


























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCATTERLIST_H
#define _LINUX_SCATTERLIST_H

#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <asm/io.h>

/*
 * One scatter/gather entry.  Do not assign page_link directly; use
 * sg_set_page()/sg_assign_page(), because its low bits carry list flags.
 */
struct scatterlist {
        unsigned long        page_link;        /* page pointer, low bits hold SG_CHAIN/SG_END */
        unsigned int        offset;                /* offset into the page */
        unsigned int        length;                /* length of data */
        dma_addr_t        dma_address;        /* read with sg_dma_address() */
#ifdef CONFIG_NEED_SG_DMA_LENGTH
        unsigned int        dma_length;        /* read with sg_dma_len() */
#endif
#ifdef CONFIG_NEED_SG_DMA_FLAGS
        unsigned int    dma_flags;        /* SG_DMA_* bits */
#endif
};

/*
 * These macros should be used after a dma_map_sg call has been done
 * to get bus addresses of each of the SG entries and their lengths.
 * You should only work with the number of sg entries dma_map_sg
 * returns, or alternatively stop on the first sg_dma_len(sg) which
 * is 0.
 */
#define sg_dma_address(sg)        ((sg)->dma_address)

#ifdef CONFIG_NEED_SG_DMA_LENGTH
#define sg_dma_len(sg)                ((sg)->dma_length)
#else
#define sg_dma_len(sg)                ((sg)->length)
#endif

/*
 * Header describing a scatterlist.  for_each_sgtable_sg() walks orig_nents
 * entries, for_each_sgtable_dma_sg() walks nents entries.
 */
struct sg_table {
        struct scatterlist *sgl;        /* the list */
        unsigned int nents;                /* number of mapped entries */
        unsigned int orig_nents;        /* original size of list */
};

/*
 * An sg_table plus the bookkeeping kept by
 * sg_alloc_append_table_from_pages(); released with sg_free_append_table().
 */
struct sg_append_table {
        struct sg_table sgt;                /* The scatter list table */
        struct scatterlist *prv;        /* last populated sge in the table */
        unsigned int total_nents;        /* Total entries in the table */
};

/*
 * Notes on SG table design.
 *
 * We use the unsigned long page_link field in the scatterlist struct to place
 * the page pointer AND encode information about the sg table as well. The two
 * lower bits are reserved for this information.
 *
 * If bit 0 is set, then the page_link contains a pointer to the next sg
 * table list. Otherwise the next entry is at sg + 1.
 *
 * If bit 1 is set, then this sg entry is the last element in a list.
 *
 * See sg_next().
 *
 */

#define SG_CHAIN        0x01UL
#define SG_END                0x02UL

/*
 * We overload the LSB of the page pointer to indicate whether it's
 * a valid sg entry, or whether it points to the start of a new scatterlist.
 * Those low bits are there for everyone! (thanks mason :-)
 */
#define SG_PAGE_LINK_MASK (SG_CHAIN | SG_END)

/* Return only the flag bits (SG_CHAIN / SG_END) kept in page_link. */
static inline unsigned int __sg_flags(struct scatterlist *sg)
{
        unsigned long link = sg->page_link;

        return link & SG_PAGE_LINK_MASK;
}

/* Return page_link, with the flag bits cleared, as a scatterlist pointer. */
static inline struct scatterlist *sg_chain_ptr(struct scatterlist *sg)
{
        unsigned long addr = sg->page_link & ~SG_PAGE_LINK_MASK;

        return (struct scatterlist *)addr;
}

/* True if the SG_CHAIN bit is set, i.e. the entry links to another list. */
static inline bool sg_is_chain(struct scatterlist *sg)
{
        return (__sg_flags(sg) & SG_CHAIN) != 0;
}

/* True if the SG_END bit is set, i.e. the entry terminates its list. */
static inline bool sg_is_last(struct scatterlist *sg)
{
        return (__sg_flags(sg) & SG_END) != 0;
}

/**
 * sg_assign_page - Assign a given page to an SG entry
 * @sg:                    SG entry
 * @page:            The page
 *
 * Description:
 *   Store @page in the entry while keeping the SG_CHAIN/SG_END bits that
 *   are already set in it. sg_set_page() is the more commonly used
 *   variant.
 *
 **/
static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
{
        unsigned long flags = sg->page_link & SG_PAGE_LINK_MASK;
        unsigned long addr = (unsigned long)page;

        /*
         * The two low bits are stolen for flags, so the page pointer
         * must be aligned to at least a 32-bit boundary.
         */
        BUG_ON(addr & SG_PAGE_LINK_MASK);
#ifdef CONFIG_DEBUG_SG
        BUG_ON(sg_is_chain(sg));
#endif
        sg->page_link = addr | flags;
}

/**
 * sg_set_page - Set sg entry to point at given page
 * @sg:                 SG entry
 * @page:         The page
 * @len:         Length of data
 * @offset:         Offset into page
 *
 * Description:
 *   Always use this helper instead of writing the page pointer yourself:
 *   the lower bits of the stored pointer carry sg table information.
 *   Use sg_page() to read the page of an sg entry back.
 *
 **/
static inline void sg_set_page(struct scatterlist *sg, struct page *page,
                               unsigned int len, unsigned int offset)
{
        sg_assign_page(sg, page);
        sg->length = len;
        sg->offset = offset;
}

/**
 * sg_set_folio - Set sg entry to point at given folio
 * @sg:                 SG entry
 * @folio:         The folio
 * @len:         Length of data
 * @offset:         Offset into folio
 *
 * Description:
 *   Always use this helper instead of writing the folio pointer yourself:
 *   the lower bits of the stored pointer carry sg table information.
 *   Use sg_page() to read the page of an sg entry back.
 *   A one-time warning is emitted if @len or @offset exceeds UINT_MAX,
 *   the range of the unsigned int fields they are stored in.
 *
 **/
static inline void sg_set_folio(struct scatterlist *sg, struct folio *folio,
                               size_t len, size_t offset)
{
        WARN_ON_ONCE(len > UINT_MAX);
        WARN_ON_ONCE(offset > UINT_MAX);
        sg_assign_page(sg, &folio->page);
        sg->length = len;
        sg->offset = offset;
}

/* Return the page stored in @sg, i.e. page_link without the flag bits. */
static inline struct page *sg_page(struct scatterlist *sg)
{
        unsigned long addr;

#ifdef CONFIG_DEBUG_SG
        BUG_ON(sg_is_chain(sg));
#endif
        addr = sg->page_link & ~SG_PAGE_LINK_MASK;
        return (struct page *)addr;
}

/**
 * sg_set_buf - Set sg entry to point at given data
 * @sg:                 SG entry
 * @buf:         Data
 * @buflen:         Data length
 *
 * Description:
 *   Looks up the page of @buf with virt_to_page() and stores it together
 *   with the in-page offset of @buf and @buflen via sg_set_page().
 *
 **/
static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
                              unsigned int buflen)
{
        struct page *page;

#ifdef CONFIG_DEBUG_SG
        BUG_ON(!virt_addr_valid(buf));
#endif
        page = virt_to_page(buf);
        sg_set_page(sg, page, buflen, offset_in_page(buf));
}

/*
 * Loop over each sg element, following the pointer to a new list if necessary
 */
#define for_each_sg(sglist, sg, nr, __i)        \
        for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))

/*
 * Loop over each sg element in the given sg_table object.
 */
#define for_each_sgtable_sg(sgt, sg, i)                \
        for_each_sg((sgt)->sgl, sg, (sgt)->orig_nents, i)

/*
 * Loop over each sg element in the given *DMA mapped* sg_table object.
 * Please use sg_dma_address(sg) and sg_dma_len(sg) to extract DMA addresses
 * of each element.
 */
#define for_each_sgtable_dma_sg(sgt, sg, i)        \
        for_each_sg((sgt)->sgl, sg, (sgt)->nents, i)

/* Turn @chain_sg into a chain entry that points at the list @sgl. */
static inline void __sg_chain(struct scatterlist *chain_sg,
                              struct scatterlist *sgl)
{
        unsigned long link = (unsigned long)sgl;

        /*
         * Flag the value as a link pointer and make sure the termination
         * bit is not carried along.
         */
        link |= SG_CHAIN;
        link &= ~SG_END;

        /* A chain entry has no use for offset and length. */
        chain_sg->length = 0;
        chain_sg->offset = 0;

        chain_sg->page_link = link;
}

/**
 * sg_chain - Chain two sglists together
 * @prv:        First scatterlist
 * @prv_nents:        Number of entries in prv
 * @sgl:        Second scatterlist
 *
 * Description:
 *   Links @prv and @sgl together, to form a longer scatterlist. The last
 *   entry of @prv (index @prv_nents - 1) is turned into the chain entry.
 *
 **/
static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
                            struct scatterlist *sgl)
{
        struct scatterlist *link_entry = &prv[prv_nents - 1];

        __sg_chain(link_entry, sgl);
}

/**
 * sg_mark_end - Mark the end of the scatterlist
 * @sg:                 SG entry
 *
 * Description:
 *   Marks the passed in sg entry as the termination point for the sg
 *   table. A call to sg_next() on this entry will return NULL.
 *
 **/
static inline void sg_mark_end(struct scatterlist *sg)
{
        unsigned long link = sg->page_link;

        /* Terminating entries are never chain entries. */
        link |= SG_END;
        link &= ~SG_CHAIN;
        sg->page_link = link;
}

/**
 * sg_unmark_end - Undo setting the end of the scatterlist
 * @sg:                 SG entry
 *
 * Description:
 *   Clears the termination marker (SG_END) of the given scatterlist entry.
 *
 **/
static inline void sg_unmark_end(struct scatterlist *sg)
{
        sg->page_link = sg->page_link & ~SG_END;
}

/*
 * On 64-bit architectures there is a 4-byte padding in struct scatterlist
 * (assuming also CONFIG_NEED_SG_DMA_LENGTH is set). Use this padding for DMA
 * flags bits to indicate when a specific dma address is a bus address or the
 * buffer may have been bounced via SWIOTLB.
 */
#ifdef CONFIG_NEED_SG_DMA_FLAGS

#define SG_DMA_BUS_ADDRESS        (1 << 0)
#define SG_DMA_SWIOTLB                (1 << 1)

/**
 * sg_dma_is_bus_address - Return whether a given segment was marked
 *                           as a bus address
 * @sg:                 SG entry
 *
 * Description:
 *   Returns true if the SG_DMA_BUS_ADDRESS flag is set on this segment,
 *   which is what sg_dma_mark_bus_address() does.
 **/
static inline bool sg_dma_is_bus_address(struct scatterlist *sg)
{
        return (sg->dma_flags & SG_DMA_BUS_ADDRESS) != 0;
}

/**
 * sg_dma_mark_bus_address - Mark the scatterlist entry as a bus address
 * @sg:                 SG entry
 *
 * Description:
 *   Flags the entry so that its dma_address is known to be a bus address
 *   that needs no unmapping. Intended only for dma_map_sg()
 *   implementations, so that dma_unmap_sg() can clean such entries up
 *   properly.
 **/
static inline void sg_dma_mark_bus_address(struct scatterlist *sg)
{
        sg->dma_flags = sg->dma_flags | SG_DMA_BUS_ADDRESS;
}

/**
 * sg_dma_unmark_bus_address - Unmark the scatterlist entry as a bus address
 * @sg:                 SG entry
 *
 * Description:
 *   Clears the bus address mark (SG_DMA_BUS_ADDRESS) of the entry.
 **/
static inline void sg_dma_unmark_bus_address(struct scatterlist *sg)
{
        sg->dma_flags = sg->dma_flags & ~SG_DMA_BUS_ADDRESS;
}

/**
 * sg_dma_is_swiotlb - Return whether the scatterlist was marked for SWIOTLB
 *                        bouncing
 * @sg:                SG entry
 *
 * Description:
 *   Returns true if the SG_DMA_SWIOTLB flag is set. The mark does not mean
 *   that every element was bounced, so callers still have to check the
 *   individual SG entries with is_swiotlb_buffer().
 */
static inline bool sg_dma_is_swiotlb(struct scatterlist *sg)
{
        return (sg->dma_flags & SG_DMA_SWIOTLB) != 0;
}

/**
 * sg_dma_mark_swiotlb - Mark the scatterlist for SWIOTLB bouncing
 * @sg:                SG entry
 *
 * Description:
 *   Marks a scatterlist for SWIOTLB bounce. Not all SG entries may be
 *   bounced.
 */
static inline void sg_dma_mark_swiotlb(struct scatterlist *sg)
{
        sg->dma_flags = sg->dma_flags | SG_DMA_SWIOTLB;
}

#else

/*
 * Stubs for builds without CONFIG_NEED_SG_DMA_FLAGS: struct scatterlist then
 * has no dma_flags member, so the queries always report false and the
 * mark/unmark helpers do nothing.
 */
static inline bool sg_dma_is_bus_address(struct scatterlist *sg)
{
        return false;
}
static inline void sg_dma_mark_bus_address(struct scatterlist *sg)
{
}
static inline void sg_dma_unmark_bus_address(struct scatterlist *sg)
{
}
static inline bool sg_dma_is_swiotlb(struct scatterlist *sg)
{
        return false;
}
static inline void sg_dma_mark_swiotlb(struct scatterlist *sg)
{
}

#endif        /* CONFIG_NEED_SG_DMA_FLAGS */

/**
 * sg_phys - Return physical address of an sg entry
 * @sg:             SG entry
 *
 * Description:
 *   Returns page_to_phys() of the page held by this sg entry plus the sg
 *   offset. It is up to the caller to know that page_to_phys() may
 *   legally be called on the sg page.
 *
 **/
static inline dma_addr_t sg_phys(struct scatterlist *sg)
{
        struct page *page = sg_page(sg);

        return page_to_phys(page) + sg->offset;
}

/**
 * sg_virt - Return virtual address of an sg entry
 * @sg:      SG entry
 *
 * Description:
 *   Returns page_address() of the page held by this sg entry plus the sg
 *   offset. It is up to the caller to know that the sg page has a valid
 *   virtual mapping.
 *
 **/
static inline void *sg_virt(struct scatterlist *sg)
{
        struct page *page = sg_page(sg);

        return page_address(page) + sg->offset;
}

/**
 * sg_init_marker - Initialize markers in sg table
 * @sgl:           The SG table
 * @nents:           Number of entries in table
 *
 * Description:
 *   Marks the entry at index @nents - 1 as the end of the table.
 *
 **/
static inline void sg_init_marker(struct scatterlist *sgl,
                                  unsigned int nents)
{
        struct scatterlist *last = &sgl[nents - 1];

        sg_mark_end(last);
}

int sg_nents(struct scatterlist *sg);
int sg_nents_for_len(struct scatterlist *sg, u64 len);
struct scatterlist *sg_next(struct scatterlist *);
struct scatterlist *sg_last(struct scatterlist *s, unsigned int);
void sg_init_table(struct scatterlist *, unsigned int);
void sg_init_one(struct scatterlist *, const void *, unsigned int);
int sg_split(struct scatterlist *in, const int in_mapped_nents,
             const off_t skip, const int nb_splits,
             const size_t *split_sizes,
             struct scatterlist **out, int *out_mapped_nents,
             gfp_t gfp_mask);

typedef struct scatterlist *(sg_alloc_fn)(unsigned int, gfp_t);
typedef void (sg_free_fn)(struct scatterlist *, unsigned int);

void __sg_free_table(struct sg_table *, unsigned int, unsigned int,
                     sg_free_fn *, unsigned int);
void sg_free_table(struct sg_table *);
void sg_free_append_table(struct sg_append_table *sgt);
int __sg_alloc_table(struct sg_table *, unsigned int, unsigned int,
                     struct scatterlist *, unsigned int, gfp_t, sg_alloc_fn *);
int sg_alloc_table(struct sg_table *, unsigned int, gfp_t);
int sg_alloc_append_table_from_pages(struct sg_append_table *sgt,
                                     struct page **pages, unsigned int n_pages,
                                     unsigned int offset, unsigned long size,
                                     unsigned int max_segment,
                                     unsigned int left_pages, gfp_t gfp_mask);
int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages,
                                      unsigned int n_pages, unsigned int offset,
                                      unsigned long size,
                                      unsigned int max_segment, gfp_t gfp_mask);

/**
 * sg_alloc_table_from_pages - Allocate and initialize an sg table from
 *                               an array of pages
 * @sgt:         The sg table header to use
 * @pages:         Pointer to an array of page pointers
 * @n_pages:         Number of pages in the pages array
 * @offset:      Offset from start of the first page to the start of a buffer
 * @size:        Number of valid bytes in the buffer (after offset)
 * @gfp_mask:         GFP allocation mask
 *
 *  Description:
 *    Builds an sg table out of a list of pages; contiguous page ranges are
 *    squashed into a single scatterlist node. The buffer described by the
 *    page array may start at an offset and cover only @size valid bytes.
 *    This is sg_alloc_table_from_pages_segment() with a maximum segment
 *    size of UINT_MAX. Release the resulting table with sg_free_table.
 *
 * Returns:
 *   0 on success, negative error on failure
 */
static inline int sg_alloc_table_from_pages(struct sg_table *sgt,
                                            struct page **pages,
                                            unsigned int n_pages,
                                            unsigned int offset,
                                            unsigned long size, gfp_t gfp_mask)
{
        const unsigned int max_segment = UINT_MAX;

        return sg_alloc_table_from_pages_segment(sgt, pages, n_pages, offset,
                                                 size, max_segment, gfp_mask);
}

#ifdef CONFIG_SGL_ALLOC
struct scatterlist *sgl_alloc_order(unsigned long long length,
                                    unsigned int order, bool chainable,
                                    gfp_t gfp, unsigned int *nent_p);
struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp,
                              unsigned int *nent_p);
void sgl_free_n_order(struct scatterlist *sgl, int nents, int order);
void sgl_free_order(struct scatterlist *sgl, int order);
void sgl_free(struct scatterlist *sgl);
#endif /* CONFIG_SGL_ALLOC */

size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
                      size_t buflen, off_t skip, bool to_buffer);

size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
                           const void *buf, size_t buflen);
size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents,
                         void *buf, size_t buflen);

size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents,
                            const void *buf, size_t buflen, off_t skip);
size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents,
                          void *buf, size_t buflen, off_t skip);
size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents,
                       size_t buflen, off_t skip);

/*
 * Maximum number of entries that will be allocated in one piece, if
 * a list larger than this is required then chaining will be utilized.
 */
#define SG_MAX_SINGLE_ALLOC                (PAGE_SIZE / sizeof(struct scatterlist))

/*
 * The maximum number of SG segments that we will put inside a
 * scatterlist (unless chaining is used). Should ideally fit inside a
 * single page, to avoid a higher order allocation.  We could define this
 * to SG_MAX_SINGLE_ALLOC to pack correctly at the highest order.  The
 * minimum value is 32
 */
#define SG_CHUNK_SIZE        128

/*
 * Like SG_CHUNK_SIZE, but for archs that have sg chaining. This limit
 * is totally arbitrary, a setting of 2048 will get you at least 8mb ios.
 */
#ifdef CONFIG_ARCH_NO_SG_CHAIN
#define SG_MAX_SEGMENTS        SG_CHUNK_SIZE
#else
#define SG_MAX_SEGMENTS        2048
#endif

#ifdef CONFIG_SG_POOL
void sg_free_table_chained(struct sg_table *table,
                           unsigned nents_first_chunk);
int sg_alloc_table_chained(struct sg_table *table, int nents,
                           struct scatterlist *first_chunk,
                           unsigned nents_first_chunk);
#endif

/*
 * sg page iterator
 *
 * Iterates over sg entries page-by-page.  On each successful iteration, you
 * can call sg_page_iter_page(@piter) to get the current page.
 * @piter->sg will point to the sg holding this page and @piter->sg_pgoffset to
 * the page's page offset within the sg. The iteration will stop either when a
 * maximum number of sg entries was reached or a terminating sg
 * (sg_is_last(sg) == true) was reached.
 *
 * Set up with __sg_page_iter_start() and advanced with __sg_page_iter_next(),
 * normally through the for_each_sg_page() macro.
 */
struct sg_page_iter {
        struct scatterlist        *sg;                /* sg holding the page */
        unsigned int                sg_pgoffset;        /* page offset within the sg */

        /* these are internal states, keep away */
        unsigned int                __nents;        /* remaining sg entries */
        int                        __pg_advance;        /* nr pages to advance at the
                                                 * next step */
};

/*
 * sg page iterator for DMA addresses
 *
 * This is the same as sg_page_iter however you can call
 * sg_page_iter_dma_address(@dma_iter) to get the page's DMA
 * address. sg_page_iter_page() cannot be called on this iterator.
 */
struct sg_dma_page_iter {
        struct sg_page_iter base;
};

bool __sg_page_iter_next(struct sg_page_iter *piter);
bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter);
void __sg_page_iter_start(struct sg_page_iter *piter,
                          struct scatterlist *sglist, unsigned int nents,
                          unsigned long pgoffset);
/**
 * sg_page_iter_page - get the current page held by the page iterator
 * @piter:        page iterator holding the page
 *
 * Returns the page that lies @piter->sg_pgoffset pages after the first page
 * of @piter->sg.
 */
static inline struct page *sg_page_iter_page(struct sg_page_iter *piter)
{
        struct page *first = sg_page(piter->sg);

        return nth_page(first, piter->sg_pgoffset);
}

/**
 * sg_page_iter_dma_address - get the dma address of the current page held by
 * the page iterator.
 * @dma_iter:        page iterator holding the page
 */
static inline dma_addr_t
sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter)
{
        return sg_dma_address(dma_iter->base.sg) +
               (dma_iter->base.sg_pgoffset << PAGE_SHIFT);
}

/**
 * for_each_sg_page - iterate over the pages of the given sg list
 * @sglist:        sglist to iterate over
 * @piter:        page iterator to hold current page, sg, sg_pgoffset
 * @nents:        maximum number of sg entries to iterate over
 * @pgoffset:        starting page offset (in pages)
 *
 * Callers may use sg_page_iter_page() to get each page pointer.
 * In each loop it operates on PAGE_SIZE unit.
 */
#define for_each_sg_page(sglist, piter, nents, pgoffset)                   \
        for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \
             __sg_page_iter_next(piter);)

/**
 * for_each_sg_dma_page - iterate over the pages of the given sg list
 * @sglist:        sglist to iterate over
 * @dma_iter:        DMA page iterator to hold current page
 * @dma_nents:        maximum number of sg entries to iterate over, this is the value
 *              returned from dma_map_sg
 * @pgoffset:        starting page offset (in pages)
 *
 * Callers may use sg_page_iter_dma_address() to get each page's DMA address.
 * In each loop it operates on PAGE_SIZE unit.
 */
#define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset)            \
        for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents,        \
                                  pgoffset);                                   \
             __sg_page_iter_dma_next(dma_iter);)

/**
 * for_each_sgtable_page - iterate over all pages in the sg_table object
 * @sgt:        sg_table object to iterate over
 * @piter:        page iterator to hold current page
 * @pgoffset:        starting page offset (in pages)
 *
 * Iterates over all the memory pages in the buffer described by
 * a scatterlist stored in the given sg_table object.
 * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit.
 */
#define for_each_sgtable_page(sgt, piter, pgoffset)        \
        for_each_sg_page((sgt)->sgl, piter, (sgt)->orig_nents, pgoffset)

/**
 * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object
 * @sgt:        sg_table object to iterate over
 * @dma_iter:        DMA page iterator to hold current page
 * @pgoffset:        starting page offset (in pages)
 *
 * Iterates over all the DMA mapped pages in the buffer described by
 * a scatterlist stored in the given sg_table object.
 * See also for_each_sg_dma_page(). In each loop it operates on PAGE_SIZE
 * unit.
 */
#define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset)        \
        for_each_sg_dma_page((sgt)->sgl, dma_iter, (sgt)->nents, pgoffset)


/*
 * Mapping sg iterator
 *
 * Iterates over sg entries mapping page-by-page.  On each successful
 * iteration, @miter->page points to the mapped page and
 * @miter->length bytes of data can be accessed at @miter->addr.  As
 * long as an iteration is enclosed between start and stop, the user
 * is free to choose control structure and when to stop.
 *
 * @miter->consumed is set to @miter->length on each iteration.  It
 * can be adjusted if the user can't consume all the bytes in one go.
 * Also, a stopped iteration can be resumed by calling next on it.
 * This is useful when iteration needs to release all resources and
 * continue later (e.g. at the next interrupt).
 */

#define SG_MITER_ATOMIC                (1 << 0)         /* use kmap_atomic */
#define SG_MITER_TO_SG                (1 << 1)        /* flush back to phys on unmap */
#define SG_MITER_FROM_SG        (1 << 2)        /* nop */

struct sg_mapping_iter {
        /* the following three fields can be accessed directly */
        struct page                *page;                /* currently mapped page */
        void                        *addr;                /* pointer to the mapped area */
        size_t                        length;                /* length of the mapped area */
        size_t                        consumed;        /* number of consumed bytes */
        struct sg_page_iter        piter;                /* page iterator */

        /* these are internal states, keep away */
        unsigned int                __offset;        /* offset within page */
        unsigned int                __remaining;        /* remaining bytes on page */
        unsigned int                __flags;
};

void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl,
                    unsigned int nents, unsigned int flags);
bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset);
bool sg_miter_next(struct sg_mapping_iter *miter);
void sg_miter_stop(struct sg_mapping_iter *miter);

#endif /* _LINUX_SCATTERLIST_H */




























































































































































































































































































































































































































    1 



    1 





































































    1 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
// SPDX-License-Identifier: GPL-2.0
/*
 * Contains the core associated with submission side polling of the SQ
 * ring, offloading submissions from the application to a kernel thread.
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "napi.h"
#include "sqpoll.h"

/*
 * Upper bound on the number of SQEs submitted for one ring per pass of
 * __io_sq_thread() when the caller asks for capping (cap_entries).
 */
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
/* max_entries budget handed to io_sq_tw() on each main-loop iteration */
#define IORING_TW_CAP_ENTRIES_VALUE        8

/* Bit numbers within io_sq_data->state, used with set_bit()/test_bit() */
enum {
        /* set by io_sq_thread_stop() */
        IO_SQ_THREAD_SHOULD_STOP = 0,
        /* set by io_sq_thread_park(), re-evaluated by io_sq_thread_unpark() */
        IO_SQ_THREAD_SHOULD_PARK,
};

/*
 * Drop one park request on @sqd and release sqd->lock.
 *
 * Counterpart of io_sq_thread_park(). Warns once if called from the SQ poll
 * thread itself.
 */
void io_sq_thread_unpark(struct io_sq_data *sqd)
        __releases(&sqd->lock)
{
        WARN_ON_ONCE(sqd->thread == current);

        /*
         * Clear the bit unconditionally and set it again if other park
         * requests are still outstanding. A conditional clear_bit() would
         * race with other threads incrementing park_pending and setting the
         * bit.
         */
        clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        if (atomic_dec_return(&sqd->park_pending))
                set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        mutex_unlock(&sqd->lock);
}

/*
 * Ask the SQ poll thread of @sqd to park and acquire sqd->lock.
 *
 * Returns with sqd->lock held; the caller releases it with
 * io_sq_thread_unpark(). Warns once if called from the SQ poll thread itself.
 */
void io_sq_thread_park(struct io_sq_data *sqd)
        __acquires(&sqd->lock)
{
        WARN_ON_ONCE(sqd->thread == current);

        /* publish the request before blocking on the lock the thread holds */
        atomic_inc(&sqd->park_pending);
        set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        mutex_lock(&sqd->lock);
        /* sqd->thread is NULL once the poll thread has exited */
        if (sqd->thread)
                wake_up_process(sqd->thread);
}

/*
 * Request the SQ poll thread of @sqd to stop and wait for sqd->exited to be
 * completed.
 *
 * Warns once if called from the poll thread itself or if a stop has already
 * been requested.
 */
void io_sq_thread_stop(struct io_sq_data *sqd)
{
        WARN_ON_ONCE(sqd->thread == current);
        WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));

        set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
        /* sqd->thread is only inspected with sqd->lock held */
        mutex_lock(&sqd->lock);
        if (sqd->thread)
                wake_up_process(sqd->thread);
        mutex_unlock(&sqd->lock);
        wait_for_completion(&sqd->exited);
}

/*
 * Drop a reference on @sqd. When the last reference goes away the poll
 * thread is stopped and @sqd is freed.
 */
void io_put_sq_data(struct io_sq_data *sqd)
{
        if (!refcount_dec_and_test(&sqd->refs))
                return;

        /* no park request may be outstanding when the last user leaves */
        WARN_ON_ONCE(atomic_read(&sqd->park_pending));

        io_sq_thread_stop(sqd);
        kfree(sqd);
}

static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
{
        struct io_ring_ctx *ctx;
        unsigned sq_thread_idle = 0;

        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
        sqd->sq_thread_idle = sq_thread_idle;
}

/*
 * Detach @ctx from its io_sq_data, if it has one: unlink the ring while the
 * poll thread is parked, refresh the shared idle time, then drop the ring's
 * reference and clear ctx->sq_data.
 */
void io_sq_thread_finish(struct io_ring_ctx *ctx)
{
        struct io_sq_data *sqd = ctx->sq_data;

        /* ring never had SQ offload set up */
        if (!sqd)
                return;

        io_sq_thread_park(sqd);
        list_del_init(&ctx->sqd_list);
        io_sqd_update_thread_idle(sqd);
        io_sq_thread_unpark(sqd);

        io_put_sq_data(sqd);
        ctx->sq_data = NULL;
}

/*
 * Look up the io_sq_data of the ring referred to by p->wq_fd and take a
 * reference on it.
 *
 * Returns the sqd on success, or an ERR_PTR():
 *   -ENXIO  p->wq_fd is not an open file
 *   -EINVAL the file is not an io_uring file, or that ring has no sq_data
 *   -EPERM  the sqd belongs to a different thread group than current
 */
static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
{
        struct io_ring_ctx *target;
        struct io_sq_data *sqd;
        struct fd f;

        f = fdget(p->wq_fd);
        if (!f.file)
                return ERR_PTR(-ENXIO);

        if (!io_is_uring_fops(f.file)) {
                sqd = ERR_PTR(-EINVAL);
                goto out_put;
        }

        target = f.file->private_data;
        sqd = target->sq_data;
        if (!sqd)
                sqd = ERR_PTR(-EINVAL);
        else if (sqd->task_tgid != current->tgid)
                sqd = ERR_PTR(-EPERM);
        else
                refcount_inc(&sqd->refs);

out_put:
        fdput(f);
        return sqd;
}

/*
 * Obtain the io_sq_data for a new SQPOLL ring.
 *
 * With IORING_SETUP_ATTACH_WQ an existing sqd is looked up through
 * io_attach_sq_data(); *attached is set to true when that succeeds. An
 * -EPERM failure from the attach falls back to allocating a fresh sqd, any
 * other attach error is returned as is. Without the flag a fresh sqd is
 * always allocated.
 *
 * Returns the sqd or an ERR_PTR() (-ENOMEM if the allocation fails).
 */
static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
                                         bool *attached)
{
        struct io_sq_data *sqd;

        *attached = false;
        if (p->flags & IORING_SETUP_ATTACH_WQ) {
                struct io_sq_data *shared = io_attach_sq_data(p);

                /* only -EPERM falls through to setting up a new sqd/task */
                if (!IS_ERR(shared) || PTR_ERR(shared) != -EPERM) {
                        *attached = !IS_ERR(shared);
                        return shared;
                }
        }

        sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
        if (!sqd)
                return ERR_PTR(-ENOMEM);

        refcount_set(&sqd->refs, 1);
        atomic_set(&sqd->park_pending, 0);
        mutex_init(&sqd->lock);
        INIT_LIST_HEAD(&sqd->ctx_list);
        init_waitqueue_head(&sqd->wait);
        init_completion(&sqd->exited);
        return sqd;
}

/* True if any bit (stop or park request) is currently set in sqd->state. */
static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
{
        return READ_ONCE(sqd->state) != 0;
}

/*
 * Run one submission pass for @ctx on behalf of the SQ poll thread.
 * @ctx: ring to service
 * @cap_entries: limit the batch to IORING_SQPOLL_CAP_ENTRIES_VALUE SQEs
 *
 * Does nothing if the ring has no pending SQEs and an empty iopoll list.
 * Otherwise, under ctx->sq_creds and ctx->uring_lock, it reaps iopoll
 * completions and submits the pending SQEs, then optionally busy-polls NAPI
 * and wakes tasks waiting for SQ ring space.
 *
 * Returns the io_submit_sqes() result (0 if nothing was submitted) plus the
 * io_napi_sqpoll_busy_poll() result when io_napi() is true for the ring.
 */
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
        const struct cred *saved_creds = NULL;
        unsigned int nr_sqes;
        int ret = 0;

        nr_sqes = io_sqring_entries(ctx);
        /* several rings share the thread: bound each ring's batch for fairness */
        if (cap_entries && nr_sqes > IORING_SQPOLL_CAP_ENTRIES_VALUE)
                nr_sqes = IORING_SQPOLL_CAP_ENTRIES_VALUE;

        /* nothing to reap and nothing to submit */
        if (wq_list_empty(&ctx->iopoll_list) && !nr_sqes)
                return ret;

        if (ctx->sq_creds != current_cred())
                saved_creds = override_creds(ctx->sq_creds);

        mutex_lock(&ctx->uring_lock);
        if (!wq_list_empty(&ctx->iopoll_list))
                io_do_iopoll(ctx, true);

        /*
         * Don't submit if refs are dying, good for io_uring_register(),
         * but also it is relied upon by io_ring_exit_work()
         */
        if (nr_sqes && likely(!percpu_ref_is_dying(&ctx->refs)) &&
            !(ctx->flags & IORING_SETUP_R_DISABLED))
                ret = io_submit_sqes(ctx, nr_sqes);
        mutex_unlock(&ctx->uring_lock);

        if (io_napi(ctx))
                ret += io_napi_sqpoll_busy_poll(ctx);

        /* SQEs were consumed: let anyone waiting for SQ ring space retry */
        if (nr_sqes && wq_has_sleeper(&ctx->sqo_sq_wait))
                wake_up(&ctx->sqo_sq_wait);
        if (saved_creds)
                revert_creds(saved_creds);

        return ret;
}

/*
 * React to a pending park request, stop request or signal on the SQ poll
 * thread.
 *
 * Called from io_sq_thread() with sqd->lock held. If a park was requested or
 * a signal is pending, the lock is dropped, a pending signal is fetched with
 * get_signal(), the thread calls cond_resched(), and the lock is taken again.
 *
 * Returns true if the thread should leave its main loop, i.e. get_signal()
 * returned non-zero or IO_SQ_THREAD_SHOULD_STOP is set.
 */
static bool io_sqd_handle_event(struct io_sq_data *sqd)
{
        bool did_sig = false;
        struct ksignal ksig;

        if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
            signal_pending(current)) {
                /* release the lock so a thread in io_sq_thread_park() can take it */
                mutex_unlock(&sqd->lock);
                if (signal_pending(current))
                        did_sig = get_signal(&ksig);
                cond_resched();
                mutex_lock(&sqd->lock);
                /* we may have been rescheduled; record the CPU we run on now */
                sqd->sq_cpu = raw_smp_processor_id();
        }
        return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
}

/*
 * Run task_work, processing the retry_list first. The retry_list holds
 * entries that we passed on in the previous run, if we had more task_work
 * than we were asked to process. Newly queued task_work isn't run until the
 * retry list has been fully processed.
 */
static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries)
{
        struct io_uring_task *tctx = current->io_uring;
        unsigned int count = 0;

        if (*retry_list) {
                *retry_list = io_handle_tw_list(*retry_list, &count, max_entries);
                if (count >= max_entries)
                        goto out;
                max_entries -= count;
        }
        *retry_list = tctx_task_work_run(tctx, max_entries, &count);
out:
        if (task_work_pending(current))
                task_work_run();
        return count;
}

static bool io_sq_tw_pending(struct llist_node *retry_list)
{
        struct io_uring_task *tctx = current->io_uring;

        return retry_list || !llist_empty(&tctx->task_list);
}

static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start)
{
        struct rusage end;

        getrusage(current, RUSAGE_SELF, &end);
        end.ru_stime.tv_sec -= start->ru_stime.tv_sec;
        end.ru_stime.tv_usec -= start->ru_stime.tv_usec;

        sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000;
}

/*
 * Main function of the SQ poll kernel thread.
 * @data: the io_sq_data this thread serves
 *
 * With sqd->lock held, each loop iteration submits pending SQEs for every
 * ring on sqd->ctx_list and runs a bounded amount of task_work. The thread
 * keeps spinning while work was found or the idle timeout has not expired.
 * Otherwise it sets IORING_SQ_NEED_WAKEUP on every ring, rechecks for new
 * work and sleeps on sqd->wait with the lock dropped.
 *
 * The loop ends when io_sqd_handle_event() returns true. The thread then
 * drains the retry list, calls io_uring_cancel_generic(), clears
 * sqd->thread, flags all rings as needing a wakeup, completes sqd->exited
 * and exits. It does not return.
 */
static int io_sq_thread(void *data)
{
        struct llist_node *retry_list = NULL;
        struct io_sq_data *sqd = data;
        struct io_ring_ctx *ctx;
        struct rusage start;
        unsigned long timeout = 0;
        char buf[TASK_COMM_LEN];
        DEFINE_WAIT(wait);

        /*
         * Offload context creation failed, just exit. Clear sqd->thread
         * under the lock first: io_sq_thread_park()/io_sq_thread_stop()
         * wake sqd->thread while holding sqd->lock, and must not be left
         * with a pointer to a task that has already exited.
         */
        if (!current->io_uring) {
                mutex_lock(&sqd->lock);
                sqd->thread = NULL;
                mutex_unlock(&sqd->lock);
                goto err_out;
        }

        snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
        set_task_comm(current, buf);

        /* reset to our pid after we've set task_comm, for fdinfo */
        sqd->task_pid = current->pid;

        if (sqd->sq_cpu != -1) {
                set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
        } else {
                set_cpus_allowed_ptr(current, cpu_online_mask);
                sqd->sq_cpu = raw_smp_processor_id();
        }

        /*
         * Force audit context to get setup, in case we do prep side async
         * operations that would trigger an audit call before any issue side
         * audit has been done.
         */
        audit_uring_entry(IORING_OP_NOP);
        audit_uring_exit(true, 0);

        mutex_lock(&sqd->lock);
        while (1) {
                bool cap_entries, sqt_spin = false;

                if (io_sqd_events_pending(sqd) || signal_pending(current)) {
                        if (io_sqd_handle_event(sqd))
                                break;
                        timeout = jiffies + sqd->sq_thread_idle;
                }

                /* cap per-ring submissions unless exactly one ring is attached */
                cap_entries = !list_is_singular(&sqd->ctx_list);
                getrusage(current, RUSAGE_SELF, &start);
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
                        int ret = __io_sq_thread(ctx, cap_entries);

                        if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
                                sqt_spin = true;
                }
                if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
                        sqt_spin = true;

                /* keep polling while busy or still inside the idle window */
                if (sqt_spin || !time_after(jiffies, timeout)) {
                        if (sqt_spin) {
                                io_sq_update_worktime(sqd, &start);
                                timeout = jiffies + sqd->sq_thread_idle;
                        }
                        if (unlikely(need_resched())) {
                                mutex_unlock(&sqd->lock);
                                cond_resched();
                                mutex_lock(&sqd->lock);
                                sqd->sq_cpu = raw_smp_processor_id();
                        }
                        continue;
                }

                prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
                if (!io_sqd_events_pending(sqd) && !io_sq_tw_pending(retry_list)) {
                        bool needs_sched = true;

                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
                                atomic_or(IORING_SQ_NEED_WAKEUP,
                                                &ctx->rings->sq_flags);
                                if ((ctx->flags & IORING_SETUP_IOPOLL) &&
                                    !wq_list_empty(&ctx->iopoll_list)) {
                                        needs_sched = false;
                                        break;
                                }

                                /*
                                 * Ensure the store of the wakeup flag is not
                                 * reordered with the load of the SQ tail
                                 */
                                smp_mb__after_atomic();

                                if (io_sqring_entries(ctx)) {
                                        needs_sched = false;
                                        break;
                                }
                        }

                        if (needs_sched) {
                                mutex_unlock(&sqd->lock);
                                schedule();
                                mutex_lock(&sqd->lock);
                                sqd->sq_cpu = raw_smp_processor_id();
                        }
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                atomic_andnot(IORING_SQ_NEED_WAKEUP,
                                                &ctx->rings->sq_flags);
                }

                finish_wait(&sqd->wait, &wait);
                timeout = jiffies + sqd->sq_thread_idle;
        }

        /* run whatever task_work was deferred by the per-iteration cap */
        if (retry_list)
                io_sq_tw(&retry_list, UINT_MAX);

        io_uring_cancel_generic(true, sqd);
        sqd->thread = NULL;
        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
        io_run_task_work();
        mutex_unlock(&sqd->lock);
err_out:
        complete(&sqd->exited);
        do_exit(0);
}

/*
 * Sleep interruptibly on ctx->sqo_sq_wait until the SQ ring of @ctx is no
 * longer full or a signal is pending for the caller.
 */
void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
{
        DEFINE_WAIT(wait);

        while (io_sqring_full(ctx)) {
                prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);

                /* recheck now that we are queued on the waitqueue */
                if (!io_sqring_full(ctx))
                        break;
                schedule();

                if (signal_pending(current))
                        break;
        }

        finish_wait(&ctx->sqo_sq_wait, &wait);
}

/*
 * Set up submission-queue offload for @ctx as requested by @p.
 *
 * For an IORING_SETUP_SQPOLL ring this obtains an io_sq_data (possibly an
 * existing one, see io_get_sq_data()), links @ctx onto it while the poll
 * thread is parked and, unless it attached to an existing sqd, creates the
 * poll thread. Without SQPOLL, IORING_SETUP_SQ_AFF is rejected.
 *
 * Returns 0 on success or a negative errno. Error paths that jump to the
 * err label run io_sq_thread_finish() on @ctx.
 */
__cold int io_sq_offload_create(struct io_ring_ctx *ctx,
                                struct io_uring_params *p)
{
        int ret;

        /* Retain compatibility with failing for an invalid attach attempt */
        if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
                                IORING_SETUP_ATTACH_WQ) {
                struct fd f;

                /* ATTACH_WQ without SQPOLL: only validate that wq_fd is an io_uring fd */
                f = fdget(p->wq_fd);
                if (!f.file)
                        return -ENXIO;
                if (!io_is_uring_fops(f.file)) {
                        fdput(f);
                        return -EINVAL;
                }
                fdput(f);
        }
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                struct task_struct *tsk;
                struct io_sq_data *sqd;
                bool attached;

                ret = security_uring_sqpoll();
                if (ret)
                        return ret;

                sqd = io_get_sq_data(p, &attached);
                if (IS_ERR(sqd)) {
                        ret = PTR_ERR(sqd);
                        goto err;
                }

                ctx->sq_creds = get_current_cred();
                ctx->sq_data = sqd;
                /* an idle time of 0 from userspace selects the default of HZ jiffies */
                ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
                if (!ctx->sq_thread_idle)
                        ctx->sq_thread_idle = HZ;

                io_sq_thread_park(sqd);
                list_add(&ctx->sqd_list, &sqd->ctx_list);
                io_sqd_update_thread_idle(sqd);
                /* don't attach to a dying SQPOLL thread, would be racy */
                ret = (attached && !sqd->thread) ? -ENXIO : 0;
                io_sq_thread_unpark(sqd);

                if (ret < 0)
                        goto err;
                /* an attached ring reuses the existing poll thread */
                if (attached)
                        return 0;

                if (p->flags & IORING_SETUP_SQ_AFF) {
                        int cpu = p->sq_thread_cpu;

                        ret = -EINVAL;
                        if (cpu >= nr_cpu_ids || !cpu_online(cpu))
                                goto err_sqpoll;
                        sqd->sq_cpu = cpu;
                } else {
                        /* -1 tells io_sq_thread() not to pin to a single CPU */
                        sqd->sq_cpu = -1;
                }

                sqd->task_pid = current->pid;
                sqd->task_tgid = current->tgid;
                tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
                if (IS_ERR(tsk)) {
                        ret = PTR_ERR(tsk);
                        goto err_sqpoll;
                }

                sqd->thread = tsk;
                ret = io_uring_alloc_task_context(tsk, ctx);
                /*
                 * The thread is woken even if the allocation failed;
                 * io_sq_thread() exits early when current->io_uring is NULL.
                 */
                wake_up_new_task(tsk);
                if (ret)
                        goto err;
        } else if (p->flags & IORING_SETUP_SQ_AFF) {
                /* Can't have SQ_AFF without SQPOLL */
                ret = -EINVAL;
                goto err;
        }

        return 0;
err_sqpoll:
        /*
         * No thread will ever complete ->exited on these paths, so do it
         * here; io_sq_thread_stop() waits on it.
         */
        complete(&ctx->sq_data->exited);
err:
        io_sq_thread_finish(ctx);
        return ret;
}

/*
 * Apply @mask through io_wq_cpu_affinity() to the io_uring task context of
 * the SQ poll thread serving @ctx.
 *
 * Returns -EINVAL if @ctx has no sq_data or the poll thread is already
 * gone, otherwise the result of io_wq_cpu_affinity().
 */
__cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
                                     cpumask_var_t mask)
{
        struct io_sq_data *sqd = ctx->sq_data;
        int ret = -EINVAL;

        if (!sqd)
                return ret;

        io_sq_thread_park(sqd);
        /* Don't set affinity for a dying thread */
        if (sqd->thread)
                ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
        io_sq_thread_unpark(sqd);

        return ret;
}

















    4 



    3 

    4 





















































    4 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/bitops.h>
#include <linux/string.h>
#include <asm/unaligned.h>
#include <crypto/chacha.h>

/* One add-xor-rotate step: x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], s). */
static inline void chacha_arx(u32 *x, int a, int b, int d, int s)
{
        x[a] += x[b];
        x[d] = rol32(x[d] ^ x[a], s);
}

/*
 * Apply @nrounds ChaCha rounds to the 16-word state @x in place. Each loop
 * iteration performs one double round: four steps over the columns followed
 * by four steps over the diagonals. Warns once if @nrounds is neither 20
 * nor 12.
 */
static void chacha_permute(u32 *x, int nrounds)
{
        int round;

        /* whitelist the allowed round counts */
        WARN_ON_ONCE(nrounds != 20 && nrounds != 12);

        for (round = 0; round < nrounds; round += 2) {
                /* column round */
                chacha_arx(x, 0, 4, 12, 16);
                chacha_arx(x, 1, 5, 13, 16);
                chacha_arx(x, 2, 6, 14, 16);
                chacha_arx(x, 3, 7, 15, 16);

                chacha_arx(x, 8, 12, 4, 12);
                chacha_arx(x, 9, 13, 5, 12);
                chacha_arx(x, 10, 14, 6, 12);
                chacha_arx(x, 11, 15, 7, 12);

                chacha_arx(x, 0, 4, 12, 8);
                chacha_arx(x, 1, 5, 13, 8);
                chacha_arx(x, 2, 6, 14, 8);
                chacha_arx(x, 3, 7, 15, 8);

                chacha_arx(x, 8, 12, 4, 7);
                chacha_arx(x, 9, 13, 5, 7);
                chacha_arx(x, 10, 14, 6, 7);
                chacha_arx(x, 11, 15, 7, 7);

                /* diagonal round */
                chacha_arx(x, 0, 5, 15, 16);
                chacha_arx(x, 1, 6, 12, 16);
                chacha_arx(x, 2, 7, 13, 16);
                chacha_arx(x, 3, 4, 14, 16);

                chacha_arx(x, 10, 15, 5, 12);
                chacha_arx(x, 11, 12, 6, 12);
                chacha_arx(x, 8, 13, 7, 12);
                chacha_arx(x, 9, 14, 4, 12);

                chacha_arx(x, 0, 5, 15, 8);
                chacha_arx(x, 1, 6, 12, 8);
                chacha_arx(x, 2, 7, 13, 8);
                chacha_arx(x, 3, 4, 14, 8);

                chacha_arx(x, 10, 15, 5, 7);
                chacha_arx(x, 11, 12, 6, 7);
                chacha_arx(x, 8, 13, 7, 7);
                chacha_arx(x, 9, 14, 4, 7);
        }
}

/**
 * chacha_block_generic - generate one keystream block and increment block counter
 * @state: input state matrix (16 32-bit words)
 * @stream: output keystream block (64 bytes)
 * @nrounds: number of rounds (20 or 12; 20 is recommended)
 *
 * ChaCha core: permutes a copy of @state, adds the original state back in
 * word by word and stores the sums to @stream as little-endian 32-bit
 * values.  The caller has already converted the endianness of the input.
 * The block counter, word 12 of @state, is incremented afterwards.
 */
void chacha_block_generic(u32 *state, u8 *stream, int nrounds)
{
        u32 x[16];
        u8 *out = stream;
        int i;

        memcpy(x, state, sizeof(x));
        chacha_permute(x, nrounds);

        /* feed-forward: permuted word plus original word, little-endian */
        for (i = 0; i < ARRAY_SIZE(x); i++, out += sizeof(u32))
                put_unaligned_le32(x[i] + state[i], out);

        state[12] += 1;
}
EXPORT_SYMBOL(chacha_block_generic);

/**
 * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
 * @state: input state matrix (16 32-bit words)
 * @stream: output (8 32-bit words)
 * @nrounds: number of rounds (20 or 12; 20 is recommended)
 *
 * HChaCha is the ChaCha counterpart of HSalsa and serves as an intermediate
 * step towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).
 * Unlike the full block function it does not add the initial state back in,
 * and it outputs only words 0-3 and 12-15 of the permuted state.  It should
 * not be used for streaming directly.
 */
void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds)
{
        u32 x[16];

        memcpy(x, state, sizeof(x));
        chacha_permute(x, nrounds);

        /* first and last rows of the permuted matrix, four words each */
        memcpy(stream, x, 4 * sizeof(u32));
        memcpy(stream + 4, x + 12, 4 * sizeof(u32));
}
EXPORT_SYMBOL(hchacha_block_generic);

























































































































































































   13 













    3 


    6 

    2 




    5 












    2 


















   11 

   12 






   11 















   12 













    9 

   13 



























    2 
    2 
    2 














    5 


    7 

    5 




    5 












    2 
    5 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Percpu refcounts:
 * (C) 2012 Google, Inc.
 * Author: Kent Overstreet <koverstreet@google.com>
 *
 * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
 * atomic_dec_and_test() - but percpu.
 *
 * There's one important difference between percpu refs and normal atomic_t
 * refcounts; you have to keep track of your initial refcount, and then when you
 * start shutting down you call percpu_ref_kill() _before_ dropping the initial
 * refcount.
 *
 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
 * than an atomic_t - this is because of the way shutdown works, see
 * percpu_ref_kill()/PERCPU_COUNT_BIAS.
 *
 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
 * puts the ref back in single atomic_t mode, collecting the per cpu refs and
 * issuing the appropriate barriers, and then marks the ref as shutting down so
 * that percpu_ref_put() will check for the ref hitting 0.  After it returns,
 * it's safe to drop the initial ref.
 *
 * USAGE:
 *
 * See fs/aio.c for some example usage; it's used there for struct kioctx, which
 * is created when userspace calls io_setup(), and destroyed when userspace
 * calls io_destroy() or the process exits.
 *
 * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
 * removes the kioctx from the process's table of kioctxs and kills percpu_ref.
 * After that, there can't be any new users of the kioctx (from lookup_ioctx())
 * and it's then safe to drop the initial ref with percpu_ref_put().
 *
 * Note that the free path, free_ioctx(), needs to go through explicit call_rcu()
 * to synchronize with RCU protected lookup_ioctx().  percpu_ref operations don't
 * imply RCU grace periods of any kind and if a user wants to combine percpu_ref
 * with RCU protection, it must be done explicitly.
 *
 * Code that does a two stage shutdown like this often needs some kind of
 * explicit synchronization to ensure the initial refcount can only be dropped
 * once - percpu_ref_kill() does this for you, it returns true once and false if
 * someone else already called it. The aio code uses it this way, but it's not
 * necessary if the code has some other mechanism to synchronize teardown.
 */

#ifndef _LINUX_PERCPU_REFCOUNT_H
#define _LINUX_PERCPU_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/types.h>
#include <linux/gfp.h>

struct percpu_ref;
typedef void (percpu_ref_func_t)(struct percpu_ref *);

/* flags set in the lower bits of percpu_ref->percpu_count_ptr */
enum {
        __PERCPU_REF_ATOMIC        = 1LU << 0,        /* operating in atomic mode */
        __PERCPU_REF_DEAD        = 1LU << 1,        /* (being) killed */
        /* mask of both flags; __ref_is_percpu() tests them together */
        __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD,

        /* number of low bits of percpu_count_ptr occupied by the flags above */
        __PERCPU_REF_FLAG_BITS        = 2,
};

/* @flags for percpu_ref_init(); the values are distinct bits and may be OR'ed */
enum {
        /*
         * Start w/ ref == 1 in atomic mode.  Can be switched to percpu
         * operation using percpu_ref_switch_to_percpu().  If initialized
         * with this flag, the ref will stay in atomic mode until
         * percpu_ref_switch_to_percpu() is invoked on it.
         * Implies ALLOW_REINIT.
         */
        PERCPU_REF_INIT_ATOMIC        = 1 << 0,

        /*
         * Start dead w/ ref == 0 in atomic mode.  Must be revived with
         * percpu_ref_reinit() before used.  Implies INIT_ATOMIC and
         * ALLOW_REINIT.
         */
        PERCPU_REF_INIT_DEAD        = 1 << 1,

        /*
         * Allow switching from atomic mode to percpu mode.
         */
        PERCPU_REF_ALLOW_REINIT        = 1 << 2,
};

/*
 * Fields of a percpu_ref that are not needed on the percpu fast path; reached
 * through percpu_ref->data.
 */
struct percpu_ref_data {
        /* reference count used by the get helpers when not in percpu mode */
        atomic_long_t                count;
        /* NOTE(review): presumably called when the count drops to zero - confirm in lib/percpu-refcount.c */
        percpu_ref_func_t        *release;
        /* NOTE(review): presumably the pending confirm callback of a mode switch - confirm */
        percpu_ref_func_t        *confirm_switch;
        bool                        force_atomic:1;
        bool                        allow_reinit:1;
        struct rcu_head                rcu;
        /* back-pointer to the percpu_ref this data belongs to */
        struct percpu_ref        *ref;
};

struct percpu_ref {
        /*
         * Pointer to the percpu counter with the __PERCPU_REF_ATOMIC and
         * __PERCPU_REF_DEAD flags stored in its low bits.  While neither
         * flag is set the ref is in percpu mode and get/put operate on the
         * percpu counter; once a flag is set they manipulate the atomic
         * count in @data instead.
         */
        unsigned long                percpu_count_ptr;

        /*
         * 'percpu_ref' is often embedded into user structure, and only
         * 'percpu_count_ptr' is required in fast path, move other fields
         * into 'percpu_ref_data', so we can reduce memory footprint in
         * fast path.
         */
        struct percpu_ref_data  *data;
};

int __must_check percpu_ref_init(struct percpu_ref *ref,
                                 percpu_ref_func_t *release, unsigned int flags,
                                 gfp_t gfp);
void percpu_ref_exit(struct percpu_ref *ref);
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_switch);
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_kill);
void percpu_ref_resurrect(struct percpu_ref *ref);
void percpu_ref_reinit(struct percpu_ref *ref);
bool percpu_ref_is_zero(struct percpu_ref *ref);

/**
 * percpu_ref_kill - drop the initial ref
 * @ref: percpu_ref to kill
 *
 * Must be used to drop the initial ref on a percpu refcount; must be called
 * precisely once before shutdown.
 *
 * Switches @ref into atomic mode before gathering up the percpu counters
 * and dropping the initial ref.
 *
 * There are no implied RCU grace periods between kill and release.
 */
static inline void percpu_ref_kill(struct percpu_ref *ref)
{
        percpu_ref_kill_and_confirm(ref, NULL);
}

/*
 * Internal helper.  Don't use outside percpu-refcount proper.
 *
 * Returns true and stores the percpu counter pointer in @percpu_countp when
 * @ref is in percpu mode; returns false (leaving @percpu_countp untouched)
 * otherwise.  The pointer is handed back through an out parameter rather
 * than as the return value because testing a returned pointer for NULL would
 * force the compiler to generate two conditional branches, as it can't
 * assume that @ref->percpu_count is not NULL.
 */
static inline bool __ref_is_percpu(struct percpu_ref *ref,
                                          unsigned long __percpu **percpu_countp)
{
        /*
         * @ref->percpu_count_ptr is first tested for !__PERCPU_REF_ATOMIC,
         * which may be set asynchronously, and then used as a pointer.  A
         * second fetch by the compiler for the pointer use could observe
         * __PERCPU_REF_ATOMIC set in between and yield a contaminated
         * pointer value, hence the single READ_ONCE().
         *
         * The dependency ordering from the READ_ONCE() pairs
         * with smp_store_release() in __percpu_ref_switch_to_percpu().
         */
        unsigned long snapshot = READ_ONCE(ref->percpu_count_ptr);

        /*
         * Testing just ATOMIC would be enough in theory, but DEAD may be
         * visible without ATOMIC if we race with percpu_ref_kill() and would
         * then have to be masked off separately.  DEAD implies ATOMIC
         * anyway, so test them together.
         */
        if (likely(!(snapshot & __PERCPU_REF_ATOMIC_DEAD))) {
                *percpu_countp = (unsigned long __percpu *)snapshot;
                return true;
        }

        return false;
}

/**
 * percpu_ref_get_many - increment a percpu refcount
 * @ref: percpu_ref to get
 * @nr: number of references to get
 *
 * Analogous to atomic_long_add().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
{
        unsigned long __percpu *percpu_count;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count))
                this_cpu_add(*percpu_count, nr);
        else
                atomic_long_add(nr, &ref->data->count);

        rcu_read_unlock();
}

/**
 * percpu_ref_get - increment a percpu refcount
 * @ref: percpu_ref to get
 *
 * Takes a single reference, analogous to atomic_long_inc().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_get(struct percpu_ref *ref)
{
        const unsigned long one_ref = 1;

        percpu_ref_get_many(ref, one_ref);
}

/**
 * percpu_ref_tryget_many - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 * @nr: number of references to get
 *
 * Increment a percpu refcount  by @nr unless its count already reached zero.
 * Returns %true on success; %false on failure.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
                                          unsigned long nr)
{
        unsigned long __percpu *percpu_count;
        bool ret;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count)) {
                this_cpu_add(*percpu_count, nr);
                ret = true;
        } else {
                ret = atomic_long_add_unless(&ref->data->count, nr, 0);
        }

        rcu_read_unlock();

        return ret;
}

/**
 * percpu_ref_tryget - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless its count already reached zero.
 * Returns %true on success; %false on failure.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
        /* Single-reference form of percpu_ref_tryget_many(). */
        return percpu_ref_tryget_many(ref, 1);
}

/**
 * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the
 * caller is responsible for taking RCU.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count;
        bool ret = false;

        WARN_ON_ONCE(!rcu_read_lock_held());

        if (likely(__ref_is_percpu(ref, &percpu_count))) {
                this_cpu_inc(*percpu_count);
                ret = true;
        } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
                ret = atomic_long_inc_not_zero(&ref->data->count);
        }
        return ret;
}

/**
 * percpu_ref_tryget_live - try to increment a live percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless it has already been killed.  Returns
 * %true on success; %false on failure.
 *
 * Completion of percpu_ref_kill() in itself doesn't guarantee that this
 * function will fail.  For such guarantee, percpu_ref_kill_and_confirm()
 * should be used.  After the confirm_kill callback is invoked, it's
 * guaranteed that no new reference will be given out by
 * percpu_ref_tryget_live().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
{
        bool acquired;

        /* Supply the RCU read-side section that the _rcu variant checks for. */
        rcu_read_lock();
        acquired = percpu_ref_tryget_live_rcu(ref);
        rcu_read_unlock();

        return acquired;
}

/**
 * percpu_ref_put_many - decrement a percpu refcount
 * @ref: percpu_ref to put
 * @nr: number of references to put
 *
 * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init())
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
{
        unsigned long __percpu *percpu_count;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count))
                this_cpu_sub(*percpu_count, nr);
        else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count)))
                ref->data->release(ref);

        rcu_read_unlock();
}

/**
 * percpu_ref_put - decrement a percpu refcount
 * @ref: percpu_ref to put
 *
 * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init())
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_put(struct percpu_ref *ref)
{
        /* Single-reference form of percpu_ref_put_many(). */
        percpu_ref_put_many(ref, 1);
}

/**
 * percpu_ref_is_dying - test whether a percpu refcount is dying or dead
 * @ref: percpu_ref to test
 *
 * Returns %true if @ref is dying or dead.
 *
 * This function is safe to call as long as @ref is between init and exit
 * and the caller is responsible for synchronizing against state changes.
 */
static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
{
        /* Plain (unordered) read of the flag word; see the locking note above. */
        unsigned long flags_and_ptr = ref->percpu_count_ptr;

        return (flags_and_ptr & __PERCPU_REF_DEAD) != 0;
}

#endif





























































































































   37 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_X86_XSAVE_H
#define __ASM_X86_XSAVE_H

#include <linux/uaccess.h>
#include <linux/types.h>

#include <asm/processor.h>
#include <asm/fpu/api.h>
#include <asm/user.h>

/* Bit 63 of XCR0 is reserved for future expansion */
#define XFEATURE_MASK_EXTEND        (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))

#define XSTATE_CPUID                0x0000000d

#define TILE_CPUID                0x0000001d

/* Size in bytes of the FXSAVE area; the XSAVE header is placed at this offset. */
#define FXSAVE_SIZE        512

/* XSAVE header: its size and its offset (directly after the FXSAVE area). */
#define XSAVE_HDR_SIZE            64
#define XSAVE_HDR_OFFSET    FXSAVE_SIZE

/* YMM area: its size and its offset (directly after the XSAVE header). */
#define XSAVE_YMM_SIZE            256
#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)

/* NOTE(review): presumably the required byte alignment of an XSAVE buffer - confirm. */
#define XSAVE_ALIGNMENT     64

/* All currently supported user features */
#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \
                                      XFEATURE_MASK_SSE | \
                                      XFEATURE_MASK_YMM | \
                                      XFEATURE_MASK_OPMASK | \
                                      XFEATURE_MASK_ZMM_Hi256 | \
                                      XFEATURE_MASK_Hi16_ZMM         | \
                                      XFEATURE_MASK_PKRU | \
                                      XFEATURE_MASK_BNDREGS | \
                                      XFEATURE_MASK_BNDCSR | \
                                      XFEATURE_MASK_XTILE)

/*
 * Features which are restored when returning to user space.
 * PKRU is not restored on return to user space because PKRU
 * is switched eagerly in switch_to() and flush_thread()
 */
#define XFEATURE_MASK_USER_RESTORE        \
        (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU)

/* Features which are dynamically enabled for a process on request */
#define XFEATURE_MASK_USER_DYNAMIC        XFEATURE_MASK_XTILE_DATA

/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
                                            XFEATURE_MASK_CET_USER)

/*
 * A supervisor state component may not always contain valuable information,
 * and its size may be huge. Saving/restoring such supervisor state components
 * at each context switch can cause high CPU and space overhead, which should
 * be avoided. Such supervisor state components should only be saved/restored
 * on demand. The on-demand supervisor features are set in this mask.
 *
 * Unlike the existing supported supervisor features, an independent supervisor
 * feature does not allocate a buffer in task->fpu, and the corresponding
 * supervisor state component cannot be saved/restored at each context switch.
 *
 * To support an independent supervisor feature, a developer should follow the
 * dos and don'ts as below:
 * - Do dynamically allocate a buffer for the supervisor state component.
 * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the
 *   state component to/from the buffer.
 * - Don't set the bit corresponding to the independent supervisor feature in
 *   IA32_XSS at run time, since it has been set at boot time.
 */
#define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR)

/*
 * Unsupported supervisor features. When a supervisor feature in this mask is
 * supported in the future, move it to the supported supervisor feature mask.
 */
#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \
                                              XFEATURE_MASK_CET_KERNEL)

/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
                                      XFEATURE_MASK_INDEPENDENT | \
                                      XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)

/*
 * The feature mask required to restore FPU state:
 * - All user states which are not eagerly switched in switch_to()/exec()
 * - The supervisor states
 */
#define XFEATURE_MASK_FPSTATE        (XFEATURE_MASK_USER_RESTORE | \
                                 XFEATURE_MASK_SUPERVISOR_SUPPORTED)

/*
 * Features in this mask have space allocated in the signal frame, but may not
 * have that space initialized when the feature is in its init state.
 */
#define XFEATURE_MASK_SIGFRAME_INITOPT        (XFEATURE_MASK_XTILE | \
                                         XFEATURE_MASK_USER_DYNAMIC)

extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];

extern void __init update_regset_xstate_info(unsigned int size,
                                             u64 xstate_mask);

int xfeature_size(int xfeature_nr);

void xsaves(struct xregs_state *xsave, u64 mask);
void xrstors(struct xregs_state *xsave, u64 mask);

int xfd_enable_feature(u64 xfd_err);

/*
 * fpu_state_size_dynamic() - test the __fpu_state_size_dynamic static key.
 *
 * The key is only declared on CONFIG_X86_64 builds; every other
 * configuration gets a stub that always returns false.  The key is
 * declared once here (a second, identical declaration in a separate
 * #ifdef section was redundant and has been folded into this one).
 */
#ifdef CONFIG_X86_64
DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);

static __always_inline __pure bool fpu_state_size_dynamic(void)
{
        /* Branch is hinted as unlikely to be taken. */
        return static_branch_unlikely(&__fpu_state_size_dynamic);
}
#else
static __always_inline __pure bool fpu_state_size_dynamic(void)
{
        return false;
}
#endif

#endif



















































































































































































































































































































































































































































































































































































































































































































































































































































    1 







































































































































































































































































































































































































































































































































    1 












    1 



    1 





    1 




















    1 


    1 



    1 






























































































































































































    1 







    1 

    1 






    1 













    1 

















    1 




    1 










    1 



    1 






















































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
 * Shaohua Li <shli@fb.com>
 */
#include <linux/module.h>

#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/init.h>
#include "null_blk.h"

#undef pr_fmt
#define pr_fmt(fmt)        "null_blk: " fmt

#define FREE_BATCH                16

#define TICKS_PER_SEC                50ULL
#define TIMER_INTERVAL                (NSEC_PER_SEC / TICKS_PER_SEC)

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
static DECLARE_FAULT_ATTR(null_timeout_attr);
static DECLARE_FAULT_ATTR(null_requeue_attr);
static DECLARE_FAULT_ATTR(null_init_hctx_attr);
#endif

static inline u64 mb_per_tick(int mbps)
{
        /*
         * Bytes contributed per tick by 1 MiB/s.  The division is done
         * first, in integer arithmetic, exactly as before.
         */
        const u64 bytes_per_tick_per_mib = (1 << 20) / TICKS_PER_SEC;

        return bytes_per_tick_per_mib * ((u64) mbps);
}

/*
 * Status flags for nullb_device.
 *
 * CONFIGURED:        Device has been configured and turned on. Cannot reconfigure.
 * UP:                Device is currently on and visible in userspace.
 * THROTTLED:        Device is being throttled.
 * CACHE:        Device is using a write-back cache.
 */
enum nullb_device_flags {
        /* Device has been configured and turned on; cannot be reconfigured. */
        NULLB_DEV_FL_CONFIGURED        = 0,
        /* Device is currently on and visible in userspace. */
        NULLB_DEV_FL_UP                = 1,
        /* Device is being throttled. */
        NULLB_DEV_FL_THROTTLED        = 2,
        /* Device is using a write-back cache. */
        NULLB_DEV_FL_CACHE        = 3,
};

#define MAP_SZ                ((PAGE_SIZE >> SECTOR_SHIFT) + 2)
/*
 * nullb_page is a page in memory for nullb devices.
 *
 * @page:        The page holding the data.
 * @bitmap:        The bitmap represents which sector in the page has data.
 *                Each bit represents one block size. For example, sector 8
 *                will use the 7th bit
 * The highest 2 bits of bitmap are for special purpose. LOCK means the cache
 * page is being flushed to storage. FREE means the cache page is freed and
 * should be skipped from flushing to storage. Please see
 * null_make_cache_space
 */
struct nullb_page {
        /* The page holding the data. */
        struct page *page;
        /*
         * MAP_SZ bits: one per sector in the page plus the two special
         * bits NULLB_PAGE_LOCK and NULLB_PAGE_FREE at the top.
         */
        DECLARE_BITMAP(bitmap, MAP_SZ);
};
#define NULLB_PAGE_LOCK (MAP_SZ - 1)
#define NULLB_PAGE_FREE (MAP_SZ - 2)

static LIST_HEAD(nullb_list);
static struct mutex lock;
static int null_major;
static DEFINE_IDA(nullb_indexes);
static struct blk_mq_tag_set tag_set;

enum {
        NULL_IRQ_NONE                = 0,
        NULL_IRQ_SOFTIRQ        = 1,
        NULL_IRQ_TIMER                = 2,
};

static bool g_virt_boundary = false;
module_param_named(virt_boundary, g_virt_boundary, bool, 0444);
MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False");

static int g_no_sched;
module_param_named(no_sched, g_no_sched, int, 0444);
MODULE_PARM_DESC(no_sched, "No io scheduler");

static int g_submit_queues = 1;
module_param_named(submit_queues, g_submit_queues, int, 0444);
MODULE_PARM_DESC(submit_queues, "Number of submission queues");

static int g_poll_queues = 1;
module_param_named(poll_queues, g_poll_queues, int, 0444);
MODULE_PARM_DESC(poll_queues, "Number of IOPOLL submission queues");

static int g_home_node = NUMA_NO_NODE;
module_param_named(home_node, g_home_node, int, 0444);
MODULE_PARM_DESC(home_node, "Home node for the device");

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
/*
 * For more details about fault injection, please refer to
 * Documentation/fault-injection/fault-injection.rst.
 */
static char g_timeout_str[80];
module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444);
MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>");

static char g_requeue_str[80];
module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444);
MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>");

static char g_init_hctx_str[80];
module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444);
MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>");
#endif

/*
 * Historic queue modes.
 *
 * These days nothing but NULL_Q_MQ is actually supported, but we keep the
 * enum for error reporting.
 */
enum {
        NULL_Q_BIO        = 0,
        NULL_Q_RQ        = 1,
        NULL_Q_MQ        = 2,
};

static int g_queue_mode = NULL_Q_MQ;

/*
 * Parse @str as a base-10 integer and store it in *@val when it lies within
 * [@min, @max].  Returns 0 on success, or -EINVAL (leaving *@val untouched)
 * when parsing fails or the value is out of range.
 */
static int null_param_store_val(const char *str, int *val, int min, int max)
{
        int parsed;

        /* Short-circuit: 'parsed' is only inspected once kstrtoint() succeeded. */
        if (kstrtoint(str, 10, &parsed) || parsed < min || parsed > max)
                return -EINVAL;

        *val = parsed;
        return 0;
}

/*
 * "set" handler for the queue_mode parameter: stores into g_queue_mode any
 * value between NULL_Q_BIO and NULL_Q_MQ inclusive.  @kp is unused.
 */
static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
{
        return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ);
}

static const struct kernel_param_ops null_queue_mode_param_ops = {
        .set        = null_set_queue_mode,
        .get        = param_get_int,
};

device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444);
MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");

static int g_gb = 250;
module_param_named(gb, g_gb, int, 0444);
MODULE_PARM_DESC(gb, "Size in GB");

static int g_bs = 512;
module_param_named(bs, g_bs, int, 0444);
MODULE_PARM_DESC(bs, "Block size (in bytes)");

static int g_max_sectors;
module_param_named(max_sectors, g_max_sectors, int, 0444);
MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)");

static unsigned int nr_devices = 1;
module_param(nr_devices, uint, 0444);
MODULE_PARM_DESC(nr_devices, "Number of devices to register");

static bool g_blocking;
module_param_named(blocking, g_blocking, bool, 0444);
MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");

static bool g_shared_tags;
module_param_named(shared_tags, g_shared_tags, bool, 0444);
MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");

static bool g_shared_tag_bitmap;
module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444);
MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq");

static int g_irqmode = NULL_IRQ_SOFTIRQ;

/*
 * "set" handler for the irqmode parameter: stores into g_irqmode any value
 * between NULL_IRQ_NONE and NULL_IRQ_TIMER inclusive.  @kp is unused.
 */
static int null_set_irqmode(const char *str, const struct kernel_param *kp)
{
        return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE,
                                        NULL_IRQ_TIMER);
}

static const struct kernel_param_ops null_irqmode_param_ops = {
        .set        = null_set_irqmode,
        .get        = param_get_int,
};

device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444);
MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");

static unsigned long g_completion_nsec = 10000;
module_param_named(completion_nsec, g_completion_nsec, ulong, 0444);
MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");

static int g_hw_queue_depth = 64;
module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444);
MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");

static bool g_use_per_node_hctx;
module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");

static bool g_memory_backed;
module_param_named(memory_backed, g_memory_backed, bool, 0444);
MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false");

static bool g_discard;
module_param_named(discard, g_discard, bool, 0444);
MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false");

/*
 * "cache_size" module parameter.  The description must be registered under
 * the parameter's own name; it was previously attached to "mbps".
 */
static unsigned long g_cache_size;
module_param_named(cache_size, g_cache_size, ulong, 0444);
MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. Default: 0 (none)");

/*
 * "fua" module parameter.  The description must be registered under the
 * parameter's own name; it was previously attached to "zoned".
 */
static bool g_fua = true;
module_param_named(fua, g_fua, bool, 0444);
MODULE_PARM_DESC(fua, "Enable/disable FUA support when cache_size is used. Default: true");

static unsigned int g_mbps;
module_param_named(mbps, g_mbps, uint, 0444);
MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");

static bool g_zoned;
module_param_named(zoned, g_zoned, bool, S_IRUGO);
MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false");

static unsigned long g_zone_size = 256;
module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");

static unsigned long g_zone_capacity;
module_param_named(zone_capacity, g_zone_capacity, ulong, 0444);
MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size");

static unsigned int g_zone_nr_conv;
module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");

static unsigned int g_zone_max_open;
module_param_named(zone_max_open, g_zone_max_open, uint, 0444);
MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)");

static unsigned int g_zone_max_active;
module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");

static int g_zone_append_max_sectors = INT_MAX;
module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444);
MODULE_PARM_DESC(zone_append_max_sectors,
                 "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation");

static struct nullb_device *null_alloc_dev(void);
static void null_free_dev(struct nullb_device *dev);
static void null_del_dev(struct nullb *nullb);
static int null_add_dev(struct nullb_device *dev);
static struct nullb *null_find_dev_by_name(const char *name);
static void null_free_device_storage(struct nullb_device *dev, bool is_cache);

/*
 * Map a configfs item to the nullb_device that embeds its config group.
 * A NULL @item yields NULL.
 */
static inline struct nullb_device *to_nullb_device(struct config_item *item)
{
        if (!item)
                return NULL;

        return container_of(to_config_group(item), struct nullb_device, group);
}

/*
 * Format @val as an unsigned decimal followed by a newline into @page
 * (bounded by PAGE_SIZE) and return snprintf()'s result.
 */
static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page)
{
        return snprintf(page, PAGE_SIZE, "%u\n", val);
}

/*
 * Format @val as an unsigned long decimal followed by a newline into @page
 * (bounded by PAGE_SIZE) and return snprintf()'s result.
 */
static inline ssize_t nullb_device_ulong_attr_show(unsigned long val,
        char *page)
{
        return snprintf(page, PAGE_SIZE, "%lu\n", val);
}

/*
 * Print a boolean attribute value with the "%u" format, followed by a
 * newline, into @page.
 */
static inline ssize_t nullb_device_bool_attr_show(bool val, char *page)
{
        ssize_t len = snprintf(page, PAGE_SIZE, "%u\n", val);

        return len;
}

/*
 * Parse @page as an unsigned int (base auto-detected by kstrtouint()).
 *
 * On success the value is stored in *@val and @count is returned. On a parse
 * error the negative code from kstrtouint() is returned and *@val is left
 * untouched.
 */
static ssize_t nullb_device_uint_attr_store(unsigned int *val,
        const char *page, size_t count)
{
        unsigned int parsed;
        int err = kstrtouint(page, 0, &parsed);

        if (err < 0)
                return err;

        *val = parsed;
        return count;
}

/*
 * Parse @page as an unsigned long (base auto-detected by kstrtoul()).
 *
 * On success the value is stored in *@val and @count is returned. On a parse
 * error the negative code from kstrtoul() is returned and *@val is left
 * untouched.
 */
static ssize_t nullb_device_ulong_attr_store(unsigned long *val,
        const char *page, size_t count)
{
        unsigned long parsed;
        int err = kstrtoul(page, 0, &parsed);

        if (err < 0)
                return err;

        *val = parsed;
        return count;
}

/*
 * Parse @page as a boolean with kstrtobool().
 *
 * On success the value is stored in *@val and @count is returned. On a parse
 * error the negative code from kstrtobool() is returned and *@val is left
 * untouched.
 */
static ssize_t nullb_device_bool_attr_store(bool *val, const char *page,
        size_t count)
{
        bool parsed;
        int err = kstrtobool(page, &parsed);

        if (err < 0)
                return err;

        *val = parsed;
        return count;
}

/*
 * NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) - define a read/write configfs
 * attribute backed by the nullb_device field NAME.
 *
 * The following macro should only be used with TYPE = {uint, ulong, bool}.
 *
 * It expands to nullb_device_<NAME>_show() and nullb_device_<NAME>_store()
 * and registers them through CONFIGFS_ATTR(). The store handler:
 *   - parses the input with nullb_device_<TYPE>_attr_store();
 *   - if APPLY is non-NULL, calls it with the new value and fails when it
 *     returns a negative value (NULLB_DEV_FL_CONFIGURED is not checked on
 *     this path);
 *   - if APPLY is NULL, fails with -EBUSY once NULLB_DEV_FL_CONFIGURED is
 *     set on the device;
 *   - otherwise assigns dev->NAME and returns @count.
 */
#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY)                                \
static ssize_t                                                                \
nullb_device_##NAME##_show(struct config_item *item, char *page)        \
{                                                                        \
        return nullb_device_##TYPE##_attr_show(                                \
                                to_nullb_device(item)->NAME, page);        \
}                                                                        \
static ssize_t                                                                \
nullb_device_##NAME##_store(struct config_item *item, const char *page,        \
                            size_t count)                                \
{                                                                        \
        int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\
        struct nullb_device *dev = to_nullb_device(item);                \
        TYPE new_value = 0;                                                \
        int ret;                                                        \
                                                                        \
        ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\
        if (ret < 0)                                                        \
                return ret;                                                \
        if (apply_fn)                                                        \
                ret = apply_fn(dev, new_value);                                \
        else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags))         \
                ret = -EBUSY;                                                \
        if (ret < 0)                                                        \
                return ret;                                                \
        dev->NAME = new_value;                                                \
        return count;                                                        \
}                                                                        \
CONFIGFS_ATTR(nullb_device_, NAME);

/*
 * Resize the blk-mq tag set of a device to @submit_queues + @poll_queues
 * hardware queues.
 *
 * Returns 0 when no nullb is attached to @dev yet or when the tag set ended
 * up with the requested number of queues, -EINVAL for an unacceptable queue
 * count, and -ENOMEM when the tag set did not reach the requested size (the
 * previous counts are then restored in @dev).
 */
static int nullb_update_nr_hw_queues(struct nullb_device *dev,
                                     unsigned int submit_queues,
                                     unsigned int poll_queues)
{
        struct blk_mq_tag_set *tag_set;
        int wanted;

        /* Nothing to resize while no nullb instance is attached. */
        if (!dev->nullb)
                return 0;

        /* At least one submit queue has to exist. */
        if (submit_queues == 0)
                return -EINVAL;

        /*
         * Bound both counts so that null_init_hctx() does not access
         * nullb->queues[] past the end of that array.
         */
        if (submit_queues > nr_cpu_ids)
                return -EINVAL;
        if (poll_queues > g_poll_queues)
                return -EINVAL;

        /*
         * Record the old and the new queue numbers in the device; the
         * null_map_queues() callback refers to both.
         */
        dev->prev_submit_queues = dev->submit_queues;
        dev->prev_poll_queues = dev->poll_queues;
        dev->submit_queues = submit_queues;
        dev->poll_queues = poll_queues;

        tag_set = dev->nullb->tag_set;
        wanted = submit_queues + poll_queues;
        blk_mq_update_nr_hw_queues(tag_set, wanted);
        if (tag_set->nr_hw_queues == wanted)
                return 0;

        /* The update did not take effect: revert the queue numbers. */
        dev->submit_queues = dev->prev_submit_queues;
        dev->poll_queues = dev->prev_poll_queues;
        return -ENOMEM;
}

/*
 * Apply callback for the "submit_queues" attribute: under the global lock,
 * resize the hardware queues to @submit_queues while keeping the device's
 * current poll queue count. Returns the nullb_update_nr_hw_queues() result.
 */
static int nullb_apply_submit_queues(struct nullb_device *dev,
                                     unsigned int submit_queues)
{
        int err;

        mutex_lock(&lock);
        err = nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues);
        mutex_unlock(&lock);
        return err;
}

/*
 * Apply callback for the "poll_queues" attribute: under the global lock,
 * resize the hardware queues to @poll_queues while keeping the device's
 * current submit queue count. Returns the nullb_update_nr_hw_queues() result.
 */
static int nullb_apply_poll_queues(struct nullb_device *dev,
                                   unsigned int poll_queues)
{
        int err;

        mutex_lock(&lock);
        err = nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues);
        mutex_unlock(&lock);
        return err;
}

/*
 * Per-device configfs attributes generated by NULLB_DEVICE_ATTR(). Entries
 * with a NULL apply callback reject writes with -EBUSY once
 * NULLB_DEV_FL_CONFIGURED is set. submit_queues and poll_queues instead route
 * every write through their apply callback.
 */
NULLB_DEVICE_ATTR(size, ulong, NULL);
NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL);
NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues);
NULLB_DEVICE_ATTR(poll_queues, uint, nullb_apply_poll_queues);
NULLB_DEVICE_ATTR(home_node, uint, NULL);
NULLB_DEVICE_ATTR(queue_mode, uint, NULL);
NULLB_DEVICE_ATTR(blocksize, uint, NULL);
NULLB_DEVICE_ATTR(max_sectors, uint, NULL);
NULLB_DEVICE_ATTR(irqmode, uint, NULL);
NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL);
NULLB_DEVICE_ATTR(index, uint, NULL);
NULLB_DEVICE_ATTR(blocking, bool, NULL);
NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL);
NULLB_DEVICE_ATTR(memory_backed, bool, NULL);
NULLB_DEVICE_ATTR(discard, bool, NULL);
NULLB_DEVICE_ATTR(mbps, uint, NULL);
NULLB_DEVICE_ATTR(cache_size, ulong, NULL);
NULLB_DEVICE_ATTR(zoned, bool, NULL);
NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL);
NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
NULLB_DEVICE_ATTR(no_sched, bool, NULL);
NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
NULLB_DEVICE_ATTR(fua, bool, NULL);

static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
        return nullb_device_bool_attr_show(to_nullb_device(item)->power, page);
}

/*
 * Store handler for the "power" attribute: brings the device up or down.
 *
 * Writing true while the device is off calls null_add_dev(). On success the
 * device is marked NULLB_DEV_FL_CONFIGURED and dev->power is set. Writing
 * false while the device is on calls null_del_dev() and clears
 * NULLB_DEV_FL_CONFIGURED. Writing the current state is a no-op.
 *
 * Returns @count on success (including the no-op cases), the parse error from
 * nullb_device_bool_attr_store(), or the error returned by null_add_dev().
 */
static ssize_t nullb_device_power_store(struct config_item *item,
                                     const char *page, size_t count)
{
        struct nullb_device *dev = to_nullb_device(item);
        bool newp = false;
        ssize_t ret;

        ret = nullb_device_bool_attr_store(&newp, page, count);
        if (ret < 0)
                return ret;

        ret = count;
        /* All state transitions happen under the global lock. */
        mutex_lock(&lock);
        if (!dev->power && newp) {
                /* NULLB_DEV_FL_UP already set: nothing to do, report success. */
                if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags))
                        goto out;

                ret = null_add_dev(dev);
                if (ret) {
                        /* Bring-up failed: undo the UP flag and propagate the error. */
                        clear_bit(NULLB_DEV_FL_UP, &dev->flags);
                        goto out;
                }

                set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
                dev->power = newp;
                ret = count;
        } else if (dev->power && !newp) {
                /* Tear down only if this call is the one clearing the UP flag. */
                if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
                        dev->power = newp;
                        null_del_dev(dev->nullb);
                }
                /* CONFIGURED is cleared whether or not the UP flag was set. */
                clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
        }

out:
        mutex_unlock(&lock);
        return ret;
}

CONFIGFS_ATTR(nullb_device_, power);

static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page)
{
        struct nullb_device *t_dev = to_nullb_device(item);

        return badblocks_show(&t_dev->badblocks, page, 0);
}

/*
 * Store handler for the "badblocks" attribute.
 *
 * Accepted input (surrounding whitespace is stripped):
 *   "+<start>-<end>"  mark the inclusive sector range [start, end] as bad
 *   "-<start>-<end>"  clear the inclusive sector range [start, end]
 * Both numbers are parsed by kstrtoull() with automatic base detection.
 *
 * Returns @count on success, -ENOMEM if the input cannot be duplicated,
 * -EINVAL (or the kstrtoull() error) for malformed input or start > end, and
 * a negative errno when updating the badblocks table fails.
 */
static ssize_t nullb_device_badblocks_store(struct config_item *item,
                                     const char *page, size_t count)
{
        struct nullb_device *t_dev = to_nullb_device(item);
        char *orig, *buf, *tmp;
        u64 start, end;
        int ret;

        /* Work on a private, NUL-terminated copy that can be edited in place. */
        orig = kstrndup(page, count, GFP_KERNEL);
        if (!orig)
                return -ENOMEM;

        buf = strstrip(orig);

        ret = -EINVAL;
        if (buf[0] != '+' && buf[0] != '-')
                goto out;
        /* Split "<start>-<end>" at the separator that follows the sign. */
        tmp = strchr(&buf[1], '-');
        if (!tmp)
                goto out;
        *tmp = '\0';
        ret = kstrtoull(buf + 1, 0, &start);
        if (ret)
                goto out;
        ret = kstrtoull(tmp + 1, 0, &end);
        if (ret)
                goto out;
        ret = -EINVAL;
        if (start > end)
                goto out;
        /* enable badblocks */
        cmpxchg(&t_dev->badblocks.shift, -1, 0);
        if (buf[0] == '+')
                ret = badblocks_set(&t_dev->badblocks, start,
                        end - start + 1, 1);
        else
                ret = badblocks_clear(&t_dev->badblocks, start,
                        end - start + 1);
        /*
         * Zero means success. A positive status from the badblocks helpers is
         * a failure as well, but handing it back unchanged would be reported
         * to user space as a short successful write, so turn it into an
         * error code. Negative values are passed through as they are.
         */
        if (ret > 0)
                ret = -ENOSPC;
        else if (ret == 0)
                ret = count;
out:
        kfree(orig);
        return ret;
}
CONFIGFS_ATTR(nullb_device_, badblocks);

/*
 * Store handler for the write-only "zone_readonly" attribute: forwards the
 * written text to zone_cond_store() with BLK_ZONE_COND_READONLY.
 */
static ssize_t nullb_device_zone_readonly_store(struct config_item *item,
                                                const char *page, size_t count)
{
        return zone_cond_store(to_nullb_device(item), page, count,
                               BLK_ZONE_COND_READONLY);
}
CONFIGFS_ATTR_WO(nullb_device_, zone_readonly);

/*
 * Store handler for the write-only "zone_offline" attribute: forwards the
 * written text to zone_cond_store() with BLK_ZONE_COND_OFFLINE.
 */
static ssize_t nullb_device_zone_offline_store(struct config_item *item,
                                               const char *page, size_t count)
{
        return zone_cond_store(to_nullb_device(item), page, count,
                               BLK_ZONE_COND_OFFLINE);
}
CONFIGFS_ATTR_WO(nullb_device_, zone_offline);

/*
 * NULL-terminated table of the per-device configfs attributes. It is
 * installed as .ct_attrs of nullb_device_type.
 */
static struct configfs_attribute *nullb_device_attrs[] = {
        &nullb_device_attr_size,
        &nullb_device_attr_completion_nsec,
        &nullb_device_attr_submit_queues,
        &nullb_device_attr_poll_queues,
        &nullb_device_attr_home_node,
        &nullb_device_attr_queue_mode,
        &nullb_device_attr_blocksize,
        &nullb_device_attr_max_sectors,
        &nullb_device_attr_irqmode,
        &nullb_device_attr_hw_queue_depth,
        &nullb_device_attr_index,
        &nullb_device_attr_blocking,
        &nullb_device_attr_use_per_node_hctx,
        &nullb_device_attr_power,
        &nullb_device_attr_memory_backed,
        &nullb_device_attr_discard,
        &nullb_device_attr_mbps,
        &nullb_device_attr_cache_size,
        &nullb_device_attr_badblocks,
        &nullb_device_attr_zoned,
        &nullb_device_attr_zone_size,
        &nullb_device_attr_zone_capacity,
        &nullb_device_attr_zone_nr_conv,
        &nullb_device_attr_zone_max_open,
        &nullb_device_attr_zone_max_active,
        &nullb_device_attr_zone_append_max_sectors,
        &nullb_device_attr_zone_readonly,
        &nullb_device_attr_zone_offline,
        &nullb_device_attr_virt_boundary,
        &nullb_device_attr_no_sched,
        &nullb_device_attr_shared_tags,
        &nullb_device_attr_shared_tag_bitmap,
        &nullb_device_attr_fua,
        NULL,
};

/*
 * configfs ->release callback for a device item: frees the pages held in the
 * device's data tree, then the device itself.
 */
static void nullb_device_release(struct config_item *item)
{
        struct nullb_device *ndev = to_nullb_device(item);

        null_free_device_storage(ndev, false);
        null_free_dev(ndev);
}

/* Item operations for a device directory: only ->release is provided. */
static struct configfs_item_operations nullb_device_ops = {
        .release        = nullb_device_release,
};

/*
 * configfs type of a device directory: ties together the item operations
 * and the per-device attribute table. It is handed to
 * config_group_init_type_name() in nullb_group_make_group().
 */
static const struct config_item_type nullb_device_type = {
        .ct_item_ops        = &nullb_device_ops,
        .ct_attrs        = nullb_device_attrs,
        .ct_owner        = THIS_MODULE,
};

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION

/*
 * Initialise the device's three fault-injection configs ("timeout_inject",
 * "requeue_inject" and "init_hctx_fault_inject") and add their groups as
 * default groups of the device's configfs group.
 */
static void nullb_add_fault_config(struct nullb_device *dev)
{
        fault_config_init(&dev->timeout_config, "timeout_inject");
        fault_config_init(&dev->requeue_config, "requeue_inject");
        fault_config_init(&dev->init_hctx_fault_config, "init_hctx_fault_inject");

        configfs_add_default_group(&dev->timeout_config.group, &dev->group);
        configfs_add_default_group(&dev->requeue_config.group, &dev->group);
        configfs_add_default_group(&dev->init_hctx_fault_config.group, &dev->group);
}

#else

/* Fault injection is compiled out: there is nothing to register. */
static void nullb_add_fault_config(struct nullb_device *dev)
{
}

#endif

static struct
config_group *nullb_group_make_group(struct config_group *group, const char *name)
{
        struct nullb_device *dev;

        if (null_find_dev_by_name(name))
                return ERR_PTR(-EEXIST);

        dev = null_alloc_dev();
        if (!dev)
                return ERR_PTR(-ENOMEM);

        config_group_init_type_name(&dev->group, name, &nullb_device_type);
        nullb_add_fault_config(dev);

        return &dev->group;
}

/*
 * configfs ->drop_item callback: called when a device directory is removed.
 *
 * If the device is still marked NULLB_DEV_FL_UP it is powered down first
 * (dev->power cleared and null_del_dev() called under the global lock). The
 * item reference is dropped in every case.
 */
static void
nullb_group_drop_item(struct config_group *group, struct config_item *item)
{
        struct nullb_device *dev = to_nullb_device(item);

        /* The UP flag is tested and cleared atomically before taking the lock. */
        if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
                mutex_lock(&lock);
                dev->power = false;
                null_del_dev(dev->nullb);
                mutex_unlock(&lock);
        }

        config_item_put(item);
}

/*
 * Show handler for the read-only "features" attribute of the subsystem root:
 * prints a fixed, comma-separated list of attribute names followed by a
 * newline.
 */
static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
        ssize_t len;

        len = snprintf(page, PAGE_SIZE,
                       "badblocks,blocking,blocksize,cache_size,fua,"
                       "completion_nsec,discard,home_node,hw_queue_depth,"
                       "irqmode,max_sectors,mbps,memory_backed,no_sched,"
                       "poll_queues,power,queue_mode,shared_tag_bitmap,"
                       "shared_tags,size,submit_queues,use_per_node_hctx,"
                       "virt_boundary,zoned,zone_capacity,zone_max_active,"
                       "zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
                       "zone_size,zone_append_max_sectors\n");
        return len;
}

CONFIGFS_ATTR_RO(memb_group_, features);

/* Attributes of the subsystem root directory: only "features". */
static struct configfs_attribute *nullb_group_attrs[] = {
        &memb_group_attr_features,
        NULL,
};

/* Creating or removing a directory under the root creates or drops a device. */
static struct configfs_group_operations nullb_group_ops = {
        .make_group        = nullb_group_make_group,
        .drop_item        = nullb_group_drop_item,
};

/* configfs type of the subsystem root group. */
static const struct config_item_type nullb_group_type = {
        .ct_group_ops        = &nullb_group_ops,
        .ct_attrs        = nullb_group_attrs,
        .ct_owner        = THIS_MODULE,
};

/* The configfs subsystem itself; its root item is named "nullb". */
static struct configfs_subsystem nullb_subsys = {
        .su_group = {
                .cg_item = {
                        .ci_namebuf = "nullb",
                        .ci_type = &nullb_group_type,
                },
        },
};

static inline int null_cache_active(struct nullb *nullb)
{
        return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
}

/*
 * Allocate a zero-initialised nullb_device and seed it with the current
 * module parameter values.
 *
 * Returns the new device, or NULL when the allocation or badblocks_init()
 * fails. A successfully returned device is released with null_free_dev().
 */
static struct nullb_device *null_alloc_dev(void)
{
        struct nullb_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return NULL;

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
        /* Start each fault-injection config from its template attribute. */
        dev->timeout_config.attr = null_timeout_attr;
        dev->requeue_config.attr = null_requeue_attr;
        dev->init_hctx_fault_config.attr = null_init_hctx_attr;
#endif

        /* Both trees are initialised with GFP_ATOMIC as their gfp mask. */
        INIT_RADIX_TREE(&dev->data, GFP_ATOMIC);
        INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC);
        if (badblocks_init(&dev->badblocks, 0)) {
                kfree(dev);
                return NULL;
        }

        /* NOTE(review): presumably converts GB to MB - confirm the unit of dev->size. */
        dev->size = g_gb * 1024;
        dev->completion_nsec = g_completion_nsec;
        /* The prev_* fields start out equal to the current queue counts. */
        dev->submit_queues = g_submit_queues;
        dev->prev_submit_queues = g_submit_queues;
        dev->poll_queues = g_poll_queues;
        dev->prev_poll_queues = g_poll_queues;
        dev->home_node = g_home_node;
        dev->queue_mode = g_queue_mode;
        dev->blocksize = g_bs;
        dev->max_sectors = g_max_sectors;
        dev->irqmode = g_irqmode;
        dev->hw_queue_depth = g_hw_queue_depth;
        dev->blocking = g_blocking;
        dev->memory_backed = g_memory_backed;
        dev->discard = g_discard;
        dev->cache_size = g_cache_size;
        dev->mbps = g_mbps;
        dev->use_per_node_hctx = g_use_per_node_hctx;
        dev->zoned = g_zoned;
        dev->zone_size = g_zone_size;
        dev->zone_capacity = g_zone_capacity;
        dev->zone_nr_conv = g_zone_nr_conv;
        dev->zone_max_open = g_zone_max_open;
        dev->zone_max_active = g_zone_max_active;
        dev->zone_append_max_sectors = g_zone_append_max_sectors;
        dev->virt_boundary = g_virt_boundary;
        dev->no_sched = g_no_sched;
        dev->shared_tags = g_shared_tags;
        dev->shared_tag_bitmap = g_shared_tag_bitmap;
        dev->fua = g_fua;

        return dev;
}

/*
 * Release a device obtained from null_alloc_dev(): zoned state, badblocks
 * table and the structure itself. A NULL @dev is ignored.
 */
static void null_free_dev(struct nullb_device *dev)
{
        if (dev) {
                null_free_zoned_dev(dev);
                badblocks_exit(&dev->badblocks);
                kfree(dev);
        }
}

static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
{
        struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer);

        blk_mq_end_request(blk_mq_rq_from_pdu(cmd), cmd->error);
        return HRTIMER_NORESTART;
}

static void null_cmd_end_timer(struct nullb_cmd *cmd)
{
        ktime_t kt = cmd->nq->dev->completion_nsec;

        hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
}

static void null_complete_rq(struct request *rq)
{
        struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);

        blk_mq_end_request(rq, cmd->error);
}

/*
 * Allocate a nullb_page descriptor together with one backing page, both with
 * GFP_NOIO, and clear the descriptor's bitmap.
 *
 * Returns the descriptor, or NULL if either allocation fails.
 */
static struct nullb_page *null_alloc_page(void)
{
        struct nullb_page *np = kmalloc(sizeof(*np), GFP_NOIO);

        if (!np)
                return NULL;

        np->page = alloc_pages(GFP_NOIO, 0);
        if (np->page) {
                memset(np->bitmap, 0, sizeof(np->bitmap));
                return np;
        }

        kfree(np);
        return NULL;
}

/*
 * Mark @t_page as NULLB_PAGE_FREE and release it, unless NULLB_PAGE_LOCK is
 * set. A locked page is left allocated; null_flush_cache_page() frees it
 * after clearing the lock bit and seeing the FREE bit.
 */
static void null_free_page(struct nullb_page *t_page)
{
        __set_bit(NULLB_PAGE_FREE, t_page->bitmap);
        if (!test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) {
                __free_page(t_page->page);
                kfree(t_page);
        }
}

/*
 * True when none of the first MAP_SZ - 2 bits of the page bitmap is set.
 * NOTE(review): the two excluded bits presumably hold NULLB_PAGE_LOCK and
 * NULLB_PAGE_FREE - confirm against the nullb_page definition.
 */
static bool null_page_empty(struct nullb_page *page)
{
        const int nr_sector_bits = MAP_SZ - 2;

        return find_first_bit(page->bitmap, nr_sector_bits) == nr_sector_bits;
}

/*
 * Clear the bitmap bit of @sector in the cache tree (@is_cache) or the data
 * tree. When that leaves the page without any set sector bit, the page is
 * removed from the tree and freed; for the cache tree curr_cache is reduced
 * by PAGE_SIZE as well. A sector with no page in the tree is ignored.
 */
static void null_free_sector(struct nullb *nullb, sector_t sector,
        bool is_cache)
{
        struct radix_tree_root *tree;
        struct nullb_page *entry, *removed;
        unsigned int bit;
        u64 page_idx;

        tree = is_cache ? &nullb->dev->cache : &nullb->dev->data;
        page_idx = sector >> PAGE_SECTORS_SHIFT;
        bit = (sector & SECTOR_MASK);

        entry = radix_tree_lookup(tree, page_idx);
        if (!entry)
                return;

        __clear_bit(bit, entry->bitmap);
        if (!null_page_empty(entry))
                return;

        removed = radix_tree_delete_item(tree, page_idx, entry);
        WARN_ON(removed != entry);
        null_free_page(removed);
        if (is_cache)
                nullb->dev->curr_cache -= PAGE_SIZE;
}

/*
 * Insert @t_page at @idx into the cache tree (@is_cache) or the data tree.
 *
 * On success @t_page is returned and, for the cache tree, curr_cache grows
 * by PAGE_SIZE. If the insertion fails, @t_page is freed and whatever the
 * tree holds at @idx is returned instead (a warning is raised if that entry
 * is missing or carries a different index).
 */
static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx,
        struct nullb_page *t_page, bool is_cache)
{
        struct radix_tree_root *tree;

        tree = is_cache ? &nullb->dev->cache : &nullb->dev->data;

        if (!radix_tree_insert(tree, idx, t_page)) {
                if (is_cache)
                        nullb->dev->curr_cache += PAGE_SIZE;
                return t_page;
        }

        null_free_page(t_page);
        t_page = radix_tree_lookup(tree, idx);
        WARN_ON(!t_page || t_page->page->index != idx);

        return t_page;
}

/*
 * Remove and free every page stored in the device's cache tree (@is_cache)
 * or data tree. For the cache tree, curr_cache is reset to 0 afterwards.
 */
static void null_free_device_storage(struct nullb_device *dev, bool is_cache)
{
        unsigned long pos = 0;
        int nr_pages;
        struct nullb_page *ret, *t_pages[FREE_BATCH];
        struct radix_tree_root *root;

        root = is_cache ? &dev->cache : &dev->data;

        /* Walk the tree in batches of up to FREE_BATCH entries. */
        do {
                int i;

                nr_pages = radix_tree_gang_lookup(root,
                                (void **)t_pages, pos, FREE_BATCH);

                for (i = 0; i < nr_pages; i++) {
                        /* Track the index of the entry being removed. */
                        pos = t_pages[i]->page->index;
                        ret = radix_tree_delete_item(root, pos, t_pages[i]);
                        WARN_ON(ret != t_pages[i]);
                        null_free_page(ret);
                }

                /* Resume the next lookup just past the last index handled. */
                pos++;
        } while (nr_pages == FREE_BATCH); /* a short batch means the tree is drained */

        if (is_cache)
                dev->curr_cache = 0;
}

/*
 * Look up the page that covers @sector in the cache tree (@is_cache) or the
 * data tree.
 *
 * With @for_write set, any page present is returned. Otherwise the page is
 * returned only if the bitmap bit of @sector is set. Returns NULL when
 * nothing suitable is found.
 */
static struct nullb_page *__null_lookup_page(struct nullb *nullb,
        sector_t sector, bool for_write, bool is_cache)
{
        struct radix_tree_root *tree;
        struct nullb_page *found;
        unsigned int bit;
        u64 page_idx;

        page_idx = sector >> PAGE_SECTORS_SHIFT;
        bit = (sector & SECTOR_MASK);

        tree = is_cache ? &nullb->dev->cache : &nullb->dev->data;
        found = radix_tree_lookup(tree, page_idx);
        WARN_ON(found && found->page->index != page_idx);

        if (!found)
                return NULL;
        if (for_write || test_bit(bit, found->bitmap))
                return found;

        return NULL;
}

/*
 * Find the page for @sector: the cache tree is consulted first unless
 * @ignore_cache is set, then the data tree. See __null_lookup_page() for
 * the meaning of @for_write.
 */
static struct nullb_page *null_lookup_page(struct nullb *nullb,
        sector_t sector, bool for_write, bool ignore_cache)
{
        if (!ignore_cache) {
                struct nullb_page *cached;

                cached = __null_lookup_page(nullb, sector, for_write, true);
                if (cached)
                        return cached;
        }

        return __null_lookup_page(nullb, sector, for_write, false);
}

/*
 * Return the page that backs @sector for writing, allocating and inserting
 * one if none exists yet.
 *
 * Per the sparse annotations, nullb->lock is released and re-acquired inside
 * this function. The lock is dropped around the allocation and the radix
 * tree preload, and is held again on return.
 *
 * With @ignore_cache set the page goes into the data tree, otherwise into
 * the cache tree. Returns NULL only when allocation or preload failed and a
 * second lookup still finds no page.
 */
static struct nullb_page *null_insert_page(struct nullb *nullb,
                                           sector_t sector, bool ignore_cache)
        __releases(&nullb->lock)
        __acquires(&nullb->lock)
{
        u64 idx;
        struct nullb_page *t_page;

        /* Fast path: a page already exists. */
        t_page = null_lookup_page(nullb, sector, true, ignore_cache);
        if (t_page)
                return t_page;

        /* Drop the spinlock for the GFP_NOIO allocations below. */
        spin_unlock_irq(&nullb->lock);

        t_page = null_alloc_page();
        if (!t_page)
                goto out_lock;

        if (radix_tree_preload(GFP_NOIO))
                goto out_freepage;

        spin_lock_irq(&nullb->lock);
        idx = sector >> PAGE_SECTORS_SHIFT;
        t_page->page->index = idx;
        /*
         * If the insertion fails, null_radix_tree_insert() frees our page and
         * returns whatever the tree already holds at idx.
         */
        t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache);
        radix_tree_preload_end();

        return t_page;
out_freepage:
        null_free_page(t_page);
out_lock:
        /* Retake the lock and retry the lookup; the result may be NULL. */
        spin_lock_irq(&nullb->lock);
        return null_lookup_page(nullb, sector, true, ignore_cache);
}

/*
 * Write one cache page back into the data tree and drop it from the cache.
 *
 * @c_page has NULLB_PAGE_LOCK set when this is called from
 * null_make_cache_space(). null_insert_page() may drop and retake
 * nullb->lock, so @c_page can be marked NULLB_PAGE_FREE in the meantime. In
 * that case the deferred free is completed here and nothing is copied.
 *
 * Returns 0 on success, or -ENOMEM when no data page could be obtained.
 */
static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
{
        int i;
        unsigned int offset;
        u64 idx;
        struct nullb_page *t_page, *ret;
        void *dst, *src;

        idx = c_page->page->index;

        /* Destination page in the data tree (ignore_cache == true). */
        t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);

        __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap);
        if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) {
                /* Page was released while locked: finish the free now. */
                null_free_page(c_page);
                /* Do not leave behind an empty data page. */
                if (t_page && null_page_empty(t_page)) {
                        ret = radix_tree_delete_item(&nullb->dev->data,
                                idx, t_page);
                        null_free_page(t_page);
                }
                return 0;
        }

        if (!t_page)
                return -ENOMEM;

        src = kmap_local_page(c_page->page);
        dst = kmap_local_page(t_page->page);

        /* Copy each block whose bit is set, stepping one blocksize at a time. */
        for (i = 0; i < PAGE_SECTORS;
                        i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
                if (test_bit(i, c_page->bitmap)) {
                        offset = (i << SECTOR_SHIFT);
                        memcpy(dst + offset, src + offset,
                                nullb->dev->blocksize);
                        __set_bit(i, t_page->bitmap);
                }
        }

        /* Unmap in reverse order of mapping. */
        kunmap_local(dst);
        kunmap_local(src);

        /* The cache page is no longer needed: remove it and update accounting. */
        ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page);
        null_free_page(ret);
        nullb->dev->curr_cache -= PAGE_SIZE;

        return 0;
}

/*
 * Flush cache pages to the data tree until @n more bytes fit into the cache.
 *
 * Returns 0 at once when cache_size (scaled by 1024 * 1024) exceeds
 * curr_cache + @n, or when the cache is empty. Otherwise pages are flushed
 * in batches of up to FREE_BATCH, starting at nullb->cache_flush_pos, until
 * that condition holds or at least @n bytes have been flushed.
 *
 * Returns 0 on success or the error from null_flush_cache_page().
 * nullb->lock may be dropped and retaken while this runs.
 */
static int null_make_cache_space(struct nullb *nullb, unsigned long n)
{
        int i, err, nr_pages;
        struct nullb_page *c_pages[FREE_BATCH];
        unsigned long flushed = 0, one_round;

again:
        if ((nullb->dev->cache_size * 1024 * 1024) >
             nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0)
                return 0;

        nr_pages = radix_tree_gang_lookup(&nullb->dev->cache,
                        (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH);
        /*
         * null_flush_cache_page() could drop the lock before it is done with
         * the c_pages. To avoid a race, mark every page in the batch as
         * locked so that it cannot be freed in the meantime.
         */
        for (i = 0; i < nr_pages; i++) {
                nullb->cache_flush_pos = c_pages[i]->page->index;
                /*
                 * We found a page that is already being flushed by another
                 * thread: skip it.
                 */
                if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap))
                        c_pages[i] = NULL;
                else
                        __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap);
        }

        /* Flush the pages locked above and count them. */
        one_round = 0;
        for (i = 0; i < nr_pages; i++) {
                if (c_pages[i] == NULL)
                        continue;
                err = null_flush_cache_page(nullb, c_pages[i]);
                if (err)
                        return err;
                one_round++;
        }
        flushed += one_round << PAGE_SHIFT;

        if (n > flushed) {
                /* The lookup found nothing from this position: wrap around. */
                if (nr_pages == 0)
                        nullb->cache_flush_pos = 0;
                if (one_round == 0) {
                        /* give other threads a chance */
                        spin_unlock_irq(&nullb->lock);
                        spin_lock_irq(&nullb->lock);
                }
                goto again;
        }
        return 0;
}

/*
 * Copy @n bytes from @source (starting at byte offset @off) into the device
 * storage, beginning at @sector. Data is handled in chunks of at most one
 * device block.
 *
 * When the cache is active and the write is not FUA, the data lands in the
 * cache tree; otherwise it goes straight into the data tree. For FUA writes
 * the sector is also released from the cache tree.
 *
 * Returns 0 on success or -ENOSPC when no backing page could be obtained.
 */
static int copy_to_nullb(struct nullb *nullb, struct page *source,
        unsigned int off, sector_t sector, size_t n, bool is_fua)
{
        size_t temp, count = 0;
        unsigned int offset;
        struct nullb_page *t_page;

        while (count < n) {
                temp = min_t(size_t, nullb->dev->blocksize, n - count);

                /* Make room for one page; the result is not checked here. */
                if (null_cache_active(nullb) && !is_fua)
                        null_make_cache_space(nullb, PAGE_SIZE);

                /* Byte offset of this sector inside its backing page. */
                offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
                t_page = null_insert_page(nullb, sector,
                        !null_cache_active(nullb) || is_fua);
                if (!t_page)
                        return -ENOSPC;

                memcpy_page(t_page->page, offset, source, off + count, temp);

                /* Mark the sector as holding valid data. */
                __set_bit(sector & SECTOR_MASK, t_page->bitmap);

                /* FUA data went to the data tree: release the cached sector. */
                if (is_fua)
                        null_free_sector(nullb, sector, true);

                count += temp;
                sector += temp >> SECTOR_SHIFT;
        }
        return 0;
}

/*
 * Copy @n bytes of device storage, starting at @sector, into @dest at byte
 * offset @off. Data is handled in chunks of at most one device block.
 * Sectors without stored data read back as zeroes. The cache tree is
 * consulted only when the cache is active. Always returns 0.
 */
static int copy_from_nullb(struct nullb *nullb, struct page *dest,
        unsigned int off, sector_t sector, size_t n)
{
        size_t done, chunk;

        for (done = 0; done < n;
             done += chunk, sector += chunk >> SECTOR_SHIFT) {
                unsigned int src_off;
                struct nullb_page *src;

                chunk = min_t(size_t, nullb->dev->blocksize, n - done);

                /* Byte offset of this sector inside its backing page. */
                src_off = (sector & SECTOR_MASK) << SECTOR_SHIFT;
                src = null_lookup_page(nullb, sector, false,
                        !null_cache_active(nullb));

                if (!src) {
                        zero_user(dest, off + done, chunk);
                        continue;
                }

                memcpy_page(dest, off + done, src->page, src_off, chunk);
        }

        return 0;
}

/* Fill @len bytes of @page, starting at byte offset @off, with 0xff. */
static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
                               unsigned int len, unsigned int off)
{
        const int pattern = 0xff;

        memset_page(page, off, pattern, len);
}

/*
 * Discard @nr_sectors sectors starting at @sector: under nullb->lock, the
 * range is walked one device block at a time and each visited sector is
 * released from the data tree and, when the cache is active, from the cache
 * tree. Always returns BLK_STS_OK.
 */
blk_status_t null_handle_discard(struct nullb_device *dev,
                                 sector_t sector, sector_t nr_sectors)
{
        struct nullb *nullb = dev->nullb;
        size_t remaining = nr_sectors << SECTOR_SHIFT;
        size_t step;

        spin_lock_irq(&nullb->lock);
        for (; remaining > 0; remaining -= step) {
                step = min_t(size_t, remaining, dev->blocksize);

                null_free_sector(nullb, sector, false);
                if (null_cache_active(nullb))
                        null_free_sector(nullb, sector, true);

                sector += step >> SECTOR_SHIFT;
        }
        spin_unlock_irq(&nullb->lock);

        return BLK_STS_OK;
}

static blk_status_t null_handle_flush(struct nullb *nullb)
{
        int err;

        if (!null_cache_active(nullb))
                return 0;

        spin_lock_irq(&nullb->lock);
        while (true) {
                err = null_make_cache_space(nullb,
                        nullb->dev->cache_size * 1024 * 1024);
                if (err || nullb->dev->curr_cache == 0)
                        break;
        }

        WARN_ON(!radix_tree_empty(&nullb->dev->cache));
        spin_unlock_irq(&nullb->lock);
        return errno_to_blk_status(err);
}

/*
 * Move one segment of @len bytes between @page (at byte offset @off) and the
 * device storage at @sector.
 *
 * Writes go through copy_to_nullb(). For reads on a zoned device, only the
 * length reported by null_zone_valid_read_len() is copied from storage; the
 * remainder of the segment is filled by nullb_fill_pattern().
 *
 * Returns 0 or the error from the copy helper.
 */
static int null_transfer(struct nullb *nullb, struct page *page,
        unsigned int len, unsigned int off, bool is_write, sector_t sector,
        bool is_fua)
{
        struct nullb_device *dev = nullb->dev;
        unsigned int readable = len;
        int err = 0;

        if (is_write) {
                flush_dcache_page(page);
                return copy_to_nullb(nullb, page, off, sector, len, is_fua);
        }

        if (dev->zoned)
                readable = null_zone_valid_read_len(nullb, sector, len);

        if (readable) {
                err = copy_from_nullb(nullb, page, off, sector, readable);
                off += readable;
                len -= readable;
        }

        /* Whatever was not read from storage gets the fill pattern. */
        if (len)
                nullb_fill_pattern(nullb, page, len, off);
        flush_dcache_page(page);

        return err;
}

/*
 * Transfer the segments of the request behind @cmd to or from the device
 * storage while holding nullb->lock. The direction comes from the request
 * operation and FUA from the request flags. The last null_transfer() result
 * is converted to a blk_status_t.
 */
static blk_status_t null_handle_rq(struct nullb_cmd *cmd)
{
        struct request *rq = blk_mq_rq_from_pdu(cmd);
        struct nullb *nullb = cmd->nq->dev->nullb;
        sector_t pos = blk_rq_pos(rq);
        struct req_iterator iter;
        struct bio_vec bvec;
        int err = 0;

        spin_lock_irq(&nullb->lock);
        rq_for_each_segment(bvec, rq, iter) {
                unsigned int seg_len = bvec.bv_len;

                err = null_transfer(nullb, bvec.bv_page, seg_len,
                                    bvec.bv_offset,
                                    op_is_write(req_op(rq)), pos,
                                    rq->cmd_flags & REQ_FUA);
                if (err)
                        break;
                pos += seg_len >> SECTOR_SHIFT;
        }
        spin_unlock_irq(&nullb->lock);

        return errno_to_blk_status(err);
}

/*
 * Charge the request's byte count against the device's remaining byte
 * budget (nullb->cur_bytes).
 *
 * Returns BLK_STS_OK while the budget is not overdrawn. Otherwise the
 * hardware queues are stopped and BLK_STS_DEV_RESOURCE is returned so the
 * request gets requeued.
 */
static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
{
        struct request *rq = blk_mq_rq_from_pdu(cmd);
        struct nullb *nullb = cmd->nq->dev->nullb;
        long left;

        if (!hrtimer_active(&nullb->bw_timer))
                hrtimer_restart(&nullb->bw_timer);

        left = atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes);
        if (left >= 0)
                return BLK_STS_OK;

        blk_mq_stop_hw_queues(nullb->q);
        /* The timer may have refilled the budget meanwhile: restart then. */
        if (atomic_long_read(&nullb->cur_bytes) > 0)
                blk_mq_start_stopped_hw_queues(nullb->q, true);

        /* Have the request requeued. */
        return BLK_STS_DEV_RESOURCE;
}

/*
 * Look up [@sector, @sector + @nr_sectors) in the device's badblocks table.
 * Returns BLK_STS_IOERR when badblocks_check() reports a non-zero result,
 * BLK_STS_OK otherwise.
 */
static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd,
                                                 sector_t sector,
                                                 sector_t nr_sectors)
{
        sector_t first_bad;
        int bad_sectors;

        return badblocks_check(&cmd->nq->dev->badblocks, sector, nr_sectors,
                               &first_bad, &bad_sectors) ?
                BLK_STS_IOERR : BLK_STS_OK;
}

/*
 * Dispatch a command for a memory-backed device: discards go to
 * null_handle_discard(), every other operation to null_handle_rq().
 */
static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
                                                     enum req_op op,
                                                     sector_t sector,
                                                     sector_t nr_sectors)
{
        if (op != REQ_OP_DISCARD)
                return null_handle_rq(cmd);

        return null_handle_discard(cmd->nq->dev, sector, nr_sectors);
}

/*
 * Zero-fill every bio of a read request, but only when the device is not
 * memory backed. Any other combination is left untouched.
 */
static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
{
        struct request *rq = blk_mq_rq_from_pdu(cmd);
        struct bio *bio;

        if (cmd->nq->dev->memory_backed || req_op(rq) != REQ_OP_READ)
                return;

        __rq_for_each_bio(bio, rq)
                zero_fill_bio(bio);
}

/*
 * Finish @cmd using the completion method selected by the device's
 * irqmode: directly, through blk_mq_complete_request(), or through the
 * per-command timer.
 */
static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
{
        struct request *rq = blk_mq_rq_from_pdu(cmd);

        /*
         * Configuring null_blk requires root privileges, so leaving the
         * data buffers of read commands uninitialized is acceptable.
         * With KMSAN enabled they are zeroed anyway, so that KMSAN does
         * not complain about null_blk not initializing read buffers.
         */
        if (IS_ENABLED(CONFIG_KMSAN))
                nullb_zero_read_cmd_buffer(cmd);

        switch (cmd->nq->dev->irqmode) {
        case NULL_IRQ_NONE:
                /* Inline completion. */
                blk_mq_end_request(rq, cmd->error);
                break;
        case NULL_IRQ_TIMER:
                /* Completion driven by the command timer. */
                null_cmd_end_timer(cmd);
                break;
        case NULL_IRQ_SOFTIRQ:
                /* Softirq completion. */
                blk_mq_complete_request(rq);
                break;
        }
}

/*
 * Run the device-side handling of one command: the badblocks check (when
 * dev->badblocks.shift is not -1) followed by the memory-backed data path
 * (when the device is memory backed). Returns BLK_STS_OK when neither step
 * reports an error.
 */
blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
                              sector_t sector, unsigned int nr_sectors)
{
        struct nullb_device *dev = cmd->nq->dev;

        if (dev->badblocks.shift != -1) {
                blk_status_t sts = null_handle_badblocks(cmd, sector,
                                                         nr_sectors);

                if (sts != BLK_STS_OK)
                        return sts;
        }

        if (!dev->memory_backed)
                return BLK_STS_OK;

        return null_handle_memory_backed(cmd, op, sector, nr_sectors);
}

/*
 * Execute @cmd and complete it. Flushes are handled by null_handle_flush();
 * everything else goes through the zoned or the regular processing path,
 * depending on dev->zoned.
 */
static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
                            sector_t nr_sectors, enum req_op op)
{
        struct nullb_device *dev = cmd->nq->dev;

        if (op == REQ_OP_FLUSH) {
                cmd->error = null_handle_flush(dev->nullb);
        } else {
                blk_status_t sts;

                if (dev->zoned)
                        sts = null_process_zoned_cmd(cmd, op, sector,
                                                     nr_sectors);
                else
                        sts = null_process_cmd(cmd, op, sector, nr_sectors);

                /* Keep an error that is already set (e.g. a timeout). */
                if (cmd->error == BLK_STS_OK)
                        cmd->error = sts;
        }

        nullb_complete_cmd(cmd);
}

/*
 * Bandwidth timer callback. If the byte budget still equals a full tick's
 * worth, the timer is not restarted. Otherwise the budget is reset to
 * mb_per_tick(mbps), stopped hardware queues are restarted and the timer
 * is re-armed for another TIMER_INTERVAL.
 */
static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
{
        struct nullb *nullb = container_of(timer, struct nullb, bw_timer);
        unsigned int mbps = nullb->dev->mbps;

        if (atomic_long_read(&nullb->cur_bytes) != mb_per_tick(mbps)) {
                atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
                blk_mq_start_stopped_hw_queues(nullb->q, true);

                hrtimer_forward_now(&nullb->bw_timer,
                                    ktime_set(0, TIMER_INTERVAL));
                return HRTIMER_RESTART;
        }

        return HRTIMER_NORESTART;
}

static void nullb_setup_bwtimer(struct nullb *nullb)
{
        ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);

        hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        nullb->bw_timer.function = nullb_bwtimer_fn;
        atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps));
        hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
}

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION

/* Ask the device's timeout fault attribute whether @rq should time out. */
static bool should_timeout_request(struct request *rq)
{
        struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);

        return should_fail(&cmd->nq->dev->timeout_config.attr, 1);
}

/* Ask the device's requeue fault attribute whether @rq should be requeued. */
static bool should_requeue_request(struct request *rq)
{
        struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);

        return should_fail(&cmd->nq->dev->requeue_config.attr, 1);
}

/* Ask the device's init_hctx fault attribute whether hctx init should fail. */
static bool should_init_hctx_fail(struct nullb_device *dev)
{
        return should_fail(&dev->init_hctx_fault_config.attr, 1);
}

#else

/* Fault injection is compiled out: never inject a timeout. */
static bool should_timeout_request(struct request *rq)
{
        return false;
}

/* Fault injection is compiled out: never inject a requeue. */
static bool should_requeue_request(struct request *rq)
{
        return false;
}

/* Fault injection is compiled out: never fail hctx initialization. */
static bool should_init_hctx_fail(struct nullb_device *dev)
{
        return false;
}

#endif

static void null_map_queues(struct blk_mq_tag_set *set)
{
        struct nullb *nullb = set->driver_data;
        int i, qoff;
        unsigned int submit_queues = g_submit_queues;
        unsigned int poll_queues = g_poll_queues;

        if (nullb) {
                struct nullb_device *dev = nullb->dev;

                /*
                 * Refer nr_hw_queues of the tag set to check if the expected
                 * number of hardware queues are prepared. If block layer failed
                 * to prepare them, use previous numbers of submit queues and
                 * poll queues to map queues.
                 */
                if (set->nr_hw_queues ==
                    dev->submit_queues + dev->poll_queues) {
                        submit_queues = dev->submit_queues;
                        poll_queues = dev->poll_queues;
                } else if (set->nr_hw_queues ==
                           dev->prev_submit_queues + dev->prev_poll_queues) {
                        submit_queues = dev->prev_submit_queues;
                        poll_queues = dev->prev_poll_queues;
                } else {
                        pr_warn("tag set has unexpected nr_hw_queues: %d\n",
                                set->nr_hw_queues);
                        WARN_ON_ONCE(true);
                        submit_queues = 1;
                        poll_queues = 0;
                }
        }

        for (i = 0, qoff = 0; i < set->nr_maps; i++) {
                struct blk_mq_queue_map *map = &set->map[i];

                switch (i) {
                case HCTX_TYPE_DEFAULT:
                        map->nr_queues = submit_queues;
                        break;
                case HCTX_TYPE_READ:
                        map->nr_queues = 0;
                        continue;
                case HCTX_TYPE_POLL:
                        map->nr_queues = poll_queues;
                        break;
                }
                map->queue_offset = qoff;
                qoff += map->nr_queues;
                blk_mq_map_queues(map);
        }
}

static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
        struct nullb_queue *nq = hctx->driver_data;
        LIST_HEAD(list);
        int nr = 0;
        struct request *rq;

        spin_lock(&nq->poll_lock);
        list_splice_init(&nq->poll_list, &list);
        list_for_each_entry(rq, &list, queuelist)
                blk_mq_set_request_complete(rq);
        spin_unlock(&nq->poll_lock);

        while (!list_empty(&list)) {
                struct nullb_cmd *cmd;
                struct request *req;

                req = list_first_entry(&list, struct request, queuelist);
                list_del_init(&req->queuelist);
                cmd = blk_mq_rq_to_pdu(req);
                cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req),
                                                blk_rq_sectors(req));
                if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error,
                                        blk_mq_end_request_batch))
                        blk_mq_end_request(req, cmd->error);
                nr++;
        }

        return nr;
}

static enum blk_eh_timer_return null_timeout_rq(struct request *rq)
{
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);

        if (hctx->type == HCTX_TYPE_POLL) {
                struct nullb_queue *nq = hctx->driver_data;

                spin_lock(&nq->poll_lock);
                /* The request may have completed meanwhile. */
                if (blk_mq_request_completed(rq)) {
                        spin_unlock(&nq->poll_lock);
                        return BLK_EH_DONE;
                }
                list_del_init(&rq->queuelist);
                spin_unlock(&nq->poll_lock);
        }

        pr_info("rq %p timed out\n", rq);

        /*
         * If the device is marked as blocking (i.e. memory backed or zoned
         * device), the submission path may be blocked waiting for resources
         * and cause real timeouts. For these real timeouts, the submission
         * path will complete the request using blk_mq_complete_request().
         * Only fake timeouts need to execute blk_mq_complete_request() here.
         */
        cmd->error = BLK_STS_TIMEOUT;
        if (cmd->fake_timeout || hctx->type == HCTX_TYPE_POLL)
                blk_mq_complete_request(rq);
        return BLK_EH_DONE;
}

/*
 * queue_rq handler. Prepares the per-request command, applies the optional
 * requeue fault injection and bandwidth throttling, starts the request and
 * then parks it on the poll list (poll queues), leaves it pending (fake
 * timeout), or executes it through null_handle_cmd().
 */
static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
                                  const struct blk_mq_queue_data *bd)
{
        struct request *rq = bd->rq;
        struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
        struct nullb_queue *nq = hctx->driver_data;
        sector_t nr_sectors = blk_rq_sectors(rq);
        sector_t sector = blk_rq_pos(rq);
        const bool is_poll = hctx->type == HCTX_TYPE_POLL;

        might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);

        /* Timer-based completion needs the per-command timer set up. */
        if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) {
                hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                cmd->timer.function = null_cmd_timer_expired;
        }

        cmd->error = BLK_STS_OK;
        cmd->nq = nq;
        cmd->fake_timeout = should_timeout_request(rq) ||
                            blk_should_fake_timeout(rq->q);

        if (should_requeue_request(rq)) {
                /*
                 * Alternate between the core BUSY path and the
                 * driver-driven requeue path.
                 */
                nq->requeue_selection++;
                if (nq->requeue_selection & 1)
                        return BLK_STS_RESOURCE;

                blk_mq_requeue_request(rq, true);
                return BLK_STS_OK;
        }

        if (test_bit(NULLB_DEV_FL_THROTTLED, &nq->dev->flags)) {
                blk_status_t throttle_sts = null_handle_throttled(cmd);

                if (throttle_sts != BLK_STS_OK)
                        return throttle_sts;
        }

        blk_mq_start_request(rq);

        if (is_poll) {
                /* Queued here; null_poll() picks it up later. */
                spin_lock(&nq->poll_lock);
                list_add_tail(&rq->queuelist, &nq->poll_list);
                spin_unlock(&nq->poll_lock);
                return BLK_STS_OK;
        }

        /* A faked timeout leaves the request pending on purpose. */
        if (!cmd->fake_timeout)
                null_handle_cmd(cmd, sector, nr_sectors, req_op(rq));

        return BLK_STS_OK;
}

/*
 * queue_rqs handler: issue each request of @rqlist through null_queue_rq().
 * Requests that do not return BLK_STS_OK are collected, in order, and
 * handed back to the caller through @rqlist.
 */
static void null_queue_rqs(struct request **rqlist)
{
        struct blk_mq_queue_data bd = { };
        struct request *rejected = NULL;
        struct request **rejected_tail = &rejected;

        do {
                struct request *rq = rq_list_pop(rqlist);

                bd.rq = rq;
                if (null_queue_rq(rq->mq_hctx, &bd) != BLK_STS_OK)
                        rq_list_add_tail(&rejected_tail, rq);
        } while (!rq_list_empty(*rqlist));

        *rqlist = rejected;
}

/* Bind @nq to the device of @nullb and set up its poll list and lock. */
static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
{
        spin_lock_init(&nq->poll_lock);
        INIT_LIST_HEAD(&nq->poll_list);
        nq->dev = nullb->dev;
}

/*
 * init_hctx handler: attach the nullb_queue with index @hctx_idx to @hctx
 * and initialize it. Returns -EFAULT when the init_hctx fault injection
 * triggers, 0 otherwise.
 */
static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
                          unsigned int hctx_idx)
{
        struct nullb *nullb = hctx->queue->queuedata;
        struct nullb_queue *queue;

        if (should_init_hctx_fail(nullb->dev))
                return -EFAULT;

        queue = nullb->queues + hctx_idx;
        hctx->driver_data = queue;
        null_init_queue(nullb, queue);

        return 0;
}

/* blk-mq operations implemented by the null block driver. */
static const struct blk_mq_ops null_mq_ops = {
        /* setup */
        .init_hctx      = null_init_hctx,
        .map_queues     = null_map_queues,
        /* submission */
        .queue_rq       = null_queue_rq,
        .queue_rqs      = null_queue_rqs,
        /* completion and error handling */
        .poll           = null_poll,
        .complete       = null_complete_rq,
        .timeout        = null_timeout_rq,
};

/*
 * Tear down @nullb: release its index, unlink it from the device list,
 * remove the disk, stop throttling, and free the tag set (only when it is
 * the device-private one), the queues, any cached storage and finally
 * @nullb itself. A NULL @nullb is ignored.
 */
static void null_del_dev(struct nullb *nullb)
{
        struct nullb_device *dev;

        if (!nullb)
                return;

        dev = nullb->dev;

        ida_free(&nullb_indexes, nullb->index);
        list_del_init(&nullb->list);
        del_gendisk(nullb->disk);

        if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
                /* Stop the timer and lift the byte budget limit. */
                hrtimer_cancel(&nullb->bw_timer);
                atomic_long_set(&nullb->cur_bytes, LONG_MAX);
                blk_mq_start_stopped_hw_queues(nullb->q, true);
        }

        put_disk(nullb->disk);

        /* The shared tag set is not owned by this device. */
        if (nullb->tag_set == &nullb->__tag_set)
                blk_mq_free_tag_set(nullb->tag_set);

        kfree(nullb->queues);

        if (null_cache_active(nullb))
                null_free_device_storage(dev, true);

        kfree(nullb);
        dev->nullb = NULL;
}

static void null_config_discard(struct nullb *nullb, struct queue_limits *lim)
{
        if (nullb->dev->discard == false)
                return;

        if (!nullb->dev->memory_backed) {
                nullb->dev->discard = false;
                pr_info("discard option is ignored without memory backing\n");
                return;
        }

        if (nullb->dev->zoned) {
                nullb->dev->discard = false;
                pr_info("discard option is ignored in zoned mode\n");
                return;
        }

        lim->max_hw_discard_sectors = UINT_MAX >> 9;
}

/* Block device operations installed as the fops of every null_blk disk. */
static const struct block_device_operations null_ops = {
        .report_zones   = null_report_zones,
        .owner          = THIS_MODULE,
};

/*
 * Allocate the zeroed nullb_queue array of @nullb, sized for nr_cpu_ids
 * plus g_poll_queues entries. Returns 0 or -ENOMEM.
 */
static int setup_queues(struct nullb *nullb)
{
        int count = nr_cpu_ids;

        count += g_poll_queues;

        nullb->queues = kcalloc(count, sizeof(*nullb->queues), GFP_KERNEL);

        return nullb->queues ? 0 : -ENOMEM;
}

/*
 * Fill in the fields common to every null_blk tag set and allocate it.
 * With poll queues, nr_hw_queues grows by @poll_queues and three queue
 * maps are used instead of one. Returns the result of
 * blk_mq_alloc_tag_set().
 */
static int null_init_tag_set(struct blk_mq_tag_set *set, int poll_queues)
{
        set->ops = &null_mq_ops;
        set->cmd_size = sizeof(struct nullb_cmd);
        set->timeout = 5 * HZ;

        if (!poll_queues) {
                set->nr_maps = 1;
        } else {
                set->nr_maps = 3;
                set->nr_hw_queues += poll_queues;
        }

        return blk_mq_alloc_tag_set(set);
}

/*
 * Set up the shared, module-wide tag set from the module parameters. Does
 * nothing when tag_set.ops is already set. On failure tag_set.ops is reset
 * to NULL so that a later call tries again.
 */
static int null_init_global_tag_set(void)
{
        int ret;

        if (tag_set.ops)
                return 0;

        tag_set.numa_node = g_home_node;
        tag_set.queue_depth = g_hw_queue_depth;
        tag_set.nr_hw_queues = g_submit_queues;

        tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        if (g_blocking)
                tag_set.flags |= BLK_MQ_F_BLOCKING;
        if (g_shared_tag_bitmap)
                tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED;
        if (g_no_sched)
                tag_set.flags |= BLK_MQ_F_NO_SCHED;

        ret = null_init_tag_set(&tag_set, g_poll_queues);
        if (ret)
                tag_set.ops = NULL;

        return ret;
}

static int null_setup_tagset(struct nullb *nullb)
{
        if (nullb->dev->shared_tags) {
                nullb->tag_set = &tag_set;
                return null_init_global_tag_set();
        }

        nullb->tag_set = &nullb->__tag_set;
        nullb->tag_set->driver_data = nullb;
        nullb->tag_set->nr_hw_queues = nullb->dev->submit_queues;
        nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth;
        nullb->tag_set->numa_node = nullb->dev->home_node;
        nullb->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
        if (nullb->dev->no_sched)
                nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED;
        if (nullb->dev->shared_tag_bitmap)
                nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
        if (nullb->dev->blocking)
                nullb->tag_set->flags |= BLK_MQ_F_BLOCKING;
        return null_init_tag_set(nullb->tag_set, nullb->dev->poll_queues);
}

/*
 * Validate the configuration of @dev and clamp out-of-range values in
 * place. Returns -EINVAL for the legacy queue mode, an invalid block size
 * or a zone size that is not a power of two; 0 otherwise.
 */
static int null_validate_conf(struct nullb_device *dev)
{
        switch (dev->queue_mode) {
        case NULL_Q_RQ:
                pr_err("legacy IO path is no longer available\n");
                return -EINVAL;
        case NULL_Q_BIO:
                pr_err("BIO-based IO path is no longer available, using blk-mq instead.\n");
                dev->queue_mode = NULL_Q_MQ;
                break;
        }

        if (blk_validate_block_size(dev->blocksize))
                return -EINVAL;

        /* Bring the submit queue count into its valid range. */
        if (dev->use_per_node_hctx) {
                if (dev->submit_queues != nr_online_nodes)
                        dev->submit_queues = nr_online_nodes;
        } else if (dev->submit_queues > nr_cpu_ids) {
                dev->submit_queues = nr_cpu_ids;
        } else if (dev->submit_queues == 0) {
                dev->submit_queues = 1;
        }
        dev->prev_submit_queues = dev->submit_queues;

        /* Poll queues are capped by the module-wide setting. */
        if (dev->poll_queues > g_poll_queues)
                dev->poll_queues = g_poll_queues;
        dev->prev_poll_queues = dev->poll_queues;

        dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);

        if (!dev->memory_backed) {
                /* Without memory backing the cache is meaningless. */
                dev->cache_size = 0;
        } else {
                /* Memory backing allocates memory, so mark it blocking. */
                dev->blocking = true;
        }
        dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
                                dev->cache_size);
        dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);

        if (dev->zoned &&
            (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
                pr_err("zone_size must be power-of-two\n");
                return -EINVAL;
        }

        return 0;
}

#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
/*
 * Configure @attr from the option string @str. An empty string is accepted
 * and leaves @attr alone. Returns false only when setup_fault_attr()
 * rejects the string; on success the attribute's verbosity is set to 0.
 */
static bool __null_setup_fault(struct fault_attr *attr, char *str)
{
        if (str[0] == '\0')
                return true;

        if (setup_fault_attr(attr, str)) {
                attr->verbose = 0;
                return true;
        }

        return false;
}
#endif

/*
 * Set up the timeout, requeue and init_hctx fault attributes from their
 * module parameter strings, stopping at the first one that fails. Always
 * succeeds when fault injection is compiled out.
 */
static bool null_setup_fault(void)
{
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
        return __null_setup_fault(&null_timeout_attr, g_timeout_str) &&
               __null_setup_fault(&null_requeue_attr, g_requeue_str) &&
               __null_setup_fault(&null_init_hctx_attr, g_init_hctx_str);
#else
        return true;
#endif
}

static int null_add_dev(struct nullb_device *dev)
{
        struct queue_limits lim = {
                .logical_block_size        = dev->blocksize,
                .physical_block_size        = dev->blocksize,
                .max_hw_sectors                = dev->max_sectors,
        };

        struct nullb *nullb;
        int rv;

        rv = null_validate_conf(dev);
        if (rv)
                return rv;

        nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
        if (!nullb) {
                rv = -ENOMEM;
                goto out;
        }
        nullb->dev = dev;
        dev->nullb = nullb;

        spin_lock_init(&nullb->lock);

        rv = setup_queues(nullb);
        if (rv)
                goto out_free_nullb;

        rv = null_setup_tagset(nullb);
        if (rv)
                goto out_cleanup_queues;

        if (dev->virt_boundary)
                lim.virt_boundary_mask = PAGE_SIZE - 1;
        null_config_discard(nullb, &lim);
        if (dev->zoned) {
                rv = null_init_zoned_dev(dev, &lim);
                if (rv)
                        goto out_cleanup_tags;
        }

        nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb);
        if (IS_ERR(nullb->disk)) {
                rv = PTR_ERR(nullb->disk);
                goto out_cleanup_zone;
        }
        nullb->q = nullb->disk->queue;

        if (dev->mbps) {
                set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
                nullb_setup_bwtimer(nullb);
        }

        if (dev->cache_size > 0) {
                set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
                blk_queue_write_cache(nullb->q, true, dev->fua);
        }

        nullb->q->queuedata = nullb;
        blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);

        rv = ida_alloc(&nullb_indexes, GFP_KERNEL);
        if (rv < 0)
                goto out_cleanup_disk;

        nullb->index = rv;
        dev->index = rv;

        if (config_item_name(&dev->group.cg_item)) {
                /* Use configfs dir name as the device name */
                snprintf(nullb->disk_name, sizeof(nullb->disk_name),
                         "%s", config_item_name(&dev->group.cg_item));
        } else {
                sprintf(nullb->disk_name, "nullb%d", nullb->index);
        }

        set_capacity(nullb->disk,
                ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT);
        nullb->disk->major = null_major;
        nullb->disk->first_minor = nullb->index;
        nullb->disk->minors = 1;
        nullb->disk->fops = &null_ops;
        nullb->disk->private_data = nullb;
        strscpy_pad(nullb->disk->disk_name, nullb->disk_name, DISK_NAME_LEN);

        if (nullb->dev->zoned) {
                rv = null_register_zoned_dev(nullb);
                if (rv)
                        goto out_ida_free;
        }

        rv = add_disk(nullb->disk);
        if (rv)
                goto out_ida_free;

        list_add_tail(&nullb->list, &nullb_list);

        pr_info("disk %s created\n", nullb->disk_name);

        return 0;

out_ida_free:
        ida_free(&nullb_indexes, nullb->index);
out_cleanup_disk:
        put_disk(nullb->disk);
out_cleanup_zone:
        null_free_zoned_dev(dev);
out_cleanup_tags:
        if (nullb->tag_set == &nullb->__tag_set)
                blk_mq_free_tag_set(nullb->tag_set);
out_cleanup_queues:
        kfree(nullb->queues);
out_free_nullb:
        kfree(nullb);
        dev->nullb = NULL;
out:
        return rv;
}

/*
 * Search nullb_list, under the global lock, for a device whose disk_name
 * equals @name. Returns the device or NULL when there is no match. The
 * lock is dropped before returning.
 */
static struct nullb *null_find_dev_by_name(const char *name)
{
        struct nullb *match = NULL;
        struct nullb *cur;

        mutex_lock(&lock);
        list_for_each_entry(cur, &nullb_list, list) {
                if (strcmp(cur->disk_name, name))
                        continue;
                match = cur;
                break;
        }
        mutex_unlock(&lock);

        return match;
}

/*
 * Allocate a nullb_device and add it under the global lock. The device is
 * freed again if adding it fails. Returns 0, -ENOMEM or the error from
 * null_add_dev().
 */
static int null_create_dev(void)
{
        struct nullb_device *dev = null_alloc_dev();
        int err;

        if (!dev)
                return -ENOMEM;

        mutex_lock(&lock);
        err = null_add_dev(dev);
        mutex_unlock(&lock);

        if (err)
                null_free_dev(dev);

        return err;
}

/*
 * Delete @nullb, then release the storage and the nullb_device that
 * belonged to it. The device pointer is read first because null_del_dev()
 * frees @nullb.
 */
static void null_destroy_dev(struct nullb *nullb)
{
        struct nullb_device *owner = nullb->dev;

        null_del_dev(nullb);
        null_free_device_storage(owner, false);
        null_free_dev(owner);
}

/*
 * Module init: sanitize module parameters, set up fault injection,
 * register the configfs subsystem and the block major, then create
 * nr_devices devices.
 *
 * Returns 0 on success or a negative errno; on failure everything that was
 * registered or created here is undone.
 */
static int __init null_init(void)
{
        int ret = 0;
        unsigned int i;
        struct nullb *nullb;

        if (g_bs > PAGE_SIZE) {
                pr_warn("invalid block size\n");
                pr_warn("defaults block size to %lu\n", PAGE_SIZE);
                g_bs = PAGE_SIZE;
        }

        if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
                pr_err("invalid home_node value\n");
                g_home_node = NUMA_NO_NODE;
        }

        if (!null_setup_fault())
                return -EINVAL;

        if (g_queue_mode == NULL_Q_RQ) {
                pr_err("legacy IO path is no longer available\n");
                return -EINVAL;
        }

        if (g_use_per_node_hctx) {
                if (g_submit_queues != nr_online_nodes) {
                        pr_warn("submit_queues param is set to %u.\n",
                                nr_online_nodes);
                        g_submit_queues = nr_online_nodes;
                }
        } else if (g_submit_queues > nr_cpu_ids) {
                g_submit_queues = nr_cpu_ids;
        } else if (g_submit_queues <= 0) {
                g_submit_queues = 1;
        }

        config_group_init(&nullb_subsys.su_group);
        mutex_init(&nullb_subsys.su_mutex);

        /*
         * Initialize the global lock before the configfs subsystem is
         * registered, so the lock is ready before the driver becomes
         * reachable from outside this function.
         * NOTE(review): presumably the configfs handlers take this lock;
         * they are not visible here - confirm.
         */
        mutex_init(&lock);

        ret = configfs_register_subsystem(&nullb_subsys);
        if (ret)
                goto err_lock;

        null_major = register_blkdev(0, "nullb");
        if (null_major < 0) {
                ret = null_major;
                goto err_conf;
        }

        for (i = 0; i < nr_devices; i++) {
                ret = null_create_dev();
                if (ret)
                        goto err_dev;
        }

        pr_info("module loaded\n");
        return 0;

err_dev:
        while (!list_empty(&nullb_list)) {
                nullb = list_entry(nullb_list.next, struct nullb, list);
                null_destroy_dev(nullb);
        }
        /*
         * Devices using shared tags allocate the global tag set lazily and
         * never free it themselves; release it here as null_exit() does.
         */
        if (tag_set.ops) {
                blk_mq_free_tag_set(&tag_set);
                tag_set.ops = NULL;
        }
        unregister_blkdev(null_major, "nullb");
err_conf:
        configfs_unregister_subsystem(&nullb_subsys);
err_lock:
        mutex_destroy(&lock);
        return ret;
}

/*
 * Module exit: unregister the configfs subsystem and the block major,
 * destroy every remaining device under the global lock, free the shared
 * tag set if it was set up, and destroy the lock.
 */
static void __exit null_exit(void)
{
        configfs_unregister_subsystem(&nullb_subsys);
        unregister_blkdev(null_major, "nullb");

        mutex_lock(&lock);
        while (!list_empty(&nullb_list)) {
                struct nullb *victim = list_first_entry(&nullb_list,
                                                        struct nullb, list);

                null_destroy_dev(victim);
        }
        mutex_unlock(&lock);

        if (tag_set.ops)
                blk_mq_free_tag_set(&tag_set);

        mutex_destroy(&lock);
}

/* Module entry and exit points, followed by the module metadata. */
module_init(null_init);
module_exit(null_exit);

MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>");
MODULE_DESCRIPTION("multi queue aware block test driver");
MODULE_LICENSE("GPL");












































































































































    2 



























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corporation, 2021
 *
 * Author: Mike Rapoport <rppt@linux.ibm.com>
 */

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/swap.h>
#include <linux/mount.h>
#include <linux/memfd.h>
#include <linux/bitops.h>
#include <linux/printk.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/pseudo_fs.h>
#include <linux/secretmem.h>
#include <linux/set_memory.h>
#include <linux/sched/signal.h>

#include <uapi/linux/magic.h>

#include <asm/tlbflush.h>

#include "internal.h"

#undef pr_fmt
#define pr_fmt(fmt) "secretmem: " fmt

/*
 * Define mode and flag masks to allow validation of the system call
 * parameters.
 */
#define SECRETMEM_MODE_MASK        (0x0)
#define SECRETMEM_FLAGS_MASK        SECRETMEM_MODE_MASK

/*
 * Enable switch, defaulting to on. It is exposed as the "enable" parameter
 * with mode 0400, so the description below must use that same name.
 */
static bool secretmem_enable __ro_after_init = 1;
module_param_named(enable, secretmem_enable, bool, 0400);
MODULE_PARM_DESC(enable,
                 "Enable secretmem and memfd_secret(2) system call");

/* User count; secretmem_release() decrements it. */
static atomic_t secretmem_users;

/* Returns true while secretmem_users is non-zero. */
bool secretmem_active(void)
{
        return atomic_read(&secretmem_users) != 0;
}

/*
 * Page fault handler for secretmem mappings.
 *
 * Faults at or beyond i_size fail with -EINVAL. Otherwise the page at
 * vmf->pgoff is looked up in the mapping; if it is missing, a zeroed folio
 * is allocated, removed from the direct map, added to the mapping and the
 * kernel TLB range covering it is flushed.
 *
 * Returns VM_FAULT_LOCKED with vmf->page set on success, VM_FAULT_OOM when
 * the allocation fails, or the vmf_error() translation of any other error.
 */
static vm_fault_t secretmem_fault(struct vm_fault *vmf)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        struct inode *inode = file_inode(vmf->vma->vm_file);
        pgoff_t offset = vmf->pgoff;
        gfp_t gfp = vmf->gfp_mask;
        unsigned long addr;
        struct page *page;
        struct folio *folio;
        vm_fault_t ret;
        int err;

        if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
                return vmf_error(-EINVAL);

        filemap_invalidate_lock_shared(mapping);

retry:
        page = find_lock_page(mapping, offset);
        if (!page) {
                folio = folio_alloc(gfp | __GFP_ZERO, 0);
                if (!folio) {
                        ret = VM_FAULT_OOM;
                        goto out;
                }

                page = &folio->page;
                err = set_direct_map_invalid_noflush(page);
                if (err) {
                        folio_put(folio);
                        ret = vmf_error(err);
                        goto out;
                }

                __folio_mark_uptodate(folio);
                err = filemap_add_folio(mapping, folio, offset, gfp);
                if (unlikely(err)) {
                        /*
                         * If a split of large page was required, it
                         * already happened when we marked the page invalid
                         * which guarantees that this call won't fail
                         */
                        set_direct_map_default_noflush(page);
                        /*
                         * Drop the folio only after its page is back in
                         * the direct map: once released, the page may be
                         * handed out again and accessed through the
                         * direct map right away.
                         */
                        folio_put(folio);
                        /* Someone else added the page first: use theirs. */
                        if (err == -EEXIST)
                                goto retry;

                        ret = vmf_error(err);
                        goto out;
                }

                addr = (unsigned long)page_address(page);
                flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
        }

        vmf->page = page;
        ret = VM_FAULT_LOCKED;

out:
        filemap_invalidate_unlock_shared(mapping);
        return ret;
}

/* VMA operations for secretmem mappings: page faults go to secretmem_fault(). */
static const struct vm_operations_struct secretmem_vm_ops = {
        .fault = secretmem_fault,
};

/*
 * ->release() hook of secretmem_fops.
 *
 * Decrements secretmem_users, the counter memfd_secret() increments when it
 * installs a new secretmem file. Always returns 0.
 */
static int secretmem_release(struct inode *inode, struct file *file)
{
        atomic_dec(&secretmem_users);
        return 0;
}

/*
 * ->mmap() hook for secretmem files.
 *
 * Fails with -EINVAL when the VMA has neither VM_SHARED nor VM_MAYSHARE set,
 * and with -EAGAIN when mlock_future_ok() refuses the VMA's length. Otherwise
 * the VMA is flagged VM_LOCKED | VM_DONTDUMP, gets secretmem_vm_ops installed
 * and 0 is returned.
 */
static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
{
        unsigned long len;

        /* only shared mappings are accepted */
        if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)))
                return -EINVAL;

        len = vma->vm_end - vma->vm_start;
        if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
                return -EAGAIN;

        vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP);
        vma->vm_ops = &secretmem_vm_ops;

        return 0;
}

/*
 * Return true when @vma uses secretmem_vm_ops, i.e. it was set up by
 * secretmem_mmap().
 */
bool vma_is_secretmem(struct vm_area_struct *vma)
{
        return vma->vm_ops == &secretmem_vm_ops;
}

/*
 * File operations for secretmem files; only release and mmap are provided.
 * Passed to alloc_file_pseudo() by secretmem_file_create().
 */
static const struct file_operations secretmem_fops = {
        .release        = secretmem_release,
        .mmap                = secretmem_mmap,
};

/*
 * ->migrate_folio() hook: unconditionally returns -EBUSY, so a secretmem
 * folio is never migrated through this callback.
 */
static int secretmem_migrate_folio(struct address_space *mapping,
                struct folio *dst, struct folio *src, enum migrate_mode mode)
{
        return -EBUSY;
}

/*
 * ->free_folio() hook: put the folio's page back to the default direct-map
 * state (the fault path marks it invalid with
 * set_direct_map_invalid_noflush()) and then zero the whole folio.
 *
 * NOTE(review): the order looks deliberate - presumably the page must be
 * mapped again before it can be zeroed; confirm before reordering.
 */
static void secretmem_free_folio(struct folio *folio)
{
        set_direct_map_default_noflush(&folio->page);
        folio_zero_segment(folio, 0, folio_size(folio));
}

/*
 * Address-space operations installed on every secretmem inode by
 * secretmem_file_create().
 */
const struct address_space_operations secretmem_aops = {
        .dirty_folio        = noop_dirty_folio,
        .free_folio        = secretmem_free_folio,
        .migrate_folio        = secretmem_migrate_folio,
};

/*
 * ->setattr() hook for secretmem inodes.
 *
 * A size change (ATTR_SIZE) is refused with -EINVAL once the inode already
 * has a non-zero i_size; every other request is passed to simple_setattr().
 * The check and the update run under the mapping's invalidate lock.
 */
static int secretmem_setattr(struct mnt_idmap *idmap,
                             struct dentry *dentry, struct iattr *iattr)
{
        struct inode *inode = d_inode(dentry);
        struct address_space *mapping = inode->i_mapping;
        unsigned int ia_valid = iattr->ia_valid;
        int ret = -EINVAL;

        filemap_invalidate_lock(mapping);

        /* allowed unless this would resize an inode that already has a size */
        if (!(ia_valid & ATTR_SIZE) || !inode->i_size)
                ret = simple_setattr(idmap, dentry, iattr);

        filemap_invalidate_unlock(mapping);

        return ret;
}

/* Inode operations for secretmem inodes; only setattr is overridden. */
static const struct inode_operations secretmem_iops = {
        .setattr = secretmem_setattr,
};

/*
 * Internal mount of the secretmem pseudo filesystem; assigned from
 * kern_mount() in secretmem_init() and used by secretmem_file_create().
 */
static struct vfsmount *secretmem_mnt;

/*
 * Allocate an anonymous inode on the secretmem mount and wrap it in a
 * pseudo file opened O_RDWR with secretmem_fops.
 *
 * @flags is accepted for the caller's convenience but is not consulted here.
 *
 * Returns the new file, or an ERR_PTR() when inode allocation, the security
 * hook or file allocation fails; the inode is dropped again on the latter
 * two failures.
 */
static struct file *secretmem_file_create(unsigned long flags)
{
        const char *anon_name = "[secretmem]";
        const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
        struct inode *inode;
        struct file *file;
        int err;

        inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        err = security_inode_init_security_anon(inode, &qname, NULL);
        if (err) {
                iput(inode);
                return ERR_PTR(err);
        }

        file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
                                 O_RDWR, &secretmem_fops);
        if (IS_ERR(file)) {
                /* the file never took over the inode, release it here */
                iput(inode);
                return file;
        }

        mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
        mapping_set_unevictable(inode->i_mapping);

        inode->i_op = &secretmem_iops;
        inode->i_mapping->a_ops = &secretmem_aops;

        /* pretend we are a normal file with zero size */
        inode->i_mode |= S_IFREG;
        inode->i_size = 0;

        return file;
}

/*
 * memfd_secret() - create a secretmem file and return a descriptor for it.
 *
 * Returns -ENOSYS when secretmem is disabled, -EINVAL for flag bits outside
 * SECRETMEM_FLAGS_MASK | O_CLOEXEC, -ENFILE when the secretmem_users counter
 * is negative, or the error from descriptor/file allocation. On success the
 * file gets O_LARGEFILE, secretmem_users is incremented and the new fd is
 * returned.
 */
SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
{
        struct file *file;
        int fd;

        /* make sure local flags do not conflict with global fcntl.h */
        BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);

        if (!secretmem_enable)
                return -ENOSYS;
        if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
                return -EINVAL;
        if (atomic_read(&secretmem_users) < 0)
                return -ENFILE;

        fd = get_unused_fd_flags(flags & O_CLOEXEC);
        if (fd < 0)
                return fd;

        file = secretmem_file_create(flags);
        if (IS_ERR(file)) {
                /* nothing was installed yet, just hand the slot back */
                put_unused_fd(fd);
                return PTR_ERR(file);
        }

        file->f_flags |= O_LARGEFILE;

        /* balanced by the atomic_dec() in secretmem_release() */
        atomic_inc(&secretmem_users);
        fd_install(fd, file);

        return fd;
}

/*
 * ->init_fs_context() for the secretmem pseudo filesystem: set up a pseudo
 * fs context tagged with SECRETMEM_MAGIC. Returns 0, or -ENOMEM when
 * init_pseudo() yields NULL.
 */
static int secretmem_init_fs_context(struct fs_context *fc)
{
        if (!init_pseudo(fc, SECRETMEM_MAGIC))
                return -ENOMEM;

        return 0;
}

/*
 * Filesystem type backing secretmem files; mounted internally by
 * secretmem_init() via kern_mount().
 */
static struct file_system_type secretmem_fs = {
        .name                = "secretmem",
        .init_fs_context = secretmem_init_fs_context,
        .kill_sb        = kill_anon_super,
};

/*
 * Boot-time setup, registered with fs_initcall(): when secretmem is enabled,
 * mount the secretmem pseudo filesystem and mark the mount MNT_NOEXEC.
 * Returns 0 on success or when secretmem is disabled, otherwise the
 * kern_mount() error.
 */
static int __init secretmem_init(void)
{
        if (secretmem_enable) {
                secretmem_mnt = kern_mount(&secretmem_fs);
                if (IS_ERR(secretmem_mnt))
                        return PTR_ERR(secretmem_mnt);

                /* prevent secretmem mappings from ever getting PROT_EXEC */
                secretmem_mnt->mnt_flags |= MNT_NOEXEC;
        }

        return 0;
}
fs_initcall(secretmem_init);


























































































































    1 





   11 














    1 






   11 








   11 






    3 













1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef RQ_QOS_H
#define RQ_QOS_H

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk_types.h>
#include <linux/atomic.h>
#include <linux/wait.h>
#include <linux/blk-mq.h>

#include "blk-mq-debugfs.h"

struct blk_mq_debugfs_attr;

/*
 * Identifies a QoS policy attached to a request queue; rq_qos_id() matches
 * this value against rq_qos::id when searching the queue's policy list.
 */
enum rq_qos_id {
        RQ_QOS_WBT,        /* looked up by wbt_rq_qos() */
        RQ_QOS_LATENCY,        /* looked up by iolat_rq_qos() */
        RQ_QOS_COST,        /* NOTE(review): presumably the I/O cost policy - confirm */
};

/*
 * A wait queue paired with an atomic in-flight counter; set up by
 * rq_wait_init(), which zeroes the counter and initializes the queue head.
 */
struct rq_wait {
        wait_queue_head_t wait;
        atomic_t inflight;
};

/*
 * One QoS policy instance. Instances are chained through @next; the chain
 * head is request_queue::rq_qos (see rq_qos_id()).
 */
struct rq_qos {
        const struct rq_qos_ops *ops;        /* policy callbacks */
        struct gendisk *disk;
        enum rq_qos_id id;                /* which policy this instance is */
        struct rq_qos *next;                /* next policy in the chain, or NULL */
#ifdef CONFIG_BLK_DEBUG_FS
        struct dentry *debugfs_dir;
#endif
};

/*
 * Callback table a policy supplies through rq_qos_add(). The hook names
 * mirror the rq_qos_*() inline wrappers in this header.
 *
 * NOTE(review): the dispatch code (__rq_qos_*()) is not visible here, so
 * whether individual hooks may be left NULL should be confirmed there.
 */
struct rq_qos_ops {
        void (*throttle)(struct rq_qos *, struct bio *);
        void (*track)(struct rq_qos *, struct request *, struct bio *);
        void (*merge)(struct rq_qos *, struct request *, struct bio *);
        void (*issue)(struct rq_qos *, struct request *);
        void (*requeue)(struct rq_qos *, struct request *);
        void (*done)(struct rq_qos *, struct request *);
        void (*done_bio)(struct rq_qos *, struct bio *);
        void (*cleanup)(struct rq_qos *, struct bio *);
        void (*queue_depth_changed)(struct rq_qos *);
        void (*exit)(struct rq_qos *);
        const struct blk_mq_debugfs_attr *debugfs_attrs;
};

/*
 * Depth-scaling state operated on by rq_depth_scale_up(),
 * rq_depth_scale_down() and rq_depth_calc_max_depth() (declared in this
 * header).
 *
 * NOTE(review): the exact meaning of each field is defined by those
 * out-of-line helpers, which are not visible here.
 */
struct rq_depth {
        unsigned int max_depth;

        int scale_step;
        bool scaled_max;

        unsigned int queue_depth;
        unsigned int default_depth;
};

/*
 * Find the policy with identifier @id in the chain hanging off @q->rq_qos.
 * Returns the matching rq_qos, or NULL when the chain holds no such policy.
 */
static inline struct rq_qos *rq_qos_id(struct request_queue *q,
                                       enum rq_qos_id id)
{
        struct rq_qos *cur = q->rq_qos;

        /* walk until we hit the wanted policy or run off the end */
        while (cur && cur->id != id)
                cur = cur->next;

        return cur;
}

/* Return the RQ_QOS_WBT policy attached to @q, or NULL if there is none. */
static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
{
        return rq_qos_id(q, RQ_QOS_WBT);
}

/* Return the RQ_QOS_LATENCY policy attached to @q, or NULL if there is none. */
static inline struct rq_qos *iolat_rq_qos(struct request_queue *q)
{
        return rq_qos_id(q, RQ_QOS_LATENCY);
}

/*
 * Prepare @rq_wait for use: initialize its wait queue head and start the
 * in-flight counter at zero.
 */
static inline void rq_wait_init(struct rq_wait *rq_wait)
{
        init_waitqueue_head(&rq_wait->wait);
        atomic_set(&rq_wait->inflight, 0);
}

/* Attach / detach a policy; only declared here, implemented out of line. */
int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
                const struct rq_qos_ops *ops);
void rq_qos_del(struct rq_qos *rqos);

/* Callback types taken by rq_qos_wait(); both receive its @private_data. */
typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data);

/* Helpers operating on struct rq_wait and struct rq_depth. */
void rq_qos_wait(struct rq_wait *rqw, void *private_data,
                 acquire_inflight_cb_t *acquire_inflight_cb,
                 cleanup_cb_t *cleanup_cb);
bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
bool rq_depth_scale_up(struct rq_depth *rqd);
bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
bool rq_depth_calc_max_depth(struct rq_depth *rqd);

/*
 * Out-of-line halves of the rq_qos_*() inline wrappers below; each wrapper
 * passes the head of the queue's policy chain as @rqos.
 */
void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_done(struct rq_qos *rqos, struct request *rq);
void __rq_qos_issue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio);
void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio);
void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_queue_depth_changed(struct rq_qos *rqos);

/* Forward @bio to __rq_qos_cleanup() when @q has any QoS policy attached. */
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
{
        if (!q->rq_qos)
                return;

        __rq_qos_cleanup(q->rq_qos, bio);
}

/*
 * Forward @rq to __rq_qos_done() when @q has a QoS policy attached.
 * Passthrough requests are skipped.
 */
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
{
        if (!q->rq_qos || blk_rq_is_passthrough(rq))
                return;

        __rq_qos_done(q->rq_qos, rq);
}

/* Forward @rq to __rq_qos_issue() when @q has any QoS policy attached. */
static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
{
        if (!q->rq_qos)
                return;

        __rq_qos_issue(q->rq_qos, rq);
}

/* Forward @rq to __rq_qos_requeue() when @q has any QoS policy attached. */
static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
{
        if (!q->rq_qos)
                return;

        __rq_qos_requeue(q->rq_qos, rq);
}

/*
 * Forward @bio to __rq_qos_done_bio(), but only if it has a block device
 * and carries BIO_QOS_THROTTLED or BIO_QOS_MERGED (the flags set by
 * rq_qos_throttle() and rq_qos_merge()), and the device's queue still has
 * a QoS policy attached.
 */
static inline void rq_qos_done_bio(struct bio *bio)
{
        struct request_queue *q;

        if (!bio->bi_bdev)
                return;

        /* bios that never went through throttle/merge have nothing to undo */
        if (!bio_flagged(bio, BIO_QOS_THROTTLED) &&
            !bio_flagged(bio, BIO_QOS_MERGED))
                return;

        q = bdev_get_queue(bio->bi_bdev);
        if (q->rq_qos)
                __rq_qos_done_bio(q->rq_qos, bio);
}

/*
 * When @q has a QoS policy attached, mark @bio BIO_QOS_THROTTLED and hand
 * it to __rq_qos_throttle(). rq_qos_done_bio() checks that flag later.
 */
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
        if (!q->rq_qos)
                return;

        bio_set_flag(bio, BIO_QOS_THROTTLED);
        __rq_qos_throttle(q->rq_qos, bio);
}

/* Forward @rq and @bio to __rq_qos_track() when @q has a QoS policy. */
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
                                struct bio *bio)
{
        if (!q->rq_qos)
                return;

        __rq_qos_track(q->rq_qos, rq, bio);
}

/*
 * When @q has a QoS policy attached, mark @bio BIO_QOS_MERGED and hand
 * @rq and @bio to __rq_qos_merge(). rq_qos_done_bio() checks that flag later.
 */
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
                                struct bio *bio)
{
        if (!q->rq_qos)
                return;

        bio_set_flag(bio, BIO_QOS_MERGED);
        __rq_qos_merge(q->rq_qos, rq, bio);
}

/* Call __rq_qos_queue_depth_changed() when @q has a QoS policy attached. */
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
{
        if (!q->rq_qos)
                return;

        __rq_qos_queue_depth_changed(q->rq_qos);
}

/*
 * Implemented out of line.
 * NOTE(review): presumably tears down every policy attached to the queue
 * through rq_qos_ops::exit - confirm against the definition.
 */
void rq_qos_exit(struct request_queue *);

#endif

































    4 


















    5 





















































































    4 



















































































    2 






























































































    1 





































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM timer

#if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TIMER_H

#include <linux/tracepoint.h>
#include <linux/hrtimer.h>
#include <linux/timer.h>

/*
 * Event class for timer events that record nothing but the address of the
 * struct timer_list; the pointer is stored and printed, never dereferenced.
 */
DECLARE_EVENT_CLASS(timer_class,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer),

        TP_STRUCT__entry(
                __field( void *,        timer        )
        ),

        TP_fast_assign(
                __entry->timer        = timer;
        ),

        TP_printk("timer=%p", __entry->timer)
);

/**
 * timer_init - called when the timer is initialized
 * @timer:        pointer to struct timer_list
 *
 * Uses the timer_class layout, so only the timer pointer is recorded.
 */
DEFINE_EVENT(timer_class, timer_init,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/*
 * Render timer flag bits as a '|'-separated list of one-letter codes:
 * M (TIMER_MIGRATING), D (TIMER_DEFERRABLE), P (TIMER_PINNED) and
 * I (TIMER_IRQSAFE).
 */
#define decode_timer_flags(flags)                        \
        __print_flags(flags, "|",                        \
                {  TIMER_MIGRATING,        "M" },                \
                {  TIMER_DEFERRABLE,        "D" },                \
                {  TIMER_PINNED,        "P" },                \
                {  TIMER_IRQSAFE,        "I" })

/**
 * timer_start - called when the timer is started
 * @timer:                pointer to struct timer_list
 * @bucket_expiry:        the bucket expiry time
 *
 * Records the timer's callback, expiry time and flags together with the
 * current jiffies value. In the printed output, timeout is expires minus
 * the recorded jiffies, cpu is flags masked with TIMER_CPUMASK and idx is
 * flags shifted right by TIMER_ARRAYSHIFT.
 */
TRACE_EVENT(timer_start,

        TP_PROTO(struct timer_list *timer,
                unsigned long bucket_expiry),

        TP_ARGS(timer, bucket_expiry),

        TP_STRUCT__entry(
                __field( void *,        timer                )
                __field( void *,        function        )
                __field( unsigned long,        expires                )
                __field( unsigned long,        bucket_expiry        )
                __field( unsigned long,        now                )
                __field( unsigned int,        flags                )
        ),

        TP_fast_assign(
                __entry->timer                = timer;
                __entry->function        = timer->function;
                __entry->expires        = timer->expires;
                __entry->bucket_expiry        = bucket_expiry;
                __entry->now                = jiffies;
                __entry->flags                = timer->flags;
        ),

        TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s",
                  __entry->timer, __entry->function, __entry->expires,
                  (long)__entry->expires - __entry->now,
                  __entry->bucket_expiry, __entry->flags & TIMER_CPUMASK,
                  __entry->flags >> TIMER_ARRAYSHIFT,
                  decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
);

/**
 * timer_expire_entry - called immediately before the timer callback
 * @timer:        pointer to struct timer_list
 * @baseclk:        value of timer_base::clk when timer expires
 *
 * Allows to determine the timer latency. The current jiffies value is
 * recorded as "now" next to @baseclk, and the callback address is read
 * from @timer.
 */
TRACE_EVENT(timer_expire_entry,

        TP_PROTO(struct timer_list *timer, unsigned long baseclk),

        TP_ARGS(timer, baseclk),

        TP_STRUCT__entry(
                __field( void *,        timer        )
                __field( unsigned long,        now        )
                __field( void *,        function)
                __field( unsigned long,        baseclk        )
        ),

        TP_fast_assign(
                __entry->timer                = timer;
                __entry->now                = jiffies;
                __entry->function        = timer->function;
                __entry->baseclk        = baseclk;
        ),

        TP_printk("timer=%p function=%ps now=%lu baseclk=%lu",
                  __entry->timer, __entry->function, __entry->now,
                  __entry->baseclk)
);

/**
 * timer_expire_exit - called immediately after the timer callback returns
 * @timer:        pointer to struct timer_list
 *
 * When used in combination with the timer_expire_entry tracepoint we can
 * determine the runtime of the timer callback function.
 *
 * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might
 * be invalid. We solely track the pointer. This event therefore uses the
 * timer_class layout, which stores only the pointer value.
 */
DEFINE_EVENT(timer_class, timer_expire_exit,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/**
 * timer_cancel - called when the timer is canceled
 * @timer:        pointer to struct timer_list
 *
 * Uses the timer_class layout, so only the timer pointer is recorded.
 */
DEFINE_EVENT(timer_class, timer_cancel,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/**
 * timer_base_idle - records an idle flag together with a CPU number
 * @is_idle:        idle state being reported
 * @cpu:        CPU number the state applies to
 *
 * NOTE(review): call sites are not visible here; presumably emitted when a
 * CPU's timer base enters or leaves the idle state - confirm.
 */
TRACE_EVENT(timer_base_idle,

        TP_PROTO(bool is_idle, unsigned int cpu),

        TP_ARGS(is_idle, cpu),

        TP_STRUCT__entry(
                __field( bool,                is_idle        )
                __field( unsigned int,        cpu        )
        ),

        TP_fast_assign(
                __entry->is_idle        = is_idle;
                __entry->cpu                = cpu;
        ),

        TP_printk("is_idle=%d cpu=%d",
                  __entry->is_idle, __entry->cpu)
);

/*
 * Print a clock id by name. Only the four clocks listed here have a
 * symbolic name in the table.
 */
#define decode_clockid(type)                                                \
        __print_symbolic(type,                                                \
                { CLOCK_REALTIME,        "CLOCK_REALTIME"        },        \
                { CLOCK_MONOTONIC,        "CLOCK_MONOTONIC"        },        \
                { CLOCK_BOOTTIME,        "CLOCK_BOOTTIME"        },        \
                { CLOCK_TAI,                "CLOCK_TAI"                })

/*
 * Print an hrtimer mode as text. Each supported combination has its own
 * table entry, with the ABS/REL base and the PINNED, SOFT and HARD
 * modifiers spelled out and joined by '|'.
 */
#define decode_hrtimer_mode(mode)                                        \
        __print_symbolic(mode,                                                \
                { HRTIMER_MODE_ABS,                "ABS"                },        \
                { HRTIMER_MODE_REL,                "REL"                },        \
                { HRTIMER_MODE_ABS_PINNED,        "ABS|PINNED"        },        \
                { HRTIMER_MODE_REL_PINNED,        "REL|PINNED"        },        \
                { HRTIMER_MODE_ABS_SOFT,        "ABS|SOFT"        },        \
                { HRTIMER_MODE_REL_SOFT,        "REL|SOFT"        },        \
                { HRTIMER_MODE_ABS_PINNED_SOFT,        "ABS|PINNED|SOFT" },        \
                { HRTIMER_MODE_REL_PINNED_SOFT,        "REL|PINNED|SOFT" },        \
                { HRTIMER_MODE_ABS_HARD,        "ABS|HARD" },                \
                { HRTIMER_MODE_REL_HARD,        "REL|HARD" },                \
                { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" },        \
                { HRTIMER_MODE_REL_PINNED_HARD,        "REL|PINNED|HARD" })

/**
 * hrtimer_init - called when the hrtimer is initialized
 * @hrtimer:        pointer to struct hrtimer
 * @clockid:        the hrtimers clock
 * @mode:        the hrtimers mode
 *
 * The clock and mode are printed by name through decode_clockid() and
 * decode_hrtimer_mode(); @hrtimer is recorded as a pointer value only.
 */
TRACE_EVENT(hrtimer_init,

        TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid,
                 enum hrtimer_mode mode),

        TP_ARGS(hrtimer, clockid, mode),

        TP_STRUCT__entry(
                __field( void *,                hrtimer                )
                __field( clockid_t,                clockid                )
                __field( enum hrtimer_mode,        mode                )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->clockid        = clockid;
                __entry->mode                = mode;
        ),

        TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer,
                  decode_clockid(__entry->clockid),
                  decode_hrtimer_mode(__entry->mode))
);

/**
 * hrtimer_start - called when the hrtimer is started
 * @hrtimer:        pointer to struct hrtimer
 * @mode:        the hrtimers mode
 *
 * Records the callback address plus the values returned by
 * hrtimer_get_expires() and hrtimer_get_softexpires(). Both are stored as
 * s64 and printed as unsigned 64-bit numbers.
 */
TRACE_EVENT(hrtimer_start,

        TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),

        TP_ARGS(hrtimer, mode),

        TP_STRUCT__entry(
                __field( void *,        hrtimer                )
                __field( void *,        function        )
                __field( s64,                expires                )
                __field( s64,                softexpires        )
                __field( enum hrtimer_mode,        mode        )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->function        = hrtimer->function;
                __entry->expires        = hrtimer_get_expires(hrtimer);
                __entry->softexpires        = hrtimer_get_softexpires(hrtimer);
                __entry->mode                = mode;
        ),

        TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu "
                  "mode=%s", __entry->hrtimer, __entry->function,
                  (unsigned long long) __entry->expires,
                  (unsigned long long) __entry->softexpires,
                  decode_hrtimer_mode(__entry->mode))
);

/**
 * hrtimer_expire_entry - called immediately before the hrtimer callback
 * @hrtimer:        pointer to struct hrtimer
 * @now:        pointer to variable which contains current time of the
 *                timers base.
 *
 * Allows to determine the timer latency. @now is dereferenced when the
 * event is recorded, so it must point to a valid value at that time.
 */
TRACE_EVENT(hrtimer_expire_entry,

        TP_PROTO(struct hrtimer *hrtimer, ktime_t *now),

        TP_ARGS(hrtimer, now),

        TP_STRUCT__entry(
                __field( void *,        hrtimer        )
                __field( s64,                now        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->now                = *now;
                __entry->function        = hrtimer->function;
        ),

        TP_printk("hrtimer=%p function=%ps now=%llu",
                  __entry->hrtimer, __entry->function,
                  (unsigned long long) __entry->now)
);

/*
 * Event class for hrtimer events that record nothing but the address of the
 * struct hrtimer; the pointer is stored and printed, never dereferenced.
 */
DECLARE_EVENT_CLASS(hrtimer_class,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer),

        TP_STRUCT__entry(
                __field( void *,        hrtimer        )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
        ),

        TP_printk("hrtimer=%p", __entry->hrtimer)
);

/**
 * hrtimer_expire_exit - called immediately after the hrtimer callback returns
 * @hrtimer:        pointer to struct hrtimer
 *
 * When used in combination with the hrtimer_expire_entry tracepoint we can
 * determine the runtime of the callback function.
 *
 * Uses the hrtimer_class layout, so only the hrtimer pointer is recorded.
 */
DEFINE_EVENT(hrtimer_class, hrtimer_expire_exit,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer)
);

/**
 * hrtimer_cancel - called when the hrtimer is canceled
 * @hrtimer:        pointer to struct hrtimer
 *
 * Uses the hrtimer_class layout, so only the hrtimer pointer is recorded.
 */
DEFINE_EVENT(hrtimer_class, hrtimer_cancel,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer)
);

/**
 * itimer_state - called when itimer is started or canceled
 * @which:        name of the interval timer
 * @value:        the itimers value, itimer is canceled if value->it_value is
 *                zero, otherwise it is started
 * @expires:        the itimers expiry time
 *
 * The seconds and nanoseconds of it_value and it_interval are stored in
 * fields of type long. When printed, the nanosecond parts are divided by
 * NSEC_PER_USEC, so both values appear as seconds.microseconds.
 */
TRACE_EVENT(itimer_state,

        TP_PROTO(int which, const struct itimerspec64 *const value,
                 unsigned long long expires),

        TP_ARGS(which, value, expires),

        TP_STRUCT__entry(
                __field(        int,                        which                )
                __field(        unsigned long long,        expires                )
                __field(        long,                        value_sec        )
                __field(        long,                        value_nsec        )
                __field(        long,                        interval_sec        )
                __field(        long,                        interval_nsec        )
        ),

        TP_fast_assign(
                __entry->which                = which;
                __entry->expires        = expires;
                __entry->value_sec        = value->it_value.tv_sec;
                __entry->value_nsec        = value->it_value.tv_nsec;
                __entry->interval_sec        = value->it_interval.tv_sec;
                __entry->interval_nsec        = value->it_interval.tv_nsec;
        ),

        TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld",
                  __entry->which, __entry->expires,
                  __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC,
                  __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC)
);

/**
 * itimer_expire - called when itimer expires
 * @which:        type of the interval timer
 * @pid:        pid of the process which owns the timer
 * @now:        current time, used to calculate the latency of itimer
 *
 * @pid is converted with pid_nr() and stored as a numeric pid_t; the
 * struct pid pointer itself is not recorded.
 */
TRACE_EVENT(itimer_expire,

        TP_PROTO(int which, struct pid *pid, unsigned long long now),

        TP_ARGS(which, pid, now),

        TP_STRUCT__entry(
                __field( int ,                        which        )
                __field( pid_t,                        pid        )
                __field( unsigned long long,        now        )
        ),

        TP_fast_assign(
                __entry->which        = which;
                __entry->now        = now;
                __entry->pid        = pid_nr(pid);
        ),

        TP_printk("which=%d pid=%d now=%llu", __entry->which,
                  (int) __entry->pid, __entry->now)
);

#ifdef CONFIG_NO_HZ_COMMON

/*
 * List of tick dependencies. TICK_DEP_NAMES is expanded twice below with
 * different definitions of tick_dep_name(), tick_dep_mask_name() and
 * tick_dep_name_end(): first to emit TRACE_DEFINE_ENUM() entries, then to
 * build the { mask, "name" } table used by show_tick_dep_name().
 */
#define TICK_DEP_NAMES                                        \
                tick_dep_mask_name(NONE)                \
                tick_dep_name(POSIX_TIMER)                \
                tick_dep_name(PERF_EVENTS)                \
                tick_dep_name(SCHED)                        \
                tick_dep_name(CLOCK_UNSTABLE)                \
                tick_dep_name(RCU)                        \
                tick_dep_name_end(RCU_EXP)

#undef tick_dep_name
#undef tick_dep_mask_name
#undef tick_dep_name_end

/* The MASK will convert to their bits and they need to be processed too */
#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
        TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
#define tick_dep_name_end(sdep)  TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
        TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
/* NONE only has a mask defined for it */
#define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);

/* First expansion: export the enum values. */
TICK_DEP_NAMES

#undef tick_dep_name
#undef tick_dep_mask_name
#undef tick_dep_name_end

/*
 * Second set of definitions: each entry becomes a { mask, "name" } pair.
 * tick_dep_name_end() omits the trailing comma because it closes the list.
 */
#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
#define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep }

/* Print a tick dependency mask value by name. */
#define show_tick_dep_name(val)                                \
        __print_symbolic(val, TICK_DEP_NAMES)

/**
 * tick_stop - records a success value together with a tick dependency
 * @success:        result reported by the caller, printed as an integer
 * @dependency:        tick dependency mask value, printed by name through
 *                show_tick_dep_name()
 *
 * NOTE(review): call sites are not visible here; presumably emitted when
 * the kernel attempts to stop the periodic tick - confirm.
 */
TRACE_EVENT(tick_stop,

        TP_PROTO(int success, int dependency),

        TP_ARGS(success, dependency),

        TP_STRUCT__entry(
                __field( int ,                success        )
                __field( int ,                dependency )
        ),

        TP_fast_assign(
                __entry->success        = success;
                __entry->dependency        = dependency;
        ),

        TP_printk("success=%d dependency=%s",  __entry->success, \
                        show_tick_dep_name(__entry->dependency))
);
#endif

#endif /*  _TRACE_TIMER_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
































































































    1 



































    2 























    1 






























    1 




    1 









    1 











    1 







    1 

    1 






















    2 









    2 
    2 






















    1 



    2 


























    1 

    2 


















































































    3 







    4 


    2 







    2 






    2 
    2 







    4 






    3 


    2 
    2 
    2 

    2 








































































































































































































































































































    2 




    2 
























































    2 














    1 


    1 



















































































































































































    1 










    2 
    1 

    1 







    2 






    1 


























    2 






    2 
    2 








    2 







    6 








    6 

    5 






    2 

















































    4 
    4 
















































































































































































































































    6 
    6 

    6 


    6 


    6 


    6 


































    4 

    4 




    4 


    4 
    4 















    4 
    4 




































    2 















































    2 










    2 












    2 















    2 
    2 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 


    2 
    2 

































































































    1 










    2 





    1 

    2 










    2 

    2 
    2 




    2 




    2 





    2 









    2 















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_rwsem        (while writing or truncating, not reading or faulting)
 *   mm->mmap_lock
 *     mapping->invalidate_lock (in filemap_fault)
 *       folio_lock
 *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
 *           vma_start_write
 *             mapping->i_mmap_rwsem
 *               anon_vma->rwsem
 *                 mm->page_table_lock or pte_lock
 *                   swap_lock (in swap_duplicate, swap_info_get)
 *                     mmlist_lock (in mmput, drain_mmlist and others)
 *                     mapping->private_lock (in block_dirty_folio)
 *                       folio_lock_memcg move_lock (in block_dirty_folio)
 *                         i_pages lock (widely used)
 *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
 *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *                     bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                       sb_lock (within inode_lock in fs/fs-writeback.c)
 *                       i_pages lock (widely used, in set_page_dirty,
 *                                 in arch-dependent flush_dcache_mmap_lock,
 *                                 within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 *
 * hugetlbfs PageHuge() take locks in this order:
 *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *     vma_lock (hugetlb specific lock for pmd_sharing)
 *       mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
 *         folio_lock
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/mm_inline.h>

#include <asm/tlbflush.h>

#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
#include <trace/events/migrate.h>

#include "internal.h"

static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
        struct anon_vma *anon_vma;

        anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
        if (anon_vma) {
                atomic_set(&anon_vma->refcount, 1);
                anon_vma->num_children = 0;
                anon_vma->num_active_vmas = 0;
                anon_vma->parent = anon_vma;
                /*
                 * Initialise the anon_vma root to point to itself. If called
                 * from fork, the root will be reset to the parents anon_vma.
                 */
                anon_vma->root = anon_vma;
        }

        return anon_vma;
}

/*
 * Return an anon_vma to anon_vma_cachep. The refcount is expected to be zero
 * already (checked by VM_BUG_ON). If the root rwsem is currently held, wait
 * for the holder to finish before freeing.
 */
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
        VM_BUG_ON(atomic_read(&anon_vma->refcount));

        /*
         * Synchronize against folio_lock_anon_vma_read() such that
         * we can safely hold the lock without the anon_vma getting
         * freed.
         *
         * Relies on the full mb implied by the atomic_dec_and_test() from
         * put_anon_vma() against the acquire barrier implied by
         * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
         *
         * folio_lock_anon_vma_read()        VS        put_anon_vma()
         *   down_read_trylock()                  atomic_dec_and_test()
         *   LOCK                                  MB
         *   atomic_read()                          rwsem_is_locked()
         *
         * LOCK should suffice since the actual taking of the lock must
         * happen _before_ what follows.
         */
        /* The write lock taken below may block. */
        might_sleep();
        if (rwsem_is_locked(&anon_vma->root->rwsem)) {
                /*
                 * Lock and immediately unlock: nothing is modified, this
                 * only waits for the current lock holder to drop the lock.
                 */
                anon_vma_lock_write(anon_vma);
                anon_vma_unlock_write(anon_vma);
        }

        kmem_cache_free(anon_vma_cachep, anon_vma);
}

/*
 * Allocate one anon_vma_chain from anon_vma_chain_cachep using the caller's
 * @gfp mask. Returns NULL if the allocation fails.
 */
static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
        struct anon_vma_chain *link;

        link = kmem_cache_alloc(anon_vma_chain_cachep, gfp);
        return link;
}

/* Give an anon_vma_chain back to anon_vma_chain_cachep. */
static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
        struct kmem_cache *cache = anon_vma_chain_cachep;

        kmem_cache_free(cache, anon_vma_chain);
}

/*
 * Tie @avc to the (@vma, @anon_vma) pair: record both ends in the chain
 * entry, then add the entry to the vma's anon_vma_chain list and to the
 * anon_vma's interval tree.
 */
static void anon_vma_chain_link(struct vm_area_struct *vma,
                                struct anon_vma_chain *avc,
                                struct anon_vma *anon_vma)
{
        struct list_head *chain_head = &vma->anon_vma_chain;

        /* Fill in the entry before it is inserted into either structure. */
        avc->anon_vma = anon_vma;
        avc->vma = vma;

        list_add(&avc->same_vma, chain_head);
        anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}

/**
 * __anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, which
 * is handled inline by anon_vma_prepare(). But if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
 * and that may actually touch the rwsem even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * Return: 0 once @vma has an anon_vma (whether it was attached here or
 * was found already set under the page_table_lock), -ENOMEM if either
 * allocation failed.
 */
int __anon_vma_prepare(struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        struct anon_vma *anon_vma, *allocated;
        struct anon_vma_chain *avc;

        mmap_assert_locked(mm);
        might_sleep();

        /* Allocate the chain entry up front, before any lock is taken. */
        avc = anon_vma_chain_alloc(GFP_KERNEL);
        if (!avc)
                goto out_enomem;

        anon_vma = find_mergeable_anon_vma(vma);
        /* 'allocated' stays non-NULL only while a new anon_vma is unused. */
        allocated = NULL;
        if (!anon_vma) {
                /* Nothing to re-use: allocate a new anon_vma. */
                anon_vma = anon_vma_alloc();
                if (unlikely(!anon_vma))
                        goto out_enomem_free_avc;
                anon_vma->num_children++; /* self-parent link for new root */
                allocated = anon_vma;
        }

        anon_vma_lock_write(anon_vma);
        /* page_table_lock to protect against threads */
        spin_lock(&mm->page_table_lock);
        if (likely(!vma->anon_vma)) {
                /*
                 * vma->anon_vma is still unset: install ours. Both
                 * allocations are now in use, so clear the cleanup pointers.
                 */
                vma->anon_vma = anon_vma;
                anon_vma_chain_link(vma, avc, anon_vma);
                anon_vma->num_active_vmas++;
                allocated = NULL;
                avc = NULL;
        }
        spin_unlock(&mm->page_table_lock);
        anon_vma_unlock_write(anon_vma);

        /*
         * vma->anon_vma was already set when we checked under the lock:
         * release whatever we allocated but did not install.
         */
        if (unlikely(allocated))
                put_anon_vma(allocated);
        if (unlikely(avc))
                anon_vma_chain_free(avc);

        return 0;

 out_enomem_free_avc:
        anon_vma_chain_free(avc);
 out_enomem:
        return -ENOMEM;
}

/*
 * This is a useful helper function for locking the anon_vma root as
 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 * have the same vma.
 *
 * Such anon_vma's should have the same root, so you'd expect to see
 * just a single down_write() for the whole traversal.
 *
 * @root is the root the caller currently holds write-locked, or NULL if
 * none. If @anon_vma belongs to a different root, the old root (if any,
 * which is unexpected and warns once) is unlocked and the new root is
 * write-locked instead. Returns the root that is write-locked on return.
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
        struct anon_vma *new_root = anon_vma->root;
        if (new_root != root) {
                /* Switching roots mid-traversal is not expected. */
                if (WARN_ON_ONCE(root))
                        up_write(&root->rwsem);
                root = new_root;
                down_write(&root->rwsem);
        }
        return root;
}

/*
 * Release the write lock on @root's rwsem. A NULL @root means no root is
 * locked and nothing is done.
 */
static inline void unlock_anon_vma_root(struct anon_vma *root)
{
        if (!root)
                return;

        up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
 * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
 * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
 * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
 * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
 * call, we can identify this case by checking (!dst->anon_vma &&
 * src->anon_vma).
 *
 * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
 * and reuse existing anon_vma which has no vmas and only one child anon_vma.
 * This prevents degradation of anon_vma hierarchy to endless linear chain in
 * case of constantly forking task. On the other hand, an anon_vma with more
 * than one child isn't reused even if there was no alive vma, thus rmap
 * walker has a good chance of avoiding scanning the whole hierarchy when it
 * searches where page is mapped.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
        struct anon_vma_chain *avc, *pavc;
        /* Root whose rwsem is currently write-locked, NULL when none is. */
        struct anon_vma *root = NULL;

        /*
         * Walk src's chain back to front. anon_vma_chain_link() uses
         * list_add(), so each new entry goes to the front of dst's list.
         */
        list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma;

                /*
                 * Try a non-blocking allocation first so the root lock can
                 * stay held. If that fails, drop the lock and retry with
                 * GFP_KERNEL.
                 */
                avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
                if (unlikely(!avc)) {
                        unlock_anon_vma_root(root);
                        root = NULL;
                        avc = anon_vma_chain_alloc(GFP_KERNEL);
                        if (!avc)
                                goto enomem_failure;
                }
                anon_vma = pavc->anon_vma;
                /* (Re)take the root lock if it is not already held. */
                root = lock_anon_vma_root(root, anon_vma);
                anon_vma_chain_link(dst, avc, anon_vma);

                /*
                 * Reuse existing anon_vma if it has no vma and only one
                 * anon_vma child.
                 *
                 * Root anon_vma is never reused:
                 * it has self-parent reference and at least one child.
                 */
                if (!dst->anon_vma && src->anon_vma &&
                    anon_vma->num_children < 2 &&
                    anon_vma->num_active_vmas == 0)
                        dst->anon_vma = anon_vma;
        }
        /*
         * Count dst as an active vma of its anon_vma, whether that was
         * set by the caller beforehand or picked for reuse above.
         */
        if (dst->anon_vma)
                dst->anon_vma->num_active_vmas++;
        unlock_anon_vma_root(root);
        return 0;

 enomem_failure:
        /*
         * dst->anon_vma is dropped here otherwise its num_active_vmas can
         * be incorrectly decremented in unlink_anon_vmas().
         * We can safely do this because callers of anon_vma_clone() don't care
         * about dst->anon_vma if anon_vma_clone() failed.
         */
        dst->anon_vma = NULL;
        /* Tear down the chain entries linked so far. */
        unlink_anon_vmas(dst);
        return -ENOMEM;
}

/*
 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 * the corresponding VMA in the parent process is attached to.
 *
 * @vma:  the new VMA in the child
 * @pvma: the corresponding VMA in the parent
 *
 * Returns 0 on success (including when the parent has no anon_vma and
 * nothing needs doing), non-zero on failure: the error from
 * anon_vma_clone(), or -ENOMEM if allocating the new anon_vma or its chain
 * entry fails.
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
        struct anon_vma_chain *avc;
        struct anon_vma *anon_vma;
        int error;

        /* Don't bother if the parent process has no anon_vma here. */
        if (!pvma->anon_vma)
                return 0;

        /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
        vma->anon_vma = NULL;

        /*
         * First, attach the new VMA to the parent VMA's anon_vmas,
         * so rmap can find non-COWed pages in child processes.
         */
        error = anon_vma_clone(vma, pvma);
        if (error)
                return error;

        /* An existing anon_vma has been reused, all done then. */
        if (vma->anon_vma)
                return 0;

        /* Then add our own anon_vma. */
        anon_vma = anon_vma_alloc();
        if (!anon_vma)
                goto out_error;
        /* vma is the first active vma of the newly allocated anon_vma. */
        anon_vma->num_active_vmas++;
        avc = anon_vma_chain_alloc(GFP_KERNEL);
        if (!avc)
                goto out_error_free_anon_vma;

        /*
         * The root anon_vma's rwsem is the lock actually used when we
         * lock any of the anon_vmas in this anon_vma tree.
         */
        anon_vma->root = pvma->anon_vma->root;
        anon_vma->parent = pvma->anon_vma;
        /*
         * With refcounts, an anon_vma can stay around longer than the
         * process it belongs to. The root anon_vma needs to be pinned until
         * this anon_vma is freed, because the lock lives in the root.
         */
        get_anon_vma(anon_vma->root);
        /* Mark this anon_vma as the one where our new (COWed) pages go. */
        vma->anon_vma = anon_vma;
        anon_vma_lock_write(anon_vma);
        anon_vma_chain_link(vma, avc, anon_vma);
        /* The parent's anon_vma gains one child. */
        anon_vma->parent->num_children++;
        anon_vma_unlock_write(anon_vma);

        return 0;

 out_error_free_anon_vma:
        /* Drop the initial reference taken in anon_vma_alloc(). */
        put_anon_vma(anon_vma);
 out_error:
        /* Undo the links created by anon_vma_clone() above. */
        unlink_anon_vmas(vma);
        return -ENOMEM;
}

/*
 * Detach @vma from every anon_vma on its anon_vma_chain.
 *
 * Works in two passes. With the root rwsem write-locked, each chain entry
 * is removed from its anon_vma's interval tree; entries whose anon_vma
 * still has other users are freed straight away. After the lock is dropped,
 * the remaining entries (whose anon_vma's tree became empty) are freed and
 * a reference on each such anon_vma is put. vma->anon_vma is cleared.
 */
void unlink_anon_vmas(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc, *next;
        /* Root whose rwsem is currently write-locked, NULL when none is. */
        struct anon_vma *root = NULL;

        /*
         * Unlink each anon_vma chained to the VMA.  This list is ordered
         * from newest to oldest, ensuring the root anon_vma gets freed last.
         */
        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma = avc->anon_vma;

                root = lock_anon_vma_root(root, anon_vma);
                anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);

                /*
                 * Leave empty anon_vmas on the list - we'll need
                 * to free them outside the lock.
                 */
                if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
                        anon_vma->parent->num_children--;
                        continue;
                }

                list_del(&avc->same_vma);
                anon_vma_chain_free(avc);
        }
        if (vma->anon_vma) {
                vma->anon_vma->num_active_vmas--;

                /*
                 * The vma may still be used after the unlink; clear the
                 * pointer so that a new anon_vma gets prepared on a later
                 * fault.
                 */
                vma->anon_vma = NULL;
        }
        unlock_anon_vma_root(root);

        /*
         * Iterate the list once more, it now only contains empty and unlinked
         * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
         * needing to write-acquire the anon_vma->root->rwsem.
         */
        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma = avc->anon_vma;

                /* An empty anon_vma should have no children or active vmas. */
                VM_WARN_ON(anon_vma->num_children);
                VM_WARN_ON(anon_vma->num_active_vmas);
                put_anon_vma(anon_vma);

                list_del(&avc->same_vma);
                anon_vma_chain_free(avc);
        }
}

/*
 * Constructor passed to kmem_cache_create() for the "anon_vma" cache.
 * @data points to the object being constructed. Sets up its rwsem, a zero
 * refcount and an empty interval tree; the remaining fields are filled in
 * by anon_vma_alloc().
 */
static void anon_vma_ctor(void *data)
{
        struct anon_vma *anon_vma = data;

        init_rwsem(&anon_vma->rwsem);
        atomic_set(&anon_vma->refcount, 0);
        anon_vma->rb_root = RB_ROOT_CACHED;
}

/*
 * Create the two slab caches used by this file: "anon_vma" (with
 * anon_vma_ctor as constructor) and the anon_vma_chain cache. Both are
 * created with SLAB_PANIC, so there is no failure path to handle here.
 */
void __init anon_vma_init(void)
{
        const slab_flags_t av_flags =
                SLAB_TYPESAFE_BY_RCU | SLAB_PANIC | SLAB_ACCOUNT;
        const slab_flags_t avc_flags = SLAB_PANIC | SLAB_ACCOUNT;

        anon_vma_cachep = kmem_cache_create("anon_vma",
                                            sizeof(struct anon_vma), 0,
                                            av_flags, anon_vma_ctor);
        anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, avc_flags);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 *
 * Since there is no serialization whatsoever against folio_remove_rmap_*()
 * the best this function can do is return a refcount increased anon_vma
 * that might have been relevant to this page.
 *
 * The page might have been remapped to a different anon_vma or the anon_vma
 * returned may already be freed (and even reused).
 *
 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 * ensure that any anon_vma obtained from the page will still be valid for as
 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 *
 * All users of this function must be very careful when walking the anon_vma
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
 * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid
 * if there is a mapcount, we can dereference the anon_vma after observing
 * those.
 *
 * NOTE: the caller should normally hold folio lock when calling this.  If
 * not, the caller needs to double check the anon_vma didn't change after
 * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it
 * concurrently without folio lock protection). See folio_lock_anon_vma_read()
 * which has already covered that, and comment above remap_pages().
 */
/*
 * Returns the folio's anon_vma with its refcount raised, or NULL if the
 * folio's mapping is not tagged PAGE_MAPPING_ANON, the folio is not mapped,
 * or the anon_vma's refcount had already dropped to zero.
 */
struct anon_vma *folio_get_anon_vma(struct folio *folio)
{
        struct anon_vma *anon_vma = NULL;
        unsigned long anon_mapping;

        /*
         * The anon_vma slab is SLAB_TYPESAFE_BY_RCU, so inside the RCU
         * read-side section the pointer keeps referring to an anon_vma
         * object, even if that object has been freed and reused.
         */
        rcu_read_lock();
        anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
        /* Bail out unless ->mapping is tagged exactly PAGE_MAPPING_ANON. */
        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                goto out;
        if (!folio_mapped(folio))
                goto out;

        /* Strip the tag to recover the anon_vma pointer. */
        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        /* A refcount of zero means there is nothing to pin: give up. */
        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
                anon_vma = NULL;
                goto out;
        }

        /*
         * If this folio is still mapped, then its anon_vma cannot have been
         * freed.  But if it has been unmapped, we have no security against the
         * anon_vma structure being freed and reused (for another anon_vma:
         * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
         * above cannot corrupt).
         */
        if (!folio_mapped(folio)) {
                /*
                 * NOTE(review): the RCU section is left before the put,
                 * presumably because dropping the last reference can sleep
                 * (see might_sleep() in anon_vma_free()) - confirm.
                 */
                rcu_read_unlock();
                put_anon_vma(anon_vma);
                return NULL;
        }
out:
        rcu_read_unlock();

        return anon_vma;
}

/*
 * Similar to folio_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with folio_get_anon_vma() and then, in the !rwc->try_lock
 * case, block on the rwsem.
 */
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
                                          struct rmap_walk_control *rwc)
{
        struct anon_vma *anon_vma = NULL;
        struct anon_vma *root_anon_vma;
        unsigned long anon_mapping;

retry:
        rcu_read_lock();
        anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                goto out;
        if (!folio_mapped(folio))
                goto out;

        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        root_anon_vma = READ_ONCE(anon_vma->root);
        if (down_read_trylock(&root_anon_vma->rwsem)) {
                /*
                 * folio_move_anon_rmap() might have changed the anon_vma as we
                 * might not hold the folio lock here.
                 */
                if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
                             anon_mapping)) {
                        up_read(&root_anon_vma->rwsem);
                        rcu_read_unlock();
                        goto retry;
                }

                /*
                 * If the folio is still mapped, then this anon_vma is still
                 * its anon_vma, and holding the mutex ensures that it will
                 * not go away, see anon_vma_free().
                 */
                if (!folio_mapped(folio)) {
                        up_read(&root_anon_vma->rwsem);
                        anon_vma = NULL;
                }
                goto out;
        }

        if (rwc && rwc->try_lock) {
                anon_vma = NULL;
                rwc->contended = true;
                goto out;
        }

        /* trylock failed, we got to sleep */
        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
                anon_vma = NULL;
                goto out;
        }

        if (!folio_mapped(folio)) {
                rcu_read_unlock();
                put_anon_vma(anon_vma);
                return NULL;
        }

        /* we pinned the anon_vma, its safe to sleep */
        rcu_read_unlock();
        anon_vma_lock_read(anon_vma);

        /*
         * folio_move_anon_rmap() might have changed the anon_vma as we might
         * not hold the folio lock here.
         */
        if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
                     anon_mapping)) {
                anon_vma_unlock_read(anon_vma);
                put_anon_vma(anon_vma);
                anon_vma = NULL;
                goto retry;
        }

        if (atomic_dec_and_test(&anon_vma->refcount)) {
                /*
                 * Oops, we held the last refcount, release the lock
                 * and bail -- can't simply use put_anon_vma() because
                 * we'll deadlock on the anon_vma_lock_write() recursion.
                 */
                anon_vma_unlock_read(anon_vma);
                __put_anon_vma(anon_vma);
                anon_vma = NULL;
        }

        return anon_vma;

out:
        rcu_read_unlock();
        return anon_vma;
}

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * Issue the TLB flush that was batched up while unmapping pages, covering
 * remote CPUs.  A PTE that was dirty at unmap time has to be flushed before
 * any IO is started on the page, otherwise writes can be lost; likewise the
 * flush has to happen before the page is freed, otherwise data can leak.
 */
void try_to_unmap_flush(void)
{
        struct tlbflush_unmap_batch *ubc = &current->tlb_ubc;

        if (ubc->flush_required) {
                arch_tlbbatch_flush(&ubc->arch);
                /* Batch drained: nothing pending and nothing writable. */
                ubc->flush_required = false;
                ubc->writable = false;
        }
}

/*
 * Flush only if the current batch may contain writable TLB entries, since
 * those are the ones that can race with IO.
 */
void try_to_unmap_flush_dirty(void)
{
        if (!current->tlb_ubc.writable)
                return;

        try_to_unmap_flush();
}

/*
 * Layout of mm->tlb_flush_batched:
 *   bits  0-14: generation counter of pending flushes
 *   bits 16-30: generation counter of completed flushes
 * PENDING_LARGE is the threshold (half of the pending range) above which
 * the counters are reset to avoid overflow.
 */
#define TLB_FLUSH_BATCH_FLUSHED_SHIFT        16
#define TLB_FLUSH_BATCH_PENDING_MASK        ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
#define TLB_FLUSH_BATCH_PENDING_LARGE        (TLB_FLUSH_BATCH_PENDING_MASK >> 1)

/*
 * Queue a deferred TLB flush for @uaddr in @mm on the current task's unmap
 * batch (current->tlb_ubc) and advance the pending generation stored in
 * mm->tlb_flush_batched.
 *
 * @mm: address space the unmapped PTE belonged to.
 * @pteval: the PTE value being unmapped; nothing is queued if
 *          pte_accessible() rejects it.
 * @uaddr: user address handed to arch_tlbbatch_add_pending().
 */
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
                                      unsigned long uaddr)
{
        struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
        int batch;
        bool writable = pte_dirty(pteval);

        if (!pte_accessible(mm, pteval))
                return;

        arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
        tlb_ubc->flush_required = true;

        /*
         * Ensure compiler does not re-order the setting of tlb_flush_batched
         * before the PTE is cleared.
         */
        barrier();
        batch = atomic_read(&mm->tlb_flush_batched);
retry:
        if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
                /*
                 * Prevent `pending' from catching up with `flushed' because of
                 * overflow.  Reset `pending' and `flushed' to be 1 and 0 if
                 * `pending' becomes large.
                 */
                /* On failure the cmpxchg refreshes 'batch', so re-evaluate it. */
                if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1))
                        goto retry;
        } else {
                atomic_inc(&mm->tlb_flush_batched);
        }

        /*
         * If the PTE was dirty then it's best to assume it's writable. The
         * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
         * before the page is queued for IO.
         */
        if (writable)
                tlb_ubc->writable = true;
}

/*
 * Decide whether the TLB flush may be postponed to the end of a batch of
 * unmap operations, which reduces IPIs.  Deferral requires both that the
 * caller passed TTU_BATCH_FLUSH and that the architecture agrees.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
        return (flags & TTU_BATCH_FLUSH) && arch_tlbbatch_should_defer(mm);
}

/*
 * Reclaim unmaps pages under the PTL but, when TLB flushes are batched, does
 * not flush the TLB before dropping the PTL.  A parallel operation such as
 * mprotect or munmap can therefore slip in between reclaim unmapping a page
 * and reclaim flushing it, and could then reach data through a stale TLB
 * entry.  Tracking every mm with batching in flight would be expensive
 * during reclaim, so instead we remember whether batching happened in the
 * past and, if so, flush here when needed.  The price is one extra flush per
 * reclaim cycle, paid by the first operation at risk such as mprotect and
 * munmap.
 *
 * This must be called under the PTL so that an access to tlb_flush_batched
 * that is potentially a "reclaim vs mprotect/munmap/etc" race synchronises
 * via the PTL.
 */
void flush_tlb_batched_pending(struct mm_struct *mm)
{
        int snapshot = atomic_read(&mm->tlb_flush_batched);
        int pending_gen = snapshot & TLB_FLUSH_BATCH_PENDING_MASK;
        int flushed_gen = snapshot >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
        int caught_up;

        /* Generations agree: no batched flush is outstanding. */
        if (pending_gen == flushed_gen)
                return;

        arch_flush_tlb_batched_pending(mm);

        /*
         * Mark everything up to pending_gen as flushed, but only if nobody
         * queued a new flush meanwhile; otherwise keep the counter untouched
         * so that the newer flush is not lost.
         */
        caught_up = pending_gen | (pending_gen << TLB_FLUSH_BATCH_FLUSHED_SHIFT);
        atomic_cmpxchg(&mm->tlb_flush_batched, snapshot, caught_up);
}
#else
/*
 * No-op variant built when CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is not
 * set: there is no batch to record the pending flush in.
 */
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
                                      unsigned long uaddr)
{
}

/*
 * Variant built when CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is not set:
 * TLB flushes are never deferred.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
        return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

/*
 * Compute the user virtual address at which @page is expected to be mapped
 * in @vma.  Returns -EFAULT when the folio's mapping does not match @vma.
 * The caller should still check that the page is actually part of the vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
        struct folio *folio = page_folio(page);

        if (!folio_test_anon(folio)) {
                /* Not anon: the vma must be backed by the folio's mapping. */
                if (!vma->vm_file)
                        return -EFAULT;
                if (vma->vm_file->f_mapping != folio->mapping)
                        return -EFAULT;
        } else {
                struct anon_vma *folio_av = folio_anon_vma(folio);

                /*
                 * Note: swapoff's unuse_vma() is more efficient with this
                 * check, and needs it to match anon_vma when KSM is active.
                 * The NULL folio_av test is what handles KSM folios.
                 */
                if (!vma->anon_vma || !folio_av)
                        return -EFAULT;
                if (vma->anon_vma->root != folio_av->root)
                        return -EFAULT;
        }

        return vma_address(vma, folio->index + folio_page_idx(folio, page), 1);
}

/*
 * Walk @mm's page tables down to the pmd_t* from which 'address' would be
 * mapped.  Returns NULL when an upper level (pgd, p4d or pud) is not
 * present.  Nothing is checked or guaranteed about what the returned pmd_t*
 * represents.
 */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
        pgd_t *pgd = pgd_offset(mm, address);
        p4d_t *p4d;
        pud_t *pud;

        if (!pgd_present(*pgd))
                return NULL;

        p4d = p4d_offset(pgd, address);
        if (!p4d_present(*p4d))
                return NULL;

        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                return NULL;

        return pmd_offset(pud, address);
}

/*
 * State shared between folio_referenced() and its rmap walk callbacks
 * (folio_referenced_one() and invalid_folio_referenced_vma()).
 */
struct folio_referenced_arg {
        /* Seeded from folio_mapcount(); decremented per mapping visited. */
        int mapcount;
        /* Number of VMAs in which a reference to the folio was found. */
        int referenced;
        /* Accumulated vm_flags of referencing VMAs, plus VM_LOCKED on mlock. */
        unsigned long vm_flags;
        /* If non-NULL, VMAs whose mm does not match this cgroup are skipped. */
        struct mem_cgroup *memcg;
};

/*
 * rmap_one callback of folio_referenced(): test-and-clear the young bits of
 * every mapping of @folio inside @vma.
 *
 * @arg: pointer to the struct folio_referenced_arg being accumulated.
 *
 * Returns false to stop the rmap walk (the folio was found mlocked, or all
 * mappings counted in pra->mapcount have been visited), true to continue.
 */
static bool folio_referenced_one(struct folio *folio,
                struct vm_area_struct *vma, unsigned long address, void *arg)
{
        struct folio_referenced_arg *pra = arg;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        int referenced = 0;
        /* 'ptes' counts PTEs of a large folio seen inside a VM_LOCKED vma. */
        unsigned long start = address, ptes = 0;

        while (page_vma_mapped_walk(&pvmw)) {
                address = pvmw.address;

                if (vma->vm_flags & VM_LOCKED) {
                        if (!folio_test_large(folio) || !pvmw.pte) {
                                /* Restore the mlock which got missed */
                                mlock_vma_folio(folio, vma);
                                page_vma_mapped_walk_done(&pvmw);
                                pra->vm_flags |= VM_LOCKED;
                                return false; /* To break the loop */
                        }
                        /*
                         * For large folio fully mapped to VMA, will
                         * be handled after the pvmw loop.
                         *
                         * For large folio cross VMA boundaries, it's
                         * expected to be picked by page reclaim. But
                         * should skip reference of pages which are in
                         * the range of VM_LOCKED vma. As page reclaim
                         * should just count the reference of pages out
                         * the range of VM_LOCKED vma.
                         */
                        ptes++;
                        pra->mapcount--;
                        continue;
                }

                if (pvmw.pte) {
                        /* PTE mapping: with lru_gen, also scan nearby PTEs. */
                        if (lru_gen_enabled() &&
                            pte_young(ptep_get(pvmw.pte))) {
                                lru_gen_look_around(&pvmw);
                                referenced++;
                        }

                        if (ptep_clear_flush_young_notify(vma, address,
                                                pvmw.pte))
                                referenced++;
                } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                        /* No PTE: the folio is mapped by a PMD. */
                        if (pmdp_clear_flush_young_notify(vma, address,
                                                pvmw.pmd))
                                referenced++;
                } else {
                        /* unexpected pmd-mapped folio? */
                        WARN_ON_ONCE(1);
                }

                pra->mapcount--;
        }

        if ((vma->vm_flags & VM_LOCKED) &&
                        folio_test_large(folio) &&
                        folio_within_vma(folio, vma)) {
                unsigned long s_align, e_align;

                /* PMD-aligned addresses of the folio's first and last byte. */
                s_align = ALIGN_DOWN(start, PMD_SIZE);
                e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE);

                /* folio doesn't cross page table boundary and fully mapped */
                if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) {
                        /* Restore the mlock which got missed */
                        mlock_vma_folio(folio, vma);
                        pra->vm_flags |= VM_LOCKED;
                        return false; /* To break the loop */
                }
        }

        if (referenced)
                folio_clear_idle(folio);
        /* A set folio-level young flag also counts as a reference. */
        if (folio_test_clear_young(folio))
                referenced++;

        if (referenced) {
                /* Count this vma once, however many of its entries were young. */
                pra->referenced++;
                pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
        }

        if (!pra->mapcount)
                return false; /* To break the loop */

        return true;
}

/*
 * invalid_vma callback of folio_referenced(): returns true for VMAs whose
 * references must not be counted.
 */
static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
{
        struct folio_referenced_arg *pra = arg;

        /*
         * A mapping without recency contributes no references.  If the folio
         * was used through another mapping we will catch it there; if that
         * other mapping is already gone, the unmap path will have set the
         * referenced flag or activated the folio in zap_pte_range().
         */
        if (!vma_has_recency(vma))
                return true;

        /*
         * When reclaiming on behalf of a cgroup, references coming from a
         * different cgroup are not counted.
         */
        return pra->memcg && !mm_match_cgroup(vma->vm_mm, pra->memcg);
}

/**
 * folio_referenced() - Test if the folio was referenced.
 * @folio: The folio to test.
 * @is_locked: Caller holds lock on the folio.
 * @memcg: target memory cgroup
 * @vm_flags: Output: union of the vma->vm_flags of all VMAs which referenced
 *            the folio.
 *
 * Quick test-and-clear of the referenced state across all mappings of a
 * folio.
 *
 * Return: The number of mappings which referenced the folio. Return -1 if
 * the function bailed out due to rmap lock contention.
 */
int folio_referenced(struct folio *folio, int is_locked,
                     struct mem_cgroup *memcg, unsigned long *vm_flags)
{
        struct folio_referenced_arg pra = {
                .mapcount = folio_mapcount(folio),
                .memcg = memcg,
        };
        struct rmap_walk_control rwc = {
                .rmap_one = folio_referenced_one,
                .invalid_vma = invalid_folio_referenced_vma,
                .anon_lock = folio_lock_anon_vma_read,
                .try_lock = true,
                .arg = &pra,
        };
        bool locked_here = false;

        *vm_flags = 0;

        /* Nothing to report for an unmapped folio or one without a mapping. */
        if (!pra.mapcount || !folio_raw_mapping(folio))
                return 0;

        if (!is_locked) {
                bool needs_lock = !folio_test_anon(folio) ||
                                  folio_test_ksm(folio);

                if (needs_lock) {
                        /* Could not lock the folio: report it as referenced. */
                        if (!folio_trylock(folio))
                                return 1;
                        locked_here = true;
                }
        }

        rmap_walk(folio, &rwc);
        *vm_flags = pra.vm_flags;

        if (locked_here)
                folio_unlock(folio);

        if (rwc.contended)
                return -1;

        return pra.referenced;
}

/*
 * Write-protect and clean every PTE (or PMD) that the walk described by
 * @pvmw finds, skipping entries that are neither dirty nor writable.
 * The whole walk is bracketed by an mmu notifier invalidate start/end pair.
 *
 * Returns the number of entries that were changed.
 */
static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
{
        int cleaned = 0;
        struct vm_area_struct *vma = pvmw->vma;
        struct mmu_notifier_range range;
        unsigned long address = pvmw->address;

        /*
         * We have to assume the worst case ie pmd for invalidation. Note that
         * the folio can not be freed from this function.
         */
        mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
                                vma->vm_mm, address, vma_address_end(pvmw));
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(pvmw)) {
                /* Set to 1 when this iteration actually modified an entry. */
                int ret = 0;

                address = pvmw->address;
                if (pvmw->pte) {
                        pte_t *pte = pvmw->pte;
                        pte_t entry = ptep_get(pte);

                        /* Already clean and read-only: nothing to do. */
                        if (!pte_dirty(entry) && !pte_write(entry))
                                continue;

                        flush_cache_page(vma, address, pte_pfn(entry));
                        /* Clear and flush first, then install the cleaned PTE. */
                        entry = ptep_clear_flush(vma, address, pte);
                        entry = pte_wrprotect(entry);
                        entry = pte_mkclean(entry);
                        set_pte_at(vma->vm_mm, address, pte, entry);
                        ret = 1;
                } else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
                        pmd_t *pmd = pvmw->pmd;
                        pmd_t entry;

                        /* Already clean and read-only: nothing to do. */
                        if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
                                continue;

                        flush_cache_range(vma, address,
                                          address + HPAGE_PMD_SIZE);
                        entry = pmdp_invalidate(vma, address, pmd);
                        entry = pmd_wrprotect(entry);
                        entry = pmd_mkclean(entry);
                        set_pmd_at(vma->vm_mm, address, pmd, entry);
                        ret = 1;
#else
                        /* unexpected pmd-mapped folio? */
                        WARN_ON_ONCE(1);
#endif
                }

                if (ret)
                        cleaned++;
        }

        mmu_notifier_invalidate_range_end(&range);

        return cleaned;
}

static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
                             unsigned long address, void *arg)
{
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
        int *cleaned = arg;

        *cleaned += page_vma_mkclean_one(&pvmw);

        return true;
}

/* Only VM_SHARED mappings are cleaned; every other vma is invalid here. */
static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
        return !(vma->vm_flags & VM_SHARED);
}

/*
 * Clean and write-protect the mappings of @folio by running
 * page_mkclean_one() over its reverse mappings, skipping the VMAs rejected
 * by invalid_mkclean_vma().  The folio must be locked (enforced by BUG_ON).
 *
 * Returns the accumulated number of cleaned entries, or 0 when the folio is
 * not mapped or has no mapping.
 */
int folio_mkclean(struct folio *folio)
{
        int nr_cleaned = 0;
        struct rmap_walk_control rwc = {
                .rmap_one = page_mkclean_one,
                .invalid_vma = invalid_mkclean_vma,
                .arg = &nr_cleaned,
        };

        BUG_ON(!folio_test_locked(folio));

        if (!folio_mapped(folio) || !folio_mapping(folio))
                return 0;

        rmap_walk(folio, &rwc);

        return nr_cleaned;
}
EXPORT_SYMBOL_GPL(folio_mkclean);

/**
 * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
 *                     [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
 *                     within the @vma of shared mappings. And since clean PTEs
 *                     should also be readonly, write protects them too.
 * @pfn: start pfn.
 * @nr_pages: number of physically contiguous pages starting with @pfn.
 * @pgoff: page offset that the @pfn mapped with.
 * @vma: vma that @pfn mapped within.
 *
 * Returns the number of cleaned PTEs (including PMDs); 0 if @vma is not a
 * shared mapping.
 */
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
                      struct vm_area_struct *vma)
{
        struct page_vma_mapped_walk walk = {
                .vma = vma,
                .pfn = pfn,
                .nr_pages = nr_pages,
                .pgoff = pgoff,
                .flags = PVMW_SYNC,
        };

        if (invalid_mkclean_vma(vma, NULL))
                return 0;

        /* The range is expected to lie inside @vma. */
        walk.address = vma_address(vma, pgoff, nr_pages);
        VM_BUG_ON_VMA(walk.address == -EFAULT, vma);

        return page_vma_mkclean_one(&walk);
}

/*
 * Common mapcount bookkeeping for adding PTE or PMD mappings of @folio.
 *
 * @folio: folio the mappings are added to.
 * @page: first page of the range being mapped.
 * @nr_pages: number of pages in the range.
 * @level: RMAP_LEVEL_PTE or RMAP_LEVEL_PMD.
 * @nr_pmdmapped: set to folio_nr_pages() when this call adds the first
 *                entire (PMD) mapping; otherwise left untouched.
 *
 * Returns the count that the callers in this file feed into the
 * NR_ANON_MAPPED / NR_FILE_MAPPED statistics.
 */
static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
                struct page *page, int nr_pages, enum rmap_level level,
                int *nr_pmdmapped)
{
        atomic_t *mapped = &folio->_nr_pages_mapped;
        const int orig_nr_pages = nr_pages;
        int first, nr = 0;

        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        switch (level) {
        case RMAP_LEVEL_PTE:
                if (!folio_test_large(folio)) {
                        /* Small folio: 1 iff this increment brought _mapcount to 0. */
                        nr = atomic_inc_and_test(&page->_mapcount);
                        break;
                }

                do {
                        first = atomic_inc_and_test(&page->_mapcount);
                        if (first) {
                                /*
                                 * First mapping of this page: bump the
                                 * folio-wide counter and count the page only
                                 * while that counter is below ENTIRELY_MAPPED.
                                 */
                                first = atomic_inc_return_relaxed(mapped);
                                if (first < ENTIRELY_MAPPED)
                                        nr++;
                        }
                } while (page++, --nr_pages > 0);
                /* nr_pages was consumed by the loop; use the saved count. */
                atomic_add(orig_nr_pages, &folio->_large_mapcount);
                break;
        case RMAP_LEVEL_PMD:
                first = atomic_inc_and_test(&folio->_entire_mapcount);
                if (first) {
                        nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
                        if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
                                *nr_pmdmapped = folio_nr_pages(folio);
                                /* Subtract the pages already counted in 'mapped'. */
                                nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
                                /* Raced ahead of a remove and another add? */
                                if (unlikely(nr < 0))
                                        nr = 0;
                        } else {
                                /* Raced ahead of a remove of ENTIRELY_MAPPED */
                                nr = 0;
                        }
                }
                atomic_inc(&folio->_large_mapcount);
                break;
        }
        return nr;
}

/**
 * folio_move_anon_rmap - move a folio to our anon_vma
 * @folio:        The folio to move to our anon_vma
 * @vma:        The vma the folio belongs to
 *
 * Once a COW event has left a folio belonging exclusively to one process,
 * the folio can be re-pointed at the anon_vma owned by just that process,
 * so that the rmap code no longer searches the parent or sibling processes.
 * The folio must be locked.
 */
void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
{
        struct anon_vma *target = vma->anon_vma;
        unsigned long tagged;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_VMA(!target, vma);

        /*
         * Build the pointer and the PAGE_MAPPING_ANON bit as one value and
         * store it with a single WRITE_ONCE(), so that a concurrent reader
         * (eg folio_referenced()'s folio_test_anon()) never observes one
         * without the other.
         */
        tagged = (unsigned long)target + PAGE_MAPPING_ANON;
        WRITE_ONCE(folio->mapping, (struct address_space *)tagged);
}

/**
 * __folio_set_anon - set up a new anonymous rmap for a folio
 * @folio:        The folio to set up the new anonymous rmap for.
 * @vma:        VM area to add the folio to.
 * @address:        User virtual address of the mapping
 * @exclusive:        Whether the folio is exclusive to the process.
 *
 * Stores the tagged anon_vma pointer in folio->mapping and sets
 * folio->index from @vma and @address.  @vma must already have an anon_vma.
 */
static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
                             unsigned long address, bool exclusive)
{
        struct anon_vma *own = vma->anon_vma;
        struct anon_vma *target;
        unsigned long tagged;

        BUG_ON(!own);

        /*
         * A folio that is not exclusive to this vma must be attached to the
         * _oldest_ possible anon_vma, i.e. the root.
         */
        target = exclusive ? own : own->root;

        /*
         * page_idle does a lockless/optimistic rmap scan on folio->mapping.
         * The pointer and the PAGE_MAPPING_ANON type identifier therefore go
         * out in one WRITE_ONCE() store; were the compiler to split them, the
         * rmap code could mistake the mapping for a struct address_space and
         * crash.
         */
        tagged = (unsigned long)target + PAGE_MAPPING_ANON;
        WRITE_ONCE(folio->mapping, (struct address_space *)tagged);
        folio->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @folio:        The folio containing @page.
 * @page:        the page to check the mapping of
 * @vma:        the vm area in which the mapping is added
 * @address:        the user virtual address mapped
 *
 * Consists solely of VM_BUG_ON_*() assertions: the folio's anon_vma must
 * share its root with @vma's anon_vma, and @page's offset must match the
 * linear index of @address in @vma.
 */
static void __page_check_anon_rmap(struct folio *folio, struct page *page,
        struct vm_area_struct *vma, unsigned long address)
{
        /*
         * The page's anon-rmap details (mapping and index) are guaranteed to
         * be set up correctly at this point.
         *
         * We have exclusion against folio_add_anon_rmap_*() because the caller
         * always holds the page locked.
         *
         * We have exclusion against folio_add_new_anon_rmap because those pages
         * are initially only visible via the pagetables, and the pte is locked
         * over the call to folio_add_new_anon_rmap.
         */
        VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
                        folio);
        VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
                       page);
}

/*
 * Common implementation behind folio_add_anon_rmap_ptes() and
 * folio_add_anon_rmap_pmd(): update the mapcounts and statistics, set up or
 * sanity-check the anon mapping, mark pages exclusive when requested and
 * mlock small folios.
 *
 * @folio: folio the mappings are added to.
 * @page: first page of the range being mapped.
 * @nr_pages: number of pages in the range.
 * @vma: vm area in which the mappings are added.
 * @address: user virtual address of the first page.
 * @flags: rmap flags; RMAP_EXCLUSIVE is the only one examined here.
 * @level: RMAP_LEVEL_PTE or RMAP_LEVEL_PMD.
 */
static __always_inline void __folio_add_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *vma,
                unsigned long address, rmap_t flags, enum rmap_level level)
{
        int i, nr, nr_pmdmapped = 0;

        nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
        /* Account what __folio_add_rmap() reported as newly mapped. */
        if (nr_pmdmapped)
                __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped);
        if (nr)
                __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);

        if (unlikely(!folio_test_anon(folio))) {
                /* Not anon yet: this call has to set up the anon mapping. */
                VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
                /*
                 * For a PTE-mapped large folio, we only know that the single
                 * PTE is exclusive. Further, __folio_set_anon() might not get
                 * folio->index right when not given the address of the head
                 * page.
                 */
                VM_WARN_ON_FOLIO(folio_test_large(folio) &&
                                 level != RMAP_LEVEL_PMD, folio);
                __folio_set_anon(folio, vma, address,
                                 !!(flags & RMAP_EXCLUSIVE));
        } else if (likely(!folio_test_ksm(folio))) {
                /* Already anon and not KSM: verify mapping and index. */
                __page_check_anon_rmap(folio, page, vma, address);
        }

        if (flags & RMAP_EXCLUSIVE) {
                switch (level) {
                case RMAP_LEVEL_PTE:
                        /* PTE level: flag each page of the range. */
                        for (i = 0; i < nr_pages; i++)
                                SetPageAnonExclusive(page + i);
                        break;
                case RMAP_LEVEL_PMD:
                        /* PMD level: only @page itself is flagged. */
                        SetPageAnonExclusive(page);
                        break;
                }
        }
        for (i = 0; i < nr_pages; i++) {
                struct page *cur_page = page + i;

                /* While PTE-mapping a THP we have a PMD and a PTE mapping. */
                VM_WARN_ON_FOLIO((atomic_read(&cur_page->_mapcount) > 0 ||
                                  (folio_test_large(folio) &&
                                   folio_entire_mapcount(folio) > 1)) &&
                                 PageAnonExclusive(cur_page), folio);
        }

        /*
         * For large folio, only mlock it if it's fully mapped to VMA. It's
         * not easy to check whether the large folio is fully mapped to VMA
         * here. Only mlock normal 4K folio and leave page reclaim to handle
         * large folio.
         */
        if (!folio_test_large(folio))
                mlock_vma_folio(folio, vma);
}

/**
 * folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio
 * @folio:        The folio to add the mappings to
 * @page:        The first page to add
 * @nr_pages:        The number of pages which will be mapped
 * @vma:        The vm area in which the mappings are added
 * @address:        The user virtual address of the first page to map
 * @flags:        The rmap flags
 *
 * The page range of folio is defined by [@page, @page + @nr_pages)
 *
 * The caller needs to hold the page table lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting,
 * and to ensure that an anon folio is not being upgraded racily to a KSM folio
 * (but KSM folios are never downgraded).
 */
void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
                int nr_pages, struct vm_area_struct *vma, unsigned long address,
                rmap_t flags)
{
        /* Thin wrapper: the shared helper does the work at PTE level. */
        __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
                              RMAP_LEVEL_PTE);
}

/**
 * folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio
 * @folio:        The folio to add the mapping to
 * @page:        The first page to add
 * @vma:        The vm area in which the mapping is added
 * @address:        The user virtual address of the first page to map
 * @flags:        The rmap flags
 *
 * The page range of folio is defined by [@page, @page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock, and the page must be locked in
 * the anon_vma case: to serialize mapping,index checking after setting.
 *
 * Without CONFIG_TRANSPARENT_HUGEPAGE this only emits a one-time warning.
 */
void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
                struct vm_area_struct *vma, unsigned long address, rmap_t flags)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* Thin wrapper: the shared helper does the work at PMD level. */
        __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
                              RMAP_LEVEL_PMD);
#else
        WARN_ON_ONCE(true);
#endif
}

/**
 * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
 * @folio:        The folio to add the mapping to.
 * @vma:        the vm area in which the mapping is added
 * @address:        the user virtual address mapped
 *
 * Counterpart of folio_add_anon_rmap_*() that must only be used on *new*
 * folios.  Because nobody else can be mapping the folio yet, the mapcounts
 * are simply set rather than inc-and-tested, and the folio does not have to
 * be locked.
 *
 * A pmd-mappable folio is accounted as a THP.  As the folio is new, it is
 * assumed to be mapped exclusively by a single process.
 */
void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
                unsigned long address)
{
        int nr = folio_nr_pages(folio);
        int idx;

        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
        VM_BUG_ON_VMA(address < vma->vm_start ||
                        address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
        __folio_set_swapbacked(folio);
        __folio_set_anon(folio, vma, address, true);

        /* All mapcounts below start at -1, so storing 0 means "one mapping". */
        if (likely(!folio_test_large(folio))) {
                atomic_set(&folio->_mapcount, 0);
                SetPageAnonExclusive(&folio->page);
        } else if (folio_test_pmd_mappable(folio)) {
                /* One entire mapping; only the head page is marked exclusive. */
                atomic_set(&folio->_entire_mapcount, 0);
                atomic_set(&folio->_large_mapcount, 0);
                atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
                SetPageAnonExclusive(&folio->page);
                __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);
        } else {
                /* Large but not pmd-mappable: each page gets its own mapping. */
                for (idx = 0; idx < nr; idx++) {
                        struct page *cur = folio_page(folio, idx);

                        atomic_set(&cur->_mapcount, 0);
                        SetPageAnonExclusive(cur);
                }

                /* nr mappings in total, stored as nr - 1 */
                atomic_set(&folio->_large_mapcount, nr - 1);
                atomic_set(&folio->_nr_pages_mapped, nr);
        }

        __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
}

/*
 * Shared implementation of folio_add_file_rmap_ptes() and
 * folio_add_file_rmap_pmd(): adds the mappings through __folio_add_rmap()
 * and feeds the counts it reports into the file/shmem statistics.
 * Warns if it is handed an anon folio.
 */
static __always_inline void __folio_add_file_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *vma,
                enum rmap_level level)
{
        pg_data_t *pgdat = folio_pgdat(folio);
        int nr_pmdmapped = 0;
        int nr_mapped;

        VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

        nr_mapped = __folio_add_rmap(folio, page, nr_pages, level,
                                     &nr_pmdmapped);

        /* PMD-mapped pages are counted per node, shmem and file separately. */
        if (nr_pmdmapped) {
                if (folio_test_swapbacked(folio))
                        __mod_node_page_state(pgdat, NR_SHMEM_PMDMAPPED,
                                              nr_pmdmapped);
                else
                        __mod_node_page_state(pgdat, NR_FILE_PMDMAPPED,
                                              nr_pmdmapped);
        }

        if (nr_mapped)
                __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr_mapped);

        /* See comments in folio_add_anon_rmap_*() */
        if (folio_test_large(folio))
                return;

        mlock_vma_folio(folio, vma);
}

/**
 * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio
 * @folio:        The folio that receives the mappings
 * @page:        The first page of the range to add
 * @nr_pages:        How many pages are going to be mapped with PTEs
 * @vma:        The vm area the mappings are added to
 *
 * The range of the folio that is affected is [page, page + nr_pages).
 *
 * Context: The caller needs to hold the page table lock.
 */
void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
                int nr_pages, struct vm_area_struct *vma)
{
        /* PTE-level flavour of the common file rmap helper. */
        __folio_add_file_rmap(folio,
                              page,
                              nr_pages,
                              vma,
                              RMAP_LEVEL_PTE);
}

/**
 * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio
 * @folio:        The folio that receives the mapping
 * @page:        The first page of the range to add
 * @vma:        The vm area the mapping is added to
 *
 * The range of the folio that is affected is [page, page + HPAGE_PMD_NR).
 *
 * Context: The caller needs to hold the page table lock.
 */
void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
                struct vm_area_struct *vma)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
        /* Built without CONFIG_TRANSPARENT_HUGEPAGE: warn once, do nothing. */
        WARN_ON_ONCE(true);
#else
        __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
#endif
}

/*
 * Shared implementation of folio_remove_rmap_ptes() and
 * folio_remove_rmap_pmd().
 *
 * Drops @nr_pages mappings of @folio, starting at @page, at the given
 * @level, updates the mapped-page statistics to match, queues a partially
 * mapped anon folio for deferred split, and finally calls
 * munlock_vma_folio() for @vma.
 *
 * Two locals carry the accounting:
 *   nr            - value subtracted from NR_ANON_MAPPED / NR_FILE_MAPPED
 *   nr_pmdmapped  - value subtracted from the *_PMDMAPPED / NR_ANON_THPS
 *                   counters
 */
static __always_inline void __folio_remove_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *vma,
                enum rmap_level level)
{
        atomic_t *mapped = &folio->_nr_pages_mapped;
        pg_data_t *pgdat = folio_pgdat(folio);
        int last, nr = 0, nr_pmdmapped = 0;
        bool partially_mapped = false;
        enum node_stat_item idx;

        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        switch (level) {
        case RMAP_LEVEL_PTE:
                if (!folio_test_large(folio)) {
                        /*
                         * Small folio: nr becomes non-zero only when the
                         * decrement takes the mapcount negative, i.e. this
                         * was the last mapping (mapcounts start at -1).
                         */
                        nr = atomic_add_negative(-1, &page->_mapcount);
                        break;
                }

                /* Large folio: the folio-wide count drops by the whole batch. */
                atomic_sub(nr_pages, &folio->_large_mapcount);
                do {
                        /* Did this page just lose its last PTE mapping? */
                        last = atomic_add_negative(-1, &page->_mapcount);
                        if (last) {
                                /*
                                 * One page fewer is PTE-mapped.  Only count
                                 * it as unmapped if the new value is below
                                 * ENTIRELY_MAPPED.
                                 * NOTE(review): presumably a value at or
                                 * above ENTIRELY_MAPPED means an entire
                                 * mapping still covers the page - confirm.
                                 */
                                last = atomic_dec_return_relaxed(mapped);
                                if (last < ENTIRELY_MAPPED)
                                        nr++;
                        }
                } while (page++, --nr_pages > 0);

                /* Some pages went away, yet the counter is still non-zero. */
                partially_mapped = nr && atomic_read(mapped);
                break;
        case RMAP_LEVEL_PMD:
                atomic_dec(&folio->_large_mapcount);
                /* Non-zero only when the last entire mapping is removed. */
                last = atomic_add_negative(-1, &folio->_entire_mapcount);
                if (last) {
                        nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
                        if (likely(nr < ENTIRELY_MAPPED)) {
                                nr_pmdmapped = folio_nr_pages(folio);
                                /*
                                 * Pages still counted in the low bits
                                 * (FOLIO_PAGES_MAPPED) are excluded from
                                 * the number of pages that became unmapped.
                                 */
                                nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
                                /* Raced ahead of another remove and an add? */
                                if (unlikely(nr < 0))
                                        nr = 0;
                        } else {
                                /* An add of ENTIRELY_MAPPED raced ahead */
                                nr = 0;
                        }
                }

                /* Fewer pages became unmapped than the PMD mapping covered. */
                partially_mapped = nr < nr_pmdmapped;
                break;
        }

        if (nr_pmdmapped) {
                /* NR_{FILE/SHMEM}_PMDMAPPED are not maintained per-memcg */
                if (folio_test_anon(folio))
                        __lruvec_stat_mod_folio(folio, NR_ANON_THPS, -nr_pmdmapped);
                else
                        __mod_node_page_state(pgdat,
                                        folio_test_swapbacked(folio) ?
                                        NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED,
                                        -nr_pmdmapped);
        }
        if (nr) {
                idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
                __lruvec_stat_mod_folio(folio, idx, -nr);

                /*
                 * Queue anon large folio for deferred split if at least one
                 * page of the folio is unmapped and at least one page
                 * is still mapped.
                 *
                 * Check partially_mapped first to ensure it is a large folio.
                 */
                if (folio_test_anon(folio) && partially_mapped &&
                    list_empty(&folio->_deferred_list))
                        deferred_split_folio(folio);
        }

        /*
         * It would be tidy to reset folio_test_anon mapping when fully
         * unmapped, but that might overwrite a racing folio_add_anon_rmap_*()
         * which increments mapcount after us but sets mapping before us:
         * so leave the reset to free_pages_prepare, and remember that
         * it's only reliable while mapped.
         */

        munlock_vma_folio(folio, vma);
}

/**
 * folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio
 * @folio:        The folio that loses the mappings
 * @page:        The first page of the range to remove
 * @nr_pages:        How many pages are going to be removed from the mapping
 * @vma:        The vm area the mappings are removed from
 *
 * The range of the folio that is affected is [page, page + nr_pages).
 *
 * Context: The caller needs to hold the page table lock.
 */
void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
                int nr_pages, struct vm_area_struct *vma)
{
        /* PTE-level flavour of the common rmap removal helper. */
        __folio_remove_rmap(folio,
                            page,
                            nr_pages,
                            vma,
                            RMAP_LEVEL_PTE);
}

/**
 * folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio
 * @folio:        The folio that loses the mapping
 * @page:        The first page of the range to remove
 * @vma:        The vm area the mapping is removed from
 *
 * The range of the folio that is affected is [page, page + HPAGE_PMD_NR).
 *
 * Context: The caller needs to hold the page table lock.
 */
void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
                struct vm_area_struct *vma)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
        /* Built without CONFIG_TRANSPARENT_HUGEPAGE: warn once, do nothing. */
        WARN_ON_ONCE(true);
#else
        __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD);
#endif
}

/*
 * try_to_unmap_one - per-vma callback installed as .rmap_one by try_to_unmap()
 *
 * Walks the mappings of @folio inside @vma, starting at @address.  Each PTE
 * found is cleared and then, depending on the state of the folio, replaced
 * by a hwpoison entry, left empty, or replaced by a swap entry; afterwards
 * the rmap and the folio reference belonging to that mapping are dropped.
 *
 * @arg: enum ttu_flags will be passed to this argument
 *
 * Returns false when one of the failure paths below aborts the walk,
 * true otherwise.
 */
static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
{
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        pte_t pteval;
        struct page *subpage;
        bool anon_exclusive, ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;
        unsigned long pfn;
        unsigned long hsz = 0;

        /*
         * When racing against e.g. zap_pte_range() on another cpu,
         * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
         * try_to_unmap() may return before page_mapped() has become false,
         * if page table locking is skipped: use TTU_SYNC to wait for that.
         */
        if (flags & TTU_SYNC)
                pvmw.flags = PVMW_SYNC;

        if (flags & TTU_SPLIT_HUGE_PMD)
                split_huge_pmd_address(vma, address, false, folio);

        /*
         * For THP, we have to assume the worst case, i.e. pmd, for
         * invalidation.  For hugetlb, it could be much worse if we need to
         * do pud invalidation in the case of pmd sharing.
         *
         * Note that the folio can not be freed in this function as the caller
         * of try_to_unmap() must hold a reference on the folio.
         */
        range.end = vma_address_end(&pvmw);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, range.end);
        if (folio_test_hugetlb(folio)) {
                /*
                 * If sharing is possible, start and end will be adjusted
                 * accordingly.
                 */
                adjust_range_if_pmd_sharing_possible(vma, &range.start,
                                                     &range.end);

                /* We need the huge page size for set_huge_pte_at() */
                hsz = huge_page_size(hstate_vma(vma));
        }
        mmu_notifier_invalidate_range_start(&range);

        /* One iteration per mapping of the folio found in this vma. */
        while (page_vma_mapped_walk(&pvmw)) {
                /* Unexpected PMD-mapped THP? */
                VM_BUG_ON_FOLIO(!pvmw.pte, folio);

                /*
                 * If the folio is in an mlock()d vma, we must not swap it out.
                 */
                if (!(flags & TTU_IGNORE_MLOCK) &&
                    (vma->vm_flags & VM_LOCKED)) {
                        /* Restore the mlock which got missed */
                        if (!folio_test_large(folio))
                                mlock_vma_folio(folio, vma);
                        page_vma_mapped_walk_done(&pvmw);
                        ret = false;
                        break;
                }

                /* Work out which page of the folio this PTE maps. */
                pfn = pte_pfn(ptep_get(pvmw.pte));
                subpage = folio_page(folio, pfn - folio_pfn(folio));
                address = pvmw.address;
                /* Sampled here, before the PTE is cleared below. */
                anon_exclusive = folio_test_anon(folio) &&
                                 PageAnonExclusive(subpage);

                if (folio_test_hugetlb(folio)) {
                        bool anon = folio_test_anon(folio);

                        /*
                         * The try_to_unmap() is only passed a hugetlb page
                         * in the case where the hugetlb page is poisoned.
                         */
                        VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
                        /*
                         * huge_pmd_unshare may unmap an entire PMD page.
                         * There is no way of knowing exactly which PMDs may
                         * be cached for this mm, so we must flush them all.
                         * start/end were already adjusted above to cover this
                         * range.
                         */
                        flush_cache_range(vma, range.start, range.end);

                        /*
                         * To call huge_pmd_unshare, i_mmap_rwsem must be
                         * held in write mode.  Caller needs to explicitly
                         * do this outside rmap routines.
                         *
                         * We also must hold hugetlb vma_lock in write mode.
                         * Lock order dictates acquiring vma_lock BEFORE
                         * i_mmap_rwsem.  We can only try lock here and fail
                         * if unsuccessful.
                         */
                        if (!anon) {
                                VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
                                if (!hugetlb_vma_trylock_write(vma)) {
                                        page_vma_mapped_walk_done(&pvmw);
                                        ret = false;
                                        break;
                                }
                                if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
                                        hugetlb_vma_unlock_write(vma);
                                        flush_tlb_range(vma,
                                                range.start, range.end);
                                        /*
                                         * The ref count of the PMD page was
                                         * dropped which is part of the way map
                                         * counting is done for shared PMDs.
                                         * Return 'true' here.  When there is
                                         * no other sharing, huge_pmd_unshare
                                         * returns false and we will unmap the
                                         * actual page and drop map count
                                         * to zero.
                                         */
                                        page_vma_mapped_walk_done(&pvmw);
                                        break;
                                }
                                hugetlb_vma_unlock_write(vma);
                        }
                        pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
                } else {
                        flush_cache_page(vma, address, pfn);
                        /* Nuke the page table entry. */
                        if (should_defer_flush(mm, flags)) {
                                /*
                                 * We clear the PTE but do not flush so potentially
                                 * a remote CPU could still be writing to the folio.
                                 * If the entry was previously clean then the
                                 * architecture must guarantee that a clear->dirty
                                 * transition on a cached TLB entry is written through
                                 * and traps if the PTE is unmapped.
                                 */
                                pteval = ptep_get_and_clear(mm, address, pvmw.pte);

                                set_tlb_ubc_flush_pending(mm, pteval, address);
                        } else {
                                pteval = ptep_clear_flush(vma, address, pvmw.pte);
                        }
                }

                /*
                 * Now the pte is cleared. If this pte was uffd-wp armed,
                 * we may want to replace a none pte with a marker pte if
                 * it's file-backed, so we don't lose the tracking info.
                 */
                pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);

                /* Set the dirty flag on the folio now the pte is gone. */
                if (pte_dirty(pteval))
                        folio_mark_dirty(folio);

                /* Update high watermark before we lower rss */
                update_hiwater_rss(mm);

                /*
                 * Decide what, if anything, goes into the now-empty slot and
                 * fix up the mm counters accordingly.
                 */
                if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
                        /* Leave a hwpoison entry behind for this page. */
                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
                        if (folio_test_hugetlb(folio)) {
                                hugetlb_count_sub(folio_nr_pages(folio), mm);
                                set_huge_pte_at(mm, address, pvmw.pte, pteval,
                                                hsz);
                        } else {
                                dec_mm_counter(mm, mm_counter(folio));
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }

                } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
                        /*
                         * The guest indicated that the page content is of no
                         * interest anymore. Simply discard the pte, vmscan
                         * will take care of the rest.
                         * A future reference will then fault in a new zero
                         * page. When userfaultfd is active, we must not drop
                         * this page though, as its main user (postcopy
                         * migration) will not expect userfaults on already
                         * copied pages.
                         */
                        dec_mm_counter(mm, mm_counter(folio));
                } else if (folio_test_anon(folio)) {
                        swp_entry_t entry = page_swap_entry(subpage);
                        pte_t swp_pte;
                        /*
                         * Store the swap location in the pte.
                         * See handle_pte_fault() ...
                         */
                        if (unlikely(folio_test_swapbacked(folio) !=
                                        folio_test_swapcache(folio))) {
                                WARN_ON_ONCE(1);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /* MADV_FREE page check */
                        if (!folio_test_swapbacked(folio)) {
                                int ref_count, map_count;

                                /*
                                 * Synchronize with gup_pte_range():
                                 * - clear PTE; barrier; read refcount
                                 * - inc refcount; barrier; read PTE
                                 */
                                smp_mb();

                                ref_count = folio_ref_count(folio);
                                map_count = folio_mapcount(folio);

                                /*
                                 * Order reads for page refcount and dirty flag
                                 * (see comments in __remove_mapping()).
                                 */
                                smp_rmb();

                                /*
                                 * The only page refs must be one from isolation
                                 * plus the rmap(s) (dropped by discard:).
                                 */
                                if (ref_count == 1 + map_count &&
                                    !folio_test_dirty(folio)) {
                                        dec_mm_counter(mm, MM_ANONPAGES);
                                        goto discard;
                                }

                                /*
                                 * If the folio was redirtied, it cannot be
                                 * discarded. Remap the page to page table.
                                 */
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                folio_set_swapbacked(folio);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /*
                         * Each failure below puts the original PTE back,
                         * undoing swap_duplicate() with swap_free() if it
                         * had already succeeded, and aborts the walk.
                         */
                        if (swap_duplicate(entry) < 0) {
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
                                swap_free(entry);
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /* See folio_try_share_anon_rmap(): clear PTE first. */
                        if (anon_exclusive &&
                            folio_try_share_anon_rmap_pte(folio, subpage)) {
                                swap_free(entry);
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        /*
                         * Put the mm on the list headed by init_mm.mmlist if
                         * it is not on a list yet; the emptiness test is
                         * repeated under mmlist_lock.
                         */
                        if (list_empty(&mm->mmlist)) {
                                spin_lock(&mmlist_lock);
                                if (list_empty(&mm->mmlist))
                                        list_add(&mm->mmlist, &init_mm.mmlist);
                                spin_unlock(&mmlist_lock);
                        }
                        dec_mm_counter(mm, MM_ANONPAGES);
                        inc_mm_counter(mm, MM_SWAPENTS);
                        /* Carry the per-PTE state over into the swap entry. */
                        swp_pte = swp_entry_to_pte(entry);
                        if (anon_exclusive)
                                swp_pte = pte_swp_mkexclusive(swp_pte);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                } else {
                        /*
                         * This is a locked file-backed folio,
                         * so it cannot be removed from the page
                         * cache and replaced by a new folio before
                         * mmu_notifier_invalidate_range_end, so no
                         * concurrent thread might update its page table
                         * to point at a new folio while a device is
                         * still using this folio.
                         *
                         * See Documentation/mm/mmu_notifier.rst
                         */
                        dec_mm_counter(mm, mm_counter_file(folio));
                }
discard:
                /* Drop the rmap and the folio reference of this mapping. */
                if (unlikely(folio_test_hugetlb(folio)))
                        hugetlb_remove_rmap(folio);
                else
                        folio_remove_rmap_pte(folio, subpage, vma);
                if (vma->vm_flags & VM_LOCKED)
                        mlock_drain_local();
                folio_put(folio);
        }

        /* Pairs with mmu_notifier_invalidate_range_start() above. */
        mmu_notifier_invalidate_range_end(&range);

        return ret;
}

/*
 * Reports whether @vma is a temporary stack, as decided by
 * vma_is_temporary_stack().  @arg is not used.
 */
static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
        bool temporary_stack = vma_is_temporary_stack(vma);

        return temporary_stack;
}

/*
 * Returns 1 once @folio is no longer mapped, 0 while it still is.
 * Installed as the .done callback by try_to_unmap().
 */
static int folio_not_mapped(struct folio *folio)
{
        if (folio_mapped(folio))
                return 0;

        return 1;
}

/**
 * try_to_unmap - Try to remove all page table mappings to a folio.
 * @folio: The folio to unmap.
 * @flags: action and flags
 *
 * Attempts to remove every page table entry that maps this folio.
 * Nothing is returned: a caller that needs to know whether the folio is
 * still mapped has to check for itself (use TTU_SYNC to prevent
 * accounting races).
 *
 * Context: Caller must hold the folio lock.
 */
void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
        struct rmap_walk_control rwc = {
                .arg = (void *)flags,
                .rmap_one = try_to_unmap_one,
                .done = folio_not_mapped,
                .anon_lock = folio_lock_anon_vma_read,
        };

        /* Without TTU_RMAP_LOCKED, use the plain rmap_walk(). */
        if (!(flags & TTU_RMAP_LOCKED)) {
                rmap_walk(folio, &rwc);
                return;
        }

        rmap_walk_locked(folio, &rwc);
}

/*
 * @arg: enum ttu_flags will be passed to this argument.
 *
 * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
 * containing migration entries.
 */
static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
{
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        pte_t pteval;
        struct page *subpage;
        bool anon_exclusive, ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;
        unsigned long pfn;
        unsigned long hsz = 0;

        /*
         * When racing against e.g. zap_pte_range() on another cpu,
         * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
         * try_to_migrate() may return before page_mapped() has become false,
         * if page table locking is skipped: use TTU_SYNC to wait for that.
         */
        if (flags & TTU_SYNC)
                pvmw.flags = PVMW_SYNC;

        /*
         * unmap_page() in mm/huge_memory.c is the only user of migration with
         * TTU_SPLIT_HUGE_PMD and it wants to freeze.
         */
        if (flags & TTU_SPLIT_HUGE_PMD)
                split_huge_pmd_address(vma, address, true, folio);

        /*
         * For THP, we have to assume the worse case ie pmd for invalidation.
         * For hugetlb, it could be much worse if we need to do pud
         * invalidation in the case of pmd sharing.
         *
         * Note that the page can not be free in this function as call of
         * try_to_unmap() must hold a reference on the page.
         */
        range.end = vma_address_end(&pvmw);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, range.end);
        if (folio_test_hugetlb(folio)) {
                /*
                 * If sharing is possible, start and end will be adjusted
                 * accordingly.
                 */
                adjust_range_if_pmd_sharing_possible(vma, &range.start,
                                                     &range.end);

                /* We need the huge page size for set_huge_pte_at() */
                hsz = huge_page_size(hstate_vma(vma));
        }
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
                /* PMD-mapped THP migration entry */
                if (!pvmw.pte) {
                        subpage = folio_page(folio,
                                pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
                        VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
                                        !folio_test_pmd_mappable(folio), folio);

                        if (set_pmd_migration_entry(&pvmw, subpage)) {
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        continue;
                }
#endif

                /* Unexpected PMD-mapped THP? */
                VM_BUG_ON_FOLIO(!pvmw.pte, folio);

                pfn = pte_pfn(ptep_get(pvmw.pte));

                if (folio_is_zone_device(folio)) {
                        /*
                         * Our PTE is a non-present device exclusive entry and
                         * calculating the subpage as for the common case would
                         * result in an invalid pointer.
                         *
                         * Since only PAGE_SIZE pages can currently be
                         * migrated, just set it to page. This will need to be
                         * changed when hugepage migrations to device private
                         * memory are supported.
                         */
                        VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
                        subpage = &folio->page;
                } else {
                        subpage = folio_page(folio, pfn - folio_pfn(folio));
                }
                address = pvmw.address;
                anon_exclusive = folio_test_anon(folio) &&
                                 PageAnonExclusive(subpage);

                if (folio_test_hugetlb(folio)) {
                        bool anon = folio_test_anon(folio);

                        /*
                         * huge_pmd_unshare may unmap an entire PMD page.
                         * There is no way of knowing exactly which PMDs may
                         * be cached for this mm, so we must flush them all.
                         * start/end were already adjusted above to cover this
                         * range.
                         */
                        flush_cache_range(vma, range.start, range.end);

                        /*
                         * To call huge_pmd_unshare, i_mmap_rwsem must be
                         * held in write mode.  Caller needs to explicitly
                         * do this outside rmap routines.
                         *
                         * We also must hold hugetlb vma_lock in write mode.
                         * Lock order dictates acquiring vma_lock BEFORE
                         * i_mmap_rwsem.  We can only try lock here and
                         * fail if unsuccessful.
                         */
                        if (!anon) {
                                VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
                                if (!hugetlb_vma_trylock_write(vma)) {
                                        page_vma_mapped_walk_done(&pvmw);
                                        ret = false;
                                        break;
                                }
                                if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
                                        hugetlb_vma_unlock_write(vma);
                                        flush_tlb_range(vma,
                                                range.start, range.end);

                                        /*
                                         * The ref count of the PMD page was
                                         * dropped which is part of the way map
                                         * counting is done for shared PMDs.
                                         * Return 'true' here.  When there is
                                         * no other sharing, huge_pmd_unshare
                                         * returns false and we will unmap the
                                         * actual page and drop map count
                                         * to zero.
                                         */
                                        page_vma_mapped_walk_done(&pvmw);
                                        break;
                                }
                                hugetlb_vma_unlock_write(vma);
                        }
                        /* Nuke the hugetlb page table entry */
                        pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
                } else {
                        flush_cache_page(vma, address, pfn);
                        /* Nuke the page table entry. */
                        if (should_defer_flush(mm, flags)) {
                                /*
                                 * We clear the PTE but do not flush so potentially
                                 * a remote CPU could still be writing to the folio.
                                 * If the entry was previously clean then the
                                 * architecture must guarantee that a clear->dirty
                                 * transition on a cached TLB entry is written through
                                 * and traps if the PTE is unmapped.
                                 */
                                pteval = ptep_get_and_clear(mm, address, pvmw.pte);

                                set_tlb_ubc_flush_pending(mm, pteval, address);
                        } else {
                                pteval = ptep_clear_flush(vma, address, pvmw.pte);
                        }
                }

                /* Set the dirty flag on the folio now the pte is gone. */
                if (pte_dirty(pteval))
                        folio_mark_dirty(folio);

                /* Update high watermark before we lower rss */
                update_hiwater_rss(mm);

                if (folio_is_device_private(folio)) {
                        unsigned long pfn = folio_pfn(folio);
                        swp_entry_t entry;
                        pte_t swp_pte;

                        if (anon_exclusive)
                                WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio,
                                                                           subpage));

                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
                        entry = pte_to_swp_entry(pteval);
                        if (is_writable_device_private_entry(entry))
                                entry = make_writable_migration_entry(pfn);
                        else if (anon_exclusive)
                                entry = make_readable_exclusive_migration_entry(pfn);
                        else
                                entry = make_readable_migration_entry(pfn);
                        swp_pte = swp_entry_to_pte(entry);

                        /*
                         * pteval maps a zone device page and is therefore
                         * a swap pte.
                         */
                        if (pte_swp_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_swp_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
                        trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
                                                folio_order(folio));
                        /*
                         * No need to invalidate here it will synchronize on
                         * against the special swap migration pte.
                         */
                } else if (PageHWPoison(subpage)) {
                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
                        if (folio_test_hugetlb(folio)) {
                                hugetlb_count_sub(folio_nr_pages(folio), mm);
                                set_huge_pte_at(mm, address, pvmw.pte, pteval,
                                                hsz);
                        } else {
                                dec_mm_counter(mm, mm_counter(folio));
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }

                } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
                        /*
                         * The guest indicated that the page content is of no
                         * interest anymore. Simply discard the pte, vmscan
                         * will take care of the rest.
                         * A future reference will then fault in a new zero
                         * page. When userfaultfd is active, we must not drop
                         * this page though, as its main user (postcopy
                         * migration) will not expect userfaults on already
                         * copied pages.
                         */
                        dec_mm_counter(mm, mm_counter(folio));
                } else {
                        swp_entry_t entry;
                        pte_t swp_pte;

                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
                                if (folio_test_hugetlb(folio))
                                        set_huge_pte_at(mm, address, pvmw.pte,
                                                        pteval, hsz);
                                else
                                        set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
                                       !anon_exclusive, subpage);

                        /* See folio_try_share_anon_rmap_pte(): clear PTE first. */
                        if (folio_test_hugetlb(folio)) {
                                if (anon_exclusive &&
                                    hugetlb_try_share_anon_rmap(folio)) {
                                        set_huge_pte_at(mm, address, pvmw.pte,
                                                        pteval, hsz);
                                        ret = false;
                                        page_vma_mapped_walk_done(&pvmw);
                                        break;
                                }
                        } else if (anon_exclusive &&
                                   folio_try_share_anon_rmap_pte(folio, subpage)) {
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
                        if (pte_write(pteval))
                                entry = make_writable_migration_entry(
                                                        page_to_pfn(subpage));
                        else if (anon_exclusive)
                                entry = make_readable_exclusive_migration_entry(
                                                        page_to_pfn(subpage));
                        else
                                entry = make_readable_migration_entry(
                                                        page_to_pfn(subpage));
                        if (pte_young(pteval))
                                entry = make_migration_entry_young(entry);
                        if (pte_dirty(pteval))
                                entry = make_migration_entry_dirty(entry);
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_soft_dirty(pteval))
                                swp_pte = pte_swp_mksoft_dirty(swp_pte);
                        if (pte_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        if (folio_test_hugetlb(folio))
                                set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
                                                hsz);
                        else
                                set_pte_at(mm, address, pvmw.pte, swp_pte);
                        trace_set_migration_pte(address, pte_val(swp_pte),
                                                folio_order(folio));
                        /*
                         * No need to invalidate here it will synchronize on
                         * against the special swap migration pte.
                         */
                }

                if (unlikely(folio_test_hugetlb(folio)))
                        hugetlb_remove_rmap(folio);
                else
                        folio_remove_rmap_pte(folio, subpage, vma);
                if (vma->vm_flags & VM_LOCKED)
                        mlock_drain_local();
                folio_put(folio);
        }

        mmu_notifier_invalidate_range_end(&range);

        return ret;
}

/**
 * try_to_migrate - try to replace all page table mappings with swap entries
 * @folio: the folio to replace page table entries for
 * @flags: action and flags
 *
 * Tries to remove all the page table entries which are mapping this folio and
 * replace them with special swap entries. Caller must hold the folio lock.
 */
void try_to_migrate(struct folio *folio, enum ttu_flags flags)
{
        struct rmap_walk_control rwc = {
                .rmap_one = try_to_migrate_one,
                .arg = (void *)flags,
                .done = folio_not_mapped,
                .anon_lock = folio_lock_anon_vma_read,
        };

        /*
         * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and
         * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags.
         */
        if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
                                        TTU_SYNC | TTU_BATCH_FLUSH)))
                return;

        if (folio_is_zone_device(folio) &&
            (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
                return;

        /*
         * During exec, a temporary VMA is setup and later moved.
         * The VMA is moved under the anon_vma lock but not the
         * page tables leading to a race where migration cannot
         * find the migration ptes. Rather than increasing the
         * locking requirements of exec(), migration skips
         * temporary VMAs until after exec() completes.
         */
        if (!folio_test_ksm(folio) && folio_test_anon(folio))
                rwc.invalid_vma = invalid_migration_vma;

        if (flags & TTU_RMAP_LOCKED)
                rmap_walk_locked(folio, &rwc);
        else
                rmap_walk(folio, &rwc);
}

#ifdef CONFIG_DEVICE_PRIVATE
/*
 * Arguments handed through rmap_walk_control.arg from
 * folio_make_device_exclusive() to page_make_device_exclusive_one().
 */
struct make_exclusive_args {
        /* mm the folio is expected to be mapped in */
        struct mm_struct *mm;
        /* address the folio is expected to be mapped at */
        unsigned long address;
        /* owner cookie passed to mmu_notifier_range_init_owner() */
        void *owner;
        /* set once a writable PTE was replaced at exactly (mm, address) */
        bool valid;
};

/*
 * page_make_device_exclusive_one - rmap walk callback that replaces the
 * present PTEs mapping @folio in @vma with device exclusive swap entries.
 * @folio: folio whose mappings are being replaced
 * @vma: VMA currently visited by the rmap walk
 * @address: address handed to the page-vma walk within @vma
 * @priv: the struct make_exclusive_args supplied by the rmap walk initiator
 *
 * The whole operation is bracketed by an MMU_NOTIFY_EXCLUSIVE notifier range
 * tagged with args->owner. args->valid is set when a writable PTE was
 * replaced at exactly (args->mm, args->address).
 *
 * Return: false if a non-present PTE was encountered (the walk is aborted at
 * that point), true otherwise.
 */
static bool page_make_device_exclusive_one(struct folio *folio,
                struct vm_area_struct *vma, unsigned long address, void *priv)
{
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        struct make_exclusive_args *args = priv;
        pte_t pteval;
        struct page *subpage;
        bool ret = true;
        struct mmu_notifier_range range;
        swp_entry_t entry;
        pte_t swp_pte;
        pte_t ptent;

        /* The notified range is clamped so it never extends past the VMA. */
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
                                      vma->vm_mm, address, min(vma->vm_end,
                                      address + folio_size(folio)),
                                      args->owner);
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(&pvmw)) {
                /* Unexpected PMD-mapped THP? */
                VM_BUG_ON_FOLIO(!pvmw.pte, folio);

                /* Only present PTEs are converted; anything else fails the walk. */
                ptent = ptep_get(pvmw.pte);
                if (!pte_present(ptent)) {
                        ret = false;
                        page_vma_mapped_walk_done(&pvmw);
                        break;
                }

                subpage = folio_page(folio,
                                pte_pfn(ptent) - folio_pfn(folio));
                address = pvmw.address;

                /* Nuke the page table entry. */
                flush_cache_page(vma, address, pte_pfn(ptent));
                pteval = ptep_clear_flush(vma, address, pvmw.pte);

                /* Set the dirty flag on the folio now the pte is gone. */
                if (pte_dirty(pteval))
                        folio_mark_dirty(folio);

                /*
                 * Check that our target page is still mapped at the expected
                 * address, and that the mapping was writable.
                 */
                if (args->mm == mm && args->address == address &&
                    pte_write(pteval))
                        args->valid = true;

                /*
                 * Store the pfn of the page in a device exclusive swap
                 * entry; the entry is writable only if the PTE it replaces
                 * was writable.
                 */
                if (pte_write(pteval))
                        entry = make_writable_device_exclusive_entry(
                                                        page_to_pfn(subpage));
                else
                        entry = make_readable_device_exclusive_entry(
                                                        page_to_pfn(subpage));
                swp_pte = swp_entry_to_pte(entry);
                /* Carry the soft-dirty and uffd-wp bits over to the swap pte. */
                if (pte_soft_dirty(pteval))
                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
                if (pte_uffd_wp(pteval))
                        swp_pte = pte_swp_mkuffd_wp(swp_pte);

                set_pte_at(mm, address, pvmw.pte, swp_pte);

                /*
                 * There is a reference on the page for the swap entry which has
                 * been removed, so shouldn't take another.
                 */
                folio_remove_rmap_pte(folio, subpage, vma);
        }

        mmu_notifier_invalidate_range_end(&range);

        return ret;
}

/**
 * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
 * @folio: The folio to replace page table entries for.
 * @mm: The mm_struct where the folio is expected to be mapped.
 * @address: Address where the folio is expected to be mapped.
 * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
 *
 * Tries to remove all the page table entries which are mapping this
 * folio and replace them with special device exclusive swap entries to
 * grant a device exclusive access to the folio.
 *
 * Context: Caller must hold the folio lock.
 * Return: false if the page is still mapped, or if it could not be unmapped
 * from the expected address. Otherwise returns true (success).
 */
static bool folio_make_device_exclusive(struct folio *folio,
                struct mm_struct *mm, unsigned long address, void *owner)
{
        struct make_exclusive_args args = {
                .mm = mm,
                .address = address,
                .owner = owner,
                .valid = false,
        };
        struct rmap_walk_control rwc = {
                .rmap_one = page_make_device_exclusive_one,
                .done = folio_not_mapped,
                .anon_lock = folio_lock_anon_vma_read,
                .arg = &args,
        };

        /*
         * Restrict to anonymous folios for now to avoid potential writeback
         * issues.
         */
        if (!folio_test_anon(folio))
                return false;

        rmap_walk(folio, &rwc);

        return args.valid && !folio_mapcount(folio);
}

/**
 * make_device_exclusive_range() - Mark a range for exclusive use by a device
 * @mm: mm_struct of associated target process
 * @start: start of the region to mark for exclusive device access
 * @end: end address of region
 * @pages: returns the pages which were successfully marked for exclusive access
 * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
 *
 * Returns: number of pages found in the range by GUP. A page is marked for
 * exclusive access only if the page pointer is non-NULL.
 *
 * This function finds ptes mapping page(s) to the given address range, locks
 * them and replaces mappings with special swap entries preventing userspace CPU
 * access. On fault these entries are replaced with the original mapping after
 * calling MMU notifiers.
 *
 * A driver using this to program access from a device must use a mmu notifier
 * critical section to hold a device specific lock during programming. Once
 * programming is complete it should drop the page lock and reference after
 * which point CPU access to the page will revoke the exclusive access.
 */
int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
                                unsigned long end, struct page **pages,
                                void *owner)
{
        long npages = (end - start) >> PAGE_SHIFT;
        long i;

        npages = get_user_pages_remote(mm, start, npages,
                                       FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
                                       pages, NULL);
        if (npages < 0)
                return npages;

        for (i = 0; i < npages; i++, start += PAGE_SIZE) {
                struct folio *folio = page_folio(pages[i]);
                if (PageTail(pages[i]) || !folio_trylock(folio)) {
                        folio_put(folio);
                        pages[i] = NULL;
                        continue;
                }

                if (!folio_make_device_exclusive(folio, mm, start, owner)) {
                        folio_unlock(folio);
                        folio_put(folio);
                        pages[i] = NULL;
                }
        }

        return npages;
}
EXPORT_SYMBOL_GPL(make_device_exclusive_range);
#endif

void __put_anon_vma(struct anon_vma *anon_vma)
{
        struct anon_vma *root = anon_vma->root;

        anon_vma_free(anon_vma);
        if (root != anon_vma && atomic_dec_and_test(&root->refcount))
                anon_vma_free(root);
}

/*
 * Obtain the read-locked anon_vma of @folio for an rmap walk.
 *
 * A walk-specific rwc->anon_lock hook takes precedence when set. Otherwise
 * the anon_vma is looked up and read-locked here; with rwc->try_lock set a
 * contended lock is not waited for, rwc->contended is raised instead.
 *
 * Returns the anon_vma, or NULL if there is none or the lock was contended
 * in try_lock mode.
 */
static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
                                            struct rmap_walk_control *rwc)
{
        struct anon_vma *av;

        if (rwc->anon_lock)
                return rwc->anon_lock(folio, rwc);

        /*
         * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
         * because that depends on page_mapped(); but not all its usages
         * are holding mmap_lock. Users without mmap_lock are required to
         * take a reference count to prevent the anon_vma disappearing
         */
        av = folio_anon_vma(folio);
        if (!av)
                return NULL;

        /* Fast path: uncontended lock. */
        if (anon_vma_trylock_read(av))
                return av;

        if (rwc->try_lock) {
                rwc->contended = true;
                return NULL;
        }

        anon_vma_lock_read(av);
        return av;
}

/*
 * rmap_walk_anon - do something to anonymous page using the object-based
 * rmap method
 * @folio: the folio to be handled
 * @rwc: control variable according to each walk type
 * @locked: caller holds relevant rmap lock
 *
 * Every VMA chained to the folio's anon_vma whose range overlaps the folio is
 * visited: VMAs rejected by rwc->invalid_vma are skipped, rwc->rmap_one is
 * called on the others, and the walk ends as soon as rmap_one returns false
 * or rwc->done reports completion.
 */
static void rmap_walk_anon(struct folio *folio,
                struct rmap_walk_control *rwc, bool locked)
{
        struct anon_vma_chain *chain;
        struct anon_vma *av;
        pgoff_t pgoff_first, pgoff_last;

        if (!locked) {
                av = rmap_walk_anon_lock(folio, rwc);
        } else {
                av = folio_anon_vma(folio);
                /* anon_vma disappear under us? */
                VM_BUG_ON_FOLIO(!av, folio);
        }
        if (!av)
                return;

        pgoff_first = folio_pgoff(folio);
        pgoff_last = pgoff_first + folio_nr_pages(folio) - 1;
        anon_vma_interval_tree_foreach(chain, &av->rb_root,
                        pgoff_first, pgoff_last) {
                struct vm_area_struct *vma = chain->vma;
                unsigned long address;

                address = vma_address(vma, pgoff_first,
                                folio_nr_pages(folio));
                VM_BUG_ON_VMA(address == -EFAULT, vma);
                cond_resched();

                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;

                /* Stop on callback failure or once the walk is complete. */
                if (!rwc->rmap_one(folio, vma, address, rwc->arg) ||
                    (rwc->done && rwc->done(folio)))
                        break;
        }

        /* Only drop the lock if it was taken here. */
        if (!locked)
                anon_vma_unlock_read(av);
}

/*
 * rmap_walk_file - do something to file page using the object-based rmap method
 * @folio: the folio to be handled
 * @rwc: control variable according to each walk type
 * @locked: caller holds relevant rmap lock
 *
 * Every VMA in the address_space's i_mmap tree whose range overlaps the folio
 * is visited: VMAs rejected by rwc->invalid_vma are skipped, rwc->rmap_one is
 * called on the others, and the walk ends as soon as rmap_one returns false
 * or rwc->done reports completion. With rwc->try_lock set, a contended
 * i_mmap lock makes the function return with rwc->contended raised.
 */
static void rmap_walk_file(struct folio *folio,
                struct rmap_walk_control *rwc, bool locked)
{
        struct address_space *mapping = folio_mapping(folio);
        struct vm_area_struct *vma;
        pgoff_t pgoff_first, pgoff_last;

        /*
         * The page lock not only makes sure that page->mapping cannot
         * suddenly be NULLified by truncation, it makes sure that the
         * structure at mapping cannot be freed and reused yet,
         * so we can safely take mapping->i_mmap_rwsem.
         */
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        if (!mapping)
                return;

        pgoff_first = folio_pgoff(folio);
        pgoff_last = pgoff_first + folio_nr_pages(folio) - 1;

        /* Take the lock ourselves unless the caller already holds it. */
        if (!locked && !i_mmap_trylock_read(mapping)) {
                if (rwc->try_lock) {
                        rwc->contended = true;
                        return;
                }

                i_mmap_lock_read(mapping);
        }

        vma_interval_tree_foreach(vma, &mapping->i_mmap,
                        pgoff_first, pgoff_last) {
                unsigned long address;

                address = vma_address(vma, pgoff_first,
                               folio_nr_pages(folio));
                VM_BUG_ON_VMA(address == -EFAULT, vma);
                cond_resched();

                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;

                /* Stop on callback failure or once the walk is complete. */
                if (!rwc->rmap_one(folio, vma, address, rwc->arg) ||
                    (rwc->done && rwc->done(folio)))
                        break;
        }

        if (!locked)
                i_mmap_unlock_read(mapping);
}

/*
 * Walk the reverse mappings of @folio, dispatching on the folio type: KSM,
 * anonymous, or (everything else) file backed. The rmap lock is taken by the
 * anon and file walkers themselves.
 */
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
{
        if (unlikely(folio_test_ksm(folio))) {
                rmap_walk_ksm(folio, rwc);
                return;
        }

        if (!folio_test_anon(folio)) {
                rmap_walk_file(folio, rwc, false);
                return;
        }

        rmap_walk_anon(folio, rwc, false);
}

/*
 * Same dispatch as rmap_walk(), for callers that already hold the relevant
 * rmap lock. KSM folios are not supported on this path.
 */
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
{
        /* no ksm support for now */
        VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);

        if (!folio_test_anon(folio)) {
                rmap_walk_file(folio, rwc, true);
                return;
        }

        rmap_walk_anon(folio, rwc, true);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following two functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
/*
 * Account one more mapping of an anonymous hugetlb @folio: both the entire
 * and the large mapcount are incremented, and the head page is marked
 * anon-exclusive when RMAP_EXCLUSIVE is set in @flags. @vma and @address are
 * not used by this function.
 */
void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
                unsigned long address, rmap_t flags)
{
        struct page *head = &folio->page;

        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);

        if (flags & RMAP_EXCLUSIVE)
                SetPageAnonExclusive(head);

        /* An exclusive folio is not expected to be mapped more than once. */
        VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
                         PageAnonExclusive(head), folio);
}

/*
 * Set up the first anonymous mapping of a hugetlb @folio at @address in
 * @vma: both mapcounts are initialised, the restore-reserve flag is cleared,
 * the folio is made anonymous and its head page is marked anon-exclusive.
 * @address must lie inside @vma.
 */
void hugetlb_add_new_anon_rmap(struct folio *folio,
                struct vm_area_struct *vma, unsigned long address)
{
        struct page *head = &folio->page;

        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

        BUG_ON(!(address >= vma->vm_start && address < vma->vm_end));

        /* first mapping: both counts go to 0 (they start at -1) */
        atomic_set(&folio->_entire_mapcount, 0);
        atomic_set(&folio->_large_mapcount, 0);

        folio_clear_hugetlb_restore_reserve(folio);
        __folio_set_anon(folio, vma, address, true);
        SetPageAnonExclusive(head);
}
#endif /* CONFIG_HUGETLB_PAGE */

















































































































   53 






























































































   53 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Latched RB-trees
 *
 * Copyright (C) 2015 Intel Corp., Peter Zijlstra <peterz@infradead.org>
 *
 * Since RB-trees have non-atomic modifications they're not immediately suited
 * for RCU/lockless queries. Even though we made RB-tree lookups non-fatal for
 * lockless lookups; we cannot guarantee they return a correct result.
 *
 * The simplest solution is a seqlock + RB-tree, this will allow lockless
 * lookups; but has the constraint (inherent to the seqlock) that read sides
 * cannot nest in write sides.
 *
 * If we need to allow unconditional lookups (say as required for NMI context
 * usage) we need a more complex setup; this data structure provides this by
 * employing the latch technique -- see @raw_write_seqcount_latch -- to
 * implement a latched RB-tree which does allow for unconditional lookups by
 * virtue of always having (at least) one stable copy of the tree.
 *
 * However, while we have the guarantee that there is at all times one stable
 * copy, this does not guarantee an iteration will not observe modifications.
 * What might have been a stable copy at the start of the iteration, need not
 * remain so for the duration of the iteration.
 *
 * Therefore, this does require a lockless RB-tree iteration to be non-fatal;
 * see the comment in lib/rbtree.c. Note however that we only require the first
 * condition -- not seeing partial stores -- because the latch thing isolates
 * us from loops. If we were to interrupt a modification the lookup would be
 * pointed at the stable tree and complete while the modification was halted.
 */

#ifndef RB_TREE_LATCH_H
#define RB_TREE_LATCH_H

#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/rcupdate.h>

/*
 * Per-element linkage for a latched RB-tree. node[i] is the rb_node that is
 * linked into tree[i] of the latch_tree_root (see __lt_insert()/__lt_erase()).
 */
struct latch_tree_node {
        struct rb_node node[2];
};

/*
 * Root of a latched RB-tree: a latch sequence counter plus two tree copies.
 * Lookups pick the copy tree[seq & 1] (see latch_tree_find()).
 */
struct latch_tree_root {
        seqcount_latch_t        seq;
        struct rb_root                tree[2];
};

/**
 * struct latch_tree_ops - operators to define the tree order
 * @less: used for insertion; provides the (partial) order between two
 *        elements. A true result makes the insertion descend to the left of
 *        @b, a false result to the right (see __lt_insert()).
 * @comp: used for lookups; provides the order between the search key and an
 *        element. A negative result descends left, a positive result descends
 *        right and zero reports a match (see __lt_find()).
 *
 * The operators are related like:
 *
 *        comp(a->key,b) < 0  := less(a,b)
 *        comp(a->key,b) > 0  := less(b,a)
 *        comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
 *
 * If these operators define a partial order on the elements we make no
 * guarantee on which of the elements matching the key is found. See
 * latch_tree_find().
 */
struct latch_tree_ops {
        bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b);
        int  (*comp)(void *key,                 struct latch_tree_node *b);
};

/*
 * Map an rb_node that is slot @idx of a latch_tree_node back to the
 * latch_tree_node that contains it.
 */
static __always_inline struct latch_tree_node *
__lt_from_rb(struct rb_node *node, int idx)
{
        struct latch_tree_node *owner;

        owner = container_of(node, struct latch_tree_node, node[idx]);

        return owner;
}

/*
 * Link @ltn into copy @idx of @ltr.
 *
 * Walks down from the root of tree[@idx]: go left when @less says @ltn
 * orders before the visited element, right otherwise, until an empty child
 * slot is found. The new rb_node is then linked into that slot with
 * rb_link_node_rcu() and handed to rb_insert_color().
 */
static __always_inline void
__lt_insert(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx,
            bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b))
{
        struct rb_root *tree = &ltr->tree[idx];
        struct rb_node *fresh = &ltn->node[idx];
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *above = NULL;

        for (; *slot; ) {
                above = *slot;

                if (less(ltn, __lt_from_rb(above, idx)))
                        slot = &above->rb_left;
                else
                        slot = &above->rb_right;
        }

        rb_link_node_rcu(fresh, above, slot);
        rb_insert_color(fresh, tree);
}

/* Unlink slot @idx of @ltn from copy @idx of @ltr via rb_erase(). */
static __always_inline void
__lt_erase(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx)
{
        struct rb_node *victim = &ltn->node[idx];
        struct rb_root *tree = &ltr->tree[idx];

        rb_erase(victim, tree);
}

/*
 * Search copy @idx of @ltr for an element matching @key.
 *
 * Every pointer followed during the descent is loaded with
 * rcu_dereference_raw(). @comp decides the direction: negative goes left,
 * positive goes right, zero is a match.
 *
 * Returns the matching latch_tree_node, or NULL when the descent runs off
 * the tree.
 */
static __always_inline struct latch_tree_node *
__lt_find(void *key, struct latch_tree_root *ltr, int idx,
          int (*comp)(void *key, struct latch_tree_node *node))
{
        struct rb_node *cur;

        for (cur = rcu_dereference_raw(ltr->tree[idx].rb_node); cur; ) {
                struct latch_tree_node *candidate = __lt_from_rb(cur, idx);
                int order = comp(key, candidate);

                if (order == 0)
                        return candidate;

                if (order < 0)
                        cur = rcu_dereference_raw(cur->rb_left);
                else
                        cur = rcu_dereference_raw(cur->rb_right);
        }

        return NULL;
}

/**
 * latch_tree_insert() - insert @node into the trees @root
 * @node: node to insert
 * @root: trees to insert @node into
 * @ops: operators defining the node order; only @ops->less is used here
 *
 * It inserts @node into @root in an ordered fashion such that we can always
 * observe one complete tree. See the comment for raw_write_seqcount_latch().
 *
 * The inserts use rcu_assign_pointer() (through rb_link_node_rcu(), see
 * __lt_insert()) to publish the element such that the tree structure is
 * stored before we can observe the new @node.
 *
 * All modifications (latch_tree_insert, latch_tree_erase) are assumed to be
 * serialized.
 */
static __always_inline void
latch_tree_insert(struct latch_tree_node *node,
                  struct latch_tree_root *root,
                  const struct latch_tree_ops *ops)
{
        /* Bump the latch sequence before each copy is modified. */
        raw_write_seqcount_latch(&root->seq);
        __lt_insert(node, root, 0, ops->less);        /* update copy 0 */
        raw_write_seqcount_latch(&root->seq);
        __lt_insert(node, root, 1, ops->less);        /* update copy 1 */
}

/**
 * latch_tree_erase() - removes @node from the trees @root
 * @node: node to remove
 * @root: trees to remove @node from
 * @ops: operators defining the node order (not used by the body below; the
 *       parameter mirrors latch_tree_insert())
 *
 * Removes @node from the trees @root in an ordered fashion such that we can
 * always observe one complete tree. See the comment for
 * raw_write_seqcount_latch().
 *
 * It is assumed that @node will observe one RCU quiescent state before being
 * reused or freed.
 *
 * All modifications (latch_tree_insert, latch_tree_erase) are assumed to be
 * serialized.
 */
static __always_inline void
latch_tree_erase(struct latch_tree_node *node,
                 struct latch_tree_root *root,
                 const struct latch_tree_ops *ops)
{
        /* Bump the latch sequence before each copy is modified. */
        raw_write_seqcount_latch(&root->seq);
        __lt_erase(node, root, 0);        /* update copy 0 */
        raw_write_seqcount_latch(&root->seq);
        __lt_erase(node, root, 1);        /* update copy 1 */
}

/**
 * latch_tree_find() - find the node matching @key in the trees @root
 * @key: search key
 * @root: trees to search for @key
 * @ops: operators defining the node order; only @ops->comp is used here
 *
 * Does a lockless lookup in the trees @root for the node matching @key.
 *
 * It is assumed that this is called while holding the appropriate RCU read
 * side lock.
 *
 * If the operators define a partial order on the elements (there are multiple
 * elements which have the same key value) it is undefined which of these
 * elements will be found. Nor is it possible to iterate the tree to find
 * further elements with the same key value.
 *
 * Returns: a pointer to the node matching @key or NULL.
 */
static __always_inline struct latch_tree_node *
latch_tree_find(void *key, struct latch_tree_root *root,
                const struct latch_tree_ops *ops)
{
        struct latch_tree_node *node;
        unsigned int seq;

        /*
         * Sample the latch sequence, search the copy selected by its low
         * bit, and repeat for as long as raw_read_seqcount_latch_retry()
         * asks for a retry.
         */
        do {
                seq = raw_read_seqcount_latch(&root->seq);
                node = __lt_find(key, root, seq & 1, ops->comp);
        } while (raw_read_seqcount_latch_retry(&root->seq, seq));

        return node;
}

#endif /* RB_TREE_LATCH_H */


































































































































































































































































































































    2 


























    4 

    8 

    3 










































































































































































































































































   18 


























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Filesystem access notification for Linux
 *
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#ifndef __LINUX_FSNOTIFY_BACKEND_H
#define __LINUX_FSNOTIFY_BACKEND_H

#ifdef __KERNEL__

#include <linux/idr.h> /* inotify uses this */
#include <linux/fs.h> /* struct inode */
#include <linux/list.h>
#include <linux/path.h> /* struct path */
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
#include <linux/sched/mm.h>

/*
 * IN_* from inotify.h lines up EXACTLY with FS_*, this is so we can easily
 * convert between them.  dnotify only needs conversion at watch creation
 * so no perf loss there.  fanotify isn't defined yet, so it can use the
 * holes if it needs more events.
 */
#define FS_ACCESS                0x00000001        /* File was accessed */
#define FS_MODIFY                0x00000002        /* File was modified */
#define FS_ATTRIB                0x00000004        /* Metadata changed */
#define FS_CLOSE_WRITE                0x00000008        /* Writable file was closed */
#define FS_CLOSE_NOWRITE        0x00000010        /* Unwritable file closed */
#define FS_OPEN                        0x00000020        /* File was opened */
#define FS_MOVED_FROM                0x00000040        /* File was moved from X */
#define FS_MOVED_TO                0x00000080        /* File was moved to Y */
#define FS_CREATE                0x00000100        /* Subfile was created */
#define FS_DELETE                0x00000200        /* Subfile was deleted */
#define FS_DELETE_SELF                0x00000400        /* Self was deleted */
#define FS_MOVE_SELF                0x00000800        /* Self was moved */
#define FS_OPEN_EXEC                0x00001000        /* File was opened for exec */

#define FS_UNMOUNT                0x00002000        /* inode on umount fs */
#define FS_Q_OVERFLOW                0x00004000        /* Event queue overflowed */
#define FS_ERROR                0x00008000        /* Filesystem Error (fanotify) */

/*
 * FS_IN_IGNORED overloads FS_ERROR (both are bit 0x00008000).  It is only
 * used internally by inotify which does not support FS_ERROR.
 */
#define FS_IN_IGNORED                0x00008000        /* last inotify event here */

/* Permission events */
#define FS_OPEN_PERM                0x00010000        /* open event in a permission hook */
#define FS_ACCESS_PERM                0x00020000        /* access event in a permission hook */
#define FS_OPEN_EXEC_PERM        0x00040000        /* open/exec event in a permission hook */

/*
 * Set on inode mark that cares about things that happen to its children.
 * Always set for dnotify and inotify.
 * Set on inode/sb/mount marks that care about parent/name info.
 */
#define FS_EVENT_ON_CHILD        0x08000000

#define FS_RENAME                0x10000000        /* File was renamed */
#define FS_DN_MULTISHOT                0x20000000        /* dnotify multishot */
#define FS_ISDIR                0x40000000        /* event occurred against dir */

/* Either direction of a move: FS_MOVED_FROM or FS_MOVED_TO */
#define FS_MOVE                        (FS_MOVED_FROM | FS_MOVED_TO)

/*
 * Directory entry modification events - reported only to directory
 * where entry is modified and not to a watching parent.
 * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event
 * when a directory entry inside a child subdir changes.
 */
#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME)

/* All permission-hook events */
#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
                                  FS_OPEN_EXEC_PERM)

/*
 * This is a list of all events that may get sent to a parent that is watching
 * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory.
 */
#define FS_EVENTS_POSS_ON_CHILD   (ALL_FSNOTIFY_PERM_EVENTS | \
                                   FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
                                   FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \
                                   FS_OPEN | FS_OPEN_EXEC)

/*
 * This is a list of all events that may get sent with the parent inode as the
 * @to_tell argument of fsnotify().
 * It may include events that can be sent to an inode/sb/mount mark, but cannot
 * be sent to a parent watching children.
 * As defined here it is exactly FS_EVENTS_POSS_ON_CHILD.
 *
 * NOTE(review): the fsnotify() prototype declared in this header has no
 * parameter named to_tell; the wording above looks stale - confirm.
 */
#define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD)

/*
 * Events that can be reported to backends.
 * FS_IN_IGNORED and FS_ERROR are the same bit, so listing both is redundant
 * but harmless.
 */
#define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
                             FS_EVENTS_POSS_ON_CHILD | \
                             FS_DELETE_SELF | FS_MOVE_SELF | \
                             FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
                             FS_ERROR)

/* Extra flags that may be reported with event or control handling of events */
#define ALL_FSNOTIFY_FLAGS  (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT)

/* Every event bit and every flag bit defined above */
#define ALL_FSNOTIFY_BITS   (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS)

struct fsnotify_group;
struct fsnotify_event;
struct fsnotify_mark;
struct fsnotify_event_private_data;
struct fsnotify_fname;
struct fsnotify_iter_info;

struct mem_cgroup;

/*
 * Each group must define these ops.  The fsnotify infrastructure will call
 * these operations for each relevant group.
 *
 * handle_event - main call for a group to handle an fs event
 * @group:        group to notify
 * @mask:        event type and flags
 * @data:        object that event happened on
 * @data_type:        type of object for fsnotify_data_XXX() accessors
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to
 * @file_name:        optional file name associated with event
 * @cookie:        inotify rename cookie
 * @iter_info:        array of marks from this group that are interested in the event
 *
 * handle_inode_event - simple variant of handle_event() for groups that only
 *                have inode marks and don't have ignore mask
 * @mark:        mark to notify
 * @mask:        event type and flags
 * @inode:        inode that event happened on
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to.
 *                Either @inode or @dir must be non-NULL.
 * @file_name:        optional file name associated with event
 * @cookie:        inotify rename cookie
 *
 * free_group_priv - called when a group refcnt hits 0 to clean up the private union
 * freeing_mark - called when a mark is being destroyed for some reason.  The group
 *                MUST be holding a reference on each mark and that reference must be
 *                dropped in this function.  inotify uses this function to send
 *                userspace messages that marks have been removed.
 * free_event - presumably releases an event belonging to @group; the contract
 *                is not visible in this header - confirm against the callers.
 * free_mark - called on final put+free to free the mark's memory
 */
struct fsnotify_ops {
        int (*handle_event)(struct fsnotify_group *group, u32 mask,
                            const void *data, int data_type, struct inode *dir,
                            const struct qstr *file_name, u32 cookie,
                            struct fsnotify_iter_info *iter_info);
        int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask,
                            struct inode *inode, struct inode *dir,
                            const struct qstr *file_name, u32 cookie);
        void (*free_group_priv)(struct fsnotify_group *group);
        void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
        void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event);
        /* called on final put+free to free memory */
        void (*free_mark)(struct fsnotify_mark *mark);
};

/*
 * all of the information about the original object we want to now send to
 * a group.  If you want to carry more info from the accessing task to the
 * listener this structure is where you need to be adding fields.
 *
 * The only generic member is the list linkage.
 */
struct fsnotify_event {
        /* presumably links the event into group->notification_list - confirm */
        struct list_head list;
};

/*
 * fsnotify group priorities.
 * Events are sent in order from highest priority to lowest priority.
 */
enum fsnotify_group_prio {
        FSNOTIFY_PRIO_NORMAL = 0,        /* normal notifiers, no permissions */
        FSNOTIFY_PRIO_CONTENT,                /* fanotify permission events */
        FSNOTIFY_PRIO_PRE_CONTENT,        /* fanotify pre-content events */
        /* number of priorities; sizes fsnotify_sb_info::watched_objects[] */
        __FSNOTIFY_PRIO_NUM
};

/*
 * A group is a "thing" that wants to receive notification about filesystem
 * events.  The mask holds the subset of event types this group cares about.
 * refcnt on a group is up to the implementor and at any moment if it goes 0
 * everything will be cleaned up.
 *
 * NOTE(review): no "mask" member is declared in this struct; the sentence
 * about the mask above may be stale - confirm.
 */
struct fsnotify_group {
        const struct fsnotify_ops *ops;        /* how this group handles things */

        /*
         * How the refcnt is used is up to each group.  When the refcnt hits 0
         * fsnotify will clean up all of the resources associated with this group.
         * As an example, the dnotify group will always have a refcnt=1 and that
         * will never change.  Inotify, on the other hand, has a group per
         * inotify_init() and the refcnt will hit 0 only when that fd has been
         * closed.
         */
        refcount_t refcnt;                /* things with interest in this group */

        /* needed to send notification to userspace */
        spinlock_t notification_lock;                /* protect the notification_list */
        struct list_head notification_list;        /* list of event_holder this group needs to send to userspace */
        wait_queue_head_t notification_waitq;        /* read() on the notification file blocks on this waitq */
        unsigned int q_len;                        /* events on the queue */
        unsigned int max_events;                /* maximum events allowed on the list */
        enum fsnotify_group_prio priority;        /* priority for sending events */
        bool shutdown;                /* group is being shut down, don't queue more events */

        /* Bits stored in ->flags below */
#define FSNOTIFY_GROUP_USER        0x01 /* user allocated group */
#define FSNOTIFY_GROUP_DUPS        0x02 /* allow multiple marks per object */
#define FSNOTIFY_GROUP_NOFS        0x04 /* group lock is not direct reclaim safe */
        int flags;
        /*
         * Value returned by memalloc_nofs_save() in fsnotify_group_lock(),
         * handed back to memalloc_nofs_restore() in fsnotify_group_unlock().
         */
        unsigned int owner_flags;        /* stored flags of mark_mutex owner */

        /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
        struct mutex mark_mutex;        /* protect marks_list */
        atomic_t user_waits;                /* Number of tasks waiting for user
                                         * response */
        struct list_head marks_list;        /* all inode marks for this group */

        struct fasync_struct *fsn_fa;    /* async notification */

        struct fsnotify_event *overflow_event;        /* Event we queue when the
                                                 * notification list is too
                                                 * full */

        struct mem_cgroup *memcg;        /* memcg to charge allocations */

        /* groups can define private fields here or use the void *private */
        union {
                void *private;
#ifdef CONFIG_INOTIFY_USER
                struct inotify_group_private_data {
                        spinlock_t        idr_lock;
                        struct idr      idr;
                        struct ucounts *ucounts;
                } inotify_data;
#endif
#ifdef CONFIG_FANOTIFY
                struct fanotify_group_private_data {
                        /* Hash table of events for merge */
                        struct hlist_head *merge_hash;
                        /* allows a group to block waiting for a userspace response */
                        struct list_head access_list;
                        wait_queue_head_t access_waitq;
                        int flags;           /* flags from fanotify_init() */
                        int f_flags; /* event_f_flags from fanotify_init() */
                        struct ucounts *ucounts;
                        mempool_t error_events_pool;
                } fanotify_data;
#endif /* CONFIG_FANOTIFY */
        };
};

/*
 * The group lock helpers are used to prevent deadlock when reclaiming inodes
 * with evictable marks of the same group that is allocating a new mark.
 *
 * fsnotify_group_lock() takes group->mark_mutex and, for groups flagged
 * FSNOTIFY_GROUP_NOFS, stores the result of memalloc_nofs_save() in
 * group->owner_flags.
 */
static inline void fsnotify_group_lock(struct fsnotify_group *group)
{
        mutex_lock(&group->mark_mutex);

        if (!(group->flags & FSNOTIFY_GROUP_NOFS))
                return;

        group->owner_flags = memalloc_nofs_save();
}

/*
 * Release group->mark_mutex. For FSNOTIFY_GROUP_NOFS groups the value saved
 * in group->owner_flags is first passed to memalloc_nofs_restore().
 */
static inline void fsnotify_group_unlock(struct fsnotify_group *group)
{
        const int nofs_group = group->flags & FSNOTIFY_GROUP_NOFS;

        if (nofs_group)
                memalloc_nofs_restore(group->owner_flags);

        mutex_unlock(&group->mark_mutex);
}

/*
 * Warn (once) if group->mark_mutex is not locked, and for
 * FSNOTIFY_GROUP_NOFS groups also warn (once) if the current task does not
 * have PF_MEMALLOC_NOFS set.
 */
static inline void fsnotify_group_assert_locked(struct fsnotify_group *group)
{
        WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));

        if (!(group->flags & FSNOTIFY_GROUP_NOFS))
                return;

        WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS));
}

/*
 * When calling fsnotify tell it if the data is a path or inode.
 * The fsnotify_data_*() accessors below use this value to decide how to
 * interpret the untyped @data pointer.
 */
enum fsnotify_data_type {
        FSNOTIFY_EVENT_NONE,
        FSNOTIFY_EVENT_PATH,        /* data is a struct path */
        FSNOTIFY_EVENT_INODE,        /* data is a struct inode */
        FSNOTIFY_EVENT_DENTRY,        /* data is a struct dentry */
        FSNOTIFY_EVENT_ERROR,        /* data is a struct fs_error_report */
};

/* Payload passed as @data when the data type is FSNOTIFY_EVENT_ERROR. */
struct fs_error_report {
        int error;                /* error value being reported */
        struct inode *inode;        /* returned by fsnotify_data_inode() */
        struct super_block *sb;        /* returned by fsnotify_data_sb() */
};

/*
 * Extract the inode from event @data according to @data_type:
 * the inode itself, the inode of a dentry, the inode of a path's dentry, or
 * the inode recorded in an error report. Returns NULL for any other type.
 */
static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
{
        if (data_type == FSNOTIFY_EVENT_INODE)
                return (struct inode *)data;

        if (data_type == FSNOTIFY_EVENT_DENTRY)
                return d_inode(data);

        if (data_type == FSNOTIFY_EVENT_PATH) {
                const struct path *path = (const struct path *)data;

                return d_inode(path->dentry);
        }

        if (data_type == FSNOTIFY_EVENT_ERROR) {
                struct fs_error_report *report = (struct fs_error_report *)data;

                return report->inode;
        }

        return NULL;
}

/*
 * Extract the dentry from event @data: the dentry itself or the dentry of a
 * path. Returns NULL for every other @data_type.
 */
static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type)
{
        if (data_type == FSNOTIFY_EVENT_DENTRY) {
                /* Non const is needed for dget() */
                return (struct dentry *)data;
        }

        if (data_type == FSNOTIFY_EVENT_PATH) {
                const struct path *path = (const struct path *)data;

                return path->dentry;
        }

        return NULL;
}

/*
 * Return event @data as a path when @data_type is FSNOTIFY_EVENT_PATH,
 * NULL otherwise.
 */
static inline const struct path *fsnotify_data_path(const void *data,
                                                    int data_type)
{
        if (data_type != FSNOTIFY_EVENT_PATH)
                return NULL;

        return data;
}

/*
 * Extract the super_block from event @data according to @data_type:
 * inode->i_sb, dentry->d_sb, path->dentry->d_sb or the sb recorded in an
 * error report. Returns NULL for any other type.
 */
static inline struct super_block *fsnotify_data_sb(const void *data,
                                                   int data_type)
{
        if (data_type == FSNOTIFY_EVENT_INODE) {
                struct inode *inode = (struct inode *)data;

                return inode->i_sb;
        }

        if (data_type == FSNOTIFY_EVENT_DENTRY) {
                struct dentry *dentry = (struct dentry *)data;

                return dentry->d_sb;
        }

        if (data_type == FSNOTIFY_EVENT_PATH) {
                const struct path *path = (const struct path *)data;

                return path->dentry->d_sb;
        }

        if (data_type == FSNOTIFY_EVENT_ERROR) {
                struct fs_error_report *report = (struct fs_error_report *)data;

                return report->sb;
        }

        return NULL;
}

/*
 * Return event @data as an error report when @data_type is
 * FSNOTIFY_EVENT_ERROR, NULL otherwise.
 */
static inline struct fs_error_report *fsnotify_data_error_report(
                                                        const void *data,
                                                        int data_type)
{
        if (data_type != FSNOTIFY_EVENT_ERROR)
                return NULL;

        return (struct fs_error_report *) data;
}

/*
 * Index to merged marks iterator array that correlates to a type of watch.
 * The type of watched object can be deduced from the iterator type, but not
 * the other way around, because an event can match different watched objects
 * of the same object type.
 * For example, both parent and child are watching an object of type inode.
 *
 * The values index fsnotify_iter_info::marks[] and are used as bit numbers in
 * fsnotify_iter_info::report_mask.
 */
enum fsnotify_iter_type {
        FSNOTIFY_ITER_TYPE_INODE,
        FSNOTIFY_ITER_TYPE_VFSMOUNT,
        FSNOTIFY_ITER_TYPE_SB,
        FSNOTIFY_ITER_TYPE_PARENT,
        FSNOTIFY_ITER_TYPE_INODE2,
        FSNOTIFY_ITER_TYPE_COUNT        /* number of iterator types */
};

/*
 * The type of object that a mark is attached to.
 * FSNOTIFY_OBJ_TYPE_DETACHED shares its value with FSNOTIFY_OBJ_TYPE_COUNT.
 */
enum fsnotify_obj_type {
        FSNOTIFY_OBJ_TYPE_ANY = -1,
        FSNOTIFY_OBJ_TYPE_INODE,
        FSNOTIFY_OBJ_TYPE_VFSMOUNT,
        FSNOTIFY_OBJ_TYPE_SB,
        FSNOTIFY_OBJ_TYPE_COUNT,
        FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT
};

/*
 * True only for the real object types (inode, vfsmount, sb). The argument is
 * unsigned, so FSNOTIFY_OBJ_TYPE_ANY (-1) is rejected as well as
 * FSNOTIFY_OBJ_TYPE_COUNT / FSNOTIFY_OBJ_TYPE_DETACHED.
 */
static inline bool fsnotify_valid_obj_type(unsigned int obj_type)
{
        if (obj_type >= FSNOTIFY_OBJ_TYPE_COUNT)
                return false;

        return true;
}

/*
 * Iteration state handed to handle_event(): one mark slot per iterator type
 * plus a bitmask saying which slots should be reported.
 */
struct fsnotify_iter_info {
        /* indexed by enum fsnotify_iter_type */
        struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT];
        struct fsnotify_group *current_group;
        /* bit (1U << iter_type) set => marks[iter_type] is to be reported */
        unsigned int report_mask;
        int srcu_idx;
};

/* Test whether the bit for @iter_type is set in @iter_info->report_mask. */
static inline bool fsnotify_iter_should_report_type(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        const unsigned int type_bit = 1U << iter_type;

        return (iter_info->report_mask & type_bit) != 0;
}

/* Set the bit for @iter_type in @iter_info->report_mask. */
static inline void fsnotify_iter_set_report_type(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        const unsigned int type_bit = 1U << iter_type;

        iter_info->report_mask |= type_bit;
}

/*
 * Return the mark stored for @iter_type, or NULL when that type is not
 * flagged for reporting in @iter_info.
 */
static inline struct fsnotify_mark *fsnotify_iter_mark(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        if (!fsnotify_iter_should_report_type(iter_info, iter_type))
                return NULL;

        return iter_info->marks[iter_type];
}

/*
 * Starting at @type, find the first iterator type whose reportable mark is
 * non-NULL. *@markp is written on every probe, so it holds that mark on
 * success and the last probed (NULL) value when the search is exhausted.
 *
 * Returns the type found, or a value >= FSNOTIFY_ITER_TYPE_COUNT when there
 * is none.
 */
static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type,
                                     struct fsnotify_mark **markp)
{
        for (; type < FSNOTIFY_ITER_TYPE_COUNT; type++) {
                *markp = fsnotify_iter_mark(iter, type);
                if (*markp)
                        return type;
        }

        return type;
}

/*
 * Generate fsnotify_iter_<name>_mark(iter_info), a typed shorthand for
 * fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_<NAME>).
 */
#define FSNOTIFY_ITER_FUNCS(name, NAME) \
static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \
                struct fsnotify_iter_info *iter_info) \
{ \
        return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \
}

/* Accessors for the inode, parent, vfsmount and sb iterator slots */
FSNOTIFY_ITER_FUNCS(inode, INODE)
FSNOTIFY_ITER_FUNCS(parent, PARENT)
FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT)
FSNOTIFY_ITER_FUNCS(sb, SB)

/* Loop @type over every iterator type, 0 .. FSNOTIFY_ITER_TYPE_COUNT - 1. */
#define fsnotify_foreach_iter_type(type) \
        for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++)
/*
 * Loop over only the iterator types that have a reportable, non-NULL mark.
 * Each pass uses fsnotify_iter_step() to advance @type to the next such type
 * and to store its mark in @mark before the loop condition is evaluated.
 */
#define fsnotify_foreach_iter_mark_type(iter, mark, type) \
        for (type = 0; \
             type = fsnotify_iter_step(iter, type, &mark), \
             type < FSNOTIFY_ITER_TYPE_COUNT; \
             type++)

/*
 * Inode/vfsmount/sb point to this structure which tracks all marks attached to
 * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this
 * structure. We destroy this structure when there are no more marks attached
 * to it. The structure is protected by fsnotify_mark_srcu.
 *
 * Text in brackets names the lock protecting modifications of a member.
 */
struct fsnotify_mark_connector {
        spinlock_t lock;
        unsigned char type;        /* Type of object [lock] */
        unsigned char prio;        /* Highest priority group */
#define FSNOTIFY_CONN_FLAG_IS_WATCHED        0x01
#define FSNOTIFY_CONN_FLAG_HAS_IREF        0x02
        unsigned short flags;        /* FSNOTIFY_CONN_FLAG_* bits [lock] */
        union {
                /* Object pointer [lock] */
                void *obj;
                /* Links connectors waiting to be freed after the srcu period expires */
                struct fsnotify_mark_connector *destroy_next;
        };
        /* presumably the head of the marks linked via fsnotify_mark::obj_list - confirm */
        struct hlist_head list;
};

/*
 * Container for per-sb fsnotify state (sb marks and more).
 * Attached lazily on first marked object on the sb and freed when killing sb.
 */
struct fsnotify_sb_info {
        /* connector for the marks attached to the sb itself */
        struct fsnotify_mark_connector __rcu *sb_marks;
        /*
         * Number of inode/mount/sb objects that are being watched in this sb.
         * Note that inode objects are currently double-accounted.
         *
         * The value in watched_objects[prio] is the number of objects that are
         * watched by groups of priority >= prio, so watched_objects[0] is the
         * total number of watched objects in this sb.
         */
        atomic_long_t watched_objects[__FSNOTIFY_PRIO_NUM];
};

/*
 * Return the fsnotify state attached to @sb, read with READ_ONCE().
 * Always NULL when CONFIG_FSNOTIFY is not set.
 */
static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb)
{
#ifdef CONFIG_FSNOTIFY
        struct fsnotify_sb_info *sbinfo = READ_ONCE(sb->s_fsnotify_info);

        return sbinfo;
#else
        return NULL;
#endif
}

static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb)
{
        return &fsnotify_sb_info(sb)->watched_objects[0];
}

/*
 * A mark is simply an object attached to an in core inode which allows an
 * fsnotify listener to indicate they are either no longer interested in events
 * of a type matching mask or only interested in those events.
 *
 * These are flushed when an inode is evicted from core and may be flushed
 * when the inode is modified (as seen by fsnotify_access).  Some fsnotify
 * users (such as dnotify) will flush these when the open fd is closed and not
 * at inode eviction or modification.
 *
 * Text in brackets is showing the lock(s) protecting modifications of a
 * particular entry. obj_lock means either inode->i_lock or
 * mnt->mnt_root->d_lock depending on the mark type.
 */
struct fsnotify_mark {
        /* Mask this mark is for [mark->lock, group->mark_mutex] */
        __u32 mask;
        /* We hold one for presence in g_list. Also one ref for each 'thing'
         * in kernel that found and may be using this mark. */
        refcount_t refcnt;
        /* Group this mark is for. Set on mark creation, stable until last ref
         * is dropped */
        struct fsnotify_group *group;
        /* List of marks by group->marks_list. Also reused for queueing
         * mark into destroy_list when it's waiting for the end of SRCU period
         * before it can be freed. [group->mark_mutex] */
        struct list_head g_list;
        /* Protects inode / mnt pointers, flags, masks */
        spinlock_t lock;
        /* List of marks for inode / vfsmount [connector->lock, mark ref] */
        struct hlist_node obj_list;
        /* Head of list of marks for an object [mark ref] */
        struct fsnotify_mark_connector *connector;
        /* Event types and flags to ignore [mark->lock, group->mark_mutex] */
        __u32 ignore_mask;
        /* Bits stored in ->flags below, grouped by the backend that uses them */
        /* General fsnotify mark flags */
#define FSNOTIFY_MARK_FLAG_ALIVE                0x0001
#define FSNOTIFY_MARK_FLAG_ATTACHED                0x0002
        /* inotify mark flags */
#define FSNOTIFY_MARK_FLAG_EXCL_UNLINK                0x0010
#define FSNOTIFY_MARK_FLAG_IN_ONESHOT                0x0020
        /* fanotify mark flags */
#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY        0x0100
#define FSNOTIFY_MARK_FLAG_NO_IREF                0x0200
#define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS        0x0400
#define FSNOTIFY_MARK_FLAG_HAS_FSID                0x0800
#define FSNOTIFY_MARK_FLAG_WEAK_FSID                0x1000
        unsigned int flags;                /* flags [mark->lock] */
};

#ifdef CONFIG_FSNOTIFY

/* called from the vfs helpers */

/* main fsnotify call to send events */
extern int fsnotify(__u32 mask, const void *data, int data_type,
                    struct inode *dir, const struct qstr *name,
                    struct inode *inode, u32 cookie);
extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
                           int data_type);
extern void __fsnotify_inode_delete(struct inode *inode);
extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
extern void fsnotify_sb_delete(struct super_block *sb);
extern void fsnotify_sb_free(struct super_block *sb);
extern u32 fsnotify_get_cookie(void);

static inline __u32 fsnotify_parent_needed_mask(__u32 mask)
{
        /*
         * FS_EVENT_ON_CHILD marks an object that might be watched by a mark
         * wanting parent/name info.  Only when it is set do we report the
         * subset of events that can be delivered with parent/name info;
         * otherwise nothing is needed.
         */
        return (mask & FS_EVENT_ON_CHILD) ?
                (mask & FS_EVENTS_POSS_TO_PARENT) : 0;
}

static inline int fsnotify_inode_watches_children(struct inode *inode)
{
        /*
         * The inode may care about its children only when FS_EVENT_ON_CHILD
         * is set in its mask.  In that case the result is the set of events
         * in the mask that can actually happen on a child (non-zero means
         * "watching"); otherwise it is 0.
         */
        return (inode->i_fsnotify_mask & FS_EVENT_ON_CHILD) ?
                (inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD) : 0;
}

/*
 * Update the dentry with a flag indicating whether its parent is interested
 * in receiving filesystem events when those events happen to this
 * dentry->d_inode.
 */
static inline void fsnotify_update_flags(struct dentry *dentry)
{
        /* The caller must already hold dentry->d_lock. */
        assert_spin_locked(&dentry->d_lock);

        /*
         * d_lock serialises setting PARENT_WATCHED on the dentries.  Should
         * inotify_inode_watched change after d_lock was taken, the following
         * __fsnotify_update_child_dentry_flags call will find our entry, spin
         * until we are done here, and then update us with the new state.
         */
        if (!fsnotify_inode_watches_children(dentry->d_parent->d_inode)) {
                /* parent is not watching its children */
                dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
                return;
        }

        dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
}

/* called from fsnotify listeners, such as fanotify or dnotify */

/* create a new group */
extern struct fsnotify_group *fsnotify_alloc_group(
                                const struct fsnotify_ops *ops,
                                int flags);
/* get reference to a group */
extern void fsnotify_get_group(struct fsnotify_group *group);
/* drop reference on a group from fsnotify_alloc_group */
extern void fsnotify_put_group(struct fsnotify_group *group);
/* group destruction begins, stop queuing new events */
extern void fsnotify_group_stop_queueing(struct fsnotify_group *group);
/* destroy group */
extern void fsnotify_destroy_group(struct fsnotify_group *group);
/* fasync handler function */
extern int fsnotify_fasync(int fd, struct file *file, int on);
/* Free event from memory */
extern void fsnotify_destroy_event(struct fsnotify_group *group,
                                   struct fsnotify_event *event);
/* attach the event to the group notification queue */
extern int fsnotify_insert_event(struct fsnotify_group *group,
                                 struct fsnotify_event *event,
                                 int (*merge)(struct fsnotify_group *,
                                              struct fsnotify_event *),
                                 void (*insert)(struct fsnotify_group *,
                                                struct fsnotify_event *));

/*
 * Convenience wrapper around fsnotify_insert_event() for callers that have
 * no custom insert callback: the event is passed on with the given merge
 * callback and a NULL insert callback.
 *
 * Returns whatever fsnotify_insert_event() returns.
 */
static inline int fsnotify_add_event(struct fsnotify_group *group,
                                     struct fsnotify_event *event,
                                     int (*merge)(struct fsnotify_group *,
                                                  struct fsnotify_event *))
{
        return fsnotify_insert_event(group, event, merge, NULL);
}

/*
 * Queue overflow event to a notification group.
 *
 * The group's own overflow_event is queued with no merge callback.  The
 * return value of fsnotify_add_event() is intentionally not propagated
 * (this helper returns void).
 */
static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
{
        fsnotify_add_event(group, group->overflow_event, NULL);
}

/* Tell whether the event mask carries the FS_Q_OVERFLOW bit. */
static inline bool fsnotify_is_overflow_event(u32 mask)
{
        return (mask & FS_Q_OVERFLOW) != 0;
}

/*
 * Tell whether the group's notification list is empty.
 *
 * The caller must hold group->notification_lock; this is asserted.
 *
 * The function is defined inline here, so no separate extern prototype is
 * needed (a stale one that used to follow this definition was dropped).
 */
static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
{
        assert_spin_locked(&group->notification_lock);

        return list_empty(&group->notification_list);
}

/* return, but do not dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
/* return AND dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group);
/* Remove event queued in the notification list */
extern void fsnotify_remove_queued_event(struct fsnotify_group *group,
                                         struct fsnotify_event *event);

/* functions used to manipulate the marks attached to inodes */

/*
 * Canonical "ignore mask" including event flags.
 *
 * Note the subtle semantic difference from the legacy ->ignored_mask.
 * ->ignored_mask traditionally only meant which events should be ignored,
 * while ->ignore_mask also includes flags regarding the type of objects on
 * which events should be ignored.
 */
static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark)
{
        __u32 result = mark->ignore_mask;

        if (!(mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)) {
                /*
                 * Legacy behavior (the event flags stored in the ignore mask
                 * are not honoured):
                 * - events on directories are always ignored
                 * - events on children are ignored exactly when the mark's
                 *   own mask watches children
                 */
                result = ((result | FS_ISDIR) & ~FS_EVENT_ON_CHILD) |
                         (mark->mask & FS_EVENT_ON_CHILD);
        }

        /* Otherwise the event flags in the ignore mask take effect as-is */
        return result;
}

/*
 * Legacy ignored_mask - only event types to ignore.
 *
 * Returns mark->ignore_mask restricted to the bits in ALL_FSNOTIFY_EVENTS,
 * i.e. with any other bits of the ignore mask stripped.
 */
static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark)
{
        return mark->ignore_mask & ALL_FSNOTIFY_EVENTS;
}

/*
 * Check if mask (or ignore mask) should be applied depending if victim is a
 * directory and whether it is reported to a watching parent.
 */
static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir,
                                            int iter_type)
{
        /* A directory victim requires FS_ISDIR in the mask */
        bool dir_ok = !is_dir || (mask & FS_ISDIR);
        /* A report to the watching parent requires FS_EVENT_ON_CHILD */
        bool child_ok = iter_type != FSNOTIFY_ITER_TYPE_PARENT ||
                        (mask & FS_EVENT_ON_CHILD);

        return dir_ok && child_ok;
}

/*
 * Effective ignore mask taking into account if event victim is a
 * directory and whether it is reported to a watching parent.
 */
static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark,
                                                   bool is_dir, int iter_type)
{
        __u32 events = fsnotify_ignored_events(mark);
        __u32 full_mask;

        /*
         * Nothing is ignored at all, or the victim is neither a directory
         * nor reported to a parent: the event flags need not be consulted
         * and the plain set of ignored events is the answer.
         */
        if (!events || (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT))
                return events;

        /* Otherwise the flags decide whether the ignore mask applies */
        full_mask = fsnotify_ignore_mask(mark);

        return fsnotify_mask_applicable(full_mask, is_dir, iter_type) ?
                (full_mask & ALL_FSNOTIFY_EVENTS) : 0;
}

/* Get mask for calculating object interest taking ignore mask into account */
static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark)
{
        __u32 interest = mark->mask;

        if (fsnotify_ignored_events(mark)) {
                /*
                 * FS_MODIFY may be needed to clear the ignore mask, unless
                 * the ignore mask is flagged to survive modification.
                 */
                if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
                        interest |= FS_MODIFY;

                /*
                 * A mark that ignores events on children must make the
                 * object show interest in those events, or
                 * fsnotify_parent() would not notice it.
                 */
                interest |= mark->ignore_mask;
        }

        return interest;
}

/* Get mask of events for a list of marks */
extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn);
/* Calculate mask of events for a list of marks */
extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn);
extern void fsnotify_init_mark(struct fsnotify_mark *mark,
                               struct fsnotify_group *group);
/* Find mark belonging to given group in the list of marks */
struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type,
                                         struct fsnotify_group *group);
/* attach the mark to the object */
int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj,
                      unsigned int obj_type, int add_flags);
int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj,
                             unsigned int obj_type, int add_flags);

/*
 * Attach the mark to the inode.
 *
 * Thin wrapper that calls fsnotify_add_mark() with the inode as the object
 * and FSNOTIFY_OBJ_TYPE_INODE as its type; add_flags is passed through and
 * the result of fsnotify_add_mark() is returned unchanged.
 */
static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
                                          struct inode *inode,
                                          int add_flags)
{
        return fsnotify_add_mark(mark, inode, FSNOTIFY_OBJ_TYPE_INODE,
                                 add_flags);
}
/*
 * Same as fsnotify_add_inode_mark(), but goes through
 * fsnotify_add_mark_locked() instead of fsnotify_add_mark().
 * NOTE(review): presumably the caller must already hold the lock that the
 * unlocked variant takes (group->mark_mutex?) - confirm against
 * fsnotify_add_mark_locked().
 */
static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
                                                 struct inode *inode,
                                                 int add_flags)
{
        return fsnotify_add_mark_locked(mark, inode, FSNOTIFY_OBJ_TYPE_INODE,
                                        add_flags);
}

/*
 * Find the mark belonging to the given group in the inode's list of marks.
 *
 * Thin wrapper around fsnotify_find_mark() with FSNOTIFY_OBJ_TYPE_INODE;
 * returns whatever that function returns.
 */
static inline struct fsnotify_mark *fsnotify_find_inode_mark(
                                                struct inode *inode,
                                                struct fsnotify_group *group)
{
        return fsnotify_find_mark(inode, FSNOTIFY_OBJ_TYPE_INODE, group);
}

/* given a group and a mark, flag mark to be freed when all references are dropped */
extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                                  struct fsnotify_group *group);
/* detach mark from inode / mount list, group list, drop inode reference */
extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
/* free mark */
extern void fsnotify_free_mark(struct fsnotify_mark *mark);
/* Wait until all marks queued for destruction are destroyed */
extern void fsnotify_wait_marks_destroyed(void);
/* Clear all of the marks of a group attached to a given object type */
extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
                                          unsigned int obj_type);
/*
 * Clear all of the group's marks that are attached to vfsmounts
 * (fsnotify_clear_marks_by_group() with FSNOTIFY_OBJ_TYPE_VFSMOUNT).
 */
static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
{
        fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT);
}
/*
 * Clear all of the group's marks that are attached to inodes
 * (fsnotify_clear_marks_by_group() with FSNOTIFY_OBJ_TYPE_INODE).
 */
static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
{
        fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE);
}
/*
 * Clear all of the group's marks that are attached to super blocks
 * (fsnotify_clear_marks_by_group() with FSNOTIFY_OBJ_TYPE_SB).
 */
static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group)
{
        fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB);
}
extern void fsnotify_get_mark(struct fsnotify_mark *mark);
extern void fsnotify_put_mark(struct fsnotify_mark *mark);
extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);

/*
 * Initialise an event: only the list head used to link the event into a
 * list is set up here; all other fields are left untouched.
 */
static inline void fsnotify_init_event(struct fsnotify_event *event)
{
        INIT_LIST_HEAD(&event->list);
}

#else

/*
 * Stubs used when CONFIG_FSNOTIFY is not enabled: every entry point below
 * compiles to a no-op (returning 0 where a value is expected) so that
 * callers do not need their own #ifdefs.
 */

/* No-op: no event is sent, success (0) is reported */
static inline int fsnotify(__u32 mask, const void *data, int data_type,
                           struct inode *dir, const struct qstr *name,
                           struct inode *inode, u32 cookie)
{
        return 0;
}

/* No-op: nothing is reported to the parent, 0 is returned */
static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask,
                                  const void *data, int data_type)
{
        return 0;
}

/* No-op counterpart of the inode deletion hook */
static inline void __fsnotify_inode_delete(struct inode *inode)
{}

/* No-op counterpart of the vfsmount deletion hook */
static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{}

/* No-op counterpart of the super block deletion hook */
static inline void fsnotify_sb_delete(struct super_block *sb)
{}

/* No-op counterpart of the super block free hook */
static inline void fsnotify_sb_free(struct super_block *sb)
{}

/* No-op: dentry flags are left untouched */
static inline void fsnotify_update_flags(struct dentry *dentry)
{}

/* Always returns cookie 0 */
static inline u32 fsnotify_get_cookie(void)
{
        return 0;
}

/*
 * NOTE(review): no fsnotify_unmount_inodes() declaration is visible in the
 * CONFIG_FSNOTIFY branch above - this stub may be stale; confirm whether it
 * still has users.
 */
static inline void fsnotify_unmount_inodes(struct super_block *sb)
{}

#endif        /* CONFIG_FSNOTIFY */

#endif        /* __KERNEL__ */

#endif        /* __LINUX_FSNOTIFY_BACKEND_H */




















    2 











    2 


    2 







    2 












    2 

























































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/hfs/brec.c
 *
 * Copyright (C) 2001
 * Brad Boyer (flar@allandria.com)
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 *
 * Handle individual btree records
 */

#include "btree.h"

static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd);
static int hfs_brec_update_parent(struct hfs_find_data *fd);
static int hfs_btree_inc_height(struct hfs_btree *tree);

/* Get the length and offset of the given record in the given node */
u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off)
{
        /*
         * Two consecutive big-endian entries of the record offset table,
         * which grows downwards from the end of the node: bounds[1] is the
         * offset of record 'rec', bounds[0] the offset of the one after it.
         */
        __be16 bounds[2];
        u16 table_pos = node->tree->node_size - (rec + 2) * 2;
        u16 start, end;

        hfs_bnode_read(node, bounds, table_pos, 4);
        start = be16_to_cpu(bounds[1]);
        end = be16_to_cpu(bounds[0]);

        /* hand back the record's start offset, return its length */
        *off = start;
        return end - start;
}

/* Get the length of the key from a keyed record */
u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
{
        struct hfs_btree *tree;
        u16 keylen, recoff;
        bool bigkeys;

        /* only index and leaf nodes hold keyed records */
        if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF)
                return 0;

        tree = node->tree;
        /* big keys carry a two byte length field, small keys a one byte one */
        bigkeys = tree->attributes & HFS_TREE_BIGKEYS;

        /* index nodes without variable sized keys: fixed, maximal key size */
        if (node->type == HFS_NODE_INDEX &&
            !(tree->attributes & HFS_TREE_VARIDXKEYS))
                return tree->max_key_len + (bigkeys ? 2 : 1);

        /* otherwise read the length stored at the start of the record */
        recoff = hfs_bnode_read_u16(node, tree->node_size - (rec + 1) * 2);
        if (!recoff)
                return 0;

        if (bigkeys) {
                keylen = hfs_bnode_read_u16(node, recoff) + 2;
                if (keylen <= tree->max_key_len + 2)
                        return keylen;
        } else {
                /* length byte plus key, rounded up to an even size */
                keylen = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
                if (keylen <= tree->max_key_len + 1)
                        return keylen;
        }

        /* stored length exceeds what the tree allows: report and reject */
        pr_err("keylen %d too large\n", keylen);
        return 0;
}

int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len)
{
        struct hfs_btree *tree;
        struct hfs_bnode *node, *new_node;
        int size, key_len, rec;
        int data_off, end_off;
        int idx_rec_off, data_rec_off, end_rec_off;
        __be32 cnid;

        tree = fd->tree;
        if (!fd->bnode) {
                if (!tree->root)
                        hfs_btree_inc_height(tree);
                node = hfs_bnode_find(tree, tree->leaf_head);
                if (IS_ERR(node))
                        return PTR_ERR(node);
                fd->bnode = node;
                fd->record = -1;
        }
        new_node = NULL;
        key_len = (fd->search_key->key_len | 1) + 1;
again:
        /* new record idx and complete record size */
        rec = fd->record + 1;
        size = key_len + entry_len;

        node = fd->bnode;
        hfs_bnode_dump(node);
        /* get last offset */
        end_rec_off = tree->node_size - (node->num_recs + 1) * 2;
        end_off = hfs_bnode_read_u16(node, end_rec_off);
        end_rec_off -= 2;
        hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n",
                rec, size, end_off, end_rec_off);
        if (size > end_rec_off - end_off) {
                if (new_node)
                        panic("not enough room!\n");
                new_node = hfs_bnode_split(fd);
                if (IS_ERR(new_node))
                        return PTR_ERR(new_node);
                goto again;
        }
        if (node->type == HFS_NODE_LEAF) {
                tree->leaf_count++;
                mark_inode_dirty(tree->inode);
        }
        node->num_recs++;
        /* write new last offset */
        hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);
        hfs_bnode_write_u16(node, end_rec_off, end_off + size);
        data_off = end_off;
        data_rec_off = end_rec_off + 2;
        idx_rec_off = tree->node_size - (rec + 1) * 2;
        if (idx_rec_off == data_rec_off)
                goto skip;
        /* move all following entries */
        do {
                data_off = hfs_bnode_read_u16(node, data_rec_off + 2);
                hfs_bnode_write_u16(node, data_rec_off, data_off + size);
                data_rec_off += 2;
        } while (data_rec_off < idx_rec_off);

        /* move data away */
        hfs_bnode_move(node, data_off + size, data_off,
                       end_off - data_off);

skip:
        hfs_bnode_write(node, fd->search_key, data_off, key_len);
        hfs_bnode_write(node, entry, data_off + key_len, entry_len);
        hfs_bnode_dump(node);

        /*
         * update parent key if we inserted a key
         * at the start of the node and it is not the new node
         */
        if (!rec && new_node != node) {
                hfs_bnode_read_key(node, fd->search_key, data_off + size);
                hfs_brec_update_parent(fd);
        }

        if (new_node) {
                hfs_bnode_put(fd->bnode);
                if (!new_node->parent) {
                        hfs_btree_inc_height(tree);
                        new_node->parent = tree->root;
                }
                fd->bnode = hfs_bnode_find(tree, new_node->parent);

                /* create index data entry */
                cnid = cpu_to_be32(new_node->this);
                entry = &cnid;
                entry_len = sizeof(cnid);

                /* get index key */
                hfs_bnode_read_key(new_node, fd->search_key, 14);
                __hfs_brec_find(fd->bnode, fd);

                hfs_bnode_put(new_node);
                new_node = NULL;

                if (tree->attributes & HFS_TREE_VARIDXKEYS)
                        key_len = fd->search_key->key_len + 1;
                else {
                        fd->search_key->key_len = tree->max_key_len;
                        key_len = tree->max_key_len + 1;
                }
                goto again;
        }

        return 0;
}

/*
 * Remove record fd->record from node fd->bnode.
 *
 * When the node becomes empty it is unlinked and, unless it has no parent,
 * the record found for it in the parent is removed as well by looping back
 * to "again" with fd->bnode switched to the parent.  Otherwise the record
 * offset table and the record data are compacted, and the parent's key is
 * refreshed when the first record of the node was removed.
 *
 * Returns 0, or the error from hfs_bnode_find() if the parent cannot be
 * looked up.
 */
int hfs_brec_remove(struct hfs_find_data *fd)
{
        struct hfs_btree *tree;
        struct hfs_bnode *node, *parent;
        int end_off, rec_off, data_off, size;

        tree = fd->tree;
        node = fd->bnode;
again:
        /* offset table slot holding the end of the record to remove ... */
        rec_off = tree->node_size - (fd->record + 2) * 2;
        /* ... and the last used slot of the table */
        end_off = tree->node_size - (node->num_recs + 1) * 2;

        if (node->type == HFS_NODE_LEAF) {
                tree->leaf_count--;
                mark_inode_dirty(tree->inode);
        }
        hfs_bnode_dump(node);
        hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n",
                fd->record, fd->keylength + fd->entrylength);
        if (!--node->num_recs) {
                /* node is empty now: drop it and fix up the parent */
                hfs_bnode_unlink(node);
                if (!node->parent)
                        return 0;
                parent = hfs_bnode_find(tree, node->parent);
                if (IS_ERR(parent))
                        return PTR_ERR(parent);
                hfs_bnode_put(node);
                node = fd->bnode = parent;

                /* locate the record for the search key in the parent */
                __hfs_brec_find(node, fd);
                goto again;
        }
        hfs_bnode_write_u16(node, offsetof(struct hfs_bnode_desc, num_recs), node->num_recs);

        /* the removed record was the last one: no data has to be moved */
        if (rec_off == end_off)
                goto skip;
        size = fd->keylength + fd->entrylength;

        /*
         * Shift every following offset table entry up by one slot and
         * reduce it by the size of the removed record.
         */
        do {
                data_off = hfs_bnode_read_u16(node, rec_off);
                hfs_bnode_write_u16(node, rec_off + 2, data_off - size);
                rec_off -= 2;
        } while (rec_off >= end_off);

        /* fill hole */
        hfs_bnode_move(node, fd->keyoffset, fd->keyoffset + size,
                       data_off - fd->keyoffset - size);
skip:
        hfs_bnode_dump(node);
        /*
         * First record removed: the parent's key for this node must change.
         * NOTE(review): the return value of hfs_brec_update_parent() is
         * ignored here.
         */
        if (!fd->record)
                hfs_brec_update_parent(fd);
        return 0;
}

/*
 * Split fd->bnode: allocate a new node, link it in right after the original
 * one and move the upper part of the records into it.
 *
 * If the insert position fd->record falls into the moved part, fd->bnode,
 * fd->record, fd->keyoffset and fd->entryoffset are switched over to the new
 * node (the reference on the old node is dropped and one on the new node is
 * taken for fd).
 *
 * Returns the new node (the caller is expected to hfs_bnode_put() it) or an
 * ERR_PTR() value on failure.
 */
static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
{
        struct hfs_btree *tree;
        struct hfs_bnode *node, *new_node, *next_node;
        struct hfs_bnode_desc node_desc;
        int num_recs, new_rec_off, new_off, old_rec_off;
        int data_start, data_end, size;

        tree = fd->tree;
        node = fd->bnode;
        new_node = hfs_bmap_alloc(tree);
        if (IS_ERR(new_node))
                return new_node;
        /* extra reference for the duration of the split, dropped on exit */
        hfs_bnode_get(node);
        hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n",
                node->this, new_node->this, node->next);
        /* the new node becomes the right sibling of the node being split */
        new_node->next = node->next;
        new_node->prev = node->this;
        new_node->parent = node->parent;
        new_node->type = node->type;
        new_node->height = node->height;

        if (node->next)
                next_node = hfs_bnode_find(tree, node->next);
        else
                next_node = NULL;

        if (IS_ERR(next_node)) {
                hfs_bnode_put(node);
                hfs_bnode_put(new_node);
                return next_node;
        }

        /*
         * Look for the first record that starts beyond 'size' (roughly half
         * of the node); the records from there on are moved.
         */
        size = tree->node_size / 2 - node->num_recs * 2 - 14;
        old_rec_off = tree->node_size - 4;
        num_recs = 1;
        for (;;) {
                data_start = hfs_bnode_read_u16(node, old_rec_off);
                if (data_start > size)
                        break;
                old_rec_off -= 2;
                if (++num_recs < node->num_recs)
                        continue;
                /* panic? */
                /* no split point found: give up and release all nodes */
                hfs_bnode_put(node);
                hfs_bnode_put(new_node);
                if (next_node)
                        hfs_bnode_put(next_node);
                return ERR_PTR(-ENOSPC);
        }

        if (fd->record + 1 < num_recs) {
                /* new record is in the lower half,
                 * so leave some more space there
                 */
                old_rec_off += 2;
                num_recs--;
                data_start = hfs_bnode_read_u16(node, old_rec_off);
        } else {
                /* new record goes to the new node: retarget fd to it */
                hfs_bnode_put(node);
                hfs_bnode_get(new_node);
                fd->bnode = new_node;
                fd->record -= num_recs;
                fd->keyoffset -= data_start - 14;
                fd->entryoffset -= data_start - 14;
        }
        new_node->num_recs = node->num_recs - num_recs;
        node->num_recs = num_recs;

        /*
         * Build the offset table of the new node: its records start at
         * offset 14, so every old offset is reduced by (data_start - 14).
         */
        new_rec_off = tree->node_size - 2;
        new_off = 14;
        size = data_start - new_off;
        num_recs = new_node->num_recs;
        data_end = data_start;
        while (num_recs) {
                hfs_bnode_write_u16(new_node, new_rec_off, new_off);
                old_rec_off -= 2;
                new_rec_off -= 2;
                data_end = hfs_bnode_read_u16(node, old_rec_off);
                new_off = data_end - size;
                num_recs--;
        }
        /* final table entry, then copy the record data itself */
        hfs_bnode_write_u16(new_node, new_rec_off, new_off);
        hfs_bnode_copy(new_node, 14, node, data_start, data_end - data_start);

        /* update new bnode header */
        node_desc.next = cpu_to_be32(new_node->next);
        node_desc.prev = cpu_to_be32(new_node->prev);
        node_desc.type = new_node->type;
        node_desc.height = new_node->height;
        node_desc.num_recs = cpu_to_be16(new_node->num_recs);
        node_desc.reserved = 0;
        hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc));

        /* update previous bnode header */
        node->next = new_node->this;
        hfs_bnode_read(node, &node_desc, 0, sizeof(node_desc));
        node_desc.next = cpu_to_be32(node->next);
        node_desc.num_recs = cpu_to_be16(node->num_recs);
        hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));

        /* update next bnode header */
        if (next_node) {
                next_node->prev = new_node->this;
                hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
                node_desc.prev = cpu_to_be32(next_node->prev);
                hfs_bnode_write(next_node, &node_desc, 0, sizeof(node_desc));
                hfs_bnode_put(next_node);
        } else if (node->this == tree->leaf_tail) {
                /* if there is no next node, this might be the new tail */
                tree->leaf_tail = new_node->this;
                mark_inode_dirty(tree->inode);
        }

        hfs_bnode_dump(node);
        hfs_bnode_dump(new_node);
        /* drop the reference taken at the start of the split */
        hfs_bnode_put(node);

        return new_node;
}

/*
 * Propagate a changed first key of fd->bnode into the index record of its
 * parent, walking further up for as long as the updated record is itself the
 * first record of its node.
 *
 * If the new key is longer and the parent has no room, the parent is split
 * and an index record for the new node is inserted one level up.
 *
 * On entry fd->bnode is the node whose first key changed.  On return, in the
 * success and in the error case alike, fd->bnode is the node this function
 * still holds a reference to (the last node processed), so the caller's
 * usual hfs_bnode_put(fd->bnode) stays balanced.
 *
 * Returns 0 on success, -ENOENT if no matching record is found in a parent,
 * or the error of a failed node lookup / split.
 */
static int hfs_brec_update_parent(struct hfs_find_data *fd)
{
        struct hfs_btree *tree;
        struct hfs_bnode *node, *new_node, *parent;
        int newkeylen, diff;
        int rec, rec_off, end_rec_off;
        int start_off, end_off;
        int res = 0;

        tree = fd->tree;
        node = fd->bnode;
        new_node = NULL;
        if (!node->parent)
                return 0;

again:
        parent = hfs_bnode_find(tree, node->parent);
        if (IS_ERR(parent)) {
                res = PTR_ERR(parent);
                goto out;
        }
        __hfs_brec_find(parent, fd);
        if (fd->record < 0) {
                /* don't leak the reference taken by hfs_bnode_find() */
                hfs_bnode_put(parent);
                res = -ENOENT;
                goto out;
        }
        hfs_bnode_dump(parent);
        rec = fd->record;

        /* size difference between old and new key */
        if (tree->attributes & HFS_TREE_VARIDXKEYS)
                newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1;
        else
                fd->keylength = newkeylen = tree->max_key_len + 1;
        hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n",
                rec, fd->keylength, newkeylen);

        rec_off = tree->node_size - (rec + 2) * 2;
        end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
        diff = newkeylen - fd->keylength;
        if (!diff)
                goto skip;
        if (diff > 0) {
                end_off = hfs_bnode_read_u16(parent, end_rec_off);
                if (end_rec_off - end_off < diff) {

                        printk(KERN_DEBUG "splitting index node...\n");
                        fd->bnode = parent;
                        new_node = hfs_bnode_split(fd);
                        if (IS_ERR(new_node)) {
                                /*
                                 * A failed split leaves fd->bnode alone, so
                                 * drop our parent reference and hand the
                                 * child back to fd at "out".
                                 */
                                res = PTR_ERR(new_node);
                                hfs_bnode_put(parent);
                                goto out;
                        }
                        /* the split may have moved our record */
                        parent = fd->bnode;
                        rec = fd->record;
                        rec_off = tree->node_size - (rec + 2) * 2;
                        end_rec_off = tree->node_size - (parent->num_recs + 1) * 2;
                }
        }

        /* shift the offsets of the following records by the key size change */
        end_off = start_off = hfs_bnode_read_u16(parent, rec_off);
        hfs_bnode_write_u16(parent, rec_off, start_off + diff);
        start_off -= 4;        /* move previous cnid too */

        while (rec_off > end_rec_off) {
                rec_off -= 2;
                end_off = hfs_bnode_read_u16(parent, rec_off);
                hfs_bnode_write_u16(parent, rec_off, end_off + diff);
        }
        hfs_bnode_move(parent, start_off + diff, start_off,
                       end_off - start_off);
skip:
        /* store the child's first key in the parent record */
        hfs_bnode_copy(parent, fd->keyoffset, node, 14, newkeylen);
        if (!(tree->attributes & HFS_TREE_VARIDXKEYS))
                hfs_bnode_write_u8(parent, fd->keyoffset, newkeylen - 1);
        hfs_bnode_dump(parent);

        /* done with the child; from here on 'node' is the parent */
        hfs_bnode_put(node);
        node = parent;

        if (new_node) {
                struct hfs_bnode *new_parent;
                __be32 cnid;

                if (!new_node->parent) {
                        /* NOTE(review): failure of inc_height is not handled */
                        hfs_btree_inc_height(tree);
                        new_node->parent = tree->root;
                }
                new_parent = hfs_bnode_find(tree, new_node->parent);
                if (IS_ERR(new_parent)) {
                        res = PTR_ERR(new_parent);
                        hfs_bnode_put(new_node);
                        goto out;
                }
                fd->bnode = new_parent;
                /* create index key and entry */
                hfs_bnode_read_key(new_node, fd->search_key, 14);
                cnid = cpu_to_be32(new_node->this);

                __hfs_brec_find(fd->bnode, fd);
                /* NOTE(review): the result of the insert is not checked */
                hfs_brec_insert(fd, &cnid, sizeof(cnid));
                hfs_bnode_put(fd->bnode);
                hfs_bnode_put(new_node);

                if (!rec) {
                        if (new_node == node)
                                goto out;
                        /* restore search_key */
                        hfs_bnode_read_key(node, fd->search_key, 14);
                }
                new_node = NULL;
        }

        /* first record of the parent changed: continue one level up */
        if (!rec && node->parent)
                goto again;
out:
        /* 'node' carries the reference that belongs to fd from here on */
        fd->bnode = node;
        return res;
}

/*
 * Grow the B-tree by one level by installing a freshly allocated root node.
 *
 * If the tree is empty (tree->depth == 0) the new root is an empty leaf that
 * also becomes the head and tail of the leaf list.  Otherwise the new root is
 * an index node holding exactly one record: a copy of the key found at offset
 * 14 of the old root, followed by the old root's node number.
 *
 * Returns 0 on success, or the error carried by the pointer returned from
 * hfs_bnode_find() / hfs_bmap_alloc() on failure.
 */
static int hfs_btree_inc_height(struct hfs_btree *tree)
{
        struct hfs_bnode *node, *new_node;
        struct hfs_bnode_desc node_desc;
        int key_size, rec;
        __be32 cnid;

        /* Look up the current root, if the tree has one. */
        node = NULL;
        if (tree->root) {
                node = hfs_bnode_find(tree, tree->root);
                if (IS_ERR(node))
                        return PTR_ERR(node);
        }
        new_node = hfs_bmap_alloc(tree);
        if (IS_ERR(new_node)) {
                /*
                 * node is still NULL here when the tree had no root;
                 * NOTE(review): relies on hfs_bnode_put() accepting NULL.
                 */
                hfs_bnode_put(node);
                return PTR_ERR(new_node);
        }

        /* The new node becomes the root; fill in its in-memory header. */
        tree->root = new_node->this;
        if (!tree->depth) {
                /* first node of an empty tree: an empty leaf */
                tree->leaf_head = tree->leaf_tail = new_node->this;
                new_node->type = HFS_NODE_LEAF;
                new_node->num_recs = 0;
        } else {
                /* index node with the single record written further down */
                new_node->type = HFS_NODE_INDEX;
                new_node->num_recs = 1;
        }
        new_node->parent = 0;
        new_node->next = 0;
        new_node->prev = 0;
        new_node->height = ++tree->depth;

        /* Mirror the header into the node descriptor stored at offset 0. */
        node_desc.next = cpu_to_be32(new_node->next);
        node_desc.prev = cpu_to_be32(new_node->prev);
        node_desc.type = new_node->type;
        node_desc.height = new_node->height;
        node_desc.num_recs = cpu_to_be16(new_node->num_recs);
        node_desc.reserved = 0;
        hfs_bnode_write(new_node, &node_desc, 0, sizeof(node_desc));

        /*
         * The last two bytes of the node hold the offset of the first record,
         * which starts at byte 14 (presumably right after the descriptor --
         * TODO confirm that sizeof(struct hfs_bnode_desc) == 14).
         */
        rec = tree->node_size - 2;
        hfs_bnode_write_u16(new_node, rec, 14);

        if (node) {
                /* insert old root idx into new root */
                node->parent = tree->root;
                /*
                 * Leaf keys and variable-size index keys carry their length
                 * in the first byte; otherwise the key occupies
                 * max_key_len + 1 bytes.
                 */
                if (node->type == HFS_NODE_LEAF ||
                    tree->attributes & HFS_TREE_VARIDXKEYS)
                        key_size = hfs_bnode_read_u8(node, 14) + 1;
                else
                        key_size = tree->max_key_len + 1;
                hfs_bnode_copy(new_node, 14, node, 14, key_size);

                if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
                        /* fixed-size index keys: force the length byte */
                        key_size = tree->max_key_len + 1;
                        hfs_bnode_write_u8(new_node, 14, tree->max_key_len);
                }
                /* round the key size up to an even number of bytes */
                key_size = (key_size + 1) & -2;
                cnid = cpu_to_be32(node->this);
                hfs_bnode_write(new_node, &cnid, 14 + key_size, 4);

                /* next offset-table slot: end of the record just written */
                rec -= 2;
                hfs_bnode_write_u16(new_node, rec, 14 + key_size + 4);

                hfs_bnode_put(node);
        }
        hfs_bnode_put(new_node);
        mark_inode_dirty(tree->inode);

        return 0;
}










































    3 
















































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Virtio PCI driver - common functionality for all device versions
 *
 * This module allows virtio devices to be used over a virtual PCI device.
 * This can be used with QEMU based VMMs like KVM or Xen.
 *
 * Copyright IBM Corp. 2007
 * Copyright Red Hat, Inc. 2014
 *
 * Authors:
 *  Anthony Liguori  <aliguori@us.ibm.com>
 *  Rusty Russell <rusty@rustcorp.com.au>
 *  Michael S. Tsirkin <mst@redhat.com>
 */

#include "virtio_pci_common.h"

/*
 * When set, virtio_pci_probe() tries the legacy probe before the modern one.
 * The flag is only exposed as a (read-only, 0444) module parameter when
 * CONFIG_VIRTIO_PCI_LEGACY is enabled; otherwise it stays false.
 */
static bool force_legacy = false;

#if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY)
module_param(force_legacy, bool, 0444);
MODULE_PARM_DESC(force_legacy,
                 "Force legacy mode for transitional virtio 1 devices");
#endif

/*
 * Wait for pending irq handlers of this device: the INTx line when it is in
 * use, and every allocated MSI-X vector.
 */
void vp_synchronize_vectors(struct virtio_device *vdev)
{
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
        int vec = 0;

        if (vp_dev->intx_enabled)
                synchronize_irq(vp_dev->pci_dev->irq);

        while (vec < vp_dev->msix_vectors) {
                synchronize_irq(pci_irq_vector(vp_dev->pci_dev, vec));
                ++vec;
        }
}

/* the notify function used when creating a virt queue */
bool vp_notify(struct virtqueue *vq)
{
        /* we write the queue's selector into the notification register to
         * signal the other end */
        iowrite16(vq->index, (void __iomem *)vq->priv);
        return true;
}

/*
 * Interrupt handler for configuration changes: forwards the event to the
 * virtio core, which tells the driver if it wants to know.
 * @opaque is the struct virtio_pci_device the irq was requested with.
 */
static irqreturn_t vp_config_changed(int irq, void *opaque)
{
        struct virtio_pci_device *vp_dev = opaque;
        struct virtio_device *vdev = &vp_dev->vdev;

        virtio_config_changed(vdev);

        return IRQ_HANDLED;
}

/*
 * Shared interrupt handler: run vring_interrupt() on every virtqueue on the
 * device's list, under vp_dev->lock.  Returns IRQ_HANDLED if at least one
 * queue handled the interrupt, IRQ_NONE otherwise.
 */
static irqreturn_t vp_vring_interrupt(int irq, void *opaque)
{
        struct virtio_pci_device *vp_dev = opaque;
        struct virtio_pci_vq_info *vq_info;
        irqreturn_t handled = IRQ_NONE;
        unsigned long irqflags;

        spin_lock_irqsave(&vp_dev->lock, irqflags);
        list_for_each_entry(vq_info, &vp_dev->virtqueues, node) {
                irqreturn_t r = vring_interrupt(irq, vq_info->vq);

                if (r == IRQ_HANDLED)
                        handled = IRQ_HANDLED;
        }
        spin_unlock_irqrestore(&vp_dev->lock, irqflags);

        return handled;
}

/*
 * INTx handler: a thin wrapper that also acknowledges the interrupt.
 *
 * Ideally the vring would offer an EIO-style hook so the interrupt could be
 * acked once we know we will handle it but before the callback runs: the
 * callback may notify the host, which may then try to raise an interrupt
 * that we would mask when we finally acknowledge.
 */
static irqreturn_t vp_interrupt(int irq, void *opaque)
{
        struct virtio_pci_device *vp_dev = opaque;
        /*
         * Reading the ISR also clears it, so the value has to be saved
         * before anything else looks at it.
         */
        u8 status = ioread8(vp_dev->isr);

        /* Nothing pending in the ISR: the interrupt is not ours. */
        if (status == 0)
                return IRQ_NONE;

        /* Configuration change?  Tell driver if it wants to know. */
        if (status & VIRTIO_PCI_ISR_CONFIG)
                vp_config_changed(irq, opaque);

        return vp_vring_interrupt(irq, opaque);
}

/*
 * Allocate the MSI-X bookkeeping and vectors for a device and request the
 * irqs that are not tied to a single virtqueue.
 *
 * @vdev:           device to set up
 * @nvectors:       number of MSI-X vectors; exactly this many are asked for
 *                  (it is passed as both the minimum and the maximum)
 * @per_vq_vectors: when false, one extra irq shared by all virtqueues is
 *                  requested here as well
 * @desc:           optional affinity descriptor; when non-NULL its
 *                  pre_vectors count is incremented in place to account for
 *                  the config vector
 *
 * Returns 0 on success or a negative errno.  Nothing is released on the
 * error path of this function; vp_find_vqs_msix() calls vp_del_vqs() when it
 * sees a failure, which frees what was set up here.
 */
static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
                                   bool per_vq_vectors, struct irq_affinity *desc)
{
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
        const char *name = dev_name(&vp_dev->vdev.dev);
        unsigned int flags = PCI_IRQ_MSIX;
        unsigned int i, v;
        int err = -ENOMEM;

        /* recorded up front so vp_del_vqs() knows how many masks to free */
        vp_dev->msix_vectors = nvectors;

        vp_dev->msix_names = kmalloc_array(nvectors,
                                           sizeof(*vp_dev->msix_names),
                                           GFP_KERNEL);
        if (!vp_dev->msix_names)
                goto error;
        vp_dev->msix_affinity_masks
                = kcalloc(nvectors, sizeof(*vp_dev->msix_affinity_masks),
                          GFP_KERNEL);
        if (!vp_dev->msix_affinity_masks)
                goto error;
        for (i = 0; i < nvectors; ++i)
                if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i],
                                        GFP_KERNEL))
                        goto error;

        if (desc) {
                flags |= PCI_IRQ_AFFINITY;
                desc->pre_vectors++; /* virtio config vector */
        }

        err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors,
                                             nvectors, flags, desc);
        if (err < 0)
                goto error;
        vp_dev->msix_enabled = 1;

        /* Set the vector used for configuration */
        v = vp_dev->msix_used_vectors;
        snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
                 "%s-config", name);
        err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
                          vp_config_changed, 0, vp_dev->msix_names[v],
                          vp_dev);
        if (err)
                goto error;
        ++vp_dev->msix_used_vectors;

        /* hand the vector to the device through the config_vector callback */
        v = vp_dev->config_vector(vp_dev, v);
        /* Verify we had enough resources to assign the vector */
        if (v == VIRTIO_MSI_NO_VECTOR) {
                err = -EBUSY;
                goto error;
        }

        if (!per_vq_vectors) {
                /* Shared vector for all VQs */
                v = vp_dev->msix_used_vectors;
                snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
                         "%s-virtqueues", name);
                err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
                                  vp_vring_interrupt, 0, vp_dev->msix_names[v],
                                  vp_dev);
                if (err)
                        goto error;
                ++vp_dev->msix_used_vectors;
        }
        return 0;
error:
        return err;
}

/*
 * Create one virtqueue through the device's setup_vq callback and record its
 * bookkeeping structure in vp_dev->vqs[index].
 *
 * Queues with a callback are linked onto vp_dev->virtqueues (under the
 * device lock); queues without one only get their list node initialised.
 *
 * Returns the new virtqueue, ERR_PTR(-ENOMEM) if the bookkeeping structure
 * cannot be allocated, or the error pointer returned by setup_vq.
 */
static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index,
                                     void (*callback)(struct virtqueue *vq),
                                     const char *name,
                                     bool ctx,
                                     u16 msix_vec)
{
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
        struct virtio_pci_vq_info *vq_info;
        struct virtqueue *vq;
        unsigned long irqflags;

        /* fill out our structure that represents an active queue */
        vq_info = kmalloc(sizeof(*vq_info), GFP_KERNEL);
        if (!vq_info)
                return ERR_PTR(-ENOMEM);

        vq = vp_dev->setup_vq(vp_dev, vq_info, index, callback, name, ctx,
                              msix_vec);
        if (IS_ERR(vq)) {
                kfree(vq_info);
                return vq;
        }

        vq_info->vq = vq;
        if (!callback) {
                INIT_LIST_HEAD(&vq_info->node);
        } else {
                spin_lock_irqsave(&vp_dev->lock, irqflags);
                list_add(&vq_info->node, &vp_dev->virtqueues);
                spin_unlock_irqrestore(&vp_dev->lock, irqflags);
        }

        vp_dev->vqs[index] = vq_info;

        return vq;
}

/*
 * Tear down a single virtqueue: unlink its bookkeeping entry from the
 * device's list (unless the queue is in the reset state), call the device's
 * del_vq callback and free the entry.
 */
static void vp_del_vq(struct virtqueue *vq)
{
        struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
        struct virtio_pci_vq_info *vq_info = vp_dev->vqs[vq->index];
        unsigned long irqflags;

        /*
         * If re-enabling a reset vq failed, info->node was not rejoined to
         * the list (which prevents unexpected irqs), so there is nothing to
         * unlink in that case.
         */
        if (vq->reset)
                goto release;

        spin_lock_irqsave(&vp_dev->lock, irqflags);
        list_del(&vq_info->node);
        spin_unlock_irqrestore(&vp_dev->lock, irqflags);

release:
        vp_dev->del_vq(vq_info);
        kfree(vq_info);
}

/*
 * the config->del_vqs() implementation
 *
 * Deletes every virtqueue of the device (except those the optional is_avq
 * callback claims), then releases all interrupt resources: per-vq irqs, the
 * INTx irq, the config/shared MSI-X irqs, the affinity masks and the MSI-X
 * vectors themselves.  All bookkeeping fields are reset afterwards, so this
 * is also used as the common error-unwind path by the vp_find_vqs_*()
 * helpers in this file.
 */
void vp_del_vqs(struct virtio_device *vdev)
{
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
        struct virtqueue *vq, *n;
        int i;

        list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
                /* queues claimed by is_avq() are left alone here */
                if (vp_dev->is_avq && vp_dev->is_avq(vdev, vq->index))
                        continue;

                if (vp_dev->per_vq_vectors) {
                        int v = vp_dev->vqs[vq->index]->msix_vector;

                        /* per-vq irqs were requested with the vq as dev_id */
                        if (v != VIRTIO_MSI_NO_VECTOR) {
                                int irq = pci_irq_vector(vp_dev->pci_dev, v);

                                irq_update_affinity_hint(irq, NULL);
                                free_irq(irq, vq);
                        }
                }
                vp_del_vq(vq);
        }
        vp_dev->per_vq_vectors = false;

        if (vp_dev->intx_enabled) {
                free_irq(vp_dev->pci_dev->irq, vp_dev);
                vp_dev->intx_enabled = 0;
        }

        /* config (and shared vq) irqs were requested with vp_dev as dev_id */
        for (i = 0; i < vp_dev->msix_used_vectors; ++i)
                free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev);

        if (vp_dev->msix_affinity_masks) {
                for (i = 0; i < vp_dev->msix_vectors; i++)
                        free_cpumask_var(vp_dev->msix_affinity_masks[i]);
        }

        if (vp_dev->msix_enabled) {
                /* Disable the vector used for configuration */
                vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR);

                pci_free_irq_vectors(vp_dev->pci_dev);
                vp_dev->msix_enabled = 0;
        }

        /* reset the bookkeeping so a later find_vqs attempt starts clean */
        vp_dev->msix_vectors = 0;
        vp_dev->msix_used_vectors = 0;
        kfree(vp_dev->msix_names);
        vp_dev->msix_names = NULL;
        kfree(vp_dev->msix_affinity_masks);
        vp_dev->msix_affinity_masks = NULL;
        kfree(vp_dev->vqs);
        vp_dev->vqs = NULL;
}

/*
 * Set up all virtqueues of a device using MSI-X interrupts.
 *
 * @vdev:           device to set up
 * @nvqs:           number of entries in @vqs, @callbacks and @names
 * @vqs:            output array; entries whose name is NULL are set to NULL
 * @callbacks:      per-queue callbacks; a NULL callback means the queue gets
 *                  no interrupt vector
 * @names:          per-queue names; a NULL name means the slot is skipped
 * @per_vq_vectors: true to give every queue that has a callback its own
 *                  vector, false to share a single vector among all queues
 * @ctx:            optional per-queue context flags (NULL means all false)
 * @desc:           optional affinity descriptor, only used with
 *                  @per_vq_vectors
 *
 * Returns 0 on success or a negative errno; on failure everything set up so
 * far is torn down through vp_del_vqs().
 */
static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs,
                struct virtqueue *vqs[], vq_callback_t *callbacks[],
                const char * const names[], bool per_vq_vectors,
                const bool *ctx,
                struct irq_affinity *desc)
{
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
        u16 msix_vec;
        int i, err, nvectors, allocated_vectors, queue_idx = 0;

        vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL);
        if (!vp_dev->vqs)
                return -ENOMEM;

        if (per_vq_vectors) {
                /* Best option: one for change interrupt, one per vq. */
                nvectors = 1;
                for (i = 0; i < nvqs; ++i)
                        if (names[i] && callbacks[i])
                                ++nvectors;
        } else {
                /* Second best: one for change, shared for all vqs. */
                nvectors = 2;
        }

        err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors,
                                      per_vq_vectors ? desc : NULL);
        if (err)
                goto error_find;

        vp_dev->per_vq_vectors = per_vq_vectors;
        /* per-vq vectors are numbered after the ones already requested */
        allocated_vectors = vp_dev->msix_used_vectors;
        for (i = 0; i < nvqs; ++i) {
                if (!names[i]) {
                        vqs[i] = NULL;
                        continue;
                }

                if (!callbacks[i])
                        msix_vec = VIRTIO_MSI_NO_VECTOR;
                else if (vp_dev->per_vq_vectors)
                        msix_vec = allocated_vectors++;
                else
                        msix_vec = VP_MSIX_VQ_VECTOR;
                /* queue_idx only advances for named queues, unlike i */
                vqs[i] = vp_setup_vq(vdev, queue_idx++, callbacks[i], names[i],
                                     ctx ? ctx[i] : false,
                                     msix_vec);
                if (IS_ERR(vqs[i])) {
                        err = PTR_ERR(vqs[i]);
                        goto error_find;
                }

                if (!vp_dev->per_vq_vectors || msix_vec == VIRTIO_MSI_NO_VECTOR)
                        continue;

                /* allocate per-vq irq if available and necessary */
                snprintf(vp_dev->msix_names[msix_vec],
                         sizeof *vp_dev->msix_names,
                         "%s-%s",
                         dev_name(&vp_dev->vdev.dev), names[i]);
                err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec),
                                  vring_interrupt, 0,
                                  vp_dev->msix_names[msix_vec],
                                  vqs[i]);
                if (err) {
                        /*
                         * Delete this vq here, before vp_del_vqs() runs: its
                         * irq was never requested, so vp_del_vqs() must not
                         * see it and try to free that irq.
                         */
                        vp_del_vq(vqs[i]);
                        goto error_find;
                }
        }
        return 0;

error_find:
        vp_del_vqs(vdev);
        return err;
}

/*
 * Set up all virtqueues of a device using the shared legacy (INTx) interrupt
 * line.  No queue gets an MSI-X vector.
 *
 * Slots whose name is NULL are skipped and reported as NULL in @vqs; @ctx may
 * be NULL, in which case every queue is created with ctx == false.
 *
 * Returns 0 on success or a negative errno; on failure everything set up so
 * far is torn down through vp_del_vqs().
 */
static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs,
                struct virtqueue *vqs[], vq_callback_t *callbacks[],
                const char * const names[], const bool *ctx)
{
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
        int next_queue = 0;
        int err;
        int i;

        vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL);
        if (!vp_dev->vqs)
                return -ENOMEM;

        err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED,
                          dev_name(&vdev->dev), vp_dev);
        if (err)
                goto fail;

        vp_dev->intx_enabled = 1;
        vp_dev->per_vq_vectors = false;

        for (i = 0; i < nvqs; ++i) {
                struct virtqueue *vq;
                bool want_ctx;

                if (!names[i]) {
                        vqs[i] = NULL;
                        continue;
                }

                want_ctx = ctx ? ctx[i] : false;
                vq = vp_setup_vq(vdev, next_queue++, callbacks[i], names[i],
                                 want_ctx, VIRTIO_MSI_NO_VECTOR);
                vqs[i] = vq;
                if (IS_ERR(vq)) {
                        err = PTR_ERR(vq);
                        goto fail;
                }
        }

        return 0;

fail:
        vp_del_vqs(vdev);
        return err;
}

/*
 * the config->find_vqs() implementation
 *
 * Tries the interrupt schemes from best to worst:
 *   1. MSI-X with one vector per queue,
 *   2. MSI-X with one config vector and one vector shared by all queues,
 *   3. the regular (INTx) interrupt, if the PCI device has one.
 *
 * Returns 0 as soon as one scheme works.  If both MSI-X attempts fail and
 * there is no INTx interrupt, the error of the second MSI-X attempt is
 * returned; otherwise the result of the INTx attempt is returned.
 */
int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
                struct virtqueue *vqs[], vq_callback_t *callbacks[],
                const char * const names[], const bool *ctx,
                struct irq_affinity *desc)
{
        int err;

        /* Try MSI-X with one vector per queue. */
        err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, ctx, desc);

        /* Fallback: MSI-X with one vector for config, one shared for queues. */
        if (err)
                err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false,
                                       ctx, desc);
        if (!err)
                return 0;

        /* Is there an interrupt? If not give up. */
        if (!to_vp_device(vdev)->pci_dev->irq)
                return err;

        /* Finally fall back to regular interrupts. */
        return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, ctx);
}

const char *vp_bus_name(struct virtio_device *vdev)
{
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);

        return pci_name(vp_dev->pci_dev);
}

/*
 * Set up the affinity for a virtqueue:
 * - force the affinity for a per-vq vector
 * - OR over all affinities for a shared MSI vector
 * - ignore the request when INTx is in use
 *
 * A NULL @cpu_mask clears the affinity hint.  Returns -EINVAL for a queue
 * without a callback, 0 otherwise.
 */
int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask)
{
        struct virtio_device *vdev = vq->vdev;
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);
        struct virtio_pci_vq_info *vq_info = vp_dev->vqs[vq->index];
        struct cpumask *stored_mask;
        unsigned int irq;

        if (!vq->callback)
                return -EINVAL;

        /* Without MSI-X there is nothing to steer. */
        if (!vp_dev->msix_enabled)
                return 0;

        stored_mask = vp_dev->msix_affinity_masks[vq_info->msix_vector];
        irq = pci_irq_vector(vp_dev->pci_dev, vq_info->msix_vector);

        if (cpu_mask) {
                /* keep our own copy of the mask and apply that copy */
                cpumask_copy(stored_mask, cpu_mask);
                irq_set_affinity_and_hint(irq, stored_mask);
        } else {
                irq_update_affinity_hint(irq, NULL);
        }

        return 0;
}

const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index)
{
        struct virtio_pci_device *vp_dev = to_vp_device(vdev);

        if (!vp_dev->per_vq_vectors ||
            vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR)
                return NULL;

        return pci_irq_get_affinity(vp_dev->pci_dev,
                                    vp_dev->vqs[index]->msix_vector);
}

#ifdef CONFIG_PM_SLEEP
/*
 * Freeze the virtio device and, if that succeeded, disable the PCI device.
 * Returns the result of virtio_device_freeze().
 */
static int virtio_pci_freeze(struct device *dev)
{
        struct pci_dev *pci_dev = to_pci_dev(dev);
        struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
        int ret = virtio_device_freeze(&vp_dev->vdev);

        if (ret)
                return ret;

        pci_disable_device(pci_dev);
        return 0;
}

/*
 * Counterpart of virtio_pci_freeze(): re-enable the PCI device, turn bus
 * mastering back on and restore the virtio device.
 * Returns 0 on success or the first error encountered.
 */
static int virtio_pci_restore(struct device *dev)
{
        struct pci_dev *pci_dev = to_pci_dev(dev);
        struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
        int ret = pci_enable_device(pci_dev);

        if (!ret) {
                pci_set_master(pci_dev);
                ret = virtio_device_restore(&vp_dev->vdev);
        }

        return ret;
}

/*
 * Check whether the device advertises PCI_PM_CTRL_NO_SOFT_RESET in its power
 * management control/status register.
 *
 * virtio_pci_suspend() and virtio_pci_resume() use the answer to decide
 * whether the freeze/restore cycle can be skipped.
 *
 * Returns false when the device has no PM capability or the register cannot
 * be read, otherwise the state of the No_Soft_Reset bit.
 */
static bool vp_supports_pm_no_reset(struct device *dev)
{
        struct pci_dev *pci_dev = to_pci_dev(dev);
        u16 pmcsr;

        if (!pci_dev->pm_cap)
                return false;

        pci_read_config_word(pci_dev, pci_dev->pm_cap + PCI_PM_CTRL, &pmcsr);
        if (PCI_POSSIBLE_ERROR(pmcsr)) {
                /* terminate the message so it is emitted as a complete line */
                dev_err(dev, "Unable to query pmcsr\n");
                return false;
        }

        return pmcsr & PCI_PM_CTRL_NO_SOFT_RESET;
}

/*
 * Suspend callback: nothing to do when the device reports No_Soft_Reset,
 * otherwise go through the full freeze path.
 */
static int virtio_pci_suspend(struct device *dev)
{
        if (vp_supports_pm_no_reset(dev))
                return 0;

        return virtio_pci_freeze(dev);
}

/*
 * Resume callback: nothing to do when the device reports No_Soft_Reset,
 * otherwise go through the full restore path.
 */
static int virtio_pci_resume(struct device *dev)
{
        if (vp_supports_pm_no_reset(dev))
                return 0;

        return virtio_pci_restore(dev);
}

/*
 * Power-management callbacks.  Suspend/resume may skip the freeze/restore
 * cycle (see vp_supports_pm_no_reset()); the freeze/thaw and
 * poweroff/restore pairs always freeze and restore the device.
 */
static const struct dev_pm_ops virtio_pci_pm_ops = {
        .suspend = virtio_pci_suspend,
        .resume = virtio_pci_resume,
        .freeze = virtio_pci_freeze,
        .thaw = virtio_pci_restore,
        .poweroff = virtio_pci_freeze,
        .restore = virtio_pci_restore,
};
#endif


/*
 * Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF.
 *
 * The table matches every device ID under that vendor ID (PCI_ANY_ID), so
 * any narrowing to a device-ID range has to happen elsewhere
 * (NOTE(review): presumably in the legacy/modern probe helpers -- confirm).
 */
static const struct pci_device_id virtio_pci_id_table[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) },
        { 0 }
};

MODULE_DEVICE_TABLE(pci, virtio_pci_id_table);

/*
 * Release callback of the embedded struct device.
 *
 * struct device is a kobject, so the containing virtio_pci_device (which
 * holds the reference counter itself) must not be freed before this callback
 * runs; this is where the memory finally goes away.
 */
static void virtio_pci_release_dev(struct device *_d)
{
        struct virtio_pci_device *vp_dev = to_vp_device(dev_to_virtio(_d));

        kfree(vp_dev);
}

/*
 * PCI probe callback: allocate and initialise the virtio_pci_device, enable
 * the PCI device, run the modern and/or legacy probe and register the
 * resulting virtio device.
 *
 * Probe order depends on force_legacy:
 *  - set:   legacy first, modern as fallback on -ENODEV or -ENOMEM
 *  - clear: modern first, legacy as fallback on -ENODEV
 *
 * Returns 0 on success or a negative errno.
 */
static int virtio_pci_probe(struct pci_dev *pci_dev,
                            const struct pci_device_id *id)
{
        struct virtio_pci_device *vp_dev, *reg_dev = NULL;
        int rc;

        /* allocate our structure and fill it out */
        vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL);
        if (!vp_dev)
                return -ENOMEM;

        pci_set_drvdata(pci_dev, vp_dev);
        vp_dev->vdev.dev.parent = &pci_dev->dev;
        vp_dev->vdev.dev.release = virtio_pci_release_dev;
        vp_dev->pci_dev = pci_dev;
        INIT_LIST_HEAD(&vp_dev->virtqueues);
        spin_lock_init(&vp_dev->lock);

        /* enable the device */
        rc = pci_enable_device(pci_dev);
        if (rc)
                goto err_enable_device;

        if (force_legacy) {
                rc = virtio_pci_legacy_probe(vp_dev);
                /* Also try modern mode if we can't map BAR0 (no IO space). */
                if (rc == -ENODEV || rc == -ENOMEM)
                        rc = virtio_pci_modern_probe(vp_dev);
                if (rc)
                        goto err_probe;
        } else {
                rc = virtio_pci_modern_probe(vp_dev);
                if (rc == -ENODEV)
                        rc = virtio_pci_legacy_probe(vp_dev);
                if (rc)
                        goto err_probe;
        }

        pci_set_master(pci_dev);

        rc = register_virtio_device(&vp_dev->vdev);
        /*
         * Set even when registration failed: once register_virtio_device()
         * has been called, the error path below drops the reference with
         * put_device() (which ends in virtio_pci_release_dev()) instead of
         * calling kfree() directly.
         */
        reg_dev = vp_dev;
        if (rc)
                goto err_register;

        return 0;

err_register:
        if (vp_dev->is_legacy)
                virtio_pci_legacy_remove(vp_dev);
        else
                virtio_pci_modern_remove(vp_dev);
err_probe:
        pci_disable_device(pci_dev);
err_enable_device:
        if (reg_dev)
                put_device(&vp_dev->vdev.dev);
        else
                kfree(vp_dev);
        return rc;
}

static void virtio_pci_remove(struct pci_dev *pci_dev)
{
        struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
        struct device *dev = get_device(&vp_dev->vdev.dev);

        /*
         * Device is marked broken on surprise removal so that virtio upper
         * layers can abort any ongoing operation.
         */
        if (!pci_device_is_present(pci_dev))
                virtio_break_device(&vp_dev->vdev);

        pci_disable_sriov(pci_dev);

        unregister_virtio_device(&vp_dev->vdev);

        if (vp_dev->is_legacy)
                virtio_pci_legacy_remove(vp_dev);
        else
                virtio_pci_modern_remove(vp_dev);

        pci_disable_device(pci_dev);
        put_device(dev);
}

static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs)
{
        struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
        struct virtio_device *vdev = &vp_dev->vdev;
        int ret;

        if (!(vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK))
                return -EBUSY;

        if (!__virtio_test_bit(vdev, VIRTIO_F_SR_IOV))
                return -EINVAL;

        if (pci_vfs_assigned(pci_dev))
                return -EPERM;

        if (num_vfs == 0) {
                pci_disable_sriov(pci_dev);
                return 0;
        }

        ret = pci_enable_sriov(pci_dev, num_vfs);
        if (ret < 0)
                return ret;

        return num_vfs;
}

/*
 * The PCI driver description.  The PM callbacks are only wired up when
 * CONFIG_PM_SLEEP is enabled, matching the #ifdef around their definitions.
 */
static struct pci_driver virtio_pci_driver = {
        .name                = "virtio-pci",
        .id_table        = virtio_pci_id_table,
        .probe                = virtio_pci_probe,
        .remove                = virtio_pci_remove,
#ifdef CONFIG_PM_SLEEP
        .driver.pm        = &virtio_pci_pm_ops,
#endif
        .sriov_configure = virtio_pci_sriov_configure,
};

/*
 * Look up the virtio device of the physical function that @pdev belongs to.
 *
 * Returns NULL when pci_iov_get_pf_drvdata() reports an error for @pdev and
 * this driver; otherwise a pointer to the PF's struct virtio_device.
 */
struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev)
{
        struct virtio_pci_device *pf = pci_iov_get_pf_drvdata(pdev,
                                                            &virtio_pci_driver);

        return IS_ERR(pf) ? NULL : &pf->vdev;
}

/* Module entry/exit boilerplate for virtio_pci_driver, plus module metadata. */
module_pci_driver(virtio_pci_driver);

MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>");
MODULE_DESCRIPTION("virtio-pci");
MODULE_LICENSE("GPL");
MODULE_VERSION("1");



























    1 












































































































    1 










    1 

    1 






    1 
    1 




    1 










    1 

    1 
    1 




    1 
    1 
    1 


















    1 

















    2 






    1 











    1 













    1 

    1 








































    2 






    2 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/hash.c
 *
 * Copyright (C) 2002 by Theodore Ts'o
 */

#include <linux/fs.h>
#include <linux/unicode.h>
#include <linux/compiler.h>
#include <linux/bitops.h>
#include "ext4.h"

#define DELTA 0x9E3779B9

/*
 * Mix four input words into the first two words of the hash state using
 * 16 TEA-style rounds.
 *
 * @buf: hash state; only buf[0] and buf[1] are read and updated (the mixed
 *       values are added to them), buf[2] and buf[3] are left untouched
 * @in:  input words; in[0]..in[3] are used as the round keys
 */
static void TEA_transform(__u32 buf[4], __u32 const in[])
{
        const __u32 k0 = in[0], k1 = in[1], k2 = in[2], k3 = in[3];
        __u32 x = buf[0];
        __u32 y = buf[1];
        __u32 sum = 0;
        int round;

        for (round = 0; round < 16; round++) {
                sum += DELTA;
                x += ((y << 4) + k0) ^ (y + sum) ^ ((y >> 5) + k1);
                y += ((x << 4) + k2) ^ (x + sum) ^ ((x >> 5) + k3);
        }

        buf[0] += x;
        buf[1] += y;
}

/* F, G and H are basic MD4 functions: selection, majority, parity */
#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
#define H(x, y, z) ((x) ^ (y) ^ (z))

/*
 * The generic round function.  The application is so specific that
 * we don't bother protecting all the arguments with parens, as is generally
 * good macro practice, in favor of extra legibility.
 * Rotation is separate from addition to prevent recomputation
 */
#define ROUND(f, a, b, c, d, x, s)        \
        (a += f(b, c, d) + x, a = rol32(a, s))
/* Per-round additive constants; K2 and K3 are written in octal. */
#define K1 0
#define K2 013240474631UL       /* 0x5A827999 */
#define K3 015666365641UL       /* 0x6ED9EBA1 */

/*
 * Basic cut-down MD4 transform.  Returns only 32 bits of result.
 *
 * @buf: four-word hash state; the result of the three rounds is added to it
 *       in place
 * @in:  eight input words; each round consumes all eight, in a different
 *       order per round
 *
 * Returns the updated buf[1].
 */
static __u32 half_md4_transform(__u32 buf[4], __u32 const in[8])
{
        __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];

        /* Round 1 */
        ROUND(F, a, b, c, d, in[0] + K1,  3);
        ROUND(F, d, a, b, c, in[1] + K1,  7);
        ROUND(F, c, d, a, b, in[2] + K1, 11);
        ROUND(F, b, c, d, a, in[3] + K1, 19);
        ROUND(F, a, b, c, d, in[4] + K1,  3);
        ROUND(F, d, a, b, c, in[5] + K1,  7);
        ROUND(F, c, d, a, b, in[6] + K1, 11);
        ROUND(F, b, c, d, a, in[7] + K1, 19);

        /* Round 2 */
        ROUND(G, a, b, c, d, in[1] + K2,  3);
        ROUND(G, d, a, b, c, in[3] + K2,  5);
        ROUND(G, c, d, a, b, in[5] + K2,  9);
        ROUND(G, b, c, d, a, in[7] + K2, 13);
        ROUND(G, a, b, c, d, in[0] + K2,  3);
        ROUND(G, d, a, b, c, in[2] + K2,  5);
        ROUND(G, c, d, a, b, in[4] + K2,  9);
        ROUND(G, b, c, d, a, in[6] + K2, 13);

        /* Round 3 */
        ROUND(H, a, b, c, d, in[3] + K3,  3);
        ROUND(H, d, a, b, c, in[7] + K3,  9);
        ROUND(H, c, d, a, b, in[2] + K3, 11);
        ROUND(H, b, c, d, a, in[6] + K3, 15);
        ROUND(H, a, b, c, d, in[1] + K3,  3);
        ROUND(H, d, a, b, c, in[5] + K3,  9);
        ROUND(H, c, d, a, b, in[0] + K3, 11);
        ROUND(H, b, c, d, a, in[4] + K3, 15);

        /* fold the working variables back into the state */
        buf[0] += a;
        buf[1] += b;
        buf[2] += c;
        buf[3] += d;

        return buf[1]; /* "most hashed" word */
}
/* The helper macros are local to the transform above. */
#undef ROUND
#undef K1
#undef K2
#undef K3
#undef F
#undef G
#undef H

/*
 * The old legacy hash, with every byte of the name taken as unsigned.
 *
 * name: bytes to hash; exactly len of them are read.
 * len:  number of bytes; 0 returns the hash of the initial state.
 *
 * Returns the final running value shifted left by one, so the low bit of
 * the result is always 0.
 */
static __u32 dx_hack_hash_unsigned(const char *name, int len)
{
        const unsigned char *cursor = (const unsigned char *) name;
        __u32 older = 0x37abe8f9;
        __u32 newer = 0x12a3fe2d;

        for (; len != 0; len--) {
                __u32 mixed = older + (newer ^ (((int) *cursor++) * 7152373));

                /* Fold values with the top bit set back below 2^31 */
                if (mixed & 0x80000000)
                        mixed -= 0x7fffffff;
                older = newer;
                newer = mixed;
        }
        return newer << 1;
}

/*
 * The old legacy hash, with every byte of the name taken as signed, so
 * bytes >= 0x80 contribute a negative multiplicand.
 *
 * name: bytes to hash; exactly len of them are read.
 * len:  number of bytes; 0 returns the hash of the initial state.
 *
 * Returns the final running value shifted left by one, so the low bit of
 * the result is always 0.
 */
static __u32 dx_hack_hash_signed(const char *name, int len)
{
        const signed char *cursor = (const signed char *) name;
        __u32 older = 0x37abe8f9;
        __u32 newer = 0x12a3fe2d;

        for (; len != 0; len--) {
                __u32 mixed = older + (newer ^ (((int) *cursor++) * 7152373));

                /* Fold values with the top bit set back below 2^31 */
                if (mixed & 0x80000000)
                        mixed -= 0x7fffffff;
                older = newer;
                newer = mixed;
        }
        return newer << 1;
}

/*
 * Pack up to num * 4 bytes of msg into num 32-bit words, treating each
 * byte as signed.
 *
 * Bytes are shifted in most-significant first.  Every word starts out as
 * a padding pattern made of the low byte of the original len repeated
 * four times, so a partially filled word keeps padding in its upper
 * bytes and any wholly unused words are pure padding.  Exactly num words
 * are written to buf (none if num <= 0).
 */
static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
{
        const signed char *bytes = (const signed char *) msg;
        __u32 fill = (__u32)len | ((__u32)len << 8);
        __u32 word;
        int pos;

        fill |= fill << 16;
        word = fill;

        /* Never consume more input than fits in the output words */
        if (len > num*4)
                len = num * 4;

        for (pos = 0; pos < len; pos++) {
                word = ((int) bytes[pos]) + (word << 8);
                if ((pos & 3) != 3)
                        continue;
                /* Four bytes gathered: emit the word and start a new one */
                *buf++ = word;
                word = fill;
                num--;
        }

        /* Emit the partial (or pure padding) word, then pad the rest */
        if (--num >= 0)
                *buf++ = word;
        while (--num >= 0)
                *buf++ = fill;
}

/*
 * Pack up to num * 4 bytes of msg into num 32-bit words, treating each
 * byte as unsigned.
 *
 * Bytes are shifted in most-significant first.  Every word starts out as
 * a padding pattern made of the low byte of the original len repeated
 * four times, so a partially filled word keeps padding in its upper
 * bytes and any wholly unused words are pure padding.  Exactly num words
 * are written to buf (none if num <= 0).
 */
static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
{
        const unsigned char *bytes = (const unsigned char *) msg;
        __u32 fill = (__u32)len | ((__u32)len << 8);
        __u32 word;
        int pos;

        fill |= fill << 16;
        word = fill;

        /* Never consume more input than fits in the output words */
        if (len > num*4)
                len = num * 4;

        for (pos = 0; pos < len; pos++) {
                word = ((int) bytes[pos]) + (word << 8);
                if ((pos & 3) != 3)
                        continue;
                /* Four bytes gathered: emit the word and start a new one */
                *buf++ = word;
                word = fill;
                num--;
        }

        /* Emit the partial (or pure padding) word, then pad the rest */
        if (--num >= 0)
                *buf++ = word;
        while (--num >= 0)
                *buf++ = fill;
}

/*
 * Compute the hash of a filename and store it in hinfo.
 *
 * The algorithm is chosen by hinfo->hash_version.  If len is 0 and name
 * is NULL, this function can be used to test whether or not a hash
 * version is supported: the legacy, half-MD4 and TEA paths never read
 * name when len is 0.
 *
 * The seed is a 4 longword (32 bits each) "secret" which can be used to
 * uniquify a hash.  When hinfo->seed is unset or all zeroes, a built-in
 * default seed is used instead.
 *
 * On success hinfo->hash receives the major hash (low bit cleared, and
 * never equal to EXT4_HTREE_EOF_32BIT << 1) and hinfo->minor_hash the
 * minor hash; the legacy variants report 0 for the minor hash.
 *
 * Returns 0 on success, -EINVAL for an unknown hash version (both hinfo
 * hashes are zeroed), or -1 when siphash is requested but the directory
 * has no encryption key.
 */
static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
                            struct dx_hash_info *hinfo)
{
        /* Default seed for the hash checksum functions */
        __u32 state[4] = { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 };
        __u32 words[8];
        __u32 major;
        __u32 minor = 0;
        const char *chunk;
        int i;
        void (*pack)(const char *, int, __u32 *, int) = str2hashbuf_signed;

        /* A seed that is present and not all zeroes overrides the default */
        if (hinfo->seed) {
                for (i = 0; i < 4 && !hinfo->seed[i]; i++)
                        ;
                if (i < 4)
                        memcpy(state, hinfo->seed, sizeof(state));
        }

        switch (hinfo->hash_version) {
        case DX_HASH_LEGACY_UNSIGNED:
                major = dx_hack_hash_unsigned(name, len);
                break;
        case DX_HASH_LEGACY:
                major = dx_hack_hash_signed(name, len);
                break;
        case DX_HASH_HALF_MD4_UNSIGNED:
                pack = str2hashbuf_unsigned;
                fallthrough;
        case DX_HASH_HALF_MD4:
                /* Consume the name 32 bytes (eight words) per transform */
                for (chunk = name; len > 0; chunk += 32, len -= 32) {
                        (*pack)(chunk, len, words, 8);
                        half_md4_transform(state, words);
                }
                major = state[1];
                minor = state[2];
                break;
        case DX_HASH_TEA_UNSIGNED:
                pack = str2hashbuf_unsigned;
                fallthrough;
        case DX_HASH_TEA:
                /* Consume the name 16 bytes (four words) per transform */
                for (chunk = name; len > 0; chunk += 16, len -= 16) {
                        (*pack)(chunk, len, words, 4);
                        TEA_transform(state, words);
                }
                major = state[0];
                minor = state[1];
                break;
        case DX_HASH_SIPHASH:
        {
                struct qstr qname = QSTR_INIT(name, len);
                __u64 combined_hash;

                if (!fscrypt_has_encryption_key(dir)) {
                        ext4_warning_inode(dir, "Siphash requires key");
                        return -1;
                }
                combined_hash = fscrypt_fname_siphash(dir, &qname);

                /* Upper half is the major hash, lower half the minor */
                major = (__u32)(combined_hash >> 32);
                minor = (__u32)combined_hash;
                break;
        }
        default:
                hinfo->hash = 0;
                hinfo->minor_hash = 0;
                ext4_warning(dir->i_sb,
                             "invalid/unsupported hash tree version %u",
                             hinfo->hash_version);
                return -EINVAL;
        }

        /* Keep the low bit clear and avoid the reserved EOF value */
        major &= ~1;
        if (major == (EXT4_HTREE_EOF_32BIT << 1))
                major = (EXT4_HTREE_EOF_32BIT - 1) << 1;
        hinfo->hash = major;
        hinfo->minor_hash = minor;
        return 0;
}

/*
 * Hash a directory entry name, case-folding it first when required.
 *
 * With CONFIG_UNICODE enabled, a non-empty name in a casefolded directory
 * is folded with utf8_casefold() into a temporary PATH_MAX-byte buffer and
 * the folded bytes are hashed; this is only attempted when the directory
 * is unencrypted or its encryption key is available.  If folding fails,
 * or none of the above applies, the name is hashed as given.
 *
 * Returns the result of __ext4fs_dirhash(), or -ENOMEM if the temporary
 * buffer cannot be allocated.
 */
int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
                   struct dx_hash_info *hinfo)
{
#if IS_ENABLED(CONFIG_UNICODE)
        const struct unicode_map *um = dir->i_sb->s_encoding;

        if (len && IS_CASEFOLDED(dir) &&
            (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) {
                struct qstr qstr = {.name = name, .len = len };
                unsigned char *folded;
                int folded_len;

                folded = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
                if (!folded)
                        return -ENOMEM;

                folded_len = utf8_casefold(um, &qstr, folded, PATH_MAX);
                if (folded_len >= 0) {
                        int ret = __ext4fs_dirhash(dir, folded, folded_len,
                                                   hinfo);

                        kfree(folded);
                        return ret;
                }

                /* Folding failed: hash the name as an opaque sequence */
                kfree(folded);
        }
#endif
        return __ext4fs_dirhash(dir, name, len, hinfo);
}









































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Access to user system call parameters and results
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.  All rights reserved.
 *
 * See asm-generic/syscall.h for descriptions of what we must do here.
 */

#ifndef _ASM_X86_SYSCALL_H
#define _ASM_X86_SYSCALL_H

#include <uapi/linux/audit.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <asm/thread_info.h>        /* for TS_COMPAT */
#include <asm/unistd.h>

/* This is used purely for kernel/trace/trace_syscalls.c */
/* Handler type: takes a pointer to const struct pt_regs, returns long. */
typedef long (*sys_call_ptr_t)(const struct pt_regs *);
extern const sys_call_ptr_t sys_call_table[];

/*
 * These may not exist, but still put the prototypes in so we
 * can use IS_ENABLED().
 *
 * NOTE(review): presumably one dispatcher per syscall ABI (ia32, x32,
 * x64), each selecting the handler for syscall number nr -- confirm
 * against the definitions.
 */
extern long ia32_sys_call(const struct pt_regs *, unsigned int nr);
extern long x32_sys_call(const struct pt_regs *, unsigned int nr);
extern long x64_sys_call(const struct pt_regs *, unsigned int nr);

/*
 * Only the low 32 bits of orig_ax are meaningful, so we return int.
 * This importantly ignores the high bits on 64-bit, so comparisons
 * sign-extend the low 32 bits.
 *
 * Returns regs->orig_ax converted to int.  task is not used.
 */
static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
{
        return regs->orig_ax;
}

/*
 * Overwrite regs->ax with regs->orig_ax.  task is not used.
 * NOTE(review): presumably orig_ax still holds the value ax had on
 * syscall entry, so this undoes any result stored since -- confirm
 * against the entry code and asm-generic/syscall.h.
 */
static inline void syscall_rollback(struct task_struct *task,
                                    struct pt_regs *regs)
{
        regs->ax = regs->orig_ax;
}

/*
 * Report the error code held in regs->ax, or 0 if ax does not hold one.
 *
 * A value counts as an error when IS_ERR_VALUE() says so.  With
 * CONFIG_IA32_EMULATION, a task whose status has TS_COMPAT or
 * TS_I386_REGS_POKED set gets its value sign-extended from 32 bits
 * before that check.
 */
static inline long syscall_get_error(struct task_struct *task,
                                     struct pt_regs *regs)
{
        unsigned long result = regs->ax;

#ifdef CONFIG_IA32_EMULATION
        /*
         * TS_COMPAT is set for 32-bit syscall entries and then
         * remains set until we return to user mode.
         *
         * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
         * and will match correctly in comparisons.
         */
        if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
                result = (long) (int) result;
#endif
        if (!IS_ERR_VALUE(result))
                return 0;

        return result;
}

/*
 * Return the current contents of regs->ax as a long, with no error
 * filtering or sign extension.  task is not used.
 */
static inline long syscall_get_return_value(struct task_struct *task,
                                            struct pt_regs *regs)
{
        return regs->ax;
}

static inline void syscall_set_return_value(struct task_struct *task,
                                            struct pt_regs *regs,
                                            int error, long val)
{
        regs->ax = (long) error ?: val;
}

#ifdef CONFIG_X86_32

/*
 * Copy the six syscall argument registers into args[0..5] (32-bit build).
 *
 * A single memcpy() starting at regs->bx copies 6 * sizeof(args[0])
 * bytes, so this relies on the six argument registers being stored
 * contiguously in struct pt_regs beginning at bx.
 * NOTE(review): presumably the order is bx, cx, dx, si, di, bp, matching
 * the TS_COMPAT path of the 64-bit variant -- confirm against
 * struct pt_regs.
 * task is not used.
 */
static inline void syscall_get_arguments(struct task_struct *task,
                                         struct pt_regs *regs,
                                         unsigned long *args)
{
        memcpy(args, &regs->bx, 6 * sizeof(args[0]));
}

/*
 * In a CONFIG_X86_32 build every task reports AUDIT_ARCH_I386.
 * task is not used.
 */
static inline int syscall_get_arch(struct task_struct *task)
{
        return AUDIT_ARCH_I386;
}

#else         /* CONFIG_X86_64 */

/*
 * Copy the six syscall argument registers into args[0..5] (64-bit build).
 *
 * With CONFIG_IA32_EMULATION, a task whose status has TS_COMPAT set reads
 * its arguments from bx, cx, dx, si, di, bp; every other task reads them
 * from di, si, dx, r10, r8, r9.
 */
static inline void syscall_get_arguments(struct task_struct *task,
                                         struct pt_regs *regs,
                                         unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
        if (task->thread_info.status & TS_COMPAT) {
                args[0] = regs->bx;
                args[1] = regs->cx;
                args[2] = regs->dx;
                args[3] = regs->si;
                args[4] = regs->di;
                args[5] = regs->bp;
                return;
        }
# endif
        args[0] = regs->di;
        args[1] = regs->si;
        args[2] = regs->dx;
        args[3] = regs->r10;
        args[4] = regs->r8;
        args[5] = regs->r9;
}

/*
 * Report the audit architecture of task (64-bit build): AUDIT_ARCH_I386
 * when CONFIG_IA32_EMULATION is enabled and the task's status has
 * TS_COMPAT set, AUDIT_ARCH_X86_64 otherwise.
 */
static inline int syscall_get_arch(struct task_struct *task)
{
        if (IS_ENABLED(CONFIG_IA32_EMULATION) &&
            (task->thread_info.status & TS_COMPAT))
                return AUDIT_ARCH_I386;

        /* x32 tasks should be considered AUDIT_ARCH_X86_64. */
        return AUDIT_ARCH_X86_64;
}

/*
 * Declared only when CONFIG_X86_32 is not set.
 * NOTE(review): presumably the 64-bit syscall entry handler and the
 * int $0x80 emulation entry point -- confirm against their definitions.
 */
bool do_syscall_64(struct pt_regs *regs, int nr);
void do_int80_emulation(struct pt_regs *regs);

#endif        /* CONFIG_X86_32 */

/*
 * Declared for both 32-bit and 64-bit builds (outside the CONFIG_X86_32
 * conditional).
 * NOTE(review): presumably the 32-bit syscall entry handlers (int $0x80,
 * fast syscall, SYSENTER) -- confirm against their definitions.
 */
void do_int80_syscall_32(struct pt_regs *regs);
bool do_fast_syscall_32(struct pt_regs *regs);
bool do_SYSENTER_32(struct pt_regs *regs);

#endif        /* _ASM_X86_SYSCALL_H */















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 










































    3 



    3 














































    3 



















    3 

    3 
    3 

































    3 










    3 
    3 
    3 

    3 










    3 





















    3 




    3 















    3 


    3 



















    3 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
// SPDX-License-Identifier: GPL-2.0-only
/*
 *      sd.c Copyright (C) 1992 Drew Eckhardt
 *           Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale
 *
 *      Linux scsi disk driver
 *              Initial versions: Drew Eckhardt
 *              Subsequent revisions: Eric Youngdale
 *        Modification history:
 *       - Drew Eckhardt <drew@colorado.edu> original
 *       - Eric Youngdale <eric@andante.org> add scatter-gather, multiple 
 *         outstanding request, and other enhancements.
 *         Support loadable low-level scsi drivers.
 *       - Jirka Hanika <geo@ff.cuni.cz> support more scsi disks using 
 *         eight major numbers.
 *       - Richard Gooch <rgooch@atnf.csiro.au> support devfs.
 *         - Torben Mathiasen <tmm@image.dk> Resource allocation fixes in 
 *           sd_init and cleanups.
 *         - Alex Davis <letmein@erols.com> Fix problem where partition info
 *           not being read in sd_open. Fix problem where removable media 
 *           could be ejected after sd_open.
 *         - Douglas Gilbert <dgilbert@interlog.com> cleanup for lk 2.5.x
 *         - Badari Pulavarty <pbadari@us.ibm.com>, Matthew Wilcox 
 *           <willy@debian.org>, Kurt Garloff <garloff@suse.de>: 
 *           Support 32k/1M disks.
 *
 *        Logging policy (needs CONFIG_SCSI_LOGGING defined):
 *         - setting up transfer: SCSI_LOG_HLQUEUE levels 1 and 2
 *         - end of transfer (bh + scsi_lib): SCSI_LOG_HLCOMPLETE level 1
 *         - entering sd_ioctl: SCSI_LOG_IOCTL level 1
 *         - entering other commands: SCSI_LOG_HLQUEUE level 3
 *        Note: when the logging level is set by the user, it must be greater
 *        than the level indicated above to trigger output.        
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bio.h>
#include <linux/hdreg.h>
#include <linux/errno.h>
#include <linux/idr.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/blk-pm.h>
#include <linux/delay.h>
#include <linux/rw_hint.h>
#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/string_helpers.h>
#include <linux/slab.h>
#include <linux/sed-opal.h>
#include <linux/pm_runtime.h>
#include <linux/pr.h>
#include <linux/t10-pi.h>
#include <linux/uaccess.h>
#include <asm/unaligned.h>

#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_dbg.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_devinfo.h>
#include <scsi/scsi_driver.h>
#include <scsi/scsi_eh.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/scsicam.h>
#include <scsi/scsi_common.h>

#include "sd.h"
#include "scsi_priv.h"
#include "scsi_logging.h"

MODULE_AUTHOR("Eric Youngdale");
MODULE_DESCRIPTION("SCSI disk (sd) driver");
MODULE_LICENSE("GPL");

MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK0_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK1_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK2_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK3_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK4_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK5_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK6_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK7_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK8_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK9_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK10_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK11_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK12_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK13_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK14_MAJOR);
MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK15_MAJOR);
MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK);
MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD);
MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC);
MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC);

#define SD_MINORS        16

static void sd_config_discard(struct scsi_disk *, unsigned int);
static void sd_config_write_same(struct scsi_disk *);
static int  sd_revalidate_disk(struct gendisk *);
static void sd_unlock_native_capacity(struct gendisk *disk);
static void sd_shutdown(struct device *);
static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
static void scsi_disk_release(struct device *cdev);

static DEFINE_IDA(sd_index_ida);

static mempool_t *sd_page_pool;
static struct lock_class_key sd_bio_compl_lkclass;

/*
 * Cache type names shown and accepted by the "cache_type" sysfs attribute.
 *
 * The array index encodes the cache control bits: bit 0 is RCD and bit 1 is
 * WCE.  cache_type_show() indexes the table with RCD + 2 * WCE and
 * cache_type_store() decodes the matched index the same way, so the order
 * of the entries must not be changed.
 */
static const char *sd_cache_types[] = {
        "write through", "none", "write back",
        "write back, no read (daft)"
};

/*
 * Tell the block layer about the disk's write cache configuration.
 *
 * A volatile write cache is reported when WCE is set; FUA support is only
 * reported when the write cache is enabled and the device sets DPOFUA.
 */
static void sd_set_flush_flag(struct scsi_disk *sdkp)
{
        bool write_cache = sdkp->WCE;
        bool fua = write_cache && sdkp->DPOFUA;

        blk_queue_write_cache(sdkp->disk->queue, write_cache, fua);
}

/*
 * sysfs store handler for "cache_type".
 *
 * Accepts one of the sd_cache_types[] strings, optionally prefixed with
 * "temporary ".  With the prefix, only the driver's cached WCE/RCD state and
 * the queue flush flags are updated.  Without it, the caching mode page
 * (page 0x08) is read with MODE SENSE, its WCE and RCD bits are rewritten,
 * the page is sent back with MODE SELECT and the disk is revalidated.
 *
 * Returns @count on success, -EINVAL for unsupported device types,
 * unrecognised strings, a mode page that does not fit the local buffer, or
 * a failed MODE SENSE / MODE SELECT.
 */
static ssize_t
cache_type_store(struct device *dev, struct device_attribute *attr,
                 const char *buf, size_t count)
{
        int ct, rcd, wce, sp;
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdp = sdkp->device;
        char buffer[64];
        char *buffer_data;
        struct scsi_mode_data data;
        struct scsi_sense_hdr sshdr;
        static const char temp[] = "temporary ";
        size_t offset;
        int len, ret;

        if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC)
                /* no cache control on RBC devices; theoretically they
                 * can do it, but there's probably so many exceptions
                 * it's not worth the risk */
                return -EINVAL;

        if (strncmp(buf, temp, sizeof(temp) - 1) == 0) {
                buf += sizeof(temp) - 1;
                sdkp->cache_override = 1;
        } else {
                sdkp->cache_override = 0;
        }

        ct = sysfs_match_string(sd_cache_types, buf);
        if (ct < 0)
                return -EINVAL;

        /* Table index: bit 0 selects RCD, bit 1 selects WCE. */
        rcd = ct & 0x01 ? 1 : 0;
        /* Never enable the write cache on a write-protected disk. */
        wce = (ct & 0x02) && !sdkp->write_prot ? 1 : 0;

        if (sdkp->cache_override) {
                sdkp->WCE = wce;
                sdkp->RCD = rcd;
                sd_set_flush_flag(sdkp);
                return count;
        }

        if (scsi_mode_sense(sdp, 0x08, 8, 0, buffer, sizeof(buffer), SD_TIMEOUT,
                            sdkp->max_retries, &data, NULL))
                return -EINVAL;

        /*
         * The header and block descriptor lengths come from the device.
         * Bytes 0 and 2 of the page are modified below, so the page must
         * start early enough for those bytes to lie inside buffer[].
         */
        offset = data.header_length + data.block_descriptor_length;
        if (offset + 3 > sizeof(buffer))
                return -EINVAL;

        /*
         * Only the bytes between the start of the page and the end of
         * buffer[] may be handed to MODE SELECT; clamping to the full
         * buffer size would describe a range running past the buffer.
         */
        len = min_t(size_t, sizeof(buffer) - offset, data.length - offset);
        buffer_data = buffer + offset;
        buffer_data[2] &= ~0x05;
        buffer_data[2] |= wce << 2 | rcd;
        /* Remember bit 7 of byte 0 as the SP argument, then clear it. */
        sp = buffer_data[0] & 0x80 ? 1 : 0;
        buffer_data[0] &= ~0x80;

        /*
         * Ensure WP, DPOFUA, and RESERVED fields are cleared in
         * received mode parameter buffer before doing MODE SELECT.
         */
        data.device_specific = 0;

        ret = scsi_mode_select(sdp, 1, sp, buffer_data, len, SD_TIMEOUT,
                               sdkp->max_retries, &data, &sshdr);
        if (ret) {
                if (ret > 0 && scsi_sense_valid(&sshdr))
                        sd_print_sense_hdr(sdkp, &sshdr);
                return -EINVAL;
        }
        sd_revalidate_disk(sdkp->disk);
        return count;
}

static ssize_t
manage_start_stop_show(struct device *dev,
                       struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdp = sdkp->device;

        return sysfs_emit(buf, "%u\n",
                          sdp->manage_system_start_stop &&
                          sdp->manage_runtime_start_stop &&
                          sdp->manage_shutdown);
}
static DEVICE_ATTR_RO(manage_start_stop);

static ssize_t
manage_system_start_stop_show(struct device *dev,
                              struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdp = sdkp->device;

        return sysfs_emit(buf, "%u\n", sdp->manage_system_start_stop);
}

/*
 * sysfs store handler for "manage_system_start_stop".
 *
 * Parses a boolean from @buf and stores it in the device's
 * manage_system_start_stop flag.  Returns @count on success, -EACCES
 * without CAP_SYS_ADMIN and -EINVAL when @buf is not a valid boolean.
 */
static ssize_t
manage_system_start_stop_store(struct device *dev,
                               struct device_attribute *attr,
                               const char *buf, size_t count)
{
        struct scsi_device *sdp = to_scsi_disk(dev)->device;
        bool enable;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (kstrtobool(buf, &enable) != 0)
                return -EINVAL;

        sdp->manage_system_start_stop = enable;
        return count;
}
static DEVICE_ATTR_RW(manage_system_start_stop);

static ssize_t
manage_runtime_start_stop_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdp = sdkp->device;

        return sysfs_emit(buf, "%u\n", sdp->manage_runtime_start_stop);
}

/*
 * sysfs store handler for "manage_runtime_start_stop".
 *
 * Parses a boolean from @buf and stores it in the device's
 * manage_runtime_start_stop flag.  Returns @count on success, -EACCES
 * without CAP_SYS_ADMIN and -EINVAL when @buf is not a valid boolean.
 */
static ssize_t
manage_runtime_start_stop_store(struct device *dev,
                                struct device_attribute *attr,
                                const char *buf, size_t count)
{
        struct scsi_device *sdp = to_scsi_disk(dev)->device;
        bool enable;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (kstrtobool(buf, &enable) != 0)
                return -EINVAL;

        sdp->manage_runtime_start_stop = enable;
        return count;
}
static DEVICE_ATTR_RW(manage_runtime_start_stop);

static ssize_t manage_shutdown_show(struct device *dev,
                                    struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdp = sdkp->device;

        return sysfs_emit(buf, "%u\n", sdp->manage_shutdown);
}

/*
 * sysfs store handler for "manage_shutdown".
 *
 * Parses a boolean from @buf and stores it in the device's manage_shutdown
 * flag.  Returns @count on success, -EACCES without CAP_SYS_ADMIN and
 * -EINVAL when @buf is not a valid boolean.
 */
static ssize_t manage_shutdown_store(struct device *dev,
                                     struct device_attribute *attr,
                                     const char *buf, size_t count)
{
        struct scsi_device *sdp = to_scsi_disk(dev)->device;
        bool enable;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (kstrtobool(buf, &enable) != 0)
                return -EINVAL;

        sdp->manage_shutdown = enable;
        return count;
}
static DEVICE_ATTR_RW(manage_shutdown);

/*
 * sysfs show handler: emit the device's allow_restart flag.
 *
 * Uses sysfs_emit() like the other show handlers in this file rather than
 * an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
allow_restart_show(struct device *dev, struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sysfs_emit(buf, "%u\n", sdkp->device->allow_restart);
}

/*
 * sysfs store handler for "allow_restart".
 *
 * Only TYPE_DISK and TYPE_ZBC devices accept the setting.  Returns @count
 * on success, -EACCES without CAP_SYS_ADMIN, and -EINVAL for other device
 * types or when @buf is not a valid boolean.
 */
static ssize_t
allow_restart_store(struct device *dev, struct device_attribute *attr,
                    const char *buf, size_t count)
{
        struct scsi_device *sdp = to_scsi_disk(dev)->device;
        bool allow;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (!(sdp->type == TYPE_DISK || sdp->type == TYPE_ZBC))
                return -EINVAL;

        if (kstrtobool(buf, &allow) != 0)
                return -EINVAL;

        sdp->allow_restart = allow;
        return count;
}
static DEVICE_ATTR_RW(allow_restart);

/*
 * sysfs show handler for "cache_type".
 *
 * Emits the sd_cache_types[] entry selected by the cached RCD (bit 0) and
 * WCE (bit 1) state.  Uses sysfs_emit() like the other show handlers in
 * this file rather than an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
cache_type_show(struct device *dev, struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        int ct = sdkp->RCD + 2*sdkp->WCE;

        return sysfs_emit(buf, "%s\n", sd_cache_types[ct]);
}
static DEVICE_ATTR_RW(cache_type);

/*
 * sysfs show handler: emit the disk's DPOFUA flag.
 *
 * Uses sysfs_emit() like the other show handlers in this file rather than
 * an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
FUA_show(struct device *dev, struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sysfs_emit(buf, "%u\n", sdkp->DPOFUA);
}
static DEVICE_ATTR_RO(FUA);

/*
 * sysfs show handler: emit the disk's protection_type value.
 *
 * Uses sysfs_emit() like the other show handlers in this file rather than
 * an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
protection_type_show(struct device *dev, struct device_attribute *attr,
                     char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sysfs_emit(buf, "%u\n", sdkp->protection_type);
}

/*
 * sysfs store handler for "protection_type".
 *
 * Parses a decimal unsigned integer from @buf.  Values up to and including
 * T10_PI_TYPE3_PROTECTION are stored; larger values are silently ignored
 * and the write still reports success.  Returns @count on success, -EACCES
 * without CAP_SYS_ADMIN, or the kstrtouint() error on a parse failure.
 */
static ssize_t
protection_type_store(struct device *dev, struct device_attribute *attr,
                      const char *buf, size_t count)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        unsigned int type;
        int rc;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        rc = kstrtouint(buf, 10, &type);
        if (rc != 0)
                return rc;

        /* Out-of-range types leave the current setting untouched. */
        if (type > T10_PI_TYPE3_PROTECTION)
                return count;

        sdkp->protection_type = type;
        return count;
}
static DEVICE_ATTR_RW(protection_type);

/*
 * sysfs show handler for the read-only "protection_mode" attribute.
 *
 * Queries the host's DIF and DIX capability for the disk's protection type.
 * If the host is not DIX capable for that type but is DIX capable for
 * Type 0, the result is reported as DIX with DIF 0.  Emits "none" when
 * neither is available, otherwise "dix<N>" or "dif<N>" where <N> is the DIF
 * capability value.
 *
 * Uses sysfs_emit() like the other show handlers in this file rather than
 * an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
protection_mode_show(struct device *dev, struct device_attribute *attr,
                     char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdp = sdkp->device;
        unsigned int dif, dix;

        dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type);
        dix = scsi_host_dix_capable(sdp->host, sdkp->protection_type);

        if (!dix && scsi_host_dix_capable(sdp->host, T10_PI_TYPE0_PROTECTION)) {
                dif = 0;
                dix = 1;
        }

        if (!dif && !dix)
                return sysfs_emit(buf, "none\n");

        return sysfs_emit(buf, "%s%u\n", dix ? "dix" : "dif", dif);
}
static DEVICE_ATTR_RO(protection_mode);

/*
 * sysfs show handler: emit the disk's ATO flag.
 *
 * Uses sysfs_emit() like the other show handlers in this file rather than
 * an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
app_tag_own_show(struct device *dev, struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sysfs_emit(buf, "%u\n", sdkp->ATO);
}
static DEVICE_ATTR_RO(app_tag_own);

/*
 * sysfs show handler: emit the disk's lbpme flag.
 *
 * Uses sysfs_emit() like the other show handlers in this file rather than
 * an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
thin_provisioning_show(struct device *dev, struct device_attribute *attr,
                       char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sysfs_emit(buf, "%u\n", sdkp->lbpme);
}
static DEVICE_ATTR_RO(thin_provisioning);

/*
 * Names of the logical block provisioning modes, indexed by the SD_LBP_*
 * constants.  Used by the "provisioning_mode" sysfs attribute both to print
 * the current mode and to parse a new one.
 *
 * sysfs_match_string() requires dense arrays
 */
static const char *lbp_mode[] = {
        [SD_LBP_FULL]                = "full",
        [SD_LBP_UNMAP]                = "unmap",
        [SD_LBP_WS16]                = "writesame_16",
        [SD_LBP_WS10]                = "writesame_10",
        [SD_LBP_ZERO]                = "writesame_zero",
        [SD_LBP_DISABLE]        = "disabled",
};

/*
 * sysfs show handler: emit the lbp_mode[] name of the disk's current
 * provisioning mode.
 *
 * Uses sysfs_emit() like the other show handlers in this file rather than
 * an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
provisioning_mode_show(struct device *dev, struct device_attribute *attr,
                       char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sysfs_emit(buf, "%s\n", lbp_mode[sdkp->provisioning_mode]);
}

/*
 * sysfs store handler for "provisioning_mode".
 *
 * On zoned disks the requested value is ignored: discard is configured as
 * SD_LBP_DISABLE and the write reports success.  Otherwise the device must
 * be TYPE_DISK and @buf must match one of the lbp_mode[] names, which is
 * then applied through sd_config_discard().
 *
 * Returns @count on success, -EACCES without CAP_SYS_ADMIN, and -EINVAL for
 * other device types or unrecognised mode names.
 */
static ssize_t
provisioning_mode_store(struct device *dev, struct device_attribute *attr,
                        const char *buf, size_t count)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdp = sdkp->device;
        int new_mode;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (sd_is_zoned(sdkp)) {
                new_mode = SD_LBP_DISABLE;
        } else {
                if (sdp->type != TYPE_DISK)
                        return -EINVAL;

                new_mode = sysfs_match_string(lbp_mode, buf);
                if (new_mode < 0)
                        return -EINVAL;
        }

        sd_config_discard(sdkp, new_mode);
        return count;
}
static DEVICE_ATTR_RW(provisioning_mode);

/*
 * Names of the zeroing modes, indexed by the SD_ZERO_* constants.  Used by
 * the "zeroing_mode" sysfs attribute both to print the current mode and to
 * parse a new one.
 *
 * sysfs_match_string() requires dense arrays
 */
static const char *zeroing_mode[] = {
        [SD_ZERO_WRITE]                = "write",
        [SD_ZERO_WS]                = "writesame",
        [SD_ZERO_WS16_UNMAP]        = "writesame_16_unmap",
        [SD_ZERO_WS10_UNMAP]        = "writesame_10_unmap",
};

/*
 * sysfs show handler: emit the zeroing_mode[] name of the disk's current
 * zeroing mode.
 *
 * Uses sysfs_emit() like the other show handlers in this file rather than
 * an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
zeroing_mode_show(struct device *dev, struct device_attribute *attr,
                  char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sysfs_emit(buf, "%s\n", zeroing_mode[sdkp->zeroing_mode]);
}

/*
 * sysfs store handler for "zeroing_mode".
 *
 * Matches @buf against the zeroing_mode[] names and records the matching
 * index as the disk's zeroing mode.  Returns @count on success, -EACCES
 * without CAP_SYS_ADMIN and -EINVAL for unrecognised names.
 */
static ssize_t
zeroing_mode_store(struct device *dev, struct device_attribute *attr,
                   const char *buf, size_t count)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        int idx;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        idx = sysfs_match_string(zeroing_mode, buf);
        if (idx < 0)
                return -EINVAL;

        sdkp->zeroing_mode = idx;
        return count;
}
static DEVICE_ATTR_RW(zeroing_mode);

/*
 * sysfs show handler: emit the disk's max_medium_access_timeouts value.
 *
 * Uses sysfs_emit() like the other show handlers in this file rather than
 * an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
max_medium_access_timeouts_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sysfs_emit(buf, "%u\n", sdkp->max_medium_access_timeouts);
}

/*
 * sysfs store handler for "max_medium_access_timeouts".
 *
 * Parses a decimal unsigned integer from @buf directly into the disk's
 * max_medium_access_timeouts field.  Returns @count on success, -EACCES
 * without CAP_SYS_ADMIN, or the kstrtouint() error on a parse failure.
 */
static ssize_t
max_medium_access_timeouts_store(struct device *dev,
                                 struct device_attribute *attr, const char *buf,
                                 size_t count)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        int rc;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        rc = kstrtouint(buf, 10, &sdkp->max_medium_access_timeouts);
        if (rc)
                return rc;

        return count;
}
static DEVICE_ATTR_RW(max_medium_access_timeouts);

/*
 * sysfs show handler: emit the disk's max_ws_blocks value.
 *
 * Uses sysfs_emit() like the other show handlers in this file rather than
 * an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
max_write_same_blocks_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sysfs_emit(buf, "%u\n", sdkp->max_ws_blocks);
}

/*
 * sysfs store handler for "max_write_same_blocks".
 *
 * Only TYPE_DISK and TYPE_ZBC devices accept the setting.  A value of 0
 * sets the device's no_write_same flag; a value up to SD_MAX_WS16_BLOCKS
 * clears the flag and becomes the new max_ws_blocks; larger values change
 * nothing.  In every case the WRITE SAME configuration is re-applied with
 * sd_config_write_same().
 *
 * Returns @count on success, -EACCES without CAP_SYS_ADMIN, -EINVAL for
 * other device types, or the kstrtoul() error on a parse failure.
 */
static ssize_t
max_write_same_blocks_store(struct device *dev, struct device_attribute *attr,
                            const char *buf, size_t count)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdp = sdkp->device;
        unsigned long blocks;
        int rc;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (!(sdp->type == TYPE_DISK || sdp->type == TYPE_ZBC))
                return -EINVAL;

        rc = kstrtoul(buf, 10, &blocks);
        if (rc)
                return rc;

        if (!blocks) {
                sdp->no_write_same = 1;
        } else if (blocks <= SD_MAX_WS16_BLOCKS) {
                sdp->no_write_same = 0;
                sdkp->max_ws_blocks = blocks;
        }

        sd_config_write_same(sdkp);

        return count;
}
static DEVICE_ATTR_RW(max_write_same_blocks);

/*
 * sysfs show handler for the read-only "zoned_cap" attribute.
 *
 * Emits "host-managed" for TYPE_ZBC devices, "host-aware" when the disk's
 * zoned field is 1, "drive-managed" when it is 2 and "none" otherwise.
 *
 * Uses sysfs_emit() like the other show handlers in this file rather than
 * an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
zoned_cap_show(struct device *dev, struct device_attribute *attr, char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        if (sdkp->device->type == TYPE_ZBC)
                return sysfs_emit(buf, "host-managed\n");
        if (sdkp->zoned == 1)
                return sysfs_emit(buf, "host-aware\n");
        if (sdkp->zoned == 2)
                return sysfs_emit(buf, "drive-managed\n");
        return sysfs_emit(buf, "none\n");
}
static DEVICE_ATTR_RO(zoned_cap);

/*
 * sysfs store handler for "max_retries".
 *
 * Parses a decimal integer from @buf.  Accepted values are
 * SCSI_CMD_RETRIES_NO_LIMIT and the range 0..SD_MAX_RETRIES; anything else,
 * including other negative numbers, is rejected with an error message.
 *
 * Returns @count on success, the kstrtoint() error on a parse failure, or
 * -EINVAL when the value is out of range.
 */
static ssize_t
max_retries_store(struct device *dev, struct device_attribute *attr,
                  const char *buf, size_t count)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);
        struct scsi_device *sdev = sdkp->device;
        int retries, err;

        err = kstrtoint(buf, 10, &retries);
        if (err)
                return err;

        /*
         * An upper bound alone would let every negative number through,
         * so the non-sentinel range needs an explicit lower bound too.
         */
        if (retries == SCSI_CMD_RETRIES_NO_LIMIT ||
            (retries >= 0 && retries <= SD_MAX_RETRIES)) {
                sdkp->max_retries = retries;
                return count;
        }

        sdev_printk(KERN_ERR, sdev, "max_retries must be between -1 and %d\n",
                    SD_MAX_RETRIES);
        return -EINVAL;
}

/*
 * sysfs show handler: emit the disk's max_retries value.
 *
 * Uses sysfs_emit() like the other show handlers in this file rather than
 * an unbounded sprintf() into the sysfs buffer.
 */
static ssize_t
max_retries_show(struct device *dev, struct device_attribute *attr,
                 char *buf)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        return sysfs_emit(buf, "%d\n", sdkp->max_retries);
}

static DEVICE_ATTR_RW(max_retries);

/*
 * NULL-terminated list of the sysfs attributes defined above.
 * ATTRIBUTE_GROUPS() derives sd_disk_groups from it, which sd_disk_class
 * installs as its default device attribute groups.
 */
static struct attribute *sd_disk_attrs[] = {
        &dev_attr_cache_type.attr,
        &dev_attr_FUA.attr,
        &dev_attr_allow_restart.attr,
        &dev_attr_manage_start_stop.attr,
        &dev_attr_manage_system_start_stop.attr,
        &dev_attr_manage_runtime_start_stop.attr,
        &dev_attr_manage_shutdown.attr,
        &dev_attr_protection_type.attr,
        &dev_attr_protection_mode.attr,
        &dev_attr_app_tag_own.attr,
        &dev_attr_thin_provisioning.attr,
        &dev_attr_provisioning_mode.attr,
        &dev_attr_zeroing_mode.attr,
        &dev_attr_max_write_same_blocks.attr,
        &dev_attr_max_medium_access_timeouts.attr,
        &dev_attr_zoned_cap.attr,
        &dev_attr_max_retries.attr,
        NULL,
};
ATTRIBUTE_GROUPS(sd_disk);

/*
 * The "scsi_disk" device class.  Devices of this class are released through
 * scsi_disk_release() and carry the sd_disk attribute groups by default.
 */
static struct class sd_disk_class = {
        .name                = "scsi_disk",
        .dev_release        = scsi_disk_release,
        .dev_groups        = sd_disk_groups,
};

/*
 * Intentionally empty probe callback taking a device number.
 *
 * Don't request a new module, as that could deadlock in multipath
 * environment.
 */
static void sd_default_probe(dev_t devt)
{
}

/*
 * Device number to disk mapping:
 *
 *       major         disc2     disc  p1
 *   |............|.............|....|....| <- dev_t
 *    31        20 19          8 7  4 3  0
 *
 * Inside a major, we have 16k disks, however mapped non-
 * contiguously. The first 16 disks are for major0, the next
 * ones with major1, ... Disk 256 is for major0 again, disk 272
 * for major1, ...
 * As we stay compatible with our numbering scheme, we can reuse
 * the well-known SCSI majors 8, 65--71, 136--143.
 *
 * sd_major() translates a major index in the range 0..15 into the
 * corresponding block major number; any other index triggers BUG().
 */
static int sd_major(int major_idx)
{
        if (major_idx == 0)
                return SCSI_DISK0_MAJOR;

        if (major_idx >= 1 && major_idx <= 7)
                return SCSI_DISK1_MAJOR + major_idx - 1;

        if (major_idx >= 8 && major_idx <= 15)
                return SCSI_DISK8_MAJOR + major_idx - 8;

        BUG();
        return 0;        /* shut up gcc */
}

#ifdef CONFIG_BLK_SED_OPAL
/*
 * Issue a SECURITY PROTOCOL IN or OUT command on behalf of the SED Opal
 * code.
 *
 * @data:   the struct scsi_disk the command is sent to
 * @spsp:   value stored big-endian in CDB bytes 2-3
 * @secp:   value stored in CDB byte 1
 * @buffer: data buffer for the transfer
 * @len:    transfer length, stored big-endian in CDB bytes 6-9
 * @send:   true for SECURITY PROTOCOL OUT, false for SECURITY PROTOCOL IN
 *
 * Returns the scsi_execute_cmd() result when it is zero or negative, and
 * -EIO when it is positive.
 */
static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer,
                size_t len, bool send)
{
        const struct scsi_exec_args exec_args = {
                .req_flags = BLK_MQ_REQ_PM,
        };
        struct scsi_disk *sdkp = data;
        struct scsi_device *sdev = sdkp->device;
        u8 cdb[12] = { 0, };
        int result;

        if (send)
                cdb[0] = SECURITY_PROTOCOL_OUT;
        else
                cdb[0] = SECURITY_PROTOCOL_IN;
        cdb[1] = secp;
        put_unaligned_be16(spsp, &cdb[2]);
        put_unaligned_be32(len, &cdb[6]);

        result = scsi_execute_cmd(sdev, cdb,
                                  send ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN,
                                  buffer, len, SD_TIMEOUT, sdkp->max_retries,
                                  &exec_args);
        if (result > 0)
                return -EIO;

        return result;
}
#endif /* CONFIG_BLK_SED_OPAL */

/*
 * Select the DIX operation for a command from its direction (read or
 * write) and from whether DIX and DIF are enabled.
 */
static unsigned int sd_prot_op(bool write, bool dix, bool dif)
{
        /* Table index: bit 2 = write, bit 1 = dix, bit 0 = dif */
        static const unsigned int ops[] = {
                [0] = SCSI_PROT_NORMAL,                /* read            */
                [1] = SCSI_PROT_READ_STRIP,        /* read,  dif      */
                [2] = SCSI_PROT_READ_INSERT,        /* read,  dix      */
                [3] = SCSI_PROT_READ_PASS,        /* read,  dix, dif */
                [4] = SCSI_PROT_NORMAL,                /* write           */
                [5] = SCSI_PROT_WRITE_INSERT,        /* write, dif      */
                [6] = SCSI_PROT_WRITE_STRIP,        /* write, dix      */
                [7] = SCSI_PROT_WRITE_PASS,        /* write, dix, dif */
        };
        unsigned int idx = 0;

        if (write)
                idx |= 1U << 2;
        if (dix)
                idx |= 1U << 1;
        if (dif)
                idx |= 1U;

        return ops[idx];
}

/*
 * Return the set of protection flags that are meaningful for the DIX
 * operation @prot_op.  The caller uses the result to strip flags that do
 * not apply to the selected operation.
 */
static unsigned int sd_prot_flag_mask(unsigned int prot_op)
{
        /* Indexed by SCSI_PROT_* operation */
        static const unsigned int valid_flags[] = {
                /* No protection: no flags apply */
                [SCSI_PROT_NORMAL] = 0,

                [SCSI_PROT_READ_STRIP] =
                        SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK |
                        SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT,

                [SCSI_PROT_READ_INSERT] =
                        SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM,

                [SCSI_PROT_READ_PASS] =
                        SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK |
                        SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT |
                        SCSI_PROT_IP_CHECKSUM,

                [SCSI_PROT_WRITE_INSERT] =
                        SCSI_PROT_TRANSFER_PI | SCSI_PROT_REF_INCREMENT,

                [SCSI_PROT_WRITE_STRIP] =
                        SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK |
                        SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM,

                [SCSI_PROT_WRITE_PASS] =
                        SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK |
                        SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT |
                        SCSI_PROT_IP_CHECKSUM,
        };
        unsigned int mask = valid_flags[prot_op];

        return mask;
}

static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd,
                                           unsigned int dix, unsigned int dif)
{
        struct request *rq = scsi_cmd_to_rq(scmd);
        struct bio *bio = rq->bio;
        unsigned int prot_op = sd_prot_op(rq_data_dir(rq), dix, dif);
        unsigned int protect = 0;

        if (dix) {                                /* DIX Type 0, 1, 2, 3 */
                if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM))
                        scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM;

                if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
                        scmd->prot_flags |= SCSI_PROT_GUARD_CHECK;
        }

        if (dif != T10_PI_TYPE3_PROTECTION) {        /* DIX/DIF Type 0, 1, 2 */
                scmd->prot_flags |= SCSI_PROT_REF_INCREMENT;

                if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
                        scmd->prot_flags |= SCSI_PROT_REF_CHECK;
        }

        if (dif) {                                /* DIX/DIF Type 1, 2, 3 */
                scmd->prot_flags |= SCSI_PROT_TRANSFER_PI;

                if (bio_integrity_flagged(bio, BIP_DISK_NOCHECK))
                        protect = 3 << 5;        /* Disable target PI checking */
                else
                        protect = 1 << 5;        /* Enable target PI checking */
        }

        scsi_set_prot_op(scmd, prot_op);
        scsi_set_prot_type(scmd, dif);
        scmd->prot_flags &= sd_prot_flag_mask(prot_op);

        return protect;
}

static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
{
        struct request_queue *q = sdkp->disk->queue;
        unsigned int logical_block_size = sdkp->device->sector_size;
        unsigned int max_blocks = 0;

        q->limits.discard_alignment =
                sdkp->unmap_alignment * logical_block_size;
        q->limits.discard_granularity =
                max(sdkp->physical_block_size,
                    sdkp->unmap_granularity * logical_block_size);
        sdkp->provisioning_mode = mode;

        switch (mode) {

        case SD_LBP_FULL:
        case SD_LBP_DISABLE:
                blk_queue_max_discard_sectors(q, 0);
                return;

        case SD_LBP_UNMAP:
                max_blocks = min_not_zero(sdkp->max_unmap_blocks,
                                          (u32)SD_MAX_WS16_BLOCKS);
                break;

        case SD_LBP_WS16:
                if (sdkp->device->unmap_limit_for_ws)
                        max_blocks = sdkp->max_unmap_blocks;
                else
                        max_blocks = sdkp->max_ws_blocks;

                max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS);
                break;

        case SD_LBP_WS10:
                if (sdkp->device->unmap_limit_for_ws)
                        max_blocks = sdkp->max_unmap_blocks;
                else
                        max_blocks = sdkp->max_ws_blocks;

                max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS);
                break;

        case SD_LBP_ZERO:
                max_blocks = min_not_zero(sdkp->max_ws_blocks,
                                          (u32)SD_MAX_WS10_BLOCKS);
                break;
        }

        blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 9));
}

/*
 * sd_set_special_bvec - attach a zeroed page as the request's special payload
 * @rq:       request that receives the payload
 * @data_len: number of bytes of the page to expose through rq->special_vec
 *
 * Takes a page from sd_page_pool with GFP_ATOMIC, clears it, installs it in
 * rq->special_vec and marks the request with RQF_SPECIAL_PAYLOAD.
 *
 * Returns the kernel virtual address of the payload, or NULL if no page
 * could be allocated.
 */
static void *sd_set_special_bvec(struct request *rq, unsigned int data_len)
{
        struct page *payload = mempool_alloc(sd_page_pool, GFP_ATOMIC);

        if (payload == NULL)
                return NULL;

        clear_highpage(payload);
        bvec_set_page(&rq->special_vec, payload, data_len, 0);
        rq->rq_flags |= RQF_SPECIAL_PAYLOAD;

        return bvec_virt(&rq->special_vec);
}

/*
 * sd_setup_unmap_cmnd - build an UNMAP command for a discard request
 * @cmd: command to fill in
 *
 * Builds a 10-byte UNMAP CDB and a 24-byte parameter buffer that describes
 * the single LBA range covered by the request.
 *
 * Returns BLK_STS_RESOURCE if the payload page cannot be allocated,
 * otherwise the result of scsi_alloc_sgtables().
 */
static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
{
        struct request *rq = scsi_cmd_to_rq(cmd);
        struct scsi_device *sdp = cmd->device;
        struct scsi_disk *sdkp = scsi_disk(rq->q->disk);
        const unsigned int data_len = 24;
        u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq));
        u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq));
        char *param = sd_set_special_bvec(rq, data_len);

        if (!param)
                return BLK_STS_RESOURCE;

        cmd->cmnd[0] = UNMAP;
        cmd->cmnd[8] = data_len;
        cmd->cmd_len = 10;

        /* Two 16-bit length fields, then one range: LBA @8, count @16 */
        put_unaligned_be16(6 + 16, param);
        put_unaligned_be16(16, param + 2);
        put_unaligned_be64(lba, param + 8);
        put_unaligned_be32(nr_blocks, param + 16);

        cmd->transfersize = data_len;
        cmd->allowed = sdkp->max_retries;
        rq->timeout = SD_TIMEOUT;

        return scsi_alloc_sgtables(cmd);
}

/*
 * sd_setup_write_same16_cmnd - build a WRITE SAME(16) command
 * @cmd:   command to fill in
 * @unmap: set the UNMAP bit (0x8 in CDB byte 1) and use the regular timeout
 *
 * The payload is one logical block of zeroes taken from sd_page_pool.
 *
 * Returns BLK_STS_RESOURCE if the payload page cannot be allocated,
 * otherwise the result of scsi_alloc_sgtables().
 */
static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd,
                bool unmap)
{
        struct request *rq = scsi_cmd_to_rq(cmd);
        struct scsi_device *sdp = cmd->device;
        struct scsi_disk *sdkp = scsi_disk(rq->q->disk);
        u32 payload_len = sdp->sector_size;
        u64 first_lba = sectors_to_logical(sdp, blk_rq_pos(rq));
        u32 block_count = sectors_to_logical(sdp, blk_rq_sectors(rq));

        if (sd_set_special_bvec(rq, payload_len) == NULL)
                return BLK_STS_RESOURCE;

        cmd->cmnd[0] = WRITE_SAME_16;
        cmd->cmd_len = 16;
        if (unmap)
                cmd->cmnd[1] = 0x8; /* UNMAP */
        put_unaligned_be64(first_lba, &cmd->cmnd[2]);
        put_unaligned_be32(block_count, &cmd->cmnd[10]);

        cmd->transfersize = payload_len;
        cmd->allowed = sdkp->max_retries;

        if (unmap)
                rq->timeout = SD_TIMEOUT;
        else
                rq->timeout = SD_WRITE_SAME_TIMEOUT;

        return scsi_alloc_sgtables(cmd);
}

/*
 * sd_setup_write_same10_cmnd - build a WRITE SAME(10) command
 * @cmd:   command to fill in
 * @unmap: set the UNMAP bit (0x8 in CDB byte 1) and use the regular timeout
 *
 * The CDB carries a 32-bit LBA and a 16-bit block count; wider values are
 * truncated by the stores below, so callers are expected to stay in range.
 * The payload is one logical block of zeroes taken from sd_page_pool.
 *
 * Returns BLK_STS_RESOURCE if the payload page cannot be allocated,
 * otherwise the result of scsi_alloc_sgtables().
 */
static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd,
                bool unmap)
{
        struct request *rq = scsi_cmd_to_rq(cmd);
        struct scsi_device *sdp = cmd->device;
        struct scsi_disk *sdkp = scsi_disk(rq->q->disk);
        u32 payload_len = sdp->sector_size;
        u64 first_lba = sectors_to_logical(sdp, blk_rq_pos(rq));
        u32 block_count = sectors_to_logical(sdp, blk_rq_sectors(rq));

        if (sd_set_special_bvec(rq, payload_len) == NULL)
                return BLK_STS_RESOURCE;

        cmd->cmnd[0] = WRITE_SAME;
        cmd->cmd_len = 10;
        if (unmap)
                cmd->cmnd[1] = 0x8; /* UNMAP */
        put_unaligned_be32(first_lba, &cmd->cmnd[2]);
        put_unaligned_be16(block_count, &cmd->cmnd[7]);

        cmd->transfersize = payload_len;
        cmd->allowed = sdkp->max_retries;

        if (unmap)
                rq->timeout = SD_TIMEOUT;
        else
                rq->timeout = SD_WRITE_SAME_TIMEOUT;

        return scsi_alloc_sgtables(cmd);
}

static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
{
        struct request *rq = scsi_cmd_to_rq(cmd);
        struct scsi_device *sdp = cmd->device;
        struct scsi_disk *sdkp = scsi_disk(rq->q->disk);
        u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq));
        u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq));

        if (!(rq->cmd_flags & REQ_NOUNMAP)) {
                switch (sdkp->zeroing_mode) {
                case SD_ZERO_WS16_UNMAP:
                        return sd_setup_write_same16_cmnd(cmd, true);
                case SD_ZERO_WS10_UNMAP:
                        return sd_setup_write_same10_cmnd(cmd, true);
                }
        }

        if (sdp->no_write_same) {
                rq->rq_flags |= RQF_QUIET;
                return BLK_STS_TARGET;
        }

        if (sdkp->ws16 || lba > 0xffffffff || nr_blocks > 0xffff)
                return sd_setup_write_same16_cmnd(cmd, false);

        return sd_setup_write_same10_cmnd(cmd, false);
}

static void sd_config_write_same(struct scsi_disk *sdkp)
{
        struct request_queue *q = sdkp->disk->queue;
        unsigned int logical_block_size = sdkp->device->sector_size;

        if (sdkp->device->no_write_same) {
                sdkp->max_ws_blocks = 0;
                goto out;
        }

        /* Some devices can not handle block counts above 0xffff despite
         * supporting WRITE SAME(16). Consequently we default to 64k
         * blocks per I/O unless the device explicitly advertises a
         * bigger limit.
         */
        if (sdkp->max_ws_blocks > SD_MAX_WS10_BLOCKS)
                sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks,
                                                   (u32)SD_MAX_WS16_BLOCKS);
        else if (sdkp->ws16 || sdkp->ws10 || sdkp->device->no_report_opcodes)
                sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks,
                                                   (u32)SD_MAX_WS10_BLOCKS);
        else {
                sdkp->device->no_write_same = 1;
                sdkp->max_ws_blocks = 0;
        }

        if (sdkp->lbprz && sdkp->lbpws)
                sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
        else if (sdkp->lbprz && sdkp->lbpws10)
                sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;
        else if (sdkp->max_ws_blocks)
                sdkp->zeroing_mode = SD_ZERO_WS;
        else
                sdkp->zeroing_mode = SD_ZERO_WRITE;

        if (sdkp->max_ws_blocks &&
            sdkp->physical_block_size > logical_block_size) {
                /*
                 * Reporting a maximum number of blocks that is not aligned
                 * on the device physical size would cause a large write same
                 * request to be split into physically unaligned chunks by
                 * __blkdev_issue_write_zeroes() even if the caller of this
                 * functions took care to align the large request. So make sure
                 * the maximum reported is aligned to the device physical block
                 * size. This is only an optional optimization for regular
                 * disks, but this is mandatory to avoid failure of large write
                 * same requests directed at sequential write required zones of
                 * host-managed ZBC disks.
                 */
                sdkp->max_ws_blocks =
                        round_down(sdkp->max_ws_blocks,
                                   bytes_to_logical(sdkp->device,
                                                    sdkp->physical_block_size));
        }

out:
        blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks *
                                         (logical_block_size >> 9));
}

static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd)
{
        struct request *rq = scsi_cmd_to_rq(cmd);
        struct scsi_disk *sdkp = scsi_disk(rq->q->disk);

        /* flush requests don't perform I/O, zero the S/G table */
        memset(&cmd->sdb, 0, sizeof(cmd->sdb));

        if (cmd->device->use_16_for_sync) {
                cmd->cmnd[0] = SYNCHRONIZE_CACHE_16;
                cmd->cmd_len = 16;
        } else {
                cmd->cmnd[0] = SYNCHRONIZE_CACHE;
                cmd->cmd_len = 10;
        }
        cmd->transfersize = 0;
        cmd->allowed = sdkp->max_retries;

        rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER;
        return BLK_STS_OK;
}

/**
 * sd_group_number() - Compute the GROUP NUMBER field
 * @cmd: SCSI command for which to compute the value of the six-bit GROUP NUMBER
 *        field.
 *
 * From SBC-5 r05 (https://www.t10.org/cgi-bin/ac.pl?t=f&f=sbc5r05.pdf):
 * 0: no relative lifetime.
 * 1: shortest relative lifetime.
 * 2: second shortest relative lifetime.
 * 3 - 0x3d: intermediate relative lifetimes.
 * 0x3e: second longest relative lifetime.
 * 0x3f: longest relative lifetime.
 */
static u8 sd_group_number(struct scsi_cmnd *cmd)
{
        const struct request *rq = scsi_cmd_to_rq(cmd);
        struct scsi_disk *sdkp = scsi_disk(rq->q->disk);

        if (!sdkp->rscs)
                return 0;

        return min3((u32)rq->write_hint, (u32)sdkp->permanent_stream_count,
                    0x3fu);
}

/*
 * sd_setup_rw32_cmnd - fill in a READ(32)/WRITE(32) variable length CDB
 * @cmd:       command to fill in
 * @write:     true for WRITE(32), false for READ(32)
 * @lba:       first logical block
 * @nr_blocks: number of logical blocks to transfer
 * @flags:     value stored in CDB byte 10 (protect and FUA bits)
 * @dld:       duration limit descriptor index; only the low 3 bits are used
 *
 * Always returns BLK_STS_OK.
 */
static blk_status_t sd_setup_rw32_cmnd(struct scsi_cmnd *cmd, bool write,
                                       sector_t lba, unsigned int nr_blocks,
                                       unsigned char flags, unsigned int dld)
{
        unsigned char *cdb = cmd->cmnd;

        cmd->cmd_len = SD_EXT_CDB_SIZE;

        cdb[0]  = VARIABLE_LENGTH_CMD;
        cdb[6]  = sd_group_number(cmd);
        cdb[7]  = 0x18; /* Additional CDB len */
        cdb[9]  = write ? WRITE_32 : READ_32;
        cdb[10] = flags;
        cdb[11] = dld & 0x07;

        put_unaligned_be64(lba, cdb + 12);
        put_unaligned_be32(lba, cdb + 20); /* Expected Indirect LBA */
        put_unaligned_be32(nr_blocks, cdb + 28);

        return BLK_STS_OK;
}

/*
 * sd_setup_rw16_cmnd - fill in a READ(16)/WRITE(16) CDB
 * @cmd:       command to fill in
 * @write:     true for WRITE(16), false for READ(16)
 * @lba:       first logical block (64 bits, CDB bytes 2-9)
 * @nr_blocks: number of logical blocks (32 bits, CDB bytes 10-13)
 * @flags:     protect and FUA bits for CDB byte 1
 * @dld:       duration limit descriptor index; bit 2 goes into byte 1 and
 *             bits 1:0 into the top two bits of byte 14
 *
 * Always returns BLK_STS_OK.
 */
static blk_status_t sd_setup_rw16_cmnd(struct scsi_cmnd *cmd, bool write,
                                       sector_t lba, unsigned int nr_blocks,
                                       unsigned char flags, unsigned int dld)
{
        unsigned char *cdb = cmd->cmnd;

        cmd->cmd_len = 16;

        cdb[0] = write ? WRITE_16 : READ_16;
        cdb[1] = flags | ((dld >> 2) & 0x01);
        put_unaligned_be64(lba, cdb + 2);
        put_unaligned_be32(nr_blocks, cdb + 10);
        cdb[14] = ((dld & 0x03) << 6) | sd_group_number(cmd);
        cdb[15] = 0;

        return BLK_STS_OK;
}

/*
 * sd_setup_rw10_cmnd - fill in a READ(10)/WRITE(10) CDB
 * @cmd:       command to fill in
 * @write:     true for WRITE(10), false for READ(10)
 * @lba:       first logical block; stored as 32 bits in CDB bytes 2-5
 * @nr_blocks: number of logical blocks; stored as 16 bits in CDB bytes 7-8
 * @flags:     protect and FUA bits for CDB byte 1
 *
 * Always returns BLK_STS_OK.
 */
static blk_status_t sd_setup_rw10_cmnd(struct scsi_cmnd *cmd, bool write,
                                       sector_t lba, unsigned int nr_blocks,
                                       unsigned char flags)
{
        unsigned char *cdb = cmd->cmnd;

        cmd->cmd_len = 10;

        cdb[0] = write ? WRITE_10 : READ_10;
        cdb[1] = flags;
        put_unaligned_be32(lba, cdb + 2);
        cdb[6] = sd_group_number(cmd);
        put_unaligned_be16(nr_blocks, cdb + 7);
        cdb[9] = 0;

        return BLK_STS_OK;
}

/*
 * sd_setup_rw6_cmnd - fill in a READ(6)/WRITE(6) CDB
 * @cmd:       command to fill in
 * @write:     true for WRITE(6), false for READ(6)
 * @lba:       first logical block; only the low 21 bits are encoded
 * @nr_blocks: number of logical blocks; stored in the single byte 4
 * @flags:     protect and FUA bits; the FUA bit (0x8) cannot be expressed
 *
 * Returns BLK_STS_IOERR for a zero block count or a FUA request,
 * BLK_STS_OK otherwise.
 */
static blk_status_t sd_setup_rw6_cmnd(struct scsi_cmnd *cmd, bool write,
                                      sector_t lba, unsigned int nr_blocks,
                                      unsigned char flags)
{
        unsigned char *cdb = cmd->cmnd;

        /* A count of 0 would be interpreted as 256 blocks; refuse it. */
        if (WARN_ON_ONCE(nr_blocks == 0))
                return BLK_STS_IOERR;

        /*
         * FUA here means the drive failed a 10-byte r/w command with
         * ILLEGAL_REQUEST during operation, which turned use_10_for_rw off.
         */
        if (unlikely(flags & 0x8)) {
                scmd_printk(KERN_ERR, cmd, "FUA write on READ/WRITE(6) drive\n");
                return BLK_STS_IOERR;
        }

        cmd->cmd_len = 6;

        cdb[0] = write ? WRITE_6 : READ_6;
        cdb[1] = (lba >> 16) & 0x1f;
        cdb[2] = (lba >> 8) & 0xff;
        cdb[3] = lba & 0xff;
        cdb[4] = nr_blocks;
        cdb[5] = 0;

        return BLK_STS_OK;
}

/*
 * sd_cdl_dld - pick the duration limit descriptor index for a command
 * @sdkp: disk the command is sent to
 * @scmd: command whose request I/O priority hint is inspected
 *
 * Returns 0 (no limit) when the device does not have both cdl_supported and
 * cdl_enable set, or when the request's I/O priority hint is not one of the
 * seven device duration limit hints. Otherwise returns 1..7, matching
 * IOPRIO_HINT_DEV_DURATION_LIMIT_1..7.
 */
static int sd_cdl_dld(struct scsi_disk *sdkp, struct scsi_cmnd *scmd)
{
        struct scsi_device *sdp = sdkp->device;
        int hint;

        if (!(sdp->cdl_supported && sdp->cdl_enable))
                return 0;

        hint = IOPRIO_PRIO_HINT(req_get_ioprio(scsi_cmd_to_rq(scmd)));

        /* Only the duration limit hints translate into a descriptor index */
        if (hint >= IOPRIO_HINT_DEV_DURATION_LIMIT_1 &&
            hint <= IOPRIO_HINT_DEV_DURATION_LIMIT_7)
                return (hint - IOPRIO_HINT_DEV_DURATION_LIMIT_1) + 1;

        return 0;
}

/*
 * sd_setup_read_write_cmnd - build the CDB for a read or write request
 * @cmd: command to fill in
 *
 * Allocates the S/G tables, validates the request against the device state,
 * the disk capacity and the logical block size, then builds a 32, 16, 10 or
 * 6 byte READ/WRITE CDB depending on the protection setup, the device flags
 * and the size of the LBA and block count.
 *
 * Returns BLK_STS_OK when the command is ready to be queued. On failure the
 * S/G tables are released again and the error is returned: the status from
 * scsi_alloc_sgtables() or from the CDB setup helper, or BLK_STS_IOERR for
 * the validation failures.
 */
static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd)
{
        struct request *rq = scsi_cmd_to_rq(cmd);
        struct scsi_device *sdp = cmd->device;
        struct scsi_disk *sdkp = scsi_disk(rq->q->disk);
        sector_t lba = sectors_to_logical(sdp, blk_rq_pos(rq));
        sector_t threshold;
        unsigned int nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq));
        /* Low bits that must be clear in a logical-block-aligned sector value */
        unsigned int mask = logical_to_sectors(sdp, 1) - 1;
        bool write = rq_data_dir(rq) == WRITE;
        unsigned char protect, fua;
        unsigned int dld;
        blk_status_t ret;
        unsigned int dif;
        bool dix;

        ret = scsi_alloc_sgtables(cmd);
        if (ret != BLK_STS_OK)
                return ret;

        /* Every validation failure below reports a plain I/O error */
        ret = BLK_STS_IOERR;
        if (!scsi_device_online(sdp) || sdp->changed) {
                scmd_printk(KERN_ERR, cmd, "device offline or changed\n");
                goto fail;
        }

        if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->q->disk)) {
                scmd_printk(KERN_ERR, cmd, "access beyond end of device\n");
                goto fail;
        }

        if ((blk_rq_pos(rq) & mask) || (blk_rq_sectors(rq) & mask)) {
                scmd_printk(KERN_ERR, cmd, "request not aligned to the logical block size\n");
                goto fail;
        }

        /*
         * Some SD card readers can't handle accesses which touch the
         * last one or two logical blocks. Split accesses as needed.
         */
        threshold = sdkp->capacity - SD_LAST_BUGGY_SECTORS;

        if (unlikely(sdp->last_sector_bug && lba + nr_blocks > threshold)) {
                if (lba < threshold) {
                        /* Access up to the threshold but not beyond */
                        nr_blocks = threshold - lba;
                } else {
                        /* Access only a single logical block */
                        nr_blocks = 1;
                }
        }

        /* 0x8 is the FUA bit in the CDB flags byte */
        fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0;
        /* dix is true when the command has protection S/G entries */
        dix = scsi_prot_sg_count(cmd);
        dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type);
        dld = sd_cdl_dld(sdkp, cmd);

        if (dif || dix)
                protect = sd_setup_protect_cmnd(cmd, dix, dif);
        else
                protect = 0;

        /*
         * Pick the CDB size: 32 bytes for protected Type 2 I/O, 16 bytes when
         * the device asks for it or the count exceeds 16 bits, 10 bytes when
         * the 6-byte form cannot express the request (count over 8 bits, LBA
         * over 21 bits, protect bits or a write hint) or the device asks for
         * it, and 6 bytes otherwise.
         */
        if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) {
                ret = sd_setup_rw32_cmnd(cmd, write, lba, nr_blocks,
                                         protect | fua, dld);
        } else if (sdp->use_16_for_rw || (nr_blocks > 0xffff)) {
                ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks,
                                         protect | fua, dld);
        } else if ((nr_blocks > 0xff) || (lba > 0x1fffff) ||
                   sdp->use_10_for_rw || protect || rq->write_hint) {
                ret = sd_setup_rw10_cmnd(cmd, write, lba, nr_blocks,
                                         protect | fua);
        } else {
                ret = sd_setup_rw6_cmnd(cmd, write, lba, nr_blocks,
                                        protect | fua);
        }

        if (unlikely(ret != BLK_STS_OK))
                goto fail;

        /*
         * We shouldn't disconnect in the middle of a sector, so with a dumb
         * host adapter, it's safe to assume that we can at least transfer
         * this many bytes between each connect / disconnect.
         */
        cmd->transfersize = sdp->sector_size;
        cmd->underflow = nr_blocks << 9;
        cmd->allowed = sdkp->max_retries;
        /* May be shorter than the request if last_sector_bug clamped nr_blocks */
        cmd->sdb.length = nr_blocks * sdp->sector_size;

        SCSI_LOG_HLQUEUE(1,
                         scmd_printk(KERN_INFO, cmd,
                                     "%s: block=%llu, count=%d\n", __func__,
                                     (unsigned long long)blk_rq_pos(rq),
                                     blk_rq_sectors(rq)));
        SCSI_LOG_HLQUEUE(2,
                         scmd_printk(KERN_INFO, cmd,
                                     "%s %d/%u 512 byte blocks.\n",
                                     write ? "writing" : "reading", nr_blocks,
                                     blk_rq_sectors(rq)));

        /*
         * This indicates that the command is ready from our end to be queued.
         */
        return BLK_STS_OK;
fail:
        /* Undo the scsi_alloc_sgtables() done at the top */
        scsi_free_sgtables(cmd);
        return ret;
}

/*
 * sd_init_command - build the SCSI command for a block layer request
 * @cmd: command to fill in; its request determines the operation
 *
 * Dispatches on the request operation to the matching setup helper. Discards
 * are further dispatched on the disk's provisioning mode.
 *
 * Returns the helper's status, BLK_STS_TARGET for a discard in a
 * provisioning mode that has no command, or BLK_STS_NOTSUPP (after a
 * one-time warning) for an operation not handled here.
 */
static blk_status_t sd_init_command(struct scsi_cmnd *cmd)
{
        struct request *rq = scsi_cmd_to_rq(cmd);
        unsigned int lbp_mode;

        switch (req_op(rq)) {
        case REQ_OP_DISCARD:
                lbp_mode = scsi_disk(rq->q->disk)->provisioning_mode;

                if (lbp_mode == SD_LBP_UNMAP)
                        return sd_setup_unmap_cmnd(cmd);
                if (lbp_mode == SD_LBP_WS16)
                        return sd_setup_write_same16_cmnd(cmd, true);
                if (lbp_mode == SD_LBP_WS10)
                        return sd_setup_write_same10_cmnd(cmd, true);
                if (lbp_mode == SD_LBP_ZERO)
                        return sd_setup_write_same10_cmnd(cmd, false);
                return BLK_STS_TARGET;

        case REQ_OP_WRITE_ZEROES:
                return sd_setup_write_zeroes_cmnd(cmd);

        case REQ_OP_FLUSH:
                return sd_setup_flush_cmnd(cmd);

        case REQ_OP_READ:
        case REQ_OP_WRITE:
                return sd_setup_read_write_cmnd(cmd);

        /* Zone management: the last argument selects "all zones" */
        case REQ_OP_ZONE_RESET:
                return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
                                                   false);
        case REQ_OP_ZONE_RESET_ALL:
                return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
                                                   true);
        case REQ_OP_ZONE_OPEN:
                return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_OPEN_ZONE, false);
        case REQ_OP_ZONE_CLOSE:
                return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_CLOSE_ZONE, false);
        case REQ_OP_ZONE_FINISH:
                return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_FINISH_ZONE, false);

        default:
                WARN_ON_ONCE(1);
                return BLK_STS_NOTSUPP;
        }
}

/*
 * sd_uninit_command - release per-command resources
 * @SCpnt: command being torn down
 *
 * Returns the special payload page to sd_page_pool if the request carries
 * one (RQF_SPECIAL_PAYLOAD set).
 */
static void sd_uninit_command(struct scsi_cmnd *SCpnt)
{
        struct request *rq = scsi_cmd_to_rq(SCpnt);

        if (!(rq->rq_flags & RQF_SPECIAL_PAYLOAD))
                return;

        mempool_free(rq->special_vec.bv_page, sd_page_pool);
}

/*
 * sd_need_revalidate - decide whether the disk must be revalidated on open
 * @disk: gendisk being opened
 * @sdkp: SCSI disk backing @disk
 *
 * Returns true if a removable or write-protected device reports a media
 * change, or if GD_NEED_PART_SCAN is set in the disk state.
 */
static bool sd_need_revalidate(struct gendisk *disk, struct scsi_disk *sdkp)
{
        bool poll_media = sdkp->device->removable || sdkp->write_prot;

        if (poll_media && disk_check_media_change(disk))
                return true;

        /*
         * ioctl(BLKRRPART) forces a full rescan. The disk state has nothing
         * to do with partitions, but for historical reasons BLKRRPART is the
         * way to request a full revalidate after things like a format.
         */
        return test_bit(GD_NEED_PART_SCAN, &disk->state);
}

/**
 *        sd_open - open a scsi disk device
 *        @disk: disk to open
 *        @mode: open mode
 *
 *        Returns 0 if successful. Returns a negated errno value in case
 *        of error: -ENXIO if the device cannot be referenced, is offline
 *        or failed error recovery, -ENOMEDIUM if a removable drive is
 *        empty and BLK_OPEN_NDELAY was not requested, -EROFS for a write
 *        open of a write-protected disk.
 *
 *        Note: This can be called from a user context (e.g. fsck(1) )
 *        or from within the kernel (e.g. as a result of a mount(1) ).
 *
 *        Locking: called with disk->open_mutex held.
 **/
static int sd_open(struct gendisk *disk, blk_mode_t mode)
{
        struct scsi_disk *sdkp = scsi_disk(disk);
        struct scsi_device *sdev = sdkp->device;
        int retval;

        if (scsi_device_get(sdev))
                return -ENXIO;

        SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n"));

        /*
         * Wait for any error recovery in progress to finish. An offline
         * device must not be accessed at all.
         */
        if (!scsi_block_when_processing_errors(sdev)) {
                retval = -ENXIO;
                goto error_out;
        }

        if (sd_need_revalidate(disk, sdkp))
                sd_revalidate_disk(disk);

        /* An empty drive fails the open unless the caller asked for NDELAY. */
        if (sdev->removable && !sdkp->media_present &&
            !(mode & BLK_OPEN_NDELAY)) {
                retval = -ENOMEDIUM;
                goto error_out;
        }

        /* Write-protected media cannot be opened for writing. */
        if (sdkp->write_prot && (mode & BLK_OPEN_WRITE)) {
                retval = -EROFS;
                goto error_out;
        }

        /*
         * The media change handling above may have taken the device
         * offline. Report that instead of pretending the open succeeded.
         */
        if (!scsi_device_online(sdev)) {
                retval = -ENXIO;
                goto error_out;
        }

        /* First opener of a removable device locks the medium in place. */
        if (atomic_inc_return(&sdkp->openers) == 1 && sdev->removable &&
            scsi_block_when_processing_errors(sdev))
                scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT);

        return 0;

error_out:
        /* Drop the reference taken by scsi_device_get() above */
        scsi_device_put(sdev);
        return retval;
}

/**
 *        sd_release - invoked when the (last) close(2) is called on this
 *        scsi disk.
 *        @disk: disk to release
 *
 *        Drops one opener. When the last opener of a removable device
 *        goes away, medium removal is allowed again. The device
 *        reference taken at open time is released in all cases.
 *
 *        Note: may block (uninterruptible) if error recovery is underway
 *        on this disk.
 *
 *        Locking: called with disk->open_mutex held.
 **/
static void sd_release(struct gendisk *disk)
{
        struct scsi_disk *sdkp = scsi_disk(disk);
        struct scsi_device *sdev = sdkp->device;

        SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_release\n"));

        if (atomic_dec_return(&sdkp->openers) == 0 && sdev->removable &&
            scsi_block_when_processing_errors(sdev))
                scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW);

        scsi_device_put(sdev);
}

/*
 * sd_getgeo - report a heads/sectors/cylinders geometry for the disk
 * @bdev: block device being queried
 * @geo:  geometry structure to fill in
 *
 * Starts from 64 heads, 32 sectors per track and capacity >> 11 cylinders,
 * then lets the host template's bios_param() hook, or scsicam_bios_param()
 * when the host has none, override those values.
 *
 * Always returns 0.
 */
static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
        struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
        struct scsi_device *sdp = sdkp->device;
        struct Scsi_Host *host = sdp->host;
        sector_t capacity = logical_to_sectors(sdp, sdkp->capacity);
        int diskinfo[4];

        /* Defaults: heads, sectors per track, cylinders */
        diskinfo[0] = 0x40;        /* 1 << 6 */
        diskinfo[1] = 0x20;        /* 1 << 5 */
        diskinfo[2] = capacity >> 11;

        /* Let the driver, or the generic CAM mapping, override them */
        if (!host->hostt->bios_param)
                scsicam_bios_param(bdev, capacity, diskinfo);
        else
                host->hostt->bios_param(sdp, bdev, capacity, diskinfo);

        geo->heads = diskinfo[0];
        geo->sectors = diskinfo[1];
        geo->cylinders = diskinfo[2];

        return 0;
}

/**
 *        sd_ioctl - process an ioctl
 *        @bdev: target block device
 *        @mode: open mode
 *        @cmd: ioctl command number
 *        @arg: this is third argument given to ioctl(2) system call.
 *        Often contains a pointer.
 *
 *        Returns 0 if successful (some ioctls return positive numbers on
 *        success as well). Returns a negated errno value in case of error.
 *        On a partition, callers without CAP_SYS_RAWIO get -ENOIOCTLCMD.
 *
 *        Note: most ioctls are forwarded to the SED handler or further
 *        down into the scsi subsystem.
 **/
static int sd_ioctl(struct block_device *bdev, blk_mode_t mode,
                    unsigned int cmd, unsigned long arg)
{
        struct gendisk *disk = bdev->bd_disk;
        struct scsi_disk *sdkp = scsi_disk(disk);
        struct scsi_device *sdp = sdkp->device;
        void __user *uarg = (void __user *)arg;
        int ret;

        SCSI_LOG_IOCTL(1, sd_printk(KERN_INFO, sdkp, "sd_ioctl: disk=%s, "
                                    "cmd=0x%x\n", disk->disk_name, cmd));

        if (bdev_is_partition(bdev) && !capable(CAP_SYS_RAWIO))
                return -ENOIOCTLCMD;

        /*
         * Keep everyone else away from the device while error recovery is
         * running. If recovery fails the device may be taken offline, after
         * which all further access is refused.
         */
        ret = scsi_ioctl_block_when_processing_errors(sdp, cmd,
                        (mode & BLK_OPEN_NDELAY));
        if (ret)
                return ret;

        if (!is_sed_ioctl(cmd))
                return scsi_ioctl(sdp, mode & BLK_OPEN_WRITE, cmd, uarg);

        return sed_ioctl(sdkp->opal_dev, cmd, uarg);
}

static void set_media_not_present(struct scsi_disk *sdkp)
{
        if (sdkp->media_present)
                sdkp->device->changed = 1;

        if (sdkp->device->removable) {
                sdkp->media_present = 0;
                sdkp->capacity = 0;
        }
}

/*
 * media_not_present - check sense data for a "medium not present" condition
 * @sdkp:  disk the sense data belongs to
 * @sshdr: decoded sense header
 *
 * Not invoked for commands that could return deferred errors.
 *
 * Returns 1, after updating @sdkp through set_media_not_present(), when the
 * sense data is valid, the sense key is UNIT_ATTENTION or NOT_READY and the
 * ASC is 0x3A (medium not present). Returns 0 otherwise.
 */
static int media_not_present(struct scsi_disk *sdkp,
                             struct scsi_sense_hdr *sshdr)
{
        if (!scsi_sense_valid(sshdr))
                return 0;

        if (sshdr->sense_key != UNIT_ATTENTION &&
            sshdr->sense_key != NOT_READY)
                return 0;

        /* medium not present */
        if (sshdr->asc != 0x3A)
                return 0;

        set_media_not_present(sdkp);
        return 1;
}

/**
 *        sd_check_events - check media events
 *        @disk: kernel device descriptor
 *        @clearing: disk events currently being cleared (not used here)
 *
 *        Returns mask of DISK_EVENT_*: DISK_EVENT_MEDIA_CHANGE if the
 *        device's changed flag was set, 0 otherwise. The changed flag is
 *        cleared before returning.
 *
 *        Note: this function is invoked from the block subsystem.
 **/
static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing)
{
        struct scsi_disk *sdkp = disk->private_data;
        struct scsi_device *sdp;
        bool media_changed;

        if (!sdkp)
                return 0;

        sdp = sdkp->device;
        SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_check_events\n"));

        /*
         * Never send commands to an offline device; behave as if the command
         * had failed. Devices only go offline because of unrecoverable
         * errors, and if this one ever comes back it is dealt with then.
         */
        if (!scsi_device_online(sdp)) {
                set_media_not_present(sdkp);
                goto out;
        }

        /*
         * TEST UNIT READY tells apart a drive with no cartridge loaded
         * (NOT READY), one with a changed cartridge (UNIT ATTENTION) and one
         * with the same cartridge (GOOD STATUS).
         *
         * Drives that auto spin down, e.g. iomega jaz 1G, will be started
         * by sd_spinup_disk() from sd_revalidate_disk(), which happens
         * whenever sd_revalidate() is called.
         */
        if (scsi_block_when_processing_errors(sdp)) {
                struct scsi_sense_hdr sshdr = { 0, };
                int tur_result;

                tur_result = scsi_test_unit_ready(sdp, SD_TIMEOUT,
                                                  sdkp->max_retries, &sshdr);

                /* TUR could not be executed: assume media not present */
                if (tur_result < 0 || host_byte(tur_result)) {
                        set_media_not_present(sdkp);
                        goto out;
                }

                if (media_not_present(sdkp, &sshdr))
                        goto out;
        }

        /*
         * A medium is present. If none was before, this is a change that
         * has to be reported for removable disks.
         */
        if (!sdkp->media_present)
                sdp->changed = 1;
        sdkp->media_present = 1;

out:
        /*
         * sdp->changed is set under the following conditions:
         *
         *        Medium present state has changed in either direction.
         *        Device has indicated UNIT_ATTENTION.
         */
        media_changed = sdp->changed;
        sdp->changed = 0;

        return media_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
}

/*
 * sd_sync_cache - ask the device to flush its write cache
 * @sdkp: disk whose cache should be synchronized
 *
 * Issues SYNCHRONIZE CACHE(16) when the device is flagged use_16_for_sync
 * and SYNCHRONIZE CACHE(10) otherwise, with the rest of the CDB left zero.
 *
 * Returns 0 on success or when the failure is deliberately ignored,
 * -ENODEV if the device is offline, the negative value handed back by
 * scsi_execute_cmd() if the command could not be run, -EBUSY for host
 * errors that the caller may retry and -EIO for anything else.
 */
static int sd_sync_cache(struct scsi_disk *sdkp)
{
        int res;
        struct scsi_device *sdp = sdkp->device;
        const int timeout = sdp->request_queue->rq_timeout
                * SD_FLUSH_TIMEOUT_MULTIPLIER;
        /* Leave the rest of the command zero to indicate flush everything. */
        const unsigned char cmd[16] = { sdp->use_16_for_sync ?
                                SYNCHRONIZE_CACHE_16 : SYNCHRONIZE_CACHE };
        struct scsi_sense_hdr sshdr;
        struct scsi_failure failure_defs[] = {
                {
                        .allowed = 3,
                        .result = SCMD_FAILURE_RESULT_ANY,
                },
                {}
        };
        struct scsi_failures failures = {
                .failure_definitions = failure_defs,
        };
        const struct scsi_exec_args exec_args = {
                .req_flags = BLK_MQ_REQ_PM,
                .sshdr = &sshdr,
                .failures = &failures,
        };

        if (!scsi_device_online(sdp))
                return -ENODEV;

        res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, timeout,
                               sdkp->max_retries, &exec_args);
        if (res) {
                /* Name the variant that was actually sent in the log. */
                sd_print_result(sdkp, sdp->use_16_for_sync ?
                                "Synchronize Cache(16) failed" :
                                "Synchronize Cache(10) failed", res);

                /* The command never ran, so there is no sense data to look at. */
                if (res < 0)
                        return res;

                if (scsi_status_is_check_condition(res) &&
                    scsi_sense_valid(&sshdr)) {
                        sd_print_sense_hdr(sdkp, &sshdr);

                        /* we need to evaluate the error return  */
                        if (sshdr.asc == 0x3a ||        /* medium not present */
                            sshdr.asc == 0x20 ||        /* invalid command */
                            (sshdr.asc == 0x74 && sshdr.ascq == 0x71))        /* drive is password locked */
                                /* this is no error here */
                                return 0;
                        /*
                         * This drive doesn't support sync and there's not much
                         * we can do because this is called during shutdown
                         * or suspend so just return success so those operations
                         * can proceed.
                         */
                        if (sshdr.sense_key == ILLEGAL_REQUEST)
                                return 0;
                }

                switch (host_byte(res)) {
                /* ignore errors due to racing a disconnection */
                case DID_BAD_TARGET:
                case DID_NO_CONNECT:
                        return 0;
                /* signal the upper layer it might try again */
                case DID_BUS_BUSY:
                case DID_IMM_RETRY:
                case DID_REQUEUE:
                case DID_SOFT_ERROR:
                        return -EBUSY;
                default:
                        return -EIO;
                }
        }
        return 0;
}

/* Rescan callback: revalidate the gendisk attached to @dev's driver data. */
static void sd_rescan(struct device *dev)
{
        struct scsi_disk *disk_priv = dev_get_drvdata(dev);
        struct gendisk *gd = disk_priv->disk;

        sd_revalidate_disk(gd);
}

/*
 * sd_get_unique_id - copy a logical-unit designator out of cached VPD page 0x83
 * @disk: gendisk to query
 * @id:   output buffer, receives at most 16 bytes
 * @type: designator type to look for (compared with the low nibble of d[1])
 *
 * Walks the designation descriptors of the cached page and copies the
 * longest matching 8, 12 or 16 byte designator into @id.
 *
 * Returns the number of bytes copied, -ENXIO if no page 0x83 is cached, or
 * -EINVAL if no suitable designator was found.
 */
static int sd_get_unique_id(struct gendisk *disk, u8 id[16],
                enum blk_unique_id type)
{
        struct scsi_device *sdev = scsi_disk(disk)->device;
        const struct scsi_vpd *vpd;
        const unsigned char *d;
        int ret = -ENXIO, len, off;

        rcu_read_lock();
        vpd = rcu_dereference(sdev->vpd_pg83);
        if (!vpd)
                goto out_unlock;

        ret = -EINVAL;
        /* Each descriptor is a 4-byte header followed by d[3] payload bytes. */
        for (off = 4; off + 4 <= vpd->len; off += len + 4) {
                d = vpd->data + off;
                len = d[3];

                /*
                 * A descriptor claiming more payload than the page holds is
                 * malformed; stop instead of reading past the buffer.
                 */
                if (len > vpd->len - off - 4)
                        break;

                /* we only care about designators with LU association */
                if (((d[1] >> 4) & 0x3) != 0x00)
                        continue;
                if ((d[1] & 0xf) != type)
                        continue;

                /*
                 * Only exit early if a 16-byte descriptor was found.  Otherwise
                 * keep looking as one with more entropy might still show up.
                 */
                if (len != 8 && len != 12 && len != 16)
                        continue;

                /* Never let a shorter designator replace a longer one. */
                if (len < ret)
                        continue;

                ret = len;
                memcpy(id, d + 4, len);
                if (len == 16)
                        break;
        }
out_unlock:
        rcu_read_unlock();
        return ret;
}

/*
 * Translate a SCSI result word (plus decoded sense header) into a
 * persistent-reservation status code.  Host-byte transport errors are
 * classified first, then the SCSI status byte.  Returns a PR_STS_* value,
 * or -EINVAL for ILLEGAL REQUEST sense with ASC 0x24 or 0x26.
 */
static int sd_scsi_to_pr_err(struct scsi_sense_hdr *sshdr, int result)
{
        const int host = host_byte(result);
        const int status = status_byte(result);

        if (host == DID_TRANSPORT_MARGINAL ||
            host == DID_TRANSPORT_DISRUPTED ||
            host == DID_BUS_BUSY)
                return PR_STS_RETRY_PATH_FAILURE;
        if (host == DID_NO_CONNECT)
                return PR_STS_PATH_FAILED;
        if (host == DID_TRANSPORT_FAILFAST)
                return PR_STS_PATH_FAST_FAILED;

        if (status == SAM_STAT_RESERVATION_CONFLICT)
                return PR_STS_RESERVATION_CONFLICT;

        /* Only a check condition with usable sense can yield -EINVAL. */
        if (status == SAM_STAT_CHECK_CONDITION &&
            scsi_sense_valid(sshdr) &&
            sshdr->sense_key == ILLEGAL_REQUEST &&
            (sshdr->asc == 0x26 || sshdr->asc == 0x24))
                return -EINVAL;

        return PR_STS_IOERR;
}

/*
 * Send a PERSISTENT RESERVE IN command with service action @sa and read
 * up to @data_len bytes of response into @data.
 *
 * Returns 0 on success, a negative value if scsi_execute_cmd() returned
 * one, otherwise the result mapped through sd_scsi_to_pr_err().
 */
static int sd_pr_in_command(struct block_device *bdev, u8 sa,
                            unsigned char *data, int data_len)
{
        struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
        struct scsi_device *sdev = sdkp->device;
        struct scsi_sense_hdr sshdr;
        struct scsi_failure retry_rules[] = {
                {
                        .sense = UNIT_ATTENTION,
                        .asc = SCMD_FAILURE_ASC_ANY,
                        .ascq = SCMD_FAILURE_ASCQ_ANY,
                        .allowed = 5,
                        .result = SAM_STAT_CHECK_CONDITION,
                },
                {}
        };
        struct scsi_failures failures = {
                .failure_definitions = retry_rules,
        };
        const struct scsi_exec_args exec_args = {
                .sshdr = &sshdr,
                .failures = &failures,
        };
        u8 cdb[10] = { 0 };
        int rc;

        cdb[0] = PERSISTENT_RESERVE_IN;
        cdb[1] = sa;
        /* Allocation length occupies CDB bytes 7-8, big endian. */
        put_unaligned_be16(data_len, &cdb[7]);

        rc = scsi_execute_cmd(sdev, cdb, REQ_OP_DRV_IN, data, data_len,
                              SD_TIMEOUT, sdkp->max_retries, &exec_args);

        if (scsi_status_is_check_condition(rc) && scsi_sense_valid(&sshdr)) {
                sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", rc);
                scsi_print_sense_hdr(sdev, NULL, &sshdr);
        }

        return rc <= 0 ? rc : sd_scsi_to_pr_err(&sshdr, rc);
}

static int sd_pr_read_keys(struct block_device *bdev, struct pr_keys *keys_info)
{
        int result, i, data_offset, num_copy_keys;
        u32 num_keys = keys_info->num_keys;
        int data_len = num_keys * 8 + 8;
        u8 *data;

        data = kzalloc(data_len, GFP_KERNEL);
        if (!data)
                return -ENOMEM;

        result = sd_pr_in_command(bdev, READ_KEYS, data, data_len);
        if (result)
                goto free_data;

        keys_info->generation = get_unaligned_be32(&data[0]);
        keys_info->num_keys = get_unaligned_be32(&data[4]) / 8;

        data_offset = 8;
        num_copy_keys = min(num_keys, keys_info->num_keys);

        for (i = 0; i < num_copy_keys; i++) {
                keys_info->keys[i] = get_unaligned_be64(&data[data_offset]);
                data_offset += 8;
        }

free_data:
        kfree(data);
        return result;
}

/*
 * Read the currently held reservation into @rsv.
 *
 * Returns the status of sd_pr_in_command() on failure, 0 when the device
 * reports a zero additional length (leaving @rsv untouched), -EINVAL when
 * the reported length is shorter than 14 bytes, and 0 after filling in
 * generation, key and type otherwise.
 */
static int sd_pr_read_reservation(struct block_device *bdev,
                                  struct pr_held_reservation *rsv)
{
        struct scsi_device *sdev = scsi_disk(bdev->bd_disk)->device;
        u8 resp[24] = { };
        int rc, add_len;

        rc = sd_pr_in_command(bdev, READ_RESERVATION, resp, sizeof(resp));
        if (rc)
                return rc;

        add_len = get_unaligned_be32(&resp[4]);
        if (add_len == 0)
                return 0;

        /* Make sure we have at least the key and type */
        if (add_len < 14) {
                sdev_printk(KERN_INFO, sdev,
                            "READ RESERVATION failed due to short return buffer of %d bytes\n",
                            add_len);
                return -EINVAL;
        }

        rsv->generation = get_unaligned_be32(&resp[0]);
        rsv->key = get_unaligned_be64(&resp[8]);
        rsv->type = scsi_pr_type_to_block(resp[21] & 0x0f);

        return 0;
}

/*
 * Send a PERSISTENT RESERVE OUT command.
 * @sa goes into CDB byte 1 and @type into byte 2; @key, @sa_key and @flags
 * are placed in the 24-byte parameter list at offsets 0, 8 and 20.
 *
 * Returns 0 on success, a negative value if scsi_execute_cmd() returned
 * one, otherwise the result mapped through sd_scsi_to_pr_err().
 */
static int sd_pr_out_command(struct block_device *bdev, u8 sa, u64 key,
                             u64 sa_key, enum scsi_pr_type type, u8 flags)
{
        struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
        struct scsi_device *sdev = sdkp->device;
        struct scsi_sense_hdr sshdr;
        struct scsi_failure retry_rules[] = {
                {
                        .sense = UNIT_ATTENTION,
                        .asc = SCMD_FAILURE_ASC_ANY,
                        .ascq = SCMD_FAILURE_ASCQ_ANY,
                        .allowed = 5,
                        .result = SAM_STAT_CHECK_CONDITION,
                },
                {}
        };
        struct scsi_failures failures = {
                .failure_definitions = retry_rules,
        };
        const struct scsi_exec_args exec_args = {
                .sshdr = &sshdr,
                .failures = &failures,
        };
        u8 cdb[16] = {
                [0] = PERSISTENT_RESERVE_OUT,
                [1] = sa,
                [2] = type,
        };
        u8 param[24] = { [20] = flags };
        int rc;

        /* Parameter list length lives in CDB bytes 5-8. */
        put_unaligned_be32(sizeof(param), &cdb[5]);

        put_unaligned_be64(key, &param[0]);
        put_unaligned_be64(sa_key, &param[8]);

        rc = scsi_execute_cmd(sdev, cdb, REQ_OP_DRV_OUT, param,
                              sizeof(param), SD_TIMEOUT, sdkp->max_retries,
                              &exec_args);

        if (scsi_status_is_check_condition(rc) && scsi_sense_valid(&sshdr)) {
                sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", rc);
                scsi_print_sense_hdr(sdev, NULL, &sshdr);
        }

        return rc <= 0 ? rc : sd_scsi_to_pr_err(&sshdr, rc);
}

/*
 * Register @new_key, replacing @old_key.  PR_FL_IGNORE_KEY is the only flag
 * accepted; any other bit yields -EOPNOTSUPP.
 */
static int sd_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
                u32 flags)
{
        const u8 aptpl = 1 << 0;        /* APTPL */
        u8 service_action;

        if (flags & ~PR_FL_IGNORE_KEY)
                return -EOPNOTSUPP;

        /* 0x06 when asked to ignore the existing key, 0x00 otherwise. */
        if (flags & PR_FL_IGNORE_KEY)
                service_action = 0x06;
        else
                service_action = 0x00;

        return sd_pr_out_command(bdev, service_action, old_key, new_key, 0,
                                 aptpl);
}

/*
 * Take a reservation of @type using @key (service action 0x01).  No flags
 * are supported; a non-zero @flags yields -EOPNOTSUPP.
 */
static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
                u32 flags)
{
        if (flags != 0)
                return -EOPNOTSUPP;

        return sd_pr_out_command(bdev, 0x01, key, 0,
                                 block_pr_type_to_scsi(type), 0);
}

/* Release the reservation of @type held with @key (service action 0x02). */
static int sd_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
        const u8 service_action = 0x02;

        return sd_pr_out_command(bdev, service_action, key, 0,
                                 block_pr_type_to_scsi(type), 0);
}

/*
 * Preempt the reservation registered under @new_key on behalf of @old_key.
 * Service action 0x05 is used when @abort is set, 0x04 otherwise.
 */
static int sd_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
                enum pr_type type, bool abort)
{
        u8 service_action = 0x04;

        if (abort)
                service_action = 0x05;

        return sd_pr_out_command(bdev, service_action, old_key, new_key,
                                 block_pr_type_to_scsi(type), 0);
}

/* Issue the clear service action (0x03) using @key. */
static int sd_pr_clear(struct block_device *bdev, u64 key)
{
        const u8 service_action = 0x03;

        return sd_pr_out_command(bdev, service_action, key, 0, 0, 0);
}

/*
 * Persistent reservation operations for SCSI disks.  The register, reserve,
 * release, preempt and clear handlers go through sd_pr_out_command(); the
 * two read handlers go through sd_pr_in_command().
 */
static const struct pr_ops sd_pr_ops = {
        .pr_register        = sd_pr_register,
        .pr_reserve        = sd_pr_reserve,
        .pr_release        = sd_pr_release,
        .pr_preempt        = sd_pr_preempt,
        .pr_clear        = sd_pr_clear,
        .pr_read_keys        = sd_pr_read_keys,
        .pr_read_reservation = sd_pr_read_reservation,
};

static void scsi_disk_free_disk(struct gendisk *disk)
{
        struct scsi_disk *sdkp = scsi_disk(disk);

        put_device(&sdkp->disk_dev);
}

/*
 * Block device operations table for SCSI disks.  Persistent reservation
 * requests are routed through sd_pr_ops.
 */
static const struct block_device_operations sd_fops = {
        .owner                        = THIS_MODULE,
        .open                        = sd_open,
        .release                = sd_release,
        .ioctl                        = sd_ioctl,
        .getgeo                        = sd_getgeo,
        .compat_ioctl                = blkdev_compat_ptr_ioctl,
        .check_events                = sd_check_events,
        .unlock_native_capacity        = sd_unlock_native_capacity,
        .report_zones                = sd_zbc_report_zones,
        .get_unique_id                = sd_get_unique_id,
        .free_disk                = scsi_disk_free_disk,
        .pr_ops                        = &sd_pr_ops,
};

/**
 *        sd_eh_reset - reset error handling callback
 *        @scmd:                sd-issued command that has failed
 *
 *        This function is called by the SCSI midlayer before starting
 *        SCSI EH. When counting medium access failures we have to be
 *        careful to register it only only once per device and SCSI EH run;
 *        there might be several timed out commands which will cause the
 *        'max_medium_access_timeouts' counter to trigger after the first
 *        SCSI EH run already and set the device to offline.
 *        So this function resets the internal counter before starting SCSI EH.
 **/
static void sd_eh_reset(struct scsi_cmnd *scmd)
{
        struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk);

        /* New SCSI EH run, reset gate variable */
        sdkp->ignore_medium_access_errors = false;
}

/**
 *        sd_eh_action - error handling callback
 *        @scmd:                sd-issued command that has failed
 *        @eh_disp:        The recovery disposition suggested by the midlayer
 *
 *        This function is called by the SCSI midlayer upon completion of an
 *        error test command (currently TEST UNIT READY). The result of sending
 *        the eh command is passed in eh_disp.  We're looking for devices that
 *        fail medium access commands but are OK with non access commands like
 *        test unit ready (so wrongly see the device as having a successful
 *        recovery)
 **/
static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp)
{
        struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk);
        struct scsi_device *sdev = scmd->device;

        if (!scsi_device_online(sdev) ||
            !scsi_medium_access_command(scmd) ||
            host_byte(scmd->result) != DID_TIME_OUT ||
            eh_disp != SUCCESS)
                return eh_disp;

        /*
         * The device has timed out executing a medium access command.
         * However, the TEST UNIT READY command sent during error
         * handling completed successfully. Either the device is in the
         * process of recovering or has it suffered an internal failure
         * that prevents access to the storage medium.
         */
        if (!sdkp->ignore_medium_access_errors) {
                sdkp->medium_access_timed_out++;
                sdkp->ignore_medium_access_errors = true;
        }

        /*
         * If the device keeps failing read/write commands but TEST UNIT
         * READY always completes successfully we assume that medium
         * access is no longer possible and take the device offline.
         */
        if (sdkp->medium_access_timed_out >= sdkp->max_medium_access_timeouts) {
                scmd_printk(KERN_ERR, scmd,
                            "Medium access timeout failure. Offlining disk!\n");
                mutex_lock(&sdev->state_mutex);
                scsi_device_set_state(sdev, SDEV_OFFLINE);
                mutex_unlock(&sdev->state_mutex);

                return SUCCESS;
        }

        return eh_disp;
}

/*
 * Work out how many bytes of a failed command completed before the failing
 * logical block reported in the sense data.
 *
 * Returns 0 when the payload is no larger than one logical block, when the
 * sense data carries no information field, or when the reported LBA lies
 * outside the request; otherwise the smaller of the bytes preceding the
 * bad LBA and the bytes actually transferred.
 */
static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd)
{
        struct scsi_device *sdev = scmd->device;
        struct request *rq = scsi_cmd_to_rq(scmd);
        u64 first_lba, past_last_lba, failed_lba;
        unsigned int xferred, bytes_before_error;

        /*
         * Some commands have a payload smaller than the device logical
         * block size (e.g. INQUIRY on a 4K disk).
         */
        if (scsi_bufflen(scmd) <= sdev->sector_size)
                return 0;

        /* Without a reported bad LBA there is nothing to compute. */
        if (!scsi_get_sense_info_fld(scmd->sense_buffer,
                                     SCSI_SENSE_BUFFERSIZE, &failed_lba))
                return 0;

        /* A bad LBA outside the request tells us nothing about the error. */
        first_lba = sectors_to_logical(sdev, blk_rq_pos(rq));
        past_last_lba = first_lba +
                        bytes_to_logical(sdev, scsi_bufflen(scmd));
        if (failed_lba < first_lba || failed_lba >= past_last_lba)
                return 0;

        /*
         * resid is optional but mostly filled in.  When it's unused,
         * its value is zero, so we assume the whole buffer transferred
         */
        xferred = scsi_bufflen(scmd) - scsi_get_resid(scmd);

        /* Always computed in units of the device's logical block size. */
        bytes_before_error = logical_to_bytes(sdev, failed_lba - first_lba);

        if (bytes_before_error < xferred)
                return bytes_before_error;
        return xferred;
}

/**
 *        sd_done - bottom half handler: called when the lower level
 *        driver has completed (successfully or otherwise) a scsi command.
 *        @SCpnt: mid-level's per command structure.
 *
 *        Returns the number of bytes of the command that are considered
 *        to have completed successfully.
 *
 *        Note: potentially run from within an ISR. Must not block.
 **/
static int sd_done(struct scsi_cmnd *SCpnt)
{
        int result = SCpnt->result;
        /* Start from "everything transferred" on success, "nothing" on error. */
        unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt);
        unsigned int sector_size = SCpnt->device->sector_size;
        unsigned int resid;
        struct scsi_sense_hdr sshdr;
        struct request *req = scsi_cmd_to_rq(SCpnt);
        struct scsi_disk *sdkp = scsi_disk(req->q->disk);
        int sense_valid = 0;
        int sense_deferred = 0;

        switch (req_op(req)) {
        case REQ_OP_DISCARD:
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_ZONE_RESET:
        case REQ_OP_ZONE_RESET_ALL:
        case REQ_OP_ZONE_OPEN:
        case REQ_OP_ZONE_CLOSE:
        case REQ_OP_ZONE_FINISH:
                /*
                 * These operations are accounted all-or-nothing against the
                 * size of the block request rather than the SCSI buffer.
                 */
                if (!result) {
                        good_bytes = blk_rq_bytes(req);
                        scsi_set_resid(SCpnt, 0);
                } else {
                        good_bytes = 0;
                        scsi_set_resid(SCpnt, blk_rq_bytes(req));
                }
                break;
        default:
                /*
                 * In case of bogus fw or device, we could end up having
                 * an unaligned partial completion. Check this here and force
                 * alignment.
                 */
                resid = scsi_get_resid(SCpnt);
                if (resid & (sector_size - 1)) {
                        sd_printk(KERN_INFO, sdkp,
                                "Unaligned partial completion (resid=%u, sector_sz=%u)\n",
                                resid, sector_size);
                        scsi_print_command(SCpnt);
                        /* Round the residual up, but never beyond the buffer. */
                        resid = min(scsi_bufflen(SCpnt),
                                    round_up(resid, sector_size));
                        scsi_set_resid(SCpnt, resid);
                }
        }

        /* Sense data is only decoded when the command reported a result. */
        if (result) {
                sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr);
                if (sense_valid)
                        sense_deferred = scsi_sense_is_deferred(&sshdr);
        }
        /* A command completed, so restart the medium access timeout count. */
        sdkp->medium_access_timed_out = 0;

        /*
         * Skip sense key handling unless the status is a check condition
         * or there is valid, non-deferred sense data.
         */
        if (!scsi_status_is_check_condition(result) &&
            (!sense_valid || sense_deferred))
                goto out;

        switch (sshdr.sense_key) {
        case HARDWARE_ERROR:
        case MEDIUM_ERROR:
                good_bytes = sd_completed_bytes(SCpnt);
                break;
        case RECOVERED_ERROR:
                good_bytes = scsi_bufflen(SCpnt);
                break;
        case NO_SENSE:
                /* This indicates a false check condition, so ignore it.  An
                 * unknown amount of data was transferred so treat it as an
                 * error.
                 */
                SCpnt->result = 0;
                memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
                break;
        case ABORTED_COMMAND:
                if (sshdr.asc == 0x10)  /* DIF: Target detected corruption */
                        good_bytes = sd_completed_bytes(SCpnt);
                break;
        case ILLEGAL_REQUEST:
                switch (sshdr.asc) {
                case 0x10:        /* DIX: Host detected corruption */
                        good_bytes = sd_completed_bytes(SCpnt);
                        break;
                case 0x20:        /* INVALID COMMAND OPCODE */
                case 0x24:        /* INVALID FIELD IN CDB */
                        /*
                         * The device rejected the command itself: turn off
                         * the feature that generated it.
                         */
                        switch (SCpnt->cmnd[0]) {
                        case UNMAP:
                                sd_config_discard(sdkp, SD_LBP_DISABLE);
                                break;
                        case WRITE_SAME_16:
                        case WRITE_SAME:
                                if (SCpnt->cmnd[1] & 8) { /* UNMAP */
                                        sd_config_discard(sdkp, SD_LBP_DISABLE);
                                } else {
                                        sdkp->device->no_write_same = 1;
                                        sd_config_write_same(sdkp);
                                        req->rq_flags |= RQF_QUIET;
                                }
                                break;
                        }
                }
                break;
        default:
                break;
        }

 out:
        /* Zoned disks get a chance to adjust the completed byte count. */
        if (sd_is_zoned(sdkp))
                good_bytes = sd_zbc_complete(SCpnt, good_bytes, &sshdr);

        SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt,
                                           "sd_done: completed %d of %d bytes\n",
                                           good_bytes, scsi_bufflen(SCpnt)));

        return good_bytes;
}

/*
 * spinup disk - called only in sd_revalidate_disk()
 *
 * Polls the device with TEST UNIT READY and, when it reports NOT READY for
 * a reason that a start can cure, sends START STOP UNIT once and keeps
 * polling once a second until the device answers or the spin-up deadline
 * passes.  Returns early, without further polling, as soon as the device
 * reports that no medium is present.
 */
static void
sd_spinup_disk(struct scsi_disk *sdkp)
{
        static const u8 cmd[10] = { TEST_UNIT_READY };
        unsigned long spintime_expire = 0;
        int spintime, sense_valid = 0;
        /*
         * Must be signed: scsi_execute_cmd() returns a negative value when
         * the command could not be run, and in that case sshdr has not been
         * filled in.  With an unsigned type such a failure would pass the
         * "the_result > 0" test below and uninitialised sense data would be
         * inspected.
         */
        int the_result;
        struct scsi_sense_hdr sshdr;
        struct scsi_failure failure_defs[] = {
                /* Do not retry Medium Not Present */
                {
                        .sense = UNIT_ATTENTION,
                        .asc = 0x3A,
                        .ascq = SCMD_FAILURE_ASCQ_ANY,
                        .result = SAM_STAT_CHECK_CONDITION,
                },
                {
                        .sense = NOT_READY,
                        .asc = 0x3A,
                        .ascq = SCMD_FAILURE_ASCQ_ANY,
                        .result = SAM_STAT_CHECK_CONDITION,
                },
                /* Retry when scsi_status_is_good would return false 3 times */
                {
                        .result = SCMD_FAILURE_STAT_ANY,
                        .allowed = 3,
                },
                {}
        };
        struct scsi_failures failures = {
                .failure_definitions = failure_defs,
        };
        const struct scsi_exec_args exec_args = {
                .sshdr = &sshdr,
                .failures = &failures,
        };

        spintime = 0;

        /* Spin up drives, as required.  Only do this at boot time */
        /* Spinup needs to be done for module loads too. */
        do {
                bool media_was_present = sdkp->media_present;

                /* Every poll starts with a fresh retry budget. */
                scsi_failures_reset_retries(&failures);

                the_result = scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN,
                                              NULL, 0, SD_TIMEOUT,
                                              sdkp->max_retries, &exec_args);

                if (the_result > 0) {
                        /*
                         * If the drive has indicated to us that it doesn't
                         * have any media in it, don't bother with any more
                         * polling.
                         */
                        if (media_not_present(sdkp, &sshdr)) {
                                if (media_was_present)
                                        sd_printk(KERN_NOTICE, sdkp,
                                                  "Media removed, stopped polling\n");
                                return;
                        }
                        sense_valid = scsi_sense_valid(&sshdr);
                }

                if (!scsi_status_is_check_condition(the_result)) {
                        /* no sense, TUR either succeeded or failed
                         * with a status error */
                        if (!spintime && !scsi_status_is_good(the_result)) {
                                sd_print_result(sdkp, "Test Unit Ready failed",
                                                the_result);
                        }
                        break;
                }

                /*
                 * The device does not want the automatic start to be issued.
                 */
                if (sdkp->device->no_start_on_add)
                        break;

                if (sense_valid && sshdr.sense_key == NOT_READY) {
                        if (sshdr.asc == 4 && sshdr.ascq == 3)
                                break;        /* manual intervention required */
                        if (sshdr.asc == 4 && sshdr.ascq == 0xb)
                                break;        /* standby */
                        if (sshdr.asc == 4 && sshdr.ascq == 0xc)
                                break;        /* unavailable */
                        if (sshdr.asc == 4 && sshdr.ascq == 0x1b)
                                break;        /* sanitize in progress */
                        if (sshdr.asc == 4 && sshdr.ascq == 0x24)
                                break;        /* depopulation in progress */
                        if (sshdr.asc == 4 && sshdr.ascq == 0x25)
                                break;        /* depopulation restoration in progress */
                        /*
                         * Issue command to spin up drive when not ready
                         */
                        if (!spintime) {
                                /* Return immediately and start spin cycle */
                                const u8 start_cmd[10] = {
                                        [0] = START_STOP,
                                        [1] = 1,
                                        [4] = sdkp->device->start_stop_pwr_cond ?
                                                0x11 : 1,
                                };

                                sd_printk(KERN_NOTICE, sdkp, "Spinning up disk...");
                                scsi_execute_cmd(sdkp->device, start_cmd,
                                                 REQ_OP_DRV_IN, NULL, 0,
                                                 SD_TIMEOUT, sdkp->max_retries,
                                                 &exec_args);
                                spintime_expire = jiffies + 100 * HZ;
                                spintime = 1;
                        }
                        /* Wait 1 second for next try */
                        msleep(1000);
                        printk(KERN_CONT ".");

                /*
                 * Wait for USB flash devices with slow firmware.
                 * Yes, this sense key/ASC combination shouldn't
                 * occur here.  It's characteristic of these devices.
                 */
                } else if (sense_valid &&
                                sshdr.sense_key == UNIT_ATTENTION &&
                                sshdr.asc == 0x28) {
                        if (!spintime) {
                                spintime_expire = jiffies + 5 * HZ;
                                spintime = 1;
                        }
                        /* Wait 1 second for next try */
                        msleep(1000);
                } else {
                        /* we don't understand the sense code, so it's
                         * probably pointless to loop */
                        if (!spintime) {
                                sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n");
                                sd_print_sense_hdr(sdkp, &sshdr);
                        }
                        break;
                }

        } while (spintime && time_before_eq(jiffies, spintime_expire));

        /* Terminate the "Spinning up disk..." progress line, if one was started. */
        if (spintime) {
                if (scsi_status_is_good(the_result))
                        printk(KERN_CONT "ready\n");
                else
                        printk(KERN_CONT "not responding...\n");
        }
}

/*
 * Determine whether disk supports Data Integrity Field.
 */
static int sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer)
{
        struct scsi_device *sdp = sdkp->device;
        u8 type;

        if (scsi_device_protection(sdp) == 0 || (buffer[12] & 1) == 0) {
                sdkp->protection_type = 0;
                return 0;
        }

        type = ((buffer[12] >> 1) & 7) + 1; /* P_TYPE 0 = Type 1 */

        if (type > T10_PI_TYPE3_PROTECTION) {
                sd_printk(KERN_ERR, sdkp, "formatted with unsupported"        \
                          " protection type %u. Disabling disk!\n",
                          type);
                sdkp->protection_type = 0;
                return -ENODEV;
        }

        sdkp->protection_type = type;

        return 0;
}

/*
 * Configure host-side protection for the disk and report the outcome.
 *
 * If the disk has a protection type that the host cannot handle, the type
 * is cleared and a "Disabling" notice is logged; otherwise an "Enabling"
 * notice is logged.  Nothing is logged for disks without protection.
 */
static void sd_config_protection(struct scsi_disk *sdkp)
{
        struct scsi_device *sdp = sdkp->device;

        sd_dif_config_host(sdkp);

        if (!sdkp->protection_type)
                return;

        if (!scsi_host_dif_capable(sdp->host, sdkp->protection_type)) {
                sd_first_printk(KERN_NOTICE, sdkp,
                                "Disabling DIF Type %u protection\n",
                                sdkp->protection_type);
                sdkp->protection_type = 0;
                /*
                 * Protection was just turned off; do not fall through and
                 * announce "Enabling DIF Type 0 protection".
                 */
                return;
        }

        sd_first_printk(KERN_NOTICE, sdkp, "Enabling DIF Type %u protection\n",
                        sdkp->protection_type);
}

/*
 * Common failure path for the READ CAPACITY helpers: log whatever sense
 * information is available, flag removable media as absent when the device
 * reports NOT READY, and record the capacity as zero (unknown).
 * @the_result is accepted but not used here.
 */
static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp,
                        struct scsi_sense_hdr *sshdr, int sense_valid,
                        int the_result)
{
        if (!sense_valid) {
                sd_printk(KERN_NOTICE, sdkp, "Sense not available.\n");
        } else {
                sd_print_sense_hdr(sdkp, sshdr);
        }

        /*
         * Removable drives do not always report a missing medium properly,
         * so treat NOT READY as "no medium" for them.
         */
        if (sdp->removable && sense_valid) {
                if (sshdr->sense_key == NOT_READY)
                        set_media_not_present(sdkp);
        }

        /*
         * media_present is deliberately left alone for everything else:
         * some drives fail read capacity even with media present.
         * Unknown capacity is mapped to zero - as usual.
         */
        sdkp->capacity = 0;
}

/*
 * Number of response bytes requested by read_capacity_16().  The build
 * fails if this exceeds SD_BUF_SIZE.
 */
#define RC16_LEN 32
#if RC16_LEN > SD_BUF_SIZE
#error RC16_LEN must not be more than SD_BUF_SIZE
#endif

/* Initial value of the reset retry counter used by read_capacity_16(). */
#define READ_CAPACITY_RETRIES_ON_RESET        10

/*
 * read_capacity_16 - issue READ CAPACITY(16) and record the results in @sdkp
 * @sdkp:   disk being probed
 * @sdp:    SCSI device the command is sent to
 * @buffer: response buffer; RC16_LEN bytes of it are used
 *
 * On success the capacity, physical block size, RC basis, alignment offset
 * and logical block provisioning flags are stored and the reported logical
 * block size is returned.
 *
 * Returns -EINVAL when the device is flagged no_read_capacity_16, when the
 * command is rejected as an invalid opcode / invalid CDB field, or when it
 * still fails once the retries are used up.  Returns -ENODEV when the medium
 * is not present or sd_read_protection_type() reports an error.
 */
static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp,
                                                unsigned char *buffer)
{
        unsigned char cmd[16];
        struct scsi_sense_hdr sshdr;
        const struct scsi_exec_args exec_args = {
                .sshdr = &sshdr,
        };
        int sense_valid = 0;
        int the_result;
        int retries = 3, reset_retries = READ_CAPACITY_RETRIES_ON_RESET;
        unsigned int alignment;
        unsigned long long lba;
        unsigned sector_size;

        if (sdp->no_read_capacity_16)
                return -EINVAL;

        do {
                /* Rebuild the CDB and clear the response on every attempt */
                memset(cmd, 0, 16);
                cmd[0] = SERVICE_ACTION_IN_16;
                cmd[1] = SAI_READ_CAPACITY_16;
                cmd[13] = RC16_LEN;
                memset(buffer, 0, RC16_LEN);

                the_result = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN,
                                              buffer, RC16_LEN, SD_TIMEOUT,
                                              sdkp->max_retries, &exec_args);
                /* The sense data is only looked at for positive results */
                if (the_result > 0) {
                        if (media_not_present(sdkp, &sshdr))
                                return -ENODEV;

                        sense_valid = scsi_sense_valid(&sshdr);
                        if (sense_valid &&
                            sshdr.sense_key == ILLEGAL_REQUEST &&
                            (sshdr.asc == 0x20 || sshdr.asc == 0x24) &&
                            sshdr.ascq == 0x00)
                                /* Invalid Command Operation Code or
                                 * Invalid Field in CDB, just retry
                                 * silently with RC10 */
                                return -EINVAL;
                        if (sense_valid &&
                            sshdr.sense_key == UNIT_ATTENTION &&
                            sshdr.asc == 0x29 && sshdr.ascq == 0x00)
                                /* Device reset might occur several times,
                                 * give it one more chance */
                                /*
                                 * "continue" jumps to the loop condition, so
                                 * a reset retry does not consume "retries".
                                 */
                                if (--reset_retries > 0)
                                        continue;
                }
                retries--;

        } while (the_result && retries);

        if (the_result) {
                sd_print_result(sdkp, "Read Capacity(16) failed", the_result);
                read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result);
                return -EINVAL;
        }

        /* Bytes 0-7: last logical block address, bytes 8-11: block length */
        sector_size = get_unaligned_be32(&buffer[8]);
        lba = get_unaligned_be64(&buffer[0]);

        if (sd_read_protection_type(sdkp, buffer) < 0) {
                sdkp->capacity = 0;
                return -ENODEV;
        }

        /* Logical blocks per physical block exponent */
        sdkp->physical_block_size = (1 << (buffer[13] & 0xf)) * sector_size;

        /* RC basis */
        sdkp->rc_basis = (buffer[12] >> 4) & 0x3;

        /* Lowest aligned logical block */
        alignment = ((buffer[14] & 0x3f) << 8 | buffer[15]) * sector_size;
        blk_queue_alignment_offset(sdp->request_queue, alignment);
        if (alignment && sdkp->first_scan)
                sd_printk(KERN_NOTICE, sdkp,
                          "physical block alignment offset: %u\n", alignment);

        if (buffer[14] & 0x80) { /* LBPME */
                sdkp->lbpme = 1;

                if (buffer[14] & 0x40) /* LBPRZ */
                        sdkp->lbprz = 1;

                sd_config_discard(sdkp, SD_LBP_WS16);
        }

        /* The device reports the last LBA; the capacity is one more */
        sdkp->capacity = lba + 1;
        return sector_size;
}

/*
 * read_capacity_10 - issue READ CAPACITY(10) and record the results in @sdkp
 * @sdkp:   disk being probed
 * @sdp:    SCSI device the command is sent to
 * @buffer: response buffer; the first 8 bytes are used
 *
 * Returns the reported logical block size on success, -ENODEV when the
 * medium is not present and -EINVAL on any other failure.
 */
static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp,
                                                unsigned char *buffer)
{
        static const u8 cmd[10] = { READ_CAPACITY };
        struct scsi_sense_hdr sshdr;
        struct scsi_failure failure_defs[] = {
                /* Medium Not Present is never retried */
                {
                        .sense = UNIT_ATTENTION,
                        .asc = 0x3A,
                        .result = SAM_STAT_CHECK_CONDITION,
                },
                {
                        .sense = NOT_READY,
                        .asc = 0x3A,
                        .result = SAM_STAT_CHECK_CONDITION,
                },
                /* A device reset may happen repeatedly, so retry a lot */
                {
                        .sense = UNIT_ATTENTION,
                        .asc = 0x29,
                        .allowed = READ_CAPACITY_RETRIES_ON_RESET,
                        .result = SAM_STAT_CHECK_CONDITION,
                },
                /* Everything not matched above gets three retries */
                {
                        .result = SCMD_FAILURE_RESULT_ANY,
                        .allowed = 3,
                },
                {}
        };
        struct scsi_failures failures = {
                .failure_definitions = failure_defs,
        };
        const struct scsi_exec_args exec_args = {
                .sshdr = &sshdr,
                .failures = &failures,
        };
        int sense_valid = 0;
        int rc;
        sector_t last_lba;
        unsigned blksz;

        memset(buffer, 0, 8);

        rc = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, buffer,
                              8, SD_TIMEOUT, sdkp->max_retries,
                              &exec_args);
        if (rc) {
                /* Sense data is only examined for positive results */
                if (rc > 0) {
                        sense_valid = scsi_sense_valid(&sshdr);

                        if (media_not_present(sdkp, &sshdr))
                                return -ENODEV;
                }

                sd_print_result(sdkp, "Read Capacity(10) failed", rc);
                read_capacity_error(sdkp, sdp, &sshdr, sense_valid, rc);
                return -EINVAL;
        }

        /* Bytes 0-3: last logical block address, bytes 4-7: block length */
        last_lba = get_unaligned_be32(&buffer[0]);
        blksz = get_unaligned_be32(&buffer[4]);

        /*
         * Some buggy (usb cardreader) devices return an lba of 0xffffffff
         * when they want to report a size of 0 (with which they really mean
         * no media is present).
         */
        if (sdp->no_read_capacity_16 && last_lba == 0xffffffff)
                sdkp->capacity = 0;
        else
                sdkp->capacity = last_lba + 1;

        sdkp->physical_block_size = blksz;
        return blksz;
}

/*
 * Decide whether READ CAPACITY(16) should be attempted before
 * READ CAPACITY(10).  Returns 1 for "16 first", 0 for "10 first".
 */
static int sd_try_rc16_first(struct scsi_device *sdp)
{
        /* The host cannot carry 16-byte CDBs, or the device prefers RC10 */
        if (sdp->host->max_cmd_len < 16 || sdp->try_rc_10_first)
                return 0;

        /* Newer than SPC-2, or protection enabled: go for RC16 first */
        return sdp->scsi_level > SCSI_SPC_2 || scsi_device_protection(sdp);
}

/*
 * sd_read_capacity - read disk capacity and logical block size
 * @sdkp:   disk to query
 * @buffer: scratch buffer handed on to the READ CAPACITY helpers
 *
 * Uses READ CAPACITY(16) first when sd_try_rc16_first() says so, falling
 * back to READ CAPACITY(10) on errors other than -ENODEV.  Otherwise starts
 * with READ CAPACITY(10) and only moves on to READ CAPACITY(16) when the
 * recorded capacity exceeds 32 bits.  The resulting block sizes are applied
 * to the request queue; an unsupported logical block size zeroes the
 * capacity.  Returns early, without touching the queue, when no usable
 * answer was obtained.
 */
static void
sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer)
{
        int sector_size;
        struct scsi_device *sdp = sdkp->device;

        if (sd_try_rc16_first(sdp)) {
                sector_size = read_capacity_16(sdkp, sdp, buffer);
                /*
                 * NOTE(review): read_capacity_16() and read_capacity_10() as
                 * they appear in this file never return -EOVERFLOW, so the
                 * -EOVERFLOW checks in this function look unreachable.
                 */
                if (sector_size == -EOVERFLOW)
                        goto got_data;
                if (sector_size == -ENODEV)
                        return;
                /* Any other RC16 error: retry with READ CAPACITY(10) */
                if (sector_size < 0)
                        sector_size = read_capacity_10(sdkp, sdp, buffer);
                if (sector_size < 0)
                        return;
        } else {
                sector_size = read_capacity_10(sdkp, sdp, buffer);
                if (sector_size == -EOVERFLOW)
                        goto got_data;
                if (sector_size < 0)
                        return;
                /* RC10 reported the maximum 32-bit LBA: ask RC16 instead */
                if ((sizeof(sdkp->capacity) > 4) &&
                    (sdkp->capacity > 0xffffffffULL)) {
                        int old_sector_size = sector_size;
                        sd_printk(KERN_NOTICE, sdkp, "Very big device. "
                                        "Trying to use READ CAPACITY(16).\n");
                        sector_size = read_capacity_16(sdkp, sdp, buffer);
                        if (sector_size < 0) {
                                sd_printk(KERN_NOTICE, sdkp,
                                        "Using 0xffffffff as device size\n");
                                sdkp->capacity = 1 + (sector_t) 0xffffffff;
                                sector_size = old_sector_size;
                                /* Skips the capacity fix-up below */
                                goto got_data;
                        }
                        /* Remember that READ CAPACITY(16) succeeded */
                        sdp->try_rc_10_first = 0;
                }
        }

        /* Some devices are known to return the total number of blocks,
         * not the highest block number.  Some devices have versions
         * which do this and others which do not.  Some devices we might
         * suspect of doing this but we don't know for certain.
         *
         * If we know the reported capacity is wrong, decrement it.  If
         * we can only guess, then assume the number of blocks is even
         * (usually true but not always) and err on the side of lowering
         * the capacity.
         */
        if (sdp->fix_capacity ||
            (sdp->guess_capacity && (sdkp->capacity & 0x01))) {
                sd_printk(KERN_INFO, sdkp, "Adjusting the sector count "
                                "from its reported value: %llu\n",
                                (unsigned long long) sdkp->capacity);
                --sdkp->capacity;
        }

got_data:
        if (sector_size == 0) {
                sector_size = 512;
                sd_printk(KERN_NOTICE, sdkp, "Sector size 0 reported, "
                          "assuming 512.\n");
        }

        /* Only these four logical block sizes are accepted */
        if (sector_size != 512 &&
            sector_size != 1024 &&
            sector_size != 2048 &&
            sector_size != 4096) {
                sd_printk(KERN_NOTICE, sdkp, "Unsupported sector size %d.\n",
                          sector_size);
                /*
                 * The user might want to re-format the drive with
                 * a supported sectorsize.  Once this happens, it
                 * would be relatively trivial to set the thing up.
                 * For this reason, we leave the thing in the table.
                 */
                sdkp->capacity = 0;
                /*
                 * set a bogus sector size so the normal read/write
                 * logic in the block layer will eventually refuse any
                 * request on this device without tripping over power
                 * of two sector size assumptions
                 */
                sector_size = 512;
        }
        blk_queue_logical_block_size(sdp->request_queue, sector_size);
        blk_queue_physical_block_size(sdp->request_queue,
                                      sdkp->physical_block_size);
        sdkp->device->sector_size = sector_size;

        /* Capacities beyond 32 bits select the 16-byte READ/WRITE commands */
        if (sdkp->capacity > 0xffffffff)
                sdp->use_16_for_rw = 1;

}

/*
 * Log the disk capacity in both binary and decimal units, plus the physical
 * block size when it differs from the logical one.  Nothing is printed on a
 * rescan that left the capacity unchanged.
 */
static void
sd_print_capacity(struct scsi_disk *sdkp,
                  sector_t old_capacity)
{
        char binary_str[10], decimal_str[10];
        int blksz = sdkp->device->sector_size;

        if (old_capacity == sdkp->capacity && !sdkp->first_scan)
                return;

        string_get_size(sdkp->capacity, blksz,
                        STRING_UNITS_2, binary_str, sizeof(binary_str));
        string_get_size(sdkp->capacity, blksz,
                        STRING_UNITS_10, decimal_str, sizeof(decimal_str));

        sd_printk(KERN_NOTICE, sdkp,
                  "%llu %d-byte logical blocks: (%s/%s)\n",
                  (unsigned long long)sdkp->capacity,
                  blksz, decimal_str, binary_str);

        if (sdkp->physical_block_size == blksz)
                return;

        sd_printk(KERN_NOTICE, sdkp,
                  "%u-byte physical blocks\n",
                  sdkp->physical_block_size);
}

/* called with buffer of length 512 */
static inline int
sd_do_mode_sense(struct scsi_disk *sdkp, int dbd, int modepage,
                 unsigned char *buffer, int len, struct scsi_mode_data *data,
                 struct scsi_sense_hdr *sshdr)
{
        /*
         * If we must use MODE SENSE(10), make sure that the buffer length
         * is at least 8 bytes so that the mode sense header fits.
         */
        if (sdkp->device->use_10_for_ms && len < 8)
                len = 8;

        return scsi_mode_sense(sdkp->device, dbd, modepage, 0, buffer, len,
                               SD_TIMEOUT, sdkp->max_retries, data, sshdr);
}

/*
 * Determine the write protect setting via MODE SENSE, if possible, and
 * mirror it into the gendisk read-only flag.  Called only in
 * sd_revalidate_disk(), with a buffer of length SD_BUF_SIZE.
 */
static void
sd_read_write_protect_flag(struct scsi_disk *sdkp, unsigned char *buffer)
{
        struct scsi_device *sdp = sdkp->device;
        struct scsi_mode_data data;
        int prev_wp = sdkp->write_prot;
        int res;

        /* Start out writable; only a successful MODE SENSE changes that */
        set_disk_ro(sdkp->disk, 0);
        if (sdp->skip_ms_page_3f) {
                sd_first_printk(KERN_NOTICE, sdkp, "Assuming Write Enabled\n");
                return;
        }

        if (sdp->use_192_bytes_for_3f) {
                res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 192, &data, NULL);
        } else {
                /*
                 * Step 1: all pages (0x3F), but only 4 bytes.  Be careful:
                 * some devices hang when asked for more than they have.
                 */
                res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 4, &data, NULL);

                /*
                 * Step 2: page 0.  A device implementing only page 0 may
                 * answer a page 3F request with Sense Key 5: Illegal
                 * Request, Sense Code 24: Invalid field in CDB.
                 */
                if (res < 0)
                        res = sd_do_mode_sense(sdkp, 0, 0, buffer, 4, &data, NULL);

                /* Step 3: 255 bytes, as was done in earlier versions. */
                if (res < 0)
                        res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 255,
                                               &data, NULL);
        }

        if (res < 0) {
                sd_first_printk(KERN_WARNING, sdkp,
                          "Test WP failed, assume Write Enabled\n");
                return;
        }

        /* Bit 7 of the device-specific parameter is the WP flag */
        sdkp->write_prot = ((data.device_specific & 0x80) != 0);
        set_disk_ro(sdkp->disk, sdkp->write_prot);

        /* Only report on the first scan or when the setting changed */
        if (!sdkp->first_scan && prev_wp == sdkp->write_prot)
                return;

        sd_printk(KERN_NOTICE, sdkp, "Write Protect is %s\n",
                  sdkp->write_prot ? "on" : "off");
        sd_printk(KERN_DEBUG, sdkp, "Mode Sense: %4ph\n", buffer);
}

/*
 * sd_read_cache_type - called only from sd_revalidate_disk()
 * called with buffer of length SD_BUF_SIZE
 *
 * Reads the caching-related mode page with MODE SENSE and derives the WCE,
 * RCD and DPOFUA settings of @sdkp from it.  A short first request learns
 * the length of the mode data; a second request fetches the whole thing.
 * When the page cannot be obtained or parsed, the "defaults" path is taken:
 * WCE follows sdp->wce_default_on and RCD/DPOFUA are cleared.  Does nothing
 * when sdkp->cache_override is set.
 */
static void
sd_read_cache_type(struct scsi_disk *sdkp, unsigned char *buffer)
{
        int len = 0, res;
        struct scsi_device *sdp = sdkp->device;

        int dbd;
        int modepage;
        int first_len;
        struct scsi_mode_data data;
        struct scsi_sense_hdr sshdr;
        /* Previous values, used to decide whether to log a change */
        int old_wce = sdkp->WCE;
        int old_rcd = sdkp->RCD;
        int old_dpofua = sdkp->DPOFUA;


        if (sdkp->cache_override)
                return;

        /* Pick the mode page, DBD value and initial request length */
        first_len = 4;
        if (sdp->skip_ms_page_8) {
                if (sdp->type == TYPE_RBC)
                        goto defaults;
                else {
                        if (sdp->skip_ms_page_3f)
                                goto defaults;
                        modepage = 0x3F;
                        if (sdp->use_192_bytes_for_3f)
                                first_len = 192;
                        dbd = 0;
                }
        } else if (sdp->type == TYPE_RBC) {
                modepage = 6;
                dbd = 8;
        } else {
                modepage = 8;
                dbd = 0;
        }

        /* cautiously ask */
        res = sd_do_mode_sense(sdkp, dbd, modepage, buffer, first_len,
                        &data, &sshdr);

        if (res < 0)
                goto bad_sense;

        if (!data.header_length) {
                modepage = 6;
                /* Forces the second request below whenever len >= 3 */
                first_len = 0;
                sd_first_printk(KERN_ERR, sdkp,
                                "Missing header in MODE_SENSE response\n");
        }

        /* that went OK, now ask for the proper length */
        len = data.length;

        /*
         * We're only interested in the first three bytes, actually.
         * But the data cache page is defined for the first 20.
         */
        if (len < 3)
                goto bad_sense;
        else if (len > SD_BUF_SIZE) {
                sd_first_printk(KERN_NOTICE, sdkp, "Truncating mode parameter "
                          "data from %d to %d bytes\n", len, SD_BUF_SIZE);
                len = SD_BUF_SIZE;
        }
        if (modepage == 0x3F && sdp->use_192_bytes_for_3f)
                len = 192;

        /* Get the data */
        if (len > first_len)
                res = sd_do_mode_sense(sdkp, dbd, modepage, buffer, len,
                                &data, &sshdr);

        if (!res) {
                /* The mode pages follow the header and block descriptors */
                int offset = data.header_length + data.block_descriptor_length;

                while (offset < len) {
                        u8 page_code = buffer[offset] & 0x3F;
                        u8 spf       = buffer[offset] & 0x40;

                        if (page_code == 8 || page_code == 6) {
                                /* We're interested only in the first 3 bytes.
                                 */
                                if (len - offset <= 2) {
                                        sd_first_printk(KERN_ERR, sdkp,
                                                "Incomplete mode parameter "
                                                        "data\n");
                                        goto defaults;
                                } else {
                                        modepage = page_code;
                                        goto Page_found;
                                }
                        } else {
                                /* Go to the next page */
                                /*
                                 * With SPF set the page has a 4-byte header
                                 * and a 16-bit length, otherwise a 2-byte
                                 * header and an 8-bit length.
                                 */
                                if (spf && len - offset > 3)
                                        offset += 4 + (buffer[offset+2] << 8) +
                                                buffer[offset+3];
                                else if (!spf && len - offset > 1)
                                        offset += 2 + buffer[offset+1];
                                else {
                                        sd_first_printk(KERN_ERR, sdkp,
                                                        "Incomplete mode "
                                                        "parameter data\n");
                                        goto defaults;
                                }
                        }
                }

                sd_first_printk(KERN_WARNING, sdkp,
                                "No Caching mode page found\n");
                goto defaults;

        Page_found:
                /*
                 * Page 8: WCE is bit 2 and RCD is bit 0 of byte 2.  Page 6:
                 * a set bit 0 of byte 2 means the write cache is off.
                 */
                if (modepage == 8) {
                        sdkp->WCE = ((buffer[offset + 2] & 0x04) != 0);
                        sdkp->RCD = ((buffer[offset + 2] & 0x01) != 0);
                } else {
                        sdkp->WCE = ((buffer[offset + 2] & 0x01) == 0);
                        sdkp->RCD = 0;
                }

                sdkp->DPOFUA = (data.device_specific & 0x10) != 0;
                if (sdp->broken_fua) {
                        sd_first_printk(KERN_NOTICE, sdkp, "Disabling FUA\n");
                        sdkp->DPOFUA = 0;
                } else if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw &&
                           !sdkp->device->use_16_for_rw) {
                        sd_first_printk(KERN_NOTICE, sdkp,
                                  "Uses READ/WRITE(6), disabling FUA\n");
                        sdkp->DPOFUA = 0;
                }

                /* No cache flush allowed for write protected devices */
                if (sdkp->WCE && sdkp->write_prot)
                        sdkp->WCE = 0;

                if (sdkp->first_scan || old_wce != sdkp->WCE ||
                    old_rcd != sdkp->RCD || old_dpofua != sdkp->DPOFUA)
                        sd_printk(KERN_NOTICE, sdkp,
                                  "Write cache: %s, read cache: %s, %s\n",
                                  sdkp->WCE ? "enabled" : "disabled",
                                  sdkp->RCD ? "disabled" : "enabled",
                                  sdkp->DPOFUA ? "supports DPO and FUA"
                                  : "doesn't support DPO or FUA");

                return;
        }

        /* Reached on a MODE SENSE error or when the mode data is too short */
bad_sense:
        if (res == -EIO && scsi_sense_valid(&sshdr) &&
            sshdr.sense_key == ILLEGAL_REQUEST &&
            sshdr.asc == 0x24 && sshdr.ascq == 0x0)
                /* Invalid field in CDB */
                sd_first_printk(KERN_NOTICE, sdkp, "Cache data unavailable\n");
        else
                sd_first_printk(KERN_ERR, sdkp,
                                "Asking for cache data failed\n");

        /* Fall back to the per-device default cache assumption */
defaults:
        if (sdp->wce_default_on) {
                sd_first_printk(KERN_NOTICE, sdkp,
                                "Assuming drive cache: write back\n");
                sdkp->WCE = 1;
        } else {
                sd_first_printk(KERN_WARNING, sdkp,
                                "Assuming drive cache: write through\n");
                sdkp->WCE = 0;
        }
        sdkp->RCD = 0;
        sdkp->DPOFUA = 0;
}

/*
 * sd_is_perm_stream - ask the device whether a stream is permanent
 * @sdkp:      disk to query
 * @stream_id: stream identifier placed in the GET STREAM STATUS CDB
 *
 * Returns the PERM bit of the first returned stream status descriptor.
 * Returns false when the command fails or when the response is too short
 * to hold one descriptor.
 */
static bool sd_is_perm_stream(struct scsi_disk *sdkp, unsigned int stream_id)
{
        u8 cdb[16] = { SERVICE_ACTION_IN_16, SAI_GET_STREAM_STATUS };
        /*
         * Header followed by room for exactly one descriptor.  Zeroed so
         * that a short transfer is seen as "length 0" rather than as
         * whatever happened to be on the stack.
         */
        struct {
                struct scsi_stream_status_header h;
                struct scsi_stream_status s;
        } buf = {};
        struct scsi_device *sdev = sdkp->device;
        struct scsi_sense_hdr sshdr;
        const struct scsi_exec_args exec_args = {
                .sshdr = &sshdr,
        };
        int res;

        put_unaligned_be16(stream_id, &cdb[4]);
        put_unaligned_be32(sizeof(buf), &cdb[10]);

        res = scsi_execute_cmd(sdev, cdb, REQ_OP_DRV_IN, &buf, sizeof(buf),
                               SD_TIMEOUT, sdkp->max_retries, &exec_args);
        if (res < 0)
                return false;
        if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr))
                sd_print_sense_hdr(sdkp, &sshdr);
        if (res)
                return false;
        if (get_unaligned_be32(&buf.h.len) < sizeof(struct scsi_stream_status))
                return false;
        /*
         * Read the descriptor through the member that provides its storage
         * instead of indexing past the end of the header.
         */
        return buf.s.perm;
}

/*
 * sd_read_io_hints - count the permanent streams advertised by the device
 * @sdkp:   disk to query
 * @buffer: scratch buffer of SD_BUF_SIZE bytes
 *
 * Fetches mode page 0x0a subpage 0x05 and walks its I/O group descriptors.
 * The number of leading descriptors that are stream-enabled and reported as
 * permanent by sd_is_perm_stream() is stored in
 * sdkp->permanent_stream_count.  Does nothing for devices flagged with
 * BLIST_SKIP_IO_HINTS or when the MODE SENSE fails.
 */
static void sd_read_io_hints(struct scsi_disk *sdkp, unsigned char *buffer)
{
        struct scsi_device *sdp = sdkp->device;
        const struct scsi_io_group_descriptor *desc, *start, *end;
        u16 permanent_stream_count_old;
        struct scsi_sense_hdr sshdr;
        struct scsi_mode_data data;
        unsigned int resp_len;
        int res;

        if (sdp->sdev_bflags & BLIST_SKIP_IO_HINTS)
                return;

        res = scsi_mode_sense(sdp, /*dbd=*/0x8, /*modepage=*/0x0a,
                              /*subpage=*/0x05, buffer, SD_BUF_SIZE, SD_TIMEOUT,
                              sdkp->max_retries, &data, &sshdr);
        if (res < 0)
                return;

        /*
         * The length comes from the device and may exceed what was actually
         * transferred.  Never walk beyond the SD_BUF_SIZE bytes handed to
         * scsi_mode_sense().
         */
        resp_len = data.header_length + data.length;
        if (resp_len > SD_BUF_SIZE)
                resp_len = SD_BUF_SIZE;

        start = (void *)buffer + data.header_length + 16;
        end = (void *)buffer + ALIGN_DOWN(resp_len, sizeof(*end));
        /*
         * From "SBC-5 Constrained Streams with Data Lifetimes": Device servers
         * should assign the lowest numbered stream identifiers to permanent
         * streams.
         */
        for (desc = start; desc < end; desc++)
                if (!desc->st_enble || !sd_is_perm_stream(sdkp, desc - start))
                        break;
        permanent_stream_count_old = sdkp->permanent_stream_count;
        sdkp->permanent_stream_count = desc - start;
        if (sdkp->rscs && sdkp->permanent_stream_count < 2)
                sd_printk(KERN_INFO, sdkp,
                          "Unexpected: RSCS has been set and the permanent stream count is %u\n",
                          sdkp->permanent_stream_count);
        else if (sdkp->permanent_stream_count != permanent_stream_count_old)
                sd_printk(KERN_INFO, sdkp, "permanent stream count = %d\n",
                          sdkp->permanent_stream_count);
}

/*
 * Read the ATO bit from the Control mode page (0x0a).  The bit indicates
 * whether the DIF application tag is available for use by the operating
 * system.  Only done for TYPE_DISK / TYPE_ZBC devices that have a
 * protection type set; sdkp->ATO is set when the bit is found set.
 */
static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer)
{
        struct scsi_device *sdp = sdkp->device;
        struct scsi_sense_hdr sshdr;
        struct scsi_mode_data data;
        int page_off;
        int res;

        if ((sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) ||
            sdkp->protection_type == 0)
                return;

        res = scsi_mode_sense(sdp, 1, 0x0a, 0, buffer, 36, SD_TIMEOUT,
                              sdkp->max_retries, &data, &sshdr);

        if (res < 0 || !data.header_length || data.length < 6) {
                sd_first_printk(KERN_WARNING, sdkp,
                          "getting Control mode page failed, assume no ATO\n");

                if (res == -EIO && scsi_sense_valid(&sshdr))
                        sd_print_sense_hdr(sdkp, &sshdr);

                return;
        }

        /* The page itself follows the header and any block descriptors */
        page_off = data.header_length + data.block_descriptor_length;

        if ((buffer[page_off] & 0x3f) != 0x0a) {
                sd_first_printk(KERN_ERR, sdkp, "ATO Got wrong page\n");
                return;
        }

        /* ATO is bit 7 of byte 5 of the page */
        if (buffer[page_off + 5] & 0x80)
                sdkp->ATO = 1;
}

/**
 * sd_read_block_limits - Query disk device for preferred I/O sizes.
 * @sdkp: disk to query
 *
 * Parses the cached VPD page held in vpd_pgb0.  The transfer size hints are
 * always taken; the WRITE SAME / UNMAP limits are only present in the long
 * (>= 64 byte) form of the page, and the discard method is only chosen when
 * the device reported LBPME.
 */
static void sd_read_block_limits(struct scsi_disk *sdkp)
{
        unsigned int lba_count, desc_count;
        unsigned int discard_mode;
        struct scsi_vpd *vpd;

        rcu_read_lock();

        vpd = rcu_dereference(sdkp->device->vpd_pgb0);
        if (!vpd || vpd->len < 16)
                goto out;

        sdkp->min_xfer_blocks = get_unaligned_be16(&vpd->data[6]);
        sdkp->max_xfer_blocks = get_unaligned_be32(&vpd->data[8]);
        sdkp->opt_xfer_blocks = get_unaligned_be32(&vpd->data[12]);

        /* The remaining fields only exist in the long form of the page */
        if (vpd->len < 64)
                goto out;

        sdkp->max_ws_blocks = (u32)get_unaligned_be64(&vpd->data[36]);

        if (!sdkp->lbpme)
                goto out;

        lba_count = get_unaligned_be32(&vpd->data[20]);
        desc_count = get_unaligned_be32(&vpd->data[24]);

        if (lba_count && desc_count)
                sdkp->max_unmap_blocks = lba_count;

        sdkp->unmap_granularity = get_unaligned_be32(&vpd->data[28]);

        /* The top bit of byte 32 flags the alignment field as valid */
        if (vpd->data[32] & 0x80)
                sdkp->unmap_alignment =
                        get_unaligned_be32(&vpd->data[32]) & ~(1 << 31);

        if (!sdkp->lbpvpd)
                /* LBP VPD page not provided */
                discard_mode = sdkp->max_unmap_blocks ? SD_LBP_UNMAP :
                                                        SD_LBP_WS16;
        else if (sdkp->lbpu && sdkp->max_unmap_blocks)
                /* LBP VPD page tells us what to use */
                discard_mode = SD_LBP_UNMAP;
        else if (sdkp->lbpws)
                discard_mode = SD_LBP_WS16;
        else if (sdkp->lbpws10)
                discard_mode = SD_LBP_WS10;
        else
                discard_mode = SD_LBP_DISABLE;

        sd_config_discard(sdkp, discard_mode);

 out:
        rcu_read_unlock();
}

/*
 * Parse the Block Limits Extension VPD page (0xb7): record bit 0 of byte 5
 * in sdkp->rscs.  sdkp->rscs is left alone when the page is missing or too
 * short to contain that byte.
 */
static void sd_read_block_limits_ext(struct scsi_disk *sdkp)
{
        struct scsi_vpd *vpd;

        rcu_read_lock();
        vpd = rcu_dereference(sdkp->device->vpd_pgb7);
        /* data[5] is read, so at least 6 bytes must be present */
        if (vpd && vpd->len >= 6)
                sdkp->rscs = vpd->data[5] & 1;
        rcu_read_unlock();
}

/**
 * sd_read_block_characteristics - Query block dev. characteristics
 * @sdkp: disk to query
 *
 * Parses the cached VPD page held in vpd_pgb1: a rotation rate of 1 marks
 * the queue non-rotational, and the ZONED field is stored in sdkp->zoned.
 * With CONFIG_BLK_DEV_ZONED, TYPE_ZBC devices are set up as zoned disks.
 * Does nothing when the page is missing or too short.
 */
static void sd_read_block_characteristics(struct scsi_disk *sdkp)
{
        struct request_queue *q = sdkp->disk->queue;
        struct scsi_vpd *vpd;
        u16 rot;

        rcu_read_lock();
        vpd = rcu_dereference(sdkp->device->vpd_pgb1);

        /* data[8] is read below, so the page must hold at least 9 bytes */
        if (!vpd || vpd->len <= 8) {
                rcu_read_unlock();
                return;
        }

        rot = get_unaligned_be16(&vpd->data[4]);
        sdkp->zoned = (vpd->data[8] >> 4) & 3;
        rcu_read_unlock();

        if (rot == 1) {
                blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
                blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
        }

#ifdef CONFIG_BLK_DEV_ZONED /* sd_probe rejects ZBD devices early otherwise */
        if (sdkp->device->type == TYPE_ZBC) {
                /*
                 * Host-managed.
                 */
                disk_set_zoned(sdkp->disk);

                /*
                 * Per ZBC and ZAC specifications, writes in sequential write
                 * required zones of host-managed devices must be aligned to
                 * the device physical block size.
                 */
                blk_queue_zone_write_granularity(q, sdkp->physical_block_size);
        } else {
                /*
                 * Host-aware devices are treated as conventional.
                 */
                WARN_ON_ONCE(blk_queue_is_zoned(q));
        }
#endif /* CONFIG_BLK_DEV_ZONED */

        /* The informational messages below are only logged on first scan */
        if (!sdkp->first_scan)
                return;

        if (blk_queue_is_zoned(q))
                sd_printk(KERN_NOTICE, sdkp, "Host-managed zoned block device\n");
        else if (sdkp->zoned == 1)
                sd_printk(KERN_NOTICE, sdkp, "Host-aware SMR disk used as regular disk\n");
        else if (sdkp->zoned == 2)
                sd_printk(KERN_NOTICE, sdkp, "Drive-managed SMR disk\n");
}

/**
 * sd_read_block_provisioning - Query provisioning VPD page
 * @sdkp: disk to query
 */
static void sd_read_block_provisioning(struct scsi_disk *sdkp)
{
        struct scsi_vpd *vpd;

        if (sdkp->lbpme == 0)
                return;

        rcu_read_lock();
        vpd = rcu_dereference(sdkp->device->vpd_pgb2);

        if (!vpd || vpd->len < 8) {
                rcu_read_unlock();
                return;
        }

        sdkp->lbpvpd        = 1;
        sdkp->lbpu        = (vpd->data[5] >> 7) & 1; /* UNMAP */
        sdkp->lbpws        = (vpd->data[5] >> 6) & 1; /* WRITE SAME(16) w/ UNMAP */
        sdkp->lbpws10        = (vpd->data[5] >> 5) & 1; /* WRITE SAME(10) w/ UNMAP */
        rcu_read_unlock();
}

/*
 * Work out which WRITE SAME variants the device supports, using REPORT
 * SUPPORTED OPERATION CODES.  Sets sdkp->ws16 / sdkp->ws10 accordingly, and
 * flags the device no_write_same when the host forbids it or when the
 * device looks like a SAT device without opcode reporting.
 */
static void sd_read_write_same(struct scsi_disk *sdkp, unsigned char *buffer)
{
        struct scsi_device *sdev = sdkp->device;
        int rc;

        if (sdev->host->no_write_same) {
                sdev->no_write_same = 1;

                return;
        }

        /* Probe with INQUIRY to see whether opcode reporting works at all */
        rc = scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, INQUIRY, 0);
        if (rc < 0) {
                struct scsi_vpd *ata_info;

                sdev->no_report_opcodes = 1;

                /*
                 * Disable WRITE SAME if REPORT SUPPORTED OPERATION CODES is
                 * unsupported and the device has an ATA Information VPD
                 * page (SAT).
                 */
                rcu_read_lock();
                ata_info = rcu_dereference(sdev->vpd_pg89);
                if (ata_info)
                        sdev->no_write_same = 1;
                rcu_read_unlock();
        }

        rc = scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME_16, 0);
        if (rc == 1)
                sdkp->ws16 = 1;

        rc = scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME, 0);
        if (rc == 1)
                sdkp->ws10 = 1;
}

/*
 * Set sdkp->security when the device has security_supported set and
 * scsi_report_opcode() returns 1 for both SECURITY PROTOCOL IN and
 * SECURITY PROTOCOL OUT.  The OUT opcode is only queried if IN succeeded.
 * @buffer is SD_BUF_SIZE bytes of scratch space for the queries.
 */
static void sd_read_security(struct scsi_disk *sdkp, unsigned char *buffer)
{
        struct scsi_device *sdp = sdkp->device;

        if (!sdp->security_supported)
                return;

        if (scsi_report_opcode(sdp, buffer, SD_BUF_SIZE,
                               SECURITY_PROTOCOL_IN, 0) != 1)
                return;

        if (scsi_report_opcode(sdp, buffer, SD_BUF_SIZE,
                               SECURITY_PROTOCOL_OUT, 0) != 1)
                return;

        sdkp->security = 1;
}

/*
 * Read a big-endian 64-bit logical block value from @buf (which may be
 * unaligned) and convert it with logical_to_sectors() for @sdkp's device.
 */
static inline sector_t sd64_to_sectors(struct scsi_disk *sdkp, u8 *buf)
{
        u64 blocks = get_unaligned_be64(buf);

        return logical_to_sectors(sdkp->device, blocks);
}

/**
 * sd_read_cpr - Query concurrent positioning ranges
 * @sdkp:        disk to query
 *
 * Reads VPD page 0xb9 and hands the ranges it describes to the block layer
 * through disk_set_independent_access_ranges().  A NULL range set is passed
 * when the capacity is zero, the page cannot be read or is malformed, only
 * a single range is reported, or the range allocation fails.  Nothing at
 * all is done during the first scan.
 */
static void sd_read_cpr(struct scsi_disk *sdkp)
{
        struct blk_independent_access_ranges *ranges = NULL;
        unsigned char *page = NULL;
        unsigned int nr_cpr = 0;
        int idx, page_len, alloc_len;
        u8 *entry;

        /*
         * We need to have the capacity set first for the block layer to be
         * able to check the ranges.
         */
        if (sdkp->first_scan)
                return;

        if (!sdkp->capacity)
                goto out;

        /*
         * Concurrent Positioning Ranges VPD: there can be at most 256 ranges,
         * leading to a maximum page size of 64 + 256*32 bytes.
         */
        alloc_len = 64 + 256*32;
        page = kmalloc(alloc_len, GFP_KERNEL);
        if (!page)
                goto out;
        if (scsi_get_vpd_page(sdkp->device, 0xb9, page, alloc_len))
                goto out;

        /* We must have at least a 64B header and one 32B range descriptor */
        page_len = get_unaligned_be16(&page[2]) + 4;
        if (page_len > alloc_len || page_len < 64 + 32 || (page_len & 31)) {
                sd_printk(KERN_ERR, sdkp,
                          "Invalid Concurrent Positioning Ranges VPD page\n");
                goto out;
        }

        /* A single range is not registered; neither is a failed allocation. */
        nr_cpr = (page_len - 64) / 32;
        if (nr_cpr != 1)
                ranges = disk_alloc_independent_access_ranges(sdkp->disk,
                                                              nr_cpr);
        if (!ranges) {
                nr_cpr = 0;
                goto out;
        }

        entry = &page[64];
        for (idx = 0; idx < nr_cpr; idx++, entry += 32) {
                /* Descriptors must be numbered consecutively from zero. */
                if (entry[0] != idx) {
                        sd_printk(KERN_ERR, sdkp,
                                "Invalid Concurrent Positioning Range number\n");
                        nr_cpr = 0;
                        break;
                }

                ranges->ia_range[idx].sector =
                        sd64_to_sectors(sdkp, entry + 8);
                ranges->ia_range[idx].nr_sectors =
                        sd64_to_sectors(sdkp, entry + 16);
        }

out:
        disk_set_independent_access_ranges(sdkp->disk, ranges);
        if (nr_cpr && sdkp->nr_actuators != nr_cpr) {
                sd_printk(KERN_NOTICE, sdkp,
                          "%u concurrent positioning ranges\n", nr_cpr);
                sdkp->nr_actuators = nr_cpr;
        }

        kfree(page);
}

/*
 * Check the device's preferred minimum I/O size.
 *
 * Returns true when sdkp->min_xfer_blocks is non-zero and, converted to
 * bytes, is a multiple of the physical block size.  A misaligned value is
 * reported and reset to 0 before returning false.
 */
static bool sd_validate_min_xfer_size(struct scsi_disk *sdkp)
{
        unsigned int bytes;

        if (!sdkp->min_xfer_blocks)
                return false;

        bytes = logical_to_bytes(sdkp->device, sdkp->min_xfer_blocks);

        if ((bytes & (sdkp->physical_block_size - 1)) == 0) {
                sd_first_printk(KERN_INFO, sdkp,
                                "Preferred minimum I/O size %u bytes\n",
                                bytes);
                return true;
        }

        sd_first_printk(KERN_WARNING, sdkp,
                        "Preferred minimum I/O size %u bytes not a "
                        "multiple of physical block size (%u bytes)\n",
                        bytes, sdkp->physical_block_size);
        sdkp->min_xfer_blocks = 0;
        return false;
}

/*
 * Determine the device's preferred I/O size for reads and writes
 * unless the reported value is unreasonably small, large, not a
 * multiple of the physical block size, or simply garbage.
 *
 * @dev_max is the largest transfer, in logical blocks, the device accepts.
 * Returns true if sdkp->opt_xfer_blocks passed every check; each rejection
 * other than a zero value is logged through sd_first_printk().
 */
static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp,
                                      unsigned int dev_max)
{
        struct scsi_device *sdp = sdkp->device;
        unsigned int opt_bytes, min_bytes;

        if (!sdkp->opt_xfer_blocks)
                return false;

        opt_bytes = logical_to_bytes(sdp, sdkp->opt_xfer_blocks);
        min_bytes = logical_to_bytes(sdp, sdkp->min_xfer_blocks);

        if (sdkp->opt_xfer_blocks > dev_max) {
                sd_first_printk(KERN_WARNING, sdkp,
                                "Optimal transfer size %u logical blocks "
                                "> dev_max (%u logical blocks)\n",
                                sdkp->opt_xfer_blocks, dev_max);
                return false;
        }

        if (sdkp->opt_xfer_blocks > SD_DEF_XFER_BLOCKS) {
                sd_first_printk(KERN_WARNING, sdkp,
                                "Optimal transfer size %u logical blocks "
                                "> sd driver limit (%u logical blocks)\n",
                                sdkp->opt_xfer_blocks, SD_DEF_XFER_BLOCKS);
                return false;
        }

        if (opt_bytes < PAGE_SIZE) {
                sd_first_printk(KERN_WARNING, sdkp,
                                "Optimal transfer size %u bytes < "
                                "PAGE_SIZE (%u bytes)\n",
                                opt_bytes, (unsigned int)PAGE_SIZE);
                return false;
        }

        /* Only meaningful when a preferred minimum size is known. */
        if (min_bytes && (opt_bytes % min_bytes) != 0) {
                sd_first_printk(KERN_WARNING, sdkp,
                                "Optimal transfer size %u bytes not a "
                                "multiple of preferred minimum block "
                                "size (%u bytes)\n",
                                opt_bytes, min_bytes);
                return false;
        }

        if ((opt_bytes & (sdkp->physical_block_size - 1)) != 0) {
                sd_first_printk(KERN_WARNING, sdkp,
                                "Optimal transfer size %u bytes not a "
                                "multiple of physical block size (%u bytes)\n",
                                opt_bytes, sdkp->physical_block_size);
                return false;
        }

        sd_first_printk(KERN_INFO, sdkp, "Optimal transfer size %u bytes\n",
                        opt_bytes);
        return true;
}

/*
 * Issue a one-block read of logical block 0 and discard the data.
 *
 * READ(16) is used when the device has use_16_for_rw set, READ(10)
 * otherwise.  The result of the command is ignored, and the function
 * returns silently if the bounce buffer cannot be allocated.
 */
static void sd_read_block_zero(struct scsi_disk *sdkp)
{
        struct scsi_device *sdp = sdkp->device;
        unsigned int len = sdp->sector_size;
        u8 cdb[16] = { };
        u8 *data;

        data = kmalloc(len, GFP_KERNEL);
        if (!data)
                return;

        if (!sdp->use_16_for_rw) {
                cdb[0] = READ_10;
                put_unaligned_be32(0, &cdb[2]); /* Logical block address 0 */
                put_unaligned_be16(1, &cdb[7]); /* Transfer 1 logical block */
        } else {
                cdb[0] = READ_16;
                put_unaligned_be64(0, &cdb[2]); /* Logical block address 0 */
                put_unaligned_be32(1, &cdb[10]);/* Transfer 1 logical block */
        }

        scsi_execute_cmd(sdp, cdb, REQ_OP_DRV_IN, data, len,
                         SD_TIMEOUT, sdkp->max_retries, NULL);
        kfree(data);
}

/**
 *        sd_revalidate_disk - called the first time a new disk is seen,
 *        performs disk spin up, read_capacity, etc.
 *        @disk: struct gendisk we care about
 *
 *        Probes the device (only when media is present), then derives the
 *        queue limits (max_dev_sectors, io_min, io_opt, max_sectors) from
 *        what was found, clears first_scan and publishes the capacity.
 *
 *        Always returns 0.  If the device is offline or the scratch buffer
 *        cannot be allocated, nothing is probed and nothing is updated.
 **/
static int sd_revalidate_disk(struct gendisk *disk)
{
        struct scsi_disk *sdkp = scsi_disk(disk);
        struct scsi_device *sdp = sdkp->device;
        struct request_queue *q = sdkp->disk->queue;
        /* Remembered so sd_print_capacity() can compare against the new value. */
        sector_t old_capacity = sdkp->capacity;
        unsigned char *buffer;
        unsigned int dev_max, rw_max;

        SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp,
                                      "sd_revalidate_disk\n"));

        /*
         * If the device is offline, don't try and read capacity or any
         * of the other niceties.
         */
        if (!scsi_device_online(sdp))
                goto out;

        /* Scratch buffer shared by the probing helpers below. */
        buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL);
        if (!buffer) {
                sd_printk(KERN_WARNING, sdkp, "sd_revalidate_disk: Memory "
                          "allocation failure.\n");
                goto out;
        }

        sd_spinup_disk(sdkp);

        /*
         * Without media there is no reason to ask; moreover, some devices
         * react badly if we do.
         */
        if (sdkp->media_present) {
                sd_read_capacity(sdkp, buffer);
                /*
                 * Some USB/UAS devices return generic values for mode pages
                 * until the media has been accessed. Trigger a READ operation
                 * to force the device to populate mode pages.
                 */
                if (sdp->read_before_ms)
                        sd_read_block_zero(sdkp);
                /*
                 * set the default to rotational.  All non-rotational devices
                 * support the block characteristics VPD page, which will
                 * cause this to be updated correctly and any device which
                 * doesn't support it should be treated as rotational.
                 */
                blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
                blk_queue_flag_set(QUEUE_FLAG_ADD_RANDOM, q);

                if (scsi_device_supports_vpd(sdp)) {
                        sd_read_block_provisioning(sdkp);
                        sd_read_block_limits(sdkp);
                        sd_read_block_limits_ext(sdkp);
                        sd_read_block_characteristics(sdkp);
                        sd_zbc_read_zones(sdkp, buffer);
                        sd_read_cpr(sdkp);
                }

                sd_print_capacity(sdkp, old_capacity);

                sd_read_write_protect_flag(sdkp, buffer);
                sd_read_cache_type(sdkp, buffer);
                sd_read_io_hints(sdkp, buffer);
                sd_read_app_tag_own(sdkp, buffer);
                sd_read_write_same(sdkp, buffer);
                sd_read_security(sdkp, buffer);
                sd_config_protection(sdkp);
        }

        /*
         * We now have all cache related info, determine how we deal
         * with flush requests.
         */
        sd_set_flush_flag(sdkp);

        /* Initial block count limit based on CDB TRANSFER LENGTH field size. */
        dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS;

        /* Some devices report a maximum block count for READ/WRITE requests. */
        dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks);
        q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max);

        /* An unusable preferred minimum size resets io_min to 0. */
        if (sd_validate_min_xfer_size(sdkp))
                blk_queue_io_min(sdkp->disk->queue,
                                 logical_to_bytes(sdp, sdkp->min_xfer_blocks));
        else
                blk_queue_io_min(sdkp->disk->queue, 0);

        /*
         * A valid optimal transfer size becomes both io_opt and the default
         * request size; otherwise fall back to the device maximum capped by
         * BLK_DEF_MAX_SECTORS_CAP.
         */
        if (sd_validate_opt_xfer_size(sdkp, dev_max)) {
                q->limits.io_opt = logical_to_bytes(sdp, sdkp->opt_xfer_blocks);
                rw_max = logical_to_sectors(sdp, sdkp->opt_xfer_blocks);
        } else {
                q->limits.io_opt = 0;
                rw_max = min_not_zero(logical_to_sectors(sdp, dev_max),
                                      (sector_t)BLK_DEF_MAX_SECTORS_CAP);
        }

        /*
         * Limit default to SCSI host optimal sector limit if set. There may be
         * an impact on performance for when the size of a request exceeds this
         * host limit.
         */
        rw_max = min_not_zero(rw_max, sdp->host->opt_sectors);

        /* Do not exceed controller limit */
        rw_max = min(rw_max, queue_max_hw_sectors(q));

        /*
         * Only update max_sectors if previously unset or if the current value
         * exceeds the capabilities of the hardware.
         */
        if (sdkp->first_scan ||
            q->limits.max_sectors > q->limits.max_dev_sectors ||
            q->limits.max_sectors > q->limits.max_hw_sectors) {
                q->limits.max_sectors = rw_max;
                q->limits.max_user_sectors = rw_max;
        }

        /* From here on, later calls are treated as revalidations. */
        sdkp->first_scan = 0;

        set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity));
        sd_config_write_same(sdkp);
        kfree(buffer);

        /*
         * For a zoned drive, revalidating the zones can be done only once
         * the gendisk capacity is set. So if this fails, set back the gendisk
         * capacity to 0.
         */
        if (sd_zbc_revalidate_zones(sdkp))
                set_capacity_and_notify(disk, 0);

 out:
        return 0;
}

/**
 *        sd_unlock_native_capacity - unlock native capacity
 *        @disk: struct gendisk to set capacity for
 *
 *        Block layer calls this function if it detects that partitions
 *        on @disk reach beyond the end of the device.  If the SCSI host
 *        template provides an ->unlock_native_capacity() method, it is
 *        called with the underlying SCSI device so that it can adjust the
 *        device capacity; otherwise this is a no-op.
 *
 *        CONTEXT:
 *        Defined by block layer.  Might sleep.
 */
static void sd_unlock_native_capacity(struct gendisk *disk)
{
        struct scsi_device *sdp = scsi_disk(disk)->device;

        if (!sdp->host->hostt->unlock_native_capacity)
                return;

        sdp->host->hostt->unlock_native_capacity(sdp);
}

/**
 *        sd_format_disk_name - format disk name
 *        @prefix: name prefix - ie. "sd" for SCSI disks
 *        @index: index of the disk to format name for (must be >= 0)
 *        @buf: output buffer
 *        @buflen: length of the output buffer
 *
 *        SCSI disk names starts at sda.  The 26th device is sdz and the
 *        27th is sdaa.  The last one for two lettered suffix is sdzz
 *        which is followed by sdaaa.
 *
 *        This is basically 26 base counting with one extra 'nil' entry
 *        at the beginning from the second digit on and can be
 *        determined using similar method as 26 base conversion with the
 *        index shifted -1 after each digit is computed.
 *
 *        The suffix is generated right-to-left at the end of @buf and then
 *        moved down behind the prefix.
 *
 *        CONTEXT:
 *        Don't care.
 *
 *        RETURNS:
 *        0 on success, -EINVAL if @index is negative or if @buf is too
 *        small to hold the prefix, the suffix and the terminating NUL.
 */
static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen)
{
        const int base = 'z' - 'a' + 1;
        const size_t prefix_len = strlen(prefix);
        char *begin, *end, *p;

        /*
         * A negative index would yield characters outside 'a'..'z', and a
         * buffer that cannot even hold the prefix would make the backward
         * fill below start in front of @begin and run out of bounds.
         */
        if (index < 0 || buflen <= 0 || (size_t)buflen <= prefix_len)
                return -EINVAL;

        begin = buf + prefix_len;
        end = buf + buflen;

        p = end - 1;
        *p = '\0';
        do {
                /* No room left between the prefix and the digits so far. */
                if (p == begin)
                        return -EINVAL;
                *--p = 'a' + (index % base);
                index = (index / base) - 1;
        } while (index >= 0);

        /* Slide suffix plus NUL down to follow the prefix directly. */
        memmove(begin, p, end - p);
        memcpy(buf, prefix, prefix_len);

        return 0;
}

/**
 *        sd_probe - called during driver initialization and whenever a
 *        new scsi device is attached to the system. It is called once
 *        for each scsi device (not just disks) present.
 *        @dev: pointer to device object
 *
 *        Returns 0 if successful, or a negative errno: -ENODEV when the
 *        device type is not handled by this driver (e.g. scanner) or is a
 *        ZBC device without CONFIG_BLK_DEV_ZONED, -ENOMEM on allocation
 *        failure, or the error reported by sd_format_disk_name(),
 *        device_add() or device_add_disk().
 *
 *        Note: this function is invoked from the scsi mid-level.
 *        This function sets up the mapping between a given
 *        <host,channel,id,lun> (found in sdp) and new device name
 *        (e.g. /dev/sda). More precisely it is the block device major
 *        and minor number that is chosen here.
 *
 *        Assume sd_probe is not re-entrant (for time being)
 *        Also think about sd_probe() and sd_remove() running coincidentally.
 **/
static int sd_probe(struct device *dev)
{
        struct scsi_device *sdp = to_scsi_device(dev);
        struct scsi_disk *sdkp;
        struct gendisk *gd;
        int index;
        int error;

        /* Balanced by scsi_autopm_put_device() on every exit path. */
        scsi_autopm_get_device(sdp);
        error = -ENODEV;
        if (sdp->type != TYPE_DISK &&
            sdp->type != TYPE_ZBC &&
            sdp->type != TYPE_MOD &&
            sdp->type != TYPE_RBC)
                goto out;

        if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && sdp->type == TYPE_ZBC) {
                sdev_printk(KERN_WARNING, sdp,
                            "Unsupported ZBC host-managed device.\n");
                goto out;
        }

        SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp,
                                        "sd_probe\n"));

        error = -ENOMEM;
        sdkp = kzalloc(sizeof(*sdkp), GFP_KERNEL);
        if (!sdkp)
                goto out;

        gd = blk_mq_alloc_disk_for_queue(sdp->request_queue,
                                         &sd_bio_compl_lkclass);
        if (!gd)
                goto out_free;

        index = ida_alloc(&sd_index_ida, GFP_KERNEL);
        if (index < 0) {
                sdev_printk(KERN_WARNING, sdp, "sd_probe: memory exhausted.\n");
                goto out_put;
        }

        error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN);
        if (error) {
                sdev_printk(KERN_WARNING, sdp, "SCSI disk (sd) name length exceeded.\n");
                goto out_free_index;
        }

        sdkp->device = sdp;
        sdkp->disk = gd;
        sdkp->index = index;
        sdkp->max_retries = SD_MAX_RETRIES;
        atomic_set(&sdkp->openers, 0);
        atomic_set(&sdkp->device->ioerr_cnt, 0);

        /* Only install a default timeout if none has been set yet. */
        if (!sdp->request_queue->rq_timeout) {
                if (sdp->type != TYPE_MOD)
                        blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT);
                else
                        blk_queue_rq_timeout(sdp->request_queue,
                                             SD_MOD_TIMEOUT);
        }

        /*
         * Once disk_dev is initialized, sdkp (and the index) must be
         * released through put_device(), not through the out_* labels.
         */
        device_initialize(&sdkp->disk_dev);
        sdkp->disk_dev.parent = get_device(dev);
        sdkp->disk_dev.class = &sd_disk_class;
        dev_set_name(&sdkp->disk_dev, "%s", dev_name(dev));

        error = device_add(&sdkp->disk_dev);
        if (error) {
                /*
                 * scsi_disk_release() does not drop the gendisk reference,
                 * so put it here as well or the gendisk is leaked.  sdkp
                 * must not be touched after put_device().
                 */
                put_device(&sdkp->disk_dev);
                put_disk(gd);
                goto out;
        }

        dev_set_drvdata(dev, sdkp);

        gd->major = sd_major((index & 0xf0) >> 4);
        gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
        gd->minors = SD_MINORS;

        gd->fops = &sd_fops;
        gd->private_data = sdkp;

        /* defaults, until the device tells us otherwise */
        sdp->sector_size = 512;
        sdkp->capacity = 0;
        sdkp->media_present = 1;
        sdkp->write_prot = 0;
        sdkp->cache_override = 0;
        sdkp->WCE = 0;
        sdkp->RCD = 0;
        sdkp->ATO = 0;
        sdkp->first_scan = 1;
        sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS;

        sd_revalidate_disk(gd);

        if (sdp->removable) {
                gd->flags |= GENHD_FL_REMOVABLE;
                gd->events |= DISK_EVENT_MEDIA_CHANGE;
                gd->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT;
        }

        blk_pm_runtime_init(sdp->request_queue, dev);
        if (sdp->rpm_autosuspend) {
                pm_runtime_set_autosuspend_delay(dev,
                        sdp->host->rpm_autosuspend_delay);
        }

        error = device_add_disk(dev, gd, NULL);
        if (error) {
                device_unregister(&sdkp->disk_dev);
                put_disk(gd);
                goto out;
        }

        if (sdkp->security) {
                sdkp->opal_dev = init_opal_dev(sdkp, &sd_sec_submit);
                if (sdkp->opal_dev)
                        sd_printk(KERN_NOTICE, sdkp, "supports TCG Opal\n");
        }

        sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
                  sdp->removable ? "removable " : "");
        scsi_autopm_put_device(sdp);

        return 0;

 out_free_index:
        ida_free(&sd_index_ida, index);
 out_put:
        put_disk(gd);
 out_free:
        kfree(sdkp);
 out:
        scsi_autopm_put_device(sdp);
        return error;
}

/**
 *        sd_remove - called whenever a scsi disk (previously recognized by
 *        sd_probe) is detached from the system. It is called (potentially
 *        multiple times) during sd module unload.
 *        @dev: pointer to device object
 *
 *        Removes the disk_dev class device and the gendisk, runs
 *        sd_shutdown() unless the disk is already marked suspended, and
 *        finally drops the gendisk reference.  Always returns 0.
 *
 *        Note: this function is invoked from the scsi mid-level.
 *        This function potentially frees up a device name (e.g. /dev/sdc)
 *        that could be re-used by a subsequent sd_probe().
 *        This function is not called when the built-in sd driver is "exit-ed".
 **/
static int sd_remove(struct device *dev)
{
        struct scsi_disk *sdkp = dev_get_drvdata(dev);
        struct gendisk *gd = sdkp->disk;

        scsi_autopm_get_device(sdkp->device);

        device_del(&sdkp->disk_dev);
        del_gendisk(gd);

        if (!sdkp->suspended)
                sd_shutdown(dev);

        put_disk(gd);

        return 0;
}

static void scsi_disk_release(struct device *dev)
{
        struct scsi_disk *sdkp = to_scsi_disk(dev);

        ida_free(&sd_index_ida, sdkp->index);
        put_device(&sdkp->device->sdev_gendev);
        free_opal_dev(sdkp->opal_dev);

        kfree(sdkp);
}

/*
 * Send START STOP UNIT to the disk.
 *
 * @start non-zero sets the START bit; when the device has
 * start_stop_pwr_cond set, a power condition is placed in the upper nibble
 * of byte 4 as well (1 when starting, 3 when stopping).
 *
 * Returns 0 on success or when the failure's sense data carries ASC 0x3a,
 * -ENODEV if the device is offline, and -EIO for any other failure.
 */
static int sd_start_stop_device(struct scsi_disk *sdkp, int start)
{
        struct scsi_device *sdp = sdkp->device;
        struct scsi_sense_hdr sshdr;
        const struct scsi_exec_args exec_args = {
                .sshdr = &sshdr,
                .req_flags = BLK_MQ_REQ_PM,
        };
        unsigned char cdb[6] = { START_STOP };        /* START_VALID */
        int res;

        if (!scsi_device_online(sdp))
                return -ENODEV;

        if (start)
                cdb[4] |= 1;        /* START */

        if (sdp->start_stop_pwr_cond)
                cdb[4] |= (start ? 1 : 3) << 4;        /* Active or Standby */

        res = scsi_execute_cmd(sdp, cdb, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT,
                               sdkp->max_retries, &exec_args);
        if (!res)
                return 0;

        sd_print_result(sdkp, "Start/Stop Unit failed", res);
        if (res > 0 && scsi_sense_valid(&sshdr)) {
                sd_print_sense_hdr(sdkp, &sshdr);
                /* 0x3a is medium not present */
                if (sshdr.asc == 0x3a)
                        return 0;
        }

        /* SCSI error codes must not go to the generic layer */
        return -EIO;
}

/*
 * Send a SYNCHRONIZE CACHE instruction down to the device through
 * the normal SCSI command structure.  Wait for the command to
 * complete.
 */
static void sd_shutdown(struct device *dev)
{
        struct scsi_disk *sdkp = dev_get_drvdata(dev);

        if (!sdkp)
                return;         /* this can happen */

        if (pm_runtime_suspended(dev))
                return;

        if (sdkp->WCE && sdkp->media_present) {
                sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n");
                sd_sync_cache(sdkp);
        }

        if ((system_state != SYSTEM_RESTART &&
             sdkp->device->manage_system_start_stop) ||
            (system_state == SYSTEM_POWER_OFF &&
             sdkp->device->manage_shutdown)) {
                sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n");
                sd_start_stop_device(sdkp, 0);
        }
}

/*
 * Tell whether the driver should issue START STOP UNIT for @sdev: governed
 * by manage_runtime_start_stop for runtime PM transitions and by
 * manage_system_start_stop for system-wide ones.
 */
static inline bool sd_do_start_stop(struct scsi_device *sdev, bool runtime)
{
        if (runtime)
                return sdev->manage_runtime_start_stop;

        return sdev->manage_system_start_stop;
}

/*
 * Common body of system and runtime suspend.
 *
 * Flushes the write cache (when enabled and media is present) and, if
 * sd_do_start_stop() says so, stops the disk.  On success the disk is
 * marked suspended and 0 is returned.  A cache flush error is returned to
 * the caller, except -ENODEV which is treated as success without marking
 * the disk suspended.  A failure to stop the disk is only reported for
 * runtime suspend.
 */
static int sd_suspend_common(struct device *dev, bool runtime)
{
        struct scsi_disk *sdkp = dev_get_drvdata(dev);
        struct scsi_device *sdp;
        int err;

        if (!sdkp)        /* E.g.: runtime suspend following sd_remove() */
                return 0;

        sdp = sdkp->device;

        if (sdkp->WCE && sdkp->media_present) {
                if (!sdp->silence_suspend)
                        sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n");
                err = sd_sync_cache(sdkp);
                /* ignore OFFLINE device */
                if (err == -ENODEV)
                        return 0;
                if (err)
                        return err;
        }

        if (sd_do_start_stop(sdp, runtime)) {
                if (!sdp->silence_suspend)
                        sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n");
                err = sd_start_stop_device(sdkp, 0);
                /* an error is not worth aborting a system sleep */
                if (runtime && err)
                        return err;
        }

        sdkp->suspended = true;
        return 0;
}

/*
 * System suspend/poweroff callback: nothing to do for a device that is
 * already runtime suspended, otherwise run the common suspend path.
 */
static int sd_suspend_system(struct device *dev)
{
        return pm_runtime_suspended(dev) ? 0 : sd_suspend_common(dev, false);
}

static int sd_suspend_runtime(struct device *dev)
{
        return sd_suspend_common(dev, true);
}

/*
 * Log the disk start and re-unlock the OPAL device state after a suspend.
 * Returns 0 on success, -EIO if opal_unlock_from_suspend() reports failure.
 */
static int sd_resume(struct device *dev)
{
        struct scsi_disk *sdkp = dev_get_drvdata(dev);

        sd_printk(KERN_NOTICE, sdkp, "Starting disk\n");

        if (!opal_unlock_from_suspend(sdkp->opal_dev))
                return 0;

        sd_printk(KERN_NOTICE, sdkp, "OPAL unlock failed\n");
        return -EIO;
}

/*
 * Common body of system and runtime resume.
 *
 * When sd_do_start_stop() applies, the disk is started and sd_resume() is
 * run; a start failure is returned and leaves the suspended flag set.  In
 * every other case the suspended flag is cleared and 0 is returned.
 */
static int sd_resume_common(struct device *dev, bool runtime)
{
        struct scsi_disk *sdkp = dev_get_drvdata(dev);
        int err;

        if (!sdkp)        /* E.g.: runtime resume at the start of sd_probe() */
                return 0;

        if (sd_do_start_stop(sdkp->device, runtime)) {
                sd_printk(KERN_NOTICE, sdkp, "Starting disk\n");
                err = sd_start_stop_device(sdkp, 1);
                if (err)
                        return err;

                /* The result of sd_resume() is deliberately not propagated. */
                sd_resume(dev);
        }

        sdkp->suspended = false;
        return 0;
}

/*
 * System resume/restore callback.
 *
 * A runtime-suspended device is left alone, except that a runtime resume is
 * requested when the SCSI device has force_runtime_start_on_system_start
 * set.  Otherwise the common resume path is run.
 */
static int sd_resume_system(struct device *dev)
{
        struct scsi_disk *sdkp;

        if (!pm_runtime_suspended(dev))
                return sd_resume_common(dev, false);

        sdkp = dev_get_drvdata(dev);
        if (sdkp && sdkp->device &&
            sdkp->device->force_runtime_start_on_system_start)
                pm_request_resume(dev);

        return 0;
}

/*
 * Runtime PM resume callback.
 *
 * For devices with ignore_media_change set, a REQUEST SENSE is issued first
 * to clear the device's sense data; a failure of that command is only
 * logged.  Then the common resume path is run in runtime mode.
 */
static int sd_resume_runtime(struct device *dev)
{
        struct scsi_disk *sdkp = dev_get_drvdata(dev);
        struct scsi_device *sdp;

        if (!sdkp)        /* E.g.: runtime resume at the start of sd_probe() */
                return 0;

        sdp = sdkp->device;

        if (sdp->ignore_media_change) {
                /* clear the device's sense data */
                static const u8 cmd[10] = { REQUEST_SENSE };
                const struct scsi_exec_args exec_args = {
                        .req_flags = BLK_MQ_REQ_PM,
                };
                int res;

                res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0,
                                       sdp->request_queue->rq_timeout, 1,
                                       &exec_args);
                if (res)
                        sd_printk(KERN_NOTICE, sdkp,
                                  "Failed to clear sense data\n");
        }

        return sd_resume_common(dev, true);
}

/*
 * Power management callbacks.  System suspend and poweroff share
 * sd_suspend_system(), system resume and restore share sd_resume_system();
 * runtime PM has its own pair of handlers.
 */
static const struct dev_pm_ops sd_pm_ops = {
        .suspend                = sd_suspend_system,
        .resume                        = sd_resume_system,
        .poweroff                = sd_suspend_system,
        .restore                = sd_resume_system,
        .runtime_suspend        = sd_suspend_runtime,
        .runtime_resume                = sd_resume_runtime,
};

/*
 * Driver description registered by init_sd(): the embedded generic driver
 * (.gendrv) carries the probe/remove/shutdown/PM entry points, the
 * remaining members are the sd-specific callbacks of struct scsi_driver.
 */
static struct scsi_driver sd_template = {
        .gendrv = {
                .name                = "sd",
                .probe                = sd_probe,
                .probe_type        = PROBE_PREFER_ASYNCHRONOUS,
                .remove                = sd_remove,
                .shutdown        = sd_shutdown,
                .pm                = &sd_pm_ops,
        },
        .rescan                        = sd_rescan,
        .resume                        = sd_resume,
        .init_command                = sd_init_command,
        .uninit_command                = sd_uninit_command,
        .done                        = sd_done,
        .eh_action                = sd_eh_action,
        .eh_reset                = sd_eh_reset,
};

/**
 *        init_sd - entry point for this driver (both when built in or when
 *        a module).
 *
 *        Registers the sd block majors (at least one must succeed), the
 *        sd_disk_class, the discard page pool and finally the driver itself.
 *        Returns 0 on success; on failure everything set up so far is torn
 *        down again and a negative errno is returned.
 *
 *        Note: this function registers this driver with the scsi mid-level.
 **/
static int __init init_sd(void)
{
        int registered = 0;
        int i, err;

        SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));

        for (i = 0; i < SD_MAJORS; i++) {
                if (!__register_blkdev(sd_major(i), "sd", sd_default_probe))
                        registered++;
        }

        if (registered == 0)
                return -ENODEV;

        err = class_register(&sd_disk_class);
        if (err)
                goto unregister_majors;

        sd_page_pool = mempool_create_page_pool(SD_MEMPOOL_SIZE, 0);
        if (!sd_page_pool) {
                printk(KERN_ERR "sd: can't init discard page pool\n");
                err = -ENOMEM;
                goto unregister_class;
        }

        err = scsi_register_driver(&sd_template.gendrv);
        if (!err)
                return 0;

        mempool_destroy(sd_page_pool);
unregister_class:
        class_unregister(&sd_disk_class);
unregister_majors:
        /* All majors are unregistered, including any that failed above. */
        for (i = 0; i < SD_MAJORS; i++)
                unregister_blkdev(sd_major(i), "sd");
        return err;
}

/**
 *        exit_sd - exit point for this driver (when it is a module).
 *
 *        Undoes init_sd() in reverse order: driver, page pool, class and
 *        finally the block majors.
 *
 *        Note: this function unregisters this driver from the scsi mid-level.
 **/
static void __exit exit_sd(void)
{
        int major_idx = 0;

        SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n"));

        scsi_unregister_driver(&sd_template.gendrv);
        mempool_destroy(sd_page_pool);

        class_unregister(&sd_disk_class);

        while (major_idx < SD_MAJORS) {
                unregister_blkdev(sd_major(major_idx), "sd");
                major_idx++;
        }
}

/* Hook init_sd()/exit_sd() up as the driver's load and unload entry points. */
module_init(init_sd);
module_exit(exit_sd);

/*
 * Print the sense header @sshdr for @sdkp's device, tagged with the disk
 * name when a gendisk is attached and with NULL otherwise.
 */
void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr)
{
        const char *name = NULL;

        if (sdkp->disk)
                name = sdkp->disk->disk_name;

        scsi_print_sense_hdr(sdkp->device, name, sshdr);
}

/*
 * Log the host byte of a SCSI @result, prefixed by @msg.  The symbolic name
 * from scsi_hostbyte_string() is printed when one exists, the raw value in
 * hex otherwise.  The driver byte is always reported as "DRIVER_OK".
 */
void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result)
{
        const char *hb_string = scsi_hostbyte_string(result);

        if (!hb_string) {
                sd_printk(KERN_INFO, sdkp,
                          "%s: Result: hostbyte=0x%02x driverbyte=%s\n",
                          msg, host_byte(result), "DRIVER_OK");
                return;
        }

        sd_printk(KERN_INFO, sdkp,
                  "%s: Result: hostbyte=%s driverbyte=%s\n", msg,
                  hb_string, "DRIVER_OK");
}

























    1 


    1 

    1 




    1 

    1 
    1 

    1 



    1 


    1 
















    1 
    1 





    1 


    1 






    1 









    1 












    1 


    1 







    1 





    1 







    1 


















    1 

    1 






    1 










    1 





    1 










































































































































    1 



















    1 
















    2 















    1 
    2 

    2 








    1 





    1 


    1 






    1 

    1 





















































    1 


    1 
    1 
    1 


    1 
    1 
    1 






























































































    1 























    1 



















    1 



    1 













    1 

    1 






    1 
    1 
    1 


    1 

























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2004, OGAWA Hirofumi
 */

#include <linux/blkdev.h>
#include <linux/sched/signal.h>
#include <linux/backing-dev-defs.h>
#include "fat.h"

/*
 * Per-FAT-variant accessors for FAT entries.  One table exists for each of
 * FAT12, FAT16 and FAT32 (fat12_ops, fat16_ops, fat32_ops).
 */
struct fatent_operations {
        /* Map an entry index to the block holding it and the byte offset in that block. */
        void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
        /* Point the fat_entry's entry pointer(s) at the given byte offset in its buffer(s). */
        void (*ent_set_ptr)(struct fat_entry *, int);
        /* Read the block(s) for an entry into the fat_entry; 0 on success, -EIO on read failure. */
        int (*ent_bread)(struct super_block *, struct fat_entry *,
                         int, sector_t);
        /* Return the current entry's value; values at or above BAD_FATxx become FAT_ENT_EOF. */
        int (*ent_get)(struct fat_entry *);
        /* Store a new value into the current entry and mark its buffer(s) dirty. */
        void (*ent_put)(struct fat_entry *, int);
        /* Step to the next entry within the held block; 1 if moved, 0 at end of block. */
        int (*ent_next)(struct fat_entry *);
};

/*
 * Taken around the two-byte read in fat12_ent_get() and the two-byte
 * read-modify-write in fat12_ent_put().
 */
static DEFINE_SPINLOCK(fat12_entry_lock);

/*
 * Locate FAT12 entry @entry: store the block that holds its first byte in
 * @blocknr and the byte position inside that block in @offset.
 */
static void fat12_ent_blocknr(struct super_block *sb, int entry,
                              int *offset, sector_t *blocknr)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        /* 12 bits per entry, i.e. one and a half bytes each */
        int byte_pos = entry + (entry >> 1);

        WARN_ON(!fat_valid_entry(sbi, entry));

        *blocknr = sbi->fat_start + (byte_pos >> sb->s_blocksize_bits);
        *offset = byte_pos & (sb->s_blocksize - 1);
}

/*
 * Locate entry @entry for the fixed-width variants (used by the FAT16 and
 * FAT32 tables): the byte position is the index shifted by
 * sbi->fatent_shift.  The containing block goes to @blocknr and the byte
 * position within it to @offset.
 */
static void fat_ent_blocknr(struct super_block *sb, int entry,
                            int *offset, sector_t *blocknr)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        int byte_pos = entry << sbi->fatent_shift;

        WARN_ON(!fat_valid_entry(sbi, entry));

        *blocknr = sbi->fat_start + (byte_pos >> sb->s_blocksize_bits);
        *offset = byte_pos & (sb->s_blocksize - 1);
}

/*
 * Aim the two FAT12 byte pointers at the entry starting @offset bytes into
 * the first buffer.  With two buffers held, the entry straddles the block
 * boundary and the second byte is the first byte of the second buffer.
 */
static void fat12_ent_set_ptr(struct fat_entry *fatent, int offset)
{
        struct buffer_head **bhs = fatent->bhs;

        if (fatent->nr_bhs != 1) {
                /* straddling entry: must start on the last byte of bhs[0] */
                WARN_ON(offset != (bhs[0]->b_size - 1));
                fatent->u.ent12_p[0] = bhs[0]->b_data + offset;
                fatent->u.ent12_p[1] = bhs[1]->b_data;
                return;
        }

        /* both bytes live in the same block */
        WARN_ON(offset >= (bhs[0]->b_size - 1));
        fatent->u.ent12_p[0] = bhs[0]->b_data + offset;
        fatent->u.ent12_p[1] = bhs[0]->b_data + (offset + 1);
}

/*
 * Point the 16-bit entry pointer @offset bytes into the first buffer.
 * Warns if @offset is not 2-byte aligned.
 */
static void fat16_ent_set_ptr(struct fat_entry *fatent, int offset)
{
        WARN_ON(offset & 1);

        fatent->u.ent16_p = (__le16 *)(fatent->bhs[0]->b_data + offset);
}

/*
 * Point the 32-bit entry pointer @offset bytes into the first buffer.
 * Warns if @offset is not 4-byte aligned.
 */
static void fat32_ent_set_ptr(struct fat_entry *fatent, int offset)
{
        WARN_ON(offset & 3);

        fatent->u.ent32_p = (__le32 *)(fatent->bhs[0]->b_data + offset);
}

/*
 * Read the block(s) backing a FAT12 entry into @fatent.
 *
 * The entry starts @offset bytes into block @blocknr.  If its second byte
 * falls outside that block, the following block is read as well.
 * Returns 0 on success or -EIO (after a rate-limited message) when a
 * read fails.
 */
static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
                           int offset, sector_t blocknr)
{
        struct buffer_head **bhs = fatent->bhs;

        WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
        fatent->fat_inode = MSDOS_SB(sb)->fat_inode;

        bhs[0] = sb_bread(sb, blocknr);
        if (!bhs[0])
                goto read_failed;

        if ((offset + 1) >= sb->s_blocksize) {
                /* Entry crosses the block boundary: fetch the next block too */
                blocknr++;
                bhs[1] = sb_bread(sb, blocknr);
                if (!bhs[1]) {
                        brelse(bhs[0]);
                        goto read_failed;
                }
                fatent->nr_bhs = 2;
        } else {
                fatent->nr_bhs = 1;
        }

        fat12_ent_set_ptr(fatent, offset);
        return 0;

read_failed:
        fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
                          (llu)blocknr);
        return -EIO;
}

/*
 * Read the single block @blocknr backing a fixed-width FAT entry into
 * @fatent and set the entry pointer to @offset within it.
 * Returns 0 on success or -EIO (after a rate-limited message) when the
 * read fails.
 */
static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
                         int offset, sector_t blocknr)
{
        const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
        struct buffer_head *bh;

        WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
        fatent->fat_inode = MSDOS_SB(sb)->fat_inode;

        bh = sb_bread(sb, blocknr);
        fatent->bhs[0] = bh;
        if (bh) {
                fatent->nr_bhs = 1;
                ops->ent_set_ptr(fatent, offset);
                return 0;
        }

        fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
                          (llu)blocknr);
        return -EIO;
}

/*
 * Return the 12-bit value of the current FAT12 entry.  Odd entries occupy
 * the high nibble of the first byte plus the whole second byte; even
 * entries occupy the whole first byte plus the low nibble of the second.
 * Values at or above BAD_FAT12 are reported as FAT_ENT_EOF.
 */
static int fat12_ent_get(struct fat_entry *fatent)
{
        u8 **ent12_p = fatent->u.ent12_p;
        int value;

        spin_lock(&fat12_entry_lock);
        if (!(fatent->entry & 1))
                value = *ent12_p[0] | (*ent12_p[1] << 8);
        else
                value = (*ent12_p[1] << 4) | (*ent12_p[0] >> 4);
        spin_unlock(&fat12_entry_lock);

        value &= 0x0fff;
        if (value < BAD_FAT12)
                return value;
        return FAT_ENT_EOF;
}

/*
 * Return the value of the current FAT16 entry; values at or above
 * BAD_FAT16 are reported as FAT_ENT_EOF.  Warns on a misaligned pointer.
 */
static int fat16_ent_get(struct fat_entry *fatent)
{
        int value;

        value = le16_to_cpu(*fatent->u.ent16_p);
        WARN_ON((unsigned long)fatent->u.ent16_p & (2 - 1));

        if (value < BAD_FAT16)
                return value;
        return FAT_ENT_EOF;
}

/*
 * Return the low 28 bits of the current FAT32 entry; values at or above
 * BAD_FAT32 are reported as FAT_ENT_EOF.  Warns on a misaligned pointer.
 */
static int fat32_ent_get(struct fat_entry *fatent)
{
        int value;

        value = le32_to_cpu(*fatent->u.ent32_p) & 0x0fffffff;
        WARN_ON((unsigned long)fatent->u.ent32_p & (4 - 1));

        if (value < BAD_FAT32)
                return value;
        return FAT_ENT_EOF;
}

/*
 * Store @new into the current FAT12 entry (FAT_ENT_EOF is written as
 * EOF_FAT12), preserving the neighbouring entry's nibble in the shared
 * byte, then mark the buffer(s) holding the entry dirty.
 */
static void fat12_ent_put(struct fat_entry *fatent, int new)
{
        u8 **ent12_p = fatent->u.ent12_p;

        if (new == FAT_ENT_EOF)
                new = EOF_FAT12;

        spin_lock(&fat12_entry_lock);
        if (!(fatent->entry & 1)) {
                /* even entry: full first byte, low nibble of second */
                *ent12_p[0] = new & 0xff;
                *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8);
        } else {
                /* odd entry: high nibble of first byte, full second */
                *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f);
                *ent12_p[1] = new >> 4;
        }
        spin_unlock(&fat12_entry_lock);

        mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
        if (fatent->nr_bhs == 2)
                mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode);
}

/*
 * Store @new into the current FAT16 entry (FAT_ENT_EOF is written as
 * EOF_FAT16) and mark the buffer dirty.
 */
static void fat16_ent_put(struct fat_entry *fatent, int new)
{
        struct buffer_head *bh = fatent->bhs[0];

        if (new == FAT_ENT_EOF)
                new = EOF_FAT16;
        *fatent->u.ent16_p = cpu_to_le16(new);

        mark_buffer_dirty_inode(bh, fatent->fat_inode);
}

static void fat32_ent_put(struct fat_entry *fatent, int new)
{
        WARN_ON(new & 0xf0000000);
        new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff;
        *fatent->u.ent32_p = cpu_to_le32(new);
        mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
}

/*
 * Advance @fatent to the next FAT12 entry.
 *
 * Returns 1 if the next entry is reachable with the buffers already held
 * (dropping the first buffer when the current entry straddled two blocks),
 * or 0 with both byte pointers cleared when the next entry would need a
 * block that is not held.
 */
static int fat12_ent_next(struct fat_entry *fatent)
{
        struct buffer_head **bhs = fatent->bhs;
        u8 **ent12_p = fatent->u.ent12_p;
        /* second byte of the following entry: +1 after even, +2 after odd */
        u8 *nextp = ent12_p[1] + 1 + (fatent->entry & 1);
        u8 *last_byte;

        fatent->entry++;

        if (fatent->nr_bhs != 1) {
                /* current entry straddles bhs[0]/bhs[1]; the next is in bhs[1] */
                WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data +
                                                        (bhs[0]->b_size - 1)));
                WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data);
                ent12_p[0] = nextp - 1;
                ent12_p[1] = nextp;
                brelse(bhs[0]);
                bhs[0] = bhs[1];
                fatent->nr_bhs = 1;
                return 1;
        }

        last_byte = (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1));
        WARN_ON(ent12_p[0] > last_byte - 1);
        WARN_ON(ent12_p[1] > last_byte);

        if (nextp >= last_byte) {
                /* next entry is not fully inside this block */
                ent12_p[0] = NULL;
                ent12_p[1] = NULL;
                return 0;
        }

        ent12_p[0] = nextp - 1;
        ent12_p[1] = nextp;
        return 1;
}

static int fat16_ent_next(struct fat_entry *fatent)
{
        const struct buffer_head *bh = fatent->bhs[0];
        fatent->entry++;
        if (fatent->u.ent16_p < (__le16 *)(bh->b_data + (bh->b_size - 2))) {
                fatent->u.ent16_p++;
                return 1;
        }
        fatent->u.ent16_p = NULL;
        return 0;
}

static int fat32_ent_next(struct fat_entry *fatent)
{
        const struct buffer_head *bh = fatent->bhs[0];
        fatent->entry++;
        if (fatent->u.ent32_p < (__le32 *)(bh->b_data + (bh->b_size - 4))) {
                fatent->u.ent32_p++;
                return 1;
        }
        fatent->u.ent32_p = NULL;
        return 0;
}

/*
 * FAT12 accessors.  FAT12 has its own block lookup and read routines
 * because an entry may span two blocks.
 */
static const struct fatent_operations fat12_ops = {
        .ent_blocknr        = fat12_ent_blocknr,
        .ent_set_ptr        = fat12_ent_set_ptr,
        .ent_bread        = fat12_ent_bread,
        .ent_get        = fat12_ent_get,
        .ent_put        = fat12_ent_put,
        .ent_next        = fat12_ent_next,
};

/* FAT16 accessors; block lookup and read are shared with FAT32. */
static const struct fatent_operations fat16_ops = {
        .ent_blocknr        = fat_ent_blocknr,
        .ent_set_ptr        = fat16_ent_set_ptr,
        .ent_bread        = fat_ent_bread,
        .ent_get        = fat16_ent_get,
        .ent_put        = fat16_ent_put,
        .ent_next        = fat16_ent_next,
};

/* FAT32 accessors; block lookup and read are shared with FAT16. */
static const struct fatent_operations fat32_ops = {
        .ent_blocknr        = fat_ent_blocknr,
        .ent_set_ptr        = fat32_ent_set_ptr,
        .ent_bread        = fat_ent_bread,
        .ent_get        = fat32_ent_get,
        .ent_put        = fat32_ent_put,
        .ent_next        = fat32_ent_next,
};

/* Acquire the per-superblock FAT mutex (sbi->fat_lock). */
static inline void lock_fat(struct msdos_sb_info *sbi)
{
        mutex_lock(&sbi->fat_lock);
}

/* Release the per-superblock FAT mutex (sbi->fat_lock). */
static inline void unlock_fat(struct msdos_sb_info *sbi)
{
        mutex_unlock(&sbi->fat_lock);
}

/*
 * Set up FAT entry access for @sb: initialise the FAT mutex and select the
 * accessor table and entry shift that match the filesystem's FAT variant.
 * An unrecognised variant is reported through fat_fs_error().
 */
void fat_ent_access_init(struct super_block *sb)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);

        mutex_init(&sbi->fat_lock);

        if (is_fat32(sbi)) {
                /* 4 bytes per entry */
                sbi->fatent_shift = 2;
                sbi->fatent_ops = &fat32_ops;
                return;
        }

        if (is_fat16(sbi)) {
                /* 2 bytes per entry */
                sbi->fatent_shift = 1;
                sbi->fatent_ops = &fat16_ops;
                return;
        }

        if (is_fat12(sbi)) {
                /* fat12_ent_blocknr() does not consult the shift */
                sbi->fatent_shift = -1;
                sbi->fatent_ops = &fat12_ops;
                return;
        }

        fat_fs_error(sb, "invalid FAT variant, %u bits", sbi->fat_bits);
}

/*
 * Mark the fsinfo inode dirty (I_DIRTY_SYNC).  Does nothing on a read-only
 * mount or on a filesystem that is not FAT32.
 */
static void mark_fsinfo_dirty(struct super_block *sb)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);

        if (!sb_rdonly(sb) && is_fat32(sbi))
                __mark_inode_dirty(sbi->fsinfo_inode, I_DIRTY_SYNC);
}

/*
 * Try to reuse the buffers @fatent already holds for the entry located at
 * (@blocknr, @offset).
 *
 * Returns 1 after re-pointing @fatent at the entry when the held buffers
 * cover it, or 0 when the caller has to read the block(s) afresh.
 */
static inline int fat_ent_update_ptr(struct super_block *sb,
                                     struct fat_entry *fatent,
                                     int offset, sector_t blocknr)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        const struct fatent_operations *ops = sbi->fatent_ops;
        struct buffer_head **bhs = fatent->bhs;

        /* Nothing held, or held block is a different one */
        if (fatent->nr_bhs == 0)
                return 0;
        if (bhs[0]->b_blocknr != blocknr)
                return 0;

        if (is_fat12(sbi)) {
                if ((offset + 1) >= sb->s_blocksize) {
                        /* Entry spills into the next block: need both held */
                        if (fatent->nr_bhs != 2 ||
                            bhs[1]->b_blocknr != (blocknr + 1))
                                return 0;
                } else if (fatent->nr_bhs == 2) {
                        /* Entry fits in bhs[0]; the second buffer is surplus */
                        brelse(bhs[1]);
                        fatent->nr_bhs = 1;
                }
        }

        ops->ent_set_ptr(fatent, offset);
        return 1;
}

/*
 * Read FAT entry @entry through @fatent and return its value.
 *
 * Buffers already held by @fatent are reused when they cover the entry;
 * otherwise they are released and the needed block(s) are read.
 * Returns the entry value from ->ent_get(), -EIO for an invalid @entry
 * (also reported via fat_fs_error()), or the error from ->ent_bread().
 */
int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
{
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        const struct fatent_operations *ops = sbi->fatent_ops;
        sector_t blocknr;
        int offset;
        int err;

        if (!fat_valid_entry(sbi, entry)) {
                fatent_brelse(fatent);
                fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry);
                return -EIO;
        }

        fatent_set_entry(fatent, entry);
        ops->ent_blocknr(sb, entry, &offset, &blocknr);

        /* Fast path: the entry is inside the buffers we already hold */
        if (fat_ent_update_ptr(sb, fatent, offset, blocknr))
                return ops->ent_get(fatent);

        fatent_brelse(fatent);
        err = ops->ent_bread(sb, fatent, offset, blocknr);
        if (err)
                return err;

        return ops->ent_get(fatent);
}

/*
 * Copy the contents of the @nr_bhs FAT buffers in @bhs into the same
 * position of every backup FAT (copies 1 .. sbi->fats - 1) and mark the
 * copies dirty.  On an SB_SYNCHRONOUS mount each copy is also written out
 * immediately.
 *
 * Returns 0 on success, -ENOMEM if a backup buffer cannot be obtained, or
 * the error from sync_dirty_buffer().
 *
 * FIXME: We can write the blocks as more big chunk.
 */
static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
                          int nr_bhs)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        int copy, n;

        for (copy = 1; copy < sbi->fats; copy++) {
                sector_t backup_fat = sbi->fat_length * copy;

                for (n = 0; n < nr_bhs; n++) {
                        struct buffer_head *c_bh;
                        int err = 0;

                        c_bh = sb_getblk(sb, backup_fat + bhs[n]->b_blocknr);
                        if (!c_bh)
                                return -ENOMEM;

                        /* Avoid race with userspace read via bdev */
                        lock_buffer(c_bh);
                        memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
                        set_buffer_uptodate(c_bh);
                        unlock_buffer(c_bh);

                        mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
                        if (sb->s_flags & SB_SYNCHRONOUS)
                                err = sync_dirty_buffer(c_bh);
                        brelse(c_bh);

                        if (err)
                                return err;
                }
        }

        return 0;
}

/*
 * Write @new into the entry @fatent currently points at, then propagate
 * the modified buffer(s) to the backup FATs.  When @wait is non-zero the
 * primary buffers are synced first.
 *
 * Returns 0 on success or the error from fat_sync_bhs()/fat_mirror_bhs().
 */
int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
                  int new, int wait)
{
        struct super_block *sb = inode->i_sb;
        const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;

        ops->ent_put(fatent, new);

        if (wait) {
                int err = fat_sync_bhs(fatent->bhs, fatent->nr_bhs);

                if (err)
                        return err;
        }

        return fat_mirror_bhs(sb, fatent->bhs, fatent->nr_bhs);
}

/*
 * Advance @fatent by one entry within the held block.  Returns 1 only if
 * the variant's ->ent_next() succeeded and the new entry index is still
 * below sbi->max_cluster; 0 otherwise.
 */
static inline int fat_ent_next(struct msdos_sb_info *sbi,
                               struct fat_entry *fatent)
{
        if (!sbi->fatent_ops->ent_next(fatent))
                return 0;

        return (fatent->entry < sbi->max_cluster) ? 1 : 0;
}

/*
 * Drop whatever buffers @fatent holds and read the block(s) containing
 * fatent->entry.  Returns the result of the variant's ->ent_bread().
 */
static inline int fat_ent_read_block(struct super_block *sb,
                                     struct fat_entry *fatent)
{
        const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
        int offset;
        sector_t blocknr;

        fatent_brelse(fatent);

        ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr);

        return ops->ent_bread(sb, fatent, offset, blocknr);
}

/*
 * Add each buffer held by @fatent to the array @bhs unless it is already
 * present.  Newly added buffers get an extra reference (get_bh()) and
 * *@nr_bhs is bumped for each one.
 */
static void fat_collect_bhs(struct buffer_head **bhs, int *nr_bhs,
                            struct fat_entry *fatent)
{
        int n;

        for (n = 0; n < fatent->nr_bhs; n++) {
                int i = 0;

                /* look for this buffer among the ones collected so far */
                while (i < *nr_bhs && fatent->bhs[n] != bhs[i])
                        i++;

                if (i != *nr_bhs)
                        continue;        /* already collected */

                get_bh(fatent->bhs[n]);
                bhs[i] = fatent->bhs[n];
                (*nr_bhs)++;
        }
}

/*
 * Allocate @nr_cluster free clusters and link them into one chain.
 *
 * The FAT is scanned starting just after sbi->prev_free, wrapping around to
 * FAT_START_ENT, until enough free entries are found or every entry has
 * been examined.  Each claimed entry is written as FAT_ENT_EOF and the
 * previously claimed entry is pointed at it; the cluster numbers are
 * returned in @cluster[0 .. nr_cluster - 1].
 *
 * Returns 0 on success, -ENOSPC when not enough free clusters exist, or an
 * error from reading, syncing or mirroring the FAT.  On failure any
 * clusters already claimed are released again via fat_free_clusters().
 */
int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster)
{
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        const struct fatent_operations *ops = sbi->fatent_ops;
        struct fat_entry fatent, prev_ent;
        struct buffer_head *bhs[MAX_BUF_PER_PAGE];
        int i, count, err, nr_bhs, idx_clus;

        BUG_ON(nr_cluster > (MAX_BUF_PER_PAGE / 2));        /* fixed limit */

        lock_fat(sbi);
        /* Fail early when the cached free count is valid and too small */
        if (sbi->free_clusters != -1 && sbi->free_clus_valid &&
            sbi->free_clusters < nr_cluster) {
                unlock_fat(sbi);
                return -ENOSPC;
        }

        err = nr_bhs = idx_clus = 0;
        /* count tracks how many entries have been examined so far */
        count = FAT_START_ENT;
        fatent_init(&prev_ent);
        fatent_init(&fatent);
        fatent_set_entry(&fatent, sbi->prev_free + 1);
        while (count < sbi->max_cluster) {
                /* Wrap around to the first data entry */
                if (fatent.entry >= sbi->max_cluster)
                        fatent.entry = FAT_START_ENT;
                fatent_set_entry(&fatent, fatent.entry);
                err = fat_ent_read_block(sb, &fatent);
                if (err)
                        goto out;

                /* Find the free entries in a block */
                do {
                        if (ops->ent_get(&fatent) == FAT_ENT_FREE) {
                                int entry = fatent.entry;

                                /* make the cluster chain */
                                ops->ent_put(&fatent, FAT_ENT_EOF);
                                if (prev_ent.nr_bhs)
                                        ops->ent_put(&prev_ent, entry);

                                /* remember the dirtied buffers for sync/mirror below */
                                fat_collect_bhs(bhs, &nr_bhs, &fatent);

                                sbi->prev_free = entry;
                                if (sbi->free_clusters != -1)
                                        sbi->free_clusters--;

                                cluster[idx_clus] = entry;
                                idx_clus++;
                                if (idx_clus == nr_cluster)
                                        goto out;

                                /*
                                 * fat_collect_bhs() gets ref-count of bhs,
                                 * so we can still use the prev_ent.
                                 */
                                prev_ent = fatent;
                        }
                        count++;
                        if (count == sbi->max_cluster)
                                break;
                } while (fat_ent_next(sbi, &fatent));
        }

        /* Couldn't allocate the free entries */
        sbi->free_clusters = 0;
        sbi->free_clus_valid = 1;
        err = -ENOSPC;

out:
        unlock_fat(sbi);
        mark_fsinfo_dirty(sb);
        fatent_brelse(&fatent);
        if (!err) {
                if (inode_needs_sync(inode))
                        err = fat_sync_bhs(bhs, nr_bhs);
                if (!err)
                        err = fat_mirror_bhs(sb, bhs, nr_bhs);
        }
        /* drop the references taken by fat_collect_bhs() */
        for (i = 0; i < nr_bhs; i++)
                brelse(bhs[i]);

        /* Undo a partial allocation: free the chain starting at its head */
        if (err && idx_clus)
                fat_free_clusters(inode, cluster[0]);

        return err;
}

/*
 * Free the cluster chain that starts at @cluster.
 *
 * Walks the chain under the FAT lock, writing FAT_ENT_FREE into every
 * entry until the end-of-chain marker is reached.  With the discard mount
 * option set, runs of contiguous clusters are discarded in one request.
 * Modified FAT buffers are collected and, in batches of at most
 * MAX_BUF_PER_PAGE, synced (on SB_SYNCHRONOUS mounts) and mirrored to the
 * backup FATs.
 *
 * Returns 0 on success, -EIO if a free entry is found inside the chain, or
 * an error from reading, syncing or mirroring the FAT.
 */
int fat_free_clusters(struct inode *inode, int cluster)
{
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        const struct fatent_operations *ops = sbi->fatent_ops;
        struct fat_entry fatent;
        struct buffer_head *bhs[MAX_BUF_PER_PAGE];
        int i, err, nr_bhs;
        int first_cl = cluster, dirty_fsinfo = 0;

        nr_bhs = 0;
        fatent_init(&fatent);
        lock_fat(sbi);
        do {
                /* cluster becomes the next link; fatent.entry is the one being freed */
                cluster = fat_ent_read(inode, &fatent, cluster);
                if (cluster < 0) {
                        err = cluster;
                        goto error;
                } else if (cluster == FAT_ENT_FREE) {
                        fat_fs_error(sb, "%s: deleting FAT entry beyond EOF",
                                     __func__);
                        err = -EIO;
                        goto error;
                }

                if (sbi->options.discard) {
                        /*
                         * Issue discard for the sectors we no longer
                         * care about, batching contiguous clusters
                         * into one request
                         */
                        if (cluster != fatent.entry + 1) {
                                int nr_clus = fatent.entry - first_cl + 1;

                                sb_issue_discard(sb,
                                        fat_clus_to_blknr(sbi, first_cl),
                                        nr_clus * sbi->sec_per_clus,
                                        GFP_NOFS, 0);

                                first_cl = cluster;
                        }
                }

                ops->ent_put(&fatent, FAT_ENT_FREE);
                if (sbi->free_clusters != -1) {
                        sbi->free_clusters++;
                        dirty_fsinfo = 1;
                }

                /* Flush the collected buffers before bhs[] would overflow */
                if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) {
                        if (sb->s_flags & SB_SYNCHRONOUS) {
                                err = fat_sync_bhs(bhs, nr_bhs);
                                if (err)
                                        goto error;
                        }
                        err = fat_mirror_bhs(sb, bhs, nr_bhs);
                        if (err)
                                goto error;
                        for (i = 0; i < nr_bhs; i++)
                                brelse(bhs[i]);
                        nr_bhs = 0;
                }
                fat_collect_bhs(bhs, &nr_bhs, &fatent);
        } while (cluster != FAT_ENT_EOF);

        if (sb->s_flags & SB_SYNCHRONOUS) {
                err = fat_sync_bhs(bhs, nr_bhs);
                if (err)
                        goto error;
        }
        err = fat_mirror_bhs(sb, bhs, nr_bhs);
error:
        /* Common exit: reached on success as well as on failure */
        fatent_brelse(&fatent);
        for (i = 0; i < nr_bhs; i++)
                brelse(bhs[i]);
        unlock_fat(sbi);
        if (dirty_fsinfo)
                mark_fsinfo_dirty(sb);

        return err;
}
EXPORT_SYMBOL_GPL(fat_free_clusters);

/*
 * Readahead state for a sequential scan over a range of FAT blocks.  All
 * positions are block counts relative to the first block of the range (see
 * fat_ra_init() and fat_ent_reada()).
 */
struct fatent_ra {
        sector_t cur;                /* incremented on each fat_ent_reada() call that passes its initial guard */
        sector_t limit;                /* total number of blocks in the range */

        unsigned int ra_blocks;        /* step by which the window advances (half its size) */
        sector_t ra_advance;        /* value of cur at which the window next advances */
        sector_t ra_next;        /* next block to submit for readahead */
        sector_t ra_limit;        /* end of the current readahead window */
};

/*
 * Prepare @ra for a sequential scan of the FAT blocks covering entries
 * fatent->entry .. @ent_limit - 1.  Leaves @ra untouched when that range
 * is empty.
 */
static void fat_ra_init(struct super_block *sb, struct fatent_ra *ra,
                        struct fat_entry *fatent, int ent_limit)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        const struct fatent_operations *ops = sbi->fatent_ops;
        sector_t first_blk, last_blk;
        int offset;
        /*
         * This is the sequential read, so ra_pages * 2 (but try to
         * align the optimal hardware IO size).
         * [BTW, 128kb covers the whole sectors for FAT12 and FAT16]
         */
        unsigned long ra_pages = sb->s_bdi->ra_pages;
        unsigned int reada_blocks;

        if (fatent->entry >= ent_limit)
                return;

        if (ra_pages > sb->s_bdi->io_pages)
                ra_pages = rounddown(ra_pages, sb->s_bdi->io_pages);
        reada_blocks = ra_pages << (PAGE_SHIFT - sb->s_blocksize_bits + 1);

        /* Work out how many blocks the scan will touch */
        ops->ent_blocknr(sb, fatent->entry, &offset, &first_blk);
        ops->ent_blocknr(sb, ent_limit - 1, &offset, &last_blk);
        ra->cur = 0;
        ra->limit = (last_blk + 1) - first_blk;

        /* The window moves forward in steps of half its size */
        ra->ra_blocks = reada_blocks >> 1;
        ra->ra_advance = 0;
        ra->ra_next = 0;
        ra->ra_limit = min_t(sector_t, reada_blocks, ra->limit);
}

/*
 * Issue readahead for upcoming FAT blocks.
 *
 * Assuming to be called before reading a new block (increments ->cur).
 * Once every block up to ->ra_limit has been submitted this is a no-op and
 * ->cur is no longer advanced.
 */
static void fat_ent_reada(struct super_block *sb, struct fatent_ra *ra,
                          struct fat_entry *fatent)
{
        if (ra->ra_next >= ra->ra_limit)
                return;

        if (ra->cur >= ra->ra_advance) {
                struct msdos_sb_info *sbi = MSDOS_SB(sb);
                const struct fatent_operations *ops = sbi->fatent_ops;
                struct blk_plug plug;
                sector_t blocknr, base;
                int offset;

                ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr);

                /* translate relative window positions to block numbers */
                base = blocknr - ra->cur;

                blk_start_plug(&plug);
                /*
                 * FIXME: we would want to directly use the bio with
                 * pages to reduce the number of segments.
                 */
                while (ra->ra_next < ra->ra_limit) {
                        sb_breadahead(sb, ra->ra_next + base);
                        ra->ra_next++;
                }
                blk_finish_plug(&plug);

                /* Advance the readahead window */
                ra->ra_advance += ra->ra_blocks;
                ra->ra_limit += min_t(sector_t,
                                      ra->ra_blocks, ra->limit - ra->ra_limit);
        }

        ra->cur++;
}

/*
 * Make sure sbi->free_clusters holds a valid count of free clusters.
 *
 * If the cached count is already valid nothing is done.  Otherwise the
 * whole FAT is scanned (with readahead) under the FAT lock, the result is
 * stored in sbi->free_clusters and the fsinfo inode is marked dirty.
 *
 * Returns 0 on success or the error from reading a FAT block.
 */
int fat_count_free_clusters(struct super_block *sb)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        const struct fatent_operations *ops = sbi->fatent_ops;
        struct fatent_ra fatent_ra;
        struct fat_entry fatent;
        int nr_free;
        int err = 0;

        lock_fat(sbi);
        if (sbi->free_clusters != -1 && sbi->free_clus_valid)
                goto unlock;

        nr_free = 0;
        fatent_init(&fatent);
        fatent_set_entry(&fatent, FAT_START_ENT);
        fat_ra_init(sb, &fatent_ra, &fatent, sbi->max_cluster);

        while (fatent.entry < sbi->max_cluster) {
                /* readahead of fat blocks */
                fat_ent_reada(sb, &fatent_ra, &fatent);

                err = fat_ent_read_block(sb, &fatent);
                if (err)
                        goto unlock;

                /* tally the free entries in this block */
                do {
                        if (ops->ent_get(&fatent) == FAT_ENT_FREE)
                                nr_free++;
                } while (fat_ent_next(sbi, &fatent));

                cond_resched();
        }

        sbi->free_clusters = nr_free;
        sbi->free_clus_valid = 1;
        mark_fsinfo_dirty(sb);
        fatent_brelse(&fatent);

unlock:
        unlock_fat(sbi);
        return err;
}

/*
 * Discard @nr_clus clusters starting at cluster @clus.
 * Returns the result of sb_issue_discard().
 */
static int fat_trim_clusters(struct super_block *sb, u32 clus, u32 nr_clus)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        sector_t start = fat_clus_to_blknr(sbi, clus);
        sector_t nr_sects = nr_clus * sbi->sec_per_clus;

        return sb_issue_discard(sb, start, nr_sects, GFP_NOFS, 0);
}

/*
 * Discard runs of free clusters inside the byte range described by @range.
 *
 * @inode: any inode of the filesystem; only its superblock is used.
 * @range: in: start/len/minlen in bytes; out: len is overwritten with the
 *         number of bytes actually discarded (also on the error paths).
 *
 * The FAT is walked entry by entry under the FAT lock.  Consecutive
 * FAT_ENT_FREE entries are counted, and each run of at least minlen clusters
 * is passed to fat_trim_clusters().  -EOPNOTSUPP from the discard is
 * ignored: the run is simply not counted as trimmed.
 *
 * Returns 0 on success, -EINVAL when the range starts beyond the last
 * cluster or is shorter than one cluster, -ERESTARTSYS when a fatal signal
 * is pending, or an error from reading the FAT / issuing the discard.
 */
int fat_trim_fs(struct inode *inode, struct fstrim_range *range)
{
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        const struct fatent_operations *ops = sbi->fatent_ops;
        struct fat_entry fatent;
        struct fatent_ra fatent_ra;
        u64 ent_start, ent_end, minlen, trimmed = 0;
        u32 free = 0;   /* length of the free run currently being tracked */
        int err = 0;

        /*
         * FAT data is organized as clusters, trim at the granularity of a
         * cluster.
         *
         * fstrim_range is in bytes, convert the values to cluster indexes.
         * Treat sectors before the data region as all used, so they are
         * never trimmed.
         */
        ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT);
        ent_end = ent_start + (range->len >> sbi->cluster_bits) - 1;
        minlen = range->minlen >> sbi->cluster_bits;

        if (ent_start >= sbi->max_cluster || range->len < sbi->cluster_size)
                return -EINVAL;
        /* Clamp the end of the range to the last valid cluster. */
        if (ent_end >= sbi->max_cluster)
                ent_end = sbi->max_cluster - 1;

        fatent_init(&fatent);
        lock_fat(sbi);
        fatent_set_entry(&fatent, ent_start);
        fat_ra_init(sb, &fatent_ra, &fatent, ent_end + 1);
        while (fatent.entry <= ent_end) {
                /* readahead of fat blocks */
                fat_ent_reada(sb, &fatent_ra, &fatent);

                err = fat_ent_read_block(sb, &fatent);
                if (err)
                        goto error;
                do {
                        if (ops->ent_get(&fatent) == FAT_ENT_FREE) {
                                free++;
                        } else if (free) {
                                /* A used entry ends the current free run. */
                                if (free >= minlen) {
                                        u32 clus = fatent.entry - free;

                                        err = fat_trim_clusters(sb, clus, free);
                                        if (err && err != -EOPNOTSUPP)
                                                goto error;
                                        if (!err)
                                                trimmed += free;
                                        /* -EOPNOTSUPP is not reported. */
                                        err = 0;
                                }
                                free = 0;
                        }
                } while (fat_ent_next(sbi, &fatent) && fatent.entry <= ent_end);

                if (fatal_signal_pending(current)) {
                        err = -ERESTARTSYS;
                        goto error;
                }

                /*
                 * Drop the buffers and the FAT lock around the reschedule,
                 * then retake the lock before continuing the walk.
                 */
                if (need_resched()) {
                        fatent_brelse(&fatent);
                        unlock_fat(sbi);
                        cond_resched();
                        lock_fat(sbi);
                }
        }
        /* handle scenario when tail entries are all free */
        if (free && free >= minlen) {
                u32 clus = fatent.entry - free;

                err = fat_trim_clusters(sb, clus, free);
                if (err && err != -EOPNOTSUPP)
                        goto error;
                if (!err)
                        trimmed += free;
                err = 0;
        }

error:
        fatent_brelse(&fatent);
        unlock_fat(sbi);

        /* Report the discarded amount in bytes, whatever the outcome. */
        range->len = trimmed << sbi->cluster_bits;

        return err;
}
































































































































































































































































































































    1 





    1 












    1 


    1 






























    1 






    1 









    1 

    1 

    1 





    1 






    1 




    1 





































































    1 







    1 





















    1 







    1 


    1 


    1 






















    1 









    1 

    1 

    1 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/fat/file.c
 *
 *  Written 1992,1993 by Werner Almesberger
 *
 *  regular file handling primitives for fat-based filesystems
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/falloc.h>
#include "fat.h"

/* Forward declaration: referenced by fat_file_operations before its definition. */
static long fat_fallocate(struct file *file, int mode,
                          loff_t offset, loff_t len);

/*
 * FAT_IOCTL_GET_ATTRIBUTES: copy the inode's attribute word, as built by
 * fat_make_attrs(), to the user pointer @user_attr.
 *
 * The attributes are sampled with the inode lock held shared; the copy to
 * userspace happens after the lock is dropped.  Returns the put_user()
 * result.
 */
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
        u32 dos_attrs;

        inode_lock_shared(inode);
        dos_attrs = fat_make_attrs(inode);
        inode_unlock_shared(inode);

        return put_user(dos_attrs, user_attr);
}

/*
 * FAT_IOCTL_SET_ATTRIBUTES: apply the attribute word read from @user_attr
 * to the inode behind @file.
 *
 * The request is handled like a chmod(): an iattr carrying the mode derived
 * from the new attributes is passed through the security hook and
 * fat_setattr() before the attributes themselves are saved.
 *
 * Returns 0 on success, the get_user()/mnt_want_write_file() error,
 * -EINVAL when the root directory would get anything but ATTR_DIR, -EPERM
 * when ATTR_SYS is involved under the sys_immutable option without
 * CAP_LINUX_IMMUTABLE, or an error from the security hook / fat_setattr().
 */
static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
{
        struct inode *inode = file_inode(file);
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        int is_dir = S_ISDIR(inode->i_mode);
        u32 new_attr, prev_attr;
        struct iattr ia;
        int ret;

        ret = get_user(new_attr, user_attr);
        if (ret)
                return ret;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        inode_lock(inode);

        /*
         * The caller may not change ATTR_VOLUME or ATTR_DIR (which also
         * keeps the entry from being turned into a VFAT longname entry),
         * and none of the NTFS attributes in the high 24 bits can be set.
         * Strip those bits, then carry over the current ATTR_VOLUME and
         * ATTR_DIR state.
         */
        new_attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
        new_attr |= MSDOS_I(inode)->i_attrs & ATTR_VOLUME;
        if (is_dir)
                new_attr |= ATTR_DIR;
        prev_attr = fat_make_attrs(inode);

        /* Describe the change as a mode + ctime update. */
        ia.ia_valid = ATTR_MODE | ATTR_CTIME;
        ia.ia_ctime = current_time(inode);
        ia.ia_mode = fat_make_mode(sbi, new_attr,
                                   is_dir ? S_IRWXUGO :
                                   (S_IRUGO | S_IWUGO |
                                    (inode->i_mode & S_IXUGO)));

        /* The root directory carries no attributes of its own. */
        if (inode->i_ino == MSDOS_ROOT_INO && new_attr != ATTR_DIR) {
                ret = -EINVAL;
                goto unlock;
        }

        /* Touching ATTR_SYS needs a capability when it means immutable. */
        if (sbi->options.sys_immutable &&
            ((new_attr | prev_attr) & ATTR_SYS) &&
            !capable(CAP_LINUX_IMMUTABLE)) {
                ret = -EPERM;
                goto unlock;
        }

        /*
         * Questionable security check: only the RO attribute is exposed to
         * the security module, simply because it maps onto a file mode.
         */
        ret = security_inode_setattr(file_mnt_idmap(file),
                                     file->f_path.dentry, &ia);
        if (ret)
                goto unlock;

        /* Must happen before anything irreversible is done. */
        ret = fat_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia);
        if (ret)
                goto unlock;

        fsnotify_change(file->f_path.dentry, ia.ia_valid);
        if (sbi->options.sys_immutable) {
                if (new_attr & ATTR_SYS)
                        inode->i_flags |= S_IMMUTABLE;
                else
                        inode->i_flags &= ~S_IMMUTABLE;
        }

        fat_save_attrs(inode, new_attr);
        mark_inode_dirty(inode);
unlock:
        inode_unlock(inode);
        mnt_drop_write_file(file);
        return ret;
}

/*
 * FAT_IOCTL_GET_VOLUME_ID: copy the volume id kept in the FAT superblock
 * info to the user pointer @user_attr.  Returns the put_user() result.
 */
static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
{
        return put_user(MSDOS_SB(inode->i_sb)->vol_id, user_attr);
}

static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg)
{
        struct super_block *sb = inode->i_sb;
        struct fstrim_range __user *user_range;
        struct fstrim_range range;
        int err;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (!bdev_max_discard_sectors(sb->s_bdev))
                return -EOPNOTSUPP;

        user_range = (struct fstrim_range __user *)arg;
        if (copy_from_user(&range, user_range, sizeof(range)))
                return -EFAULT;

        range.minlen = max_t(unsigned int, range.minlen,
                             bdev_discard_granularity(sb->s_bdev));

        err = fat_trim_fs(inode, &range);
        if (err < 0)
                return err;

        if (copy_to_user(user_range, &range, sizeof(range)))
                return -EFAULT;

        return 0;
}

/*
 * ioctl entry point shared by FAT files: dispatch @cmd to the matching
 * handler.  @arg is interpreted as a user pointer to a u32 for the
 * attribute/volume-id commands and passed through unchanged for FITRIM.
 *
 * Returns the handler's result, or -ENOTTY for an unknown command.
 */
long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        struct inode *inode = file_inode(filp);
        u32 __user *user_attr = (u32 __user *)arg;

        if (cmd == FAT_IOCTL_GET_ATTRIBUTES)
                return fat_ioctl_get_attributes(inode, user_attr);
        if (cmd == FAT_IOCTL_SET_ATTRIBUTES)
                return fat_ioctl_set_attributes(filp, user_attr);
        if (cmd == FAT_IOCTL_GET_VOLUME_ID)
                return fat_ioctl_get_volume_id(inode, user_attr);
        if (cmd == FITRIM)
                return fat_ioctl_fitrim(inode, arg);

        /* Inappropriate ioctl for device */
        return -ENOTTY;
}

/*
 * ->release() for FAT files.
 *
 * When the file was open for writing and the "flush" mount option is set,
 * flush the inode and then sleep uninterruptibly for up to HZ/10 ticks.
 * Always returns 0.
 */
static int fat_file_release(struct inode *inode, struct file *filp)
{
        if (!(filp->f_mode & FMODE_WRITE))
                return 0;
        if (!MSDOS_SB(inode->i_sb)->options.flush)
                return 0;

        fat_flush_inodes(inode->i_sb, inode, NULL);
        set_current_state(TASK_UNINTERRUPTIBLE);
        io_schedule_timeout(HZ/10);

        return 0;
}

/*
 * ->fsync() for FAT files.
 *
 * Three steps, each attempted only if the previous one succeeded:
 * the generic file fsync, syncing the buffers of the FAT inode's mapping,
 * and a flush of the backing block device.  Returns the first error, or
 * the result of the final flush.
 */
int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
        struct inode *inode = filp->f_mapping->host;
        int err;

        err = __generic_file_fsync(filp, start, end, datasync);
        if (!err)
                err = sync_mapping_buffers(
                        MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
        if (!err)
                err = blkdev_issue_flush(inode->i_sb->s_bdev);

        return err;
}


/* File operations installed for regular files on FAT filesystems. */
const struct file_operations fat_file_operations = {
        /* positioning and data transfer */
        .llseek         = generic_file_llseek,
        .read_iter      = generic_file_read_iter,
        .write_iter     = generic_file_write_iter,
        .splice_read    = filemap_splice_read,
        .splice_write   = iter_file_splice_write,
        .mmap           = generic_file_mmap,

        /* ioctl, preallocation, syncing and close */
        .unlocked_ioctl = fat_generic_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
        .fallocate      = fat_fallocate,
        .fsync          = fat_file_fsync,
        .release        = fat_file_release,
};

/*
 * Grow @inode to @size bytes via generic_cont_expand_simple(), then update
 * ctime/mtime and mark the inode dirty.
 *
 * For IS_SYNC() inodes the newly covered byte range, the mapping's buffers
 * and the inode itself are additionally written out before returning.
 * Returns 0 on success or the first error encountered.
 */
static int fat_cont_expand(struct inode *inode, loff_t size)
{
        struct address_space *mapping = inode->i_mapping;
        /* Both values are taken from i_size before the expansion call. */
        loff_t start = inode->i_size;
        loff_t count = size - inode->i_size;
        loff_t last;
        int err, step_err;

        err = generic_cont_expand_simple(inode, size);
        if (err)
                return err;

        fat_truncate_time(inode, NULL, S_CTIME|S_MTIME);
        mark_inode_dirty(inode);
        if (!IS_SYNC(inode))
                return 0;

        /*
         * Open-coded sync: there is no open file here to drive the
         * standard fsync path.  Every write-out step runs even if an
         * earlier one failed; only the first error is kept, and the final
         * wait is skipped once an error has been recorded.
         */
        last = start + count - 1;
        err = filemap_fdatawrite_range(mapping, start, last);

        step_err = sync_mapping_buffers(mapping);
        if (!err)
                err = step_err;

        step_err = write_inode_now(inode, 1);
        if (!err)
                err = step_err;

        if (!err)
                err = filemap_fdatawait_range(mapping, start, last);

        return err;
}

/*
 * fallocate() for FAT regular files: make sure space exists for the byte
 * range [offset, offset + len).
 *
 * Only mode 0 and FALLOC_FL_KEEP_SIZE are accepted; any other flag, or a
 * non-regular file, yields -EOPNOTSUPP.
 *
 *  - With FALLOC_FL_KEEP_SIZE, clusters are appended one at a time (without
 *    being zeroed) until the on-disk size covers the requested end.
 *  - Without it, the file is grown with an expanding truncate, which also
 *    zeroes the new area.
 *
 * Nothing is done when the requested end is already covered.  Returns 0 on
 * success or the error from fat_add_cluster() / fat_cont_expand().
 */
static long fat_fallocate(struct file *file, int mode,
                          loff_t offset, loff_t len)
{
        struct inode *inode = file->f_mapping->host;
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        loff_t ondisksize;      /* block aligned on-disk size in bytes */
        loff_t mm_bytes;        /* bytes still missing on disk */
        loff_t end;             /* first byte past the requested range */
        int nr_cluster;         /* clusters left to allocate */
        int err = 0;

        /* No support for hole punch or other fallocate flags. */
        if (mode & ~FALLOC_FL_KEEP_SIZE)
                return -EOPNOTSUPP;

        /* Regular files only; directories are not supported. */
        if (!S_ISREG(inode->i_mode))
                return -EOPNOTSUPP;

        inode_lock(inode);
        end = offset + len;

        if (!(mode & FALLOC_FL_KEEP_SIZE)) {
                /* This is just an expanding truncate. */
                if (end > i_size_read(inode))
                        err = fat_cont_expand(inode, end);
                goto unlock;
        }

        ondisksize = inode->i_blocks << 9;
        if (end <= ondisksize)
                goto unlock;

        /* Round the shortfall up to whole clusters. */
        mm_bytes = end - ondisksize;
        nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
                sbi->cluster_bits;

        /* Allocate without zeroing the clusters out. */
        for (; nr_cluster > 0; nr_cluster--) {
                err = fat_add_cluster(inode);
                if (err)
                        break;
        }

unlock:
        inode_unlock(inode);
        return err;
}

/*
 * Free all clusters after the skip'th cluster.
 *
 * @inode: file whose cluster chain is shortened.
 * @skip:  number of leading clusters to keep; 0 releases the whole chain.
 *
 * The inode is updated first (synchronously for IS_DIRSYNC() inodes), then
 * the chain is terminated after the last kept cluster and the remainder is
 * handed to fat_free_clusters().
 *
 * Returns 0 when there is nothing to free, a negative error from the
 * helpers (-EIO for a chain that runs into a free entry), or the result of
 * fat_free_clusters().
 */
static int fat_free(struct inode *inode, int skip)
{
        struct super_block *sb = inode->i_sb;
        int err, wait, free_start, i_start, i_logstart;

        /* No start cluster recorded: nothing is allocated to this inode. */
        if (MSDOS_I(inode)->i_start == 0)
                return 0;

        fat_cache_inval_inode(inode);

        wait = IS_DIRSYNC(inode);
        /* Remember the old values so they can be restored on failure. */
        i_start = free_start = MSDOS_I(inode)->i_start;
        i_logstart = MSDOS_I(inode)->i_logstart;

        /* First, we write the new file size. */
        if (!skip) {
                /* The whole chain goes away, so drop the start cluster. */
                MSDOS_I(inode)->i_start = 0;
                MSDOS_I(inode)->i_logstart = 0;
        }
        MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
        fat_truncate_time(inode, NULL, S_CTIME|S_MTIME);
        if (wait) {
                err = fat_sync_inode(inode);
                if (err) {
                        /* Sync failed: put the start cluster back. */
                        MSDOS_I(inode)->i_start = i_start;
                        MSDOS_I(inode)->i_logstart = i_logstart;
                        return err;
                }
        } else
                mark_inode_dirty(inode);

        /* Write a new EOF, and get the remaining cluster chain for freeing. */
        if (skip) {
                struct fat_entry fatent;
                int ret, fclus, dclus;

                /* Look up the last cluster that is to be kept. */
                ret = fat_get_cluster(inode, skip - 1, &fclus, &dclus);
                if (ret < 0)
                        return ret;
                else if (ret == FAT_ENT_EOF)
                        return 0;

                fatent_init(&fatent);
                /* ret becomes the value stored in that cluster's entry. */
                ret = fat_ent_read(inode, &fatent, dclus);
                if (ret == FAT_ENT_EOF) {
                        /* Already the end of the chain: nothing to free. */
                        fatent_brelse(&fatent);
                        return 0;
                } else if (ret == FAT_ENT_FREE) {
                        fat_fs_error(sb,
                                     "%s: invalid cluster chain (i_pos %lld)",
                                     __func__, MSDOS_I(inode)->i_pos);
                        ret = -EIO;
                } else if (ret > 0) {
                        /* Terminate the chain at the last kept cluster. */
                        err = fat_ent_write(inode, &fatent, FAT_ENT_EOF, wait);
                        if (err)
                                ret = err;
                }
                fatent_brelse(&fatent);
                if (ret < 0)
                        return ret;

                /* Freeing starts at the entry value read above. */
                free_start = ret;
        }
        /* i_blocks is kept in 512-byte units, hence cluster_bits - 9. */
        inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9);

        /* Free the remaining cluster chain */
        return fat_free_clusters(inode, free_start);
}

/*
 * Release the clusters of @inode that lie beyond byte @offset.
 *
 * mmu_private is lowered to @offset when it is larger, @offset is rounded
 * up to a whole number of clusters, everything past that many clusters is
 * freed and the inode is flushed.  The result of fat_free() is not checked.
 */
void fat_truncate_blocks(struct inode *inode, loff_t offset)
{
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        const unsigned int clu_bytes = sbi->cluster_size;
        int keep;       /* clusters that stay allocated */

        /*
         * Guards against a file being truncated to a larger size and then
         * written to inside the resulting hole.
         */
        if (offset < MSDOS_I(inode)->mmu_private)
                MSDOS_I(inode)->mmu_private = offset;

        keep = (offset + (clu_bytes - 1)) >> sbi->cluster_bits;

        fat_free(inode, keep);
        fat_flush_inodes(inode->i_sb, inode, NULL);
}

/*
 * ->getattr() for FAT inodes.
 *
 * Starts from generic_fillattr() and then applies the FAT specifics:
 * the block size is reported as the cluster size, the inode number is
 * replaced by the i_pos based value under the nostale_ro NFS option, and
 * on VFAT the creation time is returned when STATX_BTIME was requested.
 * Always returns 0.
 */
int fat_getattr(struct mnt_idmap *idmap, const struct path *path,
                struct kstat *stat, u32 request_mask, unsigned int flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);

        generic_fillattr(idmap, request_mask, inode, stat);
        stat->blksize = sbi->cluster_size;

        /* Use i_pos for ino. This is used as fileid of nfs. */
        if (sbi->options.nfs == FAT_NFS_NOSTALE_RO)
                stat->ino = fat_i_pos_read(sbi, inode);

        if ((request_mask & STATX_BTIME) && sbi->options.isvfat) {
                stat->btime = MSDOS_I(inode)->i_crtime;
                stat->result_mask |= STATX_BTIME;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(fat_getattr);

/*
 * Check whether the mode in *@mode_ptr can be applied to @inode and reduce
 * it to the bits FAT can represent.
 *
 * The applicable mount mask is fs_fmask for regular files and fs_dmask
 * otherwise.  After masking:
 *  - the r and x bits must match the inode's current r and x bits;
 *  - the w bits must be either all of the unmasked w bits or, only when
 *    fat_mode_can_hold_ro() allows it, none at all.
 *
 * On success *@mode_ptr is restricted to the file type plus the accepted
 * permission bits and 0 is returned; otherwise -EPERM is returned and
 * *@mode_ptr is left alone.  The caller has already rejected bits outside
 * FAT_VALID_MODE.
 */
static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
                             struct inode *inode, umode_t *mode_ptr)
{
        umode_t mask, perm;
        int ro_allowed;
        int w_req, w_all;

        mask = S_ISREG(inode->i_mode) ? sbi->options.fs_fmask
                                      : sbi->options.fs_dmask;
        perm = *mode_ptr & ~(S_IFMT | mask);

        /* r and x cannot be changed at all. */
        if ((inode->i_mode & (S_IRUGO|S_IXUGO)) != (perm & (S_IRUGO | S_IXUGO)))
                return -EPERM;

        ro_allowed = fat_mode_can_hold_ro(inode);
        w_req = perm & S_IWUGO;
        w_all = S_IWUGO & ~mask;

        /*
         * Anything other than "all w bits" is only acceptable as "no w
         * bits", and only if the inode can hold a read-only mode.
         */
        if (w_req != w_all && (w_req || !ro_allowed))
                return -EPERM;

        *mode_ptr &= S_IFMT | perm;

        return 0;
}

/*
 * Decide whether the allow_utime mount option lets the current task set
 * timestamps on @inode.
 *
 * Returns 1 when the task's fsuid differs from the inode's owner and the
 * relevant allow_utime bits (shifted down by 3 when the task is in the
 * inode's group) include MAY_WRITE.  Returns 0 otherwise, which leaves the
 * decision to the default check.
 */
static int fat_allow_set_time(struct mnt_idmap *idmap,
                              struct msdos_sb_info *sbi, struct inode *inode)
{
        umode_t allow_utime = sbi->options.allow_utime;

        /* The owner is always handled by the default check. */
        if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid()))
                return 0;

        if (vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
                allow_utime >>= 3;

        return (allow_utime & MAY_WRITE) ? 1 : 0;
}

/* iattr flags that request explicit timestamps; see fat_setattr() */
#define TIMES_SET_FLAGS        (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
/* valid file mode bits */
#define FAT_VALID_MODE        (S_IFREG | S_IFDIR | S_IRWXUGO)

/*
 * ->setattr() for FAT inodes.
 *
 * @idmap:  idmapping of the mount the request came through.
 * @dentry: dentry of the inode being changed.
 * @attr:   requested changes; ia_valid and ia_mode may be modified here.
 *
 * Returns 0 on success or a negative error.  With the "quiet" mount option,
 * failures of setattr_prepare() and of the uid/gid/mode validity check are
 * reported as success without applying anything.
 */
int fat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                struct iattr *attr)
{
        struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
        struct inode *inode = d_inode(dentry);
        unsigned int ia_valid;
        int error;

        /* Check for setting the inode time. */
        ia_valid = attr->ia_valid;
        if (ia_valid & TIMES_SET_FLAGS) {
                /*
                 * If allow_utime permits it, hide the time-set flags from
                 * setattr_prepare() below.
                 */
                if (fat_allow_set_time(idmap, sbi, inode))
                        attr->ia_valid &= ~TIMES_SET_FLAGS;
        }

        error = setattr_prepare(idmap, dentry, attr);
        /* Restore the flags that may have been masked above. */
        attr->ia_valid = ia_valid;
        if (error) {
                if (sbi->options.quiet)
                        error = 0;
                goto out;
        }

        /*
         * Expand the file. Since inode_setattr() updates ->i_size
         * before calling the ->truncate(), but FAT needs to fill the
         * hole before it. XXX: this is no longer true with new truncate
         * sequence.
         */
        if (attr->ia_valid & ATTR_SIZE) {
                inode_dio_wait(inode);

                if (attr->ia_size > inode->i_size) {
                        error = fat_cont_expand(inode, attr->ia_size);
                        /* Done if it failed or size was the only change. */
                        if (error || attr->ia_valid == ATTR_SIZE)
                                goto out;
                        /* Growth is handled; skip the truncate path below. */
                        attr->ia_valid &= ~ATTR_SIZE;
                }
        }

        /*
         * Refuse a uid or gid other than the mount's fs_uid/fs_gid, and
         * mode bits outside FAT_VALID_MODE.
         */
        if (((attr->ia_valid & ATTR_UID) &&
             (!uid_eq(from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid),
                      sbi->options.fs_uid))) ||
            ((attr->ia_valid & ATTR_GID) &&
             (!gid_eq(from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid),
                      sbi->options.fs_gid))) ||
            ((attr->ia_valid & ATTR_MODE) &&
             (attr->ia_mode & ~FAT_VALID_MODE)))
                error = -EPERM;

        if (error) {
                if (sbi->options.quiet)
                        error = 0;
                goto out;
        }

        /*
         * We don't return -EPERM here. Yes, strange, but this is too
         * old behavior.
         */
        if (attr->ia_valid & ATTR_MODE) {
                /* A mode FAT cannot hold is dropped from the request. */
                if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0)
                        attr->ia_valid &= ~ATTR_MODE;
        }

        /* Only reached for a size that is not larger than i_size. */
        if (attr->ia_valid & ATTR_SIZE) {
                error = fat_block_truncate_page(inode, attr->ia_size);
                if (error)
                        goto out;
                down_write(&MSDOS_I(inode)->truncate_lock);
                truncate_setsize(inode, attr->ia_size);
                fat_truncate_blocks(inode, attr->ia_size);
                up_write(&MSDOS_I(inode)->truncate_lock);
        }

        /*
         * setattr_copy can't truncate these appropriately, so we'll
         * copy them ourselves
         */
        if (attr->ia_valid & ATTR_ATIME)
                fat_truncate_time(inode, &attr->ia_atime, S_ATIME);
        if (attr->ia_valid & ATTR_CTIME)
                fat_truncate_time(inode, &attr->ia_ctime, S_CTIME);
        if (attr->ia_valid & ATTR_MTIME)
                fat_truncate_time(inode, &attr->ia_mtime, S_MTIME);
        /* Times are applied; keep setattr_copy() from touching them. */
        attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME);

        setattr_copy(idmap, inode, attr);
        mark_inode_dirty(inode);
out:
        return error;
}
EXPORT_SYMBOL_GPL(fat_setattr);

/* Inode operations installed for regular files on FAT filesystems. */
const struct inode_operations fat_file_inode_operations = {
        .getattr        = fat_getattr,
        .setattr        = fat_setattr,
        .update_time    = fat_update_time,
};




































































    2 





















    2 







    1 






    1 










    1 



    1 
    1 






































































    1 




















    1 

































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
// SPDX-License-Identifier: GPL-2.0
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
#include "internal.h"

/*
 * Translate per-mount MNT_* flags into the ST_* flags reported by statfs.
 * Mount flags without an entry in the table are ignored.
 */
static int flags_by_mnt(int mnt_flags)
{
        static const struct {
                int mnt_bit;
                int st_bit;
        } map[] = {
                { MNT_READONLY,         ST_RDONLY },
                { MNT_NOSUID,           ST_NOSUID },
                { MNT_NODEV,            ST_NODEV },
                { MNT_NOEXEC,           ST_NOEXEC },
                { MNT_NOATIME,          ST_NOATIME },
                { MNT_NODIRATIME,       ST_NODIRATIME },
                { MNT_RELATIME,         ST_RELATIME },
                { MNT_NOSYMFOLLOW,      ST_NOSYMFOLLOW },
        };
        unsigned int i;
        int st_flags = 0;

        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
                if (mnt_flags & map[i].mnt_bit)
                        st_flags |= map[i].st_bit;
        }

        return st_flags;
}

/*
 * Translate superblock SB_* flags into the ST_* flags reported by statfs.
 * Only SB_SYNCHRONOUS, SB_MANDLOCK and SB_RDONLY are mapped.
 */
static int flags_by_sb(int s_flags)
{
        return ((s_flags & SB_SYNCHRONOUS) ? ST_SYNCHRONOUS : 0) |
               ((s_flags & SB_MANDLOCK) ? ST_MANDLOCK : 0) |
               ((s_flags & SB_RDONLY) ? ST_RDONLY : 0);
}

/*
 * Build the f_flags value for @mnt: ST_VALID combined with the flags
 * derived from the mount and from its superblock.
 */
static int calculate_f_flags(struct vfsmount *mnt)
{
        int from_mnt = flags_by_mnt(mnt->mnt_flags);
        int from_sb = flags_by_sb(mnt->mnt_sb->s_flags);

        return ST_VALID | from_mnt | from_sb;
}

static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
{
        int retval;

        if (!dentry->d_sb->s_op->statfs)
                return -ENOSYS;

        memset(buf, 0, sizeof(*buf));
        retval = security_sb_statfs(dentry);
        if (retval)
                return retval;
        retval = dentry->d_sb->s_op->statfs(dentry, buf);
        if (retval == 0 && buf->f_frsize == 0)
                buf->f_frsize = buf->f_bsize;
        return retval;
}

/*
 * Store the filesystem id of the filesystem containing @dentry in *@fsid.
 * Returns 0 on success; on failure the statfs error is returned and *@fsid
 * is left untouched.
 */
int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
{
        struct kstatfs st;
        int error = statfs_by_dentry(dentry, &st);

        if (!error)
                *fsid = st.f_fsid;

        return error;
}
EXPORT_SYMBOL(vfs_get_fsid);

int vfs_statfs(const struct path *path, struct kstatfs *buf)
{
        int error;

        error = statfs_by_dentry(path->dentry, buf);
        if (!error)
                buf->f_flags = calculate_f_flags(path->mnt);
        return error;
}
EXPORT_SYMBOL(vfs_statfs);

/*
 * statfs by user-supplied path name.
 *
 * The path is resolved relative to the current working directory,
 * following symlinks and triggering automounts.  When retry_estale() asks
 * for it, the lookup is repeated with LOOKUP_REVAL added.  Returns 0 or the
 * lookup/statfs error.
 */
int user_statfs(const char __user *pathname, struct kstatfs *st)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT;
        struct path path;
        int error;

        for (;;) {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (error)
                        break;

                error = vfs_statfs(&path, st);
                path_put(&path);
                if (!retry_estale(error, lookup_flags))
                        break;

                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

/*
 * statfs by file descriptor.  Returns -EBADF when @fd does not refer to an
 * open file, otherwise the vfs_statfs() result for the file's path.
 */
int fd_statfs(int fd, struct kstatfs *st)
{
        struct fd f = fdget_raw(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = vfs_statfs(&f.file->f_path, st);
        fdput(f);

        return error;
}

/*
 * Convert @st to the native struct statfs layout and copy it to user
 * pointer @p.
 *
 * When the two structures have the same size the data is copied verbatim.
 * Otherwise the fields are assigned one by one into a zeroed buffer; if the
 * native block counters are 4 bytes wide, -EOVERFLOW is returned for values
 * that need more than 32 bits.  Returns -EFAULT when the copy to userspace
 * fails, 0 on success.
 */
static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
{
        struct statfs buf;

        if (sizeof(buf) != sizeof(*st)) {
                memset(&buf, 0, sizeof(buf));

                if (sizeof(buf.f_blocks) == 4) {
                        if ((st->f_blocks | st->f_bfree | st->f_bavail |
                             st->f_bsize | st->f_frsize) &
                            0xffffffff00000000ULL)
                                return -EOVERFLOW;
                        /*
                         * f_files and f_ffree are allowed to be -1; that
                         * value may be squeezed into 32 bits.
                         */
                        if ((st->f_files != -1 &&
                             (st->f_files & 0xffffffff00000000ULL)) ||
                            (st->f_ffree != -1 &&
                             (st->f_ffree & 0xffffffff00000000ULL)))
                                return -EOVERFLOW;
                }

                buf.f_type = st->f_type;
                buf.f_bsize = st->f_bsize;
                buf.f_blocks = st->f_blocks;
                buf.f_bfree = st->f_bfree;
                buf.f_bavail = st->f_bavail;
                buf.f_files = st->f_files;
                buf.f_ffree = st->f_ffree;
                buf.f_fsid = st->f_fsid;
                buf.f_namelen = st->f_namelen;
                buf.f_frsize = st->f_frsize;
                buf.f_flags = st->f_flags;
        } else {
                memcpy(&buf, st, sizeof(*st));
        }

        return copy_to_user(p, &buf, sizeof(buf)) ? -EFAULT : 0;
}

/*
 * Convert @st into a struct statfs64 and copy it to the user buffer @p.
 *
 * If both structures have the same size the kernel copy is used verbatim;
 * otherwise the result is zeroed and filled in member by member.
 *
 * Returns 0 on success or -EFAULT if the copy to userspace fails.
 */
static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
{
        struct statfs64 buf;

        if (sizeof(buf) != sizeof(*st)) {
                memset(&buf, 0, sizeof(buf));
                buf.f_type = st->f_type;
                buf.f_bsize = st->f_bsize;
                buf.f_blocks = st->f_blocks;
                buf.f_bfree = st->f_bfree;
                buf.f_bavail = st->f_bavail;
                buf.f_files = st->f_files;
                buf.f_ffree = st->f_ffree;
                buf.f_fsid = st->f_fsid;
                buf.f_namelen = st->f_namelen;
                buf.f_frsize = st->f_frsize;
                buf.f_flags = st->f_flags;
        } else {
                memcpy(&buf, st, sizeof(*st));
        }

        return copy_to_user(p, &buf, sizeof(buf)) ? -EFAULT : 0;
}

/*
 * statfs: gather statistics for the filesystem containing @pathname and
 * return them to userspace in the native struct statfs layout.
 */
SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
{
        struct kstatfs st;
        int error;

        error = user_statfs(pathname, &st);
        if (error)
                return error;

        return do_statfs_native(&st, buf);
}

/*
 * statfs64: like statfs, but the result uses struct statfs64.  @sz must match
 * the size of that structure exactly, otherwise -EINVAL is returned before
 * any lookup is attempted.
 */
SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
{
        struct kstatfs st;
        int error;

        if (sz != sizeof(*buf))
                return -EINVAL;

        error = user_statfs(pathname, &st);
        if (error)
                return error;

        return do_statfs64(&st, buf);
}

/*
 * fstatfs: gather statistics for the filesystem of the open file @fd and
 * return them to userspace in the native struct statfs layout.
 */
SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
{
        struct kstatfs st;
        int error;

        error = fd_statfs(fd, &st);
        if (error)
                return error;

        return do_statfs_native(&st, buf);
}

/*
 * fstatfs64: like fstatfs, but the result uses struct statfs64.  @sz must
 * match the size of that structure exactly, otherwise -EINVAL is returned.
 */
SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
{
        struct kstatfs st;
        int error;

        if (sz != sizeof(*buf))
                return -EINVAL;

        error = fd_statfs(fd, &st);
        if (error)
                return error;

        return do_statfs64(&st, buf);
}

/*
 * Collect filesystem statistics for the superblock found for device @dev.
 *
 * Returns -EINVAL if user_get_super() finds no superblock, otherwise the
 * result of statfs_by_dentry() on the superblock's root dentry.
 */
static int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
{
        struct super_block *sb;
        int ret;

        sb = user_get_super(dev, false);
        if (!sb)
                return -EINVAL;

        ret = statfs_by_dentry(sb->s_root, sbuf);
        drop_super(sb);
        return ret;
}

/*
 * ustat: report the free block and free inode counts of the filesystem on
 * device @dev.  Only f_tfree and f_tinode are filled in; the rest of the
 * structure is zeroed.
 */
SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
{
        struct kstatfs sbuf;
        struct ustat tmp;
        int err;

        err = vfs_ustat(new_decode_dev(dev), &sbuf);
        if (err)
                return err;

        memset(&tmp, 0, sizeof(tmp));
        tmp.f_tfree = sbuf.f_bfree;

        /* With a 32-bit f_tinode, clamp the value to UINT_MAX. */
        if (IS_ENABLED(CONFIG_ARCH_32BIT_USTAT_F_TINODE))
                tmp.f_tinode = min_t(u64, sbuf.f_ffree, UINT_MAX);
        else
                tmp.f_tinode = sbuf.f_ffree;

        if (copy_to_user(ubuf, &tmp, sizeof(tmp)))
                return -EFAULT;
        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Convert @kbuf into a struct compat_statfs and copy it to the user buffer
 * @ubuf.
 *
 * When the compat f_blocks member is 4 bytes wide, values that need more than
 * 32 bits are rejected with -EOVERFLOW.  Returns 0 on success or -EFAULT if
 * the copy to userspace fails.
 */
static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf)
{
        const unsigned long long upper32 = 0xffffffff00000000ULL;
        const unsigned long long all_ones = 0xffffffffffffffffULL;
        struct compat_statfs buf;

        if (sizeof(ubuf->f_blocks) == 4) {
                /* Block counts and sizes must fit in 32 bits. */
                if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
                     kbuf->f_bsize | kbuf->f_frsize) & upper32)
                        return -EOVERFLOW;

                /*
                 * f_files and f_ffree are allowed to be -1 (all bits set),
                 * which can safely be stored in a 32-bit member.
                 */
                if (kbuf->f_files != all_ones && (kbuf->f_files & upper32))
                        return -EOVERFLOW;
                if (kbuf->f_ffree != all_ones && (kbuf->f_ffree & upper32))
                        return -EOVERFLOW;
        }

        memset(&buf, 0, sizeof(buf));
        buf.f_type = kbuf->f_type;
        buf.f_bsize = kbuf->f_bsize;
        buf.f_blocks = kbuf->f_blocks;
        buf.f_bfree = kbuf->f_bfree;
        buf.f_bavail = kbuf->f_bavail;
        buf.f_files = kbuf->f_files;
        buf.f_ffree = kbuf->f_ffree;
        buf.f_namelen = kbuf->f_namelen;
        buf.f_fsid.val[0] = kbuf->f_fsid.val[0];
        buf.f_fsid.val[1] = kbuf->f_fsid.val[1];
        buf.f_frsize = kbuf->f_frsize;
        buf.f_flags = kbuf->f_flags;

        return copy_to_user(ubuf, &buf, sizeof(buf)) ? -EFAULT : 0;
}

/*
 * The following statfs calls are copies of code from fs/statfs.c and
 * should be checked against those from time to time
 */
/* Compat statfs: look up @pathname and return a struct compat_statfs. */
COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
{
        struct kstatfs tmp;
        int error;

        error = user_statfs(pathname, &tmp);
        if (error)
                return error;

        return put_compat_statfs(buf, &tmp);
}

/* Compat fstatfs: query the open file @fd and return a struct compat_statfs. */
COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
{
        struct kstatfs tmp;
        int error;

        error = fd_statfs(fd, &tmp);
        if (error)
                return error;

        return put_compat_statfs(buf, &tmp);
}

/*
 * Convert @kbuf into a struct compat_statfs64 and copy it to the user buffer
 * @ubuf.
 *
 * Returns -EOVERFLOW if f_bsize or f_frsize has any of the upper 32 bits set,
 * -EFAULT if the copy to userspace fails, and 0 otherwise.
 */
static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
{
        const unsigned long long upper32 = 0xffffffff00000000ULL;
        struct compat_statfs64 buf;

        if ((kbuf->f_bsize | kbuf->f_frsize) & upper32)
                return -EOVERFLOW;

        memset(&buf, 0, sizeof(buf));
        buf.f_type = kbuf->f_type;
        buf.f_bsize = kbuf->f_bsize;
        buf.f_blocks = kbuf->f_blocks;
        buf.f_bfree = kbuf->f_bfree;
        buf.f_bavail = kbuf->f_bavail;
        buf.f_files = kbuf->f_files;
        buf.f_ffree = kbuf->f_ffree;
        buf.f_namelen = kbuf->f_namelen;
        buf.f_fsid.val[0] = kbuf->f_fsid.val[0];
        buf.f_fsid.val[1] = kbuf->f_fsid.val[1];
        buf.f_frsize = kbuf->f_frsize;
        buf.f_flags = kbuf->f_flags;

        return copy_to_user(ubuf, &buf, sizeof(buf)) ? -EFAULT : 0;
}

/*
 * Compat statfs64 implementation: @sz must equal the size of
 * struct compat_statfs64 (else -EINVAL); the statistics for @pathname are
 * then looked up and copied to @buf.
 */
int kcompat_sys_statfs64(const char __user * pathname, compat_size_t sz, struct compat_statfs64 __user * buf)
{
        struct kstatfs tmp;
        int error;

        if (sz != sizeof(*buf))
                return -EINVAL;

        error = user_statfs(pathname, &tmp);
        if (error)
                return error;

        return put_compat_statfs64(buf, &tmp);
}

/* Compat statfs64 entry point; all the work is done by kcompat_sys_statfs64(). */
COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
{
        return kcompat_sys_statfs64(pathname, sz, buf);
}

/*
 * Compat fstatfs64 implementation: @sz must equal the size of
 * struct compat_statfs64 (else -EINVAL); the statistics for the open file
 * @fd are then looked up and copied to @buf.
 */
int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user * buf)
{
        struct kstatfs tmp;
        int error;

        if (sz != sizeof(*buf))
                return -EINVAL;

        error = fd_statfs(fd, &tmp);
        if (error)
                return error;

        return put_compat_statfs64(buf, &tmp);
}

/* Compat fstatfs64 entry point; all the work is done by kcompat_sys_fstatfs64(). */
COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
{
        return kcompat_sys_fstatfs64(fd, sz, buf);
}

/*
 * This is a copy of sys_ustat, just dealing with a different structure layout.
 * Given how simple this syscall is, that approach is more maintainable
 * than the various conversion hacks.
 */
COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
{
        struct kstatfs sbuf;
        struct compat_ustat tmp;
        int err;

        err = vfs_ustat(new_decode_dev(dev), &sbuf);
        if (err)
                return err;

        /* Only the free block and free inode counts are reported. */
        memset(&tmp, 0, sizeof(tmp));
        tmp.f_tfree = sbuf.f_bfree;
        tmp.f_tinode = sbuf.f_ffree;

        return copy_to_user(u, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
#endif



































    1 










    1 




    1 










    1 
    1 

    1 
    1 
    1 





























    1 




    1 

    1 







    1 




    1 




    1 




    1 



    1 


















    1 





    1 
    1 
    1 








































































































































































































    1 




    1 



















    1 





    1 

    1 



    1 


    1 







    1 





    1 









    1 


    1 

















    1 

















    1 































    1 


    1 

    1 


    1 
    1 


    1 

























































































































































































    1 







    1 


















    1 
    1 




    1 








    1 









    1 























    1 
    1 




















    1 














    1 



























    1 






    1 





    1 







    1 





























































    1 





















    1 







































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
// SPDX-License-Identifier: GPL-2.0

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include "messages.h"
#include "ctree.h"
#include "extent_map.h"
#include "compression.h"
#include "btrfs_inode.h"
#include "disk-io.h"


static struct kmem_cache *extent_map_cache;

/*
 * Create the slab cache that extent_map structures are allocated from.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
int __init extent_map_init(void)
{
        extent_map_cache = kmem_cache_create("btrfs_extent_map",
                                             sizeof(struct extent_map), 0, 0, NULL);

        return extent_map_cache ? 0 : -ENOMEM;
}

/* Destroy the extent_map slab cache created by extent_map_init(). */
void __cold extent_map_exit(void)
{
        kmem_cache_destroy(extent_map_cache);
}

/*
 * Set up @tree as an empty extent map tree: initialized lock, empty list of
 * modified extents and an empty rb-tree.  Should be called for each new inode
 * or other user of the extent_map interface.
 */
void extent_map_tree_init(struct extent_map_tree *tree)
{
        rwlock_init(&tree->lock);
        INIT_LIST_HEAD(&tree->modified_extents);
        tree->map = RB_ROOT_CACHED;
}

/*
 * Allocate a zeroed extent_map structure from the slab cache.
 *
 * The new structure is returned with a reference count of one and must be
 * released with free_extent_map().  Returns NULL if the allocation fails.
 */
struct extent_map *alloc_extent_map(void)
{
        struct extent_map *em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);

        if (em) {
                RB_CLEAR_NODE(&em->rb_node);
                refcount_set(&em->refs, 1);
                INIT_LIST_HEAD(&em->list);
        }
        return em;
}

/*
 * Drop one reference on @em and free the structure once the reference count
 * reaches zero.  A NULL @em is ignored.
 */
void free_extent_map(struct extent_map *em)
{
        if (!em || !refcount_dec_and_test(&em->refs))
                return;

        /* The last reference is gone: warn if it is still linked anywhere. */
        WARN_ON(extent_map_in_tree(em));
        WARN_ON(!list_empty(&em->list));
        kmem_cache_free(extent_map_cache, em);
}

/*
 * Return the end offset of the range starting at @start with length @len,
 * saturating to the maximum u64 value if the addition wraps around.
 */
static u64 range_end(u64 start, u64 len)
{
        u64 end = start + len;

        return end < start ? (u64)-1 : end;
}

static void dec_evictable_extent_maps(struct btrfs_inode *inode)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;

        if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
                percpu_counter_dec(&fs_info->evictable_extent_maps);
}

/*
 * Link @em into the rb-tree @root, ordered by file offset.
 *
 * Returns 0 on success, or -EEXIST if the range of @em overlaps an extent map
 * that is already in the tree (in which case the tree is left untouched).
 */
static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
        struct rb_node **p = &root->rb_root.rb_node;
        struct rb_node *parent = NULL;
        struct extent_map *entry = NULL;
        struct rb_node *orig_parent = NULL;
        u64 end = range_end(em->start, em->len);
        /* Stays true only if the descent below never takes a right branch. */
        bool leftmost = true;

        /*
         * Descend to the empty slot where em->start belongs.  If em->start
         * lies inside an existing entry, the new map overlaps it.
         */
        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct extent_map, rb_node);

                if (em->start < entry->start) {
                        p = &(*p)->rb_left;
                } else if (em->start >= extent_map_end(entry)) {
                        p = &(*p)->rb_right;
                        leftmost = false;
                } else {
                        return -EEXIST;
                }
        }

        /*
         * Starting at the would-be parent, walk forward to the first entry
         * that ends after em->start and check that @em does not overlap it.
         * Note that 'entry' is only dereferenced while 'parent' is non-NULL.
         */
        orig_parent = parent;
        while (parent && em->start >= extent_map_end(entry)) {
                parent = rb_next(parent);
                entry = rb_entry(parent, struct extent_map, rb_node);
        }
        if (parent)
                if (end > entry->start && em->start < extent_map_end(entry))
                        return -EEXIST;

        /*
         * Same check in the other direction: walk backward from the would-be
         * parent to the first entry that starts at or before em->start.
         */
        parent = orig_parent;
        entry = rb_entry(parent, struct extent_map, rb_node);
        while (parent && em->start < entry->start) {
                parent = rb_prev(parent);
                entry = rb_entry(parent, struct extent_map, rb_node);
        }
        if (parent)
                if (end > entry->start && em->start < extent_map_end(entry))
                        return -EEXIST;

        /* No overlap found: link the node in the slot located by the descent. */
        rb_link_node(&em->rb_node, orig_parent, p);
        rb_insert_color_cached(&em->rb_node, root, leftmost);
        return 0;
}

/*
 * Search through the tree for an extent_map that contains @offset.
 *
 * If one is found its rb node is returned and *@prev_or_next_ret is not
 * written.  Otherwise NULL is returned and *@prev_or_next_ret is set to a
 * neighboring node: the first node ending after @offset if there is one,
 * else the closest node starting at or before @offset, or NULL if the tree
 * is empty.
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
                                     struct rb_node **prev_or_next_ret)
{
        struct rb_node *n = root->rb_node;
        struct rb_node *prev = NULL;
        struct rb_node *orig_prev = NULL;
        struct extent_map *entry;
        struct extent_map *prev_entry = NULL;

        ASSERT(prev_or_next_ret);

        /* Regular descent; 'prev' remembers the last node that was visited. */
        while (n) {
                entry = rb_entry(n, struct extent_map, rb_node);
                prev = n;
                prev_entry = entry;

                if (offset < entry->start)
                        n = n->rb_left;
                else if (offset >= extent_map_end(entry))
                        n = n->rb_right;
                else
                        return n;
        }

        /*
         * No exact hit.  Walk forward from the last visited node until one
         * that ends after @offset is found (or the tree is exhausted).
         */
        orig_prev = prev;
        while (prev && offset >= extent_map_end(prev_entry)) {
                prev = rb_next(prev);
                prev_entry = rb_entry(prev, struct extent_map, rb_node);
        }

        /*
         * The forward walk found a neighboring extent map; return it, as in
         * this case the caller does not care about one on the other side.
         */
        if (prev) {
                *prev_or_next_ret = prev;
                return NULL;
        }

        /*
         * Nothing ends after @offset: walk backward from the last visited
         * node to the first one that starts at or before @offset.  This may
         * leave 'prev' NULL; 'prev_entry' is only dereferenced while 'prev'
         * is non-NULL.
         */
        prev = orig_prev;
        prev_entry = rb_entry(prev, struct extent_map, rb_node);
        while (prev && offset < prev_entry->start) {
                prev = rb_prev(prev);
                prev_entry = rb_entry(prev, struct extent_map, rb_node);
        }
        *prev_or_next_ret = prev;

        return NULL;
}

/*
 * Return block_start + block_len of @em, saturating to the maximum u64 value
 * if the addition wraps around.
 */
static inline u64 extent_map_block_end(const struct extent_map *em)
{
        return (em->block_start + em->block_len < em->block_start) ?
                (u64)-1 : em->block_start + em->block_len;
}

/*
 * Tell whether @em on its own is eligible for merging with a neighbor.
 *
 * An extent map is not eligible when:
 *  - it is pinned;
 *  - it is compressed, since we need to know the actual size of those;
 *  - it is flagged as being logged;
 *  - it is still on a list, i.e. it hasn't been written to the log yet and
 *    may not reflect exactly what is on disk.
 */
static bool can_merge_extent_map(const struct extent_map *em)
{
        return !(em->flags & EXTENT_FLAG_PINNED) &&
               !extent_map_is_compressed(em) &&
               !(em->flags & EXTENT_FLAG_LOGGING) &&
               list_empty(&em->list);
}

/*
 * Check whether @prev and @next are adjacent in the file, carry identical
 * flags and have block ranges that allow them to be merged.
 */
static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
{
        if (extent_map_end(prev) != next->start || prev->flags != next->flags)
                return false;

        /* HOLES and INLINE extents: both must share the same block_start. */
        if (next->block_start >= EXTENT_MAP_LAST_BYTE - 1)
                return next->block_start == prev->block_start;

        /* Otherwise @next must begin exactly where @prev's blocks end. */
        return next->block_start == extent_map_block_end(prev);
}

/*
 * Try to merge @em, which is in the extent map tree of @inode, with its
 * previous and next neighbors in the tree.
 *
 * A neighbor that gets merged is absorbed into @em: @em is grown to cover it,
 * the neighbor is erased from the tree and a reference on it is dropped.
 */
static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{
        struct extent_map_tree *tree = &inode->extent_tree;
        struct extent_map *merge = NULL;
        struct rb_node *rb;

        /*
         * We can't modify an extent map that is in the tree and that is being
         * used by another task, as it can cause that other task to see it in
         * inconsistent state during the merging. We always have 1 reference for
         * the tree and 1 for this task (which is unpinning the extent map or
         * clearing the logging flag), so anything > 2 means it's being used by
         * other tasks too.
         */
        if (refcount_read(&em->refs) > 2)
                return;

        if (!can_merge_extent_map(em))
                return;

        /* Backward merge: absorb the previous extent map into @em. */
        if (em->start != 0) {
                rb = rb_prev(&em->rb_node);
                if (rb)
                        merge = rb_entry(rb, struct extent_map, rb_node);
                if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
                        /* @em now begins where the previous map began. */
                        em->start = merge->start;
                        em->orig_start = merge->orig_start;
                        em->len += merge->len;
                        em->block_len += merge->block_len;
                        em->block_start = merge->block_start;
                        em->generation = max(em->generation, merge->generation);
                        em->flags |= EXTENT_FLAG_MERGED;

                        rb_erase_cached(&merge->rb_node, &tree->map);
                        RB_CLEAR_NODE(&merge->rb_node);
                        free_extent_map(merge);
                        dec_evictable_extent_maps(inode);
                }
        }

        /* Forward merge: absorb the next extent map into @em. */
        rb = rb_next(&em->rb_node);
        if (rb)
                merge = rb_entry(rb, struct extent_map, rb_node);
        if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
                /* Only the lengths grow; the start of @em is unchanged. */
                em->len += merge->len;
                em->block_len += merge->block_len;
                rb_erase_cached(&merge->rb_node, &tree->map);
                RB_CLEAR_NODE(&merge->rb_node);
                em->generation = max(em->generation, merge->generation);
                em->flags |= EXTENT_FLAG_MERGED;
                free_extent_map(merge);
                dec_evictable_extent_maps(inode);
        }
}

/*
 * Unpin an extent from the cache.
 *
 * @inode:      the inode from which we are unpinning an extent range
 * @start:      logical offset in the file
 * @len:        length of the extent
 * @gen:        generation that this extent has been modified in
 *
 * Called after an extent has been written to disk properly.  Set the generation
 * to the generation that actually added the file item to the inode so we know
 * we need to sync this extent when we call fsync().
 *
 * The extent map tree lock of @inode is taken for writing for the duration of
 * the call.
 *
 * Returns: 0        on success
 *          -ENOENT  when the extent is not found in the tree
 *          -EUCLEAN if the found extent does not match the expected start
 */
int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct extent_map_tree *tree = &inode->extent_tree;
        int ret = 0;
        struct extent_map *em;

        write_lock(&tree->lock);
        /* On success the lookup returns @em with an extra reference held. */
        em = lookup_extent_mapping(tree, start, len);

        if (WARN_ON(!em)) {
                btrfs_warn(fs_info,
"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
                           btrfs_ino(inode), btrfs_root_id(inode->root),
                           start, start + len, gen);
                ret = -ENOENT;
                goto out;
        }

        if (WARN_ON(em->start != start)) {
                btrfs_warn(fs_info,
"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
                           btrfs_ino(inode), btrfs_root_id(inode->root),
                           em->start, start, start + len, gen);
                ret = -EUCLEAN;
                goto out;
        }

        em->generation = gen;
        em->flags &= ~EXTENT_FLAG_PINNED;

        /* No longer pinned, so it may now be mergeable with its neighbors. */
        try_merge_map(inode, em);

out:
        write_unlock(&tree->lock);
        /* Drop the lookup reference; free_extent_map() ignores a NULL @em. */
        free_extent_map(em);
        return ret;

}

/*
 * Clear the logging flag of @em and, if it is still in the extent map tree,
 * try to merge it with its neighbors.  The caller must hold the tree lock of
 * @inode for writing.
 */
void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
{
        lockdep_assert_held_write(&inode->extent_tree.lock);

        em->flags &= ~EXTENT_FLAG_LOGGING;

        if (!extent_map_in_tree(em))
                return;

        try_merge_map(inode, em);
}

/*
 * Finish adding @em to the tree of @inode: take a reference on it, then
 * either queue it on the modified extents list (@modified != 0) or try to
 * merge it with its neighbors.
 */
static inline void setup_extent_mapping(struct btrfs_inode *inode,
                                        struct extent_map *em,
                                        int modified)
{
        refcount_inc(&em->refs);
        ASSERT(list_empty(&em->list));

        if (!modified) {
                try_merge_map(inode, em);
                return;
        }

        list_add(&em->list, &inode->extent_tree.modified_extents);
}

/*
 * Add a new extent map to an inode's extent map tree.
 *
 * @inode:      the target inode
 * @em:         map to insert
 * @modified:   indicate whether the given @em should be added to the
 *              modified list, which indicates the extent needs to be logged
 *
 * Insert @em into the extent map tree of @inode, taking an additional
 * reference on it.  Unless @modified is set, a simple forward/backward merge
 * with the existing neighboring mappings is attempted afterwards.  The caller
 * must hold the tree lock for writing.
 *
 * Returns 0 on success or the error reported by tree_insert().
 */
static int add_extent_mapping(struct btrfs_inode *inode,
                              struct extent_map *em, int modified)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        int ret;

        lockdep_assert_held_write(&inode->extent_tree.lock);

        ret = tree_insert(&inode->extent_tree.map, em);
        if (ret)
                return ret;

        setup_extent_mapping(inode, em, modified);

        /* Only extent maps of fs trees are counted, and never in test mode. */
        if (btrfs_is_testing(fs_info) || !is_fstree(btrfs_root_id(inode->root)))
                return 0;

        percpu_counter_inc(&fs_info->evictable_extent_maps);
        return 0;
}

/*
 * Common helper for lookup_extent_mapping() and search_extent_mapping().
 *
 * Looks up the extent map containing @start, falling back to the neighbor
 * reported by __tree_search().  With @strict set, the result is only returned
 * if it really intersects [@start, @start + @len).  The returned extent map
 * has its reference count incremented; NULL is returned if nothing suitable
 * was found.
 */
static struct extent_map *
__lookup_extent_mapping(struct extent_map_tree *tree,
                        u64 start, u64 len, int strict)
{
        struct rb_node *nearby = NULL;
        struct rb_node *node;
        struct extent_map *em;
        u64 end = range_end(start, len);

        node = __tree_search(&tree->map.rb_root, start, &nearby);
        if (!node)
                node = nearby;
        if (!node)
                return NULL;

        em = rb_entry(node, struct extent_map, rb_node);

        if (strict) {
                /* Reject a neighbor that lies entirely outside the range. */
                if (end <= em->start || start >= extent_map_end(em))
                        return NULL;
        }

        refcount_inc(&em->refs);
        return em;
}

/*
 * Lookup extent_map that intersects the range [@start, @start + @len).
 *
 * @tree:       tree to lookup in
 * @start:      byte offset to start the search
 * @len:        length of the lookup range
 *
 * Find and return the first extent_map struct in @tree that intersects the
 * [@start, @start + @len) range.  There may be additional objects in the tree
 * that intersect, so check the object returned carefully to make sure that no
 * additional lookups are needed.
 *
 * The extent map is returned with an extra reference held; NULL is returned
 * if no extent map intersects the range.
 */
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
                                         u64 start, u64 len)
{
        return __lookup_extent_mapping(tree, start, len, 1);
}

/*
 * Find a nearby extent map intersecting the range [@start, @start + @len)
 * (not an exact search).
 *
 * @tree:       tree to lookup in
 * @start:      byte offset to start the search
 * @len:        length of the lookup range
 *
 * Find and return the first extent_map struct in @tree that intersects the
 * [@start, @start + @len) range.
 *
 * If one can't be found, any nearby extent may be returned.  The extent map
 * is returned with an extra reference held; NULL is returned only if no
 * extent map was found at all.
 */
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
                                         u64 start, u64 len)
{
        return __lookup_extent_mapping(tree, start, len, 0);
}

/*
 * Remove an extent_map from its inode's extent tree.
 *
 * @inode:      the inode the extent map belongs to
 * @em:         extent map being removed
 *
 * Remove @em from the extent tree of @inode.  No reference counts are dropped,
 * and no checks are done to see if the range is in use.  The caller must hold
 * the tree lock for writing.
 */
void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
{
        struct extent_map_tree *tree = &inode->extent_tree;

        lockdep_assert_held_write(&tree->lock);

        /* Removing a pinned extent map is unexpected, so warn about it. */
        WARN_ON(em->flags & EXTENT_FLAG_PINNED);
        rb_erase_cached(&em->rb_node, &tree->map);
        /* An extent map flagged as being logged is left on its list. */
        if (!(em->flags & EXTENT_FLAG_LOGGING))
                list_del_init(&em->list);
        RB_CLEAR_NODE(&em->rb_node);

        dec_evictable_extent_maps(inode);
}

/*
 * Replace the extent map @cur, which must be in the tree of @inode, with
 * @new, reusing the position of @cur in the rb-tree.
 *
 * @modified is passed on to setup_extent_mapping() for @new.  No reference on
 * @cur is dropped here.  The caller must hold the tree lock for writing.
 */
static void replace_extent_mapping(struct btrfs_inode *inode,
                                   struct extent_map *cur,
                                   struct extent_map *new,
                                   int modified)
{
        struct extent_map_tree *tree = &inode->extent_tree;

        lockdep_assert_held_write(&tree->lock);

        WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
        ASSERT(extent_map_in_tree(cur));
        /* An extent map flagged as being logged is left on its list. */
        if (!(cur->flags & EXTENT_FLAG_LOGGING))
                list_del_init(&cur->list);
        rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
        RB_CLEAR_NODE(&cur->rb_node);

        setup_extent_mapping(inode, new, modified);
}

/* Return the extent map following @em in its tree, or NULL if @em is the last. */
static struct extent_map *next_extent_map(const struct extent_map *em)
{
        struct rb_node *node = rb_next(&em->rb_node);

        return node ? container_of(node, struct extent_map, rb_node) : NULL;
}

/* Return the extent map preceding @em in its tree, or NULL if @em is the first. */
static struct extent_map *prev_extent_map(struct extent_map *em)
{
        struct rb_node *node = rb_prev(&em->rb_node);

        return node ? container_of(node, struct extent_map, rb_node) : NULL;
}

/*
 * Helper for btrfs_get_extent.  Given an existing extent in the tree,
 * the existing extent is the nearest extent to map_start,
 * and an extent that you want to insert, deal with overlap and insert
 * the best fitted new extent into the tree.
 *
 * @em is trimmed in place so that it fits between the neighbors around
 * @map_start before it is inserted.
 *
 * Returns -EINVAL if @map_start is not inside the range of @em, otherwise the
 * result of add_extent_mapping().
 */
static noinline int merge_extent_mapping(struct btrfs_inode *inode,
                                         struct extent_map *existing,
                                         struct extent_map *em,
                                         u64 map_start)
{
        struct extent_map *prev;
        struct extent_map *next;
        u64 start;
        u64 end;
        u64 start_diff;

        if (map_start < em->start || map_start >= extent_map_end(em))
                return -EINVAL;

        /* Work out which neighbors sit on either side of @map_start. */
        if (existing->start > map_start) {
                next = existing;
                prev = prev_extent_map(next);
        } else {
                prev = existing;
                next = next_extent_map(prev);
        }

        /*
         * Clamp @em to the gap between the end of 'prev' and the start of
         * 'next', never extending it beyond its original range.
         */
        start = prev ? extent_map_end(prev) : em->start;
        start = max_t(u64, start, em->start);
        end = next ? next->start : extent_map_end(em);
        end = min_t(u64, end, extent_map_end(em));
        start_diff = start - em->start;
        em->start = start;
        em->len = end - start;
        /*
         * For uncompressed extents with a block_start below
         * EXTENT_MAP_LAST_BYTE, shift the block range by the same amount the
         * file range was trimmed at the front.
         */
        if (em->block_start < EXTENT_MAP_LAST_BYTE &&
            !extent_map_is_compressed(em)) {
                em->block_start += start_diff;
                em->block_len = em->len;
        }
        return add_extent_mapping(inode, em, 0);
}

/*
 * Add extent mapping into an inode's extent map tree.
 *
 * @inode:    target inode
 * @em_in:    extent we are inserting; on return it points to the extent map
 *            the caller should use: the one passed in (possibly trimmed), an
 *            extent map that was already in the tree, or NULL if the merge
 *            failed
 * @start:    start of the logical range btrfs_get_extent() is requesting
 * @len:      length of the logical range btrfs_get_extent() is requesting
 *
 * Note that @em_in's range may be different from [start, start+len),
 * but they must overlap.
 *
 * Insert @em_in into the inode's extent map tree. In case there is an
 * overlapping range, handle the -EEXIST by either:
 * a) Returning the existing extent in @em_in if @start is within the
 *    existing em. The extent map passed in is released with
 *    free_extent_map() in this case.
 * b) Trimming the extent map passed in against its neighbours and inserting
 *    the result, see merge_extent_mapping().
 *
 * Return 0 on success, otherwise -EEXIST.
 *
 * NOTE(review): merge_extent_mapping() can also return -EINVAL, which is not
 * covered by the return value description above nor by the ASSERT() at the
 * end of this function - confirm whether that case is reachable.
 */
int btrfs_add_extent_mapping(struct btrfs_inode *inode,
                             struct extent_map **em_in, u64 start, u64 len)
{
        int ret;
        struct extent_map *em = *em_in;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;

        /*
         * Tree-checker should have rejected any inline extent with non-zero
         * file offset. Here just do a sanity check.
         */
        if (em->block_start == EXTENT_MAP_INLINE)
                ASSERT(em->start == 0);

        ret = add_extent_mapping(inode, em, 0);
        /*
         * It is possible that someone inserted the extent into the tree
         * while we had the lock dropped. It is also possible that an
         * overlapping map exists in the tree.
         */
        if (ret == -EEXIST) {
                struct extent_map *existing;

                existing = search_extent_mapping(&inode->extent_tree, start, len);

                trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);

                /*
                 * existing will always be non-NULL, since there must be
                 * extent causing the -EEXIST.
                 */
                if (start >= existing->start &&
                    start < extent_map_end(existing)) {
                        /* Case a): hand the existing extent map to the caller. */
                        free_extent_map(em);
                        *em_in = existing;
                        ret = 0;
                } else {
                        /*
                         * Saved for the warning below, because the merge
                         * modifies em->start and em->len.
                         */
                        u64 orig_start = em->start;
                        u64 orig_len = em->len;

                        /*
                         * The existing extent map is the one nearest to
                         * the [start, start + len) range which overlaps
                         */
                        ret = merge_extent_mapping(inode, existing, em, start);
                        if (WARN_ON(ret)) {
                                free_extent_map(em);
                                *em_in = NULL;
                                btrfs_warn(fs_info,
"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu",
                                           existing->start, extent_map_end(existing),
                                           orig_start, orig_start + orig_len, start);
                        }
                        /* Case b): the existing extent map is not returned. */
                        free_extent_map(existing);
                }
        }

        ASSERT(ret == 0 || ret == -EEXIST);
        return ret;
}

/*
 * Empty the extent map tree of @inode as quickly as possible.
 *
 * The leftmost extent map is removed over and over until the tree is empty,
 * so no search from the root down to the first extent map is needed before
 * each deletion. The tree lock may be released to reschedule between
 * removals.
 */
static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
{
        struct extent_map_tree *em_tree = &inode->extent_tree;

        write_lock(&em_tree->lock);
        for (;;) {
                struct rb_node *first;
                struct extent_map *em;

                if (RB_EMPTY_ROOT(&em_tree->map.rb_root))
                        break;

                first = rb_first_cached(&em_tree->map);
                em = rb_entry(first, struct extent_map, rb_node);

                /* Clear both flags so the removal neither warns nor skips the list. */
                em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
                remove_extent_mapping(inode, em);
                free_extent_map(em);

                cond_resched_rwlock_write(&em_tree->lock);
        }
        write_unlock(&em_tree->lock);
}

/*
 * Drop all extent maps in a given range.
 *
 * @inode:       The target inode.
 * @start:       Start offset of the range.
 * @end:         End offset of the range (inclusive value).
 * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
 *
 * This drops all the extent maps that intersect the given range [@start, @end].
 * Extent maps that partially overlap the range and extend behind or beyond it,
 * are split.
 * If the extent maps needed for a split can not be allocated, the whole
 * extent map is removed instead and, if it was in the list of modified
 * extents, the inode is marked for a full fsync.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 */
void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
                                 bool skip_pinned)
{
        struct extent_map *split;
        struct extent_map *split2;
        struct extent_map *em;
        struct extent_map_tree *em_tree = &inode->extent_tree;
        u64 len = end - start + 1;

        WARN_ON(end < start);
        if (end == (u64)-1) {
                /* Dropping everything, pinned or not: take the fast path. */
                if (start == 0 && !skip_pinned) {
                        drop_all_extent_maps_fast(inode);
                        return;
                }
                len = (u64)-1;
        } else {
                /* Make end offset exclusive for use in the loop below. */
                end++;
        }

        /*
         * It's ok if we fail to allocate the extent maps, see the comment near
         * the bottom of the loop below. We only need two spare extent maps in
         * the worst case, where the first extent map that intersects our range
         * starts before the range and the last extent map that intersects our
         * range ends after our range (and they might be the same extent map),
         * because we need to split those two extent maps at the boundaries.
         */
        split = alloc_extent_map();
        split2 = alloc_extent_map();

        write_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);

        while (em) {
                /* extent_map_end() returns exclusive value (last byte + 1). */
                const u64 em_end = extent_map_end(em);
                struct extent_map *next_em = NULL;
                u64 gen;
                unsigned long flags;
                bool modified;
                bool compressed;

                /*
                 * Find the next extent map now, before @em is possibly taken
                 * out of the tree, and grab a reference on it if it still
                 * intersects our range.
                 */
                if (em_end < end) {
                        next_em = next_extent_map(em);
                        if (next_em) {
                                if (next_em->start < end)
                                        refcount_inc(&next_em->refs);
                                else
                                        next_em = NULL;
                        }
                }

                /*
                 * Leave pinned extent maps in the tree when asked to: move
                 * @start past this one and only drop our own reference.
                 */
                if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) {
                        start = em_end;
                        goto next;
                }

                flags = em->flags;
                /*
                 * In case we split the extent map, we want to preserve the
                 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
                 * it on the new extent maps.
                 */
                em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
                modified = !list_empty(&em->list);

                /*
                 * The extent map does not cross our target range, so no need to
                 * split it, we can remove it directly.
                 */
                if (em->start >= start && em_end <= end)
                        goto remove_em;

                gen = em->generation;
                compressed = extent_map_is_compressed(em);

                /* Keep the part of the extent map that lies before our range. */
                if (em->start < start) {
                        if (!split) {
                                split = split2;
                                split2 = NULL;
                                if (!split)
                                        goto remove_em;
                        }
                        split->start = em->start;
                        split->len = start - em->start;

                        if (em->block_start < EXTENT_MAP_LAST_BYTE) {
                                split->orig_start = em->orig_start;
                                split->block_start = em->block_start;

                                if (compressed)
                                        split->block_len = em->block_len;
                                else
                                        split->block_len = split->len;
                                split->orig_block_len = max(split->block_len,
                                                em->orig_block_len);
                                split->ram_bytes = em->ram_bytes;
                        } else {
                                split->orig_start = split->start;
                                split->block_len = 0;
                                split->block_start = em->block_start;
                                split->orig_block_len = 0;
                                split->ram_bytes = split->len;
                        }

                        split->generation = gen;
                        split->flags = flags;
                        replace_extent_mapping(inode, em, split, modified);
                        free_extent_map(split);
                        split = split2;
                        split2 = NULL;
                }
                /* Keep the part of the extent map that lies after our range. */
                if (em_end > end) {
                        if (!split) {
                                split = split2;
                                split2 = NULL;
                                if (!split)
                                        goto remove_em;
                        }
                        split->start = end;
                        split->len = em_end - end;
                        split->block_start = em->block_start;
                        split->flags = flags;
                        split->generation = gen;

                        if (em->block_start < EXTENT_MAP_LAST_BYTE) {
                                split->orig_block_len = max(em->block_len,
                                                    em->orig_block_len);

                                split->ram_bytes = em->ram_bytes;
                                if (compressed) {
                                        split->block_len = em->block_len;
                                        split->orig_start = em->orig_start;
                                } else {
                                        const u64 diff = end - em->start;

                                        split->block_len = split->len;
                                        split->block_start += diff;
                                        split->orig_start = em->orig_start;
                                }
                        } else {
                                split->ram_bytes = split->len;
                                split->orig_start = split->start;
                                split->block_len = 0;
                                split->orig_block_len = 0;
                        }

                        /*
                         * If the front split above already replaced @em, it is
                         * no longer in the tree and the tail has to be added
                         * as a new extent map.
                         */
                        if (extent_map_in_tree(em)) {
                                replace_extent_mapping(inode, em, split, modified);
                        } else {
                                int ret;

                                ret = add_extent_mapping(inode, split, modified);
                                /* Logic error, shouldn't happen. */
                                ASSERT(ret == 0);
                                if (WARN_ON(ret != 0) && modified)
                                        btrfs_set_inode_full_sync(inode);
                        }
                        free_extent_map(split);
                        split = NULL;
                }
remove_em:
                if (extent_map_in_tree(em)) {
                        /*
                         * If the extent map is still in the tree it means that
                         * either of the following is true:
                         *
                         * 1) It fits entirely in our range (doesn't end beyond
                         *    it or starts before it);
                         *
                         * 2) It starts before our range and/or ends after our
                         *    range, and we were not able to allocate the extent
                         *    maps for split operations, @split and @split2.
                         *
                         * If we are at case 2) then we just remove the entire
                         * extent map - this is fine since if anyone needs it to
                         * access the subranges outside our range, will just
                         * load it again from the subvolume tree's file extent
                         * item. However if the extent map was in the list of
                         * modified extents, then we must mark the inode for a
                         * full fsync, otherwise a fast fsync will miss this
                         * extent if it's new and needs to be logged.
                         */
                        if ((em->start < start || em_end > end) && modified) {
                                ASSERT(!split);
                                btrfs_set_inode_full_sync(inode);
                        }
                        remove_extent_mapping(inode, em);
                }

                /*
                 * Once for the tree reference (we replaced or removed the
                 * extent map from the tree).
                 */
                free_extent_map(em);
next:
                /* Once for us (for our lookup reference). */
                free_extent_map(em);

                em = next_em;
        }

        write_unlock(&em_tree->lock);

        /* Release whatever spare extent maps were not consumed by a split. */
        free_extent_map(split);
        free_extent_map(split2);
}

/*
 * Replace a range in the inode's extent map tree with a new extent map.
 *
 * @inode:      The target inode.
 * @new_em:     The extent map to insert. It must not be in a tree already.
 * @modified:   Whether the new extent map should also go into the list of
 *              modified extents (used for fast fsync tracking).
 *
 * Every extent map that intersects the range covered by @new_em is dropped
 * from the inode's extent map tree and then @new_em is added to it.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 *
 * Returns the result of add_extent_mapping(), which is never -EEXIST because
 * that case is retried.
 */
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
                                   struct extent_map *new_em,
                                   bool modified)
{
        struct extent_map_tree *em_tree = &inode->extent_tree;
        const u64 last_byte = new_em->start + new_em->len - 1;
        int ret;

        ASSERT(!extent_map_in_tree(new_em));

        /*
         * Even with the file range locked in the inode's io tree by the
         * caller, adding the new extent map can still fail with -EEXIST when
         * there are extents that partially cover the range, because two tasks
         * may be operating on different parts of the same extent. Keep
         * dropping the range and retrying until the insertion stops
         * colliding.
         * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
         * btrfs_get_extent") for an example and details.
         */
        for (;;) {
                btrfs_drop_extent_map_range(inode, new_em->start, last_byte, false);

                write_lock(&em_tree->lock);
                ret = add_extent_mapping(inode, new_em, modified);
                write_unlock(&em_tree->lock);

                if (ret != -EEXIST)
                        break;
        }

        return ret;
}

/*
 * Split off the first @pre bytes from the extent_map at [start, start + len),
 * and set the block_start for it to @new_logical.
 *
 * The remaining @len - @pre bytes get their own extent map, whose block_start
 * is the original block_start plus @pre.
 *
 * This function is used when an ordered_extent needs to be split.
 *
 * Return 0 on success, -ENOMEM if an extent map could not be allocated, or
 * -EIO if no extent map was found for the range.
 */
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
                     u64 new_logical)
{
        struct extent_map_tree *em_tree = &inode->extent_tree;
        struct extent_map *em;
        struct extent_map *split_pre = NULL;
        struct extent_map *split_mid = NULL;
        int ret = 0;
        unsigned long flags;

        ASSERT(pre != 0);
        ASSERT(pre < len);

        /* Allocate both new extent maps up front, before taking any lock. */
        split_pre = alloc_extent_map();
        if (!split_pre)
                return -ENOMEM;
        split_mid = alloc_extent_map();
        if (!split_mid) {
                ret = -ENOMEM;
                goto out_free_pre;
        }

        lock_extent(&inode->io_tree, start, start + len - 1, NULL);
        write_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
        if (!em) {
                ret = -EIO;
                goto out_unlock;
        }

        ASSERT(em->len == len);
        ASSERT(!extent_map_is_compressed(em));
        ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
        ASSERT(em->flags & EXTENT_FLAG_PINNED);
        ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
        ASSERT(!list_empty(&em->list));

        /*
         * Remember the original flags for the new extent maps and clear the
         * pinned flag on @em, since replace_extent_mapping() warns about
         * pinned extent maps.
         */
        flags = em->flags;
        em->flags &= ~EXTENT_FLAG_PINNED;

        /* First, replace the em with a new extent_map starting from em->start */
        split_pre->start = em->start;
        split_pre->len = pre;
        split_pre->orig_start = split_pre->start;
        split_pre->block_start = new_logical;
        split_pre->block_len = split_pre->len;
        split_pre->orig_block_len = split_pre->block_len;
        split_pre->ram_bytes = split_pre->len;
        split_pre->flags = flags;
        split_pre->generation = em->generation;

        replace_extent_mapping(inode, em, split_pre, 1);

        /*
         * Now we only have an extent_map at:
         *     [em->start, em->start + pre)
         */

        /* Insert the middle extent_map. */
        split_mid->start = em->start + pre;
        split_mid->len = em->len - pre;
        split_mid->orig_start = split_mid->start;
        split_mid->block_start = em->block_start + pre;
        split_mid->block_len = split_mid->len;
        split_mid->orig_block_len = split_mid->block_len;
        split_mid->ram_bytes = split_mid->len;
        split_mid->flags = flags;
        split_mid->generation = em->generation;
        /* NOTE(review): the return value of add_extent_mapping() is ignored. */
        add_extent_mapping(inode, split_mid, 1);

        /* Once for us */
        free_extent_map(em);
        /* Once for the tree */
        free_extent_map(em);

out_unlock:
        write_unlock(&em_tree->lock);
        unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
        free_extent_map(split_mid);
out_free_pre:
        free_extent_map(split_pre);
        return ret;
}

/*
 * Remove the extent maps of @inode that are not pinned.
 *
 * @inode:      The inode whose extent map tree is scanned.
 * @scanned:    Running count of visited extent maps, incremented for every
 *              extent map looked at (removed or not).
 * @nr_to_scan: The scan stops once *@scanned reaches this value.
 *
 * Return the number of extent maps removed, which is 0 if the inode's mmap
 * lock could not be taken without blocking.
 */
static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
{
        const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
        struct extent_map_tree *tree = &inode->extent_tree;
        long nr_dropped = 0;
        struct rb_node *node;

        /*
         * Take the mmap lock so that we serialize with the inode logging phase
         * of fsync because we may need to set the full sync flag on the inode,
         * in case we have to remove extent maps in the tree's list of modified
         * extents. If we set the full sync flag in the inode while an fsync is
         * in progress, we may risk missing new extents because before the flag
         * is set, fsync decides to only wait for writeback to complete and then
         * during inode logging it sees the flag set and uses the subvolume tree
         * to find new extents, which may not be there yet because ordered
         * extents haven't completed yet.
         *
         * We also do a try lock because otherwise we could deadlock. This is
         * because the shrinker for this filesystem may be invoked while we are
         * in a path that is holding the mmap lock in write mode. For example in
         * a reflink operation while COWing an extent buffer, when allocating
         * pages for a new extent buffer and under memory pressure, the shrinker
         * may be invoked, and therefore we would deadlock by attempting to read
         * lock the mmap lock while we are holding already a write lock on it.
         */
        if (!down_read_trylock(&inode->i_mmap_lock))
                return 0;

        write_lock(&tree->lock);
        node = rb_first_cached(&tree->map);
        while (node) {
                struct extent_map *em;

                em = rb_entry(node, struct extent_map, rb_node);
                /* Advance before @em is possibly removed from the tree. */
                node = rb_next(node);
                (*scanned)++;

                if (em->flags & EXTENT_FLAG_PINNED)
                        goto next;

                /*
                 * If the extent map is in the list of modified extents (new)
                 * and its generation is the same (or is greater than) the
                 * current fs generation, it means it was not yet persisted so
                 * we have to set the full sync flag so that the next fsync
                 * will not miss it.
                 */
                if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
                        btrfs_set_inode_full_sync(inode);

                remove_extent_mapping(inode, em);
                trace_btrfs_extent_map_shrinker_remove_em(inode, em);
                /* Drop the reference for the tree. */
                free_extent_map(em);
                nr_dropped++;
next:
                if (*scanned >= nr_to_scan)
                        break;

                /*
                 * Restart if we had to reschedule, and any extent maps that were
                 * pinned before may have become unpinned after we released the
                 * lock and took it again.
                 */
                if (cond_resched_rwlock_write(&tree->lock))
                        node = rb_first_cached(&tree->map);
        }
        write_unlock(&tree->lock);
        up_read(&inode->i_mmap_lock);

        return nr_dropped;
}

static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_inode *inode;
        long nr_dropped = 0;
        u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;

        inode = btrfs_find_first_inode(root, min_ino);
        while (inode) {
                nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);

                min_ino = btrfs_ino(inode) + 1;
                fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
                iput(&inode->vfs_inode);

                if (*scanned >= nr_to_scan)
                        break;

                cond_resched();
                inode = btrfs_find_first_inode(root, min_ino);
        }

        if (inode) {
                /*
                 * There are still inodes in this root or we happened to process
                 * the last one and reached the scan limit. In either case set
                 * the current root to this one, so we'll resume from the next
                 * inode if there is one or we will find out this was the last
                 * one and move to the next root.
                 */
                fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
        } else {
                /*
                 * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
                 * that when processing the next root we start from its first inode.
                 */
                fs_info->extent_map_shrinker_last_ino = 0;
                fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
        }

        return nr_dropped;
}

/*
 * Scan roots and drop extent maps until @nr_to_scan extent maps were visited
 * or there are no more roots to look at.
 *
 * The walk over fs_info->fs_roots_radix starts at the root id saved in
 * fs_info->extent_map_shrinker_last_root. If the walk started past root id 0
 * and ran out of roots, it wraps around to root id 0 exactly once.
 *
 * Return the number of extent maps dropped.
 */
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
{
        const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
        u64 next_root_id = start_root_id;
        bool cycled = false;
        long nr_dropped = 0;
        long scanned = 0;

        /* Only sum the per-CPU counter when the tracepoint is enabled. */
        if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
                s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);

                trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
        }

        while (scanned < nr_to_scan) {
                struct btrfs_root *root;
                unsigned long count;

                /* Look up the first root with an id >= next_root_id. */
                spin_lock(&fs_info->fs_roots_radix_lock);
                count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
                                               (void **)&root,
                                               (unsigned long)next_root_id, 1);
                if (count == 0) {
                        spin_unlock(&fs_info->fs_roots_radix_lock);
                        /* No more roots: wrap around once if we did not start at 0. */
                        if (start_root_id > 0 && !cycled) {
                                next_root_id = 0;
                                fs_info->extent_map_shrinker_last_root = 0;
                                fs_info->extent_map_shrinker_last_ino = 0;
                                cycled = true;
                                continue;
                        }
                        break;
                }
                next_root_id = btrfs_root_id(root) + 1;
                /* Grab the root while still holding the radix tree lock. */
                root = btrfs_grab_root(root);
                spin_unlock(&fs_info->fs_roots_radix_lock);

                if (!root)
                        continue;

                /* Only roots that pass is_fstree() are scanned. */
                if (is_fstree(btrfs_root_id(root)))
                        nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);

                btrfs_put_root(root);
        }

        if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
                s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);

                trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
        }

        return nr_dropped;
}






































    2 



    2 



































    1 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/export.h>

/*
 * Lookup table mapping an on-disk file type (FT_*) to the matching dirent
 * file type (DT_*). Indexed by the FT_* value.
 */
static const unsigned char fs_dtype_by_ftype[FT_MAX] = {
        [FT_UNKNOWN]  = DT_UNKNOWN,
        [FT_REG_FILE] = DT_REG,
        [FT_DIR]      = DT_DIR,
        [FT_CHRDEV]   = DT_CHR,
        [FT_BLKDEV]   = DT_BLK,
        [FT_FIFO]     = DT_FIFO,
        [FT_SOCK]     = DT_SOCK,
        [FT_SYMLINK]  = DT_LNK,
};

/**
 * fs_ftype_to_dtype() - fs on-disk file type to dirent type.
 * @filetype: The on-disk file type to convert.
 *
 * Translate an on-disk file type value (FT_*) into the directory entry type
 * (DT_*) through the fs_dtype_by_ftype table. Values of FT_MAX and above are
 * reported as DT_UNKNOWN.
 *
 * Context: Any context.
 * Return:
 * * DT_UNKNOWN                - Unknown type
 * * DT_FIFO                - FIFO
 * * DT_CHR                - Character device
 * * DT_DIR                - Directory
 * * DT_BLK                - Block device
 * * DT_REG                - Regular file
 * * DT_LNK                - Symbolic link
 * * DT_SOCK                - Local-domain socket
 */
unsigned char fs_ftype_to_dtype(unsigned int filetype)
{
        return filetype < FT_MAX ? fs_dtype_by_ftype[filetype] : DT_UNKNOWN;
}
EXPORT_SYMBOL_GPL(fs_ftype_to_dtype);

/*
 * Lookup table mapping a dirent file type (DT_*) to the matching on-disk
 * file type (FT_*). Indexed by the DT_* value; entries that are not listed
 * are zero, which is FT_UNKNOWN.
 */
static const unsigned char fs_ftype_by_dtype[DT_MAX] = {
        [DT_REG]  = FT_REG_FILE,
        [DT_DIR]  = FT_DIR,
        [DT_LNK]  = FT_SYMLINK,
        [DT_CHR]  = FT_CHRDEV,
        [DT_BLK]  = FT_BLKDEV,
        [DT_FIFO] = FT_FIFO,
        [DT_SOCK] = FT_SOCK,
};

/**
 * fs_umode_to_ftype() - file mode to on-disk file type.
 * @mode: The file mode to convert.
 *
 * Reduce the file mode to its dirent type with S_DT() and look the on-disk
 * file type (FT_*) up in the fs_ftype_by_dtype table.
 *
 * Context: Any context.
 * Return:
 * * FT_UNKNOWN                - Unknown type
 * * FT_REG_FILE        - Regular file
 * * FT_DIR                - Directory
 * * FT_CHRDEV                - Character device
 * * FT_BLKDEV                - Block device
 * * FT_FIFO                - FIFO
 * * FT_SOCK                - Local-domain socket
 * * FT_SYMLINK                - Symbolic link
 */
unsigned char fs_umode_to_ftype(umode_t mode)
{
        const unsigned char ftype = fs_ftype_by_dtype[S_DT(mode)];

        return ftype;
}
EXPORT_SYMBOL_GPL(fs_umode_to_ftype);

/**
 * fs_umode_to_dtype() - file mode to dirent file type.
 * @mode: The file mode to convert.
 *
 * Convert the file mode to the directory entry type (DT_*) by going through
 * the on-disk file type: fs_umode_to_ftype() followed by fs_ftype_to_dtype().
 *
 * Context: Any context.
 * Return:
 * * DT_UNKNOWN                - Unknown type
 * * DT_FIFO                - FIFO
 * * DT_CHR                - Character device
 * * DT_DIR                - Directory
 * * DT_BLK                - Block device
 * * DT_REG                - Regular file
 * * DT_LNK                - Symbolic link
 * * DT_SOCK                - Local-domain socket
 */
unsigned char fs_umode_to_dtype(umode_t mode)
{
        const unsigned char ftype = fs_umode_to_ftype(mode);

        return fs_ftype_to_dtype(ftype);
}
EXPORT_SYMBOL_GPL(fs_umode_to_dtype);




































































































































































































































































    1 




















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 */
#ifndef _LINUX_RADIX_TREE_H
#define _LINUX_RADIX_TREE_H

#include <linux/bitops.h>
#include <linux/gfp_types.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/math.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/xarray.h>
#include <linux/local_lock.h>

/* Keep unconverted code working */
#define radix_tree_root                xarray
#define radix_tree_node                xa_node

/*
 * Per-CPU list of preallocated nodes.  One instance exists per CPU as
 * radix_tree_preloads (declared below); radix_tree_preload_end() in this
 * header releases @lock.
 */
struct radix_tree_preload {
        local_lock_t lock;
        /* NOTE(review): presumably the number of nodes on the list -- confirm */
        unsigned nr;
        /* nodes->parent points to next preallocated node */
        struct radix_tree_node *nodes;
};
DECLARE_PER_CPU(struct radix_tree_preload, radix_tree_preloads);

/*
 * The bottom two bits of the slot determine how the remaining bits in the
 * slot are interpreted:
 *
 * 00 - data pointer
 * 10 - internal entry
 * x1 - value entry
 *
 * The internal entry may be a pointer to the next level in the tree, a
 * sibling entry, or an indicator that the entry in this slot has been moved
 * to another location in the tree and the lookup should be restarted.  While
 * NULL fits the 'data pointer' pattern, it means that there is no entry in
 * the tree for this index (no matter what level of the tree it is found at).
 * This means that storing a NULL entry in the tree is the same as deleting
 * the entry from the tree.
 */
#define RADIX_TREE_ENTRY_MASK                3UL
#define RADIX_TREE_INTERNAL_NODE        2UL

/*
 * Report whether @ptr carries the "internal entry" encoding, i.e. whether
 * its two low-order bits are exactly 10.
 */
static inline bool radix_tree_is_internal_node(void *ptr)
{
        unsigned long tag_bits = (unsigned long)ptr & RADIX_TREE_ENTRY_MASK;

        return tag_bits == RADIX_TREE_INTERNAL_NODE;
}

/*** radix-tree API starts here ***/

/*
 * Tree geometry, expressed through the XArray constants this API is layered
 * on.  RADIX_TREE_MAP_SIZE is 1UL << RADIX_TREE_MAP_SHIFT and
 * RADIX_TREE_MAP_MASK is that size minus one (a low-bits mask).
 */
#define RADIX_TREE_MAP_SHIFT        XA_CHUNK_SHIFT
#define RADIX_TREE_MAP_SIZE        (1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK        (RADIX_TREE_MAP_SIZE-1)

/* Tag limits are aliases of the corresponding XArray mark limits. */
#define RADIX_TREE_MAX_TAGS        XA_MAX_MARKS
#define RADIX_TREE_TAG_LONGS        XA_MARK_LONGS

/*
 * Number of bits in an unsigned long index, and that bit count divided by
 * RADIX_TREE_MAP_SHIFT, rounded up.
 * NOTE(review): presumably the latter is the maximum tree height -- confirm.
 */
#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
                                          RADIX_TREE_MAP_SHIFT))

/* The IDR tag is stored in the low bits of xa_flags */
#define ROOT_IS_IDR        ((__force gfp_t)4)
/* The top bits of xa_flags are used to store the root tags */
#define ROOT_TAG_SHIFT        (__GFP_BITS_SHIFT)

/* Static initializer for a tree called @name; forwards to XARRAY_INIT(). */
#define RADIX_TREE_INIT(name, mask)        XARRAY_INIT(name, mask)

/* Define a struct radix_tree_root variable @name and initialize it statically. */
#define RADIX_TREE(name, mask) \
        struct radix_tree_root name = RADIX_TREE_INIT(name, mask)

/* Run-time initialization of @root; forwards to xa_init_flags(). */
#define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask)

/* A tree is empty exactly when its root holds no head entry. */
static inline bool radix_tree_empty(const struct radix_tree_root *root)
{
        return !root->xa_head;
}

/**
 * struct radix_tree_iter - radix tree iterator state
 *
 * @index:        index of current slot
 * @next_index:        one beyond the last index for this chunk
 * @tags:        bit-mask for tag-iterating
 * @node:        node that contains current slot
 *
 * This radix tree iterator works in terms of "chunks" of slots.  A chunk is a
 * subinterval of slots contained within one radix tree leaf node.  It is
 * described by a pointer to its first slot and a struct radix_tree_iter
 * which holds the chunk's position in the tree and its size.  For tagged
 * iteration radix_tree_iter also holds the slots' bit-mask for one chosen
 * radix tree tag.
 */
struct radix_tree_iter {
        unsigned long        index;                /* index of the current slot */
        unsigned long        next_index;        /* one beyond the last index of this chunk */
        unsigned long        tags;                /* bit-mask consumed by tagged iteration */
        struct radix_tree_node *node;        /* node that contains the current slot */
};

/**
 * Radix-tree synchronization
 *
 * The radix-tree API requires that users provide all synchronisation (with
 * specific exceptions, noted below).
 *
 * Synchronization of access to the data items being stored in the tree, and
 * management of their lifetimes must be completely managed by API users.
 *
 * For API usage, in general,
 * - any function _modifying_ the tree or tags (inserting or deleting
 *   items, setting or clearing tags) must exclude other modifications, and
 *   exclude any functions reading the tree.
 * - any function _reading_ the tree or tags (looking up items or tags,
 *   gang lookups) must exclude modifications to the tree, but may occur
 *   concurrently with other readers.
 *
 * The notable exceptions to this rule are the following functions:
 * __radix_tree_lookup
 * radix_tree_lookup
 * radix_tree_lookup_slot
 * radix_tree_tag_get
 * radix_tree_gang_lookup
 * radix_tree_gang_lookup_tag
 * radix_tree_gang_lookup_tag_slot
 * radix_tree_tagged
 *
 * The first 7 functions are able to be called locklessly, using RCU. The
 * caller must ensure calls to these functions are made within rcu_read_lock()
 * regions. Other readers (lock-free or otherwise) and modifications may be
 * running concurrently.
 *
 * It is still required that the caller manage the synchronization and lifetimes
 * of the items. So if RCU lock-free lookups are used, typically this would mean
 * that the items have their own locks, or are amenable to lock-free access; and
 * that the items are freed by RCU (or only freed after having been deleted from
 * the radix tree *and* a synchronize_rcu() grace period).
 *
 * (Note, rcu_assign_pointer and rcu_dereference are not needed to control
 * access to data items when inserting into or looking up from the radix tree)
 *
 * Note that the value returned by radix_tree_tag_get() may not be relied upon
 * if only the RCU read lock is held.  Functions to set/clear tags and to
 * delete nodes running concurrently with it may affect its result such that
 * two consecutive reads in the same locked section may return different
 * values.  If reliability is required, modification functions must also be
 * excluded from concurrency.
 *
 * radix_tree_tagged is able to be called without locking or RCU.
 */

/**
 * radix_tree_deref_slot - dereference a slot
 * @slot: slot pointer, returned by radix_tree_lookup_slot
 *
 * For use with radix_tree_lookup_slot().  Caller must hold tree at least read
 * locked across slot lookup and dereference. Not required if write lock is
 * held (ie. items cannot be concurrently inserted).
 *
 * radix_tree_deref_retry must be used to confirm validity of the pointer if
 * only the read lock is held.
 *
 * Return: entry stored in that slot.
 */
static inline void *radix_tree_deref_slot(void __rcu **slot)
{
        /* Fetch the entry currently stored in *slot via rcu_dereference(). */
        void *entry = rcu_dereference(*slot);

        return entry;
}

/**
 * radix_tree_deref_slot_protected - dereference a slot with tree lock held
 * @slot: slot pointer, returned by radix_tree_lookup_slot
 *
 * Similar to radix_tree_deref_slot.  The caller does not hold the RCU read
 * lock but it must hold the tree lock to prevent parallel updates.
 *
 * Return: entry stored in that slot.
 */
static inline void *radix_tree_deref_slot_protected(void __rcu **slot,
                                                        spinlock_t *treelock)
{
        void *entry;

        /* Holding @treelock is the condition that justifies the access. */
        entry = rcu_dereference_protected(*slot, lockdep_is_held(treelock));
        return entry;
}

/**
 * radix_tree_deref_retry        - check radix_tree_deref_slot
 * @arg:        pointer returned by radix_tree_deref_slot
 * Returns:        0 if retry is not required, otherwise retry is required
 *
 * radix_tree_deref_retry must be used with radix_tree_deref_slot.
 */
static inline int radix_tree_deref_retry(void *arg)
{
        return unlikely(radix_tree_is_internal_node(arg));
}

/**
 * radix_tree_exception        - radix_tree_deref_slot returned either exception?
 * @arg:        value returned by radix_tree_deref_slot
 * Returns:        0 if well-aligned pointer, non-0 if either kind of exception.
 */
static inline int radix_tree_exception(void *arg)
{
        /* Any set bit among the two low-order bits marks a non-pointer entry. */
        unsigned long low_bits = (unsigned long)arg & RADIX_TREE_ENTRY_MASK;

        return unlikely(low_bits);
}

/*
 * Core insert / lookup / replace / delete entry points.  Only the prototypes
 * are declared in this header; see the synchronization comment above for
 * which of the lookup functions may be called under rcu_read_lock() alone.
 */
int radix_tree_insert(struct radix_tree_root *, unsigned long index,
                        void *);
void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
                          struct radix_tree_node **nodep, void __rcu ***slotp);
void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
                                        unsigned long index);
/* Slot replacement, addressed by node, by iterator, or by bare slot pointer. */
void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
                          void __rcu **slot, void *entry);
void radix_tree_iter_replace(struct radix_tree_root *,
                const struct radix_tree_iter *, void __rcu **slot, void *entry);
void radix_tree_replace_slot(struct radix_tree_root *,
                             void __rcu **slot, void *entry);
/* Deletion, addressed by iterator position or by index. */
void radix_tree_iter_delete(struct radix_tree_root *,
                        struct radix_tree_iter *iter, void __rcu **slot);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
/* Gang lookup: fills @results with at most @max_items items from @first_index on. */
unsigned int radix_tree_gang_lookup(const struct radix_tree_root *,
                        void **results, unsigned long first_index,
                        unsigned int max_items);
/*
 * Preload helpers and one-time initialization (prototypes only).
 * NOTE(review): presumably a successful radix_tree_preload() or
 * radix_tree_maybe_preload() returns with radix_tree_preloads.lock held, to
 * be dropped by radix_tree_preload_end() -- confirm against the
 * implementation.
 */
int radix_tree_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload(gfp_t gfp_mask);
void radix_tree_init(void);
/*
 * Tag manipulation and tagged lookups (prototypes only).  @tag is a tag
 * index; RADIX_TREE_MAX_TAGS is defined earlier in this header.
 */
void *radix_tree_tag_set(struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
void *radix_tree_tag_clear(struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
int radix_tree_tag_get(const struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
void radix_tree_iter_tag_clear(struct radix_tree_root *,
                const struct radix_tree_iter *iter, unsigned int tag);
/* Tagged gang lookups: return items, or slot pointers, into @results. */
unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *,
                void **results, unsigned long first_index,
                unsigned int max_items, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *,
                void __rcu ***results, unsigned long first_index,
                unsigned int max_items, unsigned int tag);
/* Documented above as callable without locking or RCU. */
int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);

/*
 * radix_tree_preload_end - release the per-CPU preload lock
 *
 * Drops radix_tree_preloads.lock with local_unlock().
 * NOTE(review): presumably pairs with a successful radix_tree_preload() or
 * radix_tree_maybe_preload() -- confirm against the implementation.
 */
static inline void radix_tree_preload_end(void)
{
        local_unlock(&radix_tree_preloads.lock);
}

/*
 * Prototype only; the definition is not in this header.
 * NOTE(review): presumably locates a free slot for an IDR allocation no
 * higher than @max, allocating with @gfp and describing the position in
 * @iter -- confirm against the implementation.
 */
void __rcu **idr_get_free(struct radix_tree_root *root,
                              struct radix_tree_iter *iter, gfp_t gfp,
                              unsigned long max);

/*
 * Bits of the @flags argument taken by radix_tree_next_chunk() and
 * radix_tree_next_slot().  The tag index is OR-ed into the low nybble.
 */
enum {
        RADIX_TREE_ITER_TAG_MASK = 0x0f,        /* tag index in lower nybble */
        RADIX_TREE_ITER_TAGGED   = 0x10,        /* lookup tagged slots */
        RADIX_TREE_ITER_CONTIG   = 0x20,        /* stop at first hole */
};

/**
 * radix_tree_iter_init - initialize radix tree iterator
 *
 * @iter:        pointer to iterator state
 * @start:        iteration starting index
 * Returns:        NULL
 */
static __always_inline void __rcu **
radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
{
        /*
         * ->tags is deliberately not written here: radix_tree_next_chunk()
         * fills it in after a successful tagged chunk lookup, and nobody
         * reads it after a failed or non-tagged lookup.
         *
         * ->index starts at zero so that the next_index overflow protection
         * is bypassed; the comment in radix_tree_next_chunk() has the
         * details.
         */
        iter->next_index = start;
        iter->index = 0;

        return NULL;
}

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:        radix tree root
 * @iter:        iterator state
 * @flags:        RADIX_TREE_ITER_* flags and tag index
 * Returns:        pointer to the chunk's first slot, or NULL if there are no
 *                more chunks left
 *
 * This function looks up the next chunk in the radix tree starting from
 * @iter->next_index.  It returns a pointer to the chunk's first slot.
 * It also fills @iter with data about the chunk: its position in the tree
 * (index), its end (next_index), and a bit mask for tagged iteration (tags).
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *,
                             struct radix_tree_iter *iter, unsigned flags);

/**
 * radix_tree_iter_lookup - look up an index in the radix tree
 * @root: radix tree root
 * @iter: iterator state
 * @index: key to look up
 *
 * If @index is present in the radix tree, this function returns the slot
 * containing it and updates @iter to describe the entry.  If @index is not
 * present, it returns NULL.
 */
static inline void __rcu **
radix_tree_iter_lookup(const struct radix_tree_root *root,
                        struct radix_tree_iter *iter, unsigned long index)
{
        void __rcu **found;

        /* Aim the iterator at @index, then request a chunk that stops at holes. */
        radix_tree_iter_init(iter, index);
        found = radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG);

        return found;
}

/**
 * radix_tree_iter_retry - retry this chunk of the iteration
 * @iter:        iterator state
 *
 * If we iterate over a tree protected only by the RCU lock, a race
 * against deletion or creation may result in seeing a slot for which
 * radix_tree_deref_retry() returns true.  If so, call this function
 * and continue the iteration.
 */
static inline __must_check
void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter)
{
        /* Discard the pending tag bits and restart from the current index. */
        iter->tags = 0;
        iter->next_index = iter->index;

        return NULL;
}

/* Compute the index that lies @slots positions after the iterator's current index. */
static inline unsigned long
__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
{
        unsigned long advanced = iter->index;

        advanced += slots;
        return advanced;
}

/**
 * radix_tree_iter_resume - resume iterating when the chunk may be invalid
 * @slot: pointer to current slot
 * @iter: iterator state
 * Returns: New slot pointer
 *
 * If the iterator needs to release then reacquire a lock, the chunk may
 * have been invalidated by an insertion or deletion.  Call this function
 * before releasing the lock to continue the iteration from the next index.
 *
 * The declaration is marked __must_check: the caller is expected to use the
 * returned slot pointer rather than discard it.
 */
void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot,
                                        struct radix_tree_iter *iter);

/**
 * radix_tree_chunk_size - get current chunk size
 *
 * @iter:        pointer to radix tree iterator
 * Returns:        current chunk size
 */
static __always_inline long
radix_tree_chunk_size(struct radix_tree_iter *iter)
{
        unsigned long first = iter->index;
        unsigned long end = iter->next_index;

        /* The chunk spans [first, end); the difference is returned as a long. */
        return end - first;
}

/**
 * radix_tree_next_slot - find next slot in chunk
 *
 * @slot:        pointer to current slot
 * @iter:        pointer to iterator state
 * @flags:        RADIX_TREE_ITER_*, should be constant
 * Returns:        pointer to next slot, or NULL if there are no more left
 *
 * This function updates @iter->index in the case of a successful lookup.
 * For tagged lookup it also eats @iter->tags.
 *
 * There are several cases where 'slot' can be passed in as NULL to this
 * function.  These cases result from the use of radix_tree_iter_resume() or
 * radix_tree_iter_retry().  In these cases we don't end up dereferencing
 * 'slot' because either:
 * a) we are doing tagged iteration and iter->tags has been set to 0, or
 * b) we are doing non-tagged iteration, and iter->index and iter->next_index
 *    have been set up so that radix_tree_chunk_size() returns 1 or 0.
 */
static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
                                struct radix_tree_iter *iter, unsigned flags)
{
        if (flags & RADIX_TREE_ITER_TAGGED) {
                /* Drop the bit of the current slot; bit 0 now describes slot + 1. */
                iter->tags >>= 1;
                if (unlikely(!iter->tags))
                        return NULL;
                /* Common case: the immediately following slot is tagged. */
                if (likely(iter->tags & 1ul)) {
                        iter->index = __radix_tree_iter_add(iter, 1);
                        slot++;
                        goto found;
                }
                /*
                 * The next slot is untagged.  Unless the caller asked to stop
                 * at the first hole, skip ahead to the next set bit.
                 */
                if (!(flags & RADIX_TREE_ITER_CONTIG)) {
                        unsigned offset = __ffs(iter->tags);

                        /*
                         * Shift by the pre-increment value so the found bit
                         * lands in bit 0; the incremented offset is the
                         * distance from the current slot to the found one.
                         */
                        iter->tags >>= offset++;
                        iter->index = __radix_tree_iter_add(iter, offset);
                        slot += offset;
                        goto found;
                }
        } else {
                long count = radix_tree_chunk_size(iter);

                /* Walk the slots remaining in this chunk after the current one. */
                while (--count > 0) {
                        slot++;
                        iter->index = __radix_tree_iter_add(iter, 1);

                        if (likely(*slot))
                                goto found;
                        if (flags & RADIX_TREE_ITER_CONTIG) {
                                /* forbid switching to the next chunk */
                                iter->next_index = 0;
                                break;
                        }
                }
        }
        /* Nothing more in this chunk (or a hole ended a contiguous walk). */
        return NULL;

 found:
        return slot;
}

/**
 * radix_tree_for_each_slot - iterate over non-empty slots
 *
 * @slot:        the void** variable for pointer to slot
 * @root:        the struct radix_tree_root pointer
 * @iter:        the struct radix_tree_iter pointer
 * @start:        iteration starting index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 *
 * Expands to a for statement.  @slot starts out as the NULL returned by
 * radix_tree_iter_init(); whenever @slot is NULL a new chunk is fetched with
 * radix_tree_next_chunk(), otherwise the walk advances within the chunk via
 * radix_tree_next_slot().  The loop ends when radix_tree_next_chunk()
 * returns NULL.
 */
#define radix_tree_for_each_slot(slot, root, iter, start)                \
        for (slot = radix_tree_iter_init(iter, start) ;                        \
             slot || (slot = radix_tree_next_chunk(root, iter, 0)) ;        \
             slot = radix_tree_next_slot(slot, iter, 0))

/**
 * radix_tree_for_each_tagged - iterate over tagged slots
 *
 * @slot:        the void** variable for pointer to slot
 * @root:        the struct radix_tree_root pointer
 * @iter:        the struct radix_tree_iter pointer
 * @start:        iteration starting index
 * @tag:        tag index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 *
 * Expands to a for statement.  A new chunk is fetched with
 * radix_tree_next_chunk() whenever @slot is NULL; otherwise the walk advances
 * within the chunk via radix_tree_next_slot().  Both calls receive
 * RADIX_TREE_ITER_TAGGED OR-ed with @tag.  @tag is parenthesized in the
 * expansion so that an argument built from lower-precedence operators (for
 * example a conditional expression) is combined with the flag as a whole.
 */
#define radix_tree_for_each_tagged(slot, root, iter, start, tag)        \
        for (slot = radix_tree_iter_init(iter, start) ;                        \
             slot || (slot = radix_tree_next_chunk(root, iter,                \
                              RADIX_TREE_ITER_TAGGED | (tag))) ;        \
             slot = radix_tree_next_slot(slot, iter,                        \
                                RADIX_TREE_ITER_TAGGED | (tag)))

#endif /* _LINUX_RADIX_TREE_H */












































































































































































    6 























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
// SPDX-License-Identifier: GPL-2.0
/*
  File: fs/ext4/xattr.h

  On-disk format of extended attributes for the ext4 filesystem.

  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/

#include <linux/xattr.h>

/* Magic value in attribute blocks */
#define EXT4_XATTR_MAGIC                0xEA020000

/* Maximum number of references to one attribute block */
#define EXT4_XATTR_REFCOUNT_MAX                1024

/*
 * Name indexes.
 * NOTE(review): presumably the values stored in
 * ext4_xattr_entry.e_name_index -- confirm against xattr.c.
 */
#define EXT4_XATTR_INDEX_USER                        1
#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS        2
#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT        3
#define EXT4_XATTR_INDEX_TRUSTED                4
#define        EXT4_XATTR_INDEX_LUSTRE                        5
#define EXT4_XATTR_INDEX_SECURITY                6
#define EXT4_XATTR_INDEX_SYSTEM                        7
#define EXT4_XATTR_INDEX_RICHACL                8
#define EXT4_XATTR_INDEX_ENCRYPTION                9
#define EXT4_XATTR_INDEX_HURD                        10 /* Reserved for Hurd */

/*
 * Header at the start of an xattr block; BHDR() in this file casts a buffer
 * head's b_data to this type.  This file describes the on-disk format, so
 * the field order and sizes must not change.
 */
struct ext4_xattr_header {
        __le32        h_magic;        /* magic number for identification */
        __le32        h_refcount;        /* reference count */
        __le32        h_blocks;        /* number of disk blocks used */
        __le32        h_hash;                /* hash value of all attributes */
        __le32        h_checksum;        /* crc32c(uuid+id+xattrblock) */
                                /* id = inum if refcount=1, blknum otherwise */
        __u32        h_reserved[3];        /* zero right now */
};

/*
 * Header of the xattr area kept inside the inode body; IHDR() in this file
 * computes its address and IFIRST() yields the entry right after it.
 */
struct ext4_xattr_ibody_header {
        __le32        h_magic;        /* magic number for identification */
};

/*
 * One attribute entry.  The name follows the fixed fields as a flexible
 * array; EXT4_XATTR_NEXT() steps to the following entry by the padded length
 * EXT4_XATTR_LEN(e_name_len), and IS_LAST_ENTRY() tests whether the first
 * 32 bits of an entry are zero.
 */
struct ext4_xattr_entry {
        __u8        e_name_len;        /* length of name */
        __u8        e_name_index;        /* attribute name index */
        __le16        e_value_offs;        /* offset in disk block of value */
        __le32        e_value_inum;        /* inode in which the value is stored */
        __le32        e_value_size;        /* size of attribute value */
        __le32        e_hash;                /* hash value of name and value */
        char        e_name[];        /* attribute name */
};

/*
 * Padding granularity: EXT4_XATTR_PAD is 1 << EXT4_XATTR_PAD_BITS bytes and
 * EXT4_XATTR_ROUND is the matching low-bits mask (PAD - 1).
 */
#define EXT4_XATTR_PAD_BITS                2
#define EXT4_XATTR_PAD                (1<<EXT4_XATTR_PAD_BITS)
#define EXT4_XATTR_ROUND                (EXT4_XATTR_PAD-1)
/* Size of the fixed entry fields plus @name_len name bytes, rounded up to the pad. */
#define EXT4_XATTR_LEN(name_len) \
        (((name_len) + EXT4_XATTR_ROUND + \
        sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
/* Entry located EXT4_XATTR_LEN(@entry->e_name_len) bytes after @entry. */
#define EXT4_XATTR_NEXT(entry) \
        ((struct ext4_xattr_entry *)( \
         (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
/* @size rounded up to the next multiple of EXT4_XATTR_PAD. */
#define EXT4_XATTR_SIZE(size) \
        (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)

/*
 * IHDR - address of the in-inode xattr header.
 *
 * The header sits EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize bytes past the
 * start of @raw_inode, where i_extra_isize is read from EXT4_I(@inode).
 * @raw_inode is parenthesized before the cast so that an expression argument
 * (e.g. "p + 1") is evaluated as a whole in its own pointer type before
 * being converted to a byte address.
 */
#define IHDR(inode, raw_inode) \
        ((struct ext4_xattr_ibody_header *) \
                ((void *)(raw_inode) + \
                EXT4_GOOD_OLD_INODE_SIZE + \
                EXT4_I(inode)->i_extra_isize))
/* IFIRST - first entry, located immediately after the header @hdr. */
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))

/*
 * XATTR_SIZE_MAX is currently 64k, but for the purposes of checking
 * for file system consistency errors, we use a somewhat bigger value.
 * This allows XATTR_SIZE_MAX to grow in the future, but by using this
 * instead of INT_MAX for certain consistency checks, we don't need to
 * worry about arithmetic overflows.  (Actually XATTR_SIZE_MAX is
 * defined in include/uapi/linux/limits.h, so changing it is not
 * going to be trivial....)
 */
#define EXT4_XATTR_SIZE_MAX (1 << 24)

/*
 * The minimum size of an EA value at which it starts being stored in an
 * external inode, for a block of @b bytes:
 *   size of block - size of header - size of 1 entry - 4 null bytes
 * The "1 entry" term is EXT4_XATTR_LEN(3), i.e. an entry sized for a
 * three-byte name.
 */
#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b)                                        \
        ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)

/* View the data of buffer head @bh as an xattr block header. */
#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
/* Cast an arbitrary pointer to an xattr entry pointer. */
#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
/* First entry of an xattr block: the address right after the block header. */
#define BFIRST(bh) ENTRY(BHDR(bh)+1)
/* True when the first 32 bits at @entry are zero. */
#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)

/*
 * Sentinel pointer with all bits set.
 * NOTE(review): presumably passed as a value pointer to request a
 * zero-filled value -- confirm against xattr.c.
 */
#define EXT4_ZERO_XATTR_VALUE ((void *)-1)

/*
 * If we want to add an xattr to the inode, we should make sure that
 * i_extra_isize is not 0 and that the inode size is not less than
 * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + header + pad, where "header" is
 * sizeof(struct ext4_xattr_ibody_header) and "pad" is EXT4_XATTR_PAD.
 *   EXT4_GOOD_OLD_INODE_SIZE   extra_isize header   entry   pad  data
 * |--------------------------|------------|------|---------|---|-------|
 */
#define EXT4_INODE_HAS_XATTR_SPACE(inode)                                \
        ((EXT4_I(inode)->i_extra_isize != 0) &&                                \
         (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize +        \
          sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <=        \
          EXT4_INODE_SIZE((inode)->i_sb)))

/*
 * Description of one attribute, passed to ext4_xattr_ibody_find() and
 * ext4_xattr_ibody_set() declared below.
 */
struct ext4_xattr_info {
        const char *name;
        const void *value;
        size_t value_len;        /* presumably the length of @value in bytes -- confirm */
        int name_index;                /* presumably an EXT4_XATTR_INDEX_* value -- confirm */
        int in_inode;                /* NOTE(review): looks like "store value in an EA inode" -- confirm */
};

/*
 * Search state over a list of entries.
 * NOTE(review): field meanings below are inferred from names only; confirm
 * against the users in xattr.c.
 */
struct ext4_xattr_search {
        struct ext4_xattr_entry *first;        /* presumably the first entry of the list */
        void *base;
        void *end;
        struct ext4_xattr_entry *here;        /* presumably the entry found / insertion point */
        int not_found;
};

/*
 * Search state for the in-inode xattr area: the generic search state plus
 * an inode location.  Taken by ext4_xattr_ibody_find() and
 * ext4_xattr_ibody_set() declared below.
 */
struct ext4_xattr_ibody_find {
        struct ext4_xattr_search s;
        struct ext4_iloc iloc;
};

/*
 * Counted array of inode pointers stored in a flexible array member.
 * Filled through ext4_xattr_delete_inode() and released with
 * ext4_xattr_inode_array_free(), both declared below.
 */
struct ext4_xattr_inode_array {
        unsigned int count;                /* # of used items in the array */
        struct inode *inodes[];
};

/* xattr handlers defined elsewhere, one per supported attribute namespace. */
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
extern const struct xattr_handler ext4_xattr_hurd_handler;

/*
 * NOTE(review): presumably the attribute name used together with
 * EXT4_XATTR_INDEX_ENCRYPTION for the encryption context -- confirm.
 */
#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"

/*
 * The EXT4_STATE_NO_EXPAND flag is overloaded and used for two purposes.
 * The first is to signal that the inline xattrs and data are
 * taking up so much space that we might as well not keep trying to
 * expand it.  The second is that xattr_sem is taken for writing, so
 * we shouldn't try to recurse into the inode expansion.  For this
 * second case, we need to make sure that we save and restore the
 * NO_EXPAND state flag appropriately.
 */
static inline void ext4_write_lock_xattr(struct inode *inode, int *save)
{
        int had_no_expand;

        down_write(&EXT4_I(inode)->xattr_sem);

        /* Record the flag as the caller left it, then force it on. */
        had_no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
        *save = had_no_expand;
        ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
}

/*
 * Non-blocking variant of ext4_write_lock_xattr(): returns 0 without
 * touching *save when xattr_sem could not be taken, 1 once it is held.
 */
static inline int ext4_write_trylock_xattr(struct inode *inode, int *save)
{
        if (!down_write_trylock(&EXT4_I(inode)->xattr_sem))
                return 0;

        /* Lock acquired: save the previous flag state and force it on. */
        *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
        ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);

        return 1;
}

/*
 * Release xattr_sem, first clearing EXT4_STATE_NO_EXPAND if *save says the
 * flag was not set when the lock was taken.
 */
static inline void ext4_write_unlock_xattr(struct inode *inode, int *save)
{
        if (!*save)
                ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);

        up_write(&EXT4_I(inode)->xattr_sem);
}

/*
 * Listing, reading and writing of extended attributes (prototypes only).
 * NOTE(review): the int argument following the inode is presumably an
 * EXT4_XATTR_INDEX_* name index, and the trailing int of the set functions
 * presumably carries XATTR_* flags -- confirm against xattr.c.
 */
extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);

extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
/* Journal credit estimation; the public variant reports through *@credits. */
extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
                                  bool is_create, int *credits);
extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
                                struct buffer_head *block_bh, size_t value_len,
                                bool is_create);

/*
 * Inode teardown helpers (prototypes only).  ext4_xattr_delete_inode() takes
 * a pointer to an ext4_xattr_inode_array pointer, and
 * ext4_xattr_inode_array_free() takes such an array.
 */
extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
                                   struct ext4_xattr_inode_array **array,
                                   int extra_credits);
extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);

/* NOTE(review): presumably grows i_extra_isize to @new_extra_isize -- confirm. */
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
                            struct ext4_inode *raw_inode, handle_t *handle);
extern void ext4_evict_ea_inode(struct inode *inode);

/* Table of xattr handler pointers, defined elsewhere. */
extern const struct xattr_handler * const ext4_xattr_handlers[];

/*
 * Operations on the xattr area inside the inode body (prototypes only).
 * The find and set functions work on a struct ext4_xattr_info together with
 * a struct ext4_xattr_ibody_find search state.
 */
extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
                                 struct ext4_xattr_ibody_find *is);
extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
                                const char *name,
                                void *buffer, size_t buffer_size);
extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
                                struct ext4_xattr_info *i,
                                struct ext4_xattr_ibody_find *is);

/* Creation and destruction of the mb_cache used for xattrs. */
extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);

/*
 * ext4_init_security() is provided externally when CONFIG_EXT4_FS_SECURITY
 * is set; otherwise it is an inline stub that does nothing and returns 0.
 */
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
                              struct inode *dir, const struct qstr *qstr);
#else
static inline int ext4_init_security(handle_t *handle, struct inode *inode,
                                     struct inode *dir, const struct qstr *qstr)
{
        return 0;
}
#endif

/*
 * ext4_xattr_inode_set_class() is only a real function when CONFIG_LOCKDEP
 * is set; without lockdep it is an empty inline stub.
 */
#ifdef CONFIG_LOCKDEP
extern void ext4_xattr_inode_set_class(struct inode *ea_inode);
#else
static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { }
#endif

/*
 * Prototype only; the result is reported through *@usage.
 * NOTE(review): presumably the quota inode usage including EA inodes --
 * confirm against xattr.c.
 */
extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage);































































    1 






























































































































    4 







    5 
    1 

    5 










    4 


    4 

    4 

    4 
    4 





    5 




























    4 



    4 
    5 








    5 














    1 


    1 










    1 






















































































    1 





























    1 
























    1 
























    1 


































    1 



































    1 














    1 

    1 





    1 

    1 

    1 





















    1 











    1 










    1 
    1 










    1 







    1 















    1 










    1 



























    1 
    1 

















    1 





























    1 
















    1 









    3 
    4 

    3 

    1 

    5 

    3 

    3 




















    5 



























    5 












    4 
    5 








    5 






    4 













































































































































































































































































































































































































































































































































































































    4 




    4 
    4 

    5 



    5 




    3 


    1 
    1 






    5 

    1 

    1 





















    4 









    4 


















    5 














    4 





















































































    4 












    5 






































































































    5 


    5 



    1 






















    4 


























































    1 































    1 





    1 

























































    1 

    4 



    5 









    1 






    1 

















































    4 
















    3 
    5 

















    5 










    1 




    5 











    5 





    4 




    5 







    1 






    1 
    1 













    1 








    4 












    2 


    5 
    3 








    5 

    5 









    1 



    3 
    5 







    5 







    5 





























    1 








    3 

    4 

    5 




    5 


    5 




















































































































































































































































































































    2 
    2 


















































































































































































































































































































































































































































































































































    1 









    1 















    1 



























































































































































    1 








    1 













































































































































































































    1 








    1 


























































































































































































































































































































































































































































































































































































































































































































    1 















    1 









    1 


    1 





















    1 
















    1 














    1 






































    1 














    1 
































































































































































    1 











    1 




    1 






    1 























    1 














































    1 





















    1 


















































































































































































































    1 















    1 

















    1 













































    1 





    1 



































































































    1 




















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"

/* Slab cache used by btrfs_alloc_path() and btrfs_free_path(). */
static struct kmem_cache *btrfs_path_cachep;

/* Forward declarations for file-local (static) helpers. */
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      const struct btrfs_key *ins_key, struct btrfs_path *path,
                      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
                          struct extent_buffer *dst,
                          struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *dst_buf,
                              struct extent_buffer *src_buf);

/*
 * Table of checksum algorithms, indexed by BTRFS_CSUM_TYPE_* value.
 *
 * @size:   checksum size, as returned by btrfs_csum_type_size()
 * @name:   algorithm name, as returned by btrfs_super_csum_name()
 * @driver: optional driver name; when left empty, btrfs_super_csum_driver()
 *          falls back to @name
 */
static const struct btrfs_csums {
        u16                size;
        const char        name[10];
        const char        driver[12];
} btrfs_csums[] = {
        [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
        [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
        [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
        /* The only entry that sets an explicit driver name. */
        [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
                                     .driver = "blake2b-256" },
};

/*
 * Item data in a leaf is packed from the end of the leaf towards the front.
 *
 * Return the offset of the last item (slot nritems - 1), which is where the
 * data area currently stops.  For a leaf without any items, return
 * BTRFS_LEAF_DATA_SIZE() of the owning filesystem instead.
 */
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
        const u32 nritems = btrfs_header_nritems(leaf);

        if (nritems > 0)
                return btrfs_item_offset(leaf, nritems - 1);

        /* Empty leaf: nothing has been carved out of the data area yet. */
        return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
}

/*
 * Move item data around inside a single @leaf.
 *
 * @leaf:        leaf whose data is being moved
 * @dst_offset:  item data offset we're moving to
 * @src_offset:  item data offset we're moving from
 * @len:         number of bytes to move
 *
 * Item data offsets are relative to the end of the leaf header, while
 * memmove_extent_buffer() wants offsets into the extent buffer.  Add the
 * offset of the first item slot to both sides so callers do not have to.
 * As this is a memmove, the two ranges may overlap.
 */
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
                                     unsigned long dst_offset,
                                     unsigned long src_offset,
                                     unsigned long len)
{
        /* Offset of item slot 0, i.e. where the header ends. */
        const unsigned long data_base = btrfs_item_nr_offset(leaf, 0);

        memmove_extent_buffer(leaf, data_base + dst_offset,
                              data_base + src_offset, len);
}

/*
 * Copy item data from the leaf @src into the leaf @dst.
 *
 * @dst:         destination leaf that we're copying into
 * @src:         source leaf that we're copying from
 * @dst_offset:  item data offset in @dst we're copying to
 * @src_offset:  item data offset in @src we're copying from
 * @len:         number of bytes to copy
 *
 * Item data offsets are relative to the end of the leaf header, while
 * copy_extent_buffer() wants offsets into the extent buffers.  Add the offset
 * of the first item slot of each leaf so callers do not have to.
 */
static inline void copy_leaf_data(const struct extent_buffer *dst,
                                  const struct extent_buffer *src,
                                  unsigned long dst_offset,
                                  unsigned long src_offset, unsigned long len)
{
        /* Offset of item slot 0 in each buffer, i.e. where its header ends. */
        const unsigned long dst_base = btrfs_item_nr_offset(dst, 0);
        const unsigned long src_base = btrfs_item_nr_offset(src, 0);

        copy_extent_buffer(dst, src, dst_base + dst_offset,
                           src_base + src_offset, len);
}

/*
 * Move item headers around inside a single @leaf.
 *
 * @leaf:      leaf whose items are being moved
 * @dst_item:  the item nr we're moving to
 * @src_item:  the item nr we're moving from
 * @nr_items:  the number of items to move
 *
 * Translates the item numbers into extent buffer offsets and hands them to
 * memmove_extent_buffer(), moving @nr_items struct btrfs_item entries.
 */
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
                                      int dst_item, int src_item, int nr_items)
{
        const unsigned long to = btrfs_item_nr_offset(leaf, dst_item);
        const unsigned long from = btrfs_item_nr_offset(leaf, src_item);

        memmove_extent_buffer(leaf, to, from,
                              nr_items * sizeof(struct btrfs_item));
}

/*
 * Copy item headers from the leaf @src into the leaf @dst.
 *
 * @dst:       destination leaf for the items
 * @src:       source leaf for the items
 * @dst_item:  the item nr in @dst we're copying into
 * @src_item:  the item nr in @src we're copying from
 * @nr_items:  the number of items to copy
 *
 * Translates the item numbers into extent buffer offsets and hands them to
 * copy_extent_buffer(), copying @nr_items struct btrfs_item entries.
 */
static inline void copy_leaf_items(const struct extent_buffer *dst,
                                   const struct extent_buffer *src,
                                   int dst_item, int src_item, int nr_items)
{
        const unsigned long to = btrfs_item_nr_offset(dst, dst_item);
        const unsigned long from = btrfs_item_nr_offset(src, src_item);

        copy_extent_buffer(dst, src, to, from,
                           nr_items * sizeof(struct btrfs_item));
}

/*
 * Return the size stored in the btrfs_csums table for checksum @type.  The
 * index is used as is, no range check is done here.
 *
 * This exists for btrfs-progs usages.
 */
u16 btrfs_csum_type_size(u16 type)
{
        const u16 size = btrfs_csums[type].size;

        return size;
}

/* Return the checksum size for the checksum type recorded in super block @s. */
int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
        /* The csum type is validated at mount time, so it is used directly. */
        return btrfs_csum_type_size(btrfs_super_csum_type(s));
}

/* Return the name stored in the btrfs_csums table for @csum_type. */
const char *btrfs_super_csum_name(u16 csum_type)
{
        /* csum type is validated at mount time, so no range check here */
        const char *name = btrfs_csums[csum_type].name;

        return name;
}

/*
 * Return the driver name of @csum_type if one is defined (non-empty string),
 * otherwise fall back to the checksum name, which is then also a valid driver
 * name.
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
        /* csum type is validated at mount time */
        if (btrfs_csums[csum_type].driver[0])
                return btrfs_csums[csum_type].driver;

        return btrfs_csums[csum_type].name;
}

/* Return the number of entries in the btrfs_csums table. */
size_t __attribute_const__ btrfs_get_num_csums(void)
{
        const size_t nr_csums = ARRAY_SIZE(btrfs_csums);

        return nr_csums;
}

/*
 * Allocate a zero-initialized path from btrfs_path_cachep using GFP_NOFS.
 *
 * Returns the result of kmem_cache_zalloc() unchanged, so callers must be
 * prepared for an allocation failure.
 */
struct btrfs_path *btrfs_alloc_path(void)
{
        struct btrfs_path *path;

        might_sleep();
        path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);

        return path;
}

/*
 * Release everything held by path @p and give it back to btrfs_path_cachep.
 * A NULL @p is accepted and ignored.
 */
void btrfs_free_path(struct btrfs_path *p)
{
        if (p) {
                btrfs_release_path(p);
                kmem_cache_free(btrfs_path_cachep, p);
        }
}

/*
 * Releasing a path drops the references on the extent buffers in the path and
 * drops any locks held through it.  All slots are reset to 0 as well.
 *
 * It is safe to call this on a path that holds no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
        int level;

        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
                p->slots[level] = 0;

                if (p->nodes[level]) {
                        /* Unlock first, then drop our reference. */
                        if (p->locks[level]) {
                                btrfs_tree_unlock_rw(p->nodes[level],
                                                     p->locks[level]);
                                p->locks[level] = 0;
                        }
                        free_extent_buffer(p->nodes[level]);
                        p->nodes[level] = NULL;
                }
        }
}

/*
 * We want the transaction abort to print a stack trace only for errors where
 * the cause could be a bug, eg. due to ENOSPC, and not for common errors that
 * are caused by external factors.
 *
 * Returns false for -EIO, -EROFS and -ENOMEM, true for any other value.
 */
bool __cold abort_should_print_stack(int error)
{
        return !(error == -EIO || error == -EROFS || error == -ENOMEM);
}

/*
 * Safely get a reference on the root node of a tree.  No lock is taken, so a
 * concurrent writer may put a different node at the root of the tree.  See
 * btrfs_lock_root_node for the looping required.
 *
 * The returned extent buffer has a reference taken, so it won't disappear.
 * It may stop being the root of the tree at any time because there are no
 * locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;
        int got_ref;

        for (;;) {
                rcu_read_lock();
                eb = rcu_dereference(root->node);
                /*
                 * RCU really hurts here: the root node we just read may have
                 * been COWed and be on its way to being freed while the new
                 * root node is not visible to us yet.  So only take the
                 * reference if the count has not dropped to zero.
                 */
                got_ref = atomic_inc_not_zero(&eb->refs);
                rcu_read_unlock();

                if (got_ref)
                        return eb;

                /* Lost the race, wait for a grace period and try again. */
                synchronize_rcu();
        }
}

/*
 * Cowonly roots (not-shareable trees, everything not subvolume or reloc
 * roots) just get put onto a simple dirty list.  The transaction walks this
 * list to make sure they get properly updated on disk.
 *
 * Nothing is done for a root that is already marked dirty or that does not
 * have BTRFS_ROOT_TRACK_DIRTY set.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;

        /* Checks done without the lock, the dirty bit is retested below. */
        if (test_bit(BTRFS_ROOT_DIRTY, &root->state))
                return;
        if (!test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
                return;

        spin_lock(&fs_info->trans_lock);
        if (test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state))
                goto unlock;

        /* Want the extent tree to be the last on the list */
        if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID)
                list_move_tail(&root->dirty_list,
                               &fs_info->dirty_cowonly_roots);
        else
                list_move(&root->dirty_list, &fs_info->dirty_cowonly_roots);
unlock:
        spin_unlock(&fs_info->trans_lock);
}

/*
 * Used by snapshot creation to make a copy of a root for a tree with a given
 * objectid.
 *
 * @trans:                transaction handle
 * @root:                root that @buf currently belongs to
 * @buf:                the root node to copy
 * @cow_ret:                on success, set to the buffer holding the new root node
 * @new_root_objectid:        objectid of the tree the copy is made for
 *
 * Returns zero on success or a negative error code.  If adding the references
 * for the copy fails, the new buffer is unlocked and released and the
 * transaction is aborted.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      struct extent_buffer *buf,
                      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        /* 1 when the copy is made for the relocation tree, 0 otherwise. */
        const int for_reloc = (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID);
        struct btrfs_disk_key first_key;
        struct extent_buffer *copy;
        u64 reloc_src_root = 0;
        int level;
        int err;

        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != fs_info->running_transaction->transid);
        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != root->last_trans);

        /* The first key of the block is handed to the allocator below. */
        level = btrfs_header_level(buf);
        if (level == 0)
                btrfs_item_key(buf, &first_key, 0);
        else
                btrfs_node_key(buf, &first_key, 0);

        if (for_reloc)
                reloc_src_root = btrfs_header_owner(buf);

        copy = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
                                      &first_key, level, buf->start, 0,
                                      reloc_src_root, BTRFS_NESTING_NEW_ROOT);
        if (IS_ERR(copy))
                return PTR_ERR(copy);

        /* Start from a full copy of @buf, then fix up the header. */
        copy_extent_buffer_full(copy, buf);
        btrfs_set_header_bytenr(copy, copy->start);
        btrfs_set_header_generation(copy, trans->transid);
        btrfs_set_header_backref_rev(copy, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(copy, BTRFS_HEADER_FLAG_WRITTEN |
                                      BTRFS_HEADER_FLAG_RELOC);
        if (for_reloc)
                btrfs_set_header_flag(copy, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(copy, new_root_objectid);

        write_extent_buffer_fsid(copy, fs_info->fs_devices->metadata_uuid);

        WARN_ON(btrfs_header_generation(buf) > trans->transid);

        err = btrfs_inc_ref(trans, root, copy, for_reloc);
        if (err) {
                btrfs_tree_unlock(copy);
                free_extent_buffer(copy);
                btrfs_abort_transaction(trans, err);
                return err;
        }

        btrfs_mark_buffer_dirty(trans, copy);
        *cow_ret = copy;
        return 0;
}

/*
 * Check if the tree block @buf can be shared by multiple trees.
 *
 * Returns true if @buf may be shared, false if it is known not to be.
 */
bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct extent_buffer *buf)
{
        const u64 buf_gen = btrfs_header_generation(buf);

        /*
         * Tree blocks not in shareable trees and tree roots are never shared.
         */
        if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
            buf == root->node)
                return false;

        /*
         * If a block was allocated after the last snapshot and the block was
         * not allocated by tree relocation, we know the block is not shared.
         */
        if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
                return false;

        /*
         * An extent buffer that used to be the commit root may still be shared
         * because the tree height may have increased and it became a child of a
         * higher level root. This can happen when snapshotting a subvolume
         * created in the current transaction.
         */
        if (buf == root->commit_root)
                return buf_gen == trans->transid;

        return true;
}

/*
 * Adjust extent references when tree block @buf is COWed into @cow.
 *
 * @trans:        transaction handle
 * @root:        root the COW is performed for
 * @buf:        tree block being COWed
 * @cow:        new copy of @buf
 * @last_ref:        set to 1 when @buf turns out to have a single reference; not
 *                written otherwise
 *
 * Returns 0 on success or a negative errno.  The transaction is aborted here
 * only when a reference count of 0 is found (-EUCLEAN); any other error is
 * just returned to the caller.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct extent_buffer *buf,
                                       struct extent_buffer *cow,
                                       int *last_ref)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 refs;
        u64 owner;
        u64 flags;
        u64 new_flags = 0;
        int ret;

        /*
         * Backrefs update rules:
         *
         * Always use full backrefs for extent pointers in tree block
         * allocated by tree relocation.
         *
         * If a shared tree block is no longer referenced by its owner
         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
         * use full backrefs for extent pointers in tree block.
         *
         * If a tree block is being relocated
         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
         * use full backrefs for extent pointers in tree block.
         * The reason for this is some operations (such as drop tree)
         * are only allowed for blocks that use full backrefs.
         */

        if (btrfs_block_can_be_shared(trans, root, buf)) {
                /* Possibly shared: look up the real ref count and flags. */
                ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
                                               btrfs_header_level(buf), 1,
                                               &refs, &flags, NULL);
                if (ret)
                        return ret;
                if (unlikely(refs == 0)) {
                        btrfs_crit(fs_info,
                "found 0 references for tree block at bytenr %llu level %d root %llu",
                                   buf->start, btrfs_header_level(buf),
                                   btrfs_root_id(root));
                        ret = -EUCLEAN;
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }
        } else {
                /*
                 * Cannot be shared: treat it as having a single reference and
                 * derive the flags from the root and the backref revision.
                 */
                refs = 1;
                if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
                else
                        flags = 0;
        }

        owner = btrfs_header_owner(buf);
        /* A block owned by the reloc tree must be using full backrefs. */
        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

        if (refs > 1) {
                /* More than one reference on @buf, *last_ref is not set. */
                if ((owner == btrfs_root_id(root) ||
                     btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
                        ret = btrfs_inc_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;

                        if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
                                ret = btrfs_dec_ref(trans, root, buf, 0);
                                if (ret)
                                        return ret;
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                                if (ret)
                                        return ret;
                        }
                        /* Record on disk that @buf now uses full backrefs. */
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
                } else {

                        if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                }
                if (new_flags != 0) {
                        ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
                        if (ret)
                                return ret;
                }
        } else {
                /* Single reference on @buf. */
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                        ret = btrfs_dec_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;
                }
                btrfs_clear_buffer_dirty(trans, buf);
                *last_ref = 1;
        }
        return 0;
}

/*
 * Does the dirty work in COW of a single block.  The parent block (if
 * supplied) is updated to point to the new COW copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * @buf:        the block to COW, must be write locked by the caller
 * @parent:        parent node of @buf, used when @buf is not the root node
 * @parent_slot: slot in @parent that points to @buf
 * @cow_ret:        on success, set to the new copy.  If *cow_ret equals @buf on
 *                entry, @buf is also unlocked before returning successfully.
 * @nest:        lock nesting level passed on to the block allocation
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 *
 * Returns 0 on success or a negative errno.  If allocating the new block
 * fails, the error is returned as is.  For every later failure the new block
 * is unlocked and released and the transaction is aborted.
 */
int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          struct extent_buffer *buf,
                          struct extent_buffer *parent, int parent_slot,
                          struct extent_buffer **cow_ret,
                          u64 search_start, u64 empty_size,
                          enum btrfs_lock_nesting nest)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_disk_key disk_key;
        struct extent_buffer *cow;
        int level, ret;
        int last_ref = 0;
        int unlock_orig = 0;
        u64 parent_start = 0;
        u64 reloc_src_root = 0;

        /* Remember whether the caller wants @buf unlocked once we are done. */
        if (*cow_ret == buf)
                unlock_orig = 1;

        btrfs_assert_tree_write_locked(buf);

        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != fs_info->running_transaction->transid);
        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != root->last_trans);

        level = btrfs_header_level(buf);

        /* The first key of @buf is handed to the allocator below. */
        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
                if (parent)
                        parent_start = parent->start;
                reloc_src_root = btrfs_header_owner(buf);
        }
        cow = btrfs_alloc_tree_block(trans, root, parent_start,
                                     btrfs_root_id(root), &disk_key, level,
                                     search_start, empty_size, reloc_src_root, nest);
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        /* cow is set to blocking by btrfs_init_new_buffer */

        /* Start from a full copy of @buf, then fix up the header fields. */
        copy_extent_buffer_full(cow, buf);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, btrfs_root_id(root));

        write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

        /* May set last_ref, which is passed to btrfs_free_tree_block() below. */
        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
        if (ret) {
                btrfs_tree_unlock(cow);
                free_extent_buffer(cow);
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
                if (ret) {
                        btrfs_tree_unlock(cow);
                        free_extent_buffer(cow);
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }
        }

        if (buf == root->node) {
                /* COWing the root node: @cow becomes the new root->node. */
                WARN_ON(parent && parent != buf);
                if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        parent_start = buf->start;

                ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
                if (ret < 0) {
                        btrfs_tree_unlock(cow);
                        free_extent_buffer(cow);
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }
                /* Extra reference on @cow, taken right before publishing it. */
                atomic_inc(&cow->refs);
                rcu_assign_pointer(root->node, cow);

                btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
                                      parent_start, last_ref);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
                /* Not the root: make the parent's slot point to @cow. */
                WARN_ON(trans->transid != btrfs_header_generation(parent));
                ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
                                                    BTRFS_MOD_LOG_KEY_REPLACE);
                if (ret) {
                        btrfs_tree_unlock(cow);
                        free_extent_buffer(cow);
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }
                btrfs_set_node_blockptr(parent, parent_slot,
                                        cow->start);
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(trans, parent);
                if (last_ref) {
                        ret = btrfs_tree_mod_log_free_eb(buf);
                        if (ret) {
                                btrfs_tree_unlock(cow);
                                free_extent_buffer(cow);
                                btrfs_abort_transaction(trans, ret);
                                return ret;
                        }
                }
                btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
                                      parent_start, last_ref);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
        free_extent_buffer_stale(buf);
        btrfs_mark_buffer_dirty(trans, cow);
        *cow_ret = cow;
        return 0;
}

/*
 * Decide whether @buf has to be COWed before it is modified through @root in
 * transaction @trans.
 *
 * Returns 1 if the block must be COWed, 0 if it can be modified in place.
 */
static inline int should_cow_block(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct extent_buffer *buf)
{
        if (btrfs_is_testing(root->fs_info))
                return 0;

        /* Ensure we can see the FORCE_COW bit */
        smp_mb__before_atomic();

        /*
         * We do not need to cow a block only if all of these hold:
         * 1) the generation of the block is the one of this transaction and
         *    the block does not have the WRITTEN flag set;
         * 2) the block does not have the RELOC flag set, unless the root is
         *    the TREE_RELOC tree;
         * 3) the root is not forced COW.
         *
         * What is forced COW:
         *    when we create snapshot during committing the transaction,
         *    after we've finished copying src root, we must COW the shared
         *    block to ensure the metadata consistency.
         */
        if (btrfs_header_generation(buf) != trans->transid)
                return 1;
        if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN))
                return 1;
        if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
            btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
                return 1;
        if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
                return 1;

        return 0;
}

/*
 * COW a single block, see btrfs_force_cow_block() for the real work.
 *
 * This variant has extra checks so that a block isn't COWed more than once
 * per transaction, as long as it hasn't been written yet.  When no COW is
 * needed, @buf itself is returned through @cow_ret.
 *
 * Returns 0 on success or a negative errno.  -EUCLEAN is returned, and the
 * transaction aborted, if the root is being deleted or if @trans is not the
 * running transaction matching the current fs generation.
 */
int btrfs_cow_block(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, struct extent_buffer *buf,
                    struct extent_buffer *parent, int parent_slot,
                    struct extent_buffer **cow_ret,
                    enum btrfs_lock_nesting nest)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 alloc_hint;
        int err;

        if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
                btrfs_abort_transaction(trans, -EUCLEAN);
                btrfs_crit(fs_info,
                   "attempt to COW block %llu on root %llu that is being deleted",
                           buf->start, btrfs_root_id(root));
                return -EUCLEAN;
        }

        /*
         * COWing must go through a running transaction, which always matches
         * the current fs generation (it's a transaction with a state less
         * than TRANS_STATE_UNBLOCKED).  If that is not the case, put the fs
         * into error state so that no transaction can be committed.
         */
        if (unlikely(trans->transaction != fs_info->running_transaction ||
                     trans->transid != fs_info->generation)) {
                btrfs_abort_transaction(trans, -EUCLEAN);
                btrfs_crit(fs_info,
"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
                           buf->start, btrfs_root_id(root), trans->transid,
                           fs_info->running_transaction->transid,
                           fs_info->generation);
                return -EUCLEAN;
        }

        /* Nothing to do, hand the original buffer back to the caller. */
        if (should_cow_block(trans, root, buf) == 0) {
                *cow_ret = buf;
                return 0;
        }

        /* Allocation hint: start of the 1G aligned range containing @buf. */
        alloc_hint = round_down(buf->start, SZ_1G);

        /*
         * Before CoWing this block for later modification, check if it's
         * the subtree root and do the delayed subtree trace if needed.
         *
         * Also we don't care about the error, as it's handled internally.
         */
        btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
        err = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
                                    cow_ret, alloc_hint, 0, nest);

        trace_btrfs_cow_block(root, buf, *cow_ret);

        return err;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);

/*
 * Same as comp_keys only with two btrfs_key's.
 *
 * The keys are ordered by objectid first, then by type, then by offset.
 * Returns 1 if @k1 is bigger than @k2, -1 if it is smaller and 0 if both keys
 * are equal.
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
        if (k1->objectid != k2->objectid)
                return k1->objectid > k2->objectid ? 1 : -1;
        if (k1->type != k2->type)
                return k1->type > k2->type ? 1 : -1;
        if (k1->offset != k2->offset)
                return k1->offset > k2->offset ? 1 : -1;
        return 0;
}

/*
 * Search for a key in the given extent_buffer.
 *
 * The lower boundary for the search is specified by the slot number @first_slot.
 * Use a value of 0 to search over the whole extent buffer. Works for both
 * leaves and nodes.
 *
 * The slot in the extent buffer is returned via @slot. If the key exists in the
 * extent buffer, then @slot will point to the slot where the key is, otherwise
 * it points to the slot where you would insert the key.
 *
 * Slot may point to the total number of items (i.e. one position beyond the last
 * key) if the key is bigger than the last key in the extent buffer.
 */
int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
                     const struct btrfs_key *key, int *slot)
{
        unsigned long p;
        int item_size;
        /*
         * Use unsigned types for the low and high slots, so that we get a more
         * efficient division in the search loop below.
         */
        u32 low = first_slot;
        u32 high = btrfs_header_nritems(eb);
        int ret;
        const int key_size = sizeof(struct btrfs_disk_key);

        if (unlikely(low > high)) {
                btrfs_err(eb->fs_info,
                 "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
                          __func__, low, high, eb->start,
                          btrfs_header_owner(eb), btrfs_header_level(eb));
                return -EINVAL;
        }

        if (btrfs_header_level(eb) == 0) {
                p = offsetof(struct btrfs_leaf, items);
                item_size = sizeof(struct btrfs_item);
        } else {
                p = offsetof(struct btrfs_node, ptrs);
                item_size = sizeof(struct btrfs_key_ptr);
        }

        while (low < high) {
                const int unit_size = eb->folio_size;
                unsigned long oil;
                unsigned long offset;
                struct btrfs_disk_key *tmp;
                struct btrfs_disk_key unaligned;
                int mid;

                mid = (low + high) / 2;
                offset = p + mid * item_size;
                oil = get_eb_offset_in_folio(eb, offset);

                if (oil + key_size <= unit_size) {
                        const unsigned long idx = get_eb_folio_index(eb, offset);
                        char *kaddr = folio_address(eb->folios[idx]);

                        oil = get_eb_offset_in_folio(eb, offset);
                        tmp = (struct btrfs_disk_key *)(kaddr + oil);
                } else {
                        read_extent_buffer(eb, &unaligned, offset, key_size);
                        tmp = &unaligned;
                }

                ret = btrfs_comp_keys(tmp, key);

                if (ret < 0)
                        low = mid + 1;
                else if (ret > 0)
                        high = mid;
                else {
                        *slot = mid;
                        return 0;
                }
        }
        *slot = low;
        return 1;
}

static void root_add_used_bytes(struct btrfs_root *root)
{
        spin_lock(&root->accounting_lock);
        btrfs_set_root_used(&root->root_item,
                btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
        spin_unlock(&root->accounting_lock);
}

static void root_sub_used_bytes(struct btrfs_root *root)
{
        spin_lock(&root->accounting_lock);
        btrfs_set_root_used(&root->root_item,
                btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
        spin_unlock(&root->accounting_lock);
}

/*
 * Given a node and a slot number, read the block the slot points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 *
 * The level, generation, owner and first key expected for the child are taken
 * from @parent and handed to read_tree_block() for verification.
 *
 * Returns ERR_PTR(-ENOENT) if @slot is not a valid slot of @parent, the error
 * pointer from read_tree_block() if the read fails, or ERR_PTR(-EIO) if the
 * buffer that was read is not uptodate.
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
                                           int slot)
{
        struct btrfs_tree_parent_check check = { 0 };
        int level = btrfs_header_level(parent);
        struct extent_buffer *child;

        if (slot < 0 || slot >= btrfs_header_nritems(parent))
                return ERR_PTR(-ENOENT);

        ASSERT(level);

        check.level = level - 1;
        check.transid = btrfs_node_ptr_generation(parent, slot);
        check.owner_root = btrfs_header_owner(parent);
        check.has_first_key = true;
        btrfs_node_key_to_cpu(parent, &check.first_key, slot);

        child = read_tree_block(parent->fs_info,
                                btrfs_node_blockptr(parent, slot), &check);
        if (IS_ERR(child))
                return child;

        if (extent_buffer_uptodate(child))
                return child;

        /* Read did not fail outright but the buffer is unusable, drop it. */
        free_extent_buffer(child);
        return ERR_PTR(-EIO);
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 *
 * @trans: running transaction; path->nodes[level] is expected to have been
 *         COWed in it already (a generation mismatch only triggers a WARN).
 * @root:  tree being modified.
 * @path:  search path; path->nodes[level] ("mid") is the node to balance and
 *         is expected to be write locked.
 * @level: level of the node to balance, must be > 0.
 *
 * Three outcomes are possible:
 *  - mid has no parent in the path: if it holds exactly one pointer, its only
 *    child is promoted to be the new root and mid is freed; in that case
 *    path->nodes[level] is set to NULL.
 *  - mid holds more than a quarter of the maximum number of pointers:
 *    nothing is done.
 *  - otherwise the left and right siblings are read, locked and COWed, and
 *    pointers are shuffled between left, mid and right; emptied nodes are
 *    removed from the parent and freed, and the path is updated so that it
 *    still refers to the block pointer it referred to on entry.
 *
 * Returns 0 on success (including when nothing had to be done) or a negative
 * errno.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         struct btrfs_path *path, int level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *right = NULL;
        struct extent_buffer *mid;
        struct extent_buffer *left = NULL;
        struct extent_buffer *parent = NULL;
        int ret = 0;
        int wret;
        int pslot;
        int orig_slot = path->slots[level];
        u64 orig_ptr;

        ASSERT(level > 0);

        mid = path->nodes[level];

        WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
        WARN_ON(btrfs_header_generation(mid) != trans->transid);

        /* Remembered so the final sanity check can verify the updated path. */
        orig_ptr = btrfs_node_blockptr(mid, orig_slot);

        /*
         * parent stays NULL when mid sits at the highest possible level or
         * when the path has no node above it.  pslot is only assigned here;
         * every use of it below is after the !parent early return.
         */
        if (level < BTRFS_MAX_LEVEL - 1) {
                parent = path->nodes[level + 1];
                pslot = path->slots[level + 1];
        }

        /*
         * deal with the case where there is only one pointer in the root
         * by promoting the node below to a root
         */
        if (!parent) {
                struct extent_buffer *child;

                if (btrfs_header_nritems(mid) != 1)
                        return 0;

                /* promote the child to a root */
                child = btrfs_read_node_slot(mid, 0);
                if (IS_ERR(child)) {
                        ret = PTR_ERR(child);
                        /* left and right are still NULL, so out: only returns */
                        goto out;
                }

                btrfs_tree_lock(child);
                ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
                                      BTRFS_NESTING_COW);
                if (ret) {
                        btrfs_tree_unlock(child);
                        free_extent_buffer(child);
                        goto out;
                }

                ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
                if (ret < 0) {
                        btrfs_tree_unlock(child);
                        free_extent_buffer(child);
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
                rcu_assign_pointer(root->node, child);

                add_root_to_dirty_list(root);
                btrfs_tree_unlock(child);

                /* mid is no longer part of the tree, drop it from the path */
                path->locks[level] = 0;
                path->nodes[level] = NULL;
                btrfs_clear_buffer_dirty(trans, mid);
                btrfs_tree_unlock(mid);
                /* once for the path */
                free_extent_buffer(mid);

                root_sub_used_bytes(root);
                btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
                /* once for the root ptr */
                free_extent_buffer_stale(mid);
                return 0;
        }
        /* More than a quarter full: no balancing needed. */
        if (btrfs_header_nritems(mid) >
            BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
                return 0;

        /* Read, lock and COW the left sibling, if mid is not the first child. */
        if (pslot) {
                left = btrfs_read_node_slot(parent, pslot - 1);
                if (IS_ERR(left)) {
                        ret = PTR_ERR(left);
                        /* keep out: from touching an error pointer */
                        left = NULL;
                        goto out;
                }

                btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);
                wret = btrfs_cow_block(trans, root, left,
                                       parent, pslot - 1, &left,
                                       BTRFS_NESTING_LEFT_COW);
                if (wret) {
                        ret = wret;
                        /* left is unlocked and released at out: */
                        goto out;
                }
        }

        /* Same for the right sibling, if mid is not the last child. */
        if (pslot + 1 < btrfs_header_nritems(parent)) {
                right = btrfs_read_node_slot(parent, pslot + 1);
                if (IS_ERR(right)) {
                        ret = PTR_ERR(right);
                        right = NULL;
                        goto out;
                }

                btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);
                wret = btrfs_cow_block(trans, root, right,
                                       parent, pslot + 1, &right,
                                       BTRFS_NESTING_RIGHT_COW);
                if (wret) {
                        ret = wret;
                        goto out;
                }
        }

        /* first, try to make some room in the middle buffer */
        if (left) {
                /*
                 * From here on orig_slot is kept as an index into left
                 * followed by mid; it is converted back to a per-node slot
                 * in the "update the path" step below.
                 */
                orig_slot += btrfs_header_nritems(left);
                wret = push_node_left(trans, left, mid, 1);
                /*
                 * NOTE(review): the function continues after an error here,
                 * and ret may be overwritten by later "ret = ..." assignments.
                 */
                if (wret < 0)
                        ret = wret;
        }

        /*
         * then try to empty the right most buffer into the middle
         */
        if (right) {
                wret = push_node_left(trans, mid, right, 1);
                if (wret < 0 && wret != -ENOSPC)
                        ret = wret;
                if (btrfs_header_nritems(right) == 0) {
                        /* right was emptied: unlink it from parent and free it */
                        btrfs_clear_buffer_dirty(trans, right);
                        btrfs_tree_unlock(right);
                        ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
                        if (ret < 0) {
                                /* already unlocked above, so hide it from out: */
                                free_extent_buffer_stale(right);
                                right = NULL;
                                goto out;
                        }
                        root_sub_used_bytes(root);
                        btrfs_free_tree_block(trans, btrfs_root_id(root), right,
                                              0, 1);
                        free_extent_buffer_stale(right);
                        right = NULL;
                } else {
                        /* refresh parent's key for right from right's first key */
                        struct btrfs_disk_key right_key;
                        btrfs_node_key(right, &right_key, 0);
                        ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
                                        BTRFS_MOD_LOG_KEY_REPLACE);
                        if (ret < 0) {
                                btrfs_abort_transaction(trans, ret);
                                goto out;
                        }
                        btrfs_set_node_key(parent, &right_key, pslot + 1);
                        btrfs_mark_buffer_dirty(trans, parent);
                }
        }
        if (btrfs_header_nritems(mid) == 1) {
                /*
                 * we're not allowed to leave a node with one item in the
                 * tree during a delete.  A deletion from lower in the tree
                 * could try to delete the only pointer in this node.
                 * So, pull some keys from the left.
                 * There has to be a left pointer at this point because
                 * otherwise we would have pulled some pointers from the
                 * right
                 */
                if (unlikely(!left)) {
                        btrfs_crit(fs_info,
"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
                                   parent->start, btrfs_header_level(parent),
                                   mid->start, btrfs_root_id(root));
                        ret = -EUCLEAN;
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
                wret = balance_node_right(trans, mid, left);
                if (wret < 0) {
                        ret = wret;
                        goto out;
                }
                /*
                 * A return of 1 means nothing was moved from left into mid;
                 * fall back to pushing mid's single pointer into left.
                 */
                if (wret == 1) {
                        wret = push_node_left(trans, left, mid, 1);
                        if (wret < 0)
                                ret = wret;
                }
                /* Neither direction moved anything: treated as impossible. */
                BUG_ON(wret == 1);
        }
        if (btrfs_header_nritems(mid) == 0) {
                /* mid was emptied: unlink it from parent and free it */
                btrfs_clear_buffer_dirty(trans, mid);
                btrfs_tree_unlock(mid);
                ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
                if (ret < 0) {
                        free_extent_buffer_stale(mid);
                        mid = NULL;
                        goto out;
                }
                root_sub_used_bytes(root);
                btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
                free_extent_buffer_stale(mid);
                mid = NULL;
        } else {
                /* update the parent key to reflect our changes */
                struct btrfs_disk_key mid_key;
                btrfs_node_key(mid, &mid_key, 0);
                ret = btrfs_tree_mod_log_insert_key(parent, pslot,
                                                    BTRFS_MOD_LOG_KEY_REPLACE);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
                btrfs_set_node_key(parent, &mid_key, pslot);
                btrfs_mark_buffer_dirty(trans, parent);
        }

        /* update the path */
        if (left) {
                if (btrfs_header_nritems(left) > orig_slot) {
                        /*
                         * The original pointer now lives in left.  Take an
                         * extra reference for the path; the reference from
                         * btrfs_read_node_slot() is dropped at out:.
                         */
                        atomic_inc(&left->refs);
                        /* left was locked after cow */
                        path->nodes[level] = left;
                        path->slots[level + 1] -= 1;
                        path->slots[level] = orig_slot;
                        if (mid) {
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        }
                } else {
                        /* still in mid: convert back to a slot within mid */
                        orig_slot -= btrfs_header_nritems(left);
                        path->slots[level] = orig_slot;
                }
        }
        /* double check we haven't messed things up */
        if (orig_ptr !=
            btrfs_node_blockptr(path->nodes[level], path->slots[level]))
                BUG();
out:
        if (right) {
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }
        if (left) {
                /* if the path now points at left, its lock belongs to the path */
                if (path->nodes[level] != left)
                        btrfs_tree_unlock(left);
                free_extent_buffer(left);
        }
        return ret;
}

/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * @trans: running transaction.
 * @root:  tree being modified.
 * @path:  search path; path->nodes[level] ("mid") is the node to make room in.
 * @level: level of that node.
 *
 * Tries the left sibling first, then the right sibling.  A sibling is only
 * used if it holds fewer than BTRFS_NODEPTRS_PER_BLOCK - 1 pointers and can
 * be COWed.
 *
 * Returns 0 if pointers were moved; the parent key and the path (node, slot
 * and parent slot) are updated, and the path may now point at the sibling
 * instead of mid.  Returns 1 if nothing could be pushed (leaf level, no
 * parent in the path, or both siblings unusable).  Returns a negative errno
 * if reading a sibling or logging the key replacement failed.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path, int level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *right = NULL;
        struct extent_buffer *mid;
        struct extent_buffer *left = NULL;
        struct extent_buffer *parent = NULL;
        int ret = 0;
        int wret;
        int pslot;
        int orig_slot = path->slots[level];

        /* Leaves are not handled here. */
        if (level == 0)
                return 1;

        mid = path->nodes[level];
        WARN_ON(btrfs_header_generation(mid) != trans->transid);

        /* pslot is only valid when parent was looked up. */
        if (level < BTRFS_MAX_LEVEL - 1) {
                parent = path->nodes[level + 1];
                pslot = path->slots[level + 1];
        }

        if (!parent)
                return 1;

        /* first, try to make some room in the middle buffer */
        if (pslot) {
                u32 left_nr;

                left = btrfs_read_node_slot(parent, pslot - 1);
                if (IS_ERR(left))
                        return PTR_ERR(left);

                btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT);

                left_nr = btrfs_header_nritems(left);
                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
                        /* left is (nearly) full, nothing can be pushed into it */
                        wret = 1;
                } else {
                        ret = btrfs_cow_block(trans, root, left, parent,
                                              pslot - 1, &left,
                                              BTRFS_NESTING_LEFT_COW);
                        /*
                         * A COW failure is treated as "cannot push"; the
                         * error code in ret is not returned from here.
                         */
                        if (ret)
                                wret = 1;
                        else {
                                wret = push_node_left(trans, left, mid, 0);
                        }
                }
                /*
                 * NOTE(review): a negative wret is stored in ret but the
                 * function goes on to try the right sibling and may still
                 * end up returning 0 or 1.
                 */
                if (wret < 0)
                        ret = wret;
                if (wret == 0) {
                        struct btrfs_disk_key disk_key;
                        /*
                         * orig_slot becomes an index into left followed by
                         * mid, using left's item count from before the push.
                         */
                        orig_slot += left_nr;
                        /* mid's first key changed, refresh it in the parent */
                        btrfs_node_key(mid, &disk_key, 0);
                        ret = btrfs_tree_mod_log_insert_key(parent, pslot,
                                        BTRFS_MOD_LOG_KEY_REPLACE);
                        if (ret < 0) {
                                btrfs_tree_unlock(left);
                                free_extent_buffer(left);
                                btrfs_abort_transaction(trans, ret);
                                return ret;
                        }
                        btrfs_set_node_key(parent, &disk_key, pslot);
                        btrfs_mark_buffer_dirty(trans, parent);
                        if (btrfs_header_nritems(left) > orig_slot) {
                                /*
                                 * Our slot moved into left: the path takes
                                 * over left (lock and reference) and lets go
                                 * of mid.
                                 */
                                path->nodes[level] = left;
                                path->slots[level + 1] -= 1;
                                path->slots[level] = orig_slot;
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        } else {
                                /* still in mid: convert back to a slot in mid */
                                orig_slot -=
                                        btrfs_header_nritems(left);
                                path->slots[level] = orig_slot;
                                btrfs_tree_unlock(left);
                                free_extent_buffer(left);
                        }
                        return 0;
                }
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
        }

        /*
         * then try to empty the right most buffer into the middle
         */
        if (pslot + 1 < btrfs_header_nritems(parent)) {
                u32 right_nr;

                right = btrfs_read_node_slot(parent, pslot + 1);
                if (IS_ERR(right))
                        return PTR_ERR(right);

                btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);

                right_nr = btrfs_header_nritems(right);
                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
                        /* right is (nearly) full, nothing can be pushed into it */
                        wret = 1;
                } else {
                        ret = btrfs_cow_block(trans, root, right,
                                              parent, pslot + 1,
                                              &right, BTRFS_NESTING_RIGHT_COW);
                        /* as on the left side, a COW failure means "cannot push" */
                        if (ret)
                                wret = 1;
                        else {
                                wret = balance_node_right(trans, right, mid);
                        }
                }
                /* NOTE(review): stored but not returned, see the final return 1 */
                if (wret < 0)
                        ret = wret;
                if (wret == 0) {
                        struct btrfs_disk_key disk_key;

                        /* right's first key changed, refresh it in the parent */
                        btrfs_node_key(right, &disk_key, 0);
                        ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
                                        BTRFS_MOD_LOG_KEY_REPLACE);
                        if (ret < 0) {
                                btrfs_tree_unlock(right);
                                free_extent_buffer(right);
                                btrfs_abort_transaction(trans, ret);
                                return ret;
                        }
                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
                        btrfs_mark_buffer_dirty(trans, parent);

                        if (btrfs_header_nritems(mid) <= orig_slot) {
                                /*
                                 * Our slot moved into right: the path takes
                                 * over right (lock and reference) and lets
                                 * go of mid.
                                 */
                                path->nodes[level] = right;
                                path->slots[level + 1] += 1;
                                path->slots[level] = orig_slot -
                                        btrfs_header_nritems(mid);
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        } else {
                                btrfs_tree_unlock(right);
                                free_extent_buffer(right);
                        }
                        return 0;
                }
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }
        return 1;
}

/*
 * Trigger readahead on neighbours of the block that @slot of
 * path->nodes[@level] points to.
 *
 * Nothing is done unless @level is 1 or the path uses READA_FORWARD_ALWAYS.
 * For the other modes nothing is done either when the target block is
 * already in the extent buffer cache.
 *
 * Slots are walked backwards (READA_BACK) or forwards starting next to @slot.
 * A neighbour is read ahead when its block pointer is within 64K of the
 * target, or unconditionally for READA_FORWARD_ALWAYS.  The walk ends at the
 * edge of the node, once more than nread_max bytes were requested, after more
 * than 32 slots were looked at or, for READA_BACK with a non-zero @objectid,
 * at the first key with a different objectid.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
                             struct btrfs_path *path,
                             int level, int slot, u64 objectid)
{
        const bool always = (path->reada == READA_FORWARD_ALWAYS);
        struct extent_buffer *node;
        struct btrfs_disk_key disk_key;
        u64 target;
        u64 nread = 0;
        u64 nread_max;
        u32 nritems;
        u32 nr;
        u32 blocksize;
        u32 nscan = 0;

        if (!always && level != 1)
                return;

        node = path->nodes[level];
        if (!node)
                return;

        /*
         * Leaves are visited far more often than nodes, so when reading
         * ahead nodes only ask for a single one to avoid issuing too much
         * (possibly random) IO at once.
         */
        if (!always)
                nread_max = SZ_64K;
        else if (level > 1)
                nread_max = node->fs_info->nodesize;
        else
                nread_max = SZ_128K;

        target = btrfs_node_blockptr(node, slot);
        blocksize = fs_info->nodesize;

        if (!always) {
                struct extent_buffer *cached;

                /* Target already cached: no readahead around it. */
                cached = find_extent_buffer(fs_info, target);
                if (cached) {
                        free_extent_buffer(cached);
                        return;
                }
        }

        nritems = btrfs_header_nritems(node);
        nr = slot;

        for (;;) {
                u64 search;
                u64 distance;

                if (path->reada == READA_BACK) {
                        if (nr == 0)
                                break;
                        nr--;
                        if (objectid) {
                                btrfs_node_key(node, &disk_key, nr);
                                if (btrfs_disk_key_objectid(&disk_key) != objectid)
                                        break;
                        }
                } else if (always || path->reada == READA_FORWARD) {
                        nr++;
                        if (nr >= nritems)
                                break;
                }

                search = btrfs_node_blockptr(node, nr);
                distance = (search <= target) ? target - search :
                                                search - target;
                if (always || distance <= 65536) {
                        btrfs_readahead_node_child(node, nr);
                        nread += blocksize;
                }

                nscan++;
                if (nread > nread_max || nscan > 32)
                        break;
        }
}

/*
 * Start readahead on the left and right siblings of path->nodes[@level], i.e.
 * the children next to the path's slot in the parent node.  Does nothing if
 * the path has no parent at @level + 1.
 */
static noinline void reada_for_balance(struct btrfs_path *path, int level)
{
        struct extent_buffer *parent = path->nodes[level + 1];
        int nr_children;
        int pslot;

        if (!parent)
                return;

        pslot = path->slots[level + 1];
        nr_children = btrfs_header_nritems(parent);

        /* left sibling first, then the right one */
        if (pslot > 0)
                btrfs_readahead_node_child(parent, pslot - 1);
        if (pslot + 1 < nr_children)
                btrfs_readahead_node_child(parent, pslot + 1);
}


/*
 * Drop locks on the upper levels of @path while walking down the tree.
 *
 * Starting at @level and moving up, a level keeps its lock while the path
 * goes through slot 0 there (key pointers higher up might have to change) or,
 * when path->keep_locks is set, through the last slot of the block (needed to
 * pick the next slot in the higher block while walking the tree).  Once a
 * level has been unlocked these checks are no longer applied further up.
 *
 * @lowest_unlock is the lowest level that may be unlocked at all: with
 * lowest_unlock == 1, level 0 is never unlocked.
 *
 * If @write_lock_level is given and an unlocked level lies above
 * @min_write_lock_level but not above *@write_lock_level, then
 * *@write_lock_level is lowered to just below that level.
 *
 * The walk stops at the first level that has no node or holds no lock.
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
                               int lowest_unlock, int min_write_lock_level,
                               int *write_lock_level)
{
        int skip_level = level;
        bool check_skip = true;
        int i;

        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
                struct extent_buffer *eb = path->nodes[i];

                if (!eb || !path->locks[i])
                        break;

                if (check_skip) {
                        bool keep = (path->slots[i] == 0);

                        if (!keep && path->keep_locks) {
                                u32 nritems = btrfs_header_nritems(eb);

                                keep = (nritems < 1 ||
                                        path->slots[i] >= nritems - 1);
                        }
                        if (keep) {
                                skip_level = i + 1;
                                continue;
                        }
                }

                if (i < lowest_unlock || i <= skip_level)
                        continue;

                check_skip = false;
                btrfs_tree_unlock_rw(eb, path->locks[i]);
                path->locks[i] = 0;
                if (write_lock_level &&
                    i > min_write_lock_level &&
                    i <= *write_lock_level)
                        *write_lock_level = i - 1;
        }
}

/*
 * Helper function for btrfs_search_slot() and other functions that do a search
 * on a btree. The goal is to find a tree block in the cache (the radix tree at
 * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
 * its pages from disk.
 *
 * @root:   tree being searched; its id is used for the owner check.
 * @p:      search path.
 * @eb_ret: in: the parent node holding @slot; out: the child extent buffer,
 *          written only when 0 is returned.
 * @level:  level of the parent in the path; locks above it (level + 1 and up)
 *          are the ones dropped before blocking on IO.
 * @slot:   slot in the parent pointing to the block to read.
 * @key:    search key; only its objectid is used, for readahead.
 *
 * Returns 0 with *eb_ret set to the child block.
 *
 * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
 * whole btree search, starting again from the current root node.  For a
 * p->nowait path -EAGAIN is returned instead of blocking, and in that case
 * the path is not released here.
 *
 * Returns -EIO or -EUCLEAN (or the error from read_tree_block()) if the block
 * could not be read or failed verification.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
                      struct extent_buffer **eb_ret, int level, int slot,
                      const struct btrfs_key *key)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_tree_parent_check check = { 0 };
        u64 blocknr;
        u64 gen;
        struct extent_buffer *tmp;
        int ret;
        int parent_level;
        /* NOTE: shadows the unlock_up() function inside this function. */
        bool unlock_up;

        /* True when a lock is held on the level right above the parent. */
        unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
        blocknr = btrfs_node_blockptr(*eb_ret, slot);
        gen = btrfs_node_ptr_generation(*eb_ret, slot);
        parent_level = btrfs_header_level(*eb_ret);
        /* What the parent says about the child, used to verify it once read. */
        btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
        check.has_first_key = true;
        check.level = parent_level - 1;
        check.transid = gen;
        check.owner_root = btrfs_root_id(root);

        /*
         * If we need to read an extent buffer from disk and we are holding locks
         * on upper level nodes, we unlock all the upper nodes before reading the
         * extent buffer, and then return -EAGAIN to the caller as it needs to
         * restart the search. We don't release the lock on the current level
         * because we need to walk this node to figure out which blocks to read.
         */
        tmp = find_extent_buffer(fs_info, blocknr);
        if (tmp) {
                if (p->reada == READA_FORWARD_ALWAYS)
                        reada_for_search(fs_info, p, level, slot, key->objectid);

                /* first we do an atomic uptodate check */
                if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
                        /*
                         * Do extra check for first_key, eb can be stale due to
                         * being cached, read from scrub, or have multiple
                         * parents (shared tree blocks).
                         */
                        if (btrfs_verify_level_key(tmp,
                                        parent_level - 1, &check.first_key, gen)) {
                                free_extent_buffer(tmp);
                                return -EUCLEAN;
                        }
                        *eb_ret = tmp;
                        return 0;
                }

                /* Cached but not up to date: a nowait search gives up here. */
                if (p->nowait) {
                        free_extent_buffer(tmp);
                        return -EAGAIN;
                }

                if (unlock_up)
                        btrfs_unlock_up_safe(p, level + 1);

                /* now we're allowed to do a blocking uptodate check */
                ret = btrfs_read_extent_buffer(tmp, &check);
                if (ret) {
                        free_extent_buffer(tmp);
                        btrfs_release_path(p);
                        return -EIO;
                }
                if (btrfs_check_eb_owner(tmp, btrfs_root_id(root))) {
                        free_extent_buffer(tmp);
                        btrfs_release_path(p);
                        return -EUCLEAN;
                }

                /*
                 * Upper locks were dropped above, so the search has to be
                 * restarted; ret is 0 otherwise and tmp is handed back at out.
                 */
                if (unlock_up)
                        ret = -EAGAIN;

                goto out;
        } else if (p->nowait) {
                /* Not cached at all and we must not block on IO. */
                return -EAGAIN;
        }

        if (unlock_up) {
                btrfs_unlock_up_safe(p, level + 1);
                ret = -EAGAIN;
        } else {
                ret = 0;
        }

        if (p->reada != READA_NONE)
                reada_for_search(fs_info, p, level, slot, key->objectid);

        tmp = read_tree_block(fs_info, blocknr, &check);
        if (IS_ERR(tmp)) {
                btrfs_release_path(p);
                return PTR_ERR(tmp);
        }
        /*
         * If the read above didn't mark this buffer up to date,
         * it will never end up being up to date.  Set ret to EIO now
         * and give up so that our caller doesn't loop forever
         * on our EAGAINs.
         */
        if (!extent_buffer_uptodate(tmp))
                ret = -EIO;

out:
        /* Hand the block to the caller on success, otherwise drop everything. */
        if (ret == 0) {
                *eb_ret = tmp;
        } else {
                free_extent_buffer(tmp);
                btrfs_release_path(p);
        }

        return ret;
}

/*
 * Helper for btrfs_search_slot: checks a node-level block @b at @level and
 * does whatever balancing the search needs based on @ins_len.
 *
 *  - For insertions (@ins_len > 0) or p->search_for_split, a node holding at
 *    least BTRFS_NODEPTRS_PER_BLOCK - 3 pointers is split.
 *  - Otherwise, for deletions (@ins_len < 0), a node holding fewer than half
 *    of BTRFS_NODEPTRS_PER_BLOCK pointers is balanced with its siblings.
 *
 * Both operations need a write lock on @level + 1.  If *@write_lock_level is
 * too low it is raised, the path is dropped and -EAGAIN is returned so that
 * btrfs_search_slot starts over.  -EAGAIN is also returned when balancing
 * removed the node at @level from the path.
 *
 * Returns 0 if no work was needed or balancing succeeded, the result of
 * split_node() if a split was attempted, or an error from balance_level().
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *p,
                       struct extent_buffer *b, int level, int ins_len,
                       int *write_lock_level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int do_split;
        int do_balance;
        int ret;

        do_split = (p->search_for_split || ins_len > 0) &&
                   btrfs_header_nritems(b) >=
                   BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3;
        do_balance = !do_split && ins_len < 0 &&
                     btrfs_header_nritems(b) <
                     BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2;

        if (!do_split && !do_balance)
                return 0;

        /* Both operations touch the parent: it has to be write locked. */
        if (*write_lock_level < level + 1) {
                *write_lock_level = level + 1;
                btrfs_release_path(p);
                return -EAGAIN;
        }

        reada_for_balance(p, level);

        if (do_split)
                return split_node(trans, root, p, level);

        ret = balance_level(trans, root, p, level);
        if (ret)
                return ret;

        /* Balancing may have removed this level from the path. */
        b = p->nodes[level];
        if (!b) {
                btrfs_release_path(p);
                return -EAGAIN;
        }
        BUG_ON(btrfs_header_nritems(b) == 1);

        return 0;
}

/*
 * Look up the item (@iobjectid, @key_type, @ioff) in @fs_root without
 * modifying the tree.
 *
 * The key of the item the search ends up on is stored in @found_key.  If the
 * search lands past the last item of a leaf, the first item of the next leaf
 * is used instead.
 *
 * Returns 0 if the found key has the requested objectid and type (the offset
 * is not compared), 1 if it does not, a negative errno if the search failed,
 * or the non-zero result of btrfs_next_leaf() when moving to the next leaf
 * did not succeed.  @path is left pointing at the found item.
 */
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
                u64 iobjectid, u64 ioff, u8 key_type,
                struct btrfs_key *found_key)
{
        struct btrfs_key key = {
                .objectid = iobjectid,
                .type = key_type,
                .offset = ioff,
        };
        struct extent_buffer *leaf;
        int ret;

        ASSERT(path);
        ASSERT(found_key);

        ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
        if (ret < 0)
                return ret;

        leaf = path->nodes[0];
        if (ret > 0 && path->slots[0] >= btrfs_header_nritems(leaf)) {
                /* No exact match and we are past the end of this leaf. */
                ret = btrfs_next_leaf(fs_root, path);
                if (ret)
                        return ret;
                leaf = path->nodes[0];
        }

        btrfs_item_key_to_cpu(leaf, found_key, path->slots[0]);
        if (found_key->objectid == key.objectid &&
            found_key->type == key.type)
                return 0;

        return 1;
}

static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
                                                        struct btrfs_path *p,
                                                        int write_lock_level)
{
        struct extent_buffer *b;
        int root_lock = 0;
        int level = 0;

        if (p->search_commit_root) {
                b = root->commit_root;
                atomic_inc(&b->refs);
                level = btrfs_header_level(b);
                /*
                 * Ensure that all callers have set skip_locking when
                 * p->search_commit_root = 1.
                 */
                ASSERT(p->skip_locking == 1);

                goto out;
        }

        if (p->skip_locking) {
                b = btrfs_root_node(root);
                level = btrfs_header_level(b);
                goto out;
        }

        /* We try very hard to do read locks on the root */
        root_lock = BTRFS_READ_LOCK;

        /*
         * If the level is set to maximum, we can skip trying to get the read
         * lock.
         */
        if (write_lock_level < BTRFS_MAX_LEVEL) {
                /*
                 * We don't know the level of the root node until we actually
                 * have it read locked
                 */
                if (p->nowait) {
                        b = btrfs_try_read_lock_root_node(root);
                        if (IS_ERR(b))
                                return b;
                } else {
                        b = btrfs_read_lock_root_node(root);
                }
                level = btrfs_header_level(b);
                if (level > write_lock_level)
                        goto out;

                /* Whoops, must trade for write lock */
                btrfs_tree_read_unlock(b);
                free_extent_buffer(b);
        }

        b = btrfs_lock_root_node(root);
        root_lock = BTRFS_WRITE_LOCK;

        /* The level might have changed, check again */
        level = btrfs_header_level(b);

out:
        /*
         * The root may have failed to write out at some point, and thus is no
         * longer valid, return an error in this case.
         */
        if (!extent_buffer_uptodate(b)) {
                if (root_lock)
                        btrfs_tree_unlock_rw(b, root_lock);
                free_extent_buffer(b);
                return ERR_PTR(-EIO);
        }

        p->nodes[level] = b;
        if (!p->skip_locking)
                p->locks[level] = root_lock;
        /*
         * Callers are responsible for dropping b's references.
         */
        return b;
}

/*
 * Replace the extent buffer at the lowest level of the path with a cloned
 * version. The purpose is to be able to use it safely, after releasing the
 * commit root semaphore, even if relocation is happening in parallel, the
 * transaction used for relocation is committed and the extent buffer is
 * reallocated in the next transaction.
 *
 * This is used in a context where the caller does not prevent transaction
 * commits from happening, either by holding a transaction handle or holding
 * some lock, while it's doing searches through a commit root.
 * At the moment it's only used for send operations.
 */
static int finish_need_commit_sem_search(struct btrfs_path *path)
{
        const int i = path->lowest_level;
        const int slot = path->slots[i];
        struct extent_buffer *lowest = path->nodes[i];
        struct extent_buffer *clone;

        ASSERT(path->need_commit_sem);

        if (!lowest)
                return 0;

        lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);

        clone = btrfs_clone_extent_buffer(lowest);
        if (!clone)
                return -ENOMEM;

        btrfs_release_path(path);
        path->nodes[i] = clone;
        path->slots[i] = slot;

        return 0;
}

static inline int search_for_key_slot(struct extent_buffer *eb,
                                      int search_low_slot,
                                      const struct btrfs_key *key,
                                      int prev_cmp,
                                      int *slot)
{
        /* No exact match on the parent level: a real binary search is needed. */
        if (prev_cmp != 0)
                return btrfs_bin_search(eb, search_low_slot, key, slot);

        /*
         * The search on the parent node was an exact match (prev_cmp == 0).
         * Every key pointer (struct btrfs_key_ptr) holds the lowest key
         * reachable through the subtree it points to, so on all lower levels
         * the target key is necessarily at slot 0 and the binary search can
         * be skipped.
         */
        *slot = 0;
        return 0;
}

/*
 * Search the leaf at level 0 of @path for @key and record the resulting slot
 * in path->slots[0].
 *
 * @trans:      Transaction handle, only forwarded to split_leaf()
 * @root:       Tree root, only forwarded to split_leaf()
 * @key:        The key we are looking for
 * @path:       Path whose level 0 node is the leaf to search
 * @ins_len:    When > 0, the caller intends to insert that many bytes and the
 *              leaf is split if its free space is insufficient
 * @prev_cmp:   Result of the key search done on the parent level, forwarded to
 *              search_for_key_slot()
 *
 * Returns the search result (0 for an exact match, > 0 when the key was not
 * found, following the btrfs_bin_search() convention mentioned below), or a
 * negative errno coming from the search or from split_leaf().
 */
static int search_leaf(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root,
                       const struct btrfs_key *key,
                       struct btrfs_path *path,
                       int ins_len,
                       int prev_cmp)
{
        struct extent_buffer *leaf = path->nodes[0];
        /* -1 means "not computed"; it is only computed for insertions. */
        int leaf_free_space = -1;
        int search_low_slot = 0;
        int ret;
        bool do_bin_search = true;

        /*
         * If we are doing an insertion, the leaf has enough free space and the
         * destination slot for the key is not slot 0, then we can unlock our
         * write lock on the parent, and any other upper nodes, before doing the
         * binary search on the leaf (with search_for_key_slot()), allowing other
         * tasks to lock the parent and any other upper nodes.
         */
        if (ins_len > 0) {
                /*
                 * Cache the leaf free space, since we will need it later and it
                 * will not change until then.
                 */
                leaf_free_space = btrfs_leaf_free_space(leaf);

                /*
                 * !path->locks[1] means we have a single node tree, the leaf is
                 * the root of the tree.
                 */
                if (path->locks[1] && leaf_free_space >= ins_len) {
                        struct btrfs_disk_key first_key;

                        ASSERT(btrfs_header_nritems(leaf) > 0);
                        btrfs_item_key(leaf, &first_key, 0);

                        /*
                         * Doing the extra comparison with the first key is cheap,
                         * taking into account that the first key is very likely
                         * already in a cache line because it immediately follows
                         * the extent buffer's header and we have recently accessed
                         * the header's level field.
                         */
                        ret = btrfs_comp_keys(&first_key, key);
                        if (ret < 0) {
                                /*
                                 * The first key is smaller than the key we want
                                 * to insert, so we are safe to unlock all upper
                                 * nodes and we have to do the binary search.
                                 *
                                 * We do use btrfs_unlock_up_safe() and not
                                 * unlock_up() because the latter does not unlock
                                 * nodes with a slot of 0 - we can safely unlock
                                 * any node even if its slot is 0 since in this
                                 * case the key does not end up at slot 0 of the
                                 * leaf and there's no need to split the leaf.
                                 */
                                btrfs_unlock_up_safe(path, 1);
                                /* Slot 0 is already ruled out, start at slot 1. */
                                search_low_slot = 1;
                        } else {
                                /*
                                 * The first key is >= than the key we want to
                                 * insert, so we can skip the binary search as
                                 * the target key will be at slot 0.
                                 *
                                 * We can not unlock upper nodes when the key is
                                 * less than the first key, because we will need
                                 * to update the key at slot 0 of the parent node
                                 * and possibly of other upper nodes too.
                                 * If the key matches the first key, then we can
                                 * unlock all the upper nodes, using
                                 * btrfs_unlock_up_safe() instead of unlock_up()
                                 * as stated above.
                                 */
                                if (ret == 0)
                                        btrfs_unlock_up_safe(path, 1);
                                /*
                                 * ret is already 0 or 1, matching the result of
                                 * a btrfs_bin_search() call, so there is no need
                                 * to adjust it.
                                 */
                                do_bin_search = false;
                                path->slots[0] = 0;
                        }
                }
        }

        if (do_bin_search) {
                ret = search_for_key_slot(leaf, search_low_slot, key,
                                          prev_cmp, &path->slots[0]);
                if (ret < 0)
                        return ret;
        }

        if (ins_len > 0) {
                /*
                 * Item key already exists. In this case, if we are allowed to
                 * insert the item (for example, in dir_item case, item key
                 * collision is allowed), it will be merged with the original
                 * item. Only the item size grows, no new btrfs item will be
                 * added. If search_for_extension is not set, ins_len already
                 * accounts for the size of struct btrfs_item, deduct it here so
                 * the leaf space check will be correct.
                 */
                if (ret == 0 && !path->search_for_extension) {
                        ASSERT(ins_len >= sizeof(struct btrfs_item));
                        ins_len -= sizeof(struct btrfs_item);
                }

                ASSERT(leaf_free_space >= 0);

                if (leaf_free_space < ins_len) {
                        int err;

                        err = split_leaf(trans, root, key, path, ins_len,
                                         (ret == 0));
                        /*
                         * A positive value is not expected from split_leaf()
                         * here; if it ever happens, report it as -EUCLEAN
                         * instead of letting it look like "key not found".
                         */
                        ASSERT(err <= 0);
                        if (WARN_ON(err > 0))
                                err = -EUCLEAN;
                        /* On success keep the search result (0 or 1) in ret. */
                        if (err)
                                ret = err;
                }
        }

        return ret;
}

/*
 * Look for a key in a tree and perform necessary modifications to preserve
 * tree invariants.
 *
 * @trans:        Handle of transaction, used when modifying the tree
 * @p:                Holds all btree nodes along the search path
 * @root:        The root node of the tree
 * @key:        The key we are looking for
 * @ins_len:        Indicates purpose of search:
 *              >0  for inserts it's size of item inserted (*)
 *              <0  for deletions
 *               0  for plain searches, not modifying the tree
 *
 *              (*) If size of item inserted doesn't include
 *              sizeof(struct btrfs_item), then p->search_for_extension must
 *              be set.
 * @cow:        boolean should CoW operations be performed. Must always be 1
 *                when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      const struct btrfs_key *key, struct btrfs_path *p,
                      int ins_len, int cow)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *b;
        int slot;
        int ret;
        int err;
        int level;
        /* Forwarded to unlock_up(); raised to 2 below for deletions. */
        int lowest_unlock = 1;
        /* everything at write_lock_level or lower must be write locked */
        int write_lock_level = 0;
        u8 lowest_level = 0;
        /*
         * Snapshot of the initial write_lock_level, taken before any restart
         * of the search raises it. Forwarded to unlock_up().
         */
        int min_write_lock_level;
        /* Result of the key search on the parent level, see search_for_key_slot(). */
        int prev_cmp;

        might_sleep();

        lowest_level = p->lowest_level;
        WARN_ON(lowest_level && ins_len > 0);
        WARN_ON(p->nodes[0] != NULL);
        /* Inserting or deleting (ins_len != 0) requires cow to be set. */
        BUG_ON(!cow && ins_len);

        /*
         * For now only allow nowait for read only operations.  There's no
         * strict reason why we can't, we just only need it for reads so it's
         * only implemented for reads.
         */
        ASSERT(!p->nowait || !cow);

        if (ins_len < 0) {
                lowest_unlock = 2;

                /*
                 * When we are removing items, we might have to go up to level
                 * two as we update tree pointers. Make sure we keep write
                 * locks for those levels as well.
                 */
                write_lock_level = 2;
        } else if (ins_len > 0) {
                /*
                 * for inserting items, make sure we have a write lock on
                 * level 1 so we can update keys
                 */
                write_lock_level = 1;
        }

        /* Without COW no level satisfies "level <= write_lock_level". */
        if (!cow)
                write_lock_level = -1;

        /* With keep_locks or a non-zero lowest_level, write lock every level. */
        if (cow && (p->keep_locks || p->lowest_level))
                write_lock_level = BTRFS_MAX_LEVEL;

        min_write_lock_level = write_lock_level;

        /*
         * Hold commit_root_sem for reading during the whole search. It is
         * released at the 'done' label, after finish_need_commit_sem_search().
         * Nothing else has been acquired yet, so a failed trylock in the
         * nowait case can return directly.
         */
        if (p->need_commit_sem) {
                ASSERT(p->search_commit_root);
                if (p->nowait) {
                        if (!down_read_trylock(&fs_info->commit_root_sem))
                                return -EAGAIN;
                } else {
                        down_read(&fs_info->commit_root_sem);
                }
        }

again:
        /*
         * (Re)start the search from the root. Any non-zero prev_cmp makes
         * search_for_key_slot() do a real binary search on the first node.
         */
        prev_cmp = -1;
        b = btrfs_search_slot_get_root(root, p, write_lock_level);
        if (IS_ERR(b)) {
                ret = PTR_ERR(b);
                goto done;
        }

        while (b) {
                /*
                 * Set when the slot was moved back by one at this level; undone
                 * if the search stops at this level (level == lowest_level).
                 */
                int dec = 0;

                level = btrfs_header_level(b);

                if (cow) {
                        bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

                        /*
                         * if we don't really need to cow this block
                         * then we don't want to set the path blocking,
                         * so we test it here
                         */
                        if (!should_cow_block(trans, root, b))
                                goto cow_done;

                        /*
                         * must have write locks on this node and the
                         * parent
                         */
                        if (level > write_lock_level ||
                            (level + 1 > write_lock_level &&
                            level + 1 < BTRFS_MAX_LEVEL &&
                            p->nodes[level + 1])) {
                                /* Restart with write locks one level higher. */
                                write_lock_level = level + 1;
                                btrfs_release_path(p);
                                goto again;
                        }

                        /*
                         * At the topmost possible level there is no slot for a
                         * parent in the path (level + 1 would index past the
                         * last level), so no parent node is passed.
                         */
                        if (last_level)
                                err = btrfs_cow_block(trans, root, b, NULL, 0,
                                                      &b,
                                                      BTRFS_NESTING_COW);
                        else
                                err = btrfs_cow_block(trans, root, b,
                                                      p->nodes[level + 1],
                                                      p->slots[level + 1], &b,
                                                      BTRFS_NESTING_COW);
                        if (err) {
                                ret = err;
                                goto done;
                        }
                }
cow_done:
                p->nodes[level] = b;

                /*
                 * We have a lock on b and as long as we aren't changing
                 * the tree, there is no way for the items in b to change.
                 * It is safe to drop the lock on our parent before we
                 * go through the expensive btree search on b.
                 *
                 * If we're inserting or deleting (ins_len != 0), then we might
                 * be changing slot zero, which may require changing the parent.
                 * So, we can't drop the lock until after we know which slot
                 * we're operating on.
                 */
                if (!ins_len && !p->keep_locks) {
                        int u = level + 1;

                        if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
                                btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
                                p->locks[u] = 0;
                        }
                }

                if (level == 0) {
                        if (ins_len > 0)
                                ASSERT(write_lock_level >= 1);

                        /* The leaf search also splits the leaf when needed. */
                        ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
                        if (!p->search_for_split)
                                unlock_up(p, level, lowest_unlock,
                                          min_write_lock_level, NULL);
                        goto done;
                }

                ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
                if (ret < 0)
                        goto done;
                prev_cmp = ret;

                /*
                 * No exact match in this node: step back one slot and remember
                 * that we did so in 'dec'.
                 */
                if (ret && slot > 0) {
                        dec = 1;
                        slot--;
                }
                p->slots[level] = slot;
                err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
                                             &write_lock_level);
                if (err == -EAGAIN)
                        goto again;
                if (err) {
                        ret = err;
                        goto done;
                }
                /* Reload both from the path after setup_nodes_for_search(). */
                b = p->nodes[level];
                slot = p->slots[level];

                /*
                 * Slot 0 is special, if we change the key we have to update
                 * the parent pointer which means we must have a write lock on
                 * the parent
                 */
                if (slot == 0 && ins_len && write_lock_level < level + 1) {
                        write_lock_level = level + 1;
                        btrfs_release_path(p);
                        goto again;
                }

                unlock_up(p, level, lowest_unlock, min_write_lock_level,
                          &write_lock_level);

                if (level == lowest_level) {
                        /* Stopping here: undo the slot decrement done above. */
                        if (dec)
                                p->slots[level]++;
                        goto done;
                }

                err = read_block_for_search(root, p, &b, level, slot, key);
                if (err == -EAGAIN)
                        goto again;
                if (err) {
                        ret = err;
                        goto done;
                }

                if (!p->skip_locking) {
                        level = btrfs_header_level(b);

                        btrfs_maybe_reset_lockdep_class(root, b);

                        if (level <= write_lock_level) {
                                btrfs_tree_lock(b);
                                p->locks[level] = BTRFS_WRITE_LOCK;
                        } else {
                                if (p->nowait) {
                                        /*
                                         * b is not stored in the path yet, so
                                         * drop its reference here before
                                         * bailing out.
                                         */
                                        if (!btrfs_try_tree_read_lock(b)) {
                                                free_extent_buffer(b);
                                                ret = -EAGAIN;
                                                goto done;
                                        }
                                } else {
                                        btrfs_tree_read_lock(b);
                                }
                                p->locks[level] = BTRFS_READ_LOCK;
                        }
                        p->nodes[level] = b;
                }
        }
        ret = 1;
done:
        /* On error the path is released unless the caller opted out. */
        if (ret < 0 && !p->skip_release_on_error)
                btrfs_release_path(p);

        if (p->need_commit_sem) {
                int ret2;

                /*
                 * Swap the lowest node of the path for a clone while still
                 * holding commit_root_sem, then drop the semaphore. A failure
                 * here overrides the result of the search.
                 */
                ret2 = finish_need_commit_sem_search(p);
                up_read(&fs_info->commit_root_sem);
                if (ret2)
                        ret = ret2;
        }

        return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);

/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree, as
 * denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
                          struct btrfs_path *p, u64 time_seq)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *b;
        int slot;
        int ret;
        int err;
        int level;
        int lowest_unlock = 1;
        u8 lowest_level = 0;

        lowest_level = p->lowest_level;
        WARN_ON(p->nodes[0] != NULL);
        /* nowait searches are not supported by this function. */
        ASSERT(!p->nowait);

        /*
         * Commit root searches are handed to btrfs_search_slot() as a plain
         * read-only search; a non-zero time_seq is not allowed with them.
         */
        if (p->search_commit_root) {
                BUG_ON(time_seq);
                return btrfs_search_slot(NULL, root, key, p, 0, 0);
        }

again:
        /* (Re)start from the root as it was at time_seq. */
        b = btrfs_get_old_root(root, time_seq);
        if (!b) {
                ret = -EIO;
                goto done;
        }
        level = btrfs_header_level(b);
        p->locks[level] = BTRFS_READ_LOCK;

        while (b) {
                /*
                 * Set when the slot was moved back by one at this level; undone
                 * if the search stops at this level (level == lowest_level).
                 */
                int dec = 0;

                level = btrfs_header_level(b);
                p->nodes[level] = b;

                /*
                 * We have a lock on b and as long as we aren't changing
                 * the tree, there is no way for the items in b to change.
                 * It is safe to drop the lock on our parent before we
                 * go through the expensive btree search on b.
                 */
                btrfs_unlock_up_safe(p, level + 1);

                ret = btrfs_bin_search(b, 0, key, &slot);
                if (ret < 0)
                        goto done;

                /* Leaf level: record the slot and finish with ret as is. */
                if (level == 0) {
                        p->slots[level] = slot;
                        unlock_up(p, level, lowest_unlock, 0, NULL);
                        goto done;
                }

                /*
                 * No exact match in this node: step back one slot and remember
                 * that we did so in 'dec'.
                 */
                if (ret && slot > 0) {
                        dec = 1;
                        slot--;
                }
                p->slots[level] = slot;
                unlock_up(p, level, lowest_unlock, 0, NULL);

                if (level == lowest_level) {
                        /* Stopping here: undo the slot decrement done above. */
                        if (dec)
                                p->slots[level]++;
                        goto done;
                }

                err = read_block_for_search(root, p, &b, level, slot, key);
                if (err == -EAGAIN)
                        goto again;
                if (err) {
                        ret = err;
                        goto done;
                }

                /*
                 * Read lock the child and replace it with the buffer returned
                 * by the tree mod log rewind for time_seq; a NULL result is
                 * reported as -ENOMEM.
                 */
                level = btrfs_header_level(b);
                btrfs_tree_read_lock(b);
                b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
                if (!b) {
                        ret = -ENOMEM;
                        goto done;
                }
                p->locks[level] = BTRFS_READ_LOCK;
                p->nodes[level] = b;
        }
        ret = 1;
done:
        /* Any error leaves the caller with a released path. */
        if (ret < 0)
                btrfs_release_path(p);

        return ret;
}

/*
 * Search the tree again to find a leaf with smaller keys.
 * Returns 0 if it found something.
 * Returns 1 if there are no smaller keys.
 * Returns < 0 on error.
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
 */
static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
        struct btrfs_key key;
        struct btrfs_key orig_key;
        struct btrfs_disk_key found_key;
        int ret;

        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
        orig_key = key;

        if (key.offset > 0) {
                key.offset--;
        } else if (key.type > 0) {
                key.type--;
                key.offset = (u64)-1;
        } else if (key.objectid > 0) {
                key.objectid--;
                key.type = (u8)-1;
                key.offset = (u64)-1;
        } else {
                return 1;
        }

        btrfs_release_path(path);
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret <= 0)
                return ret;

        /*
         * Previous key not found. Even if we were at slot 0 of the leaf we had
         * before releasing the path and calling btrfs_search_slot(), we now may
         * be in a slot pointing to the same original key - this can happen if
         * after we released the path, one of more items were moved from a
         * sibling leaf into the front of the leaf we had due to an insertion
         * (see push_leaf_right()).
         * If we hit this case and our slot is > 0 and just decrement the slot
         * so that the caller does not process the same key again, which may or
         * may not break the caller, depending on its logic.
         */
        if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
                btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
                ret = btrfs_comp_keys(&found_key, &orig_key);
                if (ret == 0) {
                        if (path->slots[0] > 0) {
                                path->slots[0]--;
                                return 0;
                        }
                        /*
                         * At slot 0, same key as before, it means orig_key is
                         * the lowest, leftmost, key in the tree. We're done.
                         */
                        return 1;
                }
        }

        btrfs_item_key(path->nodes[0], &found_key, 0);
        ret = btrfs_comp_keys(&found_key, &key);
        /*
         * We might have had an item with the previous key in the tree right
         * before we released our path. And after we released our path, that
         * item might have been pushed to the first slot (0) of the leaf we
         * were holding due to a tree balance. Alternatively, an item with the
         * previous key can exist as the only element of a leaf (big fat item).
         * Therefore account for these 2 cases, so that our callers (like
         * btrfs_previous_item) don't miss an existing item with a key matching
         * the previous key we computed above.
         */
        if (ret <= 0)
                return 0;
        return 1;
}

/*
 * helper to use instead of search slot if no exact match is needed but
 * instead the next or previous item should be returned.
 * When find_higher is true, the next higher item is returned, the next lower
 * otherwise.
 * When return_any and find_higher are both true, and no higher item is found,
 * return the next lower instead.
 * When return_any is true and find_higher is false, and no lower item is found,
 * return the next higher instead.
 * It returns 0 if any item is found, 1 if none is found (tree empty), and
 * < 0 on error
 */
int btrfs_search_slot_for_read(struct btrfs_root *root,
                               const struct btrfs_key *key,
                               struct btrfs_path *p, int find_higher,
                               int return_any)
{
        /*
         * At most two passes: the second one only happens when return_any
         * allows falling back to the opposite direction, and return_any is
         * cleared before that pass starts.
         */
        for (;;) {
                struct extent_buffer *leaf;
                int ret;

                ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
                if (ret <= 0)
                        return ret;

                /*
                 * A return value of 1 means the path is at the position where
                 * the item should be inserted. Normally that is the next
                 * bigger item, but if the previous item is the last one in a
                 * leaf, the path points to the first free slot of that leaf,
                 * i.e. at an invalid item.
                 */
                leaf = p->nodes[0];

                if (find_higher) {
                        /* The slot holds a valid, higher item. */
                        if (p->slots[0] < btrfs_header_nritems(leaf))
                                return 0;

                        ret = btrfs_next_leaf(root, p);
                        if (ret <= 0)
                                return ret;
                        if (!return_any)
                                return 1;

                        /* No higher item found, look for the next lower one. */
                        return_any = 0;
                        find_higher = 0;
                } else {
                        /* The item right before the insert position is it. */
                        if (p->slots[0] != 0) {
                                p->slots[0]--;
                                return 0;
                        }

                        ret = btrfs_prev_leaf(root, p);
                        if (ret < 0)
                                return ret;
                        if (ret == 0) {
                                leaf = p->nodes[0];
                                /* Move off the free slot past the last item. */
                                if (p->slots[0] == btrfs_header_nritems(leaf))
                                        p->slots[0]--;
                                return 0;
                        }
                        if (!return_any)
                                return 1;

                        /* No lower item found, look for the next higher one. */
                        return_any = 0;
                        find_higher = 1;
                }

                btrfs_release_path(p);
        }
}

/*
 * Execute search and call btrfs_previous_item to traverse backwards if the item
 * was not found.
 *
 * Return 0 if found, 1 if not found and < 0 if error.
 */
int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
                           struct btrfs_path *path)
{
        int ret = btrfs_search_slot(NULL, root, key, path, 0, 0);

        /* No exact match: fall back to btrfs_previous_item(). */
        if (ret > 0)
                ret = btrfs_previous_item(root, path, key->objectid, key->type);

        if (ret != 0)
                return ret;

        /* Hand the key of the item the path now points to back to the caller. */
        btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
        return 0;
}

/*
 * Search for a valid slot for the given path.
 *
 * @root:        The root node of the tree.
 * @key:        Will contain a valid item if found.
 * @path:        The starting point to validate the slot.
 *
 * Return: 0  if the item is valid
 *         1  if not found
 *         <0 if error.
 */
int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
                              struct btrfs_path *path)
{
        int ret = 0;

        /*
         * The slot is at or past the number of items in the leaf: call
         * btrfs_next_leaf() and hand any non-zero result back to the caller.
         */
        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
                ret = btrfs_next_leaf(root, path);

        if (ret == 0)
                btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);

        return ret;
}

/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node is points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
 *
 */
static void fixup_low_keys(struct btrfs_trans_handle *trans,
                           struct btrfs_path *path,
                           struct btrfs_disk_key *key, int level)
{
        int i;
        struct extent_buffer *t;
        int ret;

        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
                int tslot = path->slots[i];

                if (!path->nodes[i])
                        break;
                t = path->nodes[i];
                ret = btrfs_tree_mod_log_insert_key(t, tslot,
                                                    BTRFS_MOD_LOG_KEY_REPLACE);
                BUG_ON(ret < 0);
                btrfs_set_node_key(t, key, tslot);
                btrfs_mark_buffer_dirty(trans, path->nodes[i]);
                if (tslot != 0)
                        break;
        }
}

/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
                             struct btrfs_path *path,
                             const struct btrfs_key *new_key)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_disk_key disk_key;
        struct extent_buffer *eb;
        int slot;

        eb = path->nodes[0];
        slot = path->slots[0];
        if (slot > 0) {
                btrfs_item_key(eb, &disk_key, slot - 1);
                if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
                        btrfs_print_leaf(eb);
                        btrfs_crit(fs_info,
                "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
                                   slot, btrfs_disk_key_objectid(&disk_key),
                                   btrfs_disk_key_type(&disk_key),
                                   btrfs_disk_key_offset(&disk_key),
                                   new_key->objectid, new_key->type,
                                   new_key->offset);
                        BUG();
                }
        }
        if (slot < btrfs_header_nritems(eb) - 1) {
                btrfs_item_key(eb, &disk_key, slot + 1);
                if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
                        btrfs_print_leaf(eb);
                        btrfs_crit(fs_info,
                "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
                                   slot, btrfs_disk_key_objectid(&disk_key),
                                   btrfs_disk_key_type(&disk_key),
                                   btrfs_disk_key_offset(&disk_key),
                                   new_key->objectid, new_key->type,
                                   new_key->offset);
                        BUG();
                }
        }

        btrfs_cpu_key_to_disk(&disk_key, new_key);
        btrfs_set_item_key(eb, &disk_key, slot);
        btrfs_mark_buffer_dirty(trans, eb);
        if (slot == 0)
                fixup_low_keys(trans, path, &disk_key, 1);
}

/*
 * Check key order of two sibling extent buffers.
 *
 * Return true if something is wrong.
 * Return false if everything is fine.
 *
 * Tree-checker only works inside one tree block, thus the following
 * corruption can not be detected by tree-checker:
 *
 * Leaf @left                        | Leaf @right
 * --------------------------------------------------------------
 * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
 *
 * Key f6 in leaf @left itself is valid, but not valid when the next
 * key in leaf @right is 7.
 * This can only be checked at tree block merge time.
 * And since tree checker has ensured all key order in each tree block
 * is correct, we only need to bother the last key of @left and the first
 * key of @right.
 */
static bool check_sibling_keys(struct extent_buffer *left,
                               struct extent_buffer *right)
{
        struct btrfs_key left_last;
        struct btrfs_key right_first;
        int level = btrfs_header_level(left);
        int nr_left = btrfs_header_nritems(left);
        int nr_right = btrfs_header_nritems(right);

        /* No key to check in one of the tree blocks */
        if (!nr_left || !nr_right)
                return false;

        if (level) {
                btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
                btrfs_node_key_to_cpu(right, &right_first, 0);
        } else {
                btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
                btrfs_item_key_to_cpu(right, &right_first, 0);
        }

        if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
                btrfs_crit(left->fs_info, "left extent buffer:");
                btrfs_print_tree(left, false);
                btrfs_crit(left->fs_info, "right extent buffer:");
                btrfs_print_tree(right, false);
                btrfs_crit(left->fs_info,
"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
                           left_last.objectid, left_last.type,
                           left_last.offset, right_first.objectid,
                           right_first.type, right_first.offset);
                return true;
        }
        return false;
}

/*
 * Try to move key pointers from the front of @src to the tail of @dst,
 * where @dst is the node directly to the left of @src in the tree.
 *
 * When @empty is zero at least 8 pointers are always left in @src.  When
 * @empty is set the function tries to drain @src completely; if that is not
 * possible it still leaves at least 8 pointers behind.
 *
 * Returns 0 if some pointers were pushed left, < 0 if there was an error
 * (the transaction is aborted), and > 0 if nothing could be pushed.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
                          struct extent_buffer *dst,
                          struct extent_buffer *src, int empty)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int src_count = btrfs_header_nritems(src);
        int dst_count = btrfs_header_nritems(dst);
        /* Start from the number of free pointer slots in @dst. */
        int nr_move = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_count;
        int err;

        WARN_ON(btrfs_header_generation(src) != trans->transid);
        WARN_ON(btrfs_header_generation(dst) != trans->transid);

        /* Too few pointers to give any away, or no room to receive them. */
        if ((!empty && src_count <= 8) || nr_move <= 0)
                return 1;

        if (!empty) {
                /* Keep 8 pointers in @src. */
                if (src_count - 8 < nr_move)
                        nr_move = src_count - 8;
        } else {
                if (src_count < nr_move)
                        nr_move = src_count;
                /*
                 * @src can not be emptied completely, so leave at least 8
                 * pointers in it.
                 */
                if (nr_move < src_count && src_count - nr_move < 8) {
                        if (nr_move <= 8)
                                return 1;
                        nr_move -= 8;
                }
        }

        /* @dst is the left eb, @src is the middle eb. */
        if (check_sibling_keys(dst, src)) {
                err = -EUCLEAN;
                btrfs_abort_transaction(trans, err);
                return err;
        }

        err = btrfs_tree_mod_log_eb_copy(dst, src, dst_count, 0, nr_move);
        if (err) {
                btrfs_abort_transaction(trans, err);
                return err;
        }

        /* Append the first nr_move pointers of @src to @dst. */
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(dst, dst_count),
                           btrfs_node_key_ptr_offset(src, 0),
                           nr_move * sizeof(struct btrfs_key_ptr));

        if (nr_move < src_count) {
                /*
                 * Close the gap at the front of @src.
                 * btrfs_tree_mod_log_eb_copy() already logged this move, so
                 * no explicit tree mod log operation is needed for it.
                 */
                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
                                      btrfs_node_key_ptr_offset(src, nr_move),
                                      (src_count - nr_move) *
                                      sizeof(struct btrfs_key_ptr));
        }

        btrfs_set_header_nritems(src, src_count - nr_move);
        btrfs_set_header_nritems(dst, dst_count + nr_move);
        btrfs_mark_buffer_dirty(trans, src);
        btrfs_mark_buffer_dirty(trans, dst);

        return 0;
}

/*
 * Try to move key pointers from the tail of @src to the front of @dst,
 * where @dst is the node directly to the right of @src in the tree.
 *
 * At most src_nritems / 2 + 1 pointers are moved, and @src is never
 * emptied.
 *
 * Returns 0 if some pointers were pushed, < 0 if there was an error (the
 * transaction is aborted), and > 0 if nothing could be pushed.
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *dst,
                              struct extent_buffer *src)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int src_count;
        int dst_count;
        int nr_move;
        int cap;
        int err;

        WARN_ON(btrfs_header_generation(src) != trans->transid);
        WARN_ON(btrfs_header_generation(dst) != trans->transid);

        src_count = btrfs_header_nritems(src);
        dst_count = btrfs_header_nritems(dst);

        /* Free pointer slots in @dst bound what can be moved. */
        nr_move = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_count;
        if (nr_move <= 0 || src_count < 4)
                return 1;

        /* Don't try to empty the node. */
        cap = src_count / 2 + 1;
        if (cap >= src_count)
                return 1;

        if (nr_move > cap)
                nr_move = cap;

        /* @dst is the right eb, @src is the middle eb. */
        if (check_sibling_keys(src, dst)) {
                err = -EUCLEAN;
                btrfs_abort_transaction(trans, err);
                return err;
        }

        /*
         * Open a gap of nr_move pointers at the front of @dst.
         * btrfs_tree_mod_log_eb_copy() logs this move, so no explicit tree
         * mod log operation is needed for it.
         */
        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, nr_move),
                              btrfs_node_key_ptr_offset(dst, 0),
                              dst_count * sizeof(struct btrfs_key_ptr));

        err = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_count - nr_move,
                                         nr_move);
        if (err) {
                btrfs_abort_transaction(trans, err);
                return err;
        }

        /* Fill the gap with the last nr_move pointers of @src. */
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(dst, 0),
                           btrfs_node_key_ptr_offset(src, src_count - nr_move),
                           nr_move * sizeof(struct btrfs_key_ptr));

        btrfs_set_header_nritems(src, src_count - nr_move);
        btrfs_set_header_nritems(dst, dst_count + nr_move);

        btrfs_mark_buffer_dirty(trans, src);
        btrfs_mark_buffer_dirty(trans, dst);

        return 0;
}

/*
 * Helper function to insert a new root level in the tree.
 *
 * A new node is allocated at @level and a single key pointer is inserted
 * into it, pointing to the existing root (path->nodes[level - 1]).  The new
 * node then replaces root->node and is recorded in @path at @level.
 *
 * Returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_path *path, int level)
{
        u64 lower_gen;
        struct extent_buffer *lower;
        struct extent_buffer *c;
        struct extent_buffer *old;
        struct btrfs_disk_key lower_key;
        int ret;

        /*
         * The path must not have a node at @level yet, and the node right
         * below it must be the current root.
         */
        BUG_ON(path->nodes[level]);
        BUG_ON(path->nodes[level-1] != root->node);

        /*
         * Fetch the first key of the old root.  With level == 1 the block
         * below is read with the item key accessor, otherwise with the node
         * key accessor.
         */
        lower = path->nodes[level-1];
        if (level == 1)
                btrfs_item_key(lower, &lower_key, 0);
        else
                btrfs_node_key(lower, &lower_key, 0);

        c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
                                   &lower_key, level, root->node->start, 0,
                                   0, BTRFS_NESTING_NEW_ROOT);
        if (IS_ERR(c))
                return PTR_ERR(c);

        root_add_used_bytes(root);

        /*
         * The new root starts with exactly one pointer: the old root, keyed
         * by the old root's first key and tagged with its header generation.
         */
        btrfs_set_header_nritems(c, 1);
        btrfs_set_node_key(c, &lower_key, 0);
        btrfs_set_node_blockptr(c, 0, lower->start);
        lower_gen = btrfs_header_generation(lower);
        /* Warn if the old root's generation is not the running transaction. */
        WARN_ON(lower_gen != trans->transid);

        btrfs_set_node_ptr_generation(c, 0, lower_gen);

        btrfs_mark_buffer_dirty(trans, c);

        old = root->node;
        ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
        if (ret < 0) {
                /*
                 * Logging the root replacement failed: give back the block
                 * that was just allocated and leave root->node untouched.
                 */
                btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
                btrfs_tree_unlock(c);
                free_extent_buffer(c);
                return ret;
        }
        /* Publish the new root. */
        rcu_assign_pointer(root->node, c);

        /* the super has an extra ref to root->node */
        free_extent_buffer(old);

        add_root_to_dirty_list(root);
        /*
         * NOTE(review): this extra reference is presumably the one owned by
         * path->nodes[level] below, with root->node keeping the reference
         * obtained at allocation - confirm.
         */
        atomic_inc(&c->refs);
        path->nodes[level] = c;
        /*
         * NOTE(review): c looks to be write locked when it comes back from
         * btrfs_alloc_tree_block() (the error path above unlocks it), which
         * is what this lock state records - confirm.
         */
        path->locks[level] = BTRFS_WRITE_LOCK;
        path->slots[level] = 0;
        return 0;
}

/*
 * Worker function that inserts a single key pointer into the node found at
 * path->nodes[level].  The node must already have room for the pointer.
 *
 * @key and @bytenr are the key and the block address stored in the new
 * pointer, @slot is the index it is inserted at.  Pointers at @slot and
 * above are shifted up by one.
 *
 * Returns 0 on success or < 0 if a tree mod log operation failed, in which
 * case the transaction is aborted.
 */
static int insert_ptr(struct btrfs_trans_handle *trans,
                      struct btrfs_path *path,
                      struct btrfs_disk_key *key, u64 bytenr,
                      int slot, int level)
{
        struct extent_buffer *node = path->nodes[level];
        int nr_ptrs;
        int err;

        BUG_ON(!node);
        btrfs_assert_tree_write_locked(node);

        nr_ptrs = btrfs_header_nritems(node);
        /* The slot must be within or right after the used pointers ... */
        BUG_ON(slot > nr_ptrs);
        /* ... and the node must not be full already. */
        BUG_ON(nr_ptrs == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));

        if (slot != nr_ptrs) {
                /* Not appending: shift the tail up to open a hole at @slot. */
                if (level) {
                        err = btrfs_tree_mod_log_insert_move(node, slot + 1,
                                                             slot,
                                                             nr_ptrs - slot);
                        if (err < 0)
                                goto abort;
                }
                memmove_extent_buffer(node,
                              btrfs_node_key_ptr_offset(node, slot + 1),
                              btrfs_node_key_ptr_offset(node, slot),
                              (nr_ptrs - slot) * sizeof(struct btrfs_key_ptr));
        }

        if (level) {
                err = btrfs_tree_mod_log_insert_key(node, slot,
                                                    BTRFS_MOD_LOG_KEY_ADD);
                if (err < 0)
                        goto abort;
        }

        /* Fill in the new pointer and account for it in the header. */
        btrfs_set_node_key(node, key, slot);
        btrfs_set_node_blockptr(node, slot, bytenr);
        WARN_ON(trans->transid == 0);
        btrfs_set_node_ptr_generation(node, slot, trans->transid);
        btrfs_set_header_nritems(node, nr_ptrs + 1);
        btrfs_mark_buffer_dirty(trans, node);

        return 0;

abort:
        btrfs_abort_transaction(trans, err);
        return err;
}

/*
 * Split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split.
 *
 * If the node is the root, a new root level is inserted above it first.
 * Otherwise, before splitting, this tries to make some room in the node by
 * pushing left and right; if that frees enough slots it returns right away.
 *
 * Returns 0 on success and < 0 on failure.
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_path *path, int level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *c;
        struct extent_buffer *split;
        struct btrfs_disk_key disk_key;
        int mid;
        int ret;
        u32 c_nritems;

        c = path->nodes[level];
        WARN_ON(btrfs_header_generation(c) != trans->transid);
        if (c == root->node) {
                /*
                 * trying to split the root, lets make a new one
                 *
                 * tree mod log: We don't log_removal old root in
                 * insert_new_root, because that root buffer will be kept as a
                 * normal node. We are going to log removal of half of the
                 * elements below with btrfs_tree_mod_log_eb_copy(). We're
                 * holding a tree lock on the buffer, which is why we cannot
                 * race with other tree_mod_log users.
                 */
                ret = insert_new_root(trans, root, path, level + 1);
                if (ret)
                        return ret;
        } else {
                ret = push_nodes_for_insert(trans, root, path, level);
                /*
                 * Re-read the node from the path (presumably the push can
                 * switch the path to a sibling).
                 */
                c = path->nodes[level];
                /*
                 * The push worked and the node now has more than 3 free
                 * pointer slots: no split needed.
                 */
                if (!ret && btrfs_header_nritems(c) <
                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
                        return 0;
                if (ret < 0)
                        return ret;
        }

        /*
         * Pointers [0, mid) stay in c, pointers [mid, c_nritems) move to the
         * new node.  The key at mid becomes the first key of the new node and
         * its key in the parent.
         */
        c_nritems = btrfs_header_nritems(c);
        mid = (c_nritems + 1) / 2;
        btrfs_node_key(c, &disk_key, mid);

        split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
                                       &disk_key, level, c->start, 0,
                                       0, BTRFS_NESTING_SPLIT);
        if (IS_ERR(split))
                return PTR_ERR(split);

        root_add_used_bytes(root);
        ASSERT(btrfs_header_level(c) == level);

        ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
        if (ret) {
                /* Drop the lock and reference on the new node, then abort. */
                btrfs_tree_unlock(split);
                free_extent_buffer(split);
                btrfs_abort_transaction(trans, ret);
                return ret;
        }
        /* Copy the upper half of c into the new node and fix both counts. */
        copy_extent_buffer(split, c,
                           btrfs_node_key_ptr_offset(split, 0),
                           btrfs_node_key_ptr_offset(c, mid),
                           (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
        btrfs_set_header_nritems(split, c_nritems - mid);
        btrfs_set_header_nritems(c, mid);

        btrfs_mark_buffer_dirty(trans, c);
        btrfs_mark_buffer_dirty(trans, split);

        /* Link the new node into the parent, right after the slot of c. */
        ret = insert_ptr(trans, path, &disk_key, split->start,
                         path->slots[level + 1] + 1, level + 1);
        if (ret < 0) {
                btrfs_tree_unlock(split);
                free_extent_buffer(split);
                return ret;
        }

        if (path->slots[level] >= mid) {
                /*
                 * The slot the path points to moved to the new node: switch
                 * the path over to it and release the old node.
                 */
                path->slots[level] -= mid;
                btrfs_tree_unlock(c);
                free_extent_buffer(c);
                path->nodes[level] = split;
                path->slots[level + 1] += 1;
        } else {
                /* The path stays in c, the new node is no longer needed here. */
                btrfs_tree_unlock(split);
                free_extent_buffer(split);
        }
        return 0;
}

/*
 * How many bytes are required to store a range of items in a leaf.
 *
 * @start and @nr select the items to account for; the range is clamped to
 * the number of items in the leaf.  The total covers both the item structs
 * and the item data.  Returns 0 when @nr is 0.
 */
static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
{
        int item_count = btrfs_header_nritems(l);
        int last;
        int bytes;

        if (nr == 0)
                return 0;

        /* Index of the last item in the (clamped) range. */
        last = (item_count < start + nr ? item_count : start + nr) - 1;

        /*
         * The data of the range spans from the offset of the last item up
         * to the end of the data of the first item.
         */
        bytes = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
        bytes = bytes - btrfs_item_offset(l, last);
        /* Plus one item struct per requested item. */
        bytes += sizeof(struct btrfs_item) * nr;
        WARN_ON(bytes < 0);

        return bytes;
}

/*
 * The space between the end of the leaf items and the start of the leaf
 * data.  IOW, how much room the leaf has left for both items and data.
 *
 * A negative result is logged at critical level and still returned to the
 * caller.
 */
int btrfs_leaf_free_space(const struct extent_buffer *leaf)
{
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        int item_count = btrfs_header_nritems(leaf);
        int free_bytes;

        free_bytes = BTRFS_LEAF_DATA_SIZE(fs_info) -
                     leaf_space_used(leaf, 0, item_count);
        if (free_bytes >= 0)
                return free_bytes;

        /* The items claim more space than a leaf can hold. */
        btrfs_crit(fs_info,
                   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
                   free_bytes,
                   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
                   leaf_space_used(leaf, 0, item_count), item_count);
        return free_bytes;
}

/*
 * Move items from the tail of the path's leaf (the left leaf) to the front
 * of @right.
 *
 * @data_size:    extra bytes added to the space accounting once the path's
 *                slot is among the items considered for pushing (or points
 *                past the last item); push_leaf_right() passes its
 *                min_data_size here
 * @empty:        when set, all items of the left leaf may be pushed
 * @right:        the right sibling; on every return path it is either put
 *                into the path or unlocked and released
 * @free_space:   free space of @right as computed by the caller
 * @left_nritems: number of items in the left leaf
 * @min_slot:     min slot controls the lowest index we're willing to push to
 *                the right.  We'll push up to and including min_slot, but no
 *                lower
 *
 * Returns 0 if items were pushed and 1 if nothing was pushed.
 */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
                                      struct btrfs_path *path,
                                      int data_size, int empty,
                                      struct extent_buffer *right,
                                      int free_space, u32 left_nritems,
                                      u32 min_slot)
{
        struct btrfs_fs_info *fs_info = right->fs_info;
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *upper = path->nodes[1];
        struct btrfs_map_token token;
        struct btrfs_disk_key disk_key;
        int slot;
        u32 i;
        int push_space = 0;
        int push_items = 0;
        u32 nr;
        u32 right_nritems;
        u32 data_end;
        u32 this_item_size;

        /*
         * nr is the lowest slot that may be pushed: 0 when emptying the leaf,
         * otherwise at least 1 so that item 0 always stays in the left leaf.
         */
        if (empty)
                nr = 0;
        else
                nr = max_t(u32, 1, min_slot);

        /* The path points past the last item: account data_size up front. */
        if (path->slots[0] >= left_nritems)
                push_space += data_size;

        slot = path->slots[1];
        /*
         * Walk the left leaf from its last item downwards and count how many
         * items (push_items) and bytes (push_space) fit into free_space.
         */
        i = left_nritems - 1;
        while (i >= nr) {
                if (!empty && push_items > 0) {
                        /* Already went below the path's slot, stop here. */
                        if (path->slots[0] > i)
                                break;
                        if (path->slots[0] == i) {
                                int space = btrfs_leaf_free_space(left);

                                if (space + push_space * 2 > free_space)
                                        break;
                        }
                }

                if (path->slots[0] == i)
                        push_space += data_size;

                this_item_size = btrfs_item_size(left, i);
                if (this_item_size + sizeof(struct btrfs_item) +
                    push_space > free_space)
                        break;

                push_items++;
                push_space += this_item_size + sizeof(struct btrfs_item);
                /* i is unsigned, so stop explicitly instead of wrapping. */
                if (i == 0)
                        break;
                i--;
        }

        if (push_items == 0)
                goto out_unlock;

        WARN_ON(!empty && push_items == left_nritems);

        /* push left to right */
        right_nritems = btrfs_header_nritems(right);

        /*
         * From here on push_space is the distance between the data end of the
         * left leaf and the data end of the first item being moved, i.e. the
         * number of data bytes handed to the copy below.
         */
        push_space = btrfs_item_data_end(left, left_nritems - push_items);
        push_space -= leaf_data_end(left);

        /* make room in the right data area */
        data_end = leaf_data_end(right);
        memmove_leaf_data(right, data_end - push_space, data_end,
                          BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

        /* copy from the left data area */
        copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
                       leaf_data_end(left), push_space);

        /* Shift the existing item structs of right up by push_items slots. */
        memmove_leaf_items(right, push_items, 0, right_nritems);

        /* copy the items from left to right */
        copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);

        /*
         * update the item pointers: recompute the offset of every item in
         * right, going down from the end of the data area by each item's
         * size.
         */
        btrfs_init_map_token(&token, right);
        right_nritems += push_items;
        btrfs_set_header_nritems(right, right_nritems);
        push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
        for (i = 0; i < right_nritems; i++) {
                push_space -= btrfs_token_item_size(&token, i);
                btrfs_set_token_item_offset(&token, i, push_space);
        }

        left_nritems -= push_items;
        btrfs_set_header_nritems(left, left_nritems);

        /* A left leaf that ended up empty gets its dirty state cleared. */
        if (left_nritems)
                btrfs_mark_buffer_dirty(trans, left);
        else
                btrfs_clear_buffer_dirty(trans, left);

        btrfs_mark_buffer_dirty(trans, right);

        /* right has a new first key, update its key in the parent. */
        btrfs_item_key(right, &disk_key, 0);
        btrfs_set_node_key(upper, &disk_key, slot + 1);
        btrfs_mark_buffer_dirty(trans, upper);

        /* then fixup the leaf pointer in the path */
        if (path->slots[0] >= left_nritems) {
                /* The path's slot moved to right: switch the path over. */
                path->slots[0] -= left_nritems;
                if (btrfs_header_nritems(path->nodes[0]) == 0)
                        btrfs_clear_buffer_dirty(trans, path->nodes[0]);
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = right;
                path->slots[1] += 1;
        } else {
                /* The path stays in the left leaf, release right. */
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }
        return 0;

out_unlock:
        btrfs_tree_unlock(right);
        free_extent_buffer(right);
        return 1;
}

/*
 * Push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.
 *
 * Returns 0 if everything worked out, 1 if nothing was pushed (there is no
 * right sibling, it does not have enough room, or COWing it hit -ENOSPC,
 * which is not fatal here) and < 0 if there were major errors.
 *
 * This will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot.
 *
 * On return the right sibling is either part of the path (return value 0
 * with the path moved into it) or unlocked and released.
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
                           *root, struct btrfs_path *path,
                           int min_data_size, int data_size,
                           int empty, u32 min_slot)
{
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *right;
        struct extent_buffer *upper;
        int slot;
        int free_space;
        u32 left_nritems;
        int ret;

        if (!path->nodes[1])
                return 1;

        slot = path->slots[1];
        upper = path->nodes[1];
        /* Our leaf is the last child of its parent: no right sibling here. */
        if (slot >= btrfs_header_nritems(upper) - 1)
                return 1;

        btrfs_assert_tree_write_locked(path->nodes[1]);

        right = btrfs_read_node_slot(upper, slot + 1);
        if (IS_ERR(right))
                return PTR_ERR(right);

        btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT);

        /* From here on "nothing pushed" is the default outcome. */
        ret = 1;

        free_space = btrfs_leaf_free_space(right);
        if (free_space < data_size)
                goto out_unlock;

        ret = btrfs_cow_block(trans, root, right, upper,
                              slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
        if (ret) {
                /*
                 * Running out of space while COWing the sibling is not fatal,
                 * the caller can fall back to other means.  Any other error
                 * is reported instead of being turned into "no room", the
                 * same way push_leaf_left() does it.
                 */
                if (ret == -ENOSPC)
                        ret = 1;
                goto out_unlock;
        }

        left_nritems = btrfs_header_nritems(left);
        if (left_nritems == 0) {
                ret = 1;
                goto out_unlock;
        }

        if (check_sibling_keys(left, right)) {
                ret = -EUCLEAN;
                btrfs_abort_transaction(trans, ret);
                goto out_unlock;
        }

        if (path->slots[0] == left_nritems && !empty) {
                /*
                 * Key greater than all keys in the leaf, right neighbor has
                 * enough room for it and we're not emptying our leaf to delete
                 * it, therefore use right neighbor to insert the new item and
                 * no need to touch/dirty our left leaf.
                 */
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
                path->nodes[0] = right;
                path->slots[0] = 0;
                path->slots[1]++;
                return 0;
        }

        /* The helper takes over the lock and reference on right. */
        return __push_leaf_right(trans, path, min_data_size, empty, right,
                                 free_space, left_nritems, min_slot);
out_unlock:
        btrfs_tree_unlock(right);
        free_extent_buffer(right);
        return ret;
}

/*
 * Move items from the front of the path's leaf (the right leaf) to the tail
 * of @left.
 *
 * @data_size:     extra bytes added to the space accounting once the path's
 *                 slot is among the items considered for pushing;
 *                 push_leaf_left() passes its min_data_size here
 * @empty:         when set, all items of the right leaf may be pushed,
 *                 otherwise the last one always stays
 * @left:          the left sibling; on every return path it is either put
 *                 into the path or unlocked and released
 * @free_space:    free space of @left as computed by the caller
 * @right_nritems: number of items in the right leaf
 * @max_slot:      max_slot can put a limit on how far into the leaf we'll
 *                 push items.  The item at 'max_slot' won't be touched.  Use
 *                 (u32)-1 to make us do all the items
 *
 * Returns 0 if items were pushed and 1 if nothing was pushed.
 */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
                                     struct btrfs_path *path, int data_size,
                                     int empty, struct extent_buffer *left,
                                     int free_space, u32 right_nritems,
                                     u32 max_slot)
{
        struct btrfs_fs_info *fs_info = left->fs_info;
        struct btrfs_disk_key disk_key;
        struct extent_buffer *right = path->nodes[0];
        int i;
        int push_space = 0;
        int push_items = 0;
        u32 old_left_nritems;
        u32 nr;
        int ret = 0;
        u32 this_item_size;
        u32 old_left_item_size;
        struct btrfs_map_token token;

        /* nr is the exclusive upper bound of the slots that may be pushed. */
        if (empty)
                nr = min(right_nritems, max_slot);
        else
                nr = min(right_nritems - 1, max_slot);

        /*
         * Walk the right leaf from its first item upwards and count how many
         * items (push_items) and bytes (push_space) fit into free_space.
         */
        for (i = 0; i < nr; i++) {
                if (!empty && push_items > 0) {
                        /* Already went past the path's slot, stop here. */
                        if (path->slots[0] < i)
                                break;
                        if (path->slots[0] == i) {
                                int space = btrfs_leaf_free_space(right);

                                if (space + push_space * 2 > free_space)
                                        break;
                        }
                }

                if (path->slots[0] == i)
                        push_space += data_size;

                this_item_size = btrfs_item_size(right, i);
                if (this_item_size + sizeof(struct btrfs_item) + push_space >
                    free_space)
                        break;

                push_items++;
                push_space += this_item_size + sizeof(struct btrfs_item);
        }

        if (push_items == 0) {
                ret = 1;
                goto out;
        }
        WARN_ON(!empty && push_items == btrfs_header_nritems(right));

        /* push data from right to left */
        copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);

        /*
         * From here on push_space is the distance between the offset of the
         * last pushed item and the end of the data area, i.e. the number of
         * data bytes handed to the copy below.
         */
        push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
                     btrfs_item_offset(right, push_items - 1);

        copy_leaf_data(left, right, leaf_data_end(left) - push_space,
                       btrfs_item_offset(right, push_items - 1), push_space);
        old_left_nritems = btrfs_header_nritems(left);
        /* old_left_nritems is unsigned, so this only fires for an empty left. */
        BUG_ON(old_left_nritems <= 0);

        /*
         * The copied item structs still carry the offsets they had in right.
         * Despite its name, old_left_item_size holds the data offset of the
         * last item left had before the push; each copied offset is lowered
         * by the distance between that offset and the end of the data area.
         */
        btrfs_init_map_token(&token, left);
        old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
                u32 ioff;

                ioff = btrfs_token_item_offset(&token, i);
                btrfs_set_token_item_offset(&token, i,
                      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
        }
        btrfs_set_header_nritems(left, old_left_nritems + push_items);

        /* fixup right node */
        if (push_items > right_nritems)
                WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
                       right_nritems);

        if (push_items < right_nritems) {
                /*
                 * Items remain in right: move their data to the end of the
                 * data area and their item structs to the front.
                 */
                push_space = btrfs_item_offset(right, push_items - 1) -
                                                  leaf_data_end(right);
                memmove_leaf_data(right,
                                  BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
                                  leaf_data_end(right), push_space);

                memmove_leaf_items(right, 0, push_items,
                                   btrfs_header_nritems(right) - push_items);
        }

        /*
         * Recompute the offset of every item remaining in right, going down
         * from the end of the data area by each item's size.
         */
        btrfs_init_map_token(&token, right);
        right_nritems -= push_items;
        btrfs_set_header_nritems(right, right_nritems);
        push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
        for (i = 0; i < right_nritems; i++) {
                push_space = push_space - btrfs_token_item_size(&token, i);
                btrfs_set_token_item_offset(&token, i, push_space);
        }

        btrfs_mark_buffer_dirty(trans, left);
        /* A right leaf that ended up empty gets its dirty state cleared. */
        if (right_nritems)
                btrfs_mark_buffer_dirty(trans, right);
        else
                btrfs_clear_buffer_dirty(trans, right);

        /* right has a new first key, propagate it to the upper levels. */
        btrfs_item_key(right, &disk_key, 0);
        fixup_low_keys(trans, path, &disk_key, 1);

        /* then fixup the leaf pointer in the path */
        if (path->slots[0] < push_items) {
                /* The path's slot moved to left: switch the path over. */
                path->slots[0] += old_left_nritems;
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = left;
                path->slots[1] -= 1;
        } else {
                /* The path stays in right, release left and adjust the slot. */
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
                path->slots[0] -= push_items;
        }
        BUG_ON(path->slots[0] < 0);
        return ret;
out:
        btrfs_tree_unlock(left);
        free_extent_buffer(left);
        return ret;
}

/*
 * Push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.
 *
 * Returns 0 if the push worked, 1 if nothing was pushed (no left sibling,
 * an empty leaf, not enough room in the sibling, or -ENOSPC while COWing
 * it) and < 0 on other errors.
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
 * items.
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
                          *root, struct btrfs_path *path, int min_data_size,
                          int data_size, int empty, u32 max_slot)
{
        struct extent_buffer *parent = path->nodes[1];
        struct extent_buffer *cur = path->nodes[0];
        struct extent_buffer *sibling;
        int parent_slot = path->slots[1];
        u32 cur_nritems;
        int room;
        int err;

        /* First child of the parent, or no parent at all: no left sibling. */
        if (parent_slot == 0 || !parent)
                return 1;

        cur_nritems = btrfs_header_nritems(cur);
        if (cur_nritems == 0)
                return 1;

        btrfs_assert_tree_write_locked(parent);

        sibling = btrfs_read_node_slot(parent, parent_slot - 1);
        if (IS_ERR(sibling))
                return PTR_ERR(sibling);

        btrfs_tree_lock_nested(sibling, BTRFS_NESTING_LEFT);

        room = btrfs_leaf_free_space(sibling);
        if (room < data_size) {
                err = 1;
                goto release;
        }

        err = btrfs_cow_block(trans, root, sibling, parent, parent_slot - 1,
                              &sibling, BTRFS_NESTING_LEFT_COW);
        if (err) {
                /* we hit -ENOSPC, but it isn't fatal here */
                if (err == -ENOSPC)
                        err = 1;
                goto release;
        }

        if (check_sibling_keys(sibling, cur)) {
                err = -EUCLEAN;
                btrfs_abort_transaction(trans, err);
                goto release;
        }

        /* The helper takes over the lock and reference on the sibling. */
        return __push_leaf_left(trans, path, min_data_size, empty, sibling,
                                room, cur_nritems, max_slot);

release:
        btrfs_tree_unlock(sibling);
        free_extent_buffer(sibling);
        return err;
}

/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 */
static noinline int copy_for_split(struct btrfs_trans_handle *trans,
                                   struct btrfs_path *path,
                                   struct extent_buffer *l,
                                   struct extent_buffer *right,
                                   int slot, int mid, int nritems)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int data_copy_size;
        int rt_data_off;
        int i;
        int ret;
        struct btrfs_disk_key disk_key;
        struct btrfs_map_token token;

        nritems = nritems - mid;
        btrfs_set_header_nritems(right, nritems);
        data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);

        copy_leaf_items(right, l, 0, mid, nritems);

        copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
                       leaf_data_end(l), data_copy_size);

        rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);

        btrfs_init_map_token(&token, right);
        for (i = 0; i < nritems; i++) {
                u32 ioff;

                ioff = btrfs_token_item_offset(&token, i);
                btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
        }

        btrfs_set_header_nritems(l, mid);
        btrfs_item_key(right, &disk_key, 0);
        ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
        if (ret < 0)
                return ret;

        btrfs_mark_buffer_dirty(trans, right);
        btrfs_mark_buffer_dirty(trans, l);
        BUG_ON(path->slots[0] != slot);

        if (mid <= slot) {
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = right;
                path->slots[0] -= mid;
                path->slots[1] += 1;
        } else {
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }

        BUG_ON(path->slots[0] < 0);

        return 0;
}

/*
 * A double split is needed when a big item has to go into the middle of a
 * leaf.  It can leave us with three mostly empty leaves:
 *
 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
 *          A                 B                 C
 *
 * To avoid that, first try to push the items on either side of the target
 * slot into the neighbouring leaves.
 *
 * Returns 0 if at least one push moved something or the slot already ended
 * up at an edge of the leaf (or the leaf has enough room), 1 if nothing
 * could be pushed, and < 0 on error.
 */
static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          int data_size)
{
        int pushed = 0;
        int space_needed;
        int slot;
        u32 nritems;
        int ret;

        slot = path->slots[0];
        space_needed = data_size;
        if (slot < btrfs_header_nritems(path->nodes[0]))
                space_needed -= btrfs_leaf_free_space(path->nodes[0]);

        /* Try to move everything after our slot into the right leaf. */
        ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
        if (ret < 0)
                return ret;
        if (!ret)
                pushed = 1;

        /*
         * The goal is to have our slot at the very start or very end of a
         * leaf; if that is already the case there is nothing left to do.
         */
        nritems = btrfs_header_nritems(path->nodes[0]);
        if (path->slots[0] == 0 || path->slots[0] == nritems)
                return 0;

        if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
                return 0;

        /* Try to move everything before our slot into the left leaf. */
        slot = path->slots[0];
        space_needed = data_size;
        if (slot > 0)
                space_needed -= btrfs_leaf_free_space(path->nodes[0]);

        ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
        if (ret < 0)
                return ret;
        if (!ret)
                pushed = 1;

        return pushed ? 0 : 1;
}

/*
 * Split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * @ins_key:   key that is about to be inserted; used as the first key of the
 *             new leaf when no existing items are moved into it
 * @data_size: number of bytes that must be free in the leaf the path points
 *             to when we are done
 * @extend:    non-zero when the caller wants room next to the existing item
 *             at path->slots[0] (setup_leaf_for_split() passes 1)
 *
 * Before splitting, items are pushed to the neighbouring leaves; if that
 * frees enough room no split happens at all.
 *
 * Returns 0 if all went well and < 0 on failure.  -EOVERFLOW is returned
 * when @extend is set and the existing item plus @data_size plus one item
 * header can never fit into a leaf.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               const struct btrfs_key *ins_key,
                               struct btrfs_path *path, int data_size,
                               int extend)
{
        struct btrfs_disk_key disk_key;
        struct extent_buffer *l;
        u32 nritems;
        int mid;
        int slot;
        struct extent_buffer *right;
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret = 0;
        int wret;
        int split;
        int num_doubles = 0;
        int tried_avoid_double = 0;

        l = path->nodes[0];
        slot = path->slots[0];
        if (extend && data_size + btrfs_item_size(l, slot) +
            sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
                return -EOVERFLOW;

        /* first try to make some room by pushing left and right */
        if (data_size && path->nodes[1]) {
                int space_needed = data_size;

                if (slot < btrfs_header_nritems(l))
                        space_needed -= btrfs_leaf_free_space(l);

                wret = push_leaf_right(trans, root, path, space_needed,
                                       space_needed, 0, 0);
                if (wret < 0)
                        return wret;
                /* only try the left sibling if the right push moved nothing */
                if (wret) {
                        space_needed = data_size;
                        if (slot > 0)
                                space_needed -= btrfs_leaf_free_space(l);
                        wret = push_leaf_left(trans, root, path, space_needed,
                                              space_needed, 0, (u32)-1);
                        if (wret < 0)
                                return wret;
                }
                /* the pushes may have moved the path to a different leaf */
                l = path->nodes[0];

                /* did the pushes work? */
                if (btrfs_leaf_free_space(l) >= data_size)
                        return 0;
        }

        /* the leaf is the root: add a level so the new leaf has a parent */
        if (!path->nodes[1]) {
                ret = insert_new_root(trans, root, path, 1);
                if (ret)
                        return ret;
        }
again:
        /*
         * Decide where to split.  'split' ends up as:
         *   0 - move no items, just hook an empty new leaf next to this one
         *   1 - ordinary split at 'mid'
         *   2 - split at 'mid' now and come back for a second split
         */
        split = 1;
        l = path->nodes[0];
        slot = path->slots[0];
        nritems = btrfs_header_nritems(l);
        mid = (nritems + 1) / 2;

        if (mid <= slot) {
                /* our slot lands in the right half */
                if (nritems == 1 ||
                    leaf_space_used(l, mid, nritems - mid) + data_size >
                        BTRFS_LEAF_DATA_SIZE(fs_info)) {
                        if (slot >= nritems) {
                                split = 0;
                        } else {
                                mid = slot;
                                if (mid != nritems &&
                                    leaf_space_used(l, mid, nritems - mid) +
                                    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
                                        if (data_size && !tried_avoid_double)
                                                goto push_for_double;
                                        split = 2;
                                }
                        }
                }
        } else {
                /* our slot lands in the left half */
                if (leaf_space_used(l, 0, mid) + data_size >
                        BTRFS_LEAF_DATA_SIZE(fs_info)) {
                        if (!extend && data_size && slot == 0) {
                                split = 0;
                        } else if ((extend || !data_size) && slot == 0) {
                                mid = 1;
                        } else {
                                mid = slot;
                                if (mid != nritems &&
                                    leaf_space_used(l, mid, nritems - mid) +
                                    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
                                        if (data_size && !tried_avoid_double)
                                                goto push_for_double;
                                        split = 2;
                                }
                        }
                }
        }

        /* first key of the new leaf: the key being inserted or item 'mid' */
        if (split == 0)
                btrfs_cpu_key_to_disk(&disk_key, ins_key);
        else
                btrfs_item_key(l, &disk_key, mid);

        /*
         * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
         * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
         * subclasses, which is 8 at the time of this patch, and we've maxed it
         * out.  In the future we could add a
         * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
         * use BTRFS_NESTING_NEW_ROOT.
         */
        right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root),
                                       &disk_key, 0, l->start, 0, 0,
                                       num_doubles ? BTRFS_NESTING_NEW_ROOT :
                                       BTRFS_NESTING_SPLIT);
        if (IS_ERR(right))
                return PTR_ERR(right);

        root_add_used_bytes(root);

        if (split == 0) {
                if (mid <= slot) {
                        /* empty leaf goes after the current one */
                        btrfs_set_header_nritems(right, 0);
                        ret = insert_ptr(trans, path, &disk_key,
                                         right->start, path->slots[1] + 1, 1);
                        if (ret < 0) {
                                btrfs_tree_unlock(right);
                                free_extent_buffer(right);
                                return ret;
                        }
                        btrfs_tree_unlock(path->nodes[0]);
                        free_extent_buffer(path->nodes[0]);
                        path->nodes[0] = right;
                        path->slots[0] = 0;
                        path->slots[1] += 1;
                } else {
                        /* empty leaf goes before the current one */
                        btrfs_set_header_nritems(right, 0);
                        ret = insert_ptr(trans, path, &disk_key,
                                         right->start, path->slots[1], 1);
                        if (ret < 0) {
                                btrfs_tree_unlock(right);
                                free_extent_buffer(right);
                                return ret;
                        }
                        btrfs_tree_unlock(path->nodes[0]);
                        free_extent_buffer(path->nodes[0]);
                        path->nodes[0] = right;
                        path->slots[0] = 0;
                        if (path->slots[1] == 0)
                                fixup_low_keys(trans, path, &disk_key, 1);
                }
                /*
                 * We create a new leaf 'right' for the required ins_len and
                 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
                 * the content of ins_len to 'right'.
                 */
                return ret;
        }

        ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
        if (ret < 0) {
                /* copy_for_split() leaves 'right' locked on failure */
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
                return ret;
        }

        if (split == 2) {
                /* at most one extra pass is ever allowed */
                BUG_ON(num_doubles != 0);
                num_doubles++;
                goto again;
        }

        return 0;

push_for_double:
        /*
         * NOTE(review): the return value of push_for_double_split() is
         * ignored, including negative errors; success is judged only by the
         * free space check below.  Confirm this is intentional.
         */
        push_for_double_split(trans, root, path, data_size);
        tried_avoid_double = 1;
        if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
                return 0;
        goto again;
}

/*
 * Make sure the leaf holding the item at path->slots[0] has at least
 * 'ins_len' bytes free, splitting the leaf if necessary.
 *
 * Only EXTENT_DATA and EXTENT_CSUM items are supported.  If the leaf is
 * already roomy enough nothing is done.  Otherwise the path is released and
 * the key is searched again with keep_locks and search_for_split set.
 *
 * Returns 0 on success, -EAGAIN if after the new search the item is gone,
 * has a different size or extent length, or the leaf now has room, and
 * another negative errno on failure.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root,
                                         struct btrfs_path *path, int ins_len)
{
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        u64 orig_extent_len = 0;
        u32 orig_item_size;
        int ret;

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
               key.type != BTRFS_EXTENT_CSUM_KEY);

        /* Fast path: there is already enough room. */
        if (btrfs_leaf_free_space(leaf) >= ins_len)
                return 0;

        /* Remember enough about the item to detect changes after relookup. */
        orig_item_size = btrfs_item_size(leaf, path->slots[0]);
        if (key.type == BTRFS_EXTENT_DATA_KEY) {
                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                orig_extent_len = btrfs_file_extent_num_bytes(leaf, fi);
        }
        btrfs_release_path(path);

        path->keep_locks = 1;
        path->search_for_split = 1;
        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        path->search_for_split = 0;
        /* Key not found any more: let the caller retry. */
        if (ret > 0)
                ret = -EAGAIN;
        if (ret < 0)
                goto out;

        /* Every mismatch below is reported as -EAGAIN. */
        ret = -EAGAIN;
        leaf = path->nodes[0];

        /* The item is no longer the one we looked at. */
        if (btrfs_item_size(leaf, path->slots[0]) != orig_item_size)
                goto out;

        /* The leaf changed and now has room. */
        if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
                goto out;

        if (key.type == BTRFS_EXTENT_DATA_KEY) {
                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                if (btrfs_file_extent_num_bytes(leaf, fi) != orig_extent_len)
                        goto out;
        }

        ret = split_leaf(trans, root, &key, path, ins_len, 1);
out:
        path->keep_locks = 0;
        /* Upper levels are only unlocked here on full success. */
        if (!ret)
                btrfs_unlock_up_safe(path, 1);
        return ret;
}

/*
 * Split the item at path->slots[0] in place into two adjacent items.
 *
 * The original item keeps its key and the first 'split_offset' bytes of its
 * data; a new item with key 'new_key' is created in the following slot and
 * receives the remaining bytes.
 *
 * Returns 0 on success, -ENOSPC if the leaf has no room for one more item
 * header and -ENOMEM if the temporary copy of the item cannot be allocated.
 */
static noinline int split_item(struct btrfs_trans_handle *trans,
                               struct btrfs_path *path,
                               const struct btrfs_key *new_key,
                               unsigned long split_offset)
{
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_disk_key disk_key;
        int orig_slot, slot;
        u32 orig_offset;
        u32 item_size;
        u32 nritems;
        char *buf;

        /*
         * Shouldn't happen because the caller must have previously called
         * setup_leaf_for_split() to make room for the new item in the leaf.
         */
        if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
                return -ENOSPC;

        orig_slot = path->slots[0];
        orig_offset = btrfs_item_offset(leaf, orig_slot);
        item_size = btrfs_item_size(leaf, orig_slot);

        /* Keep a private copy of the data, it is rewritten below. */
        buf = kmalloc(item_size, GFP_NOFS);
        if (!buf)
                return -ENOMEM;

        read_extent_buffer(leaf, buf,
                           btrfs_item_ptr_offset(leaf, orig_slot), item_size);

        /* Open a hole in the item array right after the original item. */
        slot = orig_slot + 1;
        nritems = btrfs_header_nritems(leaf);
        if (slot != nritems)
                memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);

        btrfs_cpu_key_to_disk(&disk_key, new_key);
        btrfs_set_item_key(leaf, &disk_key, slot);

        /* The new item takes the low part of the old data range ... */
        btrfs_set_item_offset(leaf, slot, orig_offset);
        btrfs_set_item_size(leaf, slot, item_size - split_offset);

        /* ... and the original item shrinks to the high part. */
        btrfs_set_item_offset(leaf, orig_slot,
                              orig_offset + item_size - split_offset);
        btrfs_set_item_size(leaf, orig_slot, split_offset);

        btrfs_set_header_nritems(leaf, nritems + 1);

        /* First 'split_offset' bytes stay with the original item. */
        write_extent_buffer(leaf, buf,
                            btrfs_item_ptr_offset(leaf, orig_slot),
                            split_offset);

        /* The remainder becomes the data of the new item. */
        write_extent_buffer(leaf, buf + split_offset,
                            btrfs_item_ptr_offset(leaf, slot),
                            item_size - split_offset);
        btrfs_mark_buffer_dirty(trans, leaf);

        BUG_ON(btrfs_leaf_free_space(leaf) < 0);
        kfree(buf);
        return 0;
}

/*
 * Split a single item into two items, giving 'new_key' to the new item and
 * cutting the old one at split_offset (counted from the start of the item).
 *
 * The path may be released by this operation.  After the split, the path is
 * pointing to the old item.  The new item is going to be in the same node as
 * the old one.
 *
 * Note, the item being split must be small enough to live alone on a tree
 * block with room for one extra struct btrfs_item.
 *
 * This allows us to split the item in place, keeping a lock on the leaf the
 * entire time.
 *
 * Returns 0 on success, otherwise the error from setup_leaf_for_split() or
 * split_item().
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_path *path,
                     const struct btrfs_key *new_key,
                     unsigned long split_offset)
{
        /* Make room for one more item header in the leaf first. */
        int ret = setup_leaf_for_split(trans, root, path,
                                       sizeof(struct btrfs_item));

        if (ret)
                return ret;

        return split_item(trans, path, new_key, split_offset);
}

/*
 * Make the item pointed to by the path smaller.
 *
 * @new_size: size of the item after the truncation
 * @from_end: non-zero to chop bytes off the end of the item; zero to chop
 *            them off the front, in which case the key offset of the item is
 *            increased by the number of bytes removed
 *
 * Nothing is done when the item already has the requested size.  The leaf is
 * marked dirty on return.
 */
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
                         struct btrfs_path *path, u32 new_size, int from_end)
{
        int slot;
        struct extent_buffer *leaf;
        u32 nritems;
        unsigned int data_end;
        unsigned int old_data_start;
        unsigned int old_size;
        unsigned int size_diff;
        int i;
        struct btrfs_map_token token;

        leaf = path->nodes[0];
        slot = path->slots[0];

        /*
         * NOTE(review): the item at 'slot' is read here, before the slot is
         * validated by the BUG_ON()s below.
         */
        old_size = btrfs_item_size(leaf, slot);
        if (old_size == new_size)
                return;

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(leaf);

        old_data_start = btrfs_item_offset(leaf, slot);

        /* number of bytes being removed from the item */
        size_diff = old_size - new_size;

        BUG_ON(slot < 0);
        BUG_ON(slot >= nritems);

        /*
         * item0..itemN ... dataN.offset..dataN.size .. data0.size
         */
        /* first correct the data pointers */
        btrfs_init_map_token(&token, leaf);
        for (i = slot; i < nritems; i++) {
                u32 ioff;

                ioff = btrfs_token_item_offset(&token, i);
                btrfs_set_token_item_offset(&token, i, ioff + size_diff);
        }

        /* shift the data */
        if (from_end) {
                /*
                 * Move the data of the following items plus the first
                 * new_size bytes of this item up by size_diff.
                 */
                memmove_leaf_data(leaf, data_end + size_diff, data_end,
                                  old_data_start + new_size - data_end);
        } else {
                struct btrfs_disk_key disk_key;
                u64 offset;

                btrfs_item_key(leaf, &disk_key, slot);

                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
                        unsigned long ptr;
                        struct btrfs_file_extent_item *fi;

                        /*
                         * The item offset was already bumped above, so
                         * btrfs_item_ptr() yields the new start; subtracting
                         * size_diff gets back to the old start of the item.
                         */
                        fi = btrfs_item_ptr(leaf, slot,
                                            struct btrfs_file_extent_item);
                        fi = (struct btrfs_file_extent_item *)(
                             (unsigned long)fi - size_diff);

                        /*
                         * For inline extents copy the leading
                         * BTRFS_FILE_EXTENT_INLINE_DATA_START bytes from the
                         * old start of the item to its new start.
                         */
                        if (btrfs_file_extent_type(leaf, fi) ==
                            BTRFS_FILE_EXTENT_INLINE) {
                                ptr = btrfs_item_ptr_offset(leaf, slot);
                                memmove_extent_buffer(leaf, ptr,
                                      (unsigned long)fi,
                                      BTRFS_FILE_EXTENT_INLINE_DATA_START);
                        }
                }

                /* move only the data of the items that follow this one */
                memmove_leaf_data(leaf, data_end + size_diff, data_end,
                                  old_data_start - data_end);

                /* the item now starts size_diff bytes later: adjust its key */
                offset = btrfs_disk_key_offset(&disk_key);
                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
                btrfs_set_item_key(leaf, &disk_key, slot);
                /* first key of the leaf changed, propagate it upwards */
                if (slot == 0)
                        fixup_low_keys(trans, path, &disk_key, 1);
        }

        btrfs_set_item_size(leaf, slot, new_size);
        btrfs_mark_buffer_dirty(trans, leaf);

        if (btrfs_leaf_free_space(leaf) < 0) {
                btrfs_print_leaf(leaf);
                BUG();
        }
}

/*
 * Grow the item pointed to by the path by 'data_size' bytes.
 *
 * The leaf must already have at least 'data_size' bytes free and the slot
 * must refer to an existing item, otherwise this BUG()s.  The leaf is marked
 * dirty on return.
 */
void btrfs_extend_item(struct btrfs_trans_handle *trans,
                       struct btrfs_path *path, u32 data_size)
{
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_map_token token;
        unsigned int data_end;
        unsigned int old_data;
        u32 nritems;
        int slot;
        int i;

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(leaf);

        if (btrfs_leaf_free_space(leaf) < data_size) {
                btrfs_print_leaf(leaf);
                BUG();
        }

        slot = path->slots[0];
        old_data = btrfs_item_data_end(leaf, slot);

        BUG_ON(slot < 0);
        if (slot >= nritems) {
                btrfs_print_leaf(leaf);
                btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
                           slot, nritems);
                BUG();
        }

        /*
         * item0..itemN ... dataN.offset..dataN.size .. data0.size
         *
         * Every item from 'slot' onwards has its data moved down by
         * data_size, so lower their offsets first.
         */
        btrfs_init_map_token(&token, leaf);
        for (i = slot; i < nritems; i++) {
                u32 cur = btrfs_token_item_offset(&token, i);

                btrfs_set_token_item_offset(&token, i, cur - data_size);
        }

        /* Then move the data itself. */
        memmove_leaf_data(leaf, data_end - data_size, data_end,
                          old_data - data_end);

        btrfs_set_item_size(leaf, slot,
                            btrfs_item_size(leaf, slot) + data_size);
        btrfs_mark_buffer_dirty(trans, leaf);

        if (btrfs_leaf_free_space(leaf) < 0) {
                btrfs_print_leaf(leaf);
                BUG();
        }
}

/*
 * Make space in the node before inserting one or more items.
 *
 * @trans:        transaction handle
 * @root:        root we are inserting items to
 * @path:        points to the leaf/slot where we are going to insert new items
 * @batch:      information about the batch of items to insert
 *
 * Main purpose is to save stack depth by doing the bulk of the work in a
 * function that doesn't call btrfs_search_slot
 */
static void setup_items_for_insert(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root, struct btrfs_path *path,
                                   const struct btrfs_item_batch *batch)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int i;
        u32 nritems;
        unsigned int data_end;
        struct btrfs_disk_key disk_key;
        struct extent_buffer *leaf;
        int slot;
        struct btrfs_map_token token;
        u32 total_size;

        /*
         * Before anything else, update keys in the parent and other ancestors
         * if needed, then release the write locks on them, so that other tasks
         * can use them while we modify the leaf.
         */
        if (path->slots[0] == 0) {
                btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
                fixup_low_keys(trans, path, &disk_key, 1);
        }
        btrfs_unlock_up_safe(path, 1);

        leaf = path->nodes[0];
        slot = path->slots[0];

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(leaf);
        total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));

        if (btrfs_leaf_free_space(leaf) < total_size) {
                btrfs_print_leaf(leaf);
                btrfs_crit(fs_info, "not enough freespace need %u have %d",
                           total_size, btrfs_leaf_free_space(leaf));
                BUG();
        }

        btrfs_init_map_token(&token, leaf);
        if (slot != nritems) {
                unsigned int old_data = btrfs_item_data_end(leaf, slot);

                if (old_data < data_end) {
                        btrfs_print_leaf(leaf);
                        btrfs_crit(fs_info,
                "item at slot %d with data offset %u beyond data end of leaf %u",
                                   slot, old_data, data_end);
                        BUG();
                }
                /*
                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
                 */
                /* first correct the data pointers */
                for (i = slot; i < nritems; i++) {
                        u32 ioff;

                        ioff = btrfs_token_item_offset(&token, i);
                        btrfs_set_token_item_offset(&token, i,
                                                       ioff - batch->total_data_size);
                }
                /* shift the items */
                memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);

                /* shift the data */
                memmove_leaf_data(leaf, data_end - batch->total_data_size,
                                  data_end, old_data - data_end);
                data_end = old_data;
        }

        /* setup the item for the new data */
        for (i = 0; i < batch->nr; i++) {
                btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
                btrfs_set_item_key(leaf, &disk_key, slot + i);
                data_end -= batch->data_sizes[i];
                btrfs_set_token_item_offset(&token, slot + i, data_end);
                btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
        }

        btrfs_set_header_nritems(leaf, nritems + batch->nr);
        btrfs_mark_buffer_dirty(trans, leaf);

        if (btrfs_leaf_free_space(leaf) < 0) {
                btrfs_print_leaf(leaf);
                BUG();
        }
}

/*
 * Set up room and the item header for a single new item in a leaf.
 *
 * @trans:     Transaction handle.
 * @root:      The root of the btree.
 * @path:      A path pointing to the target leaf and slot.
 * @key:       The key of the new item.
 * @data_size: The size of the data associated with the new key.
 *
 * This is setup_items_for_insert() with a batch of exactly one item.
 */
void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 const struct btrfs_key *key,
                                 u32 data_size)
{
        struct btrfs_item_batch batch = {
                .keys = key,
                .data_sizes = &data_size,
                .total_data_size = data_size,
                .nr = 1,
        };

        setup_items_for_insert(trans, root, path, &batch);
}

/*
 * Reserve room in the tree for a batch of items and set up their headers.
 * The search for the first key of the batch is asked to make room for the
 * whole batch (data plus item headers).
 *
 * Returns: 0        on success
 *          -EEXIST  if the first key already exists
 *          < 0      on other errors
 */
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct btrfs_path *path,
                            const struct btrfs_item_batch *batch)
{
        const u32 total_size = batch->total_data_size +
                               (batch->nr * sizeof(struct btrfs_item));
        int ret;

        ret = btrfs_search_slot(trans, root, &batch->keys[0], path,
                                total_size, 1);
        if (ret < 0)
                return ret;
        /* An exact match means the key is already in the tree. */
        if (ret == 0)
                return -EEXIST;

        BUG_ON(path->slots[0] < 0);

        setup_items_for_insert(trans, root, path, batch);
        return 0;
}

/*
 * Given a key and some data, insert an item into the tree.
 * A temporary path is allocated here and freed again before returning.
 *
 * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the
 * error returned by btrfs_insert_empty_item().
 */
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      const struct btrfs_key *cpu_key, void *data,
                      u32 data_size)
{
        struct btrfs_path *path = btrfs_alloc_path();
        int ret;

        if (!path)
                return -ENOMEM;

        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
        if (ret == 0) {
                struct extent_buffer *leaf = path->nodes[0];

                /* Copy the caller's data into the freshly reserved item. */
                write_extent_buffer(leaf, data,
                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
                                    data_size);
                btrfs_mark_buffer_dirty(trans, leaf);
        }

        btrfs_free_path(path);
        return ret;
}

/*
 * Duplicate the item the path points to, giving 'new_key' to the copy.
 * Both items are guaranteed to live in the same tree leaf and the new item
 * is placed in the slot right after the original one.
 *
 * This allows us to split a file extent in place, keeping a lock on the leaf
 * the entire time.
 *
 * On success the path points at the new item.  Returns 0 on success or the
 * error from setup_leaf_for_split().
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         struct btrfs_path *path,
                         const struct btrfs_key *new_key)
{
        struct extent_buffer *leaf = path->nodes[0];
        const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
        int ret;

        /* Need room for a second copy of the data plus one item header. */
        ret = setup_leaf_for_split(trans, root, path,
                                   item_size + sizeof(struct btrfs_item));
        if (ret)
                return ret;

        /* Insert the copy in the slot following the original. */
        path->slots[0]++;
        btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);

        /* Re-read the leaf from the path before copying the item data. */
        leaf = path->nodes[0];
        memcpy_extent_buffer(leaf,
                             btrfs_item_ptr_offset(leaf, path->slots[0]),
                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
                             item_size);
        return 0;
}

/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 *
 * This is exported for use inside btrfs-progs, don't un-export it.
 */
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct btrfs_path *path, int level, int slot)
{
        struct extent_buffer *parent = path->nodes[level];
        u32 nritems;
        int ret;

        nritems = btrfs_header_nritems(parent);
        if (slot != nritems - 1) {
                if (level) {
                        ret = btrfs_tree_mod_log_insert_move(parent, slot,
                                        slot + 1, nritems - slot - 1);
                        if (ret < 0) {
                                btrfs_abort_transaction(trans, ret);
                                return ret;
                        }
                }
                memmove_extent_buffer(parent,
                              btrfs_node_key_ptr_offset(parent, slot),
                              btrfs_node_key_ptr_offset(parent, slot + 1),
                              sizeof(struct btrfs_key_ptr) *
                              (nritems - slot - 1));
        } else if (level) {
                ret = btrfs_tree_mod_log_insert_key(parent, slot,
                                                    BTRFS_MOD_LOG_KEY_REMOVE);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }
        }

        nritems--;
        btrfs_set_header_nritems(parent, nritems);
        if (nritems == 0 && parent == root->node) {
                BUG_ON(btrfs_header_level(root->node) != 1);
                /* just turn the root into a leaf and break */
                btrfs_set_header_level(root->node, 0);
        } else if (slot == 0) {
                struct btrfs_disk_key disk_key;

                btrfs_node_key(parent, &disk_key, 0);
                fixup_low_keys(trans, path, &disk_key, level + 1);
        }
        btrfs_mark_buffer_dirty(trans, parent);
        return 0;
}

/*
 * Helper to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * The pointer in path->nodes[1] is removed and the leaf's tree block is
 * freed.  Zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct btrfs_path *path,
                                   struct extent_buffer *leaf)
{
        int ret;

        WARN_ON(btrfs_header_generation(leaf) != trans->transid);

        ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
        if (ret < 0)
                return ret;

        /*
         * btrfs_free_extent is expensive, so drop the locks on the upper
         * levels of the path before freeing the block.
         */
        btrfs_unlock_up_safe(path, 0);

        root_sub_used_bytes(root);

        /*
         * NOTE(review): presumably the extra reference taken here is the
         * one dropped by free_extent_buffer_stale() below - confirm.
         */
        atomic_inc(&leaf->refs);
        btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
        free_extent_buffer_stale(leaf);

        return 0;
}
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                    struct btrfs_path *path, int slot, int nr)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *leaf;
        int ret = 0;
        int wret;
        u32 nritems;

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);

        if (slot + nr != nritems) {
                const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
                const int data_end = leaf_data_end(leaf);
                struct btrfs_map_token token;
                u32 dsize = 0;
                int i;

                for (i = 0; i < nr; i++)
                        dsize += btrfs_item_size(leaf, slot + i);

                memmove_leaf_data(leaf, data_end + dsize, data_end,
                                  last_off - data_end);

                btrfs_init_map_token(&token, leaf);
                for (i = slot + nr; i < nritems; i++) {
                        u32 ioff;

                        ioff = btrfs_token_item_offset(&token, i);
                        btrfs_set_token_item_offset(&token, i, ioff + dsize);
                }

                memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
        }
        btrfs_set_header_nritems(leaf, nritems - nr);
        nritems -= nr;

        /* delete the leaf if we've emptied it */
        if (nritems == 0) {
                if (leaf == root->node) {
                        btrfs_set_header_level(leaf, 0);
                } else {
                        btrfs_clear_buffer_dirty(trans, leaf);
                        ret = btrfs_del_leaf(trans, root, path, leaf);
                        if (ret < 0)
                                return ret;
                }
        } else {
                int used = leaf_space_used(leaf, 0, nritems);
                if (slot == 0) {
                        struct btrfs_disk_key disk_key;

                        btrfs_item_key(leaf, &disk_key, 0);
                        fixup_low_keys(trans, path, &disk_key, 1);
                }

                /*
                 * Try to delete the leaf if it is mostly empty. We do this by
                 * trying to move all its items into its left and right neighbours.
                 * If we can't move all the items, then we don't delete it - it's
                 * not ideal, but future insertions might fill the leaf with more
                 * items, or items from other leaves might be moved later into our
                 * leaf due to deletions on those leaves.
                 */
                if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
                        u32 min_push_space;

                        /* push_leaf_left fixes the path.
                         * make sure the path still points to our leaf
                         * for possible call to btrfs_del_ptr below
                         */
                        slot = path->slots[1];
                        atomic_inc(&leaf->refs);
                        /*
                         * We want to be able to at least push one item to the
                         * left neighbour leaf, and that's the first item.
                         */
                        min_push_space = sizeof(struct btrfs_item) +
                                btrfs_item_size(leaf, 0);
                        wret = push_leaf_left(trans, root, path, 0,
                                              min_push_space, 1, (u32)-1);
                        if (wret < 0 && wret != -ENOSPC)
                                ret = wret;

                        if (path->nodes[0] == leaf &&
                            btrfs_header_nritems(leaf)) {
                                /*
                                 * If we were not able to push all items from our
                                 * leaf to its left neighbour, then attempt to
                                 * either push all the remaining items to the
                                 * right neighbour or none. There's no advantage
                                 * in pushing only some items, instead of all, as
                                 * it's pointless to end up with a leaf having
                                 * too few items while the neighbours can be full
                                 * or nearly full.
                                 */
                                nritems = btrfs_header_nritems(leaf);
                                min_push_space = leaf_space_used(leaf, 0, nritems);
                                wret = push_leaf_right(trans, root, path, 0,
                                                       min_push_space, 1, 0);
                                if (wret < 0 && wret != -ENOSPC)
                                        ret = wret;
                        }

                        if (btrfs_header_nritems(leaf) == 0) {
                                path->slots[1] = slot;
                                ret = btrfs_del_leaf(trans, root, path, leaf);
                                if (ret < 0)
                                        return ret;
                                free_extent_buffer(leaf);
                                ret = 0;
                        } else {
                                /* if we're still in the path, make sure
                                 * we're dirty.  Otherwise, one of the
                                 * push_leaf functions must have already
                                 * dirtied this buffer
                                 */
                                if (path->nodes[0] == leaf)
                                        btrfs_mark_buffer_dirty(trans, leaf);
                                free_extent_buffer(leaf);
                        }
                } else {
                        btrfs_mark_buffer_dirty(trans, leaf);
                }
        }
        return ret;
}

/*
 * A helper function to walk down the tree starting at min_key, looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * path->keep_locks is forced to 1 for the duration of the walk and restored
 * to the caller's value before returning.
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
                         struct btrfs_path *path,
                         u64 min_trans)
{
        struct extent_buffer *cur;
        struct btrfs_key found_key;
        int slot;
        int sret;
        u32 nritems;
        int level;
        int ret = 1;
        int keep_locks = path->keep_locks;

        ASSERT(!path->nowait);
        path->keep_locks = 1;
again:
        /* (Re)start from the root; the path is expected to be empty here. */
        cur = btrfs_read_lock_root_node(root);
        level = btrfs_header_level(cur);
        WARN_ON(path->nodes[level]);
        path->nodes[level] = cur;
        path->locks[level] = BTRFS_READ_LOCK;

        /* The whole tree is older than what the caller is interested in. */
        if (btrfs_header_generation(cur) < min_trans) {
                ret = 1;
                goto out;
        }
        while (1) {
                nritems = btrfs_header_nritems(cur);
                level = btrfs_header_level(cur);
                sret = btrfs_bin_search(cur, 0, min_key, &slot);
                if (sret < 0) {
                        ret = sret;
                        goto out;
                }

                /* at the lowest level, we're done, setup the path and exit */
                if (level == path->lowest_level) {
                        if (slot >= nritems)
                                goto find_next_key;
                        ret = 0;
                        path->slots[level] = slot;
                        btrfs_item_key_to_cpu(cur, &found_key, slot);
                        goto out;
                }
                /* No exact match in this node: start from the previous slot. */
                if (sret && slot > 0)
                        slot--;
                /*
                 * check this node pointer against the min_trans parameters.
                 * If it is too old, skip to the next one.
                 */
                while (slot < nritems) {
                        u64 gen;

                        gen = btrfs_node_ptr_generation(cur, slot);
                        if (gen < min_trans) {
                                slot++;
                                continue;
                        }
                        break;
                }
find_next_key:
                /*
                 * we didn't find a candidate key in this node, walk forward
                 * and find another one
                 */
                if (slot >= nritems) {
                        path->slots[level] = slot;
                        sret = btrfs_find_next_key(root, path, min_key, level,
                                                  min_trans);
                        if (sret == 0) {
                                /* min_key was advanced; search again for it. */
                                btrfs_release_path(path);
                                goto again;
                        }
                        /*
                         * sret > 0 means there are no more keys and ret is
                         * still 1 ("nothing found").  A real error must be
                         * reported as such rather than as "nothing found".
                         */
                        if (sret < 0)
                                ret = sret;
                        goto out;
                }
                /* save our key for returning back */
                btrfs_node_key_to_cpu(cur, &found_key, slot);
                path->slots[level] = slot;
                if (level == path->lowest_level) {
                        ret = 0;
                        goto out;
                }
                cur = btrfs_read_node_slot(cur, slot);
                if (IS_ERR(cur)) {
                        ret = PTR_ERR(cur);
                        goto out;
                }

                btrfs_tree_read_lock(cur);

                path->locks[level - 1] = BTRFS_READ_LOCK;
                path->nodes[level - 1] = cur;
                unlock_up(path, level, 1, 0, NULL);
        }
out:
        path->keep_locks = keep_locks;
        if (ret == 0) {
                /* Hand the key that was found back through min_key. */
                btrfs_unlock_up_safe(path, path->lowest_level + 1);
                memcpy(min_key, &found_key, sizeof(found_key));
        }
        return ret;
}

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * Starting at @level, the slot after path->slots[level] is examined.  When a
 * node is exhausted the walk moves up one level; node pointers whose
 * generation is below @min_trans are skipped.  The key that is found is
 * written to @key.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
                        struct btrfs_key *key, int level, u64 min_trans)
{
        int slot;
        struct extent_buffer *c;

        WARN_ON(!path->keep_locks && !path->skip_locking);
        while (level < BTRFS_MAX_LEVEL) {
                if (!path->nodes[level])
                        return 1;

                /* Candidate is the slot right after the current one. */
                slot = path->slots[level] + 1;
                c = path->nodes[level];
next:
                if (slot >= btrfs_header_nritems(c)) {
                        int ret;
                        int orig_lowest;
                        struct btrfs_key cur_key;
                        /* No parent to move up to: nothing follows. */
                        if (level + 1 >= BTRFS_MAX_LEVEL ||
                            !path->nodes[level + 1])
                                return 1;

                        /* The parent is usable as is, continue one level up. */
                        if (path->locks[level + 1] || path->skip_locking) {
                                level++;
                                continue;
                        }

                        /*
                         * The parent is not locked.  Remember the last key of
                         * this block, drop the path and search for that key
                         * again, stopping at this level, to get a fresh path.
                         */
                        slot = btrfs_header_nritems(c) - 1;
                        if (level == 0)
                                btrfs_item_key_to_cpu(c, &cur_key, slot);
                        else
                                btrfs_node_key_to_cpu(c, &cur_key, slot);

                        orig_lowest = path->lowest_level;
                        btrfs_release_path(path);
                        path->lowest_level = level;
                        ret = btrfs_search_slot(NULL, root, &cur_key, path,
                                                0, 0);
                        path->lowest_level = orig_lowest;
                        if (ret < 0)
                                return ret;

                        c = path->nodes[level];
                        slot = path->slots[level];
                        /* Exact match: the next key is the one after it. */
                        if (ret == 0)
                                slot++;
                        goto next;
                }

                if (level == 0)
                        btrfs_item_key_to_cpu(c, key, slot);
                else {
                        u64 gen = btrfs_node_ptr_generation(c, slot);

                        /* Too old for the caller, try the following pointer. */
                        if (gen < min_trans) {
                                slot++;
                                goto next;
                        }
                        btrfs_node_key_to_cpu(c, key, slot);
                }
                return 0;
        }
        return 1;
}

/*
 * Move @path forward to the item that follows the last item of the leaf it
 * currently points to.
 *
 * @root:     tree being walked
 * @path:     path whose nodes[0] is the current leaf; repositioned on success
 * @time_seq: when non-zero, searches are done with btrfs_search_old_slot()
 *            at this sequence number instead of btrfs_search_slot()
 *
 * The path is released and the last key of the current leaf is searched for
 * again, so the tree may have changed in between.  If the leaf found by that
 * search already holds items past the key, the path is left (or advanced) on
 * it.  Otherwise the walk goes up to the first level that has a following
 * slot and back down along the leftmost pointers to the next leaf.
 *
 * Returns 0 when @path points at the next item, 1 when there is no further
 * leaf (or the current leaf is empty) and < 0 on error, including -EAGAIN
 * when path->nowait is set and something could not be done without blocking.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
                        u64 time_seq)
{
        int slot;
        int level;
        struct extent_buffer *c;
        struct extent_buffer *next;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key key;
        bool need_commit_sem = false;
        u32 nritems;
        int ret;
        int i;

        /*
         * The nowait semantics are used only for write paths, where we don't
         * use the tree mod log and sequence numbers.
         */
        if (time_seq)
                ASSERT(!path->nowait);

        nritems = btrfs_header_nritems(path->nodes[0]);
        if (nritems == 0)
                return 1;

        /* Remember the last key so the position can be found again. */
        btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
        level = 1;
        next = NULL;
        btrfs_release_path(path);

        path->keep_locks = 1;

        if (time_seq) {
                ret = btrfs_search_old_slot(root, &key, path, time_seq);
        } else {
                if (path->need_commit_sem) {
                        if (path->nowait) {
                                if (!down_read_trylock(&fs_info->commit_root_sem)) {
                                        /*
                                         * The semaphore is not held, so the
                                         * exit path must not release it:
                                         * need_commit_sem stays false and the
                                         * path's flag stays as the caller set
                                         * it.
                                         */
                                        path->keep_locks = 0;
                                        ret = -EAGAIN;
                                        goto done;
                                }
                        } else {
                                down_read(&fs_info->commit_root_sem);
                        }
                        /*
                         * From here on the semaphore is held until the exit
                         * path.  Clearing the path's flag also keeps a later
                         * "goto again" from taking it a second time.
                         */
                        path->need_commit_sem = 0;
                        need_commit_sem = true;
                }
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        }
        path->keep_locks = 0;

        if (ret < 0)
                goto done;

        nritems = btrfs_header_nritems(path->nodes[0]);
        /*
         * by releasing the path above we dropped all our locks.  A balance
         * could have added more items next to the key that used to be
         * at the very end of the block.  So, check again here and
         * advance the path if there are now more items available.
         */
        if (nritems > 0 && path->slots[0] < nritems - 1) {
                if (ret == 0)
                        path->slots[0]++;
                ret = 0;
                goto done;
        }
        /*
         * So the above check misses one case:
         * - after releasing the path above, someone has removed the item that
         *   used to be at the very end of the block, and balance between leafs
         *   gets another one with bigger key.offset to replace it.
         *
         * This one should be returned as well, or we can get leaf corruption
         * later(esp. in __btrfs_drop_extents()).
         *
         * And a bit more explanation about this check,
         * with ret > 0, the key isn't found, the path points to the slot
         * where it should be inserted, so the path->slots[0] item must be the
         * bigger one.
         */
        if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
                ret = 0;
                goto done;
        }

        /* Go up until a node with a slot after the current one is found. */
        while (level < BTRFS_MAX_LEVEL) {
                if (!path->nodes[level]) {
                        ret = 1;
                        goto done;
                }

                slot = path->slots[level] + 1;
                c = path->nodes[level];
                if (slot >= btrfs_header_nritems(c)) {
                        level++;
                        if (level == BTRFS_MAX_LEVEL) {
                                ret = 1;
                                goto done;
                        }
                        continue;
                }


                /*
                 * Our current level is where we're going to start from, and to
                 * make sure lockdep doesn't complain we need to drop our locks
                 * and nodes from 0 to our current level.
                 */
                for (i = 0; i < level; i++) {
                        /* Test the lock of the node that is being unlocked. */
                        if (path->locks[i]) {
                                btrfs_tree_read_unlock(path->nodes[i]);
                                path->locks[i] = 0;
                        }
                        free_extent_buffer(path->nodes[i]);
                        path->nodes[i] = NULL;
                }

                next = c;
                ret = read_block_for_search(root, path, &next, level,
                                            slot, &key);
                if (ret == -EAGAIN && !path->nowait)
                        goto again;

                if (ret < 0) {
                        btrfs_release_path(path);
                        goto done;
                }

                if (!path->skip_locking) {
                        ret = btrfs_try_tree_read_lock(next);
                        if (!ret && path->nowait) {
                                /*
                                 * 'next' is not in the path yet, so its
                                 * reference has to be dropped here.
                                 */
                                free_extent_buffer(next);
                                ret = -EAGAIN;
                                goto done;
                        }
                        if (!ret && time_seq) {
                                /*
                                 * If we don't get the lock, we may be racing
                                 * with push_leaf_left, holding that lock while
                                 * itself waiting for the leaf we've currently
                                 * locked. To solve this situation, we give up
                                 * on our lock and cycle.
                                 */
                                free_extent_buffer(next);
                                btrfs_release_path(path);
                                cond_resched();
                                goto again;
                        }
                        if (!ret)
                                btrfs_tree_read_lock(next);
                }
                break;
        }
        path->slots[level] = slot;
        /* Walk back down through slot 0 of every block until the leaf. */
        while (1) {
                level--;
                path->nodes[level] = next;
                path->slots[level] = 0;
                if (!path->skip_locking)
                        path->locks[level] = BTRFS_READ_LOCK;
                if (!level)
                        break;

                ret = read_block_for_search(root, path, &next, level,
                                            0, &key);
                if (ret == -EAGAIN && !path->nowait)
                        goto again;

                if (ret < 0) {
                        btrfs_release_path(path);
                        goto done;
                }

                if (!path->skip_locking) {
                        if (path->nowait) {
                                if (!btrfs_try_tree_read_lock(next)) {
                                        /*
                                         * The block just read is not in the
                                         * path yet, drop its reference here.
                                         */
                                        free_extent_buffer(next);
                                        ret = -EAGAIN;
                                        goto done;
                                }
                        } else {
                                btrfs_tree_read_lock(next);
                        }
                }
        }
        ret = 0;
done:
        unlock_up(path, 0, 1, 0, NULL);
        if (need_commit_sem) {
                int ret2;

                /* Give the caller its flag back and release the semaphore. */
                path->need_commit_sem = 1;
                ret2 = finish_need_commit_sem_search(path);
                up_read(&fs_info->commit_root_sem);
                if (ret2)
                        ret = ret2;
        }

        return ret;
}

/*
 * Step @path to the next item.  The leaf slot is always incremented; when it
 * moves past the last item of the current leaf, the move to the following
 * leaf is delegated to btrfs_next_old_leaf() and its result is returned.
 *
 * Returns 0 if the new slot is still inside the current leaf.
 */
int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
{
        path->slots[0]++;

        /* Still inside the current leaf: nothing more to do. */
        if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
                return 0;

        return btrfs_next_old_leaf(root, path, time_seq);
}

/*
 * Walk backwards through the tree, one item at a time and using
 * btrfs_prev_leaf() to cross leaf boundaries, until an item of @type is
 * found or the walk gets past @min_objectid.
 *
 * Returns 0 if something is found (the path points at it), 1 if nothing was
 * found and < 0 on error.
 */
int btrfs_previous_item(struct btrfs_root *root,
                        struct btrfs_path *path, u64 min_objectid,
                        int type)
{
        for (;;) {
                struct btrfs_key found_key;
                struct extent_buffer *leaf;
                u32 nritems;

                /* Step back one slot, moving to the previous leaf if needed. */
                if (path->slots[0] != 0) {
                        path->slots[0]--;
                } else {
                        int ret = btrfs_prev_leaf(root, path);

                        if (ret != 0)
                                return ret;
                }

                leaf = path->nodes[0];
                nritems = btrfs_header_nritems(leaf);
                if (nritems == 0)
                        return 1;
                /* A slot one past the end is pulled back onto the last item. */
                if (path->slots[0] == nritems)
                        path->slots[0]--;

                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

                /* Went below the lowest objectid of interest. */
                if (found_key.objectid < min_objectid)
                        return 1;
                if (found_key.type == type)
                        return 0;
                /* At min_objectid only smaller types can precede this item. */
                if (found_key.objectid == min_objectid &&
                    found_key.type < type)
                        return 1;
        }
}

/*
 * Walk backwards through the extent tree until a data extent item
 * (BTRFS_EXTENT_ITEM_KEY) or a metadata extent item
 * (BTRFS_METADATA_ITEM_KEY) is found, or the walk gets past @min_objectid.
 *
 * Returns 0 if something is found (the path points at it), 1 if nothing was
 * found and < 0 on error.
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
                        struct btrfs_path *path, u64 min_objectid)
{
        for (;;) {
                struct btrfs_key found_key;
                struct extent_buffer *leaf;
                u32 nritems;

                /* Step back one slot, moving to the previous leaf if needed. */
                if (path->slots[0] != 0) {
                        path->slots[0]--;
                } else {
                        int ret = btrfs_prev_leaf(root, path);

                        if (ret != 0)
                                return ret;
                }

                leaf = path->nodes[0];
                nritems = btrfs_header_nritems(leaf);
                if (nritems == 0)
                        return 1;
                /* A slot one past the end is pulled back onto the last item. */
                if (path->slots[0] == nritems)
                        path->slots[0]--;

                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

                /* Went below the lowest objectid of interest. */
                if (found_key.objectid < min_objectid)
                        return 1;
                if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
                    found_key.type == BTRFS_METADATA_ITEM_KEY)
                        return 0;
                /* At min_objectid nothing before this can be an extent item. */
                if (found_key.objectid == min_objectid &&
                    found_key.type < BTRFS_EXTENT_ITEM_KEY)
                        return 1;
        }
}

/*
 * Init-time setup: create the cache that struct btrfs_path objects are
 * allocated from and store it in btrfs_path_cachep.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
int __init btrfs_ctree_init(void)
{
        btrfs_path_cachep = KMEM_CACHE(btrfs_path, 0);

        return btrfs_path_cachep ? 0 : -ENOMEM;
}

/* Counterpart of btrfs_ctree_init(): destroy the btrfs_path cache. */
void __cold btrfs_ctree_exit(void)
{
        kmem_cache_destroy(btrfs_path_cachep);
}










































































    5 


























    8 










































































































    2 


































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_WAIT_BIT_H
#define _LINUX_WAIT_BIT_H

/*
 * Linux wait-bit related types and methods:
 */
#include <linux/wait.h>

/*
 * Identifies what a waiter is waiting for: a bit number within a word.
 * __WAIT_BIT_KEY_INITIALIZER() fills @flags with the word pointer and
 * @bit_nr with the bit; it leaves @timeout unset.
 */
struct wait_bit_key {
        void                        *flags;	/* address of the word holding the bit */
        int                        bit_nr;	/* bit number within that word */
        unsigned long                timeout;	/* NOTE(review): units/semantics not visible here - confirm */
};

/*
 * A wait queue entry paired with the wait_bit_key it is waiting on.
 * DEFINE_WAIT_BIT() declares and initializes one of these.
 */
struct wait_bit_queue_entry {
        struct wait_bit_key        key;		/* which word/bit is waited on */
        struct wait_queue_entry        wq_entry;	/* the embedded wait queue entry */
};

/*
 * Initializer for a struct wait_bit_key naming bit @bit of @word.  Only
 * .flags and .bit_nr are set explicitly.
 */
#define __WAIT_BIT_KEY_INITIALIZER(word, bit)                                        \
        { .flags = word, .bit_nr = bit, }

/*
 * Type of the "action" callback handed to the wait functions below: it is
 * given the key being waited on and the task state mode, and returns an int.
 */
typedef int wait_bit_action_f(struct wait_bit_key *key, int mode);

/*
 * Out-of-line wait/wake primitives.  Only their declarations are visible in
 * this header; the inline wait_on_bit*() helpers further down call the
 * out_of_line_wait_on_bit*() variants once their fast-path check has found
 * the bit still set.
 */
void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit);
int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
void wake_up_bit(void *word, int bit);
int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode);
int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
struct wait_queue_head *bit_waitqueue(void *word, int bit);
extern void __init wait_bit_init(void);

/* Wake function installed in the wait queue entry by DEFINE_WAIT_BIT(). */
int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);

/*
 * Declare a struct wait_bit_queue_entry called @name that waits on bit @bit
 * of @word.  The embedded wait queue entry gets 'current' as its private
 * data, wake_bit_function() as its wake function and an empty,
 * self-referencing list head.
 */
#define DEFINE_WAIT_BIT(name, word, bit)                                        \
        struct wait_bit_queue_entry name = {                                        \
                .key = __WAIT_BIT_KEY_INITIALIZER(word, bit),                        \
                .wq_entry = {                                                        \
                        .private        = current,                                \
                        .func                = wake_bit_function,                        \
                        .entry                =                                        \
                                LIST_HEAD_INIT((name).wq_entry.entry),                \
                },                                                                \
        }

/*
 * Stock wait_bit_action_f implementations.  The inline helpers below pass
 * bit_wait, bit_wait_io and bit_wait_timeout as their action;
 * bit_wait_io_timeout is presumably the I/O flavour of the timeout variant
 * (no user is visible in this part of the header).
 */
extern int bit_wait(struct wait_bit_key *key, int mode);
extern int bit_wait_io(struct wait_bit_key *key, int mode);
extern int bit_wait_timeout(struct wait_bit_key *key, int mode);
extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode);

/**
 * wait_on_bit - wait for a bit to be cleared
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 *
 * This is the waiting half of the accessor API for the standard hashed
 * waitqueue table that exists for generic use.  Threads that need to wait
 * for a bitflag to clear, and have no intention of setting it themselves,
 * call wait_on_bit().
 *
 * Returned value will be zero if the bit was cleared, or non-zero
 * if the process received a signal and the mode permitted wakeup
 * on that signal.
 */
static inline int
wait_on_bit(unsigned long *word, int bit, unsigned mode)
{
        might_sleep();

        /* Only go out of line while the bit is still set. */
        if (test_bit_acquire(bit, word))
                return out_of_line_wait_on_bit(word, bit, bit_wait, mode);

        return 0;
}

/**
 * wait_on_bit_io - wait for a bit to be cleared
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 *
 * Waits on the standard hashed waitqueue table for a bit to be cleared.
 * It behaves like wait_on_bit(), except that the actual waiting is done
 * with io_schedule() rather than schedule().
 *
 * Returned value will be zero if the bit was cleared, or non-zero
 * if the process received a signal and the mode permitted wakeup
 * on that signal.
 */
static inline int
wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
{
        might_sleep();

        /* Only go out of line while the bit is still set. */
        if (test_bit_acquire(bit, word))
                return out_of_line_wait_on_bit(word, bit, bit_wait_io, mode);

        return 0;
}

/**
 * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 * @timeout: timeout, in jiffies
 *
 * Use the standard hashed waitqueue table to wait for a bit
 * to be cleared. This is similar to wait_on_bit(), except also takes a
 * timeout parameter.
 *
 * Returned value will be zero if the bit was cleared before the
 * @timeout elapsed, or non-zero if the @timeout elapsed or process
 * received a signal and the mode permitted wakeup on that signal.
 */
static inline int
wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
                    unsigned long timeout)
{
        might_sleep();

        /* Slow path: sleep via bit_wait_timeout, bounded by @timeout. */
        if (test_bit_acquire(bit, word))
                return out_of_line_wait_on_bit_timeout(word, bit,
                                                       bit_wait_timeout,
                                                       mode, timeout);

        /* Bit already observed clear: nothing to wait for. */
        return 0;
}

/**
 * wait_on_bit_action - wait for a bit to be cleared
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @action: the function used to sleep, which may take special actions
 * @mode: the task state to sleep in
 *
 * Use the standard hashed waitqueue table to wait for a bit
 * to be cleared, and allow the waiting action to be specified.
 * This is like wait_on_bit() but allows fine control of how the waiting
 * is done.
 *
 * Returned value will be zero if the bit was cleared, or non-zero
 * if the process received a signal and the mode permitted wakeup
 * on that signal.
 */
static inline int
wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
                   unsigned mode)
{
        might_sleep();

        /* Slow path: sleep using the caller-supplied @action. */
        if (test_bit_acquire(bit, word))
                return out_of_line_wait_on_bit(word, bit, action, mode);

        /* Bit already observed clear: nothing to wait for. */
        return 0;
}

/**
 * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 *
 * There is a standard hashed waitqueue table for generic use. This
 * is the part of the hashtable's accessor API that waits on a bit
 * when one intends to set it, for instance, trying to lock bitflags.
 * For instance, if one were to have waiters trying to set a bitflag
 * and waiting for it to clear before setting it, one would call
 * wait_on_bit_lock() in threads waiting to be able to set the bit.
 * One uses wait_on_bit_lock() where one is waiting for the bit to
 * clear with the intention of setting it, and when done, clearing it.
 *
 * Returns zero if the bit was (eventually) found to be clear and was
 * set.  Returns non-zero if a signal was delivered to the process and
 * the @mode allows that signal to wake the process.
 */
static inline int
wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
{
        might_sleep();

        /*
         * test_and_set_bit() returning non-zero means the bit was already
         * set, so fall back to the out-of-line path that sleeps and retries.
         */
        if (test_and_set_bit(bit, word))
                return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode);

        /* Bit was clear and is now set by us. */
        return 0;
}

/**
 * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @mode: the task state to sleep in
 *
 * Use the standard hashed waitqueue table to wait for a bit
 * to be cleared and then to atomically set it.  This is similar
 * to wait_on_bit_lock(), but calls io_schedule() instead of schedule()
 * for the actual waiting.
 *
 * Returns zero if the bit was (eventually) found to be clear and was
 * set.  Returns non-zero if a signal was delivered to the process and
 * the @mode allows that signal to wake the process.
 */
static inline int
wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
{
        might_sleep();

        /* Bit was already set: sleep with bit_wait_io until we can set it. */
        if (test_and_set_bit(bit, word))
                return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io,
                                                    mode);

        /* Bit was clear and is now set by us. */
        return 0;
}

/**
 * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it
 * @word: the word being waited on, a kernel virtual address
 * @bit: the bit of the word being waited on
 * @action: the function used to sleep, which may take special actions
 * @mode: the task state to sleep in
 *
 * Use the standard hashed waitqueue table to wait for a bit
 * to be cleared and then to set it, and allow the waiting action
 * to be specified.
 * This is like wait_on_bit_lock() but allows fine control of how the waiting
 * is done.
 *
 * Returns zero if the bit was (eventually) found to be clear and was
 * set.  Returns non-zero if a signal was delivered to the process and
 * the @mode allows that signal to wake the process.
 */
static inline int
wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
                        unsigned mode)
{
        might_sleep();

        /* Bit was already set: sleep using the caller-supplied @action. */
        if (test_and_set_bit(bit, word))
                return out_of_line_wait_on_bit_lock(word, bit, action, mode);

        /* Bit was clear and is now set by us. */
        return 0;
}

extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags);
extern void wake_up_var(void *var);
extern wait_queue_head_t *__var_waitqueue(void *p);

/*
 * ___wait_var_event - common wait loop behind the wait_var_event*() macros
 * @var:       address used to look up the wait queue via __var_waitqueue()
 * @condition: expression re-evaluated on every loop iteration
 * @state:     task state handed to prepare_to_wait_event()
 * @exclusive: non-zero to queue the entry with WQ_FLAG_EXCLUSIVE
 * @ret:       initial value of the local __ret (the macro's result)
 * @cmd:       statement run each iteration to actually sleep; it may
 *             reference and assign the local __ret
 *
 * Each iteration calls prepare_to_wait_event(), then tests @condition and
 * leaves the loop if it holds.  Otherwise, if @state is interruptible and
 * prepare_to_wait_event() returned non-zero, that value becomes the result
 * and control jumps to __out, skipping finish_wait().
 * NOTE(review): presumably prepare_to_wait_event() has already dequeued the
 * entry on that path -- confirm in kernel/sched/wait.c.
 *
 * The statement expression evaluates to __ret.
 */
#define ___wait_var_event(var, condition, state, exclusive, ret, cmd)        \
({                                                                        \
        __label__ __out;                                                \
        struct wait_queue_head *__wq_head = __var_waitqueue(var);        \
        struct wait_bit_queue_entry __wbq_entry;                        \
        long __ret = ret; /* explicit shadow */                                \
                                                                        \
        init_wait_var_entry(&__wbq_entry, var,                                \
                            exclusive ? WQ_FLAG_EXCLUSIVE : 0);                \
        for (;;) {                                                        \
                long __int = prepare_to_wait_event(__wq_head,                \
                                                   &__wbq_entry.wq_entry, \
                                                   state);                \
                if (condition)                                                \
                        break;                                                \
                                                                        \
                if (___wait_is_interruptible(state) && __int) {                \
                        __ret = __int;                                        \
                        goto __out;                                        \
                }                                                        \
                                                                        \
                cmd;                                                        \
        }                                                                \
        finish_wait(__wq_head, &__wbq_entry.wq_entry);                        \
__out:        __ret;                                                                \
})

/*
 * Slow path of wait_var_event(): non-exclusive wait in TASK_UNINTERRUPTIBLE,
 * sleeping with schedule() between checks of @condition.
 */
#define __wait_var_event(var, condition)                                \
        ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                          schedule())

/*
 * wait_var_event - sleep until @condition becomes true
 * @var:       address identifying the wait queue
 * @condition: expression to wait for; it is evaluated more than once
 *
 * @condition is tested once up front so that the wait queue is not touched
 * when it already holds.  Produces no value.
 * NOTE(review): waiters are presumably woken with wake_up_var(@var) --
 * confirm against the implementation.
 */
#define wait_var_event(var, condition)                                        \
do {                                                                        \
        might_sleep();                                                        \
        if (condition)                                                        \
                break;                                                        \
        __wait_var_event(var, condition);                                \
} while (0)

/*
 * Slow path of wait_var_event_killable(): non-exclusive wait in
 * TASK_KILLABLE, sleeping with schedule() between checks of @condition.
 */
#define __wait_var_event_killable(var, condition)                        \
        ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0,                \
                          schedule())

/*
 * wait_var_event_killable - sleep in TASK_KILLABLE until @condition is true
 * @var:       address identifying the wait queue
 * @condition: expression to wait for; it is evaluated more than once
 *
 * Evaluates to 0 if @condition was already true or became true while
 * waiting, otherwise to the value produced by the slow path when the wait
 * was cut short.  The local __ret here is an int; the slow path's long
 * result is assigned to it.
 */
#define wait_var_event_killable(var, condition)                                \
({                                                                        \
        int __ret = 0;                                                        \
        might_sleep();                                                        \
        if (!(condition))                                                \
                __ret = __wait_var_event_killable(var, condition);        \
        __ret;                                                                \
})

/*
 * Slow path of wait_var_event_timeout(): non-exclusive wait in
 * TASK_UNINTERRUPTIBLE.  @timeout seeds the inner __ret, and each sleep is
 * "__ret = schedule_timeout(__ret)", so __ret carries the time left.
 * The condition is wrapped in ___wait_cond_timeout(), which is defined
 * outside this chunk and is expected to read that same __ret.
 */
#define __wait_var_event_timeout(var, condition, timeout)                \
        ___wait_var_event(var, ___wait_cond_timeout(condition),                \
                          TASK_UNINTERRUPTIBLE, 0, timeout,                \
                          __ret = schedule_timeout(__ret))

/*
 * wait_var_event_timeout - sleep until @condition is true or @timeout elapses
 * @var:       address identifying the wait queue
 * @condition: expression to wait for; it is evaluated more than once
 * @timeout:   initial value of __ret, handed on to schedule_timeout()
 *
 * Evaluates to the final __ret as a long.
 * NOTE(review): the exact meaning of the result (time remaining vs. 0 on
 * timeout) is decided by ___wait_cond_timeout() in <linux/wait.h> --
 * confirm there.
 */
#define wait_var_event_timeout(var, condition, timeout)                        \
({                                                                        \
        long __ret = timeout;                                                \
        might_sleep();                                                        \
        if (!___wait_cond_timeout(condition))                                \
                __ret = __wait_var_event_timeout(var, condition, timeout); \
        __ret;                                                                \
})

/*
 * Slow path of wait_var_event_interruptible(): non-exclusive wait in
 * TASK_INTERRUPTIBLE, sleeping with schedule() between checks of @condition.
 */
#define __wait_var_event_interruptible(var, condition)                        \
        ___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0,        \
                          schedule())

/*
 * wait_var_event_interruptible - sleep in TASK_INTERRUPTIBLE until @condition
 * @var:       address identifying the wait queue
 * @condition: expression to wait for; it is evaluated more than once
 *
 * Evaluates to 0 if @condition was already true or became true while
 * waiting, otherwise to the non-zero value returned by
 * prepare_to_wait_event() when the wait was interrupted.
 */
#define wait_var_event_interruptible(var, condition)                        \
({                                                                        \
        int __ret = 0;                                                        \
        might_sleep();                                                        \
        if (!(condition))                                                \
                __ret = __wait_var_event_interruptible(var, condition);        \
        __ret;                                                                \
})

/**
 * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit
 *
 * @bit: the bit of the word being waited on
 * @word: the word being waited on, a kernel virtual address
 *
 * You can use this helper if bitflags are manipulated atomically rather than
 * non-atomically under a lock.
 */
static inline void clear_and_wake_up_bit(int bit, void *word)
{
        /* Step 1: atomically clear the bit (unlock flavour of clear_bit). */
        clear_bit_unlock(bit, word);
        /*
         * Step 2: barrier between the clear and the wakeup.
         * See wake_up_bit() for which memory barrier you need to use.
         */
        smp_mb__after_atomic();
        /* Step 3: wake waiters; note the (word, bit) argument order here. */
        wake_up_bit(word, bit);
}

#endif /* _LINUX_WAIT_BIT_H */



















































































    9 











































   12 



















   16 
   18 




   18 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_BL_H
#define _LINUX_LIST_BL_H

#include <linux/list.h>
#include <linux/bit_spinlock.h>

/*
 * Special version of lists, where head of the list has a lock in the lowest
 * bit. This is useful for scalable hash tables without increasing memory
 * footprint overhead.
 *
 * For modification operations, the 0 bit of hlist_bl_head->first
 * pointer must be set.
 *
 * With some small modifications, this can easily be adapted to store several
 * arbitrary bits (not just a single lock bit), if the need arises to store
 * some fast and compact auxiliary data.
 */

/*
 * Mask selecting the lock bit kept in hlist_bl_head->first: bit 0 when
 * CONFIG_SMP or CONFIG_DEBUG_SPINLOCK is set, and empty otherwise, in which
 * case all of the masking in this file becomes a no-op.
 * NOTE(review): presumably bit spinlocks do not touch the bit in the
 * !SMP, !DEBUG_SPINLOCK configuration -- confirm in <linux/bit_spinlock.h>.
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
#define LIST_BL_LOCKMASK        1UL
#else
#define LIST_BL_LOCKMASK        0UL
#endif

/*
 * Sanity checks in this file only trigger BUG_ON() when CONFIG_DEBUG_LIST
 * is enabled; otherwise the argument is discarded without being evaluated.
 */
#ifdef CONFIG_DEBUG_LIST
#define LIST_BL_BUG_ON(x) BUG_ON(x)
#else
#define LIST_BL_BUG_ON(x)
#endif


/*
 * List head.  @first holds the pointer to the first node with the lock bit
 * (LIST_BL_LOCKMASK) folded into its low bit; use hlist_bl_first() to read
 * the node pointer and hlist_bl_set_first() to write it.
 */
struct hlist_bl_head {
        struct hlist_bl_node *first;
};

/*
 * List node.  @next points to the following node (NULL at the tail);
 * @pprev points at whichever pointer refers to this node, i.e. the previous
 * node's @next or the head's @first.
 */
struct hlist_bl_node {
        struct hlist_bl_node *next, **pprev;
};
/* Reset a head to the empty state; this also clears the lock bit. */
#define INIT_HLIST_BL_HEAD(ptr) \
        ((ptr)->first = NULL)

/* Put @h into the "not on any list" state: both links NULL. */
static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
{
        /* A NULL pprev is what hlist_bl_unhashed() tests for. */
        h->pprev = NULL;
        h->next = NULL;
}

/* Map a node pointer @ptr back to the enclosing @type through its @member. */
#define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member)

/* True when @h has a NULL back-pointer, as left by INIT_HLIST_BL_NODE(). */
static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h)
{
        return h->pprev == NULL;
}

/* Return the first node of @h with the lock bit stripped (NULL if empty). */
static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
{
        unsigned long raw = (unsigned long)h->first;

        return (struct hlist_bl_node *)(raw & ~LIST_BL_LOCKMASK);
}

/*
 * Make @n the first node of @h.  The stored pointer always has the lock bit
 * set; with debugging on, the head must already carry that bit (be locked)
 * and @n must not have it set in its own address.
 */
static inline void hlist_bl_set_first(struct hlist_bl_head *h,
                                        struct hlist_bl_node *n)
{
        unsigned long node_bits = (unsigned long)n;

        LIST_BL_BUG_ON(node_bits & LIST_BL_LOCKMASK);
        LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
                                                        LIST_BL_LOCKMASK);
        h->first = (struct hlist_bl_node *)(node_bits | LIST_BL_LOCKMASK);
}

/* True when @h holds no node; the lock bit is ignored. */
static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
{
        unsigned long raw = (unsigned long)READ_ONCE(h->first);

        return (raw & ~LIST_BL_LOCKMASK) == 0;
}

/* Insert @n at the front of the list headed by @h. */
static inline void hlist_bl_add_head(struct hlist_bl_node *n,
                                        struct hlist_bl_head *h)
{
        struct hlist_bl_node *old_first = hlist_bl_first(h);

        /* Wire up the new node's own links first. */
        n->next = old_first;
        n->pprev = &h->first;

        /* The previous first node, if any, is now reached through @n. */
        if (old_first)
                old_first->pprev = &n->next;

        /* Publish @n in the head; this keeps the lock bit set. */
        hlist_bl_set_first(h, n);
}

/* Insert @n immediately in front of @next. */
static inline void hlist_bl_add_before(struct hlist_bl_node *n,
                                       struct hlist_bl_node *next)
{
        struct hlist_bl_node **link = next->pprev;
        uintptr_t lock_bit;

        n->pprev = link;
        n->next = next;
        next->pprev = &n->next;

        /*
         * @link may be the head's `first` pointer, so carry over whatever
         * lock bit it currently holds when storing the new node.
         */
        lock_bit = (uintptr_t)*link & LIST_BL_LOCKMASK;
        WRITE_ONCE(*link, (struct hlist_bl_node *)((uintptr_t)n | lock_bit));
}

/* Insert @n immediately after @prev. */
static inline void hlist_bl_add_behind(struct hlist_bl_node *n,
                                       struct hlist_bl_node *prev)
{
        struct hlist_bl_node *after = prev->next;

        n->next = after;
        n->pprev = &prev->next;
        prev->next = n;

        /* If @prev was not the tail, repoint its old successor at @n. */
        if (after)
                after->pprev = &n->next;
}

/* Unlink @n from its list without touching @n's own link fields. */
static inline void __hlist_bl_del(struct hlist_bl_node *n)
{
        struct hlist_bl_node *successor = n->next;
        struct hlist_bl_node **link = n->pprev;
        unsigned long lock_bit;

        LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);

        /*
         * @link may be the head's `first` pointer, so carry over whatever
         * lock bit it currently holds when bypassing @n.
         */
        lock_bit = (unsigned long)*link & LIST_BL_LOCKMASK;
        WRITE_ONCE(*link,
                   (struct hlist_bl_node *)((unsigned long)successor |
                                            lock_bit));

        if (successor)
                successor->pprev = link;
}

/* Unlink @n and poison its links so that stale use is easier to spot. */
static inline void hlist_bl_del(struct hlist_bl_node *n)
{
        __hlist_bl_del(n);

        /* pprev becomes non-NULL poison, so @n does not read as unhashed. */
        n->pprev = LIST_POISON2;
        n->next = LIST_POISON1;
}

/* Unlink @n if it is on a list and return it to the initialised state. */
static inline void hlist_bl_del_init(struct hlist_bl_node *n)
{
        /* Nothing to do for a node that is not hashed. */
        if (hlist_bl_unhashed(n))
                return;

        __hlist_bl_del(n);
        INIT_HLIST_BL_NODE(n);
}

/* Take the bit spinlock held in bit 0 of the head word. */
static inline void hlist_bl_lock(struct hlist_bl_head *b)
{
        unsigned long *lock_word = (unsigned long *)b;

        bit_spin_lock(0, lock_word);
}

/* Release the bit spinlock held in bit 0 of the head word. */
static inline void hlist_bl_unlock(struct hlist_bl_head *b)
{
        unsigned long *lock_word = (unsigned long *)b;

        __bit_spin_unlock(0, lock_word);
}

/* Report whether the bit spinlock in bit 0 of the head word is held. */
static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
{
        unsigned long *lock_word = (unsigned long *)b;

        return bit_spin_is_locked(0, lock_word);
}

/**
 * hlist_bl_for_each_entry        - iterate over list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * @pos is advanced with pos->next after the loop body has run, so the body
 * must not unlink the current entry; use hlist_bl_for_each_entry_safe()
 * for that.
 */
#define hlist_bl_for_each_entry(tpos, pos, head, member)                \
        for (pos = hlist_bl_first(head);                                \
             pos &&                                                        \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

/**
 * hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @n:                another &struct hlist_bl_node to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * The successor is saved in @n before the loop body runs, so the body may
 * unlink the current entry.
 */
#define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member)         \
        for (pos = hlist_bl_first(head);                                 \
             pos && ({ n = pos->next; 1; }) &&                                  \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \
             pos = n)

#endif




















































    1 




































































    1 








































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __UDF_DECL_H
#define __UDF_DECL_H

#define pr_fmt(fmt) "UDF-fs: " fmt

#include "ecma_167.h"
#include "osta_udf.h"

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/udf_fs_i.h>

#include "udf_sb.h"
#include "udfend.h"
#include "udf_i.h"

#define UDF_DEFAULT_PREALLOC_BLOCKS        8

/*
 * Error reporting.  _udf_err() is printf-like (format is argument 3,
 * varargs start at argument 4); udf_err() supplies the calling function's
 * name automatically.
 */
extern __printf(3, 4) void _udf_err(struct super_block *sb,
                const char *function, const char *fmt, ...);
#define udf_err(sb, fmt, ...)                                        \
        _udf_err(sb, __func__, fmt, ##__VA_ARGS__)

/* Warning counterpart of _udf_err()/udf_err(), same calling convention. */
extern __printf(3, 4) void _udf_warn(struct super_block *sb,
                const char *function, const char *fmt, ...);
#define udf_warn(sb, fmt, ...)                                        \
        _udf_warn(sb, __func__, fmt, ##__VA_ARGS__)

/* Informational message via pr_info(), prefixed with "INFO ". */
#define udf_info(fmt, ...)                                        \
        pr_info("INFO " fmt, ##__VA_ARGS__)

/* Debug message via pr_debug(), prefixed with "file:line:function: ". */
#define udf_debug(fmt, ...)                                        \
        pr_debug("%s:%d:%s: " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__)

#define UDF_EXTENT_LENGTH_MASK        0x3FFFFFFF
#define UDF_EXTENT_FLAG_MASK        0xC0000000

#define UDF_INVALID_ID ((uint32_t)-1)

#define UDF_NAME_PAD                4
#define UDF_NAME_LEN                254
#define UDF_NAME_LEN_CS0        255

/*
 * Return the size of the descriptor header selected by the inode's flags.
 * For the two file entry variants the inode's i_lenEAttr is added; for an
 * unallocated space entry (i_use) it is not.
 */
static inline size_t udf_file_entry_alloc_offset(struct inode *inode)
{
        struct udf_inode_info *iinfo = UDF_I(inode);

        /* i_use takes precedence over i_efe. */
        if (iinfo->i_use)
                return sizeof(struct unallocSpaceEntry);

        if (iinfo->i_efe)
                return sizeof(struct extendedFileEntry) + iinfo->i_lenEAttr;

        return sizeof(struct fileEntry) + iinfo->i_lenEAttr;
}

/*
 * Return udf_file_entry_alloc_offset() when the inode's allocation type is
 * ICBTAG_FLAG_AD_IN_ICB, and 0 for every other allocation type.
 */
static inline size_t udf_ext0_offset(struct inode *inode)
{
        if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB)
                return 0;

        return udf_file_entry_alloc_offset(inode);
}

/* computes tag checksum */
u8 udf_tag_checksum(const struct tag *t);

typedef uint32_t udf_pblk_t;

struct dentry;
struct inode;
struct task_struct;
struct buffer_head;
struct super_block;

extern const struct export_operations udf_export_ops;
extern const struct inode_operations udf_dir_inode_operations;
extern const struct file_operations udf_dir_operations;
extern const struct inode_operations udf_file_inode_operations;
extern const struct file_operations udf_file_operations;
extern const struct inode_operations udf_symlink_inode_operations;
extern const struct address_space_operations udf_aops;
extern const struct address_space_operations udf_symlink_aops;

/*
 * Iteration state for walking the entries of a directory.  It is the
 * argument type of the udf_fiiter_*() helpers declared under "directory.c"
 * in this header.
 */
struct udf_fileident_iter {
        struct inode *dir;                /* Directory we are working with */
        loff_t pos;                        /* Logical position in a dir */
        struct buffer_head *bh[2];        /* Buffer containing 'pos' and possibly
                                         * next buffer if entry straddles
                                         * blocks */
        struct kernel_lb_addr eloc;        /* Start of extent containing 'pos' */
        uint32_t elen;                        /* Length of extent containing 'pos' */
        sector_t loffset;                /* Block offset of 'pos' within above
                                         * extent */
        struct extent_position epos;        /* Position after the above extent */
        struct fileIdentDesc fi;        /* Copied directory entry */
        uint8_t *name;                        /* Pointer to entry name */
        uint8_t *namebuf;                /* Storage for entry name in case
                                         * the name is split between two blocks
                                         */
};

/*
 * Pairs a block number with a volume descriptor sequence number, both in
 * CPU byte order.
 * NOTE(review): presumably records where the winning copy of a volume
 * descriptor was seen -- confirm against super.c.
 */
struct udf_vds_record {
        uint32_t block;
        uint32_t volDescSeqNum;
};

/*
 * A descriptor tag followed by a little-endian sequence number.
 * NOTE(review): looks like the common leading layout of volume descriptors,
 * used to read volDescSeqNum generically -- confirm against super.c.
 */
struct generic_desc {
        struct tag        descTag;
        __le32                volDescSeqNum;
};


/* super.c */

static inline void udf_updated_lvid(struct super_block *sb)
{
        struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh;

        BUG_ON(!bh);
        WARN_ON_ONCE(((struct logicalVolIntegrityDesc *)
                     bh->b_data)->integrityType !=
                     cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN));
        UDF_SB(sb)->s_lvid_dirty = 1;
}
extern u64 lvid_get_unique_id(struct super_block *sb);
struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
                                        u32 meta_file_loc, u32 partition_num);

/* namei.c */
/*
 * Size of a directory entry: the fixed fileIdentDesc header plus its
 * implementation-use area and file identifier, rounded up to a multiple
 * of UDF_NAME_PAD.
 */
static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi)
{
        size_t unpadded = sizeof(struct fileIdentDesc) +
                          le16_to_cpu(cfi->lengthOfImpUse) +
                          cfi->lengthFileIdent;

        return ALIGN(unpadded, UDF_NAME_PAD);
}

/* file.c */
extern long udf_ioctl(struct file *, unsigned int, unsigned long);

/* inode.c */
extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *,
                                bool hidden_inode);
/* Look up the inode at @ino via __udf_iget() with hidden_inode == true. */
static inline struct inode *udf_iget_special(struct super_block *sb,
                                             struct kernel_lb_addr *ino)
{
        return __udf_iget(sb, ino, true);
}
/* Look up the inode at @ino via __udf_iget() with hidden_inode == false. */
static inline struct inode *udf_iget(struct super_block *sb,
                                     struct kernel_lb_addr *ino)
{
        return __udf_iget(sb, ino, false);
}
extern int udf_expand_file_adinicb(struct inode *);
extern struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block,
                                      int create, int *err);
extern int udf_setsize(struct inode *, loff_t);
extern void udf_evict_inode(struct inode *);
extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
                         struct kernel_lb_addr *, uint32_t *, sector_t *);
int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block,
                                   struct extent_position *epos);
extern int __udf_add_aext(struct inode *inode, struct extent_position *epos,
                          struct kernel_lb_addr *eloc, uint32_t elen, int inc);
extern int udf_add_aext(struct inode *, struct extent_position *,
                        struct kernel_lb_addr *, uint32_t, int);
extern void udf_write_aext(struct inode *, struct extent_position *,
                           struct kernel_lb_addr *, uint32_t, int);
extern int8_t udf_delete_aext(struct inode *, struct extent_position);
extern int8_t udf_next_aext(struct inode *, struct extent_position *,
                            struct kernel_lb_addr *, uint32_t *, int);
extern int8_t udf_current_aext(struct inode *, struct extent_position *,
                               struct kernel_lb_addr *, uint32_t *, int);
extern void udf_update_extra_perms(struct inode *inode, umode_t mode);

/* misc.c */
extern struct genericFormat *udf_add_extendedattr(struct inode *, uint32_t,
                                                  uint32_t, uint8_t);
extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t,
                                                  uint8_t);
extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t,
                                           uint32_t, uint16_t *);
extern struct buffer_head *udf_read_ptagged(struct super_block *,
                                            struct kernel_lb_addr *, uint32_t,
                                            uint16_t *);
extern void udf_update_tag(char *, int);
extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int);

/* lowlevel.c */
extern unsigned int udf_get_last_session(struct super_block *);
udf_pblk_t udf_get_last_block(struct super_block *);

/* partition.c */
extern uint32_t udf_get_pblock(struct super_block *, uint32_t, uint16_t,
                               uint32_t);
extern uint32_t udf_get_pblock_virt15(struct super_block *, uint32_t, uint16_t,
                                      uint32_t);
extern uint32_t udf_get_pblock_virt20(struct super_block *, uint32_t, uint16_t,
                                      uint32_t);
extern uint32_t udf_get_pblock_spar15(struct super_block *, uint32_t, uint16_t,
                                      uint32_t);
extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
                                          uint32_t);
extern int udf_relocate_blocks(struct super_block *, long, long *);

/*
 * Convenience wrapper around udf_get_pblock() that takes the logical block
 * number and partition reference number from @loc and passes @offset
 * through unchanged.
 */
static inline uint32_t
udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
                  uint32_t offset)
{
        return udf_get_pblock(sb, loc->logicalBlockNum,
                        loc->partitionReferenceNum, offset);
}

/* unicode.c */
extern int udf_get_filename(struct super_block *, const uint8_t *, int,
                            uint8_t *, int);
extern int udf_put_filename(struct super_block *, const uint8_t *, int,
                            uint8_t *, int);
extern int udf_dstrCS0toChar(struct super_block *, uint8_t *, int,
                             const uint8_t *, int);

/* ialloc.c */
extern void udf_free_inode(struct inode *);
extern struct inode *udf_new_inode(struct inode *, umode_t);

/* truncate.c */
extern void udf_truncate_tail_extent(struct inode *);
extern void udf_discard_prealloc(struct inode *);
extern int udf_truncate_extents(struct inode *);

/* balloc.c */
extern void udf_free_blocks(struct super_block *, struct inode *,
                            struct kernel_lb_addr *, uint32_t, uint32_t);
extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
                               uint32_t, uint32_t);
extern udf_pblk_t udf_new_block(struct super_block *sb, struct inode *inode,
                                 uint16_t partition, uint32_t goal, int *err);

/* directory.c */
int udf_fiiter_init(struct udf_fileident_iter *iter, struct inode *dir,
                    loff_t pos);
int udf_fiiter_advance(struct udf_fileident_iter *iter);
void udf_fiiter_release(struct udf_fileident_iter *iter);
void udf_fiiter_write_fi(struct udf_fileident_iter *iter, uint8_t *impuse);
void udf_fiiter_update_elen(struct udf_fileident_iter *iter, uint32_t new_elen);
int udf_fiiter_append_blk(struct udf_fileident_iter *iter);
extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);

/* udftime.c */
extern void udf_disk_stamp_to_time(struct timespec64 *dest,
                                                struct timestamp src);
extern void udf_time_to_disk_stamp(struct timestamp *dest, struct timespec64 src);

#endif                                /* __UDF_DECL_H */




























   14 






   13 





   10 























   14 




    5 













    4 




































    4 

    5 





    4 
    3 





    2 
    2 





    2 

    4 






    1 

    1 




    1 


   11 

   10 




    2 
   10 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#ifndef _LINUX_MMAP_LOCK_H
#define _LINUX_MMAP_LOCK_H

#include <linux/lockdep.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/rwsem.h>
#include <linux/tracepoint-defs.h>
#include <linux/types.h>

/*
 * Designated-initializer fragment for the .mmap_lock member of the object
 * called @name.  The trailing comma is part of the expansion, so the macro
 * is meant to be dropped directly into an initializer list.
 */
#define MMAP_LOCK_INITIALIZER(name) \
        .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),

DECLARE_TRACEPOINT(mmap_lock_start_locking);
DECLARE_TRACEPOINT(mmap_lock_acquire_returned);
DECLARE_TRACEPOINT(mmap_lock_released);

#ifdef CONFIG_TRACING

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
                                           bool success);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write);

/* Emit the "start locking" trace event, but only if that tracepoint is on. */
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
                                                   bool write)
{
        if (!tracepoint_enabled(mmap_lock_start_locking))
                return;

        __mmap_lock_do_trace_start_locking(mm, write);
}

/* Emit the "acquire returned" trace event, only if that tracepoint is on. */
static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
                                                      bool write, bool success)
{
        if (!tracepoint_enabled(mmap_lock_acquire_returned))
                return;

        __mmap_lock_do_trace_acquire_returned(mm, write, success);
}

/* Emit the "released" trace event, but only if that tracepoint is on. */
static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
        if (!tracepoint_enabled(mmap_lock_released))
                return;

        __mmap_lock_do_trace_released(mm, write);
}

#else /* !CONFIG_TRACING */

/*
 * Without CONFIG_TRACING the three trace hooks below are empty stubs, so
 * callers can invoke them unconditionally.
 */
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
                                                   bool write)
{
}

static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
                                                      bool write, bool success)
{
}

static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
}

#endif /* CONFIG_TRACING */

/* Assert, via rwsem_assert_held(), that @mm's mmap_lock is held. */
static inline void mmap_assert_locked(const struct mm_struct *mm)
{
        rwsem_assert_held(&mm->mmap_lock);
}

/*
 * Assert, via rwsem_assert_held_write(), that @mm's mmap_lock is currently
 * held for writing.
 */
static inline void mmap_assert_write_locked(const struct mm_struct *mm)
{
        rwsem_assert_held_write(&mm->mmap_lock);
}

#ifdef CONFIG_PER_VMA_LOCK
/*
 * Drop all currently-held per-VMA locks.
 * This is called from the mmap_lock implementation directly before releasing
 * a write-locked mmap_lock (or downgrading it to read-locked).
 * This should normally NOT be called manually from other places.
 * If you want to call this manually anyway, keep in mind that this will release
 * *all* VMA write locks, including ones from further up the stack.
 *
 * @mm: address space whose mm_lock_seq is advanced by one; its mmap_lock
 *      must be held for writing (asserted on entry).
 */
static inline void vma_end_write_all(struct mm_struct *mm)
{
        mmap_assert_write_locked(mm);
        /*
         * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
         * mmap_lock being held.
         * We need RELEASE semantics here to ensure that preceding stores into
         * the VMA take effect before we unlock it with this store.
         * Pairs with ACQUIRE semantics in vma_start_read().
         */
        smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
}
#else
/* !CONFIG_PER_VMA_LOCK: there are no per-VMA locks to drop, so do nothing. */
static inline void vma_end_write_all(struct mm_struct *mm) {}
#endif

/*
 * Initialise @mm's mmap_lock with init_rwsem(). No tracepoint is fired.
 */
static inline void mmap_init_lock(struct mm_struct *mm)
{
        init_rwsem(&mm->mmap_lock);
}

/*
 * Take @mm's mmap_lock for writing with down_write(). The start_locking
 * trace hook runs before the acquisition and the acquire_returned hook
 * (write = true, success = true) runs after it.
 */
static inline void mmap_write_lock(struct mm_struct *mm)
{
        __mmap_lock_trace_start_locking(mm, true);
        down_write(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, true, true);
}

/*
 * Same as mmap_write_lock(), but acquires through down_write_nested() and
 * forwards @subclass to it unchanged.
 */
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
        __mmap_lock_trace_start_locking(mm, true);
        down_write_nested(&mm->mmap_lock, subclass);
        __mmap_lock_trace_acquire_returned(mm, true, true);
}

/*
 * Try to take @mm's mmap_lock for writing with down_write_killable().
 *
 * The acquire_returned trace hook is told the attempt succeeded exactly when
 * down_write_killable() returned 0. That return value is handed back to the
 * caller unchanged.
 */
static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
        int err;

        __mmap_lock_trace_start_locking(mm, true);
        err = down_write_killable(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, true, !err);

        return err;
}

/*
 * Release a write-held mmap_lock. In order: fire the released trace hook,
 * end all VMA write locks with vma_end_write_all(), then up_write().
 */
static inline void mmap_write_unlock(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, true);
        vma_end_write_all(mm);
        up_write(&mm->mmap_lock);
}

/*
 * Downgrade a write-held mmap_lock to read-held via downgrade_write().
 *
 * The downgrade is traced as a successful read acquisition
 * (acquire_returned with write = false, success = true); no "released"
 * event is emitted for the write side here. VMA write locks are ended with
 * vma_end_write_all() before the downgrade itself.
 */
static inline void mmap_write_downgrade(struct mm_struct *mm)
{
        __mmap_lock_trace_acquire_returned(mm, false, true);
        vma_end_write_all(mm);
        downgrade_write(&mm->mmap_lock);
}

/*
 * Take @mm's mmap_lock for reading with down_read(). The start_locking
 * trace hook runs before the acquisition and the acquire_returned hook
 * (write = false, success = true) runs after it.
 */
static inline void mmap_read_lock(struct mm_struct *mm)
{
        __mmap_lock_trace_start_locking(mm, false);
        down_read(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, true);
}

/*
 * Try to take @mm's mmap_lock for reading with down_read_killable().
 *
 * The acquire_returned trace hook is told the attempt succeeded exactly when
 * down_read_killable() returned 0. That return value is handed back to the
 * caller unchanged.
 */
static inline int mmap_read_lock_killable(struct mm_struct *mm)
{
        int err;

        __mmap_lock_trace_start_locking(mm, false);
        err = down_read_killable(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, !err);

        return err;
}

/*
 * Attempt to take @mm's mmap_lock for reading with down_read_trylock().
 *
 * Returns true when down_read_trylock() reported a non-zero result and false
 * otherwise. The same boolean is passed to the acquire_returned trace hook
 * as its success flag.
 */
static inline bool mmap_read_trylock(struct mm_struct *mm)
{
        bool locked;

        __mmap_lock_trace_start_locking(mm, false);
        locked = !!down_read_trylock(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, locked);

        return locked;
}

/*
 * Release a read-held mmap_lock: fire the released trace hook
 * (write = false), then up_read().
 */
static inline void mmap_read_unlock(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, false);
        up_read(&mm->mmap_lock);
}

/*
 * Same as mmap_read_unlock(), except that the lock is released through
 * up_read_non_owner() instead of up_read().
 */
static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, false);
        up_read_non_owner(&mm->mmap_lock);
}

/*
 * Return the result of rwsem_is_contended() for @mm's mmap_lock.
 */
static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
        return rwsem_is_contended(&mm->mmap_lock);
}

#endif /* _LINUX_MMAP_LOCK_H */





































































    3 












    3 





   15 














    2 





    2 





    4 



















    1 





















    2 

































    3 





















   18 




   18 






    3 











    1 

















    1 
























    1 










































    1 










    1 
    1 
    1 





























    1 

    1 


























    1 

   18 












    1 






    1 



    1 






























































    3 




























































    4 





    3 
    1 

    3 






















































    3 


    3 







    1 








    1 

    2 



    3 
    2 




















































    2 















    3 






    3 



    2 







    2 

































   16 
    1 





    1 
   12 



















































   16 
   16 







    4 



























    3 






















    3 


    3 






    3 

    3 




    3 






































    3 


































    2 



    2 


    2 



    2 


    2 
    2 



    2 









    2 

    3 

    2 
    2 


























































    2 




























































    2 
































    1 




    1 















































































































































    1 







    1 




















































    1 


    1 








    1 




































    1 




    1 


    1 






















    1 
































    3 






    2 





    3 









    2 











    3 



    3 

    3 

    3 


    3 






    3 






    3 



    2 



























    1 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2005 SGI, Christoph Lameter
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 * Copyright (C) 2016 Intel, Matthew Wilcox
 * Copyright (C) 2016 Intel, Ross Zwisler
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmemleak.h>
#include <linux/percpu.h>
#include <linux/preempt.h>                /* in_interrupt() */
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/xarray.h>

#include "radix-tree.h"

/*
 * Radix tree node cache: the slab cache that the radix_tree_node objects in
 * this file are allocated from and freed back to.
 */
struct kmem_cache *radix_tree_node_cachep;

/*
 * The radix tree is variable-height, so an insert operation not only has
 * to build the branch to its corresponding item, it also has to build the
 * branch to existing items if the size has to be increased (by
 * radix_tree_extend).
 *
 * The worst case is a zero height tree with just a single item at index 0,
 * and then inserting an item at index ULONG_MAX. This requires 2 new branches
 * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared.
 * Hence the preload pool must hold two full-height branches minus the one
 * shared root node:
 */
#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)

/*
 * The IDR does not have to be as high as the radix tree since it uses
 * signed integers, not unsigned longs.
 *
 * IDR_INDEX_BITS:   number of value bits in an int (all bits but the sign).
 * IDR_MAX_PATH:     levels needed to cover IDR_INDEX_BITS when each level
 *                   consumes RADIX_TREE_MAP_SHIFT bits, rounded up.
 * IDR_PRELOAD_SIZE: same "two branches sharing a root" formula as
 *                   RADIX_TREE_PRELOAD_SIZE, applied to IDR_MAX_PATH.
 */
#define IDR_INDEX_BITS                (8 /* CHAR_BIT */ * sizeof(int) - 1)
#define IDR_MAX_PATH                (DIV_ROUND_UP(IDR_INDEX_BITS, \
                                                RADIX_TREE_MAP_SHIFT))
#define IDR_PRELOAD_SIZE        (IDR_MAX_PATH * 2 - 1)

/*
 * Per-cpu pool of preloaded nodes.
 *
 * __radix_tree_preload() refills the pool while holding the embedded local
 * lock; radix_tree_node_alloc() pops nodes from it.
 */
DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = {
        .lock = INIT_LOCAL_LOCK(lock),
};
EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads);

/*
 * Strip the RADIX_TREE_INTERNAL_NODE bit(s) from a slot value and return the
 * result as a node pointer. Inverse of node_to_entry().
 */
static inline struct radix_tree_node *entry_to_node(void *ptr)
{
        unsigned long bits = (unsigned long)ptr;

        bits &= ~RADIX_TREE_INTERNAL_NODE;
        return (void *)bits;
}

/*
 * Turn a node pointer into a slot value by OR-ing in
 * RADIX_TREE_INTERNAL_NODE. Inverse of entry_to_node().
 */
static inline void *node_to_entry(void *ptr)
{
        unsigned long bits = (unsigned long)ptr;

        return (void *)(bits | RADIX_TREE_INTERNAL_NODE);
}

/*
 * Alias for XA_RETRY_ENTRY. radix_tree_shrink() stores it in slot 0 of a node
 * it is about to free so that lookups still holding that slot retry.
 */
#define RADIX_TREE_RETRY        XA_RETRY_ENTRY

/*
 * Index of @slot within @parent->slots, or 0 when @parent is NULL.
 */
static inline unsigned long
get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot)
{
        if (!parent)
                return 0;

        return slot - parent->slots;
}

/*
 * Step one level down from @parent towards @index.
 *
 * Picks the slot of @parent that covers @index (using @parent->shift and
 * RADIX_TREE_MAP_MASK), stores the raw slot content in *@nodep and returns
 * the slot number that was chosen.
 */
static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
                        struct radix_tree_node **nodep, unsigned long index)
{
        unsigned int slot_nr;

        slot_nr = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
        *nodep = (void *)rcu_dereference_raw(parent->slots[slot_nr]);

        return slot_nr;
}

/*
 * Extract the allocation flags kept in @root->xa_flags: everything inside
 * __GFP_BITS_MASK except the GFP_ZONEMASK bits.
 */
static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
{
        const gfp_t allowed = __GFP_BITS_MASK & ~GFP_ZONEMASK;

        return root->xa_flags & allowed;
}

/*
 * Set bit @offset in @node's bitmap for @tag, using __set_bit().
 */
static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        unsigned long *bitmap = node->tags[tag];

        __set_bit(offset, bitmap);
}

/*
 * Clear bit @offset in @node's bitmap for @tag, using __clear_bit().
 */
static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        unsigned long *bitmap = node->tags[tag];

        __clear_bit(offset, bitmap);
}

/*
 * Report, via test_bit(), whether bit @offset is set in @node's bitmap
 * for @tag.
 */
static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        const unsigned long *bitmap = node->tags[tag];

        return test_bit(offset, bitmap);
}

static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
{
        root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT));
}

static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
{
        root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
}

static inline void root_tag_clear_all(struct radix_tree_root *root)
{
        root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1);
}

static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
{
        return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT));
}

static inline unsigned root_tags_get(const struct radix_tree_root *root)
{
        return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT;
}

/*
 * True when @root has ROOT_IS_IDR set in its xa_flags.
 */
static inline bool is_idr(const struct radix_tree_root *root)
{
        return (root->xa_flags & ROOT_IS_IDR) != 0;
}

/*
 * Scan @node's bitmap for @tag one word at a time.
 * Returns 1 as soon as a non-zero word is found, 0 if all
 * RADIX_TREE_TAG_LONGS words are zero.
 */
static inline int any_tag_set(const struct radix_tree_node *node,
                                                        unsigned int tag)
{
        const unsigned long *words = node->tags[tag];
        unsigned i = 0;

        while (i < RADIX_TREE_TAG_LONGS) {
                if (words[i] != 0)
                        return 1;
                i++;
        }

        return 0;
}

/*
 * Set @tag on every one of the RADIX_TREE_MAP_SIZE slots of @node.
 */
static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag)
{
        unsigned long *bitmap = node->tags[tag];

        bitmap_fill(bitmap, RADIX_TREE_MAP_SIZE);
}

/**
 * radix_tree_find_next_bit - find the next tagged slot in a node
 *
 * @node: node whose tag bitmap is searched
 * @tag: which tag bitmap of @node to search
 * @offset: first bit number to consider
 *
 * Unrollable variant of find_next_bit() for constant size arrays.
 * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero.
 * Returns the offset of the first set bit at or after @offset, or
 * RADIX_TREE_MAP_SIZE if there is none (including when @offset is already
 * out of range).
 */
static __always_inline unsigned long
radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
                         unsigned long offset)
{
        const unsigned long *word;
        unsigned long bits;

        if (offset >= RADIX_TREE_MAP_SIZE)
                return RADIX_TREE_MAP_SIZE;

        /* First word: discard the bits below the starting offset. */
        word = node->tags[tag];
        word += offset / BITS_PER_LONG;
        bits = *word >> (offset % BITS_PER_LONG);
        if (bits)
                return __ffs(bits) + offset;

        /* Remaining words, starting at the next word boundary. */
        for (offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
             offset < RADIX_TREE_MAP_SIZE;
             offset += BITS_PER_LONG) {
                bits = *++word;
                if (bits)
                        return __ffs(bits) + offset;
        }

        return RADIX_TREE_MAP_SIZE;
}

/*
 * Slot number encoded in the low bits of @iter->index
 * (masked with RADIX_TREE_MAP_MASK).
 */
static unsigned int iter_offset(const struct radix_tree_iter *iter)
{
        unsigned long index = iter->index;

        return index & RADIX_TREE_MAP_MASK;
}

/*
 * Largest index reachable through a node whose shift is @shift:
 * the node spans RADIX_TREE_MAP_SIZE << @shift indices, numbered from 0.
 */
static inline unsigned long shift_maxindex(unsigned int shift)
{
        unsigned long span = RADIX_TREE_MAP_SIZE << shift;

        return span - 1;
}

/*
 * shift_maxindex() applied to @node's own shift.
 */
static inline unsigned long node_maxindex(const struct radix_tree_node *node)
{
        unsigned int shift = node->shift;

        return shift_maxindex(shift);
}

/*
 * Index of the first entry under slot @offset of @node: the bits of @index
 * above the range covered by @node are kept, and the slot number is placed
 * at @node->shift.
 */
static unsigned long next_index(unsigned long index,
                                const struct radix_tree_node *node,
                                unsigned long offset)
{
        unsigned long base = index & ~node_maxindex(node);
        unsigned long within = offset << node->shift;

        return base + within;
}

/*
 * Allocate and initialise one radix tree node.
 *
 * This assumes that the caller has performed appropriate preallocation, and
 * that the caller has pinned this thread of control to the current CPU.
 *
 * @gfp_mask selects the allocation strategy (see below). The remaining
 * arguments are copied into the new node: @parent, @root (stored as
 * ->array), @shift, @offset, @count and @nr_values.
 *
 * Returns the node, or NULL if no memory could be obtained.
 */
static struct radix_tree_node *
radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
                        struct radix_tree_root *root,
                        unsigned int shift, unsigned int offset,
                        unsigned int count, unsigned int nr_values)
{
        struct radix_tree_node *ret = NULL;

        /*
         * Preload code isn't irq safe and it doesn't make sense to use
         * preloading during an interrupt anyway as all the allocations have
         * to be atomic. So blocking callers and interrupt context both just
         * do a normal allocation.
         */
        if (gfpflags_allow_blocking(gfp_mask) || in_interrupt()) {
                ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
        } else {
                /*
                 * Even if the caller has preloaded, try the slab cache first
                 * so that the new node gets accounted to the memory cgroup.
                 */
                ret = kmem_cache_alloc(radix_tree_node_cachep,
                                       gfp_mask | __GFP_NOWARN);
                if (!ret) {
                        struct radix_tree_preload *pool;

                        /*
                         * Fall back to this CPU's preload pool. Provided the
                         * caller has preloaded, a node is always available
                         * here.
                         */
                        pool = this_cpu_ptr(&radix_tree_preloads);
                        if (pool->nr) {
                                ret = pool->nodes;
                                pool->nodes = ret->parent;
                                pool->nr--;
                        }
                        /*
                         * Update the allocation stack trace as this is more
                         * useful for debugging.
                         */
                        kmemleak_update_trace(ret);
                }
        }

        BUG_ON(radix_tree_is_internal_node(ret));
        if (!ret)
                return NULL;

        ret->shift = shift;
        ret->offset = offset;
        ret->count = count;
        ret->nr_values = nr_values;
        ret->parent = parent;
        ret->array = root;

        return ret;
}

/*
 * RCU callback that returns a node to the slab cache.
 * @head is the rcu_head embedded in the radix_tree_node being freed.
 */
void radix_tree_node_rcu_free(struct rcu_head *head)
{
        struct radix_tree_node *victim;

        victim = container_of(head, struct radix_tree_node, rcu_head);

        /*
         * Must only free zeroed nodes into the slab.  We can be left with
         * non-NULL entries by radix_tree_free_nodes, so wipe the slots and
         * tags and reset the private list before handing the node back.
         */
        memset(victim->slots, 0, sizeof(victim->slots));
        memset(victim->tags, 0, sizeof(victim->tags));
        INIT_LIST_HEAD(&victim->private_list);

        kmem_cache_free(radix_tree_node_cachep, victim);
}

/*
 * Queue @node for freeing through call_rcu(); the actual release is done by
 * radix_tree_node_rcu_free().
 */
static inline void
radix_tree_node_free(struct radix_tree_node *node)
{
        struct rcu_head *head = &node->rcu_head;

        call_rcu(head, radix_tree_node_rcu_free);
}

/*
 * Top up this CPU's radix_tree_node pool until it holds at least @nr nodes.
 *
 * On success, return zero with radix_tree_preloads.lock held.  On allocation
 * failure, return -ENOMEM with the lock released.
 *
 * The lock is dropped around every allocation, so the pool is looked up and
 * re-checked after re-locking; a node that turns out to be surplus is freed
 * again.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
{
        struct radix_tree_preload *pool;
        struct radix_tree_node *fresh;

        /*
         * Nodes preloaded by one cgroup can be used by another cgroup, so
         * they should never be accounted to any particular memory cgroup.
         */
        gfp_mask &= ~__GFP_ACCOUNT;

        local_lock(&radix_tree_preloads.lock);
        pool = this_cpu_ptr(&radix_tree_preloads);
        while (pool->nr < nr) {
                local_unlock(&radix_tree_preloads.lock);
                fresh = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
                if (!fresh)
                        return -ENOMEM;

                local_lock(&radix_tree_preloads.lock);
                pool = this_cpu_ptr(&radix_tree_preloads);
                if (pool->nr >= nr) {
                        /* Pool was filled while unlocked; node not needed. */
                        kmem_cache_free(radix_tree_node_cachep, fresh);
                        continue;
                }

                /* Push onto the pool's list, linked through ->parent. */
                fresh->parent = pool->nodes;
                pool->nodes = fresh;
                pool->nr++;
        }

        return 0;
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 *
 * This is __radix_tree_preload() with a target of RADIX_TREE_PRELOAD_SIZE
 * nodes. @gfp_mask is expected to allow blocking; a mask that does not
 * triggers a one-time warning but the preload is still attempted.
 */
int radix_tree_preload(gfp_t gfp_mask)
{
        /* Warn on non-sensical use... */
        WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
        return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}
EXPORT_SYMBOL(radix_tree_preload);

/*
 * Like radix_tree_preload(), except that preloading is only attempted when
 * @gfp_mask allows blocking.  For a non-blocking mask nothing is allocated:
 * radix_tree_preloads.lock is simply taken and zero is returned.
 *
 * On success, return zero with radix_tree_preloads.lock held.  On error,
 * return -ENOMEM with the lock not held.
 */
int radix_tree_maybe_preload(gfp_t gfp_mask)
{
        if (!gfpflags_allow_blocking(gfp_mask)) {
                /* Preloading doesn't help anything with this gfp mask. */
                local_lock(&radix_tree_preloads.lock);
                return 0;
        }

        return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}
EXPORT_SYMBOL(radix_tree_maybe_preload);

/*
 * Read the head of @root.
 *
 * *@nodep receives the raw head entry (still tagged if it is an internal
 * node).  When the head is an internal node, *@maxindex is set to the
 * largest index that node covers and the return value is the node's shift
 * plus RADIX_TREE_MAP_SHIFT.  Otherwise *@maxindex is 0 and 0 is returned.
 */
static unsigned radix_tree_load_root(const struct radix_tree_root *root,
                struct radix_tree_node **nodep, unsigned long *maxindex)
{
        struct radix_tree_node *head = rcu_dereference_raw(root->xa_head);

        *nodep = head;

        if (unlikely(!radix_tree_is_internal_node(head))) {
                *maxindex = 0;
                return 0;
        }

        head = entry_to_node(head);
        *maxindex = node_maxindex(head);

        return head->shift + RADIX_TREE_MAP_SHIFT;
}

/*
 *        Extend a radix tree so it can store key @index.
 *
 *        @root:  tree to grow
 *        @gfp:   allocation flags passed to radix_tree_node_alloc()
 *        @index: key that must become addressable
 *        @shift: shift given to the first node added on top; every further
 *                node is RADIX_TREE_MAP_SHIFT higher
 *
 *        New nodes are stacked on top of the current head one at a time, the
 *        old head always ending up in slot 0 of the new one.  No node is
 *        allocated when the head is NULL and the tree is either not an IDR
 *        or is an IDR whose root IDR_FREE tag is set.
 *
 *        Returns the final shift plus RADIX_TREE_MAP_SHIFT on success, or
 *        -ENOMEM if a node allocation fails.  Nodes installed before the
 *        failure stay in the tree.
 */
static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
                                unsigned long index, unsigned int shift)
{
        void *entry;
        unsigned int maxshift;
        int tag;

        /* Figure out what the shift should be.  */
        maxshift = shift;
        while (index > shift_maxindex(maxshift))
                maxshift += RADIX_TREE_MAP_SHIFT;

        entry = rcu_dereference_raw(root->xa_head);
        if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
                goto out;

        do {
                struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL,
                                                        root, shift, 0, 1, 0);
                if (!node)
                        return -ENOMEM;

                if (is_idr(root)) {
                        /*
                         * Every slot of the new node starts out free; slot 0
                         * inherits "not free" from the old root if the root
                         * tag was clear, and the root is marked as having
                         * free space.
                         */
                        all_tag_set(node, IDR_FREE);
                        if (!root_tag_get(root, IDR_FREE)) {
                                tag_clear(node, IDR_FREE, 0);
                                root_tag_set(root, IDR_FREE);
                        }
                } else {
                        /* Propagate the aggregated tag info to the new child */
                        for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
                                if (root_tag_get(root, tag))
                                        tag_set(node, tag, 0);
                        }
                }

                BUG_ON(shift > BITS_PER_LONG);
                if (radix_tree_is_internal_node(entry)) {
                        entry_to_node(entry)->parent = node;
                } else if (xa_is_value(entry)) {
                        /* Moving a value entry root->xa_head to a node */
                        node->nr_values = 1;
                }
                /*
                 * entry was already in the radix tree, so we do not need
                 * rcu_assign_pointer here
                 */
                node->slots[0] = (void __rcu *)entry;
                entry = node_to_entry(node);
                rcu_assign_pointer(root->xa_head, entry);
                shift += RADIX_TREE_MAP_SHIFT;
        } while (shift <= maxshift);
out:
        return maxshift + RADIX_TREE_MAP_SHIFT;
}

/**
 *        radix_tree_shrink    -    shrink radix tree to minimum height
 *        @root:                radix tree root
 *
 *        Repeatedly replaces the head node by its only child for as long as
 *        the head is an internal node with exactly one entry, stored in
 *        slot 0.  Each replaced node is freed via radix_tree_node_free().
 *
 *        Return: true if at least one node was removed, false otherwise.
 */
static inline bool radix_tree_shrink(struct radix_tree_root *root)
{
        bool shrunk = false;

        for (;;) {
                struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
                struct radix_tree_node *child;

                if (!radix_tree_is_internal_node(node))
                        break;
                node = entry_to_node(node);

                /*
                 * The candidate node has more than one child, or its child
                 * is not at the leftmost slot, we cannot shrink.
                 */
                if (node->count != 1)
                        break;
                child = rcu_dereference_raw(node->slots[0]);
                if (!child)
                        break;

                /*
                 * For an IDR, we must not shrink entry 0 into the root in
                 * case somebody calls idr_replace() with a pointer that
                 * appears to be an internal entry
                 */
                if (!node->shift && is_idr(root))
                        break;

                /* The child becomes the new top of the tree. */
                if (radix_tree_is_internal_node(child))
                        entry_to_node(child)->parent = NULL;

                /*
                 * We don't need rcu_assign_pointer(), since we are simply
                 * moving the node from one part of the tree to another: if it
                 * was safe to dereference the old pointer to it
                 * (node->slots[0]), it will be safe to dereference the new
                 * one (root->xa_head) as far as dependent read barriers go.
                 */
                root->xa_head = (void __rcu *)child;
                if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
                        root_tag_clear(root, IDR_FREE);

                /*
                 * We have a dilemma here. The node's slot[0] must not be
                 * NULLed in case there are concurrent lookups expecting to
                 * find the item. However if this was a bottom-level node,
                 * then it may be subject to the slot pointer being visible
                 * to callers dereferencing it. If item corresponding to
                 * slot[0] is subsequently deleted, these callers would expect
                 * their slot to become empty sooner or later.
                 *
                 * For example, lockless pagecache will look up a slot, deref
                 * the page pointer, and if the page has 0 refcount it means it
                 * was concurrently deleted from pagecache so try the deref
                 * again. Fortunately there is already a requirement for logic
                 * to retry the entire slot lookup -- the indirect pointer
                 * problem (replacing direct root node with an indirect pointer
                 * also results in a stale slot). So tag the slot as indirect
                 * to force callers to retry.
                 */
                node->count = 0;
                if (!radix_tree_is_internal_node(child)) {
                        node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
                }

                WARN_ON_ONCE(!list_empty(&node->private_list));
                radix_tree_node_free(node);
                shrunk = true;
        }

        return shrunk;
}

/*
 * Free @node if it holds no entries, then keep walking towards the root,
 * freeing every ancestor that becomes empty in turn.  The walk stops at
 * the first node that still has entries; if that node is the one
 * root->xa_head refers to, radix_tree_shrink() is given a chance to run.
 *
 * Returns true if at least one node was freed or the tree was shrunk.
 */
static bool delete_node(struct radix_tree_root *root,
                        struct radix_tree_node *node)
{
        bool changed = false;

        do {
                struct radix_tree_node *up;

                if (node->count) {
                        /* Still populated: only the head node may shrink. */
                        if (rcu_dereference_raw(root->xa_head) ==
                                        node_to_entry(node))
                                changed |= radix_tree_shrink(root);
                        break;
                }

                up = node->parent;
                if (!up) {
                        /*
                         * The last node is going away.  The caller has
                         * probably cleared the tags already; non-IDR
                         * roots get them cleared here regardless.
                         */
                        if (!is_idr(root))
                                root_tag_clear_all(root);
                        root->xa_head = NULL;
                } else {
                        /* Unhook the empty node from its parent. */
                        up->slots[node->offset] = NULL;
                        up->count--;
                }

                WARN_ON_ONCE(!list_empty(&node->private_list));
                radix_tree_node_free(node);
                changed = true;

                node = up;
        } while (node);

        return changed;
}

/**
 *        __radix_tree_create        -        create a slot in a radix tree
 *        @root:                radix tree root
 *        @index:                index key
 *        @nodep:                returns node
 *        @slotp:                returns slot
 *
 *        Create, if necessary, and return the node and slot for an item
 *        at position @index in the radix tree @root.
 *
 *        Until there is more than one item in the tree, no nodes are
 *        allocated and @root->xa_head is used as a direct slot instead of
 *        pointing to a node, in which case *@nodep will be NULL.
 *
 *        Returns -ENOMEM, or 0 for success.
 */
static int __radix_tree_create(struct radix_tree_root *root,
                unsigned long index, struct radix_tree_node **nodep,
                void __rcu ***slotp)
{
        struct radix_tree_node *parent = NULL;
        struct radix_tree_node *cur;
        void __rcu **where = (void __rcu **)&root->xa_head;
        unsigned long covered;
        unsigned int shift;
        unsigned int pos = 0;
        gfp_t gfp = root_gfp_mask(root);

        shift = radix_tree_load_root(root, &cur, &covered);

        /* Grow the tree first if @index lies beyond what it covers. */
        if (index > covered) {
                int ret = radix_tree_extend(root, gfp, index, shift);

                if (ret < 0)
                        return ret;
                /* A non-negative result is the shift of the new top level. */
                shift = ret;
                cur = rcu_dereference_raw(root->xa_head);
        }

        while (shift > 0) {
                shift -= RADIX_TREE_MAP_SHIFT;

                /* An existing non-node entry ends the descent. */
                if (cur != NULL && !radix_tree_is_internal_node(cur))
                        break;

                if (cur == NULL) {
                        /* Nothing here yet: allocate and publish a node. */
                        cur = radix_tree_node_alloc(gfp, parent, root, shift,
                                                        pos, 0, 0);
                        if (!cur)
                                return -ENOMEM;
                        rcu_assign_pointer(*where, node_to_entry(cur));
                        if (parent)
                                parent->count++;
                }

                /* Step one level down towards @index. */
                parent = entry_to_node(cur);
                pos = radix_tree_descend(parent, &cur, index);
                where = &parent->slots[pos];
        }

        if (nodep)
                *nodep = parent;
        if (slotp)
                *slotp = where;
        return 0;
}

/*
 * Free any nodes below this node.  The tree is presumed to not need
 * shrinking, and any user data in the tree is presumed to not need a
 * destructor called on it.  If we need to add a destructor, we can
 * add that functionality later.  Note that we may not clear tags or
 * slots from the tree as an RCU walker may still have a pointer into
 * this subtree.  We could replace the entries with RADIX_TREE_RETRY,
 * but we'll still have to clear those in rcu_free.
 */
static void radix_tree_free_nodes(struct radix_tree_node *node)
{
        unsigned offset = 0;
        /* @node arrives in entry form; decode it to get the node itself. */
        struct radix_tree_node *child = entry_to_node(node);

        for (;;) {
                void *entry = rcu_dereference_raw(child->slots[offset]);
                /* Descend into a child node, unless this node has shift 0. */
                if (xa_is_node(entry) && child->shift) {
                        child = entry_to_node(entry);
                        offset = 0;
                        continue;
                }
                offset++;
                /*
                 * Every slot of the current node has been visited: free it
                 * and resume in its parent at the slot after the one it
                 * occupied.  This may cascade up several levels.
                 */
                while (offset == RADIX_TREE_MAP_SIZE) {
                        struct radix_tree_node *old = child;
                        /* Read offset and parent before the node is freed. */
                        offset = child->offset + 1;
                        child = child->parent;
                        WARN_ON_ONCE(!list_empty(&old->private_list));
                        radix_tree_node_free(old);
                        /* The starting node itself was just freed: done. */
                        if (old == entry_to_node(node))
                                return;
                }
        }
}

/*
 * Store @item in the empty @slot and account for it in @node, if there
 * is one.
 *
 * Returns 1 when the item was stored, or -EEXIST if @slot is occupied.
 */
static inline int insert_entries(struct radix_tree_node *node,
                void __rcu **slot, void *item)
{
        if (*slot)
                return -EEXIST;

        rcu_assign_pointer(*slot, item);

        /* A slot without a node has no counters to maintain. */
        if (!node)
                return 1;

        node->count++;
        if (xa_is_value(item))
                node->nr_values++;

        return 1;
}

/**
 * radix_tree_insert - insert into a radix tree
 * @root:        radix tree root
 * @index:       index key
 * @item:        item to insert
 *
 * Insert an item into the radix tree at position @index.
 *
 * Return: 0 on success, -EEXIST if the slot at @index is already
 * occupied, or the negative error code reported while creating the path
 * to the slot (for example -ENOMEM).
 */
int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
                        void *item)
{
        struct radix_tree_node *node;
        void __rcu **slot;
        int rc;

        BUG_ON(radix_tree_is_internal_node(item));

        rc = __radix_tree_create(root, index, &node, &slot);
        if (rc)
                return rc;

        rc = insert_entries(node, slot, item);
        if (rc < 0)
                return rc;

        /* A slot that was empty until now must not carry any tag. */
        if (!node) {
                BUG_ON(root_tags_get(root));
        } else {
                unsigned pos = get_slot_offset(node, slot);

                BUG_ON(tag_get(node, 0, pos));
                BUG_ON(tag_get(node, 1, pos));
                BUG_ON(tag_get(node, 2, pos));
        }

        return 0;
}
EXPORT_SYMBOL(radix_tree_insert);

/**
 *        __radix_tree_lookup        -        lookup an item in a radix tree
 *        @root:                radix tree root
 *        @index:                index key
 *        @nodep:                returns node
 *        @slotp:                returns slot
 *
 *        Lookup and return the item at position @index in the radix
 *        tree @root.
 *
 *        Until there is more than one item in the tree, no nodes are
 *        allocated and @root->xa_head is used as a direct slot instead of
 *        pointing to a node, in which case *@nodep will be NULL.
 */
void *__radix_tree_lookup(const struct radix_tree_root *root,
                          unsigned long index, struct radix_tree_node **nodep,
                          void __rcu ***slotp)
{
        struct radix_tree_node *entry, *holder;
        unsigned long limit;
        void __rcu **where;

retry:
        /* Each attempt starts from the root as a direct slot. */
        holder = NULL;
        where = (void __rcu **)&root->xa_head;
        radix_tree_load_root(root, &entry, &limit);
        if (index > limit)
                return NULL;

        while (radix_tree_is_internal_node(entry)) {
                unsigned pos;

                holder = entry_to_node(entry);
                pos = radix_tree_descend(holder, &entry, index);
                where = &holder->slots[pos];

                /* The slot holds a retry marker: redo the whole walk. */
                if (entry == RADIX_TREE_RETRY)
                        goto retry;
                /* Nodes with shift 0 are the lowest level; stop here. */
                if (!holder->shift)
                        break;
        }

        if (nodep)
                *nodep = holder;
        if (slotp)
                *slotp = where;
        return entry;
}

/**
 *        radix_tree_lookup_slot    -    lookup a slot in a radix tree
 *        @root:                radix tree root
 *        @index:                index key
 *
 *        Returns:  the slot corresponding to the position @index in the
 *        radix tree @root. This is useful for update-if-exists operations.
 *
 *        This function can be called under rcu_read_lock iff the slot is not
 *        modified by radix_tree_replace_slot, otherwise it must be called
 *        exclusive from other writers. Any dereference of the slot must be done
 *        using radix_tree_deref_slot.
 */
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root,
                                unsigned long index)
{
        void __rcu **slot;
        void *entry = __radix_tree_lookup(root, index, NULL, &slot);

        /* The slot is only reported when something is stored in it. */
        return entry ? slot : NULL;
}
EXPORT_SYMBOL(radix_tree_lookup_slot);

/**
 *        radix_tree_lookup    -    perform lookup operation on a radix tree
 *        @root:                radix tree root
 *        @index:                index key
 *
 *        Lookup the item at the position @index in the radix tree @root.
 *
 *        This function can be called under rcu_read_lock, however the caller
 *        must manage lifetimes of leaf nodes (eg. RCU may also be used to free
 *        them safely). No RCU barriers are required to access or modify the
 *        returned item, however.
 */
void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
{
        /* Only the item is wanted, so neither node nor slot is requested. */
        return __radix_tree_lookup(root, index, NULL, NULL);
}
EXPORT_SYMBOL(radix_tree_lookup);

/*
 * Publish @item in @slot.  When @node is given, its entry count and
 * value-entry count are first adjusted by the signed deltas @count and
 * @values.
 */
static void replace_slot(void __rcu **slot, void *item,
                struct radix_tree_node *node, int count, int values)
{
        const bool accounting_changes = count || values;

        if (node && accounting_changes) {
                node->count += count;
                node->nr_values += values;
        }

        rcu_assign_pointer(*slot, item);
}

/*
 * Test @tag for the slot at @offset in @node.  With no node, the tag is
 * read from the root instead.
 */
static bool node_tag_get(const struct radix_tree_root *root,
                                const struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        return node ? tag_get(node, tag, offset) : root_tag_get(root, tag);
}

/*
 * IDR users want to be able to store NULL in the tree, so if the slot isn't
 * free, don't adjust the count, even if it's transitioning between NULL and
 * non-NULL.  For the IDA, we mark slots as being IDR_FREE while they still
 * have empty bits, but it only stores NULL in slots when they're being
 * deleted.
 */
static int calculate_count(struct radix_tree_root *root,
                                struct radix_tree_node *node, void __rcu **slot,
                                void *item, void *old)
{
        unsigned pos;

        /* Plain radix trees: the count follows NULL <-> non-NULL changes. */
        if (!is_idr(root))
                return !!item - !!old;

        pos = get_slot_offset(node, slot);

        /* A slot not marked IDR_FREE is already accounted for. */
        if (!node_tag_get(root, node, IDR_FREE, pos))
                return 0;

        /* A free slot with no old entry gains one, even if @item is NULL. */
        if (!old)
                return 1;

        return !!item - !!old;
}

/**
 * __radix_tree_replace                - replace item in a slot
 * @root:                radix tree root
 * @node:                pointer to tree node
 * @slot:                pointer to slot in @node
 * @item:                new item to store in the slot.
 *
 * For use with __radix_tree_lookup().  Caller must hold tree write locked
 * across slot lookup and replacement.
 */
void __radix_tree_replace(struct radix_tree_root *root,
                          struct radix_tree_node *node,
                          void __rcu **slot, void *item)
{
        void *prev = rcu_dereference_raw(*slot);
        int value_delta = !!xa_is_value(item) - !!xa_is_value(prev);
        int count_delta = calculate_count(root, node, slot, item, prev);

        /*
         * Replacing value entries and deleting entries is supported, but
         * both need accounting against the node.  Without a node that is
         * only acceptable when the slot is root->xa_head itself.
         */
        WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) &&
                        (count_delta || value_delta));
        replace_slot(slot, item, node, count_delta, value_delta);

        /* The node may have become empty; let delete_node() tidy up. */
        if (node)
                delete_node(root, node);
}

/**
 * radix_tree_replace_slot        - replace item in a slot
 * @root:        radix tree root
 * @slot:        pointer to slot
 * @item:        new item to store in the slot.
 *
 * For use with radix_tree_lookup_slot() and
 * radix_tree_gang_lookup_tag_slot().  Caller must hold tree write locked
 * across slot lookup and replacement.
 *
 * NOTE: This cannot be used to switch between non-entries (empty slots),
 * regular entries, and value entries, as that requires accounting
 * inside the radix tree node. When switching from one type of entry or
 * deleting, use __radix_tree_lookup() and __radix_tree_replace() or
 * radix_tree_iter_replace().
 */
void radix_tree_replace_slot(struct radix_tree_root *root,
                             void __rcu **slot, void *item)
{
        /* No node is supplied, so no per-node accounting is adjusted. */
        __radix_tree_replace(root, NULL, slot, item);
}
EXPORT_SYMBOL(radix_tree_replace_slot);

/**
 * radix_tree_iter_replace - replace item in a slot
 * @root:        radix tree root
 * @iter:        iterator state
 * @slot:        pointer to slot
 * @item:        new item to store in the slot.
 *
 * For use with radix_tree_for_each_slot().
 * Caller must hold tree write locked.
 */
void radix_tree_iter_replace(struct radix_tree_root *root,
                                const struct radix_tree_iter *iter,
                                void __rcu **slot, void *item)
{
        /* Pass the iterator's node so its accounting can be updated. */
        __radix_tree_replace(root, iter->node, slot, item);
}

/*
 * Set @tag on the slot at @offset in @node and on the corresponding slot
 * of each ancestor, stopping at the first level where it is already set.
 * If the walk reaches the top, the root is tagged as well.
 */
static void node_tag_set(struct radix_tree_root *root,
                                struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        for (; node; offset = node->offset, node = node->parent) {
                /* Already tagged here, nothing more to do above. */
                if (tag_get(node, tag, offset))
                        return;
                tag_set(node, tag, offset);
        }

        if (!root_tag_get(root, tag))
                root_tag_set(root, tag);
}

/**
 *        radix_tree_tag_set - set a tag on a radix tree node
 *        @root:                radix tree root
 *        @index:                index key
 *        @tag:                tag index
 *
 *        Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
 *        corresponding to @index in the radix tree, on every node from
 *        the root all the way down to the leaf node.
 *
 *        Returns the address of the tagged item.  Setting a tag on a not-present
 *        item is a bug.
 */
void *radix_tree_tag_set(struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry, *holder;
        unsigned long limit;

        radix_tree_load_root(root, &entry, &limit);
        BUG_ON(index > limit);

        /* Tag the slot leading to @index at every level on the way down. */
        while (radix_tree_is_internal_node(entry)) {
                unsigned pos;

                holder = entry_to_node(entry);
                pos = radix_tree_descend(holder, &entry, index);
                BUG_ON(!entry);

                if (tag_get(holder, tag, pos))
                        continue;
                tag_set(holder, tag, pos);
        }

        /* Finally make sure the root carries the tag too. */
        if (!root_tag_get(root, tag))
                root_tag_set(root, tag);

        return entry;
}
EXPORT_SYMBOL(radix_tree_tag_set);

/*
 * Clear @tag on the slot at @offset in @node.  While a node is left with
 * no slot carrying @tag, the clear is repeated on the corresponding slot
 * of its parent.  If the walk reaches the top, the root's tag bit is
 * cleared as well.
 */
static void node_tag_clear(struct radix_tree_root *root,
                                struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        for (; node; offset = node->offset, node = node->parent) {
                /* Not set at this level: nothing to undo here or above. */
                if (!tag_get(node, tag, offset))
                        return;
                tag_clear(node, tag, offset);
                /* Another slot in this node still carries the tag. */
                if (any_tag_set(node, tag))
                        return;
        }

        /* clear the root's tag bit */
        if (root_tag_get(root, tag))
                root_tag_clear(root, tag);
}

/**
 *        radix_tree_tag_clear - clear a tag on a radix tree node
 *        @root:                radix tree root
 *        @index:                index key
 *        @tag:                tag index
 *
 *        Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
 *        corresponding to @index in the radix tree.  If this causes
 *        the leaf node to have no tags set then clear the tag in the
 *        next-to-leaf node, etc.
 *
 *        Returns the address of the tagged item on success, else NULL.  ie:
 *        has the same return value and semantics as radix_tree_lookup().
 */
void *radix_tree_tag_clear(struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry;
        struct radix_tree_node *holder = NULL;
        unsigned long limit;
        int pos = 0;

        radix_tree_load_root(root, &entry, &limit);
        if (index > limit)
                return NULL;

        /* Walk down to the entry at @index, remembering where it lives. */
        while (radix_tree_is_internal_node(entry)) {
                holder = entry_to_node(entry);
                pos = radix_tree_descend(holder, &entry, index);
        }

        /* No entry present, so there is no tag to clear. */
        if (!entry)
                return NULL;

        node_tag_clear(root, holder, tag, pos);
        return entry;
}
EXPORT_SYMBOL(radix_tree_tag_clear);

/**
  * radix_tree_iter_tag_clear - clear a tag on the current iterator entry
  * @root: radix tree root
  * @iter: iterator state
  * @tag: tag to clear
  */
void radix_tree_iter_tag_clear(struct radix_tree_root *root,
                        const struct radix_tree_iter *iter, unsigned int tag)
{
        /* Start at the iterator's node and slot; the helper walks upwards. */
        node_tag_clear(root, iter->node, tag, iter_offset(iter));
}

/**
 * radix_tree_tag_get - get a tag on a radix tree node
 * @root:                radix tree root
 * @index:                index key
 * @tag:                tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Return values:
 *
 *  0: tag not present or not set
 *  1: tag set
 *
 * Note that the return value of this function may not be relied on, even if
 * the RCU lock is held, unless tag modification and node deletion are excluded
 * from concurrency.
 */
int radix_tree_tag_get(const struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry, *holder;
        unsigned long limit;

        /* If the root is not tagged, the answer is known without a walk. */
        if (!root_tag_get(root, tag))
                return 0;

        radix_tree_load_root(root, &entry, &limit);
        if (index > limit)
                return 0;

        while (radix_tree_is_internal_node(entry)) {
                unsigned pos;

                holder = entry_to_node(entry);
                pos = radix_tree_descend(holder, &entry, index);

                /* The tag must be set at every level of the path. */
                if (!tag_get(holder, tag, pos))
                        return 0;
                /* A retry marker ends the walk; the tag counts as set. */
                if (entry == RADIX_TREE_RETRY)
                        return 1;
        }

        return 1;
}
EXPORT_SYMBOL(radix_tree_tag_get);

/* Construct iter->tags bit-mask from node->tags[tag] array */
static void set_iter_tags(struct radix_tree_iter *iter,
                                struct radix_tree_node *node, unsigned offset,
                                unsigned tag)
{
        unsigned word = offset / BITS_PER_LONG;
        unsigned bit = offset % BITS_PER_LONG;

        /* No node: a single tagged slot. */
        if (!node) {
                iter->tags = 1;
                return;
        }

        /* Bits for @offset onwards, taken from the word containing it. */
        iter->tags = node->tags[tag][word] >> bit;

        /* Last word (always so if RADIX_TREE_TAG_LONGS == 1): done. */
        if (word >= RADIX_TREE_TAG_LONGS - 1)
                return;

        /* Fill the vacated high bits from the following word. */
        if (bit)
                iter->tags |= node->tags[tag][word + 1] <<
                                        (BITS_PER_LONG - bit);

        /* Clip the chunk: only BITS_PER_LONG tags fit in iter->tags. */
        iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
}

/*
 * Reset @iter so that the next chunk lookup starts at the index following
 * the current one.  @slot is not used here; the function always returns
 * NULL.
 */
void __rcu **radix_tree_iter_resume(void __rcu **slot,
                                        struct radix_tree_iter *iter)
{
        unsigned long resume_at = __radix_tree_iter_add(iter, 1);

        iter->index = resume_at;
        iter->next_index = resume_at;
        iter->tags = 0;

        return NULL;
}
EXPORT_SYMBOL(radix_tree_iter_resume);

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:        radix tree root
 * @iter:        iterator state
 * @flags:        RADIX_TREE_ITER_* flags and tag index
 * Returns:        pointer to chunk first slot, or NULL if iteration is over
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
                             struct radix_tree_iter *iter, unsigned flags)
{
        /* The tag index is carried in @flags under RADIX_TREE_ITER_TAG_MASK */
        unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
        struct radix_tree_node *node, *child;
        unsigned long index, offset, maxindex;

        /* Tagged iteration finds nothing if the root does not carry the tag */
        if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
                return NULL;

        /*
         * Catch next_index overflow after ~0UL. iter->index never overflows
         * during iterating; it can be zero only at the beginning.
         * And we cannot overflow iter->next_index in a single step,
         * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
         *
         * This condition is also used by radix_tree_next_slot() to stop
         * contiguous iterating, and forbid switching to the next chunk.
         */
        index = iter->next_index;
        if (!index && iter->index)
                return NULL;

        /* Every (re)start reloads the root and walks down towards index */
 restart:
        radix_tree_load_root(root, &child, &maxindex);
        if (index > maxindex)
                return NULL;
        if (!child)
                return NULL;

        if (!radix_tree_is_internal_node(child)) {
                /* Single-slot tree */
                iter->index = index;
                iter->next_index = maxindex + 1;
                iter->tags = 1;
                iter->node = NULL;
                return (void __rcu **)&root->xa_head;
        }

        do {
                node = entry_to_node(child);
                offset = radix_tree_descend(node, &child, index);

                /*
                 * A hole is an untagged slot for tagged iteration, or an
                 * empty slot otherwise.
                 */
                if ((flags & RADIX_TREE_ITER_TAGGED) ?
                                !tag_get(node, tag, offset) : !child) {
                        /* Hole detected */
                        if (flags & RADIX_TREE_ITER_CONTIG)
                                return NULL;

                        /* Look for the next candidate slot within this node */
                        if (flags & RADIX_TREE_ITER_TAGGED)
                                offset = radix_tree_find_next_bit(node, tag,
                                                offset + 1);
                        else
                                while (++offset        < RADIX_TREE_MAP_SIZE) {
                                        void *slot = rcu_dereference_raw(
                                                        node->slots[offset]);
                                        if (slot)
                                                break;
                                }
                        /* Recompute index for the slot chosen above */
                        index &= ~node_maxindex(node);
                        index += offset << node->shift;
                        /* Overflow after ~0UL */
                        if (!index)
                                return NULL;
                        /* Ran off the end of this node: walk again from the root */
                        if (offset == RADIX_TREE_MAP_SIZE)
                                goto restart;
                        child = rcu_dereference_raw(node->slots[offset]);
                }

                /* The slot read back empty: walk again from the root */
                if (!child)
                        goto restart;
                /* A retry marker ends the descent at this node */
                if (child == RADIX_TREE_RETRY)
                        break;
        } while (node->shift && radix_tree_is_internal_node(child));

        /* Update the iterator state */
        iter->index = (index &~ node_maxindex(node)) | offset;
        iter->next_index = (index | node_maxindex(node)) + 1;
        iter->node = node;

        if (flags & RADIX_TREE_ITER_TAGGED)
                set_iter_tags(iter, node, offset, tag);

        return node->slots + offset;
}
EXPORT_SYMBOL(radix_tree_next_chunk);

/**
 *        radix_tree_gang_lookup - perform multiple lookup on a radix tree
 *        @root:                radix tree root
 *        @results:        where the results of the lookup are placed
 *        @first_index:        start the lookup from this key
 *        @max_items:        place up to this many items at *results
 *
 *        Performs an index-ascending scan of the tree for present items.  Places
 *        them at *@results and returns the number of items which were placed at
 *        *@results.
 *
 *        The implementation is naive.
 *
 *        Like radix_tree_lookup, radix_tree_gang_lookup may be called under
 *        rcu_read_lock. In this case, rather than the returned results being
 *        an atomic snapshot of the tree at a single point in time, the
 *        semantics of an RCU protected gang lookup are as though multiple
 *        radix_tree_lookups have been issued in individual locks, and results
 *        stored in 'results'.
 */
unsigned int
radix_tree_gang_lookup(const struct radix_tree_root *root, void **results,
                        unsigned long first_index, unsigned int max_items)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int found = 0;

        if (unlikely(!max_items))
                return 0;

        radix_tree_for_each_slot(slot, root, &iter, first_index) {
                void *entry = rcu_dereference_raw(*slot);

                /* Stored first; only kept if 'found' advances below. */
                results[found] = entry;
                if (!entry)
                        continue;
                /* A node entry is not an item: retry this position. */
                if (radix_tree_is_internal_node(entry)) {
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }
                found++;
                if (found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup);

/**
 *        radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
 *                                     based on a tag
 *        @root:                radix tree root
 *        @results:        where the results of the lookup are placed
 *        @first_index:        start the lookup from this key
 *        @max_items:        place up to this many items at *results
 *        @tag:                the tag index (< RADIX_TREE_MAX_TAGS)
 *
 *        Performs an index-ascending scan of the tree for present items which
 *        have the tag indexed by @tag set.  Places the items at *@results and
 *        returns the number of items which were placed at *@results.
 */
unsigned int
radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results,
                unsigned long first_index, unsigned int max_items,
                unsigned int tag)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int found = 0;

        if (unlikely(!max_items))
                return 0;

        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
                void *entry = rcu_dereference_raw(*slot);

                /* Stored first; only kept if 'found' advances below. */
                results[found] = entry;
                if (!entry)
                        continue;
                /* A node entry is not an item: retry this position. */
                if (radix_tree_is_internal_node(entry)) {
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }
                found++;
                if (found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag);

/**
 *        radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a
 *                                          radix tree based on a tag
 *        @root:                radix tree root
 *        @results:        where the results of the lookup are placed
 *        @first_index:        start the lookup from this key
 *        @max_items:        place up to this many items at *results
 *        @tag:                the tag index (< RADIX_TREE_MAX_TAGS)
 *
 *        Performs an index-ascending scan of the tree for present items which
 *        have the tag indexed by @tag set.  Places the slots at *@results and
 *        returns the number of slots which were placed at *@results.
 */
unsigned int
radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root,
                void __rcu ***results, unsigned long first_index,
                unsigned int max_items, unsigned int tag)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        unsigned int found = 0;

        if (unlikely(!max_items))
                return 0;

        /* Collect slot addresses, not items; nothing is dereferenced here. */
        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
                results[found++] = slot;
                if (found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);

/*
 * Empty @slot in @node: fix up the tags for the slot, store NULL in it
 * with the matching accounting, then let delete_node() free whatever has
 * become empty.
 *
 * Returns the result of delete_node(), or false when there is no node.
 */
static bool __radix_tree_delete(struct radix_tree_root *root,
                                struct radix_tree_node *node, void __rcu **slot)
{
        void *prev = rcu_dereference_raw(*slot);
        unsigned pos = get_slot_offset(node, slot);
        int value_delta = 0;
        int tag;

        if (xa_is_value(prev))
                value_delta = -1;

        if (!is_idr(root)) {
                /* Plain radix tree: an empty slot carries no tags. */
                for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
                        node_tag_clear(root, node, tag, pos);
        } else {
                /* IDR: mark the slot as free again. */
                node_tag_set(root, node, IDR_FREE, pos);
        }

        replace_slot(slot, NULL, node, -1, value_delta);

        if (!node)
                return false;
        return delete_node(root, node);
}

/**
 * radix_tree_iter_delete - delete the entry at this iterator position
 * @root: radix tree root
 * @iter: iterator state
 * @slot: pointer to slot
 *
 * Delete the entry at the position currently pointed to by the iterator.
 * This may result in the current node being freed; if it is, the iterator
 * is advanced so that it will not reference the freed memory.  This
 * function may be called without any locking if there are no other threads
 * which can access this tree.
 */
void radix_tree_iter_delete(struct radix_tree_root *root,
                                struct radix_tree_iter *iter, void __rcu **slot)
{
        bool nodes_changed = __radix_tree_delete(root, iter->node, slot);

        /* Skip to the next chunk so the iterator avoids freed memory. */
        if (nodes_changed)
                iter->index = iter->next_index;
}
EXPORT_SYMBOL(radix_tree_iter_delete);

/**
 * radix_tree_delete_item - delete an item from a radix tree
 * @root: radix tree root
 * @index: index key
 * @item: expected item
 *
 * Remove @item at @index from the radix tree rooted at @root.
 *
 * Return: the deleted entry, or %NULL if it was not present
 * or the entry at the given @index was not @item.
 */
void *radix_tree_delete_item(struct radix_tree_root *root,
                             unsigned long index, void *item)
{
        struct radix_tree_node *node = NULL;
        void __rcu **slot = NULL;
        void *found;

        found = __radix_tree_lookup(root, index, &node, &slot);
        if (!slot)
                return NULL;

        if (!found) {
                /* In a plain radix tree a NULL entry means nothing is there. */
                if (!is_idr(root))
                        return NULL;
                /* In an IDR a NULL entry only counts if the slot is not free. */
                if (node_tag_get(root, node, IDR_FREE,
                                 get_slot_offset(node, slot)))
                        return NULL;
        }

        /* When an expected item is given, it must match what is stored. */
        if (item && found != item)
                return NULL;

        __radix_tree_delete(root, node, slot);

        return found;
}
EXPORT_SYMBOL(radix_tree_delete_item);

/**
 * radix_tree_delete - delete an entry from a radix tree
 * @root: radix tree root
 * @index: index key
 *
 * Remove the entry at @index from the radix tree rooted at @root.
 *
 * Return: The deleted entry, or %NULL if it was not present.
 */
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
        /* A NULL expected item means whatever is stored at @index is removed. */
        return radix_tree_delete_item(root, index, NULL);
}
EXPORT_SYMBOL(radix_tree_delete);

/**
 * radix_tree_tagged - test whether any items in the tree are tagged
 * @root: radix tree root
 * @tag: tag to test
 *
 * Return: the result of root_tag_get() for @tag on @root.
 */
int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag)
{
        int tagged = root_tag_get(root, tag);

        return tagged;
}
EXPORT_SYMBOL(radix_tree_tagged);

/**
 * idr_preload - preload for idr_alloc()
 * @gfp_mask: allocation mask to use for preloading
 *
 * Preallocate memory to use for the next call to idr_alloc().  This function
 * returns with preemption disabled.  It will be enabled by idr_preload_end().
 */
void idr_preload(gfp_t gfp_mask)
{
        /*
         * A zero return from the preload helper needs no further action
         * here (presumably it already holds the lock - confirm against
         * __radix_tree_preload()).
         */
        if (!__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
                return;

        /* Non-zero return: take the per-cpu preload lock ourselves. */
        local_lock(&radix_tree_preloads.lock);
}
EXPORT_SYMBOL(idr_preload);

/*
 * idr_get_free - find (creating nodes as needed) a slot tagged IDR_FREE
 * @root: tree to search
 * @iter: iterator; the search starts at @iter->next_index and the result
 *        is recorded in it
 * @gfp: allocation flags handed to radix_tree_extend() and
 *       radix_tree_node_alloc()
 * @max: highest index that may be returned
 *
 * Return: the slot that was found, with @iter->index set to its index, or
 * ERR_PTR(-ENOSPC) if the search moves past @max (or the index wraps to 0),
 * ERR_PTR(-ENOMEM) if a node cannot be allocated, or the negative value
 * returned by radix_tree_extend() wrapped in ERR_PTR().
 */
void __rcu **idr_get_free(struct radix_tree_root *root,
                              struct radix_tree_iter *iter, gfp_t gfp,
                              unsigned long max)
{
        struct radix_tree_node *node = NULL, *child;
        /* Begin at the root's head pointer; replaced once we descend. */
        void __rcu **slot = (void __rcu **)&root->xa_head;
        unsigned long maxindex, start = iter->next_index;
        unsigned int shift, offset = 0;

 grow:
        shift = radix_tree_load_root(root, &child, &maxindex);
        /* Root not tagged IDR_FREE: skip past the range the tree covers. */
        if (!radix_tree_tagged(root, IDR_FREE))
                start = max(start, maxindex + 1);
        if (start > max)
                return ERR_PTR(-ENOSPC);

        if (start > maxindex) {
                /* Returns a negative error, otherwise the new shift. */
                int error = radix_tree_extend(root, gfp, start, shift);
                if (error < 0)
                        return ERR_PTR(error);
                shift = error;
                child = rcu_dereference_raw(root->xa_head);
        }
        /* Force one pass of the loop below even when the shift is zero. */
        if (start == 0 && shift == 0)
                shift = RADIX_TREE_MAP_SHIFT;

        while (shift) {
                shift -= RADIX_TREE_MAP_SHIFT;
                if (child == NULL) {
                        /* Have to add a child node.  */
                        child = radix_tree_node_alloc(gfp, node, root, shift,
                                                        offset, 0, 0);
                        if (!child)
                                return ERR_PTR(-ENOMEM);
                        /* A freshly added node starts with every slot tagged free. */
                        all_tag_set(child, IDR_FREE);
                        rcu_assign_pointer(*slot, node_to_entry(child));
                        if (node)
                                node->count++;
                } else if (!radix_tree_is_internal_node(child))
                        break;

                node = entry_to_node(child);
                offset = radix_tree_descend(node, &child, start);
                if (!tag_get(node, IDR_FREE, offset)) {
                        /* Not free here: advance to the next IDR_FREE bit. */
                        offset = radix_tree_find_next_bit(node, IDR_FREE,
                                                        offset + 1);
                        start = next_index(start, node, offset);
                        if (start > max || start == 0)
                                return ERR_PTR(-ENOSPC);
                        /*
                         * offset == RADIX_TREE_MAP_SIZE presumably means no
                         * free bit was left in this node: climb to the parent
                         * and continue with the slot after this node's own.
                         * Climbing past the top restarts from "grow".
                         */
                        while (offset == RADIX_TREE_MAP_SIZE) {
                                offset = node->offset + 1;
                                node = node->parent;
                                if (!node)
                                        goto grow;
                                shift = node->shift;
                        }
                        child = rcu_dereference_raw(node->slots[offset]);
                }
                slot = &node->slots[offset];
        }

        /* Record the result in the iterator. */
        iter->index = start;
        if (node)
                iter->next_index = 1 + min(max, (start | node_maxindex(node)));
        else
                iter->next_index = 1;
        iter->node = node;
        set_iter_tags(iter, node, offset, IDR_FREE);

        return slot;
}

/**
 * idr_destroy - release all internal memory from an IDR
 * @idr: idr handle
 *
 * After this function is called, the IDR is empty, and may be reused or
 * the data structure containing it may be freed.
 *
 * A typical clean-up sequence for objects stored in an idr tree will use
 * idr_for_each() to free all objects, if necessary, then idr_destroy() to
 * free the memory used to keep track of those objects.
 */
void idr_destroy(struct idr *idr)
{
        struct radix_tree_node *head;

        /* Free the node tree, if the head refers to an internal node. */
        head = rcu_dereference_raw(idr->idr_rt.xa_head);
        if (radix_tree_is_internal_node(head))
                radix_tree_free_nodes(head);

        /* Reset the root: no head, and tagged IDR_FREE again. */
        idr->idr_rt.xa_head = NULL;
        root_tag_set(&idr->idr_rt, IDR_FREE);
}
EXPORT_SYMBOL(idr_destroy);

/*
 * Constructor passed to kmem_cache_create() for radix tree nodes: zero the
 * whole node, then initialise its private_list head.
 */
static void radix_tree_node_ctor(void *arg)
{
        struct radix_tree_node *fresh = arg;

        memset(fresh, 0, sizeof(struct radix_tree_node));
        INIT_LIST_HEAD(&fresh->private_list);
}

/*
 * CPU hotplug callback (registered for CPUHP_RADIX_DEAD): hand every node
 * still sitting in @cpu's preload pool back to the node cache.
 * Always returns 0.
 */
static int radix_tree_cpu_dead(unsigned int cpu)
{
        struct radix_tree_preload *pool = &per_cpu(radix_tree_preloads, cpu);

        /* The pool is a list chained through ->parent, ->nr entries long. */
        for (; pool->nr; pool->nr--) {
                struct radix_tree_node *victim = pool->nodes;

                pool->nodes = victim->parent;
                kmem_cache_free(radix_tree_node_cachep, victim);
        }

        return 0;
}

/*
 * Boot-time setup: create the slab cache used for radix tree nodes and
 * register the CPU-dead hotplug callback that drains per-cpu preloads.
 */
void __init radix_tree_init(void)
{
        int ret;

        /* Build-time assertions on the constants this file relies on. */
        BUILD_BUG_ON(XA_CHUNK_SIZE > 255);
        BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK);
        BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);

        radix_tree_node_cachep =
                kmem_cache_create("radix_tree_node",
                                  sizeof(struct radix_tree_node), 0,
                                  SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
                                  radix_tree_node_ctor);

        /* No startup callback, only the teardown one. */
        ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
                                        NULL, radix_tree_cpu_dead);
        WARN_ON(ret < 0);
}















































































    3 
    3 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NLS_H
#define _LINUX_NLS_H

#include <linux/init.h>

/* Unicode has changed over the years.  Unicode code points no longer
 * fit into 16 bits; as of Unicode 5 valid code points range from 0
 * to 0x10ffff (17 planes, where each plane holds 65536 code points).
 *
 * The original decision to represent Unicode characters as 16-bit
 * wchar_t values is now outdated.  But plane 0 still includes the
 * most commonly used characters, so we will retain it.  The newer
 * 32-bit unicode_t type can be used when it is necessary to
 * represent the full Unicode character set.
 */

/* Plane-0 Unicode character: 16 bits wide, so never above MAX_WCHAR_T */
typedef u16 wchar_t;
#define MAX_WCHAR_T        0xffff

/* Arbitrary Unicode character: 32 bits wide, can hold any code point */
typedef u32 unicode_t;

/*
 * Description of one character set.  The field notes below are limited to
 * what the inline helpers in this header rely on.
 */
struct nls_table {
        const char *charset;
        const char *alias;
        /*
         * Called by nls_nullsize() as uni2char(0, buf, sizeof(buf)); a
         * positive return value is used there as a length in bytes.
         */
        int (*uni2char) (wchar_t uni, unsigned char *out, int boundlen);
        int (*char2uni) (const unsigned char *rawstring, int boundlen,
                         wchar_t *uni);
        /*
         * Case tables indexed by byte value; nls_tolower()/nls_toupper()
         * treat a zero entry as "keep the byte unchanged".
         */
        const unsigned char *charset2lower;
        const unsigned char *charset2upper;
        /* presumably the module passed to __register_nls() - TODO confirm */
        struct module *owner;
        struct nls_table *next;
};

/* Maximum number of octets a single character can occupy in any charset */
#define NLS_MAX_CHARSET_SIZE 6 /* for UTF-8 */

/*
 * Byte order for UTF-16 strings; taken as the @endian argument of
 * utf8s_to_utf16s() and utf16s_to_utf8s().
 */
enum utf16_endian {
        UTF16_HOST_ENDIAN,
        UTF16_LITTLE_ENDIAN,
        UTF16_BIG_ENDIAN
};

/* nls_base.c */
extern int __register_nls(struct nls_table *, struct module *);
extern int unregister_nls(struct nls_table *);
extern struct nls_table *load_nls(const char *charset);
extern void unload_nls(struct nls_table *);
extern struct nls_table *load_nls_default(void);
/* Shorthand for __register_nls() with THIS_MODULE as the module argument */
#define register_nls(nls) __register_nls((nls), THIS_MODULE)

extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu);
extern int utf32_to_utf8(unicode_t u, u8 *s, int maxlen);
extern int utf8s_to_utf16s(const u8 *s, int len,
                enum utf16_endian endian, wchar_t *pwcs, int maxlen);
extern int utf16s_to_utf8s(const wchar_t *pwcs, int len,
                enum utf16_endian endian, u8 *s, int maxlen);

/*
 * Map @c through @t's charset2lower table.  A zero table entry means the
 * byte has no mapping and is returned unchanged.
 */
static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c)
{
        unsigned char mapped = t->charset2lower[c];

        if (mapped == 0)
                return c;
        return mapped;
}

/*
 * Map @c through @t's charset2upper table.  A zero table entry means the
 * byte has no mapping and is returned unchanged.
 */
static inline unsigned char nls_toupper(struct nls_table *t, unsigned char c)
{
        unsigned char mapped = t->charset2upper[c];

        if (mapped == 0)
                return c;
        return mapped;
}

/*
 * Compare @len bytes of @s1 and @s2 after lower-casing each byte with
 * nls_tolower().  Returns 0 when all @len bytes match and 1 at the first
 * difference; the result carries no ordering information.
 */
static inline int nls_strnicmp(struct nls_table *t, const unsigned char *s1,
                const unsigned char *s2, int len)
{
        for (; len != 0; len--) {
                unsigned char lhs = nls_tolower(t, *s1++);
                unsigned char rhs = nls_tolower(t, *s2++);

                if (lhs != rhs)
                        return 1;
        }

        return 0;
}

/*
 * nls_nullsize - return length of null character for codepage
 * @codepage - codepage for which to return length of NULL terminator
 *
 * Since we can't guarantee that the null terminator will be a particular
 * length, we have to check against the codepage. If there's a problem
 * determining it, assume a single-byte NULL terminator.
 *
 * Returns the value reported by the codepage's uni2char() for character 0
 * when that value is positive, otherwise 1.
 */
static inline int
nls_nullsize(const struct nls_table *codepage)
{
        int charlen;
        /*
         * uni2char() takes an "unsigned char *" output buffer, so declare
         * the scratch buffer with that element type instead of plain char.
         */
        unsigned char tmp[NLS_MAX_CHARSET_SIZE];

        charlen = codepage->uni2char(0, tmp, NLS_MAX_CHARSET_SIZE);

        return charlen > 0 ? charlen : 1;
}

#define MODULE_ALIAS_NLS(name)        MODULE_ALIAS("nls_" __stringify(name))

#endif /* _LINUX_NLS_H */







































































































































































































































































































    1 






    1 












    1 
















    1 




























    1 
























    2 






    1 








    1 
    1 
















































































































































































































    2 










    1 
    1 
    2 

    1 

    1 













    1 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
/*
 *  linux/fs/hfs/inode.c
 *
 * Copyright (C) 1995-1997  Paul H. Hargrove
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 * This file may be distributed under the terms of the GNU General Public License.
 *
 * This file contains inode-related functions which do not depend on
 * which scheme is being used to represent forks.
 *
 * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds
 */

#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/uio.h>
#include <linux/xattr.h>
#include <linux/blkdev.h>

#include "hfs_fs.h"
#include "btree.h"

static const struct file_operations hfs_file_operations;
static const struct inode_operations hfs_file_inode_operations;

/*================ Variable-like macros ================*/

/* Mask made of the regular-file and directory type bits plus S_IRWXUGO */
#define HFS_VALID_MODE_BITS  (S_IFREG | S_IFDIR | S_IRWXUGO)

/*
 * ->read_folio handler: delegates to block_read_full_folio() with
 * hfs_get_block as the block-mapping callback.  @file is not used.
 */
static int hfs_read_folio(struct file *file, struct folio *folio)
{
        int err = block_read_full_folio(folio, hfs_get_block);

        return err;
}

/*
 * Undo the effects of a failed write that was meant to end at offset @to:
 * if that lies beyond the inode's current size, drop the page cache past
 * i_size and truncate the file back.
 */
static void hfs_write_failed(struct address_space *mapping, loff_t to)
{
        struct inode *host = mapping->host;

        /* The write stayed within i_size: nothing to trim. */
        if (to <= host->i_size)
                return;

        truncate_pagecache(host, host->i_size);
        hfs_file_truncate(host);
}

/*
 * ->write_begin handler: wraps cont_write_begin(), passing hfs_get_block
 * and the inode's phys_size.  On failure hfs_write_failed() is called for
 * the range end before the error is returned.
 */
int hfs_write_begin(struct file *file, struct address_space *mapping,
                loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
        int err;

        *pagep = NULL;
        err = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
                               hfs_get_block,
                               &HFS_I(mapping->host)->phys_size);
        if (unlikely(err)) {
                loff_t end = pos + len;

                hfs_write_failed(mapping, end);
        }

        return err;
}

/*
 * ->bmap handler: delegates to generic_block_bmap() with hfs_get_block as
 * the block-mapping callback.
 */
static sector_t hfs_bmap(struct address_space *mapping, sector_t block)
{
        sector_t mapped = generic_block_bmap(mapping, block, hfs_get_block);

        return mapped;
}

/*
 * ->release_folio handler for the B-tree address spaces.
 *
 * Looks up the hashed bnode(s) whose index range corresponds to @folio.
 * Nodes with a zero refcnt are unhashed and freed; a node with a non-zero
 * refcnt makes the release fail.  Only when nothing vetoed the release is
 * try_to_free_buffers() called, and its result returned.  @mask is unused.
 *
 * Must only be called for the extents or catalog tree inode; any other
 * inode number hits BUG().
 */
static bool hfs_release_folio(struct folio *folio, gfp_t mask)
{
        struct inode *inode = folio->mapping->host;
        struct super_block *sb = inode->i_sb;
        struct hfs_btree *tree;
        struct hfs_bnode *node;
        u32 nidx;
        int remaining;
        bool res = true;

        if (inode->i_ino == HFS_EXT_CNID) {
                tree = HFS_SB(sb)->ext_tree;
        } else if (inode->i_ino == HFS_CAT_CNID) {
                tree = HFS_SB(sb)->cat_tree;
        } else {
                BUG();
                return false;
        }

        if (!tree)
                return false;

        if (tree->node_size >= PAGE_SIZE) {
                /* A node is at least a page: one node index for this folio. */
                nidx = folio->index >> (tree->node_size_shift - PAGE_SHIFT);
                spin_lock(&tree->hash_lock);
                node = hfs_bnode_findhash(tree, nidx);
                if (node) {
                        if (atomic_read(&node->refcnt)) {
                                res = false;
                        } else {
                                hfs_bnode_unhash(node);
                                hfs_bnode_free(node);
                        }
                }
                spin_unlock(&tree->hash_lock);
        } else {
                /* Nodes are smaller than a page: walk each index in it. */
                nidx = folio->index << (PAGE_SHIFT - tree->node_size_shift);
                remaining = 1 << (PAGE_SHIFT - tree->node_size_shift);
                spin_lock(&tree->hash_lock);
                do {
                        node = hfs_bnode_findhash(tree, nidx++);
                        if (node) {
                                if (atomic_read(&node->refcnt)) {
                                        /* Still referenced: stop and refuse. */
                                        res = false;
                                        break;
                                }
                                hfs_bnode_unhash(node);
                                hfs_bnode_free(node);
                        }
                } while (--remaining && nidx < tree->node_count);
                spin_unlock(&tree->hash_lock);
        }

        if (!res)
                return false;
        return try_to_free_buffers(folio);
}

/*
 * ->direct_IO handler: hands the request to blockdev_direct_IO() with
 * hfs_get_block.  Returns whatever blockdev_direct_IO() returned.
 */
static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        struct inode *inode = mapping->host;
        /* Sampled before the I/O call below is made. */
        size_t count = iov_iter_count(iter);
        ssize_t ret = blockdev_direct_IO(iocb, inode, iter, hfs_get_block);

        /*
         * In case of error extending write may have instantiated a few
         * blocks outside i_size. Trim these off again.
         */
        if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
                loff_t end = iocb->ki_pos + count;

                if (end > i_size_read(inode))
                        hfs_write_failed(mapping, end);
        }

        return ret;
}

/*
 * ->writepages handler: delegates to mpage_writepages() with hfs_get_block
 * as the block-mapping callback.
 */
static int hfs_writepages(struct address_space *mapping,
                          struct writeback_control *wbc)
{
        int err = mpage_writepages(mapping, wbc, hfs_get_block);

        return err;
}

/*
 * Address-space operations that include ->release_folio.  Compared with
 * hfs_aops there is no ->direct_IO.  hfs_release_folio() BUG()s for any
 * inode other than the extents/catalog tree inodes, so this table is
 * presumably installed only on those - confirm at the assignment site.
 */
const struct address_space_operations hfs_btree_aops = {
        .dirty_folio        = block_dirty_folio,
        .invalidate_folio = block_invalidate_folio,
        .read_folio        = hfs_read_folio,
        .writepages        = hfs_writepages,
        .write_begin        = hfs_write_begin,
        .write_end        = generic_write_end,
        .migrate_folio        = buffer_migrate_folio,
        .bmap                = hfs_bmap,
        .release_folio        = hfs_release_folio,
};

/*
 * Address-space operations installed on regular-file inodes by
 * hfs_new_inode() and hfs_read_inode().  Provides ->direct_IO and has no
 * ->release_folio.
 */
const struct address_space_operations hfs_aops = {
        .dirty_folio        = block_dirty_folio,
        .invalidate_folio = block_invalidate_folio,
        .read_folio        = hfs_read_folio,
        .write_begin        = hfs_write_begin,
        .write_end        = generic_write_end,
        .bmap                = hfs_bmap,
        .direct_IO        = hfs_direct_IO,
        .writepages        = hfs_writepages,
        .migrate_folio        = buffer_migrate_folio,
};

/*
 * hfs_new_inode
 *
 * Allocate and initialise an in-core inode for a new file or directory
 * named @name inside @dir.  The inode number is taken from the superblock's
 * next_id counter, the per-volume file/folder counters are updated, and
 * both the inode and the MDB are marked dirty.
 *
 * Returns the new inode, or NULL if new_inode() fails.
 */
struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t mode)
{
        struct super_block *sb = dir->i_sb;
        struct inode *inode = new_inode(sb);
        if (!inode)
                return NULL;

        mutex_init(&HFS_I(inode)->extents_lock);
        INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
        spin_lock_init(&HFS_I(inode)->open_dir_lock);
        /* Catalog key is built from the parent's inode number and @name. */
        hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
        inode->i_ino = HFS_SB(sb)->next_id++;
        inode->i_mode = mode;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        set_nlink(inode, 1);
        simple_inode_init_ts(inode);
        HFS_I(inode)->flags = 0;
        HFS_I(inode)->rsrc_inode = NULL;
        HFS_I(inode)->fs_blocks = 0;
        if (S_ISDIR(mode)) {
                inode->i_size = 2;
                HFS_SB(sb)->folder_count++;
                if (dir->i_ino == HFS_ROOT_CNID)
                        HFS_SB(sb)->root_dirs++;
                inode->i_op = &hfs_dir_inode_operations;
                inode->i_fop = &hfs_dir_operations;
                /* Directories get every rwx bit, minus the mount's dir umask. */
                inode->i_mode |= S_IRWXUGO;
                inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask;
        } else if (S_ISREG(mode)) {
                HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks;
                HFS_SB(sb)->file_count++;
                if (dir->i_ino == HFS_ROOT_CNID)
                        HFS_SB(sb)->root_files++;
                inode->i_op = &hfs_file_inode_operations;
                inode->i_fop = &hfs_file_operations;
                inode->i_mapping->a_ops = &hfs_aops;
                /*
                 * Read and execute bits are always set; write bits are set
                 * for everyone only if the caller asked for owner write.
                 * The mount's file umask is then applied.
                 */
                inode->i_mode |= S_IRUGO|S_IXUGO;
                if (mode & S_IWUSR)
                        inode->i_mode |= S_IWUGO;
                inode->i_mode &= ~HFS_SB(inode->i_sb)->s_file_umask;
                HFS_I(inode)->phys_size = 0;
                HFS_I(inode)->alloc_blocks = 0;
                HFS_I(inode)->first_blocks = 0;
                HFS_I(inode)->cached_start = 0;
                HFS_I(inode)->cached_blocks = 0;
                memset(HFS_I(inode)->first_extents, 0, sizeof(hfs_extent_rec));
                memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec));
        }
        /* Any other file type gets neither counters nor operations here. */
        insert_inode_hash(inode);
        mark_inode_dirty(inode);
        set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
        hfs_mark_mdb_dirty(sb);

        return inode;
}

/*
 * Account for the removal of @inode in the superblock counters and mark
 * the MDB dirty.  Directories decrement folder_count (and root_dirs when
 * the parent is the root directory); everything else decrements file_count
 * (and root_files likewise).  A regular file with no remaining links is
 * additionally truncated to zero length.
 */
void hfs_delete_inode(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        int in_root;

        hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino);

        in_root = HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID);

        if (S_ISDIR(inode->i_mode)) {
                HFS_SB(sb)->folder_count--;
                if (in_root)
                        HFS_SB(sb)->root_dirs--;
        } else {
                HFS_SB(sb)->file_count--;
                if (in_root)
                        HFS_SB(sb)->root_files--;
                if (S_ISREG(inode->i_mode) && !inode->i_nlink) {
                        inode->i_size = 0;
                        hfs_file_truncate(inode);
                }
        }

        set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags);
        hfs_mark_mdb_dirty(sb);
}

/*
 * Fill in the in-core fork description of @inode from on-disk values.
 *
 * @ext:        first extent record of the fork (three extents)
 * @__log_size: logical fork length, big-endian
 * @phys_size:  physical fork length, big-endian
 * @clump_size: clump size in bytes; when it is smaller than one allocation
 *              block the volume default (clumpablks) is used instead
 */
void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
                         __be32 __log_size, __be32 phys_size, u32 clump_size)
{
        struct super_block *sb = inode->i_sb;
        u32 log_size = be32_to_cpu(__log_size);
        u16 total = 0;
        int idx = 0;

        memcpy(HFS_I(inode)->first_extents, ext, sizeof(hfs_extent_rec));

        /* Sum the block counts of the three extents in the first record. */
        while (idx < 3) {
                total += be16_to_cpu(ext[idx].count);
                idx++;
        }
        HFS_I(inode)->first_blocks = total;

        /* Both phys_size and i_size start out as the logical length. */
        HFS_I(inode)->phys_size = log_size;
        inode->i_size = HFS_I(inode)->phys_size;

        /* Logical length rounded up to whole filesystem blocks. */
        HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
        inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits);

        HFS_I(inode)->alloc_blocks = be32_to_cpu(phys_size) /
                                     HFS_SB(sb)->alloc_blksz;

        HFS_I(inode)->clump_blocks = clump_size / HFS_SB(sb)->alloc_blksz;
        if (HFS_I(inode)->clump_blocks)
                return;
        HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks;
}

/* Argument bundle handed to hfs_test_inode() and hfs_read_inode(). */
struct hfs_iget_data {
        /* Catalog key to copy into the inode; NULL marks a resource-fork inode. */
        struct hfs_cat_key *key;
        /* Catalog record the inode is initialised from / matched against. */
        hfs_cat_rec *rec;
};

/*
 * Match callback passed to iget5_locked(): returns non-zero when @inode's
 * number equals the CNID stored in the catalog record carried by @data.
 * A record that is neither a directory nor a file record hits BUG().
 */
static int hfs_test_inode(struct inode *inode, void *data)
{
        struct hfs_iget_data *idata = data;
        hfs_cat_rec *rec = idata->rec;

        if (rec->type == HFS_CDR_DIR)
                return inode->i_ino == be32_to_cpu(rec->dir.DirID);

        if (rec->type == HFS_CDR_FIL)
                return inode->i_ino == be32_to_cpu(rec->file.FlNum);

        BUG();
        return 1;
}

/*
 * hfs_read_inode
 *
 * Initialise @inode from the catalog record in @data (a struct
 * hfs_iget_data).  Used as the "set" callback of iget5_locked() and called
 * directly by hfs_file_lookup() for resource-fork inodes, in which case
 * the key in @data is NULL.
 *
 * A record that is neither a file nor a directory record turns the inode
 * into a bad inode.  Always returns 0.
 */
static int hfs_read_inode(struct inode *inode, void *data)
{
        struct hfs_iget_data *idata = data;
        struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
        hfs_cat_rec *rec;

        HFS_I(inode)->flags = 0;
        HFS_I(inode)->rsrc_inode = NULL;
        mutex_init(&HFS_I(inode)->extents_lock);
        INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
        spin_lock_init(&HFS_I(inode)->open_dir_lock);

        /* Initialize the inode */
        inode->i_uid = hsb->s_uid;
        inode->i_gid = hsb->s_gid;
        set_nlink(inode, 1);

        /* Without a key this inode represents a resource fork. */
        if (idata->key)
                HFS_I(inode)->cat_key = *idata->key;
        else
                HFS_I(inode)->flags |= HFS_FLG_RSRC;
        HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60;

        rec = idata->rec;
        switch (rec->type) {
        case HFS_CDR_FIL:
                /* Pick the data-fork or resource-fork fields of the record. */
                if (!HFS_IS_RSRC(inode)) {
                        hfs_inode_read_fork(inode, rec->file.ExtRec, rec->file.LgLen,
                                            rec->file.PyLen, be16_to_cpu(rec->file.ClpSize));
                } else {
                        hfs_inode_read_fork(inode, rec->file.RExtRec, rec->file.RLgLen,
                                            rec->file.RPyLen, be16_to_cpu(rec->file.ClpSize));
                }

                inode->i_ino = be32_to_cpu(rec->file.FlNum);
                /* Write permission is granted unless the record is locked. */
                inode->i_mode = S_IRUGO | S_IXUGO;
                if (!(rec->file.Flags & HFS_FIL_LOCK))
                        inode->i_mode |= S_IWUGO;
                inode->i_mode &= ~hsb->s_file_umask;
                inode->i_mode |= S_IFREG;
                /* ctime, atime and mtime are all set from the record's MdDat. */
                inode_set_mtime_to_ts(inode,
                                      inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->file.MdDat))));
                inode->i_op = &hfs_file_inode_operations;
                inode->i_fop = &hfs_file_operations;
                inode->i_mapping->a_ops = &hfs_aops;
                break;
        case HFS_CDR_DIR:
                inode->i_ino = be32_to_cpu(rec->dir.DirID);
                /* Size is the record's Val plus 2; hfs_write_inode() stores i_size - 2. */
                inode->i_size = be16_to_cpu(rec->dir.Val) + 2;
                HFS_I(inode)->fs_blocks = 0;
                inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask);
                /* ctime, atime and mtime are all set from the record's MdDat. */
                inode_set_mtime_to_ts(inode,
                                      inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->dir.MdDat))));
                inode->i_op = &hfs_dir_inode_operations;
                inode->i_fop = &hfs_dir_operations;
                break;
        default:
                make_bad_inode(inode);
        }
        return 0;
}

/*
 * hfs_iget()
 *
 * Given the superblock of an HFS filesystem, a catalog 'key' and the
 * catalog record 'rec' found for it, return the inode for that file or
 * directory.  The inode number is the CNID stored in the record.
 *
 * Returns NULL when the record is neither a directory nor a file record,
 * or when iget5_locked() returns NULL.  A newly created inode is unlocked
 * before it is returned.
 */
struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec)
{
        struct hfs_iget_data data = { .key = key, .rec = rec };
        struct inode *inode;
        u32 cnid;

        switch (rec->type) {
        case HFS_CDR_FIL:
                cnid = be32_to_cpu(rec->file.FlNum);
                break;
        case HFS_CDR_DIR:
                cnid = be32_to_cpu(rec->dir.DirID);
                break;
        default:
                return NULL;
        }

        inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data);
        if (!inode)
                return NULL;

        /* Only an inode freshly set up by hfs_read_inode() is still locked. */
        if (inode->i_state & I_NEW)
                unlock_new_inode(inode);

        return inode;
}

/*
 * Export the in-core fork description of @inode to on-disk form.
 *
 * @ext receives a copy of the first extent record.  @log_size and
 * @phys_size are optional (may be NULL); when given they receive the
 * big-endian logical size (i_size) and physical size (alloc_blocks times
 * the allocation block size).
 */
void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext,
                          __be32 *log_size, __be32 *phys_size)
{
        memcpy(ext, HFS_I(inode)->first_extents, sizeof(hfs_extent_rec));

        if (log_size != NULL)
                *log_size = cpu_to_be32(inode->i_size);

        if (phys_size != NULL) {
                *phys_size = cpu_to_be32(HFS_I(inode)->alloc_blocks *
                                         HFS_SB(inode->i_sb)->alloc_blksz);
        }
}

/*
 * hfs_write_inode
 *
 * Write the metadata of @inode back to its catalog record.
 *
 * The inode's extent record is written first.  The extents and catalog
 * tree inodes are then handled by writing the tree itself.  For every
 * other inode the catalog record is looked up with the key stored in the
 * main inode (for a resource-fork inode that is the inode recorded in
 * rsrc_inode) and updated in place.  Inodes without links are skipped.
 *
 * The record found must have the expected type and carry this inode's
 * CNID; a mismatching record is left untouched and -EIO is returned
 * rather than overwriting it.
 *
 * @wbc is unused.
 *
 * Returns 0 on success, the error from hfs_ext_write_extent(), or -EIO.
 */
int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
        struct inode *main_inode = inode;
        struct hfs_find_data fd;
        hfs_cat_rec rec;
        int res;

        hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino);
        res = hfs_ext_write_extent(inode);
        if (res)
                return res;

        if (inode->i_ino < HFS_FIRSTUSER_CNID) {
                switch (inode->i_ino) {
                case HFS_ROOT_CNID:
                        /* The root directory has an ordinary catalog record. */
                        break;
                case HFS_EXT_CNID:
                        hfs_btree_write(HFS_SB(inode->i_sb)->ext_tree);
                        return 0;
                case HFS_CAT_CNID:
                        hfs_btree_write(HFS_SB(inode->i_sb)->cat_tree);
                        return 0;
                default:
                        BUG();
                        return -EIO;
                }
        }

        if (HFS_IS_RSRC(inode))
                main_inode = HFS_I(inode)->rsrc_inode;

        if (!main_inode->i_nlink)
                return 0;

        if (hfs_find_init(HFS_SB(main_inode->i_sb)->cat_tree, &fd))
                /* panic? */
                return -EIO;

        /* Every "goto out" below reports -EIO until res is cleared. */
        res = -EIO;
        if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN)
                goto out;
        fd.search_key->cat = HFS_I(main_inode)->cat_key;
        if (hfs_brec_find(&fd))
                goto out;

        if (S_ISDIR(main_inode->i_mode)) {
                if (fd.entrylength < sizeof(struct hfs_cat_dir))
                        goto out;
                hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
                           sizeof(struct hfs_cat_dir));
                /* Refuse to overwrite a record that is not this directory's. */
                if (rec.type != HFS_CDR_DIR ||
                    be32_to_cpu(rec.dir.DirID) != inode->i_ino)
                        goto out;

                rec.dir.MdDat = hfs_u_to_mtime(inode_get_mtime(inode));
                rec.dir.Val = cpu_to_be16(inode->i_size - 2);

                hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
                            sizeof(struct hfs_cat_dir));
        } else if (HFS_IS_RSRC(inode)) {
                /* Resource fork: only the resource extent/size fields change. */
                if (fd.entrylength < sizeof(struct hfs_cat_file))
                        goto out;
                hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
                               sizeof(struct hfs_cat_file));
                hfs_inode_write_fork(inode, rec.file.RExtRec,
                                     &rec.file.RLgLen, &rec.file.RPyLen);
                hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
                                sizeof(struct hfs_cat_file));
        } else {
                if (fd.entrylength < sizeof(struct hfs_cat_file))
                        goto out;
                hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
                           sizeof(struct hfs_cat_file));
                /* Refuse to overwrite a record that is not this file's. */
                if (rec.type != HFS_CDR_FIL ||
                    be32_to_cpu(rec.file.FlNum) != inode->i_ino)
                        goto out;

                /* The lock flag mirrors the absence of owner write permission. */
                if (inode->i_mode & S_IWUSR)
                        rec.file.Flags &= ~HFS_FIL_LOCK;
                else
                        rec.file.Flags |= HFS_FIL_LOCK;
                hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen);
                rec.file.MdDat = hfs_u_to_mtime(inode_get_mtime(inode));

                hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
                            sizeof(struct hfs_cat_file));
        }
        res = 0;
out:
        hfs_find_exit(&fd);
        return res;
}

/*
 * hfs_file_lookup()
 *
 * ->lookup() hook of hfs_file_inode_operations.  The only name that
 * resolves is "rsrc", looked up on an inode that is not itself flagged
 * HFS_IS_RSRC(); it yields a second in-core inode built from the catalog
 * record found under @dir's own catalog key.  Every other lookup splices
 * a NULL inode into @dentry.
 *
 * Returns the result of d_splice_alias(), or an ERR_PTR() when inode
 * allocation, hfs_find_init() or the catalog read fails.
 */
static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
                                      unsigned int flags)
{
        struct inode *inode = NULL;
        hfs_cat_rec rec;
        struct hfs_find_data fd;
        int res;

        /* Anything but "rsrc" under a non-resource inode: splice NULL. */
        if (HFS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
                goto out;

        /* Reuse the partner inode if an earlier lookup already linked one. */
        inode = HFS_I(dir)->rsrc_inode;
        if (inode)
                goto out;

        inode = new_inode(dir->i_sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);

        res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
        if (res) {
                iput(inode);
                return ERR_PTR(res);
        }
        /* Read the catalog record stored under @dir's catalog key. */
        fd.search_key->cat = HFS_I(dir)->cat_key;
        res = hfs_brec_read(&fd, &rec, sizeof(rec));
        if (!res) {
                /*
                 * NOTE(review): the NULL first member presumably tells
                 * hfs_read_inode() to set up the resource fork - confirm.
                 */
                struct hfs_iget_data idata = { NULL, &rec };
                hfs_read_inode(inode, &idata);
        }
        hfs_find_exit(&fd);
        if (res) {
                iput(inode);
                return ERR_PTR(res);
        }
        /*
         * Cross-link the two inodes.  The new inode holds a reference on
         * @dir via igrab(); the matching iput() appears to be the one in
         * hfs_evict_inode().
         */
        HFS_I(inode)->rsrc_inode = dir;
        HFS_I(dir)->rsrc_inode = inode;
        igrab(dir);
        inode_fake_hash(inode);
        mark_inode_dirty(inode);
        dont_mount(dentry);
out:
        return d_splice_alias(inode, dentry);
}

/*
 * hfs_evict_inode()
 *
 * Drop the page cache and clear the inode.  An inode flagged
 * HFS_IS_RSRC() that is still linked to a partner inode additionally
 * clears the partner's back pointer and releases a reference on it.
 */
void hfs_evict_inode(struct inode *inode)
{
        struct inode *partner;

        truncate_inode_pages_final(&inode->i_data);
        clear_inode(inode);

        if (!HFS_IS_RSRC(inode))
                return;

        partner = HFS_I(inode)->rsrc_inode;
        if (!partner)
                return;

        HFS_I(partner)->rsrc_inode = NULL;
        iput(partner);
}

/*
 * hfs_file_open()
 *
 * Count the open.  For an inode flagged HFS_IS_RSRC() the counter that
 * is bumped is the one of the inode it is linked to via ->rsrc_inode.
 * Always returns 0.
 */
static int hfs_file_open(struct inode *inode, struct file *file)
{
        struct inode *counted = HFS_IS_RSRC(inode) ?
                                HFS_I(inode)->rsrc_inode : inode;

        atomic_inc(&HFS_I(counted)->opencnt);
        return 0;
}

/*
 * hfs_file_release()
 *
 * Undo the accounting done by hfs_file_open().  When the open count of
 * the counted inode drops to zero, hfs_file_truncate() is run on it with
 * the inode lock held.  Always returns 0.
 */
static int hfs_file_release(struct inode *inode, struct file *file)
{
        struct inode *counted = HFS_IS_RSRC(inode) ?
                                HFS_I(inode)->rsrc_inode : inode;

        /* Only the last closer has any work to do. */
        if (!atomic_dec_and_test(&HFS_I(counted)->opencnt))
                return 0;

        inode_lock(counted);
        hfs_file_truncate(counted);
        inode_unlock(counted);
        return 0;
}

/*
 * hfs_notify_change()
 *
 * Based very closely on fs/msdos/inode.c by Werner Almesberger
 *
 * This is the notify_change() field in the super_operations structure
 * for HFS file systems.  The purpose is to take the changes made to
 * an inode and apply them in a filesystem-dependent manner.  In this
 * case the process has a few tasks to do:
 *  1) prevent changes to the i_uid and i_gid fields.
 *  2) map file permissions to the closest allowable permissions
 *  3) Since multiple Linux files can share the same on-disk inode under
 *     HFS (for instance the data and resource forks of a file) a change
 *     to permissions must be applied to all other in-core inodes which
 *     correspond to the same HFS file.
 */

/*
 * ->setattr() hook (see hfs_file_inode_operations).
 *
 * @idmap:  not used; nop_mnt_idmap is passed to the generic helpers.
 * @dentry: dentry whose inode is being changed.
 * @attr:   requested changes; ia_mode may be rewritten here before it is
 *          handed to setattr_copy().
 *
 * Returns 0 on success or the error from setattr_prepare() or
 * inode_newsize_ok().
 */
int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                      struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
        int error;

        error = setattr_prepare(&nop_mnt_idmap, dentry,
                                attr); /* basic permission checks */
        if (error)
                return error;

        /*
         * No uid/gid changes away from the mount-wide values, no mode
         * change at all on directories, and only HFS_VALID_MODE_BITS may
         * be requested.
         *
         * NOTE(review): 'error' is always 0 when this point is reached
         * (a non-zero value returned above), so the rejected request is
         * silently ignored whether or not s_quiet is set.  If a failure
         * such as -EPERM was intended for the !s_quiet case, that would be
         * a userspace-visible change - confirm before altering.
         */
        if (((attr->ia_valid & ATTR_UID) &&
             (!uid_eq(attr->ia_uid, hsb->s_uid))) ||
            ((attr->ia_valid & ATTR_GID) &&
             (!gid_eq(attr->ia_gid, hsb->s_gid))) ||
            ((attr->ia_valid & ATTR_MODE) &&
             ((S_ISDIR(inode->i_mode) &&
               (attr->ia_mode != inode->i_mode)) ||
              (attr->ia_mode & ~HFS_VALID_MODE_BITS)))) {
                return hsb->s_quiet ? 0 : error;
        }

        if (attr->ia_valid & ATTR_MODE) {
                /* Only the 'w' bits can ever change and only all together. */
                if (attr->ia_mode & S_IWUSR)
                        attr->ia_mode = inode->i_mode | S_IWUGO;
                else
                        attr->ia_mode = inode->i_mode & ~S_IWUGO;
                /* Re-apply the mount's umask for this kind of inode. */
                attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask;
        }

        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
                /* Wait for in-flight direct I/O before resizing. */
                inode_dio_wait(inode);

                error = inode_newsize_ok(inode, attr->ia_size);
                if (error)
                        return error;

                /* Update i_size and page cache, then the on-disk extents. */
                truncate_setsize(inode, attr->ia_size);
                hfs_file_truncate(inode);
                simple_inode_init_ts(inode);
        }

        setattr_copy(&nop_mnt_idmap, inode, attr);
        mark_inode_dirty(inode);
        return 0;
}

/*
 * hfs_file_fsync()
 *
 * Write back and wait on the byte range [@start, @end] of @filp, then,
 * with the inode lock held, write the inode, flush the pending mdb_work
 * and sync the backing block device.
 *
 * Returns the error from the data writeback if it failed; otherwise the
 * inode write error if any, otherwise the block device sync result.
 */
static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
                          int datasync)
{
        struct inode *inode = filp->f_mapping->host;
        struct super_block *sb = inode->i_sb;
        int inode_err, bdev_err;
        int ret;

        ret = file_write_and_wait_range(filp, start, end);
        if (ret)
                return ret;

        inode_lock(inode);
        /* inode -> buffers */
        inode_err = write_inode_now(inode, 0);
        /* superblock -> buffers */
        flush_delayed_work(&HFS_SB(sb)->mdb_work);
        /* buffers -> disk */
        bdev_err = sync_blockdev(sb->s_bdev);
        inode_unlock(inode);

        return inode_err ? inode_err : bdev_err;
}

/*
 * File operations table: generic helpers for data access plus the
 * HFS-specific open/release/fsync hooks defined above.
 */
static const struct file_operations hfs_file_operations = {
        /* HFS-specific hooks */
        .open           = hfs_file_open,
        .release        = hfs_file_release,
        .fsync          = hfs_file_fsync,
        /* generic helpers */
        .llseek         = generic_file_llseek,
        .read_iter      = generic_file_read_iter,
        .write_iter     = generic_file_write_iter,
        .splice_read    = filemap_splice_read,
        .mmap           = generic_file_mmap,
};

/*
 * Inode operations table: ->lookup resolves the "rsrc" pseudo entry,
 * ->setattr filters attribute changes, ->listxattr is the generic helper.
 */
static const struct inode_operations hfs_file_inode_operations = {
        .setattr        = hfs_inode_setattr,
        .listxattr      = generic_listxattr,
        .lookup         = hfs_file_lookup,
};























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * net busy poll support
 * Copyright(c) 2013 Intel Corporation.
 *
 * Author: Eliezer Tamir
 *
 * Contact Information:
 * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
 */

#ifndef _LINUX_NET_BUSY_POLL_H
#define _LINUX_NET_BUSY_POLL_H

#include <linux/netdevice.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>
#include <net/ip.h>
#include <net/xdp.h>

/*                0 - Reserved to indicate value not set
 *     1..NR_CPUS - Reserved for sender_cpu
 *  NR_CPUS+1..~0 - Region available for NAPI IDs
 */
/* Smallest value treated as a valid NAPI ID by the helpers below. */
#define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1))

/* Budget handed to napi_busy_loop() when the socket's own budget is 0. */
#define BUSY_POLL_BUDGET 8

#ifdef CONFIG_NET_RX_BUSY_POLL

struct napi_struct;
/* Not read in this header.  NOTE(review): presumably the default for sk_ll_usec - confirm. */
extern unsigned int sysctl_net_busy_read __read_mostly;
/* Global busy-poll setting; zero means off, otherwise used as the timeout by busy_loop_timeout(). */
extern unsigned int sysctl_net_busy_poll __read_mostly;

/* True when the global sysctl_net_busy_poll setting is non-zero. */
static inline bool net_busy_loop_on(void)
{
        return READ_ONCE(sysctl_net_busy_poll) != 0;
}

/*
 * A socket may busy poll when its sk_ll_usec is non-zero and the current
 * task has no signal pending.
 */
static inline bool sk_can_busy_loop(const struct sock *sk)
{
        if (!READ_ONCE(sk->sk_ll_usec))
                return false;

        return !signal_pending(current);
}

/*
 * loop_end callback used for sockets: sk_busy_loop() hands it to
 * napi_busy_loop() with the socket as the opaque argument.  Defined
 * elsewhere.
 */
bool sk_busy_loop_end(void *p, unsigned long start_time);

/*
 * Busy poll the NAPI instance identified by @napi_id.  @loop_end may be
 * NULL (sk_busy_loop() passes NULL for non-blocking callers); otherwise
 * it receives @loop_end_arg.  Defined elsewhere.
 * NOTE(review): exact termination semantics are not visible in this
 * header - see the implementation.
 */
void napi_busy_loop(unsigned int napi_id,
                    bool (*loop_end)(void *, unsigned long),
                    void *loop_end_arg, bool prefer_busy_poll, u16 budget);

/*
 * Same parameters as napi_busy_loop().
 * NOTE(review): presumably for callers already inside an RCU read-side
 * section - confirm against the implementation.
 */
void napi_busy_loop_rcu(unsigned int napi_id,
                        bool (*loop_end)(void *, unsigned long),
                        void *loop_end_arg, bool prefer_busy_poll, u16 budget);

#else /* CONFIG_NET_RX_BUSY_POLL */
/*
 * Busy polling is compiled out: always report it as disabled.  The
 * return type is bool so that this stub has the same signature as the
 * CONFIG_NET_RX_BUSY_POLL implementation.
 */
static inline bool net_busy_loop_on(void)
{
        return false;
}

/*
 * Busy polling is compiled out: no socket can busy poll.  @sk is taken
 * as a const pointer, matching the CONFIG_NET_RX_BUSY_POLL
 * implementation, so callers holding a const socket build either way.
 */
static inline bool sk_can_busy_loop(const struct sock *sk)
{
        return false;
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

/*
 * Timestamp for busy-poll deadlines: local_clock() shifted right by 10
 * bits, or 0 when busy polling is compiled out.
 * NOTE(review): the shift presumably approximates ns -> us - confirm.
 */
static inline unsigned long busy_loop_current_time(void)
{
#ifndef CONFIG_NET_RX_BUSY_POLL
        return 0;
#else
        return (unsigned long)(local_clock() >> 10);
#endif
}

/* in poll/select we use the global sysctl_net_busy_poll value */
static inline bool busy_loop_timeout(unsigned long start_time)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /*
         * Expired once the current busy-loop time is past
         * start_time + sysctl_net_busy_poll.  A zero sysctl counts as
         * already expired.
         */
        unsigned long limit = READ_ONCE(sysctl_net_busy_poll);

        if (!limit)
                return true;

        return time_after(busy_loop_current_time(), start_time + limit);
#else
        /* Busy polling compiled out: always expired. */
        return true;
#endif
}

static inline bool sk_busy_loop_timeout(struct sock *sk,
                                        unsigned long start_time)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /*
         * Per-socket variant of busy_loop_timeout(): the limit comes from
         * sk->sk_ll_usec.  A zero limit counts as already expired.
         */
        unsigned long limit = READ_ONCE(sk->sk_ll_usec);

        if (!limit)
                return true;

        return time_after(busy_loop_current_time(), start_time + limit);
#else
        /* Busy polling compiled out: always expired. */
        return true;
#endif
}

static inline void sk_busy_loop(struct sock *sk, int nonblock)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned int napi_id = READ_ONCE(sk->sk_napi_id);
        bool (*loop_end)(void *, unsigned long);
        u16 budget;

        /* IDs below MIN_NAPI_ID are not valid NAPI IDs: nothing to poll. */
        if (napi_id < MIN_NAPI_ID)
                return;

        /* Non-blocking callers pass no loop_end callback. */
        loop_end = nonblock ? NULL : sk_busy_loop_end;
        /* A zero per-socket budget falls back to BUSY_POLL_BUDGET. */
        budget = READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET;

        napi_busy_loop(napi_id, loop_end, sk,
                       READ_ONCE(sk->sk_prefer_busy_poll), budget);
#endif
}

/* used in the NIC receive handler to mark the skb */
static inline void skb_mark_napi_id(struct sk_buff *skb,
                                    struct napi_struct *napi)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* An skb that already carries a valid NAPI ID keeps it. */
        if (skb->napi_id >= MIN_NAPI_ID)
                return;

        skb->napi_id = napi->napi_id;
#endif
}

/* used in the protocol handler to propagate the napi_id to the socket */
static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        const unsigned int id = skb->napi_id;

        /* Only write when the value changes. */
        if (unlikely(READ_ONCE(sk->sk_napi_id) != id))
                WRITE_ONCE(sk->sk_napi_id, id);
#endif
        sk_rx_queue_update(sk, skb);
}

/* Variant of sk_mark_napi_id() for passive flow setup,
 * as sk->sk_napi_id and sk->sk_rx_queue_mapping content
 * needs to be set.
 */
static inline void sk_mark_napi_id_set(struct sock *sk,
                                       const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* Unconditional store, unlike the compare-first sk_mark_napi_id(). */
        WRITE_ONCE(sk->sk_napi_id, skb->napi_id);
#endif
        /* Done regardless of CONFIG_NET_RX_BUSY_POLL. */
        sk_rx_queue_set(sk, skb);
}

/* Record @napi_id in @sk only if the socket has no NAPI ID (zero) yet. */
static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        if (READ_ONCE(sk->sk_napi_id))
                return;

        WRITE_ONCE(sk->sk_napi_id, napi_id);
#endif
}

/* variant used for unconnected sockets */
static inline void sk_mark_napi_id_once(struct sock *sk,
                                        const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* Takes the ID from the skb; kept only if the socket's ID is still zero. */
        __sk_mark_napi_id_once(sk, skb->napi_id);
#endif
}

/* XDP variant of sk_mark_napi_id_once(): the ID comes from the buffer's rx queue info. */
static inline void sk_mark_napi_id_once_xdp(struct sock *sk,
                                            const struct xdp_buff *xdp)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        __sk_mark_napi_id_once(sk, xdp->rxq->napi_id);
#endif
}

#endif /* _LINUX_NET_BUSY_POLL_H */




































    1 






























    1 




















    4 










    4 













    4 






















































































































































    1 












    1 







    1 











    1 













    2 











    1 




    1 







    1 

    1 













    2 











    1 


    1 











    1 














    2 











    1 




    2 
    1 







    1 

    2 










































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/readdir.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 */

#include <linux/stddef.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/errno.h>
#include <linux/stat.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/dirent.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/compat.h>
#include <linux/uaccess.h>

#include <asm/unaligned.h>

/*
 * Some filesystems were never converted to '->iterate_shared()'
 * and their directory iterators want the inode lock held for
 * writing. This wrapper allows for converting from the shared
 * semantics to the exclusive inode use.
 */
int wrap_directory_iterator(struct file *file,
                            struct dir_context *ctx,
                            int (*iter)(struct file *, struct dir_context *))
{
        struct inode *dir = file_inode(file);
        int ret;

        /*
         * Entered with i_rwsem held for reading; @iter wants it held for
         * writing.  There is no 'inode_upgrade_trylock()' (compare the
         * comment in mmap_upgrade_trylock() in mm/memory.c), so simply
         * drop the read lock and take the write lock.  This only serves
         * filesystems that were never converted, so the simplistic
         * approach is good enough.
         *
         * The write lock is not taken killably: we must return with the
         * lock held for reading, so it has to be acquired even if the
         * task is dying.  The write part could be made killable followed
         * by an unconditional read lock, but see above for why it is not
         * worth it.
         */
        up_read(&dir->i_rwsem);
        down_write(&dir->i_rwsem);

        /*
         * The lock was dropped in between, so the DEADDIR test done by
         * iterate_dir() has to be repeated.  The f_pos handling does not:
         * the file must be locked wrt f_pos anyway.
         */
        if (IS_DEADDIR(dir))
                ret = -ENOENT;
        else
                ret = iter(file, ctx);

        /* Hand back a read lock, as the caller expects. */
        downgrade_write(&dir->i_rwsem);
        return ret;
}
EXPORT_SYMBOL(wrap_directory_iterator);

/*
 * Copy a directory entry name of '_len' bytes to user space and store a
 * terminating 0 byte right after it.  The terminator is written first,
 * then the name.
 *
 * Note the "unsafe_put_user()" semantics: we goto a
 * label for errors.
 */
#define unsafe_copy_dirent_name(_dst, _src, _len, label) do {        \
        char __user *dst = (_dst);                                \
        const char *src = (_src);                                \
        size_t len = (_len);                                        \
        unsafe_put_user(0, dst+len, label);                        \
        unsafe_copy_to_user(dst, src, len, label);                \
} while (0)


/*
 * iterate_dir - run a file's ->iterate_shared() under the shared inode lock
 * @file: open directory
 * @ctx:  iteration context; ->pos is loaded from and stored back to f_pos
 *
 * Returns -ENOTDIR when the file has no ->iterate_shared(), the error of
 * a failed permission check or interrupted lock acquisition, -ENOENT for
 * a dead directory, and otherwise whatever ->iterate_shared() returned.
 */
int iterate_dir(struct file *file, struct dir_context *ctx)
{
        struct inode *inode = file_inode(file);
        int res;

        if (!file->f_op->iterate_shared)
                return -ENOTDIR;

        res = security_file_permission(file, MAY_READ);
        if (res)
                return res;

        res = fsnotify_file_perm(file, MAY_READ);
        if (res)
                return res;

        res = down_read_killable(&inode->i_rwsem);
        if (res)
                return res;

        if (IS_DEADDIR(inode)) {
                res = -ENOENT;
        } else {
                ctx->pos = file->f_pos;
                res = file->f_op->iterate_shared(file, ctx);
                file->f_pos = ctx->pos;
                fsnotify_access(file);
                file_accessed(file);
        }

        inode_unlock_shared(inode);
        return res;
}
EXPORT_SYMBOL(iterate_dir);

/*
 * POSIX says that a dirent name cannot contain NULL or a '/'.
 *
 * It's not 100% clear what we should really do in this case.
 * The filesystem is clearly corrupted, but returning a hard
 * error means that you now don't see any of the other names
 * either, so that isn't a perfect alternative.
 *
 * And if you return an error, what error do you use? Several
 * filesystems seem to have decided on EUCLEAN being the error
 * code for EFSCORRUPTED, and that may be the error to use. Or
 * just EIO, which is perhaps more obvious to users.
 *
 * In order to see the other file names in the directory, the
 * caller might want to make this a "soft" error: skip the
 * entry, and return the error at the end instead.
 *
 * Note that this should likely do a "memchr(name, 0, len)"
 * check too, since that would be filesystem corruption as
 * well. However, that case can't actually confuse user space,
 * which has to do a strlen() on the name anyway to find the
 * filename length, and the above "soft error" worry means
 * that it's probably better left alone until we have that
 * issue clarified.
 *
 * Note the PATH_MAX check - it's arbitrary but the real
 * kernel limit on a possible path component, not NAME_MAX,
 * which is the technical standard limit.
 */
static int verify_dirent_name(const char *name, int len)
{
        /*
         * Acceptable names have a length in (0, PATH_MAX) and contain no
         * '/' within those bytes; everything else is reported as -EIO.
         */
        if (len > 0 && len < PATH_MAX && !memchr(name, '/', len))
                return 0;

        return -EIO;
}

/*
 * Traditional linux readdir() handling..
 *
 * "count=1" is a special case, meaning that the buffer is one
 * dirent-structure in size and that the code can't handle more
 * anyway. Thus the special "fillonedir()" function for that
 * case (the low-level handlers don't need to care about this).
 */

#ifdef __ARCH_WANT_OLD_READDIR

/* User-space record written by fillonedir() for the old_readdir syscall. */
struct old_linux_dirent {
        unsigned long        d_ino;                /* inode number */
        unsigned long        d_offset;        /* offset value passed to the actor */
        unsigned short        d_namlen;        /* name length, without the trailing 0 */
        char                d_name[];        /* name followed by a 0 byte */
};

/* Per-call state for old_readdir; fillonedir() recovers it from the embedded ctx. */
struct readdir_callback {
        struct dir_context ctx;
        struct old_linux_dirent __user * dirent;        /* destination record */
        int result;        /* 0: nothing stored yet, 1: one entry stored, <0: error */
};

/*
 * dir_context actor for old_readdir: store exactly one entry in the user
 * buffer.
 *
 * Returns true after storing the entry.  Returns false when an entry (or
 * an error) has already been recorded in buf->result, or when this entry
 * fails; in the failure case buf->result holds the negative error.
 */
static bool fillonedir(struct dir_context *ctx, const char *name, int namlen,
                      loff_t offset, u64 ino, unsigned int d_type)
{
        struct readdir_callback *buf =
                container_of(ctx, struct readdir_callback, ctx);
        struct old_linux_dirent __user * dirent;
        unsigned long d_ino;

        /* Non-zero result: already delivered one entry or hit an error. */
        if (buf->result)
                return false;
        buf->result = verify_dirent_name(name, namlen);
        if (buf->result)
                return false;
        d_ino = ino;
        /* The inode number must survive the narrowing to unsigned long. */
        if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
                buf->result = -EOVERFLOW;
                return false;
        }
        /* result was 0 here, so this marks "one entry stored". */
        buf->result++;
        dirent = buf->dirent;
        /* Open access for the fixed header plus the name and its 0 byte. */
        if (!user_write_access_begin(dirent,
                        (unsigned long)(dirent->d_name + namlen + 1) -
                                (unsigned long)dirent))
                goto efault;
        unsafe_put_user(d_ino, &dirent->d_ino, efault_end);
        unsafe_put_user(offset, &dirent->d_offset, efault_end);
        unsafe_put_user(namlen, &dirent->d_namlen, efault_end);
        unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
        user_write_access_end();
        return true;
efault_end:
        /* Fault inside the access window: close it before reporting. */
        user_write_access_end();
efault:
        buf->result = -EFAULT;
        return false;
}

/*
 * old_readdir: read a single directory entry into @dirent.  @count is
 * not used.  Returns buf.result (1 after an entry was stored, or a
 * negative error from the actor) when it is non-zero, otherwise the
 * return value of iterate_dir(); -EBADF for a bad descriptor.
 */
SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
                struct old_linux_dirent __user *, dirent, unsigned int, count)
{
        struct readdir_callback buf = {
                .ctx.actor = fillonedir,
                .dirent = dirent
        };
        struct fd f = fdget_pos(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = iterate_dir(f.file, &buf.ctx);
        fdput_pos(f);

        /* What the actor recorded takes precedence over iterate_dir(). */
        return buf.result ? buf.result : error;
}

#endif /* __ARCH_WANT_OLD_READDIR */

/*
 * New, all-improved, singing, dancing, iBCS2-compliant getdents()
 * interface. 
 */
/*
 * User-space record written by filldir().  The entry type is not a named
 * member: filldir() stores it in the last byte of the record.
 */
struct linux_dirent {
        unsigned long        d_ino;                /* inode number */
        unsigned long        d_off;                /* offset; filled in when the next record is written */
        unsigned short        d_reclen;        /* total size of this record */
        char                d_name[];        /* name followed by a 0 byte */
};

/* Per-call state for getdents; filldir() recovers it from the embedded ctx. */
struct getdents_callback {
        struct dir_context ctx;
        struct linux_dirent __user * current_dir;        /* where the next record goes */
        int prev_reclen;        /* size of the record written last, 0 if none yet */
        int count;                /* bytes still free in the user buffer */
        int error;                /* status left by the most recent filldir() call */
};

/*
 * dir_context actor for getdents: append one record to the user buffer.
 *
 * Returns true when the record was stored.  Returns false, with the
 * reason in buf->error, when the name is invalid, the record does not
 * fit, the inode number overflows, a signal is pending after at least
 * one record was stored, or the user copy faults.
 */
static bool filldir(struct dir_context *ctx, const char *name, int namlen,
                   loff_t offset, u64 ino, unsigned int d_type)
{
        struct linux_dirent __user *dirent, *prev;
        struct getdents_callback *buf =
                container_of(ctx, struct getdents_callback, ctx);
        unsigned long d_ino;
        /* "+ 2": the name's trailing 0 byte plus the d_type byte at the end. */
        int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
                sizeof(long));
        int prev_reclen;

        buf->error = verify_dirent_name(name, namlen);
        if (unlikely(buf->error))
                return false;
        buf->error = -EINVAL;        /* only used if we fail.. */
        if (reclen > buf->count)
                return false;
        d_ino = ino;
        /* The inode number must survive the narrowing to unsigned long. */
        if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
                buf->error = -EOVERFLOW;
                return false;
        }
        prev_reclen = buf->prev_reclen;
        /* Only stop for a signal once something has been stored. */
        if (prev_reclen && signal_pending(current))
                return false;
        dirent = buf->current_dir;
        prev = (void __user *) dirent - prev_reclen;
        /* One access window covering the previous and the new record. */
        if (!user_write_access_begin(prev, reclen + prev_reclen))
                goto efault;

        /* This might be 'dirent->d_off', but if so it will get overwritten */
        unsafe_put_user(offset, &prev->d_off, efault_end);
        unsafe_put_user(d_ino, &dirent->d_ino, efault_end);
        unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
        /* d_type lives in the very last byte of the record. */
        unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end);
        unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
        user_write_access_end();

        buf->current_dir = (void __user *)dirent + reclen;
        buf->prev_reclen = reclen;
        buf->count -= reclen;
        return true;
efault_end:
        /* Fault inside the access window: close it before reporting. */
        user_write_access_end();
efault:
        buf->error = -EFAULT;
        return false;
}

/*
 * getdents: fill @dirent with as many linux_dirent records as fit in
 * @count bytes.  Returns the number of bytes stored when at least one
 * record was written, -EFAULT if the final d_off store faults, -EBADF
 * for a bad descriptor, and otherwise the iterate_dir() error or the
 * status left behind by filldir().
 */
SYSCALL_DEFINE3(getdents, unsigned int, fd,
                struct linux_dirent __user *, dirent, unsigned int, count)
{
        struct getdents_callback buf = {
                .ctx.actor = filldir,
                .current_dir = dirent,
                .count = count
        };
        struct fd f = fdget_pos(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = iterate_dir(f.file, &buf.ctx);
        if (error >= 0)
                error = buf.error;

        if (buf.prev_reclen) {
                /* Patch the final position into the last record written. */
                struct linux_dirent __user *lastdirent =
                        (void __user *)buf.current_dir - buf.prev_reclen;

                if (put_user(buf.ctx.pos, &lastdirent->d_off))
                        error = -EFAULT;
                else
                        error = count - buf.count;
        }

        fdput_pos(f);
        return error;
}

/* Per-call state for getdents64; filldir64() recovers it from the embedded ctx. */
struct getdents_callback64 {
        struct dir_context ctx;
        struct linux_dirent64 __user * current_dir;        /* where the next record goes */
        int prev_reclen;        /* size of the record written last, 0 if none yet */
        int count;                /* bytes still free in the user buffer */
        int error;                /* status left by the most recent filldir64() call */
};

/*
 * dir_context actor for getdents64: append one linux_dirent64 record to
 * the user buffer.
 *
 * Returns true when the record was stored.  Returns false, with the
 * reason in buf->error, when the name is invalid, the record does not
 * fit, a signal is pending after at least one record was stored, or the
 * user copy faults.
 */
static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
                     loff_t offset, u64 ino, unsigned int d_type)
{
        struct linux_dirent64 __user *dirent, *prev;
        struct getdents_callback64 *buf =
                container_of(ctx, struct getdents_callback64, ctx);
        /* "+ 1": room for the name's trailing 0 byte. */
        int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
                sizeof(u64));
        int prev_reclen;

        buf->error = verify_dirent_name(name, namlen);
        if (unlikely(buf->error))
                return false;
        buf->error = -EINVAL;        /* only used if we fail.. */
        if (reclen > buf->count)
                return false;
        prev_reclen = buf->prev_reclen;
        /* Only stop for a signal once something has been stored. */
        if (prev_reclen && signal_pending(current))
                return false;
        dirent = buf->current_dir;
        prev = (void __user *)dirent - prev_reclen;
        /* One access window covering the previous and the new record. */
        if (!user_write_access_begin(prev, reclen + prev_reclen))
                goto efault;

        /* This might be 'dirent->d_off', but if so it will get overwritten */
        unsafe_put_user(offset, &prev->d_off, efault_end);
        unsafe_put_user(ino, &dirent->d_ino, efault_end);
        unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
        unsafe_put_user(d_type, &dirent->d_type, efault_end);
        unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
        user_write_access_end();

        buf->prev_reclen = reclen;
        buf->current_dir = (void __user *)dirent + reclen;
        buf->count -= reclen;
        return true;

efault_end:
        /* Fault inside the access window: close it before reporting. */
        user_write_access_end();
efault:
        buf->error = -EFAULT;
        return false;
}

/*
 * getdents64: fill @dirent with as many linux_dirent64 records as fit in
 * @count bytes.  Returns the number of bytes stored when at least one
 * record was written, -EFAULT if the final d_off store faults, -EBADF
 * for a bad descriptor, and otherwise the iterate_dir() error or the
 * status left behind by filldir64().
 */
SYSCALL_DEFINE3(getdents64, unsigned int, fd,
                struct linux_dirent64 __user *, dirent, unsigned int, count)
{
        struct getdents_callback64 buf = {
                .ctx.actor = filldir64,
                .current_dir = dirent,
                .count = count
        };
        struct fd f = fdget_pos(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = iterate_dir(f.file, &buf.ctx);
        if (error >= 0)
                error = buf.error;

        if (buf.prev_reclen) {
                /* Patch the final position into the last record written. */
                struct linux_dirent64 __user *lastdirent =
                        (void __user *)buf.current_dir - buf.prev_reclen;
                typeof(lastdirent->d_off) d_off = buf.ctx.pos;

                if (put_user(d_off, &lastdirent->d_off))
                        error = -EFAULT;
                else
                        error = count - buf.count;
        }

        fdput_pos(f);
        return error;
}

#ifdef CONFIG_COMPAT
/* Compat layout of struct old_linux_dirent, written by compat_fillonedir(). */
struct compat_old_linux_dirent {
        compat_ulong_t        d_ino;                /* inode number */
        compat_ulong_t        d_offset;        /* offset value passed to the actor */
        unsigned short        d_namlen;        /* name length, without the trailing 0 */
        char                d_name[];        /* name followed by a 0 byte */
};

/* Per-call state for the compat old_readdir; see compat_fillonedir(). */
struct compat_readdir_callback {
        struct dir_context ctx;
        struct compat_old_linux_dirent __user *dirent;        /* destination record */
        int result;        /* 0: nothing stored yet, 1: one entry stored, <0: error */
};

/*
 * dir_context actor for the compat old_readdir: store exactly one entry
 * in the user buffer using the compat record layout.
 *
 * Returns true after storing the entry.  Returns false when an entry (or
 * an error) has already been recorded in buf->result, or when this entry
 * fails; in the failure case buf->result holds the negative error.
 */
static bool compat_fillonedir(struct dir_context *ctx, const char *name,
                             int namlen, loff_t offset, u64 ino,
                             unsigned int d_type)
{
        struct compat_readdir_callback *buf =
                container_of(ctx, struct compat_readdir_callback, ctx);
        struct compat_old_linux_dirent __user *dirent;
        compat_ulong_t d_ino;

        /* Non-zero result: already delivered one entry or hit an error. */
        if (buf->result)
                return false;
        buf->result = verify_dirent_name(name, namlen);
        if (buf->result)
                return false;
        d_ino = ino;
        /* The inode number must survive the narrowing to compat_ulong_t. */
        if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
                buf->result = -EOVERFLOW;
                return false;
        }
        /* result was 0 here, so this marks "one entry stored". */
        buf->result++;
        dirent = buf->dirent;
        /* Open access for the fixed header plus the name and its 0 byte. */
        if (!user_write_access_begin(dirent,
                        (unsigned long)(dirent->d_name + namlen + 1) -
                                (unsigned long)dirent))
                goto efault;
        unsafe_put_user(d_ino, &dirent->d_ino, efault_end);
        unsafe_put_user(offset, &dirent->d_offset, efault_end);
        unsafe_put_user(namlen, &dirent->d_namlen, efault_end);
        unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
        user_write_access_end();
        return true;
efault_end:
        /* Fault inside the access window: close it before reporting. */
        user_write_access_end();
efault:
        buf->result = -EFAULT;
        return false;
}

/*
 * Compat old_readdir: read a single directory entry into @dirent.
 * @count is not used.  Returns buf.result (1 after an entry was stored,
 * or a negative error from the actor) when it is non-zero, otherwise the
 * return value of iterate_dir(); -EBADF for a bad descriptor.
 */
COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
                struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
{
        struct compat_readdir_callback buf = {
                .ctx.actor = compat_fillonedir,
                .dirent = dirent
        };
        struct fd f = fdget_pos(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = iterate_dir(f.file, &buf.ctx);
        fdput_pos(f);

        /* What the actor recorded takes precedence over iterate_dir(). */
        return buf.result ? buf.result : error;
}

/*
 * Compat layout of struct linux_dirent, written by compat_filldir().  The
 * entry type is stored in the last byte of the record, not in a named
 * member.
 */
struct compat_linux_dirent {
        compat_ulong_t        d_ino;                /* inode number */
        compat_ulong_t        d_off;                /* offset; filled in when the next record is written */
        unsigned short        d_reclen;        /* total size of this record */
        char                d_name[];        /* name followed by a 0 byte */
};

/* Per-call state for the compat getdents; see compat_filldir(). */
struct compat_getdents_callback {
        struct dir_context ctx;
        struct compat_linux_dirent __user *current_dir;        /* where the next record goes */
        int prev_reclen;        /* size of the record written last, 0 if none yet */
        int count;                /* bytes still free in the user buffer */
        int error;                /* status left by the most recent compat_filldir() call */
};

/*
 * dir_context actor for the compat getdents syscall: append one
 * struct compat_linux_dirent record for @name to the user buffer.
 *
 * Returns true when the record was written and the iteration may continue.
 * Returns false, with buf->error set, when the name fails
 * verify_dirent_name(), the record does not fit in the remaining space
 * (-EINVAL), @ino does not fit in compat_ulong_t (-EOVERFLOW) or the user
 * copy faults (-EFAULT).  It also returns false when a signal is pending and
 * at least one record was already written; buf->error is still -EINVAL then.
 */
static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen,
                loff_t offset, u64 ino, unsigned int d_type)
{
        struct compat_linux_dirent __user *dirent, *prev;
        struct compat_getdents_callback *buf =
                container_of(ctx, struct compat_getdents_callback, ctx);
        compat_ulong_t d_ino;
        /* Header + name + 2 extra bytes, rounded up to compat_long_t alignment. */
        int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
                namlen + 2, sizeof(compat_long_t));
        int prev_reclen;

        buf->error = verify_dirent_name(name, namlen);
        if (unlikely(buf->error))
                return false;
        buf->error = -EINVAL;        /* only used if we fail.. */
        if (reclen > buf->count)
                return false;
        d_ino = ino;
        /* Detect truncation when compat_ulong_t is narrower than u64. */
        if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
                buf->error = -EOVERFLOW;
                return false;
        }
        prev_reclen = buf->prev_reclen;
        /* Only bail out on a pending signal once something has been written. */
        if (prev_reclen && signal_pending(current))
                return false;
        dirent = buf->current_dir;
        prev = (void __user *) dirent - prev_reclen;
        /* One access window covering the previous record and the new one. */
        if (!user_write_access_begin(prev, reclen + prev_reclen))
                goto efault;

        /* The previous record's d_off is filled in with this entry's offset. */
        unsafe_put_user(offset, &prev->d_off, efault_end);
        unsafe_put_user(d_ino, &dirent->d_ino, efault_end);
        unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
        /* d_type lives in the last byte of the record. */
        unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end);
        unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
        user_write_access_end();

        /* Advance the cursor past the record just written. */
        buf->prev_reclen = reclen;
        buf->current_dir = (void __user *)dirent + reclen;
        buf->count -= reclen;
        return true;
efault_end:
        /* Fault inside the access window: close it before reporting. */
        user_write_access_end();
efault:
        buf->error = -EFAULT;
        return false;
}

/*
 * Compat getdents: fill the user buffer with compat_linux_dirent records via
 * compat_filldir().  When at least one record was written, the last record's
 * d_off is patched with the final directory position and the number of bytes
 * consumed is returned; otherwise the error from iterate_dir() or from the
 * actor is returned.
 */
COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
                struct compat_linux_dirent __user *, dirent, unsigned int, count)
{
        struct compat_getdents_callback buf = {
                .ctx.actor = compat_filldir,
                .current_dir = dirent,
                .count = count
        };
        struct compat_linux_dirent __user *lastdirent;
        struct fd f = fdget_pos(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = iterate_dir(f.file, &buf.ctx);
        if (error >= 0)
                error = buf.error;

        if (buf.prev_reclen) {
                /* Step back to the record that was written last. */
                lastdirent = (void __user *)buf.current_dir - buf.prev_reclen;

                if (!put_user(buf.ctx.pos, &lastdirent->d_off))
                        error = count - buf.count;
                else
                        error = -EFAULT;
        }

        fdput_pos(f);
        return error;
}
#endif

















































































































































































































































































































    2 


   19 













   19 





   19 








   19 

















   30 





























   30 





   26 




















   17 

   19 
















   26 

   29 






















    5 

    4 

    5 



















   16 

   17 
















   23 

   29 





















    5 

    5 

    5 



















    5 

    5 
















   15 

   18 











































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/projid.h>
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>

/* Slab cache that struct user_namespace objects are allocated from and freed to. */
static struct kmem_cache *user_ns_cachep __ro_after_init;
/* Taken by create_user_ns() while it copies the parent's flags into a new namespace. */
static DEFINE_MUTEX(userns_state_mutex);

/* Forward declarations; free_user_ns() is defined further down in this file. */
static bool new_idmap_permitted(const struct file *file,
                                struct user_namespace *ns, int cap_setid,
                                struct uid_gid_map *map);
static void free_user_ns(struct work_struct *work);

/*
 * Charge one UCOUNT_USER_NAMESPACES unit to @uid in @ns via inc_ucount().
 * create_user_ns() treats a NULL return as failure.
 */
static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
{
        return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
}

/*
 * Release one UCOUNT_USER_NAMESPACES unit previously taken with
 * inc_user_namespaces().
 *
 * @ucounts: the accounting object returned by inc_user_namespaces().
 */
static void dec_user_namespaces(struct ucounts *ucounts)
{
        /* A void function must not use "return <expr>;", so just call it. */
        dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
}

/*
 * Point @cred at @user_ns and reset its capability state.
 *
 * The credentials receive the same capability sets init starts with, but
 * those capabilities are bound to the new user namespace and so are useless
 * for doing anything outside of it.
 */
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
        /* Full sets ... */
        cred->cap_bset = CAP_FULL_SET;
        cred->cap_permitted = CAP_FULL_SET;
        cred->cap_effective = CAP_FULL_SET;
        /* ... and empty sets. */
        cred->cap_inheritable = CAP_EMPTY_SET;
        cred->cap_ambient = CAP_EMPTY_SET;

        cred->securebits = SECUREBITS_DEFAULT;
#ifdef CONFIG_KEYS
        /* Drop the request-key authorisation key reference, if any. */
        key_put(cred->request_key_auth);
        cred->request_key_auth = NULL;
#endif
        /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
        cred->user_ns = user_ns;
}

/*
 * Return the RLIMIT_NPROC value to enforce for the current task.
 *
 * The limit is treated as not enforced (RLIM_INFINITY) only when the caller
 * is GLOBAL_ROOT_UID and lives in init_user_ns; everyone else gets their
 * current rlimit(RLIMIT_NPROC).
 */
static unsigned long enforced_nproc_rlimit(void)
{
        if (uid_eq(current_uid(), GLOBAL_ROOT_UID) &&
            (current_user_ns() == &init_user_ns))
                return RLIM_INFINITY;

        return rlimit(RLIMIT_NPROC);
}

/*
 * Create a new user namespace, deriving the creator from the user in the
 * passed credentials, and replacing that user with the new root user for the
 * new namespace.
 *
 * This is called by copy_creds(), which will finish setting the target task's
 * credentials, and by unshare_userns() in this file.
 *
 * Returns 0 on success, in which case @new has been switched to the new
 * namespace by set_cred_user_ns().  On failure a negative errno is returned:
 * -ENOSPC when the nesting level or the namespace ucount limit is exceeded,
 * -EPERM when the caller is chrooted or the creator's euid/egid has no
 * mapping in the parent, -ENOMEM on allocation or sysctl setup failure, or
 * whatever security_create_user_ns() / ns_alloc_inum() reported.
 */
int create_user_ns(struct cred *new)
{
        struct user_namespace *ns, *parent_ns = new->user_ns;
        kuid_t owner = new->euid;
        kgid_t group = new->egid;
        struct ucounts *ucounts;
        int ret, i;

        /* Refuse to nest namespaces too deeply. */
        ret = -ENOSPC;
        if (parent_ns->level > 32)
                goto fail;

        /* Account the new namespace against the creator; still -ENOSPC on failure. */
        ucounts = inc_user_namespaces(parent_ns, owner);
        if (!ucounts)
                goto fail;

        /*
         * Verify that we can not violate the policy of which files
         * may be accessed that is specified by the root directory,
         * by verifying that the root directory is at the root of the
         * mount namespace which allows all files to be accessed.
         */
        ret = -EPERM;
        if (current_chrooted())
                goto fail_dec;

        /* The creator needs a mapping in the parent user namespace
         * or else we won't be able to reasonably tell userspace who
         * created a user_namespace.
         */
        ret = -EPERM;
        if (!kuid_has_mapping(parent_ns, owner) ||
            !kgid_has_mapping(parent_ns, group))
                goto fail_dec;

        ret = security_create_user_ns(new);
        if (ret < 0)
                goto fail_dec;

        ret = -ENOMEM;
        ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
        if (!ns)
                goto fail_dec;

        /* Remember whether the creator held CAP_SETFCAP at creation time. */
        ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
        ret = ns_alloc_inum(&ns->ns);
        if (ret)
                goto fail_free;
        ns->ns.ops = &userns_operations;

        refcount_set(&ns->ns.count, 1);
        /* Leave the new->user_ns reference with the new user namespace. */
        ns->parent = parent_ns;
        ns->level = parent_ns->level + 1;
        ns->owner = owner;
        ns->group = group;
        /* __put_user_ns() schedules this work item to free the namespace. */
        INIT_WORK(&ns->work, free_user_ns);
        /* Start every ucount limit at INT_MAX, then override the rlimit-backed ones. */
        for (i = 0; i < UCOUNT_COUNTS; i++) {
                ns->ucount_max[i] = INT_MAX;
        }
        set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
        set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
        set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
        set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
        ns->ucounts = ucounts;

        /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
        mutex_lock(&userns_state_mutex);
        ns->flags = parent_ns->flags;
        mutex_unlock(&userns_state_mutex);

#ifdef CONFIG_KEYS
        INIT_LIST_HEAD(&ns->keyring_name_list);
        init_rwsem(&ns->keyring_sem);
#endif
        ret = -ENOMEM;
        if (!setup_userns_sysctls(ns))
                goto fail_keyring;

        /* Success: switch the credentials over to the new namespace. */
        set_cred_user_ns(new, ns);
        return 0;
        /* Error unwinding: each label undoes the steps taken before the failing one. */
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
        key_put(ns->persistent_keyring_register);
#endif
        ns_free_inum(&ns->ns);
fail_free:
        kmem_cache_free(user_ns_cachep, ns);
fail_dec:
        dec_user_namespaces(ucounts);
fail:
        return ret;
}

/*
 * Prepare credentials living in a fresh user namespace when
 * CLONE_NEWUSER is requested.
 *
 * Returns 0 without touching @new_cred when CLONE_NEWUSER is not set in
 * @unshare_flags.  Otherwise returns 0 with *@new_cred pointing at the new
 * credentials, -ENOMEM when prepare_creds() fails, or the error from
 * create_user_ns() (the prepared credentials are released in that case).
 */
int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
{
        struct cred *cred;
        int err;

        if (!(unshare_flags & CLONE_NEWUSER))
                return 0;

        cred = prepare_creds();
        if (!cred)
                return -ENOMEM;

        err = create_user_ns(cred);
        if (err) {
                put_cred(cred);
                return err;
        }

        *new_cred = cred;
        return 0;
}

/*
 * Work handler (installed by create_user_ns() via INIT_WORK) that tears down
 * a user namespace.  After freeing one namespace it drops a reference on the
 * parent and, if that was the last one, frees the parent as well, walking up
 * the chain iteratively.
 */
static void free_user_ns(struct work_struct *work)
{
        struct user_namespace *parent, *ns =
                container_of(work, struct user_namespace, work);

        do {
                /* Grab what is needed after @ns itself has been freed. */
                struct ucounts *ucounts = ns->ucounts;
                parent = ns->parent;
                /*
                 * forward/reverse are only freed for maps with more than
                 * UID_GID_MAP_MAX_BASE_EXTENTS extents; smaller maps are read
                 * from the embedded extent[] array elsewhere in this file.
                 */
                if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->gid_map.forward);
                        kfree(ns->gid_map.reverse);
                }
                if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->uid_map.forward);
                        kfree(ns->uid_map.reverse);
                }
                if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->projid_map.forward);
                        kfree(ns->projid_map.reverse);
                }
#if IS_ENABLED(CONFIG_BINFMT_MISC)
                kfree(ns->binfmt_misc);
#endif
                retire_userns_sysctls(ns);
                key_free_user_ns(ns);
                ns_free_inum(&ns->ns);
                kmem_cache_free(user_ns_cachep, ns);
                /* Undo the accounting done by inc_user_namespaces() at creation. */
                dec_user_namespaces(ucounts);
                ns = parent;
                /* Continue with the parent if this dropped its last reference. */
        } while (refcount_dec_and_test(&parent->ns.count));
}

/*
 * Defer destruction of @ns by scheduling ns->work, which create_user_ns()
 * initialised to run free_user_ns().
 */
void __put_user_ns(struct user_namespace *ns)
{
        schedule_work(&ns->work);
}
EXPORT_SYMBOL(__put_user_ns);

/*
 * struct idmap_key - holds the information necessary to find an idmapping in a
 * sorted idmap array. It is passed to cmp_map_id() as first argument.
 */
struct idmap_key {
        bool map_up; /* true  -> id from kid; false -> kid from id */
        u32 id; /* id to find */
        u32 count; /* number of ids in the range starting at @id; the single-id lookups in this file pass 1 */
};

/*
 * cmp_map_id - bsearch() comparator used to locate an idmapping.
 * @k: the struct idmap_key describing the id range being looked up.
 * @e: the struct uid_gid_extent currently being compared against.
 *
 * Returns 0 when the whole key range lies inside the extent, -1 when the
 * range starts or ends below the extent, and 1 otherwise.
 */
static int cmp_map_id(const void *k, const void *e)
{
        const struct idmap_key *key = k;
        const struct uid_gid_extent *el = e;
        /* handle map_id_{down,up}(): pick the side of the extent to compare */
        const u32 first = key->map_up ? el->lower_first : el->first;
        const u32 last = first + el->count - 1;
        const u32 key_first = key->id;
        const u32 key_last = key->id + key->count - 1;

        if (key_first < first || key_last < first)
                return -1;

        if (key_first > last || key_last > last)
                return 1;

        return 0;
}

/*
 * map_id_range_down_max - Look up an id range with bsearch() in the ordered
 * map->forward array.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static struct uid_gid_extent *
map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
        struct idmap_key key = {
                .map_up = false,
                .id = id,
                .count = count,
        };

        return bsearch(&key, map->forward, extents,
                       sizeof(struct uid_gid_extent), cmp_map_id);
}

/*
 * map_id_range_down_base - Look up an id range by scanning the static
 * map->extent array front to back.
 * Can only be called if number of mappings is equal or less than
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the first extent that contains both @id and @id + @count - 1,
 * or NULL when there is none.
 */
static struct uid_gid_extent *
map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
        const u32 id_last = id + count - 1;
        unsigned i;

        for (i = 0; i < extents; i++) {
                struct uid_gid_extent *cur = &map->extent[i];
                u32 lo = cur->first;
                u32 hi = lo + cur->count - 1;

                /* Both ends of the requested range must fall in [lo, hi]. */
                if (id < lo || id > hi)
                        continue;
                if (id_last < lo || id_last > hi)
                        continue;

                return cur;
        }

        return NULL;
}

/*
 * Translate @id through the forward side of @map, requiring that the whole
 * range [@id, @id + @count - 1] is covered by a single extent.
 *
 * Returns the mapped id, or (u32) -1 when no extent covers the range.
 */
static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
        struct uid_gid_extent *extent;
        unsigned extents = map->nr_extents;

        /* Read barrier between loading nr_extents and reading the extents. */
        smp_rmb();

        extent = (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) ?
                map_id_range_down_base(extents, map, id, count) :
                map_id_range_down_max(extents, map, id, count);

        /* Note failure, otherwise map the id */
        if (!extent)
                return (u32) -1;

        return (id - extent->first) + extent->lower_first;
}

/*
 * Single-id convenience wrapper around map_id_range_down(): looks up @id
 * with a count of 1.  Returns the mapped id or (u32) -1 when unmapped.
 */
u32 map_id_down(struct uid_gid_map *map, u32 id)
{
        return map_id_range_down(map, id, 1);
}

/*
 * map_id_up_base - Look up a lower id by scanning the static map->extent
 * array front to back.
 * Can only be called if number of mappings is equal or less than
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the first extent whose lower range contains @id, or NULL.
 */
static struct uid_gid_extent *
map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
{
        unsigned i;

        for (i = 0; i < extents; i++) {
                struct uid_gid_extent *cur = &map->extent[i];
                u32 lo = cur->lower_first;
                u32 hi = lo + cur->count - 1;

                if (id < lo || id > hi)
                        continue;

                return cur;
        }

        return NULL;
}

/*
 * map_id_up_max - Look up a lower id with bsearch() in the ordered
 * map->reverse array.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static struct uid_gid_extent *
map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
{
        struct idmap_key key = {
                .map_up = true,
                .id = id,
                .count = 1,
        };

        return bsearch(&key, map->reverse, extents,
                       sizeof(struct uid_gid_extent), cmp_map_id);
}

/*
 * Translate the lower id @id through the reverse side of @map.
 *
 * Returns the mapped id, or (u32) -1 when no extent's lower range
 * contains @id.
 */
u32 map_id_up(struct uid_gid_map *map, u32 id)
{
        struct uid_gid_extent *extent;
        unsigned extents = map->nr_extents;

        /* Read barrier between loading nr_extents and reading the extents. */
        smp_rmb();

        extent = (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) ?
                map_id_up_base(extents, map, id) :
                map_id_up_max(extents, map, id);

        /* Note failure, otherwise map the id */
        if (!extent)
                return (u32) -1;

        return (id - extent->lower_first) + extent->first;
}

/**
 *        make_kuid - Map a user-namespace uid pair into a kuid.
 *        @ns:  User namespace that the uid is in
 *        @uid: User identifier
 *
 *        Maps a user-namespace uid pair into a kernel internal kuid,
 *        and returns that kuid.
 *
 *        When there is no mapping defined for the user-namespace uid
 *        pair INVALID_UID is returned.  Callers are expected to test
 *        for and handle INVALID_UID being returned.  INVALID_UID
 *        may be tested for using uid_valid().
 *
 *        The lookup itself is done by map_id_down() on @ns->uid_map.
 */
kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
{
        /* Map the uid to a global kernel uid */
        return KUIDT_INIT(map_id_down(&ns->uid_map, uid));
}
EXPORT_SYMBOL(make_kuid);

/**
 *        from_kuid - Create a uid from a kuid user-namespace pair.
 *        @targ: The user namespace we want a uid in.
 *        @kuid: The kernel internal uid to start with.
 *
 *        Map @kuid into the user-namespace specified by @targ and
 *        return the resulting uid.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        If @kuid has no mapping in @targ (uid_t)-1 is returned.
 *
 *        The lookup itself is done by map_id_up() on @targ->uid_map.
 */
uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
{
        /* Map the uid from a global kernel uid */
        return map_id_up(&targ->uid_map, __kuid_val(kuid));
}
EXPORT_SYMBOL(from_kuid);

/**
 *        from_kuid_munged - Create a uid from a kuid user-namespace pair.
 *        @targ: The user namespace we want a uid in.
 *        @kuid: The kernel internal uid to start with.
 *
 *        Map @kuid into the user-namespace specified by @targ and
 *        return the resulting uid.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        Unlike from_kuid, from_kuid_munged never fails and always
 *        returns a valid uid.  This makes from_kuid_munged appropriate
 *        for use in syscalls like stat and getuid where failing the
 *        system call and failing to provide a valid uid are not
 *        options.
 *
 *        If @kuid has no mapping in @targ overflowuid is returned.
 */
uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
{
        uid_t uid = from_kuid(targ, kuid);

        /* Substitute the overflow uid for "no mapping". */
        return (uid == (uid_t) -1) ? overflowuid : uid;
}
EXPORT_SYMBOL(from_kuid_munged);

/**
 *        make_kgid - Map a user-namespace gid pair into a kgid.
 *        @ns:  User namespace that the gid is in
 *        @gid: group identifier
 *
 *        Maps a user-namespace gid pair into a kernel internal kgid,
 *        and returns that kgid.
 *
 *        When there is no mapping defined for the user-namespace gid
 *        pair INVALID_GID is returned.  Callers are expected to test
 *        for and handle INVALID_GID being returned.  INVALID_GID may be
 *        tested for using gid_valid().
 *
 *        The lookup itself is done by map_id_down() on @ns->gid_map.
 */
kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
{
        /* Map the gid to a global kernel gid */
        return KGIDT_INIT(map_id_down(&ns->gid_map, gid));
}
EXPORT_SYMBOL(make_kgid);

/**
 *        from_kgid - Create a gid from a kgid user-namespace pair.
 *        @targ: The user namespace we want a gid in.
 *        @kgid: The kernel internal gid to start with.
 *
 *        Map @kgid into the user-namespace specified by @targ and
 *        return the resulting gid.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        If @kgid has no mapping in @targ (gid_t)-1 is returned.
 *
 *        The lookup itself is done by map_id_up() on @targ->gid_map.
 */
gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
{
        /* Map the gid from a global kernel gid */
        return map_id_up(&targ->gid_map, __kgid_val(kgid));
}
EXPORT_SYMBOL(from_kgid);

/**
 *        from_kgid_munged - Create a gid from a kgid user-namespace pair.
 *        @targ: The user namespace we want a gid in.
 *        @kgid: The kernel internal gid to start with.
 *
 *        Map @kgid into the user-namespace specified by @targ and
 *        return the resulting gid.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        Unlike from_kgid, from_kgid_munged never fails and always
 *        returns a valid gid.  This makes from_kgid_munged appropriate
 *        for use in syscalls like stat and getgid where failing the
 *        system call and failing to provide a valid gid are not options.
 *
 *        If @kgid has no mapping in @targ overflowgid is returned.
 */
gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
{
        gid_t gid = from_kgid(targ, kgid);

        /* Substitute the overflow gid for "no mapping". */
        return (gid == (gid_t) -1) ? overflowgid : gid;
}
EXPORT_SYMBOL(from_kgid_munged);

/**
 *        make_kprojid - Map a user-namespace projid pair into a kprojid.
 *        @ns:  User namespace that the projid is in
 *        @projid: Project identifier
 *
 *        Maps a user-namespace projid pair into a kernel internal kprojid,
 *        and returns that kprojid.
 *
 *        When there is no mapping defined for the user-namespace projid
 *        pair INVALID_PROJID is returned.  Callers are expected to test
 *        for and handle INVALID_PROJID being returned.  INVALID_PROJID
 *        may be tested for using projid_valid().
 *
 *        The lookup itself is done by map_id_down() on @ns->projid_map.
 */
kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
{
        /* Map the projid to a global kernel projid */
        return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid));
}
EXPORT_SYMBOL(make_kprojid);

/**
 *        from_kprojid - Create a projid from a kprojid user-namespace pair.
 *        @targ: The user namespace we want a projid in.
 *        @kprojid: The kernel internal project identifier to start with.
 *
 *        Map @kprojid into the user-namespace specified by @targ and
 *        return the resulting projid.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        If @kprojid has no mapping in @targ (projid_t)-1 is returned.
 *
 *        The lookup itself is done by map_id_up() on @targ->projid_map.
 */
projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
{
        /* Map the projid from a global kernel projid */
        return map_id_up(&targ->projid_map, __kprojid_val(kprojid));
}
EXPORT_SYMBOL(from_kprojid);

/**
 *        from_kprojid_munged - Create a projid from a kprojid user-namespace pair.
 *        @targ: The user namespace we want a projid in.
 *        @kprojid: The kernel internal projid to start with.
 *
 *        Map @kprojid into the user-namespace specified by @targ and
 *        return the resulting projid.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        Unlike from_kprojid, from_kprojid_munged never fails and always
 *        returns a valid projid.  This makes from_kprojid_munged
 *        appropriate for use in syscalls like stat, where failing the
 *        system call and failing to provide a valid projid are not
 *        options.
 *
 *        If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
 */
projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
{
        projid_t projid = from_kprojid(targ, kprojid);

        /* Substitute the overflow projid for "no mapping". */
        return (projid == (projid_t) -1) ? OVERFLOW_PROJID : projid;
}
EXPORT_SYMBOL(from_kprojid_munged);


/*
 * seq_file ->show() for the uid map: print one extent as
 * "first lower count".  The lower id is translated into the reader's user
 * namespace, or into that namespace's parent when the reader lives in the
 * namespace being displayed and it has a parent.
 */
static int uid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        struct user_namespace *lower_ns = seq_user_ns(seq);
        struct uid_gid_extent *extent = v;
        uid_t lower;

        if ((lower_ns == ns) && lower_ns->parent)
                lower_ns = lower_ns->parent;

        lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));

        seq_printf(seq, "%10u %10u %10u\n",
                   extent->first, lower, extent->count);

        return 0;
}

/*
 * seq_file ->show() for the gid map: print one extent as
 * "first lower count".  The lower id is translated into the reader's user
 * namespace, or into that namespace's parent when the reader lives in the
 * namespace being displayed and it has a parent.
 */
static int gid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        struct user_namespace *lower_ns = seq_user_ns(seq);
        struct uid_gid_extent *extent = v;
        gid_t lower;

        if ((lower_ns == ns) && lower_ns->parent)
                lower_ns = lower_ns->parent;

        lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));

        seq_printf(seq, "%10u %10u %10u\n",
                   extent->first, lower, extent->count);

        return 0;
}

/*
 * seq_file ->show() for the projid map: print one extent as
 * "first lower count".  The lower id is translated into the reader's user
 * namespace, or into that namespace's parent when the reader lives in the
 * namespace being displayed and it has a parent.
 */
static int projid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        struct user_namespace *lower_ns = seq_user_ns(seq);
        struct uid_gid_extent *extent = v;
        projid_t lower;

        if ((lower_ns == ns) && lower_ns->parent)
                lower_ns = lower_ns->parent;

        lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));

        seq_printf(seq, "%10u %10u %10u\n",
                   extent->first, lower, extent->count);

        return 0;
}

/*
 * Common ->start() helper: return the extent at index *@ppos of @map, or
 * NULL once the position is past the last extent.  Maps with at most
 * UID_GID_MAP_MAX_BASE_EXTENTS entries are read from the embedded extent[]
 * array, larger ones from map->forward.
 */
static void *m_start(struct seq_file *seq, loff_t *ppos,
                     struct uid_gid_map *map)
{
        loff_t pos = *ppos;
        unsigned extents = map->nr_extents;
        struct uid_gid_extent *table;

        /* Read barrier between loading nr_extents and reading the extents. */
        smp_rmb();

        if (pos >= extents)
                return NULL;

        table = (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) ?
                map->extent : map->forward;

        return &table[pos];
}

/* seq_file ->start() for the uid map of the namespace stored in seq->private. */
static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *ns = seq->private;

        return m_start(seq, ppos, &ns->uid_map);
}

/* seq_file ->start for gid_map: iterate the gid map of seq->private. */
static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *user_ns = seq->private;
        struct uid_gid_map *gid_map = &user_ns->gid_map;

        return m_start(seq, ppos, gid_map);
}

/* seq_file ->start for projid_map: iterate the projid map of seq->private. */
static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *user_ns = seq->private;
        struct uid_gid_map *projid_map = &user_ns->projid_map;

        return m_start(seq, ppos, projid_map);
}

/*
 * m_next - shared seq_file ->next: bump the position and let the file's
 * own ->start callback look up the extent at the new position.
 */
static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
{
        const struct seq_operations *ops = seq->op;

        *pos += 1;

        return ops->start(seq, pos);
}

/* Shared seq_file ->stop: does nothing. */
static void m_stop(struct seq_file *seq, void *v)
{
}

/* seq_file callbacks used to display a user namespace's uid map. */
const struct seq_operations proc_uid_seq_operations = {
        .start  = uid_m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = uid_m_show,
};

/* seq_file callbacks used to display a user namespace's gid map. */
const struct seq_operations proc_gid_seq_operations = {
        .start  = gid_m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = gid_m_show,
};

/* seq_file callbacks used to display a user namespace's projid map. */
const struct seq_operations proc_projid_seq_operations = {
        .start  = projid_m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = projid_m_show,
};

static bool mappings_overlap(struct uid_gid_map *new_map,
                             struct uid_gid_extent *extent)
{
        u32 upper_first, lower_first, upper_last, lower_last;
        unsigned idx;

        upper_first = extent->first;
        lower_first = extent->lower_first;
        upper_last = upper_first + extent->count - 1;
        lower_last = lower_first + extent->count - 1;

        for (idx = 0; idx < new_map->nr_extents; idx++) {
                u32 prev_upper_first, prev_lower_first;
                u32 prev_upper_last, prev_lower_last;
                struct uid_gid_extent *prev;

                if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        prev = &new_map->extent[idx];
                else
                        prev = &new_map->forward[idx];

                prev_upper_first = prev->first;
                prev_lower_first = prev->lower_first;
                prev_upper_last = prev_upper_first + prev->count - 1;
                prev_lower_last = prev_lower_first + prev->count - 1;

                /* Does the upper range intersect a previous extent? */
                if ((prev_upper_first <= upper_last) &&
                    (prev_upper_last >= upper_first))
                        return true;

                /* Does the lower range intersect a previous extent? */
                if ((prev_lower_first <= lower_last) &&
                    (prev_lower_last >= lower_first))
                        return true;
        }
        return false;
}

/*
 * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
 * Takes care to allocate a 4K block of memory if the number of mappings exceeds
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
{
        struct uid_gid_extent *dest;

        if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
                struct uid_gid_extent *forward;

                /* Allocate memory for 340 mappings. */
                forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS,
                                        sizeof(struct uid_gid_extent),
                                        GFP_KERNEL);
                if (!forward)
                        return -ENOMEM;

                /* Copy over memory. Only set up memory for the forward pointer.
                 * Defer the memory setup for the reverse pointer.
                 */
                memcpy(forward, map->extent,
                       map->nr_extents * sizeof(map->extent[0]));

                map->forward = forward;
                map->reverse = NULL;
        }

        if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS)
                dest = &map->extent[map->nr_extents];
        else
                dest = &map->forward[map->nr_extents];

        *dest = *extent;
        map->nr_extents++;
        return 0;
}

/*
 * sort() comparator for the forward array: orders extents by ascending
 * ->first.  Returns -1, 0 or 1.
 */
static int cmp_extents_forward(const void *a, const void *b)
{
        const struct uid_gid_extent *lhs = a;
        const struct uid_gid_extent *rhs = b;

        return (lhs->first > rhs->first) - (lhs->first < rhs->first);
}

/*
 * sort() comparator for the reverse array: orders extents by ascending
 * ->lower_first.  Returns -1, 0 or 1.
 */
static int cmp_extents_reverse(const void *a, const void *b)
{
        const struct uid_gid_extent *lhs = a;
        const struct uid_gid_extent *rhs = b;

        return (lhs->lower_first > rhs->lower_first) -
               (lhs->lower_first < rhs->lower_first);
}

/*
 * sort_idmaps - Sorts an array of idmap entries.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static int sort_idmaps(struct uid_gid_map *map)
{
        if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                return 0;

        /* Sort forward array. */
        sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent),
             cmp_extents_forward, NULL);

        /* Only copy the memory from forward we actually need. */
        map->reverse = kmemdup(map->forward,
                               map->nr_extents * sizeof(struct uid_gid_extent),
                               GFP_KERNEL);
        if (!map->reverse)
                return -ENOMEM;

        /* Sort reverse array. */
        sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent),
             cmp_extents_reverse, NULL);

        return 0;
}

/**
 * verify_root_map() - check the uid 0 mapping
 * @file: idmapping file
 * @map_ns: user namespace of the target process
 * @new_map: requested idmap
 *
 * If a process requests mapping parent uid 0 into the new ns, verify that the
 * process writing the map had the CAP_SETFCAP capability as the target process
 * will be able to write fscaps that are valid in ancestor user namespaces.
 *
 * Return: true if the mapping is allowed, false if not.
 */
static bool verify_root_map(const struct file *file,
                            struct user_namespace *map_ns,
                            struct uid_gid_map *new_map)
{
        const struct user_namespace *file_ns = file->f_cred->user_ns;
        struct uid_gid_extent *root_extent = NULL;
        int idx;

        /* Look for the first extent whose lower range starts at id 0. */
        for (idx = 0; idx < new_map->nr_extents; idx++) {
                struct uid_gid_extent *cur;

                if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        cur = &new_map->extent[idx];
                else
                        cur = &new_map->forward[idx];

                if (cur->lower_first == 0) {
                        root_extent = cur;
                        break;
                }
        }

        /* No such extent: nothing further to verify. */
        if (!root_extent)
                return true;

        /*
         * The process unshared its ns and is writing to its own
         * /proc/self/uid_map.  User already has full capabilities in
         * the new namespace.  Verify that the parent had CAP_SETFCAP
         * when it unshared.
         */
        if (map_ns == file_ns)
                return file_ns->parent_could_setfcap;

        /*
         * Process p1 is writing to uid_map of p2, who is in a child
         * user namespace to p1's.  Verify that the opener of the map
         * file has CAP_SETFCAP against the parent of the new map
         * namespace.
         */
        return file_ns_capable(file, map_ns->parent, CAP_SETFCAP);
}

/*
 * map_write - parse and install an id map written to a map file.
 * @file:       the map file being written; its seq_file's ->private is the
 *              user namespace whose map is being set
 * @buf:        user buffer holding lines of "first lower_first count"
 * @count:      number of bytes in @buf; must be smaller than PAGE_SIZE
 * @ppos:       file position; must be 0 on entry, set to @count on success
 * @cap_setid:  capability tied to this kind of map (the callers in this
 *              file pass CAP_SETUID, CAP_SETGID or -1)
 * @map:        the map to fill in; must still be empty
 * @parent_map: the matching map of the parent namespace, used to translate
 *              each extent's lower ids with map_id_range_down()
 *
 * The new map is assembled in a local struct uid_gid_map under
 * userns_state_mutex and only published to @map once every check passed.
 *
 * Return: @count on success.  Otherwise a negative errno: the error from
 * memdup_user_nul(); -EINVAL for a bad position or size, a malformed line,
 * an invalid, wrapping or overlapping extent, too many extents, or an
 * empty map; -EPERM if @map was already written, a capability check fails,
 * or an extent cannot be mapped through @parent_map; and whatever
 * insert_extent() or sort_idmaps() return on failure.
 */
static ssize_t map_write(struct file *file, const char __user *buf,
                         size_t count, loff_t *ppos,
                         int cap_setid,
                         struct uid_gid_map *map,
                         struct uid_gid_map *parent_map)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *map_ns = seq->private;
        struct uid_gid_map new_map;
        unsigned idx;
        struct uid_gid_extent extent;
        char *kbuf, *pos, *next_line;
        ssize_t ret;

        /* Only allow < page size writes at the beginning of the file */
        if ((*ppos != 0) || (count >= PAGE_SIZE))
                return -EINVAL;

        /* Slurp in the user data */
        kbuf = memdup_user_nul(buf, count);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        /*
         * The userns_state_mutex serializes all writes to any given map.
         *
         * Any map is only ever written once.
         *
         * An id map fits within 1 cache line on most architectures.
         *
         * On read nothing needs to be done unless you are on an
         * architecture with a crazy cache coherency model like alpha.
         *
         * There is a one time data dependency between reading the
         * count of the extents and the values of the extents.  The
         * desired behavior is to see the values of the extents that
         * were written before the count of the extents.
         *
         * To achieve this smp_wmb() is used to guarantee the write
         * order and smp_rmb() guarantees that we don't have crazy
         * architectures returning stale data.
         */
        mutex_lock(&userns_state_mutex);

        memset(&new_map, 0, sizeof(struct uid_gid_map));

        ret = -EPERM;
        /* Only allow one successful write to the map */
        if (map->nr_extents != 0)
                goto out;

        /*
         * Adjusting namespace settings requires capabilities on the target.
         */
        if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
                goto out;

        /* Parse the user data */
        ret = -EINVAL;
        pos = kbuf;
        for (; pos; pos = next_line) {

                /* Find the end of line and ensure I don't look past it */
                next_line = strchr(pos, '\n');
                if (next_line) {
                        *next_line = '\0';
                        next_line++;
                        /* A trailing newline does not start another line */
                        if (*next_line == '\0')
                                next_line = NULL;
                }

                /* Three whitespace separated decimal fields per line */
                pos = skip_spaces(pos);
                extent.first = simple_strtoul(pos, &pos, 10);
                if (!isspace(*pos))
                        goto out;

                pos = skip_spaces(pos);
                extent.lower_first = simple_strtoul(pos, &pos, 10);
                if (!isspace(*pos))
                        goto out;

                pos = skip_spaces(pos);
                extent.count = simple_strtoul(pos, &pos, 10);
                if (*pos && !isspace(*pos))
                        goto out;

                /* Verify there is not trailing junk on the line */
                pos = skip_spaces(pos);
                if (*pos != '\0')
                        goto out;

                /* Verify we have been given valid starting values */
                if ((extent.first == (u32) -1) ||
                    (extent.lower_first == (u32) -1))
                        goto out;

                /* Verify count is not zero and does not cause the
                 * extent to wrap
                 */
                if ((extent.first + extent.count) <= extent.first)
                        goto out;
                if ((extent.lower_first + extent.count) <=
                     extent.lower_first)
                        goto out;

                /* Do the ranges in extent overlap any previous extents? */
                if (mappings_overlap(&new_map, &extent))
                        goto out;

                /* Reject the input if this extent would reach the limit
                 * while more lines are still pending.
                 */
                if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
                    (next_line != NULL))
                        goto out;

                ret = insert_extent(&new_map, &extent);
                if (ret < 0)
                        goto out;
                /* Restore the default error for the next iteration */
                ret = -EINVAL;
        }
        /* Be very certain the new map actually exists */
        if (new_map.nr_extents == 0)
                goto out;

        ret = -EPERM;
        /* Validate the user is allowed to use user id's mapped to. */
        if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
                goto out;

        ret = -EPERM;
        /* Map the lower ids from the parent user namespace to the
         * kernel global id space.
         */
        for (idx = 0; idx < new_map.nr_extents; idx++) {
                struct uid_gid_extent *e;
                u32 lower_first;

                if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        e = &new_map.extent[idx];
                else
                        e = &new_map.forward[idx];

                lower_first = map_id_range_down(parent_map,
                                                e->lower_first,
                                                e->count);

                /* Fail if we can not map the specified extent to
                 * the kernel global id space.
                 */
                if (lower_first == (u32) -1)
                        goto out;

                e->lower_first = lower_first;
        }

        /*
         * If we want to use binary search for lookup, this clones the extent
         * array and sorts both copies.
         */
        ret = sort_idmaps(&new_map);
        if (ret < 0)
                goto out;

        /* Install the map */
        if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
                memcpy(map->extent, new_map.extent,
                       new_map.nr_extents * sizeof(new_map.extent[0]));
        } else {
                map->forward = new_map.forward;
                map->reverse = new_map.reverse;
        }
        /* Publish the extents before the count that makes them visible */
        smp_wmb();
        map->nr_extents = new_map.nr_extents;

        *ppos = count;
        ret = count;
out:
        /* On failure with a heap-backed new map: free its arrays and
         * reset the target map's pointers and count.
         */
        if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                kfree(new_map.forward);
                kfree(new_map.reverse);
                map->forward = NULL;
                map->reverse = NULL;
                map->nr_extents = 0;
        }

        mutex_unlock(&userns_state_mutex);
        kfree(kbuf);
        return ret;
}

/*
 * proc_uid_map_write - write handler for a user namespace's uid_map.
 *
 * Fails with -EPERM when the target namespace has no parent, or when the
 * namespace returned by seq_user_ns() is neither the target nor its
 * parent.  Otherwise the write is handed to map_write() with CAP_SETUID.
 */
ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
                           size_t size, loff_t *ppos)
{
        struct seq_file *m = file->private_data;
        struct user_namespace *target = m->private;
        struct user_namespace *opener_ns = seq_user_ns(m);
        struct user_namespace *parent = target->parent;

        if (!parent)
                return -EPERM;

        if (opener_ns != target && opener_ns != parent)
                return -EPERM;

        return map_write(file, buf, size, ppos, CAP_SETUID,
                         &target->uid_map, &parent->uid_map);
}

/*
 * proc_gid_map_write - write handler for a user namespace's gid_map.
 *
 * Fails with -EPERM when the target namespace has no parent, or when the
 * namespace returned by seq_user_ns() is neither the target nor its
 * parent.  Otherwise the write is handed to map_write() with CAP_SETGID.
 */
ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
                           size_t size, loff_t *ppos)
{
        struct seq_file *m = file->private_data;
        struct user_namespace *target = m->private;
        struct user_namespace *opener_ns = seq_user_ns(m);
        struct user_namespace *parent = target->parent;

        if (!parent)
                return -EPERM;

        if (opener_ns != target && opener_ns != parent)
                return -EPERM;

        return map_write(file, buf, size, ppos, CAP_SETGID,
                         &target->gid_map, &parent->gid_map);
}

/*
 * proc_projid_map_write - write handler for a user namespace's projid_map.
 *
 * Fails with -EPERM when the target namespace has no parent, or when the
 * namespace returned by seq_user_ns() is neither the target nor its
 * parent.  Otherwise the write is handed to map_write() with -1 as the
 * capability: anyone can set any valid project id, no capability needed.
 */
ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
                              size_t size, loff_t *ppos)
{
        struct seq_file *m = file->private_data;
        struct user_namespace *target = m->private;
        struct user_namespace *opener_ns = seq_user_ns(m);
        struct user_namespace *parent = target->parent;

        if (!parent)
                return -EPERM;

        if (opener_ns != target && opener_ns != parent)
                return -EPERM;

        return map_write(file, buf, size, ppos, -1,
                         &target->projid_map, &parent->projid_map);
}

/*
 * new_idmap_permitted - may the opener of @file install @new_map in @ns?
 * @file:       the map file being written
 * @ns:         user namespace the map is for
 * @cap_setid:  CAP_SETUID, CAP_SETGID, or a value for which cap_valid()
 *              is false when no capability is needed
 * @new_map:    the parsed map
 *
 * Returns true if the map may be installed, false otherwise.
 */
static bool new_idmap_permitted(const struct file *file,
                                struct user_namespace *ns, int cap_setid,
                                struct uid_gid_map *new_map)
{
        const struct cred *cred = file->f_cred;
        bool single_id;

        if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
                return false;

        /*
         * Don't allow mappings that would allow anything that wouldn't
         * be allowed without the establishment of unprivileged mappings:
         * the namespace owner may map exactly one id, its own.
         */
        single_id = new_map->nr_extents == 1 && new_map->extent[0].count == 1;
        if (single_id && uid_eq(ns->owner, cred->euid)) {
                u32 id = new_map->extent[0].lower_first;

                switch (cap_setid) {
                case CAP_SETUID: {
                        kuid_t uid = make_kuid(ns->parent, id);

                        if (uid_eq(uid, cred->euid))
                                return true;
                        break;
                }
                case CAP_SETGID: {
                        kgid_t gid = make_kgid(ns->parent, id);

                        /* Only once setgroups has been disabled. */
                        if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
                            gid_eq(gid, cred->egid))
                                return true;
                        break;
                }
                default:
                        break;
                }
        }

        /* Allow anyone to set a mapping that doesn't require privilege */
        if (!cap_valid(cap_setid))
                return true;

        /*
         * Otherwise both the current task and the opener of the id file
         * need the capability (CAP_SETUID or CAP_SETGID) over the parent
         * user namespace.
         */
        return ns_capable(ns->parent, cap_setid) &&
               file_ns_capable(file, ns->parent, cap_setid);
}

/*
 * proc_setgroups_show - print "allow" or "deny" according to whether
 * USERNS_SETGROUPS_ALLOWED is set in the namespace's flags.
 *
 * Always returns 0.
 */
int proc_setgroups_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        unsigned long flags_snapshot = READ_ONCE(ns->flags);
        const char *state;

        if (flags_snapshot & USERNS_SETGROUPS_ALLOWED)
                state = "allow";
        else
                state = "deny";

        seq_printf(seq, "%s\n", state);

        return 0;
}

/*
 * proc_setgroups_write - handle a write of "allow" or "deny" to the
 * setgroups file of the user namespace in seq->private.
 *
 * The write must start at offset 0 and be shorter than 8 bytes; the
 * keyword may only be followed by whitespace.
 *
 * Returns @count on success, -EINVAL for a malformed write, -EFAULT if
 * the user buffer cannot be read, and -EPERM when "allow" is requested
 * after setgroups was disabled or "deny" is requested after the gid map
 * was written.
 */
ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *ns = seq->private;
        char text[8];
        const char *rest;
        bool want_allow;
        bool accepted;

        /* Only allow a very narrow range of strings to be written */
        if (*ppos != 0 || count >= sizeof(text))
                return -EINVAL;

        /* What was written? */
        if (copy_from_user(text, buf, count))
                return -EFAULT;
        text[count] = '\0';

        /* What is being requested? */
        if (strncmp(text, "allow", 5) == 0) {
                want_allow = true;
                rest = text + 5;
        } else if (strncmp(text, "deny", 4) == 0) {
                want_allow = false;
                rest = text + 4;
        } else {
                return -EINVAL;
        }

        /* Verify there is not trailing junk on the line */
        rest = skip_spaces(rest);
        if (*rest != '\0')
                return -EINVAL;

        mutex_lock(&userns_state_mutex);
        if (want_allow) {
                /* Enabling setgroups after setgroups has been disabled
                 * is not allowed.
                 */
                accepted = (ns->flags & USERNS_SETGROUPS_ALLOWED) != 0;
        } else {
                /* Permanently disabling setgroups after setgroups has
                 * been enabled by writing the gid_map is not allowed.
                 */
                accepted = ns->gid_map.nr_extents == 0;
                if (accepted)
                        ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
        }
        mutex_unlock(&userns_state_mutex);

        if (!accepted)
                return -EPERM;

        /* Report a successful write */
        *ppos = count;
        return count;
}

bool userns_may_setgroups(const struct user_namespace *ns)
{
        bool allowed;

        mutex_lock(&userns_state_mutex);
        /* It is not safe to use setgroups until a gid mapping in
         * the user namespace has been established.
         */
        allowed = ns->gid_map.nr_extents != 0;
        /* Is setgroups allowed? */
        allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
        mutex_unlock(&userns_state_mutex);

        return allowed;
}

/*
 * in_userns - is @child the namespace @ancestor itself or one of its
 * descendants?
 *
 * Walks up the parent chain from @child until reaching a namespace whose
 * level is not greater than @ancestor's, then compares that namespace
 * with @ancestor.
 */
bool in_userns(const struct user_namespace *ancestor,
               const struct user_namespace *child)
{
        const struct user_namespace *cursor = child;

        while (cursor->level > ancestor->level)
                cursor = cursor->parent;

        return cursor == ancestor;
}

/*
 * current_in_userns - is the current task's user namespace @target_ns or
 * one of its descendants?
 */
bool current_in_userns(const struct user_namespace *target_ns)
{
        const struct user_namespace *mine = current_user_ns();

        return in_userns(target_ns, mine);
}
EXPORT_SYMBOL(current_in_userns);

/* Map an embedded ns_common back to its containing user_namespace. */
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
        struct user_namespace *user_ns;

        user_ns = container_of(ns, struct user_namespace, ns);

        return user_ns;
}

static struct ns_common *userns_get(struct task_struct *task)
{
        struct user_namespace *user_ns;

        rcu_read_lock();
        user_ns = get_user_ns(__task_cred(task)->user_ns);
        rcu_read_unlock();

        return user_ns ? &user_ns->ns : NULL;
}

/* Drop a reference on the user namespace that embeds @ns. */
static void userns_put(struct ns_common *ns)
{
        struct user_namespace *user_ns = to_user_ns(ns);

        put_user_ns(user_ns);
}

/*
 * userns_install - switch the credentials prepared in @nsset over to the
 * user namespace that embeds @ns.
 *
 * Returns 0 on success; -EINVAL if the target is already the current user
 * namespace, the caller shares its thread group or fs_struct, @nsset has
 * no credentials, or set_cred_ucounts() fails; -EPERM if the caller lacks
 * CAP_SYS_ADMIN over the target namespace.
 */
static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct user_namespace *target = to_user_ns(ns);
        struct cred *new_cred;

        /*
         * Refuse, in this order:
         *  - re-entering the same user namespace, which would allow
         *    gaining capabilities;
         *  - a multi-threaded caller, since tasks that share a thread
         *    group must share a user namespace;
         *  - a caller whose fs_struct has more than one user.
         */
        if (target == current_user_ns() ||
            !thread_group_empty(current) ||
            current->fs->users != 1)
                return -EINVAL;

        if (!ns_capable(target, CAP_SYS_ADMIN))
                return -EPERM;

        new_cred = nsset_cred(nsset);
        if (!new_cred)
                return -EINVAL;

        /* Swap the namespace reference held by the credentials. */
        put_user_ns(new_cred->user_ns);
        set_cred_user_ns(new_cred, get_user_ns(target));

        return (set_cred_ucounts(new_cred) < 0) ? -EINVAL : 0;
}

/*
 * ns_get_owner - get a reference on the user namespace that owns @ns.
 *
 * The owner (as reported by ns->ops->owner()) is returned only if the
 * current user namespace is the owner itself or one of its ancestors;
 * otherwise ERR_PTR(-EPERM) is returned.
 */
struct ns_common *ns_get_owner(struct ns_common *ns)
{
        struct user_namespace *my_user_ns = current_user_ns();
        struct user_namespace *owner = ns->ops->owner(ns);
        struct user_namespace *cursor = owner;

        /* Climb from the owner until the current namespace is found. */
        while (cursor && cursor != my_user_ns)
                cursor = cursor->parent;

        if (!cursor)
                return ERR_PTR(-EPERM);

        return &get_user_ns(owner)->ns;
}

static struct user_namespace *userns_owner(struct ns_common *ns)
{
        return to_user_ns(ns)->parent;
}

/* proc namespace operations for the "user" namespace type. */
const struct proc_ns_operations userns_operations = {
        .name       = "user",
        .type       = CLONE_NEWUSER,
        .get        = userns_get,
        .put        = userns_put,
        .install    = userns_install,
        .owner      = userns_owner,
        .get_parent = ns_get_owner,
};

/*
 * Create the slab cache for struct user_namespace (SLAB_PANIC |
 * SLAB_ACCOUNT).  Registered as a subsys initcall; always returns 0.
 */
static __init int user_namespaces_init(void)
{
        user_ns_cachep = KMEM_CACHE(user_namespace,
                                    SLAB_ACCOUNT | SLAB_PANIC);

        return 0;
}
subsys_initcall(user_namespaces_init);


































































































































































































































































































































































































































































   11 







   12 
















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
// SPDX-License-Identifier: GPL-2.0
/*
 * bio-integrity.c - bio data integrity extensions
 *
 * Copyright (C) 2007, 2008, 2009 Oracle Corporation
 * Written by: Martin K. Petersen <martin.petersen@oracle.com>
 */

#include <linux/blk-integrity.h>
#include <linux/mempool.h>
#include <linux/export.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include "blk.h"

/*
 * NOTE(review): presumably the slab cache backing integrity payloads; it
 * is not referenced in the part of the file shown here - confirm against
 * the init code.
 */
static struct kmem_cache *bip_slab;
/* Workqueue that blk_flush_integrity() flushes. */
static struct workqueue_struct *kintegrityd_wq;

/* Flush the kintegrityd workqueue via flush_workqueue(). */
void blk_flush_integrity(void)
{
        struct workqueue_struct *wq = kintegrityd_wq;

        flush_workqueue(wq);
}

/*
 * __bio_integrity_free - release an integrity payload.
 *
 * If @bs is NULL or its integrity mempool is not initialized, @bip is
 * released with kfree().  Otherwise a non-NULL bip_vec is handed to
 * bvec_free() first and @bip is returned to the bio_set's integrity
 * mempool.
 */
static void __bio_integrity_free(struct bio_set *bs,
                                 struct bio_integrity_payload *bip)
{
        if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) {
                kfree(bip);
                return;
        }

        if (bip->bip_vec)
                bvec_free(&bs->bvec_integrity_pool, bip->bip_vec,
                          bip->bip_max_vcnt);

        mempool_free(bip, &bs->bio_integrity_pool);
}

/**
 * bio_integrity_alloc - Allocate integrity payload and attach it to bio
 * @bio:        bio to attach integrity metadata to
 * @gfp_mask:        Memory allocation mask
 * @nr_vecs:        Number of integrity metadata scatter-gather elements
 *
 * Description: This function prepares a bio for attaching integrity
 * metadata.  nr_vecs specifies the maximum number of pages containing
 * integrity metadata that can be attached.
 *
 * Return: the new payload, ERR_PTR(-EOPNOTSUPP) if the bio has a crypt
 * context, or ERR_PTR(-ENOMEM) if an allocation fails.
 */
struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
                                                  gfp_t gfp_mask,
                                                  unsigned int nr_vecs)
{
        struct bio_set *bs = bio->bi_pool;
        struct bio_integrity_payload *bip;
        unsigned inline_vecs;
        bool from_pool;

        if (WARN_ON_ONCE(bio_has_crypt_ctx(bio)))
                return ERR_PTR(-EOPNOTSUPP);

        from_pool = bs && mempool_initialized(&bs->bio_integrity_pool);
        if (from_pool) {
                bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask);
                inline_vecs = BIO_INLINE_VECS;
        } else {
                /* No pool: size the allocation so every vec is inline. */
                bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask);
                inline_vecs = nr_vecs;
        }

        if (unlikely(!bip))
                return ERR_PTR(-ENOMEM);

        memset(bip, 0, sizeof(*bip));

        /* always report as many vecs as asked explicitly, not inline vecs */
        bip->bip_max_vcnt = nr_vecs;
        if (nr_vecs <= inline_vecs) {
                bip->bip_vec = bip->bip_inline_vecs;
        } else {
                bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
                                          &bip->bip_max_vcnt, gfp_mask);
                if (!bip->bip_vec) {
                        __bio_integrity_free(bs, bip);
                        return ERR_PTR(-ENOMEM);
                }
        }

        bip->bip_bio = bio;
        bio->bi_integrity = bip;
        bio->bi_opf |= REQ_INTEGRITY;

        return bip;
}
EXPORT_SYMBOL(bio_integrity_alloc);

/*
 * bio_integrity_unpin_bvec - unpin the pages of the first @nr_vecs
 * entries of @bv.
 *
 * When @dirty is set, each page that is not a compound page is marked
 * dirty with set_page_dirty_lock() before it is unpinned.
 */
static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
                                     bool dirty)
{
        int idx;

        for (idx = 0; idx < nr_vecs; idx++) {
                struct page *page = bv[idx].bv_page;

                if (dirty && !PageCompound(page))
                        set_page_dirty_lock(page);

                unpin_user_page(page);
        }
}

static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
{
        unsigned short nr_vecs = bip->bip_max_vcnt - 1;
        struct bio_vec *copy = &bip->bip_vec[1];
        size_t bytes = bip->bip_iter.bi_size;
        struct iov_iter iter;
        int ret;

        iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
        ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
        WARN_ON_ONCE(ret != bytes);

        bio_integrity_unpin_bvec(copy, nr_vecs, true);
}

/*
 * bio_integrity_unmap_user - undo the user mapping of an integrity payload.
 *
 * Without BIP_COPY_USER, all bip_max_vcnt vecs are unpinned and marked
 * dirty if the bio was a READ.  With BIP_COPY_USER, a READ first copies
 * the data back via bio_integrity_uncopy_user(), and the buffer behind
 * bip_vec[0] is freed with kfree() in either direction.
 */
static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
{
        const bool was_read = bio_data_dir(bip->bip_bio) == READ;

        if (!(bip->bip_flags & BIP_COPY_USER)) {
                bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt,
                                         was_read);
                return;
        }

        if (was_read)
                bio_integrity_uncopy_user(bip);

        kfree(bvec_virt(bip->bip_vec));
}

/**
 * bio_integrity_free - Free bio integrity payload
 * @bio:        bio containing bip to be freed
 *
 * Description: Used to free the integrity portion of a bio. Usually
 * called from bio_free().
 */
void bio_integrity_free(struct bio *bio)
{
        struct bio_integrity_payload *bip = bio_integrity(bio);
        struct bio_set *bs = bio->bi_pool;

        if (bip->bip_flags & BIP_INTEGRITY_USER)
                return;
        if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
                kfree(bvec_virt(bip->bip_vec));

        __bio_integrity_free(bs, bip);
        bio->bi_integrity = NULL;
        bio->bi_opf &= ~REQ_INTEGRITY;
}

/**
 * bio_integrity_unmap_free_user - Unmap and free bio user integrity payload
 * @bio:        bio containing bip to be unmapped and freed
 *
 * Description: Used to unmap and free the user mapped integrity portion of a
 * bio. Submitter attaching the user integrity buffer is responsible for
 * unmapping and freeing it during completion.
 */
void bio_integrity_unmap_free_user(struct bio *bio)
{
        struct bio_integrity_payload *bip = bio_integrity(bio);
        struct bio_set *bs = bio->bi_pool;

        if (WARN_ON_ONCE(!(bip->bip_flags & BIP_INTEGRITY_USER)))
                return;
        bio_integrity_unmap_user(bip);
        __bio_integrity_free(bs, bip);
        bio->bi_integrity = NULL;
        bio->bi_opf &= ~REQ_INTEGRITY;
}
EXPORT_SYMBOL(bio_integrity_unmap_free_user);

/**
 * bio_integrity_add_page - Attach integrity metadata
 * @bio:        bio to update
 * @page:        page containing integrity metadata
 * @len:        number of bytes of integrity metadata in page
 * @offset:        start offset within page
 *
 * Description: Attach a page containing integrity metadata to bio, either by
 * merging it into the last integrity vector or by appending a new one.
 *
 * Return: @len on success, 0 if the page could not be added.
 */
int bio_integrity_add_page(struct bio *bio, struct page *page,
                           unsigned int len, unsigned int offset)
{
        struct bio_integrity_payload *bip = bio_integrity(bio);
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);

        /* Never let the payload grow past the queue's hardware limit. */
        if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) >
            queue_max_hw_sectors(q))
                return 0;

        if (bip->bip_vcnt > 0) {
                struct bio_vec *last = &bip->bip_vec[bip->bip_vcnt - 1];
                bool same_page = false;

                /* Cheapest case: extend the previous vector in place. */
                if (bvec_try_merge_hw_page(q, last, page, len, offset,
                                           &same_page)) {
                        bip->bip_iter.bi_size += len;
                        return len;
                }

                /* A new vector is needed; make sure there is room for it. */
                if (bip->bip_vcnt >=
                    min(bip->bip_max_vcnt, queue_max_integrity_segments(q)))
                        return 0;

                /*
                 * Queues that cannot handle SG gaps must not get a segment
                 * that would introduce one.
                 */
                if (bvec_gap_to_prev(&q->limits, last, offset))
                        return 0;
        }

        bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset);
        bip->bip_iter.bi_size += len;
        bip->bip_vcnt++;

        return len;
}
EXPORT_SYMBOL(bio_integrity_add_page);

/*
 * Attach a user integrity buffer to @bio through a kernel bounce buffer.
 *
 * @bio:        bio to attach the payload to
 * @bvec:       pinned user pages describing the user buffer
 * @nr_vecs:    number of entries in @bvec
 * @len:        length of the integrity buffer in bytes
 * @direction:  ITER_SOURCE for writes, anything else is treated as a read
 * @seed:       value stored in the payload's bip_iter.bi_sector
 *
 * Writes copy the user data into the bounce buffer up front and then no
 * longer need the user pages.  Reads get a zeroed bounce buffer and keep the
 * user bvecs in slots 1.. of the payload so completion can copy back.
 *
 * Returns 0 on success or a negative errno.  On failure the pins on @bvec
 * are left in place; the caller releases them.
 */
static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
                                   int nr_vecs, unsigned int len,
                                   unsigned int direction, u32 seed)
{
        bool write = direction == ITER_SOURCE;
        struct bio_integrity_payload *bip;
        struct iov_iter iter;
        void *buf;
        int ret;

        buf = kmalloc(len, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        if (write) {
                iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
                if (!copy_from_iter_full(buf, len, &iter)) {
                        ret = -EFAULT;
                        goto free_buf;
                }

                /* Only the bounce buffer itself needs a slot. */
                bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
        } else {
                memset(buf, 0, len);

                /*
                 * We need to preserve the original bvec and the number of vecs
                 * in it for completion handling
                 */
                bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1);
        }

        if (IS_ERR(bip)) {
                ret = PTR_ERR(bip);
                goto free_buf;
        }

        if (!write)
                memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec));

        ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
                                     offset_in_page(buf));
        if (ret != len) {
                ret = -ENOMEM;
                goto free_bip;
        }

        /*
         * Drop the pins only once nothing can fail any more.
         * bio_integrity_map_user() unpins @bvec itself whenever this
         * function returns an error, so unpinning before the last failure
         * point would release the same pages twice.
         */
        if (write)
                bio_integrity_unpin_bvec(bvec, nr_vecs, false);

        bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER;
        bip->bip_iter.bi_sector = seed;
        return 0;
free_bip:
        /* BIP_INTEGRITY_USER is not set yet, so this really frees the bip. */
        bio_integrity_free(bio);
free_buf:
        kfree(buf);
        return ret;
}

/*
 * Attach pinned user pages directly (no bounce buffer) as the integrity
 * payload of @bio.  The @nr_vecs entries of @bvec are copied into a newly
 * allocated payload that is flagged BIP_INTEGRITY_USER, seeded with @seed and
 * sized to @len bytes.
 *
 * Returns 0 on success or the error from bio_integrity_alloc().
 */
static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
                                   int nr_vecs, unsigned int len, u32 seed)
{
        struct bio_integrity_payload *payload =
                bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);

        if (IS_ERR(payload))
                return PTR_ERR(payload);

        payload->bip_flags |= BIP_INTEGRITY_USER;
        payload->bip_iter.bi_size = len;
        payload->bip_iter.bi_sector = seed;
        memcpy(payload->bip_vec, bvec, nr_vecs * sizeof(*bvec));

        return 0;
}

/*
 * Build bio_vecs from an array of pinned pages, folding runs of pages that
 * are adjacent in the page array's address order and belong to the same
 * folio into a single bvec.
 *
 * @bvec:       output array, must have room for up to @nr_vecs entries
 * @pages:      pinned pages covering the buffer
 * @nr_vecs:    number of entries in @pages
 * @bytes:      total length of the buffer
 * @offset:     offset of the buffer within the first page
 *
 * Every page that is folded into a preceding bvec is unpinned here, leaving
 * one pin per resulting bvec (held on its first page).
 *
 * Returns the number of bvecs written.
 */
static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
                                    int nr_vecs, ssize_t bytes, ssize_t offset)
{
        unsigned int count = 0;
        int head = 0;

        while (head < nr_vecs) {
                struct folio *head_folio = page_folio(pages[head]);
                size_t seg_len = min_t(size_t, bytes, PAGE_SIZE - offset);
                int tail = head + 1;

                bytes -= seg_len;

                /* Absorb following pages while they stay contiguous. */
                while (tail < nr_vecs &&
                       page_folio(pages[tail]) == head_folio &&
                       pages[tail] == pages[tail - 1] + 1) {
                        size_t chunk = min_t(size_t, PAGE_SIZE, bytes);

                        unpin_user_page(pages[tail]);
                        seg_len += chunk;
                        bytes -= chunk;
                        tail++;
                }

                bvec_set_page(&bvec[count], pages[head], seg_len, offset);
                count++;

                /* Only the very first segment starts mid-page. */
                offset = 0;
                head = tail;
        }

        return count;
}

/*
 * bio_integrity_map_user - attach a user-space buffer as integrity payload
 * @bio:        bio to attach the payload to; must not have one already
 * @ubuf:       user buffer holding (writes) or receiving (reads) the metadata
 * @bytes:      length of @ubuf in bytes
 * @seed:       value stored in the payload's bip_iter.bi_sector
 *
 * Pins the user pages behind @ubuf and attaches them to @bio, either directly
 * or through a kernel bounce buffer when the buffer is misaligned for the
 * queue or needs more segments than the queue supports.
 *
 * Returns 0 on success, -EINVAL if @bio already has an integrity payload,
 * -E2BIG if the buffer is too large, -ENOMEM on allocation failure, or the
 * error from page extraction / payload setup.
 */
int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
                           u32 seed)
{
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        unsigned int align = q->dma_pad_mask | queue_dma_alignment(q);
        struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
        struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
        unsigned int direction, nr_bvecs;
        struct iov_iter iter;
        int ret, nr_vecs;
        size_t offset;
        bool copy;

        if (bio_integrity(bio))
                return -EINVAL;
        if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q))
                return -E2BIG;

        /* Reads fill the user buffer, writes consume it. */
        if (bio_data_dir(bio) == READ)
                direction = ITER_DEST;
        else
                direction = ITER_SOURCE;

        iov_iter_ubuf(&iter, direction, ubuf, bytes);
        /* Ask for one more than the limit so "too many" is detectable. */
        nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
        if (nr_vecs > BIO_MAX_VECS)
                return -E2BIG;
        if (nr_vecs > UIO_FASTIOV) {
                /* Too big for the on-stack arrays: use the heap instead. */
                bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL);
                if (!bvec)
                        return -ENOMEM;
                /*
                 * NOTE(review): a NULL page array presumably makes
                 * iov_iter_extract_pages() allocate one; it is kvfree'd below
                 * whenever it is not stack_pages. Confirm against the helper.
                 */
                pages = NULL;
        }

        /* Misaligned buffers cannot be mapped directly and must be bounced. */
        copy = !iov_iter_is_aligned(&iter, align, align);
        ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
        if (unlikely(ret < 0))
                goto free_bvec;

        /* Collapse the page array into bvecs; the array is not needed after. */
        nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset);
        if (pages != stack_pages)
                kvfree(pages);
        /* More segments than the queue accepts also forces a bounce. */
        if (nr_bvecs > queue_max_integrity_segments(q))
                copy = true;

        if (copy)
                ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
                                              direction, seed);
        else
                ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed);
        if (ret)
                goto release_pages;
        /* Both helpers copied what they need out of bvec. */
        if (bvec != stack_vec)
                kfree(bvec);

        return 0;

release_pages:
        /*
         * Every helper failure ends up here, so the helpers are expected to
         * leave the pins on bvec in place when they return an error.
         */
        bio_integrity_unpin_bvec(bvec, nr_bvecs, false);
free_bvec:
        if (bvec != stack_vec)
                kfree(bvec);
        return ret;
}
EXPORT_SYMBOL_GPL(bio_integrity_map_user);

/**
 * bio_integrity_process - Process integrity metadata for a bio
 * @bio:        bio to generate/verify integrity metadata for
 * @proc_iter:  iterator to process
 * @proc_fn:        Pointer to the relevant processing function
 *
 * Description: Walks the data segments of @bio described by @proc_iter and
 * calls @proc_fn on each of them, stopping at the first segment for which it
 * reports an error.
 *
 * Return: BLK_STS_OK, or the first non-zero status returned by @proc_fn.
 */
static blk_status_t bio_integrity_process(struct bio *bio,
                struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn)
{
        struct bio_integrity_payload *bip = bio_integrity(bio);
        struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
        struct blk_integrity_iter pi_iter;
        blk_status_t status = BLK_STS_OK;
        struct bvec_iter cursor;
        struct bio_vec seg;

        /* State shared by all segments of this bio. */
        pi_iter.prot_buf = bvec_virt(bip->bip_vec);
        pi_iter.seed = proc_iter->bi_sector;
        pi_iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
        pi_iter.tuple_size = bi->tuple_size;
        pi_iter.pi_offset = bi->pi_offset;
        pi_iter.interval = 1 << bi->interval_exp;

        __bio_for_each_segment(seg, bio, cursor, *proc_iter) {
                void *mapped = bvec_kmap_local(&seg);

                pi_iter.data_size = seg.bv_len;
                pi_iter.data_buf = mapped;
                status = proc_fn(&pi_iter);
                kunmap_local(mapped);

                if (status)
                        break;
        }

        return status;
}

/**
 * bio_integrity_prep - Prepare bio for integrity I/O
 * @bio:        bio to prepare
 *
 * Description:  Checks if the bio already has an integrity payload attached.
 * If it does, the payload has been generated by another kernel subsystem,
 * and we just pass it through. Otherwise allocates integrity payload.
 * The bio must have data direction, target device and start sector set priot
 * to calling.  In the WRITE case, integrity metadata will be generated using
 * the block device's integrity function.  In the READ case, the buffer
 * will be prepared for DMA and a suitable end_io handler set up.
 */
bool bio_integrity_prep(struct bio *bio)
{
        struct bio_integrity_payload *bip;
        struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
        void *buf;
        unsigned long start, end;
        unsigned int len, nr_pages;
        unsigned int bytes, offset, i;

        if (!bi)
                return true;

        if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
                return true;

        if (!bio_sectors(bio))
                return true;

        /* Already protected? */
        if (bio_integrity(bio))
                return true;

        if (bio_data_dir(bio) == READ) {
                if (!bi->profile->verify_fn ||
                    !(bi->flags & BLK_INTEGRITY_VERIFY))
                        return true;
        } else {
                if (!bi->profile->generate_fn ||
                    !(bi->flags & BLK_INTEGRITY_GENERATE))
                        return true;
        }

        /* Allocate kernel buffer for protection data */
        len = bio_integrity_bytes(bi, bio_sectors(bio));
        buf = kmalloc(len, GFP_NOIO);
        if (unlikely(buf == NULL)) {
                printk(KERN_ERR "could not allocate integrity buffer\n");
                goto err_end_io;
        }

        end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        start = ((unsigned long) buf) >> PAGE_SHIFT;
        nr_pages = end - start;

        /* Allocate bio integrity payload and integrity vectors */
        bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
        if (IS_ERR(bip)) {
                printk(KERN_ERR "could not allocate data integrity bioset\n");
                kfree(buf);
                goto err_end_io;
        }

        bip->bip_flags |= BIP_BLOCK_INTEGRITY;
        bip_set_seed(bip, bio->bi_iter.bi_sector);

        if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
                bip->bip_flags |= BIP_IP_CHECKSUM;

        /* Map it */
        offset = offset_in_page(buf);
        for (i = 0; i < nr_pages && len > 0; i++) {
                bytes = PAGE_SIZE - offset;

                if (bytes > len)
                        bytes = len;

                if (bio_integrity_add_page(bio, virt_to_page(buf),
                                           bytes, offset) < bytes) {
                        printk(KERN_ERR "could not attach integrity payload\n");
                        goto err_end_io;
                }

                buf += bytes;
                len -= bytes;
                offset = 0;
        }

        /* Auto-generate integrity metadata if this is a write */
        if (bio_data_dir(bio) == WRITE) {
                bio_integrity_process(bio, &bio->bi_iter,
                                      bi->profile->generate_fn);
        } else {
                bip->bio_iter = bio->bi_iter;
        }
        return true;

err_end_io:
        bio->bi_status = BLK_STS_RESOURCE;
        bio_endio(bio);
        return false;
}
EXPORT_SYMBOL(bio_integrity_prep);

/**
 * bio_integrity_verify_fn - Integrity I/O completion worker
 * @work:        Work struct stored in bio to be verified
 *
 * Description: Workqueue callback that finishes a READ request: it checks
 * the transferred integrity metadata, records the outcome in the bio's
 * status, frees the payload and then completes the bio.
 */
static void bio_integrity_verify_fn(struct work_struct *work)
{
        struct bio_integrity_payload *bip;
        struct blk_integrity *bi;
        struct bio *bio;

        bip = container_of(work, struct bio_integrity_payload, bip_work);
        bio = bip->bip_bio;
        bi = blk_get_integrity(bio->bi_bdev->bd_disk);

        /*
         * The bio's own iterator has been advanced by splitting and
         * completion, so verification walks the copy saved in the payload,
         * which still points at the original position.
         */
        bio->bi_status = bio_integrity_process(bio, &bip->bio_iter,
                                                bi->profile->verify_fn);

        bio_integrity_free(bio);
        bio_endio(bio);
}

/**
 * __bio_integrity_endio - Integrity I/O completion function
 * @bio:        Protected bio
 *
 * Description: Completion for integrity I/O
 *
 * I/O completion normally runs in interrupt context, but verifying
 * integrity metadata is time-consuming and has to happen in process
 * context.  Successful reads with a block-layer payload and a verify
 * function are therefore handed to the kintegrityd workqueue.
 *
 * Return: false if completion was deferred to the workqueue, true if the
 * payload was freed and the caller may go on completing @bio.
 */
bool __bio_integrity_endio(struct bio *bio)
{
        struct bio_integrity_payload *bip = bio_integrity(bio);
        struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
        bool need_verify = bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
                           (bip->bip_flags & BIP_BLOCK_INTEGRITY) &&
                           bi->profile->verify_fn;

        if (!need_verify) {
                bio_integrity_free(bio);
                return true;
        }

        INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
        queue_work(kintegrityd_wq, &bip->bip_work);
        return false;
}

/**
 * bio_integrity_advance - Advance integrity vector
 * @bio:        bio whose integrity vector to update
 * @bytes_done:        number of data bytes that have been completed
 *
 * Description: Converts the completed data bytes into the matching number
 * of integrity intervals and integrity bytes, then moves the payload's seed
 * and vector iterator forward by those amounts.
 */
void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
{
        struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
        struct bio_integrity_payload *bip = bio_integrity(bio);
        unsigned int sectors_done = bytes_done >> 9;
        unsigned int pi_bytes = bio_integrity_bytes(bi, sectors_done);

        bip->bip_iter.bi_sector += bio_integrity_intervals(bi, sectors_done);
        bvec_iter_advance(bip->bip_vec, &bip->bip_iter, pi_bytes);
}

/**
 * bio_integrity_trim - Trim integrity vector
 * @bio:        bio whose integrity vector to update
 *
 * Description: Resizes the integrity iterator to the number of integrity
 * bytes that corresponds to the bio's current sector count.  Used to trim
 * the integrity vector in a cloned bio.
 */
void bio_integrity_trim(struct bio *bio)
{
        struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
        struct bio_integrity_payload *payload = bio_integrity(bio);
        unsigned int pi_bytes = bio_integrity_bytes(bi, bio_sectors(bio));

        payload->bip_iter.bi_size = pi_bytes;
}
EXPORT_SYMBOL(bio_integrity_trim);

/**
 * bio_integrity_clone - Callback for cloning bios with integrity metadata
 * @bio:        New bio
 * @bio_src:        Original bio
 * @gfp_mask:        Memory allocation mask
 *
 * Description:        Allocates a payload for @bio and copies the vectors,
 * iterator and flags of @bio_src's payload into it.  BIP_BLOCK_INTEGRITY is
 * masked out of the copied flags.  @bio_src must have a payload.
 *
 * Return: 0 on success or the error from bio_integrity_alloc().
 */
int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
                        gfp_t gfp_mask)
{
        struct bio_integrity_payload *src = bio_integrity(bio_src);
        struct bio_integrity_payload *dst;

        BUG_ON(src == NULL);

        dst = bio_integrity_alloc(bio, gfp_mask, src->bip_vcnt);
        if (IS_ERR(dst))
                return PTR_ERR(dst);

        memcpy(dst->bip_vec, src->bip_vec,
               src->bip_vcnt * sizeof(*src->bip_vec));

        dst->bip_flags = src->bip_flags & ~BIP_BLOCK_INTEGRITY;
        dst->bip_iter = src->bip_iter;
        dst->bip_vcnt = src->bip_vcnt;

        return 0;
}

/*
 * Set up the integrity mempools of a bio_set: one for payloads (backed by
 * bip_slab) and one for integrity bio_vecs, each with @pool_size reserved
 * elements.  Does nothing if the payload pool is already initialized.
 *
 * Returns 0 on success (or if already set up), -1 on failure.
 */
int bioset_integrity_create(struct bio_set *bs, int pool_size)
{
        int err;

        if (mempool_initialized(&bs->bio_integrity_pool))
                return 0;

        err = mempool_init_slab_pool(&bs->bio_integrity_pool,
                                     pool_size, bip_slab);
        if (err)
                return -1;

        err = biovec_init_pool(&bs->bvec_integrity_pool, pool_size);
        if (!err)
                return 0;

        /* Undo the first pool so the bio_set is left fully unset. */
        mempool_exit(&bs->bio_integrity_pool);
        return -1;
}
EXPORT_SYMBOL(bioset_integrity_create);

/*
 * Tear down both integrity mempools of @bs (payload pool and integrity
 * bio_vec pool) that bioset_integrity_create() sets up.
 */
void bioset_integrity_free(struct bio_set *bs)
{
        mempool_exit(&bs->bio_integrity_pool);
        mempool_exit(&bs->bvec_integrity_pool);
}

/*
 * Boot-time setup for bio integrity: creates the kintegrityd workqueue and
 * the slab cache that payloads (with their inline vectors) come from.
 * Panics if the workqueue cannot be created.
 */
void __init bio_integrity_init(void)
{
        /* A payload is allocated together with its inline bio_vecs. */
        const size_t payload_size = sizeof(struct bio_integrity_payload) +
                                    sizeof(struct bio_vec) * BIO_INLINE_VECS;

        /*
         * kintegrityd rarely blocks but can be CPU hungry, hence a
         * high-priority, CPU-intensive workqueue limited to a max
         * concurrency of 1.
         */
        kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
                                         WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
        if (!kintegrityd_wq)
                panic("Failed to create kintegrityd\n");

        bip_slab = kmem_cache_create("bio_integrity_payload", payload_size,
                                     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
}





















































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    8 

























    1 













    5 
    7 






    8 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    8 
















    9 












    8 





    9 






























































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 1993 by Theodore Ts'o.
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkpg.h>
#include <linux/init.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/suspend.h>
#include <linux/freezer.h>
#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/splice.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/ioprio.h>
#include <linux/blk-cgroup.h>
#include <linux/sched/mm.h>
#include <linux/statfs.h>
#include <linux/uaccess.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <uapi/linux/loop.h>

/*
 * Possible states of device (stored in loop_device::lo_state).
 *
 * In the code visible in this part of the file, loop_configure() refuses to
 * run unless the state is Lo_unbound, while loop_change_fd() and
 * loop_validate_file() require Lo_bound.
 */
enum {
        Lo_unbound,        /* required by loop_configure() before binding */
        Lo_bound,        /* required by loop_change_fd()/loop_validate_file() */
        Lo_rundown,        /* NOTE(review): presumably teardown in progress - confirm */
        Lo_deleting,        /* NOTE(review): presumably device removal in progress - confirm */
};

struct loop_func_table;

/*
 * Per-device state of one loop block device.
 */
struct loop_device {
        /* Device index; printed as "loop%d" in diagnostics. */
        int                lo_number;
        /* Byte offset into the backing file where the device data starts. */
        loff_t                lo_offset;
        /* If > 0, upper bound in bytes on the usable size (see get_size()). */
        loff_t                lo_sizelimit;
        /* LO_FLAGS_* bits (READ_ONLY, DIRECT_IO, PARTSCAN, AUTOCLEAR, ...). */
        int                lo_flags;
        /* Copied from loop_info64 and NUL-terminated by loop_set_status_from_info(). */
        char                lo_file_name[LO_NAME_SIZE];

        /* File that all reads/writes/fallocate/fsync calls are issued against. */
        struct file *        lo_backing_file;
        /*
         * Queried with bdev_max_discard_sectors() in lo_fallocate().
         * NOTE(review): presumably the loop device's own block_device - confirm.
         */
        struct block_device *lo_device;

        /* gfp mask of the backing file's mapping as it was before this driver narrowed it. */
        gfp_t                old_gfp_mask;

        /* Held by loop_attr_backing_file_show() while it reads lo_backing_file. */
        spinlock_t                lo_lock;
        /* One of the Lo_* state values. */
        int                        lo_state;
        /* Held by loop_queue_work()/loop_free_idle_workers() around the worker fields below. */
        spinlock_t              lo_work_lock;
        /* Workqueue that rootcg_work and per-cgroup worker work items are queued on. */
        struct workqueue_struct *workqueue;
        /* Work item and command list used when a command is not routed to a per-cgroup worker. */
        struct work_struct      rootcg_work;
        struct list_head        rootcg_cmd_list;
        /* Workers that loop_free_idle_workers() may reap. */
        struct list_head        idle_worker_list;
        /* struct loop_worker nodes, ordered by blkcg_css pointer value. */
        struct rb_root          worker_tree;
        /* Re-armed by loop_set_timer(); fires loop_free_idle_workers_timer(). */
        struct timer_list       timer;
        /* Direct I/O currently in effect; maintained by __loop_update_dio(). */
        bool                        use_dio;
        /* True when loop_sysfs_init() successfully created the "loop" attribute group. */
        bool                        sysfs_inited;

        struct request_queue        *lo_queue;
        struct blk_mq_tag_set        tag_set;
        struct gendisk                *lo_disk;
        /* Per-device mutex taken by loop_global_lock_killable(). */
        struct mutex                lo_mutex;
        /* NOTE(review): not referenced in this part of the file - confirm meaning. */
        bool                        idr_visible;
};

/*
 * Per-request driver data, obtained from a request with blk_mq_rq_to_pdu()
 * and mapped back with blk_mq_rq_from_pdu().
 */
struct loop_cmd {
        /* Links the command into a worker's cmd_list or lo->rootcg_cmd_list. */
        struct list_head list_entry;
        bool use_aio; /* use AIO interface to handle I/O */
        atomic_t ref; /* only for aio */
        /* Result of the I/O: negative errno or number of bytes transferred. */
        long ret;
        /* kiocb submitted by lo_rw_aio(); its completion recovers the cmd via container_of(). */
        struct kiocb iocb;
        /* Flattened bvec array allocated by lo_rw_aio() for multi-bio requests; freed on completion. */
        struct bio_vec *bvec;
        /* Selects the per-cgroup worker in loop_queue_work(); NULL routes to the root worker. */
        struct cgroup_subsys_state *blkcg_css;
        /* Reference dropped by loop_queue_work() when it falls back to the root worker. */
        struct cgroup_subsys_state *memcg_css;
};

/* Idle time after which loop_free_idle_workers() reaps a per-cgroup worker. */
#define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
/* NOTE(review): not used in this part of the file; presumably the default tag-set queue depth - confirm. */
#define LOOP_DEFAULT_HW_Q_DEPTH 128

/* NOTE(review): loop_index_idr and loop_ctl_mutex are not used in this part of the file. */
static DEFINE_IDR(loop_index_idr);
static DEFINE_MUTEX(loop_ctl_mutex);
/* Global lock taken by loop_global_lock_killable() when binding to another loop device. */
static DEFINE_MUTEX(loop_validate_mutex);

/**
 * loop_global_lock_killable() - take the lock(s) needed around loop_validate_file()
 *
 * @lo: struct loop_device whose lo_mutex is taken
 * @global: true if @lo is about to bind another "struct loop_device", in which
 *          case loop_validate_mutex is taken first
 *
 * loop_validate_file() walks other "struct loop_device" instances when
 * is_loop_device() is true, so concurrent
 * loop_configure()/loop_change_fd()/__loop_clr_fd() calls must be serialized
 * by a global lock in that case.
 *
 * Return: 0 with the requested lock(s) held, or the non-zero error from
 * mutex_lock_killable() with no lock held.
 */
static int loop_global_lock_killable(struct loop_device *lo, bool global)
{
        int ret = 0;

        if (global)
                ret = mutex_lock_killable(&loop_validate_mutex);
        if (ret)
                return ret;

        ret = mutex_lock_killable(&lo->lo_mutex);
        if (!ret)
                return 0;

        /* The per-device lock failed: back out of the global lock if we hold it. */
        if (global)
                mutex_unlock(&loop_validate_mutex);
        return ret;
}

/**
 * loop_global_unlock() - drop the lock(s) taken by loop_global_lock_killable()
 *
 * @lo: struct loop_device whose lo_mutex is released
 * @global: must match the value passed to loop_global_lock_killable(); when
 *          true, loop_validate_mutex is released as well
 *
 * Locks are released in the reverse order of acquisition.
 */
static void loop_global_unlock(struct loop_device *lo, bool global)
{
        mutex_unlock(&lo->lo_mutex);
        if (!global)
                return;
        mutex_unlock(&loop_validate_mutex);
}

/*
 * NOTE(review): neither variable is referenced in this part of the file; the
 * names suggest partition-count configuration - confirm against their users.
 */
static int max_part;
static int part_shift;

/*
 * Compute the size, in 512-byte sectors, of a loop device backed by @file.
 *
 * @offset:    byte offset into the file; only subtracted when positive
 * @sizelimit: if positive and smaller than the remaining bytes, caps the size
 * @file:      backing file whose inode size is read
 *
 * Returns 0 when the remaining byte count is negative (offset beyond i_size).
 */
static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
{
        loff_t bytes = i_size_read(file->f_mapping->host);

        if (offset > 0)
                bytes -= offset;

        /* An offset past the end of the file is odd but can happen. */
        if (bytes < 0)
                return 0;

        if (sizelimit > 0 && bytes > sizelimit)
                bytes = sizelimit;

        /*
         * I/O on the device needs the count of 512-byte sectors to fit in a
         * sector_t, so report the size in sectors.
         */
        return bytes >> 9;
}

/*
 * Size in 512-byte sectors that @lo would have on top of @file, using the
 * device's current lo_offset and lo_sizelimit.
 */
static loff_t get_loop_size(struct loop_device *lo, struct file *file)
{
        loff_t offset = lo->lo_offset;
        loff_t sizelimit = lo->lo_sizelimit;

        return get_size(offset, sizelimit, file);
}

/*
 * Direct I/O is only usable when the loop queue's logical block size is not
 * smaller than the backing device's logical block size, and lo_offset is
 * aligned to the backing device's logical block size.
 */
static bool lo_bdev_can_use_dio(struct loop_device *lo,
                struct block_device *backing_bdev)
{
        unsigned short backing_bsize = bdev_logical_block_size(backing_bdev);

        return queue_logical_block_size(lo->lo_queue) >= backing_bsize &&
               !(lo->lo_offset & (backing_bsize - 1));
}

/*
 * Switch direct I/O on or off for @lo.
 *
 * @dio is the requested setting. It only takes effect when the backing file
 * has FMODE_CAN_ODIRECT and, if a backing block device can be identified,
 * lo_bdev_can_use_dio() accepts it. Nothing is done when the effective
 * setting equals the current lo->use_dio.
 */
static void __loop_update_dio(struct loop_device *lo, bool dio)
{
        struct file *file = lo->lo_backing_file;
        struct inode *inode = file->f_mapping->host;
        struct block_device *backing_bdev = NULL;
        bool use_dio;

        /*
         * The backing file is either a block device itself or lives on a
         * filesystem that may have a block device (s_bdev can be NULL).
         */
        if (S_ISBLK(inode->i_mode))
                backing_bdev = I_BDEV(inode);
        else if (inode->i_sb->s_bdev)
                backing_bdev = inode->i_sb->s_bdev;

        use_dio = dio && (file->f_mode & FMODE_CAN_ODIRECT) &&
                (!backing_bdev || lo_bdev_can_use_dio(lo, backing_bdev));

        if (lo->use_dio == use_dio)
                return;

        /* flush dirty pages before changing direct IO */
        vfs_fsync(file, 0);

        /*
         * The flag of LO_FLAGS_DIRECT_IO is handled similarly with
         * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup
         * will get updated by ioctl(LOOP_GET_STATUS)
         */
        /* The queue is only frozen around the switch while the device is bound. */
        if (lo->lo_state == Lo_bound)
                blk_mq_freeze_queue(lo->lo_queue);
        lo->use_dio = use_dio;
        /* QUEUE_FLAG_NOMERGES is cleared with direct I/O and set without it. */
        if (use_dio) {
                blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, lo->lo_queue);
                lo->lo_flags |= LO_FLAGS_DIRECT_IO;
        } else {
                blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue);
                lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
        }
        if (lo->lo_state == Lo_bound)
                blk_mq_unfreeze_queue(lo->lo_queue);
}

/**
 * loop_set_size() - set the device size and make sure userspace is notified
 * @lo: struct loop_device to set the size for
 * @size: new size of the loop device
 *
 * When set_capacity_and_notify() returns false, a KOBJ_CHANGE uevent is sent
 * from here instead.
 *
 * Callers must validate that the size passed into this function fits into
 * a sector_t, eg using loop_validate_size()
 */
static void loop_set_size(struct loop_device *lo, loff_t size)
{
        if (set_capacity_and_notify(lo->lo_disk, size))
                return;

        kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
}

/*
 * Write a single bio_vec to @file at position *@ppos.
 *
 * Returns 0 when vfs_iter_write() wrote exactly bvec->bv_len bytes. Otherwise
 * a rate-limited error is logged and the negative error from
 * vfs_iter_write() is returned, or -EIO for a short (non-negative) write.
 */
static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
{
        struct iov_iter i;
        ssize_t bw;

        iov_iter_bvec(&i, ITER_SOURCE, bvec, 1, bvec->bv_len);

        bw = vfs_iter_write(file, &i, ppos, 0);

        if (likely(bw ==  bvec->bv_len))
                return 0;

        /* bv_len is an unsigned quantity, so it is printed with %u. */
        printk_ratelimited(KERN_ERR
                "loop: Write error at byte offset %llu, length %u.\n",
                (unsigned long long)*ppos, bvec->bv_len);
        if (bw >= 0)
                bw = -EIO;
        return bw;
}

/*
 * Write every segment of @rq to the backing file, starting at byte @pos.
 *
 * Returns 0 on success or the first negative error reported by
 * lo_write_bvec(); nothing further is written after an error.
 */
static int lo_write_simple(struct loop_device *lo, struct request *rq,
                loff_t pos)
{
        struct bio_vec bvec;
        struct req_iterator iter;
        int ret = 0;

        rq_for_each_segment(bvec, rq, iter) {
                ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos);
                /*
                 * Return instead of 'break': rq_for_each_segment() expands to
                 * a loop over bios wrapping a loop over segments (see
                 * include/linux/blk-mq.h), so 'break' would only leave the
                 * current bio and a later bio could overwrite this error.
                 */
                if (ret < 0)
                        return ret;
                cond_resched();
        }

        return ret;
}

/*
 * Read every segment of @rq from the backing file, starting at byte @pos.
 *
 * Returns the negative error from vfs_iter_read() if a read fails. On a short
 * read all bios of the request are zero-filled, reading stops, and 0 is
 * returned. Returns 0 when all segments were read in full.
 */
static int lo_read_simple(struct loop_device *lo, struct request *rq,
                loff_t pos)
{
        struct bio_vec bvec;
        struct req_iterator iter;
        struct iov_iter i;
        ssize_t len;

        rq_for_each_segment(bvec, rq, iter) {
                iov_iter_bvec(&i, ITER_DEST, &bvec, 1, bvec.bv_len);
                len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
                if (len < 0)
                        return len;

                flush_dcache_page(bvec.bv_page);

                if (len != bvec.bv_len) {
                        struct bio *bio;

                        __rq_for_each_bio(bio, rq)
                                zero_fill_bio(bio);
                        /*
                         * Return instead of 'break': rq_for_each_segment()
                         * expands to a loop over bios wrapping a loop over
                         * segments (see include/linux/blk-mq.h), so 'break'
                         * would only leave the current bio and the following
                         * bios would still be read after the zero-fill.
                         */
                        return 0;
                }
                cond_resched();
        }

        return 0;
}

/*
 * Withdraw the queue limits that advertise the fallocate operations named in
 * @mode: FALLOC_FL_PUNCH_HOLE clears the discard limits and
 * FALLOC_FL_ZERO_RANGE clears the write-zeroes limit. The updated limits are
 * committed to lo->lo_queue.
 */
static void loop_clear_limits(struct loop_device *lo, int mode)
{
        struct queue_limits lim = queue_limits_start_update(lo->lo_queue);

        if (mode & FALLOC_FL_PUNCH_HOLE) {
                lim.discard_granularity = 0;
                lim.max_hw_discard_sectors = 0;
        }

        if (mode & FALLOC_FL_ZERO_RANGE)
                lim.max_write_zeroes_sectors = 0;

        queue_limits_commit_update(lo->lo_queue, &lim);
}

/*
 * Service a discard / write-zeroes request by calling fallocate() on the
 * backing file (the space mappings of the image are what gets manipulated).
 *
 * @mode is the FALLOC_FL_* operation; FALLOC_FL_KEEP_SIZE is always added.
 *
 * Returns -EOPNOTSUPP without calling fallocate() when
 * bdev_max_discard_sectors(lo->lo_device) is 0. Otherwise returns 0, -EINVAL
 * or -EOPNOTSUPP as reported by fallocate(), and -EIO for any other error.
 */
static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos,
                        int mode)
{
        struct file *file = lo->lo_backing_file;
        int ret;

        mode |= FALLOC_FL_KEEP_SIZE;

        if (!bdev_max_discard_sectors(lo->lo_device))
                return -EOPNOTSUPP;

        ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));

        switch (ret) {
        case 0:
        case -EINVAL:
                return ret;
        case -EOPNOTSUPP:
                /*
                 * The limits were configured in the hope that fallocate is
                 * supported; take them back now that it turned out not to be.
                 */
                loop_clear_limits(lo, mode);
                return ret;
        default:
                return -EIO;
        }
}

/*
 * Handle a flush request by fsync'ing the backing file.
 *
 * Returns 0 or -EINVAL as reported by vfs_fsync(); any other failure is
 * reported as -EIO.
 */
static int lo_req_flush(struct loop_device *lo, struct request *rq)
{
        int err = vfs_fsync(lo->lo_backing_file, 0);

        if (unlikely(err && err != -EINVAL))
                return -EIO;

        return err;
}

/*
 * Finish (or retry) a request once its loop_cmd result is known.
 *
 * Everything except a short AIO read is ended right away, with an error
 * status when cmd->ret is negative. A short AIO read that returned some data
 * is advanced by that amount and requeued; one that returned no data is
 * zero-filled and ended with BLK_STS_IOERR.
 */
static void lo_complete_rq(struct request *rq)
{
        struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
        blk_status_t status = BLK_STS_OK;
        struct bio *bio;

        if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) ||
            req_op(rq) != REQ_OP_READ) {
                if (cmd->ret < 0)
                        status = errno_to_blk_status(cmd->ret);
                blk_mq_end_request(rq, status);
                return;
        }

        /* Short READ with some data: account for it and retry the rest. */
        if (cmd->ret) {
                blk_update_request(rq, BLK_STS_OK, cmd->ret);
                cmd->ret = 0;
                blk_mq_requeue_request(rq, true);
                return;
        }

        /* Short READ with no data at all: end the rest with EIO. */
        if (cmd->use_aio) {
                for (bio = rq->bio; bio; bio = bio->bi_next)
                        zero_fill_bio(bio);
        }
        blk_mq_end_request(rq, BLK_STS_IOERR);
}

/*
 * Drop one reference on an AIO command. Whoever drops the last reference
 * frees the flattened bvec array and completes the request, unless
 * blk_should_fake_timeout() asks for the completion to be withheld.
 */
static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
{
        struct request *rq = blk_mq_rq_from_pdu(cmd);

        if (atomic_dec_and_test(&cmd->ref)) {
                kfree(cmd->bvec);
                cmd->bvec = NULL;
                if (likely(!blk_should_fake_timeout(rq->q)))
                        blk_mq_complete_request(rq);
        }
}

/*
 * kiocb completion callback: record @ret in the loop_cmd that embeds @iocb
 * and drop one completion reference.
 */
static void lo_rw_aio_complete(struct kiocb *iocb, long ret)
{
        struct loop_cmd *owner = container_of(iocb, struct loop_cmd, iocb);

        owner->ret = ret;
        lo_rw_aio_do_completion(owner);
}

/*
 * Submit the whole request as one direct-I/O kiocb on the backing file.
 *
 * @pos is the byte position in the backing file, @rw is ITER_SOURCE for a
 * write and anything else (ITER_DEST in the visible callers) for a read.
 *
 * Returns -EIO if the flattened bvec array cannot be allocated, otherwise 0;
 * the I/O result itself is delivered through cmd->ret by
 * lo_rw_aio_complete().
 */
static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
                     loff_t pos, int rw)
{
        struct iov_iter iter;
        struct req_iterator rq_iter;
        struct bio_vec *bvec;
        struct request *rq = blk_mq_rq_from_pdu(cmd);
        struct bio *bio = rq->bio;
        struct file *file = lo->lo_backing_file;
        struct bio_vec tmp;
        unsigned int offset;
        int nr_bvec = 0;
        int ret;

        /* Count the bvecs of the request to size the iterator (and the copy below). */
        rq_for_each_bvec(tmp, rq, rq_iter)
                nr_bvec++;

        /* More than one bio: the bvecs must be gathered into one flat array. */
        if (rq->bio != rq->biotail) {

                bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
                                     GFP_NOIO);
                if (!bvec)
                        return -EIO;
                cmd->bvec = bvec;

                /*
                 * The bios of the request may be started from the middle of
                 * the 'bvec' because of bio splitting, so we can't directly
                 * copy bio->bi_io_vec to new bvec. The rq_for_each_bvec
                 * API will take care of all details for us.
                 */
                rq_for_each_bvec(tmp, rq, rq_iter) {
                        *bvec = tmp;
                        bvec++;
                }
                bvec = cmd->bvec;
                offset = 0;
        } else {
                /*
                 * Same here, this bio may be started from the middle of the
                 * 'bvec' because of bio splitting, so offset from the bvec
                 * must be passed to iov iterator
                 */
                offset = bio->bi_iter.bi_bvec_done;
                bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
        }
        /*
         * Two references: one is dropped below right after submission, the
         * other by lo_rw_aio_complete().
         */
        atomic_set(&cmd->ref, 2);

        iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
        iter.iov_offset = offset;

        cmd->iocb.ki_pos = pos;
        cmd->iocb.ki_filp = file;
        cmd->iocb.ki_complete = lo_rw_aio_complete;
        cmd->iocb.ki_flags = IOCB_DIRECT;
        cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);

        if (rw == ITER_SOURCE)
                ret = file->f_op->write_iter(&cmd->iocb, &iter);
        else
                ret = file->f_op->read_iter(&cmd->iocb, &iter);

        /* Drop the submission reference. */
        lo_rw_aio_do_completion(cmd);

        /*
         * Anything but -EIOCBQUEUED is treated as a synchronous result, so
         * the completion handler is invoked from here with that result.
         */
        if (ret != -EIOCBQUEUED)
                lo_rw_aio_complete(&cmd->iocb, ret);
        return 0;
}

/*
 * Dispatch one request to the handler for its operation.
 *
 * The byte position in the backing file is the request's start sector times
 * 512 plus lo->lo_offset. Unknown operations trigger a one-time warning and
 * return -EIO.
 */
static int do_req_filebacked(struct loop_device *lo, struct request *rq)
{
        struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
        loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
        int falloc_mode;

        /*
         * lo_write_simple() and lo_read_simple() could in principle be
         * replaced by a submit-style function such as lo_rw_aio(). The
         * blocker is that lo_read_simple() has to call flush_dcache_page()
         * after the kernel wrote each page, which is awkward when all
         * segments of the request are submitted at once. Direct read I/O
         * does not need flush_dcache_page().
         */
        switch (req_op(rq)) {
        case REQ_OP_FLUSH:
                return lo_req_flush(lo, rq);
        case REQ_OP_WRITE_ZEROES:
                /*
                 * Zero the range in place when the caller forbids
                 * deallocation; punch a hole otherwise.
                 */
                if (rq->cmd_flags & REQ_NOUNMAP)
                        falloc_mode = FALLOC_FL_ZERO_RANGE;
                else
                        falloc_mode = FALLOC_FL_PUNCH_HOLE;
                return lo_fallocate(lo, rq, pos, falloc_mode);
        case REQ_OP_DISCARD:
                return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
        case REQ_OP_WRITE:
                if (!cmd->use_aio)
                        return lo_write_simple(lo, rq, pos);
                return lo_rw_aio(lo, cmd, pos, ITER_SOURCE);
        case REQ_OP_READ:
                if (!cmd->use_aio)
                        return lo_read_simple(lo, rq, pos);
                return lo_rw_aio(lo, cmd, pos, ITER_DEST);
        default:
                WARN_ON_ONCE(1);
                return -EIO;
        }
}

/*
 * Re-evaluate direct I/O for @lo: request it when it is already in use or
 * when the backing file was opened with O_DIRECT.
 */
static inline void loop_update_dio(struct loop_device *lo)
{
        __loop_update_dio(lo, lo->use_dio ||
                              (lo->lo_backing_file->f_flags & O_DIRECT));
}

/*
 * Rescan the partitions of the loop disk under the disk's open_mutex and log
 * a warning if the scan fails.
 */
static void loop_reread_partitions(struct loop_device *lo)
{
        int err;

        mutex_lock(&lo->lo_disk->open_mutex);
        err = bdev_disk_changed(lo->lo_disk, false);
        mutex_unlock(&lo->lo_disk->open_mutex);

        if (!err)
                return;

        pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
                __func__, lo->lo_number, lo->lo_file_name, err);
}

/*
 * Return non-zero when @file refers to a block device whose major number is
 * LOOP_MAJOR, 0 otherwise (including when the mapping has no host inode).
 */
static inline int is_loop_device(struct file *file)
{
        struct inode *inode = file->f_mapping->host;

        if (!inode)
                return 0;
        if (!S_ISBLK(inode->i_mode))
                return 0;
        return imajor(inode) == LOOP_MAJOR;
}

/*
 * Check that @file may be used as the backing file of the loop device @bdev.
 *
 * While @file (or the file backing it, and so on) is itself a loop device,
 * the chain of backing files is followed. Returns -EBADF if the chain leads
 * back to @bdev, -EINVAL if a loop device in the chain is not bound or if
 * @file is neither a regular file nor a block device, and 0 otherwise.
 *
 * loop_validate_mutex must be held whenever @file is a loop device.
 */
static int loop_validate_file(struct file *file, struct block_device *bdev)
{
        struct inode        *inode = file->f_mapping->host;
        struct file        *f = file;

        /* Avoid recursion */
        while (is_loop_device(f)) {
                struct loop_device *l;

                lockdep_assert_held(&loop_validate_mutex);
                /* The chain reached the device being configured: that would be a cycle. */
                if (f->f_mapping->host->i_rdev == bdev->bd_dev)
                        return -EBADF;

                l = I_BDEV(f->f_mapping->host)->bd_disk->private_data;
                if (l->lo_state != Lo_bound)
                        return -EINVAL;
                /* Order wrt setting lo->lo_backing_file in loop_configure(). */
                rmb();
                f = l->lo_backing_file;
        }
        /* The type check applies to the file that was passed in, not the end of the chain. */
        if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
                return -EINVAL;
        return 0;
}

/*
 * loop_change_fd switches the backing store of a loopback device to
 * a new file. This is useful for operating system installers to free up
 * the original file and in High Availability environments to switch to
 * an alternative location for the content in case of server meltdown.
 * This can only work if the loop device is used read-only, and if the
 * new backing store is the same size and type as the old backing store.
 *
 * @arg is the file descriptor of the new backing file.
 *
 * Returns 0 on success, -EBADF if @arg is not an open file, -ENXIO if the
 * device is not bound, -EINVAL if the device is not read-only or the sizes
 * differ, or the error from loop_global_lock_killable() /
 * loop_validate_file().
 */
static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
                          unsigned int arg)
{
        struct file *file = fget(arg);
        struct file *old_file;
        int error;
        bool partscan;
        bool is_loop;

        if (!file)
                return -EBADF;

        /* suppress uevents while reconfiguring the device */
        dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1);

        /* The global validate lock is only needed when the new file is a loop device. */
        is_loop = is_loop_device(file);
        error = loop_global_lock_killable(lo, is_loop);
        if (error)
                goto out_putf;
        error = -ENXIO;
        if (lo->lo_state != Lo_bound)
                goto out_err;

        /* the loop device has to be read-only */
        error = -EINVAL;
        if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
                goto out_err;

        error = loop_validate_file(file, bdev);
        if (error)
                goto out_err;

        old_file = lo->lo_backing_file;

        error = -EINVAL;

        /* size of the new backing store needs to be the same */
        if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
                goto out_err;

        /* and ... switch */
        disk_force_media_change(lo->lo_disk);
        blk_mq_freeze_queue(lo->lo_queue);
        /* Give the old mapping its original gfp mask back ... */
        mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
        lo->lo_backing_file = file;
        /* ... and strip __GFP_IO/__GFP_FS from the new one, remembering the original. */
        lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping);
        mapping_set_gfp_mask(file->f_mapping,
                             lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
        loop_update_dio(lo);
        blk_mq_unfreeze_queue(lo->lo_queue);
        /* Sample the flag while the lock is still held; it is used after unlocking. */
        partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
        loop_global_unlock(lo, is_loop);

        /*
         * Flush loop_validate_file() before fput(), for l->lo_backing_file
         * might be pointing at old_file which might be the last reference.
         */
        if (!is_loop) {
                mutex_lock(&loop_validate_mutex);
                mutex_unlock(&loop_validate_mutex);
        }
        /*
         * We must drop file reference outside of lo_mutex as dropping
         * the file ref can take open_mutex which creates circular locking
         * dependency.
         */
        fput(old_file);
        if (partscan)
                loop_reread_partitions(lo);

        error = 0;
done:
        /* enable and uncork uevent now that we are done */
        dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0);
        return error;

        /* Error paths: the new file was never installed, so drop its reference. */
out_err:
        loop_global_unlock(lo, is_loop);
out_putf:
        fput(file);
        goto done;
}

/* loop sysfs attributes */

/*
 * Common sysfs "show" helper: look up the loop_device stored in the
 * private_data of the gendisk behind @dev and let @callback format it into
 * @page. Returns whatever @callback returns.
 */
static ssize_t loop_attr_show(struct device *dev, char *page,
                              ssize_t (*callback)(struct loop_device *, char *))
{
        struct gendisk *gd = dev_to_disk(dev);

        return callback(gd->private_data, page);
}

/*
 * LOOP_ATTR_RO(name) expands to:
 *  - a forward declaration of loop_attr_<name>_show(),
 *  - a device "show" wrapper loop_attr_do_show_<name>() that dispatches to
 *    it through loop_attr_show(), and
 *  - a device_attribute loop_attr_<name> with mode 0444 and no store method.
 */
#define LOOP_ATTR_RO(_name)                                                \
static ssize_t loop_attr_##_name##_show(struct loop_device *, char *);        \
static ssize_t loop_attr_do_show_##_name(struct device *d,                \
                                struct device_attribute *attr, char *b)        \
{                                                                        \
        return loop_attr_show(d, b, loop_attr_##_name##_show);                \
}                                                                        \
static struct device_attribute loop_attr_##_name =                        \
        __ATTR(_name, 0444, loop_attr_do_show_##_name, NULL);

/*
 * sysfs show: path of the backing file followed by a newline.
 *
 * The path is resolved under lo_lock. If there is no backing file the result
 * is PTR_ERR(NULL); if file_path() fails its error is returned.
 */
static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
{
        char *path = NULL;
        ssize_t len;

        spin_lock_irq(&lo->lo_lock);
        if (lo->lo_backing_file)
                path = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1);
        spin_unlock_irq(&lo->lo_lock);

        if (IS_ERR_OR_NULL(path))
                return PTR_ERR(path);

        /* The path is moved to the start of buf, then terminated with "\n\0". */
        len = strlen(path);
        memmove(buf, path, len);
        buf[len++] = '\n';
        buf[len] = 0;

        return len;
}

/* sysfs show: lo_offset as an unsigned decimal number followed by a newline. */
static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
{
        unsigned long long offset = (unsigned long long)lo->lo_offset;

        return sysfs_emit(buf, "%llu\n", offset);
}

/* sysfs show: lo_sizelimit as an unsigned decimal number followed by a newline. */
static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
{
        unsigned long long limit = (unsigned long long)lo->lo_sizelimit;

        return sysfs_emit(buf, "%llu\n", limit);
}

/* sysfs show: "1" if LO_FLAGS_AUTOCLEAR is set in lo_flags, "0" otherwise. */
static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
{
        if (lo->lo_flags & LO_FLAGS_AUTOCLEAR)
                return sysfs_emit(buf, "%s\n", "1");

        return sysfs_emit(buf, "%s\n", "0");
}

/* sysfs show: "1" if LO_FLAGS_PARTSCAN is set in lo_flags, "0" otherwise. */
static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf)
{
        if (lo->lo_flags & LO_FLAGS_PARTSCAN)
                return sysfs_emit(buf, "%s\n", "1");

        return sysfs_emit(buf, "%s\n", "0");
}

/* sysfs show: "1" if LO_FLAGS_DIRECT_IO is set in lo_flags, "0" otherwise. */
static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf)
{
        if (lo->lo_flags & LO_FLAGS_DIRECT_IO)
                return sysfs_emit(buf, "%s\n", "1");

        return sysfs_emit(buf, "%s\n", "0");
}

/*
 * Instantiate one read-only attribute per loop_attr_<name>_show() function
 * defined above; each line defines loop_attr_<name>.
 */
LOOP_ATTR_RO(backing_file);
LOOP_ATTR_RO(offset);
LOOP_ATTR_RO(sizelimit);
LOOP_ATTR_RO(autoclear);
LOOP_ATTR_RO(partscan);
LOOP_ATTR_RO(dio);

/* NULL-terminated list of the attributes placed in loop_attribute_group. */
static struct attribute *loop_attrs[] = {
        &loop_attr_backing_file.attr,
        &loop_attr_offset.attr,
        &loop_attr_sizelimit.attr,
        &loop_attr_autoclear.attr,
        &loop_attr_partscan.attr,
        &loop_attr_dio.attr,
        NULL,
};

/*
 * Attribute group named "loop"; created on the disk's device kobject by
 * loop_sysfs_init() and removed by loop_sysfs_exit().
 */
static struct attribute_group loop_attribute_group = {
        .name = "loop",
        .attrs= loop_attrs,
};

/*
 * Create the "loop" attribute group on the disk's device kobject and record
 * in lo->sysfs_inited whether that succeeded (sysfs_create_group() returned 0).
 */
static void loop_sysfs_init(struct loop_device *lo)
{
        int err = sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
                                     &loop_attribute_group);

        lo->sysfs_inited = !err;
}

/*
 * Remove the "loop" attribute group, but only if loop_sysfs_init() recorded
 * that it had been created.
 */
static void loop_sysfs_exit(struct loop_device *lo)
{
        if (!lo->sysfs_inited)
                return;

        sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
                           &loop_attribute_group);
}

/*
 * Fill the discard and write-zeroes limits in @lim from what the backing
 * file of @lo supports.
 *
 * Both max_hw_discard_sectors and max_write_zeroes_sectors receive the same
 * value; discard_granularity is only non-zero when that value is non-zero.
 */
static void loop_config_discard(struct loop_device *lo,
                struct queue_limits *lim)
{
        struct file *file = lo->lo_backing_file;
        struct inode *inode = file->f_mapping->host;
        u32 granularity = 0, max_discard_sectors = 0;
        struct kstatfs sbuf;

        if (S_ISBLK(inode->i_mode)) {
                /*
                 * Backing block device: mirror its zeroing capability. The
                 * write-zeroes limit is used for discards too, so that
                 * discarded regions read back as zero, as they do for
                 * file-backed loop devices.
                 */
                struct request_queue *backingq = bdev_get_queue(I_BDEV(inode));

                max_discard_sectors = backingq->limits.max_write_zeroes_sectors;
                granularity = bdev_discard_granularity(I_BDEV(inode));
                if (!granularity)
                        granularity = queue_physical_block_size(backingq);
        } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) {
                /*
                 * Backing file with fallocate: hole punching reclaims the
                 * space used by the image, a.k.a. discard.
                 */
                max_discard_sectors = UINT_MAX >> 9;
                granularity = sbuf.f_bsize;
        }

        lim->max_hw_discard_sectors = max_discard_sectors;
        lim->max_write_zeroes_sectors = max_discard_sectors;
        lim->discard_granularity = max_discard_sectors ? granularity : 0;
}

/*
 * Per-cgroup worker of a loop device; created on demand by loop_queue_work()
 * and freed by loop_free_idle_workers().
 */
struct loop_worker {
        /* Node in lo->worker_tree, ordered by blkcg_css pointer value. */
        struct rb_node rb_node;
        /* Work item (loop_workfn) queued on lo->workqueue. */
        struct work_struct work;
        /* Commands queued for this worker. */
        struct list_head cmd_list;
        /* Entry in lo->idle_worker_list; empty while the worker is not on that list. */
        struct list_head idle_list;
        /* Owning loop device. */
        struct loop_device *lo;
        /* cgroup this worker serves; a reference is held from creation until the worker is freed. */
        struct cgroup_subsys_state *blkcg_css;
        /* jiffies value compared against LOOP_IDLE_WORKER_TIMEOUT when reaping. */
        unsigned long last_ran_at;
};

static void loop_workfn(struct work_struct *work);

/*
 * Decide whether a command with the given blkcg css goes to the root worker:
 * always when @css is NULL, and with CONFIG_BLK_CGROUP also when it is the
 * root blkcg css.
 */
static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
{
#ifdef CONFIG_BLK_CGROUP
        if (css == blkcg_root_css)
                return 1;
#endif
        return !css;
}

/*
 * Queue @cmd for execution on @lo's workqueue.
 *
 * Commands accepted by queue_on_root_worker() go to the device-wide root
 * work item. Other commands go to the per-cgroup worker matching
 * cmd->blkcg_css, which is looked up in (and, if missing, inserted into)
 * lo->worker_tree. If a new worker cannot be allocated the command falls
 * back to the root work item. Everything happens under lo_work_lock.
 */
static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
{
        struct rb_node **node, *parent = NULL;
        struct loop_worker *cur_worker, *worker = NULL;
        struct work_struct *work;
        struct list_head *cmd_list;

        spin_lock_irq(&lo->lo_work_lock);

        if (queue_on_root_worker(cmd->blkcg_css))
                goto queue_work;

        node = &lo->worker_tree.rb_node;

        /*
         * Search the tree by css pointer value; on a miss, 'parent' and
         * 'node' are left at the insertion point for rb_link_node() below.
         */
        while (*node) {
                parent = *node;
                cur_worker = container_of(*node, struct loop_worker, rb_node);
                if (cur_worker->blkcg_css == cmd->blkcg_css) {
                        worker = cur_worker;
                        break;
                } else if ((long)cur_worker->blkcg_css < (long)cmd->blkcg_css) {
                        node = &(*node)->rb_left;
                } else {
                        node = &(*node)->rb_right;
                }
        }
        if (worker)
                goto queue_work;

        /* Non-sleeping allocation: the spinlock is held with interrupts off. */
        worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN);
        /*
         * In the event we cannot allocate a worker, just queue on the
         * rootcg worker and issue the I/O as the rootcg
         */
        if (!worker) {
                cmd->blkcg_css = NULL;
                if (cmd->memcg_css)
                        css_put(cmd->memcg_css);
                cmd->memcg_css = NULL;
                goto queue_work;
        }

        worker->blkcg_css = cmd->blkcg_css;
        /* The worker keeps its own css reference; dropped in loop_free_idle_workers(). */
        css_get(worker->blkcg_css);
        INIT_WORK(&worker->work, loop_workfn);
        INIT_LIST_HEAD(&worker->cmd_list);
        INIT_LIST_HEAD(&worker->idle_list);
        worker->lo = lo;
        rb_link_node(&worker->rb_node, parent, node);
        rb_insert_color(&worker->rb_node, &lo->worker_tree);
queue_work:
        if (worker) {
                /*
                 * We need to remove from the idle list here while
                 * holding the lock so that the idle timer doesn't
                 * free the worker
                 */
                if (!list_empty(&worker->idle_list))
                        list_del_init(&worker->idle_list);
                work = &worker->work;
                cmd_list = &worker->cmd_list;
        } else {
                /* No per-cgroup worker: use the device-wide root work item. */
                work = &lo->rootcg_work;
                cmd_list = &lo->rootcg_cmd_list;
        }
        list_add_tail(&cmd->list_entry, cmd_list);
        queue_work(lo->workqueue, work);
        spin_unlock_irq(&lo->lo_work_lock);
}

/*
 * Ask the idle-worker timer to fire no later than LOOP_IDLE_WORKER_TIMEOUT
 * from now (timer_reduce() is given that expiry).
 */
static void loop_set_timer(struct loop_device *lo)
{
        unsigned long expires = jiffies + LOOP_IDLE_WORKER_TIMEOUT;

        timer_reduce(&lo->timer, expires);
}

/*
 * Free workers on lo->idle_worker_list.
 *
 * With @delete_all every idle worker is freed. Otherwise the walk stops at
 * the first worker whose last_ran_at + LOOP_IDLE_WORKER_TIMEOUT is still in
 * the future. If idle workers remain afterwards the timer is re-armed.
 */
static void loop_free_idle_workers(struct loop_device *lo, bool delete_all)
{
        struct loop_worker *worker, *next;

        spin_lock_irq(&lo->lo_work_lock);
        list_for_each_entry_safe(worker, next, &lo->idle_worker_list,
                                 idle_list) {
                if (delete_all ||
                    !time_is_after_jiffies(worker->last_ran_at +
                                           LOOP_IDLE_WORKER_TIMEOUT)) {
                        /* Unlink from both containers, then drop the css ref. */
                        list_del(&worker->idle_list);
                        rb_erase(&worker->rb_node, &lo->worker_tree);
                        css_put(worker->blkcg_css);
                        kfree(worker);
                        continue;
                }
                break;
        }
        if (!list_empty(&lo->idle_worker_list))
                loop_set_timer(lo);
        spin_unlock_irq(&lo->lo_work_lock);
}

/*
 * Timer callback: reap the idle workers of the loop_device embedding @timer
 * that have exceeded the idle timeout.
 */
static void loop_free_idle_workers_timer(struct timer_list *timer)
{
        struct loop_device *lo = container_of(timer, struct loop_device, timer);

        loop_free_idle_workers(lo, false);
}

/*
 * Set or clear QUEUE_FLAG_NONROT on the loop queue to match the block device
 * of the filesystem holding the backing file. When the superblock has no
 * s_bdev (e.g. tmpfs) the queue is marked non-rotational.
 */
static void loop_update_rotational(struct loop_device *lo)
{
        struct block_device *backing_bdev =
                lo->lo_backing_file->f_mapping->host->i_sb->s_bdev;
        struct request_queue *q = lo->lo_queue;

        if (backing_bdev && !bdev_nonrot(backing_bdev))
                blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
        else
                blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
}

/**
 * loop_set_status_from_info - apply a loop_info64 to a loop device
 * @lo: struct loop_device to configure
 * @info: struct loop_info64 holding the requested configuration
 *
 * Copies offset, size limit, file name (always NUL-terminated) and flags from
 * @info into @lo. Nothing is modified when an error is returned.
 *
 * Return: 0 on success; -EINVAL if the key size exceeds LO_KEY_SIZE or the
 * encryption type is anything but LO_CRYPT_NONE; -EOVERFLOW if the offset or
 * size limit exceeds LLONG_MAX.
 */
static int
loop_set_status_from_info(struct loop_device *lo,
                          const struct loop_info64 *info)
{
        if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
                return -EINVAL;

        /* Only "no encryption" is accepted; two removed types get a warning. */
        if (info->lo_encrypt_type == LO_CRYPT_XOR) {
                pr_warn("support for the xor transformation has been removed.\n");
                return -EINVAL;
        }
        if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) {
                pr_warn("support for cryptoloop has been removed.  Use dm-crypt instead.\n");
                return -EINVAL;
        }
        if (info->lo_encrypt_type != LO_CRYPT_NONE)
                return -EINVAL;

        /* Reject values that would overflow the signed loff_t fields. */
        if (info->lo_offset > LLONG_MAX || info->lo_sizelimit > LLONG_MAX)
                return -EOVERFLOW;

        lo->lo_flags = info->lo_flags;
        lo->lo_sizelimit = info->lo_sizelimit;
        lo->lo_offset = info->lo_offset;

        memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
        lo->lo_file_name[LO_NAME_SIZE-1] = 0;

        return 0;
}

static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize,
                bool update_discard_settings)
{
        struct queue_limits lim;

        lim = queue_limits_start_update(lo->lo_queue);
        lim.logical_block_size = bsize;
        lim.physical_block_size = bsize;
        lim.io_min = bsize;
        if (update_discard_settings)
                loop_config_discard(lo, &lim);
        return queue_limits_commit_update(lo->lo_queue, &lim);
}

/*
 * Bind the file referenced by config->fd to the loop device @lo.
 *
 * @lo: loop device to configure; must be in state Lo_unbound
 * @mode: open mode of the caller's handle on @bdev
 * @bdev: block device of the loop disk the ioctl was issued on
 * @config: requested configuration (fd, block size, loop_info64)
 *
 * Takes a reference on the backing file and on the module; both are kept
 * on success and dropped again on every error path.
 *
 * Returns 0 on success, -EBADF for an invalid fd, -EBUSY if the device is
 * not unbound, -EINVAL for unsupported flags, -ENOMEM if the workqueue
 * cannot be allocated, or the error reported by one of the helpers.
 */
static int loop_configure(struct loop_device *lo, blk_mode_t mode,
                          struct block_device *bdev,
                          const struct loop_config *config)
{
        struct file *file = fget(config->fd);
        struct inode *inode;
        struct address_space *mapping;
        int error;
        loff_t size;
        bool partscan;
        /*
         * Wide enough for every value config->block_size can hold; a
         * 16-bit variable would truncate a block size of 65536 to 0.
         */
        unsigned int bsize;
        bool is_loop;

        if (!file)
                return -EBADF;
        is_loop = is_loop_device(file);

        /* This is safe, since we have a reference from open(). */
        __module_get(THIS_MODULE);

        /*
         * If we don't hold exclusive handle for the device, upgrade to it
         * here to avoid changing device under exclusive owner.
         */
        if (!(mode & BLK_OPEN_EXCL)) {
                error = bd_prepare_to_claim(bdev, loop_configure, NULL);
                if (error)
                        goto out_putf;
        }

        error = loop_global_lock_killable(lo, is_loop);
        if (error)
                goto out_bdev;

        error = -EBUSY;
        if (lo->lo_state != Lo_unbound)
                goto out_unlock;

        error = loop_validate_file(file, bdev);
        if (error)
                goto out_unlock;

        mapping = file->f_mapping;
        inode = mapping->host;

        /* Reject flags that LOOP_CONFIGURE is not allowed to set. */
        if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) {
                error = -EINVAL;
                goto out_unlock;
        }

        /* A block size of 0 means "pick a default" further down. */
        if (config->block_size) {
                error = blk_validate_block_size(config->block_size);
                if (error)
                        goto out_unlock;
        }

        error = loop_set_status_from_info(lo, &config->info);
        if (error)
                goto out_unlock;

        /* Force read-only if either side cannot be written. */
        if (!(file->f_mode & FMODE_WRITE) || !(mode & BLK_OPEN_WRITE) ||
            !file->f_op->write_iter)
                lo->lo_flags |= LO_FLAGS_READ_ONLY;

        /* The workqueue is created on first use and then kept. */
        if (!lo->workqueue) {
                lo->workqueue = alloc_workqueue("loop%d",
                                                WQ_UNBOUND | WQ_FREEZABLE,
                                                0, lo->lo_number);
                if (!lo->workqueue) {
                        error = -ENOMEM;
                        goto out_unlock;
                }
        }

        /* suppress uevents while reconfiguring the device */
        dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1);

        disk_force_media_change(lo->lo_disk);
        set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0);

        lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
        lo->lo_device = bdev;
        lo->lo_backing_file = file;
        /* Remember the old mask; __loop_clr_fd() restores it on unbind. */
        lo->old_gfp_mask = mapping_gfp_mask(mapping);
        mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));

        if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
                blk_queue_write_cache(lo->lo_queue, true, false);

        if (config->block_size)
                bsize = config->block_size;
        else if ((lo->lo_backing_file->f_flags & O_DIRECT) && inode->i_sb->s_bdev)
                /* In case of direct I/O, match underlying block size */
                bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
        else
                bsize = 512;

        /*
         * NOTE(review): if this fails, out_unlock drops the file reference
         * while lo->lo_backing_file still points at it, uevents stay
         * suppressed and the mapping's gfp mask is not restored. The
         * WARN_ON_ONCE suggests this is not expected to happen - confirm
         * whether the partial setup should be unwound here.
         */
        error = loop_reconfigure_limits(lo, bsize, true);
        if (WARN_ON_ONCE(error))
                goto out_unlock;

        loop_update_rotational(lo);
        loop_update_dio(lo);
        loop_sysfs_init(lo);

        size = get_loop_size(lo, file);
        loop_set_size(lo, size);

        /* Order wrt reading lo_state in loop_validate_file(). */
        wmb();

        lo->lo_state = Lo_bound;
        if (part_shift)
                lo->lo_flags |= LO_FLAGS_PARTSCAN;
        partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
        if (partscan)
                clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);

        /* enable and uncork uevent now that we are done */
        dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0);

        /* Partitions are re-read only after the global lock is dropped. */
        loop_global_unlock(lo, is_loop);
        if (partscan)
                loop_reread_partitions(lo);

        if (!(mode & BLK_OPEN_EXCL))
                bd_abort_claiming(bdev, loop_configure);

        return 0;

out_unlock:
        loop_global_unlock(lo, is_loop);
out_bdev:
        if (!(mode & BLK_OPEN_EXCL))
                bd_abort_claiming(bdev, loop_configure);
out_putf:
        fput(file);
        /* This is safe: open() is still holding a reference. */
        module_put(THIS_MODULE);
        return error;
}

/*
 * Detach the backing file from @lo and return the device to Lo_unbound.
 *
 * @lo: loop device to tear down; callers set lo_state to Lo_rundown first
 * @release: true when called from lo_release(), false when called from
 *           loop_clr_fd() on a device that is still open
 *
 * Resets offset, size limit, file name and flags, restores the gfp mask
 * saved in lo->old_gfp_mask, drops the module reference and finally puts
 * the backing file.
 */
static void __loop_clr_fd(struct loop_device *lo, bool release)
{
        struct file *filp;
        gfp_t gfp = lo->old_gfp_mask;

        /* Turn the write cache off again if it is currently enabled. */
        if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
                blk_queue_write_cache(lo->lo_queue, false, false);

        /*
         * Freeze the request queue when unbinding on a live file descriptor and
         * thus an open device.  When called from ->release we are guaranteed
         * that there is no I/O in progress already.
         */
        if (!release)
                blk_mq_freeze_queue(lo->lo_queue);

        /* Take over the backing file pointer under lo_lock. */
        spin_lock_irq(&lo->lo_lock);
        filp = lo->lo_backing_file;
        lo->lo_backing_file = NULL;
        spin_unlock_irq(&lo->lo_lock);

        lo->lo_device = NULL;
        lo->lo_offset = 0;
        lo->lo_sizelimit = 0;
        memset(lo->lo_file_name, 0, LO_NAME_SIZE);
        /* Back to a 512 byte block size; the return value is ignored here. */
        loop_reconfigure_limits(lo, 512, false);
        invalidate_disk(lo->lo_disk);
        loop_sysfs_exit(lo);
        /* let user-space know about this change */
        kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
        /* Restore the gfp mask that was saved when the file was bound. */
        mapping_set_gfp_mask(filp->f_mapping, gfp);
        /* This is safe: open() is still holding a reference. */
        module_put(THIS_MODULE);
        if (!release)
                blk_mq_unfreeze_queue(lo->lo_queue);

        disk_force_media_change(lo->lo_disk);

        if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
                int err;

                /*
                 * open_mutex has been held already in release path, so don't
                 * acquire it if this function is called in such case.
                 *
                 * If the reread partition isn't from release path, lo_refcnt
                 * must be at least one and it can only become zero when the
                 * current holder is released.
                 */
                if (!release)
                        mutex_lock(&lo->lo_disk->open_mutex);
                err = bdev_disk_changed(lo->lo_disk, false);
                if (!release)
                        mutex_unlock(&lo->lo_disk->open_mutex);
                if (err)
                        pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
                                __func__, lo->lo_number, err);
                /* Device is gone, no point in returning error */
        }

        /*
         * lo->lo_state is set to Lo_unbound here after above partscan has
         * finished. There cannot be anybody else entering __loop_clr_fd() as
         * Lo_rundown state protects us from all the other places trying to
         * change the 'lo' device.
         */
        lo->lo_flags = 0;
        if (!part_shift)
                set_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
        mutex_lock(&lo->lo_mutex);
        lo->lo_state = Lo_unbound;
        mutex_unlock(&lo->lo_mutex);

        /*
         * Need not hold lo_mutex to fput backing file. Calling fput holding
         * lo_mutex triggers a circular lock dependency possibility warning as
         * fput can take open_mutex which is usually taken before lo_mutex.
         */
        fput(filp);
}

static int loop_clr_fd(struct loop_device *lo)
{
        int err;

        /*
         * Since lo_ioctl() is called without locks held, it is possible that
         * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel.
         *
         * Therefore, use global lock when setting Lo_rundown state in order to
         * make sure that loop_validate_file() will fail if the "struct file"
         * which loop_configure()/loop_change_fd() found via fget() was this
         * loop device.
         */
        err = loop_global_lock_killable(lo, true);
        if (err)
                return err;
        if (lo->lo_state != Lo_bound) {
                loop_global_unlock(lo, true);
                return -ENXIO;
        }
        /*
         * If we've explicitly asked to tear down the loop device,
         * and it has an elevated reference count, set it for auto-teardown when
         * the last reference goes away. This stops $!~#$@ udev from
         * preventing teardown because it decided that it needs to run blkid on
         * the loopback device whenever they appear. xfstests is notorious for
         * failing tests because blkid via udev races with a losetup
         * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
         * command to fail with EBUSY.
         */
        if (disk_openers(lo->lo_disk) > 1) {
                lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
                loop_global_unlock(lo, true);
                return 0;
        }
        lo->lo_state = Lo_rundown;
        loop_global_unlock(lo, true);

        __loop_clr_fd(lo, false);
        return 0;
}

/*
 * Apply a new loop_info64 to a bound loop device.
 *
 * @lo: loop device to update; must be in state Lo_bound
 * @info: requested settings
 *
 * Runs under lo_mutex with the queue frozen while the settings change.
 * Flags outside LOOP_SET_STATUS_SETTABLE_FLAGS keep their previous value,
 * and flags outside LOOP_SET_STATUS_CLEARABLE_FLAGS cannot be cleared.
 * If LO_FLAGS_PARTSCAN gets newly set, partitions are re-read after the
 * mutex has been dropped.
 *
 * Returns 0 on success, -ENXIO if the device is not bound, or the error
 * from mutex_lock_killable() / loop_set_status_from_info().
 */
static int
loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
{
        int err;
        int prev_lo_flags;
        bool partscan = false;
        bool size_changed = false;

        err = mutex_lock_killable(&lo->lo_mutex);
        if (err)
                return err;
        if (lo->lo_state != Lo_bound) {
                err = -ENXIO;
                goto out_unlock;
        }

        /* Flush and drop cached data before the visible window moves. */
        if (lo->lo_offset != info->lo_offset ||
            lo->lo_sizelimit != info->lo_sizelimit) {
                size_changed = true;
                sync_blockdev(lo->lo_device);
                invalidate_bdev(lo->lo_device);
        }

        /* I/O need to be drained during transfer transition */
        blk_mq_freeze_queue(lo->lo_queue);

        /* Saved so that non-settable/non-clearable bits can be restored. */
        prev_lo_flags = lo->lo_flags;

        err = loop_set_status_from_info(lo, info);
        if (err)
                goto out_unfreeze;

        /* Mask out flags that can't be set using LOOP_SET_STATUS. */
        lo->lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS;
        /* For those flags, use the previous values instead */
        lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_SETTABLE_FLAGS;
        /* For flags that can't be cleared, use previous values too */
        lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_CLEARABLE_FLAGS;

        if (size_changed) {
                loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
                                           lo->lo_backing_file);
                loop_set_size(lo, new_size);
        }

        /* update dio if lo_offset or transfer is changed */
        __loop_update_dio(lo, lo->use_dio);

out_unfreeze:
        blk_mq_unfreeze_queue(lo->lo_queue);

        /* Only a 0 -> 1 transition of LO_FLAGS_PARTSCAN triggers a rescan. */
        if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) &&
             !(prev_lo_flags & LO_FLAGS_PARTSCAN)) {
                clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
                partscan = true;
        }
out_unlock:
        mutex_unlock(&lo->lo_mutex);
        /* Re-read partitions without holding lo_mutex. */
        if (partscan)
                loop_reread_partitions(lo);

        return err;
}

static int
loop_get_status(struct loop_device *lo, struct loop_info64 *info)
{
        struct path path;
        struct kstat stat;
        int ret;

        ret = mutex_lock_killable(&lo->lo_mutex);
        if (ret)
                return ret;
        if (lo->lo_state != Lo_bound) {
                mutex_unlock(&lo->lo_mutex);
                return -ENXIO;
        }

        memset(info, 0, sizeof(*info));
        info->lo_number = lo->lo_number;
        info->lo_offset = lo->lo_offset;
        info->lo_sizelimit = lo->lo_sizelimit;
        info->lo_flags = lo->lo_flags;
        memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);

        /* Drop lo_mutex while we call into the filesystem. */
        path = lo->lo_backing_file->f_path;
        path_get(&path);
        mutex_unlock(&lo->lo_mutex);
        ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
        if (!ret) {
                info->lo_device = huge_encode_dev(stat.dev);
                info->lo_inode = stat.ino;
                info->lo_rdevice = huge_encode_dev(stat.rdev);
        }
        path_put(&path);
        return ret;
}

/*
 * Widen a legacy struct loop_info into a struct loop_info64.
 *
 * Everything not copied below, including lo_sizelimit, is left at the
 * zero written by the initial memset().
 */
static void
loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
{
        memset(info64, 0, sizeof(*info64));

        memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
        info64->lo_flags = info->lo_flags;
        info64->lo_offset = info->lo_offset;
        info64->lo_rdevice = info->lo_rdevice;
        info64->lo_inode = info->lo_inode;
        info64->lo_device = info->lo_device;
        info64->lo_number = info->lo_number;
}

static int
loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
{
        memset(info, 0, sizeof(*info));
        info->lo_number = info64->lo_number;
        info->lo_device = info64->lo_device;
        info->lo_inode = info64->lo_inode;
        info->lo_rdevice = info64->lo_rdevice;
        info->lo_offset = info64->lo_offset;
        info->lo_flags = info64->lo_flags;
        memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);

        /* error in case values were truncated */
        if (info->lo_device != info64->lo_device ||
            info->lo_rdevice != info64->lo_rdevice ||
            info->lo_inode != info64->lo_inode ||
            info->lo_offset != info64->lo_offset)
                return -EOVERFLOW;

        return 0;
}

/*
 * LOOP_SET_STATUS: copy a legacy struct loop_info from user space, widen
 * it and hand it to loop_set_status(). Returns -EFAULT if the copy fails.
 */
static int
loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
{
        struct loop_info64 wide;
        struct loop_info legacy;

        if (copy_from_user(&legacy, arg, sizeof(legacy)))
                return -EFAULT;

        loop_info64_from_old(&legacy, &wide);
        return loop_set_status(lo, &wide);
}

/*
 * LOOP_SET_STATUS64: copy a struct loop_info64 from user space and apply
 * it with loop_set_status(). Returns -EFAULT if the copy fails.
 */
static int
loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg)
{
        struct loop_info64 requested;

        if (copy_from_user(&requested, arg, sizeof(requested)))
                return -EFAULT;

        return loop_set_status(lo, &requested);
}

/*
 * LOOP_GET_STATUS: report the device status as a legacy struct loop_info.
 *
 * Returns -EINVAL for a NULL @arg, -EFAULT if the copy to user space
 * fails, or the error from loop_get_status() / loop_info64_to_old().
 */
static int
loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg)
{
        struct loop_info64 wide;
        struct loop_info legacy;
        int err;

        if (!arg)
                return -EINVAL;

        err = loop_get_status(lo, &wide);
        if (err)
                return err;

        err = loop_info64_to_old(&wide, &legacy);
        if (err)
                return err;

        if (copy_to_user(arg, &legacy, sizeof(legacy)))
                return -EFAULT;

        return 0;
}

/*
 * LOOP_GET_STATUS64: report the device status as a struct loop_info64.
 *
 * Returns -EINVAL for a NULL @arg, -EFAULT if the copy to user space
 * fails, or the error from loop_get_status().
 */
static int
loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg)
{
        struct loop_info64 status;
        int err;

        if (!arg)
                return -EINVAL;

        err = loop_get_status(lo, &status);
        if (err)
                return err;

        if (copy_to_user(arg, &status, sizeof(status)))
                return -EFAULT;

        return 0;
}

/*
 * LOOP_SET_CAPACITY: recompute the device size from the backing file.
 * Returns -ENXIO if the device is not bound, 0 otherwise.
 */
static int loop_set_capacity(struct loop_device *lo)
{
        if (unlikely(lo->lo_state != Lo_bound))
                return -ENXIO;

        loop_set_size(lo, get_loop_size(lo, lo->lo_backing_file));
        return 0;
}

static int loop_set_dio(struct loop_device *lo, unsigned long arg)
{
        int error = -ENXIO;
        if (lo->lo_state != Lo_bound)
                goto out;

        __loop_update_dio(lo, !!arg);
        if (lo->use_dio == !!arg)
                return 0;
        error = -EINVAL;
 out:
        return error;
}

/*
 * LOOP_SET_BLOCK_SIZE: switch the logical block size of a bound device.
 *
 * Returns 0 on success or when @arg already is the current block size,
 * -ENXIO if the device is not bound, or the error from
 * blk_validate_block_size() / loop_reconfigure_limits().
 */
static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
{
        int ret;

        if (lo->lo_state != Lo_bound)
                return -ENXIO;

        ret = blk_validate_block_size(arg);
        if (ret)
                return ret;

        /* Nothing to do when the requested size is already in effect. */
        if (arg == lo->lo_queue->limits.logical_block_size)
                return 0;

        /* Write back and drop cached data before the block size changes. */
        sync_blockdev(lo->lo_device);
        invalidate_bdev(lo->lo_device);

        /* Limits are changed with the queue frozen. */
        blk_mq_freeze_queue(lo->lo_queue);
        ret = loop_reconfigure_limits(lo, arg, false);
        loop_update_dio(lo);
        blk_mq_unfreeze_queue(lo->lo_queue);

        return ret;
}

/*
 * Run one of the ioctls that only need lo_mutex held around them.
 *
 * Returns -EINVAL for a command not handled here, otherwise the result
 * of the handler or the error from mutex_lock_killable().
 */
static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
                           unsigned long arg)
{
        int ret;

        ret = mutex_lock_killable(&lo->lo_mutex);
        if (ret)
                return ret;

        if (cmd == LOOP_SET_CAPACITY)
                ret = loop_set_capacity(lo);
        else if (cmd == LOOP_SET_DIRECT_IO)
                ret = loop_set_dio(lo, arg);
        else if (cmd == LOOP_SET_BLOCK_SIZE)
                ret = loop_set_block_size(lo, arg);
        else
                ret = -EINVAL;

        mutex_unlock(&lo->lo_mutex);
        return ret;
}

/*
 * ioctl entry point of the loop block device.
 *
 * The status-changing commands and the lo_simple_ioctl() commands require
 * either a handle opened for writing or CAP_SYS_ADMIN and fail with
 * -EPERM otherwise. Commands not listed here end up in lo_simple_ioctl().
 */
static int lo_ioctl(struct block_device *bdev, blk_mode_t mode,
        unsigned int cmd, unsigned long arg)
{
        struct loop_device *lo = bdev->bd_disk->private_data;
        void __user *uptr = (void __user *) arg;

        switch (cmd) {
        case LOOP_SET_FD: {
                /*
                 * Legacy case - pass in a zeroed out struct loop_config with
                 * only the file descriptor set, which corresponds with the
                 * default parameters we'd have used otherwise.
                 */
                struct loop_config config;

                memset(&config, 0, sizeof(config));
                config.fd = arg;

                return loop_configure(lo, mode, bdev, &config);
        }
        case LOOP_CONFIGURE: {
                struct loop_config config;

                if (copy_from_user(&config, uptr, sizeof(config)))
                        return -EFAULT;

                return loop_configure(lo, mode, bdev, &config);
        }
        case LOOP_CHANGE_FD:
                return loop_change_fd(lo, bdev, arg);
        case LOOP_CLR_FD:
                return loop_clr_fd(lo);
        case LOOP_SET_STATUS:
                if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN))
                        return -EPERM;
                return loop_set_status_old(lo, uptr);
        case LOOP_GET_STATUS:
                return loop_get_status_old(lo, uptr);
        case LOOP_SET_STATUS64:
                if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN))
                        return -EPERM;
                return loop_set_status64(lo, uptr);
        case LOOP_GET_STATUS64:
                return loop_get_status64(lo, uptr);
        case LOOP_SET_CAPACITY:
        case LOOP_SET_DIRECT_IO:
        case LOOP_SET_BLOCK_SIZE:
                if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN))
                        return -EPERM;
                fallthrough;
        default:
                return lo_simple_ioctl(lo, cmd, arg);
        }
}

#ifdef CONFIG_COMPAT
/*
 * 32-bit compatibility layout of the legacy loop info structure, built from
 * compat_* types. It is copied to and from user space as a whole by the
 * LOOP_SET_STATUS / LOOP_GET_STATUS compat handlers, so the field order and
 * types must not change.
 */
struct compat_loop_info {
        compat_int_t        lo_number;      /* ioctl r/o */
        compat_dev_t        lo_device;      /* ioctl r/o */
        compat_ulong_t        lo_inode;       /* ioctl r/o */
        compat_dev_t        lo_rdevice;     /* ioctl r/o */
        compat_int_t        lo_offset;
        compat_int_t        lo_encrypt_type;        /* obsolete, ignored */
        compat_int_t        lo_encrypt_key_size;    /* ioctl w/o */
        compat_int_t        lo_flags;       /* ioctl r/o */
        char                lo_name[LO_NAME_SIZE];
        unsigned char        lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */
        compat_ulong_t        lo_init[2];
        char                reserved[4];
};

/*
 * Read a 32-bit compat_loop_info from user space and widen it into a
 * 64-bit loop info.
 * - noinline keeps the on-stack copy out of the main driver paths
 *
 * Returns 0 on success or -EFAULT if the user copy fails. Fields that are
 * not copied below, including lo_sizelimit, stay zero from the memset().
 */
static noinline int
loop_info64_from_compat(const struct compat_loop_info __user *arg,
                        struct loop_info64 *info64)
{
        struct compat_loop_info compat;

        if (copy_from_user(&compat, arg, sizeof(compat)))
                return -EFAULT;

        memset(info64, 0, sizeof(*info64));
        memcpy(info64->lo_file_name, compat.lo_name, LO_NAME_SIZE);
        info64->lo_flags = compat.lo_flags;
        info64->lo_offset = compat.lo_offset;
        info64->lo_rdevice = compat.lo_rdevice;
        info64->lo_inode = compat.lo_inode;
        info64->lo_device = compat.lo_device;
        info64->lo_number = compat.lo_number;

        return 0;
}

/*
 * Narrow a 64-bit loop info into a 32-bit compat_loop_info and copy it to
 * user space.
 * - noinline keeps the on-stack copy out of the main driver paths
 *
 * Returns 0 on success, -EOVERFLOW if a device number, the inode number or
 * the offset does not fit, or -EFAULT if the user copy fails.
 */
static noinline int
loop_info64_to_compat(const struct loop_info64 *info64,
                      struct compat_loop_info __user *arg)
{
        struct compat_loop_info compat;
        bool truncated;

        memset(&compat, 0, sizeof(compat));
        memcpy(compat.lo_name, info64->lo_file_name, LO_NAME_SIZE);
        compat.lo_flags = info64->lo_flags;
        compat.lo_number = info64->lo_number;
        compat.lo_offset = info64->lo_offset;
        compat.lo_inode = info64->lo_inode;
        compat.lo_rdevice = info64->lo_rdevice;
        compat.lo_device = info64->lo_device;

        /* Read the narrow fields back to detect lost bits. */
        truncated = compat.lo_device != info64->lo_device ||
                    compat.lo_rdevice != info64->lo_rdevice ||
                    compat.lo_inode != info64->lo_inode ||
                    compat.lo_offset != info64->lo_offset;
        if (truncated)
                return -EOVERFLOW;

        return copy_to_user(arg, &compat, sizeof(compat)) ? -EFAULT : 0;
}

/*
 * Compat LOOP_SET_STATUS: widen the 32-bit user structure and apply it
 * with loop_set_status().
 */
static int
loop_set_status_compat(struct loop_device *lo,
                       const struct compat_loop_info __user *arg)
{
        struct loop_info64 wide;
        int err = loop_info64_from_compat(arg, &wide);

        if (err < 0)
                return err;

        return loop_set_status(lo, &wide);
}

/*
 * Compat LOOP_GET_STATUS: fetch the status and return it to user space in
 * the 32-bit layout. Returns -EINVAL for a NULL @arg.
 */
static int
loop_get_status_compat(struct loop_device *lo,
                       struct compat_loop_info __user *arg)
{
        struct loop_info64 wide;
        int ret;

        if (!arg)
                return -EINVAL;

        ret = loop_get_status(lo, &wide);
        if (ret)
                return ret;

        return loop_info64_to_compat(&wide, arg);
}

/*
 * ioctl entry point for 32-bit callers.
 *
 * LOOP_SET_STATUS / LOOP_GET_STATUS use the compat structure layout. The
 * remaining known commands are forwarded to lo_ioctl(), the first group
 * after passing @arg through compat_ptr(). Anything else yields
 * -ENOIOCTLCMD.
 */
static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode,
                           unsigned int cmd, unsigned long arg)
{
        struct loop_device *lo = bdev->bd_disk->private_data;

        switch (cmd) {
        case LOOP_SET_STATUS:
                return loop_set_status_compat(lo,
                             (const struct compat_loop_info __user *)arg);
        case LOOP_GET_STATUS:
                return loop_get_status_compat(lo,
                                     (struct compat_loop_info __user *)arg);
        case LOOP_SET_CAPACITY:
        case LOOP_CLR_FD:
        case LOOP_GET_STATUS64:
        case LOOP_SET_STATUS64:
        case LOOP_CONFIGURE:
                arg = (unsigned long) compat_ptr(arg);
                fallthrough;
        case LOOP_SET_FD:
        case LOOP_CHANGE_FD:
        case LOOP_SET_BLOCK_SIZE:
        case LOOP_SET_DIRECT_IO:
                return lo_ioctl(bdev, mode, cmd, arg);
        default:
                return -ENOIOCTLCMD;
        }
}
#endif

static void lo_release(struct gendisk *disk)
{
        struct loop_device *lo = disk->private_data;

        if (disk_openers(disk) > 0)
                return;

        mutex_lock(&lo->lo_mutex);
        if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) {
                lo->lo_state = Lo_rundown;
                mutex_unlock(&lo->lo_mutex);
                /*
                 * In autoclear mode, stop the loop thread
                 * and remove configuration after last close.
                 */
                __loop_clr_fd(lo, true);
                return;
        }
        mutex_unlock(&lo->lo_mutex);
}

/*
 * ->free_disk handler: release everything owned by the loop device stored
 * in disk->private_data, ending with the loop_device structure itself.
 */
static void lo_free_disk(struct gendisk *disk)
{
        struct loop_device *lo = disk->private_data;

        /* The workqueue only exists if the device was configured once. */
        if (lo->workqueue)
                destroy_workqueue(lo->workqueue);
        loop_free_idle_workers(lo, true);
        timer_shutdown_sync(&lo->timer);
        mutex_destroy(&lo->lo_mutex);
        kfree(lo);
}

/*
 * Block device operations of the loop disk. The compat ioctl handler is
 * only wired up when CONFIG_COMPAT is enabled.
 */
static const struct block_device_operations lo_fops = {
        .owner =        THIS_MODULE,
        .release =        lo_release,
        .ioctl =        lo_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl =        lo_compat_ioctl,
#endif
        .free_disk =        lo_free_disk,
};

/*
 * And now the modules code and kernel interface.
 */

/*
 * If max_loop is specified, create that many devices upfront.
 * This also becomes a hard limit. If max_loop is not specified,
 * the default isn't a hard limit (as before commit 85c50197716c
 * changed the default value from 0 for max_loop=0 reasons), just
 * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
 * init time. Loop devices can be requested on-demand with the
 * /dev/loop-control interface, or be instantiated by accessing
 * a 'dead' device node.
 */
static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT;

#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
/* Set once the max_loop parameter has been parsed successfully. */
static bool max_loop_specified;

/*
 * Setter for the max_loop parameter: parse it as an int and remember that
 * it was given explicitly. Returns 0 on success or the negative error
 * from param_set_int().
 */
static int max_loop_param_set_int(const char *val,
                                  const struct kernel_param *kp)
{
        int err = param_set_int(val, kp);

        if (err < 0)
                return err;

        max_loop_specified = true;
        return 0;
}

/*
 * Parameter ops for max_loop: the custom setter records that the value was
 * given explicitly, reads go through the generic int getter.
 */
static const struct kernel_param_ops max_loop_param_ops = {
        .set = max_loop_param_set_int,
        .get = param_get_int,
};

module_param_cb(max_loop, &max_loop_param_ops, &max_loop, 0444);
MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
#else
module_param(max_loop, int, 0444);
MODULE_PARM_DESC(max_loop, "Initial number of loop devices");
#endif

module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");

/* Queue depth used for the tag set of each loop device. */
static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH;

/*
 * Setter for the hw_queue_depth parameter.
 *
 * Returns 0 on success, the error from kstrtoint() if @s is not a valid
 * integer, or -EINVAL if the value is smaller than 1.
 */
static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p)
{
        int depth;
        int err = kstrtoint(s, 0, &depth);

        if (err < 0)
                return err;
        if (depth < 1)
                return -EINVAL;

        hw_queue_depth = depth;
        return 0;
}

/*
 * Parameter ops for hw_queue_depth: the custom setter rejects values below
 * 1, reads go through the generic int getter.
 */
static const struct kernel_param_ops loop_hw_qdepth_param_ops = {
        .set        = loop_set_hw_queue_depth,
        .get        = param_get_int,
};

device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444);
MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH));

MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

/*
 * ->queue_rq handler: start the request, decide whether it may use aio,
 * record the cgroup state of its first bio and queue it for a worker.
 *
 * Returns BLK_STS_IOERR if the device is not bound, BLK_STS_OK otherwise.
 */
static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
                const struct blk_mq_queue_data *bd)
{
        struct request *req = bd->rq;
        struct loop_device *lo = req->q->queuedata;
        struct loop_cmd *cmd = blk_mq_rq_to_pdu(req);

        blk_mq_start_request(req);

        if (lo->lo_state != Lo_bound)
                return BLK_STS_IOERR;

        /* Flush, discard and write-zeroes never use aio. */
        switch (req_op(req)) {
        case REQ_OP_FLUSH:
        case REQ_OP_DISCARD:
        case REQ_OP_WRITE_ZEROES:
                cmd->use_aio = false;
                break;
        default:
                cmd->use_aio = lo->use_dio;
                break;
        }

        /* always use the first bio's css */
        cmd->blkcg_css = NULL;
        cmd->memcg_css = NULL;
#ifdef CONFIG_BLK_CGROUP
        if (req->bio) {
                cmd->blkcg_css = bio_blkcg_css(req->bio);
#ifdef CONFIG_MEMCG
                if (cmd->blkcg_css) {
                        cmd->memcg_css =
                                cgroup_get_e_css(cmd->blkcg_css->cgroup,
                                                &memory_cgrp_subsys);
                }
#endif
        }
#endif
        loop_queue_work(lo, cmd);

        return BLK_STS_OK;
}

/*
 * Execute one queued loop command on behalf of a worker.
 *
 * Writes to a device flagged LO_FLAGS_READ_ONLY fail with -EIO. Otherwise
 * the request is run through do_req_filebacked() with the blkcg and memcg
 * of the command made active for the duration of the call.
 *
 * The memcg css stored in the command was obtained with cgroup_get_e_css()
 * in loop_queue_rq(); it is put on every path through this function,
 * including the early read-only failure.
 */
static void loop_handle_cmd(struct loop_cmd *cmd)
{
        struct cgroup_subsys_state *cmd_blkcg_css = cmd->blkcg_css;
        struct cgroup_subsys_state *cmd_memcg_css = cmd->memcg_css;
        struct request *rq = blk_mq_rq_from_pdu(cmd);
        const bool write = op_is_write(req_op(rq));
        struct loop_device *lo = rq->q->queuedata;
        int ret = 0;
        struct mem_cgroup *old_memcg = NULL;
        /* Cached now: 'cmd' may already be completed further down. */
        const bool use_aio = cmd->use_aio;

        if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
                /*
                 * Bailing out skips the css_put() below, so drop the
                 * memcg css reference here to avoid leaking it.
                 */
                if (cmd_memcg_css)
                        css_put(cmd_memcg_css);
                ret = -EIO;
                goto failed;
        }

        if (cmd_blkcg_css)
                kthread_associate_blkcg(cmd_blkcg_css);
        if (cmd_memcg_css)
                old_memcg = set_active_memcg(
                        mem_cgroup_from_css(cmd_memcg_css));

        /*
         * do_req_filebacked() may call blk_mq_complete_request() synchronously
         * or asynchronously if using aio. Hence, do not touch 'cmd' after
         * do_req_filebacked() has returned unless we are sure that 'cmd' has
         * not yet been completed.
         */
        ret = do_req_filebacked(lo, rq);

        if (cmd_blkcg_css)
                kthread_associate_blkcg(NULL);

        if (cmd_memcg_css) {
                set_active_memcg(old_memcg);
                css_put(cmd_memcg_css);
        }
 failed:
        /* complete non-aio request */
        if (!use_aio || ret) {
                /* -EOPNOTSUPP is passed on as is, any other error as -EIO. */
                if (ret == -EOPNOTSUPP)
                        cmd->ret = ret;
                else
                        cmd->ret = ret ? -EIO : 0;
                if (likely(!blk_should_fake_timeout(rq->q)))
                        blk_mq_complete_request(rq);
        }
}

static void loop_process_work(struct loop_worker *worker,
                        struct list_head *cmd_list, struct loop_device *lo)
{
        int orig_flags = current->flags;
        struct loop_cmd *cmd;

        current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
        spin_lock_irq(&lo->lo_work_lock);
        while (!list_empty(cmd_list)) {
                cmd = container_of(
                        cmd_list->next, struct loop_cmd, list_entry);
                list_del(cmd_list->next);
                spin_unlock_irq(&lo->lo_work_lock);

                loop_handle_cmd(cmd);
                cond_resched();

                spin_lock_irq(&lo->lo_work_lock);
        }

        /*
         * We only add to the idle list if there are no pending cmds
         * *and* the worker will not run again which ensures that it
         * is safe to free any worker on the idle list
         */
        if (worker && !work_pending(&worker->work)) {
                worker->last_ran_at = jiffies;
                list_add_tail(&worker->idle_list, &lo->idle_worker_list);
                loop_set_timer(lo);
        }
        spin_unlock_irq(&lo->lo_work_lock);
        current->flags = orig_flags;
}

/* Work function of a struct loop_worker: process that worker's commands. */
static void loop_workfn(struct work_struct *work)
{
        struct loop_worker *worker;

        worker = container_of(work, struct loop_worker, work);
        loop_process_work(worker, &worker->cmd_list, worker->lo);
}

/*
 * Work function for lo->rootcg_work: process the commands on
 * lo->rootcg_cmd_list, which has no struct loop_worker attached.
 */
static void loop_rootcg_workfn(struct work_struct *work)
{
        struct loop_device *lo;

        lo = container_of(work, struct loop_device, rootcg_work);
        loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
}

/*
 * blk-mq operations of the loop driver: requests enter through
 * loop_queue_rq() and are completed by lo_complete_rq().
 */
static const struct blk_mq_ops loop_mq_ops = {
        .queue_rq       = loop_queue_rq,
        .complete        = lo_complete_rq,
};

/*
 * Allocate a loop device and register its disk.
 *
 * @i: requested device number; a negative value lets idr_alloc() choose one.
 *
 * Returns the device number on success or a negative errno on failure
 * (-EEXIST when a specifically requested number is already in use).
 */
static int loop_add(int i)
{
        struct queue_limits lim = {
                /*
                 * Random number picked from the historic block max_sectors cap.
                 */
                .max_hw_sectors                = 2560u,
        };
        struct loop_device *lo;
        struct gendisk *disk;
        int err;

        err = -ENOMEM;
        lo = kzalloc(sizeof(*lo), GFP_KERNEL);
        if (!lo)
                goto out;
        lo->worker_tree = RB_ROOT;
        INIT_LIST_HEAD(&lo->idle_worker_list);
        timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE);
        lo->lo_state = Lo_unbound;

        err = mutex_lock_killable(&loop_ctl_mutex);
        if (err)
                goto out_free_dev;

        /* allocate id, if @i >= 0, we're requesting that specific id */
        if (i >= 0) {
                err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
                /* The one-slot range [i, i + 1) being full means @i is taken. */
                if (err == -ENOSPC)
                        err = -EEXIST;
        } else {
                err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL);
        }
        mutex_unlock(&loop_ctl_mutex);
        if (err < 0)
                goto out_free_dev;
        /* On success idr_alloc() returned the id that was allocated. */
        i = err;

        lo->tag_set.ops = &loop_mq_ops;
        lo->tag_set.nr_hw_queues = 1;
        lo->tag_set.queue_depth = hw_queue_depth;
        lo->tag_set.numa_node = NUMA_NO_NODE;
        lo->tag_set.cmd_size = sizeof(struct loop_cmd);
        lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING |
                BLK_MQ_F_NO_SCHED_BY_DEFAULT;
        lo->tag_set.driver_data = lo;

        err = blk_mq_alloc_tag_set(&lo->tag_set);
        if (err)
                goto out_free_idr;

        disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo);
        if (IS_ERR(disk)) {
                err = PTR_ERR(disk);
                goto out_cleanup_tags;
        }
        lo->lo_queue = lo->lo_disk->queue;

        /*
         * By default, we do buffer IO, so it doesn't make sense to enable
         * merge because the I/O submitted to backing file is handled page by
         * page. For directio mode, merge does help to dispatch bigger request
         * to underlayer disk. We will enable merge once directio is enabled.
         */
        blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue);

        /*
         * Disable partition scanning by default. The in-kernel partition
         * scanning can be requested individually per-device during its
         * setup. Userspace can always add and remove partitions from all
         * devices. The needed partition minors are allocated from the
         * extended minor space, the main loop device numbers will continue
         * to match the loop minors, regardless of the number of partitions
         * used.
         *
         * If max_part is given, partition scanning is globally enabled for
         * all loop devices. The minors for the main loop devices will be
         * multiples of max_part.
         *
         * Note: Global-for-all-devices, set-only-at-init, read-only module
         * parameters like 'max_loop' and 'max_part' make things needlessly
         * complicated, are too static, inflexible and may surprise
         * userspace tools. Parameters like this in general should be avoided.
         */
        if (!part_shift)
                set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
        mutex_init(&lo->lo_mutex);
        lo->lo_number                = i;
        spin_lock_init(&lo->lo_lock);
        spin_lock_init(&lo->lo_work_lock);
        INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
        INIT_LIST_HEAD(&lo->rootcg_cmd_list);
        disk->major                = LOOP_MAJOR;
        disk->first_minor        = i << part_shift;
        disk->minors                = 1 << part_shift;
        disk->fops                = &lo_fops;
        disk->private_data        = lo;
        disk->queue                = lo->lo_queue;
        disk->events                = DISK_EVENT_MEDIA_CHANGE;
        disk->event_flags        = DISK_EVENT_FLAG_UEVENT;
        sprintf(disk->disk_name, "loop%d", i);
        /* Make this loop device reachable from pathname. */
        err = add_disk(disk);
        if (err)
                goto out_cleanup_disk;

        /* Show this loop device. */
        mutex_lock(&loop_ctl_mutex);
        lo->idr_visible = true;
        mutex_unlock(&loop_ctl_mutex);

        return i;

        /* Error unwinding: undo the steps above in reverse order. */
out_cleanup_disk:
        put_disk(disk);
out_cleanup_tags:
        blk_mq_free_tag_set(&lo->tag_set);
out_free_idr:
        mutex_lock(&loop_ctl_mutex);
        idr_remove(&loop_index_idr, i);
        mutex_unlock(&loop_ctl_mutex);
out_free_dev:
        kfree(lo);
out:
        return err;
}

/*
 * Tear down loop device @lo: delete its disk, free its tag set, remove its
 * number from loop_index_idr and drop the disk reference.
 */
static void loop_remove(struct loop_device *lo)
{
        struct gendisk *disk = lo->lo_disk;

        /* After this the device can no longer be reached through a pathname. */
        del_gendisk(disk);
        blk_mq_free_tag_set(&lo->tag_set);

        mutex_lock(&loop_ctl_mutex);
        idr_remove(&loop_index_idr, lo->lo_number);
        mutex_unlock(&loop_ctl_mutex);

        put_disk(disk);
}

#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
/*
 * Probe callback registered with __register_blkdev(): create the loop device
 * matching @dev unless an explicitly specified, non-zero max_loop excludes it.
 */
static void loop_probe(dev_t dev)
{
        const int idx = MINOR(dev) >> part_shift;
        const int over_limit =
                max_loop_specified && max_loop && idx >= max_loop;

        if (!over_limit)
                loop_add(idx);
}
#else
#define loop_probe NULL
#endif /* !CONFIG_BLOCK_LEGACY_AUTOLOAD */

/*
 * Handle LOOP_CTL_REMOVE: delete loop device number @idx.
 *
 * Returns 0 on success, -EINVAL for a negative @idx, -ENODEV when no visible
 * device has that number, -EBUSY when the device is not unbound or still has
 * openers, or the error returned by mutex_lock_killable().
 */
static int loop_control_remove(int idx)
{
        struct loop_device *lo;
        int ret;

        if (idx < 0) {
                pr_warn_once("deleting an unspecified loop device is not supported.\n");
                return -EINVAL;
        }
                
        /* Hide this loop device for serialization. */
        ret = mutex_lock_killable(&loop_ctl_mutex);
        if (ret)
                return ret;
        lo = idr_find(&loop_index_idr, idx);
        if (!lo || !lo->idr_visible)
                ret = -ENODEV;
        else
                lo->idr_visible = false;
        mutex_unlock(&loop_ctl_mutex);
        if (ret)
                return ret;

        /* Check whether this loop device can be removed. */
        ret = mutex_lock_killable(&lo->lo_mutex);
        if (ret)
                goto mark_visible;
        if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) {
                mutex_unlock(&lo->lo_mutex);
                ret = -EBUSY;
                goto mark_visible;
        }
        /* Mark this loop device as no more bound, but not quite unbound yet */
        lo->lo_state = Lo_deleting;
        mutex_unlock(&lo->lo_mutex);

        loop_remove(lo);
        return 0;

mark_visible:
        /* Show this loop device again. */
        mutex_lock(&loop_ctl_mutex);
        lo->idr_visible = true;
        mutex_unlock(&loop_ctl_mutex);
        return ret;
}

/*
 * Handle LOOP_CTL_GET_FREE: return the number of a visible, unbound loop
 * device, creating a new one through loop_add(-1) when none is found.
 * @idx is not used.  Returns the device number or a negative errno.
 */
static int loop_control_get_free(int idx)
{
        struct loop_device *lo;
        int found = 0;
        int id, ret;

        ret = mutex_lock_killable(&loop_ctl_mutex);
        if (ret)
                return ret;

        idr_for_each_entry(&loop_index_idr, lo, id) {
                /* Hitting a race results in creating a new loop device which is harmless. */
                if (lo->idr_visible && data_race(lo->lo_state) == Lo_unbound) {
                        found = 1;
                        break;
                }
        }
        mutex_unlock(&loop_ctl_mutex);

        if (found)
                return id;
        return loop_add(-1);
}

/*
 * ioctl entry point of the loop-control device: dispatch LOOP_CTL_ADD,
 * LOOP_CTL_REMOVE and LOOP_CTL_GET_FREE, passing @parm through to the
 * handler.  Any other command yields -ENOSYS.
 */
static long loop_control_ioctl(struct file *file, unsigned int cmd,
                               unsigned long parm)
{
        if (cmd == LOOP_CTL_ADD)
                return loop_add(parm);
        if (cmd == LOOP_CTL_REMOVE)
                return loop_control_remove(parm);
        if (cmd == LOOP_CTL_GET_FREE)
                return loop_control_get_free(parm);

        return -ENOSYS;
}

/*
 * File operations of the loop-control misc device.  Native and compat ioctls
 * share the same handler.
 */
static const struct file_operations loop_ctl_fops = {
        .open                = nonseekable_open,
        .unlocked_ioctl        = loop_control_ioctl,
        .compat_ioctl        = loop_control_ioctl,
        .owner                = THIS_MODULE,
        .llseek                = noop_llseek,
};

/* The "loop-control" misc device, registered in loop_init(). */
static struct miscdevice loop_misc = {
        .minor                = LOOP_CTRL_MINOR,
        .name                = "loop-control",
        .fops                = &loop_ctl_fops,
};

/* Module aliases for the control device's minor number and device name. */
MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR);
MODULE_ALIAS("devname:loop-control");

/*
 * Module initialisation: derive part_shift from max_part, validate the
 * max_part/max_loop parameters, register the control misc device and the
 * block major, then pre-create max_loop devices.
 *
 * Returns 0 on success, -EINVAL for out-of-range parameters, the
 * misc_register() error, or -EIO when the block major cannot be registered.
 */
static int __init loop_init(void)
{
        int nr;
        int rc;

        if (max_part > 0) {
                part_shift = fls(max_part);

                /*
                 * max_part is exported to user space, so round it to what
                 * part_shift really provides; user space relies on it to
                 * work out the correct minor number when creating more
                 * devices.
                 *
                 * The -1 accounts for partition 0, which is reserved for
                 * the whole disk.
                 */
                max_part = (1UL << part_shift) - 1;
        } else {
                part_shift = 0;
        }

        if ((1UL << part_shift) > DISK_MAX_PARTS ||
            max_loop > 1UL << (MINORBITS - part_shift))
                return -EINVAL;

        rc = misc_register(&loop_misc);
        if (rc < 0)
                return rc;

        if (__register_blkdev(LOOP_MAJOR, "loop", loop_probe)) {
                misc_deregister(&loop_misc);
                return -EIO;
        }

        /* pre-create number of devices given by config or max_loop */
        for (nr = 0; nr < max_loop; nr++)
                loop_add(nr);

        printk(KERN_INFO "loop: module loaded\n");
        return 0;
}

/*
 * Module exit: unregister the block major and the control device, then
 * remove every loop device still recorded in loop_index_idr.
 */
static void __exit loop_exit(void)
{
        struct loop_device *dev;
        int nr;

        unregister_blkdev(LOOP_MAJOR, "loop");
        misc_deregister(&loop_misc);

        /*
         * loop_ctl_mutex is deliberately not taken: while the module is
         * being unloaded nobody else can reach loop_index_idr (forced
         * unloading excepted). If the unload is not clean there is no way
         * to avoid a kernel crash anyway.
         */
        idr_for_each_entry(&loop_index_idr, dev, nr) {
                loop_remove(dev);
        }

        idr_destroy(&loop_index_idr);
}

module_init(loop_init);
module_exit(loop_exit);

#ifndef MODULE
/*
 * Parse the "max_loop=" boot parameter into max_loop; with legacy autoload
 * enabled, also record that a value was given explicitly.  Always returns 1.
 */
static int __init max_loop_setup(char *str)
{
        long requested = simple_strtol(str, NULL, 0);

        max_loop = requested;
#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
        max_loop_specified = true;
#endif
        return 1;
}

__setup("max_loop=", max_loop_setup);
#endif


















































    1 

    1 





























    1 










    1 




    1 





    1 














































    1 
























    1 





    1 




















    1 


    1 




























    1 





    1 




    1 















    1 


















































































































    1 








    1 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
// SPDX-License-Identifier: GPL-2.0-only
/*
 * unicode.c
 *
 * PURPOSE
 *        Routines for converting between UTF-8 and OSTA Compressed Unicode.
 *      Also handles filename mangling
 *
 * DESCRIPTION
 *        OSTA Compressed Unicode is explained in the OSTA UDF specification.
 *                http://www.osta.org/
 *        UTF-8 is explained in IETF RFC 3629.
 *                https://www.rfc-editor.org/rfc/rfc3629
 *
 */

#include "udfdecl.h"

#include <linux/kernel.h>
#include <linux/string.h>        /* for memset */
#include <linux/nls.h>
#include <linux/crc-itu-t.h>
#include <linux/slab.h>

#include "udf_sb.h"

#define PLANE_SIZE 0x10000
#define UNICODE_MAX 0x10ffff
#define SURROGATE_MASK 0xfffff800
#define SURROGATE_PAIR 0x0000d800
#define SURROGATE_LOW  0x00000400
#define SURROGATE_CHAR_BITS 10
#define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1)

#define ILLEGAL_CHAR_MARK        '_'
#define EXT_MARK                '.'
#define CRC_MARK                '#'
#define EXT_SIZE                5
/* Number of chars we need to store generated CRC to make filename unique */
#define CRC_LEN                        5

/*
 * Decode one character from the OSTA compressed Unicode buffer @str_i,
 * starting at byte offset @str_i_idx, where each code unit is @u_ch bytes
 * wide.  A valid surrogate pair is combined into a single code point.
 *
 * The decoded character is stored in @ret; UNICODE_MAX + 1 is stored for a
 * malformed surrogate sequence.  Returns the number of input bytes consumed.
 */
static unicode_t get_utf16_char(const uint8_t *str_i, int str_i_max_len,
                                int str_i_idx, int u_ch, unicode_t *ret)
{
        const int first_idx = str_i_idx;
        unicode_t code;
        unicode_t low;

        /* Expand OSTA compressed Unicode to Unicode */
        code = str_i[str_i_idx++];
        if (u_ch > 1)
                code = (code << 8) | str_i[str_i_idx++];

        /* Anything outside the surrogate range is already complete. */
        if ((code & SURROGATE_MASK) != SURROGATE_PAIR)
                goto done;

        /*
         * Invalid if nothing follows the first surrogate, or if the
         * sequence starts with a low surrogate instead of a high one.
         */
        if (str_i_idx >= str_i_max_len || (code & SURROGATE_LOW)) {
                code = UNICODE_MAX + 1;
                goto done;
        }

        WARN_ON_ONCE(u_ch != 2);
        low = str_i[str_i_idx++] << 8;
        low |= str_i[str_i_idx++];

        if ((low & SURROGATE_MASK) == SURROGATE_PAIR &&
            (low & SURROGATE_LOW))
                code = PLANE_SIZE +
                       ((code & SURROGATE_CHAR_MASK) << SURROGATE_CHAR_BITS) +
                       (low & SURROGATE_CHAR_MASK);
        else
                code = UNICODE_MAX + 1;
done:
        *ret = code;
        return str_i_idx - first_idx;
}


/*
 * Convert the next character of the CS0 input @str_i into the output
 * buffer @str_o.
 *
 * @str_o, @str_o_max_len: output buffer and its size in bytes
 * @str_o_idx: in/out, current write offset into @str_o
 * @str_i, @str_i_max_len: input buffer and its length in bytes
 * @str_i_idx: in/out, current read offset into @str_i
 * @u_ch: width of one input code unit in bytes
 * @needsCRC: set to 1 when the output does not faithfully represent the
 *            input (replaced character or lack of space)
 * @conv_f: character conversion function, or NULL to emit UTF-8
 * @translate: non-zero to treat '/' as illegal and to collapse a run of
 *             consecutive illegal characters into a single mark
 *
 * Returns 1 when a character was written to @str_o, 0 otherwise.
 */
static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
                              int *str_o_idx,
                              const uint8_t *str_i, int str_i_max_len,
                              int *str_i_idx,
                              int u_ch, int *needsCRC,
                              int (*conv_f)(wchar_t, unsigned char *, int),
                              int translate)
{
        unicode_t c;
        int illChar = 0;
        int len, gotch = 0;

        while (!gotch && *str_i_idx < str_i_max_len) {
                /* Output buffer is full: report truncation to the caller. */
                if (*str_o_idx >= str_o_max_len) {
                        *needsCRC = 1;
                        return gotch;
                }

                len = get_utf16_char(str_i, str_i_max_len, *str_i_idx, u_ch,
                                     &c);
                /* These chars cannot be converted. Replace them. */
                if (c == 0 || c > UNICODE_MAX || (conv_f && c > MAX_WCHAR_T) ||
                    (translate && c == '/')) {
                        illChar = 1;
                        /*
                         * Without translation every illegal char is replaced
                         * on its own; with translation keep scanning so the
                         * whole run is replaced by one mark.
                         */
                        if (!translate)
                                gotch = 1;
                } else if (illChar)
                        /* Valid char after an illegal run: leave it unread. */
                        break;
                else
                        gotch = 1;
                *str_i_idx += len;
        }
        if (illChar) {
                *needsCRC = 1;
                c = ILLEGAL_CHAR_MARK;
                gotch = 1;
        }
        if (gotch) {
                if (conv_f) {
                        len = conv_f(c, &str_o[*str_o_idx],
                                     str_o_max_len - *str_o_idx);
                } else {
                        len = utf32_to_utf8(c, &str_o[*str_o_idx],
                                            str_o_max_len - *str_o_idx);
                        /* Any utf32_to_utf8() failure is treated as "no room". */
                        if (len < 0)
                                len = -ENAMETOOLONG;
                }
                /* Valid character? */
                if (len >= 0)
                        *str_o_idx += len;
                else if (len == -ENAMETOOLONG) {
                        /* Character did not fit; nothing was written. */
                        *needsCRC = 1;
                        gotch = 0;
                } else {
                        /* Other conversion error: emit the replacement mark. */
                        str_o[(*str_o_idx)++] = ILLEGAL_CHAR_MARK;
                        *needsCRC = 1;
                }
        }
        return gotch;
}

/*
 * Convert a CS0 (OSTA compressed Unicode) string into the output charset.
 *
 * @sb: superblock; its s_nls_map, when set, supplies the conversion function
 *      (otherwise UTF-8 is produced)
 * @str_o, @str_max_len: output buffer and its size in bytes
 * @ocu, @ocu_len: input string, starting with the compression ID byte, and
 *                 its length in bytes
 * @translate: non-zero to mangle the name: illegal characters are replaced,
 *             and when the result does not faithfully represent the input a
 *             CRC_MARK followed by four hex digits of a CRC is inserted
 *             ahead of the (possibly shortened) extension
 *
 * Returns the number of bytes written to @str_o, 0 when @str_max_len <= 0 or
 * @ocu_len == 0, or -EINVAL for an unknown compression ID or an input length
 * that is not a multiple of the character width.
 */
static int udf_name_from_CS0(struct super_block *sb,
                             uint8_t *str_o, int str_max_len,
                             const uint8_t *ocu, int ocu_len,
                             int translate)
{
        uint32_t c;
        uint8_t cmp_id;
        int idx, len;
        int u_ch;
        int needsCRC = 0;
        int ext_i_len, ext_max_len;
        int str_o_len = 0;        /* Length of resulting output */
        int ext_o_len = 0;        /* Extension output length */
        int ext_crc_len = 0;        /* Extension output length if used with CRC */
        int i_ext = -1;                /* Extension position in input buffer */
        int o_crc = 0;                /* Rightmost possible output pos for CRC+ext */
        unsigned short valueCRC;
        uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
        uint8_t crc[CRC_LEN];
        int (*conv_f)(wchar_t, unsigned char *, int);

        if (str_max_len <= 0)
                return 0;

        if (ocu_len == 0) {
                memset(str_o, 0, str_max_len);
                return 0;
        }

        if (UDF_SB(sb)->s_nls_map)
                conv_f = UDF_SB(sb)->s_nls_map->uni2char;
        else
                conv_f = NULL;

        /* The first input byte is the compression ID: 8 or 16 bits per char. */
        cmp_id = ocu[0];
        if (cmp_id != 8 && cmp_id != 16) {
                memset(str_o, 0, str_max_len);
                pr_err("unknown compression code (%u)\n", cmp_id);
                return -EINVAL;
        }
        /* Bytes per character: 1 for ID 8, 2 for ID 16. */
        u_ch = cmp_id >> 3;

        /* Skip the compression ID; from here on @ocu is the payload only. */
        ocu++;
        ocu_len--;

        if (ocu_len % u_ch) {
                pr_err("incorrect filename length (%d)\n", ocu_len + 1);
                return -EINVAL;
        }

        if (translate) {
                /* Look for extension */
                /*
                 * Scan backwards over at most EXT_SIZE characters for
                 * EXT_MARK; a mark that is the very last character does not
                 * start an extension.
                 */
                for (idx = ocu_len - u_ch, ext_i_len = 0;
                     (idx >= 0) && (ext_i_len < EXT_SIZE);
                     idx -= u_ch, ext_i_len++) {
                        c = ocu[idx];
                        if (u_ch > 1)
                                c = (c << 8) | ocu[idx + 1];

                        if (c == EXT_MARK) {
                                if (ext_i_len)
                                        i_ext = idx;
                                break;
                        }
                }
                if (i_ext >= 0) {
                        /* Convert extension */
                        ext_max_len = min_t(int, sizeof(ext), str_max_len);
                        ext[ext_o_len++] = EXT_MARK;
                        idx = i_ext + u_ch;
                        while (udf_name_conv_char(ext, ext_max_len, &ext_o_len,
                                                  ocu, ocu_len, &idx,
                                                  u_ch, &needsCRC,
                                                  conv_f, translate)) {
                                /*
                                 * Remember the longest extension prefix that
                                 * still leaves room for the CRC.
                                 */
                                if ((ext_o_len + CRC_LEN) < str_max_len)
                                        ext_crc_len = ext_o_len;
                        }
                }
        }

        /* Convert the name itself, stopping at the extension if there is one. */
        idx = 0;
        while (1) {
                if (translate && (idx == i_ext)) {
                        /* Name plus extension do not fit: fall back to CRC. */
                        if (str_o_len > (str_max_len - ext_o_len))
                                needsCRC = 1;
                        break;
                }

                if (!udf_name_conv_char(str_o, str_max_len, &str_o_len,
                                        ocu, ocu_len, &idx,
                                        u_ch, &needsCRC, conv_f, translate))
                        break;

                if (translate &&
                    (str_o_len <= (str_max_len - ext_o_len - CRC_LEN)))
                        o_crc = str_o_len;
        }

        if (translate) {
                /* A result of "." or ".." is always mangled with a CRC. */
                if (str_o_len > 0 && str_o_len <= 2 && str_o[0] == '.' &&
                    (str_o_len == 1 || str_o[1] == '.'))
                        needsCRC = 1;
                if (needsCRC) {
                        /* Cut the name back to where the CRC can be placed. */
                        str_o_len = o_crc;
                        /* The CRC covers the payload, not the compression ID. */
                        valueCRC = crc_itu_t(0, ocu, ocu_len);
                        crc[0] = CRC_MARK;
                        crc[1] = hex_asc_upper_hi(valueCRC >> 8);
                        crc[2] = hex_asc_upper_lo(valueCRC >> 8);
                        crc[3] = hex_asc_upper_hi(valueCRC);
                        crc[4] = hex_asc_upper_lo(valueCRC);
                        len = min_t(int, CRC_LEN, str_max_len - str_o_len);
                        memcpy(&str_o[str_o_len], crc, len);
                        str_o_len += len;
                        ext_o_len = ext_crc_len;
                }
                if (ext_o_len > 0) {
                        memcpy(&str_o[str_o_len], ext, ext_o_len);
                        str_o_len += ext_o_len;
                }
        }

        return str_o_len;
}

/*
 * Convert a string in the mount's charset into CS0 (OSTA compressed Unicode).
 *
 * @sb: superblock; its s_nls_map, when set, supplies the conversion function
 *      (otherwise the input is parsed as UTF-8)
 * @ocu, @ocu_max_len: output buffer and its size in bytes
 * @str_i, @str_len: input string and its length in bytes
 *
 * Encoding starts with 8 bits per character and restarts from scratch with
 * 16 bits per character as soon as a character above 0xff is met.
 * Characters above 0xffff are written as a surrogate pair.  Input that
 * cannot be decoded is replaced by '?'.
 *
 * Returns the number of bytes written to @ocu, including the leading
 * compression ID byte, or 0 when @ocu_max_len <= 0 or the name does not fit.
 */
static int udf_name_to_CS0(struct super_block *sb,
                           uint8_t *ocu, int ocu_max_len,
                           const uint8_t *str_i, int str_len)
{
        int i, len;
        unsigned int max_val;
        int u_len, u_ch;
        unicode_t uni_char;
        int (*conv_f)(const unsigned char *, int, wchar_t *);

        if (ocu_max_len <= 0)
                return 0;

        if (UDF_SB(sb)->s_nls_map)
                conv_f = UDF_SB(sb)->s_nls_map->char2uni;
        else
                conv_f = NULL;

        /* Start optimistically with the 8-bit encoding (compression ID 8). */
        memset(ocu, 0, ocu_max_len);
        ocu[0] = 8;
        max_val = 0xff;
        u_ch = 1;

try_again:
        /* Output offset 0 holds the compression ID; data starts at 1. */
        u_len = 1;
        for (i = 0; i < str_len; i += len) {
                /* Name didn't fit? */
                if (u_len + u_ch > ocu_max_len)
                        return 0;
                if (conv_f) {
                        wchar_t wchar;

                        len = conv_f(&str_i[i], str_len - i, &wchar);
                        if (len > 0)
                                uni_char = wchar;
                } else {
                        len = utf8_to_utf32(&str_i[i], str_len - i,
                                            &uni_char);
                }
                /* Invalid character, deal with it */
                if (len <= 0 || uni_char > UNICODE_MAX) {
                        len = 1;
                        uni_char = '?';
                }

                if (uni_char > max_val) {
                        unicode_t c;

                        /* Switch to 16-bit encoding and re-encode everything. */
                        if (max_val == 0xff) {
                                max_val = 0xffff;
                                ocu[0] = 0x10;
                                u_ch = 2;
                                goto try_again;
                        }
                        /*
                         * Use UTF-16 encoding for chars outside we
                         * cannot encode directly.
                         */
                        if (u_len + 2 * u_ch > ocu_max_len)
                                return 0;

                        /* Emit the first surrogate now ... */
                        uni_char -= PLANE_SIZE;
                        c = SURROGATE_PAIR |
                            ((uni_char >> SURROGATE_CHAR_BITS) &
                             SURROGATE_CHAR_MASK);
                        ocu[u_len++] = (uint8_t)(c >> 8);
                        ocu[u_len++] = (uint8_t)(c & 0xff);
                        /* ... and let the common code below emit the second. */
                        uni_char = SURROGATE_PAIR | SURROGATE_LOW |
                                        (uni_char & SURROGATE_CHAR_MASK);
                }

                if (max_val == 0xffff)
                        ocu[u_len++] = (uint8_t)(uni_char >> 8);
                ocu[u_len++] = (uint8_t)(uni_char & 0xff);
        }

        return u_len;
}

/*
 * Convert CS0 dstring to output charset. Warning: This function may truncate
 * input string if it is too long as it is used for informational strings only
 * and it is better to truncate the string than to refuse mounting a media.
 *
 * @sb: superblock, passed through to udf_name_from_CS0()
 * @utf_o, @o_len: output buffer and its size in bytes
 * @ocu_i, @i_len: dstring field and its size in bytes; the last byte of the
 *                 field holds the length of the string stored in it
 *
 * Returns whatever udf_name_from_CS0() returns for the (possibly truncated)
 * string: the output length or a negative errno.
 */
int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len,
                      const uint8_t *ocu_i, int i_len)
{
        int s_len = 0;

        if (i_len > 0) {
                s_len = ocu_i[i_len - 1];
                if (s_len >= i_len) {
                        pr_warn("incorrect dstring lengths (%d/%d),"
                                " truncating\n", s_len, i_len);
                        s_len = i_len - 1;
                        /*
                         * 2-byte encoding? Need to round properly...
                         *
                         * s_len counts the compression ID byte plus the
                         * payload, and udf_name_from_CS0() rejects a payload
                         * that is not a whole number of 2-byte characters.
                         * Drop one byte when the payload (s_len - 1) is odd;
                         * leave an empty string alone so that s_len never
                         * becomes negative.
                         */
                        if (ocu_i[0] == 16 && s_len > 0)
                                s_len -= (s_len - 1) & 1;
                }
        }

        return udf_name_from_CS0(sb, utf_o, o_len, ocu_i, s_len, 0);
}

/*
 * Translate the on-disk CS0 name @sname (@slen bytes) into @dname (at most
 * @dlen bytes), with name mangling enabled.
 *
 * Returns the length of the translated name, 0 when @dlen <= 0, -EIO when
 * @slen is zero, -EINVAL when the translated name is empty, or the negative
 * errno reported by udf_name_from_CS0().
 */
int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
                     uint8_t *dname, int dlen)
{
        int out_len;

        if (slen == 0)
                return -EIO;
        if (dlen <= 0)
                return 0;

        out_len = udf_name_from_CS0(sb, dname, dlen, sname, slen, 1);

        /* Zero length filename isn't valid... */
        return out_len ? out_len : -EINVAL;
}

/*
 * Encode the name @sname (@slen bytes) as CS0 into @dname (at most @dlen
 * bytes).  Thin wrapper around udf_name_to_CS0(); returns its result.
 */
int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
                     uint8_t *dname, int dlen)
{
        return udf_name_to_CS0(sb, dname, dlen, sname, slen);
}










    7 
    1 




















































    2 
































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
/**
 * css_get - obtain a reference on the specified css
 * @css: target css
 *
 * The caller must already have a reference.  Nothing is done for a css
 * flagged %CSS_NO_REF.
 */
CGROUP_REF_FN_ATTRS
void css_get(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_get(&css->refcnt);
}
CGROUP_REF_EXPORT(css_get)

/**
 * css_get_many - obtain references on the specified css
 * @css: target css
 * @n: number of references to get
 *
 * The caller must already have a reference.  Nothing is done for a css
 * flagged %CSS_NO_REF.
 */
CGROUP_REF_FN_ATTRS
void css_get_many(struct cgroup_subsys_state *css, unsigned int n)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_get_many(&css->refcnt, n);
}
CGROUP_REF_EXPORT(css_get_many)

/**
 * css_tryget - try to obtain a reference on the specified css
 * @css: target css
 *
 * Obtain a reference on @css unless its count has already reached zero and
 * it is being released.  Whether @css is online or offline does not matter
 * here.  The caller has to make sure @css is accessible but need not hold a
 * reference on it - IOW, RCU protected access is good enough for this
 * function.  A css flagged %CSS_NO_REF always succeeds.  Returns %true if a
 * reference count was successfully obtained; %false otherwise.
 */
CGROUP_REF_FN_ATTRS
bool css_tryget(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return true;

        return percpu_ref_tryget(&css->refcnt);
}
CGROUP_REF_EXPORT(css_tryget)

/**
 * css_tryget_online - try to obtain a reference on the specified css if online
 * @css: target css
 *
 * Obtain a reference on @css if it's online.  The caller has to make sure
 * @css is accessible but need not hold a reference on it - IOW, RCU
 * protected access is good enough for this function.  A css flagged
 * %CSS_NO_REF always succeeds.  Returns %true if a reference count was
 * successfully obtained; %false otherwise.
 */
CGROUP_REF_FN_ATTRS
bool css_tryget_online(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return true;

        return percpu_ref_tryget_live(&css->refcnt);
}
CGROUP_REF_EXPORT(css_tryget_online)

/**
 * css_put - put a css reference
 * @css: target css
 *
 * Put a reference obtained via css_get() and css_tryget_online().  Nothing
 * is done for a css flagged %CSS_NO_REF.
 */
CGROUP_REF_FN_ATTRS
void css_put(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_put(&css->refcnt);
}
CGROUP_REF_EXPORT(css_put)

/**
 * css_put_many - put css references
 * @css: target css
 * @n: number of references to put
 *
 * Put references obtained via css_get() and css_tryget_online().  Nothing
 * is done for a css flagged %CSS_NO_REF.
 */
CGROUP_REF_FN_ATTRS
void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_put_many(&css->refcnt, n);
}
CGROUP_REF_EXPORT(css_put_many)












































































































































    1 




















    1 












    1 





















































































































































































































































    1 






































    1 














    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 
















































































































































    3 




    3 










    1 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
// SPDX-License-Identifier: LGPL-2.1
/*
 * Copyright (c) 2012 Taobao.
 * Written by Tao Ma <boyu.mt@taobao.com>
 */

#include <linux/iomap.h>
#include <linux/fiemap.h>
#include <linux/namei.h>
#include <linux/iversion.h>
#include <linux/sched/mm.h>

#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
#include "truncate.h"

/*
 * Name of the xattr that carries inline data; the code in this file looks it
 * up under the EXT4_XATTR_INDEX_SYSTEM name index.
 */
#define EXT4_XATTR_SYSTEM_DATA        "data"
/* Size in bytes of the inode's i_block array, which holds the first part
 * of the inline data. */
#define EXT4_MIN_INLINE_DATA_SIZE        ((sizeof(__le32) * EXT4_N_BLOCKS))
/*
 * NOTE(review): the two constants below are not referenced in this part of
 * the file; presumably they describe the ".." parent reference stored at
 * the start of an inline directory -- confirm against their users.
 */
#define EXT4_INLINE_DOTDOT_OFFSET        2
#define EXT4_INLINE_DOTDOT_SIZE                4

static int ext4_get_inline_size(struct inode *inode)
{
        if (EXT4_I(inode)->i_inline_off)
                return EXT4_I(inode)->i_inline_size;

        return 0;
}

/*
 * Work out how many bytes of the in-inode xattr area could be used as the
 * value of the inline-data xattr of @inode.
 *
 * @iloc locates the raw on-disk inode, which is only read here.
 *
 * Returns 0 when the inode has no xattr space, when the entry list runs
 * past the end of the inode (an ext4 error is logged), or when the room
 * left is too small.
 */
static int get_max_inline_xattr_value_size(struct inode *inode,
                                           struct ext4_iloc *iloc)
{
        struct ext4_inode *raw_inode;
        struct ext4_xattr_ibody_header *header;
        struct ext4_xattr_entry *entry;
        void *end, *next;
        int min_offs, free;

        if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
                return 0;

        /* Size of the in-inode xattr area that follows the ibody header. */
        min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
                        EXT4_GOOD_OLD_INODE_SIZE -
                        EXT4_I(inode)->i_extra_isize -
                        sizeof(struct ext4_xattr_ibody_header);

        /*
         * No in-inode xattr exists yet, so the whole area is available.
         * Besides the entry itself, another sizeof(__u32) has to be left
         * free: an in-inode xattr needs an empty 4 bytes to mark the gap
         * between the entry table and the name/value pairs.
         */
        if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
                return EXT4_XATTR_SIZE(min_offs -
                        EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) -
                        EXT4_XATTR_ROUND - sizeof(__u32));

        raw_inode = ext4_raw_inode(iloc);
        header = IHDR(inode, raw_inode);
        end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;

        /* Walk the entry table to find the lowest value offset in use. */
        for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); entry = next) {
                size_t offs;

                next = EXT4_XATTR_NEXT(entry);
                if (next >= end) {
                        EXT4_ERROR_INODE(inode,
                                         "corrupt xattr in inline inode");
                        return 0;
                }

                /* Only values stored inside the inode take up room here. */
                if (entry->e_value_inum || !entry->e_value_size)
                        continue;

                offs = le16_to_cpu(entry->e_value_offs);
                if (offs < min_offs)
                        min_offs = offs;
        }

        /* Gap between the end of the entry table and the lowest value. */
        free = min_offs -
                ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);

        if (EXT4_I(inode)->i_inline_off) {
                /*
                 * The inline-data xattr already exists: the space its
                 * current value occupies counts as available too.
                 */
                entry = (struct ext4_xattr_entry *)
                        ((void *)raw_inode + EXT4_I(inode)->i_inline_off);
                free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
                return free;
        }

        /* A new entry for the inline-data xattr must fit as well. */
        free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));
        if (free <= EXT4_XATTR_ROUND)
                return 0;

        return EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND);
}

/*
 * Get the maximum size we now can store in an inode.
 * If we can't find the space for a xattr entry, don't use the space
 * of the extents since we have no space to indicate the inline data.
 */
int ext4_get_max_inline_size(struct inode *inode)
{
        int error, max_inline_size;
        struct ext4_iloc iloc;

        if (EXT4_I(inode)->i_extra_isize == 0)
                return 0;

        error = ext4_get_inode_loc(inode, &iloc);
        if (error) {
                ext4_error_inode_err(inode, __func__, __LINE__, 0, -error,
                                     "can't get inode location %lu",
                                     inode->i_ino);
                return 0;
        }

        down_read(&EXT4_I(inode)->xattr_sem);
        max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
        up_read(&EXT4_I(inode)->xattr_sem);

        brelse(iloc.bh);

        if (!max_inline_size)
                return 0;

        return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
}

/*
 * this function does not take xattr_sem, which is OK because it is
 * currently only used in a code path coming form ext4_iget, before
 * the new inode has been unlocked
 */
int ext4_find_inline_data_nolock(struct inode *inode)
{
        struct ext4_xattr_ibody_find is = {
                .s = { .not_found = -ENODATA, },
        };
        struct ext4_xattr_info i = {
                .name_index = EXT4_XATTR_INDEX_SYSTEM,
                .name = EXT4_XATTR_SYSTEM_DATA,
        };
        int error;

        if (EXT4_I(inode)->i_extra_isize == 0)
                return 0;

        error = ext4_get_inode_loc(inode, &is.iloc);
        if (error)
                return error;

        error = ext4_xattr_ibody_find(inode, &i, &is);
        if (error)
                goto out;

        if (!is.s.not_found) {
                if (is.s.here->e_value_inum) {
                        EXT4_ERROR_INODE(inode, "inline data xattr refers "
                                         "to an external xattr inode");
                        error = -EFSCORRUPTED;
                        goto out;
                }
                EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
                                        (void *)ext4_raw_inode(&is.iloc));
                EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
                                le32_to_cpu(is.s.here->e_value_size);
        }
out:
        brelse(is.iloc.bh);
        return error;
}

/*
 * Copy up to @len bytes of the inline data of @inode into @buffer.
 *
 * The first EXT4_MIN_INLINE_DATA_SIZE bytes come from i_block; the rest is
 * taken from the value of the inline-data xattr and is limited to that
 * value's size.  @len must not exceed i_inline_size (BUG otherwise).
 *
 * Returns the number of bytes copied.
 */
static int ext4_read_inline_data(struct inode *inode, void *buffer,
                                 unsigned int len,
                                 struct ext4_iloc *iloc)
{
        struct ext4_inode *raw_inode;
        struct ext4_xattr_ibody_header *header;
        struct ext4_xattr_entry *entry;
        unsigned int tail;
        int copied;

        if (len == 0)
                return 0;

        BUG_ON(len > EXT4_I(inode)->i_inline_size);

        raw_inode = ext4_raw_inode(iloc);

        /* Leading part: stored in the i_block array. */
        copied = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE);
        memcpy(buffer, (void *)(raw_inode->i_block), copied);

        tail = len - copied;
        if (tail == 0)
                return copied;

        /* Remaining part: stored as the value of the inline-data xattr. */
        header = IHDR(inode, raw_inode);
        entry = (struct ext4_xattr_entry *)((void *)raw_inode +
                                            EXT4_I(inode)->i_inline_off);
        tail = min_t(unsigned int, tail,
                     (unsigned int)le32_to_cpu(entry->e_value_size));

        memcpy(buffer + copied,
               (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs),
               tail);

        return copied + tail;
}

/*
 * Copy @len bytes, taken from @buffer + @pos, into the inline data area of
 * @inode starting at byte offset @pos.
 *
 * Offsets below EXT4_MIN_INLINE_DATA_SIZE land in i_block; everything past
 * that goes into the value of the inline-data xattr found via i_inline_off.
 * Nothing is written once the filesystem has been force-shut down.  The
 * inline area must already exist and cover @pos + @len (BUG otherwise).
 */
static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
                                   void *buffer, loff_t pos, unsigned int len)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_inode *raw_inode;
        struct ext4_xattr_ibody_header *header;
        struct ext4_xattr_entry *entry;
        int head_len;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return;

        BUG_ON(!ei->i_inline_off);
        BUG_ON(pos + len > ei->i_inline_size);

        raw_inode = ext4_raw_inode(iloc);
        buffer += pos;

        if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
                /* Part (or all) of the range falls into i_block. */
                if (pos + len > EXT4_MIN_INLINE_DATA_SIZE)
                        head_len = EXT4_MIN_INLINE_DATA_SIZE - pos;
                else
                        head_len = len;

                memcpy((void *)raw_inode->i_block + pos, buffer, head_len);

                len -= head_len;
                buffer += head_len;
                pos += head_len;
        }

        if (len == 0)
                return;

        /* The rest goes into the xattr value; rebase @pos onto it. */
        header = IHDR(inode, raw_inode);
        entry = (struct ext4_xattr_entry *)((void *)raw_inode +
                                            ei->i_inline_off);
        pos -= EXT4_MIN_INLINE_DATA_SIZE;

        memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
               buffer, len);
}

/*
 * Create the inline-data xattr for @inode and reserve room for @len bytes
 * of inline data.
 *
 * The first EXT4_MIN_INLINE_DATA_SIZE bytes live in i_block, so only the
 * part of @len beyond that is requested as xattr value space (using
 * EXT4_ZERO_XATTR_VALUE -- presumably "zero-filled value", confirm in
 * xattr.h); for shorter lengths an empty value is stored.
 *
 * On success i_block is zeroed, i_inline_off / i_inline_size are updated,
 * the EXTENTS inode flag is cleared, the INLINE_DATA flag is set and the
 * inode location is marked dirty.
 *
 * Returns 0 or a negative errno.  On -ENOSPC from the xattr code the
 * EXT4_STATE_MAY_INLINE_DATA state is cleared.  -EFSCORRUPTED is returned
 * when the xattr unexpectedly exists already.
 */
static int ext4_create_inline_data(handle_t *handle,
                                   struct inode *inode, unsigned len)
{
        int error;
        void *value = NULL;
        struct ext4_xattr_ibody_find is = {
                .s = { .not_found = -ENODATA, },
        };
        struct ext4_xattr_info i = {
                .name_index = EXT4_XATTR_INDEX_SYSTEM,
                .name = EXT4_XATTR_SYSTEM_DATA,
        };

        error = ext4_get_inode_loc(inode, &is.iloc);
        if (error)
                return error;

        BUFFER_TRACE(is.iloc.bh, "get_write_access");
        error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
                                              EXT4_JTR_NONE);
        if (error)
                goto out;

        if (len > EXT4_MIN_INLINE_DATA_SIZE) {
                value = EXT4_ZERO_XATTR_VALUE;
                len -= EXT4_MIN_INLINE_DATA_SIZE;
        } else {
                value = "";
                len = 0;
        }

        /* Insert the xattr entry. */
        i.value = value;
        i.value_len = len;

        error = ext4_xattr_ibody_find(inode, &i, &is);
        if (error)
                goto out;

        /*
         * The caller only gets here when i_inline_off is unset, so finding
         * the xattr means the in-memory and on-disk state disagree.  Report
         * that as corruption instead of crashing the kernel.
         */
        if (!is.s.not_found) {
                EXT4_ERROR_INODE(inode, "unexpected inline data xattr");
                error = -EFSCORRUPTED;
                goto out;
        }

        error = ext4_xattr_ibody_set(handle, inode, &i, &is);
        if (error) {
                if (error == -ENOSPC)
                        ext4_clear_inode_state(inode,
                                               EXT4_STATE_MAY_INLINE_DATA);
                goto out;
        }

        memset((void *)ext4_raw_inode(&is.iloc)->i_block,
                0, EXT4_MIN_INLINE_DATA_SIZE);

        EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
                                      (void *)ext4_raw_inode(&is.iloc));
        EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
        ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
        ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
        /*
         * Take an extra reference: the buffer is released again at "out"
         * (presumably ext4_mark_iloc_dirty() drops one itself -- confirm).
         */
        get_bh(is.iloc.bh);
        error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);

out:
        brelse(is.iloc.bh);
        return error;
}

/*
 * Grow the existing inline-data xattr of @inode so that the inline area can
 * hold @len bytes in total.
 *
 * Nothing is done when @len already fits in i_inline_size.  Otherwise the
 * current xattr value is read into a zeroed buffer of the new value size
 * and written back with that size, so the old contents are kept and the
 * added tail is zero.  i_inline_off / i_inline_size are refreshed from the
 * resulting entry, EXT4_STATE_MAY_INLINE_DATA is set and the inode location
 * is marked dirty.
 *
 * Returns 0 or a negative errno; -EFSCORRUPTED when the xattr that
 * i_inline_off promised cannot be found.
 */
static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
                                   unsigned int len)
{
        int error;
        void *value = NULL;
        struct ext4_xattr_ibody_find is = {
                .s = { .not_found = -ENODATA, },
        };
        struct ext4_xattr_info i = {
                .name_index = EXT4_XATTR_INDEX_SYSTEM,
                .name = EXT4_XATTR_SYSTEM_DATA,
        };

        /* If the old space is ok, write the data directly. */
        if (len <= EXT4_I(inode)->i_inline_size)
                return 0;

        error = ext4_get_inode_loc(inode, &is.iloc);
        if (error)
                return error;

        error = ext4_xattr_ibody_find(inode, &i, &is);
        if (error)
                goto out;

        /*
         * The caller only gets here when i_inline_off is set, so a missing
         * xattr means the in-memory and on-disk state disagree.  Report
         * that as corruption instead of crashing the kernel.
         */
        if (is.s.not_found) {
                EXT4_ERROR_INODE(inode, "missing inline data xattr");
                error = -EFSCORRUPTED;
                goto out;
        }

        /* Only the part beyond i_block is stored in the xattr value. */
        len -= EXT4_MIN_INLINE_DATA_SIZE;
        value = kzalloc(len, GFP_NOFS);
        if (!value) {
                error = -ENOMEM;
                goto out;
        }

        /* Fetch the current value; the rest of the buffer stays zero. */
        error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
                                     value, len);
        if (error < 0)
                goto out;

        BUFFER_TRACE(is.iloc.bh, "get_write_access");
        error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
                                              EXT4_JTR_NONE);
        if (error)
                goto out;

        /* Update the xattr entry. */
        i.value = value;
        i.value_len = len;

        error = ext4_xattr_ibody_set(handle, inode, &i, &is);
        if (error)
                goto out;

        EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
                                      (void *)ext4_raw_inode(&is.iloc));
        EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
                                le32_to_cpu(is.s.here->e_value_size);
        ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
        /*
         * Take an extra reference: the buffer is released again at "out"
         * (presumably ext4_mark_iloc_dirty() drops one itself -- confirm).
         */
        get_bh(is.iloc.bh);
        error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);

out:
        kfree(value);
        brelse(is.iloc.bh);
        return error;
}

/*
 * Make sure the inline area of @inode can take @len bytes, creating the
 * inline-data xattr or growing the existing one as needed.
 *
 * Returns -ENOSPC when the inode may not use inline data or when @len is
 * larger than the maximum inline size; otherwise the result of the
 * create/update helper, which runs under the xattr write lock.
 */
static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
                                    unsigned int len)
{
        int max_size, no_expand, err;

        if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
                return -ENOSPC;

        max_size = ext4_get_max_inline_size(inode);
        if (max_size < len)
                return -ENOSPC;

        ext4_write_lock_xattr(inode, &no_expand);
        err = EXT4_I(inode)->i_inline_off ?
                ext4_update_inline_data(handle, inode, len) :
                ext4_create_inline_data(handle, inode, len);
        ext4_write_unlock_xattr(inode, &no_expand);

        return err;
}

/*
 * Remove the inline data of @inode: set the inline-data xattr with a NULL
 * value, zero i_block (on disk and in ei->i_data), clear the INLINE_DATA
 * flag and reset the cached inline offset/size.
 *
 * On filesystems with the extents feature, directories, regular files and
 * symlinks get the EXTENTS flag and a freshly initialised extent tree.
 *
 * No locking is done here.  Returns 0 (also when there was no inline data
 * or the xattr code reported -ENODATA) or a negative errno.
 */
static int ext4_destroy_inline_data_nolock(handle_t *handle,
                                           struct inode *inode)
{
        struct ext4_xattr_info i = {
                .name_index = EXT4_XATTR_INDEX_SYSTEM,
                .name = EXT4_XATTR_SYSTEM_DATA,
                .value = NULL,
                .value_len = 0,
        };
        struct ext4_xattr_ibody_find is = {
                .s = { .not_found = 0, },
        };
        struct ext4_inode_info *ei = EXT4_I(inode);
        int error;

        if (!ei->i_inline_off)
                return 0;

        error = ext4_get_inode_loc(inode, &is.iloc);
        if (error)
                return error;

        error = ext4_xattr_ibody_find(inode, &i, &is);
        if (error)
                goto out;

        BUFFER_TRACE(is.iloc.bh, "get_write_access");
        error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
                                              EXT4_JTR_NONE);
        if (error)
                goto out;

        error = ext4_xattr_ibody_set(handle, inode, &i, &is);
        if (error)
                goto out;

        /* Wipe the i_block copy on disk and the one cached in memory. */
        memset((void *)ext4_raw_inode(&is.iloc)->i_block,
                0, EXT4_MIN_INLINE_DATA_SIZE);
        memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE);

        if (ext4_has_feature_extents(inode->i_sb) &&
            (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
             S_ISLNK(inode->i_mode))) {
                ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
                ext4_ext_tree_init(handle, inode);
        }
        ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);

        get_bh(is.iloc.bh);
        error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);

        /* The cached state is reset whatever the dirtying step returned. */
        ei->i_inline_off = 0;
        ei->i_inline_size = 0;
        ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
out:
        brelse(is.iloc.bh);
        return error == -ENODATA ? 0 : error;
}

static int ext4_read_inline_folio(struct inode *inode, struct folio *folio)
{
        void *kaddr;
        int ret = 0;
        size_t len;
        struct ext4_iloc iloc;

        BUG_ON(!folio_test_locked(folio));
        BUG_ON(!ext4_has_inline_data(inode));
        BUG_ON(folio->index);

        if (!EXT4_I(inode)->i_inline_off) {
                ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
                             inode->i_ino);
                goto out;
        }

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                goto out;

        len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
        BUG_ON(len > PAGE_SIZE);
        kaddr = kmap_local_folio(folio, 0);
        ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
        kaddr = folio_zero_tail(folio, len, kaddr + len);
        kunmap_local(kaddr);
        folio_mark_uptodate(folio);
        brelse(iloc.bh);

out:
        return ret;
}

int ext4_readpage_inline(struct inode *inode, struct folio *folio)
{
        int ret = 0;

        down_read(&EXT4_I(inode)->xattr_sem);
        if (!ext4_has_inline_data(inode)) {
                up_read(&EXT4_I(inode)->xattr_sem);
                return -EAGAIN;
        }

        /*
         * Current inline data can only exist in the 1st page,
         * So for all the other pages, just set them uptodate.
         */
        if (!folio->index)
                ret = ext4_read_inline_folio(inode, folio);
        else if (!folio_test_uptodate(folio)) {
                folio_zero_segment(folio, 0, folio_size(folio));
                folio_mark_uptodate(folio);
        }

        up_read(&EXT4_I(inode)->xattr_sem);

        folio_unlock(folio);
        return ret >= 0 ? 0 : ret;
}

/*
 * Move the inline data of @inode out of the inode: read it into the folio
 * at index 0 of @mapping, destroy the inline data, then prepare and commit
 * a block write covering the old inline range [0, inline size).
 *
 * When the inode has no inline data (any more), EXT4_STATE_MAY_INLINE_DATA
 * is cleared or the function simply backs out, and 0 is returned.
 *
 * Returns 0 on success or a negative errno.  An -ENOSPC failure of the
 * block-write preparation is retried for as long as
 * ext4_should_retry_alloc() allows.
 */
static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
                                              struct inode *inode)
{
        int ret, needed_blocks, no_expand;
        handle_t *handle = NULL;
        /* sem_held records whether the xattr write lock must be dropped. */
        int retries = 0, sem_held = 0;
        struct folio *folio = NULL;
        unsigned from, to;
        struct ext4_iloc iloc;

        if (!ext4_has_inline_data(inode)) {
                /*
                 * clear the flag so that no new write
                 * will trap here again.
                 */
                ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
                return 0;
        }

        needed_blocks = ext4_writepage_trans_blocks(inode);

        /*
         * NOTE(review): iloc is not used below except for the brelse() at
         * the end; it only keeps the inode buffer referenced meanwhile.
         */
        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

retry:
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                handle = NULL;
                goto out;
        }

        /* We cannot recurse into the filesystem as the transaction is already
         * started */
        folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio)) {
                ret = PTR_ERR(folio);
                /* folio holds an error pointer here: skip the folio cleanup */
                goto out_nofolio;
        }

        ext4_write_lock_xattr(inode, &no_expand);
        sem_held = 1;
        /* If some one has already done this for us, just exit. */
        if (!ext4_has_inline_data(inode)) {
                ret = 0;
                goto out;
        }

        /* The range to convert is the whole current inline data area. */
        from = 0;
        to = ext4_get_inline_size(inode);
        if (!folio_test_uptodate(folio)) {
                ret = ext4_read_inline_folio(inode, folio);
                if (ret < 0)
                        goto out;
        }

        ret = ext4_destroy_inline_data_nolock(handle, inode);
        if (ret)
                goto out;

        if (ext4_should_dioread_nolock(inode)) {
                ret = __block_write_begin(&folio->page, from, to,
                                          ext4_get_block_unwritten);
        } else
                ret = __block_write_begin(&folio->page, from, to, ext4_get_block);

        if (!ret && ext4_should_journal_data(inode)) {
                ret = ext4_walk_page_buffers(handle, inode,
                                             folio_buffers(folio), from, to,
                                             NULL, do_journal_get_write_access);
        }

        if (ret) {
                /*
                 * Preparing the block write failed.  Release the folio,
                 * the xattr lock and the handle here (clearing the local
                 * variables so the common exit path does not release them
                 * again), then truncate the failed write.
                 */
                folio_unlock(folio);
                folio_put(folio);
                folio = NULL;
                ext4_orphan_add(handle, inode);
                ext4_write_unlock_xattr(inode, &no_expand);
                sem_held = 0;
                ext4_journal_stop(handle);
                handle = NULL;
                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might
                 * still be on the orphan list; we need to
                 * make sure the inode is removed from the
                 * orphan list in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;

        /* folio is still set only when the preparation above succeeded. */
        if (folio)
                block_commit_write(&folio->page, from, to);
out:
        if (folio) {
                folio_unlock(folio);
                folio_put(folio);
        }
out_nofolio:
        if (sem_held)
                ext4_write_unlock_xattr(inode, &no_expand);
        if (handle)
                ext4_journal_stop(handle);
        brelse(iloc.bh);
        return ret;
}

/*
 * Try to write data in the inode.
 *
 * If the write of @len bytes at @pos still fits into the inline area of
 * @inode, make room for it there and hand back the folio at index 0,
 * locked and filled with the current inline data.  If it does not fit,
 * convert the inline data into a regular folio so that the later code can
 * create an extent for it.
 *
 * Returns:
 *   1   the write can go inline; *pagep holds the folio's page and the
 *       journal handle started here is left running (presumably for
 *       ext4_write_inline_data_end(), which fetches the current handle).
 *   0   the inode has no inline data any more, or the conversion path
 *       returned 0; *pagep is not written.
 *   <0  negative errno; *pagep is not written.
 */
int ext4_try_to_write_inline_data(struct address_space *mapping,
                                  struct inode *inode,
                                  loff_t pos, unsigned len,
                                  struct page **pagep)
{
        int ret;
        handle_t *handle;
        struct folio *folio;
        struct ext4_iloc iloc;

        if (pos + len > ext4_get_max_inline_size(inode))
                goto convert;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        /*
         * The possible write could happen in the inode,
         * so try to reserve the space in inode first.
         */
        handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                handle = NULL;
                goto out;
        }

        ret = ext4_prepare_inline_data(handle, inode, pos + len);
        if (ret && ret != -ENOSPC)
                goto out;

        /* We don't have space in inline inode, so convert it to extent. */
        if (ret == -ENOSPC) {
                ext4_journal_stop(handle);
                brelse(iloc.bh);
                goto convert;
        }

        ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
                                            EXT4_JTR_NONE);
        if (ret)
                goto out;

        folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
                                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio)) {
                ret = PTR_ERR(folio);
                goto out;
        }

        down_read(&EXT4_I(inode)->xattr_sem);
        if (!ext4_has_inline_data(inode)) {
                ret = 0;
                folio_unlock(folio);
                folio_put(folio);
                goto out_up_read;
        }

        if (!folio_test_uptodate(folio)) {
                ret = ext4_read_inline_folio(inode, folio);
                if (ret < 0) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto out_up_read;
                }
        }

        /*
         * Publish the folio only now that nothing can fail any more, so
         * the caller never receives a pointer to a folio that has already
         * been unlocked and released above.
         */
        *pagep = &folio->page;
        ret = 1;
        /* Keep the handle running for the caller: do not stop it below. */
        handle = NULL;
out_up_read:
        up_read(&EXT4_I(inode)->xattr_sem);
out:
        if (handle && (ret != 1))
                ext4_journal_stop(handle);
        brelse(iloc.bh);
        return ret;
convert:
        return ext4_convert_inline_data_to_extent(mapping, inode);
}

/*
 * Finish a buffered write to an inode that keeps its data inline.
 *
 * The journal handle is not passed in; it is picked up with
 * ext4_journal_current_handle() and is stopped before returning.
 * When something was copied, the mapped folio contents are handed to
 * ext4_write_inline_data() together with @pos and @copied, i_size is
 * updated, and the folio is unlocked and released.
 *
 * A short copy (@copied < @len) into a folio that is not uptodate is
 * treated as if nothing had been copied.
 *
 * Returns the number of bytes accepted (@copied, possibly reset to 0), or
 * the error from ext4_get_inode_loc() / ext4_journal_stop() if one occurred.
 */
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
                               unsigned copied, struct folio *folio)
{
        handle_t *handle = ext4_journal_current_handle();
        int no_expand;
        void *kaddr;
        struct ext4_iloc iloc;
        int ret = 0, ret2;

        /* Discard a partial copy into a folio whose contents are not valid. */
        if (unlikely(copied < len) && !folio_test_uptodate(folio))
                copied = 0;

        if (likely(copied)) {
                ret = ext4_get_inode_loc(inode, &iloc);
                if (ret) {
                        /*
                         * The folio is released here, so jump past the
                         * common unlock/put below.
                         */
                        folio_unlock(folio);
                        folio_put(folio);
                        ext4_std_error(inode->i_sb, ret);
                        goto out;
                }
                ext4_write_lock_xattr(inode, &no_expand);
                BUG_ON(!ext4_has_inline_data(inode));

                /*
                 * ei->i_inline_off may have changed since
                 * ext4_write_begin() called
                 * ext4_try_to_write_inline_data()
                 */
                (void) ext4_find_inline_data_nolock(inode);

                kaddr = kmap_local_folio(folio, 0);
                ext4_write_inline_data(inode, &iloc, kaddr, pos, copied);
                kunmap_local(kaddr);
                folio_mark_uptodate(folio);
                /* clear dirty flag so that writepages wouldn't work for us. */
                folio_clear_dirty(folio);

                ext4_write_unlock_xattr(inode, &no_expand);
                brelse(iloc.bh);

                /*
                 * It's important to update i_size while still holding folio
                 * lock: page writeout could otherwise come in and zero
                 * beyond i_size.
                 */
                ext4_update_inode_size(inode, pos + copied);
        }
        folio_unlock(folio);
        folio_put(folio);

        /*
         * Don't mark the inode dirty under folio lock. First, it unnecessarily
         * makes the holding time of folio lock longer. Second, it forces lock
         * ordering of folio lock and transaction start for journaling
         * filesystems.
         */
        if (likely(copied))
                mark_inode_dirty(inode);
out:
        /*
         * If we didn't copy as much data as expected, we need to trim back
         * size of xattr containing inline data.
         */
        if (pos + len > inode->i_size && ext4_can_truncate(inode))
                ext4_orphan_add(handle, inode);

        /* Keep the first error: a journal stop failure only matters if ret is 0. */
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
        if (pos + len > inode->i_size) {
                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }
        return ret ? ret : copied;
}

/*
 * Try to make the page cache and handle ready for the inline data case.
 * We can call this function in 2 cases:
 * 1. The inode is created and the first write exceeds inline size. We can
 *    clear the inode state safely.
 * 2. The inode has inline data, then we need to read the data, make it
 *    uptodate and dirty so that ext4_da_writepages can handle it. We don't
 *    need to start the journal since the file's metadata isn't changed now.
 *
 * On success in case 2, *fsdata is set to CONVERT_INLINE_DATA. The folio
 * obtained here is always unlocked and released before returning.
 *
 * Returns 0 on success, or the error from looking up the folio, reading the
 * inline data, or __block_write_begin().
 */
static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
                                                 struct inode *inode,
                                                 void **fsdata)
{
        int ret = 0, inline_size;
        struct folio *folio;

        /* Failure is reported as an ERR_PTR, so folio is valid past here. */
        folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN,
                                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        down_read(&EXT4_I(inode)->xattr_sem);
        if (!ext4_has_inline_data(inode)) {
                /* Case 1: nothing to convert, just drop the state bit. */
                ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
                goto out;
        }

        inline_size = ext4_get_inline_size(inode);

        if (!folio_test_uptodate(folio)) {
                ret = ext4_read_inline_folio(inode, folio);
                if (ret < 0)
                        goto out;
        }

        ret = __block_write_begin(&folio->page, 0, inline_size,
                                  ext4_da_get_block_prep);
        if (ret) {
                /*
                 * Release the lock and the folio before calling
                 * ext4_truncate_failed_write(), hence no "goto out" here.
                 */
                up_read(&EXT4_I(inode)->xattr_sem);
                folio_unlock(folio);
                folio_put(folio);
                ext4_truncate_failed_write(inode);
                return ret;
        }

        folio_mark_dirty(folio);
        folio_mark_uptodate(folio);
        ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
        *fsdata = (void *)CONVERT_INLINE_DATA;

out:
        up_read(&EXT4_I(inode)->xattr_sem);
        folio_unlock(folio);
        folio_put(folio);
        return ret;
}

/*
 * Prepare the write for the inline data (delayed allocation path).
 * If the data can be written into the inode, we just read
 * the page and make it uptodate, and start the journal.
 * Otherwise read the page and make it dirty so that it can be
 * handled in writepages (the i_disksize update is left to the
 * normal ext4_da_write_end).
 *
 * Return values, as produced by the code below:
 *   1   - the folio is ready and returned through @pagep; the journal
 *         handle started here is left running on this path.
 *   0   - the inode turned out not to have inline data any more, or
 *         ext4_da_convert_inline_data_to_extent() returned 0.
 *   < 0 - error; the handle (if started) has been stopped.
 */
int ext4_da_write_inline_data_begin(struct address_space *mapping,
                                    struct inode *inode,
                                    loff_t pos, unsigned len,
                                    struct page **pagep,
                                    void **fsdata)
{
        int ret;
        handle_t *handle;
        struct folio *folio;
        struct ext4_iloc iloc;
        int retries = 0;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

retry_journal:
        handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out;
        }

        ret = ext4_prepare_inline_data(handle, inode, pos + len);
        if (ret && ret != -ENOSPC)
                goto out_journal;

        /* No room for the data inline: fall back to the extent conversion. */
        if (ret == -ENOSPC) {
                ext4_journal_stop(handle);
                ret = ext4_da_convert_inline_data_to_extent(mapping,
                                                            inode,
                                                            fsdata);
                /* iloc.bh stays referenced across the retry. */
                if (ret == -ENOSPC &&
                    ext4_should_retry_alloc(inode->i_sb, &retries))
                        goto retry_journal;
                goto out;
        }

        /*
         * We cannot recurse into the filesystem as the transaction
         * is already started.
         */
        folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
                                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio)) {
                ret = PTR_ERR(folio);
                goto out_journal;
        }

        down_read(&EXT4_I(inode)->xattr_sem);
        /* Re-check under xattr_sem; report 0 if the inline data is gone. */
        if (!ext4_has_inline_data(inode)) {
                ret = 0;
                goto out_release_page;
        }

        if (!folio_test_uptodate(folio)) {
                ret = ext4_read_inline_folio(inode, folio);
                if (ret < 0)
                        goto out_release_page;
        }
        ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
                                            EXT4_JTR_NONE);
        if (ret)
                goto out_release_page;

        /* Success: hand the folio to the caller, handle still running. */
        up_read(&EXT4_I(inode)->xattr_sem);
        *pagep = &folio->page;
        brelse(iloc.bh);
        return 1;
out_release_page:
        up_read(&EXT4_I(inode)->xattr_sem);
        folio_unlock(folio);
        folio_put(folio);
out_journal:
        ext4_journal_stop(handle);
out:
        brelse(iloc.bh);
        return ret;
}

#ifdef INLINE_DIR_DEBUG
/*
 * Debug helper: dump every directory entry found in the inline area
 * [inline_start, inline_start + inline_size) through trace_printk() and
 * BUG() on the first entry that fails ext4_check_dir_entry().
 */
void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
                          void *inline_start, int inline_size)
{
        struct ext4_dir_entry_2 *de;
        void *end = inline_start + inline_size;
        unsigned short rlen;
        int off = 0;

        trace_printk("inode %lu\n", dir->i_ino);
        for (de = inline_start; (void *)de < end;
             de = (struct ext4_dir_entry_2 *) ((char *) de + rlen)) {
                rlen = ext4_rec_len_from_disk(de->rec_len, inline_size);
                trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n",
                             off, rlen, de->name_len, de->name,
                             de->name_len, le32_to_cpu(de->inode));
                if (ext4_check_dir_entry(dir, NULL, de, bh,
                                         inline_start, inline_size, off))
                        BUG();

                /* Advance the running offset by this entry's record length. */
                off += rlen;
        }
}
#else
#define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
#endif

/*
 * Add a new entry into an inline dir region.
 *
 * A destination slot is looked up with ext4_find_dest_de() inside
 * [inline_start, inline_start + inline_size); any error from that lookup
 * is returned unchanged. Write access to the inode buffer is then
 * requested and the entry inserted.
 *
 * Returns 1 once the entry has been inserted, otherwise the error code
 * from the lookup or from ext4_journal_get_write_access().
 */
static int ext4_add_dirent_to_inline(handle_t *handle,
                                     struct ext4_filename *fname,
                                     struct inode *dir,
                                     struct inode *inode,
                                     struct ext4_iloc *iloc,
                                     void *inline_start, int inline_size)
{
        struct ext4_dir_entry_2 *slot;
        int rc;

        rc = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
                               inline_size, fname, &slot);
        if (rc)
                return rc;

        BUFFER_TRACE(iloc->bh, "get_write_access");
        rc = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh,
                                           EXT4_JTR_NONE);
        if (rc)
                return rc;

        ext4_insert_dentry(dir, inode, slot, inline_size, fname);
        ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);

        /*
         * XXX The directory times are updated here, before the syscall is
         * known to have completed successfully, because too many callers
         * depend on it.
         *
         * XXX Likewise callers rely on ext4_new_inode() setting the times;
         * error recovery deletes the inode, so at worst the times end up
         * slightly stale and/or different from the directory change time.
         */
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        ext4_update_dx_flag(dir);
        inode_inc_iversion(dir);

        return 1;
}

/*
 * Return a pointer to the value of the xattr entry that lives at
 * i_inline_off inside the raw inode. BUG()s if i_inline_off is not set.
 */
static void *ext4_get_inline_xattr_pos(struct inode *inode,
                                       struct ext4_iloc *iloc)
{
        struct ext4_xattr_ibody_header *ihdr;
        struct ext4_xattr_entry *ent;

        BUG_ON(!EXT4_I(inode)->i_inline_off);

        /* The entry sits i_inline_off bytes into the raw inode. */
        ent = (struct ext4_xattr_entry *)
                ((void *)ext4_raw_inode(iloc) + EXT4_I(inode)->i_inline_off);
        ihdr = IHDR(inode, ext4_raw_inode(iloc));

        /* e_value_offs is applied relative to the first in-body entry. */
        return (void *)IFIRST(ihdr) + le16_to_cpu(ent->e_value_offs);
}

/*
 * Make the last directory entry in @de_buf span up to @new_size.
 *
 * With @old_size == 0 the buffer is treated as freshly created and a single
 * empty entry covering @new_size bytes is written. Otherwise the entries in
 * the first @old_size bytes are walked and the record length of the last
 * one is grown by (@new_size - @old_size).
 */
static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
{
        struct ext4_dir_entry_2 *last;
        void *cur = de_buf;
        void *end = de_buf + old_size;
        int rlen;

        if (!old_size) {
                /* this is just created, so create an empty entry. */
                last = de_buf;
                last->inode = 0;
                last->rec_len = ext4_rec_len_to_disk(new_size, new_size);
                return;
        }

        /* Step through the records; "last" ends up on the final one. */
        do {
                last = cur;
                rlen = ext4_rec_len_from_disk(last->rec_len, old_size);
                cur += rlen;
        } while (cur < end);

        last->rec_len = ext4_rec_len_to_disk(rlen + new_size - old_size,
                                             new_size);
}

/*
 * Grow the inline directory of @dir into the space reported by
 * get_max_inline_xattr_value_size().
 *
 * Returns -ENOSPC when the gain would not exceed ext4_dir_rec_len(1, NULL),
 * the error from ext4_update_inline_data() if that fails, and 0 otherwise.
 * On success the final entry is stretched over the new area and both
 * i_size and i_disksize are set to the new inline size.
 */
static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
                                  struct ext4_iloc *iloc)
{
        int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
        int new_size = get_max_inline_xattr_value_size(dir, iloc);
        int final_size;
        int ret;

        if (new_size - old_size <= ext4_dir_rec_len(1, NULL))
                return -ENOSPC;

        ret = ext4_update_inline_data(handle, dir,
                                      new_size + EXT4_MIN_INLINE_DATA_SIZE);
        if (ret)
                return ret;

        /* Use the size actually recorded in the inode after the update. */
        final_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
        ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size,
                             final_size);

        EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size;
        dir->i_size = EXT4_I(dir)->i_disksize;
        return 0;
}

/*
 * Put @inline_size bytes from @buf back into the inode as inline data.
 *
 * If the inline area cannot be re-created, an emergency message is logged
 * and nothing is written; no error is reported to the caller.
 */
static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
                                     struct ext4_iloc *iloc,
                                     void *buf, int inline_size)
{
        int err = ext4_create_inline_data(handle, inode, inline_size);

        if (!err) {
                ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
                ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
                return;
        }

        ext4_msg(inode->i_sb, KERN_EMERG,
                "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)",
                inode->i_ino, err);
}

/*
 * Fill the new directory block @dir_block from the saved inline directory
 * contents in @buf (@inline_size bytes).
 *
 * "." and ".." are created at the start of the block (the parent inode
 * number is taken from the first entry in @buf), the remaining inline
 * entries are copied after them, and the final entry is stretched to the
 * end of the block, leaving room for a dirent tail when metadata checksums
 * are enabled. The inode size is set to one block.
 *
 * @dir_block is unlocked here; the caller in this file locks it before
 * calling.
 *
 * Returns 0 on success, or the error from ext4_handle_dirty_dirblock() or
 * ext4_mark_inode_dirty().
 */
static int ext4_finish_convert_inline_dir(handle_t *handle,
                                          struct inode *inode,
                                          struct buffer_head *dir_block,
                                          void *buf,
                                          int inline_size)
{
        int err, csum_size = 0, header_size = 0;
        struct ext4_dir_entry_2 *de;
        void *target = dir_block->b_data;

        /*
         * First create "." and ".." and then copy the dir information
         * back to the block.
         */
        de = target;
        de = ext4_init_dot_dotdot(inode, de,
                inode->i_sb->s_blocksize, csum_size,
                le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
        /* Bytes taken by the "." and ".." entries just written. */
        header_size = (void *)de - target;

        memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
                inline_size - EXT4_INLINE_DOTDOT_SIZE);

        if (ext4_has_metadata_csum(inode->i_sb))
                csum_size = sizeof(struct ext4_dir_entry_tail);

        /*
         * i_size_write() is the only i_size update needed; a plain store of
         * the same value right before it would be redundant.
         */
        i_size_write(inode, inode->i_sb->s_blocksize);
        EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
        ext4_update_final_de(dir_block->b_data,
                        inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
                        inode->i_sb->s_blocksize - csum_size);

        if (csum_size)
                ext4_initialize_dirent_tail(dir_block,
                                            inode->i_sb->s_blocksize);
        set_buffer_uptodate(dir_block);
        unlock_buffer(dir_block);
        err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
        if (err)
                return err;
        set_buffer_verified(dir_block);
        return ext4_mark_inode_dirty(handle, inode);
}

/*
 * Move the inline data of @inode into a newly mapped block.
 *
 * The inline contents are first copied into a temporary buffer. For a
 * directory the entries are validated with ext4_check_all_de() before
 * anything is modified. The inline data is then destroyed, logical block 0
 * is mapped with EXT4_GET_BLOCKS_CREATE, and the saved contents are written
 * into that block (via ext4_finish_convert_inline_dir() for directories).
 *
 * If a step fails after the inline data has been destroyed,
 * ext4_restore_inline_data() is called to put the saved contents back.
 *
 * No locking is done here; the caller visible in this file,
 * ext4_try_add_inline_entry(), holds the xattr write lock.
 *
 * Returns the result of the last step executed: a negative error code on
 * failure, otherwise the value returned by the final dirty-metadata call.
 */
static int ext4_convert_inline_data_nolock(handle_t *handle,
                                           struct inode *inode,
                                           struct ext4_iloc *iloc)
{
        int error;
        void *buf = NULL;
        struct buffer_head *data_bh = NULL;
        struct ext4_map_blocks map;
        int inline_size;

        /* Save a private copy of the inline contents. */
        inline_size = ext4_get_inline_size(inode);
        buf = kmalloc(inline_size, GFP_NOFS);
        if (!buf) {
                error = -ENOMEM;
                goto out;
        }

        error = ext4_read_inline_data(inode, buf, inline_size, iloc);
        if (error < 0)
                goto out;

        /*
         * Make sure the inline directory entries pass checks before we try to
         * convert them, so that we avoid touching stuff that needs fsck.
         */
        if (S_ISDIR(inode->i_mode)) {
                error = ext4_check_all_de(inode, iloc->bh,
                                        buf + EXT4_INLINE_DOTDOT_SIZE,
                                        inline_size - EXT4_INLINE_DOTDOT_SIZE);
                if (error)
                        goto out;
        }

        /* From here on a failure has to restore the inline data. */
        error = ext4_destroy_inline_data_nolock(handle, inode);
        if (error)
                goto out;

        map.m_lblk = 0;
        map.m_len = 1;
        map.m_flags = 0;
        error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
        if (error < 0)
                goto out_restore;
        if (!(map.m_flags & EXT4_MAP_MAPPED)) {
                error = -EIO;
                goto out_restore;
        }

        data_bh = sb_getblk(inode->i_sb, map.m_pblk);
        if (!data_bh) {
                error = -ENOMEM;
                goto out_restore;
        }

        lock_buffer(data_bh);
        error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh,
                                               EXT4_JTR_NONE);
        if (error) {
                unlock_buffer(data_bh);
                /* NOTE(review): the original error code is replaced by -EIO. */
                error = -EIO;
                goto out_restore;
        }
        memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);

        if (!S_ISDIR(inode->i_mode)) {
                memcpy(data_bh->b_data, buf, inline_size);
                set_buffer_uptodate(data_bh);
                unlock_buffer(data_bh);
                error = ext4_handle_dirty_metadata(handle,
                                                   inode, data_bh);
        } else {
                /* The helper unlocks data_bh itself. */
                error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
                                                       buf, inline_size);
        }

out_restore:
        if (error)
                ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);

out:
        brelse(data_bh);
        kfree(buf);
        return error;
}

/*
 * Try to add the new entry to the inline data of @dir.
 *
 * The entry is first tried in the i_block area, then in the inline xattr
 * area (which is created/extended first if it is currently empty). If both
 * report -ENOSPC, the inline directory is converted to a block with
 * ext4_convert_inline_data_nolock().
 *
 * Return value, as produced by the code below:
 *   - the result of ext4_add_dirent_to_inline() when the insertion attempt
 *     ended with anything other than -ENOSPC (1 on a successful insertion);
 *   - otherwise the result of ext4_convert_inline_data_nolock();
 *   - when @dir has no inline data, the value left in ret by the successful
 *     ext4_get_inode_loc() call.
 * If no error was recorded, a failure of ext4_mark_inode_dirty() is
 * returned instead.
 */
int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
                              struct inode *dir, struct inode *inode)
{
        int ret, ret2, inline_size, no_expand;
        void *inline_start;
        struct ext4_iloc iloc;

        ret = ext4_get_inode_loc(dir, &iloc);
        if (ret)
                return ret;

        ext4_write_lock_xattr(dir, &no_expand);
        if (!ext4_has_inline_data(dir))
                goto out;

        /* First attempt: the i_block area, after the stored ".." inode. */
        inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
                                                 EXT4_INLINE_DOTDOT_SIZE;
        inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;

        ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc,
                                        inline_start, inline_size);
        if (ret != -ENOSPC)
                goto out;

        /* check whether it can be inserted to inline xattr space. */
        inline_size = EXT4_I(dir)->i_inline_size -
                        EXT4_MIN_INLINE_DATA_SIZE;
        if (!inline_size) {
                /* Try to use the xattr space.*/
                ret = ext4_update_inline_dir(handle, dir, &iloc);
                if (ret && ret != -ENOSPC)
                        goto out;

                /* Re-read: the update may have changed i_inline_size. */
                inline_size = EXT4_I(dir)->i_inline_size -
                                EXT4_MIN_INLINE_DATA_SIZE;
        }

        if (inline_size) {
                inline_start = ext4_get_inline_xattr_pos(dir, &iloc);

                ret = ext4_add_dirent_to_inline(handle, fname, dir,
                                                inode, &iloc, inline_start,
                                                inline_size);

                if (ret != -ENOSPC)
                        goto out;
        }

        /*
         * The inline space is filled up, so create a new block for it.
         * As the extent tree will be created, we have to save the inline
         * dir first.
         */
        ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);

out:
        ext4_write_unlock_xattr(dir, &no_expand);
        /* The inode is marked dirty on every path, including failures. */
        ret2 = ext4_mark_inode_dirty(handle, dir);
        if (unlikely(ret2 && !ret))
                ret = ret2;
        brelse(iloc.bh);
        return ret;
}

/*
 * This function fills a red-black tree with information from an
 * inlined dir.
 *
 * The inline data is copied into a temporary buffer under xattr_sem and
 * then walked without the lock. "." and ".." are not stored as entries in
 * an inline dir, so fake entries are synthesized for them. Entries whose
 * hash is below (@start_hash, @start_minor_hash) and entries with a zero
 * inode number are skipped.
 *
 * Returns the number of directory entries stored into the tree, or a
 * negative error code. If an entry fails ext4_check_dir_entry() the walk
 * stops and the count collected so far is returned. When the inode has no
 * inline data, *has_inline_data is set to 0.
 */
int ext4_inlinedir_to_tree(struct file *dir_file,
                           struct inode *dir, ext4_lblk_t block,
                           struct dx_hash_info *hinfo,
                           __u32 start_hash, __u32 start_minor_hash,
                           int *has_inline_data)
{
        int err = 0, count = 0;
        unsigned int parent_ino;
        int pos;
        struct ext4_dir_entry_2 *de;
        struct inode *inode = file_inode(dir_file);
        int ret, inline_size = 0;
        struct ext4_iloc iloc;
        void *dir_buf = NULL;
        struct ext4_dir_entry_2 fake;
        struct fscrypt_str tmp_str;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        down_read(&EXT4_I(inode)->xattr_sem);
        if (!ext4_has_inline_data(inode)) {
                up_read(&EXT4_I(inode)->xattr_sem);
                *has_inline_data = 0;
                goto out;
        }

        inline_size = ext4_get_inline_size(inode);
        dir_buf = kmalloc(inline_size, GFP_NOFS);
        if (!dir_buf) {
                ret = -ENOMEM;
                up_read(&EXT4_I(inode)->xattr_sem);
                goto out;
        }

        ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
        up_read(&EXT4_I(inode)->xattr_sem);
        if (ret < 0)
                goto out;

        pos = 0;
        /* The first entry slot of an inline dir holds the parent inode number. */
        parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
        while (pos < inline_size) {
                /*
                 * As inlined dir doesn't store any information about '.' and
                 * only the inode number of '..' is stored, we have to handle
                 * them differently.
                 */
                if (pos == 0) {
                        fake.inode = cpu_to_le32(inode->i_ino);
                        fake.name_len = 1;
                        strcpy(fake.name, ".");
                        fake.rec_len = ext4_rec_len_to_disk(
                                          ext4_dir_rec_len(fake.name_len, NULL),
                                          inline_size);
                        ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
                        de = &fake;
                        pos = EXT4_INLINE_DOTDOT_OFFSET;
                } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
                        fake.inode = cpu_to_le32(parent_ino);
                        fake.name_len = 2;
                        strcpy(fake.name, "..");
                        fake.rec_len = ext4_rec_len_to_disk(
                                          ext4_dir_rec_len(fake.name_len, NULL),
                                          inline_size);
                        ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
                        de = &fake;
                        pos = EXT4_INLINE_DOTDOT_SIZE;
                } else {
                        /* A real entry: pos is advanced before the check. */
                        de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
                        pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
                        if (ext4_check_dir_entry(inode, dir_file, de,
                                         iloc.bh, dir_buf,
                                         inline_size, pos)) {
                                /* Bad entry: stop and report what we have. */
                                ret = count;
                                goto out;
                        }
                }

                if (ext4_hash_in_dirent(dir)) {
                        hinfo->hash = EXT4_DIRENT_HASH(de);
                        hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
                } else {
                        ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
                }
                /* Skip entries that sort before the requested start hash. */
                if ((hinfo->hash < start_hash) ||
                    ((hinfo->hash == start_hash) &&
                     (hinfo->minor_hash < start_minor_hash)))
                        continue;
                /* Skip unused entries. */
                if (de->inode == 0)
                        continue;
                tmp_str.name = de->name;
                tmp_str.len = de->name_len;
                err = ext4_htree_store_dirent(dir_file, hinfo->hash,
                                              hinfo->minor_hash, de, &tmp_str);
                if (err) {
                        ret = err;
                        goto out;
                }
                count++;
        }
        ret = count;
out:
        kfree(dir_buf);
        brelse(iloc.bh);
        return ret;
}

/*
 * So this function is called when the volume is mkfsed with
 * dir_index disabled. In order to keep f_pos persistent
 * after we convert from an inlined dir to a block based one,
 * we just pretend that we are a normal dir and return the
 * offset as if '.' and '..' really take place.
 *
 * The inline data is copied into a temporary buffer under xattr_sem and
 * the entries are emitted from that copy through @ctx.
 *
 * Returns 0 when iteration ran (including when it stopped early because
 * dir_emit() refused an entry or an entry failed ext4_check_dir_entry()),
 * or a negative error code from setup. When the inode has no inline data,
 * *has_inline_data is set to 0.
 */
int ext4_read_inline_dir(struct file *file,
                         struct dir_context *ctx,
                         int *has_inline_data)
{
        unsigned int offset, parent_ino;
        int i;
        struct ext4_dir_entry_2 *de;
        struct super_block *sb;
        struct inode *inode = file_inode(file);
        int ret, inline_size = 0;
        struct ext4_iloc iloc;
        void *dir_buf = NULL;
        int dotdot_offset, dotdot_size, extra_offset, extra_size;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        down_read(&EXT4_I(inode)->xattr_sem);
        if (!ext4_has_inline_data(inode)) {
                up_read(&EXT4_I(inode)->xattr_sem);
                *has_inline_data = 0;
                goto out;
        }

        inline_size = ext4_get_inline_size(inode);
        dir_buf = kmalloc(inline_size, GFP_NOFS);
        if (!dir_buf) {
                ret = -ENOMEM;
                up_read(&EXT4_I(inode)->xattr_sem);
                goto out;
        }

        ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
        up_read(&EXT4_I(inode)->xattr_sem);
        if (ret < 0)
                goto out;

        ret = 0;
        sb = inode->i_sb;
        /* The first entry slot of an inline dir holds the parent inode number. */
        parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
        offset = ctx->pos;

        /*
         * dotdot_offset is the offset of ".." and dotdot_size the combined
         * size of "." and ".." as they would be in a block based dir, while
         * in the inline dir they only take EXT4_INLINE_DOTDOT_SIZE.
         * So we will use extra_offset and extra_size to translate between
         * the two layouts during the inline dir iteration.
         */
        dotdot_offset = ext4_dir_rec_len(1, NULL);
        dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL);
        extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
        extra_size = extra_offset + inline_size;

        /*
         * If the version has changed since the last call to
         * readdir(2), then we might be pointing to an invalid
         * dirent right now.  Scan from the start of the inline
         * dir to make sure.
         */
        if (!inode_eq_iversion(inode, file->f_version)) {
                for (i = 0; i < extra_size && i < offset;) {
                        /*
                         * "." is with offset 0 and
                         * ".." is dotdot_offset.
                         */
                        if (!i) {
                                i = dotdot_offset;
                                continue;
                        } else if (i == dotdot_offset) {
                                i = dotdot_size;
                                continue;
                        }
                        /* for other entry, the real offset in
                         * the buf has to be tuned accordingly.
                         */
                        de = (struct ext4_dir_entry_2 *)
                                (dir_buf + i - extra_offset);
                        /* It's too expensive to do a full
                         * dirent test each time round this
                         * loop, but we do have to test at
                         * least that it is non-zero.  A
                         * failure will be detected in the
                         * dirent test below. */
                        if (ext4_rec_len_from_disk(de->rec_len, extra_size)
                                < ext4_dir_rec_len(1, NULL))
                                break;
                        i += ext4_rec_len_from_disk(de->rec_len,
                                                    extra_size);
                }
                /* Resume from the entry boundary found by the scan. */
                offset = i;
                ctx->pos = offset;
                file->f_version = inode_query_iversion(inode);
        }

        while (ctx->pos < extra_size) {
                if (ctx->pos == 0) {
                        if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
                                goto out;
                        ctx->pos = dotdot_offset;
                        continue;
                }

                if (ctx->pos == dotdot_offset) {
                        if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
                                goto out;
                        ctx->pos = dotdot_size;
                        continue;
                }

                /* Translate the block-style position into the inline buffer. */
                de = (struct ext4_dir_entry_2 *)
                        (dir_buf + ctx->pos - extra_offset);
                if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
                                         extra_size, ctx->pos))
                        goto out;
                /* Entries with a zero inode number are not emitted. */
                if (le32_to_cpu(de->inode)) {
                        if (!dir_emit(ctx, de->name, de->name_len,
                                      le32_to_cpu(de->inode),
                                      get_dtype(sb, de->file_type)))
                                goto out;
                }
                ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
        }
out:
        kfree(dir_buf);
        brelse(iloc.bh);
        return ret;
}

/*
 * Read the inline data of @inode into a newly allocated buffer that is
 * one byte larger than the inline size and terminated with
 * nd_terminate_link().
 *
 * Returns the buffer (to be freed by the caller) or an ERR_PTR() carrying
 * the error from ext4_get_inode_loc(), -ENOMEM, or the error from
 * ext4_read_inline_data().
 */
void *ext4_read_inline_link(struct inode *inode)
{
        struct ext4_iloc iloc;
        void *link;
        int inline_size;
        int ret;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ERR_PTR(ret);

        inline_size = ext4_get_inline_size(inode);
        link = kmalloc(inline_size + 1, GFP_NOFS);
        if (!link) {
                link = ERR_PTR(-ENOMEM);
                goto release;
        }

        ret = ext4_read_inline_data(inode, link, inline_size, &iloc);
        if (ret < 0) {
                kfree(link);
                link = ERR_PTR(ret);
                goto release;
        }

        /* ret is the number of bytes read at this point. */
        nd_terminate_link(link, inode->i_size, ret);
release:
        brelse(iloc.bh);
        return link;
}

/*
 * Look up the inode buffer of @inode and point *parent_de at the start of
 * the raw inode's i_block area.
 *
 * The result of ext4_get_inode_loc() is stored in *retval. Returns the
 * inode buffer head on success (the reference is handed to the caller) or
 * NULL on failure, in which case *parent_de is left untouched.
 */
struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
                                        struct ext4_dir_entry_2 **parent_de,
                                        int *retval)
{
        struct ext4_iloc iloc;
        int err;

        err = ext4_get_inode_loc(inode, &iloc);
        *retval = err;
        if (err)
                return NULL;

        *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
        return iloc.bh;
}

/*
 * Try to create the inline data for the new dir @inode under @parent.
 * Returns 0 on success, otherwise the error from ext4_get_inode_loc() or
 * ext4_prepare_inline_data().
 * In case of ENOSPC, the caller should create the normal disk layout dir.
 */
int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent,
                               struct inode *inode)
{
        int inline_size = EXT4_MIN_INLINE_DATA_SIZE;
        struct ext4_dir_entry_2 *dotdot, *filler;
        struct ext4_iloc iloc;
        int ret;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        ret = ext4_prepare_inline_data(handle, inode, inline_size);
        if (!ret) {
                /*
                 * An inline dir only records the inode number of "..";
                 * a single empty entry then covers the rest of the area.
                 */
                dotdot = (struct ext4_dir_entry_2 *)
                                ext4_raw_inode(&iloc)->i_block;
                dotdot->inode = cpu_to_le32(parent->i_ino);

                filler = (struct ext4_dir_entry_2 *)
                                ((void *)dotdot + EXT4_INLINE_DOTDOT_SIZE);
                filler->inode = 0;
                filler->rec_len = ext4_rec_len_to_disk(
                                        inline_size - EXT4_INLINE_DOTDOT_SIZE,
                                        inline_size);

                set_nlink(inode, 2);
                EXT4_I(inode)->i_disksize = inline_size;
                inode->i_size = EXT4_I(inode)->i_disksize;
        }

        brelse(iloc.bh);
        return ret;
}

/*
 * Look up @fname among the entries stored inline in directory @dir.
 *
 * Two regions are searched with ext4_search_dir(), both under a read lock
 * on xattr_sem: first the i_block area past the ".." slot, then, when the
 * inline data is larger than EXT4_MIN_INLINE_DATA_SIZE, the area returned
 * by ext4_get_inline_xattr_pos().
 *
 * Returns the inode's buffer_head (still referenced) when a search reports
 * a match (return value 1).  In every other case the buffer is released and
 * NULL is returned: inode location unreadable, no inline data (in which
 * case *@has_inline_data is also set to 0), a negative result from the
 * first search, or no match.
 */
struct buffer_head *ext4_find_inline_entry(struct inode *dir,
                                        struct ext4_filename *fname,
                                        struct ext4_dir_entry_2 **res_dir,
                                        int *has_inline_data)
{
        int ret;
        struct ext4_iloc iloc;
        void *inline_start;
        int inline_size;

        if (ext4_get_inode_loc(dir, &iloc))
                return NULL;

        down_read(&EXT4_I(dir)->xattr_sem);
        if (!ext4_has_inline_data(dir)) {
                *has_inline_data = 0;
                goto out;
        }

        /* First region: i_block, skipping the space used for "..". */
        inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
                                                EXT4_INLINE_DOTDOT_SIZE;
        inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
        ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
                              dir, fname, 0, res_dir);
        if (ret == 1)
                goto out_find;
        if (ret < 0)
                goto out;

        /* Nothing is stored beyond i_block, so there is no second region. */
        if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
                goto out;

        /* Second region: the part of the inline data kept in the xattr. */
        inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
        inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;

        ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
                              dir, fname, 0, res_dir);
        if (ret == 1)
                goto out_find;

out:
        /* Not found (or failed): drop the buffer so NULL is returned. */
        brelse(iloc.bh);
        iloc.bh = NULL;
out_find:
        up_read(&EXT4_I(dir)->xattr_sem);
        return iloc.bh;
}

/*
 * Remove the entry @de_del from the inline directory @dir.
 *
 * @bh is the buffer passed to the journal and to
 * ext4_generic_delete_entry(); the function also looks up the inode
 * location itself (iloc) to work out which inline region @de_del is in.
 *
 * If @dir has no inline data, *@has_inline_data is set to 0 and nothing is
 * deleted.  Returns 0 or a negative error; every error except -ENOENT is
 * reported through ext4_std_error().
 */
int ext4_delete_inline_entry(handle_t *handle,
                             struct inode *dir,
                             struct ext4_dir_entry_2 *de_del,
                             struct buffer_head *bh,
                             int *has_inline_data)
{
        int err, inline_size, no_expand;
        struct ext4_iloc iloc;
        void *inline_start;

        err = ext4_get_inode_loc(dir, &iloc);
        if (err)
                return err;

        ext4_write_lock_xattr(dir, &no_expand);
        if (!ext4_has_inline_data(dir)) {
                *has_inline_data = 0;
                goto out;
        }

        /*
         * Choose the region by how far @de_del lies from the start of
         * i_block: within the first EXT4_MIN_INLINE_DATA_SIZE bytes it is
         * in i_block (past ".."), otherwise it is in the xattr area.
         */
        if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) <
                EXT4_MIN_INLINE_DATA_SIZE) {
                inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
                                        EXT4_INLINE_DOTDOT_SIZE;
                inline_size = EXT4_MIN_INLINE_DATA_SIZE -
                                EXT4_INLINE_DOTDOT_SIZE;
        } else {
                inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
                inline_size = ext4_get_inline_size(dir) -
                                EXT4_MIN_INLINE_DATA_SIZE;
        }

        BUFFER_TRACE(bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
                                            EXT4_JTR_NONE);
        if (err)
                goto out;

        err = ext4_generic_delete_entry(dir, de_del, bh,
                                        inline_start, inline_size, 0);
        if (err)
                goto out;

        ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
out:
        ext4_write_unlock_xattr(dir, &no_expand);
        /*
         * err is still 0 on the "no inline data" path, so the inode is
         * marked dirty there as well.
         */
        if (likely(err == 0))
                err = ext4_mark_inode_dirty(handle, dir);
        brelse(iloc.bh);
        if (err != -ENOENT)
                ext4_std_error(dir->i_sb, err);
        return err;
}

/*
 * Return the inline directory entry located @offset bytes into the inline
 * data of @inode.
 *
 * Offsets below EXT4_MIN_INLINE_DATA_SIZE resolve into i_block; larger ones
 * resolve into the area returned by ext4_get_inline_xattr_pos(), after
 * subtracting EXT4_MIN_INLINE_DATA_SIZE.  *@inline_size is set to the size
 * of the region the entry is in and, when @inline_start is non-NULL,
 * *@inline_start to that region's first byte.
 *
 * BUGs if @offset is greater than the inode's inline size.
 */
static inline struct ext4_dir_entry_2 *
ext4_get_inline_entry(struct inode *inode,
                      struct ext4_iloc *iloc,
                      unsigned int offset,
                      void **inline_start,
                      int *inline_size)
{
        void *base;

        BUG_ON(offset > ext4_get_inline_size(inode));

        if (offset >= EXT4_MIN_INLINE_DATA_SIZE) {
                base = ext4_get_inline_xattr_pos(inode, iloc);
                *inline_size = ext4_get_inline_size(inode) -
                                EXT4_MIN_INLINE_DATA_SIZE;
                offset -= EXT4_MIN_INLINE_DATA_SIZE;
        } else {
                base = (void *)ext4_raw_inode(iloc)->i_block;
                *inline_size = EXT4_MIN_INLINE_DATA_SIZE;
        }

        if (inline_start)
                *inline_start = base;

        return (struct ext4_dir_entry_2 *)(base + offset);
}

/*
 * Check whether the inline directory @dir contains no entries other
 * than "..".
 *
 * Returns true when every entry after the ".." slot is unused
 * (inode == 0).  Also returns true, with *@has_inline_data set to 0, when
 * @dir does not carry inline data.  Returns false when the inode location
 * cannot be read, when ".." has no inode number, when an entry fails
 * ext4_check_dir_entry(), or when a live entry is found.
 */
bool empty_inline_dir(struct inode *dir, int *has_inline_data)
{
        int err, inline_size;
        struct ext4_iloc iloc;
        size_t inline_len;
        void *inline_pos;
        unsigned int offset;
        struct ext4_dir_entry_2 *de;
        bool ret = false;

        err = ext4_get_inode_loc(dir, &iloc);
        if (err) {
                EXT4_ERROR_INODE_ERR(dir, -err,
                                     "error %d getting inode %lu block",
                                     err, dir->i_ino);
                return false;
        }

        down_read(&EXT4_I(dir)->xattr_sem);
        if (!ext4_has_inline_data(dir)) {
                *has_inline_data = 0;
                ret = true;
                goto out;
        }

        /* The first slot in i_block holds the parent ("..") inode number. */
        de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
        if (!le32_to_cpu(de->inode)) {
                ext4_warning(dir->i_sb,
                             "bad inline directory (dir #%lu) - no `..'",
                             dir->i_ino);
                goto out;
        }

        inline_len = ext4_get_inline_size(dir);
        offset = EXT4_INLINE_DOTDOT_SIZE;
        while (offset < inline_len) {
                de = ext4_get_inline_entry(dir, &iloc, offset,
                                           &inline_pos, &inline_size);
                if (ext4_check_dir_entry(dir, NULL, de,
                                         iloc.bh, inline_pos,
                                         inline_size, offset)) {
                        /*
                         * The string pieces are concatenated by the
                         * compiler, so each one needs its own trailing
                         * separator.
                         */
                        ext4_warning(dir->i_sb,
                                     "bad inline directory (dir #%lu) - "
                                     "inode %u, rec_len %u, name_len %d, "
                                     "inline size %d",
                                     dir->i_ino, le32_to_cpu(de->inode),
                                     le16_to_cpu(de->rec_len), de->name_len,
                                     inline_size);
                        goto out;
                }
                /* A used entry means the directory is not empty. */
                if (le32_to_cpu(de->inode))
                        goto out;
                offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
        }

        ret = true;
out:
        up_read(&EXT4_I(dir)->xattr_sem);
        brelse(iloc.bh);
        return ret;
}

/*
 * Locked wrapper around ext4_destroy_inline_data_nolock(): takes the xattr
 * write lock on @inode for the duration of the call and returns the
 * helper's result unchanged.
 */
int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
{
        int ret, no_expand;

        ext4_write_lock_xattr(inode, &no_expand);
        ret = ext4_destroy_inline_data_nolock(handle, inode);
        ext4_write_unlock_xattr(inode, &no_expand);

        return ret;
}

int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap)
{
        __u64 addr;
        int error = -EAGAIN;
        struct ext4_iloc iloc;

        down_read(&EXT4_I(inode)->xattr_sem);
        if (!ext4_has_inline_data(inode))
                goto out;

        error = ext4_get_inode_loc(inode, &iloc);
        if (error)
                goto out;

        addr = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
        addr += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
        addr += offsetof(struct ext4_inode, i_block);

        brelse(iloc.bh);

        iomap->addr = addr;
        iomap->offset = 0;
        iomap->length = min_t(loff_t, ext4_get_inline_size(inode),
                              i_size_read(inode));
        iomap->type = IOMAP_INLINE;
        iomap->flags = 0;

out:
        up_read(&EXT4_I(inode)->xattr_sem);
        return error;
}

/*
 * Shrink the inline data of @inode to match inode->i_size.
 *
 * Starts its own journal handle and takes the xattr write lock.  If the
 * inode has no inline data, *@has_inline is set to 0 and 0 is returned
 * without changing anything.  Otherwise, under i_data_sem:
 *   - i_disksize is set to i_size;
 *   - if i_size is below the current inline size, the xattr-held part is
 *     rewritten with the shortened length, the tail of i_block past
 *     i_size is zeroed, and i_inline_size is updated.
 *
 * On success the mtime/ctime are updated and the inode is marked dirty.
 * Returns 0 or a negative error.
 */
int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
{
        handle_t *handle;
        int inline_size, value_len, needed_blocks, no_expand, err = 0;
        size_t i_size;
        void *value = NULL;
        /*
         * Members not named here (including is.iloc.bh) are
         * zero-initialised, so the brelse() at "out" sees NULL when the
         * inode location was never looked up.
         */
        struct ext4_xattr_ibody_find is = {
                .s = { .not_found = -ENODATA, },
        };
        struct ext4_xattr_info i = {
                .name_index = EXT4_XATTR_INDEX_SYSTEM,
                .name = EXT4_XATTR_SYSTEM_DATA,
        };


        needed_blocks = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        ext4_write_lock_xattr(inode, &no_expand);
        if (!ext4_has_inline_data(inode)) {
                ext4_write_unlock_xattr(inode, &no_expand);
                *has_inline = 0;
                ext4_journal_stop(handle);
                return 0;
        }

        if ((err = ext4_orphan_add(handle, inode)) != 0)
                goto out;

        if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0)
                goto out;

        down_write(&EXT4_I(inode)->i_data_sem);
        i_size = inode->i_size;
        inline_size = ext4_get_inline_size(inode);
        EXT4_I(inode)->i_disksize = i_size;

        if (i_size < inline_size) {
                /*
                 * if there's inline data to truncate and this file was
                 * converted to extents after that inline data was written,
                 * the extent status cache must be cleared to avoid leaving
                 * behind stale delayed allocated extent entries
                 */
                if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
                        ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);

                /* Clear the content in the xattr space. */
                if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
                        if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
                                goto out_error;

                        BUG_ON(is.s.not_found);

                        /* Read the current value, then store it back shorter. */
                        value_len = le32_to_cpu(is.s.here->e_value_size);
                        value = kmalloc(value_len, GFP_NOFS);
                        if (!value) {
                                err = -ENOMEM;
                                goto out_error;
                        }

                        /*
                         * Note that a return of 0 also bails out here and
                         * leaves err == 0, so the function then completes
                         * as a success without rewriting the xattr.
                         */
                        err = ext4_xattr_ibody_get(inode, i.name_index,
                                                   i.name, value, value_len);
                        if (err <= 0)
                                goto out_error;

                        /* Keep only the bytes that lie beyond i_block. */
                        i.value = value;
                        i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
                                        i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
                        err = ext4_xattr_ibody_set(handle, inode, &i, &is);
                        if (err)
                                goto out_error;
                }

                /* Clear the content within i_blocks. */
                if (i_size < EXT4_MIN_INLINE_DATA_SIZE) {
                        void *p = (void *) ext4_raw_inode(&is.iloc)->i_block;
                        memset(p + i_size, 0,
                               EXT4_MIN_INLINE_DATA_SIZE - i_size);
                }

                /* The inline size never drops below EXT4_MIN_INLINE_DATA_SIZE. */
                EXT4_I(inode)->i_inline_size = i_size <
                                        EXT4_MIN_INLINE_DATA_SIZE ?
                                        EXT4_MIN_INLINE_DATA_SIZE : i_size;
        }

out_error:
        up_write(&EXT4_I(inode)->i_data_sem);
out:
        brelse(is.iloc.bh);
        ext4_write_unlock_xattr(inode, &no_expand);
        kfree(value);
        /* The orphan entry is removed only while the inode still has links. */
        if (inode->i_nlink)
                ext4_orphan_del(handle, inode);

        if (err == 0) {
                inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
                err = ext4_mark_inode_dirty(handle, inode);
                if (IS_SYNC(inode))
                        ext4_handle_sync(handle);
        }
        ext4_journal_stop(handle);
        return err;
}

int ext4_convert_inline_data(struct inode *inode)
{
        int error, needed_blocks, no_expand;
        handle_t *handle;
        struct ext4_iloc iloc;

        if (!ext4_has_inline_data(inode)) {
                ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
                return 0;
        } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                /*
                 * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is
                 * cleared. This means we are in the middle of moving of
                 * inline data to delay allocated block. Just force writeout
                 * here to finish conversion.
                 */
                error = filemap_flush(inode->i_mapping);
                if (error)
                        return error;
                if (!ext4_has_inline_data(inode))
                        return 0;
        }

        needed_blocks = ext4_writepage_trans_blocks(inode);

        iloc.bh = NULL;
        error = ext4_get_inode_loc(inode, &iloc);
        if (error)
                return error;

        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
        if (IS_ERR(handle)) {
                error = PTR_ERR(handle);
                goto out_free;
        }

        ext4_write_lock_xattr(inode, &no_expand);
        if (ext4_has_inline_data(inode))
                error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
        ext4_write_unlock_xattr(inode, &no_expand);
        ext4_journal_stop(handle);
out_free:
        brelse(iloc.bh);
        return error;
}







































































































































    2 





















    4 







    2 





    2 
























    2 







    6 


    6 























    2 



    2 


    2 














    2 

    2 
    2 







































    4 
    4 


    2 
    4 




    4 
    4 



    4 





    4 
    4 




    4 




    2 

    4 














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/stat.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/hash.h>
#include <linux/kmemleak.h>
#include <linux/user_namespace.h>

/*
 * Statically allocated ucounts entry for GLOBAL_ROOT_UID in init_user_ns.
 * It starts with a reference count of 1 and is linked into the hash table
 * by user_namespace_sysctl_init().
 */
struct ucounts init_ucounts = {
        .ns    = &init_user_ns,
        .uid   = GLOBAL_ROOT_UID,
        .count = ATOMIC_INIT(1),
};

/*
 * Hash table of ucounts entries with 2^UCOUNTS_HASHTABLE_BITS buckets.
 * Insertions, removals and the lookups in alloc_ucounts() are done with
 * ucounts_lock held.
 *
 * ucounts_hashfn() hashes the sum of the numeric uid and the namespace
 * pointer value; ucounts_hashentry() turns that into a bucket address.
 */
#define UCOUNTS_HASHTABLE_BITS 10
static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
static DEFINE_SPINLOCK(ucounts_lock);

#define ucounts_hashfn(ns, uid)                                                \
        hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \
                  UCOUNTS_HASHTABLE_BITS)
#define ucounts_hashentry(ns, uid)        \
        (ucounts_hashtable + ucounts_hashfn(ns, uid))


#ifdef CONFIG_SYSCTL
static struct ctl_table_set *
set_lookup(struct ctl_table_root *root)
{
        return &current_user_ns()->set;
}

/*
 * Return non-zero when @set is the sysctl set of the caller's user
 * namespace, 0 otherwise.
 */
static int set_is_seen(struct ctl_table_set *set)
{
        struct ctl_table_set *own = &current_user_ns()->set;

        return own == set;
}

/*
 * Compute the effective mode of @table for the caller.
 *
 * The user namespace is recovered from the sysctl set embedded in it.  A
 * caller with CAP_SYS_RESOURCE in that namespace gets the table's owner
 * bits; everyone else gets at most the "other" read bit.  The resulting
 * three bits are replicated into the user, group and other positions.
 */
static int set_permissions(struct ctl_table_header *head,
                           const struct ctl_table *table)
{
        struct user_namespace *user_ns =
                container_of(head->set, struct user_namespace, set);
        int bits;

        if (ns_capable(user_ns, CAP_SYS_RESOURCE)) {
                /* Unrestrained access: take the owner's rwx bits. */
                bits = (table->mode & S_IRWXU) >> 6;
        } else {
                /* Everybody else: read-only at most. */
                bits = table->mode & S_IROTH;
        }

        return (bits << 6) | (bits << 3) | bits;
}

/*
 * Sysctl root used for the per-user-namespace sets: lookups resolve to the
 * caller's namespace and permissions are computed by set_permissions().
 */
static struct ctl_table_root set_root = {
        .lookup = set_lookup,
        .permissions = set_permissions,
};

/* Lower and upper bounds applied to every entry built by UCOUNT_ENTRY(). */
static long ue_zero = 0;
static long ue_int_max = INT_MAX;

/*
 * Template for one sysctl entry: a long-sized value with mode 0644, handled
 * by proc_doulongvec_minmax and clamped to [ue_zero, ue_int_max].  The
 * .data pointer is left unset here; setup_userns_sysctls() fills it in for
 * each namespace.
 */
#define UCOUNT_ENTRY(name)                                        \
        {                                                        \
                .procname        = name,                                \
                .maxlen                = sizeof(long),                        \
                .mode                = 0644,                                \
                .proc_handler        = proc_doulongvec_minmax,        \
                .extra1                = &ue_zero,                        \
                .extra2                = &ue_int_max,                        \
        }
/*
 * Template table, copied for every user namespace by
 * setup_userns_sysctls().  Entry i ends up backed by ns->ucount_max[i],
 * and a BUILD_BUG_ON there requires the number of entries to equal
 * UCOUNT_COUNTS, so entries must stay in ucount_max index order
 * (presumably the order of enum ucount_type; confirm against its
 * definition when adding entries).
 */
static struct ctl_table user_table[] = {
        UCOUNT_ENTRY("max_user_namespaces"),
        UCOUNT_ENTRY("max_pid_namespaces"),
        UCOUNT_ENTRY("max_uts_namespaces"),
        UCOUNT_ENTRY("max_ipc_namespaces"),
        UCOUNT_ENTRY("max_net_namespaces"),
        UCOUNT_ENTRY("max_mnt_namespaces"),
        UCOUNT_ENTRY("max_cgroup_namespaces"),
        UCOUNT_ENTRY("max_time_namespaces"),
#ifdef CONFIG_INOTIFY_USER
        UCOUNT_ENTRY("max_inotify_instances"),
        UCOUNT_ENTRY("max_inotify_watches"),
#endif
#ifdef CONFIG_FANOTIFY
        UCOUNT_ENTRY("max_fanotify_groups"),
        UCOUNT_ENTRY("max_fanotify_marks"),
#endif
};
#endif /* CONFIG_SYSCTL */

/*
 * Register the "user" sysctl table for namespace @ns.
 *
 * Sets up the namespace's sysctl set, duplicates user_table, points each
 * entry's .data at the matching ns->ucount_max[] slot and registers the
 * copy.  Returns false (after freeing the copy and retiring the set) when
 * ns->sysctls is NULL afterwards; returns true otherwise, and always when
 * CONFIG_SYSCTL is disabled.
 */
bool setup_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
        struct ctl_table *tbl;

        BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS);
        setup_sysctl_set(&ns->set, &set_root, set_is_seen);
        tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
        if (tbl) {
                int i;
                for (i = 0; i < UCOUNT_COUNTS; i++) {
                        tbl[i].data = &ns->ucount_max[i];
                }
                ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl,
                                                      ARRAY_SIZE(user_table));
        }
        /* Covers both a failed kmemdup() and a failed registration. */
        if (!ns->sysctls) {
                kfree(tbl);
                retire_sysctl_set(&ns->set);
                return false;
        }
#endif
        return true;
}

/*
 * Undo setup_userns_sysctls(): unregister the namespace's sysctl table,
 * retire its sysctl set and free the duplicated table.  The table pointer
 * is read from the header before the header is unregistered.
 */
void retire_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
        const struct ctl_table *table_copy = ns->sysctls->ctl_table_arg;

        unregister_sysctl_table(ns->sysctls);
        retire_sysctl_set(&ns->set);
        kfree(table_copy);
#endif
}

/*
 * Walk the hash bucket @hashent and return the entry that matches both
 * @ns and @uid, or NULL if there is none.  No reference is taken.
 */
static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
{
        struct ucounts *entry;

        hlist_for_each_entry(entry, hashent, node) {
                if (entry->ns != ns)
                        continue;
                if (!uid_eq(entry->uid, uid))
                        continue;
                return entry;
        }

        return NULL;
}

/*
 * Insert @ucounts at the head of the hash bucket selected by its
 * namespace and uid, holding ucounts_lock for the insertion.
 */
static void hlist_add_ucounts(struct ucounts *ucounts)
{
        struct hlist_head *bucket;

        bucket = ucounts_hashentry(ucounts->ns, ucounts->uid);

        spin_lock_irq(&ucounts_lock);
        hlist_add_head(&ucounts->node, bucket);
        spin_unlock_irq(&ucounts_lock);
}

/*
 * Add one to the reference count of @ucounts.
 *
 * Returns true when the get succeeded and false when the counter became
 * negative, i.e. wrapped.  The increment is applied in both cases.
 */
static inline bool get_ucounts_or_wrap(struct ucounts *ucounts)
{
        bool wrapped = atomic_add_negative(1, &ucounts->count);

        return !wrapped;
}

/*
 * Take a reference on @ucounts.
 *
 * Returns @ucounts on success.  If the reference count wrapped, the
 * increment is undone with put_ucounts() and NULL is returned.
 */
struct ucounts *get_ucounts(struct ucounts *ucounts)
{
        if (get_ucounts_or_wrap(ucounts))
                return ucounts;

        put_ucounts(ucounts);
        return NULL;
}

/*
 * Find or create the ucounts entry for (@ns, @uid) and return it with a
 * reference held.
 *
 * Returns NULL when a new entry cannot be allocated or when taking a
 * reference on an existing entry wraps its counter.
 */
struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
        struct hlist_head *hashent = ucounts_hashentry(ns, uid);
        struct ucounts *ucounts, *new;
        bool wrapped;

        spin_lock_irq(&ucounts_lock);
        ucounts = find_ucounts(ns, uid, hashent);
        if (!ucounts) {
                /* The lock is dropped around the GFP_KERNEL allocation. */
                spin_unlock_irq(&ucounts_lock);

                new = kzalloc(sizeof(*new), GFP_KERNEL);
                if (!new)
                        return NULL;

                new->ns = ns;
                new->uid = uid;
                atomic_set(&new->count, 1);

                /*
                 * Look again now that the lock is held: an entry may have
                 * been added while it was dropped.
                 */
                spin_lock_irq(&ucounts_lock);
                ucounts = find_ucounts(ns, uid, hashent);
                if (ucounts) {
                        kfree(new);
                } else {
                        /*
                         * Publish the new entry.  It holds a reference on
                         * its namespace, dropped again in put_ucounts().
                         */
                        hlist_add_head(&new->node, hashent);
                        get_user_ns(new->ns);
                        spin_unlock_irq(&ucounts_lock);
                        return new;
                }
        }
        /* Existing entry: take the reference while still holding the lock. */
        wrapped = !get_ucounts_or_wrap(ucounts);
        spin_unlock_irq(&ucounts_lock);
        if (wrapped) {
                put_ucounts(ucounts);
                return NULL;
        }
        return ucounts;
}

/*
 * Drop a reference on @ucounts.
 *
 * When the count reaches zero the entry is unhashed under ucounts_lock,
 * its reference on the user namespace is dropped and the entry is freed.
 */
void put_ucounts(struct ucounts *ucounts)
{
        unsigned long flags;

        if (!atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags))
                return;

        /* Last reference: the lock is held at this point. */
        hlist_del_init(&ucounts->node);
        spin_unlock_irqrestore(&ucounts_lock, flags);

        put_user_ns(ucounts->ns);
        kfree(ucounts);
}

/*
 * Atomically increment @v unless its current value is already >= @u.
 *
 * @u is a long so that it has the same width as the counter and as the
 * limit the caller reads from ucount_max[]; an int parameter would
 * silently truncate larger limits.
 *
 * Returns true if the increment was performed, false if the value was at
 * or above @u.  The compare-and-exchange is retried with the freshly
 * observed value whenever another update gets in between.
 */
static inline bool atomic_long_inc_below(atomic_long_t *v, long u)
{
        long c, old;
        c = atomic_long_read(v);
        for (;;) {
                if (unlikely(c >= u))
                        return false;
                old = atomic_long_cmpxchg(v, c, c+1);
                if (likely(old == c))
                        return true;
                c = old;
        }
}

/*
 * Charge one unit of @type to (@ns, @uid) and to every ancestor reached
 * through ns->ucounts.
 *
 * Each level is incremented only while it is below that namespace's
 * ucount_max[@type].  If any level refuses, the levels already charged
 * are decremented again, the reference from alloc_ucounts() is dropped
 * and NULL is returned.  On success the ucounts for (@ns, @uid) is
 * returned with its reference held.  NULL is also returned when
 * alloc_ucounts() fails.
 */
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
                           enum ucount_type type)
{
        struct ucounts *head, *cur, *failed;

        head = alloc_ucounts(ns, uid);
        cur = head;
        while (cur) {
                struct user_namespace *cur_ns = cur->ns;
                long limit = READ_ONCE(cur_ns->ucount_max[type]);

                if (!atomic_long_inc_below(&cur->ucount[type], limit))
                        goto undo;
                cur = cur_ns->ucounts;
        }
        return head;

undo:
        /* Roll back every level that was charged before the failing one. */
        failed = cur;
        for (cur = head; cur != failed; cur = cur->ns->ucounts)
                atomic_long_dec(&cur->ucount[type]);

        put_ucounts(head);
        return NULL;
}

/*
 * Release one unit of @type from @ucounts and from every ancestor reached
 * through ns->ucounts, then drop a reference on @ucounts.  A warning is
 * raised once if a counter could not be decremented without going
 * negative.
 */
void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
{
        struct ucounts *cur = ucounts;

        while (cur) {
                long left = atomic_long_dec_if_positive(&cur->ucount[type]);

                WARN_ON_ONCE(left < 0);
                cur = cur->ns->ucounts;
        }

        put_ucounts(ucounts);
}

/*
 * Add @v to the @type rlimit counter of @ucounts and of every ancestor
 * reached through ns->ucounts.
 *
 * The first level is checked against LONG_MAX, each following level
 * against the rlimit maximum of the namespace below it.  Returns LONG_MAX
 * if any level went negative or above its limit; otherwise the new value
 * of the first level.  The counters stay incremented either way.
 */
long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
{
        struct ucounts *cur;
        long limit = LONG_MAX;
        long result = 0;

        for (cur = ucounts; cur; cur = cur->ns->ucounts) {
                long val = atomic_long_add_return(v, &cur->rlimit[type]);
                bool over = val < 0 || val > limit;

                if (over)
                        result = LONG_MAX;
                else if (cur == ucounts)
                        result = val;

                limit = get_userns_rlimit_max(cur->ns, type);
        }

        return result;
}

/*
 * Subtract @v from the @type rlimit counter of @ucounts and of every
 * ancestor reached through ns->ucounts.
 *
 * Returns true when the counter of @ucounts itself dropped to zero.  A
 * warning is raised once if any counter goes negative.
 */
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
{
        struct ucounts *cur;
        long first = -1; /* only to keep the compiler quiet */

        for (cur = ucounts; cur; cur = cur->ns->ucounts) {
                long left = atomic_long_sub_return(v, &cur->rlimit[type]);

                WARN_ON_ONCE(left < 0);
                if (cur == ucounts)
                        first = left;
        }

        return first == 0;
}

/*
 * Decrement the @type rlimit counter by one on every level from @ucounts
 * up to, but not including, @last (NULL means walk the whole chain).
 * A level whose counter reaches zero also has a reference dropped.
 */
static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
                                struct ucounts *last, enum rlimit_type type)
{
        struct ucounts *iter, *next;
        for (iter = ucounts; iter != last; iter = next) {
                long dec = atomic_long_sub_return(1, &iter->rlimit[type]);
                WARN_ON_ONCE(dec < 0);
                /* Read the parent first: put_ucounts() may free iter. */
                next = iter->ns->ucounts;
                if (dec == 0)
                        put_ucounts(iter);
        }
}

/*
 * Decrement the @type rlimit counter by one along the whole chain starting
 * at @ucounts, dropping a reference on each level whose counter reaches
 * zero.
 */
void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
{
        do_dec_rlimit_put_ucounts(ucounts, NULL, type);
}

/*
 * Increment the @type rlimit counter by one on @ucounts and every ancestor
 * reached through ns->ucounts, taking an extra reference on each level
 * whose counter goes from 0 to 1.
 *
 * Returns the new counter value of @ucounts on success.  Returns 0 after
 * undoing all increments when a level goes negative, exceeds its limit,
 * or the extra reference cannot be taken.
 */
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type)
{
        /* Caller must hold a reference to ucounts */
        struct ucounts *iter;
        long max = LONG_MAX;
        long dec, ret = 0;

        for (iter = ucounts; iter; iter = iter->ns->ucounts) {
                long new = atomic_long_add_return(1, &iter->rlimit[type]);
                if (new < 0 || new > max)
                        goto unwind;
                if (iter == ucounts)
                        ret = new;
                /* The next (parent) level is limited by this namespace. */
                max = get_userns_rlimit_max(iter->ns, type);
                /*
                 * Grab an extra ucount reference for the caller when
                 * the rlimit count was previously 0.
                 */
                if (new != 1)
                        continue;
                if (!get_ucounts(iter))
                        goto dec_unwind;
        }
        return ret;
dec_unwind:
        /*
         * The failing level took no reference, so only its counter is
         * reverted here; the levels before it are handled below.
         */
        dec = atomic_long_sub_return(1, &iter->rlimit[type]);
        WARN_ON_ONCE(dec < 0);
unwind:
        /*
         * Revert the levels before iter.  When arriving directly at
         * "unwind", iter's own counter is not decremented again.
         */
        do_dec_rlimit_put_ucounts(ucounts, iter, type);
        return 0;
}

/*
 * Report whether the @type rlimit counter is over its limit at any level
 * of the chain starting at @ucounts.
 *
 * The first level is compared against @rlimit (capped at LONG_MAX), each
 * following level against the rlimit maximum of the namespace below it.
 * A negative counter value also counts as over the limit.
 */
bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit)
{
        struct ucounts *cur;
        long limit;

        if (rlimit > LONG_MAX)
                limit = LONG_MAX;
        else
                limit = rlimit;

        for (cur = ucounts; cur; cur = cur->ns->ucounts) {
                long val = get_rlimit_value(cur, type);

                if (val < 0 || val > limit)
                        return true;
                limit = get_userns_rlimit_max(cur->ns, type);
        }

        return false;
}

/*
 * Boot-time setup, run as a subsys initcall.
 *
 * With CONFIG_SYSCTL, registers an empty "user" directory in the default
 * sysctl set and the per-namespace table for init_user_ns; failure of
 * either is fatal (BUG).  In all configurations, links init_ucounts into
 * the hash table and adds 1 to its UCOUNT_RLIMIT_NPROC counter.
 */
static __init int user_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
        static struct ctl_table_header *user_header;
        static struct ctl_table empty[1];
        /*
         * It is necessary to register the user directory in the
         * default set so that registrations in the child sets work
         * properly.
         */
        user_header = register_sysctl_sz("user", empty, 0);
        /* The header is never unregistered, so hide it from kmemleak. */
        kmemleak_ignore(user_header);
        BUG_ON(!user_header);
        BUG_ON(!setup_userns_sysctls(&init_user_ns));
#endif
        hlist_add_ucounts(&init_ucounts);
        inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
        return 0;
}
subsys_initcall(user_namespace_sysctl_init);















































































    3 








    3 
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#ifndef _LINUX_JHASH_H
#define _LINUX_JHASH_H

/* jhash.h: Jenkins hash support.
 *
 * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
 *
 * https://burtleburtle.net/bob/hash/
 *
 * These are the credits from Bob's sources:
 *
 * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
 *
 * These are functions for producing 32-bit hashes for hash table lookup.
 * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
 * are externally useful functions.  Routines to test the hash are included
 * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
 * the public domain.  It has no warranty.
 *
 * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org)
 *
 * I've modified Bob's hash to be useful in the Linux kernel, and
 * any bugs present are my fault.
 * Jozsef
 */
#include <linux/bitops.h>
#include <linux/unaligned/packed_struct.h>

/* The best hash table sizes are powers of two: jhash_size(n) is 2^n. */
#define jhash_size(n)   ((u32)1<<(n))
/*
 * Mask for a table of jhash_size(n) buckets: use (value & jhash_mask(n))
 * instead of (value % size).
 */
#define jhash_mask(n)   (jhash_size(n)-1)

/*
 * __jhash_mix -- mix 3 32-bit values reversibly.
 *
 * Expands to a braced block that updates a, b and c in place, so the
 * arguments must be modifiable lvalues; each one is evaluated several
 * times.
 */
#define __jhash_mix(a, b, c)                        \
{                                                \
        a -= c;  a ^= rol32(c, 4);  c += b;        \
        b -= a;  b ^= rol32(a, 6);  a += c;        \
        c -= b;  c ^= rol32(b, 8);  b += a;        \
        a -= c;  a ^= rol32(c, 16); c += b;        \
        b -= a;  b ^= rol32(a, 19); a += c;        \
        c -= b;  c ^= rol32(b, 4);  b += a;        \
}

/*
 * __jhash_final - final mixing of 3 32-bit values (a,b,c) into c
 *
 * Expands to a braced block that updates a, b and c in place, so the
 * arguments must be modifiable lvalues; each one is evaluated several
 * times.  The result of interest is left in c.
 */
#define __jhash_final(a, b, c)                        \
{                                                \
        c ^= b; c -= rol32(b, 14);                \
        a ^= c; a -= rol32(c, 11);                \
        b ^= a; b -= rol32(a, 25);                \
        c ^= b; c -= rol32(b, 16);                \
        a ^= c; a -= rol32(c, 4);                \
        b ^= a; b -= rol32(a, 14);                \
        c ^= b; c -= rol32(b, 24);                \
}

/*
 * An arbitrary initial parameter; jhash() adds it, together with the key
 * length and the caller's initval, into its starting state.
 */
#define JHASH_INITVAL                0xdeadbeef

/* jhash - hash an arbitrary key
 * @key: sequence of bytes as key
 * @length: the length of the key, in bytes
 * @initval: the previous hash, or an arbitrary value
 *
 * Generic variant that works on any byte sequence; the key needs no
 * particular alignment or length.
 *
 * Returns the hash value of the key. The result depends on endianness,
 * because full blocks are loaded as native-endian 32-bit words.
 */
static inline u32 jhash(const void *key, u32 length, u32 initval)
{
        const u8 *bytes = key;
        u32 a, b, c;

        /* All three lanes start from the same seed */
        a = JHASH_INITVAL + length + initval;
        b = a;
        c = a;

        /*
         * Consume 12-byte blocks while more than 12 bytes remain, so a
         * non-empty key always leaves 1..12 bytes for the tail below.
         */
        for (; length > 12; length -= 12, bytes += 12) {
                a += __get_unaligned_cpu32(bytes);
                b += __get_unaligned_cpu32(bytes + 4);
                c += __get_unaligned_cpu32(bytes + 8);
                __jhash_mix(a, b, c);
        }

        /* Nothing left to add: the final mix is skipped as well */
        if (length == 0)
                return c;

        /* Tail: fold the remaining bytes in, highest index first */
        switch (length) {
        case 12: c += (u32)bytes[11] << 24;        fallthrough;
        case 11: c += (u32)bytes[10] << 16;        fallthrough;
        case 10: c += (u32)bytes[9] << 8;        fallthrough;
        case 9:  c += bytes[8];                        fallthrough;
        case 8:  b += (u32)bytes[7] << 24;        fallthrough;
        case 7:  b += (u32)bytes[6] << 16;        fallthrough;
        case 6:  b += (u32)bytes[5] << 8;        fallthrough;
        case 5:  b += bytes[4];                        fallthrough;
        case 4:  a += (u32)bytes[3] << 24;        fallthrough;
        case 3:  a += (u32)bytes[2] << 16;        fallthrough;
        case 2:  a += (u32)bytes[1] << 8;        fallthrough;
        case 1:  a += bytes[0];
                 break;
        }
        __jhash_final(a, b, c);

        return c;
}

/* jhash2 - hash an array of u32's
 * @k: the key which must be an array of u32's
 * @length: the number of u32's in the key
 * @initval: the previous hash, or an arbitrary value
 *
 * Returns the hash value of the key.
 */
static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
{
        u32 a, b, c;

        /* Seed the state; the length contributes as a byte count */
        a = JHASH_INITVAL + (length << 2) + initval;
        b = a;
        c = a;

        /* Mix three words at a time while more than three remain */
        for (; length > 3; length -= 3, k += 3) {
                a += k[0];
                b += k[1];
                c += k[2];
                __jhash_mix(a, b, c);
        }

        /* No words left: return the state without the final mix */
        if (length == 0)
                return c;

        /* One to three words remain */
        if (length == 3)
                c += k[2];
        if (length >= 2)
                b += k[1];
        a += k[0];
        __jhash_final(a, b, c);

        return c;
}


/*
 * __jhash_nwords - hash exactly 3, 2 or 1 word(s)
 *
 * Adds @initval to each of the three lanes, runs the final mix once and
 * returns the resulting c lane.  Callers with fewer than three words pass 0
 * for the unused lanes.
 */
static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
{
        c = c + initval;
        b = b + initval;
        a = a + initval;

        __jhash_final(a, b, c);

        return c;
}

/* jhash_3words - hash three u32 words; the seed includes the 12-byte length */
static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
{
        const u32 seed = initval + JHASH_INITVAL + (3 << 2);

        return __jhash_nwords(a, b, c, seed);
}

/* jhash_2words - hash two u32 words; the seed includes the 8-byte length */
static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
{
        const u32 seed = initval + JHASH_INITVAL + (2 << 2);

        return __jhash_nwords(a, b, 0, seed);
}

/* jhash_1word - hash a single u32 word; the seed includes the 4-byte length */
static inline u32 jhash_1word(u32 a, u32 initval)
{
        const u32 seed = initval + JHASH_INITVAL + (1 << 2);

        return __jhash_nwords(a, 0, 0, seed);
}

#endif /* _LINUX_JHASH_H */






































    2 






















































































































































    4 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_64_H
#define _ASM_X86_UACCESS_64_H

/*
 * User space memory access functions
 */
#include <linux/compiler.h>
#include <linux/lockdep.h>
#include <linux/kasan-checks.h>
#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>
#include <asm/percpu.h>

#ifdef CONFIG_ADDRESS_MASKING
/*
 * Mask out tag bits from the address.
 *
 * The default instruction sequence is empty; the ALTERNATIVE() entry keyed
 * on X86_FEATURE_LAM supplies an "and" of @addr with the per-CPU variable
 * tlbstate_untag_mask.  @addr is a read-write register operand and the mask
 * is a memory operand.
 *
 * Returns the (possibly masked) address.
 */
static inline unsigned long __untagged_addr(unsigned long addr)
{
        asm (ALTERNATIVE("",
                         "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM)
             : [addr] "+r" (addr)
             : [mask] "m" (__my_cpu_var(tlbstate_untag_mask)));

        return addr;
}

/*
 * untagged_addr - type-preserving wrapper around __untagged_addr().
 *
 * Converts @addr to unsigned long, strips the tag bits and casts the result
 * back to the type of @addr.  @addr is evaluated once; the __force casts
 * are annotations for static checkers.
 */
#define untagged_addr(addr)        ({                                        \
        unsigned long __addr = (__force unsigned long)(addr);                \
        (__force __typeof__(addr))__untagged_addr(__addr);                \
})

/*
 * Strip tag bits from @addr using the untag mask stored in @mm's context
 * instead of the current CPU's mask.  Asserts that the caller holds the
 * mmap lock of @mm.
 */
static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
                                                   unsigned long addr)
{
        mmap_assert_locked(mm);

        addr &= mm->context.untag_mask;
        return addr;
}

/*
 * untagged_addr_remote - type-preserving wrapper around
 * __untagged_addr_remote().
 *
 * Converts @addr to unsigned long, masks it with the untag mask of @mm and
 * casts the result back to the type of @addr.
 */
#define untagged_addr_remote(mm, addr)        ({                                \
        unsigned long __addr = (__force unsigned long)(addr);                \
        (__force __typeof__(addr))__untagged_addr_remote(mm, __addr);        \
})

#endif

/*
 * The virtual address space is logically divided into a kernel
 * half and a user half.  When cast to a signed type, user pointers
 * are positive and kernel pointers are negative.
 *
 * valid_user_address(x) is therefore true exactly when the sign bit of x,
 * viewed as a long, is clear.
 */
#define valid_user_address(x) ((__force long)(x) >= 0)

/*
 * User pointers can have tag bits on x86-64.  This check tolerates
 * arbitrary values in those bits rather than masking them off.
 *
 * Two rules are enforced:
 * 1. 'ptr' must be in the user half of the address space
 * 2. 'ptr+size' must not overflow into kernel addresses
 *
 * Addresses around the sign change are not valid addresses, and will
 * GP-fault even with LAM enabled if the sign bit is set (see
 * "CR3.LAM_SUP", which can narrow the canonicality check if we ever
 * enable it, but not remove it entirely).
 *
 * So "overflow into kernel addresses" does not mean a sudden exact
 * boundary at the sign bit, and a lot of slop is acceptable in the size
 * check.
 *
 * In fact the size check could probably be dropped entirely: kernel
 * accesses proceed in increasing address order starting at 'ptr', so even
 * if the end lies in kernel space the non-canonical accesses GP-fault
 * before we get there.
 *
 * That is a separate optimization; for now only the small constant case
 * skips the size check.
 */
static inline bool __access_ok(const void __user *ptr, unsigned long size)
{
        unsigned long start, end;

        /* Compile-time-known size of at most one page: check 'ptr' only */
        if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE)
                return valid_user_address(ptr);

        start = (__force unsigned long)ptr;
        end = size + start;

        /* Reject ranges that end in the kernel half or wrap around */
        return valid_user_address(end) && end >= start;
}
#define __access_ok __access_ok

/*
 * Copy To/From Userspace
 */

/* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long
rep_movs_alternative(void *to, const void *from, unsigned len);

/*
 * copy_user_generic - copy @len bytes from @from to @to with fault fixup.
 *
 * The copy is bracketed by stac()/clac().  @len, @to and @from are tied to
 * the c, D and S register constraints (the operands of 'rep movsb') and are
 * updated by the asm.  An exception table entry maps a fault at label 1 to
 * label 2, the end of the sequence.
 *
 * No access_ok() check is done here.
 *
 * Returns @len as left by the asm; presumably the number of bytes that
 * could not be copied, 0 on full success (confirm against
 * rep_movs_alternative).
 */
static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned long len)
{
        stac();
        /*
         * If CPU has FSRM feature, use 'rep movs'.
         * Otherwise, use rep_movs_alternative.
         */
        asm volatile(
                "1:\n\t"
                ALTERNATIVE("rep movsb",
                            "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
                "2:\n"
                _ASM_EXTABLE_UA(1b, 2b)
                :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
                : : "memory", "rax");
        clac();
        return len;
}

/*
 * Copy @size bytes from the user pointer @src to the kernel buffer @dst.
 * Drops the __user annotation and hands off to copy_user_generic(), whose
 * return value is passed through unchanged.
 */
static __always_inline __must_check unsigned long
raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
{
        const void *from = (__force void *)src;

        return copy_user_generic(dst, from, size);
}

/*
 * Copy @size bytes from the kernel buffer @src to the user pointer @dst.
 * Drops the __user annotation and hands off to copy_user_generic(), whose
 * return value is passed through unchanged.
 */
static __always_inline __must_check unsigned long
raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
{
        void *to = (__force void *)dst;

        return copy_user_generic(to, src, size);
}

extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size);
extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);

/*
 * Copy @size bytes from user space via __copy_user_nocache().
 *
 * Tells KASAN about the write to @dst first, then opens the user-access
 * window with stac()/clac() around the copy.  The helper's long result is
 * returned through this function's int return type.
 */
static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
                                  unsigned size)
{
        long left;

        kasan_check_write(dst, size);

        stac();
        left = __copy_user_nocache(dst, src, size);
        clac();

        return left;
}

/*
 * Copy @size bytes from user space via __copy_user_flushcache(), after
 * telling KASAN about the write to @dst.  No stac()/clac() is issued here.
 * The helper's long result is returned through the int return type.
 */
static inline int
__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
{
        long rc;

        kasan_check_write(dst, size);
        rc = __copy_user_flushcache(dst, src, size);

        return rc;
}

/*
 * Zero Userspace.
 */

__must_check unsigned long
rep_stos_alternative(void __user *addr, unsigned long len);

/*
 * __clear_user - zero @size bytes at the user address @addr, with no
 * access_ok() check.
 *
 * Calls might_fault(), then brackets the store with stac()/clac().  The asm
 * uses 'rep stosb' or, via the ALTERNATIVE keyed on
 * ALT_NOT(X86_FEATURE_FSRS), a call to rep_stos_alternative.  @size and
 * @addr are tied to the c and D register constraints, and 0 is passed in
 * the a register as the fill value.  An exception table entry maps a fault
 * at label 1 to label 2.
 *
 * Returns @size as left by the asm; presumably the number of bytes that
 * could not be cleared, 0 on success (confirm against rep_stos_alternative).
 */
static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
{
        might_fault();
        stac();

        /*
         * No memory constraint because it doesn't change any memory gcc
         * knows about.
         */
        asm volatile(
                "1:\n\t"
                ALTERNATIVE("rep stosb",
                            "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
                "2:\n"
               _ASM_EXTABLE_UA(1b, 2b)
               : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
               : "a" (0));

        clac();

        return size;
}

/*
 * clear_user - zero @n bytes at the user address @to.
 *
 * Returns @n unchanged when the range fails __access_ok(); otherwise
 * returns whatever __clear_user() reports.
 */
static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
{
        if (!__access_ok(to, n))
                return n;

        return __clear_user(to, n);
}
#endif /* _ASM_X86_UACCESS_64_H */






























































































































































































    1 










































    1 
















    1 


























    1 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
// SPDX-License-Identifier: GPL-2.0-only
/*
 * misc.c
 *
 * PURPOSE
 *        Miscellaneous routines for the OSTA-UDF(tm) filesystem.
 *
 * COPYRIGHT
 *  (C) 1998 Dave Boynton
 *  (C) 1998-2004 Ben Fennema
 *  (C) 1999-2000 Stelias Computing Inc
 *
 * HISTORY
 *
 *  04/19/99 blf  partial support for reading/writing specific EA's
 */

#include "udfdecl.h"

#include <linux/fs.h>
#include <linux/string.h>
#include <linux/crc-itu-t.h>

#include "udf_i.h"
#include "udf_sb.h"

struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
                                           uint32_t type, uint8_t loc)
{
        uint8_t *ea = NULL, *ad = NULL;
        int offset;
        uint16_t crclen;
        struct udf_inode_info *iinfo = UDF_I(inode);

        ea = iinfo->i_data;
        if (iinfo->i_lenEAttr) {
                ad = iinfo->i_data + iinfo->i_lenEAttr;
        } else {
                ad = ea;
                size += sizeof(struct extendedAttrHeaderDesc);
        }

        offset = inode->i_sb->s_blocksize - udf_file_entry_alloc_offset(inode) -
                iinfo->i_lenAlloc;

        /* TODO - Check for FreeEASpace */

        if (loc & 0x01 && offset >= size) {
                struct extendedAttrHeaderDesc *eahd;
                eahd = (struct extendedAttrHeaderDesc *)ea;

                if (iinfo->i_lenAlloc)
                        memmove(&ad[size], ad, iinfo->i_lenAlloc);

                if (iinfo->i_lenEAttr) {
                        /* check checksum/crc */
                        if (eahd->descTag.tagIdent !=
                                        cpu_to_le16(TAG_IDENT_EAHD) ||
                            le32_to_cpu(eahd->descTag.tagLocation) !=
                                        iinfo->i_location.logicalBlockNum)
                                return NULL;
                } else {
                        struct udf_sb_info *sbi = UDF_SB(inode->i_sb);

                        size -= sizeof(struct extendedAttrHeaderDesc);
                        iinfo->i_lenEAttr +=
                                sizeof(struct extendedAttrHeaderDesc);
                        eahd->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EAHD);
                        if (sbi->s_udfrev >= 0x0200)
                                eahd->descTag.descVersion = cpu_to_le16(3);
                        else
                                eahd->descTag.descVersion = cpu_to_le16(2);
                        eahd->descTag.tagSerialNum =
                                        cpu_to_le16(sbi->s_serial_number);
                        eahd->descTag.tagLocation = cpu_to_le32(
                                        iinfo->i_location.logicalBlockNum);
                        eahd->impAttrLocation = cpu_to_le32(0xFFFFFFFF);
                        eahd->appAttrLocation = cpu_to_le32(0xFFFFFFFF);
                }

                offset = iinfo->i_lenEAttr;
                if (type < 2048) {
                        if (le32_to_cpu(eahd->appAttrLocation) <
                                        iinfo->i_lenEAttr) {
                                uint32_t aal =
                                        le32_to_cpu(eahd->appAttrLocation);
                                memmove(&ea[offset - aal + size],
                                        &ea[aal], offset - aal);
                                offset -= aal;
                                eahd->appAttrLocation =
                                                cpu_to_le32(aal + size);
                        }
                        if (le32_to_cpu(eahd->impAttrLocation) <
                                        iinfo->i_lenEAttr) {
                                uint32_t ial =
                                        le32_to_cpu(eahd->impAttrLocation);
                                memmove(&ea[offset - ial + size],
                                        &ea[ial], offset - ial);
                                offset -= ial;
                                eahd->impAttrLocation =
                                                cpu_to_le32(ial + size);
                        }
                } else if (type < 65536) {
                        if (le32_to_cpu(eahd->appAttrLocation) <
                                        iinfo->i_lenEAttr) {
                                uint32_t aal =
                                        le32_to_cpu(eahd->appAttrLocation);
                                memmove(&ea[offset - aal + size],
                                        &ea[aal], offset - aal);
                                offset -= aal;
                                eahd->appAttrLocation =
                                                cpu_to_le32(aal + size);
                        }
                }
                /* rewrite CRC + checksum of eahd */
                crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag);
                eahd->descTag.descCRCLength = cpu_to_le16(crclen);
                eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
                                                sizeof(struct tag), crclen));
                eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
                iinfo->i_lenEAttr += size;
                return (struct genericFormat *)&ea[offset];
        }

        return NULL;
}

/*
 * udf_get_extendedattr
 *
 * PURPOSE
 *        Look up an extended attribute by type and subtype in the inode's
 *        in-memory extended attribute area.
 *
 * DESCRIPTION
 *        The search starts right after the header descriptor for types below
 *        2048, at impAttrLocation for types below 65536 and at
 *        appAttrLocation otherwise, and walks the attributes by their
 *        attrLength fields.
 *
 *        Returns the matching attribute, or NULL if the inode has no
 *        extended attributes, the header fails its tag check, the walk hits
 *        an attribute with an implausible length, or nothing matches.
 */
struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type,
                                           uint8_t subtype)
{
        struct udf_inode_info *iinfo = UDF_I(inode);
        uint8_t *ea = iinfo->i_data;
        struct extendedAttrHeaderDesc *eahd;
        struct genericFormat *gaf;
        uint32_t offset;

        if (!iinfo->i_lenEAttr)
                return NULL;

        eahd = (struct extendedAttrHeaderDesc *)ea;

        /* check checksum/crc */
        if (!(eahd->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EAHD) &&
              le32_to_cpu(eahd->descTag.tagLocation) ==
                        iinfo->i_location.logicalBlockNum))
                return NULL;

        /* Pick the starting offset of the region that holds this type */
        if (type < 2048)
                offset = sizeof(struct extendedAttrHeaderDesc);
        else if (type < 65536)
                offset = le32_to_cpu(eahd->impAttrLocation);
        else
                offset = le32_to_cpu(eahd->appAttrLocation);

        while (offset + sizeof(*gaf) < iinfo->i_lenEAttr) {
                uint32_t attrLength;

                gaf = (struct genericFormat *)&ea[offset];
                attrLength = le32_to_cpu(gaf->attrLength);

                /* Stop on undersized elements and on buffer overruns */
                if (attrLength < sizeof(*gaf))
                        break;
                if (attrLength > (iinfo->i_lenEAttr - offset))
                        break;

                if (le32_to_cpu(gaf->attrType) == type &&
                    gaf->attrSubtype == subtype)
                        return gaf;

                offset += attrLength;
        }

        return NULL;
}

/*
 * udf_read_tagged
 *
 * PURPOSE
 *        Read the first block of a tagged descriptor.
 *
 * DESCRIPTION
 *        Reads @block, stores the tag identifier in *@ident and then checks
 *        the tag location against @location, the tag checksum, the
 *        descriptor version (2 or 3) and the descriptor CRC.  A CRC length
 *        that does not fit in the block together with the tag skips the CRC
 *        comparison and the buffer is accepted.
 *
 *        Returns the buffer head on success.  Returns NULL if @block is
 *        0xFFFFFFFF, the read fails or a check fails; the buffer is released
 *        in the latter case.  *@ident is written whenever the read succeeds,
 *        even if a later check fails.
 *
 * HISTORY
 *        July 1, 1997 - Andrew E. Mileski
 *        Written, tested, and released.
 */
struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
                                    uint32_t location, uint16_t *ident)
{
        struct buffer_head *bh;
        struct tag *tag_p;
        uint16_t crc_len;
        u8 checksum;

        /* Read the block */
        if (block == 0xFFFFFFFF)
                return NULL;

        bh = sb_bread(sb, block);
        if (!bh) {
                udf_err(sb, "read failed, block=%u, location=%u\n",
                        block, location);
                return NULL;
        }

        tag_p = (struct tag *)(bh->b_data);
        *ident = le16_to_cpu(tag_p->tagIdent);

        /* Verify the tag location */
        if (le32_to_cpu(tag_p->tagLocation) != location) {
                udf_debug("location mismatch block %u, tag %u != %u\n",
                          block, le32_to_cpu(tag_p->tagLocation), location);
                goto error_out;
        }

        /* Verify the tag checksum */
        checksum = udf_tag_checksum(tag_p);
        if (tag_p->tagChecksum != checksum) {
                udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n",
                        block, checksum, tag_p->tagChecksum);
                goto error_out;
        }

        /* Verify the tag version */
        if (!(tag_p->descVersion == cpu_to_le16(0x0002U) ||
              tag_p->descVersion == cpu_to_le16(0x0003U))) {
                udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n",
                        le16_to_cpu(tag_p->descVersion), block);
                goto error_out;
        }

        /* Verify the descriptor CRC */
        crc_len = le16_to_cpu(tag_p->descCRCLength);

        /* A CRC span reaching past the block is not checked at all */
        if (crc_len + sizeof(struct tag) > sb->s_blocksize)
                return bh;

        if (le16_to_cpu(tag_p->descCRC) ==
            crc_itu_t(0, bh->b_data + sizeof(struct tag), crc_len))
                return bh;

        udf_debug("Crc failure block %u: crc = %u, crclen = %u\n", block,
                  le16_to_cpu(tag_p->descCRC), crc_len);
error_out:
        brelse(bh);
        return NULL;
}

/*
 * udf_read_ptagged
 *
 * PURPOSE
 *        Read a tagged descriptor addressed by a logical block address plus
 *        an offset.
 *
 * DESCRIPTION
 *        Translates (@loc, @offset) with udf_get_lb_pblock() and reads that
 *        block with udf_read_tagged(), expecting the tag location to be
 *        loc->logicalBlockNum + @offset.
 */
struct buffer_head *udf_read_ptagged(struct super_block *sb,
                                     struct kernel_lb_addr *loc,
                                     uint32_t offset, uint16_t *ident)
{
        uint32_t pblock = udf_get_lb_pblock(sb, loc, offset);

        return udf_read_tagged(sb, pblock, loc->logicalBlockNum + offset,
                               ident);
}

/*
 * udf_update_tag
 *
 * PURPOSE
 *        Recompute the CRC length, CRC and checksum of the tag at the start
 *        of @data.
 *
 * DESCRIPTION
 *        @length is the descriptor length including the tag; the CRC covers
 *        the @length - sizeof(struct tag) bytes that follow the tag.  The
 *        checksum is computed last, after the CRC fields have been stored.
 */
void udf_update_tag(char *data, int length)
{
        struct tag *tptr = (struct tag *)data;
        char *payload = data + sizeof(struct tag);

        length -= sizeof(struct tag);

        tptr->descCRCLength = cpu_to_le16(length);
        tptr->descCRC = cpu_to_le16(crc_itu_t(0, payload, length));
        tptr->tagChecksum = udf_tag_checksum(tptr);
}

/*
 * udf_new_tag
 *
 * PURPOSE
 *        Fill in the tag at the start of @data.
 *
 * DESCRIPTION
 *        Stores the identifier, descriptor version, serial number and
 *        location in little-endian form, then lets udf_update_tag() compute
 *        the CRC fields and the checksum for a descriptor of @length bytes.
 */
void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum,
                 uint32_t loc, int length)
{
        struct tag *tptr = (struct tag *)data;

        tptr->tagLocation = cpu_to_le32(loc);
        tptr->tagSerialNum = cpu_to_le16(snum);
        tptr->descVersion = cpu_to_le16(version);
        tptr->tagIdent = cpu_to_le16(ident);

        udf_update_tag(data, length);
}

/*
 * udf_tag_checksum
 *
 * PURPOSE
 *        Compute the checksum of a tag.
 *
 * DESCRIPTION
 *        Returns the sum, modulo 256, of all bytes of the tag except the
 *        byte at offset 4, which is where the checksum itself is stored.
 */
u8 udf_tag_checksum(const struct tag *t)
{
        const u8 *bytes = (const u8 *)t;
        unsigned int pos;
        u8 sum = 0;

        for (pos = 0; pos < sizeof(struct tag); pos++) {
                if (pos == 4) /* position of checksum */
                        continue;
                sum += bytes[pos];
        }

        return sum;
}


























































































































    2 




   12 
    5 
    1 
    4 
    4 



   18 
   15 












   21 







    9 


















   11 



















































































































































   18 
   21 















































































    6 








   12 
    7 








    4 

   13 
    2 






    1 






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/buffer_head.h
 *
 * Everything to do with buffer_heads.
 */

#ifndef _LINUX_BUFFER_HEAD_H
#define _LINUX_BUFFER_HEAD_H

#include <linux/types.h>
#include <linux/blk_types.h>
#include <linux/fs.h>
#include <linux/linkage.h>
#include <linux/pagemap.h>
#include <linux/wait.h>
#include <linux/atomic.h>

/*
 * Bit numbers within buffer_head.b_state.  The BUFFER_FNS() and
 * TAS_BUFFER_FNS() macros in this file generate the accessors that
 * manipulate them.
 */
enum bh_state_bits {
        BH_Uptodate,        /* Contains valid data */
        BH_Dirty,        /* Is dirty */
        BH_Lock,        /* Is locked */
        BH_Req,                /* Has been submitted for I/O */

        BH_Mapped,        /* Has a disk mapping */
        BH_New,                /* Disk mapping was newly created by get_block */
        BH_Async_Read,        /* Is under end_buffer_async_read I/O */
        BH_Async_Write,        /* Is under end_buffer_async_write I/O */
        BH_Delay,        /* Buffer is not yet allocated on disk */
        BH_Boundary,        /* Block is followed by a discontiguity */
        BH_Write_EIO,        /* I/O error on write */
        BH_Unwritten,        /* Buffer is allocated on disk but not written */
        BH_Quiet,        /* Buffer error printks are to be quiet */
        BH_Meta,        /* Buffer contains metadata */
        BH_Prio,        /* Buffer should be submitted with REQ_PRIO */
        BH_Defer_Completion, /* Defer AIO completion to workqueue */

        BH_PrivateStart,/* not a state bit, but the first bit available
                         * for private allocation by other entities
                         */
};

/*
 * Upper bound on the number of buffers in one page: PAGE_SIZE divided by
 * 512 (presumably the smallest supported buffer size - TODO confirm).
 */
#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)

struct page;
struct buffer_head;
struct address_space;
/*
 * I/O completion callback type stored in buffer_head.b_end_io.  It is
 * called with the buffer and an 'uptodate' flag (presumably non-zero when
 * the I/O succeeded - confirm against the implementations).
 */
typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);

/*
 * Historically, a buffer_head was used to map a single block
 * within a page, and of course as the unit of I/O through the
 * filesystem and block layers.  Nowadays the basic I/O unit
 * is the bio, and buffer_heads are used for extracting block
 * mappings (via a get_block_t call), for tracking state within
 * a page (via a page_mapping) and for wrapping bio submission
 * for backward compatibility reasons (e.g. submit_bh).
 */
struct buffer_head {
        unsigned long b_state;                /* buffer state bitmap (see above) */
        struct buffer_head *b_this_page;/* circular list of page's buffers */
        /* b_page and b_folio share storage: two views of the same pointer */
        union {
                struct page *b_page;        /* the page this bh is mapped to */
                struct folio *b_folio;        /* the folio this bh is mapped to */
        };

        sector_t b_blocknr;                /* start block number */
        size_t b_size;                        /* size of mapping */
        char *b_data;                        /* pointer to data within the page */

        struct block_device *b_bdev;        /* block device this buffer refers to */
        bh_end_io_t *b_end_io;                /* I/O completion */
         void *b_private;                /* reserved for b_end_io */
        struct list_head b_assoc_buffers; /* associated with another mapping */
        struct address_space *b_assoc_map;        /* mapping this buffer is
                                                   associated with */
        atomic_t b_count;                /* users using this buffer_head */
        spinlock_t b_uptodate_lock;        /* Used by the first bh in a page, to
                                         * serialise IO completion of other
                                         * buffers in the page */
};

/*
 * Macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
 * and buffer_foo() functions for the state bit BH_<bit>.
 *
 * set_buffer_foo() tests the bit first and only calls set_bit() when it is
 * still clear: re-setting a flag that is already set would cause a costly
 * cache line transition.  clear_buffer_foo() clears unconditionally and
 * buffer_foo() returns the result of test_bit().
 */
#define BUFFER_FNS(bit, name)                                                \
static __always_inline void set_buffer_##name(struct buffer_head *bh)        \
{                                                                        \
        if (!test_bit(BH_##bit, &(bh)->b_state))                        \
                set_bit(BH_##bit, &(bh)->b_state);                        \
}                                                                        \
static __always_inline void clear_buffer_##name(struct buffer_head *bh)        \
{                                                                        \
        clear_bit(BH_##bit, &(bh)->b_state);                                \
}                                                                        \
static __always_inline int buffer_##name(const struct buffer_head *bh)        \
{                                                                        \
        return test_bit(BH_##bit, &(bh)->b_state);                        \
}

/*
 * test_set_buffer_foo() and test_clear_buffer_foo()
 *
 * Each generated function returns the result of test_and_set_bit() or
 * test_and_clear_bit() on the BH_<bit> bit of bh->b_state.
 */
#define TAS_BUFFER_FNS(bit, name)                                        \
static __always_inline int test_set_buffer_##name(struct buffer_head *bh) \
{                                                                        \
        return test_and_set_bit(BH_##bit, &(bh)->b_state);                \
}                                                                        \
static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \
{                                                                        \
        return test_and_clear_bit(BH_##bit, &(bh)->b_state);                \
}                                                                        \

/*
 * Emit the buffer bitops functions.   Note that there are also functions
 * of the form "mark_buffer_foo()".  These are higher-level functions which
 * do something in addition to setting a b_state bit.
 *
 * The second macro argument is the suffix of the generated function names,
 * e.g. BUFFER_FNS(Lock, locked) yields set_buffer_locked(),
 * clear_buffer_locked() and buffer_locked().  BH_Uptodate and BH_Quiet have
 * no entry in this list; the Uptodate accessors are written out by hand
 * further down in this file.
 */
BUFFER_FNS(Dirty, dirty)
TAS_BUFFER_FNS(Dirty, dirty)
BUFFER_FNS(Lock, locked)
BUFFER_FNS(Req, req)
TAS_BUFFER_FNS(Req, req)
BUFFER_FNS(Mapped, mapped)
BUFFER_FNS(New, new)
BUFFER_FNS(Async_Read, async_read)
BUFFER_FNS(Async_Write, async_write)
BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary)
BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
BUFFER_FNS(Defer_Completion, defer_completion)

/*
 * Mark the buffer as containing valid data.
 *
 * When the bit is already set nothing is done: whoever set it has already
 * issued the memory barrier, so a reader will see *some* valid buffer
 * state.  Any other serialization (against I/O errors or whatever else
 * might clear the bit) has to come from other state (e.g. BH_Lock).
 */
static __always_inline void set_buffer_uptodate(struct buffer_head *bh)
{
        if (!test_bit(BH_Uptodate, &bh->b_state)) {
                /*
                 * Kept consistent with folio_mark_uptodate; the barrier
                 * pairs with test_bit_acquire() in buffer_uptodate().
                 */
                smp_mb__before_atomic();
                set_bit(BH_Uptodate, &bh->b_state);
        }
}

/*
 * Clear BH_Uptodate in bh->b_state with clear_bit().  Unlike
 * set_buffer_uptodate(), no explicit memory barrier is issued here.
 */
static __always_inline void clear_buffer_uptodate(struct buffer_head *bh)
{
        clear_bit(BH_Uptodate, &bh->b_state);
}

static __always_inline int buffer_uptodate(const struct buffer_head *bh)
{
        /*
         * make it consistent with folio_test_uptodate
         * pairs with smp_mb__before_atomic in set_buffer_uptodate
         */
        return test_bit_acquire(BH_Uptodate, &bh->b_state);
}

/*
 * Offset of the buffer's data within its page: the low bits of b_data,
 * masked by page_size(bh->b_page) - 1.
 */
static inline unsigned long bh_offset(const struct buffer_head *bh)
{
        unsigned long data_addr = (unsigned long)bh->b_data;
        unsigned long in_page_mask = page_size(bh->b_page) - 1;

        return data_addr & in_page_mask;
}

/*
 * If we *know* page->private refers to buffer_heads
 *
 * page_buffers() BUG()s when PagePrivate(page) is false, otherwise it
 * evaluates to page_private(page) cast to struct buffer_head *.  Note that
 * @page is evaluated twice by the expansion.
 *
 * page_has_buffers() is just PagePrivate(); folio_buffers() is just
 * folio_get_private() and performs no check.
 */
#define page_buffers(page)                                        \
        ({                                                        \
                BUG_ON(!PagePrivate(page));                        \
                ((struct buffer_head *)page_private(page));        \
        })
#define page_has_buffers(page)        PagePrivate(page)
#define folio_buffers(folio)                folio_get_private(folio)

void buffer_check_dirty_writeback(struct folio *folio,
                                     bool *dirty, bool *writeback);

/*
 * Declarations
 *
 * Prototypes only: none of the functions below is defined in this part
 * of the header.
 */

void mark_buffer_dirty(struct buffer_head *bh);
void mark_buffer_write_io_error(struct buffer_head *bh);
void touch_buffer(struct buffer_head *bh);
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
                  unsigned long offset);
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
                                        gfp_t gfp);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
                bool retry);
struct buffer_head *create_empty_buffers(struct folio *folio,
                unsigned long blocksize, unsigned long b_state);
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate);

/* Things to do with buffers at mapping->private_list */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
                                  bool datasync);
int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
                          bool datasync);
/* Takes a block range; clean_bdev_bh_alias() calls it with a length of 1. */
void clean_bdev_aliases(struct block_device *bdev, sector_t block,
                        sector_t len);
/*
 * Single-block convenience wrapper: calls clean_bdev_aliases() on the
 * buffer's own device (b_bdev) and block number (b_blocknr) with len 1.
 */
static inline void clean_bdev_bh_alias(struct buffer_head *bh)
{
        clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1);
}

/*
 * Out-of-line entry points.  Several of them back the inline wrappers in
 * this header: __wait_on_buffer(), __find_get_block(), bdev_getblk(),
 * __brelse(), __bforget(), __breadahead(), __bread_gfp(), unlock_buffer(),
 * __lock_buffer(), bh_uptodate_or_lock(), __bh_read() and
 * __bh_read_batch().
 */
void mark_buffer_async_write(struct buffer_head *bh);
void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
                        unsigned size);
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
                unsigned size, gfp_t gfp);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
struct buffer_head *__bread_gfp(struct block_device *,
                                sector_t block, unsigned size, gfp_t gfp);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh);
void __lock_buffer(struct buffer_head *bh);
int sync_dirty_buffer(struct buffer_head *bh);
int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
void submit_bh(blk_opf_t, struct buffer_head *);
void write_boundary_block(struct block_device *bdev,
                        sector_t bblock, unsigned blocksize);
int bh_uptodate_or_lock(struct buffer_head *bh);
int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
void __bh_read_batch(int nr, struct buffer_head *bhs[],
                     blk_opf_t op_flags, bool force_lock);

/*
 * Generic address_space_operations implementations for buffer_head-backed
 * address_spaces.
 *
 * Most of these take the filesystem's block-mapping callback as a
 * get_block_t pointer.  Two declarations differ: block_write_full_folio()
 * receives it as a plain void *, and block_page_mkwrite() declares the
 * parameter with the function type get_block_t itself.
 */
void block_invalidate_folio(struct folio *folio, size_t offset, size_t length);
int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
                void *get_block);
int __block_write_full_folio(struct inode *inode, struct folio *folio,
                get_block_t *get_block, struct writeback_control *wbc);
int block_read_full_folio(struct folio *, get_block_t *);
bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
                struct page **pagep, get_block_t *get_block);
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block);
int block_write_end(struct file *, struct address_space *,
                                loff_t, unsigned, unsigned,
                                struct page *, void *);
int generic_write_end(struct file *, struct address_space *,
                                loff_t, unsigned, unsigned,
                                struct page *, void *);
void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to);
int cont_write_begin(struct file *, struct address_space *, loff_t,
                        unsigned, struct page **, void **,
                        get_block_t *, loff_t *);
int generic_cont_expand_simple(struct inode *inode, loff_t size);
void block_commit_write(struct page *page, unsigned int from, unsigned int to);
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
                                get_block_t get_block);
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
int block_truncate_page(struct address_space *, loff_t, get_block_t *);

/*
 * Folio migration callbacks.  Without CONFIG_MIGRATION both names expand
 * to NULL, so they can still be used where a pointer value is expected.
 */
#ifdef CONFIG_MIGRATION
extern int buffer_migrate_folio(struct address_space *,
                struct folio *dst, struct folio *src, enum migrate_mode);
extern int buffer_migrate_folio_norefs(struct address_space *,
                struct folio *dst, struct folio *src, enum migrate_mode);
#else
#define buffer_migrate_folio NULL
#define buffer_migrate_folio_norefs NULL
#endif

/*
 * inline definitions
 */

/* Take a reference on @bh: atomically increments bh->b_count. */
static inline void get_bh(struct buffer_head *bh)
{
        atomic_inc(&bh->b_count);
}

/*
 * Drop a reference on @bh: atomically decrements bh->b_count.  Unlike
 * get_bh(), the decrement is preceded by smp_mb__before_atomic().
 * @bh is dereferenced unconditionally, so it must not be NULL.
 */
static inline void put_bh(struct buffer_head *bh)
{
        smp_mb__before_atomic();
        atomic_dec(&bh->b_count);
}

/**
 * brelse - Release a buffer.
 * @bh: The buffer to release.
 *
 * Drop one reference on @bh by handing it to __brelse().  A NULL @bh is
 * accepted and ignored.
 *
 * Once every buffer on a folio is clean, unlocked and has a zero
 * reference count, and the folio itself is unlocked and not under
 * writeback, try_to_free_buffers() may strip the buffers from the folio
 * in preparation for freeing it.  (Sometimes, rarely, buffers are removed
 * from a folio which then ends up not being freed; buffers may later be
 * reattached.)
 *
 * Context: Any context.
 */
static inline void brelse(struct buffer_head *bh)
{
        if (!bh)
                return;

        __brelse(bh);
}

/**
 * bforget - Discard any dirty data in a buffer.
 * @bh: The buffer to forget.
 *
 * Use this in place of brelse() when the data written to a buffer no
 * longer needs to reach disk: the buffer's dirty flag is cleared so that
 * writeback skips it.  A NULL @bh is accepted and ignored.
 *
 * Context: Any context.
 */
static inline void bforget(struct buffer_head *bh)
{
        if (!bh)
                return;

        __bforget(bh);
}

/*
 * Read @block of @sb's block device at the superblock's block size,
 * passing __GFP_MOVABLE to __bread_gfp().
 */
static inline struct buffer_head *
sb_bread(struct super_block *sb, sector_t block)
{
        struct buffer_head *bh;

        bh = __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
        return bh;
}

/*
 * Same as sb_bread() except that no gfp flags (0) are passed to
 * __bread_gfp(), i.e. __GFP_MOVABLE is not requested.
 */
static inline struct buffer_head *
sb_bread_unmovable(struct super_block *sb, sector_t block)
{
        struct buffer_head *bh;

        bh = __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
        return bh;
}

/*
 * Superblock flavour of __breadahead(): supplies sb->s_bdev and
 * sb->s_blocksize for the caller.  Nothing is returned.
 */
static inline void
sb_breadahead(struct super_block *sb, sector_t block)
{
        __breadahead(sb->s_bdev, block, sb->s_blocksize);
}

/*
 * Get the buffer for @block via bdev_getblk().  The gfp mask is the
 * bdev mapping's constraint with __GFP_FS masked out, plus __GFP_NOFAIL;
 * __GFP_MOVABLE is deliberately not added.
 */
static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
                sector_t block, unsigned size)
{
        gfp_t gfp = mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);

        return bdev_getblk(bdev, block, size, gfp | __GFP_NOFAIL);
}

/*
 * Get the buffer for @block via bdev_getblk().  The gfp mask is the
 * bdev mapping's constraint with __GFP_FS masked out, plus
 * __GFP_MOVABLE and __GFP_NOFAIL.
 */
static inline struct buffer_head *__getblk(struct block_device *bdev,
                sector_t block, unsigned size)
{
        gfp_t gfp = mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);

        return bdev_getblk(bdev, block, size,
                           gfp | __GFP_MOVABLE | __GFP_NOFAIL);
}

/* __getblk() on the superblock's device, at the superblock's block size. */
static inline struct buffer_head *sb_getblk(struct super_block *sb,
                sector_t block)
{
        struct buffer_head *bh = __getblk(sb->s_bdev, block, sb->s_blocksize);

        return bh;
}

/*
 * bdev_getblk() on the superblock's device and block size.  @gfp is passed
 * through exactly as given; no flags are added or removed here.
 */
static inline struct buffer_head *sb_getblk_gfp(struct super_block *sb,
                sector_t block, gfp_t gfp)
{
        struct buffer_head *bh;

        bh = bdev_getblk(sb->s_bdev, block, sb->s_blocksize, gfp);
        return bh;
}

/* __find_get_block() on the superblock's device and block size. */
static inline struct buffer_head *
sb_find_get_block(struct super_block *sb, sector_t block)
{
        struct buffer_head *bh;

        bh = __find_get_block(sb->s_bdev, block, sb->s_blocksize);
        return bh;
}

/*
 * Mark @bh as mapped and point it at @block of @sb's device: sets
 * b_bdev, b_blocknr and b_size (the latter from sb->s_blocksize).
 * Note that the mapped bit is set before the three fields are stored.
 */
static inline void
map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
{
        set_buffer_mapped(bh);
        bh->b_bdev = sb->s_bdev;
        bh->b_blocknr = block;
        bh->b_size = sb->s_blocksize;
}

/*
 * Wait for @bh if it is currently locked.  might_sleep() is evaluated
 * unconditionally, even when the buffer turns out not to be locked.
 */
static inline void wait_on_buffer(struct buffer_head *bh)
{
        might_sleep();

        if (!buffer_locked(bh))
                return;

        __wait_on_buffer(bh);
}

/*
 * Try to take BH_Lock without sleeping.
 * Returns 1 if the lock bit was clear and is now owned by the caller,
 * 0 if it was already set.  Success is hinted as the likely outcome.
 */
static inline int trylock_buffer(struct buffer_head *bh)
{
        if (likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)))
                return 1;

        return 0;
}

/*
 * Lock @bh: try the non-sleeping fast path first and fall back to
 * __lock_buffer() only if that fails.  might_sleep() is evaluated
 * unconditionally.
 */
static inline void lock_buffer(struct buffer_head *bh)
{
        might_sleep();

        if (trylock_buffer(bh))
                return;

        __lock_buffer(bh);
}

/*
 * Opportunistic read: does nothing if @bh is already uptodate or if its
 * lock cannot be taken immediately.  Otherwise a non-waiting read is
 * started, unless the buffer became uptodate before the lock was
 * obtained, in which case the lock is simply dropped again.
 */
static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags)
{
        if (buffer_uptodate(bh) || !trylock_buffer(bh))
                return;

        /* Re-check now that we hold the lock. */
        if (buffer_uptodate(bh)) {
                unlock_buffer(bh);
                return;
        }

        __bh_read(bh, op_flags, false);
}

/*
 * Start a read of @bh without waiting for it (wait == false), unless
 * bh_uptodate_or_lock() reports that the buffer is already uptodate.
 * The result of __bh_read() is not propagated.
 */
static inline void bh_read_nowait(struct buffer_head *bh, blk_opf_t op_flags)
{
        if (bh_uptodate_or_lock(bh))
                return;

        __bh_read(bh, op_flags, false);
}

/*
 * Read @bh and wait for the result.
 * Returns 1 if the buffer was already uptodate, 0 on success, and -EIO
 * on error.
 */
static inline int bh_read(struct buffer_head *bh, blk_opf_t op_flags)
{
        return bh_uptodate_or_lock(bh) ? 1 : __bh_read(bh, op_flags, true);
}

/*
 * Batch read of @nr buffers: calls __bh_read_batch() with no extra op
 * flags (0) and force_lock set to true.
 */
static inline void bh_read_batch(int nr, struct buffer_head *bhs[])
{
        __bh_read_batch(nr, bhs, 0, true);
}

/*
 * Batch readahead of @nr buffers: calls __bh_read_batch() with the
 * caller's @op_flags and force_lock set to false.
 */
static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[],
                                      blk_opf_t op_flags)
{
        __bh_read_batch(nr, bhs, op_flags, false);
}

/**
 * __bread() - Read a block.
 * @bdev: The block device to read from.
 * @block: Block number in units of block size.
 * @size: The block size of this device in bytes.
 *
 * Read the given block and return the buffer head that refers to it.
 * The memory comes from the movable area (__GFP_MOVABLE) so that it can
 * be migrated.  The buffer head is returned with its refcount increased;
 * the caller should call brelse() once finished with it.
 *
 * Context: May sleep waiting for I/O.
 * Return: NULL if the block was unreadable.
 */
static inline struct buffer_head *__bread(struct block_device *bdev,
                sector_t block, unsigned size)
{
        struct buffer_head *bh = __bread_gfp(bdev, block, size, __GFP_MOVABLE);

        return bh;
}

/**
 * get_nth_bh - Get a reference on the n'th buffer after this one.
 * @bh: The buffer to start counting from.
 * @count: How many buffers to skip.
 *
 * Mainly useful for finding the nth buffer in a folio: pass the head
 * buffer and the byte offset in the folio divided by the block size.
 * Other uses are possible, but the walk follows b_this_page and so wraps
 * at the end of the folio; it neither returns NULL nor moves on to the
 * next folio.
 *
 * Return: The requested buffer with an elevated refcount.
 */
static inline __must_check
struct buffer_head *get_nth_bh(struct buffer_head *bh, unsigned int count)
{
        unsigned int skipped;

        for (skipped = 0; skipped < count; skipped++)
                bh = bh->b_this_page;

        get_bh(bh);
        return bh;
}

bool block_dirty_folio(struct address_space *mapping, struct folio *folio);

/*
 * With CONFIG_BUFFER_HEAD the functions below are real, out-of-line
 * implementations and buffer_heads_over_limit is a variable.
 */
#ifdef CONFIG_BUFFER_HEAD

void buffer_init(void);
bool try_to_free_buffers(struct folio *folio);
int inode_has_buffers(struct inode *inode);
void invalidate_inode_buffers(struct inode *inode);
int remove_inode_buffers(struct inode *inode);
int sync_mapping_buffers(struct address_space *mapping);
void invalidate_bh_lrus(void);
void invalidate_bh_lrus_cpu(void);
bool has_bh_in_lru(int cpu, void *dummy);
extern int buffer_heads_over_limit;

#else /* CONFIG_BUFFER_HEAD */

/*
 * Stubs for builds without buffer heads.  The void functions do nothing;
 * try_to_free_buffers() reports true, remove_inode_buffers() returns 1,
 * inode_has_buffers() and sync_mapping_buffers() return 0,
 * has_bh_in_lru() reports false and buffer_heads_over_limit is the
 * constant 0.
 */
static inline void buffer_init(void) {}
static inline bool try_to_free_buffers(struct folio *folio) { return true; }
static inline int inode_has_buffers(struct inode *inode) { return 0; }
static inline void invalidate_inode_buffers(struct inode *inode) {}
static inline int remove_inode_buffers(struct inode *inode) { return 1; }
static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
static inline void invalidate_bh_lrus(void) {}
static inline void invalidate_bh_lrus_cpu(void) {}
static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; }
#define buffer_heads_over_limit 0

#endif /* CONFIG_BUFFER_HEAD */
#endif /* _LINUX_BUFFER_HEAD_H */




















































































































    2 











    2 
























































    1 




    1 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *
 * Copyright (C) 2011 Novell Inc.
 * Copyright (C) 2016 Red Hat, Inc.
 */

/*
 * Option values of an overlay instance; embedded in struct ovl_fs as the
 * "config" member, whose comment notes that the pathnames are kept for
 * show_options.
 * NOTE(review): the int members (redirect_mode, verity_mode, uuid, xino)
 * presumably hold enumerated option values defined elsewhere -- confirm.
 */
struct ovl_config {
        char *upperdir;
        char *workdir;
        char **lowerdirs;
        bool default_permissions;
        int redirect_mode;
        int verity_mode;
        bool index;
        int uuid;
        bool nfs_export;
        int xino;
        bool metacopy;
        bool userxattr;
        /* When set, ovl_should_sync() returns false */
        bool ovl_volatile;
};

/*
 * Record for one underlying super_block.  Pointed to by struct ovl_layer
 * ("fs") and by struct ovl_fs ("fs").
 */
struct ovl_sb {
        struct super_block *sb;
        dev_t pseudo_dev;
        /* Unusable (conflicting) uuid */
        bool bad_uuid;
        /* Used as a lower layer (but maybe also as upper) */
        bool is_lower;
};

/*
 * Description of one layer.  struct ovl_fs points at an array of these;
 * ovl_upper_mnt() reads element 0 of that array.
 */
struct ovl_layer {
        /* ovl_free_fs() relies on @mnt being the first member! */
        struct vfsmount *mnt;
        /* Trap in ovl inode cache */
        struct inode *trap;
        struct ovl_sb *fs;
        /* Index of this layer in fs root (upper idx == 0) */
        int idx;
        /* One fsid per unique underlying sb (upper fsid == 0) */
        int fsid;
        /* xwhiteouts were found on this layer */
        bool has_xwhiteouts;
};

/*
 * A (layer, dentry) pair; struct ovl_entry stores an array of these.
 * ovl_lowerdata_dentry() reads @dentry with READ_ONCE().
 */
struct ovl_path {
        const struct ovl_layer *layer;
        struct dentry *dentry;
};

/*
 * Variable-length stack of lower paths.  The double-underscore members
 * are read through ovl_numlower() and ovl_lowerstack(), which also cope
 * with a NULL entry.
 */
struct ovl_entry {
        unsigned int __numlower;
        /* Flexible array member; must remain last */
        struct ovl_path __lowerstack[];
};

/*
 * private information held for overlayfs's superblock
 *
 * OVL_FS() returns this structure from sb->s_fs_info.
 */
struct ovl_fs {
        unsigned int numlayer;
        /* Number of unique fs among layers including upper fs */
        unsigned int numfs;
        /* Number of data-only lower layers */
        unsigned int numdatalayer;
        /* Array of layers; element 0 is what ovl_upper_mnt() reads */
        struct ovl_layer *layers;
        struct ovl_sb *fs;
        /* workbasedir is the path at workdir= mount option */
        struct dentry *workbasedir;
        /* workdir is the 'work' or 'index' directory under workbasedir */
        struct dentry *workdir;
        long namelen;
        /* pathnames of lower and upper dirs, for show_options */
        struct ovl_config config;
        /* creds of process who forced instantiation of super block */
        const struct cred *creator_cred;
        bool tmpfile;
        bool noxattr;
        bool nofh;
        /* Did we take the inuse lock? */
        bool upperdir_locked;
        bool workdir_locked;
        /* Traps in ovl inode cache */
        struct inode *workbasedir_trap;
        struct inode *workdir_trap;
        /* -1: disabled, 0: same fs, 1..32: number of unused ino bits */
        int xino_mode;
        /* For allocation of non-persistent inode numbers */
        atomic_long_t last_ino;
        /* Shared whiteout cache */
        struct dentry *whiteout;
        bool no_shared_whiteout;
        /* r/o snapshot of upperdir sb's only taken on volatile mounts */
        errseq_t errseq;
};

/*
 * Number of lower layers, not including data-only layers: all layers
 * minus the data-only ones, minus one more.
 */
static inline unsigned int ovl_numlowerlayer(struct ovl_fs *ofs)
{
        unsigned int nondata = ofs->numlayer - ofs->numdatalayer;

        return nondata - 1;
}

static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs)
{
        return ofs->layers[0].mnt;
}

/* The idmap of the mount returned by ovl_upper_mnt(). */
static inline struct mnt_idmap *ovl_upper_mnt_idmap(struct ovl_fs *ofs)
{
        struct vfsmount *upper = ovl_upper_mnt(ofs);

        return mnt_idmap(upper);
}

extern struct file_system_type ovl_fs_type;

/*
 * Return the overlayfs private data stored in sb->s_fs_info.
 * With CONFIG_OVERLAY_FS_DEBUG enabled, warn once if @sb does not belong
 * to ovl_fs_type; the pointer is returned either way.
 */
static inline struct ovl_fs *OVL_FS(struct super_block *sb)
{
        struct ovl_fs *ofs = (struct ovl_fs *)sb->s_fs_info;

        if (IS_ENABLED(CONFIG_OVERLAY_FS_DEBUG))
                WARN_ON_ONCE(sb->s_type != &ovl_fs_type);

        return ofs;
}

/* True unless the instance was configured volatile (config.ovl_volatile). */
static inline bool ovl_should_sync(struct ovl_fs *ofs)
{
        bool is_volatile = ofs->config.ovl_volatile;

        return !is_volatile;
}

/* Number of entries in @oe's lower stack; a NULL @oe counts as 0. */
static inline unsigned int ovl_numlower(struct ovl_entry *oe)
{
        if (!oe)
                return 0;

        return oe->__numlower;
}

/*
 * First element of @oe's lower stack, or NULL when @oe is NULL or the
 * stack is empty.
 */
static inline struct ovl_path *ovl_lowerstack(struct ovl_entry *oe)
{
        if (!ovl_numlower(oe))
                return NULL;

        return oe->__lowerstack;
}

/* Currently identical to ovl_lowerstack(): the first lower path or NULL. */
static inline struct ovl_path *ovl_lowerpath(struct ovl_entry *oe)
{
        struct ovl_path *first = ovl_lowerstack(oe);

        return first;
}

/*
 * Last element of @oe's lower stack, or NULL when there is no stack
 * (NULL @oe or zero entries).
 */
static inline struct ovl_path *ovl_lowerdata(struct ovl_entry *oe)
{
        struct ovl_path *stack = ovl_lowerstack(oe);

        if (!stack)
                return NULL;

        return stack + (oe->__numlower - 1);
}

/*
 * Dentry of the last lower path, read with READ_ONCE().
 * May return NULL if lazy lookup of lowerdata is needed; also returns
 * NULL when @oe has no lower stack at all.
 */
static inline struct dentry *ovl_lowerdata_dentry(struct ovl_entry *oe)
{
        struct ovl_path *last = ovl_lowerdata(oe);

        if (!last)
                return NULL;

        return READ_ONCE(last->dentry);
}

/*
 * private information held for every overlayfs dentry
 *
 * The storage of dentry->d_fsdata itself is reinterpreted as an unsigned
 * long; the returned pointer addresses that field, not what it points to.
 */
static inline unsigned long *OVL_E_FLAGS(struct dentry *dentry)
{
        unsigned long *flags = (unsigned long *) &dentry->d_fsdata;

        return flags;
}

/*
 * overlayfs inode.  The VFS inode is embedded as @vfs_inode; OVL_I()
 * converts a struct inode pointer back to its container.
 */
struct ovl_inode {
        /* Which member is valid depends on the inode type, as noted */
        union {
                struct ovl_dir_cache *cache;        /* directory */
                const char *lowerdata_redirect;        /* regular file */
        };
        const char *redirect;
        u64 version;
        unsigned long flags;
        struct inode vfs_inode;
        /* Read via ovl_upperdentry_dereference(), which uses READ_ONCE() */
        struct dentry *__upperdentry;
        /* Returned by OVL_I_E() */
        struct ovl_entry *oe;

        /* synchronize copy up and more */
        struct mutex lock;
};

/*
 * Map an embedded VFS inode back to its enclosing struct ovl_inode.
 * @inode must be the vfs_inode member of an ovl_inode.
 */
static inline struct ovl_inode *OVL_I(struct inode *inode)
{
        struct ovl_inode *oi = container_of(inode, struct ovl_inode, vfs_inode);

        return oi;
}

/* The ovl_entry attached to @inode, or NULL when @inode is NULL. */
static inline struct ovl_entry *OVL_I_E(struct inode *inode)
{
        if (!inode)
                return NULL;

        return OVL_I(inode)->oe;
}

/*
 * The ovl_entry of @dentry's inode.  Because OVL_I_E() tolerates a NULL
 * inode, this yields NULL when d_inode() returns NULL.
 */
static inline struct ovl_entry *OVL_E(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        return OVL_I_E(inode);
}

/* Read oi->__upperdentry exactly once, via READ_ONCE(). */
static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi)
{
        struct dentry *upper = READ_ONCE(oi->__upperdentry);

        return upper;
}
































    2 
    2 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// SPDX-License-Identifier: GPL-2.0-only
/*
 * A generic implementation of binary search for the Linux kernel
 *
 * Copyright (C) 2008-2009 Ksplice, Inc.
 * Author: Tim Abbott <tabbott@ksplice.com>
 */

#include <linux/export.h>
#include <linux/bsearch.h>
#include <linux/kprobes.h>

/*
 * bsearch - binary search an array of elements
 * @key: pointer to item being searched for
 * @base: pointer to first element to search
 * @num: number of elements
 * @size: size of each element
 * @cmp: pointer to comparison function
 *
 * Binary-searches the given array, which must already be sorted in
 * ascending order under @cmp.  This is the out-of-line, exported wrapper
 * around __inline_bsearch() and returns whatever that helper returns.
 *
 * The key does not have to be of the same type as the array elements:
 * it may, for instance, be a string that @cmp compares against a name
 * field of each element.  When key and elements do share a type, one
 * comparison function can serve both sort() and bsearch().
 */
void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
        void *match = __inline_bsearch(key, base, num, size, cmp);

        return match;
}
EXPORT_SYMBOL(bsearch);
NOKPROBE_SYMBOL(bsearch);



































































































   54 































































































































   54 









































































































































   51 



   56 























   57 








   53 
   57 














   50 

































   51 


   51 













   51 












   53 











   54 
   52 









   55 











   54 



   53 































































   52 










   50 














































   54 




   53 
















   55 






   51 














   53 

























   55 















   52 



























   57 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/objtool.h>
#include <linux/module.h>
#include <linux/sort.h>
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>
#include <asm/orc_types.h>
#include <asm/orc_lookup.h>
#include <asm/orc_header.h>

ORC_HEADER;

/*
 * Emit a deferred, print-once kernel warning.  The text "WARNING: " is
 * prepended here, so callers should not repeat it in @fmt.
 */
#define orc_warn(fmt, ...) \
        printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__)

/*
 * orc_warn() restricted to unwinds of the current task that have not
 * already flagged an error.  Requires a variable named "state" in the
 * caller's scope.
 *
 * When unwind_debug is set, unwind_dump() is additionally called the
 * first time a given expansion site warns; the static flag is local to
 * each expansion of the macro.
 */
#define orc_warn_current(args...)                                        \
({                                                                        \
        static bool dumped_before;                                        \
        if (state->task == current && !state->error) {                        \
                orc_warn(args);                                                \
                if (unwind_debug && !dumped_before) {                        \
                        dumped_before = true;                                \
                        unwind_dump(state);                                \
                }                                                        \
        }                                                                \
})

/*
 * Start/stop markers of the built-in ORC tables: the ip table holds ints
 * that orc_ip() decodes relative to their own address, the unwind table
 * holds the matching orc_entry records.
 * NOTE(review): presumably provided by the linker script -- confirm.
 */
extern int __start_orc_unwind_ip[];
extern int __stop_orc_unwind_ip[];
extern struct orc_entry __start_orc_unwind[];
extern struct orc_entry __stop_orc_unwind[];

static bool orc_init __ro_after_init;
/* Set by the "unwind_debug" early parameter */
static bool unwind_debug __ro_after_init;
/* Used by orc_find() to bounds-check the fast lookup index */
static unsigned int lookup_num_blocks __ro_after_init;

/*
 * Handler for the "unwind_debug" early parameter: turns on unwind_debug.
 * @str is ignored, so any value given to the parameter enables it.
 * Always returns 0.
 */
static int __init unwind_debug_cmdline(char *str)
{
        unwind_debug = true;

        return 0;
}
early_param("unwind_debug", unwind_debug_cmdline);

/*
 * Dump the unwinder's view of @state followed by the raw stack contents.
 *
 * Runs at most once per boot (guarded by a function-local static).  The
 * walk starts at this function's own frame address and follows
 * stack_info.next_sp from stack to stack until get_stack_info() fails or
 * the next pointer is NULL.  Every word is printed with its address and
 * its %pB symbolization.
 */
static void unwind_dump(struct unwind_state *state)
{
        static bool dumped_before;
        struct stack_info stack_info = {0};
        unsigned long visit_mask = 0;
        unsigned long *sp = __builtin_frame_address(0);

        if (dumped_before)
                return;

        dumped_before = true;

        printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n",
                        state->stack_info.type, state->stack_info.next_sp,
                        state->stack_mask, state->graph_idx);

        while (sp) {
                if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
                        break;

                /* Print every word up to the end of the current stack */
                while (sp < stack_info.end) {
                        unsigned long word = READ_ONCE_NOCHECK(*sp);

                        printk_deferred("%0*lx: %0*lx (%pB)\n", BITS_PER_LONG/4,
                                        (unsigned long)sp, BITS_PER_LONG/4,
                                        word, (void *)word);
                        sp++;
                }

                /* Move on to the stack this one was entered from */
                sp = PTR_ALIGN(stack_info.next_sp, sizeof(long));
        }
}

/*
 * Decode one ip-table slot: the stored int is a signed offset relative
 * to the address of the slot itself.
 */
static inline unsigned long orc_ip(const int *ip)
{
        const unsigned long slot_addr = (unsigned long)ip;

        return slot_addr + *ip;
}

/*
 * Find the ORC entry covering @ip in a sorted table.
 *
 * @ip_table and @u_table are parallel arrays of @num_entries elements.
 * Returns the @u_table element paired with the rightmost ip-table slot
 * whose decoded address is <= @ip; if no slot qualifies, the first
 * element is returned.  Returns NULL only for an empty table.
 */
static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
                                    unsigned int num_entries, unsigned long ip)
{
        int *lo, *hi, *best;

        if (!num_entries)
                return NULL;

        lo = ip_table;
        hi = ip_table + num_entries - 1;
        best = lo;

        /*
         * Binary range search for the rightmost duplicate of a given
         * starting address.  Some entries are section terminators: "weak"
         * entries that only exist so there are no gaps.  They should be
         * ignored when they conflict with a real entry, which is why the
         * search keeps moving right on equality.
         */
        while (lo <= hi) {
                int *probe = lo + ((hi - lo) / 2);

                if (orc_ip(probe) > ip) {
                        hi = probe - 1;
                        continue;
                }

                best = probe;
                lo = probe + 1;
        }

        return u_table + (best - ip_table);
}

#ifdef CONFIG_MODULES
/*
 * Look @ip up in the ORC tables of the module that contains it.
 * Returns NULL if no module owns @ip or the module has no ORC tables.
 */
static struct orc_entry *orc_module_find(unsigned long ip)
{
        struct module *mod = __module_address(ip);

        if (!mod)
                return NULL;

        if (!mod->arch.orc_unwind || !mod->arch.orc_unwind_ip)
                return NULL;

        return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind,
                          mod->arch.num_orcs, ip);
}
#else
/* No module support: there is never a module entry to find. */
static struct orc_entry *orc_module_find(unsigned long ip)
{
        return NULL;
}
#endif

#ifdef CONFIG_DYNAMIC_FTRACE
static struct orc_entry *orc_find(unsigned long ip);

/*
 * Dynamic ftrace trampolines carry no orc entries of their own.  They
 * are, however, copies of the static ftrace entry code defined in
 * ftrace_*.S, and that code does have orc entries.
 *
 * So when the unwinder lands in a trampoline, work out which static
 * ftrace function it was copied from and use the orc entry at the
 * equivalent offset there: the placement of the return code on the
 * stack will be identical.
 */
static struct orc_entry *orc_ftrace_find(unsigned long ip)
{
        struct ftrace_ops *ops = ftrace_ops_trampoline(ip);
        unsigned long tramp_addr;

        if (!ops)
                return NULL;

        /* Start of the code that the trampoline was copied from */
        tramp_addr = (ops->flags & FTRACE_OPS_FL_SAVE_REGS) ?
                        (unsigned long)ftrace_regs_caller :
                        (unsigned long)ftrace_caller;

        /* Advance to the spot matching ip's position in the trampoline */
        tramp_addr += ip - ops->trampoline;

        /* Prevent unlikely recursion */
        if (ip == tramp_addr)
                return NULL;

        return orc_find(tramp_addr);
}
#else
/* No dynamic ftrace: there are no trampolines to resolve. */
static struct orc_entry *orc_ftrace_find(unsigned long ip)
{
        return NULL;
}
#endif

/*
 * If we crash with IP==0, the last successfully executed instruction
 * was probably an indirect function call with a NULL function pointer,
 * and we don't have unwind information for NULL.
 * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function
 * pointer into its parent and then continue normally from there.
 *
 * orc_find() returns this entry for ip == 0.  It is SP-based with an
 * offset of one long, and BP is marked undefined.
 */
static struct orc_entry null_orc_entry = {
        .sp_offset = sizeof(long),
        .sp_reg = ORC_REG_SP,
        .bp_reg = ORC_REG_UNDEFINED,
        .type = ORC_TYPE_CALL
};

/*
 * Fake frame pointer entry -- used as a fallback for generated code
 *
 * Encodes: previous SP = BP + 16, previous BP stored at previous SP - 16.
 */
static struct orc_entry orc_fp_entry = {
        .type                = ORC_TYPE_CALL,
        .sp_reg                = ORC_REG_BP,
        .sp_offset        = 16,
        .bp_reg                = ORC_REG_PREV_SP,
        .bp_offset        = -16,
};

/*
 * Find the ORC entry for text address @ip.
 *
 * Lookup order:
 *   1. ip == 0 yields the hardcoded null_orc_entry.
 *   2. Addresses in [LOOKUP_START_IP, LOOKUP_STOP_IP) go through the
 *      orc_lookup[] block table to narrow the search range.
 *   3. vmlinux init text searches the whole table.
 *   4. Otherwise module tables are tried, then ftrace trampolines.
 *
 * Returns the entry, or NULL if none was found or the lookup table
 * contents are inconsistent (a warning is emitted in that case).
 */
static struct orc_entry *orc_find(unsigned long ip)
{
        /*
         * This must be an automatic variable: a static one would be shared
         * by every context running the unwinder, so a concurrent lookup
         * could overwrite the result between the assignment and the test
         * below.
         */
        struct orc_entry *orc;

        if (ip == 0)
                return &null_orc_entry;

        /* For non-init vmlinux addresses, use the fast lookup table: */
        if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
                unsigned int idx, start, stop;

                idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;

                /* idx + 1 is read below, so the last slot is not a valid idx. */
                if (unlikely((idx >= lookup_num_blocks-1))) {
                        orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%pB\n",
                                 idx, lookup_num_blocks, (void *)ip);
                        return NULL;
                }

                /* Search window: from this block's entry through the next block's entry. */
                start = orc_lookup[idx];
                stop = orc_lookup[idx + 1] + 1;

                /* Reject windows that fall outside the .orc_unwind table. */
                if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
                             (__start_orc_unwind + stop > __stop_orc_unwind))) {
                        orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%pB\n",
                                 idx, lookup_num_blocks, start, stop, (void *)ip);
                        return NULL;
                }

                return __orc_find(__start_orc_unwind_ip + start,
                                  __start_orc_unwind + start, stop - start, ip);
        }

        /* vmlinux .init slow lookup: */
        if (is_kernel_inittext(ip))
                return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
                                  __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);

        /* Module lookup: */
        orc = orc_module_find(ip);
        if (orc)
                return orc;

        return orc_ftrace_find(ip);
}

#ifdef CONFIG_MODULES

/* Held by unwind_module_init() while it sets the cur_orc_* globals and sorts. */
static DEFINE_MUTEX(sort_mutex);
/*
 * Tables that orc_sort_swap() and orc_sort_cmp() use to map an
 * .orc_unwind_ip slot to its .orc_unwind entry.  They initially point at
 * the vmlinux tables.
 */
static int *cur_orc_ip_table = __start_orc_unwind_ip;
static struct orc_entry *cur_orc_table = __start_orc_unwind;

/*
 * sort() swap callback: exchange two .orc_unwind_ip slots together with
 * their matching .orc_unwind entries in the cur_orc_* tables.
 */
static void orc_sort_swap(void *_a, void *_b, int size)
{
        int *ip_a = _a;
        int *ip_b = _b;
        int delta = _b - _a;
        int saved = *ip_a;

        /*
         * Each stored value is adjusted by the byte distance between the
         * two slots as it moves from one slot to the other.
         */
        *ip_a = *ip_b + delta;
        *ip_b = saved - delta;

        /* The .orc_unwind entries sit at the same indices; swap them too. */
        swap(cur_orc_table[ip_a - cur_orc_ip_table],
             cur_orc_table[ip_b - cur_orc_ip_table]);
}

/*
 * sort() comparison callback: order .orc_unwind_ip slots by the address
 * orc_ip() yields for them.  It never returns 0.
 */
static int orc_sort_cmp(const void *_a, const void *_b)
{
        const int *a = _a, *b = _b;
        unsigned long ip_a = orc_ip(a);
        unsigned long ip_b = orc_ip(b);

        if (ip_a != ip_b)
                return ip_a > ip_b ? 1 : -1;

        /*
         * Same address: the "weak" section terminator entries need to sort
         * first so the lookup code skips them in favor of real entries.
         * These terminators exist to cover gaps created by whitelisted .o
         * files which didn't get objtool generation.
         */
        if (cur_orc_table[a - cur_orc_ip_table].type == ORC_TYPE_UNDEFINED)
                return -1;

        return 1;
}

/*
 * Sort a module's ORC tables and record them in mod->arch.
 *
 * @_orc_ip/@orc_ip_size: the module's .orc_unwind_ip table and its byte size.
 * @_orc/@orc_size:       the module's .orc_unwind table and its byte size.
 */
void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
                        void *_orc, size_t orc_size)
{
        unsigned int num_entries = orc_ip_size / sizeof(int);
        struct orc_entry *orc = _orc;
        int *orc_ip = _orc_ip;

        /* Both tables must hold whole entries and the same number of them. */
        WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 ||
                     orc_size % sizeof(*orc) != 0 ||
                     num_entries != orc_size / sizeof(*orc));

        /*
         * The sort callbacks find the .orc_unwind entry that belongs to an
         * .orc_unwind_ip slot through the 'cur_orc_*' globals, so publish
         * this module's tables there for the duration of the sort.
         */
        mutex_lock(&sort_mutex);
        cur_orc_table = orc;
        cur_orc_ip_table = orc_ip;
        sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap);
        mutex_unlock(&sort_mutex);

        mod->arch.num_orcs = num_entries;
        mod->arch.orc_unwind = orc;
        mod->arch.orc_unwind_ip = orc_ip;
}
#endif

/*
 * Validate the vmlinux ORC tables and fill in the orc_lookup[] block table.
 * orc_init is only set once everything succeeded; on any failure a warning
 * is printed and the function returns with orc_init left untouched.
 */
void __init unwind_init(void)
{
        size_t ip_bytes = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip;
        size_t orc_bytes = (void *)__stop_orc_unwind - (void *)__start_orc_unwind;
        size_t nr_entries = ip_bytes / sizeof(int);
        struct orc_entry *found;
        int blk;

        /* Tables must be non-empty, hold whole entries, and match in count. */
        if (!nr_entries ||
            ip_bytes % sizeof(int) != 0 ||
            orc_bytes % sizeof(struct orc_entry) != 0 ||
            nr_entries != orc_bytes / sizeof(struct orc_entry)) {
                orc_warn("WARNING: Bad or missing .orc_unwind table.  Disabling unwinder.\n");
                return;
        }

        /*
         * No sorting here: the 'sorttable' tool already sorted both
         * orc_unwind and orc_unwind_ip at build time, so they can be
         * binary searched right away.
         */

        /* One lookup slot per block start address... */
        lookup_num_blocks = orc_lookup_end - orc_lookup;
        for (blk = 0; blk < lookup_num_blocks-1; blk++) {
                found = __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
                                   nr_entries,
                                   LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * blk));
                if (!found)
                        goto corrupt;

                orc_lookup[blk] = found - __start_orc_unwind;
        }

        /* ...plus a final slot describing the end of the lookup range. */
        found = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, nr_entries,
                           LOOKUP_STOP_IP);
        if (!found)
                goto corrupt;
        orc_lookup[lookup_num_blocks-1] = found - __start_orc_unwind;

        orc_init = true;
        return;

corrupt:
        orc_warn("WARNING: Corrupt .orc_unwind table.  Disabling unwinder.\n");
}

/*
 * Return the current frame's text address, or 0 if the unwind is done or
 * the address is not a kernel text address.
 */
unsigned long unwind_get_return_address(struct unwind_state *state)
{
        if (unwind_done(state) || !__kernel_text_address(state->ip))
                return 0;

        return state->ip;
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);

/*
 * Return a pointer to the location holding the current frame's return
 * address: the ip field of state->regs when regs are present, else the
 * stack slot just below state->sp.  Returns NULL if the unwind is done or
 * neither is available.
 */
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
{
        unsigned long *slot = NULL;

        if (unwind_done(state))
                return NULL;

        if (state->regs)
                slot = &state->regs->ip;
        else if (state->sp)
                slot = (unsigned long *)state->sp - 1;

        return slot;
}

/*
 * Check that [_addr, _addr + len) lies on a known stack.  If it is not on
 * the stack currently described by state->stack_info, try to look up the
 * stack containing the address (updating stack_info) and check again.
 */
static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
                            size_t len)
{
        struct stack_info *info = &state->stack_info;
        void *addr = (void *)_addr;

        /* Common case: still on the current stack. */
        if (on_stack(info, addr, len))
                return true;

        /* A non-zero return means no stack info could be obtained. */
        if (get_stack_info(addr, state->task, info, &state->stack_mask))
                return false;

        return on_stack(info, addr, len);
}

/*
 * Read one unsigned long from stack address @addr into @val.
 * Returns false, leaving @val untouched, if the address fails validation.
 */
static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
                            unsigned long *val)
{
        if (stack_access_ok(state, addr, sizeof(long))) {
                *val = READ_ONCE_NOCHECK(*(unsigned long *)addr);
                return true;
        }

        return false;
}

/*
 * Treat @addr as a full struct pt_regs on the stack and fetch its ip and
 * sp fields.  Returns false, leaving the outputs untouched, if the whole
 * structure is not on an accessible stack.
 */
static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
                             unsigned long *ip, unsigned long *sp)
{
        struct pt_regs *regs = (struct pt_regs *)addr;

        /* x86-32 support will be more complicated due to the &regs->sp hack */
        BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));

        if (stack_access_ok(state, addr, sizeof(*regs))) {
                *ip = READ_ONCE_NOCHECK(regs->ip);
                *sp = READ_ONCE_NOCHECK(regs->sp);
                return true;
        }

        return false;
}

/*
 * Like deref_stack_regs(), but only IRET_FRAME_SIZE bytes starting at
 * @addr are validated; the pt_regs pointer is formed by backing up
 * IRET_FRAME_OFFSET bytes from @addr.
 */
static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
                                  unsigned long *ip, unsigned long *sp)
{
        struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;

        if (stack_access_ok(state, addr, IRET_FRAME_SIZE)) {
                *ip = READ_ONCE_NOCHECK(regs->ip);
                *sp = READ_ONCE_NOCHECK(regs->sp);
                return true;
        }

        return false;
}

/*
 * If state->regs is non-NULL, and points to a full pt_regs, just get the reg
 * value from state->regs.
 *
 * Otherwise, if state->regs just points to IRET regs, and the previous frame
 * had full regs, it's safe to get the value from the previous regs.  This can
 * happen when early/late IRQ entry code gets interrupted by an NMI.
 */
static bool get_reg(struct unwind_state *state, unsigned int reg_off,
                    unsigned long *val)
{
        /* Turn the byte offset into an index of 8-byte register slots. */
        unsigned int reg = reg_off/8;
        unsigned long *slots;

        if (!state->regs)
                return false;

        /* Prefer the current full regs; otherwise fall back to the saved ones. */
        if (state->full_regs)
                slots = (unsigned long *)state->regs;
        else if (state->prev_regs)
                slots = (unsigned long *)state->prev_regs;
        else
                return false;

        *val = READ_ONCE_NOCHECK(slots[reg]);
        return true;
}

/*
 * Advance @state by one stack frame.
 *
 * The ORC entry for the current text address tells how to compute the
 * previous frame's SP; from there the previous IP (and, for register
 * frames, the pt_regs location) and BP are recovered.
 *
 * Returns true when @state now describes the previous frame.  Note that
 * state->error may already be set in that case if the frame pointer
 * fallback had to be used.  Returns false when the unwind was already
 * done, the end of the stack was reached, or an error occurred (in which
 * case state->error is set).  Every false return after the initial
 * unwind_done() check also resets the stack type to STACK_TYPE_UNKNOWN.
 */
bool unwind_next_frame(struct unwind_state *state)
{
        unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp;
        enum stack_type prev_type = state->stack_info.type;
        struct orc_entry *orc;
        bool indirect = false;

        if (unwind_done(state))
                return false;

        /* Don't let modules unload while we're reading their ORC data. */
        preempt_disable();

        /* End-of-stack check for user tasks: */
        if (state->regs && user_mode(state->regs))
                goto the_end;

        /*
         * Find the orc_entry associated with the text address.
         *
         * For a call frame (as opposed to a signal frame), state->ip points to
         * the instruction after the call.  That instruction's stack layout
         * could be different from the call instruction's layout, for example
         * if the call was to a noreturn function.  So get the ORC data for the
         * call instruction itself.
         */
        orc = orc_find(state->signal ? state->ip : state->ip - 1);
        if (!orc) {
                /*
                 * As a fallback, try to assume this code uses a frame pointer.
                 * This is useful for generated code, like BPF, which ORC
                 * doesn't know about.  This is just a guess, so the rest of
                 * the unwind is no longer considered reliable.
                 */
                orc = &orc_fp_entry;
                state->error = true;
        } else {
                if (orc->type == ORC_TYPE_UNDEFINED)
                        goto err;

                if (orc->type == ORC_TYPE_END_OF_STACK)
                        goto the_end;
        }

        /* Carry the entry's signal flag over for the next lookup. */
        state->signal = orc->signal;

        /* Find the previous frame's stack: */
        switch (orc->sp_reg) {
        case ORC_REG_SP:
                sp = state->sp + orc->sp_offset;
                break;

        case ORC_REG_BP:
                sp = state->bp + orc->sp_offset;
                break;

        case ORC_REG_SP_INDIRECT:
                /* The offset is applied after the dereference below. */
                sp = state->sp;
                indirect = true;
                break;

        case ORC_REG_BP_INDIRECT:
                /* Here the offset is applied before the dereference. */
                sp = state->bp + orc->sp_offset;
                indirect = true;
                break;

        case ORC_REG_R10:
                if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) {
                        orc_warn_current("missing R10 value at %pB\n",
                                         (void *)state->ip);
                        goto err;
                }
                break;

        case ORC_REG_R13:
                if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) {
                        orc_warn_current("missing R13 value at %pB\n",
                                         (void *)state->ip);
                        goto err;
                }
                break;

        case ORC_REG_DI:
                if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) {
                        orc_warn_current("missing RDI value at %pB\n",
                                         (void *)state->ip);
                        goto err;
                }
                break;

        case ORC_REG_DX:
                if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) {
                        orc_warn_current("missing DX value at %pB\n",
                                         (void *)state->ip);
                        goto err;
                }
                break;

        default:
                orc_warn("unknown SP base reg %d at %pB\n",
                         orc->sp_reg, (void *)state->ip);
                goto err;
        }

        /* For the indirect forms, sp so far holds the address to load SP from. */
        if (indirect) {
                if (!deref_stack_reg(state, sp, &sp))
                        goto err;

                if (orc->sp_reg == ORC_REG_SP_INDIRECT)
                        sp += orc->sp_offset;
        }

        /* Find IP, SP and possibly regs: */
        switch (orc->type) {
        case ORC_TYPE_CALL:
                /* The return address sits in the slot just below the previous SP. */
                ip_p = sp - sizeof(long);

                if (!deref_stack_reg(state, ip_p, &state->ip))
                        goto err;

                state->ip = unwind_recover_ret_addr(state, state->ip,
                                                    (unsigned long *)ip_p);
                state->sp = sp;
                state->regs = NULL;
                state->prev_regs = NULL;
                break;

        case ORC_TYPE_REGS:
                /* sp points at a full pt_regs; take IP and SP from it. */
                if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
                        orc_warn_current("can't access registers at %pB\n",
                                         (void *)orig_ip);
                        goto err;
                }
                /*
                 * There is a small chance to interrupt at the entry of
                 * arch_rethook_trampoline() where the ORC info doesn't exist.
                 * That point is right after the RET to arch_rethook_trampoline()
                 * which was modified return address.
                 * At that point, the @addr_p of the unwind_recover_rethook()
                 * (this has to point the address of the stack entry storing
                 * the modified return address) must be "SP - (a stack entry)"
                 * because SP is incremented by the RET.
                 */
                state->ip = unwind_recover_rethook(state, state->ip,
                                (unsigned long *)(state->sp - sizeof(long)));
                state->regs = (struct pt_regs *)sp;
                state->prev_regs = NULL;
                state->full_regs = true;
                break;

        case ORC_TYPE_REGS_PARTIAL:
                /* sp points at an IRET frame only; the rest of pt_regs is not valid. */
                if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
                        orc_warn_current("can't access iret registers at %pB\n",
                                         (void *)orig_ip);
                        goto err;
                }
                /* See ORC_TYPE_REGS case comment. */
                state->ip = unwind_recover_rethook(state, state->ip,
                                (unsigned long *)(state->sp - sizeof(long)));

                /* Keep the last full regs around so get_reg() can still use them. */
                if (state->full_regs)
                        state->prev_regs = state->regs;
                state->regs = (void *)sp - IRET_FRAME_OFFSET;
                state->full_regs = false;
                break;

        default:
                orc_warn("unknown .orc_unwind entry type %d at %pB\n",
                         orc->type, (void *)orig_ip);
                goto err;
        }

        /* Find BP: */
        switch (orc->bp_reg) {
        case ORC_REG_UNDEFINED:
                /* No BP info in the entry: take it from regs if available, else keep it. */
                if (get_reg(state, offsetof(struct pt_regs, bp), &tmp))
                        state->bp = tmp;
                break;

        case ORC_REG_PREV_SP:
                if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
                        goto err;
                break;

        case ORC_REG_BP:
                if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
                        goto err;
                break;

        default:
                orc_warn("unknown BP base reg %d for ip %pB\n",
                         orc->bp_reg, (void *)orig_ip);
                goto err;
        }

        /* Prevent a recursive loop due to bad ORC data: */
        if (state->stack_info.type == prev_type &&
            on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
            state->sp <= prev_sp) {
                orc_warn_current("stack going in the wrong direction? at %pB\n",
                                 (void *)orig_ip);
                goto err;
        }

        preempt_enable();
        return true;

err:
        state->error = true;

the_end:
        preempt_enable();
        state->stack_info.type = STACK_TYPE_UNKNOWN;
        return false;
}
EXPORT_SYMBOL_GPL(unwind_next_frame);

/*
 * Initialize @state for unwinding @task's stack.
 *
 * The starting IP/SP/BP come from @regs when given, from the live CPU
 * registers when @task is current, and from the inactive_task_frame at
 * task->thread.sp otherwise.  The state is then advanced past the regs
 * frame (when @regs was given) or until the frame containing @first_frame
 * is reached.
 *
 * If the unwinder is not initialized or @task is running on another CPU,
 * state->error is set and the stack type is STACK_TYPE_UNKNOWN.  User-mode
 * @regs also yield STACK_TYPE_UNKNOWN, but without setting the error flag.
 */
void __unwind_start(struct unwind_state *state, struct task_struct *task,
                    struct pt_regs *regs, unsigned long *first_frame)
{
        memset(state, 0, sizeof(*state));
        state->task = task;

        if (!orc_init)
                goto err;

        /*
         * Refuse to unwind the stack of a task while it's executing on another
         * CPU.  This check is racy, but that's ok: the unwinder has other
         * checks to prevent it from going off the rails.
         */
        if (task_on_another_cpu(task))
                goto err;

        if (regs) {
                /* Nothing to unwind for user-mode registers. */
                if (user_mode(regs))
                        goto the_end;

                state->ip = regs->ip;
                state->sp = regs->sp;
                state->bp = regs->bp;
                state->regs = regs;
                state->full_regs = true;
                state->signal = true;

        } else if (task == current) {
                /* Snapshot the live instruction, stack and frame pointers. */
                asm volatile("lea (%%rip), %0\n\t"
                             "mov %%rsp, %1\n\t"
                             "mov %%rbp, %2\n\t"
                             : "=r" (state->ip), "=r" (state->sp),
                               "=r" (state->bp));

        } else {
                /* The task's saved SP is interpreted as an inactive_task_frame. */
                struct inactive_task_frame *frame = (void *)task->thread.sp;

                state->sp = task->thread.sp + sizeof(*frame);
                state->bp = READ_ONCE_NOCHECK(frame->bp);
                state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
                state->signal = (void *)state->ip == ret_from_fork;
        }

        if (get_stack_info((unsigned long *)state->sp, state->task,
                           &state->stack_info, &state->stack_mask)) {
                /*
                 * We weren't on a valid stack.  It's possible that
                 * we overflowed a valid stack into a guard page.
                 * See if the next page up is valid so that we can
                 * generate some kind of backtrace if this happens.
                 */
                void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
                state->error = true;
                if (get_stack_info(next_page, state->task, &state->stack_info,
                                   &state->stack_mask))
                        return;
        }

        /*
         * The caller can provide the address of the first frame directly
         * (first_frame) or indirectly (regs->sp) to indicate which stack frame
         * to start unwinding at.  Skip ahead until we reach it.
         */

        /* When starting from regs, skip the regs frame: */
        if (regs) {
                unwind_next_frame(state);
                return;
        }

        /* Otherwise, skip ahead to the user-specified starting frame: */
        while (!unwind_done(state) &&
               (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
                        state->sp <= (unsigned long)first_frame))
                unwind_next_frame(state);

        return;

err:
        state->error = true;
the_end:
        state->stack_info.type = STACK_TYPE_UNKNOWN;
}
EXPORT_SYMBOL_GPL(__unwind_start);









































































































































































































    2 










































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 */

#ifndef _EXT4_EXTENTS
#define _EXT4_EXTENTS

#include "ext4.h"

/*
 * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks
 * becomes very small, so index split, in-depth growing and
 * other hard changes happen much more often.
 * This is for debug purposes only.
 */
#define AGGRESSIVE_TEST_

/*
 * With EXTENTS_STATS defined, the number of blocks and extents
 * are collected in the truncate path. They'll be shown at
 * umount time.
 */
#define EXTENTS_STATS__

/*
 * If CHECK_BINSEARCH is defined, then the results of the binary search
 * will also be checked by linear search.
 */
#define CHECK_BINSEARCH__

/*
 * If EXT_STATS is defined then stats numbers are collected.
 * These numbers will be displayed at umount time.
 */
#define EXT_STATS_


/*
 * ext4_inode has i_block array (60 bytes total).
 * The first 12 bytes store ext4_extent_header;
 * the remainder stores an array of ext4_extent.
 * For non-inode extent blocks, ext4_extent_tail
 * follows the array.
 */

/*
 * This is the extent tail on-disk structure.
 * All other extent structures are 12 bytes long.  It turns out that
 * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which
 * covers all valid ext4 block sizes.  Therefore, this tail structure can be
 * crammed into the end of the block without having to rebalance the tree.
 */
/* Located by find_ext4_extent_tail(), right after the header and eh_max extent slots. */
struct ext4_extent_tail {
        __le32        et_checksum;        /* crc32c(uuid+inum+extent_block) */
};

/*
 * This is the extent on-disk structure.
 * It's used at the bottom of the tree.
 */
/* All fields are little-endian; use the ext4_ext_*() helpers to access them. */
struct ext4_extent {
        __le32        ee_block;        /* first logical block extent covers */
        /*
         * Values above EXT_INIT_MAX_LEN mark an unwritten extent; see
         * ext4_ext_is_unwritten() and ext4_ext_get_actual_len().
         */
        __le16        ee_len;                /* number of blocks covered by extent */
        __le16        ee_start_hi;        /* high 16 bits of physical block */
        __le32        ee_start_lo;        /* low 32 bits of physical block */
};

/*
 * This is the index on-disk structure.
 * It's used at all the levels except the bottom.
 */
/* The child block number is split into lo/hi parts; see ext4_idx_pblock(). */
struct ext4_extent_idx {
        __le32        ei_block;        /* index covers logical blocks from 'block' */
        __le32        ei_leaf_lo;        /* pointer to the physical block of the next
                                 * level; a leaf or another index may be there */
        __le16        ei_leaf_hi;        /* high 16 bits of physical block */
        __u16        ei_unused;
};

/*
 * Each block (leaf or index), even one stored in the inode, has a header.
 */
/* Header placed in front of the entry array; all fields are little-endian. */
struct ext4_extent_header {
        __le16        eh_magic;        /* probably will support different formats */
        __le16        eh_entries;        /* number of valid entries */
        __le16        eh_max;                /* capacity of store in entries */
        /* Read via ext_depth() for the in-inode header. */
        __le16        eh_depth;        /* has tree real underlying blocks? */
        __le32        eh_generation;        /* generation of the tree */
};

/* Magic value, already converted to on-disk (little-endian) byte order. */
#define EXT4_EXT_MAGIC                cpu_to_le16(0xf30a)
/* NOTE(review): presumably the upper bound on eh_depth -- confirm against users. */
#define EXT4_MAX_EXTENT_DEPTH 5

/*
 * Byte offset from the start of the header to the position just past
 * eh_max entries of sizeof(struct ext4_extent) each.
 */
#define EXT4_EXTENT_TAIL_OFFSET(hdr) \
        (sizeof(struct ext4_extent_header) + \
         (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max)))

/* Return the address of the extent tail that follows the entries headed by @eh. */
static inline struct ext4_extent_tail *
find_ext4_extent_tail(struct ext4_extent_header *eh)
{
        void *block_start = eh;

        /* The tail sits EXT4_EXTENT_TAIL_OFFSET() bytes past the header start. */
        return block_start + EXT4_EXTENT_TAIL_OFFSET(eh);
}

/*
 * Array of ext4_ext_path contains path to some extent.
 * Creation/lookup routines use it for traversal/splitting/etc.
 * Truncate uses it to simulate recursive walking.
 */
/*
 * NOTE(review): the per-field notes below are inferred from names and
 * types only -- confirm against the code that fills this structure.
 */
struct ext4_ext_path {
        ext4_fsblk_t                        p_block;	/* presumably the physical block of this level */
        __u16                                p_depth;	/* presumably the depth of this level */
        __u16                                p_maxdepth;	/* presumably the capacity of the path array */
        struct ext4_extent                *p_ext;	/* extent entry, where applicable */
        struct ext4_extent_idx                *p_idx;	/* index entry, where applicable */
        struct ext4_extent_header        *p_hdr;	/* header of the block at this level */
        struct buffer_head                *p_bh;	/* buffer holding the block, if any */
};

/*
 * Used to record a portion of a cluster found at the beginning or end
 * of an extent while traversing the extent tree during space removal.
 * A partial cluster may be removed if it does not contain blocks shared
 * with extents that aren't being deleted (tofree state).  Otherwise,
 * it cannot be removed (nofree state).
 */
struct partial_cluster {
        ext4_fsblk_t pclu;  /* physical cluster number */
        ext4_lblk_t lblk;   /* logical block number within logical cluster */
        /* tofree: may be removed; nofree: must be kept; initial is the first enumerator. */
        enum {initial, tofree, nofree} state;
};

/*
 * structure for external API
 */

/*
 * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
 * initialized extent. This is 2^15 and not (2^16 - 1), since we use the
 * MSB of ee_len field in the extent datastructure to signify if this
 * particular extent is an initialized extent or an unwritten (i.e.
 * preallocated).
 * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an
 * unwritten extent.
 * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an
 * unwritten one. In other words, if MSB of ee_len is set, it is an
 * unwritten extent with only one special scenario when ee_len = 0x8000.
 * In this case we can not have an unwritten extent of zero length and
 * thus we make it as a special case of initialized extent with 0x8000 length.
 * This way we get better extent-to-group alignment for initialized extents.
 * Hence, the maximum number of blocks we can have in an *initialized*
 * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767).
 */
/* 0x8000: largest initialized length, and also the bit that flags unwritten extents. */
#define EXT_INIT_MAX_LEN        (1UL << 15)
/* 0x7fff: largest length an unwritten extent can have. */
#define EXT_UNWRITTEN_MAX_LEN        (EXT_INIT_MAX_LEN - 1)


/* First extent slot: immediately after the header. */
#define EXT_FIRST_EXTENT(__hdr__) \
        ((struct ext4_extent *) (((char *) (__hdr__)) +                \
                                 sizeof(struct ext4_extent_header)))
/* First index slot: immediately after the header. */
#define EXT_FIRST_INDEX(__hdr__) \
        ((struct ext4_extent_idx *) (((char *) (__hdr__)) +        \
                                     sizeof(struct ext4_extent_header)))
/* True while the block at this path level holds fewer than eh_max entries. */
#define EXT_HAS_FREE_INDEX(__path__) \
        (le16_to_cpu((__path__)->p_hdr->eh_entries) \
                                     < le16_to_cpu((__path__)->p_hdr->eh_max))
/* Last valid extent; with eh_entries == 0 this points one slot before the first. */
#define EXT_LAST_EXTENT(__hdr__) \
        (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
/* Last valid index; with eh_entries == 0 this points one slot before the first. */
#define EXT_LAST_INDEX(__hdr__) \
        (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
/* Last extent slot the block can hold, or NULL when eh_max is 0. */
#define EXT_MAX_EXTENT(__hdr__)        \
        ((le16_to_cpu((__hdr__)->eh_max)) ? \
        ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \
                                        : NULL)
/* Last index slot the block can hold, or NULL when eh_max is 0. */
#define EXT_MAX_INDEX(__hdr__) \
        ((le16_to_cpu((__hdr__)->eh_max)) ? \
        ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \
                                        : NULL)

/* Return the extent header stored at the start of the inode's i_data area. */
static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
{
        void *root = EXT4_I(inode)->i_data;

        return root;
}

/* Return the extent header located at the start of the buffer's data. */
static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh)
{
        void *data = bh->b_data;

        return data;
}

static inline unsigned short ext_depth(struct inode *inode)
{
        return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
}

/* Flag @ext as unwritten by setting the EXT_INIT_MAX_LEN bit in ee_len. */
static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext)
{
        /* We can not have an unwritten extent of zero length! */
        BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0);

        ext->ee_len = ext->ee_len | cpu_to_le16(EXT_INIT_MAX_LEN);
}

/* Return non-zero if @ext is an unwritten extent. */
static inline int ext4_ext_is_unwritten(struct ext4_extent *ext)
{
        unsigned short raw_len = le16_to_cpu(ext->ee_len);

        /* Exactly 0x8000 still counts as an initialized extent. */
        return raw_len > EXT_INIT_MAX_LEN;
}

/* Return the number of blocks @ext covers, with the unwritten marker removed. */
static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
{
        unsigned short raw_len = le16_to_cpu(ext->ee_len);

        /* Initialized extents store their length directly. */
        if (raw_len <= EXT_INIT_MAX_LEN)
                return raw_len;

        /* Unwritten extents store the length plus EXT_INIT_MAX_LEN. */
        return raw_len - EXT_INIT_MAX_LEN;
}

/* Rewrite ee_len so that it holds only the actual length of @ext. */
static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
{
        int actual_len = ext4_ext_get_actual_len(ext);

        ext->ee_len = cpu_to_le16(actual_len);
}

/*
 * ext4_ext_pblock:
 * combine low and high parts of physical block number into ext4_fsblk_t
 */
static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex)
{
        ext4_fsblk_t lo = le32_to_cpu(ex->ee_start_lo);
        ext4_fsblk_t hi = le16_to_cpu(ex->ee_start_hi);

        /* The high part is moved up by 32 bits in two steps (31, then 1). */
        return lo | ((hi << 31) << 1);
}

/*
 * ext4_idx_pblock:
 * combine low and high parts of a leaf physical block number into ext4_fsblk_t
 */
static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix)
{
        ext4_fsblk_t lo = le32_to_cpu(ix->ei_leaf_lo);
        ext4_fsblk_t hi = le16_to_cpu(ix->ei_leaf_hi);

        /* The high part is moved up by 32 bits in two steps (31, then 1). */
        return lo | ((hi << 31) << 1);
}

/*
 * ext4_ext_store_pblock:
 * stores a large physical block number into an extent struct,
 * breaking it into parts
 */
static inline void ext4_ext_store_pblock(struct ext4_extent *ex,
                                         ext4_fsblk_t pb)
{
        /* Low 32 bits, then the next 16 bits above them. */
        unsigned long lo = (unsigned long) (pb & 0xffffffff);
        unsigned long hi = (unsigned long) ((pb >> 31) >> 1) & 0xffff;

        ex->ee_start_lo = cpu_to_le32(lo);
        ex->ee_start_hi = cpu_to_le16(hi);
}

/*
 * ext4_idx_store_pblock:
 * stores a large physical block number into an index struct,
 * breaking it into parts
 */
static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
                                         ext4_fsblk_t pb)
{
        /* Low 32 bits, then the next 16 bits above them. */
        unsigned long lo = (unsigned long) (pb & 0xffffffff);
        unsigned long hi = (unsigned long) ((pb >> 31) >> 1) & 0xffff;

        ix->ei_leaf_lo = cpu_to_le32(lo);
        ix->ei_leaf_hi = cpu_to_le16(hi);
}

#endif /* _EXT4_EXTENTS */





































































    1 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Unified UUID/GUID definition
 *
 * Copyright (C) 2009, 2016 Intel Corp.
 *        Huang Ying <ying.huang@intel.com>
 */

#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/uuid.h>
#include <linux/random.h>

/*
 * Exported GUID/UUID constants.  They have static storage and no
 * initializer, so every byte is zero.
 */
const guid_t guid_null;
EXPORT_SYMBOL(guid_null);
const uuid_t uuid_null;
EXPORT_SYMBOL(uuid_null);

/*
 * Byte-placement tables used by __uuid_parse(): the i-th byte read from the
 * textual form is stored at binary index table[i].  guid_index reverses the
 * bytes within the first three fields (4, 2 and 2 bytes); uuid_index is the
 * identity mapping.
 */
const u8 guid_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15};
const u8 uuid_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};

/**
 * generate_random_uuid - fill a buffer with a random (version 4) UUID
 * @uuid: 16-byte buffer that receives the generated UUID
 *
 * All 16 bytes are taken from get_random_bytes(); afterwards the version
 * nibble (high nibble of byte 6) is forced to 4 and the two top bits of
 * byte 8 are forced to 0b10 (DCE variant).
 *
 * Used to create a Boot ID or a filesystem UUID/GUID, but can be
 * useful for other kernel drivers.
 */
void generate_random_uuid(unsigned char uuid[16])
{
        get_random_bytes(uuid, 16);

        /* version 4: keep the low nibble, stamp 0x4 into the high nibble */
        uuid[6] &= 0x0F;
        uuid[6] |= 0x40;

        /* DCE variant: keep the low six bits, set the top two to 0b10 */
        uuid[8] &= 0x3F;
        uuid[8] |= 0x80;
}
EXPORT_SYMBOL(generate_random_uuid);

/**
 * generate_random_guid - fill a buffer with a random (version 4) GUID
 * @guid: 16-byte buffer that receives the generated GUID
 *
 * Same as generate_random_uuid() except that the version nibble is written
 * to byte 7 instead of byte 6.
 */
void generate_random_guid(unsigned char guid[16])
{
        get_random_bytes(guid, 16);

        /* version 4: keep the low nibble, stamp 0x4 into the high nibble */
        guid[7] &= 0x0F;
        guid[7] |= 0x40;

        /* DCE variant: keep the low six bits, set the top two to 0b10 */
        guid[8] &= 0x3F;
        guid[8] |= 0x80;
}
EXPORT_SYMBOL(generate_random_guid);

/*
 * Shared part of guid_gen()/uuid_gen(): randomize all 16 bytes, then force
 * the two top bits of byte 8 to 0b10.  The caller sets the version nibble.
 */
static void __uuid_gen_common(__u8 b[16])
{
        get_random_bytes(b, 16);

        /* variant field: clear the top two bits, then set them to 0b10 */
        b[8] &= 0x3F;
        b[8] |= 0x80;
}

/*
 * guid_gen - generate a random version-4 GUID into @lu.
 * The version nibble is the high nibble of byte 7.
 */
void guid_gen(guid_t *lu)
{
        __u8 *bytes = lu->b;

        __uuid_gen_common(bytes);

        /* version 4 : random generation */
        bytes[7] &= 0x0F;
        bytes[7] |= 0x40;
}
EXPORT_SYMBOL_GPL(guid_gen);

/*
 * uuid_gen - generate a random version-4 UUID into @bu.
 * The version nibble is the high nibble of byte 6.
 */
void uuid_gen(uuid_t *bu)
{
        __u8 *bytes = bu->b;

        __uuid_gen_common(bytes);

        /* version 4 : random generation */
        bytes[6] &= 0x0F;
        bytes[6] |= 0x40;
}
EXPORT_SYMBOL_GPL(uuid_gen);

/**
 * uuid_is_valid - check the textual layout of a UUID string
 * @uuid:        UUID string to check
 *
 * Description:
 * The first UUID_STRING_LEN characters must follow the pattern
 *        xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
 *
 * i.e. a '-' at offsets 8, 13, 18 and 23 and a hex digit everywhere else.
 * Characters beyond UUID_STRING_LEN are not examined.
 *
 * Return: true if input is valid UUID string.
 */
bool uuid_is_valid(const char *uuid)
{
        unsigned int pos = 0;

        while (pos < UUID_STRING_LEN) {
                char c = uuid[pos];

                switch (pos) {
                case 8:
                case 13:
                case 18:
                case 23:
                        /* field separator expected here */
                        if (c != '-')
                                return false;
                        break;
                default:
                        if (!isxdigit(c))
                                return false;
                        break;
                }
                pos++;
        }

        return true;
}
EXPORT_SYMBOL(uuid_is_valid);

/*
 * Convert a textual UUID/GUID into its 16-byte binary form.
 *
 * @uuid: string to parse; validated with uuid_is_valid() first
 * @b:    output buffer of 16 bytes
 * @ei:   placement table; the i-th byte read from the string is stored at
 *        b[ei[i]]
 *
 * Returns 0 on success or -EINVAL if the string is not a valid UUID.
 */
static int __uuid_parse(const char *uuid, __u8 b[16], const u8 ei[16])
{
        /* string offset of the first hex digit of each byte (skips the '-') */
        static const u8 str_off[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34};
        unsigned int n;

        if (!uuid_is_valid(uuid))
                return -EINVAL;

        for (n = 0; n < 16; n++) {
                const char *pair = uuid + str_off[n];
                int hi = hex_to_bin(pair[0]);
                int lo = hex_to_bin(pair[1]);

                b[ei[n]] = (hi << 4) | lo;
        }

        return 0;
}

/*
 * guid_parse - parse a textual GUID into @u using the guid_index byte
 * placement.  Returns 0 on success or -EINVAL for a malformed string.
 */
int guid_parse(const char *uuid, guid_t *u)
{
        int ret = __uuid_parse(uuid, u->b, guid_index);

        return ret;
}
EXPORT_SYMBOL(guid_parse);

/*
 * uuid_parse - parse a textual UUID into @u using the uuid_index byte
 * placement.  Returns 0 on success or -EINVAL for a malformed string.
 */
int uuid_parse(const char *uuid, uuid_t *u)
{
        int ret = __uuid_parse(uuid, u->b, uuid_index);

        return ret;
}
EXPORT_SYMBOL(uuid_parse);



























































    1 


















    1 
























































































































































































    1 



















    1 




    1 










    1 






    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
/*
 * Resizable simple ram filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *               2000 Transmeta Corp.
 *
 * Usage limits added by David Gibson, Linuxcare Australia.
 * This file is released under the GPL.
 */

/*
 * NOTE! This filesystem is probably most useful
 * not as a real filesystem, but as an example of
 * how virtual filesystems can be written.
 *
 * It doesn't get much simpler than this. Consider
 * that this file implements the full semantics of
 * a POSIX-compliant read-write filesystem.
 *
 * Note in particular how the filesystem does not
 * need to implement any data structures of its own
 * to keep track of the virtual data: using the VFS
 * caches is sufficient.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/ramfs.h>
#include <linux/sched.h>
#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/seq_file.h>
#include "internal.h"

/* Mount options understood by ramfs. */
struct ramfs_mount_opts {
        /* permission bits applied to the root directory inode (mode= option) */
        umode_t mode;
};

/*
 * Per-mount private data: allocated in ramfs_init_fs_context(), reached via
 * s_fs_info, released in ramfs_free_fc()/ramfs_kill_sb().
 */
struct ramfs_fs_info {
        struct ramfs_mount_opts mount_opts;
};

/* Root directory mode used when no "mode=" option is given. */
#define RAMFS_DEFAULT_MODE        0755

/* Forward declarations; both tables are defined further down in this file. */
static const struct super_operations ramfs_ops;
static const struct inode_operations ramfs_dir_inode_operations;

/*
 * Allocate and set up a new ramfs inode on @sb.
 *
 * @dir:  parent directory passed to inode_init_owner() (NULL for the root)
 * @mode: file type and permission bits
 * @dev:  device number, only used for the special-file (default) case
 *
 * Returns the new inode, or NULL if new_inode() failed.
 */
struct inode *ramfs_get_inode(struct super_block *sb,
                                const struct inode *dir, umode_t mode, dev_t dev)
{
        struct inode *inode = new_inode(sb);

        if (!inode)
                return inode;

        /* Setup common to every inode type. */
        inode->i_ino = get_next_ino();
        inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
        inode->i_mapping->a_ops = &ram_aops;
        mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
        mapping_set_unevictable(inode->i_mapping);
        simple_inode_init_ts(inode);

        /* Type-specific operations. */
        switch (mode & S_IFMT) {
        case S_IFREG:
                inode->i_op = &ramfs_file_inode_operations;
                inode->i_fop = &ramfs_file_operations;
                break;
        case S_IFDIR:
                inode->i_op = &ramfs_dir_inode_operations;
                inode->i_fop = &simple_dir_operations;

                /* a directory starts with i_nlink == 2: one extra for "." */
                inc_nlink(inode);
                break;
        case S_IFLNK:
                inode->i_op = &page_symlink_inode_operations;
                inode_nohighmem(inode);
                break;
        default:
                /* anything else is handed to init_special_inode() */
                init_special_inode(inode, mode, dev);
                break;
        }

        return inode;
}

/*
 * Create a new object named by @dentry in @dir: allocate an inode, run the
 * security init hook, and bind the inode to the dentry.
 *
 * Returns 0 on success, -ENOSPC if no inode could be allocated, or the
 * error reported by security_inode_init_security().
 */
/* SMP-safe */
static int
ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
            struct dentry *dentry, umode_t mode, dev_t dev)
{
        struct inode *inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
        int error;

        if (!inode)
                return -ENOSPC;

        error = security_inode_init_security(inode, dir,
                                             &dentry->d_name, NULL,
                                             NULL);
        if (error) {
                /* drop the inode we just allocated */
                iput(inode);
                return error;
        }

        d_instantiate(dentry, inode);
        /* Extra count - pin the dentry in core */
        dget(dentry);
        /* the parent directory changed: refresh its ctime and mtime */
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));

        return 0;
}

/*
 * Create a directory: ramfs_mknod() with S_IFDIR added to @mode, then one
 * extra link on the parent @dir when creation succeeded.
 */
static int ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                       struct dentry *dentry, umode_t mode)
{
        int err = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0);

        if (err)
                return err;

        inc_nlink(dir);
        return 0;
}

/*
 * Create a regular file: ramfs_mknod() with S_IFREG added to @mode.
 * @excl is not consulted.
 */
static int ramfs_create(struct mnt_idmap *idmap, struct inode *dir,
                        struct dentry *dentry, umode_t mode, bool excl)
{
        int err = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0);

        return err;
}

/*
 * Create a symbolic link @dentry in @dir whose target is @symname.
 *
 * Returns 0 on success, -ENOSPC if no inode could be allocated, or the
 * error from security_inode_init_security() / page_symlink().  On any
 * failure after allocation the new inode is released with iput().
 */
static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
                         struct dentry *dentry, const char *symname)
{
        struct inode *inode;
        int error;
        int l;

        inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
        if (!inode)
                return -ENOSPC;

        /* length handed to page_symlink() includes the trailing NUL */
        l = strlen(symname)+1;

        error = security_inode_init_security(inode, dir,
                                             &dentry->d_name, NULL,
                                             NULL);
        if (error)
                goto out_iput;

        error = page_symlink(inode, symname, l);
        if (error)
                goto out_iput;

        d_instantiate(dentry, inode);
        dget(dentry);
        inode_set_mtime_to_ts(dir,
                              inode_set_ctime_current(dir));
        return 0;

out_iput:
        iput(inode);
        return error;
}

/*
 * Create an unnamed temporary file for @file in @dir.
 *
 * Returns -ENOSPC if no inode could be allocated; otherwise the result of
 * finish_open_simple(), which is given the security-hook status.
 */
static int ramfs_tmpfile(struct mnt_idmap *idmap,
                         struct inode *dir, struct file *file, umode_t mode)
{
        struct inode *inode = ramfs_get_inode(dir->i_sb, dir, mode, 0);
        int error;

        if (!inode)
                return -ENOSPC;

        error = security_inode_init_security(inode, dir,
                                             &file_dentry(file)->d_name, NULL,
                                             NULL);
        if (error)
                iput(inode);                /* hook failed: drop the inode */
        else
                d_tmpfile(file, inode);

        return finish_open_simple(file, error);
}

/*
 * Directory inode operations.  Creation paths (create, symlink, mkdir,
 * mknod, tmpfile) are implemented in this file; lookup, link, unlink,
 * rmdir and rename use the generic simple_* helpers.
 */
static const struct inode_operations ramfs_dir_inode_operations = {
        .create                = ramfs_create,
        .lookup                = simple_lookup,
        .link                = simple_link,
        .unlink                = simple_unlink,
        .symlink        = ramfs_symlink,
        .mkdir                = ramfs_mkdir,
        .rmdir                = simple_rmdir,
        .mknod                = ramfs_mknod,
        .rename                = simple_rename,
        .tmpfile        = ramfs_tmpfile,
};

/*
 * Display the mount options in /proc/mounts.
 */
static int ramfs_show_options(struct seq_file *m, struct dentry *root)
{
        struct ramfs_fs_info *fsi = root->d_sb->s_fs_info;

        if (fsi->mount_opts.mode != RAMFS_DEFAULT_MODE)
                seq_printf(m, ",mode=%o", fsi->mount_opts.mode);
        return 0;
}

/*
 * Superblock operations: generic statfs and drop_inode helpers plus the
 * ramfs-specific mount option display.
 */
static const struct super_operations ramfs_ops = {
        .statfs                = simple_statfs,
        .drop_inode        = generic_delete_inode,
        .show_options        = ramfs_show_options,
};

enum ramfs_param {
        Opt_mode,
};

/*
 * Mount parameter table: a single "mode" option declared with
 * fsparam_u32oct, terminated by an empty entry.
 */
const struct fs_parameter_spec ramfs_fs_parameters[] = {
        fsparam_u32oct("mode",        Opt_mode),
        {}
};

/*
 * Handle one mount parameter.
 *
 * Known parameters are applied to the ramfs_fs_info hanging off @fc.
 * Parameters not in ramfs_fs_parameters[] are offered to
 * vfs_parse_fs_param_source(); if that does not recognise them either,
 * they are silently accepted.
 *
 * Returns 0 on success or a negative errno from the parsing helpers.
 */
static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct ramfs_fs_info *fsi = fc->s_fs_info;
        struct fs_parse_result result;
        int token;

        token = fs_parse(fc, ramfs_fs_parameters, param, &result);
        if (token == -ENOPARAM) {
                int src = vfs_parse_fs_param_source(fc, param);

                /*
                 * We might like to report bad mount options here;
                 * but traditionally ramfs has ignored all mount options,
                 * and as it is used as a !CONFIG_SHMEM simple substitute
                 * for tmpfs, better continue to ignore other mount options.
                 */
                return src == -ENOPARAM ? 0 : src;
        }
        if (token < 0)
                return token;

        if (token == Opt_mode) {
                /* keep only the permission/suid/sgid/sticky bits */
                fsi->mount_opts.mode = result.uint_32 & S_IALLUGO;
        }

        return 0;
}

/*
 * Initialise a fresh ramfs superblock and create its root directory using
 * the mode stored in the mount's ramfs_fs_info.
 *
 * Returns 0 on success or -ENOMEM if the root dentry could not be set up.
 */
static int ramfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
        struct ramfs_fs_info *fsi = sb->s_fs_info;
        struct inode *root;

        sb->s_magic                = RAMFS_MAGIC;
        sb->s_op                = &ramfs_ops;
        sb->s_maxbytes                = MAX_LFS_FILESIZE;
        sb->s_blocksize_bits        = PAGE_SHIFT;
        sb->s_blocksize                = PAGE_SIZE;
        sb->s_time_gran                = 1;

        /* a failed allocation is passed straight on to d_make_root() */
        root = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
        sb->s_root = d_make_root(root);

        return sb->s_root ? 0 : -ENOMEM;
}

/*
 * .get_tree hook: obtain the superblock through get_tree_nodev(), with
 * ramfs_fill_super() as the fill callback.
 */
static int ramfs_get_tree(struct fs_context *fc)
{
        int ret = get_tree_nodev(fc, ramfs_fill_super);

        return ret;
}

/* .free hook: release whatever fc->s_fs_info currently points to. */
static void ramfs_free_fc(struct fs_context *fc)
{
        void *info = fc->s_fs_info;

        kfree(info);
}

/* fs_context hooks installed by ramfs_init_fs_context(). */
static const struct fs_context_operations ramfs_context_ops = {
        .free                = ramfs_free_fc,
        .parse_param        = ramfs_parse_param,
        .get_tree        = ramfs_get_tree,
};

/*
 * Prepare @fc for a ramfs mount: allocate the zeroed per-mount info, seed
 * it with the default mode, and install the ramfs context operations.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
int ramfs_init_fs_context(struct fs_context *fc)
{
        struct ramfs_fs_info *fsi = kzalloc(sizeof(*fsi), GFP_KERNEL);

        if (!fsi)
                return -ENOMEM;

        fsi->mount_opts.mode = RAMFS_DEFAULT_MODE;

        fc->ops = &ramfs_context_ops;
        fc->s_fs_info = fsi;

        return 0;
}

/*
 * Tear down a ramfs superblock: free the per-mount info first, then hand
 * the superblock to kill_litter_super().
 */
void ramfs_kill_sb(struct super_block *sb)
{
        void *info = sb->s_fs_info;

        kfree(info);
        kill_litter_super(sb);
}

/*
 * Filesystem type registered by init_ramfs_fs().
 * NOTE(review): FS_USERNS_MOUNT presumably permits mounting from within a
 * user namespace -- confirm against the flag's definition.
 */
static struct file_system_type ramfs_fs_type = {
        .name                = "ramfs",
        .init_fs_context = ramfs_init_fs_context,
        .parameters        = ramfs_fs_parameters,
        .kill_sb        = ramfs_kill_sb,
        .fs_flags        = FS_USERNS_MOUNT,
};

/*
 * Register the ramfs filesystem type; hooked up through fs_initcall().
 * Returns whatever register_filesystem() returns.
 */
static int __init init_ramfs_fs(void)
{
        int ret = register_filesystem(&ramfs_fs_type);

        return ret;
}
fs_initcall(init_ramfs_fs);




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    1 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2009 Red Hat, Inc.
 * Copyright (C) 2006 Rusty Russell IBM Corporation
 *
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * Inspiration, some code, and most witty comments come from
 * Documentation/virtual/lguest/lguest.c, by Rusty Russell
 *
 * Generic code for virtio server in host kernel.
 */

#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/uio.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sort.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/vhost_task.h>
#include <linux/interval_tree_generic.h>
#include <linux/nospec.h>
#include <linux/kcov.h>

#include "vhost.h"

/*
 * Load-time tunables. Both are registered with mode 0444, so they can be
 * read but not changed once the module is loaded.
 */
static ushort max_mem_regions = 64;
module_param(max_mem_regions, ushort, 0444);
MODULE_PARM_DESC(max_mem_regions,
        "Maximum number of memory regions in memory map. (default: 64)");
/* Passed to vhost_iotlb_alloc() as the entry limit by iotlb_alloc(). */
static int max_iotlb_entries = 2048;
module_param(max_iotlb_entries, int, 0444);
MODULE_PARM_DESC(max_iotlb_entries,
        "Maximum number of iotlb entries. (default: 2048)");

/*
 * Memory-table flag bits.
 * NOTE(review): no user of VHOST_MEMORY_F_LOG is visible in this part of
 * the file; presumably it marks a table that needs dirty logging - confirm.
 */
enum {
        VHOST_MEMORY_F_LOG = 0x1,
};

/*
 * Userspace addresses of the 16-bit event fields that sit at index vq->num
 * of a ring array, i.e. directly behind the ring entries:
 *   vhost_used_event()  - slot following the avail ring
 *   vhost_avail_event() - slot following the used ring
 * The argument is parenthesized so that any pointer-valued expression
 * (for example "&some_vq") expands with the intended precedence.
 */
#define vhost_used_event(vq) ((__virtio16 __user *)&(vq)->avail->ring[(vq)->num])
#define vhost_avail_event(vq) ((__virtio16 __user *)&(vq)->used->ring[(vq)->num])

#ifdef CONFIG_VHOST_CROSS_ENDIAN_LEGACY
/*
 * Drop any explicit endianness override: user_be is set to "big endian"
 * exactly when the legacy virtio byte order is not little endian.
 */
static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
        const bool legacy_is_le = virtio_legacy_is_little_endian();

        vq->user_be = !legacy_is_le;
}

/* Record that userspace asked for a big-endian legacy ring on this vq. */
static void vhost_enable_cross_endian_big(struct vhost_virtqueue *vq)
{
        vq->user_be = true;
}

/* Record that userspace asked for a little-endian legacy ring on this vq. */
static void vhost_enable_cross_endian_little(struct vhost_virtqueue *vq)
{
        vq->user_be = false;
}

/*
 * Set the legacy ring endianness from a userspace struct vhost_vring_state.
 *
 * Returns 0 on success, -EBUSY while vq->private_data is set, -EFAULT if the
 * argument cannot be copied in, and -EINVAL for any value other than
 * VHOST_VRING_LITTLE_ENDIAN or VHOST_VRING_BIG_ENDIAN.
 */
static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
{
        struct vhost_vring_state s;

        if (vq->private_data)
                return -EBUSY;

        if (copy_from_user(&s, argp, sizeof(s)))
                return -EFAULT;

        switch (s.num) {
        case VHOST_VRING_BIG_ENDIAN:
                vhost_enable_cross_endian_big(vq);
                return 0;
        case VHOST_VRING_LITTLE_ENDIAN:
                vhost_enable_cross_endian_little(vq);
                return 0;
        default:
                return -EINVAL;
        }
}

/*
 * Report the current user_be setting of ring @idx to userspace as a
 * struct vhost_vring_state. Returns 0, or -EFAULT if the copy-out fails.
 */
static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
                                   int __user *argp)
{
        struct vhost_vring_state s = {
                .index = idx,
                .num = vq->user_be
        };

        return copy_to_user(argp, &s, sizeof(s)) ? -EFAULT : 0;
}

/*
 * Decide whether the ring is little endian.
 *
 * VIRTIO_F_VERSION_1 always means little endian. Otherwise (legacy virtio)
 * the answer follows user_be, which is initialized at reset time according
 * to the host endianness; so unless userspace set an explicit endianness
 * the result is native endian, as legacy virtio expects.
 */
static void vhost_init_is_le(struct vhost_virtqueue *vq)
{
        if (vhost_has_feature(vq, VIRTIO_F_VERSION_1)) {
                vq->is_le = true;
                return;
        }

        vq->is_le = !vq->user_be;
}
#else
/* Without CONFIG_VHOST_CROSS_ENDIAN_LEGACY there is no override to drop. */
static void vhost_disable_cross_endian(struct vhost_virtqueue *vq)
{
}

/* Cross-endian support compiled out: the request is reported as unknown. */
static long vhost_set_vring_endian(struct vhost_virtqueue *vq, int __user *argp)
{
        return -ENOIOCTLCMD;
}

/* Cross-endian support compiled out: the request is reported as unknown. */
static long vhost_get_vring_endian(struct vhost_virtqueue *vq, u32 idx,
                                   int __user *argp)
{
        return -ENOIOCTLCMD;
}

/*
 * Decide whether the ring is little endian: always with VIRTIO_F_VERSION_1,
 * otherwise whatever the legacy virtio byte order is.
 */
static void vhost_init_is_le(struct vhost_virtqueue *vq)
{
        if (vhost_has_feature(vq, VIRTIO_F_VERSION_1)) {
                vq->is_le = true;
                return;
        }

        vq->is_le = virtio_legacy_is_little_endian();
}
#endif /* CONFIG_VHOST_CROSS_ENDIAN_LEGACY */

/* Recompute vq->is_le; resetting is the same computation as initializing. */
static void vhost_reset_is_le(struct vhost_virtqueue *vq)
{
        vhost_init_is_le(vq);
}

/*
 * Marker work item used to flush a worker: the work is queued like any
 * other, and its handler (vhost_flush_work) completes @wait_event so the
 * flusher can wait for it.
 */
struct vhost_flush_struct {
        struct vhost_work work;
        struct completion wait_event;
};

/* Work handler of a flush marker: signal whoever waits on the flush. */
static void vhost_flush_work(struct vhost_work *work)
{
        struct vhost_flush_struct *flush =
                container_of(work, struct vhost_flush_struct, work);

        complete(&flush->wait_event);
}

/*
 * poll_table callback: remember which wait queue head we were given and
 * hook the poll's wait entry onto it.
 */
static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
                            poll_table *pt)
{
        struct vhost_poll *vpoll = container_of(pt, struct vhost_poll, table);

        vpoll->wqh = wqh;
        add_wait_queue(wqh, &vpoll->wait);
}

/*
 * Wait-queue wakeup callback. Events outside poll->mask are ignored.
 * For a matching event the work is queued to the vq's worker when the
 * device uses workers, and run directly otherwise. Always returns 0.
 */
static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
                             void *key)
{
        struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

        if (!(key_to_poll(key) & poll->mask))
                return 0;

        if (poll->dev->use_worker) {
                vhost_poll_queue(poll);
        } else {
                struct vhost_work *work = &poll->work;

                work->fn(work);
        }

        return 0;
}

/*
 * Prepare a work item: mark it as not queued and set the function that
 * will be invoked when the work runs.
 */
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
{
        clear_bit(VHOST_WORK_QUEUED, &work->flags);
        work->fn = fn;
}
EXPORT_SYMBOL_GPL(vhost_work_init);

/*
 * Initialize a poll structure: bind it to @dev and @vq, remember the event
 * @mask it reacts to, install the wakeup and poll-table callbacks, and set
 * @fn as the work function. The poll is not attached to any wait queue yet.
 */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
                     __poll_t mask, struct vhost_dev *dev,
                     struct vhost_virtqueue *vq)
{
        poll->vq = vq;
        poll->dev = dev;
        poll->mask = mask;
        poll->wqh = NULL;

        init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
        init_poll_funcptr(&poll->table, vhost_poll_func);

        vhost_work_init(&poll->work, fn);
}
EXPORT_SYMBOL_GPL(vhost_poll_init);

/*
 * Start polling a file by adding ourselves to the file's wait queue. The
 * caller must keep a reference to the file until after vhost_poll_stop is
 * called.
 *
 * Does nothing if the poll is already attached. Events that are already
 * pending are delivered immediately; if they include EPOLLERR the poll is
 * stopped again and -EINVAL is returned. Otherwise returns 0.
 */
int vhost_poll_start(struct vhost_poll *poll, struct file *file)
{
        __poll_t mask;

        if (poll->wqh)
                return 0;

        mask = vfs_poll(file, &poll->table);
        if (!mask)
                return 0;

        vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
        if (!(mask & EPOLLERR))
                return 0;

        vhost_poll_stop(poll);
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(vhost_poll_start);

/*
 * Stop polling a file. Once this returns it is safe to drop the file
 * reference. The caller must also flush afterwards.
 */
void vhost_poll_stop(struct vhost_poll *poll)
{
        /* Not attached to any wait queue: nothing to undo. */
        if (!poll->wqh)
                return;

        remove_wait_queue(poll->wqh, &poll->wait);
        poll->wqh = NULL;
}
EXPORT_SYMBOL_GPL(vhost_poll_stop);

/*
 * Queue @work on @worker and wake the worker task, unless the work is
 * already marked as queued.
 */
static void vhost_worker_queue(struct vhost_worker *worker,
                               struct vhost_work *work)
{
        /*
         * The work may only be added to the list once we are sure it was
         * not already on it. test_and_set_bit() implies a memory barrier.
         */
        if (test_and_set_bit(VHOST_WORK_QUEUED, &work->flags))
                return;

        llist_add(&work->node, &worker->work_list);
        vhost_task_wake(worker->vtsk);
}

/*
 * Queue @work on the worker currently attached to @vq. The worker pointer
 * is looked up under the RCU read lock. Returns false when the vq has no
 * worker (the work is then not queued), true otherwise.
 */
bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
{
        struct vhost_worker *worker;
        bool queued;

        rcu_read_lock();
        worker = rcu_dereference(vq->worker);
        queued = !!worker;
        if (queued)
                vhost_worker_queue(worker, work);
        rcu_read_unlock();

        return queued;
}
EXPORT_SYMBOL_GPL(vhost_vq_work_queue);

/**
 * __vhost_worker_flush - flush a worker
 * @worker: worker to flush
 *
 * The worker's mutex (worker->mutex) must be held by the caller. It is
 * dropped while waiting for the flush to complete and re-taken before
 * returning. Nothing is done if the worker has no attachments or has been
 * killed.
 */
static void __vhost_worker_flush(struct vhost_worker *worker)
{
        struct vhost_flush_struct flush;

        if (!worker->attachment_cnt || worker->killed)
                return;

        init_completion(&flush.wait_event);
        vhost_work_init(&flush.work, vhost_flush_work);

        /* Queue a marker work item; it completes wait_event when it runs. */
        vhost_worker_queue(worker, &flush.work);
        /*
         * Drop mutex in case our worker is killed and it needs to take the
         * mutex to force cleanup.
         */
        mutex_unlock(&worker->mutex);
        wait_for_completion(&flush.wait_event);
        mutex_lock(&worker->mutex);
}

/* Flush @worker, taking and releasing worker->mutex around the flush. */
static void vhost_worker_flush(struct vhost_worker *worker)
{
        mutex_lock(&worker->mutex);
        __vhost_worker_flush(worker);
        mutex_unlock(&worker->mutex);
}

/* Flush every worker registered in the device's worker xarray, in turn. */
void vhost_dev_flush(struct vhost_dev *dev)
{
        struct vhost_worker *worker;
        unsigned long i;

        xa_for_each(&dev->worker_xa, i, worker)
                vhost_worker_flush(worker);
}
EXPORT_SYMBOL_GPL(vhost_dev_flush);

/*
 * A lockless hint for busy polling code to exit the loop: true when the
 * vq has a worker whose work list is currently non-empty.
 */
bool vhost_vq_has_work(struct vhost_virtqueue *vq)
{
        struct vhost_worker *worker;
        bool pending;

        rcu_read_lock();
        worker = rcu_dereference(vq->worker);
        pending = worker && !llist_empty(&worker->work_list);
        rcu_read_unlock();

        return pending;
}
EXPORT_SYMBOL_GPL(vhost_vq_has_work);

/*
 * Queue the poll's work item on the worker of the vq the poll belongs to.
 * The result of vhost_vq_work_queue() (whether a worker existed) is ignored.
 */
void vhost_poll_queue(struct vhost_poll *poll)
{
        vhost_vq_work_queue(poll->vq, &poll->work);
}
EXPORT_SYMBOL_GPL(vhost_poll_queue);

/* Forget all VHOST_NUM_ADDRS cached meta IOTLB map pointers of one vq. */
static void __vhost_vq_meta_reset(struct vhost_virtqueue *vq)
{
        int slot = 0;

        while (slot < VHOST_NUM_ADDRS) {
                vq->meta_iotlb[slot] = NULL;
                slot++;
        }
}

/* Forget the cached meta IOTLB map pointers of every vq of the device. */
static void vhost_vq_meta_reset(struct vhost_dev *d)
{
        int idx = 0;

        while (idx < d->nvqs) {
                __vhost_vq_meta_reset(d->vqs[idx]);
                ++idx;
        }
}

/* Reset a call context: zero the irq bypass producer and clear the ctx. */
static void vhost_vring_call_reset(struct vhost_vring_call *call_ctx)
{
        memset(&call_ctx->producer, 0x0, sizeof(struct irq_bypass_producer));
        call_ctx->ctx = NULL;
}

/*
 * A vq counts as set up when its avail, desc and used ring pointers are all
 * non-NULL and vhost_vq_access_ok() accepts it. The access check is only
 * made once all three pointers are known to be set.
 */
bool vhost_vq_is_setup(struct vhost_virtqueue *vq)
{
        if (!vq->avail)
                return false;
        if (!vq->desc)
                return false;
        if (!vq->used)
                return false;

        return vhost_vq_access_ok(vq);
}
EXPORT_SYMBOL_GPL(vhost_vq_is_setup);

/*
 * Put a virtqueue back into its pristine state: ring pointers and indices
 * cleared, features and logging off, no backend, no eventfds, no memory
 * tables, no worker, and no cached meta IOTLB entries.
 *
 * Pointers are only overwritten here, not released; see vhost_dev_cleanup()
 * for a caller that drops the references before resetting.
 */
static void vhost_vq_reset(struct vhost_dev *dev,
                           struct vhost_virtqueue *vq)
{
        vq->num = 1;
        vq->desc = NULL;
        vq->avail = NULL;
        vq->used = NULL;
        vq->last_avail_idx = 0;
        vq->avail_idx = 0;
        vq->last_used_idx = 0;
        vq->signalled_used = 0;
        vq->signalled_used_valid = false;
        vq->used_flags = 0;
        vq->log_used = false;
        vq->log_addr = -1ull;
        vq->private_data = NULL;
        vq->acked_features = 0;
        vq->acked_backend_features = 0;
        vq->log_base = NULL;
        vq->error_ctx = NULL;
        vq->kick = NULL;
        vq->log_ctx = NULL;
        /*
         * Endianness: reset the override first, then recompute is_le, which
         * depends on both acked_features (cleared above) and user_be.
         */
        vhost_disable_cross_endian(vq);
        vhost_reset_is_le(vq);
        vq->busyloop_timeout = 0;
        vq->umem = NULL;
        vq->iotlb = NULL;
        rcu_assign_pointer(vq->worker, NULL);
        vhost_vring_call_reset(&vq->call_ctx);
        __vhost_vq_meta_reset(vq);
}

/*
 * Worker task body (passed to vhost_task_create()): take everything that is
 * currently on the worker's work list and run it in queueing order.
 *
 * @data: the struct vhost_worker.
 *
 * Returns true if at least one work item was taken off the list, false if
 * the list was empty.
 */
static bool vhost_run_work_list(void *data)
{
        struct vhost_worker *worker = data;
        struct vhost_work *work, *work_next;
        struct llist_node *node;

        /* Detach the whole list atomically; new work goes onto a fresh list. */
        node = llist_del_all(&worker->work_list);
        if (node) {
                __set_current_state(TASK_RUNNING);

                /* llist_add() pushes at the head, so reverse to get FIFO order. */
                node = llist_reverse_order(node);
                /* make sure flag is seen after deletion */
                smp_wmb();
                llist_for_each_entry_safe(work, work_next, node, node) {
                        /* Clear QUEUED before running so the work can be re-queued. */
                        clear_bit(VHOST_WORK_QUEUED, &work->flags);
                        kcov_remote_start_common(worker->kcov_handle);
                        work->fn(work);
                        kcov_remote_stop();
                        cond_resched();
                }
        }

        return !!node;
}

/*
 * Callback passed to vhost_task_create() for the case that the worker task
 * is killed. Marks the worker as killed, detaches it from every vq that
 * still points at it, and runs whatever work is left on its list.
 *
 * @data: the struct vhost_worker.
 */
static void vhost_worker_killed(void *data)
{
        struct vhost_worker *worker = data;
        struct vhost_dev *dev = worker->dev;
        struct vhost_virtqueue *vq;
        int i, attach_cnt = 0;

        mutex_lock(&worker->mutex);
        worker->killed = true;

        /* Detach from each vq under that vq's mutex, counting detachments. */
        for (i = 0; i < dev->nvqs; i++) {
                vq = dev->vqs[i];

                mutex_lock(&vq->mutex);
                if (worker ==
                    rcu_dereference_check(vq->worker,
                                          lockdep_is_held(&vq->mutex))) {
                        rcu_assign_pointer(vq->worker, NULL);
                        attach_cnt++;
                }
                mutex_unlock(&vq->mutex);
        }

        worker->attachment_cnt -= attach_cnt;
        /* Wait for RCU readers that may still see this worker on a vq. */
        if (attach_cnt)
                synchronize_rcu();
        /*
         * Finish vhost_worker_flush calls and any other works that snuck in
         * before the synchronize_rcu.
         */
        vhost_run_work_list(worker);
        mutex_unlock(&worker->mutex);
}

/*
 * Release the per-vq indirect, log and heads arrays and clear the pointers
 * so that a repeated call is harmless.
 */
static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
{
        kfree(vq->indirect);
        kfree(vq->log);
        kfree(vq->heads);

        vq->indirect = NULL;
        vq->log = NULL;
        vq->heads = NULL;
}

/*
 * Helper to allocate iovec buffers for all vqs: UIO_MAXIOV indirect entries
 * plus dev->iov_limit log and heads entries per vq.
 *
 * Returns 0 on success. On allocation failure everything allocated so far
 * (including the partially allocated vq) is freed and -ENOMEM is returned.
 */
static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
{
        int i;

        for (i = 0; i < dev->nvqs; ++i) {
                struct vhost_virtqueue *vq = dev->vqs[i];

                vq->indirect = kmalloc_array(UIO_MAXIOV,
                                             sizeof(*vq->indirect),
                                             GFP_KERNEL);
                vq->log = kmalloc_array(dev->iov_limit, sizeof(*vq->log),
                                        GFP_KERNEL);
                vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
                                          GFP_KERNEL);
                if (vq->indirect && vq->log && vq->heads)
                        continue;

                /* Unwind this vq and every vq before it. */
                while (i >= 0)
                        vhost_vq_free_iovecs(dev->vqs[i--]);
                return -ENOMEM;
        }

        return 0;
}

/* Free the iovec buffers of every vq of the device. */
static void vhost_dev_free_iovecs(struct vhost_dev *dev)
{
        int idx = 0;

        while (idx < dev->nvqs) {
                vhost_vq_free_iovecs(dev->vqs[idx]);
                ++idx;
        }
}

/*
 * Check whether a handler has used up its budget. The budget is exceeded
 * when @pkts reaches dev->weight, or when dev->byte_weight is non-zero and
 * @total_len reaches it. In that case the vq's poll work is re-queued and
 * true is returned; otherwise false.
 */
bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
                          int pkts, int total_len)
{
        struct vhost_dev *dev = vq->dev;
        bool bytes_hit = dev->byte_weight && total_len >= dev->byte_weight;
        bool pkts_hit = pkts >= dev->weight;

        if (!bytes_hit && !pkts_hit)
                return false;

        vhost_poll_queue(&vq->poll);
        return true;
}
EXPORT_SYMBOL_GPL(vhost_exceeds_weight);

/*
 * Size in bytes of an avail ring with @num entries, plus 2 bytes for the
 * trailing event field when VIRTIO_RING_F_EVENT_IDX has been negotiated.
 */
static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
                                   unsigned int num)
{
        size_t event __maybe_unused = 0;

        if (vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX))
                event = 2;

        return size_add(struct_size(vq->avail, ring, num), event);
}

/*
 * Size in bytes of a used ring with @num entries, plus 2 bytes for the
 * trailing event field when VIRTIO_RING_F_EVENT_IDX has been negotiated.
 */
static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
                                  unsigned int num)
{
        size_t event __maybe_unused = 0;

        if (vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX))
                event = 2;

        return size_add(struct_size(vq->used, ring, num), event);
}

/* Size in bytes of a descriptor table with @num entries. */
static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
                                  unsigned int num)
{
        const size_t entry_bytes = sizeof(*vq->desc);

        return entry_bytes * num;
}

/*
 * Initialize a vhost device and all of its virtqueues.
 *
 * @dev:         device to initialize
 * @vqs:         array of @nvqs virtqueue pointers; stored, not copied
 * @nvqs:        number of virtqueues
 * @iov_limit:   number of log/heads entries allocated per vq later on
 * @weight:      packet budget checked by vhost_exceeds_weight()
 * @byte_weight: byte budget checked by vhost_exceeds_weight(); 0 disables it
 * @use_worker:  whether work is handed to worker tasks or run directly
 * @msg_handler: IOTLB message callback, stored in the device
 *
 * The device starts without owner (mm), memory tables or log context. Each
 * vq is reset, and vqs that provide a handle_kick callback get their poll
 * structure initialized for EPOLLIN.
 */
void vhost_dev_init(struct vhost_dev *dev,
                    struct vhost_virtqueue **vqs, int nvqs,
                    int iov_limit, int weight, int byte_weight,
                    bool use_worker,
                    int (*msg_handler)(struct vhost_dev *dev, u32 asid,
                                       struct vhost_iotlb_msg *msg))
{
        struct vhost_virtqueue *vq;
        int i;

        dev->vqs = vqs;
        dev->nvqs = nvqs;
        mutex_init(&dev->mutex);
        dev->log_ctx = NULL;
        dev->umem = NULL;
        dev->iotlb = NULL;
        dev->mm = NULL;
        dev->iov_limit = iov_limit;
        dev->weight = weight;
        dev->byte_weight = byte_weight;
        dev->use_worker = use_worker;
        dev->msg_handler = msg_handler;
        init_waitqueue_head(&dev->wait);
        INIT_LIST_HEAD(&dev->read_list);
        INIT_LIST_HEAD(&dev->pending_list);
        spin_lock_init(&dev->iotlb_lock);
        /* Allocating xarray: worker ids are handed out by xa_alloc(). */
        xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC);

        for (i = 0; i < dev->nvqs; ++i) {
                vq = dev->vqs[i];
                /* Buffers are allocated later by vhost_dev_alloc_iovecs(). */
                vq->log = NULL;
                vq->indirect = NULL;
                vq->heads = NULL;
                vq->dev = dev;
                mutex_init(&vq->mutex);
                vhost_vq_reset(dev, vq);
                if (vq->handle_kick)
                        vhost_poll_init(&vq->poll, vq->handle_kick,
                                        EPOLLIN, dev, vq);
        }
}
EXPORT_SYMBOL_GPL(vhost_dev_init);

/*
 * Caller should have device mutex.
 * Returns 0 when the calling task's mm is the device owner, -EPERM if not.
 */
long vhost_dev_check_owner(struct vhost_dev *dev)
{
        /* Are you the owner? If not, I don't think you mean to do that */
        if (dev->mm != current->mm)
                return -EPERM;

        return 0;
}
EXPORT_SYMBOL_GPL(vhost_dev_check_owner);

/*
 * Caller should have device mutex.
 * A device has an owner once an mm has been attached to it.
 */
bool vhost_dev_has_owner(struct vhost_dev *dev)
{
        return !!dev->mm;
}
EXPORT_SYMBOL_GPL(vhost_dev_has_owner);

/*
 * No owner yet, become one: record the caller's mm in the device.
 * The matching release is done by vhost_detach_mm().
 */
static void vhost_attach_mm(struct vhost_dev *dev)
{
        if (!dev->use_worker) {
                /* vDPA device does not use a worker thread, so there's
                 * no need to hold the address space for mm. This helps
                 * to avoid deadlock in the case of mmap() which may
                 * hold the refcnt of the file and depends on the release
                 * method to remove the vma.
                 */
                dev->mm = current->mm;
                mmgrab(dev->mm);
                return;
        }

        dev->mm = get_task_mm(current);
}

/*
 * Drop the device's mm reference, if any, using the counterpart of whatever
 * vhost_attach_mm() took (mmput() with workers, mmdrop() without), and
 * clear dev->mm.
 */
static void vhost_detach_mm(struct vhost_dev *dev)
{
        if (dev->mm) {
                if (dev->use_worker)
                        mmput(dev->mm);
                else
                        mmdrop(dev->mm);

                dev->mm = NULL;
        }
}

/*
 * Tear down one worker: remove it from the device's worker xarray, stop
 * its task and free it. A NULL @worker is ignored. The work list is
 * expected to be empty at this point (a warning is raised otherwise).
 */
static void vhost_worker_destroy(struct vhost_dev *dev,
                                 struct vhost_worker *worker)
{
        if (!worker)
                return;

        WARN_ON(!llist_empty(&worker->work_list));
        xa_erase(&dev->worker_xa, worker->id);
        vhost_task_stop(worker->vtsk);
        kfree(worker);
}

/*
 * Detach all vqs from their workers and destroy every worker of the
 * device. Does nothing for devices that do not use workers.
 */
static void vhost_workers_free(struct vhost_dev *dev)
{
        struct vhost_worker *worker;
        unsigned long i;

        if (!dev->use_worker)
                return;

        /* Clear the vq -> worker pointers before the workers go away. */
        for (i = 0; i < dev->nvqs; i++)
                rcu_assign_pointer(dev->vqs[i]->worker, NULL);
        /*
         * Free the default worker we created and cleanup workers userspace
         * created but couldn't clean up (it forgot or crashed).
         */
        xa_for_each(&dev->worker_xa, i, worker)
                vhost_worker_destroy(dev, worker);
        xa_destroy(&dev->worker_xa);
}

/*
 * Create and start a new worker for @dev and register it in the device's
 * worker xarray, which also assigns the worker id. The task is named
 * "vhost-<pid of the calling task>".
 *
 * Returns the new worker, or NULL if the allocation, the task creation or
 * the id allocation fails (nothing is left behind in that case).
 */
static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
{
        struct vhost_worker *worker;
        struct vhost_task *vtsk;
        char name[TASK_COMM_LEN];
        int ret;
        u32 id;

        worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
        if (!worker)
                return NULL;

        worker->dev = dev;
        snprintf(name, sizeof(name), "vhost-%d", current->pid);

        /* vhost_run_work_list runs the work; vhost_worker_killed handles kill. */
        vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
                                 worker, name);
        if (!vtsk)
                goto free_worker;

        mutex_init(&worker->mutex);
        init_llist_head(&worker->work_list);
        worker->kcov_handle = kcov_common_handle();
        worker->vtsk = vtsk;

        vhost_task_start(vtsk);

        ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
        if (ret < 0)
                goto stop_worker;
        worker->id = id;

        return worker;

stop_worker:
        vhost_task_stop(vtsk);
free_worker:
        kfree(worker);
        return NULL;
}

/*
 * Caller must have device mutex.
 *
 * Make @worker the worker of @vq. If the vq had a different worker before,
 * that old worker's attachment count is dropped and, where work may
 * already have been queued, the old worker is flushed first.
 *
 * Nothing is changed if @worker has been killed. If the old worker has been
 * killed, the new attachment stays in place and the old worker is left
 * alone.
 */
static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
                                     struct vhost_worker *worker)
{
        struct vhost_worker *old_worker;

        mutex_lock(&worker->mutex);
        if (worker->killed) {
                mutex_unlock(&worker->mutex);
                return;
        }

        mutex_lock(&vq->mutex);

        /* Swap the vq's worker pointer while holding both mutexes. */
        old_worker = rcu_dereference_check(vq->worker,
                                           lockdep_is_held(&vq->mutex));
        rcu_assign_pointer(vq->worker, worker);
        worker->attachment_cnt++;

        /* First attachment of this vq: no old worker to clean up after. */
        if (!old_worker) {
                mutex_unlock(&vq->mutex);
                mutex_unlock(&worker->mutex);
                return;
        }
        mutex_unlock(&vq->mutex);
        mutex_unlock(&worker->mutex);

        /*
         * Take the worker mutex to make sure we see the work queued from
         * device wide flushes which doesn't use RCU for execution.
         */
        mutex_lock(&old_worker->mutex);
        if (old_worker->killed) {
                mutex_unlock(&old_worker->mutex);
                return;
        }

        /*
         * We don't want to call synchronize_rcu for every vq during setup
         * because it will slow down VM startup. If we haven't done
         * VHOST_SET_VRING_KICK and not done the driver specific
         * SET_ENDPOINT/RUNNING then we can skip the sync since there will
         * not be any works queued for scsi and net.
         */
        mutex_lock(&vq->mutex);
        if (!vhost_vq_get_backend(vq) && !vq->kick) {
                mutex_unlock(&vq->mutex);

                old_worker->attachment_cnt--;
                mutex_unlock(&old_worker->mutex);
                /*
                 * vsock can queue anytime after VHOST_VSOCK_SET_GUEST_CID.
                 * Warn if it adds support for multiple workers but forgets to
                 * handle the early queueing case.
                 */
                WARN_ON(!old_worker->attachment_cnt &&
                        !llist_empty(&old_worker->work_list));
                return;
        }
        mutex_unlock(&vq->mutex);

        /* Make sure new vq queue/flush/poll calls see the new worker */
        synchronize_rcu();
        /* Make sure whatever was queued gets run */
        __vhost_worker_flush(old_worker);
        old_worker->attachment_cnt--;
        mutex_unlock(&old_worker->mutex);
}

/*
 * Caller must have device mutex.
 *
 * Attach the worker named by info->worker_id to @vq.
 * Returns 0 on success, -EINVAL if the device does not use workers, and
 * -ENODEV if no worker with exactly that id exists.
 */
static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
                                  struct vhost_vring_worker *info)
{
        unsigned long index = info->worker_id;
        struct vhost_dev *dev = vq->dev;
        struct vhost_worker *worker;

        if (!dev->use_worker)
                return -EINVAL;

        /*
         * xa_find() returns the first present entry at or after index, so
         * the id has to be compared to reject a later worker.
         */
        worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
        if (!worker || worker->id != info->worker_id)
                return -ENODEV;

        __vhost_vq_attach_worker(vq, worker);
        return 0;
}

/*
 * Caller must have device mutex.
 *
 * Create a new worker for @dev and report its id in info->worker_id.
 * Returns 0 on success or -ENOMEM if the worker could not be created.
 */
static int vhost_new_worker(struct vhost_dev *dev,
                            struct vhost_worker_state *info)
{
        struct vhost_worker *worker;

        worker = vhost_worker_create(dev);
        if (!worker)
                return -ENOMEM;

        info->worker_id = worker->id;
        return 0;
}

/*
 * Caller must have device mutex.
 *
 * Destroy the worker named by info->worker_id.
 * Returns 0 on success, -ENODEV if no worker with exactly that id exists,
 * and -EBUSY if the worker is still attached to a vq or has been killed.
 */
static int vhost_free_worker(struct vhost_dev *dev,
                             struct vhost_worker_state *info)
{
        unsigned long index = info->worker_id;
        struct vhost_worker *worker;

        /* xa_find() may return a later entry, hence the id comparison. */
        worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
        if (!worker || worker->id != info->worker_id)
                return -ENODEV;

        mutex_lock(&worker->mutex);
        if (worker->attachment_cnt || worker->killed) {
                mutex_unlock(&worker->mutex);
                return -EBUSY;
        }
        /*
         * A flush might have raced and snuck in before attachment_cnt was set
         * to zero. Make sure flushes are flushed from the queue before
         * freeing.
         *
         * NOTE(review): __vhost_worker_flush() returns immediately when
         * attachment_cnt is zero, which is the case here, so as written this
         * call does not wait for anything - confirm this is intended.
         */
        __vhost_worker_flush(worker);
        mutex_unlock(&worker->mutex);

        vhost_worker_destroy(dev, worker);
        return 0;
}

/*
 * Read a vq index (a u32 at the start of @argp) from userspace and look up
 * the matching virtqueue.
 *
 * On success stores the vq in *@vq and the index in *@id and returns 0.
 * Returns the get_user() error if the read fails, or -ENOBUFS if the index
 * is not below dev->nvqs.
 */
static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp,
                                  struct vhost_virtqueue **vq, u32 *id)
{
        u32 __user *user_idx = argp;
        u32 idx;
        long err;

        err = get_user(idx, user_idx);
        if (err < 0)
                return err;

        if (idx >= dev->nvqs)
                return -ENOBUFS;

        /* Clamp the index under speculation before using it for the lookup. */
        idx = array_index_nospec(idx, dev->nvqs);

        *id = idx;
        *vq = dev->vqs[idx];
        return 0;
}

/*
 * Caller must have device mutex.
 *
 * Handle the worker related ioctls: VHOST_NEW_WORKER, VHOST_FREE_WORKER,
 * VHOST_ATTACH_VRING_WORKER and VHOST_GET_VRING_WORKER.
 *
 * Returns -EINVAL if the device does not use workers or has no owner, the
 * vhost_dev_check_owner() error if the caller is not the owner,
 * -ENOIOCTLCMD for any other ioctl number, -EFAULT on a failed user copy,
 * and otherwise the result of the individual operation.
 */
long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
                        void __user *argp)
{
        struct vhost_vring_worker ring_worker;
        struct vhost_worker_state state;
        struct vhost_worker *worker;
        struct vhost_virtqueue *vq;
        long ret;
        u32 idx;

        if (!dev->use_worker)
                return -EINVAL;

        if (!vhost_dev_has_owner(dev))
                return -EINVAL;

        ret = vhost_dev_check_owner(dev);
        if (ret)
                return ret;

        /* First pass: device-wide ioctls finish here, vring ioctls fall out. */
        switch (ioctl) {
        /* dev worker ioctls */
        case VHOST_NEW_WORKER:
                ret = vhost_new_worker(dev, &state);
                if (!ret && copy_to_user(argp, &state, sizeof(state)))
                        ret = -EFAULT;
                return ret;
        case VHOST_FREE_WORKER:
                if (copy_from_user(&state, argp, sizeof(state)))
                        return -EFAULT;
                return vhost_free_worker(dev, &state);
        /* vring worker ioctls */
        case VHOST_ATTACH_VRING_WORKER:
        case VHOST_GET_VRING_WORKER:
                break;
        default:
                return -ENOIOCTLCMD;
        }

        /* Both vring ioctls start with the index of the vq they refer to. */
        ret = vhost_get_vq_from_user(dev, argp, &vq, &idx);
        if (ret)
                return ret;

        switch (ioctl) {
        case VHOST_ATTACH_VRING_WORKER:
                if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) {
                        ret = -EFAULT;
                        break;
                }

                ret = vhost_vq_attach_worker(vq, &ring_worker);
                break;
        case VHOST_GET_VRING_WORKER:
                worker = rcu_dereference_check(vq->worker,
                                               lockdep_is_held(&dev->mutex));
                if (!worker) {
                        ret = -EINVAL;
                        break;
                }

                ring_worker.index = idx;
                ring_worker.worker_id = worker->id;

                if (copy_to_user(argp, &ring_worker, sizeof(ring_worker)))
                        ret = -EFAULT;
                break;
        default:
                ret = -ENOIOCTLCMD;
                break;
        }

        return ret;
}
EXPORT_SYMBOL_GPL(vhost_worker_ioctl);

/*
 * Caller should have device mutex.
 *
 * Make the calling task the owner of the device: attach its mm, allocate
 * the per-vq iovec buffers and, for devices that use workers, create the
 * default worker and attach it to every vq.
 *
 * Returns 0 on success, -EBUSY if the device already has an owner, or
 * -ENOMEM if an allocation or the worker creation fails. On failure
 * everything done so far is undone.
 */
long vhost_dev_set_owner(struct vhost_dev *dev)
{
        struct vhost_worker *worker;
        int err, i;

        /* Is there an owner already? */
        if (vhost_dev_has_owner(dev)) {
                err = -EBUSY;
                goto err_mm;
        }

        vhost_attach_mm(dev);

        err = vhost_dev_alloc_iovecs(dev);
        if (err)
                goto err_iovecs;

        if (dev->use_worker) {
                /*
                 * This should be done last, because vsock can queue work
                 * before VHOST_SET_OWNER so it simplifies the failure path
                 * below since we don't have to worry about vsock queueing
                 * while we free the worker.
                 */
                worker = vhost_worker_create(dev);
                if (!worker) {
                        err = -ENOMEM;
                        goto err_worker;
                }

                for (i = 0; i < dev->nvqs; i++)
                        __vhost_vq_attach_worker(dev->vqs[i], worker);
        }

        return 0;

        /* Unwind in reverse order of setup. */
err_worker:
        vhost_dev_free_iovecs(dev);
err_iovecs:
        vhost_detach_mm(dev);
err_mm:
        return err;
}
EXPORT_SYMBOL_GPL(vhost_dev_set_owner);

/*
 * Allocate an IOTLB limited to max_iotlb_entries entries, created with
 * VHOST_IOTLB_FLAG_RETIRE. Returns whatever vhost_iotlb_alloc() returns.
 */
static struct vhost_iotlb *iotlb_alloc(void)
{
        return vhost_iotlb_alloc(max_iotlb_entries,
                                 VHOST_IOTLB_FLAG_RETIRE);
}

/*
 * Allocate the fresh IOTLB that is later handed to vhost_dev_reset_owner()
 * as its @umem argument.
 */
struct vhost_iotlb *vhost_dev_reset_owner_prepare(void)
{
        return iotlb_alloc();
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare);

/*
 * Caller should have device mutex.
 *
 * Release everything the device holds via vhost_dev_cleanup() and install
 * @umem as the memory table of the device and of each of its vqs.
 */
void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
{
        int i;

        vhost_dev_cleanup(dev);

        dev->umem = umem;
        /* We don't need VQ locks below since vhost_dev_cleanup makes sure
         * VQs aren't running.
         */
        for (i = 0; i < dev->nvqs; ++i)
                dev->vqs[i]->umem = umem;
}
EXPORT_SYMBOL_GPL(vhost_dev_reset_owner);

void vhost_dev_stop(struct vhost_dev *dev)
{
        int i;

        for (i = 0; i < dev->nvqs; ++i) {
                if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick)
                        vhost_poll_stop(&dev->vqs[i]->poll);
        }

        vhost_dev_flush(dev);
}
EXPORT_SYMBOL_GPL(vhost_dev_stop);

/*
 * Unlink and free every message node on the device's read list and pending
 * list. Both lists are emptied under dev->iotlb_lock.
 */
void vhost_clear_msg(struct vhost_dev *dev)
{
        struct vhost_msg_node *node, *n;

        spin_lock(&dev->iotlb_lock);

        /* The _safe iterator is needed because nodes are freed while walking. */
        list_for_each_entry_safe(node, n, &dev->read_list, node) {
                list_del(&node->node);
                kfree(node);
        }

        list_for_each_entry_safe(node, n, &dev->pending_list, node) {
                list_del(&node->node);
                kfree(node);
        }

        spin_unlock(&dev->iotlb_lock);
}
EXPORT_SYMBOL_GPL(vhost_clear_msg);

void vhost_dev_cleanup(struct vhost_dev *dev)
{
        int i;

        for (i = 0; i < dev->nvqs; ++i) {
                if (dev->vqs[i]->error_ctx)
                        eventfd_ctx_put(dev->vqs[i]->error_ctx);
                if (dev->vqs[i]->kick)
                        fput(dev->vqs[i]->kick);
                if (dev->vqs[i]->call_ctx.ctx)
                        eventfd_ctx_put(dev->vqs[i]->call_ctx.ctx);
                vhost_vq_reset(dev, dev->vqs[i]);
        }
        vhost_dev_free_iovecs(dev);
        if (dev->log_ctx)
                eventfd_ctx_put(dev->log_ctx);
        dev->log_ctx = NULL;
        /* No one will access memory at this point */
        vhost_iotlb_free(dev->umem);
        dev->umem = NULL;
        vhost_iotlb_free(dev->iotlb);
        dev->iotlb = NULL;
        vhost_clear_msg(dev);
        wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
        vhost_workers_free(dev);
        vhost_detach_mm(dev);
}
EXPORT_SYMBOL_GPL(vhost_dev_cleanup);

/*
 * Check that the part of the log at @log_base which corresponds to the
 * range [@addr, @addr + @sz) is a valid userspace range.
 *
 * The log offset is addr / VHOST_PAGE_SIZE / 8 and its length is @sz scaled
 * the same way, rounded up - presumably one log bit per VHOST_PAGE_SIZE
 * page, eight pages per log byte (confirm against the log writer).
 *
 * Returns false if log_base + offset would not fit in an unsigned long or
 * if access_ok() rejects the range.
 */
static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
{
        u64 a = addr / VHOST_PAGE_SIZE / 8;

        /* Make sure 64 bit math will not overflow. */
        if (a > ULONG_MAX - (unsigned long)log_base ||
            a + (unsigned long)log_base > ULONG_MAX)
                return false;

        return access_ok(log_base + a,
                         (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
}

/*
 * Make sure 64 bit math will not overflow.
 *
 * Returns true when @uaddr or @size does not fit in an unsigned long, or
 * when the last byte of the range (uaddr + size - 1) would lie beyond
 * ULONG_MAX. An empty range whose start fits never overflows.
 */
static bool vhost_overflow(u64 uaddr, u64 size)
{
        if (uaddr > ULONG_MAX)
                return true;
        if (size > ULONG_MAX)
                return true;

        if (size == 0)
                return false;

        return uaddr > ULONG_MAX - size + 1;
}

/*
 * Caller should have vq mutex and device mutex.
 *
 * Validate every mapping of @umem: the userspace range must not overflow
 * and must pass access_ok(); with @log_all set, the matching part of the
 * log at @log_base must be accessible too. Returns false for a NULL @umem
 * or on the first mapping that fails, true otherwise.
 */
static bool vq_memory_access_ok(void __user *log_base, struct vhost_iotlb *umem,
                                int log_all)
{
        struct vhost_iotlb_map *map;

        if (!umem)
                return false;

        list_for_each_entry(map, &umem->list, link) {
                unsigned long uaddr;

                if (vhost_overflow(map->addr, map->size))
                        return false;

                uaddr = map->addr;
                if (!access_ok((void __user *)uaddr, map->size))
                        return false;

                if (log_all &&
                    !log_access_ok(log_base, map->start, map->size))
                        return false;
        }

        return true;
}

/*
 * Translate @addr through the map cached in vq->meta_iotlb[@type].
 * Returns NULL when no map is cached for that type.  @size is accepted
 * for symmetry with the callers but is not consulted here.
 */
static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,
                                               u64 addr, unsigned int size,
                                               int type)
{
        const struct vhost_iotlb_map *cached = vq->meta_iotlb[type];
        u64 uaddr;

        if (cached == NULL)
                return NULL;

        /* Offset of addr within the map, rebased onto the map's uaddr. */
        uaddr = cached->addr + addr - cached->start;

        return (void __user *)(uintptr_t)uaddr;
}

/* Can we switch to this memory table? */
/* Caller should have device mutex but not vq mutex */
static bool memory_access_ok(struct vhost_dev *d, struct vhost_iotlb *umem,
                             int log_all)
{
        int i;

        for (i = 0; i < d->nvqs; ++i) {
                bool ok;
                bool log;

                mutex_lock(&d->vqs[i]->mutex);
                log = log_all || vhost_has_feature(d->vqs[i], VHOST_F_LOG_ALL);
                /* If ring is inactive, will check when it's enabled. */
                if (d->vqs[i]->private_data)
                        ok = vq_memory_access_ok(d->vqs[i]->log_base,
                                                 umem, log);
                else
                        ok = true;
                mutex_unlock(&d->vqs[i]->mutex);
                if (!ok)
                        return false;
        }
        return true;
}

/*
 * Forward declaration: the copy and get-user helpers below call
 * translate_desc() before its definition appears.
 */
static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
                          struct iovec iov[], int iov_size, int access);

/*
 * Copy @size bytes from kernel buffer @from to the used-ring address @to.
 *
 * Without a device IOTLB this is a plain __copy_to_user().  With one, the
 * cached VHOST_ADDR_USED mapping is tried first and translate_desc() is
 * used as the fallback.
 *
 * Returns 0 when all bytes were copied through the iterator path, a
 * negative value from translate_desc(), the __copy_to_user() result on the
 * direct paths, or the short byte count of a partial iterator copy.
 */
static int vhost_copy_to_user(struct vhost_virtqueue *vq, void __user *to,
                              const void *from, unsigned size)
{
        struct iov_iter iter;
        void __user *uaddr;
        int ret;

        if (!vq->iotlb)
                return __copy_to_user(to, from, size);

        /*
         * Callers are expected to have completed the iotlb prefetch, so
         * the vq should be reachable through the iotlb and -EAGAIN is not
         * expected from the translation below.
         */
        uaddr = vhost_vq_meta_fetch(vq, (u64)(uintptr_t)to, size,
                                    VHOST_ADDR_USED);
        if (uaddr)
                return __copy_to_user(uaddr, from, size);

        ret = translate_desc(vq, (u64)(uintptr_t)to, size, vq->iotlb_iov,
                             ARRAY_SIZE(vq->iotlb_iov), VHOST_ACCESS_WO);
        if (ret < 0)
                return ret;

        iov_iter_init(&iter, ITER_DEST, vq->iotlb_iov, ret, size);
        ret = copy_to_iter(from, size, &iter);

        return ret == size ? 0 : ret;
}

/*
 * Copy @size bytes from the descriptor-table address @from into kernel
 * buffer @to.
 *
 * Without a device IOTLB this is a plain __copy_from_user().  With one, the
 * cached VHOST_ADDR_DESC mapping is tried first and translate_desc() is
 * used as the fallback; a translation failure is reported via vq_err().
 *
 * Returns 0 when all bytes were copied through the iterator path, a
 * negative value from translate_desc(), the __copy_from_user() result on
 * the direct paths, or the short byte count of a partial iterator copy.
 */
static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
                                void __user *from, unsigned size)
{
        struct iov_iter iter;
        void __user *uaddr;
        int ret;

        if (!vq->iotlb)
                return __copy_from_user(to, from, size);

        /*
         * Callers are expected to have completed the iotlb prefetch, so
         * the vq should be reachable through the iotlb and -EAGAIN is not
         * expected from the translation below.
         */
        uaddr = vhost_vq_meta_fetch(vq, (u64)(uintptr_t)from, size,
                                    VHOST_ADDR_DESC);
        if (uaddr)
                return __copy_from_user(to, uaddr, size);

        ret = translate_desc(vq, (u64)(uintptr_t)from, size, vq->iotlb_iov,
                             ARRAY_SIZE(vq->iotlb_iov), VHOST_ACCESS_RO);
        if (ret < 0) {
                vq_err(vq, "IOTLB translation failure: uaddr "
                       "%p size 0x%llx\n", from,
                       (unsigned long long) size);
                return ret;
        }

        iov_iter_init(&iter, ITER_SOURCE, vq->iotlb_iov, ret, size);
        ret = copy_from_iter(to, size, &iter);

        return ret == size ? 0 : ret;
}

/*
 * Slow path for metadata accesses: translate @addr with translate_desc()
 * and accept the result only if it is a single iovec of exactly @size
 * bytes.  Returns the translated base address, or NULL (after vq_err())
 * on translation failure or when the range is split.
 * @type is not used by this path.
 */
static void __user *__vhost_get_user_slow(struct vhost_virtqueue *vq,
                                          void __user *addr, unsigned int size,
                                          int type)
{
        int niov;

        niov = translate_desc(vq, (u64)(uintptr_t)addr, size, vq->iotlb_iov,
                              ARRAY_SIZE(vq->iotlb_iov),
                              VHOST_ACCESS_RO);
        if (niov < 0) {
                vq_err(vq, "IOTLB translation failure: uaddr "
                        "%p size 0x%llx\n", addr,
                        (unsigned long long) size);
                return NULL;
        }

        if (niov == 1 && vq->iotlb_iov[0].iov_len == size)
                return vq->iotlb_iov[0].iov_base;

        vq_err(vq, "Non atomic userspace memory access: uaddr "
                "%p size 0x%llx\n", addr,
                (unsigned long long) size);
        return NULL;
}

/*
 * Resolve @addr to a userspace pointer: use the cached per-type mapping if
 * there is one, otherwise fall back to __vhost_get_user_slow().
 *
 * Meant to be called after the iotlb prefetch, at which point the vq is
 * expected to be reachable through the iotlb, so -EAGAIN should not
 * happen here.
 */
static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
                                            void __user *addr, unsigned int size,
                                            int type)
{
        void __user *cached;

        cached = vhost_vq_meta_fetch(vq, (u64)(uintptr_t)addr, size, type);
        if (!cached)
                return __vhost_get_user_slow(vq, addr, size, type);

        return cached;
}

/*
 * vhost_put_user - store value @x at vring address @ptr.
 *
 * Without a device IOTLB (@vq->iotlb is NULL) this is __put_user(x, ptr).
 * Otherwise @ptr is first resolved with __vhost_get_user() using
 * VHOST_ADDR_USED; if that yields NULL the expression evaluates to -EFAULT.
 *
 * Evaluates to the __put_user() result or -EFAULT.  Note that @vq and @ptr
 * appear more than once in the expansion, and the macro declares locals
 * named `ret` and `to`, so arguments should be side-effect free and must
 * not be spelled with those names.
 */
#define vhost_put_user(vq, x, ptr)                \
({ \
        int ret; \
        if (!vq->iotlb) { \
                ret = __put_user(x, ptr); \
        } else { \
                __typeof__(ptr) to = \
                        (__typeof__(ptr)) __vhost_get_user(vq, ptr,        \
                                          sizeof(*ptr), VHOST_ADDR_USED); \
                if (to != NULL) \
                        ret = __put_user(x, to); \
                else \
                        ret = -EFAULT;        \
        } \
        ret; \
})

static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
{
        return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
                              vhost_avail_event(vq));
}

static inline int vhost_put_used(struct vhost_virtqueue *vq,
                                 struct vring_used_elem *head, int idx,
                                 int count)
{
        return vhost_copy_to_user(vq, vq->used->ring + idx, head,
                                  count * sizeof(*head));
}

static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)

{
        return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
                              &vq->used->flags);
}

static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)

{
        return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
                              &vq->used->idx);
}

/*
 * vhost_get_user - load the value at vring address @ptr into lvalue @x.
 *
 * Without a device IOTLB (@vq->iotlb is NULL) this is __get_user(x, ptr).
 * Otherwise @ptr is first resolved with __vhost_get_user() for the given
 * metadata @type; if that yields NULL the expression evaluates to -EFAULT.
 *
 * Evaluates to the __get_user() result or -EFAULT.  Note that @vq and @ptr
 * appear more than once in the expansion, and the macro declares locals
 * named `ret` and `from`, so arguments should be side-effect free and must
 * not be spelled with those names.
 */
#define vhost_get_user(vq, x, ptr, type)                \
({ \
        int ret; \
        if (!vq->iotlb) { \
                ret = __get_user(x, ptr); \
        } else { \
                __typeof__(ptr) from = \
                        (__typeof__(ptr)) __vhost_get_user(vq, ptr, \
                                                           sizeof(*ptr), \
                                                           type); \
                if (from != NULL) \
                        ret = __get_user(x, from); \
                else \
                        ret = -EFAULT; \
        } \
        ret; \
})

/* vhost_get_user() specialised for addresses tagged VHOST_ADDR_AVAIL. */
#define vhost_get_avail(vq, x, ptr) \
        vhost_get_user(vq, x, ptr, VHOST_ADDR_AVAIL)

/* vhost_get_user() specialised for addresses tagged VHOST_ADDR_USED. */
#define vhost_get_used(vq, x, ptr) \
        vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)

static void vhost_dev_lock_vqs(struct vhost_dev *d)
{
        int i = 0;
        for (i = 0; i < d->nvqs; ++i)
                mutex_lock_nested(&d->vqs[i]->mutex, i);
}

static void vhost_dev_unlock_vqs(struct vhost_dev *d)
{
        int i = 0;
        for (i = 0; i < d->nvqs; ++i)
                mutex_unlock(&d->vqs[i]->mutex);
}

/*
 * Read the idx field of the avail ring into *@idx.
 * Returns the vhost_get_avail() result.
 */
static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
                                      __virtio16 *idx)
{
        __virtio16 __user *src = &vq->avail->idx;

        return vhost_get_avail(vq, *idx, src);
}

/*
 * Read the avail ring entry selected by @idx into *@head.  The slot is
 * @idx masked with (vq->num - 1).  Returns the vhost_get_avail() result.
 */
static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
                                       __virtio16 *head, int idx)
{
        unsigned int slot = idx & (vq->num - 1);

        return vhost_get_avail(vq, *head, &vq->avail->ring[slot]);
}

/*
 * Read the flags field of the avail ring into *@flags.
 * Returns the vhost_get_avail() result.
 */
static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
                                        __virtio16 *flags)
{
        __virtio16 __user *src = &vq->avail->flags;

        return vhost_get_avail(vq, *flags, src);
}

/*
 * Read the value at vhost_used_event(vq) into *@event.  The access goes
 * through vhost_get_avail(), i.e. it is tagged VHOST_ADDR_AVAIL.
 * Returns the vhost_get_avail() result.
 */
static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
                                       __virtio16 *event)
{
        __virtio16 __user *src = vhost_used_event(vq);

        return vhost_get_avail(vq, *event, src);
}

/*
 * Read the idx field of the used ring into *@idx.
 * Returns the vhost_get_used() result.
 */
static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
                                     __virtio16 *idx)
{
        __virtio16 __user *src = &vq->used->idx;

        return vhost_get_used(vq, *idx, src);
}

static inline int vhost_get_desc(struct vhost_virtqueue *vq,
                                 struct vring_desc *desc, int idx)
{
        return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
}

static void vhost_iotlb_notify_vq(struct vhost_dev *d,
                                  struct vhost_iotlb_msg *msg)
{
        struct vhost_msg_node *node, *n;

        spin_lock(&d->iotlb_lock);

        list_for_each_entry_safe(node, n, &d->pending_list, node) {
                struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;
                if (msg->iova <= vq_msg->iova &&
                    msg->iova + msg->size - 1 >= vq_msg->iova &&
                    vq_msg->type == VHOST_IOTLB_MISS) {
                        vhost_poll_queue(&node->vq->poll);
                        list_del(&node->node);
                        kfree(node);
                }
        }

        spin_unlock(&d->iotlb_lock);
}

/*
 * Validate a userspace range for an IOTLB update: it must not overflow and,
 * if @access requests read and/or write permission, it must pass
 * access_ok().  A request with neither permission bit set is accepted once
 * the overflow check passes.
 */
static bool umem_access_ok(u64 uaddr, u64 size, int access)
{
        unsigned long start = uaddr;

        /* Make sure 64 bit math will not overflow. */
        if (vhost_overflow(uaddr, size))
                return false;

        /* The same range probe serves both read and write permission. */
        if (!(access & (VHOST_ACCESS_RO | VHOST_ACCESS_WO)))
                return true;

        return access_ok((void __user *)start, size);
}

/*
 * Default handler for IOTLB messages written by userspace.
 *
 * Only address space 0 is accepted (-EINVAL otherwise).  With the device
 * mutex and all vq mutexes held:
 *   VHOST_IOTLB_UPDATE     - validate the userspace range, reset the cached
 *                            vq metadata, add the range and wake waiters;
 *                            -EFAULT without an iotlb or on a bad range,
 *                            -ENOMEM if the range cannot be added.
 *   VHOST_IOTLB_INVALIDATE - reset the cached vq metadata and delete the
 *                            range; -EFAULT without an iotlb.
 *   anything else          - -EINVAL.
 */
static int vhost_process_iotlb_msg(struct vhost_dev *dev, u32 asid,
                                   struct vhost_iotlb_msg *msg)
{
        int ret = 0;

        if (asid != 0)
                return -EINVAL;

        mutex_lock(&dev->mutex);
        vhost_dev_lock_vqs(dev);

        switch (msg->type) {
        case VHOST_IOTLB_UPDATE:
                if (!dev->iotlb ||
                    !umem_access_ok(msg->uaddr, msg->size, msg->perm)) {
                        ret = -EFAULT;
                        break;
                }
                vhost_vq_meta_reset(dev);
                if (vhost_iotlb_add_range(dev->iotlb, msg->iova,
                                          msg->iova + msg->size - 1,
                                          msg->uaddr, msg->perm)) {
                        ret = -ENOMEM;
                        break;
                }
                vhost_iotlb_notify_vq(dev, msg);
                break;

        case VHOST_IOTLB_INVALIDATE:
                if (!dev->iotlb) {
                        ret = -EFAULT;
                        break;
                }
                vhost_vq_meta_reset(dev);
                vhost_iotlb_del_range(dev->iotlb, msg->iova,
                                      msg->iova + msg->size - 1);
                break;

        default:
                ret = -EINVAL;
                break;
        }

        vhost_dev_unlock_vqs(dev);
        mutex_unlock(&dev->mutex);

        return ret;
}
/*
 * Parse one IOTLB message from @from and dispatch it to dev->msg_handler,
 * or to vhost_process_iotlb_msg() when no handler is installed.
 *
 * Returns sizeof(struct vhost_msg) or sizeof(struct vhost_msg_v2)
 * (depending on the message type) on success, -EINVAL for a short or
 * malformed message (including an UPDATE of size 0), and -EFAULT whenever
 * the handler reports any error.
 */
ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
                             struct iov_iter *from)
{
        struct vhost_iotlb_msg msg;
        size_t offset;
        int type, ret;
        u32 asid = 0;

        if (copy_from_iter(&type, sizeof(type), from) != sizeof(type))
                return -EINVAL;

        switch (type) {
        case VHOST_IOTLB_MSG:
                /* There maybe a hole after type for V1 message type,
                 * so skip it here.
                 */
                offset = offsetof(struct vhost_msg, iotlb) - sizeof(int);
                break;
        case VHOST_IOTLB_MSG_V2:
                if (vhost_backend_has_feature(dev->vqs[0],
                                              VHOST_BACKEND_F_IOTLB_ASID)) {
                        /* The 32-bit word after the type carries the ASID. */
                        if (copy_from_iter(&asid, sizeof(asid), from) !=
                            sizeof(asid))
                                return -EINVAL;
                        offset = 0;
                } else {
                        /* No ASID support: skip that word instead. */
                        offset = sizeof(__u32);
                }
                break;
        default:
                return -EINVAL;
        }

        iov_iter_advance(from, offset);
        if (copy_from_iter(&msg, sizeof(msg), from) != sizeof(msg))
                return -EINVAL;

        if (msg.type == VHOST_IOTLB_UPDATE && msg.size == 0)
                return -EINVAL;

        if (dev->msg_handler)
                ret = dev->msg_handler(dev, asid, &msg);
        else
                ret = vhost_process_iotlb_msg(dev, asid, &msg);
        if (ret)
                return -EFAULT;

        if (type == VHOST_IOTLB_MSG)
                return sizeof(struct vhost_msg);

        return sizeof(struct vhost_msg_v2);
}
EXPORT_SYMBOL(vhost_chr_write_iter);

/*
 * Poll handler: register on dev->wait and report the device as readable
 * (EPOLLIN | EPOLLRDNORM) when dev->read_list is non-empty.
 */
__poll_t vhost_chr_poll(struct file *file, struct vhost_dev *dev,
                            poll_table *wait)
{
        poll_wait(file, &dev->wait, wait);

        if (list_empty(&dev->read_list))
                return 0;

        return EPOLLIN | EPOLLRDNORM;
}
EXPORT_SYMBOL(vhost_chr_poll);

/*
 * Deliver one queued message from dev->read_list to the reader.
 *
 * @dev:     vhost device whose read_list is consumed
 * @to:      destination iterator
 * @noblock: non-zero to fail with -EAGAIN instead of sleeping
 *
 * Returns 0 if @to has room for less than sizeof(struct vhost_msg), the
 * number of bytes copied on success, -EAGAIN when @noblock is set and no
 * message is queued, -ERESTARTSYS when a signal is pending, or -EBADFD
 * when the device has no iotlb.
 */
ssize_t vhost_chr_read_iter(struct vhost_dev *dev, struct iov_iter *to,
                            int noblock)
{
        DEFINE_WAIT(wait);
        struct vhost_msg_node *node;
        ssize_t ret = 0;
        unsigned size = sizeof(struct vhost_msg);

        if (iov_iter_count(to) < size)
                return 0;

        while (1) {
                /* Register on the wait queue before checking the list. */
                if (!noblock)
                        prepare_to_wait(&dev->wait, &wait,
                                        TASK_INTERRUPTIBLE);

                node = vhost_dequeue_msg(dev, &dev->read_list);
                if (node)
                        break;
                if (noblock) {
                        ret = -EAGAIN;
                        break;
                }
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }
                if (!dev->iotlb) {
                        ret = -EBADFD;
                        break;
                }

                schedule();
        }

        if (!noblock)
                finish_wait(&dev->wait, &wait);

        if (node) {
                struct vhost_iotlb_msg *msg;
                void *start = &node->msg;

                /* Pick the wire size and payload for the message version. */
                switch (node->msg.type) {
                case VHOST_IOTLB_MSG:
                        size = sizeof(node->msg);
                        msg = &node->msg.iotlb;
                        break;
                case VHOST_IOTLB_MSG_V2:
                        size = sizeof(node->msg_v2);
                        msg = &node->msg_v2.iotlb;
                        break;
                default:
                        BUG();
                        break;
                }

                ret = copy_to_iter(start, size, to);
                /*
                 * A short copy, or any message other than a MISS, ends the
                 * node's life here.
                 */
                if (ret != size || msg->type != VHOST_IOTLB_MISS) {
                        kfree(node);
                        return ret;
                }
                /* A delivered MISS is moved to dev->pending_list. */
                vhost_enqueue_msg(dev, &dev->pending_list, node);
        }

        return ret;
}
EXPORT_SYMBOL_GPL(vhost_chr_read_iter);

/*
 * Queue a VHOST_IOTLB_MISS message for @iova with permission @access on
 * the device's read_list.  The V2 layout is used when the backend
 * negotiated VHOST_BACKEND_F_IOTLB_MSG_V2.
 *
 * Returns 0 on success or -ENOMEM if no message node could be allocated.
 */
static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access)
{
        bool v2 = vhost_backend_has_feature(vq, VHOST_BACKEND_F_IOTLB_MSG_V2);
        int msg_type = v2 ? VHOST_IOTLB_MSG_V2 : VHOST_IOTLB_MSG;
        struct vhost_dev *dev = vq->dev;
        struct vhost_iotlb_msg *miss;
        struct vhost_msg_node *node;

        node = vhost_new_msg(vq, msg_type);
        if (!node)
                return -ENOMEM;

        if (!v2) {
                miss = &node->msg.iotlb;
        } else {
                node->msg_v2.type = VHOST_IOTLB_MSG_V2;
                miss = &node->msg_v2.iotlb;
        }

        miss->type = VHOST_IOTLB_MISS;
        miss->iova = iova;
        miss->perm = access;

        vhost_enqueue_msg(dev, &dev->read_list, node);

        return 0;
}

/*
 * Check that the three vring areas of a ring with @num entries pass
 * access_ok().  Always true when the vq uses a device IOTLB.
 */
static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
                         vring_desc_t __user *desc,
                         vring_avail_t __user *avail,
                         vring_used_t __user *used)

{
        /* If an IOTLB device is present, the vring addresses are
         * GIOVAs. Access validation occurs at prefetch time. */
        if (vq->iotlb)
                return true;

        if (!access_ok(desc, vhost_get_desc_size(vq, num)))
                return false;
        if (!access_ok(avail, vhost_get_avail_size(vq, num)))
                return false;

        return access_ok(used, vhost_get_used_size(vq, num));
}

/*
 * Cache @map as the metadata mapping for @type, provided the map grants
 * the permission that type needs: write for VHOST_ADDR_USED, read for
 * everything else.
 */
static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
                                 const struct vhost_iotlb_map *map,
                                 int type)
{
        int needed;

        if (type == VHOST_ADDR_USED)
                needed = VHOST_ACCESS_WO;
        else
                needed = VHOST_ACCESS_RO;

        if (likely(map->perm & needed))
                vq->meta_iotlb[type] = map;
}

/*
 * Check that [@addr, @addr + @len) is fully covered by IOTLB mappings that
 * grant @access.
 *
 * Returns true straight away if a cached mapping exists for @type.
 * Otherwise the range is walked map by map: a hole queues an IOTLB miss
 * for the first uncovered address and fails; a map lacking @access fails
 * without queuing anything.  When the first map alone covers the whole
 * range it is offered to vhost_vq_meta_update() for caching.
 */
static bool iotlb_access_ok(struct vhost_virtqueue *vq,
                            int access, u64 addr, u64 len, int type)
{
        const struct vhost_iotlb_map *map;
        struct vhost_iotlb *iotlb = vq->iotlb;
        const u64 first = addr;
        const u64 last = addr + len - 1;
        u64 covered = 0;

        if (vhost_vq_meta_fetch(vq, addr, len, type))
                return true;

        while (covered < len) {
                u64 chunk;

                map = vhost_iotlb_itree_first(iotlb, addr, last);
                if (!map || map->start > addr) {
                        vhost_iotlb_miss(vq, addr, access);
                        return false;
                }

                /* Report the possible access violation by
                 * request another translation from userspace.
                 */
                if (!(map->perm & access))
                        return false;

                /* Bytes of this map from addr up to the map's end. */
                chunk = map->size - addr + map->start;

                if (addr == first && chunk >= len)
                        vhost_vq_meta_update(vq, map, type);

                covered += chunk;
                addr += chunk;
        }

        return true;
}

/*
 * Make sure the descriptor table, avail ring and used ring of @vq are
 * reachable through the device IOTLB, checking them in that order and
 * stopping at the first failure.
 *
 * Returns 1 when the vq has no IOTLB or all three areas are accessible,
 * 0 otherwise.
 */
int vq_meta_prefetch(struct vhost_virtqueue *vq)
{
        unsigned int num = vq->num;

        if (!vq->iotlb)
                return 1;

        if (!iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->desc,
                             vhost_get_desc_size(vq, num), VHOST_ADDR_DESC))
                return 0;

        if (!iotlb_access_ok(vq, VHOST_MAP_RO, (u64)(uintptr_t)vq->avail,
                             vhost_get_avail_size(vq, num),
                             VHOST_ADDR_AVAIL))
                return 0;

        return iotlb_access_ok(vq, VHOST_MAP_WO, (u64)(uintptr_t)vq->used,
                               vhost_get_used_size(vq, num), VHOST_ADDR_USED);
}
EXPORT_SYMBOL_GPL(vq_meta_prefetch);

/*
 * Can we log writes?  Validates the device's current memory table with
 * logging forced on for every active virtqueue.
 *
 * Caller should have device mutex but not vq mutex.
 */
bool vhost_log_access_ok(struct vhost_dev *dev)
{
        struct vhost_iotlb *umem = dev->umem;

        return memory_access_ok(dev, umem, 1);
}
EXPORT_SYMBOL_GPL(vhost_log_access_ok);

/*
 * Check that the log slice covering the used ring at @log_addr is
 * accessible.  Trivially true when the vq uses a device IOTLB or when
 * @log_used is false.
 */
static bool vq_log_used_access_ok(struct vhost_virtqueue *vq,
                                  void __user *log_base,
                                  bool log_used,
                                  u64 log_addr)
{
        /* If an IOTLB device is present, log_addr is a GIOVA that
         * will never be logged by log_used(). */
        if (vq->iotlb)
                return true;

        if (!log_used)
                return true;

        return log_access_ok(log_base, log_addr,
                             vhost_get_used_size(vq, vq->num));
}

/*
 * Verify access for write logging: the vq's memory table must be valid
 * (including its log slices when VHOST_F_LOG_ALL is negotiated) and the
 * used-ring log address must be accessible.
 *
 * Caller should have vq mutex and device mutex.
 */
static bool vq_log_access_ok(struct vhost_virtqueue *vq,
                             void __user *log_base)
{
        int log_all = vhost_has_feature(vq, VHOST_F_LOG_ALL);

        if (!vq_memory_access_ok(log_base, vq->umem, log_all))
                return false;

        return vq_log_used_access_ok(vq, log_base, vq->log_used, vq->log_addr);
}

/*
 * Can we start vq?  Requires both the logging checks and the vring
 * address checks to pass, evaluated in that order.
 *
 * Caller should have vq mutex and device mutex.
 */
bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
{
        return vq_log_access_ok(vq, vq->log_base) &&
               vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
}
EXPORT_SYMBOL_GPL(vhost_vq_access_ok);

/*
 * Install a new guest memory table supplied by userspace at @m.
 *
 * The table is copied in, turned into a fresh vhost_iotlb with one RW
 * range per region, validated against all active virtqueues and then
 * swapped in for the device and for every vq (each under its vq mutex).
 *
 * Returns 0 on success, -EFAULT on a failed user copy, a region that
 * cannot be added or a table that fails validation, -EOPNOTSUPP when the
 * padding field is non-zero, -E2BIG when there are more than
 * max_mem_regions regions, and -ENOMEM on allocation failure.
 */
static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
{
        const unsigned long hdr_len = offsetof(struct vhost_memory, regions);
        struct vhost_memory hdr, *table;
        struct vhost_memory_region *reg, *end;
        struct vhost_iotlb *fresh, *stale;
        int i;

        if (copy_from_user(&hdr, m, hdr_len))
                return -EFAULT;
        if (hdr.padding)
                return -EOPNOTSUPP;
        if (hdr.nregions > max_mem_regions)
                return -E2BIG;

        table = kvzalloc(struct_size(table, regions, hdr.nregions),
                         GFP_KERNEL);
        if (!table)
                return -ENOMEM;

        memcpy(table, &hdr, hdr_len);
        if (copy_from_user(table->regions, m->regions,
                           flex_array_size(table, regions, hdr.nregions))) {
                kvfree(table);
                return -EFAULT;
        }

        fresh = iotlb_alloc();
        if (!fresh) {
                kvfree(table);
                return -ENOMEM;
        }

        end = table->regions + hdr.nregions;
        for (reg = table->regions; reg < end; reg++) {
                u64 first = reg->guest_phys_addr;
                u64 last = first + reg->memory_size - 1;

                if (vhost_iotlb_add_range(fresh, first, last,
                                          reg->userspace_addr,
                                          VHOST_MAP_RW))
                        goto err;
        }

        if (!memory_access_ok(d, fresh, 0))
                goto err;

        stale = d->umem;
        d->umem = fresh;

        /* All memory accesses are done under some VQ mutex. */
        for (i = 0; i < d->nvqs; ++i) {
                struct vhost_virtqueue *vq = d->vqs[i];

                mutex_lock(&vq->mutex);
                vq->umem = fresh;
                mutex_unlock(&vq->mutex);
        }

        kvfree(table);
        vhost_iotlb_free(stale);
        return 0;

err:
        vhost_iotlb_free(fresh);
        kvfree(table);
        return -EFAULT;
}

/*
 * VHOST_SET_VRING_NUM: set the ring size from the vhost_vring_state at
 * @argp.
 *
 * Returns 0 on success, -EBUSY if a backend is attached, -EFAULT on a
 * failed user copy, or -EINVAL unless the size is a non-zero power of two
 * no larger than 0xffff.
 */
static long vhost_vring_set_num(struct vhost_dev *d,
                                struct vhost_virtqueue *vq,
                                void __user *argp)
{
        struct vhost_vring_state state;

        /* Resizing ring with an active backend?
         * You don't want to do that. */
        if (vq->private_data)
                return -EBUSY;

        if (copy_from_user(&state, argp, sizeof(state)))
                return -EFAULT;

        if (state.num == 0 || state.num > 0xffff)
                return -EINVAL;
        /* More than one bit set means it is not a power of two. */
        if (state.num & (state.num - 1))
                return -EINVAL;

        vq->num = state.num;

        return 0;
}

/*
 * VHOST_SET_VRING_ADDR: set the descriptor, avail and used ring addresses
 * (and the used-ring log address/flag) from the vhost_vring_addr at @argp.
 *
 * Returns 0 on success, -EFAULT on a failed user copy or an address that
 * does not fit an unsigned long, -EOPNOTSUPP for unknown flags, and
 * -EINVAL for misaligned addresses or, when a backend is attached, for
 * addresses that fail the access checks.
 */
static long vhost_vring_set_addr(struct vhost_dev *d,
                                 struct vhost_virtqueue *vq,
                                 void __user *argp)
{
        struct vhost_vring_addr a;
        void __user *desc, *avail, *used;
        bool want_log;

        if (copy_from_user(&a, argp, sizeof(a)))
                return -EFAULT;
        if (a.flags & ~(0x1 << VHOST_VRING_F_LOG))
                return -EOPNOTSUPP;

        /* For 32bit, verify that the top 32bits of the user
           data are set to zero. */
        if ((u64)(unsigned long)a.desc_user_addr != a.desc_user_addr)
                return -EFAULT;
        if ((u64)(unsigned long)a.used_user_addr != a.used_user_addr)
                return -EFAULT;
        if ((u64)(unsigned long)a.avail_user_addr != a.avail_user_addr)
                return -EFAULT;

        /* Make sure it's safe to cast pointers to vring types. */
        BUILD_BUG_ON(__alignof__ *vq->avail > VRING_AVAIL_ALIGN_SIZE);
        BUILD_BUG_ON(__alignof__ *vq->used > VRING_USED_ALIGN_SIZE);
        if (a.avail_user_addr & (VRING_AVAIL_ALIGN_SIZE - 1))
                return -EINVAL;
        if (a.used_user_addr & (VRING_USED_ALIGN_SIZE - 1))
                return -EINVAL;
        if (a.log_guest_addr & (VRING_USED_ALIGN_SIZE - 1))
                return -EINVAL;

        desc = (void __user *)(unsigned long)a.desc_user_addr;
        avail = (void __user *)(unsigned long)a.avail_user_addr;
        used = (void __user *)(unsigned long)a.used_user_addr;
        want_log = !!(a.flags & (0x1 << VHOST_VRING_F_LOG));

        /* We only verify access here if backend is configured.
         * If it is not, we don't as size might not have been setup.
         * We will verify when backend is configured. */
        if (vq->private_data) {
                if (!vq_access_ok(vq, vq->num, desc, avail, used))
                        return -EINVAL;

                /* Also validate log access for used ring if enabled. */
                if (!vq_log_used_access_ok(vq, vq->log_base, want_log,
                                           a.log_guest_addr))
                        return -EINVAL;
        }

        vq->log_used = want_log;
        vq->desc = desc;
        vq->avail = avail;
        vq->log_addr = a.log_guest_addr;
        vq->used = used;

        return 0;
}

/*
 * Run VHOST_SET_VRING_NUM or VHOST_SET_VRING_ADDR for @vq with the vq
 * mutex held.  Any other @ioctl value is a caller bug and hits BUG().
 */
static long vhost_vring_set_num_addr(struct vhost_dev *d,
                                     struct vhost_virtqueue *vq,
                                     unsigned int ioctl,
                                     void __user *argp)
{
        long r;

        mutex_lock(&vq->mutex);

        if (ioctl == VHOST_SET_VRING_NUM)
                r = vhost_vring_set_num(d, vq, argp);
        else if (ioctl == VHOST_SET_VRING_ADDR)
                r = vhost_vring_set_addr(d, vq, argp);
        else
                BUG();

        mutex_unlock(&vq->mutex);

        return r;
}
/*
 * Handle the per-virtqueue ioctls.
 *
 * @d:     vhost device
 * @ioctl: ioctl command
 * @argp:  userspace argument; vhost_get_vq_from_user() uses it to select
 *         the virtqueue and its index
 *
 * VHOST_SET_VRING_NUM and VHOST_SET_VRING_ADDR are delegated to
 * vhost_vring_set_num_addr(); everything else is handled below under the
 * vq mutex.  Returns a negative errno on failure, -ENOIOCTLCMD for an
 * unknown command, and otherwise whatever non-negative value
 * vhost_get_vq_from_user() left in r (or the vhost_poll_start() result
 * when polling is (re)started).
 */
long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
        struct file *eventfp, *filep = NULL;
        bool pollstart = false, pollstop = false;
        struct eventfd_ctx *ctx = NULL;
        struct vhost_virtqueue *vq;
        struct vhost_vring_state s;
        struct vhost_vring_file f;
        u32 idx;
        long r;

        r = vhost_get_vq_from_user(d, argp, &vq, &idx);
        if (r < 0)
                return r;

        if (ioctl == VHOST_SET_VRING_NUM ||
            ioctl == VHOST_SET_VRING_ADDR) {
                return vhost_vring_set_num_addr(d, vq, ioctl, argp);
        }

        mutex_lock(&vq->mutex);

        switch (ioctl) {
        case VHOST_SET_VRING_BASE:
                /* Moving base with an active backend?
                 * You don't want to do that. */
                if (vq->private_data) {
                        r = -EBUSY;
                        break;
                }
                if (copy_from_user(&s, argp, sizeof s)) {
                        r = -EFAULT;
                        break;
                }
                /*
                 * With VIRTIO_F_RING_PACKED, s.num carries last_avail_idx
                 * in its low 16 bits and last_used_idx in the next 16.
                 */
                if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
                        vq->last_avail_idx = s.num & 0xffff;
                        vq->last_used_idx = (s.num >> 16) & 0xffff;
                } else {
                        if (s.num > 0xffff) {
                                r = -EINVAL;
                                break;
                        }
                        vq->last_avail_idx = s.num;
                }
                /* Forget the cached index value. */
                vq->avail_idx = vq->last_avail_idx;
                break;
        case VHOST_GET_VRING_BASE:
                s.index = idx;
                /* Mirror of the packed encoding used by SET_VRING_BASE. */
                if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
                        s.num = (u32)vq->last_avail_idx | ((u32)vq->last_used_idx << 16);
                else
                        s.num = vq->last_avail_idx;
                if (copy_to_user(argp, &s, sizeof s))
                        r = -EFAULT;
                break;
        case VHOST_SET_VRING_KICK:
                if (copy_from_user(&f, argp, sizeof f)) {
                        r = -EFAULT;
                        break;
                }
                eventfp = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_fget(f.fd);
                if (IS_ERR(eventfp)) {
                        r = PTR_ERR(eventfp);
                        break;
                }
                /*
                 * On a change, remember the old file in filep so it is
                 * released below, and note whether polling has to be
                 * stopped on the old file and/or started on the new one.
                 * If the file is unchanged, filep takes the new reference
                 * so that it is dropped again.
                 */
                if (eventfp != vq->kick) {
                        pollstop = (filep = vq->kick) != NULL;
                        pollstart = (vq->kick = eventfp) != NULL;
                } else
                        filep = eventfp;
                break;
        case VHOST_SET_VRING_CALL:
                if (copy_from_user(&f, argp, sizeof f)) {
                        r = -EFAULT;
                        break;
                }
                ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
                if (IS_ERR(ctx)) {
                        r = PTR_ERR(ctx);
                        break;
                }

                /* After the swap, ctx holds the old context to be put. */
                swap(ctx, vq->call_ctx.ctx);
                break;
        case VHOST_SET_VRING_ERR:
                if (copy_from_user(&f, argp, sizeof f)) {
                        r = -EFAULT;
                        break;
                }
                ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
                if (IS_ERR(ctx)) {
                        r = PTR_ERR(ctx);
                        break;
                }
                /* After the swap, ctx holds the old context to be put. */
                swap(ctx, vq->error_ctx);
                break;
        case VHOST_SET_VRING_ENDIAN:
                r = vhost_set_vring_endian(vq, argp);
                break;
        case VHOST_GET_VRING_ENDIAN:
                r = vhost_get_vring_endian(vq, idx, argp);
                break;
        case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
                if (copy_from_user(&s, argp, sizeof(s))) {
                        r = -EFAULT;
                        break;
                }
                vq->busyloop_timeout = s.num;
                break;
        case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
                s.index = idx;
                s.num = vq->busyloop_timeout;
                if (copy_to_user(argp, &s, sizeof(s)))
                        r = -EFAULT;
                break;
        default:
                r = -ENOIOCTLCMD;
        }

        /* Stop polling the old kick file before its reference is dropped. */
        if (pollstop && vq->handle_kick)
                vhost_poll_stop(&vq->poll);

        /* ctx may be NULL, an old context, or an ERR_PTR from a failed fdget. */
        if (!IS_ERR_OR_NULL(ctx))
                eventfd_ctx_put(ctx);
        if (filep)
                fput(filep);

        if (pollstart && vq->handle_kick)
                r = vhost_poll_start(&vq->poll, vq->kick);

        mutex_unlock(&vq->mutex);

        /* The flush for a replaced kick file runs without the vq mutex. */
        if (pollstop && vq->handle_kick)
                vhost_dev_flush(vq->poll.dev);
        return r;
}
EXPORT_SYMBOL_GPL(vhost_vring_ioctl);

/*
 * Allocate a new IOTLB, install it on the device and on each of its
 * virtqueues, then free the IOTLB that was installed before.
 *
 * Returns 0 on success, or -ENOMEM when the allocation fails; in that case
 * nothing on the device has been modified.
 */
int vhost_init_device_iotlb(struct vhost_dev *d)
{
        struct vhost_iotlb *fresh = iotlb_alloc();
        struct vhost_iotlb *stale;
        int n;

        if (!fresh)
                return -ENOMEM;

        stale = d->iotlb;
        d->iotlb = fresh;

        /* Repoint each queue under its own mutex and reset its metadata. */
        for (n = 0; n < d->nvqs; n++) {
                struct vhost_virtqueue *vq = d->vqs[n];

                mutex_lock(&vq->mutex);
                vq->iotlb = fresh;
                __vhost_vq_meta_reset(vq);
                mutex_unlock(&vq->mutex);
        }

        vhost_iotlb_free(stale);
        return 0;
}
EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);

/*
 * Device-wide ioctl dispatcher.  Caller must have device mutex.
 *
 * VHOST_SET_OWNER is served before the ownership check; every other command
 * requires vhost_dev_check_owner() to succeed first.  Commands that are not
 * handled here return -ENOIOCTLCMD.
 */
long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
        struct eventfd_ctx *ctx;
        void __user *base;
        u64 p;
        long r;
        int i, fd;

        /* If you are not the owner, you can become one */
        if (ioctl == VHOST_SET_OWNER)
                return vhost_dev_set_owner(d);

        /* You must be the owner to do anything else */
        r = vhost_dev_check_owner(d);
        if (r)
                return r;

        switch (ioctl) {
        case VHOST_SET_MEM_TABLE:
                return vhost_set_memory(d, argp);

        case VHOST_SET_LOG_BASE:
                if (copy_from_user(&p, argp, sizeof p))
                        return -EFAULT;
                /* The address has to survive a round trip through a pointer. */
                if ((u64)(unsigned long)p != p)
                        return -EFAULT;
                base = (void __user *)(unsigned long)p;
                for (i = 0; i < d->nvqs; ++i) {
                        struct vhost_virtqueue *vq = d->vqs[i];

                        mutex_lock(&vq->mutex);
                        /* If ring is inactive, will check when it's enabled. */
                        if (vq->private_data && !vq_log_access_ok(vq, base))
                                r = -EFAULT;
                        else
                                vq->log_base = base;
                        mutex_unlock(&vq->mutex);
                }
                return r;

        case VHOST_SET_LOG_FD:
                r = get_user(fd, (int __user *)argp);
                if (r < 0)
                        return r;
                if (fd == VHOST_FILE_UNBIND)
                        ctx = NULL;
                else
                        ctx = eventfd_ctx_fdget(fd);
                if (IS_ERR(ctx))
                        return PTR_ERR(ctx);
                /* After the swap, ctx holds the previous log context. */
                swap(ctx, d->log_ctx);
                for (i = 0; i < d->nvqs; ++i) {
                        mutex_lock(&d->vqs[i]->mutex);
                        d->vqs[i]->log_ctx = d->log_ctx;
                        mutex_unlock(&d->vqs[i]->mutex);
                }
                if (ctx)
                        eventfd_ctx_put(ctx);
                return r;

        default:
                return -ENOIOCTLCMD;
        }
}
EXPORT_SYMBOL_GPL(vhost_dev_ioctl);

/*
 * Set bit @nr, counted from the byte at user address @addr.
 *
 * The page containing @addr is pinned for writing, mapped, updated with
 * set_bit() and then unpinned and marked dirty.  Returns 0 on success or the
 * negative value reported by pin_user_pages_fast().
 *
 * TODO: This is really inefficient.  We need something like get_user()
 * (instruction directly accesses the data, with an exception table entry
 * returning -EFAULT). See Documentation/arch/x86/exception-tables.rst.
 */
static int set_bit_to_user(int nr, void __user *addr)
{
        unsigned long uaddr = (unsigned long)addr;
        /* Bit index relative to the start of the mapped page. */
        int bit = nr + (uaddr % PAGE_SIZE) * 8;
        struct page *page;
        void *kaddr;
        int pinned;

        pinned = pin_user_pages_fast(uaddr, 1, FOLL_WRITE, &page);
        if (pinned < 0)
                return pinned;
        BUG_ON(pinned != 1);

        kaddr = kmap_atomic(page);
        set_bit(bit, kaddr);
        kunmap_atomic(kaddr);

        unpin_user_pages_dirty_lock(&page, 1, true);
        return 0;
}

/*
 * Set, in the user bitmap at @log_base, the bit of every VHOST_PAGE_SIZE page
 * touched by [write_address, write_address + write_length).  Page N is
 * bit N % 8 of bitmap byte N / 8.
 *
 * Returns 0 for an empty range, -EFAULT when a bitmap byte address does not
 * fit in an unsigned long, or the result of set_bit_to_user().
 */
static int log_write(void __user *log_base,
                     u64 write_address, u64 write_length)
{
        const u64 bitmap = (u64)(unsigned long)log_base;
        u64 page_nr = write_address / VHOST_PAGE_SIZE;
        u64 remaining;
        int r;

        if (!write_length)
                return 0;

        /* Measure from the start of the first page touched. */
        remaining = write_length + write_address % VHOST_PAGE_SIZE;

        for (;; remaining -= VHOST_PAGE_SIZE, ++page_nr) {
                u64 byte_addr = bitmap + page_nr / 8;

                if ((u64)(unsigned long)byte_addr != byte_addr)
                        return -EFAULT;

                r = set_bit_to_user(page_nr % 8,
                                    (void __user *)(unsigned long)byte_addr);
                /* Stop on failure, or once the last page has been marked. */
                if (r < 0 || remaining <= VHOST_PAGE_SIZE)
                        return r;
        }
}

/*
 * Log a write to the range [hva, hva + len) expressed in the address space
 * of the u->addr field of the entries on vq->umem->list.
 *
 * Each entry that overlaps the remaining range has its overlapping part
 * converted to (u->start + offset) and passed to log_write() against
 * vq->log_base.  The range is then advanced by the smallest overlap length
 * seen in that pass (or by the whole remainder when nothing overlapped).
 *
 * Returns 0 on success, a negative value from log_write(), or -EFAULT when
 * no entry has overlapped the range so far.
 */
static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len)
{
        struct vhost_iotlb *umem = vq->umem;
        struct vhost_iotlb_map *u;
        u64 start, end, l, min;
        int r;
        /* NOTE(review): hit is never cleared between passes of the while
         * loop, so once one overlap has been found a later pass with no
         * overlap consumes the remainder and returns 0 instead of -EFAULT.
         * Confirm whether that is intended.
         */
        bool hit = false;

        while (len) {
                /* Default step: the whole remainder. */
                min = len;
                /* More than one GPA can be mapped into a single HVA. So
                 * iterate all possible umems here to be safe.
                 */
                list_for_each_entry(u, &umem->list, link) {
                        /* Skip entries that do not intersect the range;
                         * inclusive end addresses are used on both sides.
                         */
                        if (u->addr > hva - 1 + len ||
                            u->addr - 1 + u->size < hva)
                                continue;
                        /* Intersection [start, end], both inclusive. */
                        start = max(u->addr, hva);
                        end = min(u->addr - 1 + u->size, hva - 1 + len);
                        l = end - start + 1;
                        r = log_write(vq->log_base,
                                      u->start + start - u->addr,
                                      l);
                        if (r < 0)
                                return r;
                        hit = true;
                        min = min(l, min);
                }

                if (!hit)
                        return -EFAULT;

                len -= min;
                hva += min;
        }

        return 0;
}

/*
 * Log a write of @len bytes located @used_offset bytes past vq->used.
 *
 * Without an IOTLB the range is logged directly at vq->log_addr +
 * @used_offset.  With an IOTLB the range is first translated into
 * vq->log_iov (at most 64 entries) and every resulting segment is logged
 * through log_write_hva().
 *
 * Returns 0 on success, a negative value from translate_desc() when the
 * translation fails, or the first non-zero result of log_write_hva().
 */
static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len)
{
        struct iovec *iov = vq->log_iov;
        int i, nr_iov, r;

        if (!vq->iotlb)
                return log_write(vq->log_base, vq->log_addr + used_offset, len);

        nr_iov = translate_desc(vq, (uintptr_t)vq->used + used_offset,
                                len, iov, 64, VHOST_ACCESS_WO);
        if (nr_iov < 0)
                return nr_iov;

        /*
         * The segment count must not share a variable with the per-segment
         * result: a successful log_write_hva() returns 0, which would
         * otherwise become the loop bound and end the loop after the first
         * segment, leaving the remaining segments unlogged.
         */
        for (i = 0; i < nr_iov; i++) {
                r = log_write_hva(vq, (uintptr_t)iov[i].iov_base,
                                  iov[i].iov_len);
                if (r)
                        return r;
        }

        return 0;
}

/*
 * Log guest memory writes for a completed buffer.
 *
 * With an IOTLB, each of the @count entries of @iov is logged through
 * log_write_hva().  Otherwise up to @len bytes are logged from the @log_num
 * entries of @log, and vq->log_ctx (when set) is signalled once @len has been
 * fully accounted for.
 *
 * Returns 0 on success or the first negative result of the logging helper.
 */
int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
                    unsigned int log_num, u64 len, struct iovec *iov, int count)
{
        int idx, err;

        /* Make sure data written is seen before log. */
        smp_wmb();

        if (vq->iotlb) {
                for (idx = 0; idx < count; idx++) {
                        err = log_write_hva(vq, (uintptr_t)iov[idx].iov_base,
                                            iov[idx].iov_len);
                        if (err < 0)
                                return err;
                }
                return 0;
        }

        for (idx = 0; idx < log_num; ++idx) {
                u64 chunk = min(log[idx].len, len);

                err = log_write(vq->log_base, log[idx].addr, chunk);
                if (err < 0)
                        return err;

                len -= chunk;
                if (len)
                        continue;

                /* Everything requested has been logged. */
                if (vq->log_ctx)
                        eventfd_signal(vq->log_ctx);
                return 0;
        }

        /* Length written exceeds what we have stored. This is a bug. */
        BUG();
        return 0;
}
EXPORT_SYMBOL_GPL(vhost_log_write);

static int vhost_update_used_flags(struct vhost_virtqueue *vq)
{
        void __user *used;
        if (vhost_put_used_flags(vq))
                return -EFAULT;
        if (unlikely(vq->log_used)) {
                /* Make sure the flag is seen before log. */
                smp_wmb();
                /* Log used flag write. */
                used = &vq->used->flags;
                log_used(vq, (used - (void __user *)vq->used),
                         sizeof vq->used->flags);
                if (vq->log_ctx)
                        eventfd_signal(vq->log_ctx);
        }
        return 0;
}

static int vhost_update_avail_event(struct vhost_virtqueue *vq)
{
        if (vhost_put_avail_event(vq))
                return -EFAULT;
        if (unlikely(vq->log_used)) {
                void __user *used;
                /* Make sure the event is seen before log. */
                smp_wmb();
                /* Log avail event write */
                used = vhost_avail_event(vq);
                log_used(vq, (used - (void __user *)vq->used),
                         sizeof *vhost_avail_event(vq));
                if (vq->log_ctx)
                        eventfd_signal(vq->log_ctx);
        }
        return 0;
}

/*
 * Prepare a virtqueue that has private_data set for use: initialise its
 * endianness, write the used flags and load last_used_idx from the used ring.
 *
 * Returns 0 on success (or when private_data is not set), otherwise a
 * negative error code; on failure vq->is_le is put back to the value it had
 * on entry.
 */
int vhost_vq_init_access(struct vhost_virtqueue *vq)
{
        const bool saved_is_le = vq->is_le;
        __virtio16 used_idx;
        int r;

        if (!vq->private_data)
                return 0;

        vhost_init_is_le(vq);

        r = vhost_update_used_flags(vq);
        if (r)
                goto restore_endian;

        vq->signalled_used_valid = false;

        /* The access_ok() check is only made when no IOTLB is in use. */
        if (!vq->iotlb && !access_ok(&vq->used->idx, sizeof vq->used->idx)) {
                r = -EFAULT;
                goto restore_endian;
        }

        r = vhost_get_used_idx(vq, &used_idx);
        if (r) {
                vq_err(vq, "Can't access used idx at %p\n",
                       &vq->used->idx);
                goto restore_endian;
        }

        vq->last_used_idx = vhost16_to_cpu(vq, used_idx);
        return 0;

restore_endian:
        vq->is_le = saved_is_le;
        return r;
}
EXPORT_SYMBOL_GPL(vhost_vq_init_access);

/*
 * Translate the range [addr, addr + len) into iovec entries using the
 * device IOTLB when one is set, otherwise the device umem.
 *
 * @iov receives one entry per map that the range crosses; at most @iov_size
 * entries are written.  @access is tested against each map's perm bits.
 *
 * Returns the number of entries filled in, or:
 *   -ENOBUFS  more than @iov_size entries would be needed,
 *   -EFAULT   part of the range is not mapped and no device IOTLB is in use,
 *   -EAGAIN   part of the range is not mapped in the device IOTLB; a miss is
 *             reported through vhost_iotlb_miss() before returning,
 *   -EPERM    a map does not grant @access.
 */
static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
                          struct iovec iov[], int iov_size, int access)
{
        const struct vhost_iotlb_map *map;
        struct vhost_dev *dev = vq->dev;
        struct vhost_iotlb *umem = dev->iotlb ? dev->iotlb : dev->umem;
        struct iovec *_iov;
        /* s: bytes covered so far; last: inclusive end of the range. */
        u64 s = 0, last = addr + len - 1;
        int ret = 0;

        while ((u64)len > s) {
                u64 size;
                if (unlikely(ret >= iov_size)) {
                        ret = -ENOBUFS;
                        break;
                }

                map = vhost_iotlb_itree_first(umem, addr, last);
                /* No map, or the first map begins after addr: there is a hole. */
                if (map == NULL || map->start > addr) {
                        if (umem != dev->iotlb) {
                                ret = -EFAULT;
                                break;
                        }
                        ret = -EAGAIN;
                        break;
                } else if (!(map->perm & access)) {
                        ret = -EPERM;
                        break;
                }

                _iov = iov + ret;
                /* Bytes from addr to the end of this map. */
                size = map->size - addr + map->start;
                _iov->iov_len = min((u64)len - s, size);
                _iov->iov_base = (void __user *)(unsigned long)
                                 (map->addr + addr - map->start);
                s += size;
                addr += size;
                ++ret;
        }

        /* addr now points at the first untranslated byte. */
        if (ret == -EAGAIN)
                vhost_iotlb_miss(vq, addr, access);
        return ret;
}

/* Each buffer in the virtqueues is actually a chain of descriptors.  This
 * function returns the index stored in the descriptor's next field when
 * VRING_DESC_F_NEXT is set, or -1U if we're at the end of the chain.  The
 * index is not range checked here. */
static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
{
        /* Chained: hand back the successor, read exactly once. */
        if (desc->flags & cpu_to_vhost16(vq, VRING_DESC_F_NEXT))
                return vhost16_to_cpu(vq, READ_ONCE(desc->next));

        /* If this descriptor says it doesn't chain, we're done. */
        return -1U;
}

/*
 * Walk the descriptor table referenced by the indirect descriptor @indirect
 * and append the translated buffers to @iov.
 *
 * @iov/@iov_size:   output array and its capacity; new entries are placed
 *                   after the *in_num + *out_num entries already present.
 * @out_num/@in_num: running counts of entries translated read-only and
 *                   write-only respectively; updated in place.
 * @log/@log_num:    when @log is non-NULL, the address and length of each
 *                   writable descriptor that produced entries is recorded.
 *
 * Returns 0 on success, -EINVAL for a malformed table (bad length, loop,
 * unreadable or nested indirect descriptor, read-only after write-only),
 * -E2BIG when the table has more than USHRT_MAX + 1 entries, or the negative
 * value returned by translate_desc().
 */
static int get_indirect(struct vhost_virtqueue *vq,
                        struct iovec iov[], unsigned int iov_size,
                        unsigned int *out_num, unsigned int *in_num,
                        struct vhost_log *log, unsigned int *log_num,
                        struct vring_desc *indirect)
{
        struct vring_desc desc;
        unsigned int i = 0, count, found = 0;
        u32 len = vhost32_to_cpu(vq, indirect->len);
        struct iov_iter from;
        int ret, access;

        /* Sanity check */
        if (unlikely(len % sizeof desc)) {
                vq_err(vq, "Invalid length in indirect descriptor: "
                       "len 0x%llx not multiple of 0x%zx\n",
                       (unsigned long long)len,
                       sizeof desc);
                return -EINVAL;
        }

        /* Translate the table itself into vq->indirect for reading. */
        ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
                             UIO_MAXIOV, VHOST_ACCESS_RO);
        if (unlikely(ret < 0)) {
                if (ret != -EAGAIN)
                        vq_err(vq, "Translation failure %d in indirect.\n", ret);
                return ret;
        }
        iov_iter_init(&from, ITER_SOURCE, vq->indirect, ret, len);
        count = len / sizeof desc;
        /* Buffers are chained via a 16 bit next field, so
         * we can have at most 2^16 of these. */
        if (unlikely(count > USHRT_MAX + 1)) {
                /* NOTE(review): prints indirect->len as stored, without the
                 * vhost32_to_cpu() conversion applied to len above. */
                vq_err(vq, "Indirect buffer length too big: %d\n",
                       indirect->len);
                return -E2BIG;
        }

        do {
                /* Entries already in iov; new ones go after them. */
                unsigned iov_count = *in_num + *out_num;
                /* More steps than table entries means the chain loops. */
                if (unlikely(++found > count)) {
                        vq_err(vq, "Loop detected: last one at %u "
                               "indirect size %u\n",
                               i, count);
                        return -EINVAL;
                }
                /* Descriptors are consumed sequentially from the iterator. */
                if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
                        vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
                               i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
                        return -EINVAL;
                }
                if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
                        vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
                               i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
                        return -EINVAL;
                }

                if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
                        access = VHOST_ACCESS_WO;
                else
                        access = VHOST_ACCESS_RO;

                ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
                                     vhost32_to_cpu(vq, desc.len), iov + iov_count,
                                     iov_size - iov_count, access);
                if (unlikely(ret < 0)) {
                        if (ret != -EAGAIN)
                                vq_err(vq, "Translation failure %d indirect idx %d\n",
                                        ret, i);
                        return ret;
                }
                /* If this is an input descriptor, increment that count. */
                if (access == VHOST_ACCESS_WO) {
                        *in_num += ret;
                        if (unlikely(log && ret)) {
                                log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
                                log[*log_num].len = vhost32_to_cpu(vq, desc.len);
                                ++*log_num;
                        }
                } else {
                        /* If it's an output descriptor, they're all supposed
                         * to come before any input descriptors. */
                        if (unlikely(*in_num)) {
                                vq_err(vq, "Indirect descriptor "
                                       "has out after in: idx %d\n", i);
                                return -EINVAL;
                        }
                        *out_num += ret;
                }
        } while ((i = next_desc(vq, &desc)) != -1);
        return 0;
}

/* This looks in the virtqueue for the first available buffer, and converts
 * it to an iovec for convenient access.  Since descriptors consist of some
 * number of output then some number of input descriptors, it's actually two
 * iovecs, but we pack them into one and note how many of each there were.
 *
 * @iov/@iov_size:   output array and its capacity.
 * @out_num/@in_num: set to the number of entries translated read-only and
 *                   write-only respectively.
 * @log/@log_num:    when @log is non-NULL, *log_num is reset to 0 and the
 *                   address/length of each writable descriptor that produced
 *                   entries is recorded.
 *
 * This function returns the descriptor number found, or vq->num (which is
 * never a valid descriptor number) if none was found.  A negative code is
 * returned on error.  vq->last_avail_idx is incremented only on success. */
int vhost_get_vq_desc(struct vhost_virtqueue *vq,
                      struct iovec iov[], unsigned int iov_size,
                      unsigned int *out_num, unsigned int *in_num,
                      struct vhost_log *log, unsigned int *log_num)
{
        struct vring_desc desc;
        unsigned int i, head, found = 0;
        u16 last_avail_idx;
        __virtio16 avail_idx;
        __virtio16 ring_head;
        int ret, access;

        /* Check it isn't doing very strange things with descriptor numbers. */
        last_avail_idx = vq->last_avail_idx;

        /* The cached avail index is only refreshed once it has been used up. */
        if (vq->avail_idx == vq->last_avail_idx) {
                if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
                        vq_err(vq, "Failed to access avail idx at %p\n",
                                &vq->avail->idx);
                        return -EFAULT;
                }
                vq->avail_idx = vhost16_to_cpu(vq, avail_idx);

                /* The index may not be more than vq->num ahead of ours. */
                if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
                        vq_err(vq, "Guest moved avail index from %u to %u",
                                last_avail_idx, vq->avail_idx);
                        return -EFAULT;
                }

                /* If there's nothing new since last we looked, return
                 * invalid.
                 */
                if (vq->avail_idx == last_avail_idx)
                        return vq->num;

                /* Only get avail ring entries after they have been
                 * exposed by guest.
                 */
                smp_rmb();
        }

        /* Grab the next descriptor number they're advertising, and increment
         * the index we've seen. */
        if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
                vq_err(vq, "Failed to read head: idx %d address %p\n",
                       last_avail_idx,
                       &vq->avail->ring[last_avail_idx % vq->num]);
                return -EFAULT;
        }

        head = vhost16_to_cpu(vq, ring_head);

        /* If their number is silly, that's an error. */
        if (unlikely(head >= vq->num)) {
                vq_err(vq, "Guest says index %u > %u is available",
                       head, vq->num);
                return -EINVAL;
        }

        /* When we start there are none of either input nor output. */
        *out_num = *in_num = 0;
        if (unlikely(log))
                *log_num = 0;

        i = head;
        do {
                /* Entries already in iov; new ones go after them. */
                unsigned iov_count = *in_num + *out_num;
                if (unlikely(i >= vq->num)) {
                        vq_err(vq, "Desc index is %u > %u, head = %u",
                               i, vq->num, head);
                        return -EINVAL;
                }
                /* More steps than ring entries means the chain loops. */
                if (unlikely(++found > vq->num)) {
                        vq_err(vq, "Loop detected: last one at %u "
                               "vq size %u head %u\n",
                               i, vq->num, head);
                        return -EINVAL;
                }
                ret = vhost_get_desc(vq, &desc, i);
                if (unlikely(ret)) {
                        vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
                               i, vq->desc + i);
                        return -EFAULT;
                }
                /* An indirect descriptor is expanded by get_indirect(); the
                 * continue then evaluates the loop condition on this same
                 * descriptor's next field. */
                if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
                        ret = get_indirect(vq, iov, iov_size,
                                           out_num, in_num,
                                           log, log_num, &desc);
                        if (unlikely(ret < 0)) {
                                if (ret != -EAGAIN)
                                        vq_err(vq, "Failure detected "
                                                "in indirect descriptor at idx %d\n", i);
                                return ret;
                        }
                        continue;
                }

                if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
                        access = VHOST_ACCESS_WO;
                else
                        access = VHOST_ACCESS_RO;
                ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
                                     vhost32_to_cpu(vq, desc.len), iov + iov_count,
                                     iov_size - iov_count, access);
                if (unlikely(ret < 0)) {
                        if (ret != -EAGAIN)
                                vq_err(vq, "Translation failure %d descriptor idx %d\n",
                                        ret, i);
                        return ret;
                }
                if (access == VHOST_ACCESS_WO) {
                        /* If this is an input descriptor,
                         * increment that count. */
                        *in_num += ret;
                        if (unlikely(log && ret)) {
                                log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
                                log[*log_num].len = vhost32_to_cpu(vq, desc.len);
                                ++*log_num;
                        }
                } else {
                        /* If it's an output descriptor, they're all supposed
                         * to come before any input descriptors. */
                        if (unlikely(*in_num)) {
                                vq_err(vq, "Descriptor has out after in: "
                                       "idx %d\n", i);
                                return -EINVAL;
                        }
                        *out_num += ret;
                }
        } while ((i = next_desc(vq, &desc)) != -1);

        /* On success, increment avail index. */
        vq->last_avail_idx++;

        /* Assume notifications from guest are disabled at this point,
         * if they aren't we would need to update avail_event index. */
        BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
        return head;
}
EXPORT_SYMBOL_GPL(vhost_get_vq_desc);

/* Undo @n successful vhost_get_vq_desc() calls by moving last_avail_idx back
 * by @n.  Useful for error handling. */
void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
{
        vq->last_avail_idx = vq->last_avail_idx - n;
}
EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);

/* After we've used one of their buffers, we tell them about it.  We'll then
 * want to notify the guest, using eventfd. */
int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
{
        struct vring_used_elem heads = {
                cpu_to_vhost32(vq, head),
                cpu_to_vhost32(vq, len)
        };

        return vhost_add_used_n(vq, &heads, 1);
}
EXPORT_SYMBOL_GPL(vhost_add_used);

static int __vhost_add_used_n(struct vhost_virtqueue *vq,
                            struct vring_used_elem *heads,
                            unsigned count)
{
        vring_used_elem_t __user *used;
        u16 old, new;
        int start;

        start = vq->last_used_idx & (vq->num - 1);
        used = vq->used->ring + start;
        if (vhost_put_used(vq, heads, start, count)) {
                vq_err(vq, "Failed to write used");
                return -EFAULT;
        }
        if (unlikely(vq->log_used)) {
                /* Make sure data is seen before log. */
                smp_wmb();
                /* Log used ring entry write. */
                log_used(vq, ((void __user *)used - (void __user *)vq->used),
                         count * sizeof *used);
        }
        old = vq->last_used_idx;
        new = (vq->last_used_idx += count);
        /* If the driver never bothers to signal in a very long while,
         * used index might wrap around. If that happens, invalidate
         * signalled_used index we stored. TODO: make sure driver
         * signals at least once in 2^16 and remove this. */
        if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
                vq->signalled_used_valid = false;
        return 0;
}

/* After we've used one of their buffers, we tell them about it.  We'll then
 * want to notify the guest, using eventfd.
 *
 * Adds @count used elements from @heads, splitting the batch in two when it
 * runs past the end of the ring, then writes the used index and logs that
 * write when vq->log_used is set.
 *
 * Returns a negative value if the first part of a split batch fails, -EFAULT
 * if the used index cannot be written, and otherwise the result of the last
 * __vhost_add_used_n() call. */
int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
                     unsigned count)
{
        int slot, room, r;

        slot = vq->last_used_idx & (vq->num - 1);
        room = vq->num - slot;

        /* Not enough slots before the ring end: fill the tail first. */
        if (room < count) {
                r = __vhost_add_used_n(vq, heads, room);
                if (r < 0)
                        return r;
                heads += room;
                count -= room;
        }
        r = __vhost_add_used_n(vq, heads, count);

        /* Make sure buffer is written before we update index. */
        smp_wmb();
        if (vhost_put_used_idx(vq)) {
                vq_err(vq, "Failed to increment used idx");
                return -EFAULT;
        }

        if (!unlikely(vq->log_used))
                return r;

        /* Make sure used idx is seen before log. */
        smp_wmb();
        /* Log used index update. */
        log_used(vq, offsetof(struct vring_used, idx),
                 sizeof vq->used->idx);
        if (vq->log_ctx)
                eventfd_signal(vq->log_ctx);
        return r;
}
EXPORT_SYMBOL_GPL(vhost_add_used_n);

/*
 * Decide whether the guest should be signalled for this virtqueue.
 *
 * Without VIRTIO_RING_F_EVENT_IDX the decision follows the
 * VRING_AVAIL_F_NO_INTERRUPT bit of the avail flags.  With it,
 * vq->signalled_used is updated to vq->last_used_idx and the used event
 * value is compared through vring_need_event().  Any failure to read guest
 * data results in true.
 */
static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
        __virtio16 used_event;
        __u16 prev, cur;
        bool prev_valid;

        /* Flush out used index updates. This is paired
         * with the barrier that the Guest executes when enabling
         * interrupts. */
        smp_mb();

        if (vhost_has_feature(vq, VIRTIO_F_NOTIFY_ON_EMPTY) &&
            unlikely(vq->avail_idx == vq->last_avail_idx))
                return true;

        if (!vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
                __virtio16 avail_flags;

                if (vhost_get_avail_flags(vq, &avail_flags)) {
                        vq_err(vq, "Failed to get flags");
                        return true;
                }
                return !(avail_flags & cpu_to_vhost16(vq, VRING_AVAIL_F_NO_INTERRUPT));
        }

        /* Record the new signalled position before looking at the old one. */
        prev = vq->signalled_used;
        prev_valid = vq->signalled_used_valid;
        cur = vq->last_used_idx;
        vq->signalled_used = cur;
        vq->signalled_used_valid = true;

        if (unlikely(!prev_valid))
                return true;

        if (vhost_get_used_event(vq, &used_event)) {
                vq_err(vq, "Failed to get used event idx");
                return true;
        }
        return vring_need_event(vhost16_to_cpu(vq, used_event), cur, prev);
}

/* This actually signals the guest, using eventfd: when a call eventfd is
 * set and vhost_notify() says a notification is due, signal it. */
void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
        if (!vq->call_ctx.ctx)
                return;

        /* Signal the Guest tell them we used something up. */
        if (vhost_notify(dev, vq))
                eventfd_signal(vq->call_ctx.ctx);
}
EXPORT_SYMBOL_GPL(vhost_signal);

/* And here's the combo meal deal.  Supersize me!
 *
 * Calls vhost_add_used() for one buffer (@head, @len) and then
 * vhost_signal().  The return value of vhost_add_used() is not checked. */
void vhost_add_used_and_signal(struct vhost_dev *dev,
                               struct vhost_virtqueue *vq,
                               unsigned int head, int len)
{
        vhost_add_used(vq, head, len);
        vhost_signal(dev, vq);
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);

/* Multi-buffer version of vhost_add_used_and_signal(): calls
 * vhost_add_used_n() for @count elements of @heads and then vhost_signal().
 * The return value of vhost_add_used_n() is not checked. */
void vhost_add_used_and_signal_n(struct vhost_dev *dev,
                                 struct vhost_virtqueue *vq,
                                 struct vring_used_elem *heads, unsigned count)
{
        vhost_add_used_n(vq, heads, count);
        vhost_signal(dev, vq);
}
EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);

/* Return true if we're sure that the available ring is empty.  The cached
 * avail index is refreshed from the guest when it equals last_avail_idx; a
 * failure to read it is reported as "not empty". */
bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
        __virtio16 idx;

        if (vq->avail_idx != vq->last_avail_idx)
                return false;

        if (unlikely(vhost_get_avail_idx(vq, &idx)))
                return false;

        vq->avail_idx = vhost16_to_cpu(vq, idx);
        if (vq->avail_idx == vq->last_avail_idx)
                return true;

        /* Since we have updated avail_idx, the following
         * call to vhost_get_vq_desc() will read available
         * ring entries. Make sure that read happens after
         * the avail_idx read.
         */
        smp_rmb();
        return false;
}
EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);

/* OK, now we need to know about added descriptors.
 *
 * Clears VRING_USED_F_NO_NOTIFY, publishes the change (avail event when
 * VIRTIO_RING_F_EVENT_IDX is negotiated, used flags otherwise) and re-reads
 * the avail index.  Returns true when the avail index then differs from
 * last_avail_idx; false when it does not, when notifications were already
 * enabled, or on any access failure. */
bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
        __virtio16 idx;
        int err;

        if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
                return false;
        vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;

        if (vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX)) {
                err = vhost_update_avail_event(vq);
                if (err) {
                        vq_err(vq, "Failed to update avail event index at %p: %d\n",
                               vhost_avail_event(vq), err);
                        return false;
                }
        } else {
                err = vhost_update_used_flags(vq);
                if (err) {
                        vq_err(vq, "Failed to enable notification at %p: %d\n",
                               &vq->used->flags, err);
                        return false;
                }
        }

        /* They could have slipped one in as we were doing that: make
         * sure it's written, then check again. */
        smp_mb();
        err = vhost_get_avail_idx(vq, &idx);
        if (err) {
                vq_err(vq, "Failed to check avail idx at %p: %d\n",
                       &vq->avail->idx, err);
                return false;
        }

        vq->avail_idx = vhost16_to_cpu(vq, idx);
        if (vq->avail_idx == vq->last_avail_idx)
                return false;

        /* Since we have updated avail_idx, the following
         * call to vhost_get_vq_desc() will read available
         * ring entries. Make sure that read happens after
         * the avail_idx read.
         */
        smp_rmb();
        return true;
}
EXPORT_SYMBOL_GPL(vhost_enable_notify);

/* We don't need to be notified again.  Sets VRING_USED_F_NO_NOTIFY and,
 * unless VIRTIO_RING_F_EVENT_IDX is negotiated, writes the used flags out;
 * a failure to do so is only reported through vq_err(). */
void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
        int err;

        if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
                return;
        vq->used_flags |= VRING_USED_F_NO_NOTIFY;

        if (vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX))
                return;

        err = vhost_update_used_flags(vq);
        if (err)
                vq_err(vq, "Failed to disable notification at %p: %d\n",
                       &vq->used->flags, err);
}
EXPORT_SYMBOL_GPL(vhost_disable_notify);

/* Create a new message node of the given @type bound to @vq.  Returns NULL
 * when the allocation fails. */
struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
{
        struct vhost_msg_node *node;

        /* Make sure all padding within the structure is initialized. */
        node = kzalloc(sizeof(*node), GFP_KERNEL);
        if (node) {
                node->vq = vq;
                node->msg.type = type;
        }
        return node;
}
EXPORT_SYMBOL_GPL(vhost_new_msg);

/*
 * Append @node to the tail of the message list @head, then wake up waiters
 * on @dev->wait. The list is modified under @dev->iotlb_lock.
 */
void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head,
                       struct vhost_msg_node *node)
{
        spin_lock(&dev->iotlb_lock);
        list_add_tail(&node->node, head);
        spin_unlock(&dev->iotlb_lock);

        /* Wake-up is issued after the lock has been dropped. */
        wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
}
EXPORT_SYMBOL_GPL(vhost_enqueue_msg);

/*
 * Detach and return the first message queued on @head, or NULL when the
 * list is empty. The list is inspected and modified under @dev->iotlb_lock.
 */
struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
                                         struct list_head *head)
{
        struct vhost_msg_node *first;

        spin_lock(&dev->iotlb_lock);
        if (list_empty(head)) {
                first = NULL;
        } else {
                first = list_first_entry(head, struct vhost_msg_node, node);
                list_del(&first->node);
        }
        spin_unlock(&dev->iotlb_lock);

        return first;
}
EXPORT_SYMBOL_GPL(vhost_dequeue_msg);

/*
 * Record @features as the acked backend features of every virtqueue of
 * @dev. The device mutex is held across the walk and each virtqueue is
 * updated under its own mutex.
 */
void vhost_set_backend_features(struct vhost_dev *dev, u64 features)
{
        int idx;

        mutex_lock(&dev->mutex);
        for (idx = 0; idx < dev->nvqs; ++idx) {
                struct vhost_virtqueue *cur = dev->vqs[idx];

                mutex_lock(&cur->mutex);
                cur->acked_backend_features = features;
                mutex_unlock(&cur->mutex);
        }
        mutex_unlock(&dev->mutex);
}
EXPORT_SYMBOL_GPL(vhost_set_backend_features);

/* Module load hook: performs no setup and always reports success. */
static int __init vhost_init(void)
{
        return 0;
}

/* Module unload hook: empty, this function tears nothing down. */
static void __exit vhost_exit(void)
{
}

/* Register vhost_init()/vhost_exit() as the module's load and unload hooks. */
module_init(vhost_init);
module_exit(vhost_exit);

/* Module metadata. */
MODULE_VERSION("0.0.1");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Michael S. Tsirkin");
MODULE_DESCRIPTION("Host kernel accelerator for virtio");















































































    5 
    5 
    5 
    5 
    5 








    5 



















    1 



    1 








    1 
    1 

    1 
    1 





































   14 
   14 
   14 

   11 












   15 
   14 
















   15 
   15 



   13 




   11 
   13 


   12 


   15 


   14 



   12 





   21 
   21 
































































































    6 
    6 
    6 

    6 
    6 
    6 
    6 


    6 







   12 

   12 
    5 



    3 
    4 
    3 



    4 
    4 

    4 


























   11 
   11 











    1 
   10 

    3 
   10 




   11 


















































   24 





   28 
   15 
   23 





   28 






   11 
    4 

   11 
   11 


   31 














   18 


   19 

   21 







   21 
   18 

   21 












    6 
    6 




    6 













































































































   11 
   12 


    9 
   11 

   10 

    7 









   11 



   10 



   10 


   12 








































































   10 
   12 

   11 


















































































































































































































   14 




   13 
   13 
   14 
















   13 











    5 





    5 



   11 
    5 
   13 





    5 























































    8 
    3 
    8 
















    2 


    2 
    2 

    2 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/admin-guide/sysctl/vm.rst.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>
#include <linux/buffer_head.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
/* NOTE(review): presumably the largest value page_cluster may take - confirm against its users. */
const int page_cluster_max = 31;

/* Protecting only lru_rotate.fbatch which requires disabling interrupts */
struct lru_rotate {
        /* Taken with local_lock_irqsave() while @fbatch is filled or drained. */
        local_lock_t lock;
        /* Folios queued by folio_rotate_reclaimable() for lru_move_tail_fn(). */
        struct folio_batch fbatch;
};
/* One rotation batch (and its lock) per CPU. */
static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

/*
 * The following folio batches are grouped together because they are protected
 * by disabling preemption (and interrupts remain enabled).
 */
struct cpu_fbatches {
        /* Held via local_lock() while one of the batches below is filled. */
        local_lock_t lock;
        /* Filled by folio_add_lru(); drained with lru_add_fn(). */
        struct folio_batch lru_add;
        /* Filled by deactivate_file_folio(); drained with lru_deactivate_file_fn(). */
        struct folio_batch lru_deactivate_file;
        /* Filled by folio_deactivate(); drained with lru_deactivate_fn(). */
        struct folio_batch lru_deactivate;
        /* Filled by folio_mark_lazyfree(); drained with lru_lazyfree_fn(). */
        struct folio_batch lru_lazyfree;
#ifdef CONFIG_SMP
        /* Filled by folio_activate(); drained with folio_activate_fn(). */
        struct folio_batch activate;
#endif
};
/* One set of batches (and its lock) per CPU. */
static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

/*
 * Take @folio off its LRU list (if it is on one) and clear a leftover
 * Mlocked flag. When the LRU branch runs, *@lruvecp is (re)locked with the
 * saved interrupt state stored in *@flagsp; unlocking is left to the caller.
 */
static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
                unsigned long *flagsp)
{
        long nr;

        if (folio_test_lru(folio)) {
                folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);
                lruvec_del_folio(*lruvecp, folio);
                __folio_clear_lru_flags(folio);
        }

        /*
         * Mlocked may still be found set here in rare cases: truncation
         * or holepunching racing with munlock after VM_LOCKED was cleared.
         * That is not a problem in itself, unless "unevictable_pgs_cleared"
         * appears worryingly large.
         */
        if (!unlikely(folio_test_mlocked(folio)))
                return;

        nr = folio_nr_pages(folio);

        __folio_clear_mlocked(folio);
        zone_stat_mod_folio(folio, NR_MLOCK, -nr);
        count_vm_events(UNEVICTABLE_PGCLEARED, nr);
}

/*
 * This path almost never happens for VM activity - pages are normally freed
 * in batches.  But it gets used by networking - and for compound pages.
 */
static void page_cache_release(struct folio *folio)
{
        unsigned long irq_flags;
        struct lruvec *locked = NULL;

        __page_cache_release(folio, &locked, &irq_flags);

        /* Only set when the helper actually took an lruvec lock. */
        if (locked)
                unlock_page_lruvec_irqrestore(locked, irq_flags);
}

/*
 * Free @folio. Zone-device and hugetlb folios are handed to their own free
 * routines; everything else is released from the LRU, uncharged from its
 * memcg and returned through free_unref_page().
 */
void __folio_put(struct folio *folio)
{
        if (unlikely(folio_is_zone_device(folio))) {
                free_zone_device_folio(folio);
                return;
        }

        if (folio_test_hugetlb(folio)) {
                free_huge_folio(folio);
                return;
        }

        page_cache_release(folio);

        if (folio_test_large(folio)) {
                if (folio_test_large_rmappable(folio))
                        folio_undo_large_rmappable(folio);
        }

        mem_cgroup_uncharge(folio);
        free_unref_page(&folio->page, folio_order(folio));
}
EXPORT_SYMBOL(__folio_put);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.
 */
void put_pages_list(struct list_head *pages)
{
        struct folio_batch fbatch;
        struct folio *folio, *next;

        folio_batch_init(&fbatch);
        list_for_each_entry_safe(folio, next, pages, lru) {
                if (!folio_put_testzero(folio))
                        continue;
                if (folio_test_hugetlb(folio)) {
                        free_huge_folio(folio);
                        continue;
                }
                /* LRU flag must be clear because it's passed using the lru */
                if (folio_batch_add(&fbatch, folio) > 0)
                        continue;
                free_unref_folios(&fbatch);
        }

        if (fbatch.nr)
                free_unref_folios(&fbatch);
        INIT_LIST_HEAD(pages);
}
EXPORT_SYMBOL(put_pages_list);

/* Per-folio callback that folio_batch_move_lru() invokes for each batched folio. */
typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);

/*
 * Move callback for the lru_add batch: add @folio to @lruvec.
 *
 * The Unevictable flag is cleared up front and the folio's evictability is
 * then re-evaluated; the UNEVICTABLE_PGRESCUED / UNEVICTABLE_PGCULLED events
 * are counted only when the state actually changed.
 */
static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
{
        int was_unevictable = folio_test_clear_unevictable(folio);
        long nr_pages = folio_nr_pages(folio);

        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        /*
         * Is an smp_mb__after_atomic() still required here, before
         * folio_evictable() tests the mlocked flag, to rule out the possibility
         * of stranding an evictable folio on an unevictable LRU?  I think
         * not, because __munlock_folio() only clears the mlocked flag
         * while the LRU lock is held.
         *
         * (That is not true of __page_cache_release(), and not necessarily
         * true of folios_put(): but those only clear the mlocked flag after
         * folio_put_testzero() has excluded any other users of the folio.)
         */
        if (folio_evictable(folio)) {
                /* Was unevictable, is evictable now: count it as rescued. */
                if (was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
        } else {
                folio_clear_active(folio);
                folio_set_unevictable(folio);
                /*
                 * folio->mlock_count = !!folio_test_mlocked(folio)?
                 * But that leaves __mlock_folio() in doubt whether another
                 * actor has already counted the mlock or not.  Err on the
                 * safe side, underestimate, let page reclaim fix it, rather
                 * than leaving a page on the unevictable LRU indefinitely.
                 */
                folio->mlock_count = 0;
                /* Newly unevictable: count it as culled. */
                if (!was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
        }

        /* Link the folio into the lruvec, then emit the insertion tracepoint. */
        lruvec_add_folio(lruvec, folio);
        trace_mm_lru_insertion(folio);
}

/*
 * Apply @move_fn to every folio in @fbatch, then drop the batch's folio
 * references with folios_put().
 */
static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
{
        unsigned long irq_flags = 0;
        struct lruvec *locked = NULL;
        int idx;

        for (idx = 0; idx < folio_batch_count(fbatch); idx++) {
                struct folio *cur = fbatch->folios[idx];

                /*
                 * block memcg migration while the folio moves between lru:
                 * for every callback except lru_add_fn the LRU flag has to
                 * be taken first, and the folio is skipped when that fails.
                 */
                if (move_fn == lru_add_fn || folio_test_clear_lru(cur)) {
                        folio_lruvec_relock_irqsave(cur, &locked, &irq_flags);
                        move_fn(locked, cur);

                        folio_set_lru(cur);
                }
        }

        /* Still NULL when no folio was processed. */
        if (locked)
                unlock_page_lruvec_irqrestore(locked, irq_flags);
        folios_put(fbatch);
}

/*
 * Queue @folio on @fbatch. The batch is drained through @move_fn right
 * away when folio_batch_add() returns zero, when @folio is a large folio,
 * or when lru_cache_disabled() reports true.
 */
static void folio_batch_add_and_move(struct folio_batch *fbatch,
                struct folio *folio, move_fn_t move_fn)
{
        if (!folio_batch_add(fbatch, folio) || folio_test_large(folio) ||
            lru_cache_disabled())
                folio_batch_move_lru(fbatch, move_fn);
}

/* Move callback: re-add @folio, with Active cleared, at the tail of @lruvec. */
static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio)
{
        /* Unevictable folios are left untouched. */
        if (folio_test_unevictable(folio))
                return;

        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        lruvec_add_folio_tail(lruvec, folio);
        __count_vm_events(PGROTATED, folio_nr_pages(folio));
}

/*
 * Writeback is about to end against a folio which has been marked for
 * immediate reclaim.  If it still appears to be reclaimable, move it
 * to the tail of the inactive list.
 *
 * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races.
 */
void folio_rotate_reclaimable(struct folio *folio)
{
        struct folio_batch *rotate_batch;
        unsigned long irq_flags;

        /* Only unlocked, clean, evictable folios that are on the LRU qualify. */
        if (folio_test_locked(folio) || folio_test_dirty(folio) ||
            folio_test_unevictable(folio) || !folio_test_lru(folio))
                return;

        /* The batch holds its own reference until it is drained. */
        folio_get(folio);

        local_lock_irqsave(&lru_rotate.lock, irq_flags);
        rotate_batch = this_cpu_ptr(&lru_rotate.fbatch);
        folio_batch_add_and_move(rotate_batch, folio, lru_move_tail_fn);
        local_unlock_irqrestore(&lru_rotate.lock, irq_flags);
}

/*
 * Charge a cost event to @lruvec and to every lruvec reached by following
 * parent_lruvec() from it.
 * @lruvec: lruvec at which the cost was incurred
 * @file: true to charge file_cost, false to charge anon_cost
 * @nr_io: number of IO events; each is weighted by SWAP_CLUSTER_MAX
 * @nr_rotated: number of rotations; each is weighted by 1
 */
void lru_note_cost(struct lruvec *lruvec, bool file,
                   unsigned int nr_io, unsigned int nr_rotated)
{
        unsigned long cost;

        /*
         * Reflect the relative cost of incurring IO and spending CPU
         * time on rotations. This doesn't attempt to make a precise
         * comparison, it just says: if reloads are about comparable
         * between the LRU lists, or rotations are overwhelmingly
         * different between them, adjust scan balance for CPU work.
         */
        cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;

        do {
                unsigned long lrusize;

                /*
                 * Hold lruvec->lru_lock is safe here, since
                 * 1) The pinned lruvec in reclaim, or
                 * 2) From a pre-LRU page during refault (which also holds the
                 *    rcu lock, so would be safe even if the page was on the LRU
                 *    and could move simultaneously to a new lruvec).
                 */
                spin_lock_irq(&lruvec->lru_lock);
                /* Record cost event */
                if (file)
                        lruvec->file_cost += cost;
                else
                        lruvec->anon_cost += cost;

                /*
                 * Decay previous events
                 *
                 * Because workloads change over time (and to avoid
                 * overflow) we keep these statistics as a floating
                 * average, which ends up weighing recent refaults
                 * more than old ones.
                 */
                lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
                          lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
                          lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
                          lruvec_page_state(lruvec, NR_ACTIVE_FILE);

                /* Halve both counters once their sum exceeds a quarter of the LRU size. */
                if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
                        lruvec->file_cost /= 2;
                        lruvec->anon_cost /= 2;
                }
                spin_unlock_irq(&lruvec->lru_lock);
        } while ((lruvec = parent_lruvec(lruvec)));
}

/* Charge the refault of @folio to its lruvec as IO cost, with no rotations. */
void lru_note_cost_refault(struct folio *folio)
{
        struct lruvec *lruvec = folio_lruvec(folio);
        long nr = folio_nr_pages(folio);

        lru_note_cost(lruvec, folio_is_file_lru(folio), nr, 0);
}

/* Move callback: set Active on @folio and re-add it to @lruvec. */
static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)
{
        long nr;

        /* Already active, or unevictable: leave the folio alone. */
        if (folio_test_active(folio) || folio_test_unevictable(folio))
                return;

        nr = folio_nr_pages(folio);

        lruvec_del_folio(lruvec, folio);
        folio_set_active(folio);
        lruvec_add_folio(lruvec, folio);
        trace_mm_lru_activate(folio);

        __count_vm_events(PGACTIVATE, nr);
        __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr);
}

#ifdef CONFIG_SMP
/* Drain the activate batch of @cpu through folio_activate_fn(). */
static void folio_activate_drain(int cpu)
{
        struct folio_batch *activate_batch = &per_cpu(cpu_fbatches.activate, cpu);

        if (!folio_batch_count(activate_batch))
                return;

        folio_batch_move_lru(activate_batch, folio_activate_fn);
}

/* SMP variant: queue @folio on this CPU's activate batch. */
void folio_activate(struct folio *folio)
{
        struct folio_batch *activate_batch;

        /* Only inactive, evictable folios that are on the LRU are queued. */
        if (!folio_test_lru(folio) || folio_test_active(folio) ||
            folio_test_unevictable(folio))
                return;

        folio_get(folio);

        local_lock(&cpu_fbatches.lock);
        activate_batch = this_cpu_ptr(&cpu_fbatches.activate);
        folio_batch_add_and_move(activate_batch, folio, folio_activate_fn);
        local_unlock(&cpu_fbatches.lock);
}

#else
/* !CONFIG_SMP stub: this build's folio_activate() does not batch, so there is nothing to drain. */
static inline void folio_activate_drain(int cpu)
{
}

/* !CONFIG_SMP variant: activate @folio immediately, without batching. */
void folio_activate(struct folio *folio)
{
        struct lruvec *locked;

        /* Proceed only if the LRU flag was set; it is restored at the end. */
        if (!folio_test_clear_lru(folio))
                return;

        locked = folio_lruvec_lock_irq(folio);
        folio_activate_fn(locked, folio);
        unlock_page_lruvec_irq(locked);
        folio_set_lru(folio);
}
#endif

/*
 * Mark @folio active if it is currently sitting in this CPU's lru_add
 * batch; do nothing otherwise.
 */
static void __lru_cache_activate_folio(struct folio *folio)
{
        struct folio_batch *add_batch;
        int pos;

        local_lock(&cpu_fbatches.lock);
        add_batch = this_cpu_ptr(&cpu_fbatches.lru_add);

        /*
         * Walk from the newest entry to the oldest, on the optimistic
         * assumption that the folio being activated has just been added
         * to this batch. Only the local batch is examined: a !LRU folio
         * could be in the process of being released, reclaimed, migrated
         * or on a remote batch that is currently being drained.
         * Furthermore, marking a remote batch's folio active potentially
         * hits a race where a folio is marked active just after it is
         * added to the inactive list causing accounting errors and BUG_ON
         * checks to trigger.
         */
        pos = folio_batch_count(add_batch);
        while (pos-- > 0) {
                if (add_batch->folios[pos] == folio) {
                        folio_set_active(folio);
                        break;
                }
        }

        local_unlock(&cpu_fbatches.lock);
}

#ifdef CONFIG_LRU_GEN
/*
 * Record one more access to @folio. Successive calls set the Referenced
 * flag, then the Workingset flag, and after that increment the field
 * selected by LRU_REFS_MASK in folio->flags until it is saturated.
 * Unevictable folios are ignored.
 */
static void folio_inc_refs(struct folio *folio)
{
        unsigned long new_flags, old_flags = READ_ONCE(folio->flags);

        if (folio_test_unevictable(folio))
                return;

        if (!folio_test_referenced(folio)) {
                folio_set_referenced(folio);
                return;
        }

        if (!folio_test_workingset(folio)) {
                folio_set_workingset(folio);
                return;
        }

        /* see the comment on MAX_NR_TIERS */
        do {
                /* Stop once every bit of the refs field is already set. */
                new_flags = old_flags & LRU_REFS_MASK;
                if (new_flags == LRU_REFS_MASK)
                        break;

                /* Add one to the refs field and keep all other flag bits. */
                new_flags += BIT(LRU_REFS_PGOFF);
                new_flags |= old_flags & ~LRU_REFS_MASK;
        } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
}
#else
/* !CONFIG_LRU_GEN stub: does nothing. */
static void folio_inc_refs(struct folio *folio)
{
}
#endif /* CONFIG_LRU_GEN */

/**
 * folio_mark_accessed - Mark a folio as having seen activity.
 * @folio: The folio to mark.
 *
 * This function will perform one of the following transitions:
 *
 * * inactive,unreferenced        ->        inactive,referenced
 * * inactive,referenced        ->        active,unreferenced
 * * active,unreferenced        ->        active,referenced
 *
 * When a newly allocated folio is not yet visible, so safe for non-atomic ops,
 * __folio_set_referenced() may be substituted for folio_mark_accessed().
 */
void folio_mark_accessed(struct folio *folio)
{
        /* With lru_gen enabled, accesses are tracked by folio_inc_refs() only. */
        if (lru_gen_enabled()) {
                folio_inc_refs(folio);
                return;
        }

        if (!folio_test_referenced(folio)) {
                /* First access seen: just remember it. */
                folio_set_referenced(folio);
        } else if (!folio_test_unevictable(folio) &&
                   !folio_test_active(folio)) {
                /*
                 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
                 * this list is never rotated or maintained, so marking an
                 * unevictable page accessed has no effect: such folios never
                 * reach this branch.
                 *
                 * If the folio is on the LRU, queue it for activation via
                 * cpu_fbatches.activate. Otherwise, assume the folio is in a
                 * folio_batch, mark it active and it'll be moved to the active
                 * LRU on the next drain.
                 */
                if (!folio_test_lru(folio))
                        __lru_cache_activate_folio(folio);
                else
                        folio_activate(folio);

                folio_clear_referenced(folio);
                workingset_activation(folio);
        }

        if (folio_test_idle(folio))
                folio_clear_idle(folio);
}
EXPORT_SYMBOL(folio_mark_accessed);

/**
 * folio_add_lru - Add a folio to an LRU list.
 * @folio: The folio to be added to the LRU.
 *
 * Queue the folio for addition to the LRU. The decision on whether
 * to add the folio to the [in]active [file|anon] list is deferred until the
 * folio_batch is drained. This gives the caller of folio_add_lru() a chance
 * to have the folio added to the active list using folio_mark_accessed().
 */
void folio_add_lru(struct folio *folio)
{
        struct folio_batch *add_batch;

        VM_BUG_ON_FOLIO(folio_test_active(folio) &&
                        folio_test_unevictable(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        /* see the comment in lru_gen_add_folio() */
        if (lru_gen_enabled() && !folio_test_unevictable(folio)) {
                if (lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
                        folio_set_active(folio);
        }

        /* Reference for the batch; folio_batch_move_lru() ends with folios_put(). */
        folio_get(folio);

        local_lock(&cpu_fbatches.lock);
        add_batch = this_cpu_ptr(&cpu_fbatches.lru_add);
        folio_batch_add_and_move(add_batch, folio, lru_add_fn);
        local_unlock(&cpu_fbatches.lock);
}
EXPORT_SYMBOL(folio_add_lru);

/**
 * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA.
 * @folio: The folio to be added to the LRU.
 * @vma: VMA in which the folio is mapped.
 *
 * If the VMA is mlocked, @folio is added to the unevictable list.
 * Otherwise, it is treated the same way as folio_add_lru().
 */
void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
{
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        /* VM_LOCKED set without VM_SPECIAL: take the mlock path. */
        if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) {
                mlock_new_folio(folio);
                return;
        }

        folio_add_lru(folio);
}

/*
 * If the folio cannot be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the folio isn't mapped and dirty/writeback, the folio
 * could be reclaimed asap using the reclaim flag.
 *
 * 1. active, mapped folio -> none
 * 2. active, dirty/writeback folio -> inactive, head, reclaim
 * 3. inactive, mapped folio -> none
 * 4. inactive, dirty/writeback folio -> inactive, head, reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In 4, it moves to the head of the inactive list so the folio is
 * written out by flusher threads as this is much more efficient
 * than the single-page writeout from reclaim.
 */
static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)
{
        bool was_active = folio_test_active(folio);
        long nr = folio_nr_pages(folio);

        /* Leave unevictable folios, and folios some process still maps, alone. */
        if (folio_test_unevictable(folio) || folio_mapped(folio))
                return;

        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        folio_clear_referenced(folio);

        if (!folio_test_writeback(folio) && !folio_test_dirty(folio)) {
                /*
                 * The folio's writeback ended while it was in the batch.
                 * We move that folio to the tail of the inactive list.
                 */
                lruvec_add_folio_tail(lruvec, folio);
                __count_vm_events(PGROTATED, nr);
        } else {
                /*
                 * Setting the reclaim flag could race with
                 * folio_end_writeback() and confuse readahead.  But the
                 * race window is _really_ small and  it's not a critical
                 * problem.
                 */
                lruvec_add_folio(lruvec, folio);
                folio_set_reclaim(folio);
        }

        /* Deactivation is only counted for folios that were active on entry. */
        if (!was_active)
                return;

        __count_vm_events(PGDEACTIVATE, nr);
        __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr);
}

/* Move callback: clear Active and Referenced on @folio and re-add it to @lruvec. */
static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
{
        long nr;

        if (folio_test_unevictable(folio))
                return;

        /* Inactive folios are only processed when lru_gen is enabled. */
        if (!folio_test_active(folio) && !lru_gen_enabled())
                return;

        nr = folio_nr_pages(folio);

        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        folio_clear_referenced(folio);
        lruvec_add_folio(lruvec, folio);

        __count_vm_events(PGDEACTIVATE, nr);
        __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr);
}

/* Move callback: turn an eligible anonymous @folio into a lazyfree folio. */
static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio)
{
        long nr;

        /* Eligible: anonymous, swap-backed, not in swap cache, evictable. */
        if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) ||
            folio_test_swapcache(folio) || folio_test_unevictable(folio))
                return;

        nr = folio_nr_pages(folio);

        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        folio_clear_referenced(folio);
        /*
         * Lazyfree folios are clean anonymous folios.  They have
         * the swapbacked flag cleared, to distinguish them from normal
         * anonymous folios
         */
        folio_clear_swapbacked(folio);
        lruvec_add_folio(lruvec, folio);

        __count_vm_events(PGLAZYFREE, nr);
        __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr);
}

/*
 * Drain pages out of the cpu's folio_batch.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
void lru_add_drain_cpu(int cpu)
{
        struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
        struct folio_batch *fbatch = &fbatches->lru_add;

        /* The lru_add batch is drained first. */
        if (folio_batch_count(fbatch))
                folio_batch_move_lru(fbatch, lru_add_fn);

        /* The rotate batch lives in a separate per-CPU structure with its own lock. */
        fbatch = &per_cpu(lru_rotate.fbatch, cpu);
        /* Disabling interrupts below acts as a compiler barrier. */
        if (data_race(folio_batch_count(fbatch))) {
                unsigned long flags;

                /* No harm done if a racing interrupt already did this */
                local_lock_irqsave(&lru_rotate.lock, flags);
                folio_batch_move_lru(fbatch, lru_move_tail_fn);
                local_unlock_irqrestore(&lru_rotate.lock, flags);
        }

        /* Remaining batches, each drained with its matching move callback. */
        fbatch = &fbatches->lru_deactivate_file;
        if (folio_batch_count(fbatch))
                folio_batch_move_lru(fbatch, lru_deactivate_file_fn);

        fbatch = &fbatches->lru_deactivate;
        if (folio_batch_count(fbatch))
                folio_batch_move_lru(fbatch, lru_deactivate_fn);

        fbatch = &fbatches->lru_lazyfree;
        if (folio_batch_count(fbatch))
                folio_batch_move_lru(fbatch, lru_lazyfree_fn);

        /* Empty stub when CONFIG_SMP is not set. */
        folio_activate_drain(cpu);
}

/**
 * deactivate_file_folio() - Deactivate a file folio.
 * @folio: Folio to deactivate.
 *
 * Hint to the VM that @folio is a good candidate for reclaim, e.g. because
 * invalidating it failed since it was dirty or under writeback.  The folio
 * is queued on this CPU's lru_deactivate_file batch; the batch holds its
 * own reference, taken here.
 *
 * Context: Caller holds a reference on the folio.
 */
void deactivate_file_folio(struct folio *folio)
{
        struct folio_batch *batch;

        /* Reclaim is not sped up by deactivating an unevictable folio. */
        if (folio_test_unevictable(folio))
                return;

        /* Reference owned by the per-CPU batch. */
        folio_get(folio);

        local_lock(&cpu_fbatches.lock);
        batch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file);
        folio_batch_add_and_move(batch, folio, lru_deactivate_file_fn);
        local_unlock(&cpu_fbatches.lock);
}

/*
 * folio_deactivate - deactivate a folio
 * @folio: folio to deactivate
 *
 * Queue @folio for a move to the inactive list in order to accelerate its
 * reclaim.  Nothing is done unless @folio is on the LRU, is not
 * unevictable, and is either active or the multi-gen LRU is enabled.
 */
void folio_deactivate(struct folio *folio)
{
        struct folio_batch *batch;

        if (!folio_test_lru(folio) || folio_test_unevictable(folio))
                return;
        if (!folio_test_active(folio) && !lru_gen_enabled())
                return;

        /* Reference owned by the per-CPU batch. */
        folio_get(folio);

        local_lock(&cpu_fbatches.lock);
        batch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
        folio_batch_add_and_move(batch, folio, lru_deactivate_fn);
        local_unlock(&cpu_fbatches.lock);
}

/**
 * folio_mark_lazyfree - make an anon folio lazyfree
 * @folio: folio to mark lazyfree
 *
 * Queue @folio for a move to the inactive file list in order to accelerate
 * its reclaim.  Only folios that are on the LRU, anonymous, swap-backed,
 * not in the swap cache and not unevictable are affected; anything else is
 * left untouched.
 */
void folio_mark_lazyfree(struct folio *folio)
{
        struct folio_batch *batch;

        if (!folio_test_lru(folio) || !folio_test_anon(folio))
                return;
        if (!folio_test_swapbacked(folio) || folio_test_swapcache(folio))
                return;
        if (folio_test_unevictable(folio))
                return;

        /* Reference owned by the per-CPU batch. */
        folio_get(folio);

        local_lock(&cpu_fbatches.lock);
        batch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree);
        folio_batch_add_and_move(batch, folio, lru_lazyfree_fn);
        local_unlock(&cpu_fbatches.lock);
}

/*
 * Drain the per-CPU folio batches of the CPU we are running on, then call
 * mlock_drain_local().
 */
void lru_add_drain(void)
{
        int this_cpu;

        local_lock(&cpu_fbatches.lock);
        /* Read the CPU id only once the local lock is held. */
        this_cpu = smp_processor_id();
        lru_add_drain_cpu(this_cpu);
        local_unlock(&cpu_fbatches.lock);

        mlock_drain_local();
}

/*
 * In the SMP case this runs from per-cpu workqueue context, so
 * lru_add_drain_cpu() and invalidate_bh_lrus_cpu() execute on the same
 * CPU.  The !SMP case is not a problem either: there is only one core and
 * the locks disable preemption.
 */
static void lru_add_and_bh_lrus_drain(void)
{
        int this_cpu;

        local_lock(&cpu_fbatches.lock);
        /* Read the CPU id only once the local lock is held. */
        this_cpu = smp_processor_id();
        lru_add_drain_cpu(this_cpu);
        local_unlock(&cpu_fbatches.lock);

        invalidate_bh_lrus_cpu();
        mlock_drain_local();
}

/*
 * Drain this CPU's folio batches and, still under the local lock, call
 * drain_local_pages() for @zone; mlock_drain_local() runs after the lock
 * has been dropped.
 */
void lru_add_drain_cpu_zone(struct zone *zone)
{
        int this_cpu;

        local_lock(&cpu_fbatches.lock);
        /* Read the CPU id only once the local lock is held. */
        this_cpu = smp_processor_id();
        lru_add_drain_cpu(this_cpu);
        drain_local_pages(zone);
        local_unlock(&cpu_fbatches.lock);

        mlock_drain_local();
}

#ifdef CONFIG_SMP

/*
 * One work item per CPU.  __lru_add_drain_all() initialises it with
 * lru_add_drain_per_cpu() and queues it on the CPU it belongs to.
 */
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

/*
 * Work callback installed by __lru_add_drain_all(): runs
 * lru_add_and_bh_lrus_drain() on the CPU the work was queued on.
 * The work_struct argument is not used.
 */
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
        lru_add_and_bh_lrus_drain();
}

static bool cpu_needs_drain(unsigned int cpu)
{
        struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);

        /* Check these in order of likelihood that they're not zero */
        return folio_batch_count(&fbatches->lru_add) ||
                data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) ||
                folio_batch_count(&fbatches->lru_deactivate_file) ||
                folio_batch_count(&fbatches->lru_deactivate) ||
                folio_batch_count(&fbatches->lru_lazyfree) ||
                folio_batch_count(&fbatches->activate) ||
                need_mlock_drain(cpu) ||
                has_bh_in_lru(cpu, NULL);
}

/*
 * Queue a drain work item on every online CPU for which cpu_needs_drain()
 * is true and wait for all of the queued items to finish.
 *
 * @force_all_cpus: when true, the early exit at (C) is never taken, i.e.
 * this call always proceeds to the per-CPU scan even if another caller
 * has already advanced the drain generation.
 *
 * Doesn't need any cpu hotplug locking because we do rely on per-cpu
 * kworkers being shut down before our page_alloc_cpu_dead callback is
 * executed on the offlined cpu.
 * Calling this function with cpu hotplug locks held can actually lead
 * to obscure indirect dependencies via WQ context.
 */
static inline void __lru_add_drain_all(bool force_all_cpus)
{
        /*
         * lru_drain_gen - Global pages generation number
         *
         * (A) Definition: global lru_drain_gen = x implies that all generations
         *     0 < n <= x are already *scheduled* for draining.
         *
         * This is an optimization for the highly-contended use case where a
         * user space workload keeps constantly generating a flow of pages for
         * each CPU.
         */
        static unsigned int lru_drain_gen;
        /* Set of CPUs with a queued work item; only touched with 'lock' held. */
        static struct cpumask has_work;
        /* Serialises callers; also protects has_work and the per-CPU works. */
        static DEFINE_MUTEX(lock);
        unsigned cpu, this_gen;

        /*
         * Make sure nobody triggers this path before mm_percpu_wq is fully
         * initialized.
         */
        if (WARN_ON(!mm_percpu_wq))
                return;

        /*
         * Guarantee folio_batch counter stores visible by this CPU
         * are visible to other CPUs before loading the current drain
         * generation.
         */
        smp_mb();

        /*
         * (B) Locally cache global LRU draining generation number
         *
         * The acquire load ensures that the counter is loaded before the mutex
         * is taken. It pairs with smp_mb() inside the mutex critical section
         * at (D).
         */
        this_gen = smp_load_acquire(&lru_drain_gen);

        mutex_lock(&lock);

        /*
         * (C) Exit the draining operation if a newer generation, from another
         * lru_add_drain_all(), was already scheduled for draining. Check (A).
         * Callers passing force_all_cpus never take this shortcut.
         */
        if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
                goto done;

        /*
         * (D) Increment global generation number
         *
         * Pairs with smp_load_acquire() at (B), outside of the critical
         * section. Use a full memory barrier to guarantee that the
         * new global drain generation number is stored before loading
         * folio_batch counters.
         *
         * This pairing must be done here, before the for_each_online_cpu loop
         * below which drains the page vectors.
         *
         * Let x, y, and z represent some system CPU numbers, where x < y < z.
         * Assume CPU #z is in the middle of the for_each_online_cpu loop
         * below and has already reached CPU #y's per-cpu data. CPU #x comes
         * along, adds some pages to its per-cpu vectors, then calls
         * lru_add_drain_all().
         *
         * If the paired barrier is done at any later step, e.g. after the
         * loop, CPU #x will just exit at (C) and miss flushing out all of its
         * added pages.
         */
        WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
        smp_mb();

        /* Queue a drain on each online CPU that reports pending work. */
        cpumask_clear(&has_work);
        for_each_online_cpu(cpu) {
                struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

                if (cpu_needs_drain(cpu)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        queue_work_on(cpu, mm_percpu_wq, work);
                        __cpumask_set_cpu(cpu, &has_work);
                }
        }

        /* Wait for every work item queued above to complete. */
        for_each_cpu(cpu, &has_work)
                flush_work(&per_cpu(lru_add_drain_work, cpu));

done:
        mutex_unlock(&lock);
}

/*
 * SMP variant: drain the CPUs that have pending work, but allow the call
 * to return early when another caller has already scheduled a newer drain
 * generation (force_all_cpus == false).
 */
void lru_add_drain_all(void)
{
        __lru_add_drain_all(false);
}
#else
/* !SMP variant: there is only one CPU, so draining it locally is enough. */
void lru_add_drain_all(void)
{
        lru_add_drain();
}
#endif /* CONFIG_SMP */

/* Starts at zero; lru_cache_disable() increments it. */
atomic_t lru_disable_count = ATOMIC_INIT(0);

/*
 * lru_cache_disable() needs to be called before we start compiling
 * a list of pages to be migrated using isolate_lru_page().
 * It raises lru_disable_count, waits for an expedited RCU grace period and
 * then drains the per-CPU LRU caches; the caches stay disabled on all CPUs
 * until lru_cache_enable() is called.
 *
 * Must be paired with a call to lru_cache_enable().
 */
void lru_cache_disable(void)
{
        atomic_inc(&lru_disable_count);
        /*
         * Readers of lru_disable_count are protected by either disabling
         * preemption or rcu_read_lock:
         *
         * preempt_disable, local_irq_disable  [bh_lru_lock()]
         * rcu_read_lock                       [rt_spin_lock CONFIG_PREEMPT_RT]
         * preempt_disable                       [local_lock !CONFIG_PREEMPT_RT]
         *
         * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on
         * preempt_disable() regions of code. So any CPU which sees
         * lru_disable_count = 0 will have exited the critical
         * section when synchronize_rcu_expedited() returns.
         */
        synchronize_rcu_expedited();
        /* Force the drain: never take the "already scheduled" early exit. */
#ifdef CONFIG_SMP
        __lru_add_drain_all(true);
#else
        lru_add_and_bh_lrus_drain();
#endif
}

/**
 * folios_put_refs - Reduce the reference count on a batch of folios.
 * @folios: The folios.
 * @refs: The number of refs to subtract from each folio; entry i applies
 *        to @folios->folios[i].  May be NULL.
 *
 * Like folio_put(), but for a batch of folios.  This is more efficient
 * than writing the loop yourself as it will optimise the locks which need
 * to be taken if the folios are freed.  The folios batch is returned
 * empty and ready to be reused for another batch; there is no need
 * to reinitialise it.  If @refs is NULL, we subtract one from each
 * folio refcount.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
{
        /* i walks the input; j counts folios kept for the final bulk free. */
        int i, j;
        /* Currently held lruvec lock, if any; managed by __page_cache_release(). */
        struct lruvec *lruvec = NULL;
        unsigned long flags = 0;

        for (i = 0, j = 0; i < folios->nr; i++) {
                struct folio *folio = folios->folios[i];
                unsigned int nr_refs = refs ? refs[i] : 1;

                /* The huge zero folio is skipped without touching its refcount. */
                if (is_huge_zero_folio(folio))
                        continue;

                /* Zone-device folios are released here, with no lruvec lock held. */
                if (folio_is_zone_device(folio)) {
                        if (lruvec) {
                                unlock_page_lruvec_irqrestore(lruvec, flags);
                                lruvec = NULL;
                        }
                        if (put_devmap_managed_folio_refs(folio, nr_refs))
                                continue;
                        if (folio_ref_sub_and_test(folio, nr_refs))
                                free_zone_device_folio(folio);
                        continue;
                }

                /* Other references remain: nothing to free for this folio. */
                if (!folio_ref_sub_and_test(folio, nr_refs))
                        continue;

                /* hugetlb has its own memcg */
                if (folio_test_hugetlb(folio)) {
                        /* Drop any held lruvec lock before freeing the huge folio. */
                        if (lruvec) {
                                unlock_page_lruvec_irqrestore(lruvec, flags);
                                lruvec = NULL;
                        }
                        free_huge_folio(folio);
                        continue;
                }
                if (folio_test_large(folio) &&
                    folio_test_large_rmappable(folio))
                        folio_undo_large_rmappable(folio);

                __page_cache_release(folio, &lruvec, &flags);

                /* Compact the folios that are to be freed to the front. */
                if (j != i)
                        folios->folios[j] = folio;
                j++;
        }
        if (lruvec)
                unlock_page_lruvec_irqrestore(lruvec, flags);
        /* Nothing left to free in bulk: just hand back an empty batch. */
        if (!j) {
                folio_batch_reinit(folios);
                return;
        }

        folios->nr = j;
        mem_cgroup_uncharge_folios(folios);
        free_unref_folios(folios);
}
EXPORT_SYMBOL(folios_put_refs);

/**
 * release_pages - batched put_page()
 * @arg: array of pages to release
 * @nr: number of entries in @arg
 *
 * Decrement the reference count on all the pages in @arg.  If it
 * fell to zero, remove the page from the LRU and free it.
 *
 * Note that the argument can be an array of pages, encoded pages,
 * or folio pointers. We ignore any encoded bits, and turn any of
 * them into just a folio that gets free'd.
 *
 * An entry flagged with ENCODED_PAGE_BIT_NR_PAGES_NEXT is followed by a
 * second entry that carries the number of references to drop for it; that
 * second entry is consumed here and counts towards @nr.
 */
void release_pages(release_pages_arg arg, int nr)
{
        struct folio_batch fbatch;
        /* Matches the 'unsigned int *' that folios_put_refs() takes. */
        unsigned int refs[PAGEVEC_SIZE];
        struct encoded_page **encoded = arg.encoded_pages;
        int i;

        folio_batch_init(&fbatch);
        for (i = 0; i < nr; i++) {
                /* Turn any of the argument types into a folio */
                struct folio *folio = page_folio(encoded_page_ptr(encoded[i]));

                /* Is our next entry actually "nr_pages" -> "nr_refs" ? */
                refs[fbatch.nr] = 1;
                if (unlikely(encoded_page_flags(encoded[i]) &
                             ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                        refs[fbatch.nr] = encoded_nr_pages(encoded[++i]);

                /* Keep filling while the batch still has room. */
                if (folio_batch_add(&fbatch, folio) > 0)
                        continue;
                /* Batch full: drop the refs; it comes back empty for reuse. */
                folios_put_refs(&fbatch, refs);
        }

        /* Flush whatever is left in a partially filled batch. */
        if (fbatch.nr)
                folios_put_refs(&fbatch, refs);
}
EXPORT_SYMBOL(release_pages);

/*
 * The folios which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those folios may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __folio_batch_release() will drain those queues here.
 * folio_batch_move_lru() calls folios_put() directly to avoid
 * mutual recursion.
 *
 * The drain is done at most once per batch: @fbatch->percpu_pvec_drained
 * records that it has already happened.
 */
void __folio_batch_release(struct folio_batch *fbatch)
{
        if (!fbatch->percpu_pvec_drained) {
                lru_add_drain();
                fbatch->percpu_pvec_drained = true;
        }
        folios_put(fbatch);
}
EXPORT_SYMBOL(__folio_batch_release);

/**
 * folio_batch_remove_exceptionals() - Prune non-folios from a batch.
 * @fbatch: The batch to prune
 *
 * A batch filled by find_get_entries() holds folios mixed with
 * shadow/swap/DAX value entries.  Drop every value entry and pack the
 * remaining folios together, in their original order, so that @fbatch can
 * be handed to operations that expect folios only.
 */
void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
{
        unsigned int src, kept = 0;

        for (src = 0; src < folio_batch_count(fbatch); src++) {
                struct folio *entry = fbatch->folios[src];

                /* Value entries are not folios: leave them behind. */
                if (xa_is_value(entry))
                        continue;

                fbatch->folios[kept] = entry;
                kept++;
        }

        fbatch->nr = kept;
}

/*
 * Perform any setup for the swap system: pick the initial page_cluster
 * value from the amount of RAM.
 */
void __init swap_setup(void)
{
        unsigned long ram_mb = totalram_pages() >> (20 - PAGE_SHIFT);

        /*
         * Small-memory machines (under 16MB) get a smaller cluster.
         * Right now other parts of the system mean that we _really_
         * don't want to cluster much more than this.
         */
        page_cluster = (ram_mb < 16) ? 2 : 3;
}





























































































































































































































































































































































    1 




    1 









    1 


































































































































































































































































































































































































































    1 






















    1 











    1 












    1 

















































    1 




















































    1 



















    1 




















    1 


















































































    1 








































































































































































































































































































































































































































































































































































































































































    1 

    1 




















    1 

    1 



    1 























    1 

    1 









    1 

    1 












    1 














    1 


















    1 
















    1 












    1 






    1 












    1 
















    1 
























    1 












    1 

    1 














    1 


































    1 
    1 

    1 
    1 






    1 

    1 
    1 




    1 





    1 























    1 




    1 






















    1 









    1 






    1 







    1 
















    1 











    1 




    1 











    1 

































































    1 

















    1 








    1 





















































































































































































































































































































































    1 

    1 
































































































































































    2 
















































































































































































































































    1 





























    1 











































































    1 

















    1 
    1 
























    1 





















    1 




































































































    1 
































































    1 









    1 
































    1 











    1 
















    1 











    1 






































    1 


























    1 




    1 




    1 
























































































































    1 






    1 
    1 













































    1 
    1 




































































































































    2 







    2 










    2 






    2 









































































































































































































    2 
































    2 











    2 

























































































    2 



    2 





















































































    2 
























    2 












    2 

































































    2 




































    2 







    2 


















    2 






    2 







    2 



    1 


    2 

























    1 









    2 





    2 













    2 







    2 























    2 

    1 

























    2 



    2 





















    2 




















    2 
    2 
















    2 





































































    2 








    2 

    1 













    1 
    1 

































































    1 















    1 






























































































    1 

































    1 













    1 









































































































































    1 











































    1 



















    1 






























    1 















    1 














    1 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "ctree.h"
#include "extent-tree.h"
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "discard.h"
#include "zoned.h"
#include "dev-replace.h"
#include "fs.h"
#include "accessors.h"
#include "root-tree.h"
#include "file-item.h"
#include "orphan.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"

#undef SCRAMBLE_DELAYED_REFS


/* Forward declarations for static helpers defined elsewhere in this file. */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                               struct btrfs_delayed_ref_head *href,
                               struct btrfs_delayed_ref_node *node,
                               struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      u64 parent, u64 root_objectid,
                                      u64 flags, u64 owner, u64 offset,
                                      struct btrfs_key *ins, int ref_mod, u64 oref_root);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_delayed_ref_node *node,
                                     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);

/*
 * Check whether a block group carries every flag in @bits.
 *
 * Returns 1 when all bits of @bits are set in cache->flags, 0 otherwise.
 */
static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
        const u64 present = cache->flags & bits;

        if (present != bits)
                return 0;

        return 1;
}

/*
 * Search the extent tree for a data extent item.
 *
 * Looks for the key (@start, BTRFS_EXTENT_ITEM_KEY, @len) in the extent root
 * that covers @start, using a read-only search (no transaction, no
 * insertion, no COW).
 *
 * Returns -ENOMEM if no path can be allocated, otherwise the return value of
 * btrfs_search_slot() is passed through unchanged.
 */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
        struct btrfs_root *extent_root = btrfs_extent_root(fs_info, start);
        struct btrfs_key search_key = {
                .objectid = start,
                .type = BTRFS_EXTENT_ITEM_KEY,
                .offset = len,
        };
        struct btrfs_path *search_path;
        int result;

        search_path = btrfs_alloc_path();
        if (search_path == NULL)
                return -ENOMEM;

        result = btrfs_search_slot(NULL, extent_root, &search_key,
                                   search_path, 0, 0);
        btrfs_free_path(search_path);

        return result;
}

/*
 * Look up the reference count and flags of an extent, optionally folding in
 * the state queued on its delayed ref head.
 *
 * The extent item found in the extent tree supplies the base reference
 * count, the flags and the owning root.  When a transaction handle is given,
 * the delayed ref head for @bytenr (if one exists) is consulted as well: its
 * ref_mod is added to the reference count and, when its extent op requests a
 * flags update, flags_to_set is OR-ed into the flags.  The reported values
 * therefore include the modifications still queued on that head.
 *
 * @trans:       transaction handle, or NULL to search the commit root
 *               without locking and without looking at delayed refs
 * @fs_info:     filesystem the extent belongs to
 * @bytenr:      objectid of the extent key to look up
 * @offset:      offset of the extent key; replaced by fs_info->nodesize when
 *               @metadata is set but SKINNY_METADATA is not enabled
 * @metadata:    non-zero to search for a BTRFS_METADATA_ITEM_KEY, zero to
 *               search for a BTRFS_EXTENT_ITEM_KEY
 * @refs:        optional output for the reference count
 * @flags:       optional output for the extent flags
 * @owning_root: optional output for the owner root read from the extent item
 *               (the local it is copied from is initialised to 0)
 *
 * Returns 0 on success (also when no extent item exists, in which case the
 * on-disk count is taken as 0), -ENOMEM if the path cannot be allocated,
 * -EUCLEAN if the item is smaller than struct btrfs_extent_item, or the
 * negative error from btrfs_search_slot().  The outputs are only written on
 * the success path.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info, u64 bytenr,
                             u64 offset, int metadata, u64 *refs, u64 *flags,
                             u64 *owning_root)
{
        struct btrfs_root *extent_root;
        struct btrfs_delayed_ref_head *head;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u32 item_size;
        u64 num_refs;
        u64 extent_flags;
        u64 owner = 0;
        int ret;

        /*
         * If we don't have skinny metadata, don't bother doing anything
         * different
         */
        if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
                offset = fs_info->nodesize;
                metadata = 0;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Without a transaction, read the commit root and skip tree locking. */
        if (!trans) {
                path->skip_locking = 1;
                path->search_commit_root = 1;
        }

        /* Re-entered after waiting on a contended delayed ref head mutex. */
search_again:
        key.objectid = bytenr;
        key.offset = offset;
        if (metadata)
                key.type = BTRFS_METADATA_ITEM_KEY;
        else
                key.type = BTRFS_EXTENT_ITEM_KEY;

        extent_root = btrfs_extent_root(fs_info, bytenr);
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out_free;

        /*
         * No exact metadata item: check whether the previous slot holds an
         * extent item for the same bytenr with offset == nodesize and, if
         * so, treat it as a match.
         */
        if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
                if (path->slots[0]) {
                        path->slots[0]--;
                        btrfs_item_key_to_cpu(path->nodes[0], &key,
                                              path->slots[0]);
                        if (key.objectid == bytenr &&
                            key.type == BTRFS_EXTENT_ITEM_KEY &&
                            key.offset == fs_info->nodesize)
                                ret = 0;
                }
        }

        if (ret == 0) {
                /* Item found: read refs, flags and owner from the leaf. */
                leaf = path->nodes[0];
                item_size = btrfs_item_size(leaf, path->slots[0]);
                if (item_size >= sizeof(*ei)) {
                        ei = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_extent_item);
                        num_refs = btrfs_extent_refs(leaf, ei);
                        extent_flags = btrfs_extent_flags(leaf, ei);
                        owner = btrfs_get_extent_owner_root(fs_info, leaf,
                                                            path->slots[0]);
                } else {
                        /* Item too small to hold a struct btrfs_extent_item. */
                        ret = -EUCLEAN;
                        btrfs_err(fs_info,
                        "unexpected extent item size, has %u expect >= %zu",
                                  item_size, sizeof(*ei));
                        if (trans)
                                btrfs_abort_transaction(trans, ret);
                        else
                                btrfs_handle_fs_error(fs_info, ret, NULL);

                        goto out_free;
                }

                BUG_ON(num_refs == 0);
        } else {
                /* No item in the tree: start from zero and report success. */
                num_refs = 0;
                extent_flags = 0;
                ret = 0;
        }

        /* Delayed refs are reached through the transaction handle only. */
        if (!trans)
                goto out;

        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
        if (head) {
                if (!mutex_trylock(&head->mutex)) {
                        /*
                         * Take a reference on the head before dropping
                         * delayed_refs->lock; it is put again after the wait.
                         */
                        refcount_inc(&head->refs);
                        spin_unlock(&delayed_refs->lock);

                        btrfs_release_path(path);

                        /*
                         * Mutex was contended, block until it's released and try
                         * again
                         */
                        mutex_lock(&head->mutex);
                        mutex_unlock(&head->mutex);
                        btrfs_put_delayed_ref_head(head);
                        goto search_again;
                }
                spin_lock(&head->lock);
                /*
                 * Fold in a queued flags update if there is one; otherwise
                 * the count read from the tree must be non-zero.
                 */
                if (head->extent_op && head->extent_op->update_flags)
                        extent_flags |= head->extent_op->flags_to_set;
                else
                        BUG_ON(num_refs == 0);

                /* Apply the head's ref_mod to the count read from the tree. */
                num_refs += head->ref_mod;
                spin_unlock(&head->lock);
                mutex_unlock(&head->mutex);
        }
        spin_unlock(&delayed_refs->lock);
out:
        /* Success path: warn on a zero count and fill the optional outputs. */
        WARN_ON(num_refs == 0);
        if (refs)
                *refs = num_refs;
        if (flags)
                *flags = extent_flags;
        if (owning_root)
                *owning_root = owner;
out_free:
        btrfs_free_path(path);
        return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. These information allow us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of tree block is recorded
 * in the back refs. Actually the full back refs is generic, and can be
 * used in all cases the implicit back refs is used. The major shortcoming
 * of the full back refs is its overhead. Every time a tree block gets
 * COWed, we have to update back refs entry for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in old transaction, the
 * only way to drop a reference to it is COW it. So we can detect the
 * event that tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
 *
 * When a file extent is allocated, The implicit back refs is used.
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of key. The key offset for the implicit back refs is
 * objectid of block's owner tree. The key offset for the full back refs
 * is the first byte of parent block.
 *
 * When implicit back refs is used, information about the lowest key and
 * level of the tree block are required. These information are stored in
 * tree block info structure.
 */

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
                                     struct btrfs_extent_inline_ref *iref,
                                     enum btrfs_inline_ref_type is_data)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        int type = btrfs_extent_inline_ref_type(eb, iref);
        u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

        if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
                ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
                return type;
        }

        if (type == BTRFS_TREE_BLOCK_REF_KEY ||
            type == BTRFS_SHARED_BLOCK_REF_KEY ||
            type == BTRFS_SHARED_DATA_REF_KEY ||
            type == BTRFS_EXTENT_DATA_REF_KEY) {
                if (is_data == BTRFS_REF_TYPE_BLOCK) {
                        if (type == BTRFS_TREE_BLOCK_REF_KEY)
                                return type;
                        if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
                                ASSERT(fs_info);
                                /*
                                 * Every shared one has parent tree block,
                                 * which must be aligned to sector size.
                                 */
                                if (offset && IS_ALIGNED(offset, fs_info->sectorsize))
                                        return type;
                        }
                } else if (is_data == BTRFS_REF_TYPE_DATA) {
                        if (type == BTRFS_EXTENT_DATA_REF_KEY)
                                return type;
                        if (type == BTRFS_SHARED_DATA_REF_KEY) {
                                ASSERT(fs_info);
                                /*
                                 * Every shared one has parent tree block,
                                 * which must be aligned to sector size.
                                 */
                                if (offset &&
                                    IS_ALIGNED(offset, fs_info->sectorsize))
                                        return type;
                        }
                } else {
                        ASSERT(is_data == BTRFS_REF_TYPE_ANY);
                        return type;
                }
        }

        WARN_ON(1);
        btrfs_print_leaf(eb);
        btrfs_err(fs_info,
                  "eb %llu iref 0x%lx invalid extent inline ref type %d",
                  eb->start, (unsigned long)iref, type);

        return BTRFS_REF_TYPE_INVALID;
}

/*
 * Hash (root objectid, owner, offset) into a 64-bit value.
 *
 * The root objectid feeds one crc32c, owner and offset are chained through a
 * second one, and the two results are combined.  The value is used as a key
 * offset elsewhere in this file, so the exact combination (including the
 * shift by 31) must not be altered.
 */
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
        const __le64 le_root = cpu_to_le64(root_objectid);
        const __le64 le_owner = cpu_to_le64(owner);
        const __le64 le_offset = cpu_to_le64(offset);
        u32 crc_hi;
        u32 crc_lo;

        crc_hi = crc32c(~(u32)0, &le_root, sizeof(le_root));

        crc_lo = crc32c(~(u32)0, &le_owner, sizeof(le_owner));
        crc_lo = crc32c(crc_lo, &le_offset, sizeof(le_offset));

        return ((u64)crc_hi << 31) ^ (u64)crc_lo;
}

/*
 * Compute hash_extent_data_ref() over the root, objectid and offset fields
 * read from an extent data ref stored in @leaf.
 */
static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
                                     struct btrfs_extent_data_ref *ref)
{
        const u64 ref_root = btrfs_extent_data_ref_root(leaf, ref);
        const u64 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
        const u64 ref_offset = btrfs_extent_data_ref_offset(leaf, ref);

        return hash_extent_data_ref(ref_root, ref_objectid, ref_offset);
}

/*
 * Return 1 if the extent data ref stored in @leaf has exactly the given
 * root objectid, owner and offset, 0 otherwise.
 */
static int match_extent_data_ref(struct extent_buffer *leaf,
                                 struct btrfs_extent_data_ref *ref,
                                 u64 root_objectid, u64 owner, u64 offset)
{
        return btrfs_extent_data_ref_root(leaf, ref) == root_objectid &&
               btrfs_extent_data_ref_objectid(leaf, ref) == owner &&
               btrfs_extent_data_ref_offset(leaf, ref) == offset;
}

/*
 * Look up the keyed (non-inline) data back ref item for the extent at @bytenr.
 *
 * With a non-zero @parent a BTRFS_SHARED_DATA_REF_KEY item keyed by the parent
 * is searched for directly.  Otherwise the search starts at the
 * BTRFS_EXTENT_DATA_REF_KEY item whose offset is the hash of
 * (@root_objectid, @owner, @offset) and walks forward over the following
 * BTRFS_EXTENT_DATA_REF_KEY items of the same extent until one whose content
 * matches is found.
 *
 * Returns 0 with @path pointing at the matching item, -ENOENT if there is no
 * such item, or another negative errno from the tree search.
 */
static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid,
                                           u64 owner, u64 offset)
{
        struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref;
        struct extent_buffer *leaf;
        u32 nritems;
        int recow;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
        }
again:
        recow = 0;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                return ret;

        /* Shared refs are keyed by the parent, only an exact hit counts. */
        if (parent)
                return ret ? -ENOENT : 0;

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
        while (1) {
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                return ret;
                        if (ret > 0)
                                return -ENOENT;

                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        /*
                         * We moved to another leaf: a match found from here
                         * on triggers a fresh search instead of being
                         * returned directly (NOTE(review): presumably because
                         * this leaf was not prepared for modification by the
                         * search above - confirm).
                         */
                        recow = 1;
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                /*
                 * Ran past the data refs of this extent.  Return -ENOENT
                 * explicitly: ret may be 0 here after a successful
                 * btrfs_next_leaf() and must not leak out as success.
                 */
                if (key.objectid != bytenr ||
                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
                        return -ENOENT;

                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);

                if (match_extent_data_ref(leaf, ref, root_objectid,
                                          owner, offset)) {
                        if (recow) {
                                /* key now holds the matching item's key. */
                                btrfs_release_path(path);
                                goto again;
                        }
                        return 0;
                }
                path->slots[0]++;
        }
}

/*
 * Insert a keyed data back ref item for the extent at @bytenr, or bump the
 * count of an already existing one by node->ref_mod.
 *
 * A node with a parent uses a BTRFS_SHARED_DATA_REF_KEY item keyed by the
 * parent.  Otherwise a BTRFS_EXTENT_DATA_REF_KEY item keyed by the hash of
 * (ref_root, owner, offset) is used; if that key is taken by an item with
 * different content, the key offset is incremented until a free key or the
 * matching item is found.
 *
 * The path is released before returning.  Returns 0 on success or a negative
 * errno from the item insertion.
 */
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_path *path,
                                           struct btrfs_delayed_ref_node *node,
                                           u64 bytenr)
{
        struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
        const u64 owner = btrfs_delayed_ref_owner(node);
        const u64 offset = btrfs_delayed_ref_offset(node);
        const bool shared = (node->parent != 0);
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u32 item_size;
        u32 count;
        int ret;

        key.objectid = bytenr;
        if (shared) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = node->parent;
                item_size = sizeof(struct btrfs_shared_data_ref);
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(node->ref_root, owner, offset);
                item_size = sizeof(struct btrfs_extent_data_ref);
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, item_size);
        if (ret != 0 && ret != -EEXIST)
                goto out;

        /* From here on ret is either 0 (new item) or -EEXIST. */
        leaf = path->nodes[0];
        if (shared) {
                struct btrfs_shared_data_ref *sref;

                sref = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                count = node->ref_mod;
                if (ret == -EEXIST)
                        count += btrfs_shared_data_ref_count(leaf, sref);
                btrfs_set_shared_data_ref_count(leaf, sref, count);
        } else {
                struct btrfs_extent_data_ref *dref;

                /* Probe successive key offsets while the slot is taken. */
                while (ret == -EEXIST) {
                        dref = btrfs_item_ptr(leaf, path->slots[0],
                                              struct btrfs_extent_data_ref);
                        if (match_extent_data_ref(leaf, dref, node->ref_root,
                                                  owner, offset))
                                break;

                        btrfs_release_path(path);
                        key.offset++;
                        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                                      item_size);
                        if (ret != 0 && ret != -EEXIST)
                                goto out;

                        leaf = path->nodes[0];
                }

                dref = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                if (ret == -EEXIST) {
                        /* Matching item already there, just add to it. */
                        count = btrfs_extent_data_ref_count(leaf, dref);
                        count += node->ref_mod;
                        btrfs_set_extent_data_ref_count(leaf, dref, count);
                } else {
                        btrfs_set_extent_data_ref_root(leaf, dref, node->ref_root);
                        btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
                        btrfs_set_extent_data_ref_offset(leaf, dref, offset);
                        btrfs_set_extent_data_ref_count(leaf, dref, node->ref_mod);
                }
        }
        btrfs_mark_buffer_dirty(trans, leaf);
        ret = 0;
out:
        btrfs_release_path(path);
        return ret;
}

/*
 * Drop @refs_to_drop references from the keyed data back ref item that @path
 * currently points to.
 *
 * The item is deleted when its count reaches zero, otherwise the reduced
 * count is written back and the leaf is marked dirty.  An item that is
 * neither an extent data ref nor a shared data ref aborts the transaction and
 * yields -EUCLEAN.
 *
 * Returns 0, -EUCLEAN, or the result of btrfs_del_item().
 */
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           int refs_to_drop)
{
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_extent_data_ref *dref = NULL;
        struct btrfs_shared_data_ref *sref = NULL;
        struct btrfs_key key;
        u32 count = 0;

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        switch (key.type) {
        case BTRFS_EXTENT_DATA_REF_KEY:
                dref = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                count = btrfs_extent_data_ref_count(leaf, dref);
                break;
        case BTRFS_SHARED_DATA_REF_KEY:
                sref = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                count = btrfs_shared_data_ref_count(leaf, sref);
                break;
        default:
                btrfs_err(trans->fs_info,
                          "unrecognized backref key (%llu %u %llu)",
                          key.objectid, key.type, key.offset);
                btrfs_abort_transaction(trans, -EUCLEAN);
                return -EUCLEAN;
        }

        BUG_ON(count < refs_to_drop);
        count -= refs_to_drop;

        /* Last reference gone: the whole item goes away. */
        if (count == 0)
                return btrfs_del_item(trans, root, path);

        if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
                btrfs_set_extent_data_ref_count(leaf, dref, count);
        else
                btrfs_set_shared_data_ref_count(leaf, sref, count);
        btrfs_mark_buffer_dirty(trans, leaf);

        return 0;
}

/*
 * Return the reference count stored in a data back ref.
 *
 * When @iref is given, the count is read from that inline ref; otherwise it
 * is read from the keyed item @path points to.  A keyed item of an unexpected
 * type triggers a warning and yields 0.
 */
static noinline u32 extent_data_ref_count(struct btrfs_path *path,
                                          struct btrfs_extent_inline_ref *iref)
{
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_extent_data_ref *dref;
        struct btrfs_shared_data_ref *sref;
        struct btrfs_key key;
        int type;

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        if (!iref) {
                /* Keyed back ref item: dispatch on the item's key type. */
                switch (key.type) {
                case BTRFS_EXTENT_DATA_REF_KEY:
                        dref = btrfs_item_ptr(leaf, path->slots[0],
                                              struct btrfs_extent_data_ref);
                        return btrfs_extent_data_ref_count(leaf, dref);
                case BTRFS_SHARED_DATA_REF_KEY:
                        sref = btrfs_item_ptr(leaf, path->slots[0],
                                              struct btrfs_shared_data_ref);
                        return btrfs_shared_data_ref_count(leaf, sref);
                default:
                        WARN_ON(1);
                        return 0;
                }
        }

        /*
         * If type is invalid, we should have bailed out earlier than
         * this call.
         */
        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
        ASSERT(type != BTRFS_REF_TYPE_INVALID);

        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                /* The data ref overlays the inline ref's offset field. */
                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                return btrfs_extent_data_ref_count(leaf, dref);
        }

        /* The shared data ref directly follows the inline ref header. */
        sref = (struct btrfs_shared_data_ref *)(iref + 1);
        return btrfs_shared_data_ref_count(leaf, sref);
}

/*
 * Look up the keyed back ref item of a tree block at @bytenr.
 *
 * The key is (bytenr, BTRFS_SHARED_BLOCK_REF_KEY, parent) when @parent is
 * non-zero and (bytenr, BTRFS_TREE_BLOCK_REF_KEY, root_objectid) otherwise.
 *
 * Returns 0 if found, -ENOENT if not, or a negative errno from the search.
 */
static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        key.type = parent ? BTRFS_SHARED_BLOCK_REF_KEY :
                            BTRFS_TREE_BLOCK_REF_KEY;
        key.offset = parent ? parent : root_objectid;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);

        return (ret > 0) ? -ENOENT : ret;
}

/*
 * Insert an empty keyed back ref item for the tree block at @bytenr.
 *
 * The key is (bytenr, BTRFS_SHARED_BLOCK_REF_KEY, node->parent) when the node
 * has a parent and (bytenr, BTRFS_TREE_BLOCK_REF_KEY, node->ref_root)
 * otherwise.  The path is released before returning.
 *
 * Returns the result of btrfs_insert_empty_item().
 */
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_path *path,
                                          struct btrfs_delayed_ref_node *node,
                                          u64 bytenr)
{
        struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        key.type = BTRFS_TREE_BLOCK_REF_KEY;
        key.offset = node->ref_root;
        if (node->parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = node->parent;
        }

        /* The back ref carries no payload, all information is in the key. */
        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
        btrfs_release_path(path);

        return ret;
}

/*
 * Pick the back ref key type for a reference.
 *
 * An owner below BTRFS_FIRST_FREE_OBJECTID selects the block ref types, any
 * other owner the data ref types; a non-zero parent selects the shared
 * variant of either.
 */
static inline int extent_ref_type(u64 parent, u64 owner)
{
        const bool block_ref = owner < BTRFS_FIRST_FREE_OBJECTID;

        if (parent > 0)
                return block_ref ? BTRFS_SHARED_BLOCK_REF_KEY :
                                   BTRFS_SHARED_DATA_REF_KEY;

        return block_ref ? BTRFS_TREE_BLOCK_REF_KEY :
                           BTRFS_EXTENT_DATA_REF_KEY;
}

/*
 * Find the key that follows the current position of @path.
 *
 * Starting at @level and moving up, the first level whose node has a slot
 * after the current one provides the key, which is copied into @key.
 *
 * Returns 0 if a key was found, 1 if no populated level has a following slot.
 */
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key)
{
        while (level < BTRFS_MAX_LEVEL && path->nodes[level]) {
                struct extent_buffer *eb = path->nodes[level];

                if (path->slots[level] + 1 < btrfs_header_nritems(eb)) {
                        /* Level 0 holds items, upper levels hold node keys. */
                        if (level == 0)
                                btrfs_item_key_to_cpu(eb, key,
                                                      path->slots[level] + 1);
                        else
                                btrfs_node_key_to_cpu(eb, key,
                                                      path->slots[level] + 1);
                        return 0;
                }

                /* Already at the last slot here, try one level up. */
                level++;
        }

        return 1;
}

/*
 * Look for an inline back ref inside the extent item of the extent at
 * @bytenr.
 *
 * If the back ref is found, *ref_ret is set to the address of the inline
 * back ref, and 0 is returned.
 *
 * If the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.  (If the extent item itself
 * is not found and @insert is false, -ENOENT is returned without touching
 * *ref_ret.)
 *
 * If @insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * Any other negative return (a tree search failure, or -EUCLEAN for a missing
 * or malformed extent item) leaves *ref_ret untouched.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *         items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref **ref_ret,
                                 u64 bytenr, u64 num_bytes,
                                 u64 parent, u64 root_objectid,
                                 u64 owner, u64 offset, int insert)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        u64 flags;
        u64 item_size;
        unsigned long ptr;
        unsigned long end;
        int extra_size;
        int type;
        int want;
        int ret;
        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
        int needed;

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;

        /* The inline ref type we are looking for (or would insert). */
        want = extent_ref_type(parent, owner);
        if (insert) {
                /* Ask the search to leave room for one more inline ref. */
                extra_size = btrfs_extent_inline_ref_size(want);
                path->search_for_extension = 1;
                path->keep_locks = 1;
        } else
                extra_size = -1;

        /*
         * Owner is our level, so we can just add one to get the level for the
         * block we are interested in.
         */
        if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
                key.type = BTRFS_METADATA_ITEM_KEY;
                key.offset = owner;
        }

again:
        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
        if (ret < 0)
                goto out;

        /*
         * We may be a newly converted file system which still has the old fat
         * extent entries for metadata, so try and see if we have one of those.
         */
        if (ret > 0 && skinny_metadata) {
                /* Cleared so that this fallback is attempted only once. */
                skinny_metadata = false;
                if (path->slots[0]) {
                        /* Check whether the previous item is the fat one. */
                        path->slots[0]--;
                        btrfs_item_key_to_cpu(path->nodes[0], &key,
                                              path->slots[0]);
                        if (key.objectid == bytenr &&
                            key.type == BTRFS_EXTENT_ITEM_KEY &&
                            key.offset == num_bytes)
                                ret = 0;
                }
                if (ret) {
                        /* Not adjacent, search for the fat key explicitly. */
                        key.objectid = bytenr;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
                        key.offset = num_bytes;
                        btrfs_release_path(path);
                        goto again;
                }
        }

        /*
         * No extent item: a plain lookup just reports -ENOENT, while for an
         * insert this is reported as corruption.
         */
        if (ret && !insert) {
                ret = -ENOENT;
                goto out;
        } else if (WARN_ON(ret)) {
                btrfs_print_leaf(path->nodes[0]);
                btrfs_err(fs_info,
"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
                          bytenr, num_bytes, parent, root_objectid, owner,
                          offset);
                ret = -EUCLEAN;
                goto out;
        }

        leaf = path->nodes[0];
        item_size = btrfs_item_size(leaf, path->slots[0]);
        if (unlikely(item_size < sizeof(*ei))) {
                ret = -EUCLEAN;
                btrfs_err(fs_info,
                          "unexpected extent item size, has %llu expect >= %zu",
                          item_size, sizeof(*ei));
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(leaf, ei);

        /* [ptr, end) is the area following the fixed extent item header. */
        ptr = (unsigned long)(ei + 1);
        end = (unsigned long)ei + item_size;

        /* Fat tree block items carry a tree block info before the refs. */
        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
                ptr += sizeof(struct btrfs_tree_block_info);
                BUG_ON(ptr > end);
        }

        if (owner >= BTRFS_FIRST_FREE_OBJECTID)
                needed = BTRFS_REF_TYPE_DATA;
        else
                needed = BTRFS_REF_TYPE_BLOCK;

        /*
         * Walk the inline refs.  The loop ends with ret == 0 and ptr at the
         * matching ref, or with ret == -ENOENT and ptr at the position where
         * the walk stopped.
         */
        ret = -ENOENT;
        while (ptr < end) {
                iref = (struct btrfs_extent_inline_ref *)ptr;
                type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
                /* Owner refs are not back refs we look for, step over them. */
                if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
                        ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
                        ptr += btrfs_extent_inline_ref_size(type);
                        continue;
                }
                if (type == BTRFS_REF_TYPE_INVALID) {
                        ret = -EUCLEAN;
                        goto out;
                }

                /* Stop at a larger type, skip over smaller ones. */
                if (want < type)
                        break;
                if (want > type) {
                        ptr += btrfs_extent_inline_ref_size(type);
                        continue;
                }

                /* Same type: compare the ref's content with what we want. */
                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                        struct btrfs_extent_data_ref *dref;
                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                        if (match_extent_data_ref(leaf, dref, root_objectid,
                                                  owner, offset)) {
                                ret = 0;
                                break;
                        }
                        /* Stop once this ref's hash is below the wanted one. */
                        if (hash_extent_data_ref_item(leaf, dref) <
                            hash_extent_data_ref(root_objectid, owner, offset))
                                break;
                } else {
                        u64 ref_offset;
                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
                        /* Shared refs compare the parent, others the root. */
                        if (parent > 0) {
                                if (parent == ref_offset) {
                                        ret = 0;
                                        break;
                                }
                                if (ref_offset < parent)
                                        break;
                        } else {
                                if (root_objectid == ref_offset) {
                                        ret = 0;
                                        break;
                                }
                                if (ref_offset < root_objectid)
                                        break;
                        }
                }
                ptr += btrfs_extent_inline_ref_size(type);
        }

        /* The refs did not end exactly at the end of the item. */
        if (unlikely(ptr > end)) {
                ret = -EUCLEAN;
                btrfs_print_leaf(path->nodes[0]);
                btrfs_crit(fs_info,
"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
                           path->slots[0], root_objectid, owner, offset, parent);
                goto out;
        }

        if (ret == -ENOENT && insert) {
                /* The item would get too big with one more inline ref. */
                if (item_size + extra_size >=
                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
                        ret = -EAGAIN;
                        goto out;
                }
                /*
                 * To add new inline back ref, we have to make sure
                 * there is no corresponding back ref item.
                 * For simplicity, we just do not add new inline back
                 * ref if there is any kind of item for this block
                 */
                if (find_next_key(path, 0, &key) == 0 &&
                    key.objectid == bytenr &&
                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        ret = -EAGAIN;
                        goto out;
                }
        }
        /* Reached only with ret == 0 or ret == -ENOENT. */
        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
        if (insert) {
                /* Undo the path settings made for the insert case above. */
                path->keep_locks = 0;
                path->search_for_extension = 0;
                btrfs_unlock_up_safe(path, 1);
        }
        return ret;
}

/*
 * Helper to add a new inline back ref.
 *
 * @path points to the extent item and @iref to the position inside it where
 * the new ref has to go.  The item is extended by the size of the new ref,
 * the extent's ref count is raised by @refs_to_add, @extent_op (if any) is
 * applied, the refs located after the insertion point are moved up and the
 * new ref is filled in.  The leaf is marked dirty.
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref *iref,
                                 u64 parent, u64 root_objectid,
                                 u64 owner, u64 offset, int refs_to_add,
                                 struct btrfs_delayed_extent_op *extent_op)
{
        struct extent_buffer *leaf = path->nodes[0];
        const int type = extent_ref_type(parent, owner);
        const int size = btrfs_extent_inline_ref_size(type);
        struct btrfs_extent_item *ei;
        unsigned long insert_off;
        unsigned long dst;
        unsigned long item_end;
        u64 refs;

        /*
         * Remember the insertion point relative to the item start, the item
         * is looked up again after it has been extended.
         */
        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        insert_off = (unsigned long)iref - (unsigned long)ei;

        btrfs_extend_item(trans, path, size);

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, ei);
        refs += refs_to_add;
        btrfs_set_extent_refs(leaf, ei, refs);
        if (extent_op)
                __run_delayed_extent_op(extent_op, leaf, ei);

        /* Open a gap of @size bytes at the insertion point. */
        dst = (unsigned long)ei + insert_off;
        item_end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
        if (dst < item_end - size)
                memmove_extent_buffer(leaf, dst + size, dst,
                                      item_end - size - dst);

        iref = (struct btrfs_extent_inline_ref *)dst;
        btrfs_set_extent_inline_ref_type(leaf, iref, type);

        switch (type) {
        case BTRFS_EXTENT_DATA_REF_KEY: {
                struct btrfs_extent_data_ref *dref;

                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
                break;
        }
        case BTRFS_SHARED_DATA_REF_KEY: {
                struct btrfs_shared_data_ref *sref;

                sref = (struct btrfs_shared_data_ref *)(iref + 1);
                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
                break;
        }
        case BTRFS_SHARED_BLOCK_REF_KEY:
                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
                break;
        default:
                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
                break;
        }

        btrfs_mark_buffer_dirty(trans, leaf);
}

/*
 * Find the back ref of an extent, inline or keyed.
 *
 * The inline refs of the extent item are tried first.  Only if that lookup
 * reports -ENOENT is the path released, *ref_ret set to NULL and the keyed
 * back ref item looked up: a tree block ref for owners below
 * BTRFS_FIRST_FREE_OBJECTID, a data ref otherwise.
 *
 * Returns the result of whichever lookup was performed last.
 */
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref **ref_ret,
                                 u64 bytenr, u64 num_bytes, u64 parent,
                                 u64 root_objectid, u64 owner, u64 offset)
{
        int ret;

        ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
                                           num_bytes, parent, root_objectid,
                                           owner, offset, 0);
        if (ret != -ENOENT)
                return ret;

        /* No inline ref, fall back to the keyed back ref items. */
        btrfs_release_path(path);
        *ref_ret = NULL;

        if (owner >= BTRFS_FIRST_FREE_OBJECTID)
                return lookup_extent_data_ref(trans, path, bytenr, parent,
                                              root_objectid, owner, offset);

        return lookup_tree_block_ref(trans, path, bytenr, parent,
                                     root_objectid);
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack int update_inline_extent_backref(
                                  struct btrfs_trans_handle *trans,
                                  struct btrfs_path *path,
                                  struct btrfs_extent_inline_ref *iref,
                                  int refs_to_mod,
                                  struct btrfs_delayed_extent_op *extent_op)
{
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_data_ref *dref = NULL;
        struct btrfs_shared_data_ref *sref = NULL;
        unsigned long ptr;
        unsigned long end;
        u32 item_size;
        int size;
        int type;
        u64 refs;

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, ei);
        if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
                struct btrfs_key key;
                u32 extent_size;

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.type == BTRFS_METADATA_ITEM_KEY)
                        extent_size = fs_info->nodesize;
                else
                        extent_size = key.offset;
                btrfs_print_leaf(leaf);
                btrfs_err(fs_info,
        "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
                          key.objectid, extent_size, refs_to_mod, refs);
                return -EUCLEAN;
        }
        refs += refs_to_mod;
        btrfs_set_extent_refs(leaf, ei, refs);
        if (extent_op)
                __run_delayed_extent_op(extent_op, leaf, ei);

        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
        /*
         * Function btrfs_get_extent_inline_ref_type() has already printed
         * error messages.
         */
        if (unlikely(type == BTRFS_REF_TYPE_INVALID))
                return -EUCLEAN;

        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                refs = btrfs_extent_data_ref_count(leaf, dref);
        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
                sref = (struct btrfs_shared_data_ref *)(iref + 1);
                refs = btrfs_shared_data_ref_count(leaf, sref);
        } else {
                refs = 1;
                /*
                 * For tree blocks we can only drop one ref for it, and tree
                 * blocks should not have refs > 1.
                 *
                 * Furthermore if we're inserting a new inline backref, we
                 * won't reach this path either. That would be
                 * setup_inline_extent_backref().
                 */
                if (unlikely(refs_to_mod != -1)) {
                        struct btrfs_key key;

                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

                        btrfs_print_leaf(leaf);
                        btrfs_err(fs_info,
                        "invalid refs_to_mod for tree block %llu, has %d expect -1",
                                  key.objectid, refs_to_mod);
                        return -EUCLEAN;
                }
        }

        if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
                struct btrfs_key key;
                u32 extent_size;

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.type == BTRFS_METADATA_ITEM_KEY)
                        extent_size = fs_info->nodesize;
                else
                        extent_size = key.offset;
                btrfs_print_leaf(leaf);
                btrfs_err(fs_info,
"invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
                          (unsigned long)iref, key.objectid, extent_size,
                          refs_to_mod, refs);
                return -EUCLEAN;
        }
        refs += refs_to_mod;

        if (refs > 0) {
                if (type == BTRFS_EXTENT_DATA_REF_KEY)
                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
                else
                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
        } else {
                size =  btrfs_extent_inline_ref_size(type);
                item_size = btrfs_item_size(leaf, path->slots[0]);
                ptr = (unsigned long)iref;
                end = (unsigned long)ei + item_size;
                if (ptr + size < end)
                        memmove_extent_buffer(leaf, ptr, ptr + size,
                                              end - ptr - size);
                item_size -= size;
                btrfs_truncate_item(trans, path, item_size, 1);
        }
        btrfs_mark_buffer_dirty(trans, leaf);
        return 0;
}

/*
 * Add @refs_to_add references through the inline backref of an extent.
 *
 * The inline ref is looked up first.  If the lookup reports -ENOENT a new
 * inline ref is set up; if it is found its count is updated.  Any other
 * lookup result is handed back to the caller unchanged.
 *
 * Returns 0 on success, -EUCLEAN when an inline ref already exists for an
 * owner below BTRFS_FIRST_FREE_OBJECTID, or the error reported by the lookup
 * or by update_inline_extent_backref().
 */
static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_path *path,
                                 u64 bytenr, u64 num_bytes, u64 parent,
                                 u64 root_objectid, u64 owner,
                                 u64 offset, int refs_to_add,
                                 struct btrfs_delayed_extent_op *extent_op)
{
        struct btrfs_extent_inline_ref *inline_ref;
        int ret;

        ret = lookup_inline_extent_backref(trans, path, &inline_ref, bytenr,
                                           num_bytes, parent, root_objectid,
                                           owner, offset, 1);
        if (ret == -ENOENT) {
                /* Nothing there yet: create the inline ref. */
                setup_inline_extent_backref(trans, path, inline_ref, parent,
                                            root_objectid, owner, offset,
                                            refs_to_add, extent_op);
                return 0;
        }
        if (ret != 0)
                return ret;

        /*
         * The ref already exists.  For a tree block that means we would be
         * adding refs to a block we already own, which must never happen.
         */
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                btrfs_print_leaf(path->nodes[0]);
                btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
                           bytenr, num_bytes, root_objectid, path->slots[0]);
                return -EUCLEAN;
        }

        return update_inline_extent_backref(trans, path, inline_ref,
                                            refs_to_add, extent_op);
}

/*
 * Drop @refs_to_drop references through whichever backref the caller found:
 * the inline ref @iref when one is given, otherwise the data ref handled by
 * remove_extent_data_ref() for data, or the item at @path for everything
 * else.
 *
 * Non-data callers must drop exactly one reference; anything else trips the
 * BUG_ON below.
 */
static int remove_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref *iref,
                                 int refs_to_drop, int is_data)
{
        BUG_ON(!is_data && refs_to_drop != 1);

        if (iref)
                return update_inline_extent_backref(trans, path, iref,
                                                    -refs_to_drop, NULL);
        if (is_data)
                return remove_extent_data_ref(trans, root, path, refs_to_drop);

        return btrfs_del_item(trans, root, path);
}

/*
 * Discard the byte range [@start, @start + @len) of @bdev, leaving out any
 * superblock copy that overlaps it.
 *
 * An unaligned @start is rounded up to the next 512B sector; @len is reduced
 * by the skipped head and rounded down to whole sectors.  A range too short
 * to reach the next sector boundary discards nothing.
 *
 * @discarded_bytes is set to the number of bytes covered by the
 * blkdev_issue_discard() calls that returned 0.
 *
 * Returns the result of the last blkdev_issue_discard() call, or 0 when none
 * was needed.  While carving around superblocks, an error other than
 * -EOPNOTSUPP is returned immediately.
 */
static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
                               u64 *discarded_bytes)
{
        int j, ret = 0;
        u64 bytes_left, end;
        u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);

        /* Adjust the range to be aligned to 512B sectors if necessary. */
        if (start != aligned_start) {
                u64 head = aligned_start - start;

                /*
                 * len is unsigned: subtracting a head that is larger than
                 * len would wrap around and turn a tiny range into a huge
                 * discard.  Such a range holds no whole sector at all.
                 */
                if (len <= head)
                        len = 0;
                else
                        len = round_down(len - head, 1 << SECTOR_SHIFT);
                start = aligned_start;
        }

        *discarded_bytes = 0;

        if (!len)
                return 0;

        end = start + len;
        bytes_left = len;

        /* Skip any superblocks on this device. */
        for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
                u64 sb_start = btrfs_sb_offset(j);
                u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
                /* Only meaningful once we know sb_start > start (see below). */
                u64 size = sb_start - start;

                /* This superblock copy does not touch the remaining range. */
                if (!in_range(sb_start, start, bytes_left) &&
                    !in_range(sb_end, start, bytes_left) &&
                    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
                        continue;

                /*
                 * Superblock spans beginning of range.  Adjust start and
                 * try again.
                 */
                if (sb_start <= start) {
                        start += sb_end - start;
                        if (start > end) {
                                bytes_left = 0;
                                break;
                        }
                        bytes_left = end - start;
                        continue;
                }

                /* Discard the part of the range in front of the superblock. */
                if (size) {
                        ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
                                                   size >> SECTOR_SHIFT,
                                                   GFP_NOFS);
                        if (!ret)
                                *discarded_bytes += size;
                        else if (ret != -EOPNOTSUPP)
                                return ret;
                }

                /* Resume right behind the superblock. */
                start = sb_end;
                if (start > end) {
                        bytes_left = 0;
                        break;
                }
                bytes_left = end - start;
        }

        /* Whatever is left after the last superblock copy. */
        if (bytes_left) {
                ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
                                           bytes_left >> SECTOR_SHIFT,
                                           GFP_NOFS);
                if (!ret)
                        *discarded_bytes += bytes_left;
        }
        return ret;
}

/*
 * Discard one stripe, choosing the mechanism the device supports:
 *  - a zone reset when btrfs_can_zone_reset() allows it; if a device replace
 *    is ongoing and this device is its source, the same range is also reset
 *    on the replace target,
 *  - a regular discard when the block device reports discard support,
 *  - nothing otherwise.
 *
 * @bytes always receives the accumulated discarded byte count (0 when
 * nothing was done).  Returns 0 or the error of the failing operation.
 */
static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
{
        struct btrfs_device *dev = stripe->dev;
        struct btrfs_fs_info *fs_info = dev->fs_info;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        u64 phys = stripe->physical;
        u64 len = stripe->length;
        u64 discarded = 0;
        int ret = 0;

        if (btrfs_can_zone_reset(dev, phys, len)) {
                /* Zone reset on a zoned filesystem */
                ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
                if (!ret && btrfs_dev_replace_is_ongoing(dev_replace) &&
                    dev == dev_replace->srcdev) {
                        u64 src_disc = discarded;

                        /* Send to replace target as well */
                        ret = btrfs_reset_device_zone(dev_replace->tgtdev,
                                                      phys, len, &discarded);
                        discarded += src_disc;
                }
        } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
                ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
        }

        *bytes = discarded;
        return ret;
}

/*
 * Discard the logical range [@bytenr, @bytenr + @num_bytes) on every stripe
 * that btrfs_map_discard() maps it to.
 *
 * @actual_bytes, when not NULL, receives the sum of the byte counts reported
 * by the per-stripe discards that returned success.
 *
 * -EOPNOTSUPP, whether from the mapping or from a single stripe, is not
 * treated as an error.  Any other error stops the walk and is returned.
 */
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
                         u64 num_bytes, u64 *actual_bytes)
{
        const u64 end = bytenr + num_bytes;
        u64 total = 0;
        u64 cur;
        int ret = 0;

        /*
         * Avoid races with device replace and make sure the devices in the
         * stripes don't go away while we are discarding.
         */
        btrfs_bio_counter_inc_blocked(fs_info);
        for (cur = bytenr; cur < end; cur += num_bytes) {
                struct btrfs_discard_stripe *stripes;
                unsigned int num_stripes;
                int i;

                /*
                 * Ask for everything that is left; cur then advances by
                 * whatever btrfs_map_discard() leaves in num_bytes.
                 */
                num_bytes = end - cur;
                stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
                if (IS_ERR(stripes)) {
                        ret = PTR_ERR(stripes);
                        if (ret == -EOPNOTSUPP)
                                ret = 0;
                        break;
                }

                for (i = 0; i < num_stripes; i++) {
                        struct btrfs_discard_stripe *stripe = &stripes[i];
                        u64 bytes;

                        /* No block device behind this stripe. */
                        if (!stripe->dev->bdev) {
                                ASSERT(btrfs_test_opt(fs_info, DEGRADED));
                                continue;
                        }

                        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
                                        &stripe->dev->dev_state))
                                continue;

                        ret = do_discard_extent(stripe, &bytes);
                        if (ret == -EOPNOTSUPP) {
                                /*
                                 * Keep going if discard is not supported by
                                 * the device.
                                 */
                                ret = 0;
                                continue;
                        }
                        if (ret)
                                break;
                        total += bytes;
                }
                kfree(stripes);
                if (ret)
                        break;
        }
        btrfs_bio_counter_dec(fs_info);
        if (actual_bytes)
                *actual_bytes = total;
        return ret;
}

/*
 * Hand @generic_ref to the delayed ref code: metadata refs go to
 * btrfs_add_delayed_tree_ref(), everything else goes to
 * btrfs_add_delayed_data_ref().  The ref is passed to btrfs_ref_tree_mod()
 * afterwards regardless of the outcome.
 *
 * Returns the result of the delayed ref insertion.  Can return -ENOMEM.
 */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_ref *generic_ref)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret;

        ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
               generic_ref->action);
        /* A metadata ref must not belong to the tree log root. */
        BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
               generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID);

        ret = (generic_ref->type == BTRFS_REF_METADATA) ?
                btrfs_add_delayed_tree_ref(trans, generic_ref, NULL) :
                btrfs_add_delayed_data_ref(trans, generic_ref, 0);

        btrfs_ref_tree_mod(fs_info, generic_ref);

        return ret;
}

/*
 * Insert backreference for a given extent.
 *
 * The counterpart is __btrfs_free_extent(), which has examples and more
 * details on how it works.
 *
 * @trans:      Handle of transaction
 *
 * @node:       The delayed ref node; provides the bytenr/length of the extent
 *              whose references are incremented, together with parent, root,
 *              owner, offset and the number of references to add (ref_mod).
 *
 * @extent_op:  Pointer to a structure holding the information necessary when
 *              updating a tree block's flags.  May be NULL.
 *
 * An inline backref is tried first.  Only when that reports -EAGAIN is the
 * reference count in the extent item bumped directly and a separate backref
 * item inserted; a failure of that insertion aborts the transaction.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the error
 * reported by the backref insertion.
 */
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_delayed_ref_node *node,
                                  struct btrfs_delayed_extent_op *extent_op)
{
        struct btrfs_path *path;
        struct extent_buffer *eb;
        struct btrfs_extent_item *ei;
        struct btrfs_key key;
        u64 bytenr = node->bytenr;
        u64 num_bytes = node->num_bytes;
        u64 owner = btrfs_delayed_ref_owner(node);
        u64 offset = btrfs_delayed_ref_offset(node);
        int refs_to_add = node->ref_mod;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* this will setup the path even if it fails to insert the back ref */
        ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
                                           node->parent, node->ref_root, owner,
                                           offset, refs_to_add, extent_op);
        /* Done on success and on every error except -EAGAIN. */
        if (ret != -EAGAIN && ret <= 0)
                goto out;

        /*
         * Ok we had -EAGAIN which means we didn't have space to insert an
         * inline extent ref, so just update the reference count and add a
         * normal backref.
         */
        eb = path->nodes[0];
        btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
        btrfs_set_extent_refs(eb, ei, btrfs_extent_refs(eb, ei) + refs_to_add);
        if (extent_op)
                __run_delayed_extent_op(extent_op, eb, ei);

        btrfs_mark_buffer_dirty(trans, eb);
        btrfs_release_path(path);

        /* now insert the actual backref */
        ret = (owner < BTRFS_FIRST_FREE_OBJECTID) ?
                insert_tree_block_ref(trans, path, node, bytenr) :
                insert_extent_data_ref(trans, path, node, bytenr);

        if (ret)
                btrfs_abort_transaction(trans, ret);
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Free the data qgroup reservation recorded in @href (reserved_bytes) for
 * href->owning_root.  Nothing happens unless qgroups run in simple mode, the
 * head describes data and the owning root passes is_fstree().
 */
static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
                                     struct btrfs_delayed_ref_head *href)
{
        const u64 owning_root = href->owning_root;

        /*
         * Don't check must_insert_reserved, as this is called from contexts
         * where it has already been unset.
         */
        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
            href->is_data && is_fstree(owning_root))
                btrfs_qgroup_free_refroot(fs_info, owning_root,
                                          href->reserved_bytes,
                                          BTRFS_QGROUP_RSV_DATA);
}

/*
 * Run one delayed ref that describes a data extent.
 *
 *  - ADD with @insert_reserved: the extent item does not exist yet, so it is
 *    created via alloc_reserved_file_extent(); the head's simple quota
 *    reservation is freed and, on success, a squota delta is recorded.
 *  - ADD without @insert_reserved: add refs to the existing extent.
 *  - DROP: drop refs via __btrfs_free_extent().
 *
 * Any other action is a BUG.  Returns 0 or the error of the step that failed.
 */
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
                                struct btrfs_delayed_ref_head *href,
                                struct btrfs_delayed_ref_node *node,
                                struct btrfs_delayed_extent_op *extent_op,
                                bool insert_reserved)
{
        u64 parent = 0;
        int ret = 0;

        trace_run_delayed_data_ref(trans->fs_info, node);

        /* Only shared data refs carry a parent. */
        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
                parent = node->parent;

        switch (node->action) {
        case BTRFS_ADD_DELAYED_REF:
                if (insert_reserved) {
                        struct btrfs_squota_delta delta = {
                                .root = href->owning_root,
                                .num_bytes = node->num_bytes,
                                .is_data = true,
                                .is_inc        = true,
                                .generation = trans->transid,
                        };
                        struct btrfs_key ins = {
                                .objectid = node->bytenr,
                                .type = BTRFS_EXTENT_ITEM_KEY,
                                .offset = node->num_bytes,
                        };
                        u64 owner = btrfs_delayed_ref_owner(node);
                        u64 offset = btrfs_delayed_ref_offset(node);
                        u64 flags = extent_op ? extent_op->flags_to_set : 0;

                        ret = alloc_reserved_file_extent(trans, parent,
                                                         node->ref_root, flags,
                                                         owner, offset, &ins,
                                                         node->ref_mod,
                                                         href->owning_root);
                        /* The reservation is released whether or not that worked. */
                        free_head_ref_squota_rsv(trans->fs_info, href);
                        if (!ret)
                                ret = btrfs_record_squota_delta(trans->fs_info,
                                                                &delta);
                } else {
                        ret = __btrfs_inc_extent_ref(trans, node, extent_op);
                }
                break;
        case BTRFS_DROP_DELAYED_REF:
                ret = __btrfs_free_extent(trans, href, node, extent_op);
                break;
        default:
                BUG();
        }
        return ret;
}

/*
 * Apply a pending extent op to the extent item @ei in @leaf: OR flags_to_set
 * into the item's flags when update_flags is set, and store the op's key in
 * the btrfs_tree_block_info that directly follows the item when update_key
 * is set.
 */
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei)
{
        struct btrfs_tree_block_info *info;
        u64 flags = btrfs_extent_flags(leaf, ei);

        if (extent_op->update_flags) {
                flags |= extent_op->flags_to_set;
                btrfs_set_extent_flags(leaf, ei, flags);
        }

        if (!extent_op->update_key)
                return;

        /* A key update requires the (possibly just updated) tree block flag. */
        BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
        info = (struct btrfs_tree_block_info *)(ei + 1);
        btrfs_set_tree_block_key(leaf, info, &extent_op->key);
}

/*
 * Find the extent item for @head in the extent root and apply @extent_op to
 * it.
 *
 * With the SKINNY_METADATA incompat bit set the item is first searched with a
 * METADATA_ITEM key (offset = level).  If that misses, the previous slot is
 * checked for a matching EXTENT_ITEM key and, failing that, the search is
 * repeated with an EXTENT_ITEM key (offset = num_bytes).
 *
 * Returns 0 on success or when the transaction is already aborted, -ENOMEM if
 * no path could be allocated, -EUCLEAN if the extent item is missing or too
 * small, or a negative error from btrfs_search_slot().
 */
static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
                                 struct btrfs_delayed_ref_head *head,
                                 struct btrfs_delayed_extent_op *extent_op)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root;
        struct btrfs_key key;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
        struct extent_buffer *leaf;
        u32 item_size;
        int ret;
        int metadata = 1;

        /* Nothing to do once the transaction has been aborted. */
        if (TRANS_ABORTED(trans))
                return 0;

        if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
                metadata = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = head->bytenr;

        if (metadata) {
                key.type = BTRFS_METADATA_ITEM_KEY;
                key.offset = extent_op->level;
        } else {
                key.type = BTRFS_EXTENT_ITEM_KEY;
                key.offset = head->num_bytes;
        }

        root = btrfs_extent_root(fs_info, key.objectid);
again:
        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        if (ret < 0) {
                goto out;
        } else if (ret > 0) {
                /* No exact match for the key. */
                if (metadata) {
                        /*
                         * The item may be stored under an EXTENT_ITEM key,
                         * in which case it would sit in the previous slot.
                         * Note that this overwrites 'key' with the key found
                         * there.
                         */
                        if (path->slots[0] > 0) {
                                path->slots[0]--;
                                btrfs_item_key_to_cpu(path->nodes[0], &key,
                                                      path->slots[0]);
                                if (key.objectid == head->bytenr &&
                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
                                    key.offset == head->num_bytes)
                                        ret = 0;
                        }
                        /* Still not found: retry once with an EXTENT_ITEM key. */
                        if (ret > 0) {
                                btrfs_release_path(path);
                                metadata = 0;

                                key.objectid = head->bytenr;
                                key.offset = head->num_bytes;
                                key.type = BTRFS_EXTENT_ITEM_KEY;
                                goto again;
                        }
                } else {
                        ret = -EUCLEAN;
                        btrfs_err(fs_info,
                  "missing extent item for extent %llu num_bytes %llu level %d",
                                  head->bytenr, head->num_bytes, extent_op->level);
                        goto out;
                }
        }

        leaf = path->nodes[0];
        item_size = btrfs_item_size(leaf, path->slots[0]);

        /* The item must be at least as large as a bare extent item. */
        if (unlikely(item_size < sizeof(*ei))) {
                ret = -EUCLEAN;
                btrfs_err(fs_info,
                          "unexpected extent item size, has %u expect >= %zu",
                          item_size, sizeof(*ei));
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        __run_delayed_extent_op(extent_op, leaf, ei);

        btrfs_mark_buffer_dirty(trans, leaf);
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Run one delayed ref that describes a tree block.
 *
 * A tree block ref must always modify exactly one reference; any other
 * ref_mod is reported and rejected with -EUCLEAN.
 *
 *  - ADD with @insert_reserved: create the extent item via
 *    alloc_reserved_tree_block() and, on success, record a squota delta.
 *  - ADD without @insert_reserved: add a ref to the existing extent.
 *  - DROP: drop the ref via __btrfs_free_extent().
 *
 * Any other action is a BUG.  Returns 0 or a negative error.
 */
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
                                struct btrfs_delayed_ref_head *href,
                                struct btrfs_delayed_ref_node *node,
                                struct btrfs_delayed_extent_op *extent_op,
                                bool insert_reserved)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        u64 ref_root;
        u64 parent = 0;
        int ret = 0;

        trace_run_delayed_tree_ref(trans->fs_info, node);

        /* Only shared block refs carry a parent. */
        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
                parent = node->parent;
        ref_root = node->ref_root;

        if (unlikely(node->ref_mod != 1)) {
                btrfs_err(trans->fs_info,
        "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
                          node->bytenr, node->ref_mod, node->action, ref_root,
                          parent);
                return -EUCLEAN;
        }

        switch (node->action) {
        case BTRFS_ADD_DELAYED_REF:
                if (insert_reserved) {
                        struct btrfs_squota_delta delta = {
                                .root = href->owning_root,
                                .num_bytes = fs_info->nodesize,
                                .is_data = false,
                                .is_inc = true,
                                .generation = trans->transid,
                        };

                        BUG_ON(!extent_op || !extent_op->update_flags);
                        ret = alloc_reserved_tree_block(trans, node, extent_op);
                        /*
                         * NOTE(review): the result of the squota delta is
                         * dropped here, while run_delayed_data_ref()
                         * propagates it - confirm this is intentional.
                         */
                        if (!ret)
                                btrfs_record_squota_delta(fs_info, &delta);
                } else {
                        ret = __btrfs_inc_extent_ref(trans, node, extent_op);
                }
                break;
        case BTRFS_DROP_DELAYED_REF:
                ret = __btrfs_free_extent(trans, href, node, extent_op);
                break;
        default:
                BUG();
        }
        return ret;
}

/*
 * Process a single delayed ref entry by dispatching on its type: tree block
 * refs go to run_delayed_tree_ref(), data refs to run_delayed_data_ref(),
 * and owner refs need no work.  Any other type is a BUG.
 *
 * If the transaction is already aborted nothing is run; an extent that still
 * had to be inserted (@insert_reserved) is pinned and its simple quota
 * reservation freed, and 0 is returned.
 *
 * When running the ref fails and @insert_reserved is set the extent is
 * pinned.  Returns the result of the handler.
 */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                               struct btrfs_delayed_ref_head *href,
                               struct btrfs_delayed_ref_node *node,
                               struct btrfs_delayed_extent_op *extent_op,
                               bool insert_reserved)
{
        int ret = 0;

        if (TRANS_ABORTED(trans)) {
                if (insert_reserved) {
                        btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
                        free_head_ref_squota_rsv(trans->fs_info, href);
                }
                return 0;
        }

        switch (node->type) {
        case BTRFS_TREE_BLOCK_REF_KEY:
        case BTRFS_SHARED_BLOCK_REF_KEY:
                ret = run_delayed_tree_ref(trans, href, node, extent_op,
                                           insert_reserved);
                break;
        case BTRFS_EXTENT_DATA_REF_KEY:
        case BTRFS_SHARED_DATA_REF_KEY:
                ret = run_delayed_data_ref(trans, href, node, extent_op,
                                           insert_reserved);
                break;
        case BTRFS_EXTENT_OWNER_REF_KEY:
                ret = 0;
                break;
        default:
                BUG();
        }

        if (ret && insert_reserved)
                btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
        if (ret < 0)
                btrfs_err(trans->fs_info,
"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
                          node->bytenr, node->num_bytes, node->type,
                          node->action, node->ref_mod, ret);
        return ret;
}

/*
 * Pick the next delayed ref of @head to run, or NULL if its ref tree is
 * empty.  Entries on the head's ref_add_list are preferred over the first
 * entry of the ref tree.
 */
static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
        struct btrfs_delayed_ref_node *picked;

        if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
                return NULL;

        /*
         * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
         * This is to prevent a ref count from going down to zero, which deletes
         * the extent item from the extent tree, when there still are references
         * to add, which would fail because they would not find the extent item.
         */
        if (list_empty(&head->ref_add_list)) {
                picked = rb_entry(rb_first_cached(&head->ref_tree),
                                  struct btrfs_delayed_ref_node, ref_node);
                /* With an empty add list this ref cannot be linked on one. */
                ASSERT(list_empty(&picked->add_list));
        } else {
                picked = list_first_entry(&head->ref_add_list,
                                struct btrfs_delayed_ref_node, add_list);
        }
        return picked;
}

/*
 * Give @head back without finishing it: under delayed_refs->lock, clear its
 * 'processing' flag and count it as ready again, then release the head via
 * btrfs_delayed_ref_unlock().
 */
static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
                                      struct btrfs_delayed_ref_head *head)
{
        spin_lock(&delayed_refs->lock);
        head->processing = false;
        delayed_refs->num_heads_ready++;
        spin_unlock(&delayed_refs->lock);
        btrfs_delayed_ref_unlock(head);
}

/*
 * Return the extent op attached to @head that still has to be run, or NULL.
 *
 * If the head has must_insert_reserved set, the op is detached from the head
 * and freed, and NULL is returned.
 */
static struct btrfs_delayed_extent_op *cleanup_extent_op(
                                struct btrfs_delayed_ref_head *head)
{
        struct btrfs_delayed_extent_op *op = head->extent_op;

        if (op && head->must_insert_reserved) {
                head->extent_op = NULL;
                btrfs_free_delayed_extent_op(op);
                op = NULL;
        }
        return op;
}

/*
 * Run the extent op still attached to @head, if any.
 *
 * Returns 0 when there is nothing to run; head->lock is left untouched in
 * that case.  Otherwise the op is detached from the head, head->lock is
 * dropped, the op is run and freed, and the return value is the error from
 * run_delayed_extent_op() or 1 if it succeeded.
 *
 * NOTE: a non-zero return therefore means head->lock is no longer held.
 */
static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
                                     struct btrfs_delayed_ref_head *head)
{
        struct btrfs_delayed_extent_op *extent_op;
        int ret;

        extent_op = cleanup_extent_op(head);
        if (!extent_op)
                return 0;
        head->extent_op = NULL;
        spin_unlock(&head->lock);
        ret = run_delayed_extent_op(trans, head, extent_op);
        btrfs_free_delayed_extent_op(extent_op);
        return ret ? ret : 1;
}

/*
 * Undo the accounting that was set up for @head.
 *
 * For a data head with a negative total_ref_mod, the head's bytes are taken
 * out of delayed_refs->pending_csums and the matching csum leaves are
 * released from the delayed refs reservation.  If must_insert_reserved is
 * still set, the head's simple quota reservation is freed as well.
 *
 * Returns the value of btrfs_calc_delayed_ref_csum_bytes() for the released
 * csum leaves, or 0 if none were released.
 */
u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
                                  struct btrfs_delayed_ref_root *delayed_refs,
                                  struct btrfs_delayed_ref_head *head)
{
        u64 released = 0;
        int nr_csums;

        /*
         * We had csum deletions accounted for in our delayed refs rsv, we need
         * to drop the csum leaves for this update from our delayed_refs_rsv.
         */
        if (head->total_ref_mod < 0 && head->is_data) {
                spin_lock(&delayed_refs->lock);
                delayed_refs->pending_csums -= head->num_bytes;
                spin_unlock(&delayed_refs->lock);

                nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
                btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums);
                released = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
        }

        /* must_insert_reserved can be set only if we didn't run the head ref. */
        if (head->must_insert_reserved)
                free_head_ref_squota_rsv(fs_info, head);

        return released;
}

/*
 * Finish a delayed ref head: run its pending extent op, and if nothing new
 * was queued in the meantime delete the head and undo its accounting.
 *
 * Expects head->lock to be held on entry; it is dropped below (or inside
 * run_and_cleanup_extent_op()) without being taken here first.
 *
 * @bytes_released is increased by what btrfs_cleanup_ref_head_accounting()
 * reports.
 *
 * Returns:
 *   < 0  running the extent op failed; the head has been unselected
 *   > 0  an extent op was run, or new refs / a new extent op showed up on
 *        the head, so it has not been deleted
 *   0    the head was deleted (this is also where the result of
 *        btrfs_del_csums() is returned, which may be an error)
 */
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
                            struct btrfs_delayed_ref_head *head,
                            u64 *bytes_released)
{

        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_ref_root *delayed_refs;
        int ret;

        delayed_refs = &trans->transaction->delayed_refs;

        ret = run_and_cleanup_extent_op(trans, head);
        if (ret < 0) {
                unselect_delayed_ref_head(delayed_refs, head);
                btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
                return ret;
        } else if (ret) {
                return ret;
        }

        /*
         * Need to drop our head ref lock and re-acquire the delayed ref lock
         * and then re-check to make sure nobody got added.
         */
        spin_unlock(&head->lock);
        spin_lock(&delayed_refs->lock);
        spin_lock(&head->lock);
        if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
                spin_unlock(&head->lock);
                spin_unlock(&delayed_refs->lock);
                return 1;
        }
        btrfs_delete_ref_head(delayed_refs, head);
        spin_unlock(&head->lock);
        spin_unlock(&delayed_refs->lock);

        /*
         * must_insert_reserved is still set here, so pin the extent and, for
         * data, delete its csums.
         */
        if (head->must_insert_reserved) {
                btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
                if (head->is_data) {
                        struct btrfs_root *csum_root;

                        csum_root = btrfs_csum_root(fs_info, head->bytenr);
                        ret = btrfs_del_csums(trans, csum_root, head->bytenr,
                                              head->num_bytes);
                }
        }

        *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);

        trace_run_delayed_ref_head(fs_info, head, 0);
        btrfs_delayed_ref_unlock(head);
        btrfs_put_delayed_ref_head(head);
        return ret;
}

/*
 * Select the next delayed ref head of the running transaction and lock it
 * for processing.
 *
 * Returns the locked head, NULL if btrfs_select_ref_head() found none, or
 * ERR_PTR(-EAGAIN) if btrfs_delayed_ref_lock() reported -EAGAIN.
 */
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
                                        struct btrfs_trans_handle *trans)
{
        struct btrfs_delayed_ref_root *delayed_refs =
                &trans->transaction->delayed_refs;
        struct btrfs_delayed_ref_head *head;
        int ret = 0;

        spin_lock(&delayed_refs->lock);
        head = btrfs_select_ref_head(delayed_refs);
        /*
         * Grab the lock that says we are going to process all the refs for
         * this head
         */
        if (head)
                ret = btrfs_delayed_ref_lock(delayed_refs, head);
        spin_unlock(&delayed_refs->lock);

        /*
         * We may have dropped the spin lock to get the head mutex lock, and
         * that might have given someone else time to free the head.  If that's
         * true, it has been removed from our list and we can move on.
         */
        if (head && ret == -EAGAIN)
                head = ERR_PTR(-EAGAIN);

        return head;
}

/*
 * Run every delayed ref queued on @locked_ref, one at a time.
 *
 * The caller must hold both locked_ref->mutex and locked_ref->lock.  The spin
 * lock is dropped while a ref is being run and re-taken afterwards.
 *
 * @bytes_released is increased by btrfs_calc_delayed_ref_bytes() for each ref
 * that was run.
 *
 * Returns:
 *   0        no refs are left; locked_ref->lock is still held
 *   -EAGAIN  btrfs_check_delayed_seq() asked us to hold back a ref with a
 *            sequence number; the spin lock has been dropped and the head
 *            unselected
 *   other    non-zero result of run_one_delayed_ref(); the spin lock has been
 *            dropped and the head unselected
 */
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
                                           struct btrfs_delayed_ref_head *locked_ref,
                                           u64 *bytes_released)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_extent_op *extent_op;
        struct btrfs_delayed_ref_node *ref;
        bool must_insert_reserved;
        int ret;

        delayed_refs = &trans->transaction->delayed_refs;

        lockdep_assert_held(&locked_ref->mutex);
        lockdep_assert_held(&locked_ref->lock);

        while ((ref = select_delayed_ref(locked_ref))) {
                if (ref->seq &&
                    btrfs_check_delayed_seq(fs_info, ref->seq)) {
                        spin_unlock(&locked_ref->lock);
                        unselect_delayed_ref_head(delayed_refs, locked_ref);
                        return -EAGAIN;
                }

                /* Unlink the ref from the head before running it. */
                rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
                RB_CLEAR_NODE(&ref->ref_node);
                if (!list_empty(&ref->add_list))
                        list_del(&ref->add_list);
                /*
                 * When we play the delayed ref, also correct the ref_mod on
                 * head
                 */
                switch (ref->action) {
                case BTRFS_ADD_DELAYED_REF:
                case BTRFS_ADD_DELAYED_EXTENT:
                        locked_ref->ref_mod -= ref->ref_mod;
                        break;
                case BTRFS_DROP_DELAYED_REF:
                        locked_ref->ref_mod += ref->ref_mod;
                        break;
                default:
                        WARN_ON(1);
                }
                atomic_dec(&delayed_refs->num_entries);

                /*
                 * Record the must_insert_reserved flag before we drop the
                 * spin lock.
                 */
                must_insert_reserved = locked_ref->must_insert_reserved;
                /*
                 * Unsetting this on the head ref relinquishes ownership of
                 * the rsv_bytes, so it is critical that every possible code
                 * path from here forward frees all reserves including qgroup
                 * reserve.
                 */
                locked_ref->must_insert_reserved = false;

                /* Take the extent op off the head; it is freed below. */
                extent_op = locked_ref->extent_op;
                locked_ref->extent_op = NULL;
                spin_unlock(&locked_ref->lock);

                ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op,
                                          must_insert_reserved);
                /* Released and accounted whether or not the ref ran cleanly. */
                btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
                *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1);

                btrfs_free_delayed_extent_op(extent_op);
                if (ret) {
                        unselect_delayed_ref_head(delayed_refs, locked_ref);
                        btrfs_put_delayed_ref(ref);
                        return ret;
                }

                btrfs_put_delayed_ref(ref);
                cond_resched();

                /* Re-take the lock and merge again before picking the next ref. */
                spin_lock(&locked_ref->lock);
                btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
        }

        return 0;
}

/*
 * Select delayed ref heads one at a time and run all of their refs.
 *
 * @trans:     Transaction handle.
 * @min_bytes: Keep going until at least this many bytes worth of delayed refs
 *             have been processed. 0 means "process as many heads as were
 *             ready when we were called" (num_heads_ready is sampled once up
 *             front and used as the head count limit).
 *
 * Returns 0 on success, or the negative errno returned by
 * btrfs_run_delayed_refs_for_head() or cleanup_ref_head(). Aborting the
 * transaction on error is left to the caller.
 */
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                                             u64 min_bytes)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
        int ret;
        unsigned long count = 0;
        unsigned long max_count = 0;
        u64 bytes_processed = 0;

        delayed_refs = &trans->transaction->delayed_refs;
        if (min_bytes == 0) {
                /* Bound the run by head count instead of by bytes. */
                max_count = delayed_refs->num_heads_ready;
                min_bytes = U64_MAX;
        }

        do {
                if (!locked_ref) {
                        locked_ref = btrfs_obtain_ref_head(trans);
                        if (IS_ERR_OR_NULL(locked_ref)) {
                                /* NULL or any error other than -EAGAIN: stop. */
                                if (PTR_ERR(locked_ref) != -EAGAIN)
                                        break;
                                /*
                                 * 'continue' in a do-while jumps straight to
                                 * the loop condition, which tests locked_ref.
                                 * Drop the error pointer first, otherwise the
                                 * next iteration would skip the selection
                                 * above and dereference ERR_PTR(-EAGAIN).
                                 */
                                locked_ref = NULL;
                                continue;
                        }
                        count++;
                }
                /*
                 * We need to try and merge add/drops of the same ref since we
                 * can run into issues with relocate dropping the implicit ref
                 * and then it being added back again before the drop can
                 * finish.  If we merged anything we need to re-loop so we can
                 * get a good ref.
                 * Or we can get node references of the same type that weren't
                 * merged when created due to bumps in the tree mod seq, and
                 * we need to merge them to prevent adding an inline extent
                 * backref before dropping it (triggering a BUG_ON at
                 * insert_inline_extent_backref()).
                 */
                spin_lock(&locked_ref->lock);
                btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);

                ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed);
                if (ret < 0 && ret != -EAGAIN) {
                        /*
                         * Error, btrfs_run_delayed_refs_for_head already
                         * unlocked everything so just bail out
                         */
                        return ret;
                } else if (!ret) {
                        /*
                         * Success, perform the usual cleanup of a processed
                         * head
                         */
                        ret = cleanup_ref_head(trans, locked_ref, &bytes_processed);
                        if (ret > 0) {
                                /* We dropped our lock, we need to loop. */
                                ret = 0;
                                continue;
                        } else if (ret) {
                                return ret;
                        }
                }

                /*
                 * Either success case or btrfs_run_delayed_refs_for_head
                 * returned -EAGAIN, meaning we need to select another head
                 */

                locked_ref = NULL;
                cond_resched();
        } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) ||
                 (max_count > 0 && count < max_count) ||
                 locked_ref);

        return 0;
}

#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning
 */
static u64 find_middle(struct rb_root *root)
{
        struct rb_node *n = root->rb_node;
        struct btrfs_delayed_ref_node *entry;
        int alt = 1;
        u64 middle;
        u64 first = 0, last = 0;

        n = rb_first(root);
        if (n) {
                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
                first = entry->bytenr;
        }
        n = rb_last(root);
        if (n) {
                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
                last = entry->bytenr;
        }
        n = root->rb_node;

        while (n) {
                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
                WARN_ON(!entry->in_tree);

                middle = entry->bytenr;

                if (alt)
                        n = n->rb_left;
                else
                        n = n->rb_right;

                alt = 1 - alt;
        }
        return middle;
}
#endif

/*
 * Process the delayed reference count updates and extent insertions queued
 * on the transaction so far.
 *
 * @trans:        Transaction handle.
 * @min_bytes:    Amount of delayed reference bytes to process before stopping.
 *                0 runs all delayed references that exist right now, but not
 *                ones added while running them.
 *                (u64)-1 (U64_MAX) runs all existing delayed references and
 *                keeps going until the head tree is found empty.
 *
 * Returns 0 on success, if the transaction is already aborted, or while the
 * free space tree is being created.
 * Returns <0 on error, after aborting the transaction.
 */
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_ref_root *delayed_refs;
        int ret;

        /* We'll clean this up in btrfs_cleanup_transaction */
        if (TRANS_ABORTED(trans))
                return 0;

        if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
                return 0;

        delayed_refs = &trans->transaction->delayed_refs;

        for (;;) {
                bool no_heads_left;

#ifdef SCRAMBLE_DELAYED_REFS
                delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
                ret = __btrfs_run_delayed_refs(trans, min_bytes);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }

                /* A bounded run is done after a single pass. */
                if (min_bytes != U64_MAX)
                        break;

                btrfs_create_pending_block_groups(trans);

                /* Running "everything": repeat until no head is left. */
                spin_lock(&delayed_refs->lock);
                no_heads_left = RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root);
                spin_unlock(&delayed_refs->lock);
                if (no_heads_left)
                        break;

                cond_resched();
        }

        return 0;
}

/*
 * Queue a delayed extent op that sets @flags on the extent backing @eb.
 *
 * Returns 0 on success, -ENOMEM if the extent op cannot be allocated, or the
 * error from btrfs_add_delayed_extent_op(), in which case the extent op is
 * freed here.
 */
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
                                struct extent_buffer *eb, u64 flags)
{
        struct btrfs_delayed_extent_op *op;
        int err;

        op = btrfs_alloc_delayed_extent_op();
        if (!op)
                return -ENOMEM;

        /* Flags-only update: the key is left untouched. */
        op->level = btrfs_header_level(eb);
        op->update_key = false;
        op->update_flags = true;
        op->flags_to_set = flags;

        err = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, op);
        if (err)
                btrfs_free_delayed_extent_op(op);

        return err;
}

/*
 * Look at the delayed refs queued in the running transaction for the extent
 * at @bytenr and decide whether any of them belongs to something other than
 * (@root, @objectid, @offset).
 *
 * Returns 0 if there is no running transaction, no delayed ref head for
 * @bytenr, or every queued ref matches.
 * Returns 1 if a non-matching (or non EXTENT_DATA_REF) ref is queued.
 * Returns -EAGAIN if the head mutex was contended; unless @path->nowait is
 * set, @path is released and we wait for the mutex holder before returning.
 */
static noinline int check_delayed_ref(struct btrfs_root *root,
                                      struct btrfs_path *path,
                                      u64 objectid, u64 offset, u64 bytenr)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_head *head;
        struct btrfs_transaction *cur_trans;
        struct rb_node *node;
        int ret = 0;

        /* Pin the running transaction, if any, so it can't go away. */
        spin_lock(&fs_info->trans_lock);
        cur_trans = fs_info->running_transaction;
        if (cur_trans)
                refcount_inc(&cur_trans->use_count);
        spin_unlock(&fs_info->trans_lock);
        if (!cur_trans)
                return 0;

        delayed_refs = &cur_trans->delayed_refs;
        spin_lock(&delayed_refs->lock);
        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
        if (!head) {
                spin_unlock(&delayed_refs->lock);
                goto out_put_trans;
        }

        if (!mutex_trylock(&head->mutex)) {
                ret = -EAGAIN;
                if (path->nowait) {
                        spin_unlock(&delayed_refs->lock);
                        goto out_put_trans;
                }

                refcount_inc(&head->refs);
                spin_unlock(&delayed_refs->lock);

                btrfs_release_path(path);

                /*
                 * The mutex is held by someone else: sleep until they let go
                 * of it, then have the caller start over.
                 */
                mutex_lock(&head->mutex);
                mutex_unlock(&head->mutex);
                btrfs_put_delayed_ref_head(head);
                goto out_put_trans;
        }
        spin_unlock(&delayed_refs->lock);

        spin_lock(&head->lock);
        /*
         * XXX: A linear walk of the ref tree; a proper search function
         * should replace this in the future.
         */
        node = rb_first_cached(&head->ref_tree);
        while (node) {
                struct btrfs_delayed_ref_node *ref;
                u64 ref_owner;
                u64 ref_offset;

                ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
                /* Anything but a plain data ref means a cross reference. */
                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
                        ret = 1;
                        break;
                }

                ref_owner = btrfs_delayed_ref_owner(ref);
                ref_offset = btrfs_delayed_ref_offset(ref);

                /* A data ref owned by someone else is a cross reference. */
                if (ref->ref_root != btrfs_root_id(root) ||
                    ref_owner != objectid || ref_offset != offset) {
                        ret = 1;
                        break;
                }

                node = rb_next(node);
        }
        spin_unlock(&head->lock);
        mutex_unlock(&head->mutex);

out_put_trans:
        btrfs_put_transaction(cur_trans);
        return ret;
}

/*
 * Inspect the extent item for @bytenr in the extent tree and decide whether
 * the extent is referenced only by (@root, @objectid, @offset).
 *
 * Returns 0 if the single inline data ref matches and accounts for all refs.
 * Returns 1 if the extent is (or, with @strict false, may be) shared.
 * Returns -ENOENT if no extent item for @bytenr is found, -EUCLEAN if a key
 * with offset (u64)-1 exists, or another negative errno from the tree search.
 *
 * @path is left positioned by the search; this function does not release it.
 */
static noinline int check_committed_ref(struct btrfs_root *root,
                                        struct btrfs_path *path,
                                        u64 objectid, u64 offset, u64 bytenr,
                                        bool strict)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
        struct extent_buffer *leaf;
        struct btrfs_extent_data_ref *dref;
        struct btrfs_extent_inline_ref *iref;
        struct btrfs_extent_item *ei;
        struct btrfs_key key;
        u32 item_size;
        u32 expected_size;
        int type;
        int ret;

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                return ret;
        /*
         * An exact match would mean an extent item with offset -1 exists,
         * which is outside the valid range.
         */
        if (ret == 0)
                return -EUCLEAN;

        /* Nothing sorts before our search key in this leaf. */
        if (path->slots[0] == 0)
                return -ENOENT;

        path->slots[0]--;
        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
                return -ENOENT;

        item_size = btrfs_item_size(leaf, path->slots[0]);
        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);

        /* No inline refs; we need to bail before checking for owner ref. */
        if (item_size == sizeof(*ei))
                return 1;

        expected_size = sizeof(*ei) +
                        btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);

        /* Check for an owner ref; skip over it to the real inline refs. */
        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
        if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) &&
            type == BTRFS_EXTENT_OWNER_REF_KEY) {
                expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
                iref++;
        }

        /* Any size other than "exactly one data ref" means it's shared. */
        if (item_size != expected_size)
                return 1;

        /*
         * An extent created before the last snapshot is shared unless that
         * snapshot has since been deleted. This heuristic is only applied
         * when strict is false.
         */
        if (!strict &&
            btrfs_extent_generation(leaf, ei) <=
            btrfs_root_last_snapshot(&root->root_item))
                return 1;

        /* If this extent has SHARED_DATA_REF then it's shared */
        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
        if (type != BTRFS_EXTENT_DATA_REF_KEY)
                return 1;

        /* The one data ref must account for every ref and must be ours. */
        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
        if (btrfs_extent_refs(leaf, ei) ==
            btrfs_extent_data_ref_count(leaf, dref) &&
            btrfs_extent_data_ref_root(leaf, dref) == btrfs_root_id(root) &&
            btrfs_extent_data_ref_objectid(leaf, dref) == objectid &&
            btrfs_extent_data_ref_offset(leaf, dref) == offset)
                return 0;

        return 1;
}

/*
 * Check both the committed extent tree and the delayed refs of the running
 * transaction for references to the extent at @bytenr other than the one
 * from (@root, @objectid, @offset).
 *
 * The committed check decides on its own when it returns anything other than
 * 0 or -ENOENT; otherwise the delayed ref check gives the result. The whole
 * sequence is repeated while the delayed ref check asks for a retry with
 * -EAGAIN. @path is released before returning.
 */
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
                          u64 bytenr, bool strict, struct btrfs_path *path)
{
        int ret;

        for (;;) {
                ret = check_committed_ref(root, path, objectid,
                                          offset, bytenr, strict);
                if (ret != 0 && ret != -ENOENT)
                        break;

                ret = check_delayed_ref(root, path, objectid, offset, bytenr);
                if (ret != -EAGAIN)
                        break;
        }

        btrfs_release_path(path);
        /* A positive result on the data reloc root triggers a warning. */
        if (btrfs_is_data_reloc_root(root))
                WARN_ON(ret > 0);
        return ret;
}

/*
 * Add or drop one reference for everything @buf points to.
 *
 * For a leaf, that is every regular (non-inline, non-hole) file extent item;
 * for a node, every child block pointer. With @full_backref set, the refs
 * use @buf->start as parent, otherwise parent is 0. @inc selects between
 * btrfs_inc_extent_ref() and btrfs_free_extent().
 *
 * Returns 0 on success or the first error from those helpers. Nothing is
 * done for test filesystems, or for leaves of roots that are not shareable.
 */
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
                           int full_backref, int inc)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
        u64 parent;
        u64 ref_root;
        u32 nritems;
        int action;
        int level;
        int ret;
        int i;

        if (btrfs_is_testing(fs_info))
                return 0;

        ref_root = btrfs_header_owner(buf);
        nritems = btrfs_header_nritems(buf);
        level = btrfs_header_level(buf);

        if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
                return 0;

        parent = full_backref ? buf->start : 0;
        action = inc ? BTRFS_ADD_DELAYED_REF : BTRFS_DROP_DELAYED_REF;

        for (i = 0; i < nritems; i++) {
                struct btrfs_ref ref = {
                        .action = action,
                        .parent = parent,
                        .ref_root = ref_root,
                };

                if (level == 0) {
                        struct btrfs_file_extent_item *fi;
                        struct btrfs_key key;

                        btrfs_item_key_to_cpu(buf, &key, i);
                        if (key.type != BTRFS_EXTENT_DATA_KEY)
                                continue;

                        fi = btrfs_item_ptr(buf, i,
                                            struct btrfs_file_extent_item);
                        if (btrfs_file_extent_type(buf, fi) ==
                            BTRFS_FILE_EXTENT_INLINE)
                                continue;

                        /* A disk bytenr of 0 has no extent to reference. */
                        ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
                        if (ref.bytenr == 0)
                                continue;

                        ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
                        ref.owning_root = ref_root;

                        /* The data ref records key offset minus extent offset. */
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        btrfs_init_data_ref(&ref, key.objectid, key.offset,
                                            btrfs_root_id(root), for_reloc);
                } else {
                        /* We don't know the owning_root, leave as 0. */
                        ref.bytenr = btrfs_node_blockptr(buf, i);
                        ref.num_bytes = fs_info->nodesize;

                        btrfs_init_tree_ref(&ref, level - 1,
                                            btrfs_root_id(root), for_reloc);
                }

                if (inc)
                        ret = btrfs_inc_extent_ref(trans, &ref);
                else
                        ret = btrfs_free_extent(trans, &ref);
                if (ret)
                        return ret;
        }

        return 0;
}

/* Run __btrfs_mod_ref() on @buf in "add a reference" mode (inc = 1). */
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct extent_buffer *buf, int full_backref)
{
        const int inc = 1;

        return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}

/* Run __btrfs_mod_ref() on @buf in "drop a reference" mode (inc = 0). */
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct extent_buffer *buf, int full_backref)
{
        const int inc = 0;

        return __btrfs_mod_ref(trans, root, buf, full_backref, inc);
}

/*
 * Return the allocation profile for allocations made on behalf of @root:
 * the DATA profile when @data is set, the SYSTEM profile for the chunk root,
 * and the METADATA profile for everything else.
 */
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 type = BTRFS_BLOCK_GROUP_METADATA;

        if (data)
                type = BTRFS_BLOCK_GROUP_DATA;
        else if (root == fs_info->chunk_root)
                type = BTRFS_BLOCK_GROUP_SYSTEM;

        return btrfs_get_alloc_profile(fs_info, type);
}

/*
 * Return the start of the first (leftmost) block group in the block group
 * cache tree, or 0 when the tree is empty.
 */
static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
{
        struct rb_node *first;
        u64 lowest = 0;

        read_lock(&fs_info->block_group_cache_lock);
        /* The leftmost node has the lowest logical start address. */
        first = rb_first_cached(&fs_info->block_group_cache_tree);
        if (first)
                lowest = rb_entry(first, struct btrfs_block_group,
                                  cache_node)->start;
        read_unlock(&fs_info->block_group_cache_lock);

        return lowest;
}

/*
 * Account [@bytenr, @bytenr + @num_bytes) as pinned in @cache and its space
 * info, and mark the range EXTENT_DIRTY in the transaction's pinned_extents
 * tree. When @reserved is set the bytes are also taken out of the reserved
 * counters of both the block group and the space info.
 *
 * Always returns 0.
 */
static int pin_down_extent(struct btrfs_trans_handle *trans,
                           struct btrfs_block_group *cache,
                           u64 bytenr, u64 num_bytes, int reserved)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_space_info *sinfo = cache->space_info;
        const u64 last_byte = bytenr + num_bytes - 1;

        /* Space info lock is taken outside of the block group lock. */
        spin_lock(&sinfo->lock);
        spin_lock(&cache->lock);

        cache->pinned += num_bytes;
        btrfs_space_info_update_bytes_pinned(fs_info, sinfo, num_bytes);
        if (reserved) {
                cache->reserved -= num_bytes;
                sinfo->bytes_reserved -= num_bytes;
        }

        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);

        set_extent_bit(&trans->transaction->pinned_extents, bytenr,
                       last_byte, EXTENT_DIRTY, NULL);

        return 0;
}

/*
 * Pin the extent [@bytenr, @bytenr + @num_bytes) in the block group that
 * contains @bytenr. A missing block group is treated as a logic error and
 * hits BUG_ON(). Always returns 0.
 */
int btrfs_pin_extent(struct btrfs_trans_handle *trans,
                     u64 bytenr, u64 num_bytes, int reserved)
{
        struct btrfs_block_group *bg;

        bg = btrfs_lookup_block_group(trans->fs_info, bytenr);
        BUG_ON(!bg); /* Logic error */

        pin_down_extent(trans, bg, bytenr, num_bytes, reserved);
        btrfs_put_block_group(bg);

        return 0;
}

/*
 * Pin the range covered by @eb and remove it from its block group's free
 * space.
 *
 * Returns -EINVAL if no block group contains @eb->start, otherwise the
 * result of caching the block group or of removing the free space.
 */
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
                                    const struct extent_buffer *eb)
{
        struct btrfs_block_group *bg;
        int err;

        bg = btrfs_lookup_block_group(trans->fs_info, eb->start);
        if (!bg)
                return -EINVAL;

        /*
         * The free space has to be fully cached before pinning, so that the
         * pin actually takes this range out of the cache.
         */
        err = btrfs_cache_block_group(bg, true);
        if (!err) {
                pin_down_extent(trans, bg, eb->start, eb->len, 0);

                /* Drop the range from the free space cache, if present. */
                err = btrfs_remove_free_space(bg, eb->start, eb->len);
        }

        btrfs_put_block_group(bg);
        return err;
}

/*
 * Remove [@start, @start + @num_bytes) from the free space of the block
 * group containing @start, after caching that block group.
 *
 * Returns -EINVAL if there is no such block group, otherwise the result of
 * caching the block group or of removing the free space.
 */
static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
                                   u64 start, u64 num_bytes)
{
        struct btrfs_block_group *bg;
        int err;

        bg = btrfs_lookup_block_group(fs_info, start);
        if (!bg)
                return -EINVAL;

        err = btrfs_cache_block_group(bg, true);
        if (!err)
                err = btrfs_remove_free_space(bg, start, num_bytes);

        btrfs_put_block_group(bg);
        return err;
}

int btrfs_exclude_logged_extents(struct extent_buffer *eb)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct btrfs_file_extent_item *item;
        struct btrfs_key key;
        int found_type;
        int i;
        int ret = 0;

        if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
                return 0;

        for (i = 0; i < btrfs_header_nritems(eb); i++) {
                btrfs_item_key_to_cpu(eb, &key, i);
                if (key.type != BTRFS_EXTENT_DATA_KEY)
                        continue;
                item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
                found_type = btrfs_file_extent_type(eb, item);
                if (found_type == BTRFS_FILE_EXTENT_INLINE)
                        continue;
                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
                        continue;
                key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
                key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
                ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
                if (ret)
                        break;
        }

        return ret;
}

/* Atomically bump the reservations counter of block group @bg. */
static void btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
{
        atomic_inc(&bg->reservations);
}

/*
 * Pick the free cluster to use for @space_info and report, through
 * @empty_cluster, the empty cluster size implied by the mount options.
 *
 * - mixed space info:            no cluster, *empty_cluster = 0
 * - metadata:                    meta cluster, 2M with SSD, 64K otherwise
 * - data with SSD_SPREAD:        data cluster, 2M
 * - anything else:               no cluster, *empty_cluster = 0
 */
static struct btrfs_free_cluster *
fetch_cluster_info(struct btrfs_fs_info *fs_info,
                   struct btrfs_space_info *space_info, u64 *empty_cluster)
{
        *empty_cluster = 0;

        if (btrfs_mixed_space_info(space_info))
                return NULL;

        if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
                *empty_cluster = btrfs_test_opt(fs_info, SSD) ? SZ_2M : SZ_64K;
                return &fs_info->meta_alloc_cluster;
        }

        if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
            btrfs_test_opt(fs_info, SSD_SPREAD)) {
                *empty_cluster = SZ_2M;
                return &fs_info->data_alloc_cluster;
        }

        return NULL;
}

/*
 * Undo the pinned accounting for the inclusive byte range [@start, @end].
 *
 * The range is processed one block group at a time. For each piece the
 * pinned counters of the block group and of its space info are reduced, and
 * when @return_free_space is set the piece is also added back to the block
 * group's free space. Bytes in read-only block groups are accounted as
 * bytes_readonly, and on zoned filesystems as bytes_zone_unusable; otherwise
 * (with @return_free_space) they first refill the global block reserve when
 * it uses the same space info, and any remainder is offered to waiting
 * tickets.
 *
 * Returns 0 on success, or -EUCLEAN if no block group is found for a
 * position inside the range.
 */
static int unpin_extent_range(struct btrfs_fs_info *fs_info,
                              u64 start, u64 end,
                              const bool return_free_space)
{
        struct btrfs_block_group *cache = NULL;
        struct btrfs_space_info *space_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        struct btrfs_free_cluster *cluster = NULL;
        u64 len;
        u64 total_unpinned = 0;
        u64 empty_cluster = 0;
        bool readonly;
        int ret = 0;

        while (start <= end) {
                readonly = false;
                /* Move on to the block group that contains @start. */
                if (!cache ||
                    start >= cache->start + cache->length) {
                        if (cache)
                                btrfs_put_block_group(cache);
                        /* total_unpinned is tracked per block group. */
                        total_unpinned = 0;
                        cache = btrfs_lookup_block_group(fs_info, start);
                        if (cache == NULL) {
                                /* Logic error, something removed the block group. */
                                ret = -EUCLEAN;
                                goto out;
                        }

                        cluster = fetch_cluster_info(fs_info,
                                                     cache->space_info,
                                                     &empty_cluster);
                        /* Threshold used below is twice the empty cluster size. */
                        empty_cluster <<= 1;
                }

                /* Clamp this step to the end of the block group and of the range. */
                len = cache->start + cache->length - start;
                len = min(len, end + 1 - start);

                if (return_free_space)
                        btrfs_add_free_space(cache, start, len);

                start += len;
                total_unpinned += len;
                space_info = cache->space_info;

                /*
                 * If this space cluster has been marked as fragmented and we've
                 * unpinned enough in this block group to potentially allow a
                 * cluster to be created inside of it go ahead and clear the
                 * fragmented check.
                 */
                if (cluster && cluster->fragmented &&
                    total_unpinned > empty_cluster) {
                        spin_lock(&cluster->lock);
                        cluster->fragmented = 0;
                        spin_unlock(&cluster->lock);
                }

                /* space_info->lock is held until the end of this iteration. */
                spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
                btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
                space_info->max_extent_size = 0;
                if (cache->ro) {
                        space_info->bytes_readonly += len;
                        readonly = true;
                } else if (btrfs_is_zoned(fs_info)) {
                        /* Need reset before reusing in a zoned block group */
                        space_info->bytes_zone_unusable += len;
                        readonly = true;
                }
                spin_unlock(&cache->lock);
                /* Top up the global reserve first if it draws from this space info. */
                if (!readonly && return_free_space &&
                    global_rsv->space_info == space_info) {
                        spin_lock(&global_rsv->lock);
                        if (!global_rsv->full) {
                                u64 to_add = min(len, global_rsv->size -
                                                      global_rsv->reserved);

                                global_rsv->reserved += to_add;
                                btrfs_space_info_update_bytes_may_use(fs_info,
                                                space_info, to_add);
                                if (global_rsv->reserved >= global_rsv->size)
                                        global_rsv->full = 1;
                                /* Only what the reserve did not take remains. */
                                len -= to_add;
                        }
                        spin_unlock(&global_rsv->lock);
                }
                /* Add to any tickets we may have */
                if (!readonly && return_free_space && len)
                        btrfs_try_granting_tickets(fs_info, space_info);
                spin_unlock(&space_info->lock);
        }

        if (cache)
                btrfs_put_block_group(cache);
out:
        return ret;
}

int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group *block_group, *tmp;
        struct list_head *deleted_bgs;
        struct extent_io_tree *unpin;
        u64 start;
        u64 end;
        int ret;

        unpin = &trans->transaction->pinned_extents;

        while (!TRANS_ABORTED(trans)) {
                struct extent_state *cached_state = NULL;

                mutex_lock(&fs_info->unused_bg_unpin_mutex);
                if (!find_first_extent_bit(unpin, 0, &start, &end,
                                           EXTENT_DIRTY, &cached_state)) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        break;
                }

                if (btrfs_test_opt(fs_info, DISCARD_SYNC))
                        ret = btrfs_discard_extent(fs_info, start,
                                                   end + 1 - start, NULL);

                clear_extent_dirty(unpin, start, end, &cached_state);
                ret = unpin_extent_range(fs_info, start, end, true);
                BUG_ON(ret);
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                free_extent_state(cached_state);
                cond_resched();
        }

        if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
                btrfs_discard_calc_delay(&fs_info->discard_ctl);
                btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
        }

        /*
         * Transaction is finished.  We don't need the lock anymore.  We
         * do need to clean up the block groups in case of a transaction
         * abort.
         */
        deleted_bgs = &trans->transaction->deleted_bgs;
        list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
                u64 trimmed = 0;

                ret = -EROFS;
                if (!TRANS_ABORTED(trans))
                        ret = btrfs_discard_extent(fs_info,
                                                   block_group->start,
                                                   block_group->length,
                                                   &trimmed);

                list_del_init(&block_group->bg_list);
                btrfs_unfreeze_block_group(block_group);
                btrfs_put_block_group(block_group);

                if (ret) {
                        const char *errstr = btrfs_decode_error(ret);
                        btrfs_warn(fs_info,
                           "discard failed while removing blockgroup: errno=%d %s",
                                   ret, errstr);
                }
        }

        return 0;
}

/*
 * Parse an extent item's inline extents looking for a simple quotas owner ref.
 *
 * @fs_info:        the btrfs_fs_info for this mount
 * @leaf:        a leaf in the extent tree containing the extent item
 * @slot:        the slot in the leaf where the extent item is found
 *
 * Returns the objectid of the root that originally allocated the extent item
 * if the inline owner ref is expected and present, otherwise 0.
 *
 * If an extent item has an owner ref item, it will be the first inline ref
 * item. Therefore the logic is to check whether there are any inline ref
 * items, then check the type of the first one.
 */
u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
                                struct extent_buffer *leaf, int slot)
{
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        struct btrfs_extent_owner_ref *oref;
        unsigned long ptr;
        unsigned long end;
        int type;

        if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA))
                return 0;

        ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
        ptr = (unsigned long)(ei + 1);
        end = (unsigned long)ei + btrfs_item_size(leaf, slot);

        /* No inline ref items of any kind, can't check type. */
        if (ptr == end)
                return 0;

        iref = (struct btrfs_extent_inline_ref *)ptr;
        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);

        /* We found an owner ref, get the root out of it. */
        if (type == BTRFS_EXTENT_OWNER_REF_KEY) {
                oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
                return btrfs_extent_owner_ref_root_id(leaf, oref);
        }

        /* We have inline refs, but not an owner ref. */
        return 0;
}

/*
 * Bookkeeping that follows the removal of an extent item; called by
 * __btrfs_free_extent() once the item has been deleted.
 *
 * @trans:        transaction handle
 * @bytenr:        start of the freed extent
 * @delta:        squota delta describing the extent (length, data/metadata, ...)
 *
 * For data extents the checksums and the raid extent are deleted first. Then,
 * for every extent, the squota delta is recorded, the range is added to the
 * free space tree and the block group is updated.
 *
 * Any failing step aborts the transaction at the point of failure and its
 * error is returned. Returns 0 when every step succeeded.
 */
static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
                                     u64 bytenr, struct btrfs_squota_delta *delta)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        const u64 len = delta->num_bytes;
        int ret;

        if (delta->is_data) {
                ret = btrfs_del_csums(trans, btrfs_csum_root(fs_info, bytenr),
                                      bytenr, len);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }

                ret = btrfs_delete_raid_extent(trans, bytenr, len);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }
        }

        ret = btrfs_record_squota_delta(fs_info, delta);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        ret = add_to_free_space_tree(trans, bytenr, len);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        ret = btrfs_update_block_group(trans, bytenr, len, false);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        return 0;
}

/*
 * Abort the transaction with -EUCLEAN, print the leaf at path->nodes[0] and
 * emit a critical message built from @fmt and the trailing arguments.
 *
 * @trans and @path are parenthesized on every use so that callers may pass
 * arbitrary pointer expressions (e.g. "p + 1"), not just plain identifiers.
 * Note that @trans is evaluated twice, so it must not have side effects.
 */
#define abort_and_dump(trans, path, fmt, args...)        \
({                                                        \
        btrfs_abort_transaction((trans), -EUCLEAN);        \
        btrfs_print_leaf((path)->nodes[0]);                \
        btrfs_crit((trans)->fs_info, fmt, ##args);        \
})

/*
 * Drop one or more refs of @node.
 *
 * @trans:        transaction handle
 * @href:        delayed ref head of the extent; only href->owning_root is read
 *                here, as the initial root of the squota delta
 * @node:        delayed ref being run; supplies bytenr, num_bytes, ref_mod
 *                (the number of refs to drop), parent, ref_root and the
 *                owner/offset pair
 * @extent_op:        optional delayed extent op, applied to the extent item only
 *                when references remain after the drop
 *
 * Returns 0 on success or a negative errno on failure. -ENOMEM from the path
 * allocation is returned as is; the other failure paths visible here abort
 * the transaction before returning.
 *
 * 1. Locate the extent refs.
 *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
 *    Locate it, then reduce the refs number or remove the ref line completely.
 *
 * 2. Update the refs count in EXTENT/METADATA_ITEM
 *
 * Inline backref case:
 *
 * in extent tree we have:
 *
 *         item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
 *                refs 2 gen 6 flags DATA
 *                extent data backref root FS_TREE objectid 258 offset 0 count 1
 *                extent data backref root FS_TREE objectid 257 offset 0 count 1
 *
 * This function gets called with:
 *
 *    node->bytenr = 13631488
 *    node->num_bytes = 1048576
 *    root_objectid = FS_TREE
 *    owner_objectid = 257
 *    owner_offset = 0
 *    refs_to_drop = 1
 *
 * Then we should get something like:
 *
 *         item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
 *                refs 1 gen 6 flags DATA
 *                extent data backref root FS_TREE objectid 258 offset 0 count 1
 *
 * Keyed backref case:
 *
 * in extent tree we have:
 *
 *        item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
 *                refs 754 gen 6 flags DATA
 *        [...]
 *        item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
 *                extent data backref root FS_TREE objectid 866 offset 0 count 1
 *
 * This function gets called with:
 *
 *    node->bytenr = 13631488
 *    node->num_bytes = 1048576
 *    root_objectid = FS_TREE
 *    owner_objectid = 866
 *    owner_offset = 0
 *    refs_to_drop = 1
 *
 * Then we should get something like:
 *
 *        item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
 *                refs 753 gen 6 flags DATA
 *
 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
 */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                               struct btrfs_delayed_ref_head *href,
                               struct btrfs_delayed_ref_node *node,
                               struct btrfs_delayed_extent_op *extent_op)
{
        struct btrfs_fs_info *info = trans->fs_info;
        struct btrfs_key key;
        struct btrfs_path *path;
        struct btrfs_root *extent_root;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        int ret;
        int is_data;
        int extent_slot = 0;
        int found_extent = 0;
        int num_to_del = 1;
        int refs_to_drop = node->ref_mod;
        u32 item_size;
        u64 refs;
        u64 bytenr = node->bytenr;
        u64 num_bytes = node->num_bytes;
        u64 owner_objectid = btrfs_delayed_ref_owner(node);
        u64 owner_offset = btrfs_delayed_ref_offset(node);
        bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
        u64 delayed_ref_root = href->owning_root;

        extent_root = btrfs_extent_root(info, bytenr);
        ASSERT(extent_root);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Owners below BTRFS_FIRST_FREE_OBJECTID are handled as tree blocks. */
        is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;

        /* A tree block ref can only ever be dropped one at a time. */
        if (!is_data && refs_to_drop != 1) {
                btrfs_crit(info,
"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
                           node->bytenr, refs_to_drop);
                ret = -EINVAL;
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        /* METADATA_ITEM keys are only considered for tree blocks. */
        if (is_data)
                skinny_metadata = false;

        ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
                                    node->parent, node->ref_root, owner_objectid,
                                    owner_offset);
        if (ret == 0) {
                /*
                 * Either the inline backref or the SHARED_DATA_REF/
                 * SHARED_BLOCK_REF is found
                 *
                 * Here is a quick path to locate EXTENT/METADATA_ITEM.
                 * It's possible the EXTENT/METADATA_ITEM is near current slot.
                 */
                extent_slot = path->slots[0];
                while (extent_slot >= 0) {
                        btrfs_item_key_to_cpu(path->nodes[0], &key,
                                              extent_slot);
                        if (key.objectid != bytenr)
                                break;
                        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
                            key.offset == num_bytes) {
                                found_extent = 1;
                                break;
                        }
                        if (key.type == BTRFS_METADATA_ITEM_KEY &&
                            key.offset == owner_objectid) {
                                found_extent = 1;
                                break;
                        }

                        /* Quick path didn't find the EXTENT/METADATA_ITEM */
                        if (path->slots[0] - extent_slot > 5)
                                break;
                        extent_slot--;
                }

                if (!found_extent) {
                        /*
                         * An inline ref lives inside the extent item, so not
                         * finding the item while holding an inline ref means
                         * the leaf is inconsistent.
                         */
                        if (iref) {
                                abort_and_dump(trans, path,
"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
                                           path->slots[0]);
                                ret = -EUCLEAN;
                                goto out;
                        }
                        /* Must be SHARED_* item, remove the backref first */
                        ret = remove_extent_backref(trans, extent_root, path,
                                                    NULL, refs_to_drop, is_data);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                goto out;
                        }
                        btrfs_release_path(path);

                        /* Slow path to locate EXTENT/METADATA_ITEM */
                        key.objectid = bytenr;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
                        key.offset = num_bytes;

                        if (!is_data && skinny_metadata) {
                                key.type = BTRFS_METADATA_ITEM_KEY;
                                key.offset = owner_objectid;
                        }

                        ret = btrfs_search_slot(trans, extent_root,
                                                &key, path, -1, 1);
                        if (ret > 0 && skinny_metadata && path->slots[0]) {
                                /*
                                 * Couldn't find our skinny metadata item,
                                 * see if we have ye olde extent item.
                                 */
                                path->slots[0]--;
                                btrfs_item_key_to_cpu(path->nodes[0], &key,
                                                      path->slots[0]);
                                if (key.objectid == bytenr &&
                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
                                    key.offset == num_bytes)
                                        ret = 0;
                        }

                        /*
                         * Still nothing: retry the search with the regular
                         * EXTENT_ITEM key.
                         */
                        if (ret > 0 && skinny_metadata) {
                                skinny_metadata = false;
                                key.objectid = bytenr;
                                key.type = BTRFS_EXTENT_ITEM_KEY;
                                key.offset = num_bytes;
                                btrfs_release_path(path);
                                ret = btrfs_search_slot(trans, extent_root,
                                                        &key, path, -1, 1);
                        }

                        /*
                         * NOTE(review): a positive ret (item not found) is
                         * only logged here; just ret < 0 bails out, so
                         * execution continues with whatever slot the search
                         * left in the path. Confirm this is intended.
                         */
                        if (ret) {
                                if (ret > 0)
                                        btrfs_print_leaf(path->nodes[0]);
                                btrfs_err(info,
                        "umm, got %d back from search, was looking for %llu, slot %d",
                                          ret, bytenr, path->slots[0]);
                        }
                        if (ret < 0) {
                                btrfs_abort_transaction(trans, ret);
                                goto out;
                        }
                        extent_slot = path->slots[0];
                }
        } else if (WARN_ON(ret == -ENOENT)) {
                /* The transaction is aborted with -EUCLEAN; ret stays -ENOENT. */
                abort_and_dump(trans, path,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
                               bytenr, node->parent, node->ref_root, owner_objectid,
                               owner_offset, path->slots[0]);
                goto out;
        } else {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        leaf = path->nodes[0];
        item_size = btrfs_item_size(leaf, extent_slot);
        if (unlikely(item_size < sizeof(*ei))) {
                ret = -EUCLEAN;
                btrfs_err(trans->fs_info,
                          "unexpected extent item size, has %u expect >= %zu",
                          item_size, sizeof(*ei));
                btrfs_abort_transaction(trans, ret);
                goto out;
        }
        ei = btrfs_item_ptr(leaf, extent_slot,
                            struct btrfs_extent_item);
        /*
         * A tree block described by a regular EXTENT_ITEM carries a
         * btrfs_tree_block_info right after the extent item; check that it
         * fits and that its level matches the owner.
         */
        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
            key.type == BTRFS_EXTENT_ITEM_KEY) {
                struct btrfs_tree_block_info *bi;

                if (item_size < sizeof(*ei) + sizeof(*bi)) {
                        abort_and_dump(trans, path,
"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
                                       key.objectid, key.type, key.offset,
                                       path->slots[0], owner_objectid, item_size,
                                       sizeof(*ei) + sizeof(*bi));
                        ret = -EUCLEAN;
                        goto out;
                }
                bi = (struct btrfs_tree_block_info *)(ei + 1);
                WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
        }

        refs = btrfs_extent_refs(leaf, ei);
        if (refs < refs_to_drop) {
                abort_and_dump(trans, path,
                "trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
                               refs_to_drop, refs, bytenr, path->slots[0]);
                ret = -EUCLEAN;
                goto out;
        }
        refs -= refs_to_drop;

        if (refs > 0) {
                /* References remain: the extent item stays, only refs change. */
                if (extent_op)
                        __run_delayed_extent_op(extent_op, leaf, ei);
                /*
                 * In the case of inline back ref, reference count will
                 * be updated by remove_extent_backref
                 */
                if (iref) {
                        if (!found_extent) {
                                abort_and_dump(trans, path,
"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
                                               path->slots[0]);
                                ret = -EUCLEAN;
                                goto out;
                        }
                } else {
                        btrfs_set_extent_refs(leaf, ei, refs);
                        btrfs_mark_buffer_dirty(trans, leaf);
                }
                /*
                 * When the extent item was not found on the quick path, the
                 * SHARED_* backref has already been removed above.
                 */
                if (found_extent) {
                        ret = remove_extent_backref(trans, extent_root, path,
                                                    iref, refs_to_drop, is_data);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                goto out;
                        }
                }
        } else {
                struct btrfs_squota_delta delta = {
                        .root = delayed_ref_root,
                        .num_bytes = num_bytes,
                        .is_data = is_data,
                        .is_inc = false,
                        .generation = btrfs_extent_generation(leaf, ei),
                };

                /*
                 * In this branch the remaining ref count is 0: every
                 * reference the extent item had is being dropped, so the
                 * item itself gets deleted.
                 */
                if (found_extent) {
                        if (is_data && refs_to_drop !=
                            extent_data_ref_count(path, iref)) {
                                abort_and_dump(trans, path,
                "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
                                               extent_data_ref_count(path, iref),
                                               refs_to_drop, path->slots[0]);
                                ret = -EUCLEAN;
                                goto out;
                        }
                        if (iref) {
                                if (path->slots[0] != extent_slot) {
                                        abort_and_dump(trans, path,
"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
                                                       key.objectid, key.type,
                                                       key.offset, path->slots[0]);
                                        ret = -EUCLEAN;
                                        goto out;
                                }
                        } else {
                                /*
                                 * No inline ref, we must be at SHARED_* item,
                                 * And it's single ref, it must be:
                                 * |        extent_slot          ||extent_slot + 1|
                                 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
                                 */
                                if (path->slots[0] != extent_slot + 1) {
                                        abort_and_dump(trans, path,
        "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
                                                       path->slots[0]);
                                        ret = -EUCLEAN;
                                        goto out;
                                }
                                /* Delete the extent item and the SHARED_* item together. */
                                path->slots[0] = extent_slot;
                                num_to_del = 2;
                        }
                }
                /*
                 * We can't infer the data owner from the delayed ref, so we need
                 * to try to get it from the owning ref item.
                 *
                 * If it is not present, then that extent was not written under
                 * simple quotas mode, so we don't need to account for its deletion.
                 */
                if (is_data)
                        delta.root = btrfs_get_extent_owner_root(trans->fs_info,
                                                                 leaf, extent_slot);

                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
                btrfs_release_path(path);

                /* Aborts the transaction itself on failure. */
                ret = do_free_extent_accounting(trans, bytenr, &delta);
        }
        /* The path may already have been released in the branch above. */
        btrfs_release_path(path);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * When a block is freed it is possible (and likely) that its last delayed ref
 * goes away too.  Look up the delayed ref head for @bytenr and, if it has no
 * refs left to process, remove it from the delayed ref tree.
 *
 * The head is only removed when its ref tree is empty, cleanup_extent_op()
 * returns NULL for it and its mutex can be taken without blocking.
 *
 * Returns 1 if the head was removed and had must_insert_reserved set,
 * otherwise 0.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
                                      u64 bytenr)
{
        struct btrfs_delayed_ref_root *refs_root =
                                        &trans->transaction->delayed_refs;
        struct btrfs_delayed_ref_head *ref_head;
        int reserved;

        spin_lock(&refs_root->lock);
        ref_head = btrfs_find_delayed_ref_head(refs_root, bytenr);
        if (!ref_head) {
                spin_unlock(&refs_root->lock);
                return 0;
        }

        spin_lock(&ref_head->lock);
        /*
         * The checks run strictly left to right.  The mutex is only tried
         * last: waiting for it here would deadlock, and if someone else holds
         * it they are already in the process of dropping the head anyway.
         */
        if (!RB_EMPTY_ROOT(&ref_head->ref_tree.rb_root) ||
            cleanup_extent_op(ref_head) != NULL ||
            !mutex_trylock(&ref_head->mutex)) {
                spin_unlock(&ref_head->lock);
                spin_unlock(&refs_root->lock);
                return 0;
        }

        btrfs_delete_ref_head(refs_root, ref_head);
        ref_head->processing = false;

        spin_unlock(&ref_head->lock);
        spin_unlock(&refs_root->lock);

        BUG_ON(ref_head->extent_op);
        reserved = ref_head->must_insert_reserved ? 1 : 0;

        btrfs_cleanup_ref_head_accounting(trans->fs_info, refs_root, ref_head);
        mutex_unlock(&ref_head->mutex);
        btrfs_put_delayed_ref_head(ref_head);

        return reserved;
}

/*
 * Drop a reference on the tree block @buf and, if that was the last one, try
 * to give its space back right away.
 *
 * @trans:        transaction handle
 * @root_id:        id of the root dropping the reference
 * @buf:        the tree block
 * @parent:        parent bytenr recorded in the delayed ref
 * @last_ref:        non-zero when the caller drops the last reference to @buf
 *
 * Unless @root_id is the tree log, a delayed tree ref drop is queued.  The
 * rest only happens for a last reference on a block created in the running
 * transaction: the block is either pinned or returned to the free space of
 * its block group.
 */
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           u64 root_id,
                           struct extent_buffer *buf,
                           u64 parent, int last_ref)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        const bool is_log_tree = (root_id == BTRFS_TREE_LOG_OBJECTID);
        struct btrfs_block_group *bg;
        int ret;

        if (!is_log_tree) {
                struct btrfs_ref generic_ref = {
                        .action = BTRFS_DROP_DELAYED_REF,
                        .bytenr = buf->start,
                        .num_bytes = buf->len,
                        .parent = parent,
                        .owning_root = btrfs_header_owner(buf),
                        .ref_root = root_id,
                };

                /*
                 * The extent buffer must not have been cleared because of
                 * EXTENT_BUFFER_ZONED_ZEROOUT, see btrfs_clear_buffer_dirty()
                 * and btree_csum_one_bio() for details.
                 */
                ASSERT(btrfs_header_bytenr(buf) != 0);

                btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false);
                btrfs_ref_tree_mod(fs_info, &generic_ref);
                ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
                BUG_ON(ret); /* -ENOMEM */
        }

        if (!last_ref)
                return;

        /* Only blocks from the running transaction are handled below. */
        if (btrfs_header_generation(buf) != trans->transid)
                goto out;

        /* Nothing more to do unless check_ref_cleanup() reports non-zero. */
        if (!is_log_tree && !check_ref_cleanup(trans, buf->start))
                goto out;

        bg = btrfs_lookup_block_group(fs_info, buf->start);

        /*
         * Pin the extent instead of freeing it directly when:
         *
         * - the block has the WRITTEN header flag set, or
         * - there are tree mod log users.  They may have recorded mod log
         *   operations for this node; if the node got re-allocated, e.g. from
         *   root A to root B, a btrfs_old_search_slot() on root B could replay
         *   operations from the time the block belonged to root A and see an
         *   inconsistent btree.  There is no race with new users: no other
         *   node or root points to this extent buffer any more, so a tree mod
         *   log user joining after this check has no existing log of
         *   operations on this node to contend with.
         * - the filesystem is zoned.
         */
        if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) ||
            test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) ||
            btrfs_is_zoned(fs_info)) {
                pin_down_extent(trans, bg, buf->start, buf->len, 1);
                btrfs_put_block_group(bg);
                goto out;
        }

        WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));

        btrfs_add_free_space(bg, buf->start, buf->len);
        btrfs_free_reserved_bytes(bg, buf->len, 0);
        btrfs_put_block_group(bg);
        trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);

out:
        /* The buffer is being deleted, its corrupt flag no longer matters. */
        clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
}

/*
 * Queue the drop of the extent reference described by @ref.
 *
 * Tree log extents are only pinned; everything else becomes a delayed tree
 * ref or a delayed data ref depending on ref->type.
 *
 * Returns 0 for test filesystems and for tree log extents, otherwise the
 * result of adding the delayed ref.  Can return -ENOMEM.
 */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret;

        if (btrfs_is_testing(fs_info))
                return 0;

        /*
         * Tree log blocks never actually go into the extent allocation tree,
         * so updating the pinning info is all that is needed for them.
         */
        if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) {
                btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1);
                return 0;
        }

        if (ref->type == BTRFS_REF_METADATA)
                ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
        else
                ret = btrfs_add_delayed_data_ref(trans, ref, 0);

        btrfs_ref_tree_mod(fs_info, ref);

        return ret;
}

/*
 * Stages of the free extent search.  The numeric order is significant:
 * stages are compared with relational operators (e.g.
 * "loop >= LOOP_NO_EMPTY_SIZE").
 */
enum btrfs_loop_type {
        /* Kick off block group caching, never wait for it. */
        LOOP_CACHING_NOWAIT = 0,

        /*
         * For block groups that are not cached yet, wait until their
         * free_space is at least the space being waited for.
         */
        LOOP_CACHING_WAIT = 1,

        /* Also accept block groups without a size classification yet. */
        LOOP_UNSET_SIZE_CLASS = 2,

        /* Allocate a chunk, then retry the allocation. */
        LOOP_ALLOC_CHUNK = 3,

        /* Stop honouring the size class restrictions. */
        LOOP_WRONG_SIZE_CLASS = 4,

        /*
         * Drop the empty size, ask only for the bytes this allocation
         * actually needs.
         */
        LOOP_NO_EMPTY_SIZE = 5,
};

/* Take the block group's data_rwsem for reading, but only if @delalloc is set. */
static inline void btrfs_lock_block_group(struct btrfs_block_group *cache,
                                          int delalloc)
{
        if (!delalloc)
                return;

        down_read(&cache->data_rwsem);
}

/*
 * Take a reference on the block group and then, if @delalloc is set, its
 * data_rwsem for reading.
 */
static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
                                          int delalloc)
{
        btrfs_get_block_group(cache);
        btrfs_lock_block_group(cache, delalloc);
}

/*
 * Lock @cluster and return the block group it currently belongs to.
 *
 * cluster->refill_lock is held on every return path.  Returns NULL if the
 * cluster has no block group.  If the cluster belongs to @block_group it is
 * returned as is.  For any other block group a reference is taken and, if
 * @delalloc is set, its data_rwsem is held for reading as well.
 */
static struct btrfs_block_group *btrfs_lock_cluster(
                   struct btrfs_block_group *block_group,
                   struct btrfs_free_cluster *cluster,
                   int delalloc)
        __acquires(&cluster->refill_lock)
{
        struct btrfs_block_group *owner;

        spin_lock(&cluster->refill_lock);
        for (;;) {
                owner = cluster->block_group;

                /* No owner, or the caller's own block group: nothing to grab. */
                if (!owner || owner == block_group)
                        return owner;

                btrfs_get_block_group(owner);

                if (!delalloc || down_read_trylock(&owner->data_rwsem))
                        return owner;

                /*
                 * The rwsem is contended: drop the spinlock before waiting
                 * for it.
                 */
                spin_unlock(&cluster->refill_lock);

                /* We should only have one-level nested. */
                down_read_nested(&owner->data_rwsem, SINGLE_DEPTH_NESTING);

                spin_lock(&cluster->refill_lock);
                if (owner == cluster->block_group)
                        return owner;

                /* The cluster changed hands while unlocked, undo and retry. */
                up_read(&owner->data_rwsem);
                btrfs_put_block_group(owner);
        }
}

/*
 * Release the read hold on data_rwsem if @delalloc is set, then drop one
 * reference on the block group.
 */
static inline void btrfs_release_block_group(struct btrfs_block_group *cache,
                                             int delalloc)
{
        if (delalloc) {
                up_read(&cache->data_rwsem);
        }

        btrfs_put_block_group(cache);
}

/*
 * Helper function for find_free_extent().
 *
 * Try to allocate ffe_ctl->num_bytes through the free cluster
 * ffe_ctl->last_ptr: first from the cluster as it currently is, then from a
 * cluster freshly set up in @bg.
 *
 * @bg:                        block group currently being searched
 * @ffe_ctl:                search context
 * @cluster_bg_ret:        set to the block group that owns the cluster, but only
 *                        when the allocation came from the existing cluster
 *
 * The cluster's refill_lock is taken by btrfs_lock_cluster() and released on
 * every return path.
 *
 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
 * Return >0 to inform caller that we found nothing
 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 */
static int find_free_extent_clustered(struct btrfs_block_group *bg,
                                      struct find_free_extent_ctl *ffe_ctl,
                                      struct btrfs_block_group **cluster_bg_ret)
{
        struct btrfs_block_group *cluster_bg;
        struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
        u64 aligned_cluster;
        u64 offset;
        int ret;

        /* Returns with last_ptr->refill_lock held, even when it returns NULL. */
        cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
        if (!cluster_bg)
                goto refill_cluster;
        /*
         * A cluster owned by another block group is only usable if that
         * block group is not read-only and matches the requested flags.
         */
        if (cluster_bg != bg && (cluster_bg->ro ||
            !block_group_bits(cluster_bg, ffe_ctl->flags)))
                goto release_cluster;

        offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
                        ffe_ctl->num_bytes, cluster_bg->start,
                        &ffe_ctl->max_extent_size);
        if (offset) {
                /* We have a block, we're done */
                spin_unlock(&last_ptr->refill_lock);
                trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
                *cluster_bg_ret = cluster_bg;
                ffe_ctl->found_offset = offset;
                return 0;
        }
        WARN_ON(last_ptr->block_group != cluster_bg);

release_cluster:
        /*
         * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
         * let's just skip it and let the allocator find whatever block it can
         * find. If we reach this point, we will have tried the cluster
         * allocator plenty of times and not have found anything, so we are
         * likely way too fragmented for the clustering stuff to find anything.
         *
         * However, if the cluster is taken from the current block group,
         * release the cluster first, so that we stand a better chance of
         * succeeding in the unclustered allocation.
         */
        if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
                spin_unlock(&last_ptr->refill_lock);
                btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
                return -ENOENT;
        }

        /* This cluster didn't work out, free it and start over */
        btrfs_return_cluster_to_free_space(NULL, last_ptr);

        /* Drop what btrfs_lock_cluster() took on a foreign block group. */
        if (cluster_bg != bg)
                btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);

refill_cluster:
        /* Reached with last_ptr->refill_lock still held. */
        if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
                spin_unlock(&last_ptr->refill_lock);
                return -ENOENT;
        }

        /* The new cluster spans at least one full stripe of @bg. */
        aligned_cluster = max_t(u64,
                        ffe_ctl->empty_cluster + ffe_ctl->empty_size,
                        bg->full_stripe_len);
        ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
                        ffe_ctl->num_bytes, aligned_cluster);
        if (ret == 0) {
                /* Now pull our allocation out of this cluster */
                offset = btrfs_alloc_from_cluster(bg, last_ptr,
                                ffe_ctl->num_bytes, ffe_ctl->search_start,
                                &ffe_ctl->max_extent_size);
                if (offset) {
                        /* We found one, proceed */
                        spin_unlock(&last_ptr->refill_lock);
                        ffe_ctl->found_offset = offset;
                        trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
                        return 0;
                }
        }
        /*
         * At this point we either didn't find a cluster or we weren't able to
         * allocate a block from our cluster.  Free the cluster we've been
         * trying to use, and go to the next block group.
         */
        btrfs_return_cluster_to_free_space(NULL, last_ptr);
        spin_unlock(&last_ptr->refill_lock);
        return 1;
}

/*
 * Return >0 to inform caller that we find nothing
 * Return 0 when we found an free extent and set ffe_ctrl->found_offset
 */
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
                                        struct find_free_extent_ctl *ffe_ctl)
{
        struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
        u64 offset;

        /*
         * We are doing an unclustered allocation, set the fragmented flag so
         * we don't bother trying to setup a cluster again until we get more
         * space.
         */
        if (unlikely(last_ptr)) {
                spin_lock(&last_ptr->lock);
                last_ptr->fragmented = 1;
                spin_unlock(&last_ptr->lock);
        }
        if (ffe_ctl->cached) {
                struct btrfs_free_space_ctl *free_space_ctl;

                free_space_ctl = bg->free_space_ctl;
                spin_lock(&free_space_ctl->tree_lock);
                if (free_space_ctl->free_space <
                    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
                    ffe_ctl->empty_size) {
                        ffe_ctl->total_free_space = max_t(u64,
                                        ffe_ctl->total_free_space,
                                        free_space_ctl->free_space);
                        spin_unlock(&free_space_ctl->tree_lock);
                        return 1;
                }
                spin_unlock(&free_space_ctl->tree_lock);
        }

        offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
                        ffe_ctl->num_bytes, ffe_ctl->empty_size,
                        &ffe_ctl->max_extent_size);
        if (!offset)
                return 1;
        ffe_ctl->found_offset = offset;
        return 0;
}

/*
 * Allocation step for the clustered policy: try the free cluster first when
 * one is available and enabled, otherwise (or when the clustered attempt
 * returns a negative value) search the block group without a cluster.
 *
 * Returns whatever the helper that made the final decision returned.
 */
static int do_allocation_clustered(struct btrfs_block_group *block_group,
                                   struct find_free_extent_ctl *ffe_ctl,
                                   struct btrfs_block_group **bg_ret)
{
        int clustered_ret;

        /* No cluster, or clustering disabled: go straight to the fallback. */
        if (!ffe_ctl->last_ptr || !ffe_ctl->use_cluster)
                return find_free_extent_unclustered(block_group, ffe_ctl);

        clustered_ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
        /* A negative result (the -ENOENT case) falls back to unclustered. */
        if (clustered_ret < 0)
                return find_free_extent_unclustered(block_group, ffe_ctl);

        return clustered_ret;
}

/*
 * Tree-log block group locking
 * ============================
 *
 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
 * indicates the starting address of a block group, which is reserved only
 * for tree-log metadata.
 *
 * Lock nesting
 * ============
 *
 * space_info::lock
 *   block_group::lock
 *     fs_info::treelog_bg_lock
 *       fs_info::relocation_bg_lock
 */

/*
 * Simple allocator for a sequential-only block group. Space is handed out
 * strictly in order starting at block_group->alloc_offset, so there is no
 * need to play with trees. On success this function itself advances
 * alloc_offset and decrements the free space counter.
 *
 * Return 0 with ffe_ctl->found_offset and ffe_ctl->search_start set, or 1 when
 * this block group cannot serve the request. On failure the dedicated
 * tree-log / data relocation block group recorded in fs_info is cleared for
 * the corresponding kind of request.
 */
static int do_allocation_zoned(struct btrfs_block_group *block_group,
                               struct find_free_extent_ctl *ffe_ctl,
                               struct btrfs_block_group **bg_ret)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_space_info *space_info = block_group->space_info;
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        const u64 bg_start = block_group->start;
        const u64 num_bytes = ffe_ctl->num_bytes;
        u64 dedicated;
        u64 avail;
        bool mismatch = false;
        int ret = 0;

        ASSERT(btrfs_is_zoned(fs_info));

        /*
         * Tree-log blocks must go to the dedicated tree-log block group and
         * nothing else may go there.
         */
        spin_lock(&fs_info->treelog_bg_lock);
        dedicated = fs_info->treelog_bg;
        if (dedicated) {
                if (ffe_ctl->for_treelog)
                        mismatch = (bg_start != dedicated);
                else
                        mismatch = (bg_start == dedicated);
        }
        spin_unlock(&fs_info->treelog_bg_lock);
        if (mismatch)
                return 1;

        /*
         * Same rule for the dedicated data relocation block group: only
         * relocation allocations may use it, and they may use nothing else.
         */
        spin_lock(&fs_info->relocation_bg_lock);
        dedicated = fs_info->data_reloc_bg;
        if (dedicated) {
                if (ffe_ctl->for_data_reloc)
                        mismatch = (bg_start != dedicated);
                else
                        mismatch = (bg_start == dedicated);
        }
        spin_unlock(&fs_info->relocation_bg_lock);
        if (mismatch)
                return 1;

        /*
         * Check the read-only and no-space cases before trying to activate
         * the zone. Failures from here on are only recorded in ret, because
         * fs_info->{treelog,data_reloc}_bg may have to be cleared, which is
         * done under the locks taken below.
         */
        spin_lock(&block_group->lock);
        if (block_group->ro || btrfs_zoned_bg_is_full(block_group))
                ret = 1;
        spin_unlock(&block_group->lock);

        /* Metadata block group is activated at write time. */
        if (ret == 0 && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
                if (!btrfs_zone_activate(block_group))
                        ret = 1;
        }

        spin_lock(&space_info->lock);
        spin_lock(&block_group->lock);
        spin_lock(&fs_info->treelog_bg_lock);
        spin_lock(&fs_info->relocation_bg_lock);

        if (ret)
                goto out;

        ASSERT(!ffe_ctl->for_treelog ||
               block_group->start == fs_info->treelog_bg ||
               fs_info->treelog_bg == 0);
        ASSERT(!ffe_ctl->for_data_reloc ||
               block_group->start == fs_info->data_reloc_bg ||
               fs_info->data_reloc_bg == 0);

        /* Every early exit below is a failure; success resets ret to 0. */
        ret = 1;

        if (block_group->ro)
                goto out;

        /* A block group in use for data relocation takes nothing else. */
        if (!ffe_ctl->for_data_reloc &&
            test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
                goto out;

        /*
         * A block group that already holds used or reserved bytes must not
         * become the dedicated tree-log or data relocation block group.
         */
        if (block_group->used || block_group->reserved) {
                if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
                        goto out;
                if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
                        goto out;
        }

        WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
        avail = block_group->zone_capacity - block_group->alloc_offset;
        if (avail < num_bytes) {
                /*
                 * With the sequential allocator free space is always
                 * contiguous, so the remaining room is also the largest
                 * extent this block group can offer.
                 */
                if (ffe_ctl->max_extent_size < avail) {
                        ffe_ctl->max_extent_size = avail;
                        ffe_ctl->total_free_space = avail;
                }
                goto out;
        }

        /* First tree-log allocation: dedicate this block group to it. */
        if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
                fs_info->treelog_bg = block_group->start;

        if (ffe_ctl->for_data_reloc) {
                if (!fs_info->data_reloc_bg)
                        fs_info->data_reloc_bg = block_group->start;
                /*
                 * Do not allow allocations from this block group, unless it is
                 * for data relocation. Compared to increasing the ->ro, setting
                 * BLOCK_GROUP_FLAG_ZONED_DATA_RELOC still allows nocow
                 * writers to come in. See btrfs_inc_nocow_writers().
                 *
                 * Regular (non-relocation data) allocations have to be kept
                 * out: with a mix of relocation extents and regular extents,
                 * WRITE commands (for relocation extents) and ZONE APPEND
                 * commands (for regular extents) can be dispatched to the
                 * same zone at the same time, which easily breaks the write
                 * pointer.
                 *
                 * This flag also prevents the block group from being zone
                 * finished.
                 */
                set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
        }

        ffe_ctl->found_offset = bg_start + block_group->alloc_offset;
        block_group->alloc_offset += num_bytes;
        spin_lock(&ctl->tree_lock);
        ctl->free_space -= num_bytes;
        spin_unlock(&ctl->tree_lock);

        /*
         * found_offset is not checked for stripesize alignment. The address
         * is rewritten anyway when using zone append writing.
         */
        ffe_ctl->search_start = ffe_ctl->found_offset;
        ret = 0;

out:
        if (ret) {
                if (ffe_ctl->for_treelog)
                        fs_info->treelog_bg = 0;
                if (ffe_ctl->for_data_reloc)
                        fs_info->data_reloc_bg = 0;
        }
        spin_unlock(&fs_info->relocation_bg_lock);
        spin_unlock(&fs_info->treelog_bg_lock);
        spin_unlock(&block_group->lock);
        spin_unlock(&space_info->lock);
        return ret;
}

/*
 * Dispatch one allocation attempt in @block_group to the implementation that
 * matches ffe_ctl->policy. An unknown policy is a BUG().
 */
static int do_allocation(struct btrfs_block_group *block_group,
                         struct find_free_extent_ctl *ffe_ctl,
                         struct btrfs_block_group **bg_ret)
{
        if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_CLUSTERED)
                return do_allocation_clustered(block_group, ffe_ctl, bg_ret);

        if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
                return do_allocation_zoned(block_group, ffe_ctl, bg_ret);

        BUG();
}

/*
 * Finish with @block_group after an unsuccessful attempt: reset the
 * per-block-group retry state of the clustered policy, then hand the block
 * group back through btrfs_release_block_group().
 *
 * The block group's RAID index must match ffe_ctl->index, and the policy must
 * be a known one; both are enforced with BUG checks.
 */
static void release_block_group(struct btrfs_block_group *block_group,
                                struct find_free_extent_ctl *ffe_ctl,
                                int delalloc)
{
        if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_CLUSTERED)
                ffe_ctl->retry_uncached = false;
        else if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_ZONED)
                BUG();
        /* The zoned policy keeps no per-block-group retry state. */

        BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
               ffe_ctl->index);
        btrfs_release_block_group(block_group, delalloc);
}

/*
 * Clustered-policy bookkeeping after an extent was found: when a cluster
 * exists but was not used for this allocation, record the start of the found
 * extent (ins->objectid) as the cluster's window_start.
 */
static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
                                   struct btrfs_key *ins)
{
        struct btrfs_free_cluster *cluster = ffe_ctl->last_ptr;

        /* Nothing to record if the cluster was used or does not exist. */
        if (ffe_ctl->use_cluster || !cluster)
                return;

        spin_lock(&cluster->lock);
        cluster->window_start = ins->objectid;
        spin_unlock(&cluster->lock);
}

/*
 * Policy-specific hook run once an extent has been found. Only the clustered
 * policy has work to do; an unknown policy is a BUG().
 */
static void found_extent(struct find_free_extent_ctl *ffe_ctl,
                         struct btrfs_key *ins)
{
        if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_CLUSTERED) {
                found_extent_clustered(ffe_ctl, ins);
                return;
        }

        /* The zoned policy has nothing to do here. */
        if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_ZONED)
                BUG();
}

/*
 * Decide whether the zoned policy allows allocating a new chunk for this
 * request.
 *
 * Return 0 when the caller may go ahead and allocate a chunk.
 * Return -ENOSPC when an existing block group still has at least
 * min_alloc_size available, so the caller should retry with a smaller size.
 * Return -EAGAIN when no zone can be activated and not even min_alloc_size is
 * left, so the caller should try again later.
 * Return any other negative value propagated from btrfs_zone_finish_one_bg().
 */
static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
                                    struct find_free_extent_ctl *ffe_ctl)
{
        int ret;

        /*
         * Block group's activeness is not a requirement for METADATA block
         * groups. Everything below this check therefore only ever runs for
         * data allocations.
         */
        if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
                return 0;

        /* If we can activate new zone, just allocate a chunk and use it */
        if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
                return 0;

        /*
         * We already reached the max active zones. Try to finish one block
         * group to make a room for a new block group. This is only attempted
         * for data because btrfs_zone_finish() may need to wait for a running
         * transaction which can cause a deadlock for metadata allocation.
         */
        ret = btrfs_zone_finish_one_bg(fs_info);
        if (ret == 1)
                return 0;
        if (ret < 0)
                return ret;

        /*
         * If we have enough free space left in an already active block group
         * and we can't activate any other zone now, do not allow allocating a
         * new chunk and let find_free_extent() retry with a smaller size.
         */
        if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
                return -ENOSPC;

        /*
         * Even min_alloc_size is not left in any block group. Since we cannot
         * activate a new block group, allocating one may not help. Tell the
         * caller to try again and hope that writing out some parts of the
         * region makes progress, which is possible for data block groups.
         */
        return -EAGAIN;
}

/*
 * Ask the allocation policy whether a new chunk may be created.
 *
 * The clustered policy always allows it (returns 0); the zoned policy defers
 * to can_allocate_chunk_zoned(). An unknown policy is a BUG().
 */
static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
                              struct find_free_extent_ctl *ffe_ctl)
{
        if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_CLUSTERED)
                return 0;

        if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
                return can_allocate_chunk_zoned(fs_info, ffe_ctl);

        BUG();
}

/*
 * Decide what find_free_extent() does after one pass over the block groups of
 * the current RAID index, advancing ffe_ctl->index and ffe_ctl->loop as needed.
 *
 * Return >0 means caller needs to re-search for free extent
 * Return 0 means we have the needed free extent.
 * Return <0 means we failed to locate any free extent.
 */
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
                                        struct btrfs_key *ins,
                                        struct find_free_extent_ctl *ffe_ctl,
                                        bool full_search)
{
        struct btrfs_root *chunk_root = fs_info->chunk_root;

        /* Remember whether the very first stage saw a caching block group. */
        if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
            ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
                ffe_ctl->orig_have_caching_bg = true;

        /* An extent was found: run the policy hook and stop searching. */
        if (ins->objectid) {
                found_extent(ffe_ctl, ins);
                return 0;
        }

        /* Repeat the same stage while block groups are still caching. */
        if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
                return 1;

        /* Move on to the next RAID index before escalating the stage. */
        ffe_ctl->index++;
        if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
                return 1;

        /* See the comments for btrfs_loop_type for an explanation of the phases. */
        if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE)
                return -ENOSPC;

        ffe_ctl->index = 0;
        /*
         * We want to skip the LOOP_CACHING_WAIT step if we don't have any
         * uncached bgs and we've already done a full search through.
         */
        if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
            !ffe_ctl->orig_have_caching_bg && full_search)
                ffe_ctl->loop++;
        ffe_ctl->loop++;

        if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
                struct btrfs_trans_handle *trans;
                bool joined = false;
                int ret;

                /* Check if allocation policy allows to create a new chunk */
                ret = can_allocate_chunk(fs_info, ffe_ctl);
                if (ret)
                        return ret;

                /* Reuse the task's transaction handle if it has one. */
                trans = current->journal_info;
                if (!trans) {
                        trans = btrfs_join_transaction(chunk_root);
                        joined = true;
                }
                if (IS_ERR(trans))
                        return PTR_ERR(trans);

                ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
                                        CHUNK_ALLOC_FORCE_FOR_EXTENT);
                if (ret == -ENOSPC) {
                        /* Do not bail out on ENOSPC since we can do more. */
                        ret = 0;
                        ffe_ctl->loop++;
                } else if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                } else {
                        ret = 0;
                }

                /* Only end a transaction that was joined here. */
                if (joined)
                        btrfs_end_transaction(trans);
                if (ret)
                        return ret;
        }

        if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
                /*
                 * Dropping empty_size/empty_cluster only applies to the
                 * clustered policy, and looping again is pointless if both
                 * are already zero.
                 */
                if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED ||
                    (ffe_ctl->empty_size == 0 && ffe_ctl->empty_cluster == 0))
                        return -ENOSPC;

                ffe_ctl->empty_size = 0;
                ffe_ctl->empty_cluster = 0;
        }

        return 1;
}

/*
 * Tell whether @bg is acceptable for this allocation as far as size classes
 * are concerned.
 *
 * The block group is accepted when any of the following holds, checked in
 * this order: the policy is zoned; the block group does not use size classes;
 * the search has reached LOOP_WRONG_SIZE_CLASS; the search has reached
 * LOOP_UNSET_SIZE_CLASS and the block group has no size class yet; the size
 * classes match.
 */
static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
                                              struct btrfs_block_group *bg)
{
        return ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED ||
               !btrfs_block_group_should_use_size_class(bg) ||
               ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS ||
               (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
                bg->size_class == BTRFS_BG_SZ_NONE) ||
               ffe_ctl->size_class == bg->size_class;
}

/*
 * Set up the clustered policy before the search starts: apply the
 * max_extent_size recorded in @space_info, fetch the free cluster and derive
 * the search hint from it.
 *
 * Return 0 to go on with the search, or -ENOSPC with ins->offset set to the
 * recorded max_extent_size when the request is larger than that.
 */
static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
                                        struct find_free_extent_ctl *ffe_ctl,
                                        struct btrfs_space_info *space_info,
                                        struct btrfs_key *ins)
{
        struct btrfs_free_cluster *cluster;

        /*
         * If our free space is heavily fragmented we may not be able to make
         * big contiguous allocations, so instead of doing the expensive search
         * for free space, simply return ENOSPC with our max_extent_size so we
         * can go ahead and search for a more manageable chunk.
         *
         * If our max_extent_size is large enough for our allocation simply
         * disable clustering since we will likely not be able to find enough
         * space to create a cluster and induce latency trying.
         *
         * The unlocked test is only a shortcut; the value is checked again
         * under the lock.
         */
        if (space_info->max_extent_size) {
                int ret = 0;

                spin_lock(&space_info->lock);
                if (space_info->max_extent_size) {
                        if (ffe_ctl->num_bytes > space_info->max_extent_size) {
                                ins->offset = space_info->max_extent_size;
                                ret = -ENOSPC;
                        } else {
                                ffe_ctl->use_cluster = false;
                        }
                }
                spin_unlock(&space_info->lock);

                if (ret)
                        return ret;
        }

        cluster = fetch_cluster_info(fs_info, space_info,
                                     &ffe_ctl->empty_cluster);
        ffe_ctl->last_ptr = cluster;
        if (!cluster)
                return 0;

        spin_lock(&cluster->lock);
        /*
         * Start from the cluster's window when it is backed by a block group.
         * A fragmented cluster is not used for allocating, but its
         * window_start still tracks the last place an allocation was found,
         * so use it as the hint to try and save some time.
         */
        if (cluster->block_group || cluster->fragmented)
                ffe_ctl->hint_byte = cluster->window_start;
        if (cluster->fragmented)
                ffe_ctl->use_cluster = false;
        spin_unlock(&cluster->lock);

        return 0;
}

/*
 * Pick the search hint for the zoned policy. Tree-log and data relocation
 * requests are pointed at their dedicated block group if one is recorded;
 * other data requests are pointed at the first active block group with
 * matching flags and enough room. Always returns 0.
 */
static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
                                    struct find_free_extent_ctl *ffe_ctl)
{
        struct btrfs_block_group *bg;

        if (ffe_ctl->for_treelog) {
                spin_lock(&fs_info->treelog_bg_lock);
                if (fs_info->treelog_bg)
                        ffe_ctl->hint_byte = fs_info->treelog_bg;
                spin_unlock(&fs_info->treelog_bg_lock);
                return 0;
        }

        if (ffe_ctl->for_data_reloc) {
                spin_lock(&fs_info->relocation_bg_lock);
                if (fs_info->data_reloc_bg)
                        ffe_ctl->hint_byte = fs_info->data_reloc_bg;
                spin_unlock(&fs_info->relocation_bg_lock);
                return 0;
        }

        /* Only data requests get a hint from the active block group list. */
        if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
                return 0;

        spin_lock(&fs_info->zone_active_bgs_lock);
        list_for_each_entry(bg, &fs_info->zone_active_bgs, active_bg_list) {
                /*
                 * No lock is OK here because the available space is
                 * monotonically decreasing, and this is just a hint.
                 */
                u64 room = bg->zone_capacity - bg->alloc_offset;

                if (block_group_bits(bg, ffe_ctl->flags) &&
                    room >= ffe_ctl->num_bytes) {
                        ffe_ctl->hint_byte = bg->start;
                        break;
                }
        }
        spin_unlock(&fs_info->zone_active_bgs_lock);

        return 0;
}

/*
 * Run the policy-specific preparation step before the block group search.
 * Returns what the selected helper returns; an unknown policy is a BUG().
 */
static int prepare_allocation(struct btrfs_fs_info *fs_info,
                              struct find_free_extent_ctl *ffe_ctl,
                              struct btrfs_space_info *space_info,
                              struct btrfs_key *ins)
{
        if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_CLUSTERED)
                return prepare_allocation_clustered(fs_info, ffe_ctl,
                                                    space_info, ins);

        if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
                return prepare_allocation_zoned(fs_info, ffe_ctl);

        BUG();
}

/*
 * Walk the block groups of the space_info matching ffe_ctl->flags and find a
 * hole of the requested size.
 * The key ins is changed to record the hole:
 * ins->objectid == start position
 * ins->type == BTRFS_EXTENT_ITEM_KEY
 * ins->offset == the size of the hole.
 * Any available blocks before search_start are skipped.
 *
 * If there is no suitable free space, we will record the max size of
 * the free space extent currently (in ins->offset and in
 * space_info->max_extent_size).
 *
 * Return 0 on success and a negative errno on failure. When the search ends
 * with -ENOSPC but caching a block group failed along the way, that caching
 * error is returned instead.
 *
 * The overall logic and call chain:
 *
 * find_free_extent()
 * |- Iterate through all block groups
 * |  |- Get a valid block group
 * |  |- Try to do clustered allocation in that block group
 * |  |- Try to do unclustered allocation in that block group
 * |  |- Check if the result is valid
 * |  |  |- If valid, then exit
 * |  |- Jump to next block group
 * |
 * |- Push harder to find free extents
 *    |- If not found, re-iterate all block groups
 */
static noinline int find_free_extent(struct btrfs_root *root,
                                     struct btrfs_key *ins,
                                     struct find_free_extent_ctl *ffe_ctl)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret = 0;
        /* First error seen while caching a block group, reported over -ENOSPC. */
        int cache_block_group_error = 0;
        struct btrfs_block_group *block_group = NULL;
        struct btrfs_space_info *space_info;
        bool full_search = false;

        WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);

        /* Reset the search state that is owned by this function. */
        ffe_ctl->search_start = 0;
        /* For clustered allocation */
        ffe_ctl->empty_cluster = 0;
        ffe_ctl->last_ptr = NULL;
        ffe_ctl->use_cluster = true;
        ffe_ctl->have_caching_bg = false;
        ffe_ctl->orig_have_caching_bg = false;
        ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
        ffe_ctl->loop = 0;
        ffe_ctl->retry_uncached = false;
        ffe_ctl->cached = 0;
        ffe_ctl->max_extent_size = 0;
        ffe_ctl->total_free_space = 0;
        ffe_ctl->found_offset = 0;
        ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
        ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);

        /* Zoned filesystems use the sequential allocator instead. */
        if (btrfs_is_zoned(fs_info))
                ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;

        /* ins->objectid == 0 means "nothing found yet". */
        ins->type = BTRFS_EXTENT_ITEM_KEY;
        ins->objectid = 0;
        ins->offset = 0;

        trace_find_free_extent(root, ffe_ctl);

        space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
        if (!space_info) {
                btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
                return -ENOSPC;
        }

        /* Policy-specific setup; this may update ffe_ctl->hint_byte. */
        ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
        if (ret < 0)
                return ret;

        ffe_ctl->search_start = max(ffe_ctl->search_start,
                                    first_logical_byte(fs_info));
        ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
        /* The hint survived the clamping above: try its block group first. */
        if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
                block_group = btrfs_lookup_block_group(fs_info,
                                                       ffe_ctl->search_start);
                /*
                 * We don't want to use the block group if it doesn't match our
                 * allocation bits, or if it's not cached.
                 *
                 * However if we are re-searching with an ideal block group
                 * picked out then we don't care that the block group is cached.
                 */
                if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
                    block_group->cached != BTRFS_CACHE_NO) {
                        down_read(&space_info->groups_sem);
                        if (list_empty(&block_group->list) ||
                            block_group->ro) {
                                /*
                                 * Someone is removing this block group,
                                 * we can't jump into the have_block_group
                                 * target because our list pointers are not
                                 * valid.
                                 */
                                btrfs_put_block_group(block_group);
                                up_read(&space_info->groups_sem);
                        } else {
                                /*
                                 * Jump into the loop body below with
                                 * groups_sem still held for read.
                                 */
                                ffe_ctl->index = btrfs_bg_flags_to_raid_index(
                                                        block_group->flags);
                                btrfs_lock_block_group(block_group,
                                                       ffe_ctl->delalloc);
                                ffe_ctl->hinted = true;
                                goto have_block_group;
                        }
                } else if (block_group) {
                        btrfs_put_block_group(block_group);
                }
        }
search:
        /* One pass over the block groups of the current RAID index. */
        trace_find_free_extent_search_loop(root, ffe_ctl);
        ffe_ctl->have_caching_bg = false;
        /*
         * A pass starting at the RAID index of the requested flags, or at
         * index 0, counts as a full search.
         */
        if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
            ffe_ctl->index == 0)
                full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group,
                            &space_info->block_groups[ffe_ctl->index], list) {
                struct btrfs_block_group *bg_ret;

                ffe_ctl->hinted = false;
                /* If the block group is read-only, we can skip it entirely. */
                if (unlikely(block_group->ro)) {
                        if (ffe_ctl->for_treelog)
                                btrfs_clear_treelog_bg(block_group);
                        if (ffe_ctl->for_data_reloc)
                                btrfs_clear_data_reloc_bg(block_group);
                        continue;
                }

                btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
                ffe_ctl->search_start = block_group->start;

                /*
                 * This can happen if we end up cycling through all the
                 * raid types, but we want to make sure we only allocate
                 * for the proper type.
                 */
                if (!block_group_bits(block_group, ffe_ctl->flags)) {
                        u64 extra = BTRFS_BLOCK_GROUP_DUP |
                                BTRFS_BLOCK_GROUP_RAID1_MASK |
                                BTRFS_BLOCK_GROUP_RAID56_MASK |
                                BTRFS_BLOCK_GROUP_RAID10;

                        /*
                         * If they asked for extra copies and this block group
                         * doesn't provide them, bail.  This does allow us to
                         * fill raid0 from raid1.
                         */
                        if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
                                goto loop;

                        /*
                         * This block group has different flags than we want.
                         * It's possible that we have MIXED_GROUP flag but no
                         * block group is mixed.  Just skip such block group.
                         */
                        btrfs_release_block_group(block_group, ffe_ctl->delalloc);
                        continue;
                }

                /*
                 * Also entered by goto: from the hinted block group path
                 * above and from the uncached retry at the loop label below.
                 */
have_block_group:
                trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
                ffe_ctl->cached = btrfs_block_group_done(block_group);
                if (unlikely(!ffe_ctl->cached)) {
                        ffe_ctl->have_caching_bg = true;
                        ret = btrfs_cache_block_group(block_group, false);

                        /*
                         * If we get ENOMEM here or something else we want to
                         * try other block groups, because it may not be fatal.
                         * However if we can't find anything else we need to
                         * save our return here so that we return the actual
                         * error that caused problems, not ENOSPC.
                         */
                        if (ret < 0) {
                                if (!cache_block_group_error)
                                        cache_block_group_error = ret;
                                ret = 0;
                                goto loop;
                        }
                        ret = 0;
                }

                /* Caching failed for this block group; remember it as -EIO. */
                if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
                        if (!cache_block_group_error)
                                cache_block_group_error = -EIO;
                        goto loop;
                }

                if (!find_free_extent_check_size_class(ffe_ctl, block_group))
                        goto loop;

                /* ret > 0 means nothing usable was found in this block group. */
                bg_ret = NULL;
                ret = do_allocation(block_group, ffe_ctl, &bg_ret);
                if (ret > 0)
                        goto loop;

                /*
                 * The allocation may have been satisfied from a different
                 * block group; continue with that one.
                 */
                if (bg_ret && bg_ret != block_group) {
                        btrfs_release_block_group(block_group, ffe_ctl->delalloc);
                        block_group = bg_ret;
                }

                /* Checks */
                ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
                                                 fs_info->stripesize);

                /*
                 * Move on to the next group: after alignment the extent would
                 * run past the end of this one, so give the space back.
                 */
                if (ffe_ctl->search_start + ffe_ctl->num_bytes >
                    block_group->start + block_group->length) {
                        btrfs_add_free_space_unused(block_group,
                                            ffe_ctl->found_offset,
                                            ffe_ctl->num_bytes);
                        goto loop;
                }

                /* Give back the bytes skipped by the alignment. */
                if (ffe_ctl->found_offset < ffe_ctl->search_start)
                        btrfs_add_free_space_unused(block_group,
                                        ffe_ctl->found_offset,
                                        ffe_ctl->search_start - ffe_ctl->found_offset);

                ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
                                               ffe_ctl->num_bytes,
                                               ffe_ctl->delalloc,
                                               ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
                /* The reservation was refused: return the space and move on. */
                if (ret == -EAGAIN) {
                        btrfs_add_free_space_unused(block_group,
                                        ffe_ctl->found_offset,
                                        ffe_ctl->num_bytes);
                        goto loop;
                }
                btrfs_inc_block_group_reservations(block_group);

                /* We are all good, let's return. */
                ins->objectid = ffe_ctl->search_start;
                ins->offset = ffe_ctl->num_bytes;

                trace_btrfs_reserve_extent(block_group, ffe_ctl);
                btrfs_release_block_group(block_group, ffe_ctl->delalloc);
                break;
loop:
                /*
                 * Past the first stage, give a block group that is not fully
                 * cached one more try after waiting for caching progress.
                 * retry_uncached limits this to a single retry.
                 */
                if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
                    !ffe_ctl->retry_uncached) {
                        ffe_ctl->retry_uncached = true;
                        btrfs_wait_block_group_cache_progress(block_group,
                                                ffe_ctl->num_bytes +
                                                ffe_ctl->empty_cluster +
                                                ffe_ctl->empty_size);
                        goto have_block_group;
                }
                release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
                cond_resched();
        }
        up_read(&space_info->groups_sem);

        /* Decide whether to finish, escalate the search stage, or give up. */
        ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
        if (ret > 0)
                goto search;

        if (ret == -ENOSPC && !cache_block_group_error) {
                /*
                 * Use ffe_ctl->total_free_space as fallback if we can't find
                 * any contiguous hole.
                 */
                if (!ffe_ctl->max_extent_size)
                        ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
                spin_lock(&space_info->lock);
                space_info->max_extent_size = ffe_ctl->max_extent_size;
                spin_unlock(&space_info->lock);
                ins->offset = ffe_ctl->max_extent_size;
        } else if (ret == -ENOSPC) {
                /* Report the caching failure rather than a plain -ENOSPC. */
                ret = cache_block_group_error;
        }
        return ret;
}

/*
 * Entry point to the extent allocator: reserve a free extent of at least
 * @min_alloc_size and at most @num_bytes.
 *
 * @root           - The root that will own the extent.
 *
 * @ram_bytes      - In-memory size represented by @num_bytes. Only used for
 *                   accounting; differs from @num_bytes only for compressed
 *                   extents.
 *
 * @num_bytes      - Number of on-disk bytes that we would like to allocate.
 *
 * @min_alloc_size - Smallest allocation that is still acceptable. When the
 *                   full @num_bytes cannot be found (e.g. because of
 *                   fragmentation) the request is repeatedly shrunk, but never
 *                   below this value.
 *
 * @empty_size     - Hint for how much free space should ideally follow the
 *                   returned extent because more COW is expected. May be
 *                   ignored by the allocator.
 *
 * @hint_byte      - Hint for where to start searching. May be ignored.
 *
 * @ins            - Output key describing the reserved extent:
 *                   ins->objectid == start of the extent
 *                   ins->offset   == length of the extent
 *                   (the historical contract also lists the key type as
 *                   BTRFS_EXTENT_ITEM_KEY; it is not assigned in this function)
 *
 * @is_data        - Non-zero for a data extent, zero for metadata.
 *
 * @delalloc       - Non-zero when the allocation is made on behalf of
 *                   delalloc; in that case the block group's data_rwsem is
 *                   going to be acquired.
 *
 * Returns 0 on success or a negative errno. When -ENOSPC is returned,
 * @ins->offset holds the size of the largest hole the allocator found.
 */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
                         u64 num_bytes, u64 min_alloc_size,
                         u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        const bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);
        const bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
        const u64 flags = get_alloc_profile_by_root(root, is_data);
        struct find_free_extent_ctl ffe_ctl = {};
        bool final_tried = (num_bytes == min_alloc_size);
        int ret;

        for (;;) {
                WARN_ON(num_bytes < fs_info->sectorsize);

                /* (Re)load the search parameters for this attempt. */
                ffe_ctl.ram_bytes = ram_bytes;
                ffe_ctl.num_bytes = num_bytes;
                ffe_ctl.min_alloc_size = min_alloc_size;
                ffe_ctl.empty_size = empty_size;
                ffe_ctl.flags = flags;
                ffe_ctl.delalloc = delalloc;
                ffe_ctl.hint_byte = hint_byte;
                ffe_ctl.for_treelog = for_treelog;
                ffe_ctl.for_data_reloc = for_data_reloc;

                ret = find_free_extent(root, ins, &ffe_ctl);

                /*
                 * Only retry on -ENOSPC, and only while we have not yet tried
                 * the minimum size and the allocator reported a usable hole.
                 */
                if (ret != -ENOSPC || final_tried || !ins->offset)
                        break;

                /*
                 * Halve the request, but do not ask for more than the largest
                 * hole that was seen, keep it sector aligned and never go
                 * below the caller's minimum.
                 */
                num_bytes = min(num_bytes >> 1, ins->offset);
                num_bytes = round_down(num_bytes, fs_info->sectorsize);
                num_bytes = max(num_bytes, min_alloc_size);
                ram_bytes = num_bytes;
                if (num_bytes == min_alloc_size)
                        final_tried = true;
        }

        if (!ret && !is_data) {
                btrfs_dec_block_group_reservations(fs_info, ins->objectid);
        } else if (ret == -ENOSPC && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
                struct btrfs_space_info *sinfo;

                sinfo = btrfs_find_space_info(fs_info, flags);
                btrfs_err(fs_info,
        "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
                          flags, num_bytes, for_treelog, for_data_reloc);
                if (sinfo)
                        btrfs_dump_space_info(fs_info, sinfo, num_bytes, 1);
        }

        return ret;
}

/*
 * Give back an extent that was reserved but will not be used.
 *
 * The range [@start, @start + @len) is returned to the free space of its
 * block group and the reservation accounting is dropped.
 *
 * Returns 0 on success, -ENOSPC if no block group contains @start.
 */
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
                               u64 start, u64 len, int delalloc)
{
        struct btrfs_block_group *bg = btrfs_lookup_block_group(fs_info, start);

        if (bg) {
                btrfs_add_free_space(bg, start, len);
                btrfs_free_reserved_bytes(bg, len, delalloc);
                trace_btrfs_reserved_extent_free(fs_info, start, len);

                /* Drop the reference taken by the lookup. */
                btrfs_put_block_group(bg);
                return 0;
        }

        btrfs_err(fs_info, "Unable to find block group for %llu", start);
        return -ENOSPC;
}

int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
                              const struct extent_buffer *eb)
{
        struct btrfs_block_group *cache;
        int ret = 0;

        cache = btrfs_lookup_block_group(trans->fs_info, eb->start);
        if (!cache) {
                btrfs_err(trans->fs_info, "unable to find block group for %llu",
                          eb->start);
                return -ENOSPC;
        }

        ret = pin_down_extent(trans, cache, eb->start, eb->len, 1);
        btrfs_put_block_group(cache);
        return ret;
}

/*
 * Common tail of the reserved extent insertion paths: remove the range from
 * the free space tree and account it as allocated in its block group.
 *
 * Returns 0 on success or the error of the step that failed.
 */
static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
                                 u64 num_bytes)
{
        int ret = remove_from_free_space_tree(trans, bytenr, num_bytes);

        if (ret)
                return ret;

        ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
        if (!ret) {
                trace_btrfs_reserved_extent_alloc(trans->fs_info, bytenr,
                                                  num_bytes);
                return 0;
        }

        /* Not expected to fail; trip the assertion and report. */
        ASSERT(!ret);
        btrfs_err(trans->fs_info, "update block group failed for %llu %llu",
                  bytenr, num_bytes);
        return ret;
}

/*
 * Insert the extent item, including its inline backref, for a reserved data
 * extent described by @ins and then account the extent as allocated.
 *
 * A non-zero @parent produces a shared data ref; otherwise a keyed data ref
 * built from @root_objectid, @owner and @offset is written. In simple quota
 * mode an owner ref carrying @oref_root is placed in front of that backref.
 *
 * Returns 0 on success or a negative errno.
 */
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      u64 parent, u64 root_objectid,
                                      u64 flags, u64 owner, u64 offset,
                                      struct btrfs_key *ins, int ref_mod, u64 oref_root)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE);
        const int type = (parent > 0) ? BTRFS_SHARED_DATA_REF_KEY :
                                        BTRFS_EXTENT_DATA_REF_KEY;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        u32 size;
        int ret;

        /* Extent item, optional owner ref, then the data backref itself. */
        size = sizeof(*ei) + btrfs_extent_inline_ref_size(type);
        if (simple_quota)
                size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_insert_empty_item(trans,
                                      btrfs_extent_root(fs_info, ins->objectid),
                                      path, ins, size);
        if (ret) {
                btrfs_free_path(path);
                return ret;
        }

        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        btrfs_set_extent_refs(leaf, ei, ref_mod);
        btrfs_set_extent_generation(leaf, ei, trans->transid);
        btrfs_set_extent_flags(leaf, ei, flags | BTRFS_EXTENT_FLAG_DATA);

        /* Inline refs start right behind the extent item. */
        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
        if (simple_quota) {
                struct btrfs_extent_owner_ref *oref;

                btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY);
                oref = (struct btrfs_extent_owner_ref *)(&iref->offset);
                btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root);
                /* The data backref follows the owner ref. */
                iref = (struct btrfs_extent_inline_ref *)(oref + 1);
        }
        btrfs_set_extent_inline_ref_type(leaf, iref, type);

        if (parent > 0) {
                struct btrfs_shared_data_ref *sref;

                sref = (struct btrfs_shared_data_ref *)(iref + 1);
                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
                btrfs_set_shared_data_ref_count(leaf, sref, ref_mod);
        } else {
                struct btrfs_extent_data_ref *dref;

                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
                btrfs_set_extent_data_ref_count(leaf, dref, ref_mod);
        }

        btrfs_mark_buffer_dirty(trans, leaf);
        btrfs_free_path(path);

        return alloc_reserved_extent(trans, ins->objectid, ins->offset);
}

/*
 * Insert the extent (or skinny metadata) item for the tree block described by
 * the delayed ref @node and account the block as allocated.
 *
 * With SKINNY_METADATA the item is keyed by level and carries only the inline
 * backref; otherwise it is keyed by size and additionally carries a
 * btrfs_tree_block_info filled from @extent_op.
 *
 * Returns 0 on success or a negative errno.
 */
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_delayed_ref_node *node,
                                     struct btrfs_delayed_extent_op *extent_op)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        const bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
        const u64 flags = extent_op->flags_to_set;
        /* The owner of a tree block is the level. */
        const int level = btrfs_delayed_ref_owner(node);
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        struct btrfs_key extent_key;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        u32 size = sizeof(*ei) + sizeof(*iref);
        int ref_type;
        u64 ref_offset;
        int ret;

        extent_key.objectid = node->bytenr;
        if (!skinny_metadata) {
                extent_key.type = BTRFS_EXTENT_ITEM_KEY;
                extent_key.offset = node->num_bytes;
                /* The full format needs room for the block info as well. */
                size += sizeof(struct btrfs_tree_block_info);
        } else {
                extent_key.type = BTRFS_METADATA_ITEM_KEY;
                extent_key.offset = level;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_insert_empty_item(trans,
                                      btrfs_extent_root(fs_info, extent_key.objectid),
                                      path, &extent_key, size);
        if (ret) {
                btrfs_free_path(path);
                return ret;
        }

        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        btrfs_set_extent_refs(leaf, ei, 1);
        btrfs_set_extent_generation(leaf, ei, trans->transid);
        btrfs_set_extent_flags(leaf, ei, flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);

        if (!skinny_metadata) {
                struct btrfs_tree_block_info *block_info;

                block_info = (struct btrfs_tree_block_info *)(ei + 1);
                btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
                btrfs_set_tree_block_level(leaf, block_info, level);
                iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
        } else {
                iref = (struct btrfs_extent_inline_ref *)(ei + 1);
        }

        /* Shared refs point at the parent block, keyed refs at the root. */
        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
                ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
                ref_offset = node->parent;
        } else {
                ref_type = BTRFS_TREE_BLOCK_REF_KEY;
                ref_offset = node->ref_root;
        }
        btrfs_set_extent_inline_ref_type(leaf, iref, ref_type);
        btrfs_set_extent_inline_ref_offset(leaf, iref, ref_offset);

        btrfs_mark_buffer_dirty(trans, leaf);
        btrfs_free_path(path);

        return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
}

/*
 * Queue a delayed data ref that adds the reserved extent described by @ins
 * (objectid == start, offset == length) for inode @owner at file @offset in
 * @root.
 *
 * Must not be used for the log tree. Returns the result of
 * btrfs_add_delayed_data_ref().
 */
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root, u64 owner,
                                     u64 offset, u64 ram_bytes,
                                     struct btrfs_key *ins)
{
        const u64 root_id = btrfs_root_id(root);
        struct btrfs_ref generic_ref = {
                .action = BTRFS_ADD_DELAYED_EXTENT,
                .bytenr = ins->objectid,
                .num_bytes = ins->offset,
                .owning_root = root_id,
                .ref_root = root_id,
        };

        ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);

        /*
         * For the data relocation root, use the relocation source root as
         * owning root when it is an fs tree.
         */
        if (btrfs_is_data_reloc_root(root) &&
            is_fstree(root->relocation_src_root)) {
                generic_ref.owning_root = root->relocation_src_root;
        }

        btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
        btrfs_ref_tree_mod(root->fs_info, &generic_ref);

        return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}

/*
 * This is used by the tree logging recovery code. It records that an extent
 * has been allocated and makes sure to clear the free space cache bits as
 * well.
 *
 * @trans:         transaction handle
 * @root_objectid: id of the root that references the extent
 * @owner:         objectid of the owner recorded in the data backref
 * @offset:        file offset recorded in the data backref
 * @ins:           key describing the extent (objectid == start,
 *                 offset == length)
 *
 * Returns 0 on success or a negative errno. If inserting the extent item
 * fails the extent is pinned and that error is returned; no simple quota
 * delta is recorded for an extent that was not inserted.
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
                                   u64 root_objectid, u64 owner, u64 offset,
                                   struct btrfs_key *ins)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret;
        struct btrfs_block_group *block_group;
        struct btrfs_space_info *space_info;
        struct btrfs_squota_delta delta = {
                .root = root_objectid,
                .num_bytes = ins->offset,
                .generation = trans->transid,
                .is_data = true,
                .is_inc = true,
        };

        /*
         * Mixed block groups will exclude before processing the log so we only
         * need to do the exclude dance if this fs isn't mixed.
         */
        if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
                ret = __exclude_logged_extent(fs_info, ins->objectid,
                                              ins->offset);
                if (ret)
                        return ret;
        }

        block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
        if (!block_group)
                return -EINVAL;

        /* Account the range as reserved before turning it into an extent. */
        space_info = block_group->space_info;
        spin_lock(&space_info->lock);
        spin_lock(&block_group->lock);
        space_info->bytes_reserved += ins->offset;
        block_group->reserved += ins->offset;
        spin_unlock(&block_group->lock);
        spin_unlock(&space_info->lock);

        ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
                                         offset, ins, 1, root_objectid);
        if (ret) {
                /*
                 * Keep the insertion error: it must reach the caller instead
                 * of being overwritten by the quota accounting result.
                 */
                btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
        } else {
                ret = btrfs_record_squota_delta(fs_info, &delta);
        }

        btrfs_put_block_group(block_group);
        return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
/*
 * Extra safety check in case the extent tree is corrupted and extent allocator
 * chooses to use a tree block which is already used and locked.
 *
 * Returns true (after logging a rate limited error) when @eb is already
 * locked by the current task, false otherwise.
 */
static bool check_eb_lock_owner(const struct extent_buffer *eb)
{
        if (eb->lock_owner == current->pid) {
                btrfs_err_rl(eb->fs_info,
"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
                             eb->start, btrfs_header_owner(eb), current->pid);
                return true;
        }
        return false;
}
#else
/*
 * Without CONFIG_BTRFS_DEBUG no check is performed. The prototype matches the
 * debug variant so both configurations accept the same callers.
 */
static bool check_eb_lock_owner(const struct extent_buffer *eb)
{
        return false;
}
#endif

/*
 * Get the extent buffer for a freshly allocated tree block at @bytenr, lock
 * it and initialize its header for @level and @owner.
 *
 * The range of the buffer is also recorded as dirty, either in the log root's
 * dirty_log_pages or in the transaction's dirty_pages.
 *
 * Returns the locked buffer, or an ERR_PTR() on failure.
 */
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      u64 bytenr, int level, u64 owner,
                      enum btrfs_lock_nesting nest)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *eb;
        u64 lockdep_owner;
        u64 range_end;

        eb = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
        if (IS_ERR(eb))
                return eb;

        if (check_eb_lock_owner(eb)) {
                free_extent_buffer(eb);
                return ERR_PTR(-EUCLEAN);
        }

        /*
         * The reloc trees are just snapshots, so we need them to appear to be
         * just like any other fs tree WRT lockdep.
         *
         * The exception however is in replace_path() in relocation, where we
         * hold the lock on the original fs root and then search for the reloc
         * root.  At that point we need to make sure any reloc root buffers are
         * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
         * lockdep happy.
         */
        lockdep_owner = owner;
        if (owner == BTRFS_TREE_RELOC_OBJECTID &&
            !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
                lockdep_owner = BTRFS_FS_TREE_OBJECTID;

        /* btrfs_clear_buffer_dirty() accesses generation field. */
        btrfs_set_header_generation(eb, trans->transid);

        /*
         * This needs to stay, because we could allocate a freed block from an
         * old tree into a new tree, so we need to make sure this new block is
         * set to the appropriate level and owner.
         */
        btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);

        btrfs_tree_lock_nested(eb, nest);
        btrfs_clear_buffer_dirty(trans, eb);
        clear_bit(EXTENT_BUFFER_STALE, &eb->bflags);
        clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags);

        set_extent_buffer_uptodate(eb);

        /* Start from a zeroed header and fill in every field we own. */
        memzero_extent_buffer(eb, 0, sizeof(struct btrfs_header));
        btrfs_set_header_level(eb, level);
        btrfs_set_header_bytenr(eb, eb->start);
        btrfs_set_header_generation(eb, trans->transid);
        btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
        btrfs_set_header_owner(eb, owner);
        write_extent_buffer_fsid(eb, fs_info->fs_devices->metadata_uuid);
        write_extent_buffer_chunk_tree_uuid(eb, fs_info->chunk_tree_uuid);

        range_end = eb->start + eb->len - 1;
        if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
                u32 bits;

                eb->log_index = root->log_transid % 2;
                /*
                 * We allow two log transactions at a time, use a different
                 * EXTENT bit for each of them to differentiate dirty pages.
                 */
                bits = (eb->log_index == 0) ? EXTENT_DIRTY : EXTENT_NEW;
                set_extent_bit(&root->dirty_log_pages, eb->start, range_end,
                               bits, NULL);
        } else {
                eb->log_index = -1;
                set_extent_bit(&trans->transaction->dirty_pages, eb->start,
                               range_end, EXTENT_DIRTY, NULL);
        }

        /* The buffer is returned locked. */
        return eb;
}

/*
 * Find a free extent for a new tree block and do all the work required to
 * allocate it: use the block reserve, reserve the extent, initialize the
 * buffer and, unless the block belongs to the log tree, queue the delayed
 * tree ref that will insert the extent item.
 *
 * Returns the locked tree buffer or an ERR_PTR() on error.
 */
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                                             struct btrfs_root *root,
                                             u64 parent, u64 root_objectid,
                                             const struct btrfs_disk_key *key,
                                             int level, u64 hint,
                                             u64 empty_size,
                                             u64 reloc_src_root,
                                             enum btrfs_lock_nesting nest)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        const bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
        const u32 blocksize = fs_info->nodesize;
        struct btrfs_delayed_extent_op *extent_op;
        struct btrfs_block_rsv *block_rsv;
        struct extent_buffer *buf;
        struct btrfs_key ins;
        u64 owning_root;
        u64 flags = 0;
        int ret;

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
        if (btrfs_is_testing(fs_info)) {
                /* Self tests hand out blocks from a simple bump allocator. */
                buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
                                            level, root_objectid, nest);
                if (!IS_ERR(buf))
                        root->alloc_bytenr += blocksize;
                return buf;
        }
#endif

        block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
        if (IS_ERR(block_rsv))
                return ERR_CAST(block_rsv);

        ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
                                   empty_size, hint, &ins, 0, 0);
        if (ret)
                goto out_unuse;

        buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
                                    root_objectid, nest);
        if (IS_ERR(buf)) {
                ret = PTR_ERR(buf);
                goto out_free_reserved;
        }
        owning_root = btrfs_header_owner(buf);

        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
                /* Reloc tree blocks always use full backrefs. */
                if (parent == 0)
                        parent = ins.objectid;
                flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
                owning_root = reloc_src_root;
        } else {
                BUG_ON(parent > 0);
        }

        /* No delayed ref is queued for log tree blocks. */
        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
                struct btrfs_ref generic_ref = {
                        .action = BTRFS_ADD_DELAYED_EXTENT,
                        .bytenr = ins.objectid,
                        .num_bytes = ins.offset,
                        .parent = parent,
                        .owning_root = owning_root,
                        .ref_root = root_objectid,
                };

                extent_op = btrfs_alloc_delayed_extent_op();
                if (!extent_op) {
                        ret = -ENOMEM;
                        goto out_free_buf;
                }

                if (!key)
                        memset(&extent_op->key, 0, sizeof(extent_op->key));
                else
                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
                extent_op->flags_to_set = flags;
                /* The key is only stored in the non-skinny item format. */
                extent_op->update_key = !skinny_metadata;
                extent_op->update_flags = true;
                extent_op->level = level;

                btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false);
                btrfs_ref_tree_mod(fs_info, &generic_ref);
                ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
                if (ret)
                        goto out_free_delayed;
        }
        return buf;

        /* Error unwinding, in reverse order of acquisition. */
out_free_delayed:
        btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
        btrfs_tree_unlock(buf);
        free_extent_buffer(buf);
out_free_reserved:
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
out_unuse:
        btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
        return ERR_PTR(ret);
}

/*
 * State carried while walking a tree with the walk_down / walk_up helpers.
 */
struct walk_control {
        /* Per level reference count, filled by btrfs_lookup_extent_info(). */
        u64 refs[BTRFS_MAX_LEVEL];
        /* Per level extent flags, filled by btrfs_lookup_extent_info(). */
        u64 flags[BTRFS_MAX_LEVEL];
        /*
         * Key that node keys are compared against during readahead in the
         * DROP_REFERENCE stage; children with a smaller key are skipped.
         */
        struct btrfs_key update_progress;
        /* NOTE(review): presumably the key the drop has progressed to. */
        struct btrfs_key drop_progress;
        /* NOTE(review): presumably the level drop_progress refers to. */
        int drop_level;
        /* Current stage, DROP_REFERENCE or UPDATE_BACKREF. */
        int stage;
        /* Level currently being processed, indexes path->nodes/slots/locks. */
        int level;
        /* NOTE(review): not used in the visible code; confirm meaning. */
        int shared_level;
        /* Non-zero if back refs may need updating, checked in readahead. */
        int update_ref;
        /* Non-zero to keep the tree lock in the DROP_REFERENCE stage. */
        int keep_locks;
        /* Slot at which the last readahead pass stopped. */
        int reada_slot;
        /* Number of children to read ahead per pass, adapted on each pass. */
        int reada_count;
        /* NOTE(review): not used in the visible code; confirm meaning. */
        int restarted;
};

/* Values for walk_control::stage. */
#define DROP_REFERENCE        1
#define UPDATE_BACKREF        2

/*
 * Issue readahead for the children of the node at wc->level that the walk is
 * going to visit.
 *
 * The readahead window (wc->reada_count) shrinks when the walk has not yet
 * reached the slot where the previous pass stopped and grows otherwise, capped
 * at the number of pointers that fit in a node. Everything here is best
 * effort: lookups are done without locking the tree block and any failure
 * simply skips the slot.
 */
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct walk_control *wc,
                                     struct btrfs_path *path)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 bytenr;
        u64 generation;
        u64 refs;
        u64 flags;
        u32 nritems;
        struct btrfs_key key;
        struct extent_buffer *eb;
        int ret;
        int slot;
        int nread = 0;

        if (path->slots[wc->level] < wc->reada_slot) {
                wc->reada_count = wc->reada_count * 2 / 3;
                wc->reada_count = max(wc->reada_count, 2);
        } else {
                wc->reada_count = wc->reada_count * 3 / 2;
                wc->reada_count = min_t(int, wc->reada_count,
                                        BTRFS_NODEPTRS_PER_BLOCK(fs_info));
        }

        eb = path->nodes[wc->level];
        nritems = btrfs_header_nritems(eb);

        for (slot = path->slots[wc->level]; slot < nritems; slot++) {
                if (nread >= wc->reada_count)
                        break;

                cond_resched();
                bytenr = btrfs_node_blockptr(eb, slot);
                generation = btrfs_node_ptr_generation(eb, slot);

                /* The block we are about to walk into is always read. */
                if (slot == path->slots[wc->level])
                        goto reada;

                if (wc->stage == UPDATE_BACKREF &&
                    generation <= root->root_key.offset)
                        continue;

                /* We don't lock the tree block, it's OK to be racy here */
                ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
                                               wc->level - 1, 1, &refs,
                                               &flags, NULL);
                /* We don't care about errors in readahead. */
                if (ret < 0)
                        continue;

                /*
                 * The lookup above is racy, so a bogus zero reference count
                 * is conceivable. Do not crash for a readahead hint; skip the
                 * slot. Real corruption is caught when the block is looked up
                 * again with the proper locks held.
                 */
                if (refs == 0)
                        continue;

                if (wc->stage == DROP_REFERENCE) {
                        if (refs == 1)
                                goto reada;

                        if (wc->level == 1 &&
                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
                                continue;
                        if (!wc->update_ref ||
                            generation <= root->root_key.offset)
                                continue;
                        btrfs_node_key_to_cpu(eb, &key, slot);
                        ret = btrfs_comp_cpu_keys(&key,
                                                  &wc->update_progress);
                        if (ret < 0)
                                continue;
                } else {
                        if (wc->level == 1 &&
                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
                                continue;
                }
reada:
                btrfs_readahead_node_child(eb, slot);
                nread++;
        }
        /* Remember where this pass stopped for the next window adjustment. */
        wc->reada_slot = slot;
}

/*
 * helper to process tree block while walking down the tree.
 *
 * when wc->stage == UPDATE_BACKREF, this function updates
 * back refs for pointers in the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct btrfs_path *path,
                                   struct walk_control *wc, int lookup_info)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int level = wc->level;
        struct extent_buffer *eb = path->nodes[level];
        u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
        int ret;

        if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root))
                return 1;

        /*
         * when reference count of tree block is 1, it won't increase
         * again. once full backref flag is set, we never clear it.
         */
        if (lookup_info &&
            ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
             (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
                BUG_ON(!path->locks[level]);
                ret = btrfs_lookup_extent_info(trans, fs_info,
                                               eb->start, level, 1,
                                               &wc->refs[level],
                                               &wc->flags[level],
                                               NULL);
                BUG_ON(ret == -ENOMEM);
                if (ret)
                        return ret;
                BUG_ON(wc->refs[level] == 0);
        }

        if (wc->stage == DROP_REFERENCE) {
                if (wc->refs[level] > 1)
                        return 1;

                if (path->locks[level] && !wc->keep_locks) {
                        btrfs_tree_unlock_rw(eb, path->locks[level]);
                        path->locks[level] = 0;
                }
                return 0;
        }

        /* wc->stage == UPDATE_BACKREF */
        if (!(wc->flags[level] & flag)) {
                BUG_ON(!path->locks[level]);
                ret = btrfs_inc_ref(trans, root, eb, 1);
                BUG_ON(ret); /* -ENOMEM */
                ret = btrfs_dec_ref(trans, root, eb, 0);
                BUG_ON(ret); /* -ENOMEM */
                ret = btrfs_set_disk_extent_flags(trans, eb, flag);
                BUG_ON(ret); /* -ENOMEM */
                wc->flags[level] |= flag;
        }

        /*
         * the block is shared by multiple trees, so it's not good to
         * keep the tree lock
         */
        if (path->locks[level] && level > 0) {
                btrfs_tree_unlock_rw(eb, path->locks[level]);
                path->locks[level] = 0;
        }
        return 0;
}

/*
 * Verify that a backref for this root exists on the tree block at @bytenr.
 * This deals with a bug where we would have a drop_progress key that hadn't
 * been updated properly.
 *
 * Returns 1 if the ref exists, 0 if the lookup reported -ENOENT, or another
 * negative errno on failure.
 */
static int check_ref_exists(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, u64 bytenr, u64 parent,
                            int level)
{
        struct btrfs_extent_inline_ref *iref;
        struct btrfs_path *path = btrfs_alloc_path();
        int ret;

        if (!path)
                return -ENOMEM;

        ret = lookup_extent_backref(trans, path, &iref, bytenr,
                                    root->fs_info->nodesize, parent,
                                    btrfs_root_id(root), level, 0);
        /* Only the lookup result matters, the path is not needed anymore. */
        btrfs_free_path(path);

        if (ret == -ENOENT)
                return 0;

        return (ret < 0) ? ret : 1;
}

/*
 * Helper to process the tree block pointer stored at
 * path->slots[wc->level] of path->nodes[wc->level].
 *
 * When wc->stage == DROP_REFERENCE, this function checks the reference
 * count of the block pointed to.  If the block is shared and we need to
 * update back refs for the subtree rooted at the block, this function
 * changes wc->stage to UPDATE_BACKREF.  If the block is shared and there
 * is no need to update back refs, this function drops the reference to
 * the block.
 *
 * When the walk descends (return value 0), the child block is stored in
 * @path one level lower, write locked, and wc->level is decremented.
 *
 * @lookup_info is an output flag: it is cleared after the refs and flags of
 * the child were looked up successfully here, and set to 1 when the child is
 * skipped or its cached buffer had to be dropped and re-read.
 *
 * NOTE: return value 1 means we should stop walking down.  A negative
 * return value is an error.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct walk_control *wc, int *lookup_info)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 bytenr;
        u64 generation;
        u64 owner_root = 0;
        struct btrfs_tree_parent_check check = { 0 };
        struct btrfs_key key;
        struct extent_buffer *next;
        int level = wc->level;
        int reada = 0;
        int ret = 0;
        bool need_account = false;

        generation = btrfs_node_ptr_generation(path->nodes[level],
                                               path->slots[level]);
        /*
         * if the lower level block was created before the snapshot
         * was created, we know there is no need to update back refs
         * for the subtree
         */
        if (wc->stage == UPDATE_BACKREF &&
            generation <= root->root_key.offset) {
                *lookup_info = 1;
                return 1;
        }

        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);

        /* Expected properties of the child, used if it has to be read below. */
        check.level = level - 1;
        check.transid = generation;
        check.owner_root = btrfs_root_id(root);
        check.has_first_key = true;
        btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
                              path->slots[level]);

        next = find_extent_buffer(fs_info, bytenr);
        if (!next) {
                next = btrfs_find_create_tree_block(fs_info, bytenr,
                                btrfs_root_id(root), level - 1);
                if (IS_ERR(next))
                        return PTR_ERR(next);
                /* No existing buffer was found; readahead may be issued below. */
                reada = 1;
        }
        btrfs_tree_lock(next);

        /* Fetch reference count, flags and owner root of the child block. */
        ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
                                       &wc->refs[level - 1],
                                       &wc->flags[level - 1],
                                       &owner_root);
        if (ret < 0)
                goto out_unlock;

        if (unlikely(wc->refs[level - 1] == 0)) {
                btrfs_err(fs_info, "Missing references.");
                ret = -EIO;
                goto out_unlock;
        }
        *lookup_info = 0;

        if (wc->stage == DROP_REFERENCE) {
                /* More than one reference: the child is shared. */
                if (wc->refs[level - 1] > 1) {
                        need_account = true;
                        if (level == 1 &&
                            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
                                goto skip;

                        if (!wc->update_ref ||
                            generation <= root->root_key.offset)
                                goto skip;

                        btrfs_node_key_to_cpu(path->nodes[level], &key,
                                              path->slots[level]);
                        /* Skip when the key compares below wc->update_progress. */
                        ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
                        if (ret < 0)
                                goto skip;

                        /* Back refs of the shared subtree must be updated first. */
                        wc->stage = UPDATE_BACKREF;
                        wc->shared_level = level - 1;
                }
        } else {
                if (level == 1 &&
                    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
                        goto skip;
        }

        /*
         * The buffer we hold is not up to date for this generation: drop it
         * (lock and reference) so that it gets read below.
         */
        if (!btrfs_buffer_uptodate(next, generation, 0)) {
                btrfs_tree_unlock(next);
                free_extent_buffer(next);
                next = NULL;
                *lookup_info = 1;
        }

        if (!next) {
                if (reada && level == 1)
                        reada_walk_down(trans, root, wc, path);
                next = read_tree_block(fs_info, bytenr, &check);
                /*
                 * On both failures below 'next' is not locked and no reference
                 * is left to drop, so return directly instead of jumping to
                 * out_unlock.
                 */
                if (IS_ERR(next)) {
                        return PTR_ERR(next);
                } else if (!extent_buffer_uptodate(next)) {
                        free_extent_buffer(next);
                        return -EIO;
                }
                btrfs_tree_lock(next);
        }

        /* Descend: from here on 'level' refers to the child's level. */
        level--;
        ASSERT(level == btrfs_header_level(next));
        if (level != btrfs_header_level(next)) {
                btrfs_err(root->fs_info, "mismatched level");
                ret = -EIO;
                goto out_unlock;
        }
        path->nodes[level] = next;
        path->slots[level] = 0;
        path->locks[level] = BTRFS_WRITE_LOCK;
        wc->level = level;
        if (wc->level == 1)
                wc->reada_slot = 0;
        return 0;
skip:
        /* The child is not walked into; forget its cached refs and flags. */
        wc->refs[level - 1] = 0;
        wc->flags[level - 1] = 0;
        if (wc->stage == DROP_REFERENCE) {
                struct btrfs_ref ref = {
                        .action = BTRFS_DROP_DELAYED_REF,
                        .bytenr = bytenr,
                        .num_bytes = fs_info->nodesize,
                        .owning_root = owner_root,
                        .ref_root = btrfs_root_id(root),
                };
                /*
                 * With a full backref on the parent node the ref is keyed by
                 * the parent's bytenr; otherwise the parent node must be owned
                 * by this root.
                 */
                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        ref.parent = path->nodes[level]->start;
                } else {
                        ASSERT(btrfs_root_id(root) ==
                               btrfs_header_owner(path->nodes[level]));
                        if (btrfs_root_id(root) !=
                            btrfs_header_owner(path->nodes[level])) {
                                btrfs_err(root->fs_info,
                                                "mismatched block owner");
                                ret = -EIO;
                                goto out_unlock;
                        }
                }

                /*
                 * If we had a drop_progress we need to verify the refs are set
                 * as expected.  If we find our ref then we know that from here
                 * on out everything should be correct, and we can clear the
                 * ->restarted flag.
                 */
                if (wc->restarted) {
                        ret = check_ref_exists(trans, root, bytenr, ref.parent,
                                               level - 1);
                        if (ret < 0)
                                goto out_unlock;
                        /* No ref found for this root: nothing to drop here. */
                        if (ret == 0)
                                goto no_delete;
                        ret = 0;
                        wc->restarted = 0;
                }

                /*
                 * Reloc tree doesn't contribute to qgroup numbers, and we have
                 * already accounted them at merge time (replace_path),
                 * thus we could skip expensive subtree trace here.
                 */
                if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && need_account) {
                        ret = btrfs_qgroup_trace_subtree(trans, next,
                                                         generation, level - 1);
                        /* A trace failure is only reported, it does not stop the drop. */
                        if (ret) {
                                btrfs_err_rl(fs_info,
                                             "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
                                             ret);
                        }
                }

                /*
                 * We need to update the next key in our walk control so we can
                 * update the drop_progress key accordingly.  We don't care if
                 * find_next_key doesn't find a key because that means we're at
                 * the end and are going to clean up now.
                 */
                wc->drop_level = level;
                find_next_key(path, level, &wc->drop_progress);

                /* Drop this root's reference on the child block. */
                btrfs_init_tree_ref(&ref, level - 1, 0, false);
                ret = btrfs_free_extent(trans, &ref);
                if (ret)
                        goto out_unlock;
        }
no_delete:
        /* Tell the caller to stop walking down at this slot. */
        *lookup_info = 1;
        ret = 1;

out_unlock:
        /* Common exit for every path that still holds 'next' locked. */
        btrfs_tree_unlock(next);
        free_extent_buffer(next);

        return ret;
}

/*
 * helper to process tree block while walking up the tree.
 *
 * when wc->stage == DROP_REFERENCE, this function drops
 * reference count on the block.
 *
 * when wc->stage == UPDATE_BACKREF, this function changes
 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 * to UPDATE_BACKREF previously while processing the block.
 *
 * NOTE: return value 1 means we should stop walking up.
 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct walk_control *wc)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;
        int level = wc->level;
        struct extent_buffer *eb = path->nodes[level];
        u64 parent = 0;

        if (wc->stage == UPDATE_BACKREF) {
                BUG_ON(wc->shared_level < level);
                if (level < wc->shared_level)
                        goto out;

                ret = find_next_key(path, level + 1, &wc->update_progress);
                if (ret > 0)
                        wc->update_ref = 0;

                wc->stage = DROP_REFERENCE;
                wc->shared_level = -1;
                path->slots[level] = 0;

                /*
                 * check reference count again if the block isn't locked.
                 * we should start walking down the tree again if reference
                 * count is one.
                 */
                if (!path->locks[level]) {
                        BUG_ON(level == 0);
                        btrfs_tree_lock(eb);
                        path->locks[level] = BTRFS_WRITE_LOCK;

                        ret = btrfs_lookup_extent_info(trans, fs_info,
                                                       eb->start, level, 1,
                                                       &wc->refs[level],
                                                       &wc->flags[level],
                                                       NULL);
                        if (ret < 0) {
                                btrfs_tree_unlock_rw(eb, path->locks[level]);
                                path->locks[level] = 0;
                                return ret;
                        }
                        BUG_ON(wc->refs[level] == 0);
                        if (wc->refs[level] == 1) {
                                btrfs_tree_unlock_rw(eb, path->locks[level]);
                                path->locks[level] = 0;
                                return 1;
                        }
                }
        }

        /* wc->stage == DROP_REFERENCE */
        BUG_ON(wc->refs[level] > 1 && !path->locks[level]);

        if (wc->refs[level] == 1) {
                if (level == 0) {
                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
                                ret = btrfs_dec_ref(trans, root, eb, 1);
                        else
                                ret = btrfs_dec_ref(trans, root, eb, 0);
                        BUG_ON(ret); /* -ENOMEM */
                        if (is_fstree(btrfs_root_id(root))) {
                                ret = btrfs_qgroup_trace_leaf_items(trans, eb);
                                if (ret) {
                                        btrfs_err_rl(fs_info,
        "error %d accounting leaf items, quota is out of sync, rescan required",
                                             ret);
                                }
                        }
                }
                /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
                if (!path->locks[level]) {
                        btrfs_tree_lock(eb);
                        path->locks[level] = BTRFS_WRITE_LOCK;
                }
                btrfs_clear_buffer_dirty(trans, eb);
        }

        if (eb == root->node) {
                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
                        parent = eb->start;
                else if (btrfs_root_id(root) != btrfs_header_owner(eb))
                        goto owner_mismatch;
        } else {
                if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
                        parent = path->nodes[level + 1]->start;
                else if (btrfs_root_id(root) !=
                         btrfs_header_owner(path->nodes[level + 1]))
                        goto owner_mismatch;
        }

        btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
                              wc->refs[level] == 1);
out:
        wc->refs[level] = 0;
        wc->flags[level] = 0;
        return 0;

owner_mismatch:
        btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
                     btrfs_header_owner(eb), btrfs_root_id(root));
        return -EUCLEAN;
}

/*
 * Walk down from wc->level, processing each block with walk_down_proc() and
 * each child pointer with do_walk_down(), until a leaf is reached, the
 * current node has no slot left, or one of the helpers asks to stop.
 *
 * A final helper result of 1 ("stop walking") is reported as 0; any other
 * value is returned unchanged.
 */
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct btrfs_path *path,
                                   struct walk_control *wc)
{
        int cur_level = wc->level;
        int need_lookup = 1;
        int err = 0;

        while (cur_level >= 0) {
                err = walk_down_proc(trans, root, path, wc, need_lookup);
                if (err != 0)
                        break;

                /* Nothing below a leaf, and nothing left past the last slot. */
                if (cur_level == 0 ||
                    path->slots[cur_level] >=
                    btrfs_header_nritems(path->nodes[cur_level]))
                        break;

                err = do_walk_down(trans, root, path, wc, &need_lookup);
                if (err < 0)
                        break;
                if (err > 0) {
                        /* Child was skipped, move on to the next pointer. */
                        path->slots[cur_level]++;
                        continue;
                }

                /* Descended one level. */
                cur_level = wc->level;
        }

        if (err == 1)
                return 0;
        return err;
}

/*
 * Walk up from wc->level towards @max_level.
 *
 * Returns 0 when the walk should continue downwards again (either a node
 * with an unvisited slot was found, or walk_up_proc() asked to stop), 1 when
 * @max_level or the top of the path was reached, and a negative errno when
 * walk_up_proc() failed.
 */
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct walk_control *wc, int max_level)
{
        int lvl = wc->level;

        /* Mark the starting block as fully consumed. */
        path->slots[lvl] = btrfs_header_nritems(path->nodes[lvl]);

        for (; lvl < max_level && path->nodes[lvl]; lvl++) {
                struct extent_buffer *eb = path->nodes[lvl];
                int err;

                wc->level = lvl;

                /* Another pointer is left in this node: step to it. */
                if (path->slots[lvl] + 1 < btrfs_header_nritems(eb)) {
                        path->slots[lvl]++;
                        return 0;
                }

                err = walk_up_proc(trans, root, path, wc);
                if (err > 0)
                        return 0;
                if (err < 0)
                        return err;

                /* This block is finished, release it before going up. */
                if (path->locks[lvl]) {
                        btrfs_tree_unlock_rw(eb, path->locks[lvl]);
                        path->locks[lvl] = 0;
                }
                free_extent_buffer(eb);
                path->nodes[lvl] = NULL;
        }

        return 1;
}

/*
 * Drop a subvolume tree.
 *
 * This function traverses the tree, freeing any blocks that are only
 * referenced by the tree.
 *
 * When a shared tree block is found, this function decreases its
 * reference count by one.  If update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * @root:       root of the tree to drop
 * @update_ref: stored in wc->update_ref, see above
 * @for_reloc:  when non-zero the transaction is joined instead of started,
 *              the drop never exits early and the root is not put back on
 *              the dead root list on failure
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN.
 *
 * Returns 0 on success or a negative errno on failure.
 */
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
        const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID);
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root_item *root_item = &root->root_item;
        struct walk_control *wc;
        struct btrfs_key key;
        int err = 0;
        int ret;
        int level;
        bool root_dropped = false;
        bool unfinished_drop = false;

        btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root));

        path = btrfs_alloc_path();
        if (!path) {
                err = -ENOMEM;
                goto out;
        }

        wc = kzalloc(sizeof(*wc), GFP_NOFS);
        if (!wc) {
                btrfs_free_path(path);
                err = -ENOMEM;
                goto out;
        }

        /*
         * Use join to avoid potential EINTR from transaction start. See
         * wait_reserve_ticket and the whole reservation callchain.
         */
        if (for_reloc)
                trans = btrfs_join_transaction(tree_root);
        else
                trans = btrfs_start_transaction(tree_root, 0);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto out_free;
        }

        err = btrfs_run_delayed_items(trans);
        if (err)
                goto out_end_trans;

        /*
         * This will help us catch people modifying the fs tree while we're
         * dropping it.  It is unsafe to mess with the fs tree while it's being
         * dropped as we unlock the root node and parent nodes as we walk down
         * the tree, assuming nothing will change.  If something does change
         * then we'll have stale information and drop references to blocks we've
         * already dropped.
         */
        set_bit(BTRFS_ROOT_DELETING, &root->state);
        unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);

        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
                /* No saved progress: start the walk at the locked root node. */
                level = btrfs_header_level(root->node);
                path->nodes[level] = btrfs_lock_root_node(root);
                path->slots[level] = 0;
                path->locks[level] = BTRFS_WRITE_LOCK;
                memset(&wc->update_progress, 0,
                       sizeof(wc->update_progress));
        } else {
                /*
                 * Resume an earlier drop: position the path at the saved
                 * drop_progress key on the saved drop level.
                 */
                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
                memcpy(&wc->update_progress, &key,
                       sizeof(wc->update_progress));

                level = btrfs_root_drop_level(root_item);
                BUG_ON(level == 0);
                path->lowest_level = level;
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                path->lowest_level = 0;
                if (ret < 0) {
                        err = ret;
                        goto out_end_trans;
                }
                WARN_ON(ret > 0);

                /*
                 * unlock our path, this is safe because only this
                 * function is allowed to delete this snapshot
                 */
                btrfs_unlock_up_safe(path, 0);

                /*
                 * Walk from the root level down to the drop level, refreshing
                 * the cached refs and flags of every node on the way.  Only
                 * the node at the drop level is left locked.
                 */
                level = btrfs_header_level(root->node);
                while (1) {
                        btrfs_tree_lock(path->nodes[level]);
                        path->locks[level] = BTRFS_WRITE_LOCK;

                        ret = btrfs_lookup_extent_info(trans, fs_info,
                                                path->nodes[level]->start,
                                                level, 1, &wc->refs[level],
                                                &wc->flags[level], NULL);
                        if (ret < 0) {
                                err = ret;
                                goto out_end_trans;
                        }
                        BUG_ON(wc->refs[level] == 0);

                        if (level == btrfs_root_drop_level(root_item))
                                break;

                        btrfs_tree_unlock(path->nodes[level]);
                        path->locks[level] = 0;
                        WARN_ON(wc->refs[level] != 1);
                        level--;
                }
        }

        /*
         * With ->restarted set, do_walk_down() verifies via check_ref_exists()
         * that a ref is really there before dropping it.
         */
        wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
        wc->level = level;
        wc->shared_level = -1;
        wc->stage = DROP_REFERENCE;
        wc->update_ref = update_ref;
        wc->keep_locks = 0;
        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

        /* Alternate between walking down and up until the whole tree is gone. */
        while (1) {

                ret = walk_down_tree(trans, root, path, wc);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        err = ret;
                        break;
                }

                ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        err = ret;
                        break;
                }

                /* walk_up_tree() returned 1: nothing is left to walk. */
                if (ret > 0) {
                        BUG_ON(wc->stage != DROP_REFERENCE);
                        break;
                }

                /* Record in the root item where the drop has to be resumed. */
                if (wc->stage == DROP_REFERENCE) {
                        wc->drop_level = wc->level;
                        btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
                                              &wc->drop_progress,
                                              path->slots[wc->drop_level]);
                }
                btrfs_cpu_key_to_disk(&root_item->drop_progress,
                                      &wc->drop_progress);
                btrfs_set_root_drop_level(root_item, wc->drop_level);

                BUG_ON(wc->level == 0);
                /*
                 * Write the progress to the tree root and switch to a new
                 * transaction when this one should end or, unless for_reloc,
                 * when the cleaner is asked to sleep.
                 */
                if (btrfs_should_end_transaction(trans) ||
                    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
                        ret = btrfs_update_root(trans, tree_root,
                                                &root->root_key,
                                                root_item);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                err = ret;
                                goto out_end_trans;
                        }

                        if (!is_reloc_root)
                                btrfs_set_last_root_drop_gen(fs_info, trans->transid);

                        btrfs_end_transaction_throttle(trans);
                        if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
                                btrfs_debug(fs_info,
                                            "drop snapshot early exit");
                                err = -EAGAIN;
                                goto out_free;
                        }

                        /*
                         * Use join to avoid potential EINTR from transaction
                         * start. See wait_reserve_ticket and the whole
                         * reservation callchain.
                         */
                        if (for_reloc)
                                trans = btrfs_join_transaction(tree_root);
                        else
                                trans = btrfs_start_transaction(tree_root, 0);
                        /* No transaction is held here, so skip out_end_trans. */
                        if (IS_ERR(trans)) {
                                err = PTR_ERR(trans);
                                goto out_free;
                        }
                }
        }
        btrfs_release_path(path);
        if (err)
                goto out_end_trans;

        /* The tree is fully dropped, now remove its root item. */
        ret = btrfs_del_root(trans, &root->root_key);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                err = ret;
                goto out_end_trans;
        }

        if (!is_reloc_root) {
                ret = btrfs_find_root(tree_root, &root->root_key, path,
                                      NULL, NULL);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        err = ret;
                        goto out_end_trans;
                } else if (ret > 0) {
                        /*
                         * If we fail to delete the orphan item this time
                         * around, it'll get picked up the next time.
                         *
                         * The most common failure here is just -ENOENT.
                         */
                        btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root));
                }
        }

        /*
         * This subvolume is going to be completely dropped, and won't be
         * recorded as dirty roots, thus pertrans meta rsv will not be freed at
         * commit transaction time.  So free it here manually.
         */
        btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
        btrfs_qgroup_free_meta_all_pertrans(root);

        if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
                btrfs_add_dropped_root(trans, root);
        else
                btrfs_put_root(root);
        root_dropped = true;
out_end_trans:
        if (!is_reloc_root)
                btrfs_set_last_root_drop_gen(fs_info, trans->transid);

        btrfs_end_transaction_throttle(trans);
out_free:
        kfree(wc);
        btrfs_free_path(path);
out:
        /*
         * We were an unfinished drop root, check to see if there are any
         * pending, and if not clear and wake up any waiters.
         */
        if (!err && unfinished_drop)
                btrfs_maybe_wake_unfinished_drop(fs_info);

        /*
         * So if we need to stop dropping the snapshot for whatever reason we
         * need to make sure to add it back to the dead root list so that we
         * keep trying to do the work later.  This also cleans up roots if we
         * don't have it in the radix (like when we recover after a power fail
         * or unmount) so we don't leak memory.
         */
        if (!for_reloc && !root_dropped)
                btrfs_add_dead_root(root);
        return err;
}

/*
 * Drop the subtree rooted at tree block 'node', whose parent block is
 * 'parent'.  Both blocks must be write locked by the caller and @root must
 * be a reloc tree root.
 *
 * NOTE: this function will unlock and release tree block 'node'.
 * Only used by relocation code.
 *
 * Returns 0 on success or a negative errno on failure.
 */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct extent_buffer *node,
                        struct extent_buffer *parent)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct walk_control *wc;
        struct btrfs_path *path;
        int node_level;
        int parent_level;
        int ret = 0;

        BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        wc = kzalloc(sizeof(*wc), GFP_NOFS);
        if (!wc) {
                ret = -ENOMEM;
                goto out_free_path;
        }

        /*
         * The parent goes into the path without a lock entry; the extra
         * reference taken here is the one dropped when the path is freed.
         */
        btrfs_assert_tree_write_locked(parent);
        parent_level = btrfs_header_level(parent);
        atomic_inc(&parent->refs);
        path->nodes[parent_level] = parent;
        path->slots[parent_level] = btrfs_header_nritems(parent);

        /* The node is where the walk starts; the path owns its lock now. */
        btrfs_assert_tree_write_locked(node);
        node_level = btrfs_header_level(node);
        path->nodes[node_level] = node;
        path->slots[node_level] = 0;
        path->locks[node_level] = BTRFS_WRITE_LOCK;

        wc->refs[parent_level] = 1;
        wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
        wc->level = node_level;
        wc->shared_level = -1;
        wc->stage = DROP_REFERENCE;
        wc->update_ref = 0;
        wc->keep_locks = 1;
        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

        /* Keep walking down and up until the parent level is reached. */
        do {
                ret = walk_down_tree(trans, root, path, wc);
                if (ret < 0)
                        break;

                ret = walk_up_tree(trans, root, path, wc, parent_level);
        } while (ret == 0);

        /* A positive result only means the walk is complete. */
        if (ret > 0)
                ret = 0;

        kfree(wc);
out_free_path:
        btrfs_free_path(path);
        return ret;
}

/*
 * Error-path helper: unpin the extent range [start, end] without adding the
 * space back.  The result of the unpin is not propagated to the caller.
 */
void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
{
        /* In an error context the space must not be added back. */
        const bool add_space_back = false;

        unpin_extent_range(fs_info, start, end, add_space_back);
}

/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time.  We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released or
 * allocated in the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses.  For that, we need to take a reference to the
 * transaction and hold the commit root sem.  We only need to hold
 * it while performing the free space search since we have already
 * held back allocations.
 */
/*
 * Discard the ranges of @device that have neither CHUNK_TRIMMED nor
 * CHUNK_ALLOCATED set in device->alloc_state, one range per loop iteration.
 *
 * @device:  device to trim
 * @trimmed: set to 0 on entry, then incremented by the byte count reported
 *           by each successful btrfs_issue_discard() call
 *
 * Returns 0 when there is nothing (more) to trim, the error returned by
 * mutex_lock_interruptible() or btrfs_issue_discard(), or -ERESTARTSYS when
 * a fatal signal is pending after a discard.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
        u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
        int ret;

        *trimmed = 0;

        /* Discard not supported = nothing to do. */
        if (!bdev_max_discard_sectors(device->bdev))
                return 0;

        /* Not writable = nothing to do. */
        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
                return 0;

        /* No free space = nothing to do. */
        if (device->total_bytes <= device->bytes_used)
                return 0;

        ret = 0;

        while (1) {
                struct btrfs_fs_info *fs_info = device->fs_info;
                u64 bytes;

                /*
                 * chunk_mutex is taken anew on every iteration and released
                 * on every path before the next iteration or loop exit.
                 */
                ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
                if (ret)
                        break;

                /* Look for the next range at or after @start with both bits clear. */
                find_first_clear_extent_bit(&device->alloc_state, start,
                                            &start, &end,
                                            CHUNK_TRIMMED | CHUNK_ALLOCATED);

                /* Check if there are any CHUNK_* bits left */
                if (start > device->total_bytes) {
                        WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
                        btrfs_warn_in_rcu(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
                                          start, end - start + 1,
                                          btrfs_dev_name(device),
                                          device->total_bytes);
                        mutex_unlock(&fs_info->chunk_mutex);
                        ret = 0;
                        break;
                }

                /* Ensure we skip the reserved space on each device. */
                start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);

                /*
                 * If find_first_clear_extent_bit() finds a range that spans
                 * the end of the device it will set end to -1; in this case
                 * it's up to the caller to trim the value to the size of the
                 * device.
                 */
                end = min(end, device->total_bytes - 1);

                len = end - start + 1;

                /* We didn't find any extents */
                if (!len) {
                        mutex_unlock(&fs_info->chunk_mutex);
                        ret = 0;
                        break;
                }

                ret = btrfs_issue_discard(device->bdev, start, len,
                                          &bytes);
                /* Only the bytes reported as discarded are marked trimmed. */
                if (!ret)
                        set_extent_bit(&device->alloc_state, start,
                                       start + bytes - 1, CHUNK_TRIMMED, NULL);
                mutex_unlock(&fs_info->chunk_mutex);

                if (ret)
                        break;

                /* Advance past the whole range found, not just @bytes. */
                start += len;
                *trimmed += bytes;

                if (fatal_signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }

                cond_resched();
        }

        return ret;
}

/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * @fs_info: filesystem to trim
 * @range:   trim request; ->start, ->len and ->minlen select what to trim in
 *           the block groups, and ->len is overwritten with the total number
 *           of bytes trimmed
 *
 * A failing block group does not stop the walk over the remaining block
 * groups.  The device walk stops at the first device that reports an error,
 * but the bytes that were discarded on that device before the error are still
 * accounted in range->len.
 *
 * Returns -EINVAL for an invalid range; otherwise the last block group error
 * if there was one, else the device error, else 0.
 */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_block_group *cache = NULL;
        struct btrfs_device *device;
        u64 group_trimmed;
        u64 range_end = U64_MAX;
        u64 start;
        u64 end;
        u64 trimmed = 0;
        u64 bg_failed = 0;
        u64 dev_failed = 0;
        int bg_ret = 0;
        int dev_ret = 0;
        int ret = 0;

        if (range->start == U64_MAX)
                return -EINVAL;

        /*
         * Check range overflow if range->len is set.
         * The default range->len is U64_MAX.
         */
        if (range->len != U64_MAX &&
            check_add_overflow(range->start, range->len, &range_end))
                return -EINVAL;

        cache = btrfs_lookup_first_block_group(fs_info, range->start);
        for (; cache; cache = btrfs_next_block_group(cache)) {
                if (cache->start >= range_end) {
                        btrfs_put_block_group(cache);
                        break;
                }

                /* Clamp the request to the part that overlaps this group. */
                start = max(range->start, cache->start);
                end = min(range_end, cache->start + cache->length);

                if (end - start >= range->minlen) {
                        if (!btrfs_block_group_done(cache)) {
                                ret = btrfs_cache_block_group(cache, true);
                                if (ret) {
                                        bg_failed++;
                                        bg_ret = ret;
                                        continue;
                                }
                        }
                        ret = btrfs_trim_block_group(cache,
                                                     &group_trimmed,
                                                     start,
                                                     end,
                                                     range->minlen);

                        /* Count partial progress even when the trim failed. */
                        trimmed += group_trimmed;
                        if (ret) {
                                bg_failed++;
                                bg_ret = ret;
                                continue;
                        }
                }
        }

        if (bg_failed)
                btrfs_warn(fs_info,
                        "failed to trim %llu block group(s), last error %d",
                        bg_failed, bg_ret);

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                        continue;

                ret = btrfs_trim_free_extents(device, &group_trimmed);

                /*
                 * btrfs_trim_free_extents() always initializes group_trimmed
                 * and may have discarded some ranges before failing, so
                 * account for them before looking at the error, just like
                 * the block group loop above does.
                 */
                trimmed += group_trimmed;
                if (ret) {
                        dev_failed++;
                        dev_ret = ret;
                        break;
                }
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        if (dev_failed)
                btrfs_warn(fs_info,
                        "failed to trim %llu device(s), last error %d",
                        dev_failed, dev_ret);
        range->len = trimmed;
        if (bg_ret)
                return bg_ret;
        return dev_ret;
}

















































































































































































































































































































































































































































































































































































    1 







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SWAPOPS_H
#define _LINUX_SWAPOPS_H

#include <linux/radix-tree.h>
#include <linux/bug.h>
#include <linux/mm_types.h>

#ifdef CONFIG_MMU

#ifdef CONFIG_SWAP
#include <linux/swapfile.h>
#endif        /* CONFIG_SWAP */

/*
 * swapcache pages are stored in the swapper_space radix tree.  We want to
 * get good packing density in that tree, so the index should be dense in
 * the low-order bits.
 *
 * We arrange the `type' and `offset' fields so that `type' is at the six
 * high-order bits of the swp_entry_t and `offset' is right-aligned in the
 * remaining bits.  Although `type' itself needs only five bits, we allow for
 * shmem/tmpfs to shift it all up a further one bit: see swp_to_radix_entry().
 *
 * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
 */
#define SWP_TYPE_SHIFT        (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
#define SWP_OFFSET_MASK        ((1UL << SWP_TYPE_SHIFT) - 1)

/*
 * Definitions only for PFN swap entries (see is_pfn_swap_entry()).  To
 * store PFN, we only need SWP_PFN_BITS bits.  Each of the pfn swap entries
 * can use the extra bits to store other information besides PFN.
 */
#ifdef MAX_PHYSMEM_BITS
#define SWP_PFN_BITS                (MAX_PHYSMEM_BITS - PAGE_SHIFT)
#else  /* MAX_PHYSMEM_BITS */
#define SWP_PFN_BITS                min_t(int, \
                                      sizeof(phys_addr_t) * 8 - PAGE_SHIFT, \
                                      SWP_TYPE_SHIFT)
#endif        /* MAX_PHYSMEM_BITS */
#define SWP_PFN_MASK                (BIT(SWP_PFN_BITS) - 1)

/**
 * Migration swap entry specific bitfield definitions.  Layout:
 *
 *   |----------+--------------------|
 *   | swp_type | swp_offset         |
 *   |----------+--------+-+-+-------|
 *   |          | resv   |D|A|  PFN  |
 *   |----------+--------+-+-+-------|
 *
 * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A)
 * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D)
 *
 * Note: A/D bits will be stored in migration entries iff there're enough
 * free bits in arch specific swp offset.  By default we'll ignore A/D bits
 * when migrating a page.  Please refer to migration_entry_supports_ad()
 * for more information.  If there're more bits besides PFN and A/D bits,
 * they should be reserved and always be zeros.
 */
#define SWP_MIG_YOUNG_BIT                (SWP_PFN_BITS)
#define SWP_MIG_DIRTY_BIT                (SWP_PFN_BITS + 1)
#define SWP_MIG_TOTAL_BITS                (SWP_PFN_BITS + 2)

#define SWP_MIG_YOUNG                        BIT(SWP_MIG_YOUNG_BIT)
#define SWP_MIG_DIRTY                        BIT(SWP_MIG_DIRTY_BIT)

static inline bool is_pfn_swap_entry(swp_entry_t entry);

/* Clear all flags but only keep swp_entry_t related information */
static inline pte_t pte_swp_clear_flags(pte_t pte)
{
        pte_t stripped = pte;

        /* Drop each software flag only if it is currently set. */
        if (pte_swp_exclusive(stripped))
                stripped = pte_swp_clear_exclusive(stripped);

        if (pte_swp_soft_dirty(stripped))
                stripped = pte_swp_clear_soft_dirty(stripped);

        if (pte_swp_uffd_wp(stripped))
                stripped = pte_swp_clear_uffd_wp(stripped);

        return stripped;
}

/*
 * Store a type+offset into a swp_entry_t in an arch-independent format
 */
static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
{
        swp_entry_t entry;

        /* Type goes above SWP_TYPE_SHIFT, offset is masked into the low bits. */
        entry.val = offset & SWP_OFFSET_MASK;
        entry.val |= type << SWP_TYPE_SHIFT;

        return entry;
}

/*
 * Extract the `type' field from a swp_entry_t.  The swp_entry_t is in
 * arch-independent format
 */
static inline unsigned swp_type(swp_entry_t entry)
{
        unsigned type = entry.val >> SWP_TYPE_SHIFT;

        return type;
}

/*
 * Extract the `offset' field from a swp_entry_t.  The swp_entry_t is in
 * arch-independent format
 */
static inline pgoff_t swp_offset(swp_entry_t entry)
{
        pgoff_t offset = entry.val & SWP_OFFSET_MASK;

        return offset;
}

/*
 * Return the PFN stored in a pfn swap entry.  Must only be called on a pfn
 * swap entry; please refer to is_pfn_swap_entry() for the definition of one.
 */
static inline unsigned long swp_offset_pfn(swp_entry_t entry)
{
        unsigned long pfn;

        VM_BUG_ON(!is_pfn_swap_entry(entry));

        /* Only the low SWP_PFN_BITS of the offset hold the PFN. */
        pfn = swp_offset(entry) & SWP_PFN_MASK;
        return pfn;
}

/* check whether a pte points to a swap entry */
static inline int is_swap_pte(pte_t pte)
{
        /* A swap pte is one that is neither empty nor present. */
        return !(pte_none(pte) || pte_present(pte));
}

/*
 * Convert the arch-dependent pte representation of a swp_entry_t into an
 * arch-independent swp_entry_t.
 */
static inline swp_entry_t pte_to_swp_entry(pte_t pte)
{
        pte_t bare;
        swp_entry_t arch;

        /* Software flag bits are cleared before the arch conversion. */
        bare = pte_swp_clear_flags(pte);
        arch = __pte_to_swp_entry(bare);

        return swp_entry(__swp_type(arch), __swp_offset(arch));
}

/*
 * Convert the arch-independent representation of a swp_entry_t into the
 * arch-dependent pte representation.
 */
static inline pte_t swp_entry_to_pte(swp_entry_t entry)
{
        unsigned type = swp_type(entry);
        pgoff_t offset = swp_offset(entry);
        swp_entry_t arch;

        /* Re-pack into the arch layout, then into a pte. */
        arch = __swp_entry(type, offset);
        return __swp_entry_to_pte(arch);
}

static inline swp_entry_t radix_to_swp_entry(void *arg)
{
        swp_entry_t entry;

        entry.val = xa_to_value(arg);
        return entry;
}

/* Encode a swp_entry_t as an XArray value entry. */
static inline void *swp_to_radix_entry(swp_entry_t entry)
{
        void *xa_entry = xa_mk_value(entry.val);

        return xa_entry;
}

#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
/* Build a swap entry of type SWP_DEVICE_READ carrying @offset. */
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
        return swp_entry(SWP_DEVICE_READ, offset);
}

/* Build a swap entry of type SWP_DEVICE_WRITE carrying @offset. */
static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
{
        return swp_entry(SWP_DEVICE_WRITE, offset);
}

/* True if @entry has type SWP_DEVICE_READ or SWP_DEVICE_WRITE. */
static inline bool is_device_private_entry(swp_entry_t entry)
{
        int type = swp_type(entry);
        return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE;
}

/* True if @entry has type SWP_DEVICE_WRITE (hinted as the unlikely case). */
static inline bool is_writable_device_private_entry(swp_entry_t entry)
{
        return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
}

/* Build a swap entry of type SWP_DEVICE_EXCLUSIVE_READ carrying @offset. */
static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
{
        return swp_entry(SWP_DEVICE_EXCLUSIVE_READ, offset);
}

/* Build a swap entry of type SWP_DEVICE_EXCLUSIVE_WRITE carrying @offset. */
static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
{
        return swp_entry(SWP_DEVICE_EXCLUSIVE_WRITE, offset);
}

/*
 * True if @entry has type SWP_DEVICE_EXCLUSIVE_READ or
 * SWP_DEVICE_EXCLUSIVE_WRITE.
 */
static inline bool is_device_exclusive_entry(swp_entry_t entry)
{
        return swp_type(entry) == SWP_DEVICE_EXCLUSIVE_READ ||
                swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE;
}

/* True if @entry has type SWP_DEVICE_EXCLUSIVE_WRITE (hinted as unlikely). */
static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
{
        return unlikely(swp_type(entry) == SWP_DEVICE_EXCLUSIVE_WRITE);
}
#else /* CONFIG_DEVICE_PRIVATE */
/*
 * Stubs used when CONFIG_DEVICE_PRIVATE is not enabled: the constructors
 * ignore @offset and return swp_entry(0, 0), the predicates always return
 * false.
 */
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

static inline bool is_device_private_entry(swp_entry_t entry)
{
        return false;
}

static inline bool is_writable_device_private_entry(swp_entry_t entry)
{
        return false;
}

static inline swp_entry_t make_readable_device_exclusive_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

static inline swp_entry_t make_writable_device_exclusive_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

static inline bool is_device_exclusive_entry(swp_entry_t entry)
{
        return false;
}

static inline bool is_writable_device_exclusive_entry(swp_entry_t entry)
{
        return false;
}
#endif /* CONFIG_DEVICE_PRIVATE */

#ifdef CONFIG_MIGRATION
/*
 * Non-zero if @entry has one of the three migration types:
 * SWP_MIGRATION_READ, SWP_MIGRATION_READ_EXCLUSIVE or SWP_MIGRATION_WRITE.
 */
static inline int is_migration_entry(swp_entry_t entry)
{
        return unlikely(swp_type(entry) == SWP_MIGRATION_READ ||
                        swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE ||
                        swp_type(entry) == SWP_MIGRATION_WRITE);
}

/* Non-zero if @entry has type SWP_MIGRATION_WRITE. */
static inline int is_writable_migration_entry(swp_entry_t entry)
{
        return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
}

/* Non-zero if @entry has type SWP_MIGRATION_READ. */
static inline int is_readable_migration_entry(swp_entry_t entry)
{
        return unlikely(swp_type(entry) == SWP_MIGRATION_READ);
}

/* Non-zero if @entry has type SWP_MIGRATION_READ_EXCLUSIVE. */
static inline int is_readable_exclusive_migration_entry(swp_entry_t entry)
{
        return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE);
}

/* Build a swap entry of type SWP_MIGRATION_READ carrying @offset. */
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
        return swp_entry(SWP_MIGRATION_READ, offset);
}

/* Build a swap entry of type SWP_MIGRATION_READ_EXCLUSIVE carrying @offset. */
static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset)
{
        return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, offset);
}

/* Build a swap entry of type SWP_MIGRATION_WRITE carrying @offset. */
static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
{
        return swp_entry(SWP_MIGRATION_WRITE, offset);
}

/*
 * Returns whether the host has large enough swap offset field to support
 * carrying over pgtable A/D bits for page migrations.  The result is
 * pretty much arch specific.
 */
static inline bool migration_entry_supports_ad(void)
{
#ifdef CONFIG_SWAP
        /* Defined outside this header; its value is simply reported here. */
        return swap_migration_ad_supported;
#else  /* CONFIG_SWAP */
        /* Without CONFIG_SWAP, A/D bits are never carried in migration entries. */
        return false;
#endif        /* CONFIG_SWAP */
}

/* Return @entry with SWP_MIG_YOUNG set in its offset, if A/D bits are supported. */
static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
{
        if (!migration_entry_supports_ad())
                return entry;

        return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_YOUNG);
}

/* Report whether SWP_MIG_YOUNG is set in @entry's offset. */
static inline bool is_migration_entry_young(swp_entry_t entry)
{
        /* Keep the old behavior of aging page after migration */
        if (!migration_entry_supports_ad())
                return false;

        return (swp_offset(entry) & SWP_MIG_YOUNG) != 0;
}

/* Return @entry with SWP_MIG_DIRTY set in its offset, if A/D bits are supported. */
static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
{
        if (!migration_entry_supports_ad())
                return entry;

        return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_DIRTY);
}

/* Report whether SWP_MIG_DIRTY is set in @entry's offset. */
static inline bool is_migration_entry_dirty(swp_entry_t entry)
{
        /* Keep the old behavior of clean page after migration */
        if (!migration_entry_supports_ad())
                return false;

        return (swp_offset(entry) & SWP_MIG_DIRTY) != 0;
}

extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                        unsigned long address);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
#else  /* CONFIG_MIGRATION */
/*
 * Stubs used when CONFIG_MIGRATION is not enabled: constructors ignore
 * @offset and return swp_entry(0, 0), predicates return 0/false, the wait
 * helpers are empty and the young/dirty setters return @entry unchanged.
 */
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

static inline int is_migration_entry(swp_entry_t swp)
{
        return 0;
}

/* No migration entries can exist, so there is nothing to wait for. */
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                        unsigned long address) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
                                        pte_t *pte) { }
static inline int is_writable_migration_entry(swp_entry_t entry)
{
        return 0;
}
static inline int is_readable_migration_entry(swp_entry_t entry)
{
        return 0;
}

static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
{
        return entry;
}

static inline bool is_migration_entry_young(swp_entry_t entry)
{
        return false;
}

static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
{
        return entry;
}

static inline bool is_migration_entry_dirty(swp_entry_t entry)
{
        return false;
}
#endif        /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_FAILURE

/*
 * Support for hardware poisoned pages
 */
/* Build a SWP_HWPOISON entry holding @page's PFN; @page must be locked. */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
        unsigned long pfn;

        BUG_ON(!PageLocked(page));

        pfn = page_to_pfn(page);
        return swp_entry(SWP_HWPOISON, pfn);
}

/* Non-zero if @entry has type SWP_HWPOISON. */
static inline int is_hwpoison_entry(swp_entry_t entry)
{
        unsigned type = swp_type(entry);

        return type == SWP_HWPOISON;
}

#else

/*
 * Stubs used when CONFIG_MEMORY_FAILURE is not enabled: the constructor
 * ignores @page and returns swp_entry(0, 0), the predicate always returns 0.
 */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
        return swp_entry(0, 0);
}

static inline int is_hwpoison_entry(swp_entry_t swp)
{
        return 0;
}
#endif

typedef unsigned long pte_marker;

#define  PTE_MARKER_UFFD_WP                        BIT(0)
/*
 * "Poisoned" here is meant in the very general sense of "future accesses are
 * invalid", instead of referring very specifically to hardware memory errors.
 * This marker is meant to represent any of various different causes of this.
 */
#define  PTE_MARKER_POISONED                        BIT(1)
#define  PTE_MARKER_MASK                        (BIT(2) - 1)

/* Build a SWP_PTE_MARKER entry whose offset field carries @marker. */
static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
{
        return swp_entry(SWP_PTE_MARKER, marker);
}

/* True if @entry has type SWP_PTE_MARKER. */
static inline bool is_pte_marker_entry(swp_entry_t entry)
{
        return swp_type(entry) == SWP_PTE_MARKER;
}

/* Extract the marker bits (those covered by PTE_MARKER_MASK) from @entry. */
static inline pte_marker pte_marker_get(swp_entry_t entry)
{
        return swp_offset(entry) & PTE_MARKER_MASK;
}

/* True if @pte is a swap pte whose entry is a pte marker. */
static inline bool is_pte_marker(pte_t pte)
{
        /* Only swap ptes are decoded. */
        if (!is_swap_pte(pte))
                return false;

        return is_pte_marker_entry(pte_to_swp_entry(pte));
}

/* Build the pte form of a pte marker entry carrying @marker. */
static inline pte_t make_pte_marker(pte_marker marker)
{
        return swp_entry_to_pte(make_pte_marker_entry(marker));
}

/* Build a pte marker entry with only PTE_MARKER_POISONED set. */
static inline swp_entry_t make_poisoned_swp_entry(void)
{
        return make_pte_marker_entry(PTE_MARKER_POISONED);
}

/* 1 if @entry is a pte marker with PTE_MARKER_POISONED set, 0 otherwise. */
static inline int is_poisoned_swp_entry(swp_entry_t entry)
{
        if (!is_pte_marker_entry(entry))
                return 0;

        return (pte_marker_get(entry) & PTE_MARKER_POISONED) != 0;
}

/*
 * This is a special version to check pte_none() just to cover the case when
 * the pte is a pte marker.  It existed because in many cases the pte marker
 * should be seen as a none pte; it's just that we have stored some information
 * onto the none pte so it becomes not-none any more.
 *
 * It should be used when the pte is file-backed, ram-based and backing
 * userspace pages, like shmem.  It is not needed upon pgtables that do not
 * support pte markers at all.  For example, it's not needed on anonymous
 * memory, kernel-only memory (including when the system is during-boot),
 * non-ram based generic file-system.  It's fine to be used even there, but the
 * extra pte marker check will be pure overhead.
 */
static inline int pte_none_mostly(pte_t pte)
{
        /* A truly empty pte counts, and so does one holding only a marker. */
        if (pte_none(pte))
                return 1;

        return is_pte_marker(pte);
}

/* Return the struct page for the PFN stored in the pfn swap entry @entry. */
static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
{
        unsigned long pfn = swp_offset_pfn(entry);
        struct page *page = pfn_to_page(pfn);

        /*
         * Migration entries may only be used while the page they refer to
         * is locked.
         */
        BUG_ON(is_migration_entry(entry) && !PageLocked(page));

        return page;
}

/* Return the folio for the PFN stored in the pfn swap entry @entry. */
static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)
{
        unsigned long pfn = swp_offset_pfn(entry);
        struct folio *found = pfn_folio(pfn);

        /*
         * Migration entries may only be used while the folio they refer to
         * is locked.
         */
        BUG_ON(is_migration_entry(entry) && !folio_test_locked(found));

        return found;
}

/*
 * A pfn swap entry is a special type of swap entry that always has a pfn stored
 * in the swap offset. They can either be used to represent unaddressable device
 * memory, to restrict access to a page undergoing migration or to represent a
 * pfn which has been hwpoisoned and unmapped.
 */
static inline bool is_pfn_swap_entry(swp_entry_t entry)
{
        /* Make sure the swp offset can always store the needed fields */
        BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);

        /* Checked in the same order as before; first match wins. */
        if (is_migration_entry(entry))
                return true;
        if (is_device_private_entry(entry))
                return true;
        if (is_device_exclusive_entry(entry))
                return true;

        return is_hwpoison_entry(entry);
}

struct page_vma_mapped_walk;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
extern int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page);

extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
                struct page *new);

extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);

/* Convert a swap pmd into an arch-independent swp_entry_t. */
static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
{
        pmd_t bare = pmd;
        swp_entry_t arch;

        /* Clear the soft-dirty and uffd-wp flags, when set, before decoding. */
        if (pmd_swp_soft_dirty(bare))
                bare = pmd_swp_clear_soft_dirty(bare);

        if (pmd_swp_uffd_wp(bare))
                bare = pmd_swp_clear_uffd_wp(bare);

        arch = __pmd_to_swp_entry(bare);

        return swp_entry(__swp_type(arch), __swp_offset(arch));
}

/* Convert an arch-independent swp_entry_t into its swap pmd form. */
static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
        unsigned type = swp_type(entry);
        pgoff_t offset = swp_offset(entry);
        swp_entry_t arch;

        arch = __swp_entry(type, offset);
        return __swp_entry_to_pmd(arch);
}

/* 1 if @pmd is a swap pmd that encodes a migration entry, 0 otherwise. */
static inline int is_pmd_migration_entry(pmd_t pmd)
{
        if (!is_swap_pmd(pmd))
                return 0;

        return is_migration_entry(pmd_to_swp_entry(pmd)) != 0;
}
#else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
/*
 * Stubs used when CONFIG_ARCH_ENABLE_THP_MIGRATION is not enabled.
 * set_pmd_migration_entry() and remove_migration_pmd() expand to
 * BUILD_BUG(); the remaining helpers return fixed empty values.
 */
static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page)
{
        BUILD_BUG();
}

static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
                struct page *new)
{
        BUILD_BUG();
}

/* Empty: no pmd migration entries exist in this configuration. */
static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }

/* Ignores @pmd and returns swp_entry(0, 0). */
static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
{
        return swp_entry(0, 0);
}

/* Ignores @entry and returns __pmd(0). */
static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
        return __pmd(0);
}

static inline int is_pmd_migration_entry(pmd_t pmd)
{
        return 0;
}
#endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */

/* Non-zero if @entry's type is MAX_SWAPFILES or above. */
static inline int non_swap_entry(swp_entry_t entry)
{
        unsigned type = swp_type(entry);

        return type >= MAX_SWAPFILES;
}

#endif /* CONFIG_MMU */
#endif /* _LINUX_SWAPOPS_H */

























































































































































    1 





    1 























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
// SPDX-License-Identifier: GPL-2.0
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/ktime.h>
#include <linux/seq_file.h>
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>

#include "internal.h"

static struct vfsmount *nsfs_mnt;

/* Forward declaration: ns_ioctl() is defined further down in this file. */
static long ns_ioctl(struct file *filp, unsigned int ioctl,
                        unsigned long arg);
/*
 * File operations for nsfs files; nsfs_init_inode() installs them as i_fop
 * and proc_ns_file() identifies nsfs files by comparing f_op against this
 * table.
 */
static const struct file_operations ns_file_operations = {
        .llseek                = no_llseek,
        .unlocked_ioctl = ns_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
};

/*
 * d_dname callback: format the dentry name as "<ns type name>:[<inode number>]"
 * into @buffer via dynamic_dname().
 */
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
{
        struct inode *ns_inode = d_inode(dentry);
        struct ns_common *ns = ns_inode->i_private;

        return dynamic_dname(buffer, buflen, "%s:[%lu]",
                             ns->ops->name, ns_inode->i_ino);
}

/*
 * Dentry operations for nsfs dentries.  Not static, so it is visible to
 * other translation units; names are produced by ns_dname() above.
 */
const struct dentry_operations ns_dentry_operations = {
        .d_delete        = always_delete_dentry,
        .d_dname        = ns_dname,
        .d_prune        = stashed_dentry_prune,
};

/*
 * evict_inode callback: clear the inode, then release the namespace
 * reference that was stored in i_private.
 */
static void nsfs_evict(struct inode *inode)
{
        /* Fetch the namespace before the inode is cleared. */
        struct ns_common *owner_ns = inode->i_private;

        clear_inode(inode);

        owner_ns->ops->put(owner_ns);
}

/*
 * Obtain a namespace through @ns_get_cb(@private_data) and fill @path for it
 * via path_from_stashed().  Returns -ENOENT if the callback yields NULL,
 * otherwise whatever path_from_stashed() returns.
 */
int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
                     void *private_data)
{
        struct ns_common *ns = ns_get_cb(private_data);

        if (ns)
                return path_from_stashed(&ns->stashed, nsfs_mnt, ns, path);

        return -ENOENT;
}

/* Argument bundle passed by ns_get_path() to ns_get_path_task(). */
struct ns_get_path_task_args {
        /* Operations table whose ->get() is called on @task. */
        const struct proc_ns_operations *ns_ops;
        /* Task whose namespace is looked up. */
        struct task_struct *task;
};

static struct ns_common *ns_get_path_task(void *private_data)
{
        struct ns_get_path_task_args *args = private_data;

        return args->ns_ops->get(args->task);
}

/*
 * Fill @path for the namespace that @ns_ops->get() returns for @task.
 * Returns the result of ns_get_path_cb().
 */
int ns_get_path(struct path *path, struct task_struct *task,
                  const struct proc_ns_operations *ns_ops)
{
        struct ns_get_path_task_args args;

        args.ns_ops = ns_ops;
        args.task = task;

        return ns_get_path_cb(path, ns_get_path_task, &args);
}

/*
 * Open a file descriptor on the namespace that @get_ns returns for @ns.
 *
 * Returns the new O_CLOEXEC descriptor on success.  On failure the reserved
 * descriptor is released and the negative error from get_unused_fd_flags(),
 * @get_ns, path_from_stashed() or dentry_open() is returned.
 */
int open_related_ns(struct ns_common *ns,
                   struct ns_common *(*get_ns)(struct ns_common *ns))
{
        struct path path = {};
        struct ns_common *relative;
        struct file *f;
        int err;
        int fd;

        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd < 0)
                return fd;

        relative = get_ns(ns);
        if (IS_ERR(relative)) {
                err = PTR_ERR(relative);
                goto out_put_fd;
        }

        err = path_from_stashed(&relative->stashed, nsfs_mnt, relative, &path);
        if (err < 0)
                goto out_put_fd;

        f = dentry_open(&path, O_RDONLY, current_cred());
        /* The path reference is dropped whether or not the open succeeded. */
        path_put(&path);
        if (IS_ERR(f)) {
                err = PTR_ERR(f);
                goto out_put_fd;
        }

        fd_install(fd, f);
        return fd;

out_put_fd:
        put_unused_fd(fd);
        return err;
}
EXPORT_SYMBOL_GPL(open_related_ns);

static long ns_ioctl(struct file *filp, unsigned int ioctl,
                        unsigned long arg)
{
        struct user_namespace *user_ns;
        struct ns_common *ns = get_proc_ns(file_inode(filp));
        uid_t __user *argp;
        uid_t uid;

        switch (ioctl) {
        case NS_GET_USERNS:
                return open_related_ns(ns, ns_get_owner);
        case NS_GET_PARENT:
                if (!ns->ops->get_parent)
                        return -EINVAL;
                return open_related_ns(ns, ns->ops->get_parent);
        case NS_GET_NSTYPE:
                return ns->ops->type;
        case NS_GET_OWNER_UID:
                if (ns->ops->type != CLONE_NEWUSER)
                        return -EINVAL;
                user_ns = container_of(ns, struct user_namespace, ns);
                argp = (uid_t __user *) arg;
                uid = from_kuid_munged(current_user_ns(), user_ns->owner);
                return put_user(uid, argp);
        default:
                return -ENOTTY;
        }
}

/*
 * Format the name of @task's namespace of the kind described by @ns_ops into
 * @buf as "<name>:[<inum>]".
 *
 * The name printed is ns_ops->real_ns_name when that is set, otherwise
 * ns_ops->name. The namespace obtained through ->get() is released through
 * ->put() before returning.
 *
 * Return: the snprintf() result, or -ENOENT when ->get() yields no namespace.
 */
int ns_get_name(char *buf, size_t size, struct task_struct *task,
                        const struct proc_ns_operations *ns_ops)
{
        struct ns_common *ns = ns_ops->get(task);
        const char *label;
        int len;

        if (!ns)
                return -ENOENT;

        label = ns_ops->real_ns_name;
        if (!label)
                label = ns_ops->name;

        len = snprintf(buf, size, "%s:[%u]", label, ns->inum);
        ns_ops->put(ns);
        return len;
}

bool proc_ns_file(const struct file *file)
{
        return file->f_op == &ns_file_operations;
}

/**
 * ns_match() - Check whether a namespace corresponds to a dev/ino pair.
 * @ns: namespace to test
 * @dev: device number, compared against the s_dev of the nsfs superblock
 * @ino: inode number, compared against the namespace's inum
 *
 * Return: true only if both @ino and @dev match.
 */
bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
{
        if (ns->inum != ino)
                return false;

        return nsfs_mnt->mnt_sb->s_dev == dev;
}


/*
 * ->show_path() for nsfs: emit "<ops name>:[<inode number>]" for @dentry
 * into @seq. The namespace is taken from the inode's i_private. Always
 * returns 0.
 */
static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        const struct ns_common *ns = inode->i_private;

        seq_printf(seq, "%s:[%lu]", ns->ops->name, inode->i_ino);
        return 0;
}

/*
 * Superblock operations for nsfs; attached to the pseudo filesystem context
 * in nsfs_init_fs_context().
 */
static const struct super_operations nsfs_ops = {
        .statfs = simple_statfs,
        .evict_inode = nsfs_evict,
        /* Prints "<ops name>:[<inode number>]" for a namespace dentry. */
        .show_path = nsfs_show_path,
};

/*
 * ->init_inode() callback of nsfs_stashed_ops.
 *
 * @data is the struct ns_common the inode stands for: the inode takes its
 * number from ns->inum, becomes world-readable, uses ns_file_operations and
 * remembers the namespace in i_private. Always returns 0.
 */
static int nsfs_init_inode(struct inode *inode, void *data)
{
        struct ns_common *ns = data;

        inode->i_ino = ns->inum;
        inode->i_mode |= S_IRUGO;
        inode->i_fop = &ns_file_operations;
        inode->i_private = ns;
        return 0;
}

static void nsfs_put_data(void *data)
{
        struct ns_common *ns = data;
        ns->ops->put(ns);
}

/*
 * Stashed-object callbacks for nsfs; published through fc->s_fs_info in
 * nsfs_init_fs_context().
 */
static const struct stashed_operations nsfs_stashed_ops = {
        /* Fills in a new inode from the struct ns_common passed as data. */
        .init_inode = nsfs_init_inode,
        /* Calls the namespace's ->put() on the struct ns_common passed as data. */
        .put_data = nsfs_put_data,
};

/*
 * ->init_fs_context() for nsfs.
 *
 * Turns @fc into a pseudo filesystem context with magic NSFS_MAGIC, wires up
 * the nsfs superblock and dentry operations, and stores nsfs_stashed_ops in
 * fc->s_fs_info.
 *
 * Return: 0 on success, -ENOMEM if init_pseudo() fails.
 */
static int nsfs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *pfc;

        pfc = init_pseudo(fc, NSFS_MAGIC);
        if (!pfc)
                return -ENOMEM;

        pfc->dops = &ns_dentry_operations;
        pfc->ops = &nsfs_ops;
        fc->s_fs_info = (void *)&nsfs_stashed_ops;
        return 0;
}

/*
 * The "nsfs" filesystem type. It is mounted internally by nsfs_init() via
 * kern_mount(); its contexts are prepared by nsfs_init_fs_context().
 */
static struct file_system_type nsfs = {
        .name = "nsfs",
        .init_fs_context = nsfs_init_fs_context,
        .kill_sb = kill_anon_super,
};

/*
 * Boot-time setup of nsfs: create the internal mount and record it in
 * nsfs_mnt. Failure to mount is fatal (panic). On success SB_NOUSER is
 * cleared from the superblock flags.
 */
void __init nsfs_init(void)
{
        struct super_block *sb;

        nsfs_mnt = kern_mount(&nsfs);
        if (IS_ERR(nsfs_mnt))
                panic("can't set nsfs up\n");

        sb = nsfs_mnt->mnt_sb;
        sb->s_flags &= ~SB_NOUSER;
}



















































































































































    4 




    4 






































   40 





















   35 




    4 

















    4 




   19 















    3 
   19 











    2 



   26 



























   15 




   19 

















   19 









    3 















    3 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Variant of atomic_t specialized for reference counts.
 *
 * The interface matches the atomic_t interface (to aid in porting) but only
 * provides the few functions one should use for reference counting.
 *
 * Saturation semantics
 * ====================
 *
 * refcount_t differs from atomic_t in that the counter saturates at
 * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the
 * counter and causing 'spurious' use-after-free issues. In order to avoid the
 * cost associated with introducing cmpxchg() loops into all of the saturating
 * operations, we temporarily allow the counter to take on an unchecked value
 * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow
 * or overflow has occurred. Although this is racy when multiple threads
 * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly
 * equidistant from 0 and INT_MAX we minimise the scope for error:
 *
 *                                    INT_MAX     REFCOUNT_SATURATED   UINT_MAX
 *   0                          (0x7fff_ffff)    (0xc000_0000)    (0xffff_ffff)
 *   +--------------------------------+----------------+----------------+
 *                                     <---------- bad value! ---------->
 *
 * (in a signed view of the world, the "bad value" range corresponds to
 * a negative counter value).
 *
 * As an example, consider a refcount_inc() operation that causes the counter
 * to overflow:
 *
 *        int old = atomic_fetch_add_relaxed(1, &r->refs);
 *        // old is INT_MAX, refcount now INT_MIN (0x8000_0000)
 *        if (old < 0)
 *                atomic_set(&r->refs, REFCOUNT_SATURATED);
 *
 * If another thread also performs a refcount_inc() operation between the two
 * atomic operations, then the count will continue to edge closer to 0. If it
 * reaches a value of 1 before /any/ of the threads reset it to the saturated
 * value, then a concurrent refcount_dec_and_test() may erroneously free the
 * underlying object.
 *
 * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
 * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
 * With the current PID limit, if no batched refcounting operations are used and
 * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
 * operations, this makes it impossible for a saturated refcount to leave the
 * saturation range, even if it is possible for multiple uses of the same
 * refcount to nest in the context of a single task:
 *
 *     (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
 *     0x40000000 / 0x400000 = 0x100 = 256
 *
 * If hundreds of references are added/removed with a single refcounting
 * operation, it may potentially be possible to leave the saturation range; but
 * given the precise timing details involved with the round-robin scheduling of
 * each thread manipulating the refcount and the need to hit the race multiple
 * times in succession, there doesn't appear to be a practical avenue of attack
 * even if using refcount_add() operations with larger increments.
 *
 * Memory ordering
 * ===============
 *
 * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
 * and provide only what is strictly required for refcounts.
 *
 * The increments are fully relaxed; these will not provide ordering. The
 * rationale is that whatever is used to obtain the object we're increasing the
 * reference count on will provide the ordering. For locked data structures,
 * it's the lock acquire; for RCU/lockless data structures it's the dependent
 * load.
 *
 * Do note that inc_not_zero() provides a control dependency which will order
 * future stores against the inc, this ensures we'll never modify the object
 * if we did not in fact acquire a reference.
 *
 * The decrements will provide release order, such that all the prior loads and
 * stores will be issued before. They also provide a control dependency, which
 * will order us against the subsequent free().
 *
 * The control dependency is against the load of the cmpxchg (ll/sc) that
 * succeeded. This means the stores aren't fully ordered, but this is fine
 * because the 1->0 transition indicates no concurrency.
 *
 * Note that the allocator is responsible for ordering things between free()
 * and alloc().
 *
 * The decrements dec_and_test() and sub_and_test() also provide acquire
 * ordering on success.
 *
 */

#ifndef _LINUX_REFCOUNT_H
#define _LINUX_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/limits.h>
#include <linux/refcount_types.h>
#include <linux/spinlock_types.h>

struct mutex;

#define REFCOUNT_INIT(n)        { .refs = ATOMIC_INIT(n), }
#define REFCOUNT_MAX                INT_MAX
#define REFCOUNT_SATURATED        (INT_MIN / 2)

/*
 * Reason codes the inline helpers in this header pass to
 * refcount_warn_saturate() when they observe a counter in, or move it into,
 * an invalid state.
 */
enum refcount_saturation_type {
        /* __refcount_add_not_zero(): old value or sum was negative */
        REFCOUNT_ADD_NOT_ZERO_OVF,
        /* __refcount_add(): old value or sum was negative */
        REFCOUNT_ADD_OVF,
        /* __refcount_add(): counter was 0 before the add */
        REFCOUNT_ADD_UAF,
        /* __refcount_sub_and_test(): old value or result was negative */
        REFCOUNT_SUB_UAF,
        /* __refcount_dec(): old value was 1 or less */
        REFCOUNT_DEC_LEAK,
};

void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t);

/**
 * refcount_set - store a new value into a refcount
 * @r: the refcount to update
 * @n: the value to store
 */
static inline void refcount_set(refcount_t *r, int n)
{
        atomic_t *counter = &r->refs;

        atomic_set(counter, n);
}

/**
 * refcount_read - get a refcount's value
 * @r: the refcount
 *
 * Return: the refcount's value
 */
static inline unsigned int refcount_read(const refcount_t *r)
{
        return atomic_read(&r->refs);
}

/*
 * Add @i to @r unless the counter is 0.
 *
 * The counter is read once and then updated with a relaxed try_cmpxchg loop;
 * the loop is left without modifying the counter as soon as the observed
 * value is 0. If @oldp is non-NULL it receives the last observed value, i.e.
 * the value before the add on success and 0 on failure.
 *
 * Return: true if the observed old value was non-zero (the add happened),
 * false otherwise.
 */
static inline __must_check __signed_wrap
bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
{
        int old = refcount_read(r);

        do {
                if (!old)
                        break;
        } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i));

        if (oldp)
                *oldp = old;

        /* A negative old value or sum lies in the "bad value" range. */
        if (unlikely(old < 0 || old + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF);

        /* int -> bool conversion: any non-zero old value means success. */
        return old;
}

/**
 * refcount_add_not_zero - add to a refcount, but only if it is non-zero
 * @i: amount to add
 * @r: the refcount to modify
 *
 * Saturates at REFCOUNT_SATURATED and WARNs on overflow.
 *
 * No memory ordering is provided; the caller must already guarantee that the
 * object's memory is stable (RCU, etc.). A control dependency is provided,
 * which orders later stores after the add. See the comment at the top of
 * this file.
 *
 * For ordinary reference counting, where references are taken and dropped
 * one at a time, prefer refcount_inc() or one of its variants.
 *
 * Return: false if the passed refcount is 0, true otherwise
 */
static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r)
{
        bool added = __refcount_add_not_zero(i, r, NULL);

        return added;
}

/*
 * Unconditionally add @i to @r using a relaxed atomic fetch-and-add.
 *
 * If @oldp is non-NULL it receives the value the counter held before the
 * add. refcount_warn_saturate() is called with REFCOUNT_ADD_UAF when the old
 * value was 0, and with REFCOUNT_ADD_OVF when the old value or the sum is
 * negative.
 */
static inline __signed_wrap
void __refcount_add(int i, refcount_t *r, int *oldp)
{
        int old = atomic_fetch_add_relaxed(i, &r->refs);

        if (oldp)
                *oldp = old;

        /* Adding to a counter that was 0 is treated as a possible use-after-free. */
        if (unlikely(!old))
                refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
        else if (unlikely(old < 0 || old + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
}

/**
 * refcount_add - add a value to a refcount
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN.
 * Also WARNs if the refcount was 0 before the add.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_inc(), or one of its variants, should instead be used to
 * increment a reference count.
 */
static inline void refcount_add(int i, refcount_t *r)
{
        /* NULL: the previous counter value is not reported to the caller. */
        __refcount_add(i, r, NULL);
}

/*
 * Increment @r by one unless it is 0. If @oldp is non-NULL it receives the
 * value observed before the increment. Returns true if the increment
 * happened.
 */
static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp)
{
        bool incremented = __refcount_add_not_zero(1, r, oldp);

        return incremented;
}

/**
 * refcount_inc_not_zero - take a reference unless the refcount is 0
 * @r: the refcount to increment
 *
 * Like atomic_inc_not_zero(), except that the counter saturates at
 * REFCOUNT_SATURATED and a WARN is issued when that happens.
 *
 * No memory ordering is provided; the caller must already guarantee that the
 * object's memory is stable (RCU, etc.). A control dependency is provided,
 * which orders later stores after the increment. See the comment at the top
 * of this file.
 *
 * Return: true if the increment was successful, false otherwise
 */
static inline __must_check bool refcount_inc_not_zero(refcount_t *r)
{
        /* Add one; the previous value is not wanted, hence NULL. */
        return __refcount_add_not_zero(1, r, NULL);
}

/*
 * Increment @r by one via __refcount_add(). If @oldp is non-NULL it receives
 * the value the counter held before the increment.
 */
static inline void __refcount_inc(refcount_t *r, int *oldp)
{
        __refcount_add(1, r, oldp);
}

/**
 * refcount_inc - take an additional reference
 * @r: the refcount to increment
 *
 * Like atomic_inc(), except that the counter saturates at REFCOUNT_SATURATED
 * and a WARN is issued when that happens.
 *
 * No memory ordering is provided; the caller is expected to already hold a
 * reference on the object.
 *
 * A refcount of 0 also triggers a WARN, since incrementing from 0 indicates
 * a possible use-after-free.
 */
static inline void refcount_inc(refcount_t *r)
{
        /* Add one; the previous value is not wanted, hence NULL. */
        __refcount_add(1, r, NULL);
}

/*
 * Subtract @i from @r with release ordering and report whether the counter
 * reached 0.
 *
 * If @oldp is non-NULL it receives the value the counter held before the
 * subtraction. When the old value or the result is negative,
 * refcount_warn_saturate() is called with REFCOUNT_SUB_UAF and false is
 * returned.
 *
 * Return: true only if the old value was exactly @i, i.e. the counter is now 0.
 */
static inline __must_check __signed_wrap
bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
{
        int old = atomic_fetch_sub_release(i, &r->refs);

        if (oldp)
                *oldp = old;

        if (old == i) {
                /* Acquire ordering on success, so a following free() comes after. */
                smp_acquire__after_ctrl_dep();
                return true;
        }

        if (unlikely(old < 0 || old - i < 0))
                refcount_warn_saturate(r, REFCOUNT_SUB_UAF);

        return false;
}

/**
 * refcount_sub_and_test - drop several references and test for zero
 * @i: amount to subtract from the refcount
 * @r: the refcount
 *
 * Like atomic_dec_and_test(), except that on underflow it WARNs, returns
 * false and ultimately leaks the object, and that a refcount saturated at
 * REFCOUNT_SATURATED is not decremented.
 *
 * Release ordering is provided, so prior loads and stores complete before
 * the subtraction; on success acquire ordering is provided as well, so that
 * free() must come after.
 *
 * For ordinary reference counting, where references are taken and dropped
 * one at a time, prefer refcount_dec() or one of its variants.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r)
{
        bool reached_zero = __refcount_sub_and_test(i, r, NULL);

        return reached_zero;
}

/*
 * Drop one reference from @r and report whether the counter reached 0. If
 * @oldp is non-NULL it receives the value held before the decrement.
 */
static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp)
{
        bool reached_zero = __refcount_sub_and_test(1, r, oldp);

        return reached_zero;
}

/**
 * refcount_dec_and_test - drop a reference and test for zero
 * @r: the refcount
 *
 * Like atomic_dec_and_test(), except that it WARNs on underflow and that a
 * refcount saturated at REFCOUNT_SATURATED is not decremented.
 *
 * Release ordering is provided, so prior loads and stores complete before
 * the decrement; on success acquire ordering is provided as well, so that
 * free() must come after.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_dec_and_test(refcount_t *r)
{
        /* Subtract one; the previous value is not wanted, hence NULL. */
        return __refcount_sub_and_test(1, r, NULL);
}

/*
 * Decrement @r by one with release ordering, without reporting whether the
 * counter reached 0.
 *
 * If @oldp is non-NULL it receives the value the counter held before the
 * decrement. An old value of 1 or less is reported through
 * refcount_warn_saturate() as REFCOUNT_DEC_LEAK, so a 1 -> 0 transition made
 * through this helper is treated as an error.
 */
static inline void __refcount_dec(refcount_t *r, int *oldp)
{
        int old = atomic_fetch_sub_release(1, &r->refs);

        if (oldp)
                *oldp = old;

        if (unlikely(old <= 1))
                refcount_warn_saturate(r, REFCOUNT_DEC_LEAK);
}

/**
 * refcount_dec - decrement a refcount
 * @r: the refcount
 *
 * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
 * when saturated at REFCOUNT_SATURATED. It also WARNs when the refcount was
 * 1 before the call; use refcount_dec_and_test() when the reference being
 * dropped may be the last one.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before.
 */
static inline void refcount_dec(refcount_t *r)
{
        /* NULL: the previous counter value is not reported to the caller. */
        __refcount_dec(r, NULL);
}

extern __must_check bool refcount_dec_if_one(refcount_t *r);
extern __must_check bool refcount_dec_not_one(refcount_t *r);
extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(lock);
extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(lock);
extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r,
                                                       spinlock_t *lock,
                                                       unsigned long *flags) __cond_acquires(lock);
#endif /* _LINUX_REFCOUNT_H */





















































































































































































































































































































































































































































    4 






























    4 


    5 





























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/srcu.h>
#include <linux/interval_tree.h>

struct mmu_notifier_subscriptions;
struct mmu_notifier;
struct mmu_notifier_range;
struct mmu_interval_notifier;

/**
 * enum mmu_notifier_event - reason for the mmu notifier callback
 * @MMU_NOTIFY_UNMAP: either a munmap() that unmaps the range or a mremap()
 * that moves the range
 *
 * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this, like
 * madvise() or replacing a page by another one, ...).
 *
 * @MMU_NOTIFY_PROTECTION_VMA: update is due to a protection change for the
 * range, i.e. using the vma access permission (vm_page_prot) to update the
 * whole range is enough; there is no need to inspect changes to the CPU page
 * table (mprotect() syscall)
 *
 * @MMU_NOTIFY_PROTECTION_PAGE: update is due to a change in the read/write
 * flag for pages in the range, so to mirror those changes the user must
 * inspect the CPU page table (from the end callback).
 *
 * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
 * access flags). The user should soft dirty the page in the end callback to
 * make sure that anyone relying on soft dirtiness catches pages that might be
 * written through non-CPU mappings.
 *
 * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
 * that the mm refcount is zero and the range is no longer accessible.
 *
 * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
 * a device driver to possibly ignore the invalidation if the
 * owner field matches the driver's device private pgmap owner.
 *
 * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no
 * longer have exclusive access to the page. When sent during creation of an
 * exclusive range the owner will be initialised to the value provided by the
 * caller of make_device_exclusive_range(), otherwise the owner will be NULL.
 */
enum mmu_notifier_event {
        MMU_NOTIFY_UNMAP = 0,
        MMU_NOTIFY_CLEAR,
        MMU_NOTIFY_PROTECTION_VMA,
        MMU_NOTIFY_PROTECTION_PAGE,
        MMU_NOTIFY_SOFT_DIRTY,
        MMU_NOTIFY_RELEASE,
        MMU_NOTIFY_MIGRATE,
        MMU_NOTIFY_EXCLUSIVE,
};

/*
 * Flag bit 0.
 * NOTE(review): presumably stored in mmu_notifier_range.flags and tested by
 * mmu_notifier_range_blockable() to tell callbacks whether they may sleep -
 * confirm against that helper's definition.
 */
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)

/*
 * Callback table of an mmu notifier subscription; referenced from the ops
 * member of struct mmu_notifier. Every callback receives the subscription it
 * is invoked for.
 */
struct mmu_notifier_ops {
        /*
         * Called either by mmu_notifier_unregister or when the mm is
         * being destroyed by exit_mmap, always before all pages are
         * freed. This can run concurrently with other mmu notifier
         * methods (the ones invoked outside the mm context) and it
         * should tear down all secondary mmu mappings and freeze the
         * secondary mmu. If this method isn't implemented you have to
         * be sure that nothing could possibly write to the pages
         * through the secondary mmu by the time the last thread with
         * tsk->mm == mm exits.
         *
         * As a side note: the pages freed after ->release returns could
         * be immediately reallocated by the gart at an alias physical
         * address with a different cache model, so if ->release isn't
         * implemented because all _software_ driven memory accesses
         * through the secondary mmu are terminated by the time the
         * last thread of this mm quits, you also have to be sure that
         * speculative _hardware_ operations can't allocate dirty
         * cachelines in the cpu that could not be snooped and made
         * coherent with the other read and write operations happening
         * through the gart alias address, so leading to memory
         * corruption.
         */
        void (*release)(struct mmu_notifier *subscription,
                        struct mm_struct *mm);

        /*
         * clear_flush_young is called after the VM has
         * test-and-cleared the young/accessed bitflag in the
         * pte. This way the VM will provide proper aging to the
         * accesses to the page through the secondary MMUs and not
         * only to the ones through the Linux pte.
         * Start-end is necessary in case the secondary MMU is mapping the page
         * at a smaller granularity than the primary MMU.
         */
        int (*clear_flush_young)(struct mmu_notifier *subscription,
                                 struct mm_struct *mm,
                                 unsigned long start,
                                 unsigned long end);

        /*
         * clear_young is a lightweight version of clear_flush_young. Like the
         * latter, it is supposed to test-and-clear the young/accessed bitflag
         * in the secondary pte, but it may omit flushing the secondary tlb.
         */
        int (*clear_young)(struct mmu_notifier *subscription,
                           struct mm_struct *mm,
                           unsigned long start,
                           unsigned long end);

        /*
         * test_young is called to check the young/accessed bitflag in
         * the secondary pte. This is used to know if the page is
         * frequently used without actually clearing the flag or tearing
         * down the secondary mapping on the page.
         */
        int (*test_young)(struct mmu_notifier *subscription,
                          struct mm_struct *mm,
                          unsigned long address);

        /*
         * invalidate_range_start() and invalidate_range_end() must be
         * paired and are called only when the mmap_lock and/or the
         * locks protecting the reverse maps are held. If the subsystem
         * can't guarantee that no additional references are taken to
         * the pages in the range, it has to implement the
         * invalidate_range() notifier to remove any references taken
         * after invalidate_range_start().
         * NOTE(review): this structure has no invalidate_range() member;
         * the sentence above looks stale (presumably it now means
         * arch_invalidate_secondary_tlbs()) - confirm and update.
         *
         * Invalidation of multiple concurrent ranges may be
         * optionally permitted by the driver. Either way the
         * establishment of sptes is forbidden in the range passed to
         * invalidate_range_start/end for the whole duration of the
         * invalidate_range_start/end critical section.
         *
         * invalidate_range_start() is called when all pages in the
         * range are still mapped and have at least a refcount of one.
         *
         * invalidate_range_end() is called when all pages in the
         * range have been unmapped and the pages have been freed by
         * the VM.
         *
         * The VM will remove the page table entries and potentially
         * the page between invalidate_range_start() and
         * invalidate_range_end(). If the page must not be freed
         * because of pending I/O or other circumstances then the
         * invalidate_range_start() callback (or the initial mapping
         * by the driver) must make sure that the refcount is kept
         * elevated.
         *
         * If the driver increases the refcount when the pages are
         * initially mapped into an address space then either
         * invalidate_range_start() or invalidate_range_end() may
         * decrease the refcount. If the refcount is decreased on
         * invalidate_range_start() then the VM can free pages as page
         * table entries are removed.  If the refcount is only
         * dropped on invalidate_range_end() then the driver itself
         * will drop the last refcount but it must take care to flush
         * any secondary tlb before doing the final free on the
         * page. Pages will no longer be referenced by the linux
         * address space but may still be referenced by sptes until
         * the last refcount is dropped.
         *
         * If the range is not blockable then the callback cannot
         * sleep and has to return with -EAGAIN if sleeping would be required.
         * 0 should be returned otherwise. Please note that notifiers that can
         * fail invalidate_range_start are not allowed to implement
         * invalidate_range_end, as there is no mechanism for informing the
         * notifier that its start failed.
         * NOTE(review): the callbacks take no separate "blockable"
         * argument; presumably blockability is read from @range via
         * mmu_notifier_range_blockable() - confirm.
         */
        int (*invalidate_range_start)(struct mmu_notifier *subscription,
                                      const struct mmu_notifier_range *range);
        void (*invalidate_range_end)(struct mmu_notifier *subscription,
                                     const struct mmu_notifier_range *range);

        /*
         * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB
         * which shares page-tables with the CPU. The
         * invalidate_range_start()/end() callbacks should not be implemented as
         * arch_invalidate_secondary_tlbs() already catches the points in time
         * when an external TLB needs to be flushed.
         *
         * This requires arch_invalidate_secondary_tlbs() to be called while
         * holding the ptl spin-lock and therefore this callback is not allowed
         * to sleep.
         *
         * This is called by architecture code whenever invalidating a TLB
         * entry. It is assumed that any secondary TLB has the same rules for
         * when invalidations are required. If this is not the case architecture
         * code will need to call this explicitly when required for secondary
         * TLB invalidation.
         */
        void (*arch_invalidate_secondary_tlbs)(
                                        struct mmu_notifier *subscription,
                                        struct mm_struct *mm,
                                        unsigned long start,
                                        unsigned long end);

        /*
         * These callbacks are used with the get/put interface to manage the
         * lifetime of the mmu_notifier memory. alloc_notifier() returns a new
         * notifier for use with the mm.
         *
         * free_notifier() is only called after the mmu_notifier has been
         * fully put, calls to any ops callback are prevented and no ops
         * callbacks are currently running. It is called from a SRCU callback
         * and cannot sleep.
         */
        struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
        void (*free_notifier)(struct mmu_notifier *subscription);
};

/*
 * One mmu notifier subscription.
 *
 * The notifier chains are protected by mmap_lock and/or the reverse map
 * semaphores. Notifier chains are only changed when all reverse maps and
 * the mmap_lock locks are taken.
 *
 * Therefore notifier chains can only be traversed when either
 *
 * 1. mmap_lock is held.
 * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
 * 3. No other concurrent thread can access the list (release)
 */
struct mmu_notifier {
        /* NOTE(review): presumably the linkage on the notifier chain - confirm */
        struct hlist_node hlist;
        /* callback table for this subscription */
        const struct mmu_notifier_ops *ops;
        struct mm_struct *mm;
        /*
         * NOTE(review): rcu and users are presumably used by the get/put
         * lifetime interface (mmu_notifier_get()/mmu_notifier_put()) -
         * confirm against the implementation.
         */
        struct rcu_head rcu;
        unsigned int users;
};

/**
 * struct mmu_interval_notifier_ops - callbacks of an interval notifier
 * @invalidate: Upon return the caller must stop using any SPTEs within this
 *              range. This function can sleep. Return false only if sleeping
 *              was required but mmu_notifier_range_blockable(range) is false.
 *
 * The callback is passed the interval subscription, the range being
 * invalidated and a sequence value (cur_seq).
 */
struct mmu_interval_notifier_ops {
        bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
                           const struct mmu_notifier_range *range,
                           unsigned long cur_seq);
};

/*
 * One interval notifier subscription, as passed to the
 * mmu_interval_notifier_*() functions and to
 * mmu_interval_notifier_ops.invalidate().
 */
struct mmu_interval_notifier {
        /* node for an interval tree (see <linux/interval_tree.h>) */
        struct interval_tree_node interval_tree;
        /* callback table for this subscription */
        const struct mmu_interval_notifier_ops *ops;
        struct mm_struct *mm;
        /* NOTE(review): purpose not visible in this header - confirm */
        struct hlist_node deferred_item;
        /*
         * NOTE(review): presumably the sequence value that
         * mmu_interval_read_begin() and ->invalidate()'s cur_seq relate
         * to - confirm against the implementation.
         */
        unsigned long invalidate_seq;
};

#ifdef CONFIG_MMU_NOTIFIER

#ifdef CONFIG_LOCKDEP
extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
#endif

/*
 * Describes one invalidation passed to the notifier callbacks.
 * Filled in by mmu_notifier_range_init()/mmu_notifier_range_init_owner().
 */
struct mmu_notifier_range {
        /* The mm whose mappings are being changed. */
        struct mm_struct *mm;
        /* Start and end address of the affected range. */
        unsigned long start;
        unsigned long end;
        /* Flag bits; MMU_NOTIFIER_RANGE_BLOCKABLE is tested/updated here. */
        unsigned flags;
        /* Why the range is being invalidated. */
        enum mmu_notifier_event event;
        /* Only written by mmu_notifier_range_init_owner(). */
        void *owner;
};

/*
 * Report whether @mm has notifier subscriptions attached, i.e. whether
 * mm->notifier_subscriptions is set. Having subscriptions is annotated as
 * the unlikely case.
 */
static inline int mm_has_notifiers(struct mm_struct *mm)
{
        if (unlikely(mm->notifier_subscriptions))
                return 1;

        return 0;
}

struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
                                             struct mm_struct *mm);
/*
 * Convenience wrapper around mmu_notifier_get_locked(): takes the mmap
 * write lock of @mm for the duration of the call and returns whatever the
 * locked variant returned.
 */
static inline struct mmu_notifier *
mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
{
        struct mmu_notifier *subscription;

        mmap_write_lock(mm);
        subscription = mmu_notifier_get_locked(ops, mm);
        mmap_write_unlock(mm);

        return subscription;
}
void mmu_notifier_put(struct mmu_notifier *subscription);
void mmu_notifier_synchronize(void);

extern int mmu_notifier_register(struct mmu_notifier *subscription,
                                 struct mm_struct *mm);
extern int __mmu_notifier_register(struct mmu_notifier *subscription,
                                   struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *subscription,
                                    struct mm_struct *mm);

unsigned long
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub);
int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
                                 struct mm_struct *mm, unsigned long start,
                                 unsigned long length,
                                 const struct mmu_interval_notifier_ops *ops);
int mmu_interval_notifier_insert_locked(
        struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
        unsigned long start, unsigned long length,
        const struct mmu_interval_notifier_ops *ops);
void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub);

/**
 * mmu_interval_set_seq - Save the invalidation sequence
 * @interval_sub: The subscription passed to invalidate
 * @cur_seq: The cur_seq passed to the invalidate() callback
 *
 * This must be called unconditionally from the invalidate callback of a
 * struct mmu_interval_notifier_ops under the same lock that is used to call
 * mmu_interval_read_retry(). It updates the sequence number for later use by
 * mmu_interval_read_retry(). The provided cur_seq will always be odd.
 *
 * If the caller does not call mmu_interval_read_begin() or
 * mmu_interval_read_retry() then this call is not required.
 */
static inline void
mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub,
                     unsigned long cur_seq)
{
        /* Pairs with the READ_ONCE in mmu_interval_check_retry() */
        WRITE_ONCE(interval_sub->invalidate_seq, cur_seq);
}

/**
 * mmu_interval_read_retry - End a read side critical section against a VA range
 * @interval_sub: The subscription
 * @seq: The return of the paired mmu_interval_read_begin()
 *
 * This MUST be called under a user provided lock that is also held
 * unconditionally by op->invalidate() when it calls mmu_interval_set_seq().
 *
 * Each call should be paired with a single mmu_interval_read_begin() and
 * should be used to conclude the read side.
 *
 * Return: true if an invalidation collided with this critical section, and
 * the caller should retry.
 */
static inline bool
mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub,
                        unsigned long seq)
{
        /* Plain read: the user provided lock serializes against set_seq(). */
        return interval_sub->invalidate_seq != seq;
}

/**
 * mmu_interval_check_retry - Test if a collision has occurred
 * @interval_sub: The subscription
 * @seq: The return of the matching mmu_interval_read_begin()
 *
 * This can be used in the critical section between mmu_interval_read_begin()
 * and mmu_interval_read_retry().  A return of true indicates an invalidation
 * has collided with this critical region and a future
 * mmu_interval_read_retry() will return true.
 *
 * False is not reliable and only suggests a collision may not have
 * occurred. It can be called many times and does not have to hold the user
 * provided lock.
 *
 * This call can be used as part of loops and other expensive operations to
 * expedite a retry.
 *
 * Return: true if the stored sequence differs from @seq.
 */
static inline bool
mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,
                         unsigned long seq)
{
        /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
        return READ_ONCE(interval_sub->invalidate_seq) != seq;
}

extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end);
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
                                      unsigned long start,
                                      unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
                                     unsigned long address);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r);
extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                        unsigned long start, unsigned long end);
extern bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);

/*
 * Tell whether @range carries the MMU_NOTIFIER_RANGE_BLOCKABLE flag.
 */
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE) != 0;
}

/*
 * Call __mmu_notifier_release() for @mm, but only when the mm actually has
 * notifier subscriptions.
 */
static inline void mmu_notifier_release(struct mm_struct *mm)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_release(mm);
}

/*
 * Forward a clear_flush_young request for @start..@end of @mm to
 * __mmu_notifier_clear_flush_young() when the mm has notifiers.
 *
 * Returns the callee's result, or 0 when there are no notifiers.
 */
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_clear_flush_young(mm, start, end);
}

/*
 * Forward a clear_young request for @start..@end of @mm to
 * __mmu_notifier_clear_young() when the mm has notifiers.
 *
 * Returns the callee's result, or 0 when there are no notifiers.
 */
static inline int mmu_notifier_clear_young(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_clear_young(mm, start, end);
}

/*
 * Forward a test_young query for @address of @mm to
 * __mmu_notifier_test_young() when the mm has notifiers.
 *
 * Returns the callee's result, or 0 when there are no notifiers.
 */
static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_test_young(mm, address);
}

/*
 * Blocking variant of the range_start notification. It may sleep, marks
 * @range as MMU_NOTIFIER_RANGE_BLOCKABLE and, when the mm has notifiers,
 * calls __mmu_notifier_invalidate_range_start(). The return value of that
 * call is not propagated.
 */
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
        might_sleep();

        /*
         * The lockdep map is acquired even when the mm has no notifiers, so
         * the annotation covers every caller of this function.
         */
        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        if (mm_has_notifiers(range->mm)) {
                range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
                __mmu_notifier_invalidate_range_start(range);
        }
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}

/*
 * Non-blocking flavour of mmu_notifier_invalidate_range_start(): the
 * MMU_NOTIFIER_RANGE_BLOCKABLE flag is cleared before the notifiers run, and
 * an error is returned if a notifier can't proceed without blocking. In that
 * case you're not allowed to modify PTEs in the specified range.
 *
 * Returns 0 when the mm has no notifiers, otherwise the result of
 * __mmu_notifier_invalidate_range_start().
 *
 * This is mainly intended for OOM handling.
 */
static inline int __must_check
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        int err;

        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        if (!mm_has_notifiers(range->mm)) {
                err = 0;
        } else {
                range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
                err = __mmu_notifier_invalidate_range_start(range);
        }
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);

        return err;
}

/*
 * Signal the end of an invalidation of @range. May sleep when the range is
 * blockable; __mmu_notifier_invalidate_range_end() is only called when the
 * mm has notifiers.
 */
static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
        if (mmu_notifier_range_blockable(range))
                might_sleep();

        if (!mm_has_notifiers(range->mm))
                return;

        __mmu_notifier_invalidate_range_end(range);
}

/*
 * Forward a secondary TLB invalidation for @start..@end of @mm to
 * __mmu_notifier_arch_invalidate_secondary_tlbs() when the mm has notifiers.
 */
static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                        unsigned long start, unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}

/*
 * Start @mm out with no notifier subscriptions, so that mm_has_notifiers()
 * reports 0 for it.
 */
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
        mm->notifier_subscriptions = NULL;
}

/*
 * Call __mmu_notifier_subscriptions_destroy() for @mm, but only when the mm
 * has notifier subscriptions.
 */
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_subscriptions_destroy(mm);
}


/*
 * Fill in @range with the mm, address span, flags and event of an upcoming
 * invalidation. range->owner is not written here; use
 * mmu_notifier_range_init_owner() when an owner is needed.
 */
static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
                                           enum mmu_notifier_event event,
                                           unsigned flags,
                                           struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        /* What is affected ... */
        range->mm = mm;
        range->start = start;
        range->end = end;

        /* ... and how/why. */
        range->flags = flags;
        range->event = event;
}

/*
 * Same as mmu_notifier_range_init(), but additionally records @owner in
 * range->owner.
 */
static inline void mmu_notifier_range_init_owner(
                        struct mmu_notifier_range *range,
                        enum mmu_notifier_event event, unsigned int flags,
                        struct mm_struct *mm, unsigned long start,
                        unsigned long end, void *owner)
{
        mmu_notifier_range_init(range, event, flags, mm, start, end);
        /* Set after the common init so the owner is the last field written. */
        range->owner = owner;
}

/*
 * Run ptep_clear_flush_young() on the PTE and OR in the result of
 * mmu_notifier_clear_flush_young() for the PAGE_SIZE span starting at
 * __address. __vma and __address are copied into locals so each is
 * evaluated once; __ptep is passed straight through. Evaluates to the
 * combined "young" value.
 */
#define ptep_clear_flush_young_notify(__vma, __address, __ptep)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = ptep_clear_flush_young(___vma, ___address, __ptep);        \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                                                  ___address,                \
                                                  ___address +                \
                                                        PAGE_SIZE);        \
        __young;                                                        \
})

/*
 * PMD counterpart of ptep_clear_flush_young_notify(): runs
 * pmdp_clear_flush_young() and ORs in the result of
 * mmu_notifier_clear_flush_young() for the PMD_SIZE span starting at
 * __address. __vma and __address are evaluated once. Evaluates to the
 * combined "young" value.
 */
#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = pmdp_clear_flush_young(___vma, ___address, __pmdp);        \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                                                  ___address,                \
                                                  ___address +                \
                                                        PMD_SIZE);        \
        __young;                                                        \
})

/*
 * Run ptep_test_and_clear_young() on the PTE and OR in the result of
 * mmu_notifier_clear_young() for the PAGE_SIZE span starting at __address.
 * __vma and __address are evaluated once. Evaluates to the combined "young"
 * value.
 */
#define ptep_clear_young_notify(__vma, __address, __ptep)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,        \
                                            ___address + PAGE_SIZE);        \
        __young;                                                        \
})

/*
 * PMD counterpart of ptep_clear_young_notify(): runs
 * pmdp_test_and_clear_young() and ORs in the result of
 * mmu_notifier_clear_young() for the PMD_SIZE span starting at __address.
 * __vma and __address are evaluated once. Evaluates to the combined "young"
 * value.
 */
#define pmdp_clear_young_notify(__vma, __address, __pmdp)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,        \
                                            ___address + PMD_SIZE);        \
        __young;                                                        \
})

#else /* CONFIG_MMU_NOTIFIER */

/*
 * Reduced range descriptor used when CONFIG_MMU_NOTIFIER is disabled: only
 * the address span is kept.
 */
struct mmu_notifier_range {
        unsigned long start;
        unsigned long end;
};

/*
 * !CONFIG_MMU_NOTIFIER helper behind the mmu_notifier_range_init*() macros:
 * records just the address span in @range.
 */
static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
                                            unsigned long start,
                                            unsigned long end)
{
        range->end = end;
        range->start = start;
}

/*
 * !CONFIG_MMU_NOTIFIER versions of the range initializers. The event, flags,
 * mm and owner arguments are dropped without being evaluated; only range,
 * start and end reach _mmu_notifier_range_init().
 */
#define mmu_notifier_range_init(range,event,flags,mm,start,end)  \
        _mmu_notifier_range_init(range, start, end)
#define mmu_notifier_range_init_owner(range, event, flags, mm, start, \
                                        end, owner) \
        _mmu_notifier_range_init(range, start, end)

/* !CONFIG_MMU_NOTIFIER stub: every range is reported as blockable. */
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return true;
}

/* !CONFIG_MMU_NOTIFIER stub: an mm never has notifiers. */
static inline int mm_has_notifiers(struct mm_struct *mm)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER stub: does nothing. */
static inline void mmu_notifier_release(struct mm_struct *mm)
{
}

/* !CONFIG_MMU_NOTIFIER stub: always returns 0. */
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER stub: always returns 0. */
static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER stub: does nothing. */
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
}

/* !CONFIG_MMU_NOTIFIER stub: always succeeds (returns 0). */
static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER stub: does nothing. */
static inline
void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
}

/* !CONFIG_MMU_NOTIFIER stub: does nothing. */
static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                  unsigned long start, unsigned long end)
{
}

/* !CONFIG_MMU_NOTIFIER stub: does nothing. */
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
}

/* !CONFIG_MMU_NOTIFIER stub: does nothing. */
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
}

/*
 * !CONFIG_MMU_NOTIFIER fallbacks: mmu_notifier_range_update_to_read_only()
 * is constant false, and each *_notify helper is an alias for the plain
 * page-table primitive named on the right.
 */
#define mmu_notifier_range_update_to_read_only(r) false

#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
#define pmdp_clear_young_notify pmdp_test_and_clear_young
#define        ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush

/* !CONFIG_MMU_NOTIFIER stub: does nothing. */
static inline void mmu_notifier_synchronize(void)
{
}

#endif /* CONFIG_MMU_NOTIFIER */

#endif /* _LINUX_MMU_NOTIFIER_H */














































   23 












   11 

























































































    1 

































































    5 













   11 


























   13 












    1 



























































































    3 





















   13 










   13 












   13 












































    5 
















    1 





































    3 
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM block

#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BLOCK_H

#include <linux/blktrace_api.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/tracepoint.h>

#define RWBS_LEN        8

#ifdef CONFIG_BUFFER_HEAD
/*
 * Event class for buffer_head events. Records the device number of the
 * buffer's block device (bh->b_bdev->bd_dev), its block number
 * (bh->b_blocknr, stored in the "sector" field) and its size (bh->b_size).
 *
 * Output format: "<major>,<minor> sector=<sector> size=<size>".
 */
DECLARE_EVENT_CLASS(block_buffer,

        TP_PROTO(struct buffer_head *bh),

        TP_ARGS(bh),

        TP_STRUCT__entry (
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  size_t,        size                        )
        ),

        TP_fast_assign(
                __entry->dev                = bh->b_bdev->bd_dev;
                __entry->sector                = bh->b_blocknr;
                __entry->size                = bh->b_size;
        ),

        TP_printk("%d,%d sector=%llu size=%zu",
                MAJOR(__entry->dev), MINOR(__entry->dev),
                (unsigned long long)__entry->sector, __entry->size
        )
);

/**
 * block_touch_buffer - mark a buffer accessed
 * @bh: buffer_head being touched
 *
 * Called from touch_buffer(). Uses the block_buffer event class, so the
 * device, sector and size of @bh are recorded.
 */
DEFINE_EVENT(block_buffer, block_touch_buffer,

        TP_PROTO(struct buffer_head *bh),

        TP_ARGS(bh)
);

/**
 * block_dirty_buffer - mark a buffer dirty
 * @bh: buffer_head being dirtied
 *
 * Called from mark_buffer_dirty(). Uses the block_buffer event class, so the
 * device, sector and size of @bh are recorded.
 */
DEFINE_EVENT(block_buffer, block_dirty_buffer,

        TP_PROTO(struct buffer_head *bh),

        TP_ARGS(bh)
);
#endif /* CONFIG_BUFFER_HEAD */

/**
 * block_rq_requeue - place block IO request back on a queue
 * @rq: block IO operation request
 *
 * The block operation request @rq is being placed back into its queue.
 * For some reason the request was not completed and needs to be
 * put back in the queue.
 *
 * The "cmd" string is always recorded empty and the trailing "[%d]" of the
 * output is always 0. The device is recorded as 0 when the queue has no
 * disk.
 */
TRACE_EVENT(block_rq_requeue,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq),

        TP_STRUCT__entry(
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  unsigned int,        nr_sector                )
                __array(  char,                rwbs,        RWBS_LEN        )
                __dynamic_array( char,        cmd,        1                )
        ),

        TP_fast_assign(
                __entry->dev           = rq->q->disk ? disk_devt(rq->q->disk) : 0;
                __entry->sector    = blk_rq_trace_sector(rq);
                __entry->nr_sector = blk_rq_trace_nr_sectors(rq);

                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
                __get_str(cmd)[0] = '\0';
        ),

        TP_printk("%d,%d %s (%s) %llu + %u [%d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __get_str(cmd),
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, 0)
);

/*
 * Event class for request completion events. Records the device (0 when the
 * queue has no disk), the request position from blk_rq_pos(), the completed
 * length converted from bytes to sectors (nr_bytes >> 9), the errno
 * equivalent of @error and the rwbs string built from rq->cmd_flags. The
 * "cmd" string is always recorded empty.
 *
 * Output format: "<major>,<minor> <rwbs> (<cmd>) <sector> + <nr_sector> [<error>]".
 */
DECLARE_EVENT_CLASS(block_rq_completion,

        TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes),

        TP_ARGS(rq, error, nr_bytes),

        TP_STRUCT__entry(
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  unsigned int,        nr_sector                )
                __field(  int        ,        error                        )
                __array(  char,                rwbs,        RWBS_LEN        )
                __dynamic_array( char,        cmd,        1                )
        ),

        TP_fast_assign(
                __entry->dev           = rq->q->disk ? disk_devt(rq->q->disk) : 0;
                __entry->sector    = blk_rq_pos(rq);
                __entry->nr_sector = nr_bytes >> 9;
                __entry->error     = blk_status_to_errno(error);

                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
                __get_str(cmd)[0] = '\0';
        ),

        TP_printk("%d,%d %s (%s) %llu + %u [%d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __get_str(cmd),
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->error)
);

/**
 * block_rq_complete - block IO operation completed by device driver
 * @rq: block operations request
 * @error: status code
 * @nr_bytes: number of completed bytes
 *
 * The block_rq_complete tracepoint event indicates that some portion
 * of operation request has been completed by the device driver.  If
 * the @rq->bio is %NULL, then there is absolutely no additional work to
 * do for the request. If @rq->bio is non-NULL then there is
 * additional work required to complete the request.
 *
 * Uses the block_rq_completion event class.
 */
DEFINE_EVENT(block_rq_completion, block_rq_complete,

        TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes),

        TP_ARGS(rq, error, nr_bytes)
);

/**
 * block_rq_error - block IO operation error reported by device driver
 * @rq: block operations request
 * @error: status code
 * @nr_bytes: number of completed bytes
 *
 * The block_rq_error tracepoint event indicates that some portion
 * of operation request has failed as reported by the device driver.
 *
 * Uses the block_rq_completion event class.
 */
DEFINE_EVENT(block_rq_completion, block_rq_error,

        TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes),

        TP_ARGS(rq, error, nr_bytes)
);

/*
 * Event class for request events. Records the device (0 when the queue has
 * no disk), the sector and sector count reported by blk_rq_trace_sector()
 * and blk_rq_trace_nr_sectors(), the byte count from blk_rq_bytes(), the
 * rwbs string built from rq->cmd_flags and the comm of the current task.
 * The "cmd" string is always recorded empty.
 *
 * Output format:
 * "<major>,<minor> <rwbs> <bytes> (<cmd>) <sector> + <nr_sector> [<comm>]".
 */
DECLARE_EVENT_CLASS(block_rq,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq),

        TP_STRUCT__entry(
                __field(  dev_t,        dev                        )
                __field(  sector_t,        sector                        )
                __field(  unsigned int,        nr_sector                )
                __field(  unsigned int,        bytes                        )
                __array(  char,                rwbs,        RWBS_LEN        )
                __array(  char,         comm,   TASK_COMM_LEN   )
                __dynamic_array( char,        cmd,        1                )
        ),

        TP_fast_assign(
                __entry->dev           = rq->q->disk ? disk_devt(rq->q->disk) : 0;
                __entry->sector    = blk_rq_trace_sector(rq);
                __entry->nr_sector = blk_rq_trace_nr_sectors(rq);
                __entry->bytes     = blk_rq_bytes(rq);

                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
                __get_str(cmd)[0] = '\0';
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->rwbs, __entry->bytes, __get_str(cmd),
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/**
 * block_rq_insert - insert block operation request into queue
 * @rq: block IO operation request
 *
 * Called immediately before block operation request @rq is inserted
 * into its queue.  The fields in the operation request @rq struct can
 * be examined to determine which device and sectors the pending
 * operation would access.
 */
DEFINE_EVENT(block_rq, block_rq_insert,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_rq_issue - issue pending block IO request operation to device driver
 * @rq: block IO operation request
 *
 * Called when block operation request @rq is sent from its queue to a
 * device driver for processing.
 */
DEFINE_EVENT(block_rq, block_rq_issue,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_rq_merge - merge request with another one in the elevator
 * @rq: block IO operation request
 *
 * Called when block operation request @rq is merged to another
 * request queued in the elevator.
 */
DEFINE_EVENT(block_rq, block_rq_merge,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_io_start - insert a request for execution
 * @rq: block IO operation request
 *
 * Called when block operation request @rq is queued for execution.
 * Uses the block_rq event class.
 */
DEFINE_EVENT(block_rq, block_io_start,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_io_done - block IO operation request completed
 * @rq: block IO operation request
 *
 * Called when block operation request @rq is completed.
 * Uses the block_rq event class.
 */
DEFINE_EVENT(block_rq, block_io_done,

        TP_PROTO(struct request *rq),

        TP_ARGS(rq)
);

/**
 * block_bio_complete - completed all work on the block operation
 * @q: queue holding the block operation
 * @bio: block operation completed
 *
 * This tracepoint indicates there is no further work to do on this
 * block IO operation @bio.
 *
 * Only @bio contributes to the recorded entry (device, start sector, sector
 * count, errno form of bio->bi_status and rwbs); @q is not recorded.
 */
TRACE_EVENT(block_bio_complete,

        TP_PROTO(struct request_queue *q, struct bio *bio),

        TP_ARGS(q, bio),

        TP_STRUCT__entry(
                __field( dev_t,                dev                )
                __field( sector_t,        sector                )
                __field( unsigned,        nr_sector        )
                __field( int,                error                )
                __array( char,                rwbs,        RWBS_LEN)
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                __entry->error                = blk_status_to_errno(bio->bi_status);
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
        ),

        TP_printk("%d,%d %s %llu + %u [%d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->error)
);

/*
 * Event class for bio events. Records the device from bio_dev(), the start
 * sector (bio->bi_iter.bi_sector), the sector count from bio_sectors(), the
 * rwbs string built from bio->bi_opf and the comm of the current task.
 * @bio is dereferenced unconditionally.
 *
 * Output format: "<major>,<minor> <rwbs> <sector> + <nr_sector> [<comm>]".
 */
DECLARE_EVENT_CLASS(block_bio,

        TP_PROTO(struct bio *bio),

        TP_ARGS(bio),

        TP_STRUCT__entry(
                __field( dev_t,                dev                        )
                __field( sector_t,        sector                        )
                __field( unsigned int,        nr_sector                )
                __array( char,                rwbs,        RWBS_LEN        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu + %u [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector, __entry->comm)
);

/**
 * block_bio_bounce - used bounce buffer when processing block operation
 * @bio: block operation
 *
 * A bounce buffer was used to handle the block operation @bio.
 * This occurs when hardware limitations prevent a direct transfer of
 * data between the @bio data memory area and the IO device.  Use of a
 * bounce buffer requires extra copying of data and decreases
 * performance.
 */
DEFINE_EVENT(block_bio, block_bio_bounce,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
);

/**
 * block_bio_backmerge - merging block operation to the end of an existing operation
 * @bio: new block operation to merge
 *
 * Merging block request @bio to the end of an existing block request.
 * Uses the block_bio event class.
 */
DEFINE_EVENT(block_bio, block_bio_backmerge,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
);

/**
 * block_bio_frontmerge - merging block operation to the beginning of an existing operation
 * @bio: new block operation to merge
 *
 * Merging block IO operation @bio to the beginning of an existing block request.
 * Uses the block_bio event class.
 */
DEFINE_EVENT(block_bio, block_bio_frontmerge,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
);

/**
 * block_bio_queue - putting new block IO operation in queue
 * @bio: new block operation
 *
 * About to place the block IO operation @bio into its queue.
 */
DEFINE_EVENT(block_bio, block_bio_queue,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
);

/**
 * block_getrq - get a free request entry in queue for block IO operations
 * @bio: pending block IO operation (can be %NULL)
 *
 * A request struct has been allocated to handle the block IO operation @bio.
 *
 * NOTE(review): the block_bio event class reads bio_dev(@bio) and
 * @bio->bi_iter without a NULL check, so the "can be %NULL" wording above
 * looks stale - confirm against the callers.
 */
DEFINE_EVENT(block_bio, block_getrq,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
);

/**
 * block_plug - keep operations requests in request queue
 * @q: request queue to plug
 *
 * Plug the request queue @q.  Do not allow block operation requests
 * to be sent to the device driver. Instead, accumulate requests in
 * the queue to improve throughput performance of the block device.
 *
 * Only the comm of the current task is recorded; @q itself is not.
 */
TRACE_EVENT(block_plug,

        TP_PROTO(struct request_queue *q),

        TP_ARGS(q),

        TP_STRUCT__entry(
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("[%s]", __entry->comm)
);

/*
 * Event class for unplug events. Records @depth as nr_rq together with the
 * comm of the current task; @q and @explicit are accepted but not recorded.
 *
 * Output format: "[<comm>] <nr_rq>".
 */
DECLARE_EVENT_CLASS(block_unplug,

        TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit),

        TP_ARGS(q, depth, explicit),

        TP_STRUCT__entry(
                __field( int,                nr_rq                        )
                __array( char,                comm,        TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->nr_rq = depth;
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
);

/**
 * block_unplug - release of operations requests in request queue
 * @q: request queue to unplug
 * @depth: number of requests just added to the queue
 * @explicit: whether this was an explicit unplug, or one from schedule()
 *
 * Unplug request queue @q because device driver is scheduled to work
 * on elements in the request queue.
 *
 * This event is defined from the block_unplug event class.
 */
DEFINE_EVENT(block_unplug, block_unplug,

        TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit),

        TP_ARGS(q, depth, explicit)
);

/**
 * block_split - split a single bio struct into two bio structs
 * @bio: block operation being split
 * @new_sector: The starting sector for the new bio
 *
 * The bio request @bio needs to be split into two bio requests.  The newly
 * created @bio request starts at @new_sector. This split may be required due to
 * hardware limitations such as operation crossing device boundaries in a RAID
 * system.
 *
 * The entry records the device and starting sector of @bio, @new_sector, the
 * rwbs string derived from the bio's operation flags and the name of the
 * current task.  @new_sector is taken as a sector_t so that it matches the
 * recorded field and is not truncated to 32 bits before being stored.
 */
TRACE_EVENT(block_split,

        TP_PROTO(struct bio *bio, sector_t new_sector),

        TP_ARGS(bio, new_sector),

        TP_STRUCT__entry(
                __field( dev_t,                dev                                )
                __field( sector_t,        sector                                )
                __field( sector_t,        new_sector                        )
                __array( char,                rwbs,                RWBS_LEN        )
                __array( char,                comm,                TASK_COMM_LEN        )
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->new_sector        = new_sector;
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),

        TP_printk("%d,%d %s %llu / %llu [%s]",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  (unsigned long long)__entry->new_sector,
                  __entry->comm)
);

/**
 * block_bio_remap - map request for a logical device to the raw device
 * @bio: revised operation
 * @dev: original device for the operation
 * @from: original sector for the operation
 *
 * An operation for a logical device has been mapped to the
 * raw block device.
 *
 * The entry records the current device, starting sector and size in sectors
 * of @bio together with the original device (@dev) and sector (@from), and
 * prints them as "new <- (old device) old sector".
 */
TRACE_EVENT(block_bio_remap,

        TP_PROTO(struct bio *bio, dev_t dev, sector_t from),

        TP_ARGS(bio, dev, from),

        TP_STRUCT__entry(
                __field( dev_t,                dev                )
                __field( sector_t,        sector                )
                __field( unsigned int,        nr_sector        )
                __field( dev_t,                old_dev                )
                __field( sector_t,        old_sector        )
                __array( char,                rwbs,        RWBS_LEN)
        ),

        TP_fast_assign(
                __entry->dev                = bio_dev(bio);
                __entry->sector                = bio->bi_iter.bi_sector;
                __entry->nr_sector        = bio_sectors(bio);
                __entry->old_dev        = dev;
                __entry->old_sector        = from;
                blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
        ),

        TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector,
                  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
                  (unsigned long long)__entry->old_sector)
);

/**
 * block_rq_remap - map request for a block operation request
 * @rq: block IO operation request
 * @dev: original device for the operation
 * @from: original sector for the operation
 *
 * The block operation request @rq has been remapped.  The block
 * operation request @rq holds the current information, while @dev and
 * @from hold the original device and sector.
 *
 * In addition to the current and original position, the entry records the
 * number of bios in @rq as returned by blk_rq_count_bios(); it is printed as
 * the last field.
 */
TRACE_EVENT(block_rq_remap,

        TP_PROTO(struct request *rq, dev_t dev, sector_t from),

        TP_ARGS(rq, dev, from),

        TP_STRUCT__entry(
                __field( dev_t,                dev                )
                __field( sector_t,        sector                )
                __field( unsigned int,        nr_sector        )
                __field( dev_t,                old_dev                )
                __field( sector_t,        old_sector        )
                __field( unsigned int,        nr_bios                )
                __array( char,                rwbs,        RWBS_LEN)
        ),

        TP_fast_assign(
                __entry->dev                = disk_devt(rq->q->disk);
                __entry->sector                = blk_rq_pos(rq);
                __entry->nr_sector        = blk_rq_sectors(rq);
                __entry->old_dev        = dev;
                __entry->old_sector        = from;
                __entry->nr_bios        = blk_rq_count_bios(rq);
                blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
        ),

        TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
                  (unsigned long long)__entry->sector,
                  __entry->nr_sector,
                  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
                  (unsigned long long)__entry->old_sector, __entry->nr_bios)
);

#endif /* _TRACE_BLOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
















































    2 









    1 



    1 















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
// SPDX-License-Identifier: LGPL-2.0+
/*
 * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
 * This file is part of the GNU C Library.
 * Contributed by Paul Eggert (eggert@twinsun.com).
 *
 * The GNU C Library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * The GNU C Library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with the GNU C Library; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * Converts the calendar time to broken-down time representation
 *
 * 2009-7-14:
 *   Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
 * 2021-06-02:
 *   Reimplemented by Cassio Neri <cassio.neri@gmail.com>
 */

#include <linux/time.h>
#include <linux/module.h>
#include <linux/kernel.h>

/* Number of seconds in one hour and in one (24 hour) day. */
#define SECS_PER_HOUR        (60 * 60)
#define SECS_PER_DAY        (SECS_PER_HOUR * 24)

/**
 * time64_to_tm - converts the calendar time to local broken-down time
 *
 * @totalsecs:        the number of seconds elapsed since 00:00:00 on January 1, 1970,
 *                Coordinated Universal Time (UTC).
 * @offset:        offset seconds adding to totalsecs.
 * @result:        pointer to struct tm variable to receive broken-down time
 *
 * Fills in tm_sec, tm_min, tm_hour, tm_wday, tm_year (years since 1900),
 * tm_mon, tm_mday and tm_yday of @result.  Nothing is returned.
 */
void time64_to_tm(time64_t totalsecs, int offset, struct tm *result)
{
        u32 u32tmp, day_of_century, year_of_century, day_of_year, month, day;
        u64 u64tmp, udays, century, year;
        bool is_Jan_or_Feb, is_leap_year;
        long days, rem;
        int remainder;

        /* Split into whole days and leftover seconds, then apply @offset. */
        days = div_s64_rem(totalsecs, SECS_PER_DAY, &remainder);
        rem = remainder;
        rem += offset;
        /* Bring rem back into [0, SECS_PER_DAY), carrying into days. */
        while (rem < 0) {
                rem += SECS_PER_DAY;
                --days;
        }
        while (rem >= SECS_PER_DAY) {
                rem -= SECS_PER_DAY;
                ++days;
        }

        result->tm_hour = rem / SECS_PER_HOUR;
        rem %= SECS_PER_HOUR;
        result->tm_min = rem / 60;
        result->tm_sec = rem % 60;

        /* January 1, 1970 was a Thursday. */
        result->tm_wday = (4 + days) % 7;
        /* The C remainder may be negative for dates before the epoch. */
        if (result->tm_wday < 0)
                result->tm_wday += 7;

        /*
         * The following algorithm is, basically, Proposition 6.3 of Neri
         * and Schneider [1]. In a few words: it works on the computational
         * (fictitious) calendar where the year starts in March, month = 2
         * (*), and finishes in February, month = 13. This calendar is
         * mathematically convenient because the day of the year does not
         * depend on whether the year is leap or not. For instance:
         *
         * March 1st                0-th day of the year;
         * ...
         * April 1st                31-st day of the year;
         * ...
         * January 1st                306-th day of the year; (Important!)
         * ...
         * February 28th        364-th day of the year;
         * February 29th        365-th day of the year (if it exists).
         *
         * After having worked out the date in the computational calendar
         * (using just arithmetics) it's easy to convert it to the
         * corresponding date in the Gregorian calendar.
         *
         * [1] "Euclidean Affine Functions and Applications to Calendar
         * Algorithms". https://arxiv.org/abs/2102.06959
         *
         * (*) The numbering of months follows tm more closely and thus,
         * is slightly different from [1].
         */

        /*
         * Bias the (possibly negative) day count by a large constant so the
         * divisions below operate on an unsigned value.
         * NOTE(review): presumably this constant also re-bases the epoch
         * onto the computational calendar of [1], with the matching year
         * bias subtracted further down -- confirm against the paper.
         */
        udays        = ((u64) days) + 2305843009213814918ULL;

        /* Century number and day within that century. */
        u64tmp                = 4 * udays + 3;
        century                = div64_u64_rem(u64tmp, 146097, &u64tmp);
        day_of_century        = (u32) (u64tmp / 4);

        /*
         * One 64-bit multiplication yields both results: the upper 32 bits
         * are the year within the century, the lower 32 bits are scaled
         * back down to the day within the (computational) year.
         */
        u32tmp                = 4 * day_of_century + 3;
        u64tmp                = 2939745ULL * u32tmp;
        year_of_century        = upper_32_bits(u64tmp);
        day_of_year        = lower_32_bits(u64tmp) / 2939745 / 4;

        year                = 100 * century + year_of_century;
        /*
         * A year ending a century (year_of_century == 0) is leap only when
         * the century is a multiple of 4; any other year when it is a
         * multiple of 4.
         */
        is_leap_year        = year_of_century ? !(year_of_century % 4) : !(century % 4);

        /*
         * Month and day of month by multiply-and-shift: the upper 16 bits
         * are the month, the lower 16 bits divided by 2141 are the
         * zero-based day of the month.
         */
        u32tmp                = 2141 * day_of_year + 132377;
        month                = u32tmp >> 16;
        day                = ((u16) u32tmp) / 2141;

        /*
         * Recall that January 1st is the 306-th day of the year in the
         * computational (not Gregorian) calendar.
         */
        is_Jan_or_Feb        = day_of_year >= 306;

        /* Convert to the Gregorian calendar and adjust to Unix time. */
        year                = year + is_Jan_or_Feb - 6313183731940000ULL;
        month                = is_Jan_or_Feb ? month - 12 : month;
        day                = day + 1;
        /*
         * Re-base the day of the year on January 1st: January and February
         * dates drop the 306 days counted from March, all other dates gain
         * the length of January plus February.
         */
        day_of_year        += is_Jan_or_Feb ? -306 : 31 + 28 + is_leap_year;

        /* Convert to tm's format. */
        result->tm_year = (long) (year - 1900);
        result->tm_mon  = (int) month;
        result->tm_mday = (int) day;
        result->tm_yday = (int) day_of_year;
}
EXPORT_SYMBOL(time64_to_tm);




































































































    3 





    3 








    3 
















    3 






    2 










    3 



    3 
    3 














    3 


    3 










    3 


    2 



















































































    3 







    3 




























    3 
    3 










    2 










    3 







    3 






    1 
    3 



    1 


    1 
    1 



















    3 
    3 














































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
// SPDX-License-Identifier: GPL-2.0
/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list_sort.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h"

/*
 * Flag a hardware queue as needing a restart.  The SCHED_RESTART bit is
 * only written when it is not already set.
 */
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
        if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);

/*
 * Clear the SCHED_RESTART mark on @hctx and then kick the hardware queue
 * through blk_mq_run_hw_queue().
 */
void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
        clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

        /*
         * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch)
         * in blk_mq_run_hw_queue(). Its pair is the barrier in
         * blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART,
         * meantime new request added to hctx->dispatch is missed to check in
         * blk_mq_run_hw_queue().
         */
        smp_mb();

        blk_mq_run_hw_queue(hctx, true);
}

/*
 * Comparison callback ordering requests by the address of the hardware
 * context they belong to.  Returns 1 when @a's hctx compares greater than
 * @b's, 0 otherwise.  @priv is unused.
 */
static int sched_rq_cmp(void *priv, const struct list_head *a,
                        const struct list_head *b)
{
        const struct blk_mq_hw_ctx *hctx_a =
                container_of(a, struct request, queuelist)->mq_hctx;
        const struct blk_mq_hw_ctx *hctx_b =
                container_of(b, struct request, queuelist)->mq_hctx;

        if (hctx_a > hctx_b)
                return 1;
        return 0;
}

static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
{
        struct blk_mq_hw_ctx *hctx =
                list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
        struct request *rq;
        LIST_HEAD(hctx_list);
        unsigned int count = 0;

        list_for_each_entry(rq, rq_list, queuelist) {
                if (rq->mq_hctx != hctx) {
                        list_cut_before(&hctx_list, rq_list, &rq->queuelist);
                        goto dispatch;
                }
                count++;
        }
        list_splice_tail_init(rq_list, &hctx_list);

dispatch:
        return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
}

/*
 * Delay handed to blk_mq_delay_run_hw_queues() when a dispatch budget was
 * obtained but no request could be dequeued.
 */
#define BLK_MQ_BUDGET_DELAY        3                /* ms units */

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() fails to get the budget.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 * be run again.  This is necessary to avoid starving flushes.
 *
 * Otherwise returns 1 if blk_mq_dispatch_rq_list() (or at least one of the
 * per-hctx batches) returned true, and 0 if not.
 */
static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;
        struct elevator_queue *e = q->elevator;
        bool multi_hctxs = false, run_queue = false;
        bool dispatched = false, busy = false;
        unsigned int max_dispatch;
        LIST_HEAD(rq_list);
        int count = 0;

        /* Pull a single request per pass while the hctx is marked busy. */
        if (hctx->dispatch_busy)
                max_dispatch = 1;
        else
                max_dispatch = hctx->queue->nr_requests;

        /* Collect up to max_dispatch requests from the scheduler. */
        do {
                struct request *rq;
                int budget_token;

                if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
                        break;

                /* Leftovers on hctx->dispatch take priority; report -EAGAIN. */
                if (!list_empty_careful(&hctx->dispatch)) {
                        busy = true;
                        break;
                }

                budget_token = blk_mq_get_dispatch_budget(q);
                if (budget_token < 0)
                        break;

                rq = e->type->ops.dispatch_request(hctx);
                if (!rq) {
                        blk_mq_put_dispatch_budget(q, budget_token);
                        /*
                         * We're releasing without dispatching. Holding the
                         * budget could have blocked any "hctx"s with the
                         * same queue and if we didn't dispatch then there's
                         * no guarantee anyone will kick the queue.  Kick it
                         * ourselves.
                         */
                        run_queue = true;
                        break;
                }

                blk_mq_set_rq_budget_token(rq, budget_token);

                /*
                 * Now this rq owns the budget which has to be released
                 * if this rq won't be queued to driver via .queue_rq()
                 * in blk_mq_dispatch_rq_list().
                 */
                list_add_tail(&rq->queuelist, &rq_list);
                count++;
                /* Remember that the batch spans more than one hctx. */
                if (rq->mq_hctx != hctx)
                        multi_hctxs = true;

                /*
                 * If we cannot get tag for the request, stop dequeueing
                 * requests from the IO scheduler. We are unlikely to be able
                 * to submit them anyway and it creates false impression for
                 * scheduling heuristics that the device can take more IO.
                 */
                if (!blk_mq_get_driver_tag(rq))
                        break;
        } while (count < max_dispatch);

        if (!count) {
                /* Nothing collected: only kick the queues if a budget was wasted. */
                if (run_queue)
                        blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
        } else if (multi_hctxs) {
                /*
                 * Requests from different hctx may be dequeued from some
                 * schedulers, such as bfq and deadline.
                 *
                 * Sort the requests in the list according to their hctx,
                 * dispatch batching requests from same hctx at a time.
                 */
                list_sort(NULL, &rq_list, sched_rq_cmp);
                do {
                        dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
                } while (!list_empty(&rq_list));
        } else {
                dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
        }

        /* -EAGAIN wins over the dispatch result. */
        if (busy)
                return -EAGAIN;
        return !!dispatched;
}

/*
 * Call __blk_mq_do_dispatch_sched() repeatedly for as long as it returns 1.
 * The loop is abandoned, after re-arming the hardware queue with a zero
 * delay, once a reschedule is needed or the deadline of jiffies + HZ taken
 * at entry has passed.  Returns the result of the last dispatch pass.
 */
static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
        unsigned long deadline = jiffies + HZ;
        int rc;

        for (;;) {
                rc = __blk_mq_do_dispatch_sched(hctx);
                if (rc != 1)
                        return rc;

                if (need_resched() || time_is_before_jiffies(deadline)) {
                        blk_mq_delay_run_hw_queue(hctx, 0);
                        return rc;
                }
        }
}

/*
 * Return the software context that follows @ctx in @hctx->ctxs, wrapping
 * around to the first entry once hctx->nr_ctx is reached.
 */
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
                                          struct blk_mq_ctx *ctx)
{
        unsigned short next = ctx->index_hw[hctx->type] + 1;

        if (next == hctx->nr_ctx)
                next = 0;

        return hctx->ctxs[next];
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() fails to get the budget.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 * be run again.  This is necessary to avoid starving flushes.
 *
 * Returns 0 in every other case.  The software context to continue from is
 * saved in hctx->dispatch_from before returning.
 */
static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;
        LIST_HEAD(rq_list);
        struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
        int ret = 0;
        struct request *rq;

        /* Dequeue and dispatch one request per iteration. */
        do {
                int budget_token;

                if (!list_empty_careful(&hctx->dispatch)) {
                        ret = -EAGAIN;
                        break;
                }

                /* No software queue has pending work. */
                if (!sbitmap_any_bit_set(&hctx->ctx_map))
                        break;

                budget_token = blk_mq_get_dispatch_budget(q);
                if (budget_token < 0)
                        break;

                rq = blk_mq_dequeue_from_ctx(hctx, ctx);
                if (!rq) {
                        blk_mq_put_dispatch_budget(q, budget_token);
                        /*
                         * We're releasing without dispatching. Holding the
                         * budget could have blocked any "hctx"s with the
                         * same queue and if we didn't dispatch then there's
                         * no guarantee anyone will kick the queue.  Kick it
                         * ourselves.
                         */
                        blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
                        break;
                }

                blk_mq_set_rq_budget_token(rq, budget_token);

                /*
                 * Now this rq owns the budget which has to be released
                 * if this rq won't be queued to driver via .queue_rq()
                 * in blk_mq_dispatch_rq_list().
                 */
                list_add(&rq->queuelist, &rq_list);

                /* round robin for fair dispatch */
                ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

                /*
                 * The loop condition is only evaluated when a request was
                 * dequeued above, so rq is always valid there.
                 */
        } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));

        WRITE_ONCE(hctx->dispatch_from, ctx);
        return ret;
}

static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
        bool need_dispatch = false;
        LIST_HEAD(rq_list);

        /*
         * If we have previous entries on our dispatch list, grab them first for
         * more fair dispatch.
         */
        if (!list_empty_careful(&hctx->dispatch)) {
                spin_lock(&hctx->lock);
                if (!list_empty(&hctx->dispatch))
                        list_splice_init(&hctx->dispatch, &rq_list);
                spin_unlock(&hctx->lock);
        }

        /*
         * Only ask the scheduler for requests, if we didn't have residual
         * requests from the dispatch list. This is to avoid the case where
         * we only ever dispatch a fraction of the requests available because
         * of low device queue depth. Once we pull requests out of the IO
         * scheduler, we can no longer merge or sort them. So it's best to
         * leave them there for as long as we can. Mark the hw queue as
         * needing a restart in that case.
         *
         * We want to dispatch from the scheduler if there was nothing
         * on the dispatch list or we were able to dispatch from the
         * dispatch list.
         */
        if (!list_empty(&rq_list)) {
                blk_mq_sched_mark_restart_hctx(hctx);
                if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0))
                        return 0;
                need_dispatch = true;
        } else {
                need_dispatch = hctx->dispatch_busy;
        }

        if (hctx->queue->elevator)
                return blk_mq_do_dispatch_sched(hctx);

        /* dequeue request one by one from sw queue if queue is busy */
        if (need_dispatch)
                return blk_mq_do_dispatch_ctx(hctx);
        blk_mq_flush_busy_ctxs(hctx, &rq_list);
        blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
        return 0;
}

/*
 * Dispatch requests for @hctx unless the hardware queue is stopped or the
 * request queue is quiesced.  The dispatch pass is retried once when it
 * returns -EAGAIN; if the retry returns -EAGAIN as well, the hardware queue
 * is kicked through blk_mq_run_hw_queue().
 */
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;

        /* RCU or SRCU read lock is needed before checking quiesced flag */
        if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
                return;

        /*
         * -EAGAIN means hctx->dispatch is not empty, so another pass is
         * required to avoid starving flushes.  The second pass only runs
         * when the first one asked for it.
         */
        if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN &&
            __blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
                blk_mq_run_hw_queue(hctx, true);
}

/*
 * Try to merge @bio into an already queued request.
 *
 * When the elevator provides a bio_merge hook, its result is returned
 * directly.  Otherwise the software queue selected for @bio is searched,
 * provided the hardware context has BLK_MQ_F_SHOULD_MERGE set and that
 * queue is not empty.  Returns true if @bio was merged.
 */
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs)
{
        struct elevator_queue *e = q->elevator;
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        struct list_head *sw_list;
        bool merged;

        if (e && e->type->ops.bio_merge)
                return e->type->ops.bio_merge(q, bio, nr_segs);

        ctx = blk_mq_get_ctx(q);
        hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
        if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE))
                return false;

        sw_list = &ctx->rq_lists[hctx->type];
        if (list_empty_careful(sw_list))
                return false;

        /*
         * Default per sw-queue merge: walk the software queue in reverse
         * looking for a merge candidate.  The walk currently has a
         * hand-wavy stop count of 8 so that not too much time is spent
         * checking for merges.
         */
        spin_lock(&ctx->lock);
        merged = blk_bio_list_merge(q, sw_list, bio, nr_segs);
        spin_unlock(&ctx->lock);

        return merged;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
                                   struct list_head *free)
{
        return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

/*
 * Set up hctx->sched_tags for one hardware context.  With shared tags the
 * queue-wide q->sched_shared_tags is reused; otherwise a map with
 * q->nr_requests entries is allocated for @hctx_idx.
 * Returns 0 on success or -ENOMEM if the allocation failed.
 */
static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
                                          struct blk_mq_hw_ctx *hctx,
                                          unsigned int hctx_idx)
{
        if (!blk_mq_is_shared_tags(q->tag_set->flags)) {
                hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set,
                                                            hctx_idx,
                                                            q->nr_requests);
                return hctx->sched_tags ? 0 : -ENOMEM;
        }

        hctx->sched_tags = q->sched_shared_tags;
        return 0;
}

/*
 * Free the queue-wide shared scheduler tag map and clear the pointer to it.
 */
static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
        blk_mq_free_rq_map(queue->sched_shared_tags);
        queue->sched_shared_tags = NULL;
}

/* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        queue_for_each_hw_ctx(q, hctx, i) {
                if (hctx->sched_tags) {
                        if (!blk_mq_is_shared_tags(flags))
                                blk_mq_free_rq_map(hctx->sched_tags);
                        hctx->sched_tags = NULL;
                }
        }

        if (blk_mq_is_shared_tags(flags))
                blk_mq_exit_sched_shared_tags(q);
}

/*
 * Allocate the queue-wide scheduler tag map used when tags are shared and
 * store it in queue->sched_shared_tags.
 * Returns 0 on success or -ENOMEM if the allocation failed.
 */
static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{
        struct blk_mq_tag_set *set = queue->tag_set;

        /*
         * Set initial depth at max so that we don't need to reallocate for
         * updating nr_requests.
         */
        queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
                                                BLK_MQ_NO_HCTX_IDX,
                                                MAX_SCHED_RQ);
        if (!queue->sched_shared_tags)
                return -ENOMEM;

        /* NOTE(review): presumably adjusts the map to q->nr_requests -- confirm. */
        blk_mq_tag_update_sched_shared_tags(queue);

        return 0;
}

/*
 * Caller must have a reference to @e, will grab another one if successful.
 *
 * Sets q->nr_requests, allocates the scheduler tag maps, calls the
 * elevator's init_sched and (if provided) per-hctx init_hctx hooks and
 * registers the scheduler debugfs entries.
 *
 * Returns 0 on success or the error code of the step that failed.
 */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
        unsigned int flags = q->tag_set->flags;
        struct blk_mq_hw_ctx *hctx;
        struct elevator_queue *eq;
        unsigned long i;
        int ret;

        /*
         * Default to double of smaller one between hw queue_depth and 128,
         * since we don't split into sync/async like the old code did.
         * Additionally, this is a per-hw queue depth.
         */
        q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
                                   BLKDEV_DEFAULT_RQ);

        /* Nothing has been allocated yet, so a failure needs no unwinding. */
        if (blk_mq_is_shared_tags(flags)) {
                ret = blk_mq_init_sched_shared_tags(q);
                if (ret)
                        return ret;
        }

        queue_for_each_hw_ctx(q, hctx, i) {
                ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
                if (ret)
                        goto err_free_map_and_rqs;
        }

        ret = e->ops.init_sched(q, e);
        if (ret)
                goto err_free_map_and_rqs;

        mutex_lock(&q->debugfs_mutex);
        blk_mq_debugfs_register_sched(q);
        mutex_unlock(&q->debugfs_mutex);

        queue_for_each_hw_ctx(q, hctx, i) {
                if (e->ops.init_hctx) {
                        ret = e->ops.init_hctx(hctx, i);
                        if (ret) {
                                /*
                                 * Full teardown: free the requests, undo the
                                 * scheduler setup and drop a reference on
                                 * the elevator queue.  eq is sampled first
                                 * because blk_mq_exit_sched() clears
                                 * q->elevator.
                                 */
                                eq = q->elevator;
                                blk_mq_sched_free_rqs(q);
                                blk_mq_exit_sched(q, eq);
                                kobject_put(&eq->kobj);
                                return ret;
                        }
                }
                mutex_lock(&q->debugfs_mutex);
                blk_mq_debugfs_register_sched_hctx(q, hctx);
                mutex_unlock(&q->debugfs_mutex);
        }

        return 0;

err_free_map_and_rqs:
        /* Release the requests and tag maps allocated above. */
        blk_mq_sched_free_rqs(q);
        blk_mq_sched_tags_teardown(q, flags);

        q->elevator = NULL;
        return ret;
}

/*
 * called in either blk_queue_cleanup or elevator_switch, tagset
 * is required for freeing requests
 */
void blk_mq_sched_free_rqs(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long idx;

        /* With shared tags only the queue-wide sched_shared_tags is freed. */
        if (blk_mq_is_shared_tags(q->tag_set->flags)) {
                blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
                                BLK_MQ_NO_HCTX_IDX);
                return;
        }

        /* Otherwise free the requests of every hctx that has sched_tags. */
        queue_for_each_hw_ctx(q, hctx, idx) {
                if (!hctx->sched_tags)
                        continue;

                blk_mq_free_rqs(q->tag_set, hctx->sched_tags, idx);
        }
}

/*
 * Tear down the scheduler state of @q that belongs to elevator @e.
 *
 * For every hardware queue the per-hctx scheduler debugfs entry is
 * unregistered and, if the elevator type provides exit_hctx and the hctx
 * still has sched_data, that hook is called and sched_data is cleared.
 * Afterwards the queue-level scheduler debugfs entry is unregistered, the
 * optional exit_sched hook is run, the scheduler tags are torn down and
 * q->elevator is reset to NULL.
 */
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;
        unsigned int flags = 0;

        queue_for_each_hw_ctx(q, hctx, i) {
                /* debugfs (un)registration is done under q->debugfs_mutex */
                mutex_lock(&q->debugfs_mutex);
                blk_mq_debugfs_unregister_sched_hctx(hctx);
                mutex_unlock(&q->debugfs_mutex);

                /* Only call the hook for hctxs that still carry sched_data. */
                if (e->type->ops.exit_hctx && hctx->sched_data) {
                        e->type->ops.exit_hctx(hctx, i);
                        hctx->sched_data = NULL;
                }
                /*
                 * Remember the flags of the last hctx visited; they are
                 * passed to the tags teardown below (stays 0 when the
                 * loop body never runs).
                 */
                flags = hctx->flags;
        }

        mutex_lock(&q->debugfs_mutex);
        blk_mq_debugfs_unregister_sched(q);
        mutex_unlock(&q->debugfs_mutex);

        /* exit_sched is optional for an elevator type. */
        if (e->type->ops.exit_sched)
                e->type->ops.exit_sched(e);
        blk_mq_sched_tags_teardown(q, flags);
        q->elevator = NULL;
}





































































   14 








   19 
   16 


























































































































































































































    4 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 


    2 

















    4 


    3 











    3 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
// SPDX-License-Identifier: GPL-2.0-or-later
/* Common capabilities, needed by capability.o.
 */

#include <linux/capability.h>
#include <linux/audit.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/lsm_hooks.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/ptrace.h>
#include <linux/xattr.h>
#include <linux/hugetlb.h>
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/prctl.h>
#include <linux/securebits.h>
#include <linux/user_namespace.h>
#include <linux/binfmts.h>
#include <linux/personality.h>
#include <linux/mnt_idmapping.h>
#include <uapi/linux/lsm.h>

/*
 * If a non-root user executes a setuid-root binary in
 * !secure(SECURE_NOROOT) mode, then we raise capabilities.
 * However if fE is also set, then the intent is for only
 * the file capabilities to be applied, and the setuid-root
 * bit is left on either to change the uid (plausible) or
 * to get full privilege on a kernel without file capabilities
 * support.  So in that case we do not raise capabilities.
 *
 * Warn if that happens, once per boot.
 */
static void warn_setuid_and_fcaps_mixed(const char *fname)
{
        /* Latched once the message has been printed so it is not repeated. */
        static int warned;

        if (warned)
                return;

        printk(KERN_INFO "warning: `%s' has both setuid-root and"
                " effective capabilities. Therefore not raising all"
                " capabilities.\n", fname);
        warned = 1;
}

/**
 * cap_capable - Determine whether a task has a particular effective capability
 * @cred: The credentials to use
 * @targ_ns:  The user namespace in which we need the capability
 * @cap: The capability to check for
 * @opts: Bitmask of options defined in include/linux/security.h
 *
 * Determine whether the nominated task has the specified capability amongst
 * its effective set, returning 0 if it does, -ve if it does not.
 *
 * NOTE WELL: cap_capable() cannot be used like the kernel's capable()
 * and has_capability() functions.  That is, it has the reverse semantics:
 * cap_capable() returns 0 when a task has a capability, but the
 * kernel's capable() and has_capability() return 1 for this case.
 */
int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
                int cap, unsigned int opts)
{
        struct user_namespace *cur = targ_ns;

        /*
         * Climb from the target user namespace through its parents until
         * the namespace the credentials belong to is reached, unless the
         * answer can be given earlier.
         */
        while (cur != cred->user_ns) {
                /*
                 * Already at (or above) the level of cred's namespace
                 * without having met it: the search is over.
                 */
                if (cur->level <= cred->user_ns->level)
                        return -EPERM;

                /*
                 * The owner of a user namespace, acting from that
                 * namespace's parent, has all caps.
                 */
                if (cur->parent == cred->user_ns &&
                    uid_eq(cur->owner, cred->euid))
                        return 0;

                /*
                 * A capability held in a parent user ns also applies to
                 * all of its children, so keep looking one level up.
                 */
                cur = cur->parent;
        }

        /* Reached cred's own namespace: the effective set decides. */
        return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
}

/**
 * cap_settime - Determine whether the current process may set the system clock
 * @ts: The time to set
 * @tz: The timezone to set
 *
 * Determine whether the current process may set the system clock and timezone
 * information, returning 0 if permission granted, -ve if denied.
 */
int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
{
        /* Neither @ts nor @tz is examined; only CAP_SYS_TIME is checked. */
        return capable(CAP_SYS_TIME) ? 0 : -EPERM;
}

/**
 * cap_ptrace_access_check - Determine whether the current process may access
 *                           another
 * @child: The process to be accessed
 * @mode: The mode of attachment.
 *
 * If we are in the same or an ancestor user_ns and have all the target
 * task's capabilities, then ptrace access is allowed.
 * If we have the ptrace capability to the target user_ns, then ptrace
 * access is allowed.
 * Else denied.
 *
 * Determine whether a process may access another, returning 0 if permission
 * granted, -ve if denied.
 */
int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        const struct cred *tracer, *tracee;
        const kernel_cap_t *tracer_caps;
        int rc = -EPERM;

        rcu_read_lock();
        tracer = current_cred();
        tracee = __task_cred(child);

        /* FSCREDS mode compares against pE, every other mode against pP. */
        tracer_caps = (mode & PTRACE_MODE_FSCREDS) ? &tracer->cap_effective
                                                   : &tracer->cap_permitted;

        /*
         * Same user namespace and a superset of the target's permitted
         * caps is enough; only otherwise is CAP_SYS_PTRACE over the
         * target's namespace consulted.
         */
        if (tracer->user_ns == tracee->user_ns &&
            cap_issubset(tracee->cap_permitted, *tracer_caps))
                rc = 0;
        else if (ns_capable(tracee->user_ns, CAP_SYS_PTRACE))
                rc = 0;

        rcu_read_unlock();
        return rc;
}

/**
 * cap_ptrace_traceme - Determine whether another process may trace the current
 * @parent: The task proposed to be the tracer
 *
 * If parent is in the same or an ancestor user_ns and has all current's
 * capabilities, then ptrace access is allowed.
 * If parent has the ptrace capability to current's user_ns, then ptrace
 * access is allowed.
 * Else denied.
 *
 * Determine whether the nominated task is permitted to trace the current
 * process, returning 0 if permission is granted, -ve if denied.
 */
int cap_ptrace_traceme(struct task_struct *parent)
{
        const struct cred *tracer, *self;
        int rc = -EPERM;

        rcu_read_lock();
        tracer = __task_cred(parent);
        self = current_cred();

        /*
         * Allowed when the parent shares our user namespace and its
         * permitted set covers ours; failing that, when the parent has
         * CAP_SYS_PTRACE over our user namespace.
         */
        if (tracer->user_ns == self->user_ns &&
            cap_issubset(self->cap_permitted, tracer->cap_permitted))
                rc = 0;
        else if (has_ns_capability(parent, self->user_ns, CAP_SYS_PTRACE))
                rc = 0;

        rcu_read_unlock();
        return rc;
}

/**
 * cap_capget - Retrieve a task's capability sets
 * @target: The task from which to retrieve the capability sets
 * @effective: The place to record the effective set
 * @inheritable: The place to record the inheritable set
 * @permitted: The place to record the permitted set
 *
 * This function retrieves the capabilities of the nominated task and returns
 * them to the caller.
 */
int cap_capget(const struct task_struct *target, kernel_cap_t *effective,
               kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
        const struct cred *tcred;

        /*
         * Derived from kernel/capability.c:sys_capget.
         * The target's credentials are only read inside the RCU section.
         */
        rcu_read_lock();

        tcred = __task_cred(target);
        *effective = tcred->cap_effective;
        *inheritable = tcred->cap_inheritable;
        *permitted = tcred->cap_permitted;

        rcu_read_unlock();

        return 0;
}

/*
 * Determine whether the inheritable capabilities are limited to the old
 * permitted set.  Returns 1 if they are limited, 0 if they are not.
 */
static inline int cap_inh_is_capped(void)
{
        /* they are so limited unless the current task has the CAP_SETPCAP
         * capability
         */
        if (cap_capable(current_cred(), current_cred()->user_ns,
                        CAP_SETPCAP, CAP_OPT_NONE) == 0)
                return 0;
        return 1;
}

/**
 * cap_capset - Validate and apply proposed changes to current's capabilities
 * @new: The proposed new credentials; alterations should be made here
 * @old: The current task's current credentials
 * @effective: A pointer to the proposed new effective capabilities set
 * @inheritable: A pointer to the proposed new inheritable capabilities set
 * @permitted: A pointer to the proposed new permitted capabilities set
 *
 * This function validates and applies a proposed mass change to the current
 * process's capability sets.  The changes are made to the proposed new
 * credentials, and assuming no error, will be committed by the caller of LSM.
 */
int cap_capset(struct cred *new,
               const struct cred *old,
               const kernel_cap_t *effective,
               const kernel_cap_t *inheritable,
               const kernel_cap_t *permitted)
{
        kernel_cap_t bound;

        /*
         * While the inheritable set is capped, the new pI may only draw
         * from the old pI and the old pP.
         */
        if (cap_inh_is_capped()) {
                bound = cap_combine(old->cap_inheritable, old->cap_permitted);
                if (!cap_issubset(*inheritable, bound))
                        /* incapable of using this inheritable set */
                        return -EPERM;
        }

        /* no new pI capabilities outside bounding set */
        bound = cap_combine(old->cap_inheritable, old->cap_bset);
        if (!cap_issubset(*inheritable, bound))
                return -EPERM;

        /*
         * The new Permitted set may not exceed the old one, and the new
         * Effective set must fit inside the new Permitted set.
         */
        if (!cap_issubset(*permitted, old->cap_permitted) ||
            !cap_issubset(*effective, *permitted))
                return -EPERM;

        new->cap_effective = *effective;
        new->cap_inheritable = *inheritable;
        new->cap_permitted = *permitted;

        /*
         * Mask off ambient bits that are no longer both permitted and
         * inheritable.
         */
        bound = cap_intersect(*permitted, *inheritable);
        new->cap_ambient = cap_intersect(new->cap_ambient, bound);

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EINVAL;

        return 0;
}

/**
 * cap_inode_need_killpriv - Determine if inode change affects privileges
 * @dentry: The inode/dentry being changed, with the change marked ATTR_KILL_PRIV
 *
 * Determine whether an inode having a change applied that's marked
 * ATTR_KILL_PRIV affects the security markings on that inode, and if it
 * does, whether inode_killpriv() should be invoked or the change rejected.
 *
 * Return: 1 if security.capability has a value, meaning inode_killpriv()
 * is required, 0 otherwise, meaning inode_killpriv() is not required.
 */
int cap_inode_need_killpriv(struct dentry *dentry)
{
        /*
         * Probe with a NULL buffer of length 0; only a positive return is
         * treated as "security.capability has a value".
         */
        return __vfs_getxattr(dentry, d_backing_inode(dentry),
                              XATTR_NAME_CAPS, NULL, 0) > 0;
}

/**
 * cap_inode_killpriv - Erase the security markings on an inode
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        The inode/dentry to alter
 *
 * Erase the privilege-enhancing security markings on an inode.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 if successful, -ve on error.
 */
int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry)
{
        int rc = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS);

        /* -EOPNOTSUPP from the removal is reported to the caller as success. */
        return rc == -EOPNOTSUPP ? 0 : rc;
}

/*
 * Return true if @rootvfsuid is valid and maps to uid 0 in the current user
 * namespace or in any of its ancestors up to and including init_user_ns.
 */
static bool rootid_owns_currentns(vfsuid_t rootvfsuid)
{
        struct user_namespace *cur;
        kuid_t kroot;

        if (!vfsuid_valid(rootvfsuid))
                return false;

        kroot = vfsuid_into_kuid(rootvfsuid);
        cur = current_user_ns();

        for (;;) {
                if (from_kuid(cur, kroot) == 0)
                        return true;

                /* init_user_ns is the last namespace examined. */
                if (cur == &init_user_ns)
                        return false;

                cur = cur->parent;
        }
}

/* Return @m with the VFS_CAP_FLAGS_EFFECTIVE bit(s) cleared. */
static __u32 sansflags(__u32 m)
{
        const __u32 flag_bits = VFS_CAP_FLAGS_EFFECTIVE;

        return m & ~flag_bits;
}

/*
 * True when @size is exactly XATTR_CAPS_SZ_2 and the magic of @cap, with
 * the flag bits stripped, is VFS_CAP_REVISION_2.  @cap is not read when the
 * size does not match.
 */
static bool is_v2header(int size, const struct vfs_cap_data *cap)
{
        return size == XATTR_CAPS_SZ_2 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
}

/*
 * True when @size is exactly XATTR_CAPS_SZ_3 and the magic of @cap, with
 * the flag bits stripped, is VFS_CAP_REVISION_3.  @cap is not read when the
 * size does not match.
 */
static bool is_v3header(int size, const struct vfs_cap_data *cap)
{
        return size == XATTR_CAPS_SZ_3 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
}

/*
 * getsecurity: We are called for security.* before any attempt to read the
 * xattr from the inode itself.
 *
 * This gives us a chance to read the on-disk value and convert it.  If we
 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
 *
 * Note we are not called by vfs_getxattr_alloc(), but that is only called
 * by the integrity subsystem, which really wants the unconverted values -
 * so that's good.
 */
/*
 * Read the security.capability xattr of @inode and hand it back in the form
 * (v2 or v3) that fits the caller's user namespace.
 *
 * Returns -EOPNOTSUPP when @name is not "capability", another negative errno
 * on failure, otherwise the size of the (possibly converted) value.  When
 * @alloc is true and a value is produced, *@buffer is set to a buffer holding
 * it; when @alloc is false only the size is computed.
 */
int cap_inode_getsecurity(struct mnt_idmap *idmap,
                          struct inode *inode, const char *name, void **buffer,
                          bool alloc)
{
        int size;
        kuid_t kroot;
        vfsuid_t vfsroot;
        u32 nsmagic, magic;
        uid_t root, mappedroot;
        char *tmpbuf = NULL;
        struct vfs_cap_data *cap;
        struct vfs_ns_cap_data *nscap = NULL;
        struct dentry *dentry;
        struct user_namespace *fs_ns;

        /* Only the "capability" suffix is handled here. */
        if (strcmp(name, "capability") != 0)
                return -EOPNOTSUPP;

        /* A dentry is needed for the xattr read; it is dropped right after. */
        dentry = d_find_any_alias(inode);
        if (!dentry)
                return -EINVAL;
        size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, &tmpbuf,
                                  sizeof(struct vfs_ns_cap_data), GFP_NOFS);
        dput(dentry);
        /* gcc11 complains if we don't check for !tmpbuf */
        if (size < 0 || !tmpbuf)
                goto out_free;

        fs_ns = inode->i_sb->s_user_ns;
        cap = (struct vfs_cap_data *) tmpbuf;
        /*
         * Pick the root uid the capabilities are relative to: 0 for a v2
         * xattr, the stored rootid for v3.  Anything else is rejected.
         */
        if (is_v2header(size, cap)) {
                root = 0;
        } else if (is_v3header(size, cap)) {
                nscap = (struct vfs_ns_cap_data *) tmpbuf;
                root = le32_to_cpu(nscap->rootid);
        } else {
                size = -EINVAL;
                goto out_free;
        }

        kroot = make_kuid(fs_ns, root);

        /* If this is an idmapped mount shift the kuid. */
        vfsroot = make_vfsuid(idmap, fs_ns, kroot);

        /* If the root kuid maps to a valid uid in current ns, then return
         * this as a nscap. */
        mappedroot = from_kuid(current_user_ns(), vfsuid_into_kuid(vfsroot));
        if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
                size = sizeof(struct vfs_ns_cap_data);
                if (alloc) {
                        if (!nscap) {
                                /* v2 -> v3 conversion */
                                nscap = kzalloc(size, GFP_ATOMIC);
                                if (!nscap) {
                                        size = -ENOMEM;
                                        goto out_free;
                                }
                                /* Carry the effective flag over to the v3 magic. */
                                nsmagic = VFS_CAP_REVISION_3;
                                magic = le32_to_cpu(cap->magic_etc);
                                if (magic & VFS_CAP_FLAGS_EFFECTIVE)
                                        nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
                                memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                                nscap->magic_etc = cpu_to_le32(nsmagic);
                        } else {
                                /* use allocated v3 buffer */
                                /*
                                 * nscap aliases tmpbuf here; clearing tmpbuf
                                 * keeps the kfree() at out_free from freeing
                                 * the buffer handed out through *buffer.
                                 */
                                tmpbuf = NULL;
                        }
                        /* Report the rootid as seen from the current user ns. */
                        nscap->rootid = cpu_to_le32(mappedroot);
                        *buffer = nscap;
                }
                goto out_free;
        }

        /*
         * Not representable as a non-zero uid in the current ns: only
         * acceptable if the root id maps to uid 0 in the current ns or one
         * of its ancestors.
         */
        if (!rootid_owns_currentns(vfsroot)) {
                size = -EOVERFLOW;
                goto out_free;
        }

        /* This comes from a parent namespace.  Return as a v2 capability */
        size = sizeof(struct vfs_cap_data);
        if (alloc) {
                if (nscap) {
                        /* v3 -> v2 conversion */
                        cap = kzalloc(size, GFP_ATOMIC);
                        if (!cap) {
                                size = -ENOMEM;
                                goto out_free;
                        }
                        /* Carry the effective flag over to the v2 magic. */
                        magic = VFS_CAP_REVISION_2;
                        nsmagic = le32_to_cpu(nscap->magic_etc);
                        if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
                                magic |= VFS_CAP_FLAGS_EFFECTIVE;
                        memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                        cap->magic_etc = cpu_to_le32(magic);
                } else {
                        /* use unconverted v2 */
                        /*
                         * cap aliases tmpbuf here; clearing tmpbuf keeps the
                         * kfree() below from freeing the returned buffer.
                         */
                        tmpbuf = NULL;
                }
                *buffer = cap;
        }
out_free:
        /* Frees the raw xattr copy unless it was handed to the caller. */
        kfree(tmpbuf);
        return size;
}

/**
 * rootid_from_xattr - translate root uid of vfs caps
 *
 * @value:        vfs caps value to read the root uid from (not modified)
 * @size:        size of @value
 * @task_ns:        user namespace of the caller
 */
static vfsuid_t rootid_from_xattr(const void *value, size_t size,
                                  struct user_namespace *task_ns)
{
        const struct vfs_ns_cap_data *v3 = value;
        uid_t uid;

        /* Only a v3-sized value is read for a rootid; otherwise uid 0 is used. */
        uid = (size == XATTR_CAPS_SZ_3) ? le32_to_cpu(v3->rootid) : 0;

        return VFSUIDT_INIT(make_kuid(task_ns, uid));
}

/* True when @cap, of @size bytes, passes the v2 or the v3 header check. */
static bool validheader(size_t size, const struct vfs_cap_data *cap)
{
        if (is_v2header(size, cap))
                return true;

        return is_v3header(size, cap);
}

/**
 * cap_convert_nscap - check vfs caps
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        used to retrieve inode to check permissions on
 * @ivalue:        vfs caps value which may be modified by this function
 * @size:        size of @ivalue
 *
 * User requested a write of security.capability.  If needed, update the
 * xattr to change from v2 to v3, or to fixup the v3 rootid.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: On success, return the new size; on error, return < 0.
 */
int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry,
                      const void **ivalue, size_t size)
{
        const struct vfs_cap_data *old_cap = *ivalue;
        struct inode *inode = d_backing_inode(dentry);
        struct user_namespace *task_ns = current_user_ns();
        struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
        struct vfs_ns_cap_data *v3;
        size_t v3_size = sizeof(struct vfs_ns_cap_data);
        __u32 v3_magic = VFS_CAP_REVISION_3;
        vfsuid_t vfsrootid;
        kuid_t rootid;
        uid_t nsrootid;

        /* A value must be supplied and must carry a v2 or v3 header. */
        if (!*ivalue || !validheader(size, old_cap))
                return -EINVAL;

        if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP))
                return -EPERM;

        /*
         * A v2 value written through a non-idmapped mount by a user with
         * CAP_SETFCAP over the filesystem's user ns is stored unchanged.
         */
        if (size == XATTR_CAPS_SZ_2 && idmap == &nop_mnt_idmap &&
            ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
                /* user is privileged, just write the v2 */
                return size;

        /*
         * Translate the root uid: caller's user ns -> vfsuid -> kuid for
         * the filesystem -> raw uid in the filesystem's user ns.  Any step
         * that yields an invalid id rejects the write.
         */
        vfsrootid = rootid_from_xattr(*ivalue, size, task_ns);
        if (!vfsuid_valid(vfsrootid))
                return -EINVAL;

        rootid = from_vfsuid(idmap, fs_ns, vfsrootid);
        if (!uid_valid(rootid))
                return -EINVAL;

        nsrootid = from_kuid(fs_ns, rootid);
        if (nsrootid == -1)
                return -EINVAL;

        /* Build the v3 replacement; *ivalue is pointed at it on success. */
        v3 = kmalloc(v3_size, GFP_ATOMIC);
        if (!v3)
                return -ENOMEM;

        /* Carry the effective flag of the incoming value over. */
        if (le32_to_cpu(old_cap->magic_etc) & VFS_CAP_FLAGS_EFFECTIVE)
                v3_magic |= VFS_CAP_FLAGS_EFFECTIVE;

        v3->magic_etc = cpu_to_le32(v3_magic);
        memcpy(&v3->data, &old_cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
        v3->rootid = cpu_to_le32(nsrootid);

        *ivalue = v3;
        return v3_size;
}

/*
 * Calculate the new process capability sets from the capability sets attached
 * to a file.
 */
static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
                                          struct linux_binprm *bprm,
                                          bool *effective,
                                          bool *has_fcap)
{
        struct cred *new = bprm->cred;
        const __u32 magic = caps->magic_etc;

        /* Both out-flags are only ever raised here, never cleared. */
        if (magic & VFS_CAP_FLAGS_EFFECTIVE)
                *effective = true;
        if (magic & VFS_CAP_REVISION_MASK)
                *has_fcap = true;

        /*
         * pP' = (X & fP) | (pI & fI)
         * The addition of pA' is handled later.
         */
        new->cap_permitted.val =
                (new->cap_bset.val & caps->permitted.val) |
                (new->cap_inheritable.val & caps->inheritable.val);

        /*
         * For legacy apps, with no internal support for recognizing they
         * do not have enough capabilities, we return an error if they are
         * missing some "forced" (aka file-permitted) capabilities.  Without
         * the effective flag a shortfall is not reported.
         */
        if (*effective && (caps->permitted.val & ~new->cap_permitted.val))
                /* insufficient to execute correctly */
                return -EPERM;

        return 0;
}

/**
 * get_vfs_caps_from_disk - retrieve vfs caps from disk
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        dentry from which @inode is retrieved
 * @cpu_caps:        vfs capabilities
 *
 * Extract the on-exec-apply capability sets for an executable file.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * @cpu_caps is zeroed on entry, so on any failure after that point it holds
 * at most the decoded magic_etc word.
 *
 * Return: 0 on success with @cpu_caps filled in.  -ENODATA if there is no
 * backing inode, the xattr read reports -ENODATA or -EOPNOTSUPP, the root id
 * does not yield a valid vfsuid, or rootid_owns_currentns() rejects it.
 * -EINVAL if the xattr is shorter than the magic word, its size does not
 * match its revision, or the revision is unknown.  Any other negative value
 * from __vfs_getxattr() is passed through unchanged.
 */
int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
                           const struct dentry *dentry,
                           struct cpu_vfs_cap_data *cpu_caps)
{
        struct inode *inode = d_backing_inode(dentry);
        __u32 magic_etc;
        int size;
        /* @nscaps and @caps are two typed views of the same buffer, @data. */
        struct vfs_ns_cap_data data, *nscaps = &data;
        struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
        kuid_t rootkuid;
        vfsuid_t rootvfsuid;
        struct user_namespace *fs_ns;

        memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));

        if (!inode)
                return -ENODATA;

        fs_ns = inode->i_sb->s_user_ns;
        size = __vfs_getxattr((struct dentry *)dentry, inode,
                              XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
        if (size == -ENODATA || size == -EOPNOTSUPP)
                /* no data, that's ok */
                return -ENODATA;

        if (size < 0)
                return size;

        /* size is known to be non-negative here; need at least the magic word */
        if (size < sizeof(magic_etc))
                return -EINVAL;

        cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);

        /* Default root id is uid 0 of the filesystem's user namespace. */
        rootkuid = make_kuid(fs_ns, 0);
        switch (magic_etc & VFS_CAP_REVISION_MASK) {
        case VFS_CAP_REVISION_1:
                if (size != XATTR_CAPS_SZ_1)
                        return -EINVAL;
                break;
        case VFS_CAP_REVISION_2:
                if (size != XATTR_CAPS_SZ_2)
                        return -EINVAL;
                break;
        case VFS_CAP_REVISION_3:
                if (size != XATTR_CAPS_SZ_3)
                        return -EINVAL;
                /* Revision 3 carries its own root id in the xattr. */
                rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
                break;

        default:
                return -EINVAL;
        }

        rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid);
        if (!vfsuid_valid(rootvfsuid))
                return -ENODATA;

        /* Limit the caps to the mounter of the filesystem
         * or the more limited uid specified in the xattr.
         */
        if (!rootid_owns_currentns(rootvfsuid))
                return -ENODATA;

        /* Low 32 bits of each set come from the first data word. */
        cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
        cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);

        /*
         * Rev1 had just a single 32-bit word, later expanded
         * to a second one for the high bits
         */
        if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
                cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
                cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
        }

        /* Discard any bits outside CAP_VALID_MASK. */
        cpu_caps->permitted.val &= CAP_VALID_MASK;
        cpu_caps->inheritable.val &= CAP_VALID_MASK;

        cpu_caps->rootid = vfsuid_into_kuid(rootvfsuid);

        return 0;
}

/*
 * Read the on-exec-apply capability sets of an executable from its xattrs
 * and, when present, apply them to the proposed credentials that execve()
 * is building in @bprm->cred.
 *
 * The proposed permitted set is always cleared first.  It is cleared again
 * whenever a non-zero value is returned.  A missing capability xattr
 * (-ENODATA) is not an error and yields 0.
 */
static int get_file_caps(struct linux_binprm *bprm, const struct file *file,
                         bool *effective, bool *has_fcap)
{
        struct cpu_vfs_cap_data vcaps;
        int err;

        cap_clear(bprm->cred->cap_permitted);

        /*
         * The current_in_userns() check is redundant with mnt_may_suid()
         * but is kept to make explicit that capability bits are limited to
         * s_user_ns and its descendants.
         */
        if (!file_caps_enabled ||
            !mnt_may_suid(file->f_path.mnt) ||
            !current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
                return 0;

        err = get_vfs_caps_from_disk(file_mnt_idmap(file),
                                     file->f_path.dentry, &vcaps);

        /* No file capabilities: the permitted set is already empty. */
        if (err == -ENODATA)
                return 0;

        if (err == -EINVAL)
                printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
                                bprm->filename);

        if (err >= 0)
                err = bprm_caps_from_vfs_caps(&vcaps, bprm, effective,
                                              has_fcap);

        if (err)
                cap_clear(bprm->cred->cap_permitted);

        return err;
}

/* True unless SECURE_NOROOT is set in the current securebits. */
static inline bool root_privileged(void)
{
        return !issecure(SECURE_NOROOT);
}

/* True when the real uid of @cred equals @uid. */
static inline bool __is_real(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->uid, uid);
}

/* True when the effective uid of @cred equals @uid. */
static inline bool __is_eff(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->euid, uid);
}

/* True when @cred has effective uid @uid but a different real uid. */
static inline bool __is_suid(kuid_t uid, struct cred *cred)
{
        return !uid_eq(cred->uid, uid) && uid_eq(cred->euid, uid);
}

/*
 * handle_privileged_root - Handle case of privileged root
 * @bprm: The execution parameters, including the proposed creds
 * @has_fcap: Are any file capabilities set?
 * @effective: Do we have effective root privilege?
 * @root_uid: This namespace' root UID WRT initial USER namespace
 *
 * Does nothing when root has been neutered by SECURE_NOROOT.  Otherwise:
 * a set-UID-root exec of a file that also carries file capabilities only
 * triggers a warning and leaves the creds alone; a real or effective root
 * gets cap_permitted recomputed from the old bounding and inheritable sets;
 * and an effective root additionally has *@effective set.
 */
static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
                                   bool *effective, kuid_t root_uid)
{
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        const bool euid_is_root = __is_eff(root_uid, new);
        const bool uid_is_root = __is_real(root_uid, new);

        if (!root_privileged())
                return;

        /*
         * If the legacy file capability is set, then don't set privs
         * for a setuid root binary run by a non-root user.  Do set it
         * for a root user just to cause least surprise to an admin.
         */
        if (has_fcap && euid_is_root && !uid_is_root) {
                warn_setuid_and_fcaps_mixed(bprm->filename);
                return;
        }

        /*
         * To support inheritance of root-permissions and suid-root
         * executables under compatibility mode, we override the
         * capability sets for the file.
         *
         * pP' = (cap_bset & ~0) | (pI & ~0)
         */
        if (euid_is_root || uid_is_root)
                new->cap_permitted = cap_combine(old->cap_bset,
                                                 old->cap_inheritable);

        /* If only the real uid is 0, we do not set the effective bit. */
        if (euid_is_root)
                *effective = true;
}

/*
 * Capability-set comparison helpers.  Each one pastes its field name(s) onto
 * "cap_" to pick a member of the cred pointer(s) it is given.
 *
 * __cap_gained(field, target, source):
 *         negation of cap_issubset(target->cap_<field>, source->cap_<field>)
 * __cap_grew(target, source, cred):
 *         negation of cap_issubset(cred->cap_<target>, cred->cap_<source>)
 * __cap_full(field, cred):
 *         cap_issubset(CAP_FULL_SET, cred->cap_<field>)
 *
 * NOTE(review): the pointer arguments are not parenthesized in the
 * expansions, so pass plain identifiers only.
 */
#define __cap_gained(field, target, source) \
        !cap_issubset(target->cap_##field, source->cap_##field)
#define __cap_grew(target, source, cred) \
        !cap_issubset(cred->cap_##target, cred->cap_##source)
#define __cap_full(field, cred) \
        cap_issubset(CAP_FULL_SET, cred->cap_##field)

/* True when the effective uid of @new differs from the real uid of @old. */
static inline bool __is_setuid(struct cred *new, const struct cred *old)
{
        return !uid_eq(new->euid, old->uid);
}

/* True when the effective gid of @new differs from the real gid of @old. */
static inline bool __is_setgid(struct cred *new, const struct cred *old)
{
        return !gid_eq(new->egid, old->gid);
}

/*
 * Decide whether this exec needs an audit record.  Returns true if any of
 * the conditions below holds for the proposed creds @new.
 *
 * 1) Audit candidate if the new cap_effective is not a subset of the new
 *    cap_ambient.
 *
 * We do not bother to audit if 3 things are true:
 *   1) cap_effective has all caps
 *   2) we became root *OR* were already root
 *   3) root is supposed to have all caps (root_privileged(), i.e.
 *      SECURE_NOROOT is not set)
 * Since this is just a normal root execing a process.
 *
 * Number 1 above might fail if you don't have a full bset, but I think
 * that is interesting information to audit.
 *
 * A number of other conditions require logging:
 * 2) something prevented setuid root getting all caps
 * 3) non-setuid root gets fcaps
 * 4) non-setuid root gets ambient
 *
 * Conditions 3) and 4) are only considered when the new effective uid equals
 * the old real uid (!__is_setuid()).
 */
static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
                                     kuid_t root, bool has_fcap)
{
        bool ret = false;

        if ((__cap_grew(effective, ambient, new) &&
             !(__cap_full(effective, new) &&
               (__is_eff(root, new) || __is_real(root, new)) &&
               root_privileged())) ||
            (root_privileged() &&
             __is_suid(root, new) &&
             !__cap_full(effective, new)) ||
            (!__is_setuid(new, old) &&
             ((has_fcap &&
               __cap_gained(permitted, new, old)) ||
              __cap_gained(ambient, new, old))))

                ret = true;

        return ret;
}

/**
 * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
 * @bprm: The execution parameters, including the proposed creds
 * @file: The file to pull the credentials from
 *
 * Set up the proposed credentials for a new execution context being
 * constructed by execve().  The proposed creds in @bprm->cred is altered,
 * which won't take effect immediately.
 *
 * Besides the creds, this may also add PER_CLEAR_ON_SETID to
 * @bprm->per_clear and set @bprm->secureexec.
 *
 * Return: 0 if successful, -ve on error.  -EPERM is returned when the
 * ambient invariant check fails on the old or the new creds; negative values
 * from get_file_caps() and audit_log_bprm_fcaps() are passed through.
 */
int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
{
        /* Process setpcap binaries and capabilities for uid 0 */
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        bool effective = false, has_fcap = false, is_setid;
        int ret;
        kuid_t root_uid;

        if (WARN_ON(!cap_ambient_invariant_ok(old)))
                return -EPERM;

        /* Start from the file's capability xattr, if any. */
        ret = get_file_caps(bprm, file, &effective, &has_fcap);
        if (ret < 0)
                return ret;

        /* uid 0 as mapped in the new creds' user namespace */
        root_uid = make_kuid(new->user_ns, 0);

        handle_privileged_root(bprm, has_fcap, &effective, root_uid);

        /* if we have fs caps, clear dangerous personality flags */
        if (__cap_gained(permitted, new, old))
                bprm->per_clear |= PER_CLEAR_ON_SETID;

        /* Don't let someone trace a set[ug]id/setpcap binary with the revised
         * credentials unless they have the appropriate permit.
         *
         * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
         */
        is_setid = __is_setuid(new, old) || __is_setgid(new, old);

        if ((is_setid || __cap_gained(permitted, new, old)) &&
            ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
             !ptracer_capable(current, new->user_ns))) {
                /* downgrade; they get no more than they had, and maybe less */
                if (!ns_capable(new->user_ns, CAP_SETUID) ||
                    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
                        new->euid = new->uid;
                        new->egid = new->gid;
                }
                new->cap_permitted = cap_intersect(new->cap_permitted,
                                                   old->cap_permitted);
        }

        /* Saved and filesystem ids follow the (possibly downgraded) effective ids. */
        new->suid = new->fsuid = new->euid;
        new->sgid = new->fsgid = new->egid;

        /* File caps or setid cancels ambient. */
        if (has_fcap || is_setid)
                cap_clear(new->cap_ambient);

        /*
         * Now that we've computed pA', update pP' to give:
         *   pP' = (X & fP) | (pI & fI) | pA'
         */
        new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);

        /*
         * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
         * this is the same as pE' = (fE ? pP' : 0) | pA'.
         */
        if (effective)
                new->cap_effective = new->cap_permitted;
        else
                new->cap_effective = new->cap_ambient;

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Emit an audit record when nonroot_raised_pE() flags this exec. */
        if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
                ret = audit_log_bprm_fcaps(bprm, new, old);
                if (ret < 0)
                        return ret;
        }

        /* SECURE_KEEP_CAPS never survives an exec. */
        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Check for privilege-elevated exec. */
        if (is_setid ||
            (!__is_real(root_uid, new) &&
             (effective ||
              __cap_grew(permitted, ambient, new))))
                bprm->secureexec = 1;

        return 0;
}

/**
 * cap_inode_setxattr - Determine whether an xattr may be altered
 * @dentry: The inode/dentry being altered
 * @name: The name of the xattr to be changed
 * @value: The value that the xattr will be changed to
 * @size: The size of value
 * @flags: The replacement flag
 *
 * Only xattrs in the security namespace are policed here, so that they
 * cannot be set or updated by those who aren't privileged to do so.  Of
 * @dentry, @name, @value, @size and @flags, only @dentry and @name are
 * examined.
 *
 * Return: 0 if permission is granted, -EPERM if a security xattr other than
 * XATTR_NAME_CAPS is targeted without CAP_SYS_ADMIN in the superblock's user
 * namespace.
 */
int cap_inode_setxattr(struct dentry *dentry, const char *name,
                       const void *value, size_t size, int flags)
{
        struct user_namespace *fs_userns = dentry->d_sb->s_user_ns;

        /*
         * Non-security xattrs are ignored.  For XATTR_NAME_CAPS the check
         * will be done in cap_convert_nscap(), called by setxattr().
         */
        if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
            !strcmp(name, XATTR_NAME_CAPS))
                return 0;

        return ns_capable(fs_userns, CAP_SYS_ADMIN) ? 0 : -EPERM;
}

/**
 * cap_inode_removexattr - Determine whether an xattr may be removed
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        The inode/dentry being altered
 * @name:        The name of the xattr to be changed
 *
 * This is used to make sure security xattrs don't get removed by those who
 * aren't privileged to remove them.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 if permission is granted, -EPERM if it is denied, -EINVAL if
 * XATTR_NAME_CAPS is targeted on a dentry without a backing inode.
 */
int cap_inode_removexattr(struct mnt_idmap *idmap,
                          struct dentry *dentry, const char *name)
{
        struct user_namespace *fs_userns = dentry->d_sb->s_user_ns;
        struct inode *inode;

        /* Ignore non-security xattrs */
        if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
                return 0;

        /* Any security xattr other than the capability one needs CAP_SYS_ADMIN. */
        if (strcmp(name, XATTR_NAME_CAPS))
                return ns_capable(fs_userns, CAP_SYS_ADMIN) ? 0 : -EPERM;

        /* security.capability gets namespaced */
        inode = d_backing_inode(dentry);
        if (!inode)
                return -EINVAL;

        return capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP) ?
                0 : -EPERM;
}

/*
 * cap_emulate_setxuid() fixes the effective / permitted capabilities of
 * a process after a call to setuid, setreuid, or setresuid.
 *
 *  1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
 *  {r,e,s}uid != 0, the permitted and effective capabilities are
 *  cleared.
 *
 *  2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
 *  capabilities of the process are cleared.
 *
 *  3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
 *  capabilities are set to the permitted capabilities.
 *
 *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
 *  never happen.
 *
 *  -astor
 *
 * cevans - New behaviour, Oct '99
 * A process may, via prctl(), elect to keep its capabilities when it
 * calls setuid() and switches away from uid==0. Both permitted and
 * effective sets will be retained.
 * Without this change, it was impossible for a daemon to drop only some
 * of its privilege. The call to setuid(!=0) would drop all privileges!
 * Keeping uid 0 is not an option because uid 0 owns too many vital
 * files..
 * Thanks to Olaf Kirch and Peter Benie for spotting this.
 */
static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
{
        kuid_t root_uid = make_kuid(old->user_ns, 0);

        if ((uid_eq(old->uid, root_uid) ||
             uid_eq(old->euid, root_uid) ||
             uid_eq(old->suid, root_uid)) &&
            (!uid_eq(new->uid, root_uid) &&
             !uid_eq(new->euid, root_uid) &&
             !uid_eq(new->suid, root_uid))) {
                if (!issecure(SECURE_KEEP_CAPS)) {
                        cap_clear(new->cap_permitted);
                        cap_clear(new->cap_effective);
                }

                /*
                 * Pre-ambient programs expect setresuid to nonroot followed
                 * by exec to drop capabilities.  We should make sure that
                 * this remains the case.
                 */
                cap_clear(new->cap_ambient);
        }
        if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
                cap_clear(new->cap_effective);
        if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid))
                new->cap_effective = new->cap_permitted;
}

/**
 * cap_task_fix_setuid - Fix up the results of setuid() call
 * @new: The proposed credentials
 * @old: The current task's current credentials
 * @flags: Indications of what has changed
 *
 * Fix up the results of setuid() call before the credential changes are
 * actually applied.
 *
 * Return: 0 to grant the changes, -ve to deny them.
 */
int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
{
        switch (flags) {
        case LSM_SETID_RE:
        case LSM_SETID_ID:
        case LSM_SETID_RES:
                /* juggle the capabilities to follow [RES]UID changes unless
                 * otherwise suppressed */
                if (!issecure(SECURE_NO_SETUID_FIXUP))
                        cap_emulate_setxuid(new, old);
                break;

        case LSM_SETID_FS:
                /* juggle the capabilities to follow FSUID changes, unless
                 * otherwise suppressed
                 *
                 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
                 *          if not, we might be a bit too harsh here.
                 */
                if (!issecure(SECURE_NO_SETUID_FIXUP)) {
                        kuid_t root_uid = make_kuid(old->user_ns, 0);
                        if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid))
                                new->cap_effective =
                                        cap_drop_fs_set(new->cap_effective);

                        if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid))
                                new->cap_effective =
                                        cap_raise_fs_set(new->cap_effective,
                                                         new->cap_permitted);
                }
                break;

        default:
                return -EINVAL;
        }

        return 0;
}

/*
 * Rationale: code calling task_setscheduler, task_setioprio, and
 * task_setnice, assumes that
 *   . if capable(cap_sys_nice), then those actions should be allowed
 *   . if not capable(cap_sys_nice), but acting on your own processes,
 *           then those actions should be allowed
 * This is insufficient now since you can call code without suid, but
 * yet with increased caps.
 * So we check for increased caps on the target process.
 *
 * Returns -EPERM when @p's permitted set is not a subset of the current
 * task's permitted set and the current task lacks CAP_SYS_NICE in @p's
 * user namespace; returns 0 otherwise.
 */
static int cap_safe_nice(struct task_struct *p)
{
        int err = 0;

        rcu_read_lock();
        if (!cap_issubset(__task_cred(p)->cap_permitted,
                          current_cred()->cap_permitted) &&
            !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
                err = -EPERM;
        rcu_read_unlock();

        return err;
}

/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Determine if the requested scheduler policy change is permitted for the
 * specified task.  The decision is delegated entirely to cap_safe_nice().
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setscheduler(struct task_struct *p)
{
        return cap_safe_nice(p);
}

/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set (not examined here)
 *
 * Determine if the requested I/O priority change is permitted for the specified
 * task.  The decision is delegated entirely to cap_safe_nice() and depends
 * only on @p.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
        return cap_safe_nice(p);
}

/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set (not examined here)
 *
 * Determine if the requested task priority change is permitted for the
 * specified task.  The decision is delegated entirely to cap_safe_nice() and
 * depends only on @p.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
        return cap_safe_nice(p);
}

/*
 * Implement PR_CAPBSET_DROP: remove capability @cap from the current task's
 * bounding set.
 *
 * Returns -EPERM without CAP_SETPCAP in the current user namespace, -EINVAL
 * for an invalid capability number, -ENOMEM if new credentials cannot be
 * prepared, and otherwise whatever commit_creds() returns.  The privilege
 * check comes before the validity check.
 */
static int cap_prctl_drop(unsigned long cap)
{
        struct cred *cred;

        if (!ns_capable(current_user_ns(), CAP_SETPCAP))
                return -EPERM;

        if (!cap_valid(cap))
                return -EINVAL;

        cred = prepare_creds();
        if (!cred)
                return -ENOMEM;

        cap_lower(cred->cap_bset, cap);

        return commit_creds(cred);
}

/**
 * cap_task_prctl - Implement process control functions for this security module
 * @option: The process control function requested
 * @arg2: The argument data for this function
 * @arg3: The argument data for this function
 * @arg4: The argument data for this function
 * @arg5: The argument data for this function
 *
 * Allow process control functions (sys_prctl()) to alter capabilities; may
 * also deny access to other functions not otherwise implemented here.
 *
 * Options handled here: PR_CAPBSET_READ, PR_CAPBSET_DROP, PR_SET_SECUREBITS,
 * PR_GET_SECUREBITS, PR_GET_KEEPCAPS, PR_SET_KEEPCAPS and PR_CAP_AMBIENT.
 * Every option that changes state does so on a fresh copy from
 * prepare_creds() and returns the result of commit_creds().
 *
 * Return: 0 or +ve on success, -ENOSYS if this function is not implemented
 * here, other -ve on error.  If -ENOSYS is returned, sys_prctl() and other LSM
 * modules will consider performing the function.
 */
int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                   unsigned long arg4, unsigned long arg5)
{
        const struct cred *old = current_cred();
        struct cred *new;

        switch (option) {
        case PR_CAPBSET_READ:
                /* 1 if @arg2 is in the bounding set, 0 if not */
                if (!cap_valid(arg2))
                        return -EINVAL;
                return !!cap_raised(old->cap_bset, arg2);

        case PR_CAPBSET_DROP:
                return cap_prctl_drop(arg2);

        /*
         * The next four prctl's remain to assist with transitioning a
         * system from legacy UID=0 based privilege (when filesystem
         * capabilities are not in use) to a system using filesystem
         * capabilities only - as the POSIX.1e draft intended.
         *
         * Note:
         *
         *  PR_SET_SECUREBITS =
         *      issecure_mask(SECURE_KEEP_CAPS_LOCKED)
         *    | issecure_mask(SECURE_NOROOT)
         *    | issecure_mask(SECURE_NOROOT_LOCKED)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
         *
         * will ensure that the current process and all of its
         * children will be locked into a pure
         * capability-based-privilege environment.
         */
        case PR_SET_SECUREBITS:
                if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
                     & (old->securebits ^ arg2))                        /*[1]*/
                    || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))        /*[2]*/
                    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))        /*[3]*/
                    || (cap_capable(current_cred(),
                                    current_cred()->user_ns,
                                    CAP_SETPCAP,
                                    CAP_OPT_NONE) != 0)                        /*[4]*/
                        /*
                         * [1] no changing of bits that are locked
                         * [2] no unlocking of locks
                         * [3] no setting of unsupported bits
                         * [4] doing anything requires privilege (go read about
                         *     the "sendmail capabilities bug")
                         */
                    )
                        /* cannot change a locked bit */
                        return -EPERM;

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                new->securebits = arg2;
                return commit_creds(new);

        case PR_GET_SECUREBITS:
                return old->securebits;

        case PR_GET_KEEPCAPS:
                return !!issecure(SECURE_KEEP_CAPS);

        case PR_SET_KEEPCAPS:
                if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
                        return -EINVAL;
                if (issecure(SECURE_KEEP_CAPS_LOCKED))
                        return -EPERM;

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                if (arg2)
                        new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
                else
                        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
                return commit_creds(new);

        case PR_CAP_AMBIENT:
                /* @arg2 selects the sub-operation, @arg3 names the capability. */
                if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
                        /* CLEAR_ALL takes no further arguments */
                        if (arg3 | arg4 | arg5)
                                return -EINVAL;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        cap_clear(new->cap_ambient);
                        return commit_creds(new);
                }

                /* Remaining sub-operations need a valid cap and zero @arg4/@arg5. */
                if (((!cap_valid(arg3)) | arg4 | arg5))
                        return -EINVAL;

                if (arg2 == PR_CAP_AMBIENT_IS_SET) {
                        return !!cap_raised(current_cred()->cap_ambient, arg3);
                } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
                           arg2 != PR_CAP_AMBIENT_LOWER) {
                        return -EINVAL;
                } else {
                        /*
                         * Raising requires the capability to be in both the
                         * permitted and inheritable sets, and
                         * SECURE_NO_CAP_AMBIENT_RAISE to be clear.
                         */
                        if (arg2 == PR_CAP_AMBIENT_RAISE &&
                            (!cap_raised(current_cred()->cap_permitted, arg3) ||
                             !cap_raised(current_cred()->cap_inheritable,
                                         arg3) ||
                             issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
                                return -EPERM;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        if (arg2 == PR_CAP_AMBIENT_RAISE)
                                cap_raise(new->cap_ambient, arg3);
                        else
                                cap_lower(new->cap_ambient, arg3);
                        return commit_creds(new);
                }

        default:
                /* No functionality available - continue with default */
                return -ENOSYS;
        }
}

/**
 * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
 * @mm: The VM space in which the new mapping is to be made
 * @pages: The size of the mapping
 *
 * Report whether the current task holds CAP_SYS_ADMIN in the initial user
 * namespace.  The check is made with CAP_OPT_NOAUDIT; neither @mm nor @pages
 * is examined.
 *
 * Return: 1 if permission is granted, 0 if not.
 */
int cap_vm_enough_memory(struct mm_struct *mm, long pages)
{
        return cap_capable(current_cred(), &init_user_ns,
                           CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0;
}

/**
 * cap_mmap_addr - check if able to map given addr
 * @addr: address attempting to be mapped
 *
 * If the process is attempting to map memory below dac_mmap_min_addr they need
 * CAP_SYS_RAWIO in the initial user namespace.  When such a low mapping is
 * allowed, PF_SUPERPRIV is set on the current task.
 *
 * Return: 0 if this mapping should be allowed, otherwise the error returned
 * by cap_capable().
 */
int cap_mmap_addr(unsigned long addr)
{
        int err;

        /* Addresses at or above the minimum need no privilege. */
        if (addr >= dac_mmap_min_addr)
                return 0;

        err = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
                          CAP_OPT_NONE);

        /* set PF_SUPERPRIV if it turns out we allow the low mmap */
        if (!err)
                current->flags |= PF_SUPERPRIV;

        return err;
}

/*
 * mmap_file hook of the capability module: imposes no restriction and always
 * returns 0; none of the arguments is examined.
 */
int cap_mmap_file(struct file *file, unsigned long reqprot,
                  unsigned long prot, unsigned long flags)
{
        return 0;
}

#ifdef CONFIG_SECURITY

/* Identity (name and id) passed to security_add_hooks() for this module. */
static const struct lsm_id capability_lsmid = {
        .name = "capability",
        .id = LSM_ID_CAPABILITY,
};

/*
 * Table binding each LSM hook name to the cap_*() function that implements
 * it; registered as a whole by capability_init().
 */
static struct security_hook_list capability_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(capable, cap_capable),
        LSM_HOOK_INIT(settime, cap_settime),
        LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
        LSM_HOOK_INIT(capget, cap_capget),
        LSM_HOOK_INIT(capset, cap_capset),
        LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
        LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
        LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
        LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
        LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
        LSM_HOOK_INIT(mmap_file, cap_mmap_file),
        LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
        LSM_HOOK_INIT(task_prctl, cap_task_prctl),
        LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
        LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
        LSM_HOOK_INIT(task_setnice, cap_task_setnice),
        LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
};

/*
 * Register every entry of capability_hooks under capability_lsmid.
 * Always returns 0.
 */
static int __init capability_init(void)
{
        security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
                           &capability_lsmid);
        return 0;
}

/*
 * LSM descriptor for the capability module: declared with LSM_ORDER_FIRST
 * and initialised through capability_init().
 */
DEFINE_LSM(capability) = {
        .name = "capability",
        .order = LSM_ORDER_FIRST,
        .init = capability_init,
};

#endif /* CONFIG_SECURITY */















































































































































































































   48 













































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_JUMP_LABEL_H
#define _LINUX_JUMP_LABEL_H

/*
 * Jump label support
 *
 * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com>
 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
 *
 * DEPRECATED API:
 *
 * The use of 'struct static_key' directly, is now DEPRECATED. In addition
 * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following:
 *
 * struct static_key false = STATIC_KEY_INIT_FALSE;
 * struct static_key true = STATIC_KEY_INIT_TRUE;
 * static_key_true()
 * static_key_false()
 *
 * The updated API replacements are:
 *
 * DEFINE_STATIC_KEY_TRUE(key);
 * DEFINE_STATIC_KEY_FALSE(key);
 * DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count);
 * DEFINE_STATIC_KEY_ARRAY_FALSE(keys, count);
 * static_branch_likely()
 * static_branch_unlikely()
 *
 * Jump labels provide an interface to generate dynamic branches using
 * self-modifying code. Assuming toolchain and architecture support, if we
 * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)",
 * an "if (static_branch_unlikely(&key))" statement is an unconditional branch
 * (which defaults to false - and the true block is placed out of line).
 * Similarly, we can define an initially true key via
 * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same
 * "if (static_branch_unlikely(&key))", in which case we will generate an
 * unconditional branch to the out-of-line true branch. Keys that are
 * initially true or false can be used in both static_branch_unlikely()
 * and static_branch_likely() statements.
 *
 * At runtime we can change the branch target by setting the key
 * to true via a call to static_branch_enable(), or false using
 * static_branch_disable(). If the direction of the branch is switched by
 * these calls then we run-time modify the branch target via a
 * no-op -> jump or jump -> no-op conversion. For example, for an
 * initially false key that is used in an "if (static_branch_unlikely(&key))"
 * statement, setting the key to true requires us to patch in a jump
 * to the out-of-line true branch.
 *
 * In addition to static_branch_{enable,disable}, we can also reference count
 * the key or branch direction via static_branch_{inc,dec}. Thus,
 * static_branch_inc() can be thought of as a 'make more true' and
 * static_branch_dec() as a 'make more false'.
 *
 * Since this relies on modifying code, the branch modifying functions
 * must be considered absolute slow paths (machine wide synchronization etc.).
 * OTOH, since the affected branches are unconditional, their runtime overhead
 * will be absolutely minimal, esp. in the default (off) case where the total
 * effect is a single NOP of appropriate size. The on case will patch in a jump
 * to the out-of-line block.
 *
 * When the control is directly exposed to userspace, it is prudent to delay the
 * decrement to avoid high frequency code modifications which can (and do)
 * cause significant performance degradation. Struct static_key_deferred and
 * static_key_slow_dec_deferred() provide for this.
 *
 * Lacking toolchain and/or architecture support, static keys fall back to a
 * simple conditional branch.
 *
 * Additional babbling in: Documentation/staging/static-keys.rst
 */

#ifndef __ASSEMBLY__

#include <linux/types.h>
#include <linux/compiler.h>

extern bool static_key_initialized;

/*
 * Emit a WARN() naming the calling function and the key whenever a static
 * key operation runs while static_key_initialized is still false.
 */
#define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized,                      \
                                    "%s(): static key '%pS' used before call to jump_label_init()", \
                                    __func__, (key))

/*
 * Core static key object: an atomic enable count and, when CONFIG_JUMP_LABEL
 * is set, an anonymous union whose low bits carry the flags described in the
 * note inside the struct.
 */
struct static_key {
        atomic_t enabled;
#ifdef CONFIG_JUMP_LABEL
/*
 * Note:
 *   To make anonymous unions work with old compilers, the static
 *   initialization of them requires brackets. This creates a dependency
 *   on the order of the struct with the initializers. If any fields
 *   are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need
 *   to be modified.
 *
 * bit 0 => 1 if key is initially true
 *            0 if initially false
 * bit 1 => 1 if points to struct static_key_mod
 *            0 if points to struct jump_entry
 */
        union {
                unsigned long type;
                struct jump_entry *entries;
                struct static_key_mod *next;
        };
#endif        /* CONFIG_JUMP_LABEL */
};

#endif /* __ASSEMBLY__ */

#ifdef CONFIG_JUMP_LABEL
#include <asm/jump_label.h>

#ifndef __ASSEMBLY__
#ifdef CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE

/*
 * Jump table entry used with CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE.
 *
 * @code and @target are 32-bit offsets that the accessors in this header add
 * to the address of the field itself. @key is handled the same way after its
 * two low bits are masked off; those bits are read separately as flags.
 */
struct jump_entry {
        s32 code;
        s32 target;
        long key;        // key may be far away from the core kernel under KASLR
};

/* Turn the self-relative 'code' offset of @entry into an absolute address. */
static inline unsigned long jump_entry_code(const struct jump_entry *entry)
{
        unsigned long field_addr = (unsigned long)&entry->code;

        return field_addr + entry->code;
}

/* Turn the self-relative 'target' offset of @entry into an absolute address. */
static inline unsigned long jump_entry_target(const struct jump_entry *entry)
{
        unsigned long field_addr = (unsigned long)&entry->target;

        return field_addr + entry->target;
}

/*
 * Return the static key @entry refers to: strip the two low flag bits from
 * the stored offset and add the result to the address of the 'key' field.
 */
static inline struct static_key *jump_entry_key(const struct jump_entry *entry)
{
        unsigned long field_addr = (unsigned long)&entry->key;
        long rel = entry->key & ~3L;

        return (struct static_key *)(field_addr + rel);
}

#else

/* Non-relative layout: the 'code' field already holds the value to return. */
static inline unsigned long jump_entry_code(const struct jump_entry *entry)
{
        unsigned long addr = entry->code;

        return addr;
}

/* Non-relative layout: the 'target' field already holds the value to return. */
static inline unsigned long jump_entry_target(const struct jump_entry *entry)
{
        unsigned long addr = entry->target;

        return addr;
}

/*
 * Non-relative layout: the 'key' field is the key's address with flag bits
 * in its two low bits; mask them off and return the pointer.
 */
static inline struct static_key *jump_entry_key(const struct jump_entry *entry)
{
        unsigned long raw = (unsigned long)entry->key;

        return (struct static_key *)(raw & ~3UL);
}

#endif

/* Report whether bit 0 of @entry's key word (the branch flag) is set. */
static inline bool jump_entry_is_branch(const struct jump_entry *entry)
{
        unsigned long flags = (unsigned long)entry->key;

        return (flags & 1UL) != 0;
}

/* Report whether bit 1 of @entry's key word (the init flag) is set. */
static inline bool jump_entry_is_init(const struct jump_entry *entry)
{
        unsigned long flags = (unsigned long)entry->key;

        return (flags & 2UL) != 0;
}

/* Set (@set true) or clear (@set false) bit 1, the init flag, in @entry->key. */
static inline void jump_entry_set_init(struct jump_entry *entry, bool set)
{
        if (!set) {
                entry->key &= ~2;
                return;
        }

        entry->key |= 2;
}

/*
 * Return the size for @entry: the constant JUMP_LABEL_NOP_SIZE when that
 * macro is defined, otherwise the result of arch_jump_entry_size(entry).
 */
static inline int jump_entry_size(struct jump_entry *entry)
{
#ifdef JUMP_LABEL_NOP_SIZE
        return JUMP_LABEL_NOP_SIZE;
#else
        return arch_jump_entry_size(entry);
#endif
}

#endif
#endif

#ifndef __ASSEMBLY__

/*
 * The two instruction kinds named in this header's logic table: a no-op
 * (value 0) or a jump (value 1).
 */
enum jump_label_type {
        JUMP_LABEL_NOP = 0,
        JUMP_LABEL_JMP,
};

struct module;

#ifdef CONFIG_JUMP_LABEL

/*
 * Values for the low two bits of the union in struct static_key.
 * FALSE/TRUE occupy bit 0 (initial state of the key); LINKED is bit 1,
 * which presumably matches the "points to struct static_key_mod" flag
 * documented on the struct - confirm against kernel/jump_label.c.
 * MASK covers both bits.
 */
#define JUMP_TYPE_FALSE                0UL
#define JUMP_TYPE_TRUE                1UL
#define JUMP_TYPE_LINKED        2UL
#define JUMP_TYPE_MASK                3UL

/*
 * DEPRECATED (see the header comment of this file). Returns the result of
 * arch_static_branch(key, false) unchanged.
 */
static __always_inline bool static_key_false(struct static_key *key)
{
        return arch_static_branch(key, false);
}

/*
 * DEPRECATED (see the header comment of this file). Returns the logical
 * negation of arch_static_branch(key, true).
 */
static __always_inline bool static_key_true(struct static_key *key)
{
        return !arch_static_branch(key, true);
}

extern struct jump_entry __start___jump_table[];
extern struct jump_entry __stop___jump_table[];

extern void jump_label_init(void);
extern void jump_label_init_ro(void);
extern void jump_label_lock(void);
extern void jump_label_unlock(void);
extern void arch_jump_label_transform(struct jump_entry *entry,
                                      enum jump_label_type type);
extern bool arch_jump_label_transform_queue(struct jump_entry *entry,
                                            enum jump_label_type type);
extern void arch_jump_label_transform_apply(void);
extern int jump_label_text_reserved(void *start, void *end);
extern bool static_key_slow_inc(struct static_key *key);
extern bool static_key_fast_inc_not_disabled(struct static_key *key);
extern void static_key_slow_dec(struct static_key *key);
extern bool static_key_slow_inc_cpuslocked(struct static_key *key);
extern void static_key_slow_dec_cpuslocked(struct static_key *key);
extern int static_key_count(struct static_key *key);
extern void static_key_enable(struct static_key *key);
extern void static_key_disable(struct static_key *key);
extern void static_key_enable_cpuslocked(struct static_key *key);
extern void static_key_disable_cpuslocked(struct static_key *key);
extern enum jump_label_type jump_label_init_type(struct jump_entry *entry);

/*
 * We should be using ATOMIC_INIT() for initializing .enabled, but
 * the inclusion of atomic.h is problematic for inclusion of jump_label.h
 * in 'low-level' headers. Thus, we are initializing .enabled with a
 * raw value, but have added a BUILD_BUG_ON() to catch any issues in
 * jump_label_init() see: kernel/jump_label.c.
 */
/*
 * Static initializers for struct static_key with CONFIG_JUMP_LABEL.
 * The second, unnamed brace pair initializes the anonymous union by
 * position, so it depends on the member order of struct static_key.
 */
#define STATIC_KEY_INIT_TRUE                                        \
        { .enabled = { 1 },                                        \
          { .type = JUMP_TYPE_TRUE } }
#define STATIC_KEY_INIT_FALSE                                        \
        { .enabled = { 0 },                                        \
          { .type = JUMP_TYPE_FALSE } }

#else  /* !CONFIG_JUMP_LABEL */

#include <linux/atomic.h>
#include <linux/bug.h>

/* Fallback (no CONFIG_JUMP_LABEL): return the current value of key->enabled. */
static __always_inline int static_key_count(struct static_key *key)
{
        int count = raw_atomic_read(&key->enabled);

        return count;
}

/*
 * Fallback (no CONFIG_JUMP_LABEL): only sets static_key_initialized, the
 * flag that STATIC_KEY_CHECK_USE() tests.
 */
static __always_inline void jump_label_init(void)
{
        static_key_initialized = true;
}

/* Fallback (no CONFIG_JUMP_LABEL): intentionally empty. */
static __always_inline void jump_label_init_ro(void) { }

/*
 * Fallback (no CONFIG_JUMP_LABEL): true when the key's count is positive,
 * with the compiler hinted that this is the unlikely outcome.
 */
static __always_inline bool static_key_false(struct static_key *key)
{
        int count = static_key_count(key);

        if (unlikely_notrace(count > 0))
                return true;

        return false;
}

/*
 * Fallback (no CONFIG_JUMP_LABEL): true when the key's count is positive,
 * with the compiler hinted that this is the likely outcome.
 */
static __always_inline bool static_key_true(struct static_key *key)
{
        int count = static_key_count(key);

        if (likely_notrace(count > 0))
                return true;

        return false;
}

/*
 * Fallback (no CONFIG_JUMP_LABEL): atomically increment key->enabled.
 *
 * Returns false without modifying the key when the current count is
 * negative or when incrementing it would produce a negative value;
 * returns true once the compare-and-exchange succeeds.
 */
static inline bool static_key_fast_inc_not_disabled(struct static_key *key)
{
        int cur;

        STATIC_KEY_CHECK_USE(key);

        /*
         * Prevent key->enabled getting negative to follow the same semantics
         * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment.
         */
        cur = atomic_read(&key->enabled);
        for (;;) {
                if (cur < 0 || (cur + 1) < 0)
                        return false;

                /* On failure atomic_try_cmpxchg() refreshes 'cur'; retry. */
                if (likely(atomic_try_cmpxchg(&key->enabled, &cur, cur + 1)))
                        return true;
        }
}
/* Fallback: the slow increment is the same operation as the fast one. */
#define static_key_slow_inc(key)        static_key_fast_inc_not_disabled(key)

/*
 * Fallback (no CONFIG_JUMP_LABEL): atomically decrement key->enabled.
 * No lower bound is checked here.
 */
static inline void static_key_slow_dec(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);
        atomic_dec(&key->enabled);
}

/* Fallback: the _cpuslocked variants are plain aliases of the normal ones. */
#define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key)
#define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key)

/*
 * Fallback (no CONFIG_JUMP_LABEL): always returns 0, whatever range is
 * passed in.
 */
static inline int jump_label_text_reserved(void *start, void *end)
{
        (void)start;
        (void)end;

        return 0;
}

/* Fallback (no CONFIG_JUMP_LABEL): lock and unlock are empty functions. */
static inline void jump_label_lock(void) {}
static inline void jump_label_unlock(void) {}

/*
 * Fallback (no CONFIG_JUMP_LABEL): set key->enabled to 1 if it is currently
 * 0. If it is already non-zero, leave it alone and warn once when the value
 * is anything other than 1.
 */
static inline void static_key_enable(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);

        if (atomic_read(&key->enabled) == 0) {
                atomic_set(&key->enabled, 1);
                return;
        }

        WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
}

/*
 * Fallback (no CONFIG_JUMP_LABEL): set key->enabled to 0 if it is currently
 * 1. For any other value, leave it alone and warn once unless the value is
 * already 0.
 */
static inline void static_key_disable(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);

        if (atomic_read(&key->enabled) == 1) {
                atomic_set(&key->enabled, 0);
                return;
        }

        WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
}

/* Fallback: the _cpuslocked enable/disable variants alias the normal ones. */
#define static_key_enable_cpuslocked(k)                static_key_enable((k))
#define static_key_disable_cpuslocked(k)        static_key_disable((k))

/* Fallback initializers: only the 'enabled' counter exists in this build. */
#define STATIC_KEY_INIT_TRUE        { .enabled = ATOMIC_INIT(1) }
#define STATIC_KEY_INIT_FALSE        { .enabled = ATOMIC_INIT(0) }

#endif        /* CONFIG_JUMP_LABEL */

#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
#define jump_label_enabled static_key_enabled

/* -------------------------------------------------------------------------- */

/*
 * Two type wrappers around static_key, such that we can use compile time
 * type differentiation to emit the right code.
 *
 * All the below code is macros in order to play type games.
 */

/*
 * Wrapper type for keys defined with DEFINE_STATIC_KEY_TRUE(); the distinct
 * type lets the static_branch_*() macros pick code at compile time.
 */
struct static_key_true {
        struct static_key key;
};

/*
 * Wrapper type for keys defined with DEFINE_STATIC_KEY_FALSE(); the distinct
 * type lets the static_branch_*() macros pick code at compile time.
 */
struct static_key_false {
        struct static_key key;
};

/* Compound-literal initializers for the two wrapper types. */
#define STATIC_KEY_TRUE_INIT  (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE,  }
#define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, }

/* Define a key named @name that starts out true. */
#define DEFINE_STATIC_KEY_TRUE(name)        \
        struct static_key_true name = STATIC_KEY_TRUE_INIT

/* Same as DEFINE_STATIC_KEY_TRUE(), additionally marked __ro_after_init. */
#define DEFINE_STATIC_KEY_TRUE_RO(name)        \
        struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT

/* Extern declaration matching DEFINE_STATIC_KEY_TRUE(). */
#define DECLARE_STATIC_KEY_TRUE(name)        \
        extern struct static_key_true name

/* Define a key named @name that starts out false. */
#define DEFINE_STATIC_KEY_FALSE(name)        \
        struct static_key_false name = STATIC_KEY_FALSE_INIT

/* Same as DEFINE_STATIC_KEY_FALSE(), additionally marked __ro_after_init. */
#define DEFINE_STATIC_KEY_FALSE_RO(name)        \
        struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT

/* Extern declaration matching DEFINE_STATIC_KEY_FALSE(). */
#define DECLARE_STATIC_KEY_FALSE(name)        \
        extern struct static_key_false name

/*
 * Define an array of @count keys, every element initialized true. Uses the
 * GNU range designator [first ... last].
 */
#define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count)                \
        struct static_key_true name[count] = {                        \
                [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT,        \
        }

/*
 * Define an array of @count keys, every element initialized false. Uses the
 * GNU range designator [first ... last].
 */
#define DEFINE_STATIC_KEY_ARRAY_FALSE(name, count)                \
        struct static_key_false name[count] = {                        \
                [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT,        \
        }

/*
 * Config-selected definitions: IS_ENABLED(cfg) is pasted onto the helper
 * prefix, so a value of 1 selects the TRUE flavour and 0 the FALSE flavour.
 */
#define _DEFINE_STATIC_KEY_1(name)        DEFINE_STATIC_KEY_TRUE(name)
#define _DEFINE_STATIC_KEY_0(name)        DEFINE_STATIC_KEY_FALSE(name)
#define DEFINE_STATIC_KEY_MAYBE(cfg, name)                        \
        __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name)

/* As above, using the __ro_after_init variants. */
#define _DEFINE_STATIC_KEY_RO_1(name)        DEFINE_STATIC_KEY_TRUE_RO(name)
#define _DEFINE_STATIC_KEY_RO_0(name)        DEFINE_STATIC_KEY_FALSE_RO(name)
#define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name)                        \
        __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name)

/* As above, producing the matching extern declaration. */
#define _DECLARE_STATIC_KEY_1(name)        DECLARE_STATIC_KEY_TRUE(name)
#define _DECLARE_STATIC_KEY_0(name)        DECLARE_STATIC_KEY_FALSE(name)
#define DECLARE_STATIC_KEY_MAYBE(cfg, name)                        \
        __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name)

extern bool ____wrong_branch_error(void);

/*
 * Evaluate to true when the key pointed to by @x has a positive count.
 * @x must point to struct static_key, struct static_key_true or
 * struct static_key_false; for any other pointee type the expansion
 * contains a call to ____wrong_branch_error().
 */
#define static_key_enabled(x)                                                        \
({                                                                                \
        if (!__builtin_types_compatible_p(typeof(*x), struct static_key) &&        \
            !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\
            !__builtin_types_compatible_p(typeof(*x), struct static_key_false))        \
                ____wrong_branch_error();                                        \
        static_key_count((struct static_key *)x) > 0;                                \
})

#ifdef CONFIG_JUMP_LABEL

/*
 * Combine the right initial value (type) with the right branch order
 * to generate the desired result.
 *
 *
 * type\branch|        likely (1)              |        unlikely (0)
 * -----------+-----------------------+------------------
 *            |                       |
 *  true (1)  |           ...                      |           ...
 *            |    NOP                      |           JMP L
 *            |    <br-stmts>              |        1: ...
 *            |        L: ...                      |
 *            |                              |
 *            |                              |        L: <br-stmts>
 *            |                              |           jmp 1b
 *            |                       |
 * -----------+-----------------------+------------------
 *            |                       |
 *  false (0) |           ...                      |           ...
 *            |    JMP L              |           NOP
 *            |    <br-stmts>              |        1: ...
 *            |        L: ...                      |
 *            |                              |
 *            |                              |        L: <br-stmts>
 *            |                              |           jmp 1b
 *            |                       |
 * -----------+-----------------------+------------------
 *
 * The initial value is encoded in the LSB of static_key::entries,
 * type: 0 = false, 1 = true.
 *
 * The branch type is encoded in the LSB of jump_entry::key,
 * branch: 0 = unlikely, 1 = likely.
 *
 * This gives the following logic table:
 *
 *        enabled        type        branch          instruction
 * -----------------------------+-----------
 *        0        0        0        | NOP
 *        0        0        1        | JMP
 *        0        1        0        | NOP
 *        0        1        1        | JMP
 *
 *        1        0        0        | JMP
 *        1        0        1        | NOP
 *        1        1        0        | JMP
 *        1        1        1        | NOP
 *
 * Which gives the following functions:
 *
 *   dynamic: instruction = enabled ^ branch
 *   static:  instruction = type ^ branch
 *
 * See jump_label_type() / jump_label_init_type().
 */

/*
 * Test key @x with the hint that the result is usually true. The wrapper
 * type of @x selects the arch primitive at compile time:
 * struct static_key_true uses arch_static_branch(), struct static_key_false
 * uses arch_static_branch_jump(); any other type ends up calling
 * ____wrong_branch_error().
 */
#define static_branch_likely(x)                                                        \
({                                                                                \
        bool branch;                                                                \
        if (__builtin_types_compatible_p(typeof(*x), struct static_key_true))        \
                branch = !arch_static_branch(&(x)->key, true);                        \
        else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
                branch = !arch_static_branch_jump(&(x)->key, true);                \
        else                                                                        \
                branch = ____wrong_branch_error();                                \
        likely_notrace(branch);                                                                \
})

/*
 * Test key @x with the hint that the result is usually false. The wrapper
 * type of @x selects the arch primitive at compile time:
 * struct static_key_true uses arch_static_branch_jump(),
 * struct static_key_false uses arch_static_branch(); any other type ends up
 * calling ____wrong_branch_error().
 */
#define static_branch_unlikely(x)                                                \
({                                                                                \
        bool branch;                                                                \
        if (__builtin_types_compatible_p(typeof(*x), struct static_key_true))        \
                branch = arch_static_branch_jump(&(x)->key, false);                \
        else if (__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
                branch = arch_static_branch(&(x)->key, false);                        \
        else                                                                        \
                branch = ____wrong_branch_error();                                \
        unlikely_notrace(branch);                                                        \
})

#else /* !CONFIG_JUMP_LABEL */

/*
 * Fallback (no CONFIG_JUMP_LABEL): an ordinary test of the key's count,
 * wrapped in the matching likely/unlikely hint.
 */
#define static_branch_likely(x)                likely_notrace(static_key_enabled(&(x)->key))
#define static_branch_unlikely(x)        unlikely_notrace(static_key_enabled(&(x)->key))

#endif /* CONFIG_JUMP_LABEL */

/*
 * Use static_branch_likely(x) when @config is enabled at build time and
 * static_branch_unlikely(x) otherwise.
 */
#define static_branch_maybe(config, x)                                        \
        (IS_ENABLED(config) ? static_branch_likely(x)                        \
                            : static_branch_unlikely(x))

/*
 * Advanced usage; refcount, branch is enabled when: count != 0
 */

/* Each wrapper forwards to the static_key_slow_*() call on the embedded key. */
#define static_branch_inc(x)                static_key_slow_inc(&(x)->key)
#define static_branch_dec(x)                static_key_slow_dec(&(x)->key)
#define static_branch_inc_cpuslocked(x)        static_key_slow_inc_cpuslocked(&(x)->key)
#define static_branch_dec_cpuslocked(x)        static_key_slow_dec_cpuslocked(&(x)->key)

/*
 * Normal usage; boolean enable/disable.
 */

/* Each wrapper forwards to the static_key_{enable,disable}*() call on the embedded key. */
#define static_branch_enable(x)                        static_key_enable(&(x)->key)
#define static_branch_disable(x)                static_key_disable(&(x)->key)
#define static_branch_enable_cpuslocked(x)        static_key_enable_cpuslocked(&(x)->key)
#define static_branch_disable_cpuslocked(x)        static_key_disable_cpuslocked(&(x)->key)

#endif /* __ASSEMBLY__ */

#endif        /* _LINUX_JUMP_LABEL_H */





































    2 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2023 Western Digital Corporation or its affiliates.
 */

#ifndef BTRFS_RAID_STRIPE_TREE_H
#define BTRFS_RAID_STRIPE_TREE_H

#include <linux/types.h>
#include <uapi/linux/btrfs_tree.h>
#include "fs.h"

/*
 * Block group profile bits tested by btrfs_need_stripe_tree_update():
 * DUP, the RAID1 family, RAID0 and RAID10.
 */
#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK    (BTRFS_BLOCK_GROUP_DUP |                \
                                            BTRFS_BLOCK_GROUP_RAID1_MASK |        \
                                            BTRFS_BLOCK_GROUP_RAID0 |                \
                                            BTRFS_BLOCK_GROUP_RAID10)

struct btrfs_io_context;
struct btrfs_io_stripe;
struct btrfs_fs_info;
struct btrfs_ordered_extent;
struct btrfs_trans_handle;

int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length);
int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
                                 u64 logical, u64 *length, u64 map_type,
                                 u32 stripe_index, struct btrfs_io_stripe *stripe);
int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_ordered_extent *ordered_extent);

/*
 * Decide whether a mapping with flags @map_type requires a stripe tree
 * update. All three conditions must hold:
 *  - the filesystem has the RAID_STRIPE_TREE incompat feature,
 *  - the block group type bits are exactly BTRFS_BLOCK_GROUP_DATA,
 *  - the profile bits intersect BTRFS_RST_SUPP_BLOCK_GROUP_MASK.
 */
static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
                                                 u64 map_type)
{
        const u64 bg_type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK;
        const u64 bg_profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK;

        if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE))
                return false;

        if (bg_type != BTRFS_BLOCK_GROUP_DATA)
                return false;

        return (bg_profile & BTRFS_RST_SUPP_BLOCK_GROUP_MASK) != 0;
}

/*
 * Number of struct btrfs_raid_stride entries that fit in an item of
 * @item_size bytes once the part of struct btrfs_stripe_extent preceding
 * 'strides' is subtracted.
 */
static inline int btrfs_num_raid_stripes(u32 item_size)
{
        const size_t header_len = offsetof(struct btrfs_stripe_extent, strides);
        const size_t stride_len = sizeof(struct btrfs_raid_stride);

        return (item_size - header_len) / stride_len;
}

#endif





































   11 


















    4 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PKRU_H
#define _ASM_X86_PKRU_H

#include <asm/cpufeature.h>

/*
 * PKRU layout used by the helpers in this header: each protection key owns
 * PKRU_BITS_PER_PKEY consecutive bits; within that field bit 0 is
 * access-disable (AD) and bit 1 is, presumably, write-disable (WD).
 */
#define PKRU_AD_BIT 0x1u
#define PKRU_WD_BIT 0x2u
#define PKRU_BITS_PER_PKEY 2

/*
 * With CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS the initial PKRU value is the
 * external variable init_pkru_value, read through READ_ONCE(). Without it,
 * both the variable name and the getter collapse to the constant 0.
 */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
extern u32 init_pkru_value;
#define pkru_get_init_value()        READ_ONCE(init_pkru_value)
#else
#define init_pkru_value        0
#define pkru_get_init_value()        0
#endif

/*
 * Return true when the access-disable bit for @pkey is clear in @pkru.
 */
static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
{
        int shift = pkey * PKRU_BITS_PER_PKEY;
        u32 ad_mask = PKRU_AD_BIT << shift;

        return (pkru & ad_mask) == 0;
}

/*
 * Return true when neither the access-disable nor the write-disable bit for
 * @pkey is set in @pkru.
 */
static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
{
        int shift = pkey * PKRU_BITS_PER_PKEY;
        /*
         * Access-disable disables writes too so we need to check
         * both bits here.
         */
        u32 deny_mask = (PKRU_AD_BIT|PKRU_WD_BIT) << shift;

        return (pkru & deny_mask) == 0;
}

/*
 * Return the value from rdpkru() when X86_FEATURE_OSPKE is enabled,
 * otherwise 0.
 */
static inline u32 read_pkru(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return 0;

        return rdpkru();
}

/*
 * Write @pkru with wrpkru(), but only when X86_FEATURE_OSPKE is enabled and
 * the value currently returned by rdpkru() differs from @pkru.
 */
static inline void write_pkru(u32 pkru)
{
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /*
                 * WRPKRU is relatively expensive compared to RDPKRU.
                 * Avoid WRPKRU when it would not change the value.
                 */
                if (rdpkru() != pkru)
                        wrpkru(pkru);
        }
}

/*
 * Load pkru_get_init_value() with wrpkru() when X86_FEATURE_OSPKE is
 * enabled; otherwise do nothing. Unlike write_pkru(), the current register
 * value is not compared first.
 */
static inline void pkru_write_default(void)
{
        if (cpu_feature_enabled(X86_FEATURE_OSPKE))
                wrpkru(pkru_get_init_value());
}

#endif


































































































































































































































































































































































































































































































































































































































































































    1 








































































































































































































































































































































































































































    1 
    1 
    1 








    1 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 */

// clang-format off
#ifndef _LINUX_NTFS3_NTFS_FS_H
#define _LINUX_NTFS3_NTFS_FS_H

#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/time64.h>
#include <linux/types.h>
#include <linux/uidgid.h>
#include <asm/div64.h>
#include <asm/page.h>

#include "debug.h"
#include "ntfs.h"

struct dentry;
struct fiemap_extent_info;
struct user_namespace;
struct page;
struct writeback_control;
enum utf16_endian;


/* All-ones size_t value. */
#define MINUS_ONE_T                        ((size_t)(-1))
/* Biggest MFT / smallest cluster */
#define MAXIMUM_BYTES_PER_MFT                4096
/* 1 << 12 == 4096 */
#define MAXIMUM_SHIFT_BYTES_PER_MFT        12
/* 4096 / 512 == 8 */
#define NTFS_BLOCKS_PER_MFT_RECORD        (MAXIMUM_BYTES_PER_MFT / 512)

/* Same limits for index buffers. */
#define MAXIMUM_BYTES_PER_INDEX                4096
/* 1 << 12 == 4096 */
#define MAXIMUM_SHIFT_BYTES_PER_INDEX        12
/* 4096 / 512 == 8 */
#define NTFS_BLOCKS_PER_INODE                (MAXIMUM_BYTES_PER_INDEX / 512)

/* NTFS specific error code when fixup failed. */
#define E_NTFS_FIXUP                        555
/* NTFS specific error code about resident->nonresident. */
#define E_NTFS_NONRESIDENT                556
/* NTFS specific error code about punch hole. */
#define E_NTFS_NOTALIGNED                557
/* NTFS specific error code when on-disk struct is corrupted. */
#define E_NTFS_CORRUPT                        558


/* sbi->flags */
#define NTFS_FLAGS_NODISCARD                0x00000001
/*
 * ntfs in shutdown state.
 * NOTE(review): the "_BIT" suffix and the "== 4" remark suggest this value
 * is a bit number (1 << 2 == 4) rather than a mask like its neighbours -
 * confirm against the users of this constant.
 */
#define NTFS_FLAGS_SHUTDOWN_BIT                0x00000002  /* mask == 4 */
/* Set when LogFile is replaying. */
#define NTFS_FLAGS_LOG_REPLAYING        0x00000008
/* Set when we changed first MFT's which copy must be updated in $MftMirr. */
#define NTFS_FLAGS_MFTMIRR                0x00001000
#define NTFS_FLAGS_NEED_REPLAY                0x04000000


/* Values for ntfs_inode::ni_flags (ni->ni_flags). */
/*
 * The low four bits hold the external compression format (LZX/Xpress)
 * of the data attribute:
 * 1 - WOF_COMPRESSION_XPRESS4K
 * 2 - WOF_COMPRESSION_XPRESS8K
 * 3 - WOF_COMPRESSION_XPRESS16K
 * 4 - WOF_COMPRESSION_LZX32K
 */
#define NI_FLAG_COMPRESSED_MASK                0x0000000f
/* Data attribute is deduplicated. */
#define NI_FLAG_DEDUPLICATED                0x00000010
#define NI_FLAG_EA                        0x00000020
#define NI_FLAG_DIR                        0x00000040
#define NI_FLAG_RESIDENT                0x00000080
#define NI_FLAG_UPDATE_PARENT                0x00000100
// clang-format on

/*
 * Mount options of an ntfs volume.
 * ntfs_sb_info::options points to an instance of this structure.
 */
struct ntfs_mount_options {
        char *nls_name;
        struct nls_table *nls;

        kuid_t fs_uid;
        kgid_t fs_gid;
        /* NOTE(review): presumably the inverted fmask/dmask values - confirm. */
        u16 fs_fmask_inv;
        u16 fs_dmask_inv;

        unsigned fmask : 1; /* fmask was set. */
        unsigned dmask : 1; /* dmask was set. */
        unsigned sys_immutable : 1; /* Immutable system files. */
        unsigned discard : 1; /* Issue discard requests on deletions. */
        unsigned sparse : 1; /* Create sparse files. */
        unsigned showmeta : 1; /* Show meta files. */
        unsigned nohidden : 1; /* Do not show hidden files. */
        unsigned hide_dot_files : 1; /* Set hidden flag on dot files. */
        unsigned windows_names : 1; /* Disallow names forbidden by Windows. */
        unsigned force : 1; /* RW mount dirty volume. */
        unsigned prealloc : 1; /* Preallocate space when file is growing. */
        unsigned nocase : 1; /* Case insensitive. */
};

/*
 * Special value to unpack and deallocate.
 * This is the integer 1 cast to a 'struct runs_tree *'; it is a sentinel,
 * not a pointer to a real runs_tree, and must never be dereferenced.
 */
#define RUN_DEALLOCATE ((struct runs_tree *)(size_t)1)

/*
 * Array-backed storage of 'struct ntfs_run' entries.
 * TODO: Use rb tree instead of array.
 */
struct runs_tree {
        struct ntfs_run *runs;
        size_t count; /* Currently used size of the ntfs_run storage. */
        size_t allocated; /* Currently allocated ntfs_run storage size. */
};

/* A set of buffer heads together with byte/offset bookkeeping. */
struct ntfs_buffers {
        /* Biggest MFT / smallest cluster = 4096 / 512 = 8 */
        /* Biggest index / smallest cluster = 4096 / 512 = 8 */
        /* The array itself is sized by PAGE_SIZE >> SECTOR_SHIFT. */
        struct buffer_head *bh[PAGE_SIZE >> SECTOR_SHIFT];
        u32 bytes;
        u32 nbufs; /* NOTE(review): presumably the number of used entries in bh[] - confirm. */
        u32 off;
};

/* Cluster allocation options (see attr_allocate_clusters, ntfs_look_for_free_space). */
enum ALLOCATE_OPT {
        ALLOCATE_DEF = 0, // Allocate all clusters.
        ALLOCATE_MFT = 1, // Allocate for MFT.
        ALLOCATE_ZERO = 2, // Zero out newly allocated clusters.
};

/*
 * Classes of bitmap locks: one for the cluster bitmap, one for the MFT bitmap.
 * NOTE(review): presumably used as lock nesting subclasses - confirm.
 */
enum bitmap_mutex_classes {
        BITMAP_MUTEX_CLUSTERS = 0,
        BITMAP_MUTEX_MFT = 1,
};

/*
 * Bitmap split into windows, with per-window free-bit counters and
 * optional trees of free extents.
 */
struct wnd_bitmap {
        struct super_block *sb;
        struct rw_semaphore rw_lock;

        struct runs_tree run;
        size_t nbits;

        size_t total_zeroes; // Total number of free bits.
        u16 *free_bits; // Free bits in each window.
        size_t nwnd;
        u32 bits_last; // Bits in last window.

        struct rb_root start_tree; // Extents, sorted by 'start'.
        struct rb_root count_tree; // Extents, sorted by 'count + start'.
        size_t count; // Extents count.

        /*
         * State of the extent trees:
         * -1 - Tree is activated but not updated (too many fragments).
         *  0 - Tree is not activated.
         *  1 - Tree is activated and updated.
         */
        int uptodated;
        size_t extent_min; // Minimal extent used while building.
        size_t extent_max; // Upper estimate of biggest free block.

        /* Zone: half-open bit range [zone_bit, zone_end). */
        size_t zone_bit;
        size_t zone_end;

        bool inited;
};

/*
 * Key comparison callback (stored in ntfs_index::cmp).
 * Receives two keys with their lengths plus an opaque 'param'.
 * NOTE(review): presumably returns <0 / 0 / >0 like memcmp - confirm.
 */
typedef int (*NTFS_CMP_FUNC)(const void *key1, size_t len1, const void *key2,
                             size_t len2, const void *param);

/*
 * Index kinds. The value is kept in ntfs_index::type and passed to
 * indx_init(). INDEX_MUTEX_TOTAL is the number of kinds.
 */
enum index_mutex_classed {
        INDEX_MUTEX_I30 = 0,
        INDEX_MUTEX_SII = 1,
        INDEX_MUTEX_SDH = 2,
        INDEX_MUTEX_SO = 3,
        INDEX_MUTEX_SQ = 4,
        INDEX_MUTEX_SR = 5,
        INDEX_MUTEX_TOTAL
};

/* ntfs_index - Allocation unit inside directory. */
struct ntfs_index {
        struct runs_tree bitmap_run;
        struct runs_tree alloc_run;
        /* Guards read/write access to 'bitmap_run'/'alloc_run' during ntfs_readdir. */
        struct rw_semaphore run_lock;

        /* Key comparison callback. TODO: Remove 'cmp'. */
        NTFS_CMP_FUNC cmp;

        u8 index_bits; // log2(root->index_block_size)
        u8 idx2vbn_bits; // log2(root->index_block_clst)
        u8 vbn2vbo_bits; // index_block_size < cluster? 9 : cluster_bits
        u8 type; // One of enum index_mutex_classed.
};

/* Minimum MFT zone. NOTE(review): unit (clusters vs. records) is not visible here - confirm. */
#define NTFS_MIN_MFT_ZONE 100
/* Step to increase the MFT. NOTE(review): presumably in MFT records - confirm. */
#define NTFS_MFT_INCREASE_STEP 1024

/*
 * Ntfs file system in-core superblock data.
 *
 * Groups volume geometry, state flags, and the per-volume metadata objects
 * ($MFT, cluster bitmap, volume info, security, reparse, objid) together
 * with decompressor state and the mount options.
 */
struct ntfs_sb_info {
        struct super_block *sb;

        u32 discard_granularity;
        u64 discard_granularity_mask_inv; // ~(discard_granularity - 1)

        u32 cluster_size; // bytes per cluster
        u32 cluster_mask; // == cluster_size - 1
        u64 cluster_mask_inv; // ~(cluster_size - 1)
        u32 block_mask; // sb->s_blocksize - 1
        u32 blocks_per_cluster; // cluster_size / sb->s_blocksize

        u32 record_size;
        u32 index_size;

        u8 cluster_bits;
        u8 record_bits;

        u64 maxbytes; // Maximum size for normal files.
        u64 maxbytes_sparse; // Maximum size for sparse file.

        unsigned long flags; // See NTFS_FLAGS_

        CLST zone_max; // Maximum MFT zone length in clusters
        CLST bad_clusters; // The count of marked bad clusters.

        u16 max_bytes_per_attr; // Maximum attribute size in record.
        u16 attr_size_tr; // Attribute size threshold (320 bytes).

        /* Records in $Extend. */
        CLST objid_no;
        CLST quota_no;
        CLST reparse_no;
        CLST usn_jrnl_no;

        struct ATTR_DEF_ENTRY *def_table; // Attribute definition table.
        u32 def_entries;
        u32 ea_max_size;

        struct MFT_REC *new_rec;

        u16 *upcase;

        /* State of $MFT. */
        struct {
                u64 lbo, lbo2;
                struct ntfs_inode *ni;
                struct wnd_bitmap bitmap; // $MFT::Bitmap
                /*
                 * MFT records [11-24) are used to expand MFT itself.
                 * They are always marked as used in $MFT::Bitmap;
                 * 'reserved_bitmap' contains the real bitmap of these records.
                 */
                ulong reserved_bitmap; // Bitmap of used records [11 - 24)
                size_t next_free; // The next record to allocate from
                size_t used; // MFT valid size in records.
                u32 recs_mirr; // Number of records in MFTMirr
                u8 next_reserved;
                u8 reserved_bitmap_inited;
        } mft;

        /* Cluster usage. */
        struct {
                struct wnd_bitmap bitmap; // $Bitmap::Data
                CLST next_free_lcn;
        } used;

        /* Volume information. */
        struct {
                u64 size; // In bytes.
                u64 blocks; // In blocks.
                u64 ser_num;
                struct ntfs_inode *ni;
                __le16 flags; // Cached current VOLUME_INFO::flags, VOLUME_FLAG_DIRTY.
                u8 major_ver;
                u8 minor_ver;
                char label[256];
                bool real_dirty; // Real fs state.
        } volume;

        /* Security descriptors storage and its two indexes. */
        struct {
                struct ntfs_index index_sii;
                struct ntfs_index index_sdh;
                struct ntfs_inode *ni;
                u32 next_id;
                u64 next_off;
                __le32 def_security_id;
        } security;

        /* Reparse point index. */
        struct {
                struct ntfs_index index_r;
                struct ntfs_inode *ni;
                u64 max_size; // 16K
        } reparse;

        /* Object id index. */
        struct {
                struct ntfs_index index_o;
                struct ntfs_inode *ni;
        } objid;

        /* Decompressor contexts, each paired with its own mutex. */
        struct {
                struct mutex mtx_lznt;
                struct lznt *lznt;
#ifdef CONFIG_NTFS3_LZX_XPRESS
                struct mutex mtx_xpress;
                struct xpress_decompressor *xpress;
                struct mutex mtx_lzx;
                struct lzx_decompressor *lzx;
#endif
        } compress;

        struct ntfs_mount_options *options;
        struct ratelimit_state msg_ratelimit;
        struct proc_dir_entry *procdir;
};

/* One MFT record (usually 1024 bytes); consists of attributes. */
struct mft_inode {
        struct rb_node node; // NOTE(review): presumably the link into ntfs_inode::mi_tree - confirm.
        struct ntfs_sb_info *sbi;

        struct MFT_REC *mrec;
        struct ntfs_buffers nb;

        CLST rno; // Record number (compared with MFT_REF in mi_is_ref()).
        bool dirty;
};

/*
 * Nested class for ntfs_inode::ni_lock.
 * NOTE(review): the enumerator order presumably defines the allowed
 * nesting order - confirm against the lock helpers.
 */
enum ntfs_inode_mutex_lock_class {
        NTFS_INODE_MUTEX_DIRTY,
        NTFS_INODE_MUTEX_SECURITY,
        NTFS_INODE_MUTEX_OBJID,
        NTFS_INODE_MUTEX_REPARSE,
        NTFS_INODE_MUTEX_NORMAL,
        NTFS_INODE_MUTEX_PARENT,
        NTFS_INODE_MUTEX_PARENT2,
};

/*
 * struct ntfs_inode
 *
 * Ntfs inode - extends the linux inode (embedded as 'vfs_inode').
 * Consists of one or more MFT inodes.
 */
struct ntfs_inode {
        struct mft_inode mi; // base record

        /*
         * Valid size: the range [0 - i_valid) of the file contains valid data.
         * The range [i_valid - inode->i_size) contains zeroes.
         * Usually i_valid <= inode->i_size.
         */
        u64 i_valid;
        struct timespec64 i_crtime;

        struct mutex ni_lock;

        /* File attributes from std. */
        enum FILE_ATTRIBUTE std_fa;
        __le32 std_security_id;

        /*
         * Tree of mft_inode.
         * Not empty when the primary MFT record (usually 1024 bytes) can't hold
         * all attributes, e.g. the file becomes too fragmented or contains a
         * lot of names.
         */
        struct rb_root mi_tree;

        /*
         * This member is used in ntfs_readdir to ensure that all subrecords are loaded.
         */
        u8 mi_loaded;

        /* Directory index state or regular-file run state. */
        union {
                struct ntfs_index dir;
                struct {
                        struct rw_semaphore run_lock;
                        struct runs_tree run;
#ifdef CONFIG_NTFS3_LZX_XPRESS
                        struct page *offs_page;
#endif
                } file;
        };

        /* In-memory copy of the attribute list. */
        struct {
                struct runs_tree run;
                struct ATTR_LIST_ENTRY *le; // 1K aligned memory.
                size_t size;
                bool dirty;
        } attr_list;

        size_t ni_flags; // NI_FLAG_XXX

        struct inode vfs_inode;
};

/* One index buffer together with the buffers that back it. */
struct indx_node {
        struct ntfs_buffers nb;
        struct INDEX_BUFFER *index;
};

/*
 * Index lookup context. Allocated by fnd_get() and released by fnd_put().
 * Holds up to 20 levels of nodes and entries.
 * NOTE(review): 'level' presumably counts the used slots of nodes[]/de[] - confirm.
 */
struct ntfs_fnd {
        int level;
        struct indx_node *nodes[20];
        struct NTFS_DE *de[20];
        struct NTFS_DE *root_de;
};

/* Classification of a reparse point; returned by ni_parse_reparse(). */
enum REPARSE_SIGN {
        REPARSE_NONE = 0,
        REPARSE_COMPRESSED = 1,
        REPARSE_DEDUPLICATED = 2,
        REPARSE_LINK = 3
};

/*
 * Functions from attrib.c
 *
 * Declarations only; the implementations live outside this header.
 */
int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
                           CLST vcn, CLST lcn, CLST len, CLST *pre_alloc,
                           enum ALLOCATE_OPT opt, CLST *alen, const size_t fr,
                           CLST *new_lcn, CLST *new_len);
int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
                          struct ATTR_LIST_ENTRY *le, struct mft_inode *mi,
                          u64 new_size, struct runs_tree *run,
                          struct ATTRIB **ins_attr, struct page *page);
int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type,
                  const __le16 *name, u8 name_len, struct runs_tree *run,
                  u64 new_size, const u64 *new_valid, bool keep_prealloc,
                  struct ATTRIB **ret);
int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
                        CLST *len, bool *new, bool zero);
int attr_data_read_resident(struct ntfs_inode *ni, struct page *page);
int attr_data_write_resident(struct ntfs_inode *ni, struct page *page);
int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type,
                       const __le16 *name, u8 name_len, struct runs_tree *run,
                       CLST vcn);
int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type,
                         const __le16 *name, u8 name_len, struct runs_tree *run,
                         u64 from, u64 to);
int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr,
                        struct runs_tree *run, u64 frame, u64 frames,
                        u8 frame_bits, u32 *ondisk_size, u64 *vbo_data);
int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr,
                             CLST frame, CLST *clst_data);
int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size,
                        u64 new_valid);
int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size);
int attr_force_nonresident(struct ntfs_inode *ni);

/*
 * Functions from attrlist.c
 *
 * Operate on the attribute list kept in ntfs_inode::attr_list.
 * Declarations only; the implementations live outside this header.
 */
void al_destroy(struct ntfs_inode *ni);
bool al_verify(struct ntfs_inode *ni);
int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr);
struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
                                     struct ATTR_LIST_ENTRY *le);
struct ATTR_LIST_ENTRY *al_find_le(struct ntfs_inode *ni,
                                   struct ATTR_LIST_ENTRY *le,
                                   const struct ATTRIB *attr);
struct ATTR_LIST_ENTRY *al_find_ex(struct ntfs_inode *ni,
                                   struct ATTR_LIST_ENTRY *le,
                                   enum ATTR_TYPE type, const __le16 *name,
                                   u8 name_len, const CLST *vcn);
int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name,
              u8 name_len, CLST svcn, __le16 id, const struct MFT_REF *ref,
              struct ATTR_LIST_ENTRY **new_le);
bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le);
bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn,
                  const __le16 *name, u8 name_len, const struct MFT_REF *ref);
int al_update(struct ntfs_inode *ni, int sync);
/*
 * al_aligned - Round @size up to the next multiple of 1024 bytes.
 *
 * The addition goes through size_add() rather than a plain '+'.
 */
static inline size_t al_aligned(size_t size)
{
        const size_t align_mask = 1023;
        size_t padded = size_add(size, align_mask);

        return padded & ~align_mask;
}

/*
 * Globals from bitfunc.c
 *
 * Helpers that inspect @nbits bits of @map starting at bit @bit.
 */
bool are_bits_clear(const void *map, size_t bit, size_t nbits);
bool are_bits_set(const void *map, size_t bit, size_t nbits);
size_t get_set_bits_ex(const void *map, size_t bit, size_t nbits);

/*
 * Globals from dir.c
 *
 * Name conversion between UTF-16 and the mounted NLS, directory lookup,
 * and the directory file_operations tables.
 */
int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len,
                      u8 *buf, int buf_len);
int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len,
                      struct cpu_str *uni, u32 max_ulen,
                      enum utf16_endian endian);
struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni,
                           struct ntfs_fnd *fnd);
bool dir_is_empty(struct inode *dir);
extern const struct file_operations ntfs_dir_operations;
extern const struct file_operations ntfs_legacy_dir_operations;

/*
 * Globals from file.c
 *
 * File-level entry points and the inode/file operations tables.
 */
int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path,
                 struct kstat *stat, u32 request_mask, u32 flags);
int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                  struct iattr *attr);
int ntfs_file_open(struct inode *inode, struct file *file);
int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len);
long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg);
long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg);
/* NOTE(review): also declared in the "Globals from name_i.c" group below. */
extern const struct inode_operations ntfs_special_inode_operations;
extern const struct inode_operations ntfs_file_inode_operations;
extern const struct file_operations ntfs_file_operations;
extern const struct file_operations ntfs_legacy_file_operations;

/*
 * Globals from frecord.c
 *
 * Operations on a whole ntfs_inode (base record plus subrecords).
 * Declarations only; the implementations live outside this header.
 */
void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi);
struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni);
struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni);
void ni_clear(struct ntfs_inode *ni);
int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi);
int ni_load_mi(struct ntfs_inode *ni, const struct ATTR_LIST_ENTRY *le,
               struct mft_inode **mi);
struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr,
                            struct ATTR_LIST_ENTRY **entry_o,
                            enum ATTR_TYPE type, const __le16 *name,
                            u8 name_len, const CLST *vcn,
                            struct mft_inode **mi);
struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr,
                               struct ATTR_LIST_ENTRY **le,
                               struct mft_inode **mi);
struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
                            const __le16 *name, u8 name_len, CLST vcn,
                            struct mft_inode **pmi);
int ni_load_all_mi(struct ntfs_inode *ni);
bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi);
int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
                   const __le16 *name, u8 name_len, bool base_only,
                   const __le16 *id);
int ni_create_attr_list(struct ntfs_inode *ni);
int ni_expand_list(struct ntfs_inode *ni);
int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
                          const __le16 *name, u8 name_len,
                          const struct runs_tree *run, CLST svcn, CLST len,
                          __le16 flags, struct ATTRIB **new_attr,
                          struct mft_inode **mi, struct ATTR_LIST_ENTRY **le);
int ni_insert_resident(struct ntfs_inode *ni, u32 data_size,
                       enum ATTR_TYPE type, const __le16 *name, u8 name_len,
                       struct ATTRIB **new_attr, struct mft_inode **mi,
                       struct ATTR_LIST_ENTRY **le);
void ni_remove_attr_le(struct ntfs_inode *ni, struct ATTRIB *attr,
                       struct mft_inode *mi, struct ATTR_LIST_ENTRY *le);
int ni_delete_all(struct ntfs_inode *ni);
struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni,
                                     const struct le_str *uni,
                                     const struct MFT_REF *home,
                                     struct mft_inode **mi,
                                     struct ATTR_LIST_ENTRY **entry);
struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type,
                                     struct mft_inode **mi,
                                     struct ATTR_LIST_ENTRY **entry);
int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa);
enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr,
                                   struct REPARSE_DATA_BUFFER *buffer);
int ni_write_inode(struct inode *inode, int sync, const char *hint);
/* Convenience wrapper: passes the calling function's name as 'hint'. */
#define _ni_write_inode(i, w) ni_write_inode(i, w, __func__)
int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo,
              __u64 vbo, __u64 len);
int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page);
int ni_decompress_file(struct ntfs_inode *ni);
int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
                  u32 pages_per_frame);
int ni_write_frame(struct ntfs_inode *ni, struct page **pages,
                   u32 pages_per_frame);
int ni_remove_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
                   struct NTFS_DE *de, struct NTFS_DE **de2, int *undo_step);

/* Takes the 'de2' and 'undo_step' values that ni_remove_name() hands back. */
bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
                         struct NTFS_DE *de, struct NTFS_DE *de2,
                         int undo_step);

int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
                struct NTFS_DE *de);

int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
              struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de,
              bool *is_bad);

bool ni_is_dirty(struct inode *inode);

/*
 * Globals from fslog.c
 *
 * Index header validation and log replay.
 */
bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes);
int log_replay(struct ntfs_inode *ni, bool *initialized);

/*
 * Globals from fsntfs.c
 *
 * Volume-level helpers: record fixups, free-space and MFT record
 * management, raw/run based I/O, security, reparse and objid support.
 * Declarations only; the implementations live outside this header.
 */
struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block);
bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes);
int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes,
                       bool simple);
int ntfs_extend_init(struct ntfs_sb_info *sbi);
int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi);
int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
                             CLST *new_lcn, CLST *new_len,
                             enum ALLOCATE_OPT opt);
bool ntfs_check_for_free_space(struct ntfs_sb_info *sbi, CLST clen, CLST mlen);
int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft,
                       struct ntfs_inode *ni, struct mft_inode **mi);
void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft);
int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to);
int ntfs_refresh_zone(struct ntfs_sb_info *sbi);
void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait);
void ntfs_bad_inode(struct inode *inode, const char *hint);
/* Convenience wrapper: passes the calling function's name as 'hint'. */
#define _ntfs_bad_inode(i) ntfs_bad_inode(i, __func__)
/* Volume state values accepted by ntfs_set_state(). */
enum NTFS_DIRTY_FLAGS {
        NTFS_DIRTY_CLEAR = 0,
        NTFS_DIRTY_DIRTY = 1,
        NTFS_DIRTY_ERROR = 2,
};
int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty);
int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer);
int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes,
                  const void *buffer, int wait);
int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run,
                      u64 vbo, const void *buf, size_t bytes, int sync);
struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi,
                                   const struct runs_tree *run, u64 vbo);
int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run,
                     u64 vbo, void *buf, u32 bytes, struct ntfs_buffers *nb);
int ntfs_read_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo,
                 struct NTFS_RECORD_HEADER *rhdr, u32 bytes,
                 struct ntfs_buffers *nb);
int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo,
                u32 bytes, struct ntfs_buffers *nb);
int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr,
                  struct ntfs_buffers *nb, int sync);
int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run,
                   struct page **pages, u32 nr_pages, u64 vbo, u32 bytes,
                   enum req_op op);
int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run);
int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run,
                    u64 vbo, u64 *lbo, u64 *bytes);
struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST nRec,
                                  enum RECORD_FLAG flag);
/* Default security descriptor blob, 0x50 bytes long. */
extern const u8 s_default_security[0x50];
bool is_sd_valid(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 len);
int ntfs_security_init(struct ntfs_sb_info *sbi);
int ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id,
                            struct SECURITY_DESCRIPTOR_RELATIVE **sd,
                            size_t *size);
int ntfs_insert_security(struct ntfs_sb_info *sbi,
                         const struct SECURITY_DESCRIPTOR_RELATIVE *sd,
                         u32 size, __le32 *security_id, bool *inserted);
int ntfs_reparse_init(struct ntfs_sb_info *sbi);
int ntfs_objid_init(struct ntfs_sb_info *sbi);
int ntfs_objid_remove(struct ntfs_sb_info *sbi, struct GUID *guid);
int ntfs_insert_reparse(struct ntfs_sb_info *sbi, __le32 rtag,
                        const struct MFT_REF *ref);
int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag,
                        const struct MFT_REF *ref);
void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim);
int run_deallocate(struct ntfs_sb_info *sbi, const struct runs_tree *run,
                   bool trim);
bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *name);
int ntfs_set_label(struct ntfs_sb_info *sbi, u8 *label, int len);

/*
 * Globals from index.c
 *
 * fnd_clear() is also called by the inline fnd_put() before the
 * context is freed.
 */
int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit);
void fnd_clear(struct ntfs_fnd *fnd);
/*
 * fnd_get - Allocate a zero-filled index lookup context.
 *
 * Return: the new context, or NULL when the GFP_NOFS allocation fails.
 * Release it with fnd_put().
 */
static inline struct ntfs_fnd *fnd_get(void)
{
        struct ntfs_fnd *ctx;

        ctx = kzalloc(sizeof(struct ntfs_fnd), GFP_NOFS);
        return ctx;
}
/*
 * fnd_put - Release a lookup context obtained from fnd_get().
 *
 * A NULL @fnd is accepted and ignored. Otherwise the context is cleared
 * with fnd_clear() and then freed.
 */
static inline void fnd_put(struct ntfs_fnd *fnd)
{
        if (!fnd)
                return;

        fnd_clear(fnd);
        kfree(fnd);
}
/*
 * Index initialisation, lookup, insertion and deletion (index.c).
 * Declarations only; the implementations live outside this header.
 */
void indx_clear(struct ntfs_index *idx);
int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi,
              const struct ATTRIB *attr, enum index_mutex_classed type);
struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni,
                                 struct ATTRIB **attr, struct mft_inode **mi);
int indx_read(struct ntfs_index *idx, struct ntfs_inode *ni, CLST vbn,
              struct indx_node **node);
int indx_find(struct ntfs_index *indx, struct ntfs_inode *dir,
              const struct INDEX_ROOT *root, const void *Key, size_t KeyLen,
              const void *param, int *diff, struct NTFS_DE **entry,
              struct ntfs_fnd *fnd);
int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni,
                   const struct INDEX_ROOT *root, struct NTFS_DE **entry,
                   struct ntfs_fnd *fnd);
int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni,
                  const struct INDEX_ROOT *root, struct NTFS_DE **entry,
                  size_t *off, struct ntfs_fnd *fnd);
int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
                      const struct NTFS_DE *new_de, const void *param,
                      struct ntfs_fnd *fnd, bool undo);
int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
                      const void *key, u32 key_len, const void *param);
int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi,
                    const struct ATTR_FILE_NAME *fname,
                    const struct NTFS_DUP_INFO *dup, int sync);

/*
 * Globals from inode.c
 *
 * Inode lifetime, size changes, block mapping and write paths, plus the
 * symlink inode operations and the address space operations tables.
 */
struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
                         const struct cpu_str *name);
int ntfs_set_size(struct inode *inode, u64 new_size);
int reset_log_file(struct inode *inode);
int ntfs_get_block(struct inode *inode, sector_t vbn,
                   struct buffer_head *bh_result, int create);
int ntfs_write_begin(struct file *file, struct address_space *mapping,
                     loff_t pos, u32 len, struct page **pagep, void **fsdata);
int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
                   u32 len, u32 copied, struct page *page, void *fsdata);
int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc);
int ntfs_sync_inode(struct inode *inode);
int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
                      struct inode *i2);
int inode_write_data(struct inode *inode, const void *data, size_t bytes);
int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
                      struct dentry *dentry, const struct cpu_str *uni,
                      umode_t mode, dev_t dev, const char *symname, u32 size,
                      struct ntfs_fnd *fnd);
int ntfs_link_inode(struct inode *inode, struct dentry *dentry);
int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry);
void ntfs_evict_inode(struct inode *inode);
extern const struct inode_operations ntfs_link_inode_operations;
extern const struct address_space_operations ntfs_aops;
extern const struct address_space_operations ntfs_aops_cmpr;

/*
 * Globals from name_i.c
 *
 * Directory entry construction, parent lookup and the directory
 * inode/dentry operations tables.
 */
int fill_name_de(struct ntfs_sb_info *sbi, void *buf, const struct qstr *name,
                 const struct cpu_str *uni);
struct dentry *ntfs3_get_parent(struct dentry *child);

extern const struct inode_operations ntfs_dir_inode_operations;
/*
 * NOTE(review): ntfs_special_inode_operations is also declared in the
 * "Globals from file.c" group above; one of the two declarations is redundant.
 */
extern const struct inode_operations ntfs_special_inode_operations;
extern const struct dentry_operations ntfs_dentry_ops;

/*
 * Globals from record.c
 *
 * Operations on a single MFT record (struct mft_inode).
 */
int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi);
void mi_put(struct mft_inode *mi);
int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno);
int mi_read(struct mft_inode *mi, bool is_mft);
struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr);
// TODO: id?
struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr,
                            enum ATTR_TYPE type, const __le16 *name,
                            u8 name_len, const __le16 *id);
/*
 * rec_find_attr_le - Find in @rec the attribute described by list entry @le.
 *
 * Calls mi_find_attr() from the start of the record (NULL as the previous
 * attribute) with the type, name, name length and id taken from @le.
 *
 * Return: whatever mi_find_attr() returns.
 */
static inline struct ATTRIB *rec_find_attr_le(struct mft_inode *rec,
                                              struct ATTR_LIST_ENTRY *le)
{
        struct ATTRIB *found;

        found = mi_find_attr(rec, NULL, le->type, le_name(le), le->name_len,
                             &le->id);
        return found;
}
/*
 * Writing, formatting and attribute manipulation of one MFT record
 * (record.c).
 */
int mi_write(struct mft_inode *mi, int wait);
int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno,
                  __le16 flags, bool is_mft);
struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
                              const __le16 *name, u8 name_len, u32 asize,
                              u16 name_off);

bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi,
                    struct ATTRIB *attr);
bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes);
int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr,
                 struct runs_tree *run, CLST len);
static inline bool mi_is_ref(const struct mft_inode *mi,
                             const struct MFT_REF *ref)
{
        if (le32_to_cpu(ref->low) != mi->rno)
                return false;
        if (ref->seq != mi->mrec->seq)
                return false;

#ifdef CONFIG_NTFS3_64BIT_CLUSTER
        return le16_to_cpu(ref->high) == (mi->rno >> 32);
#else
        return !ref->high;
#endif
}

/*
 * mi_get_ref - Fill @ref with a reference to record @mi.
 *
 * Stores the record's sequence number and its record number split into a
 * little-endian 32-bit 'low' part and a 16-bit 'high' part. 'high' is zero
 * unless CONFIG_NTFS3_64BIT_CLUSTER is enabled.
 */
static inline void mi_get_ref(const struct mft_inode *mi, struct MFT_REF *ref)
{
        ref->seq = mi->mrec->seq;
#ifdef CONFIG_NTFS3_64BIT_CLUSTER
        ref->high = cpu_to_le16(mi->rno >> 32);
#else
        ref->high = 0;
#endif
        ref->low = cpu_to_le32(mi->rno);
}

/*
 * Globals from run.c
 *
 * Operations on struct runs_tree: lookup, truncation, insertion,
 * packing to and unpacking from a run buffer.
 */
bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn,
                      CLST *len, size_t *index);
void run_truncate(struct runs_tree *run, CLST vcn);
void run_truncate_head(struct runs_tree *run, CLST vcn);
void run_truncate_around(struct runs_tree *run, CLST vcn);
bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len,
                   bool is_mft);
bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len);
bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len);
bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn,
                   CLST *lcn, CLST *len);
bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn);

int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf,
             u32 run_buf_size, CLST *packed_vcns);
int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
               CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
               int run_buf_size);

/*
 * run_unpack_ex() is a separate function only when NTFS3_CHECK_FREE_CLST is
 * defined; otherwise it is an alias for run_unpack().
 */
#ifdef NTFS3_CHECK_FREE_CLST
int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
                  CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
                  int run_buf_size);
#else
#define run_unpack_ex run_unpack
#endif
int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn);
int run_clone(const struct runs_tree *run, struct runs_tree *new_run);

/*
 * Globals from super.c
 *
 * Shared-pointer helpers, metadata unmapping and discard.
 */
void *ntfs_set_shared(void *ptr, u32 bytes);
void *ntfs_put_shared(void *ptr);
void ntfs_unmap_meta(struct super_block *sb, CLST lcn, CLST len);
int ntfs_discard(struct ntfs_sb_info *sbi, CLST Lcn, CLST Len);

/*
 * Globals from bitmap.c
 *
 * Module-level init/exit of the bitmap code and teardown of one wnd_bitmap.
 */
int __init ntfs3_init_bitmap(void);
void ntfs3_exit_bitmap(void);
void wnd_close(struct wnd_bitmap *wnd);
/* wnd_zeroes - Return the total_zeroes counter stored in @wnd. */
static inline size_t wnd_zeroes(const struct wnd_bitmap *wnd)
{
        const size_t zeroes = wnd->total_zeroes;

        return zeroes;
}
/* wnd_bitmap operations; prototypes only, implemented in bitmap.c. */
int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits);
int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits);
int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits);
int wnd_set_used_safe(struct wnd_bitmap *wnd, size_t bit, size_t bits,
                      size_t *done);
bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits);
bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits);

/* Possible values for the 'flags' argument of wnd_find(). */
#define BITMAP_FIND_MARK_AS_USED 0x01
#define BITMAP_FIND_FULL 0x02
size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint,
                size_t flags, size_t *allocated);
int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits);
void wnd_zone_set(struct wnd_bitmap *wnd, size_t Lcn, size_t Len);
int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range);

/*
 * Helpers that take a raw bitmap buffer rather than a wnd_bitmap.
 * NOTE(review): the '_le' suffix presumably means little-endian bit
 * layout - confirm against bitmap.c.
 */
void ntfs_bitmap_set_le(void *map, unsigned int start, int len);
void ntfs_bitmap_clear_le(void *map, unsigned int start, int len);
unsigned int ntfs_bitmap_weight_le(const void *bitmap, int bits);

/*
 * Globals from upcase.c
 *
 * Name comparison and hashing helpers; each takes the upcase table as
 * @upcase. Prototypes only.
 */
int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2,
                   const u16 *upcase, bool bothcase);
int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2,
                       const u16 *upcase, bool bothcase);
unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase,
                              unsigned long hash);

/*
 * Globals from xattr.c
 *
 * With CONFIG_NTFS3_FS_POSIX_ACL the ACL helpers are real functions.
 * Without it, ntfs_get_acl and ntfs_set_acl are defined to NULL and
 * ntfs_init_acl is not declared at all.
 * NOTE(review): callers of ntfs_init_acl presumably guard the call with the
 * same config option - confirm.
 */
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                               int type);
int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                 struct posix_acl *acl, int type);
int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode,
                  struct inode *dir);
#else
#define ntfs_get_acl NULL
#define ntfs_set_acl NULL
#endif

/* Declared regardless of CONFIG_NTFS3_FS_POSIX_ACL. */
int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry);
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
extern const struct xattr_handler *const ntfs_xattr_handlers[];

int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size);
void ntfs_get_wsl_perm(struct inode *inode);

/*
 * Globals from lznt.c
 *
 * Prototypes only. Note the asymmetry: compress_lznt() takes a context
 * obtained from get_lznt_ctx(), decompress_lznt() takes none.
 */
struct lznt *get_lznt_ctx(int level);
size_t compress_lznt(const void *uncompressed, size_t uncompressed_size,
                     void *compressed, size_t compressed_size,
                     struct lznt *ctx);
ssize_t decompress_lznt(const void *compressed, size_t compressed_size,
                        void *uncompressed, size_t uncompressed_size);

/* is_ntfs3 - True when the volume's major version is 3 or higher. */
static inline bool is_ntfs3(struct ntfs_sb_info *sbi)
{
        if (sbi->volume.major_ver < 3)
                return false;

        return true;
}

/*
 * is_mounted - True once sbi->sb->s_root is non-NULL.
 *
 * Cf. (sb->s_flags & SB_ACTIVE).
 */
static inline bool is_mounted(struct ntfs_sb_info *sbi)
{
        return sbi->sb->s_root != NULL;
}

/*
 * ntfs_is_meta_file - Check whether MFT record @rno is a metadata file.
 *
 * Every record below MFT_REC_FREE qualifies, as do the records whose
 * numbers are stored in @sbi as objid_no, quota_no, reparse_no and
 * usn_jrnl_no.
 */
static inline bool ntfs_is_meta_file(struct ntfs_sb_info *sbi, CLST rno)
{
        if (rno < MFT_REC_FREE)
                return true;

        return rno == sbi->objid_no || rno == sbi->quota_no ||
               rno == sbi->reparse_no || rno == sbi->usn_jrnl_no;
}

/*
 * ntfs_unmap_page - kunmap() @page, then release it with put_page().
 *
 * Performs the two steps of ntfs_map_page()'s success path in reverse.
 */
static inline void ntfs_unmap_page(struct page *page)
{
        kunmap(page);
        put_page(page);
}

/*
 * ntfs_map_page - Read page @index of @mapping and kmap() it.
 *
 * Return: the page from read_mapping_page(). If that is an error pointer it
 * is returned unchanged and nothing is mapped.
 */
static inline struct page *ntfs_map_page(struct address_space *mapping,
                                         unsigned long index)
{
        struct page *page = read_mapping_page(mapping, index, NULL);

        if (IS_ERR(page))
                return page;

        kmap(page);
        return page;
}

/* wnd_zone_bit - Return @wnd's zone_bit, the first bit of the zone. */
static inline size_t wnd_zone_bit(const struct wnd_bitmap *wnd)
{
        const size_t first = wnd->zone_bit;

        return first;
}

/*
 * wnd_zone_len - Length of the zone: wnd->zone_end minus wnd->zone_bit.
 *
 * No check is made that zone_end is not below zone_bit.
 */
static inline size_t wnd_zone_len(const struct wnd_bitmap *wnd)
{
        return wnd->zone_end - wnd->zone_bit;
}

/*
 * run_init - Put @run into the empty state.
 *
 * Resets count and allocated to 0 and runs to NULL. Nothing is freed here.
 */
static inline void run_init(struct runs_tree *run)
{
        run->count = 0;
        run->allocated = 0;
        run->runs = NULL;
}

/*
 * run_alloc - Allocate a zeroed runs_tree with GFP_NOFS.
 *
 * Return: the new tree, or whatever kzalloc() returns on failure.
 */
static inline struct runs_tree *run_alloc(void)
{
        struct runs_tree *run = kzalloc(sizeof(*run), GFP_NOFS);

        return run;
}

/*
 * run_close - Release the runs array of @run and zero the whole structure.
 *
 * @run itself is not freed (compare run_free()).
 */
static inline void run_close(struct runs_tree *run)
{
        kvfree(run->runs);
        memset(run, 0, sizeof(struct runs_tree));
}

/*
 * run_free - Free the runs array of @run and then @run itself.
 *
 * A NULL @run is accepted and ignored.
 */
static inline void run_free(struct runs_tree *run)
{
        if (!run)
                return;

        kvfree(run->runs);
        kfree(run);
}

/* run_is_empty - True when @run holds no entries (count is zero). */
static inline bool run_is_empty(struct runs_tree *run)
{
        return run->count == 0;
}

/*
 * ntfs3_bitmap_size - Bytes needed to hold @bits bits.
 *
 * NTFS uses quad aligned bitmaps, so the size is rounded up to a whole
 * number of u64 words.
 */
static inline size_t ntfs3_bitmap_size(size_t bits)
{
        return sizeof(u64) * BITS_TO_U64(bits);
}

/* Number of 100-nanosecond units in one second (10^7). */
#define _100ns2seconds 10000000
/*
 * 11644473600 seconds. Added to / subtracted from Unix seconds when
 * converting to / from nt time.
 * NOTE(review): presumably the distance from the NT epoch (1601-01-01) to
 * the Unix epoch (1970-01-01) - confirm.
 */
#define SecondsToStartOf1970 0x00000002B6109100

/* Nanoseconds per nt time unit; kernel2nt() divides tv_nsec by this. */
#define NTFS_TIME_GRAN 100

/*
 * kernel2nt - Converts in-memory kernel timestamp into nt time.
 */
static inline __le64 kernel2nt(const struct timespec64 *ts)
{
        // 10^7 units of 100 nanoseconds one second
        return cpu_to_le64(_100ns2seconds *
                                   (ts->tv_sec + SecondsToStartOf1970) +
                           ts->tv_nsec / NTFS_TIME_GRAN);
}

/*
 * nt2kernel - Converts on-disk nt time into kernel timestamp.
 */
static inline void nt2kernel(const __le64 tm, struct timespec64 *ts)
{
        u64 t = le64_to_cpu(tm) - _100ns2seconds * SecondsToStartOf1970;

        // WARNING: do_div changes its first argument(!)
        ts->tv_nsec = do_div(t, _100ns2seconds) * 100;
        ts->tv_sec = t;
}

/* ntfs_sb - Return the ntfs_sb_info stored in sb->s_fs_info. */
static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb)
{
        struct ntfs_sb_info *sbi = sb->s_fs_info;

        return sbi;
}

static inline int ntfs3_forced_shutdown(struct super_block *sb)
{
        return test_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags);
}

/*
 * ntfs_up_cluster - Align @size up on cluster boundary.
 *
 * Adds sbi->cluster_mask and then masks with sbi->cluster_mask_inv.
 */
static inline u64 ntfs_up_cluster(const struct ntfs_sb_info *sbi, u64 size)
{
        size += sbi->cluster_mask;

        return size & sbi->cluster_mask_inv;
}

/*
 * ntfs_up_block - Align @size up on block boundary.
 *
 * The block size is sb->s_blocksize; the mask form used here relies on it
 * being a power of two.
 */
static inline u64 ntfs_up_block(const struct super_block *sb, u64 size)
{
        const u64 mask = (u64)(sb->s_blocksize - 1);

        return (size + sb->s_blocksize - 1) & ~mask;
}

/*
 * bytes_to_cluster - Number of clusters needed to hold @size bytes.
 *
 * Rounds up by adding sbi->cluster_mask before shifting right by
 * sbi->cluster_bits.
 */
static inline CLST bytes_to_cluster(const struct ntfs_sb_info *sbi, u64 size)
{
        size += sbi->cluster_mask;

        return size >> sbi->cluster_bits;
}

/*
 * bytes_to_block - Number of blocks needed to hold @size bytes.
 *
 * Rounds up by adding (s_blocksize - 1) before shifting right by
 * s_blocksize_bits.
 */
static inline u64 bytes_to_block(const struct super_block *sb, u64 size)
{
        const u64 rounded = size + sb->s_blocksize - 1;

        return rounded >> sb->s_blocksize_bits;
}

/*
 * ntfs_i - Get the ntfs_inode that embeds @inode as its vfs_inode member.
 */
static inline struct ntfs_inode *ntfs_i(struct inode *inode)
{
        struct ntfs_inode *ni = container_of(inode, struct ntfs_inode,
                                             vfs_inode);

        return ni;
}

/*
 * is_compressed - True when @ni is marked compressed, either through
 * FILE_ATTRIBUTE_COMPRESSED in std_fa or through any bit of
 * NI_FLAG_COMPRESSED_MASK in ni_flags.
 */
static inline bool is_compressed(const struct ntfs_inode *ni)
{
        if (ni->std_fa & FILE_ATTRIBUTE_COMPRESSED)
                return true;

        if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK)
                return true;

        return false;
}

/*
 * ni_ext_compress_bits - Decode the value kept in the
 * NI_FLAG_COMPRESSED_MASK bits of ni_flags by adding 0xb to it.
 */
static inline int ni_ext_compress_bits(const struct ntfs_inode *ni)
{
        return (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) + 0xb;
}

/*
 * ni_set_ext_compress_bits - Store @bits in the NI_FLAG_COMPRESSED_MASK
 * field of ni_flags as (bits - 0xb).
 *
 * Bits - 0xc, 0xd, 0xe, 0xf, 0x10.
 * The value is OR-ed in; bits already set in the field are not cleared.
 */
static inline void ni_set_ext_compress_bits(struct ntfs_inode *ni, u8 bits)
{
        ni->ni_flags = ni->ni_flags |
                       ((bits - 0xb) & NI_FLAG_COMPRESSED_MASK);
}

/* is_dedup - True when NI_FLAG_DEDUPLICATED is set in ni_flags. */
static inline bool is_dedup(const struct ntfs_inode *ni)
{
        return (ni->ni_flags & NI_FLAG_DEDUPLICATED) != 0;
}

/* is_encrypted - True when FILE_ATTRIBUTE_ENCRYPTED is set in std_fa. */
static inline bool is_encrypted(const struct ntfs_inode *ni)
{
        if (ni->std_fa & FILE_ATTRIBUTE_ENCRYPTED)
                return true;

        return false;
}

/* is_sparsed - True when FILE_ATTRIBUTE_SPARSE_FILE is set in std_fa. */
static inline bool is_sparsed(const struct ntfs_inode *ni)
{
        if (ni->std_fa & FILE_ATTRIBUTE_SPARSE_FILE)
                return true;

        return false;
}

/*
 * is_resident - Test NI_FLAG_RESIDENT in ni_flags.
 *
 * Return: the masked flag value converted to int (non-zero when set), not a
 * normalized 0/1.
 */
static inline int is_resident(struct ntfs_inode *ni)
{
        const int resident = ni->ni_flags & NI_FLAG_RESIDENT;

        return resident;
}

/*
 * le16_sub_cpu - Subtract CPU-order @val from the little-endian 16-bit
 * value at @var, in place. The result wraps to 16 bits.
 */
static inline void le16_sub_cpu(__le16 *var, u16 val)
{
        u16 cur = le16_to_cpu(*var);

        cur -= val;
        *var = cpu_to_le16(cur);
}

/*
 * le32_sub_cpu - Subtract CPU-order @val from the little-endian 32-bit
 * value at @var, in place. The result wraps to 32 bits.
 */
static inline void le32_sub_cpu(__le32 *var, u32 val)
{
        u32 cur = le32_to_cpu(*var);

        cur -= val;
        *var = cpu_to_le32(cur);
}

/*
 * nb_put - put_bh() every buffer head held in @nb and mark it empty.
 *
 * When nb->nbufs is already zero the structure is left untouched.
 */
static inline void nb_put(struct ntfs_buffers *nb)
{
        const u32 count = nb->nbufs;
        u32 idx;

        if (count == 0)
                return;

        for (idx = 0; idx != count; idx++)
                put_bh(nb->bh[idx]);

        nb->nbufs = 0;
}

/*
 * put_indx_node - Release index node @in: free in->index, put the buffers
 * in in->nb and free @in itself. A NULL @in is ignored.
 */
static inline void put_indx_node(struct indx_node *in)
{
        if (in) {
                kfree(in->index);
                nb_put(&in->nb);
                kfree(in);
        }
}

/*
 * mi_clear - Release what @mi holds without freeing @mi itself.
 *
 * Puts the buffers in mi->nb via nb_put(), frees mi->mrec and resets that
 * pointer to NULL.
 */
static inline void mi_clear(struct mft_inode *mi)
{
        nb_put(&mi->nb);
        kfree(mi->mrec);
        mi->mrec = NULL;
}

/* ni_lock - Take ni->ni_lock with subclass NTFS_INODE_MUTEX_NORMAL. */
static inline void ni_lock(struct ntfs_inode *ni)
{
        struct mutex *lock = &ni->ni_lock;

        mutex_lock_nested(lock, NTFS_INODE_MUTEX_NORMAL);
}

/* ni_lock_dir - Take ni->ni_lock with subclass NTFS_INODE_MUTEX_PARENT. */
static inline void ni_lock_dir(struct ntfs_inode *ni)
{
        struct mutex *lock = &ni->ni_lock;

        mutex_lock_nested(lock, NTFS_INODE_MUTEX_PARENT);
}

/* ni_lock_dir2 - Take ni->ni_lock with subclass NTFS_INODE_MUTEX_PARENT2. */
static inline void ni_lock_dir2(struct ntfs_inode *ni)
{
        struct mutex *lock = &ni->ni_lock;

        mutex_lock_nested(lock, NTFS_INODE_MUTEX_PARENT2);
}

/* ni_unlock - Release ni->ni_lock. */
static inline void ni_unlock(struct ntfs_inode *ni)
{
        struct mutex *lock = &ni->ni_lock;

        mutex_unlock(lock);
}

/*
 * ni_trylock - Try to take ni->ni_lock without blocking.
 *
 * Return: the result of mutex_trylock().
 */
static inline int ni_trylock(struct ntfs_inode *ni)
{
        struct mutex *lock = &ni->ni_lock;

        return mutex_trylock(lock);
}

/*
 * attr_load_runs_attr - attr_load_runs_vcn() keyed by an ATTRIB.
 *
 * Takes the type, name (via attr_name()) and name_len from @attr and passes
 * them on together with @ni, @run and @vcn.
 *
 * Return: whatever attr_load_runs_vcn() returns.
 */
static inline int attr_load_runs_attr(struct ntfs_inode *ni,
                                      struct ATTRIB *attr,
                                      struct runs_tree *run, CLST vcn)
{
        return attr_load_runs_vcn(ni, attr->type, attr_name(attr),
                                  attr->name_len, run, vcn);
}

/*
 * le64_sub_cpu - Subtract CPU-order @val from the little-endian 64-bit
 * value at @var, in place. The result wraps to 64 bits.
 */
static inline void le64_sub_cpu(__le64 *var, u64 val)
{
        u64 cur = le64_to_cpu(*var);

        cur -= val;
        *var = cpu_to_le64(cur);
}

/* is_legacy_ntfs - Declared here only; the definition is not in this header. */
bool is_legacy_ntfs(struct super_block *sb);

#endif /* _LINUX_NTFS3_NTFS_FS_H */
















































































































































































































   47 































   39 







   31 
   43 


































   39 



























   39 








   28 






























































































































































































   19 
    5 
    4 
   31 

    7 
   16 

   26 
    5 
   37 

    4 

    4 












   23 

    7 






   14 
    1 







    8 
    6 
    4 


    7 

    3 
















   20 
   16 













   28 
    5 
   14 


   13 

    1 






























   28 









































































   11 





























    6 























   18 














   18 








    9 
   11 












    6 










   10 












    1 











   30 




















   30 














































































































































































































   39 


















    7 























    1 




    1 






    6 
    6 
    6 















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Macros for manipulating and testing page->flags
 */

#ifndef PAGE_FLAGS_H
#define PAGE_FLAGS_H

#include <linux/types.h>
#include <linux/bug.h>
#include <linux/mmdebug.h>
#ifndef __GENERATING_BOUNDS_H
#include <linux/mm_types.h>
#include <generated/bounds.h>
#endif /* !__GENERATING_BOUNDS_H */

/*
 * Various page->flags bits:
 *
 * PG_reserved is set for special pages. The "struct page" of such a page
 * should in general not be touched (e.g. set dirty) except by its owner.
 * Pages marked as PG_reserved include:
 * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS,
 *   initrd, HW tables)
 * - Pages reserved or allocated early during boot (before the page allocator
 *   was initialized). This includes (depending on the architecture) the
 *   initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much
 *   much more. Once (if ever) freed, PG_reserved is cleared and they will
 *   be given to the page allocator.
 * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying
 *   to read/write these pages might end badly. Don't touch!
 * - The zero page(s)
 * - Pages not added to the page allocator when onlining a section because
 *   they were excluded via the online_page_callback() or because they are
 *   PG_hwpoison.
 * - Pages allocated in the context of kexec/kdump (loaded kernel image,
 *   control pages, vmcoreinfo)
 * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are
 *   not marked PG_reserved (as they might be in use by somebody else who does
 *   not respect the caching strategy).
 * - Pages part of an offline section (struct pages of offline sections should
 *   not be trusted as they will be initialized when first onlined).
 * - MCA pages on ia64
 * - Pages holding CPU notes for POWER Firmware Assisted Dump
 * - Device memory (e.g. PMEM, DAX, HMM)
 * Some PG_reserved pages will be excluded from the hibernation image.
 * PG_reserved does in general not hinder anybody from dumping or swapping
 * and is no longer required for remap_pfn_range(). ioremap might require it.
 * Consequently, PG_reserved for a page mapped into user space can indicate
 * the zero page, the vDSO, MMIO pages or device memory.
 *
 * The PG_private bitflag is set on pagecache pages if they contain filesystem
 * specific data (which is normally at page->private). It can also be used by
 * private allocations for their own purposes.
 *
 * During initiation of disk I/O, PG_locked is set. This bit is set before I/O
 * and cleared when writeback _starts_ or when read _completes_. PG_writeback
 * is set before writeback starts and cleared when it finishes.
 *
 * PG_locked also pins a page in pagecache, and blocks truncation of the file
 * while it is held.
 *
 * page_waitqueue(page) is a wait queue of all tasks waiting for the page
 * to become unlocked.
 *
 * PG_swapbacked is set when a page uses swap as a backing storage.  These are
 * usually PageAnon or shmem pages, but please note that even anonymous pages
 * might lose their PG_swapbacked flag when they can simply be dropped (e.g. as
 * a result of MADV_FREE).
 *
 * PG_referenced, PG_reclaim are used for page reclaim for anonymous and
 * file-backed pagecache (see mm/vmscan.c).
 *
 * PG_error is set to indicate that an I/O error occurred on this page.
 *
 * PG_arch_1 is an architecture specific page state bit.  The generic code
 * guarantees that this bit is cleared for a page when it first is entered into
 * the page cache.
 *
 * PG_hwpoison indicates that a page got corrupted in hardware and contains
 * data with incorrect ECC bits that triggered a machine check. Accessing is
 * not safe since it may cause another machine check. Don't touch!
 */

/*
 * Don't use the pageflags directly.  Use the PageFoo macros.
 *
 * The page flags field is split into two parts, the main flags area
 * which extends from the low bits upwards, and the fields area which
 * extends from the high bits downwards.
 *
 *  | FIELD | ... | FLAGS |
 *  N-1           ^       0
 *               (NR_PAGEFLAGS)
 *
 * The fields area is reserved for fields mapping zone, node (for NUMA) and
 * SPARSEMEM section (for variants of SPARSEMEM that require section ids like
 * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP).
 */
enum pageflags {
        PG_locked,                /* Page is locked. Don't touch. */
        PG_writeback,                /* Page is under writeback */
        PG_referenced,
        PG_uptodate,
        PG_dirty,
        PG_lru,
        PG_head,                /* Must be in bit 6 */
        PG_waiters,                /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
        PG_active,
        PG_workingset,
        PG_error,
        PG_owner_priv_1,        /* Owner use. If pagecache, fs may use*/
        PG_arch_1,
        PG_reserved,
        PG_private,                /* If pagecache, has fs-private data */
        PG_private_2,                /* If pagecache, has fs aux data */
        PG_mappedtodisk,        /* Has blocks allocated on-disk */
        PG_reclaim,                /* To be reclaimed asap */
        PG_swapbacked,                /* Page is backed by RAM/swap */
        PG_unevictable,                /* Page is "unevictable"  */
#ifdef CONFIG_MMU
        PG_mlocked,                /* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
        PG_uncached,                /* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
        PG_hwpoison,                /* hardware poisoned page. Don't touch */
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
        PG_young,
        PG_idle,
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_X
        PG_arch_2,
        PG_arch_3,
#endif
        __NR_PAGEFLAGS,

        PG_readahead = PG_reclaim,

        /*
         * Depending on the way an anonymous folio can be mapped into a page
         * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped
         * THP), PG_anon_exclusive may be set only for the head page or for
         * tail pages of an anonymous folio. For now, we only expect it to be
         * set on tail pages for PTE-mapped THP.
         */
        PG_anon_exclusive = PG_mappedtodisk,

        /* Filesystems */
        PG_checked = PG_owner_priv_1,

        /* SwapBacked */
        PG_swapcache = PG_owner_priv_1,        /* Swap page: swp_entry_t in private */

        /* Two page bits are conscripted by FS-Cache to maintain local caching
         * state.  These bits are set on pages belonging to the netfs's inodes
         * when those inodes are being locally cached.
         */
        PG_fscache = PG_private_2,        /* page backed by cache */

        /* XEN */
        /* Pinned in Xen as a read-only pagetable page. */
        PG_pinned = PG_owner_priv_1,
        /* Pinned as part of domain save (see xen_mm_pin_all()). */
        PG_savepinned = PG_dirty,
        /* Has a grant mapping of another (foreign) domain's page. */
        PG_foreign = PG_owner_priv_1,
        /* Remapped by swiotlb-xen. */
        PG_xen_remapped = PG_owner_priv_1,

        /* non-lru isolated movable page */
        PG_isolated = PG_reclaim,

        /* Only valid for buddy pages. Used to track pages that are reported */
        PG_reported = PG_uptodate,

#ifdef CONFIG_MEMORY_HOTPLUG
        /* For self-hosted memmap pages */
        PG_vmemmap_self_hosted = PG_owner_priv_1,
#endif

        /*
         * Flags only valid for compound pages.  Stored in first tail page's
         * flags word.  Cannot use the first 8 flags or any flag marked as
         * PF_ANY.
         */

        /* At least one page in this folio has the hwpoison flag set */
        PG_has_hwpoisoned = PG_error,
        PG_large_rmappable = PG_workingset, /* anon or file-backed */
};

/*
 * Mask with the low NR_PAGEFLAGS bits set.
 * NOTE(review): NR_PAGEFLAGS is not defined in this header; presumably it
 * comes from <generated/bounds.h> - confirm.
 */
#define PAGEFLAGS_MASK                ((1UL << NR_PAGEFLAGS) - 1)

#ifndef __GENERATING_BOUNDS_H

#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);

/*
 * Return the real head page struct iff @page is a fake head page, otherwise
 * return @page itself. See Documentation/mm/vmemmap_dedup.rst.
 */
static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
{
        unsigned long head;

        /* Nothing to fix up unless the static key is enabled. */
        if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
                return page;

        /*
         * Only a struct page whose address is aligned to PAGE_SIZE may be a
         * fake head. Checking the alignment first avoids accessing the
         * fields (e.g. compound_head) of @page[1], which can avoid touching
         * a (possibly) cold cacheline in some cases.
         */
        if (!IS_ALIGNED((unsigned long)page, PAGE_SIZE))
                return page;
        if (!test_bit(PG_head, &page->flags))
                return page;

        /*
         * With PG_head set, @page is a compound page composed of at least
         * two contiguous pages, so @page[1] can be accessed safely.
         */
        head = READ_ONCE(page[1].compound_head);
        if (unlikely(!(head & 1)))
                return page;

        return (const struct page *)(head - 1);
}
#else
/* Without CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP there are no fake heads. */
static inline const struct page *page_fixed_fake_head(const struct page *page)
{
        return page;
}
#endif

/*
 * page_is_fake_head - Non-zero when page_fixed_fake_head() maps @page to a
 * different struct page.
 */
static __always_inline int page_is_fake_head(const struct page *page)
{
        const struct page *fixed = page_fixed_fake_head(page);

        return fixed != page;
}

/*
 * _compound_head - Address of the head page of @page, as an unsigned long.
 *
 * When bit 0 of page->compound_head is set, the head is that value with
 * the bit cleared. Otherwise the result is page_fixed_fake_head(@page).
 */
static inline unsigned long _compound_head(const struct page *page)
{
        unsigned long head = READ_ONCE(page->compound_head);

        if (likely(!(head & 1)))
                return (unsigned long)page_fixed_fake_head(page);

        return head - 1;
}

/*
 * compound_head - _compound_head() with the result cast back to the type of
 * @page, so the constness of the argument carries over to the result.
 */
#define compound_head(page)        ((typeof(page))_compound_head(page))

/**
 * page_folio - Converts from page to folio.
 * @p: The page.
 *
 * Every page is part of a folio.  This function cannot be called on a
 * NULL pointer.
 *
 * The _Generic selection only accepts 'struct page *' and
 * 'const struct page *', and the result keeps the constness of @p.
 *
 * Context: No reference, nor lock is required on @p.  If the caller
 * does not hold a reference, this call may race with a folio split, so
 * it should re-check the folio still contains this page after gaining
 * a reference on the folio.
 * Return: The folio which contains this page.
 */
#define page_folio(p)                (_Generic((p),                                \
        const struct page *:        (const struct folio *)_compound_head(p), \
        struct page *:                (struct folio *)_compound_head(p)))

/**
 * folio_page - Return a page from a folio.
 * @folio: The folio.
 * @n: The page number to return.
 *
 * @n is relative to the start of the folio.  This function does not
 * check that the page number lies within @folio; the caller is presumed
 * to have a reference to the page.
 *
 * Return: The result of nth_page() applied to the folio's first page and @n.
 */
#define folio_page(folio, n)        nth_page(&(folio)->page, n)

/*
 * PageTail - 1 when @page is a tail page: either bit 0 of compound_head is
 * set or @page is a fake head (see page_is_fake_head()); 0 otherwise.
 */
static __always_inline int PageTail(const struct page *page)
{
        if (READ_ONCE(page->compound_head) & 1)
                return 1;

        return page_is_fake_head(page);
}

/*
 * PageCompound - 1 when @page is part of a compound page: either PG_head is
 * set in its flags or bit 0 of compound_head is set; 0 otherwise.
 */
static __always_inline int PageCompound(const struct page *page)
{
        if (test_bit(PG_head, &page->flags))
                return 1;

        return (READ_ONCE(page->compound_head) & 1) != 0;
}

/* Value page->flags is compared against to recognise a poisoned struct page. */
#define        PAGE_POISON_PATTERN        -1l

/* PagePoisoned - Non-zero when page->flags equals PAGE_POISON_PATTERN. */
static inline int PagePoisoned(const struct page *page)
{
        const unsigned long flags = READ_ONCE(page->flags);

        return flags == PAGE_POISON_PATTERN;
}

/*
 * page_init_poison() is a real function (defined elsewhere) only with
 * CONFIG_DEBUG_VM; otherwise it is an empty inline stub.
 */
#ifdef CONFIG_DEBUG_VM
void page_init_poison(struct page *page, size_t size);
#else
static inline void page_init_poison(struct page *page, size_t size)
{
}
#endif

/*
 * const_folio_flags - Read-only pointer to the flags word of page @n of
 * @folio, with @n counted from &folio->page.
 *
 * The VM_BUG_ON_PGFLAGS() checks catch a @folio that is really a tail page
 * and any @n > 0 on a page that does not have PG_head set.
 */
static const unsigned long *const_folio_flags(const struct folio *folio,
                unsigned n)
{
        const struct page *page = &folio->page;

        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
        return &page[n].flags;
}

/*
 * folio_flags - Writable pointer to the flags word of page @n of @folio,
 * with @n counted from &folio->page.
 *
 * The VM_BUG_ON_PGFLAGS() checks catch a @folio that is really a tail page
 * and any @n > 0 on a page that does not have PG_head set.
 */
static unsigned long *folio_flags(struct folio *folio, unsigned n)
{
        struct page *page = &folio->page;

        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
        return &page[n].flags;
}

/*
 * Page flags policies wrt compound pages
 *
 * Each policy macro takes (page, enforce) and evaluates to the struct page
 * pointer whose flags word should be accessed. In this header the test
 * accessors pass enforce == 0 and the modifying accessors pass enforce == 1.
 * Only PF_NO_TAIL and PF_NO_COMPOUND look at @enforce.
 *
 * PF_POISONED_CHECK
 *     check if this struct page is poisoned/uninitialized
 *
 * PF_ANY:
 *     the page flag is relevant for small, head and tail pages.
 *
 * PF_HEAD:
 *     for compound page all operations related to the page flag applied to
 *     head page.
 *
 * PF_NO_TAIL:
 *     modifications of the page flag must be done on small or head pages,
 *     checks can be done on tail pages too.
 *
 * PF_NO_COMPOUND:
 *     the page flag is not relevant for compound pages.
 *
 * PF_SECOND:
 *     the page flag is stored in the first tail page.
 */
#define PF_POISONED_CHECK(page) ({                                        \
                VM_BUG_ON_PGFLAGS(PagePoisoned(page), page);                \
                page; })
#define PF_ANY(page, enforce)        PF_POISONED_CHECK(page)
#define PF_HEAD(page, enforce)        PF_POISONED_CHECK(compound_head(page))
#define PF_NO_TAIL(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page);        \
                PF_POISONED_CHECK(compound_head(page)); })
#define PF_NO_COMPOUND(page, enforce) ({                                \
                VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page);        \
                PF_POISONED_CHECK(page); })
#define PF_SECOND(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS(!PageHead(page), page);                \
                PF_POISONED_CHECK(&page[1]); })

/*
 * Which page is the flag stored in.
 *
 * FOLIO_<policy> is the page index handed to folio_flags() /
 * const_folio_flags() by the generated folio accessors: 0 selects the
 * folio's first page, 1 the page after it. Only PF_SECOND uses index 1.
 */
#define FOLIO_PF_ANY                0
#define FOLIO_PF_HEAD                0
#define FOLIO_PF_NO_TAIL        0
#define FOLIO_PF_NO_COMPOUND        0
#define FOLIO_PF_SECOND                1

/* The same two indices under policy-independent names. */
#define FOLIO_HEAD_PAGE                0
#define FOLIO_SECOND_PAGE        1

/*
 * Macros to create function definitions for page flags
 *
 * Each FOLIO_*_FLAG(name, page) macro defines one __always_inline folio
 * accessor for bit PG_<name>. @page is the page index within the folio
 * (see FOLIO_PF_* / FOLIO_HEAD_PAGE / FOLIO_SECOND_PAGE) whose flags word
 * holds the bit.
 */

/* folio_test_<name>(): test_bit() on a const folio. */
#define FOLIO_TEST_FLAG(name, page)                                        \
static __always_inline bool folio_test_##name(const struct folio *folio) \
{ return test_bit(PG_##name, const_folio_flags(folio, page)); }

/* folio_set_<name>(): set_bit(). */
#define FOLIO_SET_FLAG(name, page)                                        \
static __always_inline void folio_set_##name(struct folio *folio)        \
{ set_bit(PG_##name, folio_flags(folio, page)); }

/* folio_clear_<name>(): clear_bit(). */
#define FOLIO_CLEAR_FLAG(name, page)                                        \
static __always_inline void folio_clear_##name(struct folio *folio)        \
{ clear_bit(PG_##name, folio_flags(folio, page)); }

/* __folio_set_<name>(): __set_bit() variant. */
#define __FOLIO_SET_FLAG(name, page)                                        \
static __always_inline void __folio_set_##name(struct folio *folio)        \
{ __set_bit(PG_##name, folio_flags(folio, page)); }

/* __folio_clear_<name>(): __clear_bit() variant. */
#define __FOLIO_CLEAR_FLAG(name, page)                                        \
static __always_inline void __folio_clear_##name(struct folio *folio)        \
{ __clear_bit(PG_##name, folio_flags(folio, page)); }

/* folio_test_set_<name>(): test_and_set_bit(); returns the old bit. */
#define FOLIO_TEST_SET_FLAG(name, page)                                        \
static __always_inline bool folio_test_set_##name(struct folio *folio)        \
{ return test_and_set_bit(PG_##name, folio_flags(folio, page)); }

/* folio_test_clear_<name>(): test_and_clear_bit(); returns the old bit. */
#define FOLIO_TEST_CLEAR_FLAG(name, page)                                \
static __always_inline bool folio_test_clear_##name(struct folio *folio) \
{ return test_and_clear_bit(PG_##name, folio_flags(folio, page)); }

/* Test + set + clear accessors in one go. */
#define FOLIO_FLAG(name, page)                                                \
FOLIO_TEST_FLAG(name, page)                                                \
FOLIO_SET_FLAG(name, page)                                                \
FOLIO_CLEAR_FLAG(name, page)

/*
 * Page-based accessor generators.
 *
 * Each macro takes (uname, lname, policy): @uname forms the Page<uname>
 * style function name, @lname selects the PG_<lname> bit and the matching
 * folio accessor name, and @policy is one of the PF_* macros. Every macro
 * also emits the corresponding folio accessor, using FOLIO_<policy> as the
 * page index. The page functions pass enforce == 0 to the policy when only
 * testing and enforce == 1 when modifying.
 */

/* Page<uname>(): test the bit. */
#define TESTPAGEFLAG(uname, lname, policy)                                \
FOLIO_TEST_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline int Page##uname(const struct page *page)                \
{ return test_bit(PG_##lname, &policy(page, 0)->flags); }

/* SetPage<uname>(): set_bit(). */
#define SETPAGEFLAG(uname, lname, policy)                                \
FOLIO_SET_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline void SetPage##uname(struct page *page)                \
{ set_bit(PG_##lname, &policy(page, 1)->flags); }

/* ClearPage<uname>(): clear_bit(). */
#define CLEARPAGEFLAG(uname, lname, policy)                                \
FOLIO_CLEAR_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline void ClearPage##uname(struct page *page)                \
{ clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* __SetPage<uname>(): __set_bit() variant. */
#define __SETPAGEFLAG(uname, lname, policy)                                \
__FOLIO_SET_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline void __SetPage##uname(struct page *page)                \
{ __set_bit(PG_##lname, &policy(page, 1)->flags); }

/* __ClearPage<uname>(): __clear_bit() variant. */
#define __CLEARPAGEFLAG(uname, lname, policy)                                \
__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy)                                \
static __always_inline void __ClearPage##uname(struct page *page)        \
{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* TestSetPage<uname>(): test_and_set_bit(); returns the old bit. */
#define TESTSETFLAG(uname, lname, policy)                                \
FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy)                                \
static __always_inline int TestSetPage##uname(struct page *page)        \
{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }

/* TestClearPage<uname>(): test_and_clear_bit(); returns the old bit. */
#define TESTCLEARFLAG(uname, lname, policy)                                \
FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy)                                \
static __always_inline int TestClearPage##uname(struct page *page)        \
{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* Test + set + clear accessors in one go. */
#define PAGEFLAG(uname, lname, policy)                                        \
        TESTPAGEFLAG(uname, lname, policy)                                \
        SETPAGEFLAG(uname, lname, policy)                                \
        CLEARPAGEFLAG(uname, lname, policy)

#define __PAGEFLAG(uname, lname, policy)                                \
        TESTPAGEFLAG(uname, lname, policy)                                \
        __SETPAGEFLAG(uname, lname, policy)                                \
        __CLEARPAGEFLAG(uname, lname, policy)

#define TESTSCFLAG(uname, lname, policy)                                \
        TESTSETFLAG(uname, lname, policy)                                \
        TESTCLEARFLAG(uname, lname, policy)

/*
 * Stub generators with the same function names as the real accessors.
 * They are used further down in the #else branches of configuration
 * checks: tests always report false/0 and set/clear operations do nothing.
 */

/* folio_test_<name>() that is always false. */
#define FOLIO_TEST_FLAG_FALSE(name)                                        \
static inline bool folio_test_##name(const struct folio *folio)                \
{ return false; }
/* folio_set_<name>() that does nothing. */
#define FOLIO_SET_FLAG_NOOP(name)                                        \
static inline void folio_set_##name(struct folio *folio) { }
/* folio_clear_<name>() that does nothing. */
#define FOLIO_CLEAR_FLAG_NOOP(name)                                        \
static inline void folio_clear_##name(struct folio *folio) { }
/* __folio_set_<name>() that does nothing. */
#define __FOLIO_SET_FLAG_NOOP(name)                                        \
static inline void __folio_set_##name(struct folio *folio) { }
/* __folio_clear_<name>() that does nothing. */
#define __FOLIO_CLEAR_FLAG_NOOP(name)                                        \
static inline void __folio_clear_##name(struct folio *folio) { }
/* folio_test_set_<name>() that is always false and changes nothing. */
#define FOLIO_TEST_SET_FLAG_FALSE(name)                                        \
static inline bool folio_test_set_##name(struct folio *folio)                \
{ return false; }
/* folio_test_clear_<name>() that is always false and changes nothing. */
#define FOLIO_TEST_CLEAR_FLAG_FALSE(name)                                \
static inline bool folio_test_clear_##name(struct folio *folio)                \
{ return false; }

/* Stub counterpart of FOLIO_FLAG(): false test, no-op set and clear. */
#define FOLIO_FLAG_FALSE(name)                                                \
FOLIO_TEST_FLAG_FALSE(name)                                                \
FOLIO_SET_FLAG_NOOP(name)                                                \
FOLIO_CLEAR_FLAG_NOOP(name)

/* Stub counterpart of TESTPAGEFLAG(): Page<uname>() always returns 0. */
#define TESTPAGEFLAG_FALSE(uname, lname)                                \
FOLIO_TEST_FLAG_FALSE(lname)                                                \
static inline int Page##uname(const struct page *page) { return 0; }

/* Stub counterpart of SETPAGEFLAG(): SetPage<uname>() does nothing. */
#define SETPAGEFLAG_NOOP(uname, lname)                                        \
FOLIO_SET_FLAG_NOOP(lname)                                                \
static inline void SetPage##uname(struct page *page) {  }

/* Stub counterpart of CLEARPAGEFLAG(): ClearPage<uname>() does nothing. */
#define CLEARPAGEFLAG_NOOP(uname, lname)                                \
FOLIO_CLEAR_FLAG_NOOP(lname)                                                \
static inline void ClearPage##uname(struct page *page) {  }

/* Stub counterpart of __CLEARPAGEFLAG(): __ClearPage<uname>() does nothing. */
#define __CLEARPAGEFLAG_NOOP(uname, lname)                                \
__FOLIO_CLEAR_FLAG_NOOP(lname)                                                \
static inline void __ClearPage##uname(struct page *page) {  }

/* Stub counterpart of TESTSETFLAG(): TestSetPage<uname>() returns 0. */
#define TESTSETFLAG_FALSE(uname, lname)                                        \
FOLIO_TEST_SET_FLAG_FALSE(lname)                                        \
static inline int TestSetPage##uname(struct page *page) { return 0; }

/* Stub counterpart of TESTCLEARFLAG(): TestClearPage<uname>() returns 0. */
#define TESTCLEARFLAG_FALSE(uname, lname)                                \
FOLIO_TEST_CLEAR_FLAG_FALSE(lname)                                        \
static inline int TestClearPage##uname(struct page *page) { return 0; }

/* Stub counterpart of PAGEFLAG(). */
#define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname)        \
        SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname)

/* Stub counterpart of TESTSCFLAG(). */
#define TESTSCFLAG_FALSE(uname, lname)                                        \
        TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname)

/*
 * Accessor instantiations for the core flags.  Each invocation expands to
 * static inline functions (see the generator macros above); indented
 * continuation lines add further accessors for the flag named on the
 * preceding line.  PG_waiters and PG_referenced get folio accessors only.
 */
__PAGEFLAG(Locked, locked, PF_NO_TAIL)
FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE)
PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE)
        __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE)
PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
        __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
        TESTCLEARFLAG(LRU, lru, PF_HEAD)
PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
        TESTCLEARFLAG(Active, active, PF_HEAD)
PAGEFLAG(Workingset, workingset, PF_HEAD)
        TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND)           /* Used by some filesystems */

/*
 * Xen page flags; all of them use the PF_NO_COMPOUND policy.
 *
 * The invocations expand to complete function definitions, so they are
 * written without a trailing semicolon like every other instantiation in
 * this file (a stray ';' would be an empty declaration at file scope).
 */
PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
        TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND)
PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND)
PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
        TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)

/*
 * PG_reserved and PG_swapbacked: test/set/clear accessors plus the
 * __set_bit()/__clear_bit() based __Set/__Clear variants.
 */
PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
        __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
        __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
        __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)
        __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL)

/*
 * Private page markings that may be used by the filesystem that owns the page
 * for its own purposes.
 * - PG_private and PG_private_2 cause release_folio() and co to be invoked
 *
 * All three flags use the PF_ANY policy.  PG_private_2 additionally gets
 * test-and-set/test-and-clear accessors; PG_owner_priv_1 gets
 * test-and-clear only.
 */
PAGEFLAG(Private, private, PF_ANY)
PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
        TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)

/*
 * Only the test, test-and-set and test-and-clear operations exist for
 * PG_writeback.  The unconditional set/clear operators are deliberately not
 * generated because they are risky: they bypass page accounting.
 */
TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
        TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
/* PG_mappedtodisk: plain test/set/clear accessors. */
PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)

/*
 * PG_readahead is only used for reads; PG_reclaim is only for writes.
 * Both get test/set/clear plus test-and-clear accessors.
 */
PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
        TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND)
        TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND)

/*
 * PageHighMem()/folio_test_highmem(): with CONFIG_HIGHMEM these are derived
 * from the zone number of the page/folio; without it they are constant-false
 * stubs (together with no-op set/clear stubs).
 */
#ifdef CONFIG_HIGHMEM
/*
 * Must use a macro here due to header dependency issues. page_zone() is not
 * available at this point.
 */
#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
#define folio_test_highmem(__f)        is_highmem_idx(folio_zonenum(__f))
#else
PAGEFLAG_FALSE(HighMem, highmem)
#endif

#ifdef CONFIG_SWAP
/*
 * A folio is reported as being in the swap cache only when it is swap-backed
 * and PG_swapcache is set.  The swap-backed test is evaluated first; the
 * PG_swapcache bit is not looked at when that test fails.
 */
static __always_inline bool folio_test_swapcache(const struct folio *folio)
{
        if (!folio_test_swapbacked(folio))
                return false;

        return test_bit(PG_swapcache, const_folio_flags(folio, 0));
}

/* Page flavour of folio_test_swapcache(): answers for page_folio(@page). */
static __always_inline bool PageSwapCache(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_swapcache(folio);
}

/* The test is open-coded above, so only set and clear are generated. */
SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
#else
/* Without CONFIG_SWAP: constant-false test, no-op set and clear. */
PAGEFLAG_FALSE(SwapCache, swapcache)
#endif

/* PG_unevictable: test/set/clear, __clear and test-and-clear accessors. */
PAGEFLAG(Unevictable, unevictable, PF_HEAD)
        __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD)
        TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD)

/*
 * PG_mlocked accessors exist only with CONFIG_MMU; otherwise the same
 * function names are provided as constant-false/no-op stubs.
 */
#ifdef CONFIG_MMU
PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
        __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL)
        TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked)
        TESTSCFLAG_FALSE(Mlocked, mlocked)
#endif

/* PG_uncached accessors, stubbed out unless the architecture uses the flag. */
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND)
#else
PAGEFLAG_FALSE(Uncached, uncached)
#endif

/*
 * PG_hwpoison accessors and the related declarations exist only with
 * CONFIG_MEMORY_FAILURE.  Without it the accessors are constant-false/no-op
 * stubs and __PG_HWPOISON is 0.
 */
#ifdef CONFIG_MEMORY_FAILURE
PAGEFLAG(HWPoison, hwpoison, PF_ANY)
TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
/* Bit mask form of PG_hwpoison for use in flag-mask expressions. */
#define __PG_HWPOISON (1UL << PG_hwpoison)
#define MAGIC_HWPOISON        0x48575053U        /* HWPS */
/* Defined out of line; NOTE(review): semantics not visible in this header. */
extern void SetPageHWPoisonTakenOff(struct page *page);
extern void ClearPageHWPoisonTakenOff(struct page *page);
extern bool take_page_off_buddy(struct page *page);
extern bool put_page_back_buddy(struct page *page);
#else
PAGEFLAG_FALSE(HWPoison, hwpoison)
#define __PG_HWPOISON 0
#endif

/*
 * "young" and "idle" folio flags.
 *
 * With CONFIG_PAGE_IDLE_FLAG on 64-bit, young gets test, set and
 * test-and-clear accessors and idle gets test, set and clear.  With
 * CONFIG_PAGE_IDLE_FLAG on 32-bit nothing is defined here.  Without
 * CONFIG_PAGE_IDLE_FLAG every accessor is a constant-false/no-op stub.
 */
#ifdef CONFIG_PAGE_IDLE_FLAG
#ifdef CONFIG_64BIT
FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_FLAG(idle, FOLIO_HEAD_PAGE)
#endif
/* See page_idle.h for !64BIT workaround */
#else /* !CONFIG_PAGE_IDLE_FLAG */
FOLIO_FLAG_FALSE(young)
FOLIO_TEST_CLEAR_FLAG_FALSE(young)
FOLIO_FLAG_FALSE(idle)
#endif

/*
 * PageReported() is used to track reported free pages within the Buddy
 * allocator. We can use the non-atomic version of the test and set
 * operations as both should be shielded with the zone lock to prevent
 * any possible races on the setting or clearing of the bit.
 *
 * Hence __PAGEFLAG(): the set and clear accessors are the double-underscore
 * __SetPageReported()/__ClearPageReported() variants.
 */
__PAGEFLAG(Reported, reported, PF_NO_COMPOUND)

/*
 * PG_vmemmap_self_hosted accessors; constant-false/no-op stubs unless
 * CONFIG_MEMORY_HOTPLUG is enabled.
 */
#ifdef CONFIG_MEMORY_HOTPLUG
PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY)
#else
PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
#endif

/*
 * On an anonymous page mapped into a user virtual memory area,
 * page->mapping points to its anon_vma, not to a struct address_space;
 * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
 *
 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
 * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON
 * bit; and then page->mapping points, not to an anon_vma, but to a private
 * structure which KSM associates with that merged page.  See ksm.h.
 *
 * PAGE_MAPPING_MOVABLE without PAGE_MAPPING_ANON is used for non-lru movable
 * pages, and then page->mapping points to a struct movable_operations.
 *
 * Please note that, confusingly, "page_mapping" refers to the inode
 * address_space which maps the page from disk; whereas "page_mapped"
 * refers to user virtual address space into which the page is mapped.
 *
 * For slab pages, since slab reuses the bits in struct page to store its
 * internal states, the page->mapping does not exist as such, nor do these
 * flags below.  So in order to avoid testing non-existent bits, please
 * make sure that PageSlab(page) actually evaluates to false before calling
 * the following functions (e.g., PageAnon).  See mm/slab.h.
 */
/* Tag bits kept in the low two bits of the ->mapping pointer value. */
#define PAGE_MAPPING_ANON        0x1
#define PAGE_MAPPING_MOVABLE        0x2
/* Both bits set marks a KSM page; this has the same value as the full mask. */
#define PAGE_MAPPING_KSM        (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
/* Mask covering every tag bit. */
#define PAGE_MAPPING_FLAGS        (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)

/*
 * Unlike the flags above, this value is used only in fsdax mode.  It
 * indicates that this page->mapping is currently in the reflink (shared)
 * case.  Note that it is a pointer constant, not an integer bit mask.
 */
#define PAGE_MAPPING_DAX_SHARED        ((void *)0x1)

/* True when any PAGE_MAPPING_FLAGS tag bit is set in folio->mapping. */
static __always_inline bool folio_mapping_flags(const struct folio *folio)
{
        unsigned long tag_bits;

        tag_bits = (unsigned long)folio->mapping & PAGE_MAPPING_FLAGS;
        return tag_bits != 0;
}

/*
 * True when any PAGE_MAPPING_FLAGS tag bit is set in page->mapping.  The
 * field of @page itself is inspected; no folio lookup is performed.
 */
static __always_inline bool PageMappingFlags(const struct page *page)
{
        unsigned long tag_bits;

        tag_bits = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;
        return tag_bits != 0;
}

/* True when the PAGE_MAPPING_ANON tag bit is set in folio->mapping. */
static __always_inline bool folio_test_anon(const struct folio *folio)
{
        unsigned long anon_bit;

        anon_bit = (unsigned long)folio->mapping & PAGE_MAPPING_ANON;
        return anon_bit != 0;
}

/* Page flavour of folio_test_anon(): answers for page_folio(@page). */
static __always_inline bool PageAnon(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_anon(folio);
}

/*
 * True when, of the PAGE_MAPPING_FLAGS tag bits in folio->mapping, exactly
 * PAGE_MAPPING_MOVABLE is set (i.e. the ANON bit is clear).
 */
static __always_inline bool __folio_test_movable(const struct folio *folio)
{
        unsigned long tag_bits;

        tag_bits = (unsigned long)folio->mapping & PAGE_MAPPING_FLAGS;
        return tag_bits == PAGE_MAPPING_MOVABLE;
}

/*
 * True when, of the PAGE_MAPPING_FLAGS tag bits in page->mapping, exactly
 * PAGE_MAPPING_MOVABLE is set.  The field of @page itself is inspected.
 */
static __always_inline bool __PageMovable(const struct page *page)
{
        unsigned long tag_bits;

        tag_bits = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;
        return tag_bits == PAGE_MAPPING_MOVABLE;
}

#ifdef CONFIG_KSM
/*
 * A KSM page is one of those write-protected "shared pages" or "merged pages"
 * which KSM maps into multiple mms, wherever identical anonymous page content
 * is found in VM_MERGEABLE vmas.  It's a PageAnon page, pointing not to any
 * anon_vma, but to that page's node of the stable tree.
 *
 * The test is that both tag bits (PAGE_MAPPING_KSM) are set in ->mapping.
 */
static __always_inline bool folio_test_ksm(const struct folio *folio)
{
        unsigned long tag_bits;

        tag_bits = (unsigned long)folio->mapping & PAGE_MAPPING_FLAGS;
        return tag_bits == PAGE_MAPPING_KSM;
}

/* Page flavour of folio_test_ksm(): answers for page_folio(@page). */
static __always_inline bool PageKsm(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_ksm(folio);
}
#else
/* Without CONFIG_KSM both tests are constant-false stubs. */
TESTPAGEFLAG_FALSE(Ksm, ksm)
#endif

u64 stable_page_flags(const struct page *page);

/**
 * folio_xor_flags_has_waiters - Change some folio flags.
 * @folio: The folio.
 * @mask: Bits set in this word will be changed.
 *
 * Only use this for flags that are changed with the folio lock held.
 * PG_dirty, for instance, is unsafe here because it can be set without
 * the folio lock.  The flags must also lie in the range 0-6, since some
 * of the implementations only affect those bits.
 *
 * Return: Whether there are tasks waiting on the folio.
 */
static inline bool folio_xor_flags_has_waiters(struct folio *folio,
                unsigned long mask)
{
        bool has_waiters;

        has_waiters = xor_unlock_is_negative_byte(mask, folio_flags(folio, 0));
        return has_waiters;
}

/**
 * folio_test_uptodate - Is this folio up to date?
 * @folio: The folio.
 *
 * The uptodate flag is set on a folio when every byte in the folio is
 * at least as new as the corresponding bytes on storage.  Anonymous
 * and CoW folios are always uptodate.  If the folio is not uptodate,
 * some of the bytes in it may be; see the is_partially_uptodate()
 * address_space operation.
 */
static inline bool folio_test_uptodate(const struct folio *folio)
{
        bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0));
        /*
         * Must ensure that the data we read out of the folio is loaded
         * _after_ we've loaded folio->flags to check the uptodate bit.
         * We can skip the barrier if the folio is not uptodate, because
         * we wouldn't be reading anything from it.
         *
         * See folio_mark_uptodate() for the other side of the story.
         */
        if (ret)
                smp_rmb();

        return ret;
}

/* Page flavour of folio_test_uptodate(): answers for page_folio(@page). */
static inline bool PageUptodate(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_uptodate(folio);
}

/*
 * Variant of folio_mark_uptodate() that sets PG_uptodate with __set_bit()
 * instead of set_bit().  The write barrier is issued before the bit is set,
 * exactly as in folio_mark_uptodate().
 */
static __always_inline void __folio_mark_uptodate(struct folio *folio)
{
        smp_wmb();
        __set_bit(PG_uptodate, folio_flags(folio, 0));
}

/*
 * Mark @folio uptodate: issue a write barrier, then set PG_uptodate with
 * set_bit().  Pairs with the read barrier in folio_test_uptodate().
 */
static __always_inline void folio_mark_uptodate(struct folio *folio)
{
        /*
         * Memory barrier must be issued before setting the PG_uptodate bit,
         * so that all previous stores issued in order to bring the folio
         * uptodate are actually visible before folio_test_uptodate becomes true.
         */
        smp_wmb();
        set_bit(PG_uptodate, folio_flags(folio, 0));
}

/*
 * Page flavour of __folio_mark_uptodate().  @page is reinterpreted as a
 * folio by a plain cast; no head-page lookup is performed.
 */
static __always_inline void __SetPageUptodate(struct page *page)
{
        struct folio *folio = (struct folio *)page;

        __folio_mark_uptodate(folio);
}

/*
 * Page flavour of folio_mark_uptodate().  @page is reinterpreted as a
 * folio by a plain cast; no head-page lookup is performed.
 */
static __always_inline void SetPageUptodate(struct page *page)
{
        struct folio *folio = (struct folio *)page;

        folio_mark_uptodate(folio);
}

/* Test and set are open-coded above (barriers); only clear is generated. */
CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)

/* Defined out of line; only the prototypes are visible in this header. */
void __folio_start_writeback(struct folio *folio, bool keep_write);
void set_page_writeback(struct page *page);

/* Wrappers that fix the keep_write argument of __folio_start_writeback(). */
#define folio_start_writeback(folio)                        \
        __folio_start_writeback(folio, false)
#define folio_start_writeback_keepwrite(folio)        \
        __folio_start_writeback(folio, true)

/* True when PG_head is set in the folio's flags (FOLIO_PF_ANY index). */
static __always_inline bool folio_test_head(const struct folio *folio)
{
        if (test_bit(PG_head, const_folio_flags(folio, FOLIO_PF_ANY)))
                return true;

        return false;
}

/*
 * Returns 1 when @page has PG_head set and is not a fake head, 0 otherwise.
 * The poison check runs first; the fake-head test is evaluated only when
 * PG_head is set.
 */
static __always_inline int PageHead(const struct page *page)
{
        PF_POISONED_CHECK(page);

        if (!test_bit(PG_head, &page->flags))
                return 0;

        return !page_is_fake_head(page);
}

/*
 * PageHead() is open-coded above; generate __SetPageHead(),
 * __ClearPageHead() and ClearPageHead() (plus their folio counterparts).
 */
__SETPAGEFLAG(Head, head, PF_ANY)
__CLEARPAGEFLAG(Head, head, PF_ANY)
CLEARPAGEFLAG(Head, head, PF_ANY)

/**
 * folio_test_large() - Does this folio contain more than one page?
 * @folio: The folio to test.
 *
 * Implemented as the PG_head test, folio_test_head().
 *
 * Return: True if the folio is larger than one page.
 */
static inline bool folio_test_large(const struct folio *folio)
{
        bool is_head = folio_test_head(folio);

        return is_head;
}

/*
 * Record @head in page->compound_head.  The value stored is the address of
 * @head plus one, written with WRITE_ONCE().
 */
static __always_inline void set_compound_head(struct page *page, struct page *head)
{
        unsigned long tagged_head = (unsigned long)head + 1;

        WRITE_ONCE(page->compound_head, tagged_head);
}

/* Reset page->compound_head to 0, written with WRITE_ONCE(). */
static __always_inline void clear_compound_head(struct page *page)
{
        WRITE_ONCE(page->compound_head, 0);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Clear PG_head on @page.  It is a BUG() to call this on a page for which
 * PageHead() is false.
 */
static inline void ClearPageCompound(struct page *page)
{
        BUG_ON(!PageHead(page));
        ClearPageHead(page);
}
/* PG_large_rmappable accessors; the bit lives at FOLIO_SECOND_PAGE. */
FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
#else
/* Without THP the large_rmappable accessors are false/no-op stubs. */
FOLIO_FLAG_FALSE(large_rmappable)
#endif

/* Bit mask form of PG_head. */
#define PG_head_mask ((1UL << PG_head))

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * PageHuge() only returns true for hugetlbfs pages, but not for
 * normal or transparent huge pages.
 *
 * PageTransHuge() returns true for both transparent huge and
 * hugetlbfs pages, but not normal pages. PageTransHuge() can only be
 * called in the core VM paths where hugetlbfs pages can't exist.
 *
 * Must not be called on a tail page (VM_BUG_ON_PAGE); the result is
 * PageHead(page).
 */
static inline int PageTransHuge(const struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);
        return PageHead(page);
}

/*
 * PageTransCompound returns true for both transparent huge pages
 * and hugetlbfs pages, so it should only be called when it's known
 * that hugetlbfs pages aren't involved.
 *
 * The result is PageCompound(page).
 */
static inline int PageTransCompound(const struct page *page)
{
        return PageCompound(page);
}

/*
 * PageTransTail returns true for both transparent huge pages
 * and hugetlbfs pages, so it should only be called when it's known
 * that hugetlbfs pages aren't involved.
 *
 * The result is PageTail(page).
 */
static inline int PageTransTail(const struct page *page)
{
        return PageTail(page);
}
#else
/*
 * Without THP every test is a constant-false stub.
 * NOTE(review): TransCompoundMap has no counterpart in the THP branch
 * above - confirm whether this stub still has users.
 */
TESTPAGEFLAG_FALSE(TransHuge, transhuge)
TESTPAGEFLAG_FALSE(TransCompound, transcompound)
TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap)
TESTPAGEFLAG_FALSE(TransTail, transtail)
#endif

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the
 * compound page.
 *
 * This flag is set by hwpoison handler.  Cleared by THP split or free page.
 *
 * The accessors use the PF_SECOND policy.
 */
PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
        TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND)
#else
/* Either option missing: constant-false/no-op stubs with the same names. */
PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
        TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
#endif

/*
 * For pages that are never mapped to userspace,
 * page_type may be used.  Because it is initialised to -1, we invert the
 * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
 * __ClearPageFoo *sets* the bit used for PageFoo.  We reserve a few high and
 * low bits so that an underflow or overflow of _mapcount won't be
 * mistaken for a page type value.
 */

/*
 * One bit per page type, plus the PAGE_TYPE_BASE pattern that the
 * PageType()/folio_test_type() tests compare against and the
 * PAGE_MAPCOUNT_RESERVE threshold used by page_type_has_type().
 */
enum pagetype {
        PG_buddy        = 0x00000080,
        PG_offline        = 0x00000100,
        PG_table        = 0x00000200,
        PG_guard        = 0x00000400,
        PG_hugetlb        = 0x00000800,
        PG_slab                = 0x00001000,

        PAGE_TYPE_BASE        = 0xf0000000,
        /* Reserve 0x0000007f to catch underflows of _mapcount */
        PAGE_MAPCOUNT_RESERVE        = -128,
};

/*
 * PageType(page, flag) / folio_test_type(folio, flag): true when, within
 * the bits PAGE_TYPE_BASE | flag of page_type, exactly the PAGE_TYPE_BASE
 * bits are set - i.e. the base pattern is intact and the @flag bit is
 * clear.  (The sense is inverted: a clear type bit means "has this type".)
 * Passing 0 as @flag checks the base pattern only.
 *
 * Both arguments are parenthesized so that expressions such as
 * PageType(&pages[i], f) or PageType(p + 1, f) expand correctly.
 */
#define PageType(page, flag)                                                \
        (((page)->page_type & (PAGE_TYPE_BASE | (flag))) == PAGE_TYPE_BASE)
#define folio_test_type(folio, flag)                                        \
        (((folio)->page.page_type & (PAGE_TYPE_BASE | (flag))) == PAGE_TYPE_BASE)

/*
 * Non-zero when @page_type, reinterpreted as a signed int, is below
 * PAGE_MAPCOUNT_RESERVE.
 */
static inline int page_type_has_type(unsigned int page_type)
{
        int as_signed = (int)page_type;

        return as_signed < PAGE_MAPCOUNT_RESERVE;
}

/* page_type_has_type() applied to page->page_type. */
static inline int page_has_type(const struct page *page)
{
        unsigned int type = page->page_type;

        return page_type_has_type(type);
}

/*
 * Generate the folio page-type accessors for PG_<lname>, named after @fname:
 *  - folio_test_<fname>():    folio_test_type() on PG_<lname>;
 *  - __folio_set_<fname>():   clears the PG_<lname> bit in page_type (the
 *    sense is inverted), after asserting that the folio passes
 *    folio_test_type(folio, 0);
 *  - __folio_clear_<fname>(): sets the bit again, after asserting that the
 *    folio currently tests true for this type.
 * The updates are plain read-modify-write operations on page_type.
 */
#define FOLIO_TYPE_OPS(lname, fname)                                        \
static __always_inline bool folio_test_##fname(const struct folio *folio)\
{                                                                        \
        return folio_test_type(folio, PG_##lname);                        \
}                                                                        \
static __always_inline void __folio_set_##fname(struct folio *folio)        \
{                                                                        \
        VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio);                \
        folio->page.page_type &= ~PG_##lname;                                \
}                                                                        \
static __always_inline void __folio_clear_##fname(struct folio *folio)        \
{                                                                        \
        VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio);                \
        folio->page.page_type |= PG_##lname;                                \
}

/*
 * Generate the folio accessors (via FOLIO_TYPE_OPS) and the struct page
 * accessors for page type PG_<lname>:
 *  - Page<uname>():        PageType() on PG_<lname>;
 *  - __SetPage<uname>():   clears the PG_<lname> bit in page_type (the sense
 *    is inverted), after asserting that the page passes PageType(page, 0);
 *  - __ClearPage<uname>(): sets the bit again, after asserting that
 *    Page<uname>() is currently true.
 */
#define PAGE_TYPE_OPS(uname, lname, fname)                                \
FOLIO_TYPE_OPS(lname, fname)                                                \
static __always_inline int Page##uname(const struct page *page)                \
{                                                                        \
        return PageType(page, PG_##lname);                                \
}                                                                        \
static __always_inline void __SetPage##uname(struct page *page)                \
{                                                                        \
        VM_BUG_ON_PAGE(!PageType(page, 0), page);                        \
        page->page_type &= ~PG_##lname;                                        \
}                                                                        \
static __always_inline void __ClearPage##uname(struct page *page)        \
{                                                                        \
        VM_BUG_ON_PAGE(!Page##uname(page), page);                        \
        page->page_type |= PG_##lname;                                        \
}

/*
 * PageBuddy() indicates that the page is free and in the buddy system
 * (see mm/page_alloc.c).
 *
 * Generates PageBuddy(), __SetPageBuddy(), __ClearPageBuddy() and the
 * folio_test_buddy()/__folio_set_buddy()/__folio_clear_buddy() counterparts.
 */
PAGE_TYPE_OPS(Buddy, buddy, buddy)

/*
 * PageOffline() indicates that the page is logically offline although the
 * containing section is online. (e.g. inflated in a balloon driver or
 * not onlined when onlining the section).
 * The content of these pages is effectively stale. Such pages should not
 * be touched (read/write/dump/save) except by their owner.
 *
 * If a driver wants to allow to offline unmovable PageOffline() pages without
 * putting them back to the buddy, it can do so via the memory notifier by
 * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the
 * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline()
 * pages (now with a reference count of zero) are treated like free pages,
 * allowing the containing memory block to get offlined. A driver that
 * relies on this feature is aware that re-onlining the memory block will
 * require to re-set the pages PageOffline() and not giving them to the
 * buddy via online_page_callback_t.
 *
 * There are drivers that mark a page PageOffline() and expect there won't be
 * any further access to page content. PFN walkers that read content of random
 * pages should check PageOffline() and synchronize with such drivers using
 * page_offline_freeze()/page_offline_thaw().
 */
PAGE_TYPE_OPS(Offline, offline, offline)

/*
 * Defined out of line.  page_offline_freeze()/page_offline_thaw() are the
 * calls PFN walkers use to synchronize with drivers that mark pages
 * PageOffline(); NOTE(review): the role of the begin/end pair is not
 * visible in this header - presumably the driver-side counterpart.
 */
extern void page_offline_freeze(void);
extern void page_offline_thaw(void);
extern void page_offline_begin(void);
extern void page_offline_end(void);

/*
 * Marks pages in use as page tables.
 * Note the folio accessors are named *_pgtable, not *_table.
 */
PAGE_TYPE_OPS(Table, table, pgtable)

/*
 * Marks guardpages used with debug_pagealloc.
 */
PAGE_TYPE_OPS(Guard, guard, guard)

/* Slab type: folio accessors only; PageSlab() is defined separately. */
FOLIO_TYPE_OPS(slab, slab)

/**
 * PageSlab - Determine if the page belongs to the slab allocator
 * @page: The page to test.
 *
 * The test is performed on the folio that contains @page.
 *
 * Context: Any context.
 * Return: True for slab pages, false for any other kind of page.
 */
static inline bool PageSlab(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_slab(folio);
}

/*
 * hugetlb page type: real folio accessors with CONFIG_HUGETLB_PAGE,
 * otherwise only a constant-false folio_test_hugetlb().
 */
#ifdef CONFIG_HUGETLB_PAGE
FOLIO_TYPE_OPS(hugetlb, hugetlb)
#else
FOLIO_TEST_FLAG_FALSE(hugetlb)
#endif

/**
 * PageHuge - Determine if the page belongs to hugetlbfs
 * @page: The page to test.
 *
 * The test is performed on the folio that contains @page.
 *
 * Context: Any context.
 * Return: True for hugetlbfs pages, false for anon pages or pages
 * belonging to other filesystems.
 */
static inline bool PageHuge(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_hugetlb(folio);
}

/*
 * Check if a page is currently marked HWPoisoned. Note that this check is
 * best effort only and inherently racy: there is no way to synchronize with
 * failing hardware.
 */
static inline bool is_page_hwpoison(const struct page *page)
{
        const struct folio *folio;

        if (PageHWPoison(page))
                return true;
        folio = page_folio(page);
        return folio_test_hugetlb(folio) && PageHWPoison(&folio->page);
}

bool is_free_buddy_page(const struct page *page);

PAGEFLAG(Isolated, isolated, PF_ANY);

/*
 * Test PG_anon_exclusive.  @page must be an anonymous page (debug-checked
 * with VM_BUG_ON_PGFLAGS).  For a hugetlb page the bit is read from the
 * compound head; otherwise it is read from @page itself.
 */
static __always_inline int PageAnonExclusive(const struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
        /*
         * HugeTLB stores this information on the head page; THP keeps it per
         * page
         */
        if (PageHuge(page))
                page = compound_head(page);
        return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}

/*
 * Set PG_anon_exclusive on @page with set_bit().  Debug checks: the page
 * must be anonymous and not KSM, and a hugetlb page must be passed as its
 * head page.
 */
static __always_inline void SetPageAnonExclusive(struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page);
        VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
        set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}

/*
 * Clear PG_anon_exclusive on @page with clear_bit().  Debug checks: the
 * page must be anonymous and not KSM, and a hugetlb page must be passed as
 * its head page.
 */
static __always_inline void ClearPageAnonExclusive(struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page);
        VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
        clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}

/*
 * Clear PG_anon_exclusive on @page with __clear_bit().  Debug checks: the
 * page must be anonymous, and a hugetlb page must be passed as its head
 * page.  Unlike ClearPageAnonExclusive(), KSM pages are not rejected here.
 */
static __always_inline void __ClearPageAnonExclusive(struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
        VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
        __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}

/*
 * Bit mask form of PG_mlocked for the flag-mask expressions below; it is 0
 * without CONFIG_MMU so that it contributes no bit to those masks.
 */
#ifdef CONFIG_MMU
#define __PG_MLOCKED                (1UL << PG_mlocked)
#else
#define __PG_MLOCKED                0
#endif

/*
 * Flags checked when a page is freed.  Pages being freed should not have
 * these flags set.  If they are, there is a problem.
 */
#define PAGE_FLAGS_CHECK_AT_FREE                                \
        (1UL << PG_lru                | 1UL << PG_locked        |        \
         1UL << PG_private        | 1UL << PG_private_2        |        \
         1UL << PG_writeback        | 1UL << PG_reserved        |        \
         1UL << PG_active         |                                \
         1UL << PG_unevictable        | __PG_MLOCKED | LRU_GEN_MASK)

/*
 * Flags checked when a page is prepped for return by the page allocator.
 * Pages being prepped should not have these flags set.  If they are set,
 * there has been a kernel bug or struct page corruption.
 *
 * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
 * alloc-free cycle to prevent from reusing the page.
 */
#define PAGE_FLAGS_CHECK_AT_PREP        \
        ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)

/*
 * Flags stored in the second page of a compound page.  They may overlap
 * the CHECK_AT_FREE flags above, so need to be cleared.
 */
#define PAGE_FLAGS_SECOND                                                \
        (0xffUL /* order */                | 1UL << PG_has_hwpoisoned |        \
         1UL << PG_large_rmappable)

#define PAGE_FLAGS_PRIVATE                                \
        (1UL << PG_private | 1UL << PG_private_2)
/**
 * page_has_private - Determine if page has private stuff
 * @page: The page to be checked
 *
 * A page has private stuff when PG_private or PG_private_2 is set in its
 * flags, indicating that release routines should be invoked upon it.
 *
 * Return: 1 if either flag is set, 0 otherwise.
 */
static inline int page_has_private(const struct page *page)
{
        return (page->flags & PAGE_FLAGS_PRIVATE) != 0;
}

/* Folio flavour of page_has_private(), applied to the folio's first page. */
static inline bool folio_has_private(const struct folio *folio)
{
        const struct page *first_page = &folio->page;

        return page_has_private(first_page);
}

#undef PF_ANY
#undef PF_HEAD
#undef PF_NO_TAIL
#undef PF_NO_COMPOUND
#undef PF_SECOND
#endif /* !__GENERATING_BOUNDS_H */

#endif        /* PAGE_FLAGS_H */

















































    2 




    1 

    1 













































































    2 







    1 








    1 















    2 




    1 


    2 


















    2 



































    2 






    2 
    2 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/mount.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include <linux/slab.h>
#include <uapi/linux/mount.h>
#include "common.h"

/*
 * String table for special mount operations.
 *
 * Indexed by the TOMOYO_MOUNT_* constants and sized by
 * TOMOYO_MAX_SPECIAL_MOUNT; each entry is the "--option" style keyword for
 * that operation.
 */
static const char * const tomoyo_mounts[TOMOYO_MAX_SPECIAL_MOUNT] = {
        [TOMOYO_MOUNT_BIND]            = "--bind",
        [TOMOYO_MOUNT_MOVE]            = "--move",
        [TOMOYO_MOUNT_REMOUNT]         = "--remount",
        [TOMOYO_MOUNT_MAKE_UNBINDABLE] = "--make-unbindable",
        [TOMOYO_MOUNT_MAKE_PRIVATE]    = "--make-private",
        [TOMOYO_MOUNT_MAKE_SLAVE]      = "--make-slave",
        [TOMOYO_MOUNT_MAKE_SHARED]     = "--make-shared",
};

/**
 * tomoyo_audit_mount_log - Audit mount log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Formats the request as "file mount <dev> <dir> <type> 0x<flags>" from the
 * mount parameters stored in @r and hands it to tomoyo_supervisor(), whose
 * result is returned unchanged.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_audit_mount_log(struct tomoyo_request_info *r)
{
        return tomoyo_supervisor(r, "file mount %s %s %s 0x%lX\n",
                                 r->param.mount.dev->name,
                                 r->param.mount.dir->name,
                                 r->param.mount.type->name,
                                 r->param.mount.flags);
}

/**
 * tomoyo_check_mount_acl - Check permission for path path path number operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_mount_acl(struct tomoyo_request_info *r,
                                   const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_mount_acl *acl =
                container_of(ptr, typeof(*acl), head);

        return tomoyo_compare_number_union(r->param.mount.flags,
                                           &acl->flags) &&
                tomoyo_compare_name_union(r->param.mount.type,
                                          &acl->fs_type) &&
                tomoyo_compare_name_union(r->param.mount.dir,
                                          &acl->dir_name) &&
                (!r->param.mount.need_dev ||
                 tomoyo_compare_name_union(r->param.mount.dev,
                                           &acl->dev_name));
}

/**
 * tomoyo_mount_acl - Check permission for mount() operation.
 *
 * @r:        Pointer to "struct tomoyo_request_info".
 * @dev_name: Name of device file. Maybe NULL.
 * @dir:      Pointer to "struct path".
 * @type:     Name of filesystem type.
 * @flags:    Mount options.
 *
 * Builds the encoded filesystem type, the mount point name and the device
 * name for this request, stores them in @r->param.mount, and then evaluates
 * the request with tomoyo_check_acl() followed by tomoyo_audit_mount_log(),
 * repeating for as long as the latter returns TOMOYO_RETRY_REQUEST.
 *
 * @type is compared with the tomoyo_mounts[] entries by pointer, so a special
 * operation is recognised only when the caller passed the table entry itself.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_mount_acl(struct tomoyo_request_info *r,
                            const char *dev_name,
                            const struct path *dir, const char *type,
                            unsigned long flags)
{
        /* Zero-initialised so that obj.path1.dentry is NULL until set below. */
        struct tomoyo_obj_info obj = { };
        struct path path;
        struct file_system_type *fstype = NULL;
        /* The three name buffers stay NULL until allocated; see "out". */
        const char *requested_type = NULL;
        const char *requested_dir_name = NULL;
        const char *requested_dev_name = NULL;
        struct tomoyo_path_info rtype;
        struct tomoyo_path_info rdev;
        struct tomoyo_path_info rdir;
        /*
         * 0: dev_name is not looked up as a path,
         * -1: dev_name is a directory (bind/move),
         * 1: dev_name is a block device file.
         */
        int need_dev = 0;
        /* Default result, used if encoding the filesystem type fails. */
        int error = -ENOMEM;

        r->obj = &obj;

        /* Get fstype. */
        requested_type = tomoyo_encode(type);
        if (!requested_type)
                goto out;
        rtype.name = requested_type;
        tomoyo_fill_path_info(&rtype);

        /* Get mount point. */
        obj.path2 = *dir;
        requested_dir_name = tomoyo_realpath_from_path(dir);
        if (!requested_dir_name) {
                error = -ENOMEM;
                goto out;
        }
        rdir.name = requested_dir_name;
        tomoyo_fill_path_info(&rdir);

        /* Compare fs name. */
        if (type == tomoyo_mounts[TOMOYO_MOUNT_REMOUNT]) {
                /* dev_name is ignored. */
        } else if (type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE] ||
                   type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE] ||
                   type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE] ||
                   type == tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED]) {
                /* dev_name is ignored. */
        } else if (type == tomoyo_mounts[TOMOYO_MOUNT_BIND] ||
                   type == tomoyo_mounts[TOMOYO_MOUNT_MOVE]) {
                need_dev = -1; /* dev_name is a directory */
        } else {
                /* Ordinary mount: the type must name a known filesystem. */
                fstype = get_fs_type(type);
                if (!fstype) {
                        error = -ENODEV;
                        goto out;
                }
                if (fstype->fs_flags & FS_REQUIRES_DEV)
                        /* dev_name is a block device file. */
                        need_dev = 1;
        }
        if (need_dev) {
                /* Get mount point or device file. */
                if (!dev_name || kern_path(dev_name, LOOKUP_FOLLOW, &path)) {
                        error = -ENOENT;
                        goto out;
                }
                /* Recorded first so "out" drops the reference on any exit. */
                obj.path1 = path;
                requested_dev_name = tomoyo_realpath_from_path(&path);
                if (!requested_dev_name) {
                        error = -ENOENT;
                        goto out;
                }
        } else {
                /* Map dev_name to "<NULL>" if no dev_name given. */
                if (!dev_name)
                        dev_name = "<NULL>";
                requested_dev_name = tomoyo_encode(dev_name);
                if (!requested_dev_name) {
                        error = -ENOMEM;
                        goto out;
                }
        }
        rdev.name = requested_dev_name;
        tomoyo_fill_path_info(&rdev);
        /* Publish the request parameters for the ACL check and the audit. */
        r->param_type = TOMOYO_TYPE_MOUNT_ACL;
        r->param.mount.need_dev = need_dev;
        r->param.mount.dev = &rdev;
        r->param.mount.dir = &rdir;
        r->param.mount.type = &rtype;
        r->param.mount.flags = flags;
        /* Re-evaluate for as long as the audit step asks for a retry. */
        do {
                tomoyo_check_acl(r, tomoyo_check_mount_acl);
                error = tomoyo_audit_mount_log(r);
        } while (error == TOMOYO_RETRY_REQUEST);
 out:
        /* Names that were never allocated are still NULL at this point. */
        kfree(requested_dev_name);
        kfree(requested_dir_name);
        if (fstype)
                put_filesystem(fstype);
        kfree(requested_type);
        /* Drop refcount obtained by kern_path(). */
        if (obj.path1.dentry)
                path_put(&obj.path1);
        return error;
}

/**
 * tomoyo_mount_permission - Check permission for mount() operation.
 *
 * @dev_name:  Name of device file. Maybe NULL.
 * @path:      Pointer to "struct path".
 * @type:      Name of filesystem type. Maybe NULL.
 * @flags:     Mount options.
 * @data_page: Optional data. Maybe NULL.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_mount_permission(const char *dev_name, const struct path *path,
                            const char *type, unsigned long flags,
                            void *data_page)
{
        struct tomoyo_request_info r;
        int error;
        int idx;

        if (tomoyo_init_request_info(&r, NULL, TOMOYO_MAC_FILE_MOUNT)
            == TOMOYO_CONFIG_DISABLED)
                return 0;
        if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
                flags &= ~MS_MGC_MSK;
        if (flags & MS_REMOUNT) {
                type = tomoyo_mounts[TOMOYO_MOUNT_REMOUNT];
                flags &= ~MS_REMOUNT;
        } else if (flags & MS_BIND) {
                type = tomoyo_mounts[TOMOYO_MOUNT_BIND];
                flags &= ~MS_BIND;
        } else if (flags & MS_SHARED) {
                if (flags & (MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
                        return -EINVAL;
                type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SHARED];
                flags &= ~MS_SHARED;
        } else if (flags & MS_PRIVATE) {
                if (flags & (MS_SHARED | MS_SLAVE | MS_UNBINDABLE))
                        return -EINVAL;
                type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_PRIVATE];
                flags &= ~MS_PRIVATE;
        } else if (flags & MS_SLAVE) {
                if (flags & (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE))
                        return -EINVAL;
                type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_SLAVE];
                flags &= ~MS_SLAVE;
        } else if (flags & MS_UNBINDABLE) {
                if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE))
                        return -EINVAL;
                type = tomoyo_mounts[TOMOYO_MOUNT_MAKE_UNBINDABLE];
                flags &= ~MS_UNBINDABLE;
        } else if (flags & MS_MOVE) {
                type = tomoyo_mounts[TOMOYO_MOUNT_MOVE];
                flags &= ~MS_MOVE;
        }
        if (!type)
                type = "<NULL>";
        idx = tomoyo_read_lock();
        error = tomoyo_mount_acl(&r, dev_name, path, type, flags);
        tomoyo_read_unlock(idx);
        return error;
}






















































    1 



    1 








    1 

    1 




    1 








    1 









    1 















    1 












    1 











































































































































































































    1 





    1 









    1 






























































































































































































    1 














    1 




    1 















































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
// SPDX-License-Identifier: GPL-2.0-only
/*
 * namei.c
 *
 * PURPOSE
 *      Inode name handling routines for the OSTA-UDF(tm) filesystem.
 *
 * COPYRIGHT
 *  (C) 1998-2004 Ben Fennema
 *  (C) 1999-2000 Stelias Computing Inc
 *
 * HISTORY
 *
 *  12/12/98 blf  Created. Split out the lookup code from dir.c
 *  04/19/99 blf  link, mknod, symlink support
 */

#include "udfdecl.h"

#include "udf_i.h"
#include "udf_sb.h"
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/crc-itu-t.h>
#include <linux/exportfs.h>
#include <linux/iversion.h>

/*
 * Compare two length-counted names.
 *
 * Returns 1 when both names have the same length and identical bytes,
 * 0 otherwise.
 */
static inline int udf_match(int len1, const unsigned char *name1, int len2,
                            const unsigned char *name2)
{
        /* Only names of equal length are compared byte by byte. */
        if (len1 == len2)
                return memcmp(name1, name2, len1) == 0;

        return 0;
}

/**
 * udf_fiiter_find_entry - find entry in given directory.
 *
 * @dir:        directory inode to search in
 * @child:        qstr of the name
 * @iter:        iter to use for searching
 *
 * Walks the entries of @dir looking for the name @child. Deleted and hidden
 * entries are skipped unless the UDF_FLAG_UNDELETE / UDF_FLAG_UNHIDE mount
 * flags are set. The name ".." matches the entry carrying the parent flag.
 *
 * On success @iter is left positioned at the matching entry and has not been
 * released; on any error @iter has already been released here.
 *
 * Returns 0 on success, < 0 on error (including -ENOENT).
 */
static int udf_fiiter_find_entry(struct inode *dir, const struct qstr *child,
                                 struct udf_fileident_iter *iter)
{
        struct super_block *sb = dir->i_sb;
        unsigned char *namebuf = NULL;
        int want_parent;
        int namelen;
        int ret;

        want_parent = child->len == 2 &&
                child->name[0] == '.' && child->name[1] == '.';

        /* Scratch buffer for the translated name of each entry. */
        namebuf = kmalloc(UDF_NAME_LEN, GFP_KERNEL);
        if (!namebuf)
                return -ENOMEM;

        for (ret = udf_fiiter_init(iter, dir, 0);
             !ret && iter->pos < dir->i_size;
             ret = udf_fiiter_advance(iter)) {
                if ((iter->fi.fileCharacteristics & FID_FILE_CHAR_DELETED) &&
                    !UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE))
                        continue;

                if ((iter->fi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) &&
                    !UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
                        continue;

                /* ".." is identified by the parent flag, not by its name. */
                if (want_parent &&
                    (iter->fi.fileCharacteristics & FID_FILE_CHAR_PARENT))
                        goto found;

                /* Entries without a name cannot match anything else. */
                if (!iter->fi.lengthFileIdent)
                        continue;

                namelen = udf_get_filename(sb, iter->name,
                                iter->fi.lengthFileIdent, namebuf,
                                UDF_NAME_LEN);
                if (namelen < 0) {
                        ret = namelen;
                        goto release;
                }

                if (udf_match(namelen, namebuf, child->len, child->name))
                        goto found;
        }
        /* The walk finished without an error and without a match. */
        if (!ret)
                ret = -ENOENT;

release:
        udf_fiiter_release(iter);
found:
        kfree(namebuf);

        return ret;
}

/*
 * Look up the name in @dentry within directory @dir.
 *
 * A name that is not found is not an error: the dentry is spliced with a
 * NULL inode. Any other lookup failure is returned as an ERR_PTR.
 */
static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
                                 unsigned int flags)
{
        struct udf_fileident_iter iter;
        struct kernel_lb_addr loc;
        int err;

        if (dentry->d_name.len > UDF_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);

        err = udf_fiiter_find_entry(dir, &dentry->d_name, &iter);
        if (err < 0 && err != -ENOENT)
                return ERR_PTR(err);

        /* No such entry: nothing to read, the iterator is already released. */
        if (err != 0)
                return d_splice_alias(NULL, dentry);

        /* Copy the location out of the entry before letting go of it. */
        loc = lelb_to_cpu(iter.fi.icb.extLocation);
        udf_fiiter_release(&iter);

        return d_splice_alias(udf_iget(dir->i_sb, &loc), dentry);
}

/*
 * Move the entries of a directory out of the inode's inline data area
 * (iinfo->i_data) into a newly allocated block.
 *
 * @inode: directory to convert
 * @block: receives the number returned by udf_new_block() for the new block;
 *         not written when the directory is empty
 *
 * An empty directory only has its allocation type switched. Otherwise the
 * inline data is copied to the new block, an extent covering it is added to
 * the inode, and the tag location of every moved entry is rewritten to the
 * new block number.
 *
 * Returns 0 on success, negative errno otherwise.
 */
static int udf_expand_dir_adinicb(struct inode *inode, udf_pblk_t *block)
{
        udf_pblk_t newblock;
        struct buffer_head *dbh = NULL;
        struct kernel_lb_addr eloc;
        struct extent_position epos;
        uint8_t alloctype;
        struct udf_inode_info *iinfo = UDF_I(inode);
        struct udf_fileident_iter iter;
        uint8_t *impuse;
        int ret;

        /* The target descriptor format is selected by a mount flag. */
        if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
                alloctype = ICBTAG_FLAG_AD_SHORT;
        else
                alloctype = ICBTAG_FLAG_AD_LONG;

        /* Nothing to move: just record the new allocation type. */
        if (!inode->i_size) {
                iinfo->i_alloc_type = alloctype;
                mark_inode_dirty(inode);
                return 0;
        }

        /* alloc block, and copy data to it */
        *block = udf_new_block(inode->i_sb, inode,
                               iinfo->i_location.partitionReferenceNum,
                               iinfo->i_location.logicalBlockNum, &ret);
        if (!(*block))
                return ret;
        /*
         * NOTE(review): the two error returns below leave without calling
         * udf_free_blocks() for the block obtained above, unlike the
         * udf_add_aext() failure path further down - confirm whether that is
         * intended.
         */
        newblock = udf_get_pblock(inode->i_sb, *block,
                                  iinfo->i_location.partitionReferenceNum,
                                0);
        if (newblock == 0xffffffff)
                return -EFSCORRUPTED;
        dbh = sb_getblk(inode->i_sb, newblock);
        if (!dbh)
                return -ENOMEM;
        /* Fill the new block: directory data first, zeroes up to block end. */
        lock_buffer(dbh);
        memcpy(dbh->b_data, iinfo->i_data, inode->i_size);
        memset(dbh->b_data + inode->i_size, 0,
               inode->i_sb->s_blocksize - inode->i_size);
        set_buffer_uptodate(dbh);
        unlock_buffer(dbh);

        /* Drop inline data, add block instead */
        iinfo->i_alloc_type = alloctype;
        memset(iinfo->i_data + iinfo->i_lenEAttr, 0, iinfo->i_lenAlloc);
        iinfo->i_lenAlloc = 0;
        eloc.logicalBlockNum = *block;
        eloc.partitionReferenceNum =
                                iinfo->i_location.partitionReferenceNum;
        iinfo->i_lenExtents = inode->i_size;
        epos.bh = NULL;
        epos.block = iinfo->i_location;
        epos.offset = udf_file_entry_alloc_offset(inode);
        ret = udf_add_aext(inode, &epos, &eloc, inode->i_size, 0);
        brelse(epos.bh);
        if (ret < 0) {
                /* Could not record the extent: give the new block back. */
                brelse(dbh);
                udf_free_blocks(inode->i_sb, inode, &eloc, 0, 1);
                return ret;
        }
        mark_inode_dirty(inode);

        /* Now fixup tags in moved directory entries */
        for (ret = udf_fiiter_init(&iter, inode, 0);
             !ret && iter.pos < inode->i_size;
             ret = udf_fiiter_advance(&iter)) {
                iter.fi.descTag.tagLocation = cpu_to_le32(*block);
                /*
                 * When the entry has an implementation-use area, pass a
                 * pointer to the bytes that follow the fixed-size descriptor
                 * in the copied block; otherwise pass NULL.
                 */
                if (iter.fi.lengthOfImpUse != cpu_to_le16(0))
                        impuse = dbh->b_data + iter.pos +
                                                sizeof(struct fileIdentDesc);
                else
                        impuse = NULL;
                udf_fiiter_write_fi(&iter, impuse);
        }
        brelse(dbh);
        /*
         * We don't expect the iteration to fail as the directory has been
         * already verified to be correct
         */
        WARN_ON_ONCE(ret);
        udf_fiiter_release(&iter);

        /* An iteration error is only warned about, not returned. */
        return 0;
}

/*
 * Prepare a new directory entry in @dir and leave @iter positioned on it.
 *
 * @dir:    directory receiving the entry
 * @dentry: supplies the name of the entry; may be NULL, in which case the
 *          entry gets a zero-length name
 * @iter:   iterator that is initialised here and describes the new entry on
 *          success
 *
 * A deleted entry whose length equals the length needed is reused when one
 * exists. Otherwise the entry is placed at the end of the directory, which
 * may first convert an in-ICB directory to block storage or append a block.
 *
 * On success only the in-memory iter->fi and the name buffer are filled in;
 * this function does not call udf_fiiter_write_fi() or udf_fiiter_release(),
 * both are left to the caller.
 *
 * Returns 0 on success, negative errno otherwise.
 */
static int udf_fiiter_add_entry(struct inode *dir, struct dentry *dentry,
                                struct udf_fileident_iter *iter)
{
        struct udf_inode_info *dinfo = UDF_I(dir);
        int nfidlen, namelen = 0;
        int ret;
        int off, blksize = 1 << dir->i_blkbits;
        udf_pblk_t block;
        char name[UDF_NAME_LEN_CS0];

        if (dentry) {
                /* Convert the name to its on-disk form. */
                namelen = udf_put_filename(dir->i_sb, dentry->d_name.name,
                                           dentry->d_name.len,
                                           name, UDF_NAME_LEN_CS0);
                /* A zero length from the conversion is treated as "too long". */
                if (!namelen)
                        return -ENAMETOOLONG;
        }
        /* Length of the new entry: fixed part plus name, padded. */
        nfidlen = ALIGN(sizeof(struct fileIdentDesc) + namelen, UDF_NAME_PAD);

        /* First try to reuse a deleted entry of exactly the needed length. */
        for (ret = udf_fiiter_init(iter, dir, 0);
             !ret && iter->pos < dir->i_size;
             ret = udf_fiiter_advance(iter)) {
                if (iter->fi.fileCharacteristics & FID_FILE_CHAR_DELETED) {
                        if (udf_dir_entry_len(&iter->fi) == nfidlen) {
                                iter->fi.descTag.tagSerialNum = cpu_to_le16(1);
                                iter->fi.fileVersionNum = cpu_to_le16(1);
                                iter->fi.fileCharacteristics = 0;
                                iter->fi.lengthFileIdent = namelen;
                                iter->fi.lengthOfImpUse = cpu_to_le16(0);
                                memcpy(iter->namebuf, name, namelen);
                                iter->name = iter->namebuf;
                                return 0;
                        }
                }
        }
        if (ret) {
                udf_fiiter_release(iter);
                return ret;
        }
        /*
         * In-ICB directory without enough room left for the new entry:
         * move it to a block and restart the iterator at the end.
         */
        if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB &&
            blksize - udf_ext0_offset(dir) - iter->pos < nfidlen) {
                udf_fiiter_release(iter);
                ret = udf_expand_dir_adinicb(dir, &block);
                if (ret)
                        return ret;
                ret = udf_fiiter_init(iter, dir, dir->i_size);
                if (ret < 0)
                        return ret;
        }

        /* Get blocknumber to use for entry tag */
        if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                block = dinfo->i_location.logicalBlockNum;
        } else {
                /* Block holding the last byte of the current extent. */
                block = iter->eloc.logicalBlockNum +
                                ((iter->elen - 1) >> dir->i_blkbits);
        }
        /* Offset within the block; an exact block boundary counts as full. */
        off = iter->pos & (blksize - 1);
        if (!off)
                off = blksize;
        /* Entry fits into current block? */
        if (blksize - udf_ext0_offset(dir) - off >= nfidlen)
                goto store_fi;

        ret = udf_fiiter_append_blk(iter);
        if (ret) {
                udf_fiiter_release(iter);
                return ret;
        }

        /* Entry will be completely in the new block? Update tag location... */
        if (!(iter->pos & (blksize - 1)))
                block = iter->eloc.logicalBlockNum +
                                ((iter->elen - 1) >> dir->i_blkbits);
store_fi:
        /* Start from a clean descriptor and stamp its tag. */
        memset(&iter->fi, 0, sizeof(struct fileIdentDesc));
        /* The third udf_new_tag() argument depends on the UDF revision. */
        if (UDF_SB(dir->i_sb)->s_udfrev >= 0x0200)
                udf_new_tag((char *)(&iter->fi), TAG_IDENT_FID, 3, 1, block,
                            sizeof(struct tag));
        else
                udf_new_tag((char *)(&iter->fi), TAG_IDENT_FID, 2, 1, block,
                            sizeof(struct tag));
        iter->fi.fileVersionNum = cpu_to_le16(1);
        iter->fi.lengthFileIdent = namelen;
        iter->fi.lengthOfImpUse = cpu_to_le16(0);
        memcpy(iter->namebuf, name, namelen);
        iter->name = iter->namebuf;

        /* Account for the new entry in the directory size. */
        dir->i_size += nfidlen;
        if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                dinfo->i_lenAlloc += nfidlen;
        } else {
                /* Truncate last extent to proper size */
                udf_fiiter_update_elen(iter, iter->elen -
                                        (dinfo->i_lenExtents - dir->i_size));
        }
        mark_inode_dirty(dir);

        return 0;
}

/*
 * Mark the entry at @iter as deleted and write it back.
 *
 * With the UDF_FLAG_STRICT mount flag set, the entry's ICB field is zeroed
 * as well.
 */
static void udf_fiiter_delete_entry(struct udf_fileident_iter *iter)
{
        struct super_block *sb = iter->dir->i_sb;

        iter->fi.fileCharacteristics |= FID_FILE_CHAR_DELETED;
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
                memset(&iter->fi.icb, 0x00, sizeof(struct long_ad));

        udf_fiiter_write_fi(iter, NULL);
}

/*
 * Adjust the directory or file counter of the logical volume integrity
 * descriptor by @val.
 *
 * @dir selects numDirs (true) or numFiles (false). Does nothing when
 * udf_sb_lvidiu() returns NULL. The update runs under s_alloc_mutex.
 */
static void udf_add_fid_counter(struct super_block *sb, bool dir, int val)
{
        struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);

        if (lvidiu) {
                mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
                le32_add_cpu(dir ? &lvidiu->numDirs : &lvidiu->numFiles, val);
                udf_updated_lvid(sb);
                mutex_unlock(&UDF_SB(sb)->s_alloc_mutex);
        }
}

/*
 * Link the new non-directory @inode into the parent of @dentry.
 *
 * If no directory entry can be added, the inode's link count is dropped, the
 * inode is discarded and the error is returned. Otherwise the entry is made
 * to point at the inode, the parent's timestamps and the file counter are
 * updated, and the dentry is instantiated.
 *
 * Returns 0 on success, negative errno otherwise.
 */
static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
{
        struct inode *dir = d_inode(dentry->d_parent);
        struct udf_inode_info *iinfo = UDF_I(inode);
        struct udf_fileident_iter iter;
        struct allocDescImpUse *adiu;
        int err;

        err = udf_fiiter_add_entry(dir, dentry, &iter);
        if (err) {
                inode_dec_link_count(inode);
                discard_new_inode(inode);
                return err;
        }

        /* Point the new entry at the inode and store the low 32 id bits. */
        iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
        iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
        adiu = (struct allocDescImpUse *)iter.fi.icb.impUse;
        *(__le32 *)adiu->impUse =
                cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
        udf_fiiter_write_fi(&iter, NULL);

        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        mark_inode_dirty(dir);
        udf_fiiter_release(&iter);
        udf_add_fid_counter(dir->i_sb, false, 1);
        d_instantiate_new(dentry, inode);

        return 0;
}

/*
 * Create a regular file named by @dentry in @dir.
 *
 * Returns 0 on success, negative errno otherwise.
 */
static int udf_create(struct mnt_idmap *idmap, struct inode *dir,
                      struct dentry *dentry, umode_t mode, bool excl)
{
        struct inode *inode;

        inode = udf_new_inode(dir, mode);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        /* Install the regular-file operation tables. */
        inode->i_fop = &udf_file_operations;
        inode->i_op = &udf_file_inode_operations;
        inode->i_data.a_ops = &udf_aops;
        mark_inode_dirty(inode);

        return udf_add_nondir(dentry, inode);
}

/*
 * Create an unnamed regular file for @file in @dir.
 *
 * No directory entry is added; the inode is attached with d_tmpfile().
 *
 * Returns the result of finish_open_simple(), or a negative errno if the
 * inode could not be allocated.
 */
static int udf_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
                       struct file *file, umode_t mode)
{
        struct inode *inode;

        inode = udf_new_inode(dir, mode);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        /* Install the regular-file operation tables. */
        inode->i_fop = &udf_file_operations;
        inode->i_op = &udf_file_inode_operations;
        inode->i_data.a_ops = &udf_aops;
        mark_inode_dirty(inode);

        d_tmpfile(file, inode);
        unlock_new_inode(inode);

        return finish_open_simple(file, 0);
}

/*
 * Create a special file named by @dentry in @dir.
 *
 * Device numbers rejected by old_valid_dev() yield -EINVAL before any inode
 * is allocated.
 *
 * Returns 0 on success, negative errno otherwise.
 */
static int udf_mknod(struct mnt_idmap *idmap, struct inode *dir,
                     struct dentry *dentry, umode_t mode, dev_t rdev)
{
        struct inode *node;

        if (!old_valid_dev(rdev))
                return -EINVAL;

        node = udf_new_inode(dir, mode);
        if (IS_ERR(node))
                return PTR_ERR(node);
        init_special_inode(node, mode, rdev);

        return udf_add_nondir(dentry, node);
}

/*
 * Create a directory named by @dentry in @dir.
 *
 * Two entries are written: first a nameless entry inside the new directory
 * that carries the parent flag and points back at @dir, then the named entry
 * in @dir that points at the new directory.
 *
 * Returns 0 on success, negative errno otherwise. On failure of either
 * step the new inode's link count is cleared and the inode is discarded.
 */
static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                     struct dentry *dentry, umode_t mode)
{
        struct inode *inode;
        struct udf_fileident_iter iter;
        int err;
        struct udf_inode_info *dinfo = UDF_I(dir);
        struct udf_inode_info *iinfo;

        inode = udf_new_inode(dir, S_IFDIR | mode);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        iinfo = UDF_I(inode);
        inode->i_op = &udf_dir_inode_operations;
        inode->i_fop = &udf_dir_operations;
        /* Entry inside the new directory itself (NULL dentry: no name). */
        err = udf_fiiter_add_entry(inode, NULL, &iter);
        if (err) {
                clear_nlink(inode);
                discard_new_inode(inode);
                return err;
        }
        set_nlink(inode, 2);
        /* This entry refers to the parent directory's location and id. */
        iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
        iter.fi.icb.extLocation = cpu_to_lelb(dinfo->i_location);
        *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse =
                cpu_to_le32(dinfo->i_unique & 0x00000000FFFFFFFFUL);
        iter.fi.fileCharacteristics =
                        FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
        udf_fiiter_write_fi(&iter, NULL);
        udf_fiiter_release(&iter);
        mark_inode_dirty(inode);

        /* Named entry in the parent, referring to the new directory. */
        err = udf_fiiter_add_entry(dir, dentry, &iter);
        if (err) {
                clear_nlink(inode);
                discard_new_inode(inode);
                return err;
        }
        iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
        iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
        *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse =
                cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
        iter.fi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY;
        udf_fiiter_write_fi(&iter, NULL);
        udf_fiiter_release(&iter);
        /* Bookkeeping: directory counter, parent link count and times. */
        udf_add_fid_counter(dir->i_sb, true, 1);
        inc_nlink(dir);
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        mark_inode_dirty(dir);
        d_instantiate_new(dentry, inode);

        return 0;
}

/*
 * Check whether directory @dir contains no live named entries.
 *
 * Returns 0 as soon as an entry is found that has a name and is not marked
 * deleted, 1 otherwise. An iteration error also ends the walk with 1.
 */
static int empty_dir(struct inode *dir)
{
        struct udf_fileident_iter iter;
        int is_empty = 1;
        int ret;

        for (ret = udf_fiiter_init(&iter, dir, 0);
             !ret && iter.pos < dir->i_size;
             ret = udf_fiiter_advance(&iter)) {
                /* Nameless and deleted entries do not count. */
                if (!iter.fi.lengthFileIdent)
                        continue;
                if (iter.fi.fileCharacteristics & FID_FILE_CHAR_DELETED)
                        continue;

                is_empty = 0;
                break;
        }
        udf_fiiter_release(&iter);

        return is_empty;
}

/*
 * Remove the directory named by @dentry from @dir.
 *
 * Returns 0 on success, the lookup error if the entry cannot be found,
 * -EFSCORRUPTED if the entry does not refer to the dentry's inode, and
 * -ENOTEMPTY if the directory still has entries.
 */
static int udf_rmdir(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_inode(dentry);
        struct udf_fileident_iter iter;
        struct kernel_lb_addr tloc;
        int ret;

        ret = udf_fiiter_find_entry(dir, &dentry->d_name, &iter);
        if (ret)
                return ret;

        /* The on-disk entry must point at the inode we were asked to remove. */
        tloc = lelb_to_cpu(iter.fi.icb.extLocation);
        if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != victim->i_ino) {
                ret = -EFSCORRUPTED;
                goto release;
        }
        if (!empty_dir(victim)) {
                ret = -ENOTEMPTY;
                goto release;
        }

        udf_fiiter_delete_entry(&iter);
        if (victim->i_nlink != 2)
                udf_warn(victim->i_sb, "empty directory has nlink != 2 (%u)\n",
                         victim->i_nlink);
        clear_nlink(victim);
        victim->i_size = 0;
        inode_dec_link_count(dir);
        udf_add_fid_counter(dir->i_sb, true, -1);
        /* One timestamp is used for the victim's ctime and dir's ctime/mtime. */
        inode_set_mtime_to_ts(dir,
                              inode_set_ctime_to_ts(dir, inode_set_ctime_current(victim)));
        mark_inode_dirty(dir);
        /* ret is still 0 from the successful lookup. */
release:
        udf_fiiter_release(&iter);
        return ret;
}

/*
 * Remove the non-directory entry named by @dentry from @dir.
 *
 * Returns 0 on success, the lookup error if the entry cannot be found, and
 * -EFSCORRUPTED if the entry does not refer to the dentry's inode.
 */
static int udf_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_inode(dentry);
        struct udf_fileident_iter iter;
        struct kernel_lb_addr tloc;
        int ret;

        ret = udf_fiiter_find_entry(dir, &dentry->d_name, &iter);
        if (ret)
                return ret;

        /* The on-disk entry must point at the inode we were asked to remove. */
        tloc = lelb_to_cpu(iter.fi.icb.extLocation);
        if (udf_get_lb_pblock(dir->i_sb, &tloc, 0) != victim->i_ino) {
                ret = -EFSCORRUPTED;
                goto release;
        }

        /* Give a zero link count one link so the decrement below is valid. */
        if (!victim->i_nlink) {
                udf_debug("Deleting nonexistent file (%lu), %u\n",
                          victim->i_ino, victim->i_nlink);
                set_nlink(victim, 1);
        }
        udf_fiiter_delete_entry(&iter);
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        mark_inode_dirty(dir);
        inode_dec_link_count(victim);
        udf_add_fid_counter(dir->i_sb, false, -1);
        inode_set_ctime_to_ts(victim, inode_get_ctime(dir));
        /* ret is still 0 from the successful lookup. */
release:
        udf_fiiter_release(&iter);
        return ret;
}

/*
 * Create a symbolic link @dentry in @dir whose target is @symname.
 *
 * The target path is split on '/' and stored as a sequence of UDF
 * pathComponent records, either inside the ICB (ICBTAG_FLAG_AD_IN_ICB) or
 * in one freshly allocated block.  Component types written here:
 *   1 - emitted once for a leading '/'
 *   3 - ".."
 *   4 - "."
 *   5 - a named component (identifier produced by udf_put_filename())
 *
 * Returns 0 on success or a negative errno: -ENOMEM, the error from
 * udf_new_inode()/udf_new_block()/udf_add_aext(), or -ENAMETOOLONG when the
 * encoded path does not fit or a component cannot be encoded.
 */
static int udf_symlink(struct mnt_idmap *idmap, struct inode *dir,
                       struct dentry *dentry, const char *symname)
{
        struct inode *inode;
        struct pathComponent *pc;
        const char *compstart;
        struct extent_position epos = {};
        int eoffset, elen = 0;
        uint8_t *ea;
        int err;
        udf_pblk_t block;
        unsigned char *name = NULL;
        int namelen;
        struct udf_inode_info *iinfo;
        struct super_block *sb = dir->i_sb;

        /* Scratch buffer for one encoded component identifier. */
        name = kmalloc(UDF_NAME_LEN_CS0, GFP_KERNEL);
        if (!name) {
                err = -ENOMEM;
                goto out;
        }

        inode = udf_new_inode(dir, S_IFLNK | 0777);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto out;
        }

        iinfo = UDF_I(inode);
        down_write(&iinfo->i_data_sem);
        inode->i_data.a_ops = &udf_symlink_aops;
        inode->i_op = &udf_symlink_inode_operations;
        inode_nohighmem(inode);

        if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
                struct kernel_lb_addr eloc;
                uint32_t bsize;

                /* Data lives outside the ICB: allocate one block for it. */
                block = udf_new_block(sb, inode,
                                iinfo->i_location.partitionReferenceNum,
                                iinfo->i_location.logicalBlockNum, &err);
                if (!block)
                        goto out_no_entry;
                epos.block = iinfo->i_location;
                epos.offset = udf_file_entry_alloc_offset(inode);
                epos.bh = NULL;
                eloc.logicalBlockNum = block;
                eloc.partitionReferenceNum =
                                iinfo->i_location.partitionReferenceNum;
                bsize = sb->s_blocksize;
                iinfo->i_lenExtents = bsize;
                err = udf_add_aext(inode, &epos, &eloc, bsize, 0);
                brelse(epos.bh);
                /*
                 * The reference is gone; clear the pointer so the error
                 * path below does not release it a second time.
                 */
                epos.bh = NULL;
                if (err < 0) {
                        udf_free_blocks(sb, inode, &eloc, 0, 1);
                        goto out_no_entry;
                }

                block = udf_get_pblock(sb, block,
                                iinfo->i_location.partitionReferenceNum,
                                0);
                epos.bh = sb_getblk(sb, block);
                if (unlikely(!epos.bh)) {
                        err = -ENOMEM;
                        udf_free_blocks(sb, inode, &eloc, 0, 1);
                        goto out_no_entry;
                }
                lock_buffer(epos.bh);
                memset(epos.bh->b_data, 0x00, bsize);
                set_buffer_uptodate(epos.bh);
                unlock_buffer(epos.bh);
                mark_buffer_dirty_inode(epos.bh, inode);
                ea = epos.bh->b_data + udf_ext0_offset(inode);
        } else
                ea = iinfo->i_data + iinfo->i_lenEAttr;

        /* Bytes available for path components. */
        eoffset = sb->s_blocksize - udf_ext0_offset(inode);
        pc = (struct pathComponent *)ea;

        if (*symname == '/') {
                /* Collapse any run of leading slashes into one component. */
                do {
                        symname++;
                } while (*symname == '/');

                pc->componentType = 1;
                pc->lengthComponentIdent = 0;
                pc->componentFileVersionNum = 0;
                elen += sizeof(struct pathComponent);
        }

        /* Every failure inside the loop means the target does not fit. */
        err = -ENAMETOOLONG;

        while (*symname) {
                if (elen + sizeof(struct pathComponent) > eoffset)
                        goto out_no_entry;

                pc = (struct pathComponent *)(ea + elen);

                compstart = symname;

                /* Advance to the end of this component. */
                do {
                        symname++;
                } while (*symname && *symname != '/');

                pc->componentType = 5;
                pc->lengthComponentIdent = 0;
                pc->componentFileVersionNum = 0;
                if (compstart[0] == '.') {
                        if ((symname - compstart) == 1)
                                pc->componentType = 4;
                        else if ((symname - compstart) == 2 &&
                                        compstart[1] == '.')
                                pc->componentType = 3;
                }

                if (pc->componentType == 5) {
                        namelen = udf_put_filename(sb, compstart,
                                                   symname - compstart,
                                                   name, UDF_NAME_LEN_CS0);
                        if (!namelen)
                                goto out_no_entry;

                        if (elen + sizeof(struct pathComponent) + namelen >
                                        eoffset)
                                goto out_no_entry;
                        else
                                pc->lengthComponentIdent = namelen;

                        memcpy(pc->componentIdent, name, namelen);
                }

                elen += sizeof(struct pathComponent) + pc->lengthComponentIdent;

                /* Skip the separator(s) before the next component. */
                if (*symname) {
                        do {
                                symname++;
                        } while (*symname == '/');
                }
        }

        brelse(epos.bh);
        inode->i_size = elen;
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
                iinfo->i_lenAlloc = inode->i_size;
        else
                udf_truncate_tail_extent(inode);
        mark_inode_dirty(inode);
        up_write(&iinfo->i_data_sem);

        err = udf_add_nondir(dentry, inode);
out:
        kfree(name);
        return err;

out_no_entry:
        /*
         * Drop the data block reference taken by sb_getblk() when we bail
         * out after it was obtained; epos.bh is NULL on every other path.
         */
        brelse(epos.bh);
        up_write(&iinfo->i_data_sem);
        inode_dec_link_count(inode);
        discard_new_inode(inode);
        goto out;
}

static int udf_link(struct dentry *old_dentry, struct inode *dir,
                    struct dentry *dentry)
{
        struct inode *inode = d_inode(old_dentry);
        struct udf_fileident_iter iter;
        int err;

        err = udf_fiiter_add_entry(dir, dentry, &iter);
        if (err)
                return err;
        iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
        iter.fi.icb.extLocation = cpu_to_lelb(UDF_I(inode)->i_location);
        if (UDF_SB(inode->i_sb)->s_lvid_bh) {
                *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse =
                        cpu_to_le32(lvid_get_unique_id(inode->i_sb));
        }
        udf_fiiter_write_fi(&iter, NULL);
        udf_fiiter_release(&iter);

        inc_nlink(inode);
        udf_add_fid_counter(dir->i_sb, false, 1);
        inode_set_ctime_current(inode);
        mark_inode_dirty(inode);
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        mark_inode_dirty(dir);
        ihold(inode);
        d_instantiate(dentry, inode);

        return 0;
}

/*
 * Rename @old_dentry in @old_dir to @new_dentry in @new_dir.
 *
 * Anybody can rename anything with this: the permission checks are left to the
 * higher-level routines.
 *
 * Only RENAME_NOREPLACE is accepted in @flags; any other bit yields -EINVAL.
 *
 * Returns 0 once the new entry has been written, or a negative errno from
 * the lookups / entry allocation (-ENOENT, -ENOTEMPTY, -EFSCORRUPTED, ...).
 * Note that a failure to re-find the old entry after the new one was written
 * is only logged; the function still returns 0 in that case.
 */
static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
                      struct dentry *old_dentry, struct inode *new_dir,
                      struct dentry *new_dentry, unsigned int flags)
{
        struct inode *old_inode = d_inode(old_dentry);
        struct inode *new_inode = d_inode(new_dentry);
        /* oiter: old entry, niter: new entry, diriter: ".." of a moved dir */
        struct udf_fileident_iter oiter, niter, diriter;
        bool has_diriter = false, is_dir = false;
        int retval;
        struct kernel_lb_addr tloc;

        if (flags & ~RENAME_NOREPLACE)
                return -EINVAL;

        retval = udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter);
        if (retval)
                return retval;

        /* The on-disk old entry must reference the inode being renamed. */
        tloc = lelb_to_cpu(oiter.fi.icb.extLocation);
        if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) {
                retval = -ENOENT;
                goto out_oiter;
        }

        if (S_ISDIR(old_inode->i_mode)) {
                /* A directory may only replace an empty directory. */
                if (new_inode) {
                        retval = -ENOTEMPTY;
                        if (!empty_dir(new_inode))
                                goto out_oiter;
                }
                is_dir = true;
        }
        if (is_dir && old_dir != new_dir) {
                /*
                 * Moving a directory to a different parent: locate its ".."
                 * entry now so it can be repointed at new_dir further down.
                 */
                retval = udf_fiiter_find_entry(old_inode, &dotdot_name,
                                               &diriter);
                if (retval == -ENOENT) {
                        udf_err(old_inode->i_sb,
                                "directory (ino %lu) has no '..' entry\n",
                                old_inode->i_ino);
                        retval = -EFSCORRUPTED;
                }
                if (retval)
                        goto out_oiter;
                has_diriter = true;
                /* ".." must currently point at old_dir. */
                tloc = lelb_to_cpu(diriter.fi.icb.extLocation);
                if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) !=
                                old_dir->i_ino) {
                        retval = -EFSCORRUPTED;
                        udf_err(old_inode->i_sb,
                                "directory (ino %lu) has parent entry pointing to another inode (%lu != %u)\n",
                                old_inode->i_ino, old_dir->i_ino,
                                udf_get_lb_pblock(old_inode->i_sb, &tloc, 0));
                        goto out_oiter;
                }
        }

        retval = udf_fiiter_find_entry(new_dir, &new_dentry->d_name, &niter);
        if (retval && retval != -ENOENT)
                goto out_oiter;
        /* Entry found but not passed by VFS? */
        if (!retval && !new_inode) {
                retval = -EFSCORRUPTED;
                udf_fiiter_release(&niter);
                goto out_oiter;
        }
        /* Entry not found? Need to add one... */
        if (retval) {
                udf_fiiter_release(&niter);
                retval = udf_fiiter_add_entry(new_dir, new_dentry, &niter);
                if (retval)
                        goto out_oiter;
        }

        /*
         * Like most other Unix systems, set the ctime for inodes on a
         * rename.
         */
        inode_set_ctime_current(old_inode);
        mark_inode_dirty(old_inode);

        /*
         * ok, that's it
         *
         * Copy the identifying fields of the old entry into the new one and
         * write it out.
         */
        niter.fi.fileVersionNum = oiter.fi.fileVersionNum;
        niter.fi.fileCharacteristics = oiter.fi.fileCharacteristics;
        memcpy(&(niter.fi.icb), &(oiter.fi.icb), sizeof(oiter.fi.icb));
        udf_fiiter_write_fi(&niter, NULL);
        udf_fiiter_release(&niter);

        /*
         * The old entry may have moved due to new entry allocation. Find it
         * again.
         */
        udf_fiiter_release(&oiter);
        retval = udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter);
        if (retval) {
                /* Logged only: the rename is still reported as successful. */
                udf_err(old_dir->i_sb,
                        "failed to find renamed entry again in directory (ino %lu)\n",
                        old_dir->i_ino);
        } else {
                udf_fiiter_delete_entry(&oiter);
                udf_fiiter_release(&oiter);
        }

        /* The replaced target loses the name it had in new_dir. */
        if (new_inode) {
                inode_set_ctime_current(new_inode);
                inode_dec_link_count(new_inode);
                udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode),
                                    -1);
        }
        inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
        inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir));
        mark_inode_dirty(old_dir);
        mark_inode_dirty(new_dir);

        /* Repoint the moved directory's ".." entry at its new parent. */
        if (has_diriter) {
                diriter.fi.icb.extLocation =
                                        cpu_to_lelb(UDF_I(new_dir)->i_location);
                udf_update_tag((char *)&diriter.fi,
                               udf_dir_entry_len(&diriter.fi));
                udf_fiiter_write_fi(&diriter, NULL);
                udf_fiiter_release(&diriter);
        }

        if (is_dir) {
                /* old_dir no longer holds this subdirectory. */
                inode_dec_link_count(old_dir);
                /*
                 * A replaced directory gets a second link-count decrement
                 * (in addition to the one above); otherwise new_dir gains
                 * a link for its new subdirectory.
                 */
                if (new_inode)
                        inode_dec_link_count(new_inode);
                else {
                        inc_nlink(new_dir);
                        mark_inode_dirty(new_dir);
                }
        }
        return 0;
out_oiter:
        /* Error exit: release whichever iterators are still held. */
        if (has_diriter)
                udf_fiiter_release(&diriter);
        udf_fiiter_release(&oiter);

        return retval;
}

static struct dentry *udf_get_parent(struct dentry *child)
{
        struct kernel_lb_addr tloc;
        struct udf_fileident_iter iter;
        int err;

        err = udf_fiiter_find_entry(d_inode(child), &dotdot_name, &iter);
        if (err)
                return ERR_PTR(err);

        tloc = lelb_to_cpu(iter.fi.icb.extLocation);
        udf_fiiter_release(&iter);
        return d_obtain_alias(udf_iget(child->d_sb, &tloc));
}


/*
 * Turn a (block, partref, generation) triple taken from a file handle into
 * a dentry.
 *
 * Returns ERR_PTR(-ESTALE) for block 0 or when a non-zero @generation does
 * not match the inode's, the udf_iget() error if the inode cannot be read,
 * and otherwise whatever d_obtain_alias() returns.
 */
static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
                                        u16 partref, __u32 generation)
{
        struct kernel_lb_addr loc = {
                .logicalBlockNum = block,
                .partitionReferenceNum = partref,
        };
        struct inode *inode;

        if (!block)
                return ERR_PTR(-ESTALE);

        inode = udf_iget(sb, &loc);
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        /* A zero generation in the handle disables the check. */
        if (!generation || inode->i_generation == generation)
                return d_obtain_alias(inode);

        iput(inode);
        return ERR_PTR(-ESTALE);
}

/*
 * Decode the object part of a UDF file handle.  Handles shorter than three
 * words, or of a type other than the two UDF ones, yield NULL.
 */
static struct dentry *udf_fh_to_dentry(struct super_block *sb,
                                       struct fid *fid, int fh_len, int fh_type)
{
        if (fh_len < 3)
                return NULL;

        switch (fh_type) {
        case FILEID_UDF_WITH_PARENT:
        case FILEID_UDF_WITHOUT_PARENT:
                break;
        default:
                return NULL;
        }

        return udf_nfs_get_inode(sb, fid->udf.block, fid->udf.partref,
                                 fid->udf.generation);
}

/*
 * Decode the parent part of a UDF file handle.  Only handles of type
 * FILEID_UDF_WITH_PARENT that are at least five words long carry one;
 * anything else yields NULL.
 */
static struct dentry *udf_fh_to_parent(struct super_block *sb,
                                       struct fid *fid, int fh_len, int fh_type)
{
        if (fh_type != FILEID_UDF_WITH_PARENT || fh_len < 5)
                return NULL;

        return udf_nfs_get_inode(sb, fid->udf.parent_block,
                                 fid->udf.parent_partref,
                                 fid->udf.parent_generation);
}
/*
 * Encode a file handle for @inode into @fh.
 *
 * @lenp holds the space available in @fh (in 32-bit words) on entry and the
 * number of words used (or required) on return.  Without @parent the handle
 * takes 3 words and has type FILEID_UDF_WITHOUT_PARENT; with @parent it
 * takes 5 words and has type FILEID_UDF_WITH_PARENT.
 *
 * Returns the handle type, or FILEID_INVALID when @fh is too small (in which
 * case *@lenp is set to the size that would be needed).
 */
static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
                         struct inode *parent)
{
        int len = *lenp;
        struct kernel_lb_addr location = UDF_I(inode)->i_location;
        struct fid *fid = (struct fid *)fh;
        int type = FILEID_UDF_WITHOUT_PARENT;

        if (parent && (len < 5)) {
                *lenp = 5;
                return FILEID_INVALID;
        } else if (len < 3) {
                *lenp = 3;
                return FILEID_INVALID;
        }

        *lenp = 3;
        fid->udf.block = location.logicalBlockNum;
        fid->udf.partref = location.partitionReferenceNum;
        fid->udf.parent_partref = 0;
        fid->udf.generation = inode->i_generation;

        if (parent) {
                location = UDF_I(parent)->i_location;
                fid->udf.parent_block = location.logicalBlockNum;
                fid->udf.parent_partref = location.partitionReferenceNum;
                /*
                 * udf_fh_to_parent() passes this value to
                 * udf_nfs_get_inode(), which compares it against the
                 * generation of the *parent* inode, so it has to be the
                 * parent's generation rather than the child's.
                 */
                fid->udf.parent_generation = parent->i_generation;
                *lenp = 5;
                type = FILEID_UDF_WITH_PARENT;
        }

        return type;
}

/*
 * Export operations table: wires the file-handle encode/decode helpers and
 * the parent lookup defined above into struct export_operations.
 */
const struct export_operations udf_export_ops = {
        .encode_fh        = udf_encode_fh,
        .fh_to_dentry   = udf_fh_to_dentry,
        .fh_to_parent   = udf_fh_to_parent,
        .get_parent     = udf_get_parent,
};

/*
 * Inode operations table for UDF directories: name lookup plus the entry
 * creation, removal, linking and renaming handlers of this file.
 */
const struct inode_operations udf_dir_inode_operations = {
        .lookup                                = udf_lookup,
        .create                                = udf_create,
        .link                                = udf_link,
        .unlink                                = udf_unlink,
        .symlink                        = udf_symlink,
        .mkdir                                = udf_mkdir,
        .rmdir                                = udf_rmdir,
        .mknod                                = udf_mknod,
        .rename                                = udf_rename,
        .tmpfile                        = udf_tmpfile,
};












































































































































    7 






    2 




    2 


    2 










    2 

















    2 



    2 


















    2 





















    2 












    2 



















    2 
    1 
    2 




























    1 






    2 











    1 





    1 









    1 





    1 





























































































































































    2 








    2 













































    2 































































































































































































    1 





    1 



    1 

    1 




























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002        Andrew Morton
 *                Initial version.
 */

/**
 * DOC: Readahead Overview
 *
 * Readahead is used to read content into the page cache before it is
 * explicitly requested by the application.  Readahead only ever
 * attempts to read folios that are not yet in the page cache.  If a
 * folio is present but not up-to-date, readahead will not try to read
 * it. In that case a simple ->read_folio() will be requested.
 *
 * Readahead is triggered when an application read request (whether a
 * system call or a page fault) finds that the requested folio is not in
 * the page cache, or that it is in the page cache and has the
 * readahead flag set.  This flag indicates that the folio was read
 * as part of a previous readahead request and now that it has been
 * accessed, it is time for the next readahead.
 *
 * Each readahead request is partly a synchronous read, and partly async
 * readahead.  This is reflected in the struct file_ra_state which
 * contains ->size being the total number of pages, and ->async_size
 * which is the number of pages in the async section.  The readahead
 * flag will be set on the first folio in this async section to trigger
 * a subsequent readahead.  Once a series of sequential reads has been
 * established, there should be no need for a synchronous component and
 * all readahead requests will be fully asynchronous.
 *
 * When either of the triggers causes a readahead, three numbers need
 * to be determined: the start of the region to read, the size of the
 * region, and the size of the async tail.
 *
 * The start of the region is simply the first page address at or after
 * the accessed address, which is not currently populated in the page
 * cache.  This is found with a simple search in the page cache.
 *
 * The size of the async tail is determined by subtracting the size that
 * was explicitly requested from the determined request size, unless
 * this would be less than zero - then zero is used.  NOTE THIS
 * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
 * PAGE.  ALSO THIS CALCULATION IS NOT USED CONSISTENTLY.
 *
 * The size of the region is normally determined from the size of the
 * previous readahead which loaded the preceding pages.  This may be
 * discovered from the struct file_ra_state for simple sequential reads,
 * or from examining the state of the page cache when multiple
 * sequential reads are interleaved.  Specifically: where the readahead
 * was triggered by the readahead flag, the size of the previous
 * readahead is assumed to be the number of pages from the triggering
 * page to the start of the new readahead.  In these cases, the size of
 * the previous readahead is scaled, often doubled, for the new
 * readahead, though see get_next_ra_size() for details.
 *
 * If the size of the previous read cannot be determined, the number of
 * preceding pages in the page cache is used to estimate the size of
 * a previous read.  This estimate could easily be misled by random
 * reads being coincidentally adjacent, so it is ignored unless it is
 * larger than the current request, and it is not scaled up, unless it
 * is at the start of file.
 *
 * In general readahead is accelerated at the start of the file, as
 * reads from there are often sequential.  There are other minor
 * adjustments to the readahead size in various special cases and these
 * are best discovered by reading the code.
 *
 * The above calculation, based on the previous readahead size,
 * determines the size of the readahead, to which any requested read
 * size may be added.
 *
 * Readahead requests are sent to the filesystem using the ->readahead()
 * address space operation, for which mpage_readahead() is a canonical
 * implementation.  ->readahead() should normally initiate reads on all
 * folios, but may fail to read any or all folios without causing an I/O
 * error.  The page cache reading code will issue a ->read_folio() request
 * for any folio which ->readahead() did not read, and only an error
 * from this will be final.
 *
 * ->readahead() will generally call readahead_folio() repeatedly to get
 * each folio from those prepared for readahead.  It may fail to read a
 * folio by:
 *
 * * not calling readahead_folio() sufficiently many times, effectively
 *   ignoring some folios, as might be appropriate if the path to
 *   storage is congested.
 *
 * * failing to actually submit a read request for a given folio,
 *   possibly due to insufficient resources, or
 *
 * * getting an error during subsequent processing of a request.
 *
 * In the last two cases, the folio should be unlocked by the filesystem
 * to indicate that the read attempt has failed.  In the first case the
 * folio will be unlocked by the VFS.
 *
 * Those folios not in the final ``async_size`` of the request should be
 * considered to be important and ->readahead() should not fail them due
 * to congestion or temporary resource unavailability, but should wait
 * for necessary resources (e.g.  memory or indexing information) to
 * become available.  Folios in the final ``async_size`` may be
 * considered less urgent and failure to read them is more acceptable.
 * In this case it is best to use filemap_remove_folio() to remove the
 * folios from the page cache as is automatically done for folios that
 * were not fetched with readahead_folio().  This will allow a
 * subsequent synchronous readahead request to try them again.  If they
 * are left in the page cache, then they will be read individually using
 * ->read_folio() which may be less efficient.
 */

#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagemap.h>
#include <linux/psi.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>

#include "internal.h"

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
        ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
        ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

static void read_pages(struct readahead_control *rac)
{
        const struct address_space_operations *aops = rac->mapping->a_ops;
        struct folio *folio;
        struct blk_plug plug;

        if (!readahead_count(rac))
                return;

        if (unlikely(rac->_workingset))
                psi_memstall_enter(&rac->_pflags);
        blk_start_plug(&plug);

        if (aops->readahead) {
                aops->readahead(rac);
                /*
                 * Clean up the remaining folios.  The sizes in ->ra
                 * may be used to size the next readahead, so make sure
                 * they accurately reflect what happened.
                 */
                while ((folio = readahead_folio(rac)) != NULL) {
                        unsigned long nr = folio_nr_pages(folio);

                        folio_get(folio);
                        rac->ra->size -= nr;
                        if (rac->ra->async_size >= nr) {
                                rac->ra->async_size -= nr;
                                filemap_remove_folio(folio);
                        }
                        folio_unlock(folio);
                        folio_put(folio);
                }
        } else {
                while ((folio = readahead_folio(rac)) != NULL)
                        aops->read_folio(rac->file, folio);
        }

        blk_finish_plug(&plug);
        if (unlikely(rac->_workingset))
                psi_memstall_leave(&rac->_pflags);
        rac->_workingset = false;

        BUG_ON(readahead_count(rac));
}

/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * The folio added at offset @nr_to_read - @lookahead_size from the start
 * index gets the readahead flag set.  Allocation failure (or -ENOMEM while
 * inserting into the page cache) ends the preallocation loop early; whatever
 * was queued up to that point is still submitted.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        struct address_space *mapping = ractl->mapping;
        unsigned long index = readahead_index(ractl);
        gfp_t gfp_mask = readahead_gfp_mask(mapping);
        unsigned long i;

        /*
         * Partway through the readahead operation, we will have added
         * locked pages to the page cache, but will not yet have submitted
         * them for I/O.  Adding another page may need to allocate memory,
         * which can trigger memory reclaim.  Telling the VM we're in
         * the middle of a filesystem operation will cause it to not
         * touch file-backed pages, preventing a deadlock.  Most (all?)
         * filesystems already specify __GFP_NOFS in their mapping's
         * gfp_mask, but let's be explicit here.
         */
        unsigned int nofs = memalloc_nofs_save();

        filemap_invalidate_lock_shared(mapping);
        /*
         * Preallocate as many pages as we will need.
         */
        for (i = 0; i < nr_to_read; i++) {
                struct folio *folio = xa_load(&mapping->i_pages, index + i);
                int ret;

                if (folio && !xa_is_value(folio)) {
                        /*
                         * Page already present?  Kick off the current batch
                         * of contiguous pages before continuing with the
                         * next batch.  This page may be the one we would
                         * have intended to mark as Readahead, but we don't
                         * have a stable reference to this page, and it's
                         * not worth getting one just for that.
                         */
                        read_pages(ractl);
                        /* Step over the page that was already cached. */
                        ractl->_index++;
                        /*
                         * Resync the loop counter with the control block:
                         * after the loop's i++, index + i equals
                         * ractl->_index + ractl->_nr_pages.
                         */
                        i = ractl->_index + ractl->_nr_pages - index - 1;
                        continue;
                }

                /* Order-0 folio for this slot. */
                folio = filemap_alloc_folio(gfp_mask, 0);
                if (!folio)
                        break;

                ret = filemap_add_folio(mapping, folio, index + i, gfp_mask);
                if (ret < 0) {
                        folio_put(folio);
                        if (ret == -ENOMEM)
                                break;
                        /*
                         * Any other insertion failure is handled like the
                         * "already present" case above: flush the batch
                         * and skip this index.
                         */
                        read_pages(ractl);
                        ractl->_index++;
                        i = ractl->_index + ractl->_nr_pages - index - 1;
                        continue;
                }
                /* Mark the folio that should trigger the next readahead. */
                if (i == nr_to_read - lookahead_size)
                        folio_set_readahead(folio);
                ractl->_workingset |= folio_test_workingset(folio);
                ractl->_nr_pages++;
        }

        /*
         * Now start the IO.  We ignore I/O errors - if the folio is not
         * uptodate then the caller will launch read_folio again, and
         * will then handle the error.
         */
        read_pages(ractl);
        filemap_invalidate_unlock_shared(mapping);
        memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);

/*
 * do_page_cache_ra() is the i_size-bounded front end of
 * page_cache_ra_unbounded(): the request is clipped so that it never goes
 * past the page holding the last byte of the file, and is dropped entirely
 * for an empty file or a start index beyond EOF.
 *
 * Pages are allocated first and only then submitted for I/O.  This avoids
 * the very bad behaviour which would occur if page allocations are causing
 * VM writeback; we really don't want to intermingle reads and writes like
 * that.
 */
static void do_page_cache_ra(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        unsigned long start = readahead_index(ractl);
        loff_t size = i_size_read(ractl->mapping->host);
        pgoff_t last;        /* index of the page containing the final byte */

        if (!size)
                return;

        last = (size - 1) >> PAGE_SHIFT;
        if (start > last)
                return;

        /* Clip the request at the last page of the file. */
        if (last - start < nr_to_read)
                nr_to_read = last - start + 1;

        page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}

/*
 * Read ahead up to @nr_to_read pages starting at the request's current
 * index, bypassing the on-demand heuristics.
 *
 * The request is capped at the larger of bdi->io_pages and ra->ra_pages and
 * is then issued in chunks of at most 2 megabytes, so that we don't pin too
 * much memory at once.
 */
void force_page_cache_ra(struct readahead_control *ractl,
                unsigned long nr_to_read)
{
        struct address_space *mapping = ractl->mapping;
        struct file_ra_state *ra = ractl->ra;
        struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        const unsigned long chunk_pages = (2 * 1024 * 1024) / PAGE_SIZE;
        unsigned long cap, remaining, batch, pos;

        /* Without either read method there is no way to fill the pages */
        if (unlikely(!(mapping->a_ops->read_folio ||
                       mapping->a_ops->readahead)))
                return;

        /*
         * If the request exceeds the readahead window, allow the read to
         * be up to the optimal hardware IO size
         */
        pos = readahead_index(ractl);
        cap = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
        remaining = min_t(unsigned long, nr_to_read, cap);

        for (; remaining; pos += batch, remaining -= batch) {
                batch = min_t(unsigned long, chunk_pages, remaining);
                ractl->_index = pos;
                do_page_cache_ra(ractl, batch, 0);
        }
}

/*
 * Compute the initial readahead window, in pages, for a request of @size
 * pages with an upper bound of @max pages.
 *
 * @size is rounded up to the next power of two and then scaled:
 *   - x 4 when the rounded size is at most max / 32,
 *   - x 2 when the rounded size is at most max / 4,
 *   - otherwise the window is simply @max.
 *
 * For a 128k (32 page) max ra this gives:
 * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
        unsigned long rounded = roundup_pow_of_two(size);

        if (rounded > max / 4)
                return max;
        if (rounded > max / 32)
                return rounded * 2;
        return rounded * 4;
}

/*
 * Ramp up the previous window size stored in @ra->size and return the
 * result as the new window size: quadruple it while it is below max / 16,
 * double it while it is at most max / 2, and otherwise return @max.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
                                      unsigned long max)
{
        unsigned long prev = ra->size;

        if (prev > max / 2)
                return max;

        return prev * (prev < max / 16 ? 4 : 2);
}

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator. The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial size
 * based on I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

/*
 * Count contiguously cached pages from @index-1 to @index-@max.
 *
 * The count is derived from the position of the previous page cache miss
 * and serves as a conservative estimation of
 *         - length of the sequential read sequence, or
 *         - thrashing threshold in memory tight systems
 */
static pgoff_t count_history_pages(struct address_space *mapping,
                                   pgoff_t index, unsigned long max)
{
        const pgoff_t newest = index - 1;
        pgoff_t miss;

        rcu_read_lock();
        miss = page_cache_prev_miss(mapping, newest, max);
        rcu_read_unlock();

        return newest - miss;
}

/*
 * page cache context based readahead
 *
 * Look at how many pages immediately before @index are already cached and,
 * if that history is longer than the request itself, set up @ra for a new
 * window starting at @index.  Returns 1 when @ra was set up, 0 when the
 * read should be treated as random and @ra is left untouched.
 */
static int try_context_readahead(struct address_space *mapping,
                                 struct file_ra_state *ra,
                                 pgoff_t index,
                                 unsigned long req_size,
                                 unsigned long max)
{
        pgoff_t history = count_history_pages(mapping, index, max);

        if (history > req_size) {
                /*
                 * The history reaches back to the beginning of the file:
                 * a strong indication of a long-run stream (or a
                 * whole-file read), so weigh it twice.
                 */
                if (history >= index)
                        history *= 2;

                ra->start = index;
                ra->size = min(history + req_size, max);
                ra->async_size = 1;

                return 1;
        }

        /* Not enough history pages: it could be a random read */
        return 0;
}

/*
 * Allocate one folio of the given @order and insert it into the page cache
 * of @ractl->mapping at @index.  The folio gets the readahead flag when
 * @index equals @mark rounded down to the folio size.
 *
 * On success the folio's pages are added to @ractl->_nr_pages, its
 * workingset state is folded into @ractl->_workingset and 0 is returned.
 * Returns -ENOMEM if the allocation fails, or the error from
 * filemap_add_folio() after dropping the folio.
 */
static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
                pgoff_t mark, unsigned int order, gfp_t gfp)
{
        const unsigned long nr_pages = 1UL << order;
        struct folio *new_folio;
        int ret;

        new_folio = filemap_alloc_folio(gfp, order);
        if (!new_folio)
                return -ENOMEM;

        /* Flag the folio before it becomes visible in the page cache */
        if (round_down(mark, nr_pages) == index)
                folio_set_readahead(new_folio);

        ret = filemap_add_folio(ractl->mapping, new_folio, index, gfp);
        if (ret) {
                folio_put(new_folio);
                return ret;
        }

        ractl->_nr_pages += nr_pages;
        ractl->_workingset |= folio_test_workingset(new_folio);
        return 0;
}

/*
 * Issue the readahead window described by @ra using folios of up to a
 * chosen order, falling back to do_page_cache_ra() when that is not
 * possible.
 *
 * @ractl:     readahead request; its index is the start of the window.
 * @ra:        readahead state supplying the window size and async_size.
 * @new_order: starting folio order; ondemand_readahead() passes the order
 *             of the folio that triggered readahead, or 0 if there was none.
 */
void page_cache_ra_order(struct readahead_control *ractl,
                struct file_ra_state *ra, unsigned int new_order)
{
        struct address_space *mapping = ractl->mapping;
        pgoff_t index = readahead_index(ractl);
        /* Index of the page containing the last byte of the file */
        pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
        /* Index at which ra_alloc_folio() should set the readahead flag */
        pgoff_t mark = index + ra->size - ra->async_size;
        unsigned int nofs;
        int err = 0;
        gfp_t gfp = readahead_gfp_mask(mapping);

        /* No large folio support, or a window smaller than four pages */
        if (!mapping_large_folio_support(mapping) || ra->size < 4)
                goto fallback;

        /* Never go beyond the end of the window or the end of the file */
        limit = min(limit, index + ra->size - 1);

        /*
         * Raise the order by two, but cap it at MAX_PAGECACHE_ORDER and at
         * the largest order that still fits in the window.
         */
        if (new_order < MAX_PAGECACHE_ORDER) {
                new_order += 2;
                new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
                new_order = min_t(unsigned int, new_order, ilog2(ra->size));
        }

        /* See comment in page_cache_ra_unbounded() */
        nofs = memalloc_nofs_save();
        filemap_invalidate_lock_shared(mapping);
        while (index <= limit) {
                unsigned int order = new_order;

                /* Align with smaller pages if needed */
                if (index & ((1UL << order) - 1))
                        order = __ffs(index);
                /* Don't allocate pages past EOF */
                while (index + (1UL << order) - 1 > limit)
                        order--;
                err = ra_alloc_folio(ractl, index, mark, order, gfp);
                if (err)
                        break;
                index += 1UL << order;
        }

        /*
         * If the loop advanced beyond @limit, grow the window by the
         * number of pages past it.
         */
        if (index > limit) {
                ra->size += index - limit - 1;
                ra->async_size += index - limit - 1;
        }

        /* Submit whatever was allocated, even if the loop stopped early */
        read_pages(ractl);
        filemap_invalidate_unlock_shared(mapping);
        memalloc_nofs_restore(nofs);

        /*
         * If there were already pages in the page cache, then we may have
         * left some gaps.  Let the regular readahead code take care of this
         * situation.
         */
        if (!err)
                return;
fallback:
        do_page_cache_ra(ractl, ra->size, ra->async_size);
}

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 *
 * @ractl:    readahead request; @ractl->ra holds the per-file window state
 *            that is updated here, and @ractl->_index is set to the start
 *            of the chosen window before it is issued.
 * @folio:    folio passed in by page_cache_async_ra(), or NULL when called
 *            from page_cache_sync_ra().
 * @req_size: size of the request; it is compared against ra_pages, so it
 *            is presumably counted in pages.
 *
 * Picks a window (ra->start, ra->size, ra->async_size) and hands it to
 * page_cache_ra_order().  A standalone small random read is instead issued
 * directly through do_page_cache_ra() without modifying the window state.
 */
static void ondemand_readahead(struct readahead_control *ractl,
                struct folio *folio, unsigned long req_size)
{
        struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
        struct file_ra_state *ra = ractl->ra;
        unsigned long max_pages = ra->ra_pages;
        unsigned long add_pages;
        pgoff_t index = readahead_index(ractl);
        pgoff_t expected, prev_index;
        /* Order of the triggering folio, or 0 when there is none */
        unsigned int order = folio ? folio_order(folio) : 0;

        /*
         * If the request exceeds the readahead window, allow the read to
         * be up to the optimal hardware IO size
         */
        if (req_size > max_pages && bdi->io_pages > max_pages)
                max_pages = min(req_size, bdi->io_pages);

        /*
         * start of file
         */
        if (!index)
                goto initial_readahead;

        /*
         * It's the expected callback index, assume sequential access.
         * Ramp up sizes, and push forward the readahead window.
         */
        expected = round_down(ra->start + ra->size - ra->async_size,
                        1UL << order);
        if (index == expected || index == (ra->start + ra->size)) {
                ra->start += ra->size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * Hit a marked folio without valid readahead state.
         * E.g. interleaved reads.
         * Query the pagecache for async_size, which normally equals to
         * readahead size. Ramp it up and use it as the new readahead size.
         */
        if (folio) {
                pgoff_t start;

                rcu_read_lock();
                start = page_cache_next_miss(ractl->mapping, index + 1,
                                max_pages);
                rcu_read_unlock();

                /*
                 * Give up when the next miss is reported as 0 or lies more
                 * than max_pages ahead of the current index.
                 * NOTE(review): presumably 0 means the search wrapped;
                 * confirm against page_cache_next_miss().
                 */
                if (!start || start - index > max_pages)
                        return;

                ra->start = start;
                ra->size = start - index;        /* old async_size */
                ra->size += req_size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * oversize read
         */
        if (req_size > max_pages)
                goto initial_readahead;

        /*
         * sequential cache miss
         * trivial case: (index - prev_index) == 1
         * unaligned reads: (index - prev_index) == 0
         */
        prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
        if (index - prev_index <= 1UL)
                goto initial_readahead;

        /*
         * Query the page cache and look for the traces(cached history pages)
         * that a sequential stream would leave behind.
         */
        if (try_context_readahead(ractl->mapping, ra, index, req_size,
                        max_pages))
                goto readit;

        /*
         * standalone, small random read
         * Read as is, and do not pollute the readahead state.
         */
        do_page_cache_ra(ractl, req_size, 0);
        return;

initial_readahead:
        /* Start a fresh window at the current index */
        ra->start = index;
        ra->size = get_init_ra_size(req_size, max_pages);
        ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
        /*
         * Will this read hit the readahead marker made by itself?
         * If so, trigger the readahead marker hit now, and merge
         * the resulting next readahead window into the current one.
         * Take care of maximum IO pages as above.
         */
        if (index == ra->start && ra->size == ra->async_size) {
                add_pages = get_next_ra_size(ra, max_pages);
                if (ra->size + add_pages <= max_pages) {
                        ra->async_size = add_pages;
                        ra->size += add_pages;
                } else {
                        ra->size = max_pages;
                        ra->async_size = max_pages >> 1;
                }
        }

        /* Issue the window from its start, not from the faulting index */
        ractl->_index = ra->start;
        page_cache_ra_order(ractl, ra, order);
}

void page_cache_sync_ra(struct readahead_control *ractl,
                unsigned long req_count)
{
        bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);

        /*
         * Even if readahead is disabled, issue this request as readahead
         * as we'll need it to satisfy the requested range. The forced
         * readahead will do the right thing and limit the read to just the
         * requested range, which we'll set to 1 page for this case.
         */
        if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
                if (!ractl->file)
                        return;
                req_count = 1;
                do_forced_ra = true;
        }

        /* be dumb */
        if (do_forced_ra) {
                force_page_cache_ra(ractl, req_count);
                return;
        }

        ondemand_readahead(ractl, NULL, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);

/*
 * Readahead entry point used with a specific @folio: clear the folio's
 * readahead flag and run the on-demand algorithm for @req_count pages.
 *
 * Does nothing when readahead is disabled or the folio is under writeback.
 * When the block cgroup is congested the flag is still cleared, but no
 * readahead is started.
 */
void page_cache_async_ra(struct readahead_control *ractl,
                struct folio *folio, unsigned long req_count)
{
        /*
         * No readahead configured, or the folio is under writeback: the
         * same bit is used for PG_readahead and PG_reclaim.
         */
        if (!ractl->ra->ra_pages || folio_test_writeback(folio))
                return;

        folio_clear_readahead(folio);

        if (!blk_cgroup_congested())
                ondemand_readahead(ractl, folio, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);

/*
 * Backend of the readahead() system call: validate @fd and ask
 * vfs_fadvise() to apply POSIX_FADV_WILLNEED to @count bytes at @offset.
 *
 * Returns -EBADF when @fd has no file or is not open for reading, -EINVAL
 * when the file has no mapping or address space operations or is neither a
 * regular file nor a block device, and otherwise the result of
 * vfs_fadvise().
 */
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
        struct fd f = fdget(fd);
        ssize_t ret = -EBADF;

        if (f.file && (f.file->f_mode & FMODE_READ)) {
                struct address_space *mapping = f.file->f_mapping;

                /*
                 * The readahead() syscall is intended to run only on files
                 * that can execute readahead. If readahead is not possible
                 * on this file, then we must return -EINVAL.
                 */
                ret = -EINVAL;
                if (mapping && mapping->a_ops) {
                        umode_t mode = file_inode(f.file)->i_mode;

                        if (S_ISREG(mode) || S_ISBLK(mode))
                                ret = vfs_fadvise(f.file, offset, count,
                                                  POSIX_FADV_WILLNEED);
                }
        }

        fdput(f);
        return ret;
}

/* readahead() system call entry point: forwards to ksys_readahead(). */
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
        return ksys_readahead(fd, offset, count);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD)
/*
 * Compat entry point for readahead(): @offset is declared through
 * compat_arg_u64_dual() and reassembled with compat_arg_u64_glue() before
 * being forwarded to ksys_readahead().
 */
COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count)
{
        return ksys_readahead(fd, compat_arg_u64_glue(offset), count);
}
#endif

/**
 * readahead_expand - Expand a readahead request
 * @ractl: The request to be expanded
 * @new_start: The revised start
 * @new_len: The revised size of the request
 *
 * Attempt to expand a readahead request outwards from the current size to the
 * specified size by inserting locked pages before and after the current window
 * to increase the size to the new window.  This may involve the insertion of
 * THPs, in which case the window may get expanded even beyond what was
 * requested.
 *
 * The algorithm will stop if it encounters a conflicting page already in the
 * pagecache and leave a smaller expansion than requested.
 *
 * The caller must check for this by examining the revised @ractl object for a
 * different expansion than was requested.
 */
void readahead_expand(struct readahead_control *ractl,
                      loff_t new_start, size_t new_len)
{
        struct address_space *mapping = ractl->mapping;
        struct file_ra_state *ra = ractl->ra;
        pgoff_t new_index, new_nr_pages;
        gfp_t gfp_mask = readahead_gfp_mask(mapping);

        /* Page index corresponding to the requested new start offset */
        new_index = new_start / PAGE_SIZE;

        /* Expand the leading edge downwards */
        while (ractl->_index > new_index) {
                unsigned long index = ractl->_index - 1;
                struct folio *folio = xa_load(&mapping->i_pages, index);

                /* A real (non-value) entry is already cached here: stop */
                if (folio && !xa_is_value(folio))
                        return; /* Folio apparently present */

                folio = filemap_alloc_folio(gfp_mask, 0);
                if (!folio)
                        return;
                if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
                        folio_put(folio);
                        return;
                }
                /*
                 * The first workingset folio seen by this request flags it
                 * and enters the PSI memstall state.
                 */
                if (unlikely(folio_test_workingset(folio)) &&
                                !ractl->_workingset) {
                        ractl->_workingset = true;
                        psi_memstall_enter(&ractl->_pflags);
                }
                /* The request now starts at the newly inserted folio */
                ractl->_nr_pages++;
                ractl->_index = folio->index;
        }

        /*
         * Recompute the target length relative to the (possibly moved)
         * start of the request, then convert it to a page count.
         */
        new_len += new_start - readahead_pos(ractl);
        new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);

        /* Expand the trailing edge upwards */
        while (ractl->_nr_pages < new_nr_pages) {
                unsigned long index = ractl->_index + ractl->_nr_pages;
                struct folio *folio = xa_load(&mapping->i_pages, index);

                /* A real (non-value) entry is already cached here: stop */
                if (folio && !xa_is_value(folio))
                        return; /* Folio apparently present */

                folio = filemap_alloc_folio(gfp_mask, 0);
                if (!folio)
                        return;
                if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
                        folio_put(folio);
                        return;
                }
                /*
                 * The first workingset folio seen by this request flags it
                 * and enters the PSI memstall state.
                 */
                if (unlikely(folio_test_workingset(folio)) &&
                                !ractl->_workingset) {
                        ractl->_workingset = true;
                        psi_memstall_enter(&ractl->_pflags);
                }
                ractl->_nr_pages++;
                /* Keep the per-file window in step with the larger request */
                if (ra) {
                        ra->size++;
                        ra->async_size++;
                }
        }
}
EXPORT_SYMBOL(readahead_expand);






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *   Copyright (C) International Business Machines Corp., 2000-2004
 *   Portions Copyright (C) Christoph Hellwig, 2001-2002
 */

/*
 *        jfs_logmgr.c: log manager
 *
 * for related information, see transaction manager (jfs_txnmgr.c), and
 * recovery manager (jfs_logredo.c).
 *
 * note: for detail, RTFS.
 *
 *        log buffer manager:
 * special purpose buffer manager supporting log i/o requirements.
 * per log serial pageout of logpage
 * queuing i/o requests and redrive i/o at iodone
 * maintain current logpage buffer
 * no caching since append only
 * appropriate jfs buffer cache buffers as needed
 *
 *        group commit:
 * transactions which wrote COMMIT records in the same in-memory
 * log page during the pageout of previous/current log page(s) are
 * committed together by the pageout of the page.
 *
 *        TBD lazy commit:
 * transactions are committed asynchronously when the log page
 * containing their COMMIT records is paged out once it becomes full;
 *
 *        serialization:
 * . a per log lock serialize log write.
 * . a per log lock serialize group commit.
 * . a per log lock serialize log open/close;
 *
 *        TBD log integrity:
 * careful-write (ping-pong) of last logpage to recover from crash
 * in overwrite.
 * detection of split (out-of-order) write of physical sectors
 * of last logpage via timestamp at end of each sector
 * (with its mirror data array at trailer).
 *
 *        alternatives:
 * lsn - 64-bit monotonically increasing integer vs
 * 32-bit lspn and page eor.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/buffer_head.h>                /* for sync_blockdev() */
#include <linux/bio.h>
#include <linux/freezer.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
#include "jfs_superblock.h"
#include "jfs_txnmgr.h"
#include "jfs_debug.h"


/*
 * Head of the list of lbufs that are ready to be redriven.
 * All accesses must be made with log_redrive_lock held.
 * NOTE(review): the original note attributes this list to the jfsIO
 * thread; that consumer is not visible in this part of the file.
 */
static struct lbuf *log_redrive_list;
static DEFINE_SPINLOCK(log_redrive_lock);


/*
 *        log read/write serialization (per log)
 *
 * Thin wrappers around the per-log mutex log->loglock.  Being a mutex,
 * the lock may sleep; lmLog() holds it for the whole record write.
 */
#define LOG_LOCK_INIT(log)        mutex_init(&(log)->loglock)
#define LOG_LOCK(log)                mutex_lock(&((log)->loglock))
#define LOG_UNLOCK(log)                mutex_unlock(&((log)->loglock))


/*
 *        log group commit serialization (per log)
 *
 * Wrappers around the per-log spinlock log->gclock, which protects the
 * commit queue and the group commit state of the tblocks on it.
 * LOGGC_LOCK/LOGGC_UNLOCK use the _irq variants, i.e. they disable and
 * re-enable interrupts unconditionally; lmPostGC(), which is called at
 * interrupt time, takes the same lock with spin_lock_irqsave() instead.
 * LOGGC_WAKEUP wakes every waiter sleeping on the tblock's gcwait queue.
 */

#define LOGGC_LOCK_INIT(log)        spin_lock_init(&(log)->gclock)
#define LOGGC_LOCK(log)                spin_lock_irq(&(log)->gclock)
#define LOGGC_UNLOCK(log)        spin_unlock_irq(&(log)->gclock)
#define LOGGC_WAKEUP(tblk)        wake_up_all(&(tblk)->gcwait)

/*
 *        log sync thresholds (per log)
 *
 * LOGSYNC_DELTA:   the smaller of one eighth of the log size and
 *                  128 log pages.
 * LOGSYNC_BARRIER: one quarter of the log size.
 *
 * NOTE(review): both are presumably consumed when the next syncpt
 * trigger is computed in lmLogSync(); the users are not visible in
 * this part of the file.
 */
#define        LOGSYNC_DELTA(logsize)                min((logsize)/8, 128*LOGPSIZE)
#define        LOGSYNC_BARRIER(logsize)        ((logsize)/4)
/*
 * Alternative (disabled) tuning, kept for reference:
 *
#define        LOGSYNC_DELTA(logsize)                min((logsize)/4, 256*LOGPSIZE)
#define        LOGSYNC_BARRIER(logsize)        ((logsize)/2)
*/


/*
 *        log buffer cache synchronization
 *
 * A single global spinlock, jfsLCacheLock, is shared by all logs.
 * LCACHE_LOCK/LCACHE_UNLOCK save and restore the interrupt state in the
 * caller supplied 'flags' variable.
 */
static DEFINE_SPINLOCK(jfsLCacheLock);

#define        LCACHE_LOCK(flags)        spin_lock_irqsave(&jfsLCacheLock, flags)
#define        LCACHE_UNLOCK(flags)        spin_unlock_irqrestore(&jfsLCacheLock, flags)

/*
 * Sleep on wait queue 'wq' until 'cond' holds.  If 'cond' is already
 * true the macro does nothing; otherwise it defers to __SLEEP_COND,
 * passing LCACHE_LOCK(flags)/LCACHE_UNLOCK(flags) as the lock and unlock
 * commands.  Note that 'cond' is evaluated more than once.
 *
 * See __SLEEP_COND in jfs_locks.h
 */
#define LCACHE_SLEEP_COND(wq, cond, flags)        \
do {                                                \
        if (cond)                                \
                break;                                \
        __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \
} while (0)

/* wake up waiters sleeping on 'event' (plain wake_up()) */
#define        LCACHE_WAKEUP(event)        wake_up(event)


/*
 *        lbuf buffer cache (lCache) control
 */
/*
 * log buffer manager pageout control (cumulative, inclusive)
 *
 * The values are distinct bits and are OR-ed together when passed as
 * the 'flag' argument of lbmWrite() (see lmNextPage(), lmGCwrite() and
 * lmPostGC()).
 */
#define        lbmREAD                0x0001        /* NOTE(review): presumably marks a
                                 * read request; user not visible here
                                 */
#define        lbmWRITE        0x0002        /* enqueue at tail of write queue;
                                 * init pageout if at head of queue;
                                 */
#define        lbmRELEASE        0x0004        /* remove from write queue
                                 * at completion of pageout;
                                 * do not free/recycle it yet:
                                 * caller will free it;
                                 */
#define        lbmSYNC                0x0008        /* do not return to freelist
                                 * when removed from write queue;
                                 */
#define lbmFREE                0x0010        /* return to freelist
                                 * at completion of pageout;
                                 * the buffer may be recycled;
                                 */
#define        lbmDONE                0x0020        /* NOTE(review): presumably set at
                                 * i/o completion; setter not visible here
                                 */
#define        lbmERROR        0x0040        /* tested in bp->l_flag by lmPostGC(),
                                 * which then flags the tblocks of the
                                 * page with tblkGC_ERROR
                                 */
#define lbmGC                0x0080        /* lbmIODone to perform post-GC processing
                                 * of log page
                                 */
#define lbmDIRECT        0x0100        /* NOTE(review): presumably used by
                                 * lbmDirectWrite(); confirm at its definition
                                 */

/*
 * Global list of active external journals
 *
 * NOTE(review): dummy_log and jfs_log_mutex are declared alongside the
 * list; presumably jfs_log_mutex serializes log open/close and dummy_log
 * backs open_dummy_log() - their users are not visible in this part of
 * the file.
 */
static LIST_HEAD(jfs_external_logs);
static struct jfs_log *dummy_log;
static DEFINE_MUTEX(jfs_log_mutex);

/*
 * forward references
 *
 * Note that lbmFree() and lbmfree() are two distinct functions whose
 * names differ only in the case of one letter.
 */
static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk,
                         struct lrd * lrd, struct tlock * tlck);

static int lmNextPage(struct jfs_log * log);
static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi,
                           int activate);

static int open_inline_log(struct super_block *sb);
static int open_dummy_log(struct super_block *sb);
static int lbmLogInit(struct jfs_log * log);
static void lbmLogShutdown(struct jfs_log * log);
static struct lbuf *lbmAllocate(struct jfs_log * log, int);
static void lbmFree(struct lbuf * bp);
static void lbmfree(struct lbuf * bp);
static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp);
static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block);
static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag);
static int lbmIOWait(struct lbuf * bp, int flag);
static bio_end_io_t lbmIODone;
static void lbmStartIO(struct lbuf * bp);
/* the second parameter is named cant_write in the definition below */
static void lmGCwrite(struct jfs_log * log, int cant_block);
static int lmLogSync(struct jfs_log * log, int hard_sync);



/*
 *        statistics
 *
 * Event counters for the log manager, only compiled in when
 * CONFIG_JFS_STATISTICS is set.  They are bumped through INCREMENT().
 */
#ifdef CONFIG_JFS_STATISTICS
static struct lmStat {
        uint commit;                /* # of COMMIT records written (lmWriteRecord) */
        uint pagedone;                /* # of page written */
        uint submitted;                /* # of pages submitted */
        uint full_page;                /* # of full pages submitted (lmGCwrite) */
        uint partial_page;        /* # of partial pages submitted (lmGCwrite) */
} lmStat;
#endif

/*
 * Call writer() on the i_mapping of the ipbmap, ipimap and direct_inode
 * inodes - in that order - of every jfs_sb_info linked on log->sb_list.
 * The values returned by writer() are ignored.
 */
static void write_special_inodes(struct jfs_log *log,
                                 int (*writer)(struct address_space *))
{
        struct jfs_sb_info *fs;
        struct address_space *mapping;

        list_for_each_entry(fs, &log->sb_list, log_list) {
                mapping = fs->ipbmap->i_mapping;
                writer(mapping);

                mapping = fs->ipimap->i_mapping;
                writer(mapping);

                mapping = fs->direct_inode->i_mapping;
                writer(mapping);
        }
}

/*
 * NAME:        lmLog()
 *
 * FUNCTION:        write a log record; for a record that logs a metapage on
 *                behalf of a transaction, first record the recovery lsn of
 *                the page and of the transaction on the logsync list.
 *
 * PARAMETER:        log        - log to append the record to
 *                tblk        - transaction block; NULL when JFS logs outside
 *                          of a transaction
 *                lrd        - log record descriptor to write
 *                tlck        - transaction lock describing the logged data;
 *                          may be NULL
 *
 * RETURN:        lsn - offset to the next log record to write (end-of-log),
 *                      as returned by lmWriteRecord() or lmLogSync();
 *                -1  - error;
 *                      NOTE(review): no path in this function produces -1.
 *
 * serialization: LOG_LOCK() is taken on entry and released before return;
 *                LOGSYNC_LOCK() is held only around the logsync list update.
 *
 * note: todo: log error handler
 */
int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
          struct tlock * tlck)
{
        int lsn;
        int diffp, difft;
        struct metapage *mp = NULL;
        unsigned long flags;

        jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p",
                 log, tblk, lrd, tlck);

        LOG_LOCK(log);

        /* log by (out-of-transaction) JFS ? */
        if (tblk == NULL)
                goto writeRecord;

        /*
         * log from page ?
         * (no lsn bookkeeping without a tlock, for a tlckBTROOT lock,
         * or when the tlock has no metapage attached)
         */
        if (tlck == NULL ||
            tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL)
                goto writeRecord;

        /*
         *        initialize/update page/transaction recovery lsn
         */
        /* end-of-log lsn as it stands before this record is written */
        lsn = log->lsn;

        LOGSYNC_LOCK(log, flags);

        /*
         * initialize page lsn if first log write of the page
         */
        if (mp->lsn == 0) {
                mp->log = log;
                mp->lsn = lsn;
                log->count++;

                /* insert page at tail of logsynclist */
                list_add_tail(&mp->synclist, &log->synclist);
        }

        /*
         *        initialize/update lsn of tblock of the page
         *
         * transaction inherits oldest lsn of pages associated
         * with allocation/deallocation of resources (their
         * log records are used to reconstruct allocation map
         * at recovery time: inode for inode allocation map,
         * B+-tree index of extent descriptors for block
         * allocation map);
         * allocation map pages inherit transaction lsn at
         * commit time to allow forwarding log syncpt past log
         * records associated with allocation/deallocation of
         * resources only after persistent map of these map pages
         * have been updated and propagated to home.
         */
        /*
         * initialize transaction lsn:
         */
        if (tblk->lsn == 0) {
                /* inherit lsn of its first page logged */
                tblk->lsn = mp->lsn;
                log->count++;

                /* insert tblock after the page on logsynclist */
                list_add(&tblk->synclist, &mp->synclist);
        }
        /*
         * update transaction lsn:
         */
        else {
                /*
                 * inherit oldest/smallest lsn of page:
                 * the lsns are compared through logdiff() against the
                 * log rather than directly
                 */
                logdiff(diffp, mp->lsn, log);
                logdiff(difft, tblk->lsn, log);
                if (diffp < difft) {
                        /* update tblock lsn with page lsn */
                        tblk->lsn = mp->lsn;

                        /* move tblock after page on logsynclist */
                        list_move(&tblk->synclist, &mp->synclist);
                }
        }

        LOGSYNC_UNLOCK(log, flags);

        /*
         *        write the log record
         */
      writeRecord:
        lsn = lmWriteRecord(log, tblk, lrd, tlck);

        /*
         * forward log syncpt if log reached next syncpt trigger
         * (lmLogSync() is called with hard_sync == 0; its return value
         * replaces the lsn returned by lmWriteRecord())
         */
        logdiff(diffp, lsn, log);
        if (diffp >= log->nextsync)
                lsn = lmLogSync(log, 0);

        /* update end-of-log lsn */
        log->lsn = lsn;

        LOG_UNLOCK(log);

        /* return end-of-log address */
        return lsn;
}

/*
 * NAME:        lmWriteRecord()
 *
 * FUNCTION:        move the log record to current log page:
 *                first the data of every log vector of the tlock (each
 *                followed by its log vector descriptor), then the log
 *                record descriptor itself.  Whenever the current page
 *                fills up, lmNextPage() is called and copying continues
 *                on the new page.
 *
 * PARAMETER:        log        - log to write to
 *                tblk        - transaction block; dereferenced only when the
 *                          record type has LOG_COMMIT set
 *                lrd        - log record descriptor; lrd->length is filled
 *                          in here
 *                tlck        - transaction lock describing the data to log,
 *                          or NULL if the record carries no data
 *
 * RETURN:        end-of-log address;
 *                0 if tlck is neither a page lock nor an inode lock
 *
 * serialization: LOG_LOCK() held on entry/exit
 */
static int
lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
              struct tlock * tlck)
{
        int lsn = 0;                /* end-of-log address */
        struct lbuf *bp;        /* dst log page buffer */
        struct logpage *lp;        /* dst log page */
        caddr_t dst;                /* destination address in log page */
        int dstoffset;                /* end-of-log offset in log page */
        int freespace;                /* free space in log page */
        caddr_t p;                /* src meta-data page */
        caddr_t src;
        int srclen;
        int nbytes;                /* number of bytes to move */
        int i;
        int len;                /* bytes of data + descriptors moved so far */
        struct linelock *linelock;
        struct lv *lv;
        struct lvd *lvd;
        int l2linesize;

        len = 0;

        /* retrieve destination log page to write */
        bp = (struct lbuf *) log->bp;
        lp = (struct logpage *) bp->l_ldata;
        dstoffset = log->eor;

        /* any log data to write ? */
        if (tlck == NULL)
                goto moveLrd;

        /*
         *        move log record data
         */
        /* retrieve source meta-data page to log */
        if (tlck->flag & tlckPAGELOCK) {
                p = (caddr_t) (tlck->mp->data);
                linelock = (struct linelock *) & tlck->lock;
        }
        /* retrieve source in-memory inode to log */
        else if (tlck->flag & tlckINODELOCK) {
                if (tlck->type & tlckDTREE)
                        p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot;
                else
                        p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot;
                linelock = (struct linelock *) & tlck->lock;
        }
        else {
                jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck);
                return 0;        /* Probably should trap */
        }
        l2linesize = linelock->l2linesize;

        /* one pass per linelock; chained linelocks re-enter here */
      moveData:
        ASSERT(linelock->index <= linelock->maxcnt);

        lv = linelock->lv;
        for (i = 0; i < linelock->index; i++, lv++) {
                /* skip empty log vectors */
                if (lv->length == 0)
                        continue;

                /* is page full ? */
                if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) {
                        /* page become full: move on to next page */
                        lmNextPage(log);

                        bp = log->bp;
                        lp = (struct logpage *) bp->l_ldata;
                        dstoffset = LOGPHDRSIZE;
                }

                /*
                 * move log vector data
                 * (offset and length of a vector are counted in lines of
                 * 1 << l2linesize bytes)
                 */
                src = (u8 *) p + (lv->offset << l2linesize);
                srclen = lv->length << l2linesize;
                len += srclen;
                while (srclen > 0) {
                        freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
                        nbytes = min(freespace, srclen);
                        dst = (caddr_t) lp + dstoffset;
                        memcpy(dst, src, nbytes);
                        dstoffset += nbytes;

                        /* is page not full ? */
                        if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
                                break;

                        /* page become full: move on to next page */
                        lmNextPage(log);

                        bp = (struct lbuf *) log->bp;
                        lp = (struct logpage *) bp->l_ldata;
                        dstoffset = LOGPHDRSIZE;

                        srclen -= nbytes;
                        src += nbytes;
                }

                /*
                 * move log vector descriptor
                 * (4 bytes are accounted for it; presumably the size of
                 * struct lvd, i.e. its two 16-bit fields)
                 */
                len += 4;
                lvd = (struct lvd *) ((caddr_t) lp + dstoffset);
                lvd->offset = cpu_to_le16(lv->offset);
                lvd->length = cpu_to_le16(lv->length);
                dstoffset += 4;
                jfs_info("lmWriteRecord: lv offset:%d length:%d",
                         lv->offset, lv->length);
        }

        /* follow the chain to the next linelock, if any */
        if ((i = linelock->next)) {
                linelock = (struct linelock *) lid_to_tlock(i);
                goto moveData;
        }

        /*
         *        move log record descriptor
         */
      moveLrd:
        /* total length of the data and descriptors moved above */
        lrd->length = cpu_to_le16(len);

        src = (caddr_t) lrd;
        srclen = LOGRDSIZE;

        while (srclen > 0) {
                freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
                nbytes = min(freespace, srclen);
                dst = (caddr_t) lp + dstoffset;
                memcpy(dst, src, nbytes);

                dstoffset += nbytes;
                srclen -= nbytes;

                /* are there more to move than freespace of page ? */
                if (srclen)
                        goto pageFull;

                /*
                 * end of log record descriptor
                 */

                /* update last log record eor */
                log->eor = dstoffset;
                bp->l_eor = dstoffset;
                lsn = (log->page << L2LOGPSIZE) + dstoffset;

                if (lrd->type & cpu_to_le16(LOG_COMMIT)) {
                        tblk->clsn = lsn;
                        jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn,
                                 bp->l_eor);

                        INCREMENT(lmStat.commit);        /* # of commit */

                        /*
                         * enqueue tblock for group commit:
                         *
                         * enqueue tblock of non-trivial/synchronous COMMIT
                         * at tail of group commit queue
                         * (trivial/asynchronous COMMITs are ignored by
                         * group commit.)
                         */
                        LOGGC_LOCK(log);

                        /* init tblock gc state */
                        tblk->flag = tblkGC_QUEUE;
                        tblk->bp = log->bp;
                        tblk->pn = log->page;
                        tblk->eor = log->eor;

                        /* enqueue transaction to commit queue */
                        list_add_tail(&tblk->cqueue, &log->cqueue);

                        LOGGC_UNLOCK(log);
                }

                jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x",
                        le16_to_cpu(lrd->type), log->bp, log->page, dstoffset);

                /* page not full ? */
                if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
                        return lsn;

                /*
                 * Reached either by the goto above (descriptor only
                 * partly copied) or by falling through when the
                 * descriptor exactly filled the page; in the latter case
                 * srclen is 0, the loop ends and the lsn computed above
                 * is returned.
                 */
              pageFull:
                /* page become full: move on to next page */
                lmNextPage(log);

                bp = (struct lbuf *) log->bp;
                lp = (struct logpage *) bp->l_ldata;
                dstoffset = LOGPHDRSIZE;
                src += nbytes;
        }

        return lsn;
}


/*
 * NAME:        lmNextPage()
 *
 * FUNCTION:        write current page and allocate next page.
 *                The full page is either handed to group commit (when the
 *                tail tblock of the commit queue has its COMMIT record on
 *                it) or written out directly; then log->page, log->eor and
 *                log->bp are advanced to a freshly allocated page buffer.
 *
 * PARAMETER:        log
 *
 * RETURN:        0
 *
 * serialization: LOG_LOCK() held on entry/exit;
 *                LOGGC_LOCK() is taken while the full page is queued.
 */
static int lmNextPage(struct jfs_log * log)
{
        struct logpage *lp;
        int lspn;                /* log sequence page number */
        int pn;                        /* current page number */
        struct lbuf *bp;
        struct lbuf *nextbp;
        struct tblock *tblk;

        /* get current log page number and log sequence page number */
        pn = log->page;
        bp = log->bp;
        lp = (struct logpage *) bp->l_ldata;
        lspn = le32_to_cpu(lp->h.page);

        LOGGC_LOCK(log);

        /*
         *        write or queue the full page at the tail of write queue
         */
        /* get the tail tblk on commit queue */
        if (list_empty(&log->cqueue))
                tblk = NULL;
        else
                tblk = list_entry(log->cqueue.prev, struct tblock, cqueue);

        /* every tblk who has COMMIT record on the current page,
         * and has not been committed, must be on commit queue
         * since tblk is queued at commit queue at the time
         * of writing its COMMIT record on the page before
         * page becomes full (even though the tblk thread
         * who wrote COMMIT record may have been suspended
         * currently);
         */

        /* is page bound with outstanding tail tblk ? */
        if (tblk && tblk->pn == pn) {
                /* mark tblk for end-of-page */
                tblk->flag |= tblkGC_EOP;

                if (log->cflag & logGC_PAGEOUT) {
                        /* if page is not already on write queue,
                         * just enqueue (no lbmWRITE to prevent redrive)
                         * buffer to wqueue to ensure correct serial order
                         * of the pages since log pages will be added
                         * continuously
                         */
                        if (bp->l_wqnext == NULL)
                                lbmWrite(log, bp, 0, 0);
                } else {
                        /*
                         * No current GC leader, initiate group commit
                         */
                        log->cflag |= logGC_PAGEOUT;
                        lmGCwrite(log, 0);
                }
        }
        /* page is not bound with outstanding tblk:
         * init write or mark it to be redriven (lbmWRITE)
         */
        else {
                /* finalize the page */
                bp->l_ceor = bp->l_eor;
                lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
                lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0);
        }
        LOGGC_UNLOCK(log);

        /*
         *        allocate/initialize next page
         */
        /* if log wraps, the first data page of log is 2
         * (0 never used, 1 is superblock).
         */
        log->page = (pn == log->size - 1) ? 2 : pn + 1;
        log->eor = LOGPHDRSIZE;        /* ? valid page empty/full at logRedo() */

        /* allocate/initialize next log page buffer */
        nextbp = lbmAllocate(log, log->page);
        nextbp->l_eor = log->eor;
        log->bp = nextbp;

        /*
         * initialize next log page:
         * the log sequence page number keeps increasing by one even when
         * the page number itself wraps back to 2
         */
        lp = (struct logpage *) nextbp->l_ldata;
        lp->h.page = lp->t.page = cpu_to_le32(lspn + 1);
        lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);

        return 0;
}


/*
 * NAME:        lmGroupCommit()
 *
 * FUNCTION:        group commit
 *        Commit tblk together with the other transactions whose COMMIT
 *        records share a log page: start the pageout as group leader
 *        when none is in progress, then - unless the transaction is
 *        lazy - sleep until the transaction has been committed.
 *
 * PARAMETER:        log        - log the COMMIT record was written to
 *                tblk        - transaction block to commit
 *
 * RETURN:        0        - committed, or lazy transaction that left early
 *                -EIO        - transaction is flagged tblkGC_ERROR
 *
 * NOTE:
 *        LOGGC_LOCK serializes log group commit queue, and
 *        transaction blocks on the commit queue.
 *        N.B. LOG_LOCK is NOT held during lmGroupCommit().
 */
int lmGroupCommit(struct jfs_log * log, struct tblock * tblk)
{
        int rc = 0;

        LOGGC_LOCK(log);

        /* group committed already ? */
        if (tblk->flag & tblkGC_COMMITTED)
                goto committed;

        jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc);

        if (tblk->xflag & COMMIT_LAZY)
                tblk->flag |= tblkGC_LAZY;

        /*
         * No pageout in progress and the commit queue is not empty:
         * start group commit as its group leader, except for a lazy
         * transaction when neither log_FLUSH nor jfs_tlocks_low is set.
         */
        if (!(log->cflag & logGC_PAGEOUT) && !list_empty(&log->cqueue)) {
                if (!(tblk->xflag & COMMIT_LAZY) ||
                    test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low) {
                        log->cflag |= logGC_PAGEOUT;

                        lmGCwrite(log, 0);
                }
        }

        /* Lazy transactions can leave now (rc is still 0) */
        if (tblk->xflag & COMMIT_LAZY)
                goto unlock;

        /* lmGCwrite gives up LOGGC_LOCK, check again */
        if (tblk->flag & tblkGC_COMMITTED)
                goto committed;

        /* upcount transaction waiting for completion and wait */
        log->gcrtc++;
        tblk->flag |= tblkGC_READY;

        __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED),
                     LOGGC_LOCK(log), LOGGC_UNLOCK(log));

      committed:
        /* removed from commit queue: report a failed commit */
        if (tblk->flag & tblkGC_ERROR)
                rc = -EIO;

      unlock:
        LOGGC_UNLOCK(log);
        return rc;
}

/*
 * NAME:        lmGCwrite()
 *
 * FUNCTION:        group commit write
 *        Mark every transaction at the head of the commit queue whose
 *        COMMIT record lies on the same log page as that of the first
 *        one, then start the write of that page.
 *
 * PARAMETER:        log                - log whose commit queue is processed
 *                cant_write        - handed on unchanged as the last argument
 *                                  of lbmWrite() (named cant_block in the
 *                                  forward declarations)
 *
 * RETURN:        None
 *
 * NOTE:
 *        LOGGC_LOCK must be held by caller.
 *        The commit queue must not be empty.
 *        N.B. LOG_LOCK is NOT held during lmGroupCommit().
 */
static void lmGCwrite(struct jfs_log * log, int cant_write)
{
        struct tblock *cur;
        struct tblock *last = NULL;        /* last tblk of the page */
        struct lbuf *pagebuf;
        struct logpage *page;
        int head_pn;                        /* group commit page number */
        int wflag = lbmWRITE | lbmGC;
        int was_full;

        /* the page of the head tblk defines the commit group */
        head_pn = list_entry(log->cqueue.next, struct tblock, cqueue)->pn;

        /* state transition: (QUEUE, READY) -> COMMIT for the whole group */
        list_for_each_entry(cur, &log->cqueue, cqueue) {
                if (cur->pn != head_pn)
                        break;

                cur->flag |= tblkGC_COMMIT;
                last = cur;
        }

        /*
         * pageout to commit transactions on the log page.
         */
        pagebuf = (struct lbuf *) last->bp;
        page = (struct logpage *) pagebuf->l_ldata;

        was_full = (last->flag & tblkGC_EOP) != 0;
        if (was_full) {
                /* page already full: free it at end of its group commit */
                last->flag &= ~tblkGC_EOP;
                last->flag |= tblkGC_FREE;
                pagebuf->l_ceor = pagebuf->l_eor;
                wflag |= lbmRELEASE;
        } else {
                /* page not yet full: commit up to the last tblk's eor */
                pagebuf->l_ceor = last->eor;
        }

        page->h.eor = page->t.eor = cpu_to_le16(pagebuf->l_ceor);
        lbmWrite(log, pagebuf, wflag, cant_write);

        if (was_full) {
                INCREMENT(lmStat.full_page);
        } else {
                INCREMENT(lmStat.partial_page);
        }
}

/*
 * NAME:        lmPostGC()
 *
 * FUNCTION:        group commit post-processing
 *        Processes transactions after their commit records have been written
 *        to disk, redriving log I/O if necessary.
 *
 * PARAMETER:        bp        - log page buffer whose pageout completed
 *
 * RETURN:        None
 *
 * NOTE:
 *        This routine is called at interrupt time by lbmIODone; it therefore
 *        takes log->gclock with spin_lock_irqsave() instead of LOGGC_LOCK().
 */
static void lmPostGC(struct lbuf * bp)
{
        unsigned long flags;
        struct jfs_log *log = bp->l_log;
        struct logpage *lp;
        struct tblock *tblk, *temp;

        /* irqsave counterpart of LOGGC_LOCK(log) */
        spin_lock_irqsave(&log->gclock, flags);
        /*
         * current pageout of group commit completed.
         *
         * remove/wakeup transactions from commit queue who were
         * group committed with the current log page
         */
        list_for_each_entry_safe(tblk, temp, &log->cqueue, cqueue) {
                /* the commit group ends at the first tblk not marked COMMIT */
                if (!(tblk->flag & tblkGC_COMMIT))
                        break;
                /* if transaction was marked GC_COMMIT then
                 * it has been shipped in the current pageout
                 * and made it to disk - it is committed.
                 */

                /* propagate a failed pageout to the transaction */
                if (bp->l_flag & lbmERROR)
                        tblk->flag |= tblkGC_ERROR;

                /* remove it from the commit queue */
                list_del(&tblk->cqueue);
                tblk->flag &= ~tblkGC_QUEUE;

                if (tblk == log->flush_tblk) {
                        /* we can stop flushing the log now */
                        clear_bit(log_FLUSH, &log->flag);
                        log->flush_tblk = NULL;
                }

                jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk,
                         tblk->flag);

                if (!(tblk->xflag & COMMIT_FORCE))
                        /*
                         * Hand tblk over to lazy commit thread
                         */
                        txLazyUnlock(tblk);
                else {
                        /* state transition: COMMIT -> COMMITTED */
                        tblk->flag |= tblkGC_COMMITTED;

                        /* one waiter less (see lmGroupCommit()) */
                        if (tblk->flag & tblkGC_READY)
                                log->gcrtc--;

                        LOGGC_WAKEUP(tblk);
                }

                /* was page full before pageout ?
                 * (and this is the last tblk bound with the page)
                 */
                if (tblk->flag & tblkGC_FREE)
                        lbmFree(bp);
                /* did page become full after pageout ?
                 * (and this is the last tblk bound with the page)
                 */
                else if (tblk->flag & tblkGC_EOP) {
                        /* finalize the page */
                        lp = (struct logpage *) bp->l_ldata;
                        bp->l_ceor = bp->l_eor;
                        lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
                        jfs_info("lmPostGC: calling lbmWrite");
                        lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE,
                                 1);
                }

        }

        /* are there any transactions who have entered lmGroupCommit()
         * (whose COMMITs are after that of the last log page written.
         * They are waiting for new group commit (sleeping in
         * lmGroupCommit())
         * or lazy transactions are on a full (queued) log page,
         * select the latest ready transaction as new group leader and
         * wake her up to lead her group.
         *
         * tblk is only dereferenced here when the commit queue is not
         * empty; in that case the loop above stopped at a tblk that is
         * still on the queue.
         */
        if ((!list_empty(&log->cqueue)) &&
            ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) ||
             test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low))
                /*
                 * Call lmGCwrite with new group leader
                 */
                lmGCwrite(log, 1);

        /* no transaction are ready yet (transactions are only just
         * queued (GC_QUEUE) and not entered for group commit yet).
         * the first transaction entering group commit
         * will elect herself as new group leader.
         */
        else
                log->cflag &= ~logGC_PAGEOUT;

        /* irqrestore counterpart of LOGGC_UNLOCK(log) */
        spin_unlock_irqrestore(&log->gclock, flags);
        return;
}

/*
 * NAME:        lmLogSync()
 *
 * FUNCTION:        write log SYNCPT record for specified log
 *        if new sync address is available
 *        (normally the case if sync() is executed by back-ground
 *        process).
 *        calculate new value of i_nextsync which determines when
 *        this code is called again.
 *
 * PARAMETERS:        log        - log structure
 *                hard_sync - 1 to force all metadata to be written
 *
 * RETURN:        0
 *
 * serialization: LOG_LOCK() held on entry/exit
 */
static int lmLogSync(struct jfs_log * log, int hard_sync)
{
        int logsize;
        int written;                /* written since last syncpt */
        int free;                /* free space left available */
        int delta;                /* additional delta to write normally */
        int more;                /* additional write granted */
        struct lrd lrd;
        int lsn;
        struct logsyncblk *lp;
        unsigned long flags;

        /*
         * push dirty metapages out to disk:
         * hard_sync selects filemap_fdatawrite, otherwise filemap_flush
         * is used.
         */
        if (hard_sync)
                write_special_inodes(log, filemap_fdatawrite);
        else
                write_special_inodes(log, filemap_flush);

        /*
         *        forward syncpt
         */
        /* if last sync is same as last syncpt,
         * invoke sync point forward processing to update sync.
         */

        if (log->sync == log->syncpt) {
                LOGSYNC_LOCK(log, flags);
                /*
                 * with an empty synclist the sync point can advance to
                 * the current lsn; otherwise it is limited by the lsn of
                 * the first entry on the synclist.
                 */
                if (list_empty(&log->synclist))
                        log->sync = log->lsn;
                else {
                        lp = list_entry(log->synclist.next,
                                        struct logsyncblk, synclist);
                        log->sync = lp->lsn;
                }
                LOGSYNC_UNLOCK(log, flags);

        }

        /* if sync is different from last syncpt,
         * write a SYNCPT record with syncpt = sync.
         * reset syncpt = sync
         */
        if (log->sync != log->syncpt) {
                lrd.logtid = 0;
                lrd.backchain = 0;
                lrd.type = cpu_to_le16(LOG_SYNCPT);
                lrd.length = 0;
                lrd.log.syncpt.sync = cpu_to_le32(log->sync);
                lsn = lmWriteRecord(log, NULL, &lrd, NULL);

                log->syncpt = log->sync;
        } else
                /* nothing to write: report the current end of log */
                lsn = log->lsn;

        /*
         *        setup next syncpt trigger (SWAG)
         */
        logsize = log->logsize;

        /*
         * NOTE(review): logdiff() is presumably a macro that stores its
         * result in 'written' (no other assignment is visible here) --
         * confirm against its definition.
         */
        logdiff(written, lsn, log);
        free = logsize - written;
        delta = LOGSYNC_DELTA(logsize);
        /* grant at most half of the remaining free space, capped at delta */
        more = min(free / 2, delta);
        if (more < 2 * LOGPSIZE) {
                jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n");
                /*
                 *        log wrapping
                 *
                 * option 1 - panic ? No.!
                 * option 2 - shutdown file systems
                 *              associated with log ?
                 * option 3 - extend log ?
                 * option 4 - second chance
                 *
                 * mark log wrapped, and continue.
                 * when all active transactions are completed,
                 * mark log valid for recovery.
                 * if crashed during invalid state, log state
                 * implies invalid log, forcing fsck().
                 */
                /* mark log state log wrap in log superblock */
                /* log->state = LOGWRAP; */

                /* reset sync point computation */
                log->syncpt = log->sync = lsn;
                log->nextsync = delta;
        } else
                /* next syncpt trigger = written + more */
                log->nextsync = written + more;

        /* if number of bytes written from last sync point is more
         * than 1/4 of the log size, stop new transactions from
         * starting until all current transactions are completed
         * by setting syncbarrier flag.
         */
        if (!test_bit(log_SYNCBARRIER, &log->flag) &&
            (written > LOGSYNC_BARRIER(logsize)) && log->active) {
                set_bit(log_SYNCBARRIER, &log->flag);
                jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn,
                         log->syncpt);
                /*
                 * We may have to initiate group commit
                 */
                jfs_flush_journal(log, 0);
        }

        /* lsn of the SYNCPT record written above, or log->lsn if none */
        return lsn;
}

/*
 * NAME:        jfs_syncpt
 *
 * FUNCTION:        write log SYNCPT record for specified log
 *
 * PARAMETERS:        log          - log structure
 *                hard_sync - set to 1 to force metadata to be written
 */
void jfs_syncpt(struct jfs_log *log, int hard_sync)
{
        LOG_LOCK(log);

        /* nothing is written while the log_QUIESCE flag is set */
        if (test_bit(log_QUIESCE, &log->flag)) {
                LOG_UNLOCK(log);
                return;
        }

        /* lmLogSync() expects LOG_LOCK to be held; its lsn result is unused */
        lmLogSync(log, hard_sync);
        LOG_UNLOCK(log);
}

/*
 * NAME:        lmLogOpen()
 *
 * FUNCTION:        open the log on first open;
 *        insert filesystem in the active list of the log.
 *
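 * PARAMETER:        sb        - super block of the file system being mounted
 *
 * RETURN:        0        - success
 *                -EINVAL, -ENOMEM or error returned from subroutines
 *
 * serialization: jfs_log_mutex is held while an external log is looked up,
 *                opened and attached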
 */
int lmLogOpen(struct super_block *sb)
{
        int rc;
        struct file *bdev_file;
        struct jfs_log *log;
        struct jfs_sb_info *sbi = JFS_SBI(sb);

        if (sbi->flag & JFS_NOINTEGRITY)
                return open_dummy_log(sb);

        if (sbi->mntflag & JFS_INLINELOG)
                return open_inline_log(sb);

        mutex_lock(&jfs_log_mutex);
        list_for_each_entry(log, &jfs_external_logs, journal_list) {
                if (file_bdev(log->bdev_file)->bd_dev == sbi->logdev) {
                        if (!uuid_equal(&log->uuid, &sbi->loguuid)) {
                                jfs_warn("wrong uuid on JFS journal");
                                mutex_unlock(&jfs_log_mutex);
                                return -EINVAL;
                        }
                        /*
                         * add file system to log active file system list
                         */
                        if ((rc = lmLogFileSystem(log, sbi, 1))) {
                                mutex_unlock(&jfs_log_mutex);
                                return rc;
                        }
                        goto journal_found;
                }
        }

        if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) {
                mutex_unlock(&jfs_log_mutex);
                return -ENOMEM;
        }
        INIT_LIST_HEAD(&log->sb_list);
        init_waitqueue_head(&log->syncwait);

        /*
         *        external log as separate logical volume
         *
         * file systems to log may have n-to-1 relationship;
         */

        bdev_file = bdev_file_open_by_dev(sbi->logdev,
                        BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL);
        if (IS_ERR(bdev_file)) {
                rc = PTR_ERR(bdev_file);
                goto free;
        }

        log->bdev_file = bdev_file;
        uuid_copy(&log->uuid, &sbi->loguuid);

        /*
         * initialize log:
         */
        if ((rc = lmLogInit(log)))
                goto close;

        list_add(&log->journal_list, &jfs_external_logs);

        /*
         * add file system to log active file system list
         */
        if ((rc = lmLogFileSystem(log, sbi, 1)))
                goto shutdown;

journal_found:
        LOG_LOCK(log);
        list_add(&sbi->log_list, &log->sb_list);
        sbi->log = log;
        LOG_UNLOCK(log);

        mutex_unlock(&jfs_log_mutex);
        return 0;

        /*
         *        unwind on error
         */
      shutdown:                /* unwind lbmLogInit() */
        list_del(&log->journal_list);
        lbmLogShutdown(log);

      close:                /* close external log device */
        bdev_fput(bdev_file);

      free:                /* free log descriptor */
        mutex_unlock(&jfs_log_mutex);
        kfree(log);

        jfs_warn("lmLogOpen: exit(%d)", rc);
        return rc;
}

static int open_inline_log(struct super_block *sb)
{
        struct jfs_log *log;
        int rc;

        /*
         * Open the log that lives inside the file system's own block
         * device (described by the superblock's logpxd) and attach the
         * file system to it.  Returns 0 on success, -ENOMEM if the log
         * descriptor cannot be allocated, or the error from lmLogInit().
         */
        if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL)))
                return -ENOMEM;
        INIT_LIST_HEAD(&log->sb_list);
        init_waitqueue_head(&log->syncwait);

        set_bit(log_INLINELOG, &log->flag);
        log->bdev_file = sb->s_bdev_file;
        log->base = addressPXD(&JFS_SBI(sb)->logpxd);

        /*
         * The block size must not exceed the log page size: the
         * difference is used as a shift count just below, so check it
         * before shifting rather than after.
         */
        ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits);

        /* convert the log extent length from fs blocks to log pages */
        log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
            (L2LOGPSIZE - sb->s_blocksize_bits);
        log->l2bsize = sb->s_blocksize_bits;

        /*
         * initialize log.
         */
        if ((rc = lmLogInit(log))) {
                kfree(log);
                jfs_warn("lmLogOpen: exit(%d)", rc);
                return rc;
        }

        /* the log is private to this file system; no LOG_LOCK is taken */
        list_add(&JFS_SBI(sb)->log_list, &log->sb_list);
        JFS_SBI(sb)->log = log;

        return rc;
}

static int open_dummy_log(struct super_block *sb)
{
        int rc;

        mutex_lock(&jfs_log_mutex);

        /* the shared dummy log is created once and then reused */
        if (dummy_log)
                goto attach;

        dummy_log = kzalloc(sizeof(*dummy_log), GFP_KERNEL);
        if (!dummy_log) {
                mutex_unlock(&jfs_log_mutex);
                return -ENOMEM;
        }
        INIT_LIST_HEAD(&dummy_log->sb_list);
        init_waitqueue_head(&dummy_log->syncwait);
        dummy_log->no_integrity = 1;
        /* Make up some stuff */
        dummy_log->base = 0;
        dummy_log->size = 1024;

        rc = lmLogInit(dummy_log);
        if (rc) {
                /* forget the half-built log so a later mount retries */
                kfree(dummy_log);
                dummy_log = NULL;
                mutex_unlock(&jfs_log_mutex);
                return rc;
        }

attach:
        /* attach this file system to the dummy log */
        LOG_LOCK(dummy_log);
        list_add(&JFS_SBI(sb)->log_list, &dummy_log->sb_list);
        JFS_SBI(sb)->log = dummy_log;
        LOG_UNLOCK(dummy_log);
        mutex_unlock(&jfs_log_mutex);

        return 0;
}

/*
 * NAME:        lmLogInit()
 *
 * FUNCTION:        log initialization at first log open.
 *
 *        logredo() (or logformat()) should have been run previously.
 *        initialize the log from log superblock.
 *        set the log state in the superblock to LOGMOUNT and
 *        write SYNCPT log record.
 *
 * PARAMETER:        log        - log structure
 *
 * RETURN:        0        - if ok
 *                -EINVAL        - bad log magic number or superblock dirty
 *                error returned from logwait()
 *
 * serialization: single first open thread
 */
int lmLogInit(struct jfs_log * log)
{
        int rc = 0;
        struct lrd lrd;
        struct logsuper *logsuper;
        struct lbuf *bpsuper;
        struct lbuf *bp;
        struct logpage *lp;
        int lsn = 0;

        /*
         * Overview:
         *  1. initialize locks, lists and the per-log buffer pool;
         *  2. for a no-integrity log, just grab a buffer for page 0;
         *     otherwise validate the on-disk log superblock, position
         *     the log at its recorded end, write an initial SYNCPT
         *     record and mark the superblock LOGMOUNT;
         *  3. initialize the sync point / group commit bookkeeping.
         *
         * Returns 0 on success; on failure the buffers obtained here
         * are released and a negative errno is returned.
         */
        jfs_info("lmLogInit: log:0x%p", log);

        /* initialize the group commit serialization lock */
        LOGGC_LOCK_INIT(log);

        /* allocate/initialize the log write serialization lock */
        LOG_LOCK_INIT(log);

        LOGSYNC_LOCK_INIT(log);

        INIT_LIST_HEAD(&log->synclist);

        INIT_LIST_HEAD(&log->cqueue);
        log->flush_tblk = NULL;

        log->count = 0;

        /*
         * initialize log i/o
         */
        if ((rc = lbmLogInit(log)))
                return rc;

        /*
         * An inline log already had l2bsize set by its opener; every
         * other log starts out addressed in log-page sized blocks.
         */
        if (!test_bit(log_INLINELOG, &log->flag))
                log->l2bsize = L2LOGPSIZE;

        /* check for disabled journaling to disk */
        if (log->no_integrity) {
                /*
                 * Journal pages will still be filled.  When the time comes
                 * to actually do the I/O, the write is not done, and the
                 * endio routine is called directly.
                 */
                bp = lbmAllocate(log, 0);
                log->bp = bp;
                bp->l_pn = bp->l_eor = 0;
        } else {
                /*
                 * validate log superblock
                 */
                if ((rc = lbmRead(log, 1, &bpsuper)))
                        goto errout10;

                logsuper = (struct logsuper *) bpsuper->l_ldata;

                if (logsuper->magic != cpu_to_le32(LOGMAGIC)) {
                        jfs_warn("*** Log Format Error ! ***");
                        rc = -EINVAL;
                        goto errout20;
                }

                /* logredo() should have been run successfully. */
                if (logsuper->state != cpu_to_le32(LOGREDONE)) {
                        jfs_warn("*** Log Is Dirty ! ***");
                        rc = -EINVAL;
                        goto errout20;
                }

                /* initialize log from log superblock */
                if (test_bit(log_INLINELOG,&log->flag)) {
                        if (log->size != le32_to_cpu(logsuper->size)) {
                                rc = -EINVAL;
                                goto errout20;
                        }
                        jfs_info("lmLogInit: inline log:0x%p base:0x%Lx size:0x%x",
                                 log, (unsigned long long)log->base, log->size);
                } else {
                        unsigned int l2bsize;

                        if (!uuid_equal(&logsuper->uuid, &log->uuid)) {
                                jfs_warn("wrong uuid on JFS log device");
                                rc = -EINVAL;
                                goto errout20;
                        }

                        /*
                         * l2bsize comes straight from disk and is later
                         * used as a shift count: lbmAllocate() shifts by
                         * (L2LOGPSIZE - l2bsize) and lbmRead() shifts by
                         * (l2bsize - 9).  Reject values that would make
                         * either shift negative instead of trusting a
                         * corrupted or crafted log superblock.
                         */
                        l2bsize = le32_to_cpu(logsuper->l2bsize);
                        if (l2bsize < 9 || l2bsize > L2LOGPSIZE) {
                                jfs_warn("invalid l2bsize on JFS log device");
                                rc = -EINVAL;
                                goto errout20;
                        }

                        log->size = le32_to_cpu(logsuper->size);
                        log->l2bsize = l2bsize;
                        jfs_info("lmLogInit: external log:0x%p base:0x%Lx size:0x%x",
                                 log, (unsigned long long)log->base, log->size);
                }

                /* split the recorded end of log into page number + offset */
                log->page = le32_to_cpu(logsuper->end) / LOGPSIZE;
                log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page);

                /*
                 * initialize for log append write mode
                 */
                /* establish current/end-of-log page/buffer */
                if ((rc = lbmRead(log, log->page, &bp)))
                        goto errout20;

                lp = (struct logpage *) bp->l_ldata;

                jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d",
                         le32_to_cpu(logsuper->end), log->page, log->eor,
                         le16_to_cpu(lp->h.eor));

                log->bp = bp;
                bp->l_pn = log->page;
                bp->l_eor = log->eor;

                /* if current page is full, move on to next page */
                if (log->eor >= LOGPSIZE - LOGPTLRSIZE)
                        lmNextPage(log);

                /*
                 * initialize log syncpoint
                 */
                /*
                 * write the first SYNCPT record with syncpoint = 0
                 * (i.e., log redo up to HERE !);
                 * remove current page from lbm write queue at end of pageout
                 * (to write log superblock update), but do not release to
                 * freelist;
                 */
                lrd.logtid = 0;
                lrd.backchain = 0;
                lrd.type = cpu_to_le16(LOG_SYNCPT);
                lrd.length = 0;
                lrd.log.syncpt.sync = 0;
                lsn = lmWriteRecord(log, NULL, &lrd, NULL);
                /* re-read log->bp: the current page may have changed above */
                bp = log->bp;
                bp->l_ceor = bp->l_eor;
                lp = (struct logpage *) bp->l_ldata;
                lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
                lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0);
                if ((rc = lbmIOWait(bp, 0)))
                        goto errout30;

                /*
                 * update/write superblock
                 */
                logsuper->state = cpu_to_le32(LOGMOUNT);
                log->serial = le32_to_cpu(logsuper->serial) + 1;
                logsuper->serial = cpu_to_le32(log->serial);
                lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
                if ((rc = lbmIOWait(bpsuper, lbmFREE)))
                        goto errout30;
        }

        /* initialize logsync parameters */
        log->logsize = (log->size - 2) << L2LOGPSIZE;
        log->lsn = lsn;
        log->syncpt = lsn;
        log->sync = log->syncpt;
        log->nextsync = LOGSYNC_DELTA(log->logsize);

        jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x",
                 log->lsn, log->syncpt, log->sync);

        /*
         * initialize for lazy/group commit
         */
        log->clsn = lsn;

        return 0;

        /*
         *        unwind on error
         */
      errout30:                /* release log page */
        log->wqueue = NULL;
        bp->l_wqnext = NULL;
        lbmFree(bp);

      errout20:                /* release log superblock */
        lbmFree(bpsuper);

      errout10:                /* unwind lbmLogInit() */
        lbmLogShutdown(log);

        jfs_warn("lmLogInit: exit(%d)", rc);
        return rc;
}


/*
 * NAME:        lmLogClose()
 *
 * FUNCTION:        remove file system <ipmnt> from active list of log <iplog>
 *                and close it on last close.
 *
 * PARAMETER:        sb        - superblock
 *
 * RETURN:        errors from subroutines
 *
 * serialization:
 */
int lmLogClose(struct super_block *sb)
{
        struct jfs_sb_info *sbi = JFS_SBI(sb);
        struct jfs_log *log = sbi->log;
        struct file *bdev_file;
        int rc = 0;

        jfs_info("lmLogClose: log:0x%p", log);

        mutex_lock(&jfs_log_mutex);

        /* detach this file system from the log */
        LOG_LOCK(log);
        list_del(&sbi->log_list);
        LOG_UNLOCK(log);
        sbi->log = NULL;

        /*
         * We need to make sure all of the "written" metapages
         * actually make it to disk
         */
        sync_blockdev(sb->s_bdev);

        if (test_bit(log_INLINELOG, &log->flag)) {
                /*
                 *        in-line log in host file system
                 */
                rc = lmLogShutdown(log);
                kfree(log);
        } else {
                /* drop this file system from the log's active list */
                if (!log->no_integrity)
                        lmLogFileSystem(log, sbi, 0);

                /*
                 * Only the last user of an external log tears it down.
                 *
                 * TODO: ensure that the dummy_log is in a state to allow
                 * lbmLogShutdown to deallocate all the buffers and call
                 * kfree against dummy_log.  For now, leave dummy_log & its
                 * buffers in memory, and resuse if another no-integrity
                 * mount is requested.
                 */
                if (list_empty(&log->sb_list) && !log->no_integrity) {
                        /*
                         *        external log as separate logical volume
                         */
                        list_del(&log->journal_list);
                        bdev_file = log->bdev_file;
                        rc = lmLogShutdown(log);

                        bdev_fput(bdev_file);

                        kfree(log);
                }
        }

        mutex_unlock(&jfs_log_mutex);
        jfs_info("lmLogClose: exit(%d)", rc);
        return rc;
}


/*
 * NAME:        jfs_flush_journal()
 *
 * FUNCTION:        initiate write of any outstanding transactions to the journal
 *                and optionally wait until they are all written to disk
 *
 *                wait == 0  flush until latest txn is committed, don't wait
 *                wait == 1  flush until latest txn is committed, wait
 *                wait > 1   flush until all txn's are complete, wait
 */
void jfs_flush_journal(struct jfs_log *log, int wait)
{
        int i;
        struct tblock *target = NULL;        /* newest queued tblock, if any */

        /* jfs_write_inode may call us during read-only mount */
        if (!log)
                return;

        jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait);

        LOGGC_LOCK(log);

        if (!list_empty(&log->cqueue)) {
                /*
                 * This ensures that we will keep writing to the journal as long
                 * as there are unwritten commit records
                 */
                target = list_entry(log->cqueue.prev, struct tblock, cqueue);

                if (test_bit(log_FLUSH, &log->flag)) {
                        /*
                         * We're already flushing.
                         * if flush_tblk is NULL, we are flushing everything,
                         * so leave it that way.  Otherwise, update it to the
                         * latest transaction
                         */
                        if (log->flush_tblk)
                                log->flush_tblk = target;
                } else {
                        /* Only flush until latest transaction is committed */
                        log->flush_tblk = target;
                        set_bit(log_FLUSH, &log->flag);

                        /*
                         * Initiate I/O on outstanding transactions
                         */
                        if (!(log->cflag & logGC_PAGEOUT)) {
                                log->cflag |= logGC_PAGEOUT;
                                lmGCwrite(log, 0);
                        }
                }
        }
        if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) {
                /* Flush until all activity complete */
                set_bit(log_FLUSH, &log->flag);
                log->flush_tblk = NULL;
        }

        if (wait && target && !(target->flag & tblkGC_COMMITTED)) {
                DECLARE_WAITQUEUE(__wait, current);

                /*
                 * Queue ourselves on the target's gcwait and set the task
                 * state before dropping the lock, then sleep once.  The
                 * target's commit state is not re-checked after waking.
                 */
                add_wait_queue(&target->gcwait, &__wait);
                set_current_state(TASK_UNINTERRUPTIBLE);
                LOGGC_UNLOCK(log);
                schedule();
                LOGGC_LOCK(log);
                remove_wait_queue(&target->gcwait, &__wait);
        }
        LOGGC_UNLOCK(log);

        /* wait < 2: the caller only asked for the commit, not a full drain */
        if (wait < 2)
                return;

        write_special_inodes(log, filemap_fdatawrite);

        /*
         * If there was recent activity, we may need to wait
         * for the lazycommit thread to catch up
         */
        if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) {
                /* poll every 250ms, at most 200 times (about 50 seconds) */
                for (i = 0; i < 200; i++) {        /* Too much? */
                        msleep(250);
                        write_special_inodes(log, filemap_fdatawrite);
                        if (list_empty(&log->cqueue) &&
                            list_empty(&log->synclist))
                                break;
                }
        }
        assert(list_empty(&log->cqueue));

#ifdef CONFIG_JFS_DEBUG
        /* debug builds dump whatever is still stuck on the synclist */
        if (!list_empty(&log->synclist)) {
                struct logsyncblk *lp;

                printk(KERN_ERR "jfs_flush_journal: synclist not empty\n");
                list_for_each_entry(lp, &log->synclist, synclist) {
                        /* COMMIT_PAGE entries are dumped as metapages */
                        if (lp->xflag & COMMIT_PAGE) {
                                struct metapage *mp = (struct metapage *)lp;
                                print_hex_dump(KERN_ERR, "metapage: ",
                                               DUMP_PREFIX_ADDRESS, 16, 4,
                                               mp, sizeof(struct metapage), 0);
                                print_hex_dump(KERN_ERR, "page: ",
                                               DUMP_PREFIX_ADDRESS, 16,
                                               sizeof(long), mp->page,
                                               sizeof(struct page), 0);
                        } else
                                print_hex_dump(KERN_ERR, "tblock:",
                                               DUMP_PREFIX_ADDRESS, 16, 4,
                                               lp, sizeof(struct tblock), 0);
                }
        }
#else
        WARN_ON(!list_empty(&log->synclist));
#endif
        /* the full flush is over, whether or not the synclist drained */
        clear_bit(log_FLUSH, &log->flag);
}

/*
 * NAME:        lmLogShutdown()
 *
 * FUNCTION:        log shutdown at last LogClose().
 *
 *                write log syncpt record.
 *                update log superblock: set state to LOGREDONE and record
 *                the end-of-log lsn.
 *
 * PARAMETER:        log        - log structure
 *
 * RETURN:        0        - success
 *                error returned from lbmRead() or lbmIOWait()
 *
 * serialization: single last close thread
 */
int lmLogShutdown(struct jfs_log * log)
{
        struct logsuper *logsuper;
        struct lbuf *bpsuper;
        struct logpage *lp;
        struct lbuf *bp;
        struct lrd lrd;
        int lsn;
        int rc;

        jfs_info("lmLogShutdown: log:0x%p", log);

        /* drain every outstanding transaction before closing the log */
        jfs_flush_journal(log, 2);

        /*
         * write the last SYNCPT record with syncpoint = 0
         * (i.e., log redo up to HERE !)
         */
        lrd.type = cpu_to_le16(LOG_SYNCPT);
        lrd.logtid = 0;
        lrd.backchain = 0;
        lrd.length = 0;
        lrd.log.syncpt.sync = 0;

        lsn = lmWriteRecord(log, NULL, &lrd, NULL);

        /* stamp the end-of-record offset and push the final page out */
        bp = log->bp;
        lp = (struct logpage *) bp->l_ldata;
        lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
        lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0);
        lbmIOWait(log->bp, lbmFREE);
        log->bp = NULL;

        /*
         * synchronous update log superblock
         * mark log state as shutdown cleanly
         * (i.e., Log does not need to be replayed).
         */
        rc = lbmRead(log, 1, &bpsuper);
        if (!rc) {
                logsuper = (struct logsuper *) bpsuper->l_ldata;
                logsuper->state = cpu_to_le32(LOGREDONE);
                logsuper->end = cpu_to_le32(lsn);
                lbmDirectWrite(log, bpsuper,
                               lbmWRITE | lbmRELEASE | lbmSYNC);
                rc = lbmIOWait(bpsuper, lbmFREE);

                jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d",
                         lsn, log->page, log->eor);
        }

        /*
         * shutdown per log i/o
         */
        lbmLogShutdown(log);

        if (rc)
                jfs_warn("lmLogShutdown: exit(%d)", rc);

        return rc;
}


/*
 * NAME:        lmLogFileSystem()
 *
 * FUNCTION:        insert (<activate> = true)/remove (<activate> = false)
 *        file system into/from log active file system list.
 *
 * PARAMETER:        log        - log whose superblock is updated
 *                sbi        - file system whose uuid is inserted/removed
 *                activate - insert/remove file system from active list.
 *
 * RETURN:        0        - success
 *                -EMFILE        - no free slot in the active list
 *                -EIO        - file system not found in the active list
 *                errors returned by lbmRead() or lbmIOWait().
 */
static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi,
                           int activate)
{
        uuid_t *uuid = &sbi->uuid;
        struct logsuper *logsuper;
        struct lbuf *bpsuper;
        int slot;
        int rc;

        /*
         * insert/remove file system device to log active file system list.
         */
        rc = lbmRead(log, 1, &bpsuper);
        if (rc)
                return rc;

        logsuper = (struct logsuper *) bpsuper->l_ldata;

        if (activate) {
                /* claim the first unused (null uuid) slot */
                for (slot = 0; slot < MAX_ACTIVE; slot++) {
                        if (!uuid_is_null(&logsuper->active[slot].uuid))
                                continue;

                        uuid_copy(&logsuper->active[slot].uuid, uuid);
                        sbi->aggregate = slot;
                        break;
                }
                if (slot == MAX_ACTIVE) {
                        jfs_warn("Too many file systems sharing journal!");
                        lbmFree(bpsuper);
                        return -EMFILE;        /* Is there a better rc? */
                }
        } else {
                /* clear the slot that carries this file system's uuid */
                for (slot = 0; slot < MAX_ACTIVE; slot++) {
                        if (!uuid_equal(&logsuper->active[slot].uuid, uuid))
                                continue;

                        uuid_copy(&logsuper->active[slot].uuid, &uuid_null);
                        break;
                }
                if (slot == MAX_ACTIVE) {
                        jfs_warn("Somebody stomped on the journal!");
                        lbmFree(bpsuper);
                        return -EIO;
                }
        }

        /*
         * synchronous write log superblock:
         *
         * write sidestream bypassing write queue:
         * at file system mount, log super block is updated for
         * activation of the file system before any log record
         * (MOUNT record) of the file system, and at file system
         * unmount, all meta data for the file system has been
         * flushed before log super block is updated for deactivation
         * of the file system.
         */
        lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);

        return lbmIOWait(bpsuper, lbmFREE);
}

/*
 *                log buffer manager (lbm)
 *                ------------------------
 *
 * special purpose buffer manager supporting log i/o requirements.
 *
 * per log write queue:
 * log pageout occurs in serial order by fifo write queue,
 * restricted to a single i/o in progress at any one time.
 * the queue is a circular singly-linked list
 * (log->wqueue points to the tail, and buffers are linked via
 * the bp->l_wqnext field); it holds the log pages that are in
 * pageout or waiting for pageout, in serial pageout order.
 */

/*
 *        lbmLogInit()
 *
 * initialize per log I/O setup at lmLogInit()
 */
static int lbmLogInit(struct jfs_log * log)
{
        struct lbuf *lbuf;
        int nbufs;

        jfs_info("lbmLogInit: log:0x%p", log);

        /* initialize current buffer cursor */
        log->bp = NULL;

        /* initialize log device write queue */
        log->wqueue = NULL;

        /*
         * Each log has its own buffer pages allocated to it.  These are
         * not managed by the page cache.  This ensures that a transaction
         * writing to the log does not block trying to allocate a page from
         * the page cache (for the log).  This would be bad, since page
         * allocation waits on the kswapd thread that may be committing inodes
         * which would cause log activity.  Was that clear?  I'm trying to
         * avoid deadlock here.
         */
        init_waitqueue_head(&log->free_wait);

        log->lbuf_free = NULL;

        /* each page is carved into PAGE_SIZE / LOGPSIZE log buffers */
        nbufs = 0;
        while (nbufs < LOGPAGES) {
                struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
                char *buffer;
                uint offset;

                if (!page)
                        goto error;
                buffer = page_address(page);

                for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) {
                        lbuf = kmalloc(sizeof(*lbuf), GFP_KERNEL);
                        if (!lbuf) {
                                /*
                                 * no lbuf refers to this page yet, so it
                                 * has to be released here; otherwise the
                                 * lbufs on the freelist hold its references
                                 */
                                if (offset == 0)
                                        __free_page(page);
                                goto error;
                        }

                        /* the first lbuf owns the allocation's reference */
                        if (offset != 0)
                                get_page(page);

                        lbuf->l_log = log;
                        lbuf->l_page = page;
                        lbuf->l_offset = offset;
                        lbuf->l_ldata = buffer + offset;
                        init_waitqueue_head(&lbuf->l_ioevent);

                        /* push onto the head of the freelist */
                        lbuf->l_freelist = log->lbuf_free;
                        log->lbuf_free = lbuf;
                        nbufs++;
                }
        }

        return 0;

error:
        /* release whatever made it onto the freelist */
        lbmLogShutdown(log);
        return -ENOMEM;
}


/*
 *        lbmLogShutdown()
 *
 * finalize per log I/O setup at lmLogShutdown()
 */
static void lbmLogShutdown(struct jfs_log * log)
{
        struct lbuf *lbuf;
        struct lbuf *next;

        jfs_info("lbmLogShutdown: log:0x%p", log);

        /*
         * Walk the freelist, dropping each buffer's page reference and
         * freeing its descriptor.  The successor is saved first because
         * the descriptor is gone after kfree().
         */
        for (lbuf = log->lbuf_free; lbuf; lbuf = next) {
                next = lbuf->l_freelist;
                __free_page(lbuf->l_page);
                kfree(lbuf);
        }
}


/*
 *        lbmAllocate()
 *
 * allocate an empty log buffer
 */
static struct lbuf *lbmAllocate(struct jfs_log * log, int pn)
{
        unsigned long flags;
        struct lbuf *bp;

        /*
         * recycle from log buffer freelist if any;
         * sleeps on log->free_wait until the freelist is non-empty
         */
        LCACHE_LOCK(flags);
        LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags);
        log->lbuf_free = bp->l_freelist;
        LCACHE_UNLOCK(flags);

        /* unlink the buffer from every list and reset its state */
        bp->l_freelist = NULL;
        bp->l_wqnext = NULL;
        bp->l_flag = 0;
        bp->l_ceor = 0;

        /* bind it to log page pn and compute the page's block address */
        bp->l_pn = pn;
        bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize));

        return bp;
}


/*
 *        lbmFree()
 *
 * Return a log buffer to its log's freelist.  This is the locking
 * wrapper: it takes LCACHE_LOCK around lbmfree(), which does the work.
 */
static void lbmFree(struct lbuf * bp)
{
        unsigned long irqflags;

        LCACHE_LOCK(irqflags);
        lbmfree(bp);
        LCACHE_UNLOCK(irqflags);
}

/*
 *        lbmfree()
 *
 * Push bp onto the head of its log's freelist and wake anyone waiting
 * on log->free_wait.  The buffer must not be on the write queue
 * (l_wqnext == NULL is asserted).  The callers visible in this file
 * invoke it with LCACHE_LOCK held.
 */
static void lbmfree(struct lbuf * bp)
{
        struct jfs_log *owner = bp->l_log;

        assert(bp->l_wqnext == NULL);

        /* link in front of the current freelist head */
        bp->l_freelist = owner->lbuf_free;
        owner->lbuf_free = bp;

        wake_up(&owner->free_wait);
}


/*
 * NAME:        lbmRedrive
 *
 * FUNCTION:        queue a log buffer on the global redrive list and wake
 *                the jfsIO thread, which starts the I/O for it (see
 *                jfsIOWait()).
 *
 * PARAMETER:
 *        bp        - log buffer to be redriven
 *
 * NOTES:
 *        Takes log_redrive_lock with interrupts saved/disabled; the
 *        wakeup is issued after the lock has been dropped.
 */
static inline void lbmRedrive(struct lbuf *bp)
{
        unsigned long irqflags;

        /* push bp on the front of the redrive list */
        spin_lock_irqsave(&log_redrive_lock, irqflags);
        bp->l_redrive_next = log_redrive_list;
        log_redrive_list = bp;
        spin_unlock_irqrestore(&log_redrive_lock, irqflags);

        wake_up_process(jfsIOthread);
}


/*
 *        lbmRead()
 *
 * Read log page pn into a freshly allocated log buffer and wait for the
 * read to complete.
 *
 * PARAMETERS:
 *        log        - log to read from
 *        pn        - log page number
 *        bpp        - out: the buffer holding the page
 *
 * RETURN:        always 0.  lbmIODone() records an I/O error by setting
 *                lbmERROR in (*bpp)->l_flag; it is not reflected in
 *                the return value.
 */
static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
{
        struct lbuf *buf;
        struct bio *rbio;

        /* get a log buffer bound to page pn */
        buf = lbmAllocate(log, pn);
        *bpp = buf;
        jfs_info("lbmRead: bp:0x%p pn:0x%x", buf, pn);

        buf->l_flag |= lbmREAD;

        /* build a one-page read covering exactly LOGPSIZE bytes */
        rbio = bio_alloc(file_bdev(log->bdev_file), 1, REQ_OP_READ, GFP_NOFS);
        rbio->bi_iter.bi_sector = buf->l_blkno << (log->l2bsize - 9);
        __bio_add_page(rbio, buf->l_page, LOGPSIZE, buf->l_offset);
        BUG_ON(rbio->bi_iter.bi_size != LOGPSIZE);

        rbio->bi_private = buf;
        rbio->bi_end_io = lbmIODone;

        if (!log->no_integrity) {
                submit_bio(rbio);
        } else {
                /* journaling to disk is disabled: complete the bio in place */
                rbio->bi_iter.bi_size = 0;
                lbmIODone(rbio);
        }

        /* lbmIODone() changes l_flag (sets lbmDONE, clears lbmREAD) */
        wait_event(buf->l_ioevent, (buf->l_flag != lbmREAD));

        return 0;
}


/*
 *        lbmWrite()
 *
 * Queue a log buffer on the log's write queue and, if it is at the head
 * of the queue and lbmWRITE is requested, initiate its pageout.
 *
 * PARAMETERS:
 *        log        - log owning the write queue
 *        bp        - log buffer to write
 *        flag        - lbm* flags stored into bp->l_flag
 *        cant_block - if non-zero, the I/O is handed to lbmRedrive()
 *                     instead of being started from this context
 *
 * The write queue is a circular singly linked list threaded through
 * l_wqnext; log->wqueue points at the tail, so tail->l_wqnext is the head.
 *
 * buffer at head of pageout queue stays after completion of
 * partial-page pageout and is redriven by explicit initiation of
 * pageout by caller until full-page pageout is completed and
 * released.
 *
 * device driver i/o done redrives pageout of new buffer at
 * head of pageout queue when current buffer at head of pageout
 * queue is released at the completion of its full-page pageout.
 *
 * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit().
 * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone()
 */
static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag,
                     int cant_block)
{
        struct lbuf *tail;
        unsigned long flags;

        jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn);

        /* map the logical block address to physical block address */
        bp->l_blkno =
            log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));

        LCACHE_LOCK(flags);                /* disable+lock */

        /*
         * initialize buffer for device driver
         */
        bp->l_flag = flag;

        /*
         *        insert bp at tail of write queue associated with log
         *
         * (request is either for bp already/currently at head of queue
         * or new bp to be inserted at tail)
         */
        tail = log->wqueue;

        /* is buffer not already on write queue ? */
        if (bp->l_wqnext == NULL) {
                /* insert at tail of wqueue */
                if (tail == NULL) {
                        /* empty queue: bp is both head and tail */
                        log->wqueue = bp;
                        bp->l_wqnext = bp;
                } else {
                        /* bp becomes the new tail, pointing at the old head */
                        log->wqueue = bp;
                        bp->l_wqnext = tail->l_wqnext;
                        tail->l_wqnext = bp;
                }

                tail = bp;
        }

        /* is buffer at head of wqueue and for write ? */
        if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) {
                /* no: leave it queued; it is started later */
                LCACHE_UNLOCK(flags);        /* unlock+enable */
                return;
        }

        LCACHE_UNLOCK(flags);        /* unlock+enable */

        if (cant_block)
                lbmRedrive(bp);
        else if (flag & lbmSYNC)
                lbmStartIO(bp);
        else {
                /*
                 * LOGGC lock is dropped around the I/O submission and
                 * retaken afterwards (the caller is expected to hold it,
                 * per the serialization note above).
                 */
                LOGGC_UNLOCK(log);
                lbmStartIO(bp);
                LOGGC_LOCK(log);
        }
}


/*
 *        lbmDirectWrite()
 *
 * Start pageout of a log buffer without putting it on the write queue;
 * used for sidestream writes (e.g., the log superblock).
 *
 * PARAMETERS:
 *        log        - log the buffer belongs to
 *        bp        - log buffer to write
 *        flag        - lbm* flags; lbmDIRECT is OR-ed in
 */
static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
{
        jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x",
                 bp, flag, bp->l_pn);

        /* translate the log page number into a block address */
        bp->l_blkno = log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));

        /* mark the buffer as a direct (non-queued) write */
        bp->l_flag = flag | lbmDIRECT;

        /* kick off the pageout */
        lbmStartIO(bp);
}


/*
 * NAME:        lbmStartIO()
 *
 * FUNCTION:        build a one-page synchronous write bio for the log
 *                buffer and hand it to the block layer; completion is
 *                reported through lbmIODone().
 *
 * RETURN:        none
 *
 * NOTES:        when log->no_integrity is set nothing is submitted: the
 *                bio is allocated without a block device, its size is
 *                zeroed and lbmIODone() is called directly.
 *
 * serialization: LCACHE_LOCK() is NOT held during log i/o;
 */
static void lbmStartIO(struct lbuf * bp)
{
        struct jfs_log *log = bp->l_log;
        struct block_device *target;
        struct bio *wbio;

        jfs_info("lbmStartIO");

        target = log->no_integrity ? NULL : file_bdev(log->bdev_file);

        wbio = bio_alloc(target, 1, REQ_OP_WRITE | REQ_SYNC,
                         GFP_NOFS);
        wbio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
        __bio_add_page(wbio, bp->l_page, LOGPSIZE, bp->l_offset);
        BUG_ON(wbio->bi_iter.bi_size != LOGPSIZE);

        wbio->bi_private = bp;
        wbio->bi_end_io = lbmIODone;

        if (!log->no_integrity) {
                submit_bio(wbio);
                INCREMENT(lmStat.submitted);
                return;
        }

        /* journaling to disk has been disabled: complete immediately */
        wbio->bi_iter.bi_size = 0;
        lbmIODone(wbio);
}


/*
 *        lbmIOWait()
 *
 * Wait, under LCACHE_LOCK, until lbmDONE is set in bp->l_flag.
 *
 * PARAMETERS:
 *        bp        - log buffer with I/O in flight
 *        flag        - if lbmFREE is set, the buffer is returned to the
 *                  freelist before the lock is dropped
 *
 * RETURN:        0        - success
 *                -EIO        - lbmERROR was set on the buffer
 */
static int lbmIOWait(struct lbuf * bp, int flag)
{
        unsigned long irqflags;
        int status = 0;

        jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);

        LCACHE_LOCK(irqflags);                /* disable+lock */

        LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), irqflags);

        if (bp->l_flag & lbmERROR)
                status = -EIO;

        if (flag & lbmFREE)
                lbmfree(bp);

        LCACHE_UNLOCK(irqflags);        /* unlock+enable */

        jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);
        return status;
}

/*
 *        lbmIODone()
 *
 * I/O completion handler for log buffers.  Installed as bi_end_io by
 * lbmRead() and lbmStartIO(), and also called directly by them when
 * log->no_integrity is set.
 *
 * Marks the buffer lbmDONE (plus lbmERROR if bio->bi_status is set),
 * drops the bio, then:
 *   - read:   clears lbmREAD and wakes the waiter on l_ioevent;
 *   - write:  clears lbmWRITE, updates log->clsn, unlinks the buffer from
 *             the write queue if lbmRELEASE is set (redriving the next
 *             queued write), and finally wakes the initiator (lbmSYNC),
 *             calls lmPostGC() (lbmGC), or frees the buffer.
 *
 * executed at INTIODONE level
 */
static void lbmIODone(struct bio *bio)
{
        struct lbuf *bp = bio->bi_private;
        struct lbuf *nextbp, *tail;
        struct jfs_log *log;
        unsigned long flags;

        /*
         * get back jfs buffer bound to the i/o buffer
         */
        jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag);

        LCACHE_LOCK(flags);                /* disable+lock */

        bp->l_flag |= lbmDONE;

        if (bio->bi_status) {
                bp->l_flag |= lbmERROR;

                jfs_err("lbmIODone: I/O error in JFS log");
        }

        /* the bio is not referenced past this point */
        bio_put(bio);

        /*
         *        pagein completion
         */
        if (bp->l_flag & lbmREAD) {
                bp->l_flag &= ~lbmREAD;

                LCACHE_UNLOCK(flags);        /* unlock+enable */

                /* wakeup I/O initiator */
                LCACHE_WAKEUP(&bp->l_ioevent);

                return;
        }

        /*
         *        pageout completion
         *
         * the bp at the head of write queue has completed pageout.
         *
         * if single-commit/full-page pageout, remove the current buffer
         * from head of pageout queue, and redrive pageout with
         * the new buffer at head of pageout queue;
         * otherwise, the partial-page pageout buffer stays at
         * the head of pageout queue to be redriven for pageout
         * by lmGroupCommit() until full-page pageout is completed.
         */
        bp->l_flag &= ~lbmWRITE;
        INCREMENT(lmStat.pagedone);

        /* update committed lsn */
        log = bp->l_log;
        log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor;

        /* direct writes never went on the write queue: just wake the waiter */
        if (bp->l_flag & lbmDIRECT) {
                LCACHE_WAKEUP(&bp->l_ioevent);
                LCACHE_UNLOCK(flags);
                return;
        }

        /* log->wqueue is the tail of the circular write queue */
        tail = log->wqueue;

        /* single element queue */
        if (bp == tail) {
                /* remove head buffer of full-page pageout
                 * from log device write queue
                 */
                if (bp->l_flag & lbmRELEASE) {
                        log->wqueue = NULL;
                        bp->l_wqnext = NULL;
                }
        }
        /* multi element queue */
        else {
                /* remove head buffer of full-page pageout
                 * from log device write queue
                 */
                if (bp->l_flag & lbmRELEASE) {
                        /* the buffer after bp becomes the new head */
                        nextbp = tail->l_wqnext = bp->l_wqnext;
                        bp->l_wqnext = NULL;

                        /*
                         * redrive pageout of next page at head of write queue:
                         * redrive next page without any bound tblk
                         * (i.e., page w/o any COMMIT records), or
                         * first page of new group commit which has been
                         * queued after current page (subsequent pageout
                         * is performed synchronously, except page without
                         * any COMMITs) by lmGroupCommit() as indicated
                         * by lbmWRITE flag;
                         */
                        if (nextbp->l_flag & lbmWRITE) {
                                /*
                                 * We can't do the I/O at interrupt time.
                                 * The jfsIO thread can do it
                                 */
                                lbmRedrive(nextbp);
                        }
                }
        }

        /*
         *        synchronous pageout:
         *
         * buffer has not necessarily been removed from write queue
         * (e.g., synchronous write of partial-page with COMMIT):
         * leave buffer for i/o initiator to dispose
         */
        if (bp->l_flag & lbmSYNC) {
                LCACHE_UNLOCK(flags);        /* unlock+enable */

                /* wakeup I/O initiator */
                LCACHE_WAKEUP(&bp->l_ioevent);
        }

        /*
         *        Group Commit pageout:
         */
        else if (bp->l_flag & lbmGC) {
                /* lmPostGC() is called with LCACHE_LOCK released */
                LCACHE_UNLOCK(flags);
                lmPostGC(bp);
        }

        /*
         *        asynchronous pageout:
         *
         * buffer must have been removed from write queue:
         * insert buffer at head of freelist where it can be recycled
         */
        else {
                assert(bp->l_flag & lbmRELEASE);
                assert(bp->l_flag & lbmFREE);
                lbmfree(bp);

                LCACHE_UNLOCK(flags);        /* unlock+enable */
        }
}

/*
 *        jfsIOWait()
 *
 * Thread function that starts the log I/O queued by lbmRedrive().
 *
 * Each pass drains log_redrive_list, calling lbmStartIO() on every
 * buffer with log_redrive_lock dropped, then either freezes or sleeps
 * until woken.  The loop ends when kthread_should_stop() is true.
 *
 * PARAMETER:
 *        arg        - unused
 *
 * RETURN:        0
 */
int jfsIOWait(void *arg)
{
        struct lbuf *bp;

        do {
                spin_lock_irq(&log_redrive_lock);
                while ((bp = log_redrive_list)) {
                        /* unlink the head, then submit without the lock */
                        log_redrive_list = bp->l_redrive_next;
                        bp->l_redrive_next = NULL;
                        spin_unlock_irq(&log_redrive_lock);
                        lbmStartIO(bp);
                        spin_lock_irq(&log_redrive_lock);
                }

                if (freezing(current)) {
                        spin_unlock_irq(&log_redrive_lock);
                        try_to_freeze();
                } else {
                        /*
                         * The task state is changed while the lock is still
                         * held, i.e. before lbmRedrive() can queue another
                         * buffer and issue its wakeup.
                         */
                        set_current_state(TASK_INTERRUPTIBLE);
                        spin_unlock_irq(&log_redrive_lock);
                        schedule();
                }
        } while (!kthread_should_stop());

        jfs_info("jfsIOWait being killed!");
        return 0;
}

/*
 * NAME:        lmLogFormat()/jfs_logform()
 *
 * FUNCTION:        format file system log
 *
 * PARAMETERS:
 *        log        - volume log
 *        logAddress - start address of log space in FS block
 *        logSize        - length of log space in FS block;
 *
 * RETURN:        0        - success
 *                -EIO        - i/o error
 *
 * NOTES:        A single log buffer is allocated and reused for every page:
 *                it is filled in, written with lbmStartIO() as a synchronous
 *                direct write, and waited on with lbmIOWait() before the
 *                next page is prepared.  The buffer is released on exit.
 *
 * XXX: We're synchronously writing one page at a time.  This needs to
 *        be improved by writing multiple pages at once.
 */
int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
{
        /* the initial value is overwritten before rc is ever read */
        int rc = -EIO;
        struct jfs_sb_info *sbi;
        struct logsuper *logsuper;
        struct logpage *lp;
        int lspn;                /* log sequence page number */
        struct lrd *lrd_ptr;
        int npages = 0;
        struct lbuf *bp;

        jfs_info("lmLogFormat: logAddress:%Ld logSize:%d",
                 (long long)logAddress, logSize);

        /* use the first superblock info on the log's sb_list */
        sbi = list_entry(log->sb_list.next, struct jfs_sb_info, log_list);

        /* allocate a log buffer */
        bp = lbmAllocate(log, 1);

        /* convert the log size from FS blocks to pages */
        npages = logSize >> sbi->l2nbperpage;

        /*
         *        log space:
         *
         * page 0 - reserved;
         * page 1 - log superblock;
         * page 2 - log data page: A SYNC log record is written
         *            into this page at logform time;
         * pages 3-N - log data page: set to empty log data pages;
         */
        /*
         *        init log superblock: log page 1
         */
        logsuper = (struct logsuper *) bp->l_ldata;

        logsuper->magic = cpu_to_le32(LOGMAGIC);
        logsuper->version = cpu_to_le32(LOGVERSION);
        logsuper->state = cpu_to_le32(LOGREDONE);
        logsuper->flag = cpu_to_le32(sbi->mntflag);        /* ? */
        logsuper->size = cpu_to_le32(npages);
        logsuper->bsize = cpu_to_le32(sbi->bsize);
        logsuper->l2bsize = cpu_to_le32(sbi->l2bsize);
        /* log end: just past the SYNCPT record written into page 2 below */
        logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE);

        bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
        /* page 1 starts one page (nbperpage blocks) into the log space */
        bp->l_blkno = logAddress + sbi->nbperpage;
        lbmStartIO(bp);
        if ((rc = lbmIOWait(bp, 0)))
                goto exit;

        /*
         *        init pages 2 to npages-1 as log data pages:
         *
         * log sequence page number (lspn) initialization:
         *
         * pn:   0     1     2     3                 n-1
         *       +-----+-----+=====+=====+===.....===+=====+
         * lspn:             N-1   0     1           N-2
         *                   <--- N page circular file ---->
         *
         * the N (= npages-2) data pages of the log are maintained as
         * a circular file for the log records;
         * lspn grows by 1 monotonically as each log page is written
         * to the circular file of the log;
         * and setLogpage() will not reset the page number even if
         * the eor is equal to LOGPHDRSIZE. In order for binary search
         * to still work in the find log end process, we have to simulate
         * the log wrap situation at the log format time.
         * The 1st log page written will have the highest lspn. Then
         * the succeeding log pages will have ascending order of
         * the lspn starting from 0, ... (N-2)
         */
        lp = (struct logpage *) bp->l_ldata;
        /*
         * initialize 1st log page to be written: lspn = N - 1;
         * a SYNCPT log record is written to this page
         */
        lp->h.page = lp->t.page = cpu_to_le32(npages - 3);
        lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE);

        lrd_ptr = (struct lrd *) &lp->data;
        lrd_ptr->logtid = 0;
        lrd_ptr->backchain = 0;
        lrd_ptr->type = cpu_to_le16(LOG_SYNCPT);
        lrd_ptr->length = 0;
        lrd_ptr->log.syncpt.sync = 0;

        /* advance to the next log page on disk */
        bp->l_blkno += sbi->nbperpage;
        bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
        lbmStartIO(bp);
        if ((rc = lbmIOWait(bp, 0)))
                goto exit;

        /*
         *        initialize succeeding log pages: lspn = 0, 1, ..., (N-2)
         */
        for (lspn = 0; lspn < npages - 3; lspn++) {
                lp->h.page = lp->t.page = cpu_to_le32(lspn);
                /* empty page: end of record is right after the page header */
                lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);

                bp->l_blkno += sbi->nbperpage;
                bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
                lbmStartIO(bp);
                if ((rc = lbmIOWait(bp, 0)))
                        goto exit;
        }

        rc = 0;
exit:
        /*
         *        finalize log
         */
        /* release the buffer */
        lbmFree(bp);

        return rc;
}

#ifdef CONFIG_JFS_STATISTICS
/*
 * seq_file show routine for the JFS log manager statistics: prints a
 * two-line banner followed by one line per lmStat counter.
 *
 * RETURN:        0
 */
int jfs_lmstats_proc_show(struct seq_file *m, void *v)
{
        /* banner */
        seq_puts(m,
                 "JFS Logmgr stats\n"
                 "================\n");

        /* one counter per line, in the order they are reported */
        seq_printf(m, "commits = %d\n", lmStat.commit);
        seq_printf(m, "writes submitted = %d\n", lmStat.submitted);
        seq_printf(m, "writes completed = %d\n", lmStat.pagedone);
        seq_printf(m, "full pages submitted = %d\n", lmStat.full_page);
        seq_printf(m, "partial pages submitted = %d\n", lmStat.partial_page);

        return 0;
}
#endif /* CONFIG_JFS_STATISTICS */




























































































































































































































































































































































    4 




    4 


































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
// SPDX-License-Identifier: GPL-2.0
/*
 * shstk.c - Intel shadow stack support
 *
 * Copyright (c) 2021, Intel Corporation.
 * Yu-cheng Yu <yu-cheng.yu@intel.com>
 */

#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/sched/signal.h>
#include <linux/compat.h>
#include <linux/sizes.h>
#include <linux/user.h>
#include <linux/syscalls.h>
#include <asm/msr.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/types.h>
#include <asm/shstk.h>
#include <asm/special_insns.h>
#include <asm/fpu/api.h>
#include <asm/prctl.h>

#define SS_FRAME_SIZE 8

/* Return true if any bit of @features is set in current's thread features. */
static bool features_enabled(unsigned long features)
{
        return (current->thread.features & features) != 0;
}

/* Set the bits of @features in current's thread features. */
static void features_set(unsigned long features)
{
        current->thread.features = current->thread.features | features;
}

/* Clear the bits of @features in current's thread features. */
static void features_clr(unsigned long features)
{
        current->thread.features = current->thread.features & ~features;
}

/*
 * Create a restore token on the shadow stack.  A token is always 8-byte
 * and aligned to 8.
 */
static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
{
        unsigned long addr;

        /* Token must be aligned */
        if (!IS_ALIGNED(ssp, 8))
                return -EINVAL;

        addr = ssp - SS_FRAME_SIZE;

        /*
         * SSP is aligned, so reserved bits and mode bit are a zero, just mark
         * the token 64-bit.
         */
        ssp |= BIT(0);

        if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
                return -EFAULT;

        if (token_addr)
                *token_addr = addr;

        return 0;
}

/*
 * Map a new shadow stack of @size bytes, at @addr if it is non-zero
 * (MAP_FIXED_NOREPLACE), and optionally write a restore token at
 * @token_offset from the start of the mapping when @set_res_tok is true.
 *
 * Returns the mapped address, the error value from do_mmap(), or -EINVAL
 * (after unmapping again) if the restore token could not be created.
 *
 * VM_SHADOW_STACK will have a guard page. This helps userspace protect
 * itself from attacks. The reasoning is as follows:
 *
 * The shadow stack pointer (SSP) is moved by CALL, RET, and INCSSPQ. The
 * INCSSP instruction can increment the shadow stack pointer. It is the
 * shadow stack analog of an instruction like:
 *
 *   addq $0x80, %rsp
 *
 * However, there is one important difference between an ADD on %rsp
 * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
 * memory of the first and last elements that were "popped". It can be
 * thought of as acting like this:
 *
 * READ_ONCE(ssp);       // read+discard top element on stack
 * ssp += nr_to_pop * 8; // move the shadow stack
 * READ_ONCE(ssp-8);     // read+discard last popped stack element
 *
 * The maximum distance INCSSP can move the SSP is 2040 bytes, before
 * it would read the memory. Therefore a single page gap will be enough
 * to prevent any operation from shifting the SSP to an adjacent stack,
 * since it would have to land in the gap at least once, causing a
 * fault.
 */
static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
                                 unsigned long token_offset, bool set_res_tok)
{
        struct mm_struct *mm = current->mm;
        int map_flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
        unsigned long base, populate;

        /* a caller-chosen address must not replace an existing mapping */
        if (addr)
                map_flags |= MAP_FIXED_NOREPLACE;

        mmap_write_lock(mm);
        base = do_mmap(NULL, addr, size, PROT_READ, map_flags,
                       VM_SHADOW_STACK | VM_WRITE, 0, &populate, NULL);
        mmap_write_unlock(mm);

        /* nothing more to do on failure or when no token was requested */
        if (IS_ERR_VALUE(base) || !set_res_tok)
                return base;

        if (create_rstor_token(base + token_offset, NULL)) {
                vm_munmap(base, size);
                return -EINVAL;
        }

        return base;
}

/*
 * Pick the shadow stack size: a non-zero @size is page-aligned and used
 * as is; zero selects the smaller of RLIMIT_STACK and 4 GiB, page-aligned.
 */
static unsigned long adjust_shstk_size(unsigned long size)
{
        unsigned long long dflt;

        if (!size) {
                dflt = min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G);
                return PAGE_ALIGN(dflt);
        }

        return PAGE_ALIGN(size);
}

/*
 * Unmap the shadow stack at [@base, @base + @size).  An -EINTR result is
 * ignored; any other failure triggers a one-time warning.
 */
static void unmap_shadow_stack(u64 base, u64 size)
{
        int ret = vm_munmap(base, size);

        /*
         * -EINTR means mmap_write_lock_killable() failed: the process is
         * about to die and have its MM cleaned up. This task shouldn't
         * ever make it back to userspace, so leaking a shadow stack is
         * fine here.
         *
         * Any other vm_munmap() failure means either the system is out
         * of memory or there is a bug.
         */
        if (ret != -EINTR)
                WARN_ON_ONCE(ret);
}

/*
 * Enable user shadow stack for the current task: allocate a default-sized
 * shadow stack, point MSR_IA32_PL3_SSP at its top, set CET_SHSTK_EN in
 * MSR_IA32_U_CET, and record the mapping in current->thread.shstk.
 *
 * Returns 0 on success or if shadow stack is already enabled,
 * -EOPNOTSUPP without CPU support or from a 32-bit syscall, or the
 * allocation error.
 */
static int shstk_setup(void)
{
        struct thread_shstk *ts = &current->thread.shstk;
        unsigned long base, len;

        /* Already enabled */
        if (features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        /* Needs CPU support ... */
        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        /* ... and is not supported for 32 bit */
        if (in_ia32_syscall())
                return -EOPNOTSUPP;

        len = adjust_shstk_size(0);
        base = alloc_shstk(0, len, 0, false);
        if (IS_ERR_VALUE(base))
                return PTR_ERR((void *)base);

        /* SSP starts at the top of the new mapping */
        fpregs_lock_and_load();
        wrmsrl(MSR_IA32_PL3_SSP, base + len);
        wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
        fpregs_unlock();

        ts->base = base;
        ts->size = len;
        features_set(ARCH_SHSTK_SHSTK);

        return 0;
}

/*
 * Reset the current task's feature state: zero the shadow stack tracking
 * structure and clear both the enabled and the locked feature masks.
 */
void reset_thread_features(void)
{
        struct thread_shstk *ts = &current->thread.shstk;

        memset(ts, 0, sizeof(*ts));
        current->thread.features = 0;
        current->thread.features_locked = 0;
}

/*
 * Set up the shadow stack for a task being created by clone.
 *
 * @tsk:         the new task
 * @clone_flags: clone flags of the request
 * @stack_size:  requested stack size, passed to adjust_shstk_size()
 *
 * Returns the initial SSP (top of the newly mapped shadow stack) when a
 * new stack is allocated, 0 when none is needed, or the error value from
 * alloc_shstk().
 */
unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
                                       unsigned long stack_size)
{
        struct thread_shstk *child = &tsk->thread.shstk;
        unsigned long base, len;

        /*
         * Shadow stack is not enabled, so there is nothing to switch the
         * new thread to.
         */
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        /*
         * A CLONE_VFORK child shares the parent's shadow stack. Clear the
         * child's tracking so that the freeing logic run for the child
         * leaves the parent's stack alone.
         */
        if (clone_flags & CLONE_VFORK) {
                child->base = 0;
                child->size = 0;
                return 0;
        }

        /*
         * Without CLONE_VM the child uses a copy of the parent's shadow
         * stack.
         */
        if (!(clone_flags & CLONE_VM))
                return 0;

        len = adjust_shstk_size(stack_size);
        base = alloc_shstk(0, len, 0, false);
        if (IS_ERR_VALUE(base))
                return base;

        child->base = base;
        child->size = len;

        return base + len;
}

/*
 * Read the user shadow stack pointer from MSR_IA32_PL3_SSP.  The read is
 * done between fpregs_lock_and_load() and fpregs_unlock().
 */
static unsigned long get_user_shstk_addr(void)
{
        unsigned long long val;

        fpregs_lock_and_load();
        rdmsrl(MSR_IA32_PL3_SSP, val);
        fpregs_unlock();

        return val;
}

/* High bit used to tag kernel-written data words on the shadow stack. */
#define SHSTK_DATA_BIT BIT(63)

/*
 * Write @data, tagged with SHSTK_DATA_BIT, to the user shadow stack at
 * @addr.
 *
 * Returns 0 on success, -EINVAL (with a one-time warning) if @data
 * already has the tag bit set, -EFAULT if the write fails.
 */
static int put_shstk_data(u64 __user *addr, u64 data)
{
        u64 tagged;

        if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
                return -EINVAL;

        /*
         * Setting the high bit ensures the sigframe word can't be
         * processed as a return address.
         */
        tagged = data | SHSTK_DATA_BIT;
        if (write_user_shstk_64(addr, tagged))
                return -EFAULT;

        return 0;
}

/*
 * Read a tagged data word from user memory at @addr and store it, with
 * the tag bit stripped, in *@data.
 *
 * Returns 0 on success, -EFAULT if the read fails, -EINVAL if the word
 * does not carry SHSTK_DATA_BIT (*@data is left untouched in both error
 * cases).
 */
static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
{
        unsigned long raw;

        if (unlikely(get_user(raw, addr)))
                return -EFAULT;

        if (raw & SHSTK_DATA_BIT) {
                *data = raw & ~SHSTK_DATA_BIT;
                return 0;
        }

        return -EINVAL;
}

/*
 * Push a signal frame marker onto the shadow stack: *@ssp is moved down
 * by one frame and the previous SSP value is stored there as tagged data.
 *
 * Returns 0 on success, -EINVAL if *@ssp is not 8-byte aligned, -EFAULT
 * if the marker could not be written (*@ssp has already been moved down
 * in that case).
 */
static int shstk_push_sigframe(unsigned long *ssp)
{
        unsigned long prev_ssp = *ssp;

        /* Token must be aligned */
        if (!IS_ALIGNED(prev_ssp, 8))
                return -EINVAL;

        *ssp = prev_ssp - SS_FRAME_SIZE;

        if (put_shstk_data((void __user *)*ssp, prev_ssp))
                return -EFAULT;

        return 0;
}

/*
 * Pop the signal frame marker written by shstk_push_sigframe(): read the
 * tagged word at *@ssp and, if it is a valid userspace SSP value, store
 * it in *@ssp.
 *
 * Returns 0 on success; -EINVAL if *@ssp or the stored value is
 * misaligned, the word is not tagged as data, or the stored value is not
 * a userspace address; -EFAULT if the word cannot be read or the address
 * is not in a shadow stack VMA; or the error from
 * mmap_read_lock_killable() if taking the mmap lock was interrupted.
 */
static int shstk_pop_sigframe(unsigned long *ssp)
{
        struct vm_area_struct *vma;
        unsigned long token_addr;
        bool need_to_check_vma;
        int err;

        /*
         * It is possible for the SSP to be off the end of a shadow stack by 4
         * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
         * before it, it might be this case, so check that the address being
         * read is actually shadow stack.
         */
        if (!IS_ALIGNED(*ssp, 8))
                return -EINVAL;

        need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;

        if (need_to_check_vma) {
                /*
                 * The killable lock can fail. Bail out here instead of
                 * carrying on unlocked and later releasing a lock that
                 * was never taken.
                 */
                err = mmap_read_lock_killable(current->mm);
                if (err)
                        return err;
        }

        err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
        if (unlikely(err))
                goto out_err;

        if (need_to_check_vma) {
                vma = find_vma(current->mm, *ssp);
                if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) {
                        err = -EFAULT;
                        goto out_err;
                }

                mmap_read_unlock(current->mm);
        }

        /* Restore SSP aligned? */
        if (unlikely(!IS_ALIGNED(token_addr, 8)))
                return -EINVAL;

        /* SSP in userspace? */
        if (unlikely(token_addr >= TASK_SIZE_MAX))
                return -EINVAL;

        *ssp = token_addr;

        return 0;
out_err:
        /* the lock is only held when the VMA check was requested */
        if (need_to_check_vma)
                mmap_read_unlock(current->mm);
        return err;
}

int setup_signal_shadow_stack(struct ksignal *ksig)
{
        void __user *restorer = ksig->ka.sa.sa_restorer;
        unsigned long ssp;
        int err;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
            !features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        if (!restorer)
                return -EINVAL;

        ssp = get_user_shstk_addr();
        if (unlikely(!ssp))
                return -EINVAL;

        err = shstk_push_sigframe(&ssp);
        if (unlikely(err))
                return err;

        /* Push restorer address */
        ssp -= SS_FRAME_SIZE;
        err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
        if (unlikely(err))
                return -EFAULT;

        fpregs_lock_and_load();
        wrmsrl(MSR_IA32_PL3_SSP, ssp);
        fpregs_unlock();

        return 0;
}

/*
 * Undo setup_signal_shadow_stack() on signal return: pop the sigframe
 * marker at the current SSP and load the SSP value it holds into
 * MSR_IA32_PL3_SSP.
 *
 * Returns 0 on success or when shadow stack is not in use, -EINVAL if
 * the SSP is zero, or the error from shstk_pop_sigframe().
 */
int restore_signal_shadow_stack(void)
{
        unsigned long cur_ssp;
        int ret;

        /* nothing to do unless shadow stack is supported and enabled */
        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return 0;
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return 0;

        cur_ssp = get_user_shstk_addr();
        if (unlikely(!cur_ssp))
                return -EINVAL;

        ret = shstk_pop_sigframe(&cur_ssp);
        if (unlikely(ret))
                return ret;

        fpregs_lock_and_load();
        wrmsrl(MSR_IA32_PL3_SSP, cur_ssp);
        fpregs_unlock();

        return 0;
}

/*
 * Unmap the thread shadow stack tracked for @tsk, if there is one and it
 * lives in the current mm.  Afterwards the tracked size is zeroed so a
 * second attempt is caught by the warning below.
 */
void shstk_free(struct task_struct *tsk)
{
        struct thread_shstk *ts = &tsk->thread.shstk;

        /* nothing to do unless shadow stack is supported and enabled */
        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return;
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return;

        /*
         * When fork() with CLONE_VM fails, the child (tsk) already has a
         * shadow stack allocated, and exit_thread() calls this function to
         * free it.  In this case the parent (current) and the child share
         * the same mm struct.
         */
        if (!tsk->mm || tsk->mm != current->mm)
                return;

        /*
         * A zero base means this task is not managing its own shadow
         * stack (CLONE_VFORK), so there is nothing to free.
         */
        if (!ts->base)
                return;

        /*
         * A zero base is normal for CLONE_VFORK children, but a zero size
         * with a non-zero base is not: it indicates an attempt to free
         * the thread shadow stack twice. Warn about it.
         */
        if (WARN_ON(!ts->size))
                return;

        unmap_shadow_stack(ts->base, ts->size);

        ts->size = 0;
}

/*
 * Turn the WRSS feature on or off: update the ARCH_SHSTK_WRSS feature bit
 * and the CET_WRSS_EN bit of MSR_IA32_U_CET.
 *
 * Return: 0 on success or when the feature is already in the requested
 * state, -EOPNOTSUPP without X86_FEATURE_USER_SHSTK, -EPERM when shadow
 * stack itself is not enabled.
 */
static int wrss_control(bool enable)
{
        u64 cet;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        /*
         * WRSS can only be enabled on top of an enabled shadow stack.
         * Without shadow stack WRSS is already off, so a disable request
         * is not bothered with either.
         */
        if (!features_enabled(ARCH_SHSTK_SHSTK))
                return -EPERM;

        /* Nothing to do when already in the requested state */
        if (features_enabled(ARCH_SHSTK_WRSS) == enable)
                return 0;

        fpregs_lock_and_load();
        rdmsrl(MSR_IA32_U_CET, cet);

        if (enable) {
                features_set(ARCH_SHSTK_WRSS);
                wrmsrl(MSR_IA32_U_CET, cet | CET_WRSS_EN);
        } else {
                features_clr(ARCH_SHSTK_WRSS);
                /* Skip the MSR write when the enable bit is already clear */
                if (cet & CET_WRSS_EN)
                        wrmsrl(MSR_IA32_U_CET, cet & ~CET_WRSS_EN);
        }

        fpregs_unlock();

        return 0;
}

/*
 * Disable user shadow stack for current: zero MSR_IA32_U_CET and
 * MSR_IA32_PL3_SSP, free the thread shadow stack and clear both the
 * ARCH_SHSTK_SHSTK and ARCH_SHSTK_WRSS feature bits.
 *
 * Return: 0, also when shadow stack was already disabled; -EOPNOTSUPP
 * without X86_FEATURE_USER_SHSTK.
 */
static int shstk_disable(void)
{
        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        if (features_enabled(ARCH_SHSTK_SHSTK)) {
                fpregs_lock_and_load();
                /* Writing 0 to U_CET turns WRSS off together with shadow stack */
                wrmsrl(MSR_IA32_U_CET, 0);
                wrmsrl(MSR_IA32_PL3_SSP, 0);
                fpregs_unlock();

                shstk_free(current);
                features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);
        }

        return 0;
}

/*
 * map_shadow_stack() - validate the request and allocate a shadow stack
 * through alloc_shstk().
 *
 * Errors produced here: -EOPNOTSUPP without X86_FEATURE_USER_SHSTK,
 * -EINVAL for flags other than SHADOW_STACK_SET_TOKEN, -ENOSPC when a
 * token is requested but @size is below 8, -ERANGE for a non-zero @addr
 * below 4G, -EOVERFLOW when page-aligning @size wraps.
 */
SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
{
        unsigned long aligned_size;
        bool set_tok;

        if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
                return -EOPNOTSUPP;

        if (flags & ~SHADOW_STACK_SET_TOKEN)
                return -EINVAL;

        set_tok = (flags & SHADOW_STACK_SET_TOKEN) != 0;

        /* A token needs 8 bytes of room */
        if (set_tok && size < 8)
                return -ENOSPC;

        if (addr != 0 && addr < SZ_4G)
                return -ERANGE;

        /*
         * If the alignment overflowed, the restore token would be written
         * to the wrong location.  That is not catastrophic, but block it
         * and return the right error code.
         */
        aligned_size = PAGE_ALIGN(size);
        if (aligned_size < size)
                return -EOVERFLOW;

        return alloc_shstk(addr, aligned_size, size, set_tok);
}

/*
 * Handle the ARCH_SHSTK_* options for @task; @arg2 is either a feature
 * mask or, for ARCH_SHSTK_STATUS, a user pointer receiving the feature
 * word.
 *
 * STATUS and LOCK are accepted for any @task.  For a task other than
 * current the only further option is UNLOCK, and only with
 * CONFIG_CHECKPOINT_RESTORE.  For current, locked features cannot be
 * changed (-EPERM), at most one feature bit may be given (-EINVAL), and
 * every option other than DISABLE is handled as ENABLE.
 */
long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
{
        unsigned long features = arg2;

        if (option == ARCH_SHSTK_STATUS)
                return put_user(task->thread.features, (unsigned long __user *)arg2);

        if (option == ARCH_SHSTK_LOCK) {
                task->thread.features_locked |= features;
                return 0;
        }

        /* Only allow via ptrace */
        if (task != current) {
                if (option != ARCH_SHSTK_UNLOCK ||
                    !IS_ENABLED(CONFIG_CHECKPOINT_RESTORE))
                        return -EINVAL;

                task->thread.features_locked &= ~features;
                return 0;
        }

        /* Locked features must not be changed */
        if (task->thread.features_locked & features)
                return -EPERM;

        /* Enabling/disabling is supported for one feature at a time only */
        if (hweight_long(features) > 1)
                return -EINVAL;

        if (option == ARCH_SHSTK_DISABLE) {
                if (features & ARCH_SHSTK_WRSS)
                        return wrss_control(false);
                return (features & ARCH_SHSTK_SHSTK) ? shstk_disable() : -EINVAL;
        }

        /* Handle ARCH_SHSTK_ENABLE */
        if (features & ARCH_SHSTK_SHSTK)
                return shstk_setup();
        return (features & ARCH_SHSTK_WRSS) ? wrss_control(true) : -EINVAL;
}









































    1 
    1 










































































































































































    9 


   10 
   10 
   11 

   12 






















    1 


    1 
    1 

    1 
    1 

    1 




    1 
    1 

    1 





























































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
// SPDX-License-Identifier: GPL-2.0
/*
 *        linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/pagewalk.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"

/*
 * Per-CPU batch of folios waiting for mlock/munlock processing.  Code
 * running on the owning CPU takes @lock around its use of @fbatch.
 */
struct mlock_fbatch {
        local_lock_t lock;
        struct folio_batch fbatch;
};

/* One batch per CPU; only the lock needs an explicit initializer. */
static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

/*
 * A task may mlock when its RLIMIT_MEMLOCK is non-zero or, failing that,
 * when it has CAP_IPC_LOCK.  The capability is only consulted when the
 * rlimit is zero.
 */
bool can_do_mlock(void)
{
        return rlimit(RLIMIT_MEMLOCK) != 0 || capable(CAP_IPC_LOCK);
}
EXPORT_SYMBOL(can_do_mlock);

/*
 * Mlocked folios are marked with the PG_mlocked flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked folio [folio_test_mlocked(folio)] is unevictable.  As such, it
 * will be ostensibly placed on the LRU "unevictable" list (actually no such
 * list exists), rather than the [in]active lists. PG_unevictable is set to
 * indicate the unevictable state.
 */

/*
 * Batch handler for a folio queued by mlock_folio().  If the folio's LRU
 * flag can be cleared here, mark the folio unevictable and maintain
 * folio->mlock_count, then set the LRU flag again.
 *
 * @lruvec is whatever the previous handler in the batch returned (NULL for
 * the first folio).  The value obtained from folio_lruvec_relock_irq() is
 * returned so that mlock_folio_batch() can pass it on and unlock it once
 * at the end.
 */
static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec)
{
        /* There is nothing more we can do while it's off LRU */
        if (!folio_test_clear_lru(folio))
                return lruvec;

        lruvec = folio_lruvec_relock_irq(folio, lruvec);

        if (unlikely(folio_evictable(folio))) {
                /*
                 * This is a little surprising, but quite possible: PG_mlocked
                 * must have got cleared already by another CPU.  Could this
                 * folio be unevictable?  I'm not sure, but move it now if so.
                 */
                if (folio_test_unevictable(folio)) {
                        lruvec_del_folio(lruvec, folio);
                        folio_clear_unevictable(folio);
                        lruvec_add_folio(lruvec, folio);

                        __count_vm_events(UNEVICTABLE_PGRESCUED,
                                          folio_nr_pages(folio));
                }
                goto out;
        }

        /* Already marked unevictable: just count one more mlock */
        if (folio_test_unevictable(folio)) {
                if (folio_test_mlocked(folio))
                        folio->mlock_count++;
                goto out;
        }

        /*
         * Not yet marked unevictable: take it off the lruvec, clear active,
         * set unevictable and add it back.  mlock_count starts at 1 if the
         * folio is still mlocked, else 0.
         */
        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        folio_set_unevictable(folio);
        folio->mlock_count = !!folio_test_mlocked(folio);
        lruvec_add_folio(lruvec, folio);
        __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
        /* Restore the LRU flag cleared at the top */
        folio_set_lru(folio);
        return lruvec;
}

/*
 * Batch handler for a folio queued by mlock_new_folio().  The folio must
 * not have its LRU flag set on entry (checked by VM_BUG_ON_FOLIO).  Unless
 * it turns out to be evictable, it is marked unevictable and its
 * mlock_count initialised; in both cases it is then added to the lruvec
 * and its LRU flag is set.
 *
 * Takes and returns the lruvec in the same way as __mlock_folio().
 */
static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec)
{
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        lruvec = folio_lruvec_relock_irq(folio, lruvec);

        /* As above, this is a little surprising, but possible */
        if (unlikely(folio_evictable(folio)))
                goto out;

        folio_set_unevictable(folio);
        /* 1 if the folio is still mlocked, else 0 */
        folio->mlock_count = !!folio_test_mlocked(folio);
        __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
        lruvec_add_folio(lruvec, folio);
        folio_set_lru(folio);
        return lruvec;
}

/*
 * Batch handler for a folio queued by munlock_folio().
 *
 * When the folio's LRU flag could be cleared ("isolated") and the folio is
 * unevictable, one mlock is dropped from folio->mlock_count and nothing
 * more happens while the count stays non-zero.  Otherwise PG_mlocked is
 * cleared, NR_MLOCK and the vm event counters are updated, and an isolated
 * folio that has become evictable loses its unevictable state.
 *
 * Takes and returns the lruvec in the same way as __mlock_folio().
 */
static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec)
{
        int nr_pages = folio_nr_pages(folio);
        bool isolated = false;

        /* Off LRU: mlock_count cannot be consulted, go straight to munlock */
        if (!folio_test_clear_lru(folio))
                goto munlock;

        isolated = true;
        lruvec = folio_lruvec_relock_irq(folio, lruvec);

        if (folio_test_unevictable(folio)) {
                /* Then mlock_count is maintained, but might undercount */
                if (folio->mlock_count)
                        folio->mlock_count--;
                if (folio->mlock_count)
                        goto out;
        }
        /* else assume that was the last mlock: reclaim will fix it if not */

munlock:
        if (folio_test_clear_mlocked(folio)) {
                __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
                /*
                 * Counted as stranded when the folio was not isolated and
                 * is still flagged unevictable.
                 */
                if (isolated || !folio_test_unevictable(folio))
                        __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
                else
                        __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
        }

        /* folio_evictable() has to be checked *after* clearing Mlocked */
        if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) {
                lruvec_del_folio(lruvec, folio);
                folio_clear_unevictable(folio);
                lruvec_add_folio(lruvec, folio);
                __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
        }
out:
        /* Only restore the LRU flag if this function cleared it */
        if (isolated)
                folio_set_lru(folio);
        return lruvec;
}

/*
 * Tags stored in the low bits of a struct folio pointer while it sits on
 * the mlock_fbatch: LRU_FOLIO marks an entry queued by mlock_folio(),
 * NEW_FOLIO one queued by mlock_new_folio(); an untagged entry is a
 * munlock request.
 */
#define LRU_FOLIO 0x1
#define NEW_FOLIO 0x2

/* Return @folio's pointer value with the LRU_FOLIO tag added. */
static inline struct folio *mlock_lru(struct folio *folio)
{
        unsigned long tagged = (unsigned long)folio;

        tagged += LRU_FOLIO;
        return (struct folio *)tagged;
}

/* Return @folio's pointer value with the NEW_FOLIO tag added. */
static inline struct folio *mlock_new(struct folio *folio)
{
        unsigned long tagged = (unsigned long)folio;

        tagged += NEW_FOLIO;
        return (struct folio *)tagged;
}

/*
 * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can
 * make use of such folio pointer flags in future, but for now just keep it for
 * mlock.  We could use three separate folio batches instead, but one feels
 * better (munlocking a full folio batch does not need to drain mlocking folio
 * batches first).
 */
/*
 * Process every entry of @fbatch: strip the tag from the low pointer bits,
 * store the clean pointer back, and dispatch on the tag to __mlock_folio()
 * (LRU_FOLIO), __mlock_new_folio() (NEW_FOLIO) or __munlock_folio() (no
 * tag).  The lruvec handed from one handler to the next is unlocked once
 * at the end, and folios_put() is then called on the batch.
 */
static void mlock_folio_batch(struct folio_batch *fbatch)
{
        struct lruvec *lruvec = NULL;
        int i;

        for (i = 0; i < folio_batch_count(fbatch); i++) {
                unsigned long entry = (unsigned long)fbatch->folios[i];
                unsigned long tag = entry & (LRU_FOLIO | NEW_FOLIO);
                struct folio *folio = (struct folio *)(entry - tag);

                /* folios_put() below must see untagged pointers */
                fbatch->folios[i] = folio;

                if (tag & LRU_FOLIO)
                        lruvec = __mlock_folio(folio, lruvec);
                else if (tag & NEW_FOLIO)
                        lruvec = __mlock_new_folio(folio, lruvec);
                else
                        lruvec = __munlock_folio(folio, lruvec);
        }

        if (lruvec)
                unlock_page_lruvec_irq(lruvec);
        folios_put(fbatch);
}

/* Process any folios pending in this CPU's mlock batch, under its local lock. */
void mlock_drain_local(void)
{
        struct folio_batch *pending;

        local_lock(&mlock_fbatch.lock);
        pending = this_cpu_ptr(&mlock_fbatch.fbatch);
        if (folio_batch_count(pending) != 0)
                mlock_folio_batch(pending);
        local_unlock(&mlock_fbatch.lock);
}

/*
 * Process any folios pending in @cpu's mlock batch.  No local lock is
 * taken here; a one-time warning fires if @cpu is still online.
 */
void mlock_drain_remote(int cpu)
{
        struct folio_batch *pending;

        WARN_ON_ONCE(cpu_online(cpu));
        pending = &per_cpu(mlock_fbatch.fbatch, cpu);
        if (folio_batch_count(pending) != 0)
                mlock_folio_batch(pending);
}

/* Report whether @cpu's mlock batch currently holds any folios. */
bool need_mlock_drain(int cpu)
{
        struct folio_batch *pending = &per_cpu(mlock_fbatch.fbatch, cpu);

        return folio_batch_count(pending) != 0;
}

/**
 * mlock_folio - mlock a folio already on (or temporarily off) LRU
 * @folio: folio to be mlocked.
 *
 * Sets the mlocked flag and, if it was not set before, accounts the folio
 * in NR_MLOCK and UNEVICTABLE_PGMLOCKED.  The folio is then queued, with
 * an extra reference and the LRU_FOLIO tag, on this CPU's mlock batch.
 * The batch is processed right away when folio_batch_add() returns 0, when
 * the folio is large, or when the LRU cache is disabled.
 */
void mlock_folio(struct folio *folio)
{
        struct folio_batch *batch;

        local_lock(&mlock_fbatch.lock);
        batch = this_cpu_ptr(&mlock_fbatch.fbatch);

        if (!folio_test_set_mlocked(folio)) {
                int nr = folio_nr_pages(folio);

                zone_stat_mod_folio(folio, NR_MLOCK, nr);
                __count_vm_events(UNEVICTABLE_PGMLOCKED, nr);
        }

        /* Reference held while the folio sits on the batch */
        folio_get(folio);
        if (!folio_batch_add(batch, mlock_lru(folio)) ||
            folio_test_large(folio) ||
            lru_cache_disabled())
                mlock_folio_batch(batch);

        local_unlock(&mlock_fbatch.lock);
}

/**
 * mlock_new_folio - mlock a newly allocated folio not yet on LRU
 * @folio: folio to be mlocked, either normal or a THP head.
 *
 * Sets the mlocked flag unconditionally, accounts the folio in NR_MLOCK
 * and UNEVICTABLE_PGMLOCKED, and queues it, with an extra reference and
 * the NEW_FOLIO tag, on this CPU's mlock batch.  The batch is processed
 * right away when folio_batch_add() returns 0, when the folio is large,
 * or when the LRU cache is disabled.
 */
void mlock_new_folio(struct folio *folio)
{
        int nr = folio_nr_pages(folio);
        struct folio_batch *batch;

        local_lock(&mlock_fbatch.lock);
        batch = this_cpu_ptr(&mlock_fbatch.fbatch);
        folio_set_mlocked(folio);

        zone_stat_mod_folio(folio, NR_MLOCK, nr);
        __count_vm_events(UNEVICTABLE_PGMLOCKED, nr);

        /* Reference held while the folio sits on the batch */
        folio_get(folio);
        if (!folio_batch_add(batch, mlock_new(folio)) ||
            folio_test_large(folio) ||
            lru_cache_disabled())
                mlock_folio_batch(batch);

        local_unlock(&mlock_fbatch.lock);
}

/**
 * munlock_folio - munlock a folio
 * @folio: folio to be munlocked, either normal or a THP head.
 *
 * Queues the folio, untagged and with an extra reference, on this CPU's
 * mlock batch.  The batch is processed right away when folio_batch_add()
 * returns 0, when the folio is large, or when the LRU cache is disabled.
 */
void munlock_folio(struct folio *folio)
{
        struct folio_batch *batch;

        local_lock(&mlock_fbatch.lock);
        batch = this_cpu_ptr(&mlock_fbatch.fbatch);

        /*
         * The mlocked flag is deliberately not cleared here: that is left
         * to __munlock_folio(), which will check whether the folio is
         * multiply mlocked.
         */
        folio_get(folio);
        if (!folio_batch_add(batch, folio) ||
            folio_test_large(folio) ||
            lru_cache_disabled())
                mlock_folio_batch(batch);

        local_unlock(&mlock_fbatch.lock);
}

/*
 * Count how many successive ptes, starting at @pte (which maps @addr) and
 * limited to the range ending at @end, are present and map a page of
 * @folio.  A small folio always yields 1.
 */
static inline unsigned int folio_mlock_step(struct folio *folio,
                pte_t *pte, unsigned long addr, unsigned long end)
{
        unsigned int nr = folio_nr_pages(folio);
        unsigned long first_pfn = folio_pfn(folio);
        pte_t ptent = ptep_get(pte);
        unsigned int limit, n;

        if (!folio_test_large(folio))
                return 1;

        /* Pages of the folio from this pte's pfn onwards, capped by the range */
        limit = first_pfn + nr - pte_pfn(ptent);
        limit = min_t(unsigned int, limit, (end - addr) >> PAGE_SHIFT);

        for (n = 0; n < limit; n++, pte++) {
                pte_t entry = ptep_get(pte);

                /* Stop at a non-present pte or a pfn outside the folio */
                if (!pte_present(entry) || pte_pfn(entry) - first_pfn >= nr)
                        break;
        }

        return n;
}

/*
 * Decide whether @folio, found while walking [start, end) of @vma with
 * @step of its pages mapped consecutively, may be mlocked or munlocked.
 */
static inline bool allow_mlock_munlock(struct folio *folio,
                struct vm_area_struct *vma, unsigned long start,
                unsigned long end, unsigned int step)
{
        /*
         * munlock is always allowed, even for a large folio that is only
         * partially mapped to the VMA: the folio may have been mlocked
         * before the VMA was split.
         *
         * Under memory pressure such a large folio can be split, and the
         * pages that are not in a VM_LOCKED VMA can then be reclaimed.
         */
        if (!(vma->vm_flags & VM_LOCKED))
                return true;

        /* folio_within_range() cannot take KSM, but any small folio is OK */
        if (!folio_test_large(folio))
                return true;

        /*
         * A large folio is mlocked only when it lies within [start, end)
         * and is fully mapped.
         */
        return folio_within_range(folio, vma, start, end) &&
               step == folio_nr_pages(folio);
}

/*
 * pmd_entry callback of the mlock page walk.  Every eligible folio mapped
 * in [addr, end) under @pmd is mlocked when the vma has VM_LOCKED set and
 * munlocked otherwise.
 *
 * When pmd_trans_huge_lock() returns a lock, the pmd is handled as one
 * folio; otherwise the ptes are walked under the pte lock.  Always returns
 * 0; sets walk->action to ACTION_AGAIN when pte_offset_map_lock() fails.
 */
static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
                           unsigned long end, struct mm_walk *walk)

{
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
        pte_t *start_pte, *pte;
        pte_t ptent;
        struct folio *folio;
        unsigned int step = 1;
        unsigned long start = addr;

        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                /* Non-present and huge-zero pmds are left alone */
                if (!pmd_present(*pmd))
                        goto out;
                if (is_huge_zero_pmd(*pmd))
                        goto out;
                folio = pmd_folio(*pmd);
                if (vma->vm_flags & VM_LOCKED)
                        mlock_folio(folio);
                else
                        munlock_folio(folio);
                goto out;
        }

        start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (!start_pte) {
                walk->action = ACTION_AGAIN;
                return 0;
        }

        for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
                ptent = ptep_get(pte);
                if (!pte_present(ptent))
                        continue;
                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;

                /* Number of ptes from here that map this folio */
                step = folio_mlock_step(folio, pte, addr, end);
                if (!allow_mlock_munlock(folio, vma, start, end, step))
                        goto next_entry;

                if (vma->vm_flags & VM_LOCKED)
                        mlock_folio(folio);
                else
                        munlock_folio(folio);

next_entry:
                /* Skip step - 1 entries; the loop increment adds the last one */
                pte += step - 1;
                addr += (step - 1) << PAGE_SHIFT;
        }
        pte_unmap(start_pte);
out:
        spin_unlock(ptl);
        cond_resched();
        return 0;
}

/*
 * mlock_vma_pages_range() - mlock any pages already in the range,
 *                           or munlock all pages in the range.
 * @vma - vma containing range to be mlock()ed or munlock()ed
 * @start - start address in @vma of the range
 * @end - end of range in @vma
 * @newflags - the new set of flags for @vma.
 *
 * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
 * called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
 */
static void mlock_vma_pages_range(struct vm_area_struct *vma,
        unsigned long start, unsigned long end, vm_flags_t newflags)
{
        /* Page walk that calls mlock_pte_range() for each pmd in the range */
        static const struct mm_walk_ops mlock_walk_ops = {
                .pmd_entry = mlock_pte_range,
                .walk_lock = PGWALK_WRLOCK_VERIFY,
        };

        /*
         * There is a slight chance that concurrent page migration,
         * or page reclaim finding a page of this now-VM_LOCKED vma,
         * will call mlock_vma_folio() and raise page's mlock_count:
         * double counting, leaving the page unevictable indefinitely.
         * Communicate this danger to mlock_vma_folio() with VM_IO,
         * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
         * mmap_lock is held in write mode here, so this weird
         * combination should not be visible to other mmap_lock users;
         * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED.
         */
        if (newflags & VM_LOCKED)
                newflags |= VM_IO;
        vma_start_write(vma);
        vm_flags_reset_once(vma, newflags);

        /*
         * NOTE(review): lru_add_drain() is called on both sides of the walk,
         * presumably to flush per-CPU LRU batches before and after the
         * folios are processed -- confirm against lru_add_drain().
         */
        lru_add_drain();
        walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
        lru_add_drain();

        /* The walk is done: store the flags again without VM_IO */
        if (newflags & VM_IO) {
                newflags &= ~VM_IO;
                vm_flags_reset_once(vma, newflags);
        }
}

/*
 * mlock_fixup  - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 *
 * On return *@prev holds the value of the local vma pointer: the vma
 * passed in when it was filtered out, otherwise whatever
 * vma_modify_flags() returned.
 *
 * Returns 0, or the error carried by vma_modify_flags().
 */
static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
               struct vm_area_struct **prev, unsigned long start,
               unsigned long end, vm_flags_t newflags)
{
        struct mm_struct *mm = vma->vm_mm;
        int nr_pages;
        int ret = 0;
        vm_flags_t oldflags = vma->vm_flags;

        if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
            is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
            vma_is_dax(vma) || vma_is_secretmem(vma))
                /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
                goto out;

        vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
        }

        /*
         * Keep track of amount of locked VM: subtract the range when
         * VM_LOCKED is being cleared, leave the count alone when the vma
         * was already VM_LOCKED, add the range otherwise.
         */
        nr_pages = (end - start) >> PAGE_SHIFT;
        if (!(newflags & VM_LOCKED))
                nr_pages = -nr_pages;
        else if (oldflags & VM_LOCKED)
                nr_pages = 0;
        mm->locked_vm += nr_pages;

        /*
         * vm_flags is protected by the mmap_lock held in write mode.
         * It's okay if try_to_unmap_one unmaps a page just after we
         * set VM_LOCKED, populate_vma_page_range will bring it back.
         */
        if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
                /* No work to do, and mlocking twice would be wrong */
                vma_start_write(vma);
                vm_flags_reset(vma, newflags);
        } else {
                mlock_vma_pages_range(vma, start, end, newflags);
        }
out:
        *prev = vma;
        return ret;
}

/*
 * Replace the VM_LOCKED_MASK bits of every vma in [start, start + len)
 * with @flags, calling mlock_fixup() on each vma for the part of it that
 * lies inside the range.  @start and @len must be page aligned.
 *
 * Returns 0 on success (including an empty range), -EINVAL when
 * start + len wraps, -ENOMEM when no vma is found at @start or the vmas
 * do not cover the range without a gap, or the error from mlock_fixup().
 */
static int apply_vma_lock_flags(unsigned long start, size_t len,
                                vm_flags_t flags)
{
        unsigned long nstart, end, tmp;
        struct vm_area_struct *vma, *prev;
        VMA_ITERATOR(vmi, current->mm, start);

        VM_BUG_ON(offset_in_page(start));
        VM_BUG_ON(len != PAGE_ALIGN(len));
        end = start + len;
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;
        vma = vma_iter_load(&vmi);
        if (!vma)
                return -ENOMEM;

        /* When @start lies inside the first vma, that vma itself is used as prev */
        prev = vma_prev(&vmi);
        if (start > vma->vm_start)
                prev = vma;

        nstart = start;
        /* tmp tracks where the next vma has to begin for the range to be gap-free */
        tmp = vma->vm_start;
        for_each_vma_range(vmi, vma, end) {
                int error;
                vm_flags_t newflags;

                if (vma->vm_start != tmp)
                        return -ENOMEM;

                newflags = vma->vm_flags & ~VM_LOCKED_MASK;
                newflags |= flags;
                /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
                tmp = vma->vm_end;
                if (tmp > end)
                        tmp = end;
                error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
                if (error)
                        return error;
                /* Continue from the iterator's end position after the fixup */
                tmp = vma_iter_end(&vmi);
                nstart = tmp;
        }

        /* The vmas ran out before reaching @end */
        if (tmp < end)
                return -ENOMEM;

        return 0;
}

/*
 * Walk the vmas of @mm that intersect [start, start + len) and add up how
 * much of that range lies in VM_LOCKED vmas.  Ranges locked with deferred
 * memory locking (mlock2() with MLOCK_ONFAULT) are counted as well.
 *
 * Return value: the number of already mlocked pages in the range.
 */
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
                unsigned long start, size_t len)
{
        struct vm_area_struct *vma;
        unsigned long bytes = 0;
        unsigned long end;
        VMA_ITERATOR(vmi, mm, start);

        /* Clamp instead of overflowing past ULONG_MAX */
        end = unlikely(ULONG_MAX - len < start) ? ULONG_MAX : start + len;

        for_each_vma_range(vmi, vma, end) {
                if (!(vma->vm_flags & VM_LOCKED))
                        continue;

                /* Discount the part of the vma that lies before @start */
                if (start > vma->vm_start)
                        bytes -= start - vma->vm_start;

                /* The range ends inside this vma: count up to @end and stop */
                if (end < vma->vm_end) {
                        bytes += end - vma->vm_start;
                        break;
                }

                bytes += vma->vm_end - vma->vm_start;
        }

        return bytes >> PAGE_SHIFT;
}

/*
 * Translate a get_user_pages() return value into the error mlock() has to
 * report under POSIX: -EFAULT becomes -ENOMEM and -ENOMEM becomes -EAGAIN.
 * Every other value is passed through unchanged.
 */
static int __mlock_posix_error_return(long retval)
{
        switch (retval) {
        case -EFAULT:
                return -ENOMEM;
        case -ENOMEM:
                return -EAGAIN;
        default:
                return retval;
        }
}

/*
 * Common implementation of mlock() and mlock2(): apply @flags to the
 * page-aligned range covering [start, start + len) and then populate it.
 *
 * Returns 0 on success, -EPERM when can_do_mlock() refuses, -EINTR when
 * taking mmap_lock is interrupted, -ENOMEM when the lock limit would be
 * exceeded without CAP_IPC_LOCK, the error from apply_vma_lock_flags(),
 * or the __mm_populate() error translated by __mlock_posix_error_return().
 */
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
        unsigned long lock_limit;
        unsigned long locked;
        int error;

        start = untagged_addr(start);

        if (!can_do_mlock())
                return -EPERM;

        /* Widen the request to whole pages */
        len = PAGE_ALIGN(offset_in_page(start) + len);
        start &= PAGE_MASK;

        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        locked = len >> PAGE_SHIFT;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        locked += current->mm->locked_vm;
        if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
                /*
                 * The requested range may intersect areas that are already
                 * mlocked.  Those pages are part of mm->locked_vm and must
                 * not be counted again as new mlock increment, so take them
                 * out of the total before the limit check.
                 */
                locked -= count_mm_mlocked_page_nr(current->mm, start, len);
        }

        /* check against resource limits */
        if (locked <= lock_limit || capable(CAP_IPC_LOCK))
                error = apply_vma_lock_flags(start, len, flags);
        else
                error = -ENOMEM;

        mmap_write_unlock(current->mm);
        if (error)
                return error;

        error = __mm_populate(start, len, 0);
        return error ? __mlock_posix_error_return(error) : 0;
}

/* mlock(): lock the range via do_mlock() with VM_LOCKED. */
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
        int error = do_mlock(start, len, VM_LOCKED);

        return error;
}

/*
 * mlock2(): like mlock(), with MLOCK_ONFAULT as the only accepted flag;
 * it adds VM_LOCKONFAULT to the vma flags.  Any other flag bit yields
 * -EINVAL.
 */
SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
        vm_flags_t vm_flags;

        if (flags & ~MLOCK_ONFAULT)
                return -EINVAL;

        vm_flags = (flags & MLOCK_ONFAULT) ? (VM_LOCKED | VM_LOCKONFAULT)
                                           : VM_LOCKED;

        return do_mlock(start, len, vm_flags);
}

/*
 * munlock(): clear the lock flags on the page-aligned range covering
 * [start, start + len).  Returns -EINTR when taking mmap_lock is
 * interrupted, otherwise the result of apply_vma_lock_flags().
 */
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
        int ret = -EINTR;

        start = untagged_addr(start);

        /* Widen the request to whole pages */
        len = PAGE_ALIGN(offset_in_page(start) + len);
        start &= PAGE_MASK;

        if (!mmap_write_lock_killable(current->mm)) {
                ret = apply_vma_lock_flags(start, len, 0);
                mmap_write_unlock(current->mm);
        }

        return ret;
}

/*
 * Translate the MCL_* flags given to mlockall() (or 0 when called from
 * munlockall()) into changes to mm->def_flags and/or to the flags of all
 * current VMAs.
 *
 * Note that repeated mlockall() calls with different flags do not
 * necessarily stack: mm->def_flags is cleared of VM_LOCKED and
 * VM_LOCKONFAULT on every call, so a call without MCL_FUTURE undoes an
 * earlier one that had it.
 *
 * Always returns 0; errors from mlock_fixup() are ignored.
 */
static int apply_mlockall_flags(int flags)
{
        VMA_ITERATOR(vmi, current->mm, 0);
        struct vm_area_struct *vma, *prev = NULL;
        vm_flags_t to_add = 0;

        current->mm->def_flags &= ~VM_LOCKED_MASK;
        if (flags & MCL_FUTURE) {
                current->mm->def_flags |= VM_LOCKED;
                if (flags & MCL_ONFAULT)
                        current->mm->def_flags |= VM_LOCKONFAULT;

                /* Without MCL_CURRENT the existing vmas are left untouched */
                if (!(flags & MCL_CURRENT))
                        return 0;
        }

        if (flags & MCL_CURRENT)
                to_add = (flags & MCL_ONFAULT) ? (VM_LOCKED | VM_LOCKONFAULT)
                                               : VM_LOCKED;

        for_each_vma(vmi, vma) {
                vm_flags_t newflags = (vma->vm_flags & ~VM_LOCKED_MASK) | to_add;

                /* Ignore errors */
                mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
                            newflags);
                cond_resched();
        }

        return 0;
}

/*
 * mlockall(2) entry point.
 *
 * Returns -EINVAL for an empty flag word, unknown bits, or MCL_ONFAULT on its
 * own; -EPERM when can_do_mlock() refuses; -EINTR when taking the mmap write
 * lock is interrupted; -ENOMEM when MCL_CURRENT is requested, total_vm
 * exceeds the RLIMIT_MEMLOCK page count and the caller lacks CAP_IPC_LOCK.
 * On success with MCL_CURRENT the address space is populated after the lock
 * has been dropped.
 */
SYSCALL_DEFINE1(mlockall, int, flags)
{
        const int known_flags = MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT;
        unsigned long limit_pages;
        int err = -ENOMEM;

        if (!flags || (flags & ~known_flags) || flags == MCL_ONFAULT)
                return -EINVAL;

        if (!can_do_mlock())
                return -EPERM;

        limit_pages = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        if (!(flags & MCL_CURRENT) ||
            (current->mm->total_vm <= limit_pages) ||
            capable(CAP_IPC_LOCK))
                err = apply_mlockall_flags(flags);
        mmap_write_unlock(current->mm);

        if (err)
                return err;

        if (flags & MCL_CURRENT)
                mm_populate(0, TASK_SIZE);

        return 0;
}

/*
 * munlockall(2) entry point: run apply_mlockall_flags() with no MCL_* bits
 * under the mmap write lock.  Returns -EINTR if taking the lock was
 * interrupted.
 */
SYSCALL_DEFINE0(munlockall)
{
        struct mm_struct *mm = current->mm;
        int err;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        err = apply_mlockall_flags(0);
        mmap_write_unlock(mm);

        return err;
}

/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user_struct instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

/*
 * Charge @size bytes (rounded up to whole pages) of locked shm memory to
 * @ucounts.
 *
 * The charge is rolled back and 0 is returned when the new total is LONG_MAX
 * or exceeds RLIMIT_MEMLOCK and the caller lacks CAP_IPC_LOCK, or when no
 * reference on @ucounts can be taken.  On success 1 is returned and a
 * reference on @ucounts is held.
 */
int user_shm_lock(size_t size, struct ucounts *ucounts)
{
        unsigned long limit_pages, nr_pages;
        long charged;
        bool over_limit;
        int allowed = 0;

        nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

        /* RLIM_INFINITY is kept as-is rather than converted to pages. */
        limit_pages = rlimit(RLIMIT_MEMLOCK);
        if (limit_pages != RLIM_INFINITY)
                limit_pages >>= PAGE_SHIFT;

        spin_lock(&shmlock_user_lock);
        charged = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);

        /* capable() is consulted only when the limit is actually exceeded. */
        over_limit = (charged == LONG_MAX || charged > limit_pages) &&
                     !capable(CAP_IPC_LOCK);

        if (over_limit || !get_ucounts(ucounts))
                dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);
        else
                allowed = 1;

        spin_unlock(&shmlock_user_lock);
        return allowed;
}

/*
 * Undo user_shm_lock(): uncharge @size bytes (rounded up to whole pages)
 * from @ucounts under shmlock_user_lock, then drop the @ucounts reference.
 */
void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
        const unsigned long nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

        spin_lock(&shmlock_user_lock);
        dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);
        spin_unlock(&shmlock_user_lock);

        put_ucounts(ucounts);
}












































    3 
















    3 






















































































   10 











   10 
    9 




    9 
















   10 

   10 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
// SPDX-License-Identifier: GPL-2.0
#include <linux/spinlock.h>
#include <linux/task_work.h>
#include <linux/resume_user_mode.h>

/*
 * Sentinel head for ->task_works: task_work_run() installs it once an
 * exiting task's list is empty, and task_work_add() returns -ESRCH when it
 * finds it there.
 */
static struct callback_head work_exited; /* all we need is ->next == NULL */

/**
 * task_work_add - ask the @task to execute @work->func()
 * @task: the task which should run the callback
 * @work: the callback to run
 * @notify: how to notify the targeted task
 *
 * Queue @work for task_work_run() below and notify the @task if @notify
 * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI.
 *
 * @TWA_SIGNAL works like signals, in that it will interrupt the targeted
 * task and run the task_work, regardless of whether the task is currently
 * running in the kernel or userspace.
 * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a
 * reschedule IPI to force the targeted task to reschedule and run task_work.
 * This can be advantageous if there's no strict requirement that the
 * task_work be run as soon as possible, just whenever the task enters the
 * kernel anyway.
 * @TWA_RESUME work is run only when the task exits the kernel and returns to
 * user mode, or before entering guest mode.
 *
 * Fails if the @task is exiting/exited and thus it can't process this @work.
 * Otherwise @work->func() will be called when the @task goes through one of
 * the aforementioned transitions, or exits.
 *
 * If the targeted task is exiting, then an error is returned and the work item
 * is not queued. It's up to the caller to arrange for an alternative mechanism
 * in that case.
 *
 * Note: there is no ordering guarantee on works queued here. The task_work
 * list is LIFO.
 *
 * RETURNS:
 * 0 if succeeds or -ESRCH.
 */
int task_work_add(struct task_struct *task, struct callback_head *work,
                  enum task_work_notify_mode notify)
{
        struct callback_head *head;

        /* record the work call stack in order to print it in KASAN reports */
        kasan_record_aux_stack(work);

        /*
         * Lockless push onto the front of task->task_works.  Every pass
         * re-checks the current head for the work_exited sentinel and links
         * @work in front of it before trying to publish @work as the new head.
         */
        head = READ_ONCE(task->task_works);
        do {
                if (unlikely(head == &work_exited))
                        return -ESRCH;
                work->next = head;
        } while (!try_cmpxchg(&task->task_works, &head, work));

        /* @work is queued at this point; notify @task as requested. */
        switch (notify) {
        case TWA_NONE:
                break;
        case TWA_RESUME:
                set_notify_resume(task);
                break;
        case TWA_SIGNAL:
                set_notify_signal(task);
                break;
        case TWA_SIGNAL_NO_IPI:
                __set_notify_signal(task);
                break;
        default:
                /* Unknown mode: warn once; @work stays queued and 0 is returned. */
                WARN_ON_ONCE(1);
                break;
        }

        return 0;
}

/**
 * task_work_cancel_match - cancel a pending work added by task_work_add()
 * @task: the task which should execute the work
 * @match: match function to call
 * @data: data to be passed in to match function
 *
 * Walks @task's pending list from the head and unlinks the first entry for
 * which @match returns true. Returns early without taking any lock when
 * task_work_pending() reports nothing queued.
 *
 * RETURNS:
 * The found work or NULL if not found.
 */
struct callback_head *
task_work_cancel_match(struct task_struct *task,
                       bool (*match)(struct callback_head *, void *data),
                       void *data)
{
        /* Address of the link that currently points at the entry under test. */
        struct callback_head **pprev = &task->task_works;
        struct callback_head *work;
        unsigned long flags;

        if (likely(!task_work_pending(task)))
                return NULL;
        /*
         * If cmpxchg() fails we continue without updating pprev.
         * Either we raced with task_work_add() which added the
         * new entry before this work, we will find it again. Or
         * we raced with task_work_run(), *pprev == NULL/exited.
         */
        raw_spin_lock_irqsave(&task->pi_lock, flags);
        work = READ_ONCE(*pprev);
        while (work) {
                if (!match(work, data)) {
                        /* No match: step to the next link in the list. */
                        pprev = &work->next;
                        work = READ_ONCE(*pprev);
                } else if (try_cmpxchg(pprev, &work, work->next))
                        /* Matched and unlinked: *pprev now skips @work. */
                        break;
        }
        raw_spin_unlock_irqrestore(&task->pi_lock, flags);

        return work;
}

/* Matcher used by task_work_cancel(): true when @cb's ->func equals @data. */
static bool task_work_func_match(struct callback_head *cb, void *data)
{
        const bool same_func = (cb->func == data);

        return same_func;
}

/**
 * task_work_cancel - cancel a pending work added by task_work_add()
 * @task: the task which should execute the work
 * @func: identifies the work to remove
 *
 * Convenience wrapper around task_work_cancel_match() that selects the
 * entry by callback function: the last queued pending work whose
 * ->func == @func is removed from the queue.
 *
 * RETURNS:
 * The found work or NULL if not found.
 */
struct callback_head *
task_work_cancel(struct task_struct *task, task_work_func_t func)
{
        struct callback_head *found;

        found = task_work_cancel_match(task, task_work_func_match, func);

        return found;
}

/**
 * task_work_run - execute the works added by task_work_add()
 *
 * Flush the pending works. Should be used by the core kernel code.
 * Called before the task returns to the user-mode or stops, or when
 * it exits. In the latter case task_work_add() can no longer add the
 * new work after task_work_run() returns.
 *
 * Operates on current. The whole pending list is detached in one step and
 * its callbacks are run; this repeats until the list is found empty.
 */
void task_work_run(void)
{
        struct task_struct *task = current;
        struct callback_head *work, *head, *next;

        for (;;) {
                /*
                 * work->func() can do task_work_add(), do not set
                 * work_exited unless the list is empty.
                 */
                work = READ_ONCE(task->task_works);
                do {
                        /* Default: replace the list with an empty one. */
                        head = NULL;
                        if (!work) {
                                /*
                                 * Empty list: an exiting task installs the
                                 * work_exited sentinel, anyone else has
                                 * nothing to swap and leaves the loop.
                                 */
                                if (task->flags & PF_EXITING)
                                        head = &work_exited;
                                else
                                        break;
                        }
                } while (!try_cmpxchg(&task->task_works, &work, head));

                /* Nothing was detached, we are done. */
                if (!work)
                        break;
                /*
                 * Synchronize with task_work_cancel(). It can not remove
                 * the first entry == work, cmpxchg(task_works) must fail.
                 * But it can remove another entry from the ->next list.
                 */
                raw_spin_lock_irq(&task->pi_lock);
                raw_spin_unlock_irq(&task->pi_lock);

                do {
                        /* Read ->next before the call; it is not read from @work afterwards. */
                        next = work->next;
                        work->func(work);
                        work = next;
                        cond_resched();
                } while (work);
        }
}




























    3 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VIRTIO_RING_H
#define _LINUX_VIRTIO_RING_H

#include <asm/barrier.h>
#include <linux/irqreturn.h>
#include <uapi/linux/virtio_ring.h>

/*
 * Barriers in virtio are tricky.  Non-SMP virtio guests can't assume
 * they're not on an SMP host system, so they need to assume real
 * barriers.  Non-SMP virtio hosts could skip the barriers, but does
 * anyone care?
 *
 * For virtio_pci on SMP, we don't need to order with respect to MMIO
 * accesses through relaxed memory I/O windows, so virt_mb() et al are
 * sufficient.
 *
 * For using virtio to talk to real devices (eg. other heterogeneous
 * CPUs) we do need real barriers.  In theory, we could be using both
 * kinds of virtio, so it's a runtime decision, and the branch is
 * actually quite cheap.
 */

/*
 * Full barrier for virtio: virt_mb() when @weak_barriers is set, mb()
 * otherwise.
 */
static inline void virtio_mb(bool weak_barriers)
{
        if (!weak_barriers) {
                mb();
                return;
        }

        virt_mb();
}

/*
 * Read barrier for virtio: virt_rmb() when @weak_barriers is set, dma_rmb()
 * otherwise.
 */
static inline void virtio_rmb(bool weak_barriers)
{
        if (!weak_barriers) {
                dma_rmb();
                return;
        }

        virt_rmb();
}

/*
 * Write barrier for virtio: virt_wmb() when @weak_barriers is set, dma_wmb()
 * otherwise.
 */
static inline void virtio_wmb(bool weak_barriers)
{
        if (!weak_barriers) {
                dma_wmb();
                return;
        }

        virt_wmb();
}

/*
 * Store @v through pointer @p and follow it with a barrier.  With
 * @weak_barriers set the store is done by virt_store_mb(); otherwise it is a
 * WRITE_ONCE() followed by mb().
 *
 * @p is parenthesized before it is dereferenced so that pointer expressions
 * such as "base + 1" address the intended object, and the expansion ends
 * right after "while (0)" so the macro does not absorb the line after it.
 */
#define virtio_store_mb(weak_barriers, p, v) \
do { \
        if (weak_barriers) { \
                virt_store_mb(*(p), v); \
        } else { \
                WRITE_ONCE(*(p), v); \
                mb(); \
        } \
} while (0)

struct virtio_device;
struct virtqueue;
struct device;

/*
 * Creates a virtqueue and allocates the descriptor ring.  If
 * may_reduce_num is set, then this may allocate a smaller ring than
 * expected.  The caller should query virtqueue_get_vring_size to learn
 * the actual size of the ring.
 */
struct virtqueue *vring_create_virtqueue(unsigned int index,
                                         unsigned int num,
                                         unsigned int vring_align,
                                         struct virtio_device *vdev,
                                         bool weak_barriers,
                                         bool may_reduce_num,
                                         bool ctx,
                                         bool (*notify)(struct virtqueue *vq),
                                         void (*callback)(struct virtqueue *vq),
                                         const char *name);

/*
 * Creates a virtqueue and allocates the descriptor ring with per
 * virtqueue DMA device.
 */
struct virtqueue *vring_create_virtqueue_dma(unsigned int index,
                                             unsigned int num,
                                             unsigned int vring_align,
                                             struct virtio_device *vdev,
                                             bool weak_barriers,
                                             bool may_reduce_num,
                                             bool ctx,
                                             bool (*notify)(struct virtqueue *vq),
                                             void (*callback)(struct virtqueue *vq),
                                             const char *name,
                                             struct device *dma_dev);

/*
 * Creates a virtqueue with a standard layout but a caller-allocated
 * ring.
 */
struct virtqueue *vring_new_virtqueue(unsigned int index,
                                      unsigned int num,
                                      unsigned int vring_align,
                                      struct virtio_device *vdev,
                                      bool weak_barriers,
                                      bool ctx,
                                      void *pages,
                                      bool (*notify)(struct virtqueue *vq),
                                      void (*callback)(struct virtqueue *vq),
                                      const char *name);

/*
 * Destroys a virtqueue.  If created with vring_create_virtqueue, this
 * also frees the ring.
 */
void vring_del_virtqueue(struct virtqueue *vq);

/* Filter out transport-specific feature bits. */
void vring_transport_features(struct virtio_device *vdev);

irqreturn_t vring_interrupt(int irq, void *_vq);

u32 vring_notification_data(struct virtqueue *_vq);
#endif /* _LINUX_VIRTIO_RING_H */


























































































































































































































































































    2 























































































































    1 





    2 
    3 















    1 















    5 


    2 
    1 
   14 
    5 


































    1 





    1 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
// SPDX-License-Identifier: GPL-2.0+
/*
 * ext4_jbd2.h
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
 *
 * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
 *
 * Ext4-specific journaling extensions.
 */

#ifndef _EXT4_JBD2_H
#define _EXT4_JBD2_H

#include <linux/fs.h>
#include <linux/jbd2.h>
#include "ext4.h"

#define EXT4_JOURNAL(inode)        (EXT4_SB((inode)->i_sb)->s_journal)

/* Define the number of blocks we need to account to a transaction to
 * modify one block of data.
 *
 * We may have to touch one inode, one bitmap buffer, up to three
 * indirection blocks, the group and superblock summaries, and the data
 * block to complete the transaction.
 *
 * For extents-enabled fs we may have to allocate and modify up to
 * 5 levels of tree, data block (for each of these we need bitmap + group
 * summaries), root which is stored in the inode, sb
 */

#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)                                \
        (ext4_has_feature_extents(sb) ? 20U : 8U)

/* Extended attribute operations touch at most two data buffers,
 * two bitmap buffers, and two group summaries, in addition to the inode
 * and the superblock, which are already accounted for. */

#define EXT4_XATTR_TRANS_BLOCKS                6U

/* Define the minimum size for a transaction which modifies data.  This
 * needs to take into account the fact that we may end up modifying two
 * quota files too (one for the group, one for the user quota).  The
 * superblock only gets updated once, of course, so don't bother
 * counting that again for the quota updates. */

#define EXT4_DATA_TRANS_BLOCKS(sb)        (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
                                         EXT4_XATTR_TRANS_BLOCKS - 2 + \
                                         EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))

/*
 * Define the number of metadata blocks we need to account to modify data.
 *
 * This includes the super block, inode block, quota blocks and xattr blocks
 */
#define EXT4_META_TRANS_BLOCKS(sb)        (EXT4_XATTR_TRANS_BLOCKS + \
                                        EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))

/* Define an arbitrary limit for the amount of data we will anticipate
 * writing to any given transaction.  For unbounded transactions such as
 * write(2) and truncate(2) we can write more than this, but we always
 * start off at the maximum transaction size and grow the transaction
 * optimistically as we go. */

#define EXT4_MAX_TRANS_DATA                64U

/* We break up a large truncate or write transaction: once the handle's
 * buffer credits get this low, we need either to extend the
 * transaction or to start a new one.  Reserve enough space here for
 * inode, bitmap, superblock, group and indirection updates for at least
 * one block, plus two quota updates.  Quota allocations are not
 * needed. */

#define EXT4_RESERVE_TRANS_BLOCKS        12U

/*
 * Number of credits needed if we need to insert an entry into a
 * directory.  For each new index block, we need 4 blocks (old index
 * block, new index block, bitmap block, bg summary).  For normal
 * htree directories there are 2 levels; if the largedir feature is
 * enabled it's 3 levels.
 */
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS        12U

#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
 * allocated so we need to update only data block */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
 * but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
                (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_INIT_REWRITE) : 0)

#define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
                (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_DEL_REWRITE) : 0)
#else
#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))

/*
 * Ext4 handle operation types -- for logging purposes
 */
#define EXT4_HT_MISC             0
#define EXT4_HT_INODE            1
#define EXT4_HT_WRITE_PAGE       2
#define EXT4_HT_MAP_BLOCKS       3
#define EXT4_HT_DIR              4
#define EXT4_HT_TRUNCATE         5
#define EXT4_HT_QUOTA            6
#define EXT4_HT_RESIZE           7
#define EXT4_HT_MIGRATE          8
#define EXT4_HT_MOVE_EXTENTS     9
#define EXT4_HT_XATTR           10
#define EXT4_HT_EXT_CONVERT     11
#define EXT4_HT_MAX             12

/**
 *   struct ext4_journal_cb_entry - Base structure for callback information.
 *
 *   This struct is a 'seed' structure for use with your own callback
 *   structs. If you are using callbacks you must allocate one of these
 *   or another struct of your own definition which has this struct
 *   as its first element and pass it to ext4_journal_callback_add().
 */
struct ext4_journal_cb_entry {
        /* list information for other callbacks attached to the same handle */
        struct list_head jce_list;

        /*
         * Function to call with this callback structure; it receives the
         * superblock, this entry and an error code.
         */
        void (*jce_func)(struct super_block *sb,
                         struct ext4_journal_cb_entry *jce, int error);

        /* user data goes here */
};

/**
 * ext4_journal_callback_add: add a function to call after transaction commit
 * @handle: active journal transaction handle to register callback on
 * @func: callback function to call after the transaction has committed:
 *        @sb: superblock of current filesystem for transaction
 *        @jce: returned journal callback data
 *        @rc: journal state at commit (0 = transaction committed properly)
 * @jce: journal callback data (internal and function private data struct)
 *
 * The registered function will be called in the context of the journal thread
 * after the transaction for which the handle was created has completed.
 *
 * No locks are held when the callback function is called, so it is safe to
 * call blocking functions from within the callback, but the callback should
 * not block or run for too long, or the filesystem will be blocked waiting for
 * the next transaction to commit. No journaling functions can be used, or
 * there is a risk of deadlock.
 *
 * There is no guaranteed calling order of multiple registered callbacks on
 * the same transaction.
 */
static inline void _ext4_journal_callback_add(handle_t *handle,
                        struct ext4_journal_cb_entry *jce)
{
        struct list_head *cb_list = &handle->h_transaction->t_private_list;

        /*
         * Queue the entry at the tail of the transaction's private list.
         * No lock is taken here.
         */
        list_add_tail(&jce->jce_list, cb_list);
}

static inline void ext4_journal_callback_add(handle_t *handle,
                        void (*func)(struct super_block *sb,
                                     struct ext4_journal_cb_entry *jce,
                                     int rc),
                        struct ext4_journal_cb_entry *jce)
{
        struct super_block *sb = handle->h_transaction->t_journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* Record the callback, then queue the entry under s_md_lock. */
        jce->jce_func = func;

        spin_lock(&sbi->s_md_lock);
        _ext4_journal_callback_add(handle, jce);
        spin_unlock(&sbi->s_md_lock);
}


/**
 * ext4_journal_callback_try_del: delete a registered callback
 * @handle: active journal transaction handle on which callback was registered
 * @jce: registered journal callback entry to unregister
 *
 * Under s_md_lock, checks whether @jce is currently linked on a list and
 * then unlinks and re-initialises it.
 *
 * Return true if object was successfully removed, false if @jce was not on
 * any list.
 */
static inline bool ext4_journal_callback_try_del(handle_t *handle,
                                             struct ext4_journal_cb_entry *jce)
{
        struct ext4_sb_info *sbi =
                        EXT4_SB(handle->h_transaction->t_journal->j_private);
        bool was_queued;

        spin_lock(&sbi->s_md_lock);
        was_queued = !list_empty(&jce->jce_list);
        list_del_init(&jce->jce_list);
        spin_unlock(&sbi->s_md_lock);

        return was_queued;
}

int
ext4_mark_iloc_dirty(handle_t *handle,
                     struct inode *inode,
                     struct ext4_iloc *iloc);

/*
 * On success, we end up with an outstanding reference count against
 * iloc->bh.  This _must_ be cleaned up later.
 */

int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
                        struct ext4_iloc *iloc);

#define ext4_mark_inode_dirty(__h, __i)                                        \
                __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__)
int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
                                const char *func, unsigned int line);

int ext4_expand_extra_isize(struct inode *inode,
                            unsigned int new_extra_isize,
                            struct ext4_iloc *iloc);
/*
 * Wrapper functions with which ext4 calls into JBD.
 */
int __ext4_journal_get_write_access(const char *where, unsigned int line,
                                    handle_t *handle, struct super_block *sb,
                                    struct buffer_head *bh,
                                    enum ext4_journal_trigger_type trigger_type);

int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
                  int is_metadata, struct inode *inode,
                  struct buffer_head *bh, ext4_fsblk_t blocknr);

int __ext4_journal_get_create_access(const char *where, unsigned int line,
                                handle_t *handle, struct super_block *sb,
                                struct buffer_head *bh,
                                enum ext4_journal_trigger_type trigger_type);

int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                                 handle_t *handle, struct inode *inode,
                                 struct buffer_head *bh);

#define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \
        __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \
                                        (bh), (trigger_type))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
        __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
                      (bh), (block_nr))
#define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \
        __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \
                                         (bh), (trigger_type))
#define ext4_handle_dirty_metadata(handle, inode, bh) \
        __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
                                     (bh))

handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb,
                                  unsigned int line, int type, int blocks,
                                  int rsv_blocks, int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);

#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)

/*
 * Tell whether a properly allocated handle is using a journal.  A handle
 * whose pointer value is below EXT4_NOJOURNAL_MAX_REF_COUNT yields 0; any
 * other yields 1.
 *
 * Note: do not use this for NULL handles.
 */
static inline int ext4_handle_valid(handle_t *handle)
{
        const unsigned long raw = (unsigned long)handle;

        return raw >= EXT4_NOJOURNAL_MAX_REF_COUNT;
}

/* Set h_sync on @handle; does nothing when ext4_handle_valid() says no. */
static inline void ext4_handle_sync(handle_t *handle)
{
        if (!ext4_handle_valid(handle))
                return;

        handle->h_sync = 1;
}

/*
 * Report is_handle_aborted() for @handle, or 0 when ext4_handle_valid()
 * says the handle is not valid.
 */
static inline int ext4_handle_is_aborted(handle_t *handle)
{
        return ext4_handle_valid(handle) ? is_handle_aborted(handle) : 0;
}

static inline int ext4_free_metadata_revoke_credits(struct super_block *sb,
                                                    int blocks)
{
        /* Freeing each metadata block can result in freeing one cluster */
        return blocks * EXT4_SB(sb)->s_cluster_ratio;
}

/* Default revoke credits: those needed to free 8 metadata blocks on @sb. */
static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
{
        const int nr_meta_blocks = 8;

        return ext4_free_metadata_revoke_credits(sb, nr_meta_blocks);
}

#define ext4_journal_start_sb(sb, type, nblocks)                        \
        __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\
                                ext4_trans_default_revoke_credits(sb))

#define ext4_journal_start(inode, type, nblocks)                        \
        __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0,        \
                             ext4_trans_default_revoke_credits((inode)->i_sb))

#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\
        __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\
                             ext4_trans_default_revoke_credits((inode)->i_sb))

#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \
        __ext4_journal_start((inode), __LINE__, (type), (blocks), 0,        \
                             (revoke_creds))

/*
 * Inode flavour of __ext4_journal_start_sb(): the superblock is taken from
 * @inode->i_sb and every other argument is forwarded unchanged.
 */
static inline handle_t *__ext4_journal_start(struct inode *inode,
                                             unsigned int line, int type,
                                             int blocks, int rsv_blocks,
                                             int revoke_creds)
{
        struct super_block *sb = inode->i_sb;

        return __ext4_journal_start_sb(inode, sb, line, type, blocks,
                                       rsv_blocks, revoke_creds);
}

/*
 * Stop @handle via __ext4_journal_stop(), passing the calling function's
 * name (__func__) and source line (__LINE__) along with it.
 */
#define ext4_journal_stop(handle) \
        __ext4_journal_stop(__func__, __LINE__, (handle))

/*
 * Call __ext4_journal_start_reserved() on @handle with transaction type
 * @type, passing the caller's source line (__LINE__).
 */
#define ext4_journal_start_reserved(handle, type) \
        __ext4_journal_start_reserved((handle), __LINE__, (type))

/* Backend of ext4_journal_start_reserved(); only the prototype lives here. */
handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
                                        int type);

/* Return whatever journal_current_handle() reports as the current handle. */
static inline handle_t *ext4_journal_current_handle(void)
{
        handle_t *cur = journal_current_handle();

        return cur;
}

/*
 * Forward an extend request (@nblocks, @revoke) for @handle to
 * jbd2_journal_extend() and return its result.  When ext4_handle_valid() is
 * false for @handle nothing is done and 0 is returned.
 */
static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_extend(handle, nblocks, revoke);
}

/*
 * Forward a restart request (@nblocks, @revoke) for @handle to
 * jbd2__journal_restart(), using GFP_NOFS as the allocation mask, and return
 * its result.  When ext4_handle_valid() is false for @handle nothing is done
 * and 0 is returned.
 */
static inline int ext4_journal_restart(handle_t *handle, int nblocks,
                                       int revoke)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS);
}

/*
 * Helper used by ext4_journal_ensure_credits_fn().  That macro hands a
 * result <= 0 straight back to its caller and treats a positive result as
 * "the transaction must be restarted".  Only the prototype is visible here.
 */
int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
                                  int extend_cred, int revoke_cred);


/*
 * Ensure @handle has at least @check_cred credits available. If not,
 * transaction will be extended or restarted to contain at least @extend_cred
 * credits. Before restarting transaction @fn is executed to allow for cleanup
 * before the transaction is restarted.
 *
 * @revoke_cred is passed, together with @extend_cred, both to
 * __ext4_journal_ensure_credits() and to ext4_journal_restart(). If @fn
 * evaluates to a negative value the restart is skipped and that value is
 * returned.
 *
 * The return value is < 0 in case of error, 0 in case the handle has enough
 * credits or transaction extension succeeded, 1 in case transaction had to be
 * restarted.
 *
 * The result variable has a macro-private name so that an identifier called
 * "err" appearing inside the caller's @fn expression refers to the caller's
 * variable rather than to the macro's own local.
 *
 * Note that @handle, @extend_cred and @revoke_cred are each expanded twice,
 * so they must be free of side effects.
 */
#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred,        \
                                       revoke_cred, fn)                        \
({                                                                        \
        __label__ __ensure_end;                                                \
        int __ensure_err;                                                \
                                                                        \
        __ensure_err = __ext4_journal_ensure_credits((handle),                \
                                        (check_cred), (extend_cred),        \
                                        (revoke_cred));                        \
        if (__ensure_err <= 0)                                                \
                goto __ensure_end;                                        \
        __ensure_err = (fn);                                                \
        if (__ensure_err < 0)                                                \
                goto __ensure_end;                                        \
        __ensure_err = ext4_journal_restart((handle), (extend_cred),        \
                                            (revoke_cred));                \
        if (__ensure_err == 0)                                                \
                __ensure_err = 1;                                        \
__ensure_end:                                                                \
        __ensure_err;                                                        \
})

/*
 * Make sure @handle has at least @credits credits available, extending or
 * restarting the transaction if needed; @credits is used both as the amount
 * to check for and as the amount to extend/restart with, and @revoke_creds
 * revoke credits are requested alongside. No cleanup step runs before a
 * restart (0 is passed as the fn argument).
 *
 * Returns whatever ext4_journal_ensure_credits_fn() evaluates to.
 */
static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
                                              int revoke_creds)
{
        int ret;

        ret = ext4_journal_ensure_credits_fn(handle, credits, credits,
                                             revoke_creds, 0);
        return ret;
}

/*
 * Return jbd2_journal_blocks_per_page() for @inode, or 0 when EXT4_JOURNAL()
 * yields NULL for it.
 */
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
        if (!EXT4_JOURNAL(inode))
                return 0;

        return jbd2_journal_blocks_per_page(inode);
}

/*
 * Call jbd2_journal_force_commit() on @journal and return its result; a NULL
 * @journal is accepted and yields 0 without calling anything.
 */
static inline int ext4_journal_force_commit(journal_t *journal)
{
        return journal ? jbd2_journal_force_commit(journal) : 0;
}

/*
 * Pass @inode's jinode and the range (@start_byte, @length) to
 * jbd2_journal_inode_ranged_write() under @handle and return its result.
 * When ext4_handle_valid() is false for @handle nothing is done and 0 is
 * returned.
 */
static inline int ext4_jbd2_inode_add_write(handle_t *handle,
                struct inode *inode, loff_t start_byte, loff_t length)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_inode_ranged_write(handle, EXT4_I(inode)->jinode,
                                               start_byte, length);
}

/*
 * Pass @inode's jinode and the range (@start_byte, @length) to
 * jbd2_journal_inode_ranged_wait() under @handle and return its result.
 * When ext4_handle_valid() is false for @handle nothing is done and 0 is
 * returned.
 */
static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
                struct inode *inode, loff_t start_byte, loff_t length)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_inode_ranged_wait(handle, EXT4_I(inode)->jinode,
                                              start_byte, length);
}

/*
 * Record the ID of @handle's transaction in @inode's ext4 inode info as
 * i_sync_tid, and also as i_datasync_tid when @datasync is non-zero.
 * Nothing is recorded when the handle is not valid or has been aborted.
 */
static inline void ext4_update_inode_fsync_trans(handle_t *handle,
                                                 struct inode *inode,
                                                 int datasync)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!ext4_handle_valid(handle) || is_handle_aborted(handle))
                return;

        ei->i_sync_tid = handle->h_transaction->t_tid;
        if (datasync)
                ei->i_datasync_tid = handle->h_transaction->t_tid;
}

/* super.c */
int ext4_force_commit(struct super_block *sb);

/*
 * Ext4 inode journal modes
 *
 * Each mode is a distinct bit so that the ext4_should_*_data() helpers can
 * test the value returned by ext4_inode_journal_mode() with a bitwise AND.
 */
#define EXT4_INODE_JOURNAL_DATA_MODE        0x01 /* journal data mode */
#define EXT4_INODE_ORDERED_DATA_MODE        0x02 /* ordered data mode */
#define EXT4_INODE_WRITEBACK_DATA_MODE        0x04 /* writeback data mode */

/* Returns the journal mode bits for @inode; only the prototype lives here. */
int ext4_inode_journal_mode(struct inode *inode);

/*
 * Non-zero when the journal mode of @inode has
 * EXT4_INODE_JOURNAL_DATA_MODE set.
 */
static inline int ext4_should_journal_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_JOURNAL_DATA_MODE;
}

/*
 * Non-zero when the journal mode of @inode has
 * EXT4_INODE_ORDERED_DATA_MODE set.
 */
static inline int ext4_should_order_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_ORDERED_DATA_MODE;
}

/*
 * Non-zero when the journal mode of @inode has
 * EXT4_INODE_WRITEBACK_DATA_MODE set.
 */
static inline int ext4_should_writeback_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_WRITEBACK_DATA_MODE;
}

static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
{
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
                return 0;
        if (!ext4_should_journal_data(inode))
                return 0;
        /*
         * Data blocks in one extent are contiguous, just account for partial
         * clusters at extent boundaries
         */
        return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1);
}

/*
 * Decide whether the dioread_nolock code paths should be tried for @inode;
 * those paths make it safe to skip taking i_rwsem for direct I/O reads.
 *
 * All of the following must hold for the answer to be 1:
 *  - the DIOREAD_NOLOCK mount option is set,
 *  - @inode is a regular file,
 *  - @inode is extent-based (EXT4_INODE_EXTENTS),
 *  - data journaling is not in use for @inode, because dioread_nolock uses
 *    b_private to pass information back to the I/O completion handler and
 *    that conflicts with the jbd's use of b_private,
 *  - the DELALLOC mount option is set.
 * Otherwise 0 is returned.
 */
static inline int ext4_should_dioread_nolock(struct inode *inode)
{
        return test_opt(inode->i_sb, DIOREAD_NOLOCK) &&
               S_ISREG(inode->i_mode) &&
               ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
               !ext4_should_journal_data(inode) &&
               /* temporary fix to prevent generic/422 test failures */
               test_opt(inode->i_sb, DELALLOC);
}

#endif        /* _EXT4_JBD2_H */















































































   29 




   29 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PAGE_H
#define _ASM_X86_PAGE_H

#include <linux/types.h>

#ifdef __KERNEL__

#include <asm/page_types.h>

#ifdef CONFIG_X86_64
#include <asm/page_64.h>
#else
#include <asm/page_32.h>
#endif        /* CONFIG_X86_64 */

#ifndef __ASSEMBLY__

struct page;

#include <linux/range.h>
extern struct range pfn_mapped[];
extern int nr_pfn_mapped;

/*
 * Clear the page at kernel address @page using clear_page().  @vaddr and @pg
 * are accepted but not used by this implementation.
 */
static inline void
clear_user_page(void *page, unsigned long vaddr, struct page *pg)
{
        clear_page(page);
}

/*
 * Copy one page from @from to @to using copy_page().  @vaddr and @topage are
 * accepted but not used by this implementation.
 */
static inline void
copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage)
{
        copy_page(to, from);
}

/*
 * Allocate a folio for @vma at @vaddr through vma_alloc_folio(), using
 * GFP_HIGHUSER_MOVABLE | __GFP_ZERO as the allocation flags, 0 as the second
 * argument and false as the last one.
 */
#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
        vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)

/*
 * __pa(x): pass @x, cast to unsigned long, to __phys_addr().  May be
 * pre-defined elsewhere, hence the #ifndef guard.
 */
#ifndef __pa
#define __pa(x)                __phys_addr((unsigned long)(x))
#endif

/* Same as __pa() but goes through __phys_addr_nodebug(). */
#define __pa_nodebug(x)        __phys_addr_nodebug((unsigned long)(x))
/*
 * __pa_symbol should be used for C visible symbols.
 * This seems to be the official gcc blessed way to do such arithmetic.
 */
/*
 * We need __phys_reloc_hide() here because gcc may assume that there is no
 * overflow during __pa() calculation and can optimize it unexpectedly.
 * Newer versions of gcc provide -fno-strict-overflow switch to handle this
 * case properly. Once all supported versions of gcc understand it, we can
 * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
 */
#define __pa_symbol(x) \
        __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))

/*
 * __va(x): add PAGE_OFFSET to @x (as unsigned long) and return the result as
 * a void pointer.  May be pre-defined elsewhere, hence the #ifndef guard.
 */
#ifndef __va
#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
#endif

/* Boot-time aliases: identical to __va() and __pa() respectively. */
#define __boot_va(x)                __va(x)
#define __boot_pa(x)                __pa(x)

/*
 * virt_to_page(kaddr) returns a valid pointer if and only if
 * virt_addr_valid(kaddr) returns true.
 *
 * virt_to_page() converts @kaddr with __pa(), shifts the result right by
 * PAGE_SHIFT to obtain a page frame number and hands that to pfn_to_page().
 */
#define virt_to_page(kaddr)        pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
extern bool __virt_addr_valid(unsigned long kaddr);
/* Wrapper that casts @kaddr to unsigned long for __virt_addr_valid(). */
#define virt_addr_valid(kaddr)        __virt_addr_valid((unsigned long) (kaddr))

/*
 * Turn page frame number @pfn into a kernel virtual address: the pfn is
 * shifted left by PAGE_SHIFT and the result is converted with __va().
 */
static __always_inline void *pfn_to_kaddr(unsigned long pfn)
{
        unsigned long paddr = pfn << PAGE_SHIFT;

        return __va(paddr);
}

/*
 * Sign-extend @vaddr from bit (@vaddr_bits - 1): every bit above that
 * position in the result is a copy of it, the lower bits are unchanged.
 *
 * The left shift is done on the unsigned value, because left-shifting a
 * negative signed value is undefined in ISO C.  The right shift is done on
 * the signed value and relies on the compiler implementing it as an
 * arithmetic (sign-propagating) shift.
 *
 * @vaddr_bits is expected to be in the range 1..64; anything else produces
 * an invalid shift count.
 */
static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits)
{
        int shift = 64 - vaddr_bits;

        return (s64)(vaddr << shift) >> shift;
}

/*
 * Return 1 when @vaddr is already in canonical form for @vaddr_bits address
 * bits, i.e. __canonical_address() leaves it unchanged; 0 otherwise.
 */
static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits)
{
        u64 canonical = __canonical_address(vaddr, vaddr_bits);

        return canonical == vaddr;
}

#endif        /* __ASSEMBLY__ */

#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>

#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA

#endif        /* __KERNEL__ */
#endif /* _ASM_X86_PAGE_H */




















































































































































































    2 


    2 



    2 




















    1 
    1 

    1 














    1 






    1 












    1 











































    3 














    5 


    5 


    3 




    7 

    1 
    6 




    6 

    1 
    5 




   17 

    1 
   17 




    7 

    1 
    6 




    5 

    1 
    4 




    2 


    2 




    4 


    4 




























    6 

    1 
    5 




    5 

    1 
    4 




    2 


    2 




    3 


    3 






















































































































































































































    1 























    1 
































    1 

    1 


    1 




    1 




    1 









    1 







    1 







    1 














































































































































































































































    1 










    1 



































    1 






    1 














    1 













    1 

    1 






























    1 






    1 











    1 



    1 


















































































































































































































































































    5 






    5 































    1 
    1 




    1 














    1 
    1 









    1 












    2 

































    1 






    1 






    1 




    1 







    1 











    1 








































    1 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    7 





    7 











    7 



































    8 
    8 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 











    1 

    1 




    1 

    1 




















    1 









    1 





    1 










    1 







    1 






    1 



















    1 
    1 








































































































    2 








    2 













    1 
    2 


    1 

    2 




    2 








































































































































































































































































































































































































































































    1 





    1 


    1 



    1 



    1 

    1 
    1 







    1 







































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/super.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/vfs.h>
#include <linux/random.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
#include <linux/dax.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/fsnotify.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>

#include "ext4.h"
#include "ext4_extents.h"        /* Needed for trace points definition */
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "mballoc.h"
#include "fsmap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>

/*
 * File-scope state.
 *
 * NOTE(review): the users of these objects are outside this chunk; the
 * descriptions below are inferred from the names and types only and
 * should be confirmed against the code that touches them.
 */
/* Pointer to the lazy-init bookkeeping structure (struct ext4_lazy_init). */
static struct ext4_lazy_init *ext4_li_info;
/* Mutex; presumably serializes access to ext4_li_info - TODO confirm. */
static DEFINE_MUTEX(ext4_li_mtx);
/* Rate-limit state; by name, throttles mount-time messages - TODO confirm. */
static struct ratelimit_state ext4_mount_msg_ratelimit;

/*
 * Forward declarations of static helpers (and the mount-parameter table)
 * so they can be referenced before their definitions, e.g. from the
 * operation tables and file_system_type objects below.
 */
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
static int ext4_show_options(struct seq_file *seq, struct dentry *root);
static void ext4_update_super(struct super_block *sb);
static int ext4_commit_super(struct super_block *sb);
static int ext4_mark_recovery_complete(struct super_block *sb,
                                        struct ext4_super_block *es);
static int ext4_clear_journal_err(struct super_block *sb,
                                  struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
                                            unsigned int journal_inum);
/* fs_context (mount API) helpers. */
static int ext4_validate_options(struct fs_context *fc);
static int ext4_check_opt_consistency(struct fs_context *fc,
                                      struct super_block *sb);
static void ext4_apply_options(struct fs_context *fc, struct super_block *sb);
static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param);
static int ext4_get_tree(struct fs_context *fc);
static int ext4_reconfigure(struct fs_context *fc);
static void ext4_fc_free(struct fs_context *fc);
static int ext4_init_fs_context(struct fs_context *fc);
static void ext4_kill_sb(struct super_block *sb);
/* Mount-parameter specification table, referenced by the file_system_types. */
static const struct fs_parameter_spec ext4_param_specs[];

/*
 * Lock ordering
 *
 * page fault path:
 * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
 *   -> page lock -> i_data_sem (rw)
 *
 * buffered write path:
 * sb_start_write -> i_mutex -> mmap_lock
 * sb_start_write -> i_mutex -> transaction start -> page lock ->
 *   i_data_sem (rw)
 *
 * truncate:
 * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
 *   page lock
 * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
 *   i_data_sem (rw)
 *
 * direct IO:
 * sb_start_write -> i_mutex -> mmap_lock
 * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
 *
 * writepages:
 * transaction start -> page lock(s) -> i_data_sem (rw)
 */

/*
 * fs_context operation table: wires the parameter-parsing, tree-creation,
 * reconfiguration and context-free callbacks to the ext4 implementations
 * declared above.
 */
static const struct fs_context_operations ext4_context_ops = {
        .parse_param        = ext4_parse_param,
        .get_tree        = ext4_get_tree,
        .reconfigure        = ext4_reconfigure,
        .free                = ext4_fc_free,
};


/*
 * When no ext2 driver is configured (neither built in nor as a module) and
 * CONFIG_EXT4_USE_FOR_EXT2 is set, define an "ext2" file_system_type that
 * reuses ext4's context init, parameter table and kill_sb.
 *
 * IS_EXT2_SB(sb) is true when @sb belongs to that type; in every other
 * configuration it is the constant 0.
 */
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
        .owner                        = THIS_MODULE,
        .name                        = "ext2",
        .init_fs_context        = ext4_init_fs_context,
        .parameters                = ext4_param_specs,
        .kill_sb                = ext4_kill_sb,
        .fs_flags                = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type)
#else
#define IS_EXT2_SB(sb) (0)
#endif


/*
 * "ext3" file_system_type, defined unconditionally and backed by the same
 * ext4 context init, parameter table and kill_sb callbacks.
 *
 * IS_EXT3_SB(sb) is true when @sb's s_type points at this object.
 */
static struct file_system_type ext3_fs_type = {
        .owner                        = THIS_MODULE,
        .name                        = "ext3",
        .init_fs_context        = ext4_init_fs_context,
        .parameters                = ext4_param_specs,
        .kill_sb                = ext4_kill_sb,
        .fs_flags                = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
#define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type)


/*
 * Submit a read request for @bh.
 *
 * @bh:       buffer to read
 * @op_flags: extra request flags OR-ed into REQ_OP_READ
 * @end_io:   completion handler; end_buffer_read_sync is used when NULL
 *
 * Does not wait for the I/O to finish.
 */
static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
                                  bh_end_io_t *end_io)
{
        /*
         * If the buffer is being re-read after a write-out error, its
         * verified bit is stale.  Drop it so that the contents are
         * checked again once the read completes.
         */
        clear_buffer_verified(bh);

        if (end_io)
                bh->b_end_io = end_io;
        else
                bh->b_end_io = end_buffer_read_sync;

        /* Reference taken before submission; presumably dropped by the
         * completion handler - confirm against the handlers in use. */
        get_bh(bh);
        submit_bh(REQ_OP_READ | op_flags, bh);
}

/*
 * Start reading @bh without waiting for completion.
 *
 * The caller must hold the buffer lock (enforced with BUG_ON).  When
 * ext4_buffer_uptodate() reports the buffer as already up to date no I/O
 * is issued and the buffer is simply unlocked; otherwise the read is
 * submitted with @op_flags and @end_io.
 */
void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
                         bh_end_io_t *end_io)
{
        BUG_ON(!buffer_locked(bh));

        if (!ext4_buffer_uptodate(bh)) {
                __ext4_read_bh(bh, op_flags, end_io);
                return;
        }

        /* Nothing to read: just release the lock we were handed. */
        unlock_buffer(bh);
}

/*
 * Read @bh and wait for the I/O to complete.
 *
 * The caller must hold the buffer lock (enforced with BUG_ON).  If
 * ext4_buffer_uptodate() already holds, the buffer is unlocked and 0 is
 * returned without issuing I/O.
 *
 * Returns 0 when the buffer is up to date afterwards, -EIO otherwise.
 */
int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io)
{
        BUG_ON(!buffer_locked(bh));

        if (ext4_buffer_uptodate(bh)) {
                unlock_buffer(bh);
                return 0;
        }

        __ext4_read_bh(bh, op_flags, end_io);
        wait_on_buffer(bh);

        return buffer_uptodate(bh) ? 0 : -EIO;
}

/*
 * Lock @bh and read it.
 *
 * @wait: when true, block until the read finishes and return its status
 *        (the result of ext4_read_bh()); when false, only kick off the
 *        read and return 0 immediately.
 */
int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
        lock_buffer(bh);

        if (wait)
                return ext4_read_bh(bh, op_flags, NULL);

        ext4_read_bh_nowait(bh, op_flags, NULL);
        return 0;
}

/*
 * Like __bread_gfp(), but failures are reported through ERR_PTR() rather
 * than a bare NULL.  With sb_bread() an -ENOMEM and an -EIO failure are
 * indistinguishable (both yield NULL); here they are not.
 *
 * Returns the buffer head on success, ERR_PTR(-ENOMEM) if the buffer
 * could not be obtained, or ERR_PTR() of the read error.
 */
static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
                                               sector_t block,
                                               blk_opf_t op_flags, gfp_t gfp)
{
        struct buffer_head *bh = sb_getblk_gfp(sb, block, gfp);
        int err;

        if (!bh)
                return ERR_PTR(-ENOMEM);

        if (!ext4_buffer_uptodate(bh)) {
                /* Metadata read; wait for it so the status can be returned. */
                err = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
                if (err) {
                        put_bh(bh);
                        return ERR_PTR(err);
                }
        }

        return bh;
}

/*
 * Read block @block of @sb, with __GFP_MOVABLE added to the allocation
 * mask.  The mask is the block device mapping's gfp constraint with
 * __GFP_FS removed.
 *
 * Returns the buffer head or an ERR_PTR() (see __ext4_sb_bread_gfp()).
 */
struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
                                   blk_opf_t op_flags)
{
        gfp_t gfp;

        gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS);
        gfp |= __GFP_MOVABLE;

        return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
}

/*
 * Read block @block of @sb without adding __GFP_MOVABLE to the
 * allocation mask and with no extra request flags.  The mask is the
 * block device mapping's gfp constraint with __GFP_FS removed.
 *
 * Returns the buffer head or an ERR_PTR() (see __ext4_sb_bread_gfp()).
 */
struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
                                            sector_t block)
{
        return __ext4_sb_bread_gfp(sb, block, 0,
                        mapping_gfp_constraint(sb->s_bdev->bd_mapping,
                                               ~__GFP_FS));
}

/*
 * Best-effort read-ahead of block @block of @sb.
 *
 * The buffer is looked up with GFP_NOWAIT | __GFP_NOWARN and only read
 * if its lock can be taken without blocking; any failure along the way
 * is silently ignored.  The read is issued with REQ_RAHEAD and is not
 * waited for.
 */
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
        struct buffer_head *bh;

        bh = bdev_getblk(sb->s_bdev, block, sb->s_blocksize,
                         GFP_NOWAIT | __GFP_NOWARN);

        if (likely(bh)) {
                if (trylock_buffer(bh))
                        ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
                /* Drop the reference obtained from bdev_getblk(). */
                brelse(bh);
        }
}

/*
 * Check the checksum algorithm recorded in superblock @es.
 *
 * Returns 1 when the metadata_csum feature is not set, or when it is set
 * and s_checksum_type equals EXT4_CRC32C_CHKSUM; returns 0 otherwise.
 */
static int ext4_verify_csum_type(struct super_block *sb,
                                 struct ext4_super_block *es)
{
        if (ext4_has_feature_metadata_csum(sb))
                return es->s_checksum_type == EXT4_CRC32C_CHKSUM;

        return 1;
}

/*
 * Compute the checksum of superblock @es.
 *
 * The checksum covers every byte of @es that precedes the s_checksum
 * field, seeded with ~0, and is returned in little-endian form.
 */
__le32 ext4_superblock_csum(struct super_block *sb,
                            struct ext4_super_block *es)
{
        int covered = offsetof(struct ext4_super_block, s_checksum);
        __u32 crc = ext4_chksum(EXT4_SB(sb), ~0, (char *)es, covered);

        return cpu_to_le32(crc);
}

/*
 * Verify the checksum stored in superblock @es.
 *
 * Returns 1 when ext4_has_metadata_csum() is false for @sb, or when the
 * stored s_checksum matches the freshly computed one; 0 on mismatch.
 */
static int ext4_superblock_csum_verify(struct super_block *sb,
                                       struct ext4_super_block *es)
{
        if (ext4_has_metadata_csum(sb))
                return es->s_checksum == ext4_superblock_csum(sb, es);

        return 1;
}

/*
 * Recompute and store s_checksum in the in-memory superblock of @sb.
 * Does nothing when metadata checksums are not in use.
 */
void ext4_superblock_csum_set(struct super_block *sb)
{
        struct ext4_super_block *es;

        if (ext4_has_metadata_csum(sb)) {
                es = EXT4_SB(sb)->s_es;
                es->s_checksum = ext4_superblock_csum(sb, es);
        }
}

ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
                               struct ext4_group_desc *bg)
{
        return le32_to_cpu(bg->bg_block_bitmap_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
}

ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
                               struct ext4_group_desc *bg)
{
        return le32_to_cpu(bg->bg_inode_bitmap_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
}

ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                              struct ext4_group_desc *bg)
{
        return le32_to_cpu(bg->bg_inode_table_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
}

__u32 ext4_free_group_clusters(struct super_block *sb,
                               struct ext4_group_desc *bg)
{
        return le16_to_cpu(bg->bg_free_blocks_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
}

__u32 ext4_free_inodes_count(struct super_block *sb,
                              struct ext4_group_desc *bg)
{
        return le16_to_cpu(bg->bg_free_inodes_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
}

__u32 ext4_used_dirs_count(struct super_block *sb,
                              struct ext4_group_desc *bg)
{
        return le16_to_cpu(bg->bg_used_dirs_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
}

__u32 ext4_itable_unused_count(struct super_block *sb,
                              struct ext4_group_desc *bg)
{
        return le16_to_cpu(bg->bg_itable_unused_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
                 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
}

/*
 * Store @blk as the block bitmap location in group descriptor @bg. The high
 * half is only written when the descriptor size is at least
 * EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_block_bitmap_set(struct super_block *sb,
                           struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
        bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
}

/*
 * Store @blk as the inode bitmap location in group descriptor @bg. The high
 * half is only written when the descriptor size is at least
 * EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_inode_bitmap_set(struct super_block *sb,
                           struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
        bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
}

/*
 * Store @blk as the inode table location in group descriptor @bg. The high
 * half is only written when the descriptor size is at least
 * EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_inode_table_set(struct super_block *sb,
                          struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
        bg->bg_inode_table_lo = cpu_to_le32((u32)blk);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
}

/*
 * Store @count in the free blocks count field of group descriptor @bg. The
 * high 16 bits are only written when the descriptor size is at least
 * EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_free_group_clusters_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, __u32 count)
{
        bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
}

/*
 * Store @count in the free inodes count field of group descriptor @bg. The
 * high 16 bits are only written when the descriptor size is at least
 * EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_free_inodes_set(struct super_block *sb,
                          struct ext4_group_desc *bg, __u32 count)
{
        bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
}

/*
 * Store @count in the used directories count field of group descriptor @bg.
 * The high 16 bits are only written when the descriptor size is at least
 * EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_used_dirs_set(struct super_block *sb,
                          struct ext4_group_desc *bg, __u32 count)
{
        bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
}

/*
 * Store @count in the bg_itable_unused field of group descriptor @bg. The
 * high 16 bits are only written when the descriptor size is at least
 * EXT4_MIN_DESC_SIZE_64BIT.
 */
void ext4_itable_unused_set(struct super_block *sb,
                          struct ext4_group_desc *bg, __u32 count)
{
        bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);

        if (EXT4_DESC_SIZE(sb) < EXT4_MIN_DESC_SIZE_64BIT)
                return;

        bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}

/*
 * Store timestamp @now into a split on-disk field: the low 32 bits go to
 * *@lo (little-endian) and the next 8 bits to *@hi. The value is first
 * clamped to the range [0, 2^40 - 1] so that it fits.
 */
static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now)
{
        time64_t secs = clamp_val(now, 0, (1ull << 40) - 1);

        *lo = cpu_to_le32(lower_32_bits(secs));
        *hi = upper_32_bits(secs);
}

/*
 * Reassemble a timestamp from its split on-disk form: *@hi supplies the bits
 * above bit 31 and *@lo (little-endian) the low 32 bits.
 */
static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
{
        time64_t high = *hi;

        return (high << 32) + le32_to_cpu(*lo);
}
/*
 * ext4_update_tstamp(es, tstamp): set the superblock timestamp made of the
 * fields es->tstamp and es->tstamp_hi to the current real time in seconds.
 */
#define ext4_update_tstamp(es, tstamp) \
        __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \
                             ktime_get_real_seconds())
/*
 * ext4_get_tstamp(es, tstamp): read back the timestamp stored in the fields
 * es->tstamp and es->tstamp_hi.
 */
#define ext4_get_tstamp(es, tstamp) \
        __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)

/* Thresholds used by ext4_maybe_update_superblock(). */
#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */
#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */

/*
 * ext4_maybe_update_superblock() - queue a superblock write-out when one
 * is due.
 *
 * Nothing happens on a read-only or inactive superblock, or when there is
 * no journal or the journal has JBD2_UNMOUNT set. Otherwise s_sb_upd_work
 * is scheduled only when both of the following hold, which keeps superblock
 * writes (and waking the disk from sleep) rare:
 * 1. At least EXT4_SB_REFRESH_INTERVAL_SEC seconds have passed since the
 *    s_wtime timestamp recorded in the superblock, and
 * 2. More than EXT4_SB_REFRESH_INTERVAL_KB kilobytes have been written
 *    since the s_kbytes_written value recorded in the superblock.
 *
 * @sb: The superblock
 */
static void ext4_maybe_update_superblock(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        journal_t *journal = sbi->s_journal;
        __u64 written_kb, pending_kb, last_update;
        time64_t now;

        if (sb_rdonly(sb))
                return;
        if (!(sb->s_flags & SB_ACTIVE))
                return;
        if (!journal)
                return;
        if (journal->j_flags & JBD2_UNMOUNT)
                return;

        now = ktime_get_real_seconds();
        last_update = ext4_get_tstamp(es, s_wtime);
        if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC))
                return;

        /*
         * Lifetime total: the in-memory s_kbytes_written plus the sectors
         * written since s_sectors_written_start, halved to turn the sector
         * count into kilobytes.
         */
        written_kb = sbi->s_kbytes_written +
                ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
                  sbi->s_sectors_written_start) >> 1);

        /*
         * Kilobytes written that the on-disk statistic does not cover yet.
         * A superblock commit is only requested once this exceeds 16MB,
         * i.e. not more often than once per 16MB written.
         */
        pending_kb = written_kb - le64_to_cpu(es->s_kbytes_written);
        if (pending_kb <= EXT4_SB_REFRESH_INTERVAL_KB)
                return;

        schedule_work(&sbi->s_sb_upd_work);
}

/*
 * Per-transaction callback: processes the data freed by transaction @txn,
 * gives the superblock a chance to be refreshed, and then runs every
 * ext4_journal_cb_entry queued on @txn's private list.
 *
 * Each entry's jce_func() receives the journal's aborted state as its error
 * argument. The function BUG()s if @txn is already in state T_FINISHED.
 */
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
        struct super_block                *sb = journal->j_private;
        struct ext4_sb_info                *sbi = EXT4_SB(sb);
        int                                error = is_journal_aborted(journal);
        struct ext4_journal_cb_entry        *jce;

        BUG_ON(txn->t_state == T_FINISHED);

        ext4_process_freed_data(sb, txn->t_tid);
        ext4_maybe_update_superblock(sb);

        /*
         * Entries are unlinked under s_md_lock, but the lock is dropped
         * while each callback runs and retaken before the list is examined
         * again.
         */
        spin_lock(&sbi->s_md_lock);
        while (!list_empty(&txn->t_private_list)) {
                jce = list_entry(txn->t_private_list.next,
                                 struct ext4_journal_cb_entry, jce_list);
                list_del_init(&jce->jce_list);
                spin_unlock(&sbi->s_md_lock);
                jce->jce_func(sb, jce, error);
                spin_lock(&sbi->s_md_lock);
        }
        spin_unlock(&sbi->s_md_lock);
}

/*
 * This writepage callback for write_cache_pages()
 * takes care of a few cases after page cleaning.
 *
 * write_cache_pages() already checks for dirty pages
 * and calls clear_page_dirty_for_io(), which we want,
 * to write protect the pages.
 *
 * However, we may have to redirty a page (see below.)
 *
 * @folio: the folio being processed; its buffers are walked once
 * @wbc:   writeback control, passed on to folio_redirty_for_writepage()
 * @data:  the transaction_t the buffers are compared against
 *
 * Always returns AOP_WRITEPAGE_ACTIVATE.
 */
static int ext4_journalled_writepage_callback(struct folio *folio,
                                              struct writeback_control *wbc,
                                              void *data)
{
        transaction_t *transaction = (transaction_t *) data;
        struct buffer_head *bh, *head;
        struct journal_head *jh;

        bh = head = folio_buffers(folio);
        do {
                /*
                 * We have to redirty a page in these cases:
                 * 1) If buffer is dirty, it means the page was dirty because it
                 * contains a buffer that needs checkpointing. So the dirty bit
                 * needs to be preserved so that checkpointing writes the buffer
                 * properly.
                 * 2) If buffer is not part of the committing transaction
                 * (we may have just accidentally come across this buffer because
                 * inode range tracking is not exact) or if the currently running
                 * transaction already contains this buffer as well, dirty bit
                 * needs to be preserved so that the buffer gets writeprotected
                 * properly on running transaction's commit.
                 */
                jh = bh2jh(bh);
                if (buffer_dirty(bh) ||
                    (jh && (jh->b_transaction != transaction ||
                            jh->b_next_transaction))) {
                        /* One such buffer is enough; stop scanning. */
                        folio_redirty_for_writepage(wbc, folio);
                        goto out;
                }
        } while ((bh = bh->b_this_page) != head);

out:
        return AOP_WRITEPAGE_ACTIVATE;
}

/*
 * Run write_cache_pages() with ext4_journalled_writepage_callback() over the
 * dirty range [i_dirty_start, i_dirty_end] recorded in @jinode, using
 * WB_SYNC_ALL and no page limit. The jinode's transaction is handed to the
 * callback as its data argument.
 */
static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
        struct writeback_control wbc = {
                .range_start = jinode->i_dirty_start,
                .range_end = jinode->i_dirty_end,
                .nr_to_write = LONG_MAX,
                .sync_mode =  WB_SYNC_ALL,
        };
        struct address_space *mapping = jinode->i_vfs_inode->i_mapping;

        return write_cache_pages(mapping, &wbc,
                                 ext4_journalled_writepage_callback,
                                 jinode->i_transaction);
}

/*
 * Submit the data buffers of @jinode, choosing the journalled-data variant
 * when ext4_should_journal_data() says so and the normal variant otherwise.
 * Returns whatever the chosen helper returns.
 */
static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
        if (ext4_should_journal_data(jinode->i_vfs_inode))
                return ext4_journalled_submit_inode_data_buffers(jinode);

        return ext4_normal_submit_inode_data_buffers(jinode);
}

/*
 * Finish the data buffers of @jinode. For inodes with journalled data there
 * is nothing to do and 0 is returned; otherwise the result of
 * jbd2_journal_finish_inode_data_buffers() is returned.
 */
static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
        if (ext4_should_journal_data(jinode->i_vfs_inode))
                return 0;

        return jbd2_journal_finish_inode_data_buffers(jinode);
}

/*
 * Return true when system_state is SYSTEM_HALT, SYSTEM_POWER_OFF or
 * SYSTEM_RESTART.
 */
static bool system_going_down(void)
{
        switch (system_state) {
        case SYSTEM_HALT:
        case SYSTEM_POWER_OFF:
        case SYSTEM_RESTART:
                return true;
        default:
                return false;
        }
}

/* One table entry: pairs an errno value (errno) with an EXT4_ERR_* code (code). */
struct ext4_err_translation {
        int code;
        int errno;
};

/* Build the entry that pairs EXT4_ERR_<err> with the errno constant <err>. */
#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err }

/*
 * Errno values that have a dedicated EXT4_ERR_* code. The table is searched
 * linearly by ext4_errno_to_code().
 */
static struct ext4_err_translation err_translation[] = {
        EXT4_ERR_TRANSLATE(EIO),
        EXT4_ERR_TRANSLATE(ENOMEM),
        EXT4_ERR_TRANSLATE(EFSBADCRC),
        EXT4_ERR_TRANSLATE(EFSCORRUPTED),
        EXT4_ERR_TRANSLATE(ENOSPC),
        EXT4_ERR_TRANSLATE(ENOKEY),
        EXT4_ERR_TRANSLATE(EROFS),
        EXT4_ERR_TRANSLATE(EFBIG),
        EXT4_ERR_TRANSLATE(EEXIST),
        EXT4_ERR_TRANSLATE(ERANGE),
        EXT4_ERR_TRANSLATE(EOVERFLOW),
        EXT4_ERR_TRANSLATE(EBUSY),
        EXT4_ERR_TRANSLATE(ENOTDIR),
        EXT4_ERR_TRANSLATE(ENOTEMPTY),
        EXT4_ERR_TRANSLATE(ESHUTDOWN),
        EXT4_ERR_TRANSLATE(EFAULT),
};

static int ext4_errno_to_code(int errno)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(err_translation); i++)
                if (err_translation[i].errno == errno)
                        return err_translation[i].code;
        return EXT4_ERR_UNKNOWN;
}

/*
 * Record an error in the in-memory superblock info, under s_error_lock.
 *
 * The pending error counter is bumped and the "last error" fields are always
 * overwritten. The "first error" fields are filled in only while
 * s_first_error_time is still zero. An @error of 0 is recorded as
 * EFSCORRUPTED.
 */
static void save_error_info(struct super_block *sb, int error,
                            __u32 ino, __u64 block,
                            const char *func, unsigned int line)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        const int code = error ? error : EFSCORRUPTED;

        spin_lock(&sbi->s_error_lock);

        sbi->s_add_error_count++;

        sbi->s_last_error_code = code;
        sbi->s_last_error_line = line;
        sbi->s_last_error_ino = ino;
        sbi->s_last_error_block = block;
        sbi->s_last_error_func = func;
        sbi->s_last_error_time = ktime_get_real_seconds();

        if (sbi->s_first_error_time == 0) {
                sbi->s_first_error_code = code;
                sbi->s_first_error_line = line;
                sbi->s_first_error_ino = ino;
                sbi->s_first_error_block = block;
                sbi->s_first_error_func = func;
                sbi->s_first_error_time = sbi->s_last_error_time;
        }

        spin_unlock(&sbi->s_error_lock);
}

/* Deal with the reporting of failure conditions on a filesystem such as
 * inconsistencies detected or read IO failures.
 *
 * On ext2, we can store the error state of the filesystem in the
 * superblock.  That is not possible on ext4, because we may have other
 * write ordering constraints on the superblock which prevent us from
 * writing it out straight away; and given that the journal is about to
 * be aborted, we can't rely on the current, or future, transactions to
 * write out the superblock safely.
 *
 * We'll just use the jbd2_journal_abort() error code to record an error in
 * the journal instead.  On recovery, the journal will complain about
 * that error until we've noted it down and cleared it.
 *
 * If force_ro is set, we unconditionally force the filesystem into an
 * ABORT|READONLY state, unless the error response on the fs has been set to
 * panic in which case we take the easy way out and panic immediately. This is
 * used to deal with unrecoverable failures such as journal IO errors or ENOMEM
 * at a critical moment in log management.
 *
 * @error, @ino, @block, @func and @line are only recorded: they are passed
 * unchanged to save_error_info().
 */
static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
                              __u32 ino, __u64 block,
                              const char *func, unsigned int line)
{
        journal_t *journal = EXT4_SB(sb)->s_journal;
        /* Keep running only if not forced read-only and errors=continue. */
        bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT);

        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
        if (test_opt(sb, WARN_ON_ERROR))
                WARN_ON_ONCE(1);

        /* Not continuing: flag the shutdown and abort the journal, if any. */
        if (!continue_fs && !sb_rdonly(sb)) {
                set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
                if (journal)
                        jbd2_journal_abort(journal, -EIO);
        }

        if (!bdev_read_only(sb->s_bdev)) {
                save_error_info(sb, error, ino, block, func, line);
                /*
                 * In case the fs should keep running, we need to writeout
                 * superblock through the journal. Due to lock ordering
                 * constraints, it may not be safe to do it right here so we
                 * defer superblock flushing to a workqueue.
                 */
                if (continue_fs && journal)
                        schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
                else
                        ext4_commit_super(sb);
        }

        /*
         * We force ERRORS_RO behavior when system is rebooting. Otherwise we
         * could panic during 'reboot -f' as the underlying device got already
         * disabled.
         */
        if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
                panic("EXT4-fs (device %s): panic forced after error\n",
                        sb->s_id);
        }

        if (sb_rdonly(sb) || continue_fs)
                return;

        ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
        /*
         * Make sure updated value of ->s_mount_flags will be visible before
         * ->s_flags update
         */
        smp_wmb();
        sb->s_flags |= SB_RDONLY;
}

/*
 * Work item behind s_sb_upd_work: write the superblock out.
 *
 * On a writable filesystem with a journal, the update goes through a
 * one-credit jbd2 handle. If the handle cannot be started, write access
 * cannot be obtained, or dirtying the buffer fails, the superblock is
 * written directly with ext4_commit_super() instead. The direct path is
 * also taken when the filesystem is read-only or has no journal.
 */
static void update_super_work(struct work_struct *work)
{
        struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
                                                s_sb_upd_work);
        journal_t *journal = sbi->s_journal;
        handle_t *handle;

        /*
         * If the journal is still running, we have to write out superblock
         * through the journal to avoid collisions of other journalled sb
         * updates.
         *
         * We use directly jbd2 functions here to avoid recursing back into
         * ext4 error handling code during handling of previous errors.
         */
        if (!sb_rdonly(sbi->s_sb) && journal) {
                struct buffer_head *sbh = sbi->s_sbh;
                bool call_notify_err = false;

                handle = jbd2_journal_start(journal, 1);
                if (IS_ERR(handle))
                        goto write_directly;
                if (jbd2_journal_get_write_access(handle, sbh)) {
                        jbd2_journal_stop(handle);
                        goto write_directly;
                }

                /*
                 * Sample the pending error count before ext4_update_super()
                 * runs; sysfs is only notified on this path when errors
                 * were pending.
                 */
                if (sbi->s_add_error_count > 0)
                        call_notify_err = true;

                ext4_update_super(sbi->s_sb);
                if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
                        ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
                                 "superblock detected");
                        clear_buffer_write_io_error(sbh);
                        set_buffer_uptodate(sbh);
                }

                if (jbd2_journal_dirty_metadata(handle, sbh)) {
                        jbd2_journal_stop(handle);
                        goto write_directly;
                }
                jbd2_journal_stop(handle);

                if (call_notify_err)
                        ext4_notify_error_sysfs(sbi);

                return;
        }
write_directly:
        /*
         * Write through journal failed. Write sb directly to get error info
         * out and hope for the best.
         */
        ext4_commit_super(sbi->s_sb);
        ext4_notify_error_sysfs(sbi);
}

/*
 * Evaluates ___ratelimit() on the per-superblock s_err_ratelimit_state; the
 * error reporters in this file print their message only when the result is
 * non-zero.
 */
#define ext4_error_ratelimit(sb)                                        \
                ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),        \
                             "EXT4-fs error")

/*
 * Report a filesystem error that is not tied to a particular inode.
 *
 * Does nothing after a forced shutdown. Otherwise the error is traced,
 * printed at KERN_CRIT (subject to the error ratelimit), sent to fsnotify
 * (an @error of 0 is reported as EFSCORRUPTED) and finally passed to
 * ext4_handle_error() together with @force_ro, @block, @function and @line.
 */
void __ext4_error(struct super_block *sb, const char *function,
                  unsigned int line, bool force_ro, int error, __u64 block,
                  const char *fmt, ...)
{
        if (unlikely(ext4_forced_shutdown(sb)))
                return;

        trace_ext4_error(sb, function, line);

        if (ext4_error_ratelimit(sb)) {
                struct va_format vaf;
                va_list ap;

                va_start(ap, fmt);
                vaf.fmt = fmt;
                vaf.va = &ap;
                printk(KERN_CRIT
                       "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
                       sb->s_id, function, line, current->comm, &vaf);
                va_end(ap);
        }

        fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED);
        ext4_handle_error(sb, force_ro, error, 0, block, function, line);
}

/*
 * Report a filesystem error attributed to @inode.
 *
 * Does nothing after a forced shutdown. The message includes the inode
 * number and, when @block is non-zero, the block number. The error is then
 * sent to fsnotify (an @error of 0 is reported as EFSCORRUPTED) and handed
 * to ext4_handle_error() without forcing read-only.
 */
void __ext4_error_inode(struct inode *inode, const char *function,
                        unsigned int line, ext4_fsblk_t block, int error,
                        const char *fmt, ...)
{
        struct super_block *sb = inode->i_sb;

        if (unlikely(ext4_forced_shutdown(sb)))
                return;

        trace_ext4_error(sb, function, line);

        if (ext4_error_ratelimit(sb)) {
                struct va_format vaf;
                va_list ap;

                va_start(ap, fmt);
                vaf.fmt = fmt;
                vaf.va = &ap;
                if (block)
                        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
                               "inode #%lu: block %llu: comm %s: %pV\n",
                               sb->s_id, function, line, inode->i_ino,
                               block, current->comm, &vaf);
                else
                        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
                               "inode #%lu: comm %s: %pV\n",
                               sb->s_id, function, line, inode->i_ino,
                               current->comm, &vaf);
                va_end(ap);
        }

        fsnotify_sb_error(sb, inode, error ? error : EFSCORRUPTED);
        ext4_handle_error(sb, false, error, inode->i_ino, block,
                          function, line);
}

/*
 * Report a filesystem error attributed to the inode behind @file.
 *
 * Same flow as __ext4_error_inode(), but the message also carries the file's
 * path (resolved into an 80-byte buffer, or "(unknown)" when file_path()
 * fails) and the error code is always EFSCORRUPTED.
 */
void __ext4_error_file(struct file *file, const char *function,
                       unsigned int line, ext4_fsblk_t block,
                       const char *fmt, ...)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;

        if (unlikely(ext4_forced_shutdown(sb)))
                return;

        trace_ext4_error(sb, function, line);

        if (ext4_error_ratelimit(sb)) {
                char pathname[80], *path;
                struct va_format vaf;
                va_list ap;

                path = file_path(file, pathname, sizeof(pathname));
                if (IS_ERR(path))
                        path = "(unknown)";

                va_start(ap, fmt);
                vaf.fmt = fmt;
                vaf.va = &ap;
                if (block)
                        printk(KERN_CRIT
                               "EXT4-fs error (device %s): %s:%d: inode #%lu: "
                               "block %llu: comm %s: path %s: %pV\n",
                               sb->s_id, function, line, inode->i_ino,
                               block, current->comm, path, &vaf);
                else
                        printk(KERN_CRIT
                               "EXT4-fs error (device %s): %s:%d: inode #%lu: "
                               "comm %s: path %s: %pV\n",
                               sb->s_id, function, line, inode->i_ino,
                               current->comm, path, &vaf);
                va_end(ap);
        }

        fsnotify_sb_error(sb, inode, EFSCORRUPTED);
        ext4_handle_error(sb, false, EFSCORRUPTED, inode->i_ino, block,
                          function, line);
}

/*
 * Turn a negative errno value into a human-readable string.
 *
 * A handful of codes have fixed strings. For -EROFS the string depends on
 * whether @sb is NULL or its journal has JBD2_ABORT set. Any other code is
 * formatted as "error <n>" into @nbuf when the caller supplied one;
 * without @nbuf, NULL is returned.
 */
const char *ext4_decode_error(struct super_block *sb, int errno,
                              char nbuf[16])
{
        switch (errno) {
        case -EFSCORRUPTED:
                return "Corrupt filesystem";
        case -EFSBADCRC:
                return "Filesystem failed CRC";
        case -EIO:
                return "IO failure";
        case -ENOMEM:
                return "Out of memory";
        case -EROFS:
                if (!sb)
                        return "Journal has aborted";
                if (EXT4_SB(sb)->s_journal &&
                    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
                        return "Journal has aborted";
                return "Readonly filesystem";
        default:
                break;
        }

        /* Unknown code: textualise it if the caller gave us a buffer. */
        if (nbuf && snprintf(nbuf, 16, "error %d", -errno) >= 0)
                return nbuf;

        return NULL;
}

/*
 * __ext4_std_error decodes expected errors from journaling functions
 * automatically and invokes the appropriate error response.
 *
 * @sb:       superblock the error belongs to
 * @function: name of the reporting function, for the log message
 * @line:     source line of the report, for the log message
 * @errno:    negative errno value (it is compared against -EROFS and
 *            decoded by ext4_decode_error() in that form)
 *
 * Does nothing after a forced shutdown.
 */
void __ext4_std_error(struct super_block *sb, const char *function,
                      unsigned int line, int errno)
{
        char nbuf[16];
        const char *errstr;

        if (unlikely(ext4_forced_shutdown(sb)))
                return;

        /* Special case: if the error is EROFS, and we're not already
         * inside a transaction, then there's really no point in logging
         * an error. */
        if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb))
                return;

        if (ext4_error_ratelimit(sb)) {
                errstr = ext4_decode_error(sb, errno, nbuf);
                printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
                       sb->s_id, function, line, errstr);
        }
        /*
         * Report a positive error code to fsnotify, as the other ext4 error
         * reporters do. @errno is negative here, so it is negated just like
         * in the ext4_handle_error() call below.
         */
        fsnotify_sb_error(sb, NULL, errno ? -errno : EFSCORRUPTED);

        ext4_handle_error(sb, false, -errno, 0, 0, function, line);
}

void __ext4_msg(struct super_block *sb,
                const char *prefix, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        if (sb) {
                atomic_inc(&EXT4_SB(sb)->s_msg_count);
                if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state),
                                  "EXT4-fs"))
                        return;
        }

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        if (sb)
                printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
        else
                printk("%sEXT4-fs: %pV\n", prefix, &vaf);
        va_end(args);
}

static int ext4_warning_ratelimit(struct super_block *sb)
{
        atomic_inc(&EXT4_SB(sb)->s_warning_count);
        return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
                            "EXT4-fs warning");
}

/*
 * Print a KERN_WARNING message for @sb, tagged with the reporting function
 * and line. The warning is always counted, but only printed when
 * ext4_warning_ratelimit() returns non-zero.
 */
void __ext4_warning(struct super_block *sb, const char *function,
                    unsigned int line, const char *fmt, ...)
{
        if (ext4_warning_ratelimit(sb)) {
                struct va_format vaf;
                va_list ap;

                va_start(ap, fmt);
                vaf.fmt = fmt;
                vaf.va = &ap;
                printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
                       sb->s_id, function, line, &vaf);
                va_end(ap);
        }
}

/*
 * Print a KERN_WARNING message about @inode, tagged with the reporting
 * function and line, the inode number and the current task's comm. The
 * warning is always counted, but only printed when
 * ext4_warning_ratelimit() returns non-zero.
 */
void __ext4_warning_inode(const struct inode *inode, const char *function,
                          unsigned int line, const char *fmt, ...)
{
        if (ext4_warning_ratelimit(inode->i_sb)) {
                struct va_format vaf;
                va_list ap;

                va_start(ap, fmt);
                vaf.fmt = fmt;
                vaf.va = &ap;
                printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
                       "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
                       function, line, inode->i_ino, current->comm, &vaf);
                va_end(ap);
        }
}

/*
 * Report an error for block group @grp. The __releases/__acquires
 * annotations cover the group lock: unless ERRORS_CONT is set, the lock is
 * dropped around ext4_handle_error() and retaken before returning.
 *
 * Does nothing after a forced shutdown. @ino and @block are included in the
 * message only when non-zero. With ERRORS_CONT the error is recorded and a
 * superblock update is scheduled, without touching the group lock.
 */
void __ext4_grp_locked_error(const char *function, unsigned int line,
                             struct super_block *sb, ext4_group_t grp,
                             unsigned long ino, ext4_fsblk_t block,
                             const char *fmt, ...)
__releases(bitlock)
__acquires(bitlock)
{
        struct va_format vaf;
        va_list args;

        if (unlikely(ext4_forced_shutdown(sb)))
                return;

        trace_ext4_error(sb, function, line);
        if (ext4_error_ratelimit(sb)) {
                va_start(args, fmt);
                vaf.fmt = fmt;
                vaf.va = &args;
                printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
                       sb->s_id, function, line, grp);
                if (ino)
                        printk(KERN_CONT "inode %lu: ", ino);
                if (block)
                        printk(KERN_CONT "block %llu:",
                               (unsigned long long) block);
                printk(KERN_CONT "%pV\n", &vaf);
                va_end(args);
        }

        if (test_opt(sb, ERRORS_CONT)) {
                if (test_opt(sb, WARN_ON_ERROR))
                        WARN_ON_ONCE(1);
                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                if (!bdev_read_only(sb->s_bdev)) {
                        save_error_info(sb, EFSCORRUPTED, ino, block, function,
                                        line);
                        schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
                }
                return;
        }
        ext4_unlock_group(sb, grp);
        ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line);
        /*
         * We only get here when ERRORS_CONT is not set; relocking the group
         * may be dangerous, but nothing bad will happen since the
         * filesystem will have already been marked read/only and the
         * journal has been aborted.  This function returns void, so
         * callers cannot use a return value to distinguish the
         * ERRORS_CONT case from this one.
         */
        ext4_lock_group(sb, grp);
        return;
}

/*
 * Mark the block and/or inode bitmap of @group as corrupted, as selected by
 * @flags (EXT4_GROUP_INFO_BBITMAP_CORRUPT, EXT4_GROUP_INFO_IBITMAP_CORRUPT).
 *
 * The first time a corruption bit is set in the group's bb_state, the
 * group's free clusters (bb_free) or free inodes (from the group
 * descriptor) are subtracted from the matching per-cpu counter. Nothing is
 * done when the group info or the group descriptor cannot be found.
 */
void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
                                     ext4_group_t group,
                                     unsigned int flags)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);

        if (!grp || !gdp)
                return;

        if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) &&
            !ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
                                   &grp->bb_state))
                percpu_counter_sub(&sbi->s_freeclusters_counter,
                                   grp->bb_free);

        if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) &&
            !ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
                                   &grp->bb_state)) {
                int count = ext4_free_inodes_count(sb, gdp);

                percpu_counter_sub(&sbi->s_freeinodes_counter, count);
        }
}

void ext4_update_dynamic_rev(struct super_block *sb)
{
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;

        if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
                return;

        ext4_warning(sb,
                     "updating to rev %d because of new feature flag, "
                     "running e2fsck is recommended",
                     EXT4_DYNAMIC_REV);

        es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
        es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
        es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
        /* leave es->s_feature_*compat flags alone */
        /* es->s_uuid will be set by e2fsck if empty */

        /*
         * The rest of the superblock fields should be zero, and if not it
         * means they are likely already in use, so leave them alone.  We
         * can leave it up to e2fsck to clean up any inconsistencies there.
         */
}

static inline struct inode *orphan_list_entry(struct list_head *l)
{
        return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
}

static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
{
        struct list_head *l;

        ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
                 le32_to_cpu(sbi->s_es->s_last_orphan));

        printk(KERN_ERR "sb_info orphan list:\n");
        list_for_each(l, &sbi->s_orphan) {
                struct inode *inode = orphan_list_entry(l);
                printk(KERN_ERR "  "
                       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
                       inode->i_sb->s_id, inode->i_ino, inode,
                       inode->i_mode, inode->i_nlink,
                       NEXT_ORPHAN(inode));
        }
}

#ifdef CONFIG_QUOTA
static int ext4_quota_off(struct super_block *sb, int type);

/*
 * Turn off quota types [0, type) in descending order, i.e. type - 1 first
 * and 0 last.  @type must not exceed EXT4_MAXQUOTAS.
 */
static inline void ext4_quotas_off(struct super_block *sb, int type)
{
        BUG_ON(type > EXT4_MAXQUOTAS);

        /* Go through ext4_quota_off() so that inode flags etc. get cleared. */
        while (type-- > 0)
                ext4_quota_off(sb, type);
}

/*
 * Fetch the quota file name recorded for quota type @type.
 *
 * Intended for the mount/remount code paths, which hold sb->s_umount; that
 * lock is what the RCU-protected dereference is checked against.
 */
static inline char *get_qf_name(struct super_block *sb,
                                struct ext4_sb_info *sbi,
                                int type)
{
        char *qf_name;

        qf_name = rcu_dereference_protected(sbi->s_qf_names[type],
                                            lockdep_is_held(&sb->s_umount));
        return qf_name;
}
#else
/* !CONFIG_QUOTA variant: an empty stub so callers need no #ifdef. */
static inline void ext4_quotas_off(struct super_block *sb, int type)
{
}
#endif

/*
 * Set up the per-cpu counters and the writepages rwsem kept in @sbi.
 *
 * The free-cluster and free-inode totals are recounted first and also
 * stored back into the in-memory superblock (sbi->s_es) before seeding the
 * matching counters.  Initialisation stops at the first failure; in that
 * case "insufficient memory" is logged and the error is returned.  Objects
 * initialised before the failing step are not torn down here.
 *
 * Returns 0 on success or the error from the failing init call.
 */
static int ext4_percpu_param_init(struct ext4_sb_info *sbi)
{
        ext4_fsblk_t free_clusters;
        unsigned long free_inodes;
        int err;

        free_clusters = ext4_count_free_clusters(sbi->s_sb);
        ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, free_clusters));
        err = percpu_counter_init(&sbi->s_freeclusters_counter, free_clusters,
                                  GFP_KERNEL);
        if (err)
                goto fail;

        free_inodes = ext4_count_free_inodes(sbi->s_sb);
        sbi->s_es->s_free_inodes_count = cpu_to_le32(free_inodes);
        err = percpu_counter_init(&sbi->s_freeinodes_counter, free_inodes,
                                  GFP_KERNEL);
        if (err)
                goto fail;

        err = percpu_counter_init(&sbi->s_dirs_counter,
                                  ext4_count_dirs(sbi->s_sb), GFP_KERNEL);
        if (err)
                goto fail;

        err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
                                  GFP_KERNEL);
        if (err)
                goto fail;

        err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
                                  GFP_KERNEL);
        if (err)
                goto fail;

        err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
        if (err)
                goto fail;

        return 0;

fail:
        ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory");
        return err;
}

/*
 * Release everything ext4_percpu_param_init() sets up: the five per-cpu
 * counters (in the order they were initialised) and then the writepages
 * rwsem.
 */
static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi)
{
        struct percpu_counter *counters[] = {
                &sbi->s_freeclusters_counter,
                &sbi->s_freeinodes_counter,
                &sbi->s_dirs_counter,
                &sbi->s_dirtyclusters_counter,
                &sbi->s_sra_exceeded_retry_limit,
        };
        int i;

        for (i = 0; i < ARRAY_SIZE(counters); i++)
                percpu_counter_destroy(counters[i]);

        percpu_free_rwsem(&sbi->s_writepages_rwsem);
}

/*
 * Drop the buffer_head reference for each of the sbi->s_gdb_count group
 * descriptor blocks, then free the pointer array itself.  The array is
 * read and freed inside an RCU read-side section.
 */
static void ext4_group_desc_free(struct ext4_sb_info *sbi)
{
        struct buffer_head **gdb_bhs;
        int idx = 0;

        rcu_read_lock();
        gdb_bhs = rcu_dereference(sbi->s_group_desc);
        while (idx < sbi->s_gdb_count)
                brelse(gdb_bhs[idx++]);
        kvfree(gdb_bhs);
        rcu_read_unlock();
}

/*
 * Free each of the sbi->s_flex_groups_allocated flex group entries and then
 * the pointer array holding them.  A NULL array is tolerated and skipped.
 */
static void ext4_flex_groups_free(struct ext4_sb_info *sbi)
{
        struct flex_groups **fg;
        int n;

        rcu_read_lock();
        fg = rcu_dereference(sbi->s_flex_groups);
        if (!fg)
                goto unlock;

        for (n = 0; n < sbi->s_flex_groups_allocated; n++)
                kvfree(fg[n]);
        kvfree(fg);
unlock:
        rcu_read_unlock();
}

/*
 * ->put_super callback (installed in ext4_sops): tear down all per-mount
 * state hanging off @sb and free the ext4_sb_info.
 *
 * In order: sysfs is unregistered, quotas are turned off, pending
 * superblock update work is flushed, the journal (if any) is destroyed,
 * allocator/extent state is released, the superblock is committed when
 * the filesystem is not read-only, in-memory tables and counters are
 * freed, block device caches are synced and invalidated, and finally the
 * kobject and @sbi itself are released.  The statement order matters;
 * see the comments on the individual steps.
 */
static void ext4_put_super(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        int aborted = 0;
        int err;

        /*
         * Unregister sysfs before destroying the jbd2 journal, since the
         * attr_journal_task attribute could otherwise still be read via
         * sysfs while sbi->s_journal->j_task is NULL.
         *
         * Unregister sysfs before flushing sbi->s_sb_upd_work as well.
         * A user may read /proc/fs/ext4/xx/mb_groups during umount; if
         * metadata verification fails on that read, error work is queued,
         * and update_super_work calling start_this_handle may then
         * trigger a BUG_ON.
         */
        ext4_unregister_sysfs(sb);

        if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
                ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.",
                         &sb->s_uuid);

        ext4_unregister_li_request(sb);
        ext4_quotas_off(sb, EXT4_MAXQUOTAS);

        flush_work(&sbi->s_sb_upd_work);
        destroy_workqueue(sbi->rsv_conversion_wq);
        ext4_release_orphan_info(sb);

        if (sbi->s_journal) {
                /* Sample the abort state before the journal goes away. */
                aborted = is_journal_aborted(sbi->s_journal);
                err = jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
                /* Only report a destroy failure if it was not already aborted. */
                if ((err < 0) && !aborted) {
                        ext4_abort(sb, -err, "Couldn't clean up the journal");
                }
        }

        ext4_es_unregister_shrinker(sbi);
        timer_shutdown_sync(&sbi->s_err_report);
        ext4_release_system_zone(sb);
        ext4_mb_release(sb);
        ext4_ext_release(sb);

        /*
         * On a clean (writable, non-aborted) unmount, clear the
         * needs_recovery/orphan_present features and restore the saved
         * mount state before the final superblock commit.
         */
        if (!sb_rdonly(sb) && !aborted) {
                ext4_clear_feature_journal_needs_recovery(sb);
                ext4_clear_feature_orphan_present(sb);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
        }
        if (!sb_rdonly(sb))
                ext4_commit_super(sb);

        ext4_group_desc_free(sbi);
        ext4_flex_groups_free(sbi);
        ext4_percpu_param_destroy(sbi);
#ifdef CONFIG_QUOTA
        for (int i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(get_qf_name(sb, sbi, i));
#endif

        /* Debugging code just in case the in-memory inode orphan list
         * isn't empty.  The on-disk one can be non-empty if we've
         * detected an error and taken the fs readonly, but the
         * in-memory list had better be clean by this point. */
        if (!list_empty(&sbi->s_orphan))
                dump_orphan_list(sb, sbi);
        ASSERT(list_empty(&sbi->s_orphan));

        sync_blockdev(sb->s_bdev);
        invalidate_bdev(sb->s_bdev);
        if (sbi->s_journal_bdev_file) {
                /*
                 * Invalidate the journal device's buffers.  We don't want them
                 * floating about in memory - the physical journal device may
                 * be hotswapped, and it breaks the `ro-after' testing code.
                 */
                sync_blockdev(file_bdev(sbi->s_journal_bdev_file));
                invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
        }

        ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
        sbi->s_ea_inode_cache = NULL;

        ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
        sbi->s_ea_block_cache = NULL;

        ext4_stop_mmpd(sbi);

        brelse(sbi->s_sbh);
        sb->s_fs_info = NULL;
        /*
         * Now that we are completely done shutting down the
         * superblock, we need to actually destroy the kobject.
         */
        kobject_put(&sbi->s_kobj);
        /* Wait for the kobject release to signal before freeing sbi below. */
        wait_for_completion(&sbi->s_kobj_unregister);
        if (sbi->s_chksum_driver)
                crypto_free_shash(sbi->s_chksum_driver);
        kfree(sbi->s_blockgroup_lock);
        fs_put_dax(sbi->s_daxdev, NULL);
        fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#if IS_ENABLED(CONFIG_UNICODE)
        utf8_unload(sb->s_encoding);
#endif
        kfree(sbi);
}

/*
 * Slab cache backing struct ext4_inode_info; created in init_inodecache()
 * and destroyed in destroy_inodecache().
 */
static struct kmem_cache *ext4_inode_cachep;

/*
 * ->alloc_inode callback: allocate an ext4_inode_info from
 * ext4_inode_cachep and reset its per-use fields.
 *
 * Called inside transaction, so use GFP_NOFS
 *
 * Fields set up once per slab object (i_orphan, xattr_sem, i_data_sem,
 * the embedded VFS inode) are handled by init_once() and are not touched
 * here.
 *
 * Returns the embedded VFS inode, or NULL if the allocation failed.
 */
static struct inode *ext4_alloc_inode(struct super_block *sb)
{
        struct ext4_inode_info *ei;

        ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;

        inode_set_iversion(&ei->vfs_inode, 1);
        ei->i_flags = 0;
        spin_lock_init(&ei->i_raw_lock);
        /* Preallocation state. */
        ei->i_prealloc_node = RB_ROOT;
        atomic_set(&ei->i_prealloc_active, 0);
        rwlock_init(&ei->i_prealloc_lock);
        /* Extent status tree and its shrinker accounting. */
        ext4_es_init_tree(&ei->i_es_tree);
        rwlock_init(&ei->i_es_lock);
        INIT_LIST_HEAD(&ei->i_es_list);
        ei->i_es_all_nr = 0;
        ei->i_es_shk_nr = 0;
        ei->i_es_shrink_lblk = 0;
        /* Block reservation accounting. */
        ei->i_reserved_data_blocks = 0;
        spin_lock_init(&(ei->i_block_reservation_lock));
        ext4_init_pending_tree(&ei->i_pending_tree);
#ifdef CONFIG_QUOTA
        ei->i_reserved_quota = 0;
        memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
#endif
        ei->jinode = NULL;
        INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
        spin_lock_init(&ei->i_completed_io_lock);
        ei->i_sync_tid = 0;
        ei->i_datasync_tid = 0;
        atomic_set(&ei->i_unwritten, 0);
        INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
        /* Fast-commit state. */
        ext4_fc_init_inode(&ei->vfs_inode);
        mutex_init(&ei->i_fc_lock);
        return &ei->vfs_inode;
}

/*
 * ->drop_inode callback.  Returns the generic decision when it is non-zero;
 * otherwise defers to fscrypt_drop_inode().  The result is traced before
 * being returned.
 */
static int ext4_drop_inode(struct inode *inode)
{
        int drop;

        /* fscrypt is only consulted when the generic check returns 0. */
        drop = generic_drop_inode(inode) ?: fscrypt_drop_inode(inode);

        trace_ext4_drop_inode(inode, drop);
        return drop;
}

/*
 * ->free_inode callback: release fscrypt state attached to @inode and hand
 * the containing ext4_inode_info back to ext4_inode_cachep.
 *
 * An inode that is still linked on a fast-commit list at this point is
 * reported with a warning, but it is freed regardless.
 */
static void ext4_free_in_core_inode(struct inode *inode)
{
        fscrypt_free_inode(inode);
        if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
                /*
                 * i_ino is unsigned long: print it with %lu (as the other
                 * messages in this file do) so large inode numbers are not
                 * shown as negative, and terminate the line.
                 */
                pr_warn("%s: inode %lu still in fc list\n",
                        __func__, inode->i_ino);
        }
        kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
}

static void ext4_destroy_inode(struct inode *inode)
{
        if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
                ext4_msg(inode->i_sb, KERN_ERR,
                         "Inode %lu (%p): orphan list check failed!",
                         inode->i_ino, EXT4_I(inode));
                print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
                                EXT4_I(inode), sizeof(struct ext4_inode_info),
                                true);
                dump_stack();
        }

        if (EXT4_I(inode)->i_reserved_data_blocks)
                ext4_msg(inode->i_sb, KERN_ERR,
                         "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
                         inode->i_ino, EXT4_I(inode),
                         EXT4_I(inode)->i_reserved_data_blocks);
}

/*
 * ->shutdown callback: force the filesystem down via ext4_force_shutdown()
 * with the EXT4_GOING_FLAGS_NOLOGFLUSH mode.
 */
static void ext4_shutdown(struct super_block *sb)
{
       ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH);
}

/*
 * Slab constructor for ext4_inode_cachep (passed to
 * kmem_cache_create_usercopy() in init_inodecache()).  @foo points at the
 * ext4_inode_info being constructed.  Sets up the orphan list head, the two
 * rwsems, the embedded VFS inode and the fast-commit state.
 */
static void init_once(void *foo)
{
        struct ext4_inode_info *ei = foo;
        struct inode *vfs_inode = &ei->vfs_inode;

        INIT_LIST_HEAD(&ei->i_orphan);
        init_rwsem(&ei->xattr_sem);
        init_rwsem(&ei->i_data_sem);
        inode_init_once(vfs_inode);
        ext4_fc_init_inode(vfs_inode);
}

/*
 * Create the ext4_inode_info slab cache.  Only the i_data field is
 * registered as the usercopy region; init_once() is the constructor.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
static int __init init_inodecache(void)
{
        ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
                                sizeof(struct ext4_inode_info), 0,
                                SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
                                offsetof(struct ext4_inode_info, i_data),
                                sizeof_field(struct ext4_inode_info, i_data),
                                init_once);

        return ext4_inode_cachep ? 0 : -ENOMEM;
}

/* Destroy ext4_inode_cachep, the counterpart of init_inodecache(). */
static void destroy_inodecache(void)
{
        /*
         * Make sure all delayed rcu free inodes are flushed before we
         * destroy cache.
         */
        rcu_barrier();
        kmem_cache_destroy(ext4_inode_cachep);
}

void ext4_clear_inode(struct inode *inode)
{
        ext4_fc_del(inode);
        invalidate_inode_buffers(inode);
        clear_inode(inode);
        ext4_discard_preallocations(inode);
        ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
        dquot_drop(inode);
        if (EXT4_I(inode)->jinode) {
                jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
                                               EXT4_I(inode)->jinode);
                jbd2_free_inode(EXT4_I(inode)->jinode);
                EXT4_I(inode)->jinode = NULL;
        }
        fscrypt_put_encryption_info(inode);
        fsverity_cleanup_inode(inode);
}

/*
 * Inode lookup helper handed to generic_fh_to_dentry()/generic_fh_to_parent().
 *
 * Looks up inode @ino with EXT4_IGET_HANDLE.  Returns the inode, the
 * ERR_PTR from ext4_iget() on lookup failure, or ERR_PTR(-ESTALE) when a
 * non-zero @generation does not match the inode's generation.
 */
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
                                        u64 ino, u32 generation)
{
        struct inode *inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);

        if (IS_ERR(inode))
                return ERR_CAST(inode);

        /*
         * Currently we don't know the generation for parent directory, so
         * a generation of 0 means "accept any"
         */
        if (!generation || inode->i_generation == generation)
                return inode;

        /* Generation mismatch: drop our reference and report a stale handle. */
        iput(inode);
        return ERR_PTR(-ESTALE);
}

/*
 * export_operations.fh_to_dentry: delegate to generic_fh_to_dentry(), with
 * ext4_nfs_get_inode() as the inode lookup callback.
 */
static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
                                        int fh_len, int fh_type)
{
        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
                                    ext4_nfs_get_inode);
}

/*
 * export_operations.fh_to_parent: delegate to generic_fh_to_parent(), with
 * ext4_nfs_get_inode() as the inode lookup callback.
 */
static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
                                        int fh_len, int fh_type)
{
        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
                                    ext4_nfs_get_inode);
}

/*
 * export_operations.commit_metadata: write @inode out through
 * ext4_write_inode() using a WB_SYNC_ALL writeback control, and return
 * that call's result.
 */
static int ext4_nfs_commit_metadata(struct inode *inode)
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL
        };
        int ret;

        trace_ext4_nfs_commit_metadata(inode);
        ret = ext4_write_inode(inode, &wbc);
        return ret;
}

#ifdef CONFIG_QUOTA
/* Quota type names, initialised from INITQFNAMES and indexed by quota type. */
static const char * const quotatypes[] = INITQFNAMES;
/* Name of quota type @t; no bounds check is performed. */
#define QTYPE2NAME(t) (quotatypes[t])

static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
                         const struct path *path);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
                                const char *data, size_t len, loff_t off);
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
                             unsigned int flags);

/* ->get_dquots callback: return the i_dquot array of the ext4 inode. */
static struct dquot __rcu **ext4_get_dquots(struct inode *inode)
{
        return EXT4_I(inode)->i_dquot;
}

/*
 * dquot operations for ext4: a mix of ext4-specific handlers (ext4_*) and
 * generic dquot_* helpers.
 */
static const struct dquot_operations ext4_quota_operations = {
        .get_reserved_space        = ext4_get_reserved_space,
        .write_dquot                = ext4_write_dquot,
        .acquire_dquot                = ext4_acquire_dquot,
        .release_dquot                = ext4_release_dquot,
        .mark_dirty                = ext4_mark_dquot_dirty,
        .write_info                = ext4_write_info,
        .alloc_dquot                = dquot_alloc,
        .destroy_dquot                = dquot_destroy,
        .get_projid                = ext4_get_projid,
        .get_inode_usage        = ext4_get_inode_usage,
        .get_next_id                = dquot_get_next_id,
};

/*
 * quotactl operations for ext4: only quota_on/quota_off are ext4-specific,
 * the remaining entries use the generic dquot_* implementations.
 */
static const struct quotactl_ops ext4_qctl_operations = {
        .quota_on        = ext4_quota_on,
        .quota_off        = ext4_quota_off,
        .quota_sync        = dquot_quota_sync,
        .get_state        = dquot_get_state,
        .set_info        = dquot_set_dqinfo,
        .get_dqblk        = dquot_get_dqblk,
        .set_dqblk        = dquot_set_dqblk,
        .get_nextdqblk        = dquot_get_next_dqblk,
};
#endif

/*
 * Superblock operations for ext4.  The quota_read/quota_write/get_dquots
 * entries are only present when CONFIG_QUOTA is enabled.
 */
static const struct super_operations ext4_sops = {
        .alloc_inode        = ext4_alloc_inode,
        .free_inode        = ext4_free_in_core_inode,
        .destroy_inode        = ext4_destroy_inode,
        .write_inode        = ext4_write_inode,
        .dirty_inode        = ext4_dirty_inode,
        .drop_inode        = ext4_drop_inode,
        .evict_inode        = ext4_evict_inode,
        .put_super        = ext4_put_super,
        .sync_fs        = ext4_sync_fs,
        .freeze_fs        = ext4_freeze,
        .unfreeze_fs        = ext4_unfreeze,
        .statfs                = ext4_statfs,
        .show_options        = ext4_show_options,
        .shutdown        = ext4_shutdown,
#ifdef CONFIG_QUOTA
        .quota_read        = ext4_quota_read,
        .quota_write        = ext4_quota_write,
        .get_dquots        = ext4_get_dquots,
#endif
};

/*
 * File-handle export operations for ext4.  Handle encoding uses the generic
 * 32-bit inode number encoder; decoding goes through the ext4_fh_to_*()
 * wrappers.
 */
static const struct export_operations ext4_export_ops = {
        .encode_fh = generic_encode_ino32_fh,
        .fh_to_dentry = ext4_fh_to_dentry,
        .fh_to_parent = ext4_fh_to_parent,
        .get_parent = ext4_get_parent,
        .commit_metadata = ext4_nfs_commit_metadata,
};

/*
 * Mount option tokens.  These are the values that ext4_param_specs[]
 * associates with option names and that ext4_mount_opts[] keys on.
 * The two fc_debug tokens exist only with CONFIG_EXT4_DEBUG.
 */
enum {
        Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
        Opt_resgid, Opt_resuid, Opt_sb,
        Opt_nouid32, Opt_debug, Opt_removed,
        Opt_user_xattr, Opt_acl,
        Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
        Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
        Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
        Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
        Opt_inlinecrypt,
        Opt_usrjquota, Opt_grpjquota, Opt_quota,
        Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
        Opt_usrquota, Opt_grpquota, Opt_prjquota,
        Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
        Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
        Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize,
        Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
        Opt_inode_readahead_blks, Opt_journal_ioprio,
        Opt_dioread_nolock, Opt_dioread_lock,
        Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
        Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
        Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
        Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
#ifdef CONFIG_EXT4_DEBUG
        Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
};

/* Accepted values of the "errors=" option, mapped to EXT4_MOUNT_ERRORS_* bits. */
static const struct constant_table ext4_param_errors[] = {
        {"continue",        EXT4_MOUNT_ERRORS_CONT},
        {"panic",        EXT4_MOUNT_ERRORS_PANIC},
        {"remount-ro",        EXT4_MOUNT_ERRORS_RO},
        {}
};

/* Accepted values of the "data=" option, mapped to EXT4_MOUNT_*_DATA modes. */
static const struct constant_table ext4_param_data[] = {
        {"journal",        EXT4_MOUNT_JOURNAL_DATA},
        {"ordered",        EXT4_MOUNT_ORDERED_DATA},
        {"writeback",        EXT4_MOUNT_WRITEBACK_DATA},
        {}
};

/* Accepted values of the "data_err=" option, mapped to option tokens. */
static const struct constant_table ext4_param_data_err[] = {
        {"abort",        Opt_data_err_abort},
        {"ignore",        Opt_data_err_ignore},
        {}
};

/* Accepted values of the "jqfmt=" option, mapped to QFMT_VFS_* format ids. */
static const struct constant_table ext4_param_jqfmt[] = {
        {"vfsold",        QFMT_VFS_OLD},
        {"vfsv0",        QFMT_VFS_V0},
        {"vfsv1",        QFMT_VFS_V1},
        {}
};

/* Accepted values of the "dax=" option, mapped to Opt_dax_* tokens. */
static const struct constant_table ext4_param_dax[] = {
        {"always",        Opt_dax_always},
        {"inode",        Opt_dax_inode},
        {"never",        Opt_dax_never},
        {}
};

/*
 * Mount option specification
 * We don't use fsparam_flag_no because of the way we set the
 * options and the way we show them in _ext4_show_options(). To
 * keep the changes to a minimum, let's keep the negative options
 * separate for now.
 *
 * Notes on the table itself:
 *  - Several option names appear twice with different parameter types
 *    (e.g. "barrier", "dax", "auto_da_alloc", "init_itable",
 *    "test_dummy_encryption"), so both the bare and the valued spelling
 *    are listed.
 *  - Names mapped to Opt_removed are still listed here but share that
 *    single token.
 *  - The table ends with an empty {} entry.
 */
static const struct fs_parameter_spec ext4_param_specs[] = {
        fsparam_flag        ("bsddf",                Opt_bsd_df),
        fsparam_flag        ("minixdf",                Opt_minix_df),
        fsparam_flag        ("grpid",                Opt_grpid),
        fsparam_flag        ("bsdgroups",                Opt_grpid),
        fsparam_flag        ("nogrpid",                Opt_nogrpid),
        fsparam_flag        ("sysvgroups",                Opt_nogrpid),
        fsparam_u32        ("resgid",                Opt_resgid),
        fsparam_u32        ("resuid",                Opt_resuid),
        fsparam_u32        ("sb",                        Opt_sb),
        fsparam_enum        ("errors",                Opt_errors, ext4_param_errors),
        fsparam_flag        ("nouid32",                Opt_nouid32),
        fsparam_flag        ("debug",                Opt_debug),
        fsparam_flag        ("oldalloc",                Opt_removed),
        fsparam_flag        ("orlov",                Opt_removed),
        fsparam_flag        ("user_xattr",                Opt_user_xattr),
        fsparam_flag        ("acl",                        Opt_acl),
        fsparam_flag        ("norecovery",                Opt_noload),
        fsparam_flag        ("noload",                Opt_noload),
        fsparam_flag        ("bh",                        Opt_removed),
        fsparam_flag        ("nobh",                Opt_removed),
        fsparam_u32        ("commit",                Opt_commit),
        fsparam_u32        ("min_batch_time",        Opt_min_batch_time),
        fsparam_u32        ("max_batch_time",        Opt_max_batch_time),
        fsparam_u32        ("journal_dev",                Opt_journal_dev),
        fsparam_bdev        ("journal_path",        Opt_journal_path),
        fsparam_flag        ("journal_checksum",        Opt_journal_checksum),
        fsparam_flag        ("nojournal_checksum",        Opt_nojournal_checksum),
        fsparam_flag        ("journal_async_commit",Opt_journal_async_commit),
        fsparam_flag        ("abort",                Opt_abort),
        fsparam_enum        ("data",                Opt_data, ext4_param_data),
        fsparam_enum        ("data_err",                Opt_data_err,
                                                ext4_param_data_err),
        fsparam_string_empty
                        ("usrjquota",                Opt_usrjquota),
        fsparam_string_empty
                        ("grpjquota",                Opt_grpjquota),
        fsparam_enum        ("jqfmt",                Opt_jqfmt, ext4_param_jqfmt),
        fsparam_flag        ("grpquota",                Opt_grpquota),
        fsparam_flag        ("quota",                Opt_quota),
        fsparam_flag        ("noquota",                Opt_noquota),
        fsparam_flag        ("usrquota",                Opt_usrquota),
        fsparam_flag        ("prjquota",                Opt_prjquota),
        fsparam_flag        ("barrier",                Opt_barrier),
        fsparam_u32        ("barrier",                Opt_barrier),
        fsparam_flag        ("nobarrier",                Opt_nobarrier),
        fsparam_flag        ("i_version",                Opt_removed),
        fsparam_flag        ("dax",                        Opt_dax),
        fsparam_enum        ("dax",                        Opt_dax_type, ext4_param_dax),
        fsparam_u32        ("stripe",                Opt_stripe),
        fsparam_flag        ("delalloc",                Opt_delalloc),
        fsparam_flag        ("nodelalloc",                Opt_nodelalloc),
        fsparam_flag        ("warn_on_error",        Opt_warn_on_error),
        fsparam_flag        ("nowarn_on_error",        Opt_nowarn_on_error),
        fsparam_u32        ("debug_want_extra_isize",
                                                Opt_debug_want_extra_isize),
        fsparam_flag        ("mblk_io_submit",        Opt_removed),
        fsparam_flag        ("nomblk_io_submit",        Opt_removed),
        fsparam_flag        ("block_validity",        Opt_block_validity),
        fsparam_flag        ("noblock_validity",        Opt_noblock_validity),
        fsparam_u32        ("inode_readahead_blks",
                                                Opt_inode_readahead_blks),
        fsparam_u32        ("journal_ioprio",        Opt_journal_ioprio),
        fsparam_u32        ("auto_da_alloc",        Opt_auto_da_alloc),
        fsparam_flag        ("auto_da_alloc",        Opt_auto_da_alloc),
        fsparam_flag        ("noauto_da_alloc",        Opt_noauto_da_alloc),
        fsparam_flag        ("dioread_nolock",        Opt_dioread_nolock),
        fsparam_flag        ("nodioread_nolock",        Opt_dioread_lock),
        fsparam_flag        ("dioread_lock",        Opt_dioread_lock),
        fsparam_flag        ("discard",                Opt_discard),
        fsparam_flag        ("nodiscard",                Opt_nodiscard),
        fsparam_u32        ("init_itable",                Opt_init_itable),
        fsparam_flag        ("init_itable",                Opt_init_itable),
        fsparam_flag        ("noinit_itable",        Opt_noinit_itable),
#ifdef CONFIG_EXT4_DEBUG
        fsparam_flag        ("fc_debug_force",        Opt_fc_debug_force),
        fsparam_u32        ("fc_debug_max_replay",        Opt_fc_debug_max_replay),
#endif
        fsparam_u32        ("max_dir_size_kb",        Opt_max_dir_size_kb),
        fsparam_flag        ("test_dummy_encryption",
                                                Opt_test_dummy_encryption),
        fsparam_string        ("test_dummy_encryption",
                                                Opt_test_dummy_encryption),
        fsparam_flag        ("inlinecrypt",                Opt_inlinecrypt),
        fsparam_flag        ("nombcache",                Opt_nombcache),
        fsparam_flag        ("no_mbcache",                Opt_nombcache),        /* for backward compatibility */
        fsparam_flag        ("prefetch_block_bitmaps",
                                                Opt_removed),
        fsparam_flag        ("no_prefetch_block_bitmaps",
                                                Opt_no_prefetch_block_bitmaps),
        fsparam_s32        ("mb_optimize_scan",        Opt_mb_optimize_scan),
        fsparam_string        ("check",                Opt_removed),        /* mount option from ext2/3 */
        fsparam_flag        ("nocheck",                Opt_removed),        /* mount option from ext2/3 */
        fsparam_flag        ("reservation",                Opt_removed),        /* mount option from ext2/3 */
        fsparam_flag        ("noreservation",        Opt_removed),        /* mount option from ext2/3 */
        fsparam_u32        ("journal",                Opt_removed),        /* mount option from ext2/3 */
        {}
};

/* Default journal I/O priority: best-effort class, level 3. */
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))

/*
 * Flag bits used in the 'flags' member of the ext4_mount_opts[] entries.
 *
 * MOPT_Q and MOPT_QFMT depend on CONFIG_QUOTA: with quota support MOPT_Q is
 * 0 and MOPT_QFMT has its own bit; without it both collapse to
 * MOPT_NOSUPPORT.  MOPT_EXT4_ONLY is the combination of MOPT_NO_EXT2 and
 * MOPT_NO_EXT3.
 * NOTE(review): MOPT_2 presumably selects the second mount-option word
 * (entries using it carry EXT4_MOUNT2_* values) - confirm at the consumer.
 */
#define MOPT_SET        0x0001
#define MOPT_CLEAR        0x0002
#define MOPT_NOSUPPORT        0x0004
#define MOPT_EXPLICIT        0x0008
#ifdef CONFIG_QUOTA
#define MOPT_Q                0
#define MOPT_QFMT        0x0010
#else
#define MOPT_Q                MOPT_NOSUPPORT
#define MOPT_QFMT        MOPT_NOSUPPORT
#endif
#define MOPT_NO_EXT2        0x0020
#define MOPT_NO_EXT3        0x0040
#define MOPT_EXT4_ONLY        (MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_SKIP        0x0080
#define        MOPT_2                0x0100

/*
 * Per-token mount option table.
 *
 * Each entry pairs an option token with the mount-option bits it affects
 * (0 when the option carries no flag bits) and a set of MOPT_* flags.
 * The table is terminated by the Opt_err entry.
 */
static const struct mount_opts {
        int        token;                /* Opt_* token */
        int        mount_opt;        /* EXT4_MOUNT_* / EXT4_MOUNT2_* bits, or 0 */
        int        flags;                /* MOPT_* flags */
} ext4_mount_opts[] = {
        {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
        {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
        {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
        {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
        {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
        {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
        {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
         MOPT_EXT4_ONLY | MOPT_SET},
        {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
         MOPT_EXT4_ONLY | MOPT_CLEAR},
        {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
        {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
        {Opt_delalloc, EXT4_MOUNT_DELALLOC,
         MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
         MOPT_EXT4_ONLY | MOPT_CLEAR},
        {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
        {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
        {Opt_commit, 0, MOPT_NO_EXT2},
        {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
         MOPT_EXT4_ONLY | MOPT_CLEAR},
        {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
         MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
                                    EXT4_MOUNT_JOURNAL_CHECKSUM),
         MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
        {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
        {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2},
        {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
        {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
        {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
        {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
        {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
        {Opt_dax_type, 0, MOPT_EXT4_ONLY},
        {Opt_journal_dev, 0, MOPT_NO_EXT2},
        {Opt_journal_path, 0, MOPT_NO_EXT2},
        {Opt_journal_ioprio, 0, MOPT_NO_EXT2},
        {Opt_data, 0, MOPT_NO_EXT2},
        {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
#ifdef CONFIG_EXT4_FS_POSIX_ACL
        {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
#else
        {Opt_acl, 0, MOPT_NOSUPPORT},
#endif
        {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
        {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
        {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
        {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
                                                        MOPT_SET | MOPT_Q},
        {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
                                                        MOPT_SET | MOPT_Q},
        {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
                                                        MOPT_SET | MOPT_Q},
        {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
                       EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
                                                        MOPT_CLEAR | MOPT_Q},
        {Opt_usrjquota, 0, MOPT_Q},
        {Opt_grpjquota, 0, MOPT_Q},
        {Opt_jqfmt, 0, MOPT_QFMT},
        {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
        {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
         MOPT_SET},
#ifdef CONFIG_EXT4_DEBUG
        {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
         MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
#endif
        {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2},
        {Opt_err, 0, 0}
};

#if IS_ENABLED(CONFIG_UNICODE)
/*
 * Known superblock encodings: each entry ties an encoding magic (compared
 * against es->s_encoding by ext4_sb_read_encoding()) to a name and a
 * unicode version.  Only UTF-8 12.1.0 is listed.
 */
static const struct ext4_sb_encodings {
        __u16 magic;                /* value matched against the superblock field */
        char *name;                /* encoding name */
        unsigned int version;        /* UNICODE_AGE() packed version */
} ext4_sb_encoding_map[] = {
        {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)},
};

static const struct ext4_sb_encodings *
ext4_sb_read_encoding(const struct ext4_super_block *es)
{
        __u16 magic = le16_to_cpu(es->s_encoding);
        int i;

        for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
                if (magic == ext4_sb_encoding_map[i].magic)
                        return &ext4_sb_encoding_map[i];

        return NULL;
}
#endif

/*
 * EXT4_SPEC_* bit flags, one bit per mount parameter.
 * NOTE(review): presumably these record which parameters were explicitly
 * specified in an ext4_fs_context - confirm at the users.
 * Bit 6 is not assigned in this list.
 */
#define EXT4_SPEC_JQUOTA                        (1 <<  0)
#define EXT4_SPEC_JQFMT                                (1 <<  1)
#define EXT4_SPEC_DATAJ                                (1 <<  2)
#define EXT4_SPEC_SB_BLOCK                        (1 <<  3)
#define EXT4_SPEC_JOURNAL_DEV                        (1 <<  4)
#define EXT4_SPEC_JOURNAL_IOPRIO                (1 <<  5)
#define EXT4_SPEC_s_want_extra_isize                (1 <<  7)
#define EXT4_SPEC_s_max_batch_time                (1 <<  8)
#define EXT4_SPEC_s_min_batch_time                (1 <<  9)
#define EXT4_SPEC_s_inode_readahead_blks        (1 << 10)
#define EXT4_SPEC_s_li_wait_mult                (1 << 11)
#define EXT4_SPEC_s_max_dir_size_kb                (1 << 12)
#define EXT4_SPEC_s_stripe                        (1 << 13)
#define EXT4_SPEC_s_resuid                        (1 << 14)
#define EXT4_SPEC_s_resgid                        (1 << 15)
#define EXT4_SPEC_s_commit_interval                (1 << 16)
#define EXT4_SPEC_s_fc_debug_max_replay                (1 << 17)
#define EXT4_SPEC_s_sb_block                        (1 << 18)
#define EXT4_SPEC_mb_optimize_scan                (1 << 19)

/*
 * ext4 option-parsing state, hung off fs_context.fs_private.
 *
 * Option values are collected here while parameters are parsed. The "spec"
 * and "qname_spec" bitmaps record which values were actually supplied. Each
 * vals_s_* / mask_s_* pair holds, respectively, the requested bit values and
 * the set of bits that were touched by the parsed options.
 */
struct ext4_fs_context {
        char                *s_qf_names[EXT4_MAXQUOTAS];        /* Owned copies, kfree()d on release */
        struct fscrypt_dummy_policy dummy_enc_policy;
        int                s_jquota_fmt;        /* Format of quota to use */
#ifdef CONFIG_EXT4_DEBUG
        int s_fc_debug_max_replay;
#endif
        unsigned short        qname_spec;        /* Bit per quota type whose name was set or cleared */
        unsigned long        vals_s_flags;        /* Bits to set in s_flags */
        unsigned long        mask_s_flags;        /* Bits changed in s_flags */
        unsigned long        journal_devnum;        /* From journal_dev= or journal_path= */
        unsigned long        s_commit_interval;        /* commit= value multiplied by HZ */
        unsigned long        s_stripe;
        unsigned int        s_inode_readahead_blks;
        unsigned int        s_want_extra_isize;
        unsigned int        s_li_wait_mult;
        unsigned int        s_max_dir_size_kb;
        unsigned int        journal_ioprio;
        unsigned int        vals_s_mount_opt;
        unsigned int        mask_s_mount_opt;
        unsigned int        vals_s_mount_opt2;
        unsigned int        mask_s_mount_opt2;
        unsigned int        opt_flags;        /* MOPT flags */
        unsigned int        spec;                /* EXT4_SPEC_* bits */
        u32                s_max_batch_time;
        u32                s_min_batch_time;
        kuid_t                s_resuid;
        kgid_t                s_resgid;
        ext4_fsblk_t        s_sb_block;
};

static void ext4_fc_free(struct fs_context *fc)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        int i;

        if (!ctx)
                return;

        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(ctx->s_qf_names[i]);

        fscrypt_free_dummy_policy(&ctx->dummy_enc_policy);
        kfree(ctx);
}

int ext4_init_fs_context(struct fs_context *fc)
{
        struct ext4_fs_context *ctx;

        ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        fc->fs_private = ctx;
        fc->ops = &ext4_context_ops;

        return 0;
}

#ifdef CONFIG_QUOTA
/*
 * Record the journaled quota file name carried by @param for quota type
 * @qtype.
 *
 * The name must be non-empty and must not contain a '/'. Repeating the name
 * already stored for this type is accepted; giving a different one is an
 * error. On success a private copy of the name is kept in the context and
 * the type is marked as specified.
 *
 * Returns 0 on success, -EINVAL for a rejected name, or -ENOMEM if the copy
 * cannot be allocated.
 */
static int note_qf_name(struct fs_context *fc, int qtype,
                       struct fs_parameter *param)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        const char *existing;
        char *copy;

        if (param->size < 1) {
                ext4_msg(NULL, KERN_ERR, "Missing quota name");
                return -EINVAL;
        }
        if (strchr(param->string, '/') != NULL) {
                ext4_msg(NULL, KERN_ERR,
                         "quotafile must be on filesystem root");
                return -EINVAL;
        }

        existing = ctx->s_qf_names[qtype];
        if (existing != NULL) {
                /* Same name given twice: nothing more to do. */
                if (strcmp(existing, param->string) == 0)
                        return 0;

                ext4_msg(NULL, KERN_ERR,
                         "%s quota file already specified",
                         QTYPE2NAME(qtype));
                return -EINVAL;
        }

        copy = kmemdup_nul(param->string, param->size, GFP_KERNEL);
        if (copy == NULL) {
                ext4_msg(NULL, KERN_ERR,
                         "Not enough memory for storing quotafile name");
                return -ENOMEM;
        }

        ctx->s_qf_names[qtype] = copy;
        ctx->spec |= EXT4_SPEC_JQUOTA;
        ctx->qname_spec |= 1 << qtype;
        return 0;
}

/*
 * Drop any quota file name stored in the context for quota type @qtype and
 * mark the type as specified, so that the now-NULL name is what gets applied
 * for this type. Always returns 0.
 */
static int unnote_qf_name(struct fs_context *fc, int qtype)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        char **slot = &ctx->s_qf_names[qtype];

        kfree(*slot);
        *slot = NULL;

        ctx->spec |= EXT4_SPEC_JQUOTA;
        ctx->qname_spec |= 1 << qtype;
        return 0;
}
#endif

/*
 * Parse a test_dummy_encryption parameter into ctx->dummy_enc_policy.
 *
 * The option is rejected outright when CONFIG_FS_ENCRYPTION is disabled.
 * Otherwise the result of fscrypt_parse_test_dummy_encryption() is passed
 * on, except that -EEXIST is reported to the caller as -EINVAL. A warning is
 * logged for both -EINVAL and -EEXIST.
 */
static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
                                            struct ext4_fs_context *ctx)
{
        int err;

        if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
                ext4_msg(NULL, KERN_WARNING,
                         "test_dummy_encryption option not supported");
                return -EINVAL;
        }

        err = fscrypt_parse_test_dummy_encryption(param,
                                                  &ctx->dummy_enc_policy);
        switch (err) {
        case -EINVAL:
                ext4_msg(NULL, KERN_WARNING,
                         "Value of option \"%s\" is unrecognized", param->key);
                break;
        case -EEXIST:
                ext4_msg(NULL, KERN_WARNING,
                         "Conflicting test_dummy_encryption options");
                err = -EINVAL;
                break;
        default:
                break;
        }

        return err;
}

/*
 * Generators for small accessors over the vals_s_<name> / mask_s_<name>
 * pairs of struct ext4_fs_context.
 *
 * EXT4_SET_CTX(name) defines ctx_set_<name>(ctx, flag): mark @flag as
 * changed in the mask word and set it in the values word.
 */
#define EXT4_SET_CTX(name)                                                \
static inline void ctx_set_##name(struct ext4_fs_context *ctx,                \
                                  unsigned long flag)                        \
{                                                                        \
        ctx->mask_s_##name |= flag;                                        \
        ctx->vals_s_##name |= flag;                                        \
}

/*
 * EXT4_CLEAR_CTX(name) defines ctx_clear_<name>(ctx, flag): mark @flag as
 * changed in the mask word and clear it in the values word.
 */
#define EXT4_CLEAR_CTX(name)                                                \
static inline void ctx_clear_##name(struct ext4_fs_context *ctx,        \
                                    unsigned long flag)                        \
{                                                                        \
        ctx->mask_s_##name |= flag;                                        \
        ctx->vals_s_##name &= ~flag;                                        \
}

/*
 * EXT4_TEST_CTX(name) defines ctx_test_<name>(ctx, flag): return the bits of
 * @flag that are currently set in the values word (non-zero if any is set).
 * The mask word is not consulted.
 */
#define EXT4_TEST_CTX(name)                                                \
static inline unsigned long                                                \
ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag)        \
{                                                                        \
        return (ctx->vals_s_##name & flag);                                \
}

/* Instantiate the accessors for s_flags, s_mount_opt and s_mount_opt2. */
EXT4_SET_CTX(flags); /* set only */
EXT4_SET_CTX(mount_opt);
EXT4_CLEAR_CTX(mount_opt);
EXT4_TEST_CTX(mount_opt);
EXT4_SET_CTX(mount_opt2);
EXT4_CLEAR_CTX(mount_opt2);
EXT4_TEST_CTX(mount_opt2);

static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        struct fs_parse_result result;
        const struct mount_opts *m;
        int is_remount;
        kuid_t uid;
        kgid_t gid;
        int token;

        token = fs_parse(fc, ext4_param_specs, param, &result);
        if (token < 0)
                return token;
        is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;

        for (m = ext4_mount_opts; m->token != Opt_err; m++)
                if (token == m->token)
                        break;

        ctx->opt_flags |= m->flags;

        if (m->flags & MOPT_EXPLICIT) {
                if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
                        ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC);
                } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
                        ctx_set_mount_opt2(ctx,
                                       EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM);
                } else
                        return -EINVAL;
        }

        if (m->flags & MOPT_NOSUPPORT) {
                ext4_msg(NULL, KERN_ERR, "%s option not supported",
                         param->key);
                return 0;
        }

        switch (token) {
#ifdef CONFIG_QUOTA
        case Opt_usrjquota:
                if (!*param->string)
                        return unnote_qf_name(fc, USRQUOTA);
                else
                        return note_qf_name(fc, USRQUOTA, param);
        case Opt_grpjquota:
                if (!*param->string)
                        return unnote_qf_name(fc, GRPQUOTA);
                else
                        return note_qf_name(fc, GRPQUOTA, param);
#endif
        case Opt_sb:
                if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
                        ext4_msg(NULL, KERN_WARNING,
                                 "Ignoring %s option on remount", param->key);
                } else {
                        ctx->s_sb_block = result.uint_32;
                        ctx->spec |= EXT4_SPEC_s_sb_block;
                }
                return 0;
        case Opt_removed:
                ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option",
                         param->key);
                return 0;
        case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
                ctx_set_flags(ctx, SB_INLINECRYPT);
#else
                ext4_msg(NULL, KERN_ERR, "inline encryption not supported");
#endif
                return 0;
        case Opt_errors:
                ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK);
                ctx_set_mount_opt(ctx, result.uint_32);
                return 0;
#ifdef CONFIG_QUOTA
        case Opt_jqfmt:
                ctx->s_jquota_fmt = result.uint_32;
                ctx->spec |= EXT4_SPEC_JQFMT;
                return 0;
#endif
        case Opt_data:
                ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
                ctx_set_mount_opt(ctx, result.uint_32);
                ctx->spec |= EXT4_SPEC_DATAJ;
                return 0;
        case Opt_commit:
                if (result.uint_32 == 0)
                        result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE;
                else if (result.uint_32 > INT_MAX / HZ) {
                        ext4_msg(NULL, KERN_ERR,
                                 "Invalid commit interval %d, "
                                 "must be smaller than %d",
                                 result.uint_32, INT_MAX / HZ);
                        return -EINVAL;
                }
                ctx->s_commit_interval = HZ * result.uint_32;
                ctx->spec |= EXT4_SPEC_s_commit_interval;
                return 0;
        case Opt_debug_want_extra_isize:
                if ((result.uint_32 & 1) || (result.uint_32 < 4)) {
                        ext4_msg(NULL, KERN_ERR,
                                 "Invalid want_extra_isize %d", result.uint_32);
                        return -EINVAL;
                }
                ctx->s_want_extra_isize = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_want_extra_isize;
                return 0;
        case Opt_max_batch_time:
                ctx->s_max_batch_time = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_max_batch_time;
                return 0;
        case Opt_min_batch_time:
                ctx->s_min_batch_time = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_min_batch_time;
                return 0;
        case Opt_inode_readahead_blks:
                if (result.uint_32 &&
                    (result.uint_32 > (1 << 30) ||
                     !is_power_of_2(result.uint_32))) {
                        ext4_msg(NULL, KERN_ERR,
                                 "EXT4-fs: inode_readahead_blks must be "
                                 "0 or a power of 2 smaller than 2^31");
                        return -EINVAL;
                }
                ctx->s_inode_readahead_blks = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_inode_readahead_blks;
                return 0;
        case Opt_init_itable:
                ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE);
                ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
                if (param->type == fs_value_is_string)
                        ctx->s_li_wait_mult = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_li_wait_mult;
                return 0;
        case Opt_max_dir_size_kb:
                ctx->s_max_dir_size_kb = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
                return 0;
#ifdef CONFIG_EXT4_DEBUG
        case Opt_fc_debug_max_replay:
                ctx->s_fc_debug_max_replay = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay;
                return 0;
#endif
        case Opt_stripe:
                ctx->s_stripe = result.uint_32;
                ctx->spec |= EXT4_SPEC_s_stripe;
                return 0;
        case Opt_resuid:
                uid = make_kuid(current_user_ns(), result.uint_32);
                if (!uid_valid(uid)) {
                        ext4_msg(NULL, KERN_ERR, "Invalid uid value %d",
                                 result.uint_32);
                        return -EINVAL;
                }
                ctx->s_resuid = uid;
                ctx->spec |= EXT4_SPEC_s_resuid;
                return 0;
        case Opt_resgid:
                gid = make_kgid(current_user_ns(), result.uint_32);
                if (!gid_valid(gid)) {
                        ext4_msg(NULL, KERN_ERR, "Invalid gid value %d",
                                 result.uint_32);
                        return -EINVAL;
                }
                ctx->s_resgid = gid;
                ctx->spec |= EXT4_SPEC_s_resgid;
                return 0;
        case Opt_journal_dev:
                if (is_remount) {
                        ext4_msg(NULL, KERN_ERR,
                                 "Cannot specify journal on remount");
                        return -EINVAL;
                }
                ctx->journal_devnum = result.uint_32;
                ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
                return 0;
        case Opt_journal_path:
        {
                struct inode *journal_inode;
                struct path path;
                int error;

                if (is_remount) {
                        ext4_msg(NULL, KERN_ERR,
                                 "Cannot specify journal on remount");
                        return -EINVAL;
                }

                error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path);
                if (error) {
                        ext4_msg(NULL, KERN_ERR, "error: could not find "
                                 "journal device path");
                        return -EINVAL;
                }

                journal_inode = d_inode(path.dentry);
                ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev);
                ctx->spec |= EXT4_SPEC_JOURNAL_DEV;
                path_put(&path);
                return 0;
        }
        case Opt_journal_ioprio:
                if (result.uint_32 > 7) {
                        ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority"
                                 " (must be 0-7)");
                        return -EINVAL;
                }
                ctx->journal_ioprio =
                        IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32);
                ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;
                return 0;
        case Opt_test_dummy_encryption:
                return ext4_parse_test_dummy_encryption(param, ctx);
        case Opt_dax:
        case Opt_dax_type:
#ifdef CONFIG_FS_DAX
        {
                int type = (token == Opt_dax) ?
                           Opt_dax : result.uint_32;

                switch (type) {
                case Opt_dax:
                case Opt_dax_always:
                        ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
                        ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
                        break;
                case Opt_dax_never:
                        ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
                        ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
                        break;
                case Opt_dax_inode:
                        ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS);
                        ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER);
                        /* Strictly for printing options */
                        ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE);
                        break;
                }
                return 0;
        }
#else
                ext4_msg(NULL, KERN_INFO, "dax option not supported");
                return -EINVAL;
#endif
        case Opt_data_err:
                if (result.uint_32 == Opt_data_err_abort)
                        ctx_set_mount_opt(ctx, m->mount_opt);
                else if (result.uint_32 == Opt_data_err_ignore)
                        ctx_clear_mount_opt(ctx, m->mount_opt);
                return 0;
        case Opt_mb_optimize_scan:
                if (result.int_32 == 1) {
                        ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
                        ctx->spec |= EXT4_SPEC_mb_optimize_scan;
                } else if (result.int_32 == 0) {
                        ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
                        ctx->spec |= EXT4_SPEC_mb_optimize_scan;
                } else {
                        ext4_msg(NULL, KERN_WARNING,
                                 "mb_optimize_scan should be set to 0 or 1.");
                        return -EINVAL;
                }
                return 0;
        }

        /*
         * At this point we should only be getting options requiring MOPT_SET,
         * or MOPT_CLEAR. Anything else is a bug
         */
        if (m->token == Opt_err) {
                ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s",
                         param->key);
                WARN_ON(1);
                return -EINVAL;
        }

        else {
                unsigned int set = 0;

                if ((param->type == fs_value_is_flag) ||
                    result.uint_32 > 0)
                        set = 1;

                if (m->flags & MOPT_CLEAR)
                        set = !set;
                else if (unlikely(!(m->flags & MOPT_SET))) {
                        ext4_msg(NULL, KERN_WARNING,
                                 "buggy handling of option %s",
                                 param->key);
                        WARN_ON(1);
                        return -EINVAL;
                }
                if (m->flags & MOPT_2) {
                        if (set != 0)
                                ctx_set_mount_opt2(ctx, m->mount_opt);
                        else
                                ctx_clear_mount_opt2(ctx, m->mount_opt);
                } else {
                        if (set != 0)
                                ctx_set_mount_opt(ctx, m->mount_opt);
                        else
                                ctx_clear_mount_opt(ctx, m->mount_opt);
                }
        }

        return 0;
}

/*
 * Parse a comma-separated option string into the context attached to @fc.
 *
 * @options is modified in place (it is split with strsep() and each '=' is
 * overwritten). Every "key" or "key=value" token is passed to
 * ext4_parse_param(); empty tokens and tokens with an empty key are skipped.
 * After all tokens have been parsed, ext4_validate_options() is run.
 *
 * Returns 0 on success (including a NULL @options) or the first negative
 * errno encountered.
 */
static int parse_options(struct fs_context *fc, char *options)
{
        struct fs_parameter param;
        char *key;
        int ret;

        if (options == NULL)
                return 0;

        for (key = strsep(&options, ","); key != NULL;
             key = strsep(&options, ",")) {
                size_t v_len = 0;
                char *value;

                /* Empty token, e.g. between two adjacent commas. */
                if (*key == '\0')
                        continue;

                /* Token of the form "=value": there is no key to parse. */
                value = strchr(key, '=');
                if (value == key)
                        continue;

                param.type = fs_value_is_flag;
                param.string = NULL;

                if (value != NULL) {
                        /* Terminate the key and duplicate the value. */
                        *value++ = 0;
                        v_len = strlen(value);
                        param.string = kmemdup_nul(value, v_len,
                                                   GFP_KERNEL);
                        if (param.string == NULL)
                                return -ENOMEM;
                        param.type = fs_value_is_string;
                }

                param.key = key;
                param.size = v_len;

                ret = ext4_parse_param(fc, &param);
                kfree(param.string);
                if (ret < 0)
                        return ret;
        }

        ret = ext4_validate_options(fc);
        return ret < 0 ? ret : 0;
}

/*
 * Parse and apply the mount options stored in the on-disk superblock
 * (s_es->s_mount_opts).
 *
 * The options are parsed into a temporary context. If parsing or the
 * consistency check fails, a warning is logged and the stored options are
 * ignored; this is not reported as an error. On success the journal device
 * number and journal I/O priority, when specified, are copied into @m_ctx
 * and the parsed options are applied to @sb.
 *
 * Returns 0, or -ENOMEM if a temporary allocation fails.
 */
static int parse_apply_sb_mount_options(struct super_block *sb,
                                        struct ext4_fs_context *m_ctx)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fs_context *s_ctx;
        struct fs_context *fc = NULL;
        char *opts;
        int err;

        if (!sbi->s_es->s_mount_opts[0])
                return 0;

        opts = kstrndup(sbi->s_es->s_mount_opts,
                        sizeof(sbi->s_es->s_mount_opts),
                        GFP_KERNEL);
        if (opts == NULL)
                return -ENOMEM;

        err = -ENOMEM;
        fc = kzalloc(sizeof(*fc), GFP_KERNEL);
        if (fc == NULL)
                goto out_free;

        s_ctx = kzalloc(sizeof(*s_ctx), GFP_KERNEL);
        if (s_ctx == NULL)
                goto out_free;

        fc->fs_private = s_ctx;
        fc->s_fs_info = sbi;

        /* The consistency check only runs when parsing succeeded. */
        err = parse_options(fc, opts);
        if (err >= 0)
                err = ext4_check_opt_consistency(fc, sb);
        if (err < 0) {
                ext4_msg(sb, KERN_WARNING,
                         "failed to parse options in superblock: %s",
                         opts);
                err = 0;
                goto out_free;
        }

        if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV)
                m_ctx->journal_devnum = s_ctx->journal_devnum;
        if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)
                m_ctx->journal_ioprio = s_ctx->journal_ioprio;

        ext4_apply_options(fc, sb);
        err = 0;

out_free:
        if (fc != NULL) {
                /* Frees s_ctx too, if it was attached. */
                ext4_fc_free(fc);
                kfree(fc);
        }
        kfree(opts);
        return err;
}

/*
 * Transfer the parsed journaled-quota settings from the context to the
 * superblock info. Nothing is done when the filesystem has the quota
 * feature, or when CONFIG_QUOTA is disabled.
 *
 * For every quota type whose name was set or cleared, the name pointer moves
 * from the context into sbi->s_qf_names[] and the previous name, if any, is
 * freed through RCU.
 */
static void ext4_apply_quota_options(struct fs_context *fc,
                                     struct super_block *sb)
{
#ifdef CONFIG_QUOTA
        struct ext4_fs_context *ctx = fc->fs_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        char *new_name, *old_name;
        int type;

        if (ext4_has_feature_quota(sb))
                return;

        if (ctx->spec & EXT4_SPEC_JQUOTA) {
                for (type = 0; type < EXT4_MAXQUOTAS; type++) {
                        if ((ctx->qname_spec & (1 << type)) == 0)
                                continue;

                        new_name = ctx->s_qf_names[type]; /* May be NULL */
                        if (new_name != NULL)
                                set_opt(sb, QUOTA);

                        /* The context gives up ownership of the string. */
                        ctx->s_qf_names[type] = NULL;
                        old_name = rcu_replace_pointer(sbi->s_qf_names[type],
                                                new_name,
                                                lockdep_is_held(&sb->s_umount));
                        if (old_name != NULL)
                                kfree_rcu_mightsleep(old_name);
                }
        }

        if (ctx->spec & EXT4_SPEC_JQFMT)
                sbi->s_jquota_fmt = ctx->s_jquota_fmt;
#endif
}

/*
 * Check quota settings consistency.
 *
 * Validates the quota-related options collected in the context of @fc
 * against the current state of @sb. May clear the USRQUOTA / GRPQUOTA bits
 * in the context when a quota file name is in effect for that type.
 *
 * Returns 0 if the options are acceptable and -EINVAL (after logging the
 * reason) otherwise. Always returns 0 when CONFIG_QUOTA is disabled.
 */
static int ext4_check_quota_consistency(struct fs_context *fc,
                                        struct super_block *sb)
{
#ifdef CONFIG_QUOTA
        struct ext4_fs_context *ctx = fc->fs_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        bool quota_feature = ext4_has_feature_quota(sb);
        bool quota_loaded = sb_any_quota_loaded(sb);
        bool usr_qf_name, grp_qf_name, usrquota, grpquota;
        int quota_flags, i;

        /*
         * We do the test below only for project quotas. 'usrquota' and
         * 'grpquota' mount options are allowed even without quota feature
         * to support legacy quotas in quota files.
         */
        if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) &&
            !ext4_has_feature_project(sb)) {
                ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. "
                         "Cannot enable project quota enforcement.");
                return -EINVAL;
        }

        /*
         * Reject the case where quota is loaded, the parsed options touched
         * some quota flag, and none of the quota flags is set in the
         * resulting values.
         */
        quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
                      EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA;
        if (quota_loaded &&
            ctx->mask_s_mount_opt & quota_flags &&
            !ctx_test_mount_opt(ctx, quota_flags))
                goto err_quota_change;

        if (ctx->spec & EXT4_SPEC_JQUOTA) {

                /* Only quota types whose name was set or cleared are checked. */
                for (i = 0; i < EXT4_MAXQUOTAS; i++) {
                        if (!(ctx->qname_spec & (1 << i)))
                                continue;

                        /*
                         * With quota loaded, a name may neither appear nor
                         * disappear.
                         */
                        if (quota_loaded &&
                            !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i])
                                goto err_jquota_change;

                        /* An already stored name may be repeated, not changed. */
                        if (sbi->s_qf_names[i] && ctx->s_qf_names[i] &&
                            strcmp(get_qf_name(sb, sbi, i),
                                   ctx->s_qf_names[i]) != 0)
                                goto err_jquota_specified;
                }

                if (quota_feature) {
                        ext4_msg(NULL, KERN_INFO,
                                 "Journaled quota options ignored when "
                                 "QUOTA feature is enabled");
                        return 0;
                }
        }

        if (ctx->spec & EXT4_SPEC_JQFMT) {
                if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded)
                        goto err_jquota_change;
                if (quota_feature) {
                        ext4_msg(NULL, KERN_INFO, "Quota format mount options "
                                 "ignored when QUOTA feature is enabled");
                        return 0;
                }
        }

        /* Make sure we don't mix old and new quota format */
        usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) ||
                       ctx->s_qf_names[USRQUOTA]);
        grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) ||
                       ctx->s_qf_names[GRPQUOTA]);

        usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
                    test_opt(sb, USRQUOTA));

        grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) ||
                    test_opt(sb, GRPQUOTA));

        /*
         * A quota file name for a type overrides the plain usrquota/grpquota
         * option for that same type: the bit is cleared in the context and
         * the type no longer counts as using the plain option below.
         */
        if (usr_qf_name) {
                ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);
                usrquota = false;
        }
        if (grp_qf_name) {
                ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);
                grpquota = false;
        }

        if (usr_qf_name || grp_qf_name) {
                /*
                 * Mixing is therefore only detected across types: one type
                 * has a named file while the other uses the plain option.
                 */
                if (usrquota || grpquota) {
                        ext4_msg(NULL, KERN_ERR, "old and new quota "
                                 "format mixing");
                        return -EINVAL;
                }

                /* A named quota file needs a format, new or already stored. */
                if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) {
                        ext4_msg(NULL, KERN_ERR, "journaled quota format "
                                 "not specified");
                        return -EINVAL;
                }
        }

        return 0;

err_quota_change:
        ext4_msg(NULL, KERN_ERR,
                 "Cannot change quota options when quota turned on");
        return -EINVAL;
err_jquota_change:
        ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota "
                 "options when quota turned on");
        return -EINVAL;
err_jquota_specified:
        /* i still holds the quota type that failed the loop check above. */
        ext4_msg(NULL, KERN_ERR, "%s quota file already specified",
                 QTYPE2NAME(i));
        return -EINVAL;
#else
        return 0;
#endif
}

/*
 * Validate a parsed test_dummy_encryption option against @sb.
 *
 * Accepts the option only if the filesystem has the encrypt feature. On
 * remount the policy must equal the one already stored in the superblock
 * info; on a fresh mount it must not conflict with a policy that is already
 * stored there.
 *
 * Returns 0 if the option is absent or acceptable, -EINVAL otherwise.
 */
static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
                                            struct super_block *sb)
{
        const struct ext4_fs_context *ctx = fc->fs_private;
        const struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* Option not given: nothing to check. */
        if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))
                return 0;

        if (!ext4_has_feature_encrypt(sb)) {
                ext4_msg(NULL, KERN_WARNING,
                         "test_dummy_encryption requires encrypt feature");
                return -EINVAL;
        }

        /*
         * This mount option is just for testing, and it's not worthwhile to
         * implement the extra complexity (e.g. RCU protection) that would be
         * needed to allow it to be set or changed during remount.  We do allow
         * it to be specified during remount, but only if there is no change.
         */
        if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
                if (!fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
                                                  &ctx->dummy_enc_policy)) {
                        ext4_msg(NULL, KERN_WARNING,
                                 "Can't set or change test_dummy_encryption on remount");
                        return -EINVAL;
                }
                return 0;
        }

        /* Also make sure s_mount_opts didn't contain a conflicting value. */
        if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy) &&
            !fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
                                          &ctx->dummy_enc_policy)) {
                ext4_msg(NULL, KERN_WARNING,
                         "Conflicting test_dummy_encryption options");
                return -EINVAL;
        }

        return 0;
}

static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx,
                                             struct super_block *sb)
{
        if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) ||
            /* if already set, it was already verified to be the same */
            fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy))
                return;
        EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy;
        memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy));
        ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
}

/*
 * Validate the parsed mount options held in the fs_context before any of
 * them are applied.
 *
 * Checks, in order: options flagged as incompatible with ext2/ext3,
 * want_extra_isize against the inode size, test_dummy_encryption, the data
 * journalling mode and the dax settings (both on remount only), and finally
 * quota option consistency.
 *
 * Returns -EINVAL (after logging a message) when one of the checks made
 * here fails, a non-zero result of ext4_check_test_dummy_encryption() if it
 * reports one, and otherwise the result of ext4_check_quota_consistency().
 */
static int ext4_check_opt_consistency(struct fs_context *fc,
                                      struct super_block *sb)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        struct ext4_sb_info *sbi = fc->s_fs_info;
        int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE;
        int err;

        if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
                ext4_msg(NULL, KERN_ERR,
                         "Mount option(s) incompatible with ext2");
                return -EINVAL;
        }
        if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
                ext4_msg(NULL, KERN_ERR,
                         "Mount option(s) incompatible with ext3");
                return -EINVAL;
        }

        /*
         * The requested extra inode space may not exceed what the inode
         * size leaves beyond EXT4_GOOD_OLD_INODE_SIZE.
         */
        if (ctx->s_want_extra_isize >
            (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) {
                ext4_msg(NULL, KERN_ERR,
                         "Invalid want_extra_isize %d",
                         ctx->s_want_extra_isize);
                return -EINVAL;
        }

        err = ext4_check_test_dummy_encryption(fc, sb);
        if (err)
                return err;

        /*
         * An explicit data= option on remount must match the mode already
         * in effect; without a journal the option is dropped with a warning.
         */
        if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) {
                if (!sbi->s_journal) {
                        ext4_msg(NULL, KERN_WARNING,
                                 "Remounting file system with no journal "
                                 "so ignoring journalled data option");
                        ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS);
                } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) !=
                           test_opt(sb, DATA_FLAGS)) {
                        ext4_msg(NULL, KERN_ERR, "Cannot change data mode "
                                 "on remount");
                        return -EINVAL;
                }
        }

        if (is_remount) {
                /* dax=always is refused while the sb is in data=journal mode */
                if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
                    (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
                        ext4_msg(NULL, KERN_ERR, "can't mount with "
                                 "both data=journal and dax");
                        return -EINVAL;
                }

                /*
                 * The dax mode cannot be changed by a remount: whichever dax
                 * option is requested must be the one currently set in sbi.
                 */
                if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
                    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
                     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
/* Shared failure path, also entered by the gotos in the branches below. */
fail_dax_change_remount:
                        ext4_msg(NULL, KERN_ERR, "can't change "
                                 "dax mount option while remounting");
                        return -EINVAL;
                } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) &&
                         (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
                          (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) {
                        goto fail_dax_change_remount;
                } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) &&
                           ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
                            (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
                            !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) {
                        goto fail_dax_change_remount;
                }
        }

        return ext4_check_quota_consistency(fc, sb);
}

/*
 * Commit the options collected in the fs_context to the superblock.
 *
 * For each flag word, the bits named in the context's mask are cleared
 * first and the context's values are then OR-ed in.  Scalar settings are
 * copied only when the matching EXT4_SPEC_* bit is set in ctx->spec.
 * Quota and test_dummy_encryption options are applied last.
 */
static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        struct ext4_sb_info *sbi = fc->s_fs_info;

        sbi->s_mount_opt &= ~ctx->mask_s_mount_opt;
        sbi->s_mount_opt |= ctx->vals_s_mount_opt;
        sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2;
        sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2;
        sb->s_flags &= ~ctx->mask_s_flags;
        sb->s_flags |= ctx->vals_s_flags;

/* Copy ctx->X into sbi->X, but only if EXT4_SPEC_X is set in ctx->spec. */
#define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; })
        APPLY(s_commit_interval);
        APPLY(s_stripe);
        APPLY(s_max_batch_time);
        APPLY(s_min_batch_time);
        APPLY(s_want_extra_isize);
        APPLY(s_inode_readahead_blks);
        APPLY(s_max_dir_size_kb);
        APPLY(s_li_wait_mult);
        APPLY(s_resgid);
        APPLY(s_resuid);

#ifdef CONFIG_EXT4_DEBUG
        APPLY(s_fc_debug_max_replay);
#endif

        ext4_apply_quota_options(fc, sb);
        ext4_apply_test_dummy_encryption(ctx, sb);
}


/*
 * Cross-check the quota options gathered in the fs_context.
 *
 * When a quota file name was given for a quota type, a usrquota/grpquota
 * flag for that same type is cleared.  If either flag is still set
 * afterwards, the options are rejected.
 *
 * Returns 1 when the options are acceptable (always the case without
 * CONFIG_QUOTA), or -EINVAL after logging an error.
 */
static int ext4_validate_options(struct fs_context *fc)
{
#ifdef CONFIG_QUOTA
        struct ext4_fs_context *ctx = fc->fs_private;
        const char *user_file = ctx->s_qf_names[USRQUOTA];
        const char *group_file = ctx->s_qf_names[GRPQUOTA];

        /* No quota file names given: nothing to reconcile. */
        if (!user_file && !group_file)
                return 1;

        if (user_file && ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA))
                ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA);

        if (group_file && ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA))
                ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA);

        /* A flag that survived belongs to a type without a file name. */
        if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) ||
            ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) {
                ext4_msg(NULL, KERN_ERR, "old and new quota format mixing");
                return -EINVAL;
        }
#endif
        return 1;
}

/*
 * Append the quota-related mount options to @seq: ",jqfmt=<name>" when
 * s_jquota_fmt is non-zero, followed by the usrjquota and grpjquota file
 * names when they are set.  Emits nothing without CONFIG_QUOTA.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
                                           struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        const char *user_file, *group_file;

        if (sbi->s_jquota_fmt) {
                const char *format;

                if (sbi->s_jquota_fmt == QFMT_VFS_OLD)
                        format = "vfsold";
                else if (sbi->s_jquota_fmt == QFMT_VFS_V0)
                        format = "vfsv0";
                else if (sbi->s_jquota_fmt == QFMT_VFS_V1)
                        format = "vfsv1";
                else
                        format = "";    /* format id without a known name */
                seq_printf(seq, ",jqfmt=%s", format);
        }

        /* The quota file names are read with rcu_dereference(). */
        rcu_read_lock();
        user_file = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
        group_file = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
        if (user_file)
                seq_show_option(seq, "usrjquota", user_file);
        if (group_file)
                seq_show_option(seq, "grpjquota", group_file);
        rcu_read_unlock();
#endif
}

/*
 * Translate a mount option token into its name by scanning
 * ext4_param_specs for the first entry whose ->opt matches and whose
 * ->type is unset.  Returns NULL when the table has no such entry.
 */
static const char *token2str(int token)
{
        const struct fs_parameter_spec *entry = ext4_param_specs;

        while (entry->name) {
                if (entry->opt == token && !entry->type)
                        return entry->name;
                entry++;
        }
        return NULL;
}

/*
 * Print the mount options of @sb to @seq.
 *
 * Show an option if
 *  - it's set to a non-default value OR
 *  - if the per-sb default is different from the global default
 *
 * When @nodefs is non-zero most of the comparisons against defaults are
 * bypassed, and every entry is prefixed with a newline instead of a comma.
 *
 * Always returns 0.
 */
static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
                              int nodefs)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        int def_errors;
        const struct mount_opts *m;
        char sep = nodefs ? '\n' : ',';

/* Both helpers emit the separator first, then the (formatted) option text. */
#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)

        if (sbi->s_sb_block != 1)
                SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);

        /* Flag-style options, driven by the ext4_mount_opts table. */
        for (m = ext4_mount_opts; m->token != Opt_err; m++) {
                int want_set = m->flags & MOPT_SET;
                int opt_2 = m->flags & MOPT_2;
                unsigned int mount_opt, def_mount_opt;

                /* Only plain set/clear entries not marked MOPT_SKIP. */
                if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
                    m->flags & MOPT_SKIP)
                        continue;

                /* MOPT_2 entries live in the second mount option word. */
                if (opt_2) {
                        mount_opt = sbi->s_mount_opt2;
                        def_mount_opt = sbi->s_def_mount_opt2;
                } else {
                        mount_opt = sbi->s_mount_opt;
                        def_mount_opt = sbi->s_def_mount_opt;
                }
                /* skip if same as the default */
                if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt)))
                        continue;
                /* select Opt_noFoo vs Opt_Foo */
                if ((want_set &&
                     (mount_opt & m->mount_opt) != m->mount_opt) ||
                    (!want_set && (mount_opt & m->mount_opt)))
                        continue;
                SEQ_OPTS_PRINT("%s", token2str(m->token));
        }

        if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
            le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
                SEQ_OPTS_PRINT("resuid=%u",
                                from_kuid_munged(&init_user_ns, sbi->s_resuid));
        if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
            le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
                SEQ_OPTS_PRINT("resgid=%u",
                                from_kgid_munged(&init_user_ns, sbi->s_resgid));
        /* With nodefs, -1 matches no EXT4_ERRORS_* value tested below. */
        def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
        if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
                SEQ_OPTS_PUTS("errors=remount-ro");
        if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
                SEQ_OPTS_PUTS("errors=continue");
        if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
                SEQ_OPTS_PUTS("errors=panic");
        /* s_commit_interval is compared and scaled in units of HZ. */
        if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
                SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
        if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
                SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
        if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
                SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
        if (nodefs || sbi->s_stripe)
                SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
        /* The data mode is shown when it differs from the per-sb default. */
        if (nodefs || EXT4_MOUNT_DATA_FLAGS &
                        (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
                if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
                        SEQ_OPTS_PUTS("data=journal");
                else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
                        SEQ_OPTS_PUTS("data=ordered");
                else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
                        SEQ_OPTS_PUTS("data=writeback");
        }
        if (nodefs ||
            sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
                SEQ_OPTS_PRINT("inode_readahead_blks=%u",
                               sbi->s_inode_readahead_blks);

        if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
                       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
                SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
        if (nodefs || sbi->s_max_dir_size_kb)
                SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
        if (test_opt(sb, DATA_ERR_ABORT))
                SEQ_OPTS_PUTS("data_err=abort");

        fscrypt_show_test_dummy_encryption(seq, sep, sb);

        if (sb->s_flags & SB_INLINECRYPT)
                SEQ_OPTS_PUTS("inlinecrypt");

        /* At most one dax setting is printed; ext2 superblocks get plain "dax". */
        if (test_opt(sb, DAX_ALWAYS)) {
                if (IS_EXT2_SB(sb))
                        SEQ_OPTS_PUTS("dax");
                else
                        SEQ_OPTS_PUTS("dax=always");
        } else if (test_opt2(sb, DAX_NEVER)) {
                SEQ_OPTS_PUTS("dax=never");
        } else if (test_opt2(sb, DAX_INODE)) {
                SEQ_OPTS_PUTS("dax=inode");
        }

        /*
         * mb_optimize_scan is printed only when its state does not match
         * which side of MB_DEFAULT_LINEAR_SCAN_THRESHOLD the group count
         * falls on.
         */
        if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
                        !test_opt2(sb, MB_OPTIMIZE_SCAN)) {
                SEQ_OPTS_PUTS("mb_optimize_scan=0");
        } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD &&
                        test_opt2(sb, MB_OPTIMIZE_SCAN)) {
                SEQ_OPTS_PUTS("mb_optimize_scan=1");
        }

        ext4_show_quota_options(seq, sb);
        return 0;
}

/*
 * Print the options of the filesystem that @root belongs to, with
 * nodefs == 0 (comma separators, values equal to the defaults omitted).
 */
static int ext4_show_options(struct seq_file *seq, struct dentry *root)
{
        struct super_block *sb = root->d_sb;

        return _ext4_show_options(seq, sb, 0);
}

/*
 * seq_file show routine: prints "ro" or "rw", then the options with
 * nodefs == 1 (one per line), then a final newline.  The superblock is
 * taken from seq->private.  Returns the result of _ext4_show_options().
 */
int ext4_seq_options_show(struct seq_file *seq, void *offset)
{
        struct super_block *sb = seq->private;
        int ret;

        if (sb_rdonly(sb))
                seq_puts(seq, "ro");
        else
                seq_puts(seq, "rw");

        ret = _ext4_show_options(seq, sb, 1);
        seq_puts(seq, "\n");

        return ret;
}

/*
 * Update the superblock bookkeeping for a mount.
 *
 * If the on-disk revision level is above EXT4_MAX_SUPP_REV, an error is
 * logged and -EROFS is returned.  When @read_only is set nothing is
 * modified.  Otherwise at most one "running e2fsck is recommended" warning
 * is logged, the mount count and mount time are updated, EXT4_VALID_FS is
 * cleared in es->s_state when there is no journal, the
 * journal_needs_recovery feature (plus orphan_present when the orphan_file
 * feature is enabled) is set when there is one, and the superblock is
 * committed.
 *
 * Returns 0, -EROFS, or the result of ext4_commit_super().  With the DEBUG
 * mount option a summary line is printed on every path.
 */
static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                            int read_only)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int err = 0;

        if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
                ext4_msg(sb, KERN_ERR, "revision level too high, "
                         "forcing read-only mode");
                err = -EROFS;
                goto done;
        }
        if (read_only)
                goto done;
        /* The checks form one chain, so only the first matching warning fires. */
        if (!(sbi->s_mount_state & EXT4_VALID_FS))
                ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
                         "running e2fsck is recommended");
        else if (sbi->s_mount_state & EXT4_ERROR_FS)
                ext4_msg(sb, KERN_WARNING,
                         "warning: mounting fs with errors, "
                         "running e2fsck is recommended");
        /* s_max_mnt_count is interpreted as signed: only positive values count. */
        else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
                 le16_to_cpu(es->s_mnt_count) >=
                 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
                ext4_msg(sb, KERN_WARNING,
                         "warning: maximal mount count reached, "
                         "running e2fsck is recommended");
        else if (le32_to_cpu(es->s_checkinterval) &&
                 (ext4_get_tstamp(es, s_lastcheck) +
                  le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
                ext4_msg(sb, KERN_WARNING,
                         "warning: checktime reached, "
                         "running e2fsck is recommended");
        if (!sbi->s_journal)
                es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
        /* A zero maximum mount count is replaced by the default. */
        if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
                es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
        le16_add_cpu(&es->s_mnt_count, 1);
        ext4_update_tstamp(es, s_mtime);
        if (sbi->s_journal) {
                ext4_set_feature_journal_needs_recovery(sb);
                if (ext4_has_feature_orphan_file(sb))
                        ext4_set_feature_orphan_present(sb);
        }

        err = ext4_commit_super(sb);
done:
        if (test_opt(sb, DEBUG))
                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
                                "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
                        sb->s_blocksize,
                        sbi->s_groups_count,
                        EXT4_BLOCKS_PER_GROUP(sb),
                        EXT4_INODES_PER_GROUP(sb),
                        sbi->s_mount_opt, sbi->s_mount_opt2);
        return err;
}

/*
 * Grow the array of flex group pointers so that it covers @ngroup block
 * groups.
 *
 * Returns 0 when s_log_groups_per_flex is zero, when the array already has
 * enough entries, or after a successful resize.  Returns -ENOMEM when an
 * allocation fails; in that case everything allocated by this call is
 * freed and the existing array is left as it was.
 */
int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct flex_groups **old_groups, **new_groups;
        int size, i, j;

        if (!sbi->s_log_groups_per_flex)
                return 0;

        /* Number of flex groups needed to hold groups 0 .. ngroup - 1. */
        size = ext4_flex_group(sbi, ngroup - 1) + 1;
        if (size <= sbi->s_flex_groups_allocated)
                return 0;

        new_groups = kvzalloc(roundup_pow_of_two(size *
                              sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
        if (!new_groups) {
                ext4_msg(sb, KERN_ERR,
                         "not enough memory for %d flex group pointers", size);
                return -ENOMEM;
        }
        /*
         * Allocate only the new tail entries; the slots below
         * s_flex_groups_allocated are copied from the old array further down.
         */
        for (i = sbi->s_flex_groups_allocated; i < size; i++) {
                new_groups[i] = kvzalloc(roundup_pow_of_two(
                                         sizeof(struct flex_groups)),
                                         GFP_KERNEL);
                if (!new_groups[i]) {
                        /* Unwind just the entries allocated by this call. */
                        for (j = sbi->s_flex_groups_allocated; j < i; j++)
                                kvfree(new_groups[j]);
                        kvfree(new_groups);
                        ext4_msg(sb, KERN_ERR,
                                 "not enough memory for %d flex groups", size);
                        return -ENOMEM;
                }
        }
        /* Carry the existing flex group pointers over into the new array. */
        rcu_read_lock();
        old_groups = rcu_dereference(sbi->s_flex_groups);
        if (old_groups)
                memcpy(new_groups, old_groups,
                       (sbi->s_flex_groups_allocated *
                        sizeof(struct flex_groups *)));
        rcu_read_unlock();
        /*
         * Publish the new array, then hand the old pointer array (not the
         * flex_groups it pointed to, which the new array now references)
         * to ext4_kvfree_array_rcu().
         */
        rcu_assign_pointer(sbi->s_flex_groups, new_groups);
        sbi->s_flex_groups_allocated = size;
        if (old_groups)
                ext4_kvfree_array_rcu(old_groups);
        return 0;
}

/*
 * Set up the per-flex-group counters from the group descriptors.
 *
 * s_log_groups_per_flex is taken from the on-disk superblock; values
 * outside 1..31 disable flex groups (the field is reset to 0) and count as
 * success.  Otherwise the flex group array is sized for all groups and each
 * group's free inode, free cluster and used directory counts are added to
 * the flex group it belongs to.
 *
 * Returns 1 on success and 0 if the flex group array could not be
 * allocated.
 */
static int ext4_fill_flex_info(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t group;

        sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
        if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
                sbi->s_log_groups_per_flex = 0;
                return 1;
        }

        if (ext4_alloc_flex_bg_array(sb, sbi->s_groups_count))
                return 0;

        for (group = 0; group < sbi->s_groups_count; group++) {
                struct ext4_group_desc *desc;
                struct flex_groups *flex;
                ext4_group_t flex_idx;

                desc = ext4_get_group_desc(sb, group, NULL);
                flex_idx = ext4_flex_group(sbi, group);
                flex = sbi_array_rcu_deref(sbi, s_flex_groups, flex_idx);

                atomic_add(ext4_free_inodes_count(sb, desc),
                           &flex->free_inodes);
                atomic64_add(ext4_free_group_clusters(sb, desc),
                             &flex->free_clusters);
                atomic_add(ext4_used_dirs_count(sb, desc), &flex->used_dirs);
        }

        return 1;
}

/*
 * Compute the checksum of group descriptor @gdp for group @block_group.
 *
 * With metadata_csum the low 16 bits of an ext4_chksum() over the
 * little-endian group number and the descriptor are used, with a zero
 * stand-in fed in place of the bg_checksum field.  Otherwise, if the
 * gdt_csum feature is set, a crc16 over the filesystem UUID, the group
 * number and the descriptor (skipping bg_checksum) is used.  With neither
 * feature the result is 0.
 */
static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
                                   struct ext4_group_desc *gdp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        __le32 le_group = cpu_to_le32(block_group);
        int offset = offsetof(struct ext4_group_desc, bg_checksum);
        __u16 crc = 0;

        if (ext4_has_metadata_csum(sbi->s_sb)) {
                /* Use new metadata_csum algorithm */
                __u16 zero_csum = 0;
                __u32 csum32;

                csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
                                     sizeof(le_group));
                /* everything in front of the checksum field */
                csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
                /* the checksum field itself is hashed as zero */
                csum32 = ext4_chksum(sbi, csum32, (__u8 *)&zero_csum,
                                     sizeof(zero_csum));
                offset += sizeof(zero_csum);
                /* whatever follows the checksum field, if anything */
                if (offset < sbi->s_desc_size)
                        csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
                                             sbi->s_desc_size - offset);

                crc = csum32 & 0xFFFF;
                return cpu_to_le16(crc);
        }

        /* old crc16 code */
        if (!ext4_has_feature_gdt_csum(sb))
                return 0;

        crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
        crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
        crc = crc16(crc, (__u8 *)gdp, offset);
        /* step over the checksum field */
        offset += sizeof(gdp->bg_checksum);
        /* only 64bit filesystems checksum the rest of the descriptor */
        if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size)
                crc = crc16(crc, (__u8 *)gdp + offset,
                            sbi->s_desc_size - offset);

        return cpu_to_le16(crc);
}

/*
 * Check the stored checksum of group descriptor @gdp.
 *
 * Returns 1 when group descriptor checksums are not in use or the stored
 * value matches the computed one, and 0 on a mismatch.
 */
int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
                                struct ext4_group_desc *gdp)
{
        if (!ext4_has_group_desc_csum(sb))
                return 1;

        if (gdp->bg_checksum == ext4_group_desc_csum(sb, block_group, gdp))
                return 1;

        return 0;
}

/*
 * Recompute and store the checksum of group descriptor @gdp.  Does nothing
 * when group descriptor checksums are not in use.
 */
void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
                              struct ext4_group_desc *gdp)
{
        if (ext4_has_group_desc_csum(sb))
                gdp->bg_checksum = ext4_group_desc_csum(sb, block_group,
                                                        gdp);
}

/*
 * Called at mount-time, super-block is locked.
 *
 * Sanity-check every group descriptor: the block bitmap, inode bitmap and
 * inode table must not overlap the superblock or the group descriptor
 * blocks, must lie inside the permitted block range, and the descriptor
 * checksum must verify.  Overlap and checksum failures are only logged on
 * a read-only mount; an out-of-range location always fails the check.
 *
 * If @first_not_zeroed is non-NULL it receives the number of the first
 * group whose descriptor lacks EXT4_BG_INODE_ZEROED, or s_groups_count if
 * there is no such group.
 *
 * Returns 1 if the descriptors are acceptable, 0 otherwise.
 */
static int ext4_check_descriptors(struct super_block *sb,
                                  ext4_fsblk_t sb_block,
                                  ext4_group_t *first_not_zeroed)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
        ext4_fsblk_t last_block;
        ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
        ext4_fsblk_t block_bitmap;
        ext4_fsblk_t inode_bitmap;
        ext4_fsblk_t inode_table;
        int flexbg_flag = 0;
        /* grp == s_groups_count means "no un-zeroed group found yet" */
        ext4_group_t i, grp = sbi->s_groups_count;

        if (ext4_has_feature_flex_bg(sb))
                flexbg_flag = 1;

        ext4_debug("Checking group descriptors");

        for (i = 0; i < sbi->s_groups_count; i++) {
                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);

                /*
                 * With flex_bg (and for the last group) the upper bound is
                 * the end of the filesystem; otherwise it is the end of
                 * this group.
                 */
                if (i == sbi->s_groups_count - 1 || flexbg_flag)
                        last_block = ext4_blocks_count(sbi->s_es) - 1;
                else
                        last_block = first_block +
                                (EXT4_BLOCKS_PER_GROUP(sb) - 1);

                /* Remember the first group without EXT4_BG_INODE_ZEROED. */
                if ((grp == sbi->s_groups_count) &&
                   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
                        grp = i;

                /* Block bitmap location. */
                block_bitmap = ext4_block_bitmap(sb, gdp);
                if (block_bitmap == sb_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Block bitmap for group %u overlaps "
                                 "superblock", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (block_bitmap >= sb_block + 1 &&
                    block_bitmap <= last_bg_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Block bitmap for group %u overlaps "
                                 "block group descriptors", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (block_bitmap < first_block || block_bitmap > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Block bitmap for group %u not in group "
                               "(block %llu)!", i, block_bitmap);
                        return 0;
                }
                /* Inode bitmap location. */
                inode_bitmap = ext4_inode_bitmap(sb, gdp);
                if (inode_bitmap == sb_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Inode bitmap for group %u overlaps "
                                 "superblock", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (inode_bitmap >= sb_block + 1 &&
                    inode_bitmap <= last_bg_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Inode bitmap for group %u overlaps "
                                 "block group descriptors", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (inode_bitmap < first_block || inode_bitmap > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Inode bitmap for group %u not in group "
                               "(block %llu)!", i, inode_bitmap);
                        return 0;
                }
                /* Inode table location; its full length must fit the range. */
                inode_table = ext4_inode_table(sb, gdp);
                if (inode_table == sb_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Inode table for group %u overlaps "
                                 "superblock", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (inode_table >= sb_block + 1 &&
                    inode_table <= last_bg_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Inode table for group %u overlaps "
                                 "block group descriptors", i);
                        if (!sb_rdonly(sb))
                                return 0;
                }
                if (inode_table < first_block ||
                    inode_table + sbi->s_itb_per_group - 1 > last_block) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Inode table for group %u not in group "
                               "(block %llu)!", i, inode_table);
                        return 0;
                }
                /* The checksum is verified with the group lock held. */
                ext4_lock_group(sb, i);
                if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Checksum for group %u failed (%u!=%u)",
                                 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
                                     gdp)), le16_to_cpu(gdp->bg_checksum));
                        if (!sb_rdonly(sb)) {
                                ext4_unlock_group(sb, i);
                                return 0;
                        }
                }
                ext4_unlock_group(sb, i);
                /* Without flex_bg the lower bound advances group by group. */
                if (!flexbg_flag)
                        first_block += EXT4_BLOCKS_PER_GROUP(sb);
        }
        if (NULL != first_not_zeroed)
                *first_not_zeroed = grp;
        return 1;
}

/*
 * Maximal file size for extent-mapped files.
 *
 * The logical block number at s_maxbytes has to fit the on-disk extent
 * format, a sector_t, and the vfs i_blocks.  The ext4 inode has 48 bits of
 * i_blocks in fs-block units, so that is not the limiting factor.
 *
 * The extent length is a limiting factor, though: an extent is stored as a
 * start block plus a length, and the length of an extent covering the whole
 * file is one larger than the highest block number (block 0 counts too).
 * s_maxbytes is therefore lowered by one fs block.
 *
 * Note, this does *not* consider any metadata overhead for vfs i_blocks.
 *
 * @blkbits:        log2 of the filesystem block size
 * @has_huge_files: zero if i_blocks is limited to 2^32 - 1 512-byte sectors
 */
static loff_t ext4_max_size(int blkbits, int has_huge_files)
{
        loff_t limit = MAX_LFS_FILESIZE;
        loff_t max_bytes;

        BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));

        if (!has_huge_files) {
                /*
                 * 2^32 - 1 sectors, converted to whole fs blocks and then
                 * to bytes.
                 */
                limit = (1LL << 32) - 1;
                limit >>= (blkbits - 9);
                limit <<= blkbits;
        }

        /*
         * ee_block is a 32-bit container for the extent start.  Stopping
         * one fs block short lets ee_len cover an extent spanning the
         * whole file.
         */
        max_bytes = (1LL << 32) - 1;
        max_bytes <<= blkbits;

        /* Sanity check against vm- & vfs- imposed limits */
        return max_bytes > limit ? limit : max_bytes;
}

/*
 * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
 * We need to be 1 filesystem block less than the 2^48 sector limit.
 *
 * @bits:           log2 of the filesystem block size
 * @has_huge_files: selects which of the two i_blocks limits applies
 *
 * Returns the size in bytes, capped at MAX_LFS_FILESIZE.
 */
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
        loff_t upper_limit, res = EXT4_NDIR_BLOCKS;
        int meta_blocks;
        /* number of 4-byte block pointers that fit in one block */
        unsigned int ppb = 1 << (bits - 2);

        /*
         * This is calculated to be the largest file size for a dense, block
         * mapped file such that the file's total number of 512-byte sectors,
         * including data and all indirect blocks, does not exceed (2^48 - 1).
         *
         * __u32 i_blocks_lo and __u16 i_blocks_high represent the total
         * number of 512-byte sectors of the file.
         */
        if (!has_huge_files) {
                /*
                 * !has_huge_files implies that the inode i_blocks field
                 * counts 512-byte sectors in 32 bits, so the limit is
                 * 2^32 - 1 sectors.
                 */
                upper_limit = (1LL << 32) - 1;

                /* total blocks in file system block size */
                upper_limit >>= (bits - 9);

        } else {
                /*
                 * We use 48 bit ext4_inode i_blocks
                 * With EXT4_HUGE_FILE_FL set the i_blocks
                 * represent total number of blocks in
                 * file system block size
                 */
                upper_limit = (1LL << 48) - 1;

        }

        /* Compute how many blocks we can address by block tree */
        res += ppb;
        res += ppb * ppb;
        res += ((loff_t)ppb) * ppb * ppb;
        /* Compute how many metadata blocks are needed */
        meta_blocks = 1;
        meta_blocks += 1 + ppb;
        meta_blocks += 1 + ppb + ppb * ppb;
        /* Does block tree limit file size? */
        if (res + meta_blocks <= upper_limit)
                goto check_lfs;

        /*
         * The i_blocks limit is the tighter one: start from it and subtract
         * the metadata blocks needed to map that many data blocks.
         */
        res = upper_limit;
        /* How many metadata blocks are needed for addressing upper_limit? */
        upper_limit -= EXT4_NDIR_BLOCKS;
        /* indirect blocks */
        meta_blocks = 1;
        upper_limit -= ppb;
        /* double indirect blocks */
        if (upper_limit < ppb * ppb) {
                meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb);
                res -= meta_blocks;
                goto check_lfs;
        }
        meta_blocks += 1 + ppb;
        upper_limit -= ppb * ppb;
        /* triple indirect blocks for the rest */
        meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) +
                DIV_ROUND_UP_ULL(upper_limit, ppb*ppb);
        res -= meta_blocks;
check_lfs:
        /* convert the block count to bytes */
        res <<= bits;
        if (res > MAX_LFS_FILESIZE)
                res = MAX_LFS_FILESIZE;

        return res;
}

/*
 * Return the block number of group descriptor block @nr.
 *
 * Without meta_bg, or for descriptor blocks below s_first_meta_bg, the
 * descriptor blocks follow @logical_sb_block consecutively.  Otherwise the
 * descriptor block sits at the start of the first group it describes,
 * after that group's superblock copy if it has one.
 */
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
                                   ext4_fsblk_t logical_sb_block, int nr)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
        ext4_group_t bg;
        int has_super;

        if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
                return logical_sb_block + nr + 1;

        /* first group covered by descriptor block @nr */
        bg = sbi->s_desc_per_block * nr;
        has_super = ext4_bg_has_super(sb, bg) ? 1 : 0;

        /*
         * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
         * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
         * on modern mke2fs or blksize > 1k on older mke2fs) then we must
         * compensate.
         */
        if (sb->s_blocksize == 1024 && nr == 0 &&
            le32_to_cpu(sbi->s_es->s_first_data_block) == 0)
                has_super++;

        return has_super + ext4_group_first_block_no(sb, bg);
}

/**
 * ext4_get_stripe_size: Get the stripe size.
 * @sbi: In memory super block info
 *
 * Candidates are tried in this order: the stripe given as a mount option,
 * the superblock's RAID stripe width, then the superblock's RAID stride.
 * The first one that is non-zero and not larger than the blocks per group
 * wins, because the allocator needs the stripe to fit within a group.
 * If no candidate qualifies, or the winner is 1, the result is 0.
 *
 */
static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
{
        unsigned long sb_stride = le16_to_cpu(sbi->s_es->s_raid_stride);
        unsigned long sb_width =
                        le32_to_cpu(sbi->s_es->s_raid_stripe_width);
        int stripe = 0;

        if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
                stripe = sbi->s_stripe;
        else if (sb_width && sb_width <= sbi->s_blocks_per_group)
                stripe = sb_width;
        else if (sb_stride && sb_stride <= sbi->s_blocks_per_group)
                stripe = sb_stride;

        /*
         * If the stripe width is 1, this makes no sense and
         * we set it to 0 to turn off stripe handling code.
         */
        return stripe <= 1 ? 0 : stripe;
}

/*
 * Check whether this filesystem can be mounted based on
 * the features present and the RDONLY/RDWR mount requested.
 * Returns 1 if this filesystem can be mounted as requested,
 * 0 if it cannot be.
 *
 * @readonly is non-zero when a read-only mount is requested.  Note that
 * a read-write request on a filesystem carrying the readonly feature
 * still returns 1, but sets SB_RDONLY in sb->s_flags as a side effect.
 */
int ext4_feature_set_ok(struct super_block *sb, int readonly)
{
        /* Unknown incompat features block any kind of mount. */
        if (ext4_has_unknown_ext4_incompat_features(sb)) {
                ext4_msg(sb, KERN_ERR,
                        "Couldn't mount because of "
                        "unsupported optional features (%x)",
                        (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
                        ~EXT4_FEATURE_INCOMPAT_SUPP));
                return 0;
        }

#if !IS_ENABLED(CONFIG_UNICODE)
        if (ext4_has_feature_casefold(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "Filesystem with casefold feature cannot be "
                         "mounted without CONFIG_UNICODE");
                return 0;
        }
#endif

        /* The remaining checks only matter for read-write mounts. */
        if (readonly)
                return 1;

        if (ext4_has_feature_readonly(sb)) {
                ext4_msg(sb, KERN_INFO, "filesystem is read-only");
                sb->s_flags |= SB_RDONLY;
                return 1;
        }

        /* Check that feature set is OK for a read-write mount */
        if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
                ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
                         "unsupported optional features (%x)",
                         (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
                                ~EXT4_FEATURE_RO_COMPAT_SUPP));
                return 0;
        }
        if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
                /* No trailing newline, matching the other ext4_msg() calls. */
                ext4_msg(sb, KERN_ERR,
                         "Can't support bigalloc feature without "
                         "extents feature");
                return 0;
        }

#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
        /* Read-only requests already returned above, so this is RDWR only. */
        if (ext4_has_feature_quota(sb) || ext4_has_feature_project(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
                return 0;
        }
#endif  /* CONFIG_QUOTA */
        return 1;
}

/*
 * Timer callback for sbi->s_err_report.  It is meant to run once a day
 * while the file system has errors logged: it prints the error count and
 * the first/last error records kept in the superblock, then re-arms the
 * timer for 24 hours later.
 */
static void print_daily_error_info(struct timer_list *t)
{
        struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
        struct ext4_super_block *es = sbi->s_es;
        struct super_block *sb = sbi->s_sb;

        if (es->s_error_count) {
                /* fsck newer than v1.41.13 is needed to clean this condition. */
                ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
                         le32_to_cpu(es->s_error_count));
        }

        /* First recorded error: function:line, then optional inode/block. */
        if (es->s_first_error_time) {
                printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
                       sb->s_id,
                       ext4_get_tstamp(es, s_first_error_time),
                       (int) sizeof(es->s_first_error_func),
                       es->s_first_error_func,
                       le32_to_cpu(es->s_first_error_line));
                if (es->s_first_error_ino) {
                        printk(KERN_CONT ": inode %u",
                               le32_to_cpu(es->s_first_error_ino));
                }
                if (es->s_first_error_block) {
                        printk(KERN_CONT ": block %llu", (unsigned long long)
                               le64_to_cpu(es->s_first_error_block));
                }
                printk(KERN_CONT "\n");
        }

        /* Most recent recorded error, same layout as above. */
        if (es->s_last_error_time) {
                printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
                       sb->s_id,
                       ext4_get_tstamp(es, s_last_error_time),
                       (int) sizeof(es->s_last_error_func),
                       es->s_last_error_func,
                       le32_to_cpu(es->s_last_error_line));
                if (es->s_last_error_ino) {
                        printk(KERN_CONT ": inode %u",
                               le32_to_cpu(es->s_last_error_ino));
                }
                if (es->s_last_error_block) {
                        printk(KERN_CONT ": block %llu", (unsigned long long)
                               le64_to_cpu(es->s_last_error_block));
                }
                printk(KERN_CONT "\n");
        }

        /* Once a day */
        mod_timer(&sbi->s_err_report, jiffies + 24 * 60 * 60 * HZ);
}

/*
 * Run one step of the lazy-init request @elr.
 *
 * In EXT4_LI_MODE_PREFETCH_BBITMAP mode this prefetches block bitmaps
 * starting at elr->lr_next_group.  Otherwise it finds the next group,
 * starting at elr->lr_next_group, whose descriptor does not have
 * EXT4_BG_INODE_ZEROED set and runs ext4_init_inode_table() on it.
 *
 * Returns 0 when the request should stay on the list.  Any non-zero
 * value makes the caller, ext4_lazyinit_thread(), remove the request.
 */
static int ext4_run_li_request(struct ext4_li_request *elr)
{
        struct ext4_group_desc *gdp = NULL;
        struct super_block *sb = elr->lr_super;
        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
        ext4_group_t group = elr->lr_next_group;
        unsigned int prefetch_ios = 0;
        int ret = 0;
        int nr = EXT4_SB(sb)->s_mb_prefetch;
        u64 start_time;

        if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
                elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios);
                ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr);
                trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr);
                /*
                 * The next group did not move past the one we started from.
                 * NOTE(review): presumably the prefetch wrapped around or
                 * covered every group; confirm against ext4_mb_prefetch().
                 */
                if (group >= elr->lr_next_group) {
                        ret = 1;
                        /*
                         * Prefetching is over.  If inode tables still need
                         * zeroing on a writable fs with INIT_INODE_TABLE,
                         * keep the request and switch it to ITABLE mode.
                         */
                        if (elr->lr_first_not_zeroed != ngroups &&
                            !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
                                elr->lr_next_group = elr->lr_first_not_zeroed;
                                elr->lr_mode = EXT4_LI_MODE_ITABLE;
                                ret = 0;
                        }
                }
                return ret;
        }

        /* Skip forward to the first group whose inode table is not zeroed. */
        for (; group < ngroups; group++) {
                gdp = ext4_get_group_desc(sb, group, NULL);
                if (!gdp) {
                        /* No descriptor available: give up on this request. */
                        ret = 1;
                        break;
                }

                if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
                        break;
        }

        /* Every remaining group is already zeroed: the request is done. */
        if (group >= ngroups)
                ret = 1;

        if (!ret) {
                start_time = ktime_get_real_ns();
                /* The last argument is 1 only while lr_timeout is still 0. */
                ret = ext4_init_inode_table(sb, group,
                                            elr->lr_timeout ? 0 : 1);
                trace_ext4_lazy_itable_init(sb, group);
                /*
                 * No timeout computed yet: derive it from the time this
                 * run took, scaled by s_li_wait_mult.
                 */
                if (elr->lr_timeout == 0) {
                        elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
                                EXT4_SB(elr->lr_super)->s_li_wait_mult);
                }
                elr->lr_next_sched = jiffies + elr->lr_timeout;
                elr->lr_next_group = group + 1;
        }
        return ret;
}

/*
 * Unlink @elr from the request list, clear the s_li_request pointer of
 * the superblock it belongs to, and free it.  A NULL @elr is a no-op.
 * Should be called with li_list_mtx held.
 */
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
        if (elr == NULL)
                return;

        list_del(&elr->lr_request);
        EXT4_SB(elr->lr_super)->s_li_request = NULL;

        kfree(elr);
}

/*
 * Drop the lazy-init request registered for @sb, if there is one.
 * Takes ext4_li_mtx, then li_list_mtx.  Nothing is done when
 * ext4_li_info does not exist.
 */
static void ext4_unregister_li_request(struct super_block *sb)
{
        mutex_lock(&ext4_li_mtx);
        if (ext4_li_info) {
                mutex_lock(&ext4_li_info->li_list_mtx);
                ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
                mutex_unlock(&ext4_li_info->li_list_mtx);
        }
        mutex_unlock(&ext4_li_mtx);
}

static struct task_struct *ext4_lazyinit_task;

/*
 * This is the function where the ext4lazyinit thread lives. It walks
 * through the request list searching for the next scheduled filesystem.
 * When such a fs is found, it runs the lazy initialization request
 * (ext4_run_li_request) and keeps track of the time spent in this
 * function. Based on that time we compute the next schedule time of
 * the request. When walking through the list is complete, it computes
 * the next waking time and puts itself to sleep.
 *
 * @arg is the struct ext4_lazy_init the thread works on.  The thread
 * leaves its loop when the request list is empty or when
 * kthread_should_stop() reports a stop request; it frees ext4_li_info
 * before returning.  The return value is always 0.
 */
static int ext4_lazyinit_thread(void *arg)
{
        struct ext4_lazy_init *eli = arg;
        struct list_head *pos, *n;
        struct ext4_li_request *elr;
        unsigned long next_wakeup, cur;

        BUG_ON(NULL == eli);
        set_freezable();

cont_thread:
        while (true) {
                /* Sentinel: no request has reported a wakeup time yet. */
                next_wakeup = MAX_JIFFY_OFFSET;

                mutex_lock(&eli->li_list_mtx);
                if (list_empty(&eli->li_request_list)) {
                        mutex_unlock(&eli->li_list_mtx);
                        goto exit_thread;
                }
                list_for_each_safe(pos, n, &eli->li_request_list) {
                        int err = 0;
                        int progress = 0;
                        elr = list_entry(pos, struct ext4_li_request,
                                         lr_request);

                        /* Not due yet: only remember the earliest deadline. */
                        if (time_before(jiffies, elr->lr_next_sched)) {
                                if (time_before(elr->lr_next_sched, next_wakeup))
                                        next_wakeup = elr->lr_next_sched;
                                continue;
                        }
                        if (down_read_trylock(&elr->lr_super->s_umount)) {
                                if (sb_start_write_trylock(elr->lr_super)) {
                                        progress = 1;
                                        /*
                                         * We hold sb->s_umount, sb can not
                                         * be removed from the list, it is
                                         * now safe to drop li_list_mtx
                                         */
                                        mutex_unlock(&eli->li_list_mtx);
                                        err = ext4_run_li_request(elr);
                                        sb_end_write(elr->lr_super);
                                        mutex_lock(&eli->li_list_mtx);
                                        /*
                                         * The list lock was dropped above,
                                         * so re-read the successor instead
                                         * of trusting the saved one.
                                         */
                                        n = pos->next;
                                }
                                up_read((&elr->lr_super->s_umount));
                        }
                        /* error, remove the lazy_init job */
                        if (err) {
                                ext4_remove_li_request(elr);
                                continue;
                        }
                        /*
                         * Neither trylock succeeded, so the request did not
                         * run: retry it after a random delay.
                         */
                        if (!progress) {
                                elr->lr_next_sched = jiffies +
                                        get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
                        }
                        if (time_before(elr->lr_next_sched, next_wakeup))
                                next_wakeup = elr->lr_next_sched;
                }
                mutex_unlock(&eli->li_list_mtx);

                try_to_freeze();

                cur = jiffies;
                /*
                 * A request is already due, or no wakeup time was recorded
                 * at all: rescan right away instead of sleeping.
                 */
                if ((time_after_eq(cur, next_wakeup)) ||
                    (MAX_JIFFY_OFFSET == next_wakeup)) {
                        cond_resched();
                        continue;
                }

                schedule_timeout_interruptible(next_wakeup - cur);

                /* Asked to stop: drop every pending request and exit. */
                if (kthread_should_stop()) {
                        ext4_clear_request_list();
                        goto exit_thread;
                }
        }

exit_thread:
        /*
         * It looks like the request list is empty, but we need
         * to check it under the li_list_mtx lock, to prevent any
         * additions into it, and of course we should lock ext4_li_mtx
         * to atomically free the list and ext4_li_info, because at
         * this point another ext4 filesystem could be registering
         * new one.
         */
        mutex_lock(&ext4_li_mtx);
        mutex_lock(&eli->li_list_mtx);
        if (!list_empty(&eli->li_request_list)) {
                /* A request was added in the meantime: keep running. */
                mutex_unlock(&eli->li_list_mtx);
                mutex_unlock(&ext4_li_mtx);
                goto cont_thread;
        }
        mutex_unlock(&eli->li_list_mtx);
        kfree(ext4_li_info);
        ext4_li_info = NULL;
        mutex_unlock(&ext4_li_mtx);

        return 0;
}

static void ext4_clear_request_list(void)
{
        struct list_head *pos, *n;
        struct ext4_li_request *elr;

        mutex_lock(&ext4_li_info->li_list_mtx);
        list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
                elr = list_entry(pos, struct ext4_li_request,
                                 lr_request);
                ext4_remove_li_request(elr);
        }
        mutex_unlock(&ext4_li_info->li_list_mtx);
}

/*
 * Start the "ext4lazyinit" kthread with ext4_li_info as its argument.
 *
 * On success EXT4_LAZYINIT_RUNNING is set in li_state and 0 is returned.
 * On failure the queued requests and ext4_li_info are freed, the error
 * is logged, and the error code from kthread_run() is returned.
 */
static int ext4_run_lazyinit_thread(void)
{
        int err;

        ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
                                         ext4_li_info, "ext4lazyinit");
        if (!IS_ERR(ext4_lazyinit_task)) {
                ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
                return 0;
        }

        err = PTR_ERR(ext4_lazyinit_task);
        ext4_clear_request_list();
        kfree(ext4_li_info);
        ext4_li_info = NULL;
        printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
                         "initialization thread\n",
                         err);
        return err;
}

/*
 * Check whether it makes sense to run the itable init thread or not.
 * If there is at least one group whose inode table is not yet zeroed,
 * return the number of the first such group.  Otherwise, or when the
 * filesystem has no group descriptor checksums, return the total
 * number of groups.  Groups whose descriptor cannot be obtained are
 * skipped.
 */
static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
{
        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
        ext4_group_t group;

        if (!ext4_has_group_desc_csum(sb))
                return ngroups;

        for (group = 0; group < ngroups; group++) {
                struct ext4_group_desc *gdp;

                gdp = ext4_get_group_desc(sb, group, NULL);
                if (gdp && !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
                        break;
        }

        return group;
}

/*
 * Allocate and initialise the global lazy-init state and publish it in
 * ext4_li_info.  Returns 0 on success or -ENOMEM if allocation fails.
 */
static int ext4_li_info_new(void)
{
        struct ext4_lazy_init *eli = kzalloc(sizeof(*eli), GFP_KERNEL);

        if (eli == NULL)
                return -ENOMEM;

        mutex_init(&eli->li_list_mtx);
        INIT_LIST_HEAD(&eli->li_request_list);
        eli->li_state |= EXT4_LAZYINIT_QUIT;

        ext4_li_info = eli;
        return 0;
}

/*
 * Allocate a lazy-init request for @sb.  @start is recorded as the first
 * group whose inode table is not zeroed.  The request starts in block
 * bitmap prefetch mode unless NO_PREFETCH_BLOCK_BITMAPS is set, in which
 * case it starts in inode table mode at group @start.
 *
 * Returns the new request, or NULL if allocation fails.
 */
static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
                                            ext4_group_t start)
{
        struct ext4_li_request *elr = kzalloc(sizeof(*elr), GFP_KERNEL);

        if (elr == NULL)
                return NULL;

        elr->lr_super = sb;
        elr->lr_first_not_zeroed = start;

        if (!test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) {
                elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
        } else {
                elr->lr_mode = EXT4_LI_MODE_ITABLE;
                elr->lr_next_group = start;
        }

        /*
         * Pick a random first schedule time so that requests from
         * different filesystems are spread out.
         */
        elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);

        return elr;
}

/*
 * Register a lazy-init request for @sb, creating the global lazy-init
 * state and starting the lazyinit thread when they do not exist yet.
 * @first_not_zeroed is the first group whose inode table is not zeroed,
 * or the group count when there is none.
 *
 * If @sb already has a request, only its timeout is reset.  Nothing is
 * registered for a read-only fs, nor when block bitmap prefetch is off
 * and there is no inode table work to do.
 *
 * Returns 0 on success (including the "nothing to do" cases) or a
 * negative error code.
 */
int ext4_register_li_request(struct super_block *sb,
                             ext4_group_t first_not_zeroed)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t ngroups = sbi->s_groups_count;
        struct ext4_li_request *elr = NULL;
        int ret = 0;

        mutex_lock(&ext4_li_mtx);

        if (sbi->s_li_request) {
                /*
                 * Reset timeout so it can be computed again, because
                 * s_li_wait_mult might have changed.
                 */
                sbi->s_li_request->lr_timeout = 0;
                goto out;
        }

        if (sb_rdonly(sb))
                goto out;
        if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
            (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))
                goto out;

        elr = ext4_li_request_new(sb, first_not_zeroed);
        if (elr == NULL) {
                ret = -ENOMEM;
                goto out;
        }

        if (ext4_li_info == NULL) {
                ret = ext4_li_info_new();
                if (ret)
                        goto out;
        }

        mutex_lock(&ext4_li_info->li_list_mtx);
        list_add(&elr->lr_request, &ext4_li_info->li_request_list);
        mutex_unlock(&ext4_li_info->li_list_mtx);

        sbi->s_li_request = elr;
        /*
         * The request now belongs to the list; from here on it is removed
         * and freed by ext4_clear_request_list, so forget our pointer to
         * keep the error path below from freeing it.
         */
        elr = NULL;

        if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING))
                ret = ext4_run_lazyinit_thread();

out:
        mutex_unlock(&ext4_li_mtx);
        if (ret)
                kfree(elr);
        return ret;
}

/*
 * Stop the lazyinit thread.  We do not need to lock anything since this
 * is called on module unload.  If the thread exited earlier there is
 * nothing to be done.
 */
static void ext4_destroy_lazyinit_thread(void)
{
        if (ext4_li_info && ext4_lazyinit_task)
                kthread_stop(ext4_lazyinit_task);
}

/*
 * Bring the journal's checksum and async-commit feature flags in line
 * with the mount options.  Existing checksum flags (v1, v2, v3) are
 * cleared first; v3 is then selected when the fs has metadata checksums,
 * v1 otherwise.
 *
 * Returns the result of jbd2_journal_set_features() when flags had to be
 * set, or 1 when nothing needed setting.
 */
static int set_journal_csum_feature_set(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int compat = 0;
        int incompat = 0;
        int ret = 1;

        if (ext4_has_metadata_csum(sb))
                incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;       /* checksum v3 */
        else
                compat = JBD2_FEATURE_COMPAT_CHECKSUM;          /* checksum v1 */

        jbd2_journal_clear_features(sbi->s_journal,
                        JBD2_FEATURE_COMPAT_CHECKSUM, 0,
                        JBD2_FEATURE_INCOMPAT_CSUM_V3 |
                        JBD2_FEATURE_INCOMPAT_CSUM_V2);

        if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
                return jbd2_journal_set_features(sbi->s_journal,
                                compat, 0,
                                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
                                incompat);

        if (test_opt(sb, JOURNAL_CHECKSUM))
                ret = jbd2_journal_set_features(sbi->s_journal,
                                compat, 0,
                                incompat);

        /* Async commit was not requested: make sure its flag is off. */
        jbd2_journal_clear_features(sbi->s_journal, 0, 0,
                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);

        return ret;
}

/*
 * Note: calculating the overhead so we can be compatible with
 * historical BSD practice is quite difficult in the face of
 * clusters/bigalloc.  This is because multiple metadata blocks from
 * different block group can end up in the same allocation cluster.
 * Calculating the exact overhead in the face of clustered allocation
 * requires either O(all block bitmaps) in memory or O(number of block
 * groups**2) in time.  We will still calculate the superblock for
 * older file systems --- and if we come across with a bigalloc file
 * system with zero in s_overhead_clusters the estimate will be close to
 * correct especially for very large cluster sizes --- but for newer
 * file systems, it's better to calculate this figure once at mkfs
 * time, and store it in the superblock.  If the superblock value is
 * present (even for non-bigalloc file systems), we will use it.
 *
 * count_overhead() returns the overhead of block group @grp.  @buf is a
 * scratch bitmap in which overhead clusters are marked.  The only caller
 * in this file passes a zeroed page and clears it again whenever a
 * non-zero count comes back.
 */
static int count_overhead(struct super_block *sb, ext4_group_t grp,
                          char *buf)
{
        struct ext4_sb_info        *sbi = EXT4_SB(sb);
        struct ext4_group_desc        *gdp;
        ext4_fsblk_t                first_block, last_block, b;
        ext4_group_t                i, ngroups = ext4_get_groups_count(sb);
        int                        s, j, count = 0;
        int                        has_super = ext4_bg_has_super(sb, grp);

        /*
         * Without bigalloc the overhead is a plain sum that never touches
         * @buf.  NOTE(review): the trailing "+ 2" presumably stands for
         * the block bitmap and the inode bitmap; confirm.
         */
        if (!ext4_has_feature_bigalloc(sb))
                return (has_super + ext4_bg_num_gdb(sb, grp) +
                        (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
                        sbi->s_itb_per_group + 2);

        /* Block range [first_block, last_block] covered by group @grp. */
        first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
                (grp * EXT4_BLOCKS_PER_GROUP(sb));
        last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
        /*
         * Metadata of any group may live inside @grp, so every group's
         * bitmaps and inode table are checked against the range above.
         */
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                b = ext4_block_bitmap(sb, gdp);
                if (b >= first_block && b <= last_block) {
                        ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
                        count++;
                }
                b = ext4_inode_bitmap(sb, gdp);
                if (b >= first_block && b <= last_block) {
                        ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
                        count++;
                }
                b = ext4_inode_table(sb, gdp);
                /* The inode table is marked only if this range check passes. */
                if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
                        for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
                                int c = EXT4_B2C(sbi, b - first_block);
                                ext4_set_bit(c, buf);
                                count++;
                        }
                /* Superblock and descriptor blocks are added for @grp only. */
                if (i != grp)
                        continue;
                s = 0;
                if (ext4_bg_has_super(sb, grp)) {
                        ext4_set_bit(s++, buf);
                        count++;
                }
                j = ext4_bg_num_gdb(sb, grp);
                /* Clamp a bogus descriptor block count to the group size. */
                if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
                        ext4_error(sb, "Invalid number of block group "
                                   "descriptor blocks: %d", j);
                        j = EXT4_BLOCKS_PER_GROUP(sb) - s;
                }
                count += j;
                for (; j > 0; j--)
                        ext4_set_bit(EXT4_B2C(sbi, s++), buf);
        }
        /* Nothing was marked in @buf, so there is nothing to count. */
        if (!count)
                return 0;
        /* Overhead = clusters in the group minus the bits left clear in @buf. */
        return EXT4_CLUSTERS_PER_GROUP(sb) -
                ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
}

/*
 * Compute the overhead and stash it in sbi->s_overhead
 *
 * The overhead is everything before s_first_data_block, plus the
 * per-group overhead reported by count_overhead(), plus the internal
 * journal.  Returns 0 on success or -ENOMEM if the scratch page cannot
 * be allocated.
 */
int ext4_calculate_overhead(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        unsigned int j_inum = le32_to_cpu(es->s_journal_inum);
        ext4_group_t grp, ngroups = ext4_get_groups_count(sb);
        ext4_fsblk_t overhead;
        char *buf;

        buf = (char *) get_zeroed_page(GFP_NOFS);
        if (!buf)
                return -ENOMEM;

        /*
         * Compute the overhead (FS structures).  This is constant
         * for a given filesystem unless the number of block groups
         * changes so we cache the previous value until it does.
         */

        /* All of the blocks before first_data_block are overhead */
        overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));

        /* Add the overhead found in each block group */
        for (grp = 0; grp < ngroups; grp++) {
                int blks = count_overhead(sb, grp, buf);

                overhead += blks;
                /* Wipe the scratch bitmap before the next group reuses it. */
                if (blks)
                        memset(buf, 0, PAGE_SIZE);
                cond_resched();
        }

        /*
         * Add the internal journal blocks whether the journal has been
         * loaded or not
         */
        if (sbi->s_journal && !sbi->s_journal_bdev_file) {
                overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
        } else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
                /* j_inum for internal journal is non-zero */
                struct inode *j_inode = ext4_get_journal_inode(sb, j_inum);

                if (IS_ERR(j_inode)) {
                        ext4_msg(sb, KERN_ERR, "can't get journal size");
                } else {
                        unsigned int j_blocks =
                                j_inode->i_size >> sb->s_blocksize_bits;

                        overhead += EXT4_NUM_B2C(sbi, j_blocks);
                        iput(j_inode);
                }
        }

        sbi->s_overhead = overhead;
        smp_wmb();
        free_page((unsigned long) buf);
        return 0;
}

/*
 * Initialise sbi->s_resv_clusters, the number of clusters held back for
 * operations that must not run out of space.  Left untouched when the
 * filesystem does not use extents.
 */
static void ext4_set_resv_clusters(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t resv;

        /*
         * There's no need to reserve anything when we aren't using extents.
         * The space estimates are exact, there are no unwritten extents,
         * hole punching doesn't need new metadata... This is needed especially
         * to keep ext2/3 backward compatibility.
         */
        if (!ext4_has_feature_extents(sb))
                return;

        /*
         * By default we reserve 2% or 4096 clusters, whichever is smaller.
         * This should cover the situations where we can not afford to run
         * out of space like for example punch hole, or converting
         * unwritten extents in delalloc path. In most cases such
         * allocation would require 1, or 2 blocks, higher numbers are
         * very rare.
         */
        resv = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
        do_div(resv, 50);
        if (resv > 4096)
                resv = 4096;

        atomic64_set(&sbi->s_resv_clusters, resv);
}

/*
 * Return a short string naming the quota mode of @sb: "none",
 * "journalled" or "writeback" when CONFIG_QUOTA is set, "disabled"
 * otherwise.
 */
static const char *ext4_quota_mode(struct super_block *sb)
{
#ifdef CONFIG_QUOTA
        if (!ext4_quota_capable(sb))
                return "none";

        return (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) ?
                "journalled" : "writeback";
#else
        return "disabled";
#endif
}

/*
 * Fill in the journal trigger slot @type of @sb: record the owning
 * superblock and install @trigger as the slot's t_frozen callback.
 */
static void ext4_setup_csum_trigger(struct super_block *sb,
                                    enum ext4_journal_trigger_type type,
                                    void (*trigger)(
                                        struct jbd2_buffer_trigger_type *type,
                                        struct buffer_head *bh,
                                        void *mapped_data,
                                        size_t size))
{
        EXT4_SB(sb)->s_journal_triggers[type].sb = sb;
        EXT4_SB(sb)->s_journal_triggers[type].tr_triggers.t_frozen = trigger;
}

/*
 * Release @sbi together with its blockgroup lock and its DAX device
 * reference.  A NULL @sbi is a no-op.
 */
static void ext4_free_sbi(struct ext4_sb_info *sbi)
{
        if (sbi) {
                kfree(sbi->s_blockgroup_lock);
                fs_put_dax(sbi->s_daxdev, NULL);
                kfree(sbi);
        }
}

/*
 * Allocate a zeroed ext4_sb_info for @sb, look up its DAX device,
 * allocate the blockgroup lock, and link the two structures through
 * sb->s_fs_info and sbi->s_sb.
 *
 * Returns the new sbi, or NULL if an allocation fails; in that case
 * nothing stays allocated and @sb is left unmodified.
 */
static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb)
{
        struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);

        if (sbi == NULL)
                return NULL;

        sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
                                           NULL, NULL);

        sbi->s_blockgroup_lock =
                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
        if (sbi->s_blockgroup_lock == NULL) {
                /* Undo the DAX lookup before dropping the sbi itself. */
                fs_put_dax(sbi->s_daxdev, NULL);
                kfree(sbi);
                return NULL;
        }

        sb->s_fs_info = sbi;
        sbi->s_sb = sb;

        return sbi;
}

static void ext4_set_def_opts(struct super_block *sb,
                              struct ext4_super_block *es)
{
        unsigned long def_mount_opts;

        /* Set defaults before we parse the mount options */
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
        set_opt(sb, INIT_INODE_TABLE);
        if (def_mount_opts & EXT4_DEFM_DEBUG)
                set_opt(sb, DEBUG);
        if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
                set_opt(sb, GRPID);
        if (def_mount_opts & EXT4_DEFM_UID16)
                set_opt(sb, NO_UID32);
        /* xattr user namespace & acls are now defaulted on */
        set_opt(sb, XATTR_USER);
#ifdef CONFIG_EXT4_FS_POSIX_ACL
        set_opt(sb, POSIX_ACL);
#endif
        if (ext4_has_feature_fast_commit(sb))
                set_opt2(sb, JOURNAL_FAST_COMMIT);
        /* don't forget to enable journal_csum when metadata_csum is enabled. */
        if (ext4_has_metadata_csum(sb))
                set_opt(sb, JOURNAL_CHECKSUM);

        if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
                set_opt(sb, JOURNAL_DATA);
        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
                set_opt(sb, ORDERED_DATA);
        else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
                set_opt(sb, WRITEBACK_DATA);

        if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC)
                set_opt(sb, ERRORS_PANIC);
        else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE)
                set_opt(sb, ERRORS_CONT);
        else
                set_opt(sb, ERRORS_RO);
        /* block_validity enabled by default; disable with noblock_validity */
        set_opt(sb, BLOCK_VALIDITY);
        if (def_mount_opts & EXT4_DEFM_DISCARD)
                set_opt(sb, DISCARD);

        if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
                set_opt(sb, BARRIER);

        /*
         * enable delayed allocation by default
         * Use -o nodelalloc to turn it off
         */
        if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
            ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
                set_opt(sb, DELALLOC);

        if (sb->s_blocksize <= PAGE_SIZE)
                set_opt(sb, DIOREAD_NOLOCK);
}

static int ext4_handle_clustersize(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        int clustersize;

        /* Handle clustersize */
        clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
        if (ext4_has_feature_bigalloc(sb)) {
                if (clustersize < sb->s_blocksize) {
                        ext4_msg(sb, KERN_ERR,
                                 "cluster size (%d) smaller than "
                                 "block size (%lu)", clustersize, sb->s_blocksize);
                        return -EINVAL;
                }
                sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
                        le32_to_cpu(es->s_log_block_size);
        } else {
                if (clustersize != sb->s_blocksize) {
                        ext4_msg(sb, KERN_ERR,
                                 "fragment/cluster size (%d) != "
                                 "block size (%lu)", clustersize, sb->s_blocksize);
                        return -EINVAL;
                }
                if (sbi->s_blocks_per_group > sb->s_blocksize * 8) {
                        ext4_msg(sb, KERN_ERR,
                                 "#blocks per group too big: %lu",
                                 sbi->s_blocks_per_group);
                        return -EINVAL;
                }
                sbi->s_cluster_bits = 0;
        }
        sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group);
        if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
                ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu",
                         sbi->s_clusters_per_group);
                return -EINVAL;
        }
        if (sbi->s_blocks_per_group !=
            (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
                ext4_msg(sb, KERN_ERR,
                         "blocks per group (%lu) and clusters per group (%lu) inconsistent",
                         sbi->s_blocks_per_group, sbi->s_clusters_per_group);
                return -EINVAL;
        }
        sbi->s_cluster_ratio = clustersize / sb->s_blocksize;

        /* Do we have standard group size of clustersize * 8 blocks ? */
        if (sbi->s_blocks_per_group == clustersize << 3)
                set_opt2(sb, STD_GROUP_SIZE);

        return 0;
}

static void ext4_fast_commit_init(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* Initialize fast commit stuff */
        atomic_set(&sbi->s_fc_subtid, 0);
        INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
        INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
        INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
        INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
        sbi->s_fc_bytes = 0;
        ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        sbi->s_fc_ineligible_tid = 0;
        spin_lock_init(&sbi->s_fc_lock);
        memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
        sbi->s_fc_replay_state.fc_regions = NULL;
        sbi->s_fc_replay_state.fc_regions_size = 0;
        sbi->s_fc_replay_state.fc_regions_used = 0;
        sbi->s_fc_replay_state.fc_regions_valid = 0;
        sbi->s_fc_replay_state.fc_modified_inodes = NULL;
        sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
        sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
}

static int ext4_inode_info_init(struct super_block *sb,
                                struct ext4_super_block *es)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
                sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
                sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
        } else {
                sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
                sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
                if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
                        ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
                                 sbi->s_first_ino);
                        return -EINVAL;
                }
                if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
                    (!is_power_of_2(sbi->s_inode_size)) ||
                    (sbi->s_inode_size > sb->s_blocksize)) {
                        ext4_msg(sb, KERN_ERR,
                               "unsupported inode size: %d",
                               sbi->s_inode_size);
                        ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize);
                        return -EINVAL;
                }
                /*
                 * i_atime_extra is the last extra field available for
                 * [acm]times in struct ext4_inode. Checking for that
                 * field should suffice to ensure we have extra space
                 * for all three.
                 */
                if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
                        sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
                        sb->s_time_gran = 1;
                        sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
                } else {
                        sb->s_time_gran = NSEC_PER_SEC;
                        sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
                }
                sb->s_time_min = EXT4_TIMESTAMP_MIN;
        }

        if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
                sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
                        EXT4_GOOD_OLD_INODE_SIZE;
                if (ext4_has_feature_extra_isize(sb)) {
                        unsigned v, max = (sbi->s_inode_size -
                                           EXT4_GOOD_OLD_INODE_SIZE);

                        v = le16_to_cpu(es->s_want_extra_isize);
                        if (v > max) {
                                ext4_msg(sb, KERN_ERR,
                                         "bad s_want_extra_isize: %d", v);
                                return -EINVAL;
                        }
                        if (sbi->s_want_extra_isize < v)
                                sbi->s_want_extra_isize = v;

                        v = le16_to_cpu(es->s_min_extra_isize);
                        if (v > max) {
                                ext4_msg(sb, KERN_ERR,
                                         "bad s_min_extra_isize: %d", v);
                                return -EINVAL;
                        }
                        if (sbi->s_want_extra_isize < v)
                                sbi->s_want_extra_isize = v;
                }
        }

        return 0;
}

#if IS_ENABLED(CONFIG_UNICODE)
/*
 * Attach the unicode map named by the superblock to @sb when the casefold
 * feature is enabled and no encoding has been attached yet.
 *
 * Returns 0 on success (or when there is nothing to do), -EINVAL when the
 * superblock names an unknown encoding or the map cannot be loaded.
 */
static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
{
        const struct ext4_sb_encodings *info;
        struct unicode_map *map;
        __u16 flags = le16_to_cpu(es->s_encoding_flags);

        if (!ext4_has_feature_casefold(sb) || sb->s_encoding)
                return 0;

        info = ext4_sb_read_encoding(es);
        if (!info) {
                ext4_msg(sb, KERN_ERR,
                         "Encoding requested by superblock is unknown");
                return -EINVAL;
        }

        map = utf8_load(info->version);
        if (IS_ERR(map)) {
                ext4_msg(sb, KERN_ERR,
                         "can't mount with superblock charset: %s-%u.%u.%u "
                         "not supported by the kernel. flags: 0x%x.",
                         info->name,
                         unicode_major(info->version),
                         unicode_minor(info->version),
                         unicode_rev(info->version),
                         flags);
                return -EINVAL;
        }

        ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
                 "%s-%u.%u.%u with flags 0x%hx", info->name,
                 unicode_major(info->version),
                 unicode_minor(info->version),
                 unicode_rev(info->version),
                 flags);

        /* Publish the map and its flags on the VFS superblock. */
        sb->s_encoding = map;
        sb->s_encoding_flags = flags;

        return 0;
}
#else
/* Without CONFIG_UNICODE there is no encoding to set up. */
static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es)
{
        return 0;
}
#endif

/*
 * Set up metadata checksumming for the mount: verify the checksum type,
 * register the orphan-file journal trigger, load the crc32c driver,
 * verify the superblock checksum and precompute the checksum seed.
 *
 * Returns 0 on success, -EINVAL for an unknown checksum algorithm, the
 * crypto_alloc_shash() error if the driver cannot be loaded, or -EFSBADCRC
 * when the superblock checksum does not verify.
 */
static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int err;

        /* Both checksum features at once is legal but redundant; just warn. */
        if (ext4_has_feature_metadata_csum(sb) &&
            ext4_has_feature_gdt_csum(sb))
                ext4_warning(sb, "metadata_csum and uninit_bg are "
                             "redundant flags; please run fsck.");

        /* Refuse checksum algorithms we do not know about. */
        if (!ext4_verify_csum_type(sb, es)) {
                ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
                         "unknown checksum algorithm.");
                return -EINVAL;
        }
        ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
                                ext4_orphan_file_block_trigger);

        /* The checksum driver must be loaded before anything is verified. */
        sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
        if (IS_ERR(sbi->s_chksum_driver)) {
                err = PTR_ERR(sbi->s_chksum_driver);
                ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
                sbi->s_chksum_driver = NULL;
                return err;
        }

        if (!ext4_superblock_csum_verify(sb, es)) {
                ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
                         "invalid superblock checksum.  Run e2fsck?");
                return -EFSBADCRC;
        }

        /*
         * Precompute the seed for all metadata checksums: an explicit
         * on-disk seed wins, otherwise it is derived from the UUID when
         * something will actually use it.
         */
        if (ext4_has_feature_csum_seed(sb)) {
                sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
                return 0;
        }
        if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
                sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
                                               sizeof(es->s_uuid));
        return 0;
}

/*
 * Check that the feature flags in the superblock can be supported by this
 * mount: Hurd restrictions, ext2/ext3 personality mounts, the general ext4
 * feature set, DAX prerequisites and the encryption level.
 *
 * @sb:     superblock being mounted
 * @es:     on-disk superblock
 * @silent: non-zero when probing; suppresses the "couldn't mount as
 *          ext2/ext3" message if the filesystem looks mountable as ext4
 *
 * Side effects: may set the HURD_COMPAT mount option and the
 * EXT4_FLAGS_BDEV_IS_DAX flag.
 *
 * Returns 0 when the filesystem may be mounted, -EINVAL otherwise.
 */
static int ext4_check_feature_compatibility(struct super_block *sb,
                                            struct ext4_super_block *es,
                                            int silent)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* Feature flags on a rev 0 filesystem are suspicious but not fatal. */
        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
            (ext4_has_compat_features(sb) ||
             ext4_has_ro_compat_features(sb) ||
             ext4_has_incompat_features(sb)))
                ext4_msg(sb, KERN_WARNING,
                       "feature flags set on rev 0 fs, "
                       "running e2fsck is recommended");

        if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
                set_opt2(sb, HURD_COMPAT);
                if (ext4_has_feature_64bit(sb)) {
                        ext4_msg(sb, KERN_ERR,
                                 "The Hurd can't support 64-bit file systems");
                        return -EINVAL;
                }

                /*
                 * ea_inode feature uses l_i_version field which is not
                 * available in HURD_COMPAT mode.
                 */
                if (ext4_has_feature_ea_inode(sb)) {
                        ext4_msg(sb, KERN_ERR,
                                 "ea_inode feature is not supported for Hurd");
                        return -EINVAL;
                }
        }

        if (IS_EXT2_SB(sb)) {
                if (ext2_feature_set_ok(sb))
                        ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
                                 "using the ext4 subsystem");
                else {
                        /*
                         * If we're probing be silent, if this looks like
                         * it's actually an ext[34] filesystem.
                         */
                        if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
                                return -EINVAL;
                        ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
                                 "to feature incompatibilities");
                        return -EINVAL;
                }
        }

        if (IS_EXT3_SB(sb)) {
                if (ext3_feature_set_ok(sb))
                        ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
                                 "using the ext4 subsystem");
                else {
                        /*
                         * If we're probing be silent, if this looks like
                         * it's actually an ext4 filesystem.
                         */
                        if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
                                return -EINVAL;
                        ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
                                 "to feature incompatibilities");
                        return -EINVAL;
                }
        }

        /*
         * Check feature flags regardless of the revision level, since we
         * previously didn't change the revision level when setting the flags,
         * so there is a chance incompat flags are set on a rev 0 filesystem.
         */
        if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
                return -EINVAL;

        /*
         * A DAX-capable device is only usable when the block size equals
         * the page size.  A mismatch is logged but is not fatal here; it
         * only becomes an error below if DAX was explicitly requested.
         */
        if (sbi->s_daxdev) {
                if (sb->s_blocksize == PAGE_SIZE)
                        set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
                else
                        ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX");
        }

        if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
                if (ext4_has_feature_inline_data(sb)) {
                        ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
                                        " that may contain inline data");
                        return -EINVAL;
                }
                if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
                        ext4_msg(sb, KERN_ERR,
                                "DAX unsupported by block device.");
                        return -EINVAL;
                }
        }

        /* Only encryption level 0 is accepted. */
        if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
                ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
                         es->s_encryption_level);
                return -EINVAL;
        }

        return 0;
}

static int ext4_check_geometry(struct super_block *sb,
                               struct ext4_super_block *es)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        __u64 blocks_count;
        int err;

        if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) {
                ext4_msg(sb, KERN_ERR,
                         "Number of reserved GDT blocks insanely large: %d",
                         le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
                return -EINVAL;
        }
        /*
         * Test whether we have more sectors than will fit in sector_t,
         * and whether the max offset is addressable by the page cache.
         */
        err = generic_check_addressable(sb->s_blocksize_bits,
                                        ext4_blocks_count(es));
        if (err) {
                ext4_msg(sb, KERN_ERR, "filesystem"
                         " too large to mount safely on this system");
                return err;
        }

        /* check blocks count against device size */
        blocks_count = sb_bdev_nr_blocks(sb);
        if (blocks_count && ext4_blocks_count(es) > blocks_count) {
                ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
                       "exceeds size of device (%llu blocks)",
                       ext4_blocks_count(es), blocks_count);
                return -EINVAL;
        }

        /*
         * It makes no sense for the first data block to be beyond the end
         * of the filesystem.
         */
        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
                ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
                         "block %u is beyond end of filesystem (%llu)",
                         le32_to_cpu(es->s_first_data_block),
                         ext4_blocks_count(es));
                return -EINVAL;
        }
        if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
            (sbi->s_cluster_ratio == 1)) {
                ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
                         "block is 0 with a 1k block and cluster size");
                return -EINVAL;
        }

        blocks_count = (ext4_blocks_count(es) -
                        le32_to_cpu(es->s_first_data_block) +
                        EXT4_BLOCKS_PER_GROUP(sb) - 1);
        do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
        if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
                ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
                       "(block count %llu, first data block %u, "
                       "blocks per group %lu)", blocks_count,
                       ext4_blocks_count(es),
                       le32_to_cpu(es->s_first_data_block),
                       EXT4_BLOCKS_PER_GROUP(sb));
                return -EINVAL;
        }
        sbi->s_groups_count = blocks_count;
        sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
                        (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
        if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
            le32_to_cpu(es->s_inodes_count)) {
                ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
                         le32_to_cpu(es->s_inodes_count),
                         ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
                return -EINVAL;
        }

        return 0;
}

/*
 * Allocate the array of group descriptor buffer heads and read every group
 * descriptor block into it, then validate the descriptors.
 *
 * @sb:               superblock being mounted
 * @es:               on-disk superblock (used for the meta_bg sanity check)
 * @logical_sb_block: block number of the superblock, passed through to
 *                    descriptor_loc() and ext4_check_descriptors()
 * @first_not_zeroed: passed through to ext4_check_descriptors()
 *
 * Returns 0 on success, -EINVAL if s_first_meta_bg is out of range,
 * -ENOMEM if the array cannot be allocated, the read error if a descriptor
 * block cannot be read, or -EFSCORRUPTED if the descriptors fail
 * validation.
 *
 * Nothing acquired here is released on the error paths; s_gdb_count is
 * kept equal to the number of buffers actually stored so that cleanup
 * (presumably in the caller -- not visible here) knows how many to drop.
 */
static int ext4_group_desc_init(struct super_block *sb,
                                struct ext4_super_block *es,
                                ext4_fsblk_t logical_sb_block,
                                ext4_group_t *first_not_zeroed)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned int db_count;
        ext4_fsblk_t block;
        int i;

        /* Number of descriptor blocks needed for all groups, rounded up. */
        db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
                   EXT4_DESC_PER_BLOCK(sb);
        if (ext4_has_feature_meta_bg(sb)) {
                if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
                        ext4_msg(sb, KERN_WARNING,
                                 "first meta block group too large: %u "
                                 "(group descriptor block count %u)",
                                 le32_to_cpu(es->s_first_meta_bg), db_count);
                        return -EINVAL;
                }
        }
        /* The array is published first and checked for NULL afterwards. */
        rcu_assign_pointer(sbi->s_group_desc,
                           kvmalloc_array(db_count,
                                          sizeof(struct buffer_head *),
                                          GFP_KERNEL));
        if (sbi->s_group_desc == NULL) {
                ext4_msg(sb, KERN_ERR, "not enough memory");
                return -ENOMEM;
        }

        bgl_lock_init(sbi->s_blockgroup_lock);

        /* Pre-read the descriptors into the buffer cache */
        for (i = 0; i < db_count; i++) {
                block = descriptor_loc(sb, logical_sb_block, i);
                ext4_sb_breadahead_unmovable(sb, block);
        }

        /* Second pass: read each descriptor block and store its buffer. */
        for (i = 0; i < db_count; i++) {
                struct buffer_head *bh;

                block = descriptor_loc(sb, logical_sb_block, i);
                bh = ext4_sb_bread_unmovable(sb, block);
                if (IS_ERR(bh)) {
                        ext4_msg(sb, KERN_ERR,
                               "can't read group descriptor %d", i);
                        /* Only the first i slots hold valid buffers. */
                        sbi->s_gdb_count = i;
                        return PTR_ERR(bh);
                }
                rcu_read_lock();
                rcu_dereference(sbi->s_group_desc)[i] = bh;
                rcu_read_unlock();
        }
        sbi->s_gdb_count = db_count;
        if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) {
                ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
                return -EFSCORRUPTED;
        }

        return 0;
}

/*
 * Load the journal and bring it in line with the filesystem features and
 * the requested data journaling mode.
 *
 * @sb:  superblock being mounted
 * @es:  on-disk superblock
 * @ctx: mount context; supplies journal_devnum and journal_ioprio
 *
 * Returns 0 on success or the ext4_load_journal() error if loading fails.
 * Every failure after the journal has been loaded goes through the "out"
 * label, which destroys the journal, clears sbi->s_journal and returns
 * -EINVAL.
 */
static int ext4_load_and_init_journal(struct super_block *sb,
                                      struct ext4_super_block *es,
                                      struct ext4_fs_context *ctx)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int err;

        /* Nothing to tear down yet if the load itself fails. */
        err = ext4_load_journal(sb, es, ctx->journal_devnum);
        if (err)
                return err;

        /* Propagate filesystem features to the journal. */
        if (ext4_has_feature_64bit(sb) &&
            !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                       JBD2_FEATURE_INCOMPAT_64BIT)) {
                ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
                goto out;
        }

        if (!set_journal_csum_feature_set(sb)) {
                ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
                         "feature set");
                goto out;
        }

        if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
                !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                          JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
                ext4_msg(sb, KERN_ERR,
                        "Failed to set fast commit journal feature");
                goto out;
        }

        /* We have now updated the journal if required, so we can
         * validate the data journaling mode. */
        switch (test_opt(sb, DATA_FLAGS)) {
        case 0:
                /* No mode set, assume a default based on the journal
                 * capabilities: ORDERED_DATA if the journal can
                 * cope, else JOURNAL_DATA
                 */
                if (jbd2_journal_check_available_features
                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
                        set_opt(sb, ORDERED_DATA);
                        sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
                } else {
                        set_opt(sb, JOURNAL_DATA);
                        sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
                }
                break;

        case EXT4_MOUNT_ORDERED_DATA:
        case EXT4_MOUNT_WRITEBACK_DATA:
                /* These modes are rejected when the journal lacks REVOKE. */
                if (!jbd2_journal_check_available_features
                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
                        ext4_msg(sb, KERN_ERR, "Journal does not support "
                               "requested data journaling mode");
                        goto out;
                }
                break;
        default:
                break;
        }

        /* Checked after the switch so a defaulted ORDERED_DATA is covered too. */
        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
            test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
                ext4_msg(sb, KERN_ERR, "can't mount with "
                        "journal_async_commit in data=ordered mode");
                goto out;
        }

        /* Apply the I/O priority from the mount context to the journal task. */
        set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);

        /* Hook ext4's inode data submit/finish callbacks into the journal. */
        sbi->s_journal->j_submit_inode_data_buffers =
                ext4_journal_submit_inode_data_buffers;
        sbi->s_journal->j_finish_inode_data_buffers =
                ext4_journal_finish_inode_data_buffers;

        return 0;

out:
        /* flush s_sb_upd_work before destroying the journal. */
        flush_work(&sbi->s_sb_upd_work);
        jbd2_journal_destroy(sbi->s_journal);
        sbi->s_journal = NULL;
        return -EINVAL;
}

/*
 * Reconcile the mount options with the data journaling mode.
 *
 * In data=journal mode dioread_nolock, fast commit and (implicit) delayed
 * allocation are switched off, and explicit delalloc or dax requests are
 * rejected.  In every other mode the superblock is flagged SB_I_CGROUPWB.
 *
 * Returns 0 on success, -EINVAL for an incompatible option combination.
 */
static int ext4_check_journal_data_mode(struct super_block *sb)
{
        if (test_opt(sb, DATA_FLAGS) != EXT4_MOUNT_JOURNAL_DATA) {
                sb->s_iflags |= SB_I_CGROUPWB;
                return 0;
        }

        printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with "
                    "data=journal disables delayed allocation, "
                    "dioread_nolock, O_DIRECT and fast_commit support!\n");

        /* can't mount with both data=journal and dioread_nolock. */
        clear_opt(sb, DIOREAD_NOLOCK);
        clear_opt2(sb, JOURNAL_FAST_COMMIT);

        /* Explicitly requested options cannot be dropped silently. */
        if (test_opt2(sb, EXPLICIT_DELALLOC)) {
                ext4_msg(sb, KERN_ERR, "can't mount with "
                         "both data=journal and delalloc");
                return -EINVAL;
        }
        if (test_opt(sb, DAX_ALWAYS)) {
                ext4_msg(sb, KERN_ERR, "can't mount with "
                         "both data=journal and dax");
                return -EINVAL;
        }

        if (ext4_has_feature_encrypt(sb)) {
                ext4_msg(sb, KERN_WARNING,
                         "encrypted files will use data=ordered "
                         "instead of data journaling mode");
        }

        /* Delalloc that was not explicitly requested is simply switched off. */
        if (test_opt(sb, DELALLOC))
                clear_opt(sb, DELALLOC);

        return 0;
}

/*
 * Read the on-disk superblock, validate its magic and block/cluster size
 * exponents, and switch the block device to the filesystem's real block
 * size (re-reading the superblock if that differs from the initial guess).
 *
 * @sb:     superblock being mounted
 * @lsb:    out: block number (in units of the final block size) that holds
 *          the superblock; only written on success
 * @silent: non-zero suppresses the "Can't find ext4 filesystem" message
 *
 * On success returns 0 with sbi->s_sbh holding the superblock buffer and
 * sbi->s_es pointing at the superblock inside it.  On failure returns
 * -EINVAL or the read error, with the buffer released.
 *
 * NOTE(review): on the failure paths sbi->s_es may be left pointing into a
 * buffer that has already been released -- confirm callers do not use it
 * after an error.
 */
static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
                           int silent)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es;
        ext4_fsblk_t logical_sb_block;
        unsigned long offset = 0;
        struct buffer_head *bh;
        int ret = -EINVAL;
        int blocksize;

        /* Initial guess at the block size; the real one is read from disk. */
        blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
        if (!blocksize) {
                ext4_msg(sb, KERN_ERR, "unable to set blocksize");
                return -EINVAL;
        }

        /*
         * The ext4 superblock will not be buffer aligned for other than 1kB
         * block sizes.  We need to calculate the offset from buffer start.
         */
        if (blocksize != EXT4_MIN_BLOCK_SIZE) {
                logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
                /* do_div() leaves the quotient in place and returns the remainder. */
                offset = do_div(logical_sb_block, blocksize);
        } else {
                logical_sb_block = sbi->s_sb_block;
        }

        bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
        if (IS_ERR(bh)) {
                ext4_msg(sb, KERN_ERR, "unable to read superblock");
                return PTR_ERR(bh);
        }
        /*
         * Note: s_es must be initialized as soon as possible because
         *       some ext4 macro-instructions depend on its value
         */
        es = (struct ext4_super_block *) (bh->b_data + offset);
        sbi->s_es = es;
        sb->s_magic = le16_to_cpu(es->s_magic);
        if (sb->s_magic != EXT4_SUPER_MAGIC) {
                if (!silent)
                        ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
                goto out;
        }

        /* Bound both size exponents before they are used as shift counts. */
        if (le32_to_cpu(es->s_log_block_size) >
            (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
                ext4_msg(sb, KERN_ERR,
                         "Invalid log block size: %u",
                         le32_to_cpu(es->s_log_block_size));
                goto out;
        }
        if (le32_to_cpu(es->s_log_cluster_size) >
            (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
                ext4_msg(sb, KERN_ERR,
                         "Invalid log cluster size: %u",
                         le32_to_cpu(es->s_log_cluster_size));
                goto out;
        }

        blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);

        /*
         * If the default block size is not the same as the real block size,
         * we need to reload it.
         */
        if (sb->s_blocksize == blocksize) {
                /* The guess was right: keep the buffer we already have. */
                *lsb = logical_sb_block;
                sbi->s_sbh = bh;
                return 0;
        }

        /*
         * bh must be released before kill_bdev(), otherwise
         * it won't be freed and its page also. kill_bdev()
         * is called by sb_set_blocksize().
         */
        brelse(bh);
        /* Validate the filesystem blocksize */
        if (!sb_set_blocksize(sb, blocksize)) {
                ext4_msg(sb, KERN_ERR, "bad block size %d",
                                blocksize);
                /* bh was already released above; avoid a second brelse(). */
                bh = NULL;
                goto out;
        }

        /* Recompute the superblock location in units of the new block size. */
        logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE;
        offset = do_div(logical_sb_block, blocksize);
        bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
        if (IS_ERR(bh)) {
                ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try");
                ret = PTR_ERR(bh);
                /* Do not hand an error pointer to brelse(). */
                bh = NULL;
                goto out;
        }
        es = (struct ext4_super_block *)(bh->b_data + offset);
        sbi->s_es = es;
        if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
                ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!");
                goto out;
        }
        *lsb = logical_sb_block;
        sbi->s_sbh = bh;
        return 0;
out:
        /* bh is either a live buffer or NULL at this point. */
        brelse(bh);
        return ret;
}

/*
 * Copy the directory hash seed and default hash version from the on-disk
 * superblock and settle the signedness of the directory hash.
 *
 * When dir_index is enabled and the superblock records neither the signed
 * nor the unsigned hash flag, the flag matching this build's char
 * signedness is chosen and, on a read-write mount, stored in the
 * in-memory copy of the superblock's s_flags.
 */
static void ext4_hash_info_init(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        unsigned int idx, flags;

        for (idx = 0; idx < 4; idx++)
                sbi->s_hash_seed[idx] = le32_to_cpu(es->s_hash_seed[idx]);

        sbi->s_def_hash_version = es->s_def_hash_version;

        if (!ext4_has_feature_dir_index(sb))
                return;

        flags = le32_to_cpu(es->s_flags);
        if (flags & EXT2_FLAGS_UNSIGNED_HASH) {
                sbi->s_hash_unsigned = 3;
                return;
        }
        /* Signed hash already recorded: nothing to change. */
        if (flags & EXT2_FLAGS_SIGNED_HASH)
                return;

        /* Neither flag is set: pick the one matching this build. */
#ifdef __CHAR_UNSIGNED__
        if (!sb_rdonly(sb))
                es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
        sbi->s_hash_unsigned = 3;
#else
        if (!sb_rdonly(sb))
                es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
#endif
}

/*
 * Initialise the per-group metadata parameters in the in-memory superblock
 * info from the on-disk superblock: maximum file sizes, descriptor size,
 * blocks/inodes per group, inodes and descriptors per block, inode table
 * blocks per group and the mount state.
 *
 * @sb:     superblock being mounted; sbi->s_es must already be set
 * @silent: non-zero suppresses the "Can't find ext4 filesystem" message
 *
 * Returns 0 on success, -EINVAL when the descriptor size or the per-group
 * counts are invalid.
 */
static int ext4_block_group_meta_init(struct super_block *sb, int silent)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        int has_huge_files;

        has_huge_files = ext4_has_feature_huge_file(sb);
        sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
                                                      has_huge_files);
        sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);

        /* The on-disk descriptor size only applies with the 64bit feature. */
        sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
        if (ext4_has_feature_64bit(sb)) {
                if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
                    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
                    !is_power_of_2(sbi->s_desc_size)) {
                        ext4_msg(sb, KERN_ERR,
                               "unsupported descriptor size %lu",
                               sbi->s_desc_size);
                        return -EINVAL;
                }
        } else
                sbi->s_desc_size = EXT4_MIN_DESC_SIZE;

        sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
        sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);

        sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb);
        /* Both values are used as divisors later on; zero means garbage. */
        if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) {
                if (!silent)
                        ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
                return -EINVAL;
        }
        /*
         * A group needs at least one block's worth of inodes and no more
         * than blocksize * 8 of them.
         */
        if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
            sbi->s_inodes_per_group > sb->s_blocksize * 8) {
                ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu",
                         sbi->s_inodes_per_group);
                return -EINVAL;
        }
        sbi->s_itb_per_group = sbi->s_inodes_per_group /
                                        sbi->s_inodes_per_block;
        sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb);
        /* The fast-commit replay bit is masked out of the recorded state. */
        sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
        sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
        sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));

        return 0;
}

/*
 * Do the actual work of filling in an ext4 super_block at mount time.
 *
 * @fc: filesystem context; fc->fs_private carries the parsed ext4 mount
 *      options (struct ext4_fs_context) and fc->sb_flags the SB_* flags.
 * @sb: super_block being set up; EXT4_SB(sb) must already be allocated
 *      (ext4_fill_super() does this before calling here).
 *
 * The setup steps run in a fixed order and every failure jumps to the
 * failed_mount* / out_fail label that unwinds exactly what has been set up
 * so far, so the order of the labels at the bottom mirrors (in reverse) the
 * order of the initialisation above them.
 *
 * Returns 0 on success.  On failure returns the error in 'err', with
 * sb->s_fs_info cleared; the ext4_sb_info itself is NOT freed here (the
 * caller, ext4_fill_super(), frees it).
 */
static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
{
        struct ext4_super_block *es = NULL;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t logical_sb_block;
        struct inode *root;
        int needs_recovery;
        int err;
        ext4_group_t first_not_zeroed;
        struct ext4_fs_context *ctx = fc->fs_private;
        int silent = fc->sb_flags & SB_SILENT;

        /* Set defaults for the variables that will be set during parsing */
        if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
                ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;

        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
        sbi->s_sectors_written_start =
                part_stat_read(sb->s_bdev, sectors[STAT_WRITE]);

        err = ext4_load_super(sb, &logical_sb_block, silent);
        if (err)
                goto out_fail;

        es = sbi->s_es;
        sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);

        err = ext4_init_metadata_csum(sb, es);
        if (err)
                goto failed_mount;

        ext4_set_def_opts(sb, es);

        sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
        sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
        sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
        sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;

        /*
         * set default s_li_wait_mult for lazyinit, for the case there is
         * no mount option specified.
         */
        sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;

        err = ext4_inode_info_init(sb, es);
        if (err)
                goto failed_mount;

        err = parse_apply_sb_mount_options(sb, ctx);
        if (err < 0)
                goto failed_mount;

        /*
         * Snapshot the option bits as they stand before the options from
         * the fs_context are applied; the nojournal branch below compares
         * against this snapshot to detect an explicit data= option.
         */
        sbi->s_def_mount_opt = sbi->s_mount_opt;
        sbi->s_def_mount_opt2 = sbi->s_mount_opt2;

        err = ext4_check_opt_consistency(fc, sb);
        if (err < 0)
                goto failed_mount;

        ext4_apply_options(fc, sb);

        err = ext4_encoding_init(sb, es);
        if (err)
                goto failed_mount;

        err = ext4_check_journal_data_mode(sb);
        if (err)
                goto failed_mount;

        sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
                (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);

        /* i_version is always enabled now */
        sb->s_flags |= SB_I_VERSION;

        err = ext4_check_feature_compatibility(sb, es, silent);
        if (err)
                goto failed_mount;

        err = ext4_block_group_meta_init(sb, silent);
        if (err)
                goto failed_mount;

        ext4_hash_info_init(sb);

        err = ext4_handle_clustersize(sb);
        if (err)
                goto failed_mount;

        err = ext4_check_geometry(sb, es);
        if (err)
                goto failed_mount;

        /*
         * From here on the error timer, error lock and superblock update
         * work exist, so failures must go through failed_mount3 (or later),
         * which flushes the work and deletes the timer.
         */
        timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
        spin_lock_init(&sbi->s_error_lock);
        INIT_WORK(&sbi->s_sb_upd_work, update_super_work);

        err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
        if (err)
                goto failed_mount3;

        err = ext4_es_register_shrinker(sbi);
        if (err)
                goto failed_mount3;

        sbi->s_stripe = ext4_get_stripe_size(sbi);
        /*
         * It's hard to get stripe-aligned blocks if the stripe is not aligned
         * with the cluster size, so just disable the stripe and alert the
         * user. This simplifies the code and avoids stripe-aligned
         * allocation attempts that would rarely succeed.
         */
        if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 &&
            sbi->s_stripe % sbi->s_cluster_ratio != 0) {
                ext4_msg(sb, KERN_WARNING,
                         "stripe (%lu) is not aligned with cluster size (%u), "
                         "stripe is disabled",
                         sbi->s_stripe, sbi->s_cluster_ratio);
                sbi->s_stripe = 0;
        }
        sbi->s_extent_max_zeroout_kb = 32;

        /*
         * set up enough so that it can read an inode
         */
        sb->s_op = &ext4_sops;
        sb->s_export_op = &ext4_export_ops;
        sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_FS_ENCRYPTION
        sb->s_cop = &ext4_cryptops;
#endif
#ifdef CONFIG_FS_VERITY
        sb->s_vop = &ext4_verityops;
#endif
#ifdef CONFIG_QUOTA
        sb->dq_op = &ext4_quota_operations;
        if (ext4_has_feature_quota(sb))
                sb->s_qcop = &dquot_quotactl_sysfile_ops;
        else
                sb->s_qcop = &ext4_qctl_operations;
        sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
        super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid));
        super_set_sysfs_name_bdev(sb);

        INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
        mutex_init(&sbi->s_orphan_lock);

        ext4_fast_commit_init(sb);

        sb->s_root = NULL;

        /*
         * Remember whether any recovery work (orphans or journal replay) is
         * pending; used after orphan cleanup to mark recovery complete.
         */
        needs_recovery = (es->s_last_orphan != 0 ||
                          ext4_has_feature_orphan_present(sb) ||
                          ext4_has_feature_journal_needs_recovery(sb));

        if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) {
                err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block));
                if (err)
                        goto failed_mount3a;
        }

        /*
         * Default error code for the journal/option sanity checks below,
         * which jump to failed_mount3a without assigning err themselves.
         */
        err = -EINVAL;
        /*
         * The first inode we look at is the journal inode.  Don't try
         * root first: it may be modified in the journal!
         */
        if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
                err = ext4_load_and_init_journal(sb, es, ctx);
                if (err)
                        goto failed_mount3a;
        } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
                   ext4_has_feature_journal_needs_recovery(sb)) {
                ext4_msg(sb, KERN_ERR, "required journal recovery "
                       "suppressed and not mounted read-only");
                goto failed_mount3a;
        } else {
                /* Nojournal mode, all journal mount options are illegal */
                if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "journal_async_commit, fs mounted w/o journal");
                        goto failed_mount3a;
                }

                if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "journal_checksum, fs mounted w/o journal");
                        goto failed_mount3a;
                }
                if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "commit=%lu, fs mounted w/o journal",
                                 sbi->s_commit_interval / HZ);
                        goto failed_mount3a;
                }
                /* Any data-mode bit that differs from the snapshot taken above */
                if (EXT4_MOUNT_DATA_FLAGS &
                    (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "data=, fs mounted w/o journal");
                        goto failed_mount3a;
                }
                sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
                clear_opt(sb, JOURNAL_CHECKSUM);
                clear_opt(sb, DATA_FLAGS);
                clear_opt2(sb, JOURNAL_FAST_COMMIT);
                sbi->s_journal = NULL;
                needs_recovery = 0;
        }

        if (!test_opt(sb, NO_MBCACHE)) {
                sbi->s_ea_block_cache = ext4_xattr_create_cache();
                if (!sbi->s_ea_block_cache) {
                        ext4_msg(sb, KERN_ERR,
                                 "Failed to create ea_block_cache");
                        err = -EINVAL;
                        goto failed_mount_wq;
                }

                if (ext4_has_feature_ea_inode(sb)) {
                        sbi->s_ea_inode_cache = ext4_xattr_create_cache();
                        if (!sbi->s_ea_inode_cache) {
                                ext4_msg(sb, KERN_ERR,
                                         "Failed to create ea_inode_cache");
                                err = -EINVAL;
                                goto failed_mount_wq;
                        }
                }
        }

        /*
         * Get the # of file system overhead blocks from the
         * superblock if present.
         */
        sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
        /* ignore the precalculated value if it is ridiculous */
        if (sbi->s_overhead > ext4_blocks_count(es))
                sbi->s_overhead = 0;
        /*
         * If the bigalloc feature is not enabled recalculating the
         * overhead doesn't take long, so we might as well just redo
         * it to make sure we are using the correct value.
         */
        if (!ext4_has_feature_bigalloc(sb))
                sbi->s_overhead = 0;
        if (sbi->s_overhead == 0) {
                err = ext4_calculate_overhead(sb);
                if (err)
                        goto failed_mount_wq;
        }

        /*
         * The maximum number of concurrent works can be high and
         * concurrency isn't really necessary.  Limit it to 1.
         */
        EXT4_SB(sb)->rsv_conversion_wq =
                alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
        if (!EXT4_SB(sb)->rsv_conversion_wq) {
                printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
                err = -ENOMEM;
                goto failed_mount4;
        }

        /*
         * The jbd2_journal_load will have done any necessary log recovery,
         * so we can safely mount the rest of the filesystem now.
         */

        root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
        if (IS_ERR(root)) {
                ext4_msg(sb, KERN_ERR, "get root inode failed");
                err = PTR_ERR(root);
                root = NULL;
                goto failed_mount4;
        }
        if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
                ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
                iput(root);
                err = -EFSCORRUPTED;
                goto failed_mount4;
        }

        generic_set_sb_d_ops(sb);
        /*
         * No iput(root) on the failure path below.
         * NOTE(review): presumably d_make_root() drops the inode reference
         * itself when it fails - confirm against its documentation.
         */
        sb->s_root = d_make_root(root);
        if (!sb->s_root) {
                ext4_msg(sb, KERN_ERR, "get root dentry failed");
                err = -ENOMEM;
                goto failed_mount4;
        }

        err = ext4_setup_super(sb, es, sb_rdonly(sb));
        /* -EROFS is not fatal here: carry on with the mount forced read-only */
        if (err == -EROFS) {
                sb->s_flags |= SB_RDONLY;
        } else if (err)
                goto failed_mount4a;

        ext4_set_resv_clusters(sb);

        if (test_opt(sb, BLOCK_VALIDITY)) {
                err = ext4_setup_system_zone(sb);
                if (err) {
                        ext4_msg(sb, KERN_ERR, "failed to initialize system "
                                 "zone (%d)", err);
                        goto failed_mount4a;
                }
        }
        ext4_fc_replay_cleanup(sb);

        ext4_ext_init(sb);

        /*
         * Enable optimize_scan if number of groups is > threshold. This can be
         * turned off by passing "mb_optimize_scan=0". This can also be
         * turned on forcefully by passing "mb_optimize_scan=1".
         */
        if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) {
                if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
                        set_opt2(sb, MB_OPTIMIZE_SCAN);
                else
                        clear_opt2(sb, MB_OPTIMIZE_SCAN);
        }

        err = ext4_mb_init(sb);
        if (err) {
                ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
                         err);
                goto failed_mount5;
        }

        /*
         * We can only set up the journal commit callback once
         * mballoc is initialized
         */
        if (sbi->s_journal)
                sbi->s_journal->j_commit_callback =
                        ext4_journal_commit_callback;

        err = ext4_percpu_param_init(sbi);
        if (err)
                goto failed_mount6;

        if (ext4_has_feature_flex_bg(sb))
                if (!ext4_fill_flex_info(sb)) {
                        ext4_msg(sb, KERN_ERR,
                               "unable to initialize "
                               "flex_bg meta info!");
                        err = -ENOMEM;
                        goto failed_mount6;
                }

        err = ext4_register_li_request(sb, first_not_zeroed);
        if (err)
                goto failed_mount6;

        err = ext4_init_orphan_info(sb);
        if (err)
                goto failed_mount7;
#ifdef CONFIG_QUOTA
        /* Enable quota usage during mount. */
        if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
                err = ext4_enable_quotas(sb);
                if (err)
                        goto failed_mount8;
        }
#endif  /* CONFIG_QUOTA */

        /*
         * Save the original bdev mapping's wb_err value which could be
         * used to detect the metadata async write error.
         */
        spin_lock_init(&sbi->s_bdev_wb_lock);
        errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err,
                                 &sbi->s_bdev_wb_err);
        /* EXT4_ORPHAN_FS is set only for the duration of orphan cleanup */
        EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
        ext4_orphan_cleanup(sb, es);
        EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
        /*
         * Update the checksum after updating free space/inode counters and
         * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect
         * checksum in the buffer cache until it is written out and
         * e2fsprogs programs trying to open a file system immediately
         * after it is mounted can fail.
         */
        ext4_superblock_csum_set(sb);
        if (needs_recovery) {
                ext4_msg(sb, KERN_INFO, "recovery complete");
                err = ext4_mark_recovery_complete(sb, es);
                if (err)
                        goto failed_mount9;
        }

        if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
                ext4_msg(sb, KERN_WARNING,
                         "mounting with \"discard\" option, but the device does not support discard");

        if (es->s_error_count)
                mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */

        /* Enable message ratelimiting. Default is 10 messages per 5 secs. */
        ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
        ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
        ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
        atomic_set(&sbi->s_warning_count, 0);
        atomic_set(&sbi->s_msg_count, 0);

        /* Register sysfs after all initializations are complete. */
        err = ext4_register_sysfs(sb);
        if (err)
                goto failed_mount9;

        return 0;

        /*
         * Error unwinding.  The labels fall through into each other, so
         * jumping to one undoes that stage and every earlier one.
         */
failed_mount9:
        ext4_quotas_off(sb, EXT4_MAXQUOTAS);
failed_mount8: __maybe_unused
        ext4_release_orphan_info(sb);
failed_mount7:
        ext4_unregister_li_request(sb);
failed_mount6:
        ext4_mb_release(sb);
        ext4_flex_groups_free(sbi);
        ext4_percpu_param_destroy(sbi);
failed_mount5:
        ext4_ext_release(sb);
        ext4_release_system_zone(sb);
failed_mount4a:
        dput(sb->s_root);
        sb->s_root = NULL;
failed_mount4:
        ext4_msg(sb, KERN_ERR, "mount failed");
        /* May be NULL when the workqueue allocation itself was the failure */
        if (EXT4_SB(sb)->rsv_conversion_wq)
                destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
failed_mount_wq:
        ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
        sbi->s_ea_inode_cache = NULL;

        ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
        sbi->s_ea_block_cache = NULL;

        if (sbi->s_journal) {
                /* flush s_sb_upd_work before journal destroy. */
                flush_work(&sbi->s_sb_upd_work);
                jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
        }
failed_mount3a:
        ext4_es_unregister_shrinker(sbi);
failed_mount3:
        /* flush s_sb_upd_work before sbi destroy */
        flush_work(&sbi->s_sb_upd_work);
        del_timer_sync(&sbi->s_err_report);
        ext4_stop_mmpd(sbi);
        ext4_group_desc_free(sbi);
failed_mount:
        if (sbi->s_chksum_driver)
                crypto_free_shash(sbi->s_chksum_driver);

#if IS_ENABLED(CONFIG_UNICODE)
        utf8_unload(sb->s_encoding);
#endif

#ifdef CONFIG_QUOTA
        for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(get_qf_name(sb, sbi, i));
#endif
        fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
        brelse(sbi->s_sbh);
        if (sbi->s_journal_bdev_file) {
                invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
                bdev_fput(sbi->s_journal_bdev_file);
        }
out_fail:
        invalidate_bdev(sb->s_bdev);
        /* Only detach sbi from the super_block; the caller frees it */
        sb->s_fs_info = NULL;
        return err;
}

/*
 * fill_super callback handed to get_tree_bdev() by ext4_get_tree().
 *
 * Allocates the ext4_sb_info, picks the superblock location (block 1 unless
 * the mount options specified one), runs __ext4_fill_super() and, on
 * success, logs the "mounted filesystem" message and refreshes the stored
 * overhead value.
 *
 * Returns 0 on success, -ENOMEM if the sb_info cannot be allocated, or the
 * negative error returned by __ext4_fill_super() (in which case the sb_info
 * is freed and fc->s_fs_info is cleared).
 */
static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        struct ext4_sb_info *sbi = ext4_alloc_sbi(sb);
        const char *descr;
        int ret;

        if (!sbi)
                return -ENOMEM;

        fc->s_fs_info = sbi;

        /* Cleanup superblock name */
        strreplace(sb->s_id, '/', '!');

        /* Default super block location is block 1 unless overridden */
        sbi->s_sb_block = (ctx->spec & EXT4_SPEC_s_sb_block) ?
                                ctx->s_sb_block : 1;

        ret = __ext4_fill_super(fc, sb);
        if (ret < 0) {
                ext4_free_sbi(sbi);
                fc->s_fs_info = NULL;
                return ret;
        }

        /* Suffix appended to "with" in the mount message below */
        if (!sbi->s_journal) {
                descr = "out journal";
        } else {
                switch (test_opt(sb, DATA_FLAGS)) {
                case EXT4_MOUNT_JOURNAL_DATA:
                        descr = " journalled data mode";
                        break;
                case EXT4_MOUNT_ORDERED_DATA:
                        descr = " ordered data mode";
                        break;
                default:
                        descr = " writeback data mode";
                        break;
                }
        }

        if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
                ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. "
                         "Quota mode: %s.", &sb->s_uuid,
                         sb_rdonly(sb) ? "ro" : "r/w", descr,
                         ext4_quota_mode(sb));

        /* Update the s_overhead_clusters if necessary */
        ext4_update_overhead(sb, false);
        return 0;
}

/*
 * Build the superblock for a block-device backed ext4 mount by delegating
 * to get_tree_bdev() with ext4_fill_super() as the fill callback.  The
 * return value of get_tree_bdev() is passed through unchanged.
 * NOTE(review): presumably installed as the fs_context ->get_tree
 * operation - the ops table is not visible here.
 */
static int ext4_get_tree(struct fs_context *fc)
{
        return get_tree_bdev(fc, ext4_fill_super);
}

/*
 * Push the per-filesystem journal tunables held in the ext4_sb_info into
 * @journal: commit interval, min/max batch times, fast-commit setup and
 * the barrier / abort-on-data-error flags derived from the mount options.
 *
 * This is done on the initial mount, once the journal has been initialised
 * but before any recovery has been done, and again on any subsequent
 * remount.
 */
static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned long flags;

        journal->j_commit_interval = sbi->s_commit_interval;
        journal->j_min_batch_time = sbi->s_min_batch_time;
        journal->j_max_batch_time = sbi->s_max_batch_time;
        ext4_fc_init(sb, journal);

        write_lock(&journal->j_state_lock);
        flags = journal->j_flags;

        /* Mirror the "barrier" mount option */
        flags &= ~JBD2_BARRIER;
        if (test_opt(sb, BARRIER))
                flags |= JBD2_BARRIER;

        /* Mirror the "data_err=abort" mount option */
        flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
        if (test_opt(sb, DATA_ERR_ABORT))
                flags |= JBD2_ABORT_ON_SYNCDATA_ERR;

        /*
         * Always enable journal cycle record option, letting the journal
         * records log transactions continuously between each mount.
         */
        flags |= JBD2_CYCLE_RECORD;

        journal->j_flags = flags;
        write_unlock(&journal->j_state_lock);
}

/*
 * Look up and sanity-check the inode that holds the internal journal.
 *
 * @sb:           filesystem the journal belongs to
 * @journal_inum: inode number of the journal
 *
 * Returns the journal inode with a reference held, or an ERR_PTR: the
 * error from ext4_iget(), or -EFSCORRUPTED when the inode is deleted, is
 * not a regular file, or is encrypted.
 */
static struct inode *ext4_get_journal_inode(struct super_block *sb,
                                             unsigned int journal_inum)
{
        struct inode *jinode;

        /*
         * Test for the existence of a valid inode on disk.  Bad things
         * happen if we iget() an unused inode, as the subsequent iput()
         * will try to delete it.
         */
        jinode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
        if (IS_ERR(jinode)) {
                ext4_msg(sb, KERN_ERR, "no journal found");
                return ERR_CAST(jinode);
        }

        if (jinode->i_nlink == 0) {
                /* Mark the inode bad before dropping our reference to it */
                make_bad_inode(jinode);
                iput(jinode);
                ext4_msg(sb, KERN_ERR, "journal inode is deleted");
                return ERR_PTR(-EFSCORRUPTED);
        }

        if (IS_ENCRYPTED(jinode) || !S_ISREG(jinode->i_mode)) {
                ext4_msg(sb, KERN_ERR, "invalid journal inode");
                iput(jinode);
                return ERR_PTR(-EFSCORRUPTED);
        }

        ext4_debug("Journal inode found at %p: %lld bytes\n",
                  jinode, jinode->i_size);
        return jinode;
}

/*
 * Block-mapping callback installed as journal->j_bmap for inode-backed
 * journals: translate a logical block of the journal inode into the
 * physical block that backs it.
 *
 * @journal: journal whose j_inode is consulted
 * @block:   in: logical block within the journal inode;
 *           out: physical block number (written only on success)
 *
 * Returns 0 on success.  If the journal has no inode, returns 0 and leaves
 * *block untouched.  If the block cannot be mapped the journal is aborted
 * and a negative errno is returned: the error from ext4_map_blocks(), or
 * -EIO when it reported that nothing is mapped (return value 0).
 */
static int ext4_journal_bmap(journal_t *journal, sector_t *block)
{
        struct ext4_map_blocks map;
        int ret;

        if (journal->j_inode == NULL)
                return 0;

        map.m_lblk = *block;
        map.m_len = 1;
        ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0);
        if (ret <= 0) {
                /*
                 * A return of 0 means no block is mapped at this offset.
                 * That must not be reported as success: the caller would
                 * otherwise take the unchanged logical number in *block
                 * for a physical block.  Report it as -EIO, the same code
                 * the journal is aborted with.
                 */
                int err = ret ? ret : -EIO;

                ext4_msg(journal->j_inode->i_sb, KERN_CRIT,
                         "journal bmap failed: block %llu ret %d\n",
                         *block, ret);
                jbd2_journal_abort(journal, err);
                return err;
        }
        *block = map.m_pblk;
        return 0;
}

/*
 * Create a journal_t for a journal that lives in an inode of this
 * filesystem.
 *
 * @sb:           filesystem being mounted
 * @journal_inum: inode number of the journal
 *
 * On success the journal is bound to @sb (j_private), given ext4's block
 * mapping callback and the current journal parameters, and returned.
 * On failure an ERR_PTR is returned and the inode reference is dropped.
 */
static journal_t *ext4_open_inode_journal(struct super_block *sb,
                                          unsigned int journal_inum)
{
        struct inode *jinode = ext4_get_journal_inode(sb, journal_inum);
        journal_t *jnl;

        if (IS_ERR(jinode))
                return ERR_CAST(jinode);

        jnl = jbd2_journal_init_inode(jinode);
        if (!IS_ERR(jnl)) {
                jnl->j_private = sb;
                jnl->j_bmap = ext4_journal_bmap;
                ext4_init_journal_params(sb, jnl);
                return jnl;
        }

        ext4_msg(sb, KERN_ERR, "Could not load journal inode");
        iput(jinode);
        return ERR_CAST(jnl);
}

/*
 * Open an external journal device and validate the ext4 superblock on it.
 *
 * @sb:      filesystem being mounted
 * @j_dev:   device number of the external journal
 * @j_start: out: first block of the journal proper (the block after the
 *           journal device's superblock)
 * @j_len:   out: block count taken from the journal device's superblock
 *
 * The journal device's superblock must carry the ext4 magic and the
 * JOURNAL_DEV incompat feature, pass its checksum when metadata_csum is
 * set, and have a UUID equal to the s_journal_uuid of @sb.
 *
 * Returns the opened block-device file on success, or an ERR_PTR; the
 * outputs are written only on success.
 */
static struct file *ext4_get_journal_blkdev(struct super_block *sb,
                                        dev_t j_dev, ext4_fsblk_t *j_start,
                                        ext4_fsblk_t *j_len)
{
        struct ext4_super_block *jsb;
        struct block_device *bdev;
        struct buffer_head *bh;
        struct file *bdev_file;
        ext4_fsblk_t sb_block;
        unsigned long offset;
        int lblock_size, blocksize;
        int err;

        bdev_file = bdev_file_open_by_dev(j_dev,
                BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
                sb, &fs_holder_ops);
        if (IS_ERR(bdev_file)) {
                ext4_msg(sb, KERN_ERR,
                         "failed to open journal device unknown-block(%u,%u) %ld",
                         MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file));
                return bdev_file;
        }

        bdev = file_bdev(bdev_file);
        blocksize = sb->s_blocksize;
        lblock_size = bdev_logical_block_size(bdev);

        /* Failures up to and including the superblock read are -EINVAL */
        err = -EINVAL;
        if (blocksize < lblock_size) {
                ext4_msg(sb, KERN_ERR,
                        "blocksize too small for journal device");
                goto close_bdev;
        }

        /* Locate the superblock, which sits EXT4_MIN_BLOCK_SIZE bytes in */
        sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
        offset = EXT4_MIN_BLOCK_SIZE % blocksize;
        set_blocksize(bdev_file, blocksize);
        bh = __bread(bdev, sb_block, blocksize);
        if (!bh) {
                ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
                       "external journal");
                goto close_bdev;
        }

        /* Every validation failure from here on is -EFSCORRUPTED */
        err = -EFSCORRUPTED;
        jsb = (struct ext4_super_block *) (bh->b_data + offset);

        if (le16_to_cpu(jsb->s_magic) != EXT4_SUPER_MAGIC ||
            !(le32_to_cpu(jsb->s_feature_incompat) &
              EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
                ext4_msg(sb, KERN_ERR, "external journal has bad superblock");
                goto release_bh;
        }

        if ((le32_to_cpu(jsb->s_feature_ro_compat) &
             EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
            jsb->s_checksum != ext4_superblock_csum(sb, jsb)) {
                ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock");
                goto release_bh;
        }

        if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, jsb->s_uuid, 16)) {
                ext4_msg(sb, KERN_ERR, "journal UUID does not match");
                goto release_bh;
        }

        *j_start = sb_block + 1;
        *j_len = ext4_blocks_count(jsb);
        brelse(bh);
        return bdev_file;

release_bh:
        brelse(bh);
close_bdev:
        bdev_fput(bdev_file);
        return ERR_PTR(err);
}

/*
 * Create a journal_t for a journal that lives on an external block device.
 *
 * @sb:    filesystem being mounted
 * @j_dev: device number of the external journal
 *
 * The journal device is opened and validated by ext4_get_journal_blkdev().
 * Only journals with exactly one user are accepted.  On success the
 * opened device file is stored in EXT4_SB(sb)->s_journal_bdev_file, the
 * journal is bound to @sb and given the current journal parameters.
 *
 * Returns the journal, or an ERR_PTR on failure (the journal, if already
 * created, is destroyed and the device file is released).
 */
static journal_t *ext4_open_dev_journal(struct super_block *sb,
                                        dev_t j_dev)
{
        journal_t *journal;
        ext4_fsblk_t j_start;
        ext4_fsblk_t j_len;
        struct file *bdev_file;
        unsigned int nr_users;
        int err = 0;

        bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
        if (IS_ERR(bdev_file))
                return ERR_CAST(bdev_file);

        journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start,
                                        j_len, sb->s_blocksize);
        if (IS_ERR(journal)) {
                ext4_msg(sb, KERN_ERR, "failed to create device journal");
                err = PTR_ERR(journal);
                goto out_bdev;
        }

        /*
         * s_nr_users is an unsigned big-endian 32-bit field: read it once
         * and print it with %u so that large (e.g. corrupted) values are
         * not shown as negative numbers.
         */
        nr_users = be32_to_cpu(journal->j_superblock->s_nr_users);
        if (nr_users != 1) {
                ext4_msg(sb, KERN_ERR, "External journal has more than one "
                                        "user (unsupported) - %u",
                        nr_users);
                err = -EINVAL;
                goto out_journal;
        }
        journal->j_private = sb;
        EXT4_SB(sb)->s_journal_bdev_file = bdev_file;
        ext4_init_journal_params(sb, journal);
        return journal;

out_journal:
        jbd2_journal_destroy(journal);
out_bdev:
        bdev_fput(bdev_file);
        return ERR_PTR(err);
}

static int ext4_load_journal(struct super_block *sb,
                             struct ext4_super_block *es,
                             unsigned long journal_devnum)
{
        journal_t *journal;
        unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
        dev_t journal_dev;
        int err = 0;
        int really_read_only;
        int journal_dev_ro;

        if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
                return -EFSCORRUPTED;

        if (journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                ext4_msg(sb, KERN_INFO, "external journal device major/minor "
                        "numbers have changed");
                journal_dev = new_decode_dev(journal_devnum);
        } else
                journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));

        if (journal_inum && journal_dev) {
                ext4_msg(sb, KERN_ERR,
                         "filesystem has both journal inode and journal device!");
                return -EINVAL;
        }

        if (journal_inum) {
                journal = ext4_open_inode_journal(sb, journal_inum);
                if (IS_ERR(journal))
                        return PTR_ERR(journal);
        } else {
                journal = ext4_open_dev_journal(sb, journal_dev);
                if (IS_ERR(journal))
                        return PTR_ERR(journal);
        }

        journal_dev_ro = bdev_read_only(journal->j_dev);
        really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro;

        if (journal_dev_ro && !sb_rdonly(sb)) {
                ext4_msg(sb, KERN_ERR,
                         "journal device read-only, try mounting with '-o ro'");
                err = -EROFS;
                goto err_out;
        }

        /*
         * Are we loading a blank journal or performing recovery after a
         * crash?  For recovery, we need to check in advance whether we
         * can get read-write access to the device.
         */
        if (ext4_has_feature_journal_needs_recovery(sb)) {
                if (sb_rdonly(sb)) {
                        ext4_msg(sb, KERN_INFO, "INFO: recovery "
                                        "required on readonly filesystem");
                        if (really_read_only) {
                                ext4_msg(sb, KERN_ERR, "write access "
                                        "unavailable, cannot proceed "
                                        "(try mounting with noload)");
                                err = -EROFS;
                                goto err_out;
                        }
                        ext4_msg(sb, KERN_INFO, "write access will "
                               "be enabled during recovery");
                }
        }

        if (!(journal->j_flags & JBD2_BARRIER))
                ext4_msg(sb, KERN_INFO, "barriers disabled");

        if (!ext4_has_feature_journal_needs_recovery(sb))
                err = jbd2_journal_wipe(journal, !really_read_only);
        if (!err) {
                char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
                __le16 orig_state;
                bool changed = false;

                if (save)
                        memcpy(save, ((char *) es) +
                               EXT4_S_ERR_START, EXT4_S_ERR_LEN);
                err = jbd2_journal_load(journal);
                if (save && memcmp(((char *) es) + EXT4_S_ERR_START,
                                   save, EXT4_S_ERR_LEN)) {
                        memcpy(((char *) es) + EXT4_S_ERR_START,
                               save, EXT4_S_ERR_LEN);
                        changed = true;
                }
                kfree(save);
                orig_state = es->s_state;
                es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state &
                                           EXT4_ERROR_FS);
                if (orig_state != es->s_state)
                        changed = true;
                /* Write out restored error information to the superblock */
                if (changed && !really_read_only) {
                        int err2;
                        err2 = ext4_commit_super(sb);
                        err = err ? : err2;
                }
        }

        if (err) {
                ext4_msg(sb, KERN_ERR, "error loading journal");
                goto err_out;
        }

        EXT4_SB(sb)->s_journal = journal;
        err = ext4_clear_journal_err(sb, es);
        if (err) {
                EXT4_SB(sb)->s_journal = NULL;
                jbd2_journal_destroy(journal);
                return err;
        }

        if (!really_read_only && journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                es->s_journal_dev = cpu_to_le32(journal_devnum);
                ext4_commit_super(sb);
        }
        if (!really_read_only && journal_inum &&
            journal_inum != le32_to_cpu(es->s_journal_inum)) {
                es->s_journal_inum = cpu_to_le32(journal_inum);
                ext4_commit_super(sb);
        }

        return 0;

err_out:
        jbd2_journal_destroy(journal);
        return err;
}

/*
 * Copy the state held in EXT4_SB(sb) into the buffer of the on-disk
 * superblock: write time, kilobytes written, free block and inode counts
 * and any error information recorded since the last update.  The superblock
 * checksum is recomputed before the buffer is unlocked again.
 */
static void ext4_update_super(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh = sbi->s_sbh;
        struct ext4_super_block *es = sbi->s_es;

        lock_buffer(bh);

        /*
         * The superblock write time is left alone on a read-only mount.
         * Otherwise it would be bumped when the root file system is mounted
         * read/only but the journal still has to be replayed; for people
         * east of GMT whose clock ticks in localtime (Windows bug-for-bug
         * compatibility) the clock is in the future at that point, and
         * e2fsck would complain and force a full file system check.
         */
        if (!sb_rdonly(sb))
                ext4_update_tstamp(es, s_wtime);

        es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written +
                ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
                  sbi->s_sectors_written_start) >> 1));

        /* The counters are only consulted once they have been set up */
        if (percpu_counter_initialized(&sbi->s_freeclusters_counter))
                ext4_free_blocks_count_set(es, EXT4_C2B(sbi,
                        percpu_counter_sum_positive(
                                &sbi->s_freeclusters_counter)));
        if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
                es->s_free_inodes_count = cpu_to_le32(
                        percpu_counter_sum_positive(
                                &sbi->s_freeinodes_counter));

        /* Copy error information to the on-disk superblock */
        spin_lock(&sbi->s_error_lock);
        if (sbi->s_add_error_count <= 0)
                goto errors_done;

        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);

        /* The first-error record is filled in only while its time is zero */
        if (!(es->s_first_error_time || es->s_first_error_time_hi)) {
                __ext4_update_tstamp(&es->s_first_error_time,
                                     &es->s_first_error_time_hi,
                                     sbi->s_first_error_time);
                strtomem_pad(es->s_first_error_func,
                             sbi->s_first_error_func, 0);
                es->s_first_error_line = cpu_to_le32(sbi->s_first_error_line);
                es->s_first_error_ino = cpu_to_le32(sbi->s_first_error_ino);
                es->s_first_error_block =
                        cpu_to_le64(sbi->s_first_error_block);
                es->s_first_error_errcode =
                        ext4_errno_to_code(sbi->s_first_error_code);
        }

        /* The last-error record is overwritten every time */
        __ext4_update_tstamp(&es->s_last_error_time,
                             &es->s_last_error_time_hi,
                             sbi->s_last_error_time);
        strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0);
        es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line);
        es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino);
        es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block);
        es->s_last_error_errcode = ext4_errno_to_code(sbi->s_last_error_code);

        /*
         * Start the daily error reporting function if it hasn't been
         * started already
         */
        if (!es->s_error_count)
                mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);

        /* Fold the pending count into the on-disk total and reset it */
        le32_add_cpu(&es->s_error_count, sbi->s_add_error_count);
        sbi->s_add_error_count = 0;

errors_done:
        spin_unlock(&sbi->s_error_lock);

        ext4_superblock_csum_set(sb);
        unlock_buffer(bh);
}

/*
 * Bring the superblock buffer up to date with the in-memory state and write
 * it out, waiting for the write to finish.
 *
 * Returns 0 on success, -EINVAL when there is no superblock buffer, and
 * -EIO when the buffer is no longer mapped or the write reported an error.
 */
static int ext4_commit_super(struct super_block *sb)
{
        struct buffer_head *bh = EXT4_SB(sb)->s_sbh;

        if (!bh)
                return -EINVAL;

        ext4_update_super(sb);

        lock_buffer(bh);
        if (!buffer_mapped(bh)) {
                /* Buffer got discarded: the block device was invalidated */
                unlock_buffer(bh);
                return -EIO;
        }

        if (buffer_write_io_error(bh) || !buffer_uptodate(bh)) {
                /*
                 * An earlier superblock write failed.  Maybe the USB device
                 * was yanked out, maybe it was a transient error and the
                 * block will get remapped.  All we can do is retry the
                 * write and hope for the best.
                 */
                ext4_msg(sb, KERN_ERR,
                         "previous I/O error to superblock detected");
                clear_buffer_write_io_error(bh);
                set_buffer_uptodate(bh);
        }

        get_bh(bh);
        /* Clear potential dirty bit if it was journalled update */
        clear_buffer_dirty(bh);
        bh->b_end_io = end_buffer_write_sync;
        submit_bh(REQ_OP_WRITE | REQ_SYNC |
                  (test_opt(sb, BARRIER) ? REQ_FUA : 0), bh);
        wait_on_buffer(bh);

        if (!buffer_write_io_error(bh))
                return 0;

        /* Reset the error state so that a later commit can try again */
        ext4_msg(sb, KERN_ERR, "I/O error while writing superblock");
        clear_buffer_write_io_error(bh);
        set_buffer_uptodate(bh);
        return -EIO;
}

/*
 * Have we just finished recovery?  If so, and if the filesystem is being
 * mounted (or remounted) read-only, what is on disk is consistent once the
 * journal has been flushed, so record that by clearing the needs_recovery
 * and orphan_present features and writing the superblock.
 *
 * Returns -EFSCORRUPTED if a journal is attached although the journal
 * feature is gone, or if the orphan file is not empty on a read-only fs;
 * a negative value from jbd2_journal_flush() is passed through.  The result
 * of the final superblock write is not checked.
 */
static int ext4_mark_recovery_complete(struct super_block *sb,
                                       struct ext4_super_block *es)
{
        journal_t *journal = EXT4_SB(sb)->s_journal;
        int err;

        if (!ext4_has_feature_journal(sb)) {
                if (journal == NULL)
                        return 0;
                ext4_error(sb, "Journal got removed while the fs was "
                           "mounted!");
                return -EFSCORRUPTED;
        }

        jbd2_journal_lock_updates(journal);

        err = jbd2_journal_flush(journal, 0);
        if (err < 0)
                goto unlock;

        /* Nothing to record unless we are read-only ... */
        if (!sb_rdonly(sb))
                goto unlock;
        /* ... and at least one of the two features is still set */
        if (!ext4_has_feature_journal_needs_recovery(sb) &&
            !ext4_has_feature_orphan_present(sb))
                goto unlock;

        if (!ext4_orphan_file_empty(sb)) {
                ext4_error(sb, "Orphan file not empty on read-only fs.");
                err = -EFSCORRUPTED;
                goto unlock;
        }

        ext4_clear_feature_journal_needs_recovery(sb);
        ext4_clear_feature_orphan_present(sb);
        ext4_commit_super(sb);

unlock:
        jbd2_journal_unlock_updates(journal);
        return err;
}

/*
 * If we are mounting (or read-write remounting) a filesystem whose journal
 * has recorded an error from a previous lifetime, move that error to the
 * main filesystem now: flag the fs as having errors, write the superblock,
 * and only then clear the error in the journal.
 *
 * Returns 0 on success, -EFSCORRUPTED if the journal feature is not set,
 * or the error from ext4_commit_super().
 */
static int ext4_clear_journal_err(struct super_block *sb,
                                   struct ext4_super_block *es)
{
        journal_t *journal;
        const char *errstr;
        char nbuf[16];
        int j_errno;
        int err;

        if (!ext4_has_feature_journal(sb)) {
                ext4_error(sb, "Journal got removed while the fs was mounted!");
                return -EFSCORRUPTED;
        }

        journal = EXT4_SB(sb)->s_journal;

        /*
         * Now check for any error status which may have been recorded in the
         * journal by a prior ext4_error() or ext4_abort()
         */
        j_errno = jbd2_journal_errno(journal);
        if (!j_errno)
                return 0;

        errstr = ext4_decode_error(sb, j_errno, nbuf);
        ext4_warning(sb, "Filesystem error recorded "
                     "from previous mount: %s", errstr);

        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);

        /* The journal error stays in place if the superblock write fails */
        err = ext4_commit_super(sb);
        if (err)
                return err;
        ext4_warning(sb, "Marked fs in need of filesystem check.");

        jbd2_journal_clear_err(journal);
        jbd2_journal_update_sb_errno(journal);
        return 0;
}

/*
 * Force the running and committing transactions to commit,
 * and wait on the commit.
 *
 * Thin wrapper: hands the journal of @sb to ext4_journal_force_commit()
 * and returns whatever that call returns.
 */
int ext4_force_commit(struct super_block *sb)
{
        return ext4_journal_force_commit(EXT4_SB(sb)->s_journal);
}

/*
 * Sync the filesystem: flush the reserved-conversion workqueue, write back
 * dirty dquots, start a journal commit (waiting for it when @wait is set)
 * and issue a cache flush to the block device when one is still needed.
 *
 * Returns 0 straight away after a forced shutdown.  Otherwise returns the
 * result of waiting for the commit, or, if that was 0, the result of the
 * flush.
 */
static int ext4_sync_fs(struct super_block *sb, int wait)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        bool needs_barrier = false;
        tid_t target;
        int ret = 0;
        int err;

        if (unlikely(ext4_forced_shutdown(sb)))
                return 0;

        trace_ext4_sync_fs(sb, wait);
        flush_workqueue(sbi->rsv_conversion_wq);

        /*
         * Write back quota in the non-journalled quota case; journalled
         * quota has no dirty dquots.
         */
        dquot_writeback_dquots(sb, -1);

        /*
         * Data writeback is possible without a journal transaction, so a
         * barrier must be sent at the end of the function.  It can be
         * skipped if the transaction commit will send it for us.
         */
        if (!sbi->s_journal) {
                if (wait && test_opt(sb, BARRIER))
                        needs_barrier = true;
        } else {
                target = jbd2_get_latest_transaction(sbi->s_journal);
                if (wait && (sbi->s_journal->j_flags & JBD2_BARRIER) &&
                    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
                        needs_barrier = true;

                if (jbd2_journal_start_commit(sbi->s_journal, &target) && wait)
                        ret = jbd2_log_wait_commit(sbi->s_journal, target);
        }

        if (!needs_barrier)
                return ret;

        /* An earlier commit error takes precedence over the flush result */
        err = blkdev_issue_flush(sb->s_bdev);
        return ret ? ret : err;
}

/*
 * LVM calls this function before a (read-only) snapshot is created.  This
 * gives us a chance to flush the journal completely and mark the fs clean.
 *
 * Note that this function alone cannot bring a filesystem into a clean
 * state.  It relies on the upper layer to stop all data & metadata
 * modifications.
 *
 * Returns the result of ext4_commit_super(), or the negative error from
 * jbd2_journal_flush() if the journal could not be flushed.
 */
static int ext4_freeze(struct super_block *sb)
{
        journal_t *journal = EXT4_SB(sb)->s_journal;
        int error;

        /* No journal: there is nothing to flush, just write the superblock */
        if (!journal)
                return ext4_commit_super(sb);

        /* Now we set up the journal barrier. */
        jbd2_journal_lock_updates(journal);

        /*
         * Don't clear the needs_recovery flag if we failed to
         * flush the journal.
         */
        error = jbd2_journal_flush(journal, 0);
        if (error >= 0) {
                /* Journal blocked and flushed, clear needs_recovery flag. */
                ext4_clear_feature_journal_needs_recovery(sb);
                if (ext4_orphan_file_empty(sb))
                        ext4_clear_feature_orphan_present(sb);

                error = ext4_commit_super(sb);
        }

        /* we rely on upper layer to stop further updates */
        jbd2_journal_unlock_updates(journal);
        return error;
}

/*
 * Called by LVM after the snapshot is done.  We need to reset the RECOVER
 * flag here, even though the filesystem is not technically dirty yet.
 *
 * Does nothing after a forced shutdown.  Always returns 0; the result of
 * the superblock write is not checked.
 */
static int ext4_unfreeze(struct super_block *sb)
{
        if (!ext4_forced_shutdown(sb)) {
                if (EXT4_SB(sb)->s_journal) {
                        /*
                         * Reset the needs_recovery flag before the fs is
                         * unlocked.
                         */
                        ext4_set_feature_journal_needs_recovery(sb);
                        if (ext4_has_feature_orphan_file(sb))
                                ext4_set_feature_orphan_present(sb);
                }

                ext4_commit_super(sb);
        }

        return 0;
}

/*
 * Snapshot of the mount options that __ext4_remount() is about to change.
 * It is filled in from the ext4_sb_info before the new options are applied
 * and copied back if the remount fails.
 */
struct ext4_mount_options {
        unsigned long s_mount_opt;
        unsigned long s_mount_opt2;
        kuid_t s_resuid;
        kgid_t s_resgid;
        unsigned long s_commit_interval;
        u32 s_min_batch_time;
        u32 s_max_batch_time;
#ifdef CONFIG_QUOTA
        int s_jquota_fmt;
        /* One saved quota file name per quota type, NULL when unset */
        char *s_qf_names[EXT4_MAXQUOTAS];
#endif
};

/*
 * Apply the options carried by @fc to the already mounted filesystem @sb.
 *
 * The current sb->s_flags and the mount options kept in the ext4_sb_info
 * are saved first.  Every failure after the new options have been applied
 * jumps to restore_opts, which puts the saved values back before returning
 * the error.
 *
 * Returns 0 on success or a negative errno on failure.
 */
static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
{
        struct ext4_fs_context *ctx = fc->fs_private;
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned long old_sb_flags;
        struct ext4_mount_options old_opts;
        ext4_group_t g;
        int err = 0;
        int alloc_ctx;
#ifdef CONFIG_QUOTA
        int enable_quota = 0;
        int i, j;
        char *to_free[EXT4_MAXQUOTAS];
#endif


        /* Store the original options */
        old_sb_flags = sb->s_flags;
        old_opts.s_mount_opt = sbi->s_mount_opt;
        old_opts.s_mount_opt2 = sbi->s_mount_opt2;
        old_opts.s_resuid = sbi->s_resuid;
        old_opts.s_resgid = sbi->s_resgid;
        old_opts.s_commit_interval = sbi->s_commit_interval;
        old_opts.s_min_batch_time = sbi->s_min_batch_time;
        old_opts.s_max_batch_time = sbi->s_max_batch_time;
#ifdef CONFIG_QUOTA
        old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
        /* Keep private copies of the quota file names currently in use */
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                if (sbi->s_qf_names[i]) {
                        char *qf_name = get_qf_name(sb, sbi, i);

                        old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
                        if (!old_opts.s_qf_names[i]) {
                                /* Free the copies made so far and give up */
                                for (j = 0; j < i; j++)
                                        kfree(old_opts.s_qf_names[j]);
                                return -ENOMEM;
                        }
                } else
                        old_opts.s_qf_names[i] = NULL;
#endif
        /*
         * When no journal I/O priority was specified, carry over the one the
         * journal task currently has, or fall back to the default.
         */
        if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) {
                if (sbi->s_journal && sbi->s_journal->j_task->io_context)
                        ctx->journal_ioprio =
                                sbi->s_journal->j_task->io_context->ioprio;
                else
                        ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;

        }

        /*
         * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause
         * two calls to ext4_should_dioread_nolock() to return inconsistent
         * values, triggering WARN_ON in ext4_add_complete_io(). we grab
         * here s_writepages_rwsem to avoid race between writepages ops and
         * remount.
         */
        alloc_ctx = ext4_writepages_down_write(sb);
        ext4_apply_options(fc, sb);
        ext4_writepages_up_write(sb, alloc_ctx);

        /*
         * journal_checksum cannot be toggled by a remount: flip the bit back
         * to its previous value and carry on.
         */
        if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
            test_opt(sb, JOURNAL_CHECKSUM)) {
                ext4_msg(sb, KERN_ERR, "changing journal_checksum "
                         "during remount not supported; ignoring");
                sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
        }

        /* Reject option combinations that do not work with the data mode */
        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
                if (test_opt2(sb, EXPLICIT_DELALLOC)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "both data=journal and delalloc");
                        err = -EINVAL;
                        goto restore_opts;
                }
                if (test_opt(sb, DIOREAD_NOLOCK)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                 "both data=journal and dioread_nolock");
                        err = -EINVAL;
                        goto restore_opts;
                }
        } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
                if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
                        ext4_msg(sb, KERN_ERR, "can't mount with "
                                "journal_async_commit in data=ordered mode");
                        err = -EINVAL;
                        goto restore_opts;
                }
        }

        /* Any change of the NO_MBCACHE bit is refused */
        if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
                ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
                err = -EINVAL;
                goto restore_opts;
        }

        if (test_opt2(sb, ABORT))
                ext4_abort(sb, ESHUTDOWN, "Abort forced by user");

        /* Make SB_POSIXACL in s_flags follow the POSIX_ACL mount option */
        sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
                (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);

        es = sbi->s_es;

        if (sbi->s_journal) {
                ext4_init_journal_params(sb, sbi->s_journal);
                set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio);
        }

        /* Flush outstanding errors before changing fs state */
        flush_work(&sbi->s_sb_upd_work);

        /* The requested read-only state differs from the current one */
        if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
                if (ext4_forced_shutdown(sb)) {
                        err = -EROFS;
                        goto restore_opts;
                }

                if (fc->sb_flags & SB_RDONLY) {
                        /* read-write -> read-only */
                        err = sync_filesystem(sb);
                        if (err < 0)
                                goto restore_opts;
                        err = dquot_suspend(sb, -1);
                        if (err < 0)
                                goto restore_opts;

                        /*
                         * First of all, the unconditional stuff we have to do
                         * to disable replay of the journal when we next remount
                         */
                        sb->s_flags |= SB_RDONLY;

                        /*
                         * OK, test if we are remounting a valid rw partition
                         * readonly, and if so set the rdonly flag and then
                         * mark the partition as valid again.
                         */
                        if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
                            (sbi->s_mount_state & EXT4_VALID_FS))
                                es->s_state = cpu_to_le16(sbi->s_mount_state);

                        if (sbi->s_journal) {
                                /*
                                 * We let remount-ro finish even if marking fs
                                 * as clean failed...
                                 */
                                ext4_mark_recovery_complete(sb, es);
                        }
                } else {
                        /* read-only -> read-write */
                        /* Make sure we can mount this feature set readwrite */
                        if (ext4_has_feature_readonly(sb) ||
                            !ext4_feature_set_ok(sb, 0)) {
                                err = -EROFS;
                                goto restore_opts;
                        }
                        /*
                         * Make sure the group descriptor checksums
                         * are sane.  If they aren't, refuse to remount r/w.
                         */
                        for (g = 0; g < sbi->s_groups_count; g++) {
                                struct ext4_group_desc *gdp =
                                        ext4_get_group_desc(sb, g, NULL);

                                if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
                                        ext4_msg(sb, KERN_ERR,
               "ext4_remount: Checksum for group %u failed (%u!=%u)",
                g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
                                               le16_to_cpu(gdp->bg_checksum));
                                        err = -EFSBADCRC;
                                        goto restore_opts;
                                }
                        }

                        /*
                         * If we have an unprocessed orphan list hanging
                         * around from a previously readonly bdev mount,
                         * require a full umount/remount for now.
                         */
                        if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) {
                                ext4_msg(sb, KERN_WARNING, "Couldn't "
                                       "remount RDWR because of unprocessed "
                                       "orphan inode list.  Please "
                                       "umount/remount instead");
                                err = -EINVAL;
                                goto restore_opts;
                        }

                        /*
                         * Mounting a RDONLY partition read-write, so reread
                         * and store the current valid flag.  (It may have
                         * been changed by e2fsck since we originally mounted
                         * the partition.)
                         */
                        if (sbi->s_journal) {
                                err = ext4_clear_journal_err(sb, es);
                                if (err)
                                        goto restore_opts;
                        }
                        sbi->s_mount_state = (le16_to_cpu(es->s_state) &
                                              ~EXT4_FC_REPLAY);

                        err = ext4_setup_super(sb, es, 0);
                        if (err)
                                goto restore_opts;

                        sb->s_flags &= ~SB_RDONLY;
                        if (ext4_has_feature_mmp(sb)) {
                                err = ext4_multi_mount_protect(sb,
                                                le64_to_cpu(es->s_mmp_block));
                                if (err)
                                        goto restore_opts;
                        }
#ifdef CONFIG_QUOTA
                        /* Quota is turned back on further down */
                        enable_quota = 1;
#endif
                }
        }

        /*
         * Handle creation of system zone data early because it can fail.
         * Releasing of existing data is done when we are sure remount will
         * succeed.
         */
        if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
                err = ext4_setup_system_zone(sb);
                if (err)
                        goto restore_opts;
        }

        /*
         * Without a journal, write the superblock out directly if the
         * filesystem was writable before this remount.
         */
        if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
                err = ext4_commit_super(sb);
                if (err)
                        goto restore_opts;
        }

#ifdef CONFIG_QUOTA
        if (enable_quota) {
                if (sb_any_quota_suspended(sb))
                        dquot_resume(sb, -1);
                else if (ext4_has_feature_quota(sb)) {
                        err = ext4_enable_quotas(sb);
                        if (err)
                                goto restore_opts;
                }
        }
        /* Release old quota file names */
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(old_opts.s_qf_names[i]);
#endif
        if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
                ext4_release_system_zone(sb);

        /*
         * Reinitialize lazy itable initialization thread based on
         * current settings
         */
        if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
                ext4_unregister_li_request(sb);
        else {
                ext4_group_t first_not_zeroed;
                first_not_zeroed = ext4_has_uninit_itable(sb);
                ext4_register_li_request(sb, first_not_zeroed);
        }

        if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
                ext4_stop_mmpd(sbi);

        return 0;

restore_opts:
        /*
         * If there was a failing r/w to ro transition, we may need to
         * re-enable quota
         */
        if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) &&
            sb_any_quota_suspended(sb))
                dquot_resume(sb, -1);

        /* Put the saved flags and options back under the writepages lock */
        alloc_ctx = ext4_writepages_down_write(sb);
        sb->s_flags = old_sb_flags;
        sbi->s_mount_opt = old_opts.s_mount_opt;
        sbi->s_mount_opt2 = old_opts.s_mount_opt2;
        sbi->s_resuid = old_opts.s_resuid;
        sbi->s_resgid = old_opts.s_resgid;
        sbi->s_commit_interval = old_opts.s_commit_interval;
        sbi->s_min_batch_time = old_opts.s_min_batch_time;
        sbi->s_max_batch_time = old_opts.s_max_batch_time;
        ext4_writepages_up_write(sb, alloc_ctx);

        if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
                ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
        sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
        /*
         * Swap the saved quota file names back in.  The names that were
         * installed until now are freed only after synchronize_rcu().
         */
        for (i = 0; i < EXT4_MAXQUOTAS; i++) {
                to_free[i] = get_qf_name(sb, sbi, i);
                rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
        }
        synchronize_rcu();
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
                kfree(to_free[i]);
#endif
        if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
                ext4_stop_mmpd(sbi);
        return err;
}

/*
 * Reconfigure (remount) entry point: check the new options for consistency,
 * apply them through __ext4_remount() and log the resulting mount state.
 *
 * Returns 0 on success or the negative error from either step.
 */
static int ext4_reconfigure(struct fs_context *fc)
{
        struct super_block *sb = fc->root->d_sb;
        const char *mode;
        int err;

        fc->s_fs_info = EXT4_SB(sb);

        err = ext4_check_opt_consistency(fc, sb);
        if (err < 0)
                return err;

        err = __ext4_remount(fc, sb);
        if (err < 0)
                return err;

        mode = sb_rdonly(sb) ? "ro" : "r/w";
        ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.",
                 &sb->s_uuid, mode, ext4_quota_mode(sb));

        return 0;
}

#ifdef CONFIG_QUOTA
/*
 * Clamp the statfs figures in @buf to the limits of project quota @projid.
 *
 * When a block limit is set and is below buf->f_blocks, f_blocks becomes
 * the limit and f_bfree/f_bavail become what is left of it after current
 * and reserved usage.  Inode counts are treated the same way using the
 * inode limit.
 *
 * Returns 0, or the error from dqget() if the dquot cannot be obtained.
 */
static int ext4_statfs_project(struct super_block *sb,
                               kprojid_t projid, struct kstatfs *buf)
{
        struct kqid qid = make_kqid_projid(projid);
        struct dquot *dquot = dqget(sb, qid);
        u64 curblock;
        u64 limit;

        if (IS_ERR(dquot))
                return PTR_ERR(dquot);

        spin_lock(&dquot->dq_dqb_lock);

        /* Block limit: the smaller non-zero one of soft and hard, in blocks */
        limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
                             dquot->dq_dqb.dqb_bhardlimit);
        limit >>= sb->s_blocksize_bits;

        if (limit && buf->f_blocks > limit) {
                curblock = (dquot->dq_dqb.dqb_curspace +
                            dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
                buf->f_blocks = limit;
                if (buf->f_blocks > curblock)
                        buf->f_bavail = buf->f_blocks - curblock;
                else
                        buf->f_bavail = 0;
                buf->f_bfree = buf->f_bavail;
        }

        /* Inode limit: the smaller non-zero one of soft and hard */
        limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
                             dquot->dq_dqb.dqb_ihardlimit);

        if (limit && buf->f_files > limit) {
                buf->f_files = limit;
                if (buf->f_files > dquot->dq_dqb.dqb_curinodes)
                        buf->f_ffree = buf->f_files -
                                       dquot->dq_dqb.dqb_curinodes;
                else
                        buf->f_ffree = 0;
        }

        spin_unlock(&dquot->dq_dqb_lock);
        dqput(dquot);
        return 0;
}
#endif

/*
 * Fill @buf with filesystem statistics for the filesystem @dentry is on.
 *
 * The overhead is left out of f_blocks when the MINIX_DF option is set.
 * With quota support, the numbers are further limited by the project quota
 * when the inode has the PROJINHERIT flag and project quota limits are
 * enabled.  Always returns 0.
 */
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct super_block *sb = dentry->d_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        ext4_fsblk_t overhead, resv_blocks, reserved;
        s64 bfree;

        resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
        overhead = test_opt(sb, MINIX_DF) ? 0 : sbi->s_overhead;

        buf->f_type = EXT4_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
        buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);

        bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
                percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
        /* prevent underflow when only little free space is available */
        buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));

        /* Reserved blocks are not available; never go below zero */
        reserved = ext4_r_blocks_count(es) + resv_blocks;
        if (buf->f_bfree < reserved)
                buf->f_bavail = 0;
        else
                buf->f_bavail = buf->f_bfree - reserved;

        buf->f_files = le32_to_cpu(es->s_inodes_count);
        buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
        buf->f_namelen = EXT4_NAME_LEN;
        buf->f_fsid = uuid_to_fsid(es->s_uuid);

#ifdef CONFIG_QUOTA
        if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
            sb_has_quota_limits_enabled(sb, PRJQUOTA))
                ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
#endif
        return 0;
}


#ifdef CONFIG_QUOTA

/*
 * Helper functions so that transaction is started before we acquire dqio_sem
 * to keep correct lock ordering of transaction > dqio_sem
 */
static inline struct inode *dquot_to_inode(struct dquot *dquot)
{
        return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
}

/*
 * Commit @dquot inside its own journal handle.
 *
 * Returns the error from starting the handle, else the result of
 * dquot_commit(), else the result of stopping the handle.
 */
static int ext4_write_dquot(struct dquot *dquot)
{
        handle_t *handle;
        int commit_err, stop_err;

        handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
                                    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        commit_err = dquot_commit(dquot);
        if (commit_err < 0)
                ext4_error_err(dquot->dq_sb, -commit_err,
                               "Failed to commit dquot type %d",
                               dquot->dq_id.type);

        /* The handle is stopped either way; the first error wins */
        stop_err = ext4_journal_stop(handle);
        return commit_err ? commit_err : stop_err;
}

/*
 * Acquire @dquot inside its own journal handle.
 *
 * Returns the error from starting the handle, else the result of
 * dquot_acquire(), else the result of stopping the handle.
 */
static int ext4_acquire_dquot(struct dquot *dquot)
{
        handle_t *handle;
        int acquire_err, stop_err;

        handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
                                    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        acquire_err = dquot_acquire(dquot);
        if (acquire_err < 0)
                ext4_error_err(dquot->dq_sb, -acquire_err,
                              "Failed to acquire dquot type %d",
                              dquot->dq_id.type);

        /* The handle is stopped either way; the first error wins */
        stop_err = ext4_journal_stop(handle);
        return acquire_err ? acquire_err : stop_err;
}

/*
 * Release @dquot inside its own journal handle.  The dquot is released even
 * when the handle cannot be started.
 *
 * Returns the error from starting the handle, else the result of
 * dquot_release(), else the result of stopping the handle.
 */
static int ext4_release_dquot(struct dquot *dquot)
{
        handle_t *handle;
        int release_err, stop_err;

        handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
                                    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
        if (IS_ERR(handle)) {
                /* Release dquot anyway to avoid endless cycle in dqput() */
                dquot_release(dquot);
                return PTR_ERR(handle);
        }

        release_err = dquot_release(dquot);
        if (release_err < 0)
                ext4_error_err(dquot->dq_sb, -release_err,
                               "Failed to release dquot type %d",
                               dquot->dq_id.type);

        /* The handle is stopped either way; the first error wins */
        stop_err = ext4_journal_stop(handle);
        return release_err ? release_err : stop_err;
}

/*
 * Mark @dquot dirty.  When quota is journalled the dquot is also written
 * out right away and the result of that write is returned; otherwise the
 * result of dquot_mark_dquot_dirty() is returned.
 */
static int ext4_mark_dquot_dirty(struct dquot *dquot)
{
        if (!ext4_is_quota_journalled(dquot->dq_sb))
                return dquot_mark_dquot_dirty(dquot);

        dquot_mark_dquot_dirty(dquot);
        return ext4_write_dquot(dquot);
}

/*
 * Commit the quota info for quota @type of @sb inside a journal handle.
 *
 * Returns the error from starting the handle, else the result of
 * dquot_commit_info(), else the result of stopping the handle.
 */
static int ext4_write_info(struct super_block *sb, int type)
{
        handle_t *handle;
        int commit_err, stop_err;

        /* Data block + inode block */
        handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        commit_err = dquot_commit_info(sb, type);
        stop_err = ext4_journal_stop(handle);
        return commit_err ? commit_err : stop_err;
}

/*
 * Set the lockdep subclass of an inode's i_data_sem.
 *
 * @inode:    inode whose ext4_inode_info holds the i_data_sem to reclassify
 * @subclass: lockdep subclass to apply; callers in this file pass
 *            I_DATA_SEM_QUOTA or I_DATA_SEM_NORMAL
 */
static void lockdep_set_quota_inode(struct inode *inode, int subclass)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        /* The first argument of lockdep_set_subclass has to be
         * *exactly* the same as the argument to init_rwsem() --- in
         * this case, in init_once() --- or lockdep gets unhappy
         * because the name of the lock is set using the
         * stringification of the argument to init_rwsem().
         */
        (void) ei;        /* shut up clang warning if !CONFIG_LOCKDEP */
        lockdep_set_subclass(&ei->i_data_sem, subclass);
}

/*
 * Standard function to be called on quota_on
 *
 * Rejects the request with -EINVAL when the QUOTA mount option is not set,
 * -EXDEV when the quota file lives on another superblock and -EBUSY when the
 * file is already flagged NOQUOTA. After dquot_quota_on() succeeds the quota
 * file is marked noatime + immutable. Failing to start the journal handle
 * for that step is tolerated (quotas stay on, 0 is returned); a failure of
 * ext4_mark_inode_dirty() turns quotas off again and is returned. On any
 * error after the lockdep subclass was switched, it is switched back to
 * I_DATA_SEM_NORMAL.
 */
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
                         const struct path *path)
{
        struct inode *inode;
        handle_t *handle;
        int err;

        if (!test_opt(sb, QUOTA))
                return -EINVAL;

        /* Quotafile not on the same filesystem? */
        if (path->dentry->d_sb != sb)
                return -EXDEV;

        /* Quota already enabled for this file? */
        if (IS_NOQUOTA(d_inode(path->dentry)))
                return -EBUSY;

        if (EXT4_SB(sb)->s_qf_names[type]) {
                /* Journaling quota: warn if the file is not in the fs root */
                if (path->dentry->d_parent != sb->s_root)
                        ext4_msg(sb, KERN_WARNING,
                                "Quota file not on filesystem root. "
                                "Journaled quota will not work");
                sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
        } else {
                /*
                 * Mount options may have changed since the last quota_on,
                 * so make sure the flag is not left over.
                 */
                sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
        }

        lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
        err = dquot_quota_on(sb, type, format_id, path);
        if (err)
                goto reset_lockdep;

        /*
         * Set inode flags to prevent userspace from messing with quota
         * files. Not being able to start the handle is not a hard failure:
         * quotas are already enabled, so err stays 0 in that case.
         */
        inode = d_inode(path->dentry);
        inode_lock(inode);
        handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
        if (!IS_ERR(handle)) {
                EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
                inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
                                S_NOATIME | S_IMMUTABLE);
                err = ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
        }
        inode_unlock(inode);
        if (!err)
                return 0;

        dquot_quota_off(sb, type);
reset_lockdep:
        lockdep_set_quota_inode(path->dentry->d_inode,
                                     I_DATA_SEM_NORMAL);
        return err;
}

/*
 * Check that a quota inode number is acceptable for its quota type: user and
 * group quota must use their reserved inode numbers, project quota must be at
 * or above EXT4_GOOD_OLD_FIRST_INO. Any other type hits BUG().
 */
static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum)
{
        if (type == USRQUOTA)
                return qf_inum == EXT4_USR_QUOTA_INO;
        if (type == GRPQUOTA)
                return qf_inum == EXT4_GRP_QUOTA_INO;
        if (type == PRJQUOTA)
                return qf_inum >= EXT4_GOOD_OLD_FIRST_INO;

        BUG();
}

/*
 * Load the hidden quota inode of the given type and enable quota on it.
 *
 * Returns -EPERM when the superblock records no quota inode for @type,
 * -EUCLEAN when the recorded inode number fails ext4_check_quota_inum(), the
 * ext4_iget() error when the inode cannot be loaded, and otherwise the result
 * of dquot_load_quota_inode().
 */
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
                             unsigned int flags)
{
        unsigned long quota_inos[EXT4_MAXQUOTAS] = {
                le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
                le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
                le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
        };
        struct inode *qf_inode;
        unsigned long ino;
        int err;

        BUG_ON(!ext4_has_feature_quota(sb));

        ino = quota_inos[type];
        if (!ino)
                return -EPERM;

        if (!ext4_check_quota_inum(type, ino)) {
                ext4_error(sb, "Bad quota inum: %lu, type: %d",
                                ino, type);
                return -EUCLEAN;
        }

        qf_inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL);
        if (IS_ERR(qf_inode)) {
                ext4_error(sb, "Bad quota inode: %lu, type: %d",
                                ino, type);
                return PTR_ERR(qf_inode);
        }

        /* Don't account quota for quota files to avoid recursion */
        qf_inode->i_flags |= S_NOQUOTA;

        /* Switch the lockdep class for the load, and back again if it fails */
        lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
        err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
        if (err)
                lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);

        /* Drop the reference taken by ext4_iget() */
        iput(qf_inode);

        return err;
}

/*
 * Enable usage tracking for all quota types.
 *
 * Every type with a quota inode recorded in the superblock is enabled with
 * DQUOT_USAGE_ENABLED; DQUOT_LIMITS_ENABLED is added when the matching
 * USRQUOTA/GRPQUOTA/PRJQUOTA mount option is set. On the first failure a
 * warning is logged, ext4_quotas_off() is called with the failing type and
 * the error is returned. Returns 0 when nothing failed.
 */
int ext4_enable_quotas(struct super_block *sb)
{
        int type, err;
        unsigned long qf_inums[EXT4_MAXQUOTAS] = {
                le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
                le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
                le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
        };
        bool limits_wanted[EXT4_MAXQUOTAS] = {
                test_opt(sb, USRQUOTA),
                test_opt(sb, GRPQUOTA),
                test_opt(sb, PRJQUOTA),
        };

        sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;

        for (type = 0; type < EXT4_MAXQUOTAS; type++) {
                unsigned int flags = DQUOT_USAGE_ENABLED;

                /* No quota inode recorded for this type */
                if (!qf_inums[type])
                        continue;

                if (limits_wanted[type])
                        flags |= DQUOT_LIMITS_ENABLED;

                err = ext4_quota_enable(sb, type, QFMT_VFS_V1, flags);
                if (!err)
                        continue;

                ext4_warning(sb,
                        "Failed to enable quota tracking "
                        "(type=%d, err=%d, ino=%lu). "
                        "Please run e2fsck to fix.", type,
                        err, qf_inums[type]);

                ext4_quotas_off(sb, type);
                return err;
        }

        return 0;
}

/*
 * Turn quota of the given type off.
 *
 * If there is no quota inode, or no reference to it can be grabbed, this is
 * just dquot_quota_off(). Otherwise, after quotas are off and only when the
 * filesystem has no QUOTA feature and is not read-only, the NOATIME and
 * IMMUTABLE flags are cleared from the quota file and its mtime/ctime are
 * updated. An error from starting the journal handle or from
 * ext4_mark_inode_dirty() is returned to the caller.
 */
static int ext4_quota_off(struct super_block *sb, int type)
{
        struct inode *inode = sb_dqopt(sb)->files[type];
        handle_t *handle;
        int err;

        /* Force all delayed allocation blocks to be allocated.
         * Caller already holds s_umount sem */
        if (test_opt(sb, DELALLOC))
                sync_filesystem(sb);

        if (!inode || !igrab(inode))
                return dquot_quota_off(sb, type);

        err = dquot_quota_off(sb, type);

        /*
         * Nothing to clean up on failure or with the QUOTA feature. When the
         * filesystem was remounted read-only first, the inode flags cannot
         * be cleaned up either.
         */
        if (err || ext4_has_feature_quota(sb) || sb_rdonly(sb))
                goto out_put;

        inode_lock(inode);
        /*
         * Update modification times of quota files when userspace can
         * start looking at them.
         */
        handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
        if (IS_ERR(handle)) {
                err = PTR_ERR(handle);
        } else {
                EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
                inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
                inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
                err = ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
        }
        inode_unlock(inode);
out_put:
        lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL);
        iput(inode);
        return err;
}

/*
 * Read data from quotafile - avoid pagecache and such because we cannot afford
 * acquiring the locks... As quota files are never truncated and quota code
 * itself serializes the operations (and no one else should touch the files)
 * we don't have to be afraid of races.
 *
 * Copies up to @len bytes starting at byte offset @off into @data, one
 * filesystem block at a time. The request is clipped to the file size, holes
 * read back as zeroes. Returns the number of bytes copied (0 when @off is
 * past the end of the file) or the ext4_bread() error.
 */
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off)
{
        struct inode *inode = sb_dqopt(sb)->files[type];
        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int offset = off & (sb->s_blocksize - 1);
        loff_t i_size = i_size_read(inode);
        struct buffer_head *bh;
        size_t remaining;
        int chunk;

        if (off > i_size)
                return 0;
        if (off+len > i_size)
                len = i_size-off;

        for (remaining = len; remaining > 0; remaining -= chunk) {
                /* Never read across a block boundary in one step */
                chunk = min_t(unsigned long, sb->s_blocksize - offset,
                              remaining);
                bh = ext4_bread(NULL, inode, blk, 0);
                if (IS_ERR(bh))
                        return PTR_ERR(bh);
                if (bh)
                        memcpy(data, bh->b_data+offset, chunk);
                else        /* A hole? */
                        memset(data, 0, chunk);
                brelse(bh);

                /* Only the first block can start mid-block */
                offset = 0;
                data += chunk;
                blk++;
        }
        return len;
}

/*
 * Write to quotafile (we know the transaction is already started and has
 * enough credits).
 *
 * Copies @len bytes from @data to byte offset @off of the quota file. The
 * write must fit inside a single block and needs a running journal handle;
 * otherwise a warning is logged and -EIO returned. The file size is extended
 * when the write ends past it. Returns @len on success or a negative error.
 */
static ssize_t ext4_quota_write(struct super_block *sb, int type,
                                const char *data, size_t len, loff_t off)
{
        struct inode *inode = sb_dqopt(sb)->files[type];
        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int offset = off & (sb->s_blocksize - 1);
        handle_t *handle = journal_current_handle();
        struct buffer_head *bh;
        int err = 0, size_err = 0;
        int retries = 0;

        if (!handle) {
                ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
                        " cancelled because transaction is not started",
                        (unsigned long long)off, (unsigned long long)len);
                return -EIO;
        }
        /*
         * Since we account only one data block in transaction credits,
         * then it is impossible to cross a block boundary.
         */
        if (sb->s_blocksize - offset < len) {
                ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
                        " cancelled because not block aligned",
                        (unsigned long long)off, (unsigned long long)len);
                return -EIO;
        }

        /* Map (allocating if needed) the block, retrying on -ENOSPC */
        for (;;) {
                bh = ext4_bread(handle, inode, blk,
                                EXT4_GET_BLOCKS_CREATE |
                                EXT4_GET_BLOCKS_METADATA_NOFAIL);
                if (PTR_ERR(bh) != -ENOSPC ||
                    !ext4_should_retry_alloc(inode->i_sb, &retries))
                        break;
        }
        if (IS_ERR(bh))
                return PTR_ERR(bh);

        if (bh) {
                BUFFER_TRACE(bh, "get write access");
                err = ext4_journal_get_write_access(handle, sb, bh,
                                                    EXT4_JTR_NONE);
                if (err) {
                        brelse(bh);
                        return err;
                }
                lock_buffer(bh);
                memcpy(bh->b_data+offset, data, len);
                flush_dcache_page(bh->b_page);
                unlock_buffer(bh);
                err = ext4_handle_dirty_metadata(handle, NULL, bh);
                brelse(bh);
        }

        /* Grow the file when the write ended past the current size */
        if (inode->i_size < off + len) {
                i_size_write(inode, off + len);
                EXT4_I(inode)->i_disksize = inode->i_size;
                size_err = ext4_mark_inode_dirty(handle, inode);
                if (unlikely(size_err && !err))
                        err = size_err;
        }
        return err ? err : len;
}
#endif

#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
/*
 * Register ext2_fs_type with the VFS. A failure is only logged; nothing is
 * returned to the caller.
 */
static inline void register_as_ext2(void)
{
        int err;

        err = register_filesystem(&ext2_fs_type);
        if (!err)
                return;

        printk(KERN_WARNING
               "EXT4-fs: Unable to register as ext2 (%d)\n", err);
}

/* Undo register_as_ext2(): unregister ext2_fs_type. */
static inline void unregister_as_ext2(void)
{
        unregister_filesystem(&ext2_fs_type);
}

/*
 * Return 1 when the superblock's feature set can be handled as ext2, else 0.
 * Unknown incompat features always disqualify; unknown ro_compat features
 * only disqualify a filesystem that is not read-only.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
        if (ext4_has_unknown_ext2_incompat_features(sb))
                return 0;

        return sb_rdonly(sb) ||
               !ext4_has_unknown_ext2_ro_compat_features(sb);
}
#else
/*
 * Stubs for builds where the #if condition above does not hold:
 * (un)registration does nothing and no feature set is accepted as ext2.
 */
static inline void register_as_ext2(void) { }
static inline void unregister_as_ext2(void) { }
static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
#endif

/*
 * Register ext3_fs_type with the VFS. A failure is only logged; nothing is
 * returned to the caller.
 */
static inline void register_as_ext3(void)
{
        int err;

        err = register_filesystem(&ext3_fs_type);
        if (!err)
                return;

        printk(KERN_WARNING
               "EXT4-fs: Unable to register as ext3 (%d)\n", err);
}

/* Undo register_as_ext3(): unregister ext3_fs_type. */
static inline void unregister_as_ext3(void)
{
        unregister_filesystem(&ext3_fs_type);
}

/*
 * Return 1 when the superblock's feature set can be handled as ext3, else 0.
 * The filesystem must have a journal and no unknown incompat features;
 * unknown ro_compat features only disqualify a filesystem that is not
 * read-only.
 */
static inline int ext3_feature_set_ok(struct super_block *sb)
{
        if (ext4_has_unknown_ext3_incompat_features(sb) ||
            !ext4_has_feature_journal(sb))
                return 0;

        return sb_rdonly(sb) ||
               !ext4_has_unknown_ext3_ro_compat_features(sb);
}

/*
 * ->kill_sb for ext4: tear the superblock down with kill_block_super() and
 * afterwards drop the journal block device file, if one was recorded.
 */
static void ext4_kill_sb(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct file *journal_file = NULL;

        /* Fetch the file before kill_block_super() runs; sbi may be NULL */
        if (sbi)
                journal_file = sbi->s_journal_bdev_file;

        kill_block_super(sb);

        if (journal_file)
                bdev_fput(journal_file);
}

/*
 * Filesystem type descriptor for "ext4"; registered by ext4_init_fs() and
 * unregistered by ext4_exit_fs().
 */
static struct file_system_type ext4_fs_type = {
        .owner                        = THIS_MODULE,
        .name                        = "ext4",
        .init_fs_context        = ext4_init_fs_context,
        .parameters                = ext4_param_specs,
        .kill_sb                = ext4_kill_sb,
        .fs_flags                = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ext4");

/*
 * Shared across all ext4 file systems; every entry is initialised once in
 * ext4_init_fs().
 */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];

/*
 * Module initialisation: set up the global ext4 state and subsystems, then
 * register the ext3, ext2 and ext4 filesystem types.
 *
 * Each subsystem is initialised in turn; when one fails, the labels at the
 * bottom unwind exactly those that were already set up, in reverse order,
 * and the error is returned. Returns 0 on success.
 */
static int __init ext4_init_fs(void)
{
        int i, err;

        ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
        ext4_li_info = NULL;

        /* Build-time check for flags consistency */
        ext4_check_flag_values();

        for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
                init_waitqueue_head(&ext4__ioend_wq[i]);

        /* Nothing to unwind yet, so a failure here returns directly */
        err = ext4_init_es();
        if (err)
                return err;

        err = ext4_init_pending();
        if (err)
                goto out7;

        err = ext4_init_post_read_processing();
        if (err)
                goto out6;

        err = ext4_init_pageio();
        if (err)
                goto out5;

        err = ext4_init_system_zone();
        if (err)
                goto out4;

        err = ext4_init_sysfs();
        if (err)
                goto out3;

        err = ext4_init_mballoc();
        if (err)
                goto out2;
        err = init_inodecache();
        if (err)
                goto out1;

        err = ext4_fc_init_dentry_cache();
        if (err)
                goto out05;

        /*
         * The ext3/ext2 helpers return void (failures are only logged in
         * them); only a failure to register ext4 itself aborts the load.
         */
        register_as_ext3();
        register_as_ext2();
        err = register_filesystem(&ext4_fs_type);
        if (err)
                goto out;

        return 0;
out:
        unregister_as_ext2();
        unregister_as_ext3();
        ext4_fc_destroy_dentry_cache();
out05:
        destroy_inodecache();
out1:
        ext4_exit_mballoc();
out2:
        ext4_exit_sysfs();
out3:
        ext4_exit_system_zone();
out4:
        ext4_exit_pageio();
out5:
        ext4_exit_post_read_processing();
out6:
        ext4_exit_pending();
out7:
        ext4_exit_es();

        return err;
}

/*
 * Module exit: stop the lazyinit thread, unregister the three filesystem
 * types and tear down the subsystems set up by ext4_init_fs().
 *
 * NOTE(review): ext4_exit_es() is called before ext4_exit_pending() here,
 * whereas the error unwind in ext4_init_fs() calls them the other way
 * round - confirm the two are independent.
 */
static void __exit ext4_exit_fs(void)
{
        ext4_destroy_lazyinit_thread();
        unregister_as_ext2();
        unregister_as_ext3();
        unregister_filesystem(&ext4_fs_type);
        ext4_fc_destroy_dentry_cache();
        destroy_inodecache();
        ext4_exit_mballoc();
        ext4_exit_sysfs();
        ext4_exit_system_zone();
        ext4_exit_pageio();
        ext4_exit_post_read_processing();
        ext4_exit_es();
        ext4_exit_pending();
}

/*
 * Module metadata and entry points: ext4_init_fs() is the init hook and
 * ext4_exit_fs() the exit hook.
 */
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
MODULE_DESCRIPTION("Fourth Extended Filesystem");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
module_init(ext4_init_fs)
module_exit(ext4_exit_fs)



























    1 
    2 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _DELAYED_CALL_H
#define _DELAYED_CALL_H

/*
 * Poor man's closures; I wish we could've done them sanely polymorphic,
 * but...
 */

/* A deferred invocation: @fn is meant to be called later with @arg. */
struct delayed_call {
        void (*fn)(void *);
        void *arg;
};

/* Define an empty delayed call called @name: no function, no argument. */
#define DEFINE_DELAYED_CALL(name) \
        struct delayed_call name = { .fn = NULL, .arg = NULL }

/*
 * Arm @call so that do_delayed_call() will invoke @fn(@arg).
 * (I really wish we had closures with sane typechecking...)
 */
static inline void set_delayed_call(struct delayed_call *call,
                void (*fn)(void *), void *arg)
{
        *call = (struct delayed_call){ .fn = fn, .arg = arg };
}

/* Run the stored function with the stored argument; no-op when unarmed. */
static inline void do_delayed_call(struct delayed_call *call)
{
        if (!call->fn)
                return;

        call->fn(call->arg);
}

/* Disarm @call by dropping the function pointer; @arg is left untouched. */
static inline void clear_delayed_call(struct delayed_call *call)
{
        call->fn = NULL;
}
#endif








































































































































   37 
   37 






   11 
   13 































































   25 
   25 






    3 







    2 

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
// SPDX-License-Identifier: GPL-2.0-or-later
/* bit search implementation
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * Copyright (C) 2008 IBM Corporation
 * 'find_last_bit' is written by Rusty Russell <rusty@rustcorp.com.au>
 * (Inspired by David Howell's find_next_bit implementation)
 *
 * Rewritten by Yury Norov <yury.norov@gmail.com> to decrease
 * size and improve performance, 2015.
 */

#include <linux/bitops.h>
#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/swab.h>

/*
 * Common helper for find_bit() function family
 * @FETCH: The expression that fetches and pre-processes each word of bitmap(s);
 *         it is evaluated with the local word index `idx` in scope
 * @MUNGE: The expression that post-processes a word containing found bit (may be empty)
 * @size: The bitmap size in bits
 *
 * Evaluates to the index of the first set bit of the fetched words, clamped
 * to @size, or to @size when every fetched word is zero.
 */
#define FIND_FIRST_BIT(FETCH, MUNGE, size)                                        \
({                                                                                \
        unsigned long idx, val, sz = (size);                                        \
                                                                                \
        for (idx = 0; idx * BITS_PER_LONG < sz; idx++) {                        \
                val = (FETCH);                                                        \
                if (val) {                                                        \
                        sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(val)), sz);        \
                        break;                                                        \
                }                                                                \
        }                                                                        \
                                                                                \
        sz;                                                                        \
})

/*
 * Common helper for find_next_bit() function family
 * @FETCH: The expression that fetches and pre-processes each word of bitmap(s);
 *         it is evaluated with the local word index `idx` in scope
 * @MUNGE: The expression that post-processes a word containing found bit (may be empty)
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 *
 * Evaluates to the index of the first set bit at or after @start, clamped to
 * @size, or to @size when there is none or when @start >= @size. Only the
 * first fetched word is masked (bits below @start are ignored); the mask is
 * passed through @MUNGE as well.
 */
#define FIND_NEXT_BIT(FETCH, MUNGE, size, start)                                \
({                                                                                \
        unsigned long mask, idx, tmp, sz = (size), __start = (start);                \
                                                                                \
        if (unlikely(__start >= sz))                                                \
                goto out;                                                        \
                                                                                \
        mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start));                                \
        idx = __start / BITS_PER_LONG;                                                \
                                                                                \
        for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) {                        \
                if ((idx + 1) * BITS_PER_LONG >= sz)                                \
                        goto out;                                                \
                idx++;                                                                \
        }                                                                        \
                                                                                \
        sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz);                        \
out:                                                                                \
        sz;                                                                        \
})

/*
 * Common helper for find_nth_bit() function family
 * @FETCH: The expression that fetches and pre-processes each word of bitmap(s);
 *         it is evaluated with the local word index `idx` in scope
 * @size: The bitmap size in bits
 * @num: Zero-based ordinal of the set bit to look for
 *
 * Whole words are scanned first, subtracting each word's population count
 * from the remaining ordinal until the word holding the wanted bit is found;
 * a trailing partial word is masked with BITMAP_LAST_WORD_MASK() before the
 * final fns() lookup. Evaluates to @size when the remaining ordinal can no
 * longer fit below @size.
 *
 * NOTE(review): the result is not clamped to @size, so callers presumably
 * have to treat any value >= @size as "not found" (depends on what fns()
 * returns when the word has too few bits - confirm). Also, when @size is a
 * multiple of BITS_PER_LONG and the loop runs to completion, `tmp` still
 * holds the last word fetched inside the loop (and is never assigned when
 * @size is 0) at the point fns() is evaluated - confirm this is intended.
 */
#define FIND_NTH_BIT(FETCH, size, num)                                                \
({                                                                                \
        unsigned long sz = (size), nr = (num), idx, w, tmp;                        \
                                                                                \
        for (idx = 0; (idx + 1) * BITS_PER_LONG <= sz; idx++) {                        \
                if (idx * BITS_PER_LONG + nr >= sz)                                \
                        goto out;                                                \
                                                                                \
                tmp = (FETCH);                                                        \
                w = hweight_long(tmp);                                                \
                if (w > nr)                                                        \
                        goto found;                                                \
                                                                                \
                nr -= w;                                                        \
        }                                                                        \
                                                                                \
        if (sz % BITS_PER_LONG)                                                        \
                tmp = (FETCH) & BITMAP_LAST_WORD_MASK(sz);                        \
found:                                                                                \
        sz = idx * BITS_PER_LONG + fns(tmp, nr);                                \
out:                                                                                \
        sz;                                                                        \
})

#ifndef find_first_bit
/*
 * Find the first set bit in a memory region.
 * @addr: The bitmap to search
 * @size: The bitmap size in bits
 *
 * Returns the bit number of the first set bit, or @size if there is none.
 */
unsigned long _find_first_bit(const unsigned long *addr, unsigned long size)
{
        return FIND_FIRST_BIT(addr[idx], /* nop */, size);
}
EXPORT_SYMBOL(_find_first_bit);
#endif

#ifndef find_first_and_bit
/*
 * Find the first bit that is set in both memory regions.
 * @addr1: The first bitmap
 * @addr2: The second bitmap
 * @size: The size of the bitmaps in bits
 *
 * Returns the bit number of the first bit set in (@addr1 & @addr2), or @size
 * if there is none.
 */
unsigned long _find_first_and_bit(const unsigned long *addr1,
                                  const unsigned long *addr2,
                                  unsigned long size)
{
        return FIND_FIRST_BIT(addr1[idx] & addr2[idx], /* nop */, size);
}
EXPORT_SYMBOL(_find_first_and_bit);
#endif

/*
 * Find the first bit that is set in all three memory regions.
 * @addr1: The first bitmap
 * @addr2: The second bitmap
 * @addr3: The third bitmap
 * @size: The size of the bitmaps in bits
 *
 * Returns the bit number of the first bit set in (@addr1 & @addr2 & @addr3),
 * or @size if there is none.
 */
unsigned long _find_first_and_and_bit(const unsigned long *addr1,
                                      const unsigned long *addr2,
                                      const unsigned long *addr3,
                                      unsigned long size)
{
        return FIND_FIRST_BIT(addr1[idx] & addr2[idx] & addr3[idx], /* nop */, size);
}
EXPORT_SYMBOL(_find_first_and_and_bit);

#ifndef find_first_zero_bit
/*
 * Find the first cleared bit in a memory region.
 * @addr: The bitmap to search
 * @size: The bitmap size in bits
 *
 * Returns the bit number of the first cleared bit, or @size if there is none.
 */
unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size)
{
        return FIND_FIRST_BIT(~addr[idx], /* nop */, size);
}
EXPORT_SYMBOL(_find_first_zero_bit);
#endif

#ifndef find_next_bit
/*
 * Find the next set bit in a memory region.
 * @addr: The bitmap to search
 * @nbits: The bitmap size in bits
 * @start: The bitnumber to start searching at
 *
 * Returns the bit number of the first set bit at or after @start, or @nbits
 * if there is none (including when @start >= @nbits).
 */
unsigned long _find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start)
{
        return FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start);
}
EXPORT_SYMBOL(_find_next_bit);
#endif

/*
 * Find the n'th set bit (counting from 0) in a memory region.
 * @addr: The bitmap to search
 * @size: The bitmap size in bits
 * @n: Zero-based ordinal of the set bit to find
 *
 * Returns the bit number of that bit. NOTE(review): when there is no such
 * bit the result is presumably >= @size rather than exactly @size - see
 * FIND_NTH_BIT() and confirm against fns().
 */
unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n)
{
        return FIND_NTH_BIT(addr[idx], size, n);
}
EXPORT_SYMBOL(__find_nth_bit);

/*
 * Find the n'th bit (counting from 0) that is set in both memory regions,
 * i.e. in (@addr1 & @addr2).
 * @size: The size of the bitmaps in bits
 * @n: Zero-based ordinal of the bit to find
 *
 * Return value as for __find_nth_bit().
 */
unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                 unsigned long size, unsigned long n)
{
        return FIND_NTH_BIT(addr1[idx] & addr2[idx], size, n);
}
EXPORT_SYMBOL(__find_nth_and_bit);

/*
 * Find the n'th bit (counting from 0) that is set in @addr1 and cleared in
 * @addr2, i.e. set in (@addr1 & ~@addr2).
 * @size: The size of the bitmaps in bits
 * @n: Zero-based ordinal of the bit to find
 *
 * Return value as for __find_nth_bit().
 */
unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                 unsigned long size, unsigned long n)
{
        return FIND_NTH_BIT(addr1[idx] & ~addr2[idx], size, n);
}
EXPORT_SYMBOL(__find_nth_andnot_bit);

/*
 * Find the n'th bit (counting from 0) that is set in both @addr1 and @addr2
 * and cleared in @addr3, i.e. set in (@addr1 & @addr2 & ~@addr3).
 * @size: The size of the bitmaps in bits
 * @n: Zero-based ordinal of the bit to find
 *
 * Return value as for __find_nth_bit().
 */
unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1,
                                        const unsigned long *addr2,
                                        const unsigned long *addr3,
                                        unsigned long size, unsigned long n)
{
        return FIND_NTH_BIT(addr1[idx] & addr2[idx] & ~addr3[idx], size, n);
}
EXPORT_SYMBOL(__find_nth_and_andnot_bit);

#ifndef find_next_and_bit
/*
 * Find the next bit that is set in both memory regions.
 * @nbits: The size of the bitmaps in bits
 * @start: The bitnumber to start searching at
 *
 * Returns the first bit at or after @start set in (@addr1 & @addr2), or
 * @nbits if there is none.
 */
unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start)
{
        return FIND_NEXT_BIT(addr1[idx] & addr2[idx], /* nop */, nbits, start);
}
EXPORT_SYMBOL(_find_next_and_bit);
#endif

#ifndef find_next_andnot_bit
/*
 * Find the next bit that is set in @addr1 and cleared in @addr2.
 * @nbits: The size of the bitmaps in bits
 * @start: The bitnumber to start searching at
 *
 * Returns the first bit at or after @start set in (@addr1 & ~@addr2), or
 * @nbits if there is none.
 */
unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start)
{
        return FIND_NEXT_BIT(addr1[idx] & ~addr2[idx], /* nop */, nbits, start);
}
EXPORT_SYMBOL(_find_next_andnot_bit);
#endif

#ifndef find_next_or_bit
/*
 * Find the next bit that is set in either memory region.
 * @nbits: The size of the bitmaps in bits
 * @start: The bitnumber to start searching at
 *
 * Returns the first bit at or after @start set in (@addr1 | @addr2), or
 * @nbits if there is none.
 */
unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start)
{
        return FIND_NEXT_BIT(addr1[idx] | addr2[idx], /* nop */, nbits, start);
}
EXPORT_SYMBOL(_find_next_or_bit);
#endif

#ifndef find_next_zero_bit
/*
 * Find the next cleared bit in a memory region.
 * @addr: The bitmap to search
 * @nbits: The bitmap size in bits
 * @start: The bitnumber to start searching at
 *
 * Returns the first cleared bit at or after @start, or @nbits if there is
 * none.
 */
unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
                                         unsigned long start)
{
        return FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start);
}
EXPORT_SYMBOL(_find_next_zero_bit);
#endif

#ifndef find_last_bit
/*
 * Find the last set bit in a memory region.
 * @addr: The bitmap to search
 * @size: The bitmap size in bits
 *
 * Walks the words from the last one down to word 0. Only the last word is
 * masked with BITMAP_LAST_WORD_MASK(); all earlier words are used in full.
 * Returns the bit number of the last set bit, or @size if there is none.
 */
unsigned long _find_last_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word, mask;

        if (!size)
                return size;

        mask = BITMAP_LAST_WORD_MASK(size);
        word = (size-1) / BITS_PER_LONG;

        for (;;) {
                unsigned long bits = addr[word] & mask;

                if (bits)
                        return word * BITS_PER_LONG + __fls(bits);
                if (!word)
                        break;

                word--;
                mask = ~0ul;
        }

        return size;
}
EXPORT_SYMBOL(_find_last_bit);
#endif

/*
 * Find the next 8-bit clump that has at least one bit set.
 * @clump: Where the value of the found clump is stored
 * @addr: The bitmap to search
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Returns the bit offset of the found clump (rounded down to a multiple
 * of 8) and stores its value in @clump. Returns @size, leaving @clump
 * untouched, when find_next_bit() finds no set bit.
 */
unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr,
                               unsigned long size, unsigned long offset)
{
        unsigned long bit = find_next_bit(addr, size, offset);

        if (bit == size)
                return size;

        bit = round_down(bit, 8);
        *clump = bitmap_get_value8(addr, bit);

        return bit;
}
EXPORT_SYMBOL(find_next_clump8);

#ifdef __BIG_ENDIAN

#ifndef find_first_zero_bit_le
/*
 * Find the first cleared bit in an LE memory region.
 * @addr: The bitmap to search
 * @size: The bitmap size in bits
 *
 * Built only under __BIG_ENDIAN: the word holding the hit is passed through
 * swab() before its bit position is computed. Returns the bit number of the
 * first cleared bit, or @size if there is none.
 */
unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size)
{
        return FIND_FIRST_BIT(~addr[idx], swab, size);
}
EXPORT_SYMBOL(_find_first_zero_bit_le);

#endif

#ifndef find_next_zero_bit_le
/*
 * Find the next cleared bit in an LE memory region.
 * @addr: The bitmap to search
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Built only under __BIG_ENDIAN: both the first-word mask and the word
 * holding the hit are passed through swab(). Returns the first cleared bit
 * at or after @offset, or @size if there is none.
 */
unsigned long _find_next_zero_bit_le(const unsigned long *addr,
                                        unsigned long size, unsigned long offset)
{
        return FIND_NEXT_BIT(~addr[idx], swab, size, offset);
}
EXPORT_SYMBOL(_find_next_zero_bit_le);
#endif

#ifndef find_next_bit_le
/*
 * Find the next set bit in an LE memory region.
 * @addr: The bitmap to search
 * @size: The bitmap size in bits
 * @offset: The bitnumber to start searching at
 *
 * Built only under __BIG_ENDIAN: both the first-word mask and the word
 * holding the hit are passed through swab(). Returns the first set bit at or
 * after @offset, or @size if there is none.
 */
unsigned long _find_next_bit_le(const unsigned long *addr,
                                unsigned long size, unsigned long offset)
{
        return FIND_NEXT_BIT(addr[idx], swab, size, offset);
}
EXPORT_SYMBOL(_find_next_bit_le);

#endif

#endif /* __BIG_ENDIAN */


































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_DAX_H
#define _LINUX_DAX_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/radix-tree.h>

/*
 * Cookie type returned by dax_lock_folio() / dax_lock_mapping_entry() and
 * handed back to the matching dax_unlock_*() call.
 */
typedef unsigned long dax_entry_t;

/*
 * Forward declarations so the prototypes in this header can refer to these
 * types by pointer without pulling in their defining headers.
 */
struct dax_device;
struct gendisk;
struct iomap_ops;
struct iomap_iter;
struct iomap;

/*
 * Access mode selector taken by dax_operations.direct_access and by
 * dax_direct_access().
 * NOTE(review): DAX_RECOVERY_WRITE presumably pairs with the recovery_write
 * operation used to overwrite poisoned ranges - confirm against the drivers.
 */
enum dax_access_mode {
        DAX_ACCESS,
        DAX_RECOVERY_WRITE,
};

/*
 * Operations table a driver passes to alloc_dax() for its dax_device.
 */
struct dax_operations {
        /*
         * direct_access: translate a device-relative
         * logical-page-offset into an absolute physical pfn. Return the
         * number of pages available for DAX at that pfn.
         *
         * The unnamed arguments have the same types, in the same order, as
         * the parameters of dax_direct_access() declared in this header.
         */
        long (*direct_access)(struct dax_device *, pgoff_t, long,
                        enum dax_access_mode, void **, pfn_t *);
        /*
         * Validate whether this device is usable as an fsdax backing
         * device.
         */
        bool (*dax_supported)(struct dax_device *, struct block_device *, int,
                        sector_t, sector_t);
        /* zero_page_range: required operation. Zero page range. */
        int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
        /*
         * recovery_write: recover a poisoned range by DAX device driver
         * capable of clearing poison.
         */
        size_t (*recovery_write)(struct dax_device *dax_dev, pgoff_t pgoff,
                        void *addr, size_t bytes, struct iov_iter *iter);
};

/*
 * Callbacks registered by the holder of a dax_device (see
 * fs_dax_get_by_bdev(), which accepts a pointer to this table).
 */
struct dax_holder_operations {
        /*
         * notify_failure - notify memory failure into inner holder device
         * @dax_dev: the dax device which contains the holder
         * @offset: offset on this dax device where memory failure occurs
         * @len: length of this memory failure event
         * @mf_flags: action flags for memory failure handler
         */
        int (*notify_failure)(struct dax_device *dax_dev, u64 offset,
                        u64 len, int mf_flags);
};

#if IS_ENABLED(CONFIG_DAX)
/*
 * Core dax_device API. These declarations are used when CONFIG_DAX is
 * enabled; the #else branch provides inline stubs with the same names.
 */
struct dax_device *alloc_dax(void *private, const struct dax_operations *ops);
void *dax_holder(struct dax_device *dax_dev);
void put_dax(struct dax_device *dax_dev);
void kill_dax(struct dax_device *dax_dev);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
bool dax_write_cache_enabled(struct dax_device *dax_dev);
bool dax_synchronous(struct dax_device *dax_dev);
void set_dax_nocache(struct dax_device *dax_dev);
void set_dax_nomc(struct dax_device *dax_dev);
void set_dax_synchronous(struct dax_device *dax_dev);
size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
                void *addr, size_t bytes, struct iov_iter *i);
/*
 * Decide whether the mapping described by @vma can be served by @dax_dev.
 *
 * Mappings without VM_SYNC are always accepted. A VM_SYNC mapping is accepted
 * only when the mapped file's inode is IS_DAX() and dax_synchronous() reports
 * true for @dax_dev.
 */
static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
                                             struct dax_device *dax_dev)
{
        /* Only VM_SYNC places requirements on the file and the device. */
        if (vma->vm_flags & VM_SYNC)
                return IS_DAX(file_inode(vma->vm_file)) &&
                        dax_synchronous(dax_dev);

        return true;
}
#else
/* CONFIG_DAX disabled: stub that always reports no holder (NULL). */
static inline void *dax_holder(struct dax_device *dax_dev)
{
        return NULL;
}
/*
 * CONFIG_DAX disabled: allocation always fails. The result is the error
 * pointer ERR_PTR(-EOPNOTSUPP), not NULL.
 */
static inline struct dax_device *alloc_dax(void *private,
                const struct dax_operations *ops)
{
        return ERR_PTR(-EOPNOTSUPP);
}
/* CONFIG_DAX disabled: nothing to release, so this stub does nothing. */
static inline void put_dax(struct dax_device *dax_dev)
{
}
/* CONFIG_DAX disabled: no device to shut down, so this stub does nothing. */
static inline void kill_dax(struct dax_device *dax_dev)
{
}
/* CONFIG_DAX disabled: the write-cache setting @wc is ignored. */
static inline void dax_write_cache(struct dax_device *dax_dev, bool wc)
{
}
/* CONFIG_DAX disabled: the write cache is always reported as disabled. */
static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
{
        return false;
}
/* CONFIG_DAX disabled: every device is reported as synchronous (true). */
static inline bool dax_synchronous(struct dax_device *dax_dev)
{
        return true;
}
/* CONFIG_DAX disabled: no device state to update, so this stub does nothing. */
static inline void set_dax_nocache(struct dax_device *dax_dev)
{
}
/* CONFIG_DAX disabled: no device state to update, so this stub does nothing. */
static inline void set_dax_nomc(struct dax_device *dax_dev)
{
}
/* CONFIG_DAX disabled: no device state to update, so this stub does nothing. */
static inline void set_dax_synchronous(struct dax_device *dax_dev)
{
}
/*
 * CONFIG_DAX disabled: a mapping is supported exactly when it does not
 * request VM_SYNC; @dax_dev is not consulted.
 */
static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
                                struct dax_device *dax_dev)
{
        return !(vma->vm_flags & VM_SYNC);
}
/* CONFIG_DAX disabled: no recovery is performed; always returns 0. */
static inline size_t dax_recovery_write(struct dax_device *dax_dev,
                pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
{
        return 0;
}
#endif

struct writeback_control;
#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
/*
 * Association between a dax_device and a gendisk / block device. Declared
 * only when both CONFIG_BLOCK and CONFIG_FS_DAX are defined; the #else branch
 * provides inline stubs.
 */
int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
void dax_remove_host(struct gendisk *disk);
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
                void *holder, const struct dax_holder_operations *ops);
void fs_put_dax(struct dax_device *dax_dev, void *holder);
#else
/* Stub (CONFIG_BLOCK && CONFIG_FS_DAX not both set): reports success (0). */
static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
        return 0;
}
/* Stub (CONFIG_BLOCK && CONFIG_FS_DAX not both set): does nothing. */
static inline void dax_remove_host(struct gendisk *disk)
{
}
/*
 * Stub (CONFIG_BLOCK && CONFIG_FS_DAX not both set): no dax_device is ever
 * found, so NULL is returned and @start_off is left untouched.
 */
static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
                u64 *start_off, void *holder,
                const struct dax_holder_operations *ops)
{
        return NULL;
}
/* Stub (CONFIG_BLOCK && CONFIG_FS_DAX not both set): does nothing. */
static inline void fs_put_dax(struct dax_device *dax_dev, void *holder)
{
}
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */

#if IS_ENABLED(CONFIG_FS_DAX)
/*
 * Filesystem-DAX helpers, declared when CONFIG_FS_DAX is enabled: mapping
 * writeback, busy-page lookup, and entry locking. The lock helpers return a
 * dax_entry_t cookie that the matching unlock helper takes back.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
                struct dax_device *dax_dev, struct writeback_control *wbc);

struct page *dax_layout_busy_page(struct address_space *mapping);
struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
dax_entry_t dax_lock_folio(struct folio *folio);
void dax_unlock_folio(struct folio *folio, dax_entry_t cookie);
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
                unsigned long index, struct page **page);
void dax_unlock_mapping_entry(struct address_space *mapping,
                unsigned long index, dax_entry_t cookie);
#else
/* CONFIG_FS_DAX disabled: no busy page is ever reported (NULL). */
static inline struct page *dax_layout_busy_page(struct address_space *mapping)
{
        return NULL;
}

static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t start, pgoff_t nr_pages)
{
        return NULL;
}

/* CONFIG_FS_DAX disabled: writeback is unsupported; returns -EOPNOTSUPP. */
static inline int dax_writeback_mapping_range(struct address_space *mapping,
                struct dax_device *dax_dev, struct writeback_control *wbc)
{
        return -EOPNOTSUPP;
}

/*
 * CONFIG_FS_DAX disabled: no entry is actually locked. The returned cookie is
 * ~0UL when the inode hosting @folio's mapping is IS_DAX(), and 0 otherwise.
 */
static inline dax_entry_t dax_lock_folio(struct folio *folio)
{
        return IS_DAX(folio->mapping->host) ? ~0UL : 0;
}

/* CONFIG_FS_DAX disabled: nothing was locked, so @cookie is ignored. */
static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
}

/*
 * CONFIG_FS_DAX disabled: no entry is locked. Always returns a 0 cookie and
 * leaves *@page untouched.
 */
static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
                unsigned long index, struct page **page)
{
        return 0;
}

/* CONFIG_FS_DAX disabled: nothing was locked, so @cookie is ignored. */
static inline void dax_unlock_mapping_entry(struct address_space *mapping,
                unsigned long index, dax_entry_t cookie)
{
}
#endif

/*
 * Range helpers operating on an inode through a caller-supplied iomap_ops
 * table. Declared unconditionally (outside any CONFIG_* guard in this
 * header). Where present, @did_zero is an output flag.
 */
int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
                const struct iomap_ops *ops);
int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
                const struct iomap_ops *ops);
int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
                const struct iomap_ops *ops);

#if IS_ENABLED(CONFIG_DAX)
/*
 * Read-side lock pair (CONFIG_DAX enabled): the int returned by
 * dax_read_lock() is the @id handed back to dax_read_unlock().
 */
int dax_read_lock(void);
void dax_read_unlock(int id);
#else
/* CONFIG_DAX disabled: no lock is taken; the returned id is always 0. */
static inline int dax_read_lock(void)
{
        return 0;
}

/* CONFIG_DAX disabled: nothing to release; @id is ignored. */
static inline void dax_read_unlock(int id)
{
}
#endif /* CONFIG_DAX */
/*
 * Device-level accessors and data-path helpers. These are declared outside
 * any CONFIG_* guard in this header, so no inline fallbacks are provided
 * here.
 */
bool dax_alive(struct dax_device *dax_dev);
void *dax_get_private(struct dax_device *dax_dev);
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
                enum dax_access_mode mode, void **kaddr, pfn_t *pfn);
size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i);
size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i);
int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
                        size_t nr_pages);
int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len,
                int mf_flags);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);

/* iomap-based file I/O, fault handling and mapping-entry maintenance. */
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops);
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
                    pfn_t *pfnp, int *errp, const struct iomap_ops *ops);
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
                unsigned int order, pfn_t pfn);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
                                      pgoff_t index);
/* Dedupe / remap support; @is_same and @len are in/out pointers. */
int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
                                  struct inode *dest, loff_t destoff,
                                  loff_t len, bool *is_same,
                                  const struct iomap_ops *ops);
int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                              struct file *file_out, loff_t pos_out,
                              loff_t *len, unsigned int remap_flags,
                              const struct iomap_ops *ops);
static inline bool dax_mapping(struct address_space *mapping)
{
        return mapping->host && IS_DAX(mapping->host);
}

/*
 * DAX has two personalities, memory and block, and hwpoison is reported
 * according to whichever one is visible at the time:
 *  - on block-style paths (e.g. block IO) poison is reported as -EIO;
 *  - on memory-style paths (e.g. page fault) poison is reported as
 *    -EHWPOISON, which leads to VM_FAULT_HWPOISON.
 *
 * This helper converts a memory-style error code into its block-style
 * equivalent: -EHWPOISON becomes -EIO, every other value is passed through.
 */
static inline int dax_mem2blk_err(int err)
{
        if (err == -EHWPOISON)
                return -EIO;

        return err;
}

#ifdef CONFIG_DEV_DAX_HMEM_DEVICES
/*
 * Register resource @r for node @target_nid. Only declared when
 * CONFIG_DEV_DAX_HMEM_DEVICES is defined; otherwise an empty inline stub is
 * used.
 */
void hmem_register_resource(int target_nid, struct resource *r);
#else
/* CONFIG_DEV_DAX_HMEM_DEVICES not defined: registration is a no-op. */
static inline void hmem_register_resource(int target_nid, struct resource *r)
{
}
#endif

/*
 * Callback type taken by walk_hmem_resources(). It receives the @dev given
 * to the walker plus a target node id and a resource.
 * NOTE(review): presumably invoked once per registered hmem resource, with a
 * non-zero return stopping the walk - confirm against the implementation.
 */
typedef int (*walk_hmem_fn)(struct device *dev, int target_nid,
                            const struct resource *res);
int walk_hmem_resources(struct device *dev, walk_hmem_fn fn);
#endif




















































































    1 


   14 
    2 

























    1 


    5 














    5 




    2 






























   19 














    6 


































    8 

















    4 

















   19 



   19 

   14 
















   12 



   12 
    5 
    1 



































































































































































































































































































   19 


   20 








   19 
   17 

























































































































   12 








   12 






   11 
    2 






   12 

   12 

   12 
















    8 




















    2 


    6 
    5 






    8 
















   14 










   14 









    3 


    3 





    2 



    8 
    3 



   14 





























































    5 










    5 








    6 


    5 



    2 
    2 









    6 
    5 



    6 

































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/file.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/slab.h>

/*
 * Mapping table from "enum tomoyo_path_acl_index" to "enum tomoyo_mac_index".
 *
 * Indexed by the TOMOYO_TYPE_* path operation. Note that read, write and
 * append all share TOMOYO_MAC_FILE_OPEN, so the mapping is many-to-one.
 */
static const u8 tomoyo_p2mac[TOMOYO_MAX_PATH_OPERATION] = {
        [TOMOYO_TYPE_EXECUTE]    = TOMOYO_MAC_FILE_EXECUTE,
        [TOMOYO_TYPE_READ]       = TOMOYO_MAC_FILE_OPEN,
        [TOMOYO_TYPE_WRITE]      = TOMOYO_MAC_FILE_OPEN,
        [TOMOYO_TYPE_APPEND]     = TOMOYO_MAC_FILE_OPEN,
        [TOMOYO_TYPE_UNLINK]     = TOMOYO_MAC_FILE_UNLINK,
        [TOMOYO_TYPE_GETATTR]    = TOMOYO_MAC_FILE_GETATTR,
        [TOMOYO_TYPE_RMDIR]      = TOMOYO_MAC_FILE_RMDIR,
        [TOMOYO_TYPE_TRUNCATE]   = TOMOYO_MAC_FILE_TRUNCATE,
        [TOMOYO_TYPE_SYMLINK]    = TOMOYO_MAC_FILE_SYMLINK,
        [TOMOYO_TYPE_CHROOT]     = TOMOYO_MAC_FILE_CHROOT,
        [TOMOYO_TYPE_UMOUNT]     = TOMOYO_MAC_FILE_UMOUNT,
};

/*
 * Mapping table from "enum tomoyo_mkdev_acl_index" to "enum tomoyo_mac_index".
 *
 * Indexed by the device-node creation operation (block or character); used
 * in this file to pick the keyword for mkdev audit logs.
 */
const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION] = {
        [TOMOYO_TYPE_MKBLOCK] = TOMOYO_MAC_FILE_MKBLOCK,
        [TOMOYO_TYPE_MKCHAR]  = TOMOYO_MAC_FILE_MKCHAR,
};

/*
 * Mapping table from "enum tomoyo_path2_acl_index" to "enum tomoyo_mac_index".
 *
 * Indexed by the two-pathname operation (link, rename, pivot_root); used in
 * this file to pick the keyword for path/path audit logs.
 */
const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION] = {
        [TOMOYO_TYPE_LINK]       = TOMOYO_MAC_FILE_LINK,
        [TOMOYO_TYPE_RENAME]     = TOMOYO_MAC_FILE_RENAME,
        [TOMOYO_TYPE_PIVOT_ROOT] = TOMOYO_MAC_FILE_PIVOT_ROOT,
};

/*
 * Mapping table from "enum tomoyo_path_number_acl_index" to
 * "enum tomoyo_mac_index".
 *
 * Indexed by the pathname-plus-number operation; used in this file to pick
 * the keyword for path/number audit logs.
 */
const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION] = {
        [TOMOYO_TYPE_CREATE] = TOMOYO_MAC_FILE_CREATE,
        [TOMOYO_TYPE_MKDIR]  = TOMOYO_MAC_FILE_MKDIR,
        [TOMOYO_TYPE_MKFIFO] = TOMOYO_MAC_FILE_MKFIFO,
        [TOMOYO_TYPE_MKSOCK] = TOMOYO_MAC_FILE_MKSOCK,
        [TOMOYO_TYPE_IOCTL]  = TOMOYO_MAC_FILE_IOCTL,
        [TOMOYO_TYPE_CHMOD]  = TOMOYO_MAC_FILE_CHMOD,
        [TOMOYO_TYPE_CHOWN]  = TOMOYO_MAC_FILE_CHOWN,
        [TOMOYO_TYPE_CHGRP]  = TOMOYO_MAC_FILE_CHGRP,
};

/**
 * tomoyo_put_name_union - Drop reference on "struct tomoyo_name_union".
 *
 * @ptr: Pointer to "struct tomoyo_name_union".
 *
 * Both members are handed to their put helpers unconditionally.
 * NOTE(review): tomoyo_compare_name_union() treats @ptr->group as optional,
 * so the put helpers presumably accept NULL - confirm in their definitions.
 *
 * Returns nothing.
 */
void tomoyo_put_name_union(struct tomoyo_name_union *ptr)
{
        tomoyo_put_group(ptr->group);
        tomoyo_put_name(ptr->filename);
}

/**
 * tomoyo_compare_name_union - Check whether a name matches "struct tomoyo_name_union" or not.
 *
 * @name: Pointer to "struct tomoyo_path_info".
 * @ptr:  Pointer to "struct tomoyo_name_union".
 *
 * When @ptr refers to a group, the group is searched and the result of
 * tomoyo_path_matches_group() is returned as-is. Otherwise @name is matched
 * against the single pattern @ptr->filename.
 *
 * Returns "struct tomoyo_path_info" if @name matches @ptr, NULL otherwise.
 */
const struct tomoyo_path_info *
tomoyo_compare_name_union(const struct tomoyo_path_info *name,
                          const struct tomoyo_name_union *ptr)
{
        if (!ptr->group) {
                /* Single pattern: the pattern itself is the match result. */
                if (!tomoyo_path_matches_pattern(name, ptr->filename))
                        return NULL;
                return ptr->filename;
        }

        return tomoyo_path_matches_group(name, ptr->group);
}

/**
 * tomoyo_put_number_union - Drop reference on "struct tomoyo_number_union".
 *
 * @ptr: Pointer to "struct tomoyo_number_union".
 *
 * Only the group member is released; the plain value range held in
 * @ptr->values is not touched here.
 *
 * Returns nothing.
 */
void tomoyo_put_number_union(struct tomoyo_number_union *ptr)
{
        tomoyo_put_group(ptr->group);
}

/**
 * tomoyo_compare_number_union - Check whether a value matches "struct tomoyo_number_union" or not.
 *
 * @value: Number to check.
 * @ptr:   Pointer to "struct tomoyo_number_union".
 *
 * When @ptr refers to a group, @value is looked up in the group as the
 * one-element range [@value, @value]. Otherwise it is tested against the
 * inclusive range @ptr->values[0] .. @ptr->values[1].
 *
 * Returns true if @value matches @ptr, false otherwise.
 */
bool tomoyo_compare_number_union(const unsigned long value,
                                 const struct tomoyo_number_union *ptr)
{
        if (!ptr->group)
                return ptr->values[0] <= value && value <= ptr->values[1];

        return tomoyo_number_matches_group(value, value, ptr->group);
}

/**
 * tomoyo_add_slash - Add trailing '/' if needed.
 *
 * @buf: Pointer to "struct tomoyo_path_info".
 *
 * Returns nothing.
 *
 * Nothing is done when @buf is already marked as a directory. Otherwise a
 * "/" is appended in place and the path info is recomputed.
 *
 * @buf must be generated by tomoyo_encode() because this function does not
 * allocate memory for adding '/'.
 */
static void tomoyo_add_slash(struct tomoyo_path_info *buf)
{
        if (!buf->is_dir) {
                /*
                 * Appending in place is OK because tomoyo_encode() reserves
                 * space for appending "/".
                 */
                strcat((char *) buf->name, "/");
                tomoyo_fill_path_info(buf);
        }
}

/**
 * tomoyo_get_realpath - Get realpath.
 *
 * @buf:  Pointer to "struct tomoyo_path_info".
 * @path: Pointer to "struct path".
 *
 * Stores the result of tomoyo_realpath_from_path() in @buf->name and, when
 * that is non-NULL, fills in the remaining fields of @buf.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_get_realpath(struct tomoyo_path_info *buf, const struct path *path)
{
        buf->name = tomoyo_realpath_from_path(path);
        if (!buf->name)
                return false;

        tomoyo_fill_path_info(buf);
        return true;
}

/**
 * tomoyo_audit_path_log - Audit path request log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * The operation keyword is taken from tomoyo_path_keyword[], indexed directly
 * by the path operation stored in @r.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_audit_path_log(struct tomoyo_request_info *r)
{
        const char *keyword = tomoyo_path_keyword[r->param.path.operation];
        const char *pathname = r->param.path.filename->name;

        return tomoyo_supervisor(r, "file %s %s\n", keyword, pathname);
}

/**
 * tomoyo_audit_path2_log - Audit path/path request log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_audit_path2_log(struct tomoyo_request_info *r)
{
        return tomoyo_supervisor(r, "file %s %s %s\n", tomoyo_mac_keywords
                                 [tomoyo_pp2mac[r->param.path2.operation]],
                                 r->param.path2.filename1->name,
                                 r->param.path2.filename2->name);
}

/**
 * tomoyo_audit_mkdev_log - Audit path/number/number/number request log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_audit_mkdev_log(struct tomoyo_request_info *r)
{
        return tomoyo_supervisor(r, "file %s %s 0%o %u %u\n",
                                 tomoyo_mac_keywords
                                 [tomoyo_pnnn2mac[r->param.mkdev.operation]],
                                 r->param.mkdev.filename->name,
                                 r->param.mkdev.mode, r->param.mkdev.major,
                                 r->param.mkdev.minor);
}

/**
 * tomoyo_audit_path_number_log - Audit path/number request log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * The numeric argument is rendered in a radix chosen by operation: octal for
 * create, mkdir, mkfifo, mksock and chmod; hexadecimal for ioctl; decimal
 * for everything else.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_audit_path_number_log(struct tomoyo_request_info *r)
{
        const u8 type = r->param.path_number.operation;
        char buffer[64];
        /* Decimal unless the operation selects another radix below. */
        u8 radix = TOMOYO_VALUE_TYPE_DECIMAL;

        switch (type) {
        case TOMOYO_TYPE_IOCTL:
                radix = TOMOYO_VALUE_TYPE_HEXADECIMAL;
                break;
        case TOMOYO_TYPE_CREATE:
        case TOMOYO_TYPE_MKDIR:
        case TOMOYO_TYPE_MKFIFO:
        case TOMOYO_TYPE_MKSOCK:
        case TOMOYO_TYPE_CHMOD:
                radix = TOMOYO_VALUE_TYPE_OCTAL;
                break;
        default:
                break;
        }

        tomoyo_print_ulong(buffer, sizeof(buffer),
                           r->param.path_number.number, radix);

        return tomoyo_supervisor(r, "file %s %s %s\n",
                                 tomoyo_mac_keywords[tomoyo_pn2mac[type]],
                                 r->param.path_number.filename->name,
                                 buffer);
}

/**
 * tomoyo_check_path_acl - Check permission for path operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if granted, false otherwise.
 *
 * To be able to use wildcard for domain transition, this function sets
 * matching entry on success. Since the caller holds tomoyo_read_lock(),
 * it is safe to set matching entry.
 */
static bool tomoyo_check_path_acl(struct tomoyo_request_info *r,
                                  const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_path_acl *acl = container_of(ptr, typeof(*acl),
                                                         head);

        if (acl->perm & (1 << r->param.path.operation)) {
                r->param.path.matched_path =
                        tomoyo_compare_name_union(r->param.path.filename,
                                                  &acl->name);
                return r->param.path.matched_path != NULL;
        }
        return false;
}

/**
 * tomoyo_check_path_number_acl - Check permission for path number operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_path_number_acl(struct tomoyo_request_info *r,
                                         const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_path_number_acl *acl =
                container_of(ptr, typeof(*acl), head);

        return (acl->perm & (1 << r->param.path_number.operation)) &&
                tomoyo_compare_number_union(r->param.path_number.number,
                                            &acl->number) &&
                tomoyo_compare_name_union(r->param.path_number.filename,
                                          &acl->name);
}

/**
 * tomoyo_check_path2_acl - Check permission for path path operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_path2_acl(struct tomoyo_request_info *r,
                                   const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_path2_acl *acl =
                container_of(ptr, typeof(*acl), head);

        return (acl->perm & (1 << r->param.path2.operation)) &&
                tomoyo_compare_name_union(r->param.path2.filename1, &acl->name1)
                && tomoyo_compare_name_union(r->param.path2.filename2,
                                             &acl->name2);
}

/**
 * tomoyo_check_mkdev_acl - Check permission for path number number number operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_mkdev_acl(struct tomoyo_request_info *r,
                                   const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_mkdev_acl *acl =
                container_of(ptr, typeof(*acl), head);

        return (acl->perm & (1 << r->param.mkdev.operation)) &&
                tomoyo_compare_number_union(r->param.mkdev.mode,
                                            &acl->mode) &&
                tomoyo_compare_number_union(r->param.mkdev.major,
                                            &acl->major) &&
                tomoyo_compare_number_union(r->param.mkdev.minor,
                                            &acl->minor) &&
                tomoyo_compare_name_union(r->param.mkdev.filename,
                                          &acl->name);
}

/**
 * tomoyo_same_path_acl - Check for duplicated "struct tomoyo_path_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_path_acl(const struct tomoyo_acl_info *a,
                                 const struct tomoyo_acl_info *b)
{
        const struct tomoyo_path_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_path_acl *p2 = container_of(b, typeof(*p2), head);

        return tomoyo_same_name_union(&p1->name, &p2->name);
}

/**
 * tomoyo_merge_path_acl - Merge duplicated "struct tomoyo_path_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_path_acl(struct tomoyo_acl_info *a,
                                  struct tomoyo_acl_info *b,
                                  const bool is_delete)
{
        u16 * const a_perm = &container_of(a, struct tomoyo_path_acl, head)
                ->perm;
        u16 perm = READ_ONCE(*a_perm);
        const u16 b_perm = container_of(b, struct tomoyo_path_acl, head)->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_update_path_acl - Update "struct tomoyo_path_acl" list.
 *
 * @perm:  Permission.
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_path_acl(const u16 perm,
                                  struct tomoyo_acl_param *param)
{
        struct tomoyo_path_acl e = {
                .head.type = TOMOYO_TYPE_PATH_ACL,
                .perm = perm
        };
        int error;

        if (!tomoyo_parse_name_union(param, &e.name))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_path_acl,
                                             tomoyo_merge_path_acl);
        tomoyo_put_name_union(&e.name);
        return error;
}

/**
 * tomoyo_same_mkdev_acl - Check for duplicated "struct tomoyo_mkdev_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_mkdev_acl(const struct tomoyo_acl_info *a,
                                         const struct tomoyo_acl_info *b)
{
        const struct tomoyo_mkdev_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_mkdev_acl *p2 = container_of(b, typeof(*p2), head);

        return tomoyo_same_name_union(&p1->name, &p2->name) &&
                tomoyo_same_number_union(&p1->mode, &p2->mode) &&
                tomoyo_same_number_union(&p1->major, &p2->major) &&
                tomoyo_same_number_union(&p1->minor, &p2->minor);
}

/**
 * tomoyo_merge_mkdev_acl - Merge duplicated "struct tomoyo_mkdev_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_mkdev_acl(struct tomoyo_acl_info *a,
                                   struct tomoyo_acl_info *b,
                                   const bool is_delete)
{
        u8 *const a_perm = &container_of(a, struct tomoyo_mkdev_acl,
                                         head)->perm;
        u8 perm = READ_ONCE(*a_perm);
        const u8 b_perm = container_of(b, struct tomoyo_mkdev_acl, head)
                ->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_update_mkdev_acl - Update "struct tomoyo_mkdev_acl" list.
 *
 * @perm:  Permission.
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_mkdev_acl(const u8 perm,
                                   struct tomoyo_acl_param *param)
{
        struct tomoyo_mkdev_acl e = {
                .head.type = TOMOYO_TYPE_MKDEV_ACL,
                .perm = perm
        };
        int error;

        if (!tomoyo_parse_name_union(param, &e.name) ||
            !tomoyo_parse_number_union(param, &e.mode) ||
            !tomoyo_parse_number_union(param, &e.major) ||
            !tomoyo_parse_number_union(param, &e.minor))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_mkdev_acl,
                                             tomoyo_merge_mkdev_acl);
        tomoyo_put_name_union(&e.name);
        tomoyo_put_number_union(&e.mode);
        tomoyo_put_number_union(&e.major);
        tomoyo_put_number_union(&e.minor);
        return error;
}

/**
 * tomoyo_same_path2_acl - Check for duplicated "struct tomoyo_path2_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_path2_acl(const struct tomoyo_acl_info *a,
                                  const struct tomoyo_acl_info *b)
{
        const struct tomoyo_path2_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_path2_acl *p2 = container_of(b, typeof(*p2), head);

        return tomoyo_same_name_union(&p1->name1, &p2->name1) &&
                tomoyo_same_name_union(&p1->name2, &p2->name2);
}

/**
 * tomoyo_merge_path2_acl - Merge duplicated "struct tomoyo_path2_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_path2_acl(struct tomoyo_acl_info *a,
                                   struct tomoyo_acl_info *b,
                                   const bool is_delete)
{
        u8 * const a_perm = &container_of(a, struct tomoyo_path2_acl, head)
                ->perm;
        u8 perm = READ_ONCE(*a_perm);
        const u8 b_perm = container_of(b, struct tomoyo_path2_acl, head)->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_update_path2_acl - Update "struct tomoyo_path2_acl" list.
 *
 * @perm:  Permission.
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_path2_acl(const u8 perm,
                                   struct tomoyo_acl_param *param)
{
        struct tomoyo_path2_acl e = {
                .head.type = TOMOYO_TYPE_PATH2_ACL,
                .perm = perm
        };
        int error;

        if (!tomoyo_parse_name_union(param, &e.name1) ||
            !tomoyo_parse_name_union(param, &e.name2))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_path2_acl,
                                             tomoyo_merge_path2_acl);
        tomoyo_put_name_union(&e.name1);
        tomoyo_put_name_union(&e.name2);
        return error;
}

/**
 * tomoyo_path_permission - Check permission for single path operation.
 *
 * @r:         Pointer to "struct tomoyo_request_info".
 * @operation: Type of operation; also used as the index into tomoyo_p2mac[].
 * @filename:  Filename to check.
 *
 * Updates @r->type and @r->mode for @operation. When the resulting mode is
 * TOMOYO_CONFIG_DISABLED, 0 is returned without checking any ACL. Otherwise
 * the ACL check and the audit step are repeated for as long as the audit step
 * returns TOMOYO_RETRY_REQUEST.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_path_permission(struct tomoyo_request_info *r, u8 operation,
                                  const struct tomoyo_path_info *filename)
{
        r->type = tomoyo_p2mac[operation];
        r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type);
        if (r->mode == TOMOYO_CONFIG_DISABLED)
                return 0;

        /* Describe the request as a single-path ACL lookup. */
        r->param_type = TOMOYO_TYPE_PATH_ACL;
        r->param.path.operation = operation;
        r->param.path.filename = filename;

        for (;;) {
                int result;

                tomoyo_check_acl(r, tomoyo_check_path_acl);
                result = tomoyo_audit_path_log(r);
                if (result != TOMOYO_RETRY_REQUEST)
                        return result;
        }
}

/**
 * tomoyo_execute_permission - Check permission for execute operation.
 *
 * @r:         Pointer to "struct tomoyo_request_info".
 * @filename:  Filename to check.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_execute_permission(struct tomoyo_request_info *r,
                              const struct tomoyo_path_info *filename)
{
        /*
         * Unlike other permission checks, this check is done regardless of
         * profile mode settings in order to check for domain transition
         * preference.
         */
        r->type = TOMOYO_MAC_FILE_EXECUTE;
        r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type);
        r->param_type = TOMOYO_TYPE_PATH_ACL;
        r->param.path.filename = filename;
        r->param.path.operation = TOMOYO_TYPE_EXECUTE;
        tomoyo_check_acl(r, tomoyo_check_path_acl);
        r->ee->transition = r->matched_acl && r->matched_acl->cond ?
                r->matched_acl->cond->transit : NULL;
        if (r->mode != TOMOYO_CONFIG_DISABLED)
                return tomoyo_audit_path_log(r);
        return 0;
}

/**
 * tomoyo_same_path_number_acl - Check for duplicated "struct tomoyo_path_number_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_path_number_acl(const struct tomoyo_acl_info *a,
                                        const struct tomoyo_acl_info *b)
{
        const struct tomoyo_path_number_acl *p1 = container_of(a, typeof(*p1),
                                                               head);
        const struct tomoyo_path_number_acl *p2 = container_of(b, typeof(*p2),
                                                               head);

        return tomoyo_same_name_union(&p1->name, &p2->name) &&
                tomoyo_same_number_union(&p1->number, &p2->number);
}

/**
 * tomoyo_merge_path_number_acl - Merge duplicated "struct tomoyo_path_number_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_path_number_acl(struct tomoyo_acl_info *a,
                                         struct tomoyo_acl_info *b,
                                         const bool is_delete)
{
        u8 * const a_perm = &container_of(a, struct tomoyo_path_number_acl,
                                          head)->perm;
        u8 perm = READ_ONCE(*a_perm);
        const u8 b_perm = container_of(b, struct tomoyo_path_number_acl, head)
                ->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_update_path_number_acl - Update ioctl/chmod/chown/chgrp ACL.
 *
 * @perm:  Permission.
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_update_path_number_acl(const u8 perm,
                                         struct tomoyo_acl_param *param)
{
        struct tomoyo_path_number_acl e = {
                .head.type = TOMOYO_TYPE_PATH_NUMBER_ACL,
                .perm = perm
        };
        int error;

        if (!tomoyo_parse_name_union(param, &e.name) ||
            !tomoyo_parse_number_union(param, &e.number))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_path_number_acl,
                                             tomoyo_merge_path_number_acl);
        tomoyo_put_name_union(&e.name);
        tomoyo_put_number_union(&e.number);
        return error;
}

/**
 * tomoyo_path_number_perm - Check permission for "create", "mkdir", "mkfifo", "mksock", "ioctl", "chmod", "chown", "chgrp".
 *
 * @type:   Type of operation.
 * @path:   Pointer to "struct path".
 * @number: Number.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_path_number_perm(const u8 type, const struct path *path,
                            unsigned long number)
{
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path->mnt, .dentry = path->dentry },
        };
        int error = -ENOMEM;
        struct tomoyo_path_info buf;
        int idx;

        if (tomoyo_init_request_info(&r, NULL, tomoyo_pn2mac[type])
            == TOMOYO_CONFIG_DISABLED)
                return 0;
        idx = tomoyo_read_lock();
        if (!tomoyo_get_realpath(&buf, path))
                goto out;
        r.obj = &obj;
        if (type == TOMOYO_TYPE_MKDIR)
                tomoyo_add_slash(&buf);
        r.param_type = TOMOYO_TYPE_PATH_NUMBER_ACL;
        r.param.path_number.operation = type;
        r.param.path_number.filename = &buf;
        r.param.path_number.number = number;
        do {
                tomoyo_check_acl(&r, tomoyo_check_path_number_acl);
                error = tomoyo_audit_path_number_log(&r);
        } while (error == TOMOYO_RETRY_REQUEST);
        kfree(buf.name);
 out:
        tomoyo_read_unlock(idx);
        if (r.mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        return error;
}

/**
 * tomoyo_check_open_permission - Check permission for "read" and "write".
 *
 * @domain: Pointer to "struct tomoyo_domain_info".
 * @path:   Pointer to "struct path".
 * @flag:   Flags for open().
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_check_open_permission(struct tomoyo_domain_info *domain,
                                 const struct path *path, const int flag)
{
        const u8 acc_mode = ACC_MODE(flag);
        int error = 0;
        struct tomoyo_path_info buf;
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path->mnt, .dentry = path->dentry },
        };
        int idx;

        buf.name = NULL;
        r.mode = TOMOYO_CONFIG_DISABLED;
        idx = tomoyo_read_lock();
        if (acc_mode &&
            tomoyo_init_request_info(&r, domain, TOMOYO_MAC_FILE_OPEN)
            != TOMOYO_CONFIG_DISABLED) {
                if (!tomoyo_get_realpath(&buf, path)) {
                        error = -ENOMEM;
                        goto out;
                }
                r.obj = &obj;
                if (acc_mode & MAY_READ)
                        error = tomoyo_path_permission(&r, TOMOYO_TYPE_READ,
                                                       &buf);
                if (!error && (acc_mode & MAY_WRITE))
                        error = tomoyo_path_permission(&r, (flag & O_APPEND) ?
                                                       TOMOYO_TYPE_APPEND :
                                                       TOMOYO_TYPE_WRITE,
                                                       &buf);
        }
 out:
        kfree(buf.name);
        tomoyo_read_unlock(idx);
        if (r.mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        return error;
}

/**
 * tomoyo_path_perm - Check permission for "unlink", "rmdir", "truncate", "symlink", "append", "chroot" and "unmount".
 *
 * @operation: Type of operation; also used as the index into tomoyo_p2mac[].
 * @path:      Pointer to "struct path".
 * @target:    Symlink's target if @operation is TOMOYO_TYPE_SYMLINK,
 *             NULL otherwise.
 *
 * Returns 0 right away when the request mode is TOMOYO_CONFIG_DISABLED.
 * Whether the mode is enforcing is sampled right after the request is
 * initialised; in any other mode the result is forced to 0.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_path_perm(const u8 operation, const struct path *path, const char *target)
{
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path->mnt, .dentry = path->dentry },
        };
        struct tomoyo_request_info r;
        struct tomoyo_path_info buf;
        struct tomoyo_path_info symlink_target;
        bool enforcing;
        int error;
        int idx;

        if (tomoyo_init_request_info(&r, NULL, tomoyo_p2mac[operation])
            == TOMOYO_CONFIG_DISABLED)
                return 0;
        enforcing = (r.mode == TOMOYO_CONFIG_ENFORCING);

        /* Failing to obtain or encode a path below leaves -ENOMEM pending. */
        error = -ENOMEM;
        buf.name = NULL;
        idx = tomoyo_read_lock();
        if (!tomoyo_get_realpath(&buf, path))
                goto out;
        r.obj = &obj;

        if (operation == TOMOYO_TYPE_RMDIR ||
            operation == TOMOYO_TYPE_CHROOT) {
                tomoyo_add_slash(&buf);
        } else if (operation == TOMOYO_TYPE_SYMLINK) {
                symlink_target.name = tomoyo_encode(target);
                if (!symlink_target.name)
                        goto out;
                tomoyo_fill_path_info(&symlink_target);
                obj.symlink_target = &symlink_target;
        }

        error = tomoyo_path_permission(&r, operation, &buf);
        /* symlink_target.name is only assigned in the symlink case. */
        if (operation == TOMOYO_TYPE_SYMLINK)
                kfree(symlink_target.name);
 out:
        kfree(buf.name);
        tomoyo_read_unlock(idx);
        return enforcing ? error : 0;
}

/**
 * tomoyo_mkdev_perm - Check permission for "mkblock" and "mkchar".
 *
 * @operation: Type of operation. (TOMOYO_TYPE_MKCHAR or TOMOYO_TYPE_MKBLOCK)
 * @path:      Pointer to "struct path".
 * @mode:      Create mode.
 * @dev:       Device number.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_mkdev_perm(const u8 operation, const struct path *path,
                      const unsigned int mode, unsigned int dev)
{
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path->mnt, .dentry = path->dentry },
        };
        int error = -ENOMEM;
        struct tomoyo_path_info buf;
        int idx;

        if (tomoyo_init_request_info(&r, NULL, tomoyo_pnnn2mac[operation])
            == TOMOYO_CONFIG_DISABLED)
                return 0;
        idx = tomoyo_read_lock();
        error = -ENOMEM;
        if (tomoyo_get_realpath(&buf, path)) {
                r.obj = &obj;
                dev = new_decode_dev(dev);
                r.param_type = TOMOYO_TYPE_MKDEV_ACL;
                r.param.mkdev.filename = &buf;
                r.param.mkdev.operation = operation;
                r.param.mkdev.mode = mode;
                r.param.mkdev.major = MAJOR(dev);
                r.param.mkdev.minor = MINOR(dev);
                tomoyo_check_acl(&r, tomoyo_check_mkdev_acl);
                error = tomoyo_audit_mkdev_log(&r);
                kfree(buf.name);
        }
        tomoyo_read_unlock(idx);
        if (r.mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        return error;
}

/**
 * tomoyo_path2_perm - Check permission for "rename", "link" and "pivot_root".
 *
 * @operation: Type of operation.
 * @path1:      Pointer to "struct path".
 * @path2:      Pointer to "struct path".
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_path2_perm(const u8 operation, const struct path *path1,
                      const struct path *path2)
{
        int error = -ENOMEM;
        struct tomoyo_path_info buf1;
        struct tomoyo_path_info buf2;
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path1->mnt, .dentry = path1->dentry },
                .path2 = { .mnt = path2->mnt, .dentry = path2->dentry }
        };
        int idx;

        if (tomoyo_init_request_info(&r, NULL, tomoyo_pp2mac[operation])
            == TOMOYO_CONFIG_DISABLED)
                return 0;
        buf1.name = NULL;
        buf2.name = NULL;
        idx = tomoyo_read_lock();
        if (!tomoyo_get_realpath(&buf1, path1) ||
            !tomoyo_get_realpath(&buf2, path2))
                goto out;
        switch (operation) {
        case TOMOYO_TYPE_RENAME:
        case TOMOYO_TYPE_LINK:
                if (!d_is_dir(path1->dentry))
                        break;
                fallthrough;
        case TOMOYO_TYPE_PIVOT_ROOT:
                tomoyo_add_slash(&buf1);
                tomoyo_add_slash(&buf2);
                break;
        }
        r.obj = &obj;
        r.param_type = TOMOYO_TYPE_PATH2_ACL;
        r.param.path2.operation = operation;
        r.param.path2.filename1 = &buf1;
        r.param.path2.filename2 = &buf2;
        do {
                tomoyo_check_acl(&r, tomoyo_check_path2_acl);
                error = tomoyo_audit_path2_log(&r);
        } while (error == TOMOYO_RETRY_REQUEST);
 out:
        kfree(buf1.name);
        kfree(buf2.name);
        tomoyo_read_unlock(idx);
        if (r.mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        return error;
}

/**
 * tomoyo_same_mount_acl - Check for duplicated "struct tomoyo_mount_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_mount_acl(const struct tomoyo_acl_info *a,
                                  const struct tomoyo_acl_info *b)
{
        const struct tomoyo_mount_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_mount_acl *p2 = container_of(b, typeof(*p2), head);

        return tomoyo_same_name_union(&p1->dev_name, &p2->dev_name) &&
                tomoyo_same_name_union(&p1->dir_name, &p2->dir_name) &&
                tomoyo_same_name_union(&p1->fs_type, &p2->fs_type) &&
                tomoyo_same_number_union(&p1->flags, &p2->flags);
}

/**
 * tomoyo_update_mount_acl - Write "struct tomoyo_mount_acl" list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_mount_acl(struct tomoyo_acl_param *param)
{
        struct tomoyo_mount_acl e = { .head.type = TOMOYO_TYPE_MOUNT_ACL };
        int error;

        if (!tomoyo_parse_name_union(param, &e.dev_name) ||
            !tomoyo_parse_name_union(param, &e.dir_name) ||
            !tomoyo_parse_name_union(param, &e.fs_type) ||
            !tomoyo_parse_number_union(param, &e.flags))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_mount_acl, NULL);
        tomoyo_put_name_union(&e.dev_name);
        tomoyo_put_name_union(&e.dir_name);
        tomoyo_put_name_union(&e.fs_type);
        tomoyo_put_number_union(&e.flags);
        return error;
}

/**
 * tomoyo_write_file - Update file related list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Reads one token from @param and matches it against the keyword tables in
 * this order: single-path, two-path, path-plus-number, mkdev, and finally the
 * mount keyword. Within a table every matching keyword contributes one bit
 * (1 << index) to the permission mask. The first table that yields a non-zero
 * mask decides which update function is called. -EINVAL is returned when
 * nothing matches.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_write_file(struct tomoyo_acl_param *param)
{
        const char *keyword = tomoyo_read_token(param);
        u16 mask = 0;
        u8 op;

        /* Single-path operations. */
        for (op = 0; op < TOMOYO_MAX_PATH_OPERATION; op++) {
                if (tomoyo_permstr(keyword, tomoyo_path_keyword[op]))
                        mask |= 1 << op;
        }
        if (mask != 0)
                return tomoyo_update_path_acl(mask, param);

        /* Two-path operations. */
        for (op = 0; op < TOMOYO_MAX_PATH2_OPERATION; op++) {
                if (tomoyo_permstr(keyword,
                                   tomoyo_mac_keywords[tomoyo_pp2mac[op]]))
                        mask |= 1 << op;
        }
        if (mask != 0)
                return tomoyo_update_path2_acl(mask, param);

        /* Path-plus-number operations. */
        for (op = 0; op < TOMOYO_MAX_PATH_NUMBER_OPERATION; op++) {
                if (tomoyo_permstr(keyword,
                                   tomoyo_mac_keywords[tomoyo_pn2mac[op]]))
                        mask |= 1 << op;
        }
        if (mask != 0)
                return tomoyo_update_path_number_acl(mask, param);

        /* Device node creation. */
        for (op = 0; op < TOMOYO_MAX_MKDEV_OPERATION; op++) {
                if (tomoyo_permstr(keyword,
                                   tomoyo_mac_keywords[tomoyo_pnnn2mac[op]]))
                        mask |= 1 << op;
        }
        if (mask != 0)
                return tomoyo_update_mkdev_acl(mask, param);

        /* Mount has a single keyword and no permission mask. */
        if (!tomoyo_permstr(keyword,
                            tomoyo_mac_keywords[TOMOYO_MAC_FILE_MOUNT]))
                return -EINVAL;
        return tomoyo_update_mount_acl(param);
}














































    2 
























   12 














































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * Copyright (c) 2021, Google LLC.
 * Pasha Tatashin <pasha.tatashin@soleen.com>
 */
#ifndef __LINUX_PAGE_TABLE_CHECK_H
#define __LINUX_PAGE_TABLE_CHECK_H

#ifdef CONFIG_PAGE_TABLE_CHECK
#include <linux/jump_label.h>

/*
 * Static key tested by every inline wrapper in this header: while
 * static_branch_likely() on it is true, the wrappers return without calling
 * the __page_table_check_*() functions declared below.
 */
extern struct static_key_true page_table_check_disabled;
extern struct page_ext_operations page_table_check_ops;

/*
 * Out-of-line entry points. They are only declared here; this header calls
 * them through the page_table_check_*() inline wrappers of the same name.
 */
void __page_table_check_zero(struct page *page, unsigned int order);
void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte);
void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd);
void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud);
void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
                unsigned int nr);
void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd);
void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud);
void __page_table_check_pte_clear_range(struct mm_struct *mm,
                                        unsigned long addr,
                                        pmd_t pmd);

/*
 * Forward @page and @order to __page_table_check_zero(), unless
 * static_branch_likely(&page_table_check_disabled) is true, in which case
 * nothing is done.
 */
static inline void page_table_check_alloc(struct page *page, unsigned int order)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_zero(page, order);
}

/*
 * Forward @page and @order to __page_table_check_zero(), unless
 * static_branch_likely(&page_table_check_disabled) is true, in which case
 * nothing is done. This is the same callee that page_table_check_alloc() uses.
 */
static inline void page_table_check_free(struct page *page, unsigned int order)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_zero(page, order);
}

/*
 * Forward @mm and @pte to __page_table_check_pte_clear(), unless
 * static_branch_likely(&page_table_check_disabled) is true, in which case
 * nothing is done.
 */
static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pte_clear(mm, pte);
}

/*
 * Forward @mm and @pmd to __page_table_check_pmd_clear(), unless
 * static_branch_likely(&page_table_check_disabled) is true, in which case
 * nothing is done.
 */
static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pmd_clear(mm, pmd);
}

/*
 * Forward @mm and @pud to __page_table_check_pud_clear(), unless
 * static_branch_likely(&page_table_check_disabled) is true, in which case
 * nothing is done.
 */
static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pud_clear(mm, pud);
}

/*
 * Forward @mm, @ptep, @pte and @nr to __page_table_check_ptes_set(), unless
 * static_branch_likely(&page_table_check_disabled) is true, in which case
 * nothing is done.
 */
static inline void page_table_check_ptes_set(struct mm_struct *mm,
                pte_t *ptep, pte_t pte, unsigned int nr)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_ptes_set(mm, ptep, pte, nr);
}

/*
 * Forward @mm, @pmdp and @pmd to __page_table_check_pmd_set(), unless
 * static_branch_likely(&page_table_check_disabled) is true, in which case
 * nothing is done.
 */
static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp,
                                            pmd_t pmd)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pmd_set(mm, pmdp, pmd);
}

/*
 * Forward @mm, @pudp and @pud to __page_table_check_pud_set(), unless
 * static_branch_likely(&page_table_check_disabled) is true, in which case
 * nothing is done.
 */
static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp,
                                            pud_t pud)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pud_set(mm, pudp, pud);
}

/*
 * Forward @mm, @addr and @pmd to __page_table_check_pte_clear_range(), unless
 * static_branch_likely(&page_table_check_disabled) is true, in which case
 * nothing is done.
 */
static inline void page_table_check_pte_clear_range(struct mm_struct *mm,
                                                    unsigned long addr,
                                                    pmd_t pmd)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pte_clear_range(mm, addr, pmd);
}

#else

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_alloc(struct page *page, unsigned int order)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_free(struct page *page, unsigned int order)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_ptes_set(struct mm_struct *mm,
                pte_t *ptep, pte_t pte, unsigned int nr)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp,
                                            pmd_t pmd)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp,
                                            pud_t pud)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not defined. */
static inline void page_table_check_pte_clear_range(struct mm_struct *mm,
                                                    unsigned long addr,
                                                    pmd_t pmd)
{
}

#endif /* CONFIG_PAGE_TABLE_CHECK */
#endif /* __LINUX_PAGE_TABLE_CHECK_H */































































































































































































































































































































































































































































































































































































































































































































































   53 





























   52 


   52 
   53 
   55 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *        Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *        Jay Schulist <jschlst@samba.org>
 *        Alexei Starovoitov <ast@plumgrid.com>
 *        Daniel Borkmann <dborkman@redhat.com>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <uapi/linux/btf.h>
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/objtool.h>
#include <linux/overflow.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
#include <linux/nodemask.h>
#include <linux/nospec.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <linux/execmem.h>

#include <asm/barrier.h>
#include <asm/unaligned.h>

/* Registers: shorthand for slots of a `regs` array, indexed by BPF register
 * number. Code using these macros must have `regs` in scope.
 */
#define BPF_R0        regs[BPF_REG_0]
#define BPF_R1        regs[BPF_REG_1]
#define BPF_R2        regs[BPF_REG_2]
#define BPF_R3        regs[BPF_REG_3]
#define BPF_R4        regs[BPF_REG_4]
#define BPF_R5        regs[BPF_REG_5]
#define BPF_R6        regs[BPF_REG_6]
#define BPF_R7        regs[BPF_REG_7]
#define BPF_R8        regs[BPF_REG_8]
#define BPF_R9        regs[BPF_REG_9]
#define BPF_R10        regs[BPF_REG_10]

/* Named registers and instruction fields. DST, SRC, OFF and IMM additionally
 * expect a pointer named `insn` (with dst_reg, src_reg, off and imm members)
 * to be in scope at the point of use.
 */
#define DST        regs[insn->dst_reg]
#define SRC        regs[insn->src_reg]
#define FP        regs[BPF_REG_FP]
#define AX        regs[BPF_REG_AX]
#define ARG1        regs[BPF_REG_ARG1]
#define CTX        regs[BPF_REG_CTX]
#define OFF        insn->off
#define IMM        insn->imm

/* Global BPF memory allocator object (not static: visible to other files). */
struct bpf_mem_alloc bpf_global_ma;
/* NOTE(review): presumably set to true once bpf_global_ma has been
 * initialised; the initialisation site is not in this part of the file.
 */
bool bpf_global_ma_set;

/* No hurry in this branch
 *
 * Exported for the bpf jit load helper.
 *
 * Translate a special offset @k into a pointer inside @skb's linear data:
 *   - k >= SKF_NET_OFF: relative to the network header,
 *   - SKF_LL_OFF <= k < SKF_NET_OFF: relative to the MAC header.
 *
 * Returns the pointer when the @size bytes starting there lie between
 * skb->head and the skb tail pointer; NULL otherwise, and also NULL when
 * the MAC header is needed but was never set.
 */
void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
{
        u8 *addr = NULL;

        if (k >= SKF_NET_OFF) {
                addr = skb_network_header(skb) + k - SKF_NET_OFF;
        } else if (k >= SKF_LL_OFF) {
                if (unlikely(!skb_mac_header_was_set(skb)))
                        return NULL;
                addr = skb_mac_header(skb) + k - SKF_LL_OFF;
        }

        /* Reject anything that is not fully contained in the linear area. */
        if (addr < skb->head || addr + size > skb_tail_pointer(skb))
                return NULL;

        return addr;
}

/* Tell BPF programs that include vmlinux.h the kernel's PAGE_SIZE by
 * exposing it as an enum constant.
 */
enum page_size_enum {
        __PAGE_SIZE = PAGE_SIZE
};

/* Allocate a zeroed struct bpf_prog of at least @size bytes (rounded up to
 * a multiple of __PAGE_SIZE) together with its zeroed aux structure and the
 * per-CPU 'active' counter. prog->stats is NOT allocated here.
 *
 * @gfp_extra_flags is OR-ed into the GFP_KERNEL based flags used for every
 * allocation.
 *
 * Returns the new program, or NULL if any allocation failed (everything
 * allocated so far is released again).
 */
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
        gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        struct bpf_prog_aux *prog_aux;
        struct bpf_prog *prog;

        size = round_up(size, __PAGE_SIZE);
        prog = __vmalloc(size, gfp_flags);
        if (!prog)
                return NULL;

        prog_aux = kzalloc(sizeof(*prog_aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
        if (!prog_aux)
                goto err_free_prog;

        prog->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
        if (!prog->active)
                goto err_free_aux;

        prog->pages = size / PAGE_SIZE;
        prog->aux = prog_aux;
        prog_aux->prog = prog;
        prog->jit_requested = ebpf_jit_enabled();
        prog->blinding_requested = bpf_jit_blinding_enabled(prog);
#ifdef CONFIG_CGROUP_BPF
        prog_aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
#endif

        /* Empty list nodes: the ksyms are not registered yet. */
        INIT_LIST_HEAD_RCU(&prog_aux->ksym.lnode);
#ifdef CONFIG_FINEIBT
        INIT_LIST_HEAD_RCU(&prog_aux->ksym_prefix.lnode);
#endif
        mutex_init(&prog_aux->used_maps_mutex);
        mutex_init(&prog_aux->dst_mutex);

        return prog;

err_free_aux:
        kfree(prog_aux);
err_free_prog:
        vfree(prog);
        return NULL;
}

/* Allocate a BPF program via bpf_prog_alloc_no_stats() and additionally
 * attach zeroed per-CPU statistics, initialising the stats sync object for
 * every possible CPU.
 *
 * Returns the new program, or NULL on allocation failure.
 */
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
        gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        struct bpf_prog *fp;
        int cpu;

        fp = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
        if (fp == NULL)
                return NULL;

        fp->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
        if (fp->stats == NULL) {
                /* Undo what bpf_prog_alloc_no_stats() set up. */
                free_percpu(fp->active);
                kfree(fp->aux);
                vfree(fp);
                return NULL;
        }

        for_each_possible_cpu(cpu)
                u64_stats_init(&per_cpu_ptr(fp->stats, cpu)->syncp);

        return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);

int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
{
        if (!prog->aux->nr_linfo || !prog->jit_requested)
                return 0;

        prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
                                          sizeof(*prog->aux->jited_linfo),
                                          bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));
        if (!prog->aux->jited_linfo)
                return -ENOMEM;

        return 0;
}

void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
{
        if (prog->aux->jited_linfo &&
            (!prog->jited || !prog->aux->jited_linfo[0])) {
                kvfree(prog->aux->jited_linfo);
                prog->aux->jited_linfo = NULL;
        }

        kfree(prog->aux->kfunc_tab);
        prog->aux->kfunc_tab = NULL;
}

/* The jit engine is responsible for providing an array that maps
 * insn_off to jited_off (insn_to_jit_off).
 *
 * The index into this array is the insn_off.  Hence, the insn_off
 * here is relative to the prog itself instead of the main prog.
 * This array has one entry for each xlated bpf insn.
 *
 * jited_off is the byte offset to the end of the jited insn.
 *
 * Hence, with
 * insn_start:
 *      The first bpf insn off of the prog.  The insn off
 *      here is relative to the main prog.
 *      e.g. if prog is a subprog, insn_start > 0
 * linfo_idx:
 *      The prog's idx to prog->aux->linfo and jited_linfo
 *
 * jited_linfo[linfo_idx] = prog->bpf_func
 *
 * For i > linfo_idx,
 *
 * jited_linfo[i] = prog->bpf_func +
 *        insn_to_jit_off[linfo[i].insn_off - insn_start - 1]
 */
void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
                               const u32 *insn_to_jit_off)
{
        u32 first_idx, first_insn, end_insn, remaining, n;
        const struct bpf_line_info *linfo;
        void **jited;

        if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt)
                /* Userspace did not provide linfo */
                return;

        /* Work on the slice of linfo/jited_linfo that belongs to this prog. */
        first_idx = prog->aux->linfo_idx;
        linfo = &prog->aux->linfo[first_idx];
        jited = &prog->aux->jited_linfo[first_idx];
        remaining = prog->aux->nr_linfo - first_idx;

        first_insn = linfo[0].insn_off;
        end_insn = first_insn + prog->len;

        jited[0] = prog->bpf_func;

        for (n = 1; n < remaining; n++) {
                /* The verifier ensures that linfo[n].insn_off is
                 * strictly increasing, so the first record at or past
                 * end_insn ends this prog's range.
                 */
                if (linfo[n].insn_off >= end_insn)
                        break;

                jited[n] = prog->bpf_func +
                        insn_to_jit_off[linfo[n].insn_off - first_insn - 1];
        }
}

/* Make sure the program can hold @size bytes.
 *
 * @size is rounded up to whole pages. If that page count does not exceed
 * fp_old->pages, @fp_old is returned unchanged. Otherwise a new zeroed area
 * is allocated, the old pages are copied over, ownership of aux, stats and
 * active moves to the new program and the old shell is freed.
 *
 * Returns the program to use from now on, or NULL when the new allocation
 * failed (in which case @fp_old is left untouched).
 */
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
                                  gfp_t gfp_extra_flags)
{
        gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        struct bpf_prog *fp_new;
        u32 new_pages;

        size = round_up(size, PAGE_SIZE);
        new_pages = size / PAGE_SIZE;
        if (new_pages <= fp_old->pages)
                return fp_old;

        fp_new = __vmalloc(size, gfp_flags);
        if (!fp_new)
                return NULL;

        memcpy(fp_new, fp_old, fp_old->pages * PAGE_SIZE);
        fp_new->pages = new_pages;
        fp_new->aux->prog = fp_new;

        /* The new program keeps using aux, stats and active of the old
         * one; clear them in fp_old so that freeing it does not release
         * them.
         */
        fp_old->aux = NULL;
        fp_old->stats = NULL;
        fp_old->active = NULL;
        __bpf_prog_free(fp_old);

        return fp_new;
}

void __bpf_prog_free(struct bpf_prog *fp)
{
        if (fp->aux) {
                mutex_destroy(&fp->aux->used_maps_mutex);
                mutex_destroy(&fp->aux->dst_mutex);
                kfree(fp->aux->poke_tab);
                kfree(fp->aux);
        }
        free_percpu(fp->stats);
        free_percpu(fp->active);
        vfree(fp);
}

/* Compute the program tag: a SHA-1 digest over the program's instructions,
 * with map references neutralised, stored (truncated to sizeof(fp->tag))
 * in fp->tag.
 *
 * Returns 0 on success or -ENOMEM if the scratch buffer cannot be allocated.
 */
int bpf_prog_calc_tag(struct bpf_prog *fp)
{
        /* Offset of the 64-bit length field within a SHA-1 block. */
        const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64);
        u32 raw_size = bpf_prog_tag_scratch_size(fp);
        u32 digest[SHA1_DIGEST_WORDS];
        u32 ws[SHA1_WORKSPACE_WORDS];
        u32 i, bsize, psize, blocks;
        struct bpf_insn *dst;
        bool was_ld_map;
        u8 *raw, *todo;
        __be32 *result;
        __be64 *bits;

        raw = vmalloc(raw_size);
        if (!raw)
                return -ENOMEM;

        sha1_init(digest);
        memset(ws, 0, sizeof(ws));

        /* We need to take out the map fd for the digest calculation
         * since they are unstable from user space side.
         */
        dst = (void *)raw;
        for (i = 0, was_ld_map = false; i < fp->len; i++) {
                dst[i] = fp->insnsi[i];
                if (!was_ld_map &&
                    dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
                    (dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
                     dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
                        /* First half of a 64-bit map load: drop its imm. */
                        was_ld_map = true;
                        dst[i].imm = 0;
                } else if (was_ld_map &&
                           dst[i].code == 0 &&
                           dst[i].dst_reg == 0 &&
                           dst[i].src_reg == 0 &&
                           dst[i].off == 0) {
                        /* Second half of that load: drop its imm as well. */
                        was_ld_map = false;
                        dst[i].imm = 0;
                } else {
                        was_ld_map = false;
                }
        }

        /* Zero the rest of the scratch area and append the 0x80 byte that
         * starts the SHA-1 padding.
         */
        psize = bpf_prog_insn_size(fp);
        memset(&raw[psize], 0, raw_size - psize);
        raw[psize++] = 0x80;

        bsize  = round_up(psize, SHA1_BLOCK_SIZE);
        blocks = bsize / SHA1_BLOCK_SIZE;
        todo   = raw;
        if (bsize - psize >= sizeof(__be64)) {
                /* The length field fits at the end of the last block. */
                bits = (__be64 *)(todo + bsize - sizeof(__be64));
        } else {
                /* No room left: the length goes at the end of one extra
                 * block.
                 */
                bits = (__be64 *)(todo + bsize + bits_offset);
                blocks++;
        }
        /* Message length in bits, big-endian; psize - 1 excludes the 0x80
         * byte counted above.
         */
        *bits = cpu_to_be64((psize - 1) << 3);

        while (blocks--) {
                sha1_transform(digest, todo, ws);
                todo += SHA1_BLOCK_SIZE;
        }

        /* Convert the digest words to big-endian in place before copying. */
        result = (__force __be32 *)digest;
        for (i = 0; i < SHA1_DIGEST_WORDS; i++)
                result[i] = cpu_to_be32(digest[i]);
        memcpy(fp->tag, result, sizeof(fp->tag));

        vfree(raw);
        return 0;
}

/* Adjust the relative target stored in insn->imm after the instruction at
 * @pos (old range [pos, end_old)) has been replaced by [pos, end_new).
 *
 * @curr is the index of @insn. A target that jumps forward across the patch
 * grows by the size delta; one that jumps backward across it shrinks by it.
 *
 * With @probe_pass set only the range check is done and @insn is not
 * modified. Returns 0, or -ERANGE if the new value does not fit in 32 bits.
 */
static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
                                s32 end_new, s32 curr, const bool probe_pass)
{
        s32 delta = end_new - end_old;
        s64 target = insn->imm;

        if (curr < pos && curr + target + 1 >= end_old)
                target += delta;
        else if (curr >= end_new && curr + target + 1 < end_new)
                target -= delta;

        if (target < S32_MIN || target > S32_MAX)
                return -ERANGE;

        if (probe_pass)
                return 0;

        insn->imm = target;
        return 0;
}

/* Adjust a jump offset after the instruction at @pos (old range
 * [pos, end_old)) has been replaced by [pos, end_new).
 *
 * For BPF_JMP32 | BPF_JA the offset lives in insn->imm and must fit in
 * 32 bits; for every other jump it lives in insn->off and must fit in
 * 16 bits. @curr is the index of @insn.
 *
 * With @probe_pass set only the range check is done and @insn is not
 * modified. Returns 0, or -ERANGE if the adjusted offset is out of range.
 */
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
                                s32 end_new, s32 curr, const bool probe_pass)
{
        const bool uses_imm = insn->code == (BPF_JMP32 | BPF_JA);
        const s64 lowest = uses_imm ? S32_MIN : S16_MIN;
        const s64 highest = uses_imm ? S32_MAX : S16_MAX;
        s32 delta = end_new - end_old;
        s64 off = uses_imm ? insn->imm : insn->off;

        if (curr < pos && curr + off + 1 >= end_old)
                off += delta;
        else if (curr >= end_new && curr + off + 1 < end_new)
                off -= delta;

        if (off < lowest || off > highest)
                return -ERANGE;

        if (probe_pass)
                return 0;

        if (uses_imm)
                insn->imm = off;
        else
                insn->off = off;

        return 0;
}

/* Walk all instructions of @prog and fix up relative targets (jump offsets,
 * pseudo-call and pseudo-func immediates) that cross the patched region,
 * where the old range [pos, end_old) becomes [pos, end_new).
 *
 * With @probe_pass set nothing is written; the walk only checks whether any
 * adjusted value would overflow. Returns 0 on success or the first error
 * reported by the per-instruction helpers.
 */
static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
                            s32 end_new, const bool probe_pass)
{
        /* In the probe pass prog->len is still the unpatched length, so the
         * delta is added to get the instruction count after patching.
         */
        u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
        struct bpf_insn *insn = prog->insnsi;
        int ret = 0;

        for (i = 0; i < insn_cnt; i++, insn++) {
                u8 code;

                /* In the probing pass we still operate on the original,
                 * unpatched image in order to check overflows before we
                 * do any other adjustments. Therefore skip the patchlet.
                 */
                if (probe_pass && i == pos) {
                        /* i continues with post-patch numbering while insn
                         * points at the first unpatched insn after the gap.
                         */
                        i = end_new;
                        insn = prog->insnsi + end_old;
                }
                if (bpf_pseudo_func(insn)) {
                        ret = bpf_adj_delta_to_imm(insn, pos, end_old,
                                                   end_new, i, probe_pass);
                        if (ret)
                                return ret;
                        continue;
                }
                code = insn->code;
                /* Only jump-class instructions other than EXIT carry a
                 * relative target.
                 */
                if ((BPF_CLASS(code) != BPF_JMP &&
                     BPF_CLASS(code) != BPF_JMP32) ||
                    BPF_OP(code) == BPF_EXIT)
                        continue;
                /* Adjust offset of jmps if we cross patch boundaries. */
                if (BPF_OP(code) == BPF_CALL) {
                        /* Calls other than BPF_PSEUDO_CALL are left alone. */
                        if (insn->src_reg != BPF_PSEUDO_CALL)
                                continue;
                        ret = bpf_adj_delta_to_imm(insn, pos, end_old,
                                                   end_new, i, probe_pass);
                } else {
                        ret = bpf_adj_delta_to_off(insn, pos, end_old,
                                                   end_new, i, probe_pass);
                }
                if (ret)
                        break;
        }

        return ret;
}

/* Shift line-info records after an insertion at instruction @off: every
 * record from the first one whose insn_off is greater than @off onwards is
 * moved up by @delta. Does nothing when there is no line info or @delta
 * is zero.
 */
static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta)
{
        u32 nr_linfo = prog->aux->nr_linfo;
        struct bpf_line_info *linfo;
        u32 idx = 0;

        if (!nr_linfo || !delta)
                return;

        linfo = prog->aux->linfo;

        /* Skip the records at or before the patched instruction. */
        while (idx < nr_linfo && off >= linfo[idx].insn_off)
                idx++;

        /* Push everything from here on by delta. */
        while (idx < nr_linfo) {
                linfo[idx].insn_off += delta;
                idx++;
        }
}

/* Replace the single instruction at index @off of @prog with the @len
 * instructions in @patch.
 *
 * Returns the program to use afterwards (it may be a reallocated copy of
 * @prog), ERR_PTR(-ENOMEM) if the program could not be grown, or the
 * ERR_PTR-encoded error from the branch probe when an adjusted branch would
 * overflow. The error paths return before the image is modified.
 */
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
                                       const struct bpf_insn *patch, u32 len)
{
        u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
        const u32 cnt_max = S16_MAX;
        struct bpf_prog *prog_adj;
        int err;

        /* Since our patchlet doesn't expand the image, we're done. */
        if (insn_delta == 0) {
                memcpy(prog->insnsi + off, patch, sizeof(*patch));
                return prog;
        }

        insn_adj_cnt = prog->len + insn_delta;

        /* Reject anything that would potentially let the insn->off
         * target overflow when we have excessive program expansions.
         * We need to probe here before we do any reallocation where
         * we afterwards may not fail anymore.
         */
        if (insn_adj_cnt > cnt_max &&
            (err = bpf_adj_branches(prog, off, off + 1, off + len, true)))
                return ERR_PTR(err);

        /* Several new instructions need to be inserted. Make room
         * for them. Likely, there's no need for a new allocation as
         * last page could have large enough tailroom.
         */
        prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
                                    GFP_USER);
        if (!prog_adj)
                return ERR_PTR(-ENOMEM);

        prog_adj->len = insn_adj_cnt;

        /* Patching happens in 3 steps:
         *
         * 1) Move over tail of insnsi from next instruction onwards,
         *    so we can patch the single target insn with one or more
         *    new ones (patching is always from 1 to n insns, n > 0).
         * 2) Inject new instructions at the target location.
         * 3) Adjust branch offsets if necessary.
         */
        /* Number of instructions that follow the patched one. */
        insn_rest = insn_adj_cnt - off - len;

        memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
                sizeof(*patch) * insn_rest);
        memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);

        /* We are guaranteed to not fail at this point, otherwise
         * the ship has sailed to reverse to the original state. An
         * overflow cannot happen at this point.
         */
        BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));

        bpf_adj_linfo(prog_adj, off, insn_delta);

        return prog_adj;
}

/* Remove @cnt instructions starting at index @off from @prog, closing the
 * gap and fixing up branch targets that crossed the removed range.
 *
 * Returns the value of WARN_ON_ONCE() applied to the branch adjustment
 * result, i.e. zero when the adjustment succeeded.
 */
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
{
        struct bpf_insn *gap = prog->insnsi + off;
        u32 tail_cnt = prog->len - off - cnt;

        /* Branch offsets can't overflow when program is shrinking, no need
         * to call bpf_adj_branches(..., true) here
         */
        memmove(gap, gap + cnt, sizeof(struct bpf_insn) * tail_cnt);
        prog->len -= cnt;

        return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
}

/* Unregister the kallsyms entries of every subprogram listed in
 * fp->aux->func (real_func_cnt entries).
 */
static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
        int idx = 0;

        while (idx < fp->aux->real_func_cnt) {
                bpf_prog_kallsyms_del(fp->aux->func[idx]);
                idx++;
        }
}

/* Unregister the kallsyms entries of all subprograms of @fp and then of
 * @fp itself.
 */
void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
{
        bpf_prog_kallsyms_del_subprogs(fp);
        bpf_prog_kallsyms_del(fp);
}

#ifdef CONFIG_BPF_JIT
/* All BPF JIT sysctl knobs here.
 *
 * bpf_jit_enable and bpf_jit_kallsyms default to 1 when
 * CONFIG_BPF_JIT_DEFAULT_ON is built in and to 0 otherwise; the remaining
 * knobs start out as 0.
 */
int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden   __read_mostly;
long bpf_jit_limit   __read_mostly;
long bpf_jit_limit_max __read_mostly;

static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
        WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));

        prog->aux->ksym.start = (unsigned long) prog->bpf_func;
        prog->aux->ksym.end   = prog->aux->ksym.start + prog->jited_len;
}

static void
bpf_prog_ksym_set_name(struct bpf_prog *prog)
{
        char *sym = prog->aux->ksym.name;
        const char *end = sym + KSYM_NAME_LEN;
        const struct btf_type *type;
        const char *func_name;

        BUILD_BUG_ON(sizeof("bpf_prog_") +
                     sizeof(prog->tag) * 2 +
                     /* name has been null terminated.
                      * We should need +1 for the '_' preceding
                      * the name.  However, the null character
                      * is double counted between the name and the
                      * sizeof("bpf_prog_") above, so we omit
                      * the +1 here.
                      */
                     sizeof(prog->aux->name) > KSYM_NAME_LEN);

        sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
        sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));

        /* prog->aux->name will be ignored if full btf name is available */
        if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) {
                type = btf_type_by_id(prog->aux->btf,
                                      prog->aux->func_info[prog->aux->func_idx].type_id);
                func_name = btf_name_by_offset(prog->aux->btf, type->name_off);
                snprintf(sym, (size_t)(end - sym), "_%s", func_name);
                return;
        }

        if (prog->aux->name[0])
                snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
        else
                *sym = 0;
}

static unsigned long bpf_get_ksym_start(struct latch_tree_node *n)
{
        return container_of(n, struct bpf_ksym, tnode)->start;
}

/* Ordering callback for the latch tree: nodes sort by ksym start address. */
static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
                                          struct latch_tree_node *b)
{
        const unsigned long start_a = bpf_get_ksym_start(a);
        const unsigned long start_b = bpf_get_ksym_start(b);

        return start_a < start_b;
}

/* Lookup callback for the latch tree. @key is an address cast to a pointer.
 *
 * Returns -1 when the address lies below the ksym, 1 when it lies above it
 * and 0 when it falls within [start, end] (end inclusive).
 */
static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
{
        const struct bpf_ksym *ksym = container_of(n, struct bpf_ksym, tnode);
        unsigned long addr = (unsigned long)key;

        if (addr < ksym->start)
                return -1;

        /* Ensure that we detect return addresses as part of the program, when
         * the final instruction is a call for a program part of the stack
         * trace. Therefore, do addr > ksym->end instead of addr >= ksym->end.
         */
        return addr > ksym->end ? 1 : 0;
}

/* Callbacks handed to the latch tree helpers for bpf_tree: ordering of
 * nodes by start address and comparison of a lookup address with a node.
 */
static const struct latch_tree_ops bpf_tree_ops = {
        .less        = bpf_tree_less,
        .comp        = bpf_tree_comp,
};

/* bpf_lock is held by bpf_ksym_add() and bpf_ksym_del() around every
 * modification of the two containers below.
 */
static DEFINE_SPINLOCK(bpf_lock);
/* List of all registered ksyms (linked through bpf_ksym.lnode). */
static LIST_HEAD(bpf_kallsyms);
/* Latch tree of the same ksyms (linked through bpf_ksym.tnode). */
static struct latch_tree_root bpf_tree __cacheline_aligned;

/* Register @ksym: append it to the bpf_kallsyms list and insert it into
 * bpf_tree, both under bpf_lock with bottom halves disabled. Warns once if
 * the ksym's list node is not empty, i.e. it appears to be registered
 * already.
 */
void bpf_ksym_add(struct bpf_ksym *ksym)
{
        spin_lock_bh(&bpf_lock);
        WARN_ON_ONCE(!list_empty(&ksym->lnode));
        list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms);
        latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
        spin_unlock_bh(&bpf_lock);
}

/* Unregister @ksym from the lookup tree and the kallsyms list. A ksym
 * whose list node is empty is left untouched. The public wrapper
 * bpf_ksym_del() calls this with bpf_lock held.
 */
static void __bpf_ksym_del(struct bpf_ksym *ksym)
{
        if (!list_empty(&ksym->lnode)) {
                latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
                list_del_rcu(&ksym->lnode);
        }
}

/* Locked wrapper around __bpf_ksym_del(): removes @ksym from the kallsyms
 * list and the lookup tree while holding bpf_lock.
 */
void bpf_ksym_del(struct bpf_ksym *ksym)
{
        spin_lock_bh(&bpf_lock);
        __bpf_ksym_del(ksym);
        spin_unlock_bh(&bpf_lock);
}

static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
{
        return fp->jited && !bpf_prog_was_classic(fp);
}

/* Expose program @fp through kallsyms.
 *
 * Nothing is done unless @fp is a kallsyms candidate and its token is
 * capable of CAP_BPF. Otherwise the program's ksym address and name are
 * filled in, the ksym is flagged as belonging to a program, and it is
 * registered. With CONFIG_FINEIBT and FineIBT active, a second
 * "__cfi_<name>" ksym covering the 16 bytes before bpf_func is
 * registered as well.
 */
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
        if (!bpf_prog_kallsyms_candidate(fp))
                return;
        if (!bpf_token_capable(fp->aux->token, CAP_BPF))
                return;

        bpf_prog_ksym_set_addr(fp);
        bpf_prog_ksym_set_name(fp);
        fp->aux->ksym.prog = true;

        bpf_ksym_add(&fp->aux->ksym);

#ifdef CONFIG_FINEIBT
        /*
         * With FineIBT, code in the __cfi_foo() symbols can get executed,
         * so the unwinder needs a symbol for that region too.
         */
        if (cfi_mode == CFI_FINEIBT) {
                struct bpf_ksym *prefix = &fp->aux->ksym_prefix;
                unsigned long entry = (unsigned long) fp->bpf_func;

                snprintf(prefix->name, KSYM_NAME_LEN,
                         "__cfi_%s", fp->aux->ksym.name);

                prefix->start = entry - 16;
                prefix->end   = entry;

                bpf_ksym_add(prefix);
        }
#endif
}

/* Remove program @fp from kallsyms. Non-candidates are ignored. With
 * CONFIG_FINEIBT and FineIBT active, the "__cfi_" prefix ksym is removed
 * as well.
 */
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
        if (!bpf_prog_kallsyms_candidate(fp))
                return;

        bpf_ksym_del(&fp->aux->ksym);
#ifdef CONFIG_FINEIBT
        if (cfi_mode == CFI_FINEIBT)
                bpf_ksym_del(&fp->aux->ksym_prefix);
#endif
}

/* Look up the ksym whose address range contains @addr in bpf_tree.
 * Returns NULL when no registered ksym matches.
 */
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
{
        struct latch_tree_node *node;

        node = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
        if (!node)
                return NULL;

        return container_of(node, struct bpf_ksym, tnode);
}

/* Resolve @addr to the name of the BPF ksym containing it.
 *
 * On a match the name is copied into @sym (at most KSYM_NAME_LEN bytes),
 * *@size receives the symbol length and *@off the offset of @addr from
 * the symbol start; @size and @off may each be NULL. Returns @sym on a
 * match and NULL otherwise. The lookup runs inside an RCU read-side
 * critical section.
 */
const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
                                 unsigned long *off, char *sym)
{
        unsigned long start, end;
        struct bpf_ksym *ksym;
        char *name = NULL;

        rcu_read_lock();

        ksym = bpf_ksym_find(addr);
        if (!ksym)
                goto unlock;

        start = ksym->start;
        end = ksym->end;

        strscpy(sym, ksym->name, KSYM_NAME_LEN);
        name = sym;

        if (size)
                *size = end - start;
        if (off)
                *off = addr - start;

unlock:
        rcu_read_unlock();

        return name;
}

/* Return true when @addr falls inside any registered BPF ksym. */
bool is_bpf_text_address(unsigned long addr)
{
        bool found;

        rcu_read_lock();
        found = !!bpf_ksym_find(addr);
        rcu_read_unlock();

        return found;
}

/* Return the BPF program whose ksym contains @addr, or NULL when no ksym
 * matches or the matching ksym is not flagged as a program ksym. This
 * function takes no lock itself; search_bpf_extables() calls it under
 * rcu_read_lock().
 */
struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
        struct bpf_ksym *ksym = bpf_ksym_find(addr);

        if (!ksym || !ksym->prog)
                return NULL;

        return container_of(ksym, struct bpf_prog_aux, ksym)->prog;
}

/* Find the exception table entry for @addr in the BPF program that
 * contains it. Returns NULL when @addr is not in a BPF program, the
 * program has no exception entries, or search_extable() finds none.
 */
const struct exception_table_entry *search_bpf_extables(unsigned long addr)
{
        const struct exception_table_entry *entry = NULL;
        struct bpf_prog *prog;

        rcu_read_lock();

        prog = bpf_prog_ksym_find(addr);
        if (prog && prog->aux->num_exentries)
                entry = search_extable(prog->aux->extable,
                                       prog->aux->num_exentries, addr);

        rcu_read_unlock();
        return entry;
}

/* Fetch the @symnum-th entry (zero-based) of the BPF kallsyms list.
 *
 * On success the symbol name is copied into @sym, *@value is set to its
 * start address, *@type to BPF_SYM_ELF_TYPE, and 0 is returned. Returns
 * -ERANGE when JIT kallsyms are disabled or the list has fewer than
 * @symnum + 1 entries.
 */
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                    char *sym)
{
        struct bpf_ksym *ksym;
        unsigned int idx = 0;
        int err = -ERANGE;

        if (!bpf_jit_kallsyms_enabled())
                return -ERANGE;

        rcu_read_lock();
        list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) {
                if (idx++ == symnum) {
                        strscpy(sym, ksym->name, KSYM_NAME_LEN);

                        *value = ksym->start;
                        *type  = BPF_SYM_ELF_TYPE;

                        err = 0;
                        break;
                }
        }
        rcu_read_unlock();

        return err;
}

/* Append a copy of @poke to @prog's poke descriptor table.
 *
 * The table is limited to 1024 entries. The JIT-owned fields of @poke
 * (tailcall_target, tailcall_target_stable, tailcall_bypass, adj_off,
 * bypass_addr) must all be zero, and the only accepted reason is
 * BPF_POKE_REASON_TAIL_CALL with a non-NULL tail_call.map.
 *
 * Returns the slot index of the new entry, -ENOSPC when the table is
 * full, -EINVAL for a malformed descriptor, or -ENOMEM when growing the
 * table fails (the existing table is left in place in that case).
 */
int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                                struct bpf_jit_poke_descriptor *poke)
{
        static const u32 poke_tab_max = 1024;
        struct bpf_jit_poke_descriptor *new_tab;
        u32 slot = prog->aux->size_poke_tab;
        u32 new_size = slot + 1;

        if (new_size > poke_tab_max)
                return -ENOSPC;

        if (poke->tailcall_target || poke->tailcall_target_stable ||
            poke->tailcall_bypass || poke->adj_off || poke->bypass_addr)
                return -EINVAL;

        if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
                return -EINVAL;
        if (!poke->tail_call.map)
                return -EINVAL;

        new_tab = krealloc_array(prog->aux->poke_tab, new_size, sizeof(*poke),
                                 GFP_KERNEL);
        if (!new_tab)
                return -ENOMEM;

        memcpy(&new_tab[slot], poke, sizeof(*poke));
        prog->aux->size_poke_tab = new_size;
        prog->aux->poke_tab = new_tab;

        return slot;
}

/*
 * BPF program pack allocator.
 *
 * Most BPF programs are pretty small. Allocating a whole page for each
 * program is sometimes a waste. Many small BPF programs also add pressure
 * to the instruction TLB. To solve this issue, we introduce a BPF program
 * pack allocator. The prog_pack allocator uses a HPAGE_PMD_SIZE page (2MB
 * on x86) to host BPF programs.
 */
/* A pack is handed out in fixed-size chunks of 1 << 6 = 64 bytes. */
#define BPF_PROG_CHUNK_SHIFT        6
#define BPF_PROG_CHUNK_SIZE        (1 << BPF_PROG_CHUNK_SHIFT)
/* AND-ing an address with this mask rounds it down to a chunk boundary. */
#define BPF_PROG_CHUNK_MASK        (~(BPF_PROG_CHUNK_SIZE - 1))

/* One pack of executable memory shared by several BPF programs. */
struct bpf_prog_pack {
        /* Entry in pack_list. */
        struct list_head list;
        /* Start of the BPF_PROG_PACK_SIZE region from bpf_jit_alloc_exec(). */
        void *ptr;
        /* One bit per chunk; a set bit marks the chunk as allocated. */
        unsigned long bitmap[];
};

/* Hole filler that zeroes @size bytes starting at @area. */
void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
{
        memset(area, 0, size);
}

/* Number of chunks (bitmap bits) needed to hold @size bytes, rounded up. */
#define BPF_PROG_SIZE_TO_NBITS(size)        (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)

/* Held by bpf_prog_pack_alloc() and bpf_prog_pack_free() for their whole
 * duration, which covers every access to pack_list and the pack bitmaps.
 */
static DEFINE_MUTEX(pack_mutex);
/* All currently allocated packs. */
static LIST_HEAD(pack_list);

/* PMD_SIZE is not available in some special configs, e.g. ARCH=arm with
 * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
 */
#ifdef PMD_SIZE
/* PMD_SIZE is really big for some archs. It doesn't make sense to
 * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to
 * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be
 * greater than or equal to 2MB.
 */
#define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes())
#else
#define BPF_PROG_PACK_SIZE PAGE_SIZE
#endif

/* Number of chunks in one pack, i.e. the number of bits in a pack bitmap. */
#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)

/* Allocate a fresh, empty pack and append it to pack_list.
 *
 * The pack memory is filled via @bpf_fill_ill_insns, its bitmap is
 * cleared, and the region is then switched to read-only + executable.
 * Returns the new pack, or NULL if any allocation or the permission
 * change fails. The only caller, bpf_prog_pack_alloc(), holds pack_mutex.
 */
static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        struct bpf_prog_pack *pack;

        pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
                       GFP_KERNEL);
        if (!pack)
                return NULL;

        pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
        if (!pack->ptr)
                goto err_free;

        bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
        bitmap_zero(pack->bitmap, BPF_PROG_CHUNK_COUNT);

        set_vm_flush_reset_perms(pack->ptr);
        if (set_memory_rox((unsigned long)pack->ptr,
                           BPF_PROG_PACK_SIZE / PAGE_SIZE))
                goto err_free;

        list_add_tail(&pack->list, &pack_list);
        return pack;

err_free:
        /* pack->ptr may be NULL here when the exec allocation failed. */
        bpf_jit_free_exec(pack->ptr);
        kfree(pack);
        return NULL;
}

/* Allocate @size bytes of read-only executable memory for a BPF program.
 *
 * Requests larger than BPF_PROG_PACK_SIZE get a dedicated page-rounded
 * allocation. Everything else is carved out of the first pack with a
 * large enough run of free chunks; a new pack is created when none has
 * room. @bpf_fill_ill_insns fills newly allocated memory.
 *
 * Returns the address of the allocation, or NULL on failure.
 */
void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
        struct bpf_prog_pack *pack = NULL, *iter;
        unsigned long pos;
        void *ptr = NULL;

        mutex_lock(&pack_mutex);

        if (size > BPF_PROG_PACK_SIZE) {
                /* Too big for a pack: standalone allocation. */
                size = round_up(size, PAGE_SIZE);
                ptr = bpf_jit_alloc_exec(size);
                if (!ptr)
                        goto unlock;

                bpf_fill_ill_insns(ptr, size);
                set_vm_flush_reset_perms(ptr);
                if (set_memory_rox((unsigned long)ptr, size / PAGE_SIZE)) {
                        bpf_jit_free_exec(ptr);
                        ptr = NULL;
                }
                goto unlock;
        }

        /* First fit over the existing packs. */
        list_for_each_entry(iter, &pack_list, list) {
                pos = bitmap_find_next_zero_area(iter->bitmap, BPF_PROG_CHUNK_COUNT, 0,
                                                 nbits, 0);
                if (pos < BPF_PROG_CHUNK_COUNT) {
                        pack = iter;
                        break;
                }
        }

        if (!pack) {
                pack = alloc_new_pack(bpf_fill_ill_insns);
                if (!pack)
                        goto unlock;

                /* A new pack is empty, so allocate from its start. */
                pos = 0;
        }

        bitmap_set(pack->bitmap, pos, nbits);
        ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);

unlock:
        mutex_unlock(&pack_mutex);
        return ptr;
}

/* Release memory obtained from bpf_prog_pack_alloc().
 *
 * @size must match the size that was allocated. Allocations larger than
 * BPF_PROG_PACK_SIZE are freed directly. Otherwise the owning pack is
 * located by address, the text is invalidated, the chunks are marked
 * free, and the pack itself is released once it has no chunks in use.
 */
void bpf_prog_pack_free(void *ptr, u32 size)
{
        struct bpf_prog_pack *owner = NULL, *iter;
        unsigned int nbits;
        unsigned long pos;

        mutex_lock(&pack_mutex);

        if (size > BPF_PROG_PACK_SIZE) {
                bpf_jit_free_exec(ptr);
                goto unlock;
        }

        list_for_each_entry(iter, &pack_list, list) {
                if (ptr < iter->ptr || (iter->ptr + BPF_PROG_PACK_SIZE) <= ptr)
                        continue;

                owner = iter;
                break;
        }

        if (WARN_ONCE(!owner, "bpf_prog_pack bug\n"))
                goto unlock;

        nbits = BPF_PROG_SIZE_TO_NBITS(size);
        pos = ((unsigned long)ptr - (unsigned long)owner->ptr) >> BPF_PROG_CHUNK_SHIFT;

        WARN_ONCE(bpf_arch_text_invalidate(ptr, size),
                  "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");

        bitmap_clear(owner->bitmap, pos, nbits);

        /* A zero area spanning every chunk and starting at bit 0 exists
         * only when the whole pack is unused.
         */
        if (bitmap_find_next_zero_area(owner->bitmap, BPF_PROG_CHUNK_COUNT, 0,
                                       BPF_PROG_CHUNK_COUNT, 0) != 0)
                goto unlock;

        list_del(&owner->list);
        bpf_jit_free_exec(owner->ptr);
        kfree(owner);

unlock:
        mutex_unlock(&pack_mutex);
}

/* Running total of bytes charged through bpf_jit_charge_modmem() and not
 * yet returned through bpf_jit_uncharge_modmem().
 */
static atomic_long_t bpf_jit_current;

/* Size of the address range assumed to be available for JIT allocations:
 * the module area when MODULES_VADDR is defined, the vmalloc area
 * otherwise.
 *
 * Can be overridden by an arch's JIT compiler if it has a custom,
 * dedicated BPF backend memory area, or if neither of the two
 * below apply.
 */
u64 __weak bpf_jit_alloc_exec_limit(void)
{
#if defined(MODULES_VADDR)
        return MODULES_END - MODULES_VADDR;
#else
        return VMALLOC_END - VMALLOC_START;
#endif
}

/* Derive the JIT memory limits at boot: bpf_jit_limit_max is the size of
 * the exec allocation range, and bpf_jit_limit defaults to half of that,
 * rounded up to a page and capped at LONG_MAX.
 */
static int __init bpf_jit_charge_init(void)
{
        u64 half;

        /* Only used as heuristic here to derive limit. */
        bpf_jit_limit_max = bpf_jit_alloc_exec_limit();

        half = round_up(bpf_jit_limit_max >> 1, PAGE_SIZE);
        bpf_jit_limit = min_t(u64, half, LONG_MAX);

        return 0;
}
pure_initcall(bpf_jit_charge_init);

/* Account @size bytes of JIT memory against bpf_jit_limit.
 *
 * The charge always succeeds while the running total stays within the
 * limit. Above the limit only bpf_capable() callers may proceed; for
 * anyone else the charge is rolled back and -EPERM is returned.
 */
int bpf_jit_charge_modmem(u32 size)
{
        long charged = atomic_long_add_return(size, &bpf_jit_current);

        if (charged <= READ_ONCE(bpf_jit_limit))
                return 0;

        if (bpf_capable())
                return 0;

        atomic_long_sub(size, &bpf_jit_current);
        return -EPERM;
}

/* Return @size bytes previously charged with bpf_jit_charge_modmem(). */
void bpf_jit_uncharge_modmem(u32 size)
{
        atomic_long_sub(size, &bpf_jit_current);
}

/* Default (weak) allocator for JIT memory: @size bytes from the
 * EXECMEM_BPF execmem range.
 */
void *__weak bpf_jit_alloc_exec(unsigned long size)
{
        return execmem_alloc(EXECMEM_BPF, size);
}

/* Default (weak) counterpart of bpf_jit_alloc_exec(): hands @addr back
 * to execmem.
 */
void __weak bpf_jit_free_exec(void *addr)
{
        execmem_free(addr);
}

/* Allocate a page-granular JIT image for a program of @proglen bytes.
 *
 * The allocation is charged against the JIT memory limit and filled
 * via @bpf_fill_ill_insns. The image start is placed at a random,
 * @alignment-aligned offset inside the header's image area and stored
 * in *@image_ptr. @alignment must be a power of two no larger than
 * BPF_IMAGE_ALIGNMENT (warned about once otherwise).
 *
 * Returns the binary header, or NULL when charging or allocation fails.
 */
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
                     unsigned int alignment,
                     bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        struct bpf_binary_header *hdr;
        u32 size, hole, start;

        WARN_ON_ONCE(!is_power_of_2(alignment) ||
                     alignment > BPF_IMAGE_ALIGNMENT);

        /* Most BPF filters are really small, but if one of them fills a
         * page, allow at least 128 extra bytes to insert a random
         * section of illegal instructions.
         */
        size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);

        if (bpf_jit_charge_modmem(size))
                return NULL;

        hdr = bpf_jit_alloc_exec(size);
        if (!hdr)
                goto err_uncharge;

        /* Fill space with illegal/arch-dep instructions. */
        bpf_fill_ill_insns(hdr, size);
        hdr->size = size;

        /* Leave a random number of instructions before BPF code. */
        hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
                     PAGE_SIZE - sizeof(*hdr));
        start = get_random_u32_below(hole) & ~(alignment - 1);
        *image_ptr = &hdr->image[start];

        return hdr;

err_uncharge:
        bpf_jit_uncharge_modmem(size);
        return NULL;
}

/* Free a header obtained from bpf_jit_binary_alloc() and return its
 * charge to the JIT memory accounting.
 */
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
        /* Read the size first: hdr is gone once it has been freed. */
        u32 size = hdr->size;

        bpf_jit_free_exec(hdr);
        bpf_jit_uncharge_modmem(size);
}

/* Allocate a JIT binary from the bpf_prog_pack allocator.
 *
 * Pack memory is read-only + executable, so the JIT engine cannot write
 * to it directly. A read-write buffer of the same size is therefore
 * allocated at the same time. The JIT engine should compute offsets
 * against the RO address but emit the program into the RW buffer; once
 * done it calls bpf_jit_binary_pack_finalize(), which copies the JITed
 * program into the RO memory.
 *
 * On success *@image_ptr / *@rw_image point at the same random,
 * @alignment-aligned offset inside the RO / RW images, *@rw_header is
 * the RW buffer, and the RO header is returned. On failure NULL is
 * returned and any charge is dropped; if the RW allocation was the part
 * that failed, *@rw_header has been set to NULL.
 */
struct bpf_binary_header *
bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
                          unsigned int alignment,
                          struct bpf_binary_header **rw_header,
                          u8 **rw_image,
                          bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
        struct bpf_binary_header *ro_header, *rw;
        u32 size, hole, start;

        WARN_ON_ONCE(!is_power_of_2(alignment) ||
                     alignment > BPF_IMAGE_ALIGNMENT);

        /* add 16 bytes for a random section of illegal instructions */
        size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);

        if (bpf_jit_charge_modmem(size))
                return NULL;

        ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
        if (!ro_header)
                goto err_uncharge;

        rw = kvmalloc(size, GFP_KERNEL);
        *rw_header = rw;
        if (!rw)
                goto err_free_ro;

        /* Fill space with illegal/arch-dep instructions. */
        bpf_fill_ill_insns(rw, size);
        rw->size = size;

        hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
                     BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
        start = get_random_u32_below(hole) & ~(alignment - 1);

        *image_ptr = &ro_header->image[start];
        *rw_image = &rw->image[start];

        return ro_header;

err_free_ro:
        bpf_prog_pack_free(ro_header, size);
err_uncharge:
        bpf_jit_uncharge_modmem(size);
        return NULL;
}

/* Copy the JITed text from @rw_header to its final location, @ro_header,
 * and release the RW buffer. @prog is not used here.
 *
 * Returns 0 on success. If the text copy fails, the RO allocation is
 * given back to the pack allocator and the error is returned.
 */
int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
                                 struct bpf_binary_header *ro_header,
                                 struct bpf_binary_header *rw_header)
{
        void *copied;

        copied = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);

        /* The RW buffer is no longer needed, whatever the outcome. */
        kvfree(rw_header);

        if (!IS_ERR(copied))
                return 0;

        bpf_prog_pack_free(ro_header, ro_header->size);
        return PTR_ERR(copied);
}

/* bpf_jit_binary_pack_free is called in two different scenarios:
 *   1) when the program is freed after a successful JIT;
 *   2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
 * For case 2), we need to free both the RO memory and the RW buffer.
 *
 * bpf_jit_binary_pack_free requires proper ro_header->size. However,
 * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
 * must be set with either bpf_jit_binary_pack_finalize (normal path) or
 * bpf_arch_text_copy (when jit fails).
 */
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
                              struct bpf_binary_header *rw_header)
{
        /* Read the size before the RO memory is handed back. */
        u32 size = ro_header->size;

        bpf_prog_pack_free(ro_header, size);
        kvfree(rw_header);
        bpf_jit_uncharge_modmem(size);
}

/* Recover the binary header of a pack-allocated program by rounding its
 * entry address down to a chunk boundary.
 */
struct bpf_binary_header *
bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
{
        return (void *)((unsigned long)fp->bpf_func & BPF_PROG_CHUNK_MASK);
}

/* Recover the binary header of a page-allocated program by rounding its
 * entry address down to a page boundary.
 */
static inline struct bpf_binary_header *
bpf_jit_binary_hdr(const struct bpf_prog *fp)
{
        return (void *)((unsigned long)fp->bpf_func & PAGE_MASK);
}

/* Default (weak) teardown of a program: free the JIT image, if any, then
 * the program itself.
 *
 * This symbol is only overridden by archs that have different
 * requirements than the usual eBPF JITs, f.e. when they only
 * implement cBPF JIT, do not set images read-only, etc.
 */
void __weak bpf_jit_free(struct bpf_prog *fp)
{
        if (fp->jited) {
                bpf_jit_binary_free(bpf_jit_binary_hdr(fp));
                WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
        }

        bpf_prog_unlock_free(fp);
}

/* Resolve the target address of call instruction @insn for the JIT.
 *
 * *@func_addr_fixed is set to false for BPF-to-BPF (pseudo) calls, whose
 * targets are only known on the extra pass, and true otherwise.
 * *@func_addr receives the resolved address; for a pseudo call outside
 * the extra pass it is a NULL placeholder.
 *
 * Returns 0 on success, -EINVAL when a pseudo call on the extra pass
 * refers to a subprogram that is not available, or the error from
 * bpf_get_kfunc_addr().
 */
int bpf_jit_get_func_addr(const struct bpf_prog *prog,
                          const struct bpf_insn *insn, bool extra_pass,
                          u64 *func_addr, bool *func_addr_fixed)
{
        const bool is_subprog_call = insn->src_reg == BPF_PSEUDO_CALL;
        s16 off = insn->off;
        u8 *addr;

        *func_addr_fixed = !is_subprog_call;

        if (is_subprog_call) {
                /* Place-holder address till the last pass has collected
                 * all addresses for JITed subprograms in which case we
                 * can pick them up from prog->aux.
                 */
                if (!extra_pass) {
                        addr = NULL;
                } else if (prog->aux->func &&
                           off >= 0 && off < prog->aux->real_func_cnt) {
                        addr = (u8 *)prog->aux->func[off]->bpf_func;
                } else {
                        return -EINVAL;
                }
        } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
                   bpf_jit_supports_far_kfunc_call()) {
                int err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);

                if (err)
                        return err;
        } else {
                /* Address of a BPF helper call. Since part of the core
                 * kernel, it's always at a fixed location. __bpf_call_base
                 * and the helper with imm relative to it are both in core
                 * kernel.
                 */
                addr = (u8 *)__bpf_call_base + insn->imm;
        }

        *func_addr = (unsigned long)addr;
        return 0;
}

/* Blind the immediate of a single instruction.
 *
 * For the opcodes handled below, the immediate is never emitted
 * verbatim. Instead BPF_REG_AX is loaded with (imm ^ random), XORed with
 * the same random value to recover imm, and the original operation is
 * then performed in its register form with AX as the source.
 *
 * @from:      instruction to rewrite
 * @aux:       saved copy of both halves of a BPF_LD | BPF_IMM | BPF_DW
 *             instruction, consulted when @from is either half of one
 * @to_buff:   output buffer for the replacement sequence
 * @emit_zext: emit an explicit zero-extension of AX when rewriting the
 *             second half of a ld_imm64
 *
 * Returns the number of instructions written to @to_buff; 0 means @from
 * needs no rewrite.
 */
static int bpf_jit_blind_insn(const struct bpf_insn *from,
                              const struct bpf_insn *aux,
                              struct bpf_insn *to_buff,
                              bool emit_zext)
{
        struct bpf_insn *to = to_buff;
        u32 imm_rnd = get_random_u32();
        s16 off;

        BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
        BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);

        /* Constraints on AX register:
         *
         * AX register is inaccessible from user space. It is mapped in
         * all JITs, and used here for constant blinding rewrites. It is
         * typically "stateless" meaning its contents are only valid within
         * the executed instruction, but not across several instructions.
         * There are a few exceptions however which are further detailed
         * below.
         *
         * Constant blinding is only used by JITs, not in the interpreter.
         * The interpreter uses AX in some occasions as a local temporary
         * register e.g. in DIV or MOD instructions.
         *
         * In restricted circumstances, the verifier can also use the AX
         * register for rewrites as long as they do not interfere with
         * the above cases!
         */
        if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX)
                goto out;

        /* A move of constant 0 is replaced by XOR-ing the destination
         * register with itself, so no immediate is needed at all.
         */
        if (from->imm == 0 &&
            (from->code == (BPF_ALU   | BPF_MOV | BPF_K) ||
             from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
                *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
                goto out;
        }

        switch (from->code) {
        case BPF_ALU | BPF_ADD | BPF_K:
        case BPF_ALU | BPF_SUB | BPF_K:
        case BPF_ALU | BPF_AND | BPF_K:
        case BPF_ALU | BPF_OR  | BPF_K:
        case BPF_ALU | BPF_XOR | BPF_K:
        case BPF_ALU | BPF_MUL | BPF_K:
        case BPF_ALU | BPF_MOV | BPF_K:
        case BPF_ALU | BPF_DIV | BPF_K:
        case BPF_ALU | BPF_MOD | BPF_K:
                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
                break;

        case BPF_ALU64 | BPF_ADD | BPF_K:
        case BPF_ALU64 | BPF_SUB | BPF_K:
        case BPF_ALU64 | BPF_AND | BPF_K:
        case BPF_ALU64 | BPF_OR  | BPF_K:
        case BPF_ALU64 | BPF_XOR | BPF_K:
        case BPF_ALU64 | BPF_MUL | BPF_K:
        case BPF_ALU64 | BPF_MOV | BPF_K:
        case BPF_ALU64 | BPF_DIV | BPF_K:
        case BPF_ALU64 | BPF_MOD | BPF_K:
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off);
                break;

        case BPF_JMP | BPF_JEQ  | BPF_K:
        case BPF_JMP | BPF_JNE  | BPF_K:
        case BPF_JMP | BPF_JGT  | BPF_K:
        case BPF_JMP | BPF_JLT  | BPF_K:
        case BPF_JMP | BPF_JGE  | BPF_K:
        case BPF_JMP | BPF_JLE  | BPF_K:
        case BPF_JMP | BPF_JSGT | BPF_K:
        case BPF_JMP | BPF_JSLT | BPF_K:
        case BPF_JMP | BPF_JSGE | BPF_K:
        case BPF_JMP | BPF_JSLE | BPF_K:
        case BPF_JMP | BPF_JSET | BPF_K:
                /* Accommodate for extra offset in case of a backjump. */
                off = from->off;
                if (off < 0)
                        off -= 2;
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
                break;

        case BPF_JMP32 | BPF_JEQ  | BPF_K:
        case BPF_JMP32 | BPF_JNE  | BPF_K:
        case BPF_JMP32 | BPF_JGT  | BPF_K:
        case BPF_JMP32 | BPF_JLT  | BPF_K:
        case BPF_JMP32 | BPF_JGE  | BPF_K:
        case BPF_JMP32 | BPF_JLE  | BPF_K:
        case BPF_JMP32 | BPF_JSGT | BPF_K:
        case BPF_JMP32 | BPF_JSLT | BPF_K:
        case BPF_JMP32 | BPF_JSGE | BPF_K:
        case BPF_JMP32 | BPF_JSLE | BPF_K:
        case BPF_JMP32 | BPF_JSET | BPF_K:
                /* Accommodate for extra offset in case of a backjump. */
                off = from->off;
                if (off < 0)
                        off -= 2;
                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
                                      off);
                break;

        case BPF_LD | BPF_IMM | BPF_DW:
                /* First half of ld_imm64: recover aux[1].imm in AX, shift
                 * it into the upper 32 bits and move it to the destination.
                 */
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
                *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
                break;
        case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
                /* Second half: recover aux[0].imm in AX with 32-bit ops
                 * and OR it into the destination.
                 */
                *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
                *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                if (emit_zext)
                        *to++ = BPF_ZEXT_REG(BPF_REG_AX);
                *to++ = BPF_ALU64_REG(BPF_OR,  aux[0].dst_reg, BPF_REG_AX);
                break;

        case BPF_ST | BPF_MEM | BPF_DW:
        case BPF_ST | BPF_MEM | BPF_W:
        case BPF_ST | BPF_MEM | BPF_H:
        case BPF_ST | BPF_MEM | BPF_B:
                /* Store of an immediate becomes a store of register AX. */
                *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
                *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
                *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
                break;
        }
out:
        /* Number of replacement instructions emitted (0 if none). */
        return to - to_buff;
}

/* Create a byte-for-byte copy of @fp_other in freshly vmalloc'ed memory.
 *
 * @gfp_extra_flags is OR-ed into GFP_KERNEL | __GFP_ZERO. Returns the
 * clone, or NULL when the allocation fails.
 */
static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
                                              gfp_t gfp_extra_flags)
{
        const unsigned long nbytes = fp_other->pages * PAGE_SIZE;
        struct bpf_prog *clone;

        clone = __vmalloc(nbytes, GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
        if (!clone)
                return NULL;

        /* aux->prog still points to the fp_other one, so
         * when promoting the clone to the real program,
         * this still needs to be adapted.
         */
        memcpy(clone, fp_other, nbytes);

        return clone;
}

/* Free one of the two program copies that exist during blinding without
 * touching the state both copies share.
 */
static void bpf_prog_clone_free(struct bpf_prog *fp)
{
        /* aux was stolen by the other clone, so we cannot free
         * it from this path! It will be freed eventually by the
         * other program on release.
         *
         * At this point, we don't need a deferred release since
         * clone is guaranteed to not be locked.
         */
        /* Clear the pointers copied from the other program before freeing;
         * presumably so __bpf_prog_free() leaves them alone - confirm
         * against its definition.
         */
        fp->aux = NULL;
        fp->stats = NULL;
        fp->active = NULL;
        __bpf_prog_free(fp);
}

/* Keep @fp as the live program and discard @fp_other, the other copy
 * created for constant blinding.
 */
void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
{
        /* We have to repoint aux->prog to self, as we don't
         * know whether fp here is the clone or the original.
         */
        fp->aux->prog = fp;
        bpf_prog_clone_free(fp_other);
}

/* Produce a copy of @prog in which immediates are blinded.
 *
 * Returns @prog itself when blinding was not requested or the program is
 * already blinded. Otherwise a clone is created and each instruction is
 * passed through bpf_jit_blind_insn(); rewritten instructions are patched
 * into the clone. Returns the blinded clone (with ->blinded set),
 * ERR_PTR(-ENOMEM) when the clone cannot be allocated, or the error
 * pointer from bpf_patch_insn_single(), in which case the clone has been
 * released and @prog remains valid.
 */
struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
{
        struct bpf_insn insn_buff[16], aux[2];
        struct bpf_prog *clone, *tmp;
        int insn_delta, insn_cnt;
        struct bpf_insn *insn;
        int i, rewritten;

        if (!prog->blinding_requested || prog->blinded)
                return prog;

        clone = bpf_prog_clone_create(prog, GFP_USER);
        if (!clone)
                return ERR_PTR(-ENOMEM);

        insn_cnt = clone->len;
        insn = clone->insnsi;

        for (i = 0; i < insn_cnt; i++, insn++) {
                if (bpf_pseudo_func(insn)) {
                        /* ld_imm64 with an address of bpf subprog is not
                         * a user controlled constant. Don't randomize it,
                         * since it will conflict with jit_subprogs() logic.
                         */
                        /* Skip the second slot of the ld_imm64 as well. */
                        insn++;
                        i++;
                        continue;
                }

                /* We temporarily need to hold the original ld64 insn
                 * so that we can still access the first part in the
                 * second blinding run.
                 */
                if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
                    insn[1].code == 0)
                        memcpy(aux, insn, sizeof(aux));

                rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
                                                clone->aux->verifier_zext);
                if (!rewritten)
                        continue;

                tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
                if (IS_ERR(tmp)) {
                        /* Patching may have repointed aux->prog during
                         * realloc from the original one, so we need to
                         * fix it up here on error.
                         */
                        bpf_jit_prog_release_other(prog, clone);
                        return tmp;
                }

                clone = tmp;
                /* One instruction was replaced by 'rewritten' instructions. */
                insn_delta = rewritten - 1;

                /* Walk new program and skip insns we just inserted. */
                insn = clone->insnsi + i + insn_delta;
                insn_cnt += insn_delta;
                i        += insn_delta;
        }

        clone->blinded = 1;
        return clone;
}
#endif /* CONFIG_BPF_JIT */

/* Base function for offset calculation. Needs to go into .text section,
 * therefore keeping it non-static as well; will also be used by JITs
 * anyway later on, so do not let the compiler omit it. This also needs
 * to go into kallsyms for correlation from e.g. bpftool, so naming
 * must not change.
 *
 * The function itself ignores its arguments and returns 0; its address
 * is what matters (helper call targets are computed as
 * __bpf_call_base + imm in bpf_jit_get_func_addr()).
 */
noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
        return 0;
}
EXPORT_SYMBOL_GPL(__bpf_call_base);

/* All UAPI available opcodes.
 *
 * X-macro listing every opcode exactly once. The caller supplies two
 * function-like macros:
 *   INSN_2(x, y)    - invoked for opcodes built from two components,
 *   INSN_3(x, y, z) - invoked for opcodes built from three components.
 * The components are passed without their BPF_ prefix; both users in this
 * file paste the prefix back on and OR the pieces together to form the
 * opcode byte (BPF_##x | BPF_##y [| BPF_##z]).
 *
 * The expansion is a comma separated list without a trailing comma, so it
 * can be dropped straight into an array initializer.
 */
#define BPF_INSN_MAP(INSN_2, INSN_3)                \
        /* 32 bit ALU operations. */                \
        /*   Register based. */                        \
        INSN_3(ALU, ADD,  X),                        \
        INSN_3(ALU, SUB,  X),                        \
        INSN_3(ALU, AND,  X),                        \
        INSN_3(ALU, OR,   X),                        \
        INSN_3(ALU, LSH,  X),                        \
        INSN_3(ALU, RSH,  X),                        \
        INSN_3(ALU, XOR,  X),                        \
        INSN_3(ALU, MUL,  X),                        \
        INSN_3(ALU, MOV,  X),                        \
        INSN_3(ALU, ARSH, X),                        \
        INSN_3(ALU, DIV,  X),                        \
        INSN_3(ALU, MOD,  X),                        \
        INSN_2(ALU, NEG),                        \
        INSN_3(ALU, END, TO_BE),                \
        INSN_3(ALU, END, TO_LE),                \
        /*   Immediate based. */                \
        INSN_3(ALU, ADD,  K),                        \
        INSN_3(ALU, SUB,  K),                        \
        INSN_3(ALU, AND,  K),                        \
        INSN_3(ALU, OR,   K),                        \
        INSN_3(ALU, LSH,  K),                        \
        INSN_3(ALU, RSH,  K),                        \
        INSN_3(ALU, XOR,  K),                        \
        INSN_3(ALU, MUL,  K),                        \
        INSN_3(ALU, MOV,  K),                        \
        INSN_3(ALU, ARSH, K),                        \
        INSN_3(ALU, DIV,  K),                        \
        INSN_3(ALU, MOD,  K),                        \
        /* 64 bit ALU operations. */                \
        /*   Register based. */                        \
        INSN_3(ALU64, ADD,  X),                        \
        INSN_3(ALU64, SUB,  X),                        \
        INSN_3(ALU64, AND,  X),                        \
        INSN_3(ALU64, OR,   X),                        \
        INSN_3(ALU64, LSH,  X),                        \
        INSN_3(ALU64, RSH,  X),                        \
        INSN_3(ALU64, XOR,  X),                        \
        INSN_3(ALU64, MUL,  X),                        \
        INSN_3(ALU64, MOV,  X),                        \
        INSN_3(ALU64, ARSH, X),                        \
        INSN_3(ALU64, DIV,  X),                        \
        INSN_3(ALU64, MOD,  X),                        \
        INSN_2(ALU64, NEG),                        \
        INSN_3(ALU64, END, TO_LE),                \
        /*   Immediate based. */                \
        INSN_3(ALU64, ADD,  K),                        \
        INSN_3(ALU64, SUB,  K),                        \
        INSN_3(ALU64, AND,  K),                        \
        INSN_3(ALU64, OR,   K),                        \
        INSN_3(ALU64, LSH,  K),                        \
        INSN_3(ALU64, RSH,  K),                        \
        INSN_3(ALU64, XOR,  K),                        \
        INSN_3(ALU64, MUL,  K),                        \
        INSN_3(ALU64, MOV,  K),                        \
        INSN_3(ALU64, ARSH, K),                        \
        INSN_3(ALU64, DIV,  K),                        \
        INSN_3(ALU64, MOD,  K),                        \
        /* Call instruction. */                        \
        INSN_2(JMP, CALL),                        \
        /* Exit instruction. */                        \
        INSN_2(JMP, EXIT),                        \
        /* 32-bit Jump instructions. */                \
        /*   Register based. */                        \
        INSN_3(JMP32, JEQ,  X),                        \
        INSN_3(JMP32, JNE,  X),                        \
        INSN_3(JMP32, JGT,  X),                        \
        INSN_3(JMP32, JLT,  X),                        \
        INSN_3(JMP32, JGE,  X),                        \
        INSN_3(JMP32, JLE,  X),                        \
        INSN_3(JMP32, JSGT, X),                        \
        INSN_3(JMP32, JSLT, X),                        \
        INSN_3(JMP32, JSGE, X),                        \
        INSN_3(JMP32, JSLE, X),                        \
        INSN_3(JMP32, JSET, X),                        \
        /*   Immediate based. */                \
        INSN_3(JMP32, JEQ,  K),                        \
        INSN_3(JMP32, JNE,  K),                        \
        INSN_3(JMP32, JGT,  K),                        \
        INSN_3(JMP32, JLT,  K),                        \
        INSN_3(JMP32, JGE,  K),                        \
        INSN_3(JMP32, JLE,  K),                        \
        INSN_3(JMP32, JSGT, K),                        \
        INSN_3(JMP32, JSLT, K),                        \
        INSN_3(JMP32, JSGE, K),                        \
        INSN_3(JMP32, JSLE, K),                        \
        INSN_3(JMP32, JSET, K),                        \
        /* Jump instructions. */                \
        /*   Register based. */                        \
        INSN_3(JMP, JEQ,  X),                        \
        INSN_3(JMP, JNE,  X),                        \
        INSN_3(JMP, JGT,  X),                        \
        INSN_3(JMP, JLT,  X),                        \
        INSN_3(JMP, JGE,  X),                        \
        INSN_3(JMP, JLE,  X),                        \
        INSN_3(JMP, JSGT, X),                        \
        INSN_3(JMP, JSLT, X),                        \
        INSN_3(JMP, JSGE, X),                        \
        INSN_3(JMP, JSLE, X),                        \
        INSN_3(JMP, JSET, X),                        \
        /*   Immediate based. */                \
        INSN_3(JMP, JEQ,  K),                        \
        INSN_3(JMP, JNE,  K),                        \
        INSN_3(JMP, JGT,  K),                        \
        INSN_3(JMP, JLT,  K),                        \
        INSN_3(JMP, JGE,  K),                        \
        INSN_3(JMP, JLE,  K),                        \
        INSN_3(JMP, JSGT, K),                        \
        INSN_3(JMP, JSLT, K),                        \
        INSN_3(JMP, JSGE, K),                        \
        INSN_3(JMP, JSLE, K),                        \
        INSN_3(JMP, JSET, K),                        \
        INSN_2(JMP, JA),                        \
        INSN_2(JMP32, JA),                        \
        /* Store instructions. */                \
        /*   Register based. */                        \
        INSN_3(STX, MEM,  B),                        \
        INSN_3(STX, MEM,  H),                        \
        INSN_3(STX, MEM,  W),                        \
        INSN_3(STX, MEM,  DW),                        \
        INSN_3(STX, ATOMIC, W),                        \
        INSN_3(STX, ATOMIC, DW),                \
        /*   Immediate based. */                \
        INSN_3(ST, MEM, B),                        \
        INSN_3(ST, MEM, H),                        \
        INSN_3(ST, MEM, W),                        \
        INSN_3(ST, MEM, DW),                        \
        /* Load instructions. */                \
        /*   Register based. */                        \
        INSN_3(LDX, MEM, B),                        \
        INSN_3(LDX, MEM, H),                        \
        INSN_3(LDX, MEM, W),                        \
        INSN_3(LDX, MEM, DW),                        \
        INSN_3(LDX, MEMSX, B),                        \
        INSN_3(LDX, MEMSX, H),                        \
        INSN_3(LDX, MEMSX, W),                        \
        /*   Immediate based. */                \
        INSN_3(LD, IMM, DW)

/* Report whether @code is an opcode byte that may appear in a program:
 * either one listed in BPF_INSN_MAP() or one of the cBPF carry-over
 * opcodes that are UAPI exposed but rewritten before execution.
 *
 * Return: true if the opcode is known, false otherwise.
 */
bool bpf_opcode_in_insntable(u8 code)
{
#define BPF_OPCODE_2_OK(cls, op)        [BPF_##cls | BPF_##op] = true
#define BPF_OPCODE_3_OK(cls, op, mod)   [BPF_##cls | BPF_##op | BPF_##mod] = true
        static const bool opcode_is_public[256] = {
                /* Everything is rejected unless explicitly listed below. */
                [0 ... 255] = false,
                /* Opcodes from the shared instruction map. */
                BPF_INSN_MAP(BPF_OPCODE_2_OK, BPF_OPCODE_3_OK),
                /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
                [BPF_LD | BPF_ABS | BPF_B] = true,
                [BPF_LD | BPF_ABS | BPF_H] = true,
                [BPF_LD | BPF_ABS | BPF_W] = true,
                [BPF_LD | BPF_IND | BPF_B] = true,
                [BPF_LD | BPF_IND | BPF_H] = true,
                [BPF_LD | BPF_IND | BPF_W] = true,
                [BPF_JMP | BPF_JCOND] = true,
        };
#undef BPF_OPCODE_3_OK
#undef BPF_OPCODE_2_OK

        /* @code is a u8, so it can never index past the 256 entries. */
        return opcode_is_public[code];
}

#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
 *        ___bpf_prog_run - run eBPF program on a given context
 *        @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
 *        @insn: is the array of eBPF instructions
 *
 * Decode and execute eBPF instructions.
 *
 * Return: whatever value is in %BPF_R0 at program exit
 */
static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
#define BPF_INSN_2_LBL(x, y)    [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
        static const void * const jumptable[256] __annotate_jump_table = {
                [0 ... 255] = &&default_label,
                /* Now overwrite non-defaults ... */
                BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
                /* Non-UAPI available opcodes. */
                [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
                [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
                [BPF_ST  | BPF_NOSPEC] = &&ST_NOSPEC,
                [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
                [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
                [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
                [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW,
                [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B,
                [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H,
                [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W,
        };
#undef BPF_INSN_3_LBL
#undef BPF_INSN_2_LBL
        u32 tail_call_cnt = 0;

#define CONT         ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })

select_insn:
        goto *jumptable[insn->code];

        /* Explicitly mask the register-based shift amounts with 63 or 31
         * to avoid undefined behavior. Normally this won't affect the
         * generated code, for example, in case of native 64 bit archs such
         * as x86-64 or arm64, the compiler is optimizing the AND away for
         * the interpreter. In case of JITs, each of the JIT backends compiles
         * the BPF shift operations to machine instructions which produce
         * implementation-defined results in such a case; the resulting
         * contents of the register may be arbitrary, but program behaviour
         * as a whole remains defined. In other words, in case of JIT backends,
         * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
         */
        /* ALU (shifts) */
#define SHT(OPCODE, OP)                                        \
        ALU64_##OPCODE##_X:                                \
                DST = DST OP (SRC & 63);                \
                CONT;                                        \
        ALU_##OPCODE##_X:                                \
                DST = (u32) DST OP ((u32) SRC & 31);        \
                CONT;                                        \
        ALU64_##OPCODE##_K:                                \
                DST = DST OP IMM;                        \
                CONT;                                        \
        ALU_##OPCODE##_K:                                \
                DST = (u32) DST OP (u32) IMM;                \
                CONT;
        /* ALU (rest) */
#define ALU(OPCODE, OP)                                        \
        ALU64_##OPCODE##_X:                                \
                DST = DST OP SRC;                        \
                CONT;                                        \
        ALU_##OPCODE##_X:                                \
                DST = (u32) DST OP (u32) SRC;                \
                CONT;                                        \
        ALU64_##OPCODE##_K:                                \
                DST = DST OP IMM;                        \
                CONT;                                        \
        ALU_##OPCODE##_K:                                \
                DST = (u32) DST OP (u32) IMM;                \
                CONT;
        ALU(ADD,  +)
        ALU(SUB,  -)
        ALU(AND,  &)
        ALU(OR,   |)
        ALU(XOR,  ^)
        ALU(MUL,  *)
        SHT(LSH, <<)
        SHT(RSH, >>)
#undef SHT
#undef ALU
        ALU_NEG:
                DST = (u32) -DST;
                CONT;
        ALU64_NEG:
                DST = -DST;
                CONT;
        ALU_MOV_X:
                switch (OFF) {
                case 0:
                        DST = (u32) SRC;
                        break;
                case 8:
                        DST = (u32)(s8) SRC;
                        break;
                case 16:
                        DST = (u32)(s16) SRC;
                        break;
                }
                CONT;
        ALU_MOV_K:
                DST = (u32) IMM;
                CONT;
        ALU64_MOV_X:
                switch (OFF) {
                case 0:
                        DST = SRC;
                        break;
                case 8:
                        DST = (s8) SRC;
                        break;
                case 16:
                        DST = (s16) SRC;
                        break;
                case 32:
                        DST = (s32) SRC;
                        break;
                }
                CONT;
        ALU64_MOV_K:
                DST = IMM;
                CONT;
        LD_IMM_DW:
                DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
                insn++;
                CONT;
        ALU_ARSH_X:
                DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
                CONT;
        ALU_ARSH_K:
                DST = (u64) (u32) (((s32) DST) >> IMM);
                CONT;
        ALU64_ARSH_X:
                (*(s64 *) &DST) >>= (SRC & 63);
                CONT;
        ALU64_ARSH_K:
                (*(s64 *) &DST) >>= IMM;
                CONT;
        ALU64_MOD_X:
                switch (OFF) {
                case 0:
                        div64_u64_rem(DST, SRC, &AX);
                        DST = AX;
                        break;
                case 1:
                        AX = div64_s64(DST, SRC);
                        DST = DST - AX * SRC;
                        break;
                }
                CONT;
        ALU_MOD_X:
                switch (OFF) {
                case 0:
                        AX = (u32) DST;
                        DST = do_div(AX, (u32) SRC);
                        break;
                case 1:
                        AX = abs((s32)DST);
                        AX = do_div(AX, abs((s32)SRC));
                        if ((s32)DST < 0)
                                DST = (u32)-AX;
                        else
                                DST = (u32)AX;
                        break;
                }
                CONT;
        ALU64_MOD_K:
                switch (OFF) {
                case 0:
                        div64_u64_rem(DST, IMM, &AX);
                        DST = AX;
                        break;
                case 1:
                        AX = div64_s64(DST, IMM);
                        DST = DST - AX * IMM;
                        break;
                }
                CONT;
        ALU_MOD_K:
                switch (OFF) {
                case 0:
                        AX = (u32) DST;
                        DST = do_div(AX, (u32) IMM);
                        break;
                case 1:
                        AX = abs((s32)DST);
                        AX = do_div(AX, abs((s32)IMM));
                        if ((s32)DST < 0)
                                DST = (u32)-AX;
                        else
                                DST = (u32)AX;
                        break;
                }
                CONT;
        ALU64_DIV_X:
                switch (OFF) {
                case 0:
                        DST = div64_u64(DST, SRC);
                        break;
                case 1:
                        DST = div64_s64(DST, SRC);
                        break;
                }
                CONT;
        ALU_DIV_X:
                switch (OFF) {
                case 0:
                        AX = (u32) DST;
                        do_div(AX, (u32) SRC);
                        DST = (u32) AX;
                        break;
                case 1:
                        AX = abs((s32)DST);
                        do_div(AX, abs((s32)SRC));
                        if (((s32)DST < 0) == ((s32)SRC < 0))
                                DST = (u32)AX;
                        else
                                DST = (u32)-AX;
                        break;
                }
                CONT;
        ALU64_DIV_K:
                switch (OFF) {
                case 0:
                        DST = div64_u64(DST, IMM);
                        break;
                case 1:
                        DST = div64_s64(DST, IMM);
                        break;
                }
                CONT;
        ALU_DIV_K:
                switch (OFF) {
                case 0:
                        AX = (u32) DST;
                        do_div(AX, (u32) IMM);
                        DST = (u32) AX;
                        break;
                case 1:
                        AX = abs((s32)DST);
                        do_div(AX, abs((s32)IMM));
                        if (((s32)DST < 0) == ((s32)IMM < 0))
                                DST = (u32)AX;
                        else
                                DST = (u32)-AX;
                        break;
                }
                CONT;
        ALU_END_TO_BE:
                switch (IMM) {
                case 16:
                        DST = (__force u16) cpu_to_be16(DST);
                        break;
                case 32:
                        DST = (__force u32) cpu_to_be32(DST);
                        break;
                case 64:
                        DST = (__force u64) cpu_to_be64(DST);
                        break;
                }
                CONT;
        ALU_END_TO_LE:
                switch (IMM) {
                case 16:
                        DST = (__force u16) cpu_to_le16(DST);
                        break;
                case 32:
                        DST = (__force u32) cpu_to_le32(DST);
                        break;
                case 64:
                        DST = (__force u64) cpu_to_le64(DST);
                        break;
                }
                CONT;
        ALU64_END_TO_LE:
                switch (IMM) {
                case 16:
                        DST = (__force u16) __swab16(DST);
                        break;
                case 32:
                        DST = (__force u32) __swab32(DST);
                        break;
                case 64:
                        DST = (__force u64) __swab64(DST);
                        break;
                }
                CONT;

        /* CALL */
        JMP_CALL:
                /* Function call scratches BPF_R1-BPF_R5 registers,
                 * preserves BPF_R6-BPF_R9, and stores return value
                 * into BPF_R0.
                 */
                BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
                                                       BPF_R4, BPF_R5);
                CONT;

        JMP_CALL_ARGS:
                BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2,
                                                            BPF_R3, BPF_R4,
                                                            BPF_R5,
                                                            insn + insn->off + 1);
                CONT;

        JMP_TAIL_CALL: {
                struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
                struct bpf_array *array = container_of(map, struct bpf_array, map);
                struct bpf_prog *prog;
                u32 index = BPF_R3;

                if (unlikely(index >= array->map.max_entries))
                        goto out;

                if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
                        goto out;

                tail_call_cnt++;

                prog = READ_ONCE(array->ptrs[index]);
                if (!prog)
                        goto out;

                /* ARG1 at this point is guaranteed to point to CTX from
                 * the verifier side due to the fact that the tail call is
                 * handled like a helper, that is, bpf_tail_call_proto,
                 * where arg1_type is ARG_PTR_TO_CTX.
                 */
                insn = prog->insnsi;
                goto select_insn;
out:
                CONT;
        }
        JMP_JA:
                insn += insn->off;
                CONT;
        JMP32_JA:
                insn += insn->imm;
                CONT;
        JMP_EXIT:
                return BPF_R0;
        /* JMP */
#define COND_JMP(SIGN, OPCODE, CMP_OP)                                \
        JMP_##OPCODE##_X:                                        \
                if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;                                                \
        JMP32_##OPCODE##_X:                                        \
                if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;                                                \
        JMP_##OPCODE##_K:                                        \
                if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;                                                \
        JMP32_##OPCODE##_K:                                        \
                if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) {        \
                        insn += insn->off;                        \
                        CONT_JMP;                                \
                }                                                \
                CONT;
        COND_JMP(u, JEQ, ==)
        COND_JMP(u, JNE, !=)
        COND_JMP(u, JGT, >)
        COND_JMP(u, JLT, <)
        COND_JMP(u, JGE, >=)
        COND_JMP(u, JLE, <=)
        COND_JMP(u, JSET, &)
        COND_JMP(s, JSGT, >)
        COND_JMP(s, JSLT, <)
        COND_JMP(s, JSGE, >=)
        COND_JMP(s, JSLE, <=)
#undef COND_JMP
        /* ST, STX and LDX*/
        ST_NOSPEC:
                /* Speculation barrier for mitigating Speculative Store Bypass.
                 * In case of arm64, we rely on the firmware mitigation as
                 * controlled via the ssbd kernel parameter. Whenever the
                 * mitigation is enabled, it works for all of the kernel code
                 * with no need to provide any additional instructions here.
                 * In case of x86, we use 'lfence' insn for mitigation. We
                 * reuse preexisting logic from Spectre v1 mitigation that
                 * happens to produce the required code on x86 for v4 as well.
                 */
                barrier_nospec();
                CONT;
#define LDST(SIZEOP, SIZE)                                                \
        STX_MEM_##SIZEOP:                                                \
                *(SIZE *)(unsigned long) (DST + insn->off) = SRC;        \
                CONT;                                                        \
        ST_MEM_##SIZEOP:                                                \
                *(SIZE *)(unsigned long) (DST + insn->off) = IMM;        \
                CONT;                                                        \
        LDX_MEM_##SIZEOP:                                                \
                DST = *(SIZE *)(unsigned long) (SRC + insn->off);        \
                CONT;                                                        \
        LDX_PROBE_MEM_##SIZEOP:                                                \
                bpf_probe_read_kernel_common(&DST, sizeof(SIZE),        \
                              (const void *)(long) (SRC + insn->off));        \
                DST = *((SIZE *)&DST);                                        \
                CONT;

        LDST(B,   u8)
        LDST(H,  u16)
        LDST(W,  u32)
        LDST(DW, u64)
#undef LDST

#define LDSX(SIZEOP, SIZE)                                                \
        LDX_MEMSX_##SIZEOP:                                                \
                DST = *(SIZE *)(unsigned long) (SRC + insn->off);        \
                CONT;                                                        \
        LDX_PROBE_MEMSX_##SIZEOP:                                        \
                bpf_probe_read_kernel_common(&DST, sizeof(SIZE),                \
                                      (const void *)(long) (SRC + insn->off));        \
                DST = *((SIZE *)&DST);                                        \
                CONT;

        LDSX(B,   s8)
        LDSX(H,  s16)
        LDSX(W,  s32)
#undef LDSX

#define ATOMIC_ALU_OP(BOP, KOP)                                                \
                case BOP:                                                \
                        if (BPF_SIZE(insn->code) == BPF_W)                \
                                atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
                                             (DST + insn->off));        \
                        else                                                \
                                atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
                                               (DST + insn->off));        \
                        break;                                                \
                case BOP | BPF_FETCH:                                        \
                        if (BPF_SIZE(insn->code) == BPF_W)                \
                                SRC = (u32) atomic_fetch_##KOP(                \
                                        (u32) SRC,                        \
                                        (atomic_t *)(unsigned long) (DST + insn->off)); \
                        else                                                \
                                SRC = (u64) atomic64_fetch_##KOP(        \
                                        (u64) SRC,                        \
                                        (atomic64_t *)(unsigned long) (DST + insn->off)); \
                        break;

        STX_ATOMIC_DW:
        STX_ATOMIC_W:
                switch (IMM) {
                ATOMIC_ALU_OP(BPF_ADD, add)
                ATOMIC_ALU_OP(BPF_AND, and)
                ATOMIC_ALU_OP(BPF_OR, or)
                ATOMIC_ALU_OP(BPF_XOR, xor)
#undef ATOMIC_ALU_OP

                case BPF_XCHG:
                        if (BPF_SIZE(insn->code) == BPF_W)
                                SRC = (u32) atomic_xchg(
                                        (atomic_t *)(unsigned long) (DST + insn->off),
                                        (u32) SRC);
                        else
                                SRC = (u64) atomic64_xchg(
                                        (atomic64_t *)(unsigned long) (DST + insn->off),
                                        (u64) SRC);
                        break;
                case BPF_CMPXCHG:
                        if (BPF_SIZE(insn->code) == BPF_W)
                                BPF_R0 = (u32) atomic_cmpxchg(
                                        (atomic_t *)(unsigned long) (DST + insn->off),
                                        (u32) BPF_R0, (u32) SRC);
                        else
                                BPF_R0 = (u64) atomic64_cmpxchg(
                                        (atomic64_t *)(unsigned long) (DST + insn->off),
                                        (u64) BPF_R0, (u64) SRC);
                        break;

                default:
                        goto default_label;
                }
                CONT;

        default_label:
                /* If we ever reach this, we have a bug somewhere. Die hard here
                 * instead of just returning 0; we could be somewhere in a subprog,
                 * so execution could continue otherwise which we do /not/ want.
                 *
                 * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
                 */
                pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n",
                        insn->code, insn->imm);
                BUG_ON(1);
                return 0;
}

/* Generate an interpreter entry point named __bpf_prog_run<stack_size>.
 *
 * Each generated function reserves stack_size bytes for the BPF stack in
 * its own frame, starts with a zero-initialized register file, marks the
 * stack as initialized for KMSAN, points FP one past the end of the stack
 * array, stores @ctx in ARG1 and then runs ___bpf_prog_run(). The u64
 * result of the interpreter is truncated to unsigned int on return.
 *
 * NOTE(review): FP and ARG1 are defined outside this block; presumably
 * they alias slots of the local regs[] array - confirm.
 */
#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
#define DEFINE_BPF_PROG_RUN(stack_size) \
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
        u64 stack[stack_size / sizeof(u64)]; \
        u64 regs[MAX_BPF_EXT_REG] = {}; \
\
        kmsan_unpoison_memory(stack, sizeof(stack)); \
        FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
        ARG1 = (u64) (unsigned long) ctx; \
        return ___bpf_prog_run(regs, insn); \
}

/* Generate an interpreter entry point named __bpf_prog_run_args<stack_size>
 * that takes the five BPF argument registers by value.
 *
 * Each generated function reserves stack_size bytes for the BPF stack in
 * its own frame, marks it as initialized for KMSAN, points FP one past the
 * end of the stack array, loads BPF_R1..BPF_R5 from the parameters and
 * runs ___bpf_prog_run(), returning its full u64 result.
 *
 * Unlike DEFINE_BPF_PROG_RUN(), regs[] is not zero-initialized here; only
 * the registers assigned below have a defined value on entry.
 */
#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \
static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
                                      const struct bpf_insn *insn) \
{ \
        u64 stack[stack_size / sizeof(u64)]; \
        u64 regs[MAX_BPF_EXT_REG]; \
\
        kmsan_unpoison_memory(stack, sizeof(stack)); \
        FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
        BPF_R1 = r1; \
        BPF_R2 = r2; \
        BPF_R3 = r3; \
        BPF_R4 = r4; \
        BPF_R5 = r5; \
        return ___bpf_prog_run(regs, insn); \
}

/* EVALn(FN, a1, ..., an) expands to FN(a1) FN(a2) ... FN(an), i.e. it
 * applies FN to each of its n remaining arguments in order. Each level
 * peels off the first argument and hands the rest to the next lower level.
 */
#define EVAL1(FN, X) FN(X)
#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)

/* Instantiate one interpreter entry point per stack size, from 32 to 512
 * bytes in steps of 32 (16 variants).
 */
EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);

/* Same set of stack sizes for the variants taking r1..r5 by value. */
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192);
EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384);
EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512);

/* PROG_NAME_LIST() turns a stack size into one array element (function
 * name plus trailing comma); it is redefined between the two tables.
 */
#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),

/* Interpreter entry points ordered by stack size: element i is the
 * variant with a stack of (i + 1) * 32 bytes.
 */
static unsigned int (*interpreters[])(const void *ctx,
                                      const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST
#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size),
/* Same layout for the variants taking r1..r5 by value; indexed by
 * bpf_patch_call_args() with round_up(stack_depth, 32) / 32 - 1.
 */
static __maybe_unused
u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5,
                           const struct bpf_insn *insn) = {
EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
#undef PROG_NAME_LIST

#ifdef CONFIG_BPF_SYSCALL
/* Rewrite a call instruction so that it targets the argument-passing
 * interpreter matching @stack_depth.
 *
 * The previous immediate is preserved (truncated to s16) in insn->off,
 * insn->imm becomes the distance of the chosen interpreter from
 * __bpf_call_base_args, and the opcode is switched to
 * BPF_JMP | BPF_CALL_ARGS.  A depth of 0 is treated as 1 so the
 * smallest (32 byte) variant is selected.
 */
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
{
        u32 slot;

        stack_depth = max_t(u32, stack_depth, 1);
        slot = round_up(stack_depth, 32) / 32 - 1;

        /* Save the old immediate before it gets overwritten below. */
        insn->off = (s16) insn->imm;
        insn->imm = interpreters_args[slot] - __bpf_call_base_args;
        insn->code = BPF_JMP | BPF_CALL_ARGS;
}
#endif
#else
/* Placeholder bpf_func used when no interpreter is built in: emits a
 * one-time warning and returns 0.  Both arguments are ignored.
 */
static unsigned int __bpf_prog_ret0_warn(const void *ctx,
                                         const struct bpf_insn *insn)
{
        /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
         * is not working properly, so warn about it!
         */
        WARN_ON_ONCE(1);
        return 0;
}
#endif

/* Check whether program @fp may be placed into program-holding @map.
 *
 * The first program checked against a map becomes its "owner": its
 * resolved type, JIT state and xdp_has_frags flag are recorded.  Every
 * later program must match all three.  The owner record is read and
 * written under map->owner.lock.
 *
 * Return: true if @fp is compatible (or just became the owner),
 * false otherwise.
 */
bool bpf_prog_map_compatible(struct bpf_map *map,
                             const struct bpf_prog *fp)
{
        enum bpf_prog_type prog_type = resolve_prog_type(fp);
        bool compatible;

        /* Programs using kprobe override are never accepted.
         *
         * XDP programs inserted into maps are not guaranteed to run on
         * a particular netdev (and can run outside driver context entirely
         * in the case of devmap and cpumap). Until device checks
         * are implemented, prohibit adding dev-bound programs to program maps.
         */
        if (fp->kprobe_override || bpf_prog_is_dev_bound(fp->aux))
                return false;

        spin_lock(&map->owner.lock);
        if (map->owner.type) {
                compatible = map->owner.type == prog_type &&
                             map->owner.jited == fp->jited &&
                             map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
        } else {
                /* No owner recorded yet, so there is nothing to compare
                 * against: adopt this program's properties.
                 */
                map->owner.type = prog_type;
                map->owner.jited = fp->jited;
                map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
                compatible = true;
        }
        spin_unlock(&map->owner.lock);

        return compatible;
}

/* Verify that @fp is compatible with every program-holding map it uses.
 *
 * Walks aux->used_maps under used_maps_mutex and stops at the first map
 * that contains programs and rejects @fp.
 *
 * Return: 0 if all such maps accept @fp, -EINVAL otherwise.
 */
static int bpf_check_tail_call(const struct bpf_prog *fp)
{
        struct bpf_prog_aux *aux = fp->aux;
        int err = 0;
        int idx;

        mutex_lock(&aux->used_maps_mutex);
        for (idx = 0; idx < aux->used_map_cnt; idx++) {
                struct bpf_map *map = aux->used_maps[idx];

                if (map_type_contains_progs(map) &&
                    !bpf_prog_map_compatible(map, fp)) {
                        err = -EINVAL;
                        break;
                }
        }
        mutex_unlock(&aux->used_maps_mutex);

        return err;
}

static void bpf_prog_select_func(struct bpf_prog *fp)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
        u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);

        fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
#else
        fp->bpf_func = __bpf_prog_ret0_warn;
#endif
}

/**
 *        bpf_prog_select_runtime - select exec runtime for BPF program
 *        @fp: bpf_prog populated with BPF program
 *        @err: pointer to error variable
 *
 * Try to JIT eBPF program, if JIT is not available, use interpreter.
 * The BPF program will be executed via bpf_prog_run() function.
 *
 * A program that already has a bpf_func skips selection and
 * compilation and goes straight to the read-only lock and the tail
 * call compatibility check.
 *
 * Return: the &fp argument along with &err set to 0 for success or
 * a negative errno code on failure
 */
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
        bool jit_needed;

        /* In case of BPF to BPF calls, verifier did all the prep
         * work with regards to JITing, etc.
         */
        if (fp->bpf_func)
                goto finalize;

        /* The interpreter fallback is not acceptable when the JIT is
         * mandatory or when the program calls kfuncs.
         */
        jit_needed = IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
                     bpf_prog_has_kfunc_call(fp);

        bpf_prog_select_func(fp);

        if (bpf_prog_is_offloaded(fp->aux)) {
                *err = bpf_prog_offload_compile(fp);
                if (*err)
                        return fp;
        } else {
                *err = bpf_prog_alloc_jited_linfo(fp);
                if (*err)
                        return fp;

                /* eBPF JITs can rewrite the program in case constant
                 * blinding is active. However, in case of error during
                 * blinding, bpf_int_jit_compile() must always return a
                 * valid program, which in this case would simply not
                 * be JITed, but falls back to the interpreter.
                 */
                fp = bpf_int_jit_compile(fp);
                bpf_prog_jit_attempt_done(fp);
                if (jit_needed && !fp->jited) {
                        *err = -ENOTSUPP;
                        return fp;
                }
        }

finalize:
        *err = bpf_prog_lock_ro(fp);
        if (*err)
                return fp;

        /* The tail call compatibility check can only be done at
         * this late stage as we need to determine, if we deal
         * with JITed or non JITed program concatenations and not
         * all eBPF JITs might immediately support all features.
         */
        *err = bpf_check_tail_call(fp);

        return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);

/* Trivial bpf_func that ignores its arguments and always returns 1.
 * Used as the entry point of dummy_bpf_prog.
 */
static unsigned int __bpf_prog_ret1(const void *ctx,
                                    const struct bpf_insn *insn)
{
        return 1;
}

/* Placeholder program whose bpf_func always returns 1.  The prog array
 * helpers in this file write its address into a slot to mark the slot
 * as deleted, and skip such slots when counting or copying.
 */
static struct bpf_prog_dummy {
        struct bpf_prog prog;
} dummy_bpf_prog = {
        .prog = {
                .bpf_func = __bpf_prog_ret1,
        },
};

/* Shared, statically allocated empty program array.  Its header is what
 * bpf_prog_array_alloc() hands out for a zero-length request, and the
 * free helpers recognise it and never free it.
 */
struct bpf_empty_prog_array bpf_empty_prog_array = {
        .null_prog = NULL,
};
EXPORT_SYMBOL(bpf_empty_prog_array);

/* Allocate a zeroed program array with room for @prog_cnt items plus
 * one extra slot.
 *
 * A request for zero programs does not allocate: the shared
 * bpf_empty_prog_array header is returned instead.
 *
 * Return: the array, or NULL if the allocation failed.
 */
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
        struct bpf_prog_array *p;

        if (!prog_cnt)
                return &bpf_empty_prog_array.hdr;

        p = kzalloc(struct_size(p, items, prog_cnt + 1), flags);
        return p;
}

/* Free @progs via kfree_rcu().  NULL and the shared empty array are
 * silently ignored.
 */
void bpf_prog_array_free(struct bpf_prog_array *progs)
{
        if (progs && progs != &bpf_empty_prog_array.hdr) {
                kfree_rcu(progs, rcu);
        }
}

/* RCU callback for bpf_prog_array_free_sleepable(): releases the array
 * that embeds @rcu.
 */
static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
{
        struct bpf_prog_array *progs =
                container_of(rcu, struct bpf_prog_array, rcu);

        /* Only when an RCU Tasks Trace grace period does not also imply
         * a regular RCU grace period do we have to defer once more
         * through kfree_rcu(); otherwise freeing right away is fine.
         */
        if (!rcu_trace_implies_rcu_gp()) {
                kfree_rcu(progs, rcu);
                return;
        }

        kfree(progs);
}

/* Free @progs after an RCU Tasks Trace grace period by queueing
 * __bpf_prog_array_free_sleepable_cb().  NULL and the shared empty
 * array are silently ignored.
 */
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
{
        if (progs && progs != &bpf_empty_prog_array.hdr)
                call_rcu_tasks_trace(&progs->rcu,
                                     __bpf_prog_array_free_sleepable_cb);
}

/* Count the programs in the NULL-terminated @array, not counting slots
 * that hold the dummy placeholder program.
 */
int bpf_prog_array_length(struct bpf_prog_array *array)
{
        struct bpf_prog_array_item *cur = array->items;
        u32 live = 0;

        while (cur->prog) {
                if (cur->prog != &dummy_bpf_prog.prog)
                        live++;
                cur++;
        }

        return live;
}

/* Return true if the NULL-terminated @array holds no program other
 * than dummy placeholders, false as soon as a real program is found.
 */
bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
{
        struct bpf_prog_array_item *cur = array->items;

        while (cur->prog) {
                if (cur->prog != &dummy_bpf_prog.prog)
                        return false;
                cur++;
        }

        return true;
}

/* Copy the ids of up to @request_cnt non-dummy programs from @array
 * into @prog_ids.
 *
 * Return: true if the walk stopped because @request_cnt ids were
 * written and the slot following the last copied one is not the
 * terminating NULL (that slot may hold a dummy), false if the end of
 * the array was reached.
 */
static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
                                     u32 *prog_ids,
                                     u32 request_cnt)
{
        struct bpf_prog_array_item *cur = array->items;
        int copied = 0;

        while (cur->prog) {
                struct bpf_prog *prog = cur->prog;

                /* Always step past the slot just examined so that, on
                 * exit, cur points at the first slot not yet looked at.
                 */
                cur++;
                if (prog == &dummy_bpf_prog.prog)
                        continue;

                prog_ids[copied++] = prog->aux->id;
                if (copied == request_cnt)
                        break;
        }

        return cur->prog != NULL;
}

/* Copy up to @cnt program ids from @array to the user buffer @prog_ids.
 *
 * The ids are first gathered in a zeroed kernel buffer of @cnt entries,
 * and the whole buffer is then copied out.
 *
 * Return: 0 on success, -ENOMEM if the temporary buffer cannot be
 * allocated, -EFAULT if the copy to user space fails, -ENOSPC if the
 * copy stopped at @cnt ids before the array's terminating NULL slot.
 */
int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
                                __u32 __user *prog_ids, u32 cnt)
{
        unsigned long uncopied;
        bool truncated;
        u32 *ids;

        /* users of this function are doing:
         * cnt = bpf_prog_array_length();
         * if (cnt > 0)
         *     bpf_prog_array_copy_to_user(..., cnt);
         * so below kcalloc doesn't need extra cnt > 0 check.
         */
        ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
        if (!ids)
                return -ENOMEM;

        truncated = bpf_prog_array_copy_core(array, ids, cnt);
        uncopied = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
        kfree(ids);

        /* A failed copy takes precedence over truncation. */
        if (uncopied)
                return -EFAULT;

        return truncated ? -ENOSPC : 0;
}

/* Replace the first slot of @array holding @old_prog with the dummy
 * placeholder program.  The array itself is neither resized nor
 * reallocated; nothing happens if @old_prog is not present.
 */
void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
                                struct bpf_prog *old_prog)
{
        struct bpf_prog_array_item *cur = array->items;

        while (cur->prog) {
                if (cur->prog == old_prog) {
                        WRITE_ONCE(cur->prog, &dummy_bpf_prog.prog);
                        return;
                }
                cur++;
        }
}

/**
 * bpf_prog_array_delete_safe_at() - Replaces the program at the given
 *                                   index into the program array with
 *                                   a dummy no-op program.
 * @array: a bpf_prog_array
 * @index: the index of the program to replace
 *
 * Skips over dummy programs, by not counting them, when calculating
 * the position of the program to replace.
 *
 * Implemented as bpf_prog_array_update_at() with the dummy program as
 * the replacement, so the return values are those of that function.
 *
 * Return:
 * * 0                - Success
 * * -EINVAL        - Invalid index value. Must be a non-negative integer.
 * * -ENOENT        - Index out of range
 */
int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
{
        return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
}

/**
 * bpf_prog_array_update_at() - Updates the program at the given index
 *                              into the program array.
 * @array: a bpf_prog_array
 * @index: the index of the program to update
 * @prog: the program to insert into the array
 *
 * Skips over dummy programs, by not counting them, when calculating
 * the position of the program to update.
 *
 * Return:
 * * 0                - Success
 * * -EINVAL        - Invalid index value. Must be a non-negative integer.
 * * -ENOENT        - Index out of range
 */
int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
                             struct bpf_prog *prog)
{
        struct bpf_prog_array_item *cur;
        int remaining = index;

        if (unlikely(index < 0))
                return -EINVAL;

        for (cur = array->items; cur->prog; cur++) {
                /* Placeholder slots do not consume an index. */
                if (cur->prog == &dummy_bpf_prog.prog)
                        continue;

                if (remaining-- == 0) {
                        WRITE_ONCE(cur->prog, prog);
                        return 0;
                }
        }

        return -ENOENT;
}

/* Build a new program array from @old_array, optionally leaving out
 * @exclude_prog and optionally appending @include_prog with
 * @bpf_cookie.  Dummy placeholder slots are never carried over.
 *
 * On success *@new_array is the freshly allocated array, or NULL when
 * the result would contain no program at all.  @old_array is left
 * untouched and may be NULL.
 *
 * Return: 0 on success, -EEXIST if @include_prog is already present in
 * @old_array, -ENOENT if @exclude_prog was given but not found,
 * -ENOMEM if the allocation failed.
 */
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
                        struct bpf_prog *exclude_prog,
                        struct bpf_prog *include_prog,
                        u64 bpf_cookie,
                        struct bpf_prog_array **new_array)
{
        struct bpf_prog_array_item *src, *dst;
        struct bpf_prog_array *array;
        bool excluded_seen = false;
        int kept = 0;
        int total;

        /* First pass: count what survives and detect conflicts. */
        if (old_array) {
                for (src = old_array->items; src->prog; src++) {
                        if (src->prog == exclude_prog) {
                                excluded_seen = true;
                        } else {
                                if (src->prog != &dummy_bpf_prog.prog)
                                        kept++;
                                if (src->prog == include_prog)
                                        return -EEXIST;
                        }
                }
        }

        if (exclude_prog && !excluded_seen)
                return -ENOENT;

        /* Number of real (non-NULL) programs in the result. */
        total = kept;
        if (include_prog)
                total += 1;

        /* Nothing left at all: report an absent array, not an empty one. */
        if (total == 0) {
                *new_array = NULL;
                return 0;
        }

        /* +1 as the end of prog_array is marked with NULL */
        array = bpf_prog_array_alloc(total + 1, GFP_KERNEL);
        if (!array)
                return -ENOMEM;
        dst = array->items;

        /* Second pass: copy the survivors together with their cookies. */
        if (kept) {
                for (src = old_array->items; src->prog; src++) {
                        if (src->prog != exclude_prog &&
                            src->prog != &dummy_bpf_prog.prog) {
                                dst->prog = src->prog;
                                dst->bpf_cookie = src->bpf_cookie;
                                dst++;
                        }
                }
        }

        if (include_prog) {
                dst->prog = include_prog;
                dst->bpf_cookie = bpf_cookie;
                dst++;
        }

        dst->prog = NULL;
        *new_array = array;
        return 0;
}

/* Report how many programs @array holds and optionally copy their ids.
 *
 * *@prog_cnt always receives the number of non-dummy programs (0 for a
 * NULL @array).  Ids are copied into @prog_ids only when both
 * @request_cnt and that count are non-zero.
 *
 * Return: 0 on success, -ENOSPC if the copy stopped at @request_cnt
 * ids before reaching the array's terminating NULL slot.
 */
int bpf_prog_array_copy_info(struct bpf_prog_array *array,
                             u32 *prog_ids, u32 request_cnt,
                             u32 *prog_cnt)
{
        u32 cnt = array ? bpf_prog_array_length(array) : 0;

        *prog_cnt = cnt;

        /* return early if user requested only program count or nothing to copy */
        if (!request_cnt || !cnt)
                return 0;

        /* this function is called under trace/bpf_trace.c: bpf_event_mutex */
        if (bpf_prog_array_copy_core(array, prog_ids, request_cnt))
                return -ENOSPC;

        return 0;
}

/* Drop the references a program holds on the first @len maps of
 * @used_maps.
 *
 * For each map: call its map_poke_untrack op if it has one, decrement
 * sleepable_refcnt when the owning program is sleepable, then put the
 * map.  The @used_maps array itself is not freed here.
 */
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
                          struct bpf_map **used_maps, u32 len)
{
        const bool sleepable = aux->prog->sleepable;
        u32 idx;

        for (idx = 0; idx < len; idx++) {
                struct bpf_map *map = used_maps[idx];

                if (map->ops->map_poke_untrack)
                        map->ops->map_poke_untrack(map, aux);
                if (sleepable)
                        atomic64_dec(&map->sleepable_refcnt);
                bpf_map_put(map);
        }
}

/* Release every map referenced by @aux and then free the used_maps
 * pointer array itself.
 */
static void bpf_free_used_maps(struct bpf_prog_aux *aux)
{
        __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);
        kfree(aux->used_maps);
}

/* Drop the references held by the first @len entries of @used_btfs:
 * the module reference, if the entry has one, and the BTF reference.
 * Compiles to an empty function without CONFIG_BPF_SYSCALL.  The
 * @used_btfs array itself is not freed here.
 */
void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
                          struct btf_mod_pair *used_btfs, u32 len)
{
#ifdef CONFIG_BPF_SYSCALL
        u32 idx;

        for (idx = 0; idx < len; idx++) {
                struct btf_mod_pair *pair = &used_btfs[idx];

                if (pair->module)
                        module_put(pair->module);
                btf_put(pair->btf);
        }
#endif
}

/* Release every BTF/module pair referenced by @aux and then free the
 * used_btfs array itself.
 */
static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
{
        __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt);
        kfree(aux->used_btfs);
}

/* Work item that performs the actual teardown of a program; queued by
 * bpf_prog_free().
 *
 * @work is the work_struct embedded in the program's bpf_prog_aux.
 * Releases, in order: the kfunc BTF table, the cgroup attach type, used
 * maps and BTFs, device-bound state, callchain buffers, the destination
 * trampoline, every subprogram in aux->func[], and finally the main
 * program itself.
 */
static void bpf_prog_free_deferred(struct work_struct *work)
{
        struct bpf_prog_aux *aux;
        int i;

        aux = container_of(work, struct bpf_prog_aux, work);
#ifdef CONFIG_BPF_SYSCALL
        bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
#endif
#ifdef CONFIG_CGROUP_BPF
        if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
                bpf_cgroup_atype_put(aux->cgroup_atype);
#endif
        bpf_free_used_maps(aux);
        bpf_free_used_btfs(aux);
        if (bpf_prog_is_dev_bound(aux))
                bpf_prog_dev_bound_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
        if (aux->prog->has_callchain_buf)
                put_callchain_buffers();
#endif
        if (aux->dst_trampoline)
                bpf_trampoline_put(aux->dst_trampoline);
        for (i = 0; i < aux->real_func_cnt; i++) {
                /* We can just unlink the subprog poke descriptor table as
                 * it was originally linked to the main program and is also
                 * released along with it.
                 */
                aux->func[i]->aux->poke_tab = NULL;
                bpf_jit_free(aux->func[i]);
        }
        /* With subprograms the main program is released through
         * bpf_prog_unlock_free() after the func[] array; without them
         * it goes through bpf_jit_free() directly.
         */
        if (aux->real_func_cnt) {
                kfree(aux->func);
                bpf_prog_unlock_free(aux->prog);
        } else {
                bpf_jit_free(aux->prog);
        }
}

/* Free program @fp.
 *
 * Drops the reference on the destination program (if any) and on the
 * token right away, then defers the rest of the teardown to
 * bpf_prog_free_deferred() via schedule_work().
 */
void bpf_prog_free(struct bpf_prog *fp)
{
        struct bpf_prog_aux *aux = fp->aux;

        if (aux->dst_prog)
                bpf_prog_put(aux->dst_prog);
        bpf_token_put(aux->token);
        INIT_WORK(&aux->work, bpf_prog_free_deferred);
        schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);

/* RNG for unprivileged user space with separated state from prandom_u32(). */
static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);

/* Initialise the per-CPU bpf_user_rnd_state through prandom_init_once(). */
void bpf_user_rnd_init_once(void)
{
        prandom_init_once(&bpf_user_rnd_state);
}

/* Return a pseudo-random u32 drawn from this CPU's bpf_user_rnd_state. */
BPF_CALL_0(bpf_user_rnd_u32)
{
        /* Should someone ever have the rather unwise idea to use some
         * of the registers passed into this function, then note that
         * this function is called from native eBPF and classic-to-eBPF
         * transformations. Register assignments from both sides are
         * different, f.e. classic always sets fn(ctx, A, X) here.
         */
        struct rnd_state *rnd = &get_cpu_var(bpf_user_rnd_state);
        u32 val = prandom_u32_state(rnd);

        put_cpu_var(bpf_user_rnd_state);

        return val;
}

/* Return the id of the current CPU as given by raw_smp_processor_id(). */
BPF_CALL_0(bpf_get_raw_cpu_id)
{
        return raw_smp_processor_id();
}

/* Weak definitions of helper functions in case we don't have bpf syscall.
 * Each is a zero-initialized placeholder; being __weak, it gives way to a
 * regular definition of the same symbol when one is linked in.
 */
/* Map access helpers, spin lock helpers and jiffies. */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak;
const struct bpf_func_proto bpf_spin_lock_proto __weak;
const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_jiffies64_proto __weak;

/* Random number, CPU/NUMA id and clock helpers. */
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak;

/* Current-task, cgroup, BTF printing and return-value helpers. */
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
const struct bpf_func_proto bpf_set_retval_proto __weak;
const struct bpf_func_proto bpf_get_retval_proto __weak;

/* Weak default: no trace_printk helper prototype is available, so NULL
 * is returned unless a non-weak definition overrides this.
 */
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
        return NULL;
}

/* Weak default: no trace_vprintk helper prototype is available, so NULL
 * is returned unless a non-weak definition overrides this.
 */
const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
{
        return NULL;
}

/* Weak default for bpf_event_output(): ignores all arguments and
 * returns -ENOTSUPP (converted to the u64 return type).
 */
u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
                 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
        return -ENOTSUPP;
}
EXPORT_SYMBOL_GPL(bpf_event_output);

/* Always built-in helper functions. */
/* Prototype of the tail call helper.  .func is NULL, so there is no C
 * implementation attached to this prototype; it only describes the
 * argument types (ctx, constant map pointer, anything) and a void
 * return.
 */
const struct bpf_func_proto bpf_tail_call_proto = {
        .func                = NULL,
        .gpl_only        = false,
        .ret_type        = RET_VOID,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
};

/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
 * It is encouraged to implement bpf_int_jit_compile() instead, so that
 * eBPF and implicitly also cBPF can get JITed!
 *
 * This weak default returns @prog unchanged.
 */
struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
{
        return prog;
}

/* Stub for JITs that support eBPF. All cBPF code gets transformed into
 * eBPF by the kernel and is later compiled by bpf_int_jit_compile().
 *
 * This weak default does nothing.
 */
void __weak bpf_jit_compile(struct bpf_prog *prog)
{
}

/* Weak default: reports that helper @func does not change packet data. */
bool __weak bpf_helper_changes_pkt_data(void *func)
{
        return false;
}

/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
 * analysis code and wants explicit zero extension inserted by verifier.
 * Otherwise, return FALSE.
 *
 * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
 * you don't override this. JITs that don't want these extra insns can detect
 * them using insn_is_zext.
 */
bool __weak bpf_jit_needs_zext(void)
{
        return false;
}

/* Return true if the JIT inlines the call to the helper corresponding to
 * the imm.
 *
 * The verifier will not patch the insn->imm for the call to the helper if
 * this returns true.
 */
bool __weak bpf_jit_inlines_helper_call(s32 imm)
{
        return false;
}

/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
bool __weak bpf_jit_supports_subprog_tailcalls(void)
{
        return false;
}

/* Weak default: per-CPU instruction support is reported as absent. */
bool __weak bpf_jit_supports_percpu_insn(void)
{
        return false;
}

/* Weak default: kfunc call support is reported as absent. */
bool __weak bpf_jit_supports_kfunc_call(void)
{
        return false;
}

/* Weak default: far kfunc call support is reported as absent. */
bool __weak bpf_jit_supports_far_kfunc_call(void)
{
        return false;
}

/* Weak default: arena support is reported as absent. */
bool __weak bpf_jit_supports_arena(void)
{
        return false;
}

/* Weak default: no individual instruction is reported as supported,
 * regardless of @insn and @in_arena.
 */
bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
{
        return false;
}

/* Weak default for the architecture's user address limit: TASK_SIZE on
 * 64-bit configurations that also select
 * CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE, 0 otherwise.
 */
u64 __weak bpf_arch_uaddress_limit(void)
{
#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
        return TASK_SIZE;
#else
        return 0;
#endif
}

/* Return TRUE if the JIT backend satisfies the following two conditions:
 * 1) JIT backend supports atomic_xchg() on pointer-sized words.
 * 2) Under the specific arch, the implementation of xchg() is the same
 *    as atomic_xchg() on pointer-sized words.
 */
bool __weak bpf_jit_supports_ptr_xchg(void)
{
        return false;
}

/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
 * The weak version always fails with -EFAULT.
 */
int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
                         int len)
{
        return -EFAULT;
}

/* Weak default: text poking is unavailable, always -ENOTSUPP. */
int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
                              void *addr1, void *addr2)
{
        return -ENOTSUPP;
}

/* Weak default: text copy is unavailable; returns ERR_PTR(-ENOTSUPP),
 * so callers must check the result with the ERR_PTR helpers, not
 * against NULL.
 */
void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
{
        return ERR_PTR(-ENOTSUPP);
}

/* Weak default: text invalidation is unavailable, always -ENOTSUPP. */
int __weak bpf_arch_text_invalidate(void *dst, size_t len)
{
        return -ENOTSUPP;
}

/* Weak default: exception support is reported as absent. */
bool __weak bpf_jit_supports_exceptions(void)
{
        return false;
}

/* Weak default stack walker: does nothing, @consume_fn is never called. */
void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
}

/* for configs without MMU or 32-bit */
/* Weak, zero-initialized arena map ops plus weak accessors that report
 * 0 for both the user and the kernel VM start of an arena.
 */
__weak const struct bpf_map_ops arena_map_ops;
__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
{
        return 0;
}
__weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
        return 0;
}

#ifdef CONFIG_BPF_SYSCALL
/* Late initcall that sets up the global BPF memory allocator.
 * bpf_global_ma_set ends up true exactly when bpf_mem_alloc_init()
 * returned 0; the init result is also returned to the initcall
 * machinery.
 */
static int __init bpf_global_ma_init(void)
{
        int ret;

        ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
        bpf_global_ma_set = !ret;
        return ret;
}
late_initcall(bpf_global_ma_init);
#endif

/* Static key, initially false, exported for use outside this file. */
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);

/* All definitions of tracepoints related to BPF. */
/* CREATE_TRACE_POINTS must be defined before the include so that the
 * header emits the tracepoint definitions rather than declarations only.
 */
#define CREATE_TRACE_POINTS
#include <linux/bpf_trace.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);









































































    4 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_HARDIRQ_H
#define _ASM_X86_HARDIRQ_H

#include <linux/threads.h>
#include <asm/current.h>

/* Interrupt statistics record, cacheline aligned.  Most members are
 * only present when the corresponding config option is enabled, so the
 * layout varies between kernel configurations.  Besides the counters it
 * also carries the kvm_cpu_l1tf_flush_l1d flag byte under
 * CONFIG_KVM_INTEL.
 */
typedef struct {
#if IS_ENABLED(CONFIG_KVM_INTEL)
        u8             kvm_cpu_l1tf_flush_l1d;
#endif
        unsigned int __nmi_count;        /* arch dependent */
#ifdef CONFIG_X86_LOCAL_APIC
        unsigned int apic_timer_irqs;        /* arch dependent */
        unsigned int irq_spurious_count;
        unsigned int icr_read_retry_count;
#endif
#if IS_ENABLED(CONFIG_KVM)
        unsigned int kvm_posted_intr_ipis;
        unsigned int kvm_posted_intr_wakeup_ipis;
        unsigned int kvm_posted_intr_nested_ipis;
#endif
        unsigned int x86_platform_ipis;        /* arch dependent */
        unsigned int apic_perf_irqs;
        unsigned int apic_irq_work_irqs;
#ifdef CONFIG_SMP
        unsigned int irq_resched_count;
        unsigned int irq_call_count;
#endif
        unsigned int irq_tlb_count;
#ifdef CONFIG_X86_THERMAL_VECTOR
        unsigned int irq_thermal_count;
#endif
#ifdef CONFIG_X86_MCE_THRESHOLD
        unsigned int irq_threshold_count;
#endif
#ifdef CONFIG_X86_MCE_AMD
        unsigned int irq_deferred_error_count;
#endif
#ifdef CONFIG_X86_HV_CALLBACK_VECTOR
        unsigned int irq_hv_callback_count;
#endif
#if IS_ENABLED(CONFIG_HYPERV)
        unsigned int irq_hv_reenlightenment_count;
        unsigned int hyperv_stimer0_count;
#endif
#ifdef CONFIG_X86_POSTED_MSI
        unsigned int posted_msi_notification_count;
#endif
} ____cacheline_aligned irq_cpustat_t;

/* One irq_cpustat_t per CPU. */
DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);

#ifdef CONFIG_X86_POSTED_MSI
DECLARE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
#endif
#define __ARCH_IRQ_STAT

/* Increment the named member of this CPU's irq_stat. */
#define inc_irq_stat(member)        this_cpu_inc(irq_stat.member)

extern void ack_bad_irq(unsigned int irq);

/* Each function below is paired with a same-named macro so that its
 * presence can be tested with #ifdef.
 */
extern u64 arch_irq_stat_cpu(unsigned int cpu);
#define arch_irq_stat_cpu        arch_irq_stat_cpu

extern u64 arch_irq_stat(void);
#define arch_irq_stat                arch_irq_stat

/* On x86 the softirq pending word is the softirq_pending member of pcpu_hot. */
#define local_softirq_pending_ref       pcpu_hot.softirq_pending

#if IS_ENABLED(CONFIG_KVM_INTEL)
/* Set this CPU's kvm_cpu_l1tf_flush_l1d flag in irq_stat to 1. */
static inline void kvm_set_cpu_l1tf_flush_l1d(void)
{
        __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
}

/* Reset this CPU's kvm_cpu_l1tf_flush_l1d flag to 0. */
static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void)
{
        __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
}

/* Read this CPU's kvm_cpu_l1tf_flush_l1d flag. */
static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void)
{
        return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
}
#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */
/* Without KVM_INTEL the flag does not exist; only the setter gets a
 * no-op stub here.
 */
static inline void kvm_set_cpu_l1tf_flush_l1d(void) { }
#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */

#endif /* _ASM_X86_HARDIRQ_H */




































    2 

    2 

    2 













    1 
    1 

    1 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Generic Timer-queue
 *
 *  Manages a simple queue of timers, ordered by expiration time.
 *  Uses rbtrees for quick list adds and expiration.
 *
 *  NOTE: All of the following functions need to be serialized
 *  to avoid races. No locking is done by this library code.
 */

#include <linux/bug.h>
#include <linux/timerqueue.h>
#include <linux/rbtree.h>
#include <linux/export.h>

/* Convert an rb_node pointer into its containing timerqueue_node. */
#define __node_2_tq(_n) \
        rb_entry((_n), struct timerqueue_node, node)

/* Ordering callback for rb_add_cached(): true when the timer containing
 * @a expires strictly before the timer containing @b.
 */
static inline bool __timerqueue_less(struct rb_node *a, const struct rb_node *b)
{
        return __node_2_tq(a)->expires < __node_2_tq(b)->expires;
}

/**
 * timerqueue_add - Adds timer to timerqueue.
 *
 * @head: head of timerqueue
 * @node: timer node to be added
 *
 * Adds the timer node to the timerqueue, sorted by the node's expires
 * value. Returns true if the newly added timer is the first expiring timer in
 * the queue.
 *
 * Warns once if @node does not look unqueued (its rb_node is not in the
 * cleared state); the insertion is attempted regardless.
 */
bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
{
        /* Make sure we don't add nodes that are already added */
        WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node));

        return rb_add_cached(&node->node, &head->rb_root, __timerqueue_less);
}
EXPORT_SYMBOL_GPL(timerqueue_add);

/**
 * timerqueue_del - Removes a timer from the timerqueue.
 *
 * @head: head of timerqueue
 * @node: timer node to be removed
 *
 * Removes the timer node from the timerqueue. Returns true if the queue is
 * not empty after the remove.
 */
bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
{
        struct rb_node *rb = &node->node;

        /* Removing a node that was never queued is a caller bug. */
        WARN_ON_ONCE(RB_EMPTY_NODE(rb));

        rb_erase_cached(rb, &head->rb_root);
        /* Mark the node as unqueued so a later add does not warn. */
        RB_CLEAR_NODE(rb);

        return !RB_EMPTY_ROOT(&head->rb_root.rb_root);
}
EXPORT_SYMBOL_GPL(timerqueue_del);

/**
 * timerqueue_iterate_next - Returns the timer after the provided timer
 *
 * @node: Pointer to a timer.
 *
 * Provides the timer that is after the given node. This is used, when
 * necessary, to iterate through the list of timers in a timer list
 * without modifying the list.
 *
 * Returns NULL when @node is NULL or when it has no successor.
 */
struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node)
{
        struct rb_node *succ;

        if (!node)
                return NULL;

        succ = rb_next(&node->node);

        return succ ? container_of(succ, struct timerqueue_node, node) : NULL;
}
EXPORT_SYMBOL_GPL(timerqueue_iterate_next);





































    1 
    1 























































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "../kernel/futex/futex.h"
#include "io_uring.h"
#include "alloc_cache.h"
#include "futex.h"

/*
 * Per-request state for the futex opcodes. Retrieved from a request with
 * io_kiocb_to_cmd(req, struct io_futex).
 */
struct io_futex {
        struct file        *file;
        /*
         * User pointer read from sqe->addr. The prep handlers store it via
         * ->uaddr; the vectored wait path reads it back as ->uwaitv when
         * parsing the waitv array.
         */
        union {
                u32 __user                        *uaddr;
                struct futex_waitv __user        *uwaitv;
        };
        /* Read from sqe->addr2 in io_futex_prep() */
        unsigned long        futex_val;
        /* Read from sqe->addr3 in io_futex_prep(); used as the wait bitset */
        unsigned long        futex_mask;
        /* Bit 0 is the ownership bit taken by io_futexv_claim() */
        unsigned long        futexv_owned;
        /* Result of futex2_to_flags() on the user flags read from sqe->fd */
        u32                futex_flags;
        /* Number of waitv entries, read from sqe->len in io_futexv_prep() */
        unsigned int        futex_nr;
        /*
         * Set by io_futexv_wait() when futex_wait_multiple_setup() returned a
         * non-zero, non-error value; io_futexv_complete() then skips
         * futex_unqueue_multiple().
         */
        bool                futexv_unqueued;
};

/*
 * Async data attached to a single-futex wait request (req->async_data).
 * io_futex_wake_fn() maps the embedded futex_q back to this structure with
 * container_of() and then follows ->req to the owning request.
 */
struct io_futex_data {
        struct futex_q        q;
        struct io_kiocb        *req;
};

/* Limit handed to io_alloc_cache_init() for the per-ring futex data cache */
#define IO_FUTEX_ALLOC_CACHE_MAX        32

/*
 * Initialise ctx->futex_cache for objects of sizeof(struct io_futex_data).
 * The value reported by io_alloc_cache_init() is handed back unchanged.
 */
bool io_futex_cache_init(struct io_ring_ctx *ctx)
{
        bool ret;

        ret = io_alloc_cache_init(&ctx->futex_cache,
                                  IO_FUTEX_ALLOC_CACHE_MAX,
                                  sizeof(struct io_futex_data));
        return ret;
}

/*
 * Tear down ctx->futex_cache, passing kfree() as the callback used to release
 * the entries it still holds.
 */
void io_futex_cache_free(struct io_ring_ctx *ctx)
{
        io_alloc_cache_free(&ctx->futex_cache, kfree);
}

/*
 * Common tail of the futex completion handlers: take the request off the
 * futex list, forget its async data pointer (the callers have already
 * released the memory) and hand the request to io_req_task_complete().
 */
static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
        hlist_del_init(&req->hash_node);
        req->async_data = NULL;

        io_req_task_complete(req, ts);
}

/*
 * Task-work completion for a single-futex wait. The io_futex_data is offered
 * back to the ring's cache and freed if the cache does not take it, then the
 * request is completed.
 */
static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
        struct io_futex_data *data = req->async_data;
        struct io_ring_ctx *ring = req->ctx;

        io_tw_lock(ring, ts);

        /* recycle when possible, otherwise release the memory */
        if (!io_alloc_cache_put(&ring->futex_cache, data))
                kfree(data);

        __io_futex_complete(req, ts);
}

/*
 * Task-work completion for a vectored futex wait. Unless the futexes were
 * already unqueued during setup, unqueue them now and, when
 * futex_unqueue_multiple() returns something other than -1, use that value as
 * the request result. The futex vector is then freed and the request
 * completed.
 */
static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
        struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
        struct futex_vector *vec = req->async_data;

        io_tw_lock(req->ctx, ts);

        if (!iof->futexv_unqueued) {
                int idx = futex_unqueue_multiple(vec, iof->futex_nr);

                if (idx != -1)
                        io_req_set_res(req, idx, 0);
        }

        /* the vector is owned by the request; drop it and the flag together */
        kfree(vec);
        req->flags &= ~REQ_F_ASYNC_DATA;
        __io_futex_complete(req, ts);
}

/*
 * Try to take ownership of a vectored wait request. Returns true only for
 * the caller that flips bit 0 of ->futexv_owned from clear to set; the plain
 * test_bit() is tried first so an already-owned request is rejected without
 * the atomic operation.
 */
static bool io_futexv_claim(struct io_futex *iof)
{
        return !(test_bit(0, &iof->futexv_owned) ||
                 test_and_set_bit_lock(0, &iof->futexv_owned));
}

/*
 * Cancel one futex request. Returns false when the request could not be taken
 * away from the wake side (futex wake already done or in progress). Otherwise
 * the request is removed from the futex list, given an -ECANCELED result and
 * queued for task-work completion, and true is returned.
 */
static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
{
        if (req->opcode != IORING_OP_FUTEX_WAIT) {
                struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);

                /* lost the race against the wake callback */
                if (!io_futexv_claim(iof))
                        return false;
                req->io_task_work.func = io_futexv_complete;
        } else {
                struct io_futex_data *data = req->async_data;

                /* futex wake already done or in progress */
                if (!futex_unqueue(&data->q))
                        return false;
                req->io_task_work.func = io_futex_complete;
        }

        hlist_del_init(&req->hash_node);
        io_req_set_res(req, -ECANCELED, 0);
        io_req_task_work_add(req);
        return true;
}

/*
 * Cancel futex requests matching @cd on @ctx.
 *
 * File-descriptor based matching is not supported and yields -ENOENT. A
 * request matches when its user_data equals cd->data or when
 * IORING_ASYNC_CANCEL_ANY is set. Without IORING_ASYNC_CANCEL_ALL the walk
 * stops after the first matching request, whether or not it could be
 * cancelled.
 *
 * Returns the number of requests cancelled, or -ENOENT if there were none.
 */
int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
                    unsigned int issue_flags)
{
        struct hlist_node *next;
        struct io_kiocb *req;
        int cancelled = 0;

        if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
                return -ENOENT;

        io_ring_submit_lock(ctx, issue_flags);
        hlist_for_each_entry_safe(req, next, &ctx->futex_list, hash_node) {
                if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
                    req->cqe.user_data != cd->data)
                        continue;

                if (__io_futex_cancel(ctx, req))
                        cancelled++;

                if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
                        break;
        }
        io_ring_submit_unlock(ctx, issue_flags);

        return cancelled ? cancelled : -ENOENT;
}

/*
 * Remove and cancel every futex request on @ctx that io_match_task_safe()
 * accepts for @task / @cancel_all. Must be called with ctx->uring_lock held.
 *
 * Returns true if at least one request matched, regardless of whether
 * __io_futex_cancel() managed to cancel it.
 */
bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
                         bool cancel_all)
{
        struct hlist_node *next;
        struct io_kiocb *req;
        bool matched = false;

        lockdep_assert_held(&ctx->uring_lock);

        hlist_for_each_entry_safe(req, next, &ctx->futex_list, hash_node) {
                if (io_match_task_safe(req, task, cancel_all)) {
                        hlist_del_init(&req->hash_node);
                        __io_futex_cancel(ctx, req);
                        matched = true;
                }
        }

        return matched;
}

/*
 * Prepare a single-futex request from its SQE.
 *
 * sqe->addr is the futex address, sqe->addr2 the value, sqe->addr3 the mask
 * and sqe->fd carries the futex2 flags. sqe->len, sqe->futex_flags,
 * sqe->buf_index and sqe->file_index must all be zero.
 *
 * Returns 0 on success or -EINVAL if any field fails validation.
 */
int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
        u32 uflags;

        if (unlikely(sqe->len || sqe->futex_flags || sqe->buf_index ||
                     sqe->file_index))
                return -EINVAL;

        iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
        iof->futex_val = READ_ONCE(sqe->addr2);
        iof->futex_mask = READ_ONCE(sqe->addr3);

        /* the fd field is reused to carry the futex2 flags */
        uflags = READ_ONCE(sqe->fd);
        if (uflags & ~FUTEX2_VALID_MASK)
                return -EINVAL;

        iof->futex_flags = futex2_to_flags(uflags);
        if (!futex_flags_valid(iof->futex_flags))
                return -EINVAL;

        /* both the value and the mask are checked against the flags */
        if (!futex_validate_input(iof->futex_flags, iof->futex_val))
                return -EINVAL;
        if (!futex_validate_input(iof->futex_flags, iof->futex_mask))
                return -EINVAL;

        return 0;
}

/*
 * Wake callback for futexes that belong to a vectored wait request. The
 * request is taken from q->wake_data. Only the caller that wins
 * io_futexv_claim() and then succeeds in __futex_wake_mark() posts a zero
 * result and queues io_futexv_complete() as task work.
 */
static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q)
{
        struct io_kiocb *req = q->wake_data;
        struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);

        /* the claim must be attempted before the futex is marked woken */
        if (!io_futexv_claim(iof) || unlikely(!__futex_wake_mark(q)))
                return;

        io_req_set_res(req, 0, 0);
        req->io_task_work.func = io_futexv_complete;
        io_req_task_work_add(req);
}

/*
 * Prepare a vectored futex wait from its SQE.
 *
 * sqe->addr points at the user futex_waitv array and sqe->len is the number
 * of entries (1..FUTEX_WAITV_MAX). All other fields checked below must be
 * zero. The array is parsed into a freshly allocated futex_vector which
 * becomes the request's async data.
 *
 * Returns 0 on success, -EINVAL on bad fields or count, -ENOMEM if the vector
 * cannot be allocated, or the error from futex_parse_waitv().
 */
int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
        struct futex_vector *vec;
        int err;

        /* No flags or mask supported for waitv */
        if (unlikely(sqe->fd || sqe->buf_index || sqe->file_index ||
                     sqe->addr2 || sqe->futex_flags || sqe->addr3))
                return -EINVAL;

        iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
        iof->futex_nr = READ_ONCE(sqe->len);
        if (!iof->futex_nr || iof->futex_nr > FUTEX_WAITV_MAX)
                return -EINVAL;

        vec = kcalloc(iof->futex_nr, sizeof(*vec), GFP_KERNEL);
        if (!vec)
                return -ENOMEM;

        err = futex_parse_waitv(vec, iof->uwaitv, iof->futex_nr,
                                io_futex_wakev_fn, req);
        if (err) {
                kfree(vec);
                return err;
        }

        /* start out unclaimed and with the futexes not yet unqueued */
        iof->futexv_owned = 0;
        iof->futexv_unqueued = 0;

        req->async_data = vec;
        req->flags |= REQ_F_ASYNC_DATA;
        return 0;
}

/*
 * Wake callback for a single-futex wait. The owning request is found through
 * the io_futex_data that embeds @q. If __futex_wake_mark() succeeds, a zero
 * result is posted and io_futex_complete() is queued as task work.
 */
static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
{
        struct io_futex_data *data = container_of(q, struct io_futex_data, q);
        struct io_kiocb *req = data->req;

        if (likely(__futex_wake_mark(q))) {
                io_req_set_res(req, 0, 0);
                req->io_task_work.func = io_futex_complete;
                io_req_task_work_add(req);
        }
}

/*
 * Get an io_futex_data for a wait request: taken from the ring's cache when
 * one is available, otherwise allocated with GFP_NOWAIT. May return NULL.
 */
static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
{
        struct io_futex_data *data = io_alloc_cache_get(&ctx->futex_cache);

        if (!data)
                data = kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);

        return data;
}

/*
 * Issue handler for a vectored futex wait.
 *
 * Returns IOU_OK with a failed result if futex_wait_multiple_setup() reports
 * an error; the futex vector is freed in that case. Otherwise returns
 * IOU_ISSUE_SKIP_COMPLETE and the completion is posted later from task work.
 */
int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
        struct futex_vector *vec = req->async_data;
        struct io_ring_ctx *ctx = req->ctx;
        int woken = -1;
        int ret;

        io_ring_submit_lock(ctx, issue_flags);

        ret = futex_wait_multiple_setup(vec, iof->futex_nr, &woken);

        /* Negative return: setup failed, fail the request right here. */
        if (unlikely(ret < 0)) {
                io_ring_submit_unlock(ctx, issue_flags);
                req_set_fail(req);
                io_req_set_res(req, ret, 0);
                kfree(vec);
                req->async_data = NULL;
                req->flags &= ~REQ_F_ASYNC_DATA;
                return IOU_OK;
        }

        if (ret) {
                /*
                 * We got woken while setting up; let that side do the
                 * completion. futex_wait_multiple_setup() has unqueued all
                 * the futexes in this case, which differs from a normal
                 * wakeup, so record that it has already been done.
                 */
                iof->futexv_unqueued = 1;
                if (woken != -1)
                        io_req_set_res(req, woken, 0);
        } else {
                /*
                 * The waiters were set up and nobody triggered a wakeup
                 * meanwhile. If a wakeup happens after setup, the task_work
                 * runs after this issue and under the submission lock.
                 *
                 * A successful setup leaves the task state non-runnable.
                 * That suits the blocking syscall but not an async
                 * invocation, so mark us runnable again.
                 */
                __set_current_state(TASK_RUNNING);
                hlist_add_head(&req->hash_node, &ctx->futex_list);
        }

        io_ring_submit_unlock(ctx, issue_flags);
        return IOU_ISSUE_SKIP_COMPLETE;
}

/*
 * Issue handler for a single-futex wait.
 *
 * A zero mask is rejected with -EINVAL. Otherwise an io_futex_data is
 * obtained, attached to the request and initialised, and futex_wait_setup()
 * is called. On success the request is added to the ring's futex list, the
 * futex is queued and IOU_ISSUE_SKIP_COMPLETE is returned; completion then
 * comes from io_futex_wake_fn() or from cancellation.
 *
 * On any failure the result is posted, the io_futex_data (if any) is freed
 * and IOU_OK is returned.
 */
int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_futex_data *ifd = NULL;
        struct futex_hash_bucket *hb;
        int ret;

        if (!iof->futex_mask) {
                ret = -EINVAL;
                goto done;
        }

        io_ring_submit_lock(ctx, issue_flags);
        ifd = io_alloc_ifd(ctx);
        if (!ifd) {
                ret = -ENOMEM;
                goto done_unlock;
        }

        req->async_data = ifd;
        ifd->q = futex_q_init;
        ifd->q.bitset = iof->futex_mask;
        ifd->q.wake = io_futex_wake_fn;
        ifd->req = req;

        ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags,
                               &ifd->q, &hb);
        if (!ret) {
                hlist_add_head(&req->hash_node, &ctx->futex_list);
                io_ring_submit_unlock(ctx, issue_flags);

                futex_queue(&ifd->q, hb);
                return IOU_ISSUE_SKIP_COMPLETE;
        }

done_unlock:
        io_ring_submit_unlock(ctx, issue_flags);
done:
        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        /*
         * If we attached ifd above, detach it again before freeing so that
         * req->async_data is never left pointing at released memory.
         */
        if (ifd)
                req->async_data = NULL;
        kfree(ifd);
        return IOU_OK;
}

/*
 * Issue handler for a futex wake. Calls futex_wake() with the prepared
 * address, value and mask, posts its return value as the request result and
 * marks the request failed when that value is negative. Always returns IOU_OK.
 */
int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
        unsigned int wake_flags = FLAGS_STRICT | iof->futex_flags;
        int res;

        /*
         * Strict flags - ensure that waking 0 futexes yields a 0 result.
         * See commit 43adf8449510 ("futex: FLAGS_STRICT") for details.
         */
        res = futex_wake(iof->uaddr, wake_flags, iof->futex_val,
                         iof->futex_mask);
        if (res < 0)
                req_set_fail(req);

        io_req_set_res(req, res, 0);
        return IOU_OK;
}






























    1 









































    1 





















    1 

















    1 



















    2 





    2 









    2 





























































    2 









    2 





    2 































    2 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
// SPDX-License-Identifier: GPL-2.0
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
#include <linux/fs_struct.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
#include "internal.h"
#include "mount.h"

/*
 * Encode a file handle for @path and copy it, together with the mount id, to
 * user space.
 *
 * The caller-supplied header at @ufh gives the space available in
 * handle_bytes. When encoding fails or does not fit, only the fixed part of
 * the handle is copied back (with handle_bytes updated) and an error is
 * returned.
 *
 * Returns 0 on success, -EOPNOTSUPP if the filesystem cannot encode the
 * requested kind of handle, -EFAULT on a user copy failure, -EINVAL if the
 * requested size exceeds MAX_HANDLE_SZ, -ENOMEM on allocation failure,
 * -EOVERFLOW when the handle does not fit, or another negative value from
 * exportfs_encode_fh().
 */
static long do_sys_name_to_handle(const struct path *path,
                                  struct file_handle __user *ufh,
                                  int __user *mnt_id, int fh_flags)
{
        struct file_handle user_hdr;
        struct file_handle *handle = NULL;
        int handle_dwords, handle_bytes;
        long retval;

        /*
         * If a decodable file handle was requested, the filesystem has to
         * support decoding; refuse early when it cannot encode this kind.
         */
        if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags))
                return -EOPNOTSUPP;

        if (copy_from_user(&user_hdr, ufh, sizeof(struct file_handle)))
                return -EFAULT;

        if (user_hdr.handle_bytes > MAX_HANDLE_SZ)
                return -EINVAL;

        handle = kzalloc(struct_size(handle, f_handle, user_hdr.handle_bytes),
                         GFP_KERNEL);
        if (!handle)
                return -ENOMEM;

        /* exportfs works in units of sizeof(u32) */
        handle_dwords = user_hdr.handle_bytes >> 2;

        /* we ask for a non connectable maybe decodeable file handle */
        retval = exportfs_encode_fh(path->dentry,
                                    (struct fid *)handle->f_handle,
                                    &handle_dwords, fh_flags);
        handle->handle_type = retval;

        /* back to bytes for the user-visible header */
        handle_bytes = handle_dwords * sizeof(u32);
        handle->handle_bytes = handle_bytes;

        if (retval >= 0 && retval != FILEID_INVALID &&
            handle->handle_bytes <= user_hdr.handle_bytes) {
                retval = 0;
        } else {
                /*
                 * As per old exportfs_encode_fh documentation we could
                 * return ENOSPC to indicate overflow, but file systems
                 * always returned 255. So handle both values.
                 */
                if (retval == FILEID_INVALID || retval == -ENOSPC)
                        retval = -EOVERFLOW;
                /*
                 * Zero the size so that only the non variable part of the
                 * file_handle is copied out below.
                 */
                handle_bytes = 0;
        }

        /* copy the mount id, then the handle itself */
        if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) ||
            copy_to_user(ufh, handle,
                         struct_size(handle, f_handle, handle_bytes)))
                retval = -EFAULT;

        kfree(handle);
        return retval;
}

/**
 * sys_name_to_handle_at: convert name to handle
 * @dfd: directory relative to which name is interpreted if not absolute
 * @name: name that should be converted to handle.
 * @handle: resulting file handle
 * @mnt_id: mount id of the file system containing the file
 * @flag: flag value to indicate whether to follow symlink or not
 *        and whether a decodable file handle is required.
 *
 * @handle->handle_bytes indicates the space available to store the
 * variable part of the file handle in bytes. If there is not
 * enough space, the field is updated to return the minimum
 * value required.
 */
SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
                struct file_handle __user *, handle, int __user *, mnt_id,
                int, flag)
{
        struct path path;
        int lookup_flags = 0;
        int fh_flags = 0;
        int err;

        if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID))
                return -EINVAL;

        /* translate the AT_* flags into lookup and export flags */
        if (flag & AT_SYMLINK_FOLLOW)
                lookup_flags |= LOOKUP_FOLLOW;
        if (flag & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;
        if (flag & AT_HANDLE_FID)
                fh_flags = EXPORT_FH_FID;

        err = user_path_at(dfd, name, lookup_flags, &path);
        if (err)
                return err;

        err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags);
        path_put(&path);
        return err;
}

/*
 * Return a referenced vfsmount for @fd: the mount of the current working
 * directory for AT_FDCWD, otherwise the mount of the file behind the
 * descriptor. Returns ERR_PTR(-EBADF) if @fd does not name an open file.
 */
static struct vfsmount *get_vfsmount_from_fd(int fd)
{
        struct vfsmount *mnt;

        if (fd != AT_FDCWD) {
                struct fd f = fdget(fd);

                if (!f.file)
                        return ERR_PTR(-EBADF);
                mnt = mntget(f.file->f_path.mnt);
                fdput(f);
        } else {
                struct fs_struct *fs = current->fs;

                /* fs->lock is held while the pwd mount is read and pinned */
                spin_lock(&fs->lock);
                mnt = mntget(fs->pwd.mnt);
                spin_unlock(&fs->lock);
        }

        return mnt;
}

/*
 * Acceptance callback passed to exportfs_decode_fh() by do_handle_to_path().
 * It returns 1 unconditionally and ignores both arguments.
 */
static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
{
        return 1;
}

/*
 * Decode @handle relative to the mount identified by @mountdirfd and fill in
 * @path. On success the caller owns the references held in @path. On failure
 * the mount reference taken here is dropped again and a negative errno is
 * returned.
 */
static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
                             struct path *path)
{
        int handle_dwords;

        path->mnt = get_vfsmount_from_fd(mountdirfd);
        if (IS_ERR(path->mnt))
                return PTR_ERR(path->mnt);

        /* exportfs wants the handle size in units of sizeof(u32) */
        handle_dwords = handle->handle_bytes >> 2;
        path->dentry = exportfs_decode_fh(path->mnt,
                                          (struct fid *)handle->f_handle,
                                          handle_dwords, handle->handle_type,
                                          vfs_dentry_acceptable, NULL);
        if (IS_ERR(path->dentry)) {
                int err = PTR_ERR(path->dentry);

                mntput(path->mnt);
                return err;
        }

        return 0;
}

/*
 * Copy a file handle in from user space and resolve it to @path.
 *
 * Returns 0 on success, -EPERM without CAP_DAC_READ_SEARCH, -EFAULT on a
 * user copy failure, -EINVAL if handle_bytes is zero or larger than
 * MAX_HANDLE_SZ, -ENOMEM on allocation failure, or the error from
 * do_handle_to_path().
 */
static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
                   struct path *path)
{
        struct file_handle hdr;
        struct file_handle *handle = NULL;
        int retval;

        /*
         * With handle we don't look at the execute bit on the
         * directory. Ideally we would like CAP_DAC_SEARCH.
         * But we don't have that
         */
        if (!capable(CAP_DAC_READ_SEARCH))
                return -EPERM;

        if (copy_from_user(&hdr, ufh, sizeof(struct file_handle)))
                return -EFAULT;

        if (hdr.handle_bytes == 0 || hdr.handle_bytes > MAX_HANDLE_SZ)
                return -EINVAL;

        handle = kmalloc(struct_size(handle, f_handle, hdr.handle_bytes),
                         GFP_KERNEL);
        if (!handle)
                return -ENOMEM;

        /* fixed header first, then the variable part from user space */
        *handle = hdr;
        if (copy_from_user(&handle->f_handle, &ufh->f_handle,
                           hdr.handle_bytes))
                retval = -EFAULT;
        else
                retval = do_handle_to_path(mountdirfd, handle, path);

        kfree(handle);
        return retval;
}

/*
 * Resolve the user file handle @ufh relative to @mountdirfd and open it with
 * @open_flag. Returns the new file descriptor on success or a negative errno.
 */
static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
                           int open_flag)
{
        struct path path;
        struct file *file;
        long retval;
        int fd;

        retval = handle_to_path(mountdirfd, ufh, &path);
        if (retval)
                return retval;

        fd = get_unused_fd_flags(open_flag);
        if (fd < 0) {
                path_put(&path);
                return fd;
        }

        file = file_open_root(&path, "", open_flag, 0);
        if (!IS_ERR(file)) {
                fd_install(fd, file);
                retval = fd;
        } else {
                /* give the reserved descriptor back */
                put_unused_fd(fd);
                retval = PTR_ERR(file);
        }

        path_put(&path);
        return retval;
}

/**
 * sys_open_by_handle_at: Open the file handle
 * @mountdirfd: directory file descriptor
 * @handle: file handle to be opened
 * @flags: open flags.
 *
 * @mountdirfd indicates the directory file descriptor
 * of the mount point. The file handle is decoded relative
 * to the vfsmount pointed to by @mountdirfd. The @flags
 * value is the same as the open(2) flags.
 */
SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
                struct file_handle __user *, handle,
                int, flags)
{
        /* O_LARGEFILE is added whenever force_o_largefile() says so */
        if (force_o_largefile())
                flags |= O_LARGEFILE;

        return do_handle_open(mountdirfd, handle, flags);
}

#ifdef CONFIG_COMPAT
/*
 * Exactly like sys_open_by_handle_at() above, except that it
 * doesn't set the O_LARGEFILE flag: @flags is passed to
 * do_handle_open() unmodified.
 */
COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
                             struct file_handle __user *, handle, int, flags)
{
        return do_handle_open(mountdirfd, handle, flags);
}
#endif























































































































































































































































































































































































    4 





















































































    1 



    2 
    2 





    1 















    1 























































    4 







    4 









































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* include/asm-generic/tlb.h
 *
 *        Generic TLB shootdown code
 *
 * Copyright 2001 Red Hat, Inc.
 * Based on code from mm/memory.c Copyright Linus Torvalds and others.
 *
 * Copyright 2011 Red Hat, Inc., Peter Zijlstra
 */
#ifndef _ASM_GENERIC__TLB_H
#define _ASM_GENERIC__TLB_H

#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/hugetlb_inline.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>

/*
 * Blindly accessing user memory from NMI context can be dangerous
 * if we're in the middle of switching the current user task or switching
 * the loaded mm.
 *
 * This is the fallback used when nmi_uaccess_okay has not already been
 * defined: it always evaluates to true.
 */
#ifndef nmi_uaccess_okay
# define nmi_uaccess_okay() true
#endif

#ifdef CONFIG_MMU

/*
 * Generic MMU-gather implementation.
 *
 * The mmu_gather data structure is used by the mm code to implement the
 * correct and efficient ordering of freeing pages and TLB invalidations.
 *
 * This correct ordering is:
 *
 *  1) unhook page
 *  2) TLB invalidate page
 *  3) free page
 *
 * That is, we must never free a page before we have ensured there are no live
 * translations left to it. Otherwise it might be possible to observe (or
 * worse, change) the page content after it has been reused.
 *
 * The mmu_gather API consists of:
 *
 *  - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu()
 *
 *    start and finish a mmu_gather
 *
 *    Finish in particular will issue a (final) TLB invalidate and free
 *    all (remaining) queued pages.
 *
 *  - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA
 *
 *    Defaults to flushing at tlb_end_vma() to reset the range; helps when
 *    there's large holes between the VMAs.
 *
 *  - tlb_remove_table()
 *
 *    tlb_remove_table() is the basic primitive to free page-table directories
 *    (__p*_free_tlb()).  In its most primitive form it is an alias for
 *    tlb_remove_page() below, for when page directories are pages and have no
 *    additional constraints.
 *
 *    See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE.
 *
 *  - tlb_remove_page() / __tlb_remove_page()
 *  - tlb_remove_page_size() / __tlb_remove_page_size()
 *  - __tlb_remove_folio_pages()
 *
 *    __tlb_remove_page_size() is the basic primitive that queues a page for
 *    freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a
 *    boolean indicating if the queue is (now) full and a call to
 *    tlb_flush_mmu() is required.
 *
 *    tlb_remove_page() and tlb_remove_page_size() imply the call to
 *    tlb_flush_mmu() when required and have no return value.
 *
 *    __tlb_remove_folio_pages() is similar to __tlb_remove_page(), however,
 *    instead of removing a single page, remove the given number of consecutive
 *    pages that are all part of the same (large) folio: just like calling
 *    __tlb_remove_page() on each page individually.
 *
 *  - tlb_change_page_size()
 *
 *    call before __tlb_remove_page*() to set the current page-size; implies a
 *    possible tlb_flush_mmu() call.
 *
 *  - tlb_flush_mmu() / tlb_flush_mmu_tlbonly()
 *
 *    tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets
 *                              related state, like the range)
 *
 *    tlb_flush_mmu() - in addition to the above TLB invalidate, also frees
 *                        whatever pages are still batched.
 *
 *  - mmu_gather::fullmm
 *
 *    A flag set by tlb_gather_mmu_fullmm() to indicate we're going to free
 *    the entire mm; this allows a number of optimizations.
 *
 *    - We can ignore tlb_{start,end}_vma(); because we don't
 *      care about ranges. Everything will be shot down.
 *
 *    - (RISC) architectures that use ASIDs can cycle to a new ASID
 *      and delay the invalidation until ASID space runs out.
 *
 *  - mmu_gather::need_flush_all
 *
 *    A flag that can be set by the arch code if it wants to force
 *    flush the entire TLB irrespective of the range. For instance
 *    x86-PAE needs this when changing top-level entries.
 *
 * And allows the architecture to provide and implement tlb_flush():
 *
 * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make
 * use of:
 *
 *  - mmu_gather::start / mmu_gather::end
 *
 *    which provides the range that needs to be flushed to cover the pages to
 *    be freed.
 *
 *  - mmu_gather::freed_tables
 *
 *    set when we freed page table pages
 *
 *  - tlb_get_unmap_shift() / tlb_get_unmap_size()
 *
 *    returns the smallest TLB entry size unmapped in this range.
 *
 * If an architecture does not provide tlb_flush() a default implementation
 * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is
 * specified, in which case we'll default to flush_tlb_mm().
 *
 * Additionally there are a few opt-in features:
 *
 *  MMU_GATHER_PAGE_SIZE
 *
 *  This ensures we call tlb_flush() every time tlb_change_page_size() actually
 *  changes the size and provides mmu_gather::page_size to tlb_flush().
 *
 *  This might be useful if your architecture has size specific TLB
 *  invalidation instructions.
 *
 *  MMU_GATHER_TABLE_FREE
 *
 *  This provides tlb_remove_table(), to be used instead of tlb_remove_page()
 *  for page directories (__p*_free_tlb()).
 *
 *  Useful if your architecture has non-page page directories.
 *
 *  When used, an architecture is expected to provide __tlb_remove_table()
 *  which does the actual freeing of these pages.
 *
 *  MMU_GATHER_RCU_TABLE_FREE
 *
 *  Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see
 *  comment below).
 *
 *  Useful if your architecture doesn't use IPIs for remote TLB invalidates
 *  and therefore doesn't naturally serialize with software page-table walkers.
 *
 *  MMU_GATHER_NO_FLUSH_CACHE
 *
 *  Indicates the architecture has flush_cache_range() but it needs *NOT* be called
 *  before unmapping a VMA.
 *
 *  NOTE: strictly speaking we shouldn't have this knob and instead rely on
 *          flush_cache_range() being a NOP, except Sparc64 seems to be
 *          different here.
 *
 *  MMU_GATHER_MERGE_VMAS
 *
 *  Indicates the architecture wants to merge ranges over VMAs; typical when
 *  multiple range invalidates are more expensive than a full invalidate.
 *
 *  MMU_GATHER_NO_RANGE
 *
 *  Use this if your architecture lacks an efficient flush_tlb_range(). This
 *  option implies MMU_GATHER_MERGE_VMAS above.
 *
 *  MMU_GATHER_NO_GATHER
 *
 *  If the option is set the mmu_gather will not track individual pages for
 *  delayed page free anymore. A platform that enables the option needs to
 *  provide its own implementation of the __tlb_remove_page_size() function to
 *  free pages.
 *
 *  This is useful if your architecture already flushes TLB entries in the
 *  various ptep_get_and_clear() functions.
 */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

/*
 * Batch of table pointers used by the MMU_GATHER_TABLE_FREE machinery.
 *
 * @rcu:    only present with MMU_GATHER_RCU_TABLE_FREE.
 * @nr:     NOTE(review): presumably the number of used slots in @tables --
 *          confirm against the out-of-line implementation.
 * @tables: flexible array of table pointers.
 */
struct mmu_table_batch {
#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
        struct rcu_head                rcu;
#endif
        unsigned int                nr;
        void                        *tables[];
};

/* Number of table pointers that fit in one page after the batch header. */
#define MAX_TABLE_BATCH                \
        ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *))

extern void tlb_remove_table(struct mmu_gather *tlb, void *table);

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

/*
 * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based
 * page directories and we can use the normal page batching to free them.
 */
#define tlb_remove_table(tlb, page) tlb_remove_page((tlb), (page))

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
/*
 * This allows an architecture that does not use the linux page-tables for
 * hardware to skip the TLBI when freeing page tables.
 *
 * The default below evaluates to true unless the architecture has already
 * defined its own tlb_needs_table_invalidate().
 */
#ifndef tlb_needs_table_invalidate
#define tlb_needs_table_invalidate() (true)
#endif

void tlb_remove_table_sync_one(void);

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/* An architecture override is rejected unless RCU table freeing is enabled. */
#ifdef tlb_needs_table_invalidate
#error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
#endif

/* Empty stub so that callers need no #ifdef of their own. */
static inline void tlb_remove_table_sync_one(void) { }

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */


#ifndef CONFIG_MMU_GATHER_NO_GATHER
/*
 * If we can't allocate a page to make a big batch of page pointers
 * to work on, then just handle a few from the on-stack structure.
 */
#define MMU_GATHER_BUNDLE        8

/*
 * One batch of gathered pages; batches are chained through @next.
 * NOTE(review): @nr and @max are presumably the used and total slot counts of
 * @encoded_pages -- confirm against the out-of-line implementation.
 */
struct mmu_gather_batch {
        struct mmu_gather_batch        *next;
        unsigned int                nr;
        unsigned int                max;
        struct encoded_page        *encoded_pages[];
};

/* Number of pointer slots that fit in one page after the batch header. */
#define MAX_GATHER_BATCH        \
        ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *))

/*
 * Limit the maximum number of mmu_gather batches to reduce a risk of soft
 * lockups for non-preemptible kernels on huge machines when a lot of memory
 * is zapped during unmapping.
 * 10K pages freed at once should be safe even without a preemption point.
 */
#define MAX_GATHER_BATCH_COUNT        (10000UL/MAX_GATHER_BATCH)

/*
 * Out-of-line page gathering primitives. The inline wrappers further down
 * call tlb_flush_mmu() when __tlb_remove_page_size() returns true.
 */
extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
                bool delay_rmap, int page_size);
bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
                unsigned int nr_pages, bool delay_rmap);

#ifdef CONFIG_SMP
/*
 * This both sets 'delayed_rmap', and returns true. It would be an inline
 * function, except we define it before the 'struct mmu_gather'.
 */
#define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true)
extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma);
#endif /* CONFIG_SMP */

#endif /* !CONFIG_MMU_GATHER_NO_GATHER */

/*
 * We have a no-op version of the rmap removal that doesn't
 * delay anything. That is used on S390, which flushes remote
 * TLBs synchronously, and on UP, which doesn't have any
 * remote TLBs to flush and is not preemptible due to this
 * all happening under the page table lock.
 *
 * In this variant tlb_delay_rmap() evaluates to false and tlb_flush_rmaps()
 * is an empty stub. It is only used when no tlb_delay_rmap() macro has been
 * defined earlier.
 */
#ifndef tlb_delay_rmap
#define tlb_delay_rmap(tlb) (false)
static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
#endif

/*
 * struct mmu_gather is an opaque type used by the mm code for passing around
 * any data needed by arch specific code for tlb_remove_page.
 */
struct mmu_gather {
        struct mm_struct        *mm;

#ifdef CONFIG_MMU_GATHER_TABLE_FREE
        struct mmu_table_batch        *batch;
#endif

        /*
         * Pending flush range: grown by __tlb_adjust_range() and reset by
         * __tlb_reset_range().
         */
        unsigned long                start;
        unsigned long                end;
        /*
         * we are in the middle of an operation to clear
         * a full mm and can make some optimizations
         */
        unsigned int                fullmm : 1;

        /*
         * we have performed an operation which
         * requires a complete flush of the tlb
         */
        unsigned int                need_flush_all : 1;

        /*
         * we have removed page directories
         */
        unsigned int                freed_tables : 1;

        /*
         * Do we have pending delayed rmap removals?
         */
        unsigned int                delayed_rmap : 1;

        /*
         * at which levels have we cleared entries?
         */
        unsigned int                cleared_ptes : 1;
        unsigned int                cleared_pmds : 1;
        unsigned int                cleared_puds : 1;
        unsigned int                cleared_p4ds : 1;

        /*
         * Properties of the current VMA, recorded by tlb_update_vma_flags():
         * VM_EXEC, is_vm_hugetlb_page() and VM_PFNMAP|VM_MIXEDMAP
         * respectively.
         */
        unsigned int                vma_exec : 1;
        unsigned int                vma_huge : 1;
        unsigned int                vma_pfn  : 1;

        /*
         * NOTE(review): presumably the number of mmu_gather_batch pages
         * currently allocated (cf. MAX_GATHER_BATCH_COUNT) -- confirm.
         */
        unsigned int                batch_count;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        struct mmu_gather_batch *active;
        struct mmu_gather_batch        local;
        struct page                *__pages[MMU_GATHER_BUNDLE];

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        /* Last size passed to tlb_change_page_size(). */
        unsigned int page_size;
#endif /* CONFIG_MMU_GATHER_PAGE_SIZE */
#endif /* !CONFIG_MMU_GATHER_NO_GATHER */
};

void tlb_flush_mmu(struct mmu_gather *tlb);

/*
 * Grow the pending flush range of @tlb so that it also covers
 * [@address, @address + @range_size).
 *
 * @range_size is an unsigned long: the tlb_flush_*_range() helpers forward an
 * unsigned long size (e.g. huge_page_size() from tlb_remove_huge_tlb_entry()),
 * and a 32-bit parameter would silently truncate sizes of 4GiB and above on
 * 64-bit builds.
 */
static inline void __tlb_adjust_range(struct mmu_gather *tlb,
                                      unsigned long address,
                                      unsigned long range_size)
{
        tlb->start = min(tlb->start, address);
        tlb->end = max(tlb->end, address + range_size);
}

/*
 * Return @tlb to the "nothing pending" state: clear the freed_tables and
 * cleared_* bits and reset the flush range. For a full-mm gather both ends
 * are set to ~0, otherwise to the empty range [TASK_SIZE, 0).
 */
static inline void __tlb_reset_range(struct mmu_gather *tlb)
{
        /*
         * Do not reset mmu_gather::vma_* fields here, we do not
         * call into tlb_start_vma() again to set them if there is an
         * intermediate flush.
         */
        tlb->cleared_p4ds = 0;
        tlb->cleared_puds = 0;
        tlb->cleared_pmds = 0;
        tlb->cleared_ptes = 0;
        tlb->freed_tables = 0;

        if (!tlb->fullmm) {
                tlb->end = 0;
                tlb->start = TASK_SIZE;
                return;
        }

        tlb->end = ~0;
        tlb->start = tlb->end;
}

#ifdef CONFIG_MMU_GATHER_NO_RANGE

#if defined(tlb_flush)
#error MMU_GATHER_NO_RANGE relies on default tlb_flush()
#endif

/*
 * When an architecture does not have efficient means of range flushing TLBs
 * there is no point in doing intermediate flushes on tlb_end_vma() to keep the
 * range small. We equally don't have to worry about page granularity or other
 * things.
 *
 * Any range with a non-zero end is therefore handled by one flush_tlb_mm().
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        /* end == 0 means the range is empty. */
        if (!tlb->end)
                return;

        flush_tlb_mm(tlb->mm);
}

#else /* CONFIG_MMU_GATHER_NO_RANGE */

#ifndef tlb_flush
/*
 * When an architecture does not provide its own tlb_flush() implementation
 * but does have a reasonably efficient flush_tlb_range() implementation
 * use that.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        /* Full-mm gather or an explicit request: flush the whole mm. */
        if (tlb->fullmm || tlb->need_flush_all) {
                flush_tlb_mm(tlb->mm);
                return;
        }

        if (tlb->end) {
                /*
                 * flush_tlb_range() takes a VMA; build a temporary one that
                 * carries the mm and the recorded exec/huge properties.
                 */
                struct vm_area_struct vma = {
                        .vm_mm = tlb->mm,
                        .vm_flags = (tlb->vma_exec ? VM_EXEC    : 0) |
                                    (tlb->vma_huge ? VM_HUGETLB : 0),
                };

                flush_tlb_range(&vma, tlb->start, tlb->end);
        }
}
#endif

#endif /* CONFIG_MMU_GATHER_NO_RANGE */

/*
 * Record in @tlb whether @vma is a hugetlb mapping, is executable, and is a
 * VM_PFNMAP/VM_MIXEDMAP mapping.
 */
static inline void
tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        /*
         * Some flush_tlb_range() implementations look at VM_HUGETLB (tile,
         * mips-4k) and then flush only large pages.
         *
         * Implementations that flush the I-TLB also flush the D-TLB
         * (tile, xtensa, arm), so adding VM_EXEC to an existing range is
         * fine.
         *
         * tlb_end_vma() is relied upon to issue a flush, so the batch is
         * empty whenever these values are overwritten.
         */
        tlb->vma_huge = is_vm_hugetlb_page(vma);
        tlb->vma_exec = (vma->vm_flags & VM_EXEC) != 0;
        tlb->vma_pfn  = (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0;
}

/*
 * Flush the TLB for the pending range and reset the range, but only if
 * something has been recorded since the last reset.
 */
static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
        /*
         * Anything calling __tlb_adjust_range() also sets at least one of
         * these bits, so all of them being clear means nothing is pending.
         */
        if (!tlb->freed_tables && !tlb->cleared_ptes && !tlb->cleared_pmds &&
            !tlb->cleared_puds && !tlb->cleared_p4ds)
                return;

        tlb_flush(tlb);
        __tlb_reset_range(tlb);
}

static inline void tlb_remove_page_size(struct mmu_gather *tlb,
                                        struct page *page, int page_size)
{
        if (__tlb_remove_page_size(tlb, page, false, page_size))
                tlb_flush_mmu(tlb);
}

/*
 * PAGE_SIZE flavour of __tlb_remove_page_size(); returns its result unchanged
 * and never flushes by itself.
 */
static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb,
                struct page *page, bool delay_rmap)
{
        const int page_size = PAGE_SIZE;

        return __tlb_remove_page_size(tlb, page, delay_rmap, page_size);
}

/* tlb_remove_page
 *        Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when
 *        required.
 */
static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
        /*
         * Plain call, no "return expr;": ISO C does not allow a return
         * statement with an expression in a function returning void.
         */
        tlb_remove_page_size(tlb, page, PAGE_SIZE);
}

/* Hand the page-table descriptor @pt to tlb_remove_table(). */
static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt)
{
        void *table = pt;

        tlb_remove_table(tlb, table);
}

/*
 * Like tlb_remove_ptdesc, but for page-like page directories: the descriptor
 * is converted with ptdesc_page() and queued through tlb_remove_page().
 */
static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt)
{
        struct page *page = ptdesc_page(pt);

        tlb_remove_page(tlb, page);
}

/*
 * Record @page_size in @tlb (MMU_GATHER_PAGE_SIZE only). If a different,
 * non-zero size was recorded before, flush first unless a full-mm or
 * flush-all operation is in progress. Without MMU_GATHER_PAGE_SIZE this is
 * a no-op.
 */
static inline void tlb_change_page_size(struct mmu_gather *tlb,
                                                     unsigned int page_size)
{
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        const bool size_changed = tlb->page_size &&
                                  tlb->page_size != page_size;

        if (size_changed && !tlb->fullmm && !tlb->need_flush_all)
                tlb_flush_mmu(tlb);

        tlb->page_size = page_size;
#endif
}

/*
 * Return the shift of the lowest page-table level at which entries were
 * cleared (PTE before PMD before PUD before P4D); PAGE_SHIFT if none of the
 * cleared_* bits is set.
 */
static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb)
{
        unsigned long shift = PAGE_SHIFT;

        if (!tlb->cleared_ptes) {
                if (tlb->cleared_pmds)
                        shift = PMD_SHIFT;
                else if (tlb->cleared_puds)
                        shift = PUD_SHIFT;
                else if (tlb->cleared_p4ds)
                        shift = P4D_SHIFT;
        }

        return shift;
}

/* Size in bytes corresponding to tlb_get_unmap_shift(). */
static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb)
{
        const unsigned long shift = tlb_get_unmap_shift(tlb);

        return 1UL << shift;
}

/*
 * In the case of tlb vma handling, we can optimise these away in the
 * case where we're doing a full MM flush.  When we're doing a munmap,
 * the vmas are adjusted to only cover the region to be torn down.
 *
 * Otherwise record the VMA properties and, unless MMU_GATHER_NO_FLUSH_CACHE
 * is set, flush the cache for the VMA's range.
 */
static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        if (!tlb->fullmm) {
                tlb_update_vma_flags(tlb, vma);
#ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE
                flush_cache_range(vma, vma->vm_start, vma->vm_end);
#endif
        }
}

/*
 * Called when the walk leaves @vma. Nothing to do for a full-mm gather;
 * otherwise decide whether the TLB must be flushed at this VMA boundary.
 */
static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        if (tlb->fullmm)
                return;

        /*
         * Ranges may only be merged across VMAs when the architecture asked
         * for it (MMU_GATHER_MERGE_VMAS) and this is not a PFN mapping.
         *
         * VM_PFNMAP is more fragile because the core mm will not track the
         * page mapcount -- there might not be page-frames for these PFNs after
         * all. Force flush TLBs for such ranges to avoid munmap() vs
         * unmap_mapping_range() races.
         */
        if (!tlb->vma_pfn && IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
                return;

        /*
         * Do a TLB flush and reset the range at VMA boundaries; this avoids
         * the ranges growing with the unused space between consecutive VMAs.
         */
        tlb_flush_mmu_tlbonly(tlb);
}

/*
 * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end,
 * and set corresponding cleared_*.
 */
static inline void tlb_flush_pte_range(struct mmu_gather *tlb,
                                     unsigned long address, unsigned long size)
{
        /* PTE level: mark it, then widen the pending range. */
        tlb->cleared_ptes = 1;
        __tlb_adjust_range(tlb, address, size);
}

/* Note that PMD-level entries were cleared and widen the pending range. */
static inline void tlb_flush_pmd_range(struct mmu_gather *tlb,
                                     unsigned long address, unsigned long size)
{
        tlb->cleared_pmds = 1;
        __tlb_adjust_range(tlb, address, size);
}

/* Note that PUD-level entries were cleared and widen the pending range. */
static inline void tlb_flush_pud_range(struct mmu_gather *tlb,
                                     unsigned long address, unsigned long size)
{
        tlb->cleared_puds = 1;
        __tlb_adjust_range(tlb, address, size);
}

/* Note that P4D-level entries were cleared and widen the pending range. */
static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
                                     unsigned long address, unsigned long size)
{
        tlb->cleared_p4ds = 1;
        __tlb_adjust_range(tlb, address, size);
}

#ifndef __tlb_remove_tlb_entry
/* Default per-entry hook: empty unless __tlb_remove_tlb_entry is a macro. */
static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address)
{
        /* nothing to do */
}
#endif

/**
 * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation.
 *
 * Record the fact that pte's were really unmapped by updating the range,
 * so we can later optimise away the tlb invalidate.   This helps when
 * userspace is unmapping already-unmapped pages, which happens quite a lot.
 *
 * The range is extended by PAGE_SIZE at @address before the arch hook
 * __tlb_remove_tlb_entry() runs. Being a macro, @tlb and @address are each
 * expanded twice, so arguments should be free of side effects.
 */
#define tlb_remove_tlb_entry(tlb, ptep, address)                \
        do {                                                        \
                tlb_flush_pte_range(tlb, address, PAGE_SIZE);        \
                __tlb_remove_tlb_entry(tlb, ptep, address);        \
        } while (0)

/**
 * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for
 *                            later tlb invalidation.
 *
 * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple
 * consecutive ptes instead of only a single one.
 *
 * @nr must be at least 1: the hook is called once before @nr is tested, and
 * decrementing an unsigned 0 wraps around instead of terminating the loop.
 */
static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb,
                pte_t *ptep, unsigned int nr, unsigned long address)
{
        /* One range update covering all @nr pages. */
        tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr);
        /* Then the arch hook once per pte, advancing one page at a time. */
        for (;;) {
                __tlb_remove_tlb_entry(tlb, ptep, address);
                if (--nr == 0)
                        break;
                ptep++;
                address += PAGE_SIZE;
        }
}

/*
 * Record the unmapping of one hugetlb entry of hstate @h. The range of
 * huge_page_size(h) bytes is added at the page-table level matching that size
 * (P4D, PUD, PMD, otherwise PTE), then the arch hook __tlb_remove_tlb_entry()
 * is invoked. @tlb and @address are expanded more than once.
 */
#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)        \
        do {                                                        \
                unsigned long _sz = huge_page_size(h);                \
                if (_sz >= P4D_SIZE)                                \
                        tlb_flush_p4d_range(tlb, address, _sz);        \
                else if (_sz >= PUD_SIZE)                        \
                        tlb_flush_pud_range(tlb, address, _sz);        \
                else if (_sz >= PMD_SIZE)                        \
                        tlb_flush_pmd_range(tlb, address, _sz);        \
                else                                                \
                        tlb_flush_pte_range(tlb, address, _sz);        \
                __tlb_remove_tlb_entry(tlb, ptep, address);        \
        } while (0)

/**
 * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation
 *
 * The range of HPAGE_PMD_SIZE at @address is always recorded. The arch hook
 * __tlb_remove_pmd_tlb_entry() is a nop so far unless the architecture
 * defines it, because only x86 needs it.
 */
#ifndef __tlb_remove_pmd_tlb_entry
#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0)
#endif

#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)                        \
        do {                                                                \
                tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE);        \
                __tlb_remove_pmd_tlb_entry(tlb, pmdp, address);                \
        } while (0)

/**
 * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb
 * invalidation.
 *
 * The range of HPAGE_PUD_SIZE at @address is always recorded. The arch hook
 * __tlb_remove_pud_tlb_entry() is a nop so far unless the architecture
 * defines it, because only x86 needs it.
 */
#ifndef __tlb_remove_pud_tlb_entry
#define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0)
#endif

#define tlb_remove_pud_tlb_entry(tlb, pudp, address)                        \
        do {                                                                \
                tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE);        \
                __tlb_remove_pud_tlb_entry(tlb, pudp, address);                \
        } while (0)

/*
 * For things like page tables caches (ie caching addresses "inside" the
 * page tables, like x86 does), for legacy reasons, flushing an
 * individual page had better flush the page table caches behind it. This
 * is definitely how x86 works, for example. And if you have an
 * architected non-legacy page table cache (which I'm not aware of
 * anybody actually doing), you're going to have some architecturally
 * explicit flushing for that, likely *separate* from a regular TLB entry
 * flush, and thus you'd need more than just some range expansion..
 *
 * So if we ever find an architecture
 * that would want something that odd, I think it is up to that
 * architecture to do its own odd thing, not cause pain for others
 * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com
 *
 * For now w.r.t page table cache, mark the range_size as PAGE_SIZE
 */

/*
 * Default pte_free_tlb(): record a PAGE_SIZE range at the PMD level, mark
 * that page tables were freed and hand the table to __pte_free_tlb().
 * @tlb is parenthesized before "->" so that any pointer expression (for
 * example "&gather") may be passed.
 */
#ifndef pte_free_tlb
#define pte_free_tlb(tlb, ptep, address)                        \
        do {                                                        \
                tlb_flush_pmd_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pte_free_tlb(tlb, ptep, address);                \
        } while (0)
#endif

/*
 * Default pmd_free_tlb(): record a PAGE_SIZE range at the PUD level, mark
 * that page tables were freed and hand the table to __pmd_free_tlb().
 * @tlb is parenthesized before "->" so that any pointer expression (for
 * example "&gather") may be passed.
 */
#ifndef pmd_free_tlb
#define pmd_free_tlb(tlb, pmdp, address)                        \
        do {                                                        \
                tlb_flush_pud_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pmd_free_tlb(tlb, pmdp, address);                \
        } while (0)
#endif

/*
 * Default pud_free_tlb(): record a PAGE_SIZE range at the P4D level, mark
 * that page tables were freed and hand the table to __pud_free_tlb().
 * @tlb is parenthesized before "->" so that any pointer expression (for
 * example "&gather") may be passed.
 */
#ifndef pud_free_tlb
#define pud_free_tlb(tlb, pudp, address)                        \
        do {                                                        \
                tlb_flush_p4d_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pud_free_tlb(tlb, pudp, address);                \
        } while (0)
#endif

/*
 * Default p4d_free_tlb(): widen the range by PAGE_SIZE without setting any
 * cleared_* bit, mark that page tables were freed and hand the table to
 * __p4d_free_tlb(). @tlb is parenthesized before "->" so that any pointer
 * expression (for example "&gather") may be passed.
 */
#ifndef p4d_free_tlb
#define p4d_free_tlb(tlb, p4dp, address)                        \
        do {                                                        \
                __tlb_adjust_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __p4d_free_tlb(tlb, p4dp, address);                \
        } while (0)
#endif

#ifndef pte_needs_flush
/*
 * Default used when the architecture supplies no pte_needs_flush(): every
 * change from @oldpte to @newpte is reported as needing a flush.
 */
static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
{
        const bool needs_flush = true;

        return needs_flush;
}
#endif

#ifndef huge_pmd_needs_flush
/*
 * Default used when the architecture supplies no huge_pmd_needs_flush():
 * every change from @oldpmd to @newpmd is reported as needing a flush.
 */
static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
{
        const bool needs_flush = true;

        return needs_flush;
}
#endif

#endif /* CONFIG_MMU */

#endif /* _ASM_GENERIC__TLB_H */




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * Copyright (C) 2020 Google LLC.
 */

#ifndef _LINUX_BPF_LSM_H
#define _LINUX_BPF_LSM_H

#include <linux/sched.h>
#include <linux/bpf.h>
#include <linux/lsm_hooks.h>

#ifdef CONFIG_BPF_LSM

/*
 * Declare a bpf_lsm_<NAME>() prototype for every LSM_HOOK() entry expanded
 * from <linux/lsm_hook_defs.h>. The DEFAULT argument is not used by this
 * expansion; the helper macro is undefined again right after the include.
 */
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
        RET bpf_lsm_##NAME(__VA_ARGS__);
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK

/*
 * Per-object blob holding an RCU-annotated pointer to a bpf_local_storage.
 * bpf_inode() locates this blob inside an inode's i_security area.
 */
struct bpf_storage_blob {
        struct bpf_local_storage __rcu *storage;
};

extern struct lsm_blob_sizes bpf_lsm_blob_sizes;

int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
                        const struct bpf_prog *prog);

bool bpf_lsm_is_sleepable_hook(u32 btf_id);
bool bpf_lsm_is_trusted(const struct bpf_prog *prog);

/*
 * Return the BPF storage blob of @inode, found lbs_inode bytes into its
 * i_security area, or NULL when the inode has no security blob.
 */
static inline struct bpf_storage_blob *bpf_inode(
        const struct inode *inode)
{
        void *security = inode->i_security;

        if (unlikely(!security))
                return NULL;

        return security + bpf_lsm_blob_sizes.lbs_inode;
}

extern const struct bpf_func_proto bpf_inode_storage_get_proto;
extern const struct bpf_func_proto bpf_inode_storage_delete_proto;
void bpf_inode_storage_free(struct inode *inode);

void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func);

#else /* !CONFIG_BPF_LSM */

/* !CONFIG_BPF_LSM stub: no hook is reported as sleepable. */
static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id)
{
        const bool sleepable = false;

        return sleepable;
}

/* !CONFIG_BPF_LSM stub: always returns false. */
static inline bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
{
        const bool trusted = false;

        return trusted;
}

/* !CONFIG_BPF_LSM stub: verification always fails with -EOPNOTSUPP. */
static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
                                      const struct bpf_prog *prog)
{
        const int err = -EOPNOTSUPP;

        return err;
}

/* !CONFIG_BPF_LSM stub: there is no BPF blob, so always NULL. */
static inline struct bpf_storage_blob *bpf_inode(
        const struct inode *inode)
{
        struct bpf_storage_blob *blob = NULL;

        return blob;
}

/* !CONFIG_BPF_LSM stub: empty. */
static inline void bpf_inode_storage_free(struct inode *inode)
{
        /* nothing to do */
}

/* !CONFIG_BPF_LSM stub: empty; *@bpf_func is left untouched. */
static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
                                           bpf_func_t *bpf_func)
{
        /* nothing to do */
}

#endif /* CONFIG_BPF_LSM */

#endif /* _LINUX_BPF_LSM_H */































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_COUNTER_H
#define _LINUX_PERCPU_COUNTER_H
/*
 * A simple "approximate counter" for use in ext2 and ext3 superblocks.
 *
 * WARNING: these things are HUGE.  4 kbytes per counter on 32-way P4.
 */

#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/list.h>
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/types.h>

/* percpu_counter batch for local add or sub */
#define PERCPU_COUNTER_LOCAL_BATCH        INT_MAX

#ifdef CONFIG_SMP

/*
 * SMP layout: a central s64 @count plus a per-CPU array of s32 values
 * (@counters). percpu_counter_read() returns @count only.
 * NOTE(review): @lock presumably serialises updates of @count -- confirm in
 * the out-of-line implementation.
 */
struct percpu_counter {
        raw_spinlock_t lock;
        s64 count;
#ifdef CONFIG_HOTPLUG_CPU
        struct list_head list;        /* All percpu_counters are on a list */
#endif
        s32 __percpu *counters;
};

extern int percpu_counter_batch;

int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
                               gfp_t gfp, u32 nr_counters,
                               struct lock_class_key *key);

/*
 * Initialise @nr_counters counters starting at @fbc to @value. Each expansion
 * site gets its own static lock_class_key, which is passed on to
 * __percpu_counter_init_many(); the macro evaluates to that function's int
 * return value.
 */
#define percpu_counter_init_many(fbc, value, gfp, nr_counters)                \
        ({                                                                \
                static struct lock_class_key __key;                        \
                                                                        \
                __percpu_counter_init_many(fbc, value, gfp, nr_counters,\
                                           &__key);                        \
        })


/* Single-counter form of percpu_counter_init_many(). */
#define percpu_counter_init(fbc, value, gfp)                                \
        percpu_counter_init_many(fbc, value, gfp, 1)

void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters);
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
        percpu_counter_destroy_many(fbc, 1);
}

void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
                              s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit,
                                  s64 amount, s32 batch);
void percpu_counter_sync(struct percpu_counter *fbc);

/* Compare @fbc with @rhs using the global percpu_counter_batch. */
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
        int cmp = __percpu_counter_compare(fbc, rhs, percpu_counter_batch);

        return cmp;
}

/* Add @amount to @fbc using the global percpu_counter_batch. */
static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
        const s32 batch = percpu_counter_batch;

        percpu_counter_add_batch(fbc, amount, batch);
}

/*
 * Forward to __percpu_counter_limited_add() with the global
 * percpu_counter_batch and return its result.
 */
static inline bool
percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
{
        const s32 batch = percpu_counter_batch;

        return __percpu_counter_limited_add(fbc, limit, amount, batch);
}

/*
 * percpu_counter_add_local() and percpu_counter_sub_local() accumulate in
 * the local per cpu counter and only fold into fbc->count once the local
 * count overflows PERCPU_COUNTER_LOCAL_BATCH, which makes counter writes
 * efficient.
 * The price is that percpu_counter_sum(), not percpu_counter_read(), has to
 * be used to add up the counts from each CPU and account for all the local
 * counts. Use these helpers for counters that are updated frequently and
 * read rarely.
 */
static inline void
percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
{
        const s32 batch = PERCPU_COUNTER_LOCAL_BATCH;

        percpu_counter_add_batch(fbc, amount, batch);
}

/* __percpu_counter_sum() of @fbc, with negative results clamped to 0. */
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
        s64 sum = __percpu_counter_sum(fbc);

        if (sum < 0)
                return 0;
        return sum;
}

/* Return __percpu_counter_sum() of @fbc unchanged. */
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
        s64 sum = __percpu_counter_sum(fbc);

        return sum;
}

/*
 * Return the central fbc->count only; the per-CPU counters are not
 * consulted here.
 */
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
        s64 count = fbc->count;

        return count;
}

/*
 * percpu_counter_read() may return a small negative number for a counter
 * that should never be negative; this variant clamps such values to 0.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
        /* Prevent reloads of fbc->count */
        s64 snapshot = READ_ONCE(fbc->count);

        return snapshot >= 0 ? snapshot : 0;
}

static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
        return (fbc->counters != NULL);
}

#else /* !CONFIG_SMP */

/* !CONFIG_SMP layout: a single s64 value, no lock and no per-CPU storage. */
struct percpu_counter {
        s64 count;
};

/*
 * UP version: set each of the @nr_counters counters starting at @fbc to
 * @amount. @gfp is unused because nothing is allocated; always returns 0.
 */
static inline int percpu_counter_init_many(struct percpu_counter *fbc,
                                           s64 amount, gfp_t gfp,
                                           u32 nr_counters)
{
        struct percpu_counter *cur = fbc;
        struct percpu_counter *end = fbc + nr_counters;

        while (cur != end) {
                cur->count = amount;
                cur++;
        }

        return 0;
}

/* UP version: initialise the single counter @fbc to @amount. */
static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
                                      gfp_t gfp)
{
        const u32 nr_counters = 1;

        return percpu_counter_init_many(fbc, amount, gfp, nr_counters);
}

/* UP version: empty, nothing was allocated at init time. */
static inline void percpu_counter_destroy_many(struct percpu_counter *fbc,
                                               u32 nr_counters)
{
        /* nothing to do */
}

/* UP version: empty, nothing was allocated at init time. */
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
        /* nothing to do */
}

/* UP version: overwrite the counter value with @amount. */
static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
        s64 *count = &fbc->count;

        *count = amount;
}

/*
 * UP version: three-way compare of the counter with @rhs.
 * Returns 1 if the counter is greater, -1 if it is smaller, 0 if equal.
 */
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
        if (fbc->count > rhs)
                return 1;
        if (fbc->count < rhs)
                return -1;
        return 0;
}

/* UP version: @batch is ignored; same result as percpu_counter_compare(). */
static inline int
__percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
        int cmp = percpu_counter_compare(fbc, rhs);

        return cmp;
}

/* UP version: add @amount to the counter with local interrupts disabled. */
static inline void
percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
        unsigned long irq_flags;

        local_irq_save(irq_flags);
        fbc->count = fbc->count + amount;
        local_irq_restore(irq_flags);
}

/*
 * UP version: add @amount only if the result stays within @limit, i.e. does
 * not exceed it for a positive @amount and does not drop below it for a
 * negative one. A zero @amount always succeeds. Returns true when the
 * counter was updated (or @amount was 0), false otherwise.
 */
static inline bool
percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
{
        unsigned long irq_flags;
        bool within_limit;
        s64 sum;

        if (!amount)
                return true;

        local_irq_save(irq_flags);
        sum = fbc->count + amount;
        /* amount is non-zero here, so its sign picks the direction. */
        within_limit = amount > 0 ? sum <= limit : sum >= limit;
        if (within_limit)
                fbc->count = sum;
        local_irq_restore(irq_flags);

        return within_limit;
}

/* non-SMP percpu_counter_add_local is the same as percpu_counter_add */
static inline void
percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
{
        s64 delta = amount;

        percpu_counter_add(fbc, delta);
}

/* UP version: @batch is ignored; same effect as percpu_counter_add(). */
static inline void
percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        s64 delta = amount;

        percpu_counter_add(fbc, delta);
}

/* UP version: return the counter value. */
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
        s64 count = fbc->count;

        return count;
}

/*
 * percpu_counter is intended to track positive numbers. In the UP case the
 * number should never be negative, so the value is returned without
 * clamping.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
        s64 count = fbc->count;

        return count;
}

/* UP version: same result as percpu_counter_read_positive(). */
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
        s64 sum = percpu_counter_read_positive(fbc);

        return sum;
}

/* UP version: same result as percpu_counter_read(). */
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
        s64 sum = percpu_counter_read(fbc);

        return sum;
}

/* UP version: there is no per-CPU storage to check, so always true. */
static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
        const bool initialized = true;

        return initialized;
}

/* UP version: empty. */
static inline void percpu_counter_sync(struct percpu_counter *fbc)
{
        /* nothing to do */
}
#endif        /* CONFIG_SMP */

static inline void percpu_counter_inc(struct percpu_counter *fbc)
{
        percpu_counter_add(fbc, 1);
}

static inline void percpu_counter_dec(struct percpu_counter *fbc)
{
        percpu_counter_add(fbc, -1);
}

/* Subtract @amount from @fbc by adding its negation. */
static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
{
        s64 delta = -amount;

        percpu_counter_add(fbc, delta);
}

/* Subtract @amount from @fbc by passing its negation to percpu_counter_add_local(). */
static inline void
percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
{
        s64 delta = -amount;

        percpu_counter_add_local(fbc, delta);
}

#endif /* _LINUX_PERCPU_COUNTER_H */

























































































































































   38 















   28 













   31 


































   32 










   10 

























































   23 










   12 
   12 










    5 
    5 













































































    2 




















    5 








































    1 





























































































    2 











    1 













    6 
    3 
































































































































































































































































































































































































































   14 
    6 











    3 













    8 
















   24 
   22 


















































































    4 
    1 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

#include <linux/container_of.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/poison.h>
#include <linux/const.h>

#include <asm/barrier.h>

/*
 * Circular doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

/*
 * Static initializer for the list head variable @name: both initializer
 * slots hold the address of @name itself, i.e. the list starts out empty.
 */
#define LIST_HEAD_INIT(name) { &(name), &(name) }

/*
 * Define a struct list_head variable called @name and initialize it with
 * LIST_HEAD_INIT(), so it is usable as an empty list right away.
 */
#define LIST_HEAD(name) \
        struct list_head name = LIST_HEAD_INIT(name)

/**
 * INIT_LIST_HEAD - Initialize a list_head structure
 * @list: list_head structure to be initialized.
 *
 * Initializes the list_head to point to itself.  If it is a list header,
 * the result is an empty list.
 *
 * Both pointers are stored with WRITE_ONCE(); list_empty() in this header
 * reads ->next with READ_ONCE().
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
        WRITE_ONCE(list->next, list);
        WRITE_ONCE(list->prev, list);
}

#ifdef CONFIG_LIST_HARDENED

/*
 * Attributes applied to the out-of-line *_valid_or_report() functions
 * declared below: none when CONFIG_DEBUG_LIST is set, otherwise
 * __cold __preserve_most.
 */
#ifdef CONFIG_DEBUG_LIST
# define __list_valid_slowpath
#else
# define __list_valid_slowpath __cold __preserve_most
#endif

/*
 * Performs the full set of list corruption checks before __list_add().
 * On list corruption, reports a warning and returns false.
 */
extern bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new,
                                                             struct list_head *prev,
                                                             struct list_head *next);

/*
 * Sanity-check an insertion point before __list_add() links @new between
 * @prev and @next.  Returns true when no corruption was found and false
 * otherwise.
 *
 * With CONFIG_DEBUG_LIST the verdict comes entirely from the out-of-line
 * __list_add_valid_or_report().  With CONFIG_LIST_HARDENED only, a minimal
 * set of integrity checks runs inline to catch non-faulting corruptions,
 * and the reporting function is called only after one of them has failed.
 */
static __always_inline bool __list_add_valid(struct list_head *new,
                                             struct list_head *prev,
                                             struct list_head *next)
{
        if (IS_ENABLED(CONFIG_DEBUG_LIST))
                return __list_add_valid_or_report(new, prev, next);

        /*
         * Hardened-only build.  NULL checks on @next and @prev are left
         * out: both are dereferenced right here, which faults on NULL.
         *
         * Keeping the reduced checks inline also lets the compiler drop
         * any of them it can prove at compile time.
         */
        if (likely(next->prev == prev && prev->next == next && new != prev && new != next))
                return true;

        /*
         * A pre-condition failed: the slow path reports which one.  The
         * insertion is refused whatever the report function returns.
         */
        __list_add_valid_or_report(new, prev, next);
        return false;
}

/*
 * Performs the full set of list corruption checks before __list_del_entry().
 * On list corruption, reports a warning and returns false.
 */
extern bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry);

/*
 * Sanity-check @entry before __list_del_entry() unlinks it.  Returns true
 * when no corruption was found and false otherwise.
 *
 * With CONFIG_DEBUG_LIST the verdict comes entirely from the out-of-line
 * __list_del_entry_valid_or_report().  With CONFIG_LIST_HARDENED only, a
 * minimal integrity check runs inline to catch non-faulting corruptions,
 * and the reporting function is called only after that check has failed.
 */
static __always_inline bool __list_del_entry_valid(struct list_head *entry)
{
        struct list_head *before, *after;

        if (IS_ENABLED(CONFIG_DEBUG_LIST))
                return __list_del_entry_valid_or_report(entry);

        before = entry->prev;
        after = entry->next;

        /*
         * Hardened-only build.  Checks for NULL, LIST_POISON1 and
         * LIST_POISON2 are left out: both neighbours are dereferenced
         * right here, which faults on such values.
         */
        if (likely(before->next == entry && after->prev == entry))
                return true;

        /*
         * Corruption detected: have the slow path report it.  The deletion
         * is refused whatever the report function returns.
         */
        __list_del_entry_valid_or_report(entry);
        return false;
}
#else
/*
 * CONFIG_LIST_HARDENED is off: no checking is done and every insertion is
 * reported as valid.
 */
static inline bool __list_add_valid(struct list_head *new,
                                struct list_head *prev,
                                struct list_head *next)
{
        return true;
}
/*
 * CONFIG_LIST_HARDENED is off: no checking is done and every deletion is
 * reported as valid.
 */
static inline bool __list_del_entry_valid(struct list_head *entry)
{
        return true;
}
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * Nothing is modified when __list_add_valid() rejects the insertion.
 */
static inline void __list_add(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next)
{
        if (!__list_add_valid(new, prev, next))
                return;

        next->prev = new;
        new->next = next;
        new->prev = prev;
        /*
         * Written last, with WRITE_ONCE(): in program order @new's own
         * pointers are set before prev->next is redirected to it.
         */
        WRITE_ONCE(prev->next, new);
}

/**
 * list_add - add a new entry
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Links @new in directly behind @head, in front of whatever used to be the
 * first entry.  Repeated use yields last-in, first-out order, which makes
 * this the natural primitive for stacks.
 */
static inline void list_add(struct list_head *new, struct list_head *head)
{
        struct list_head *first = head->next;

        __list_add(new, head, first);
}


/**
 * list_add_tail - add a new entry
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Links @new in directly in front of @head, behind whatever used to be the
 * last entry.  Repeated use yields first-in, first-out order, which makes
 * this the natural primitive for queues.
 */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
        struct list_head *last = head->prev;

        __list_add(new, last, head);
}

/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * The removed entry itself is not touched here; only its two neighbours
 * are rewritten.
 */
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
        next->prev = prev;
        /* The forward pointer is updated with WRITE_ONCE(). */
        WRITE_ONCE(prev->next, next);
}

/*
 * Unlink @entry and set its 'prev' pointer to NULL.
 *
 * Special-purpose variant used by the networking code for lists allocated
 * as per-cpu, where the extra WRITE_ONCE() overhead of a regular
 * list_del_init() is unwanted.  Users must test the node's 'prev' pointer
 * instead of calling list_empty().
 *
 * No validity check is performed, and entry->next is left as it was.
 */
static inline void __list_del_clearprev(struct list_head *entry)
{
        struct list_head *before = entry->prev;
        struct list_head *after = entry->next;

        __list_del(before, after);
        entry->prev = NULL;
}

/*
 * Unlink @entry from its list, provided __list_del_entry_valid() accepts
 * it; otherwise nothing is modified.  @entry's own pointers are not
 * changed here.
 */
static inline void __list_del_entry(struct list_head *entry)
{
        if (__list_del_entry_valid(entry))
                __list_del(entry->prev, entry->next);
}

/**
 * list_del - deletes entry from list.
 * @entry: the element to delete from the list.
 *
 * Note: list_empty() on entry does not return true after this, the entry is
 * in an undefined state: its pointers are overwritten with LIST_POISON1
 * (next) and LIST_POISON2 (prev).
 */
static inline void list_del(struct list_head *entry)
{
        __list_del_entry(entry);
        entry->next = LIST_POISON1;
        entry->prev = LIST_POISON2;
}

/**
 * list_replace - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 *
 * Unless @old is empty, its own pointers are not modified.  The statements
 * are order dependent: each neighbour is reached through @new after the
 * preceding store.  When @old is an empty list (pointing at itself), @new
 * therefore ends up pointing at itself rather than at @old.
 */
static inline void list_replace(struct list_head *old,
                                struct list_head *new)
{
        new->next = old->next;
        new->next->prev = new;
        new->prev = old->prev;
        new->prev->next = new;
}

/**
 * list_replace_init - replace old entry by new one and initialize the old one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 *
 * After the replacement @old is reinitialized with INIT_LIST_HEAD(), so it
 * is left as an empty list.
 */
static inline void list_replace_init(struct list_head *old,
                                     struct list_head *new)
{
        list_replace(old, new);
        INIT_LIST_HEAD(old);
}

/**
 * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position
 * @entry1: the location to place entry2
 * @entry2: the location to place entry1
 *
 * @entry2 is unlinked and put where @entry1 was; @entry1 is then inserted
 * behind the entry that originally preceded @entry2.
 */
static inline void list_swap(struct list_head *entry1,
                             struct list_head *entry2)
{
        /* Remember where @entry2 sat before it is unlinked. */
        struct list_head *anchor = entry2->prev;

        list_del(entry2);
        list_replace(entry1, entry2);

        /*
         * If @entry1 itself was the predecessor of @entry2, @entry2 now
         * occupies that spot, so insert behind @entry2 instead.
         */
        list_add(entry1, anchor == entry1 ? entry2 : anchor);
}

/**
 * list_del_init - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 *
 * Unlike list_del(), @entry is left pointing at itself via
 * INIT_LIST_HEAD(), so list_empty() on it returns true afterwards.
 */
static inline void list_del_init(struct list_head *entry)
{
        __list_del_entry(entry);
        INIT_LIST_HEAD(entry);
}

/**
 * list_move - delete from one list and add as another's head
 * @list: the entry to move
 * @head: the head that will precede our entry
 *
 * @list is unlinked with __list_del_entry() and then inserted directly
 * after @head with list_add().
 */
static inline void list_move(struct list_head *list, struct list_head *head)
{
        __list_del_entry(list);
        list_add(list, head);
}

/**
 * list_move_tail - delete from one list and add as another's tail
 * @list: the entry to move
 * @head: the head that will follow our entry
 *
 * @list is unlinked with __list_del_entry() and then inserted directly
 * before @head with list_add_tail().
 */
static inline void list_move_tail(struct list_head *list,
                                  struct list_head *head)
{
        __list_del_entry(list);
        list_add_tail(list, head);
}

/**
 * list_bulk_move_tail - move a subsection of a list to its tail
 * @head: the head that will follow our entry
 * @first: first entry to move
 * @last: last entry to move, can be the same as first
 *
 * Move all entries between @first and including @last before @head.
 * All three entries must belong to the same linked list.
 */
static inline void list_bulk_move_tail(struct list_head *head,
                                       struct list_head *first,
                                       struct list_head *last)
{
        /* Close the gap: join the entry before @first to the one after @last. */
        first->prev->next = last->next;
        last->next->prev = first->prev;

        /* Hang @first behind the current last entry of the list. */
        head->prev->next = first;
        first->prev = head->prev;

        /* Make @last the new final entry, directly in front of @head. */
        last->next = head;
        head->prev = last;
}

/**
 * list_is_first - tests whether @list is the first entry in list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Return: non-zero if @list->prev is @head, zero otherwise.
 */
static inline int list_is_first(const struct list_head *list, const struct list_head *head)
{
        return list->prev == head;
}

/**
 * list_is_last - tests whether @list is the last entry in list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Return: non-zero if @list->next is @head, zero otherwise.
 */
static inline int list_is_last(const struct list_head *list, const struct list_head *head)
{
        return list->next == head;
}

/**
 * list_is_head - tests whether @list is the list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Return: non-zero if the two pointers are equal, zero otherwise.  Neither
 * pointer is dereferenced.
 */
static inline int list_is_head(const struct list_head *list, const struct list_head *head)
{
        return list == head;
}

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 *
 * Only @head->next is examined, and it is read with READ_ONCE().
 *
 * Return: non-zero if @head->next points back at @head, zero otherwise.
 */
static inline int list_empty(const struct list_head *head)
{
        return READ_ONCE(head->next) == head;
}

/**
 * list_del_init_careful - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 *
 * This is the same as list_del_init(), except designed to be used
 * together with list_empty_careful() in a way to guarantee ordering
 * of other memory operations.
 *
 * Any memory operations done before a list_del_init_careful() are
 * guaranteed to be visible after a list_empty_careful() test.
 */
static inline void list_del_init_careful(struct list_head *entry)
{
        __list_del_entry(entry);
        WRITE_ONCE(entry->prev, entry);
        /*
         * ->next is stored last, with release semantics; it is the field
         * list_empty_careful() loads with smp_load_acquire().
         */
        smp_store_release(&entry->next, entry);
}

/**
 * list_empty_careful - tests whether a list is empty and not being modified
 * @head: the list to test
 *
 * Description:
 * tests whether a list is empty _and_ checks that no other CPU might be
 * in the process of modifying either member (next or prev)
 *
 * NOTE: using list_empty_careful() without synchronization
 * can only be safe if the only activity that can happen
 * to the list entry is list_del_init(). Eg. it cannot be used
 * if another CPU could re-list_add() it.
 *
 * Return: non-zero only if both @head->next and @head->prev point back at
 * @head, zero otherwise.
 */
static inline int list_empty_careful(const struct list_head *head)
{
        /* Acquire load of ->next; list_del_init_careful() stores it with release. */
        struct list_head *next = smp_load_acquire(&head->next);
        return list_is_head(next, head) && (next == READ_ONCE(head->prev));
}

/**
 * list_rotate_left - rotate the list to the left
 * @head: the head of the list
 *
 * The current first entry is moved to the tail, so the former second entry
 * becomes the first.  An empty list is left untouched.
 */
static inline void list_rotate_left(struct list_head *head)
{
        if (list_empty(head))
                return;

        list_move_tail(head->next, head);
}

/**
 * list_rotate_to_front() - Rotate list to specific item.
 * @list: The desired new front of the list.
 * @head: The head of the list.
 *
 * Rotates list so that @list becomes the new front of the list.
 * The entries themselves are not moved; only @head is relinked.
 */
static inline void list_rotate_to_front(struct list_head *list,
                                        struct list_head *head)
{
        /*
         * Deletes the list head from the list denoted by @head and
         * places it as the tail of @list, this effectively rotates the
         * list so that @list is at the front.
         */
        list_move_tail(head, list);
}

/**
 * list_is_singular - tests whether a list has just one entry.
 * @head: the list to test.
 *
 * Return: non-zero if the list is not empty and its first and last entries
 * are the same node, zero otherwise.
 */
static inline int list_is_singular(const struct list_head *head)
{
        if (list_empty(head))
                return 0;

        return head->next == head->prev;
}

/*
 * Move the entries from head->next up to and including @entry onto @list,
 * leaving @head with whatever followed @entry.  Previous contents of @list
 * are not consulted; its pointers are simply overwritten.
 *
 * Helper for list_cut_position(), which only calls this when @head is not
 * empty and @entry is not @head itself.
 */
static inline void __list_cut_position(struct list_head *list,
                struct list_head *head, struct list_head *entry)
{
        /* Saved first: entry->next is overwritten below. */
        struct list_head *new_first = entry->next;
        /* @list takes over the front segment head->next .. @entry. */
        list->next = head->next;
        list->next->prev = list;
        list->prev = entry;
        entry->next = list;
        /* @head keeps the remainder, starting at the saved successor. */
        head->next = new_first;
        new_first->prev = head;
}

/**
 * list_cut_position - cut a list into two
 * @list: a new list to add all removed entries
 * @head: a list with entries
 * @entry: an entry within head, could be the head itself
 *        and if so we won't cut the list
 *
 * Transfers the leading part of @head, from its first entry through
 * @entry inclusive, onto @list.  @entry should be an element known to be
 * on @head.  Whatever @list held before is lost, so pass an empty list or
 * one whose contents no longer matter.
 *
 * Nothing is done when @head is empty, or when @head has a single entry
 * and @entry is neither that entry nor @head.  When @entry is @head,
 * @list is merely initialised as an empty list.
 */
static inline void list_cut_position(struct list_head *list,
                struct list_head *head, struct list_head *entry)
{
        if (list_empty(head))
                return;

        /* Cutting at the head itself moves nothing. */
        if (list_is_head(entry, head)) {
                INIT_LIST_HEAD(list);
                return;
        }

        /* One-entry list and @entry is not that entry: leave it alone. */
        if (list_is_singular(head) && entry != head->next)
                return;

        __list_cut_position(list, head, entry);
}

/**
 * list_cut_before - cut a list into two, before given entry
 * @list: a new list to add all removed entries
 * @head: a list with entries
 * @entry: an entry within head, could be the head itself
 *
 * This helper moves the initial part of @head, up to but
 * excluding @entry, from @head to @list.  You should pass
 * in @entry an element you know is on @head.  @list should
 * be an empty list or a list you do not care about losing
 * its data.
 * If @entry == @head, all entries on @head are moved to
 * @list.
 */
static inline void list_cut_before(struct list_head *list,
                                   struct list_head *head,
                                   struct list_head *entry)
{
        /* @entry is already first: nothing precedes it, @list becomes empty. */
        if (head->next == entry) {
                INIT_LIST_HEAD(list);
                return;
        }
        /* @list takes over head->next .. entry->prev. */
        list->next = head->next;
        list->next->prev = list;
        list->prev = entry->prev;
        list->prev->next = list;
        /* @head now starts directly at @entry. */
        head->next = entry;
        entry->prev = head;
}

/*
 * Link the whole chain of entries held by @list in between @prev and
 * @next.  @list itself is only read; its own pointers are not updated.
 * The callers in this header only invoke this for a non-empty @list.
 */
static inline void __list_splice(const struct list_head *list,
                                 struct list_head *prev,
                                 struct list_head *next)
{
        struct list_head *chain_head = list->next;
        struct list_head *chain_tail = list->prev;

        /* Attach the front of the chain behind @prev. */
        chain_head->prev = prev;
        prev->next = chain_head;

        /* Attach the back of the chain in front of @next. */
        chain_tail->next = next;
        next->prev = chain_tail;
}

/**
 * list_splice - join two lists, this is designed for stacks
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted directly after @head.  An empty @list
 * is a no-op.  @list itself is not reinitialised.
 */
static inline void list_splice(const struct list_head *list,
                                struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head, head->next);
}

/**
 * list_splice_tail - join two lists, each list being a queue
 * @list: the new list to add; it is only read, never written through.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted directly before @head, i.e. at the
 * tail of the list @head belongs to.  An empty @list is a no-op.  @list
 * itself is not reinitialised.
 *
 * @list is const-qualified, matching list_splice() and __list_splice().
 */
static inline void list_splice_tail(const struct list_head *list,
                                struct list_head *head)
{
        if (!list_empty(list))
                __list_splice(list, head->prev, head);
}

/**
 * list_splice_init - join two lists and reinitialise the emptied list.
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted directly after @head, and @list is
 * then reinitialised as an empty list.  An empty @list is left untouched.
 */
static inline void list_splice_init(struct list_head *list,
                                    struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head, head->next);
        INIT_LIST_HEAD(list);
}

/**
 * list_splice_tail_init - join two lists and reinitialise the emptied list
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * Each of the lists is a queue.  The entries of @list are inserted
 * directly before @head, and @list is then reinitialised as an empty list.
 * An empty @list is left untouched.
 */
static inline void list_splice_tail_init(struct list_head *list,
                                         struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head->prev, head);
        INIT_LIST_HEAD(list);
}

/**
 * list_entry - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Expands to container_of() with the same three arguments.
 */
#define list_entry(ptr, type, member) \
        container_of(ptr, type, member)

/**
 * list_first_entry - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note that the list is expected to be non-empty; no emptiness check is
 * made here.  See list_first_entry_or_null() for a checked variant.
 */
#define list_first_entry(ptr, type, member) \
        list_entry((ptr)->next, type, member)

/**
 * list_last_entry - get the last element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note that the list is expected to be non-empty; no emptiness check is
 * made here.
 */
#define list_last_entry(ptr, type, member) \
        list_entry((ptr)->prev, type, member)

/**
 * list_first_entry_or_null - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * @ptr is evaluated exactly once, and the head's ->next pointer is read a
 * single time with READ_ONCE().
 */
#define list_first_entry_or_null(ptr, type, member) ({ \
        struct list_head *head__ = (ptr); \
        struct list_head *pos__ = READ_ONCE(head__->next); \
        pos__ != head__ ? list_entry(pos__, type, member) : NULL; \
})

/**
 * list_next_entry - get the next element in list
 * @pos:        the type * to use as a cursor.
 * @member:        the name of the list_head within the struct.
 *
 * No check is made for the list head: if @pos is the last element, the
 * result wraps the head's list_head rather than a real element.
 */
#define list_next_entry(pos, member) \
        list_entry((pos)->member.next, typeof(*(pos)), member)

/**
 * list_next_entry_circular - get the next element in list
 * @pos:        the type * to use as a cursor.
 * @head:        the list head to take the element from.
 * @member:        the name of the list_head within the struct.
 *
 * Wraps around if @pos is the last element (returns the first element).
 * Note that the list is expected to be non-empty.
 *
 * @pos and @head each appear more than once in the expansion, so pass
 * expressions without side effects.
 */
#define list_next_entry_circular(pos, head, member) \
        (list_is_last(&(pos)->member, head) ? \
        list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member))

/**
 * list_prev_entry - get the prev element in list
 * @pos:        the type * to use as a cursor.
 * @member:        the name of the list_head within the struct.
 *
 * No check is made for the list head: if @pos is the first element, the
 * result wraps the head's list_head rather than a real element.
 */
#define list_prev_entry(pos, member) \
        list_entry((pos)->member.prev, typeof(*(pos)), member)

/**
 * list_prev_entry_circular - get the prev element in list
 * @pos:        the type * to use as a cursor.
 * @head:        the list head to take the element from.
 * @member:        the name of the list_head within the struct.
 *
 * Wraps around if @pos is the first element (returns the last element).
 * Note that the list is expected to be non-empty.
 *
 * @pos and @head each appear more than once in the expansion, so pass
 * expressions without side effects.
 */
#define list_prev_entry_circular(pos, head, member) \
        (list_is_first(&(pos)->member, head) ? \
        list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member))

/**
 * list_for_each        -        iterate over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * @pos is advanced through @pos->next after each pass, so the loop body
 * must not invalidate @pos; see list_for_each_safe() for that case.
 * @head is part of the loop condition and is evaluated on every iteration.
 */
#define list_for_each(pos, head) \
        for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next)

/**
 * list_for_each_reverse - iterate backwards over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Walks the same sequence as list_for_each_prev(); the termination test is
 * written here as a direct pointer comparison instead of list_is_head().
 */
#define list_for_each_reverse(pos, head) \
        for (pos = (head)->prev; pos != (head); pos = pos->prev)

/**
 * list_for_each_rcu - Iterate over a list in an RCU-safe fashion
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Every ->next pointer is fetched through rcu_dereference().
 * NOTE(review): rcu_dereference() is not declared by any header included
 * at the top of this file as far as visible here - confirm that users
 * pull in the RCU header themselves.
 */
#define list_for_each_rcu(pos, head)                  \
        for (pos = rcu_dereference((head)->next); \
             !list_is_head(pos, (head)); \
             pos = rcu_dereference(pos->next))

/**
 * list_for_each_continue - continue iteration over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Continue to iterate over a list, continuing after the current position.
 * The element @pos points at on entry is itself not visited.
 */
#define list_for_each_continue(pos, head) \
        for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next)

/**
 * list_for_each_prev        -        iterate over a list backwards
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Starts at @head->prev and follows ->prev pointers until @head is reached.
 */
#define list_for_each_prev(pos, head) \
        for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev)

/**
 * list_for_each_safe - iterate over a list safe against removal of list entry
 * @pos:        the &struct list_head to use as a loop cursor.
 * @n:                another &struct list_head to use as temporary storage
 * @head:        the head for your list.
 *
 * @n always holds the successor of @pos, fetched before the loop body runs,
 * so the body may unlink @pos without breaking the walk.
 */
#define list_for_each_safe(pos, n, head) \
        for (pos = (head)->next, n = pos->next; \
             !list_is_head(pos, (head)); \
             pos = n, n = pos->next)

/**
 * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
 * @pos:        the &struct list_head to use as a loop cursor.
 * @n:                another &struct list_head to use as temporary storage
 * @head:        the head for your list.
 *
 * @n always holds the predecessor of @pos, fetched before the loop body
 * runs, so the body may unlink @pos without breaking the walk.
 */
#define list_for_each_prev_safe(pos, n, head) \
        for (pos = (head)->prev, n = pos->prev; \
             !list_is_head(pos, (head)); \
             pos = n, n = pos->prev)

/**
 * list_count_nodes - count nodes in the list
 * @head:        the head for your list.
 */
static inline size_t list_count_nodes(struct list_head *head)
{
        struct list_head *pos;
        size_t count = 0;

        list_for_each(pos, head)
                count++;

        return count;
}

/**
 * list_entry_is_head - test if the entry points to the head of the list
 * @pos:        the type * to use as a cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Evaluates to non-zero when the list_head embedded in @pos is @head
 * itself, i.e. when an entry-based walk has run off the end of the list.
 *
 * @pos is parenthesized in the expansion so that an expression argument
 * (for example "base + 1") binds as a whole before ->member is applied.
 */
#define list_entry_is_head(pos, head, member)                                \
        list_is_head(&(pos)->member, (head))

/**
 * list_for_each_entry        -        iterate over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * The walk ends when the list_head embedded in @pos is @head.  At that
 * point @pos does not refer to a real element of the list.
 */
#define list_for_each_entry(pos, head, member)                                \
        for (pos = list_first_entry(head, typeof(*pos), member);        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_reverse - iterate backwards over list of given type.
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Starts at the last element and follows ->prev until the list_head
 * embedded in @pos is @head.
 */
#define list_for_each_entry_reverse(pos, head, member)                        \
        for (pos = list_last_entry(head, typeof(*pos), member);                \
             !list_entry_is_head(pos, head, member);                         \
             pos = list_prev_entry(pos, member))

/**
 * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue()
 * @pos:        the type * to use as a start point
 * @head:        the head of the list
 * @member:        the name of the list_head within the struct.
 *
 * Prepares a pos entry for use as a start point in list_for_each_entry_continue().
 *
 * A non-NULL @pos is returned unchanged.  For a NULL @pos the result is
 * list_entry() applied to @head, so that a following
 * list_for_each_entry_continue() begins with the first real element.
 */
#define list_prepare_entry(pos, head, member) \
        ((pos) ? : list_entry(head, typeof(*pos), member))

/**
 * list_for_each_entry_continue - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position.  The element @pos points at on entry is itself
 * not visited.
 */
#define list_for_each_entry_continue(pos, head, member)                 \
        for (pos = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_continue_reverse - iterate backwards from the given point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Start to iterate over list of given type backwards, continuing after
 * the current position.  The element @pos points at on entry is itself
 * not visited.
 */
#define list_for_each_entry_continue_reverse(pos, head, member)                \
        for (pos = list_prev_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_prev_entry(pos, member))

/**
 * list_for_each_entry_from - iterate over list of given type from the current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type, continuing from current position.
 *
 * There is no initialisation step: the entry @pos refers to on entry is the
 * first one visited, and the body never runs if @pos already is the head.
 */
#define list_for_each_entry_from(pos, head, member)                         \
        for (; !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_from_reverse - iterate backwards over list of given type
 *                                    from the current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate backwards over list of given type, continuing from current position.
 *
 * There is no initialisation step: the entry @pos refers to on entry is the
 * first one visited, and the body never runs if @pos already is the head.
 */
#define list_for_each_entry_from_reverse(pos, head, member)                \
        for (; !list_entry_is_head(pos, head, member);                        \
             pos = list_prev_entry(pos, member))

/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * @n is loaded with the entry following @pos before the body runs, and the
 * step expression advances from @n rather than from @pos. The body may
 * therefore unlink @pos without breaking the iteration.
 */
#define list_for_each_entry_safe(pos, n, head, member)                        \
        for (pos = list_first_entry(head, typeof(*pos), member),        \
                n = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                         \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_continue - continue list iteration safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type, continuing after current point,
 * safe against removal of list entry.
 *
 * @pos is advanced once before the first iteration, so the entry it refers
 * to on entry is not visited. @n always holds the entry after @pos.
 */
#define list_for_each_entry_safe_continue(pos, n, head, member)                 \
        for (pos = list_next_entry(pos, member),                                 \
                n = list_next_entry(pos, member);                                \
             !list_entry_is_head(pos, head, member);                                \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_from - iterate over list from current point safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type from current point, safe against
 * removal of list entry.
 *
 * @pos is not modified by the initialisation step, so the entry it refers
 * to on entry is the first one visited. @n always holds the entry after @pos.
 */
#define list_for_each_entry_safe_from(pos, n, head, member)                         \
        for (n = list_next_entry(pos, member);                                        \
             !list_entry_is_head(pos, head, member);                                \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate backwards over list of given type, safe against removal
 * of list entry.
 *
 * @n always holds the entry preceding @pos, fetched before the body runs,
 * and the step expression moves on from @n rather than from @pos.
 */
#define list_for_each_entry_safe_reverse(pos, n, head, member)                \
        for (pos = list_last_entry(head, typeof(*pos), member),                \
                n = list_prev_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                         \
             pos = n, n = list_prev_entry(n, member))

/**
 * list_safe_reset_next - reset a stale list_for_each_entry_safe loop
 * @pos:        the loop cursor used in the list_for_each_entry_safe loop
 * @n:                temporary storage used in list_for_each_entry_safe
 * @member:        the name of the list_head within the struct.
 *
 * list_safe_reset_next is not safe to use in general if the list may be
 * modified concurrently (eg. the lock is dropped in the loop body). An
 * exception to this is if the cursor element (pos) is pinned in the list,
 * and list_safe_reset_next is called after re-taking the lock and before
 * completing the current iteration of the loop body.
 *
 * Expands to a bare assignment that reloads @n from the current successor
 * of @pos; it is meant to be used as a statement.
 */
#define list_safe_reset_next(pos, n, member)                                \
        n = list_next_entry(pos, member)

/*
 * Doubly linked lists with a single pointer list head.
 * Mostly useful for hash tables where the two pointer list head is
 * too wasteful.
 * You lose the ability to access the tail in O(1).
 */

/* Static initializer for an empty struct hlist_head (first == NULL). */
#define HLIST_HEAD_INIT { .first = NULL }
/* Define a struct hlist_head variable called @name, initialized empty. */
#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
/* Run-time initializer: make the hlist_head that @ptr points to empty. */
#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
/**
 * INIT_HLIST_NODE - put an hlist node into the unlinked state
 * @h: node to initialize
 *
 * Clears both link fields; afterwards hlist_unhashed() is true for @h.
 */
static inline void INIT_HLIST_NODE(struct hlist_node *h)
{
        h->pprev = NULL;
        h->next = NULL;
}

/**
 * hlist_unhashed - Has node been removed from list and reinitialized?
 * @h: Node to be checked
 *
 * Not that not all removal functions will leave a node in unhashed
 * state.  For example, hlist_nulls_del_init_rcu() does leave the
 * node in unhashed state, but hlist_nulls_del() does not.
 */
static inline int hlist_unhashed(const struct hlist_node *h)
{
        return !h->pprev;
}

/**
 * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use
 * @h: Node to be checked
 *
 * This variant of hlist_unhashed() must be used in lockless contexts
 * to avoid potential load-tearing.  The READ_ONCE() is paired with the
 * various WRITE_ONCE() in hlist helpers that are defined below.
 */
static inline int hlist_unhashed_lockless(const struct hlist_node *h)
{
        return !READ_ONCE(h->pprev);
}

/**
 * hlist_empty - Is the specified hlist_head structure an empty hlist?
 * @h: Structure to check.
 */
static inline int hlist_empty(const struct hlist_head *h)
{
        return !READ_ONCE(h->first);
}

static inline void __hlist_del(struct hlist_node *n)
{
        struct hlist_node *next = n->next;
        struct hlist_node **pprev = n->pprev;

        WRITE_ONCE(*pprev, next);
        if (next)
                WRITE_ONCE(next->pprev, pprev);
}

/**
 * hlist_del - Delete the specified hlist_node from its list
 * @n: Node to delete.
 *
 * After unlinking, both link fields of @n are overwritten with the
 * LIST_POISON values, so @n is still reported as hashed by
 * hlist_unhashed().  Use hlist_del_init() or similar instead to unhash @n.
 */
static inline void hlist_del(struct hlist_node *n)
{
        __hlist_del(n);
        n->pprev = LIST_POISON2;
        n->next = LIST_POISON1;
}

/**
 * hlist_del_init - Delete the specified hlist_node from its list and initialize
 * @n: Node to delete.
 *
 * Does nothing when @n is already unhashed.  Otherwise @n is unlinked and
 * re-initialized, which leaves it in unhashed state.
 */
static inline void hlist_del_init(struct hlist_node *n)
{
        if (hlist_unhashed(n))
                return;

        __hlist_del(n);
        INIT_HLIST_NODE(n);
}

/**
 * hlist_add_head - add a new entry at the beginning of the hlist
 * @n: new entry to be added
 * @h: hlist head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
        struct hlist_node *first = h->first;
        WRITE_ONCE(n->next, first);
        if (first)
                WRITE_ONCE(first->pprev, &n->next);
        WRITE_ONCE(h->first, n);
        WRITE_ONCE(n->pprev, &h->first);
}

/**
 * hlist_add_before - add a new entry before the one specified
 * @n: new entry to be added
 * @next: hlist node to add it before, which must be non-NULL
 *
 * @n takes over the slot that used to point at @next, and @next is
 * re-linked behind @n.
 */
static inline void hlist_add_before(struct hlist_node *n,
                                    struct hlist_node *next)
{
        struct hlist_node **slot = next->pprev;

        WRITE_ONCE(n->pprev, slot);
        WRITE_ONCE(n->next, next);
        WRITE_ONCE(next->pprev, &n->next);
        WRITE_ONCE(*slot, n);
}

/**
 * hlist_add_behind - add a new entry after the one specified
 * @n: new entry to be added
 * @prev: hlist node to add it after, which must be non-NULL
 */
static inline void hlist_add_behind(struct hlist_node *n,
                                    struct hlist_node *prev)
{
        WRITE_ONCE(n->next, prev->next);
        WRITE_ONCE(prev->next, n);
        WRITE_ONCE(n->pprev, &prev->next);

        if (n->next)
                WRITE_ONCE(n->next->pprev, &n->next);
}

/**
 * hlist_add_fake - create a fake hlist consisting of a single headless node
 * @n: Node to make a fake list out of
 *
 * Points @n->pprev at @n's own next field, so @n looks like its own
 * predecessor on a list that has no head.  This lets things like
 * hlist_del() work correctly in cases where there is no list.
 */
static inline void hlist_add_fake(struct hlist_node *n)
{
        struct hlist_node **self_slot = &n->next;

        n->pprev = self_slot;
}

/**
 * hlist_fake - Is this node a fake hlist?
 * @h: Node to check for being a self-referential fake hlist.
 *
 * Return: true when @h->pprev points at @h's own next field, which is the
 * state hlist_add_fake() sets up.
 */
static inline bool hlist_fake(struct hlist_node *h)
{
        return &h->next == h->pprev;
}

/**
 * hlist_is_singular_node - is node the only element of the specified hlist?
 * @n: Node to check for singularity.
 * @h: Header for potentially singular list.
 *
 * Check whether the node is the only node of the head without
 * accessing head, thus avoiding unnecessary cache misses.
 */
static inline bool
hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h)
{
        return !n->next && n->pprev == &h->first;
}

/**
 * hlist_move_list - Move an hlist
 * @old: hlist_head for old list.
 * @new: hlist_head for new list.
 *
 * Move a list from one list head to another. Fixup the pprev
 * reference of the first entry if it exists.
 */
static inline void hlist_move_list(struct hlist_head *old,
                                   struct hlist_head *new)
{
        new->first = old->first;
        if (new->first)
                new->first->pprev = &new->first;
        old->first = NULL;
}

/**
 * hlist_splice_init() - move all entries from one list to another
 * @from: hlist_head from which entries will be moved
 * @last: last entry on the @from list
 * @to:   hlist_head to which entries will be moved
 *
 * The entries of @from are placed in front of whatever @to already holds,
 * and @from is left empty.
 *
 * @to can be empty, @from must contain at least @last.
 */
static inline void hlist_splice_init(struct hlist_head *from,
                                     struct hlist_node *last,
                                     struct hlist_head *to)
{
        struct hlist_node *to_first = to->first;

        /* Chain the existing @to entries behind @last. */
        if (to_first != NULL)
                to_first->pprev = &last->next;
        last->next = to_first;

        /* Hang the @from chain off @to, then empty @from. */
        to->first = from->first;
        from->first->pprev = &to->first;
        from->first = NULL;
}

/*
 * hlist_entry - get the structure of @type that embeds the hlist_node @ptr
 * as its field @member. Thin wrapper around container_of().
 */
#define hlist_entry(ptr, type, member) container_of(ptr,type,member)

/*
 * hlist_for_each - walk the raw hlist_node chain of @head.
 * @pos is a struct hlist_node cursor; the loop ends when it becomes NULL.
 * The step reads pos->next after the body has run, so the body must not
 * invalidate @pos (see hlist_for_each_safe()).
 */
#define hlist_for_each(pos, head) \
        for (pos = (head)->first; pos ; pos = pos->next)

/*
 * hlist_for_each_safe - like hlist_for_each(), but the successor is saved
 * in @n before the body runs, so the body may unlink @pos.
 * The statement expression in the condition performs that save and always
 * evaluates to 1; it is only reached when @pos is non-NULL.
 */
#define hlist_for_each_safe(pos, n, head) \
        for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
             pos = n)

/*
 * hlist_entry_safe - NULL-tolerant variant of hlist_entry().
 * @ptr is evaluated exactly once (into a local copy); the result is NULL
 * when @ptr is NULL, otherwise the containing structure.
 */
#define hlist_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
        })

/**
 * hlist_for_each_entry        - iterate over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * The loop ends when @pos becomes NULL, so @pos is NULL after a full
 * traversal.  The step reads (pos)->member.next after the body has run;
 * use hlist_for_each_entry_safe() if the body may remove @pos.
 */
#define hlist_for_each_entry(pos, head, member)                                \
        for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
             pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * @pos must be non-NULL on entry because the initialisation step
 * dereferences it.  The entry it refers to on entry is not visited.
 */
#define hlist_for_each_entry_continue(pos, member)                        \
        for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\
             pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from - iterate over a hlist continuing from current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * There is no initialisation step: the entry @pos refers to on entry is
 * the first one visited, and a NULL @pos makes the loop a no-op.
 */
#define hlist_for_each_entry_from(pos, member)                                \
        for (; pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                a &struct hlist_node to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * The successor node is stored in @n before the body runs and the step
 * advances from @n, so the body may unlink @pos.  Note that @n is a raw
 * hlist_node pointer, not a pointer to the containing type.
 */
#define hlist_for_each_entry_safe(pos, n, head, member)                 \
        for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\
             pos && ({ n = pos->member.next; 1; });                        \
             pos = hlist_entry_safe(n, typeof(*pos), member))

/**
 * hlist_count_nodes - count nodes in the hlist
 * @head:        the head for your hlist.
 */
static inline size_t hlist_count_nodes(struct hlist_head *head)
{
        struct hlist_node *pos;
        size_t count = 0;

        hlist_for_each(pos, head)
                count++;

        return count;
}

#endif



























































































    2 
































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * descriptor table internals; you almost certainly want file.h instead.
 */

#ifndef __LINUX_FDTABLE_H
#define __LINUX_FDTABLE_H

#include <linux/posix_types.h>
#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/nospec.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/fs.h>

#include <linux/atomic.h>

/*
 * The default fd array needs to be at least BITS_PER_LONG,
 * as this is the granularity returned by copy_fdset().
 */
#define NR_OPEN_DEFAULT BITS_PER_LONG
/*
 * All-ones unsigned int. The expression is parenthesized so that the macro
 * expands as a single operand wherever it is used.
 */
#define NR_OPEN_MAX (~0U)

/*
 * A file descriptor table: the fd -> struct file array together with its
 * per-descriptor bitmaps. Reached through files_struct::fdt.
 */
struct fdtable {
        /* Bound used when indexing fd[]; see files_lookup_fd_raw(). */
        unsigned int max_fds;
        struct file __rcu **fd;      /* current fd array */
        /* Bitmap indexed by fd; tested by close_on_exec(). */
        unsigned long *close_on_exec;
        /* Presumably a bitmap of descriptors in use -- TODO confirm against fs/file.c. */
        unsigned long *open_fds;
        /* NOTE(review): looks like a summary bitmap over open_fds -- confirm. */
        unsigned long *full_fds_bits;
        /* Presumably used to defer freeing past an RCU grace period -- confirm. */
        struct rcu_head rcu;
};

/*
 * Open file table structure
 *
 * The layout is deliberate: file_lock is cacheline-aligned on SMP so that the
 * fields from there on sit apart from the read-mostly fields above it.
 */
struct files_struct {
  /*
   * read mostly part
   */
        atomic_t count;
        bool resize_in_progress;
        wait_queue_head_t resize_wait;

        /* Table currently in use; read through files_fdtable(). */
        struct fdtable __rcu *fdt;
        /* Embedded table; presumably what fdt points at initially -- confirm. */
        struct fdtable fdtab;
  /*
   * written part on a separate cache line in SMP
   */
        /* Holding this lock satisfies rcu_dereference_check_fdtable(). */
        spinlock_t file_lock ____cacheline_aligned_in_smp;
        unsigned int next_fd;
        /* Presumably the initial backing storage for fdtab -- TODO confirm. */
        unsigned long close_on_exec_init[1];
        unsigned long open_fds_init[1];
        unsigned long full_fds_bits_init[1];
        struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

struct file_operations;
struct vfsmount;
struct dentry;

/*
 * Dereference the RCU-protected pointer @fdtfd, telling lockdep that
 * holding @files->file_lock is an acceptable alternative protection.
 */
#define rcu_dereference_check_fdtable(files, fdtfd) \
        rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock))

/* Fetch the struct fdtable currently installed in @files. */
#define files_fdtable(files) \
        rcu_dereference_check_fdtable((files), (files)->fdt)

/*
 * Look up the struct file installed at descriptor @fd in @files.
 *
 * The caller must ensure that fd table isn't shared or hold rcu or file lock
 *
 * Returns NULL when @fd is not below the table's max_fds.
 */
static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
{
        struct fdtable *table = rcu_dereference_raw(files->fdt);
        /* Zero for an out-of-bounds fd, all ones for a valid one. */
        unsigned long ok_mask = array_index_mask_nospec(fd, table->max_fds);
        /*
         * 'fd & ok_mask' is 'fd' when in bounds and 0 otherwise.  Reading
         * slot 0 is ok, but the value read still has to be masked before
         * it is returned.
         */
        struct file *unmasked = rcu_dereference_raw(table->fd[fd & ok_mask]);

        return (struct file *)((unsigned long)unmasked & ok_mask);
}

/*
 * Same lookup as files_lookup_fd_raw(), for callers that hold
 * files->file_lock; lockdep warns if the lock is not held.
 */
static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd)
{
        struct file *file;

        RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock),
                           "suspicious rcu_dereference_check() usage");
        file = files_lookup_fd_raw(files, fd);

        return file;
}

struct file *lookup_fdget_rcu(unsigned int fd);
struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd);
struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd);

static inline bool close_on_exec(unsigned int fd, const struct files_struct *files)
{
        return test_bit(fd, files_fdtable(files)->close_on_exec);
}

struct task_struct;

void put_files_struct(struct files_struct *fs);
int unshare_files(void);
struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
void do_close_on_exec(struct files_struct *);
int iterate_fd(struct files_struct *, unsigned,
                int (*)(const void *, struct file *, unsigned),
                const void *);

extern int close_fd(unsigned int fd);
extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
extern struct file *file_close_fd(unsigned int fd);
extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
                      struct files_struct **new_fdp);

extern struct kmem_cache *files_cachep;

#endif /* __LINUX_FDTABLE_H */

































































































































































































    1 
































    1 



    1 

































































































































































    1 







    1 













































































































































































































































































    1 





    1 






































































































































































































































































































































































































































    1 



    1 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
// SPDX-License-Identifier: GPL-2.0

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
#include "tree-checker.h"

/*
 * Describes a tree root block by logical address and level. Embedded in
 * struct tree_mod_elem as old_root for BTRFS_MOD_LOG_ROOT_REPLACE.
 */
struct tree_mod_root {
        u64 logical;
        u8 level;
};

/*
 * One logged tree modification. Elements are linked into the
 * fs_info->tree_mod_log rb-tree, keyed by (logical, seq); see
 * tree_mod_log_insert().
 */
struct tree_mod_elem {
        /* Linkage into the fs_info->tree_mod_log rb-tree. */
        struct rb_node node;
        /* First sort key: logical address the operation is filed under. */
        u64 logical;
        /* Second sort key: sequence number assigned by tree_mod_log_insert(). */
        u64 seq;
        /* Kind of modification this element records. */
        enum btrfs_mod_log_op op;

        /*
         * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
         * operations.
         */
        int slot;

        /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
        u64 generation;

        /* These are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
        struct btrfs_disk_key key;
        u64 blockptr;

        /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
        struct {
                int dst_slot;
                int nr_items;
        } move;

        /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
        struct tree_mod_root old_root;
};

/*
 * Atomically bump fs_info->tree_mod_seq and hand back the new value as the
 * sequence number for our operation.
 */
static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
        const u64 seq = atomic64_inc_return(&fs_info->tree_mod_seq);

        return seq;
}

/*
 * Register @elem as a blocker of the tree mod log, unless it already is one.
 *
 * An @elem whose seq is zero gets a newly pulled sequence number, is appended
 * to the blocker list and BTRFS_FS_TREE_MOD_LOG_USERS is set. An @elem with
 * a non-zero seq is left untouched. Callers that expect to record tree
 * modifications should therefore zero elem->seq before the first call.
 *
 * Returns elem->seq as it stands after the call.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
                           struct btrfs_seq_list *elem)
{
        write_lock(&fs_info->tree_mod_log_lock);
        if (elem->seq)
                goto unlock;

        elem->seq = btrfs_inc_tree_mod_seq(fs_info);
        list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
        set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
unlock:
        write_unlock(&fs_info->tree_mod_log_lock);

        return elem->seq;
}

/*
 * Drop the blocker @elem and prune log entries nobody can need any more.
 *
 * Does nothing if @elem holds no sequence number. Otherwise @elem leaves the
 * blocker list and its seq is reset to zero. If the first remaining blocker
 * has a lower sequence number than the one being dropped, nothing is pruned.
 * Otherwise every log element with a seq below the first remaining blocker's
 * is erased and freed; with no blockers left the bound is BTRFS_SEQ_LAST.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
                            struct btrfs_seq_list *elem)
{
        struct rb_root *log_root = &fs_info->tree_mod_log;
        struct rb_node *cur;
        u64 prune_below = BTRFS_SEQ_LAST;
        const u64 released = elem->seq;

        if (released == 0)
                return;

        write_lock(&fs_info->tree_mod_log_lock);
        list_del(&elem->list);
        elem->seq = 0;

        if (list_empty(&fs_info->tree_mod_seq_list)) {
                clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
        } else {
                struct btrfs_seq_list *first;

                first = list_first_entry(&fs_info->tree_mod_seq_list,
                                         struct btrfs_seq_list, list);
                /*
                 * Blocker with lower sequence number exists, we cannot
                 * remove anything from the log.
                 */
                if (released > first->seq)
                        goto unlock;
                prune_below = first->seq;
        }

        /*
         * Anything that's lower than the lowest existing (read: blocked)
         * sequence number can be removed from the tree.
         */
        cur = rb_first(log_root);
        while (cur) {
                /* Fetch the successor first: cur may be erased below. */
                struct rb_node *next = rb_next(cur);
                struct tree_mod_elem *tm;

                tm = rb_entry(cur, struct tree_mod_elem, node);
                if (tm->seq < prune_below) {
                        rb_erase(cur, log_root);
                        kfree(tm);
                }
                cur = next;
        }
unlock:
        write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * Assign @tm a new sequence number and link it into the tree mod log.
 *
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 *
 * The caller must hold fs_info->tree_mod_log_lock for writing.
 *
 * Returns 0 on success, or -EEXIST if an element with the same logical
 * address and sequence number is already in the log.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
                                        struct tree_mod_elem *tm)
{
        struct rb_root *log_root = &fs_info->tree_mod_log;
        struct rb_node **link = &log_root->rb_node;
        struct rb_node *parent = NULL;

        lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

        tm->seq = btrfs_inc_tree_mod_seq(fs_info);

        while (*link) {
                struct tree_mod_elem *cur;

                parent = *link;
                cur = rb_entry(parent, struct tree_mod_elem, node);

                /* Compare on logical first, fall back to seq on a tie. */
                if (cur->logical != tm->logical) {
                        if (cur->logical < tm->logical)
                                link = &parent->rb_left;
                        else
                                link = &parent->rb_right;
                } else if (cur->seq != tm->seq) {
                        if (cur->seq < tm->seq)
                                link = &parent->rb_left;
                        else
                                link = &parent->rb_right;
                } else {
                        return -EEXIST;
                }
        }

        rb_link_node(&tm->node, parent, link);
        rb_insert_color(&tm->node, log_root);
        return 0;
}

/*
 * Decide whether logging can be skipped for @eb.
 *
 * Returns true, with no lock held, when logging can be omitted: the
 * BTRFS_FS_TREE_MOD_LOG_USERS bit is clear, @eb is a level 0 block, or the
 * blocker list turns out to be empty once the lock is taken.
 *
 * Returns false with fs_info->tree_mod_log_lock held for writing. The caller
 * must keep holding it until all its tree mod log insertions are recorded in
 * the rb tree, and then write unlock it.
 */
static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
        if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) ||
            (eb && btrfs_header_level(eb) == 0))
                return true;

        write_lock(&fs_info->tree_mod_log_lock);
        if (!list_empty(&(fs_info)->tree_mod_seq_list))
                return false;

        write_unlock(&fs_info->tree_mod_log_lock);
        return true;
}

/*
 * Lock-free counterpart of tree_mod_dont_log() with the opposite sense:
 * returns true when BTRFS_FS_TREE_MOD_LOG_USERS is set and @eb is not a
 * level 0 block (a NULL @eb passes the level check).
 */
static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
                                    struct extent_buffer *eb)
{
        if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
                return !(eb && btrfs_header_level(eb) == 0);

        return false;
}

/*
 * Allocate a log element describing operation @op on slot @slot of node @eb.
 *
 * The element records the node's start address, the slot, the operation and
 * the slot's pointer generation. For every operation except
 * BTRFS_MOD_LOG_KEY_ADD the slot's current key and block pointer are
 * captured as well. The element is not linked into the log here.
 *
 * Returns the new element, or NULL if the allocation failed.
 */
static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
                                                 int slot,
                                                 enum btrfs_mod_log_op op)
{
        struct tree_mod_elem *tm = kzalloc(sizeof(*tm), GFP_NOFS);

        if (tm == NULL)
                return NULL;

        RB_CLEAR_NODE(&tm->node);
        tm->op = op;
        tm->slot = slot;
        tm->logical = eb->start;
        tm->generation = btrfs_node_ptr_generation(eb, slot);

        if (op == BTRFS_MOD_LOG_KEY_ADD)
                return tm;

        btrfs_node_key(eb, &tm->key, slot);
        tm->blockptr = btrfs_node_blockptr(eb, slot);

        return tm;
}

int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
                                  enum btrfs_mod_log_op op)
{
        struct tree_mod_elem *tm;
        int ret = 0;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        tm = alloc_tree_mod_elem(eb, slot, op);
        if (!tm)
                ret = -ENOMEM;

        if (tree_mod_dont_log(eb->fs_info, eb)) {
                kfree(tm);
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                return 0;
        } else if (ret != 0) {
                /*
                 * We previously failed to allocate memory and we need to log,
                 * so we have to fail.
                 */
                goto out_unlock;
        }

        ret = tree_mod_log_insert(eb->fs_info, tm);
out_unlock:
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                kfree(tm);

        return ret;
}

/*
 * Allocate a zeroed BTRFS_MOD_LOG_MOVE_KEYS log element (GFP_NOFS) for moving
 * @nr_items entries of @eb from @src_slot to @dst_slot.
 *
 * The source slot is stored in the element's ->slot, the destination slot and
 * the item count in ->move. The rb-node is left cleared (unlinked).
 *
 * Returns the element, or ERR_PTR(-ENOMEM) on allocation failure (never NULL).
 */
static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
                                                     int dst_slot, int src_slot,
                                                     int nr_items)
{
        struct tree_mod_elem *elem = kzalloc(sizeof(*elem), GFP_NOFS);

        if (!elem)
                return ERR_PTR(-ENOMEM);

        RB_CLEAR_NODE(&elem->node);
        elem->op = BTRFS_MOD_LOG_MOVE_KEYS;
        elem->logical = eb->start;
        elem->slot = src_slot;
        elem->move.nr_items = nr_items;
        elem->move.dst_slot = dst_slot;

        return elem;
}

/*
 * Log a move of @nr_items entries inside node @eb from @src_slot to @dst_slot.
 *
 * When the move goes towards the beginning of the buffer (dst_slot < src_slot)
 * the destination slots below @src_slot get overwritten; one
 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING element is logged for each of them,
 * followed by the single BTRFS_MOD_LOG_MOVE_KEYS element.
 *
 * All elements are allocated first and only inserted afterwards. If logging
 * turns out not to be needed (tree_mod_dont_log() returns true), everything
 * allocated is freed and 0 is returned even if an allocation failed.
 *
 * Returns 0 on success or when nothing has to be logged, -ENOMEM on allocation
 * failure while logging is required, or the error from tree_mod_log_insert().
 */
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
                                   int dst_slot, int src_slot,
                                   int nr_items)
{
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int ret = 0;
        int i;
        /* True once we must write_unlock() tree_mod_log_lock before returning. */
        bool locked = false;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        /* Zeroed array, so unused/unallocated entries stay NULL for cleanup. */
        tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list) {
                ret = -ENOMEM;
                goto lock;
        }

        tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
        if (IS_ERR(tm)) {
                ret = PTR_ERR(tm);
                /* Reset so the kfree(tm) in the cleanup path is harmless. */
                tm = NULL;
                goto lock;
        }

        /* One removal element per destination slot that gets overwritten. */
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
                                BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto lock;
                }
        }

lock:
        /*
         * NOTE(review): tree_mod_dont_log() is defined outside this view; the
         * unlock calls below imply it returns false with tree_mod_log_lock
         * write-held - confirm against its definition.
         */
        if (tree_mod_dont_log(eb->fs_info, eb)) {
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                ret = 0;
                goto free_tms;
        }
        locked = true;

        /*
         * We previously failed to allocate memory and we need to log, so we
         * have to fail.
         */
        if (ret != 0)
                goto free_tms;

        /*
         * When we override something during the move, we log these removals.
         * This can only happen when we move towards the beginning of the
         * buffer, i.e. dst_slot < src_slot.
         */
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
                if (ret)
                        goto free_tms;
        }

        /* The move element goes in last, after all removals. */
        ret = tree_mod_log_insert(eb->fs_info, tm);
        if (ret)
                goto free_tms;
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        /* Only the pointer array is freed; the elements now belong to the log. */
        kfree(tm_list);

        return 0;

free_tms:
        if (tm_list) {
                for (i = 0; i < nr_items; i++) {
                        /* Unlink elements that already made it into the log. */
                        if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                                rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
                        kfree(tm_list[i]);
                }
        }
        if (locked)
                write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);
        /* tm is inserted last, so on any failure it is not linked in the tree. */
        kfree(tm);

        return ret;
}

/*
 * Insert the @nritems elements of @tm_list into the tree mod log, walking the
 * array from its last entry down to its first.
 *
 * If an insertion fails, every element inserted by this call is unlinked from
 * the log again and the error is returned; freeing the elements is left to
 * the caller. Returns 0 once all elements are inserted.
 */
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
                                struct tree_mod_elem **tm_list,
                                int nritems)
{
        const int last = nritems - 1;
        int cur;
        int undo;
        int err;

        for (cur = last; cur >= 0; cur--) {
                err = tree_mod_log_insert(fs_info, tm_list[cur]);
                if (!err)
                        continue;

                /* Roll back the entries above @cur that were already linked. */
                for (undo = last; undo > cur; undo--)
                        rb_erase(&tm_list[undo]->node, &fs_info->tree_mod_log);

                return err;
        }

        return 0;
}

/*
 * Log the replacement of root node @old_root by @new_root.
 *
 * If @log_removal is set and @old_root is not a leaf, one
 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING element is logged for every item of
 * @old_root, followed by a BTRFS_MOD_LOG_ROOT_REPLACE element that is keyed on
 * the logical address of @new_root and remembers the logical address, level
 * and generation of @old_root.
 *
 * All elements are allocated up front. If logging turns out not to be needed
 * (tree_mod_dont_log() returns true) they are freed again and 0 is returned,
 * even if an allocation failed.
 *
 * Returns 0 on success or when nothing has to be logged, -ENOMEM on allocation
 * failure while logging is required, or the error from tree_mod_log_insert().
 * On error no element created here is left linked in the log.
 */
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
                                   struct extent_buffer *new_root,
                                   bool log_removal)
{
        struct btrfs_fs_info *fs_info = old_root->fs_info;
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int ret = 0;
        int i;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (log_removal && btrfs_header_level(old_root) > 0) {
                nritems = btrfs_header_nritems(old_root);
                /* Zeroed array, so unallocated entries stay NULL for cleanup. */
                tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
                                  GFP_NOFS);
                if (!tm_list) {
                        ret = -ENOMEM;
                        goto lock;
                }
                for (i = 0; i < nritems; i++) {
                        tm_list[i] = alloc_tree_mod_elem(old_root, i,
                            BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
                        if (!tm_list[i]) {
                                ret = -ENOMEM;
                                goto lock;
                        }
                }
        }

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto lock;
        }

        tm->logical = new_root->start;
        tm->old_root.logical = old_root->start;
        tm->old_root.level = btrfs_header_level(old_root);
        tm->generation = btrfs_header_generation(old_root);
        tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
        /*
         * NOTE(review): tree_mod_dont_log() is defined outside this view; the
         * write_unlock() below implies it returns false with
         * tree_mod_log_lock write-held - confirm against its definition.
         */
        if (tree_mod_dont_log(fs_info, NULL)) {
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                ret = 0;
                goto free_tms;
        } else if (ret != 0) {
                /*
                 * We previously failed to allocate memory and we need to log,
                 * so we have to fail.
                 */
                goto out_unlock;
        }

        if (tm_list)
                ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
        if (!ret) {
                ret = tree_mod_log_insert(fs_info, tm);
                if (ret && tm_list) {
                        /*
                         * The removal elements were all inserted above but the
                         * root replace element was not. Unlink them while we
                         * still hold the lock, otherwise the cleanup below
                         * would free elements that are still linked in the
                         * log's rb-tree.
                         */
                        for (i = 0; i < nritems; i++)
                                rb_erase(&tm_list[i]->node,
                                         &fs_info->tree_mod_log);
                }
        }

out_unlock:
        write_unlock(&fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        /* Only the pointer array is freed; the elements now belong to the log. */
        kfree(tm_list);

        return ret;

free_tms:
        if (tm_list) {
                for (i = 0; i < nritems; i++)
                        kfree(tm_list[i]);
                kfree(tm_list);
        }
        kfree(tm);

        return ret;
}

/*
 * Look up a log element for the tree block at logical address @start whose
 * seq is not lower than @min_seq.
 *
 * With @smallest set, the qualifying element with the lowest seq is wanted and
 * an element whose seq equals @min_seq ends the walk immediately. Otherwise
 * the qualifying element with the highest seq is wanted.
 *
 * The walk runs under the read side of tree_mod_log_lock, which is released
 * before returning. Returns the element found, or NULL if none qualifies.
 */
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
                                                   u64 start, u64 min_seq,
                                                   bool smallest)
{
        struct tree_mod_elem *best = NULL;
        struct rb_node *pos;

        read_lock(&fs_info->tree_mod_log_lock);
        pos = fs_info->tree_mod_log.rb_node;
        while (pos) {
                struct tree_mod_elem *entry;

                entry = rb_entry(pos, struct tree_mod_elem, node);

                /* Entry of another tree block: keep descending towards @start. */
                if (entry->logical != start) {
                        if (entry->logical < start)
                                pos = pos->rb_left;
                        else
                                pos = pos->rb_right;
                        continue;
                }

                /* Right block, but older than the caller cares about. */
                if (entry->seq < min_seq) {
                        pos = pos->rb_left;
                        continue;
                }

                if (!smallest) {
                        /* We want the node with the highest seq */
                        BUG_ON(best && best->seq > entry->seq);
                        best = entry;
                        pos = pos->rb_left;
                        continue;
                }

                /* Exact hit on @min_seq, nothing smaller can qualify. */
                if (entry->seq == min_seq) {
                        best = entry;
                        break;
                }

                /* We want the node with the smallest seq */
                BUG_ON(best && best->seq < entry->seq);
                best = entry;
                pos = pos->rb_right;
        }
        read_unlock(&fs_info->tree_mod_log_lock);

        return best;
}

/*
 * Find the oldest log element for the tree block at @start, i.e. the one with
 * the smallest time sequence value that is still >= @min_seq. Elements with a
 * lower time sequence are ignored. Returns NULL if there is none.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
                                                        u64 start, u64 min_seq)
{
        const bool want_smallest = true;

        return __tree_mod_log_search(fs_info, start, min_seq, want_smallest);
}

/*
 * Find the most recent log element for the tree block at @start, i.e. the one
 * with the largest time sequence value. Elements with a time sequence lower
 * than @min_seq are ignored. Returns NULL if there is none.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
                                                 u64 start, u64 min_seq)
{
        const bool want_smallest = false;

        return __tree_mod_log_search(fs_info, start, min_seq, want_smallest);
}

/*
 * Log the copy of @nr_items entries from @src (starting at slot @src_offset)
 * into @dst (starting at slot @dst_offset).
 *
 * Up to four kinds of elements are logged, in this order:
 *   - a move on @dst from slot @dst_offset to @dst_offset + @nr_items, if @dst
 *     has entries at or after @dst_offset;
 *   - for each copied entry, a BTRFS_MOD_LOG_KEY_REMOVE on @src followed by a
 *     BTRFS_MOD_LOG_KEY_ADD on @dst;
 *   - a move on @src from slot @src_offset + @nr_items to @src_offset, if @src
 *     has entries after the copied range.
 *
 * Nothing is logged when both buffers are leaves (level 0). All elements are
 * allocated first; if logging turns out not to be needed
 * (tree_mod_dont_log() returns true) they are freed and 0 is returned even if
 * an allocation failed.
 *
 * Returns 0 on success or when nothing has to be logged, -ENOMEM on allocation
 * failure while logging is required, or the error from tree_mod_log_insert().
 */
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
                               struct extent_buffer *src,
                               unsigned long dst_offset,
                               unsigned long src_offset,
                               int nr_items)
{
        struct btrfs_fs_info *fs_info = dst->fs_info;
        int ret = 0;
        /* One allocation, split below into the "add" and "remove" halves. */
        struct tree_mod_elem **tm_list = NULL;
        struct tree_mod_elem **tm_list_add = NULL;
        struct tree_mod_elem **tm_list_rem = NULL;
        int i;
        /* True once we must write_unlock() tree_mod_log_lock before returning. */
        bool locked = false;
        struct tree_mod_elem *dst_move_tm = NULL;
        struct tree_mod_elem *src_move_tm = NULL;
        /* Entries of dst from dst_offset on, which have to make room. */
        u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
        /* Entries of src located after the copied range. */
        u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
                return 0;

        tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
                          GFP_NOFS);
        if (!tm_list) {
                ret = -ENOMEM;
                goto lock;
        }

        if (dst_move_nr_items) {
                dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
                                                      dst_offset, dst_move_nr_items);
                if (IS_ERR(dst_move_tm)) {
                        ret = PTR_ERR(dst_move_tm);
                        /* Reset so the cleanup path does not touch an ERR_PTR. */
                        dst_move_tm = NULL;
                        goto lock;
                }
        }
        if (src_move_nr_items) {
                src_move_tm = tree_mod_log_alloc_move(src, src_offset,
                                                      src_offset + nr_items,
                                                      src_move_nr_items);
                if (IS_ERR(src_move_tm)) {
                        ret = PTR_ERR(src_move_tm);
                        /* Reset so the cleanup path does not touch an ERR_PTR. */
                        src_move_tm = NULL;
                        goto lock;
                }
        }

        /* First half of tm_list holds the additions, second half the removals. */
        tm_list_add = tm_list;
        tm_list_rem = tm_list + nr_items;
        for (i = 0; i < nr_items; i++) {
                tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
                                                     BTRFS_MOD_LOG_KEY_REMOVE);
                if (!tm_list_rem[i]) {
                        ret = -ENOMEM;
                        goto lock;
                }

                tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
                                                     BTRFS_MOD_LOG_KEY_ADD);
                if (!tm_list_add[i]) {
                        ret = -ENOMEM;
                        goto lock;
                }
        }

lock:
        /*
         * NOTE(review): tree_mod_dont_log() is defined outside this view; the
         * unlock calls below imply it returns false with tree_mod_log_lock
         * write-held - confirm against its definition.
         */
        if (tree_mod_dont_log(fs_info, NULL)) {
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                ret = 0;
                goto free_tms;
        }
        locked = true;

        /*
         * We previously failed to allocate memory and we need to log, so we
         * have to fail.
         */
        if (ret != 0)
                goto free_tms;

        if (dst_move_tm) {
                ret = tree_mod_log_insert(fs_info, dst_move_tm);
                if (ret)
                        goto free_tms;
        }
        /* Per item: the removal from src is logged before the addition to dst. */
        for (i = 0; i < nr_items; i++) {
                ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
                if (ret)
                        goto free_tms;
                ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
                if (ret)
                        goto free_tms;
        }
        if (src_move_tm) {
                ret = tree_mod_log_insert(fs_info, src_move_tm);
                if (ret)
                        goto free_tms;
        }

        write_unlock(&fs_info->tree_mod_log_lock);
        /* Only the pointer array is freed; the elements now belong to the log. */
        kfree(tm_list);

        return 0;

free_tms:
        /* Unlink whatever already made it into the log before freeing it. */
        if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
                rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
        kfree(dst_move_tm);
        if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
                rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
        kfree(src_move_tm);
        if (tm_list) {
                for (i = 0; i < nr_items * 2; i++) {
                        if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                                rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
                        kfree(tm_list[i]);
                }
        }
        if (locked)
                write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return ret;
}

int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int i;
        int ret = 0;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        nritems = btrfs_header_nritems(eb);
        tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list) {
                ret = -ENOMEM;
                goto lock;
        }

        for (i = 0; i < nritems; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i,
                                    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto lock;
                }
        }

lock:
        if (tree_mod_dont_log(eb->fs_info, eb)) {
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                ret = 0;
                goto free_tms;
        } else if (ret != 0) {
                /*
                 * We previously failed to allocate memory and we need to log,
                 * so we have to fail.
                 */
                goto out_unlock;
        }

        ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
out_unlock:
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return 0;

free_tms:
        if (tm_list) {
                for (i = 0; i < nritems; i++)
                        kfree(tm_list[i]);
                kfree(tm_list);
        }

        return ret;
}

/*
 * Follow the chain of root replacements logged for @eb_root back in time and
 * return the log element describing its oldest predecessor. Log entries older
 * than @time_seq are ignored.
 *
 * Returns NULL if @time_seq is 0 or nothing is logged for @eb_root. Otherwise
 * returns the last BTRFS_MOD_LOG_ROOT_REPLACE element of the chain, or, if
 * the oldest entry logged for @eb_root itself is not a root replacement, that
 * entry.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
                                                      u64 time_seq)
{
        struct btrfs_fs_info *fs_info = eb_root->fs_info;
        struct tree_mod_elem *last_replace = NULL;
        struct tree_mod_elem *cur;
        u64 logical = eb_root->start;

        if (!time_seq)
                return NULL;

        /*
         * The very last operation that's logged for a root is the replacement
         * operation (if it is replaced at all). This has the logical address
         * of the *new* root, making it the very first operation that's logged
         * for this root.
         */
        for (;;) {
                cur = tree_mod_log_search_oldest(fs_info, logical, time_seq);

                /*
                 * Nothing logged for this block. On the first pass that means
                 * there is no history at all (last_replace is still NULL); on
                 * later passes the previous replacement is the answer. An
                 * older root without logged operations should only happen if
                 * that root is at level 0.
                 */
                if (!cur)
                        return last_replace;

                /*
                 * If there's an operation that's not a root replacement, we
                 * found the oldest version of our root. Normally, we'll find a
                 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
                 * Prefer the replacement that led us here, if there was one.
                 */
                if (cur->op != BTRFS_MOD_LOG_ROOT_REPLACE)
                        return last_replace ? last_replace : cur;

                /* Step back to the root that this replacement superseded. */
                last_replace = cur;
                logical = cur->old_root.logical;
        }
}


/*
 * Rewind @eb by undoing logged operations, starting with @first_tm.
 *
 * @first_tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * @time_seq, the end of the log, or an element that belongs to another tree
 * block).
 *
 * The walk is done under the read side of tree_mod_log_lock. The item count in
 * the header of @eb is updated once, after the walk.
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
                                struct extent_buffer *eb,
                                u64 time_seq,
                                struct tree_mod_elem *first_tm)
{
        /* Running item count of eb while operations are being undone. */
        u32 n;
        struct rb_node *next;
        struct tree_mod_elem *tm = first_tm;
        unsigned long o_dst;
        unsigned long o_src;
        unsigned long p_size = sizeof(struct btrfs_key_ptr);
        /*
         * max_slot tracks the maximum valid slot of the rewind eb at every
         * step of the rewind. This is in contrast with 'n' which eventually
         * matches the number of items, but can be wrong during moves or if
         * removes overlap on already valid slots (which is probably separately
         * a bug). We do this to validate the offsets of memmoves for rewinding
         * moves and detect invalid memmoves.
         *
         * Since a rewind eb can start empty, max_slot is a signed integer with
         * a special meaning for -1, which is that no slot is valid to move out
         * of. Any other negative value is invalid.
         */
        int max_slot;
        int move_src_end_slot;
        int move_dst_end_slot;

        n = btrfs_header_nritems(eb);
        max_slot = n - 1;
        read_lock(&fs_info->tree_mod_log_lock);
        while (tm && tm->seq >= time_seq) {
                ASSERT(max_slot >= -1);
                /*
                 * All the operations are recorded with the operator used for
                 * the modification. As we're going backwards, we do the
                 * opposite of each operation here.
                 */
                switch (tm->op) {
                case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
                        fallthrough;
                case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case BTRFS_MOD_LOG_KEY_REMOVE:
                        /* Undo a removal: put the logged pointer back. */
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        n++;
                        if (tm->slot > max_slot)
                                max_slot = tm->slot;
                        break;
                case BTRFS_MOD_LOG_KEY_REPLACE:
                        /* Undo a replacement: restore the logged pointer. */
                        BUG_ON(tm->slot >= n);
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        break;
                case BTRFS_MOD_LOG_KEY_ADD:
                        /*
                         * It is possible we could have already removed keys
                         * behind the known max slot, so this will be an
                         * overestimate. In practice, the copy operation
                         * inserts them in increasing order, and overestimating
                         * just means we miss some warnings, so it's OK. It
                         * isn't worth carefully tracking the full array of
                         * valid slots to check against when moving.
                         */
                        if (tm->slot == max_slot)
                                max_slot--;
                        /* if a move operation is needed it's in the log */
                        n--;
                        break;
                case BTRFS_MOD_LOG_MOVE_KEYS:
                        /*
                         * Undo a move: tm->slot is the slot the entries were
                         * moved from and tm->move.dst_slot the slot they were
                         * moved to, so copy them back from dst_slot to slot.
                         */
                        ASSERT(tm->move.nr_items > 0);
                        move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
                        move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
                        o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
                        o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
                        /* Only warns; the memmove below is performed regardless. */
                        if (WARN_ON(move_src_end_slot > max_slot ||
                                    tm->move.nr_items <= 0)) {
                                btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
                                           eb->start, tm->slot,
                                           tm->move.dst_slot, tm->move.nr_items,
                                           tm->seq, n, max_slot);
                        }
                        memmove_extent_buffer(eb, o_dst, o_src,
                                              tm->move.nr_items * p_size);
                        max_slot = move_dst_end_slot;
                        break;
                case BTRFS_MOD_LOG_ROOT_REPLACE:
                        /*
                         * This operation is special. For roots, this must be
                         * handled explicitly before rewinding.
                         * For non-roots, this operation may exist if the node
                         * was a root: root A -> child B; then A gets empty and
                         * B is promoted to the new root. In the mod log, we'll
                         * have a root-replace operation for B, a tree block
                         * that is no root. We simply ignore that operation.
                         */
                        break;
                }
                /*
                 * Step to the next element of the log; stop at the end of the
                 * tree or as soon as the element belongs to another block.
                 */
                next = rb_next(&tm->node);
                if (!next)
                        break;
                tm = rb_entry(next, struct tree_mod_elem, node);
                if (tm->logical != first_tm->logical)
                        break;
        }
        read_unlock(&fs_info->tree_mod_log_lock);
        btrfs_set_header_nritems(eb, n);
}

/*
 * Return a version of @eb rewound to @time_seq.
 *
 * Must be called with @eb read locked. @eb itself is returned, untouched and
 * still locked, when no rewind applies: @time_seq is 0, @eb is a leaf, or the
 * log has no entry for it. Otherwise a private buffer is built (an empty dummy
 * buffer if the most recent log entry is a
 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, a clone of @eb in all other cases),
 * @eb is unlocked and its reference dropped, and the private buffer is
 * returned read locked with the logged operations undone.
 *
 * Returns NULL if the private buffer could not be allocated; @eb has been
 * unlocked and released in that case as well. @path is not used.
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
                                                struct btrfs_path *path,
                                                struct extent_buffer *eb,
                                                u64 time_seq)
{
        struct extent_buffer *rewound;
        struct tree_mod_elem *newest;

        if (!time_seq || btrfs_header_level(eb) == 0)
                return eb;

        newest = tree_mod_log_search(fs_info, eb->start, time_seq);
        if (!newest)
                return eb;

        if (newest->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                rewound = btrfs_clone_extent_buffer(eb);
        } else {
                BUG_ON(newest->slot != 0);
                rewound = alloc_dummy_extent_buffer(fs_info, eb->start);
                if (rewound) {
                        /* Start from an empty node carrying eb's identity. */
                        btrfs_set_header_bytenr(rewound, eb->start);
                        btrfs_set_header_backref_rev(rewound,
                                                     btrfs_header_backref_rev(eb));
                        btrfs_set_header_owner(rewound, btrfs_header_owner(eb));
                        btrfs_set_header_level(rewound, btrfs_header_level(eb));
                }
        }

        /* The input buffer is given up whether or not we got a private copy. */
        btrfs_tree_read_unlock(eb);
        free_extent_buffer(eb);

        if (!rewound)
                return NULL;

        btrfs_set_buffer_lockdep_class(btrfs_header_owner(rewound),
                                       rewound, btrfs_header_level(rewound));
        btrfs_tree_read_lock(rewound);
        tree_mod_log_rewind(fs_info, rewound, time_seq, newest);
        WARN_ON(btrfs_header_nritems(rewound) >
                BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return rewound;
}

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 *
 * Depending on what the log holds, the fresh buffer is one of:
 *   - a clone of the old root block read from disk (the root was replaced and
 *     the most recent entry for the old root is not a
 *     BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
 *   - an empty dummy buffer at the old root's logical address (the root was
 *     replaced, any other case);
 *   - a clone of the current root node (the root was not replaced).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct tree_mod_elem *tm;
        struct extent_buffer *eb = NULL;
        struct extent_buffer *eb_root;
        u64 eb_root_owner = 0;
        struct extent_buffer *old;
        /* Non-NULL only if the log says the root was replaced. */
        struct tree_mod_root *old_root = NULL;
        u64 old_generation = 0;
        u64 logical;
        int level;

        eb_root = btrfs_read_lock_root_node(root);
        tm = tree_mod_log_oldest_root(eb_root, time_seq);
        /* Nothing logged: the current root, still read locked, is the answer. */
        if (!tm)
                return eb_root;

        if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
                old_root = &tm->old_root;
                old_generation = tm->generation;
                logical = old_root->logical;
                level = old_root->level;
        } else {
                logical = eb_root->start;
                level = btrfs_header_level(eb_root);
        }

        /* Most recent entry for the block we are going to rewind. */
        tm = tree_mod_log_search(fs_info, logical, time_seq);
        if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                struct btrfs_tree_parent_check check = { 0 };

                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);

                check.level = level;
                check.owner_root = btrfs_root_id(root);

                old = read_tree_block(fs_info, logical, &check);
                if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
                        /* eb stays NULL, so NULL is returned further down. */
                        if (!IS_ERR(old))
                                free_extent_buffer(old);
                        btrfs_warn(fs_info,
                                   "failed to read tree block %llu from get_old_root",
                                   logical);
                } else {
                        struct tree_mod_elem *tm2;

                        btrfs_tree_read_lock(old);
                        eb = btrfs_clone_extent_buffer(old);
                        /*
                         * After the lookup for the most recent tree mod operation
                         * above and before we locked and cloned the extent buffer
                         * 'old', a new tree mod log operation may have been added.
                         * So lookup for a more recent one to make sure the number
                         * of mod log operations we replay is consistent with the
                         * number of items we have in the cloned extent buffer,
                         * otherwise we can hit a BUG_ON when rewinding the extent
                         * buffer.
                         */
                        tm2 = tree_mod_log_search(fs_info, logical, time_seq);
                        btrfs_tree_read_unlock(old);
                        free_extent_buffer(old);
                        ASSERT(tm2);
                        ASSERT(tm2 == tm || tm2->seq > tm->seq);
                        /* Runtime fallback for builds where ASSERT() is a no-op. */
                        if (!tm2 || tm2->seq < tm->seq) {
                                free_extent_buffer(eb);
                                return NULL;
                        }
                        tm = tm2;
                }
        } else if (old_root) {
                /* Remember the owner before eb_root is released. */
                eb_root_owner = btrfs_header_owner(eb_root);
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                eb = alloc_dummy_extent_buffer(fs_info, logical);
        } else {
                eb = btrfs_clone_extent_buffer(eb_root);
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
        }

        /* Read, clone or allocation failure; eb_root was already released. */
        if (!eb)
                return NULL;
        if (old_root) {
                /*
                 * NOTE(review): eb_root_owner is only assigned in the dummy
                 * buffer branch above; when eb was cloned from the block read
                 * from disk it is still 0 here, so the clone's owner is
                 * overwritten with 0 - confirm this is intended.
                 */
                btrfs_set_header_bytenr(eb, eb->start);
                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
                btrfs_set_header_owner(eb, eb_root_owner);
                btrfs_set_header_level(eb, old_root->level);
                btrfs_set_header_generation(eb, old_generation);
        }
        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
                                       btrfs_header_level(eb));
        btrfs_tree_read_lock(eb);
        if (tm)
                tree_mod_log_rewind(fs_info, eb, time_seq, tm);
        else
                /* No operations logged for the old root: expected only for a leaf. */
                WARN_ON(btrfs_header_level(eb) != 0);
        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb;
}

/*
 * Report the level the root node of @root had at sequence number @time_seq.
 *
 * When tree_mod_log_oldest_root() yields a root replacement entry, the level
 * recorded in that entry is used; in every other case the level is read from
 * the header of the current root node.
 */
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
        struct extent_buffer *root_eb = btrfs_root_node(root);
        struct tree_mod_elem *oldest;
        int lvl;

        oldest = tree_mod_log_oldest_root(root_eb, time_seq);
        if (!oldest || oldest->op != BTRFS_MOD_LOG_ROOT_REPLACE)
                lvl = btrfs_header_level(root_eb);
        else
                lvl = oldest->old_root.level;

        /* Release the buffer handed out by btrfs_root_node(). */
        free_extent_buffer(root_eb);

        return lvl;
}

/*
 * Get the lowest sequence number in use by a tree modification log user.
 *
 * With the tree_mod_log_lock held for reading, the first entry of
 * fs_info->tree_mod_seq_list is looked at and its sequence number is
 * returned. An empty list means there are no users, in which case 0 is
 * returned.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
        struct list_head *users = &fs_info->tree_mod_seq_list;
        struct btrfs_seq_list *first;
        u64 lowest = 0;

        read_lock(&fs_info->tree_mod_log_lock);
        if (!list_empty(users)) {
                first = list_first_entry(users, struct btrfs_seq_list, list);
                lowest = first->seq;
        }
        read_unlock(&fs_info->tree_mod_log_lock);

        return lowest;
}






























































































































































































    1 


    1 























    1 







    1 







































































































































    1 







    1 














    1 

















    1 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "accessors.h"
#include "dir-item.h"

/*
 * Reserve room for a new directory entry, chaining onto an existing item
 * when the key (name hash) is already present.
 *
 * An empty item of @data_size bytes is inserted at @cpu_key. If an item with
 * that key already exists, it is searched for @name: an exact match yields
 * ERR_PTR(-EEXIST), otherwise the existing item is extended by @data_size
 * bytes. Any other insertion error is returned as an ERR_PTR.
 *
 * On success the returned pointer addresses the last @data_size bytes of the
 * item. Nothing is written there; the caller fills in the dir item and the
 * name itself.
 */
static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle *trans,
                                                   struct btrfs_root *root,
                                                   struct btrfs_path *path,
                                                   struct btrfs_key *cpu_key,
                                                   u32 data_size,
                                                   const char *name,
                                                   int name_len)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *eb;
        char *item_start;
        u32 item_size;
        int err;

        err = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
        if (err < 0 && err != -EEXIST)
                return ERR_PTR(err);

        if (err == -EEXIST) {
                /* Same key: either a true duplicate or a hash collision. */
                if (btrfs_match_dir_item_name(fs_info, path, name, name_len))
                        return ERR_PTR(-EEXIST);
                btrfs_extend_item(trans, path, data_size);
        }
        WARN_ON(err > 0);

        eb = path->nodes[0];
        item_start = btrfs_item_ptr(eb, path->slots[0], char);
        item_size = btrfs_item_size(eb, path->slots[0]);
        ASSERT(data_size <= item_size);

        /* The new entry occupies the tail of the (possibly extended) item. */
        return (struct btrfs_dir_item *)(item_start + (item_size - data_size));
}

/*
 * Insert an xattr item for @objectid into @root.
 *
 * xattrs reuse the directory item layout: a dir item header followed by the
 * name and then the value. The item is keyed by the hash of @name and its
 * location key is all zeroes.
 *
 * Returns 0 on success, -ENOSPC if name plus value exceed
 * BTRFS_MAX_XATTR_SIZE(), or the error reported by insert_with_overflow()
 * (including -EEXIST when the name is already present).
 */
int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct btrfs_path *path, u64 objectid,
                            const char *name, u16 name_len,
                            const void *data, u16 data_len)
{
        struct btrfs_key xattr_key;
        struct btrfs_key zero_location;
        struct btrfs_disk_key disk_location;
        struct btrfs_dir_item *di;
        struct extent_buffer *eb;
        unsigned long name_off;
        unsigned long value_off;
        u32 item_len;

        if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info))
                return -ENOSPC;

        xattr_key.objectid = objectid;
        xattr_key.type = BTRFS_XATTR_ITEM_KEY;
        xattr_key.offset = btrfs_name_hash(name, name_len);

        item_len = sizeof(*di) + name_len + data_len;
        di = insert_with_overflow(trans, root, path, &xattr_key, item_len,
                                  name, name_len);
        if (IS_ERR(di))
                return PTR_ERR(di);

        /* xattr entries carry an empty location key. */
        memset(&zero_location, 0, sizeof(zero_location));
        btrfs_cpu_key_to_disk(&disk_location, &zero_location);

        eb = path->nodes[0];
        btrfs_set_dir_item_key(eb, di, &disk_location);
        btrfs_set_dir_flags(eb, di, BTRFS_FT_XATTR);
        btrfs_set_dir_name_len(eb, di, name_len);
        btrfs_set_dir_transid(eb, di, trans->transid);
        btrfs_set_dir_data_len(eb, di, data_len);

        /* The name follows the header directly, the value follows the name. */
        name_off = (unsigned long)(di + 1);
        value_off = name_off + name_len;

        write_extent_buffer(eb, name, name_off, name_len);
        write_extent_buffer(eb, data, value_off, data_len);
        btrfs_mark_buffer_dirty(trans, eb);

        return 0;
}

/*
 * Insert a directory entry for @name into directory @dir.
 *
 * A dir item keyed by the name hash is inserted first. Unless @dir lives in
 * the tree root, a dir index entry for @index is then queued through
 * btrfs_insert_delayed_dir_index(). @location is the key stored in the entry
 * and @type is the type of the inode it points to.
 *
 * If the dir item already exists (-EEXIST) the index insertion is still
 * attempted, but -EEXIST is what gets returned. For the tree root no index is
 * inserted and 0 is returned even when the item already existed.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, otherwise the
 * error from the dir item insertion or, if that one succeeded, the error from
 * the index insertion.
 */
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
                          const struct fscrypt_str *name, struct btrfs_inode *dir,
                          struct btrfs_key *location, u8 type, u64 index)
{
        struct btrfs_root *root = dir->root;
        struct btrfs_disk_key disk_location;
        struct btrfs_dir_item *di;
        struct btrfs_path *path;
        struct extent_buffer *eb;
        struct btrfs_key key;
        u32 item_len;
        int item_err = 0;
        int index_err = 0;

        key.objectid = btrfs_ino(dir);
        key.type = BTRFS_DIR_ITEM_KEY;
        key.offset = btrfs_name_hash(name->name, name->len);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        btrfs_cpu_key_to_disk(&disk_location, location);

        item_len = sizeof(*di) + name->len;
        di = insert_with_overflow(trans, root, path, &key, item_len,
                                  name->name, name->len);
        if (IS_ERR(di)) {
                item_err = PTR_ERR(di);
                /* An existing item does not stop the index insertion. */
                if (item_err != -EEXIST)
                        goto out_free;
        } else {
                if (IS_ENCRYPTED(&dir->vfs_inode))
                        type |= BTRFS_FT_ENCRYPTED;

                eb = path->nodes[0];
                btrfs_set_dir_item_key(eb, di, &disk_location);
                btrfs_set_dir_flags(eb, di, type);
                btrfs_set_dir_data_len(eb, di, 0);
                btrfs_set_dir_name_len(eb, di, name->len);
                btrfs_set_dir_transid(eb, di, trans->transid);

                /* The name is stored right behind the dir item header. */
                write_extent_buffer(eb, name->name, (unsigned long)(di + 1),
                                    name->len);
                btrfs_mark_buffer_dirty(trans, eb);
        }

        /* FIXME, use some real flag for selecting the extra index */
        if (root == root->fs_info->tree_root) {
                item_err = 0;
                goto out_free;
        }
        btrfs_release_path(path);

        index_err = btrfs_insert_delayed_dir_index(trans, name->name, name->len,
                                                   dir, &disk_location, type,
                                                   index);
out_free:
        btrfs_free_path(path);
        return item_err ? item_err : index_err;
}

/*
 * Search @root for @key and, if an item with exactly that key exists, look
 * for @name inside it.
 *
 * @mod selects the search mode: 0 searches without COW, any other value
 * requests COW, and a negative value additionally passes an insertion
 * length of -1 to btrfs_search_slot().
 *
 * Returns ERR_PTR(-ENOENT) when no item has that key, an ERR_PTR for a search
 * error, otherwise the result of btrfs_match_dir_item_name() (NULL when the
 * item does not contain @name).
 */
static struct btrfs_dir_item *btrfs_lookup_match_dir(
                        struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct btrfs_path *path,
                        struct btrfs_key *key, const char *name,
                        int name_len, int mod)
{
        int ins_len = 0;
        int cow = 0;
        int err;

        if (mod < 0)
                ins_len = -1;
        if (mod != 0)
                cow = 1;

        err = btrfs_search_slot(trans, root, key, path, ins_len, cow);
        if (err > 0)
                return ERR_PTR(-ENOENT);
        if (err < 0)
                return ERR_PTR(err);

        return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
}

/*
 * Look up a directory item by name.
 *
 * @trans:        The transaction handle to use. Can be NULL if @mod is 0.
 * @root:        The root of the target tree.
 * @path:        Path to use for the search.
 * @dir:        The inode number (objectid) of the directory.
 * @name:        The name (and its length) of the directory entry to find.
 * @mod:        0 for a read only lookup, 1 for a lookup that will modify the
 *                item and -1 for a lookup that will delete it.
 *
 * Returns: NULL if no dir item exists for the name, an error pointer if an
 * error happened, or a pointer to the matching dir item.
 */
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
                                             struct btrfs_root *root,
                                             struct btrfs_path *path, u64 dir,
                                             const struct fscrypt_str *name,
                                             int mod)
{
        struct btrfs_dir_item *found;
        struct btrfs_key search_key;

        search_key.objectid = dir;
        search_key.type = BTRFS_DIR_ITEM_KEY;
        search_key.offset = btrfs_name_hash(name->name, name->len);

        found = btrfs_lookup_match_dir(trans, root, path, &search_key,
                                       name->name, name->len, mod);

        /* A missing key is reported as "not found" rather than as an error. */
        if (found == ERR_PTR(-ENOENT))
                return NULL;

        return found;
}

/*
 * Check whether a dir item for @name could be added to directory @dir.
 *
 * Returns 0 if there is no item for the name hash yet, or if one exists that
 * lacks @name and still has room for it. Returns -EEXIST if the exact name is
 * already present, -EOVERFLOW if adding the name to the existing item would
 * exceed the leaf data size, -ENOMEM if no path could be allocated, or
 * another negative error from the lookup.
 */
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
                                   const struct fscrypt_str *name)
{
        struct btrfs_dir_item *match;
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = dir;
        key.type = BTRFS_DIR_ITEM_KEY;
        key.offset = btrfs_name_hash(name->name, name->len);

        match = btrfs_lookup_match_dir(NULL, root, path, &key, name->name,
                                       name->len, 0);
        if (IS_ERR(match)) {
                ret = PTR_ERR(match);
                /* No item with this hash at all, so nothing can collide. */
                if (ret == -ENOENT)
                        ret = 0;
        } else if (match) {
                /* The item already holds exactly this name. */
                ret = -EEXIST;
        } else {
                /* Hash collision: would one more entry still fit the item? */
                const int needed = sizeof(*match) + name->len;
                struct extent_buffer *eb = path->nodes[0];
                const int slot = path->slots[0];

                if (needed + btrfs_item_size(eb, slot) +
                    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info))
                        ret = -EOVERFLOW;
                else
                        ret = 0;
        }

        btrfs_free_path(path);
        return ret;
}

/*
 * Look up a directory index item by index number and name.
 *
 * @trans:        The transaction handle to use. Can be NULL if @mod is 0.
 * @root:        The root of the target tree.
 * @path:        Path to use for the search.
 * @dir:        The inode number (objectid) of the directory.
 * @index:        The index number, used as the key offset.
 * @name:        The name (and its length) of the directory entry to find.
 * @mod:        0 for a read only lookup, 1 for a lookup that will modify the
 *                item and -1 for a lookup that will delete it.
 *
 * Returns: NULL if no dir index item with that index number and name exists,
 * an error pointer if an error happened, or a pointer to the matching dir
 * item.
 */
struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct btrfs_path *path, u64 dir,
                            u64 index, const struct fscrypt_str *name, int mod)
{
        struct btrfs_key search_key;
        struct btrfs_dir_item *found;

        search_key.objectid = dir;
        search_key.type = BTRFS_DIR_INDEX_KEY;
        search_key.offset = index;

        found = btrfs_lookup_match_dir(trans, root, path, &search_key,
                                       name->name, name->len, mod);

        /* A missing key is reported as "not found" rather than as an error. */
        if (IS_ERR(found) && PTR_ERR(found) == -ENOENT)
                return NULL;

        return found;
}

/*
 * Find the dir index item of directory @dirid that holds @name, without
 * knowing its index number.
 *
 * Slots are walked starting at the lowest possible dir index key of @dirid
 * and the walk stops at the first key that is not a dir index key of that
 * directory.
 *
 * Returns the matching dir item, NULL (as ERR_PTR(0)) when the walk ends
 * without a match, or ERR_PTR() of the negative value the iteration left in
 * its return code.
 */
struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path,
                            u64 dirid, const struct fscrypt_str *name)
{
        struct btrfs_dir_item *match;
        struct btrfs_key key;
        int ret;

        key.objectid = dirid;
        key.type = BTRFS_DIR_INDEX_KEY;
        key.offset = 0;

        btrfs_for_each_slot(root, &key, &key, path, ret) {
                /* Left the range of dir index keys of this directory. */
                if (!(key.objectid == dirid && key.type == BTRFS_DIR_INDEX_KEY))
                        break;

                match = btrfs_match_dir_item_name(root->fs_info, path,
                                                  name->name, name->len);
                if (match)
                        return match;
        }

        /* A positive value is not an error; report it as "not found". */
        return ERR_PTR(ret > 0 ? 0 : ret);
}

/*
 * Look up the xattr named @name of the inode with objectid @dir.
 *
 * The search key is the xattr item key built from the hash of @name. @mod is
 * handed to btrfs_lookup_match_dir() unchanged.
 *
 * Returns NULL if the xattr does not exist, an error pointer if the lookup
 * failed, or a pointer to the matching item.
 */
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path, u64 dir,
                                          const char *name, u16 name_len,
                                          int mod)
{
        struct btrfs_dir_item *found;
        struct btrfs_key search_key;

        search_key.objectid = dir;
        search_key.type = BTRFS_XATTR_ITEM_KEY;
        search_key.offset = btrfs_name_hash(name, name_len);

        found = btrfs_lookup_match_dir(trans, root, path, &search_key, name,
                                       name_len, mod);

        /* A missing key is reported as "not found" rather than as an error. */
        if (found == ERR_PTR(-ENOENT))
                return NULL;

        return found;
}

/*
 * Scan the item that @path points to for an entry named @name.
 *
 * One item can hold several entries back to back, each made of a dir item
 * header followed by its name and its data. The entries are visited in order
 * and the first one whose name has length @name_len and the same bytes as
 * @name is returned. NULL is returned when no entry matches.
 */
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
                                                 struct btrfs_path *path,
                                                 const char *name, int name_len)
{
        struct extent_buffer *eb = path->nodes[0];
        struct btrfs_dir_item *entry;
        u32 item_len;
        u32 walked;

        entry = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
        item_len = btrfs_item_size(eb, path->slots[0]);

        for (walked = 0; walked < item_len; ) {
                const u32 entry_len = sizeof(*entry) +
                                      btrfs_dir_name_len(eb, entry) +
                                      btrfs_dir_data_len(eb, entry);
                /* The name begins right behind the header. */
                const unsigned long entry_name = (unsigned long)(entry + 1);

                if (btrfs_dir_name_len(eb, entry) == name_len &&
                    memcmp_extent_buffer(eb, name, entry_name, name_len) == 0)
                        return entry;

                /* Step over this entry to the next one in the same item. */
                walked += entry_len;
                entry = (struct btrfs_dir_item *)((char *)entry + entry_len);
        }

        return NULL;
}

/*
 * Delete the single directory entry @di from the item @path points to.
 *
 * If @di is the only entry, the whole item is deleted and the result of
 * btrfs_del_item() is returned. Otherwise the bytes following @di are moved
 * down over it, the item is truncated by the size of the entry and 0 is
 * returned.
 */
int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct btrfs_path *path,
                              struct btrfs_dir_item *di)
{
        struct extent_buffer *eb = path->nodes[0];
        const u32 entry_len = sizeof(*di) + btrfs_dir_name_len(eb, di) +
                              btrfs_dir_data_len(eb, di);
        const u32 item_len = btrfs_item_size(eb, path->slots[0]);
        unsigned long entry_off;
        unsigned long item_off;

        if (entry_len == item_len)
                return btrfs_del_item(trans, root, path);

        entry_off = (unsigned long)di;
        item_off = btrfs_item_ptr_offset(eb, path->slots[0]);

        /* Close the gap: shift everything behind the entry onto it. */
        memmove_extent_buffer(eb, entry_off, entry_off + entry_len,
                              item_len - (entry_off + entry_len - item_off));
        btrfs_truncate_item(trans, path, item_len - entry_len, 1);

        return 0;
}




















    1 

















    1 













































































































































    1 














    1 




















    1 


















    1 











    1 

















    1 




    1 

    1 

























    1 


    1 



























    1 

















    1 







    1 







    1 












    1 
    1 












    1 























    1 







    1 


























    1 










    1 














    1 







    1 






    1 








    1 

    1 






























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 *  Directory handling functions for NTFS-based filesystems.
 *
 */

#include <linux/fs.h>
#include <linux/nls.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"

/*
 * ntfs_utf16_to_nls - Convert little endian UTF-16 to NLS string.
 * @sbi:        Superblock info; sbi->options->nls selects the target charset,
 *                NULL means UTF-8.
 * @name:        Little endian UTF-16 input.
 * @len:        Number of UTF-16 code units in @name.
 * @buf:        Output buffer.
 * @buf_len:        Size of @buf in bytes, including the terminating NUL.
 *
 * The output is always NUL terminated inside @buf. One byte of @buf is held
 * back for the terminator, so at most @buf_len - 1 bytes of converted text
 * are stored. In the NLS case a code unit that cannot be converted is
 * replaced by '_' (reported once per call), and conversion stops with a
 * warning once fewer than NLS_MAX_CHARSET_SIZE bytes remain.
 *
 * Return: Number of bytes stored in @buf, not counting the terminating NUL.
 * 0 if @buf_len leaves no room for anything.
 */
int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len,
                      u8 *buf, int buf_len)
{
        int ret, warn;
        u8 *op;
        struct nls_table *nls = sbi->options->nls;

        static_assert(sizeof(wchar_t) == sizeof(__le16));

        /* Not even the terminating NUL fits; do not touch the buffer. */
        if (buf_len <= 0)
                return 0;

        if (!nls) {
                /*
                 * UTF-16 -> UTF-8.
                 * utf16s_to_utf8s() may use every byte it is offered, so
                 * offer one less to keep the NUL below inside the buffer.
                 */
                ret = utf16s_to_utf8s((wchar_t *)name, len, UTF16_LITTLE_ENDIAN,
                                      buf, buf_len - 1);
                buf[ret] = '\0';
                return ret;
        }

        op = buf;
        warn = 0;

        while (len--) {
                u16 ec;
                int charlen;
                char dump[5];

                if (buf_len < NLS_MAX_CHARSET_SIZE) {
                        ntfs_warn(sbi->sb,
                                  "filename was truncated while converting.");
                        break;
                }

                ec = le16_to_cpu(*name++);
                /* Bound by buf_len - 1 so that the final NUL always fits. */
                charlen = nls->uni2char(ec, op, buf_len - 1);

                if (charlen > 0) {
                        op += charlen;
                        buf_len -= charlen;
                        continue;
                }

                /* Not representable in the target charset. */
                *op++ = '_';
                buf_len -= 1;
                if (warn)
                        continue;

                /* Log the offending code unit as four hex digits, once. */
                warn = 1;
                hex_byte_pack(&dump[0], ec >> 8);
                hex_byte_pack(&dump[2], ec);
                dump[4] = 0;

                ntfs_err(sbi->sb, "failed to convert \"%s\" to %s", dump,
                         nls->charset);
        }

        *op = '\0';
        return op - buf;
}

// clang-format off
/* Code points at or above this value are emitted as two UTF-16 units. */
#define PLANE_SIZE        0x00010000

/* Base value OR-ed into both units of such a pair. */
#define SURROGATE_PAIR        0x0000d800
/* Additional bit OR-ed into the second unit of the pair. */
#define SURROGATE_LOW        0x00000400
/* Mask for the 10 payload bits carried by each unit of the pair. */
#define SURROGATE_BITS        0x000003ff
// clang-format on

/*
 * put_utf16 - Store one UTF-16 code unit with the requested byte order.
 *
 * Modified version of put_utf16 from fs/nls/nls_base.c that is free of
 * sparse warnings. @c is stored at @s as little endian or big endian when
 * @endian asks for it; for any other @endian value it is stored as is.
 */
static inline void put_utf16(wchar_t *s, unsigned int c,
                             enum utf16_endian endian)
{
        static_assert(sizeof(wchar_t) == sizeof(__le16));
        static_assert(sizeof(wchar_t) == sizeof(__be16));

        if (endian == UTF16_LITTLE_ENDIAN)
                *(__le16 *)s = __cpu_to_le16(c);
        else if (endian == UTF16_BIG_ENDIAN)
                *(__be16 *)s = __cpu_to_be16(c);
        else
                *s = (wchar_t)c;
}

/*
 * _utf8s_to_utf16s - Convert a UTF-8 string to UTF-16.
 *
 * Modified version of 'utf8s_to_utf16s' that checks the remaining output
 * space before every store, so -ENAMETOOLONG is detected without writing
 * past @maxout code units.
 *
 * Conversion stops after @inlen input bytes or at the first NUL byte.
 *
 * Return: Number of UTF-16 code units written to @pwcs, -EINVAL if the input
 * is not valid UTF-8, -ENAMETOOLONG if the output does not fit.
 */
static int _utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian,
                            wchar_t *pwcs, int maxout)
{
        u16 *out = pwcs;
        unicode_t cp;
        int used;

        while (inlen > 0 && *s) {
                if (!(*s & 0x80)) {
                        /* Plain ASCII byte: copied through as one unit. */
                        if (maxout < 1)
                                return -ENAMETOOLONG;

                        put_utf16(out++, *s++, endian);
                        inlen--;
                        maxout--;
                        continue;
                }

                /* Multi-byte sequence: decode it to a code point first. */
                used = utf8_to_utf32(s, inlen, &cp);
                if (used < 0)
                        return -EINVAL;
                s += used;
                inlen -= used;

                if (cp < PLANE_SIZE) {
                        if (maxout < 1)
                                return -ENAMETOOLONG;

                        put_utf16(out++, cp, endian);
                        maxout--;
                        continue;
                }

                /* Needs a surrogate pair, i.e. two output units. */
                if (maxout < 2)
                        return -ENAMETOOLONG;

                cp -= PLANE_SIZE;
                put_utf16(out++,
                          SURROGATE_PAIR | ((cp >> 10) & SURROGATE_BITS),
                          endian);
                put_utf16(out++,
                          SURROGATE_PAIR | SURROGATE_LOW |
                                  (cp & SURROGATE_BITS),
                          endian);
                maxout -= 2;
        }

        return out - pwcs;
}

/*
 * ntfs_nls_to_utf16 - Convert input string to UTF-16.
 * @sbi:        Superblock info; sbi->options->nls selects the source charset,
 *                NULL means the input is UTF-8.
 * @name:        Input name.
 * @name_len:        Input name length in bytes.
 * @uni:        Destination string; uni->name receives the code units and
 *                uni->len the returned value.
 * @max_ulen:        Maximum number of UTF-16 code units to store in @uni.
 * @endian:        Endian of target UTF-16 string.
 *
 * This function is called:
 * - to create NTFS name
 * - to create symlink
 *
 * Return: UTF-16 string length or error (if negative).
 */
int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len,
                      struct cpu_str *uni, u32 max_ulen,
                      enum utf16_endian endian)
{
        struct nls_table *nls = sbi->options->nls;
        u16 *uname = uni->name;
        const u8 *end;
        int ret, slen;

        static_assert(sizeof(wchar_t) == sizeof(u16));

        if (!nls) {
                /* utf8 -> utf16 */
                ret = _utf8s_to_utf16s(name, name_len, endian, uname, max_ulen);
                uni->len = ret;
                return ret;
        }

        /* Convert one input character per iteration, in CPU byte order. */
        end = name + name_len;
        ret = 0;
        while (name < end) {
                if (ret >= max_ulen)
                        return -ENAMETOOLONG;

                slen = nls->char2uni(name, end - name, uname + ret);
                if (slen < 0)
                        return slen;
                if (!slen)
                        return -EINVAL;

                name += slen;
                ret++;
        }

        /* Swap in place when the target byte order differs from the CPU's. */
#ifdef __BIG_ENDIAN
        if (endian == UTF16_LITTLE_ENDIAN) {
                int i;

                for (i = 0; i < ret; i++)
                        __cpu_to_le16s(uname + i);
        }
#else
        if (endian == UTF16_BIG_ENDIAN) {
                int i;

                for (i = 0; i < ret; i++)
                        __cpu_to_be16s(uname + i);
        }
#endif

        uni->len = ret;
        return ret;
}

/*
 * dir_search_u - Find the inode named @uni in directory @dir.
 *
 * @fnd may be NULL, in which case a temporary one is obtained with fnd_get()
 * and released again before returning.
 *
 * Return: NULL if the name is not in the directory, an error pointer if the
 * index lookup failed, no fnd could be obtained (-ENOMEM) or the inode turned
 * out to be bad (-EINVAL), otherwise whatever ntfs_iget5() returned.
 */
struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni,
                           struct ntfs_fnd *fnd)
{
        struct super_block *sb = dir->i_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        struct ntfs_inode *ni = ntfs_i(dir);
        struct ntfs_fnd *own_fnd = NULL;
        struct inode *found = NULL;
        struct NTFS_DE *e;
        int diff;
        int err;

        if (!fnd) {
                own_fnd = fnd_get();
                if (!own_fnd) {
                        err = -ENOMEM;
                        goto out;
                }
                fnd = own_fnd;
        }

        err = indx_find(&ni->dir, ni, NULL, uni, 0, sbi, &diff, &e, fnd);
        /* A non-zero diff means no entry matched the name exactly. */
        if (!err && diff)
                err = -ENOENT;
        if (err)
                goto out;

        found = ntfs_iget5(sb, &e->ref, uni);
        if (!IS_ERR(found) && is_bad_inode(found)) {
                iput(found);
                err = -EINVAL;
        }
out:
        fnd_put(own_fnd);

        if (err == -ENOENT)
                return NULL;
        if (err)
                return ERR_PTR(err);
        return found;
}

/*
 * ntfs_filldir - Emit one directory entry to the readdir context.
 *
 * Entries are silently skipped (return 0) when they are DOS names, do not
 * refer back to this directory, are the root record, are meta files while
 * 'showmeta' is off, are hidden while 'nohidden' is on, or when the name
 * cannot be converted. @name is the scratch buffer for the converted name
 * and is treated as PATH_MAX bytes long.
 *
 * Return: 0 to keep enumerating, non-zero when dir_emit() refused the entry.
 */
static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
                               const struct NTFS_DE *e, u8 *name,
                               struct dir_context *ctx)
{
        /* The file name attribute follows the directory entry header. */
        const struct ATTR_FILE_NAME *fname = Add2Ptr(e, sizeof(struct NTFS_DE));
        struct inode *inode;
        unsigned long ino;
        u32 dt_type;
        int nls_len;

        if (fname->type == FILE_NAME_DOS || !mi_is_ref(&ni->mi, &fname->home))
                return 0;

        ino = ino_get(&e->ref);
        if (ino == MFT_REC_ROOT)
                return 0;

        /* Skip meta files. Unless option to show metafiles is set. */
        if (!sbi->options->showmeta && ntfs_is_meta_file(sbi, ino))
                return 0;

        if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN))
                return 0;

        nls_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name,
                                    PATH_MAX);
        if (nls_len <= 0) {
                ntfs_warn(sbi->sb, "failed to convert name for inode %lx.",
                          ino);
                return 0;
        }

        /*
         * NTFS symlinks are "dir + reparse" or "file + reparse", and the
         * reparse attribute serves many other purposes as well, so the
         * attributes duplicated in the parent directory cannot tell whether
         * this name is a symlink. Start from the dir/regular distinction.
         */
        if (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY)
                dt_type = DT_DIR;
        else
                dt_type = DT_REG;

        /*
         * For reparse points the only reliable source of the type is the
         * inode itself, so open it. This costs extra locks and reads just to
         * get the type; it might deserve a mount option of its own.
         */
        if ((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) &&
            ino != ni->mi.rno) {
                inode = ntfs_iget5(sbi->sb, &e->ref, NULL);
                if (!IS_ERR_OR_NULL(inode)) {
                        dt_type = fs_umode_to_dtype(inode->i_mode);
                        iput(inode);
                }
        }

        return !dir_emit(ctx, (s8 *)name, nls_len, ino, dt_type);
}

/*
 * ntfs_read_hdr - Helper function for ntfs_readdir().
 *
 * Walks the directory entries stored behind @hdr and submits each one to
 * ntfs_filldir(). @vbo is added to an entry's offset within @hdr to form its
 * directory position; entries positioned before @pos are skipped, and
 * ctx->pos is set to the position of every entry that is submitted.
 *
 * Return: 0 when the last entry is reached, -1 when an entry does not fit
 * inside the used part of @hdr or its key is too small, otherwise the
 * non-zero value returned by ntfs_filldir().
 */
static int ntfs_read_hdr(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
                         const struct INDEX_HDR *hdr, u64 vbo, u64 pos,
                         u8 *name, struct dir_context *ctx)
{
        const u32 end = le32_to_cpu(hdr->used);
        u32 off = le32_to_cpu(hdr->de_off);
        const struct NTFS_DE *e;
        u32 e_size;
        int err;

        while (1) {
                /* The entry header must lie inside the used area. */
                if (off + sizeof(struct NTFS_DE) > end)
                        return -1;

                e = Add2Ptr(hdr, off);
                e_size = le16_to_cpu(e->size);
                if (e_size < sizeof(struct NTFS_DE) || off + e_size > end)
                        return -1;

                if (de_is_last(e))
                        return 0;

                /* Entries before @pos have already been enumerated. */
                if (vbo + off >= pos) {
                        if (le16_to_cpu(e->key_size) < SIZEOF_ATTRIBUTE_FILENAME)
                                return -1;

                        ctx->pos = vbo + off;

                        /* Submit the name to the filldir callback. */
                        err = ntfs_filldir(sbi, ni, e, name, ctx);
                        if (err)
                                return err;
                }

                off += e_size;
        }
}

/*
 * ntfs_readdir - file_operations::iterate_shared
 *
 * Use non sorted enumeration.
 * We have an example of broken volume where sorted enumeration
 * counts each name twice.
 */
static int ntfs_readdir(struct file *file, struct dir_context *ctx)
{
        const struct INDEX_ROOT *root;
        u64 vbo;
        size_t bit;
        loff_t eod;
        int err = 0;
        struct inode *dir = file_inode(file);
        struct ntfs_inode *ni = ntfs_i(dir);
        struct super_block *sb = dir->i_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        loff_t i_size = i_size_read(dir);
        u32 pos = ctx->pos;
        u8 *name = NULL;
        struct indx_node *node = NULL;
        u8 index_bits = ni->dir.index_bits;

        /* Name is a buffer of PATH_MAX length. */
        static_assert(NTFS_NAME_LEN * 4 < PATH_MAX);

        eod = i_size + sbi->record_size;

        if (pos >= eod)
                return 0;

        if (!dir_emit_dots(file, ctx))
                return 0;

        /* Allocate PATH_MAX bytes. */
        name = __getname();
        if (!name)
                return -ENOMEM;

        if (!ni->mi_loaded && ni->attr_list.size) {
                /*
                 * Directory inode is locked for read.
                 * Load all subrecords to avoid 'write' access to 'ni' during
                 * directory reading.
                 */
                ni_lock(ni);
                if (!ni->mi_loaded && ni->attr_list.size) {
                        err = ni_load_all_mi(ni);
                        if (!err)
                                ni->mi_loaded = true;
                }
                ni_unlock(ni);
                if (err)
                        goto out;
        }

        root = indx_get_root(&ni->dir, ni, NULL, NULL);
        if (!root) {
                err = -EINVAL;
                goto out;
        }

        if (pos >= sbi->record_size) {
                bit = (pos - sbi->record_size) >> index_bits;
        } else {
                err = ntfs_read_hdr(sbi, ni, &root->ihdr, 0, pos, name, ctx);
                if (err)
                        goto out;
                bit = 0;
        }

        if (!i_size) {
                ctx->pos = eod;
                goto out;
        }

        for (;;) {
                vbo = (u64)bit << index_bits;
                if (vbo >= i_size) {
                        ctx->pos = eod;
                        goto out;
                }

                err = indx_used_bit(&ni->dir, ni, &bit);
                if (err)
                        goto out;

                if (bit == MINUS_ONE_T) {
                        ctx->pos = eod;
                        goto out;
                }

                vbo = (u64)bit << index_bits;
                if (vbo >= i_size) {
                        ntfs_inode_err(dir, "Looks like your dir is corrupt");
                        ctx->pos = eod;
                        err = -EINVAL;
                        goto out;
                }

                err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits,
                                &node);
                if (err)
                        goto out;

                err = ntfs_read_hdr(sbi, ni, &node->index->ihdr,
                                    vbo + sbi->record_size, pos, name, ctx);
                if (err)
                        goto out;

                bit += 1;
        }

out:

        __putname(name);
        put_indx_node(node);

        if (err == -ENOENT) {
                err = 0;
                ctx->pos = pos;
        }

        return err;
}

/*
 * ntfs_dir_count - Count the directories and files listed in @dir.
 *
 * Scans the index root first and then every used index block. Entries
 * without a file name and entries of type FILE_NAME_DOS are ignored.
 *
 * @is_empty: optional; set to true up front and to false on the first
 *            counted entry. If neither @dirs nor @files is requested the
 *            scan stops at that point.
 * @dirs:     optional; receives the number of entries with
 *            FILE_ATTRIBUTE_DIRECTORY set.
 * @files:    optional; receives the number of all other counted entries.
 *
 * Return: 0 on success, -EINVAL if the index root cannot be found (in which
 * case @dirs and @files are not written), or the error returned by
 * indx_used_bit() / indx_read().
 */
static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
                          size_t *files)
{
        struct ntfs_inode *ni = ntfs_i(dir);
        const size_t nblocks = i_size_read(&ni->vfs_inode) >>
                               ni->dir.index_bits;
        struct indx_node *node = NULL;
        struct INDEX_ROOT *root;
        struct INDEX_HDR *hdr;
        size_t ndirs = 0, nfiles = 0, bit = 0;
        int err = 0;

        if (is_empty)
                *is_empty = true;

        root = indx_get_root(&ni->dir, ni, NULL, NULL);
        if (!root)
                return -EINVAL;

        hdr = &root->ihdr;

        while (1) {
                const u32 used = le32_to_cpu(hdr->used);
                u32 cur = le32_to_cpu(hdr->de_off);

                while (cur + sizeof(struct NTFS_DE) <= used) {
                        struct NTFS_DE *de = Add2Ptr(hdr, cur);
                        const u32 de_size = le16_to_cpu(de->size);
                        const struct ATTR_FILE_NAME *fname;

                        /*
                         * A record that does not fit looks like corruption;
                         * either that or the terminator ends this header.
                         */
                        if (de_size < sizeof(struct NTFS_DE) ||
                            cur + de_size > used || de_is_last(de))
                                break;

                        /* Step to the next record before any 'continue'. */
                        cur += de_size;

                        fname = de_get_fname(de);
                        if (!fname || fname->type == FILE_NAME_DOS)
                                continue;

                        if (is_empty) {
                                *is_empty = false;
                                /* Caller only wanted the emptiness answer. */
                                if (!dirs && !files)
                                        goto out;
                        }

                        if (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY)
                                ndirs += 1;
                        else
                                nfiles += 1;
                }

                if (bit >= nblocks)
                        goto out;

                /* Move 'bit' to the next index block that is in use. */
                err = indx_used_bit(&ni->dir, ni, &bit);
                if (err)
                        goto out;

                if (bit == MINUS_ONE_T || bit >= nblocks)
                        goto out;

                err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits,
                                &node);
                if (err)
                        goto out;

                hdr = &node->index->ihdr;
                bit += 1;
        }

out:
        put_indx_node(node);
        if (dirs)
                *dirs = ndirs;
        if (files)
                *files = nfiles;

        return err;
}

/*
 * dir_is_empty - Report the 'is_empty' result of ntfs_dir_count() for @dir.
 *
 * No counters are requested, and the error code of ntfs_dir_count() is
 * ignored; only the flag it filled in is returned.
 */
bool dir_is_empty(struct inode *dir)
{
        bool empty = false;

        (void)ntfs_dir_count(dir, &empty, NULL, NULL);
        return empty;
}

// clang-format off
/*
 * File operations installed for directories. Enumeration goes through
 * ntfs_readdir() via ->iterate_shared; the compat ioctl handler is only
 * present when CONFIG_COMPAT is enabled.
 */
const struct file_operations ntfs_dir_operations = {
        .llseek                = generic_file_llseek,
        .read                = generic_read_dir,
        .iterate_shared        = ntfs_readdir,
        .fsync                = generic_file_fsync,
        .open                = ntfs_file_open,
        .unlocked_ioctl = ntfs_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = ntfs_compat_ioctl,
#endif
};

/*
 * Reduced directory operations table: same seek/read/readdir/open handlers
 * as ntfs_dir_operations, but without fsync and without the ioctl handlers.
 */
const struct file_operations ntfs_legacy_dir_operations = {
        .llseek                = generic_file_llseek,
        .read                = generic_read_dir,
        .iterate_shared        = ntfs_readdir,
        .open                = ntfs_file_open,
};
// clang-format on

























































































































































































































































































































































    1 










    1 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 
    2 































    2 











    1 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) STRATO AG 2012.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "async-thread.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
#include "block-group.h"
#include "fs.h"
#include "accessors.h"
#include "scrub.h"

/*
 * Device replace overview
 *
 * [Objective]
 * To copy all extents (both new and on-disk) from source device to target
 * device, while still keeping the filesystem read-write.
 *
 * [Method]
 * There are two main methods involved:
 *
 * - Write duplication
 *
 *   All new writes will be written to both target and source devices, so even
 *   if replace gets canceled, the source device still contains up-to-date
 *   data.
 *
 *   Location:                handle_ops_on_dev_replace() from btrfs_map_block()
 *   Start:                btrfs_dev_replace_start()
 *   End:                btrfs_dev_replace_finishing()
 *   Content:                Latest data/metadata
 *
 * - Copy existing extents
 *
 *   This happens by re-using scrub facility, as scrub also iterates through
 *   existing extents from commit root.
 *
 *   Location:                scrub_write_block_to_dev_replace() from
 *                           scrub_block_complete()
 *   Content:                Data/meta from commit root.
 *
 * Due to the content difference, we need to avoid nocow write when dev-replace
 * is happening.  This is done by marking the block group read-only and waiting
 * for NOCOW writes.
 *
 * After replace is done, the finishing part is done by swapping the target and
 * source devices.
 *
 *   Location:                btrfs_dev_replace_update_device_in_mapping_tree() from
 *                           btrfs_dev_replace_finishing()
 */

static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                                       int scrub_ret);
static int btrfs_dev_replace_kthread(void *data);

/*
 * Load the persistent device replace state into fs_info->dev_replace.
 *
 * Searches fs_info->dev_root for the BTRFS_DEV_REPLACE_KEY item. If the
 * search does not return 0, or the item has an unexpected size, the in-memory
 * state is reset to "never started". Otherwise the item's fields are copied
 * and, for a started or suspended replace, the source and target devices are
 * looked up and the target device is set up from the source device.
 *
 * Return: 0 on success (also when there is no dev root), -ENOMEM if no path
 * can be allocated, -EUCLEAN if a device with BTRFS_DEV_REPLACE_DEVID exists
 * although no replace is active, -EIO if a replace is started/suspended, the
 * source or target device is missing and the DEGRADED option is not set.
 */
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
        struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
        struct btrfs_key key;
        struct btrfs_root *dev_root = fs_info->dev_root;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        struct extent_buffer *eb;
        int slot;
        int ret = 0;
        struct btrfs_path *path = NULL;
        int item_size;
        struct btrfs_dev_replace_item *ptr;
        u64 src_devid;

        if (!dev_root)
                return 0;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        key.objectid = 0;
        key.type = BTRFS_DEV_REPLACE_KEY;
        key.offset = 0;
        ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
        /* Any non-zero result (not found or error) takes the reset path. */
        if (ret) {
                /* Also entered by goto when the item has the wrong size. */
no_valid_dev_replace_entry_found:
                /*
                 * We don't have a replace item or it's corrupted.  If there is
                 * a replace target, fail the mount.
                 */
                if (btrfs_find_device(fs_info->fs_devices, &args)) {
                        btrfs_err(fs_info,
                        "found replace target device without a valid replace item");
                        ret = -EUCLEAN;
                        goto out;
                }
                /* The search result is dropped here; this path reports success. */
                ret = 0;
                dev_replace->replace_state =
                        BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
                dev_replace->cont_reading_from_srcdev_mode =
                    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
                dev_replace->time_started = 0;
                dev_replace->time_stopped = 0;
                atomic64_set(&dev_replace->num_write_errors, 0);
                atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
                dev_replace->cursor_left = 0;
                dev_replace->committed_cursor_left = 0;
                dev_replace->cursor_left_last_write_of_item = 0;
                dev_replace->cursor_right = 0;
                dev_replace->srcdev = NULL;
                dev_replace->tgtdev = NULL;
                dev_replace->is_valid = 0;
                dev_replace->item_needs_writeback = 0;
                goto out;
        }
        slot = path->slots[0];
        eb = path->nodes[0];
        item_size = btrfs_item_size(eb, slot);
        ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);

        if (item_size != sizeof(struct btrfs_dev_replace_item)) {
                btrfs_warn(fs_info,
                        "dev_replace entry found has unexpected size, ignore entry");
                goto no_valid_dev_replace_entry_found;
        }

        /* Copy the on-disk item into the in-memory replace state. */
        src_devid = btrfs_dev_replace_src_devid(eb, ptr);
        dev_replace->cont_reading_from_srcdev_mode =
                btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
        dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
        dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
        dev_replace->time_stopped =
                btrfs_dev_replace_time_stopped(eb, ptr);
        atomic64_set(&dev_replace->num_write_errors,
                     btrfs_dev_replace_num_write_errors(eb, ptr));
        atomic64_set(&dev_replace->num_uncorrectable_read_errors,
                     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
        dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
        dev_replace->committed_cursor_left = dev_replace->cursor_left;
        dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
        dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
        dev_replace->is_valid = 1;

        dev_replace->item_needs_writeback = 0;
        switch (dev_replace->replace_state) {
        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
                /*
                 * We don't have an active replace item but if there is a
                 * replace target, fail the mount.
                 */
                if (btrfs_find_device(fs_info->fs_devices, &args)) {
                        btrfs_err(fs_info,
"replace without active item, run 'device scan --forget' on the target device");
                        ret = -EUCLEAN;
                } else {
                        dev_replace->srcdev = NULL;
                        dev_replace->tgtdev = NULL;
                }
                break;
        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
                /* args.devid is still BTRFS_DEV_REPLACE_DEVID for the target. */
                dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
                args.devid = src_devid;
                dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);

                /*
                 * allow 'btrfs dev replace_cancel' if src/tgt device is
                 * missing
                 */
                if (!dev_replace->srcdev &&
                    !btrfs_test_opt(fs_info, DEGRADED)) {
                        ret = -EIO;
                        btrfs_warn(fs_info,
                           "cannot mount because device replace operation is ongoing and");
                        btrfs_warn(fs_info,
                           "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
                           src_devid);
                }
                if (!dev_replace->tgtdev &&
                    !btrfs_test_opt(fs_info, DEGRADED)) {
                        ret = -EIO;
                        btrfs_warn(fs_info,
                           "cannot mount because device replace operation is ongoing and");
                        btrfs_warn(fs_info,
                           "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
                                BTRFS_DEV_REPLACE_DEVID);
                }
                if (dev_replace->tgtdev) {
                        /* Mirror the size accounting of the source device. */
                        if (dev_replace->srcdev) {
                                dev_replace->tgtdev->total_bytes =
                                        dev_replace->srcdev->total_bytes;
                                dev_replace->tgtdev->disk_total_bytes =
                                        dev_replace->srcdev->disk_total_bytes;
                                dev_replace->tgtdev->commit_total_bytes =
                                        dev_replace->srcdev->commit_total_bytes;
                                dev_replace->tgtdev->bytes_used =
                                        dev_replace->srcdev->bytes_used;
                                dev_replace->tgtdev->commit_bytes_used =
                                        dev_replace->srcdev->commit_bytes_used;
                        }
                        set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                &dev_replace->tgtdev->dev_state);

                        WARN_ON(fs_info->fs_devices->rw_devices == 0);
                        dev_replace->tgtdev->io_width = fs_info->sectorsize;
                        dev_replace->tgtdev->io_align = fs_info->sectorsize;
                        dev_replace->tgtdev->sector_size = fs_info->sectorsize;
                        dev_replace->tgtdev->fs_info = fs_info;
                        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                &dev_replace->tgtdev->dev_state);
                }
                break;
        }

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Initialize a new device for device replace target from a given source dev
 * and path.
 *
 * Return 0 and new device in @device_out, otherwise return < 0
 */
static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
                                  const char *device_path,
                                  struct btrfs_device *srcdev,
                                  struct btrfs_device **device_out)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
        struct file *bdev_file;
        struct block_device *bdev;
        u64 devid = BTRFS_DEV_REPLACE_DEVID;
        int ret = 0;

        *device_out = NULL;
        if (srcdev->fs_devices->seeding) {
                btrfs_err(fs_info, "the filesystem is a seed filesystem!");
                return -EINVAL;
        }

        bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
                                        fs_info->bdev_holder, NULL);
        if (IS_ERR(bdev_file)) {
                btrfs_err(fs_info, "target device %s is invalid!", device_path);
                return PTR_ERR(bdev_file);
        }
        bdev = file_bdev(bdev_file);

        if (!btrfs_check_device_zone_type(fs_info, bdev)) {
                btrfs_err(fs_info,
                "dev-replace: zoned type of target device mismatch with filesystem");
                ret = -EINVAL;
                goto error;
        }

        sync_blockdev(bdev);

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (device->bdev == bdev) {
                        btrfs_err(fs_info,
                                  "target device is in the filesystem!");
                        ret = -EEXIST;
                        goto error;
                }
        }


        if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
                btrfs_err(fs_info,
                          "target device is smaller than source device!");
                ret = -EINVAL;
                goto error;
        }


        device = btrfs_alloc_device(NULL, &devid, NULL, device_path);
        if (IS_ERR(device)) {
                ret = PTR_ERR(device);
                goto error;
        }

        ret = lookup_bdev(device_path, &device->devt);
        if (ret)
                goto error;

        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        device->generation = 0;
        device->io_width = fs_info->sectorsize;
        device->io_align = fs_info->sectorsize;
        device->sector_size = fs_info->sectorsize;
        device->total_bytes = btrfs_device_get_total_bytes(srcdev);
        device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
        device->bytes_used = btrfs_device_get_bytes_used(srcdev);
        device->commit_total_bytes = srcdev->commit_total_bytes;
        device->commit_bytes_used = device->bytes_used;
        device->fs_info = fs_info;
        device->bdev = bdev;
        device->bdev_file = bdev_file;
        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
        device->dev_stats_valid = 1;
        set_blocksize(bdev_file, BTRFS_BDEV_BLOCKSIZE);
        device->fs_devices = fs_devices;

        ret = btrfs_get_dev_zone_info(device, false);
        if (ret)
                goto error;

        mutex_lock(&fs_devices->device_list_mutex);
        list_add(&device->dev_list, &fs_devices->devices);
        fs_devices->num_devices++;
        fs_devices->open_devices++;
        mutex_unlock(&fs_devices->device_list_mutex);

        *device_out = device;
        return 0;

error:
        fput(bdev_file);
        return ret;
}

/*
 * called from commit_transaction. Writes changed device replace state to
 * disk.
 */
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret;
        struct btrfs_root *dev_root = fs_info->dev_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *eb;
        struct btrfs_dev_replace_item *ptr;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

        down_read(&dev_replace->rwsem);
        if (!dev_replace->is_valid ||
            !dev_replace->item_needs_writeback) {
                up_read(&dev_replace->rwsem);
                return 0;
        }
        up_read(&dev_replace->rwsem);

        key.objectid = 0;
        key.type = BTRFS_DEV_REPLACE_KEY;
        key.offset = 0;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }
        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
        if (ret < 0) {
                btrfs_warn(fs_info,
                           "error %d while searching for dev_replace item!",
                           ret);
                goto out;
        }

        if (ret == 0 &&
            btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
                /*
                 * need to delete old one and insert a new one.
                 * Since no attempt is made to recover any old state, if the
                 * dev_replace state is 'running', the data on the target
                 * drive is lost.
                 * It would be possible to recover the state: just make sure
                 * that the beginning of the item is never changed and always
                 * contains all the essential information. Then read this
                 * minimal set of information and use it as a base for the
                 * new state.
                 */
                ret = btrfs_del_item(trans, dev_root, path);
                if (ret != 0) {
                        btrfs_warn(fs_info,
                                   "delete too small dev_replace item failed %d!",
                                   ret);
                        goto out;
                }
                ret = 1;
        }

        if (ret == 1) {
                /* need to insert a new item */
                btrfs_release_path(path);
                ret = btrfs_insert_empty_item(trans, dev_root, path,
                                              &key, sizeof(*ptr));
                if (ret < 0) {
                        btrfs_warn(fs_info,
                                   "insert dev_replace item failed %d!", ret);
                        goto out;
                }
        }

        eb = path->nodes[0];
        ptr = btrfs_item_ptr(eb, path->slots[0],
                             struct btrfs_dev_replace_item);

        down_write(&dev_replace->rwsem);
        if (dev_replace->srcdev)
                btrfs_set_dev_replace_src_devid(eb, ptr,
                        dev_replace->srcdev->devid);
        else
                btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
        btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
                dev_replace->cont_reading_from_srcdev_mode);
        btrfs_set_dev_replace_replace_state(eb, ptr,
                dev_replace->replace_state);
        btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
        btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
        btrfs_set_dev_replace_num_write_errors(eb, ptr,
                atomic64_read(&dev_replace->num_write_errors));
        btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
                atomic64_read(&dev_replace->num_uncorrectable_read_errors));
        dev_replace->cursor_left_last_write_of_item =
                dev_replace->cursor_left;
        btrfs_set_dev_replace_cursor_left(eb, ptr,
                dev_replace->cursor_left_last_write_of_item);
        btrfs_set_dev_replace_cursor_right(eb, ptr,
                dev_replace->cursor_right);
        dev_replace->item_needs_writeback = 0;
        up_write(&dev_replace->rwsem);

        btrfs_mark_buffer_dirty(trans, eb);

out:
        btrfs_free_path(path);

        return ret;
}

/*
 * Set BLOCK_GROUP_FLAG_TO_COPY on every block group that has a device extent
 * on @src_dev.
 *
 * Only acts on zoned filesystems; returns 0 right away otherwise. Before the
 * scan, a running transaction with a non-empty dev_update_list is committed
 * (repeatedly, until none is left). The scan itself walks fs_info->dev_root
 * through its commit root without tree locking, under fs_info->chunk_mutex.
 *
 * Returns 0 on success, or a negative errno from attaching to / committing
 * the transaction, -ENOMEM if the path cannot be allocated, or the error
 * reported by the slot iteration.
 */
static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
                                    struct btrfs_device *src_dev)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_dev_extent *dev_extent = NULL;
        struct btrfs_block_group *cache;
        struct btrfs_trans_handle *trans;
        int iter_ret = 0;
        int ret = 0;
        u64 chunk_offset;

        /* Do not use "to_copy" on non zoned filesystem for now */
        if (!btrfs_is_zoned(fs_info))
                return 0;

        mutex_lock(&fs_info->chunk_mutex);

        /* Ensure we don't have pending new block group */
        spin_lock(&fs_info->trans_lock);
        while (fs_info->running_transaction &&
               !list_empty(&fs_info->running_transaction->dev_update_list)) {
                /*
                 * Both locks are dropped around the attach/commit and
                 * retaken afterwards: chunk_mutex first, then trans_lock
                 * right before the loop condition is evaluated again.
                 */
                spin_unlock(&fs_info->trans_lock);
                mutex_unlock(&fs_info->chunk_mutex);
                trans = btrfs_attach_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        mutex_lock(&fs_info->chunk_mutex);
                        /*
                         * -ENOENT is not treated as a failure here: just
                         * re-check the loop condition. Any other error
                         * aborts with chunk_mutex held for the unlock label.
                         */
                        if (ret == -ENOENT) {
                                spin_lock(&fs_info->trans_lock);
                                continue;
                        } else {
                                goto unlock;
                        }
                }

                ret = btrfs_commit_transaction(trans);
                mutex_lock(&fs_info->chunk_mutex);
                if (ret)
                        goto unlock;

                spin_lock(&fs_info->trans_lock);
        }
        spin_unlock(&fs_info->trans_lock);

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto unlock;
        }

        /* Read ahead forward, search the commit root, take no tree locks. */
        path->reada = READA_FORWARD;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        /* Start at the first possible DEV_EXTENT item of the source device. */
        key.objectid = src_dev->devid;
        key.type = BTRFS_DEV_EXTENT_KEY;
        key.offset = 0;

        btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
                struct extent_buffer *leaf = path->nodes[0];

                /* Stop as soon as an item is not a dev extent of @src_dev. */
                if (found_key.objectid != src_dev->devid)
                        break;

                if (found_key.type != BTRFS_DEV_EXTENT_KEY)
                        break;

                if (found_key.offset < key.offset)
                        break;

                dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);

                chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);

                /* No block group found for this chunk offset: skip it. */
                cache = btrfs_lookup_block_group(fs_info, chunk_offset);
                if (!cache)
                        continue;

                set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
                /* Drop the reference taken by the lookup above. */
                btrfs_put_block_group(cache);
        }
        /* A negative iter_ret means the iteration itself failed. */
        if (iter_ret < 0)
                ret = iter_ret;

        btrfs_free_path(path);
unlock:
        mutex_unlock(&fs_info->chunk_mutex);

        return ret;
}

/*
 * Called when the stripe at @physical of block group @cache has been handled
 * for @srcdev.
 *
 * Returns true when the block group needs no further "to_copy" tracking:
 * the filesystem is not zoned, the block group is flagged as removed, or the
 * stripe check below finds nothing left on @srcdev (in which case
 * BLOCK_GROUP_FLAG_TO_COPY is cleared). Returns false while more stripes of
 * this block group on @srcdev remain.
 */
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
                                      struct btrfs_block_group *cache,
                                      u64 physical)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        const u64 logical = cache->start;
        struct btrfs_chunk_map *map;
        int stripes_on_src = 0;
        int matched_stripe = 0;
        bool removed;
        int idx;

        /* Do not use "to_copy" on non zoned filesystem for now */
        if (!btrfs_is_zoned(fs_info))
                return true;

        /* Sample the REMOVED flag under the block group lock. */
        spin_lock(&cache->lock);
        removed = test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags);
        spin_unlock(&cache->lock);
        if (removed)
                return true;

        map = btrfs_get_chunk_map(fs_info, logical, 1);
        ASSERT(!IS_ERR(map));

        /*
         * Count the stripes that live on the source device and remember the
         * stripe index whose physical address matches @physical.
         */
        for (idx = 0; idx < map->num_stripes; idx++) {
                if (map->stripes[idx].dev == srcdev) {
                        stripes_on_src++;
                        if (map->stripes[idx].physical == physical)
                                matched_stripe = idx;
                }
        }

        btrfs_free_chunk_map(map);

        /*
         * More stripes on this device are still pending: the block group has
         * to stay read-only until all of them are finished.
         */
        if (stripes_on_src > 1 && matched_stripe < stripes_on_src - 1)
                return false;

        /* That was the last stripe on this device. */
        clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);

        return true;
}

/*
 * Start a device replace and run the copy to completion.
 *
 * @fs_info:     filesystem to operate on
 * @tgtdev_name: name of the target device, handed to
 *               btrfs_init_dev_replace_tgtdev()
 * @srcdevid:    devid of the source device, handed to
 *               btrfs_find_device_by_devspec()
 * @srcdev_name: name of the source device, handed to
 *               btrfs_find_device_by_devspec()
 * @read_src:    stored as dev_replace->cont_reading_from_srcdev_mode
 *
 * Sets up the target device, records the replace state as STARTED, commits
 * that state, runs the copy through btrfs_scrub_dev() and finally calls
 * btrfs_dev_replace_finishing().
 *
 * Once the target device has been set up, every failure path leaves through
 * the "leave" label so that the target device is destroyed again.
 *
 * Returns the result of btrfs_dev_replace_finishing() (with -EINPROGRESS
 * mapped to BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS),
 * BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED if a replace is already
 * started or suspended, -ETXTBSY if the source device is pinned by a swapfile,
 * or the error of whichever setup step failed.
 */
static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
                const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
                int read_src)
{
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_trans_handle *trans;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        int ret;
        struct btrfs_device *tgt_device = NULL;
        struct btrfs_device *src_device = NULL;

        src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
                                                  srcdev_name);
        if (IS_ERR(src_device))
                return PTR_ERR(src_device);

        if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
                btrfs_warn_in_rcu(fs_info,
          "cannot replace device %s (devid %llu) due to active swapfile",
                        btrfs_dev_name(src_device), src_device->devid);
                return -ETXTBSY;
        }

        /*
         * Here we commit the transaction to make sure commit_total_bytes
         * of all the devices are updated.
         */
        trans = btrfs_attach_transaction(root);
        if (!IS_ERR(trans)) {
                ret = btrfs_commit_transaction(trans);
                if (ret)
                        return ret;
        } else if (PTR_ERR(trans) != -ENOENT) {
                /* -ENOENT is tolerated: there is simply nothing to commit. */
                return PTR_ERR(trans);
        }

        ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
                                            src_device, &tgt_device);
        if (ret)
                return ret;

        /*
         * The target device exists from here on, so a failure must go through
         * "leave" to destroy it instead of returning directly.
         */
        ret = mark_block_group_to_copy(fs_info, src_device);
        if (ret)
                goto leave;

        down_write(&dev_replace->rwsem);
        switch (dev_replace->replace_state) {
        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
                break;
        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
                ASSERT(0);
                ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
                up_write(&dev_replace->rwsem);
                goto leave;
        }

        dev_replace->cont_reading_from_srcdev_mode = read_src;
        dev_replace->srcdev = src_device;
        dev_replace->tgtdev = tgt_device;

        btrfs_info_in_rcu(fs_info,
                      "dev_replace from %s (devid %llu) to %s started",
                      btrfs_dev_name(src_device),
                      src_device->devid,
                      btrfs_dev_name(tgt_device));

        /*
         * from now on, the writes to the srcdev are all duplicated to
         * go to the tgtdev as well (refer to btrfs_map_block()).
         */
        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
        dev_replace->time_started = ktime_get_real_seconds();
        dev_replace->cursor_left = 0;
        dev_replace->committed_cursor_left = 0;
        dev_replace->cursor_left_last_write_of_item = 0;
        dev_replace->cursor_right = 0;
        dev_replace->is_valid = 1;
        dev_replace->item_needs_writeback = 1;
        atomic64_set(&dev_replace->num_write_errors, 0);
        atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
        up_write(&dev_replace->rwsem);

        /* A sysfs failure is only logged; the replace carries on. */
        ret = btrfs_sysfs_add_device(tgt_device);
        if (ret)
                btrfs_err(fs_info, "kobj add dev failed %d", ret);

        btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);

        /*
         * Commit dev_replace state and reserve 1 item for it.
         * This is crucial to ensure we won't miss copying extents for new block
         * groups that are allocated after we started the device replace, and
         * must be done after setting up the device replace state.
         */
        trans = btrfs_start_transaction(root, 1);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                /* Roll the in-memory state back before dropping the target. */
                down_write(&dev_replace->rwsem);
                dev_replace->replace_state =
                        BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
                dev_replace->srcdev = NULL;
                dev_replace->tgtdev = NULL;
                up_write(&dev_replace->rwsem);
                goto leave;
        }

        ret = btrfs_commit_transaction(trans);
        WARN_ON(ret);

        /* the disk copy procedure reuses the scrub code */
        ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
                              btrfs_device_get_total_bytes(src_device),
                              &dev_replace->scrub_progress, 0, 1);

        ret = btrfs_dev_replace_finishing(fs_info, ret);
        if (ret == -EINPROGRESS)
                ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;

        return ret;

leave:
        btrfs_destroy_dev_replace_tgtdev(tgt_device);
        return ret;
}

/*
 * Validate the device name buffers of a replace-start request.
 *
 * The source name only matters when no source devid was given (devid 0);
 * otherwise it is reset to the empty string. Each name that is used must
 * contain a NUL byte somewhere inside its fixed-size buffer.
 *
 * Returns 0 if the names are usable, -ENAMETOOLONG if a name is unterminated.
 */
static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args)
{
        if (args->start.srcdevid != 0)
                args->start.srcdev_name[0] = '\0';
        else if (!memchr(args->start.srcdev_name, '\0',
                         sizeof(args->start.srcdev_name)))
                return -ENAMETOOLONG;

        if (!memchr(args->start.tgtdev_name, '\0',
                    sizeof(args->start.tgtdev_name)))
                return -ENAMETOOLONG;

        return 0;
}

/*
 * Entry point for a replace-start request: validate @args, start the replace
 * and store the outcome in args->result.
 *
 * Returns -EINVAL for an unknown cont_reading_from_srcdev_mode, the error from
 * the name check, 0 when the start result is NO_ERROR or SCRUB_INPROGRESS,
 * and the raw start result otherwise.
 */
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
                            struct btrfs_ioctl_dev_replace_args *args)
{
        int ret;

        /* Only the two known read modes are accepted. */
        if (args->start.cont_reading_from_srcdev_mode !=
                BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS &&
            args->start.cont_reading_from_srcdev_mode !=
                BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID)
                return -EINVAL;

        ret = btrfs_check_replace_dev_names(args);
        if (ret < 0)
                return ret;

        ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
                                      args->start.srcdevid,
                                      args->start.srcdev_name,
                                      args->start.cont_reading_from_srcdev_mode);
        args->result = ret;

        /* don't warn if EINPROGRESS, someone else might be running scrub */
        if (ret != BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS &&
            ret != BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
                return ret;

        return 0;
}

/*
 * blocked until all in-flight bios operations are finished.
 */
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
        set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
        wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
                   &fs_info->dev_replace.bio_counter));
}

/*
 * we have removed target device, it is safe to allow new bios request.
 */
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
        clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
        wake_up(&fs_info->dev_replace.replace_wait);
}

/*
 * Mirror the chunk allocation state of the source device onto the target.
 *
 * The target's allocation state is empty when the replace finishes, because
 * replace copies chunks directly instead of going through the normal chunk
 * allocation path. So before the devices are swapped, every CHUNK_ALLOCATED
 * range found in @srcdev is also set in @tgtdev.
 *
 * Must be called with the chunk mutex held. Returns 0 on success or the
 * first error from set_extent_bit().
 */
static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
                                        struct btrfs_device *tgtdev)
{
        struct extent_state *cached = NULL;
        u64 search_from = 0;
        u64 range_start;
        u64 range_end;
        int err = 0;

        lockdep_assert_held(&srcdev->fs_info->chunk_mutex);

        for (;;) {
                /* No further allocated range at or after search_from. */
                if (!find_first_extent_bit(&srcdev->alloc_state, search_from,
                                           &range_start, &range_end,
                                           CHUNK_ALLOCATED, &cached))
                        break;

                err = set_extent_bit(&tgtdev->alloc_state, range_start,
                                     range_end, CHUNK_ALLOCATED, NULL);
                if (err)
                        break;

                /* Continue right behind the range just copied. */
                search_from = range_end + 1;
        }

        free_extent_state(cached);
        return err;
}

/*
 * Walk all chunk maps and redirect every stripe that points at @srcdev to
 * @tgtdev. The whole walk runs under the mapping tree write lock.
 */
static void btrfs_dev_replace_update_device_in_mapping_tree(
                                                struct btrfs_fs_info *fs_info,
                                                struct btrfs_device *srcdev,
                                                struct btrfs_device *tgtdev)
{
        u64 next = 0;

        write_lock(&fs_info->mapping_tree_lock);
        for (;;) {
                struct btrfs_chunk_map *map;
                int idx;

                map = btrfs_find_chunk_map_nolock(fs_info, next, U64_MAX);
                if (!map)
                        break;

                for (idx = 0; idx < map->num_stripes; idx++) {
                        if (map->stripes[idx].dev == srcdev)
                                map->stripes[idx].dev = tgtdev;
                }

                /* Resume the search right behind this chunk. */
                next = map->start + map->chunk_len;
                btrfs_free_chunk_map(map);

                /* An end offset of 0 terminates the walk. */
                if (!next)
                        break;
        }
        write_unlock(&fs_info->mapping_tree_lock);
}

/*
 * Finish a device replace after the copy has ended, or clean up after a
 * failed/canceled copy.
 *
 * @fs_info:   filesystem the replace ran on
 * @scrub_ret: result of the copy; 0 means the copy succeeded
 *
 * Does nothing and returns 0 unless the replace state is STARTED. With a
 * successful copy the state becomes FINISHED, the target device takes over
 * the source's devid, uuid and size accounting, the source is removed and
 * freed. With a failed copy the state becomes CANCELED and the target device
 * is destroyed.
 *
 * The whole procedure runs under lock_finishing_cancel_unmount.
 *
 * Returns 0 on success, @scrub_ret if the copy failed, or the error from
 * btrfs_start_delalloc_roots(), btrfs_start_transaction() or
 * btrfs_set_target_alloc_state().
 */
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
                                       int scrub_ret)
{
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *tgt_device;
        struct btrfs_device *src_device;
        struct btrfs_root *root = fs_info->tree_root;
        u8 uuid_tmp[BTRFS_UUID_SIZE];
        struct btrfs_trans_handle *trans;
        int ret = 0;

        /* don't allow cancel or unmount to disturb the finishing procedure */
        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);

        down_read(&dev_replace->rwsem);
        /* was the operation canceled, or is it finished? */
        if (dev_replace->replace_state !=
            BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
                up_read(&dev_replace->rwsem);
                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
                return 0;
        }

        /* Snapshot both devices; dev_replace's pointers are cleared below. */
        tgt_device = dev_replace->tgtdev;
        src_device = dev_replace->srcdev;
        up_read(&dev_replace->rwsem);

        /*
         * flush all outstanding I/O and inode extent mappings before the
         * copy operation is declared as being finished
         */
        ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
        if (ret) {
                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
                return ret;
        }
        btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);

        /*
         * We have to use this loop approach because at this point src_device
         * has to be available for transaction commit to complete, yet new
         * chunks shouldn't be allocated on the device.
         */
        while (1) {
                trans = btrfs_start_transaction(root, 0);
                if (IS_ERR(trans)) {
                        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
                        return PTR_ERR(trans);
                }
                ret = btrfs_commit_transaction(trans);
                WARN_ON(ret);

                /* Prevent write_all_supers() during the finishing procedure */
                mutex_lock(&fs_devices->device_list_mutex);
                /* Prevent new chunks being allocated on the source device */
                mutex_lock(&fs_info->chunk_mutex);

                /*
                 * The source device is still on a post-commit list: drop
                 * both mutexes and commit once more. The loop is left with
                 * device_list_mutex and chunk_mutex held.
                 */
                if (!list_empty(&src_device->post_commit_list)) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        mutex_unlock(&fs_info->chunk_mutex);
                } else {
                        break;
                }
        }

        /* Record the final state; it is written back with the next commit. */
        down_write(&dev_replace->rwsem);
        dev_replace->replace_state =
                scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
                          : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
        dev_replace->tgtdev = NULL;
        dev_replace->srcdev = NULL;
        dev_replace->time_stopped = ktime_get_real_seconds();
        dev_replace->item_needs_writeback = 1;

        /*
         * Update allocation state in the new device and replace the old device
         * with the new one in the mapping tree.
         */
        if (!scrub_ret) {
                scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
                if (scrub_ret)
                        goto error;
                btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
                                                                src_device,
                                                                tgt_device);
        } else {
                if (scrub_ret != -ECANCELED)
                        btrfs_err_in_rcu(fs_info,
                                 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
                                 btrfs_dev_name(src_device),
                                 src_device->devid,
                                 btrfs_dev_name(tgt_device), scrub_ret);
                /*
                 * Failure exit. Also reached by the goto above when copying
                 * the allocation state fails. Releases all locks taken so
                 * far and destroys the target device while new bios are
                 * blocked.
                 */
error:
                up_write(&dev_replace->rwsem);
                mutex_unlock(&fs_info->chunk_mutex);
                mutex_unlock(&fs_devices->device_list_mutex);
                btrfs_rm_dev_replace_blocked(fs_info);
                if (tgt_device)
                        btrfs_destroy_dev_replace_tgtdev(tgt_device);
                btrfs_rm_dev_replace_unblocked(fs_info);
                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);

                return scrub_ret;
        }

        btrfs_info_in_rcu(fs_info,
                          "dev_replace from %s (devid %llu) to %s finished",
                          btrfs_dev_name(src_device),
                          src_device->devid,
                          btrfs_dev_name(tgt_device));
        /*
         * Swap identities: the target takes the source's devid and uuid, the
         * source gets BTRFS_DEV_REPLACE_DEVID and the target's former uuid.
         * The target also inherits the source's size and usage counters.
         */
        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
        tgt_device->devid = src_device->devid;
        src_device->devid = BTRFS_DEV_REPLACE_DEVID;
        memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
        memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
        memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
        btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
        btrfs_device_set_disk_total_bytes(tgt_device,
                                          src_device->disk_total_bytes);
        btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
        tgt_device->commit_bytes_used = src_device->bytes_used;

        btrfs_assign_next_active_device(src_device, tgt_device);

        /* The target now counts as a regular read-write device. */
        list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
        fs_devices->rw_devices++;

        up_write(&dev_replace->rwsem);
        /* Remove the source device while new bios are blocked. */
        btrfs_rm_dev_replace_blocked(fs_info);

        btrfs_rm_dev_replace_remove_srcdev(src_device);

        btrfs_rm_dev_replace_unblocked(fs_info);

        /*
         * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
         * update on-disk dev stats value during commit transaction
         */
        atomic_inc(&tgt_device->dev_stats_ccnt);

        /*
         * this is again a consistent state where no dev_replace procedure
         * is running, the target device is part of the filesystem, the
         * source device is not part of the filesystem anymore and its 1st
         * superblock is scratched out so that it is no longer marked to
         * belong to this filesystem.
         */
        mutex_unlock(&fs_info->chunk_mutex);
        mutex_unlock(&fs_devices->device_list_mutex);

        /* replace the sysfs entry */
        btrfs_sysfs_remove_device(src_device);
        btrfs_sysfs_update_devid(tgt_device);
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
                btrfs_scratch_superblocks(fs_info, src_device);

        /*
         * write back the superblocks; a failure to start or commit this
         * transaction is not reported to the caller
         */
        trans = btrfs_start_transaction(root, 0);
        if (!IS_ERR(trans))
                btrfs_commit_transaction(trans);

        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);

        btrfs_rm_dev_replace_free_srcdev(src_device);

        return 0;
}

/*
 * Compute the replace progress from the current state and the last stored
 * position, in the same format as btrfs_dev_replace::progress_1000:
 * 0 if the replace never started or was canceled, 1000 once finished, and
 * cursor_left scaled against a thousandth of the source device's total bytes
 * while the replace is started or suspended.
 */
static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
{
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        u64 bytes_per_step;

        switch (dev_replace->replace_state) {
        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
                return 1000;
        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
                bytes_per_step = div_u64(
                        btrfs_device_get_total_bytes(dev_replace->srcdev),
                        1000);
                return div64_u64(dev_replace->cursor_left, bytes_per_step);
        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
                break;
        }

        return 0;
}

/*
 * Fill @args with a snapshot of the current replace status, taken under the
 * dev_replace read lock. args->result is always set to NO_ERROR.
 */
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
                              struct btrfs_ioctl_dev_replace_args *args)
{
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

        down_read(&dev_replace->rwsem);

        /*
         * Even if the dev_replace data is not valid, the values are good
         * enough for the replace_status ioctl.
         */
        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;

        /* State and progress. */
        args->status.replace_state = dev_replace->replace_state;
        args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);

        /* Timestamps. */
        args->status.time_started = dev_replace->time_started;
        args->status.time_stopped = dev_replace->time_stopped;

        /* Error counters. */
        args->status.num_write_errors =
                atomic64_read(&dev_replace->num_write_errors);
        args->status.num_uncorrectable_read_errors =
                atomic64_read(&dev_replace->num_uncorrectable_read_errors);

        up_read(&dev_replace->rwsem);
}

/*
 * Cancel a device replace.
 *
 * What happens depends on the current replace state:
 *  - never started / finished / canceled: nothing to cancel;
 *  - started: the running scrub is canceled and the cleanup is left to
 *    btrfs_dev_replace_finishing();
 *  - suspended: the state is set to CANCELED here, committed, and the target
 *    device is destroyed.
 *
 * Runs under lock_finishing_cancel_unmount.
 *
 * Returns -EROFS if the superblock is read-only, the error from
 * btrfs_start_transaction() in the suspended case, -EINVAL for an unknown
 * state, and otherwise BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR or
 * BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED.
 */
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
{
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        struct btrfs_device *tgt_device = NULL;
        struct btrfs_device *src_device = NULL;
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = fs_info->tree_root;
        int result;
        int ret;

        if (sb_rdonly(fs_info->sb))
                return -EROFS;

        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
        down_write(&dev_replace->rwsem);
        /* Every case below releases dev_replace->rwsem itself. */
        switch (dev_replace->replace_state) {
        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
                up_write(&dev_replace->rwsem);
                break;
        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
                tgt_device = dev_replace->tgtdev;
                src_device = dev_replace->srcdev;
                up_write(&dev_replace->rwsem);
                ret = btrfs_scrub_cancel(fs_info);
                /* A failed scrub cancel is reported as NOT_STARTED. */
                if (ret < 0) {
                        result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
                } else {
                        result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
                        /*
                         * btrfs_dev_replace_finishing() will handle the
                         * cleanup part
                         */
                        btrfs_info_in_rcu(fs_info,
                                "dev_replace from %s (devid %llu) to %s canceled",
                                btrfs_dev_name(src_device), src_device->devid,
                                btrfs_dev_name(tgt_device));
                }
                break;
        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
                /*
                 * Scrub doing the replace isn't running so we need to do the
                 * cleanup step of btrfs_dev_replace_finishing() here
                 */
                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
                tgt_device = dev_replace->tgtdev;
                src_device = dev_replace->srcdev;
                dev_replace->tgtdev = NULL;
                dev_replace->srcdev = NULL;
                dev_replace->replace_state =
                                BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
                dev_replace->time_stopped = ktime_get_real_seconds();
                dev_replace->item_needs_writeback = 1;

                up_write(&dev_replace->rwsem);

                /* Scrub for replace must not be running in suspended state */
                btrfs_scrub_cancel(fs_info);

                /* Commit so that the CANCELED state is written back. */
                trans = btrfs_start_transaction(root, 0);
                if (IS_ERR(trans)) {
                        /*
                         * NOTE(review): this early return leaves tgt_device
                         * undestroyed although the state was already set to
                         * CANCELED above - confirm this is intended.
                         */
                        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
                        return PTR_ERR(trans);
                }
                ret = btrfs_commit_transaction(trans);
                WARN_ON(ret);

                btrfs_info_in_rcu(fs_info,
                "suspended dev_replace from %s (devid %llu) to %s canceled",
                        btrfs_dev_name(src_device), src_device->devid,
                        btrfs_dev_name(tgt_device));

                if (tgt_device)
                        btrfs_destroy_dev_replace_tgtdev(tgt_device);
                break;
        default:
                up_write(&dev_replace->rwsem);
                result = -EINVAL;
        }

        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
        return result;
}

/*
 * Called on unmount: if a replace is currently in the STARTED state, mark it
 * SUSPENDED, stamp the stop time and request writeback of the replace item.
 * Any other state is left untouched.
 */
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
{
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
        down_write(&dev_replace->rwsem);

        if (dev_replace->replace_state ==
            BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
                dev_replace->replace_state =
                        BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
                dev_replace->time_stopped = ktime_get_real_seconds();
                dev_replace->item_needs_writeback = 1;
                btrfs_info(fs_info, "suspending dev_replace for unmount");
        }

        up_write(&dev_replace->rwsem);
        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
}

/*
 * Resume a dev_replace procedure that was interrupted by unmount.
 *
 * Nothing is done when no replace is pending. Otherwise the state is set to
 * STARTED and a "btrfs-devrepl" kthread is spawned to continue the copy. If
 * the target device is missing, or another exclusive operation is running,
 * the state is put back to SUSPENDED and 0 is returned.
 *
 * Returns 0, or the error from kthread_run().
 */
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
{
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        struct task_struct *task;

        down_write(&dev_replace->rwsem);

        /* No replace to resume in these states. */
        if (dev_replace->replace_state ==
                BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED ||
            dev_replace->replace_state ==
                BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED ||
            dev_replace->replace_state ==
                BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED) {
                up_write(&dev_replace->rwsem);
                return 0;
        }

        if (dev_replace->replace_state ==
            BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED)
                dev_replace->replace_state =
                        BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;

        /* Without a usable target device the replace stays suspended. */
        if (!(dev_replace->tgtdev && dev_replace->tgtdev->bdev)) {
                btrfs_info(fs_info,
                           "cannot continue dev_replace, tgtdev is missing");
                btrfs_info(fs_info,
                           "you may cancel the operation after 'mount -o degraded'");
                dev_replace->replace_state =
                        BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
                up_write(&dev_replace->rwsem);
                return 0;
        }

        up_write(&dev_replace->rwsem);

        /*
         * This could collide with a paused balance, but the exclusive op logic
         * should never allow both to start and pause. We don't want to allow
         * dev-replace to start anyway.
         */
        if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
                down_write(&dev_replace->rwsem);
                dev_replace->replace_state =
                        BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
                up_write(&dev_replace->rwsem);
                btrfs_info(fs_info,
                "cannot resume dev-replace, other exclusive operation running");
                return 0;
        }

        task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");

        return PTR_ERR_OR_ZERO(task);
}

/*
 * Thread body that continues a resumed device replace.
 *
 * @data: the struct btrfs_fs_info of the filesystem
 *
 * Logs the current progress in percent, restarts the copy from
 * committed_cursor_left, runs the finishing step and finally releases the
 * exclusive operation. Always returns 0; an unexpected finishing error only
 * triggers a warning.
 */
static int btrfs_dev_replace_kthread(void *data)
{
        struct btrfs_fs_info *fs_info = data;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        unsigned int percent;
        int scrub_ret;
        int ret;

        /* progress is in 1/1000 units; scale it down to percent */
        percent = (unsigned int)div_u64(btrfs_dev_replace_progress(fs_info), 10);

        btrfs_info_in_rcu(fs_info,
                "continuing dev_replace from %s (devid %llu) to target %s @%u%%",
                btrfs_dev_name(dev_replace->srcdev),
                dev_replace->srcdev->devid,
                btrfs_dev_name(dev_replace->tgtdev),
                percent);

        scrub_ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
                                    dev_replace->committed_cursor_left,
                                    btrfs_device_get_total_bytes(dev_replace->srcdev),
                                    &dev_replace->scrub_progress, 0, 1);

        ret = btrfs_dev_replace_finishing(fs_info, scrub_ret);
        WARN_ON(ret && ret != -ECANCELED);

        btrfs_exclop_finish(fs_info);

        return 0;
}

/*
 * Tell whether a device replace is in progress.
 *
 * Returns 0 if @dev_replace is not valid or its state is never started,
 * finished or canceled; returns 1 otherwise (started or suspended).
 */
int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
{
        if (!dev_replace->is_valid)
                return 0;

        if (dev_replace->replace_state ==
                BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED ||
            dev_replace->replace_state ==
                BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED ||
            dev_replace->replace_state ==
                BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED)
                return 0;

        /*
         * Started or suspended. Report the replace as ongoing even if the
         * tgtdev is missing. That can happen when the procedure was suspended
         * by an umount and the filesystem is then remounted in degraded state
         * without the tgtdev (or without "btrfs dev scan" having been
         * called). This does not stop the dev_replace procedure; it has to
         * be canceled manually if cancellation is wanted.
         */
        return 1;
}

/*
 * Subtract @amount from the dev-replace bio counter and then call
 * cond_wake_up_nomb() on the replace_wait queue.
 */
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

        percpu_counter_sub(&dev_replace->bio_counter, amount);
        cond_wake_up_nomb(&dev_replace->replace_wait);
}

void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
        while (1) {
                percpu_counter_inc(&fs_info->dev_replace.bio_counter);
                if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
                                     &fs_info->fs_state)))
                        break;

                btrfs_bio_counter_dec(fs_info);
                wait_event(fs_info->dev_replace.replace_wait,
                           !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
                                     &fs_info->fs_state));
        }
}









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 















    1 





























































    1 















    1 







    1 

    1 
    1 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com>
 *
 * Authors:
 *         Casey Schaufler <casey@schaufler-ca.com>
 *         Ahmed S. Darwish <darwish.07@gmail.com>
 *
 * Special thanks to the authors of selinuxfs.
 *
 *        Karl MacMillan <kmacmillan@tresys.com>
 *        James Morris <jmorris@redhat.com>
 */

#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/cipso_ipv4.h>
#include <linux/seq_file.h>
#include <linux/ctype.h>
#include <linux/audit.h>
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include "smack.h"

#define BEBITS        (sizeof(__be32) * 8)
/*
 * smackfs pseudo filesystem.
 *
 * Numeric identifiers for the smackfs entries. Values 22 and 23 are only
 * defined when the corresponding config option is enabled, so the numbering
 * of the remaining entries stays fixed either way.
 * NOTE(review): presumably these are used as inode numbers (per the
 * "smk_inos" / "_INO" naming) - confirm against the code that creates the
 * filesystem tree.
 */

enum smk_inos {
        SMK_ROOT_INO        = 2,
        SMK_LOAD        = 3,        /* load policy */
        SMK_CIPSO        = 4,        /* load label -> CIPSO mapping */
        SMK_DOI                = 5,        /* CIPSO DOI */
        SMK_DIRECT        = 6,        /* CIPSO level indicating direct label */
        SMK_AMBIENT        = 7,        /* internet ambient label */
        SMK_NET4ADDR        = 8,        /* single label hosts */
        SMK_ONLYCAP        = 9,        /* the only "capable" label */
        SMK_LOGGING        = 10,        /* logging */
        SMK_LOAD_SELF        = 11,        /* task specific rules */
        SMK_ACCESSES        = 12,        /* access policy */
        SMK_MAPPED        = 13,        /* CIPSO level indicating mapped label */
        SMK_LOAD2        = 14,        /* load policy with long labels */
        SMK_LOAD_SELF2        = 15,        /* load task specific rules with long labels */
        SMK_ACCESS2        = 16,        /* make an access check with long labels */
        SMK_CIPSO2        = 17,        /* load long label -> CIPSO mapping */
        SMK_REVOKE_SUBJ        = 18,        /* set rules with subject label to '-' */
        SMK_CHANGE_RULE        = 19,        /* change or add rules (long labels) */
        SMK_SYSLOG        = 20,        /* change syslog label */
        SMK_PTRACE        = 21,        /* set ptrace rule */
#ifdef CONFIG_SECURITY_SMACK_BRINGUP
        SMK_UNCONFINED        = 22,        /* define an unconfined label */
#endif
#if IS_ENABLED(CONFIG_IPV6)
        SMK_NET6ADDR        = 23,        /* single label IPv6 hosts */
#endif /* CONFIG_IPV6 */
        SMK_RELABEL_SELF = 24, /* relabel possible without CAP_MAC_ADMIN */
};

/*
 * List locks
 */
static DEFINE_MUTEX(smack_cipso_lock);
static DEFINE_MUTEX(smack_ambient_lock);
static DEFINE_MUTEX(smk_net4addr_lock);
#if IS_ENABLED(CONFIG_IPV6)
static DEFINE_MUTEX(smk_net6addr_lock);
#endif /* CONFIG_IPV6 */

/*
 * This is the "ambient" label for network traffic.
 * If it isn't somehow marked, use this.
 * It can be reset via smackfs/ambient
 */
struct smack_known *smack_net_ambient;

/*
 * This is the level in a CIPSO header that indicates a
 * smack label is contained directly in the category set.
 * It can be reset via smackfs/direct
 */
int smack_cipso_direct = SMACK_CIPSO_DIRECT_DEFAULT;

/*
 * This is the level in a CIPSO header that indicates a
 * secid is contained directly in the category set.
 * It can be reset via smackfs/mapped
 */
int smack_cipso_mapped = SMACK_CIPSO_MAPPED_DEFAULT;

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
/*
 * Allow one label to be unconfined. This is for
 * debugging and application bring-up purposes only.
 * It is bad and wrong, but everyone seems to expect
 * to have it.
 */
struct smack_known *smack_unconfined;
#endif

/*
 * If this value is set restrict syslog use to the label specified.
 * It can be reset via smackfs/syslog
 */
struct smack_known *smack_syslog_label;

/*
 * Ptrace current rule
 * SMACK_PTRACE_DEFAULT    regular smack ptrace rules (/proc based)
 * SMACK_PTRACE_EXACT      labels must match, but can be overridden with
 *                           CAP_SYS_PTRACE
 * SMACK_PTRACE_DRACONIAN  labels must match, CAP_SYS_PTRACE has no effect
 */
int smack_ptrace_rule = SMACK_PTRACE_DEFAULT;

/*
 * Certain IP addresses may be designated as single label hosts.
 * Packets are sent there unlabeled, but only from tasks that
 * can write to the specified label.
 */

LIST_HEAD(smk_net4addr_list);
#if IS_ENABLED(CONFIG_IPV6)
LIST_HEAD(smk_net6addr_list);
#endif /* CONFIG_IPV6 */

/*
 * Rule lists are maintained for each label.
 *
 * struct smack_parsed_rule is the parsed form of a single rule, as filled
 * in by smk_fill_rule() and consumed by smk_set_access().
 */
struct smack_parsed_rule {
        struct smack_known        *smk_subject;        /* subject label of the rule */
        struct smack_known        *smk_object;        /* object label of the rule */
        int                        smk_access1;        /* access bits to enable */
        int                        smk_access2;        /* access bits to disable */
};

static int smk_cipso_doi_value = SMACK_CIPSO_DOI_DEFAULT;

/*
 * Values for parsing cipso rules
 * SMK_DIGITLEN: Length of a digit field in a rule.
 * SMK_CIPSOMIN: Minimum possible cipso rule length.
 * SMK_CIPSOMAX: Maximum possible cipso rule length.
 */
#define SMK_DIGITLEN 4
#define SMK_CIPSOMIN (SMK_LABELLEN + 2 * SMK_DIGITLEN)
#define SMK_CIPSOMAX (SMK_CIPSOMIN + SMACK_CIPSO_MAXCATNUM * SMK_DIGITLEN)

/*
 * Values for parsing MAC rules
 * SMK_ACCESS: Maximum possible combination of access permissions
 * SMK_ACCESSLEN: Maximum length for a rule access field
 * SMK_LOADLEN: Smack rule length
 */
#define SMK_OACCESS        "rwxa"
#define SMK_ACCESS        "rwxatl"
#define SMK_OACCESSLEN        (sizeof(SMK_OACCESS) - 1)
#define SMK_ACCESSLEN        (sizeof(SMK_ACCESS) - 1)
#define SMK_OLOADLEN        (SMK_LABELLEN + SMK_LABELLEN + SMK_OACCESSLEN)
#define SMK_LOADLEN        (SMK_LABELLEN + SMK_LABELLEN + SMK_ACCESSLEN)

/*
 * Strictly for CIPSO level manipulation.
 * Set the bit for category number @cat in the buffer @catsetp.
 *
 * Categories are numbered from 1; category 1 maps to the most significant
 * bit of the first byte. A @cat of 0, or one larger than SMK_CIPSOLEN * 8,
 * is ignored.
 */
static inline void smack_catset_bit(unsigned int cat, char *catsetp)
{
        unsigned int bit;

        if (cat == 0 || cat > (SMK_CIPSOLEN * 8))
                return;

        /* Zero-based bit position within the category set. */
        bit = cat - 1;
        catsetp[bit / 8] |= 0x80 >> (bit % 8);
}

/**
 * smk_netlabel_audit_set - fill a netlbl_audit struct
 * @nap: structure to fill
 *
 * The login uid and session id are taken from the current task, the
 * secid from the Smack label returned by smk_of_current().
 */
static void smk_netlabel_audit_set(struct netlbl_audit *nap)
{
        struct smack_known *current_label = smk_of_current();

        nap->loginuid = audit_get_loginuid(current);
        nap->sessionid = audit_get_sessionid(current);
        nap->secid = current_label->smk_secid;
}

/*
 * Value for parsing single label host rules
 * "1.2.3.4 X"
 */
#define SMK_NETLBLADDRMIN        9

/**
 * smk_set_access - add a rule to the rule list or replace an old rule
 * @srp: the rule to add or replace
 * @rule_list: the list of rules
 * @rule_lock: the rule list lock
 *
 * Looks through the current subject/object/access list for
 * the subject/object pair and replaces the access that was
 * there. If the pair isn't found add it with the specified
 * access.
 *
 * Returns 0 if nothing goes wrong or -ENOMEM if it fails
 * during the allocation of the new pair to add.
 */
static int smk_set_access(struct smack_parsed_rule *srp,
                                struct list_head *rule_list,
                                struct mutex *rule_lock)
{
        struct smack_rule *sp;
        int found = 0;
        int rc = 0;

        mutex_lock(rule_lock);

        /*
         * Because the object label is less likely to match
         * than the subject label check it first
         */
        list_for_each_entry_rcu(sp, rule_list, list) {
                if (sp->smk_object == srp->smk_object &&
                    sp->smk_subject == srp->smk_subject) {
                        found = 1;
                        sp->smk_access |= srp->smk_access1;
                        sp->smk_access &= ~srp->smk_access2;
                        break;
                }
        }

        if (found == 0) {
                sp = kmem_cache_zalloc(smack_rule_cache, GFP_KERNEL);
                if (sp == NULL) {
                        rc = -ENOMEM;
                        goto out;
                }

                sp->smk_subject = srp->smk_subject;
                sp->smk_object = srp->smk_object;
                sp->smk_access = srp->smk_access1 & ~srp->smk_access2;

                list_add_rcu(&sp->list, rule_list);
        }

out:
        mutex_unlock(rule_lock);
        return rc;
}

/**
 * smk_perm_from_str - parse smack accesses from a text string
 * @string: a text string that contains a Smack accesses code
 *
 * Each of the letters r, w, x, a, t, l and b (in either case) sets the
 * matching MAY_* bit; '-' is accepted and sets nothing. Parsing stops at
 * the first character that is none of these, including the terminating
 * NUL.
 *
 * Returns an integer with respective bits set for specified accesses.
 */
static int smk_perm_from_str(const char *string)
{
        const char *cp = string;
        int perm = 0;

        while (1) {
                switch (*cp++) {
                case '-':
                        continue;
                case 'r':
                case 'R':
                        perm |= MAY_READ;
                        continue;
                case 'w':
                case 'W':
                        perm |= MAY_WRITE;
                        continue;
                case 'x':
                case 'X':
                        perm |= MAY_EXEC;
                        continue;
                case 'a':
                case 'A':
                        perm |= MAY_APPEND;
                        continue;
                case 't':
                case 'T':
                        perm |= MAY_TRANSMUTE;
                        continue;
                case 'l':
                case 'L':
                        perm |= MAY_LOCK;
                        continue;
                case 'b':
                case 'B':
                        perm |= MAY_BRINGUP;
                        continue;
                default:
                        /* Anything else ends the access string. */
                        return perm;
                }
        }
}

/*
 * Parse @string as a Smack label and look up the matching known label
 * without creating it. The temporary string returned by smk_parse_smack()
 * is freed before returning.
 *
 * Returns the known label, the error from smk_parse_smack() as an error
 * pointer, or ERR_PTR(-ENOENT) if no such label is known.
 */
static struct smack_known *smk_lookup_known_label(const char *string, int len)
{
        struct smack_known *skp;
        const char *label;

        label = smk_parse_smack(string, len);
        if (IS_ERR(label))
                return ERR_CAST(label);

        skp = smk_find_entry(label);
        kfree(label);

        return skp != NULL ? skp : ERR_PTR(-ENOENT);
}

/**
 * smk_fill_rule - Fill Smack rule from strings
 * @subject: subject label string
 * @object: object label string
 * @access1: access string
 * @access2: string with permissions to be removed
 * @rule: Smack rule
 * @import: if non-zero, import labels
 * @len: label length limit
 *
 * With @import set, both labels go through smk_import_entry(); otherwise
 * they are only looked up and -ENOENT is returned for a label that is not
 * known. When @access2 is NULL, the bits to remove are the complement of
 * the bits parsed from @access1.
 *
 * Returns 0 on success, appropriate error code on failure.
 */
static int smk_fill_rule(const char *subject, const char *object,
                                const char *access1, const char *access2,
                                struct smack_parsed_rule *rule, int import,
                                int len)
{
        struct smack_known *skp;

        if (import) {
                rule->smk_subject = smk_import_entry(subject, len);
                if (IS_ERR(rule->smk_subject))
                        return PTR_ERR(rule->smk_subject);

                rule->smk_object = smk_import_entry(object, len);
                if (IS_ERR(rule->smk_object))
                        return PTR_ERR(rule->smk_object);
        } else {
                skp = smk_lookup_known_label(subject, len);
                if (IS_ERR(skp))
                        return PTR_ERR(skp);
                rule->smk_subject = skp;

                skp = smk_lookup_known_label(object, len);
                if (IS_ERR(skp))
                        return PTR_ERR(skp);
                rule->smk_object = skp;
        }

        rule->smk_access1 = smk_perm_from_str(access1);
        rule->smk_access2 = access2 ? smk_perm_from_str(access2) :
                                      ~rule->smk_access1;

        return 0;
}

/**
 * smk_parse_rule - parse Smack rule from load string
 * @data: fixed-width rule string: subject label at offset 0, object label
 *        at offset SMK_LABELLEN, access string at offset 2 * SMK_LABELLEN
 * @rule: Smack rule
 * @import: if non-zero, import labels
 *
 * Returns 0 on success, or the error code from smk_fill_rule() on failure.
 */
static int smk_parse_rule(const char *data, struct smack_parsed_rule *rule,
                                int import)
{
        const char *subject = data;
        const char *object = subject + SMK_LABELLEN;
        const char *access = object + SMK_LABELLEN;

        return smk_fill_rule(subject, object, access, NULL, rule,
                             import, SMK_LABELLEN);
}

/**
 * smk_parse_long_rule - parse Smack rule from rule string
 * @data: string to be parsed, null terminated
 * @rule: Will be filled with Smack parsed rule
 * @import: if non-zero, import labels
 * @tokens: number of substrings expected in data
 *
 * @data is split in place: whitespace in front of each token, and the
 * whitespace following the last token, is overwritten with '\0'. Up to
 * four tokens are handed to smk_fill_rule(); tokens beyond @tokens are
 * passed as NULL.
 *
 * Returns number of processed bytes on success, -ERRNO on failure.
 */
static ssize_t smk_parse_long_rule(char *data, struct smack_parsed_rule *rule,
                                int import, int tokens)
{
        char *tok[4] = { NULL, NULL, NULL, NULL };
        char *cur = data;
        int rc;
        int i;

        for (i = 0; i < tokens; i++) {
                /* Blank out the separator in front of this token. */
                for (; isspace(*cur); cur++)
                        *cur = '\0';

                /* Unexpected end of data */
                if (*cur == '\0')
                        return -EINVAL;

                tok[i] = cur;

                /* Advance to the end of the token. */
                while (*cur != '\0' && !isspace(*cur))
                        cur++;
        }

        /* Consume the whitespace that trails the last token as well. */
        for (; isspace(*cur); cur++)
                *cur = '\0';

        rc = smk_fill_rule(tok[0], tok[1], tok[2], tok[3], rule, import, 0);
        if (rc != 0)
                return rc;

        return cur - data;
}

#define SMK_FIXED24_FMT        0        /* Fixed 24byte label format */
#define SMK_LONG_FMT        1        /* Variable long label format */
#define SMK_CHANGE_FMT        2        /* Rule modification format */
/**
 * smk_write_rules_list - write() for any /smack rule file
 * @file: file pointer, not actually used
 * @buf: where to get the data from
 * @count: bytes sent
 * @ppos: where to start - must be 0
 * @rule_list: the list of rules to write to; when NULL, each rule is added
 *             to the rule list of its own subject label instead
 * @rule_lock: lock for the rule list (only used when @rule_list is not NULL)
 * @format: /smack/load or /smack/load2 or /smack/change-rule format.
 *
 * Get one smack access rule from above.
 * The format for SMK_LONG_FMT is:
 *        "subject<whitespace>object<whitespace>access[<whitespace>...]"
 * The format for SMK_FIXED24_FMT is exactly:
 *        "subject                 object                  rwxat"
 * The format for SMK_CHANGE_FMT is:
 *        "subject<whitespace>object<whitespace>
 *         acc_enable<whitespace>acc_disable[<whitespace>...]"
 *
 * For the two variable-length formats the buffer may hold several rules;
 * they are parsed and applied one after another until the data is used up.
 *
 * Returns the number of bytes consumed on success (this can be less than
 * @count when the input had to be truncated), or a negative error code.
 */
static ssize_t smk_write_rules_list(struct file *file, const char __user *buf,
                                        size_t count, loff_t *ppos,
                                        struct list_head *rule_list,
                                        struct mutex *rule_lock, int format)
{
        struct smack_parsed_rule rule;
        char *data;
        int rc;
        int trunc = 0;
        int tokens;
        ssize_t cnt = 0;

        /*
         * No partial writes.
         * Enough data must be present.
         */
        if (*ppos != 0)
                return -EINVAL;

        if (format == SMK_FIXED24_FMT) {
                /*
                 * Minor hack for backward compatibility
                 */
                if (count < SMK_OLOADLEN || count > SMK_LOADLEN)
                        return -EINVAL;
        } else {
                /*
                 * Oversized input is not rejected: only the first
                 * PAGE_SIZE - 1 bytes are looked at and the write is
                 * flagged as truncated.
                 */
                if (count >= PAGE_SIZE) {
                        count = PAGE_SIZE - 1;
                        trunc = 1;
                }
        }

        data = memdup_user_nul(buf, count);
        if (IS_ERR(data))
                return PTR_ERR(data);

        /*
         * In case of parsing only part of user buf,
         * avoid having partial rule at the data buffer
         */
        if (trunc) {
                /* Back up to just after the last newline in the buffer */
                while (count > 0 && (data[count - 1] != '\n'))
                        --count;
                /* No newline at all: not even one complete rule fits */
                if (count == 0) {
                        rc = -EINVAL;
                        goto out;
                }
        }

        data[count] = '\0';
        /* The change-rule format carries one extra token (acc_disable) */
        tokens = (format == SMK_CHANGE_FMT ? 4 : 3);
        while (cnt < count) {
                if (format == SMK_FIXED24_FMT) {
                        rc = smk_parse_rule(data, &rule, 1);
                        if (rc < 0)
                                goto out;
                        /* Fixed format: exactly one rule per write */
                        cnt = count;
                } else {
                        rc = smk_parse_long_rule(data + cnt, &rule, 1, tokens);
                        if (rc < 0)
                                goto out;
                        /* No progress would loop forever; treat as bad input */
                        if (rc == 0) {
                                rc = -EINVAL;
                                goto out;
                        }
                        cnt += rc;
                }

                if (rule_list == NULL)
                        rc = smk_set_access(&rule, &rule.smk_subject->smk_rules,
                                &rule.smk_subject->smk_rules_lock);
                else
                        rc = smk_set_access(&rule, rule_list, rule_lock);

                /* Rules applied before a failing one are not undone here */
                if (rc)
                        goto out;
        }

        rc = cnt;
out:
        kfree(data);
        return rc;
}

/*
 * Core logic for smackfs seq list operations.
 */

/*
 * Shared ->start helper: take the RCU read lock and return the list node
 * at index *pos in @head, or NULL when the list has fewer entries.
 * The read lock is still held on return, including on the NULL return.
 */
static void *smk_seq_start(struct seq_file *s, loff_t *pos,
                                struct list_head *head)
{
        struct list_head *node;
        int remaining = *pos;

        rcu_read_lock();

        node = rcu_dereference(list_next_rcu(head));
        while (node != head) {
                if (remaining == 0)
                        return node;
                remaining--;
                node = rcu_dereference(list_next_rcu(node));
        }

        return NULL;
}

/*
 * Shared ->next helper: bump *pos and return the node following @v in
 * @head, or NULL once the walk is back at the list head.
 */
static void *smk_seq_next(struct seq_file *s, void *v, loff_t *pos,
                                struct list_head *head)
{
        struct list_head *cur = v;
        struct list_head *succ;

        *pos += 1;

        succ = rcu_dereference(list_next_rcu(cur));
        if (succ == head)
                return NULL;

        return succ;
}

/*
 * Shared ->stop helper: leave the RCU read-side section.
 * smk_seq_start() is the function in this file that takes rcu_read_lock();
 * presumably the seq_file core always pairs the two calls.
 */
static void smk_seq_stop(struct seq_file *s, void *v)
{
        rcu_read_unlock();
}

/*
 * Print one rule as "subject object <access letters>\n".
 * @max is the label length limit of the interface file being read; rules
 * naming a label of that length or longer, and rules granting no access
 * at all, are silently skipped.
 */
static void smk_rule_show(struct seq_file *s, struct smack_rule *srp, int max)
{
        /* Access bits and their letters, in the order they are printed */
        static const struct {
                int bit;
                char letter;
        } modes[] = {
                { MAY_READ,      'r' },
                { MAY_WRITE,     'w' },
                { MAY_EXEC,      'x' },
                { MAY_APPEND,    'a' },
                { MAY_TRANSMUTE, 't' },
                { MAY_LOCK,      'l' },
                { MAY_BRINGUP,   'b' },
        };
        unsigned int idx;

        /*
         * Anything read from the interface file (/smack/load or
         * /smack/load2) is expected to be writable back to it, so labels
         * too long for that file are not shown.
         */
        if (strlen(srp->smk_subject->smk_known) >= max ||
            strlen(srp->smk_object->smk_known) >= max)
                return;

        if (srp->smk_access == 0)
                return;

        seq_printf(s, "%s %s",
                   srp->smk_subject->smk_known,
                   srp->smk_object->smk_known);

        seq_putc(s, ' ');

        for (idx = 0; idx < sizeof(modes) / sizeof(modes[0]); idx++) {
                if (srp->smk_access & modes[idx].bit)
                        seq_putc(s, modes[idx].letter);
        }

        seq_putc(s, '\n');
}

/*
 * Seq_file read operations for /smack/load
 */

/* ->start for the rule files: begin a walk over the known-label list */
static void *load2_seq_start(struct seq_file *s, loff_t *pos)
{
        struct list_head *labels = &smack_known_list;

        return smk_seq_start(s, pos, labels);
}

/* ->next for the rule files: step to the following known label */
static void *load2_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct list_head *labels = &smack_known_list;

        return smk_seq_next(s, v, pos, labels);
}

static int load_seq_show(struct seq_file *s, void *v)
{
        struct list_head *list = v;
        struct smack_rule *srp;
        struct smack_known *skp =
                list_entry_rcu(list, struct smack_known, list);

        list_for_each_entry_rcu(srp, &skp->smk_rules, list)
                smk_rule_show(s, srp, SMK_LABELLEN);

        return 0;
}

/* seq_file iterator handed to seq_open() by smk_open_load() */
static const struct seq_operations load_seq_ops = {
        .start = load2_seq_start,
        .next  = load2_seq_next,
        .show  = load_seq_show,
        .stop  = smk_seq_stop,
};

/**
 * smk_open_load - open() for /smack/load
 * @inode: inode structure representing file
 * @file: "load" file pointer
 *
 * Attaches the load_seq_* iterator to @file so that reads go through
 * the seq_file machinery.
 *
 * Returns whatever seq_open() returns.
 */
static int smk_open_load(struct inode *inode, struct file *file)
{
        const struct seq_operations *ops = &load_seq_ops;

        return seq_open(file, ops);
}

/**
 * smk_write_load - write() for /smack/load
 * @file: file pointer, not actually used
 * @buf: where to get the data from
 * @count: bytes sent
 * @ppos: where to start - must be 0
 *
 * Returns -EPERM when the caller lacks CAP_MAC_ADMIN, otherwise the result
 * of smk_write_rules_list() for the fixed 24-byte format.
 */
static ssize_t smk_write_load(struct file *file, const char __user *buf,
                              size_t count, loff_t *ppos)
{
        /*
         * Only the privilege check happens here; the offset and length
         * checks are done by smk_write_rules_list().
         */
        if (smack_privileged(CAP_MAC_ADMIN))
                return smk_write_rules_list(file, buf, count, ppos,
                                            NULL, NULL, SMK_FIXED24_FMT);

        return -EPERM;
}

/*
 * File operations built around smk_open_load()/smk_write_load();
 * presumably registered for /smack/load elsewhere in this file.
 */
static const struct file_operations smk_load_ops = {
        .open           = smk_open_load,
        .read                = seq_read,
        .llseek         = seq_lseek,
        .write                = smk_write_load,
        .release        = seq_release,
};

/**
 * smk_cipso_doi - initialize the CIPSO domain
 *
 * Removes the PF_INET mapping for the NULL domain, then registers a
 * pass-through CIPSO DOI definition carrying smk_cipso_doi_value and maps
 * it. Every failure is only logged; nothing is reported to the caller.
 */
static void smk_cipso_doi(void)
{
        struct netlbl_audit nai;
        struct cipso_v4_doi *doip;
        int tag;
        int rc;

        smk_netlabel_audit_set(&nai);

        rc = netlbl_cfg_map_del(NULL, PF_INET, NULL, NULL, &nai);
        if (rc)
                printk(KERN_WARNING "%s:%d remove rc = %d\n",
                       __func__, __LINE__, rc);

        /* Requested with __GFP_NOFAIL, hence no NULL check on the result */
        doip = kmalloc(sizeof(*doip), GFP_KERNEL | __GFP_NOFAIL);
        doip->map.std = NULL;
        doip->doi = smk_cipso_doi_value;
        doip->type = CIPSO_V4_MAP_PASS;

        /* First tag slot is the restricted bitmap, the rest are unused */
        doip->tags[0] = CIPSO_V4_TAG_RBITMAP;
        for (tag = 1; tag < CIPSO_V4_TAG_MAXCNT; tag++)
                doip->tags[tag] = CIPSO_V4_TAG_INVALID;

        rc = netlbl_cfg_cipsov4_add(doip, &nai);
        if (rc) {
                printk(KERN_WARNING "%s:%d cipso add rc = %d\n",
                       __func__, __LINE__, rc);
                kfree(doip);
                return;
        }

        /*
         * From here on doip is not freed by this function; on a mapping
         * failure the DOI is deleted through netlabel instead.
         */
        rc = netlbl_cfg_cipsov4_map_add(doip->doi, NULL, NULL, NULL, &nai);
        if (rc) {
                printk(KERN_WARNING "%s:%d map add rc = %d\n",
                       __func__, __LINE__, rc);
                netlbl_cfg_cipsov4_del(doip->doi, &nai);
        }
}

/**
 * smk_unlbl_ambient - initialize the unlabeled domain
 * @oldambient: previous domain string, or NULL if there is none to remove
 *
 * Removes the PF_INET mapping for @oldambient (when given), falls back to
 * the floor label if no ambient label is set, and adds an unlabeled
 * mapping for the ambient label. Failures are only logged.
 */
static void smk_unlbl_ambient(char *oldambient)
{
        struct netlbl_audit nai;
        int rc;

        smk_netlabel_audit_set(&nai);

        if (oldambient) {
                rc = netlbl_cfg_map_del(oldambient, PF_INET, NULL, NULL, &nai);
                if (rc)
                        printk(KERN_WARNING "%s:%d remove rc = %d\n",
                               __func__, __LINE__, rc);
        }

        if (!smack_net_ambient)
                smack_net_ambient = &smack_known_floor;

        rc = netlbl_cfg_unlbl_map_add(smack_net_ambient->smk_known, PF_INET,
                                      NULL, NULL, &nai);
        if (rc)
                printk(KERN_WARNING "%s:%d add rc = %d\n",
                       __func__, __LINE__, rc);
}

/*
 * Seq_file read operations for /smack/cipso
 */

/* ->start for the cipso files: begin a walk over the known-label list */
static void *cipso_seq_start(struct seq_file *s, loff_t *pos)
{
        struct list_head *labels = &smack_known_list;

        return smk_seq_start(s, pos, labels);
}

/* ->next for the cipso files: step to the following known label */
static void *cipso_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct list_head *labels = &smack_known_list;

        return smk_seq_next(s, v, pos, labels);
}

/*
 * ->show for /smack/cipso. Prints one label per line in the format:
 * label level[/cat[,cat]]
 * Labels of SMK_LABELLEN characters or more produce no output.
 */
static int cipso_seq_show(struct seq_file *s, void *v)
{
        struct list_head *node = v;
        struct smack_known *label =
                list_entry_rcu(node, struct smack_known, list);
        struct netlbl_lsm_catmap *cats = label->smk_netlabel.attr.mls.cat;
        char delim = '/';
        int cat;

        /*
         * Anything read from /smack/cipso ought to be writeable to
         * /smack/cipso, so a label that could not have been set through
         * it is not shown; /smack/cipso2 should be used for those.
         */
        if (strlen(label->smk_known) >= SMK_LABELLEN)
                return 0;

        seq_printf(s, "%s %3d", label->smk_known,
                   label->smk_netlabel.attr.mls.lvl);

        /* The first category is introduced by '/', later ones by ',' */
        cat = netlbl_catmap_walk(cats, 0);
        while (cat >= 0) {
                seq_printf(s, "%c%d", delim, cat);
                delim = ',';
                cat = netlbl_catmap_walk(cats, cat + 1);
        }

        seq_putc(s, '\n');

        return 0;
}

/* seq_file iterator handed to seq_open() by smk_open_cipso() */
static const struct seq_operations cipso_seq_ops = {
        .start = cipso_seq_start,
        .next  = cipso_seq_next,
        .show  = cipso_seq_show,
        .stop  = smk_seq_stop,
};

/**
 * smk_open_cipso - open() for /smack/cipso
 * @inode: inode structure representing file
 * @file: "cipso" file pointer
 *
 * Attaches the cipso_seq_* iterator to @file.
 *
 * Returns whatever seq_open() returns.
 */
static int smk_open_cipso(struct inode *inode, struct file *file)
{
        const struct seq_operations *ops = &cipso_seq_ops;

        return seq_open(file, ops);
}

/**
 * smk_set_cipso - do the work for write() for cipso and cipso2
 * @file: file pointer, not actually used
 * @buf: where to get the data from
 * @count: bytes sent
 * @ppos: where to start - must be 0
 * @format: /smack/cipso (SMK_FIXED24_FMT) or /smack/cipso2 (SMK_LONG_FMT)
 *
 * The input is a label followed by a level, a category count and that many
 * category numbers; after the label, each numeric field is read at a fixed
 * stride of SMK_DIGITLEN bytes.
 *
 * Accepts only one cipso rule per write call.
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_set_cipso(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos, int format)
{
        struct netlbl_lsm_catmap *old_cat, *new_cat = NULL;
        struct smack_known *skp;
        struct netlbl_lsm_secattr ncats;
        char mapcatset[SMK_CIPSOLEN];
        int maplevel;
        unsigned int cat;
        int catlen;
        /* Parse failures that jump to "out" without setting rc give -EINVAL */
        ssize_t rc = -EINVAL;
        char *data = NULL;
        char *rule;
        int ret;
        int i;

        /*
         * Must have privilege.
         * No partial writes.
         * Enough data must be present.
         */
        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;
        if (*ppos != 0)
                return -EINVAL;
        if (format == SMK_FIXED24_FMT &&
            (count < SMK_CIPSOMIN || count > SMK_CIPSOMAX))
                return -EINVAL;
        if (count > PAGE_SIZE)
                return -EINVAL;

        data = memdup_user_nul(buf, count);
        if (IS_ERR(data))
                return PTR_ERR(data);

        rule = data;
        /*
         * Only allow one writer at a time. Writes should be
         * quite rare and small in any case.
         */
        mutex_lock(&smack_cipso_lock);

        skp = smk_import_entry(rule, 0);
        if (IS_ERR(skp)) {
                rc = PTR_ERR(skp);
                goto out;
        }

        /*
         * Step over the label: a fixed-width field in the old format,
         * the imported label's length plus one separator otherwise.
         */
        if (format == SMK_FIXED24_FMT)
                rule += SMK_LABELLEN;
        else
                rule += strlen(skp->smk_known) + 1;

        /* data + count is the terminating NUL, so landing on it is allowed */
        if (rule > data + count) {
                rc = -EOVERFLOW;
                goto out;
        }

        ret = sscanf(rule, "%d", &maplevel);
        if (ret != 1 || maplevel < 0 || maplevel > SMACK_CIPSO_MAXLEVEL)
                goto out;

        rule += SMK_DIGITLEN;
        if (rule > data + count) {
                rc = -EOVERFLOW;
                goto out;
        }

        ret = sscanf(rule, "%d", &catlen);
        if (ret != 1 || catlen < 0 || catlen > SMACK_CIPSO_MAXCATNUM)
                goto out;

        /* The fixed format must be exactly as long as its fields require */
        if (format == SMK_FIXED24_FMT &&
            count != (SMK_CIPSOMIN + catlen * SMK_DIGITLEN))
                goto out;

        memset(mapcatset, 0, sizeof(mapcatset));

        for (i = 0; i < catlen; i++) {
                rule += SMK_DIGITLEN;
                if (rule > data + count) {
                        rc = -EOVERFLOW;
                        goto out;
                }
                ret = sscanf(rule, "%u", &cat);
                if (ret != 1 || cat > SMACK_CIPSO_MAXCATNUM)
                        goto out;

                smack_catset_bit(cat, mapcatset);
        }
        ncats.flags = 0;
        if (catlen == 0) {
                /*
                 * No categories given: install a freshly allocated catmap
                 * rather than going through smk_netlbl_mls(). If the
                 * allocation fails the category pointer is left NULL and
                 * the write still proceeds.
                 */
                ncats.attr.mls.cat = NULL;
                ncats.attr.mls.lvl = maplevel;
                new_cat = netlbl_catmap_alloc(GFP_ATOMIC);
                if (new_cat)
                        new_cat->next = ncats.attr.mls.cat;
                ncats.attr.mls.cat = new_cat;
                /*
                 * NOTE(review): bit 3 is cleared by its literal value;
                 * presumably it corresponds to one of the NETLBL_SECATTR_*
                 * flags - confirm and consider the named constant.
                 */
                skp->smk_netlabel.flags &= ~(1U << 3);
                rc = 0;
        } else {
                rc = smk_netlbl_mls(maplevel, mapcatset, &ncats, SMK_CIPSOLEN);
        }
        if (rc >= 0) {
                /*
                 * Swap in the new categories and level, then wait for a
                 * grace period before freeing the old catmap: the seq_file
                 * show routines in this file read it between
                 * rcu_read_lock() and rcu_read_unlock().
                 * NOTE(review): the pointer is published with a plain
                 * store - confirm whether rcu_assign_pointer() is wanted.
                 */
                old_cat = skp->smk_netlabel.attr.mls.cat;
                skp->smk_netlabel.attr.mls.cat = ncats.attr.mls.cat;
                skp->smk_netlabel.attr.mls.lvl = ncats.attr.mls.lvl;
                synchronize_rcu();
                netlbl_catmap_free(old_cat);
                rc = count;
                /*
                 * This mapping may have been cached, so clear the cache.
                 */
                netlbl_cache_invalidate();
        }

out:
        mutex_unlock(&smack_cipso_lock);
        kfree(data);
        return rc;
}

/**
 * smk_write_cipso - write() for /smack/cipso
 * @file: file pointer, not actually used
 * @buf: where to get the data from
 * @count: bytes sent
 * @ppos: where to start
 *
 * Thin front end that hands the write to smk_set_cipso() using the
 * fixed 24-byte label format.
 *
 * Accepts only one cipso rule per write call.
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_write_cipso(struct file *file, const char __user *buf,
                               size_t count, loff_t *ppos)
{
        const int format = SMK_FIXED24_FMT;

        return smk_set_cipso(file, buf, count, ppos, format);
}

/*
 * File operations built around smk_open_cipso()/smk_write_cipso();
 * presumably registered for /smack/cipso elsewhere in this file.
 */
static const struct file_operations smk_cipso_ops = {
        .open           = smk_open_cipso,
        .read                = seq_read,
        .llseek         = seq_lseek,
        .write                = smk_write_cipso,
        .release        = seq_release,
};

/*
 * Seq_file read operations for /smack/cipso2
 */

/*
 * ->show for /smack/cipso2. Prints one label per line in the format:
 * label level[/cat[,cat]]
 * Unlike the /smack/cipso variant, no label is filtered out by length.
 */
static int cipso2_seq_show(struct seq_file *s, void *v)
{
        struct list_head *node = v;
        struct smack_known *label =
                list_entry_rcu(node, struct smack_known, list);
        struct netlbl_lsm_catmap *cats = label->smk_netlabel.attr.mls.cat;
        char delim = '/';
        int cat;

        seq_printf(s, "%s %3d", label->smk_known,
                   label->smk_netlabel.attr.mls.lvl);

        /* The first category is introduced by '/', later ones by ',' */
        cat = netlbl_catmap_walk(cats, 0);
        while (cat >= 0) {
                seq_printf(s, "%c%d", delim, cat);
                delim = ',';
                cat = netlbl_catmap_walk(cats, cat + 1);
        }

        seq_putc(s, '\n');

        return 0;
}

/*
 * seq_file iterator handed to seq_open() by smk_open_cipso2(); it shares
 * the cipso start/next helpers and differs only in its show routine.
 */
static const struct seq_operations cipso2_seq_ops = {
        .start = cipso_seq_start,
        .next  = cipso_seq_next,
        .show  = cipso2_seq_show,
        .stop  = smk_seq_stop,
};

/**
 * smk_open_cipso2 - open() for /smack/cipso2
 * @inode: inode structure representing file
 * @file: "cipso2" file pointer
 *
 * Attaches the cipso2 seq_file iterator to @file.
 *
 * Returns whatever seq_open() returns.
 */
static int smk_open_cipso2(struct inode *inode, struct file *file)
{
        const struct seq_operations *ops = &cipso2_seq_ops;

        return seq_open(file, ops);
}

/**
 * smk_write_cipso2 - write() for /smack/cipso2
 * @file: file pointer, not actually used
 * @buf: where to get the data from
 * @count: bytes sent
 * @ppos: where to start
 *
 * Thin front end that hands the write to smk_set_cipso() using the
 * long label format.
 *
 * Accepts only one cipso rule per write call.
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_write_cipso2(struct file *file, const char __user *buf,
                              size_t count, loff_t *ppos)
{
        const int format = SMK_LONG_FMT;

        return smk_set_cipso(file, buf, count, ppos, format);
}

/*
 * File operations built around smk_open_cipso2()/smk_write_cipso2();
 * presumably registered for /smack/cipso2 elsewhere in this file.
 */
static const struct file_operations smk_cipso2_ops = {
        .open           = smk_open_cipso2,
        .read                = seq_read,
        .llseek         = seq_lseek,
        .write                = smk_write_cipso2,
        .release        = seq_release,
};

/*
 * Seq_file read operations for /smack/netlabel
 */

/* ->start for /smack/netlabel: begin a walk over the IPv4 host list */
static void *net4addr_seq_start(struct seq_file *s, loff_t *pos)
{
        struct list_head *hosts = &smk_net4addr_list;

        return smk_seq_start(s, pos, hosts);
}

/* ->next for /smack/netlabel: step to the following IPv4 host entry */
static void *net4addr_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct list_head *hosts = &smk_net4addr_list;

        return smk_seq_next(s, v, pos, hosts);
}

/*
 * ->show for /smack/netlabel: print one "address/masklen label" line.
 * An entry without a label is shown with the CIPSO option string in the
 * label position.
 */
static int net4addr_seq_show(struct seq_file *s, void *v)
{
        struct list_head *node = v;
        struct smk_net4addr *entry =
                        list_entry_rcu(node, struct smk_net4addr, list);
        char *name;

        if (entry->smk_label == NULL)
                name = SMACK_CIPSO_OPTION;
        else
                name = entry->smk_label->smk_known;

        seq_printf(s, "%pI4/%d %s\n", &entry->smk_host.s_addr,
                        entry->smk_masks, name);

        return 0;
}

/* seq_file iterator handed to seq_open() by smk_open_net4addr() */
static const struct seq_operations net4addr_seq_ops = {
        .start = net4addr_seq_start,
        .next  = net4addr_seq_next,
        .show  = net4addr_seq_show,
        .stop  = smk_seq_stop,
};

/**
 * smk_open_net4addr - open() for /smack/netlabel
 * @inode: inode structure representing file
 * @file: "netlabel" file pointer
 *
 * Attaches the net4addr_seq_* iterator to @file.
 *
 * Returns whatever seq_open() returns.
 */
static int smk_open_net4addr(struct inode *inode, struct file *file)
{
        const struct seq_operations *ops = &net4addr_seq_ops;

        return seq_open(file, ops);
}

/**
 * smk_net4addr_insert
 * @new : netlabel to insert
 *
 * This helper insert netlabel in the smack_net4addrs list
 * sorted by netmask length (longest to smallest)
 * locked by &smk_net4addr_lock in smk_write_net4addr
 *
 */
static void smk_net4addr_insert(struct smk_net4addr *new)
{
        struct smk_net4addr *m;
        struct smk_net4addr *m_next;

        if (list_empty(&smk_net4addr_list)) {
                list_add_rcu(&new->list, &smk_net4addr_list);
                return;
        }

        m = list_entry_rcu(smk_net4addr_list.next,
                           struct smk_net4addr, list);

        /* the comparison '>' is a bit hacky, but works */
        if (new->smk_masks > m->smk_masks) {
                list_add_rcu(&new->list, &smk_net4addr_list);
                return;
        }

        list_for_each_entry_rcu(m, &smk_net4addr_list, list) {
                if (list_is_last(&m->list, &smk_net4addr_list)) {
                        list_add_rcu(&new->list, &m->list);
                        return;
                }
                m_next = list_entry_rcu(m->list.next,
                                        struct smk_net4addr, list);
                if (new->smk_masks > m_next->smk_masks) {
                        list_add_rcu(&new->list, &m->list);
                        return;
                }
        }
}


/**
 * smk_write_net4addr - write() for /smack/netlabel
 * @file: file pointer, not actually used
 * @buf: where to get the data from
 * @count: bytes sent
 * @ppos: where to start - must be 0
 *
 * The input is one of
 *        "<addr/mask, as a.b.c.d/e><space><label>"
 *        "<addr, as a.b.c.d><space><label>"
 * where a missing mask means 32. A label starting with '-' is an option
 * rather than a label; only SMACK_CIPSO_OPTION is accepted here.
 *
 * Accepts only one net4addr per write call.
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_write_net4addr(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        struct smk_net4addr *snp;
        struct sockaddr_in newname;
        char *smack;
        struct smack_known *skp = NULL;
        char *data;
        /* The four octets are scanned straight into the address bytes */
        char *host = (char *)&newname.sin_addr.s_addr;
        int rc;
        struct netlbl_audit audit_info;
        struct in_addr mask;
        unsigned int m;
        unsigned int masks;
        int found;
        /* Unsigned constant: shifting a signed 1 into bit 31 is undefined */
        u32 mask_bits = 1U << 31;
        __be32 nsa;
        u32 temp_mask;

        /*
         * Must have privilege.
         * No partial writes.
         * Enough data must be present.
         * "<addr/mask, as a.b.c.d/e><space><label>"
         * "<addr, as a.b.c.d><space><label>"
         */
        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;
        if (*ppos != 0)
                return -EINVAL;
        if (count < SMK_NETLBLADDRMIN || count > PAGE_SIZE - 1)
                return -EINVAL;

        data = memdup_user_nul(buf, count);
        if (IS_ERR(data))
                return PTR_ERR(data);

        /* As large as the whole input, so the "%s" below cannot overrun it */
        smack = kzalloc(count + 1, GFP_KERNEL);
        if (smack == NULL) {
                rc = -ENOMEM;
                goto free_data_out;
        }

        /* Try the form with an explicit mask first, then the bare address */
        rc = sscanf(data, "%hhd.%hhd.%hhd.%hhd/%u %s",
                &host[0], &host[1], &host[2], &host[3], &masks, smack);
        if (rc != 6) {
                rc = sscanf(data, "%hhd.%hhd.%hhd.%hhd %s",
                        &host[0], &host[1], &host[2], &host[3], smack);
                if (rc != 5) {
                        rc = -EINVAL;
                        goto free_out;
                }
                masks = 32;
        }
        if (masks > BEBITS) {
                rc = -EINVAL;
                goto free_out;
        }

        /*
         * If smack begins with '-', it is an option, don't import it
         */
        if (smack[0] != '-') {
                skp = smk_import_entry(smack, 0);
                if (IS_ERR(skp)) {
                        rc = PTR_ERR(skp);
                        goto free_out;
                }
        } else {
                /*
                 * Only the -CIPSO option is supported for IPv4
                 */
                if (strcmp(smack, SMACK_CIPSO_OPTION) != 0) {
                        rc = -EINVAL;
                        goto free_out;
                }
        }

        /* Build a netmask with the top "masks" bits set, in host order */
        for (m = masks, temp_mask = 0; m > 0; m--) {
                temp_mask |= mask_bits;
                mask_bits >>= 1;
        }
        mask.s_addr = cpu_to_be32(temp_mask);

        /* Keep only the network part of the address */
        newname.sin_addr.s_addr &= mask.s_addr;
        /*
         * Only allow one writer at a time. Writes should be
         * quite rare and small in any case.
         */
        mutex_lock(&smk_net4addr_lock);

        nsa = newname.sin_addr.s_addr;
        /* try to find if the prefix is already in the list */
        found = 0;
        list_for_each_entry_rcu(snp, &smk_net4addr_list, list) {
                if (snp->smk_host.s_addr == nsa && snp->smk_masks == masks) {
                        found = 1;
                        break;
                }
        }
        smk_netlabel_audit_set(&audit_info);

        if (found == 0) {
                snp = kzalloc(sizeof(*snp), GFP_KERNEL);
                if (snp == NULL)
                        rc = -ENOMEM;
                else {
                        rc = 0;
                        snp->smk_host.s_addr = newname.sin_addr.s_addr;
                        snp->smk_mask.s_addr = mask.s_addr;
                        snp->smk_label = skp;
                        snp->smk_masks = masks;
                        smk_net4addr_insert(snp);
                }
        } else {
                /*
                 * Delete the unlabeled entry, only if the previous label
                 * wasn't the special CIPSO option
                 */
                if (snp->smk_label != NULL)
                        rc = netlbl_cfg_unlbl_static_del(&init_net, NULL,
                                        &snp->smk_host, &snp->smk_mask,
                                        PF_INET, &audit_info);
                else
                        rc = 0;
                snp->smk_label = skp;
        }

        /*
         * Now tell netlabel about the single label nature of
         * this host so that incoming packets get labeled.
         * but only if we didn't get the special CIPSO option
         */
        if (rc == 0 && skp != NULL)
                rc = netlbl_cfg_unlbl_static_add(&init_net, NULL,
                        &snp->smk_host, &snp->smk_mask, PF_INET,
                        snp->smk_label->smk_secid, &audit_info);

        if (rc == 0)
                rc = count;

        mutex_unlock(&smk_net4addr_lock);

free_out:
        kfree(smack);
free_data_out:
        kfree(data);

        return rc;
}

/*
 * File operations built around smk_open_net4addr()/smk_write_net4addr();
 * presumably registered for /smack/netlabel elsewhere in this file.
 */
static const struct file_operations smk_net4addr_ops = {
        .open           = smk_open_net4addr,
        .read                = seq_read,
        .llseek         = seq_lseek,
        .write                = smk_write_net4addr,
        .release        = seq_release,
};

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Seq_file read operations for /smack/netlabel6
 */

/* ->start for /smack/netlabel6: begin a walk over the IPv6 host list */
static void *net6addr_seq_start(struct seq_file *s, loff_t *pos)
{
        struct list_head *hosts = &smk_net6addr_list;

        return smk_seq_start(s, pos, hosts);
}

/* ->next for /smack/netlabel6: step to the following IPv6 host entry */
static void *net6addr_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct list_head *hosts = &smk_net6addr_list;

        return smk_seq_next(s, v, pos, hosts);
}

/*
 * ->show for /smack/netlabel6: print one "address/masklen label" line.
 * Entries whose label is NULL produce no output.
 */
static int net6addr_seq_show(struct seq_file *s, void *v)
{
        struct list_head *node = v;
        struct smk_net6addr *entry =
                         list_entry(node, struct smk_net6addr, list);

        if (entry->smk_label == NULL)
                return 0;

        seq_printf(s, "%pI6/%d %s\n", &entry->smk_host, entry->smk_masks,
                        entry->smk_label->smk_known);

        return 0;
}

/* seq_file iterator handed to seq_open() by smk_open_net6addr() */
static const struct seq_operations net6addr_seq_ops = {
        .start = net6addr_seq_start,
        .next  = net6addr_seq_next,
        .show  = net6addr_seq_show,
        .stop  = smk_seq_stop,
};

/**
 * smk_open_net6addr - open() for /smack/netlabel6
 * @inode: inode structure representing file
 * @file: "netlabel6" file pointer
 *
 * Attaches the net6addr_seq_* iterator to @file.
 *
 * Returns whatever seq_open() returns.
 */
static int smk_open_net6addr(struct inode *inode, struct file *file)
{
        const struct seq_operations *ops = &net6addr_seq_ops;

        return seq_open(file, ops);
}

/**
 * smk_net6addr_insert
 * @new : entry to insert
 *
 * This inserts an entry in the smack_net6addrs list
 * sorted by netmask length (longest to smallest)
 * locked by &smk_net6addr_lock in smk_write_net6addr
 *
 */
static void smk_net6addr_insert(struct smk_net6addr *new)
{
        struct smk_net6addr *m_next;
        struct smk_net6addr *m;

        if (list_empty(&smk_net6addr_list)) {
                list_add_rcu(&new->list, &smk_net6addr_list);
                return;
        }

        m = list_entry_rcu(smk_net6addr_list.next,
                           struct smk_net6addr, list);

        if (new->smk_masks > m->smk_masks) {
                list_add_rcu(&new->list, &smk_net6addr_list);
                return;
        }

        list_for_each_entry_rcu(m, &smk_net6addr_list, list) {
                if (list_is_last(&m->list, &smk_net6addr_list)) {
                        list_add_rcu(&new->list, &m->list);
                        return;
                }
                m_next = list_entry_rcu(m->list.next,
                                        struct smk_net6addr, list);
                if (new->smk_masks > m_next->smk_masks) {
                        list_add_rcu(&new->list, &m->list);
                        return;
                }
        }
}


/**
 * smk_write_net6addr - write() for /smack/netlabel6
 * @file: file pointer, not actually used
 * @buf: where to get the data from
 * @count: bytes sent
 * @ppos: where to start - must be 0
 *
 * The input is one of
 *        "<addr/mask, as a:b:c:d:e:f:g:h/e><space><label>"
 *        "<addr, as a:b:c:d:e:f:g:h><space><label>"
 * where all eight groups must be written out and a missing mask means 128.
 * A label starting with '-' is an option rather than a label; only
 * SMACK_DELETE_OPTION is accepted here, and it leaves the entry with a
 * NULL label.
 *
 * Accepts only one net6addr per write call.
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_write_net6addr(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        struct smk_net6addr *snp;
        struct in6_addr newname;
        struct in6_addr fullmask;
        struct smack_known *skp = NULL;
        char *smack;
        char *data;
        int rc = 0;
        int found = 0;
        int i;
        unsigned int scanned[8];
        unsigned int m;
        unsigned int mask = 128;

        /*
         * Must have privilege.
         * No partial writes.
         * Enough data must be present.
         * "<addr/mask, as a:b:c:d:e:f:g:h/e><space><label>"
         * "<addr, as a:b:c:d:e:f:g:h><space><label>"
         */
        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;
        if (*ppos != 0)
                return -EINVAL;
        if (count < SMK_NETLBLADDRMIN || count > PAGE_SIZE - 1)
                return -EINVAL;

        data = memdup_user_nul(buf, count);
        if (IS_ERR(data))
                return PTR_ERR(data);

        /* As large as the whole input, so the "%s" below cannot overrun it */
        smack = kzalloc(count + 1, GFP_KERNEL);
        if (smack == NULL) {
                rc = -ENOMEM;
                goto free_data_out;
        }

        /* Try the form with an explicit mask first, then the bare address */
        i = sscanf(data, "%x:%x:%x:%x:%x:%x:%x:%x/%u %s",
                        &scanned[0], &scanned[1], &scanned[2], &scanned[3],
                        &scanned[4], &scanned[5], &scanned[6], &scanned[7],
                        &mask, smack);
        if (i != 10) {
                i = sscanf(data, "%x:%x:%x:%x:%x:%x:%x:%x %s",
                                &scanned[0], &scanned[1], &scanned[2],
                                &scanned[3], &scanned[4], &scanned[5],
                                &scanned[6], &scanned[7], smack);
                if (i != 9) {
                        rc = -EINVAL;
                        goto free_out;
                }
        }
        if (mask > 128) {
                rc = -EINVAL;
                goto free_out;
        }
        for (i = 0; i < 8; i++) {
                if (scanned[i] > 0xffff) {
                        rc = -EINVAL;
                        goto free_out;
                }
                /* The address words are stored in network byte order */
                newname.s6_addr16[i] = htons(scanned[i]);
        }

        /*
         * If smack begins with '-', it is an option, don't import it
         */
        if (smack[0] != '-') {
                skp = smk_import_entry(smack, 0);
                if (IS_ERR(skp)) {
                        rc = PTR_ERR(skp);
                        goto free_out;
                }
        } else {
                /*
                 * Only -DELETE is supported for IPv6
                 */
                if (strcmp(smack, SMACK_DELETE_OPTION) != 0) {
                        rc = -EINVAL;
                        goto free_out;
                }
        }

        /* Expand the prefix length into a mask and apply it to the address */
        for (i = 0, m = mask; i < 8; i++) {
                if (m >= 16) {
                        fullmask.s6_addr16[i] = 0xffff;
                        m -= 16;
                } else if (m > 0) {
                        /*
                         * Partial word: the prefix covers the m most
                         * significant bits of this word, and the word is
                         * kept in network byte order like the address.
                         */
                        fullmask.s6_addr16[i] =
                                htons((0xffff << (16 - m)) & 0xffff);
                        m = 0;
                } else
                        fullmask.s6_addr16[i] = 0;
                newname.s6_addr16[i] &= fullmask.s6_addr16[i];
        }

        /*
         * Only allow one writer at a time. Writes should be
         * quite rare and small in any case.
         */
        mutex_lock(&smk_net6addr_lock);
        /*
         * Try to find the prefix in the list
         */
        list_for_each_entry_rcu(snp, &smk_net6addr_list, list) {
                if (mask != snp->smk_masks)
                        continue;
                for (found = 1, i = 0; i < 8; i++) {
                        if (newname.s6_addr16[i] !=
                            snp->smk_host.s6_addr16[i]) {
                                found = 0;
                                break;
                        }
                }
                if (found == 1)
                        break;
        }
        if (found == 0) {
                snp = kzalloc(sizeof(*snp), GFP_KERNEL);
                if (snp == NULL)
                        rc = -ENOMEM;
                else {
                        snp->smk_host = newname;
                        snp->smk_mask = fullmask;
                        snp->smk_masks = mask;
                        snp->smk_label = skp;
                        smk_net6addr_insert(snp);
                }
        } else {
                /* Existing prefix: just replace (or clear) its label */
                snp->smk_label = skp;
        }

        if (rc == 0)
                rc = count;

        mutex_unlock(&smk_net6addr_lock);

free_out:
        kfree(smack);
free_data_out:
        kfree(data);

        return rc;
}

/*
 * File operations for the IPv6 address interface: opened through
 * smk_open_net6addr(), read through the generic seq_file helpers and
 * written through smk_write_net6addr().
 */
static const struct file_operations smk_net6addr_ops = {
        .open           = smk_open_net6addr,
        .read                = seq_read,
        .llseek         = seq_lseek,
        .write                = smk_write_net6addr,
        .release        = seq_release,
};
#endif /* CONFIG_IPV6 */

/**
 * smk_read_doi - read() for /smack/doi
 * @filp: file pointer, not actually used
 * @buf: user buffer that receives the text
 * @count: size of the user buffer
 * @ppos: file position, advanced by the amount copied
 *
 * Reports smk_cipso_doi_value as a decimal string. A read that does not
 * start at offset zero reports end of file.
 *
 * Returns number of bytes read or error code, as appropriate
 */
static ssize_t smk_read_doi(struct file *filp, char __user *buf,
                            size_t count, loff_t *ppos)
{
        char text[80];
        int len;

        /* Single-shot file: anything past the start is end of file. */
        if (*ppos)
                return 0;

        len = sprintf(text, "%d", smk_cipso_doi_value);

        return simple_read_from_buffer(buf, count, ppos, text, len);
}

/**
 * smk_write_doi - write() for /smack/doi
 * @file: file pointer, not actually used
 * @buf: user buffer holding a decimal integer
 * @count: bytes sent
 * @ppos: where to start, not used
 *
 * Requires CAP_MAC_ADMIN. Stores the parsed integer in
 * smk_cipso_doi_value and then calls smk_cipso_doi().
 *
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_write_doi(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
{
        char input[80];
        int doi;

        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;

        /* Need at least one byte and room for the terminating NUL. */
        if (count == 0 || count >= sizeof(input))
                return -EINVAL;

        if (copy_from_user(input, buf, count))
                return -EFAULT;
        input[count] = '\0';

        if (sscanf(input, "%d", &doi) != 1)
                return -EINVAL;

        smk_cipso_doi_value = doi;
        smk_cipso_doi();

        return count;
}

/* File operations for /smack/doi: plain read/write handlers, default seek. */
static const struct file_operations smk_doi_ops = {
        .read                = smk_read_doi,
        .write                = smk_write_doi,
        .llseek                = default_llseek,
};

/**
 * smk_read_direct - read() for /smack/direct
 * @filp: file pointer, not actually used
 * @buf: user buffer that receives the text
 * @count: size of the user buffer
 * @ppos: file position, advanced by the amount copied
 *
 * Reports smack_cipso_direct as a decimal string. A read that does not
 * start at offset zero reports end of file.
 *
 * Returns number of bytes read or error code, as appropriate
 */
static ssize_t smk_read_direct(struct file *filp, char __user *buf,
                               size_t count, loff_t *ppos)
{
        char text[80];
        int len;

        /* Single-shot file: anything past the start is end of file. */
        if (*ppos)
                return 0;

        len = sprintf(text, "%d", smack_cipso_direct);

        return simple_read_from_buffer(buf, count, ppos, text, len);
}

/**
 * smk_write_direct - write() for /smack/direct
 * @file: file pointer, not actually used
 * @buf: user buffer holding a decimal integer
 * @count: bytes sent
 * @ppos: where to start, not used
 *
 * Requires CAP_MAC_ADMIN. When the parsed value differs from
 * smack_cipso_direct, every entry on smack_known_list whose MLS level
 * equals the old value is moved to the new one, and smack_cipso_direct
 * is updated, all under smack_known_lock.
 *
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_write_direct(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        struct smack_known *skp;
        char input[80];
        int level;

        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;

        /* Need at least one byte and room for the terminating NUL. */
        if (count == 0 || count >= sizeof(input))
                return -EINVAL;

        if (copy_from_user(input, buf, count))
                return -EFAULT;
        input[count] = '\0';

        if (sscanf(input, "%d", &level) != 1)
                return -EINVAL;

        /* Nothing to do when the value is unchanged. */
        if (level == smack_cipso_direct)
                return count;

        /*
         * Re-level the entries that were set up to be direct when
         * they were created, then record the new value.
         */
        mutex_lock(&smack_known_lock);
        list_for_each_entry_rcu(skp, &smack_known_list, list) {
                if (skp->smk_netlabel.attr.mls.lvl != smack_cipso_direct)
                        continue;
                skp->smk_netlabel.attr.mls.lvl = level;
        }
        smack_cipso_direct = level;
        mutex_unlock(&smack_known_lock);

        return count;
}

/* File operations for /smack/direct: plain read/write handlers, default seek. */
static const struct file_operations smk_direct_ops = {
        .read                = smk_read_direct,
        .write                = smk_write_direct,
        .llseek                = default_llseek,
};

/**
 * smk_read_mapped - read() for /smack/mapped
 * @filp: file pointer, not actually used
 * @buf: user buffer that receives the text
 * @count: size of the user buffer
 * @ppos: file position, advanced by the amount copied
 *
 * Reports smack_cipso_mapped as a decimal string. A read that does not
 * start at offset zero reports end of file.
 *
 * Returns number of bytes read or error code, as appropriate
 */
static ssize_t smk_read_mapped(struct file *filp, char __user *buf,
                               size_t count, loff_t *ppos)
{
        char text[80];
        int len;

        /* Single-shot file: anything past the start is end of file. */
        if (*ppos)
                return 0;

        len = sprintf(text, "%d", smack_cipso_mapped);

        return simple_read_from_buffer(buf, count, ppos, text, len);
}

/**
 * smk_write_mapped - write() for /smack/mapped
 * @file: file pointer, not actually used
 * @buf: user buffer holding a decimal integer
 * @count: bytes sent
 * @ppos: where to start, not used
 *
 * Requires CAP_MAC_ADMIN. When the parsed value differs from
 * smack_cipso_mapped, every entry on smack_known_list whose MLS level
 * equals the old value is moved to the new one, and smack_cipso_mapped
 * is updated, all under smack_known_lock.
 *
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_write_mapped(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        struct smack_known *skp;
        char input[80];
        int level;

        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;

        /* Need at least one byte and room for the terminating NUL. */
        if (count == 0 || count >= sizeof(input))
                return -EINVAL;

        if (copy_from_user(input, buf, count))
                return -EFAULT;
        input[count] = '\0';

        if (sscanf(input, "%d", &level) != 1)
                return -EINVAL;

        /* Nothing to do when the value is unchanged. */
        if (level == smack_cipso_mapped)
                return count;

        /*
         * Re-level the entries that were set up to be mapped when
         * they were created, then record the new value.
         */
        mutex_lock(&smack_known_lock);
        list_for_each_entry_rcu(skp, &smack_known_list, list) {
                if (skp->smk_netlabel.attr.mls.lvl != smack_cipso_mapped)
                        continue;
                skp->smk_netlabel.attr.mls.lvl = level;
        }
        smack_cipso_mapped = level;
        mutex_unlock(&smack_known_lock);

        return count;
}

/* File operations for /smack/mapped: plain read/write handlers, default seek. */
static const struct file_operations smk_mapped_ops = {
        .read                = smk_read_mapped,
        .write                = smk_write_mapped,
        .llseek                = default_llseek,
};

/**
 * smk_read_ambient - read() for /smack/ambient
 * @filp: file pointer, not actually used
 * @buf: user buffer that receives the label
 * @cn: size of the user buffer
 * @ppos: file position, advanced by the amount copied
 *
 * Copies the name of smack_net_ambient, including its terminating NUL,
 * to user space. The user buffer must be able to hold the whole label;
 * otherwise -EINVAL is returned. A read that does not start at offset
 * zero reports end of file.
 *
 * Returns number of bytes read or error code, as appropriate
 */
static ssize_t smk_read_ambient(struct file *filp, char __user *buf,
                                size_t cn, loff_t *ppos)
{
        const char *label;
        ssize_t rc = -EINVAL;
        int asize;

        if (*ppos != 0)
                return 0;

        /*
         * Hold the lock across the length computation and the copy so
         * that smack_net_ambient cannot be switched in between.
         */
        mutex_lock(&smack_ambient_lock);

        label = smack_net_ambient->smk_known;
        asize = strlen(label) + 1;
        if (cn >= asize)
                rc = simple_read_from_buffer(buf, cn, ppos, label, asize);

        mutex_unlock(&smack_ambient_lock);

        return rc;
}

/**
 * smk_write_ambient - write() for /smack/ambient
 * @file: file pointer, not actually used
 * @buf: user buffer holding the new label
 * @count: bytes sent
 * @ppos: where to start, not used
 *
 * Requires CAP_MAC_ADMIN. Imports the label with smk_import_entry(),
 * makes it the new smack_net_ambient under smack_ambient_lock and hands
 * the previous label name to smk_unlbl_ambient().
 *
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_write_ambient(struct file *file, const char __user *buf,
                                 size_t count, loff_t *ppos)
{
        struct smack_known *skp;
        char *previous;
        char *data;
        int rc = count;

        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;

        /* Reject empty writes and anything larger than a page. */
        if (count == 0 || count > PAGE_SIZE)
                return -EINVAL;

        data = memdup_user_nul(buf, count);
        if (IS_ERR(data))
                return PTR_ERR(data);

        skp = smk_import_entry(data, count);
        if (!IS_ERR(skp)) {
                mutex_lock(&smack_ambient_lock);

                previous = smack_net_ambient->smk_known;
                smack_net_ambient = skp;
                smk_unlbl_ambient(previous);

                mutex_unlock(&smack_ambient_lock);
        } else {
                rc = PTR_ERR(skp);
        }

        kfree(data);
        return rc;
}

/* File operations for /smack/ambient: plain read/write handlers, default seek. */
static const struct file_operations smk_ambient_ops = {
        .read                = smk_read_ambient,
        .write                = smk_write_ambient,
        .llseek                = default_llseek,
};

/*
 * Seq_file operations for /smack/onlycap
 */
/* seq_file .start: delegate to smk_seq_start() over smack_onlycap_list. */
static void *onlycap_seq_start(struct seq_file *s, loff_t *pos)
{
        return smk_seq_start(s, pos, &smack_onlycap_list);
}

/* seq_file .next: delegate to smk_seq_next() over smack_onlycap_list. */
static void *onlycap_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        return smk_seq_next(s, v, pos, &smack_onlycap_list);
}

/*
 * seq_file .show: @v is the list node of a smack_known_list_elem; print
 * the name of the label it refers to, followed by a single space.
 */
static int onlycap_seq_show(struct seq_file *s, void *v)
{
        struct list_head *node = v;
        struct smack_known_list_elem *elem;

        elem = list_entry_rcu(node, struct smack_known_list_elem, list);

        seq_puts(s, elem->smk_label->smk_known);
        seq_putc(s, ' ');

        return 0;
}

/* seq_file iterator for /smack/onlycap; .stop is the shared smk_seq_stop(). */
static const struct seq_operations onlycap_seq_ops = {
        .start = onlycap_seq_start,
        .next  = onlycap_seq_next,
        .show  = onlycap_seq_show,
        .stop  = smk_seq_stop,
};

/*
 * open() for /smack/onlycap: attach the onlycap_seq_* iterator to @file.
 * Returns whatever seq_open() returns.
 */
static int smk_open_onlycap(struct inode *inode, struct file *file)
{
        return seq_open(file, &onlycap_seq_ops);
}

/**
 * smk_list_swap_rcu - swap public list with a private one in RCU-safe way
 * @public: public list
 * @private: private list
 *
 * The caller must hold appropriate mutex to prevent concurrent modifications
 * to the public list.
 * Private list is assumed to be not accessible to other threads yet.
 *
 * On return @public holds the entries that were on @private and @private
 * holds the entries that were on @public. Either list may be empty.
 */
static void smk_list_swap_rcu(struct list_head *public,
                              struct list_head *private)
{
        struct list_head *first, *last;

        if (list_empty(public)) {
                list_splice_init_rcu(private, public, synchronize_rcu);
                return;
        }

        /* Remember public list before replacing it */
        first = public->next;
        last = public->prev;

        if (list_empty(private)) {
                /*
                 * Nothing to publish: just empty the public list.
                 * The generic pointer shuffle below would leave
                 * public->prev pointing at the private list head
                 * here, because private->prev is private itself.
                 */
                rcu_assign_pointer(public->next, public);
                public->prev = public;
        } else {
                /* Publish private list in place of public in RCU-safe way */
                private->prev->next = public;
                private->next->prev = public;
                rcu_assign_pointer(public->next, private->next);
                public->prev = private->prev;
        }

        /*
         * Readers still walking the old entries terminate normally:
         * last->next keeps pointing at public until they are done.
         */
        synchronize_rcu();

        /*
         * When all readers are done with the old public list,
         * attach it in place of private
         */
        private->next = first;
        private->prev = last;
        first->prev = private;
        last->next = private;
}

/**
 * smk_parse_label_list - parse list of Smack labels, separated by spaces
 *
 * @data: the string to parse, modified in place by strsep()
 * @list: destination list
 *
 * Each non-empty token is imported with smk_import_entry() and a newly
 * allocated smack_known_list_elem referring to it is added at the head
 * of @list. On failure the elements added so far remain on @list.
 *
 * Returns zero on success or error code, as appropriate
 */
static int smk_parse_label_list(char *data, struct list_head *list)
{
        struct smack_known_list_elem *elem;
        struct smack_known *label;
        char *word;

        for (word = strsep(&data, " "); word != NULL;
             word = strsep(&data, " ")) {
                /* Consecutive separators yield empty tokens; skip them. */
                if (*word == '\0')
                        continue;

                label = smk_import_entry(word, 0);
                if (IS_ERR(label))
                        return PTR_ERR(label);

                elem = kzalloc(sizeof(*elem), GFP_KERNEL);
                if (!elem)
                        return -ENOMEM;

                elem->smk_label = label;
                list_add(&elem->list, list);
        }

        return 0;
}

/**
 * smk_destroy_label_list - destroy a list of smack_known_list_elem
 * @list: header pointer of the list to destroy
 *
 * Frees every element on @list and leaves @list initialized as empty.
 * Only the list elements are freed, not the labels they point to.
 */
void smk_destroy_label_list(struct list_head *list)
{
        struct list_head *pos;
        struct list_head *next;

        list_for_each_safe(pos, next, list)
                kfree(list_entry(pos, struct smack_known_list_elem, list));

        INIT_LIST_HEAD(list);
}

/**
 * smk_write_onlycap - write() for smackfs/onlycap
 * @file: file pointer, not actually used
 * @buf: user buffer holding a space-separated label list
 * @count: bytes sent
 * @ppos: where to start, not used
 *
 * Requires CAP_MAC_ADMIN. Parses the labels into a temporary list and
 * swaps that list with smack_onlycap_list under smack_onlycap_lock.
 * The temporary list, which then holds the previous contents or a
 * partially parsed list, is destroyed before returning.
 *
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_write_onlycap(struct file *file, const char __user *buf,
                                 size_t count, loff_t *ppos)
{
        LIST_HEAD(list_tmp);
        char *data;
        int rc;

        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;

        if (count > PAGE_SIZE)
                return -EINVAL;

        data = memdup_user_nul(buf, count);
        if (IS_ERR(data))
                return PTR_ERR(data);

        rc = smk_parse_label_list(data, &list_tmp);
        kfree(data);

        /*
         * An invalid label clears smack_onlycap, so a null string
         * unsets the onlycap value. Importing also rejects a label
         * beginning with '-', so "-usecapabilities" works as well.
         *
         * Only an invalid label counts, not a system error, and it
         * must be the first label: anything parsed before it makes
         * the write fail instead.
         */
        if (rc != 0 && (rc != -EINVAL || !list_empty(&list_tmp)))
                goto out;

        mutex_lock(&smack_onlycap_lock);
        smk_list_swap_rcu(&smack_onlycap_list, &list_tmp);
        mutex_unlock(&smack_onlycap_lock);
        rc = count;

out:
        smk_destroy_label_list(&list_tmp);

        return rc;
}

/*
 * File operations for smackfs/onlycap: seq_file based reads through
 * smk_open_onlycap(), writes through smk_write_onlycap().
 */
static const struct file_operations smk_onlycap_ops = {
        .open                = smk_open_onlycap,
        .read                = seq_read,
        .write                = smk_write_onlycap,
        .llseek                = seq_lseek,
        .release        = seq_release,
};

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
/**
 * smk_read_unconfined - read() for smackfs/unconfined
 * @filp: file pointer, not actually used
 * @buf: user buffer that receives the label
 * @cn: size of the user buffer
 * @ppos: file position, advanced by the amount copied
 *
 * Copies the name of smack_unconfined, or an empty string when it is
 * not set, including the terminating NUL. The user buffer must be able
 * to hold the whole string; otherwise -EINVAL is returned. A read that
 * does not start at offset zero reports end of file.
 *
 * Returns number of bytes read or error code, as appropriate
 */
static ssize_t smk_read_unconfined(struct file *filp, char __user *buf,
                                        size_t cn, loff_t *ppos)
{
        const char *label = "";
        int asize;

        if (*ppos != 0)
                return 0;

        if (smack_unconfined != NULL)
                label = smack_unconfined->smk_known;

        /* The terminating NUL is part of what gets copied out. */
        asize = strlen(label) + 1;
        if (cn < asize)
                return -EINVAL;

        return simple_read_from_buffer(buf, cn, ppos, label, asize);
}

/**
 * smk_write_unconfined - write() for smackfs/unconfined
 * @file: file pointer, not actually used
 * @buf: user buffer holding the new label
 * @count: bytes sent
 * @ppos: where to start, not used
 *
 * Requires CAP_MAC_ADMIN. Sets smack_unconfined to the imported label,
 * or to NULL when the label is rejected with -EINVAL. Any other import
 * error is returned and smack_unconfined is left untouched.
 *
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_write_unconfined(struct file *file, const char __user *buf,
                                        size_t count, loff_t *ppos)
{
        struct smack_known *skp;
        char *data;
        int rc = count;

        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;

        if (count > PAGE_SIZE)
                return -EINVAL;

        data = memdup_user_nul(buf, count);
        if (IS_ERR(data))
                return PTR_ERR(data);

        /*
         * An invalid label clears smack_unconfined, so a null string
         * unsets the unconfined value. Importing also rejects a label
         * beginning with '-', so "-confine" works as well.
         *
         * Only an invalid label clears it; system errors are reported.
         */
        skp = smk_import_entry(data, count);
        if (IS_ERR(skp) && PTR_ERR(skp) != -EINVAL)
                rc = PTR_ERR(skp);
        else
                smack_unconfined = (PTR_ERR(skp) == -EINVAL) ? NULL : skp;

        kfree(data);
        return rc;
}

/* File operations for smackfs/unconfined: plain read/write, default seek. */
static const struct file_operations smk_unconfined_ops = {
        .read                = smk_read_unconfined,
        .write                = smk_write_unconfined,
        .llseek                = default_llseek,
};
#endif /* CONFIG_SECURITY_SMACK_BRINGUP */

/**
 * smk_read_logging - read() for /smack/logging
 * @filp: file pointer, not actually used
 * @buf: user buffer that receives the text
 * @count: size of the user buffer
 * @ppos: file position, advanced by the amount copied
 *
 * Reports log_policy as a decimal string followed by a newline. A read
 * that does not start at offset zero reports end of file.
 *
 * Returns number of bytes read or error code, as appropriate
 */
static ssize_t smk_read_logging(struct file *filp, char __user *buf,
                                size_t count, loff_t *ppos)
{
        char text[32];
        int len;

        /* Single-shot file: anything past the start is end of file. */
        if (*ppos)
                return 0;

        len = sprintf(text, "%d\n", log_policy);

        return simple_read_from_buffer(buf, count, ppos, text, len);
}

/**
 * smk_write_logging - write() for /smack/logging
 * @file: file pointer, not actually used
 * @buf: user buffer holding a decimal integer
 * @count: bytes sent
 * @ppos: where to start, not used
 *
 * Requires CAP_MAC_ADMIN. Accepts values from 0 to 3 inclusive and
 * stores them in log_policy.
 *
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_write_logging(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        char input[32];
        int policy;

        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;

        /* Need at least one byte and room for the terminating NUL. */
        if (count == 0 || count >= sizeof(input))
                return -EINVAL;

        if (copy_from_user(input, buf, count))
                return -EFAULT;
        input[count] = '\0';

        if (sscanf(input, "%d", &policy) != 1)
                return -EINVAL;

        if (policy < 0 || policy > 3)
                return -EINVAL;

        log_policy = policy;

        return count;
}



/* File operations for /smack/logging: plain read/write handlers, default seek. */
static const struct file_operations smk_logging_ops = {
        .read                = smk_read_logging,
        .write                = smk_write_logging,
        .llseek                = default_llseek,
};

/*
 * Seq_file read operations for /smack/load-self
 */

/*
 * seq_file .start: delegate to smk_seq_start() over the smk_rules list of
 * the task_smack obtained from the current credentials.
 */
static void *load_self_seq_start(struct seq_file *s, loff_t *pos)
{
        struct task_smack *tsp = smack_cred(current_cred());

        return smk_seq_start(s, pos, &tsp->smk_rules);
}

/*
 * seq_file .next: delegate to smk_seq_next() over the smk_rules list of
 * the task_smack obtained from the current credentials.
 */
static void *load_self_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct task_smack *tsp = smack_cred(current_cred());

        return smk_seq_next(s, v, pos, &tsp->smk_rules);
}

/*
 * seq_file .show: @v is the list node of a smack_rule; print that rule
 * through smk_rule_show() with SMK_LABELLEN as the length argument.
 */
static int load_self_seq_show(struct seq_file *s, void *v)
{
        struct list_head *node = v;
        struct smack_rule *rule;

        rule = list_entry_rcu(node, struct smack_rule, list);
        smk_rule_show(s, rule, SMK_LABELLEN);

        return 0;
}

/* seq_file iterator for /smack/load-self; .stop is the shared smk_seq_stop(). */
static const struct seq_operations load_self_seq_ops = {
        .start = load_self_seq_start,
        .next  = load_self_seq_next,
        .show  = load_self_seq_show,
        .stop  = smk_seq_stop,
};


/**
 * smk_open_load_self - open() for /smack/load-self
 * @inode: inode structure representing file
 * @file: "load-self" file pointer
 *
 * For reading, use load_self_seq_* seq_file reading operations.
 *
 * Returns whatever seq_open() returns.
 */
static int smk_open_load_self(struct inode *inode, struct file *file)
{
        return seq_open(file, &load_self_seq_ops);
}

/**
 * smk_write_load_self - write() for /smack/load-self
 * @file: file pointer, not actually used
 * @buf: where to get the data from
 * @count: bytes sent
 * @ppos: where to start - must be 0
 *
 * Hands the write to smk_write_rules_list() with the smk_rules list and
 * smk_rules_lock of the task_smack obtained from the current credentials,
 * using the SMK_FIXED24_FMT format. No privilege check is made here.
 *
 * Returns whatever smk_write_rules_list() returns.
 */
static ssize_t smk_write_load_self(struct file *file, const char __user *buf,
                              size_t count, loff_t *ppos)
{
        struct task_smack *tsp = smack_cred(current_cred());

        return smk_write_rules_list(file, buf, count, ppos, &tsp->smk_rules,
                                    &tsp->smk_rules_lock, SMK_FIXED24_FMT);
}

/*
 * File operations for /smack/load-self: seq_file based reads through
 * smk_open_load_self(), writes through smk_write_load_self().
 */
static const struct file_operations smk_load_self_ops = {
        .open           = smk_open_load_self,
        .read                = seq_read,
        .llseek         = seq_lseek,
        .write                = smk_write_load_self,
        .release        = seq_release,
};

/**
 * smk_user_access - handle access check transaction
 * @file: file pointer
 * @buf: data from user space
 * @count: bytes sent
 * @ppos: where to start - must be 0
 * @format: /smack/load or /smack/load2 or /smack/change-rule format.
 *
 * Parses one rule from @buf, asks smk_access() whether the subject has
 * the requested access to the object, and leaves "1" (allowed) or "0"
 * (denied) as the transaction reply. A parse result of -ENOENT is
 * answered with "0"; any other parse error is returned to the caller.
 *
 * Returns SMK_LOADLEN for SMK_FIXED24_FMT, @count for the other formats,
 * or an error code.
 */
static ssize_t smk_user_access(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos, int format)
{
        struct smack_parsed_rule rule;
        char *data;
        int res;

        data = simple_transaction_get(file, buf, count);
        if (IS_ERR(data))
                return PTR_ERR(data);

        if (format != SMK_FIXED24_FMT) {
                /*
                 * simple_transaction_get() returns null-terminated data
                 */
                res = smk_parse_long_rule(data, &rule, 0, 3);
        } else {
                if (count < SMK_LOADLEN)
                        return -EINVAL;
                res = smk_parse_rule(data, &rule, 0);
        }

        /* -ENOENT is answered with a denial rather than reported. */
        if (res < 0 && res != -ENOENT)
                return res;

        if (res >= 0)
                res = smk_access(rule.smk_subject, rule.smk_object,
                                 rule.smk_access1, NULL);

        /*
         * smk_access() can return a value > 0 in the "bringup" case.
         */
        data[0] = (res < 0) ? '0' : '1';
        data[1] = '\0';

        simple_transaction_set(file, 2);

        if (format != SMK_FIXED24_FMT)
                return count;
        return SMK_LOADLEN;
}

/**
 * smk_write_access - handle access check transaction
 * @file: file pointer
 * @buf: data from user space
 * @count: bytes sent
 * @ppos: where to start - must be 0
 *
 * Runs smk_user_access() with the SMK_FIXED24_FMT rule format.
 *
 * Returns whatever smk_user_access() returns.
 */
static ssize_t smk_write_access(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        return smk_user_access(file, buf, count, ppos, SMK_FIXED24_FMT);
}

/*
 * File operations for the fixed-format access check file: the write
 * handler stores its reply with simple_transaction_set() and the
 * simple_transaction_* helpers serve read and release.
 */
static const struct file_operations smk_access_ops = {
        .write                = smk_write_access,
        .read                = simple_transaction_read,
        .release        = simple_transaction_release,
        .llseek                = generic_file_llseek,
};


/*
 * Seq_file read operations for /smack/load2
 */

static int load2_seq_show(struct seq_file *s, void *v)
{
        struct list_head *list = v;
        struct smack_rule *srp;
        struct smack_known *skp =
                list_entry_rcu(list, struct smack_known, list);

        list_for_each_entry_rcu(srp, &skp->smk_rules, list)
                smk_rule_show(s, srp, SMK_LONGLABEL);

        return 0;
}

/* seq_file iterator for /smack/load2; .stop is the shared smk_seq_stop(). */
static const struct seq_operations load2_seq_ops = {
        .start = load2_seq_start,
        .next  = load2_seq_next,
        .show  = load2_seq_show,
        .stop  = smk_seq_stop,
};

/**
 * smk_open_load2 - open() for /smack/load2
 * @inode: inode structure representing file
 * @file: "load2" file pointer
 *
 * For reading, use load2_seq_* seq_file reading operations.
 *
 * Returns whatever seq_open() returns.
 */
static int smk_open_load2(struct inode *inode, struct file *file)
{
        return seq_open(file, &load2_seq_ops);
}

/**
 * smk_write_load2 - write() for /smack/load2
 * @file: file pointer, not actually used
 * @buf: where to get the data from
 * @count: bytes sent
 * @ppos: where to start - must be 0
 *
 * Requires CAP_MAC_ADMIN. Hands the write to smk_write_rules_list() in
 * SMK_LONG_FMT format, with no explicit rule list or lock.
 *
 * Returns -EPERM without privilege, otherwise whatever
 * smk_write_rules_list() returns.
 */
static ssize_t smk_write_load2(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        /* Must have privilege. */
        if (smack_privileged(CAP_MAC_ADMIN))
                return smk_write_rules_list(file, buf, count, ppos,
                                            NULL, NULL, SMK_LONG_FMT);

        return -EPERM;
}

/*
 * File operations for /smack/load2: seq_file based reads through
 * smk_open_load2(), writes through smk_write_load2().
 */
static const struct file_operations smk_load2_ops = {
        .open           = smk_open_load2,
        .read                = seq_read,
        .llseek         = seq_lseek,
        .write                = smk_write_load2,
        .release        = seq_release,
};

/*
 * Seq_file read operations for /smack/load-self2
 */

/*
 * seq_file .start: delegate to smk_seq_start() over the smk_rules list of
 * the task_smack obtained from the current credentials.
 */
static void *load_self2_seq_start(struct seq_file *s, loff_t *pos)
{
        struct task_smack *tsp = smack_cred(current_cred());

        return smk_seq_start(s, pos, &tsp->smk_rules);
}

/*
 * seq_file .next: delegate to smk_seq_next() over the smk_rules list of
 * the task_smack obtained from the current credentials.
 */
static void *load_self2_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct task_smack *tsp = smack_cred(current_cred());

        return smk_seq_next(s, v, pos, &tsp->smk_rules);
}

/*
 * seq_file .show: @v is the list node of a smack_rule; print that rule
 * through smk_rule_show() with SMK_LONGLABEL as the length argument.
 */
static int load_self2_seq_show(struct seq_file *s, void *v)
{
        struct list_head *node = v;
        struct smack_rule *rule;

        rule = list_entry_rcu(node, struct smack_rule, list);
        smk_rule_show(s, rule, SMK_LONGLABEL);

        return 0;
}

/* seq_file iterator for /smack/load-self2; .stop is the shared smk_seq_stop(). */
static const struct seq_operations load_self2_seq_ops = {
        .start = load_self2_seq_start,
        .next  = load_self2_seq_next,
        .show  = load_self2_seq_show,
        .stop  = smk_seq_stop,
};

/**
 * smk_open_load_self2 - open() for /smack/load-self2
 * @inode: inode structure representing file
 * @file: "load-self2" file pointer
 *
 * For reading, use load_self2_seq_* seq_file reading operations.
 *
 * Returns whatever seq_open() returns.
 */
static int smk_open_load_self2(struct inode *inode, struct file *file)
{
        return seq_open(file, &load_self2_seq_ops);
}

/**
 * smk_write_load_self2 - write() for /smack/load-self2
 * @file: file pointer, not actually used
 * @buf: where to get the data from
 * @count: bytes sent
 * @ppos: where to start - must be 0
 *
 * Hands the write to smk_write_rules_list() with the smk_rules list and
 * smk_rules_lock of the task_smack obtained from the current credentials,
 * using the SMK_LONG_FMT format. No privilege check is made here.
 *
 * Returns whatever smk_write_rules_list() returns.
 */
static ssize_t smk_write_load_self2(struct file *file, const char __user *buf,
                              size_t count, loff_t *ppos)
{
        struct task_smack *tsp = smack_cred(current_cred());

        return smk_write_rules_list(file, buf, count, ppos, &tsp->smk_rules,
                                    &tsp->smk_rules_lock, SMK_LONG_FMT);
}

/*
 * File operations for /smack/load-self2: seq_file based reads through
 * smk_open_load_self2(), writes through smk_write_load_self2().
 */
static const struct file_operations smk_load_self2_ops = {
        .open           = smk_open_load_self2,
        .read                = seq_read,
        .llseek         = seq_lseek,
        .write                = smk_write_load_self2,
        .release        = seq_release,
};

/**
 * smk_write_access2 - handle access check transaction
 * @file: file pointer
 * @buf: data from user space
 * @count: bytes sent
 * @ppos: where to start - must be 0
 *
 * Runs smk_user_access() with the SMK_LONG_FMT rule format.
 *
 * Returns whatever smk_user_access() returns.
 */
static ssize_t smk_write_access2(struct file *file, const char __user *buf,
                                        size_t count, loff_t *ppos)
{
        return smk_user_access(file, buf, count, ppos, SMK_LONG_FMT);
}

/*
 * File operations for the long-format access check file: the write
 * handler stores its reply with simple_transaction_set() and the
 * simple_transaction_* helpers serve read and release.
 */
static const struct file_operations smk_access2_ops = {
        .write                = smk_write_access2,
        .read                = simple_transaction_read,
        .release        = simple_transaction_release,
        .llseek                = generic_file_llseek,
};

/**
 * smk_write_revoke_subj - write() for /smack/revoke-subject
 * @file: file pointer
 * @buf: data from user space
 * @count: bytes sent
 * @ppos: where to start - must be 0
 *
 * Requires CAP_MAC_ADMIN. Parses a label from @buf and, when a matching
 * entry exists, sets smk_access to 0 on every rule in that entry's
 * smk_rules list while holding its smk_rules_lock. An unknown label is
 * not an error.
 *
 * Returns @count on success or an error code.
 */
static ssize_t smk_write_revoke_subj(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        struct smack_known *skp;
        struct smack_rule *rule;
        const char *label;
        char *data;
        int rc = count;

        if (*ppos != 0)
                return -EINVAL;

        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;

        if (count == 0 || count > SMK_LONGLABEL)
                return -EINVAL;

        data = memdup_user(buf, count);
        if (IS_ERR(data))
                return PTR_ERR(data);

        label = smk_parse_smack(data, count);
        if (IS_ERR(label)) {
                rc = PTR_ERR(label);
                goto out_data;
        }

        skp = smk_find_entry(label);
        if (skp != NULL) {
                mutex_lock(&skp->smk_rules_lock);

                list_for_each_entry_rcu(rule, &skp->smk_rules, list)
                        rule->smk_access = 0;

                mutex_unlock(&skp->smk_rules_lock);
        }

        kfree(label);
out_data:
        kfree(data);

        return rc;
}

/*
 * File operations for /smack/revoke-subject.
 * NOTE(review): smk_write_revoke_subj() never calls
 * simple_transaction_get()/simple_transaction_set(), yet read and release
 * use the simple_transaction_* helpers - confirm this pairing is intended.
 */
static const struct file_operations smk_revoke_subj_ops = {
        .write                = smk_write_revoke_subj,
        .read                = simple_transaction_read,
        .release        = simple_transaction_release,
        .llseek                = generic_file_llseek,
};

/**
 * smk_init_sysfs - initialize /sys/fs/smackfs
 *
 * Creates the "smackfs" mount point under fs_kobj.
 *
 * Returns whatever sysfs_create_mount_point() returns.
 */
static int smk_init_sysfs(void)
{
        return sysfs_create_mount_point(fs_kobj, "smackfs");
}

/**
 * smk_write_change_rule - write() for /smack/change-rule
 * @file: file pointer
 * @buf: data from user space
 * @count: bytes sent
 * @ppos: where to start - must be 0
 *
 * Requires CAP_MAC_ADMIN. Hands the write to smk_write_rules_list() in
 * SMK_CHANGE_FMT format, with no explicit rule list or lock.
 *
 * Returns -EPERM without privilege, otherwise whatever
 * smk_write_rules_list() returns.
 */
static ssize_t smk_write_change_rule(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        /* Must have privilege. */
        if (smack_privileged(CAP_MAC_ADMIN))
                return smk_write_rules_list(file, buf, count, ppos,
                                            NULL, NULL, SMK_CHANGE_FMT);

        return -EPERM;
}

/*
 * File operations for /smack/change-rule: writes go through
 * smk_write_change_rule(); read and release use the simple_transaction_*
 * helpers.
 */
static const struct file_operations smk_change_rule_ops = {
        .write                = smk_write_change_rule,
        .read                = simple_transaction_read,
        .release        = simple_transaction_release,
        .llseek                = generic_file_llseek,
};

/**
 * smk_read_syslog - read() for smackfs/syslog
 * @filp: file pointer, not actually used
 * @buf: user buffer that receives the label
 * @cn: size of the user buffer
 * @ppos: file position, advanced by the amount copied
 *
 * Copies the name of smack_syslog_label, or of smack_known_star when no
 * syslog label is set, including the terminating NUL. The user buffer
 * must be able to hold the whole label; otherwise -EINVAL is returned.
 * A read that does not start at offset zero reports end of file.
 *
 * Returns number of bytes read or error code, as appropriate
 */
static ssize_t smk_read_syslog(struct file *filp, char __user *buf,
                                size_t cn, loff_t *ppos)
{
        struct smack_known *skp;
        int asize;

        if (*ppos != 0)
                return 0;

        skp = (smack_syslog_label == NULL) ? &smack_known_star
                                           : smack_syslog_label;

        /* The terminating NUL is part of what gets copied out. */
        asize = strlen(skp->smk_known) + 1;
        if (cn < asize)
                return -EINVAL;

        return simple_read_from_buffer(buf, cn, ppos, skp->smk_known, asize);
}

/**
 * smk_write_syslog - write() for smackfs/syslog
 * @file: file pointer, not actually used
 * @buf: user buffer holding the new label
 * @count: bytes sent
 * @ppos: where to start, not used
 *
 * Requires CAP_MAC_ADMIN. Imports the label with smk_import_entry() and
 * stores it in smack_syslog_label; on import failure the current value
 * is left untouched.
 *
 * Returns number of bytes written or error code, as appropriate
 */
static ssize_t smk_write_syslog(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        struct smack_known *skp;
        char *data;
        int rc = count;

        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;

        /* Reject empty writes and anything larger than a page. */
        if (count == 0 || count > PAGE_SIZE)
                return -EINVAL;

        data = memdup_user_nul(buf, count);
        if (IS_ERR(data))
                return PTR_ERR(data);

        skp = smk_import_entry(data, count);
        if (!IS_ERR(skp))
                smack_syslog_label = skp;
        else
                rc = PTR_ERR(skp);

        kfree(data);
        return rc;
}

/* File operations backing the smackfs "syslog" entry. */
static const struct file_operations smk_syslog_ops = {
        .read                = smk_read_syslog,
        .write                = smk_write_syslog,
        .llseek                = default_llseek,
};

/*
 * Seq_file read operations for /smack/relabel-self
 */

/* seq_file ->start: begin iterating the current task's relabel list. */
static void *relabel_self_seq_start(struct seq_file *s, loff_t *pos)
{
        struct list_head *labels = &smack_cred(current_cred())->smk_relabel;

        return smk_seq_start(s, pos, labels);
}

/* seq_file ->next: advance within the current task's relabel list. */
static void *relabel_self_seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct list_head *labels = &smack_cred(current_cred())->smk_relabel;

        return smk_seq_next(s, v, pos, labels);
}

/*
 * seq_file ->show: emit one relabel-list entry as its label text
 * followed by a single space. Always returns 0.
 */
static int relabel_self_seq_show(struct seq_file *s, void *v)
{
        struct smack_known_list_elem *elem;
        const char *name;

        /* @v is the list_head embedded in a smack_known_list_elem. */
        elem = list_entry((struct list_head *)v,
                          struct smack_known_list_elem, list);
        name = elem->smk_label->smk_known;

        seq_puts(s, name);
        seq_putc(s, ' ');
        return 0;
}

/* Iterator callbacks used when reading /smack/relabel-self. */
static const struct seq_operations relabel_self_seq_ops = {
        .start = relabel_self_seq_start,
        .next  = relabel_self_seq_next,
        .show  = relabel_self_seq_show,
        .stop  = smk_seq_stop,
};

/**
 * smk_open_relabel_self - open() for /smack/relabel-self
 * @inode: inode structure representing file
 * @file: "relabel-self" file pointer
 *
 * Connect our relabel_self_seq_* operations with /smack/relabel-self
 * file_operations
 *
 * Returns whatever seq_open() returns.
 */
static int smk_open_relabel_self(struct inode *inode, struct file *file)
{
        return seq_open(file, &relabel_self_seq_ops);
}

/**
 * smk_write_relabel_self - write() for /smack/relabel-self
 * @file: file pointer, not actually used
 * @buf: user buffer holding the label list
 * @count: number of bytes in @buf
 * @ppos: where to start - must be 0
 *
 * Requires CAP_MAC_ADMIN. Parses the written text into a label list and
 * installs it as the relabel list of a fresh copy of the caller's
 * credentials, replacing whatever list was there. A parse result of
 * -EINVAL with no labels collected is also accepted and installs an
 * empty list.
 *
 * Returns @count on success, otherwise a negative error code.
 */
static ssize_t smk_write_relabel_self(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        LIST_HEAD(parsed);
        struct task_smack *tsp;
        struct cred *new;
        char *kbuf;
        int rc;

        /*
         * Must have privilege.
         */
        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;

        /*
         * No partial write.
         * Enough data must be present.
         */
        if (*ppos != 0 || count == 0 || count > PAGE_SIZE)
                return -EINVAL;

        kbuf = memdup_user_nul(buf, count);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        rc = smk_parse_label_list(kbuf, &parsed);
        kfree(kbuf);

        /*
         * Give up on any parse error, except -EINVAL with nothing
         * collected, which is handled like a successful empty list.
         */
        if (rc && !(rc == -EINVAL && list_empty(&parsed)))
                goto discard;

        new = prepare_creds();
        if (!new) {
                rc = -ENOMEM;
                goto discard;
        }

        /* Swap the old relabel list for the freshly parsed one. */
        tsp = smack_cred(new);
        smk_destroy_label_list(&tsp->smk_relabel);
        list_splice(&parsed, &tsp->smk_relabel);
        commit_creds(new);
        return count;

discard:
        smk_destroy_label_list(&parsed);
        return rc;
}

/*
 * File operations backing the smackfs "relabel-self" entry: reads go
 * through the seq_file helpers, writes through smk_write_relabel_self().
 */
static const struct file_operations smk_relabel_self_ops = {
        .open                = smk_open_relabel_self,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .write                = smk_write_relabel_self,
        .release        = seq_release,
};

/**
 * smk_read_ptrace - read() for /smack/ptrace
 * @filp: file pointer, not actually used
 * @buf: user buffer receiving the result
 * @count: size of the user buffer
 * @ppos: file position; only a read from offset 0 yields data
 *
 * Reports smack_ptrace_rule as a decimal number followed by a newline.
 *
 * Returns number of bytes read or error code, as appropriate
 */
static ssize_t smk_read_ptrace(struct file *filp, char __user *buf,
                               size_t count, loff_t *ppos)
{
        char text[32];
        int len;

        if (*ppos != 0)
                return 0;

        /* sprintf() hands back the length of the formatted text. */
        len = sprintf(text, "%d\n", smack_ptrace_rule);

        return simple_read_from_buffer(buf, count, ppos, text, len);
}

/**
 * smk_write_ptrace - write() for /smack/ptrace
 * @file: file pointer
 * @buf: data from user space
 * @count: bytes sent
 * @ppos: where to start - must be 0
 *
 * Requires CAP_MAC_ADMIN. Parses a decimal integer from the written text
 * and stores it in smack_ptrace_rule when it lies within
 * [SMACK_PTRACE_DEFAULT, SMACK_PTRACE_MAX].
 *
 * Returns @count on success, -EPERM without privilege, -EFAULT when the
 * user buffer cannot be read, and -EINVAL for any other rejected write.
 */
static ssize_t smk_write_ptrace(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
{
        char text[32];
        int rule;

        if (!smack_privileged(CAP_MAC_ADMIN))
                return -EPERM;

        if (*ppos != 0)
                return -EINVAL;

        /* Leave room for the terminator appended below. */
        if (count == 0 || count >= sizeof(text))
                return -EINVAL;

        if (copy_from_user(text, buf, count) != 0)
                return -EFAULT;
        text[count] = '\0';

        if (sscanf(text, "%d", &rule) != 1)
                return -EINVAL;
        if (rule < SMACK_PTRACE_DEFAULT || rule > SMACK_PTRACE_MAX)
                return -EINVAL;

        smack_ptrace_rule = rule;
        return count;
}

/* File operations backing the smackfs "ptrace" entry. */
static const struct file_operations smk_ptrace_ops = {
        .write                = smk_write_ptrace,
        .read                = smk_read_ptrace,
        .llseek                = default_llseek,
};

/**
 * smk_fill_super - fill the smackfs superblock
 * @sb: the empty superblock
 * @fc: unused
 *
 * Populates the superblock with the fixed set of smackfs entries listed
 * in the table below. A failure is logged before it is returned.
 *
 * Returns 0 on success, an error code on failure
 */
static int smk_fill_super(struct super_block *sb, struct fs_context *fc)
{
        /* One row per entry: name, file operations, permission bits. */
        static const struct tree_descr smack_files[] = {
                [SMK_LOAD] = { "load", &smk_load_ops, S_IRUGO|S_IWUSR },
                [SMK_CIPSO] = { "cipso", &smk_cipso_ops, S_IRUGO|S_IWUSR },
                [SMK_DOI] = { "doi", &smk_doi_ops, S_IRUGO|S_IWUSR },
                [SMK_DIRECT] = { "direct", &smk_direct_ops, S_IRUGO|S_IWUSR },
                [SMK_AMBIENT] = { "ambient", &smk_ambient_ops, S_IRUGO|S_IWUSR },
                [SMK_NET4ADDR] = { "netlabel", &smk_net4addr_ops, S_IRUGO|S_IWUSR },
                [SMK_ONLYCAP] = { "onlycap", &smk_onlycap_ops, S_IRUGO|S_IWUSR },
                [SMK_LOGGING] = { "logging", &smk_logging_ops, S_IRUGO|S_IWUSR },
                [SMK_LOAD_SELF] = { "load-self", &smk_load_self_ops, S_IRUGO|S_IWUGO },
                [SMK_ACCESSES] = { "access", &smk_access_ops, S_IRUGO|S_IWUGO },
                [SMK_MAPPED] = { "mapped", &smk_mapped_ops, S_IRUGO|S_IWUSR },
                [SMK_LOAD2] = { "load2", &smk_load2_ops, S_IRUGO|S_IWUSR },
                [SMK_LOAD_SELF2] = { "load-self2", &smk_load_self2_ops, S_IRUGO|S_IWUGO },
                [SMK_ACCESS2] = { "access2", &smk_access2_ops, S_IRUGO|S_IWUGO },
                [SMK_CIPSO2] = { "cipso2", &smk_cipso2_ops, S_IRUGO|S_IWUSR },
                [SMK_REVOKE_SUBJ] = { "revoke-subject", &smk_revoke_subj_ops, S_IRUGO|S_IWUSR },
                [SMK_CHANGE_RULE] = { "change-rule", &smk_change_rule_ops, S_IRUGO|S_IWUSR },
                [SMK_SYSLOG] = { "syslog", &smk_syslog_ops, S_IRUGO|S_IWUSR },
                [SMK_PTRACE] = { "ptrace", &smk_ptrace_ops, S_IRUGO|S_IWUSR },
#ifdef CONFIG_SECURITY_SMACK_BRINGUP
                [SMK_UNCONFINED] = { "unconfined", &smk_unconfined_ops, S_IRUGO|S_IWUSR },
#endif
#if IS_ENABLED(CONFIG_IPV6)
                [SMK_NET6ADDR] = { "ipv6host", &smk_net6addr_ops, S_IRUGO|S_IWUSR },
#endif /* CONFIG_IPV6 */
                [SMK_RELABEL_SELF] = { "relabel-self", &smk_relabel_self_ops, S_IRUGO|S_IWUGO },
                /* last one */
                { "" }
        };
        int rc = simple_fill_super(sb, SMACK_MAGIC, smack_files);

        if (rc != 0)
                printk(KERN_ERR "%s failed %d while creating inodes\n",
                        __func__, rc);

        return rc;
}

/**
 * smk_get_tree - get the smackfs superblock
 * @fc: The mount context, including any options
 *
 * Just passes everything along to get_tree_single(), with
 * smk_fill_super() as the superblock fill callback.
 *
 * Returns what the lower level code does.
 */
static int smk_get_tree(struct fs_context *fc)
{
        return get_tree_single(fc, smk_fill_super);
}

/* Mount-context operations for smackfs; only get_tree is provided. */
static const struct fs_context_operations smk_context_ops = {
        .get_tree        = smk_get_tree,
};

/**
 * smk_init_fs_context - Initialise a filesystem context for smackfs
 * @fc: The blank mount context
 *
 * Points the context at smk_context_ops.
 *
 * Returns 0 always.
 */
static int smk_init_fs_context(struct fs_context *fc)
{
        fc->ops = &smk_context_ops;
        return 0;
}

/* Filesystem type registered under the name "smackfs". */
static struct file_system_type smk_fs_type = {
        .name                = "smackfs",
        .init_fs_context = smk_init_fs_context,
        .kill_sb        = kill_litter_super,
};

/* Kernel-internal mount of smackfs; reset to NULL if kern_mount() fails. */
static struct vfsmount *smackfs_mount;

/**
 * init_smk_fs - register and mount smackfs
 *
 * Do not register smackfs if Smack wasn't enabled
 * on boot. We can not put this method normally under the
 * smack_init() code path since the security subsystem get
 * initialized before the vfs caches.
 *
 * A failure to set up the sysfs mountpoint is only logged. After the
 * filesystem is registered and mounted, the CIPSO DOI and the ambient
 * label are set up and the secattr of each built-in label is populated.
 *
 * Returns 0 if Smack was not chosen on boot or if everything succeeded;
 * otherwise the error from registering or mounting the filesystem, or,
 * when those succeeded, the first error from populating a secattr.
 */
static int __init init_smk_fs(void)
{
        struct smack_known *builtin[] = {
                &smack_known_floor,
                &smack_known_hat,
                &smack_known_huh,
                &smack_known_star,
                &smack_known_web,
        };
        unsigned int i;
        int err;

        if (!smack_enabled)
                return 0;

        if (smk_init_sysfs())
                printk(KERN_ERR "smackfs: sysfs mountpoint problem.\n");

        err = register_filesystem(&smk_fs_type);
        if (err == 0) {
                smackfs_mount = kern_mount(&smk_fs_type);
                if (IS_ERR(smackfs_mount)) {
                        printk(KERN_ERR "smackfs:  could not mount!\n");
                        err = PTR_ERR(smackfs_mount);
                        smackfs_mount = NULL;
                }
        }

        smk_cipso_doi();
        smk_unlbl_ambient(NULL);

        /* Every label is populated; only the first error is reported. */
        for (i = 0; i < sizeof(builtin) / sizeof(builtin[0]); i++) {
                int rc = smack_populate_secattr(builtin[i]);

                if (err == 0 && rc < 0)
                        err = rc;
        }

        return err;
}

__initcall(init_smk_fs);
































    2 
    2 




















    1 


    1 































































































































































































































































































































































































































































    2 



    2 





























































































































































































































































































































































































    7 



    7 















    2 







    2 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
/*
 * Starts out with the low __PHYSICAL_MASK_SHIFT bits set; the variable is
 * marked __ro_after_init and exported.
 */
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
#endif

/*
 * Extra GFP bit for user PTE pages: __GFP_HIGHMEM when CONFIG_HIGHPTE is
 * set, nothing otherwise.
 */
#ifdef CONFIG_HIGHPTE
#define PGTABLE_HIGHMEM __GFP_HIGHMEM
#else
#define PGTABLE_HIGHMEM 0
#endif

#ifndef CONFIG_PARAVIRT
/*
 * Fallback used when CONFIG_PARAVIRT is not set: hand the page-table
 * page straight to tlb_remove_page().
 */
static inline
void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        tlb_remove_page(tlb, table);
}
#endif

/* GFP flags used by pte_alloc_one(); "userpte=nohigh" clears __GFP_HIGHMEM. */
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;

/* Allocate a user PTE page for @mm using the __userpte_alloc_gfp flags. */
pgtable_t pte_alloc_one(struct mm_struct *mm)
{
        return __pte_alloc_one(mm, __userpte_alloc_gfp);
}

/*
 * Handler for the "userpte=" early parameter.
 *
 * "userpte=nohigh" disables allocation of user pagetables in
 * high memory. No other value is recognised.
 *
 * Returns 0 when the option was applied, -EINVAL for a missing or
 * unknown argument.
 */
static int __init setup_userpte(char *arg)
{
        if (!arg || strcmp(arg, "nohigh") != 0)
                return -EINVAL;

        __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
        return 0;
}
early_param("userpte", setup_userpte);

/*
 * Free a PTE page through the mmu_gather: run the PTE page-table
 * destructor, call the paravirt release hook for its PFN, then queue the
 * page on @tlb for removal.
 */
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
        pagetable_pte_dtor(page_ptdesc(pte));
        paravirt_release_pte(page_to_pfn(pte));
        paravirt_tlb_remove_table(tlb, pte);
}

#if CONFIG_PGTABLE_LEVELS > 2
/*
 * Free a PMD page through the mmu_gather: call the paravirt release hook
 * for its PFN, run the PMD page-table destructor, then queue the page on
 * @tlb for removal. On PAE a full flush is requested as well.
 */
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
        /*
         * NOTE! For PAE, any changes to the top page-directory-pointer-table
         * entries need a full cr3 reload to flush.
         */
#ifdef CONFIG_X86_PAE
        tlb->need_flush_all = 1;
#endif
        pagetable_pmd_dtor(ptdesc);
        paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
}

#if CONFIG_PGTABLE_LEVELS > 3
/*
 * Free a PUD page through the mmu_gather: run the PUD page-table
 * destructor, call the paravirt release hook for its PFN, then queue the
 * page on @tlb for removal.
 */
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pud);

        pagetable_pud_dtor(ptdesc);
        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
        paravirt_tlb_remove_table(tlb, virt_to_page(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
/*
 * Free a P4D page through the mmu_gather: call the paravirt release hook
 * for its PFN, then queue the page on @tlb for removal.
 */
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
        paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
        paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif        /* CONFIG_PGTABLE_LEVELS > 4 */
#endif        /* CONFIG_PGTABLE_LEVELS > 3 */
#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

/* Link the page-table descriptor of @pgd onto the global pgd_list. */
static inline void pgd_list_add(pgd_t *pgd)
{
        list_add(&virt_to_ptdesc(pgd)->pt_list, &pgd_list);
}

/* Unlink the page-table descriptor of @pgd from the list it is on. */
static inline void pgd_list_del(pgd_t *pgd)
{
        list_del(&virt_to_ptdesc(pgd)->pt_list);
}

/*
 * Number of pgd entries not shared between page tables: only those below
 * KERNEL_PGD_BOUNDARY when the kernel pmd is shared, all of them
 * otherwise. The MAX_ variant is the larger of the two possible values.
 */
#define UNSHARED_PTRS_PER_PGD                                \
        (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
#define MAX_UNSHARED_PTRS_PER_PGD                        \
        max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)


/* Record @mm as the owner in the page-table descriptor of @pgd. */
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

        ptdesc->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
        return page_ptdesc(page)->pt_mm;
}

/*
 * Initialise a freshly allocated pgd for @mm. pgd_alloc() calls this
 * with pgd_lock held.
 */
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
        /*
         * When the pgd points to a shared pagetable level (either the
         * ptes in non-PAE, or shared PMD in PAE), the kernel references
         * are simply copied from swapper_pg_dir.
         */
        if (CONFIG_PGTABLE_LEVELS == 2 ||
            CONFIG_PGTABLE_LEVELS >= 4 ||
            (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD))
                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                KERNEL_PGD_PTRS);

        if (SHARED_KERNEL_PMD)
                return;

        /* list required to sync kernel mapping updates */
        pgd_set_mm(pgd, mm);
        pgd_list_add(pgd);
}

/*
 * Undo the list insertion done by pgd_ctor(). Nothing to do when the
 * kernel pmd is shared, because the pgd was never put on the list.
 */
static void pgd_dtor(pgd_t *pgd)
{
        if (!SHARED_KERNEL_PMD) {
                spin_lock(&pgd_lock);
                pgd_list_del(pgd);
                spin_unlock(&pgd_lock);
        }
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update.  Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
/*
 * Number of pmds preallocated per pgd, plus an upper bound on that
 * number which pgd_alloc() uses to size its on-stack array.
 */
#define PREALLOCATED_PMDS        UNSHARED_PTRS_PER_PGD
#define MAX_PREALLOCATED_PMDS        MAX_UNSHARED_PTRS_PER_PGD

/*
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 *
 * The count is KERNEL_PGD_PTRS when the CPU has X86_FEATURE_PTI and 0
 * otherwise; the MAX_ variant is the array bound for either case.
 */
#define PREALLOCATED_USER_PMDS         (boot_cpu_has(X86_FEATURE_PTI) ? \
                                        KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS

/*
 * PAE only: point the top-level entry @pudp at the pmd page @pmd and
 * flush the TLB of @mm afterwards.
 */
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
        paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

        /* Note: almost everything apart from _PAGE_PRESENT is
           reserved at the pmd (PDPT) level. */
        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

        /*
         * According to Intel App note "TLBs, Paging-Structure Caches,
         * and Their Invalidation", April 2007, document 317080-001,
         * section 8.1: in PAE mode we explicitly have to flush the
         * TLB via cr3 if the top-level pgd is changed...
         */
        flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/*
 * No need to prepopulate any pagetable entries in non-PAE modes.
 * All counts are 0, which makes the arrays in pgd_alloc() zero-sized.
 */
#define PREALLOCATED_PMDS        0
#define MAX_PREALLOCATED_PMDS        0
#define PREALLOCATED_USER_PMDS         0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif        /* CONFIG_X86_PAE */

/*
 * Release every non-NULL pmd page in @pmds[0..@count): run the pmd
 * destructor, free the page table and decrement the pmd count of @mm.
 */
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        int idx;

        for (idx = 0; idx < count; idx++) {
                struct ptdesc *ptdesc;

                /* Slots whose allocation failed were left NULL. */
                if (!pmds[idx])
                        continue;

                ptdesc = virt_to_ptdesc(pmds[idx]);
                pagetable_pmd_dtor(ptdesc);
                pagetable_free(ptdesc);
                mm_dec_nr_pmds(mm);
        }
}

/*
 * Allocate @count pmd pages for @mm and store them in @pmds.
 *
 * Every slot of @pmds is written: a pmd address on success, NULL when
 * that slot's allocation or constructor failed. If any slot failed, all
 * pages obtained so far are freed again.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        gfp_t gfp = GFP_PGTABLE_USER;
        bool failed = false;
        int i;

        /* Allocations for init_mm are not accounted. */
        if (mm == &init_mm)
                gfp &= ~__GFP_ACCOUNT;
        gfp &= ~__GFP_HIGHMEM;

        for (i = 0; i < count; i++) {
                struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

                pmds[i] = NULL;

                if (!ptdesc) {
                        failed = true;
                        continue;
                }
                if (!pagetable_pmd_ctor(ptdesc)) {
                        pagetable_free(ptdesc);
                        failed = true;
                        continue;
                }

                mm_inc_nr_pmds(mm);
                pmds[i] = ptdesc_address(ptdesc);
        }

        if (!failed)
                return 0;

        free_pmds(mm, pmds, count);
        return -ENOMEM;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 *
 * This handles a single pgd slot: a non-zero entry is cleared and the
 * pmd page it pointed to is released.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
        pgd_t entry = *pgdp;
        pmd_t *pmd;

        if (pgd_val(entry) == 0)
                return;

        pmd = (pmd_t *)pgd_page_vaddr(entry);

        pgd_clear(pgdp);

        paravirt_release_pmd(pgd_val(entry) >> PAGE_SHIFT);
        pmd_free(mm, pmd);
        mm_dec_nr_pmds(mm);
}

/*
 * Release the preallocated pmds still hanging off @pgdp. With page-table
 * isolation built in and X86_FEATURE_PTI present, the kernel-range pmds
 * of the matching user pgd are released too.
 */
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
        int i;

        for (i = 0; i < PREALLOCATED_PMDS; i++)
                mop_up_one_pmd(mm, pgdp + i);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

        if (boot_cpu_has(X86_FEATURE_PTI)) {
                pgd_t *user_pgdp = kernel_to_user_pgdp(pgdp);

                for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
                        mop_up_one_pmd(mm,
                                       user_pgdp + KERNEL_PGD_BOUNDARY + i);
        }
#endif
}

/*
 * Hook the preallocated @pmds into the top-level entries of @pgd. Slots
 * at or above KERNEL_PGD_BOUNDARY first receive a copy of the matching
 * pmd page of swapper_pg_dir.
 */
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
        p4d_t *p4d = p4d_offset(pgd, 0);
        pud_t *pud = pud_offset(p4d, 0);
        int i;

        for (i = 0; i < PREALLOCATED_PMDS; i++) {
                pmd_t *pmd = pmds[i];

                if (i >= KERNEL_PGD_BOUNDARY)
                        memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
                               sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, &pud[i], pmd);
        }
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
 * Populate the kernel range of the user page table that belongs to
 * @k_pgd. Each preallocated pmd receives a copy of the matching pmd page
 * from the user counterpart of swapper_pg_dir before it is installed.
 */
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
        pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
        pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
        p4d_t *u_p4d = p4d_offset(u_pgd, 0);
        pud_t *u_pud = pud_offset(u_p4d, 0);
        int i;

        /* Only entries from KERNEL_PGD_BOUNDARY upwards are handled. */
        s_pgd += KERNEL_PGD_BOUNDARY;
        u_pud += KERNEL_PGD_BOUNDARY;

        for (i = 0; i < PREALLOCATED_USER_PMDS; i++) {
                pmd_t *pmd = pmds[i];

                memcpy(pmd, (pmd_t *)pgd_page_vaddr(s_pgd[i]),
                       sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, &u_pud[i], pmd);
        }
}
#else
/* Without page-table isolation there is no user page table to fill. */
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif
/*
 * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
 * assumes that pgd should be in one page.
 *
 * But kernel with PAE paging that is not running as a Xen domain
 * only needs to allocate 32 bytes for pgd instead of one page.
 */
#ifdef CONFIG_X86_PAE

#include <linux/slab.h>

/* Size in bytes and alignment of a pgd allocated from pgd_cache. */
#define PGD_SIZE        (PTRS_PER_PGD * sizeof(pgd_t))
#define PGD_ALIGN        32

/* Slab cache for pgds; only created when the kernel pmd is shared. */
static struct kmem_cache *pgd_cache;

/*
 * Create the slab cache used for pgd allocation on PAE.
 *
 * When PAE kernel is running as a Xen domain, it does not use
 * shared kernel pmd. And this requires a whole page for pgd, so no
 * cache is created in that case.
 *
 * Otherwise the kernel pmd is shared and a pgd does not need a whole
 * page: a 32-byte object is enough, and it comes from the slab cache
 * created here at boot.
 */
void __init pgtable_cache_init(void)
{
        if (SHARED_KERNEL_PMD)
                pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE,
                                              PGD_ALIGN, SLAB_PANIC, NULL);
}

/*
 * Allocate the memory for a pgd (PAE variant).
 *
 * With a shared kernel pmd the kernel is not running as a Xen domain
 * and a 32-byte slab object is enough, which saves memory. Without it,
 * the PAE kernel is running as a Xen domain and whole pages are used.
 */
static inline pgd_t *_pgd_alloc(void)
{
        if (SHARED_KERNEL_PMD)
                return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);

        return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
                                         PGD_ALLOCATION_ORDER);
}

/* Give a pgd back to wherever _pgd_alloc() took it from (PAE variant). */
static inline void _pgd_free(pgd_t *pgd)
{
        if (SHARED_KERNEL_PMD)
                kmem_cache_free(pgd_cache, pgd);
        else
                free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#else

/* Non-PAE: a pgd is always 2^PGD_ALLOCATION_ORDER whole pages. */
static inline pgd_t *_pgd_alloc(void)
{
        return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
                                         PGD_ALLOCATION_ORDER);
}

/* Non-PAE: free the pages obtained by _pgd_alloc(). */
static inline void _pgd_free(pgd_t *pgd)
{
        free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */

/*
 * Allocate and initialise a pgd for @mm.
 *
 * The pgd itself and any pmds that have to be preallocated are obtained
 * first; the pgd is then constructed and populated under pgd_lock.
 *
 * Returns the new pgd, or NULL when an allocation or
 * paravirt_pgd_alloc() fails. Note that mm->pgd is assigned before the
 * later steps and is not reset on the failure paths.
 */
pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd;
        pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
        pmd_t *pmds[MAX_PREALLOCATED_PMDS];

        pgd = _pgd_alloc();

        if (pgd == NULL)
                goto out;

        mm->pgd = pgd;

        /*
         * The sizeof() tests are compile-time constants: the arrays are
         * zero-sized when the MAX_PREALLOCATED_* macros are 0.
         */
        if (sizeof(pmds) != 0 &&
                        preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
                goto out_free_pgd;

        if (sizeof(u_pmds) != 0 &&
                        preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
                goto out_free_pmds;

        if (paravirt_pgd_alloc(mm) != 0)
                goto out_free_user_pmds;

        /*
         * Make sure that pre-populating the pmds is atomic with
         * respect to anything walking the pgd_list, so that they
         * never see a partially populated pgd.
         */
        spin_lock(&pgd_lock);

        pgd_ctor(mm, pgd);
        if (sizeof(pmds) != 0)
                pgd_prepopulate_pmd(mm, pgd, pmds);

        if (sizeof(u_pmds) != 0)
                pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

        spin_unlock(&pgd_lock);

        return pgd;

        /* Error unwinding: release in the reverse order of acquisition. */
out_free_user_pmds:
        if (sizeof(u_pmds) != 0)
                free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
        if (sizeof(pmds) != 0)
                free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
        _pgd_free(pgd);
out:
        return NULL;
}

/*
 * Tear down a pgd of @mm: release pmds still attached to it, take it off
 * the pgd list where applicable, call the paravirt free hook and finally
 * free the pgd memory.
 */
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        pgd_mop_up_pmds(mm, pgd);
        pgd_dtor(pgd);
        paravirt_pgd_free(mm, pgd);
        _pgd_free(pgd);
}

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 *
 * Returns 1 when *@ptep differed from @entry, 0 otherwise. The PTE is
 * only written when it differed and @dirty is set.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        if (pte_same(*ptep, entry))
                return 0;

        if (dirty)
                set_pte(ptep, entry);

        return 1;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * PMD counterpart of ptep_set_access_flags(). @address must be aligned
 * to HPAGE_PMD_MASK.
 *
 * Returns 1 when *@pmdp differed from @entry, 0 otherwise. The PMD is
 * only written when it differed and @dirty is set.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (pmd_same(*pmdp, entry))
                return 0;

        /*
         * We had a write-protection fault here and changed the pmd
         * to be more permissive. No need to flush the TLB for that,
         * #PF is architecturally guaranteed to do that and in the
         * worst-case we'll generate a spurious fault.
         */
        if (dirty)
                set_pmd(pmdp, entry);

        return 1;
}

/*
 * PUD counterpart of ptep_set_access_flags(). @address must be aligned
 * to HPAGE_PUD_MASK.
 *
 * Returns 1 when *@pudp differed from @entry, 0 otherwise. The PUD is
 * only written when it differed and @dirty is set.
 */
int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pud_t *pudp, pud_t entry, int dirty)
{
        VM_BUG_ON(address & ~HPAGE_PUD_MASK);

        if (pud_same(*pudp, entry))
                return 0;

        /*
         * We had a write-protection fault here and changed the pud
         * to be more permissive. No need to flush the TLB for that,
         * #PF is architecturally guaranteed to do that and in the
         * worst-case we'll generate a spurious fault.
         */
        if (dirty)
                set_pud(pudp, entry);

        return 1;
}
#endif

int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
{
        /* Skip the atomic bit operation when the PTE is not young. */
        if (!pte_young(*ptep))
                return 0;

        /* Atomically clear the accessed bit and report its previous state. */
        return test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                  (unsigned long *) &ptep->pte);
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pmd_t *pmdp)
{
        /* Skip the atomic bit operation when the PMD is not young. */
        if (!pmd_young(*pmdp))
                return 0;

        /* Atomically clear the accessed bit and report its previous state. */
        return test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                  (unsigned long *)pmdp);
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pud_t *pudp)
{
        /* Skip the atomic bit operation when the PUD is not young. */
        if (!pud_young(*pudp))
                return 0;

        /* Atomically clear the accessed bit and report its previous state. */
        return test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                  (unsigned long *)pudp);
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        int young;

        /*
         * Deliberately no TLB flush here.
         *
         * On x86 CPUs a cleared accessed bit that is still cached in the
         * TLB cannot corrupt data. [ It may skew page aging and lead to
         * the (mistaken) reclaim of hot pages, but the chance of that
         * should be relatively low. ]
         *
         * Skipping the flush is a performance optimization: a context
         * switch or a VM operation will flush the entry eventually. [ If
         * that takes a long time in some rare case, the delay shouldn't
         * really matter because there's no real memory pressure for
         * swapout to react to. ]
         */
        young = ptep_test_and_clear_young(vma, address, ptep);

        return young;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int was_young;

        /* Huge-PMD helpers must be handed a PMD-aligned address. */
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        was_young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (!was_young)
                return 0;

        /* The accessed bit was cleared: flush the whole huge-page range. */
        flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return was_young;
}

pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                         pmd_t *pmdp)
{
        pmd_t invalid;

        /* Callers are expected to pass a present pmd. */
        VM_WARN_ON_ONCE(!pmd_present(*pmdp));

        invalid = pmd_mkinvalid(*pmdp);

        /*
         * No flush is necessary. Once an invalid PTE is established, the PTE's
         * access and dirty bits cannot be updated.
         */
        return pmdp_establish(vma, address, pmdp, invalid);
}
#endif

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 *
 * The body is only compiled for CONFIG_X86_32; on other configurations
 * this function does nothing. Relocation is only permitted before any
 * fixmap entry has been set (enforced with BUG_ON()).
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
        /* Relocation is only permitted before any fixmap entry has been set. */
        BUG_ON(fixmaps_set > 0);
        /*
         * -reserve (unsigned negation) is the address @reserve bytes below
         * the top of the address space. Round it down to a
         * (1 << PMD_SHIFT) boundary and place the fixmap top one page
         * below that.
         */
        __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
        printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
               -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

/*
 * Number of fixmap entries installed so far: incremented on every
 * successful __native_set_fixmap() call and checked by
 * reserve_top_address() to refuse relocating the fixmap once it is in use.
 */
int fixmaps_set;

void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
        unsigned long vaddr;

#ifdef CONFIG_X86_64
        /*
         * The static initial page tables have to cover every permanent
         * fixmap slot; fail the build otherwise.
         */
        BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
                     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

        /* An index beyond the last fixmap slot is a caller bug. */
        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }

        /* Install @pte at the slot's virtual address and count the entry. */
        vaddr = __fix_to_virt(idx);
        set_pte_vaddr(vaddr, pte);
        fixmaps_set++;
}

void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
                       phys_addr_t phys, pgprot_t flags)
{
        pte_t pte;

        /* Sanitize 'flags' against any unsupported bits: */
        pgprot_val(flags) = pgprot_val(flags) & __default_kernel_pte_mask;

        /* Build the PTE for the page frame containing @phys and install it. */
        pte = pfn_pte(phys >> PAGE_SHIFT, flags);
        __native_set_fixmap(idx, pte);
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#ifdef CONFIG_X86_5LEVEL
/**
 * p4d_set_huge - setup kernel P4D mapping
 * @p4d: P4D entry (unused)
 * @addr: physical address to map (unused)
 * @prot: requested protection (unused)
 *
 * No 512GB pages yet -- always return 0
 *
 * Return: always 0, i.e. no huge mapping was set up.
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}

/**
 * p4d_clear_huge - clear kernel P4D mapping when it is set
 * @p4d: P4D entry (unused)
 *
 * No 512GB pages yet -- there is never a huge P4D mapping to clear, so
 * this is a no-op.
 */
void p4d_clear_huge(p4d_t *p4d)
{
}
#endif

/**
 * pud_set_huge - setup kernel PUD mapping
 * @pud: PUD entry to turn into a huge mapping
 * @addr: physical address to map
 * @prot: protection bits, given in 4K-page form
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if the complete range has the same MTRR
 * caching mode.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        pgprot_t large_prot;
        u8 uniform;

        /* Refuse when the MTRR type is not uniform across the PUD range. */
        mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
        if (!uniform)
                return 0;

        /* Bail out if we are on a populated non-leaf entry: */
        if (pud_present(*pud) && !pud_leaf(*pud))
                return 0;

        /* Convert the protection to its large-page form and mark it PSE. */
        large_prot = __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE);
        set_pte((pte_t *)pud, pfn_pte((u64)addr >> PAGE_SHIFT, large_prot));

        return 1;
}

/**
 * pmd_set_huge - setup kernel PMD mapping
 * @pmd: PMD entry to turn into a huge mapping
 * @addr: physical address to map
 * @prot: protection bits, given in 4K-page form
 *
 * See text over pud_set_huge() above. Unlike the PUD variant, a
 * non-uniform MTRR range is reported once via pr_warn_once().
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        pgprot_t large_prot;
        u8 uniform;

        /* Refuse when the MTRR type is not uniform across the PMD range. */
        mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
        if (!uniform) {
                pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
                             __func__, addr, addr + PMD_SIZE);
                return 0;
        }

        /* Bail out if we are on a populated non-leaf entry: */
        if (pmd_present(*pmd) && !pmd_leaf(*pmd))
                return 0;

        /* Convert the protection to its large-page form and mark it PSE. */
        large_prot = __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE);
        set_pte((pte_t *)pmd, pfn_pte((u64)addr >> PAGE_SHIFT, large_prot));

        return 1;
}

/**
 * pud_clear_huge - clear kernel PUD mapping when it is set
 * @pud: PUD entry to inspect
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
        /* Only a leaf (huge) entry is cleared here. */
        if (!pud_leaf(*pud))
                return 0;

        pud_clear(pud);
        return 1;
}

/**
 * pmd_clear_huge - clear kernel PMD mapping when it is set
 * @pmd: PMD entry to inspect
 *
 * Returns 1 on success and 0 on failure (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
        /* Only a leaf (huge) entry is cleared here. */
        if (!pmd_leaf(*pmd))
                return 0;

        pmd_clear(pmd);
        return 1;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear pud entry and free pmd page.
 * @pud: Pointer to a PUD.
 * @addr: Virtual address associated with pud.
 *
 * Every populated PMD entry below @pud is cleared, the PUD entry itself is
 * cleared, the TLB is flushed, and only then are the PTE pages and the PMD
 * page released. A scratch page keeps a copy of the PMD entries so the PTE
 * pages can still be located after the entries have been cleared.
 *
 * Context: The pud range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise (the scratch page
 * could not be allocated; nothing has been modified in that case).
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        pmd_t *pmd, *pmd_sv;
        pte_t *pte;
        int i;

        pmd = pud_pgtable(*pud);
        /* Scratch page for a saved copy of the PMD entries. */
        pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
        if (!pmd_sv)
                return 0;

        /* Save each PMD entry, then clear the populated ones. */
        for (i = 0; i < PTRS_PER_PMD; i++) {
                pmd_sv[i] = pmd[i];
                if (!pmd_none(pmd[i]))
                        pmd_clear(&pmd[i]);
        }

        pud_clear(pud);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        /* Free the PTE pages found through the saved entries. */
        for (i = 0; i < PTRS_PER_PMD; i++) {
                if (!pmd_none(pmd_sv[i])) {
                        pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
                        free_page((unsigned long)pte);
                }
        }

        free_page((unsigned long)pmd_sv);

        /* Run the PMD page-table destructor before releasing the page. */
        pagetable_pmd_dtor(virt_to_ptdesc(pmd));
        free_page((unsigned long)pmd);

        return 1;
}

/**
 * pmd_free_pte_page - Clear pmd entry and free pte page.
 * @pmd: Pointer to a PMD.
 * @addr: Virtual address associated with pmd.
 *
 * Context: The pmd range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise (this
 * implementation always returns 1).
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        /* Look up the PTE page before the entry pointing at it is cleared. */
        pte_t *pte_page = (pte_t *)pmd_page_vaddr(*pmd);

        pmd_clear(pmd);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        /* The page is released only after the flush above. */
        free_page((unsigned long)pte_page);

        return 1;
}

#else /* !CONFIG_X86_64 */

/*
 * On x86-PAE free page handling is disabled, which assures that
 * ioremap() does not update sync'd pmd entries. See vmalloc_sync_one().
 * Success (non-zero) is reported only when the pmd is already empty.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        int is_empty = pmd_none(*pmd);

        return is_empty;
}

#endif /* CONFIG_X86_64 */
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */

/*
 * Make @pte writable in the way @vma requires: shadow-stack VMAs use
 * pte_mkwrite_shstk(); every other VMA uses pte_mkwrite_novma() followed
 * by pte_clear_saveddirty().
 */
pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        if (!(vma->vm_flags & VM_SHADOW_STACK))
                return pte_clear_saveddirty(pte_mkwrite_novma(pte));

        return pte_mkwrite_shstk(pte);
}

/*
 * Make @pmd writable in the way @vma requires: shadow-stack VMAs use
 * pmd_mkwrite_shstk(); every other VMA uses pmd_mkwrite_novma() followed
 * by pmd_clear_saveddirty().
 */
pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        if (!(vma->vm_flags & VM_SHADOW_STACK))
                return pmd_clear_saveddirty(pmd_mkwrite_novma(pmd));

        return pmd_mkwrite_shstk(pmd);
}

/*
 * Debug check for a PTE that is being zapped: warn (once) when @pte looks
 * like a shadow stack PTE although @vma is not a VM_SHADOW_STACK mapping.
 */
void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
        /*
         * Hardware before shadow stack can (rarely) set Dirty=1
         * on a Write=0 PTE. So the below condition
         * only indicates a software bug when shadow stack is
         * supported by the HW. This checking is covered in
         * pte_shstk().
         */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
                        pte_shstk(pte));
}

/*
 * Debug check for a PMD that is being zapped: warn (once) when @pmd looks
 * like a shadow stack PMD although @vma is not a VM_SHADOW_STACK mapping.
 */
void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
        /* See note in arch_check_zapped_pte() */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
                        pmd_shstk(pmd));
}



















   41 





   39 




    2 














    3 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * printk_safe.c - Safe printk for printk-deadlock-prone contexts
 */

#include <linux/preempt.h>
#include <linux/kdb.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/kprobes.h>

#include "internal.h"

/*
 * Per-CPU counter raised by __printk_safe_enter() and lowered by
 * __printk_safe_exit(). While it is non-zero on the current CPU,
 * vprintk() takes the deferred path.
 */
static DEFINE_PER_CPU(int, printk_context);

/*
 * Enter a printk-safe section on this CPU by incrementing the per-CPU
 * printk_context counter. Sections nest; each enter needs a matching
 * __printk_safe_exit().
 *
 * Can be preempted by NMI.
 */
void __printk_safe_enter(void)
{
        this_cpu_inc(printk_context);
}

/*
 * Leave a printk-safe section on this CPU by decrementing the per-CPU
 * printk_context counter; undoes one __printk_safe_enter().
 *
 * Can be preempted by NMI.
 */
void __printk_safe_exit(void)
{
        this_cpu_dec(printk_context);
}

/*
 * Route a printk() message: to kdb when it is trapping printk, to the
 * deferred path inside printk-safe sections or NMI, and to the default
 * path otherwise.
 */
asmlinkage int vprintk(const char *fmt, va_list args)
{
#ifdef CONFIG_KGDB_KDB
        /* Hand printk() over to kdb, but never recurse into it. */
        if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
                return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
#endif

        /* No obstacles: neither in a printk-safe section nor in NMI. */
        if (!this_cpu_read(printk_context) && !in_nmi())
                return vprintk_default(fmt, args);

        /*
         * The main logbuf is used even in NMI, but console drivers that
         * might have their own locks are not called from here.
         */
        return vprintk_deferred(fmt, args);
}
EXPORT_SYMBOL(vprintk);






















































    3 


    3 


















































































































    5 








































































    1 































    5 




    5 




    4 


    1 
    1 

    3 


    2 
























































    5 













    5 
    4 




    5 



    1 

    1 
    1 
































































    3 









    3 









































    3 


















    3 


    1 













    3 
    3 





    3 
    3 



    3 




















    3 





    3 


    3 





    3 














    3 
    3 



























    3 








    3 


    3 





    3 




    3 






















































































































    3 
























    3 













    3 
























    3 























































































    3 




    3 
    1 










    1 














    3 







    3 









    3 




    3 
    3 




    3 





    3 
    3 

    3 


    3 

    3 






    2 







































































    2 
    1 


    3 

    3 





































    4 



    4 




    3 














































    3 


    3 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
// SPDX-License-Identifier: GPL-2.0
/*
 * Functions related to segment and merge handling
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>
#include <linux/part_stat.h>
#include <linux/blk-cgroup.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
#include "blk-throttle.h"

/*
 * Copy into @bv the multi-page bvec that @bio's iterator (bio->bi_iter)
 * currently points at, i.e. the first bvec still covered by the bio.
 */
static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
{
        *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
}

/*
 * Copy into @bv the last bvec covered by @bio. When the bio ends in the
 * middle of an io vector, @bv->bv_len is trimmed to the part that belongs
 * to the bio. @bio itself is not modified; a private iterator copy is
 * advanced instead.
 */
static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
{
        struct bvec_iter iter = bio->bi_iter;
        int idx;

        bio_get_first_bvec(bio, bv);
        if (bv->bv_len == bio->bi_iter.bi_size)
                return;                /* this bio only has a single bvec */

        /* Move the private iterator to the very end of the bio. */
        bio_advance_iter(bio, &iter, iter.bi_size);

        /*
         * With nothing consumed from the current vector, the bio ended
         * exactly at the end of the previous one.
         */
        if (!iter.bi_bvec_done)
                idx = iter.bi_idx - 1;
        else        /* in the middle of bvec */
                idx = iter.bi_idx;

        *bv = bio->bi_io_vec[idx];

        /*
         * iter.bi_bvec_done records actual length of the last bvec
         * if this bio ends in the middle of one io vector
         */
        if (iter.bi_bvec_done)
                bv->bv_len = iter.bi_bvec_done;
}

/*
 * Decide whether placing @next directly after @prev would violate the
 * queue's virt boundary (create an SG gap).
 *
 * @q:       request queue whose virt boundary and limits apply
 * @prev_rq: request @prev belongs to, or NULL; when set, the request's
 *           first bio supplies the starting bvec for the offset check
 * @prev:    bio that would come first
 * @next:    bio that would follow @prev
 *
 * Returns false when @prev carries no data or the queue has no virt
 * boundary, true when merging would create a gap.
 */
static inline bool bio_will_gap(struct request_queue *q,
                struct request *prev_rq, struct bio *prev, struct bio *next)
{
        struct bio_vec pb, nb;

        if (!bio_has_data(prev) || !queue_virt_boundary(q))
                return false;

        /*
         * Don't merge if the 1st bio starts with non-zero offset, otherwise it
         * is quite difficult to respect the sg gap limit.  We work hard to
         * merge a huge number of small single bios in case of mkfs.
         */
        if (prev_rq)
                bio_get_first_bvec(prev_rq->bio, &pb);
        else
                bio_get_first_bvec(prev, &pb);
        if (pb.bv_offset & queue_virt_boundary(q))
                return true;

        /*
         * We don't need to worry about the situation that the merged segment
         * ends in unaligned virt boundary:
         *
         * - if 'pb' ends aligned, the merged segment ends aligned
         * - if 'pb' ends unaligned, the next bio must include
         *   one single bvec of 'nb', otherwise the 'nb' can't
         *   merge with 'pb'
         */
        bio_get_last_bvec(prev, &pb);
        bio_get_first_bvec(next, &nb);
        /* Physically mergeable bvecs form one segment, so there is no gap. */
        if (biovec_phys_mergeable(q, &pb, &nb))
                return false;
        return __bvec_gap_to_prev(&q->limits, &pb, nb.bv_offset);
}

/* Gap check for appending @bio behind the last bio (biotail) of @req. */
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
{
        struct bio *tail = req->biotail;

        return bio_will_gap(req->q, req, tail, bio);
}

/* Gap check for placing @bio in front of the first bio of @req. */
static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
{
        struct bio *head = req->bio;

        return bio_will_gap(req->q, NULL, bio, head);
}

/*
 * The max size one bio can handle is UINT_MAX because bvec_iter.bi_size
 * is defined as 'unsigned int'; in addition it has to be aligned to the
 * logical block size, which is the minimum accepted unit by hardware.
 * The result is expressed in sectors.
 */
static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
{
        unsigned int max_bytes = round_down(UINT_MAX, lim->logical_block_size);

        return max_bytes >> SECTOR_SHIFT;
}

/*
 * Split a discard (or secure erase) bio that is larger than the queue
 * allows.
 *
 * @bio:   bio to examine
 * @lim:   queue limits to split based on
 * @nsegs: [out] always set to 1
 * @bs:    bio set to allocate the split bio from
 *
 * Returns NULL when no split is needed (or the usable discard limit is
 * zero), otherwise the result of bio_split() for the front part.
 */
static struct bio *bio_split_discard(struct bio *bio,
                                     const struct queue_limits *lim,
                                     unsigned *nsegs, struct bio_set *bs)
{
        unsigned int max_discard_sectors, granularity;
        sector_t tmp;
        unsigned split_sectors;

        *nsegs = 1;

        /* Discard granularity shifted right by 9, but never less than 1. */
        granularity = max(lim->discard_granularity >> 9, 1U);

        /* Cap by what one bio can carry, then round down to the granularity. */
        max_discard_sectors =
                min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
        max_discard_sectors -= max_discard_sectors % granularity;
        if (unlikely(!max_discard_sectors))
                return NULL;

        if (bio_sectors(bio) <= max_discard_sectors)
                return NULL;

        split_sectors = max_discard_sectors;

        /*
         * If the next starting sector would be misaligned, stop the discard at
         * the previous aligned sector.
         */
        tmp = bio->bi_iter.bi_sector + split_sectors -
                ((lim->discard_alignment >> 9) % granularity);
        /*
         * NOTE(review): sector_div() is understood to divide tmp in place
         * and return the remainder, so tmp ends up holding the remainder
         * (the misalignment in sectors) -- confirm against its definition.
         */
        tmp = sector_div(tmp, granularity);

        if (split_sectors > tmp)
                split_sectors -= tmp;

        return bio_split(bio, split_sectors, GFP_NOIO, bs);
}

/*
 * Split a write-zeroes bio that exceeds lim->max_write_zeroes_sectors.
 * *@nsegs is always set to 0. Returns NULL when the limit is zero or the
 * bio already fits, otherwise the result of bio_split() for the front
 * part, allocated from @bs.
 */
static struct bio *bio_split_write_zeroes(struct bio *bio,
                                          const struct queue_limits *lim,
                                          unsigned *nsegs, struct bio_set *bs)
{
        unsigned int max_sectors;

        *nsegs = 0;

        max_sectors = lim->max_write_zeroes_sectors;
        if (!max_sectors || bio_sectors(bio) <= max_sectors)
                return NULL;

        return bio_split(bio, max_sectors, GFP_NOIO, bs);
}

/*
 * Compute how many sectors, counted from the start of @bio, may go to the
 * block device in a single request. When enough sectors remain, the end
 * of the request is aligned to the physical block size; otherwise it is
 * aligned to the logical block size. For a bio that does not start on a
 * physical block boundary this keeps the number of non-aligned requests
 * submitted to the device to a minimum.
 */
static inline unsigned get_max_io_size(struct bio *bio,
                                       const struct queue_limits *lim)
{
        unsigned phys_sectors = lim->physical_block_size >> SECTOR_SHIFT;
        unsigned limit = lim->max_sectors;
        unsigned first, last;

        /* Chunked devices: additionally stop at the end of the chunk. */
        if (lim->chunk_sectors) {
                limit = min(limit,
                        blk_chunk_sectors_left(bio->bi_iter.bi_sector,
                                               lim->chunk_sectors));
        }

        /* Offset of the bio start inside its physical block. */
        first = bio->bi_iter.bi_sector & (phys_sectors - 1);
        /* Furthest physical-block-aligned end within the limit. */
        last = (first + limit) & ~(phys_sectors - 1);

        if (last <= first) {
                /* No aligned end available: fall back to logical blocks. */
                unsigned log_sectors = lim->logical_block_size >> SECTOR_SHIFT;

                return limit & ~(log_sectors - 1);
        }

        return last - first;
}

/**
 * get_max_segment_size() - maximum number of bytes to add as a single segment
 * @lim: Request queue limits.
 * @start_page: Page the segment is located relative to; its physical address
 *              plus @offset is where the segment would start.
 * @offset: Offset from @start_page where to add a segment.
 *
 * Returns the maximum number of bytes that can be added as a single segment:
 * the smaller of the distance to the next segment boundary
 * (@lim->seg_boundary_mask) and @lim->max_segment_size.
 */
static inline unsigned get_max_segment_size(const struct queue_limits *lim,
                struct page *start_page, unsigned long offset)
{
        unsigned long mask = lim->seg_boundary_mask;

        /* Position of the segment start inside its boundary window. */
        offset = mask & (page_to_phys(start_page) + offset);

        /*
         * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
         * after having calculated the minimum.
         */
        return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1;
}

/**
 * bvec_split_segs - verify whether or not a bvec should be split in the middle
 * @lim:      [in] queue limits to split based on
 * @bv:       [in] bvec to examine
 * @nsegs:    [in,out] Number of segments in the bio being built. Incremented
 *            by the number of segments from @bv that may be appended to that
 *            bio without exceeding @max_segs
 * @bytes:    [in,out] Number of bytes in the bio being built. Incremented
 *            by the number of bytes from @bv that may be appended to that
 *            bio without exceeding @max_bytes
 * @max_segs: [in] upper bound for *@nsegs
 * @max_bytes: [in] upper bound for *@bytes
 *
 * When splitting a bio, it can happen that a bvec is encountered that is too
 * big to fit in a single segment and hence that it has to be split in the
 * middle. This function verifies whether or not that should happen. The value
 * %true is returned if and only if appending the entire @bv to a bio with
 * *@nsegs segments and *@bytes bytes would make that bio unacceptable for
 * the block driver.
 */
static bool bvec_split_segs(const struct queue_limits *lim,
                const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes,
                unsigned max_segs, unsigned max_bytes)
{
        /* Bytes that may still be added before hitting @max_bytes. */
        unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
        /* Part of @bv that fits into that remaining budget. */
        unsigned len = min(bv->bv_len, max_len);
        unsigned total_len = 0;
        unsigned seg_size = 0;

        /* Carve @bv into segments until the bytes or segments run out. */
        while (len && *nsegs < max_segs) {
                seg_size = get_max_segment_size(lim, bv->bv_page,
                                                bv->bv_offset + total_len);
                seg_size = min(seg_size, len);

                (*nsegs)++;
                total_len += seg_size;
                len -= seg_size;

                /* Stop once a segment ends off the virt boundary. */
                if ((bv->bv_offset + total_len) & lim->virt_boundary_mask)
                        break;
        }

        *bytes += total_len;

        /* tell the caller to split the bvec if it is too big to fit */
        return len > 0 || bv->bv_len > max_len;
}

/**
 * bio_split_rw - split a bio in two bios
 * @bio:  [in] bio to be split
 * @lim:  [in] queue limits to split based on
 * @segs: [out] number of segments in the bio with the first half of the sectors
 * @bs:          [in] bio set to allocate the clone from
 * @max_bytes: [in] maximum number of bytes per bio
 *
 * Clone @bio, update the bi_iter of the clone to represent the first sectors
 * of @bio and update @bio->bi_iter to represent the remaining sectors. The
 * following is guaranteed for the cloned bio:
 * - That it has at most @max_bytes worth of data
 * - That it has at most @lim->max_segments segments.
 *
 * Except for discard requests the cloned bio will point at the bi_io_vec of
 * the original bio. It is the responsibility of the caller to ensure that the
 * original bio is not freed before the cloned bio. The caller is also
 * responsible for ensuring that @bs is only destroyed after processing of the
 * split bio has finished.
 *
 * Return: NULL when @bio fits the limits and needs no split; the result of
 * bio_split() when it was split; ERR_PTR(-EAGAIN) when a split is required
 * for a REQ_NOWAIT bio, in which case @bio has already been completed with
 * BLK_STS_AGAIN and *@segs is left untouched.
 */
struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
                unsigned *segs, struct bio_set *bs, unsigned max_bytes)
{
        struct bio_vec bv, bvprv, *bvprvp = NULL;
        struct bvec_iter iter;
        unsigned nsegs = 0, bytes = 0;

        bio_for_each_bvec(bv, bio, iter) {
                /*
                 * If the queue doesn't support SG gaps and adding this
                 * offset would create a gap, disallow it.
                 */
                if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
                        goto split;

                /*
                 * Fast path: the bvec stays within one page and within both
                 * limits, so it counts as exactly one segment. Everything
                 * else goes through the per-segment accounting.
                 */
                if (nsegs < lim->max_segments &&
                    bytes + bv.bv_len <= max_bytes &&
                    bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
                        nsegs++;
                        bytes += bv.bv_len;
                } else {
                        if (bvec_split_segs(lim, &bv, &nsegs, &bytes,
                                        lim->max_segments, max_bytes))
                                goto split;
                }

                /* Remember this bvec for the gap check on the next one. */
                bvprv = bv;
                bvprvp = &bvprv;
        }

        /* The whole bio fits: report its segment count, no split. */
        *segs = nsegs;
        return NULL;
split:
        /*
         * We can't sanely support splitting for a REQ_NOWAIT bio. End it
         * with EAGAIN if splitting is required and return an error pointer.
         */
        if (bio->bi_opf & REQ_NOWAIT) {
                bio->bi_status = BLK_STS_AGAIN;
                bio_endio(bio);
                return ERR_PTR(-EAGAIN);
        }

        *segs = nsegs;

        /*
         * Individual bvecs might not be logical block aligned. Round down the
         * split size so that each bio is properly block size aligned, even if
         * we do not use the full hardware limits.
         */
        bytes = ALIGN_DOWN(bytes, lim->logical_block_size);

        /*
         * Bio splitting may cause subtle trouble such as hang when doing sync
         * iopoll in direct IO routine. Given performance gain of iopoll for
         * big IO can be trivial, disable iopoll when split needed.
         */
        bio_clear_polled(bio);
        return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs);
}
EXPORT_SYMBOL_GPL(bio_split_rw);

/**
 * __bio_split_to_limits - split a bio to fit the queue limits
 * @bio:     bio to be split
 * @lim:     queue limits to split based on
 * @nr_segs: returns the number of segments in the returned bio
 *
 * Check if @bio needs splitting based on the queue limits, and if so split off
 * a bio fitting the limits from the beginning of @bio and return it.  @bio is
 * shortened to the remainder and re-submitted.
 *
 * The split bio is allocated from the bio_split bio set of the disk that
 * @bio->bi_bdev belongs to.
 *
 * Return: the split-off front bio when a split happened, @bio itself when no
 * split was needed, or NULL when bio_split_rw() returned an error pointer.
 */
struct bio *__bio_split_to_limits(struct bio *bio,
                                  const struct queue_limits *lim,
                                  unsigned int *nr_segs)
{
        struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
        struct bio *split;

        /* Pick the splitting rule that matches the operation type. */
        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
                split = bio_split_discard(bio, lim, nr_segs, bs);
                break;
        case REQ_OP_WRITE_ZEROES:
                split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
                break;
        default:
                split = bio_split_rw(bio, lim, nr_segs, bs,
                                get_max_io_size(bio, lim) << SECTOR_SHIFT);
                /* Error pointer: the bio could not be split; hand back NULL. */
                if (IS_ERR(split))
                        return NULL;
                break;
        }

        if (split) {
                /* there isn't chance to merge the split bio */
                split->bi_opf |= REQ_NOMERGE;

                blkcg_bio_issue_init(split);
                /* Chain the front part to @bio, then resubmit the remainder. */
                bio_chain(split, bio);
                trace_block_split(split, bio->bi_iter.bi_sector);
                WARN_ON_ONCE(bio_zone_write_plugging(bio));
                submit_bio_noacct(bio);
                return split;
        }
        return bio;
}

/**
 * bio_split_to_limits - split a bio to fit the queue limits
 * @bio:     bio to be split
 *
 * Uses the limits of the queue behind @bio->bi_bdev. When @bio may exceed
 * them, the work is delegated to __bio_split_to_limits(), which splits a
 * fitting bio off the beginning of @bio, shortens @bio to the remainder and
 * re-submits it. Otherwise @bio is returned unchanged.
 *
 * Return: whatever __bio_split_to_limits() returns when splitting may be
 * needed, @bio itself otherwise.
 */
struct bio *bio_split_to_limits(struct bio *bio)
{
        const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
        unsigned int nr_segs;

        /* Common case: the bio cannot exceed the limits, nothing to do. */
        if (!bio_may_exceed_limits(bio, lim))
                return bio;

        return __bio_split_to_limits(bio, lim, &nr_segs);
}
EXPORT_SYMBOL(bio_split_to_limits);

unsigned int blk_recalc_rq_segments(struct request *rq)
{
        unsigned int nr_phys_segs = 0;
        unsigned int bytes = 0;
        struct req_iterator iter;
        struct bio_vec bv;

        if (!rq->bio)
                return 0;

        switch (bio_op(rq->bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
                if (queue_max_discard_segments(rq->q) > 1) {
                        struct bio *bio = rq->bio;

                        for_each_bio(bio)
                                nr_phys_segs++;
                        return nr_phys_segs;
                }
                return 1;
        case REQ_OP_WRITE_ZEROES:
                return 0;
        default:
                break;
        }

        rq_for_each_bvec(bv, rq, iter)
                bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes,
                                UINT_MAX, UINT_MAX);
        return nr_phys_segs;
}

static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
                struct scatterlist *sglist)
{
        if (!*sg)
                return sglist;

        /*
         * If the driver previously mapped a shorter list, we could see a
         * termination bit prematurely unless it fully inits the sg table
         * on each mapping. We KNOW that there must be more entries here
         * or the driver would be buggy, so force clear the termination bit
         * to avoid doing a full sg_init_table() in drivers for each command.
         */
        sg_unmark_end(*sg);
        return sg_next(*sg);
}

/*
 * Map one bvec onto as many scatterlist entries as needed, each limited by
 * get_max_segment_size().  Returns the number of entries that were filled in.
 */
static unsigned blk_bvec_map_sg(struct request_queue *q,
                struct bio_vec *bvec, struct scatterlist *sglist,
                struct scatterlist **sg)
{
        unsigned remaining = bvec->bv_len;
        unsigned mapped = 0;
        unsigned count = 0;

        for (; remaining > 0; count++) {
                unsigned offset = bvec->bv_offset + mapped;
                unsigned len = min(get_max_segment_size(&q->limits,
                                   bvec->bv_page, offset), remaining);
                struct page *page;

                /*
                 * Unfortunately a fair number of drivers barf on scatterlists
                 * that have an offset larger than PAGE_SIZE, despite other
                 * subsystems dealing with that invariant just fine.  For now
                 * stick to the legacy format where we never present those from
                 * the block layer, but the code below should be removed once
                 * these offenders (mostly MMC/SD drivers) are fixed.
                 */
                page = bvec->bv_page + (offset >> PAGE_SHIFT);
                offset &= ~PAGE_MASK;

                *sg = blk_next_sg(sg, sglist);
                sg_set_page(*sg, page, len, offset);

                mapped += len;
                remaining -= len;
        }

        return count;
}

/* Map a whole bvec onto exactly one scatterlist entry; always returns 1. */
static inline int __blk_bvec_map_sg(struct bio_vec bv,
                struct scatterlist *sglist, struct scatterlist **sg)
{
        struct scatterlist *slot = blk_next_sg(sg, sglist);

        *sg = slot;
        sg_set_page(slot, bv.bv_page, bv.bv_len, bv.bv_offset);
        return 1;
}

/*
 * Try to extend the current scatterlist entry (*@sg) by @bvec instead of
 * starting a new entry.  The caller only attempts this for bvecs that come
 * from two different bios.  Returns true if *@sg was grown by the bvec length.
 */
static inline bool
__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec,
                           struct bio_vec *bvprv, struct scatterlist **sg)
{
        int nbytes = bvec->bv_len;

        /*
         * No merge without a current entry, when the grown entry would
         * exceed the queue's maximum segment size, or when the two bvecs
         * are not physically mergeable.
         */
        if (!*sg ||
            (*sg)->length + nbytes > queue_max_segment_size(q) ||
            !biovec_phys_mergeable(q, bvprv, bvec))
                return false;

        (*sg)->length += nbytes;
        return true;
}

/*
 * Map every bvec of the bio chain starting at @bio onto @sglist.
 *
 * *@sg tracks the last scatterlist entry that was filled in and is advanced
 * by the mapping helpers.  Returns the number of scatterlist entries used.
 */
static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
                             struct scatterlist *sglist,
                             struct scatterlist **sg)
{
        struct bio_vec bvec, bvprv = { NULL };
        struct bvec_iter iter;
        int nsegs = 0;
        /* true only while looking at the first bvec of a subsequent bio */
        bool new_bio = false;

        for_each_bio(bio) {
                bio_for_each_bvec(bvec, bio, iter) {
                        /*
                         * Only try to merge bvecs from two bios given we
                         * have done bio internal merge when adding pages
                         * to bio
                         */
                        if (new_bio &&
                            __blk_segment_map_sg_merge(q, &bvec, &bvprv, sg))
                                goto next_bvec;

                        /*
                         * A bvec that ends within PAGE_SIZE of its page start
                         * takes a single entry; anything larger goes through
                         * the splitting helper.
                         */
                        if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE)
                                nsegs += __blk_bvec_map_sg(bvec, sglist, sg);
                        else
                                nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg);
 next_bvec:
                        new_bio = false;
                }
                /*
                 * Remember the last bvec of this bio so the first bvec of the
                 * next bio can be considered for a cross-bio merge.
                 */
                if (likely(bio->bi_iter.bi_size)) {
                        bvprv = bvec;
                        new_bio = true;
                }
        }

        return nsegs;
}

/*
 * Map a request onto a scatterlist and return the number of sg entries that
 * were set up.  The caller must make sure @sglist can hold
 * rq->nr_phys_segments entries.  The last entry used, if any, is marked as
 * the end of the list.
 */
int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
                struct scatterlist *sglist, struct scatterlist **last_sg)
{
        int nsegs;

        if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
                nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg);
        else if (rq->bio)
                nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg);
        else
                nsegs = 0;

        if (*last_sg)
                sg_mark_end(*last_sg);

        /*
         * Something must have been wrong if the figured number of
         * segment is bigger than number of req's physical segments
         */
        WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));

        return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);

/*
 * Return the maximum number of sectors @rq may span when it starts at
 * @offset.  Passthrough requests are bounded by max_hw_sectors only; for all
 * other requests except discard and secure erase, a configured chunk size
 * additionally caps the result at the sectors left in the current chunk.
 */
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
                                                  sector_t offset)
{
        struct request_queue *q = rq->q;
        unsigned int limit;

        if (blk_rq_is_passthrough(rq))
                return q->limits.max_hw_sectors;

        limit = blk_queue_get_max_sectors(q, req_op(rq));

        /* Discard and secure erase ignore the chunk boundary. */
        switch (req_op(rq)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
                return limit;
        default:
                break;
        }

        if (!q->limits.chunk_sectors)
                return limit;

        return min(limit,
                   blk_chunk_sectors_left(offset, q->limits.chunk_sectors));
}

/*
 * Account @nr_phys_segs additional segments to @req for merging @bio.
 * Returns 1 if the merge may proceed; otherwise marks @req as not mergeable
 * and returns 0.
 */
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
                unsigned int nr_phys_segs)
{
        if (!blk_cgroup_mergeable(req, bio) ||
            blk_integrity_merge_bio(req->q, req, bio) == false)
                goto no_merge;

        /* discard request merge won't add new segment */
        if (req_op(req) == REQ_OP_DISCARD)
                return 1;

        if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req))
                goto no_merge;

        /* The bio's segments fit: add them to the request's segment count. */
        req->nr_phys_segments += nr_phys_segs;
        return 1;

no_merge:
        req_set_nomerge(req->q, req);
        return 0;
}

/*
 * Check whether @bio may be appended to the end of @req.
 * Returns 1 (and updates the segment count of @req) if so, 0 otherwise.
 */
int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
{
        /* Gap, integrity-gap and crypto-context checks reject silently. */
        if (req_gap_back_merge(req, bio) ||
            (blk_integrity_rq(req) &&
             integrity_req_gap_back_merge(req, bio)) ||
            !bio_crypt_ctx_back_mergeable(req, bio))
                return 0;

        /* Exceeding the size limit also marks the request as not mergeable. */
        if (blk_rq_sectors(req) + bio_sectors(bio) >
            blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
                req_set_nomerge(req->q, req);
                return 0;
        }

        return ll_new_hw_segment(req, bio, nr_segs);
}

/*
 * Check whether @bio may be prepended to the front of @req.
 * Returns 1 (and updates the segment count of @req) if so, 0 otherwise.
 */
static int ll_front_merge_fn(struct request *req, struct bio *bio,
                unsigned int nr_segs)
{
        /* Gap, integrity-gap and crypto-context checks reject silently. */
        if (req_gap_front_merge(req, bio) ||
            (blk_integrity_rq(req) &&
             integrity_req_gap_front_merge(req, bio)) ||
            !bio_crypt_ctx_front_mergeable(req, bio))
                return 0;

        /*
         * The merged request would start at the bio's sector, so the size
         * limit is evaluated at that position.
         */
        if (blk_rq_sectors(req) + bio_sectors(bio) >
            blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
                req_set_nomerge(req->q, req);
                return 0;
        }

        return ll_new_hw_segment(req, bio, nr_segs);
}

/*
 * Check whether discard request @next may be merged into @req.  On success
 * the discard segment count of @req is updated and true is returned;
 * otherwise @req is marked as not mergeable and false is returned.
 */
static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
                struct request *next)
{
        unsigned short segments = blk_rq_nr_discard_segments(req);

        /*
         * NOTE(review): the size check only accounts for the first bio of
         * @next, while the segment count below takes all discard segments of
         * @next - confirm this is intended.
         */
        if (segments >= queue_max_discard_segments(q) ||
            blk_rq_sectors(req) + bio_sectors(next->bio) >
            blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
                req_set_nomerge(q, req);
                return false;
        }

        req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next);
        return true;
}

/*
 * Check whether request @next may be appended to request @req.  Returns 1 and
 * stores the combined segment count in @req if so, 0 otherwise.
 */
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
                                struct request *next)
{
        int merged_segs;

        if (req_gap_back_merge(req, next->bio))
                return 0;

        /* Refuse if the combined request would exceed the sector limit. */
        if ((blk_rq_sectors(req) + blk_rq_sectors(next)) >
            blk_rq_get_max_sectors(req, blk_rq_pos(req)))
                return 0;

        /* Refuse if the combined request would have too many segments. */
        merged_segs = req->nr_phys_segments + next->nr_phys_segments;
        if (merged_segs > blk_rq_get_max_segments(req))
                return 0;

        /* cgroup, integrity and crypto contexts all have to agree. */
        if (!blk_cgroup_mergeable(req, next->bio) ||
            blk_integrity_merge_rq(q, req, next) == false ||
            !bio_crypt_ctx_merge_rq(req, next))
                return 0;

        /* Merge is OK... */
        req->nr_phys_segments = merged_segs;
        return 1;
}

/**
 * blk_rq_set_mixed_merge - mark a request as mixed merge
 * @rq: request to mark as mixed merge
 *
 * Description:
 *     @rq is about to be mixed merged.  Make sure the attributes
 *     which can be mixed are set in each bio and mark @rq as mixed
 *     merged.  Does nothing if @rq is already marked.
 */
static void blk_rq_set_mixed_merge(struct request *rq)
{
        blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK;
        struct bio *bio = rq->bio;

        if (rq->rq_flags & RQF_MIXED_MERGE)
                return;

        /*
         * @rq will no longer represent mixable attributes for all the
         * contained bios.  It will just track those of the first one.
         * Distribute the attributes to each bio.
         */
        while (bio) {
                WARN_ON_ONCE((bio->bi_opf & REQ_FAILFAST_MASK) &&
                             (bio->bi_opf & REQ_FAILFAST_MASK) != ff);
                bio->bi_opf |= ff;
                bio = bio->bi_next;
        }

        rq->rq_flags |= RQF_MIXED_MERGE;
}

/*
 * Return the failfast bits to use for @bio when merging: a read-ahead bio is
 * treated as if every failfast bit were set, any other bio reports the
 * failfast bits it actually carries.
 */
static inline blk_opf_t bio_failfast(const struct bio *bio)
{
        return (bio->bi_opf & REQ_RAHEAD) ?
                REQ_FAILFAST_MASK : (bio->bi_opf & REQ_FAILFAST_MASK);
}

/*
 * Once a request is marked as MIXED_MERGE, every read-ahead bio merged into
 * it has to be flagged as failfast, and on a front merge the request's own
 * failfast bits have to be replaced by those of the new first bio.
 */
static inline void blk_update_mixed_merge(struct request *req,
                struct bio *bio, bool front_merge)
{
        if (!(req->rq_flags & RQF_MIXED_MERGE))
                return;

        if (bio->bi_opf & REQ_RAHEAD)
                bio->bi_opf |= REQ_FAILFAST_MASK;

        if (!front_merge)
                return;

        /* The request tracks the failfast bits of its first bio. */
        req->cmd_flags &= ~REQ_FAILFAST_MASK;
        req->cmd_flags |= bio->bi_opf & REQ_FAILFAST_MASK;
}

/*
 * Statistics update for a request that disappears because it was merged into
 * another one: count one merge and drop it from the in-flight counter.
 */
static void blk_account_io_merge_request(struct request *req)
{
        if (!blk_do_io_stat(req))
                return;

        part_stat_lock();
        part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
        part_stat_local_dec(req->part,
                            in_flight[op_is_write(req_op(req))]);
        part_stat_unlock();
}

/*
 * Classify how @next could be merged into @req: as a discard merge when @req
 * is discard-mergeable, as a back merge when @next starts exactly where @req
 * ends, and not at all otherwise.
 */
static enum elv_merge blk_try_req_merge(struct request *req,
                                        struct request *next)
{
        if (blk_discard_mergable(req))
                return ELEVATOR_DISCARD_MERGE;

        if (blk_rq_pos(req) + blk_rq_sectors(req) == blk_rq_pos(next))
                return ELEVATOR_BACK_MERGE;

        return ELEVATOR_NO_MERGE;
}

/*
 * Try to merge request @next into request @req.
 *
 * Returns @next (with its bio list handed over to @req) when the merge
 * happened, so that the caller can free it, or NULL when no merge was done.
 *
 * For non-mq, this has to be called with the request spinlock acquired.
 * For mq with scheduling, the appropriate queue wide lock should be held.
 */
static struct request *attempt_merge(struct request_queue *q,
                                     struct request *req, struct request *next)
{
        if (!rq_mergeable(req) || !rq_mergeable(next))
                return NULL;

        /* Both requests must carry the same operation and direction. */
        if (req_op(req) != req_op(next))
                return NULL;

        if (rq_data_dir(req) != rq_data_dir(next))
                return NULL;

        /* Don't merge requests with different write hints. */
        if (req->write_hint != next->write_hint)
                return NULL;

        /* Don't merge requests with different I/O priorities. */
        if (req->ioprio != next->ioprio)
                return NULL;

        /*
         * If we are allowed to merge, then append bio list
         * from next to rq and release next. merge_requests_fn
         * will have updated segment counts, update sector
         * counts here. Handle DISCARDs separately, as they
         * have separate settings.
         */

        switch (blk_try_req_merge(req, next)) {
        case ELEVATOR_DISCARD_MERGE:
                if (!req_attempt_discard_merge(q, req, next))
                        return NULL;
                break;
        case ELEVATOR_BACK_MERGE:
                if (!ll_merge_requests_fn(q, req, next))
                        return NULL;
                break;
        default:
                return NULL;
        }

        /*
         * If failfast settings disagree or any of the two is already
         * a mixed merge, mark both as mixed before proceeding.  This
         * makes sure that all involved bios have mixable attributes
         * set properly.
         */
        if (((req->rq_flags | next->rq_flags) & RQF_MIXED_MERGE) ||
            (req->cmd_flags & REQ_FAILFAST_MASK) !=
            (next->cmd_flags & REQ_FAILFAST_MASK)) {
                blk_rq_set_mixed_merge(req);
                blk_rq_set_mixed_merge(next);
        }

        /*
         * At this point we have either done a back merge or front merge. We
         * need the smaller start_time_ns of the merged requests to be the
         * current request for accounting purposes.
         */
        if (next->start_time_ns < req->start_time_ns)
                req->start_time_ns = next->start_time_ns;

        /* Splice the bio list of @next onto the tail of @req. */
        req->biotail->bi_next = next->bio;
        req->biotail = next->biotail;

        req->__data_len += blk_rq_bytes(next);

        /* The elevator is only told about non-discard merges. */
        if (!blk_discard_mergable(req))
                elv_merge_requests(q, req, next);

        blk_crypto_rq_put_keyslot(next);

        /*
         * 'next' is going away, so update stats accordingly
         */
        blk_account_io_merge_request(next);

        trace_block_rq_merge(next);

        /*
         * ownership of bio passed from next to req, return 'next' for
         * the caller to free
         */
        next->bio = NULL;
        return next;
}

/*
 * Try to merge the request the elevator reports as following @rq into @rq.
 * Returns the request that was merged away, or NULL if nothing was merged.
 */
static struct request *attempt_back_merge(struct request_queue *q,
                struct request *rq)
{
        struct request *next = elv_latter_request(q, rq);

        return next ? attempt_merge(q, rq, next) : NULL;
}

/*
 * Try to merge @rq into the request the elevator reports as preceding it.
 * Returns the request that was merged away, or NULL if nothing was merged.
 */
static struct request *attempt_front_merge(struct request_queue *q,
                struct request *rq)
{
        struct request *prev = elv_former_request(q, rq);

        return prev ? attempt_merge(q, prev, rq) : NULL;
}

/*
 * Try to merge 'next' into 'rq'. Return true if the merge happened, false
 * otherwise. The caller is responsible for freeing 'next' if the merge
 * happened.
 */
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
                           struct request *next)
{
        return attempt_merge(q, rq, next);
}

/*
 * Check the position-independent preconditions for merging @bio into @rq.
 * Returns true only if every attribute checked below is compatible.
 */
bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
{
        if (!rq_mergeable(rq) || !bio_mergeable(bio))
                return false;

        /* the operation and the data direction both have to match */
        if (req_op(rq) != bio_op(bio) ||
            bio_data_dir(bio) != rq_data_dir(rq))
                return false;

        /* don't merge across cgroup boundaries */
        if (!blk_cgroup_mergeable(rq, bio))
                return false;

        /* only merge integrity protected bio into ditto rq */
        if (!blk_integrity_merge_bio(rq->q, rq, bio))
                return false;

        /* Only merge if the crypt contexts are compatible */
        if (!bio_crypt_rq_ctx_compatible(rq, bio))
                return false;

        /* Don't merge requests with different write hints. */
        if (rq->write_hint != bio->bi_write_hint)
                return false;

        /* Finally, the I/O priorities have to be equal as well. */
        return rq->ioprio == bio_prio(bio);
}

/*
 * Classify how @bio could be merged into @rq: as a discard merge when @rq is
 * discard-mergeable, as a back merge when @bio starts where @rq ends, as a
 * front merge when @bio ends where @rq starts, and not at all otherwise.
 */
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
{
        if (blk_discard_mergable(rq))
                return ELEVATOR_DISCARD_MERGE;

        if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
                return ELEVATOR_BACK_MERGE;

        if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
                return ELEVATOR_FRONT_MERGE;

        return ELEVATOR_NO_MERGE;
}

/* Count one merge in the partition statistics of @req, if stats are enabled. */
static void blk_account_io_merge_bio(struct request *req)
{
        if (blk_do_io_stat(req)) {
                part_stat_lock();
                part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
                part_stat_unlock();
        }
}

/*
 * Try to append @bio (with @nr_segs segments) to the end of @req.
 *
 * Returns BIO_MERGE_OK if @bio was linked at the tail of @req, or
 * BIO_MERGE_FAILED if ll_back_merge_fn() rejected the merge.
 */
enum bio_merge_status bio_attempt_back_merge(struct request *req,
                struct bio *bio, unsigned int nr_segs)
{
        const blk_opf_t ff = bio_failfast(bio);

        if (!ll_back_merge_fn(req, bio, nr_segs))
                return BIO_MERGE_FAILED;

        trace_block_bio_backmerge(bio);
        rq_qos_merge(req->q, req, bio);

        /* Differing failfast bits turn the request into a mixed merge. */
        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
                blk_rq_set_mixed_merge(req);

        blk_update_mixed_merge(req, bio, false);

        if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
                blk_zone_write_plug_bio_merged(bio);

        /* Link the bio at the tail and grow the request's data length. */
        req->biotail->bi_next = bio;
        req->biotail = bio;
        req->__data_len += bio->bi_iter.bi_size;

        bio_crypt_free_ctx(bio);

        blk_account_io_merge_bio(req);
        return BIO_MERGE_OK;
}

/*
 * Try to prepend @bio (with @nr_segs segments) to the front of @req.
 *
 * Returns BIO_MERGE_OK if @bio became the new first bio of @req, or
 * BIO_MERGE_FAILED if the request uses zone write plugging or
 * ll_front_merge_fn() rejected the merge.
 */
static enum bio_merge_status bio_attempt_front_merge(struct request *req,
                struct bio *bio, unsigned int nr_segs)
{
        const blk_opf_t ff = bio_failfast(bio);

        /*
         * A front merge for writes to sequential zones of a zoned block device
         * can happen only if the user submitted writes out of order. Do not
         * merge such write to let it fail.
         */
        if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
                return BIO_MERGE_FAILED;

        if (!ll_front_merge_fn(req, bio, nr_segs))
                return BIO_MERGE_FAILED;

        trace_block_bio_frontmerge(bio);
        rq_qos_merge(req->q, req, bio);

        /* Differing failfast bits turn the request into a mixed merge. */
        if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
                blk_rq_set_mixed_merge(req);

        blk_update_mixed_merge(req, bio, true);

        /* Link the bio at the head of the request's bio list. */
        bio->bi_next = req->bio;
        req->bio = bio;

        /* The request now starts at the bio's sector and grows by its size. */
        req->__sector = bio->bi_iter.bi_sector;
        req->__data_len += bio->bi_iter.bi_size;

        bio_crypt_do_front_merge(req, bio);

        blk_account_io_merge_bio(req);
        return BIO_MERGE_OK;
}

/*
 * Try to add discard @bio to @req as one more discard segment.
 *
 * Returns BIO_MERGE_OK on success.  If @req already holds the maximum number
 * of discard segments or would grow beyond its sector limit, @req is marked
 * as not mergeable and BIO_MERGE_FAILED is returned.
 */
static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
                struct request *req, struct bio *bio)
{
        unsigned short segments = blk_rq_nr_discard_segments(req);

        if (segments >= queue_max_discard_segments(q) ||
            blk_rq_sectors(req) + bio_sectors(bio) >
            blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
                req_set_nomerge(q, req);
                return BIO_MERGE_FAILED;
        }

        rq_qos_merge(q, req, bio);

        /* Link the bio at the tail; it counts as one extra segment. */
        req->biotail->bi_next = bio;
        req->biotail = bio;
        req->__data_len += bio->bi_iter.bi_size;
        req->nr_phys_segments = segments + 1;

        blk_account_io_merge_bio(req);
        return BIO_MERGE_OK;
}

/*
 * Try to merge @bio into @rq in whichever way blk_try_merge() suggests.
 *
 * Returns BIO_MERGE_NONE if the two are not merge candidates at all,
 * BIO_MERGE_OK if the merge was done, and BIO_MERGE_FAILED if a merge was
 * possible in principle but got rejected.  When @sched_allow_merge is set,
 * back and front merges additionally need blk_mq_sched_allow_merge() to agree.
 */
static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
                                                   struct request *rq,
                                                   struct bio *bio,
                                                   unsigned int nr_segs,
                                                   bool sched_allow_merge)
{
        enum elv_merge how;

        if (!blk_rq_merge_ok(rq, bio))
                return BIO_MERGE_NONE;

        how = blk_try_merge(rq, bio);
        switch (how) {
        case ELEVATOR_DISCARD_MERGE:
                /* Discard merges do not consult the scheduler. */
                return bio_attempt_discard_merge(q, rq, bio);
        case ELEVATOR_BACK_MERGE:
        case ELEVATOR_FRONT_MERGE:
                break;
        default:
                return BIO_MERGE_NONE;
        }

        if (sched_allow_merge && !blk_mq_sched_allow_merge(q, rq, bio))
                return BIO_MERGE_FAILED;

        if (how == ELEVATOR_BACK_MERGE)
                return bio_attempt_back_merge(rq, bio, nr_segs);
        return bio_attempt_front_merge(rq, bio, nr_segs);
}

/**
 * blk_attempt_plug_merge - try to merge with %current's plugged list
 * @q: request_queue new bio is being queued at
 * @bio: new bio being queued
 * @nr_segs: number of segments in @bio
 *
 * Determine whether @bio being queued on @q can be merged with the previous
 * request on %current's plugged list.  Returns %true if merge was successful,
 * otherwise %false.
 *
 * Plugging coalesces IOs from the same issuer for the same purpose without
 * going through @q->queue_lock.  As such it's more of an issuing mechanism
 * than scheduling, and the request, while may have elvpriv data, is not
 * added on the elevator at this point.  In addition, we don't have
 * reliable access to the elevator outside queue lock.  Only check basic
 * merging parameters without querying the elevator.
 *
 * Caller must ensure !blk_queue_nomerges(q) beforehand.
 */
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs)
{
        struct blk_plug *plug = current->plug;
        struct request *rq;

        if (!plug || rq_list_empty(plug->mq_list))
                return false;

        rq_list_for_each(&plug->mq_list, rq) {
                /*
                 * Only the first plugged request that belongs to @q is tried;
                 * its outcome decides the result.
                 */
                if (rq->q == q)
                        return blk_attempt_bio_merge(q, rq, bio, nr_segs,
                                                     false) == BIO_MERGE_OK;

                /*
                 * Only keep iterating plug list for merges if we have multiple
                 * queues
                 */
                if (!plug->multiple_queues)
                        break;
        }
        return false;
}

/*
 * Walk @list backwards and try to merge @bio into one of its requests.  At
 * most 8 requests are examined.  Returns true if the bio was merged; the walk
 * stops with false as soon as a merge attempt fails.
 */
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
                        struct bio *bio, unsigned int nr_segs)
{
        struct request *rq;
        int budget = 8;

        list_for_each_entry_reverse(rq, list, queuelist) {
                enum bio_merge_status status;

                if (!budget--)
                        break;

                status = blk_attempt_bio_merge(q, rq, bio, nr_segs, true);
                if (status == BIO_MERGE_OK)
                        return true;
                if (status == BIO_MERGE_FAILED)
                        return false;
                /* Not a candidate: move on to the next request. */
        }

        return false;
}
EXPORT_SYMBOL_GPL(blk_bio_list_merge);

/*
 * Ask the elevator for a merge candidate for @bio and try to merge into it.
 *
 * Returns true if @bio was merged.  After a successful back or front merge a
 * follow-up request-to-request merge is attempted; the request that got
 * merged away, if any, is stored in *@merged_request, otherwise the elevator
 * is notified that the candidate request changed.
 */
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs, struct request **merged_request)
{
        struct request *rq;

        switch (elv_merge(q, &rq, bio)) {
        case ELEVATOR_BACK_MERGE:
                if (!blk_mq_sched_allow_merge(q, rq, bio) ||
                    bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
                        return false;

                *merged_request = attempt_back_merge(q, rq);
                if (!*merged_request)
                        elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
                return true;
        case ELEVATOR_FRONT_MERGE:
                if (!blk_mq_sched_allow_merge(q, rq, bio) ||
                    bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
                        return false;

                *merged_request = attempt_front_merge(q, rq);
                if (!*merged_request)
                        elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
                return true;
        case ELEVATOR_DISCARD_MERGE:
                return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK;
        default:
                return false;
        }
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);









































































































































































































































































































































































































































































































































































































































































    4 




    3 
    3 








    4 





























































































































































































































    3 






    4 








































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

/*
 * fsnotify inode mark locking/lifetime/and refcnting
 *
 * REFCNT:
 * The group->refcnt and mark->refcnt tell how many "things" in the kernel
 * currently are referencing the objects. Both kinds of objects typically will
 * live inside the kernel with a refcnt of 2, one for its creation and one for
 * the reference a group and a mark hold to each other.
 * If you are holding the appropriate locks, you can take a reference and the
 * object itself is guaranteed to survive until the reference is dropped.
 *
 * LOCKING:
 * There are 3 locks involved with fsnotify inode marks and they MUST be taken
 * in order as follows:
 *
 * group->mark_mutex
 * mark->lock
 * mark->connector->lock
 *
 * group->mark_mutex protects the marks_list anchored inside a given group and
 * each mark is hooked via the g_list.  It also protects the groups private
 * data (i.e group limits).
 *
 * mark->lock protects the marks attributes like its masks and flags.
 * Furthermore it protects the access to a reference of the group that the mark
 * is assigned to as well as the access to a reference of the inode/vfsmount
 * that is being watched by the mark.
 *
 * mark->connector->lock protects the list of marks anchored inside an
 * inode / vfsmount and each mark is hooked via the i_list.
 *
 * A list of notification marks relating to inode / mnt is contained in
 * fsnotify_mark_connector. That structure is alive as long as there are any
 * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets
 * detached from fsnotify_mark_connector when last reference to the mark is
 * dropped.  Thus having mark reference is enough to protect mark->connector
 * pointer and to make sure fsnotify_mark_connector cannot disappear. Also
 * because we remove mark from g_list before dropping mark reference associated
 * with that, any mark found through g_list is guaranteed to have
 * mark->connector set until we drop group->mark_mutex.
 *
 * LIFETIME:
 * Inode marks survive between when they are added to an inode and when their
 * refcnt==0. Marks are also protected by fsnotify_mark_srcu.
 *
 * The inode mark can be cleared for a number of different reasons including:
 * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
 * - The inode is being evicted from cache. (fsnotify_inode_delete)
 * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
 * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark)
 * - The fsnotify_group associated with the mark is going away and all such marks
 *   need to be cleaned up. (fsnotify_clear_marks_by_group)
 *
 * This has the very interesting property of being able to run concurrently with
 * any (or all) other directions.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
#include <linux/ratelimit.h>

#include <linux/atomic.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

/* Delay passed to queue_delayed_work() when scheduling the mark reaper */
#define FSNOTIFY_REAPER_DELAY        (1)        /* 1 jiffy */

/* SRCU domain used by lockless walkers of mark lists and connectors */
struct srcu_struct fsnotify_mark_srcu;
/* Slab cache from which struct fsnotify_mark_connector is allocated */
struct kmem_cache *fsnotify_mark_connector_cachep;

/* Protects destroy_list and connector_destroy_list */
static DEFINE_SPINLOCK(destroy_lock);
/* Marks whose last reference was dropped, linked through mark->g_list */
static LIST_HEAD(destroy_list);
/* Detached connectors waiting to be freed, chained via conn->destroy_next */
static struct fsnotify_mark_connector *connector_destroy_list;

/* Delayed work handling the marks queued on destroy_list */
static void fsnotify_mark_destroy_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);

/* Work freeing connectors on connector_destroy_list after an SRCU period */
static void fsnotify_connector_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn);

/*
 * Take an additional reference on @mark.
 *
 * Warns (once) if the reference count is already zero when called.
 */
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
        WARN_ON_ONCE(!refcount_read(&mark->refcnt));
        refcount_inc(&mark->refcnt);
}

/*
 * Return the address of the connector pointer belonging to @obj, where
 * @obj is interpreted according to @obj_type (inode, vfsmount or sb).
 * Returns NULL for any other object type.
 */
static fsnotify_connp_t *fsnotify_object_connp(void *obj,
                                enum fsnotify_obj_type obj_type)
{
        if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
                struct inode *inode = obj;

                return &inode->i_fsnotify_marks;
        }
        if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT)
                return &real_mount(obj)->mnt_fsnotify_marks;
        if (obj_type == FSNOTIFY_OBJ_TYPE_SB)
                return fsnotify_sb_marks(obj);

        return NULL;
}

/*
 * Return a pointer to the event mask stored in the object @conn is attached
 * to, or NULL if the connector type is not inode, vfsmount or sb.
 */
static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn)
{
        switch (conn->type) {
        case FSNOTIFY_OBJ_TYPE_INODE:
                return &fsnotify_conn_inode(conn)->i_fsnotify_mask;
        case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
                return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask;
        case FSNOTIFY_OBJ_TYPE_SB:
                return &fsnotify_conn_sb(conn)->s_fsnotify_mask;
        default:
                return NULL;
        }
}

/*
 * Return the event mask of the object @conn is attached to. Warns and
 * returns 0 if the connector does not have a valid object type.
 */
__u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn)
{
        __u32 *maskp;

        if (WARN_ON(!fsnotify_valid_obj_type(conn->type)))
                return 0;

        maskp = fsnotify_conn_mask_p(conn);
        return *maskp;
}

/* Increment the watched objects counter of @sb. */
static void fsnotify_get_sb_watched_objects(struct super_block *sb)
{
        atomic_long_inc(fsnotify_sb_watched_objects(sb));
}

/*
 * Decrement the watched objects counter of @sb and, when it reaches zero,
 * wake up anybody waiting on that counter.
 */
static void fsnotify_put_sb_watched_objects(struct super_block *sb)
{
        if (!atomic_long_dec_and_test(fsnotify_sb_watched_objects(sb)))
                return;

        wake_up_var(fsnotify_sb_watched_objects(sb));
}

/*
 * Take a reference on @inode and account it in the watched objects counter
 * of the inode's superblock.
 */
static void fsnotify_get_inode_ref(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        ihold(inode);
        fsnotify_get_sb_watched_objects(sb);
}

/*
 * Undo fsnotify_get_inode_ref(): drop the superblock's watched objects
 * count first, then release the inode reference.
 */
static void fsnotify_put_inode_ref(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        fsnotify_put_sb_watched_objects(sb);
        iput(inode);
}

/*
 * Bring the watched objects counters of @sb in line with the current state
 * of @conn: whether it is still attached to an object, whether it has any
 * marks, and the priority of the group owning the first mark on its list.
 */
static void fsnotify_update_sb_watchers(struct super_block *sb,
                                        struct fsnotify_mark_connector *conn)
{
        struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
        bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED;
        struct fsnotify_mark *first_mark = NULL;
        unsigned int highest_prio = 0;
        unsigned int p;

        /* A connector without an object is treated as having no marks */
        if (conn->obj)
                first_mark = hlist_entry_safe(conn->list.first,
                                              struct fsnotify_mark, obj_list);
        if (first_mark)
                highest_prio = first_mark->group->priority;
        if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM))
                highest_prio = 0;

        /*
         * A watched object whose highest watching group priority is prio
         * holds a reference on counters [0..prio]. Walk the priority >= 1
         * counters up or down from the previously recorded priority.
         */
        p = conn->prio + 1;
        while (p <= highest_prio) {
                atomic_long_inc(&sbinfo->watched_objects[p]);
                p++;
        }
        p = conn->prio;
        while (p > highest_prio) {
                atomic_long_dec(&sbinfo->watched_objects[p]);
                p--;
        }
        conn->prio = highest_prio;

        /* The priority 0 counter doubles as the total watched objects count */
        BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0);
        if (first_mark) {
                if (!is_watched) {
                        conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED;
                        fsnotify_get_sb_watched_objects(sb);
                }
        } else if (is_watched) {
                conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED;
                fsnotify_put_sb_watched_objects(sb);
        }
}

/*
 * Grab or drop the inode reference held on behalf of the connector.
 *
 * Only inode connectors are affected, and only when @want_iref differs from
 * the current FSNOTIFY_CONN_FLAG_HAS_IREF state. Taking the reference happens
 * right here. Dropping it does not: the HAS_IREF flag is cleared and the
 * inode is returned, so that fsnotify_drop_object() can be responsible for
 * doing iput() outside of spinlocks. Returns NULL in all other cases.
 */
static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
                                          bool want_iref)
{
        bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
        struct inode *inode;

        if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
                return NULL;
        if (has_iref == want_iref)
                return NULL;

        if (!want_iref) {
                /* Last mark that wanted iref is gone, caller unpins inode */
                inode = fsnotify_conn_inode(conn);
                conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF;
                return inode;
        }

        /* Some mark wants the inode refcount held, pin the inode */
        fsnotify_get_inode_ref(fsnotify_conn_inode(conn));
        conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF;
        return NULL;
}

/*
 * Recompute the event mask of the object behind @conn from all attached
 * marks on its list and store it in the object. Must be called with
 * conn->lock held.
 *
 * Returns whatever fsnotify_update_iref() returns: the inode whose reference
 * the caller should drop, or NULL. Also returns NULL, without touching
 * anything, when the connector has no valid object type.
 */
static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
        struct fsnotify_mark *mark;
        bool want_iref = false;
        u32 new_mask = 0;

        assert_spin_locked(&conn->lock);
        /* We can get detached connector here when inode is getting unlinked. */
        if (!fsnotify_valid_obj_type(conn->type))
                return NULL;

        hlist_for_each_entry(mark, &conn->list, obj_list) {
                /* Only marks still attached contribute to the mask */
                if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) {
                        new_mask |= fsnotify_calc_mask(mark);
                        if (conn->type == FSNOTIFY_OBJ_TYPE_INODE &&
                            !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
                                want_iref = true;
                }
        }
        *fsnotify_conn_mask_p(conn) = new_mask;

        return fsnotify_update_iref(conn, want_iref);
}

/*
 * Recalculate the mask of events for the list of marks hanging off @conn.
 * A NULL @conn is silently ignored.
 *
 * The caller must make sure connector and connector->obj cannot disappear
 * under us. Callers achieve this by holding a mark->lock or
 * mark->group->mark_mutex for a mark on this list.
 */
void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
        if (!conn)
                return;

        spin_lock(&conn->lock);
        __fsnotify_recalc_mask(conn);
        spin_unlock(&conn->lock);

        /* Child dentry flags are refreshed only for inode connectors */
        if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
                return;
        __fsnotify_update_child_dentry_flags(fsnotify_conn_inode(conn));
}

/*
 * Work function: take over everything queued on connector_destroy_list,
 * wait for an SRCU grace period and then free the connectors.
 */
static void fsnotify_connector_destroy_workfn(struct work_struct *work)
{
        struct fsnotify_mark_connector *pending, *next;

        /* Steal the whole list so that new entries start a fresh one */
        spin_lock(&destroy_lock);
        pending = connector_destroy_list;
        connector_destroy_list = NULL;
        spin_unlock(&destroy_lock);

        synchronize_srcu(&fsnotify_mark_srcu);
        for (; pending; pending = next) {
                /* Fetch the link before the connector memory is released */
                next = pending->destroy_next;
                kmem_cache_free(fsnotify_mark_connector_cachep, pending);
        }
}

/*
 * Detach @conn from the object it is attached to.
 *
 * The connector type as found on entry is stored in *@type. If the connector
 * is already detached, NULL is returned and nothing else is changed.
 * Otherwise the object's event mask is zeroed, the object's connector pointer
 * is cleared, @conn is switched to FSNOTIFY_OBJ_TYPE_DETACHED and the
 * superblock watched objects counters are updated.
 *
 * Returns the inode when the connector is an inode connector with
 * FSNOTIFY_CONN_FLAG_HAS_IREF set, NULL otherwise. Callers in this file hand
 * the result to fsnotify_drop_object() after releasing conn->lock.
 */
static void *fsnotify_detach_connector_from_object(
                                        struct fsnotify_mark_connector *conn,
                                        unsigned int *type)
{
        /* Looked up before conn->obj and conn->type are reset below */
        fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type);
        struct super_block *sb = fsnotify_connector_sb(conn);
        struct inode *inode = NULL;

        *type = conn->type;
        if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED)
                return NULL;

        if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
                inode = fsnotify_conn_inode(conn);
                inode->i_fsnotify_mask = 0;

                /* Unpin inode when detaching from connector */
                if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF))
                        inode = NULL;
        } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
                fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
        } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
                fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
        }

        /* Unpublish the connector from the object, then mark it detached */
        rcu_assign_pointer(*connp, NULL);
        conn->obj = NULL;
        conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
        /* With conn->obj cleared this sees the connector as having no marks */
        fsnotify_update_sb_watchers(sb, conn);

        return inode;
}

/*
 * Final teardown of @mark: let the owning group free the mark through its
 * ->free_mark() callback and drop the group reference. Warns once and does
 * nothing if the mark has no group.
 */
static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
{
        /* Read the group first; @mark is not touched after free_mark() */
        struct fsnotify_group *group = mark->group;

        if (WARN_ON_ONCE(!group))
                return;
        group->ops->free_mark(mark);
        fsnotify_put_group(group);
}

/*
 * Drop the object reference originally held by a connector.
 *
 * @objp may be NULL, in which case nothing happens. Currently only inode
 * references are passed to be dropped; any other @type triggers a one-time
 * warning and the reference is left alone.
 */
static void fsnotify_drop_object(unsigned int type, void *objp)
{
        if (objp) {
                if (!WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
                        fsnotify_put_inode_ref(objp);
        }
}

/*
 * Drop a reference to @mark.
 *
 * A mark that was never attached to an object (mark->connector is NULL) is
 * destroyed directly once its last reference goes away. Otherwise dropping
 * the last reference unhooks the mark from its connector's list under
 * conn->lock, detaches the connector from its object if the list became
 * empty (or recalculates the object's mask if not), and queues the mark,
 * plus the connector when it got detached, for freeing from a workqueue.
 */
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
        struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector);
        void *objp = NULL;
        unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED;
        bool free_conn = false;

        /* Catch marks that were actually never attached to object */
        if (!conn) {
                if (refcount_dec_and_test(&mark->refcnt))
                        fsnotify_final_mark_destroy(mark);
                return;
        }

        /*
         * We have to be careful so that traversals of obj_list under lock can
         * safely grab mark reference.
         */
        if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock))
                return;

        /* Last reference is gone and conn->lock is held from here on */
        hlist_del_init_rcu(&mark->obj_list);
        if (hlist_empty(&conn->list)) {
                /* No marks left: detach connector and schedule its freeing */
                objp = fsnotify_detach_connector_from_object(conn, &type);
                free_conn = true;
        } else {
                struct super_block *sb = fsnotify_connector_sb(conn);

                /* Update watched objects after detaching mark */
                if (sb)
                        fsnotify_update_sb_watchers(sb, conn);
                objp = __fsnotify_recalc_mask(conn);
                type = conn->type;
        }
        WRITE_ONCE(mark->connector, NULL);
        spin_unlock(&conn->lock);

        /* Drop the object reference (if any) outside of conn->lock */
        fsnotify_drop_object(type, objp);

        if (free_conn) {
                /* Push the connector onto the list handled by the reaper */
                spin_lock(&destroy_lock);
                conn->destroy_next = connector_destroy_list;
                connector_destroy_list = conn;
                spin_unlock(&destroy_lock);
                queue_work(system_unbound_wq, &connector_reaper_work);
        }
        /*
         * Note that we didn't update flags telling whether inode cares about
         * what's happening with children. We update these flags from
         * __fsnotify_parent() lazily when next event happens on one of our
         * children.
         */
        /* g_list is reused here to link the mark into destroy_list */
        spin_lock(&destroy_lock);
        list_add(&mark->g_list, &destroy_list);
        spin_unlock(&destroy_lock);
        queue_delayed_work(system_unbound_wq, &reaper_work,
                           FSNOTIFY_REAPER_DELAY);
}
EXPORT_SYMBOL_GPL(fsnotify_put_mark);

/*
 * Get mark reference when we found the mark via lockless traversal of object
 * list. Mark can be already removed from the list by now and on its way to be
 * destroyed once SRCU period ends.
 *
 * On success the group's user_waits count is raised as well, so the group
 * doesn't disappear under us. A NULL @mark trivially succeeds. Returns false
 * when the reference could not be taken or the mark is no longer attached.
 */
static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
{
        bool attached;

        if (!mark)
                return true;
        if (!refcount_inc_not_zero(&mark->refcnt))
                return false;

        spin_lock(&mark->lock);
        attached = mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED;
        /* mark is attached, group is still alive then */
        if (attached)
                atomic_inc(&mark->group->user_waits);
        spin_unlock(&mark->lock);

        /* Detached in the meantime: give back the reference just taken */
        if (!attached)
                fsnotify_put_mark(mark);
        return attached;
}

/*
 * Put a mark pinned by fsnotify_get_mark_safe() and wake up group
 * destruction if necessary. A NULL @mark is ignored.
 */
static void fsnotify_put_mark_wake(struct fsnotify_mark *mark)
{
        struct fsnotify_group *group;

        if (!mark)
                return;

        /* Fetch the group before our mark reference is dropped */
        group = mark->group;
        fsnotify_put_mark(mark);

        /*
         * We abuse notification_waitq on group shutdown for waiting for
         * all marks pinned when waiting for userspace.
         */
        if (!atomic_dec_and_test(&group->user_waits))
                return;
        if (group->shutdown)
                wake_up(&group->notification_waitq);
}

/*
 * Pin all marks in @iter_info so that the SRCU read lock can be dropped.
 *
 * Returns true when every mark was pinned; the SRCU read lock identified by
 * iter_info->srcu_idx has then been released. Returns false when pinning one
 * of the marks failed; marks pinned so far are released again and
 * srcu_read_unlock() is not called on that path (only the sparse annotation
 * is balanced).
 */
bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info)
        __releases(&fsnotify_mark_srcu)
{
        int type;

        fsnotify_foreach_iter_type(type) {
                /* This can fail if mark is being removed */
                if (!fsnotify_get_mark_safe(iter_info->marks[type])) {
                        __release(&fsnotify_mark_srcu);
                        goto fail;
                }
        }

        /*
         * Now that both marks are pinned by refcount in the inode / vfsmount
         * lists, we can drop SRCU lock, and safely resume the list iteration
         * once userspace returns.
         */
        srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx);

        return true;

fail:
        /* Unwind: release only the marks pinned before the failing one */
        for (type--; type >= 0; type--)
                fsnotify_put_mark_wake(iter_info->marks[type]);
        return false;
}

/*
 * Counterpart of a successful fsnotify_prepare_user_wait(): take the SRCU
 * read lock again, recording the index in iter_info->srcu_idx, and release
 * the marks in @iter_info.
 */
void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
        __acquires(&fsnotify_mark_srcu)
{
        int type;

        /* Re-enter the SRCU read side before the mark pins are dropped */
        iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
        fsnotify_foreach_iter_type(type)
                fsnotify_put_mark_wake(iter_info->marks[type]);
}

/*
 * Mark mark as detached, remove it from group list. Mark still stays in object
 * list until its last reference is dropped. Note that we rely on mark being
 * removed from group list before corresponding reference to it is dropped. In
 * particular we rely on mark->connector being valid while we hold
 * group->mark_mutex if we found the mark through g_list.
 *
 * Must be called with group->mark_mutex held. The caller must either hold
 * reference to the mark or be protected by fsnotify_mark_srcu.
 */
void fsnotify_detach_mark(struct fsnotify_mark *mark)
{
        fsnotify_group_assert_locked(mark->group);
        WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
                     refcount_read(&mark->refcnt) < 1 +
                        !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));

        spin_lock(&mark->lock);
        /* something else already called this function on this mark */
        if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
                spin_unlock(&mark->lock);
                return;
        }
        mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
        list_del_init(&mark->g_list);
        spin_unlock(&mark->lock);

        /* Drop mark reference acquired in fsnotify_add_mark_locked() */
        fsnotify_put_mark(mark);
}

/*
 * Free fsnotify mark. The mark is actually only marked as being freed.  The
 * freeing is actually happening only once last reference to the mark is
 * dropped from a workqueue which first waits for srcu period end.
 *
 * Caller must have a reference to the mark or be protected by
 * fsnotify_mark_srcu.
 */
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
        struct fsnotify_group *group = mark->group;

        spin_lock(&mark->lock);
        /* something else already called this function on this mark */
        if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
                spin_unlock(&mark->lock);
                return;
        }
        mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
        spin_unlock(&mark->lock);

        /*
         * Some groups like to know that marks are being freed.  This is a
         * callback to the group function to let it know that this mark
         * is being freed.
         */
        if (group->ops->freeing_mark)
                group->ops->freeing_mark(mark, group);
}

/*
 * Detach @mark under @group's mark lock and then mark it as freed.
 * fsnotify_free_mark() is called after the group lock has been released.
 */
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                           struct fsnotify_group *group)
{
        fsnotify_group_lock(group);
        fsnotify_detach_mark(mark);
        fsnotify_group_unlock(group);
        fsnotify_free_mark(mark);
}
EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);

/*
 * Sorting function for lists of fsnotify marks.
 *
 * Fanotify supports different notification classes (reflected as priority of
 * notification group). Events shall be passed to notification groups in
 * decreasing priority order. To achieve this marks in notification lists for
 * inodes and vfsmounts are sorted so that priorities of corresponding groups
 * are descending.
 *
 * Furthermore correct handling of the ignore mask requires processing inode
 * and vfsmount marks of each group together. Using the group address as
 * further sort criterion provides a unique sorting order and thus we can
 * merge inode and vfsmount lists of marks in linear time and find groups
 * present in both lists.
 *
 * Ordering rules, applied in turn: identical groups compare equal; a NULL
 * group loses against a non-NULL one; higher priority wins; on equal
 * priority the group with the higher address wins.
 *
 * A return value of 1 signifies that b has priority over a.
 * A return value of 0 signifies that the two marks have to be handled together.
 * A return value of -1 signifies that a has priority over b.
 */
int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
{
        if (a == b)
                return 0;
        if (!a || !b)
                return a ? -1 : 1;
        if (a->priority != b->priority)
                return a->priority < b->priority ? 1 : -1;

        return a < b ? 1 : -1;
}

/*
 * Allocate a zeroed fsnotify_sb_info and install it in @sb unless somebody
 * else installed one first. Returns 0 on success (including when we lost the
 * race) and -ENOMEM if the allocation failed.
 */
static int fsnotify_attach_info_to_sb(struct super_block *sb)
{
        /* sb info is freed on fsnotify_sb_delete() */
        struct fsnotify_sb_info *sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL);

        if (!sbinfo)
                return -ENOMEM;

        /*
         * cmpxchg() provides the barrier so that callers of fsnotify_sb_info()
         * will observe an initialized structure. A non-NULL old value means
         * someone else created sbinfo for us, so ours is not needed.
         */
        if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo) != NULL)
                kfree(sbinfo);

        return 0;
}

/*
 * Allocate and initialize a connector for @obj of type @obj_type and publish
 * it through @connp unless a connector is already installed there.
 * Returns 0 on success (including when we lost the race) and -ENOMEM if the
 * allocation failed.
 */
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
                                               void *obj, unsigned int obj_type)
{
        struct fsnotify_mark_connector *conn;

        conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
        if (conn == NULL)
                return -ENOMEM;

        /* Fully set up the connector before it can become visible */
        conn->obj = obj;
        conn->type = obj_type;
        conn->prio = 0;
        conn->flags = 0;
        INIT_HLIST_HEAD(&conn->list);
        spin_lock_init(&conn->lock);

        /*
         * cmpxchg() provides the barrier so that readers of *connp can see
         * only initialized structure. A non-NULL old value means someone
         * else created list structure for us, so ours is released again.
         */
        if (cmpxchg(connp, NULL, conn) != NULL)
                kmem_cache_free(fsnotify_mark_connector_cachep, conn);

        return 0;
}

/*
 * Get mark connector, make sure it is alive and return with its lock held.
 * Returns NULL (with no lock held) when there is no connector or when the
 * connector has already been detached from its object.
 *
 * This is for users that get connector pointer from inode or mount. Users that
 * hold reference to a mark on the list may directly lock connector->lock as
 * they are sure list cannot go away under them.
 */
static struct fsnotify_mark_connector *fsnotify_grab_connector(
                                                fsnotify_connp_t *connp)
{
        struct fsnotify_mark_connector *conn;
        int idx;

        idx = srcu_read_lock(&fsnotify_mark_srcu);
        conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
        if (conn) {
                spin_lock(&conn->lock);
                if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) {
                        /* Lost the race with detach, report no connector */
                        spin_unlock(&conn->lock);
                        conn = NULL;
                }
        }
        srcu_read_unlock(&fsnotify_mark_srcu, idx);

        return conn;
}

/*
 * Add mark into proper place in given list of marks. These marks may be used
 * for the fsnotify backend to determine which event types should be delivered
 * to which group and for which inodes. These marks are ordered according to
 * priority, highest number first, and then by the group's location in memory.
 *
 * @mark:      mark to insert
 * @obj:       object (inode, vfsmount or sb, per @obj_type) to attach to
 * @obj_type:  type of @obj; an invalid type warns and fails with -EINVAL
 * @add_flags: not used by this function
 *
 * Returns 0 on success, -EINVAL for an invalid object type, -ENOMEM when the
 * sb info or connector could not be allocated, and -EEXIST when an attached
 * mark of the same group is already on the list and the group does not have
 * FSNOTIFY_GROUP_DUPS set.
 */
static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj,
                                  unsigned int obj_type, int add_flags)
{
        struct super_block *sb = fsnotify_object_sb(obj, obj_type);
        struct fsnotify_mark *lmark, *last = NULL;
        struct fsnotify_mark_connector *conn;
        fsnotify_connp_t *connp;
        int cmp;
        int err = 0;

        if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
                return -EINVAL;

        /*
         * Attach the sb info before attaching a connector to any object on sb.
         * The sb info will remain attached as long as sb lives.
         */
        if (!fsnotify_sb_info(sb)) {
                err = fsnotify_attach_info_to_sb(sb);
                if (err)
                        return err;
        }

        connp = fsnotify_object_connp(obj, obj_type);
restart:
        /* Lock order: mark->lock first, then conn->lock (taken by grab) */
        spin_lock(&mark->lock);
        conn = fsnotify_grab_connector(connp);
        if (!conn) {
                /* No live connector: drop the lock, create one and retry */
                spin_unlock(&mark->lock);
                err = fsnotify_attach_connector_to_object(connp, obj, obj_type);
                if (err)
                        return err;
                goto restart;
        }

        /* is mark the first mark? */
        if (hlist_empty(&conn->list)) {
                hlist_add_head_rcu(&mark->obj_list, &conn->list);
                goto added;
        }

        /* should mark be in the middle of the current list? */
        hlist_for_each_entry(lmark, &conn->list, obj_list) {
                last = lmark;

                /* Refuse a second attached mark of the same group */
                if ((lmark->group == mark->group) &&
                    (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) &&
                    !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) {
                        err = -EEXIST;
                        goto out_err;
                }

                /* Insert before the first entry that does not outrank us */
                cmp = fsnotify_compare_groups(lmark->group, mark->group);
                if (cmp >= 0) {
                        hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
                        goto added;
                }
        }

        BUG_ON(last == NULL);
        /* mark should be the last entry.  last is the current last entry */
        hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
added:
        fsnotify_update_sb_watchers(sb, conn);
        /*
         * Since connector is attached to object using cmpxchg() we are
         * guaranteed that connector initialization is fully visible by anyone
         * seeing mark->connector set.
         */
        WRITE_ONCE(mark->connector, conn);
out_err:
        /* Both the success and the -EEXIST path release the locks here */
        spin_unlock(&conn->lock);
        spin_unlock(&mark->lock);
        return err;
}

/*
 * Attach an initialized mark to a given group and fs object.
 * These marks may be used for the fsnotify backend to determine which
 * event types should be delivered to which group.
 *
 * Must be called with the group's mark lock held. The mark is put on the
 * group's marks_list with an extra reference and then inserted into the
 * object's list of marks. On failure both steps are undone and the error
 * from fsnotify_add_mark_list() is returned; returns 0 on success.
 */
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
                             void *obj, unsigned int obj_type,
                             int add_flags)
{
        struct fsnotify_group *group = mark->group;
        int ret = 0;

        fsnotify_group_assert_locked(group);

        /*
         * LOCKING ORDER!!!!
         * group->mark_mutex
         * mark->lock
         * mark->connector->lock
         */
        spin_lock(&mark->lock);
        mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;

        list_add(&mark->g_list, &group->marks_list);
        fsnotify_get_mark(mark); /* for g_list */
        spin_unlock(&mark->lock);

        ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags);
        if (ret)
                goto err;

        /* Let the object's mask reflect the newly added mark */
        fsnotify_recalc_mask(mark->connector);

        return ret;
err:
        /* Roll back: clear the flags and unhook from the group list */
        spin_lock(&mark->lock);
        mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE |
                         FSNOTIFY_MARK_FLAG_ATTACHED);
        list_del_init(&mark->g_list);
        spin_unlock(&mark->lock);

        /* Drop the reference taken for g_list above */
        fsnotify_put_mark(mark);
        return ret;
}

/*
 * Locked wrapper around fsnotify_add_mark_locked(): takes the mark lock of
 * the mark's group around the call and returns its result.
 */
int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj,
                      unsigned int obj_type, int add_flags)
{
        int ret;
        struct fsnotify_group *group = mark->group;

        fsnotify_group_lock(group);
        ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags);
        fsnotify_group_unlock(group);
        return ret;
}
EXPORT_SYMBOL_GPL(fsnotify_add_mark);

/*
 * Look through the marks attached to @obj (of type @obj_type) for an attached
 * mark belonging to @group. If found, take a reference to that mark and
 * return it, else return NULL. NULL is also returned when the object type is
 * unknown or the object has no live connector.
 */
struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type,
                                         struct fsnotify_group *group)
{
        fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type);
        struct fsnotify_mark_connector *conn;
        struct fsnotify_mark *mark, *found = NULL;

        if (!connp)
                return NULL;

        /* On success the connector is returned with conn->lock held */
        conn = fsnotify_grab_connector(connp);
        if (!conn)
                return NULL;

        hlist_for_each_entry(mark, &conn->list, obj_list) {
                if (mark->group != group)
                        continue;
                if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED))
                        continue;

                fsnotify_get_mark(mark);
                found = mark;
                break;
        }
        spin_unlock(&conn->lock);

        return found;
}
EXPORT_SYMBOL_GPL(fsnotify_find_mark);

/*
 * Clear any marks in a group with given type mask.
 *
 * With @obj_type == FSNOTIFY_OBJ_TYPE_ANY every mark on the group's
 * marks_list is destroyed. Otherwise only marks whose connector type equals
 * @obj_type are first moved to a private list and then destroyed.
 */
void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
                                   unsigned int obj_type)
{
        struct fsnotify_mark *lmark, *mark;
        LIST_HEAD(to_free);
        struct list_head *head = &to_free;

        /* Skip selection step if we want to clear all marks. */
        if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) {
                head = &group->marks_list;
                goto clear;
        }
        /*
         * We have to be really careful here. Anytime we drop mark_mutex, e.g.
         * fsnotify_clear_marks_by_inode() can come and free marks. Even in our
         * to_free list so we have to use mark_mutex even when accessing that
         * list. And freeing mark requires us to drop mark_mutex. So we can
         * reliably free only the first mark in the list. That's why we first
         * move marks to free to to_free list in one go and then free marks in
         * to_free list one by one.
         */
        fsnotify_group_lock(group);
        list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
                if (mark->connector->type == obj_type)
                        list_move(&mark->g_list, &to_free);
        }
        fsnotify_group_unlock(group);

clear:
        /* Destroy the first mark on the list until the list is empty */
        while (1) {
                fsnotify_group_lock(group);
                if (list_empty(head)) {
                        fsnotify_group_unlock(group);
                        break;
                }
                mark = list_first_entry(head, struct fsnotify_mark, g_list);
                /* Hold our own reference across detach and free */
                fsnotify_get_mark(mark);
                fsnotify_detach_mark(mark);
                fsnotify_group_unlock(group);
                fsnotify_free_mark(mark);
                fsnotify_put_mark(mark);
        }
}

/*
 * Destroy all marks attached to an object via connector.
 *
 * Every mark on the connector's list is destroyed, then the connector is
 * detached from its object and the object reference it held, if any, is
 * dropped. Does nothing if @connp has no live connector.
 */
void fsnotify_destroy_marks(fsnotify_connp_t *connp)
{
        struct fsnotify_mark_connector *conn;
        struct fsnotify_mark *mark, *old_mark = NULL;
        void *objp;
        unsigned int type;

        /* On success the connector is returned with conn->lock held */
        conn = fsnotify_grab_connector(connp);
        if (!conn)
                return;
        /*
         * We have to be careful since we can race with e.g.
         * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the
         * list can get modified. However we are holding mark reference and
         * thus our mark cannot be removed from obj_list so we can continue
         * iteration after regaining conn->lock.
         */
        hlist_for_each_entry(mark, &conn->list, obj_list) {
                fsnotify_get_mark(mark);
                spin_unlock(&conn->lock);
                /* The previous mark's reference is dropped one step late */
                if (old_mark)
                        fsnotify_put_mark(old_mark);
                old_mark = mark;
                fsnotify_destroy_mark(mark, mark->group);
                spin_lock(&conn->lock);
        }
        /*
         * Detach list from object now so that we don't pin inode until all
         * mark references get dropped. It would lead to strange results such
         * as delaying inode deletion or blocking unmount.
         */
        objp = fsnotify_detach_connector_from_object(conn, &type);
        spin_unlock(&conn->lock);
        /* Release the reference held on the last mark visited */
        if (old_mark)
                fsnotify_put_mark(old_mark);
        fsnotify_drop_object(type, objp);
}

/*
 * Put a caller-provided mark into its initial state: zero the whole
 * structure, bind it to @group (taking a group reference), give it a
 * reference count of one, initialize its lock and leave it without a
 * connector.
 */
void fsnotify_init_mark(struct fsnotify_mark *mark,
                        struct fsnotify_group *group)
{
        /* Clean slate first; the assignments below override the zeroes. */
        memset(mark, 0, sizeof(struct fsnotify_mark));

        /* The mark keeps a reference on the group it points to. */
        mark->group = group;
        fsnotify_get_group(group);

        refcount_set(&mark->refcnt, 1);
        spin_lock_init(&mark->lock);

        /* Not attached to any object yet. */
        WRITE_ONCE(mark->connector, NULL);
}
EXPORT_SYMBOL_GPL(fsnotify_init_mark);

/*
 * Destroy all marks in destroy_list, waits for SRCU period to finish before
 * actually freeing marks.
 */
static void fsnotify_mark_destroy_workfn(struct work_struct *work)
{
        struct fsnotify_mark *mark, *next;
        struct list_head private_destroy_list;

        spin_lock(&destroy_lock);
        /* exchange the list head */
        list_replace_init(&destroy_list, &private_destroy_list);
        spin_unlock(&destroy_lock);

        synchronize_srcu(&fsnotify_mark_srcu);

        list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
                list_del_init(&mark->g_list);
                fsnotify_final_mark_destroy(mark);
        }
}

/*
 * Wait for all marks queued for destruction to be actually destroyed.
 *
 * Implemented by flushing the delayed work item reaper_work.
 * NOTE(review): reaper_work is defined outside this view; presumably it runs
 * fsnotify_mark_destroy_workfn() - confirm at its definition.
 */
void fsnotify_wait_marks_destroyed(void)
{
        flush_delayed_work(&reaper_work);
}
EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);




















































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_SCSI_DEVICE_H
#define _SCSI_SCSI_DEVICE_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/blk-mq.h>
#include <scsi/scsi.h>
#include <linux/atomic.h>
#include <linux/sbitmap.h>

struct bsg_device;
struct device;
struct request_queue;
struct scsi_cmnd;
struct scsi_lun;
struct scsi_sense_hdr;

typedef __u64 __bitwise blist_flags_t;

#define SCSI_SENSE_BUFFERSIZE        96

/*
 * Mode parameter information passed by pointer to scsi_mode_sense() and
 * scsi_mode_select() (both declared later in this header).
 * NOTE(review): the field names suggest the decoded MODE SENSE parameter
 * header - confirm against the scsi_mode_sense() implementation.
 */
struct scsi_mode_data {
        __u32        length;
        __u16        block_descriptor_length;
        __u8        medium_type;
        __u8        device_specific;
        __u8        header_length;
        __u8        longlba:1;        /* single-bit flag */
};

/*
 * sdev state: If you alter this, you also need to alter scsi_sysfs.c
 * (for the ascii descriptions) and the state model enforcer:
 * scsi_lib:scsi_device_set_state().
 *
 * Numbering starts at 1; no enumerator has the value 0.
 */
enum scsi_device_state {
        SDEV_CREATED = 1,        /* device created but not added to sysfs
                                 * Only internal commands allowed (for inq) */
        SDEV_RUNNING,                /* device properly configured
                                 * All commands allowed */
        SDEV_CANCEL,                /* beginning to delete device
                                 * Only error handler commands allowed */
        SDEV_DEL,                /* device deleted
                                 * no commands allowed */
        SDEV_QUIESCE,                /* Device quiescent.  No block commands
                                 * will be accepted, only specials (which
                                 * originate in the mid-layer) */
        SDEV_OFFLINE,                /* Device offlined (by error handling or
                                 * user request) */
        SDEV_TRANSPORT_OFFLINE,        /* Offlined by transport class error handler */
        SDEV_BLOCK,                /* Device blocked by scsi lld.  No
                                 * scsi commands from user or midlayer
                                 * should be issued to the scsi
                                 * lld. */
        SDEV_CREATED_BLOCK,        /* same as above but for created devices */
};

/*
 * Scan mode selector; scsi_scan_target() (declared later in this header)
 * takes one of these as its last argument.
 */
enum scsi_scan_mode {
        SCSI_SCAN_INITIAL = 0,
        SCSI_SCAN_RESCAN,
        SCSI_SCAN_MANUAL,
};

/*
 * Device event types, numbered from 1. Most entries note the sense code
 * pair of the unit attention (UA) that reports them.
 *
 * SDEV_EVT_FIRST/SDEV_EVT_LAST bracket the valid range and must be kept in
 * step when entries are added. SDEV_EVT_MAXBITS is one more than the last
 * event and sizes the supported_events/pending_events bitmaps in
 * struct scsi_device.
 */
enum scsi_device_event {
        SDEV_EVT_MEDIA_CHANGE        = 1,        /* media has changed */
        SDEV_EVT_INQUIRY_CHANGE_REPORTED,                /* 3F 03  UA reported */
        SDEV_EVT_CAPACITY_CHANGE_REPORTED,                /* 2A 09  UA reported */
        SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED,        /* 38 07  UA reported */
        SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED,        /* 2A 01  UA reported */
        SDEV_EVT_LUN_CHANGE_REPORTED,                        /* 3F 0E  UA reported */
        SDEV_EVT_ALUA_STATE_CHANGE_REPORTED,                /* 2A 06  UA reported */
        SDEV_EVT_POWER_ON_RESET_OCCURRED,                /* 29 00  UA reported */

        SDEV_EVT_FIRST                = SDEV_EVT_MEDIA_CHANGE,
        SDEV_EVT_LAST                = SDEV_EVT_POWER_ON_RESET_OCCURRED,

        SDEV_EVT_MAXBITS        = SDEV_EVT_LAST + 1
};

/*
 * A single device event: its type plus list linkage. sdev_evt_alloc()
 * returns one of these and sdev_evt_send() accepts one (both declared later
 * in this header).
 * NOTE(review): @node presumably links the event into
 * scsi_device.event_list - confirm in the sender.
 */
struct scsi_event {
        enum scsi_device_event        evt_type;
        struct list_head        node;

        /* put union of data structures, for non-simple event types,
         * here
         */
};

/**
 * struct scsi_vpd - SCSI Vital Product Data
 * @rcu: For kfree_rcu().
 * @len: Length in bytes of @data.
 * @data: VPD data as defined in various T10 SCSI standard documents.
 */
struct scsi_vpd {
        struct rcu_head        rcu;
        int                len;
        unsigned char        data[];
};

/*
 * State describing one SCSI device.
 *
 * Two struct device objects are embedded (sdev_gendev and sdev_dev); the
 * to_scsi_device() and class_to_sdev() macros that follow this structure map
 * a pointer to either one back to the containing scsi_device. sdev_data[] is
 * a flexible array member and therefore has to remain the last field.
 */
struct scsi_device {
        struct Scsi_Host *host;
        struct request_queue *request_queue;

        /* the next two are protected by the host->host_lock */
        struct list_head    siblings;   /* list of all devices on this host */
        struct list_head    same_target_siblings; /* just the devices sharing same target id */

        struct sbitmap budget_map;
        atomic_t device_blocked;        /* Device returned QUEUE_FULL. */

        atomic_t restarts;
        spinlock_t list_lock;
        struct list_head starved_entry;
        unsigned short queue_depth;        /* How deep of a queue we want */
        unsigned short max_queue_depth;        /* max queue depth */
        unsigned short last_queue_full_depth; /* These two are used by */
        unsigned short last_queue_full_count; /* scsi_track_queue_full() */
        unsigned long last_queue_full_time;        /* last queue full time */
        unsigned long queue_ramp_up_period;        /* ramp up period in jiffies */
#define SCSI_DEFAULT_RAMP_UP_PERIOD        (120 * HZ)

        unsigned long last_queue_ramp_up;        /* last queue ramp up time */

        unsigned int id, channel;
        u64 lun;
        unsigned int manufacturer;        /* Manufacturer of device, for using
                                         * vendor-specific cmd's */
        unsigned sector_size;        /* size in bytes */

        void *hostdata;                /* available to low-level driver */
        unsigned char type;
        char scsi_level;
        char inq_periph_qual;        /* PQ from INQUIRY data */
        struct mutex inquiry_mutex;
        unsigned char inquiry_len;        /* valid bytes in 'inquiry' */
        unsigned char * inquiry;        /* INQUIRY response data */
        const char * vendor;                /* [back_compat] point into 'inquiry' ... */
        const char * model;                /* ... after scan; point to static string */
        const char * rev;                /* ... "nullnullnullnull" before scan */

#define SCSI_DEFAULT_VPD_LEN        255        /* default SCSI VPD page size (max) */
        /* Cached VPD pages; __rcu-annotated pointers to struct scsi_vpd. */
        struct scsi_vpd __rcu *vpd_pg0;
        struct scsi_vpd __rcu *vpd_pg83;
        struct scsi_vpd __rcu *vpd_pg80;
        struct scsi_vpd __rcu *vpd_pg89;
        struct scsi_vpd __rcu *vpd_pgb0;
        struct scsi_vpd __rcu *vpd_pgb1;
        struct scsi_vpd __rcu *vpd_pgb2;
        struct scsi_vpd __rcu *vpd_pgb7;

        struct scsi_target      *sdev_target;

        blist_flags_t                sdev_bflags; /* black/white flags as also found in
                                 * scsi_devinfo.[hc]. For now used only to
                                 * pass settings from slave_alloc to scsi
                                 * core. */
        unsigned int eh_timeout; /* Error handling timeout */

        /*
         * If true, let the high-level device driver (sd) manage the device
         * power state for system suspend/resume (suspend to RAM and
         * hibernation) operations.
         */
        unsigned manage_system_start_stop:1;

        /*
         * If true, let the high-level device driver (sd) manage the device
         * power state for runtime device suspend and resume operations.
         */
        unsigned manage_runtime_start_stop:1;

        /*
         * If true, let the high-level device driver (sd) manage the device
         * power state for system shutdown (power off) operations.
         */
        unsigned manage_shutdown:1;

        /*
         * If set and if the device is runtime suspended, ask the high-level
         * device driver (sd) to force a runtime resume of the device.
         */
        unsigned force_runtime_start_on_system_start:1;

        unsigned removable:1;
        unsigned changed:1;        /* Data invalid due to media change */
        unsigned busy:1;        /* Used to prevent races */
        unsigned lockable:1;        /* Able to prevent media removal */
        unsigned locked:1;      /* Media removal disabled */
        unsigned borken:1;        /* Tell the Seagate driver to be
                                 * painfully slow on this device */
        unsigned disconnect:1;        /* can disconnect */
        unsigned soft_reset:1;        /* Uses soft reset option */
        unsigned sdtr:1;        /* Device supports SDTR messages */
        unsigned wdtr:1;        /* Device supports WDTR messages */
        unsigned ppr:1;                /* Device supports PPR messages */
        unsigned tagged_supported:1;        /* Supports SCSI-II tagged queuing */
        unsigned simple_tags:1;        /* simple queue tag messages are enabled */
        unsigned was_reset:1;        /* There was a bus reset on the bus for
                                 * this device */
        unsigned expecting_cc_ua:1; /* Expecting a CHECK_CONDITION/UNIT_ATTN
                                     * because we did a bus reset. */
        unsigned use_10_for_rw:1; /* first try 10-byte read / write */
        unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */
        unsigned set_dbd_for_ms:1; /* Set "DBD" field in mode sense */
        unsigned read_before_ms:1;        /* perform a READ before MODE SENSE */
        unsigned no_report_opcodes:1;        /* no REPORT SUPPORTED OPERATION CODES */
        unsigned no_write_same:1;        /* no WRITE SAME command */
        unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */
        unsigned use_16_for_sync:1;        /* Use sync (16) over sync (10) */
        unsigned skip_ms_page_8:1;        /* do not use MODE SENSE page 0x08 */
        unsigned skip_ms_page_3f:1;        /* do not use MODE SENSE page 0x3f */
        unsigned skip_vpd_pages:1;        /* do not read VPD pages */
        unsigned try_vpd_pages:1;        /* attempt to read VPD pages */
        unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */
        unsigned no_start_on_add:1;        /* do not issue start on add */
        unsigned allow_restart:1; /* issue START_UNIT in error handler */
        unsigned start_stop_pwr_cond:1;        /* Set power cond. in START_STOP_UNIT */
        unsigned no_uld_attach:1; /* disable connecting to upper level drivers */
        unsigned select_no_atn:1;
        unsigned fix_capacity:1;        /* READ_CAPACITY is too high by 1 */
        unsigned guess_capacity:1;        /* READ_CAPACITY might be too high by 1 */
        unsigned retry_hwerror:1;        /* Retry HARDWARE_ERROR */
        unsigned last_sector_bug:1;        /* do not use multisector accesses on
                                           SD_LAST_BUGGY_SECTORS */
        unsigned no_read_disc_info:1;        /* Avoid READ_DISC_INFO cmds */
        unsigned no_read_capacity_16:1; /* Avoid READ_CAPACITY_16 cmds */
        unsigned try_rc_10_first:1;        /* Try READ_CAPACITY_10 first */
        unsigned security_supported:1;        /* Supports Security Protocols */
        unsigned is_visible:1;        /* is the device visible in sysfs */
        unsigned wce_default_on:1;        /* Cache is ON by default */
        unsigned no_dif:1;        /* T10 PI (DIF) should be disabled */
        unsigned broken_fua:1;                /* Don't set FUA bit */
        unsigned lun_in_cdb:1;                /* Store LUN bits in CDB[1] */
        unsigned unmap_limit_for_ws:1;        /* Use the UNMAP limit for WRITE SAME */
        unsigned rpm_autosuspend:1;        /* Enable runtime autosuspend at device
                                         * creation time */
        unsigned ignore_media_change:1; /* Ignore MEDIA CHANGE on resume */
        unsigned silence_suspend:1;        /* Do not print runtime PM related messages */
        unsigned no_vpd_size:1;                /* No VPD size reported in header */

        unsigned cdl_supported:1;        /* Command duration limits supported */
        unsigned cdl_enable:1;                /* Enable/disable Command duration limits */

        unsigned int queue_stopped;        /* request queue is quiesced */
        bool offline_already;                /* Device offline message logged */

        atomic_t disk_events_disable_depth; /* disable depth for disk events */

        /* Both bitmaps are SDEV_EVT_MAXBITS wide (see enum scsi_device_event). */
        DECLARE_BITMAP(supported_events, SDEV_EVT_MAXBITS); /* supported events */
        DECLARE_BITMAP(pending_events, SDEV_EVT_MAXBITS); /* pending events */
        struct list_head event_list;        /* asserted events */
        struct work_struct event_work;

        unsigned int max_device_blocked; /* what device_blocked counts down from  */
#define SCSI_DEFAULT_DEVICE_BLOCKED        3

        atomic_t iorequest_cnt;
        atomic_t iodone_cnt;
        atomic_t ioerr_cnt;
        atomic_t iotmo_cnt;

        /* Embedded devices; see to_scsi_device() and class_to_sdev(). */
        struct device                sdev_gendev,
                                sdev_dev;

        struct work_struct        requeue_work;

        struct scsi_device_handler *handler;
        void                        *handler_data;

        size_t                        dma_drain_len;
        void                        *dma_drain_buf;

        unsigned int                sg_timeout;
        unsigned int                sg_reserved_size;

        struct bsg_device        *bsg_dev;
        unsigned char                access_state;
        struct mutex                state_mutex;
        enum scsi_device_state sdev_state;
        struct task_struct        *quiesced_by;
        unsigned long                sdev_data[];        /* flexible array; must stay last */
} __attribute__((aligned(sizeof(unsigned long))));

/*
 * Map an embedded struct device back to its containing scsi_device:
 * to_scsi_device() for the sdev_gendev member, class_to_sdev() for the
 * sdev_dev member.
 */
#define        to_scsi_device(d)        \
        container_of(d, struct scsi_device, sdev_gendev)
#define        class_to_sdev(d)        \
        container_of(d, struct scsi_device, sdev_dev)
/*
 * Resolve the scsi_device from a device whose ->parent is the sdev_gendev.
 * The argument is parenthesized so that any pointer expression (e.g.
 * "p + 1" or "&dev") binds correctly before ->parent is applied.
 */
#define transport_class_to_sdev(class_dev) \
        to_scsi_device((class_dev)->parent)

/* dev_dbg() issued against the scsi_device's sdev_gendev. */
#define sdev_dbg(sdev, fmt, a...) \
        dev_dbg(&(sdev)->sdev_gendev, fmt, ##a)

/*
 * like scmd_printk, but the device name is passed in
 * as a string pointer
 */
__printf(4, 5) void
sdev_prefix_printk(const char *, const struct scsi_device *, const char *,
                const char *, ...);

#define sdev_printk(l, sdev, fmt, a...)                                \
        sdev_prefix_printk(l, sdev, NULL, fmt, ##a)

__printf(3, 4) void
scmd_printk(const char *, const struct scsi_cmnd *, const char *, ...);

/*
 * Debug print for a SCSI command via sdev_dbg() on (scmd)->device. When the
 * request queue of the command's request has a disk, the message is
 * prefixed with "[<disk_name>] "; otherwise it is printed without a prefix.
 */
#define scmd_dbg(scmd, fmt, a...)                                        \
        do {                                                                \
                struct request *__rq = scsi_cmd_to_rq((scmd));                \
                                                                        \
                if (__rq->q->disk)                                        \
                        sdev_dbg((scmd)->device, "[%s] " fmt,                \
                                 __rq->q->disk->disk_name, ##a);        \
                else                                                        \
                        sdev_dbg((scmd)->device, fmt, ##a);                \
        } while (0)

/*
 * Values stored in the state field of struct scsi_target. Numbering starts
 * at 1; no enumerator has the value 0.
 */
enum scsi_target_state {
        STARGET_CREATED = 1,
        STARGET_RUNNING,
        STARGET_REMOVE,
        STARGET_CREATED_REMOVE,
        STARGET_DEL,
};

/*
 * scsi_target: representation of a scsi target, for now, this is only
 * used for single_lun devices. If no one has active IO to the target,
 * starget_sdev_user is NULL, else it points to the active sdev.
 *
 * The embedded struct device (dev) is what to_scsi_target() maps back to
 * the containing scsi_target.
 */
struct scsi_target {
        struct scsi_device        *starget_sdev_user;
        struct list_head        siblings;
        struct list_head        devices;
        struct device                dev;
        struct kref                reap_ref; /* last put renders target invisible */
        unsigned int                channel;
        unsigned int                id; /* target id ... replace
                                     * scsi_device.id eventually */
        unsigned int                create:1; /* signal that it needs to be added */
        unsigned int                single_lun:1;        /* Indicates we should only
                                                 * allow I/O to one of the luns
                                                 * for the device at a time. */
        unsigned int                pdt_1f_for_no_lun:1;        /* PDT = 0x1f
                                                 * means no lun present. */
        unsigned int                no_report_luns:1;        /* Don't use
                                                 * REPORT LUNS for scanning. */
        unsigned int                expecting_lun_change:1;        /* A device has reported
                                                 * a 3F/0E UA, other devices on
                                                 * the same target will also. */
        /* commands actually active on LLD. */
        atomic_t                target_busy;
        atomic_t                target_blocked;

        /*
         * LLDs should set this in the slave_alloc host template callout.
         * If set to zero then there is no limit.
         */
        unsigned int                can_queue;
        unsigned int                max_target_blocked;
#define SCSI_DEFAULT_TARGET_BLOCKED        3

        char                        scsi_level;
        enum scsi_target_state        state;
        void                         *hostdata; /* available to low-level driver */
        unsigned long                starget_data[]; /* for the transport */
        /* starget_data must be the last element!!!! */
} __attribute__((aligned(sizeof(unsigned long))));

#define to_scsi_target(d)        container_of(d, struct scsi_target, dev)
/*
 * Return the scsi_target whose embedded device is the parent of @sdev's
 * sdev_gendev.
 */
static inline struct scsi_target *scsi_target(struct scsi_device *sdev)
{
        struct device *target_dev = sdev->sdev_gendev.parent;

        return to_scsi_target(target_dev);
}
/*
 * Resolve the scsi_target from a device whose ->parent is the target's
 * embedded device. The argument is parenthesized so that any pointer
 * expression (e.g. "p + 1" or "&dev") binds correctly before ->parent.
 */
#define transport_class_to_starget(class_dev) \
        to_scsi_target((class_dev)->parent)

#define starget_printk(prefix, starget, fmt, a...)        \
        dev_printk(prefix, &(starget)->dev, fmt, ##a)

extern struct scsi_device *__scsi_add_device(struct Scsi_Host *,
                uint, uint, u64, void *hostdata);
extern int scsi_add_device(struct Scsi_Host *host, uint channel,
                           uint target, u64 lun);
extern int scsi_register_device_handler(struct scsi_device_handler *scsi_dh);
extern void scsi_remove_device(struct scsi_device *);
extern int scsi_unregister_device_handler(struct scsi_device_handler *scsi_dh);
void scsi_attach_vpd(struct scsi_device *sdev);
void scsi_cdl_check(struct scsi_device *sdev);
int scsi_cdl_enable(struct scsi_device *sdev, bool enable);

extern struct scsi_device *scsi_device_from_queue(struct request_queue *q);
extern int __must_check scsi_device_get(struct scsi_device *);
extern void scsi_device_put(struct scsi_device *);
extern struct scsi_device *scsi_device_lookup(struct Scsi_Host *,
                                              uint, uint, u64);
extern struct scsi_device *__scsi_device_lookup(struct Scsi_Host *,
                                                uint, uint, u64);
extern struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *,
                                                        u64);
extern struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *,
                                                          u64);
extern void starget_for_each_device(struct scsi_target *, void *,
                     void (*fn)(struct scsi_device *, void *));
extern void __starget_for_each_device(struct scsi_target *, void *,
                                      void (*fn)(struct scsi_device *,
                                                 void *));

/* only exposed to implement shost_for_each_device */
extern struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *,
                                                  struct scsi_device *);

/**
 * shost_for_each_device - iterate over all devices of a host
 * @sdev: the &struct scsi_device to use as a cursor
 * @shost: the &struct scsi_host to iterate over
 *
 * Iterator that returns each device attached to @shost.  This loop
 * takes a reference on each device and releases it at the end.  If
 * you break out of the loop, you must call scsi_device_put(sdev).
 */
#define shost_for_each_device(sdev, shost) \
        for ((sdev) = __scsi_iterate_devices((shost), NULL); \
             (sdev); \
             (sdev) = __scsi_iterate_devices((shost), (sdev)))

/**
 * __shost_for_each_device - iterate over all devices of a host (UNLOCKED)
 * @sdev: the &struct scsi_device to use as a cursor
 * @shost: the &struct scsi_host to iterate over
 *
 * Iterator that returns each device attached to @shost.  It does _not_
 * take a reference on the scsi_device, so the whole loop must be
 * protected by shost->host_lock.
 *
 * Note: The only reason to use this is because you need to access the
 * device list in interrupt context.  Otherwise you really want to use
 * shost_for_each_device instead.
 */
#define __shost_for_each_device(sdev, shost) \
        list_for_each_entry((sdev), &((shost)->__devices), siblings)

extern int scsi_change_queue_depth(struct scsi_device *, int);
extern int scsi_track_queue_full(struct scsi_device *, int);

extern int scsi_set_medium_removal(struct scsi_device *, char);

int scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage,
                    int subpage, unsigned char *buffer, int len, int timeout,
                    int retries, struct scsi_mode_data *data,
                    struct scsi_sense_hdr *);
extern int scsi_mode_select(struct scsi_device *sdev, int pf, int sp,
                            unsigned char *buffer, int len, int timeout,
                            int retries, struct scsi_mode_data *data,
                            struct scsi_sense_hdr *);
extern int scsi_test_unit_ready(struct scsi_device *sdev, int timeout,
                                int retries, struct scsi_sense_hdr *sshdr);
extern int scsi_get_vpd_page(struct scsi_device *, u8 page, unsigned char *buf,
                             int buf_len);
int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer,
                       unsigned int len, unsigned char opcode,
                       unsigned short sa);
extern int scsi_device_set_state(struct scsi_device *sdev,
                                 enum scsi_device_state state);
extern struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type,
                                          gfp_t gfpflags);
extern void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt);
extern void sdev_evt_send_simple(struct scsi_device *sdev,
                          enum scsi_device_event evt_type, gfp_t gfpflags);
extern int scsi_device_quiesce(struct scsi_device *sdev);
extern void scsi_device_resume(struct scsi_device *sdev);
extern void scsi_target_quiesce(struct scsi_target *);
extern void scsi_target_resume(struct scsi_target *);
extern void scsi_scan_target(struct device *parent, unsigned int channel,
                             unsigned int id, u64 lun,
                             enum scsi_scan_mode rescan);
extern void scsi_target_reap(struct scsi_target *);
void scsi_block_targets(struct Scsi_Host *shost, struct device *dev);
extern void scsi_target_unblock(struct device *, enum scsi_device_state);
extern void scsi_remove_target(struct device *);
extern const char *scsi_device_state_name(enum scsi_device_state);
extern int scsi_is_sdev_device(const struct device *);
extern int scsi_is_target_device(const struct device *);
extern void scsi_sanitize_inquiry_string(unsigned char *s, int len);

/*
 * scsi_execute_cmd users can set scsi_failure.result to have
 * scsi_check_passthrough fail/retry a command. scsi_failure.result can be a
 * specific host byte or message code, or SCMD_FAILURE_RESULT_ANY can be used
 * to match any host or message code.
 */
#define SCMD_FAILURE_RESULT_ANY        0x7fffffff
/*
 * Set scsi_failure.result to SCMD_FAILURE_STAT_ANY to fail/retry any failure
 * scsi_status_is_good returns false for.
 */
#define SCMD_FAILURE_STAT_ANY        0xff
/*
 * The following can be set to the scsi_failure sense, asc and ascq fields to
 * match on any sense, ASC, or ASCQ value.
 */
#define SCMD_FAILURE_SENSE_ANY        0xff
#define SCMD_FAILURE_ASC_ANY        0xff
#define SCMD_FAILURE_ASCQ_ANY        0xff
/* Always retry a matching failure. */
#define SCMD_FAILURE_NO_LIMIT        -1

/*
 * One failure pattern that scsi_execute_cmd() may retry. @result, @sense,
 * @asc and @ascq select what to match; the SCMD_FAILURE_*_ANY constants
 * defined above act as wildcards for them.
 */
struct scsi_failure {
        int result;
        u8 sense;
        u8 asc;
        u8 ascq;
        /*
         * Number of times scsi_execute_cmd will retry the failure. It does
         * not count for the total_allowed.
         */
        s8 allowed;
        /* Number of times the failure has been retried. */
        s8 retries;
};

/*
 * A set of scsi_failure patterns plus a shared retry budget; referenced from
 * scsi_exec_args.failures and reset with scsi_failures_reset_retries().
 * NOTE(review): how the failure_definitions array is terminated is not
 * visible here - confirm in the code that walks it.
 */
struct scsi_failures {
        /*
         * If a scsi_failure does not have a retry limit setup this limit will
         * be used.
         */
        int total_allowed;
        int total_retries;
        struct scsi_failure *failure_definitions;
};

/*
 * Optional arguments to scsi_execute_cmd; passed to it by const pointer as
 * its last parameter.
 */
struct scsi_exec_args {
        unsigned char *sense;                /* sense buffer */
        unsigned int sense_len;                /* sense buffer len */
        struct scsi_sense_hdr *sshdr;        /* decoded sense header */
        blk_mq_req_flags_t req_flags;        /* BLK_MQ_REQ flags */
        int scmd_flags;                        /* SCMD flags */
        int *resid;                        /* residual length */
        struct scsi_failures *failures;        /* failures to retry */
};

int scsi_execute_cmd(struct scsi_device *sdev, const unsigned char *cmd,
                     blk_opf_t opf, void *buffer, unsigned int bufflen,
                     int timeout, int retries,
                     const struct scsi_exec_args *args);
void scsi_failures_reset_retries(struct scsi_failures *failures);

extern void sdev_disable_disk_events(struct scsi_device *sdev);
extern void sdev_enable_disk_events(struct scsi_device *sdev);
extern int scsi_vpd_lun_id(struct scsi_device *, char *, size_t);
extern int scsi_vpd_tpg_id(struct scsi_device *, int *);

/*
 * Autopm helpers. With CONFIG_PM the real implementations are declared
 * here and defined elsewhere; without it they collapse to inline stubs
 * (get returns 0, put does nothing).
 */
#ifdef CONFIG_PM
extern int scsi_autopm_get_device(struct scsi_device *);
extern void scsi_autopm_put_device(struct scsi_device *);
#else
static inline int scsi_autopm_get_device(struct scsi_device *d) { return 0; }
static inline void scsi_autopm_put_device(struct scsi_device *d) {}
#endif /* CONFIG_PM */

/*
 * Reprobe the device by calling device_reprobe() on its embedded
 * sdev_gendev. The return value is passed through and, because of
 * __must_check, has to be examined by the caller.
 */
static inline int __must_check scsi_device_reprobe(struct scsi_device *sdev)
{
        return device_reprobe(&sdev->sdev_gendev);
}

/* Return the channel field of @sdev. */
static inline unsigned int sdev_channel(struct scsi_device *sdev)
{
        return sdev->channel;
}

/* Return the id field of @sdev. */
static inline unsigned int sdev_id(struct scsi_device *sdev)
{
        return sdev->id;
}

/* Same accessors as sdev_id()/sdev_channel(), reached through a command's device pointer. */
#define scmd_id(scmd) sdev_id((scmd)->device)
#define scmd_channel(scmd) sdev_channel((scmd)->device)

/*
 * checks for positions of the SCSI state machine
 */
/*
 * Returns 0 when the device state is SDEV_OFFLINE, SDEV_TRANSPORT_OFFLINE
 * or SDEV_DEL, and 1 for every other state.
 */
static inline int scsi_device_online(struct scsi_device *sdev)
{
        switch (sdev->sdev_state) {
        case SDEV_OFFLINE:
        case SDEV_TRANSPORT_OFFLINE:
        case SDEV_DEL:
                return 0;
        default:
                return 1;
        }
}
/* Returns 1 when the device state is SDEV_BLOCK or SDEV_CREATED_BLOCK, else 0. */
static inline int scsi_device_blocked(struct scsi_device *sdev)
{
        switch (sdev->sdev_state) {
        case SDEV_BLOCK:
        case SDEV_CREATED_BLOCK:
                return 1;
        default:
                return 0;
        }
}
/* Returns 1 when the device state is SDEV_CREATED or SDEV_CREATED_BLOCK, else 0. */
static inline int scsi_device_created(struct scsi_device *sdev)
{
        switch (sdev->sdev_state) {
        case SDEV_CREATED:
        case SDEV_CREATED_BLOCK:
                return 1;
        default:
                return 0;
        }
}

/*
 * Block/unblock entry points; prototypes only, defined elsewhere.
 * The unblock variant takes the state the device should move to.
 */
int scsi_internal_device_block_nowait(struct scsi_device *sdev);
int scsi_internal_device_unblock_nowait(struct scsi_device *sdev,
                                        enum scsi_device_state new_state);

/* accessor functions for the SCSI parameters */
/* Return the sdtr field of @sdev (presumably "synchronous transfers supported" - confirm). */
static inline int scsi_device_sync(struct scsi_device *sdev)
{
        return sdev->sdtr;
}
/* Return the wdtr field of @sdev (presumably "wide transfers supported" - confirm). */
static inline int scsi_device_wide(struct scsi_device *sdev)
{
        return sdev->wdtr;
}
/* Return the ppr field of @sdev. */
static inline int scsi_device_dt(struct scsi_device *sdev)
{
        return sdev->ppr;
}
/*
 * Returns 1 when bits 0x0c of inquiry byte 56 equal 0x04, else 0.
 * Devices whose inquiry data is shorter than 57 bytes always yield 0.
 */
static inline int scsi_device_dt_only(struct scsi_device *sdev)
{
        /* byte 56 only exists when at least 57 bytes were returned */
        if (sdev->inquiry_len >= 57)
                return (sdev->inquiry[56] & 0x0c) == 0x04;

        return 0;
}
/*
 * Returns bit 0x01 of inquiry byte 56 (non-zero when set).
 * Devices whose inquiry data is shorter than 57 bytes always yield 0.
 */
static inline int scsi_device_ius(struct scsi_device *sdev)
{
        /* byte 56 only exists when at least 57 bytes were returned */
        if (sdev->inquiry_len >= 57)
                return sdev->inquiry[56] & 0x01;

        return 0;
}
/*
 * Returns bit 0x02 of inquiry byte 56 (non-zero when set).
 * Devices whose inquiry data is shorter than 57 bytes always yield 0.
 */
static inline int scsi_device_qas(struct scsi_device *sdev)
{
        /* byte 56 only exists when at least 57 bytes were returned */
        if (sdev->inquiry_len >= 57)
                return sdev->inquiry[56] & 0x02;

        return 0;
}
/*
 * Returns bit 6 of inquiry byte 6 (non-zero when set).
 * When no inquiry data is attached to the device the result is 1.
 */
static inline int scsi_device_enclosure(struct scsi_device *sdev)
{
        if (!sdev->inquiry)
                return 1;

        return sdev->inquiry[6] & (1<<6);
}

/*
 * Returns 1 when the device is above SCSI_2 and bit 0 of inquiry byte 5
 * is set, else 0. Devices flagged no_dif always yield 0.
 *
 * NOTE(review): inquiry is dereferenced without the NULL check that
 * scsi_device_enclosure()/scsi_device_tpgs() perform - confirm callers
 * guarantee it is populated.
 */
static inline int scsi_device_protection(struct scsi_device *sdev)
{
        if (sdev->no_dif)
                return 0;

        if (sdev->scsi_level <= SCSI_2)
                return 0;

        return (sdev->inquiry[5] & (1<<0)) != 0;
}

/*
 * Returns the two-bit value held in bits 4-5 of inquiry byte 5,
 * or 0 when no inquiry data is attached to the device.
 */
static inline int scsi_device_tpgs(struct scsi_device *sdev)
{
        if (!sdev->inquiry)
                return 0;

        return (sdev->inquiry[5] >> 4) & 0x3;
}

/**
 * scsi_device_supports_vpd - test if a device supports VPD pages
 * @sdev: the &struct scsi_device to test
 *
 * If the 'try_vpd_pages' flag is set it takes precedence.
 * Otherwise we will assume VPD pages are supported if the
 * SCSI level is at least SPC-2 (SCSI_SPC_2) and 'skip_vpd_pages'
 * is not set.
 *
 * Return: 1 if VPD pages should be requested, 0 otherwise.
 */
static inline int scsi_device_supports_vpd(struct scsi_device *sdev)
{
        /* Attempt VPD inquiry if the device blacklist explicitly calls
         * for it.
         */
        if (sdev->try_vpd_pages)
                return 1;
        /*
         * Although VPD inquiries can go to SCSI-2 type devices,
         * some USB ones crash on receiving them, and the pages
         * we currently ask for are mandatory for SPC-2 and beyond
         */
        if (sdev->scsi_level >= SCSI_SPC_2 && !sdev->skip_vpd_pages)
                return 1;
        return 0;
}

/* Return the number of bits currently set in the device's budget_map. */
static inline int scsi_device_busy(struct scsi_device *sdev)
{
        return sbitmap_weight(&sdev->budget_map);
}

/*
 * Module alias helpers. MODULE_ALIAS_SCSI_DEVICE builds a "scsi:t-<type>*"
 * alias from the stringified type; SCSI_DEVICE_MODALIAS_FMT is the matching
 * printf format, which renders the type as two-digit hex.
 */
#define MODULE_ALIAS_SCSI_DEVICE(type) \
        MODULE_ALIAS("scsi:t-" __stringify(type) "*")
#define SCSI_DEVICE_MODALIAS_FMT "scsi:t-0x%02x"

#endif /* _SCSI_SCSI_DEVICE_H */




































































































































































   15 













































   15 































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM jbd2

#if !defined(_TRACE_JBD2_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_JBD2_H

#include <linux/jbd2.h>
#include <linux/tracepoint.h>

struct transaction_chp_stats_s;
struct transaction_run_stats_s;

/*
 * jbd2_checkpoint: records the journal's filesystem device number
 * (journal->j_fs_dev->bd_dev) and the caller-supplied result value.
 */
TRACE_EVENT(jbd2_checkpoint,

        TP_PROTO(journal_t *journal, int result),

        TP_ARGS(journal, result),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        int,        result                        )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->result                = result;
        ),

        TP_printk("dev %d,%d result %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->result)
);

/*
 * jbd2_commit event class: shared layout for the commit-phase events
 * defined below. Captures the device number, the transaction's
 * t_synchronous_commit flag and its tid.
 */
DECLARE_EVENT_CLASS(jbd2_commit,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        char,        sync_commit                  )
                __field(        tid_t,        transaction                  )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->sync_commit = commit_transaction->t_synchronous_commit;
                __entry->transaction        = commit_transaction->t_tid;
        ),

        TP_printk("dev %d,%d transaction %u sync %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->transaction, __entry->sync_commit)
);

/* jbd2_start_commit: instance of the jbd2_commit event class. */
DEFINE_EVENT(jbd2_commit, jbd2_start_commit,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/* jbd2_commit_locking: instance of the jbd2_commit event class. */
DEFINE_EVENT(jbd2_commit, jbd2_commit_locking,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/* jbd2_commit_flushing: instance of the jbd2_commit event class. */
DEFINE_EVENT(jbd2_commit, jbd2_commit_flushing,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/* jbd2_commit_logging: instance of the jbd2_commit event class. */
DEFINE_EVENT(jbd2_commit, jbd2_commit_logging,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/* jbd2_drop_transaction: instance of the jbd2_commit event class. */
DEFINE_EVENT(jbd2_commit, jbd2_drop_transaction,

        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction)
);

/*
 * jbd2_end_commit: same data as the jbd2_commit class plus a "head" value.
 *
 * NOTE(review): the field is named "head" but is filled from
 * journal->j_tail_sequence - confirm the naming is intentional.
 */
TRACE_EVENT(jbd2_end_commit,
        TP_PROTO(journal_t *journal, transaction_t *commit_transaction),

        TP_ARGS(journal, commit_transaction),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        char,        sync_commit                  )
                __field(        tid_t,        transaction                  )
                __field(        tid_t,        head                            )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->sync_commit = commit_transaction->t_synchronous_commit;
                __entry->transaction        = commit_transaction->t_tid;
                __entry->head                = journal->j_tail_sequence;
        ),

        TP_printk("dev %d,%d transaction %u sync %d head %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->transaction, __entry->sync_commit, __entry->head)
);

/*
 * jbd2_submit_inode_data: records the superblock device number and the
 * inode number of @inode. The inode number is printed as unsigned long.
 */
TRACE_EVENT(jbd2_submit_inode_data,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * jbd2_handle_start_class: shared layout for the handle start/restart
 * events. All five arguments are stored verbatim; the caller passes the
 * device number directly rather than a journal pointer.
 */
DECLARE_EVENT_CLASS(jbd2_handle_start_class,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int requested_blocks),

        TP_ARGS(dev, tid, type, line_no, requested_blocks),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(         unsigned int,        type                )
                __field(         unsigned int,        line_no                )
                __field(                  int,        requested_blocks)
        ),

        TP_fast_assign(
                __entry->dev                  = dev;
                __entry->tid                  = tid;
                __entry->type                  = type;
                __entry->line_no          = line_no;
                __entry->requested_blocks = requested_blocks;
        ),

        TP_printk("dev %d,%d tid %u type %u line_no %u "
                  "requested_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  __entry->type, __entry->line_no, __entry->requested_blocks)
);

/* jbd2_handle_start: instance of jbd2_handle_start_class. */
DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_start,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int requested_blocks),

        TP_ARGS(dev, tid, type, line_no, requested_blocks)
);

/* jbd2_handle_restart: instance of jbd2_handle_start_class. */
DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_restart,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int requested_blocks),

        TP_ARGS(dev, tid, type, line_no, requested_blocks)
);

/*
 * jbd2_handle_extend: like the handle start class, with an additional
 * buffer_credits value. All six arguments are stored verbatim.
 */
TRACE_EVENT(jbd2_handle_extend,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int buffer_credits,
                 int requested_blocks),

        TP_ARGS(dev, tid, type, line_no, buffer_credits, requested_blocks),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(         unsigned int,        type                )
                __field(         unsigned int,        line_no                )
                __field(                  int,        buffer_credits  )
                __field(                  int,        requested_blocks)
        ),

        TP_fast_assign(
                __entry->dev                  = dev;
                __entry->tid                  = tid;
                __entry->type                  = type;
                __entry->line_no          = line_no;
                __entry->buffer_credits   = buffer_credits;
                __entry->requested_blocks = requested_blocks;
        ),

        TP_printk("dev %d,%d tid %u type %u line_no %u "
                  "buffer_credits %d requested_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  __entry->type, __entry->line_no, __entry->buffer_credits,
                  __entry->requested_blocks)
);

/*
 * jbd2_handle_stats: per-handle statistics. Stores the device, tid, type
 * and line number together with the interval, sync flag and the
 * requested/dirtied block counts, all exactly as passed in.
 */
TRACE_EVENT(jbd2_handle_stats,
        TP_PROTO(dev_t dev, tid_t tid, unsigned int type,
                 unsigned int line_no, int interval, int sync,
                 int requested_blocks, int dirtied_blocks),

        TP_ARGS(dev, tid, type, line_no, interval, sync,
                requested_blocks, dirtied_blocks),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(         unsigned int,        type                )
                __field(         unsigned int,        line_no                )
                __field(                  int,        interval        )
                __field(                  int,        sync                )
                __field(                  int,        requested_blocks)
                __field(                  int,        dirtied_blocks        )
        ),

        TP_fast_assign(
                __entry->dev                  = dev;
                __entry->tid                  = tid;
                __entry->type                  = type;
                __entry->line_no          = line_no;
                __entry->interval          = interval;
                __entry->sync                  = sync;
                __entry->requested_blocks = requested_blocks;
                __entry->dirtied_blocks          = dirtied_blocks;
        ),

        TP_printk("dev %d,%d tid %u type %u line_no %u interval %d "
                  "sync %d requested_blocks %d dirtied_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  __entry->type, __entry->line_no, __entry->interval,
                  __entry->sync, __entry->requested_blocks,
                  __entry->dirtied_blocks)
);

/*
 * jbd2_run_stats: snapshot of a transaction_run_stats_s.
 *
 * The six timing fields (wait, request_delay, running, locked, flushing,
 * logging) are stored as the raw rs_* values and only converted with
 * jiffies_to_msecs() when the event is printed. The three counters
 * (handle_count, blocks, blocks_logged) are stored and printed as-is.
 */
TRACE_EVENT(jbd2_run_stats,
        TP_PROTO(dev_t dev, tid_t tid,
                 struct transaction_run_stats_s *stats),

        TP_ARGS(dev, tid, stats),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(        unsigned long,        wait                )
                __field(        unsigned long,        request_delay        )
                __field(        unsigned long,        running                )
                __field(        unsigned long,        locked                )
                __field(        unsigned long,        flushing        )
                __field(        unsigned long,        logging                )
                __field(                __u32,        handle_count        )
                __field(                __u32,        blocks                )
                __field(                __u32,        blocks_logged        )
        ),

        TP_fast_assign(
                __entry->dev                = dev;
                __entry->tid                = tid;
                __entry->wait                = stats->rs_wait;
                __entry->request_delay        = stats->rs_request_delay;
                __entry->running        = stats->rs_running;
                __entry->locked                = stats->rs_locked;
                __entry->flushing        = stats->rs_flushing;
                __entry->logging        = stats->rs_logging;
                __entry->handle_count        = stats->rs_handle_count;
                __entry->blocks                = stats->rs_blocks;
                __entry->blocks_logged        = stats->rs_blocks_logged;
        ),

        TP_printk("dev %d,%d tid %u wait %u request_delay %u running %u "
                  "locked %u flushing %u logging %u handle_count %u "
                  "blocks %u blocks_logged %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  jiffies_to_msecs(__entry->wait),
                  jiffies_to_msecs(__entry->request_delay),
                  jiffies_to_msecs(__entry->running),
                  jiffies_to_msecs(__entry->locked),
                  jiffies_to_msecs(__entry->flushing),
                  jiffies_to_msecs(__entry->logging),
                  __entry->handle_count, __entry->blocks,
                  __entry->blocks_logged)
);

/*
 * jbd2_checkpoint_stats: snapshot of a transaction_chp_stats_s.
 * chp_time is stored as the raw cs_chp_time value and converted with
 * jiffies_to_msecs() at print time; the counters are stored as-is.
 */
TRACE_EVENT(jbd2_checkpoint_stats,
        TP_PROTO(dev_t dev, tid_t tid,
                 struct transaction_chp_stats_s *stats),

        TP_ARGS(dev, tid, stats),

        TP_STRUCT__entry(
                __field(                dev_t,        dev                )
                __field(                tid_t,        tid                )
                __field(        unsigned long,        chp_time        )
                __field(                __u32,        forced_to_close        )
                __field(                __u32,        written                )
                __field(                __u32,        dropped                )
        ),

        TP_fast_assign(
                __entry->dev                = dev;
                __entry->tid                = tid;
                __entry->chp_time        = stats->cs_chp_time;
                __entry->forced_to_close= stats->cs_forced_to_close;
                __entry->written        = stats->cs_written;
                __entry->dropped        = stats->cs_dropped;
        ),

        TP_printk("dev %d,%d tid %u chp_time %u forced_to_close %u "
                  "written %u dropped %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
                  jiffies_to_msecs(__entry->chp_time),
                  __entry->forced_to_close, __entry->written, __entry->dropped)
);

/*
 * jbd2_update_log_tail: records the journal's current j_tail_sequence
 * (printed as "from"), the supplied first_tid (printed as "to"), the
 * block number (printed as "offset") and the freed count.
 */
TRACE_EVENT(jbd2_update_log_tail,

        TP_PROTO(journal_t *journal, tid_t first_tid,
                 unsigned long block_nr, unsigned long freed),

        TP_ARGS(journal, first_tid, block_nr, freed),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        tid_t,        tail_sequence                )
                __field(        tid_t,        first_tid                )
                __field(unsigned long,        block_nr                )
                __field(unsigned long,        freed                        )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->tail_sequence        = journal->j_tail_sequence;
                __entry->first_tid        = first_tid;
                __entry->block_nr        = block_nr;
                __entry->freed                = freed;
        ),

        TP_printk("dev %d,%d from %u to %u offset %lu freed %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->tail_sequence, __entry->first_tid,
                  __entry->block_nr, __entry->freed)
);

/*
 * jbd2_write_superblock: records the device number and the blk_opf_t
 * write flags. The flags are force-cast to u32 and printed in hex.
 */
TRACE_EVENT(jbd2_write_superblock,

        TP_PROTO(journal_t *journal, blk_opf_t write_flags),

        TP_ARGS(journal, write_flags),

        TP_STRUCT__entry(
                __field(        dev_t,  dev                        )
                __field(    blk_opf_t,  write_flags                )
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->write_flags        = write_flags;
        ),

        TP_printk("dev %d,%d write_flags %x", MAJOR(__entry->dev),
                  MINOR(__entry->dev), (__force u32)__entry->write_flags)
);

/*
 * jbd2_lock_buffer_stall: records a device number and a stall duration.
 * The duration is stored and printed as given; the parameter name
 * indicates milliseconds.
 */
TRACE_EVENT(jbd2_lock_buffer_stall,

        TP_PROTO(dev_t dev, unsigned long stall_ms),

        TP_ARGS(dev, stall_ms),

        TP_STRUCT__entry(
                __field(        dev_t, dev        )
                __field(unsigned long, stall_ms        )
        ),

        TP_fast_assign(
                __entry->dev                = dev;
                __entry->stall_ms        = stall_ms;
        ),

        TP_printk("dev %d,%d stall_ms %lu",
                MAJOR(__entry->dev), MINOR(__entry->dev),
                __entry->stall_ms)
);

/*
 * jbd2_journal_shrink event class: shared layout for the shrink count
 * and scan-enter events. Stores the device number plus the nr_to_scan
 * and count values as passed in.
 */
DECLARE_EVENT_CLASS(jbd2_journal_shrink,

        TP_PROTO(journal_t *journal, unsigned long nr_to_scan,
                 unsigned long count),

        TP_ARGS(journal, nr_to_scan, count),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(unsigned long, nr_to_scan)
                __field(unsigned long, count)
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->nr_to_scan        = nr_to_scan;
                __entry->count                = count;
        ),

        TP_printk("dev %d,%d nr_to_scan %lu count %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nr_to_scan, __entry->count)
);

/* jbd2_shrink_count: instance of the jbd2_journal_shrink event class. */
DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count,

        TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count),

        TP_ARGS(journal, nr_to_scan, count)
);

/* jbd2_shrink_scan_enter: instance of the jbd2_journal_shrink event class. */
DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter,

        TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count),

        TP_ARGS(journal, nr_to_scan, count)
);

/*
 * jbd2_shrink_scan_exit: same data as the jbd2_journal_shrink class
 * plus nr_shrunk, so it is declared as a standalone event.
 */
TRACE_EVENT(jbd2_shrink_scan_exit,

        TP_PROTO(journal_t *journal, unsigned long nr_to_scan,
                 unsigned long nr_shrunk, unsigned long count),

        TP_ARGS(journal, nr_to_scan, nr_shrunk, count),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(unsigned long, nr_to_scan)
                __field(unsigned long, nr_shrunk)
                __field(unsigned long, count)
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->nr_to_scan        = nr_to_scan;
                __entry->nr_shrunk        = nr_shrunk;
                __entry->count                = count;
        ),

        TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nr_to_scan, __entry->nr_shrunk,
                  __entry->count)
);

/*
 * jbd2_shrink_checkpoint_list: records the device number, four
 * transaction ids (first_tid, tid, last_tid, next_tid) and nr_freed.
 * Printed as "shrink transaction <first_tid>-<tid>(<last_tid>) freed
 * <nr_freed> next transaction <next_tid>".
 */
TRACE_EVENT(jbd2_shrink_checkpoint_list,

        TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid,
                 unsigned long nr_freed, tid_t next_tid),

        TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, next_tid),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(tid_t, first_tid)
                __field(tid_t, tid)
                __field(tid_t, last_tid)
                __field(unsigned long, nr_freed)
                __field(tid_t, next_tid)
        ),

        TP_fast_assign(
                __entry->dev                = journal->j_fs_dev->bd_dev;
                __entry->first_tid        = first_tid;
                __entry->tid                = tid;
                __entry->last_tid        = last_tid;
                __entry->nr_freed        = nr_freed;
                __entry->next_tid        = next_tid;
        ),

        TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu "
                  "next transaction %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->first_tid, __entry->tid, __entry->last_tid,
                  __entry->nr_freed, __entry->next_tid)
);

#endif /* _TRACE_JBD2_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
























































    2 
























































































































































































































































































































    2 




















































































































































































































































































































































































































































































    2 
    2 


    2 





    2 












    2 
    2 





    2 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
// SPDX-License-Identifier: GPL-2.0
/*
 * device_cgroup.c - device cgroup subsystem
 *
 * Copyright 2007 IBM Corp
 */

#include <linux/bpf-cgroup.h>
#include <linux/device_cgroup.h>
#include <linux/cgroup.h>
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>

#ifdef CONFIG_CGROUP_DEVICE

/* Protects the per-cgroup exception lists; see the locking rules below. */
static DEFINE_MUTEX(devcgroup_mutex);

/*
 * Value stored in dev_cgroup.behavior.
 * NOTE(review): presumably the default policy that the exception list
 * is applied against - confirm where behavior is consumed.
 */
enum devcg_behavior {
        DEVCG_DEFAULT_NONE,
        DEVCG_DEFAULT_ALLOW,
        DEVCG_DEFAULT_DENY,
};

/*
 * exception list locking rules:
 * hold devcgroup_mutex for update/read.
 * hold rcu_read_lock() for read.
 */

/* One entry on a dev_cgroup's exception list. */
struct dev_exception_item {
        /* device numbers; entries are matched on type + major + minor */
        u32 major, minor;
        short type;
        /* bitmask: merged with |= on add, cleared with &= ~ on remove */
        short access;
        /* linkage on dev_cgroup.exceptions */
        struct list_head list;
        /* used by kfree_rcu() when the entry is removed */
        struct rcu_head rcu;
};

/* Per-cgroup state of the device controller. */
struct dev_cgroup {
        /* embedded css; css_to_devcgroup() maps back via container_of() */
        struct cgroup_subsys_state css;
        /* list of struct dev_exception_item, see locking rules above */
        struct list_head exceptions;
        enum devcg_behavior behavior;
};

/*
 * Map a cgroup_subsys_state to the dev_cgroup that embeds it.
 * A NULL @s is passed through as NULL.
 */
static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
{
        if (!s)
                return NULL;

        return container_of(s, struct dev_cgroup, css);
}

/* Look up the dev_cgroup @task belongs to via its devices css. */
static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
{
        struct cgroup_subsys_state *css = task_css(task, devices_cgrp_id);

        return css_to_devcgroup(css);
}

/*
 * Duplicate every entry of @orig onto the tail of @dest.
 *
 * Returns 0 on success. If an allocation fails, every entry currently
 * on @dest is unlinked and freed and -ENOMEM is returned.
 *
 * called under devcgroup_mutex
 */
static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig)
{
        struct dev_exception_item *src, *item, *next;

        lockdep_assert_held(&devcgroup_mutex);

        list_for_each_entry(src, orig, list) {
                item = kmemdup(src, sizeof(*src), GFP_KERNEL);
                if (item) {
                        list_add_tail(&item->list, dest);
                        continue;
                }

                /* allocation failed: tear down whatever is on dest */
                list_for_each_entry_safe(item, next, dest, list) {
                        list_del(&item->list);
                        kfree(item);
                }
                return -ENOMEM;
        }

        return 0;
}

static void dev_exceptions_move(struct list_head *dest, struct list_head *orig)
{
        struct dev_exception_item *ex, *tmp;

        lockdep_assert_held(&devcgroup_mutex);

        list_for_each_entry_safe(ex, tmp, orig, list) {
                list_move_tail(&ex->list, dest);
        }
}

/*
 * called under devcgroup_mutex
 */
static int dev_exception_add(struct dev_cgroup *dev_cgroup,
                             struct dev_exception_item *ex)
{
        struct dev_exception_item *excopy, *walk;

        lockdep_assert_held(&devcgroup_mutex);

        excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
        if (!excopy)
                return -ENOMEM;

        list_for_each_entry(walk, &dev_cgroup->exceptions, list) {
                if (walk->type != ex->type)
                        continue;
                if (walk->major != ex->major)
                        continue;
                if (walk->minor != ex->minor)
                        continue;

                walk->access |= ex->access;
                kfree(excopy);
                excopy = NULL;
        }

        if (excopy != NULL)
                list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions);
        return 0;
}

/*
 * Clear the access bits of @ex from every entry of @dev_cgroup's exception
 * list that has the same type, major and minor. An entry left without any
 * access bit is unlinked with list_del_rcu() and released via kfree_rcu().
 *
 * called under devcgroup_mutex
 */
static void dev_exception_rm(struct dev_cgroup *dev_cgroup,
                             struct dev_exception_item *ex)
{
        struct dev_exception_item *cur, *next;

        lockdep_assert_held(&devcgroup_mutex);

        list_for_each_entry_safe(cur, next, &dev_cgroup->exceptions, list) {
                bool same_dev = cur->type == ex->type &&
                                cur->major == ex->major &&
                                cur->minor == ex->minor;

                if (!same_dev)
                        continue;

                cur->access &= ~ex->access;
                if (cur->access)
                        continue;

                list_del_rcu(&cur->list);
                kfree_rcu(cur, rcu);
        }
}

/*
 * Unlink every entry of @dev_cgroup's exception list and hand it to
 * kfree_rcu(). Does not check that devcgroup_mutex is held.
 */
static void __dev_exception_clean(struct dev_cgroup *dev_cgroup)
{
        struct list_head *pos, *next;

        list_for_each_safe(pos, next, &dev_cgroup->exceptions) {
                struct dev_exception_item *item;

                item = container_of(pos, struct dev_exception_item, list);
                list_del_rcu(&item->list);
                kfree_rcu(item, rcu);
        }
}

/**
 * dev_exception_clean - frees all entries of the exception list
 * @dev_cgroup: dev_cgroup with the exception list to be cleaned
 *
 * Lock-checked wrapper around __dev_exception_clean().
 *
 * called under devcgroup_mutex
 */
static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
{
        lockdep_assert_held(&devcgroup_mutex);

        __dev_exception_clean(dev_cgroup);
}

static inline bool is_devcg_online(const struct dev_cgroup *devcg)
{
        return (devcg->behavior != DEVCG_DEFAULT_NONE);
}

/**
 * devcgroup_online - set up behavior and exceptions of a css going online
 * @css: css getting online
 *
 * A css without a parent starts with the default-allow behavior. Any other
 * css receives a copy of its parent's exception list and, if the copy
 * succeeded, the parent's behavior.
 *
 * Return: 0 in case of success, error code otherwise
 */
static int devcgroup_online(struct cgroup_subsys_state *css)
{
        struct dev_cgroup *child = css_to_devcgroup(css);
        struct dev_cgroup *parent = css_to_devcgroup(css->parent);
        int err = 0;

        mutex_lock(&devcgroup_mutex);

        if (!parent) {
                child->behavior = DEVCG_DEFAULT_ALLOW;
                goto unlock;
        }

        err = dev_exceptions_copy(&child->exceptions, &parent->exceptions);
        if (err == 0)
                child->behavior = parent->behavior;

unlock:
        mutex_unlock(&devcgroup_mutex);
        return err;
}

/*
 * Mark the css as offline by resetting its behavior to DEVCG_DEFAULT_NONE
 * (the value is_devcg_online() tests for). The exception list is left
 * untouched here.
 */
static void devcgroup_offline(struct cgroup_subsys_state *css)
{
        struct dev_cgroup *dev_cgroup = css_to_devcgroup(css);

        mutex_lock(&devcgroup_mutex);
        dev_cgroup->behavior = DEVCG_DEFAULT_NONE;
        mutex_unlock(&devcgroup_mutex);
}

/*
 * called from kernel/cgroup/cgroup.c with cgroup_lock() held.
 */
static struct cgroup_subsys_state *
devcgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct dev_cgroup *dev_cgroup;

        dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
        if (!dev_cgroup)
                return ERR_PTR(-ENOMEM);
        INIT_LIST_HEAD(&dev_cgroup->exceptions);
        dev_cgroup->behavior = DEVCG_DEFAULT_NONE;

        return &dev_cgroup->css;
}

/* Release the exception list of @css's dev_cgroup, then the cgroup itself. */
static void devcgroup_css_free(struct cgroup_subsys_state *css)
{
        struct dev_cgroup *devcg = css_to_devcgroup(css);

        __dev_exception_clean(devcg);
        kfree(devcg);
}

/* File identifiers stored in cftype->private for the allow/deny/list files */
#define DEVCG_ALLOW 1
#define DEVCG_DENY 2
#define DEVCG_LIST 3

/* Size of the buffers receiving a major/minor rendered by set_majmin() */
#define MAJMINLEN 13
/* Size of the access-string buffer: up to 'r', 'w', 'm' plus the NUL */
#define ACCLEN 4

/*
 * Render @access into @acc as a NUL-terminated string made of 'r', 'w' and
 * 'm' (in that order). @acc must provide ACCLEN bytes.
 */
static void set_access(char *acc, short access)
{
        char *out = acc;

        memset(acc, 0, ACCLEN);

        if (access & DEVCG_ACC_READ)
                *out++ = 'r';
        if (access & DEVCG_ACC_WRITE)
                *out++ = 'w';
        if (access & DEVCG_ACC_MKNOD)
                *out++ = 'm';
}

/*
 * Map a device type to the letter used in the list file:
 * 'a' (all), 'c' (char), 'b' (block), or 'X' for anything else.
 */
static char type_to_char(short type)
{
        char letter = 'X';

        if (type == DEVCG_DEV_ALL)
                letter = 'a';
        else if (type == DEVCG_DEV_CHAR)
                letter = 'c';
        else if (type == DEVCG_DEV_BLOCK)
                letter = 'b';

        return letter;
}

/*
 * Render a major or minor number into @str: the wildcard value ~0 becomes
 * "*", anything else its unsigned decimal representation.
 */
static void set_majmin(char *str, unsigned m)
{
        if (m != ~0) {
                sprintf(str, "%u", m);
                return;
        }

        strcpy(str, "*");
}

/*
 * seq_file show handler: prints one "<type> <major>:<minor> <access>" line
 * per rule of the cgroup behind @m. Always returns 0.
 */
static int devcgroup_seq_show(struct seq_file *m, void *v)
{
        struct dev_cgroup *devcg = css_to_devcgroup(seq_css(m));
        char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
        struct dev_exception_item *item;

        rcu_read_lock();

        /*
         * To preserve the compatibility the file remains a "whitelist of
         * devices": with a default-deny policy the exceptions are listed,
         * with a default-allow policy only a single "all devices" line is
         * shown.
         */
        if (devcg->behavior != DEVCG_DEFAULT_ALLOW) {
                list_for_each_entry_rcu(item, &devcg->exceptions, list) {
                        set_majmin(maj, item->major);
                        set_majmin(min, item->minor);
                        set_access(acc, item->access);
                        seq_printf(m, "%c %s:%s %s\n",
                                   type_to_char(item->type), maj, min, acc);
                }
                goto out;
        }

        set_majmin(maj, ~0);
        set_majmin(min, ~0);
        set_access(acc, DEVCG_ACC_MASK);
        seq_printf(m, "%c %s:%s %s\n", type_to_char(DEVCG_DEV_ALL),
                   maj, min, acc);

out:
        rcu_read_unlock();

        return 0;
}

/**
 * match_exception        - iterates the exception list trying to find a complete match
 * @exceptions: list of exceptions
 * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR)
 * @major: device file major number, ~0 to match all
 * @minor: device file minor number, ~0 to match all
 * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD)
 *
 * It is considered a complete match if an exception is found that will
 * contain the entire range of provided parameters.
 *
 * Callers hold either rcu_read_lock() or devcgroup_mutex (verify_new_ex()
 * reaches this function with only the mutex held), so the list walk tells
 * lockdep about both, the same way match_exception_partial() does.
 *
 * Return: true in case it matches an exception completely
 */
static bool match_exception(struct list_head *exceptions, short type,
                            u32 major, u32 minor, short access)
{
        struct dev_exception_item *ex;

        list_for_each_entry_rcu(ex, exceptions, list,
                                lockdep_is_held(&devcgroup_mutex)) {
                if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK))
                        continue;
                if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR))
                        continue;
                /* a wildcard (~0) in the exception covers any number */
                if (ex->major != ~0 && ex->major != major)
                        continue;
                if (ex->minor != ~0 && ex->minor != minor)
                        continue;
                /* provided access cannot have more than the exception rule */
                if (access & (~ex->access))
                        continue;
                return true;
        }
        return false;
}

/**
 * match_exception_partial - iterates the exception list trying to find a partial match
 * @exceptions: list of exceptions
 * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR)
 * @major: device file major number, ~0 to match all
 * @minor: device file minor number, ~0 to match all
 * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD)
 *
 * It is considered a partial match if an exception's range is found to
 * contain *any* of the devices specified by provided parameters. This is
 * used to make sure no extra access is being granted that is forbidden by
 * any of the exception list.
 *
 * Return: true in case the provided range at least partially matches an
 * exception
 */
static bool match_exception_partial(struct list_head *exceptions, short type,
                                    u32 major, u32 minor, short access)
{
        struct dev_exception_item *ex;

        list_for_each_entry_rcu(ex, exceptions, list,
                                lockdep_is_held(&devcgroup_mutex)) {
                bool type_ok, major_ok, minor_ok;

                type_ok = (!(type & DEVCG_DEV_BLOCK) ||
                           (ex->type & DEVCG_DEV_BLOCK)) &&
                          (!(type & DEVCG_DEV_CHAR) ||
                           (ex->type & DEVCG_DEV_CHAR));

                /*
                 * A wildcard (~0) on either side overlaps with anything;
                 * two concrete numbers only overlap when they are equal.
                 */
                major_ok = ex->major == ~0 || major == ~0 ||
                           ex->major == major;
                minor_ok = ex->minor == ~0 || minor == ~0 ||
                           ex->minor == minor;

                /* sharing a single access bit is enough for a partial match */
                if (type_ok && major_ok && minor_ok && (access & ex->access))
                        return true;
        }
        return false;
}

/**
 * verify_new_ex - verifies if a new exception is allowed by parent cgroup's permissions
 * @dev_cgroup: dev cgroup to be tested against
 * @refex: new exception
 * @behavior: behavior of the exception's dev_cgroup
 *
 * This is used to make sure a child cgroup won't have more privileges
 * than its parent
 *
 * Return: true if the exception is acceptable, false otherwise
 */
static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
                          struct dev_exception_item *refex,
                          enum devcg_behavior behavior)
{
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
                         !lockdep_is_held(&devcgroup_mutex),
                         "device_cgroup:verify_new_ex called without proper synchronization");

        /*
         * @dev_cgroup is not default-allow: the new exception adds access
         * to more devices and must be contained completely in one of
         * @dev_cgroup's exceptions to be allowed.
         */
        if (dev_cgroup->behavior != DEVCG_DEFAULT_ALLOW)
                return match_exception(&dev_cgroup->exceptions, refex->type,
                                       refex->major, refex->minor,
                                       refex->access);

        /*
         * Both are default-allow: a new exception in the child only adds
         * extra restrictions, which is always fine.
         */
        if (behavior == DEVCG_DEFAULT_ALLOW)
                return true;

        /*
         * The new exception in the child adds devices that can be
         * accessed, so it can't match any of @dev_cgroup's exceptions,
         * even slightly.
         */
        return !match_exception_partial(&dev_cgroup->exceptions, refex->type,
                                        refex->major, refex->minor,
                                        refex->access);
}

/*
 * parent_has_perm:
 * a rule added to a device exception list must be acceptable to the parent
 * cgroup; a cgroup without a parent may add anything.
 * Returns non-zero when @ex is acceptable, 0 otherwise.
 */
static int parent_has_perm(struct dev_cgroup *childcg,
                                  struct dev_exception_item *ex)
{
        struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent);

        return parent ? verify_new_ex(parent, ex, childcg->behavior) : 1;
}

/**
 * parent_allows_removal - verify if it's ok to remove an exception
 * @childcg: child cgroup from where the exception will be removed
 * @ex: exception being removed
 *
 * When removing an exception in cgroups with default ALLOW policy, it must
 * be checked if removing it will give the child cgroup more access than the
 * parent.
 *
 * Return: true if it's ok to remove exception, false otherwise
 */
static bool parent_allows_removal(struct dev_cgroup *childcg,
                                  struct dev_exception_item *ex)
{
        struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent);

        if (!parent)
                return true;

        /* It's always allowed to remove access to devices */
        if (childcg->behavior == DEVCG_DEFAULT_DENY)
                return true;

        /*
         * Make sure you're not removing part or a whole exception existing in
         * the parent cgroup
         */
        return !match_exception_partial(&parent->exceptions, ex->type,
                                        ex->major, ex->minor, ex->access);
}

/**
 * may_allow_all - checks if it's possible to change the behavior to
 *                   allow based on parent's rules.
 * @parent: device cgroup's parent, may be NULL
 *
 * Return: 1 when there is no parent or the parent is default-allow,
 * 0 otherwise
 */
static inline int may_allow_all(struct dev_cgroup *parent)
{
        return !parent || parent->behavior == DEVCG_DEFAULT_ALLOW;
}

/**
 * revalidate_active_exceptions - walks through the active exception list and
 *                                   revalidates the exceptions based on parent's
 *                                   behavior and exceptions. The exceptions that
 *                                   are no longer valid will be removed.
 *                                   Called with devcgroup_mutex held.
 * @devcg: cgroup which exceptions will be checked
 *
 * This is one of the three key functions for hierarchy implementation.
 * This function is responsible for re-evaluating all the cgroup's active
 * exceptions due to a parent's exception change.
 * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details.
 */
static void revalidate_active_exceptions(struct dev_cgroup *devcg)
{
        struct dev_exception_item *ex;
        struct list_head *this, *tmp;

        list_for_each_safe(this, tmp, &devcg->exceptions) {
                ex = container_of(this, struct dev_exception_item, list);
                if (!parent_has_perm(devcg, ex))
                        dev_exception_rm(devcg, ex);
        }
}

/**
 * propagate_exception - propagates a new exception to the children
 * @devcg_root: device cgroup that added a new exception
 * @ex: new exception to be propagated
 *
 * Walks the descendants of @devcg_root in pre-order, skipping @devcg_root
 * itself and every descendant that is not online. The RCU read lock only
 * covers the tree walk; it is dropped while a descendant is being updated
 * and re-taken before moving on.
 *
 * returns: 0 in case of success, != 0 in case of error
 */
static int propagate_exception(struct dev_cgroup *devcg_root,
                               struct dev_exception_item *ex)
{
        struct cgroup_subsys_state *pos;
        int rc = 0;

        rcu_read_lock();

        css_for_each_descendant_pre(pos, &devcg_root->css) {
                struct dev_cgroup *devcg = css_to_devcgroup(pos);

                /*
                 * Because devcgroup_mutex is held, no devcg will become
                 * online or offline during the tree walk (see on/offline
                 * methods), and online ones are safe to access outside RCU
                 * read lock without bumping refcnt.
                 */
                if (pos == &devcg_root->css || !is_devcg_online(devcg))
                        continue;

                rcu_read_unlock();

                /*
                 * in case both root's behavior and devcg is allow, a new
                 * restriction means adding to the exception list
                 */
                if (devcg_root->behavior == DEVCG_DEFAULT_ALLOW &&
                    devcg->behavior == DEVCG_DEFAULT_ALLOW) {
                        rc = dev_exception_add(devcg, ex);
                        /* the RCU read lock is already dropped at this point */
                        if (rc)
                                return rc;
                } else {
                        /*
                         * in the other possible cases:
                         * root's behavior: allow, devcg's: deny
                         * root's behavior: deny, devcg's: deny
                         * the exception will be removed
                         */
                        dev_exception_rm(devcg, ex);
                }
                /* drop exceptions the (updated) parent no longer permits */
                revalidate_active_exceptions(devcg);

                rcu_read_lock();
        }

        rcu_read_unlock();
        return rc;
}

/*
 * Modify the exception list using allow/deny rules.
 * CAP_SYS_ADMIN is needed for this.  It's at least separate from CAP_MKNOD
 * so we can give a container CAP_MKNOD to let it create devices but not
 * modify the exception list.
 * It seems likely we'll want to add a CAP_CONTAINER capability to allow
 * us to also grant CAP_SYS_ADMIN to containers without giving away the
 * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN
 *
 * Taking rules away is always allowed (given CAP_SYS_ADMIN).  Granting
 * new access is only allowed if you're in the top-level cgroup, or your
 * parent cgroup has the access you're asking for.
 *
 * @devcgroup: cgroup whose rules are being changed
 * @filetype: DEVCG_ALLOW or DEVCG_DENY, i.e. which file was written
 * @buffer: rule text; either starts with 'a' (all devices) or has the form
 *          "<b|c> <major|*>:<minor|*> <access>" where <access> is made of
 *          the letters 'r', 'w' and 'm'
 *
 * Called by devcgroup_access_write() with devcgroup_mutex held.
 *
 * Returns 0 on success, -EPERM when the caller lacks CAP_SYS_ADMIN or the
 * parent does not permit the change, -EINVAL on malformed input, an
 * unknown @filetype or an "all" rule on a cgroup with online children, or
 * the error returned by the list helpers (-ENOMEM).
 */
static int devcgroup_update_access(struct dev_cgroup *devcgroup,
                                   int filetype, char *buffer)
{
        const char *b;
        char temp[12];                /* 11 + 1 characters needed for a u32 */
        int count, rc = 0;
        struct dev_exception_item ex;
        struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent);
        struct dev_cgroup tmp_devcgrp;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        memset(&ex, 0, sizeof(ex));
        memset(&tmp_devcgrp, 0, sizeof(tmp_devcgrp));
        b = buffer;

        /* first character: rule type ('a' is handled entirely in here) */
        switch (*b) {
        case 'a':
                switch (filetype) {
                case DEVCG_ALLOW:
                        if (css_has_online_children(&devcgroup->css))
                                return -EINVAL;

                        if (!may_allow_all(parent))
                                return -EPERM;
                        if (!parent) {
                                devcgroup->behavior = DEVCG_DEFAULT_ALLOW;
                                dev_exception_clean(devcgroup);
                                break;
                        }

                        /*
                         * Keep a backup of the current exceptions so they
                         * can be put back if copying the parent's list
                         * fails.
                         */
                        INIT_LIST_HEAD(&tmp_devcgrp.exceptions);
                        rc = dev_exceptions_copy(&tmp_devcgrp.exceptions,
                                                 &devcgroup->exceptions);
                        if (rc)
                                return rc;
                        dev_exception_clean(devcgroup);
                        rc = dev_exceptions_copy(&devcgroup->exceptions,
                                                 &parent->exceptions);
                        if (rc) {
                                /* restore the backup taken above */
                                dev_exceptions_move(&devcgroup->exceptions,
                                                    &tmp_devcgrp.exceptions);
                                return rc;
                        }
                        devcgroup->behavior = DEVCG_DEFAULT_ALLOW;
                        /* the backup is no longer needed */
                        dev_exception_clean(&tmp_devcgrp);
                        break;
                case DEVCG_DENY:
                        if (css_has_online_children(&devcgroup->css))
                                return -EINVAL;

                        dev_exception_clean(devcgroup);
                        devcgroup->behavior = DEVCG_DEFAULT_DENY;
                        break;
                default:
                        return -EINVAL;
                }
                return 0;
        case 'b':
                ex.type = DEVCG_DEV_BLOCK;
                break;
        case 'c':
                ex.type = DEVCG_DEV_CHAR;
                break;
        default:
                return -EINVAL;
        }
        b++;
        if (!isspace(*b))
                return -EINVAL;
        b++;
        /* read major: '*' or a run of at most sizeof(temp) - 1 digits */
        if (*b == '*') {
                ex.major = ~0;
                b++;
        } else if (isdigit(*b)) {
                memset(temp, 0, sizeof(temp));
                for (count = 0; count < sizeof(temp) - 1; count++) {
                        temp[count] = *b;
                        b++;
                        if (!isdigit(*b))
                                break;
                }
                rc = kstrtou32(temp, 10, &ex.major);
                if (rc)
                        return -EINVAL;
        } else {
                return -EINVAL;
        }
        /* a longer digit run leaves a digit here and is rejected as well */
        if (*b != ':')
                return -EINVAL;
        b++;

        /* read minor */
        if (*b == '*') {
                ex.minor = ~0;
                b++;
        } else if (isdigit(*b)) {
                memset(temp, 0, sizeof(temp));
                for (count = 0; count < sizeof(temp) - 1; count++) {
                        temp[count] = *b;
                        b++;
                        if (!isdigit(*b))
                                break;
                }
                rc = kstrtou32(temp, 10, &ex.minor);
                if (rc)
                        return -EINVAL;
        } else {
                return -EINVAL;
        }
        if (!isspace(*b))
                return -EINVAL;
        /*
         * read access: at most three characters are examined; a newline or
         * the end of the string stops the scan early
         */
        for (b++, count = 0; count < 3; count++, b++) {
                switch (*b) {
                case 'r':
                        ex.access |= DEVCG_ACC_READ;
                        break;
                case 'w':
                        ex.access |= DEVCG_ACC_WRITE;
                        break;
                case 'm':
                        ex.access |= DEVCG_ACC_MKNOD;
                        break;
                case '\n':
                case '\0':
                        count = 3;
                        break;
                default:
                        return -EINVAL;
                }
        }

        /* apply the parsed rule */
        switch (filetype) {
        case DEVCG_ALLOW:
                /*
                 * If the default policy is to allow by default, try to remove
                 * an matching exception instead. And be silent about it: we
                 * don't want to break compatibility
                 */
                if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
                        /* Check if the parent allows removing it first */
                        if (!parent_allows_removal(devcgroup, &ex))
                                return -EPERM;
                        dev_exception_rm(devcgroup, &ex);
                        break;
                }

                if (!parent_has_perm(devcgroup, &ex))
                        return -EPERM;
                rc = dev_exception_add(devcgroup, &ex);
                break;
        case DEVCG_DENY:
                /*
                 * If the default policy is to deny by default, try to remove
                 * an matching exception instead. And be silent about it: we
                 * don't want to break compatibility
                 */
                if (devcgroup->behavior == DEVCG_DEFAULT_DENY)
                        dev_exception_rm(devcgroup, &ex);
                else
                        rc = dev_exception_add(devcgroup, &ex);

                if (rc)
                        break;
                /* we only propagate new restrictions */
                rc = propagate_exception(devcgroup, &ex);
                break;
        default:
                rc = -EINVAL;
        }
        return rc;
}

/*
 * Write handler shared by the allow and deny files: strips surrounding
 * whitespace from @buf and applies the rule under devcgroup_mutex.
 * Returns @nbytes on success or the negative error of
 * devcgroup_update_access().
 */
static ssize_t devcgroup_access_write(struct kernfs_open_file *of,
                                      char *buf, size_t nbytes, loff_t off)
{
        int err;

        mutex_lock(&devcgroup_mutex);
        err = devcgroup_update_access(css_to_devcgroup(of_css(of)),
                                      of_cft(of)->private, strstrip(buf));
        mutex_unlock(&devcgroup_mutex);

        if (err)
                return err;

        return nbytes;
}

/*
 * Control files of the devices controller. "allow" and "deny" are write
 * only and share one handler that tells them apart through ->private;
 * "list" only provides a seq_show handler.
 */
static struct cftype dev_cgroup_files[] = {
        {
                .name = "allow",
                .write = devcgroup_access_write,
                .private = DEVCG_ALLOW,
        },
        {
                .name = "deny",
                .write = devcgroup_access_write,
                .private = DEVCG_DENY,
        },
        {
                .name = "list",
                .seq_show = devcgroup_seq_show,
                .private = DEVCG_LIST,
        },
        { }        /* terminate */
};

/*
 * Subsystem descriptor: wires the css life-cycle callbacks defined in this
 * file and registers the control files as legacy cftypes.
 */
struct cgroup_subsys devices_cgrp_subsys = {
        .css_alloc = devcgroup_css_alloc,
        .css_free = devcgroup_css_free,
        .css_online = devcgroup_online,
        .css_offline = devcgroup_offline,
        .legacy_cftypes = dev_cgroup_files,
};

/**
 * devcgroup_legacy_check_permission - checks if an inode operation is permitted
 * @type: device type
 * @major: device major number
 * @minor: device minor number
 * @access: combination of DEVCG_ACC_WRITE, DEVCG_ACC_READ and DEVCG_ACC_MKNOD
 *
 * The check is made against the device cgroup of the current task, under
 * rcu_read_lock().
 *
 * returns 0 on success, -EPERM case the operation is not permitted
 */
static int devcgroup_legacy_check_permission(short type, u32 major, u32 minor,
                                        short access)
{
        struct dev_cgroup *devcg;
        bool allowed;

        rcu_read_lock();
        devcg = task_devcgroup(current);
        if (devcg->behavior != DEVCG_DEFAULT_ALLOW) {
                /* Need to match completely one exception to be allowed */
                allowed = match_exception(&devcg->exceptions, type, major,
                                          minor, access);
        } else {
                /* Can't match any of the exceptions, even partially */
                allowed = !match_exception_partial(&devcg->exceptions,
                                                   type, major, minor, access);
        }
        rcu_read_unlock();

        return allowed ? 0 : -EPERM;
}

#endif /* CONFIG_CGROUP_DEVICE */

#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF)

/*
 * Check a device access: the BPF device-cgroup hook runs first and a
 * non-zero result from it is returned as is. Otherwise, when
 * CONFIG_CGROUP_DEVICE is set, the legacy exception-list check decides;
 * without it the access is permitted (0).
 */
int devcgroup_check_permission(short type, u32 major, u32 minor, short access)
{
        int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access);

#ifdef CONFIG_CGROUP_DEVICE
        if (!rc)
                rc = devcgroup_legacy_check_permission(type, major, minor,
                                                       access);
#endif /* CONFIG_CGROUP_DEVICE */

        return rc;
}
EXPORT_SYMBOL(devcgroup_check_permission);
#endif /* defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) */



































































   47 



   48 


   48 
























































































































































   10 































































































































































































































































































































































































































































































































































   21 
   36 
















































    2 












    2 

















































































































































































































































   31 


























    2 























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SEQLOCK_H
#define __LINUX_SEQLOCK_H

/*
 * seqcount_t / seqlock_t - a reader-writer consistency mechanism with
 * lockless readers (read-only retry loops), and no writer starvation.
 *
 * See Documentation/locking/seqlock.rst
 *
 * Copyrights:
 * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli
 * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH
 */

#include <linux/compiler.h>
#include <linux/kcsan-checks.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/preempt.h>
#include <linux/seqlock_types.h>
#include <linux/spinlock.h>

#include <asm/processor.h>

/*
 * The seqlock seqcount_t interface does not prescribe a precise sequence of
 * read begin/retry/end. For readers, typically there is a call to
 * read_seqcount_begin() and read_seqcount_retry(), however, there are more
 * esoteric cases which do not follow this pattern.
 *
 * As a consequence, we take the following best-effort approach for raw usage
 * via seqcount_t under KCSAN: upon beginning a seq-reader critical section,
 * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as
 * atomics; if there is a matching read_seqcount_retry() call, no following
 * memory operations are considered atomic. Usage of the seqlock_t interface
 * is not affected.
 */
#define KCSAN_SEQLOCK_REGION_MAX 1000

/*
 * __seqcount_init() - initialize a seqcount_t and its lockdep map
 * @s:    Pointer to the seqcount_t instance
 * @name: Name handed to lockdep_init_map() for this counter
 * @key:  Lock class key handed to lockdep_init_map()
 *
 * Backend of the seqcount_init() macro variants: sets up the lockdep map
 * and then resets the sequence counter to zero.
 */
static inline void __seqcount_init(seqcount_t *s, const char *name,
                                          struct lock_class_key *key)
{
        /*
         * Make sure we are not reinitializing a held lock:
         */
        lockdep_init_map(&s->dep_map, name, key, 0);
        s->sequence = 0;
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/* Names the embedded lockdep map after the stringified @lockname. */
# define SEQCOUNT_DEP_MAP_INIT(lockname)                                \
                .dep_map = { .name = #lockname }

/**
 * seqcount_init() - runtime initializer for seqcount_t
 * @s: Pointer to the seqcount_t instance
 *
 * Every expansion site declares its own static lock_class_key, and the
 * stringified @s expression is passed to __seqcount_init() as the name.
 */
# define seqcount_init(s)                                                \
        do {                                                                \
                static struct lock_class_key __key;                        \
                __seqcount_init((s), #s, &__key);                        \
        } while (0)

/*
 * seqcount_lockdep_reader_access() - report a seqcount read to lockdep
 * @s: Pointer to the seqcount_t being read
 *
 * Issues a read-acquire immediately followed by a release on the counter's
 * dep_map, with local interrupts disabled around the pair.
 */
static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
{
        unsigned long irqflags;
        /* Cast away const before handing &dep_map to the lockdep hooks */
        seqcount_t *seq = (seqcount_t *)s;

        local_irq_save(irqflags);
        seqcount_acquire_read(&seq->dep_map, 0, 0, _RET_IP_);
        seqcount_release(&seq->dep_map, _RET_IP_);
        local_irq_restore(irqflags);
}

#else
/*
 * Without CONFIG_DEBUG_LOCK_ALLOC: the dep_map initializer and the reader
 * annotation expand to nothing, and seqcount_init() passes NULL for both
 * the name and the lock class key.
 */
# define SEQCOUNT_DEP_MAP_INIT(lockname)
# define seqcount_init(s) __seqcount_init(s, NULL, NULL)
# define seqcount_lockdep_reader_access(x)
#endif

/**
 * SEQCNT_ZERO() - static initializer for seqcount_t
 * @name: Name of the seqcount_t instance
 *
 * Starts the sequence at 0; SEQCOUNT_DEP_MAP_INIT() contributes the lockdep
 * map initializer only when CONFIG_DEBUG_LOCK_ALLOC is set.
 */
#define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) }

/*
 * Sequence counters with associated locks (seqcount_LOCKNAME_t)
 *
 * A sequence counter which associates the lock used for writer
 * serialization at initialization time. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * For associated locks which do not implicitly disable preemption,
 * preemption protection is enforced in the write side function.
 *
 * Lockdep is never used in any of the raw write variants.
 *
 * See Documentation/locking/seqlock.rst
 */

/*
 * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated
 * @seqcount:        The real sequence counter
 * @lock:        Pointer to the associated lock
 *
 * A plain sequence counter with external writer synchronization by
 * LOCKNAME @lock. The lock is associated to the sequence counter in the
 * static initializer or init function. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * LOCKNAME:        raw_spinlock, spinlock, rwlock or mutex
 */

/*
 * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t
 * @s:                Pointer to the seqcount_LOCKNAME_t instance
 * @lock:        Pointer to the associated lock
 *
 * The generic macro below additionally takes @lockname, the LOCKNAME token
 * pasted into the seqcount_LOCKNAME_t type name. The embedded seqcount_t is
 * initialized with seqcount_init(); the lock pointer assignment is wrapped
 * in __SEQ_LOCK(), which is not defined in this part of the file.
 */

#define seqcount_LOCKNAME_init(s, _lock, lockname)                        \
        do {                                                                \
                seqcount_##lockname##_t *____s = (s);                        \
                seqcount_init(&____s->seqcount);                        \
                __SEQ_LOCK(____s->lock = (_lock));                        \
        } while (0)

/* Typed front ends: each one fixes the LOCKNAME token of seqcount_LOCKNAME_init() */
#define seqcount_raw_spinlock_init(s, lock)        seqcount_LOCKNAME_init(s, lock, raw_spinlock)
#define seqcount_spinlock_init(s, lock)                seqcount_LOCKNAME_init(s, lock, spinlock)
#define seqcount_rwlock_init(s, lock)                seqcount_LOCKNAME_init(s, lock, rwlock)
#define seqcount_mutex_init(s, lock)                seqcount_LOCKNAME_init(s, lock, mutex)

/*
 * SEQCOUNT_LOCKNAME()        - Instantiate seqcount_LOCKNAME_t and helpers
 * seqprop_LOCKNAME_*()        - Property accessors for seqcount_LOCKNAME_t
 *
 * @lockname:                "LOCKNAME" part of seqcount_LOCKNAME_t
 * @locktype:                LOCKNAME canonical C data type
 * @preemptible:        preemptibility of above locktype
 * @lockbase:                prefix for associated lock/unlock
 *
 * Helpers generated for each LOCKNAME:
 * - __seqprop_LOCKNAME_ptr() / _const_ptr(): address of the embedded
 *   seqcount_t.
 * - __seqprop_LOCKNAME_sequence(): READ_ONCE() of the counter. Only with
 *   CONFIG_PREEMPT_RT, a preemptible lock type and an odd count, the
 *   associated lock is taken and released before the counter is re-read.
 * - __seqprop_LOCKNAME_preemptible(): @preemptible, or false when
 *   CONFIG_PREEMPT_RT is enabled.
 * - __seqprop_LOCKNAME_assert(): lockdep_assert_held() on the associated
 *   lock, wrapped in __SEQ_LOCK().
 *
 * NOTE(review): @locktype is not referenced in the macro body visible here.
 */
#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase)        \
static __always_inline seqcount_t *                                        \
__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)                        \
{                                                                        \
        return &s->seqcount;                                                \
}                                                                        \
                                                                        \
static __always_inline const seqcount_t *                                \
__seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s)        \
{                                                                        \
        return &s->seqcount;                                                \
}                                                                        \
                                                                        \
static __always_inline unsigned                                                \
__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)        \
{                                                                        \
        unsigned seq = READ_ONCE(s->seqcount.sequence);                        \
                                                                        \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))                                \
                return seq;                                                \
                                                                        \
        if (preemptible && unlikely(seq & 1)) {                                \
                __SEQ_LOCK(lockbase##_lock(s->lock));                        \
                __SEQ_LOCK(lockbase##_unlock(s->lock));                        \
                                                                        \
                /*                                                        \
                 * Re-read the sequence counter since the (possibly        \
                 * preempted) writer made progress.                        \
                 */                                                        \
                seq = READ_ONCE(s->seqcount.sequence);                        \
        }                                                                \
                                                                        \
        return seq;                                                        \
}                                                                        \
                                                                        \
static __always_inline bool                                                \
__seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s)        \
{                                                                        \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))                                \
                return preemptible;                                        \
                                                                        \
        /* PREEMPT_RT relies on the above LOCK+UNLOCK */                \
        return false;                                                        \
}                                                                        \
                                                                        \
static __always_inline void                                                \
__seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)                \
{                                                                        \
        __SEQ_LOCK(lockdep_assert_held(s->lock));                        \
}

/*
 * __seqprop() for seqcount_t
 */

/* A plain seqcount_t is its own sequence counter: hand the pointer back. */
static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
{
        return s;
}

/* Const-qualified counterpart of __seqprop_ptr(). */
static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s)
{
        return s;
}

/* Single READ_ONCE() load of the counter value. */
static inline unsigned __seqprop_sequence(const seqcount_t *s)
{
        return READ_ONCE(s->sequence);
}

/*
 * Always false, so the write side macros never disable or enable
 * preemption on behalf of a plain seqcount_t.
 */
static inline bool __seqprop_preemptible(const seqcount_t *s)
{
        return false;
}

/* Lockdep check that the caller already runs with preemption disabled. */
static inline void __seqprop_assert(const seqcount_t *s)
{
        lockdep_assert_preemption_disabled();
}

/* The spinlock_t and rwlock_t variants count as preemptible only on PREEMPT_RT. */
#define __SEQ_RT        IS_ENABLED(CONFIG_PREEMPT_RT)

/*
 * Arguments: lockname, locktype, preemptible, lockbase. The rwlock variant
 * uses the "read" prefix, so its generated sequence helper expands to
 * read_lock()/read_unlock() on the associated lock.
 */
SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    raw_spin)
SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, spin)
SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, read)
SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
#undef SEQCOUNT_LOCKNAME

/*
 * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
 * @name:        Name of the seqcount_LOCKNAME_t instance
 * @lock:        Pointer to the associated LOCKNAME
 *
 * SEQCOUNT_LOCKNAME_ZERO() is the shared body of the SEQCNT_*_ZERO()
 * macros; its @seq_name and @assoc_lock parameters correspond to @name and
 * @lock above. The .lock initializer is wrapped in __SEQ_LOCK().
 */

#define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) {                        \
        .seqcount                = SEQCNT_ZERO(seq_name.seqcount),        \
        __SEQ_LOCK(.lock        = (assoc_lock))                                \
}

/*
 * Per-lock-type static initializers; all expand to SEQCOUNT_LOCKNAME_ZERO().
 *
 * NOTE(review): no ww_mutex variant is instantiated through
 * SEQCOUNT_LOCKNAME() or listed in __seqprop() in this part of the file;
 * confirm whether SEQCNT_WW_MUTEX_ZERO() still has users.
 */
#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock)        SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_SPINLOCK_ZERO(name, lock)        SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_RWLOCK_ZERO(name, lock)                SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_MUTEX_ZERO(name, lock)                SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_WW_MUTEX_ZERO(name, lock)         SEQCOUNT_LOCKNAME_ZERO(name, lock)

/*
 * __seqprop() uses _Generic on the pointed-to type of @s to select the
 * __seqprop_<prop>() helper for a plain seqcount_t, or the
 * __seqprop_LOCKNAME_<prop>() helper for a seqcount_LOCKNAME_t. The
 * seqprop_*() wrappers below call the selected helper on @s.
 *
 * The @s argument of __seqprop_case() is not used in its expansion.
 */
#define __seqprop_case(s, lockname, prop)                                \
        seqcount_##lockname##_t: __seqprop_##lockname##_##prop

#define __seqprop(s, prop) _Generic(*(s),                                \
        seqcount_t:                __seqprop_##prop,                        \
        __seqprop_case((s),        raw_spinlock,        prop),                        \
        __seqprop_case((s),        spinlock,        prop),                        \
        __seqprop_case((s),        rwlock,                prop),                        \
        __seqprop_case((s),        mutex,                prop))

#define seqprop_ptr(s)                        __seqprop(s, ptr)(s)
#define seqprop_const_ptr(s)                __seqprop(s, const_ptr)(s)
#define seqprop_sequence(s)                __seqprop(s, sequence)(s)
#define seqprop_preemptible(s)                __seqprop(s, preemptible)(s)
#define seqprop_assert(s)                __seqprop(s, assert)(s)

/**
 * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
 * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
 * provided before actually loading any of the variables that are to be
 * protected in this critical section.
 *
 * Use carefully, only in critical code, and comment how the barrier is
 * provided.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define __read_seqcount_begin(s)                                        \
({                                                                        \
        unsigned __seq;                                                        \
                                                                        \
        while ((__seq = seqprop_sequence(s)) & 1)                        \
                cpu_relax();                                                \
                                                                        \
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);                        \
        __seq;                                                                \
})

/**
 * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount_begin(s)                                        \
({                                                                        \
        unsigned _seq = __read_seqcount_begin(s);                        \
                                                                        \
        smp_rmb();                                                        \
        _seq;                                                                \
})

/**
 * read_seqcount_begin() - begin a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define read_seqcount_begin(s)                                                \
({                                                                        \
        seqcount_lockdep_reader_access(seqprop_const_ptr(s));                \
        raw_read_seqcount_begin(s);                                        \
})

/**
 * raw_read_seqcount() - read the raw seqcount_t counter value
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * raw_read_seqcount opens a read critical section of the given
 * seqcount_t, without any lockdep checking, and without checking or
 * masking the sequence counter LSB. Calling code is responsible for
 * handling that.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount(s)                                                \
({                                                                        \
        unsigned __seq = seqprop_sequence(s);                                \
                                                                        \
        smp_rmb();                                                        \
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);                        \
        __seq;                                                                \
})

/**
 * raw_seqcount_begin() - begin a seqcount_t read critical section w/o
 *                        lockdep and w/o counter stabilization
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * raw_seqcount_begin opens a read critical section of the given
 * seqcount_t. Unlike read_seqcount_begin(), this function will not wait
 * for the count to stabilize. If a writer is active when it begins, it
 * will fail the read_seqcount_retry() at the end of the read critical
 * section instead of stabilizing at the beginning of it.
 *
 * Use this only in special kernel hot paths where the read section is
 * small and has a high probability of success through other external
 * means. It will save a single branching instruction.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_seqcount_begin(s)                                                \
({                                                                        \
        /*                                                                \
         * If the counter is odd, clearing the LSB returns the                \
         * preceding even value (counter - 1), which differs from        \
         * the value just read, so that read_seqcount_retry() fails.        \
         */                                                                \
        raw_read_seqcount(s) & ~1;                                        \
})

/**
 * __read_seqcount_retry() - end a seqcount_t read section w/o barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count, from read_seqcount_begin()
 *
 * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb()
 * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
 * provided after loading the variables protected in this critical section
 * and before this re-read of the sequence counter.
 *
 * Use carefully, only in critical code, and comment how the barrier is
 * provided.
 *
 * Return: true if a read section retry is required, else false
 */
#define __read_seqcount_retry(s, start)                                        \
        do___read_seqcount_retry(seqprop_const_ptr(s), start)

/* Barrier-less retry check on the plain seqcount_t from seqprop_const_ptr(). */
static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
{
        /* Reset the KCSAN atomic-access window opened on the begin side */
        kcsan_atomic_next(0);
        /* Nonzero when the counter no longer equals @start */
        return unlikely(READ_ONCE(s->sequence) != start);
}

/**
 * read_seqcount_retry() - end a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count, from read_seqcount_begin()
 *
 * read_seqcount_retry closes the read critical section of given
 * seqcount_t.  If the critical section was invalid, it must be ignored
 * (and typically retried).
 *
 * Return: true if a read section retry is required, else false
 */
#define read_seqcount_retry(s, start)                                        \
        do_read_seqcount_retry(seqprop_const_ptr(s), start)

/* read_seqcount_retry() backend: read barrier, then the barrier-less check. */
static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start)
{
        /* The barrier that __read_seqcount_retry() leaves to its caller */
        smp_rmb();
        return do___read_seqcount_retry(s, start);
}

/**
 * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Context: check write_seqcount_begin()
 */
#define raw_write_seqcount_begin(s)                                        \
do {                                                                        \
        /* Only taken for seqcount types reported as preemptible */        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_raw_write_seqcount_begin(seqprop_ptr(s));                        \
} while (0)

/* Opens a write section on a plain seqcount_t; no lockdep annotation. */
static inline void do_raw_write_seqcount_begin(seqcount_t *s)
{
        kcsan_nestable_atomic_begin();
        /* An even count becomes odd: readers see a write in progress */
        s->sequence++;
        /* Write barrier between the counter bump and what follows */
        smp_wmb();
}

/**
 * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Context: check write_seqcount_end()
 */
#define raw_write_seqcount_end(s)                                        \
do {                                                                        \
        do_raw_write_seqcount_end(seqprop_ptr(s));                        \
                                                                        \
        /* Undo the preempt_disable() done by the begin side */                \
        if (seqprop_preemptible(s))                                        \
                preempt_enable();                                        \
} while (0)

/* Closes a write section on a plain seqcount_t; no lockdep annotation. */
static inline void do_raw_write_seqcount_end(seqcount_t *s)
{
        /* Write barrier between the preceding stores and the counter bump */
        smp_wmb();
        /* An odd count becomes even again: the write section is closed */
        s->sequence++;
        kcsan_nestable_atomic_end();
}

/**
 * write_seqcount_begin_nested() - start a seqcount_t write section with
 *                                 custom lockdep nesting level
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @subclass: lockdep nesting level
 *
 * See Documentation/locking/lockdep-design.rst
 * Context: check write_seqcount_begin()
 */
#define write_seqcount_begin_nested(s, subclass)                        \
do {                                                                        \
        /* Type-specific lockdep check, see the __seqprop_*_assert() helpers */ \
        seqprop_assert(s);                                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_write_seqcount_begin_nested(seqprop_ptr(s), subclass);        \
} while (0)

/* Lockdep acquire with nesting level @subclass, then the raw write begin. */
static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass)
{
        seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
        do_raw_write_seqcount_begin(s);
}

/**
 * write_seqcount_begin() - start a seqcount_t write side critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Context: sequence counter write side sections must be serialized and
 * non-preemptible. Preemption will be automatically disabled if and
 * only if the seqcount write serialization lock is associated, and
 * preemptible.  If readers can be invoked from hardirq or softirq
 * context, interrupts or bottom halves must be respectively disabled.
 */
#define write_seqcount_begin(s)                                                \
do {                                                                        \
        /* Type-specific lockdep check, see the __seqprop_*_assert() helpers */ \
        seqprop_assert(s);                                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_write_seqcount_begin(seqprop_ptr(s));                        \
} while (0)

/* Same as do_write_seqcount_begin_nested() with lockdep subclass 0. */
static inline void do_write_seqcount_begin(seqcount_t *s)
{
        do_write_seqcount_begin_nested(s, 0);
}

/**
 * write_seqcount_end() - end a seqcount_t write side critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Context: Preemption will be automatically re-enabled if and only if
 * the seqcount write serialization lock is associated, and preemptible.
 */
#define write_seqcount_end(s)                                                \
do {                                                                        \
        do_write_seqcount_end(seqprop_ptr(s));                                \
                                                                        \
        /* Undo the preempt_disable() done by write_seqcount_begin() */        \
        if (seqprop_preemptible(s))                                        \
                preempt_enable();                                        \
} while (0)

/* Lockdep release, then the raw write end on the plain seqcount_t. */
static inline void do_write_seqcount_end(seqcount_t *s)
{
        seqcount_release(&s->dep_map, _RET_IP_);
        do_raw_write_seqcount_end(s);
}

/**
 * raw_write_seqcount_barrier() - do a seqcount_t write barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * This can be used to provide an ordering guarantee instead of the usual
 * consistency guarantee. It is one wmb cheaper, because it can collapse
 * the two back-to-back wmb()s.
 *
 * Note that writes surrounding the barrier should be declared atomic (e.g.
 * via WRITE_ONCE): a) to ensure the writes become visible to other threads
 * atomically, avoiding compiler optimizations; b) to document which writes are
 * meant to propagate to the reader critical section. This is necessary because
 * neither writes before nor after the barrier are enclosed in a seq-writer
 * critical section that would ensure readers are aware of ongoing writes::
 *
 *        seqcount_t seq;
 *        bool X = true, Y = false;
 *
 *        void read(void)
 *        {
 *                bool x, y;
 *
 *                do {
 *                        unsigned s = read_seqcount_begin(&seq);
 *
 *                        x = X; y = Y;
 *
 *                } while (read_seqcount_retry(&seq, s));
 *
 *                BUG_ON(!x && !y);
 *      }
 *
 *      void write(void)
 *      {
 *                WRITE_ONCE(Y, true);
 *
 *                raw_write_seqcount_barrier(&seq);
 *
 *                WRITE_ONCE(X, false);
 *      }
 */
#define raw_write_seqcount_barrier(s)                                        \
        do_raw_write_seqcount_barrier(seqprop_ptr(s))

/*
 * Two counter increments separated by a single smp_wmb(): the count ends up
 * advanced by 2, with no write section held open in between.
 */
static inline void do_raw_write_seqcount_barrier(seqcount_t *s)
{
        kcsan_nestable_atomic_begin();
        s->sequence++;
        smp_wmb();
        s->sequence++;
        kcsan_nestable_atomic_end();
}

/**
 * write_seqcount_invalidate() - invalidate in-progress seqcount_t read
 *                               side operations
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * After write_seqcount_invalidate, no seqcount_t read side operations
 * will complete successfully and see data older than this.
 */
#define write_seqcount_invalidate(s)                                        \
        do_write_seqcount_invalidate(seqprop_ptr(s))

/* Write barrier, then advance the counter by 2 in a single step. */
static inline void do_write_seqcount_invalidate(seqcount_t *s)
{
        smp_wmb();
        kcsan_nestable_atomic_begin();
        /* +2 keeps the parity but differs from any value sampled earlier */
        s->sequence+=2;
        kcsan_nestable_atomic_end();
}

/*
 * Latch sequence counters (seqcount_latch_t)
 *
 * A sequence counter variant where the counter even/odd value is used to
 * switch between two copies of protected data. This allows the read path,
 * typically NMIs, to safely interrupt the write side critical section.
 *
 * As the write sections are fully preemptible, no special handling for
 * PREEMPT_RT is needed.
 */
typedef struct {
        /* Underlying counter; readers use its lowest bit to pick a data copy */
        seqcount_t seqcount;
} seqcount_latch_t;

/**
 * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t
 * @seq_name: Name of the seqcount_latch_t instance
 *
 * Initializes the embedded seqcount_t with SEQCNT_ZERO().
 */
#define SEQCNT_LATCH_ZERO(seq_name) {                                        \
        .seqcount                = SEQCNT_ZERO(seq_name.seqcount),        \
}

/**
 * seqcount_latch_init() - runtime initializer for seqcount_latch_t
 * @s: Pointer to the seqcount_latch_t instance
 *
 * Forwards to seqcount_init() on the embedded seqcount_t.
 */
#define seqcount_latch_init(s) seqcount_init(&(s)->seqcount)

/**
 * raw_read_seqcount_latch() - pick even/odd latch data copy
 * @s: Pointer to seqcount_latch_t
 *
 * See raw_write_seqcount_latch() for details and a full reader/writer
 * usage example.
 *
 * Return: sequence counter raw value. Use the lowest bit as an index for
 * picking which data copy to read. The full counter must then be checked
 * with raw_read_seqcount_latch_retry().
 */
static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
{
        unsigned seq;

        /*
         * This load pairs with the first smp_wmb() in
         * raw_write_seqcount_latch(). The caller's data access depends on
         * the value loaded here, so no full smp_rmb() is required.
         */
        seq = READ_ONCE(s->seqcount.sequence);

        return seq;
}

/**
 * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section
 * @s:                Pointer to seqcount_latch_t
 * @start:        count, from raw_read_seqcount_latch()
 *
 * Return: true if a read section retry is required, else false
 */
static __always_inline int
raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
{
        unsigned now;

        /* Read barrier ahead of re-sampling the counter */
        smp_rmb();
        now = READ_ONCE(s->seqcount.sequence);

        /* A changed counter means the section must be retried */
        return unlikely(now != start);
}

/**
 * raw_write_seqcount_latch() - redirect latch readers to even/odd copy
 * @s: Pointer to seqcount_latch_t
 *
 * The latch technique is a multiversion concurrency control method that allows
 * queries during non-atomic modifications. If you can guarantee queries never
 * interrupt the modification -- e.g. the concurrency is strictly between CPUs
 * -- you most likely do not need this.
 *
 * Where the traditional RCU/lockless data structures rely on atomic
 * modifications to ensure queries observe either the old or the new state the
 * latch allows the same for non-atomic updates. The trade-off is doubling the
 * cost of storage; we have to maintain two copies of the entire data
 * structure.
 *
 * Very simply put: we first modify one copy and then the other. This ensures
 * there is always one copy in a stable state, ready to give us an answer.
 *
 * The basic form is a data structure like::
 *
 *        struct latch_struct {
 *                seqcount_latch_t        seq;
 *                struct data_struct        data[2];
 *        };
 *
 * Where a modification, which is assumed to be externally serialized, does the
 * following::
 *
 *        void latch_modify(struct latch_struct *latch, ...)
 *        {
 *                smp_wmb();        // Ensure that the last data[1] update is visible
 *                latch->seq.sequence++;
 *                smp_wmb();        // Ensure that the seqcount update is visible
 *
 *                modify(latch->data[0], ...);
 *
 *                smp_wmb();        // Ensure that the data[0] update is visible
 *                latch->seq.sequence++;
 *                smp_wmb();        // Ensure that the seqcount update is visible
 *
 *                modify(latch->data[1], ...);
 *        }
 *
 * The query will have a form like::
 *
 *        struct entry *latch_query(struct latch_struct *latch, ...)
 *        {
 *                struct entry *entry;
 *                unsigned seq, idx;
 *
 *                do {
 *                        seq = raw_read_seqcount_latch(&latch->seq);
 *
 *                        idx = seq & 0x01;
 *                        entry = data_query(latch->data[idx], ...);
 *
 *                // This includes needed smp_rmb()
 *                } while (raw_read_seqcount_latch_retry(&latch->seq, seq));
 *
 *                return entry;
 *        }
 *
 * So during the modification, queries are first redirected to data[1]. Then we
 * modify data[0]. When that is complete, we redirect queries back to data[0]
 * and we can modify data[1].
 *
 * NOTE:
 *
 *        The non-requirement for atomic modifications does _NOT_ include
 *        the publishing of new entries in the case where data is a dynamic
 *        data structure.
 *
 *        An iteration might start in data[0] and get suspended long enough
 *        to miss an entire modification sequence, once it resumes it might
 *        observe the new entry.
 *
 * NOTE2:
 *
 *        When data is a dynamic data structure; one should use regular RCU
 *        patterns to manage the lifetimes of the objects within.
 */
static inline void raw_write_seqcount_latch(seqcount_latch_t *s)
{
        /*
         * A single increment flips the lowest bit, which latch readers use
         * as the index of the data copy to read.
         */
        smp_wmb();        /* prior stores before incrementing "sequence" */
        s->seqcount.sequence++;
        smp_wmb();      /* increment "sequence" before following stores */
}

/*
 * Static initializer for the seqlock_t named @lockname: the spinlock
 * flavoured seqcount is associated with the embedded .lock, which itself
 * starts out unlocked.
 */
#define __SEQLOCK_UNLOCKED(lockname)                                        \
        {                                                                \
                .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \
                .lock =        __SPIN_LOCK_UNLOCKED(lockname)                        \
        }

/**
 * seqlock_init() - dynamic initializer for seqlock_t
 * @sl: Pointer to the seqlock_t instance
 *
 * Initializes the embedded spinlock first, then the embedded seqcount with
 * that spinlock as its associated lock.
 */
#define seqlock_init(sl)                                                \
        do {                                                                \
                spin_lock_init(&(sl)->lock);                                \
                seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock);        \
        } while (0)

/**
 * DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t
 * @sl: Name of the seqlock_t instance
 *
 * Expands to a seqlock_t definition initialized by __SEQLOCK_UNLOCKED().
 */
#define DEFINE_SEQLOCK(sl) \
                seqlock_t sl = __SEQLOCK_UNLOCKED(sl)

/**
 * read_seqbegin() - start a seqlock_t read side critical section
 * @sl: Pointer to seqlock_t
 *
 * Return: count, to be passed to read_seqretry()
 */
static inline unsigned read_seqbegin(const seqlock_t *sl)
{
        const unsigned start = read_seqcount_begin(&sl->seqcount);

        /*
         * Non-raw usage: a matching read_seqretry() is assumed to close
         * the section, so open the KCSAN flat-atomic region here.
         */
        kcsan_atomic_next(0);
        kcsan_flat_atomic_begin();

        return start;
}

/**
 * read_seqretry() - end a seqlock_t read side section
 * @sl: Pointer to seqlock_t
 * @start: count, from read_seqbegin()
 *
 * read_seqretry closes the read side critical section of given seqlock_t.
 * If the critical section was invalid, it must be ignored (and typically
 * retried).
 *
 * Return: true if a read section retry is required, else false
 */
static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
{
        unsigned must_retry;

        /*
         * Read sections are assumed not to nest, because read_seqretry()
         * may run more than once while finishing a single read section.
         */
        kcsan_flat_atomic_end();

        must_retry = read_seqcount_retry(&sl->seqcount, start);
        return must_retry;
}

/*
 * For all seqlock_t write side functions, use the internal
 * do_write_seqcount_begin() instead of generic write_seqcount_begin().
 * This way, no redundant lockdep_assert_held() checks are added.
 */

/**
 * write_seqlock() - start a seqlock_t write side critical section
 * @sl: Pointer to seqlock_t
 *
 * write_seqlock opens a write side critical section for the given
 * seqlock_t.  It also implicitly acquires the spinlock_t embedded inside
 * that sequential lock. All seqlock_t write side sections are thus
 * automatically serialized and non-preemptible.
 *
 * Context: if the seqlock_t read section, or other write side critical
 * sections, can be invoked from hardirq or softirq contexts, use the
 * _irqsave or _bh variants of this function instead.
 */
static inline void write_seqlock(seqlock_t *sl)
{
        /* Take the embedded spinlock first so writers are serialized ... */
        spin_lock(&sl->lock);
        /* ... then open the write section on the embedded sequence counter. */
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock() - end a seqlock_t write side critical section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock closes the (serialized and non-preemptible) write side
 * critical section of given seqlock_t.
 */
static inline void write_sequnlock(seqlock_t *sl)
{
        /* Close the write section while the spinlock is still held ... */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        /* ... and only then release the lock taken by write_seqlock(). */
        spin_unlock(&sl->lock);
}

/**
 * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * _bh variant of write_seqlock(). Use only if the read side section, or
 * other write side sections, can be invoked from softirq contexts.
 */
static inline void write_seqlock_bh(seqlock_t *sl)
{
        /* Same order as write_seqlock(), using the _bh spinlock variant. */
        spin_lock_bh(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock_bh closes the serialized, non-preemptible, and
 * softirqs-disabled, seqlock_t write side critical section opened with
 * write_seqlock_bh().
 */
static inline void write_sequnlock_bh(seqlock_t *sl)
{
        /* End the write section before dropping the lock via spin_unlock_bh(). */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_bh(&sl->lock);
}

/**
 * write_seqlock_irq() - start a non-interruptible seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * _irq variant of write_seqlock(). Use only if the read side section, or
 * other write sections, can be invoked from hardirq contexts.
 */
static inline void write_seqlock_irq(seqlock_t *sl)
{
        /* Same order as write_seqlock(), using the _irq spinlock variant. */
        spin_lock_irq(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock_irq() - end a non-interruptible seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock_irq closes the serialized and non-interruptible
 * seqlock_t write side section opened with write_seqlock_irq().
 */
static inline void write_sequnlock_irq(seqlock_t *sl)
{
        /* End the write section before dropping the lock via spin_unlock_irq(). */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_irq(&sl->lock);
}

/*
 * Helper behind the write_seqlock_irqsave() macro: takes the embedded
 * spinlock with spin_lock_irqsave(), opens the write section on the
 * embedded sequence counter, and returns the saved flags so the macro can
 * store them in the caller's variable.
 */
static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
{
        unsigned long flags;

        spin_lock_irqsave(&sl->lock, flags);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
        return flags;
}

/**
 * write_seqlock_irqsave() - start a non-interruptible seqlock_t write
 *                           section
 * @lock:  Pointer to seqlock_t
 * @flags: Stack-allocated storage for saving caller's local interrupt
 *         state, to be passed to write_sequnlock_irqrestore().
 *
 * _irqsave variant of write_seqlock(). Use it only if the read side
 * section, or other write sections, can be invoked from hardirq context.
 *
 * This is a macro so that @flags can be assigned by name; the locking is
 * done by __write_seqlock_irqsave(), whose return value is stored in
 * @flags.
 */
#define write_seqlock_irqsave(lock, flags)                                \
        do { flags = __write_seqlock_irqsave(lock); } while (0)

/**
 * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write
 *                                section
 * @sl:    Pointer to seqlock_t
 * @flags: Caller's saved interrupt state, from write_seqlock_irqsave()
 *
 * write_sequnlock_irqrestore closes the serialized and non-interruptible
 * seqlock_t write section previously opened with write_seqlock_irqsave().
 */
static inline void
write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
{
        /* End the write section first, then unlock and hand @flags back. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_irqrestore(&sl->lock, flags);
}

/**
 * read_seqlock_excl() - begin a seqlock_t locking reader section
 * @sl:        Pointer to seqlock_t
 *
 * read_seqlock_excl opens a seqlock_t locking reader critical section.  A
 * locking reader exclusively locks out *both* other writers *and* other
 * locking readers, but it does not update the embedded sequence number.
 *
 * Locking readers act like a normal spin_lock()/spin_unlock().
 *
 * Context: if the seqlock_t write section, *or other read sections*, can
 * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
 * variant of this function instead.
 *
 * The opened read section must be closed with read_sequnlock_excl().
 */
static inline void read_seqlock_excl(seqlock_t *sl)
{
        /* Only the spinlock is taken; the sequence counter is not touched. */
        spin_lock(&sl->lock);
}

/**
 * read_sequnlock_excl() - end a seqlock_t locking reader critical section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl(seqlock_t *sl)
{
        /* Releases the spinlock taken by read_seqlock_excl(); nothing else. */
        spin_unlock(&sl->lock);
}

/**
 * read_seqlock_excl_bh() - start a seqlock_t locking reader section with
 *                            softirqs disabled
 * @sl: Pointer to seqlock_t
 *
 * _bh variant of read_seqlock_excl(). Use this variant only if the
 * seqlock_t write side section, *or other read sections*, can be invoked
 * from softirq contexts.
 */
static inline void read_seqlock_excl_bh(seqlock_t *sl)
{
        /* Spinlock only, via the _bh variant; the sequence counter is not touched. */
        spin_lock_bh(&sl->lock);
}

/**
 * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking
 *                              reader section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl_bh(seqlock_t *sl)
{
        /* Counterpart of read_seqlock_excl_bh(): drops the lock with spin_unlock_bh(). */
        spin_unlock_bh(&sl->lock);
}

/**
 * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking
 *                             reader section
 * @sl: Pointer to seqlock_t
 *
 * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t
 * write side section, *or other read sections*, can be invoked from a
 * hardirq context.
 */
static inline void read_seqlock_excl_irq(seqlock_t *sl)
{
        /* Spinlock only, via the _irq variant; the sequence counter is not touched. */
        spin_lock_irq(&sl->lock);
}

/**
 * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t
 *                             locking reader section
 * @sl: Pointer to seqlock_t
 */
static inline void read_sequnlock_excl_irq(seqlock_t *sl)
{
        /* Counterpart of read_seqlock_excl_irq(): drops the lock with spin_unlock_irq(). */
        spin_unlock_irq(&sl->lock);
}

/*
 * Helper behind the read_seqlock_excl_irqsave() macro: takes the embedded
 * spinlock with spin_lock_irqsave() and returns the saved flags so the
 * macro can store them in the caller's variable. The sequence counter is
 * not modified.
 */
static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl)
{
        unsigned long flags;

        spin_lock_irqsave(&sl->lock, flags);
        return flags;
}

/**
 * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t
 *                                 locking reader section
 * @lock:  Pointer to seqlock_t
 * @flags: Stack-allocated storage for saving caller's local interrupt
 *         state, to be passed to read_sequnlock_excl_irqrestore().
 *
 * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t
 * write side section, *or other read sections*, can be invoked from a
 * hardirq context.
 *
 * This is a macro so that @flags can be assigned by name; the locking is
 * done by __read_seqlock_excl_irqsave(), whose return value is stored in
 * @flags.
 */
#define read_seqlock_excl_irqsave(lock, flags)                                \
        do { flags = __read_seqlock_excl_irqsave(lock); } while (0)

/**
 * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t
 *                                      locking reader section
 * @sl:    Pointer to seqlock_t
 * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave()
 */
static inline void
read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
{
        /* Drops the spinlock and hands @flags back to spin_unlock_irqrestore(). */
        spin_unlock_irqrestore(&sl->lock, flags);
}

/**
 * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader
 * @lock: Pointer to seqlock_t
 * @seq : Marker and return parameter. If the passed value is even, the
 * reader will become a *lockless* seqlock_t reader as in read_seqbegin().
 * If the passed value is odd, the reader will become a *locking* reader
 * as in read_seqlock_excl().  In the first call to this function, the
 * caller *must* initialize and pass an even value to @seq; this way, a
 * lockless read can be optimistically tried first.
 *
 * read_seqbegin_or_lock is an API designed to optimistically try a normal
 * lockless seqlock_t read section first.  If an odd counter is found, the
 * lockless read trial has failed, and the next read iteration transforms
 * itself into a full seqlock_t locking reader.
 *
 * This is typically used to avoid starvation of lockless seqlock_t readers
 * (too many retry loops) in the case of a sharp spike in write side
 * activity.
 *
 * Context: if the seqlock_t write section, *or other read sections*, can
 * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
 * variant of this function instead.
 *
 * Check Documentation/locking/seqlock.rst for template example code.
 *
 * Return: the encountered sequence counter value, through the @seq
 * parameter, which is overloaded as a return parameter. This returned
 * value must be checked with need_seqretry(). If the read section needs to
 * be retried, this returned value must also be passed as the @seq
 * parameter of the next read_seqbegin_or_lock() iteration.
 */
static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
{
        if (*seq & 1) {
                /* Odd marker: become a locking reader; *seq is left as is. */
                read_seqlock_excl(lock);
                return;
        }

        /* Even marker: lockless attempt, hand the sampled count back. */
        *seq = read_seqbegin(lock);
}

/**
 * need_seqretry() - validate seqlock_t "locking or lockless" read section
 * @lock: Pointer to seqlock_t
 * @seq: sequence count, from read_seqbegin_or_lock()
 *
 * Return: true if a read section retry is required, false otherwise
 */
static inline int need_seqretry(seqlock_t *lock, int seq)
{
        /* An odd marker means the section ran as a locking reader: no retry. */
        if (seq & 1)
                return 0;

        /* Lockless pass: a retry is needed iff read_seqretry() says so. */
        return read_seqretry(lock, seq) != 0;
}

/**
 * done_seqretry() - end seqlock_t "locking or lockless" reader section
 * @lock: Pointer to seqlock_t
 * @seq: count, from read_seqbegin_or_lock()
 *
 * done_seqretry finishes the seqlock_t read side critical section started
 * with read_seqbegin_or_lock() and validated by need_seqretry().
 */
static inline void done_seqretry(seqlock_t *lock, int seq)
{
        /* Even marker: the pass was lockless, so there is nothing to release. */
        if (!(seq & 1))
                return;

        read_sequnlock_excl(lock);
}

/**
 * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or
 *                                   a non-interruptible locking reader
 * @lock: Pointer to seqlock_t
 * @seq:  Marker and return parameter. Check read_seqbegin_or_lock().
 *
 * This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if
 * the seqlock_t write section, *or other read sections*, can be invoked
 * from hardirq context.
 *
 * Note: Interrupts will be disabled only for "locking reader" mode.
 *
 * Return:
 *
 *   1. The saved local interrupts state in case of a locking reader, to
 *      be passed to done_seqretry_irqrestore().
 *
 *   2. The encountered sequence counter value, returned through @seq
 *      overloaded as a return parameter. Check read_seqbegin_or_lock().
 */
static inline unsigned long
read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
{
        /* Stays 0 on the lockless path, where no interrupt state is saved. */
        unsigned long flags = 0;

        if (*seq & 1) {
                /* Odd marker: locking reader with the interrupt state saved. */
                read_seqlock_excl_irqsave(lock, flags);
        } else {
                /* Even marker: lockless attempt, hand the sampled count back. */
                *seq = read_seqbegin(lock);
        }

        return flags;
}

/**
 * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a
 *                                non-interruptible locking reader section
 * @lock:  Pointer to seqlock_t
 * @seq:   Count, from read_seqbegin_or_lock_irqsave()
 * @flags: Caller's saved local interrupt state in case of a locking
 *           reader, also from read_seqbegin_or_lock_irqsave()
 *
 * This is the _irqrestore variant of done_seqretry(). The read section
 * must've been opened with read_seqbegin_or_lock_irqsave(), and validated
 * by need_seqretry().
 */
static inline void
done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
{
        /* Even marker: the pass was lockless, so @flags is unused. */
        if (!(seq & 1))
                return;

        read_sequnlock_excl_irqrestore(lock, flags);
}
#endif /* __LINUX_SEQLOCK_H */
















































































































































































    1 































    1 




    1 





    1 

















    1 




    1 






    1 

    1 
    1 







    1 





























    1 


































    1 


    1 



























































    1 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/readpage.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2015, Google, Inc.
 *
 * This was originally taken from fs/mpage.c
 *
 * The ext4_mpage_readpages() function here is intended to
 * replace mpage_readahead() in the general case, not just for
 * encrypted files.  It has some limitations (see below), where it
 * will fall back to block_read_full_folio(), but these limitations
 * should only be hit when page_size != block_size.
 *
 * This will allow us to attach a callback function to support ext4
 * encryption.
 *
 * If anything unusual happens, such as:
 *
 * - encountering a page which has buffers
 * - encountering a page which has a non-hole after a hole
 * - encountering a page with non-contiguous blocks
 *
 * then this code just gives up and calls the buffer_head-based read function.
 * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file on blocksize < PAGE_SIZE setups.
 *
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/kdev_t.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>

#include "ext4.h"

/*
 * Element count handed to mempool_create_slab_pool() when the post-read
 * context pool is created in ext4_init_post_read_processing().
 */
#define NUM_PREALLOC_POST_READ_CTXS        128

/* Slab cache for struct bio_post_read_ctx; the mempool below is built on it. */
static struct kmem_cache *bio_post_read_ctx_cache;
/* Pool that struct bio_post_read_ctx objects are allocated from and freed to. */
static mempool_t *bio_post_read_ctx_pool;

/*
 * Postprocessing steps for read bios.  The values double as bit positions
 * in bio_post_read_ctx.enabled_steps and as the ordering walked by
 * bio_post_read_processing().
 */
enum bio_post_read_step {
        /* Value mpage_end_io() stores in cur_step before processing starts. */
        STEP_INITIAL = 0,
        /* Decryption, run by decrypt_work(). */
        STEP_DECRYPT,
        /* Verity check, run by verity_work(); must stay the last real step. */
        STEP_VERITY,
        /* One past the last step; checked by BUILD_BUG_ON() in verity_work(). */
        STEP_MAX,
};

/*
 * Per-bio post-read state.  Allocated from bio_post_read_ctx_pool by
 * ext4_set_bio_post_read_ctx() and attached to the bio through bi_private.
 */
struct bio_post_read_ctx {
        /* The bio this context was attached to. */
        struct bio *bio;
        /* Work item queued for whichever step is currently scheduled. */
        struct work_struct work;
        /* enum bio_post_read_step value reached so far. */
        unsigned int cur_step;
        /* Bitmask of (1 << STEP_*) values selecting the steps to run. */
        unsigned int enabled_steps;
};

static void __read_end_io(struct bio *bio)
{
        struct folio_iter fi;

        bio_for_each_folio_all(fi, bio)
                folio_end_read(fi.folio, bio->bi_status == 0);
        if (bio->bi_private)
                mempool_free(bio->bi_private, bio_post_read_ctx_pool);
        bio_put(bio);
}

/*
 * Forward declaration: decrypt_work() calls this function, which in turn
 * may schedule decrypt_work(), so one of the two has to be declared early.
 */
static void bio_post_read_processing(struct bio_post_read_ctx *ctx);

/*
 * Work handler for the decryption step.  When fscrypt_decrypt_bio() returns
 * true, processing moves on to the next post-read step; otherwise the bio is
 * completed right away.
 */
static void decrypt_work(struct work_struct *work)
{
        struct bio_post_read_ctx *ctx;
        struct bio *bio;

        ctx = container_of(work, struct bio_post_read_ctx, work);
        bio = ctx->bio;

        if (!fscrypt_decrypt_bio(bio)) {
                __read_end_io(bio);
                return;
        }

        bio_post_read_processing(ctx);
}

/*
 * Work handler for the verity step: releases the post-read context, runs
 * fsverity_verify_bio() on the bio and then completes the read.
 */
static void verity_work(struct work_struct *work)
{
        struct bio_post_read_ctx *ctx =
                container_of(work, struct bio_post_read_ctx, work);
        struct bio *bio = ctx->bio;

        /*
         * fsverity_verify_bio() may call readahead() again, and although verity
         * will be disabled for that, decryption may still be needed, causing
         * another bio_post_read_ctx to be allocated.  So to guarantee that
         * mempool_alloc() never deadlocks we must free the current ctx first.
         * This is safe because verity is the last post-read step.
         */
        BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX);
        mempool_free(ctx, bio_post_read_ctx_pool);
        /* Clear the pointer so __read_end_io() does not free ctx a second time. */
        bio->bi_private = NULL;

        fsverity_verify_bio(bio);

        __read_end_io(bio);
}

/*
 * Advance @ctx to its next post-read step.  If that step is enabled, the
 * matching work item is queued and the function returns; the work handler
 * carries on from there.  Steps that are not enabled are skipped, and once
 * no step is left the bio is completed with __read_end_io().
 */
static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
{
        /*
         * We use different work queues for decryption and for verity because
         * verity may require reading metadata pages that need decryption, and
         * we shouldn't recurse to the same workqueue.
         */
        /* Pre-increment: move past the step that was just finished. */
        switch (++ctx->cur_step) {
        case STEP_DECRYPT:
                if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
                        INIT_WORK(&ctx->work, decrypt_work);
                        fscrypt_enqueue_decrypt_work(&ctx->work);
                        return;
                }
                /* Decryption not enabled: skip it and consider the next step. */
                ctx->cur_step++;
                fallthrough;
        case STEP_VERITY:
                if (ctx->enabled_steps & (1 << STEP_VERITY)) {
                        INIT_WORK(&ctx->work, verity_work);
                        fsverity_enqueue_verify_work(&ctx->work);
                        return;
                }
                /* Verity not enabled: skip it as well. */
                ctx->cur_step++;
                fallthrough;
        default:
                /* Nothing left to run: finish the read. */
                __read_end_io(ctx->bio);
        }
}

static bool bio_post_read_required(struct bio *bio)
{
        return bio->bi_private && !bio->bi_status;
}

/*
 * I/O completion handler for multipage BIOs.
 *
 * The mpage code never puts partial pages into a BIO (except for end-of-file).
 * If a page does not map to a contiguous run of blocks then it simply falls
 * back to block_read_full_folio().
 *
 * Why is this?  If a page's completion depends on a number of different BIOs
 * which can complete in any order (or at the same time) then determining the
 * status of that page is hard.  See end_buffer_async_read() for the details.
 * There is no point in duplicating all that complexity.
 */
static void mpage_end_io(struct bio *bio)
{
        struct bio_post_read_ctx *ctx;

        /* No context attached, or the read failed: complete immediately. */
        if (!bio_post_read_required(bio)) {
                __read_end_io(bio);
                return;
        }

        /* Restart the step counter and kick off the first enabled step. */
        ctx = bio->bi_private;
        ctx->cur_step = STEP_INITIAL;
        bio_post_read_processing(ctx);
}

/*
 * True when fs-verity is active on @inode and page index @idx lies below
 * i_size rounded up to a whole number of pages.
 */
static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx)
{
        if (!fsverity_active(inode))
                return false;

        return idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
}

/*
 * Work out which post-read steps @bio needs for @inode (decryption and/or
 * verity, the latter judged at page index @first_idx).  If any step is
 * needed, a context recording them is allocated and attached to the bio via
 * bi_private; otherwise the bio is left untouched.
 */
static void ext4_set_bio_post_read_ctx(struct bio *bio,
                                       const struct inode *inode,
                                       pgoff_t first_idx)
{
        struct bio_post_read_ctx *ctx;
        unsigned int steps = 0;

        if (fscrypt_inode_uses_fs_layer_crypto(inode))
                steps |= 1 << STEP_DECRYPT;
        if (ext4_need_verity(inode, first_idx))
                steps |= 1 << STEP_VERITY;

        if (!steps)
                return;

        /* Due to the mempool, this never fails. */
        ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
        ctx->bio = bio;
        ctx->enabled_steps = steps;
        bio->bi_private = ctx;
}

/*
 * Byte offset up to which reads are mapped: the inode size in the ordinary
 * case, or the superblock's s_maxbytes when verity support is built in and
 * @inode is a verity inode.
 */
static inline loff_t ext4_readpage_limit(struct inode *inode)
{
        if (!IS_ENABLED(CONFIG_FS_VERITY) || !IS_VERITY(inode))
                return i_size_read(inode);

        return inode->i_sb->s_maxbytes;
}

/*
 * ext4_mpage_readpages - read one folio, or a readahead batch, using bios
 * @inode: inode the folios belong to
 * @rac:   readahead control, or NULL for a single-folio read
 * @folio: folio to read when @rac is NULL; replaced by readahead_folio()
 *         on every iteration otherwise
 *
 * Walks nr_pages folios (readahead_count(rac), or 1 without @rac), maps
 * each folio's blocks with ext4_map_blocks() and adds the folio to the
 * current bio as long as its blocks directly follow the previous folio's.
 * Folios this code cannot handle (buffers already attached, a mapped block
 * after a hole, non-contiguous blocks) take the "confused" path and are
 * read with block_read_full_folio() instead.
 *
 * Return: always 0.
 */
int ext4_mpage_readpages(struct inode *inode,
                struct readahead_control *rac, struct folio *folio)
{
        struct bio *bio = NULL;
        sector_t last_block_in_bio = 0;

        const unsigned blkbits = inode->i_blkbits;
        const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
        const unsigned blocksize = 1 << blkbits;
        sector_t next_block;
        sector_t block_in_file;
        sector_t last_block;
        sector_t last_block_in_file;
        /* Physical block number for each block slot of the current folio. */
        sector_t blocks[MAX_BUF_PER_PAGE];
        unsigned page_block;
        struct block_device *bdev = inode->i_sb->s_bdev;
        int length;
        unsigned relative_block = 0;
        struct ext4_map_blocks map;
        unsigned int nr_pages = rac ? readahead_count(rac) : 1;

        /* Start with an empty mapping so the reuse test below fails at first. */
        map.m_pblk = 0;
        map.m_lblk = 0;
        map.m_len = 0;
        map.m_flags = 0;

        for (; nr_pages; nr_pages--) {
                int fully_mapped = 1;
                /* Index of the first unmapped block; blocks_per_page == none. */
                unsigned first_hole = blocks_per_page;

                if (rac)
                        folio = readahead_folio(rac);
                prefetchw(&folio->flags);

                if (folio_buffers(folio))
                        goto confused;

                block_in_file = next_block =
                        (sector_t)folio->index << (PAGE_SHIFT - blkbits);
                /*
                 * Mapping window: up to the end of the remaining folios,
                 * clamped to the read limit rounded up to a whole block.
                 */
                last_block = block_in_file + nr_pages * blocks_per_page;
                last_block_in_file = (ext4_readpage_limit(inode) +
                                      blocksize - 1) >> blkbits;
                if (last_block > last_block_in_file)
                        last_block = last_block_in_file;
                page_block = 0;

                /*
                 * Map blocks using the previous result first.
                 */
                if ((map.m_flags & EXT4_MAP_MAPPED) &&
                    block_in_file > map.m_lblk &&
                    block_in_file < (map.m_lblk + map.m_len)) {
                        unsigned map_offset = block_in_file - map.m_lblk;
                        unsigned last = map.m_len - map_offset;

                        for (relative_block = 0; ; relative_block++) {
                                if (relative_block == last) {
                                        /* needed? */
                                        map.m_flags &= ~EXT4_MAP_MAPPED;
                                        break;
                                }
                                if (page_block == blocks_per_page)
                                        break;
                                blocks[page_block] = map.m_pblk + map_offset +
                                        relative_block;
                                page_block++;
                                block_in_file++;
                        }
                }

                /*
                 * Then do more ext4_map_blocks() calls until we are
                 * done with this folio.
                 */
                while (page_block < blocks_per_page) {
                        if (block_in_file < last_block) {
                                map.m_lblk = block_in_file;
                                map.m_len = last_block - block_in_file;

                                if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
                                /*
                                 * Also the target of the verity failure
                                 * below: zero the whole folio and unlock it
                                 * without ending the read as successful.
                                 */
                                set_error_page:
                                        folio_zero_segment(folio, 0,
                                                          folio_size(folio));
                                        folio_unlock(folio);
                                        goto next_page;
                                }
                        }
                        if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
                                /* Unmapped block: record where the hole starts. */
                                fully_mapped = 0;
                                if (first_hole == blocks_per_page)
                                        first_hole = page_block;
                                page_block++;
                                block_in_file++;
                                continue;
                        }
                        if (first_hole != blocks_per_page)
                                goto confused;                /* hole -> non-hole */

                        /* Contiguous blocks? */
                        if (page_block && blocks[page_block-1] != map.m_pblk-1)
                                goto confused;
                        for (relative_block = 0; ; relative_block++) {
                                if (relative_block == map.m_len) {
                                        /* needed? */
                                        map.m_flags &= ~EXT4_MAP_MAPPED;
                                        break;
                                } else if (page_block == blocks_per_page)
                                        break;
                                blocks[page_block] = map.m_pblk+relative_block;
                                page_block++;
                                block_in_file++;
                        }
                }
                if (first_hole != blocks_per_page) {
                        /* Zero everything from the first hole to the folio end. */
                        folio_zero_segment(folio, first_hole << blkbits,
                                          folio_size(folio));
                        if (first_hole == 0) {
                                /* Folio is all hole: no I/O, finish it here. */
                                if (ext4_need_verity(inode, folio->index) &&
                                    !fsverity_verify_folio(folio))
                                        goto set_error_page;
                                folio_end_read(folio, true);
                                continue;
                        }
                } else if (fully_mapped) {
                        folio_set_mappedtodisk(folio);
                }

                /*
                 * This folio will go to BIO.  Do we need to send this
                 * BIO off first?
                 */
                if (bio && (last_block_in_bio != blocks[0] - 1 ||
                            !fscrypt_mergeable_bio(bio, inode, next_block))) {
                /* Also reached when bio_add_folio() below fails. */
                submit_and_realloc:
                        submit_bio(bio);
                        bio = NULL;
                }
                if (bio == NULL) {
                        /*
                         * bio_alloc will _always_ be able to allocate a bio if
                         * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
                         */
                        bio = bio_alloc(bdev, bio_max_segs(nr_pages),
                                        REQ_OP_READ, GFP_KERNEL);
                        fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
                                                  GFP_KERNEL);
                        ext4_set_bio_post_read_ctx(bio, inode, folio->index);
                        /* Convert the first block number to 512-byte sectors. */
                        bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
                        bio->bi_end_io = mpage_end_io;
                        if (rac)
                                bio->bi_opf |= REQ_RAHEAD;
                }

                /* Only the mapped part before the first hole is read. */
                length = first_hole << blkbits;
                if (!bio_add_folio(bio, folio, length, 0))
                        goto submit_and_realloc;

                /*
                 * Submit now if the mapping ended on a boundary or the folio
                 * has a hole at its end; otherwise remember the last block so
                 * the next folio can be checked for contiguity.
                 */
                if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
                     (relative_block == map.m_len)) ||
                    (first_hole != blocks_per_page)) {
                        submit_bio(bio);
                        bio = NULL;
                } else
                        last_block_in_bio = blocks[blocks_per_page - 1];
                continue;
        confused:
                /* Flush what has been built, then use the buffer_head path. */
                if (bio) {
                        submit_bio(bio);
                        bio = NULL;
                }
                if (!folio_test_uptodate(folio))
                        block_read_full_folio(folio, ext4_get_block);
                else
                        folio_unlock(folio);
next_page:
                ; /* A label shall be followed by a statement until C23 */
        }
        /* Send off whatever is still pending after the last folio. */
        if (bio)
                submit_bio(bio);
        return 0;
}

/*
 * Create the slab cache and the mempool used for post-read contexts.
 *
 * Return: 0 on success, -ENOMEM if either allocation fails (the slab cache
 * is destroyed again when only the mempool could not be created).
 */
int __init ext4_init_post_read_processing(void)
{
        bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT);
        if (bio_post_read_ctx_cache == NULL)
                return -ENOMEM;

        bio_post_read_ctx_pool =
                mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS,
                                         bio_post_read_ctx_cache);
        if (bio_post_read_ctx_pool != NULL)
                return 0;

        /* Pool creation failed: undo the cache creation before bailing out. */
        kmem_cache_destroy(bio_post_read_ctx_cache);
        return -ENOMEM;
}

/*
 * Tear down what ext4_init_post_read_processing() set up: the mempool is
 * destroyed first, then the slab cache it was created on.
 */
void ext4_exit_post_read_processing(void)
{
        mempool_destroy(bio_post_read_ctx_pool);
        kmem_cache_destroy(bio_post_read_ctx_cache);
}



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_SCSI_HOST_H
#define _SCSI_SCSI_HOST_H

#include <linux/device.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
#include <linux/blk-mq.h>
#include <scsi/scsi.h>

struct block_device;
struct completion;
struct module;
struct scsi_cmnd;
struct scsi_device;
struct scsi_target;
struct Scsi_Host;
struct scsi_transport_template;


#define SG_ALL        SG_CHUNK_SIZE

#define MODE_UNKNOWN 0x00
#define MODE_INITIATOR 0x01
#define MODE_TARGET 0x02

/**
 * enum scsi_timeout_action - Verdict for a command whose timer has fired.
 * @SCSI_EH_DONE: Nothing left to do; the command has already been completed.
 * @SCSI_EH_RESET_TIMER: Re-arm the timer and keep waiting for completion.
 * @SCSI_EH_NOT_HANDLED: The command is still outstanding; abort it.
 */
enum scsi_timeout_action {
        SCSI_EH_DONE            = 0,
        SCSI_EH_RESET_TIMER     = 1,
        SCSI_EH_NOT_HANDLED     = 2,
};

/*
 * struct scsi_host_template - callbacks and default limits supplied by a
 * SCSI low-level driver (LLD).
 *
 * A template is passed to scsi_host_alloc() and is reachable afterwards
 * through the hostt member of struct Scsi_Host.  Each member below carries
 * its own Status note (REQUIRED / OPTIONAL / OBSOLETE) where the original
 * authors provided one.
 */
struct scsi_host_template {
        /*
         * Put fields referenced in IO submission path together in
         * same cacheline
         */

        /*
         * Additional per-command data allocated for the driver.
         */
        unsigned int cmd_size;

        /*
         * The queuecommand function is used to queue up a scsi
         * command block to the LLDD.  When the driver has finished
         * processing the command the done callback is invoked.
         *
         * If queuecommand returns 0, then the driver has accepted the
         * command.  It must also push it to the HBA if the scsi_cmnd
         * flag SCMD_LAST is set, or if the driver does not implement
         * commit_rqs.  The done() function must be called on the command
         * when the driver has finished with it. (you may call done on the
         * command before queuecommand returns, but in this case you
         * *must* return 0 from queuecommand).
         *
         * Queuecommand may also reject the command, in which case it may
         * not touch the command and must not call done() for it.
         *
         * There are two possible rejection returns:
         *
         *   SCSI_MLQUEUE_DEVICE_BUSY: Block this device temporarily, but
         *   allow commands to other devices serviced by this host.
         *
         *   SCSI_MLQUEUE_HOST_BUSY: Block all devices served by this
         *   host temporarily.
         *
         * For compatibility, any other non-zero return is treated the
         * same as SCSI_MLQUEUE_HOST_BUSY.
         *
         * NOTE: "temporarily" means either until the next command for
         * this device/host completes, or a period of time determined by
         * I/O pressure in the system if there are no other outstanding
         * commands.
         *
         * STATUS: REQUIRED
         */
        int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);

        /*
         * The commit_rqs function is used to trigger a hardware
         * doorbell after some requests have been queued with
         * queuecommand, when an error is encountered before sending
         * the request with SCMD_LAST set.
         *
         * NOTE(review): the u16 argument is presumably the hardware
         * queue index - confirm against the caller.
         *
         * STATUS: OPTIONAL
         */
        void (*commit_rqs)(struct Scsi_Host *, u16);

        /*
         * NOTE(review): presumably the module that owns this template and
         * a human-readable driver name (the name is used when info() is
         * not provided, see below) - confirm.
         */
        struct module *module;
        const char *name;

        /*
         * The info function will return whatever useful information the
         * developer sees fit.  If not provided, then the name field will
         * be used instead.
         *
         * Status: OPTIONAL
         */
        const char *(*info)(struct Scsi_Host *);

        /*
         * Ioctl interface
         *
         * Status: OPTIONAL
         */
        int (*ioctl)(struct scsi_device *dev, unsigned int cmd,
                     void __user *arg);


#ifdef CONFIG_COMPAT
        /*
         * Compat handler. Handle 32bit ABI.
         * When unknown ioctl is passed return -ENOIOCTLCMD.
         *
         * Status: OPTIONAL
         */
        int (*compat_ioctl)(struct scsi_device *dev, unsigned int cmd,
                            void __user *arg);
#endif

        /*
         * NOTE(review): presumably set up / tear down the cmd_size bytes
         * of per-command driver data for one command - confirm against
         * the mid-layer callers.
         */
        int (*init_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd);
        int (*exit_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd);

        /*
         * This is an error handling strategy routine.  You don't need to
         * define one of these if you don't want to - there is a default
         * routine that is present that should work in most cases.  For those
         * driver authors that have the inclination and ability to write their
         * own strategy routine, this is where it is specified.  Note - the
         * strategy routine is *ALWAYS* run in the context of the kernel eh
         * thread.  Thus you are guaranteed to *NOT* be in an interrupt
         * handler when you execute this, and you are also guaranteed to
         * *NOT* have any other commands being queued while you are in the
         * strategy routine. When you return from this function, operations
         * return to normal.
         *
         * See scsi_error.c scsi_unjam_host for additional comments about
         * what this function should and should not be attempting to do.
         *
         * Status: REQUIRED        (at least one of them)
         */
        int (* eh_abort_handler)(struct scsi_cmnd *);
        int (* eh_device_reset_handler)(struct scsi_cmnd *);
        int (* eh_target_reset_handler)(struct scsi_cmnd *);
        int (* eh_bus_reset_handler)(struct scsi_cmnd *);
        int (* eh_host_reset_handler)(struct scsi_cmnd *);

        /*
         * Before the mid layer attempts to scan for a new device where none
         * currently exists, it will call this entry in your driver.  Should
         * your driver need to allocate any structs or perform any other init
         * items in order to send commands to a currently unused target/lun
         * combo, then this is where you can perform those allocations.  This
         * is specifically so that drivers won't have to perform any kind of
         * "is this a new device" checks in their queuecommand routine,
         * thereby making the hot path a bit quicker.
         *
         * Return values: 0 on success, non-0 on failure
         *
         * Deallocation:  If we didn't find any devices at this ID, you will
         * get an immediate call to slave_destroy().  If we find something
         * here then you will get a call to slave_configure(), then the
         * device will be used for however long it is kept around, then when
         * the device is removed from the system (or possibly at reboot
         * time), you will then get a call to slave_destroy().  This is
         * assuming you implement slave_configure and slave_destroy.
         * However, if you allocate memory and hang it off the device struct,
         * then you must implement the slave_destroy() routine at a minimum
         * in order to avoid leaking memory
         * each time a device is torn down.
         *
         * Status: OPTIONAL
         */
        int (* slave_alloc)(struct scsi_device *);

        /*
         * Once the device has responded to an INQUIRY and we know the
         * device is online, we call into the low level driver with the
         * struct scsi_device *.  If the low level device driver implements
         * this function, it *must* perform the task of setting the queue
         * depth on the device.  All other tasks are optional and depend
         * on what the driver supports and various implementation details.
         *
         * Things currently recommended to be handled at this time include:
         *
         * 1.  Setting the device queue depth.  Proper setting of this is
         *     described in the comments for scsi_change_queue_depth.
         * 2.  Determining if the device supports the various synchronous
         *     negotiation protocols.  The device struct will already have
         *     responded to INQUIRY and the results of the standard items
         *     will have been shoved into the various device flag bits, eg.
         *     device->sdtr will be true if the device supports SDTR messages.
         * 3.  Allocating command structs that the device will need.
         * 4.  Setting the default timeout on this device (if needed).
         * 5.  Anything else the low level driver might want to do on a device
         *     specific setup basis...
         * 6.  Return 0 on success, non-0 on error.  The device will be marked
         *     as offline on error so that no access will occur.  If you return
         *     non-0, your slave_destroy routine will never get called for this
         *     device, so don't leave any loose memory hanging around, clean
         *     up after yourself before returning non-0
         *
         * Status: OPTIONAL
         *
         * Note: slave_configure is the legacy version, use device_configure for
         * all new code.  A driver must never define both.
         */
        int (* device_configure)(struct scsi_device *, struct queue_limits *lim);
        int (* slave_configure)(struct scsi_device *);

        /*
         * Immediately prior to deallocating the device and after all activity
         * has ceased the mid layer calls this point so that the low level
         * driver may completely detach itself from the scsi device and vice
         * versa.  The low level driver is responsible for freeing any memory
         * it allocated in the slave_alloc or slave_configure calls.
         *
         * Status: OPTIONAL
         */
        void (* slave_destroy)(struct scsi_device *);

        /*
         * Before the mid layer attempts to scan for a new device attached
         * to a target where no target currently exists, it will call this
         * entry in your driver.  Should your driver need to allocate any
         * structs or perform any other init items in order to send commands
         * to a currently unused target, then this is where you can perform
         * those allocations.
         *
         * Return values: 0 on success, non-0 on failure
         *
         * Status: OPTIONAL
         */
        int (* target_alloc)(struct scsi_target *);

        /*
         * Immediately prior to deallocating the target structure, and
         * after all activity to attached scsi devices has ceased, the
         * midlayer calls this point so that the driver may deallocate
         * and terminate any references to the target.
         *
         * Note: This callback is called with the host lock held and hence
         * must not sleep.
         *
         * Status: OPTIONAL
         */
        void (* target_destroy)(struct scsi_target *);

        /*
         * If a host has the ability to discover targets on its own instead
         * of scanning the entire bus, it can fill in this function and
         * call scsi_scan_host().  This function will be called periodically
         * until it returns 1 with the scsi_host and the elapsed time of
         * the scan in jiffies.
         *
         * Status: OPTIONAL
         */
        int (* scan_finished)(struct Scsi_Host *, unsigned long);

        /*
         * If the host wants to be called before the scan starts, but
         * after the midlayer has set up ready for the scan, it can fill
         * in this function.
         *
         * Status: OPTIONAL
         */
        void (* scan_start)(struct Scsi_Host *);

        /*
         * Fill in this function to allow the queue depth of this host
         * to be changeable (on a per device basis).  Returns either
         * the current queue depth setting (may be different from what
         * was passed in) or an error.  An error should only be
         * returned if the requested depth is legal but the driver was
         * unable to set it.  If the requested depth is illegal, the
         * driver should set and return the closest legal queue depth.
         *
         * Status: OPTIONAL
         */
        int (* change_queue_depth)(struct scsi_device *, int);

        /*
         * This function lets the driver expose the queue mapping
         * to the block layer.
         *
         * Status: OPTIONAL
         */
        void (* map_queues)(struct Scsi_Host *shost);

        /*
         * SCSI interface of blk_poll - poll for IO completions.
         * Only applicable if SCSI LLD exposes multiple h/w queues.
         *
         * Return value: Number of completed entries found.
         *
         * Status: OPTIONAL
         */
        int (* mq_poll)(struct Scsi_Host *shost, unsigned int queue_num);

        /*
         * Check if scatterlists need to be padded for DMA draining.
         *
         * Status: OPTIONAL
         */
        bool (* dma_need_drain)(struct request *rq);

        /*
         * This function determines the BIOS parameters for a given
         * harddisk.  These tend to be numbers that are made up by
         * the host adapter.  Parameters:
         * size, device, list (heads, sectors, cylinders)
         *
         * Status: OPTIONAL
         */
        int (* bios_param)(struct scsi_device *, struct block_device *,
                        sector_t, int []);

        /*
         * This function is called when one or more partitions on the
         * device reach beyond the end of the device.
         *
         * Status: OPTIONAL
         */
        void (*unlock_native_capacity)(struct scsi_device *);

        /*
         * Can be used to export driver statistics and other infos to the
         * world outside the kernel ie. userspace and it also provides an
         * interface to feed the driver with information.
         *
         * Status: OBSOLETE
         */
        int (*show_info)(struct seq_file *, struct Scsi_Host *);
        int (*write_info)(struct Scsi_Host *, char *, int);

        /*
         * This is an optional routine that allows the transport to become
         * involved when a scsi io timer fires. The return value tells the
         * timer routine how to finish the io timeout handling.
         *
         * Status: OPTIONAL
         */
        enum scsi_timeout_action (*eh_timed_out)(struct scsi_cmnd *);
        /*
         * Optional routine that allows the transport to decide if a cmd
         * is retryable. Return true if the transport is in a state the
         * cmd should be retried on.
         */
        bool (*eh_should_retry_cmd)(struct scsi_cmnd *scmd);

        /* This is an optional routine that allows transport to initiate
         * LLD adapter or firmware reset using sysfs attribute.
         *
         * reset_type is one of the SCSI_*_RESET values defined right
         * below the member.
         *
         * Return values: 0 on success, -ve value on failure.
         *
         * Status: OPTIONAL
         */

        int (*host_reset)(struct Scsi_Host *shost, int reset_type);
#define SCSI_ADAPTER_RESET        1
#define SCSI_FIRMWARE_RESET        2


        /*
         * Name of proc directory
         */
        const char *proc_name;

        /*
         * This determines if we will use a non-interrupt driven
         * or an interrupt driven scheme.  It is set to the maximum number
         * of simultaneous commands a single hw queue in HBA will accept.
         */
        int can_queue;

        /*
         * In many instances, especially where disconnect / reconnect are
         * supported, our host also has an ID on the SCSI bus.  If this is
         * the case, then it must be reserved.  Please set this_id to -1 if
         * your setup is in single initiator mode, and the host lacks an
         * ID.
         */
        int this_id;

        /*
         * This determines the degree to which the host adapter is capable
         * of scatter-gather.
         */
        unsigned short sg_tablesize;
        unsigned short sg_prot_tablesize;

        /*
         * Set this if the host adapter has limitations beside segment count.
         */
        unsigned int max_sectors;

        /*
         * Maximum size in bytes of a single segment.
         */
        unsigned int max_segment_size;

        /*
         * NOTE(review): presumably the DMA alignment requirement handed
         * to the block layer; units/mask-vs-size not visible here -
         * confirm.
         */
        unsigned int dma_alignment;

        /*
         * DMA scatter gather segment boundary limit. A segment crossing this
         * boundary will be split in two.
         */
        unsigned long dma_boundary;

        /*
         * NOTE(review): presumably the block layer virt boundary mask for
         * this host - confirm.
         */
        unsigned long virt_boundary_mask;

        /*
         * This specifies "machine infinity" for host templates which don't
         * limit the transfer size.  Note this limit represents an absolute
         * maximum, and may be over the transfer limits allowed for
         * individual devices (e.g. 256 for SCSI-1).
         */
#define SCSI_DEFAULT_MAX_SECTORS        1024

        /*
         * True if this host adapter can make good use of linked commands.
         * This will allow more than one command to be queued to a given
         * unit on a given host.  Set this to the maximum number of command
         * blocks to be provided for each device.  Set this to 1 for one
         * command block per lun, 2 for two, etc.  Do not set this to 0.
         * You should make sure that the host adapter will do the right thing
         * before you try setting this above 1.
         */
        short cmd_per_lun;

        /*
         * If the block layer is used to manage tags, this is the tag
         * allocation policy.
         */
        int tag_alloc_policy;

        /*
         * Track QUEUE_FULL events and reduce queue depth on demand.
         */
        unsigned track_queue_depth:1;

        /*
         * This specifies the mode that a LLD supports.
         * Two bits wide; see the MODE_* defines near the top of this
         * header (MODE_INITIATOR / MODE_TARGET).
         */
        unsigned supported_mode:2;

        /*
         * True for emulated SCSI host adapters (e.g. ATAPI).
         */
        unsigned emulated:1;

        /*
         * True if the low-level driver performs its own reset-settle delays.
         */
        unsigned skip_settle_delay:1;

        /* True if the controller does not support WRITE SAME */
        unsigned no_write_same:1;

        /* True if the host uses host-wide tagspace */
        unsigned host_tagset:1;

        /* The queuecommand callback may block. See also BLK_MQ_F_BLOCKING. */
        unsigned queuecommand_may_block:1;

        /*
         * Countdown for host blocking with no commands outstanding.
         */
        unsigned int max_host_blocked;

        /*
         * Default value for the blocking.  If the queue is empty,
         * host_blocked counts down in the request_fn until it restarts
         * host operations as zero is reached.
         *
         * FIXME: This should probably be a value in the template
         */
#define SCSI_DEFAULT_HOST_BLOCKED        7

        /*
         * Pointer to the SCSI host sysfs attribute groups, NULL terminated.
         */
        const struct attribute_group **shost_groups;

        /*
         * Pointer to the SCSI device attribute groups for this host,
         * NULL terminated.
         */
        const struct attribute_group **sdev_groups;

        /*
         * Vendor Identifier associated with the host
         *
         * Note: When specifying vendor_id, be sure to read the
         *   Vendor Type and ID formatting requirements specified in
         *   scsi_netlink.h
         */
        u64 vendor_id;
};

/*
 * Temporary #define for host lock push down. Can be removed when all
 * drivers have been updated to take advantage of unlocked
 * queuecommand.
 *
 * DEF_SCSI_QCMD(func_name) defines func_name(shost, cmd) as a wrapper
 * that takes shost->host_lock with spin_lock_irqsave(), calls
 * func_name##_lck(cmd) under that lock, releases the lock with
 * spin_unlock_irqrestore() and returns whatever the _lck variant
 * returned.  The function func_name##_lck must be provided by the
 * user of the macro.
 */
#define DEF_SCSI_QCMD(func_name) \
        int func_name(struct Scsi_Host *shost, struct scsi_cmnd *cmd)        \
        {                                                                \
                unsigned long irq_flags;                                \
                int rc;                                                        \
                spin_lock_irqsave(shost->host_lock, irq_flags);                \
                rc = func_name##_lck(cmd);                                \
                spin_unlock_irqrestore(shost->host_lock, irq_flags);        \
                return rc;                                                \
        }


/*
 * Life-cycle states of a SCSI host.  Numbering starts at 1.
 *
 * Anyone changing this list must keep two other places in sync: the
 * ascii descriptions in scsi_sysfs.c and the state model enforcer,
 * scsi_host_set_state().
 */
enum scsi_host_state {
        SHOST_CREATED           = 1,
        SHOST_RUNNING           = 2,
        SHOST_CANCEL            = 3,
        SHOST_DEL               = 4,
        SHOST_RECOVERY          = 5,
        SHOST_CANCEL_RECOVERY   = 6,
        SHOST_DEL_RECOVERY      = 7,
};

/*
 * struct Scsi_Host - state of one SCSI host adapter instance.
 *
 * Instances are returned by scsi_host_alloc().  hostt points at the
 * driver's struct scsi_host_template, and driver-private storage lives in
 * the trailing hostdata[] flexible array (see shost_priv()).
 */
struct Scsi_Host {
        /*
         * __devices is protected by the host_lock, but you should
         * usually use scsi_device_lookup / shost_for_each_device
         * to access it and don't care about locking yourself.
         * In the rare case of being in irq context you can use
         * their __ prefixed variants with the lock held. NEVER
         * access this list directly from a driver.
         */
        struct list_head        __devices;
        struct list_head        __targets;

        struct list_head        starved_list;

        /*
         * host_lock is a pointer so it can refer to a lock other than
         * default_lock.  NOTE(review): presumably it points at
         * default_lock unless a driver overrides it - confirm.
         */
        spinlock_t                default_lock;
        spinlock_t                *host_lock;

        struct mutex                scan_mutex;/* serialize scanning activity */

        struct list_head        eh_abort_list;
        struct list_head        eh_cmd_q;
        struct task_struct    * ehandler;  /* Error recovery thread. */
        struct completion     * eh_action; /* Wait for specific actions on the
                                              host. */
        wait_queue_head_t       host_wait;
        const struct scsi_host_template *hostt;
        struct scsi_transport_template *transportt;

        struct kref                tagset_refcnt;
        struct completion        tagset_freed;
        /* Area to keep a shared tag map */
        struct blk_mq_tag_set        tag_set;

        atomic_t host_blocked;

        unsigned int host_failed;           /* commands that failed.
                                              protected by host_lock */
        unsigned int host_eh_scheduled;    /* EH scheduled without command */

        unsigned int host_no;  /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. */

        /* next two fields are used to bound the time spent in error handling */
        int eh_deadline;
        unsigned long last_reset;


        /*
         * These three parameters can be used to allow for wide scsi,
         * and for host adapters that support multiple busses
         * The last two should be set to 1 more than the actual max id
         * or lun (e.g. 8 for SCSI parallel systems).
         */
        unsigned int max_channel;
        unsigned int max_id;
        u64 max_lun;

        /*
         * This is a unique identifier that must be assigned so that we
         * have some way of identifying each detected host adapter properly
         * and uniquely.  For hosts that do not support more than one card
         * in the system at one time, this does not need to be set.  It is
         * initialized to 0 in scsi_register.
         */
        unsigned int unique_id;

        /*
         * The maximum length of SCSI commands that this host can accept.
         * Probably 12 for most host adapters, but could be 16 for others.
         * or 260 if the driver supports variable length cdbs.
         * For drivers that don't set this field, a value of 12 is
         * assumed.
         */
        unsigned short max_cmd_len;

        /*
         * The members from this_id through virt_boundary_mask share their
         * names with members of struct scsi_host_template (opt_sectors is
         * the exception).  NOTE(review): presumably these are per-host
         * copies of the template defaults - confirm in scsi_host_alloc().
         */
        int this_id;
        int can_queue;
        short cmd_per_lun;
        short unsigned int sg_tablesize;
        short unsigned int sg_prot_tablesize;
        unsigned int max_sectors;
        unsigned int opt_sectors;
        unsigned int max_segment_size;
        unsigned int dma_alignment;
        unsigned long dma_boundary;
        unsigned long virt_boundary_mask;
        /*
         * In scsi-mq mode, the number of hardware queues supported by the LLD.
         *
         * Note: it is assumed that each hardware queue has a queue depth of
         * can_queue. In other words, the total queue depth per host
         * is nr_hw_queues * can_queue. However, for when host_tagset is set,
         * the total queue depth is can_queue.
         */
        unsigned nr_hw_queues;
        unsigned nr_maps;
        unsigned active_mode:2;

        /*
         * Host has requested that no further requests come through for the
         * time being.
         */
        unsigned host_self_blocked:1;

        /*
         * Host uses correct SCSI ordering not PC ordering. The bit is
         * set for the minority of drivers whose authors actually read
         * the spec ;).
         */
        unsigned reverse_ordering:1;

        /* Task mgmt function in progress */
        unsigned tmf_in_progress:1;

        /* Asynchronous scan in progress */
        unsigned async_scan:1;

        /* Don't resume host in EH */
        unsigned eh_noresume:1;

        /* The controller does not support WRITE SAME */
        unsigned no_write_same:1;

        /* True if the host uses host-wide tagspace */
        unsigned host_tagset:1;

        /* The queuecommand callback may block. See also BLK_MQ_F_BLOCKING. */
        unsigned queuecommand_may_block:1;

        /* Host responded with short (<36 bytes) INQUIRY result */
        unsigned short_inquiry:1;

        /* The transport requires the LUN bits NOT to be stored in CDB[1] */
        unsigned no_scsi2_lun_in_cdb:1;

        unsigned no_highmem:1;

        /*
         * Optional work queue to be utilized by the transport
         */
        char work_q_name[20];
        struct workqueue_struct *work_q;

        /*
         * Task management function work queue
         */
        struct workqueue_struct *tmf_work_q;

        /*
         * Value host_blocked counts down from
         */
        unsigned int max_host_blocked;

        /*
         * Protection Information.  prot_capabilities is written by
         * scsi_host_set_prot().
         */
        unsigned int prot_capabilities;
        unsigned char prot_guard_type;

        /* legacy crap */
        unsigned long base;
        unsigned long io_port;
        unsigned char n_io_port;
        unsigned char dma_channel;
        unsigned int  irq;


        enum scsi_host_state shost_state;

        /*
         * ldm bits
         *
         * Two embedded devices: class_to_shost() maps shost_dev back to
         * the host, dev_to_shost() and shost_printk() use shost_gendev.
         */
        struct device                shost_gendev, shost_dev;

        /*
         * Points to the transport data (if any) which is allocated
         * separately
         */
        void *shost_data;

        /*
         * Points to the physical bus device we'd use to do DMA
         * Needed just in case we have virtual hosts.
         */
        struct device *dma_dev;

        /* Delay for runtime autosuspend */
        int rpm_autosuspend_delay;

        /*
         * We should ensure that this is aligned, both for better performance
         * and also because some compilers (m68k) don't automatically force
         * alignment to a long boundary.
         */
        unsigned long hostdata[]  /* Used for storage of host specific stuff */
                __attribute__ ((aligned (sizeof(unsigned long))));
};

/*
 * class_to_shost - map a pointer to the embedded shost_dev member back to
 * the struct Scsi_Host that contains it.
 */
#define                class_to_shost(d)        \
        container_of(d, struct Scsi_Host, shost_dev)

/*
 * shost_printk - dev_printk() wrapper that logs against the host's
 * shost_gendev device.  @prefix, @fmt and the variadic arguments are
 * forwarded to dev_printk() unchanged.
 */
#define shost_printk(prefix, shost, fmt, a...)        \
        dev_printk(prefix, &(shost)->shost_gendev, fmt, ##a)

/*
 * shost_priv - hand back the driver-private area of a host.
 *
 * The private area is the hostdata[] flexible array at the tail of
 * struct Scsi_Host; it is returned as an untyped pointer.
 */
static inline void *shost_priv(struct Scsi_Host *shost)
{
        unsigned long *priv_area = shost->hostdata;

        return priv_area;
}

int scsi_is_host_device(const struct device *);

/*
 * dev_to_shost - find the SCSI host a device hangs off.
 *
 * Climbs the ->parent chain starting at @dev until a device for which
 * scsi_is_host_device() is true is found, and returns the struct
 * Scsi_Host embedding that device as shost_gendev.  Returns NULL when
 * the top of the chain is reached without finding a host device.
 */
static inline struct Scsi_Host *dev_to_shost(struct device *dev)
{
        for (; !scsi_is_host_device(dev); dev = dev->parent) {
                /* Ran out of ancestors: @dev is not below any SCSI host. */
                if (!dev->parent)
                        return NULL;
        }

        return container_of(dev, struct Scsi_Host, shost_gendev);
}

/*
 * scsi_host_in_recovery - is the host currently recovering?
 *
 * Returns 1 when the host is in any of the three *_RECOVERY states or
 * has a task management function in progress, 0 otherwise.
 */
static inline int scsi_host_in_recovery(struct Scsi_Host *shost)
{
        switch (shost->shost_state) {
        case SHOST_RECOVERY:
        case SHOST_CANCEL_RECOVERY:
        case SHOST_DEL_RECOVERY:
                return 1;
        default:
                /* Not a recovery state: only a pending TMF counts. */
                return shost->tmf_in_progress ? 1 : 0;
        }
}

extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *);
extern void scsi_flush_work(struct Scsi_Host *);

extern struct Scsi_Host *scsi_host_alloc(const struct scsi_host_template *, int);
extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *,
                                               struct device *,
                                               struct device *);
#if defined(CONFIG_SCSI_PROC_FS)
struct proc_dir_entry *
scsi_template_proc_dir(const struct scsi_host_template *sht);
#else
#define scsi_template_proc_dir(sht) NULL
#endif
extern void scsi_scan_host(struct Scsi_Host *);
extern int scsi_resume_device(struct scsi_device *sdev);
extern int scsi_rescan_device(struct scsi_device *sdev);
extern void scsi_remove_host(struct Scsi_Host *);
extern struct Scsi_Host *scsi_host_get(struct Scsi_Host *);
extern int scsi_host_busy(struct Scsi_Host *shost);
extern void scsi_host_put(struct Scsi_Host *t);
extern struct Scsi_Host *scsi_host_lookup(unsigned int hostnum);
extern const char *scsi_host_state_name(enum scsi_host_state);
extern void scsi_host_complete_all_commands(struct Scsi_Host *shost,
                                            enum scsi_host_status status);

/*
 * scsi_add_host - scsi_add_host_with_dma() with one device in both slots
 * @host: host to add
 * @dev:  device passed as both device arguments
 *
 * Return: whatever scsi_add_host_with_dma() returns; the result must be
 * checked by the caller (__must_check).
 */
static inline int __must_check scsi_add_host(struct Scsi_Host *host,
                                             struct device *dev)
{
        int ret = scsi_add_host_with_dma(host, dev, dev);

        return ret;
}

static inline struct device *scsi_get_device(struct Scsi_Host *shost)
{
        return shost->shost_gendev.parent;
}

/**
 * scsi_host_scan_allowed - Is scanning of this host allowed
 * @shost:        Pointer to Scsi_Host.
 *
 * Return: 1 when the host state is SHOST_RUNNING or SHOST_RECOVERY,
 * 0 for every other state.
 **/
static inline int scsi_host_scan_allowed(struct Scsi_Host *shost)
{
        switch (shost->shost_state) {
        case SHOST_RUNNING:
        case SHOST_RECOVERY:
                return 1;
        default:
                return 0;
        }
}

/* Request/host blocking entry points; only declared in this header. */
extern void scsi_unblock_requests(struct Scsi_Host *);
extern void scsi_block_requests(struct Scsi_Host *);
extern int scsi_host_block(struct Scsi_Host *shost);
extern int scsi_host_unblock(struct Scsi_Host *shost, int new_state);

/*
 * Iterator taking a callback @fn and an opaque @priv that is handed to
 * it.  NOTE(review): the meaning of @fn's bool result is defined by the
 * implementation, which is not visible here.
 */
void scsi_host_busy_iter(struct Scsi_Host *,
                         bool (*fn)(struct scsi_cmnd *, void *), void *priv);

/* Forward declaration only; the type is defined elsewhere. */
struct class_container;

/*
 * Protection capability bits of a SCSI host.
 *
 * DIF covers the exchange of protection information between the
 * initiator and the SBC block device; DIX covers the exchange between
 * the OS and the initiator.  Each capability occupies its own bit so
 * that several can be OR-ed into one mask.
 */
enum scsi_host_prot_capabilities {
        /* T10 DIF, bits 0-2 */
        SHOST_DIF_TYPE1_PROTECTION = 0x01, /* T10 DIF Type 1 */
        SHOST_DIF_TYPE2_PROTECTION = 0x02, /* T10 DIF Type 2 */
        SHOST_DIF_TYPE3_PROTECTION = 0x04, /* T10 DIF Type 3 */

        /* DIX, bits 3-6 */
        SHOST_DIX_TYPE0_PROTECTION = 0x08, /* DIX between OS and HBA only */
        SHOST_DIX_TYPE1_PROTECTION = 0x10, /* DIX with DIF Type 1 */
        SHOST_DIX_TYPE2_PROTECTION = 0x20, /* DIX with DIF Type 2 */
        SHOST_DIX_TYPE3_PROTECTION = 0x40, /* DIX with DIF Type 3 */
};

/*
 * SCSI hosts which support the Data Integrity Extensions must
 * indicate their capabilities by setting the prot_capabilities using
 * this call.
 *
 * @shost: host to update
 * @mask:  new capability mask; it replaces (is not OR-ed into) the
 *         previous value of shost->prot_capabilities
 */
static inline void scsi_host_set_prot(struct Scsi_Host *shost, unsigned int mask)
{
        shost->prot_capabilities = mask;
}

/*
 * scsi_host_get_prot - read back a host's protection capability mask
 * @shost: host to query
 *
 * Return: the current value of @shost->prot_capabilities.
 */
static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost)
{
        unsigned int mask = shost->prot_capabilities;

        return mask;
}

/*
 * scsi_host_prot_dma - compare the capability mask against the DIX range
 * @shost: host to query
 *
 * Return: 1 when @shost->prot_capabilities is numerically at least
 * SHOST_DIX_TYPE0_PROTECTION, 0 when it is below that value.
 */
static inline int scsi_host_prot_dma(struct Scsi_Host *shost)
{
        if (shost->prot_capabilities < SHOST_DIX_TYPE0_PROTECTION)
                return 0;

        return 1;
}

/*
 * scsi_host_dif_capable - check a host for a given DIF type
 * @shost:       host to query
 * @target_type: DIF type, used as index into the capability table
 *
 * Return: @target_type when the host's prot_capabilities contain the
 * SHOST_DIF_TYPEn_PROTECTION bit that belongs to it; 0 when the bit is
 * missing, when @target_type is 0 (no bit is associated with it) or
 * when @target_type is outside the table.
 */
static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type)
{
        /* Read-only lookup table: DIF type -> capability bit. */
        static const unsigned char cap[] = { 0,
                                             SHOST_DIF_TYPE1_PROTECTION,
                                             SHOST_DIF_TYPE2_PROTECTION,
                                             SHOST_DIF_TYPE3_PROTECTION };

        if (target_type >= ARRAY_SIZE(cap))
                return 0;

        return shost->prot_capabilities & cap[target_type] ? target_type : 0;
}

/*
 * scsi_host_dix_capable - check a host for a given DIX type
 * @shost:       host to query
 * @target_type: DIX type, used as index into the capability table
 *
 * Return: with CONFIG_BLK_DEV_INTEGRITY, the SHOST_DIX_TYPEn_PROTECTION
 * bit for @target_type if the host advertises it (a nonzero mask value,
 * not the type number), otherwise 0.  Out-of-range types yield 0.
 * Without CONFIG_BLK_DEV_INTEGRITY the result is always 0.
 */
static inline unsigned int scsi_host_dix_capable(struct Scsi_Host *shost, unsigned int target_type)
{
#if defined(CONFIG_BLK_DEV_INTEGRITY)
        /* Read-only lookup table: DIX type -> capability bit. */
        static const unsigned char cap[] = { SHOST_DIX_TYPE0_PROTECTION,
                                             SHOST_DIX_TYPE1_PROTECTION,
                                             SHOST_DIX_TYPE2_PROTECTION,
                                             SHOST_DIX_TYPE3_PROTECTION };

        if (target_type >= ARRAY_SIZE(cap))
                return 0;

        return shost->prot_capabilities & cap[target_type];
#else
        return 0;
#endif
}

/*
 * All DIX-capable initiators must support the T10-mandated CRC
 * checksum.  Controllers can optionally implement the IP checksum
 * scheme which has much lower impact on system performance.  Note
 * that the main rationale for the checksum is to match integrity
 * metadata with data.  Detecting bit errors is a job for ECC memory
 * and buses.
 */

/* Guard (checksum) flavours a host can advertise; one bit per flavour. */
enum scsi_host_guard_type {
        SHOST_DIX_GUARD_CRC = 0x1,
        SHOST_DIX_GUARD_IP  = 0x2,
};

/*
 * scsi_host_set_guard - store the guard type(s) of a host
 * @shost: host to update
 * @type:  value written to shost->prot_guard_type, replacing the old one
 *         (presumably a mask of enum scsi_host_guard_type bits -- confirm
 *         against callers)
 */
static inline void scsi_host_set_guard(struct Scsi_Host *shost, unsigned char type)
{
        shost->prot_guard_type = type;
}

/*
 * scsi_host_get_guard - read back the guard type(s) of a host
 * @shost: host to query
 *
 * Return: the current value of @shost->prot_guard_type.
 */
static inline unsigned char scsi_host_get_guard(struct Scsi_Host *shost)
{
        unsigned char type = shost->prot_guard_type;

        return type;
}

/* Takes a host and a target enum scsi_host_state; only declared here. */
extern int scsi_host_set_state(struct Scsi_Host *, enum scsi_host_state);

#endif /* _SCSI_SCSI_HOST_H */
















    2 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TLB_H
#define _ASM_X86_TLB_H

/*
 * Define tlb_flush as a macro expanding to itself and declare the
 * function ahead of the <asm-generic/tlb.h> include; the body follows
 * after that include.
 * NOTE(review): presumably the generic header checks whether tlb_flush
 * is defined to detect this override -- confirm there.
 */
#define tlb_flush tlb_flush
static inline void tlb_flush(struct mmu_gather *tlb);

#include <asm-generic/tlb.h>

/*
 * tlb_flush - flush the range recorded in an mmu_gather
 * @tlb: gather structure describing what was unmapped
 *
 * When either @tlb->fullmm or @tlb->need_flush_all is set the whole
 * range [0, TLB_FLUSH_ALL) is requested; otherwise only
 * [@tlb->start, @tlb->end).  The stride shift comes from
 * tlb_get_unmap_shift() and @tlb->freed_tables is passed through to
 * flush_tlb_mm_range() unchanged.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        unsigned int stride_shift = tlb_get_unmap_shift(tlb);
        unsigned long start, end;

        if (tlb->fullmm || tlb->need_flush_all) {
                start = 0UL;
                end = TLB_FLUSH_ALL;
        } else {
                start = tlb->start;
                end = tlb->end;
        }

        flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
}

/*
 * While x86 architecture in general requires an IPI to perform TLB
 * shootdown, enablement code for several hypervisors overrides
 * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing
 * a hypercall. To keep software pagetable walkers safe in this case we
 * switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the comment
 * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
 * for more details.
 *
 * The function itself only forwards @table to free_page_and_swap_cache().
 */
static inline void __tlb_remove_table(void *table)
{
        free_page_and_swap_cache(table);
}

#endif /* _ASM_X86_TLB_H */









































































   38 


   39 

































   30 


   30 
























   14 


   14 























   13 


   13 





















    4 











































































































    1 
    1 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2022 Christian Brauner <brauner@kernel.org> */

#include <linux/cred.h>
#include <linux/fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>

#include "internal.h"

/*
 * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t,
 * never from raw values. These are just internal helpers.
 *
 * Each macro expands to a compound literal of the respective type whose
 * first member is initialised with @val.
 */
#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val }
#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val }

/*
 * An idmapping attached to a mount: one uid map, one gid map and a
 * reference count.
 */
struct mnt_idmap {
        struct uid_gid_map uid_map;     /* consulted by make_vfsuid()/from_vfsuid() */
        struct uid_gid_map gid_map;     /* consulted by make_vfsgid()/from_vfsgid() */
        refcount_t count;               /* managed by mnt_idmap_get()/mnt_idmap_put() */
};

/*
 * Carries the initial idmapping of 0:0:4294967295 which is an identity
 * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is
 * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
 *
 * Only the reference count is initialised explicitly; the helpers in
 * this file recognise the object by address (idmap == &nop_mnt_idmap)
 * and then neither consult its maps nor touch its reference count.
 */
struct mnt_idmap nop_mnt_idmap = {
        .count        = REFCOUNT_INIT(1),
};
EXPORT_SYMBOL_GPL(nop_mnt_idmap);

/**
 * initial_idmapping - check whether this is the initial mapping
 * @ns: idmapping to check
 *
 * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1,
 * [...], 1000 to 1000 [...].
 *
 * Return: true if this is the initial mapping, false if not.
 */
static inline bool initial_idmapping(const struct user_namespace *ns)
{
        return ns == &init_user_ns;
}

/**
 * make_vfsuid - map a filesystem kuid according to an idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @kuid : kuid to be mapped
 *
 * Take a @kuid and remap it from @fs_userns into @idmap. Use this
 * function when preparing a @kuid to be reported to userspace.
 *
 * If @idmap is &nop_mnt_idmap the mount is not idmapped and @kuid is
 * returned unchanged (merely converted to a vfsuid_t).
 * If initial_idmapping() tells us that the filesystem is not mounted with
 * an idmapping the raw value is taken via __kuid_val() directly instead
 * of going through from_kuid().
 *
 * Return: @kuid mapped according to @idmap.
 * If @kuid has no mapping in @fs_userns INVALID_VFSUID is returned;
 * otherwise the result is whatever map_id_down() yields for @idmap.
 */
vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
                     struct user_namespace *fs_userns,
                     kuid_t kuid)
{
        uid_t fs_uid, mapped;

        /* Not an idmapped mount: nothing to translate. */
        if (idmap == &nop_mnt_idmap)
                return VFSUIDT_INIT(kuid);

        fs_uid = initial_idmapping(fs_userns) ? __kuid_val(kuid)
                                              : from_kuid(fs_userns, kuid);
        if (fs_uid == (uid_t)-1)
                return INVALID_VFSUID;

        mapped = map_id_down(&idmap->uid_map, fs_uid);
        return VFSUIDT_INIT_RAW(mapped);
}
EXPORT_SYMBOL_GPL(make_vfsuid);

/**
 * make_vfsgid - map a filesystem kgid according to an idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @kgid : kgid to be mapped
 *
 * Take a @kgid and remap it from @fs_userns into @idmap. Use this
 * function when preparing a @kgid to be reported to userspace.
 *
 * If @idmap is &nop_mnt_idmap the mount is not idmapped and @kgid is
 * returned unchanged (merely converted to a vfsgid_t).
 * If initial_idmapping() tells us that the filesystem is not mounted with
 * an idmapping the raw value is taken via __kgid_val() directly instead
 * of going through from_kgid().
 *
 * Return: @kgid mapped according to @idmap.
 * If @kgid has no mapping in @fs_userns INVALID_VFSGID is returned;
 * otherwise the result is whatever map_id_down() yields for @idmap.
 */
vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
                     struct user_namespace *fs_userns, kgid_t kgid)
{
        gid_t fs_gid, mapped;

        /* Not an idmapped mount: nothing to translate. */
        if (idmap == &nop_mnt_idmap)
                return VFSGIDT_INIT(kgid);

        fs_gid = initial_idmapping(fs_userns) ? __kgid_val(kgid)
                                              : from_kgid(fs_userns, kgid);
        if (fs_gid == (gid_t)-1)
                return INVALID_VFSGID;

        mapped = map_id_down(&idmap->gid_map, fs_gid);
        return VFSGIDT_INIT_RAW(mapped);
}
EXPORT_SYMBOL_GPL(make_vfsgid);

/**
 * from_vfsuid - map a vfsuid into the filesystem idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @vfsuid : vfsuid to be mapped
 *
 * Map @vfsuid into the filesystem idmapping. This function has to be used in
 * order to e.g. write @vfsuid to inode->i_uid.
 *
 * For &nop_mnt_idmap the value is only converted to a kuid_t. Otherwise
 * it is first mapped up through @idmap and then turned into a kuid_t,
 * directly for the initial filesystem idmapping or via make_kuid().
 *
 * Return: @vfsuid mapped into the filesystem idmapping, or INVALID_UID
 * if @idmap has no mapping for it.
 */
kuid_t from_vfsuid(struct mnt_idmap *idmap,
                   struct user_namespace *fs_userns, vfsuid_t vfsuid)
{
        uid_t raw;

        if (idmap == &nop_mnt_idmap)
                return AS_KUIDT(vfsuid);

        raw = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid));
        if (raw == (uid_t)-1)
                return INVALID_UID;

        if (!initial_idmapping(fs_userns))
                return make_kuid(fs_userns, raw);

        return KUIDT_INIT(raw);
}
EXPORT_SYMBOL_GPL(from_vfsuid);

/**
 * from_vfsgid - map a vfsgid into the filesystem idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @vfsgid : vfsgid to be mapped
 *
 * Map @vfsgid into the filesystem idmapping. This function has to be used in
 * order to e.g. write @vfsgid to inode->i_gid.
 *
 * For &nop_mnt_idmap the value is only converted to a kgid_t. Otherwise
 * it is first mapped up through @idmap and then turned into a kgid_t,
 * directly for the initial filesystem idmapping or via make_kgid().
 *
 * Return: @vfsgid mapped into the filesystem idmapping, or INVALID_GID
 * if @idmap has no mapping for it.
 */
kgid_t from_vfsgid(struct mnt_idmap *idmap,
                   struct user_namespace *fs_userns, vfsgid_t vfsgid)
{
        gid_t raw;

        if (idmap == &nop_mnt_idmap)
                return AS_KGIDT(vfsgid);

        raw = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid));
        if (raw == (gid_t)-1)
                return INVALID_GID;

        if (!initial_idmapping(fs_userns))
                return make_kgid(fs_userns, raw);

        return KGIDT_INIT(raw);
}
EXPORT_SYMBOL_GPL(from_vfsgid);

/**
 * vfsgid_in_group_p() - check whether a vfsgid matches the caller's groups
 * @vfsgid: the mnt gid to match
 *
 * This function can be used to determine whether @vfsgid matches any of the
 * caller's groups. Without CONFIG_MULTIUSER every vfsgid is reported as
 * matching.
 *
 * Return: 1 if vfsgid matches caller's groups, 0 if not.
 */
int vfsgid_in_group_p(vfsgid_t vfsgid)
{
#ifdef CONFIG_MULTIUSER
        return in_group_p(AS_KGIDT(vfsgid));
#else
        return 1;
#endif
}
EXPORT_SYMBOL_GPL(vfsgid_in_group_p);

/*
 * copy_mnt_idmap - duplicate one uid/gid map
 * @map_from: map to copy from
 * @map_to:   map to fill in
 *
 * Maps with at most UID_GID_MAP_MAX_BASE_EXTENTS extents are copied by
 * plain structure assignment; larger maps get freshly allocated copies
 * of their forward and reverse extent arrays.
 *
 * Return: 0 on success, -EINVAL if @map_from has no extents, -ENOMEM if
 * an extent array could not be duplicated (in which case @map_to is left
 * untouched).
 */
static int copy_mnt_idmap(struct uid_gid_map *map_from,
                          struct uid_gid_map *map_to)
{
        struct uid_gid_extent *forward, *reverse;
        u32 nr_extents = READ_ONCE(map_from->nr_extents);
        /* Pairs with smp_wmb() when writing the idmapping. */
        smp_rmb();

        /*
         * Don't blindly copy @map_from into @map_to when no extents
         * have been observed (nr_extents == 0). After we read
         * @nr_extents someone could have written an idmapping and
         * then we might end up with inconsistent data. So just don't do
         * anything at all.
         */
        if (nr_extents == 0)
                return -EINVAL;

        /*
         * Here we know that nr_extents is greater than zero which means
         * a map has been written. Since idmappings can't be changed
         * once they have been written we know that we can safely copy
         * from @map_from into @map_to.
         */

        if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
                *map_to = *map_from;
                return 0;
        }

        /* Large map: both extent arrays need their own copy. */
        forward = kmemdup(map_from->forward,
                          nr_extents * sizeof(struct uid_gid_extent),
                          GFP_KERNEL_ACCOUNT);
        if (!forward)
                return -ENOMEM;

        reverse = kmemdup(map_from->reverse,
                          nr_extents * sizeof(struct uid_gid_extent),
                          GFP_KERNEL_ACCOUNT);
        if (!reverse) {
                kfree(forward);
                return -ENOMEM;
        }

        /*
         * The idmapping isn't exposed anywhere so we don't need to care
         * about ordering between extent pointers and @nr_extents
         * initialization.
         */
        map_to->forward = forward;
        map_to->reverse = reverse;
        map_to->nr_extents = nr_extents;
        return 0;
}

static void free_mnt_idmap(struct mnt_idmap *idmap)
{
        if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                kfree(idmap->uid_map.forward);
                kfree(idmap->uid_map.reverse);
        }
        if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                kfree(idmap->gid_map.forward);
                kfree(idmap->gid_map.reverse);
        }
        kfree(idmap);
}

/*
 * alloc_mnt_idmap - build a mount idmapping from a user namespace
 * @mnt_userns: namespace whose uid and gid maps are copied
 *
 * The new idmap starts with a reference count of 1.
 *
 * Return: the new idmap, or an ERR_PTR() carrying -ENOMEM if an
 * allocation failed or the error reported by copy_mnt_idmap().
 */
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
{
        struct mnt_idmap *idmap;
        int err;

        idmap = kzalloc(sizeof(*idmap), GFP_KERNEL_ACCOUNT);
        if (!idmap)
                return ERR_PTR(-ENOMEM);

        refcount_set(&idmap->count, 1);

        err = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map);
        if (err)
                goto out_free;

        err = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map);
        if (err)
                goto out_free;

        return idmap;

out_free:
        /* Also releases whatever the uid copy may already have allocated. */
        free_mnt_idmap(idmap);
        return ERR_PTR(err);
}

/**
 * mnt_idmap_get - get a reference to an idmapping
 * @idmap: the idmap to bump the reference on
 *
 * If @idmap is not the @nop_mnt_idmap bump the reference count.
 *
 * Return: @idmap, with its reference count bumped unless @nop_mnt_idmap
 * was passed.
 */
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
{
        /* The nop idmap is never reference counted. */
        if (idmap == &nop_mnt_idmap)
                return idmap;

        refcount_inc(&idmap->count);
        return idmap;
}
EXPORT_SYMBOL_GPL(mnt_idmap_get);

/**
 * mnt_idmap_put - put a reference to an idmapping
 * @idmap: the idmap to put the reference on
 *
 * If this is a non-initial idmapping, drop one reference and free the
 * idmap when that was the last one. @nop_mnt_idmap is left alone.
 */
void mnt_idmap_put(struct mnt_idmap *idmap)
{
        if (idmap == &nop_mnt_idmap)
                return;

        if (refcount_dec_and_test(&idmap->count))
                free_mnt_idmap(idmap);
}
EXPORT_SYMBOL_GPL(mnt_idmap_put);































































































































































































































































































































































































































































































































































































































































































































































































































   17 






   14 







































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

/* Set to 1 by the "numa=off" early parameter, see numa_setup(). */
int numa_off;
/* Node mask that numa_alloc_distance() uses as its starting set. */
nodemask_t numa_nodes_parsed __initdata;

/* Per-node data pointers indexed by node id; filled by alloc_node_data(). */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

/* Default block table that numa_add_memblk() appends to. */
static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
/* Collects blocks that numa_cleanup_meminfo() moves out or trims off. */
static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;

/*
 * Distance table of numa_distance_cnt * numa_distance_cnt entries,
 * stored as one flat array.
 */
static int numa_distance_cnt;
static u8 *numa_distance;

/*
 * Handler for the "numa" early parameter.
 *
 * Recognised option prefixes:
 *   "off"    - set numa_off
 *   "fake="  - pass the rest of the string to numa_emu_cmdline() and
 *              return its result
 *   "noacpi" - call disable_srat()
 *   "nohmat" - call disable_hmat()
 *
 * Only the leading characters are compared, and the prefixes cannot
 * match simultaneously.  Anything else is silently accepted.
 *
 * Returns -EINVAL for a NULL option, otherwise 0 (or the result of
 * numa_emu_cmdline() for "fake=").
 */
static __init int numa_setup(char *opt)
{
        if (opt == NULL)
                return -EINVAL;

        if (strncmp(opt, "off", 3) == 0)
                numa_off = 1;
        else if (strncmp(opt, "fake=", 5) == 0)
                return numa_emu_cmdline(opt + 5);
        else if (strncmp(opt, "noacpi", 6) == 0)
                disable_srat();
        else if (strncmp(opt, "nohmat", 6) == 0)
                disable_hmat();

        return 0;
}
early_param("numa", numa_setup);

/*
 * apicid, cpu, node mappings
 *
 * Indexed by APIC id; every slot starts out as NUMA_NO_NODE (set with a
 * range designator covering the whole array).
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

/*
 * Look up the node of @cpu through its APIC id.
 *
 * Returns the __apicid_to_node[] entry for the CPU's APIC id, or
 * NUMA_NO_NODE when the CPU's APIC id is BAD_APICID.
 */
int numa_cpu_node(int cpu)
{
        u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

        if (apicid == BAD_APICID)
                return NUMA_NO_NODE;

        return __apicid_to_node[apicid];
}

/* One cpumask per node; the masks are allocated by setup_node_to_cpumask_map(). */
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 * (declared with NUMA_NO_NODE as initial value; written by numa_set_node())
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

void numa_set_node(int cpu, int node)
{
        int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

        /* early setting, no percpu area yet */
        if (cpu_to_node_map) {
                cpu_to_node_map[cpu] = node;
                return;
        }

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
        if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
                printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
                dump_stack();
                return;
        }
#endif
        per_cpu(x86_cpu_to_node_map, cpu) = node;

        set_cpu_numa_node(cpu, node);
}

/* Reset @cpu's node assignment to NUMA_NO_NODE via numa_set_node(). */
void numa_clear_node(int cpu)
{
        numa_set_node(cpu, NUMA_NO_NODE);
}

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * nr_node_ids is set up first if it still equals MAX_NUMNODES; then one
 * boot-time cpumask is allocated for each node id below nr_node_ids.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
        unsigned int nid = 0;

        /* setup nr_node_ids if not done yet */
        if (nr_node_ids == MAX_NUMNODES)
                setup_nr_node_ids();

        /* allocate the map */
        while (nid < nr_node_ids) {
                alloc_bootmem_cpumask_var(&node_to_cpumask_map[nid]);
                nid++;
        }

        /* cpumask_of_node() will now work */
        pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

/*
 * Append the block [@start, @end) for node @nid to @mi.
 *
 * Zero-length blocks are ignored quietly.  Blocks with @start > @end or
 * a node id outside [0, MAX_NUMNODES) are ignored with a warning.  Both
 * cases still return 0.
 *
 * Returns 0 on success (or when the block was ignored) and -EINVAL when
 * @mi already holds NR_NODE_MEMBLKS blocks.
 */
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
                                     struct numa_meminfo *mi)
{
        struct numa_memblk *slot;

        /* ignore zero length blks */
        if (start == end)
                return 0;

        /* whine about and ignore invalid blks */
        if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
                pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
                        nid, start, end - 1);
                return 0;
        }

        if (mi->nr_blks >= NR_NODE_MEMBLKS) {
                pr_err("too many memblk ranges\n");
                return -EINVAL;
        }

        /* Claim the next free slot and fill it in. */
        slot = &mi->blk[mi->nr_blks++];
        slot->start = start;
        slot->end = end;
        slot->nid = nid;
        return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Drop the @idx'th numa_memblk of @mi: @mi->nr_blks is decremented and
 * all entries behind @idx are shifted down by one slot.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
        struct numa_memblk *hole = &mi->blk[idx];

        mi->nr_blks--;
        /* Regions overlap, hence memmove() rather than memcpy(). */
        memmove(hole, hole + 1, (mi->nr_blks - idx) * sizeof(*hole));
}

/**
 * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
 * @dst: numa_meminfo to append block to
 * @idx: Index of memblk to remove
 * @src: numa_meminfo to remove memblk from
 *
 * The block is copied to the end of @dst first and removed from @src
 * afterwards.  NOTE(review): @dst->nr_blks is not checked against the
 * capacity of @dst->blk[] here -- callers must guarantee room.
 */
static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
                                         struct numa_meminfo *src)
{
        dst->blk[dst->nr_blks] = src->blk[idx];
        dst->nr_blks++;

        numa_remove_memblk_from(idx, src);
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Thin front end of numa_add_memblk_to() that always targets the
 * file-local default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
        struct numa_meminfo *mi = &numa_meminfo;

        return numa_add_memblk_to(nid, start, end, mi);
}

/*
 * Allocate NODE_DATA for a node on the local memory
 *
 * A page-rounded pg_data_t is requested from memblock with @nid as the
 * preferred node.  On success the area is published in node_data[@nid],
 * its pg_data_t part is zeroed and the node is marked online.  On
 * failure an error is logged and nothing else happens.
 */
static void __init alloc_node_data(int nid)
{
        const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
        u64 phys;
        int home_nid;

        /*
         * Allocate node data.  Try node-local memory and then any node.
         * Never allocate in DMA zone.
         */
        phys = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
        if (!phys) {
                pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
                       nd_size, nid);
                return;
        }

        /* report where the data ended up */
        printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
               phys, phys + nd_size - 1);

        home_nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
        if (home_nid != nid)
                printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, home_nid);

        /* publish and initialize */
        node_data[nid] = __va(phys);
        memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

        node_set_online(nid);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
 * conflicts and clear unused memblks.
 *
 * The work is done in three passes: (1) move blocks that do not overlap
 * memblock.memory to numa_reserved_meminfo and clamp the others to
 * [0, PFN_PHYS(max_pfn)), (2) detect overlaps and merge blocks of the
 * same node, (3) reset the unused tail of @mi->blk[].
 *
 * RETURNS:
 * 0 on success, -errno on failure (-EINVAL when blocks of different
 * nodes overlap).
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
        const u64 low = 0;
        const u64 high = PFN_PHYS(max_pfn);
        int i, j, k;

        /* first, trim all entries */
        for (i = 0; i < mi->nr_blks; i++) {
                struct numa_memblk *bi = &mi->blk[i];

                /* move / save reserved memory ranges */
                if (!memblock_overlaps_region(&memblock.memory,
                                        bi->start, bi->end - bi->start)) {
                        /* i-- so the entry shifted into slot i is visited next */
                        numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
                        continue;
                }

                /* make sure all non-reserved blocks are inside the limits */
                bi->start = max(bi->start, low);

                /* preserve info for non-RAM areas above 'max_pfn': */
                if (bi->end > high) {
                        /* the return value is deliberately not checked here */
                        numa_add_memblk_to(bi->nid, high, bi->end,
                                           &numa_reserved_meminfo);
                        bi->end = high;
                }

                /* and there's no empty block */
                if (bi->start >= bi->end)
                        numa_remove_memblk_from(i--, mi);
        }

        /* merge neighboring / overlapping entries */
        for (i = 0; i < mi->nr_blks; i++) {
                struct numa_memblk *bi = &mi->blk[i];

                for (j = i + 1; j < mi->nr_blks; j++) {
                        struct numa_memblk *bj = &mi->blk[j];
                        u64 start, end;

                        /*
                         * See whether there are overlapping blocks.  Whine
                         * about but allow overlaps of the same nid.  They
                         * will be merged below.
                         */
                        if (bi->end > bj->start && bi->start < bj->end) {
                                if (bi->nid != bj->nid) {
                                        pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
                                               bi->nid, bi->start, bi->end - 1,
                                               bj->nid, bj->start, bj->end - 1);
                                        return -EINVAL;
                                }
                                pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
                                        bi->nid, bi->start, bi->end - 1,
                                        bj->start, bj->end - 1);
                        }

                        /*
                         * Join together blocks on the same node, holes
                         * between which don't overlap with memory on other
                         * nodes.
                         */
                        if (bi->nid != bj->nid)
                                continue;
                        /* candidate span covering both blocks */
                        start = min(bi->start, bj->start);
                        end = max(bi->end, bj->end);
                        /* look for a block of another node inside that span */
                        for (k = 0; k < mi->nr_blks; k++) {
                                struct numa_memblk *bk = &mi->blk[k];

                                if (bi->nid == bk->nid)
                                        continue;
                                if (start < bk->end && end > bk->start)
                                        break;
                        }
                        /* loop ended early: a foreign block is in the way */
                        if (k < mi->nr_blks)
                                continue;
                        printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
                               bi->nid, bi->start, bi->end - 1, bj->start,
                               bj->end - 1, start, end - 1);
                        bi->start = start;
                        bi->end = end;
                        /* j-- so the entry shifted into slot j is examined next */
                        numa_remove_memblk_from(j--, mi);
                }
        }

        /* clear unused ones */
        for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
                mi->blk[i].start = mi->blk[i].end = 0;
                mi->blk[i].nid = NUMA_NO_NODE;
        }

        return 0;
}

/*
 * Set nodes, which have memory in @mi, in *@nodemask.
 *
 * Every slot of @mi->blk[] is inspected, not just the first nr_blks;
 * slots that are empty (start == end) or carry NUMA_NO_NODE are skipped.
 * Bits already set in *@nodemask are left alone.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
                                              const struct numa_meminfo *mi)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mi->blk); i++) {
                const struct numa_memblk *blk = &mi->blk[i];

                if (blk->start == blk->end)
                        continue;
                if (blk->nid == NUMA_NO_NODE)
                        continue;

                node_set(blk->nid, *nodemask);
        }
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * Frees the current table, if there is one, and clears the table pointer
 * so that the next numa_set_distance() call allocates a fresh table.
 */
void __init numa_reset_distance(void)
{
        /*
         * numa_distance may hold the 1LU allocation-failure marker rather
         * than a real table, so the count (not the pointer) decides
         * whether there is anything to free.
         */
        if (numa_distance_cnt) {
                size_t bytes = numa_distance_cnt * numa_distance_cnt *
                               sizeof(numa_distance[0]);

                memblock_free(numa_distance, bytes);
        }

        numa_distance = NULL;        /* enable table creation */
        numa_distance_cnt = 0;
}

/*
 * Allocate the NUMA distance table, sized for the highest node id known at
 * this point, and fill it with default distances.
 *
 * Returns 0 on success.  If the allocation fails, -ENOMEM is returned and
 * numa_distance is set to a non-NULL marker so numa_set_distance() does
 * not attempt another allocation until numa_reset_distance() is called.
 */
static int __init numa_alloc_distance(void)
{
        nodemask_t known_nodes;
        size_t bytes;
        int nid, row, col, nr = 0;
        u64 pa;

        /* Known nodes: the parsed set plus any node owning a memory block. */
        known_nodes = numa_nodes_parsed;
        numa_nodemask_from_meminfo(&known_nodes, &numa_meminfo);

        /* The table is indexed by node id, so it needs highest id + 1 rows. */
        for_each_node_mask(nid, known_nodes)
                nr = nid;
        nr++;
        bytes = nr * nr * sizeof(numa_distance[0]);

        pa = memblock_phys_alloc_range(bytes, PAGE_SIZE, 0,
                                       PFN_PHYS(max_pfn_mapped));
        if (!pa) {
                pr_warn("Warning: can't allocate distance table!\n");
                /* don't retry until explicitly reset */
                numa_distance = (void *)1LU;
                return -ENOMEM;
        }

        numa_distance = __va(pa);
        numa_distance_cnt = nr;

        /* Defaults: local on the diagonal, remote everywhere else. */
        for (row = 0; row < nr; row++) {
                for (col = 0; col < nr; col++) {
                        if (row == col)
                                numa_distance[row * nr + col] = LOCAL_DISTANCE;
                        else
                                numa_distance[row * nr + col] = REMOTE_DISTANCE;
                }
        }
        printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", nr);

        return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA to another
 * @from: the 'from' node to set distance
 * @to: the 'to'  node to set distance
 * @distance: NUMA distance
 *
 * Record @distance as the distance from node @from to node @to.  When no
 * distance table exists yet, one large enough for all currently known
 * nodes is allocated first.
 *
 * If that allocation fails, a warning is printed and later calls have no
 * effect until the table is reset with numa_reset_distance().
 *
 * The call is also ignored (with a one-time warning) when @from or @to is
 * negative or beyond the table size fixed at creation time, when
 * @distance does not fit in a u8, or when @from == @to and @distance is
 * not LOCAL_DISTANCE.  This lets specific NUMA config implementations
 * stay simple.
 */
void __init numa_set_distance(int from, int to, int distance)
{
        if (!numa_distance && numa_alloc_distance() < 0)
                return;

        if (from < 0 || to < 0 ||
            from >= numa_distance_cnt || to >= numa_distance_cnt) {
                pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
                             from, to, distance);
                return;
        }

        /* The value must survive a round trip through u8. */
        if ((u8)distance != distance)
                goto bad_distance;
        /* A node's distance to itself can only be the local distance. */
        if (from == to && distance != LOCAL_DISTANCE)
                goto bad_distance;

        numa_distance[from * numa_distance_cnt + to] = distance;
        return;

bad_distance:
        pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
                     from, to, distance);
}

/*
 * Look up the distance from node @from to node @to.
 *
 * Node ids that fall outside the distance table -- negative ids such as
 * NUMA_NO_NODE, or ids at or above numa_distance_cnt -- are never used as
 * an index.  For those, LOCAL_DISTANCE is returned when @from == @to and
 * REMOTE_DISTANCE otherwise.  The same range is enforced on the write
 * side by numa_set_distance().
 */
int __node_distance(int from, int to)
{
        if (from < 0 || to < 0 ||
            from >= numa_distance_cnt || to >= numa_distance_cnt)
                return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
        return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

/*
 * Clear the MEMBLOCK_HOTPLUG flag on every numa_meminfo block whose node
 * also contains memblock-reserved memory (which covers the kernel's own
 * memory ranges).
 */
static void __init numa_clear_kernel_node_hotplug(void)
{
        nodemask_t reserved_nodemask = NODE_MASK_NONE;
        struct memblock_region *region;
        int idx;

        /*
         * Step 1: give the reserved regions node ids.
         *
         * At this time, all memory regions reserved by memblock are
         * used by the kernel, but those regions are not split up
         * along node boundaries yet, and don't necessarily have their
         * node ID set yet either.
         *
         * Walking all memory known to the x86 architecture and setting
         * the nid of the matching ranges in memblock.reserved splits
         * the reserved regions along node boundaries and sets their
         * node IDs as well.
         */
        for (idx = 0; idx < numa_meminfo.nr_blks; idx++) {
                struct numa_memblk *blk = &numa_meminfo.blk[idx];
                int ret;

                ret = memblock_set_node(blk->start, blk->end - blk->start,
                                        &memblock.reserved, blk->nid);
                WARN_ON_ONCE(ret);
        }

        /*
         * Step 2: build a node mask of all kernel reserved memory areas
         * from the reserved regions.
         *
         * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
         *   numa_meminfo might not include all memblock.reserved
         *   memory ranges, because quirks such as trim_snb_memory()
         *   reserve specific pages for Sandy Bridge graphics. ]
         */
        for_each_reserved_mem_region(region) {
                int nid = memblock_get_region_node(region);

                if (nid == NUMA_NO_NODE)
                        continue;

                node_set(nid, reserved_nodemask);
        }

        /*
         * Step 3: clear the MEMBLOCK_HOTPLUG flag for all memory
         * belonging to the reserved node mask.
         *
         * This covers every block of a node that contains kernel
         * memory, so the flag is dropped for such nodes as a whole.
         */
        for (idx = 0; idx < numa_meminfo.nr_blks; idx++) {
                struct numa_memblk *blk = &numa_meminfo.blk[idx];

                if (node_isset(blk->nid, reserved_nodemask))
                        memblock_clear_hotplug(blk->start,
                                               blk->end - blk->start);
        }
}

/*
 * Publish the blocks of @mi to memblock and set up the possible nodes.
 *
 * Returns 0 on success, or -EINVAL when no node is known, when the node
 * alignment is finer than a section (only checked if
 * NODE_NOT_IN_PAGE_FLAGS is enabled), or when
 * memblock_validate_numa_coverage() rejects the layout.
 */
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
        int idx, nid;

        /* Account for nodes with cpus and no memory */
        node_possible_map = numa_nodes_parsed;
        numa_nodemask_from_meminfo(&node_possible_map, mi);
        if (WARN_ON(nodes_empty(node_possible_map)))
                return -EINVAL;

        /* Tag memblock.memory with the node id of each block. */
        for (idx = 0; idx < mi->nr_blks; idx++) {
                struct numa_memblk *blk = &mi->blk[idx];

                memblock_set_node(blk->start, blk->end - blk->start,
                                  &memblock.memory, blk->nid);
        }

        /*
         * Very early on, the kernel has to use some memory, for example
         * to load the kernel image. This cannot be prevented, so any
         * node the kernel resides in should be un-hotpluggable.
         *
         * And when we come here, alloc node data won't fail.
         */
        numa_clear_kernel_node_hotplug();

        /*
         * If the sections array is going to be used for the pfn -> nid
         * mapping, check whether its granularity is fine enough.
         */
        if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
                unsigned long align = node_map_pfn_alignment();

                if (align != 0 && align < PAGES_PER_SECTION) {
                        pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
                                PFN_PHYS(align) >> 20,
                                PFN_PHYS(PAGES_PER_SECTION) >> 20);
                        return -EINVAL;
                }
        }

        if (!memblock_validate_numa_coverage(SZ_1M))
                return -EINVAL;

        /* Finally register nodes: allocate node data for nodes with memory. */
        for_each_node_mask(nid, node_possible_map) {
                u64 lowest = PFN_PHYS(max_pfn);
                u64 highest = 0;

                /* Compute the span covered by this node's blocks. */
                for (idx = 0; idx < mi->nr_blks; idx++) {
                        struct numa_memblk *blk = &mi->blk[idx];

                        if (blk->nid == nid) {
                                lowest = min(blk->start, lowest);
                                highest = max(blk->end, highest);
                        }
                }

                /* An empty span means the node has no memory here. */
                if (lowest < highest)
                        alloc_node_data(nid);
        }

        /* Dump memblock with node info and return. */
        memblock_dump_all();
        return 0;
}

/*
 * Some poorly designed mainboards only connect memory to a single CPU,
 * which breaks the 1:1 cpu->node mapping. To cope with that, every
 * possible CPU id that has no node yet is given one here; the number of
 * CPUs is not known at this point, so all ids below nr_cpu_ids are
 * walked. Nodes are handed out round robin from the online nodes.
 */
static void __init numa_init_array(void)
{
        int node = first_node(node_online_map);
        int cpu;

        for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
                /* Only CPUs without a node consume a round-robin slot. */
                if (early_cpu_to_node(cpu) == NUMA_NO_NODE) {
                        numa_set_node(cpu, node);
                        node = next_node_in(node, node_online_map);
                }
        }
}

/*
 * Run one NUMA initialization method.
 *
 * All NUMA state touched here is reset first, then @init_func is called
 * to discover the configuration, and the result is cleaned up and
 * registered.  Returns 0 on success or the negative error of the first
 * step that failed.
 */
static int __init numa_init(int (*init_func)(void))
{
        int idx;
        int err;

        /* Start from a clean slate so a previous failed attempt leaves no trace. */
        for (idx = 0; idx < MAX_LOCAL_APIC; idx++)
                set_apicid_to_node(idx, NUMA_NO_NODE);

        nodes_clear(numa_nodes_parsed);
        nodes_clear(node_possible_map);
        nodes_clear(node_online_map);
        memset(&numa_meminfo, 0, sizeof(numa_meminfo));
        WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
                                  NUMA_NO_NODE));
        WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
                                  NUMA_NO_NODE));
        /* In case that parsing SRAT failed. */
        WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
        numa_reset_distance();

        err = init_func();
        if (err < 0)
                return err;

        /*
         * Reset memblock back to the top-down direction here: if
         * ACPI_NUMA is configured, SRAT has been parsed in init_func().
         * The reset is harmless even if ACPI_NUMA isn't configured, or
         * if acpi numa init fails and falls back to dummy numa init.
         */
        memblock_set_bottom_up(false);

        err = numa_cleanup_meminfo(&numa_meminfo);
        if (err < 0)
                return err;

        numa_emulation(&numa_meminfo, numa_distance_cnt);

        err = numa_register_memblks(&numa_meminfo);
        if (err < 0)
                return err;

        /* Drop CPU -> node assignments that point at a node that is not online. */
        for (idx = 0; idx < nr_cpu_ids; idx++) {
                int nid = early_cpu_to_node(idx);

                if (nid != NUMA_NO_NODE && !node_online(nid))
                        numa_clear_node(idx);
        }
        numa_init_array();

        return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used when there is no underlying NUMA architecture, when NUMA
 * initialization fails, or when NUMA is disabled on the command line.
 *
 * Marks node 0 as parsed and adds one memory block on it spanning
 * [0, PFN_PHYS(max_pfn)).  Must online at least one node and cover all
 * allowed memory.  This function must not fail.
 */
static int __init dummy_numa_init(void)
{
        const char *reason = numa_off ? "NUMA turned off"
                                      : "No NUMA configuration found";

        printk(KERN_INFO "%s\n", reason);
        printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
               0LLU, PFN_PHYS(max_pfn) - 1);

        node_set(0, numa_nodes_parsed);
        numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

        return 0;
}

/**
 * x86_numa_init - Initialize NUMA
 *
 * Walk through the configured NUMA initialization methods in order and
 * stop at the first one that succeeds.  If NUMA is turned off, or no
 * method succeeds, fall back to the dummy single node config that
 * encompasses the whole memory and never fails.
 */
void __init x86_numa_init(void)
{
        if (numa_off)
                goto fallback;

#ifdef CONFIG_ACPI_NUMA
        if (numa_init(x86_acpi_numa_init) == 0)
                return;
#endif
#ifdef CONFIG_AMD_NUMA
        if (numa_init(amd_numa_init) == 0)
                return;
#endif
        /* The OF method is only tried when ACPI is disabled. */
        if (acpi_disabled && numa_init(of_numa_init) == 0)
                return;

fallback:
        numa_init(dummy_numa_init);
}


/*
 * A node may exist which has one or more Generic Initiators but no CPUs
 * and no memory.  Bring every such node online here.
 *
 * This function must be called after init_cpu_to_node(), to ensure that
 * any memoryless CPU nodes have already been brought online, and before
 * the node_data[nid] is needed for zone list setup in
 * build_all_zonelists().
 *
 * Nodes that contain memory and/or CPUs are already online by the time
 * this runs, so nothing extra is done for them, even if they also
 * contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
        int nid;

        /*
         * Onlining the node here excludes it from
         * bringup_nonboot_cpus
         *  cpu_up
         *   __try_online_node
         *    register_one_node
         * because node_subsys is not initialized yet.
         * TODO remove dependency on node_online
         */
        for_each_node_state(nid, N_GENERIC_INITIATOR) {
                if (node_online(nid))
                        continue;

                node_set_online(nid);
        }
}

/*
 * Setup early cpu_to_node.
 *
 * cpu_to_node[] is populated only for CPUs that have valid entries in the
 * x86_cpu_to_apicid[] and apicid_to_node[] tables.  For NUMA emulation
 * and the faked-node case (a NUMA kernel running on a non NUMA box) the
 * initialisation is therefore skipped, which is OK: cpu_to_node[] has
 * already been filled in round robin by numa_init_array() before this
 * call, and that is good enough for the fake NUMA cases.
 *
 * Called before the per_cpu areas are setup.
 */
void __init init_cpu_to_node(void)
{
        int cpu;
        u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

        BUG_ON(cpu_to_apicid == NULL);

        for_each_possible_cpu(cpu) {
                int nid = numa_cpu_node(cpu);

                if (nid != NUMA_NO_NODE) {
                        /*
                         * Onlining the node here excludes it from
                         * bringup_nonboot_cpus
                         *  cpu_up
                         *   __try_online_node
                         *    register_one_node
                         * because node_subsys is not initialized yet.
                         * TODO remove dependency on node_online
                         */
                        if (!node_online(nid))
                                node_set_online(nid);

                        numa_set_node(cpu, nid);
                }
        }
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
/* Set @cpu in the cpumask of the node that early_cpu_to_node() reports for it. */
void numa_add_cpu(int cpu)
{
        int nid = early_cpu_to_node(cpu);

        cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

/* Clear @cpu from the cpumask of the node that early_cpu_to_node() reports for it. */
void numa_remove_cpu(int cpu)
{
        int nid = early_cpu_to_node(cpu);

        cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
}
# endif        /* !CONFIG_NUMA_EMU */

#else        /* !CONFIG_DEBUG_PER_CPU_MAPS */

/*
 * Debug variant of cpu_to_node(): returns the node of @cpu and complains,
 * with a stack dump, when it is called while the early per-cpu map
 * pointer is still set.
 */
int __cpu_to_node(int cpu)
{
        /* Normal case: the early map is gone, read the per-cpu variable. */
        if (!early_per_cpu_ptr(x86_cpu_to_node_map))
                return per_cpu(x86_cpu_to_node_map, cpu);

        printk(KERN_WARNING
                "cpu_to_node(%d): usage too early!\n", cpu);
        dump_stack();
        return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same job as cpu_to_node(), but usable before the per_cpu areas are
 * setup: the early map is consulted for as long as it exists.
 *
 * Once the early map is gone, a @cpu that is not possible gets a warning
 * with a stack dump and NUMA_NO_NODE.
 */
int early_cpu_to_node(int cpu)
{
        if (early_per_cpu_ptr(x86_cpu_to_node_map))
                return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

        if (cpu_possible(cpu))
                return per_cpu(x86_cpu_to_node_map, cpu);

        printk(KERN_WARNING
                "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
        dump_stack();
        return NUMA_NO_NODE;
}

/*
 * Set (@enable true) or clear (@enable false) @cpu in the cpumask of
 * @node and log the resulting mask at debug level.
 *
 * Nothing is done when @node is NUMA_NO_NODE.  When the node's cpumask
 * is not available, an error is logged with a stack dump and the mask is
 * left alone.
 */
void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
        struct cpumask *mask;

        /* early_cpu_to_node() already emits a warning and trace */
        if (node == NUMA_NO_NODE)
                return;

        mask = node_to_cpumask_map[node];
        if (!cpumask_available(mask)) {
                pr_err("node_to_cpumask_map[%i] NULL\n", node);
                dump_stack();
                return;
        }

        if (!enable)
                cpumask_clear_cpu(cpu, mask);
        else
                cpumask_set_cpu(cpu, mask);

        printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
                enable ? "numa_add_cpu" : "numa_remove_cpu",
                cpu, node, cpumask_pr_args(mask));
}

# ifndef CONFIG_NUMA_EMU
/*
 * Set or clear @cpu in the cpumask of the node that early_cpu_to_node()
 * reports for it, going through the checking/logging debug helper.
 */
static void numa_set_cpumask(int cpu, bool enable)
{
        debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

/* Debug build: add @cpu to its node's cpumask via numa_set_cpumask(). */
void numa_add_cpu(int cpu)
{
        numa_set_cpumask(cpu, true);
}

/* Debug build: remove @cpu from its node's cpumask via numa_set_cpumask(). */
void numa_remove_cpu(int cpu)
{
        numa_set_cpumask(cpu, false);
}
# endif        /* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 *
 * Debug variant: an out-of-range @node yields cpu_none_mask, and a node
 * whose cpumask is not available yields cpu_online_mask.  Both cases log
 * a warning with a stack dump.
 */
const struct cpumask *cpumask_of_node(int node)
{
        /* The unsigned cast makes negative ids fail the range check too. */
        if ((unsigned)node < nr_node_ids) {
                if (cpumask_available(node_to_cpumask_map[node]))
                        return node_to_cpumask_map[node];

                printk(KERN_WARNING
                        "cpumask_of_node(%d): no node_to_cpumask_map!\n",
                        node);
                dump_stack();
                return cpu_online_mask;
        }

        printk(KERN_WARNING
                "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
                node, nr_node_ids);
        dump_stack();
        return cpu_none_mask;
}
EXPORT_SYMBOL(cpumask_of_node);

#endif        /* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_KEEP_MEMINFO
/*
 * Return the node id of the first block in @mi whose [start, end) range
 * contains the address @start, or NUMA_NO_NODE when no block does.
 */
static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
        int idx;

        for (idx = 0; idx < mi->nr_blks; idx++) {
                const struct numa_memblk *blk = &mi->blk[idx];

                if (start >= blk->start && start < blk->end)
                        return blk->nid;
        }

        return NUMA_NO_NODE;
}

/*
 * Map the physical address @start to a node id.
 *
 * numa_meminfo is searched first; only when it has no match is the
 * search continued in numa_reserved_meminfo.  Returns NUMA_NO_NODE when
 * neither table contains the address.
 */
int phys_to_target_node(phys_addr_t start)
{
        int nid = meminfo_to_nid(&numa_meminfo, start);

        /*
         * Prefer online nodes, but if reserved memory might be
         * hot-added continue the search with reserved ranges.
         */
        if (nid == NUMA_NO_NODE)
                nid = meminfo_to_nid(&numa_reserved_meminfo, start);

        return nid;
}
EXPORT_SYMBOL_GPL(phys_to_target_node);

/*
 * Map the physical address @start to a node id using numa_meminfo.  When
 * no block contains the address, the node id of the first numa_meminfo
 * block is returned instead.
 */
int memory_add_physaddr_to_nid(u64 start)
{
        int nid = meminfo_to_nid(&numa_meminfo, start);

        if (nid != NUMA_NO_NODE)
                return nid;

        return numa_meminfo.blk[0].nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);

#endif

/*
 * sort() comparator for an array of numa_memblk pointers: orders the
 * entries by ascending start address.  @a and @b point at the array
 * slots, i.e. at pointers to the memblks.
 */
static int __init cmp_memblk(const void *a, const void *b)
{
        const struct numa_memblk *lhs = *(const struct numa_memblk **)a;
        const struct numa_memblk *rhs = *(const struct numa_memblk **)b;

        if (lhs->start < rhs->start)
                return -1;
        if (lhs->start > rhs->start)
                return 1;
        return 0;
}

static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;

/**
 * numa_fill_memblks - Fill gaps in numa_meminfo memblks
 * @start: address to begin fill
 * @end: address to end fill
 *
 * Find the numa_meminfo memblks that overlap the physical address range
 * @start-@end and extend them in place so that together they cover the
 * whole range.
 *
 * RETURNS:
 * 0                  : Success
 * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
 */

int __init numa_fill_memblks(u64 start, u64 end)
{
        struct numa_memblk **list = &numa_memblk_list[0];
        struct numa_meminfo *mi = &numa_meminfo;
        int nr = 0;
        u64 covered_to;

        /*
         * Collect pointers to the numa_meminfo memblks that overlap
         * start, end. Working through pointers lets the changes below
         * land directly in numa_meminfo.
         */
        for (int i = 0; i < mi->nr_blks; i++) {
                struct numa_memblk *blk = &mi->blk[i];

                if (!memblock_addrs_overlap(start, end - start, blk->start,
                                            blk->end - blk->start))
                        continue;

                list[nr++] = blk;
        }
        if (nr == 0)
                return NUMA_NO_MEMBLK;

        /* Sort the list of pointers in memblk->start order */
        sort(&list[0], nr, sizeof(list[0]), cmp_memblk, NULL);

        /* Stretch the first/last memblks so that they include start/end */
        list[0]->start = min(list[0]->start, start);
        list[nr - 1]->end = max(list[nr - 1]->end, end);

        /*
         * Walk the sorted memblks while tracking how far the range is
         * covered so far; a memblk that begins beyond that point is
         * pulled back to it, which closes the gap.
         */
        covered_to = list[0]->end;
        for (int i = 1; i < nr; i++) {
                struct numa_memblk *blk = list[i];

                if (blk->start > covered_to) {
                        /* Gap in front of this memblk: backfill it. */
                        blk->start = covered_to;
                        covered_to = blk->end;
                } else if (blk->end > covered_to) {
                        /* Touching or overlapping: coverage just grows. */
                        covered_to = blk->end;
                }
        }
        return 0;
}






































   34 





   33 




































































































   31 






























   33 















   35 



   34 

















   35 











    5 
























   36 


   31 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
 * which are designed to protect kernel memory from needless exposure
 * and overwrite under many unintended conditions. This code is based
 * on PAX_USERCOPY, which is:
 *
 * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
 * Security Inc.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/kstrtox.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include <asm/sections.h>
#include "slab.h"

/*
 * Classify the object [obj, obj + len) against the current task's stack
 * and, where the architecture can tell, against the stack frames.
 *
 * Returns:
 *        NOT_STACK: the object does not touch the stack at all
 *        GOOD_FRAME: fully within a valid stack frame
 *        GOOD_STACK: within the current stack (when can't frame-check exactly)
 *        BAD_STACK: error condition (invalid stack position or bad stack frame)
 */
static noinline int check_stack_object(const void *obj, unsigned long len)
{
        const void * const stack_lo = task_stack_page(current);
        const void * const stack_hi = stack_lo + THREAD_SIZE;
        const void * const obj_end = obj + len;
        int frame_verdict;

        /* Entirely below or entirely above the stack: not a stack object. */
        if (obj_end <= stack_lo || obj >= stack_hi)
                return NOT_STACK;

        /*
         * Having passed the check above, at least one end of the object
         * is within the stack; if the other end sticks out, the object
         * only partially overlaps the stack and is rejected.
         */
        if (obj < stack_lo || obj_end > stack_hi)
                return BAD_STACK;

        /* Let the architecture judge the frames; non-zero is a verdict. */
        frame_verdict = arch_within_stack_frames(stack_lo, stack_hi, obj, len);
        if (frame_verdict)
                return frame_verdict;

        /* Finally, check stack depth if possible. */
#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER
        if (IS_ENABLED(CONFIG_STACK_GROWSUP)) {
                if (obj_end > (void *)current_stack_pointer)
                        return BAD_STACK;
        } else if (obj < (void *)current_stack_pointer) {
                return BAD_STACK;
        }
#endif

        return GOOD_STACK;
}

/*
 * Reaching this function means CONFIG_HARDENED_USERCOPY has found an
 * unexpected state during a copy_from_user() or copy_to_user() call.
 * __check_object_size() performs several checks on the buffer. Normal
 * stack buffer usage should never trip the checks, and kernel text
 * addressing will always trip the check. For cache objects, the check is
 * that only the whitelisted range of bytes for a given cache is being
 * accessed (via the cache's usersize and useroffset fields). To adjust a
 * cache whitelist, create the cache with the usercopy-aware
 * kmem_cache_create_usercopy() function (and carefully audit the
 * whitelist range).
 *
 * The report names the memory kind (@name, or "unknown?!" when NULL), an
 * optional quoted @detail, and the @offset and @len of the copy; the
 * wording depends on the direction given by @to_user.  Does not return.
 */
void __noreturn usercopy_abort(const char *name, const char *detail,
                               bool to_user, unsigned long offset,
                               unsigned long len)
{
        const char *action = to_user ? "exposure" : "overwrite";
        const char *direction = to_user ? "from" : "to";
        const char *quote_open = detail ? " '" : "";
        const char *quote_close = detail ? "'" : "";

        pr_emerg("Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
                 action, direction,
                 name ? name : "unknown?!",
                 quote_open, detail ? detail : "", quote_close,
                 offset, len);

        /*
         * For greater effect, it would be nice to do do_group_exit(),
         * but BUG() actually hooks all the lock-breaking and per-arch
         * Oops code, so that is used here instead.
         */
        BUG();
}

/* Returns true if any portion of [ptr,ptr+n) overlaps with [low,high). */
static bool overlaps(const unsigned long ptr, unsigned long n,
                     unsigned long low, unsigned long high)
{
        /*
         * The ranges overlap exactly when the checked range starts below
         * the end of the other one and ends above its start.
         */
        return ptr < high && ptr + n > low;
}

/*
 * Abort the copy if [ptr, ptr + n) overlaps the kernel text, either at
 * its primary address range (_stext.._etext) or at its lm_alias() alias.
 */
static inline void check_kernel_text_object(const unsigned long ptr,
                                            unsigned long n, bool to_user)
{
        unsigned long text_start = (unsigned long)_stext;
        unsigned long text_end = (unsigned long)_etext;
        unsigned long alias_start, alias_end;

        if (overlaps(ptr, n, text_start, text_end))
                usercopy_abort("kernel text", NULL, to_user, ptr - text_start, n);

        /*
         * Some architectures have virtual memory mappings with a secondary
         * mapping of the kernel text, i.e. there is more than one virtual
         * kernel address that points to the kernel image. It is usually
         * when there is a separate linear physical memory mapping, in that
         * __pa() is not just the reverse of __va(). This can be detected
         * and checked:
         */
        alias_start = (unsigned long)lm_alias(text_start);

        /* Only a distinct secondary mapping needs its own check. */
        if (alias_start != text_start) {
                alias_end = (unsigned long)lm_alias(text_end);
                if (overlaps(ptr, n, alias_start, alias_end))
                        usercopy_abort("linear kernel text", NULL, to_user,
                                       ptr - alias_start, n);
        }
}

/*
 * Abort the copy when the range starting at @ptr with length @n wraps
 * around the end of the address space, or when @ptr is NULL or the
 * zero-size allocation pointer.
 */
static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
                                       bool to_user)
{
        /* Address of the last byte; it wraps if it ends up below ptr. */
        const unsigned long last = ptr + (n - 1);

        /* Reject if object wraps past end of memory. */
        if (last < ptr)
                usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n);

        /* Reject if NULL or ZERO-allocation. */
        if (ZERO_OR_NULL_PTR(ptr))
                usercopy_abort("null address", NULL, to_user, ptr, n);
}

/*
 * Validate [ptr, ptr + n) against the memory object that contains @ptr:
 * a kmap page, a vmalloc area, a slab object or a large folio.  The copy
 * is aborted when it would run past the end of that object.  Addresses
 * that match none of these are not checked here.
 */
static inline void check_heap_object(const void *ptr, unsigned long n,
                                     bool to_user)
{
        unsigned long addr = (unsigned long)ptr;
        unsigned long offset;
        struct folio *folio;

        /* kmap: the copy has to stay within the page. */
        if (is_kmap_addr(ptr)) {
                offset = offset_in_page(ptr);
                if (n > PAGE_SIZE - offset)
                        usercopy_abort("kmap", NULL, to_user, offset, n);
                return;
        }

        /* vmalloc: the copy has to stay within the vmap area it starts in. */
        if (is_vmalloc_addr(ptr) && !pagefault_disabled()) {
                struct vmap_area *area = find_vmap_area(addr);

                if (!area)
                        usercopy_abort("vmalloc", "no area", to_user, 0, n);

                if (n > area->va_end - addr) {
                        offset = addr - area->va_start;
                        usercopy_abort("vmalloc", NULL, to_user, offset, n);
                }
                return;
        }

        /* virt_to_folio() is only used on addresses that pass this check. */
        if (!virt_addr_valid(ptr))
                return;

        folio = virt_to_folio(ptr);

        if (folio_test_slab(folio)) {
                /* Check slab allocator for flags and size. */
                __check_heap_object(ptr, n, folio_slab(folio), to_user);
                return;
        }

        if (!folio_test_large(folio))
                return;

        /* Large folio: the copy has to stay within the folio. */
        offset = ptr - folio_address(folio);
        if (n > folio_size(folio) - offset)
                usercopy_abort("page alloc", NULL, to_user, offset, n);
}

static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);

/*
 * Validates that the given object is:
 * - not bogus address
 * - fully contained by stack (or stack frame, when available)
 * - fully within SLAB object (or object whitelist area, when available)
 * - not in kernel text
 *
 * Nothing is checked when the bypass static key is enabled or when @n is
 * zero.  A failed check ends in usercopy_abort().
 */
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
        int stack_verdict;

        if (static_branch_unlikely(&bypass_usercopy_checks))
                return;

        /* Skip all tests if size is zero. */
        if (!n)
                return;

        /* Check for invalid addresses. */
        check_bogus_address((const unsigned long)ptr, n, to_user);

        /* Check for bad stack object. */
        stack_verdict = check_stack_object(ptr, n);

        /*
         * Object is either in the correct frame (when it
         * is possible to check) or just generally on the
         * process stack (when frame checking not available):
         * nothing more to verify.
         */
        if (stack_verdict == GOOD_FRAME || stack_verdict == GOOD_STACK)
                return;

        /*
         * NOT_STACK means the object is not touching the current process
         * stack and goes on to the remaining checks; any other verdict
         * is a bad stack object.
         */
        if (stack_verdict != NOT_STACK)
                usercopy_abort("process stack", NULL, to_user,
#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER
                        IS_ENABLED(CONFIG_STACK_GROWSUP) ?
                                ptr - (void *)current_stack_pointer :
                                (void *)current_stack_pointer - ptr,
#else
                        0,
#endif
                        n);

        /* Check for bad heap object. */
        check_heap_object(ptr, n, to_user);

        /* Check for object in kernel to avoid text exposure. */
        check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
EXPORT_SYMBOL(__check_object_size);

/* Boot-time choice; checks stay on unless "hardened_usercopy=" says otherwise. */
static bool enable_checks __initdata = true;

/*
 * Parse the value given to the "hardened_usercopy=" boot parameter into
 * enable_checks.  A value kstrtobool() rejects only produces a warning.
 *
 * Returns 1 unconditionally.
 */
static int __init parse_hardened_usercopy(char *str)
{
        int err = kstrtobool(str, &enable_checks);

        if (err) {
                pr_warn("Invalid option string for hardened_usercopy: '%s'\n",
                        str);
        }

        return 1;
}

__setup("hardened_usercopy=", parse_hardened_usercopy);

/*
 * Late initcall that applies the boot-time decision: when enable_checks
 * was cleared, enable bypass_usercopy_checks so that
 * __check_object_size() returns before validating anything.
 *
 * Returns 0.  Initcalls report success with 0 (a non-zero value is
 * treated as an error code by initcall debugging output), so do not
 * return 1 here.
 */
static int __init set_hardened_usercopy(void)
{
        if (!enable_checks)
                static_branch_enable(&bypass_usercopy_checks);
        return 0;
}

late_initcall(set_hardened_usercopy);
































































    6 


    8 
    7 




    6 










































































   13 


   14 






   13 


























   13 







   13 


















   14 

































    1 







































    6 





    3 
    4 
    5 
    2 

    7 





    7 

















    6 



















    6 








    6 












    1 








    1 





    1 


















    5 










    5 




    5 



    4 



    5 

    4 

    1 

    5 

    5 

    4 
    1 
















    3 






















   31 
   34 



    3 




    3 






















    2 

    2 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/file_table.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/eventpoll.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
#include <linux/capability.h>
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/task_work.h>
#include <linux/swap.h>
#include <linux/kmemleak.h>

#include <linux/atomic.h>

#include "internal.h"

/*
 * sysctl tunables.  max_files starts at NR_FILE and is recomputed by
 * files_maxfiles_init(); nr_files is refreshed by proc_nr_files() each
 * time the "file-nr" sysctl is accessed.
 */
static struct files_stat_struct files_stat = {
        .max_files = NR_FILE
};

/* SLAB cache for file structures; created once in files_init(). */
static struct kmem_cache *filp_cachep __ro_after_init;

/*
 * Number of accounted file structures: incremented by alloc_empty_file()
 * and decremented by file_free() unless the file has FMODE_NOACCOUNT set.
 */
static struct percpu_counter nr_files __cacheline_aligned_in_smp;

/*
 * Container for backing file with optional user path.
 *
 * Allocated by alloc_empty_backing_file() and released by file_free(),
 * which recognises it through FMODE_BACKING in file.f_mode.
 */
struct backing_file {
        /* Embedded file; backing_file() maps a pointer to it back to the container. */
        struct file file;
        /* Path handed out by backing_file_user_path(); put in file_free(). */
        struct path user_path;
};

/* Map an embedded struct file back to its enclosing struct backing_file. */
static inline struct backing_file *backing_file(struct file *f)
{
        struct backing_file *container;

        container = container_of(f, struct backing_file, file);
        return container;
}

struct path *backing_file_user_path(struct file *f)
{
        return &backing_file(f)->user_path;
}
EXPORT_SYMBOL_GPL(backing_file_user_path);

/*
 * Release a file structure: free its security state, undo the nr_files
 * accounting (unless FMODE_NOACCOUNT), drop the credential reference and
 * return the memory to wherever it was allocated from.
 */
static inline void file_free(struct file *f)
{
        security_file_free(f);

        if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
                percpu_counter_dec(&nr_files);

        put_cred(f->f_cred);

        /* Common case: the file came from the filp slab cache. */
        if (likely(!(f->f_mode & FMODE_BACKING))) {
                kmem_cache_free(filp_cachep, f);
                return;
        }

        /* Backing files live in a kzalloc()ed container with a user path. */
        path_put(backing_file_user_path(f));
        kfree(backing_file(f));
}

/*
 * Return the number of open files in the system, as given by
 * percpu_counter_read_positive() on nr_files.
 */
static long get_nr_files(void)
{
        long count = percpu_counter_read_positive(&nr_files);

        return count;
}

/*
 * Return the current system-wide limit on open files
 * (files_stat.max_files).
 */
unsigned long get_max_files(void)
{
        unsigned long limit = files_stat.max_files;

        return limit;
}
EXPORT_SYMBOL_GPL(get_max_files);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * sysctl handler for "file-nr": refresh files_stat.nr_files from the
 * counter, then let proc_doulongvec_minmax() perform the transfer.
 */
static int proc_nr_files(struct ctl_table *table, int write, void *buffer,
                         size_t *lenp, loff_t *ppos)
{
        long snapshot = get_nr_files();

        files_stat.nr_files = snapshot;

        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/* sysctl entries registered under "fs" by init_fs_stat_sysctls(). */
static struct ctl_table fs_stat_sysctls[] = {
        /* Read-only view of the whole files_stat; refreshed by proc_nr_files(). */
        {
                .procname        = "file-nr",
                .data                = &files_stat,
                .maxlen                = sizeof(files_stat),
                .mode                = 0444,
                .proc_handler        = proc_nr_files,
        },
        /* Writable system-wide file limit, bounded to [0, LONG_MAX]. */
        {
                .procname        = "file-max",
                .data                = &files_stat.max_files,
                .maxlen                = sizeof(files_stat.max_files),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
                .extra1                = SYSCTL_LONG_ZERO,
                .extra2                = SYSCTL_LONG_MAX,
        },
        /* Writable sysctl_nr_open, bounded by sysctl_nr_open_min/max. */
        {
                .procname        = "nr_open",
                .data                = &sysctl_nr_open,
                .maxlen                = sizeof(unsigned int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = &sysctl_nr_open_min,
                .extra2                = &sysctl_nr_open_max,
        },
};

/*
 * Register the fs_stat_sysctls table under "fs" and, when
 * CONFIG_BINFMT_MISC is enabled, the "fs/binfmt_misc" sysctl mount point.
 * Always returns 0.
 */
static int __init init_fs_stat_sysctls(void)
{
        struct ctl_table_header *hdr;

        register_sysctl_init("fs", fs_stat_sysctls);

        if (!IS_ENABLED(CONFIG_BINFMT_MISC))
                return 0;

        hdr = register_sysctl_mount_point("fs/binfmt_misc");
        /* The header is never unregistered; tell kmemleak it is not a leak. */
        kmemleak_not_leak(hdr);

        return 0;
}
fs_initcall(init_fs_stat_sysctls);
#endif

/*
 * Initialise a freshly allocated (zeroed) file structure.
 *
 * @f: file to initialise
 * @flags: open flags; stored in f_flags and converted to f_mode with
 *         OPEN_FMODE()
 * @cred: credentials to attach; a reference is taken with get_cred()
 *
 * Returns 0 on success.  If security_file_alloc() fails, the credential
 * reference is dropped again and its error is returned; the caller still
 * owns (and must free) the memory behind @f.
 */
static int init_file(struct file *f, int flags, const struct cred *cred)
{
        int error;

        f->f_cred = get_cred(cred);
        error = security_file_alloc(f);
        if (unlikely(error)) {
                /* Undo the get_cred() above before reporting failure. */
                put_cred(f->f_cred);
                return error;
        }

        rwlock_init(&f->f_owner.lock);
        spin_lock_init(&f->f_lock);
        mutex_init(&f->f_pos_lock);
        f->f_flags = flags;
        f->f_mode = OPEN_FMODE(flags);
        /* f->f_version: 0 */

        /*
         * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
         * fget-rcu pattern users need to be able to handle spurious
         * refcount bumps we should reinitialize the reused file first.
         */
        atomic_long_set(&f->f_count, 1);
        return 0;
}

/* Find an unused file structure and return a pointer to it.
 * Returns an error pointer if some error happend e.g. we over file
 * structures limit, run out of memory or operation is not permitted.
 *
 * Be very careful using this.  You are responsible for
 * getting write access to any mount that you might assign
 * to this filp, if it is opened for write.  If this is not
 * done, you will imbalance int the mount's writer count
 * and a warning at __fput() time.
 */
struct file *alloc_empty_file(int flags, const struct cred *cred)
{
        static long old_max;
        struct file *f;
        int error;

        /*
         * Privileged users can go above max_files
         */
        if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
                /*
                 * percpu_counters are inaccurate.  Do an expensive check before
                 * we go and fail.
                 */
                if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
                        goto over;
        }

        f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
        if (unlikely(!f))
                return ERR_PTR(-ENOMEM);

        error = init_file(f, flags, cred);
        if (unlikely(error)) {
                kmem_cache_free(filp_cachep, f);
                return ERR_PTR(error);
        }

        percpu_counter_inc(&nr_files);

        return f;

over:
        /* Ran out of filps - report that */
        if (get_nr_files() > old_max) {
                pr_info("VFS: file-max limit %lu reached\n", get_max_files());
                old_max = get_nr_files();
        }
        return ERR_PTR(-ENFILE);
}

/*
 * Variant of alloc_empty_file() that doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocate file must not be
 * installed into file tables or such.
 */
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
        struct file *f;
        int error;

        f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
        if (unlikely(!f))
                return ERR_PTR(-ENOMEM);

        error = init_file(f, flags, cred);
        if (unlikely(error)) {
                kmem_cache_free(filp_cachep, f);
                return ERR_PTR(error);
        }

        f->f_mode |= FMODE_NOACCOUNT;

        return f;
}

/*
 * Variant of alloc_empty_file() that allocates a backing_file container
 * and doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocate file must not be
 * installed into file tables or such.
 */
struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
{
        struct backing_file *ff;
        int error;

        ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
        if (unlikely(!ff))
                return ERR_PTR(-ENOMEM);

        error = init_file(&ff->file, flags, cred);
        if (unlikely(error)) {
                kfree(ff);
                return ERR_PTR(error);
        }

        ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
        return &ff->file;
}

/**
 * file_init_path - initialize a 'struct file' based on path
 *
 * @file: the file to set up
 * @path: the (dentry, vfsmount) pair for the new file
 * @fop: the 'struct file_operations' for the new file
 */
static void file_init_path(struct file *file, const struct path *path,
                           const struct file_operations *fop)
{
        file->f_path = *path;
        file->f_inode = path->dentry->d_inode;
        file->f_mapping = path->dentry->d_inode->i_mapping;
        file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
        file->f_sb_err = file_sample_sb_err(file);
        if (fop->llseek)
                file->f_mode |= FMODE_LSEEK;
        if ((file->f_mode & FMODE_READ) &&
             likely(fop->read || fop->read_iter))
                file->f_mode |= FMODE_CAN_READ;
        if ((file->f_mode & FMODE_WRITE) &&
             likely(fop->write || fop->write_iter))
                file->f_mode |= FMODE_CAN_WRITE;
        file->f_iocb_flags = iocb_flags(file);
        file->f_mode |= FMODE_OPENED;
        file->f_op = fop;
        if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
                i_readcount_inc(path->dentry->d_inode);
}

/**
 * alloc_file - allocate and initialize a 'struct file'
 *
 * @path: the (dentry, vfsmount) pair for the new file
 * @flags: O_... flags with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 *
 * Returns the new file, or the error pointer from alloc_empty_file().
 */
static struct file *alloc_file(const struct path *path, int flags,
                const struct file_operations *fop)
{
        struct file *file = alloc_empty_file(flags, current_cred());

        if (IS_ERR(file))
                return file;

        file_init_path(file, path, fop);
        return file;
}

/*
 * Build @path for a pseudo file: allocate a pseudo dentry called @name on
 * @mnt's superblock, take a reference on @mnt and instantiate the dentry
 * with @inode.
 *
 * Returns 0 on success or -ENOMEM if the dentry cannot be allocated (in
 * which case path->dentry has been set to NULL and path->mnt is untouched).
 */
static inline int alloc_path_pseudo(const char *name, struct inode *inode,
                                    struct vfsmount *mnt, struct path *path)
{
        struct qstr this = QSTR_INIT(name, strlen(name));
        struct dentry *dentry = d_alloc_pseudo(mnt->mnt_sb, &this);

        path->dentry = dentry;
        if (!dentry)
                return -ENOMEM;

        path->mnt = mntget(mnt);
        d_instantiate(dentry, inode);
        return 0;
}

/*
 * Create an accounted file for @inode on @mnt, backed by a pseudo dentry
 * called @name.  Returns the file or an error pointer.
 */
struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
                               const char *name, int flags,
                               const struct file_operations *fops)
{
        struct path path;
        struct file *file;
        int ret = alloc_path_pseudo(name, inode, mnt, &path);

        if (ret)
                return ERR_PTR(ret);

        file = alloc_file(&path, flags, fops);
        if (!IS_ERR(file))
                return file;

        /*
         * NOTE(review): the extra inode reference presumably compensates
         * for the one released when path_put() drops the instantiated
         * dentry, leaving the caller's reference intact - confirm.
         */
        ihold(inode);
        path_put(&path);
        return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);

/*
 * Like alloc_file_pseudo(), but the file comes from
 * alloc_empty_file_noaccount() and so is not counted in nr_files.
 * Returns the file or an error pointer.
 */
struct file *alloc_file_pseudo_noaccount(struct inode *inode,
                                         struct vfsmount *mnt, const char *name,
                                         int flags,
                                         const struct file_operations *fops)
{
        struct path path;
        struct file *file;
        int ret = alloc_path_pseudo(name, inode, mnt, &path);

        if (ret)
                return ERR_PTR(ret);

        file = alloc_empty_file_noaccount(flags, current_cred());
        if (!IS_ERR(file)) {
                file_init_path(file, &path, fops);
                return file;
        }

        /* Same unwind as alloc_file_pseudo(): hold the inode, drop the path. */
        ihold(inode);
        path_put(&path);
        return file;
}
EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);

/*
 * Allocate a new file on the same path as @base, with its own @flags and
 * @fops.  On success the new file gets its own reference on the path and
 * shares @base's f_mapping.  Returns the file or an error pointer.
 */
struct file *alloc_file_clone(struct file *base, int flags,
                                const struct file_operations *fops)
{
        struct file *clone = alloc_file(&base->f_path, flags, fops);

        if (IS_ERR(clone))
                return clone;

        path_get(&clone->f_path);
        clone->f_mapping = base->f_mapping;
        return clone;
}

/*
 * The real guts of fput() - releasing the last reference to file.
 *
 * A file that never reached FMODE_OPENED skips all teardown and goes
 * straight to file_free().  Otherwise the notification, locking, security
 * and driver hooks run first, then the references held through the file
 * (fops, owner pid, dentry, mount) are dropped, and finally the structure
 * itself is freed.  May sleep when the file was opened (see might_sleep()).
 */
static void __fput(struct file *file)
{
        /* Snapshot what is still needed after the release callbacks ran. */
        struct dentry *dentry = file->f_path.dentry;
        struct vfsmount *mnt = file->f_path.mnt;
        struct inode *inode = file->f_inode;
        fmode_t mode = file->f_mode;

        if (unlikely(!(file->f_mode & FMODE_OPENED)))
                goto out;

        might_sleep();

        fsnotify_close(file);
        /*
         * The function eventpoll_release() should be the first called
         * in the file cleanup chain.
         */
        eventpoll_release(file);
        locks_remove_file(file);

        security_file_release(file);
        /* Invoke ->fasync(-1, file, 0) if the file still has FASYNC set. */
        if (unlikely(file->f_flags & FASYNC)) {
                if (file->f_op->fasync)
                        file->f_op->fasync(-1, file, 0);
        }
        if (file->f_op->release)
                file->f_op->release(inode, file);
        /* Character devices (not opened with FMODE_PATH) drop their cdev ref. */
        if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
                     !(mode & FMODE_PATH))) {
                cdev_put(inode->i_cdev);
        }
        fops_put(file->f_op);
        put_pid(file->f_owner.pid);
        put_file_access(file);
        dput(dentry);
        if (unlikely(mode & FMODE_NEED_UNMOUNT))
                dissolve_on_fput(mnt);
        mntput(mnt);
out:
        file_free(file);
}

static LLIST_HEAD(delayed_fput_list);
static void delayed_fput(struct work_struct *unused)
{
        struct llist_node *node = llist_del_all(&delayed_fput_list);
        struct file *f, *t;

        llist_for_each_entry_safe(f, t, node, f_llist)
                __fput(f);
}

/* task_work callback installed by fput(): run __fput() on the owning file. */
static void ____fput(struct callback_head *work)
{
        struct file *file = container_of(work, struct file, f_task_work);

        __fput(file);
}

/*
 * If a kernel thread really needs the final fput() it has done to have
 * completed, call this.  The only user right now is the boot - we
 * *do* need to make sure our writes to binaries on initramfs have
 * not left us with opened struct file waiting for __fput() - execve()
 * won't work without that.  Please, don't add more callers without
 * very good reasons; in particular, never call that with locks
 * held and never call that from a thread that might need to do
 * some work on any kind of umount.
 *
 * Implemented by running delayed_fput() synchronously in the caller's
 * context, which drains delayed_fput_list.
 */
void flush_delayed_fput(void)
{
        delayed_fput(NULL);
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);

static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);

/*
 * Drop a reference to @file.  When it was the last one:
 *  - a file that is neither FMODE_OPENED nor FMODE_BACKING is freed on
 *    the spot;
 *  - otherwise, outside interrupt context and kernel threads, __fput()
 *    is queued as task work on the current task;
 *  - failing that, the file is put on delayed_fput_list and the delayed
 *    work is scheduled.
 */
void fput(struct file *file)
{
        struct task_struct *task;

        if (!atomic_long_dec_and_test(&file->f_count))
                return;

        if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
                file_free(file);
                return;
        }

        task = current;
        if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
                init_task_work(&file->f_task_work, ____fput);
                if (task_work_add(task, &file->f_task_work, TWA_RESUME) == 0)
                        return;
                /*
                 * After this task has run exit_task_work(),
                 * task_work_add() will fail.  Fall through to delayed
                 * fput to avoid leaking *file.
                 */
        }

        /* Only schedule the work when llist_add() returns true. */
        if (llist_add(&file->f_llist, &delayed_fput_list))
                schedule_delayed_work(&delayed_fput_work, 1);
}

/*
 * Synchronous analog of fput(): if this drops the last reference,
 * __fput() runs right here in the caller's context.
 *
 * Meant for kernel threads that might be needed in some umount() (and
 * thus can't use flush_delayed_fput() without risking deadlocks), that
 * need to wait for completion of __fput() and know that for this specific
 * struct file it won't involve anything that would need them.  Use only
 * if you really need it - at the very least, don't blindly convert
 * fput() by kernel thread to that.
 */
void __fput_sync(struct file *file)
{
        if (!atomic_long_dec_and_test(&file->f_count))
                return;

        __fput(file);
}

EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(__fput_sync);

/*
 * Boot-time setup: create the "filp" slab cache for struct file and
 * initialise the nr_files counter to 0.
 */
void __init files_init(void)
{
        const slab_flags_t filp_flags = SLAB_TYPESAFE_BY_RCU |
                                        SLAB_HWCACHE_ALIGN |
                                        SLAB_PANIC | SLAB_ACCOUNT;

        filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
                                        filp_flags, NULL);
        percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}

/*
 * One file with associated inode and dcache is very roughly 1K. Per default
 * do not use more than 10% of our memory for files.
 *
 * The estimate is based on total RAM minus a reserve of 1.5 times the
 * pages currently in use (capped at total - 1), and is never allowed to
 * fall below NR_FILE.
 */
void __init files_maxfiles_init(void)
{
        unsigned long total = totalram_pages();
        unsigned long reserve = (total - nr_free_pages()) * 3/2;
        unsigned long limit;

        if (reserve > total - 1)
                reserve = total - 1;

        limit = ((total - reserve) * (PAGE_SIZE / 1024)) / 10;

        files_stat.max_files = max_t(unsigned long, limit, NR_FILE);
}

















































    4 




   15 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM filemap

#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FILEMAP_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/device.h>
#include <linux/kdev_t.h>
#include <linux/errseq.h>

/*
 * Event class shared by the page-cache add/delete tracepoints below.
 *
 * For the given folio it records the pfn, the owning inode's number, the
 * folio index, the device and the folio order.  The device comes from the
 * inode's superblock when it has one and from i_rdev otherwise.  The
 * "ofs" field printed is index << PAGE_SHIFT.
 *
 * folio->mapping and its host are dereferenced unconditionally.
 */
DECLARE_EVENT_CLASS(mm_filemap_op_page_cache,

        TP_PROTO(struct folio *folio),

        TP_ARGS(folio),

        TP_STRUCT__entry(
                __field(unsigned long, pfn)
                __field(unsigned long, i_ino)
                __field(unsigned long, index)
                __field(dev_t, s_dev)
                __field(unsigned char, order)
        ),

        TP_fast_assign(
                __entry->pfn = folio_pfn(folio);
                __entry->i_ino = folio->mapping->host->i_ino;
                __entry->index = folio->index;
                if (folio->mapping->host->i_sb)
                        __entry->s_dev = folio->mapping->host->i_sb->s_dev;
                else
                        __entry->s_dev = folio->mapping->host->i_rdev;
                __entry->order = folio_order(folio);
        ),

        TP_printk("dev %d:%d ino %lx pfn=0x%lx ofs=%lu order=%u",
                MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino,
                __entry->pfn,
                __entry->index << PAGE_SHIFT,
                __entry->order)
);

/* Instance of the mm_filemap_op_page_cache class; takes the affected folio. */
DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache,
        TP_PROTO(struct folio *folio),
        TP_ARGS(folio)
        );

/* Instance of the mm_filemap_op_page_cache class; takes the affected folio. */
DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache,
        TP_PROTO(struct folio *folio),
        TP_ARGS(folio)
        );

/*
 * Tracepoint recording an errseq_t value together with the inode number
 * and device of the mapping's host inode.  The device comes from the
 * inode's superblock when it has one and from i_rdev otherwise.
 */
TRACE_EVENT(filemap_set_wb_err,
                TP_PROTO(struct address_space *mapping, errseq_t eseq),

                TP_ARGS(mapping, eseq),

                TP_STRUCT__entry(
                        __field(unsigned long, i_ino)
                        __field(dev_t, s_dev)
                        __field(errseq_t, errseq)
                ),

                TP_fast_assign(
                        __entry->i_ino = mapping->host->i_ino;
                        __entry->errseq = eseq;
                        if (mapping->host->i_sb)
                                __entry->s_dev = mapping->host->i_sb->s_dev;
                        else
                                __entry->s_dev = mapping->host->i_rdev;
                ),

                TP_printk("dev=%d:%d ino=0x%lx errseq=0x%x",
                        MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                        __entry->i_ino, __entry->errseq)
);

/*
 * Tracepoint recording a file pointer, the inode number and device of
 * file->f_mapping's host inode, the caller-supplied @old errseq_t and the
 * file's f_wb_err at the time of the event (reported as "new").  The
 * device comes from the inode's superblock when it has one and from
 * i_rdev otherwise.
 */
TRACE_EVENT(file_check_and_advance_wb_err,
                TP_PROTO(struct file *file, errseq_t old),

                TP_ARGS(file, old),

                TP_STRUCT__entry(
                        __field(struct file *, file)
                        __field(unsigned long, i_ino)
                        __field(dev_t, s_dev)
                        __field(errseq_t, old)
                        __field(errseq_t, new)
                ),

                TP_fast_assign(
                        __entry->file = file;
                        __entry->i_ino = file->f_mapping->host->i_ino;
                        if (file->f_mapping->host->i_sb)
                                __entry->s_dev =
                                        file->f_mapping->host->i_sb->s_dev;
                        else
                                __entry->s_dev =
                                        file->f_mapping->host->i_rdev;
                        __entry->old = old;
                        __entry->new = file->f_wb_err;
                ),

                TP_printk("file=%p dev=%d:%d ino=0x%lx old=0x%x new=0x%x",
                        __entry->file, MAJOR(__entry->s_dev),
                        MINOR(__entry->s_dev), __entry->i_ino, __entry->old,
                        __entry->new)
);
#endif /* _TRACE_FILEMAP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>


































































































































































































    3 










    1 



































    2 







    2 









    1 














    2 








    2 

















































    3 








    3 
    1 
    1 


















































































































































































































































    1 











    1 


    1 


























    1 






    2 








    1 











    1 


























    1 
















    2 












    2 





































































































































    5 
    5 









































































































    4 



















    4 





    4 



    4 






























































































































































































































































































































































































































































































































































































































































    6 









    5 
    1 
    1 





    5 
















    6 






    1 







    1 

















    5 












    6 




















    1 





































    1 






















    1 










    5 


























    5 










    6 
















    5 






    5 







































































































































































































































































































































































    6 












    5 




    1 











    6 









    6 


























    6 




























    4 




























    5 
























    1 



    4 








    4 

    4 



    4 
























    5 

















    5 


    5 





    5 








































    4 
    1 




    5 
    5 
    5 











































































































































































































































































































































































































































    1 







    1 

    1 




    1 


























    1 


















































    1 
    1 




    1 


    1 



















    1 

































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/xattr.c
 *
 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
 *
 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
 * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>.
 * Extended attributes for symlinks and special files added per
 *  suggestion of Luka Renko <luka.renko@hermes.si>.
 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
 *  Red Hat Inc.
 * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
 *  and Andreas Gruenbacher <agruen@suse.de>.
 */

/*
 * Extended attributes are stored directly in inodes (on file systems with
 * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
 * field contains the block number if an inode uses an additional block. All
 * attributes must fit in the inode and one additional block. Blocks that
 * contain the identical set of attributes may be shared among several inodes.
 * Identical blocks are detected by keeping a cache of blocks that have
 * recently been accessed.
 *
 * The attributes in inodes and on blocks have a different header; the entries
 * are stored in the same format:
 *
 *   +------------------+
 *   | header           |
 *   | entry 1          | |
 *   | entry 2          | | growing downwards
 *   | entry 3          | v
 *   | four null bytes  |
 *   | . . .            |
 *   | value 1          | ^
 *   | value 3          | | growing upwards
 *   | value 2          | |
 *   +------------------+
 *
 * The header is followed by multiple entry descriptors. In disk blocks, the
 * entry descriptors are kept sorted. In inodes, they are unsorted. The
 * attribute values are aligned to the end of the block in no specific order.
 *
 * Locking strategy
 * ----------------
 * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem.
 * EA blocks are only changed if they are exclusive to an inode, so
 * holding xattr_sem also means that nothing but the EA block's reference
 * count can change. Multiple writers to the same block are synchronized
 * by the buffer lock.
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/mbcache.h>
#include <linux/quotaops.h>
#include <linux/iversion.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
#include "acl.h"

/*
 * Debug tracing helpers.
 *
 * With EXT4_XATTR_DEBUG defined, ea_idebug() prints a KERN_DEBUG line
 * prefixed with the inode's superblock id and inode number, and ea_bdebug()
 * prints one prefixed with the buffer's block device and block number.
 * Without it, both expand to no_printk() with the same arguments.
 */
#ifdef EXT4_XATTR_DEBUG
# define ea_idebug(inode, fmt, ...)                                        \
        printk(KERN_DEBUG "inode %s:%lu: " fmt "\n",                        \
               inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__)
# define ea_bdebug(bh, fmt, ...)                                        \
        printk(KERN_DEBUG "block %pg:%lu: " fmt "\n",                        \
               bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__)
#else
# define ea_idebug(inode, fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
# define ea_bdebug(bh, fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * Prototypes for static helpers that are referenced before their
 * definitions (e.g. the entry-hash functions used when verifying EA inode
 * hashes).
 */
static void ext4_xattr_block_cache_insert(struct mb_cache *,
                                          struct buffer_head *);
static struct buffer_head *
ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
                            struct mb_cache_entry **);
static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value,
                                    size_t value_count);
static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value,
                                    size_t value_count);
static void ext4_xattr_rehash(struct ext4_xattr_header *);

/*
 * Lookup table from xattr name index (EXT4_XATTR_INDEX_*) to the handler for
 * that namespace; ext4_xattr_prefix() indexes it with an entry's name index.
 * Slots without an initializer (including those compiled out by the #ifdefs
 * below) are NULL.
 */
static const struct xattr_handler * const ext4_xattr_handler_map[] = {
        [EXT4_XATTR_INDEX_USER]                     = &ext4_xattr_user_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
        [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &nop_posix_acl_access,
        [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default,
#endif
        [EXT4_XATTR_INDEX_TRUSTED]             = &ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_SECURITY
        [EXT4_XATTR_INDEX_SECURITY]             = &ext4_xattr_security_handler,
#endif
        [EXT4_XATTR_INDEX_HURD]                     = &ext4_xattr_hurd_handler,
};

/*
 * NULL-terminated list of the xattr handlers ext4 provides.  It has external
 * linkage; its consumer is outside this part of the file (presumably the
 * superblock's handler list - confirm against the caller).
 */
const struct xattr_handler * const ext4_xattr_handlers[] = {
        &ext4_xattr_user_handler,
        &ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_SECURITY
        &ext4_xattr_security_handler,
#endif
        &ext4_xattr_hurd_handler,
        NULL
};

/*
 * Fetch the per-superblock mbcache used for xattr blocks from the inode's
 * ext4_sb_info.
 */
#define EA_BLOCK_CACHE(inode)        (((struct ext4_sb_info *) \
                                inode->i_sb->s_fs_info)->s_ea_block_cache)

/*
 * Fetch the per-superblock mbcache used for EA inodes.  Callers in this file
 * check the result for NULL before using it.
 */
#define EA_INODE_CACHE(inode)        (((struct ext4_sb_info *) \
                                inode->i_sb->s_fs_info)->s_ea_inode_cache)

/* Prototype for a static helper defined later in the file. */
static int
ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
                        struct inode *inode);

#ifdef CONFIG_LOCKDEP
/*
 * Put an EA inode's locks into their own lockdep subclasses: i_rwsem gets
 * subclass 1 and i_data_sem gets I_DATA_SEM_EA.
 *
 * NOTE(review): presumably this lets an EA inode be locked while the parent
 * inode's locks of the same class are held without lockdep reporting it -
 * confirm against the lockdep annotations in ext4.h.
 */
void ext4_xattr_inode_set_class(struct inode *ea_inode)
{
        struct ext4_inode_info *ei = EXT4_I(ea_inode);

        lockdep_set_subclass(&ea_inode->i_rwsem, 1);
        (void) ei;        /* shut up clang warning if !CONFIG_LOCKDEP */
        lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA);
}
#endif

/*
 * Compute the checksum of an xattr block.
 *
 * The checksum is seeded with the filesystem checksum seed and covers, in
 * order: the little-endian 64-bit block number, the header bytes preceding
 * h_checksum, a zeroed stand-in for h_checksum itself, and the rest of the
 * block after h_checksum.
 *
 * Returns the checksum in little-endian form.
 */
static __le32 ext4_xattr_block_csum(struct inode *inode,
                                    sector_t block_nr,
                                    struct ext4_xattr_header *hdr)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        __le64 le_blocknr = cpu_to_le64(block_nr);
        __u32 zero_csum = 0;
        int csum_off = offsetof(struct ext4_xattr_header, h_checksum);
        int tail_off;
        __u32 crc;

        /* Block number first, so identical contents at other blocks differ. */
        crc = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_blocknr,
                          sizeof(le_blocknr));

        /* Header up to, but not including, the stored checksum. */
        crc = ext4_chksum(sbi, crc, (__u8 *)hdr, csum_off);

        /* The checksum field is hashed as if it were zero. */
        crc = ext4_chksum(sbi, crc, (__u8 *)&zero_csum, sizeof(zero_csum));

        /* Everything after the checksum field to the end of the block. */
        tail_off = csum_off + sizeof(zero_csum);
        crc = ext4_chksum(sbi, crc, (__u8 *)hdr + tail_off,
                          EXT4_BLOCK_SIZE(inode->i_sb) - tail_off);

        return cpu_to_le32(crc);
}

/*
 * Check the stored checksum of an xattr block.
 *
 * Returns 1 when the filesystem has no metadata checksums or when the stored
 * h_checksum matches the computed one, 0 on mismatch.  The comparison is done
 * with the buffer locked.
 */
static int ext4_xattr_block_csum_verify(struct inode *inode,
                                        struct buffer_head *bh)
{
        struct ext4_xattr_header *hdr = BHDR(bh);
        int matches;

        if (!ext4_has_metadata_csum(inode->i_sb))
                return 1;

        lock_buffer(bh);
        matches = (hdr->h_checksum ==
                   ext4_xattr_block_csum(inode, bh->b_blocknr, hdr));
        unlock_buffer(bh);

        return matches;
}

static void ext4_xattr_block_csum_set(struct inode *inode,
                                      struct buffer_head *bh)
{
        if (ext4_has_metadata_csum(inode->i_sb))
                BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode,
                                                bh->b_blocknr, BHDR(bh));
}

/*
 * Map an xattr name index to the name prefix of its handler.
 *
 * Indices that are non-positive or beyond ext4_xattr_handler_map resolve to a
 * NULL handler.  Returns NULL when xattr_handler_can_list() rejects the
 * handler for @dentry, otherwise the handler's prefix.
 */
static inline const char *ext4_xattr_prefix(int name_index,
                                            struct dentry *dentry)
{
        const struct xattr_handler *handler;

        if (name_index <= 0 ||
            name_index >= ARRAY_SIZE(ext4_xattr_handler_map))
                handler = NULL;
        else
                handler = ext4_xattr_handler_map[name_index];

        return xattr_handler_can_list(handler, dentry) ?
                xattr_prefix(handler) : NULL;
}

/*
 * Validate a region of xattr entries and their values.
 *
 * @inode:       inode the xattrs belong to (used for feature checks and
 *               error reporting)
 * @bh:          buffer holding an xattr block, or NULL when checking the
 *               in-inode xattr area
 * @entry:       first entry descriptor to check
 * @end:         first byte past the region
 * @value_start: base address that e_value_offs is relative to; for the
 *               in-inode case the ibody header sits immediately before it
 * @function, @line: call site, passed through to the error report
 *
 * Returns 0 if the region is consistent (or, for a block, if the buffer was
 * already marked verified), -EFSBADCRC on a block checksum mismatch, and
 * -EFSCORRUPTED for every other inconsistency.  Failures are reported through
 * __ext4_error_inode().  A block that passes is marked verified.
 */
static int
check_xattrs(struct inode *inode, struct buffer_head *bh,
             struct ext4_xattr_entry *entry, void *end, void *value_start,
             const char *function, unsigned int line)
{
        struct ext4_xattr_entry *e = entry;
        int err = -EFSCORRUPTED;
        char *err_str;

        if (bh) {
                /* The header is checked even if the buffer is verified. */
                if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
                    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
                        err_str = "invalid header";
                        goto errout;
                }
                /* Entries and checksum were already validated earlier. */
                if (buffer_verified(bh))
                        return 0;
                if (!ext4_xattr_block_csum_verify(inode, bh)) {
                        err = -EFSBADCRC;
                        err_str = "invalid checksum";
                        goto errout;
                }
        } else {
                struct ext4_xattr_ibody_header *header = value_start;

                /* Step back from the first entry to the ibody header. */
                header -= 1;
                /* Need room for the header plus the 4-byte terminator. */
                if (end - (void *)header < sizeof(*header) + sizeof(u32)) {
                        err_str = "in-inode xattr block too small";
                        goto errout;
                }
                if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
                        err_str = "bad magic number in in-inode xattr";
                        goto errout;
                }
        }

        /* Find the end of the names list */
        while (!IS_LAST_ENTRY(e)) {
                struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
                if ((void *)next >= end) {
                        err_str = "e_name out of bounds";
                        goto errout;
                }
                /* Reject names containing a NUL before e_name_len bytes. */
                if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) {
                        err_str = "bad e_name length";
                        goto errout;
                }
                e = next;
        }

        /* Check the values */
        /* From here on 'e' points at the terminator after the last entry. */
        while (!IS_LAST_ENTRY(entry)) {
                u32 size = le32_to_cpu(entry->e_value_size);
                unsigned long ea_ino = le32_to_cpu(entry->e_value_inum);

                if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) {
                        err_str = "ea_inode specified without ea_inode feature enabled";
                        goto errout;
                }
                if (ea_ino && ((ea_ino == EXT4_ROOT_INO) ||
                               !ext4_valid_inum(inode->i_sb, ea_ino))) {
                        err_str = "invalid ea_ino";
                        goto errout;
                }
                if (size > EXT4_XATTR_SIZE_MAX) {
                        err_str = "e_value size too large";
                        goto errout;
                }

                /* Only values stored in this region need bounds checks. */
                if (size != 0 && entry->e_value_inum == 0) {
                        u16 offs = le16_to_cpu(entry->e_value_offs);
                        void *value;

                        /*
                         * The value cannot overlap the names, and the value
                         * with padding cannot extend beyond 'end'.  Check both
                         * the padded and unpadded sizes, since the size may
                         * overflow to 0 when adding padding.
                         */
                        if (offs > end - value_start) {
                                err_str = "e_value out of bounds";
                                goto errout;
                        }
                        value = value_start + offs;
                        if (value < (void *)e + sizeof(u32) ||
                            size > end - value ||
                            EXT4_XATTR_SIZE(size) > end - value) {
                                err_str = "overlapping e_value ";
                                goto errout;
                        }
                }
                entry = EXT4_XATTR_NEXT(entry);
        }
        if (bh)
                set_buffer_verified(bh);
        return 0;

errout:
        /* 'err' is negative; the report takes the positive error number. */
        if (bh)
                __ext4_error_inode(inode, function, line, 0, -err,
                                   "corrupted xattr block %llu: %s",
                                   (unsigned long long) bh->b_blocknr,
                                   err_str);
        else
                __ext4_error_inode(inode, function, line, 0, -err,
                                   "corrupted in-inode xattr: %s", err_str);
        return err;
}

/*
 * Validate an xattr block: the entries start at BFIRST(bh), value offsets
 * are relative to the start of the buffer data, and the region ends at the
 * end of the buffer.  Use the ext4_xattr_check_block() wrapper so the call
 * site is recorded.
 */
static inline int
__ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh,
                         const char *function, unsigned int line)
{
        char *blk_start = bh->b_data;
        char *blk_end = blk_start + bh->b_size;

        return check_xattrs(inode, bh, BFIRST(bh), blk_end, blk_start,
                            function, line);
}

#define ext4_xattr_check_block(inode, bh) \
        __ext4_xattr_check_block((inode), (bh),  __func__, __LINE__)


/*
 * Validate the in-inode xattr area: the entry that follows @header is both
 * the first entry to check and the base for value offsets.  Use the
 * xattr_check_inode() wrapper so the call site is recorded.
 */
static inline int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
                         void *end, const char *function, unsigned int line)
{
        struct ext4_xattr_entry *first = IFIRST(header);

        return check_xattrs(inode, NULL, first, end, first, function, line);
}

#define xattr_check_inode(inode, header, end) \
        __xattr_check_inode((inode), (header), (end), __func__, __LINE__)

/*
 * Search an entry list for the attribute (@name_index, @name).
 *
 * @pentry: in: first entry to examine; out: the entry the search stopped at
 *          (the match, the first larger entry when @sorted, or the list
 *          terminator).  Left untouched when an error other than -ENODATA
 *          is returned.
 * @end:    first byte past the region holding the entries
 * @sorted: non-zero if entries are ordered by (index, name length, name),
 *          which lets the search stop at the first larger entry
 *
 * Returns 0 on a match, -ENODATA if there is none, -EINVAL for a NULL name,
 * and -EFSCORRUPTED if an entry's successor lies at or beyond @end.
 */
static int
xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
                 void *end, int name_index, const char *name, int sorted)
{
        struct ext4_xattr_entry *cur;
        size_t name_len;
        int cmp = 1;

        if (!name)
                return -EINVAL;

        name_len = strlen(name);
        cur = *pentry;
        while (!IS_LAST_ENTRY(cur)) {
                struct ext4_xattr_entry *following = EXT4_XATTR_NEXT(cur);

                if ((void *) following >= end) {
                        EXT4_ERROR_INODE(inode, "corrupted xattr entries");
                        return -EFSCORRUPTED;
                }

                /* Order by index, then name length, then name bytes. */
                cmp = name_index - cur->e_name_index;
                if (cmp == 0)
                        cmp = name_len - cur->e_name_len;
                if (cmp == 0)
                        cmp = memcmp(name, cur->e_name, name_len);

                /*
                 * Stop on an exact match, or - in a sorted list - as soon as
                 * the wanted name sorts before the current entry.
                 */
                if (cmp == 0 || (sorted && cmp < 0))
                        break;

                cur = following;
        }

        *pentry = cur;
        return cmp ? -ENODATA : 0;
}

/*
 * Hash @size bytes of an xattr value with the filesystem checksum function,
 * seeded with the filesystem's checksum seed.
 */
static u32
ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
{
        u32 value_hash = ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);

        return value_hash;
}

static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
{
        return ((u64) inode_get_ctime_sec(ea_inode) << 32) |
                (u32) inode_peek_iversion_raw(ea_inode);
}

/*
 * Store a 64-bit reference count in an EA inode: the upper 32 bits go into
 * the ctime seconds field (nanoseconds cleared), the lower 32 bits into the
 * raw i_version.
 */
static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
{
        u32 upper = (u32)(ref_count >> 32);
        u64 lower = ref_count & 0xffffffff;

        inode_set_ctime(ea_inode, upper, 0);
        inode_set_iversion_raw(ea_inode, lower);
}

static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode)
{
        return (u32) inode_get_atime_sec(ea_inode);
}

/*
 * Store the value hash of an EA inode in its atime seconds field; the
 * nanoseconds part is set to zero.
 */
static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash)
{
        const long nsec = 0;

        inode_set_atime(ea_inode, hash, nsec);
}

/*
 * Copy the first @size bytes of an EA inode's data into @buf.
 *
 * All blocks covering @size are read in one batch starting at logical block
 * 0.  A missing block (hole) is treated as corruption.
 *
 * Returns 0 on success, -ENOMEM if the buffer_head array cannot be allocated,
 * -EFSCORRUPTED on a hole, or the error from ext4_bread_batch().
 */
static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size)
{
        struct buffer_head *on_stack[8];
        struct buffer_head **bhs = on_stack;
        int blocksize = 1 << ea_inode->i_blkbits;
        int nr_blocks = (size + blocksize - 1) >> ea_inode->i_blkbits;
        int last_len = size % blocksize;
        int idx, ret;

        /* A value ending on a block boundary fills its last block. */
        if (!last_len)
                last_len = blocksize;

        /* Small values use the on-stack array; larger ones allocate. */
        if (nr_blocks > ARRAY_SIZE(on_stack)) {
                bhs = kmalloc_array(nr_blocks, sizeof(*bhs), GFP_NOFS);
                if (!bhs)
                        return -ENOMEM;
        }

        ret = ext4_bread_batch(ea_inode, 0 /* block */, nr_blocks,
                               true /* wait */, bhs);
        if (ret)
                goto free_bhs;

        for (idx = 0; idx < nr_blocks; idx++) {
                int chunk = (idx == nr_blocks - 1) ? last_len : blocksize;

                /* There shouldn't be any holes in ea_inode. */
                if (!bhs[idx]) {
                        ret = -EFSCORRUPTED;
                        goto put_bhs;
                }
                memcpy((char *)buf + blocksize * idx, bhs[idx]->b_data, chunk);
        }
        ret = 0;

put_bhs:
        for (idx = 0; idx < nr_blocks; idx++)
                brelse(bhs[idx]);
free_bhs:
        if (bhs != on_stack)
                kfree(bhs);
        return ret;
}

/*
 * The mtime seconds field of an EA inode, read as a 32-bit value; compared
 * against the parent's inode number to recognise Lustre-style EA inodes.
 */
#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode)))

/*
 * Look up the EA inode @ea_ino referenced by an xattr entry of @parent.
 *
 * On success the inode is returned through @ea_inode.  An inode whose stored
 * hash differs from @ea_inode_hash but whose backpointer and generation
 * match @parent is flagged as a Lustre-style EA inode with a reference count
 * of 1; any other inode gets S_NOQUOTA set.
 *
 * Returns 0 on success, -EFSCORRUPTED if @ea_ino is the parent's own inode
 * number, or the error from ext4_iget().
 */
static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
                                 u32 ea_inode_hash, struct inode **ea_inode)
{
        struct inode *inode;
        bool lustre_style;
        int err;

        /*
         * This corruption has to be caught before the lookup: otherwise
         * iget_locked() could wait indefinitely for the state of our
         * parent inode.
         */
        if (parent->i_ino == ea_ino) {
                ext4_error(parent->i_sb,
                           "Parent and EA inode have the same ino %lu", ea_ino);
                return -EFSCORRUPTED;
        }

        inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                ext4_error(parent->i_sb,
                           "error while reading EA inode %lu err=%d", ea_ino,
                           err);
                return err;
        }
        ext4_xattr_inode_set_class(inode);

        /*
         * Old Lustre-style xattr inodes carry no hash to validate; instead
         * they have a backpointer from the EA inode to the parent inode.
         */
        lustre_style = ea_inode_hash != ext4_xattr_inode_get_hash(inode) &&
                       EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino &&
                       inode->i_generation == parent->i_generation;

        if (!lustre_style) {
                inode_lock(inode);
                inode->i_flags |= S_NOQUOTA;
                inode_unlock(inode);
        } else {
                ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE);
                ext4_xattr_inode_set_ref(inode, 1);
        }

        *ea_inode = inode;
        return 0;
}

/*
 * Drop an EA inode's entry from the EA inode mbcache while the inode is
 * being evicted.  Does nothing if the filesystem has no EA inode cache.
 */
void ext4_evict_ea_inode(struct inode *inode)
{
        struct mb_cache_entry *busy;

        if (!EA_INODE_CACHE(inode))
                return;

        /*
         * The delete call hands back the entry while it is still in use;
         * wait for it to become unused, drop our reference and retry until
         * the delete goes through.
         */
        for (;;) {
                busy = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode),
                                ext4_xattr_inode_get_hash(inode), inode->i_ino);
                if (!busy)
                        break;
                mb_cache_entry_wait_unused(busy);
                mb_cache_entry_put(EA_INODE_CACHE(inode), busy);
        }
}

/*
 * Verify the hashes protecting an xattr value stored in an EA inode.
 *
 * @ea_inode: EA inode the value was read from
 * @entry:    xattr entry referencing the EA inode, or NULL to skip the
 *            entry-hash check
 * @buffer:   the value that was read
 * @size:     length of the value in bytes
 *
 * First the hash of @buffer must equal the hash stored in the EA inode.
 * Then, if @entry is given, the entry hash computed from the entry name and
 * the value hash must equal entry->e_hash.  An entry hash produced by the
 * old signed-char name hashing is also accepted, with a one-time warning.
 *
 * Returns 0 if the hashes check out, -EFSCORRUPTED otherwise.
 */
static int
ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
                               struct ext4_xattr_entry *entry, void *buffer,
                               size_t size)
{
        __le32 e_hash, tmp_data;
        u32 hash;

        /* Verify stored hash matches calculated hash. */
        hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size);
        if (hash != ext4_xattr_inode_get_hash(ea_inode))
                return -EFSCORRUPTED;

        if (!entry)
                return 0;

        /* Verify entry hash. */
        tmp_data = cpu_to_le32(hash);
        e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len,
                                       &tmp_data, 1);
        /* All good? */
        if (e_hash == entry->e_hash)
                return 0;

        /*
         * Not good. Maybe the entry hash was calculated
         * using the buggy signed char version?
         */
        e_hash = ext4_xattr_hash_entry_signed(entry->e_name, entry->e_name_len,
                                              &tmp_data, 1);
        /* Still no match - bad */
        if (e_hash != entry->e_hash)
                return -EFSCORRUPTED;

        /*
         * Let people know about old hash.  The message is newline-terminated
         * so the log record is complete and cannot be continued by a later
         * KERN_CONT print.
         */
        pr_warn_once("ext4: filesystem with signed xattr name hash\n");
        return 0;
}

/*
 * Read the value of xattr @entry from its EA inode into @buffer.
 *
 * @size must equal the EA inode's file size.  Unless the EA inode is a
 * Lustre-style one, the value and entry hashes are verified and, when an
 * EA inode cache exists, the inode is recorded in it as reusable.
 *
 * Returns 0 on success, -EFSCORRUPTED on a size mismatch, or the error from
 * the lookup, read or hash verification.
 */
static int
ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry,
                     void *buffer, size_t size)
{
        struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode);
        struct inode *ea_inode;
        int err;

        err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum),
                                    le32_to_cpu(entry->e_hash), &ea_inode);
        if (err) {
                /* Nothing was acquired; make the iput() below a no-op. */
                ea_inode = NULL;
                goto out;
        }

        if (i_size_read(ea_inode) != size) {
                ext4_warning_inode(ea_inode,
                                   "ea_inode file size=%llu entry size=%zu",
                                   i_size_read(ea_inode), size);
                err = -EFSCORRUPTED;
                goto out;
        }

        err = ext4_xattr_inode_read(ea_inode, buffer, size);
        if (err)
                goto out;

        /* Lustre-style EA inodes have no hashes to check; err is 0 here. */
        if (ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE))
                goto out;

        err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size);
        if (err) {
                ext4_warning_inode(ea_inode,
                                   "EA inode hash validation failed");
                goto out;
        }

        if (ea_inode_cache)
                mb_cache_entry_create(ea_inode_cache, GFP_NOFS,
                                ext4_xattr_inode_get_hash(ea_inode),
                                ea_inode->i_ino, true /* reusable */);
out:
        iput(ea_inode);
        return err;
}

/*
 * Look up an extended attribute in the inode's external xattr block.
 *
 * @buffer may be NULL, in which case only the value size is reported.
 *
 * Returns the value size on success, -ENODATA when the inode has no xattr
 * block, -ERANGE when the value exceeds EXT4_XATTR_SIZE_MAX, does not fit in
 * @buffer_size or would extend past the end of the block, or another
 * negative error from the helpers called here.
 */
static int
ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
                     void *buffer, size_t buffer_size)
{
        struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
        struct ext4_xattr_entry *entry;
        struct buffer_head *bh;
        size_t value_len;
        void *end;
        int error;

        ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
                  name_index, name, buffer, (long)buffer_size);

        if (!EXT4_I(inode)->i_file_acl)
                return -ENODATA;

        ea_idebug(inode, "reading block %llu",
                  (unsigned long long)EXT4_I(inode)->i_file_acl);
        bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
        if (IS_ERR(bh))
                return PTR_ERR(bh);

        ea_bdebug(bh, "b_count=%d, refcount=%d",
                atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
        error = ext4_xattr_check_block(inode, bh);
        if (error)
                goto cleanup;
        ext4_xattr_block_cache_insert(ea_block_cache, bh);

        entry = BFIRST(bh);
        end = bh->b_data + bh->b_size;
        error = xattr_find_entry(inode, &entry, end, name_index, name, 1);
        if (error)
                goto cleanup;

        value_len = le32_to_cpu(entry->e_value_size);
        if (unlikely(value_len > EXT4_XATTR_SIZE_MAX)) {
                error = -ERANGE;
                goto cleanup;
        }

        if (buffer) {
                if (value_len > buffer_size) {
                        error = -ERANGE;
                        goto cleanup;
                }
                if (entry->e_value_inum) {
                        /* Value is stored in a separate EA inode. */
                        error = ext4_xattr_inode_get(inode, entry, buffer,
                                                     value_len);
                        if (error)
                                goto cleanup;
                } else {
                        /* Value is stored inside the xattr block itself. */
                        void *value = bh->b_data +
                                      le16_to_cpu(entry->e_value_offs);

                        if (unlikely(value + value_len > end)) {
                                error = -ERANGE;
                                goto cleanup;
                        }
                        memcpy(buffer, value, value_len);
                }
        }
        error = value_len;

cleanup:
        brelse(bh);
        return error;
}

/*
 * Look up an extended attribute stored in the inode body.
 *
 * @buffer may be NULL, in which case only the value size is reported.
 *
 * Returns the value size on success, -ENODATA when the inode does not have
 * EXT4_STATE_XATTR set, -ERANGE when the value exceeds EXT4_XATTR_SIZE_MAX,
 * does not fit in @buffer_size or would extend past the end of the on-disk
 * inode, or another negative error from the helpers called here.
 */
int
ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
                     void *buffer, size_t buffer_size)
{
        struct ext4_xattr_ibody_header *header;
        struct ext4_xattr_entry *entry;
        struct ext4_inode *raw_inode;
        struct ext4_iloc iloc;
        size_t value_len;
        void *end;
        int error;

        if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
                return -ENODATA;

        error = ext4_get_inode_loc(inode, &iloc);
        if (error)
                return error;

        raw_inode = ext4_raw_inode(&iloc);
        header = IHDR(inode, raw_inode);
        end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
        error = xattr_check_inode(inode, header, end);
        if (error)
                goto cleanup;

        entry = IFIRST(header);
        error = xattr_find_entry(inode, &entry, end, name_index, name, 0);
        if (error)
                goto cleanup;

        value_len = le32_to_cpu(entry->e_value_size);
        if (unlikely(value_len > EXT4_XATTR_SIZE_MAX)) {
                error = -ERANGE;
                goto cleanup;
        }

        if (buffer) {
                if (value_len > buffer_size) {
                        error = -ERANGE;
                        goto cleanup;
                }
                if (entry->e_value_inum) {
                        /* Value is stored in a separate EA inode. */
                        error = ext4_xattr_inode_get(inode, entry, buffer,
                                                     value_len);
                        if (error)
                                goto cleanup;
                } else {
                        /* Value offset is relative to the first entry. */
                        void *value = (void *)IFIRST(header) +
                                      le16_to_cpu(entry->e_value_offs);

                        if (unlikely(value + value_len > end)) {
                                error = -ERANGE;
                                goto cleanup;
                        }
                        memcpy(buffer, value, value_len);
                }
        }
        error = value_len;

cleanup:
        brelse(iloc.bh);
        return error;
}

/*
 * ext4_xattr_get()
 *
 * Fetch an extended attribute value into @buffer, or report how large a
 * buffer is needed when @buffer is NULL.
 *
 * The in-inode attributes are searched first; the external xattr block is
 * consulted only if that search reports -ENODATA. Both lookups run with the
 * inode's xattr_sem held for reading.
 *
 * Returns a negative error number on failure (-EIO after a forced shutdown,
 * -ERANGE for names longer than 255 bytes), or the number of bytes
 * used / required on success.
 */
int
ext4_xattr_get(struct inode *inode, int name_index, const char *name,
               void *buffer, size_t buffer_size)
{
        int ret;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        if (strlen(name) > 255)
                return -ERANGE;

        down_read(&EXT4_I(inode)->xattr_sem);

        ret = ext4_xattr_ibody_get(inode, name_index, name, buffer,
                                   buffer_size);
        if (ret == -ENODATA) {
                /* Not in the inode body: fall back to the xattr block. */
                ret = ext4_xattr_block_get(inode, name_index, name, buffer,
                                           buffer_size);
        }

        up_read(&EXT4_I(inode)->xattr_sem);
        return ret;
}

/*
 * Emit "<prefix><name>\0" for every listable entry starting at @entry.
 *
 * Entries for which ext4_xattr_prefix() returns NULL are skipped. With a
 * NULL @buffer nothing is copied and only the required size is computed.
 *
 * Returns the total number of bytes used / required, or -ERANGE when
 * @buffer is too small for the next name.
 */
static int
ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
                        char *buffer, size_t buffer_size)
{
        size_t remaining = buffer_size;

        while (!IS_LAST_ENTRY(entry)) {
                const char *prefix;
                size_t prefix_len, name_len, needed;

                prefix = ext4_xattr_prefix(entry->e_name_index, dentry);
                if (!prefix)
                        goto next;

                prefix_len = strlen(prefix);
                name_len = entry->e_name_len;
                needed = prefix_len + name_len + 1;

                if (buffer) {
                        if (needed > remaining)
                                return -ERANGE;
                        memcpy(buffer, prefix, prefix_len);
                        memcpy(buffer + prefix_len, entry->e_name, name_len);
                        buffer[prefix_len + name_len] = 0;
                        buffer += needed;
                }
                remaining -= needed;
next:
                entry = EXT4_XATTR_NEXT(entry);
        }

        return buffer_size - remaining;  /* total size */
}

/*
 * List the attribute names stored in the inode's external xattr block.
 *
 * Returns 0 when the inode has no xattr block, otherwise the result of
 * ext4_xattr_list_entries() or a negative error from reading/checking the
 * block.
 */
static int
ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
        struct inode *inode = d_inode(dentry);
        struct buffer_head *bh;
        int ret;

        ea_idebug(inode, "buffer=%p, buffer_size=%ld",
                  buffer, (long)buffer_size);

        if (!EXT4_I(inode)->i_file_acl)
                return 0;

        ea_idebug(inode, "reading block %llu",
                  (unsigned long long)EXT4_I(inode)->i_file_acl);
        bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
        if (IS_ERR(bh))
                return PTR_ERR(bh);

        ea_bdebug(bh, "b_count=%d, refcount=%d",
                atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
        ret = ext4_xattr_check_block(inode, bh);
        if (!ret) {
                ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh);
                ret = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer,
                                              buffer_size);
        }

        brelse(bh);
        return ret;
}

/*
 * List the attribute names stored in the inode body.
 *
 * Returns 0 when the inode does not have EXT4_STATE_XATTR set, otherwise the
 * result of ext4_xattr_list_entries() or a negative error from locating or
 * checking the in-inode xattr area.
 */
static int
ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
        struct inode *inode = d_inode(dentry);
        struct ext4_xattr_ibody_header *header;
        struct ext4_inode *raw_inode;
        struct ext4_iloc iloc;
        void *end;
        int ret;

        if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
                return 0;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        raw_inode = ext4_raw_inode(&iloc);
        header = IHDR(inode, raw_inode);
        end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;

        ret = xattr_check_inode(inode, header, end);
        if (!ret)
                ret = ext4_xattr_list_entries(dentry, IFIRST(header),
                                              buffer, buffer_size);

        brelse(iloc.bh);
        return ret;
}

/*
 * Inode operation listxattr()
 *
 * d_inode(dentry)->i_rwsem: don't care
 *
 * Copy the list of attribute names into @buffer, or compute the buffer size
 * required when @buffer is NULL. Names from the inode body come first,
 * followed by names from the external xattr block; both are gathered with
 * the inode's xattr_sem held for reading.
 *
 * Returns a negative error number on failure, or the number of bytes
 * used / required on success.
 */
ssize_t
ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
        int ibody_len, block_len;
        int ret;

        down_read(&EXT4_I(d_inode(dentry))->xattr_sem);

        ibody_len = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
        if (ibody_len < 0) {
                ret = ibody_len;
                goto unlock;
        }

        /* Continue right behind the names already written. */
        if (buffer) {
                buffer += ibody_len;
                buffer_size -= ibody_len;
        }

        block_len = ext4_xattr_block_list(dentry, buffer, buffer_size);
        if (block_len < 0)
                ret = block_len;
        else
                ret = block_len + ibody_len;

unlock:
        up_read(&EXT4_I(d_inode(dentry))->xattr_sem);
        return ret;
}

/*
 * Make sure the xattr feature flag (EXT4_FEATURE_COMPAT_EXT_ATTR) is set in
 * the superblock.
 *
 * If the flag is already set, or journal write access to the superblock
 * buffer cannot be obtained, nothing is changed. Otherwise the flag is set
 * and the superblock checksum refreshed under the buffer lock, and the
 * buffer is marked dirty in @handle.
 */
static void ext4_xattr_update_super_block(handle_t *handle,
                                          struct super_block *sb)
{
        struct buffer_head *sbh;

        if (ext4_has_feature_xattr(sb))
                return;

        sbh = EXT4_SB(sb)->s_sbh;

        BUFFER_TRACE(sbh, "get_write_access");
        if (ext4_journal_get_write_access(handle, sb, sbh, EXT4_JTR_NONE))
                return;

        lock_buffer(sbh);
        ext4_set_feature_xattr(sb);
        ext4_superblock_csum_set(sb);
        unlock_buffer(sbh);

        ext4_handle_dirty_metadata(handle, NULL, sbh);
}

/*
 * Compute the inode usage count of @inode.
 *
 * Counts every xattr entry that references an EA inode (non-zero
 * e_value_inum), both in the inode body and in the external xattr block,
 * and stores that count plus one in @usage.
 *
 * The caller must hold the inode's xattr_sem for reading.
 *
 * Returns 0 on success or a negative error; @usage is only written on
 * success.
 */
int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
{
        struct ext4_iloc iloc = { .bh = NULL };
        struct buffer_head *block_bh = NULL;
        struct ext4_xattr_entry *entry;
        qsize_t refs = 0;
        int ret;

        lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);

        /* Entries stored in the inode body. */
        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
                struct ext4_xattr_ibody_header *header;
                struct ext4_inode *raw_inode;
                void *end;

                ret = ext4_get_inode_loc(inode, &iloc);
                if (ret)
                        goto out;
                raw_inode = ext4_raw_inode(&iloc);
                header = IHDR(inode, raw_inode);
                end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
                ret = xattr_check_inode(inode, header, end);
                if (ret)
                        goto out;

                entry = IFIRST(header);
                while (!IS_LAST_ENTRY(entry)) {
                        if (entry->e_value_inum)
                                refs++;
                        entry = EXT4_XATTR_NEXT(entry);
                }
        }

        /* Entries stored in the external xattr block. */
        if (EXT4_I(inode)->i_file_acl) {
                block_bh = ext4_sb_bread(inode->i_sb,
                                         EXT4_I(inode)->i_file_acl, REQ_PRIO);
                if (IS_ERR(block_bh)) {
                        ret = PTR_ERR(block_bh);
                        block_bh = NULL;
                        goto out;
                }

                ret = ext4_xattr_check_block(inode, block_bh);
                if (ret)
                        goto out;

                entry = BFIRST(block_bh);
                while (!IS_LAST_ENTRY(entry)) {
                        if (entry->e_value_inum)
                                refs++;
                        entry = EXT4_XATTR_NEXT(entry);
                }
        }

        *usage = refs + 1;
        ret = 0;
out:
        brelse(iloc.bh);
        brelse(block_bh);
        return ret;
}

/*
 * Round @length up to a multiple of the cluster size, where the cluster
 * size is 1 << (s_cluster_bits + i_blkbits) bytes.
 */
static inline size_t round_up_cluster(struct inode *inode, size_t length)
{
        unsigned int cluster_shift = EXT4_SB(inode->i_sb)->s_cluster_bits +
                                     inode->i_blkbits;
        size_t cluster_size = 1 << cluster_shift;

        return (length + cluster_size - 1) & ~(cluster_size - 1);
}

/*
 * Charge @inode's quota for one inode plus @len bytes of space, with @len
 * rounded up to the cluster size.
 *
 * If the space charge fails, the inode charge is rolled back.
 *
 * Returns 0 on success or the error from the failing quota call.
 */
static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len)
{
        int err = dquot_alloc_inode(inode);

        if (err)
                return err;

        err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len));
        if (!err)
                return 0;

        dquot_free_inode(inode);
        return err;
}

/*
 * Release from @parent's quota one inode plus @len bytes of space, with
 * @len rounded up to the cluster size.
 *
 * Nothing is released when @ea_inode is non-NULL and flagged
 * EXT4_STATE_LUSTRE_EA_INODE.
 */
static void ext4_xattr_inode_free_quota(struct inode *parent,
                                        struct inode *ea_inode,
                                        size_t len)
{
        if (!ea_inode ||
            !ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) {
                dquot_free_space_nodirty(parent,
                                         round_up_cluster(parent, len));
                dquot_free_inode(parent);
        }
}

/*
 * Estimate the journal credits needed to set an xattr.
 *
 * @sb:        superblock of the filesystem
 * @inode:     inode being modified; may be NULL
 * @block_bh:  existing xattr block of the inode, or NULL
 * @value_len: length of the new value
 * @is_create: true when no old value has to be released
 *
 * Returns the number of credits.
 */
int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
                             struct buffer_head *block_bh, size_t value_len,
                             bool is_create)
{
        struct ext4_xattr_entry *entry;
        int credits;
        int blocks;

        /*
         * Base cost:
         * 1) Owner inode update
         * 2) Ref count update on old xattr block
         * 3) new xattr block
         * 4) block bitmap update for new xattr block
         * 5) group descriptor for new xattr block
         * 6) block bitmap update for old xattr block
         * 7) group descriptor for old block
         *
         * 6 & 7 can happen if we have two racing threads T_a and T_b
         * which are each trying to set an xattr on inodes I_a and I_b
         * which were both initially sharing an xattr block.
         *
         * Quota updates are added on top.
         */
        credits = 7 + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb);

        /*
         * In case of inline data, we may push out the data to a block,
         * so we need to reserve credits for this eventuality
         */
        if (inode && ext4_has_inline_data(inode))
                credits += ext4_writepage_trans_blocks(inode) + 1;

        /* We are done if ea_inode feature is not enabled. */
        if (!ext4_has_feature_ea_inode(sb))
                return credits;

        /* New ea_inode, inode map, block bitmap, group descriptor. */
        credits += 4;

        /*
         * Data blocks for the new value, plus one indirection block or one
         * level of extent tree.
         */
        blocks = ((value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits) + 1;

        /*
         * Block bitmap and group descriptor updates for each block, and the
         * blocks themselves.
         */
        credits += blocks * 2 + blocks;

        if (!is_create) {
                /*
                 * Dereference ea_inode holding old xattr value.
                 * Old ea_inode, inode map, block bitmap, group descriptor.
                 */
                credits += 4;

                /*
                 * Data blocks for old ea_inode, plus one indirection block
                 * or one level of extent tree.
                 */
                blocks = (XATTR_SIZE_MAX >> sb->s_blocksize_bits) + 1;

                /* Block bitmap and group descriptor updates for each block. */
                credits += blocks * 2;
        }

        /*
         * We may need to clone the existing xattr block in which case we need
         * to increment ref counts for existing ea_inodes referenced by it.
         */
        if (!block_bh)
                return credits;

        entry = BFIRST(block_bh);
        while (!IS_LAST_ENTRY(entry)) {
                /* Ref count update on ea_inode. */
                if (entry->e_value_inum)
                        credits += 1;
                entry = EXT4_XATTR_NEXT(entry);
        }
        return credits;
}

/*
 * Adjust the reference count stored for an EA inode by @ref_change.
 *
 * The whole update runs under inode_lock(@ea_inode). When an increment
 * brings the count to 1, the link count is set to 1 and the inode is taken
 * off the orphan list; when a decrement brings the count to 0, the link
 * count is cleared and the inode is put on the orphan list. Unexpected
 * counts or link counts only trigger a one-time warning.
 *
 * Returns 0 on success, or the error from ext4_reserve_inode_write() or
 * ext4_mark_iloc_dirty().
 */
static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
                                       int ref_change)
{
        struct ext4_iloc iloc;
        s64 ref_count;
        int ret;

        inode_lock(ea_inode);

        ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
        if (ret)
                goto out;

        /* Read-modify-write of the stored reference count. */
        ref_count = ext4_xattr_inode_get_ref(ea_inode);
        ref_count += ref_change;
        ext4_xattr_inode_set_ref(ea_inode, ref_count);

        if (ref_change > 0) {
                WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld",
                          ea_inode->i_ino, ref_count);

                /* First reference: the inode is in use again. */
                if (ref_count == 1) {
                        WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u",
                                  ea_inode->i_ino, ea_inode->i_nlink);

                        set_nlink(ea_inode, 1);
                        ext4_orphan_del(handle, ea_inode);
                }
        } else {
                WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
                          ea_inode->i_ino, ref_count);

                /* Last reference dropped: unlink and add to orphan list. */
                if (ref_count == 0) {
                        WARN_ONCE(ea_inode->i_nlink != 1,
                                  "EA inode %lu i_nlink=%u",
                                  ea_inode->i_ino, ea_inode->i_nlink);

                        clear_nlink(ea_inode);
                        ext4_orphan_add(handle, ea_inode);
                }
        }

        ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
        if (ret)
                ext4_warning_inode(ea_inode,
                                   "ext4_mark_iloc_dirty() failed ret=%d", ret);
out:
        inode_unlock(ea_inode);
        return ret;
}

/* Take one additional reference on @ea_inode. */
static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
{
        const int ref_change = 1;

        return ext4_xattr_inode_update_ref(handle, ea_inode, ref_change);
}

/* Drop one reference on @ea_inode. */
static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
{
        const int ref_change = -1;

        return ext4_xattr_inode_update_ref(handle, ea_inode, ref_change);
}

/*
 * Take a reference on every EA inode referenced by the entry list starting
 * at @first.
 *
 * If looking up or referencing any EA inode fails, the references taken on
 * the entries before the failing one are dropped again (errors during that
 * rollback are only logged) and the original error is returned.
 *
 * Returns 0 on success or a negative error.
 */
static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
                                        struct ext4_xattr_entry *first)
{
        struct ext4_xattr_entry *cur, *stop;
        struct inode *ea_inode;
        unsigned int ea_ino;
        int err, undo_err;

        for (cur = first; !IS_LAST_ENTRY(cur); cur = EXT4_XATTR_NEXT(cur)) {
                if (!cur->e_value_inum)
                        continue;

                ea_ino = le32_to_cpu(cur->e_value_inum);
                err = ext4_xattr_inode_iget(parent, ea_ino,
                                            le32_to_cpu(cur->e_hash),
                                            &ea_inode);
                if (err)
                        goto rollback;

                err = ext4_xattr_inode_inc_ref(handle, ea_inode);
                if (err)
                        ext4_warning_inode(ea_inode, "inc ref error %d", err);
                iput(ea_inode);
                if (err)
                        goto rollback;
        }
        return 0;

rollback:
        /* Undo the increments done for entries preceding the failed one. */
        stop = cur;
        for (cur = first; cur != stop; cur = EXT4_XATTR_NEXT(cur)) {
                if (!cur->e_value_inum)
                        continue;

                ea_ino = le32_to_cpu(cur->e_value_inum);
                undo_err = ext4_xattr_inode_iget(parent, ea_ino,
                                                 le32_to_cpu(cur->e_hash),
                                                 &ea_inode);
                if (undo_err) {
                        ext4_warning(parent->i_sb,
                                     "cleanup ea_ino %u iget error %d", ea_ino,
                                     undo_err);
                        continue;
                }

                undo_err = ext4_xattr_inode_dec_ref(handle, ea_inode);
                if (undo_err)
                        ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
                                           undo_err);
                iput(ea_inode);
        }
        return err;
}

/*
 * Write out pending changes to @bh through @handle.
 *
 * Does nothing unless @bh is non-NULL and @dirty is set. Otherwise the xattr
 * block checksum is refreshed first when @block_csum is set, and the buffer
 * is marked dirty in the journal.
 *
 * Returns 0 on success or the error from ext4_handle_dirty_metadata().
 */
static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode,
                        struct buffer_head *bh, bool block_csum, bool dirty)
{
        int error;

        if (!bh || !dirty)
                return 0;

        if (block_csum)
                ext4_xattr_block_csum_set(inode, bh);

        error = ext4_handle_dirty_metadata(handle, NULL, bh);
        if (error)
                ext4_warning(inode->i_sb, "Handle metadata (error %d)",
                             error);
        return error;
}

/*
 * Drop the reference held on every EA inode referenced by the entry list
 * starting at @first, which lives in buffer @bh.
 *
 * @handle:         journal handle used for all updates
 * @parent:         inode owning the xattr entries
 * @bh:             buffer containing the entries
 * @first:          first entry to examine
 * @block_csum:     passed to ext4_xattr_restart_fn(); when set, the xattr
 *                  block checksum is refreshed before the buffer is dirtied
 * @ea_inode_array: array each looked-up EA inode is appended to
 * @extra_credits:  credits required on top of the two needed per EA inode
 * @skip_quota:     when set, no quota is released for the dropped values
 *
 * Failures are logged and the affected entry is skipped; nothing is
 * reported to the caller. Each successfully processed entry has its
 * e_value_inum and e_value_size cleared.
 */
static void
ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
                             struct buffer_head *bh,
                             struct ext4_xattr_entry *first, bool block_csum,
                             struct ext4_xattr_inode_array **ea_inode_array,
                             int extra_credits, bool skip_quota)
{
        struct inode *ea_inode;
        struct ext4_xattr_entry *entry;
        bool dirty = false;
        unsigned int ea_ino;
        int err;
        int credits;

        /* One credit for dec ref on ea_inode, one for orphan list addition. */
        credits = 2 + extra_credits;

        for (entry = first; !IS_LAST_ENTRY(entry);
             entry = EXT4_XATTR_NEXT(entry)) {
                /* Only entries whose value lives in an EA inode matter. */
                if (!entry->e_value_inum)
                        continue;
                ea_ino = le32_to_cpu(entry->e_value_inum);
                err = ext4_xattr_inode_iget(parent, ea_ino,
                                            le32_to_cpu(entry->e_hash),
                                            &ea_inode);
                if (err)
                        continue;

                /*
                 * NOTE(review): the error paths below do not iput()
                 * ea_inode once this call has succeeded; presumably the
                 * array now owns the inode reference - confirm against
                 * ext4_expand_inode_array() and its consumers.
                 */
                err = ext4_expand_inode_array(ea_inode_array, ea_inode);
                if (err) {
                        ext4_warning_inode(ea_inode,
                                           "Expand inode array err=%d", err);
                        iput(ea_inode);
                        continue;
                }

                /*
                 * Make sure the handle has enough credits;
                 * ext4_xattr_restart_fn() is supplied so pending changes to
                 * bh can be written out as part of that.
                 */
                err = ext4_journal_ensure_credits_fn(handle, credits, credits,
                        ext4_free_metadata_revoke_credits(parent->i_sb, 1),
                        ext4_xattr_restart_fn(handle, parent, bh, block_csum,
                                              dirty));
                if (err < 0) {
                        ext4_warning_inode(ea_inode, "Ensure credits err=%d",
                                           err);
                        continue;
                }
                /*
                 * A positive return requires write access to bh to be
                 * obtained again (presumably because the handle was
                 * restarted - TODO confirm).
                 */
                if (err > 0) {
                        err = ext4_journal_get_write_access(handle,
                                        parent->i_sb, bh, EXT4_JTR_NONE);
                        if (err) {
                                ext4_warning_inode(ea_inode,
                                                "Re-get write access err=%d",
                                                err);
                                continue;
                        }
                }

                err = ext4_xattr_inode_dec_ref(handle, ea_inode);
                if (err) {
                        ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
                                           err);
                        continue;
                }

                if (!skip_quota)
                        ext4_xattr_inode_free_quota(parent, ea_inode,
                                              le32_to_cpu(entry->e_value_size));

                /*
                 * Forget about ea_inode within the same transaction that
                 * decrements the ref count. This avoids duplicate decrements in
                 * case the rest of the work spills over to subsequent
                 * transactions.
                 */
                entry->e_value_inum = 0;
                entry->e_value_size = 0;

                dirty = true;
        }

        if (dirty) {
                /*
                 * Note that we are deliberately skipping csum calculation for
                 * the final update because we do not expect any journal
                 * restarts until xattr block is freed.
                 */

                err = ext4_handle_dirty_metadata(handle, NULL, bh);
                if (err)
                        ext4_warning_inode(parent,
                                           "handle dirty metadata err=%d", err);
        }
}

/*
 * Release the xattr block BH: If the reference count is > 1, decrement it;
 * otherwise free the block.
 *
 * @handle:         journal handle used for all updates
 * @inode:          inode giving up its reference to the block
 * @bh:             buffer of the xattr block
 * @ea_inode_array: passed on to ext4_xattr_inode_dec_ref_all() when the
 *                  block is freed
 * @extra_credits:  passed on to ext4_xattr_inode_dec_ref_all()
 *
 * Nothing is returned; any error is reported through ext4_std_error().
 */
static void
ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                         struct buffer_head *bh,
                         struct ext4_xattr_inode_array **ea_inode_array,
                         int extra_credits)
{
        struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
        u32 hash, ref;
        int error = 0;

        BUFFER_TRACE(bh, "get_write_access");
        error = ext4_journal_get_write_access(handle, inode->i_sb, bh,
                                              EXT4_JTR_NONE);
        if (error)
                goto out;

retry_ref:
        /* Header fields are sampled with the buffer locked. */
        lock_buffer(bh);
        hash = le32_to_cpu(BHDR(bh)->h_hash);
        ref = le32_to_cpu(BHDR(bh)->h_refcount);
        if (ref == 1) {
                ea_bdebug(bh, "refcount now=0; freeing");
                /*
                 * This must happen under buffer lock for
                 * ext4_xattr_block_set() to reliably detect freed block
                 */
                if (ea_block_cache) {
                        struct mb_cache_entry *oe;

                        oe = mb_cache_entry_delete_or_get(ea_block_cache, hash,
                                                          bh->b_blocknr);
                        if (oe) {
                                /*
                                 * An entry was handed back instead of being
                                 * deleted: drop the buffer lock, wait until
                                 * the entry is unused, then start over and
                                 * re-read the refcount.
                                 */
                                unlock_buffer(bh);
                                mb_cache_entry_wait_unused(oe);
                                mb_cache_entry_put(ea_block_cache, oe);
                                goto retry_ref;
                        }
                }
                /*
                 * Extra buffer reference taken before freeing the block.
                 * NOTE(review): presumably consumed by the
                 * EXT4_FREE_BLOCKS_FORGET path below - confirm.
                 */
                get_bh(bh);
                unlock_buffer(bh);

                /* Drop references to EA inodes used by entries in the block. */
                if (ext4_has_feature_ea_inode(inode->i_sb))
                        ext4_xattr_inode_dec_ref_all(handle, inode, bh,
                                                     BFIRST(bh),
                                                     true /* block_csum */,
                                                     ea_inode_array,
                                                     extra_credits,
                                                     true /* skip_quota */);
                ext4_free_blocks(handle, inode, bh, 0, 1,
                                 EXT4_FREE_BLOCKS_METADATA |
                                 EXT4_FREE_BLOCKS_FORGET);
        } else {
                /* Block is still shared: just drop our reference. */
                ref--;
                BHDR(bh)->h_refcount = cpu_to_le32(ref);
                /*
                 * The count has just dropped below EXT4_XATTR_REFCOUNT_MAX,
                 * so flag the block's cache entry as reusable again.
                 */
                if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
                        struct mb_cache_entry *ce;

                        if (ea_block_cache) {
                                ce = mb_cache_entry_get(ea_block_cache, hash,
                                                        bh->b_blocknr);
                                if (ce) {
                                        set_bit(MBE_REUSABLE_B, &ce->e_flags);
                                        mb_cache_entry_put(ea_block_cache, ce);
                                }
                        }
                }

                ext4_xattr_block_csum_set(inode, bh);
                /*
                 * Beware of this ugliness: Releasing of xattr block references
                 * from different inodes can race and so we have to protect
                 * from a race where someone else frees the block (and releases
                 * its journal_head) before we are done dirtying the buffer. In
                 * nojournal mode this race is harmless and we actually cannot
                 * call ext4_handle_dirty_metadata() with locked buffer as
                 * that function can call sync_dirty_buffer() so for that case
                 * we handle the dirtying after unlocking the buffer.
                 */
                if (ext4_handle_valid(handle))
                        error = ext4_handle_dirty_metadata(handle, inode, bh);
                unlock_buffer(bh);
                if (!ext4_handle_valid(handle))
                        error = ext4_handle_dirty_metadata(handle, inode, bh);
                if (IS_SYNC(inode))
                        ext4_handle_sync(handle);
                /* This inode no longer uses the block: release its quota. */
                dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
                ea_bdebug(bh, "refcount now=%d; releasing",
                          le32_to_cpu(BHDR(bh)->h_refcount));
        }
out:
        ext4_std_error(inode->i_sb, error);
        return;
}

/*
 * Find the available free space for EAs.
 *
 * @last:     first entry to walk; the walk stops at the terminating entry
 * @min_offs: in/out, lowered to the smallest value offset of any entry that
 *            stores a non-empty value in place (not in an EA inode)
 * @base:     address the entry offsets are measured from
 * @total:    if non-NULL, incremented by EXT4_XATTR_LEN() of every entry
 *
 * Returns the number of bytes between the end of the entry table (including
 * its 4-byte terminator) and *@min_offs.
 */
static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
                                    size_t *min_offs, void *base, int *total)
{
        while (!IS_LAST_ENTRY(last)) {
                if (last->e_value_size && !last->e_value_inum) {
                        size_t value_offs = le16_to_cpu(last->e_value_offs);

                        if (value_offs < *min_offs)
                                *min_offs = value_offs;
                }
                if (total)
                        *total += EXT4_XATTR_LEN(last->e_name_len);

                last = EXT4_XATTR_NEXT(last);
        }

        return (*min_offs - ((void *)last - base) - sizeof(__u32));
}

/*
 * Write the value of the EA in an inode.
 *
 * @handle:   journal handle the block mapping and buffer updates run under
 * @ea_inode: inode that will hold the value
 * @buf:      value to store
 * @bufsize:  length of @buf in bytes
 *
 * First maps (creating if needed) enough blocks to cover @bufsize bytes,
 * retrying on -ENOSPC for as long as ext4_should_retry_alloc() allows. Then
 * copies @buf into the blocks one at a time and finally sets the in-core and
 * on-disk inode size to the number of bytes written.
 *
 * Returns 0 on success or a negative errno.
 */
static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
                                  const void *buf, int bufsize)
{
        struct buffer_head *bh = NULL;
        unsigned long block = 0;
        int blocksize = ea_inode->i_sb->s_blocksize;
        int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
        int csize, wsize = 0;
        int ret = 0, ret2 = 0;
        int retries = 0;

retry:
        /*
         * Each successful ext4_map_blocks() call returns how many blocks it
         * mapped; advance past them and ask for the remainder.
         */
        while (ret >= 0 && ret < max_blocks) {
                struct ext4_map_blocks map;
                map.m_lblk = block += ret;
                map.m_len = max_blocks -= ret;

                ret = ext4_map_blocks(handle, ea_inode, &map,
                                      EXT4_GET_BLOCKS_CREATE);
                if (ret <= 0) {
                        ext4_mark_inode_dirty(handle, ea_inode);
                        if (ret == -ENOSPC &&
                            ext4_should_retry_alloc(ea_inode->i_sb, &retries)) {
                                ret = 0;
                                goto retry;
                        }
                        break;
                }
        }

        if (ret < 0)
                return ret;

        block = 0;
        while (wsize < bufsize) {
                /* Drop the buffer of the previous iteration (NULL-safe). */
                brelse(bh);
                csize = (bufsize - wsize) > blocksize ? blocksize :
                                                                bufsize - wsize;
                bh = ext4_getblk(handle, ea_inode, block, 0);
                if (IS_ERR(bh))
                        return PTR_ERR(bh);
                if (!bh) {
                        WARN_ON_ONCE(1);
                        EXT4_ERROR_INODE(ea_inode,
                                         "ext4_getblk() return bh = NULL");
                        return -EFSCORRUPTED;
                }
                ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh,
                                                   EXT4_JTR_NONE);
                if (ret)
                        goto out;

                memcpy(bh->b_data, buf, csize);
                /*
                 * Zero out the block tail: the buffer is about to be marked
                 * uptodate and dirtied as a whole, so whatever follows the
                 * copied bytes would otherwise reach the disk uninitialized.
                 */
                if (csize < blocksize)
                        memset(bh->b_data + csize, 0, blocksize - csize);
                set_buffer_uptodate(bh);
                ext4_handle_dirty_metadata(handle, ea_inode, bh);

                buf += csize;
                wsize += csize;
                block += 1;
        }

        inode_lock(ea_inode);
        i_size_write(ea_inode, wsize);
        ext4_update_i_disksize(ea_inode, wsize);
        inode_unlock(ea_inode);

        ret2 = ext4_mark_inode_dirty(handle, ea_inode);
        if (unlikely(ret2 && !ret))
                ret = ret2;

out:
        brelse(bh);

        return ret;
}

/*
 * Create an inode to store the value of a large EA.
 *
 * @handle: journal handle the allocation runs under
 * @inode:  inode the attribute belongs to; supplies the owner ids and the
 *          allocation goal
 * @hash:   hash of the value, recorded on the new inode
 *
 * Returns the new inode, set up with a reference count of 1 and marked
 * S_NOQUOTA, or an ERR_PTR() on failure. Creation is refused with -EINVAL
 * when the superblock no longer has a root dentry.
 */
static struct inode *ext4_xattr_inode_create(handle_t *handle,
                                             struct inode *inode, u32 hash)
{
        struct inode *ea_inode = NULL;
        uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
        int err;

        if (!inode->i_sb->s_root) {
                ext4_warning(inode->i_sb,
                             "refuse to create EA inode when umounting");
                WARN_ON(1);
                return ERR_PTR(-EINVAL);
        }

        /*
         * Use the inode number following the parent's as the goal so the EA
         * inode lands in the same group, or one close by.
         */
        ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
                                  S_IFREG | 0600, NULL, inode->i_ino + 1, owner,
                                  EXT4_EA_INODE_FL);
        if (IS_ERR(ea_inode))
                return ea_inode;

        ea_inode->i_op = &ext4_file_inode_operations;
        ea_inode->i_fop = &ext4_file_operations;
        ext4_set_aops(ea_inode);
        ext4_xattr_inode_set_class(ea_inode);
        unlock_new_inode(ea_inode);
        ext4_xattr_inode_set_ref(ea_inode, 1);
        ext4_xattr_inode_set_hash(ea_inode, hash);

        err = ext4_mark_inode_dirty(handle, ea_inode);
        if (err)
                goto drop_inode;
        err = ext4_inode_attach_jinode(ea_inode);
        if (err)
                goto drop_inode;

        /*
         * EA inodes can be shared between files, so quota is charged by the
         * callers rather than against the EA inode itself.
         */
        dquot_free_inode(ea_inode);
        dquot_drop(ea_inode);
        inode_lock(ea_inode);
        ea_inode->i_flags |= S_NOQUOTA;
        inode_unlock(ea_inode);

        return ea_inode;

drop_inode:
        /* Undo the initial reference before giving the inode up. */
        if (ext4_xattr_inode_dec_ref(handle, ea_inode))
                ext4_warning_inode(ea_inode,
                        "cleanup dec ref error %d", err);
        iput(ea_inode);
        return ERR_PTR(err);
}

/*
 * Look for an existing EA inode whose contents equal @value.
 *
 * @inode:     inode whose superblock and EA inode cache are searched
 * @value:     value to match
 * @value_len: length of @value in bytes
 * @hash:      hash of @value, used as the cache lookup key
 *
 * Every cache entry found under @hash is checked in turn: the candidate
 * must have the right size, be readable, pass hash verification and compare
 * equal byte for byte.
 *
 * Returns the matching inode (the reference obtained from ext4_iget() is
 * handed to the caller), or NULL when there is no cache, no match, or the
 * comparison buffer cannot be allocated.
 */
static struct inode *
ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
                            size_t value_len, u32 hash)
{
        struct mb_cache *cache = EA_INODE_CACHE(inode);
        struct mb_cache_entry *ce;
        struct inode *candidate;
        void *scratch;

        if (!cache)
                return NULL;

        ce = mb_cache_entry_find_first(cache, hash);
        if (!ce)
                return NULL;

        WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) &&
                     !(current->flags & PF_MEMALLOC_NOFS));

        /* Buffer the candidate's contents are read into for comparison. */
        scratch = kvmalloc(value_len, GFP_KERNEL);
        if (!scratch) {
                mb_cache_entry_put(cache, ce);
                return NULL;
        }

        for (; ce; ce = mb_cache_entry_find_next(cache, ce)) {
                candidate = ext4_iget(inode->i_sb, ce->e_value,
                                      EXT4_IGET_EA_INODE);
                if (IS_ERR(candidate))
                        continue;

                ext4_xattr_inode_set_class(candidate);
                if (i_size_read(candidate) != value_len ||
                    ext4_xattr_inode_read(candidate, scratch, value_len) ||
                    ext4_xattr_inode_verify_hashes(candidate, NULL, scratch,
                                                   value_len) ||
                    memcmp(value, scratch, value_len)) {
                        /* Not a match; release it and try the next entry. */
                        iput(candidate);
                        continue;
                }

                mb_cache_entry_touch(cache, ce);
                mb_cache_entry_put(cache, ce);
                kvfree(scratch);
                return candidate;
        }

        kvfree(scratch);
        return NULL;
}

/*
 * Add value of the EA in an inode.
 *
 * @handle:    journal handle the operation runs under
 * @inode:     inode the attribute belongs to
 * @value:     attribute value
 * @value_len: length of @value in bytes
 *
 * Quota is charged to @inode up front. An existing EA inode with identical
 * contents is reused (its reference count is bumped) when the cache yields
 * one; otherwise a new EA inode is created, filled with @value and, if an
 * EA inode cache exists, inserted into it.
 *
 * Returns the EA inode or an ERR_PTR(); on every failure path the quota
 * charged at entry is released again.
 */
static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle,
                struct inode *inode, const void *value, size_t value_len)
{
        struct mb_cache *cache;
        struct inode *ea_inode;
        u32 hash;
        int err;

        /* Charge inode and space to quota, shared or not. */
        err = ext4_xattr_inode_alloc_quota(inode, value_len);
        if (err)
                return ERR_PTR(err);

        hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);

        /* Prefer sharing an EA inode that already stores this value. */
        ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
        if (ea_inode) {
                err = ext4_xattr_inode_inc_ref(handle, ea_inode);
                if (!err)
                        return ea_inode;
                goto out_err;
        }

        /* Nothing to share: create a fresh EA inode for the value. */
        ea_inode = ext4_xattr_inode_create(handle, inode, hash);
        if (IS_ERR(ea_inode)) {
                ext4_xattr_inode_free_quota(inode, NULL, value_len);
                return ea_inode;
        }

        err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
        if (err) {
                if (ext4_xattr_inode_dec_ref(handle, ea_inode))
                        ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err);
                goto out_err;
        }

        /* Make the new inode discoverable for later sharing. */
        cache = EA_INODE_CACHE(inode);
        if (cache)
                mb_cache_entry_create(cache, GFP_NOFS, hash,
                                      ea_inode->i_ino, true /* reusable */);
        return ea_inode;

out_err:
        iput(ea_inode);
        ext4_xattr_inode_free_quota(inode, NULL, value_len);
        return ERR_PTR(err);
}

/*
 * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode
 * feature is enabled. The block size is taken from @inode via i_blocksize(),
 * so the reserve is one eighth of a block, capped at 1024 bytes.
 */
#define EXT4_XATTR_BLOCK_RESERVE(inode)        min(i_blocksize(inode)/8, 1024U)

/*
 * Insert, replace or remove one attribute in the entry table / value area
 * described by @s.
 *
 * @i:            attribute to set; name, name_index, value, value_len and
 *                in_inode are read. A NULL i->value removes the entry.
 * @s:            search state: base/first/end delimit the area, here points
 *                at the existing entry or at the insertion position, and
 *                not_found tells which of the two it is
 * @handle:       journal handle, passed on when the reference on a
 *                previously used EA inode is dropped
 * @inode:        inode owning the attribute
 * @new_ea_inode: inode holding the new value; only dereferenced when
 *                i->in_inode is set
 * @is_block:     true when @s describes an external xattr block; enables the
 *                block reserve check, value-based entry hashing and the final
 *                block rehash, and disables the i_inline_off refresh
 *
 * Returns 0 on success, -ENOSPC when the new entry does not fit (or would
 * eat into the block reserve), -EFSCORRUPTED when the entry table runs up to
 * or past s->end, or the error of one of the EA inode / inline data helpers.
 */
static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
                                struct ext4_xattr_search *s,
                                handle_t *handle, struct inode *inode,
                                struct inode *new_ea_inode,
                                bool is_block)
{
        struct ext4_xattr_entry *last, *next;
        struct ext4_xattr_entry *here = s->here;
        size_t min_offs = s->end - s->base, name_len = strlen(i->name);
        int in_inode = i->in_inode;
        struct inode *old_ea_inode = NULL;
        size_t old_size, new_size;
        int ret;

        /* Space used by old and new values. */
        old_size = (!s->not_found && !here->e_value_inum) ?
                        EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0;
        new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0;

        /*
         * Optimization for the simple case when old and new values have the
         * same padded sizes. Not applicable if external inodes are involved.
         */
        if (new_size && new_size == old_size) {
                size_t offs = le16_to_cpu(here->e_value_offs);
                void *val = s->base + offs;

                here->e_value_size = cpu_to_le32(i->value_len);
                if (i->value == EXT4_ZERO_XATTR_VALUE) {
                        memset(val, 0, new_size);
                } else {
                        memcpy(val, i->value, i->value_len);
                        /* Clear padding bytes. */
                        memset(val + i->value_len, 0, new_size - i->value_len);
                }
                goto update_hash;
        }

        /* Compute min_offs and last. */
        last = s->first;
        for (; !IS_LAST_ENTRY(last); last = next) {
                next = EXT4_XATTR_NEXT(last);
                /* An entry table that reaches s->end cannot be valid. */
                if ((void *)next >= s->end) {
                        EXT4_ERROR_INODE(inode, "corrupted xattr entries");
                        ret = -EFSCORRUPTED;
                        goto out;
                }
                if (!last->e_value_inum && last->e_value_size) {
                        size_t offs = le16_to_cpu(last->e_value_offs);
                        if (offs < min_offs)
                                min_offs = offs;
                }
        }

        /* Check whether we have enough space. */
        if (i->value) {
                size_t free;

                free = min_offs - ((void *)last - s->base) - sizeof(__u32);
                /*
                 * When replacing, the existing entry and its in-place value
                 * are reclaimed, so count them as available.
                 */
                if (!s->not_found)
                        free += EXT4_XATTR_LEN(name_len) + old_size;

                if (free < EXT4_XATTR_LEN(name_len) + new_size) {
                        ret = -ENOSPC;
                        goto out;
                }

                /*
                 * If storing the value in an external inode is an option,
                 * reserve space for xattr entries/names in the external
                 * attribute block so that a long value does not occupy the
                 * whole space and prevent further entries being added.
                 */
                if (ext4_has_feature_ea_inode(inode->i_sb) &&
                    new_size && is_block &&
                    (min_offs + old_size - new_size) <
                                        EXT4_XATTR_BLOCK_RESERVE(inode)) {
                        ret = -ENOSPC;
                        goto out;
                }
        }

        /*
         * Getting access to old and new ea inodes is subject to failures.
         * Finish that work before doing any modifications to the xattr data.
         */
        if (!s->not_found && here->e_value_inum) {
                ret = ext4_xattr_inode_iget(inode,
                                            le32_to_cpu(here->e_value_inum),
                                            le32_to_cpu(here->e_hash),
                                            &old_ea_inode);
                if (ret) {
                        /* Make sure the iput() at out: sees NULL. */
                        old_ea_inode = NULL;
                        goto out;
                }

                /* We are ready to release ref count on the old_ea_inode. */
                ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode);
                if (ret)
                        goto out;

                ext4_xattr_inode_free_quota(inode, old_ea_inode,
                                            le32_to_cpu(here->e_value_size));
        }

        /* No failures allowed past this point. */

        if (!s->not_found && here->e_value_size && !here->e_value_inum) {
                /* Remove the old value. */
                void *first_val = s->base + min_offs;
                size_t offs = le16_to_cpu(here->e_value_offs);
                void *val = s->base + offs;

                /*
                 * Shift every value stored below the old one up by old_size
                 * bytes, overwriting it, then zero the space this frees at
                 * the low end of the value area.
                 */
                memmove(first_val + old_size, first_val, val - first_val);
                memset(first_val, 0, old_size);
                min_offs += old_size;

                /* Adjust all value offsets. */
                last = s->first;
                while (!IS_LAST_ENTRY(last)) {
                        size_t o = le16_to_cpu(last->e_value_offs);

                        /* Only values that sat below the old one moved. */
                        if (!last->e_value_inum &&
                            last->e_value_size && o < offs)
                                last->e_value_offs = cpu_to_le16(o + old_size);
                        last = EXT4_XATTR_NEXT(last);
                }
        }

        if (!i->value) {
                /* Remove old name. */
                size_t size = EXT4_XATTR_LEN(name_len);

                /*
                 * The terminating entry moves down by the size of the removed
                 * entry; everything after 'here' (terminator included) is
                 * pulled over it and the vacated bytes are zeroed.
                 */
                last = ENTRY((void *)last - size);
                memmove(here, (void *)here + size,
                        (void *)last - (void *)here + sizeof(__u32));
                memset(last, 0, size);

                /*
                 * Update i_inline_off - moved ibody region might contain
                 * system.data attribute.  Handling a failure here won't
                 * cause other complications for setting an xattr.
                 */
                if (!is_block && ext4_has_inline_data(inode)) {
                        ret = ext4_find_inline_data_nolock(inode);
                        if (ret) {
                                ext4_warning_inode(inode,
                                        "unable to update i_inline_off");
                                goto out;
                        }
                }
        } else if (s->not_found) {
                /* Insert new name. */
                size_t size = EXT4_XATTR_LEN(name_len);
                size_t rest = (void *)last - (void *)here + sizeof(__u32);

                /* Open a gap at 'here' by pushing the tail of the table up. */
                memmove((void *)here + size, here, rest);
                memset(here, 0, size);
                here->e_name_index = i->name_index;
                here->e_name_len = name_len;
                memcpy(here->e_name, i->name, name_len);
        } else {
                /* This is an update, reset value info. */
                here->e_value_inum = 0;
                here->e_value_offs = 0;
                here->e_value_size = 0;
        }

        if (i->value) {
                /* Insert new value. */
                if (in_inode) {
                        here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino);
                } else if (i->value_len) {
                        /* New values go directly below the lowest one. */
                        void *val = s->base + min_offs - new_size;

                        here->e_value_offs = cpu_to_le16(min_offs - new_size);
                        if (i->value == EXT4_ZERO_XATTR_VALUE) {
                                memset(val, 0, new_size);
                        } else {
                                memcpy(val, i->value, i->value_len);
                                /* Clear padding bytes. */
                                memset(val + i->value_len, 0,
                                       new_size - i->value_len);
                        }
                }
                here->e_value_size = cpu_to_le32(i->value_len);
        }

update_hash:
        if (i->value) {
                /*
                 * Stays 0 for an in-place value when @is_block is false:
                 * neither branch below computes a hash in that case.
                 */
                __le32 hash = 0;

                /* Entry hash calculation. */
                if (in_inode) {
                        __le32 crc32c_hash;

                        /*
                         * Feed crc32c hash instead of the raw value for entry
                         * hash calculation. This is to avoid walking
                         * potentially long value buffer again.
                         */
                        crc32c_hash = cpu_to_le32(
                                       ext4_xattr_inode_get_hash(new_ea_inode));
                        hash = ext4_xattr_hash_entry(here->e_name,
                                                     here->e_name_len,
                                                     &crc32c_hash, 1);
                } else if (is_block) {
                        __le32 *value = s->base + le16_to_cpu(
                                                        here->e_value_offs);

                        /* Hash the padded value as 32-bit words. */
                        hash = ext4_xattr_hash_entry(here->e_name,
                                                     here->e_name_len, value,
                                                     new_size >> 2);
                }
                here->e_hash = hash;
        }

        if (is_block)
                ext4_xattr_rehash((struct ext4_xattr_header *)s->base);

        ret = 0;
out:
        /* old_ea_inode is NULL unless the old value lived in an EA inode. */
        iput(old_ea_inode);
        return ret;
}

/*
 * Search state for an inode's external xattr block: the generic search
 * cursor together with the buffer head of the block it points into.
 */
struct ext4_xattr_block_find {
        /* base/first/end/here/not_found describing the block's entries */
        struct ext4_xattr_search s;
        /* buffer of the xattr block; reset to NULL when reading it fails */
        struct buffer_head *bh;
};

static int
ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
                      struct ext4_xattr_block_find *bs)
{
        struct super_block *sb = inode->i_sb;
        int error;

        ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
                  i->name_index, i->name, i->value, (long)i->value_len);

        if (EXT4_I(inode)->i_file_acl) {
                /* The inode already has an extended attribute block. */
                bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
                if (IS_ERR(bs->bh)) {
                        error = PTR_ERR(bs->bh);
                        bs->bh = NULL;
                        return error;
                }
                ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
                        atomic_read(&(bs->bh->b_count)),
                        le32_to_cpu(BHDR(bs->bh)->h_refcount));
                error = ext4_xattr_check_block(inode, bs->bh);
                if (error)
                        return error;
                /* Find the named attribute. */
                bs->s.base = BHDR(bs->bh);
                bs->s.first = BFIRST(bs->bh);
                bs->s.end = bs->bh->b_data + bs->bh->b_size;
                bs->s.here = bs->s.first;
                error = xattr_find_entry(inode, &bs->s.here, bs->s.end,
                                         i->name_index, i->name, 1);
                if (error && error != -ENODATA)
                        return error;
                bs->s.not_found = error;
        }
        return 0;
}

static int
ext4_xattr_block_set(handle_t *handle, struct inode *inode,
                     struct ext4_xattr_info *i,
                     struct ext4_xattr_block_find *bs)
{
        struct super_block *sb = inode->i_sb;
        struct buffer_head *new_bh = NULL;
        struct ext4_xattr_search s_copy = bs->s;
        struct ext4_xattr_search *s = &s_copy;
        struct mb_cache_entry *ce = NULL;
        int error = 0;
        struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
        struct inode *ea_inode = NULL, *tmp_inode;
        size_t old_ea_inode_quota = 0;
        unsigned int ea_ino;

#define header(x) ((struct ext4_xattr_header *)(x))

        /* If we need EA inode, prepare it before locking the buffer */
        if (i->value && i->in_inode) {
                WARN_ON_ONCE(!i->value_len);

                ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
                                        i->value, i->value_len);
                if (IS_ERR(ea_inode)) {
                        error = PTR_ERR(ea_inode);
                        ea_inode = NULL;
                        goto cleanup;
                }
        }

        if (s->base) {
                int offset = (char *)s->here - bs->bh->b_data;

                BUFFER_TRACE(bs->bh, "get_write_access");
                error = ext4_journal_get_write_access(handle, sb, bs->bh,
                                                      EXT4_JTR_NONE);
                if (error)
                        goto cleanup;

                lock_buffer(bs->bh);

                if (header(s->base)->h_refcount == cpu_to_le32(1)) {
                        __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash);

                        /*
                         * This must happen under buffer lock for
                         * ext4_xattr_block_set() to reliably detect modified
                         * block
                         */
                        if (ea_block_cache) {
                                struct mb_cache_entry *oe;

                                oe = mb_cache_entry_delete_or_get(ea_block_cache,
                                        hash, bs->bh->b_blocknr);
                                if (oe) {
                                        /*
                                         * Xattr block is getting reused. Leave
                                         * it alone.
                                         */
                                        mb_cache_entry_put(ea_block_cache, oe);
                                        goto clone_block;
                                }
                        }
                        ea_bdebug(bs->bh, "modifying in-place");
                        error = ext4_xattr_set_entry(i, s, handle, inode,
                                             ea_inode, true /* is_block */);
                        ext4_xattr_block_csum_set(inode, bs->bh);
                        unlock_buffer(bs->bh);
                        if (error == -EFSCORRUPTED)
                                goto bad_block;
                        if (!error)
                                error = ext4_handle_dirty_metadata(handle,
                                                                   inode,
                                                                   bs->bh);
                        if (error)
                                goto cleanup;
                        goto inserted;
                }
clone_block:
                unlock_buffer(bs->bh);
                ea_bdebug(bs->bh, "cloning");
                s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
                error = -ENOMEM;
                if (s->base == NULL)
                        goto cleanup;
                s->first = ENTRY(header(s->base)+1);
                header(s->base)->h_refcount = cpu_to_le32(1);
                s->here = ENTRY(s->base + offset);
                s->end = s->base + bs->bh->b_size;

                /*
                 * If existing entry points to an xattr inode, we need
                 * to prevent ext4_xattr_set_entry() from decrementing
                 * ref count on it because the reference belongs to the
                 * original block. In this case, make the entry look
                 * like it has an empty value.
                 */
                if (!s->not_found && s->here->e_value_inum) {
                        ea_ino = le32_to_cpu(s->here->e_value_inum);
                        error = ext4_xattr_inode_iget(inode, ea_ino,
                                      le32_to_cpu(s->here->e_hash),
                                      &tmp_inode);
                        if (error)
                                goto cleanup;

                        if (!ext4_test_inode_state(tmp_inode,
                                        EXT4_STATE_LUSTRE_EA_INODE)) {
                                /*
                                 * Defer quota free call for previous
                                 * inode until success is guaranteed.
                                 */
                                old_ea_inode_quota = le32_to_cpu(
                                                s->here->e_value_size);
                        }
                        iput(tmp_inode);

                        s->here->e_value_inum = 0;
                        s->here->e_value_size = 0;
                }
        } else {
                /* Allocate a buffer where we construct the new block. */
                s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
                error = -ENOMEM;
                if (s->base == NULL)
                        goto cleanup;
                header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
                header(s->base)->h_blocks = cpu_to_le32(1);
                header(s->base)->h_refcount = cpu_to_le32(1);
                s->first = ENTRY(header(s->base)+1);
                s->here = ENTRY(header(s->base)+1);
                s->end = s->base + sb->s_blocksize;
        }

        error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode,
                                     true /* is_block */);
        if (error == -EFSCORRUPTED)
                goto bad_block;
        if (error)
                goto cleanup;

inserted:
        if (!IS_LAST_ENTRY(s->first)) {
                new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce);
                if (IS_ERR(new_bh)) {
                        error = PTR_ERR(new_bh);
                        new_bh = NULL;
                        goto cleanup;
                }

                if (new_bh) {
                        /* We found an identical block in the cache. */
                        if (new_bh == bs->bh)
                                ea_bdebug(new_bh, "keeping");
                        else {
                                u32 ref;

#ifdef EXT4_XATTR_DEBUG
                                WARN_ON_ONCE(dquot_initialize_needed(inode));
#endif
                                /* The old block is released after updating
                                   the inode. */
                                error = dquot_alloc_block(inode,
                                                EXT4_C2B(EXT4_SB(sb), 1));
                                if (error)
                                        goto cleanup;
                                BUFFER_TRACE(new_bh, "get_write_access");
                                error = ext4_journal_get_write_access(
                                                handle, sb, new_bh,
                                                EXT4_JTR_NONE);
                                if (error)
                                        goto cleanup_dquot;
                                lock_buffer(new_bh);
                                /*
                                 * We have to be careful about races with
                                 * adding references to xattr block. Once we
                                 * hold buffer lock xattr block's state is
                                 * stable so we can check the additional
                                 * reference fits.
                                 */
                                ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
                                if (ref > EXT4_XATTR_REFCOUNT_MAX) {
                                        /*
                                         * Undo everything and check mbcache
                                         * again.
                                         */
                                        unlock_buffer(new_bh);
                                        dquot_free_block(inode,
                                                         EXT4_C2B(EXT4_SB(sb),
                                                                  1));
                                        brelse(new_bh);
                                        mb_cache_entry_put(ea_block_cache, ce);
                                        ce = NULL;
                                        new_bh = NULL;
                                        goto inserted;
                                }
                                BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
                                if (ref == EXT4_XATTR_REFCOUNT_MAX)
                                        clear_bit(MBE_REUSABLE_B, &ce->e_flags);
                                ea_bdebug(new_bh, "reusing; refcount now=%d",
                                          ref);
                                ext4_xattr_block_csum_set(inode, new_bh);
                                unlock_buffer(new_bh);
                                error = ext4_handle_dirty_metadata(handle,
                                                                   inode,
                                                                   new_bh);
                                if (error)
                                        goto cleanup_dquot;
                        }
                        mb_cache_entry_touch(ea_block_cache, ce);
                        mb_cache_entry_put(ea_block_cache, ce);
                        ce = NULL;
                } else if (bs->bh && s->base == bs->bh->b_data) {
                        /* We were modifying this block in-place. */
                        ea_bdebug(bs->bh, "keeping this block");
                        ext4_xattr_block_cache_insert(ea_block_cache, bs->bh);
                        new_bh = bs->bh;
                        get_bh(new_bh);
                } else {
                        /* We need to allocate a new block */
                        ext4_fsblk_t goal, block;

#ifdef EXT4_XATTR_DEBUG
                        WARN_ON_ONCE(dquot_initialize_needed(inode));
#endif
                        goal = ext4_group_first_block_no(sb,
                                                EXT4_I(inode)->i_block_group);
                        block = ext4_new_meta_blocks(handle, inode, goal, 0,
                                                     NULL, &error);
                        if (error)
                                goto cleanup;

                        ea_idebug(inode, "creating block %llu",
                                  (unsigned long long)block);

                        new_bh = sb_getblk(sb, block);
                        if (unlikely(!new_bh)) {
                                error = -ENOMEM;
getblk_failed:
                                ext4_free_blocks(handle, inode, NULL, block, 1,
                                                 EXT4_FREE_BLOCKS_METADATA);
                                goto cleanup;
                        }
                        error = ext4_xattr_inode_inc_ref_all(handle, inode,
                                                      ENTRY(header(s->base)+1));
                        if (error)
                                goto getblk_failed;
                        if (ea_inode) {
                                /* Drop the extra ref on ea_inode. */
                                error = ext4_xattr_inode_dec_ref(handle,
                                                                 ea_inode);
                                if (error)
                                        ext4_warning_inode(ea_inode,
                                                           "dec ref error=%d",
                                                           error);
                                iput(ea_inode);
                                ea_inode = NULL;
                        }

                        lock_buffer(new_bh);
                        error = ext4_journal_get_create_access(handle, sb,
                                                        new_bh, EXT4_JTR_NONE);
                        if (error) {
                                unlock_buffer(new_bh);
                                error = -EIO;
                                goto getblk_failed;
                        }
                        memcpy(new_bh->b_data, s->base, new_bh->b_size);
                        ext4_xattr_block_csum_set(inode, new_bh);
                        set_buffer_uptodate(new_bh);
                        unlock_buffer(new_bh);
                        ext4_xattr_block_cache_insert(ea_block_cache, new_bh);
                        error = ext4_handle_dirty_metadata(handle, inode,
                                                           new_bh);
                        if (error)
                                goto cleanup;
                }
        }

        if (old_ea_inode_quota)
                ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota);

        /* Update the inode. */
        EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;

        /* Drop the previous xattr block. */
        if (bs->bh && bs->bh != new_bh) {
                struct ext4_xattr_inode_array *ea_inode_array = NULL;

                ext4_xattr_release_block(handle, inode, bs->bh,
                                         &ea_inode_array,
                                         0 /* extra_credits */);
                ext4_xattr_inode_array_free(ea_inode_array);
        }
        error = 0;

cleanup:
        if (ea_inode) {
                if (error) {
                        int error2;

                        error2 = ext4_xattr_inode_dec_ref(handle, ea_inode);
                        if (error2)
                                ext4_warning_inode(ea_inode, "dec ref error=%d",
                                                   error2);
                        ext4_xattr_inode_free_quota(inode, ea_inode,
                                                    i_size_read(ea_inode));
                }
                iput(ea_inode);
        }
        if (ce)
                mb_cache_entry_put(ea_block_cache, ce);
        brelse(new_bh);
        if (!(bs->bh && s->base == bs->bh->b_data))
                kfree(s->base);

        return error;

cleanup_dquot:
        dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1));
        goto cleanup;

bad_block:
        EXT4_ERROR_INODE(inode, "bad block %llu",
                         EXT4_I(inode)->i_file_acl);
        goto cleanup;

#undef header
}

int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
                          struct ext4_xattr_ibody_find *is)
{
        struct ext4_xattr_ibody_header *header;
        struct ext4_inode *raw_inode;
        int error;

        if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
                return 0;

        raw_inode = ext4_raw_inode(&is->iloc);
        header = IHDR(inode, raw_inode);
        is->s.base = is->s.first = IFIRST(header);
        is->s.here = is->s.first;
        is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
                error = xattr_check_inode(inode, header, is->s.end);
                if (error)
                        return error;
                /* Find the named attribute. */
                error = xattr_find_entry(inode, &is->s.here, is->s.end,
                                         i->name_index, i->name, 0);
                if (error && error != -ENODATA)
                        return error;
                is->s.not_found = error;
        }
        return 0;
}

/*
 * ext4_xattr_ibody_set()
 *
 * Apply the change described by @i to the in-inode xattr area located by
 * @is, then bring the in-inode header magic and the EXT4_STATE_XATTR bit in
 * line with whether any entry remains.
 *
 * Returns 0 on success, -ENOSPC when the inode has no xattr space, or the
 * error reported by the EA-inode lookup/creation or by
 * ext4_xattr_set_entry().
 */
int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
                                struct ext4_xattr_info *i,
                                struct ext4_xattr_ibody_find *is)
{
        struct ext4_xattr_search *search = &is->s;
        struct ext4_xattr_ibody_header *ihdr;
        struct inode *ea_inode = NULL;
        int ret;

        if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
                return -ENOSPC;

        /* If we need EA inode, prepare it before locking the buffer */
        if (i->value && i->in_inode) {
                WARN_ON_ONCE(!i->value_len);

                ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
                                                          i->value,
                                                          i->value_len);
                if (IS_ERR(ea_inode))
                        return PTR_ERR(ea_inode);
        }

        ret = ext4_xattr_set_entry(i, search, handle, inode, ea_inode,
                                   false /* is_block */);
        if (ret)
                goto undo_ea_inode;

        ihdr = IHDR(inode, ext4_raw_inode(&is->iloc));
        if (IS_LAST_ENTRY(search->first)) {
                /* No entries left: clear the magic and the state bit. */
                ihdr->h_magic = cpu_to_le32(0);
                ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
        } else {
                ihdr->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
                ext4_set_inode_state(inode, EXT4_STATE_XATTR);
        }

        iput(ea_inode);
        return 0;

undo_ea_inode:
        /* Setting the entry failed: give back the EA inode ref and quota. */
        if (ea_inode) {
                int dec_err = ext4_xattr_inode_dec_ref(handle, ea_inode);

                if (dec_err)
                        ext4_warning_inode(ea_inode, "dec ref error=%d",
                                           dec_err);

                ext4_xattr_inode_free_quota(inode, ea_inode,
                                            i_size_read(ea_inode));
                iput(ea_inode);
        }
        return ret;
}

static int ext4_xattr_value_same(struct ext4_xattr_search *s,
                                 struct ext4_xattr_info *i)
{
        void *value;

        /* When e_value_inum is set the value is stored externally. */
        if (s->here->e_value_inum)
                return 0;
        if (le32_to_cpu(s->here->e_value_size) != i->value_len)
                return 0;
        value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs);
        return !memcmp(value, i->value, i->value_len);
}

/*
 * Read and validate the external xattr block referenced by i_file_acl.
 *
 * Returns NULL when the inode has no xattr block, an ERR_PTR() when the
 * read or the block check fails, and the buffer head otherwise.  A
 * successfully returned buffer is handed to the caller, who releases it
 * with brelse().
 */
static struct buffer_head *ext4_xattr_get_block(struct inode *inode)
{
        struct buffer_head *xattr_bh;
        int ret;

        if (EXT4_I(inode)->i_file_acl == 0)
                return NULL;

        xattr_bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl,
                                 REQ_PRIO);
        if (IS_ERR(xattr_bh))
                return xattr_bh;

        ret = ext4_xattr_check_block(inode, xattr_bh);
        if (!ret)
                return xattr_bh;

        /* Block failed validation: drop it and report the error. */
        brelse(xattr_bh);
        return ERR_PTR(ret);
}

/*
 * ext4_xattr_set_handle()
 *
 * Create, replace or remove an extended attribute for this inode.  Value
 * is NULL to remove an existing extended attribute, and non-NULL to
 * either replace an existing extended attribute, or create a new extended
 * attribute. The flags XATTR_REPLACE and XATTR_CREATE
 * specify that an extended attribute must exist and must not exist
 * previous to the call, respectively.
 *
 * The lookup and the modification run between ext4_write_lock_xattr() and
 * ext4_write_unlock_xattr() on @inode.  The attribute is searched in the
 * inode body first and in the external xattr block only if it was not
 * found there.  When storing a value, the inode body is tried first, then
 * the xattr block, and finally (if the ea_inode feature is enabled) the
 * whole sequence is retried with the value placed in an EA inode.
 *
 * Error cases visible in this function:
 *   -EINVAL   @name is NULL
 *   -ERANGE   @name is longer than 255 bytes
 *   -ENOSPC   the handle is valid but has fewer buffer credits than
 *             __ext4_xattr_set_credits() asks for, or no location could
 *             take the value
 *   -ENODATA  XATTR_REPLACE was given and the attribute does not exist
 *   -EEXIST   XATTR_CREATE was given and the attribute already exists
 * Removing an attribute that does not exist (without XATTR_REPLACE) is not
 * an error and returns 0.
 *
 * Returns 0, or a negative error number on failure.
 */
int
ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
                      const char *name, const void *value, size_t value_len,
                      int flags)
{
        struct ext4_xattr_info i = {
                .name_index = name_index,
                .name = name,
                .value = value,
                .value_len = value_len,
                .in_inode = 0,
        };
        /* Both search states start out as "not found". */
        struct ext4_xattr_ibody_find is = {
                .s = { .not_found = -ENODATA, },
        };
        struct ext4_xattr_block_find bs = {
                .s = { .not_found = -ENODATA, },
        };
        int no_expand;
        int error;

        if (!name)
                return -EINVAL;
        if (strlen(name) > 255)
                return -ERANGE;

        ext4_write_lock_xattr(inode, &no_expand);

        /* Check journal credits under write lock. */
        if (ext4_handle_valid(handle)) {
                struct buffer_head *bh;
                int credits;

                bh = ext4_xattr_get_block(inode);
                if (IS_ERR(bh)) {
                        error = PTR_ERR(bh);
                        goto cleanup;
                }

                credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh,
                                                   value_len,
                                                   flags & XATTR_CREATE);
                brelse(bh);

                if (jbd2_handle_buffer_credits(handle) < credits) {
                        error = -ENOSPC;
                        goto cleanup;
                }
                WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS));
        }

        error = ext4_reserve_inode_write(handle, inode, &is.iloc);
        if (error)
                goto cleanup;

        /* A brand-new inode gets its raw on-disk image zeroed first. */
        if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
                struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
                ext4_clear_inode_state(inode, EXT4_STATE_NEW);
        }

        /* Look in the inode body; fall back to the xattr block on a miss. */
        error = ext4_xattr_ibody_find(inode, &i, &is);
        if (error)
                goto cleanup;
        if (is.s.not_found)
                error = ext4_xattr_block_find(inode, &i, &bs);
        if (error)
                goto cleanup;
        if (is.s.not_found && bs.s.not_found) {
                /* Attribute does not exist anywhere. */
                error = -ENODATA;
                if (flags & XATTR_REPLACE)
                        goto cleanup;
                error = 0;
                /* Removing a non-existent attribute: nothing to do. */
                if (!value)
                        goto cleanup;
        } else {
                /* Attribute already exists. */
                error = -EEXIST;
                if (flags & XATTR_CREATE)
                        goto cleanup;
        }

        if (!value) {
                /* Removal: delete from wherever the attribute was found. */
                if (!is.s.not_found)
                        error = ext4_xattr_ibody_set(handle, inode, &i, &is);
                else if (!bs.s.not_found)
                        error = ext4_xattr_block_set(handle, inode, &i, &bs);
        } else {
                error = 0;
                /* Xattr value did not change? Save us some work and bail out */
                if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i))
                        goto cleanup;
                if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
                        goto cleanup;

                /* Values above the large-EA threshold go to an EA inode. */
                if (ext4_has_feature_ea_inode(inode->i_sb) &&
                    (EXT4_XATTR_SIZE(i.value_len) >
                        EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
                        i.in_inode = 1;
retry_inode:
                /* First choice: store the value in the inode body. */
                error = ext4_xattr_ibody_set(handle, inode, &i, &is);
                if (!error && !bs.s.not_found) {
                        /* Stored in the inode; drop the old copy in the block. */
                        i.value = NULL;
                        error = ext4_xattr_block_set(handle, inode, &i, &bs);
                } else if (error == -ENOSPC) {
                        /*
                         * No room in the inode body.  If the inode has an
                         * xattr block that has not been searched yet
                         * (bs.s.base unset), search it before using it.
                         */
                        if (EXT4_I(inode)->i_file_acl && !bs.s.base) {
                                brelse(bs.bh);
                                bs.bh = NULL;
                                error = ext4_xattr_block_find(inode, &i, &bs);
                                if (error)
                                        goto cleanup;
                        }
                        /* Second choice: store the value in the xattr block. */
                        error = ext4_xattr_block_set(handle, inode, &i, &bs);
                        if (!error && !is.s.not_found) {
                                /* Stored in the block; drop the old in-inode copy. */
                                i.value = NULL;
                                error = ext4_xattr_ibody_set(handle, inode, &i,
                                                             &is);
                        } else if (error == -ENOSPC) {
                                /*
                                 * Xattr does not fit in the block, store at
                                 * external inode if possible.
                                 */
                                if (ext4_has_feature_ea_inode(inode->i_sb) &&
                                    i.value_len && !i.in_inode) {
                                        i.in_inode = 1;
                                        goto retry_inode;
                                }
                        }
                }
        }
        if (!error) {
                ext4_xattr_update_super_block(handle, inode->i_sb);
                inode_set_ctime_current(inode);
                inode_inc_iversion(inode);
                /*
                 * NOTE(review): no_expand is handed back to
                 * ext4_write_unlock_xattr() below; clearing it after a
                 * removal presumably re-allows inode expansion -- confirm
                 * against the lock helpers.
                 */
                if (!value)
                        no_expand = 0;
                error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
                /*
                 * The bh is consumed by ext4_mark_iloc_dirty, even with
                 * error != 0.
                 */
                is.iloc.bh = NULL;
                if (IS_SYNC(inode))
                        ext4_handle_sync(handle);
        }
        /* Reached on every path that did not jump straight to cleanup. */
        ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);

cleanup:
        brelse(is.iloc.bh);
        brelse(bs.bh);
        ext4_write_unlock_xattr(inode, &no_expand);
        return error;
}

/*
 * ext4_xattr_set_credits()
 *
 * Compute the journal credits for setting an xattr of @value_len bytes on
 * @inode and store them in *@credits.  *@credits is 0 when the filesystem
 * has no journal or when reading the xattr block fails.
 *
 * The xattr block is read with xattr_sem held for reading.
 *
 * Returns 0 on success or the error from reading/validating the xattr block.
 */
int ext4_xattr_set_credits(struct inode *inode, size_t value_len,
                           bool is_create, int *credits)
{
        struct buffer_head *xattr_bh;
        int ret = 0;

        *credits = 0;

        if (!EXT4_SB(inode->i_sb)->s_journal)
                return 0;

        down_read(&EXT4_I(inode)->xattr_sem);

        xattr_bh = ext4_xattr_get_block(inode);
        if (IS_ERR(xattr_bh)) {
                ret = PTR_ERR(xattr_bh);
                goto unlock;
        }

        *credits = __ext4_xattr_set_credits(inode->i_sb, inode, xattr_bh,
                                            value_len, is_create);
        brelse(xattr_bh);

unlock:
        up_read(&EXT4_I(inode)->xattr_sem);
        return ret;
}

/*
 * ext4_xattr_set()
 *
 * Like ext4_xattr_set_handle(), but starts and stops its own journal handle
 * around the modification.  The credits are recomputed and the handle is
 * restarted whenever the attempt fails with -ENOSPC and
 * ext4_should_retry_alloc() asks for another try.
 *
 * Returns 0, or a negative error number on failure.  An error from the
 * xattr update takes precedence over an error from stopping the handle.
 */
int
ext4_xattr_set(struct inode *inode, int name_index, const char *name,
               const void *value, size_t value_len, int flags)
{
        struct super_block *sb = inode->i_sb;
        handle_t *handle;
        int retries = 0;
        int credits;
        int ret;

        ret = dquot_initialize(inode);
        if (ret)
                return ret;

        for (;;) {
                int stop_err;

                ret = ext4_xattr_set_credits(inode, value_len,
                                             flags & XATTR_CREATE, &credits);
                if (ret)
                        return ret;

                handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        break;
                }

                ret = ext4_xattr_set_handle(handle, inode, name_index, name,
                                            value, value_len, flags);
                stop_err = ext4_journal_stop(handle);

                /* Out of space: start over if the allocator says so. */
                if (ret == -ENOSPC && ext4_should_retry_alloc(sb, &retries))
                        continue;

                if (!ret)
                        ret = stop_err;
                break;
        }

        ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_XATTR, NULL);

        return ret;
}

/*
 * Shift the EA entries in the inode to create space for the increased
 * i_extra_isize.
 *
 * Every entry starting at @entry that holds an inline, non-empty value gets
 * @value_offs_shift added to its value offset; afterwards @n bytes are moved
 * from @from to @to.  @value_offs_shift must not be positive.
 */
static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
                                     int value_offs_shift, void *to,
                                     void *from, size_t n)
{
        struct ext4_xattr_entry *cur;

        /* We always shift xattr headers further thus offsets get lower */
        BUG_ON(value_offs_shift > 0);

        /* Rewrite the value offsets before the entries themselves move. */
        for (cur = entry; !IS_LAST_ENTRY(cur); cur = EXT4_XATTR_NEXT(cur)) {
                int shifted;

                /* Values in an EA inode and empty values have no offset. */
                if (cur->e_value_inum || !cur->e_value_size)
                        continue;

                shifted = le16_to_cpu(cur->e_value_offs) + value_offs_shift;
                cur->e_value_offs = cpu_to_le16(shifted);
        }

        /* The regions may overlap, hence memmove(). */
        memmove(to, from, n);
}

/*
 * Move xattr pointed to by 'entry' from inode into external xattr block
 *
 * The entry's name is copied to a NUL-terminated scratch buffer.  The value
 * is read into a temporary buffer when it lives in an EA inode, and is
 * referenced in place inside the inode body otherwise.  The attribute is
 * then written to the xattr block and, only if that succeeded, removed from
 * the inode body.
 *
 * Returns 0 on success or a negative error number.
 */
static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
                                    struct ext4_inode *raw_inode,
                                    struct ext4_xattr_entry *entry)
{
        struct ext4_xattr_ibody_header *ihdr = IHDR(inode, raw_inode);
        size_t val_len = le32_to_cpu(entry->e_value_size);
        struct ext4_xattr_info info = {
                .name_index = entry->e_name_index,
                .value = NULL,
                .value_len = 0,
                .in_inode = !!entry->e_value_inum,
        };
        struct ext4_xattr_ibody_find *ifind;
        struct ext4_xattr_block_find *bfind;
        char *name_copy;
        char *val = NULL;
        int val_is_heap = 0;
        int ret;

        /* Allocate everything up front; one combined check covers all three. */
        ifind = kzalloc(sizeof(*ifind), GFP_NOFS);
        bfind = kzalloc(sizeof(*bfind), GFP_NOFS);
        name_copy = kmalloc(entry->e_name_len + 1, GFP_NOFS);
        if (!ifind || !bfind || !name_copy) {
                ret = -ENOMEM;
                goto out;
        }

        ifind->s.not_found = -ENODATA;
        bfind->s.not_found = -ENODATA;
        ifind->iloc.bh = NULL;
        bfind->bh = NULL;

        /* Save the entry name and the entry value */
        if (!entry->e_value_inum) {
                size_t val_offs = le16_to_cpu(entry->e_value_offs);

                /* Inline value: point straight into the inode body. */
                val = (void *)IFIRST(ihdr) + val_offs;
        } else {
                val = kvmalloc(val_len, GFP_NOFS);
                if (!val) {
                        ret = -ENOMEM;
                        goto out;
                }
                val_is_heap = 1;
                ret = ext4_xattr_inode_get(inode, entry, val, val_len);
                if (ret)
                        goto out;
        }

        memcpy(name_copy, entry->e_name, entry->e_name_len);
        name_copy[entry->e_name_len] = '\0';
        info.name = name_copy;

        ret = ext4_get_inode_loc(inode, &ifind->iloc);
        if (ret)
                goto out;

        /* Locate the entry in the inode body (value still unset in info). */
        ret = ext4_xattr_ibody_find(inode, &info, ifind);
        if (ret)
                goto out;

        info.value = val;
        info.value_len = val_len;
        ret = ext4_xattr_block_find(inode, &info, bfind);
        if (ret)
                goto out;

        /* Move ea entry from the inode into the block */
        ret = ext4_xattr_block_set(handle, inode, &info, bfind);
        if (ret)
                goto out;

        /* Remove the chosen entry from the inode */
        info.value = NULL;
        info.value_len = 0;
        ret = ext4_xattr_ibody_set(handle, inode, &info, ifind);

out:
        kfree(name_copy);
        /* Only the EA-inode path owns a separately allocated value buffer. */
        if (val_is_heap && val)
                kvfree(val);
        if (ifind)
                brelse(ifind->iloc.bh);
        if (bfind)
                brelse(bfind->bh);
        kfree(ifind);
        kfree(bfind);

        return ret;
}

/*
 * Free up in-inode xattr space by moving entries into the external xattr
 * block until at least @isize_diff bytes are free in the inode.
 *
 * @ifree and @bfree are the free byte counts in the inode body and in the
 * xattr block on entry; both are updated locally after each move.
 * *@total_ino is reduced by the entry-header size of every moved entry.
 *
 * Each round picks the smallest entry that fits into the block and frees
 * enough space by itself; if there is none, the last scanned entry that
 * fits into the block but is too small on its own is moved instead.
 *
 * Returns 0 on success, -ENOSPC when no entry can be moved, or the error
 * from ext4_xattr_move_to_block().
 */
static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode,
                                       struct ext4_inode *raw_inode,
                                       int isize_diff, size_t ifree,
                                       size_t bfree, int *total_ino)
{
        struct ext4_xattr_ibody_header *ihdr = IHDR(inode, raw_inode);

        while (isize_diff > ifree) {
                struct ext4_xattr_entry *best = NULL;
                struct ext4_xattr_entry *fallback = NULL;
                struct ext4_xattr_entry *cur;
                unsigned int best_size = ~0U;
                unsigned int hdr_size;          /* EA entry size */
                unsigned int footprint;         /* EA entry size + value size */
                int ret;

                /* Find the entry best suited to be pushed into EA block */
                for (cur = IFIRST(ihdr); !IS_LAST_ENTRY(cur);
                     cur = EXT4_XATTR_NEXT(cur)) {
                        /* never move system.data out of the inode */
                        if (cur->e_name_len == 4 &&
                            cur->e_name_index == EXT4_XATTR_INDEX_SYSTEM &&
                            !memcmp(cur->e_name, "data", 4))
                                continue;

                        footprint = EXT4_XATTR_LEN(cur->e_name_len);
                        if (!cur->e_value_inum)
                                footprint += EXT4_XATTR_SIZE(
                                               le32_to_cpu(cur->e_value_size));

                        /* Must fit in the block and beat the current best. */
                        if (footprint > bfree || footprint >= best_size)
                                continue;

                        if (footprint + ifree < isize_diff) {
                                /* Too small on its own; keep as a fallback. */
                                fallback = cur;
                        } else {
                                best = cur;
                                best_size = footprint;
                        }
                }

                if (!best)
                        best = fallback;
                if (!best)
                        return -ENOSPC;

                /* Sizes must be taken before the entry is moved away. */
                hdr_size = EXT4_XATTR_LEN(best->e_name_len);
                footprint = hdr_size;
                if (!best->e_value_inum)
                        footprint += EXT4_XATTR_SIZE(
                                              le32_to_cpu(best->e_value_size));

                ret = ext4_xattr_move_to_block(handle, inode, raw_inode,
                                               best);
                if (ret)
                        return ret;

                *total_ino -= hdr_size;
                ifree += footprint;
                bfree -= footprint;
        }

        return 0;
}

/*
 * Expand an inode by new_extra_isize bytes when EAs are present.
 *
 * If the in-inode xattr area already has enough free space, the entries are
 * simply shifted.  Otherwise entries are moved into the external xattr
 * block (existing or yet to be allocated) to make room.  When the requested
 * size cannot be satisfied, the whole procedure is retried exactly once
 * with the superblock's s_min_extra_isize, provided that value is non-zero.
 *
 * On failure a warning is logged, but only when the superblock's mount
 * count differs from the value remembered in a function-static variable at
 * the time of the previous warning.
 *
 * Returns 0 on success or negative error number on failure.
 */
int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
                               struct ext4_inode *raw_inode, handle_t *handle)
{
        struct ext4_xattr_ibody_header *header;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        /* Mount count seen when the expansion warning was last printed. */
        static unsigned int mnt_count;
        size_t min_offs;
        size_t ifree, bfree;
        int total_ino;
        void *base, *end;
        int error = 0, tried_min_extra_isize = 0;
        int s_min_extra_isize = le16_to_cpu(sbi->s_es->s_min_extra_isize);
        int isize_diff;        /* How much do we need to grow i_extra_isize */

retry:
        isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize;
        /* Already large enough: nothing to do. */
        if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
                return 0;

        header = IHDR(inode, raw_inode);

        /*
         * Check if enough free space is available in the inode to shift the
         * entries ahead by new_extra_isize.
         */

        base = IFIRST(header);
        end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
        min_offs = end - base;
        /*
         * Starts at header size plus one u32; ext4_xattr_free_space() is
         * given a pointer to it and presumably adds the entry sizes --
         * TODO confirm.  The result is the byte count moved in "shift".
         */
        total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);

        error = xattr_check_inode(inode, header, end);
        if (error)
                goto cleanup;

        ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino);
        if (ifree >= isize_diff)
                goto shift;

        /*
         * Enough free space isn't available in the inode, check if
         * EA block can hold new_extra_isize bytes.
         */
        if (EXT4_I(inode)->i_file_acl) {
                struct buffer_head *bh;

                bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
                if (IS_ERR(bh)) {
                        error = PTR_ERR(bh);
                        goto cleanup;
                }
                error = ext4_xattr_check_block(inode, bh);
                if (error) {
                        brelse(bh);
                        goto cleanup;
                }
                base = BHDR(bh);
                end = bh->b_data + bh->b_size;
                min_offs = end - base;
                bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base,
                                              NULL);
                /* Only the free-space figure is needed; drop the buffer. */
                brelse(bh);
                if (bfree + ifree < isize_diff) {
                        /* Not enough room overall: fall back to the minimum once. */
                        if (!tried_min_extra_isize && s_min_extra_isize) {
                                tried_min_extra_isize++;
                                new_extra_isize = s_min_extra_isize;
                                goto retry;
                        }
                        error = -ENOSPC;
                        goto cleanup;
                }
        } else {
                /* No xattr block yet: treat a whole block as available. */
                bfree = inode->i_sb->s_blocksize;
        }

        error = ext4_xattr_make_inode_space(handle, inode, raw_inode,
                                            isize_diff, ifree, bfree,
                                            &total_ino);
        if (error) {
                /* Same single fallback to s_min_extra_isize as above. */
                if (error == -ENOSPC && !tried_min_extra_isize &&
                    s_min_extra_isize) {
                        tried_min_extra_isize++;
                        new_extra_isize = s_min_extra_isize;
                        goto retry;
                }
                goto cleanup;
        }
shift:
        /*
         * Adjust the offsets and shift the remaining entries ahead.  The
         * offset delta passed here is old size minus new size, i.e. not
         * positive; the destination is the header position for the new
         * i_extra_isize and the source is the current header.
         */
        ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize
                        - new_extra_isize, (void *)raw_inode +
                        EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
                        (void *)header, total_ino);
        EXT4_I(inode)->i_extra_isize = new_extra_isize;

        if (ext4_has_inline_data(inode))
                error = ext4_find_inline_data_nolock(inode);

cleanup:
        /* Warn only if the mount count changed since the last warning. */
        if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) {
                ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.",
                             inode->i_ino);
                mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count);
        }
        return error;
}

#define EIA_INCR 16 /* must be 2^n */
#define EIA_MASK (EIA_INCR - 1)

/*
 * Append the large xattr @inode to *@ea_inode_array for deferred iput().
 *
 * The array is allocated on first use with room for EIA_MASK inodes and is
 * reallocated with EIA_INCR more slots each time the count reaches
 * EIA_MASK modulo EIA_INCR; the existing contents are copied over and the
 * old array is freed.
 *
 * Returns 0 on success or -ENOMEM when an allocation fails, in which case
 * the array passed in is left as it was.
 */
static int
ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
                        struct inode *inode)
{
        struct ext4_xattr_inode_array *array = *ea_inode_array;

        if (!array) {
                /*
                 * Start with 15 inodes, so it fits into a power-of-two size.
                 */
                array = kmalloc(offsetof(struct ext4_xattr_inode_array,
                                         inodes[EIA_MASK]),
                                GFP_NOFS);
                if (!array)
                        return -ENOMEM;
                array->count = 0;
                *ea_inode_array = array;
        } else if ((array->count & EIA_MASK) == EIA_MASK) {
                /* expand the array once all 15 + n * 16 slots are full */
                struct ext4_xattr_inode_array *grown;
                int used = array->count;

                grown = kmalloc(offsetof(struct ext4_xattr_inode_array,
                                         inodes[used + EIA_INCR]),
                                GFP_NOFS);
                if (!grown)
                        return -ENOMEM;

                /* Copy the header plus the slots filled so far. */
                memcpy(grown, array,
                       offsetof(struct ext4_xattr_inode_array, inodes[used]));
                kfree(array);
                array = grown;
                *ea_inode_array = array;
        }

        array->inodes[array->count++] = inode;
        return 0;
}

/*
 * ext4_xattr_delete_inode()
 *
 * Free extended attribute resources associated with this inode. Traverse
 * all entries and decrement reference on any xattr inodes associated with this
 * inode. This is called immediately before an inode is freed. We have exclusive
 * access to the inode. If an orphan inode is deleted it will also release its
 * references on xattr block and xattr inodes.
 *
 * @handle:         journal handle the work is done under
 * @inode:          inode whose xattr resources are being released
 * @ea_inode_array: passed through to ext4_xattr_inode_dec_ref_all() and
 *                  ext4_xattr_release_block()
 * @extra_credits:  credits requested from ext4_journal_ensure_credits() and
 *                  forwarded to the helpers above
 *
 * Returns 0 on success, or the negative error number of the first failing
 * step (ensuring credits, locating the inode, getting write access, reading
 * or checking the xattr block, or marking the inode dirty). The return value
 * of ext4_xattr_inode_dec_ref_all() is not examined, and a failure to look up
 * an individual EA inode while walking the xattr block only skips that entry.
 */
int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
                            struct ext4_xattr_inode_array **ea_inode_array,
                            int extra_credits)
{
        struct buffer_head *bh = NULL;
        struct ext4_xattr_ibody_header *header;
        struct ext4_iloc iloc = { .bh = NULL };
        struct ext4_xattr_entry *entry;
        struct inode *ea_inode;
        int error;

        /* Make sure the handle has enough credits before touching anything. */
        error = ext4_journal_ensure_credits(handle, extra_credits,
                        ext4_free_metadata_revoke_credits(inode->i_sb, 1));
        if (error < 0) {
                EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
                goto cleanup;
        }

        /*
         * Step 1: xattrs stored in the inode body. Only relevant when the
         * ea_inode feature is enabled and the inode is flagged as carrying
         * xattrs.
         */
        if (ext4_has_feature_ea_inode(inode->i_sb) &&
            ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {

                error = ext4_get_inode_loc(inode, &iloc);
                if (error) {
                        EXT4_ERROR_INODE(inode, "inode loc (error %d)", error);
                        goto cleanup;
                }

                error = ext4_journal_get_write_access(handle, inode->i_sb,
                                                iloc.bh, EXT4_JTR_NONE);
                if (error) {
                        EXT4_ERROR_INODE(inode, "write access (error %d)",
                                         error);
                        goto cleanup;
                }

                /* Only walk the entries if the in-inode header is valid. */
                header = IHDR(inode, ext4_raw_inode(&iloc));
                if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC))
                        ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
                                                     IFIRST(header),
                                                     false /* block_csum */,
                                                     ea_inode_array,
                                                     extra_credits,
                                                     false /* skip_quota */);
        }

        /* Step 2: the external xattr block, if the inode has one. */
        if (EXT4_I(inode)->i_file_acl) {
                bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
                if (IS_ERR(bh)) {
                        error = PTR_ERR(bh);
                        if (error == -EIO) {
                                EXT4_ERROR_INODE_ERR(inode, EIO,
                                                     "block %llu read error",
                                                     EXT4_I(inode)->i_file_acl);
                        }
                        /*
                         * bh holds an error pointer here; reset it so the
                         * brelse() at cleanup is not handed that value.
                         */
                        bh = NULL;
                        goto cleanup;
                }
                error = ext4_xattr_check_block(inode, bh);
                if (error)
                        goto cleanup;

                if (ext4_has_feature_ea_inode(inode->i_sb)) {
                        /*
                         * Release quota for every entry whose value lives in
                         * a separate EA inode (e_value_inum != 0). Lookup
                         * failures are skipped rather than aborting the walk.
                         */
                        for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
                             entry = EXT4_XATTR_NEXT(entry)) {
                                if (!entry->e_value_inum)
                                        continue;
                                error = ext4_xattr_inode_iget(inode,
                                              le32_to_cpu(entry->e_value_inum),
                                              le32_to_cpu(entry->e_hash),
                                              &ea_inode);
                                if (error)
                                        continue;
                                ext4_xattr_inode_free_quota(inode, ea_inode,
                                              le32_to_cpu(entry->e_value_size));
                                iput(ea_inode);
                        }

                }

                ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
                                         extra_credits);
                /*
                 * Update i_file_acl value in the same transaction that releases
                 * block.
                 */
                EXT4_I(inode)->i_file_acl = 0;
                error = ext4_mark_inode_dirty(handle, inode);
                if (error) {
                        EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)",
                                         error);
                        goto cleanup;
                }
                ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);
        }
        /* Any per-entry lookup error left in 'error' is deliberately dropped. */
        error = 0;
cleanup:
        /* Both pointers are NULL when the corresponding step never ran. */
        brelse(iloc.bh);
        brelse(bh);
        return error;
}

/*
 * ext4_xattr_inode_array_free()
 *
 * Call iput() on each of the first ->count inodes stored in the array and
 * then kfree() the array itself. A NULL array is accepted and ignored.
 */
void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
{
        int i = 0;

        if (!ea_inode_array)
                return;

        while (i < ea_inode_array->count) {
                iput(ea_inode_array->inodes[i]);
                i++;
        }
        kfree(ea_inode_array);
}

/*
 * ext4_xattr_block_cache_insert()
 *
 * Try to add the xattr block in @bh to @ea_block_cache, keyed by the block's
 * header hash. The entry is created as reusable only while the block's
 * refcount is below EXT4_XATTR_REFCOUNT_MAX.
 *
 * Nothing is returned: a NULL cache is silently ignored, -EBUSY from
 * mb_cache_entry_create() is only reported through ea_bdebug() ("already in
 * cache"), and any other creation error is dropped without a message.
 */
static void
ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
                              struct buffer_head *bh)
{
        struct ext4_xattr_header *hdr = BHDR(bh);
        __u32 hash = le32_to_cpu(hdr->h_hash);
        int reusable = le32_to_cpu(hdr->h_refcount) <
                       EXT4_XATTR_REFCOUNT_MAX;
        int rc;

        if (!ea_block_cache)
                return;

        rc = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash,
                                   bh->b_blocknr, reusable);
        if (!rc) {
                ea_bdebug(bh, "inserting [%x]", (int)hash);
        } else if (rc == -EBUSY) {
                ea_bdebug(bh, "already in cache");
        }
}

/*
 * ext4_xattr_cmp()
 *
 * Compare two extended attribute blocks entry by entry.
 *
 * Both entry lists are walked in lock step. Two entries match when their
 * hash, name index, name length, value size, value inode number and name
 * bytes are identical and, for values stored in the block itself
 * (e_value_inum == 0), the value bytes are identical as well.
 *
 * Returns 0 if the blocks are equal and 1 if they differ. No path in this
 * function produces a negative value.
 */
static int
ext4_xattr_cmp(struct ext4_xattr_header *header1,
               struct ext4_xattr_header *header2)
{
        struct ext4_xattr_entry *a = ENTRY(header1+1);
        struct ext4_xattr_entry *b = ENTRY(header2+1);

        for (; !IS_LAST_ENTRY(a);
             a = EXT4_XATTR_NEXT(a), b = EXT4_XATTR_NEXT(b)) {
                /* Second block ran out of entries first. */
                if (IS_LAST_ENTRY(b))
                        return 1;

                /* Fixed-size descriptor fields. */
                if (a->e_hash != b->e_hash)
                        return 1;
                if (a->e_name_index != b->e_name_index)
                        return 1;
                if (a->e_name_len != b->e_name_len)
                        return 1;
                if (a->e_value_size != b->e_value_size)
                        return 1;
                if (a->e_value_inum != b->e_value_inum)
                        return 1;

                /* Attribute name. */
                if (memcmp(a->e_name, b->e_name, a->e_name_len))
                        return 1;

                /* Values held in an EA inode are not compared byte-wise. */
                if (a->e_value_inum)
                        continue;

                if (memcmp((char *)header1 + le16_to_cpu(a->e_value_offs),
                           (char *)header2 + le16_to_cpu(b->e_value_offs),
                           le32_to_cpu(a->e_value_size)))
                        return 1;
        }

        /* Equal only if the second list ends at the same point. */
        return IS_LAST_ENTRY(b) ? 0 : 1;
}

/*
 * ext4_xattr_block_cache_find()
 *
 * Look through the inode's xattr block cache for a block whose contents are
 * identical to @header.
 *
 * Returns the buffer head of the matching block and stores the corresponding
 * cache entry in *@pce; that entry is not put here. Returns NULL when there
 * is no cache, when @header has a zero hash, or when no candidate matches.
 * Returns an error pointer if reading a candidate block fails; in that case
 * the cache entry has already been put and *@pce is left untouched.
 */
static struct buffer_head *
ext4_xattr_block_cache_find(struct inode *inode,
                            struct ext4_xattr_header *header,
                            struct mb_cache_entry **pce)
{
        struct mb_cache *cache = EA_BLOCK_CACHE(inode);
        __u32 hash = le32_to_cpu(header->h_hash);
        struct mb_cache_entry *ce;

        /* A zero hash marks a block that must never be shared. */
        if (!cache || !header->h_hash)
                return NULL;

        ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);

        for (ce = mb_cache_entry_find_first(cache, hash); ce;
             ce = mb_cache_entry_find_next(cache, ce)) {
                struct buffer_head *bh;

                bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
                if (IS_ERR(bh)) {
                        /* Out-of-memory is not reported as an inode error. */
                        if (PTR_ERR(bh) != -ENOMEM)
                                EXT4_ERROR_INODE(inode, "block %lu read error",
                                                 (unsigned long)ce->e_value);
                        mb_cache_entry_put(cache, ce);
                        return bh;
                }

                if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
                        *pce = ce;
                        return bh;
                }

                /* Same hash but different contents: try the next one. */
                brelse(bh);
        }

        return NULL;
}

#define NAME_HASH_SHIFT 5
#define VALUE_HASH_SHIFT 16

/*
 * ext4_xattr_hash_entry()
 *
 * Compute the hash of an extended attribute.
 *
 * The hash starts at 0. Each of the @name_len name bytes is mixed in as an
 * unsigned char after shifting the running hash left by NAME_HASH_SHIFT bits
 * and folding the bits shifted out back in at the bottom; then each of the
 * @value_count little-endian 32-bit words of @value is mixed in the same way
 * using VALUE_HASH_SHIFT. The result is returned in little-endian form.
 */
static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value,
                                    size_t value_count)
{
        __u32 hash = 0;
        size_t i;

        for (i = 0; i < name_len; i++) {
                __u32 mixed = (hash << NAME_HASH_SHIFT) ^
                              (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT));

                hash = mixed ^ (unsigned char)name[i];
        }

        for (i = 0; i < value_count; i++) {
                __u32 mixed = (hash << VALUE_HASH_SHIFT) ^
                              (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT));

                hash = mixed ^ le32_to_cpu(value[i]);
        }

        return cpu_to_le32(hash);
}

/*
 * ext4_xattr_hash_entry_signed()
 *
 * Compute the hash of an extended attribute incorrectly.
 *
 * Identical to the unsigned variant except that every name byte is mixed in
 * as a signed char, so a byte with its top bit set is sign-extended before
 * being XORed into the hash. The value words are handled the same way as in
 * the unsigned variant.
 * NOTE(review): presumably kept to recognise hashes written by code that
 * treated name bytes as signed -- confirm against the callers.
 */
static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, size_t value_count)
{
        __u32 hash = 0;
        size_t i;

        for (i = 0; i < name_len; i++) {
                __u32 mixed = (hash << NAME_HASH_SHIFT) ^
                              (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT));

                hash = mixed ^ (signed char)name[i];
        }

        for (i = 0; i < value_count; i++) {
                __u32 mixed = (hash << VALUE_HASH_SHIFT) ^
                              (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT));

                hash = mixed ^ le32_to_cpu(value[i]);
        }

        return cpu_to_le32(hash);
}

#undef NAME_HASH_SHIFT
#undef VALUE_HASH_SHIFT

#define BLOCK_HASH_SHIFT 16

/*
 * ext4_xattr_rehash()
 *
 * Re-compute the extended attribute hash value after an entry has changed.
 *
 * The block hash folds together the e_hash of every entry that follows
 * @header, in order, and is stored little-endian in header->h_hash. If any
 * entry has a zero e_hash, the block hash is forced to 0 instead.
 */
static void ext4_xattr_rehash(struct ext4_xattr_header *header)
{
        struct ext4_xattr_entry *entry;
        __u32 block_hash = 0;

        for (entry = ENTRY(header+1); !IS_LAST_ENTRY(entry);
             entry = EXT4_XATTR_NEXT(entry)) {
                if (!entry->e_hash) {
                        /* Block is not shared if an entry's hash value == 0 */
                        block_hash = 0;
                        break;
                }
                block_hash = (block_hash << BLOCK_HASH_SHIFT) ^
                             (block_hash >> (8*sizeof(block_hash) - BLOCK_HASH_SHIFT)) ^
                             le32_to_cpu(entry->e_hash);
        }

        header->h_hash = cpu_to_le32(block_hash);
}

#undef BLOCK_HASH_SHIFT

#define        HASH_BUCKET_BITS        10

/*
 * ext4_xattr_create_cache()
 *
 * Create an mb_cache with HASH_BUCKET_BITS passed as the bucket-bits
 * argument and hand back whatever mb_cache_create() returned.
 */
struct mb_cache *
ext4_xattr_create_cache(void)
{
        struct mb_cache *cache = mb_cache_create(HASH_BUCKET_BITS);

        return cache;
}

/*
 * ext4_xattr_destroy_cache()
 *
 * Destroy @cache with mb_cache_destroy(). A NULL @cache is ignored.
 */
void ext4_xattr_destroy_cache(struct mb_cache *cache)
{
        if (!cache)
                return;

        mb_cache_destroy(cache);
}
































































































































































































































































    1 









    1 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
/*
 * linux/fs/nls/nls_cp857.c
 *
 * Charset cp857 translation tables.
 * Generated automatically from the Unicode and charset
 * tables from the Unicode Organization (www.unicode.org).
 * The Unicode to charset table has only exact mappings.
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/nls.h>
#include <linux/errno.h>

/*
 * cp857 byte -> Unicode code point, indexed by the charset byte.
 *
 * Bytes 0x00-0x7f map to the identical code point. An entry of 0x0000 is
 * treated by char2uni() as "no mapping" (it returns -EINVAL); besides index
 * 0x00 itself, that applies to indices 0xd5, 0xe7 and 0xf2.
 */
static const wchar_t charset2uni[256] = {
        /* 0x00*/
        0x0000, 0x0001, 0x0002, 0x0003,
        0x0004, 0x0005, 0x0006, 0x0007,
        0x0008, 0x0009, 0x000a, 0x000b,
        0x000c, 0x000d, 0x000e, 0x000f,
        /* 0x10*/
        0x0010, 0x0011, 0x0012, 0x0013,
        0x0014, 0x0015, 0x0016, 0x0017,
        0x0018, 0x0019, 0x001a, 0x001b,
        0x001c, 0x001d, 0x001e, 0x001f,
        /* 0x20*/
        0x0020, 0x0021, 0x0022, 0x0023,
        0x0024, 0x0025, 0x0026, 0x0027,
        0x0028, 0x0029, 0x002a, 0x002b,
        0x002c, 0x002d, 0x002e, 0x002f,
        /* 0x30*/
        0x0030, 0x0031, 0x0032, 0x0033,
        0x0034, 0x0035, 0x0036, 0x0037,
        0x0038, 0x0039, 0x003a, 0x003b,
        0x003c, 0x003d, 0x003e, 0x003f,
        /* 0x40*/
        0x0040, 0x0041, 0x0042, 0x0043,
        0x0044, 0x0045, 0x0046, 0x0047,
        0x0048, 0x0049, 0x004a, 0x004b,
        0x004c, 0x004d, 0x004e, 0x004f,
        /* 0x50*/
        0x0050, 0x0051, 0x0052, 0x0053,
        0x0054, 0x0055, 0x0056, 0x0057,
        0x0058, 0x0059, 0x005a, 0x005b,
        0x005c, 0x005d, 0x005e, 0x005f,
        /* 0x60*/
        0x0060, 0x0061, 0x0062, 0x0063,
        0x0064, 0x0065, 0x0066, 0x0067,
        0x0068, 0x0069, 0x006a, 0x006b,
        0x006c, 0x006d, 0x006e, 0x006f,
        /* 0x70*/
        0x0070, 0x0071, 0x0072, 0x0073,
        0x0074, 0x0075, 0x0076, 0x0077,
        0x0078, 0x0079, 0x007a, 0x007b,
        0x007c, 0x007d, 0x007e, 0x007f,
        /* 0x80*/
        0x00c7, 0x00fc, 0x00e9, 0x00e2,
        0x00e4, 0x00e0, 0x00e5, 0x00e7,
        0x00ea, 0x00eb, 0x00e8, 0x00ef,
        0x00ee, 0x0131, 0x00c4, 0x00c5,
        /* 0x90*/
        0x00c9, 0x00e6, 0x00c6, 0x00f4,
        0x00f6, 0x00f2, 0x00fb, 0x00f9,
        0x0130, 0x00d6, 0x00dc, 0x00f8,
        0x00a3, 0x00d8, 0x015e, 0x015f,
        /* 0xa0*/
        0x00e1, 0x00ed, 0x00f3, 0x00fa,
        0x00f1, 0x00d1, 0x011e, 0x011f,
        0x00bf, 0x00ae, 0x00ac, 0x00bd,
        0x00bc, 0x00a1, 0x00ab, 0x00bb,
        /* 0xb0*/
        0x2591, 0x2592, 0x2593, 0x2502,
        0x2524, 0x00c1, 0x00c2, 0x00c0,
        0x00a9, 0x2563, 0x2551, 0x2557,
        0x255d, 0x00a2, 0x00a5, 0x2510,
        /* 0xc0*/
        0x2514, 0x2534, 0x252c, 0x251c,
        0x2500, 0x253c, 0x00e3, 0x00c3,
        0x255a, 0x2554, 0x2569, 0x2566,
        0x2560, 0x2550, 0x256c, 0x00a4,
        /* 0xd0*/
        0x00ba, 0x00aa, 0x00ca, 0x00cb,
        0x00c8, 0x0000, 0x00cd, 0x00ce,
        0x00cf, 0x2518, 0x250c, 0x2588,
        0x2584, 0x00a6, 0x00cc, 0x2580,
        /* 0xe0*/
        0x00d3, 0x00df, 0x00d4, 0x00d2,
        0x00f5, 0x00d5, 0x00b5, 0x0000,
        0x00d7, 0x00da, 0x00db, 0x00d9,
        0x00ec, 0x00ff, 0x00af, 0x00b4,
        /* 0xf0*/
        0x00ad, 0x00b1, 0x0000, 0x00be,
        0x00b6, 0x00a7, 0x00f7, 0x00b8,
        0x00b0, 0x00a8, 0x00b7, 0x00b9,
        0x00b3, 0x00b2, 0x25a0, 0x00a0,
};

/*
 * Unicode -> cp857 for code points U+0000..U+00FF, indexed by the low byte
 * of the code point. uni2char() treats a 0x00 entry as "no mapping".
 */
static const unsigned char page00[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
        0xff, 0xad, 0xbd, 0x9c, 0xcf, 0xbe, 0xdd, 0xf5, /* 0xa0-0xa7 */
        0xf9, 0xb8, 0xd1, 0xae, 0xaa, 0xf0, 0xa9, 0xee, /* 0xa8-0xaf */
        0xf8, 0xf1, 0xfd, 0xfc, 0xef, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */
        0xf7, 0xfb, 0xd0, 0xaf, 0xac, 0xab, 0xf3, 0xa8, /* 0xb8-0xbf */
        0xb7, 0xb5, 0xb6, 0xc7, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */
        0xd4, 0x90, 0xd2, 0xd3, 0xde, 0xd6, 0xd7, 0xd8, /* 0xc8-0xcf */
        0x00, 0xa5, 0xe3, 0xe0, 0xe2, 0xe5, 0x99, 0xe8, /* 0xd0-0xd7 */
        0x9d, 0xeb, 0xe9, 0xea, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */
        0x85, 0xa0, 0x83, 0xc6, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */
        0x8a, 0x82, 0x88, 0x89, 0xec, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */
        0x00, 0xa4, 0x95, 0xa2, 0x93, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */
        0x9b, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0xed, /* 0xf8-0xff */
};

/*
 * Unicode -> cp857 for code points U+0100..U+01FF, indexed by the low byte
 * of the code point. Only indices 0x00-0x5f are listed; the remaining
 * elements of the 256-entry array are implicitly zero, which uni2char()
 * treats as "no mapping".
 */
static const unsigned char page01[256] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa6, 0xa7, /* 0x18-0x1f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
        0x98, 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, 0x9f, /* 0x58-0x5f */
};

/*
 * Unicode -> cp857 for code points U+2500..U+25FF, indexed by the low byte
 * of the code point. Only indices 0x00-0xa7 are listed; the remaining
 * elements of the 256-entry array are implicitly zero, which uni2char()
 * treats as "no mapping".
 */
static const unsigned char page25[256] = {
        0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
        0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
        0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
        0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
        0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
        0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
        0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
        0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
        0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */
        0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */
        0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */
        0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */

        0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
        0xdb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
        0x00, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
        0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
};

/*
 * Top-level Unicode -> cp857 lookup, indexed by the high byte of the code
 * point. Only pages 0x00, 0x01 and 0x25 have a table; every other slot is
 * NULL (listed explicitly up to index 0x27, implicitly beyond that), which
 * uni2char() reports as -EINVAL.
 */
static const unsigned char *const page_uni2charset[256] = {
        page00, page01, NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   page25, NULL,   NULL,   
};

/*
 * Per-byte lowercase table, installed as .charset2lower of the nls_table
 * below. Indexed by the cp857 byte; for example 0x41 maps to 0x61, while
 * bytes without a distinct lowercase form map to themselves. Indices 0xd5,
 * 0xe7 and 0xf2 hold 0x00.
 */
static const unsigned char charset2lower[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
        0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */
        0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x69, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9f, 0x9f, /* 0x98-0x9f */
        0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa7, 0xa7, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xa0, 0x83, 0x85, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0x88, 0x89, 0x8a, 0x00, 0xa1, 0x8c, /* 0xd0-0xd7 */
        0x8b, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xec, 0xdf, /* 0xd8-0xdf */
        0xa2, 0xe1, 0x93, 0x95, 0xe4, 0xe4, 0xe6, 0x00, /* 0xe0-0xe7 */
        0xe8, 0xa3, 0x96, 0x97, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
};

/*
 * Per-byte uppercase table, installed as .charset2upper of the nls_table
 * below. Indexed by the cp857 byte; for example 0x61 maps to 0x41, while
 * bytes without a distinct uppercase form map to themselves. Several
 * entries in the 0xd0-0xf7 range hold 0x00.
 */
static const unsigned char charset2upper[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
        0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x9a, 0x90, 0xb6, 0x8e, 0xb7, 0x8f, 0x80, /* 0x80-0x87 */
        0xd2, 0xd3, 0xd4, 0xd8, 0xd7, 0x49, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x92, 0x92, 0xe2, 0x99, 0xe3, 0xea, 0xeb, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9e, /* 0x98-0x9f */
        0xb5, 0xd6, 0xe0, 0xe9, 0xa5, 0xa5, 0xa6, 0xa6, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc7, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0x00, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xde, 0x00, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0x00, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
};

static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
{
        const unsigned char *uni2charset;
        unsigned char cl = uni & 0x00ff;
        unsigned char ch = (uni & 0xff00) >> 8;

        if (boundlen <= 0)
                return -ENAMETOOLONG;

        uni2charset = page_uni2charset[ch];
        if (uni2charset && uni2charset[cl])
                out[0] = uni2charset[cl];
        else
                return -EINVAL;
        return 1;
}

/*
 * Convert one cp857 byte to its Unicode code point.
 *
 * @rawstring: input; only the first byte is read
 * @boundlen:  not consulted by this implementation
 * @uni:       receives the code point (written even when the lookup fails)
 *
 * Returns 1 (the number of input bytes consumed) on success, or -EINVAL when
 * the table entry for the byte is 0x0000.
 */
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
{
        wchar_t mapped = charset2uni[*rawstring];

        *uni = mapped;
        return mapped == 0x0000 ? -EINVAL : 1;
}

/*
 * NLS descriptor for cp857: ties the charset name to the two conversion
 * routines and the two case-mapping tables defined above. This is the object
 * handed to register_nls()/unregister_nls().
 */
static struct nls_table table = {
        .charset        = "cp857",
        .uni2char        = uni2char,
        .char2uni        = char2uni,
        .charset2lower        = charset2lower,
        .charset2upper        = charset2upper,
};

/* Module entry point: register the cp857 table and pass on the result. */
static int __init init_nls_cp857(void)
{
        int err = register_nls(&table);

        return err;
}

/* Module exit point: unregister the cp857 table registered at init time. */
static void __exit exit_nls_cp857(void)
{
        unregister_nls(&table);
}

/* Hook the init/exit routines into the module loader and declare the license. */
module_init(init_nls_cp857)
module_exit(exit_nls_cp857)

MODULE_LICENSE("Dual BSD/GPL");







































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 
























































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
/* $Id: capi.c,v 1.1.2.7 2004/04/28 09:48:59 armin Exp $
 *
 * CAPI 2.0 Interface for Linux
 *
 * Copyright 1996 by Carsten Paeth <calle@calle.de>
 *
 * This software may be used and distributed according to the terms
 * of the GNU General Public License, incorporated herein by reference.
 *
 */

#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/ethtool.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/fcntl.h>
#include <linux/fs.h>
#include <linux/signal.h>
#include <linux/mutex.h>
#include <linux/mm.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/tty.h>
#include <linux/netdevice.h>
#include <linux/ppp_defs.h>
#include <linux/ppp-ioctl.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/capi.h>
#include <linux/kernelcapi.h>
#include <linux/init.h>
#include <linux/device.h>
#include <linux/moduleparam.h>
#include <linux/isdn/capiutil.h>
#include <linux/isdn/capicmd.h>

#include "kcapi.h"

/* Module metadata: description, author and license of this driver. */
MODULE_DESCRIPTION("CAPI4Linux: kernel CAPI layer and /dev/capi20 interface");
MODULE_AUTHOR("Carsten Paeth");
MODULE_LICENSE("GPL");

/* -------- driver information -------------------------------------- */

/* Serializes the ioctl entry points (capi_unlocked_ioctl/capi_compat_ioctl). */
static DEFINE_MUTEX(capi_mutex);
/* Device class named "capi"; its users are outside this part of the file. */
static const struct class capi_class = {
        .name = "capi",
};
/*
 * Major number, overridable through the "major" module parameter.
 * NOTE(review): the variable is declared int but the parameter type is
 * uint - confirm this mismatch is intentional before changing either.
 */
static int capi_major = 68;                /* allocated */

module_param_named(major, capi_major, uint, 0);

#ifdef CONFIG_ISDN_CAPI_MIDDLEWARE
#define CAPINC_NR_PORTS                32
#define CAPINC_MAX_PORTS        256

static int capi_ttyminors = CAPINC_NR_PORTS;

module_param_named(ttyminors, capi_ttyminors, uint, 0);
#endif /* CONFIG_ISDN_CAPI_MIDDLEWARE */

/* -------- defines ------------------------------------------------- */

/*
 * Queue depth and block size limits.
 * NOTE(review): the users of these constants are outside this section of
 * the file; the names suggest receive/send queue limits and a maximum
 * data block size in bytes - confirm against the callers.
 */
#define CAPINC_MAX_RECVQUEUE        10
#define CAPINC_MAX_SENDQUEUE        10
#define CAPI_MAX_BLKSIZE        2048

/* -------- data structures ----------------------------------------- */

/* Forward declarations; the structures reference each other by pointer. */
struct capidev;
struct capincci;
struct capiminor;

/*
 * One outstanding DATA_B3_REQ: queued by capiminor_add_ack() when the request
 * is sent and removed by capiminor_del_ack() once it is confirmed or the send
 * fails.
 */
struct ackqueue_entry {
        /* Link in capiminor.ackqueue. */
        struct list_head        list;
        /* Data handle written into the corresponding DATA_B3_REQ. */
        u16                        datahandle;
};

/*
 * Per-NCCI tty endpoint. The embedded tty_port carries the reference count;
 * when it drops to zero capiminor_destroy() frees the whole structure.
 */
struct capiminor {
        /* Index into capiminors[] and minor registered with capinc_tty_driver. */
        unsigned int      minor;

        /* CAPI application used to send messages for this minor. */
        struct capi20_appl        *ap;
        /* NCCI this minor is bound to. */
        u32                        ncci;
        /* Counter supplying the data handle of each outgoing DATA_B3_REQ. */
        atomic_t                datahandle;
        /* Counter supplying the message number of generated messages. */
        atomic_t                msgid;

        struct tty_port port;
        /* Non-zero: handle_recv_skb() holds back received data. */
        int                ttyinstop;
        /* Non-zero: handle_minor_send() does not transmit. */
        int                ttyoutstop;

        /* Received DATA_B3_IND messages waiting to be passed to the ldisc. */
        struct sk_buff_head        inqueue;

        /* Payload skbs waiting to be sent as DATA_B3_REQ. */
        struct sk_buff_head        outqueue;
        /* Sum of the payload lengths currently in outqueue. */
        int                        outbytes;
        /* Freed in capiminor_destroy(); filled by code outside this section. */
        struct sk_buff                *outskb;
        /* Taken around outqueue/outbytes updates in handle_minor_send(). */
        spinlock_t                outlock;

        /* transmit path */
        /* List of ackqueue_entry for requests not yet confirmed. */
        struct list_head ackqueue;
        /* Number of entries in ackqueue. */
        int nack;
        /* Protects ackqueue and nack. */
        spinlock_t ackqlock;
};

/* One NCCI (logical connection) known to a capidev. */
struct capincci {
        /* Link in capidev.nccis. */
        struct list_head list;
        u32                 ncci;
        /* Owning device. */
        struct capidev        *cdev;
#ifdef CONFIG_ISDN_CAPI_MIDDLEWARE
        /* tty endpoint for this NCCI, or NULL if none was allocated. */
        struct capiminor *minorp;
#endif /* CONFIG_ISDN_CAPI_MIDDLEWARE */
};

/* State of one open CAPI character device file (allocated in capi_open()). */
struct capidev {
        /* Link in the global capidev_list. */
        struct list_head list;
        /* CAPI application; ap.applid is 0 until CAPI_REGISTER succeeds. */
        struct capi20_appl ap;
        /* Last CAPI error code; read and reset by the CAPI_GET_ERRCODE ioctl. */
        u16                errcode;
        /* Flags set/cleared through CAPI_SET_FLAGS / CAPI_CLR_FLAGS. */
        unsigned        userflags;

        /* Messages waiting to be returned by capi_read(). */
        struct sk_buff_head recvqueue;
        /* Woken whenever a message is appended to recvqueue. */
        wait_queue_head_t recvwait;

        /* List of struct capincci belonging to this device. */
        struct list_head nccis;

        /* Held while the nccis list, userflags or registration state change. */
        struct mutex lock;
};

/* -------- global variables ---------------------------------------- */

/* All open capidev instances; capidev_list_lock guards list changes. */
static DEFINE_MUTEX(capidev_list_lock);
static LIST_HEAD(capidev_list);

#ifdef CONFIG_ISDN_CAPI_MIDDLEWARE

/* Table of allocated minors indexed by minor number, guarded by the lock. */
static DEFINE_SPINLOCK(capiminors_lock);
static struct capiminor **capiminors;

/* tty driver the minors are registered with. */
static struct tty_driver *capinc_tty_driver;

/* -------- datahandles --------------------------------------------- */

/*
 * Record @datahandle as an outstanding (not yet confirmed) data request
 * on @mp. The entry is appended to the ack queue under ackqlock.
 *
 * Returns 0 on success or -1 if no memory is available for the entry.
 */
static int capiminor_add_ack(struct capiminor *mp, u16 datahandle)
{
        struct ackqueue_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);

        if (unlikely(!entry)) {
                printk(KERN_ERR "capi: alloc datahandle failed\n");
                return -1;
        }

        INIT_LIST_HEAD(&entry->list);
        entry->datahandle = datahandle;

        spin_lock_bh(&mp->ackqlock);
        mp->nack++;
        list_add_tail(&entry->list, &mp->ackqueue);
        spin_unlock_bh(&mp->ackqlock);

        return 0;
}

/*
 * Drop the first ack queue entry of @mp whose handle equals @datahandle.
 * The entry is unlinked under ackqlock and freed after the lock is released.
 *
 * Returns 0 if an entry was removed, -1 if no entry matched.
 */
static int capiminor_del_ack(struct capiminor *mp, u16 datahandle)
{
        struct ackqueue_entry *cur, *next;
        int ret = -1;

        spin_lock_bh(&mp->ackqlock);
        list_for_each_entry_safe(cur, next, &mp->ackqueue, list) {
                if (cur->datahandle != datahandle)
                        continue;

                list_del(&cur->list);
                mp->nack--;
                ret = 0;
                break;
        }
        spin_unlock_bh(&mp->ackqlock);

        /* cur is only meaningful when the loop was left through the break. */
        if (ret == 0)
                kfree(cur);

        return ret;
}

/*
 * Empty the ack queue of @mp, freeing every entry. No lock is taken here;
 * the only caller in this section is the destructor capiminor_destroy().
 */
static void capiminor_del_all_ack(struct capiminor *mp)
{
        struct ackqueue_entry *cur, *next;

        list_for_each_entry_safe(cur, next, &mp->ackqueue, list) {
                list_del(&cur->list);
                mp->nack--;
                kfree(cur);
        }
}


/* -------- struct capiminor ---------------------------------------- */

/*
 * tty_port destructor (installed through capiminor_port_ops): releases all
 * buffers still owned by the minor and then the minor itself.
 */
static void capiminor_destroy(struct tty_port *port)
{
        struct capiminor *mp = container_of(port, struct capiminor, port);

        /* Drop whatever is still queued in either direction. */
        skb_queue_purge(&mp->inqueue);
        skb_queue_purge(&mp->outqueue);
        kfree_skb(mp->outskb);

        capiminor_del_all_ack(mp);

        kfree(mp);
}

/* Port operations: only the destructor is provided (see capiminor_destroy). */
static const struct tty_port_operations capiminor_port_ops = {
        .destruct = capiminor_destroy,
};

/*
 * Create a tty minor for NCCI @ncci of application @ap.
 *
 * The new minor takes the lowest free slot in capiminors[] and is registered
 * with capinc_tty_driver. On any failure after the port has been initialised
 * the port reference is dropped, which frees the minor through its
 * destructor.
 *
 * Returns the new minor, or NULL on allocation failure, when no slot is
 * free, or when the tty device cannot be registered.
 */
static struct capiminor *capiminor_alloc(struct capi20_appl *ap, u32 ncci)
{
        struct capiminor *mp = kzalloc(sizeof(*mp), GFP_KERNEL);
        struct device *ttydev;
        unsigned int slot;

        if (!mp) {
                printk(KERN_ERR "capi: can't alloc capiminor\n");
                return NULL;
        }

        mp->ap = ap;
        mp->ncci = ncci;

        skb_queue_head_init(&mp->inqueue);
        skb_queue_head_init(&mp->outqueue);
        spin_lock_init(&mp->outlock);

        INIT_LIST_HEAD(&mp->ackqueue);
        spin_lock_init(&mp->ackqlock);

        tty_port_init(&mp->port);
        mp->port.ops = &capiminor_port_ops;

        /* Claim the lowest-numbered free slot. */
        spin_lock(&capiminors_lock);
        slot = 0;
        while (slot < capi_ttyminors && capiminors[slot])
                slot++;
        if (slot < capi_ttyminors)
                capiminors[slot] = mp;
        spin_unlock(&capiminors_lock);

        if (slot == capi_ttyminors) {
                printk(KERN_NOTICE "capi: out of minors\n");
                goto put_port;
        }

        mp->minor = slot;

        ttydev = tty_port_register_device(&mp->port, capinc_tty_driver, slot,
                                          NULL);
        if (IS_ERR(ttydev))
                goto release_slot;

        return mp;

release_slot:
        spin_lock(&capiminors_lock);
        capiminors[slot] = NULL;
        spin_unlock(&capiminors_lock);

put_port:
        tty_port_put(&mp->port);
        return NULL;
}

/*
 * Look up the minor stored in slot @minor and take a reference on its port.
 * The lookup and the reference are done under capiminors_lock so the slot
 * cannot be cleared in between.
 *
 * Returns the minor, or NULL if the slot is empty. No range check is done
 * on @minor here.
 */
static struct capiminor *capiminor_get(unsigned int minor)
{
        struct capiminor *mp;

        spin_lock(&capiminors_lock);
        mp = capiminors[minor];
        if (mp)
                tty_port_get(&mp->port);
        spin_unlock(&capiminors_lock);

        return mp;
}

/* Drop one reference on the port embedded in @mp. */
static inline void capiminor_put(struct capiminor *mp)
{
        tty_port_put(&mp->port);
}

/*
 * Tear down a minor created by capiminor_alloc(): unregister its tty
 * device, clear its slot in capiminors[] and drop a port reference.
 */
static void capiminor_free(struct capiminor *mp)
{
        tty_unregister_device(capinc_tty_driver, mp->minor);

        spin_lock(&capiminors_lock);
        capiminors[mp->minor] = NULL;
        spin_unlock(&capiminors_lock);

        capiminor_put(mp);
}

/* -------- struct capincci ----------------------------------------- */

/*
 * Attach a tty minor to @np, but only when the owning device has the
 * CAPIFLAG_HIGHJACKING user flag set. np->minorp is left untouched otherwise
 * and becomes NULL if the minor cannot be allocated.
 */
static void capincci_alloc_minor(struct capidev *cdev, struct capincci *np)
{
        if (!(cdev->userflags & CAPIFLAG_HIGHJACKING))
                return;

        np->minorp = capiminor_alloc(&cdev->ap, np->ncci);
}

/*
 * Release the tty minor attached to @np, if any. A tty currently bound to
 * the port is hung up first.
 */
static void capincci_free_minor(struct capincci *np)
{
        struct capiminor *mp = np->minorp;
        struct tty_struct *tty;

        if (!mp)
                return;

        tty = tty_port_tty_get(&mp->port);
        if (tty) {
                tty_vhangup(tty);
                tty_kref_put(tty);
        }

        capiminor_free(mp);
}

/*
 * Report the open count of the tty bound to the minor of @np.
 * Returns 0 when @np has no minor or the minor has no tty.
 */
static inline unsigned int capincci_minor_opencount(struct capincci *np)
{
        struct capiminor *mp = np->minorp;
        struct tty_struct *tty;
        unsigned int opens;

        if (!mp)
                return 0;

        tty = tty_port_tty_get(&mp->port);
        if (!tty)
                return 0;

        opens = tty->count;
        tty_kref_put(tty);

        return opens;
}

#else /* !CONFIG_ISDN_CAPI_MIDDLEWARE */

/* Without middleware support there are no tty minors: both hooks are no-ops. */
static inline void
capincci_alloc_minor(struct capidev *cdev, struct capincci *np) { }
static inline void capincci_free_minor(struct capincci *np) { }

#endif /* !CONFIG_ISDN_CAPI_MIDDLEWARE */

/*
 * Create the bookkeeping entry for NCCI @ncci and append it to the NCCI
 * list of @cdev. A tty minor is attached as well if
 * capincci_alloc_minor() decides so.
 *
 * Returns the new entry or NULL if it could not be allocated.
 */
static struct capincci *capincci_alloc(struct capidev *cdev, u32 ncci)
{
        struct capincci *entry = kzalloc(sizeof(*entry), GFP_KERNEL);

        if (!entry)
                return NULL;

        entry->cdev = cdev;
        entry->ncci = ncci;

        capincci_alloc_minor(cdev, entry);

        list_add_tail(&entry->list, &cdev->nccis);

        return entry;
}

/*
 * Remove every entry of @cdev whose NCCI equals @ncci, together with its
 * minor. Passing 0xffffffff as @ncci removes all entries.
 */
static void capincci_free(struct capidev *cdev, u32 ncci)
{
        struct capincci *cur, *next;

        list_for_each_entry_safe(cur, next, &cdev->nccis, list) {
                if (ncci != 0xffffffff && cur->ncci != ncci)
                        continue;

                capincci_free_minor(cur);
                list_del(&cur->list);
                kfree(cur);
        }
}

#ifdef CONFIG_ISDN_CAPI_MIDDLEWARE
/*
 * Search the NCCI list of @cdev for @ncci.
 * Returns the first matching entry or NULL if there is none.
 */
static struct capincci *capincci_find(struct capidev *cdev, u32 ncci)
{
        struct capincci *cur;

        list_for_each_entry(cur, &cdev->nccis, list) {
                if (cur->ncci != ncci)
                        continue;
                return cur;
        }

        return NULL;
}

/* -------- handle data queue --------------------------------------- */

/*
 * Build the DATA_B3_RESP that acknowledges the DATA_B3_IND in @skb.
 * The response carries the application id and NCCI of @mp, a fresh message
 * number and the data handle copied from the indication.
 *
 * Returns the new skb, or NULL if it could not be allocated.
 */
static struct sk_buff *
gen_data_b3_resp_for(struct capiminor *mp, struct sk_buff *skb)
{
        struct sk_buff *resp = alloc_skb(CAPI_DATA_B3_RESP_LEN, GFP_KERNEL);
        unsigned char *msg;
        u16 datahandle;

        if (!resp)
                return resp;

        datahandle = CAPIMSG_U16(skb->data, CAPIMSG_BASELEN + 4 + 4 + 2);
        msg = skb_put(resp, CAPI_DATA_B3_RESP_LEN);

        capimsg_setu16(msg, 0, CAPI_DATA_B3_RESP_LEN);
        capimsg_setu16(msg, 2, mp->ap->applid);
        capimsg_setu8(msg, 4, CAPI_DATA_B3);
        capimsg_setu8(msg, 5, CAPI_RESP);
        capimsg_setu16(msg, 6, atomic_inc_return(&mp->msgid));
        capimsg_setu32(msg, 8, mp->ncci);
        capimsg_setu16(msg, 12, datahandle);

        return resp;
}

/*
 * Deliver one received DATA_B3_IND (@skb) to the line discipline of the tty
 * bound to @mp and acknowledge it with a DATA_B3_RESP.
 *
 * Returns 0 when @skb has been consumed (delivered or dropped) and -1 when
 * it was left untouched and the caller should requeue it:
 *   - no tty is attached, the tty is throttled (ttyinstop), the tty has too
 *     little receive room, the response could not be allocated, or the send
 *     queue is full                                  -> -1, skb kept
 *   - no line discipline, no receive_buf handler, or the response could not
 *     be sent for another reason                     -> 0, skb dropped
 *   - response sent                                  -> 0, payload delivered
 */
static int handle_recv_skb(struct capiminor *mp, struct sk_buff *skb)
{
        unsigned int datalen = skb->len - CAPIMSG_LEN(skb->data);
        struct tty_struct *tty;
        struct sk_buff *nskb;
        u16 errcode, datahandle;
        struct tty_ldisc *ld;
        int ret = -1;

        tty = tty_port_tty_get(&mp->port);
        if (!tty) {
                pr_debug("capi: currently no receiver\n");
                return -1;
        }

        ld = tty_ldisc_ref(tty);
        if (!ld) {
                /* fatal error, do not requeue */
                ret = 0;
                kfree_skb(skb);
                goto deref_tty;
        }

        if (ld->ops->receive_buf == NULL) {
                pr_debug("capi: ldisc has no receive_buf function\n");
                /* fatal error, do not requeue */
                goto free_skb;
        }
        if (mp->ttyinstop) {
                pr_debug("capi: recv tty throttled\n");
                goto deref_ldisc;
        }

        if (tty->receive_room < datalen) {
                pr_debug("capi: no room in tty\n");
                goto deref_ldisc;
        }

        nskb = gen_data_b3_resp_for(mp, skb);
        if (!nskb) {
                printk(KERN_ERR "capi: gen_data_b3_resp failed\n");
                goto deref_ldisc;
        }

        /*
         * Data handle of the indication, used for the debug message only.
         * It follows NCCI (4), Data (4) and Data length (2), the same
         * offset gen_data_b3_resp_for() copies into the response.
         */
        datahandle = CAPIMSG_U16(skb->data, CAPIMSG_BASELEN + 4 + 4 + 2);

        errcode = capi20_put_message(mp->ap, nskb);

        if (errcode == CAPI_NOERROR) {
                /* Strip the CAPI header so only the payload reaches the ldisc. */
                skb_pull(skb, CAPIMSG_LEN(skb->data));
                pr_debug("capi: DATA_B3_RESP %u len=%d => ldisc\n",
                         datahandle, skb->len);
                ld->ops->receive_buf(tty, skb->data, NULL, skb->len);
        } else {
                printk(KERN_ERR "capi: send DATA_B3_RESP failed=%x\n",
                       errcode);
                kfree_skb(nskb);

                /* A full send queue is temporary: keep skb for a retry. */
                if (errcode == CAPI_SENDQUEUEFULL)
                        goto deref_ldisc;
        }

free_skb:
        ret = 0;
        kfree_skb(skb);

deref_ldisc:
        tty_ldisc_deref(ld);

deref_tty:
        tty_kref_put(tty);
        return ret;
}

/*
 * Pass queued receive messages of @mp to handle_recv_skb() in order.
 * Processing stops at the first message that cannot be handled right now;
 * that message is put back at the head of the queue.
 */
static void handle_minor_recv(struct capiminor *mp)
{
        struct sk_buff *skb;

        for (;;) {
                skb = skb_dequeue(&mp->inqueue);
                if (skb == NULL)
                        break;

                if (handle_recv_skb(mp, skb) >= 0)
                        continue;

                skb_queue_head(&mp->inqueue, skb);
                return;
        }
}

/*
 * Transmit the payload skbs queued on @mp as DATA_B3_REQ messages.
 *
 * Nothing is sent when no tty is attached or output is stopped
 * (ttyoutstop). Each skb is taken from outqueue, gets a DATA_B3_REQ header
 * prepended and is handed to capi20_put_message(). The loop ends when the
 * queue is empty, when the ack entry cannot be allocated, or when the send
 * queue is full; in the last two cases the header is stripped again and the
 * skb goes back to the head of outqueue. Any other send error drops the skb.
 */
static void handle_minor_send(struct capiminor *mp)
{
        struct tty_struct *tty;
        struct sk_buff *skb;
        u16 len;
        u16 errcode;
        u16 datahandle;

        tty = tty_port_tty_get(&mp->port);
        if (!tty)
                return;

        if (mp->ttyoutstop) {
                pr_debug("capi: send: tty stopped\n");
                tty_kref_put(tty);
                return;
        }

        while (1) {
                /* outlock covers both the queue and the outbytes counter. */
                spin_lock_bh(&mp->outlock);
                skb = __skb_dequeue(&mp->outqueue);
                if (!skb) {
                        spin_unlock_bh(&mp->outlock);
                        break;
                }
                len = (u16)skb->len;
                mp->outbytes -= len;
                spin_unlock_bh(&mp->outlock);

                /* Prepend and fill in the DATA_B3_REQ header. */
                datahandle = atomic_inc_return(&mp->datahandle);
                skb_push(skb, CAPI_DATA_B3_REQ_LEN);
                memset(skb->data, 0, CAPI_DATA_B3_REQ_LEN);
                capimsg_setu16(skb->data, 0, CAPI_DATA_B3_REQ_LEN);
                capimsg_setu16(skb->data, 2, mp->ap->applid);
                capimsg_setu8 (skb->data, 4, CAPI_DATA_B3);
                capimsg_setu8 (skb->data, 5, CAPI_REQ);
                capimsg_setu16(skb->data, 6, atomic_inc_return(&mp->msgid));
                capimsg_setu32(skb->data, 8, mp->ncci);        /* NCCI */
                capimsg_setu32(skb->data, 12, (u32)(long)skb->data);/* Data32 */
                capimsg_setu16(skb->data, 16, len);        /* Data length */
                capimsg_setu16(skb->data, 18, datahandle);
                capimsg_setu16(skb->data, 20, 0);        /* Flags */

                /* Register the handle before sending; undo everything on failure. */
                if (capiminor_add_ack(mp, datahandle) < 0) {
                        skb_pull(skb, CAPI_DATA_B3_REQ_LEN);

                        spin_lock_bh(&mp->outlock);
                        __skb_queue_head(&mp->outqueue, skb);
                        mp->outbytes += len;
                        spin_unlock_bh(&mp->outlock);

                        break;
                }
                errcode = capi20_put_message(mp->ap, skb);
                if (errcode == CAPI_NOERROR) {
                        pr_debug("capi: DATA_B3_REQ %u len=%u\n",
                                 datahandle, len);
                        continue;
                }
                /* Not sent: the handle will never be confirmed. */
                capiminor_del_ack(mp, datahandle);

                if (errcode == CAPI_SENDQUEUEFULL) {
                        /* Temporary condition: requeue the bare payload. */
                        skb_pull(skb, CAPI_DATA_B3_REQ_LEN);

                        spin_lock_bh(&mp->outlock);
                        __skb_queue_head(&mp->outqueue, skb);
                        mp->outbytes += len;
                        spin_unlock_bh(&mp->outlock);

                        break;
                }

                /* oops, drop packet */
                printk(KERN_ERR "capi: put_message = %x\n", errcode);
                kfree_skb(skb);
        }
        tty_kref_put(tty);
}

#endif /* CONFIG_ISDN_CAPI_MIDDLEWARE */
/* -------- function called by lower level -------------------------- */

/*
 * Receive callback installed as ap->recv_message by the CAPI_REGISTER ioctl.
 *
 * Runs under cdev->lock. A successful CONNECT_B3_CONF (upper byte of the
 * Info field is zero) or any CONNECT_B3_IND creates an NCCI entry. Every
 * message other than DATA_B3 is appended to the device's receive queue for
 * capi_read(). With middleware support, DATA_B3 messages for an NCCI that
 * has a tty minor are handled here: indications are queued on the minor and
 * delivered to the tty, confirmations retire the matching ack entry and
 * restart transmission. All remaining cases go to the receive queue.
 */
static void capi_recv_message(struct capi20_appl *ap, struct sk_buff *skb)
{
        struct capidev *cdev = ap->private;
#ifdef CONFIG_ISDN_CAPI_MIDDLEWARE
        struct capiminor *mp;
        u16 datahandle;
        struct capincci *np;
#endif /* CONFIG_ISDN_CAPI_MIDDLEWARE */

        mutex_lock(&cdev->lock);

        if (CAPIMSG_CMD(skb->data) == CAPI_CONNECT_B3_CONF) {
                u16 info = CAPIMSG_U16(skb->data, 12); // Info field
                if ((info & 0xff00) == 0)
                        capincci_alloc(cdev, CAPIMSG_NCCI(skb->data));
        }
        if (CAPIMSG_CMD(skb->data) == CAPI_CONNECT_B3_IND)
                capincci_alloc(cdev, CAPIMSG_NCCI(skb->data));

        if (CAPIMSG_COMMAND(skb->data) != CAPI_DATA_B3) {
                skb_queue_tail(&cdev->recvqueue, skb);
                wake_up_interruptible(&cdev->recvwait);
                goto unlock_out;
        }

#ifndef CONFIG_ISDN_CAPI_MIDDLEWARE
        skb_queue_tail(&cdev->recvqueue, skb);
        wake_up_interruptible(&cdev->recvwait);

#else /* CONFIG_ISDN_CAPI_MIDDLEWARE */

        np = capincci_find(cdev, CAPIMSG_CONTROL(skb->data));
        if (!np) {
                printk(KERN_ERR "BUG: capi_signal: ncci not found\n");
                skb_queue_tail(&cdev->recvqueue, skb);
                wake_up_interruptible(&cdev->recvwait);
                goto unlock_out;
        }

        /* No tty minor for this NCCI: the application reads the data itself. */
        mp = np->minorp;
        if (!mp) {
                skb_queue_tail(&cdev->recvqueue, skb);
                wake_up_interruptible(&cdev->recvwait);
                goto unlock_out;
        }
        if (CAPIMSG_SUBCOMMAND(skb->data) == CAPI_IND) {
                datahandle = CAPIMSG_U16(skb->data, CAPIMSG_BASELEN + 4 + 4 + 2);
                pr_debug("capi_signal: DATA_B3_IND %u len=%d\n",
                         datahandle, skb->len-CAPIMSG_LEN(skb->data));
                skb_queue_tail(&mp->inqueue, skb);

                handle_minor_recv(mp);

        } else if (CAPIMSG_SUBCOMMAND(skb->data) == CAPI_CONF) {

                datahandle = CAPIMSG_U16(skb->data, CAPIMSG_BASELEN + 4);
                pr_debug("capi_signal: DATA_B3_CONF %u 0x%x\n",
                         datahandle,
                         CAPIMSG_U16(skb->data, CAPIMSG_BASELEN + 4 + 2));
                /* The confirmation is consumed here and never reaches user space. */
                kfree_skb(skb);
                capiminor_del_ack(mp, datahandle);
                tty_port_tty_wakeup(&mp->port);
                handle_minor_send(mp);

        } else {
                /* oops, let capi application handle it :-) */
                skb_queue_tail(&cdev->recvqueue, skb);
                wake_up_interruptible(&cdev->recvwait);
        }
#endif /* CONFIG_ISDN_CAPI_MIDDLEWARE */

unlock_out:
        mutex_unlock(&cdev->lock);
}

/* -------- file_operations for capidev ----------------------------- */

/*
 * read() handler: hand exactly one queued CAPI message to user space.
 *
 * Returns the message length on success, or
 *   -ENODEV   if no application is registered on this file,
 *   -EAGAIN   if the queue is empty and the file is non-blocking,
 *   -EMSGSIZE if @count is smaller than the message,
 *   -EFAULT   if the copy to @buf fails,
 * or the error from an interrupted wait. On -EMSGSIZE and -EFAULT the
 * message is put back at the head of the queue.
 */
static ssize_t
capi_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        struct capidev *cdev = file->private_data;
        struct sk_buff *skb;
        size_t copied;
        int err;

        if (!cdev->ap.applid)
                return -ENODEV;

        skb = skb_dequeue(&cdev->recvqueue);
        if (!skb) {
                if (file->f_flags & O_NONBLOCK)
                        return -EAGAIN;

                /* Sleep until a message can be dequeued or a signal arrives. */
                err = wait_event_interruptible(cdev->recvwait,
                                               (skb = skb_dequeue(&cdev->recvqueue)));
                if (err)
                        return err;
        }

        if (skb->len > count) {
                err = -EMSGSIZE;
                goto requeue;
        }
        if (copy_to_user(buf, skb->data, skb->len)) {
                err = -EFAULT;
                goto requeue;
        }

        copied = skb->len;
        kfree_skb(skb);

        return copied;

requeue:
        skb_queue_head(&cdev->recvqueue, skb);
        return err;
}

/*
 * write() handler: send one CAPI message supplied by user space.
 *
 * The buffer must contain exactly one message: for DATA_B3_REQ the header
 * length plus the data length must equal @count, for everything else the
 * header length alone must. The application id in the message is replaced
 * by the id registered on this file. A DISCONNECT_B3_RESP additionally
 * removes the NCCI entry it refers to.
 *
 * Returns @count on success, or -ENODEV (not registered), -EINVAL
 * (malformed length), -ENOMEM, -EFAULT, or -EIO when the message is
 * rejected (the CAPI code is then kept in cdev->errcode).
 */
static ssize_t
capi_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
        struct capidev *cdev = file->private_data;
        struct sk_buff *skb;
        u16 mlen;
        int err = -EINVAL;

        if (!cdev->ap.applid)
                return -ENODEV;

        if (count < CAPIMSG_BASELEN)
                return -EINVAL;

        skb = alloc_skb(count, GFP_USER);
        if (!skb)
                return -ENOMEM;

        if (copy_from_user(skb_put(skb, count), buf, count)) {
                err = -EFAULT;
                goto drop;
        }

        mlen = CAPIMSG_LEN(skb->data);
        if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_REQ) {
                if (count < CAPI_DATA_B3_REQ_LEN)
                        goto drop;
                if ((size_t)(mlen + CAPIMSG_DATALEN(skb->data)) != count)
                        goto drop;
        } else if (mlen != count) {
                goto drop;
        }

        CAPIMSG_SETAPPID(skb->data, cdev->ap.applid);

        if (CAPIMSG_CMD(skb->data) == CAPI_DISCONNECT_B3_RESP) {
                if (count < CAPI_DISCONNECT_B3_RESP_LEN)
                        goto drop;

                mutex_lock(&cdev->lock);
                capincci_free(cdev, CAPIMSG_NCCI(skb->data));
                mutex_unlock(&cdev->lock);
        }

        cdev->errcode = capi20_put_message(&cdev->ap, skb);
        if (cdev->errcode) {
                err = -EIO;
                goto drop;
        }

        return count;

drop:
        kfree_skb(skb);
        return err;
}

/*
 * poll() handler. An unregistered file reports EPOLLERR. Otherwise the file
 * is always writable and is readable whenever the receive queue is not
 * empty.
 */
static __poll_t
capi_poll(struct file *file, poll_table *wait)
{
        struct capidev *cdev = file->private_data;
        __poll_t mask = EPOLLOUT | EPOLLWRNORM;

        if (!cdev->ap.applid)
                return EPOLLERR;

        poll_wait(file, &cdev->recvwait, wait);

        if (!skb_queue_empty_lockless(&cdev->recvqueue))
                mask |= EPOLLIN | EPOLLRDNORM;

        return mask;
}

/*
 * ioctl dispatcher for the CAPI character device; called with capi_mutex
 * held by capi_unlocked_ioctl().
 *
 * @arg is a user pointer for every command that transfers data. Unless
 * noted otherwise a command returns 0 on success, -EFAULT when a user copy
 * fails and -EIO when the CAPI core reports an error (the CAPI code is then
 * stored in cdev->errcode). Unknown commands return -EINVAL.
 */
static int
capi_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct capidev *cdev = file->private_data;
        capi_ioctl_struct data;
        int retval = -EINVAL;
        void __user *argp = (void __user *)arg;

        switch (cmd) {
        /* Register an application; returns its id, -EEXIST if already done. */
        case CAPI_REGISTER:
                mutex_lock(&cdev->lock);

                if (cdev->ap.applid) {
                        retval = -EEXIST;
                        goto register_out;
                }
                if (copy_from_user(&cdev->ap.rparam, argp,
                                   sizeof(struct capi_register_params))) {
                        retval = -EFAULT;
                        goto register_out;
                }
                cdev->ap.private = cdev;
                cdev->ap.recv_message = capi_recv_message;
                cdev->errcode = capi20_register(&cdev->ap);
                retval = (int)cdev->ap.applid;
                if (cdev->errcode) {
                        cdev->ap.applid = 0;
                        retval = -EIO;
                }

register_out:
                mutex_unlock(&cdev->lock);
                return retval;

        /* The GET_* queries read a controller number and write back the result. */
        case CAPI_GET_VERSION:
                if (copy_from_user(&data.contr, argp,
                                   sizeof(data.contr)))
                        return -EFAULT;
                cdev->errcode = capi20_get_version(data.contr, &data.version);
                if (cdev->errcode)
                        return -EIO;
                if (copy_to_user(argp, &data.version,
                                 sizeof(data.version)))
                        return -EFAULT;
                return 0;

        case CAPI_GET_SERIAL:
                if (copy_from_user(&data.contr, argp,
                                   sizeof(data.contr)))
                        return -EFAULT;
                cdev->errcode = capi20_get_serial(data.contr, data.serial);
                if (cdev->errcode)
                        return -EIO;
                if (copy_to_user(argp, data.serial,
                                 sizeof(data.serial)))
                        return -EFAULT;
                return 0;

        case CAPI_GET_PROFILE:
                if (copy_from_user(&data.contr, argp,
                                   sizeof(data.contr)))
                        return -EFAULT;

                /* Controller 0: only the ncontroller field is copied back. */
                if (data.contr == 0) {
                        cdev->errcode = capi20_get_profile(data.contr, &data.profile);
                        if (cdev->errcode)
                                return -EIO;

                        retval = copy_to_user(argp,
                                              &data.profile.ncontroller,
                                              sizeof(data.profile.ncontroller));

                } else {
                        cdev->errcode = capi20_get_profile(data.contr, &data.profile);
                        if (cdev->errcode)
                                return -EIO;

                        retval = copy_to_user(argp, &data.profile,
                                              sizeof(data.profile));
                }
                if (retval)
                        return -EFAULT;
                return 0;

        case CAPI_GET_MANUFACTURER:
                if (copy_from_user(&data.contr, argp,
                                   sizeof(data.contr)))
                        return -EFAULT;
                cdev->errcode = capi20_get_manufacturer(data.contr, data.manufacturer);
                if (cdev->errcode)
                        return -EIO;

                if (copy_to_user(argp, data.manufacturer,
                                 sizeof(data.manufacturer)))
                        return -EFAULT;

                return 0;

        /*
         * Fetch and clear the stored CAPI error code. The code is returned
         * as the ioctl result and, if @arg is non-zero, also copied to it.
         */
        case CAPI_GET_ERRCODE:
                data.errcode = cdev->errcode;
                cdev->errcode = CAPI_NOERROR;
                if (arg) {
                        if (copy_to_user(argp, &data.errcode,
                                         sizeof(data.errcode)))
                                return -EFAULT;
                }
                return data.errcode;

        /* Returns 0 if CAPI is installed, -ENXIO otherwise. */
        case CAPI_INSTALLED:
                if (capi20_isinstalled() == CAPI_NOERROR)
                        return 0;
                return -ENXIO;

        /* Privileged: requires CAP_SYS_ADMIN, else -EPERM. */
        case CAPI_MANUFACTURER_CMD: {
                struct capi_manufacturer_cmd mcmd;
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (copy_from_user(&mcmd, argp, sizeof(mcmd)))
                        return -EFAULT;
                return capi20_manufacturer(mcmd.cmd, mcmd.data);
        }
        /* Set or clear bits in cdev->userflags under cdev->lock. */
        case CAPI_SET_FLAGS:
        case CAPI_CLR_FLAGS: {
                unsigned userflags;

                if (copy_from_user(&userflags, argp, sizeof(userflags)))
                        return -EFAULT;

                mutex_lock(&cdev->lock);
                if (cmd == CAPI_SET_FLAGS)
                        cdev->userflags |= userflags;
                else
                        cdev->userflags &= ~userflags;
                mutex_unlock(&cdev->lock);
                return 0;
        }
        case CAPI_GET_FLAGS:
                if (copy_to_user(argp, &cdev->userflags,
                                 sizeof(cdev->userflags)))
                        return -EFAULT;
                return 0;

#ifndef CONFIG_ISDN_CAPI_MIDDLEWARE
        /* Without middleware there are no ttys, so the count is always 0. */
        case CAPI_NCCI_OPENCOUNT:
                return 0;

#else /* CONFIG_ISDN_CAPI_MIDDLEWARE */
        /* Returns the open count of the tty for the given NCCI (0 if none). */
        case CAPI_NCCI_OPENCOUNT: {
                struct capincci *nccip;
                unsigned ncci;
                int count = 0;

                if (copy_from_user(&ncci, argp, sizeof(ncci)))
                        return -EFAULT;

                mutex_lock(&cdev->lock);
                nccip = capincci_find(cdev, (u32)ncci);
                if (nccip)
                        count = capincci_minor_opencount(nccip);
                mutex_unlock(&cdev->lock);
                return count;
        }

        /* Returns the tty minor of the given NCCI, -ESRCH if it has none. */
        case CAPI_NCCI_GETUNIT: {
                struct capincci *nccip;
                struct capiminor *mp;
                unsigned ncci;
                int unit = -ESRCH;

                if (copy_from_user(&ncci, argp, sizeof(ncci)))
                        return -EFAULT;

                mutex_lock(&cdev->lock);
                nccip = capincci_find(cdev, (u32)ncci);
                if (nccip) {
                        mp = nccip->minorp;
                        if (mp)
                                unit = mp->minor;
                }
                mutex_unlock(&cdev->lock);
                return unit;
        }
#endif /* CONFIG_ISDN_CAPI_MIDDLEWARE */

        default:
                return -EINVAL;
        }
}

/*
 * unlocked_ioctl entry point: runs capi_ioctl() with the global capi_mutex
 * held and passes its result through.
 */
static long
capi_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        int rc;

        mutex_lock(&capi_mutex);
        rc = capi_ioctl(file, cmd, arg);
        mutex_unlock(&capi_mutex);

        return rc;
}

#ifdef CONFIG_COMPAT
/*
 * compat_ioctl handler for 32-bit callers.
 *
 * Only CAPI_MANUFACTURER_CMD needs translation: its argument is a
 * {cmd, data pointer} pair laid out with compat-sized members.  All other
 * commands are passed to capi_unlocked_ioctl() with the argument converted
 * through compat_ptr().
 */
static long
capi_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct {
                compat_ulong_t cmd;
                compat_uptr_t data;
        } req32;
        int rc;

        /* Common case first: nothing to translate besides the pointer. */
        if (cmd != CAPI_MANUFACTURER_CMD)
                return capi_unlocked_ioctl(file, cmd,
                                           (unsigned long)compat_ptr(arg));

        /* Manufacturer commands are restricted to CAP_SYS_ADMIN. */
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (copy_from_user(&req32, compat_ptr(arg), sizeof(req32)))
                return -EFAULT;

        mutex_lock(&capi_mutex);
        rc = capi20_manufacturer(req32.cmd, compat_ptr(req32.data));
        mutex_unlock(&capi_mutex);

        return rc;
}
#endif

static int capi_open(struct inode *inode, struct file *file)
{
        struct capidev *cdev;

        cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
        if (!cdev)
                return -ENOMEM;

        mutex_init(&cdev->lock);
        skb_queue_head_init(&cdev->recvqueue);
        init_waitqueue_head(&cdev->recvwait);
        INIT_LIST_HEAD(&cdev->nccis);
        file->private_data = cdev;

        mutex_lock(&capidev_list_lock);
        list_add_tail(&cdev->list, &capidev_list);
        mutex_unlock(&capidev_list_lock);

        return stream_open(inode, file);
}

/*
 * release() handler: undo capi_open().  The capidev is unlinked from the
 * global list first, then its application registration (if any), queued
 * receive buffers and NCCIs are dropped before the structure is freed.
 */
static int capi_release(struct inode *inode, struct file *file)
{
        struct capidev *dev = file->private_data;

        mutex_lock(&capidev_list_lock);
        list_del(&dev->list);
        mutex_unlock(&capidev_list_lock);

        /* A non-zero applid means an application is still registered. */
        if (dev->ap.applid != 0)
                capi20_release(&dev->ap);

        skb_queue_purge(&dev->recvqueue);
        capincci_free(dev, 0xffffffff);
        kfree(dev);

        return 0;
}

/*
 * File operations of the "capi20" character device (registered in
 * capi_init()).  The compat ioctl entry is only present on kernels built
 * with CONFIG_COMPAT.
 */
static const struct file_operations capi_fops =
{
        .owner                = THIS_MODULE,
        .llseek                = no_llseek,
        .read                = capi_read,
        .write                = capi_write,
        .poll                = capi_poll,
        .unlocked_ioctl        = capi_unlocked_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl        = capi_compat_ioctl,
#endif
        .open                = capi_open,
        .release        = capi_release,
};

#ifdef CONFIG_ISDN_CAPI_MIDDLEWARE
/* -------- tty_operations for capincci ----------------------------- */

/*
 * tty ->install hook: take a reference on the capiminor belonging to
 * tty->index and store it in tty->driver_data once the standard install
 * succeeded.  On failure the reference is dropped again and the error from
 * tty_standard_install() is returned.
 */
static int
capinc_tty_install(struct tty_driver *driver, struct tty_struct *tty)
{
        struct capiminor *minor;
        int rc;

        minor = capiminor_get(tty->index);
        rc = tty_standard_install(driver, tty);
        if (rc != 0) {
                capiminor_put(minor);
                return rc;
        }

        tty->driver_data = minor;
        return rc;
}

/*
 * tty ->cleanup hook: detach the capiminor from the tty and drop the
 * reference that capinc_tty_install() stored in tty->driver_data.
 */
static void capinc_tty_cleanup(struct tty_struct *tty)
{
        struct capiminor *minor = tty->driver_data;

        tty->driver_data = NULL;
        capiminor_put(minor);
}

/*
 * tty ->open hook: open the tty port of the minor and, on success, kick
 * receive processing via handle_minor_recv().
 *
 * Returns 0 on success or the error from tty_port_open().
 */
static int capinc_tty_open(struct tty_struct *tty, struct file *filp)
{
        struct capiminor *minor = tty->driver_data;
        int rc = tty_port_open(&minor->port, tty, filp);

        if (rc)
                return rc;

        handle_minor_recv(minor);
        return 0;
}

/* tty ->close hook: close the tty port embedded in the capiminor. */
static void capinc_tty_close(struct tty_struct *tty, struct file *filp)
{
        struct capiminor *minor = tty->driver_data;
        struct tty_port *port = &minor->port;

        tty_port_close(port, tty, filp);
}

/*
 * tty ->write hook: queue @count bytes from @buf for transmission.
 *
 * Under mp->outlock (bottom halves disabled) any partially filled
 * mp->outskb is moved to the output queue first so that ordering is kept,
 * then a new skb holding the data is appended.  Transmission is triggered
 * with handle_minor_send() after the lock has been dropped.
 *
 * Returns @count on success or -ENOMEM if no skb could be allocated.
 */
static ssize_t capinc_tty_write(struct tty_struct *tty, const u8 *buf,
                                size_t count)
{
        struct capiminor *mp = tty->driver_data;
        struct sk_buff *skb;

        pr_debug("capinc_tty_write(count=%zu)\n", count);

        spin_lock_bh(&mp->outlock);
        /* Flush a pending put_char buffer ahead of the new data. */
        skb = mp->outskb;
        if (skb) {
                mp->outskb = NULL;
                __skb_queue_tail(&mp->outqueue, skb);
                mp->outbytes += skb->len;
        }

        /* Atomic allocation: we are holding a spinlock here. */
        skb = alloc_skb(CAPI_DATA_B3_REQ_LEN + count, GFP_ATOMIC);
        if (!skb) {
                printk(KERN_ERR "capinc_tty_write: alloc_skb failed\n");
                spin_unlock_bh(&mp->outlock);
                return -ENOMEM;
        }

        /* Leave CAPI_DATA_B3_REQ_LEN bytes of headroom in front of the payload. */
        skb_reserve(skb, CAPI_DATA_B3_REQ_LEN);
        skb_put_data(skb, buf, count);

        __skb_queue_tail(&mp->outqueue, skb);
        mp->outbytes += skb->len;
        spin_unlock_bh(&mp->outlock);

        handle_minor_send(mp);

        return count;
}

/*
 * tty ->put_char hook: buffer a single character.
 *
 * Characters are collected in mp->outskb.  When that skb has no tailroom
 * left it is moved to the output queue, a fresh skb is started with @ch and
 * handle_minor_send() is invoked once mp->outlock has been released.
 *
 * Returns 1 if the character was stored, 0 if it was lost because no skb
 * could be allocated.
 */
static int capinc_tty_put_char(struct tty_struct *tty, u8 ch)
{
        struct capiminor *mp = tty->driver_data;
        bool invoke_send = false;
        struct sk_buff *skb;
        int ret = 1;

        pr_debug("capinc_put_char(%u)\n", ch);

        spin_lock_bh(&mp->outlock);
        skb = mp->outskb;
        if (skb) {
                /* Fast path: room left in the current collection buffer. */
                if (skb_tailroom(skb) > 0) {
                        skb_put_u8(skb, ch);
                        goto unlock_out;
                }
                /* Buffer full: hand it to the output queue. */
                mp->outskb = NULL;
                __skb_queue_tail(&mp->outqueue, skb);
                mp->outbytes += skb->len;
                invoke_send = true;
        }

        /* Start a new collection buffer; atomic because outlock is held. */
        skb = alloc_skb(CAPI_DATA_B3_REQ_LEN + CAPI_MAX_BLKSIZE, GFP_ATOMIC);
        if (skb) {
                skb_reserve(skb, CAPI_DATA_B3_REQ_LEN);
                skb_put_u8(skb, ch);
                mp->outskb = skb;
        } else {
                printk(KERN_ERR "capinc_put_char: char %u lost\n", ch);
                ret = 0;
        }

unlock_out:
        spin_unlock_bh(&mp->outlock);

        /* Only send when a full buffer was actually queued above. */
        if (invoke_send)
                handle_minor_send(mp);

        return ret;
}

static void capinc_tty_flush_chars(struct tty_struct *tty)
{
        struct capiminor *mp = tty->driver_data;
        struct sk_buff *skb;

        spin_lock_bh(&mp->outlock);
        skb = mp->outskb;
        if (skb) {
                mp->outskb = NULL;
                __skb_queue_tail(&mp->outqueue, skb);
                mp->outbytes += skb->len;
                spin_unlock_bh(&mp->outlock);

                handle_minor_send(mp);
        } else
                spin_unlock_bh(&mp->outlock);

        handle_minor_recv(mp);
}

/*
 * tty ->write_room hook: report the free space as the number of unused
 * send-queue slots multiplied by CAPI_MAX_BLKSIZE.
 */
static unsigned int capinc_tty_write_room(struct tty_struct *tty)
{
        struct capiminor *minor = tty->driver_data;
        unsigned int space;

        space = CAPINC_MAX_SENDQUEUE - skb_queue_len(&minor->outqueue);
        space *= CAPI_MAX_BLKSIZE;

        pr_debug("capinc_tty_write_room = %u\n", space);

        return space;
}

/*
 * tty ->chars_in_buffer hook: return the byte count accounted in
 * mp->outbytes; the queue lengths are only logged for debugging.
 */
static unsigned int capinc_tty_chars_in_buffer(struct tty_struct *tty)
{
        struct capiminor *minor = tty->driver_data;

        pr_debug("capinc_tty_chars_in_buffer = %d nack=%d sq=%d rq=%d\n",
                 minor->outbytes,
                 minor->nack,
                 skb_queue_len(&minor->outqueue),
                 skb_queue_len(&minor->inqueue));

        return minor->outbytes;
}

/* tty ->throttle hook: mark input towards the tty as stopped. */
static void capinc_tty_throttle(struct tty_struct *tty)
{
        struct capiminor *minor = tty->driver_data;

        minor->ttyinstop = 1;
}

/*
 * tty ->unthrottle hook: clear the input-stop flag and resume receive
 * processing.
 */
static void capinc_tty_unthrottle(struct tty_struct *tty)
{
        struct capiminor *minor = tty->driver_data;

        minor->ttyinstop = 0;
        handle_minor_recv(minor);
}

/* tty ->stop hook: mark output from the tty as stopped. */
static void capinc_tty_stop(struct tty_struct *tty)
{
        struct capiminor *minor = tty->driver_data;

        minor->ttyoutstop = 1;
}

/*
 * tty ->start hook: clear the output-stop flag and resume sending queued
 * data.
 */
static void capinc_tty_start(struct tty_struct *tty)
{
        struct capiminor *minor = tty->driver_data;

        minor->ttyoutstop = 0;
        handle_minor_send(minor);
}

/* tty ->hangup hook: hang up the tty port embedded in the capiminor. */
static void capinc_tty_hangup(struct tty_struct *tty)
{
        struct capiminor *minor = tty->driver_data;
        struct tty_port *port = &minor->port;

        tty_port_hangup(port);
}

/*
 * tty ->send_xchar hook: the character is only logged at debug level;
 * nothing is transmitted.
 */
static void capinc_tty_send_xchar(struct tty_struct *tty, u8 ch)
{
        pr_debug("capinc_tty_send_xchar(%u)\n", ch);
}

/*
 * tty operations of the "capi_nc" tty driver; installed on the driver by
 * capinc_tty_init() through tty_set_operations().
 */
static const struct tty_operations capinc_ops = {
        .open = capinc_tty_open,
        .close = capinc_tty_close,
        .write = capinc_tty_write,
        .put_char = capinc_tty_put_char,
        .flush_chars = capinc_tty_flush_chars,
        .write_room = capinc_tty_write_room,
        .chars_in_buffer = capinc_tty_chars_in_buffer,
        .throttle = capinc_tty_throttle,
        .unthrottle = capinc_tty_unthrottle,
        .stop = capinc_tty_stop,
        .start = capinc_tty_start,
        .hangup = capinc_tty_hangup,
        .send_xchar = capinc_tty_send_xchar,
        .install = capinc_tty_install,
        .cleanup = capinc_tty_cleanup,
};

/*
 * Set up the "capi_nc" tty driver.
 *
 * capi_ttyminors is clamped to CAPINC_MAX_PORTS and falls back to
 * CAPINC_NR_PORTS when it is not positive.  The capiminors pointer table is
 * allocated, then the tty driver is allocated, configured and registered.
 *
 * Returns 0 on success, -ENOMEM if the table cannot be allocated, or the
 * error reported by tty_alloc_driver() / tty_register_driver().  Everything
 * allocated here is released again on failure.
 */
static int __init capinc_tty_init(void)
{
        struct tty_driver *driver;
        int rc;

        /* Sanitise the requested number of minors. */
        if (capi_ttyminors > CAPINC_MAX_PORTS)
                capi_ttyminors = CAPINC_MAX_PORTS;
        if (capi_ttyminors <= 0)
                capi_ttyminors = CAPINC_NR_PORTS;

        capiminors = kcalloc(capi_ttyminors, sizeof(struct capiminor *),
                             GFP_KERNEL);
        if (capiminors == NULL)
                return -ENOMEM;

        driver = tty_alloc_driver(capi_ttyminors,
                                  TTY_DRIVER_REAL_RAW |
                                  TTY_DRIVER_RESET_TERMIOS |
                                  TTY_DRIVER_DYNAMIC_DEV);
        if (IS_ERR(driver)) {
                kfree(capiminors);
                return PTR_ERR(driver);
        }

        /* Identity of the driver; major 0 is passed to the tty core. */
        driver->driver_name = "capi_nc";
        driver->name = "capi!";
        driver->major = 0;
        driver->minor_start = 0;
        driver->type = TTY_DRIVER_TYPE_SERIAL;
        driver->subtype = SERIAL_TYPE_NORMAL;

        /* Start from the standard termios and override the flag words. */
        driver->init_termios = tty_std_termios;
        driver->init_termios.c_iflag = ICRNL;
        driver->init_termios.c_oflag = OPOST | ONLCR;
        driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;
        driver->init_termios.c_lflag = 0;

        tty_set_operations(driver, &capinc_ops);

        rc = tty_register_driver(driver);
        if (rc != 0) {
                tty_driver_kref_put(driver);
                kfree(capiminors);
                printk(KERN_ERR "Couldn't register capi_nc driver\n");
                return rc;
        }

        capinc_tty_driver = driver;
        return 0;
}

/*
 * Tear down what capinc_tty_init() set up: unregister the tty driver, drop
 * its reference and free the capiminors table.
 */
static void __exit capinc_tty_exit(void)
{
        struct tty_driver *driver = capinc_tty_driver;

        tty_unregister_driver(driver);
        tty_driver_kref_put(driver);
        kfree(capiminors);
}

#else /* !CONFIG_ISDN_CAPI_MIDDLEWARE */

/* Without CONFIG_ISDN_CAPI_MIDDLEWARE there is no tty driver: always succeed. */
static inline int capinc_tty_init(void)
{
        return 0;
}

/* Without CONFIG_ISDN_CAPI_MIDDLEWARE there is nothing to tear down. */
static inline void capinc_tty_exit(void) { }

#endif /* !CONFIG_ISDN_CAPI_MIDDLEWARE */

/* -------- /proc functions ----------------------------------------- */

/*
 * /proc/capi/capi20:
 *  minor applid nrecvctlpkt nrecvdatapkt nsendctlpkt nsenddatapkt
 *
 * One line is printed per open capidev; the list is walked with
 * capidev_list_lock held.  The "minor" column is always printed as 0.
 */
static int __maybe_unused capi20_proc_show(struct seq_file *m, void *v)
{
        struct capidev *dev;

        mutex_lock(&capidev_list_lock);
        list_for_each_entry(dev, &capidev_list, list) {
                seq_printf(m, "0 %d %lu %lu %lu %lu\n",
                           dev->ap.applid,
                           dev->ap.nrecvctlpkt, dev->ap.nrecvdatapkt,
                           dev->ap.nsentctlpkt, dev->ap.nsentdatapkt);
        }
        mutex_unlock(&capidev_list_lock);

        return 0;
}

/*
 * /proc/capi/capi20ncci:
 *  applid ncci
 *
 * One line is printed per NCCI of every open capidev.  The device list is
 * protected by capidev_list_lock, each device's NCCI list by its own lock
 * (taken inside the list lock).
 */
static int __maybe_unused capi20ncci_proc_show(struct seq_file *m, void *v)
{
        struct capidev *dev;
        struct capincci *entry;

        mutex_lock(&capidev_list_lock);
        list_for_each_entry(dev, &capidev_list, list) {
                mutex_lock(&dev->lock);
                list_for_each_entry(entry, &dev->nccis, list) {
                        seq_printf(m, "%d 0x%x\n",
                                   dev->ap.applid, entry->ncci);
                }
                mutex_unlock(&dev->lock);
        }
        mutex_unlock(&capidev_list_lock);

        return 0;
}

/*
 * Create the two single-show proc entries capi/capi20 and capi/capi20ncci.
 * The return values of proc_create_single() are not checked.
 */
static void __init proc_init(void)
{
        proc_create_single("capi/capi20", 0, NULL, capi20_proc_show);
        proc_create_single("capi/capi20ncci", 0, NULL, capi20ncci_proc_show);
}

/* Remove the proc entries created by proc_init(). */
static void __exit proc_exit(void)
{
        remove_proc_entry("capi/capi20", NULL);
        remove_proc_entry("capi/capi20ncci", NULL);
}

/* -------- init function and module interface ---------------------- */


/*
 * Module init: bring up kcapi, register the "capi20" character device and
 * its class/device node, initialise the optional middleware tty driver and
 * create the proc entries.
 *
 * Returns 0 on success or the negative error code of the step that failed.
 * In particular a capinc_tty_init() failure is reported with its own error
 * code rather than being flattened to -ENOMEM.  On failure every step that
 * already succeeded is undone in reverse order.
 */
static int __init capi_init(void)
{
        const char *compileinfo;
        int major_ret;
        int ret;

        ret = kcapi_init();
        if (ret)
                return ret;

        major_ret = register_chrdev(capi_major, "capi20", &capi_fops);
        if (major_ret < 0) {
                printk(KERN_ERR "capi20: unable to get major %d\n", capi_major);
                ret = major_ret;
                goto err_kcapi;
        }

        ret = class_register(&capi_class);
        if (ret)
                goto err_chrdev;

        /* The result of device_create() is deliberately not checked here. */
        device_create(&capi_class, NULL, MKDEV(capi_major, 0), NULL, "capi20");

        /* Only negative values are failures, as in the original check. */
        ret = capinc_tty_init();
        if (ret < 0)
                goto err_device;

        proc_init();

#ifdef CONFIG_ISDN_CAPI_MIDDLEWARE
        compileinfo = " (middleware)";
#else
        compileinfo = " (no middleware)";
#endif
        printk(KERN_NOTICE "CAPI 2.0 started up with major %d%s\n",
               capi_major, compileinfo);

        return 0;

err_device:
        device_destroy(&capi_class, MKDEV(capi_major, 0));
        class_unregister(&capi_class);
err_chrdev:
        unregister_chrdev(capi_major, "capi20");
err_kcapi:
        kcapi_exit();
        return ret;
}

/*
 * Module exit: remove the proc entries, the device node, class and
 * character device, then shut down the tty driver and finally kcapi.
 */
static void __exit capi_exit(void)
{
        proc_exit();

        device_destroy(&capi_class, MKDEV(capi_major, 0));
        class_unregister(&capi_class);
        unregister_chrdev(capi_major, "capi20");

        capinc_tty_exit();

        kcapi_exit();
}

/* Register capi_init()/capi_exit() as the module's entry and exit points. */
module_init(capi_init);
module_exit(capi_exit);



















































































































    1 
















    1 

    1 























































































































































































































    1 







    1 






















    1 
    1 

























































































































































































































































































































































































    1 







    1 







    1 

    1 







    1 


    1 



    1 

    1 




    1 

    1 


    1 



    1 




























    1 



    1 


    1 
    1 
    1 



    1 




































































































































































    1 







    1 





































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic helpers for smp ipi calls
 *
 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/irq_work.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/hypervisor.h>
#include <linux/sched/clock.h>
#include <linux/nmi.h>
#include <linux/sched/debug.h>
#include <linux/jump_label.h>

#include <trace/events/ipi.h>
#define CREATE_TRACE_POINTS
#include <trace/events/csd.h>
#undef CREATE_TRACE_POINTS

#include "smpboot.h"
#include "sched/smp.h"

/* Extract the type bits (CSD_FLAG_TYPE_MASK) from a csd's node flags. */
#define CSD_TYPE(_csd)        ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)

/*
 * Per-CPU bookkeeping for cross-CPU function calls.  All three members are
 * allocated in smpcfd_prepare_cpu() and freed in smpcfd_dead_cpu().
 */
struct call_function_data {
        /* Per-CPU array of call_single_data_t, from alloc_percpu(). */
        call_single_data_t        __percpu *csd;
        /* Zero-allocated CPU masks; NOTE(review): their users are outside this chunk. */
        cpumask_var_t                cpumask;
        cpumask_var_t                cpumask_ipi;
};

/* Per-CPU call_function_data, set up by smpcfd_prepare_cpu(). */
static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);

/* Per-CPU lock-less list onto which __smp_call_single_queue() adds work. */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);

/*
 * Per-CPU flag, initially 1; csd_lock_wait_toolong() flips it to 0 with a
 * cmpxchg so that dump_cpu_task() is requested only once per CPU there.
 */
static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1);

/* Defined further down; needed earlier by smpcfd_dying_cpu(). */
static void __flush_smp_call_function_queue(bool warn_cpu_offline);

int smpcfd_prepare_cpu(unsigned int cpu)
{
        struct call_function_data *cfd = &per_cpu(cfd_data, cpu);

        if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
                                     cpu_to_node(cpu)))
                return -ENOMEM;
        if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
                                     cpu_to_node(cpu))) {
                free_cpumask_var(cfd->cpumask);
                return -ENOMEM;
        }
        cfd->csd = alloc_percpu(call_single_data_t);
        if (!cfd->csd) {
                free_cpumask_var(cfd->cpumask);
                free_cpumask_var(cfd->cpumask_ipi);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Release the resources that smpcfd_prepare_cpu() allocated for @cpu.
 * Always returns 0.
 */
int smpcfd_dead_cpu(unsigned int cpu)
{
        struct call_function_data *data = &per_cpu(cfd_data, cpu);

        free_cpumask_var(data->cpumask);
        free_cpumask_var(data->cpumask_ipi);
        free_percpu(data->csd);

        return 0;
}

/*
 * Called for a CPU that is going down: run whatever is still queued for it.
 * @cpu is not used; the flush operates on the calling CPU's queue
 * (NOTE(review): presumably this runs on the dying CPU itself — confirm
 * against the hotplug callback registration).  Always returns 0.
 */
int smpcfd_dying_cpu(unsigned int cpu)
{
        /*
         * The IPIs for the smp-call-function callbacks queued by other
         * CPUs might arrive late, either due to hardware latencies or
         * because this CPU disabled interrupts (inside stop-machine)
         * before the IPIs were sent. So flush out any pending callbacks
         * explicitly (without waiting for the IPIs to arrive), to
         * ensure that the outgoing CPU doesn't go offline with work
         * still pending.
         */
        /* false: do not warn about callbacks queued on an offline CPU. */
        __flush_smp_call_function_queue(false);
        irq_work_run();
        return 0;
}

/*
 * Boot-time setup: initialise the call_single_queue list head of every
 * possible CPU and allocate the call-function data of the current CPU.
 * The result of smpcfd_prepare_cpu() is not checked.
 */
void __init call_function_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct llist_head *queue = &per_cpu(call_single_queue, cpu);

                init_llist_head(queue);
        }

        smpcfd_prepare_cpu(smp_processor_id());
}

/*
 * Send the call-function-single IPI to @cpu, but only when
 * call_function_single_prep_ipi() says one is required.  The send is
 * recorded through the ipi_send_cpu tracepoint first.
 */
static __always_inline void
send_call_function_single_ipi(int cpu)
{
        if (!call_function_single_prep_ipi(cpu))
                return;

        trace_ipi_send_cpu(cpu, _RET_IP_,
                           generic_smp_call_function_single_interrupt);
        arch_send_call_function_single_ipi(cpu);
}

/*
 * Send the call-function IPI to every CPU in @mask, recording the send
 * through the ipi_send_cpumask tracepoint first.
 */
static __always_inline void
send_call_function_ipi_mask(struct cpumask *mask)
{
        trace_ipi_send_cpumask(mask, _RET_IP_,
                               generic_smp_call_function_single_interrupt);
        arch_send_call_function_ipi_mask(mask);
}

/*
 * Invoke @func(@info), bracketed by the csd_function_entry/exit
 * tracepoints.  @csd is only passed to the tracepoints and may be NULL
 * (generic_exec_single() does so for the local case).
 */
static __always_inline void
csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd)
{
        trace_csd_function_entry(func, csd);
        func(info);
        trace_csd_function_exit(func, csd);
}

#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG

/*
 * Static key gating the CSD lock debugging paths; its default follows
 * CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT and it can be switched with the
 * csdlock_debug= boot parameter.
 */
static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);

/*
 * Parse the csdlock_debug= kernel boot parameter.
 *
 * A non-zero value enables CSD lock debugging, zero disables it.  When
 * get_option() reports that nothing was parsed, the key is left unchanged.
 * The handler always returns 1.
 *
 * If you need to restore the old "ext" value that once provided
 * additional debugging information, reapply the following commits:
 *
 * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging")
 * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging")
 */
static int __init csdlock_debug(char *str)
{
        unsigned int val = 0;

        if (!get_option(&str, &val))
                return 1;

        if (val)
                static_branch_enable(&csdlock_debug_enabled);
        else
                static_branch_disable(&csdlock_debug_enabled);

        return 1;
}
__setup("csdlock_debug=", csdlock_debug);

/*
 * Per-CPU record of the CSD currently being handled, together with a copy
 * of its function and argument; maintained by __csd_lock_record() and read
 * by csd_lock_wait_toolong() when reporting a stall.
 */
static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
static DEFINE_PER_CPU(void *, cur_csd_info);

/* A value of 0 disables the stall reports in csd_lock_wait_toolong(). */
static ulong csd_lock_timeout = 5000;  /* CSD lock timeout in milliseconds. */
module_param(csd_lock_timeout, ulong, 0444);
/* Values <= 0 disable the BUG_ON() on long stalls. */
static int panic_on_ipistall;  /* CSD panic timeout in milliseconds, 300000 for five minutes. */
module_param(panic_on_ipistall, int, 0444);

/* Source of the "#N" identifiers printed in CSD lock stall reports. */
static atomic_t csd_bug_count = ATOMIC_INIT(0);

/*
 * Record current CSD work for current CPU, NULL to erase.
 *
 * For a non-NULL @csd the function and info pointers are stored before the
 * csd pointer itself (separated by smp_wmb()), so that a reader which
 * load-acquires cur_csd, as csd_lock_wait_toolong() does, sees matching
 * func/info values.
 */
static void __csd_lock_record(call_single_data_t *csd)
{
        if (!csd) {
                smp_mb(); /* NULL cur_csd after unlock. */
                __this_cpu_write(cur_csd, NULL);
                return;
        }
        __this_cpu_write(cur_csd_func, csd->func);
        __this_cpu_write(cur_csd_info, csd->info);
        smp_wmb(); /* func and info before csd. */
        __this_cpu_write(cur_csd, csd);
        smp_mb(); /* Update cur_csd before function call. */
                  /* Or before unlock, as the case may be. */
}

/*
 * Record @csd (or NULL) as the current CPU's in-flight CSD, but only when
 * CSD lock debugging is enabled; otherwise this does nothing.
 */
static __always_inline void csd_lock_record(call_single_data_t *csd)
{
        if (static_branch_unlikely(&csdlock_debug_enabled))
                __csd_lock_record(csd);
}

/*
 * Return the destination CPU stored in @csd, or -1 when the CSD type is
 * neither CSD_TYPE_ASYNC nor CSD_TYPE_SYNC.
 */
static int csd_lock_wait_getcpu(call_single_data_t *csd)
{
        unsigned int type = CSD_TYPE(csd);

        /* Other CSD_TYPE_ values might not have ->dst. */
        if (type != CSD_TYPE_ASYNC && type != CSD_TYPE_SYNC)
                return -1;

        return csd->node.dst;
}

/*
 * Complain if too much time spent waiting.  Note that only
 * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
 * so waiting on other types gets much less information.
 *
 * @csd:    the CSD whose lock is being waited for
 * @ts0:    sched_clock() value taken when the wait started
 * @ts1:    in/out, sched_clock() value of the last stall report
 * @bug_id: in/out, 0 until a stall has been reported, then the report's id
 *
 * Returns true once CSD_FLAG_LOCK is observed clear (the wait is over),
 * false if the caller has to keep waiting.
 */
static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
{
        int cpu = -1;
        int cpux;
        bool firsttime;
        u64 ts2, ts_delta;
        call_single_data_t *cpu_cur_csd;
        unsigned int flags = READ_ONCE(csd->node.u_flags);
        /*
         * NOTE(review): the multiplication is done in unsigned long before
         * widening; where unsigned long is 32 bits it could wrap — confirm
         * this code is only built for 64-bit configurations.
         */
        unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC;

        /* Lock released: done.  Say so if a stall had been reported before. */
        if (!(flags & CSD_FLAG_LOCK)) {
                if (!unlikely(*bug_id))
                        return true;
                cpu = csd_lock_wait_getcpu(csd);
                pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
                         *bug_id, raw_smp_processor_id(), cpu);
                return true;
        }

        ts2 = sched_clock();
        /* How long since we last checked for a stuck CSD lock.*/
        ts_delta = ts2 - *ts1;
        /* Within the timeout, or reporting disabled (timeout of 0): keep waiting. */
        if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
                return false;

        /* The first report for this wait allocates a new bug id. */
        firsttime = !*bug_id;
        if (firsttime)
                *bug_id = atomic_inc_return(&csd_bug_count);
        cpu = csd_lock_wait_getcpu(csd);
        /* cpux is a safe index for per_cpu() even if cpu is out of range. */
        if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu))
                cpux = 0;
        else
                cpux = cpu;
        cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
        /* How long since this CSD lock was stuck. */
        ts_delta = ts2 - ts0;
        pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
                 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
                 cpu, csd->func, csd->info);
        /*
         * If the CSD lock is still stuck after 5 minutes, it is unlikely
         * to become unstuck. Use a signed comparison to avoid triggering
         * on underflows when the TSC is out of sync between sockets.
         */
        BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
        /* Tell apart: busy with another CSD, busy with ours, or nothing recorded. */
        if (cpu_cur_csd && csd != cpu_cur_csd) {
                pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
                         *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
                         READ_ONCE(per_cpu(cur_csd_info, cpux)));
        } else {
                pr_alert("\tcsd: CSD lock (#%d) %s.\n",
                         *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
        }
        if (cpu >= 0) {
                /* Dump the target's task only while its trigger_backtrace is still 1. */
                if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0))
                        dump_cpu_task(cpu);
                /* Nothing recorded as in flight on the target: send the IPI again. */
                if (!cpu_cur_csd) {
                        pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
                        arch_send_call_function_single_ipi(cpu);
                }
        }
        if (firsttime)
                dump_stack();
        /* Restart the reporting interval from now. */
        *ts1 = ts2;

        return false;
}

/*
 * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
 *
 * For non-synchronous ipi calls the csd can still be in use by the
 * previous function call. For multi-cpu calls its even more interesting
 * as we'll have to ensure no other cpu is observing our csd.
 */
/*
 * Debug variant of the CSD lock wait: spin until csd_lock_wait_toolong()
 * reports that the lock has been released, letting it print stall
 * diagnostics along the way.
 */
static void __csd_lock_wait(call_single_data_t *csd)
{
        u64 wait_start = sched_clock();
        u64 last_report = wait_start;
        int bug_id = 0;

        while (!csd_lock_wait_toolong(csd, wait_start, &last_report, &bug_id))
                cpu_relax();

        smp_acquire__after_ctrl_dep();
}

/*
 * Wait until CSD_FLAG_LOCK is clear in @csd.  With CSD lock debugging
 * enabled the instrumented wait is used, otherwise a plain
 * acquire-ordered conditional load.
 */
static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
        if (static_branch_unlikely(&csdlock_debug_enabled))
                __csd_lock_wait(csd);
        else
                smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#else
/* No-op when CONFIG_CSD_LOCK_WAIT_DEBUG is not set. */
static void csd_lock_record(call_single_data_t *csd)
{
}

/*
 * Wait (with acquire ordering) until CSD_FLAG_LOCK is clear in @csd.
 * Variant without stall diagnostics, used when CONFIG_CSD_LOCK_WAIT_DEBUG
 * is not set.
 */
static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
        smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#endif

/*
 * Take the CSD lock: wait for a previous user to release @csd, then set
 * CSD_FLAG_LOCK.  The flag update itself is a plain read-modify-write.
 */
static __always_inline void csd_lock(call_single_data_t *csd)
{
        csd_lock_wait(csd);
        csd->node.u_flags |= CSD_FLAG_LOCK;

        /*
         * prevent CPU from reordering the above assignment
         * to ->flags with any subsequent assignments to other
         * fields of the specified call_single_data_t structure:
         */
        smp_wmb();
}

/*
 * Release the CSD lock by storing 0 to the flags word with release
 * semantics (this clears every flag bit, not only CSD_FLAG_LOCK).  Warns
 * if the lock was not held.
 */
static __always_inline void csd_unlock(call_single_data_t *csd)
{
        WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));

        /*
         * ensure we're all done before releasing data:
         */
        smp_store_release(&csd->node.u_flags, 0);
}

/* One statically allocated CSD per CPU; NOTE(review): its users are outside this chunk. */
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);

/*
 * Queue @node on @cpu's call_single_queue and send the call-function IPI
 * when llist_add() returns true (presumably meaning the queue was empty
 * before — confirm against the llist documentation).  If the
 * csd_queue_cpu tracepoint is enabled, the queueing is traced first.
 */
void __smp_call_single_queue(int cpu, struct llist_node *node)
{
        /*
         * We have to check the type of the CSD before queueing it, because
         * once queued it can have its flags cleared by
         *   flush_smp_call_function_queue()
         * even if we haven't sent the smp_call IPI yet (e.g. the stopper
         * executes migration_cpu_stop() on the remote CPU).
         */
        if (trace_csd_queue_cpu_enabled()) {
                call_single_data_t *csd;
                smp_call_func_t func;

                csd = container_of(node, call_single_data_t, node.llist);
                /* TTWU entries are reported as sched_ttwu_pending instead of csd->func. */
                func = CSD_TYPE(csd) == CSD_TYPE_TTWU ?
                        sched_ttwu_pending : csd->func;

                trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
        }

        /*
         * The list addition should be visible to the target CPU when it pops
         * the head of the list to pull the entry off it in the IPI handler
         * because of normal cache coherency rules implied by the underlying
         * llist ops.
         *
         * If IPIs can go out of order to the cache coherency protocol
         * in an architecture, sufficient synchronisation should be added
         * to arch code to make it appear to obey cache coherency WRT
         * locking and barrier primitives. Generic code isn't really
         * equipped to do the right thing...
         */
        if (llist_add(node, &per_cpu(call_single_queue, cpu)))
                send_call_function_single_ipi(cpu);
}

/*
 * Insert a previously allocated call_single_data_t element
 * for execution on the given CPU. data must already have
 * ->func, ->info, and ->flags set.
 *
 * If @cpu is the current CPU, the function is run right here with
 * interrupts disabled.  Otherwise the csd is queued for @cpu.
 *
 * Returns 0 on success, or -ENXIO (after unlocking @csd) when @cpu is out
 * of range or not online.
 */
static int generic_exec_single(int cpu, call_single_data_t *csd)
{
        if (cpu == smp_processor_id()) {
                /* Copy func/info first: the csd is unlocked before the call. */
                smp_call_func_t func = csd->func;
                void *info = csd->info;
                unsigned long flags;

                /*
                 * We can unlock early even for the synchronous on-stack case,
                 * since we're doing this from the same CPU..
                 */
                csd_lock_record(csd);
                csd_unlock(csd);
                local_irq_save(flags);
                csd_do_func(func, info, NULL);
                csd_lock_record(NULL);
                local_irq_restore(flags);
                return 0;
        }

        /* The unsigned cast also rejects negative CPU numbers. */
        if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
                csd_unlock(csd);
                return -ENXIO;
        }

        __smp_call_single_queue(cpu, &csd->node.llist);

        return 0;
}

/**
 * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
 *
 * Invoked by arch to handle an IPI for call function single.
 * Must be called with interrupts disabled.
 *
 * Flushes this CPU's call_single_queue with the offline-CPU warning enabled.
 */
void generic_smp_call_function_single_interrupt(void)
{
        __flush_smp_call_function_queue(true);
}

/**
 * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 *
 * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
 *                      offline CPU. Skip this check if set to 'false'.
 *
 * Flush any pending smp-call-function callbacks queued on this CPU. This is
 * invoked by the generic IPI handler, as well as by a CPU about to go offline,
 * to ensure that all pending IPI callbacks are run before it goes completely
 * offline.
 *
 * Loop through the call_single_queue and run all the queued callbacks.
 * Must be called with interrupts disabled.
 *
 * The detached list is processed in three passes: first the CSD_TYPE_SYNC
 * entries, then every remaining entry that is not CSD_TYPE_TTWU, and finally
 * whatever is left is handed to sched_ttwu_pending() in a single call.
 */
static void __flush_smp_call_function_queue(bool warn_cpu_offline)
{
        call_single_data_t *csd, *csd_next;
        struct llist_node *entry, *prev;
        struct llist_head *head;
        /* Makes the offline-CPU warning below fire at most once. */
        static bool warned;
        atomic_t *tbt;

        lockdep_assert_irqs_disabled();

        /* Allow waiters to send backtrace NMI from here onwards */
        tbt = this_cpu_ptr(&trigger_backtrace);
        atomic_set_release(tbt, 1);

        /* Detach the whole pending list at once, then reverse its order. */
        head = this_cpu_ptr(&call_single_queue);
        entry = llist_del_all(head);
        entry = llist_reverse_order(entry);

        /* There shouldn't be any pending callbacks on an offline CPU. */
        if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
                     !warned && entry != NULL)) {
                warned = true;
                WARN(1, "IPI on offline CPU %d\n", smp_processor_id());

                /*
                 * We don't have to use the _safe() variant here
                 * because we are not invoking the IPI handlers yet.
                 */
                llist_for_each_entry(csd, entry, node.llist) {
                        switch (CSD_TYPE(csd)) {
                        case CSD_TYPE_ASYNC:
                        case CSD_TYPE_SYNC:
                        case CSD_TYPE_IRQ_WORK:
                                pr_warn("IPI callback %pS sent to offline CPU\n",
                                        csd->func);
                                break;

                        case CSD_TYPE_TTWU:
                                pr_warn("IPI task-wakeup sent to offline CPU\n");
                                break;

                        default:
                                pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
                                        CSD_TYPE(csd));
                                break;
                        }
                }
        }

        /*
         * First; run all SYNC callbacks, people are waiting for us.
         */
        prev = NULL;
        llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
                /* Do we wait until *after* callback? */
                if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
                        smp_call_func_t func = csd->func;
                        void *info = csd->info;

                        /*
                         * Unlink csd from the local list: point the last
                         * kept node (or the list head) at its successor.
                         */
                        if (prev) {
                                prev->next = &csd_next->node.llist;
                        } else {
                                entry = &csd_next->node.llist;
                        }

                        /* SYNC: the csd is unlocked only after the callback ran. */
                        csd_lock_record(csd);
                        csd_do_func(func, info, csd);
                        csd_unlock(csd);
                        csd_lock_record(NULL);
                } else {
                        /* Not SYNC: keep it on the list for the later passes. */
                        prev = &csd->node.llist;
                }
        }

        if (!entry)
                return;

        /*
         * Second; run all !SYNC callbacks.
         */
        prev = NULL;
        llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
                int type = CSD_TYPE(csd);

                if (type != CSD_TYPE_TTWU) {
                        /* Unlink csd, same scheme as in the first pass. */
                        if (prev) {
                                prev->next = &csd_next->node.llist;
                        } else {
                                entry = &csd_next->node.llist;
                        }

                        if (type == CSD_TYPE_ASYNC) {
                                smp_call_func_t func = csd->func;
                                void *info = csd->info;

                                /*
                                 * ASYNC: func/info were copied above, so the
                                 * csd is unlocked before the callback runs.
                                 */
                                csd_lock_record(csd);
                                csd_unlock(csd);
                                csd_do_func(func, info, csd);
                                csd_lock_record(NULL);
                        } else if (type == CSD_TYPE_IRQ_WORK) {
                                irq_work_single(csd);
                        }
                        /* Any other type is unlinked without being run. */

                } else {
                        prev = &csd->node.llist;
                }
        }

        /*
         * Third; only CSD_TYPE_TTWU is left, issue those.
         */
        if (entry) {
                /* The remaining list is passed to sched_ttwu_pending() as a whole. */
                csd = llist_entry(entry, typeof(*csd), node.llist);
                csd_do_func(sched_ttwu_pending, entry, csd);
        }
}


/**
 * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 *                                   from task context (idle, migration thread)
 *
 * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
 * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
 * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
 * handle queued SMP function calls before scheduling.
 *
 * The migration thread has to ensure that an eventually pending wakeup has
 * been handled before it migrates a task.
 *
 * Returns immediately, without touching the interrupt state, when this
 * CPU's call_single_queue is empty.
 */
void flush_smp_call_function_queue(void)
{
        unsigned int was_pending;
        unsigned long flags;

        if (llist_empty(this_cpu_ptr(&call_single_queue)))
                return;

        /* The flush itself must run with interrupts disabled. */
        local_irq_save(flags);
        /* Get the already pending soft interrupts for RT enabled kernels */
        was_pending = local_softirq_pending();
        __flush_smp_call_function_queue(true);
        /* Hand over the pre-flush snapshot if softirqs are pending now. */
        if (local_softirq_pending())
                do_softirq_post_smp_call_flush(was_pending);

        local_irq_restore(flags);
}

/**
 * smp_call_function_single - Run a function on a specific CPU
 * @cpu: The CPU to run @func on.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait until function has completed on other CPUs.
 *
 * Returns 0 on success, else a negative status code.
 */
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
                             int wait)
{
        call_single_data_t *csd;
        /* On-stack csd for the @wait case: starts out locked and typed SYNC. */
        call_single_data_t csd_stack = {
                .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, },
        };
        int this_cpu;
        int err;

        /*
         * prevent preemption and reschedule on another processor,
         * as well as CPU removal
         */
        this_cpu = get_cpu();

        /*
         * Can deadlock when called with interrupts disabled.
         * We allow cpu's that are not yet online though, as no one else can
         * send smp call function interrupt to this cpu and as such deadlocks
         * can't happen.
         */
        WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
                     && !oops_in_progress);

        /*
         * When @wait we can deadlock when we interrupt between llist_add() and
         * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
         * csd_lock() on because the interrupt context uses the same csd
         * storage.
         */
        WARN_ON_ONCE(!in_task());

        csd = &csd_stack;
        if (!wait) {
                /* Not waiting: use this CPU's csd_data and take its lock. */
                csd = this_cpu_ptr(&csd_data);
                csd_lock(csd);
        }

        csd->func = func;
        csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
        csd->node.src = smp_processor_id();
        csd->node.dst = cpu;
#endif

        err = generic_exec_single(cpu, csd);

        /* generic_exec_single() unlocks the csd itself on its -ENXIO path. */
        if (wait)
                csd_lock_wait(csd);

        put_cpu();

        return err;
}
EXPORT_SYMBOL(smp_call_function_single);

/**
 * smp_call_function_single_async() - Run an asynchronous function on a
 *                                  specific CPU.
 * @cpu: The CPU to run on.
 * @csd: Pre-allocated and setup data structure
 *
 * Like smp_call_function_single(), but the call is asynchronous and
 * can thus be done from contexts with disabled interrupts.
 *
 * The caller passes his own pre-allocated data structure
 * (ie: embedded in an object) and is responsible for synchronizing it
 * such that the IPIs performed on the @csd are strictly serialized.
 *
 * If the function is called with one csd which has not yet been
 * processed by previous call to smp_call_function_single_async(), the
 * function will return immediately with -EBUSY showing that the csd
 * object is still in progress.
 *
 * NOTE: Be careful, there is unfortunately no current debugging facility to
 * validate the correctness of this serialization.
 *
 * Return: %0 on success or negative errno value on error
 */
int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
        int err;

        preempt_disable();

        if (csd->node.u_flags & CSD_FLAG_LOCK) {
                /* The csd is still locked by an earlier request. */
                err = -EBUSY;
        } else {
                /* Mark the csd locked before it is handed over. */
                csd->node.u_flags = CSD_FLAG_LOCK;
                smp_wmb();

                err = generic_exec_single(cpu, csd);
        }

        preempt_enable();

        return err;
}
EXPORT_SYMBOL_GPL(smp_call_function_single_async);

/*
 * smp_call_function_any - Run a function on any of the given cpus
 * @mask: The mask of cpus it can run on.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait until function has completed.
 *
 * Returns 0 on success, else a negative status code (if no cpus were online).
 *
 * Selection preference:
 *        1) current cpu if in @mask
 *        2) any cpu of current node if in @mask
 *        3) any other online cpu in @mask
 */
int smp_call_function_any(const struct cpumask *mask,
                          smp_call_func_t func, void *info, int wait)
{
        const struct cpumask *nodemask;
        unsigned int cpu;
        int ret;

        /* Cheapest option: the CPU we are already running on. */
        cpu = get_cpu();
        if (cpumask_test_cpu(cpu, mask))
                goto call;

        /* Next best: an online CPU of the local node that is in @mask. */
        nodemask = cpumask_of_node(cpu_to_node(cpu));
        cpu = cpumask_first_and(nodemask, mask);
        while (cpu < nr_cpu_ids) {
                if (cpu_online(cpu))
                        goto call;
                cpu = cpumask_next_and(cpu, nodemask, mask);
        }

        /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
        cpu = cpumask_any_and(mask, cpu_online_mask);
call:
        ret = smp_call_function_single(cpu, func, info, wait);
        put_cpu();
        return ret;
}
EXPORT_SYMBOL_GPL(smp_call_function_any);

/*
 * Flags to be used as scf_flags argument of smp_call_function_many_cond().
 * Each flag is a distinct bit, so they can be OR-ed together.
 *
 * %SCF_WAIT:           Wait until function execution is completed
 * %SCF_RUN_LOCAL:      Run also locally if local cpu is set in cpumask
 */
#define SCF_WAIT        (1U << 0)
#define SCF_RUN_LOCAL        (1U << 1)

/*
 * smp_call_function_many_cond - run @func on the online CPUs in @mask,
 * optionally filtered by @cond_func.
 * @mask:      Candidate CPUs; only those also set in cpu_online_mask are used.
 * @func:      The function to run.
 * @info:      Pointer passed to @func and to @cond_func.
 * @scf_flags: Combination of SCF_WAIT and SCF_RUN_LOCAL.
 * @cond_func: Optional filter; when non-NULL, a CPU is skipped if it
 *             returns false for that CPU.
 *
 * The calling CPU is removed from the set of IPI targets. It runs @func
 * directly, with interrupts disabled, only when SCF_RUN_LOCAL is set, it is
 * in @mask and @cond_func (if any) accepts it.
 *
 * Must be called with preemption disabled.
 */
static void smp_call_function_many_cond(const struct cpumask *mask,
                                        smp_call_func_t func, void *info,
                                        unsigned int scf_flags,
                                        smp_cond_func_t cond_func)
{
        int cpu, last_cpu, this_cpu = smp_processor_id();
        struct call_function_data *cfd;
        bool wait = scf_flags & SCF_WAIT;
        int nr_cpus = 0;
        bool run_remote = false;
        bool run_local = false;

        lockdep_assert_preemption_disabled();

        /*
         * Can deadlock when called with interrupts disabled.
         * We allow cpu's that are not yet online though, as no one else can
         * send smp call function interrupt to this cpu and as such deadlocks
         * can't happen.
         */
        if (cpu_online(this_cpu) && !oops_in_progress &&
            !early_boot_irqs_disabled)
                lockdep_assert_irqs_enabled();

        /*
         * When @wait we can deadlock when we interrupt between llist_add() and
         * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
         * csd_lock() on because the interrupt context uses the same csd
         * storage.
         */
        WARN_ON_ONCE(!in_task());

        /* Check if we need local execution. */
        if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask))
                run_local = true;

        /* Check if we need remote execution, i.e., any CPU excluding this one. */
        cpu = cpumask_first_and(mask, cpu_online_mask);
        if (cpu == this_cpu)
                cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
        if (cpu < nr_cpu_ids)
                run_remote = true;

        if (run_remote) {
                /* cfd->cpumask becomes the set of remote CPUs that get a csd. */
                cfd = this_cpu_ptr(&cfd_data);
                cpumask_and(cfd->cpumask, mask, cpu_online_mask);
                __cpumask_clear_cpu(this_cpu, cfd->cpumask);

                /* cfd->cpumask_ipi collects the CPUs for which llist_add() returned true. */
                cpumask_clear(cfd->cpumask_ipi);
                for_each_cpu(cpu, cfd->cpumask) {
                        call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);

                        /* Filtered out: drop it so the wait loop below skips it too. */
                        if (cond_func && !cond_func(cpu, info)) {
                                __cpumask_clear_cpu(cpu, cfd->cpumask);
                                continue;
                        }

                        csd_lock(csd);
                        if (wait)
                                csd->node.u_flags |= CSD_TYPE_SYNC;
                        csd->func = func;
                        csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
                        csd->node.src = smp_processor_id();
                        csd->node.dst = cpu;
#endif
                        trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);

                        if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
                                __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
                                nr_cpus++;
                                /* Only read below when nr_cpus == 1, i.e. after being set here. */
                                last_cpu = cpu;
                        }
                }

                /*
                 * Choose the most efficient way to send an IPI. Note that the
                 * number of CPUs might be zero due to concurrent changes to the
                 * provided mask.
                 */
                if (nr_cpus == 1)
                        send_call_function_single_ipi(last_cpu);
                else if (likely(nr_cpus > 1))
                        send_call_function_ipi_mask(cfd->cpumask_ipi);
        }

        /* Local execution happens after the IPIs were sent and before waiting. */
        if (run_local && (!cond_func || cond_func(this_cpu, info))) {
                unsigned long flags;

                local_irq_save(flags);
                csd_do_func(func, info, NULL);
                local_irq_restore(flags);
        }

        /* Wait on the csd of every remote CPU that was queued above. */
        if (run_remote && wait) {
                for_each_cpu(cpu, cfd->cpumask) {
                        call_single_data_t *csd;

                        csd = per_cpu_ptr(cfd->csd, cpu);
                        csd_lock_wait(csd);
                }
        }
}

/**
 * smp_call_function_many(): Run a function on a set of CPUs.
 * @mask: The set of cpus to run on (only runs on online subset).
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed on
 *        other CPUs.
 *
 * SCF_RUN_LOCAL is never passed down by this wrapper and no condition
 * function is used.
 *
 * If @wait is true, then returns once @func has returned.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler. Preemption
 * must be disabled when calling this function.
 */
void smp_call_function_many(const struct cpumask *mask,
                            smp_call_func_t func, void *info, bool wait)
{
        unsigned int scf_flags = wait ? SCF_WAIT : 0;

        smp_call_function_many_cond(mask, func, info, scf_flags, NULL);
}
EXPORT_SYMBOL(smp_call_function_many);

/**
 * smp_call_function(): Run a function on all other CPUs.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed
 *        on other CPUs.
 *
 * This function does not return a value.
 *
 * If @wait is true, then returns once @func has returned; otherwise
 * it returns just before the target cpu calls @func.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler.
 */
void smp_call_function(smp_call_func_t func, void *info, int wait)
{
        /* smp_call_function_many() has to be called with preemption disabled. */
        preempt_disable();
        smp_call_function_many(cpu_online_mask, func, info, wait);
        preempt_enable();
}
EXPORT_SYMBOL(smp_call_function);

/*
 * Setup configured maximum number of CPUs to activate.
 * Defaults to NR_CPUS; the "nosmp" and "maxcpus=" handlers below overwrite it.
 */
unsigned int setup_max_cpus = NR_CPUS;
EXPORT_SYMBOL(setup_max_cpus);


/*
 * Setup routine for controlling SMP activation
 *
 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
 * activation entirely (the MPS table probe still happens, though).
 *
 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
 * greater than 0, limits the maximum number of CPUs activated in
 * SMP mode to <NUM>.
 */

/* Default no-op; declared __weak so that an architecture can override it. */
void __weak __init arch_disable_smp_support(void) { }

/*
 * "nosmp" early parameter: set setup_max_cpus to 0 and disable SMP support
 * in the architecture. @str is not used. Always returns 0.
 */
static int __init nosmp(char *str)
{
        setup_max_cpus = 0;
        arch_disable_smp_support();

        return 0;
}

early_param("nosmp", nosmp);

/*
 * "nr_cpus=" early parameter; this is a hard limit. The parsed value is
 * applied only when it is positive and below the current nr_cpu_ids.
 * Always returns 0.
 */
static int __init nrcpus(char *str)
{
        int requested;

        if (!get_option(&str, &requested))
                return 0;

        if (requested <= 0 || requested >= nr_cpu_ids)
                return 0;

        set_nr_cpu_ids(requested);

        return 0;
}

early_param("nr_cpus", nrcpus);

/*
 * "maxcpus=" early parameter: parse the value into setup_max_cpus and, when
 * the result is 0, disable SMP support in the architecture. Always returns 0.
 */
static int __init maxcpus(char *str)
{
        /* NOTE(review): get_option()'s return value is not checked here. */
        get_option(&str, &setup_max_cpus);
        if (setup_max_cpus == 0)
                arch_disable_smp_support();

        return 0;
}

early_param("maxcpus", maxcpus);

#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS)
/*
 * Setup number of possible processor ids.
 * Only defined here when more than one CPU is configured and
 * CONFIG_FORCE_NR_CPUS is not set; starts out as NR_CPUS.
 */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
#endif

/*
 * Set nr_cpu_ids to one past the highest bit set in cpu_possible_mask.
 * An arch may set nr_cpu_ids earlier if needed, so this would be redundant.
 */
void __init setup_nr_cpu_ids(void)
{
        unsigned long last_possible;

        last_possible = find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS);
        set_nr_cpu_ids(last_possible + 1);
}

/*
 * Called by boot processor to activate the rest: initialize the idle and
 * hotplug threads, bring up the non-boot CPUs (limited by setup_max_cpus),
 * log how many nodes and CPUs are online, then call smp_cpus_done().
 */
void __init smp_init(void)
{
        const char *node_suffix, *cpu_suffix;
        int num_nodes, num_cpus;

        idle_threads_init();
        cpuhp_threads_init();

        pr_info("Bringing up secondary CPUs ...\n");

        bringup_nonboot_cpus(setup_max_cpus);

        num_nodes = num_online_nodes();
        num_cpus = num_online_cpus();

        /* Plural "s" only when there is more than one. */
        node_suffix = num_nodes > 1 ? "s" : "";
        cpu_suffix = num_cpus > 1 ? "s" : "";

        pr_info("Brought up %d node%s, %d CPU%s\n",
                num_nodes, node_suffix, num_cpus, cpu_suffix);

        /* Any cleanup work */
        smp_cpus_done(setup_max_cpus);
}

/*
 * on_each_cpu_cond_mask(): Call a function on each processor in @mask for
 * which the supplied function cond_func returns true, optionally waiting
 * for all the required CPUs to finish. This may include the local
 * processor.
 * @cond_func:        A callback function that is passed a cpu id and
 *                the info parameter. The function is called
 *                with preemption disabled. The function should
 *                return a boolean value indicating whether to IPI
 *                the specified CPU.
 * @func:        The function to run on all applicable CPUs.
 *                This must be fast and non-blocking.
 * @info:        An arbitrary pointer to pass to both functions.
 * @wait:        If true, wait (atomically) until function has
 *                completed on other CPUs.
 * @mask:        The set of CPUs to consider.
 *
 * Preemption is disabled to protect against CPUs going offline but not online.
 * CPUs going online during the call will not be seen or sent an IPI.
 *
 * You must not call this function with disabled interrupts or
 * from a hardware interrupt handler or from a bottom half handler.
 */
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
                           void *info, bool wait, const struct cpumask *mask)
{
        unsigned int scf_flags;

        /* Local execution is always requested; waiting is optional. */
        scf_flags = wait ? (SCF_RUN_LOCAL | SCF_WAIT) : SCF_RUN_LOCAL;

        preempt_disable();
        smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
        preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);

/* Empty callback; kick_all_cpus_sync() passes it to smp_call_function(). */
static void do_nothing(void *unused)
{
}

/**
 * kick_all_cpus_sync - Force all cpus out of idle
 *
 * Used to synchronize the update of pm_idle function pointer. It's
 * called after the pointer is updated and returns after the dummy
 * callback function has been executed on all cpus. The execution of
 * the function can only happen on the remote cpus after they have
 * left the idle function which had been called via pm_idle function
 * pointer. So it's guaranteed that nothing uses the previous pointer
 * anymore.
 */
void kick_all_cpus_sync(void)
{
        /* Make sure the change is visible before we kick the cpus */
        smp_mb();
        /* wait == 1: return only after do_nothing() ran on the other CPUs. */
        smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(kick_all_cpus_sync);

/**
 * wake_up_all_idle_cpus - break all cpus out of idle
 *
 * Walks every possible CPU and calls wake_up_if_idle() on each one that is
 * online and is not the CPU currently executing the loop iteration.
 * Preemption is disabled around each individual check.
 */
void wake_up_all_idle_cpus(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                bool is_remote_online;

                preempt_disable();
                is_remote_online = cpu != smp_processor_id() && cpu_online(cpu);
                if (is_remote_online)
                        wake_up_if_idle(cpu);
                preempt_enable();
        }
}
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);

/**
 * struct smp_call_on_cpu_struct - Call a function on a specific CPU
 * @work: &work_struct
 * @done: &completion to signal
 * @func: function to call
 * @data: function's data argument
 * @ret: return value from @func
 * @cpu: CPU passed to hypervisor_pin_vcpu() around the call to @func;
 *       negative (%-1) when no pinning is requested
 *
 * Used to call a function on a specific cpu and wait for it to return.
 * Optionally make sure the call is done on a specified physical cpu via vcpu
 * pinning in order to support virtualized environments.
 */
struct smp_call_on_cpu_struct {
        struct work_struct        work;
        struct completion        done;
        int                        (*func)(void *);
        void                        *data;
        int                        ret;
        int                        cpu;
};

/*
 * Work function behind smp_call_on_cpu(): run the requested function,
 * store its return value and signal completion. When a non-negative cpu
 * was requested, the vCPU is pinned to it for the duration of the call.
 */
static void smp_call_on_cpu_callback(struct work_struct *work)
{
        struct smp_call_on_cpu_struct *sscs;

        sscs = container_of(work, struct smp_call_on_cpu_struct, work);
        if (sscs->cpu >= 0)
                hypervisor_pin_vcpu(sscs->cpu);
        sscs->ret = sscs->func(sscs->data);
        /* Undo the pinning requested above. */
        if (sscs->cpu >= 0)
                hypervisor_pin_vcpu(-1);

        complete(&sscs->done);
}

/**
 * smp_call_on_cpu - Call a function on a specific CPU and wait for it to return
 * @cpu: The CPU to run @func on.
 * @func: The function to call.
 * @par: Argument passed to @func.
 * @phys: If true, the vCPU is pinned to @cpu via hypervisor_pin_vcpu()
 *        while @func runs.
 *
 * Queues an on-stack work item on @cpu through system_wq and waits for its
 * completion.
 *
 * Return: -ENXIO if @cpu is out of range or not online, otherwise the value
 * returned by @func.
 */
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
{
        struct smp_call_on_cpu_struct sscs = {
                .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
                .func = func,
                .data = par,
                .cpu  = phys ? cpu : -1,
        };

        /*
         * Reject bad targets before the on-stack work item is initialized,
         * so that this early return has nothing to tear down.
         */
        if (cpu >= nr_cpu_ids || !cpu_online(cpu))
                return -ENXIO;

        INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback);

        queue_work_on(cpu, system_wq, &sscs.work);
        wait_for_completion(&sscs.done);

        /* Pairs with INIT_WORK_ONSTACK() before the stack frame goes away. */
        destroy_work_on_stack(&sscs.work);

        return sscs.ret;
}
EXPORT_SYMBOL_GPL(smp_call_on_cpu);





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    7 









    6 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blk-cgroup.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <linux/completion.h>
#include <linux/suspend.h>
#include <linux/zswap.h>
#include <linux/plist.h>

#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
#include "internal.h"
#include "swap.h"

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
                                 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);

static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority = -1;
unsigned long swapfile_maximum_size;
#ifdef CONFIG_MIGRATION
bool swap_migration_ad_supported;
#endif        /* CONFIG_MIGRATION */

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
static PLIST_HEAD(swap_active_head);

/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by folio_alloc_swap() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but folio_alloc_swap() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
 */
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);

static struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

atomic_t nr_rotate_swap = ATOMIC_INIT(0);

/*
 * Look up the swap_info_struct registered for swap type @type.
 *
 * Returns NULL when @type lies outside [0, MAX_SWAPFILES). Otherwise returns
 * the pointer currently stored in swap_info[type], read once with
 * READ_ONCE(); that slot may itself still be NULL.
 */
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
        /*
         * @type is a signed int, so a negative value must be rejected as
         * well: it would otherwise index before the start of swap_info[].
         */
        if (type < 0 || type >= MAX_SWAPFILES)
                return NULL;

        return READ_ONCE(swap_info[type]); /* rcu_dereference() */
}

/* Return swap_map byte @ent with the SWAP_HAS_CACHE bit masked off. */
static inline unsigned char swap_count(unsigned char ent)
{
        unsigned char count = ent & ~SWAP_HAS_CACHE;

        /* A COUNT_CONTINUED flag, if present in @ent, is still included */
        return count;
}

/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY                0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
#define TTRS_UNMAPPED                0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL                0x4

/*
 * returns number of pages in the folio that backs the swap entry. If positive,
 * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
 * folio was associated with the swap entry.
 */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
                                 unsigned long offset, unsigned long flags)
{
        swp_entry_t entry = swp_entry(si->type, offset);
        struct folio *folio;
        int ret = 0;

        folio = filemap_get_folio(swap_address_space(entry), offset);
        if (IS_ERR(folio))
                return 0;
        /*
         * When this function is called from scan_swap_map_slots() and it's
         * called by vmscan.c at reclaiming folios. So we hold a folio lock
         * here. We have to use trylock for avoiding deadlock. This is a special
         * case and you should use folio_free_swap() with explicit folio_lock()
         * in usual operations.
         */
        if (folio_trylock(folio)) {
                if ((flags & TTRS_ANYWAY) ||
                    ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
                    ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
                        ret = folio_free_swap(folio);
                folio_unlock(folio);
        }
        ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
        folio_put(folio);
        return ret;
}

/* Return the swap_extent holding the first node (rb_first()) of @sis's extent tree. */
static inline struct swap_extent *first_se(struct swap_info_struct *sis)
{
        return rb_entry(rb_first(&sis->swap_extent_root),
                        struct swap_extent, rb_node);
}

/* Return the extent following @se in rbtree order, or NULL if @se is the last. */
static inline struct swap_extent *next_se(struct swap_extent *se)
{
        struct rb_node *next = rb_next(&se->rb_node);

        if (!next)
                return NULL;
        return rb_entry(next, struct swap_extent, rb_node);
}

/*
 * swapon tell device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
        struct swap_extent *se;
        sector_t start_block;
        sector_t nr_blocks;
        int err = 0;

        /* Do not discard the swap header page! */
        se = first_se(si);
        start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
        nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
        if (nr_blocks) {
                err = blkdev_issue_discard(si->bdev, start_block,
                                nr_blocks, GFP_KERNEL);
                if (err)
                        return err;
                cond_resched();
        }

        for (se = next_se(se); se; se = next_se(se)) {
                start_block = se->start_block << (PAGE_SHIFT - 9);
                nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

                err = blkdev_issue_discard(si->bdev, start_block,
                                nr_blocks, GFP_KERNEL);
                if (err)
                        break;

                cond_resched();
        }
        return err;                /* That will often be -EOPNOTSUPP */
}

/*
 * Find the swap_extent of @sis whose page range
 * [start_page, start_page + nr_pages) contains @offset.
 *
 * The extent is required to exist; failing to find one hits BUG().
 */
static struct swap_extent *
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
{
        struct rb_node *node;

        for (node = sis->swap_extent_root.rb_node; node; ) {
                struct swap_extent *cur;

                cur = rb_entry(node, struct swap_extent, rb_node);
                if (offset < cur->start_page) {
                        node = node->rb_left;
                        continue;
                }
                if (offset >= cur->start_page + cur->nr_pages) {
                        node = node->rb_right;
                        continue;
                }
                return cur;
        }
        /* It *must* be present */
        BUG();
}

/*
 * Translate the swap entry of @folio into a sector number: locate the extent
 * covering the entry's offset, compute the page-sized block within it, and
 * scale that by (PAGE_SHIFT - 9).
 */
sector_t swap_folio_sector(struct folio *folio)
{
        pgoff_t offset = swp_offset(folio->swap);
        struct swap_info_struct *sis = swp_swap_info(folio->swap);
        struct swap_extent *se = offset_to_swap_extent(sis, offset);
        sector_t page_block = se->start_block + (offset - se->start_page);

        return page_block << (PAGE_SHIFT - 9);
}

/*
 * Swap allocation tells the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 *
 * Discards @nr_pages pages starting at @start_page, one request per extent
 * the range touches. Stops silently at the first failing discard.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
                                 pgoff_t start_page, pgoff_t nr_pages)
{
        struct swap_extent *se;

        for (se = offset_to_swap_extent(si, start_page); nr_pages;
             se = next_se(se)) {
                /* Position of start_page inside the current extent */
                pgoff_t skip = start_page - se->start_page;
                sector_t first = se->start_block + skip;
                sector_t count = se->nr_pages - skip;

                /* Clamp to what is still left to discard */
                if (count > nr_pages)
                        count = nr_pages;
                start_page += count;
                nr_pages -= count;

                /* Convert from page-sized blocks to sectors */
                first <<= PAGE_SHIFT - 9;
                count <<= PAGE_SHIFT - 9;
                if (blkdev_issue_discard(si->bdev, first, count, GFP_NOIO))
                        break;
        }
}

#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER        HPAGE_PMD_NR

#define swap_entry_order(order)        (order)
#else
#define SWAPFILE_CLUSTER        256

/*
 * Define swap_entry_order() as a constant so that the compiler can
 * optimize out some code if !CONFIG_THP_SWAP
 */
#define swap_entry_order(order)        0
#endif
#define LATENCY_LIMIT                256

/* Overwrite the whole flags field of @info with @flag (not OR-ed in). */
static inline void cluster_set_flag(struct swap_cluster_info *info,
                                    unsigned int flag)
{
        info->flags = flag;
}

/* Read @info->data interpreted as the cluster's usage count. */
static inline unsigned int cluster_count(struct swap_cluster_info *info)
{
        unsigned int count = info->data;

        return count;
}

/* Store usage count @c in @info->data; the flags are left untouched. */
static inline void cluster_set_count(struct swap_cluster_info *info,
                                     unsigned int c)
{
        info->data = c;
}

/* Set both fields of @info at once: flags to @f and usage count to @c. */
static inline void cluster_set_count_flag(struct swap_cluster_info *info,
                                          unsigned int c, unsigned int f)
{
        info->flags = f;
        info->data = c;
}

/* Read @info->data interpreted as the index of the next cluster in a list. */
static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
        unsigned int next = info->data;

        return next;
}

/* Store next-cluster index @n in @info->data; the flags are left untouched. */
static inline void cluster_set_next(struct swap_cluster_info *info,
                                    unsigned int n)
{
        info->data = n;
}

/* Set both fields of @info at once: flags to @f and next-cluster index to @n. */
static inline void cluster_set_next_flag(struct swap_cluster_info *info,
                                         unsigned int n, unsigned int f)
{
        info->flags = f;
        info->data = n;
}

/* True if CLUSTER_FLAG_FREE is set in @info's flags. */
static inline bool cluster_is_free(struct swap_cluster_info *info)
{
        return (info->flags & CLUSTER_FLAG_FREE) != 0;
}

/* True if CLUSTER_FLAG_NEXT_NULL is set in @info's flags. */
static inline bool cluster_is_null(struct swap_cluster_info *info)
{
        return (info->flags & CLUSTER_FLAG_NEXT_NULL) != 0;
}

/*
 * Mark @info as a null link: flags become exactly CLUSTER_FLAG_NEXT_NULL
 * and the data field is zeroed.
 */
static inline void cluster_set_null(struct swap_cluster_info *info)
{
        info->flags = CLUSTER_FLAG_NEXT_NULL;
        info->data = 0;
}

static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
                                                     unsigned long offset)
{
        struct swap_cluster_info *ci;

        ci = si->cluster_info;
        if (ci) {
                ci += offset / SWAPFILE_CLUSTER;
                spin_lock(&ci->lock);
        }
        return ci;
}

/* Release a cluster lock taken by lock_cluster(); a NULL @ci is a no-op. */
static inline void unlock_cluster(struct swap_cluster_info *ci)
{
        if (!ci)
                return;

        spin_unlock(&ci->lock);
}

/*
 * Determine the locking method in use for this device.  Return
 * swap_cluster_info if SSD-style cluster-based locking is in place.
 */
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
                struct swap_info_struct *si, unsigned long offset)
{
        struct swap_cluster_info *ci;

        /* Try to use fine-grained SSD-style locking if available: */
        ci = lock_cluster(si, offset);
        /* Otherwise, fall back to traditional, coarse locking: */
        if (!ci)
                spin_lock(&si->lock);

        return ci;
}

/*
 * Undo lock_cluster_or_swap_info(): @ci is the value it returned. A NULL
 * @ci means si->lock was taken instead of a cluster lock.
 */
static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
                                               struct swap_cluster_info *ci)
{
        if (!ci) {
                spin_unlock(&si->lock);
                return;
        }

        unlock_cluster(ci);
}

/* A cluster list is empty when its head is a null link. */
static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
        struct swap_cluster_info *head = &list->head;

        return cluster_is_null(head);
}

/* Return the cluster index stored in the list head, without checking for emptiness. */
static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
        struct swap_cluster_info *head = &list->head;

        return cluster_next(head);
}

/* Reset @list to the empty state: both head and tail become null links. */
static void cluster_list_init(struct swap_cluster_list *list)
{
        struct swap_cluster_info *head = &list->head;
        struct swap_cluster_info *tail = &list->tail;

        cluster_set_null(head);
        cluster_set_null(tail);
}

/*
 * Append cluster @idx to the tail of @list. @ci is the base of the
 * swap_cluster_info array that the list indices refer to.
 */
static void cluster_list_add_tail(struct swap_cluster_list *list,
                                  struct swap_cluster_info *ci,
                                  unsigned int idx)
{
        if (cluster_list_empty(list)) {
                /* First element: it becomes the head as well */
                cluster_set_next_flag(&list->head, idx, 0);
        } else {
                struct swap_cluster_info *old_tail;

                old_tail = &ci[cluster_next(&list->tail)];

                /*
                 * Nested cluster lock, but both cluster locks are
                 * only acquired when we held swap_info_struct->lock
                 */
                spin_lock_nested(&old_tail->lock, SINGLE_DEPTH_NESTING);
                cluster_set_next(old_tail, idx);
                spin_unlock(&old_tail->lock);
        }

        /* In both cases the new cluster is now the tail */
        cluster_set_next_flag(&list->tail, idx, 0);
}

/*
 * Unlink and return the index of the first cluster of @list. @ci is the base
 * of the swap_cluster_info array that the list indices refer to. Emptiness
 * is not checked here.
 */
static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
                                           struct swap_cluster_info *ci)
{
        unsigned int first = cluster_next(&list->head);

        if (cluster_next(&list->tail) != first) {
                /* More entries remain: advance head to the successor */
                cluster_set_next_flag(&list->head,
                                      cluster_next(&ci[first]), 0);
                return first;
        }

        /* That was the only entry: head and tail both become null links */
        cluster_list_init(list);
        return first;
}

/* Queue cluster @idx on the discard list and schedule the discard work. */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
                unsigned int idx)
{
        void *cluster_map = si->swap_map + idx * SWAPFILE_CLUSTER;

        /*
         * When scan_swap_map_slots() cannot find a free cluster it scans
         * si->swap_map directly. Mark every entry of this cluster bad
         * (occupied) so it is not handed out while the discard is pending;
         * the marks are cleared again once the discard has been done.
         */
        memset(cluster_map, SWAP_MAP_BAD, SWAPFILE_CLUSTER);

        cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);

        schedule_work(&si->discard_work);
}

/* Flag cluster @idx as free and append it to @si's free cluster list. */
static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
{
        struct swap_cluster_info *info = si->cluster_info;

        cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
        cluster_list_add_tail(&si->free_clusters, info, idx);
}

/*
 * Perform the discards that were queued on si->discard_clusters. After the
 * discard of a cluster has finished, the cluster is added to the free
 * cluster list.
 *
 * The caller must hold si->lock. The lock is dropped and re-taken around
 * each discard, and is held again when this function returns.
 */
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
        struct swap_cluster_info *info, *ci;
        unsigned int idx;

        info = si->cluster_info;

        while (!cluster_list_empty(&si->discard_clusters)) {
                /* Take the cluster off the list while si->lock is still held */
                idx = cluster_list_del_first(&si->discard_clusters, info);
                spin_unlock(&si->lock);

                /*
                 * The discard itself runs without si->lock.
                 * NOTE(review): presumably because issuing it can block - confirm.
                 */
                discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
                                SWAPFILE_CLUSTER);

                spin_lock(&si->lock);
                ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
                __free_cluster(si, idx);
                /* Clear the swap_map entries of this cluster back to 0 */
                memset(si->swap_map + idx * SWAPFILE_CLUSTER,
                                0, SWAPFILE_CLUSTER);
                unlock_cluster(ci);
        }
}

/*
 * Work handler for si->discard_work: run the scheduled discards of the
 * owning swap_info_struct under si->lock.
 */
static void swap_discard_work(struct work_struct *work)
{
        struct swap_info_struct *si =
                container_of(work, struct swap_info_struct, discard_work);

        spin_lock(&si->lock);
        swap_do_scheduled_discard(si);
        spin_unlock(&si->lock);
}

/*
 * percpu_ref callback for si->users: signal si->comp on the
 * swap_info_struct that embeds @ref.
 */
static void swap_users_ref_free(struct percpu_ref *ref)
{
        struct swap_info_struct *si =
                container_of(ref, struct swap_info_struct, users);

        complete(&si->comp);
}

/*
 * Take cluster @idx off the free list and reset its count and flags to 0.
 * @idx must be the first entry of the free list (checked by VM_BUG_ON()).
 */
static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{
        struct swap_cluster_info *info = si->cluster_info;

        VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
        cluster_list_del_first(&si->free_clusters, info);
        cluster_set_count_flag(&info[idx], 0, 0);
}

/*
 * Release cluster @idx, whose usage count must already be 0 (checked by
 * VM_BUG_ON()).
 */
static void free_cluster(struct swap_info_struct *si, unsigned long idx)
{
        const unsigned long discard_mask = SWP_WRITEOK | SWP_PAGE_DISCARD;
        struct swap_cluster_info *ci = si->cluster_info + idx;

        VM_BUG_ON(cluster_count(ci) != 0);

        /* Not discardable: the cluster goes straight to the free list */
        if ((si->flags & discard_mask) != discard_mask) {
                __free_cluster(si, idx);
                return;
        }

        /*
         * If the swap is discardable, queue the cluster for discard instead
         * of freeing it immediately. It will be freed after the discard.
         */
        swap_cluster_schedule_discard(si, idx);
}

/*
 * Account @count more used pages to the cluster that contains @page_nr.
 * If that cluster is currently free it is first removed from the free
 * cluster list. Does nothing when @cluster_info is NULL.
 */
static void add_cluster_info_page(struct swap_info_struct *p,
        struct swap_cluster_info *cluster_info, unsigned long page_nr,
        unsigned long count)
{
        unsigned long idx = page_nr / SWAPFILE_CLUSTER;
        struct swap_cluster_info *ci;

        if (!cluster_info)
                return;

        ci = &cluster_info[idx];
        if (cluster_is_free(ci))
                alloc_cluster(p, idx);

        /* The usage count may never exceed the cluster size */
        VM_BUG_ON(cluster_count(ci) + count > SWAPFILE_CLUSTER);
        cluster_set_count(ci, cluster_count(ci) + count);
}

/*
 * Single-entry form of add_cluster_info_page(): the cluster that contains
 * @page_nr gains exactly one used entry.
 */
static void inc_cluster_info_page(struct swap_info_struct *p,
        struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
        const unsigned long nr_entries = 1;

        add_cluster_info_page(p, cluster_info, page_nr, nr_entries);
}

/*
 * Drop one used entry from the cluster that contains @page_nr.  When that
 * leaves the cluster with no entry in use, it is passed to free_cluster().
 * Does nothing when @cluster_info is NULL.
 */
static void dec_cluster_info_page(struct swap_info_struct *p,
        struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
        struct swap_cluster_info *ci;
        unsigned long idx;

        if (!cluster_info)
                return;

        idx = page_nr / SWAPFILE_CLUSTER;
        ci = &cluster_info[idx];

        VM_BUG_ON(cluster_count(ci) == 0);
        cluster_set_count(ci, cluster_count(ci) - 1);

        if (cluster_count(ci) != 0)
                return;

        free_cluster(p, idx);
}

/*
 * scan_swap_map_slots() may end up on a free cluster that sits in the middle
 * of the free cluster list.  Using such a cluster would corrupt the list, so
 * detect that case here.
 *
 * On conflict, this CPU's next-offset hint for @order is invalidated and
 * true is returned; otherwise false is returned and nothing is modified.
 */
static bool
scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
        unsigned long offset, int order)
{
        unsigned long idx = offset / SWAPFILE_CLUSTER;
        struct percpu_cluster *hint;

        if (cluster_list_empty(&si->free_clusters))
                return false;
        if (idx == cluster_list_first(&si->free_clusters))
                return false;
        if (!cluster_is_free(&si->cluster_info[idx]))
                return false;

        hint = this_cpu_ptr(si->percpu_cluster);
        hint->next[order] = SWAP_NEXT_INVALID;
        return true;
}

/*
 * Return true when every one of the @nr_pages swap_map bytes starting at
 * index @start is zero.  A range of zero pages counts as empty.
 */
static inline bool swap_range_empty(char *swap_map, unsigned int start,
                                    unsigned int nr_pages)
{
        unsigned int checked = 0;

        while (checked < nr_pages) {
                if (swap_map[start + checked] != 0)
                        return false;
                checked++;
        }

        return true;
}

/*
 * Try to get swap entries with specified order from current cpu's swap entry
 * pool (a cluster). This might involve allocating a new cluster for current CPU
 * too.
 *
 * @si:        swap device to allocate from
 * @offset:    out: first slot of the candidate range (set on success; also
 *             rewritten after a scheduled discard has been run)
 * @scan_base: out: always set to the same value as @offset
 * @order:     the candidate range covers 1 << @order slots
 *
 * Returns true when a candidate range was found.  Returns false when this
 * CPU has no valid next-offset hint for @order and both the free cluster
 * list and the discard cluster list are empty.
 */
static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
        unsigned long *offset, unsigned long *scan_base, int order)
{
        unsigned int nr_pages = 1 << order;
        struct percpu_cluster *cluster;
        struct swap_cluster_info *ci;
        unsigned int tmp, max;

new_cluster:
        /* Per-CPU, per-order hint of where to look next. */
        cluster = this_cpu_ptr(si->percpu_cluster);
        tmp = cluster->next[order];
        if (tmp == SWAP_NEXT_INVALID) {
                if (!cluster_list_empty(&si->free_clusters)) {
                        /*
                         * Start at the first slot of the cluster named by the
                         * free list head (presumably the first free cluster;
                         * cluster_next() is defined outside this view).
                         */
                        tmp = cluster_next(&si->free_clusters.head) *
                                        SWAPFILE_CLUSTER;
                } else if (!cluster_list_empty(&si->discard_clusters)) {
                        /*
                         * we don't have free cluster but have some clusters in
                         * discarding, do discard now and reclaim them, then
                         * reread cluster_next_cpu since we dropped si->lock
                         */
                        swap_do_scheduled_discard(si);
                        *scan_base = this_cpu_read(*si->cluster_next_cpu);
                        *offset = *scan_base;
                        goto new_cluster;
                } else
                        return false;
        }

        /*
         * Other CPUs can use our cluster if they can't find a free cluster,
         * check if there is still free entry in the cluster, maintaining
         * natural alignment.
         */
        /* max: end of the cluster that contains tmp, capped at si->max. */
        max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER));
        if (tmp < max) {
                ci = lock_cluster(si, tmp);
                while (tmp < max) {
                        if (swap_range_empty(si->swap_map, tmp, nr_pages))
                                break;
                        /* Step by whole ranges, not by single slots. */
                        tmp += nr_pages;
                }
                unlock_cluster(ci);
        }
        if (tmp >= max) {
                /* Nothing usable left in this cluster: drop the hint, retry. */
                cluster->next[order] = SWAP_NEXT_INVALID;
                goto new_cluster;
        }
        *offset = tmp;
        *scan_base = tmp;
        /*
         * Remember the range after this one for the next call, unless it
         * would fall outside the cluster.
         */
        tmp += nr_pages;
        cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID;
        return true;
}

static void __del_from_avail_list(struct swap_info_struct *p)
{
        int nid;

        assert_spin_locked(&p->lock);
        for_each_node(nid)
                plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}

/*
 * Locked form of __del_from_avail_list(): swap_avail_lock is taken for the
 * duration of the removal.
 */
static void del_from_avail_list(struct swap_info_struct *p)
{
        spin_lock(&swap_avail_lock);

        __del_from_avail_list(p);

        spin_unlock(&swap_avail_lock);
}

/*
 * Bookkeeping after @nr_entries slots starting at @offset were allocated:
 * move lowest_bit/highest_bit inwards when the range touches them, add the
 * range to inuse_pages and, once every page of the device is in use, mark
 * the device full and remove it from the available lists.
 */
static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
                             unsigned int nr_entries)
{
        unsigned int last = offset + nr_entries - 1;

        if (si->lowest_bit == offset)
                si->lowest_bit += nr_entries;
        if (si->highest_bit == last)
                WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);

        WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
        if (si->inuse_pages != si->pages)
                return;

        /* Device is full: leave an empty [lowest_bit, highest_bit] window. */
        si->lowest_bit = si->max;
        si->highest_bit = 0;
        del_from_avail_list(si);
}

static void add_to_avail_list(struct swap_info_struct *p)
{
        int nid;

        spin_lock(&swap_avail_lock);
        for_each_node(nid)
                plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
        spin_unlock(&swap_avail_lock);
}

/*
 * Bookkeeping after the @nr_entries slots starting at @offset were freed.
 *
 * Widens the lowest_bit/highest_bit window to cover the range, puts a
 * writable device that was full back on the available lists, invalidates
 * each slot (arch hook, block driver notify hook when the device is a block
 * device, swap-cache shadows) and finally returns the slots to
 * nr_swap_pages and subtracts them from si->inuse_pages.
 */
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
                            unsigned int nr_entries)
{
        unsigned long first = offset;
        unsigned long last = offset + nr_entries - 1;
        unsigned long pos;
        void (*notify)(struct block_device *, unsigned long);

        if (si->lowest_bit > first)
                si->lowest_bit = first;
        if (si->highest_bit < last) {
                /* highest_bit == 0 is what a full device is left with. */
                bool was_full = !si->highest_bit;

                WRITE_ONCE(si->highest_bit, last);
                if (was_full && (si->flags & SWP_WRITEOK))
                        add_to_avail_list(si);
        }

        notify = (si->flags & SWP_BLKDEV) ?
                si->bdev->bd_disk->fops->swap_slot_free_notify : NULL;

        for (pos = first; pos <= last; pos++) {
                arch_swap_invalidate_page(si->type, pos);
                if (notify)
                        notify(si->bdev, pos);
        }
        clear_shadow_from_swap_cache(si->type, first, last);

        /*
         * The cleanups above must be visible before try_to_unuse() can
         * observe si->inuse_pages reaching 0.
         */
        smp_wmb();
        atomic_long_add(nr_entries, &nr_swap_pages);
        WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
}

/*
 * Record @next as the place where the next allocation scan should start.
 *
 * Without SWP_SOLIDSTATE the hint is the single si->cluster_next.  With
 * SWP_SOLIDSTATE the hint is per CPU; when @next would move this CPU into a
 * different swap-address-space sized trunk, a random trunk inside
 * [lowest_bit, highest_bit] is picked instead to spread lock contention on
 * the swap address spaces.  In that case the hint is left untouched if the
 * device has no free window (highest_bit <= lowest_bit).
 */
static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
{
        unsigned long cur;

        if (!(si->flags & SWP_SOLIDSTATE)) {
                si->cluster_next = next;
                return;
        }

        cur = this_cpu_read(*si->cluster_next_cpu);
        if ((cur >> SWAP_ADDRESS_SPACE_SHIFT) ==
            (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
                /* Still inside the same trunk: just advance the hint. */
                this_cpu_write(*si->cluster_next_cpu, next);
                return;
        }

        /* No free swap slots available */
        if (si->highest_bit <= si->lowest_bit)
                return;

        next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit);
        next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
        next = max_t(unsigned int, next, si->lowest_bit);
        this_cpu_write(*si->cluster_next_cpu, next);
}

/*
 * Lockless peek at slot @offset.  The slot looks usable when its swap_map
 * byte is zero, or when swap is full and the byte is exactly SWAP_HAS_CACHE.
 *
 * Returns true with si->lock taken when the slot looks usable; returns
 * false without taking the lock otherwise.  The peek is racy, so the caller
 * has to check the slot again under the lock.
 */
static bool swap_offset_available_and_locked(struct swap_info_struct *si,
                                             unsigned long offset)
{
        bool usable;

        usable = data_race(!si->swap_map[offset]);
        if (!usable && vm_swap_full())
                usable = READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE;

        if (usable)
                spin_lock(&si->lock);

        return usable;
}

/*
 * Allocate up to @nr swap entries of 1 << @order slots each from @si.
 *
 * @si:    swap device to allocate from
 * @usage: byte written into swap_map for every slot that gets allocated
 * @nr:    maximum number of entries to return
 * @slots: output array; one swp_entry_t is stored per allocated entry
 * @order: each entry covers 1 << @order consecutive slots
 *
 * si->lock is dropped and retaken around the lockless scans, the reclaim
 * attempt and cond_resched(); it is held again on every return path.  The
 * caller visible in this file, get_swap_pages(), takes si->lock before the
 * call and releases it afterwards.
 *
 * SWP_SCANNING is added to si->flags while the scan runs and subtracted again
 * on every exit taken after that point.
 *
 * Returns the number of entries stored in @slots (0 if none).
 */
static int scan_swap_map_slots(struct swap_info_struct *si,
                               unsigned char usage, int nr,
                               swp_entry_t slots[], int order)
{
        struct swap_cluster_info *ci;
        unsigned long offset;
        unsigned long scan_base;
        unsigned long last_in_cluster = 0;
        int latency_ration = LATENCY_LIMIT;
        unsigned int nr_pages = 1 << order;
        int n_ret = 0;
        bool scanned_many = false;

        /*
         * We try to cluster swap pages by allocating them sequentially
         * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
         * way, however, we resort to first-free allocation, starting
         * a new cluster.  This prevents us from scattering swap pages
         * all over the entire swap partition, so that we reduce
         * overall disk seek times between swap pages.  -- sct
         * But we do now try to find an empty cluster.  -Andrea
         * And we let swap pages go all over an SSD partition.  Hugh
         */

        if (order > 0) {
                /*
                 * Should not even be attempting large allocations when huge
                 * page swap is disabled.  Warn and fail the allocation.
                 */
                if (!IS_ENABLED(CONFIG_THP_SWAP) ||
                    nr_pages > SWAPFILE_CLUSTER) {
                        VM_WARN_ON_ONCE(1);
                        return 0;
                }

                /*
                 * Swapfile is not block device or not using clusters so unable
                 * to allocate large entries.
                 */
                if (!(si->flags & SWP_BLKDEV) || !si->cluster_info)
                        return 0;
        }

        /*
         * NOTE(review): SWP_SCANNING is added/subtracted rather than
         * set/cleared, which looks like a counter kept in si->flags so that
         * overlapping scans nest -- confirm against the flag definition.
         */
        si->flags += SWP_SCANNING;
        /*
         * Use percpu scan base for SSD to reduce lock contention on
         * cluster and swap cache.  For HDD, sequential access is more
         * important.
         */
        if (si->flags & SWP_SOLIDSTATE)
                scan_base = this_cpu_read(*si->cluster_next_cpu);
        else
                scan_base = si->cluster_next;
        offset = scan_base;

        /* SSD algorithm */
        if (si->cluster_info) {
                if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) {
                        /* Large entries never fall back to the slot scan. */
                        if (order > 0)
                                goto no_page;
                        goto scan;
                }
        } else if (unlikely(!si->cluster_nr--)) {
                /* The current sequential run is used up: look for a new one. */
                if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
                        si->cluster_nr = SWAPFILE_CLUSTER - 1;
                        goto checks;
                }

                spin_unlock(&si->lock);

                /*
                 * If seek is expensive, start searching for new cluster from
                 * start of partition, to minimize the span of allocated swap.
                 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
                 * case, just handled by scan_swap_map_try_ssd_cluster() above.
                 */
                scan_base = offset = si->lowest_bit;
                last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

                /* Locate the first empty (unaligned) cluster */
                for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) {
                        if (si->swap_map[offset])
                                last_in_cluster = offset + SWAPFILE_CLUSTER;
                        else if (offset == last_in_cluster) {
                                /*
                                 * SWAPFILE_CLUSTER consecutive free slots
                                 * seen: rewind to the first of them.
                                 */
                                spin_lock(&si->lock);
                                offset -= SWAPFILE_CLUSTER - 1;
                                si->cluster_next = offset;
                                si->cluster_nr = SWAPFILE_CLUSTER - 1;
                                goto checks;
                        }
                        if (unlikely(--latency_ration < 0)) {
                                cond_resched();
                                latency_ration = LATENCY_LIMIT;
                        }
                }

                /* No empty run found: continue from where the scan started. */
                offset = scan_base;
                spin_lock(&si->lock);
                si->cluster_nr = SWAPFILE_CLUSTER - 1;
        }

checks:
        /* Reached with si->lock held; validate the candidate at offset. */
        if (si->cluster_info) {
                while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) {
                        /* take a break if we already got some slots */
                        if (n_ret)
                                goto done;
                        if (!scan_swap_map_try_ssd_cluster(si, &offset,
                                                        &scan_base, order)) {
                                if (order > 0)
                                        goto no_page;
                                goto scan;
                        }
                }
        }
        if (!(si->flags & SWP_WRITEOK))
                goto no_page;
        if (!si->highest_bit)
                goto no_page;
        /* Past the last usable slot: wrap around to the lowest one. */
        if (offset > si->highest_bit)
                scan_base = offset = si->lowest_bit;

        ci = lock_cluster(si, offset);
        /* reuse swap entry of cache-only swap if not busy. */
        if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
                int swap_was_freed;
                /* Both locks are dropped around the reclaim attempt. */
                unlock_cluster(ci);
                spin_unlock(&si->lock);
                swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
                spin_lock(&si->lock);
                /* entry was freed successfully, try to use this again */
                if (swap_was_freed > 0)
                        goto checks;
                goto scan; /* check next one */
        }

        if (si->swap_map[offset]) {
                /* Slot is taken: keep scanning only if nothing was found yet. */
                unlock_cluster(ci);
                if (!n_ret)
                        goto scan;
                else
                        goto done;
        }
        /* Claim the whole range and account it to its cluster. */
        memset(si->swap_map + offset, usage, nr_pages);
        add_cluster_info_page(si, si->cluster_info, offset, nr_pages);
        unlock_cluster(ci);

        swap_range_alloc(si, offset, nr_pages);
        slots[n_ret++] = swp_entry(si->type, offset);

        /* got enough slots or reach max slots? */
        if ((n_ret == nr) || (offset >= si->highest_bit))
                goto done;

        /* search for next available slot */

        /* time to take a break? */
        if (unlikely(--latency_ration < 0)) {
                if (n_ret)
                        goto done;
                spin_unlock(&si->lock);
                cond_resched();
                spin_lock(&si->lock);
                latency_ration = LATENCY_LIMIT;
        }

        /* try to get more slots in cluster */
        if (si->cluster_info) {
                if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order))
                        goto checks;
                if (order > 0)
                        goto done;
        } else if (si->cluster_nr && !si->swap_map[++offset]) {
                /* non-ssd case, still more slots in cluster? */
                --si->cluster_nr;
                goto checks;
        }

        /*
         * Even if there's no free clusters available (fragmented),
         * try to scan a little more quickly with lock held unless we
         * have scanned too many slots already.
         */
        if (!scanned_many) {
                unsigned long scan_limit;

                if (offset < scan_base)
                        scan_limit = scan_base;
                else
                        scan_limit = si->highest_bit;
                for (; offset <= scan_limit && --latency_ration > 0;
                     offset++) {
                        if (!si->swap_map[offset])
                                goto checks;
                }
        }

done:
        /* Only single-slot allocations update the next-scan hint. */
        if (order == 0)
                set_cluster_next(si, offset + 1);
        si->flags -= SWP_SCANNING;
        return n_ret;

scan:
        /*
         * Lockless slot-by-slot scan: first from offset + 1 up to
         * highest_bit, then from lowest_bit up to scan_base.
         * swap_offset_available_and_locked() retakes si->lock before it
         * reports a hit.
         */
        VM_WARN_ON(order > 0);
        spin_unlock(&si->lock);
        while (++offset <= READ_ONCE(si->highest_bit)) {
                if (unlikely(--latency_ration < 0)) {
                        cond_resched();
                        latency_ration = LATENCY_LIMIT;
                        scanned_many = true;
                }
                if (swap_offset_available_and_locked(si, offset))
                        goto checks;
        }
        offset = si->lowest_bit;
        while (offset < scan_base) {
                if (unlikely(--latency_ration < 0)) {
                        cond_resched();
                        latency_ration = LATENCY_LIMIT;
                        scanned_many = true;
                }
                if (swap_offset_available_and_locked(si, offset))
                        goto checks;
                offset++;
        }
        /* Nothing found: retake the lock that the caller expects to hold. */
        spin_lock(&si->lock);

no_page:
        si->flags -= SWP_SCANNING;
        return n_ret;
}

/*
 * Free all SWAPFILE_CLUSTER slots of cluster @idx at once: under the
 * cluster lock, zero the cluster's swap_map range, reset its count and
 * flags and release it through free_cluster(); afterwards do the
 * range-free bookkeeping for the whole cluster.
 */
static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
{
        const unsigned long first = idx * SWAPFILE_CLUSTER;
        struct swap_cluster_info *ci;

        ci = lock_cluster(si, first);
        memset(&si->swap_map[first], 0, SWAPFILE_CLUSTER);
        cluster_set_count_flag(ci, 0, 0);
        free_cluster(si, idx);
        unlock_cluster(ci);

        swap_range_free(si, first, SWAPFILE_CLUSTER);
}

/*
 * Allocate up to @n_goal swap entries and store them in @swp_entries.
 *
 * @n_goal:      number of entries wanted
 * @swp_entries: output array for the allocated entries
 * @entry_order: requested order; each entry covers 1 << order slots, where
 *               order = swap_entry_order(@entry_order)
 *
 * The request is capped to SWAP_BATCH and to what nr_swap_pages allows.  The
 * capped amount is subtracted from nr_swap_pages up front; whatever could not
 * be allocated is added back before returning.
 *
 * Devices are tried in the order of the current NUMA node's swap_avail_heads
 * plist.  A single-slot request moves on to the next device when one yields
 * nothing; a larger request (size > 1) stops after the first device that is
 * actually scanned, whether or not it yielded anything.
 *
 * Returns the number of entries stored in @swp_entries (0 if none).
 */
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
{
        int order = swap_entry_order(entry_order);
        unsigned long size = 1 << order;
        struct swap_info_struct *si, *next;
        long avail_pgs;
        int n_ret = 0;
        int node;

        spin_lock(&swap_avail_lock);

        /* Number of entries of this size that nr_swap_pages can still cover. */
        avail_pgs = atomic_long_read(&nr_swap_pages) / size;
        if (avail_pgs <= 0) {
                spin_unlock(&swap_avail_lock);
                goto noswap;
        }

        n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);

        /* Reserve the slots now; the unused part is returned at check_out. */
        atomic_long_sub(n_goal * size, &nr_swap_pages);

start_over:
        node = numa_node_id();
        plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
                /* requeue si to after same-priority siblings */
                plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
                /* swap_avail_lock is dropped before si->lock is taken. */
                spin_unlock(&swap_avail_lock);
                spin_lock(&si->lock);
                if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
                        /* Device is unusable; nextsi expects swap_avail_lock. */
                        spin_lock(&swap_avail_lock);
                        if (plist_node_empty(&si->avail_lists[node])) {
                                /* Already off the list: nothing to fix up. */
                                spin_unlock(&si->lock);
                                goto nextsi;
                        }
                        WARN(!si->highest_bit,
                             "swap_info %d in list but !highest_bit\n",
                             si->type);
                        WARN(!(si->flags & SWP_WRITEOK),
                             "swap_info %d in list but !SWP_WRITEOK\n",
                             si->type);
                        __del_from_avail_list(si);
                        spin_unlock(&si->lock);
                        goto nextsi;
                }
                n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
                                            n_goal, swp_entries, order);
                spin_unlock(&si->lock);
                /* Multi-slot requests do not try any further device. */
                if (n_ret || size > 1)
                        goto check_out;
                cond_resched();

                spin_lock(&swap_avail_lock);
nextsi:
                /*
                 * if we got here, it's likely that si was almost full before,
                 * and since scan_swap_map_slots() can drop the si->lock,
                 * multiple callers probably all tried to get a page from the
                 * same si and it filled up before we could get one; or, the si
                 * filled up between us dropping swap_avail_lock and taking
                 * si->lock. Since we dropped the swap_avail_lock, the
                 * swap_avail_head list may have been modified; so if next is
                 * still in the swap_avail_head list then try it, otherwise
                 * start over if we have not gotten any slots.
                 */
                if (plist_node_empty(&next->avail_lists[node]))
                        goto start_over;
        }

        spin_unlock(&swap_avail_lock);

check_out:
        /* Give back the part of the reservation that was not used. */
        if (n_ret < n_goal)
                atomic_long_add((long)(n_goal - n_ret) * size,
                                &nr_swap_pages);
noswap:
        return n_ret;
}

/*
 * Look up and sanity-check the swap device for @entry.
 *
 * Returns the device when @entry is non-zero, names an existing device that
 * has SWP_USED set, has an offset below p->max and a non-zero swap_map byte.
 * Returns NULL otherwise; every failure except a zero @entry is logged with
 * pr_err().  The flag and swap_map reads are lockless (data_race()).
 */
static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
        struct swap_info_struct *p;
        unsigned long offset;

        if (!entry.val)
                return NULL;

        p = swp_swap_info(entry);
        if (!p) {
                pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
                return NULL;
        }

        if (data_race(!(p->flags & SWP_USED))) {
                pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val);
                return NULL;
        }

        offset = swp_offset(entry);
        if (offset >= p->max) {
                pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
                return NULL;
        }

        if (data_race(!p->swap_map[offset])) {
                pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
                return NULL;
        }

        return p;
}

/*
 * Look up the device for @entry while carrying over the lock of @q, the
 * device handled previously (may be NULL).
 *
 * If the device is the same as @q, the lock state is left alone.  Otherwise
 * q->lock is released (when @q is non-NULL) and the new device's lock is
 * taken (when the lookup succeeded).  Returns the looked-up device or NULL.
 */
static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
                                        struct swap_info_struct *q)
{
        struct swap_info_struct *p = _swap_info_get(entry);

        if (p == q)
                return p;

        if (q)
                spin_unlock(&q->lock);
        if (p)
                spin_lock(&p->lock);

        return p;
}

/*
 * Drop one reference of kind @usage from the swap_map byte at @offset.
 *
 * @usage == SWAP_HAS_CACHE clears the cache bit (which must be set).  Any
 * other @usage drops one swap-count reference: a SWAP_MAP_SHMEM count goes
 * straight to zero, and a count of exactly COUNT_CONTINUED is resolved
 * through swap_count_continued().
 *
 * Returns the resulting byte.  When that byte is zero, SWAP_HAS_CACHE is
 * stored in swap_map instead of zero while 0 is still returned -- presumably
 * so that the slot stays reserved until the caller frees it; confirm against
 * the callers.
 */
static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
                                              unsigned long offset,
                                              unsigned char usage)
{
        unsigned char entry = p->swap_map[offset];
        unsigned char cache_bit = entry & SWAP_HAS_CACHE;
        unsigned char count = entry & ~SWAP_HAS_CACHE;

        if (usage == SWAP_HAS_CACHE) {
                VM_BUG_ON(!cache_bit);
                cache_bit = 0;
        } else if (count == SWAP_MAP_SHMEM) {
                /*
                 * Or we could insist on shmem.c using a special
                 * swap_shmem_free() and free_shmem_swap_and_cache()...
                 */
                count = 0;
        } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
                if (count != COUNT_CONTINUED)
                        count--;
                else if (swap_count_continued(p, offset, count))
                        count = SWAP_MAP_MAX | COUNT_CONTINUED;
                else
                        count = SWAP_MAP_MAX;
        }

        usage = count | cache_bit;
        WRITE_ONCE(p->swap_map[offset], usage ? usage : SWAP_HAS_CACHE);

        return usage;
}

/*
 * When we get a swap entry, if there aren't some other ways to
 * prevent swapoff, such as the folio in swap cache is locked, RCU
 * reader side is locked, etc., the swap entry may become invalid
 * because of swapoff.  Then, we need to enclose all swap related
 * functions with get_swap_device() and put_swap_device(), unless the
 * swap functions call get/put_swap_device() by themselves.
 *
 * RCU reader side lock (including any spinlock) is sufficient to
 * prevent swapoff, because synchronize_rcu() is called in swapoff()
 * before freeing data structures.
 *
 * Check whether swap entry is valid in the swap device.  If so,
 * return pointer to swap_info_struct, and keep the swap entry valid
 * via preventing the swap device from being swapoff, until
 * put_swap_device() is called.  Otherwise return NULL.
 *
 * Notice that swapoff or swapoff+swapon can still happen before the
 * percpu_ref_tryget_live() in get_swap_device() or after the
 * percpu_ref_put() in put_swap_device() if there isn't any other way
 * to prevent swapoff.  The caller must be prepared for that.  For
 * example, the following situation is possible.
 *
 *   CPU1                                CPU2
 *   do_swap_page()
 *     ...                                swapoff+swapon
 *     __read_swap_cache_async()
 *       swapcache_prepare()
 *         __swap_duplicate()
 *           // check swap_map
 *     // verify PTE not changed
 *
 * In __swap_duplicate(), the swap_map needs to be checked before it is
 * changed, partly because the specified swap entry may belong to another
 * swap device that has been swapped off.  And in do_swap_page(), after
 * the page is read from the swap device, the PTE is verified, with the
 * page table locked, to be unchanged, in order to detect whether the
 * swap device has been through swapoff or swapoff+swapon.
 */
struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
        struct swap_info_struct *si;

        if (!entry.val)
                return NULL;

        si = swp_swap_info(entry);
        if (!si) {
                pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
                return NULL;
        }

        /* Fails silently when a live reference cannot be obtained. */
        if (!percpu_ref_tryget_live(&si->users))
                return NULL;

        /*
         * Order the si->users check before any read of the other
         * swap_info_struct fields.
         *
         * Paired with the spin_unlock() after setup_swap_info() in
         * enable_swap_info().
         */
        smp_rmb();

        if (swp_offset(entry) >= si->max) {
                pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
                /* Drop the reference taken above before bailing out. */
                percpu_ref_put(&si->users);
                return NULL;
        }

        return si;
}

/*
 * Drop one swap-count reference on @entry under the cluster (or device)
 * lock.  When __swap_entry_free_locked() reports that nothing is left, the
 * entry is passed to free_swap_slot() after the lock has been released.
 *
 * Returns the value returned by __swap_entry_free_locked().
 */
static unsigned char __swap_entry_free(struct swap_info_struct *p,
                                       swp_entry_t entry)
{
        const unsigned long offset = swp_offset(entry);
        struct swap_cluster_info *ci;
        unsigned char remaining;

        ci = lock_cluster_or_swap_info(p, offset);
        remaining = __swap_entry_free_locked(p, offset, 1);
        unlock_cluster_or_swap_info(p, ci);

        if (remaining == 0)
                free_swap_slot(entry);

        return remaining;
}

/*
 * Free the slot of @entry, whose swap_map byte must be exactly
 * SWAP_HAS_CACHE: zero the byte and drop the slot from its cluster's usage
 * count under the cluster lock, then uncharge the memcg swap accounting and
 * do the range-free bookkeeping for this one slot.
 */
static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
{
        const unsigned long offset = swp_offset(entry);
        struct swap_cluster_info *ci;

        ci = lock_cluster(p, offset);
        VM_BUG_ON(p->swap_map[offset] != SWAP_HAS_CACHE);
        p->swap_map[offset] = 0;
        dec_cluster_info_page(p, p->cluster_info, offset);
        unlock_cluster(ci);

        mem_cgroup_uncharge_swap(entry, 1);
        swap_range_free(p, offset, 1);
}

/*
 * Drop one reference on @entry.  The caller guarantees that the swap device
 * backing @entry is still around and has not been recycled.  Entries that
 * fail the _swap_info_get() checks are ignored.
 */
void swap_free(swp_entry_t entry)
{
        struct swap_info_struct *p = _swap_info_get(entry);

        if (!p)
                return;

        __swap_entry_free(p, entry);
}

/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 *
 * @folio: folio whose order determines how many slots are covered
 *         (size = 1 << swap_entry_order(folio_order(folio)))
 * @entry: swap entry of the folio's first slot
 *
 * If the folio covers a whole cluster and every slot holds nothing but
 * SWAP_HAS_CACHE, the cluster is freed in one go through swap_free_cluster().
 * Otherwise the SWAP_HAS_CACHE reference is dropped slot by slot, and each
 * slot that ends up with no users is handed to free_swap_slot() with the
 * lock released.
 */
void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
        unsigned long offset = swp_offset(entry);
        unsigned long idx = offset / SWAPFILE_CLUSTER;
        struct swap_cluster_info *ci;
        struct swap_info_struct *si;
        unsigned char *map;
        unsigned int i, free_entries = 0;
        unsigned char val;
        int size = 1 << swap_entry_order(folio_order(folio));

        si = _swap_info_get(entry);
        if (!si)
                return;

        ci = lock_cluster_or_swap_info(si, offset);
        if (size == SWAPFILE_CLUSTER) {
                /* Count the slots whose only user is the swap cache. */
                map = si->swap_map + offset;
                for (i = 0; i < SWAPFILE_CLUSTER; i++) {
                        val = map[i];
                        VM_BUG_ON(!(val & SWAP_HAS_CACHE));
                        if (val == SWAP_HAS_CACHE)
                                free_entries++;
                }
                if (free_entries == SWAPFILE_CLUSTER) {
                        /* swap_free_cluster() is called under si->lock. */
                        unlock_cluster_or_swap_info(si, ci);
                        spin_lock(&si->lock);
                        mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
                        swap_free_cluster(si, idx);
                        spin_unlock(&si->lock);
                        return;
                }
        }
        /* entry.val advances together with i so it names the current slot. */
        for (i = 0; i < size; i++, entry.val++) {
                if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
                        /* free_swap_slot() is called with the lock dropped. */
                        unlock_cluster_or_swap_info(si, ci);
                        free_swap_slot(entry);
                        /* Last slot: return without retaking the lock. */
                        if (i == size - 1)
                                return;
                        /*
                         * NOTE(review): the return value is ignored here, so
                         * the final unlock reuses the ci obtained for the same
                         * offset above -- confirm they always match.
                         */
                        lock_cluster_or_swap_info(si, offset);
                }
        }
        unlock_cluster_or_swap_info(si, ci);
}

/*
 * sort() comparator: order swap entries by their swap type only.  Returns a
 * negative, zero or positive value as the type of @ent1 is lower than, equal
 * to or higher than the type of @ent2.
 */
static int swp_entry_cmp(const void *ent1, const void *ent2)
{
        const swp_entry_t *lhs = ent1;
        const swp_entry_t *rhs = ent2;
        int lhs_type = (int)swp_type(*lhs);
        int rhs_type = (int)swp_type(*rhs);

        return lhs_type - rhs_type;
}

/*
 * Free the @n swap entries in @entries.  The array may be reordered in
 * place.  Does nothing when @n is not positive; entries that fail the
 * device lookup are skipped.
 */
void swapcache_free_entries(swp_entry_t *entries, int n)
{
        /* Device whose lock is currently held, NULL when none is held. */
        struct swap_info_struct *locked = NULL;
        int i;

        if (n <= 0)
                return;

        /*
         * Group the entries by swap device so that each device lock is taken
         * only once.  nr_swapfiles is not an exact test for "more than one
         * device in use", but sort() is cheap enough that nothing finer is
         * needed.
         */
        if (nr_swapfiles > 1)
                sort(entries, n, sizeof(*entries), swp_entry_cmp, NULL);

        for (i = 0; i < n; i++) {
                locked = swap_info_get_cont(entries[i], locked);
                if (locked)
                        swap_entry_free(locked, entries[i]);
        }

        if (locked)
                spin_unlock(&locked->lock);
}

/*
 * Return swap_count() of the swap_map byte of @entry.  No lock is taken and
 * the device returned by swp_swap_info() is not checked here.
 */
int __swap_count(swp_entry_t entry)
{
        struct swap_info_struct *si = swp_swap_info(entry);
        pgoff_t slot = swp_offset(entry);

        return swap_count(si->swap_map[slot]);
}

/*
 * Number of swapped-out references to @entry on device @si, read under the
 * cluster (or device) lock.
 * The answer is not exact when the swap count is continued, but it keeps the
 * high COUNT_CONTINUED flag so that the caller can tell.
 */
int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
        const pgoff_t offset = swp_offset(entry);
        struct swap_cluster_info *ci;
        int nr_refs;

        ci = lock_cluster_or_swap_info(si, offset);
        nr_refs = swap_count(si->swap_map[offset]);
        unlock_cluster_or_swap_info(si, ci);

        return nr_refs;
}

/*
 * How many references to @entry are currently swapped out?
 * This considers COUNT_CONTINUED so it returns exact answer.
 */
int swp_swapcount(swp_entry_t entry)
{
        int count, tmp_count, n;
        struct swap_info_struct *p;
        struct swap_cluster_info *ci;
        struct page *page;
        pgoff_t offset;
        unsigned char *map;

        /* An entry that _swap_info_get() rejects is reported as 0 references. */
        p = _swap_info_get(entry);
        if (!p)
                return 0;

        offset = swp_offset(entry);

        ci = lock_cluster_or_swap_info(p, offset);

        /* Without COUNT_CONTINUED the in-map value is already the answer. */
        count = swap_count(p->swap_map[offset]);
        if (!(count & COUNT_CONTINUED))
                goto out;

        /*
         * Strip the flag: what is left is the lowest "digit".  Every byte
         * read from a continuation page below is weighted by n.
         */
        count &= ~COUNT_CONTINUED;
        n = SWAP_MAP_MAX + 1;

        /*
         * Start from the page vmalloc_to_page() returns for this swap_map
         * byte; from here on, offset is the byte index within a page.
         */
        page = vmalloc_to_page(p->swap_map + offset);
        offset &= ~PAGE_MASK;
        VM_BUG_ON(page_private(page) != SWP_CONTINUED);

        /*
         * Follow the ->lru list from that page, reading the byte at the same
         * in-page offset of each page.  The walk stops at the first byte
         * that does not carry COUNT_CONTINUED.
         */
        do {
                page = list_next_entry(page, lru);
                map = kmap_local_page(page);
                tmp_count = map[offset];
                kunmap_local(map);

                count += (tmp_count & ~COUNT_CONTINUED) * n;
                n *= (SWAP_CONT_MAX + 1);
        } while (tmp_count & COUNT_CONTINUED);
out:
        unlock_cluster_or_swap_info(p, ci);
        return count;
}

/*
 * Report whether any swap slot covered by an order-@order range around
 * @entry still has a non-zero swap_count().
 *
 * When lock_cluster_or_swap_info() returns no cluster, or the range is a
 * single page, only the slot of @entry itself is examined.  Otherwise all
 * 1 << @order slots starting at the range-aligned offset are scanned.
 */
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
                                         swp_entry_t entry, int order)
{
        unsigned char *map = si->swap_map;
        unsigned int nr_pages = 1 << order;
        unsigned long roffset = swp_offset(entry);
        unsigned long offset = round_down(roffset, nr_pages);
        struct swap_cluster_info *ci;
        bool swapped = false;
        int i;

        ci = lock_cluster_or_swap_info(si, offset);
        if (ci && nr_pages != 1) {
                /* Stop at the first slot that is still referenced. */
                for (i = 0; i < nr_pages && !swapped; i++)
                        swapped = swap_count(map[offset + i]) != 0;
        } else {
                swapped = swap_count(map[roffset]) != 0;
        }
        unlock_cluster_or_swap_info(si, ci);

        return swapped;
}

/*
 * Does @folio's swap entry (folio->swap) still have swap references?
 * Returns false when _swap_info_get() does not yield a swap device.
 */
static bool folio_swapped(struct folio *folio)
{
        struct swap_info_struct *si;
        swp_entry_t entry;
        bool single_slot;

        entry = folio->swap;
        si = _swap_info_get(entry);
        if (!si)
                return false;

        /* Without THP swap, or for a small folio, one slot is all there is. */
        single_slot = !IS_ENABLED(CONFIG_THP_SWAP) ||
                      likely(!folio_test_large(folio));
        if (single_slot)
                return swap_swapcount(si, entry) != 0;

        return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
}

/**
 * folio_free_swap() - Free the swap space used for this folio.
 * @folio: The folio to remove.
 *
 * If swap is getting full, or if there are no more mappings of this folio,
 * then call folio_free_swap to free its swap space.
 *
 * Return: true if we were able to release the swap space.
 */
bool folio_free_swap(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        /*
         * Nothing can be released unless the folio is in the swap cache,
         * is not under writeback, and has no swap references left.  The
         * checks are made in that order.
         */
        if (!folio_test_swapcache(folio) ||
            folio_test_writeback(folio) ||
            folio_swapped(folio))
                return false;

        /*
         * Once hibernation has begun to create its image of memory, a call
         * to folio_free_swap() could free the swap from a folio that has
         * already been recorded in the image as a clean swapcache folio,
         * and that swap could then be reused for another page of the
         * image.  The most probable source is __try_to_reclaim_swap()
         * while hibernation is allocating its own swap pages, but a call
         * from memory reclaim is conceivable too.  On waking from
         * hibernation, the original folio might be freed under memory
         * pressure and later read back in from swap, now with the wrong
         * data.
         *
         * Hibernation suspends storage while it is writing the image to
         * disk, so that is what is checked here.
         */
        if (pm_suspended_storage())
                return false;

        delete_from_swap_cache(folio);
        folio_set_dirty(folio);
        return true;
}

/**
 * free_swap_and_cache_nr() - Release reference on range of swap entries and
 *                            reclaim their cache if no more references remain.
 * @entry: First entry of range.
 * @nr: Number of entries in range.
 *
 * For each swap entry in the contiguous range, release a reference. If any swap
 * entries become free, try to reclaim their underlying folios, if present. The
 * offset range is defined by [entry.offset, entry.offset + nr).
 */
void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
        const unsigned long start_offset = swp_offset(entry);
        const unsigned long end_offset = start_offset + nr;
        unsigned int type = swp_type(entry);
        struct swap_info_struct *si;
        bool any_only_cache = false;
        unsigned long offset;
        unsigned char count;

        /* Non-swap entries are ignored. */
        if (non_swap_entry(entry))
                return;

        /* Paired with put_swap_device() at the "out" label. */
        si = get_swap_device(entry);
        if (!si)
                return;

        /* Refuse (with a warning) a range that runs past the end of the map. */
        if (WARN_ON(end_offset > si->max))
                goto out;

        /*
         * First free all entries in the range.
         * A slot whose map byte already reads as zero is not freed again;
         * it triggers a one-time warning instead.
         */
        for (offset = start_offset; offset < end_offset; offset++) {
                if (data_race(si->swap_map[offset])) {
                        count = __swap_entry_free(si, swp_entry(type, offset));
                        if (count == SWAP_HAS_CACHE)
                                any_only_cache = true;
                } else {
                        WARN_ON_ONCE(1);
                }
        }

        /*
         * Short-circuit the below loop if none of the entries had their
         * reference drop to zero.
         */
        if (!any_only_cache)
                goto out;

        /*
         * Now go back over the range trying to reclaim the swap cache. This is
         * more efficient for large folios because we will only try to reclaim
         * the swap once per folio in the common case. If we do
         * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
         * latter will get a reference and lock the folio for every individual
         * page but will only succeed once the swap slot for every subpage is
         * zero.
         *
         * Note that the @nr parameter is no longer needed as a length here
         * (end_offset was computed above) and is reused as the loop stride.
         */
        for (offset = start_offset; offset < end_offset; offset += nr) {
                nr = 1;
                if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
                        /*
                         * Folios are always naturally aligned in swap so
                         * advance forward to the next boundary. Zero means no
                         * folio was found for the swap entry, so advance by 1
                         * in this case. Negative value means folio was found
                         * but could not be reclaimed. Here we can still advance
                         * to the next boundary.
                         */
                        nr = __try_to_reclaim_swap(si, offset,
                                              TTRS_UNMAPPED | TTRS_FULL);
                        if (nr == 0)
                                nr = 1;
                        else if (nr < 0)
                                nr = -nr;
                        /* Distance from offset to the next nr-aligned offset. */
                        nr = ALIGN(offset + 1, nr) - offset;
                }
        }

out:
        put_swap_device(si);
}

#ifdef CONFIG_HIBERNATION

/*
 * Try to allocate one swap slot from the swap area with index @type.
 * Returns the allocated entry, or an all-zero entry when @type has no
 * swap_info, the area is not SWP_WRITEOK, or no slot could be found.
 */
swp_entry_t get_swap_page_of_type(int type)
{
        swp_entry_t entry = {0};
        struct swap_info_struct *si = swap_type_to_swap_info(type);

        if (si) {
                /* This is called for allocating swap entry, not cache */
                spin_lock(&si->lock);
                if ((si->flags & SWP_WRITEOK) &&
                    scan_swap_map_slots(si, 1, 1, &entry, 0))
                        atomic_long_dec(&nr_swap_pages);
                spin_unlock(&si->lock);
        }

        return entry;
}

/*
 * Find the swap type that corresponds to given device (if any).
 *
 * @offset - number of the PAGE_SIZE-sized block of the device, starting
 * from 0, in which the swap header is expected to be located.
 *
 * Returns the swap type on success, -1 when @device is 0, and -ENODEV when
 * no writable swap area on @device has its first extent starting at
 * @offset.
 *
 * This is needed for the suspend to disk (aka swsusp).
 */
int swap_type_of(dev_t device, sector_t offset)
{
        int ret = -ENODEV;
        int type;

        if (!device)
                return -1;

        spin_lock(&swap_lock);
        for (type = 0; type < nr_swapfiles; type++) {
                struct swap_info_struct *sis = swap_info[type];

                if (!(sis->flags & SWP_WRITEOK))
                        continue;

                /*
                 * swap_node() treats ->bdev as possibly NULL, so a swap
                 * area without a block device must not be dereferenced
                 * here; it cannot match a device number anyway.
                 */
                if (!sis->bdev || device != sis->bdev->bd_dev)
                        continue;

                if (first_se(sis)->start_block == offset) {
                        ret = type;
                        break;
                }
        }
        spin_unlock(&swap_lock);
        return ret;
}

/*
 * Find the first writable swap area that is backed by a block device.
 *
 * On success the device number is stored in *@device and the swap type is
 * returned.  Returns -ENODEV, leaving *@device untouched, when there is no
 * such swap area.
 */
int find_first_swap(dev_t *device)
{
        int ret = -ENODEV;
        int type;

        spin_lock(&swap_lock);
        for (type = 0; type < nr_swapfiles; type++) {
                struct swap_info_struct *sis = swap_info[type];

                if (!(sis->flags & SWP_WRITEOK))
                        continue;

                /*
                 * swap_node() treats ->bdev as possibly NULL; an area
                 * without a block device has no dev_t to report, so skip
                 * it instead of dereferencing a NULL pointer.
                 */
                if (!sis->bdev)
                        continue;

                *device = sis->bdev->bd_dev;
                ret = type;
                break;
        }
        spin_unlock(&swap_lock);
        return ret;
}

/*
 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
 * corresponding to given index in swap_info (swap type).
 *
 * Returns 0 when @type has no swap_info or the area is not SWP_WRITEOK.
 */
sector_t swapdev_block(int type, pgoff_t offset)
{
        struct swap_info_struct *si = swap_type_to_swap_info(type);
        struct swap_extent *se;

        if (si && (si->flags & SWP_WRITEOK)) {
                /* Block = extent's first block + page distance into it. */
                se = offset_to_swap_extent(si, offset);
                return se->start_block + (offset - se->start_page);
        }

        return 0;
}

/*
 * Return either the total number of swap pages of given type, or the number
 * of free pages of that type (depending on @free).  The result is 0 when
 * @type is out of range or the area is not SWP_WRITEOK.
 *
 * This is needed for software suspend
 */
unsigned int count_swap_pages(int type, int free)
{
        struct swap_info_struct *sis;
        unsigned int nr = 0;

        spin_lock(&swap_lock);
        /* The unsigned compare also rejects a negative @type. */
        if ((unsigned int)type >= nr_swapfiles)
                goto unlock;

        sis = swap_info[type];
        spin_lock(&sis->lock);
        if (sis->flags & SWP_WRITEOK)
                nr = free ? sis->pages - sis->inuse_pages : sis->pages;
        spin_unlock(&sis->lock);
unlock:
        spin_unlock(&swap_lock);
        return nr;
}
#endif /* CONFIG_HIBERNATION */

/*
 * Compare @pte with @swp_pte after passing @pte through
 * pte_swp_clear_flags().
 */
static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
        pte_t stripped = pte_swp_clear_flags(pte);

        return pte_same(stripped, swp_pte);
}

/*
 * No need to decide whether this PTE shares the swap entry with others,
 * just let do_wp_page work it out if a write is requested later - to
 * force COW, vm_page_prot omits write permission from any private vma.
 */
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, swp_entry_t entry, struct folio *folio)
{
        struct page *page;
        struct folio *swapcache;
        spinlock_t *ptl;
        pte_t *pte, new_pte, old_pte;
        bool hwpoisoned = false;
        /*
         * Return value: 1 when a present PTE was installed, 0 when the PTE
         * no longer matched @entry or a poison marker was installed instead,
         * -ENOMEM when ksm_might_need_to_copy() returns NULL.
         */
        int ret = 1;

        /*
         * Remember the caller's folio: ksm_might_need_to_copy() may hand
         * back a different folio, and only that one is unlocked and put at
         * the end of this function.
         */
        swapcache = folio;
        folio = ksm_might_need_to_copy(folio, vma, addr);
        if (unlikely(!folio))
                return -ENOMEM;
        else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
                hwpoisoned = true;
                folio = swapcache;
        }

        page = folio_file_page(folio, swp_offset(entry));
        if (PageHWPoison(page))
                hwpoisoned = true;

        /* Recheck under the PTE lock that the PTE still holds @entry. */
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
                                                swp_entry_to_pte(entry)))) {
                ret = 0;
                goto out;
        }

        old_pte = ptep_get(pte);

        /*
         * Poisoned or not-uptodate data cannot be mapped: replace the swap
         * PTE with a hwpoison / poisoned-swap marker entry instead.
         */
        if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) {
                swp_entry_t swp_entry;

                dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
                if (hwpoisoned) {
                        swp_entry = make_hwpoison_entry(page);
                } else {
                        swp_entry = make_poisoned_swp_entry();
                }
                new_pte = swp_entry_to_pte(swp_entry);
                ret = 0;
                goto setpte;
        }

        /*
         * Some architectures may have to restore extra metadata to the page
         * when reading from swap. This metadata may be indexed by swap entry
         * so this must be called before swap_free().
         */
        arch_swap_restore(folio_swap(entry, folio), folio);

        /* One swap entry becomes one anonymous page in this mm's counters. */
        dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
        folio_get(folio);
        if (folio == swapcache) {
                rmap_t rmap_flags = RMAP_NONE;

                /*
                 * See do_swap_page(): writeback would be problematic.
                 * However, we do a folio_wait_writeback() just before this
                 * call and have the folio locked.
                 */
                VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
                if (pte_swp_exclusive(old_pte))
                        rmap_flags |= RMAP_EXCLUSIVE;

                folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
        } else { /* ksm created a completely new copy */
                folio_add_new_anon_rmap(folio, vma, addr);
                folio_add_lru_vma(folio, vma);
        }
        /* Carry the soft-dirty and uffd-wp bits over from the swap PTE. */
        new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
        if (pte_swp_soft_dirty(old_pte))
                new_pte = pte_mksoft_dirty(new_pte);
        if (pte_swp_uffd_wp(old_pte))
                new_pte = pte_mkuffd_wp(new_pte);
setpte:
        set_pte_at(vma->vm_mm, addr, pte, new_pte);
        swap_free(entry);
out:
        if (pte)
                pte_unmap_unlock(pte, ptl);
        /*
         * Only a folio that differs from the caller's is unlocked and
         * released here; the caller's folio is left for the caller.
         */
        if (folio != swapcache) {
                folio_unlock(folio);
                folio_put(folio);
        }
        return ret;
}

/*
 * Walk the PTEs of [addr, end) under @pmd and try to replace every swap PTE
 * that refers to swap area @type via unuse_pte().
 *
 * Returns 0 when the range was walked (or the PTE page could not be
 * mapped), -ENOMEM when no folio could be obtained for a slot that is still
 * in use, or a negative value propagated from unuse_pte().
 */
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned int type)
{
        pte_t *pte = NULL;
        struct swap_info_struct *si;

        si = swap_info[type];
        do {
                struct folio *folio;
                unsigned long offset;
                unsigned char swp_count;
                swp_entry_t entry;
                int ret;
                pte_t ptent;

                /*
                 * pte is NULL on the first pass and after each unmap below:
                 * (re)map the PTE for addr in that case.  Otherwise the
                 * post-increment has already stepped to the next PTE.
                 */
                if (!pte++) {
                        pte = pte_offset_map(pmd, addr);
                        if (!pte)
                                break;
                }

                ptent = ptep_get_lockless(pte);

                /* "continue" still runs the loop condition, so addr advances. */
                if (!is_swap_pte(ptent))
                        continue;

                entry = pte_to_swp_entry(ptent);
                if (swp_type(entry) != type)
                        continue;

                /*
                 * Drop the PTE mapping before the swap cache lookup and
                 * possible swap-in below; it is remapped on the next pass.
                 */
                offset = swp_offset(entry);
                pte_unmap(pte);
                pte = NULL;

                folio = swap_cache_get_folio(entry, vma, addr);
                if (!folio) {
                        struct page *page;
                        struct vm_fault vmf = {
                                .vma = vma,
                                .address = addr,
                                .real_address = addr,
                                .pmd = pmd,
                        };

                        page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                                &vmf);
                        if (page)
                                folio = page_folio(page);
                }
                if (!folio) {
                        /*
                         * No folio: skip the slot if its map byte now reads
                         * as free or bad, otherwise report -ENOMEM.
                         */
                        swp_count = READ_ONCE(si->swap_map[offset]);
                        if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
                                continue;
                        return -ENOMEM;
                }

                folio_lock(folio);
                folio_wait_writeback(folio);
                ret = unuse_pte(vma, pmd, addr, entry, folio);
                if (ret < 0) {
                        folio_unlock(folio);
                        folio_put(folio);
                        return ret;
                }

                /* Try to release the folio's swap space; result is ignored. */
                folio_free_swap(folio);
                folio_unlock(folio);
                folio_put(folio);
        } while (addr += PAGE_SIZE, addr != end);

        if (pte)
                pte_unmap(pte);
        return 0;
}

/*
 * Walk the PMD entries covering [addr, end) under @pud and call
 * unuse_pte_range() on each one, rescheduling if needed before every step.
 * Returns 0, or the first non-zero value from unuse_pte_range().
 */
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                unsigned int type)
{
        unsigned long next;
        pmd_t *pmd;
        int ret;

        for (pmd = pmd_offset(pud, addr); ; pmd++, addr = next) {
                cond_resched();
                next = pmd_addr_end(addr, end);
                ret = unuse_pte_range(vma, pmd, addr, next, type);
                if (ret)
                        return ret;
                if (next == end)
                        break;
        }

        return 0;
}

/*
 * Walk the PUD entries covering [addr, end) under @p4d.  Entries for which
 * pud_none_or_clear_bad() is true are skipped; the rest are passed to
 * unuse_pmd_range().  Returns 0, or the first non-zero value from below.
 */
static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                unsigned int type)
{
        unsigned long next;
        pud_t *pud;
        int ret;

        for (pud = pud_offset(p4d, addr); ; pud++, addr = next) {
                next = pud_addr_end(addr, end);
                if (!pud_none_or_clear_bad(pud)) {
                        ret = unuse_pmd_range(vma, pud, addr, next, type);
                        if (ret)
                                return ret;
                }
                if (next == end)
                        break;
        }

        return 0;
}

/*
 * Walk the P4D entries covering [addr, end) under @pgd.  Entries for which
 * p4d_none_or_clear_bad() is true are skipped; the rest are passed to
 * unuse_pud_range().  Returns 0, or the first non-zero value from below.
 */
static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                unsigned int type)
{
        unsigned long next;
        p4d_t *p4d;
        int ret;

        for (p4d = p4d_offset(pgd, addr); ; p4d++, addr = next) {
                next = p4d_addr_end(addr, end);
                if (!p4d_none_or_clear_bad(p4d)) {
                        ret = unuse_pud_range(vma, p4d, addr, next, type);
                        if (ret)
                                return ret;
                }
                if (next == end)
                        break;
        }

        return 0;
}

/*
 * Walk the page tables of @vma from vm_start to vm_end, top level first.
 * PGD entries for which pgd_none_or_clear_bad() is true are skipped; the
 * rest are passed to unuse_p4d_range().  Returns 0, or the first non-zero
 * value from below.
 */
static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
{
        unsigned long addr = vma->vm_start;
        unsigned long end = vma->vm_end;
        unsigned long next;
        pgd_t *pgd;
        int ret;

        for (pgd = pgd_offset(vma->vm_mm, addr); ; pgd++, addr = next) {
                next = pgd_addr_end(addr, end);
                if (!pgd_none_or_clear_bad(pgd)) {
                        ret = unuse_p4d_range(vma, pgd, addr, next, type);
                        if (ret)
                                return ret;
                }
                if (next == end)
                        break;
        }

        return 0;
}

/*
 * Run unuse_vma() over every VMA of @mm that has an anon_vma, holding the
 * mmap lock for reading.  Stops at, and returns, the first non-zero result;
 * returns 0 otherwise.
 */
static int unuse_mm(struct mm_struct *mm, unsigned int type)
{
        struct vm_area_struct *vma;
        int ret = 0;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_read_lock(mm);
        for_each_vma(vmi, vma) {
                /* VMAs without an anon_vma are not walked. */
                ret = vma->anon_vma ? unuse_vma(vma, type) : 0;
                if (ret)
                        break;

                cond_resched();
        }
        mmap_read_unlock(mm);

        return ret;
}

/*
 * Scan swap_map from the slot after @prev to the next entry still in use,
 * i.e. the next non-zero map byte whose swap_count() is not SWAP_MAP_BAD.
 * Return 0 if there are no inuse entries after prev till end of the map.
 */
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                                        unsigned int prev)
{
        unsigned int slot = prev + 1;
        unsigned char val;

        /*
         * No need for swap_lock here: we're just looking
         * for whether an entry is in use, not modifying it; false
         * hits are okay, and sys_swapoff() has already prevented new
         * allocations from this area (while holding swap_lock).
         */
        while (slot < si->max) {
                val = READ_ONCE(si->swap_map[slot]);
                if (val && swap_count(val) != SWAP_MAP_BAD)
                        break;
                /* Give the scheduler a chance every LATENCY_LIMIT slots. */
                if (!(slot % LATENCY_LIMIT))
                        cond_resched();
                slot++;
        }

        if (slot == si->max)
                slot = 0;

        return slot;
}

/*
 * Try to bring si->inuse_pages of swap area @type down to zero.
 *
 * Each pass runs shmem_unuse(), then unuse_mm() on every mm on the mmlist,
 * then walks the remaining in-use slots and tries to free the swap of any
 * folio still found in the swap cache.  Passes repeat until no pages are in
 * use.
 *
 * Returns 0 on success, -EINTR when a signal is pending while pages are
 * still in use, or the error returned by shmem_unuse() / unuse_mm().
 */
static int try_to_unuse(unsigned int type)
{
        struct mm_struct *prev_mm;
        struct mm_struct *mm;
        struct list_head *p;
        int retval = 0;
        struct swap_info_struct *si = swap_info[type];
        struct folio *folio;
        swp_entry_t entry;
        unsigned int i;

        if (!READ_ONCE(si->inuse_pages))
                goto success;

retry:
        retval = shmem_unuse(type);
        if (retval)
                return retval;

        /*
         * prev_mm always holds a reference on the mm we last worked on
         * (init_mm to begin with); it is dropped only once the next mm has
         * been pinned, or when the walk ends.
         */
        prev_mm = &init_mm;
        mmget(prev_mm);

        spin_lock(&mmlist_lock);
        p = &init_mm.mmlist;
        while (READ_ONCE(si->inuse_pages) &&
               !signal_pending(current) &&
               (p = p->next) != &init_mm.mmlist) {

                mm = list_entry(p, struct mm_struct, mmlist);
                /* Skip an mm whose user count has already dropped to zero. */
                if (!mmget_not_zero(mm))
                        continue;
                /* mmlist_lock is not held across unuse_mm(). */
                spin_unlock(&mmlist_lock);
                mmput(prev_mm);
                prev_mm = mm;
                retval = unuse_mm(mm, type);
                if (retval) {
                        mmput(prev_mm);
                        return retval;
                }

                /*
                 * Make sure that we aren't completely killing
                 * interactive performance.
                 */
                cond_resched();
                spin_lock(&mmlist_lock);
        }
        spin_unlock(&mmlist_lock);

        mmput(prev_mm);

        /*
         * Second phase: visit every slot find_next_to_unuse() still reports
         * as in use and try to free the swap of its swap cache folio, if
         * there is one.
         */
        i = 0;
        while (READ_ONCE(si->inuse_pages) &&
               !signal_pending(current) &&
               (i = find_next_to_unuse(si, i)) != 0) {

                entry = swp_entry(type, i);
                folio = filemap_get_folio(swap_address_space(entry), i);
                if (IS_ERR(folio))
                        continue;

                /*
                 * It is conceivable that a racing task removed this folio from
                 * swap cache just before we acquired the page lock. The folio
                 * might even be back in swap cache on another swap area. But
                 * that is okay, folio_free_swap() only removes stale folios.
                 */
                folio_lock(folio);
                folio_wait_writeback(folio);
                folio_free_swap(folio);
                folio_unlock(folio);
                folio_put(folio);
        }

        /*
         * Let's check again to see if there are still swap entries in the map.
         * If yes, we need to retry the unuse logic.
         * Under global memory pressure, swap entries can be reinserted back
         * into process space after the mmlist loop above passes over them.
         *
         * Limit the number of retries? No: when mmget_not_zero()
         * above fails, that mm is likely to be freeing swap from
         * exit_mmap(), which proceeds at its own independent pace;
         * and even shmem_writepage() could have been preempted after
         * folio_alloc_swap(), temporarily hiding that swap.  It's easy
         * and robust (though cpu-intensive) just to keep retrying.
         */
        if (READ_ONCE(si->inuse_pages)) {
                if (!signal_pending(current))
                        goto retry;
                return -EINTR;
        }

success:
        /*
         * Make sure that further cleanups after try_to_unuse() returns happen
         * after swap_range_free() reduces si->inuse_pages to 0.
         */
        smp_mb();
        return 0;
}

/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */
static void drain_mmlist(void)
{
        struct list_head *pos, *tmp;
        unsigned int type = 0;

        /* Leave the mmlist alone while any swap area still has pages in use. */
        while (type < nr_swapfiles) {
                if (swap_info[type]->inuse_pages)
                        return;
                type++;
        }

        /* Unlink every mm from init_mm's mmlist. */
        spin_lock(&mmlist_lock);
        list_for_each_safe(pos, tmp, &init_mm.mmlist)
                list_del_init(pos);
        spin_unlock(&mmlist_lock);
}

/*
 * Free all of a swapdev's extent information: empty the extent rbtree and,
 * if the area was marked SWP_ACTIVATED, clear that flag and call the
 * mapping's ->swap_deactivate() when one is provided.
 */
static void destroy_swap_extents(struct swap_info_struct *sis)
{
        struct address_space *mapping;
        struct file *swap_file;
        struct swap_extent *se;
        struct rb_node *rb;

        /* Repeatedly erase and free the current root until the tree is empty. */
        while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
                rb = sis->swap_extent_root.rb_node;
                se = rb_entry(rb, struct swap_extent, rb_node);

                rb_erase(rb, &sis->swap_extent_root);
                kfree(se);
        }

        if (!(sis->flags & SWP_ACTIVATED))
                return;

        swap_file = sis->swap_file;
        mapping = swap_file->f_mapping;

        sis->flags &= ~SWP_ACTIVATED;
        if (mapping->a_ops->swap_deactivate)
                mapping->a_ops->swap_deactivate(swap_file);
}

/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent tree.
 *
 * This function rather assumes that it is called in ascending page order.
 *
 * Returns 0 when the range was merged into the last extent, 1 when a new
 * extent was inserted, and -ENOMEM when the new extent could not be
 * allocated.
 */
int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
                unsigned long nr_pages, sector_t start_block)
{
        struct rb_node **link = &sis->swap_extent_root.rb_node;
        struct rb_node *parent = NULL;
        struct swap_extent *last;
        struct swap_extent *new_se;

        /*
         * place the new node at the right most since the
         * function is called in ascending page order.
         */
        for (; *link; link = &parent->rb_right)
                parent = *link;

        if (parent) {
                last = rb_entry(parent, struct swap_extent, rb_node);
                /* The new range must start where the last extent's pages end. */
                BUG_ON(last->start_page + last->nr_pages != start_page);
                if (last->start_block + last->nr_pages == start_block) {
                        /* Blocks are contiguous as well: merge it */
                        last->nr_pages += nr_pages;
                        return 0;
                }
        }

        /* No merge, insert a new extent. */
        new_se = kmalloc(sizeof(*new_se), GFP_KERNEL);
        if (!new_se)
                return -ENOMEM;

        new_se->start_page = start_page;
        new_se->nr_pages = nr_pages;
        new_se->start_block = start_block;

        rb_link_node(&new_se->rb_node, parent, link);
        rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);

        return 1;
}
EXPORT_SYMBOL_GPL(add_swap_extent);

/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  An rbtree of swap extents is
 * built at swapon time and is then used at swap_writepage/swap_read_folio
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent rbtree operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are handled *identically* after swapon time.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into an rbtree, in PAGE_SIZE chunks.  If some stray
 * blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * For all swap devices we set S_SWAPFILE across the life of the swapon.  This
 * prevents users from writing to the swap device, which will corrupt memory.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents in the rbtree. - akpm.
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
        struct file *swap_file = sis->swap_file;
        struct address_space *mapping = swap_file->f_mapping;
        struct inode *inode = mapping->host;
        int ret;

        /* Block device: one extent covering the whole map, starting at block 0. */
        if (S_ISBLK(inode->i_mode)) {
                ret = add_swap_extent(sis, 0, sis->max, 0);
                *span = sis->pages;
                return ret;
        }

        /* No ->swap_activate(): fall back to the generic activation. */
        if (!mapping->a_ops->swap_activate)
                return generic_swapfile_activate(sis, swap_file, span);

        ret = mapping->a_ops->swap_activate(sis, swap_file, span);
        if (ret < 0)
                return ret;

        sis->flags |= SWP_ACTIVATED;
        if (!(sis->flags & SWP_FS_OPS))
                return ret;
        if (sio_pool_init() == 0)
                return ret;

        /* SWP_FS_OPS needs the sio pool; undo the activation without it. */
        destroy_swap_extents(sis);
        return -ENOMEM;
}

/*
 * Return the node_id of the disk behind swap area @p.  The area's own
 * ->bdev is used when set, otherwise the block device of the superblock
 * holding the swap file.  NUMA_NO_NODE is returned when neither exists.
 */
static int swap_node(struct swap_info_struct *p)
{
        struct block_device *bdev = p->bdev;

        if (!bdev)
                bdev = p->swap_file->f_inode->i_sb->s_bdev;
        if (!bdev)
                return NUMA_NO_NODE;

        return bdev->bd_disk->node_id;
}

/*
 * Fill in the priority fields of @p and attach @swap_map and @cluster_info.
 *
 * A non-negative @prio is used as given; a negative one takes the next
 * auto-assigned priority by pre-decrementing least_priority.
 */
static void setup_swap_info(struct swap_info_struct *p, int prio,
                            unsigned char *swap_map,
                            struct swap_cluster_info *cluster_info)
{
        int nid;

        p->prio = (prio >= 0) ? prio : --least_priority;

        /*
         * the plist prio is negated because plist ordering is
         * low-to-high, while swap ordering is high-to-low
         */
        p->list.prio = -p->prio;

        /*
         * With an auto-assigned (negative) priority, the list of the node
         * reported by swap_node() gets prio 1 instead of the negated value.
         */
        for_each_node(nid) {
                if (p->prio < 0 && swap_node(p) == nid)
                        p->avail_lists[nid].prio = 1;
                else
                        p->avail_lists[nid].prio = -p->prio;
        }

        p->swap_map = swap_map;
        p->cluster_info = cluster_info;
}

/*
 * Make swap area @p available for allocation: mark it SWP_WRITEOK, add its
 * pages to the global counters and put it on the active list (and on the
 * available lists unless highest_bit is zero).  swap_lock must be held.
 */
static void _enable_swap_info(struct swap_info_struct *p)
{
        p->flags |= SWP_WRITEOK;
        /* Account the area's pages in both global totals. */
        atomic_long_add(p->pages, &nr_swap_pages);
        total_swap_pages += p->pages;

        assert_spin_locked(&swap_lock);
        /*
         * both lists are plists, and thus priority ordered.
         * swap_active_head needs to be priority ordered for swapoff(),
         * which on removal of any swap_info_struct with an auto-assigned
         * (i.e. negative) priority increments the auto-assigned priority
         * of any lower-priority swap_info_structs.
         * swap_avail_head needs to be priority ordered for folio_alloc_swap(),
         * which allocates swap pages from the highest available priority
         * swap_info_struct.
         */
        plist_add(&p->list, &swap_active_head);

        /* add to available list iff swap device is not full */
        if (p->highest_bit)
                add_to_avail_list(p);
}

/*
 * Set up and enable swap area @p in two locked steps: first install the
 * priority, swap_map and cluster_info, then, after resurrecting p->users
 * with both locks dropped, make the area available via _enable_swap_info().
 */
static void enable_swap_info(struct swap_info_struct *p, int prio,
                                unsigned char *swap_map,
                                struct swap_cluster_info *cluster_info)
{
        /* Lock order: swap_lock first, then the per-area lock. */
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
        setup_swap_info(p, prio, swap_map, cluster_info);
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
        /*
         * Finished initializing swap device, now it's safe to reference it.
         */
        percpu_ref_resurrect(&p->users);
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
        _enable_swap_info(p);
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
}

/*
 * Put @p back online with the priority, swap_map and cluster_info it
 * already carries.  swapoff uses this when try_to_unuse() fails.
 * Unlike enable_swap_info(), both steps run under a single swap_lock /
 * p->lock hold and p->users is not touched.
 */
static void reinsert_swap_info(struct swap_info_struct *p)
{
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
        setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
        _enable_swap_info(p);
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
}

static bool __has_usable_swap(void)
{
        return !plist_head_empty(&swap_active_head);
}

/*
 * Returns true when at least one swap device is on the active list.
 * The check is made with swap_lock held.
 */
bool has_usable_swap(void)
{
        bool usable;

        spin_lock(&swap_lock);
        usable = __has_usable_swap();
        spin_unlock(&swap_lock);

        return usable;
}

/*
 * swapoff: take the swap area backed by @specialfile out of service.
 *
 * The area is first unlinked from the swap lists and has SWP_WRITEOK
 * cleared, then try_to_unuse() is asked to empty it.  If that fails the
 * area is re-inserted and the error is returned.  On success the per-area
 * resources are released and p->flags is finally reset to 0.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -EINVAL when no
 * writable entry on swap_active_head uses the file's mapping, -ENOMEM when
 * security_vm_enough_memory_mm() returns non-zero, or the error produced by
 * getname(), file_open_name() or try_to_unuse().
 */
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
        struct swap_info_struct *p = NULL;
        unsigned char *swap_map;
        struct swap_cluster_info *cluster_info;
        struct file *swap_file, *victim;
        struct address_space *mapping;
        struct inode *inode;
        struct filename *pathname;
        int err, found = 0;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        BUG_ON(!current->mm);

        pathname = getname(specialfile);
        if (IS_ERR(pathname))
                return PTR_ERR(pathname);

        victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
        err = PTR_ERR(victim);
        if (IS_ERR(victim))
                goto out;

        /* Devices are matched by address_space, not by path name. */
        mapping = victim->f_mapping;
        spin_lock(&swap_lock);
        plist_for_each_entry(p, &swap_active_head, list) {
                if (p->flags & SWP_WRITEOK) {
                        if (p->swap_file->f_mapping == mapping) {
                                found = 1;
                                break;
                        }
                }
        }
        if (!found) {
                err = -EINVAL;
                spin_unlock(&swap_lock);
                goto out_dput;
        }
        /*
         * A zero return is followed at once by vm_unacct_memory() for the
         * same page count; a non-zero return aborts with -ENOMEM.
         * NOTE(review): presumably the unacct undoes a charge made by the
         * check itself - confirm against security_vm_enough_memory_mm().
         */
        if (!security_vm_enough_memory_mm(current->mm, p->pages))
                vm_unacct_memory(p->pages);
        else {
                err = -ENOMEM;
                spin_unlock(&swap_lock);
                goto out_dput;
        }
        spin_lock(&p->lock);
        del_from_avail_list(p);
        if (p->prio < 0) {
                struct swap_info_struct *si = p;
                int nid;

                /*
                 * p had a negative priority: walk the entries that follow
                 * it on swap_active_head and move each one step up
                 * (si->prio grows, the plist keys shrink).  Per-node avail
                 * keys equal to 1 are left alone.
                 */
                plist_for_each_entry_continue(si, &swap_active_head, list) {
                        si->prio++;
                        si->list.prio--;
                        for_each_node(nid) {
                                if (si->avail_lists[nid].prio != 1)
                                        si->avail_lists[nid].prio--;
                        }
                }
                least_priority++;
        }
        /* Reverse of _enable_swap_info(): unlink, un-account, clear WRITEOK. */
        plist_del(&p->list, &swap_active_head);
        atomic_long_sub(p->pages, &nr_swap_pages);
        total_swap_pages -= p->pages;
        p->flags &= ~SWP_WRITEOK;
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);

        disable_swap_slots_cache_lock();

        set_current_oom_origin();
        err = try_to_unuse(p->type);
        clear_current_oom_origin();

        if (err) {
                /* re-insert swap space back into swap_list */
                reinsert_swap_info(p);
                reenable_swap_slots_cache_unlock();
                goto out_dput;
        }

        reenable_swap_slots_cache_unlock();

        /*
         * Wait for swap operations protected by get/put_swap_device()
         * to complete.  Because of synchronize_rcu() here, all swap
         * operations protected by RCU reader side lock (including any
         * spinlock) will be waited too.  This makes it easy to
         * prevent folio_test_swapcache() and the following swap cache
         * operations from racing with swapoff.
         */
        percpu_ref_kill(&p->users);
        synchronize_rcu();
        wait_for_completion(&p->comp);

        flush_work(&p->discard_work);

        destroy_swap_extents(p);
        if (p->flags & SWP_CONTINUED)
                free_swap_count_continuations(p);

        /* Mirrors the atomic_inc(&nr_rotate_swap) done by swapon. */
        if (!p->bdev || !bdev_nonrot(p->bdev))
                atomic_dec(&nr_rotate_swap);

        mutex_lock(&swapon_mutex);
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
        drain_mmlist();

        /* wait for anyone still in scan_swap_map_slots */
        p->highest_bit = 0;                /* cuts scans short */
        while (p->flags >= SWP_SCANNING) {
                /* Drop both locks while sleeping, retake in the same order. */
                spin_unlock(&p->lock);
                spin_unlock(&swap_lock);
                schedule_timeout_uninterruptible(1);
                spin_lock(&swap_lock);
                spin_lock(&p->lock);
        }

        /*
         * Detach the file and the maps from p while locked; the saved
         * pointers are freed/closed further down, after the locks are gone.
         */
        swap_file = p->swap_file;
        p->swap_file = NULL;
        p->max = 0;
        swap_map = p->swap_map;
        p->swap_map = NULL;
        cluster_info = p->cluster_info;
        p->cluster_info = NULL;
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
        arch_swap_invalidate_area(p->type);
        zswap_swapoff(p->type);
        mutex_unlock(&swapon_mutex);
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
        free_percpu(p->cluster_next_cpu);
        p->cluster_next_cpu = NULL;
        vfree(swap_map);
        kvfree(cluster_info);
        /* Destroy swap account information */
        swap_cgroup_swapoff(p->type);
        exit_swap_address_space(p->type);

        inode = mapping->host;

        inode_lock(inode);
        inode->i_flags &= ~S_SWAPFILE;
        inode_unlock(inode);
        /* Closes the file opened by swapon; victim is closed at out_dput. */
        filp_close(swap_file, NULL);

        /*
         * Clear the SWP_USED flag after all resources are freed so that swapon
         * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
         * not hold p->lock after we cleared its SWP_WRITEOK.
         */
        spin_lock(&swap_lock);
        p->flags = 0;
        spin_unlock(&swap_lock);

        err = 0;
        /* Let pollers of the "swaps" proc file know the list changed. */
        atomic_inc(&proc_poll_event);
        wake_up_interruptible(&proc_poll_wait);

out_dput:
        filp_close(victim, NULL);
out:
        putname(pathname);
        return err;
}

#ifdef CONFIG_PROC_FS
/*
 * poll() handler for the "swaps" proc file.  The file is always reported
 * readable; EPOLLERR | EPOLLPRI are added when proc_poll_event has moved
 * since this open file last recorded it, and the recorded value is updated.
 */
static __poll_t swaps_poll(struct file *file, poll_table *wait)
{
        struct seq_file *seq = file->private_data;
        __poll_t mask = EPOLLIN | EPOLLRDNORM;

        poll_wait(file, &proc_poll_wait, wait);

        if (seq->poll_event != atomic_read(&proc_poll_event)) {
                seq->poll_event = atomic_read(&proc_poll_event);
                mask |= EPOLLERR | EPOLLPRI;
        }

        return mask;
}

/* iterator */
/*
 * seq_file ->start: takes swapon_mutex (dropped in swap_stop()) and returns
 * SEQ_START_TOKEN for position 0, otherwise the *pos-th swap_info_struct
 * that has SWP_USED set and a swap_map, or NULL if there are fewer.
 */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
        loff_t remaining = *pos;
        struct swap_info_struct *si;
        int type = 0;

        mutex_lock(&swapon_mutex);

        if (remaining == 0)
                return SEQ_START_TOKEN;

        while ((si = swap_type_to_swap_info(type)) != NULL) {
                type++;
                /* Only in-use entries with a swap_map count as positions. */
                if ((si->flags & SWP_USED) && si->swap_map) {
                        if (--remaining == 0)
                                return si;
                }
        }

        return NULL;
}

/*
 * seq_file ->next: advance *pos and return the first swap_info_struct after
 * @v (or from type 0 when @v is SEQ_START_TOKEN) that has SWP_USED set and
 * a swap_map; NULL when none is left.
 */
static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
        struct swap_info_struct *si;
        int type = 0;

        if (v != SEQ_START_TOKEN) {
                si = v;
                type = si->type + 1;
        }

        ++(*pos);
        while ((si = swap_type_to_swap_info(type)) != NULL) {
                if ((si->flags & SWP_USED) && si->swap_map)
                        return si;
                type++;
        }

        return NULL;
}

/* seq_file ->stop: release swapon_mutex, which swap_start() acquired. */
static void swap_stop(struct seq_file *swap, void *v)
{
        mutex_unlock(&swapon_mutex);
}

/*
 * seq_file ->show: print the column header for SEQ_START_TOKEN, otherwise
 * one line for swap device @v: path, "partition" or "file", size and usage
 * (both via K()) and priority.  Always returns 0.
 */
static int swap_show(struct seq_file *swap, void *v)
{
        struct swap_info_struct *si = v;
        unsigned long size_kb, used_kb;
        const char *kind;
        struct file *file;
        int len, pad;

        if (si == SEQ_START_TOKEN) {
                seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
                return 0;
        }

        size_kb = K(si->pages);
        used_kb = K(READ_ONCE(si->inuse_pages));

        file = si->swap_file;
        len = seq_file_path(swap, file, " \t\n\\");

        /* Pad the path out to 40 columns, with at least one space. */
        pad = len < 40 ? 40 - len : 1;
        kind = S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t";

        /* Short numbers get an extra tab so the columns stay aligned. */
        seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
                        pad, " ",
                        kind,
                        size_kb, size_kb < 10000000 ? "\t" : "",
                        used_kb, used_kb < 10000000 ? "\t" : "",
                        si->prio);
        return 0;
}

/* seq_file iterator callbacks installed by swaps_open(). */
static const struct seq_operations swaps_op = {
        .start =        swap_start,
        .next =                swap_next,
        .stop =                swap_stop,
        .show =                swap_show
};

/*
 * open() handler for the "swaps" proc file: set up the seq_file with
 * swaps_op and snapshot proc_poll_event so swaps_poll() can detect later
 * changes.  Returns 0 or the error from seq_open().
 */
static int swaps_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int err = seq_open(file, &swaps_op);

        if (err)
                return err;

        seq = file->private_data;
        seq->poll_event = atomic_read(&proc_poll_event);

        return 0;
}

/*
 * File operations for the "swaps" proc entry registered in procswaps_init():
 * generic seq_file read/lseek/release plus the local open and poll handlers.
 */
static const struct proc_ops swaps_proc_ops = {
        .proc_flags        = PROC_ENTRY_PERMANENT,
        .proc_open        = swaps_open,
        .proc_read        = seq_read,
        .proc_lseek        = seq_lseek,
        .proc_release        = seq_release,
        .proc_poll        = swaps_poll,
};

/*
 * Initcall that registers the "swaps" proc entry (NULL parent, mode 0)
 * backed by swaps_proc_ops.  The result of proc_create() is not checked
 * and 0 is always returned.
 */
static int __init procswaps_init(void)
{
        proc_create("swaps", 0, NULL, &swaps_proc_ops);
        return 0;
}
__initcall(procswaps_init);
#endif /* CONFIG_PROC_FS */

#ifdef MAX_SWAPFILES_CHECK
/*
 * Late initcall that expands MAX_SWAPFILES_CHECK(); only built when that
 * macro is defined.  Always returns 0.
 */
static int __init max_swapfiles_check(void)
{
        MAX_SWAPFILES_CHECK();
        return 0;
}
late_initcall(max_swapfiles_check);
#endif

/*
 * Obtain a swap_info_struct for a new swapon.
 *
 * A fresh structure (with a dead percpu_ref) is always allocated up front,
 * before swap_lock is taken.  Under the lock the first swap_info[] slot
 * without SWP_USED is looked up: if there is none below nr_swapfiles the
 * fresh structure is published in a new slot, otherwise the existing entry
 * is reused and the fresh allocation is released once the lock is dropped.
 *
 * Returns the structure with flags == SWP_USED, ERR_PTR(-ENOMEM) if an
 * allocation fails, or ERR_PTR(-EPERM) if the slot index would reach
 * MAX_SWAPFILES.
 */
static struct swap_info_struct *alloc_swap_info(void)
{
        struct swap_info_struct *p;
        struct swap_info_struct *defer = NULL;
        unsigned int type;
        int i;

        /* Sized for one avail_lists[] element per node id. */
        p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
        if (!p)
                return ERR_PTR(-ENOMEM);

        if (percpu_ref_init(&p->users, swap_users_ref_free,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
                kvfree(p);
                return ERR_PTR(-ENOMEM);
        }

        spin_lock(&swap_lock);
        /* Find the first slot not marked SWP_USED. */
        for (type = 0; type < nr_swapfiles; type++) {
                if (!(swap_info[type]->flags & SWP_USED))
                        break;
        }
        if (type >= MAX_SWAPFILES) {
                spin_unlock(&swap_lock);
                percpu_ref_exit(&p->users);
                kvfree(p);
                return ERR_PTR(-EPERM);
        }
        if (type >= nr_swapfiles) {
                p->type = type;
                /*
                 * Publish the swap_info_struct after initializing it.
                 * Note that kvzalloc() above zeroes all its fields.
                 */
                smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
                nr_swapfiles++;
        } else {
                /* Reuse the old entry; the new allocation is freed below. */
                defer = p;
                p = swap_info[type];
                /*
                 * Do not memset this entry: a racing procfs swap_next()
                 * would be relying on p->type to remain valid.
                 */
        }
        p->swap_extent_root = RB_ROOT;
        plist_node_init(&p->list, 0);
        for_each_node(i)
                plist_node_init(&p->avail_lists[i], 0);
        p->flags = SWP_USED;
        spin_unlock(&swap_lock);
        /* Release the unused allocation outside swap_lock. */
        if (defer) {
                percpu_ref_exit(&defer->users);
                kvfree(defer);
        }
        spin_lock_init(&p->lock);
        spin_lock_init(&p->cont_lock);
        init_completion(&p->comp);

        return p;
}

/*
 * Record the block device behind @inode in @p.
 *
 * Regular files use the superblock's s_bdev.  Block devices use I_BDEV()
 * and additionally get SWP_BLKDEV, unless the device is zoned, in which
 * case -EINVAL is returned (with p->bdev already set).  Any other inode
 * type leaves @p untouched.  Returns 0 otherwise.
 */
static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
{
        if (S_ISREG(inode->i_mode)) {
                p->bdev = inode->i_sb->s_bdev;
                return 0;
        }

        if (!S_ISBLK(inode->i_mode))
                return 0;

        p->bdev = I_BDEV(inode);
        /*
         * Zoned block devices contain zones that have a sequential
         * write only restriction, which makes them unsuitable for
         * swapping.  Refuse them.
         */
        if (bdev_is_zoned(p->bdev))
                return -EINVAL;

        p->flags |= SWP_BLKDEV;
        return 0;
}


/*
 * Find out how many pages are allowed for a single swap device. There
 * are two limiting factors:
 * 1) the number of bits for the swap offset in the swp_entry_t type, and
 * 2) the number of bits in the swap pte, as defined by the different
 * architectures.
 *
 * In order to find the largest possible bit mask, a swap entry with
 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
 * decoded to a swp_entry_t again, and finally the swap offset is
 * extracted.
 *
 * This will mask all the bits from the initial ~0UL mask that can't
 * be encoded in either the swp_entry_t or the architecture definition
 * of a swap pte.
 */
unsigned long generic_max_swapfile_size(void)
{
        return swp_offset(pte_to_swp_entry(
                        swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
}

/*
 * Weak default that simply returns generic_max_swapfile_size().
 * Can be overridden by an architecture for additional checks.
 */
__weak unsigned long arch_max_swapfile_size(void)
{
        return generic_max_swapfile_size();
}

/*
 * Validate the swap header and work out how many pages of the area to use.
 *
 * Checks the "SWAPSPACE2" magic, byte-swaps the header fields in place when
 * the version only reads as 1 after swab32(), and then insists on version 1.
 * As a side effect p->lowest_bit, p->cluster_next, p->cluster_nr and
 * p->highest_bit are initialised.
 *
 * Returns the page count to use (last_page + 1, capped at
 * swapfile_maximum_size and kept within an unsigned int), or 0 when the
 * header is rejected.
 */
static unsigned long read_swap_header(struct swap_info_struct *p,
                                        union swap_header *swap_header,
                                        struct inode *inode)
{
        int i;
        unsigned long maxpages;
        unsigned long swapfilepages;
        unsigned long last_page;

        if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
                pr_err("Unable to find swap-space signature\n");
                return 0;
        }

        /* swap partition endianness hack... */
        if (swab32(swap_header->info.version) == 1) {
                swab32s(&swap_header->info.version);
                swab32s(&swap_header->info.last_page);
                swab32s(&swap_header->info.nr_badpages);
                /* Bound the count before using it to index badpages[]. */
                if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
                        return 0;
                for (i = 0; i < swap_header->info.nr_badpages; i++)
                        swab32s(&swap_header->info.badpages[i]);
        }
        /* Check the swap header's sub-version */
        if (swap_header->info.version != 1) {
                pr_warn("Unable to handle swap header version %d\n",
                        swap_header->info.version);
                return 0;
        }

        p->lowest_bit  = 1;
        p->cluster_next = 1;
        p->cluster_nr = 0;

        maxpages = swapfile_maximum_size;
        last_page = swap_header->info.last_page;
        if (!last_page) {
                pr_warn("Empty swap-file\n");
                return 0;
        }
        /* Oversized areas are only warned about; maxpages stays at the cap. */
        if (last_page > maxpages) {
                pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
                        K(maxpages), K(last_page));
        }
        if (maxpages > last_page) {
                maxpages = last_page + 1;
                /* p->max is an unsigned int: don't overflow it */
                if ((unsigned int)maxpages == 0)
                        maxpages = UINT_MAX;
        }
        p->highest_bit = maxpages - 1;

        if (!maxpages)
                return 0;
        /* A zero i_size skips the length check. */
        swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
        if (swapfilepages && maxpages > swapfilepages) {
                pr_warn("Swap area shorter than signature indicates\n");
                return 0;
        }
        /* Bad-page lists are not accepted on regular files. */
        if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
                return 0;
        if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
                return 0;

        return maxpages;
}

/*
 * Column count used by setup_swap_map_and_extents() when it interleaves
 * clusters onto the free list:
 * - SWAP_CLUSTER_INFO_COLS: swap_cluster_info entries per L1 cache line
 * - SWAP_CLUSTER_SPACE_COLS: clusters per SWAP_ADDRESS_SPACE_PAGES
 * - SWAP_CLUSTER_COLS: the larger of the two
 */
#define SWAP_CLUSTER_INFO_COLS                                                \
        DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
#define SWAP_CLUSTER_SPACE_COLS                                                \
        DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
#define SWAP_CLUSTER_COLS                                                \
        max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)

/*
 * Fill in @swap_map and @cluster_info for a swap area of @maxpages pages.
 *
 * Marks the header page (page 0) and every in-range page from the header's
 * bad-page table as SWAP_MAP_BAD, records p->max and p->pages, calls
 * setup_swap_extents() and, when @cluster_info is non-NULL, queues every
 * cluster whose count is still zero on p->free_clusters.
 *
 * Returns the extent count from setup_swap_extents() on success, its
 * negative error on failure, or -EINVAL if a bad-page entry is 0 or lies
 * beyond last_page, or if no good pages remain.
 */
static int setup_swap_map_and_extents(struct swap_info_struct *p,
                                        union swap_header *swap_header,
                                        unsigned char *swap_map,
                                        struct swap_cluster_info *cluster_info,
                                        unsigned long maxpages,
                                        sector_t *span)
{
        unsigned int j, k;
        unsigned int nr_good_pages;
        int nr_extents;
        unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
        /* Column of the cluster holding p->cluster_next; the scan starts there. */
        unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
        unsigned long i, idx;

        nr_good_pages = maxpages - 1;        /* omit header page */

        cluster_list_init(&p->free_clusters);
        cluster_list_init(&p->discard_clusters);

        for (i = 0; i < swap_header->info.nr_badpages; i++) {
                unsigned int page_nr = swap_header->info.badpages[i];
                if (page_nr == 0 || page_nr > swap_header->info.last_page)
                        return -EINVAL;
                /* Entries past maxpages are valid but outside the used area. */
                if (page_nr < maxpages) {
                        swap_map[page_nr] = SWAP_MAP_BAD;
                        nr_good_pages--;
                        /*
                         * Haven't marked the cluster free yet, no list
                         * operation involved
                         */
                        inc_cluster_info_page(p, cluster_info, page_nr);
                }
        }

        /* Haven't marked the cluster free yet, no list operation involved */
        /* Count the slots between maxpages and the end of the last cluster. */
        for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
                inc_cluster_info_page(p, cluster_info, i);

        if (nr_good_pages) {
                swap_map[0] = SWAP_MAP_BAD;
                /*
                 * Not mark the cluster free yet, no list
                 * operation involved
                 */
                inc_cluster_info_page(p, cluster_info, 0);
                p->max = maxpages;
                p->pages = nr_good_pages;
                nr_extents = setup_swap_extents(p, span);
                if (nr_extents < 0)
                        return nr_extents;
                /* Re-read p->pages after setup_swap_extents() has run. */
                nr_good_pages = p->pages;
        }
        /* nr_extents is only set above; this return covers the unset case. */
        if (!nr_good_pages) {
                pr_warn("Empty swap-file\n");
                return -EINVAL;
        }

        if (!cluster_info)
                return nr_extents;


        /*
         * Reduce false cache line sharing between cluster_info entries:
         * walk the clusters column by column (starting at col), so that
         * neighbours on the free list are SWAP_CLUSTER_COLS entries apart
         * in cluster_info[].
         */
        for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
                j = (k + col) % SWAP_CLUSTER_COLS;
                for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
                        idx = i * SWAP_CLUSTER_COLS + j;
                        if (idx >= nr_clusters)
                                continue;
                        /* A non-zero count was set by inc_cluster_info_page() above. */
                        if (cluster_count(&cluster_info[idx]))
                                continue;
                        cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
                        cluster_list_add_tail(&p->free_clusters, cluster_info,
                                              idx);
                }
        }
        return nr_extents;
}

/*
 * swapon: start swapping to the file or block device named @specialfile.
 *
 * @swap_flags may only contain bits from SWAP_FLAGS_VALID.  With
 * SWAP_FLAG_PREFER the priority is taken from the SWAP_FLAG_PRIO_MASK bits,
 * otherwise -1 is handed to enable_swap_info().  The SWAP_FLAG_DISCARD*
 * bits choose the discard policy when the device reports a non-zero
 * bdev_max_discard_sectors().
 *
 * Returns 0 on success or a negative errno.  On failure everything set up
 * so far is torn down through the labels at the end of the function; both
 * the success and the failure path finish in the common "out:" section.
 */
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
        struct swap_info_struct *p;
        struct filename *name;
        struct file *swap_file = NULL;
        struct address_space *mapping;
        struct dentry *dentry;
        int prio;
        int error;
        union swap_header *swap_header;
        int nr_extents;
        sector_t span;
        unsigned long maxpages;
        unsigned char *swap_map = NULL;
        struct swap_cluster_info *cluster_info = NULL;
        struct page *page = NULL;
        struct inode *inode = NULL;
        bool inced_nr_rotate_swap = false;

        if (swap_flags & ~SWAP_FLAGS_VALID)
                return -EINVAL;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (!swap_avail_heads)
                return -ENOMEM;

        p = alloc_swap_info();
        if (IS_ERR(p))
                return PTR_ERR(p);

        INIT_WORK(&p->discard_work, swap_discard_work);

        name = getname(specialfile);
        if (IS_ERR(name)) {
                error = PTR_ERR(name);
                /* Cleared so that "out:" does not putname() an error pointer. */
                name = NULL;
                goto bad_swap;
        }
        swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0);
        if (IS_ERR(swap_file)) {
                error = PTR_ERR(swap_file);
                swap_file = NULL;
                goto bad_swap;
        }

        p->swap_file = swap_file;
        mapping = swap_file->f_mapping;
        dentry = swap_file->f_path.dentry;
        inode = mapping->host;

        error = claim_swapfile(p, inode);
        if (unlikely(error))
                goto bad_swap;

        /* Held until "out:" on success, or bad_swap_unlock_inode on error. */
        inode_lock(inode);
        if (d_unlinked(dentry) || cant_mount(dentry)) {
                error = -ENOENT;
                goto bad_swap_unlock_inode;
        }
        if (IS_SWAPFILE(inode)) {
                error = -EBUSY;
                goto bad_swap_unlock_inode;
        }

        /*
         * Read the swap header.
         */
        if (!mapping->a_ops->read_folio) {
                error = -EINVAL;
                goto bad_swap_unlock_inode;
        }
        page = read_mapping_page(mapping, 0, swap_file);
        if (IS_ERR(page)) {
                error = PTR_ERR(page);
                goto bad_swap_unlock_inode;
        }
        /* Unmapped and released again in "out:". */
        swap_header = kmap(page);

        maxpages = read_swap_header(p, swap_header, inode);
        if (unlikely(!maxpages)) {
                error = -EINVAL;
                goto bad_swap_unlock_inode;
        }

        /* OK, set up the swap map and apply the bad block list */
        swap_map = vzalloc(maxpages);
        if (!swap_map) {
                error = -ENOMEM;
                goto bad_swap_unlock_inode;
        }

        if (p->bdev && bdev_stable_writes(p->bdev))
                p->flags |= SWP_STABLE_WRITES;

        if (p->bdev && bdev_synchronous(p->bdev))
                p->flags |= SWP_SYNCHRONOUS_IO;

        /* Non-rotational device: set up per-CPU state and cluster_info. */
        if (p->bdev && bdev_nonrot(p->bdev)) {
                int cpu, i;
                unsigned long ci, nr_cluster;

                p->flags |= SWP_SOLIDSTATE;
                p->cluster_next_cpu = alloc_percpu(unsigned int);
                if (!p->cluster_next_cpu) {
                        error = -ENOMEM;
                        goto bad_swap_unlock_inode;
                }
                /*
                 * select a random position to start with to help wear leveling
                 * SSD
                 */
                for_each_possible_cpu(cpu) {
                        per_cpu(*p->cluster_next_cpu, cpu) =
                                get_random_u32_inclusive(1, p->highest_bit);
                }
                nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);

                cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
                                        GFP_KERNEL);
                if (!cluster_info) {
                        error = -ENOMEM;
                        goto bad_swap_unlock_inode;
                }

                for (ci = 0; ci < nr_cluster; ci++)
                        spin_lock_init(&((cluster_info + ci)->lock));

                p->percpu_cluster = alloc_percpu(struct percpu_cluster);
                if (!p->percpu_cluster) {
                        error = -ENOMEM;
                        goto bad_swap_unlock_inode;
                }
                for_each_possible_cpu(cpu) {
                        struct percpu_cluster *cluster;

                        cluster = per_cpu_ptr(p->percpu_cluster, cpu);
                        for (i = 0; i < SWAP_NR_ORDERS; i++)
                                cluster->next[i] = SWAP_NEXT_INVALID;
                }
        } else {
                /* Remembered so the error path can undo the increment. */
                atomic_inc(&nr_rotate_swap);
                inced_nr_rotate_swap = true;
        }

        error = swap_cgroup_swapon(p->type, maxpages);
        if (error)
                goto bad_swap_unlock_inode;

        nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
                cluster_info, maxpages, &span);
        if (unlikely(nr_extents < 0)) {
                error = nr_extents;
                goto bad_swap_unlock_inode;
        }

        if ((swap_flags & SWAP_FLAG_DISCARD) &&
            p->bdev && bdev_max_discard_sectors(p->bdev)) {
                /*
                 * When discard is enabled for swap with no particular
                 * policy flagged, we set all swap discard flags here in
                 * order to sustain backward compatibility with older
                 * swapon(8) releases.
                 */
                p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
                             SWP_PAGE_DISCARD);

                /*
                 * By flagging sys_swapon, a sysadmin can tell us to
                 * either do single-time area discards only, or to just
                 * perform discards for released swap page-clusters.
                 * Now it's time to adjust the p->flags accordingly.
                 */
                if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
                        p->flags &= ~SWP_PAGE_DISCARD;
                else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
                        p->flags &= ~SWP_AREA_DISCARD;

                /* issue a swapon-time discard if it's still required */
                if (p->flags & SWP_AREA_DISCARD) {
                        /* A failed discard is logged but does not fail swapon. */
                        int err = discard_swap(p);
                        if (unlikely(err))
                                pr_err("swapon: discard_swap(%p): %d\n",
                                        p, err);
                }
        }

        error = init_swap_address_space(p->type, maxpages);
        if (error)
                goto bad_swap_unlock_inode;

        error = zswap_swapon(p->type, maxpages);
        if (error)
                goto free_swap_address_space;

        /*
         * Flush any pending IO and dirty mappings before we start using this
         * swap device.
         */
        inode->i_flags |= S_SWAPFILE;
        error = inode_drain_writes(inode);
        if (error) {
                inode->i_flags &= ~S_SWAPFILE;
                goto free_swap_zswap;
        }

        mutex_lock(&swapon_mutex);
        /* -1 unless the caller supplied a priority via SWAP_FLAG_PREFER. */
        prio = -1;
        if (swap_flags & SWAP_FLAG_PREFER)
                prio =
                  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
        enable_swap_info(p, prio, swap_map, cluster_info);

        pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s\n",
                K(p->pages), name->name, p->prio, nr_extents,
                K((unsigned long long)span),
                (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
                (p->flags & SWP_DISCARDABLE) ? "D" : "",
                (p->flags & SWP_AREA_DISCARD) ? "s" : "",
                (p->flags & SWP_PAGE_DISCARD) ? "c" : "");

        mutex_unlock(&swapon_mutex);
        /* Let pollers of the "swaps" proc file know the list changed. */
        atomic_inc(&proc_poll_event);
        wake_up_interruptible(&proc_poll_wait);

        error = 0;
        goto out;
        /* Error unwinding: each label undoes one more step, then falls through. */
free_swap_zswap:
        zswap_swapoff(p->type);
free_swap_address_space:
        exit_swap_address_space(p->type);
bad_swap_unlock_inode:
        inode_unlock(inode);
bad_swap:
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
        free_percpu(p->cluster_next_cpu);
        p->cluster_next_cpu = NULL;
        /* Not locked at this point; NULL keeps "out:" from unlocking it. */
        inode = NULL;
        destroy_swap_extents(p);
        swap_cgroup_swapoff(p->type);
        spin_lock(&swap_lock);
        p->swap_file = NULL;
        /* Clears SWP_USED, so alloc_swap_info() may hand this slot out again. */
        p->flags = 0;
        spin_unlock(&swap_lock);
        vfree(swap_map);
        kvfree(cluster_info);
        if (inced_nr_rotate_swap)
                atomic_dec(&nr_rotate_swap);
        if (swap_file)
                filp_close(swap_file, NULL);
out:
        if (page && !IS_ERR(page)) {
                kunmap(page);
                put_page(page);
        }
        if (name)
                putname(name);
        if (inode)
                inode_unlock(inode);
        if (!error)
                enable_swap_slots_cache();
        return error;
}

/*
 * Fill in val->freeswap and val->totalswap.
 *
 * Devices that are SWP_USED but no longer SWP_WRITEOK have already been
 * subtracted from the global counters; their inuse_pages are added back to
 * both values.  Runs under swap_lock.
 */
void si_swapinfo(struct sysinfo *val)
{
        unsigned long pending = 0;
        unsigned int i;

        spin_lock(&swap_lock);
        for (i = 0; i < nr_swapfiles; i++) {
                struct swap_info_struct *si = swap_info[i];

                if (!(si->flags & SWP_USED))
                        continue;
                if (si->flags & SWP_WRITEOK)
                        continue;
                pending += READ_ONCE(si->inuse_pages);
        }
        val->freeswap = atomic_long_read(&nr_swap_pages) + pending;
        val->totalswap = total_swap_pages + pending;
        spin_unlock(&swap_lock);
}

/*
 * Take a reference of kind @usage on the swap entry @entry and store the
 * updated value in its swap_map slot.  The slot is read and written with
 * the cluster or swap_info lock held.
 *
 * @usage is SWAP_HAS_CACHE for a swap-cache reference; any other value is
 * added to the map count.
 *
 * Returns 0 on success, otherwise a negative errno:
 * - ENOENT: the slot is SWAP_MAP_BAD, or the entry is unused (no map count
 *   and, for a count reference, no swap-cache reference either)
 * - EEXIST: swap-cache reference is requested but there is already one
 * - EINVAL: the stored count (ignoring COUNT_CONTINUED) exceeds SWAP_MAP_MAX
 * - ENOMEM: the count is at SWAP_MAP_MAX and swap_count_continued() could
 *   not take the increment, i.e. a continuation is needed
 *
 * NOTE(review): the swap type and offset are not range-checked in this
 * function; presumably callers pass entries already known to be valid.
 */
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
        struct swap_info_struct *p;
        struct swap_cluster_info *ci;
        unsigned long offset;
        unsigned char count;
        unsigned char has_cache;
        int err;

        p = swp_swap_info(entry);

        offset = swp_offset(entry);
        ci = lock_cluster_or_swap_info(p, offset);

        count = p->swap_map[offset];

        /*
         * swapin_readahead() doesn't check if a swap entry is valid, so the
         * swap entry could be SWAP_MAP_BAD. Check here with lock held.
         */
        if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
                err = -ENOENT;
                goto unlock_out;
        }

        /* Split the slot into its cache bit and its count part. */
        has_cache = count & SWAP_HAS_CACHE;
        count &= ~SWAP_HAS_CACHE;
        err = 0;

        if (usage == SWAP_HAS_CACHE) {

                /* set SWAP_HAS_CACHE if there is no cache and entry is used */
                if (!has_cache && count)
                        has_cache = SWAP_HAS_CACHE;
                else if (has_cache)                /* someone else added cache */
                        err = -EEXIST;
                else                                /* no users remaining */
                        err = -ENOENT;

        } else if (count || has_cache) {

                if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
                        count += usage;
                else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
                        err = -EINVAL;
                else if (swap_count_continued(p, offset, count))
                        /* Increment went to the continuation; reset the slot. */
                        count = COUNT_CONTINUED;
                else
                        err = -ENOMEM;
        } else
                err = -ENOENT;                        /* unused swap entry */

        /* Recombine count and cache bit; only written back on success. */
        if (!err)
                WRITE_ONCE(p->swap_map[offset], count | has_cache);

unlock_out:
        unlock_cluster_or_swap_info(p, ci);
        return err;
}

/*
 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
 * (in which case its reference count is never incremented).
 */
void swap_shmem_alloc(swp_entry_t entry)
{
        /* Any error from __swap_duplicate() is deliberately not reported. */
        __swap_duplicate(entry, SWAP_MAP_SHMEM);
}

/*
 * Take one more reference on a swap entry.
 *
 * Whenever __swap_duplicate() reports -ENOMEM, a swap count continuation is
 * needed: try to add one atomically and retry.  If that atomic allocation
 * fails, its error (-ENOMEM) is returned to the caller.
 *
 * Any other __swap_duplicate() failure (-EINVAL or -ENOENT, which might
 * occur if a page table entry has got corrupted) is reported as 0, just as
 * if the duplication had succeeded.
 */
int swap_duplicate(swp_entry_t entry)
{
        int err;

        for (;;) {
                /* Success and non-ENOMEM failures both end the retry loop. */
                if (__swap_duplicate(entry, 1) != -ENOMEM)
                        return 0;

                err = add_swap_count_continuation(entry, GFP_ATOMIC);
                if (err)
                        return err;
        }
}

/*
 * @entry: swap entry for which swap cache is being allocated.
 *
 * Used when allocating swap cache for an already existing swap entry:
 * asks __swap_duplicate() to set SWAP_HAS_CACHE on it.
 * Returns 0 on success, otherwise an error code; -EEXIST means the entry
 * already has a swap cache.
 * Note: unlike swap_duplicate(), errors are passed straight through.
 */
int swapcache_prepare(swp_entry_t entry)
{
        int err = __swap_duplicate(entry, SWAP_HAS_CACHE);

        return err;
}

/*
 * Drop the SWAP_HAS_CACHE usage of @entry on device @si.  The release is
 * done under the cluster (or swap_info) lock; if the value returned by
 * __swap_entry_free_locked() is zero, the slot itself is freed afterwards,
 * outside the lock.
 */
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
{
        unsigned long offset = swp_offset(entry);
        struct swap_cluster_info *ci;
        unsigned char remaining;

        ci = lock_cluster_or_swap_info(si, offset);
        remaining = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
        unlock_cluster_or_swap_info(si, ci);

        if (remaining)
                return;

        free_swap_slot(entry);
}

/*
 * Return whatever swap_type_to_swap_info() yields for the swap type
 * extracted from @entry with swp_type().
 */
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
        return swap_type_to_swap_info(swp_type(entry));
}

/*
 * out-of-line methods to avoid include hell.
 */
struct address_space *swapcache_mapping(struct folio *folio)
{
        return swp_swap_info(folio->swap)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(swapcache_mapping);

/*
 * Return the swap offset of the swap entry that page_swap_entry() reports
 * for @page.
 */
pgoff_t __page_file_index(struct page *page)
{
        return swp_offset(page_swap_entry(page));
}
EXPORT_SYMBOL_GPL(__page_file_index);

/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 *
 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
 * can be called after dropping locks.
 *
 * Returns 0 if a continuation page was linked in, or if none turned out to
 * be needed (the swap device could not be pinned, the count is no longer at
 * SWAP_MAP_MAX, or an existing continuation page can be used instead).
 * Returns -ENOMEM only when a page is needed but could not be allocated.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
        struct swap_info_struct *si;
        struct swap_cluster_info *ci;
        struct page *head;
        struct page *page;
        struct page *list_page;
        pgoff_t offset;
        unsigned char count;
        int ret = 0;

        /*
         * When debugging, it's easier to use __GFP_ZERO here; but it's better
         * for latency not to zero a page while GFP_ATOMIC and holding locks.
         *
         * The allocation happens before any lock is taken; a failure is only
         * reported later, once we know the page is really needed.  If the
         * page ends up unused it is freed at "outer".
         */
        page = alloc_page(gfp_mask | __GFP_HIGHMEM);

        si = get_swap_device(entry);
        if (!si) {
                /*
                 * An acceptable race has occurred since the failing
                 * __swap_duplicate(): the swap device may be swapoff
                 */
                goto outer;
        }
        spin_lock(&si->lock);

        offset = swp_offset(entry);

        ci = lock_cluster(si, offset);

        count = swap_count(si->swap_map[offset]);

        if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
                /*
                 * The higher the swap count, the more likely it is that tasks
                 * will race to add swap count continuation: we need to avoid
                 * over-provisioning.
                 */
                goto out;
        }

        /* A continuation is needed: only now does a failed allocation matter. */
        if (!page) {
                ret = -ENOMEM;
                goto out;
        }

        head = vmalloc_to_page(si->swap_map + offset);
        /* From here on, offset is used as the index within a continuation page. */
        offset &= ~PAGE_MASK;

        spin_lock(&si->cont_lock);
        /*
         * Page allocation does not initialize the page's lru field,
         * but it does always reset its private field.
         */
        if (!page_private(head)) {
                /* First continuation for this swap_map page: set up its list. */
                BUG_ON(count & COUNT_CONTINUED);
                INIT_LIST_HEAD(&head->lru);
                set_page_private(head, SWP_CONTINUED);
                si->flags |= SWP_CONTINUED;
        }

        /*
         * Walk the existing continuation pages; "count" is updated to the
         * byte read from each page in turn.
         */
        list_for_each_entry(list_page, &head->lru, lru) {
                unsigned char *map;

                /*
                 * If the previous map said no continuation, but we've found
                 * a continuation page, free our allocation and use this one.
                 */
                if (!(count & COUNT_CONTINUED))
                        goto out_unlock_cont;

                map = kmap_local_page(list_page) + offset;
                count = *map;
                kunmap_local(map);

                /*
                 * If this continuation count now has some space in it,
                 * free our allocation and use this one.
                 */
                if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
                        goto out_unlock_cont;
        }

        /* No existing page could be used: append the new one to the list. */
        list_add_tail(&page->lru, &head->lru);
        page = NULL;                        /* now it's attached, don't free it */
out_unlock_cont:
        spin_unlock(&si->cont_lock);
out:
        unlock_cluster(ci);
        spin_unlock(&si->lock);
        put_swap_device(si);
outer:
        /* Free the page if it was allocated but not linked in. */
        if (page)
                __free_page(page);
        return ret;
}

/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
 * lock.
 *
 * @count selects the operation: SWAP_MAP_MAX (with or without
 * COUNT_CONTINUED) means increment; anything else is treated as a decrement
 * and must equal COUNT_CONTINUED.
 *
 * Returns false when an increment needs a continuation page that does not
 * exist yet, true when the increment was carried.  For a decrement, returns
 * whether the local count still equals COUNT_CONTINUED afterwards.
 */
static bool swap_count_continued(struct swap_info_struct *si,
                                 pgoff_t offset, unsigned char count)
{
        struct page *head;
        struct page *page;
        unsigned char *map;
        bool ret;

        head = vmalloc_to_page(si->swap_map + offset);
        if (page_private(head) != SWP_CONTINUED) {
                BUG_ON(count & COUNT_CONTINUED);
                return false;                /* need to add count continuation */
        }

        spin_lock(&si->cont_lock);
        /* offset now indexes this entry's byte within each continuation page */
        offset &= ~PAGE_MASK;
        page = list_next_entry(head, lru);
        map = kmap_local_page(page) + offset;

        if (count == SWAP_MAP_MAX)        /* initial increment from swap_map */
                goto init_map;                /* jump over SWAP_CONT_MAX checks */

        if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
                /*
                 * Think of how you add 1 to 999
                 */
                /* Skip over "digits" that are full and already continued. */
                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
                        kunmap_local(map);
                        page = list_next_entry(page, lru);
                        BUG_ON(page == head);
                        map = kmap_local_page(page) + offset;
                }
                if (*map == SWAP_CONT_MAX) {
                        /* Full and last in the chain: carry into the next page. */
                        kunmap_local(map);
                        page = list_next_entry(page, lru);
                        if (page == head) {
                                ret = false;        /* add count continuation */
                                goto out;
                        }
                        map = kmap_local_page(page) + offset;
init_map:                *map = 0;                /* we didn't zero the page */
                }
                *map += 1;
                kunmap_local(map);
                /* Reset every earlier page's byte to COUNT_CONTINUED. */
                while ((page = list_prev_entry(page, lru)) != head) {
                        map = kmap_local_page(page) + offset;
                        *map = COUNT_CONTINUED;
                        kunmap_local(map);
                }
                ret = true;                        /* incremented */

        } else {                                /* decrementing */
                /*
                 * Think of how you subtract 1 from 1000
                 */
                BUG_ON(count != COUNT_CONTINUED);
                /* Skip over bytes equal to COUNT_CONTINUED: nothing to borrow. */
                while (*map == COUNT_CONTINUED) {
                        kunmap_local(map);
                        page = list_next_entry(page, lru);
                        BUG_ON(page == head);
                        map = kmap_local_page(page) + offset;
                }
                BUG_ON(*map == 0);
                *map -= 1;
                if (*map == 0)
                        count = 0;
                kunmap_local(map);
                /*
                 * Refill every earlier page's byte with SWAP_CONT_MAX; only
                 * the first one written can miss the COUNT_CONTINUED flag.
                 */
                while ((page = list_prev_entry(page, lru)) != head) {
                        map = kmap_local_page(page) + offset;
                        *map = SWAP_CONT_MAX | count;
                        count = COUNT_CONTINUED;
                        kunmap_local(map);
                }
                ret = count == COUNT_CONTINUED;
        }
out:
        spin_unlock(&si->cont_lock);
        return ret;
}

/*
 * free_swap_count_continuations - used by swapoff to release every
 * continuation page appended to the swap_map.  Runs after swap_map has been
 * quiesced and before it is vfree'd.
 *
 * Steps through swap_map in PAGE_SIZE strides; for each backing page whose
 * private field is set, unlinks and frees all pages on its lru list.
 */
static void free_swap_count_continuations(struct swap_info_struct *si)
{
        pgoff_t offset = 0;

        while (offset < si->max) {
                struct page *head = vmalloc_to_page(si->swap_map + offset);
                struct page *cont, *tmp;

                offset += PAGE_SIZE;

                /* No continuation was ever set up for this swap_map page. */
                if (!page_private(head))
                        continue;

                list_for_each_entry_safe(cont, tmp, &head->lru, lru) {
                        list_del(&cont->lru);
                        __free_page(cont);
                }
        }
}

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
/*
 * Schedule a block-cgroup throttle on the first swap device with a bdev
 * found on the available-swap list of @folio's node.
 *
 * Nothing is done unless @gfp allows I/O, there is usable swap, the block
 * cgroup is congested and the current task has no throttle_disk set.
 */
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
        int nid = folio_nid(folio);
        struct swap_info_struct *si, *tmp;

        if (!(gfp & __GFP_IO) || !__has_usable_swap() ||
            !blk_cgroup_congested())
                return;

        /*
         * A throttle has already been scheduled for this task, so there is
         * no need to take the global swap lock.
         */
        if (current->throttle_disk)
                return;

        spin_lock(&swap_avail_lock);
        plist_for_each_entry_safe(si, tmp, &swap_avail_heads[nid],
                                  avail_lists[nid]) {
                if (!si->bdev)
                        continue;

                blkcg_schedule_throttle(si->bdev->bd_disk, true);
                break;
        }
        spin_unlock(&swap_avail_lock);
}
#endif

/*
 * Init-time setup: allocate nr_node_ids plist heads for swap_avail_heads,
 * initialise the head of every node visited by for_each_node(), record the
 * architecture's maximum swapfile size and, with CONFIG_MIGRATION, flag
 * swap_migration_ad_supported when that size reaches
 * 1UL << SWP_MIG_TOTAL_BITS.
 *
 * Returns 0 on success, -ENOMEM if the plist head array cannot be allocated.
 */
static int __init swapfile_init(void)
{
        int node;

        swap_avail_heads = kmalloc_array(nr_node_ids,
                                         sizeof(struct plist_head),
                                         GFP_KERNEL);
        if (swap_avail_heads == NULL) {
                pr_emerg("Not enough memory for swap heads, swap is disabled\n");
                return -ENOMEM;
        }

        for_each_node(node)
                plist_head_init(swap_avail_heads + node);

        swapfile_maximum_size = arch_max_swapfile_size();

#ifdef CONFIG_MIGRATION
        if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
                swap_migration_ad_supported = true;
#endif        /* CONFIG_MIGRATION */

        return 0;
}
subsys_initcall(swapfile_init);































































    1 



















    1 












    1 




























    1 


    1 



















    1 














































































































































































    1 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
// SPDX-License-Identifier: GPL-2.0
/*
 *  file.c - part of debugfs, a tiny little debug file system
 *
 *  Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
 *  Copyright (C) 2004 IBM Inc.
 *
 *  debugfs is for people to use instead of /proc or /sys.
 *  See Documentation/filesystems/ for more details.
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/atomic.h>
#include <linux/device.h>
#include <linux/pm_runtime.h>
#include <linux/poll.h>
#include <linux/security.h>

#include "internal.h"

struct poll_table_struct;

/*
 * No-op read handler: ignores all of its arguments and always returns 0.
 */
static ssize_t default_read_file(struct file *file, char __user *buf,
                                 size_t count, loff_t *ppos)
{
        return 0;
}

/*
 * No-op write handler: touches neither @buf nor @ppos and returns @count,
 * i.e. it reports the whole write as done without storing anything.
 */
static ssize_t default_write_file(struct file *file, const char __user *buf,
                                   size_t count, loff_t *ppos)
{
        return count;
}

/*
 * File operations built from the no-op read/write handlers above, together
 * with simple_open() and noop_llseek().
 */
const struct file_operations debugfs_noop_file_operations = {
        .open   = simple_open,
        .llseek = noop_llseek,
        .read   = default_read_file,
        .write  = default_write_file,
};

/* Shorthand for the dentry of an open file's path: filp->f_path.dentry. */
#define F_DENTRY(filp) ((filp)->f_path.dentry)

/*
 * Return the real file_operations recorded in the debugfs_fsdata attached
 * to @filp's dentry.
 *
 * If d_fsdata still carries DEBUGFS_FSDATA_IS_REAL_FOPS_BIT, the caller did
 * not go through a protecting debugfs_file_get() first; warn and return
 * NULL in that case.
 */
const struct file_operations *debugfs_real_fops(const struct file *filp)
{
        struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata;

        if (!((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))
                return fsd->real_fops;

        /* Urgh, called w/o a protecting debugfs_file_get(). */
        WARN_ON(1);
        return NULL;
}
EXPORT_SYMBOL_GPL(debugfs_real_fops);

/**
 * debugfs_file_get - mark the beginning of file data access
 * @dentry: the dentry object whose data is being accessed.
 *
 * Up to a matching call to debugfs_file_put(), any successive call
 * into the file removing functions debugfs_remove() and
 * debugfs_remove_recursive() will block. Since associated private
 * file data may only get freed after a successful return of any of
 * the removal functions, you may safely access it after a successful
 * call to debugfs_file_get() without worrying about lifetime issues.
 *
 * If -%EIO is returned, the file has already been removed and thus,
 * it is not safe to access any of its data. If, on the other hand,
 * it is allowed to access the file data, zero is returned.
 *
 * Other failures: -%EINVAL if @dentry is not a regular file, and
 * -%ENOMEM if the per-file debugfs_fsdata had to be allocated on this
 * call and the allocation failed.
 */
int debugfs_file_get(struct dentry *dentry)
{
        struct debugfs_fsdata *fsd;
        void *d_fsd;

        /*
         * This could only happen if some debugfs user erroneously calls
         * debugfs_file_get() on a dentry that isn't even a file, let
         * them know about it.
         */
        if (WARN_ON(!d_is_reg(dentry)))
                return -EINVAL;

        d_fsd = READ_ONCE(dentry->d_fsdata);
        if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) {
                /* d_fsdata already points at an allocated debugfs_fsdata. */
                fsd = d_fsd;
        } else {
                /*
                 * d_fsdata still holds the real fops pointer tagged with
                 * DEBUGFS_FSDATA_IS_REAL_FOPS_BIT: build a debugfs_fsdata
                 * around it and try to install that in its place.
                 */
                fsd = kmalloc(sizeof(*fsd), GFP_KERNEL);
                if (!fsd)
                        return -ENOMEM;

                fsd->real_fops = (void *)((unsigned long)d_fsd &
                                        ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT);
                refcount_set(&fsd->active_users, 1);
                init_completion(&fsd->active_users_drained);
                INIT_LIST_HEAD(&fsd->cancellations);
                mutex_init(&fsd->cancellations_mtx);

                /*
                 * d_fsdata changed since we read it, so our copy was not
                 * installed: discard it and use the current d_fsdata.
                 */
                if (cmpxchg(&dentry->d_fsdata, d_fsd, fsd) != d_fsd) {
                        mutex_destroy(&fsd->cancellations_mtx);
                        kfree(fsd);
                        fsd = READ_ONCE(dentry->d_fsdata);
                }
        }

        /*
         * In case of a successful cmpxchg() above, this check is
         * strictly necessary and must follow it, see the comment in
         * __debugfs_remove_file().
         * OTOH, if the cmpxchg() hasn't been executed or wasn't
         * successful, this serves the purpose of not starving
         * removers.
         */
        if (d_unlinked(dentry))
                return -EIO;

        /* Fails once the active_users count has already reached zero. */
        if (!refcount_inc_not_zero(&fsd->active_users))
                return -EIO;

        return 0;
}
EXPORT_SYMBOL_GPL(debugfs_file_get);

/**
 * debugfs_file_put - mark the end of file data access
 * @dentry: the dentry object formerly passed to
 *          debugfs_file_get().
 *
 * Drops the active-user reference taken by debugfs_file_get(). When the
 * last reference goes away, the active_users_drained completion is
 * signalled so that a concurrent debugfs_remove() or
 * debugfs_remove_recursive() blocked on it can proceed and return to
 * its caller.
 */
void debugfs_file_put(struct dentry *dentry)
{
        struct debugfs_fsdata *fsd = READ_ONCE(dentry->d_fsdata);

        if (!refcount_dec_and_test(&fsd->active_users))
                return;

        complete(&fsd->active_users_drained);
}
EXPORT_SYMBOL_GPL(debugfs_file_put);

/**
 * debugfs_enter_cancellation - enter a debugfs cancellation
 * @file: the file being accessed
 * @cancellation: the cancellation object, the cancel callback
 *        inside of it must be initialized
 *
 * Removing a debugfs file has to wait until all active operations on
 * it have finished. An operation may itself be waiting for hardware
 * or for some asynchronous process, so it may need to be cancelled to
 * avoid long waits or even deadlocks.
 *
 * A debugfs handler that may need cancelling calls this function.
 * From that point on the cancellation's 'cancel' callback may be
 * invoked; when it is, the handler should call
 * debugfs_leave_cancellation() and leave the debugfs handler function
 * as soon as possible. The 'cancel' callback is only ever called in
 * the context of some kind of debugfs_remove().
 *
 * This function must be paired with debugfs_leave_cancellation().
 */
void debugfs_enter_cancellation(struct file *file,
                                struct debugfs_cancellation *cancellation)
{
        struct dentry *dentry = F_DENTRY(file);
        struct debugfs_fsdata *fsd;
        unsigned long tagged;

        INIT_LIST_HEAD(&cancellation->list);

        if (WARN_ON(!d_is_reg(dentry)))
                return;

        if (WARN_ON(!cancellation->cancel))
                return;

        fsd = READ_ONCE(dentry->d_fsdata);
        tagged = (unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT;
        if (WARN_ON(!fsd || tagged))
                return;

        mutex_lock(&fsd->cancellations_mtx);
        list_add(&cancellation->list, &fsd->cancellations);
        mutex_unlock(&fsd->cancellations_mtx);

        /* A removal is already in progress: wake it up so it can cancel. */
        if (!d_unlinked(dentry))
                return;

        complete(&fsd->active_users_drained);
}
EXPORT_SYMBOL_GPL(debugfs_enter_cancellation);

/**
 * debugfs_leave_cancellation - leave cancellation section
 * @file: the file being accessed
 * @cancellation: the cancellation previously registered with
 *        debugfs_enter_cancellation()
 *
 * Takes @cancellation off the file's cancellation list, if it is still
 * linked. See the documentation of debugfs_enter_cancellation().
 */
void debugfs_leave_cancellation(struct file *file,
                                struct debugfs_cancellation *cancellation)
{
        struct dentry *dentry = F_DENTRY(file);
        struct debugfs_fsdata *fsd;
        unsigned long tagged;

        if (WARN_ON(!d_is_reg(dentry)))
                return;

        fsd = READ_ONCE(dentry->d_fsdata);
        tagged = (unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT;
        if (WARN_ON(!fsd || tagged))
                return;

        mutex_lock(&fsd->cancellations_mtx);
        /* An empty list head means there is nothing left to unlink. */
        if (!list_empty(&cancellation->list))
                list_del(&cancellation->list);
        mutex_unlock(&fsd->cancellations_mtx);
}
EXPORT_SYMBOL_GPL(debugfs_leave_cancellation);

/*
 * Only permit access to world-readable files when the kernel is locked down.
 * We also need to exclude any file that has ways to write or alter it as root
 * can bypass the permissions check.
 */
static int debugfs_locked_down(struct inode *inode,
                               struct file *filp,
                               const struct file_operations *real_fops)
{
        if ((inode->i_mode & 07777 & ~0444) == 0 &&
            !(filp->f_mode & FMODE_WRITE) &&
            !real_fops->unlocked_ioctl &&
            !real_fops->compat_ioctl &&
            !real_fops->mmap)
                return 0;

        if (security_locked_down(LOCKDOWN_DEBUGFS))
                return -EPERM;

        return 0;
}

/*
 * ->open handler that swaps in the file's real file_operations.
 *
 * With a debugfs_file_get() reference held, it applies the lockdown check,
 * takes a reference on the real fops, installs them on @filp and then calls
 * their ->open, if there is one. The reference on the dentry is dropped
 * again before returning.
 *
 * Returns 0 or a negative errno. -EIO from debugfs_file_get() is reported
 * as -ENOENT, and failing to get the real fops is reported as -ENXIO.
 */
static int open_proxy_open(struct inode *inode, struct file *filp)
{
        struct dentry *dentry = F_DENTRY(filp);
        const struct file_operations *real_fops = NULL;
        int r;

        r = debugfs_file_get(dentry);
        if (r == -EIO)
                return -ENOENT;
        if (r)
                return r;

        real_fops = debugfs_real_fops(filp);

        r = debugfs_locked_down(inode, filp, real_fops);
        if (r)
                goto out;

        if (!fops_get(real_fops)) {
                r = -ENXIO;
#ifdef CONFIG_MODULES
                /* The owning module is on its way out: fail quietly. */
                if (real_fops->owner &&
                    real_fops->owner->state == MODULE_STATE_GOING)
                        goto out;
#endif

                /* Huh? Module did not clean up after itself at exit? */
                WARN(1, "debugfs file owner did not clean up at exit: %pd",
                        dentry);
                goto out;
        }
        replace_fops(filp, real_fops);

        if (real_fops->open)
                r = real_fops->open(inode, filp);

out:
        debugfs_file_put(dentry);
        return r;
}

/* File operations whose only method is ->open, handled by open_proxy_open(). */
const struct file_operations debugfs_open_proxy_file_operations = {
        .open = open_proxy_open,
};

/*
 * PROTO() and ARGS() expand to their arguments unchanged; they let a
 * comma-separated parameter or argument list be handed to FULL_PROXY_FUNC()
 * as a single macro argument.
 *
 * FULL_PROXY_FUNC(name, ret_type, filp, proto, args) defines
 * full_proxy_<name>(proto). The generated function takes a
 * debugfs_file_get() reference on the dentry behind @filp, calls the real
 * file_operations' ->name method with @args, drops the reference with
 * debugfs_file_put() and returns the method's result. If
 * debugfs_file_get() fails, its error is returned instead and the real
 * method is not called.
 */
#define PROTO(args...) args
#define ARGS(args...) args

#define FULL_PROXY_FUNC(name, ret_type, filp, proto, args)                \
static ret_type full_proxy_ ## name(proto)                                \
{                                                                        \
        struct dentry *dentry = F_DENTRY(filp);                        \
        const struct file_operations *real_fops;                        \
        ret_type r;                                                        \
                                                                        \
        r = debugfs_file_get(dentry);                                        \
        if (unlikely(r))                                                \
                return r;                                                \
        real_fops = debugfs_real_fops(filp);                                \
        r = real_fops->name(args);                                        \
        debugfs_file_put(dentry);                                        \
        return r;                                                        \
}

/*
 * Instantiate the proxy wrappers full_proxy_llseek(), full_proxy_read(),
 * full_proxy_write() and full_proxy_unlocked_ioctl().
 */
FULL_PROXY_FUNC(llseek, loff_t, filp,
                PROTO(struct file *filp, loff_t offset, int whence),
                ARGS(filp, offset, whence));

FULL_PROXY_FUNC(read, ssize_t, filp,
                PROTO(struct file *filp, char __user *buf, size_t size,
                        loff_t *ppos),
                ARGS(filp, buf, size, ppos));

FULL_PROXY_FUNC(write, ssize_t, filp,
                PROTO(struct file *filp, const char __user *buf, size_t size,
                        loff_t *ppos),
                ARGS(filp, buf, size, ppos));

FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
                PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
                ARGS(filp, cmd, arg));

static __poll_t full_proxy_poll(struct file *filp,
                                struct poll_table_struct *wait)
{
        struct dentry *dentry = F_DENTRY(filp);
        __poll_t r = 0;
        const struct file_operations *real_fops;

        if (debugfs_file_get(dentry))
                return EPOLLHUP;

        real_fops = debugfs_real_fops(filp);
        r = real_fops->poll(filp, wait);
        debugfs_file_put(dentry);
        return r;
}

/*
 * Proxy for ->release: calls the real ->release (if any), restores the
 * inode's i_fop on @filp, frees the per-open proxy fops and drops the
 * reference held on the real fops. Returns the real ->release result, or 0
 * when there is none.
 */
static int full_proxy_release(struct inode *inode, struct file *filp)
{
        const struct dentry *dentry = F_DENTRY(filp);
        const struct file_operations *real_fops = debugfs_real_fops(filp);
        const struct file_operations *proxy_fops = filp->f_op;
        int ret;

        /*
         * This is deliberately not protected against removal races: the
         * original releaser has to run unconditionally so that no
         * resources are leaked. Releasers must not assume that
         * ->i_private is still meaningful here.
         */
        ret = real_fops->release ? real_fops->release(inode, filp) : 0;

        replace_fops(filp, d_inode(dentry)->i_fop);
        kfree(proxy_fops);
        fops_put(real_fops);

        return ret;
}

/*
 * Fill in @proxy_fops: ->release is always proxied, every other method
 * (llseek, read, write, poll, unlocked_ioctl) only when @real_fops
 * implements it. Fields for methods the real fops lack are left untouched.
 */
static void __full_proxy_fops_init(struct file_operations *proxy_fops,
                                const struct file_operations *real_fops)
{
/* Install full_proxy_<op> only if the real fops provide ->op. */
#define PROXY_IF_PROVIDED(op)                                           \
        do {                                                            \
                if (real_fops->op)                                      \
                        proxy_fops->op = full_proxy_ ## op;             \
        } while (0)

        proxy_fops->release = full_proxy_release;

        PROXY_IF_PROVIDED(llseek);
        PROXY_IF_PROVIDED(read);
        PROXY_IF_PROVIDED(write);
        PROXY_IF_PROVIDED(poll);
        PROXY_IF_PROVIDED(unlocked_ioctl);

#undef PROXY_IF_PROVIDED
}

/*
 * ->open handler of debugfs_full_proxy_file_operations.
 *
 * Allocates a per-open proxy file_operations, installs it on @filp and then
 * calls the real ->open (if any). The debugfs_file_get() reference taken at
 * the top is dropped again on every path that got past it.
 *
 * Returns 0 on success or a negative errno: -ENOENT when debugfs_file_get()
 * reports -EIO, -ENXIO when no reference on the real fops could be taken,
 * -ENOMEM on allocation failure, or whatever debugfs_locked_down() or the
 * real ->open returned.
 */
static int full_proxy_open(struct inode *inode, struct file *filp)
{
        struct dentry *dentry = F_DENTRY(filp);
        const struct file_operations *real_fops = NULL;
        struct file_operations *proxy_fops = NULL;
        int r;

        r = debugfs_file_get(dentry);
        if (r)
                return r == -EIO ? -ENOENT : r;

        real_fops = debugfs_real_fops(filp);

        r = debugfs_locked_down(inode, filp, real_fops);
        if (r)
                goto out;

        if (!fops_get(real_fops)) {
#ifdef CONFIG_MODULES
                /* Owner module is on its way out: fail quietly. */
                if (real_fops->owner &&
                    real_fops->owner->state == MODULE_STATE_GOING) {
                        r = -ENXIO;
                        goto out;
                }
#endif

                /* Huh? Module did not cleanup after itself at exit? */
                WARN(1, "debugfs file owner did not clean up at exit: %pd",
                        dentry);
                r = -ENXIO;
                goto out;
        }

        proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL);
        if (!proxy_fops) {
                r = -ENOMEM;
                /* kfree(NULL) below is harmless; real_fops ref is dropped. */
                goto free_proxy;
        }
        __full_proxy_fops_init(proxy_fops, real_fops);
        replace_fops(filp, proxy_fops);

        if (real_fops->open) {
                r = real_fops->open(inode, filp);
                if (r) {
                        /* Real open failed: put the inode's fops back. */
                        replace_fops(filp, d_inode(dentry)->i_fop);
                        goto free_proxy;
                } else if (filp->f_op != proxy_fops) {
                        /* No protection against file removal anymore. */
                        WARN(1, "debugfs file owner replaced proxy fops: %pd",
                                dentry);
                        /* r is 0 here, so the open itself still succeeds. */
                        goto free_proxy;
                }
        }

        /* Success: keep proxy_fops and the real_fops reference. */
        goto out;
free_proxy:
        kfree(proxy_fops);
        fops_put(real_fops);
out:
        debugfs_file_put(dentry);
        return r;
}

/*
 * Only ->open is provided here; full_proxy_open() replaces the file's
 * operations with a freshly allocated proxy table on each open.
 */
const struct file_operations debugfs_full_proxy_file_operations = {
        .open = full_proxy_open,
};

/*
 * simple_attr_read() wrapped in a debugfs_file_get()/debugfs_file_put()
 * pair. Returns the debugfs_file_get() error if that fails, otherwise the
 * simple_attr_read() result.
 */
ssize_t debugfs_attr_read(struct file *file, char __user *buf,
                        size_t len, loff_t *ppos)
{
        struct dentry *dentry = F_DENTRY(file);
        ssize_t nread;
        int err;

        err = debugfs_file_get(dentry);
        if (unlikely(err))
                return err;

        nread = simple_attr_read(file, buf, len, ppos);
        debugfs_file_put(dentry);

        return nread;
}
EXPORT_SYMBOL_GPL(debugfs_attr_read);

/*
 * Common body of debugfs_attr_write() and debugfs_attr_write_signed():
 * calls simple_attr_write_signed() when @is_signed is true and
 * simple_attr_write() otherwise, bracketed by debugfs_file_get()/
 * debugfs_file_put(). Returns the debugfs_file_get() error if that fails.
 */
static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf,
                         size_t len, loff_t *ppos, bool is_signed)
{
        struct dentry *dentry = F_DENTRY(file);
        ssize_t written;
        int err;

        err = debugfs_file_get(dentry);
        if (unlikely(err))
                return err;

        written = is_signed ? simple_attr_write_signed(file, buf, len, ppos)
                            : simple_attr_write(file, buf, len, ppos);

        debugfs_file_put(dentry);
        return written;
}

/* Unsigned flavour of the attribute write helper. */
ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
                         size_t len, loff_t *ppos)
{
        const bool is_signed = false;

        return debugfs_attr_write_xsigned(file, buf, len, ppos, is_signed);
}
EXPORT_SYMBOL_GPL(debugfs_attr_write);

/* Signed flavour of the attribute write helper. */
ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf,
                         size_t len, loff_t *ppos)
{
        const bool is_signed = true;

        return debugfs_attr_write_xsigned(file, buf, len, ppos, is_signed);
}
EXPORT_SYMBOL_GPL(debugfs_attr_write_signed);

/*
 * Create a file via debugfs_create_file_unsafe(), choosing the
 * file_operations from @mode:
 *   - no write bits (S_IWUGO) set  -> @fops_ro
 *   - else no read bits (S_IRUGO)  -> @fops_wo
 *   - otherwise                    -> @fops
 *
 * Returns whatever debugfs_create_file_unsafe() returns.
 */
static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode,
                                        struct dentry *parent, void *value,
                                        const struct file_operations *fops,
                                        const struct file_operations *fops_ro,
                                        const struct file_operations *fops_wo)
{
        const struct file_operations *chosen = fops;

        if (!(mode & S_IWUGO))
                chosen = fops_ro;
        else if (!(mode & S_IRUGO))
                chosen = fops_wo;

        return debugfs_create_file_unsafe(name, mode, parent, value, chosen);
}

/* Store @val, converted to u8, in the variable @data points to. */
static int debugfs_u8_set(void *data, u64 val)
{
        u8 *target = data;

        *target = val;
        return 0;
}

/* Load the u8 that @data points to into *@val. */
static int debugfs_u8_get(void *data, u64 *val)
{
        const u8 *source = data;

        *val = *source;
        return 0;
}

/* Read-write, read-only (no setter) and write-only (no getter) variants. */
DEFINE_DEBUGFS_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");

/**
 * debugfs_create_u8 - expose an unsigned 8-bit variable as a debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the variable that reads are served from and writes
 *         are stored to.
 *
 * The file uses the "%llu\n" attribute format.  @mode decides which
 * operations are available: without any write bit the read-only
 * operations are installed, without any read bit the write-only ones.
 */
void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent,
                       u8 *value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_u8, &fops_u8_ro, &fops_u8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u8);

/* Store @val, converted to u16, in the variable @data points to. */
static int debugfs_u16_set(void *data, u64 val)
{
        u16 *target = data;

        *target = val;
        return 0;
}

/* Load the u16 that @data points to into *@val. */
static int debugfs_u16_get(void *data, u64 *val)
{
        const u16 *source = data;

        *val = *source;
        return 0;
}

/* Read-write, read-only (no setter) and write-only (no getter) variants. */
DEFINE_DEBUGFS_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");

/**
 * debugfs_create_u16 - expose an unsigned 16-bit variable as a debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the variable that reads are served from and writes
 *         are stored to.
 *
 * The file uses the "%llu\n" attribute format.  @mode decides which
 * operations are available: without any write bit the read-only
 * operations are installed, without any read bit the write-only ones.
 */
void debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent,
                        u16 *value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_u16, &fops_u16_ro, &fops_u16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u16);

/* Store @val, converted to u32, in the variable @data points to. */
static int debugfs_u32_set(void *data, u64 val)
{
        u32 *target = data;

        *target = val;
        return 0;
}

/* Load the u32 that @data points to into *@val. */
static int debugfs_u32_get(void *data, u64 *val)
{
        const u32 *source = data;

        *val = *source;
        return 0;
}

/* Read-write, read-only (no setter) and write-only (no getter) variants. */
DEFINE_DEBUGFS_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");

/**
 * debugfs_create_u32 - expose an unsigned 32-bit variable as a debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the variable that reads are served from and writes
 *         are stored to.
 *
 * The file uses the "%llu\n" attribute format.  @mode decides which
 * operations are available: without any write bit the read-only
 * operations are installed, without any read bit the write-only ones.
 */
void debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent,
                        u32 *value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_u32, &fops_u32_ro, &fops_u32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u32);

/* Store @val in the u64 variable @data points to. */
static int debugfs_u64_set(void *data, u64 val)
{
        u64 *target = data;

        *target = val;
        return 0;
}

/* Load the u64 that @data points to into *@val. */
static int debugfs_u64_get(void *data, u64 *val)
{
        const u64 *source = data;

        *val = *source;
        return 0;
}

/* Read-write, read-only (no setter) and write-only (no getter) variants. */
DEFINE_DEBUGFS_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");

/**
 * debugfs_create_u64 - expose an unsigned 64-bit variable as a debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the variable that reads are served from and writes
 *         are stored to.
 *
 * The file uses the "%llu\n" attribute format.  @mode decides which
 * operations are available: without any write bit the read-only
 * operations are installed, without any read bit the write-only ones.
 */
void debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent,
                        u64 *value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_u64, &fops_u64_ro, &fops_u64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u64);

/* Store @val, converted to unsigned long, in the variable @data points to. */
static int debugfs_ulong_set(void *data, u64 val)
{
        unsigned long *target = data;

        *target = val;
        return 0;
}

/* Load the unsigned long that @data points to into *@val. */
static int debugfs_ulong_get(void *data, u64 *val)
{
        const unsigned long *source = data;

        *val = *source;
        return 0;
}

/* Read-write, read-only (no setter) and write-only (no getter) variants. */
DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set,
                        "%llu\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");

/**
 * debugfs_create_ulong - expose an unsigned long variable as a debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the variable that reads are served from and writes
 *         are stored to.
 *
 * The file uses the "%llu\n" attribute format.  @mode decides which
 * operations are available: without any write bit the read-only
 * operations are installed, without any read bit the write-only ones.
 */
void debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent,
                          unsigned long *value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_ulong, &fops_ulong_ro,
                                   &fops_ulong_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_ulong);

/*
 * Hexadecimal attribute variants. They reuse the u8/u16/u32/u64 get/set
 * helpers and differ only in the format string: "0x" followed by a
 * zero-padded field of 2, 4, 8 or 16 hex digits.
 */
DEFINE_DEBUGFS_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");

DEFINE_DEBUGFS_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set,
                        "0x%04llx\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n");

DEFINE_DEBUGFS_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set,
                        "0x%08llx\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");

DEFINE_DEBUGFS_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set,
                        "0x%016llx\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");

/*
 * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
 *
 * These functions are exactly the same as the above functions (but use a hex
 * output for the decimal challenged). For details look at the above unsigned
 * decimal functions.
 */

/**
 * debugfs_create_x8 - expose an unsigned 8-bit variable as a hex debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the variable that reads are served from and writes
 *         are stored to.
 *
 * Same as debugfs_create_u8() except that the "0x%02llx\n" format is used.
 */
void debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent,
                       u8 *value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_x8, &fops_x8_ro, &fops_x8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x8);

/**
 * debugfs_create_x16 - expose an unsigned 16-bit variable as a hex debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the variable that reads are served from and writes
 *         are stored to.
 *
 * Same as debugfs_create_u16() except that the "0x%04llx\n" format is used.
 */
void debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent,
                        u16 *value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_x16, &fops_x16_ro, &fops_x16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x16);

/**
 * debugfs_create_x32 - expose an unsigned 32-bit variable as a hex debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the variable that reads are served from and writes
 *         are stored to.
 *
 * Same as debugfs_create_u32() except that the "0x%08llx\n" format is used.
 */
void debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent,
                        u32 *value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_x32, &fops_x32_ro, &fops_x32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x32);

/**
 * debugfs_create_x64 - expose an unsigned 64-bit variable as a hex debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the variable that reads are served from and writes
 *         are stored to.
 *
 * Same as debugfs_create_u64() except that the "0x%016llx\n" format is used.
 */
void debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent,
                        u64 *value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_x64, &fops_x64_ro, &fops_x64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x64);


/* Store @val, converted to size_t, in the variable @data points to. */
static int debugfs_size_t_set(void *data, u64 val)
{
        size_t *target = data;

        *target = val;
        return 0;
}

/* Load the size_t that @data points to into *@val. */
static int debugfs_size_t_get(void *data, u64 *val)
{
        const size_t *source = data;

        *val = *source;
        return 0;
}

/* Read-write, read-only (no setter) and write-only (no getter) variants. */
DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
                        "%llu\n"); /* %llu and %zu are more or less the same */
DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n");
DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");

/**
 * debugfs_create_size_t - expose a size_t variable as a debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the variable that reads are served from and writes
 *         are stored to.
 *
 * The file uses the "%llu\n" attribute format.  @mode decides which
 * operations are available: without any write bit the read-only
 * operations are installed, without any read bit the write-only ones.
 */
void debugfs_create_size_t(const char *name, umode_t mode,
                           struct dentry *parent, size_t *value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_size_t, &fops_size_t_ro,
                                   &fops_size_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);

/* atomic_set() the atomic_t that @data points to from @val. */
static int debugfs_atomic_t_set(void *data, u64 val)
{
        atomic_t *counter = data;

        atomic_set(counter, val);
        return 0;
}

/* atomic_read() the atomic_t that @data points to into *@val. */
static int debugfs_atomic_t_get(void *data, u64 *val)
{
        atomic_t *counter = data;

        *val = atomic_read(counter);
        return 0;
}

/* Signed attribute variants: read-write, read-only and write-only. */
DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get,
                        debugfs_atomic_t_set, "%lld\n");
DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
                        "%lld\n");
DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
                        "%lld\n");

/**
 * debugfs_create_atomic_t - expose an atomic_t variable as a debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the variable that reads are served from and writes
 *         are stored to.
 *
 * The file uses the signed "%lld\n" attribute format.  @mode decides which
 * operations are available: without any write bit the read-only
 * operations are installed, without any read bit the write-only ones.
 */
void debugfs_create_atomic_t(const char *name, umode_t mode,
                             struct dentry *parent, atomic_t *value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_atomic_t, &fops_atomic_t_ro,
                                   &fops_atomic_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);

/*
 * ->read for bool files: reports the bool that file->private_data points
 * to as "Y\n" or "N\n". The value is sampled between debugfs_file_get()
 * and debugfs_file_put(); the copy to user space happens afterwards.
 *
 * Returns the debugfs_file_get() error if that fails, otherwise the
 * simple_read_from_buffer() result.
 */
ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
                               size_t count, loff_t *ppos)
{
        struct dentry *dentry = F_DENTRY(file);
        char out[2];
        bool flag;
        int err;

        err = debugfs_file_get(dentry);
        if (unlikely(err))
                return err;
        flag = *(bool *)file->private_data;
        debugfs_file_put(dentry);

        out[0] = flag ? 'Y' : 'N';
        out[1] = '\n';

        return simple_read_from_buffer(user_buf, count, ppos, out,
                                       sizeof(out));
}
EXPORT_SYMBOL_GPL(debugfs_read_file_bool);

/*
 * ->write for bool files: parses the user buffer with
 * kstrtobool_from_user() and, on success, stores the result in the bool
 * that file->private_data points to.
 *
 * Input that fails to parse is ignored: the variable is left unchanged and
 * @count is still returned. The only error reported is a failing
 * debugfs_file_get().
 */
ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
                                size_t count, loff_t *ppos)
{
        struct dentry *dentry = F_DENTRY(file);
        bool *target = file->private_data;
        bool parsed;
        int err;

        if (kstrtobool_from_user(user_buf, count, &parsed))
                return count;

        err = debugfs_file_get(dentry);
        if (unlikely(err))
                return err;

        *target = parsed;
        debugfs_file_put(dentry);

        return count;
}
EXPORT_SYMBOL_GPL(debugfs_write_file_bool);

/* Bool file operations: readable and writable. */
static const struct file_operations fops_bool = {
        .read =                debugfs_read_file_bool,
        .write =        debugfs_write_file_bool,
        .open =                simple_open,
        .llseek =        default_llseek,
};

/* Bool file operations: read-only (no ->write). */
static const struct file_operations fops_bool_ro = {
        .read =                debugfs_read_file_bool,
        .open =                simple_open,
        .llseek =        default_llseek,
};

/* Bool file operations: write-only (no ->read). */
static const struct file_operations fops_bool_wo = {
        .write =        debugfs_write_file_bool,
        .open =                simple_open,
        .llseek =        default_llseek,
};

/**
 * debugfs_create_bool - expose a bool variable as a debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the variable that reads are served from and writes
 *         are stored to.
 *
 * Reads return "Y\n" or "N\n".  @mode decides which operations are
 * available: without any write bit the read-only operations are installed,
 * without any read bit the write-only ones.
 */
void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent,
                         bool *value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_bool, &fops_bool_ro, &fops_bool_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);

/*
 * ->read for string files. file->private_data points to a char pointer;
 * the string it refers to is copied into a temporary buffer while the
 * debugfs_file_get() reference is held, its terminating NUL is replaced by
 * '\n', and the result is handed to simple_read_from_buffer().
 *
 * Returns the debugfs_file_get() error, -ENOMEM if the temporary buffer
 * cannot be allocated, a negative strscpy() result, or the
 * simple_read_from_buffer() result.
 */
ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
                              size_t count, loff_t *ppos)
{
        struct dentry *dentry = F_DENTRY(file);
        char *src, *snapshot;
        int snapshot_len = 0, buf_len;
        ssize_t ret;

        ret = debugfs_file_get(dentry);
        if (unlikely(ret))
                return ret;

        src = *(char **)file->private_data;
        buf_len = strlen(src) + 1;
        snapshot = kmalloc(buf_len, GFP_KERNEL);
        if (snapshot)
                snapshot_len = strscpy(snapshot, src, buf_len);
        debugfs_file_put(dentry);

        if (!snapshot)
                return -ENOMEM;

        if (snapshot_len < 0) {
                kfree(snapshot);
                return snapshot_len;
        }

        /* Overwrite the terminating NUL so the output ends in a newline. */
        snapshot[snapshot_len] = '\n';

        ret = simple_read_from_buffer(user_buf, count, ppos, snapshot,
                                      buf_len);
        kfree(snapshot);

        return ret;
}
/* NOTE(review): this exports debugfs_create_str, not the function above. */
EXPORT_SYMBOL_GPL(debugfs_create_str);

/*
 * ->write for string files. file->private_data points to a char pointer.
 * A new buffer is built from the first *@ppos bytes of the old string
 * followed by the user data, NUL-terminated and passed through strim(),
 * then published in place of the old string, which is freed afterwards.
 *
 * Returns @count on success. Errors: the debugfs_file_get() error, -EINVAL
 * if a non-zero offset does not equal the old string's length, -E2BIG if
 * offset + @count + 1 exceeds PAGE_SIZE, -ENOMEM, or -EFAULT if the user
 * buffer cannot be copied. *@ppos is not modified here.
 */
static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf,
                                      size_t count, loff_t *ppos)
{
        struct dentry *dentry = F_DENTRY(file);
        char *old, *new = NULL;
        int pos = *ppos;
        int r;

        r = debugfs_file_get(dentry);
        if (unlikely(r))
                return r;

        old = *(char **)file->private_data;

        /* only allow strict concatenation */
        r = -EINVAL;
        if (pos && pos != strlen(old))
                goto error;

        /* Cap the resulting string, including its NUL, at one page. */
        r = -E2BIG;
        if (pos + count + 1 > PAGE_SIZE)
                goto error;

        r = -ENOMEM;
        new = kmalloc(pos + count + 1, GFP_KERNEL);
        if (!new)
                goto error;

        /* Appending: carry over the existing contents first. */
        if (pos)
                memcpy(new, old, pos);

        r = -EFAULT;
        if (copy_from_user(new + pos, user_buf, count))
                goto error;

        new[pos + count] = '\0';
        /* The strim() return value is not used. */
        strim(new);

        /* Publish the new string, wait for an RCU grace period, free old. */
        rcu_assign_pointer(*(char __rcu **)file->private_data, new);
        synchronize_rcu();
        kfree(old);

        debugfs_file_put(dentry);
        return count;

error:
        /* new is NULL or a not-yet-published buffer; kfree() handles both. */
        kfree(new);
        debugfs_file_put(dentry);
        return r;
}

/* String file operations: readable and writable. */
static const struct file_operations fops_str = {
        .read =                debugfs_read_file_str,
        .write =        debugfs_write_file_str,
        .open =                simple_open,
        .llseek =        default_llseek,
};

/* String file operations: read-only (no ->write). */
static const struct file_operations fops_str_ro = {
        .read =                debugfs_read_file_str,
        .open =                simple_open,
        .llseek =        default_llseek,
};

/* String file operations: write-only (no ->read). */
static const struct file_operations fops_str_wo = {
        .write =        debugfs_write_file_str,
        .open =                simple_open,
        .llseek =        default_llseek,
};

/**
 * debugfs_create_str - expose a string variable as a debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @value: pointer to the string pointer that reads are served from and
 *         that writes replace.
 *
 * @mode decides which operations are available: without any write bit the
 * read-only operations are installed, without any read bit the write-only
 * ones.
 */
void debugfs_create_str(const char *name, umode_t mode,
                        struct dentry *parent, char **value)
{
        debugfs_create_mode_unsafe(name, mode, parent, value,
                                   &fops_str, &fops_str_ro, &fops_str_wo);
}

/*
 * ->read for blob files: copies from blob->data (blob->size bytes) to user
 * space with simple_read_from_buffer() while a debugfs_file_get()
 * reference is held. Returns the debugfs_file_get() error if that fails.
 */
static ssize_t read_file_blob(struct file *file, char __user *user_buf,
                              size_t count, loff_t *ppos)
{
        struct debugfs_blob_wrapper *blob = file->private_data;
        struct dentry *dentry = F_DENTRY(file);
        ssize_t nread;
        int err;

        err = debugfs_file_get(dentry);
        if (unlikely(err))
                return err;

        nread = simple_read_from_buffer(user_buf, count, ppos,
                                        blob->data, blob->size);
        debugfs_file_put(dentry);

        return nread;
}

/*
 * ->write for blob files: copies user data into blob->data (bounded by
 * blob->size) with simple_write_to_buffer() while a debugfs_file_get()
 * reference is held. Returns the debugfs_file_get() error if that fails.
 */
static ssize_t write_file_blob(struct file *file, const char __user *user_buf,
                               size_t count, loff_t *ppos)
{
        struct debugfs_blob_wrapper *blob = file->private_data;
        struct dentry *dentry = F_DENTRY(file);
        ssize_t nwritten;
        int err;

        err = debugfs_file_get(dentry);
        if (unlikely(err))
                return err;

        nwritten = simple_write_to_buffer(blob->data, blob->size, ppos,
                                          user_buf, count);
        debugfs_file_put(dentry);

        return nwritten;
}

/* Blob file operations; debugfs_create_blob() is the only user visible here. */
static const struct file_operations fops_blob = {
        .read =                read_file_blob,
        .write =        write_file_blob,
        .open =                simple_open,
        .llseek =        default_llseek,
};

/**
 * debugfs_create_blob - expose a binary blob as a debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file; only the bits in 0644 are kept.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @blob: pointer to a struct debugfs_blob_wrapper holding the data pointer
 *        and the size of the data.
 *
 * The file exports @blob->data as a binary blob.  Depending on @mode it
 * can be read from and written to.
 *
 * On success a pointer to the new dentry is returned.  It must be passed
 * to debugfs_remove() when the file is to be removed; nothing is cleaned
 * up automatically when a module is unloaded.  On failure ERR_PTR(-ERROR)
 * is returned.
 *
 * If debugfs is not enabled in the kernel, ERR_PTR(-ENODEV) is returned.
 */
struct dentry *debugfs_create_blob(const char *name, umode_t mode,
                                   struct dentry *parent,
                                   struct debugfs_blob_wrapper *blob)
{
        const umode_t allowed = mode & 0644;

        return debugfs_create_file_unsafe(name, allowed, parent, blob,
                                          &fops_blob);
}
EXPORT_SYMBOL_GPL(debugfs_create_blob);

/*
 * Format @array_size elements of @array into @buf as decimal numbers,
 * separated by single spaces and terminated by a newline after the last
 * element.
 *
 * Returns the sum of the snprintf() return values. Those are used
 * unchecked to advance @buf and shrink @bufsize, so the caller has to
 * provide a buffer large enough for the complete output.
 */
static size_t u32_format_array(char *buf, size_t bufsize,
                               u32 *array, int array_size)
{
        size_t total = 0;
        int left;

        for (left = array_size - 1; left >= 0; left--, array++) {
                const char sep = left ? ' ' : '\n';
                size_t n = snprintf(buf, bufsize, "%u%c", *array, sep);

                total += n;
                buf += n;
                bufsize -= n;
        }

        return total;
}

/*
 * ->open for u32 array files: renders the whole array into a freshly
 * allocated text buffer and stores that buffer in file->private_data for
 * u32_array_read() and u32_array_release().
 *
 * Returns -ENOMEM if the buffer cannot be allocated, otherwise the
 * nonseekable_open() result.
 */
static int u32_array_open(struct inode *inode, struct file *file)
{
        struct debugfs_u32_array *data = inode->i_private;
        int size, elements = data->n_elements;
        char *buf;

        /*
         * Max size:
         *  - 10 digits + ' '/'\n' = 11 bytes per number
         *  - terminating NUL character
         */
        size = elements*11;
        buf = kmalloc(size+1, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;
        buf[size] = 0;

        file->private_data = buf;
        /*
         * Hand over the full allocation, NUL byte included. With only
         * 'size' bytes, snprintf() has no room for the final '\n' when
         * every element needs all 10 digits, and silently drops it.
         */
        u32_format_array(buf, size + 1, data->array, data->n_elements);

        return nonseekable_open(inode, file);
}

/*
 * ->read for u32 array files: serves the NUL-terminated text stored in
 * file->private_data through simple_read_from_buffer().
 */
static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
                              loff_t *ppos)
{
        const char *text = file->private_data;
        size_t text_len = strlen(text);

        return simple_read_from_buffer(buf, len, ppos, text, text_len);
}

/* ->release for u32 array files: frees file->private_data. Returns 0. */
static int u32_array_release(struct inode *inode, struct file *file)
{
        void *text = file->private_data;

        kfree(text);
        return 0;
}

/* Read-only operations for u32 array files; there is no ->write. */
static const struct file_operations u32_array_fops = {
        .owner         = THIS_MODULE,
        .open         = u32_array_open,
        .release = u32_array_release,
        .read         = u32_array_read,
        .llseek  = no_llseek,
};

/**
 * debugfs_create_u32_array - expose a u32 array as a read-only debugfs file
 * @name: name of the file to create.
 * @mode: permission bits of the file.
 * @parent: dentry of the parent directory, or %NULL to create the file in
 *          the root of the debugfs filesystem.
 * @array: wrapper struct holding the data pointer and the number of
 *         elements.
 *
 * The file exports @array as text.  Depending on @mode it can be read
 * from.  Writing and seeking are not supported.  The size of the array
 * cannot be changed once the file has been created.
 */
void debugfs_create_u32_array(const char *name, umode_t mode,
                              struct dentry *parent,
                              struct debugfs_u32_array *array)
{
        debugfs_create_file_unsafe(name, mode, parent, array,
                                   &u32_array_fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_u32_array);

#ifdef CONFIG_HAS_IOMEM

/*
 * The regset32 stuff is used to print 32-bit registers using the
 * seq_file utilities. We offer printing a register set in an already-opened
 * sequential file or create a debugfs file that only prints a regset32.
 */

/**
 * debugfs_print_regs32 - use seq_print to describe a set of registers
 * @s: the seq_file structure being used to generate output
 * @regs: an array of struct debugfs_reg32 structures
 * @nregs: the length of the above array
 * @base: the base address to be used in reading the registers
 * @prefix: a string to be prefixed to every output line
 *
 * This function outputs a text block describing the current values of
 * some 32-bit hardware registers. It is meant to be used within debugfs
 * files based on seq_file that need to show registers, intermixed with other
 * information. The prefix argument may be used to specify a leading string,
 * because some peripherals have several blocks of identical registers,
 * for example configuration of dma channels
 */
void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs,
                          int nregs, void __iomem *base, char *prefix)
{
        int idx = 0;

        while (idx < nregs) {
                const struct debugfs_reg32 *reg = &regs[idx];

                /* One "<prefix><name> = 0x<value>" line per register. */
                if (prefix)
                        seq_printf(s, "%s", prefix);
                seq_printf(s, "%s = 0x%08x\n", reg->name,
                           readl(base + reg->offset));

                /* Stop reading registers once the seq_file has overflowed. */
                if (seq_has_overflowed(s))
                        return;

                idx++;
        }
}
EXPORT_SYMBOL_GPL(debugfs_print_regs32);

/*
 * seq_file show callback: prints every register of the regset stored in
 * s->private, without a line prefix.
 */
static int debugfs_regset32_show(struct seq_file *s, void *data)
{
        struct debugfs_regset32 *rs = s->private;

        /* When a device is attached, hold a runtime PM reference while reading. */
        if (rs->dev)
                pm_runtime_get_sync(rs->dev);

        debugfs_print_regs32(s, rs->regs, rs->nregs, rs->base, "");

        if (rs->dev)
                pm_runtime_put(rs->dev);

        return 0;
}

/*
 * NOTE(review): presumably generates the open handler and the
 * debugfs_regset32_fops table around debugfs_regset32_show(); confirm
 * against the DEFINE_SHOW_ATTRIBUTE() definition.
 */
DEFINE_SHOW_ATTRIBUTE(debugfs_regset32);

/**
 * debugfs_create_regset32 - create a debugfs file that returns register values
 * @name: name of the file to create.
 * @mode: permissions the file should have.
 * @parent: dentry of the parent directory for this file, or %NULL to have
 *          the file created in the root of the debugfs filesystem.
 * @regset: a struct debugfs_regset32 holding a pointer to an array of
 *          register definitions, the size of that array and the base
 *          address at which the register bank is found.
 *
 * The file created here reports the names and values of a set of 32-bit
 * registers. It can be read from if @mode allows it. Writing is not
 * supported.
 */
void debugfs_create_regset32(const char *name, umode_t mode,
                             struct dentry *parent,
                             struct debugfs_regset32 *regset)
{
        debugfs_create_file(name, mode, parent,
                            regset, &debugfs_regset32_fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_regset32);

#endif /* CONFIG_HAS_IOMEM */

/* Per-file context allocated by debugfs_create_devm_seqfile(). */
struct debugfs_devm_entry {
        /* show callback handed to single_open() when the file is opened */
        int (*read)(struct seq_file *seq, void *data);
        /* device the file is bound to; passed to single_open() as its data */
        struct device *dev;
};

/*
 * Open handler: sets up a single-record seq_file using the callback and
 * device stored in the inode's private debugfs_devm_entry.
 */
static int debugfs_devm_entry_open(struct inode *inode, struct file *f)
{
        struct debugfs_devm_entry *ctx = inode->i_private;

        return single_open(f, ctx->read, ctx->dev);
}

/* seq_file based file operations for device-bound debugfs files. */
static const struct file_operations debugfs_devm_entry_ops = {
        .owner   = THIS_MODULE,
        .llseek  = seq_lseek,
        .read    = seq_read,
        .open    = debugfs_devm_entry_open,
        .release = single_release,
};

/**
 * debugfs_create_devm_seqfile - create a debugfs file that is bound to device.
 *
 * @dev: device related to this debugfs file.
 * @name: name of the debugfs file.
 * @parent: a pointer to the parent dentry for this file.  This should be a
 *        directory dentry if set.  If this parameter is %NULL, then the
 *        file will be created in the root of the debugfs filesystem.
 * @read_fn: function pointer called to print the seq_file content.
 */
void debugfs_create_devm_seqfile(struct device *dev, const char *name,
                                 struct dentry *parent,
                                 int (*read_fn)(struct seq_file *s, void *data))
{
        struct debugfs_devm_entry *ctx;

        /* Nothing to do when the parent directory is an error pointer. */
        if (IS_ERR(parent))
                return;

        /* The context is device-managed; no file is created if this fails. */
        ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL);
        if (ctx) {
                ctx->dev = dev;
                ctx->read = read_fn;

                debugfs_create_file(name, S_IRUGO, parent, ctx,
                                    &debugfs_devm_entry_ops);
        }
}
EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile);




















































































































    2 


















































































    6 

    5 










    5 
    6 
    6 

    6 











































































































































































































































































































































































































































































































    2 

    2 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
        struct anon_vma *root;                /* Root of this anon_vma tree */
        /*
         * W: modification, R: walking the list.
         * The anon_vma_lock_*(), anon_vma_trylock_*() and anon_vma_unlock_*()
         * helpers in this header always operate on root->rwsem.
         */
        struct rw_semaphore rwsem;
        /*
         * The refcount is taken on an anon_vma when there is no
         * guarantee that the vma of page tables will exist for
         * the duration of the operation. A caller that takes
         * the reference is responsible for clearing up the
         * anon_vma if they are the last user on release.
         *
         * Taken with get_anon_vma() and dropped with put_anon_vma(), which
         * calls __put_anon_vma() once the count reaches zero.
         */
        atomic_t refcount;

        /*
         * Count of child anon_vmas. Equal to the count of all anon_vmas that
         * have ->parent pointing to this one, including itself.
         *
         * This counter is used to decide whether to reuse an anon_vma
         * instead of forking a new one. See comments in function
         * anon_vma_clone.
         */
        unsigned long num_children;
        /* Count of VMAs whose ->anon_vma pointer points to this object. */
        unsigned long num_active_vmas;

        struct anon_vma *parent;        /* Parent of this anon_vma */

        /*
         * NOTE: the LSB of the rb_root.rb_node is set by
         * mm_take_all_locks() _after_ taking the above lock. So the
         * rb_root must only be read/written after taking the above lock
         * to be sure to see a valid next pointer. The LSB bit itself
         * is serialized by a system wide lock only visible to
         * mm_take_all_locks() (mm_all_locks_mutex).
         */

        /* Interval tree of private "related" vmas */
        struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
        struct vm_area_struct *vma;        /* the VMA side of this link */
        struct anon_vma *anon_vma;        /* the anon_vma side of this link */
        struct list_head same_vma;   /* locked by mmap_lock & page_table_lock */
        struct rb_node rb;                        /* locked by anon_vma->rwsem */
        /* NOTE(review): presumably the interval tree's subtree augmentation; confirm */
        unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
        /* Only present when CONFIG_DEBUG_VM_RB is enabled. */
        unsigned long cached_vma_start, cached_vma_last;
#endif
};

/*
 * Flags accepted by try_to_unmap() and try_to_migrate(). Every value is a
 * distinct bit, so several flags can presumably be OR-ed together.
 */
enum ttu_flags {
        TTU_SPLIT_HUGE_PMD        = 0x4,        /* split huge PMD if any */
        TTU_IGNORE_MLOCK        = 0x8,        /* ignore mlock */
        TTU_SYNC                = 0x10,        /* avoid racy checks with PVMW_SYNC */
        TTU_HWPOISON                = 0x20,        /* do convert pte to hwpoison entry */
        TTU_BATCH_FLUSH                = 0x40,        /* Batch TLB flushes where possible
                                         * and caller guarantees they will
                                         * do a final flush if necessary */
        TTU_RMAP_LOCKED                = 0x80,        /* do not grab rmap lock:
                                         * caller holds it */
};

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
        atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

/*
 * Drop a reference on @anon_vma; the final put hands the object over to
 * __put_anon_vma().
 */
static inline void put_anon_vma(struct anon_vma *anon_vma)
{
        /* Other references remain: nothing more to do. */
        if (!atomic_dec_and_test(&anon_vma->refcount))
                return;

        __put_anon_vma(anon_vma);
}

/* Write-lock the rwsem of the root of @anon_vma's tree. */
static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        down_write(root_sem);
}

/*
 * Try to write-lock the rwsem of the root of @anon_vma's tree; returns
 * whatever down_write_trylock() reports.
 */
static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        return down_write_trylock(root_sem);
}

/* Release the write lock on the rwsem of the root of @anon_vma's tree. */
static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        up_write(root_sem);
}

/* Read-lock the rwsem of the root of @anon_vma's tree. */
static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        down_read(root_sem);
}

/*
 * Try to read-lock the rwsem of the root of @anon_vma's tree; returns
 * whatever down_read_trylock() reports.
 */
static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        return down_read_trylock(root_sem);
}

/* Release the read lock on the rwsem of the root of @anon_vma's tree. */
static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        up_read(root_sem);
}


/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);        /* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

/*
 * Make sure @vma has an anon_vma. Returns 0 right away when one is already
 * attached, otherwise the result of __anon_vma_prepare().
 */
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
        /* Uncommon case: no anon_vma yet, take the out-of-line path. */
        if (unlikely(!vma->anon_vma))
                return __anon_vma_prepare(vma);

        return 0;
}

/*
 * Called with two VMAs that must share the same anon_vma (asserted below):
 * unlinks the anon_vmas of @next and leaves @vma untouched.
 */
static inline void anon_vma_merge(struct vm_area_struct *vma,
                                  struct vm_area_struct *next)
{
        VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
        unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(struct folio *folio);

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE                ((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE                ((__force rmap_t)BIT(0))

/*
 * Internally, we're using an enum to specify the granularity. We make the
 * compiler emit specialized code for each granularity.
 */
enum rmap_level {
        RMAP_LEVEL_PTE = 0,
        RMAP_LEVEL_PMD,
};

/*
 * Debug-only consistency checks shared by the rmap helpers: validates that
 * [page, page + nr_pages) is a sensible range of @folio for @level.
 */
static inline void __folio_rmap_sanity_checks(struct folio *folio,
                struct page *page, int nr_pages, enum rmap_level level)
{
        /* hugetlb folios go through their own hugetlb_*() rmap helpers. */
        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

        /*
         * TODO: we get driver-allocated folios that have nothing to do with
         * the rmap using vm_insert_page(); therefore, we cannot assume that
         * folio_test_large_rmappable() holds for large folios. We should
         * handle any desired mapcount+stats accounting for these folios in
         * VM_MIXEDMAP VMAs separately, and then sanity-check here that
         * we really only get rmappable folios.
         */

        /* The range must be non-empty and lie completely inside @folio. */
        VM_WARN_ON_ONCE(nr_pages <= 0);
        VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
        VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

        /* Nothing further to verify for PTE-level operations. */
        if (level == RMAP_LEVEL_PTE)
                return;

        /* Anything that is neither PTE nor PMD level is unexpected. */
        if (level != RMAP_LEVEL_PMD) {
                VM_WARN_ON_ONCE(true);
                return;
        }

        /*
         * Folios larger than a single PMD are not supported yet, so a
         * PMD-level operation is assumed to cover a single "entire"
         * mapping of the folio.
         */
        VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
        VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
}

/*
 * rmap interfaces called when adding or removing pte of page
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *, unsigned long address, rmap_t flags);
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
        folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *);
#define folio_add_file_rmap_pte(folio, page, vma) \
        folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
        folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *);

void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address);

/*
 * hugetlb variant of folio_try_dup_anon_rmap_*(): see there for details.
 *
 * Returns -EBUSY, without changing anything, if the folio is exclusive and
 * needs to be copied for DMA; otherwise clears the exclusive marker (if set),
 * accounts one more entire mapping and returns 0.
 */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
                struct vm_area_struct *vma)
{
        struct page *head = &folio->page;

        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

        if (PageAnonExclusive(head)) {
                if (unlikely(folio_needs_cow_for_dma(vma, folio)))
                        return -EBUSY;
                ClearPageAnonExclusive(head);
        }

        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);

        return 0;
}

/*
 * hugetlb variant of folio_try_share_anon_rmap_*(): see there for details.
 *
 * Clears PageAnonExclusive on the folio's first page and returns 0, or
 * returns -EBUSY and leaves the flag set if the folio may be DMA-pinned.
 * The folio must be an exclusive anonymous hugetlb folio (asserted below).
 */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

        /* Paired with the memory barrier in try_grab_folio(). */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb();

        /* The pin check must come before the flag is cleared. */
        if (unlikely(folio_maybe_dma_pinned(folio)))
                return -EBUSY;
        ClearPageAnonExclusive(&folio->page);

        /*
         * This is conceptually a smp_wmb() paired with the smp_rmb() in
         * gup_must_unshare().
         */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb__after_atomic();
        return 0;
}

/*
 * Account one more mapping of a non-anonymous hugetlb folio by incrementing
 * both its _entire_mapcount and its _large_mapcount.
 */
static inline void hugetlb_add_file_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);
}

/*
 * Drop one mapping of a hugetlb folio by decrementing both its
 * _entire_mapcount and its _large_mapcount.
 */
static inline void hugetlb_remove_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

        atomic_dec(&folio->_entire_mapcount);
        atomic_dec(&folio->_large_mapcount);
}

/*
 * Common implementation of the folio_dup_file_rmap_*() helpers.
 *
 * RMAP_LEVEL_PTE: increments the _mapcount of every page in
 * [page, page + nr_pages) and, for large folios, adds nr_pages to the
 * folio's _large_mapcount.
 * RMAP_LEVEL_PMD: increments the folio's _entire_mapcount and
 * _large_mapcount once each.
 */
static __always_inline void __folio_dup_file_rmap(struct folio *folio,
                struct page *page, int nr_pages, enum rmap_level level)
{
        const int nr_requested = nr_pages;

        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        if (level == RMAP_LEVEL_PMD) {
                atomic_inc(&folio->_entire_mapcount);
                atomic_inc(&folio->_large_mapcount);
                return;
        }

        /* Any level other than PTE/PMD is left untouched. */
        if (level != RMAP_LEVEL_PTE)
                return;

        /* Small folio: only the single page's mapcount is tracked. */
        if (!folio_test_large(folio)) {
                atomic_inc(&page->_mapcount);
                return;
        }

        /* Large folio: the first page is always accounted, then the rest. */
        for (;;) {
                atomic_inc(&page->_mapcount);
                page++;
                if (--nr_pages <= 0)
                        break;
        }
        atomic_add(nr_requested, &folio->_large_mapcount);
}

/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio:        The folio to duplicate the mappings of
 * @page:        The first page to duplicate the mappings of
 * @nr_pages:        The number of pages of which the mapping will be duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
                struct page *page, int nr_pages)
{
        /* PTE-level front end of the common __folio_dup_file_rmap() helper. */
        __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE);
}

/*
 * Single-page variant of folio_dup_file_rmap_ptes(): duplicates the PTE
 * mapping of @page only.
 */
static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
                struct page *page)
{
        __folio_dup_file_rmap(folio, page, 1, RMAP_LEVEL_PTE);
}

/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio:        The folio to duplicate the mapping of
 * @page:        The first page to duplicate the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The duplicate is accounted at RMAP_LEVEL_PMD, i.e. as one more "entire"
 * mapping of the folio (_entire_mapcount and _large_mapcount are each
 * incremented once), not as HPAGE_PMD_NR individual PTE mappings.
 *
 * Without CONFIG_TRANSPARENT_HUGEPAGE this function only warns once and
 * changes nothing.
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
                struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /*
         * Must be RMAP_LEVEL_PMD: with RMAP_LEVEL_PTE the per-page mapcounts
         * would be raised instead of the entire mapcount, which does not
         * match the PMD mapping being duplicated.
         */
        __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PMD);
#else
        WARN_ON_ONCE(true);
#endif
}

/*
 * Common implementation of the folio_try_dup_anon_rmap_*() helpers.
 *
 * RMAP_LEVEL_PTE: clears PageAnonExclusive on every page in
 * [page, page + nr_pages), increments each page's _mapcount and, for large
 * folios, adds nr_pages to the folio's _large_mapcount.
 * RMAP_LEVEL_PMD: clears PageAnonExclusive on @page and increments the
 * folio's _entire_mapcount and _large_mapcount.
 *
 * Returns 0 on success. Returns -EBUSY, before anything has been modified,
 * if the folio may be pinned and an affected page is PageAnonExclusive.
 */
static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *src_vma,
                enum rmap_level level)
{
        const int orig_nr_pages = nr_pages;
        bool maybe_pinned;
        int i;

        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        /*
         * If this folio may have been pinned by the parent process,
         * don't allow to duplicate the mappings but instead require to e.g.,
         * copy the subpage immediately for the child so that we'll always
         * guarantee the pinned folio won't be randomly replaced in the
         * future on write faults.
         */
        maybe_pinned = likely(!folio_is_device_private(folio)) &&
                       unlikely(folio_needs_cow_for_dma(src_vma, folio));

        /*
         * No need to check+clear for already shared PTEs/PMDs of the
         * folio. But if any page is PageAnonExclusive, we must fallback to
         * copying if the folio maybe pinned.
         */
        switch (level) {
        case RMAP_LEVEL_PTE:
                /* Check the whole range first so that failure has no side effects. */
                if (unlikely(maybe_pinned)) {
                        for (i = 0; i < nr_pages; i++)
                                if (PageAnonExclusive(page + i))
                                        return -EBUSY;
                }

                /* Small folio: a single page, no _large_mapcount to update. */
                if (!folio_test_large(folio)) {
                        if (PageAnonExclusive(page))
                                ClearPageAnonExclusive(page);
                        atomic_inc(&page->_mapcount);
                        break;
                }

                do {
                        if (PageAnonExclusive(page))
                                ClearPageAnonExclusive(page);
                        atomic_inc(&page->_mapcount);
                } while (page++, --nr_pages > 0);
                /* nr_pages was consumed by the loop; use the saved count. */
                atomic_add(orig_nr_pages, &folio->_large_mapcount);
                break;
        case RMAP_LEVEL_PMD:
                /* Only @page carries the exclusive marker that is checked here. */
                if (PageAnonExclusive(page)) {
                        if (unlikely(maybe_pinned))
                                return -EBUSY;
                        ClearPageAnonExclusive(page);
                }
                atomic_inc(&folio->_entire_mapcount);
                atomic_inc(&folio->_large_mapcount);
                break;
        }
        return 0;
}

/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *                                  of a folio
 * @folio:        The folio to duplicate the mappings of
 * @page:        The first page to duplicate the mappings of
 * @nr_pages:        The number of pages of which the mapping will be duplicated
 * @src_vma:        The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * vma->vma_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *src_vma)
{
        /* PTE-level front end of the common __folio_try_dup_anon_rmap(). */
        return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma,
                                         RMAP_LEVEL_PTE);
}

/*
 * Single-page variant of folio_try_dup_anon_rmap_ptes(): tries to duplicate
 * the PTE mapping of @page only. Returns 0 on success, -EBUSY otherwise.
 */
static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
                struct page *page, struct vm_area_struct *src_vma)
{
        return __folio_try_dup_anon_rmap(folio, page, 1, src_vma,
                                         RMAP_LEVEL_PTE);
}

/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *                                 of a folio
 * @folio:        The folio to duplicate the mapping of
 * @page:        The first page to duplicate the mapping of
 * @src_vma:        The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * vma->vma_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
                struct page *page, struct vm_area_struct *src_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* PMD-level front end of the common __folio_try_dup_anon_rmap(). */
        return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma,
                                         RMAP_LEVEL_PMD);
#else
        /* Not expected without CONFIG_TRANSPARENT_HUGEPAGE: warn once and fail. */
        WARN_ON_ONCE(true);
        return -EBUSY;
#endif
}

/*
 * Common implementation of the folio_try_share_anon_rmap_*() helpers.
 *
 * Clears PageAnonExclusive on @page and returns 0, or returns -EBUSY and
 * leaves the flag set if the folio may be DMA-pinned. @nr_pages and @level
 * are only consumed by the sanity checks: at either level the flag is
 * cleared on @page alone.
 */
static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, enum rmap_level level)
{
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        /* device private folios cannot get pinned via GUP. */
        if (unlikely(folio_is_device_private(folio))) {
                ClearPageAnonExclusive(page);
                return 0;
        }

        /*
         * We have to make sure that when we clear PageAnonExclusive, that
         * the page is not pinned and that concurrent GUP-fast won't succeed in
         * concurrently pinning the page.
         *
         * Conceptually, PageAnonExclusive clearing consists of:
         * (A1) Clear PTE
         * (A2) Check if the page is pinned; back off if so.
         * (A3) Clear PageAnonExclusive
         * (A4) Restore PTE (optional, but certainly not writable)
         *
         * When clearing PageAnonExclusive, we cannot possibly map the page
         * writable again, because anon pages that may be shared must never
         * be writable. So in any case, if the PTE was writable it cannot
         * be writable anymore afterwards and there would be a PTE change. Only
         * if the PTE wasn't writable, there might not be a PTE change.
         *
         * Conceptually, GUP-fast pinning of an anon page consists of:
         * (B1) Read the PTE
         * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
         * (B3) Pin the mapped page
         * (B4) Check if the PTE changed by re-reading it; back off if so.
         * (B5) If the original PTE is not writable, check if
         *        PageAnonExclusive is not set; back off if so.
         *
         * If the PTE was writable, we only have to make sure that GUP-fast
         * observes a PTE change and properly backs off.
         *
         * If the PTE was not writable, we have to make sure that GUP-fast either
         * detects a (temporary) PTE change or that PageAnonExclusive is cleared
         * and properly backs off.
         *
         * Consequently, when clearing PageAnonExclusive(), we have to make
         * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
         * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
         * and (B5) happen in the right memory order.
         *
         * We assume that there might not be a memory barrier after
         * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
         * so we use explicit ones here.
         */

        /* Paired with the memory barrier in try_grab_folio(). */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb();

        /* (A2) followed by (A3) from the description above. */
        if (unlikely(folio_maybe_dma_pinned(folio)))
                return -EBUSY;
        ClearPageAnonExclusive(page);

        /*
         * This is conceptually a smp_wmb() paired with the smp_rmb() in
         * gup_must_unshare().
         */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb__after_atomic();
        return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *                                   mapped by a PTE possibly shared to prepare
 *                                   for KSM or temporary unmapping
 * @folio:        The folio to share a mapping of
 * @page:        The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
 * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
 *
 * Marking the mapped page shared can only fail if the folio maybe pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
                struct page *page)
{
        /* PTE level: exactly one page of @folio is handed to the helper. */
        const int nr_pages = 1;

        return __folio_try_share_anon_rmap(folio, page, nr_pages,
                                           RMAP_LEVEL_PTE);
}

/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *                                   range mapped by a PMD possibly shared to
 *                                   prepare for temporary unmapping
 * @folio:        The folio to share the mapping of
 * @page:        The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
 * fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
 *
 * Marking the mapped pages shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
                struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* PMD level: hand the whole HPAGE_PMD_NR page range to the helper. */
        return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
                                           RMAP_LEVEL_PMD);
#else
        /* Built without CONFIG_TRANSPARENT_HUGEPAGE: warn once and fail. */
        WARN_ON_ONCE(true);
        return -EBUSY;
#endif
}

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
                        struct mem_cgroup *memcg, unsigned long *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
                                unsigned long end, struct page **pages,
                                void *arg);

/* Avoid racy checks */
#define PVMW_SYNC                (1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION                (1 << 1)

/*
 * Walk state handed to page_vma_mapped_walk().
 *
 * DEFINE_PAGE_VMA_WALK() and DEFINE_FOLIO_VMA_WALK() initialize @pfn,
 * @nr_pages, @pgoff, @vma, @address and @flags; the members they do not name
 * (@pmd, @pte, @ptl) are zero-initialized by the designated initializer.
 */
struct page_vma_mapped_walk {
        unsigned long pfn;              /* pfn of the page/folio being looked up */
        unsigned long nr_pages;         /* number of pages of that page/folio */
        pgoff_t pgoff;                  /* page offset of the page/folio */
        struct vm_area_struct *vma;     /* vma being walked */
        unsigned long address;          /* address supplied by the caller */
        pmd_t *pmd;
        pte_t *pte;                     /* unmapped by page_vma_mapped_walk_done() */
        spinlock_t *ptl;                /* unlocked by page_vma_mapped_walk_done() */
        unsigned int flags;             /* presumably a mask of PVMW_* bits */
};

/*
 * Declare a struct page_vma_mapped_walk called @name, describing @_page
 * (pfn, number of pages, page offset) in @_vma at @_address with @_flags.
 * Note that @_page is evaluated more than once.
 */
#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags)        \
        struct page_vma_mapped_walk name = {                                \
                .pfn = page_to_pfn(_page),                                \
                .nr_pages = compound_nr(_page),                                \
                .pgoff = page_to_pgoff(_page),                                \
                .vma = _vma,                                                \
                .address = _address,                                        \
                .flags = _flags,                                        \
        }

/*
 * Declare a struct page_vma_mapped_walk called @name, describing @_folio
 * (pfn, number of pages, page offset) in @_vma at @_address with @_flags.
 * Note that @_folio is evaluated more than once.
 */
#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)        \
        struct page_vma_mapped_walk name = {                                \
                .pfn = folio_pfn(_folio),                                \
                .nr_pages = folio_nr_pages(_folio),                        \
                .pgoff = folio_pgoff(_folio),                                \
                .vma = _vma,                                                \
                .address = _address,                                        \
                .flags = _flags,                                        \
        }

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
        if (pvmw->pte) {
                /*
                 * A HugeTLB pte points at the relevant page table entry
                 * without having been pte-mapped, so it must not be unmapped.
                 */
                if (!is_vm_hugetlb_page(pvmw->vma))
                        pte_unmap(pvmw->pte);
        }

        /* Drop the page table lock if the walk still holds one. */
        if (pvmw->ptl)
                spin_unlock(pvmw->ptl);
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);

/*
 * Used by swapoff to help locate where page is expected in vma.
 */
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
                      struct vm_area_struct *vma);

void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);

unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);

/*
 * rmap_walk_control: controls how an rmap walk is performed
 *
 * arg: opaque pointer passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicates that the rmap traversal bailed out due to lock
 *            contention
 * rmap_one: executed on each vma where the page is mapped
 * done: checks whether the traversal can be terminated
 * anon_lock: takes the anon_vma lock in an optimized way instead of the
 *            default one
 * invalid_vma: identifies vmas that are of no interest and are to be skipped
 */
struct rmap_walk_control {
        void *arg;
        bool try_lock;
        bool contended;
        /*
         * Return false if page table scanning in rmap_walk should be stopped.
         * Otherwise, return true.
         */
        bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
                                        unsigned long addr, void *arg);
        int (*done)(struct folio *folio);
        struct anon_vma *(*anon_lock)(struct folio *folio,
                                      struct rmap_walk_control *rwc);
        bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
                                          struct rmap_walk_control *rwc);

#else        /* !CONFIG_MMU */

/* !CONFIG_MMU: anon_vma_init() is a no-op and anon_vma_prepare() yields 0. */
#define anon_vma_init()                do {} while (0)
#define anon_vma_prepare(vma)        (0)

static inline int folio_referenced(struct folio *folio, int is_locked,
                                  struct mem_cgroup *memcg,
                                  unsigned long *vm_flags)
{
        /* !CONFIG_MMU stub: always clears *vm_flags and returns 0. */
        const int ret = 0;

        *vm_flags = 0;

        return ret;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
        /* !CONFIG_MMU stub: intentionally does nothing. */
}

static inline int folio_mkclean(struct folio *folio)
{
        /* !CONFIG_MMU stub: always returns 0. */
        return 0;
}
#endif        /* CONFIG_MMU */

/*
 * Page-based wrapper around folio_mkclean(): looks up the folio of @page with
 * page_folio() and returns whatever folio_mkclean() returns for it.
 */
static inline int page_mkclean(struct page *page)
{
        return folio_mkclean(page_folio(page));
}
#endif        /* _LINUX_RMAP_H */





























































































































































































































































































































































































































































































































































































































































    1 





    2 






































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
// SPDX-License-Identifier: GPL-2.0

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/sizes.h>
#include <linux/workqueue.h>
#include "ctree.h"
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
#include "fs.h"

/*
 * This contains the logic to handle async discard.
 *
 * Async discard manages trimming of free space outside of transaction commit.
 * Discarding is done by managing the block_groups on a LRU list based on free
 * space recency.  Two passes are used to first prioritize discarding extents
 * and then allow for trimming in the bitmap the best opportunity to coalesce.
 * The block_groups are maintained on multiple lists to allow for multiple
 * passes with different discard filter requirements.  A delayed work item is
 * used to manage discarding with timeout determined by a max of the delay
 * incurred by the iops rate limit, the byte rate limit, and the max delay of
 * BTRFS_DISCARD_MAX_DELAY.
 *
 * Note, this only keeps track of block_groups that are explicitly for data.
 * Mixed block_groups are not supported.
 *
 * The first list is special to manage discarding of fully free block groups.
 * This is necessary because we issue a final trim for a full free block group
 * after forgetting it.  When a block group becomes unused, instead of directly
 * being added to the unused_bgs list, we add it to this first list.  Then
 * from there, if it becomes fully discarded, we place it onto the unused_bgs
 * list.
 *
 * The in-memory free space cache serves as the backing state for discard.
 * Consequently this means there is no persistence.  We opt to load all the
 * block groups in as not discarded, so the mount case degenerates to the
 * crashing case.
 *
 * As the free space cache uses bitmaps, there exists a tradeoff between
 * ease/efficiency for find_free_extent() and the accuracy of discard state.
 * Here we opt to let untrimmed regions merge with everything while only letting
 * trimmed regions merge with other trimmed regions.  This can cause
 * overtrimming, but the coalescing benefit seems to be worth it.  Additionally,
 * bitmap state is tracked as a whole.  If we're able to fully trim a bitmap,
 * the trimmed flag is set on the bitmap.  Otherwise, if an allocation comes in,
 * this resets the state and we will retry trimming the whole bitmap.  This is a
 * tradeoff between discard state accuracy and the cost of accounting.
 */

/* This is an initial delay to give some chance for block reuse */
#define BTRFS_DISCARD_DELAY                (120ULL * NSEC_PER_SEC)
#define BTRFS_DISCARD_UNUSED_DELAY        (10ULL * NSEC_PER_SEC)

#define BTRFS_DISCARD_MIN_DELAY_MSEC        (1UL)
#define BTRFS_DISCARD_MAX_DELAY_MSEC        (1000UL)
#define BTRFS_DISCARD_MAX_IOPS                (1000U)

/*
 * Minimum length filter of each discard list, indexed by a block group's
 * discard_index.
 *
 * Monotonically decreasing minimum length filters after index 0
 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
        0,
        BTRFS_ASYNC_DISCARD_MAX_FILTER,
        BTRFS_ASYNC_DISCARD_MIN_FILTER
};

static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
                                          struct btrfs_block_group *block_group)
{
        /* The block group's discard_index selects the list it belongs to. */
        const int index = block_group->discard_index;

        return &discard_ctl->discard_list[index];
}

/*
 * Determine if async discard should be running.
 *
 * @discard_ctl: discard control
 *
 * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
 */
static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
{
        struct btrfs_fs_info *fs_info;
        bool writeable;

        /* The discard control is embedded in the fs_info it belongs to. */
        fs_info = container_of(discard_ctl, struct btrfs_fs_info, discard_ctl);
        writeable = !(fs_info->sb->s_flags & SB_RDONLY);

        /* The running bit is only consulted for a writeable file system. */
        return writeable &&
               test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

/*
 * Queue @block_group on the discard list selected by its discard_index.
 *
 * @discard_ctl: discard control, its lock must be held by the caller
 * @block_group: block group to (re)queue
 *
 * A block group that is not queued yet, or that currently has the unused
 * index, gets a fresh eligibility time and has its cursor state reset; one
 * with the unused index is additionally switched to the first filter list.
 * A reference is taken when the block group was not on any list before.
 *
 * This helper moves the block group unconditionally.  peek_discard_list()
 * depends on that: it calls this to get a block group that is in use again
 * off the unused list before retrying its lookup.  If this function bailed
 * out while discard is stopped or the file system is read-only, the lookup
 * would return the very same block group forever, with the lock held.
 * Callers that must not queue anything while discard is not running have to
 * check btrfs_run_discard_work() themselves, see add_to_discard_list().
 */
static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
                                  struct btrfs_block_group *block_group)
{
        lockdep_assert_held(&discard_ctl->lock);

        if (list_empty(&block_group->discard_list) ||
            block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
                if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
                        block_group->discard_index = BTRFS_DISCARD_INDEX_START;
                block_group->discard_eligible_time = (ktime_get_ns() +
                                                      BTRFS_DISCARD_DELAY);
                block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
        }
        /* The list holds one reference per queued block group. */
        if (list_empty(&block_group->discard_list))
                btrfs_get_block_group(block_group);

        list_move_tail(&block_group->discard_list,
                       get_discard_list(discard_ctl, block_group));
}

/*
 * Queue a data-only block group for discarding.
 *
 * @discard_ctl: discard control
 * @block_group: block group to queue
 *
 * Block groups that are not data-only are ignored, and nothing is queued
 * while btrfs_run_discard_work() reports that discard must not run.
 */
static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
                                struct btrfs_block_group *block_group)
{
        if (!btrfs_is_block_group_data_only(block_group))
                return;

        spin_lock(&discard_ctl->lock);
        /* Checked under the lock, exactly as before the check was moved here. */
        if (btrfs_run_discard_work(discard_ctl))
                __add_to_discard_list(discard_ctl, block_group);
        spin_unlock(&discard_ctl->lock);
}

/*
 * Put @block_group at the tail of the unused discard list, with a fresh
 * eligibility time and a reset cursor state.  Does nothing while discard is
 * not allowed to run.
 */
static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
                                       struct btrfs_block_group *block_group)
{
        bool was_queued;

        spin_lock(&discard_ctl->lock);

        if (!btrfs_run_discard_work(discard_ctl))
                goto out_unlock;

        /* Remember whether a list already holds a reference for us. */
        was_queued = !list_empty(&block_group->discard_list);
        list_del_init(&block_group->discard_list);

        block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
        block_group->discard_eligible_time = (ktime_get_ns() +
                                              BTRFS_DISCARD_UNUSED_DELAY);
        block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;

        /* Newly queued block groups need their own reference. */
        if (!was_queued)
                btrfs_get_block_group(block_group);
        list_add_tail(&block_group->discard_list,
                      &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);

out_unlock:
        spin_unlock(&discard_ctl->lock);
}

/*
 * Take @block_group off whatever discard list it is on.
 *
 * Returns true if @block_group was the one currently recorded in
 * discard_ctl->block_group, in which case that pointer is cleared as well.
 */
static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
                                     struct btrfs_block_group *block_group)
{
        bool in_workfn;
        bool was_queued;

        spin_lock(&discard_ctl->lock);

        in_workfn = (discard_ctl->block_group == block_group);
        if (in_workfn)
                discard_ctl->block_group = NULL;

        was_queued = !list_empty(&block_group->discard_list);
        block_group->discard_eligible_time = 0;
        list_del_init(&block_group->discard_list);

        /*
         * A block group that the discard workfn is working on right now is
         * still in use there, so its reference is left alone.  The workfn
         * notices the cleared pointer and drops the reference once it is
         * finished.
         */
        if (was_queued && !in_workfn)
                btrfs_put_block_group(block_group);

        spin_unlock(&discard_ctl->lock);

        return in_workfn;
}

/*
 * Find block_group that's up next for discarding.
 *
 * @discard_ctl:  discard control
 * @now:          current time
 *
 * Iterate over the discard lists to find the next block_group up for
 * discarding checking the discard_eligible_time of block_group.
 */
static struct btrfs_block_group *find_next_block_group(
                                        struct btrfs_discard_ctl *discard_ctl,
                                        u64 now)
{
        struct btrfs_block_group *best = NULL;
        int idx;

        for (idx = 0; idx < BTRFS_NR_DISCARD_LISTS; idx++) {
                struct list_head *head = &discard_ctl->discard_list[idx];
                struct btrfs_block_group *first;

                if (list_empty(head))
                        continue;

                /* Only the head of each list is considered. */
                first = list_first_entry(head, struct btrfs_block_group,
                                         discard_list);
                if (!best)
                        best = first;

                /* The candidate is already eligible, stop looking. */
                if (best->discard_eligible_time < now)
                        break;

                /* Otherwise keep whichever becomes eligible first. */
                if (first->discard_eligible_time < best->discard_eligible_time)
                        best = first;
        }

        return best;
}

/*
 * Look up next block group and set it for use.
 *
 * @discard_ctl:   discard control
 * @discard_state: the discard_state of the block_group after state management
 * @discard_index: the discard_index of the block_group after state management
 * @now:           time when discard was invoked, in ns
 *
 * Wrap find_next_block_group() and set the block_group to be in use.
 * @discard_state's control flow is managed here.  Variables related to
 * @discard_state are reset here as needed (eg. @discard_cursor).  @discard_state
 * and @discard_index are remembered as it may change while we're discarding,
 * but we want the discard to execute in the context determined here.
 */
static struct btrfs_block_group *peek_discard_list(
                                        struct btrfs_discard_ctl *discard_ctl,
                                        enum btrfs_discard_state *discard_state,
                                        int *discard_index, u64 now)
{
        struct btrfs_block_group *block_group;

        spin_lock(&discard_ctl->lock);
again:
        block_group = find_next_block_group(discard_ctl, now);

        if (block_group && now >= block_group->discard_eligible_time) {
                /*
                 * The block group is on the unused list but has space in use:
                 * requeue it on a filter list if it is data-only, otherwise
                 * drop it from the lists together with the list's reference.
                 * Then look for the next candidate.  The block group is
                 * expected to have left the unused list by then, otherwise
                 * the lookup would return it again.
                 */
                if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
                    block_group->used != 0) {
                        if (btrfs_is_block_group_data_only(block_group)) {
                                __add_to_discard_list(discard_ctl, block_group);
                        } else {
                                list_del_init(&block_group->discard_list);
                                btrfs_put_block_group(block_group);
                        }
                        goto again;
                }
                /* A pending cursor reset starts over with the extents pass. */
                if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
                        block_group->discard_cursor = block_group->start;
                        block_group->discard_state = BTRFS_DISCARD_EXTENTS;
                }
                /* Record the block group as the one in use. */
                discard_ctl->block_group = block_group;
        }
        /*
         * Snapshot state and index for the caller; this also happens for a
         * block group that is returned although it is not eligible yet.
         */
        if (block_group) {
                *discard_state = block_group->discard_state;
                *discard_index = block_group->discard_index;
        }
        spin_unlock(&discard_ctl->lock);

        return block_group;
}

/*
 * Update a block group's filters.
 *
 * @block_group:  block group of interest
 * @bytes:        recently freed region size after coalescing
 *
 * Async discard maintains multiple lists with progressively smaller filters
 * to prioritize discarding based on size.  Should a free space that matches
 * a larger filter be returned to the free_space_cache, prioritize that discard
 * by moving @block_group to the proper filter.
 */
void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
                                u64 bytes)
{
        struct btrfs_discard_ctl *discard_ctl;
        int i;

        if (!block_group)
                return;
        if (!btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
                return;

        discard_ctl = &block_group->fs_info->discard_ctl;

        /* Only block groups past the first filter list can be promoted. */
        if (block_group->discard_index <= BTRFS_DISCARD_INDEX_START)
                return;
        /* The region is too small for the next larger filter. */
        if (bytes < discard_minlen[block_group->discard_index - 1])
                return;

        remove_from_discard_list(discard_ctl, block_group);

        /* Requeue on the first list whose minimum length @bytes satisfies. */
        for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS; i++) {
                if (bytes < discard_minlen[i])
                        continue;

                block_group->discard_index = i;
                add_to_discard_list(discard_ctl, block_group);
                break;
        }
}

/*
 * Move a block group along the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Increment @block_group's discard_index.  If it falls off the list, let it be.
 * Otherwise add it back to the appropriate list.
 */
static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
                                       struct btrfs_block_group *block_group)
{
        block_group->discard_index++;

        /* Still within the lists: queue it on the list of its new index. */
        if (block_group->discard_index != BTRFS_NR_DISCARD_LISTS) {
                add_to_discard_list(discard_ctl, block_group);
                return;
        }

        /* Ran past the last list: reset the index and leave it unqueued. */
        block_group->discard_index = 1;
}

/*
 * Remove a block_group from the discard lists.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Remove @block_group from the discard lists.  If necessary, wait on the
 * current work and then reschedule the delayed work.
 */
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
                               struct btrfs_block_group *block_group)
{
        /* Nothing more to do unless the workfn was using this block group. */
        if (!remove_from_discard_list(discard_ctl, block_group))
                return;

        /* Wait for the current work, then reschedule overriding the timer. */
        cancel_delayed_work_sync(&discard_ctl->work);
        btrfs_discard_schedule_work(discard_ctl, true);
}

/*
 * Handles queuing the block_groups.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Maintain the LRU order of the discard lists.
 */
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
                              struct btrfs_block_group *block_group)
{
        if (!block_group)
                return;
        if (!btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
                return;

        /* Block groups with nothing in use go to the unused list. */
        if (block_group->used)
                add_to_discard_list(discard_ctl, block_group);
        else
                add_to_discard_unused_list(discard_ctl, block_group);

        /* Leave an already pending work item alone. */
        if (delayed_work_pending(&discard_ctl->work))
                return;

        btrfs_discard_schedule_work(discard_ctl, false);
}

/*
 * Compute the delay until the next discard and (re)arm the delayed work.
 *
 * @discard_ctl: discard control
 * @now:         current time, in ns
 * @override:    rearm the work even if it is already pending
 *
 * Both callers visible in this file invoke this with discard_ctl->lock held.
 * Nothing is scheduled when discard must not run, when the work is already
 * pending and @override is false, or when no block group is queued.
 */
static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
                                          u64 now, bool override)
{
        struct btrfs_block_group *block_group;

        if (!btrfs_run_discard_work(discard_ctl))
                return;
        if (!override && delayed_work_pending(&discard_ctl->work))
                return;

        block_group = find_next_block_group(discard_ctl, now);
        if (block_group) {
                /* Start from the base delay, converted from ms to ns. */
                u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
                u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);

                /*
                 * A single delayed workqueue item is responsible for
                 * discarding, so we can manage the bytes rate limit by keeping
                 * track of the previous discard.
                 */
                if (kbps_limit && discard_ctl->prev_discard) {
                        u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
                        u64 bps_delay = div64_u64(discard_ctl->prev_discard *
                                                  NSEC_PER_SEC, bps_limit);

                        delay = max(delay, bps_delay);
                }

                /*
                 * This timeout is to hopefully prevent immediate discarding
                 * in a recently allocated block group.
                 */
                if (now < block_group->discard_eligible_time) {
                        u64 bg_timeout = block_group->discard_eligible_time - now;

                        delay = max(delay, bg_timeout);
                }

                /*
                 * When overriding the timer, give credit for the time that
                 * has already passed since the previous discard.
                 */
                if (override && discard_ctl->prev_discard) {
                        u64 elapsed = now - discard_ctl->prev_discard_time;

                        if (delay > elapsed)
                                delay -= elapsed;
                        else
                                delay = 0;
                }

                mod_delayed_work(discard_ctl->discard_workers,
                                 &discard_ctl->work, nsecs_to_jiffies(delay));
        }
}

/*
 * Responsible for scheduling the discard work.
 *
 * @discard_ctl:  discard control
 * @override:     override the current timer
 *
 * Discards are issued by a delayed workqueue item.  @override is used to
 * update the current delay as the baseline delay interval is reevaluated on
 * transaction commit.  This is also maxed with any other rate limit.
 */
void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
                                 bool override)
{
        /* The timestamp is sampled before the lock is taken. */
        const u64 now_ns = ktime_get_ns();

        spin_lock(&discard_ctl->lock);
        __btrfs_discard_schedule_work(discard_ctl, now_ns, override);
        spin_unlock(&discard_ctl->lock);
}

/*
 * Determine next step of a block_group.
 *
 * @discard_ctl: discard control
 * @block_group: block_group of interest
 *
 * Determine the next step for a block group after it's finished going through
 * a pass on a discard list.  If it is unused and fully trimmed, we can mark it
 * unused and send it to the unused_bgs path.  Otherwise, pass it onto the
 * appropriate filter list or let it fall off.
 */
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
                                      struct btrfs_block_group *block_group)
{
        remove_from_discard_list(discard_ctl, block_group);

        /* Still in use: move it along the filter lists. */
        if (block_group->used != 0) {
                btrfs_update_discard_index(discard_ctl, block_group);
                return;
        }

        /*
         * Unused: hand it to the unused block group handling once its free
         * space is trimmed, otherwise queue it on the unused discard list.
         */
        if (btrfs_is_free_space_trimmed(block_group))
                btrfs_mark_bg_unused(block_group);
        else
                add_to_discard_unused_list(discard_ctl, block_group);
}

/*
 * Discard work queue callback
 *
 * @work: work
 *
 * Find the next block_group to start discarding and then discard a single
 * region.  It does this in a two-pass fashion: first extents and second
 * bitmaps.  Completely discarded block groups are sent to the unused_bgs path.
 */
static void btrfs_discard_workfn(struct work_struct *work)
{
        struct btrfs_discard_ctl *discard_ctl;
        struct btrfs_block_group *block_group;
        enum btrfs_discard_state discard_state;
        int discard_index = 0;
        u64 trimmed = 0;
        u64 minlen = 0;
        u64 now = ktime_get_ns();

        discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);

        /*
         * discard_state and discard_index are snapshots taken under the lock;
         * the block group's own fields may change while we are discarding.
         */
        block_group = peek_discard_list(discard_ctl, &discard_state,
                                        &discard_index, now);
        /* Nothing queued, or discard is not allowed to run. */
        if (!block_group || !btrfs_run_discard_work(discard_ctl))
                return;
        /* The next block group is not eligible yet: rearm the timer only. */
        if (now < block_group->discard_eligible_time) {
                btrfs_discard_schedule_work(discard_ctl, false);
                return;
        }

        /* Perform discarding */
        minlen = discard_minlen[discard_index];

        if (discard_state == BTRFS_DISCARD_BITMAPS) {
                u64 maxlen = 0;

                /*
                 * Use the previous levels minimum discard length as the max
                 * length filter.  In the case something is added to make a
                 * region go beyond the max filter, the entire bitmap is set
                 * back to BTRFS_TRIM_STATE_UNTRIMMED.
                 */
                if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
                        maxlen = discard_minlen[discard_index - 1];

                btrfs_trim_block_group_bitmaps(block_group, &trimmed,
                                       block_group->discard_cursor,
                                       btrfs_block_group_end(block_group),
                                       minlen, maxlen, true);
                discard_ctl->discard_bitmap_bytes += trimmed;
        } else {
                btrfs_trim_block_group_extents(block_group, &trimmed,
                                       block_group->discard_cursor,
                                       btrfs_block_group_end(block_group),
                                       minlen, true);
                discard_ctl->discard_extent_bytes += trimmed;
        }

        /* Determine next steps for a block_group */
        if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
                if (discard_state == BTRFS_DISCARD_BITMAPS) {
                        /* Both passes are done for this list. */
                        btrfs_finish_discard_pass(discard_ctl, block_group);
                } else {
                        /*
                         * The extents pass is done: rewind the cursor and move
                         * on to the bitmaps pass, unless a cursor reset was
                         * requested in the meantime.
                         */
                        block_group->discard_cursor = block_group->start;
                        spin_lock(&discard_ctl->lock);
                        if (block_group->discard_state !=
                            BTRFS_DISCARD_RESET_CURSOR)
                                block_group->discard_state =
                                                        BTRFS_DISCARD_BITMAPS;
                        spin_unlock(&discard_ctl->lock);
                }
        }

        now = ktime_get_ns();
        spin_lock(&discard_ctl->lock);
        /* Remembered for the rate limiting done when scheduling the work. */
        discard_ctl->prev_discard = trimmed;
        discard_ctl->prev_discard_time = now;
        /*
         * If the block group was removed from the discard list while it was
         * running in this workfn, then we didn't deref it, since this function
         * still owned that reference. But we set the discard_ctl->block_group
         * back to NULL, so we can use that condition to know that now we need
         * to deref the block_group.
         */
        if (discard_ctl->block_group == NULL)
                btrfs_put_block_group(block_group);
        discard_ctl->block_group = NULL;
        __btrfs_discard_schedule_work(discard_ctl, now, false);
        spin_unlock(&discard_ctl->lock);
}

/*
 * Recalculate the base delay.
 *
 * @discard_ctl: discard control
 *
 * Recalculate the base delay from the iops limit.  Nothing is recalculated
 * unless the number of discardable_extents is positive.  Clamp the delay
 * between the lower_limit (1ms, or 0 if no iops_limit is set) and the
 * upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
 */
void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
{
        unsigned long floor_ms = BTRFS_DISCARD_MIN_DELAY_MSEC;
        unsigned long delay_ms;
        s64 nr_bytes;
        s32 nr_extents;
        u32 iops;

        /* Nothing is waiting to be discarded: keep the current delay. */
        nr_extents = atomic_read(&discard_ctl->discardable_extents);
        if (nr_extents == 0)
                return;

        spin_lock(&discard_ctl->lock);

        /*
         * The counters have been observed to end up off by -1 through a path
         * nobody knows how to reproduce.  These numbers are only consumed
         * here, and this is only called from btrfs_finish_extent_commit(),
         * which is synchronized, so pull any negative value back up to zero
         * right here.
         */
        if (nr_extents < 0)
                atomic_add(-nr_extents, &discard_ctl->discardable_extents);

        nr_bytes = atomic64_read(&discard_ctl->discardable_bytes);
        if (nr_bytes < 0)
                atomic64_add(-nr_bytes, &discard_ctl->discardable_bytes);

        /* A negative snapshot only needed correcting, not a new delay. */
        if (nr_extents <= 0)
                goto unlock;

        iops = READ_ONCE(discard_ctl->iops_limit);
        if (!iops) {
                /*
                 * An unset iops_limit means "go as fast as possible", so
                 * both the delay and its lower bound drop to 0.
                 */
                delay_ms = 0;
                floor_ms = 0;
        } else {
                delay_ms = MSEC_PER_SEC / iops;
        }

        delay_ms = clamp(delay_ms, floor_ms, BTRFS_DISCARD_MAX_DELAY_MSEC);
        discard_ctl->delay_ms = delay_ms;

unlock:
        spin_unlock(&discard_ctl->lock);
}

/*
 * Propagate discard counters.
 *
 * @block_group: block_group of interest
 *
 * Propagate deltas of counters up to the discard_ctl.  It maintains a current
 * counter and a previous counter passing the delta up to the global stat.
 * Then the current counter value becomes the previous counter value.
 */
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
        struct btrfs_discard_ctl *discard_ctl;
        struct btrfs_free_space_ctl *ctl;
        s64 bytes_delta;
        s32 extents_delta;

        /* Only data-only block groups on an async-discard mount are tracked. */
        if (!block_group)
                return;
        if (!btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
                return;
        if (!btrfs_is_block_group_data_only(block_group))
                return;

        discard_ctl = &block_group->fs_info->discard_ctl;
        ctl = block_group->free_space_ctl;

        /* The per-block-group counters are protected by the tree lock. */
        lockdep_assert_held(&ctl->tree_lock);

        /* Push the extent-count change since the last call to the global. */
        extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
                        ctl->discardable_extents[BTRFS_STAT_PREV];
        if (extents_delta != 0) {
                atomic_add(extents_delta, &discard_ctl->discardable_extents);
                ctl->discardable_extents[BTRFS_STAT_PREV] =
                        ctl->discardable_extents[BTRFS_STAT_CURR];
        }

        /* Same for the byte count. */
        bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
                      ctl->discardable_bytes[BTRFS_STAT_PREV];
        if (bytes_delta != 0) {
                atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
                ctl->discardable_bytes[BTRFS_STAT_PREV] =
                        ctl->discardable_bytes[BTRFS_STAT_CURR];
        }
}

/*
 * Punt unused_bgs list to discard lists.
 *
 * @fs_info: fs_info of interest
 *
 * The unused_bgs list needs to be punted to the discard lists because the
 * order of operations is changed.  In the normal synchronous discard path, the
 * block groups are trimmed via a single large trim in transaction commit.  This
 * is ultimately what we are trying to avoid with asynchronous discard.  Thus,
 * it must be done before going down the unused_bgs path.
 */
void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
{
        struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
        struct list_head *unused = &fs_info->unused_bgs;
        struct btrfs_block_group *bg, *tmp;

        spin_lock(&fs_info->unused_bgs_lock);

        /* Async discard is on: every unused block group goes to the queue. */
        list_for_each_entry_safe(bg, tmp, unused, bg_list) {
                list_del_init(&bg->bg_list);
                btrfs_discard_queue_work(discard_ctl, bg);

                /*
                 * Drop the reference taken by btrfs_mark_bg_unused; queueing
                 * for discard took its own reference.
                 */
                btrfs_put_block_group(bg);
        }

        spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * Purge discard lists.
 *
 * @discard_ctl: discard control
 *
 * If we are disabling async discard, we may have intercepted block groups that
 * are completely free and ready for the unused_bgs path.  As discarding will
 * now happen in transaction commit or not at all, we can safely mark the
 * corresponding block groups as unused and they will be sent on their merry
 * way to the unused_bgs list.
 */
static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
{
        struct btrfs_block_group *block_group, *next;
        int i;

        spin_lock(&discard_ctl->lock);
        /* Walk every discard list and empty it. */
        for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
                list_for_each_entry_safe(block_group, next,
                                         &discard_ctl->discard_list[i],
                                         discard_list) {
                        list_del_init(&block_group->discard_list);
                        /*
                         * The discard lock is released around
                         * btrfs_mark_bg_unused() and retaken afterwards.
                         * NOTE(review): 'next' was sampled before the unlock,
                         * so this relies on nobody else changing the list in
                         * that window; the only caller visible here,
                         * btrfs_discard_cleanup(), stops and cancels the
                         * discard work first - confirm no other path can race.
                         */
                        spin_unlock(&discard_ctl->lock);
                        /* Only completely empty block groups become unused. */
                        if (block_group->used == 0)
                                btrfs_mark_bg_unused(block_group);
                        spin_lock(&discard_ctl->lock);
                        /*
                         * Presumably drops the reference the discard list was
                         * holding on the block group - confirm against
                         * btrfs_discard_queue_work().
                         */
                        btrfs_put_block_group(block_group);
                }
        }
        spin_unlock(&discard_ctl->lock);
}

/*
 * (Re)start or tear down async discard according to the DISCARD_ASYNC mount
 * option: with it set, unused block groups are punted to the discard queue
 * and the running flag is raised; without it, discard is cleaned up.
 */
void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
{
        if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
                btrfs_discard_punt_unused_bgs_list(fs_info);
                set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
                return;
        }

        btrfs_discard_cleanup(fs_info);
}

/*
 * Mark async discard as not running by clearing BTRFS_FS_DISCARD_RUNNING in
 * fs_info->flags.  Does not cancel work or touch the discard lists itself.
 */
void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
{
        clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
}

/*
 * Set up fs_info->discard_ctl: lock, delayed work, empty discard lists and
 * default limits/statistics.
 */
void btrfs_discard_init(struct btrfs_fs_info *fs_info)
{
        struct btrfs_discard_ctl *ctl = &fs_info->discard_ctl;
        int idx;

        spin_lock_init(&ctl->lock);
        INIT_DELAYED_WORK(&ctl->work, btrfs_discard_workfn);

        for (idx = 0; idx < BTRFS_NR_DISCARD_LISTS; idx++)
                INIT_LIST_HEAD(&ctl->discard_list[idx]);

        /* Bookkeeping of the previous discard. */
        ctl->prev_discard = 0;
        ctl->prev_discard_time = 0;

        /* Global discardable counters start empty. */
        atomic_set(&ctl->discardable_extents, 0);
        atomic64_set(&ctl->discardable_bytes, 0);

        /* Default tunables. */
        ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
        ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
        ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
        ctl->kbps_limit = 0;

        /* Statistics. */
        ctl->discard_extent_bytes = 0;
        ctl->discard_bitmap_bytes = 0;
        atomic64_set(&ctl->discard_bytes_saved, 0);
}

void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
{
        btrfs_discard_stop(fs_info);
        cancel_delayed_work_sync(&fs_info->discard_ctl.work);
        btrfs_discard_purge_list(&fs_info->discard_ctl);
}
















































    8 













   29 







   27 










    8 



    8 






    1 





    8 






   28 















   30 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_RWSEM_H
#define _LINUX_PERCPU_RWSEM_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/rcuwait.h>
#include <linux/wait.h>
#include <linux/rcu_sync.h>
#include <linux/lockdep.h>

/*
 * Reader/writer semaphore whose reader count is kept per CPU, so that the
 * reader fast path only touches CPU-local data.
 */
struct percpu_rw_semaphore {
        /* Readers take the per-CPU fast path while rcu_sync_is_idle(&rss). */
        struct rcu_sync                rss;
        /* Per-CPU reader count, bumped by down_read and dropped by up_read. */
        unsigned int __percpu        *read_count;
        /* Woken by percpu_up_read() on the slow path. */
        struct rcuwait                writer;
        /* Wait queue; presumably for blocked lockers - confirm in the .c file. */
        wait_queue_head_t        waiters;
        /* Non-zero is reported as "write locked" by percpu_is_write_locked(). */
        atomic_t                block;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* lockdep tracking state, only present with CONFIG_DEBUG_LOCK_ALLOC. */
        struct lockdep_map        dep_map;
#endif
};

/*
 * Designated initializer for the lockdep map of a statically defined
 * semaphore; expands to nothing when CONFIG_DEBUG_LOCK_ALLOC is off (the
 * dep_map member does not exist then).
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)        .dep_map = { .name = #lockname },
#else
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
#endif

/*
 * Define a percpu_rw_semaphore called @name together with its static per-CPU
 * reader counter (__percpu_rwsem_rc_<name>).  @is_static is pasted in front
 * of the semaphore definition, so it is either "static" or empty.  The
 * semaphore starts with block == 0.
 */
#define __DEFINE_PERCPU_RWSEM(name, is_static)                                \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);                \
is_static struct percpu_rw_semaphore name = {                                \
        .rss = __RCU_SYNC_INITIALIZER(name.rss),                        \
        .read_count = &__percpu_rwsem_rc_##name,                        \
        .writer = __RCUWAIT_INITIALIZER(name.writer),                        \
        .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),                \
        .block = ATOMIC_INIT(0),                                        \
        __PERCPU_RWSEM_DEP_MAP_INIT(name)                                \
}

/*
 * Convenience wrappers around __DEFINE_PERCPU_RWSEM(): the first defines a
 * semaphore with external linkage, the second a file-local (static) one.
 */
#define DEFINE_PERCPU_RWSEM(name)                \
        __DEFINE_PERCPU_RWSEM(name, /* not static */)
#define DEFINE_STATIC_PERCPU_RWSEM(name)        \
        __DEFINE_PERCPU_RWSEM(name, static)

extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);

/*
 * Acquire @sem for reading.  May sleep.
 *
 * Fast path: while rcu_sync_is_idle(&sem->rss), just increment this CPU's
 * read_count with preemption disabled.  Otherwise fall back to
 * __percpu_down_read(sem, false).
 */
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
        might_sleep();

        rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);

        preempt_disable();
        /*
         * We are in an RCU-sched read-side critical section, so the writer
         * cannot both change sem->state from readers_fast and start checking
         * counters while we are here. So if we see !sem->state, we know that
         * the writer won't be checking until we're past the preempt_enable()
         * and that once the synchronize_rcu() is done, the writer will see
         * anything we did within this RCU-sched read-side critical section.
         *
         * NOTE(review): "sem->state" / "readers_fast" do not exist in the
         * structure as declared in this header; the check below is
         * rcu_sync_is_idle(&sem->rss).  The wording looks historical -
         * confirm and update.
         */
        if (likely(rcu_sync_is_idle(&sem->rss)))
                this_cpu_inc(*sem->read_count);
        else
                __percpu_down_read(sem, false); /* Unconditional memory barrier */
        /*
         * The preempt_enable() prevents the compiler from
         * bleeding the critical section out.
         */
        preempt_enable();
}

/*
 * Try to acquire @sem for reading.
 *
 * Uses the same per-CPU fast path as percpu_down_read(); on the slow path the
 * result is whatever __percpu_down_read(sem, true) returns.  lockdep is only
 * told about the acquisition when it succeeded.
 *
 * Return: true if the read lock was taken, false otherwise.
 */
static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
        bool ret = true;

        preempt_disable();
        /*
         * Same as in percpu_down_read().
         */
        if (likely(rcu_sync_is_idle(&sem->rss)))
                this_cpu_inc(*sem->read_count);
        else
                ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
        preempt_enable();
        /*
         * The barrier() from preempt_enable() prevents the compiler from
         * bleeding the critical section out.
         */

        if (ret)
                rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);

        return ret;
}

/*
 * Release a read lock on @sem.
 *
 * Fast path: decrement this CPU's read_count.  Slow path (rcu_sync not idle):
 * issue a full barrier before the decrement and then wake the writer waiting
 * on sem->writer.
 */
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
        rwsem_release(&sem->dep_map, _RET_IP_);

        preempt_disable();
        /*
         * Same as in percpu_down_read().
         */
        if (likely(rcu_sync_is_idle(&sem->rss))) {
                this_cpu_dec(*sem->read_count);
        } else {
                /*
                 * slowpath; reader will only ever wake a single blocked
                 * writer.
                 */
                smp_mb(); /* B matches C */
                /*
                 * In other words, if they see our decrement (presumably to
                 * aggregate zero, as that is the only time it matters) they
                 * will also see our critical section.
                 */
                this_cpu_dec(*sem->read_count);
                rcuwait_wake_up(&sem->writer);
        }
        preempt_enable();
}

extern bool percpu_is_read_locked(struct percpu_rw_semaphore *);
extern void percpu_down_write(struct percpu_rw_semaphore *);
extern void percpu_up_write(struct percpu_rw_semaphore *);

/*
 * Return: true when sem->block is non-zero.  This is a plain atomic_read()
 * snapshot; the value may change right after it is read.
 */
static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem)
{
        return atomic_read(&sem->block);
}

extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
                                const char *, struct lock_class_key *);

extern void percpu_free_rwsem(struct percpu_rw_semaphore *);

/*
 * Initialise a dynamically set up semaphore.  Each expansion site gets its
 * own static lock_class_key, and the stringified argument is passed as the
 * name.  Evaluates to the int returned by __percpu_init_rwsem().
 */
#define percpu_init_rwsem(sem)                                        \
({                                                                \
        static struct lock_class_key rwsem_key;                        \
        __percpu_init_rwsem(sem, #sem, &rwsem_key);                \
})

#define percpu_rwsem_is_held(sem)        lockdep_is_held(sem)
#define percpu_rwsem_assert_held(sem)        lockdep_assert_held(sem)

/*
 * Tell lockdep that @sem is released, without touching the semaphore itself.
 * @read is accepted for symmetry with percpu_rwsem_acquire() but is not used.
 * @ip is the instruction pointer recorded by lockdep.
 */
static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
                                        bool read, unsigned long ip)
{
        lock_release(&sem->dep_map, ip);
}

/*
 * Tell lockdep that @sem is acquired, without touching the semaphore itself.
 * @read is forwarded to lock_acquire(); @ip is the recorded instruction
 * pointer.  NOTE(review): the literal 0, 1, ..., 1, NULL arguments are
 * presumably subclass, trylock, check and nest_lock - confirm against the
 * lock_acquire() prototype.
 */
static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
                                        bool read, unsigned long ip)
{
        lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
}

#endif



























   20 





































    4 
















   18 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BIT_SPINLOCK_H
#define __LINUX_BIT_SPINLOCK_H

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/atomic.h>
#include <linux/bug.h>

/*
 *  bit-based spin_lock()
 *
 * Spin until bit @bitnum of the word at @addr has been set by us.  Returns
 * with preemption disabled.
 *
 * Don't use this unless you really need to: spin_lock() and spin_unlock()
 * are significantly faster.
 */
static inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
        /*
         * Assuming the lock is uncontended, this never enters
         * the body of the outer loop. If it is contended, then
         * within the inner loop a non-atomic test is used to
         * busywait with less bus contention for a good time to
         * attempt to acquire the lock bit.
         */
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                /* Allow preemption while we only poll the bit. */
                preempt_enable();
                do {
                        cpu_relax();
                } while (test_bit(bitnum, addr));
                preempt_disable();
        }
#endif
        /* Without SMP or DEBUG_SPINLOCK the bit is never touched. */
        __acquire(bitlock);
}

/*
 * Single attempt to take the bit lock @bitnum in the word at @addr.
 *
 * Return true if it was acquired (preemption is then left disabled), 0 if the
 * bit was already set (preemption is re-enabled before returning).
 */
static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
{
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        if (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                preempt_enable();
                return 0;
        }
#endif
        /* Without SMP or DEBUG_SPINLOCK this always succeeds. */
        __acquire(bitlock);
        return 1;
}

/*
 *  bit-based spin_unlock()
 *
 * Clear bit @bitnum of the word at @addr and re-enable preemption.  With
 * CONFIG_DEBUG_SPINLOCK, BUGs if the bit is not currently set.
 */
static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        clear_bit_unlock(bitnum, addr);
#endif
        preempt_enable();
        __release(bitlock);
}

/*
 *  bit-based spin_unlock()
 *  non-atomic version, which can be used eg. if the bit lock itself is
 *  protecting the rest of the flags in the word.
 *
 *  Identical to bit_spin_unlock() except that the bit is cleared with
 *  __clear_bit_unlock() instead of clear_bit_unlock().
 */
static inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        __clear_bit_unlock(bitnum, addr);
#endif
        preempt_enable();
        __release(bitlock);
}

/*
 * Return true if the lock is held.
 *
 * The answer depends on the configuration:
 *  - SMP or DEBUG_SPINLOCK: the lock bit itself is tested;
 *  - otherwise, with CONFIG_PREEMPT_COUNT: the lock bit is never set, so the
 *    preempt count is returned instead (non-zero whenever preemption is
 *    disabled, which is not specific to this particular lock);
 *  - otherwise: always 1.
 */
static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        return test_bit(bitnum, addr);
#elif defined CONFIG_PREEMPT_COUNT
        return preempt_count();
#else
        return 1;
#endif
}

#endif /* __LINUX_BIT_SPINLOCK_H */













































































   23 













    2 

    3 






    3 
    8 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/pagevec.h
 *
 * In many places it is efficient to batch an operation up against multiple
 * folios.  A folio_batch is a container which is used for that.
 */

#ifndef _LINUX_PAGEVEC_H
#define _LINUX_PAGEVEC_H

#include <linux/types.h>

/* 31 pointers + header align the folio_batch structure to a power of two */
#define PAGEVEC_SIZE        31

struct folio;

/**
 * struct folio_batch - A collection of folios.
 * @nr: Number of entries currently stored in @folios.
 * @i: Index of the next entry folio_batch_next() will hand out.
 * @percpu_pvec_drained: Cleared by folio_batch_init() and left alone by
 *      folio_batch_reinit(); its users are outside this header.
 * @folios: Storage for up to PAGEVEC_SIZE folio pointers.
 *
 * The folio_batch is used to amortise the cost of retrieving and
 * operating on a set of folios.  The order of folios in the batch may be
 * significant (eg delete_from_page_cache_batch()).  Some users of the
 * folio_batch store "exceptional" entries in it which can be removed
 * by calling folio_batch_remove_exceptionals().
 */
struct folio_batch {
        unsigned char nr;
        unsigned char i;
        bool percpu_pvec_drained;
        struct folio *folios[PAGEVEC_SIZE];
};

/**
 * folio_batch_init() - Initialise a batch of folios
 * @fbatch: The folio batch.
 *
 * A freshly initialised folio_batch contains zero folios.  The read cursor
 * and the percpu_pvec_drained flag are reset as well; the folios[] array
 * itself is not cleared.
 */
static inline void folio_batch_init(struct folio_batch *fbatch)
{
        fbatch->nr = 0;
        fbatch->i = 0;
        fbatch->percpu_pvec_drained = false;
}

/**
 * folio_batch_reinit() - Empty a batch of folios for reuse
 * @fbatch: The folio batch.
 *
 * Resets the entry count and the read cursor to zero.  Unlike
 * folio_batch_init(), percpu_pvec_drained keeps its current value.
 */
static inline void folio_batch_reinit(struct folio_batch *fbatch)
{
        fbatch->nr = 0;
        fbatch->i = 0;
}

/**
 * folio_batch_count() - Number of entries in a batch
 * @fbatch: The folio batch.
 *
 * Return: The number of entries currently stored in the batch.
 */
static inline unsigned int folio_batch_count(struct folio_batch *fbatch)
{
        return fbatch->nr;
}

/**
 * folio_batch_space() - Free slots left in a batch
 * @fbatch: The folio batch.
 *
 * Return: PAGEVEC_SIZE minus the number of entries already stored.
 */
static inline unsigned int folio_batch_space(struct folio_batch *fbatch)
{
        return PAGEVEC_SIZE - fbatch->nr;
}

/**
 * folio_batch_add() - Append a folio to a batch.
 * @fbatch: The folio batch.
 * @folio: The folio to append.
 *
 * Stores @folio in the first unused slot.  The batch must already have been
 * set up with folio_batch_init().  No bounds check is done here, so the
 * caller must only add while there is space left.
 *
 * Return: The number of slots still available after the addition.
 */
static inline unsigned folio_batch_add(struct folio_batch *fbatch,
                struct folio *folio)
{
        unsigned char slot = fbatch->nr;

        fbatch->folios[slot] = folio;
        fbatch->nr = slot + 1;

        return folio_batch_space(fbatch);
}

/**
 * folio_batch_next - Fetch the next unprocessed folio.
 * @fbatch: The folio batch being processed.
 *
 * Lets a folio_batch be consumed as a queue: each call hands out the entry
 * at the read cursor and advances the cursor by one.
 *
 * Return: The next folio in the queue, or NULL once the cursor has caught up
 * with the number of stored entries.
 */
static inline struct folio *folio_batch_next(struct folio_batch *fbatch)
{
        struct folio *next = NULL;

        if (fbatch->i != fbatch->nr) {
                next = fbatch->folios[fbatch->i];
                fbatch->i++;
        }

        return next;
}

void __folio_batch_release(struct folio_batch *pvec);

/*
 * Release the contents of @fbatch via __folio_batch_release(), skipping the
 * call entirely when the batch holds no entries.
 */
static inline void folio_batch_release(struct folio_batch *fbatch)
{
        if (!folio_batch_count(fbatch))
                return;

        __folio_batch_release(fbatch);
}

void folio_batch_remove_exceptionals(struct folio_batch *fbatch);
#endif /* _LINUX_PAGEVEC_H */

















































































































    5 


    5 
























































































































































































































    3 
    2 











































































    4 



    4 





































    1 

    1 
    1 
    1 





    3 

    4 


    4 
    4 






















    4 















    4 


    4 
    4 

    4 
    4 






    3 
    3 























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic pidhash and scalable, time-bounded PID allocator
 *
 * (C) 2002-2003 Nadia Yvette Chambers, IBM
 * (C) 2004 Nadia Yvette Chambers, Oracle
 * (C) 2002-2004 Ingo Molnar, Red Hat
 *
 * pid-structures are backing objects for tasks sharing a given ID to chain
 * against. There is very little to them aside from hashing them and
 * parking tasks using given ID's on a list.
 *
 * The hash is always changed with the tasklist_lock write-acquired,
 * and the hash is only accessed with the tasklist_lock at least
 * read-acquired, so there's no additional SMP locking needed here.
 *
 * We have a list of bitmap pages, which bitmaps represent the PID space.
 * Allocating and freeing PIDs is completely lockless. The worst-case
 * allocation scenario when all but one out of 1 million PIDs possible are
 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
 *
 * Pid namespaces:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/memblock.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <linux/pidfs.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>

/*
 * Statically initialised struct pid: one reference, three empty task lists,
 * nesting level 0 and number 0 in init_pid_ns.
 */
struct pid init_struct_pid = {
        .count                = REFCOUNT_INIT(1),
        /* One (empty) hlist head per entry of tasks[]. */
        .tasks                = {
                { .first = NULL },
                { .first = NULL },
                { .first = NULL },
        },
        .level                = 0,
        /* Only numbers[0] is populated, matching .level == 0. */
        .numbers        = { {
                .nr                = 0,
                .ns                = &init_pid_ns,
        }, }
};

/*
 * Exclusive upper bound on PID values: alloc_pid() rejects an explicitly
 * requested tid that is >= pid_max.
 */
int pid_max = PID_MAX_DEFAULT;

/* Presumably the allowed range for tuning pid_max - confirm against the sysctl table. */
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
/*
 * Pseudo filesystems start inode numbering after one. We use Reserved
 * PIDs as a natural offset.
 */
static u64 pidfs_ino = RESERVED_PIDS;

/*
 * PID-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales to up to 4 million PIDs, runtime.
 *
 * NOTE(review): the paragraph above talks about bitmap pages, but the
 * initializer below sets up an IDR (.idr = IDR_INIT(...)).  The wording
 * looks like it predates the IDR - confirm and update.
 *
 * init_pid_ns is the level 0 PID namespace; it is exported GPL-only.
 */
struct pid_namespace init_pid_ns = {
        .ns.count = REFCOUNT_INIT(2),
        .idr = IDR_INIT(init_pid_ns.idr),
        .pid_allocated = PIDNS_ADDING,
        .level = 0,
        .child_reaper = &init_task,
        .user_ns = &init_user_ns,
        .ns.inum = PROC_PID_INIT_INO,
#ifdef CONFIG_PID_NS
        .ns.ops = &pidns_operations,
#endif
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
        .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);

/*
 * Note: disable interrupts while the pidmap_lock is held as an
 * interrupt might come in and do read_lock(&tasklist_lock).
 *
 * If we don't disable interrupts there is a nasty deadlock between
 * detach_pid()->free_pid() and another cpu that does
 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
 * read_lock(&tasklist_lock);
 *
 * After we clean up the tasklist_lock and know there are no
 * irq handlers that take it we can leave the interrupts enabled.
 * For now it is easier to be safe than to prove it can't happen.
 *
 * Accordingly, free_pid() takes the lock with spin_lock_irqsave() and
 * alloc_pid() with spin_lock_irq().
 */

static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

void put_pid(struct pid *pid)
{
        struct pid_namespace *ns;

        if (!pid)
                return;

        ns = pid->numbers[pid->level].ns;
        if (refcount_dec_and_test(&pid->count)) {
                kmem_cache_free(ns->pid_cachep, pid);
                put_pid_ns(ns);
        }
}
EXPORT_SYMBOL_GPL(put_pid);

/* RCU callback: drop the reference on the struct pid that embeds @rhp. */
static void delayed_put_pid(struct rcu_head *rhp)
{
        put_pid(container_of(rhp, struct pid, rcu));
}

/*
 * Release the numbers held by @pid in every namespace level it lives in and
 * schedule the final put_pid() via call_rcu().
 */
void free_pid(struct pid *pid)
{
        /* We can be called with write_lock_irq(&tasklist_lock) held */
        int i;
        unsigned long flags;

        spin_lock_irqsave(&pidmap_lock, flags);
        /* Level 0 up to and including pid->level. */
        for (i = 0; i <= pid->level; i++) {
                struct upid *upid = pid->numbers + i;
                struct pid_namespace *ns = upid->ns;
                /* The switch acts on the counter value after the decrement. */
                switch (--ns->pid_allocated) {
                case 2:
                case 1:
                        /* When all that is left in the pid namespace
                         * is the reaper wake up the reaper.  The reaper
                         * may be sleeping in zap_pid_ns_processes().
                         */
                        wake_up_process(ns->child_reaper);
                        break;
                case PIDNS_ADDING:
                        /* Handle a fork failure of the first process */
                        WARN_ON(ns->child_reaper);
                        ns->pid_allocated = 0;
                        break;
                }

                /* Give the number back to this namespace's IDR. */
                idr_remove(&ns->idr, upid->nr);
        }
        spin_unlock_irqrestore(&pidmap_lock, flags);

        /* The reference is dropped from delayed_put_pid() once call_rcu() invokes it. */
        call_rcu(&pid->rcu, delayed_put_pid);
}

/*
 * alloc_pid() - allocate a struct pid with a number in @ns and each ancestor.
 * @ns:           most nested pid namespace the new pid lives in
 * @set_tid:      optional array of requested numbers, most nested namespace
 *                first; only read when @set_tid_size is non-zero
 * @set_tid_size: number of entries in @set_tid, at most ns->level + 1
 *
 * Numbers are first reserved in each namespace's IDR with a NULL pointer and
 * only replaced by the real struct pid once the structure is fully
 * initialised, so lookups never see a half-built pid.
 *
 * Return: the new pid (reference count 1) or an ERR_PTR():
 *   -EINVAL  @set_tid_size too large, a requested number is out of range, or
 *            a number other than 1 was requested in a namespace without a
 *            child reaper
 *   -EPERM   caller lacks checkpoint/restore capability for a requested number
 *   -EEXIST  a requested number is already in use
 *   -EAGAIN  no free number in the cyclic allocation range
 *   -ENOMEM  slab allocation failed, or @ns no longer accepts new pids
 *   any other negative value returned by the IDR allocator
 */
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                      size_t set_tid_size)
{
        struct pid *pid;
        enum pid_type type;
        int i, nr;
        struct pid_namespace *tmp;
        struct upid *upid;
        int retval = -ENOMEM;

        /*
         * set_tid_size contains the size of the set_tid array. Starting at
         * the most nested currently active PID namespace it tells alloc_pid()
         * which PID to set for a process in that most nested PID namespace
         * up to set_tid_size PID namespaces. It does not have to set the PID
         * for a process in all nested PID namespaces but set_tid_size must
         * never be greater than the current ns->level + 1.
         */
        if (set_tid_size > ns->level + 1)
                return ERR_PTR(-EINVAL);

        pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
        if (!pid)
                return ERR_PTR(retval);

        tmp = ns;
        pid->level = ns->level;

        /* Walk from the most nested namespace up to level 0 via ->parent. */
        for (i = ns->level; i >= 0; i--) {
                int tid = 0;

                if (set_tid_size) {
                        tid = set_tid[ns->level - i];

                        retval = -EINVAL;
                        if (tid < 1 || tid >= pid_max)
                                goto out_free;
                        /*
                         * Also fail if a PID != 1 is requested and
                         * no PID 1 exists.
                         */
                        if (tid != 1 && !tmp->child_reaper)
                                goto out_free;
                        retval = -EPERM;
                        if (!checkpoint_restore_ns_capable(tmp->user_ns))
                                goto out_free;
                        set_tid_size--;
                }

                idr_preload(GFP_KERNEL);
                spin_lock_irq(&pidmap_lock);

                if (tid) {
                        /* Range [tid, tid + 1): only the requested number. */
                        nr = idr_alloc(&tmp->idr, NULL, tid,
                                       tid + 1, GFP_ATOMIC);
                        /*
                         * If ENOSPC is returned it means that the PID is
                         * already in use. Return EEXIST in that case.
                         */
                        if (nr == -ENOSPC)
                                nr = -EEXIST;
                } else {
                        int pid_min = 1;
                        /*
                         * init really needs pid 1, but after reaching the
                         * maximum wrap back to RESERVED_PIDS
                         */
                        if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
                                pid_min = RESERVED_PIDS;

                        /*
                         * Store a null pointer so find_pid_ns does not find
                         * a partially initialized PID (see below).
                         */
                        nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
                                              pid_max, GFP_ATOMIC);
                }
                spin_unlock_irq(&pidmap_lock);
                idr_preload_end();

                if (nr < 0) {
                        /* Range exhaustion is reported to the caller as -EAGAIN. */
                        retval = (nr == -ENOSPC) ? -EAGAIN : nr;
                        goto out_free;
                }

                pid->numbers[i].nr = nr;
                pid->numbers[i].ns = tmp;
                tmp = tmp->parent;
        }

        /*
         * ENOMEM is not the most obvious choice especially for the case
         * where the child subreaper has already exited and the pid
         * namespace denies the creation of any new processes. But ENOMEM
         * is what we have exposed to userspace for a long time and it is
         * documented behavior for pid namespaces. So we can't easily
         * change it even if there were an error code better suited.
         */
        retval = -ENOMEM;

        get_pid_ns(ns);
        refcount_set(&pid->count, 1);
        spin_lock_init(&pid->lock);
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);

        init_waitqueue_head(&pid->wait_pidfd);
        INIT_HLIST_HEAD(&pid->inodes);

        upid = pid->numbers + ns->level;
        spin_lock_irq(&pidmap_lock);
        /* Bail out if the PIDNS_ADDING bit has been cleared for @ns. */
        if (!(ns->pid_allocated & PIDNS_ADDING))
                goto out_unlock;
        pid->stashed = NULL;
        pid->ino = ++pidfs_ino;
        for ( ; upid >= pid->numbers; --upid) {
                /* Make the PID visible to find_pid_ns. */
                idr_replace(&upid->ns->idr, pid, upid->nr);
                upid->ns->pid_allocated++;
        }
        spin_unlock_irq(&pidmap_lock);

        return pid;

out_unlock:
        spin_unlock_irq(&pidmap_lock);
        put_pid_ns(ns);

out_free:
        /*
         * Release the numbers reserved so far: levels i + 1 .. ns->level.
         * When arriving from out_unlock the loop above has finished with
         * i == -1, so every level is released.
         */
        spin_lock_irq(&pidmap_lock);
        while (++i <= ns->level) {
                upid = pid->numbers + i;
                idr_remove(&upid->ns->idr, upid->nr);
        }

        /* On failure to allocate the first pid, reset the state */
        if (ns->pid_allocated == PIDNS_ADDING)
                idr_set_cursor(&ns->idr, 0);

        spin_unlock_irq(&pidmap_lock);

        kmem_cache_free(ns->pid_cachep, pid);
        return ERR_PTR(retval);
}

/*
 * Clear the PIDNS_ADDING bit of @ns under pidmap_lock.  alloc_pid() checks
 * that bit under the same lock and fails once it is clear.
 */
void disable_pid_allocation(struct pid_namespace *ns)
{
        spin_lock_irq(&pidmap_lock);
        ns->pid_allocated = ns->pid_allocated & ~PIDNS_ADDING;
        spin_unlock_irq(&pidmap_lock);
}

/* Look up number @nr in the IDR of @ns; returns whatever the IDR holds. */
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
        struct pid *found = idr_find(&ns->idr, nr);

        return found;
}
EXPORT_SYMBOL_GPL(find_pid_ns);

/* Look up @nr in the active pid namespace of the current task. */
struct pid *find_vpid(int nr)
{
        struct pid_namespace *active_ns = task_active_pid_ns(current);

        return find_pid_ns(nr, active_ns);
}
EXPORT_SYMBOL_GPL(find_vpid);

/*
 * Return the address of the struct pid pointer for @type: PIDTYPE_PID is
 * stored in the task itself, every other type in the task's signal struct.
 */
static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
{
        if (type == PIDTYPE_PID)
                return &task->thread_pid;

        return &task->signal->pids[type];
}

/*
 * Link @task onto the @type task list of the pid it currently points to.
 *
 * attach_pid() must be called with the tasklist_lock write-held.
 */
void attach_pid(struct task_struct *task, enum pid_type type)
{
        struct pid *owner = *task_pid_ptr(task, type);
        struct hlist_head *list = &owner->tasks[type];

        hlist_add_head_rcu(&task->pid_links[type], list);
}

/*
 * Unlink @task from the @type list of its current pid and make the task's
 * @type pid pointer refer to @new (which may be NULL).  If the old pid is
 * then no longer used by any task for any pid type, it is freed.
 */
static void __change_pid(struct task_struct *task, enum pid_type type,
                        struct pid *new)
{
        struct pid **pid_ptr = task_pid_ptr(task, type);
        struct pid *pid;
        int tmp;

        /* Remember the old pid before the pointer is overwritten below. */
        pid = *pid_ptr;

        hlist_del_rcu(&task->pid_links[type]);
        *pid_ptr = new;

        if (type == PIDTYPE_PID) {
                /* The old pid is expected to have no PIDTYPE_PID task left. */
                WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
                wake_up_all(&pid->wait_pidfd);
        }

        /* Keep the old pid while any type still has a task attached. */
        for (tmp = PIDTYPE_MAX; --tmp >= 0; )
                if (pid_has_task(pid, tmp))
                        return;

        free_pid(pid);
}

/* Detach @task from its @type pid, leaving the task's pointer NULL. */
void detach_pid(struct task_struct *task, enum pid_type type)
{
        struct pid *none = NULL;

        __change_pid(task, type, none);
}

/*
 * Move @task from its current @type pid to @pid: the old link is removed
 * first, then the task is attached to the new pid's list.
 */
void change_pid(struct task_struct *task, enum pid_type type,
                struct pid *pid)
{
        /* Step 1: drop the old association and install @pid. */
        __change_pid(task, type, pid);

        /* Step 2: link the task onto the list of the pid just installed. */
        attach_pid(task, type);
}

/*
 * Swap the PIDTYPE_PID identity of @left and @right: the per-pid task list
 * heads, the tasks' thread_pid pointers and the cached numeric pid fields.
 * NOTE(review): the locking the caller must hold is not visible here -
 * confirm against the callers.
 */
void exchange_tids(struct task_struct *left, struct task_struct *right)
{
        struct pid *pid1 = left->thread_pid;
        struct pid *pid2 = right->thread_pid;
        struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
        struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];

        /* Swap the single entry tid lists */
        hlists_swap_heads_rcu(head1, head2);

        /* Swap the per task_struct pid */
        rcu_assign_pointer(left->thread_pid, pid2);
        rcu_assign_pointer(right->thread_pid, pid1);

        /* Swap the cached value */
        WRITE_ONCE(left->pid, pid_nr(pid2));
        WRITE_ONCE(right->pid, pid_nr(pid1));
}

/*
 * transfer_pid is an optimization of attach_pid(new), detach_pid(old):
 * @new takes over @old's place on the @type list in a single replace.
 * Not meant for PIDTYPE_PID, which triggers a one-time warning.
 */
void transfer_pid(struct task_struct *old, struct task_struct *new,
                           enum pid_type type)
{
        struct hlist_node *outgoing = &old->pid_links[type];
        struct hlist_node *incoming = &new->pid_links[type];

        WARN_ON_ONCE(type == PIDTYPE_PID);
        hlist_replace_rcu(outgoing, incoming);
}

/*
 * Return the first task on the @type list of @pid, or NULL when @pid is
 * NULL or the list is empty.  The list head is read with
 * rcu_dereference_check(), which also accepts tasklist_lock being held.
 */
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
        struct hlist_node *node;

        if (!pid)
                return NULL;

        node = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
                                     lockdep_tasklist_lock_is_held());
        if (!node)
                return NULL;

        return hlist_entry(node, struct task_struct, pid_links[type]);
}
EXPORT_SYMBOL(pid_task);

/*
 * Find the task whose PIDTYPE_PID number is @nr in namespace @ns.
 *
 * Must be called under rcu_read_lock().
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
        struct pid *pid;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "find_task_by_pid_ns() needs rcu_read_lock() protection");

        pid = find_pid_ns(nr, ns);
        return pid_task(pid, PIDTYPE_PID);
}

/* find_task_by_pid_ns() in the current task's active pid namespace. */
struct task_struct *find_task_by_vpid(pid_t vnr)
{
        struct pid_namespace *active_ns = task_active_pid_ns(current);

        return find_task_by_pid_ns(vnr, active_ns);
}

/*
 * Look up the task with number @nr in the caller's active pid namespace and
 * take a reference on it.  Returns NULL when no such task is found.
 */
struct task_struct *find_get_task_by_vpid(pid_t nr)
{
        struct task_struct *found;

        rcu_read_lock();
        found = find_task_by_pid_ns(nr, task_active_pid_ns(current));
        if (found != NULL)
                get_task_struct(found);
        rcu_read_unlock();

        return found;
}

/*
 * Read @task's pid of the given @type under rcu_read_lock() and pass it
 * through get_pid(); the value get_pid() returns is handed to the caller.
 */
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
        struct pid **slot;
        struct pid *ref;

        rcu_read_lock();
        slot = task_pid_ptr(task, type);
        ref = get_pid(rcu_dereference(*slot));
        rcu_read_unlock();

        return ref;
}
EXPORT_SYMBOL_GPL(get_task_pid);

struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
        struct task_struct *result;
        rcu_read_lock();
        result = pid_task(pid, type);
        if (result)
                get_task_struct(result);
        rcu_read_unlock();
        return result;
}
EXPORT_SYMBOL_GPL(get_pid_task);

/*
 * Look up @nr in the caller's active pid namespace under rcu_read_lock()
 * and pass the result through get_pid() before returning it.
 */
struct pid *find_get_pid(pid_t nr)
{
        struct pid *looked_up;
        struct pid *ref;

        rcu_read_lock();
        looked_up = find_vpid(nr);
        ref = get_pid(looked_up);
        rcu_read_unlock();

        return ref;
}
EXPORT_SYMBOL_GPL(find_get_pid);

/*
 * Return the number @pid has in namespace @ns.  The result is 0 when @pid
 * is NULL, when @ns is nested deeper than @pid, or when the entry at @ns's
 * level belongs to a different namespace.
 */
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
        struct upid *slot;

        if (!pid || ns->level > pid->level)
                return 0;

        slot = &pid->numbers[ns->level];
        if (slot->ns != ns)
                return 0;

        return slot->nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);

/* Number of @pid as seen from the current task's active pid namespace. */
pid_t pid_vnr(struct pid *pid)
{
        struct pid_namespace *active_ns = task_active_pid_ns(current);

        return pid_nr_ns(pid, active_ns);
}
EXPORT_SYMBOL_GPL(pid_vnr);

pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
                        struct pid_namespace *ns)
{
        pid_t nr = 0;

        rcu_read_lock();
        if (!ns)
                ns = task_active_pid_ns(current);
        nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
        rcu_read_unlock();

        return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);

/* Namespace of @tsk's own pid, as reported by ns_of_pid(). */
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
        struct pid *own_pid = task_pid(tsk);

        return ns_of_pid(own_pid);
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);

/*
 * Used by proc to find the first pid that is greater than or equal to nr.
 *
 * When a pid exists at exactly nr, the result matches find_pid_ns().
 */
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
        struct pid *next = idr_get_next(&ns->idr, &nr);

        return next;
}
EXPORT_SYMBOL_GPL(find_ge_pid);

struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
        struct fd f;
        struct pid *pid;

        f = fdget(fd);
        if (!f.file)
                return ERR_PTR(-EBADF);

        pid = pidfd_pid(f.file);
        if (!IS_ERR(pid)) {
                get_pid(pid);
                *flags = f.file->f_flags;
        }

        fdput(f);
        return pid;
}

/**
 * pidfd_get_task() - Get the task associated with a pidfd
 *
 * @pidfd: pidfd for which to get the task
 * @flags: flags associated with this pidfd
 *
 * Return the task associated with @pidfd. The function takes a reference on
 * the returned task. The caller is responsible for releasing that reference.
 *
 * Return: On success, the task_struct associated with the pidfd.
 *           On error, a negative errno number will be returned.
 */
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
        unsigned int f_flags;
        struct pid *pid;
        struct task_struct *task;

        pid = pidfd_get_pid(pidfd, &f_flags);
        if (IS_ERR(pid))
                return ERR_CAST(pid);

        task = get_pid_task(pid, PIDTYPE_TGID);
        put_pid(pid);
        if (!task)
                return ERR_PTR(-ESRCH);

        *flags = f_flags;
        return task;
}

/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid:   struct pid that the pidfd will reference
 * @flags: flags to pass
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
 *
 * Note, that this function can only be called after the fd table has
 * been unshared to avoid leaking the pidfd to the new process.
 *
 * This symbol should not be explicitly exported to loadable modules.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
static int pidfd_create(struct pid *pid, unsigned int flags)
{
        struct file *file;
        int fd = pidfd_prepare(pid, flags, &file);

        /* Only a successfully prepared descriptor gets its file installed. */
        if (fd >= 0)
                fd_install(fd, file);

        return fd;
}

/**
 * sys_pidfd_open() - Open new pid file descriptor.
 *
 * @pid:   pid for which to retrieve a pidfd
 * @flags: flags to pass; only PIDFD_NONBLOCK and PIDFD_THREAD are accepted
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set for
 * the task identified by @pid. Without PIDFD_THREAD flag the target task
 * must be a thread-group leader.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned:
 *         -EINVAL for unknown flags or a non-positive @pid, -ESRCH when
 *         @pid cannot be found.
 */
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
        const unsigned int valid_flags = PIDFD_NONBLOCK | PIDFD_THREAD;
        struct pid *target;
        int pidfd;

        if ((flags & ~valid_flags) || pid <= 0)
                return -EINVAL;

        target = find_get_pid(pid);
        if (target == NULL)
                return -ESRCH;

        pidfd = pidfd_create(target, flags);
        put_pid(target);

        return pidfd;
}

/*
 * Boot-time setup: scale pid_max and pid_max_min with the number of
 * possible CPUs, initialise the IDR of init_pid_ns and create its "pid"
 * slab cache.
 */
void __init pid_idr_init(void)
{
        /* Verify no one has done anything silly: */
        BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);

        /* bump default and minimum pid_max based on number of cpus */
        pid_max = min(pid_max_max, max_t(int, pid_max,
                                PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
        pid_max_min = max_t(int, pid_max_min,
                                PIDS_PER_CPU_MIN * num_possible_cpus());
        pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);

        idr_init(&init_pid_ns.idr);

        /* Object size covers struct pid with one entry in numbers[]. */
        init_pid_ns.pid_cachep = kmem_cache_create("pid",
                        struct_size_t(struct pid, numbers, 1),
                        __alignof__(struct pid),
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
                        NULL);
}

/*
 * Fetch file descriptor @fd from @task's file table.
 *
 * The lookup runs with @task's exec_update_lock held for reading and only
 * if ptrace_may_access() grants PTRACE_MODE_ATTACH_REALCREDS.
 *
 * Returns the file, or an ERR_PTR(): the error from down_read_killable(),
 * -EPERM when access is denied, -ESRCH when the descriptor is missing and
 * the task is exiting, -EBADF when it is missing otherwise.
 */
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
        struct file *file;
        int ret;

        ret = down_read_killable(&task->signal->exec_update_lock);
        if (ret)
                return ERR_PTR(ret);

        if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
                file = fget_task(task, fd);
        else
                file = ERR_PTR(-EPERM);

        up_read(&task->signal->exec_update_lock);

        /* NULL here can only come from fget_task(); map it to an errno. */
        if (!file) {
                /*
                 * It is possible that the target thread is exiting; it can be
                 * either:
                 * 1. before exit_signals(), which gives a real fd
                 * 2. before exit_files() takes the task_lock() gives a real fd
                 * 3. after exit_files() releases task_lock(), ->files is NULL;
                 *    this has PF_EXITING, since it was set in exit_signals(),
                 *    __pidfd_fget() returns EBADF.
                 * In case 3 we get EBADF, but that really means ESRCH, since
                 * the task is currently exiting and has freed its files
                 * struct, so we fix it up.
                 */
                if (task->flags & PF_EXITING)
                        file = ERR_PTR(-ESRCH);
                else
                        file = ERR_PTR(-EBADF);
        }

        return file;
}

/*
 * Duplicate descriptor @fd of the PIDTYPE_PID task behind @pid into the
 * calling process with O_CLOEXEC set.
 *
 * Returns the new descriptor, -ESRCH when @pid has no task, the error from
 * __pidfd_fget(), or the result of receive_fd().
 */
static int pidfd_getfd(struct pid *pid, int fd)
{
        struct task_struct *target = get_pid_task(pid, PIDTYPE_PID);
        struct file *remote;
        int new_fd;

        if (!target)
                return -ESRCH;

        remote = __pidfd_fget(target, fd);
        put_task_struct(target);
        if (IS_ERR(remote))
                return PTR_ERR(remote);

        new_fd = receive_fd(remote, NULL, O_CLOEXEC);
        /* Drop the reference held on the looked-up file. */
        fput(remote);

        return new_fd;
}

/**
 * sys_pidfd_getfd() - Get a file descriptor from another process
 *
 * @pidfd:        the pidfd file descriptor of the process
 * @fd:                the file descriptor number to get
 * @flags:        flags on how to get the fd (reserved, must be 0)
 *
 * This syscall gets a copy of a file descriptor from another process
 * based on the pidfd, and file descriptor number. It requires that
 * the calling process has the ability to ptrace the process represented
 * by the pidfd. The process which is having its file descriptor copied
 * is otherwise unaffected.
 *
 * Return: On success, a cloexec file descriptor is returned.
 *         On error, a negative errno number will be returned.
 */
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
                unsigned int, flags)
{
        struct pid *target;
        struct fd f;
        int ret;

        /* No flags are defined yet, so any set bit is rejected. */
        if (flags)
                return -EINVAL;

        f = fdget(pidfd);
        if (!f.file)
                return -EBADF;

        target = pidfd_pid(f.file);
        ret = IS_ERR(target) ? PTR_ERR(target) : pidfd_getfd(target, fd);

        fdput(f);
        return ret;
}








































































































































    1 














































    1 

















































































































































































    1 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_UACCESS_H__
#define __LINUX_UACCESS_H__

#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/minmax.h>
#include <linux/sched.h>
#include <linux/thread_info.h>

#include <asm/uaccess.h>

/*
 * Architectures that support memory tagging (assigning tags to memory regions,
 * embedding these tags into addresses that point to these memory regions, and
 * checking that the memory and the pointer tags match on memory accesses)
 * redefine this macro to strip tags from pointers.
 *
 * Passing down an mm_struct (see untagged_addr_remote()) allows untagging
 * rules to be defined on a per-process basis.
 *
 * It is defined as a no-op for architectures that don't support memory
 * tagging.
 */
#ifndef untagged_addr
#define untagged_addr(addr) (addr)
#endif

/*
 * Default untagged_addr_remote(): asserts that @mm's mmap lock is held and
 * then evaluates to untagged_addr(@addr).
 */
#ifndef untagged_addr_remote
#define untagged_addr_remote(mm, addr)        ({                \
        mmap_assert_locked(mm);                                \
        untagged_addr(addr);                                \
})
#endif

/*
 * Architectures should provide two primitives (raw_copy_{to,from}_user())
 * and get rid of their private instances of copy_{to,from}_user() and
 * __copy_{to,from}_user{,_inatomic}().
 *
 * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and
 * return the amount left to copy.  They should assume that access_ok() has
 * already been checked (and succeeded); they should *not* zero-pad anything.
 * No KASAN or object size checks either - those belong here.
 *
 * Both of these functions should attempt to copy size bytes starting at from
 * into the area starting at to.  They must not fetch or store anything
 * outside of those areas.  Return value must be between 0 (everything
 * copied successfully) and size (nothing copied).
 *
 * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting
 * at to must become equal to the bytes fetched from the corresponding area
 * starting at from.  All data past to + size - N must be left unmodified.
 *
 * If copying succeeds, the return value must be 0.  If some data cannot be
 * fetched, it is permitted to copy less than had been fetched; the only
 * hard requirement is that not storing anything at all (i.e. returning size)
 * should happen only when nothing could be copied.  In other words, you don't
 * have to squeeze as much as possible - it is allowed, but not necessary.
 *
 * For raw_copy_from_user(), 'to' always points to kernel memory and no faults
 * on store should happen.  Interpretation of 'from' is affected by set_fs().
 * For raw_copy_to_user() it's the other way round.
 *
 * Both can be inlined - it's up to each architecture whether it wants to
 * bother with that.  They should not be used directly; they are used to
 * implement the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user(),
 * __copy_{to,from}_user_inatomic()) that are used instead.  Out of those,
 * the __... ones are inlined.  Plain copy_{to,from}_user() might or might not
 * be inlined.  If you want them inlined, have asm/uaccess.h define
 * INLINE_COPY_{TO,FROM}_USER.
 *
 * NOTE: only copy_from_user() zero-pads the destination in case of short copy.
 * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything
 * at all; their callers absolutely must check the return value.
 *
 * Biarch ones should also provide raw_copy_in_user() - similar to the above,
 * but both source and destination are __user pointers (affected by set_fs()
 * as usual) and both source and destination can trigger faults.
 */

/*
 * Copy @n bytes from user address @from to kernel buffer @to without an
 * access_ok() check.  Returns whatever raw_copy_from_user() returns (the
 * amount left to copy); the destination is not zero-padded.
 */
static __always_inline __must_check unsigned long
__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
{
        unsigned long uncopied;

        instrument_copy_from_user_before(to, from, n);
        check_object_size(to, n, false);

        uncopied = raw_copy_from_user(to, from, n);
        instrument_copy_from_user_after(to, from, n, uncopied);

        return uncopied;
}

/*
 * Variant of __copy_from_user_inatomic() that may fault (might_fault()) and
 * honours usercopy fault injection: an injected failure returns @n without
 * copying anything.  No access_ok() check and no zero-padding is done here.
 */
static __always_inline __must_check unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
        unsigned long uncopied;

        might_fault();
        instrument_copy_from_user_before(to, from, n);
        if (should_fail_usercopy())
                return n;

        check_object_size(to, n, false);
        uncopied = raw_copy_from_user(to, from, n);
        instrument_copy_from_user_after(to, from, n, uncopied);

        return uncopied;
}

/**
 * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
 * @to:   Destination address, in user space.
 * @from: Source address, in kernel space.
 * @n:    Number of bytes to copy.
 *
 * Context: User context only.
 *
 * Copy data from kernel space to user space.  The caller must check
 * the specified block with access_ok() before calling this function.
 * The caller should also pin the user space address so that the copy
 * does not take a page fault and sleep.
 *
 * Returns @n when usercopy fault injection triggers, otherwise the value
 * returned by raw_copy_to_user().
 */
static __always_inline __must_check unsigned long
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
{
        unsigned long uncopied = n;

        if (!should_fail_usercopy()) {
                instrument_copy_to_user(to, from, n);
                check_object_size(from, n, true);
                uncopied = raw_copy_to_user(to, from, n);
        }

        return uncopied;
}

/*
 * Like __copy_to_user_inatomic(), but annotated with might_fault().
 * Returns @n when usercopy fault injection triggers, otherwise the value
 * returned by raw_copy_to_user().  No access_ok() check is done here.
 */
static __always_inline __must_check unsigned long
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
        unsigned long uncopied = n;

        might_fault();
        if (!should_fail_usercopy()) {
                instrument_copy_to_user(to, from, n);
                check_object_size(from, n, true);
                uncopied = raw_copy_to_user(to, from, n);
        }

        return uncopied;
}

#ifdef INLINE_COPY_FROM_USER
/*
 * Inline body of copy_from_user(): checks access_ok(), copies, and zeroes
 * the tail of @to that could not be filled.  Returns the number of bytes
 * not copied (@n when injection or access_ok() rejects the request).
 */
static inline __must_check unsigned long
_copy_from_user(void *to, const void __user *from, unsigned long n)
{
        unsigned long uncopied = n;

        might_fault();
        if (should_fail_usercopy() || unlikely(!access_ok(from, n)))
                goto pad;

        instrument_copy_from_user_before(to, from, n);
        uncopied = raw_copy_from_user(to, from, n);
        instrument_copy_from_user_after(to, from, n, uncopied);

pad:
        /* Zero whatever part of the destination was left unwritten. */
        if (unlikely(uncopied))
                memset(to + (n - uncopied), 0, uncopied);

        return uncopied;
}
#else
/* Out-of-line version, defined elsewhere. */
extern __must_check unsigned long
_copy_from_user(void *, const void __user *, unsigned long);
#endif

#ifdef INLINE_COPY_TO_USER
/*
 * Inline body of copy_to_user(): checks access_ok() and copies.  Returns
 * the number of bytes not copied (@n when injection or access_ok() rejects
 * the request).
 */
static inline __must_check unsigned long
_copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();

        if (should_fail_usercopy())
                return n;
        if (!access_ok(to, n))
                return n;

        instrument_copy_to_user(to, from, n);
        return raw_copy_to_user(to, from, n);
}
#else
/* Out-of-line version, defined elsewhere. */
extern __must_check unsigned long
_copy_to_user(void __user *, const void *, unsigned long);
#endif

/*
 * Copy @n bytes from user space.  If check_copy_size() rejects the request
 * nothing is copied and @n is returned; otherwise the result of
 * _copy_from_user() is returned.
 */
static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
        if (!check_copy_size(to, n, false))
                return n;

        return _copy_from_user(to, from, n);
}

/*
 * Copy @n bytes to user space.  If check_copy_size() rejects the request
 * nothing is copied and @n is returned; otherwise the result of
 * _copy_to_user() is returned.
 */
static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
        if (!check_copy_size(from, n, true))
                return n;

        return _copy_to_user(to, from, n);
}

#ifndef copy_mc_to_kernel
/*
 * Without arch opt-in this generic copy_mc_to_kernel() will not handle
 * #MC (or arch equivalent) during source read.
 *
 * It is a plain memcpy() of @cnt bytes from @src to @dst and always
 * returns 0.
 */
static inline unsigned long __must_check
copy_mc_to_kernel(void *dst, const void *src, size_t cnt)
{
        memcpy(dst, src, cnt);
        return 0;
}
#endif

/* Raise the current task's pagefault_disabled count by one. */
static __always_inline void pagefault_disabled_inc(void)
{
        current->pagefault_disabled += 1;
}

/* Lower the current task's pagefault_disabled count by one. */
static __always_inline void pagefault_disabled_dec(void)
{
        current->pagefault_disabled -= 1;
}

/*
 * These routines enable/disable the pagefault handler. If disabled, it will
 * not take any locks and go straight to the fixup table.
 *
 * User access methods will not sleep when called from a pagefault_disabled()
 * environment.
 *
 * pagefault_disable() bumps the per-task counter first and then issues a
 * compiler barrier, so the increment is emitted before any later access.
 */
static inline void pagefault_disable(void)
{
        pagefault_disabled_inc();
        /*
         * make sure to have issued the store before a pagefault
         * can hit.
         */
        barrier();
}

/*
 * Counterpart of pagefault_disable(): issues a compiler barrier first and
 * only then drops the per-task counter, mirroring the order used there.
 */
static inline void pagefault_enable(void)
{
        /*
         * make sure to issue those last loads/stores before enabling
         * the pagefault handler again.
         */
        barrier();
        pagefault_disabled_dec();
}

/*
 * Is the pagefault handler disabled? If so, user access methods will not sleep.
 * True whenever the current task's pagefault_disabled count is non-zero.
 */
static inline bool pagefault_disabled(void)
{
        return current->pagefault_disabled ? true : false;
}

/*
 * The pagefault handler is in general disabled by pagefault_disable() or
 * when in irq context (via in_atomic()).
 *
 * This macro should only be used by the fault handlers. Other users should
 * stick to pagefault_disabled().
 *
 * Please NEVER use preempt_disable() to disable the fault handler. With
 * !CONFIG_PREEMPT_COUNT, that is like a NOP, so the handler won't be
 * disabled. in_atomic() will report different values based on
 * !CONFIG_PREEMPT_COUNT.
 */
#define faulthandler_disabled() (pagefault_disabled() || in_atomic())

#ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS

/**
 * probe_subpage_writeable: probe the user range for write faults at sub-page
 *                            granularity (e.g. arm64 MTE)
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Returns 0 on success, the number of bytes not probed on fault.
 *
 * It is expected that the caller checked for the write permission of each
 * page in the range either by put_user() or GUP. The architecture port can
 * implement a more efficient get_user() probing if the same sub-page faults
 * are triggered by either a read or a write.
 *
 * This fallback, used when CONFIG_ARCH_HAS_SUBPAGE_FAULTS is not set, does
 * no probing and always returns 0.
 */
static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size)
{
        return 0;
}

#endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */

#ifndef ARCH_HAS_NOCACHE_UACCESS

/*
 * Fallback for architectures that do not define ARCH_HAS_NOCACHE_UACCESS:
 * simply forwards to __copy_from_user_inatomic().
 */
static inline __must_check unsigned long
__copy_from_user_inatomic_nocache(void *to, const void __user *from,
                                  unsigned long n)
{
        unsigned long uncopied = __copy_from_user_inatomic(to, from, n);

        return uncopied;
}

#endif /* ARCH_HAS_NOCACHE_UACCESS */

extern __must_check int check_zeroed_user(const void __user *from, size_t size);

/**
 * copy_struct_from_user: copy a struct from userspace
 * @dst:   Destination address, in kernel space. This buffer must be @ksize
 *         bytes long.
 * @ksize: Size of @dst struct.
 * @src:   Source address, in userspace.
 * @usize: (Alleged) size of @src struct.
 *
 * Copies a struct from userspace to kernel space, in a way that guarantees
 * backwards-compatibility for struct syscall arguments (as long as future
 * struct extensions are made such that all new fields are *appended* to the
 * old struct, and zeroed-out new fields have the same meaning as the old
 * struct).
 *
 * @ksize is just sizeof(*dst), and @usize should've been passed by userspace.
 * The recommended usage is something like the following:
 *
 *   SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize)
 *   {
 *      int err;
 *      struct foo karg = {};
 *
 *      if (usize > PAGE_SIZE)
 *        return -E2BIG;
 *      if (usize < FOO_SIZE_VER0)
 *        return -EINVAL;
 *
 *      err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
 *      if (err)
 *        return err;
 *
 *      // ...
 *   }
 *
 * There are three cases to consider:
 *  * If @usize == @ksize, then it's copied verbatim.
 *  * If @usize < @ksize, then the userspace has passed an old struct to a
 *    newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize)
 *    are to be zero-filled.
 *  * If @usize > @ksize, then the userspace has passed a new struct to an
 *    older kernel. The trailing bytes unknown to the kernel (@usize - @ksize)
 *    are checked to ensure they are zeroed, otherwise -E2BIG is returned.
 *
 * Returns (in all cases, some data may have been copied):
 *  * -E2BIG:  (@usize > @ksize) and there are non-zero trailing bytes in @src.
 *  * -EFAULT: access to userspace failed.
 */
static __always_inline __must_check int
copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
                      size_t usize)
{
        /* Bytes both sides know about, and the excess on the larger side. */
        size_t size = min(ksize, usize);
        size_t rest = max(ksize, usize) - size;

        /* Refuse a @ksize that exceeds what the compiler knows about @dst. */
        if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1)))
                return -E2BIG;

        if (usize > ksize) {
                /* Larger user struct: the tail unknown to us must be zero. */
                int zeroed = check_zeroed_user(src + size, rest);

                if (zeroed < 0)
                        return zeroed;
                if (zeroed == 0)
                        return -E2BIG;
        } else if (usize < ksize) {
                /* Smaller user struct: zero-fill what it did not supply. */
                memset(dst + size, 0, rest);
        }

        /* Copy the part of the struct common to both layouts. */
        return copy_from_user(dst, src, size) ? -EFAULT : 0;
}

/*
 * "nofault" accessors, all implemented outside this header.  The only
 * contract visible here is the one get_kernel_nofault() documents for
 * copy_from_kernel_nofault(): 0 on success, or -EFAULT.
 * NOTE(review): the return conventions of the remaining helpers are not
 * shown in this header -- check their definitions before relying on them.
 */
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size);

/* Both endpoints are plain (kernel) pointers. */
long copy_from_kernel_nofault(void *dst, const void *src, size_t size);
long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size);

/* One endpoint is a __user pointer. */
long copy_from_user_nofault(void *dst, const void __user *src, size_t size);
long notrace copy_to_user_nofault(void __user *dst, const void *src,
                size_t size);

/* String variants; @count is a signed long in all three. */
long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr,
                long count);

long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
                long count);
long strnlen_user_nofault(const void __user *unsafe_addr, long count);

/*
 * Fallback definitions, used only when nothing earlier has defined
 * __get_kernel_nofault.  Both macros reuse the user-access primitives by
 * force-casting the kernel address to a __user pointer, and jump to @label
 * when the access reports failure.
 *
 * __get_kernel_nofault() reads one @type object from @src into a temporary
 * and, on success, stores it through @dst.  Note that @dst is expanded
 * without parentheses.
 */
#ifndef __get_kernel_nofault
#define __get_kernel_nofault(dst, src, type, label)        \
do {                                                        \
        type __user *p = (type __force __user *)(src);        \
        type data;                                        \
        if (__get_user(data, p))                        \
                goto label;                                \
        *(type *)dst = data;                                \
} while (0)

/*
 * __put_kernel_nofault() loads one @type object from @src (expanded without
 * parentheses) and writes it to @dst with __put_user().
 */
#define __put_kernel_nofault(dst, src, type, label)        \
do {                                                        \
        type __user *p = (type __force __user *)(dst);        \
        type data = *(type *)src;                        \
        if (__put_user(data, p))                        \
                goto label;                                \
} while (0)
#endif

/**
 * get_kernel_nofault(): safely attempt to read from a location
 * @val: read into this variable
 * @ptr: address to read from
 *
 * @ptr is first assigned to a local "const typeof(val) *", which lets the
 * compiler check that its type matches @val, and sizeof(val) bytes are then
 * copied into @val with copy_from_kernel_nofault().  The value of the macro
 * is the value returned by that call.
 *
 * Returns 0 on success, or -EFAULT.
 */
#define get_kernel_nofault(val, ptr) ({                                \
        const typeof(val) *__gk_ptr = (ptr);                        \
        copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\
})

/*
 * Fallbacks used when user_access_begin is not already defined:
 *  - user_access_begin() reduces to access_ok(); user_access_end() is empty.
 *  - unsafe_op_wrap() evaluates @op and jumps to label @err if it is non-zero.
 *  - the unsafe_*() accessors wrap the double-underscore primitives that way.
 *  - user_access_save() returns 0UL and user_access_restore() ignores its
 *    argument, i.e. there is no state to save in this configuration.
 */
#ifndef user_access_begin
#define user_access_begin(ptr,len) access_ok(ptr, len)
#define user_access_end() do { } while (0)
#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e)
#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
#define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e)
static inline unsigned long user_access_save(void) { return 0UL; }
static inline void user_access_restore(unsigned long flags) { }
#endif
/*
 * Unless separately defined, the write-only and read-only access windows are
 * plain aliases of the combined user_access_begin()/user_access_end() pair.
 */
#ifndef user_write_access_begin
#define user_write_access_begin user_access_begin
#define user_write_access_end user_access_end
#endif
#ifndef user_read_access_begin
#define user_read_access_begin user_access_begin
#define user_read_access_end user_access_end
#endif

/*
 * usercopy_abort() is declared only with CONFIG_HARDENED_USERCOPY and never
 * returns (__noreturn).
 * NOTE(review): parameter meanings are not visible here; presumably @name and
 * @detail describe the offending object, @to_user gives the copy direction,
 * and @offset/@len locate the rejected range -- confirm at the definition.
 */
#ifdef CONFIG_HARDENED_USERCOPY
void __noreturn usercopy_abort(const char *name, const char *detail,
                               bool to_user, unsigned long offset,
                               unsigned long len);
#endif

#endif                /* __LINUX_UACCESS_H__ */














































































































































































































































































































































































































































































































































































































































































































































































































   66 


























   13 
   66 































































   42 

















   45 



   42 



















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright IBM Corporation, 2001
 *
 * Author: Dipankar Sarma <dipankar@in.ibm.com>
 *
 * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 * Papers:
 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *                http://lse.sourceforge.net/locking/rcupdate.html
 *
 */

#ifndef __LINUX_RCUPDATE_H
#define __LINUX_RCUPDATE_H

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/atomic.h>
#include <linux/irqflags.h>
#include <linux/preempt.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>
#include <asm/processor.h>
#include <linux/cpumask.h>
#include <linux/context_tracking_irq.h>

/*
 * Wraparound-tolerant ordering of unsigned long counters: @a is treated as
 * "at or after" @b when the modular difference (a) - (b) does not exceed
 * ULONG_MAX / 2, and as "before" @b otherwise.
 */
#define ULONG_CMP_GE(a, b)        ((a) - (b) <= ULONG_MAX / 2)
#define ULONG_CMP_LT(a, b)        ((a) - (b) > ULONG_MAX / 2)

/* Exported common interfaces */
void call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier_tasks(void);
void rcu_barrier_tasks_rude(void);
void synchronize_rcu(void);

struct rcu_gp_oldstate;
unsigned long get_completed_synchronize_rcu(void);
void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);

// Maximum number of unsigned long values corresponding to
// not-yet-completed RCU grace periods.
#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2

/**
 * same_state_synchronize_rcu - Are two old-state values identical?
 * @oldstate1: First old-state value.
 * @oldstate2: Second old-state value.
 *
 * The two old-state values must have been obtained from either
 * get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or
 * get_completed_synchronize_rcu().  Returns @true if the two values are
 * identical and @false otherwise.  This allows structures whose lifetimes
 * are tracked by old-state values to push these values to a list header,
 * allowing those structures to be slightly smaller.
 */
static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2)
{
        /* Plain equality; no wraparound-aware comparison is involved. */
        const bool identical = (oldstate1 == oldstate2);

        return identical;
}

#ifdef CONFIG_PREEMPT_RCU

void __rcu_read_lock(void);
void __rcu_read_unlock(void);

/*
 * Defined as a macro as it is a very low level header included from
 * areas that don't even know about current.  This gives the rcu_read_lock()
 * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
 */
#define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting)

#else /* #ifdef CONFIG_PREEMPT_RCU */

/*
 * rcu_read_unlock_strict() is an empty statement under CONFIG_TINY_RCU and
 * an out-of-line function in every other configuration.
 */
#ifdef CONFIG_TINY_RCU
#define rcu_read_unlock_strict() do { } while (0)
#else
void rcu_read_unlock_strict(void);
#endif

/*
 * Without CONFIG_PREEMPT_RCU, entering a read-side critical section consists
 * solely of disabling preemption.
 */
static inline void __rcu_read_lock(void)
{
        preempt_disable();
}

/*
 * Without CONFIG_PREEMPT_RCU, leaving a read-side critical section re-enables
 * preemption; kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD additionally
 * call rcu_read_unlock_strict() afterwards.
 */
static inline void __rcu_read_unlock(void)
{
        preempt_enable();

        if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
                return;

        rcu_read_unlock_strict();
}

/*
 * The rcu_read_lock() nesting depth is not available in this (non
 * CONFIG_PREEMPT_RCU) configuration, so a constant 0 is reported.
 */
static inline int rcu_preempt_depth(void)
{
        return 0;
}

#endif /* #else #ifdef CONFIG_PREEMPT_RCU */

/*
 * call_rcu_hurry() has its own out-of-line implementation only with
 * CONFIG_RCU_LAZY; otherwise it is a direct call to call_rcu().
 */
#ifdef CONFIG_RCU_LAZY
void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func);
#else
static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
{
        call_rcu(head, func);
}
#endif

/* Internal to kernel */
void rcu_init(void);
extern int rcu_scheduler_active;
void rcu_sched_clock_irq(int user);

/* Real initialization exists only with CONFIG_TASKS_RCU_GENERIC; else a no-op. */
#ifdef CONFIG_TASKS_RCU_GENERIC
void rcu_init_tasks_generic(void);
#else
static inline void rcu_init_tasks_generic(void) { }
#endif

/*
 * rcu_sysrq_start()/rcu_sysrq_end() are implemented out of line with
 * CONFIG_RCU_STALL_COMMON and compile to nothing without it.
 */
#ifdef CONFIG_RCU_STALL_COMMON
void rcu_sysrq_start(void);
void rcu_sysrq_end(void);
#else /* #ifdef CONFIG_RCU_STALL_COMMON */
static inline void rcu_sysrq_start(void) { }
static inline void rcu_sysrq_end(void) { }
#endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */

/*
 * rcu_irq_work_resched() is a real function only for CONFIG_NO_HZ_FULL
 * kernels that lack CONFIG_GENERIC_ENTRY or CONFIG_KVM_XFER_TO_GUEST_WORK
 * (or both); in every other configuration it is an empty stub.
 */
#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
void rcu_irq_work_resched(void);
#else
static inline void rcu_irq_work_resched(void) { }
#endif

/*
 * No-CBs CPU interface.  Without CONFIG_RCU_NOCB_CPU the stubs are
 * asymmetric: a request to offload fails with -EINVAL, while a request to
 * de-offload reports success (0); the init and flush hooks do nothing.
 */
#ifdef CONFIG_RCU_NOCB_CPU
void rcu_init_nohz(void);
int rcu_nocb_cpu_offload(int cpu);
int rcu_nocb_cpu_deoffload(int cpu);
void rcu_nocb_flush_deferred_wakeup(void);
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static inline void rcu_init_nohz(void) { }
static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
static inline void rcu_nocb_flush_deferred_wakeup(void) { }
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */

/*
 * Note a quasi-voluntary context switch for the benefit of RCU-tasks.
 * This is a macro rather than an inline function to avoid #include hell.
 */
#ifdef CONFIG_TASKS_RCU_GENERIC

/*
 * rcu_tasks_classic_qs(t, preempt): unless @preempt is true, clear
 * t->rcu_tasks_holdout.  The flag is read first so that the store is issued
 * only when it is currently set.
 *
 * Without CONFIG_TASKS_RCU the macro is empty, and call_rcu_tasks() and
 * synchronize_rcu_tasks() are aliases for call_rcu() and synchronize_rcu().
 */
# ifdef CONFIG_TASKS_RCU
# define rcu_tasks_classic_qs(t, preempt)                                \
        do {                                                                \
                if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout))        \
                        WRITE_ONCE((t)->rcu_tasks_holdout, false);        \
        } while (0)
void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks(void);
# else
# define rcu_tasks_classic_qs(t, preempt) do { } while (0)
# define call_rcu_tasks call_rcu
# define synchronize_rcu_tasks synchronize_rcu
# endif

# ifdef CONFIG_TASKS_TRACE_RCU
// Bits for ->trc_reader_special.b.need_qs field.
#define TRC_NEED_QS                0x1  // Task needs a quiescent state.
#define TRC_NEED_QS_CHECKED        0x2  // Task has been checked for needing quiescent state.

u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new);
void rcu_tasks_trace_qs_blkd(struct task_struct *t);

/*
 * rcu_tasks_trace_qs(t): take one snapshot of t->trc_reader_nesting, then
 *  - if need_qs equals TRC_NEED_QS and the snapshot is zero, ask
 *    rcu_trc_cmpxchg_need_qs() to move need_qs from TRC_NEED_QS to
 *    TRC_NEED_QS_CHECKED (its return value is ignored here);
 *  - otherwise, if the snapshot is non-zero and not INT_MIN, and the task's
 *    ->trc_reader_special.b.blocked flag is clear, call
 *    rcu_tasks_trace_qs_blkd().
 * NOTE(review): INT_MIN is presumably a sentinel nesting value; its meaning
 * is not visible in this header.
 * Without CONFIG_TASKS_TRACE_RCU the macro expands to an empty statement.
 */
# define rcu_tasks_trace_qs(t)                                                        \
        do {                                                                        \
                int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting);        \
                                                                                \
                if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) &&        \
                    likely(!___rttq_nesting)) {                                        \
                        rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED);        \
                } else if (___rttq_nesting && ___rttq_nesting != INT_MIN &&        \
                           !READ_ONCE((t)->trc_reader_special.b.blocked)) {        \
                        rcu_tasks_trace_qs_blkd(t);                                \
                }                                                                \
        } while (0)
# else
# define rcu_tasks_trace_qs(t) do { } while (0)
# endif

/*
 * rcu_tasks_qs(t, preempt): run the classic hook (which receives @preempt)
 * followed by the trace hook (which does not) for task @t.
 */
#define rcu_tasks_qs(t, preempt)                                        \
do {                                                                        \
        rcu_tasks_classic_qs((t), (preempt));                                \
        rcu_tasks_trace_qs(t);                                                \
} while (0)

# ifdef CONFIG_TASKS_RUDE_RCU
void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks_rude(void);
# endif

#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false)
void exit_tasks_rcu_start(void);
void exit_tasks_rcu_stop(void);
void exit_tasks_rcu_finish(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
/*
 * Without CONFIG_TASKS_RCU_GENERIC: the quiescent-state hooks are empty
 * statements, the Tasks RCU update primitives alias plain call_rcu() and
 * synchronize_rcu(), and the exit-time hooks do nothing.
 */
#define rcu_tasks_classic_qs(t, preempt) do { } while (0)
#define rcu_tasks_qs(t, preempt) do { } while (0)
#define rcu_note_voluntary_context_switch(t) do { } while (0)
#define call_rcu_tasks call_rcu
#define synchronize_rcu_tasks synchronize_rcu
static inline void exit_tasks_rcu_start(void) { }
static inline void exit_tasks_rcu_stop(void) { }
static inline void exit_tasks_rcu_finish(void) { }
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */

/**
 * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period?
 *
 * As an accident of implementation, an RCU Tasks Trace grace period also
 * acts as an RCU grace period.  However, this could change at any time.
 * Code relying on this accident must call this function to verify that
 * this accident is still happening.
 *
 * You have been warned!
 */
static inline bool rcu_trace_implies_rcu_gp(void)
{
        return true;
}

/**
 * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
 *
 * This macro resembles cond_resched(), except that it is defined to
 * report potential quiescent states to RCU-tasks even if the cond_resched()
 * machinery were to be shut off, as some advocate for PREEMPTION kernels.
 */
#define cond_resched_tasks_rcu_qs() \
do { \
        rcu_tasks_qs(current, false); \
        cond_resched(); \
} while (0)

/**
 * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states
 * @old_ts: jiffies at start of processing.
 *
 * This helper is for long-running softirq handlers, such as NAPI threads in
 * networking. The caller should initialize the variable passed in as @old_ts
 * at the beginning of the softirq handler. When invoked frequently, this macro
 * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will
 * provide both RCU and RCU-Tasks quiescent states. Note that this macro
 * modifies its old_ts argument.
 *
 * Because regions of code that have disabled softirq act as RCU read-side
 * critical sections, this macro should be invoked with softirq (and
 * preemption) enabled.
 *
 * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would
 * have more chance to invoke schedule() calls and provide necessary quiescent
 * states. As a contrast, calling cond_resched() only won't achieve the same
 * effect because cond_resched() does not provide RCU-Tasks quiescent states.
 */
#define rcu_softirq_qs_periodic(old_ts) \
do { \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \
            time_after(jiffies, (old_ts) + HZ / 10)) { \
                preempt_disable(); \
                rcu_softirq_qs(); \
                preempt_enable(); \
                (old_ts) = jiffies; \
        } \
} while (0)

/*
 * Infrastructure to implement the synchronize_() primitives in
 * TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
 */

#if defined(CONFIG_TREE_RCU)
#include <linux/rcutree.h>
#elif defined(CONFIG_TINY_RCU)
#include <linux/rcutiny.h>
#else
#error "Unknown RCU implementation specified to kernel configuration"
#endif

/*
 * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls
 * are needed for dynamic initialization and destruction of rcu_head
 * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for
 * dynamic initialization and destruction of statically allocated rcu_head
 * structures.  However, rcu_head structures allocated dynamically in the
 * heap don't need any initialization.
 */
/*
 * The four rcu_head init/destroy hooks have out-of-line implementations only
 * with CONFIG_DEBUG_OBJECTS_RCU_HEAD; otherwise they are empty inlines.
 */
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head);
void destroy_rcu_head(struct rcu_head *head);
void init_rcu_head_on_stack(struct rcu_head *head);
void destroy_rcu_head_on_stack(struct rcu_head *head);
#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
static inline void init_rcu_head(struct rcu_head *head) { }
static inline void destroy_rcu_head(struct rcu_head *head) { }
static inline void init_rcu_head_on_stack(struct rcu_head *head) { }
static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { }
#endif        /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */

/*
 * A real check exists only when both CONFIG_HOTPLUG_CPU and CONFIG_PROVE_RCU
 * are set; in every other configuration the answer is unconditionally true.
 */
#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU)
bool rcu_lockdep_current_cpu_online(void);
#else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */
static inline bool rcu_lockdep_current_cpu_online(void) { return true; }
#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */

extern struct lockdep_map rcu_lock_map;
extern struct lockdep_map rcu_bh_lock_map;
extern struct lockdep_map rcu_sched_lock_map;
extern struct lockdep_map rcu_callback_map;

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/*
 * Tell lockdep that the RCU pseudo-lock @map is being acquired at the
 * caller's instruction pointer.
 * NOTE(review): the positional arguments follow lock_acquire()'s prototype,
 * which is not visible here -- presumably (subclass 0, trylock 0, read 2,
 * check 0, no nest lock); confirm against lockdep.
 */
static inline void rcu_lock_acquire(struct lockdep_map *map)
{
        lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
}

/*
 * Same lockdep annotation as rcu_lock_acquire() except that the third
 * lock_acquire() argument is 1 rather than 0.
 * NOTE(review): that argument is presumably lockdep's "trylock" flag --
 * confirm against lock_acquire()'s prototype.
 */
static inline void rcu_try_lock_acquire(struct lockdep_map *map)
{
        lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_);
}

/*
 * Tell lockdep that the RCU pseudo-lock @map is being released at the
 * caller's instruction pointer.
 */
static inline void rcu_lock_release(struct lockdep_map *map)
{
        lock_release(map, _THIS_IP_);
}

int debug_lockdep_rcu_enabled(void);
int rcu_read_lock_held(void);
int rcu_read_lock_bh_held(void);
int rcu_read_lock_sched_held(void);
int rcu_read_lock_any_held(void);

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

# define rcu_lock_acquire(a)                do { } while (0)
# define rcu_try_lock_acquire(a)        do { } while (0)
# define rcu_lock_release(a)                do { } while (0)

/*
 * Without CONFIG_DEBUG_LOCK_ALLOC there is no lock-state tracking to consult,
 * so this always answers 1 ("held").
 */
static inline int rcu_read_lock_held(void)
{
        return 1;
}

static inline int rcu_read_lock_bh_held(void)
{
        return 1;
}

/*
 * Without CONFIG_DEBUG_LOCK_ALLOC: answers non-zero exactly when the current
 * context is not preemptible().
 */
static inline int rcu_read_lock_sched_held(void)
{
        return !preemptible();
}

/*
 * Without CONFIG_DEBUG_LOCK_ALLOC: uses the same !preemptible() test as
 * rcu_read_lock_sched_held().
 */
static inline int rcu_read_lock_any_held(void)
{
        return !preemptible();
}

/* Without CONFIG_DEBUG_LOCK_ALLOC, lockdep-based RCU checking is reported as off. */
static inline int debug_lockdep_rcu_enabled(void)
{
        return 0;
}

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

#ifdef CONFIG_PROVE_RCU

/**
 * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met
 * @c: condition to check
 * @s: informative message
 *
 * This checks debug_lockdep_rcu_enabled() before checking (c) to
 * prevent early boot splats due to lockdep not yet being initialized,
 * and rechecks it after checking (c) to prevent false-positive splats
 * due to races with lockdep being disabled.  See commit 3066820034b5dd
 * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail.
 *
 * Each expansion carries its own static __warned flag, which is set just
 * before lockdep_rcu_suspicious() is called and tested on later passes, so
 * a given call site stops reporting once the flag has been set.
 */
#define RCU_LOCKDEP_WARN(c, s)                                                \
        do {                                                                \
                static bool __section(".data.unlikely") __warned;        \
                if (debug_lockdep_rcu_enabled() && (c) &&                \
                    debug_lockdep_rcu_enabled() && !__warned) {                \
                        __warned = true;                                \
                        lockdep_rcu_suspicious(__FILE__, __LINE__, s);        \
                }                                                        \
        } while (0)

/*
 * Warn through RCU_LOCKDEP_WARN() if rcu_lock_map is held when this is
 * called.  The check exists only without CONFIG_PREEMPT_RCU; with it the
 * function is empty.
 */
#ifndef CONFIG_PREEMPT_RCU
static inline void rcu_preempt_sleep_check(void)
{
        RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
                         "Illegal context switch in RCU read-side critical section");
}
#else // #ifndef CONFIG_PREEMPT_RCU
static inline void rcu_preempt_sleep_check(void) { }
#endif // #else // #ifndef CONFIG_PREEMPT_RCU

/*
 * rcu_sleep_check(): run rcu_preempt_sleep_check(), then warn if the RCU-bh
 * lock map is held (this test is skipped on CONFIG_PREEMPT_RT kernels) and
 * warn if the RCU-sched lock map is held.
 */
#define rcu_sleep_check()                                                \
        do {                                                                \
                rcu_preempt_sleep_check();                                \
                if (!IS_ENABLED(CONFIG_PREEMPT_RT))                        \
                    RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),        \
                                 "Illegal context switch in RCU-bh read-side critical section"); \
                RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),        \
                                 "Illegal context switch in RCU-sched read-side critical section"); \
        } while (0)

#else /* #ifdef CONFIG_PROVE_RCU */

/*
 * Without CONFIG_PROVE_RCU both checks are empty statements.  The condition
 * (c) is kept in the loop test behind "0 &&", so it still has to compile
 * but is never evaluated at run time; @s is dropped entirely.
 */
#define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c))
#define rcu_sleep_check() do { } while (0)

#endif /* #else #ifdef CONFIG_PROVE_RCU */

/*
 * Helper functions for rcu_dereference_check(), rcu_dereference_protected()
 * and rcu_assign_pointer().  Some of these could be folded into their
 * callers, but they are left separate in order to ease introduction of
 * multiple pointer markings to match different RCU implementations
 * (e.g., __srcu), should this make sense in the future.
 */

/*
 * rcu_check_sparse(p, space): when __CHECKER__ is defined, compare @p with
 * itself cast to a pointer in address space @space and discard the result,
 * so the expression exists only for the checker to analyse.
 * NOTE(review): presumably sparse complains when the address spaces differ.
 * In normal builds the macro expands to nothing.
 */
#ifdef __CHECKER__
#define rcu_check_sparse(p, space) \
        ((void)(((typeof(*p) space *)p) == p))
#else /* #ifdef __CHECKER__ */
#define rcu_check_sparse(p, space)
#endif /* #else #ifdef __CHECKER__ */

/*
 * __unrcu_pointer(p, local): force-cast @p to an unannotated pointer held in
 * @local (no READ_ONCE() is applied), run the sparse check that @p is __rcu,
 * and yield the value as a __kernel pointer.  @local is the name to use for
 * the temporary; unrcu_pointer() supplies one via __UNIQUE_ID(rcu).
 */
#define __unrcu_pointer(p, local)                                        \
({                                                                        \
        typeof(*p) *local = (typeof(*p) *__force)(p);                        \
        rcu_check_sparse(p, __rcu);                                        \
        ((typeof(*p) __force __kernel *)(local));                         \
})
/**
 * unrcu_pointer - mark a pointer as not being RCU protected
 * @p: pointer needing to lose its __rcu property
 *
 * Converts @p from an __rcu pointer to a __kernel pointer.
 * This allows an __rcu pointer to be used with xchg() and friends.
 */
#define unrcu_pointer(p) __unrcu_pointer(p, __UNIQUE_ID(rcu))

/*
 * __rcu_access_pointer(p, local, space): load @p once with READ_ONCE() into
 * the temporary named @local, run the sparse check against @space, and yield
 * the value as a __kernel pointer.  No lockdep condition is tested.
 */
#define __rcu_access_pointer(p, local, space) \
({ \
        typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(local)); \
})
/*
 * __rcu_dereference_check(p, local, c, space): load @p once with READ_ONCE()
 * into the temporary named @local, emit an RCU_LOCKDEP_WARN() if condition
 * @c is false, run the sparse check against @space, and yield the loaded
 * value as a __kernel pointer.  The load precedes the lockdep test.
 */
#define __rcu_dereference_check(p, local, c, space) \
({ \
        /* Dependency order vs. p above. */ \
        typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
        RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(local)); \
})
/*
 * __rcu_dereference_protected(p, local, c, space): emit an RCU_LOCKDEP_WARN()
 * if condition @c is false, run the sparse check against @space, and yield
 * @p cast to a __kernel pointer.  @p is read directly, without READ_ONCE(),
 * and the @local parameter is not used by the expansion.
 */
#define __rcu_dereference_protected(p, local, c, space) \
({ \
        RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(p)); \
})
/*
 * __rcu_dereference_raw(p, local): load @p once with READ_ONCE() into the
 * temporary named @local and yield it as a __kernel pointer.  Neither the
 * sparse check nor any lockdep condition is applied.  rcu_dereference_raw()
 * is the public wrapper that supplies a unique temporary name.
 */
#define __rcu_dereference_raw(p, local) \
({ \
        /* Dependency order vs. p above. */ \
        typeof(p) local = READ_ONCE(p); \
        ((typeof(*p) __force __kernel *)(local)); \
})
#define rcu_dereference_raw(p) __rcu_dereference_raw(p, __UNIQUE_ID(rcu))

/**
 * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
 * @v: The value to statically initialize with.
 */
#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)

/**
 * rcu_assign_pointer() - assign to RCU-protected pointer
 * @p: pointer to assign to
 * @v: value to assign (publish)
 *
 * Assigns the specified value to the specified RCU-protected
 * pointer, ensuring that any concurrent RCU readers will see
 * any prior initialization.
 *
 * Inserts memory barriers on architectures that require them
 * (which is most of them), and also prevents the compiler from
 * reordering the code that initializes the structure after the pointer
 * assignment.  More importantly, this call documents which pointers
 * will be dereferenced by RCU read-side code.
 *
 * In some special cases, you may use RCU_INIT_POINTER() instead
 * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
 * to the fact that it does not constrain either the CPU or the compiler.
 * That said, using RCU_INIT_POINTER() when you should have used
 * rcu_assign_pointer() is a very bad thing that results in
 * impossible-to-diagnose memory corruption.  So please be careful.
 * See the RCU_INIT_POINTER() comment header for details.
 *
 * Note that rcu_assign_pointer() evaluates each of its arguments only
 * once, appearances notwithstanding.  One of the "extra" evaluations
 * is in typeof() and the other visible only to sparse (__CHECKER__),
 * neither of which actually execute the argument.  As with most cpp
 * macros, this execute-arguments-only-once property is important, so
 * please be careful when making changes to rcu_assign_pointer() and the
 * other macros that it invokes.
 */
#define rcu_assign_pointer(p, v)                                              \
do {                                                                              \
        uintptr_t _r_a_p__v = (uintptr_t)(v);                                      \
        rcu_check_sparse(p, __rcu);                                              \
                                                                              \
        if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)              \
                WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
        else                                                                      \
                smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
} while (0)

/**
 * rcu_replace_pointer() - replace an RCU pointer, returning its old value
 * @rcu_ptr: RCU pointer, whose old value is returned
 * @ptr: regular pointer
 * @c: the lockdep conditions under which the dereference will take place
 *
 * Perform a replacement, where @rcu_ptr is an RCU-annotated
 * pointer and @c is the lockdep argument that is passed to the
 * rcu_dereference_protected() call used to read that pointer.  The old
 * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr.
 *
 * The old value is held in a temporary of type typeof(@ptr), so that is the
 * type of the macro's result.  @rcu_ptr is expanded into both the read and
 * the assignment, so it should be an expression without side effects.
 */
#define rcu_replace_pointer(rcu_ptr, ptr, c)                                \
({                                                                        \
        typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c));        \
        rcu_assign_pointer((rcu_ptr), (ptr));                                \
        __tmp;                                                                \
})

/**
 * rcu_access_pointer() - fetch RCU pointer with no dereferencing
 * @p: The pointer to read
 *
 * Return the value of the specified RCU-protected pointer, but omit the
 * lockdep checks for being in an RCU read-side critical section.  This is
 * useful when the value of this pointer is accessed, but the pointer is
 * not dereferenced, for example, when testing an RCU-protected pointer
 * against NULL.  Although rcu_access_pointer() may also be used in cases
 * where update-side locks prevent the value of the pointer from changing,
 * you should instead use rcu_dereference_protected() for this use case.
 * Within an RCU read-side critical section, there is little reason to
 * use rcu_access_pointer().
 *
 * It is usually best to test the rcu_access_pointer() return value
 * directly in order to avoid accidental dereferences being introduced
 * by later inattentive changes.  In other words, assigning the
 * rcu_access_pointer() return value to a local variable results in an
 * accident waiting to happen.
 *
 * It is also permissible to use rcu_access_pointer() when read-side
 * access to the pointer was removed at least one grace period ago, as is
 * the case in the context of the RCU callback that is freeing up the data,
 * or after a synchronize_rcu() returns.  This can be useful when tearing
 * down multi-linked structures after a grace period has elapsed.  However,
 * rcu_dereference_protected() is normally preferred for this use case.
 */
#define rcu_access_pointer(p) \
        __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu)

/**
 * rcu_dereference_check() - rcu_dereference with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * Do an rcu_dereference(), but check that the conditions under which the
 * dereference will take place are correct.  Typically the conditions
 * indicate the various locking conditions that should be held at that
 * point.  The check should return true if the conditions are satisfied.
 * An implicit check for being in an RCU read-side critical section
 * (rcu_read_lock()) is included.
 *
 * For example:
 *
 *        bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
 *
 * could be used to indicate to lockdep that foo->bar may only be dereferenced
 * if either rcu_read_lock() is held, or that the lock required to replace
 * the bar struct at foo->bar is held.
 *
 * Note that the list of conditions may also include indications of when a lock
 * need not be held, for example during initialisation or destruction of the
 * target struct:
 *
 *        bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
 *                                              atomic_read(&foo->usage) == 0);
 *
 * Inserts memory barriers on architectures that require them
 * (currently only the Alpha), prevents the compiler from refetching
 * (and from merging fetches), and, more importantly, documents exactly
 * which pointers are protected by RCU and checks that the pointer is
 * annotated as __rcu.
 */
#define rcu_dereference_check(p, c) \
        __rcu_dereference_check( \
                (p), \
                __UNIQUE_ID(rcu), \
                (c) || rcu_read_lock_held(), \
                __rcu)

/**
 * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is the RCU-bh counterpart to rcu_dereference_check().  However,
 * please note that starting in v5.0 kernels, vanilla RCU grace periods
 * wait for local_bh_disable() regions of code in addition to regions of
 * code demarked by rcu_read_lock() and rcu_read_unlock().  This means
 * that synchronize_rcu(), call_rcu, and friends all take not only
 * rcu_read_lock() but also rcu_read_lock_bh() into account.
 */
#define rcu_dereference_bh_check(p, c) \
        __rcu_dereference_check( \
                (p), \
                __UNIQUE_ID(rcu), \
                (c) || rcu_read_lock_bh_held(), \
                __rcu)

/**
 * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is the RCU-sched counterpart to rcu_dereference_check().
 * However, please note that starting in v5.0 kernels, vanilla RCU grace
 * periods wait for preempt_disable() regions of code in addition to
 * regions of code demarked by rcu_read_lock() and rcu_read_unlock().
 * This means that synchronize_rcu(), call_rcu, and friends all take not
 * only rcu_read_lock() but also rcu_read_lock_sched() into account.
 */
#define rcu_dereference_sched_check(p, c) \
        __rcu_dereference_check( \
                (p), \
                __UNIQUE_ID(rcu), \
                (c) || rcu_read_lock_sched_held(), \
                __rcu)

/*
 * The tracing infrastructure traces RCU (we want that), but unfortunately
 * some of the RCU checks cause tracing to lock up the system.
 *
 * The no-tracing version of rcu_dereference_raw() must not call
 * rcu_read_lock_held().
 */
#define rcu_dereference_raw_check(p) \
        __rcu_dereference_check( \
                (p), \
                __UNIQUE_ID(rcu), \
                1, \
                __rcu)

/**
 * rcu_dereference_protected() - fetch RCU pointer when updates prevented
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * Return the value of the specified RCU-protected pointer, but omit
 * the READ_ONCE().  This is useful in cases where update-side locks
 * prevent the value of the pointer from changing.  Please note that this
 * primitive does *not* prevent the compiler from repeating this reference
 * or combining it with other references, so it should not be used without
 * protection of appropriate locks.
 *
 * This function is only for update-side use.  Using this function
 * when protected only by rcu_read_lock() will result in infrequent
 * but very ugly failures.
 */
#define rcu_dereference_protected(p, c) \
        __rcu_dereference_protected( \
                (p), \
                __UNIQUE_ID(rcu), \
                (c), \
                __rcu)


/**
 * rcu_dereference() - fetch RCU-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Shorthand for rcu_dereference_check() with no extra condition supplied
 * (the condition argument is 0).
 */
#define rcu_dereference(p) \
        rcu_dereference_check(p, 0)

/**
 * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Shorthand for rcu_dereference_bh_check() with no extra condition
 * supplied (the condition argument is 0).
 */
#define rcu_dereference_bh(p) \
        rcu_dereference_bh_check(p, 0)

/**
 * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Shorthand for rcu_dereference_sched_check() with no extra condition
 * supplied (the condition argument is 0).
 */
#define rcu_dereference_sched(p) \
        rcu_dereference_sched_check(p, 0)

/**
 * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism
 * @p: The pointer to hand off
 *
 * This is simply an identity function, but it documents where a pointer
 * is handed off from RCU to some other synchronization mechanism, for
 * example, reference counting or locking.  In C11, it would map to
 * kill_dependency().  It could be used as follows::
 *
 *        rcu_read_lock();
 *        p = rcu_dereference(gp);
 *        long_lived = is_long_lived(p);
 *        if (long_lived) {
 *                if (!atomic_inc_not_zero(p->refcnt))
 *                        long_lived = false;
 *                else
 *                        p = rcu_pointer_handoff(p);
 *        }
 *        rcu_read_unlock();
 */
#define rcu_pointer_handoff(p) \
        (p)

/**
 * rcu_read_lock() - mark the beginning of an RCU read-side critical section
 *
 * When synchronize_rcu() is invoked on one CPU while other CPUs
 * are within RCU read-side critical sections, then the
 * synchronize_rcu() is guaranteed to block until after all the other
 * CPUs exit their critical sections.  Similarly, if call_rcu() is invoked
 * on one CPU while other CPUs are within RCU read-side critical
 * sections, invocation of the corresponding RCU callback is deferred
 * until after all the other CPUs exit their critical sections.
 *
 * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also
 * wait for regions of code with preemption disabled, including regions of
 * code with interrupts or softirqs disabled.  In pre-v5.0 kernels, which
 * define synchronize_sched(), only code enclosed within rcu_read_lock()
 * and rcu_read_unlock() are guaranteed to be waited for.
 *
 * Note, however, that RCU callbacks are permitted to run concurrently
 * with new RCU read-side critical sections.  One way that this can happen
 * is via the following sequence of events: (1) CPU 0 enters an RCU
 * read-side critical section, (2) CPU 1 invokes call_rcu() to register
 * an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
 * (4) CPU 2 enters an RCU read-side critical section, (5) the RCU
 * callback is invoked.  This is legal, because the RCU read-side critical
 * section that was running concurrently with the call_rcu() (and which
 * therefore might be referencing something that the corresponding RCU
 * callback would free up) has completed before the corresponding
 * RCU callback is invoked.
 *
 * RCU read-side critical sections may be nested.  Any deferred actions
 * will be deferred until the outermost RCU read-side critical section
 * completes.
 *
 * You can avoid reading and understanding the next paragraph by
 * following this rule: don't put anything in an rcu_read_lock() RCU
 * read-side critical section that would block in a !PREEMPTION kernel.
 * But if you want the full story, read on!
 *
 * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU),
 * it is illegal to block while in an RCU read-side critical section.
 * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION
 * kernel builds, RCU read-side critical sections may be preempted,
 * but explicit blocking is illegal.  Finally, in preemptible RCU
 * implementations in real-time (with -rt patchset) kernel builds, RCU
 * read-side critical sections may be preempted and they may also block, but
 * only when acquiring spinlocks that are subject to priority inheritance.
 */
static __always_inline void rcu_read_lock(void)
{
        /* Implementation-specific entry into the read-side critical section. */
        __rcu_read_lock();
        /* Sparse context annotation; balanced by __release(RCU) in rcu_read_unlock(). */
        __acquire(RCU);
        /* Record the acquisition against rcu_lock_map (presumably lockdep bookkeeping). */
        rcu_lock_acquire(&rcu_lock_map);
        /* Complain if RCU is not watching this CPU at the time of the call. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock() used illegally while idle");
}

/*
 * So where is rcu_write_lock()?  It does not exist, as there is no
 * way for writers to lock out RCU readers.  This is a feature, not
 * a bug -- this property is what provides RCU's performance benefits.
 * Of course, writers must coordinate with each other.  The normal
 * spinlock primitives work well for this, but any other technique may be
 * used as well.  RCU does not care how the writers keep out of each
 * others' way, as long as they do so.
 */

/**
 * rcu_read_unlock() - marks the end of an RCU read-side critical section.
 *
 * In almost all situations, rcu_read_unlock() is immune from deadlock.
 * In recent kernels that have consolidated synchronize_sched() and
 * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity
 * also extends to the scheduler's runqueue and priority-inheritance
 * spinlocks, courtesy of the quiescent-state deferral that is carried
 * out when rcu_read_unlock() is invoked with interrupts disabled.
 *
 * See rcu_read_lock() for more information.
 */
static inline void rcu_read_unlock(void)
{
        /*
         * The steps below undo rcu_read_lock() in reverse order: check,
         * drop the lock-map record, drop the sparse context, then leave
         * the critical section.
         */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock() used illegally while idle");
        rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
        /* Sparse context annotation; balances __acquire(RCU) in rcu_read_lock(). */
        __release(RCU);
        /* Implementation-specific exit from the read-side critical section. */
        __rcu_read_unlock();
}

/**
 * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
 *
 * This is equivalent to rcu_read_lock(), but also disables softirqs.
 * Note that anything else that disables softirqs can also serve as an RCU
 * read-side critical section.  However, please note that this equivalence
 * applies only to v5.0 and later.  Before v5.0, rcu_read_lock() and
 * rcu_read_lock_bh() were unrelated.
 *
 * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
 * must occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh()
 * was invoked from some other task.
 */
static inline void rcu_read_lock_bh(void)
{
        /* Softirqs stay disabled until local_bh_enable() in rcu_read_unlock_bh(). */
        local_bh_disable();
        /* Sparse context annotation; balanced by __release(RCU_BH) on unlock. */
        __acquire(RCU_BH);
        /* Record the acquisition against rcu_bh_lock_map (presumably lockdep bookkeeping). */
        rcu_lock_acquire(&rcu_bh_lock_map);
        /* Complain if RCU is not watching this CPU at the time of the call. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock_bh() used illegally while idle");
}

/**
 * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section
 *
 * See rcu_read_lock_bh() for more information.
 */
static inline void rcu_read_unlock_bh(void)
{
        /* Undo rcu_read_lock_bh() in reverse order, starting with the check. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock_bh() used illegally while idle");
        /* Drop the record taken against rcu_bh_lock_map. */
        rcu_lock_release(&rcu_bh_lock_map);
        /* Sparse context annotation; balances __acquire(RCU_BH) on lock. */
        __release(RCU_BH);
        /* Pairs with local_bh_disable() in rcu_read_lock_bh(). */
        local_bh_enable();
}

/**
 * rcu_read_lock_sched() - mark the beginning of an RCU-sched critical section
 *
 * This is equivalent to rcu_read_lock(), but also disables preemption.
 * Read-side critical sections can also be introduced by anything else that
 * disables preemption, including local_irq_disable() and friends.  However,
 * please note that the equivalence to rcu_read_lock() applies only to
 * v5.0 and later.  Before v5.0, rcu_read_lock() and rcu_read_lock_sched()
 * were unrelated.
 *
 * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
 * must occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock_sched() from process context if the matching
 * rcu_read_lock_sched() was invoked from an NMI handler.
 */
static inline void rcu_read_lock_sched(void)
{
        /* Preemption stays disabled until preempt_enable() in rcu_read_unlock_sched(). */
        preempt_disable();
        /* Sparse context annotation; balanced by __release(RCU_SCHED) on unlock. */
        __acquire(RCU_SCHED);
        /* Record the acquisition against rcu_sched_lock_map (presumably lockdep bookkeeping). */
        rcu_lock_acquire(&rcu_sched_lock_map);
        /* Complain if RCU is not watching this CPU at the time of the call. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock_sched() used illegally while idle");
}

/*
 * Used by lockdep and tracing: cannot be traced, cannot call lockdep.
 *
 * Same shape as rcu_read_lock_sched(), but without the rcu_lock_acquire()
 * and RCU_LOCKDEP_WARN() steps, and using the _notrace preemption primitive.
 */
static inline notrace void rcu_read_lock_sched_notrace(void)
{
        preempt_disable_notrace();
        /* Sparse context annotation; balanced in rcu_read_unlock_sched_notrace(). */
        __acquire(RCU_SCHED);
}

/**
 * rcu_read_unlock_sched() - marks the end of an RCU-sched critical section
 *
 * See rcu_read_lock_sched() for more information.
 */
static inline void rcu_read_unlock_sched(void)
{
        /* Undo rcu_read_lock_sched() in reverse order, starting with the check. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock_sched() used illegally while idle");
        /* Drop the record taken against rcu_sched_lock_map. */
        rcu_lock_release(&rcu_sched_lock_map);
        /* Sparse context annotation; balances __acquire(RCU_SCHED) on lock. */
        __release(RCU_SCHED);
        /* Pairs with preempt_disable() in rcu_read_lock_sched(). */
        preempt_enable();
}

/*
 * Used by lockdep and tracing: cannot be traced, cannot call lockdep.
 *
 * Counterpart of rcu_read_lock_sched_notrace(): drops the sparse context
 * and then re-enables preemption with the _notrace primitive.
 */
static inline notrace void rcu_read_unlock_sched_notrace(void)
{
        /* Sparse context annotation; balances __acquire(RCU_SCHED) on lock. */
        __release(RCU_SCHED);
        preempt_enable_notrace();
}

/**
 * RCU_INIT_POINTER() - initialize an RCU protected pointer
 * @p: The pointer to be initialized.
 * @v: The value to initialize the pointer to.
 *
 * Initialize an RCU-protected pointer in special cases where readers
 * do not need ordering constraints on the CPU or the compiler.  These
 * special cases are:
 *
 * 1.        This use of RCU_INIT_POINTER() is NULLing out the pointer *or*
 * 2.        The caller has taken whatever steps are required to prevent
 *        RCU readers from concurrently accessing this pointer *or*
 * 3.        The referenced data structure has already been exposed to
 *        readers either at compile time or via rcu_assign_pointer() *and*
 *
 *        a.        You have not made *any* reader-visible changes to
 *                this structure since then *or*
 *        b.        It is OK for readers accessing this structure from its
 *                new location to see the old state of the structure.  (For
 *                example, the changes were to statistical counters or to
 *                other state where exact synchronization is not required.)
 *
 * Failure to follow these rules governing use of RCU_INIT_POINTER() will
 * result in impossible-to-diagnose memory corruption.  That is, the
 * structures will look OK in crash dumps, but any concurrent RCU readers
 * might see pre-initialization values of the referenced data structure.
 * So please be very careful how you use RCU_INIT_POINTER()!!!
 *
 * If you are creating an RCU-protected linked structure that is accessed
 * by a single external-to-structure RCU-protected pointer, then you may
 * use RCU_INIT_POINTER() to initialize the internal RCU-protected
 * pointers, but you must use rcu_assign_pointer() to initialize the
 * external-to-structure pointer *after* you have completely initialized
 * the reader-accessible portions of the linked structure.
 *
 * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no
 * ordering guarantees for either the CPU or the compiler.
 */
#define RCU_INIT_POINTER(p, v) \
do { \
        /* Check the __rcu annotation, then store with no ordering. */ \
        rcu_check_sparse(p, __rcu); \
        WRITE_ONCE(p, RCU_INITIALIZER(v)); \
} while (0)

/**
 * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer
 * @p: The pointer to be initialized.
 * @v: The value to initialize the pointer to.
 *
 * GCC-style initialization for an RCU-protected pointer in a structure field.
 */
#define RCU_POINTER_INITIALIZER(p, v) .p = RCU_INITIALIZER(v)

/*
 * True when @offset is small enough (below 4096) for the corresponding
 * rcu_head structure to be handled by kvfree_rcu().
 */
#define __is_kvfree_rcu_offset(offset) \
        ((offset) < 4096)

/**
 * kfree_rcu() - kfree an object after a grace period.
 * @ptr: pointer to kfree for double-argument invocations.
 * @rhf: the name of the struct rcu_head within the type of @ptr.
 *
 * Many rcu callbacks functions just call kfree() on the base structure.
 * These functions are trivial, but their size adds up, and furthermore
 * when they are used in a kernel module, that module must invoke the
 * high-latency rcu_barrier() function at module-unload time.
 *
 * The kfree_rcu() function handles this issue.  Rather than encoding a
 * function address in the embedded rcu_head structure, kfree_rcu() instead
 * encodes the offset of the rcu_head structure within the base structure.
 * Because the functions are not allowed in the low-order 4096 bytes of
 * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
 * If the offset is larger than 4095 bytes, a compile-time error will
 * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
 * either fall back to use of call_rcu() or rearrange the structure to
 * position the rcu_head structure into the first 4096 bytes.
 *
 * The object to be freed can be allocated either by kmalloc() or
 * kmem_cache_alloc().
 *
 * Note that the allowable offset might decrease in the future.
 *
 * The BUILD_BUG_ON check must not involve any function calls, hence the
 * checks are done in macros here.
 */
#define kfree_rcu(ptr, rhf) \
        kvfree_rcu_arg_2(ptr, rhf)
#define kvfree_rcu(ptr, rhf) \
        kvfree_rcu_arg_2(ptr, rhf)

/**
 * kfree_rcu_mightsleep() - kfree an object after a grace period.
 * @ptr: pointer to kfree for single-argument invocations.
 *
 * When it comes to head-less variant, only one argument
 * is passed and that is just a pointer which has to be
 * freed after a grace period. Therefore the semantic is
 *
 *     kfree_rcu_mightsleep(ptr);
 *
 * where @ptr is the pointer to be freed by kvfree().
 *
 * Please note, head-less way of freeing is permitted to
 * use from a context that has to follow might_sleep()
 * annotation. Otherwise, please switch and embed the
 * rcu_head structure within the type of @ptr.
 */
#define kfree_rcu_mightsleep(ptr) \
        kvfree_rcu_arg_1(ptr)
#define kvfree_rcu_mightsleep(ptr) \
        kvfree_rcu_arg_1(ptr)

#define kvfree_rcu_arg_2(ptr, rhf) \
do { \
        typeof (ptr) ___p = (ptr); \
        \
        /* A NULL pointer is silently ignored. */ \
        if (!___p) \
                break; \
        \
        /* Build-time check that the rcu_head offset is in range. */ \
        BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \
        kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \
} while (0)

#define kvfree_rcu_arg_1(ptr) \
do { \
        typeof(ptr) ___p = (ptr); \
        \
        /* A NULL pointer is silently ignored. */ \
        if (!___p) \
                break; \
        \
        /* Head-less variant: no rcu_head is passed along. */ \
        kvfree_call_rcu(NULL, (void *) (___p)); \
} while (0)

/*
 * Place this after a lock-acquisition primitive to guarantee that
 * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
 * if the UNLOCK and LOCK are executed by the same CPU or if the
 * UNLOCK and LOCK operate on the same lock variable.
 */
#ifndef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE
#define smp_mb__after_unlock_lock()        do { } while (0)
#else /* #ifndef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
#define smp_mb__after_unlock_lock()        smp_mb()  /* Full ordering for lock. */
#endif /* #else #ifndef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */


/* Has the specified rcu_head structure been handed to call_rcu()? */

/**
 * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
 * @rhp: The rcu_head structure to initialize.
 *
 * If you intend to invoke rcu_head_after_call_rcu() to test whether a
 * given rcu_head structure has already been passed to call_rcu(), then
 * you must also invoke this rcu_head_init() function on it just after
 * allocating that structure.  Calls to this function must not race with
 * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
 */
static inline void rcu_head_init(struct rcu_head *rhp)
{
        rhp->func = (rcu_callback_t)~0L;
}

/**
 * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()?
 * @rhp: The rcu_head structure to test.
 * @f: The function passed to call_rcu() along with @rhp.
 *
 * Returns @true if the @rhp has been passed to call_rcu() with @f,
 * and @false otherwise.  Emits a warning in any other case, including
 * the case where @rhp has already been invoked after a grace period.
 * Calls to this function must not race with callback invocation.  One way
 * to avoid such races is to enclose the call to rcu_head_after_call_rcu()
 * in an RCU read-side critical section that includes a read-side fetch
 * of the pointer to the structure containing @rhp.
 */
static inline bool
rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
{
        rcu_callback_t seen = READ_ONCE(rhp->func);

        if (seen != f) {
                /* Anything other than the rcu_head_init() marker is unexpected. */
                WARN_ON_ONCE(seen != (rcu_callback_t)~0L);
                return false;
        }

        return true;
}

/* kernel/ksysfs.c definitions */
extern int rcu_expedited;
extern int rcu_normal;

/*
 * Lock guard named "rcu": the acquire expression enters an RCU read-side
 * critical section via rcu_read_lock(), and the release expression leaves
 * it via rcu_read_unlock().
 * NOTE(review): the scope-based behaviour comes from DEFINE_LOCK_GUARD_0(),
 * which is defined elsewhere -- confirm the details there.
 */
DEFINE_LOCK_GUARD_0(rcu,
        do {
                rcu_read_lock();
                /*
                 * sparse doesn't call the cleanup function,
                 * so just release immediately and don't track
                 * the context. We don't need to anyway, since
                 * the whole point of the guard is to not need
                 * the explicit unlock.
                 */
                __release(RCU);
        } while (0),
        rcu_read_unlock())

#endif /* __LINUX_RCUPDATE_H */











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   32 










































   32 
   32 

























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
// SPDX-License-Identifier: GPL-2.0-only
/*
 * xsave/xrstor support.
 *
 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
 */
#include <linux/bitops.h>
#include <linux/compat.h>
#include <linux/cpu.h>
#include <linux/mman.h>
#include <linux/nospec.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>

#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/xcr.h>

#include <asm/tlbflush.h>
#include <asm/prctl.h>
#include <asm/elf.h>

#include "context.h"
#include "internal.h"
#include "legacy.h"
#include "xstate.h"

/*
 * Iterate @bit over every bit that is set in @mask, starting the scan at
 * FIRST_EXTENDED_XFEATURE.
 *
 * @mask must be an lvalue: its address is taken and it is scanned as a
 * bitmap of 8 * sizeof(mask) bits.
 *
 * Note that this expands to two statements (an assignment followed by the
 * loop header), so it must not be used as the unbraced body of an
 * if/else or of another loop.
 */
#define for_each_extended_xfeature(bit, mask)                                \
        (bit) = FIRST_EXTENDED_XFEATURE;                                \
        for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))

/*
 * Human readable names of the xstate features, indexed by xfeature number.
 *
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused.  We use other mechanisms
 * to save/restore PT state in Linux.
 *
 * The last entry doubles as the catch-all: cpu_has_xfeatures() clamps any
 * larger index to ARRAY_SIZE(xfeature_names) - 1.
 *
 * NOTE(review): entries 6 and 7 read "AVX-512 Hi256" and
 * "AVX-512 ZMM_Hi256"; confirm that these match the numbering of
 * XFEATURE_ZMM_Hi256 and XFEATURE_Hi16_ZMM before relying on the printed
 * names.
 */
static const char *xfeature_names[] =
{
        "x87 floating point registers",
        "SSE registers",
        "AVX registers",
        "MPX bounds registers",
        "MPX CSR",
        "AVX-512 opmask",
        "AVX-512 Hi256",
        "AVX-512 ZMM_Hi256",
        "Processor Trace (unused)",
        "Protection Keys User registers",
        "PASID state",
        "Control-flow User registers",
        "Control-flow Kernel registers (unused)",
        "unknown xstate feature",
        "unknown xstate feature",
        "unknown xstate feature",
        "unknown xstate feature",
        "AMX Tile config",
        "AMX Tile data",
        "unknown xstate feature",
};

/*
 * Maps an xfeature number to the X86_FEATURE_* bit that must be present
 * for that xfeature to stay enabled.
 *
 * fpu__init_system_xstate() walks this table and clears every xfeature
 * whose entry is missing (zero) or whose CPU feature bit is not set.
 * XFEATURE_FP is special-cased there because X86_FEATURE_FPU is itself 0.
 */
static unsigned short xsave_cpuid_features[] __initdata = {
        [XFEATURE_FP]                                = X86_FEATURE_FPU,
        [XFEATURE_SSE]                                = X86_FEATURE_XMM,
        [XFEATURE_YMM]                                = X86_FEATURE_AVX,
        [XFEATURE_BNDREGS]                        = X86_FEATURE_MPX,
        [XFEATURE_BNDCSR]                        = X86_FEATURE_MPX,
        [XFEATURE_OPMASK]                        = X86_FEATURE_AVX512F,
        [XFEATURE_ZMM_Hi256]                        = X86_FEATURE_AVX512F,
        [XFEATURE_Hi16_ZMM]                        = X86_FEATURE_AVX512F,
        [XFEATURE_PT_UNIMPLEMENTED_SO_FAR]        = X86_FEATURE_INTEL_PT,
        [XFEATURE_PKRU]                                = X86_FEATURE_OSPKE,
        [XFEATURE_PASID]                        = X86_FEATURE_ENQCMD,
        [XFEATURE_CET_USER]                        = X86_FEATURE_SHSTK,
        [XFEATURE_XTILE_CFG]                        = X86_FEATURE_AMX_TILE,
        [XFEATURE_XTILE_DATA]                        = X86_FEATURE_AMX_TILE,
};

/*
 * Per-xfeature layout information, filled in by setup_xstate_cache().
 *
 * Offsets and sizes start out as -1 (all bits set, since the arrays are
 * unsigned). The offset of a supervisor feature is never filled in and
 * therefore stays at -1.
 */
static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
        { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
        { [ 0 ... XFEATURE_MAX - 1] = -1};
/* Raw ECX value of the feature's XSTATE_CPUID sub-leaf, see the bits below. */
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;

/* Bits tested in xstate_flags[] */
#define XSTATE_FLAG_SUPERVISOR        BIT(0)
#define XSTATE_FLAG_ALIGNED64        BIT(1)

/*
 * Return whether the system supports a given set of xfeatures.
 *
 * @xfeatures_needed:        mask of the xfeatures the caller requires
 * @feature_name:        optional. When non-NULL it receives the name of the
 *                        (most advanced) feature that the caller requested:
 *                        the highest missing one if anything is missing,
 *                        otherwise the highest requested one.
 *
 * Returns 1 when every bit of @xfeatures_needed is set in
 * fpu_kernel_cfg.max_features and 0 otherwise.
 */
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
        u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;

        if (unlikely(feature_name)) {
                long xfeature_idx, max_idx;
                u64 xfeatures_print;
                /*
                 * So we use FLS here to be able to print the most advanced
                 * feature that was requested but is missing. So if a driver
                 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
                 * missing AVX feature - this is the most informative message
                 * to users:
                 */
                if (xfeatures_missing)
                        xfeatures_print = xfeatures_missing;
                else
                        xfeatures_print = xfeatures_needed;

                xfeature_idx = fls64(xfeatures_print) - 1;
                max_idx = ARRAY_SIZE(xfeature_names) - 1;

                /*
                 * fls64(0) is 0, so an empty request produces index -1.
                 * Map that, like any index beyond the table, to the last
                 * entry, which is the "unknown xstate feature" catch-all,
                 * instead of reading in front of the array.
                 */
                if (xfeature_idx < 0 || xfeature_idx > max_idx)
                        xfeature_idx = max_idx;

                *feature_name = xfeature_names[xfeature_idx];
        }

        if (xfeatures_missing)
                return 0;

        return 1;
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);

/* True when the cached flags of @xfeature_nr have XSTATE_FLAG_ALIGNED64 set. */
static bool xfeature_is_aligned64(int xfeature_nr)
{
        unsigned int flags = xstate_flags[xfeature_nr];

        return (flags & XSTATE_FLAG_ALIGNED64) != 0;
}

/* True when the cached flags of @xfeature_nr have XSTATE_FLAG_SUPERVISOR set. */
static bool xfeature_is_supervisor(int xfeature_nr)
{
        unsigned int flags = xstate_flags[xfeature_nr];

        return (flags & XSTATE_FLAG_SUPERVISOR) != 0;
}

/*
 * Return the byte offset of @xfeature inside an XSAVE buffer.
 *
 * @xcomp_bv:        bitmap of the components present in a compacted buffer
 * @xfeature:        xfeature number to locate
 *
 * If @xfeature is not set in @xcomp_bv, the walk runs to the end and the
 * result is the offset just past the last component that was visited.
 */
static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
{
        unsigned int pos, nr;

        /*
         * The standard format and the legacy (FP/SSE) components always
         * live at the fixed offsets cached at boot.
         */
        if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
            xfeature <= XFEATURE_SSE)
                return xstate_offsets[xfeature];

        /*
         * In the compacted format the components named by @xcomp_bv are
         * packed one after another behind the legacy area and the header,
         * so add up everything that precedes @xfeature.
         */
        pos = FXSAVE_SIZE + XSAVE_HDR_SIZE;
        for_each_extended_xfeature(nr, xcomp_bv) {
                if (xfeature_is_aligned64(nr))
                        pos = ALIGN(pos, 64);

                if (nr == xfeature)
                        return pos;

                pos += xstate_sizes[nr];
        }

        return pos;
}

/*
 * Enable the extended processor state save/restore feature.
 * Called once per CPU onlining.
 *
 * Does nothing unless the boot CPU has XSAVE and at least one feature is
 * set in fpu_kernel_cfg.max_features. Otherwise it sets CR4.OSXSAVE,
 * writes the initial XFD state (when XFD is available), programs XCR0 with
 * the user feature mask and, with XSAVES, programs IA32_XSS with the
 * supervisor and independent feature masks. The order of these steps
 * matters, see the comments below.
 */
void fpu__init_cpu_xstate(void)
{
        if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
                return;

        cr4_set_bits(X86_CR4_OSXSAVE);

        /*
         * Must happen after CR4 setup and before xsetbv() to allow KVM
         * lazy passthrough.  Write independent of the dynamic state static
         * key as that does not work on the boot CPU. This also ensures
         * that any stale state is wiped out from XFD. Reset the per CPU
         * xfd cache too.
         */
        if (cpu_feature_enabled(X86_FEATURE_XFD))
                xfd_set_state(init_fpstate.xfd);

        /*
         * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
         * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
         * states can be set here.
         */
        xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

        /*
         * MSR_IA32_XSS sets supervisor states managed by XSAVES.
         */
        if (boot_cpu_has(X86_FEATURE_XSAVES)) {
                wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
                                     xfeatures_mask_independent());
        }
}

static bool xfeature_enabled(enum xfeature xfeature)
{
        return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
}

/*
 * Fill xstate_offsets[], xstate_sizes[] and xstate_flags[] for the legacy
 * components and for every extended xfeature that is set in
 * fpu_kernel_cfg.max_features.
 */
static void __init setup_xstate_cache(void)
{
        /* Offsets are expected to grow; the extended area is the floor. */
        unsigned int prev_offset = offsetof(struct xregs_state,
                                            extended_state_area);
        u32 size, offset, flags, unused, nr;

        /*
         * FP and SSE are the legacy components. Their place in the buffer
         * is fixed, no matter whether the compacted or the standard format
         * is in use.
         */
        xstate_offsets[XFEATURE_FP] = 0;
        xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, xmm_space);

        xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP];
        xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state,
                                                  xmm_space);

        for_each_extended_xfeature(nr, fpu_kernel_cfg.max_features) {
                cpuid_count(XSTATE_CPUID, nr, &size, &offset, &flags, &unused);

                xstate_sizes[nr] = size;
                xstate_flags[nr] = flags;

                /*
                 * EBX carries no valid offset for a supervisor component,
                 * so its xstate_offsets[] slot keeps the initial -1.
                 */
                if (!xfeature_is_supervisor(nr)) {
                        xstate_offsets[nr] = offset;

                        /*
                         * The size checks rely on the highest-numbered
                         * feature also having the highest offset. Complain
                         * if the CPU reports otherwise.
                         */
                        WARN_ONCE(prev_offset > xstate_offsets[nr],
                                  "x86/fpu: misordered xstate at %d\n", prev_offset);

                        prev_offset = xstate_offsets[nr];
                }
        }
}

/* Log one line for @xstate_mask if the system supports that feature. */
static void __init print_xstate_feature(u64 xstate_mask)
{
        const char *name;

        if (!cpu_has_xfeatures(xstate_mask, &name))
                return;

        pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, name);
}

/*
 * Log every supported xstate feature, one line per feature, in the order
 * of the table below.
 */
static void __init print_xstate_features(void)
{
        const u64 masks[] = {
                XFEATURE_MASK_FP,
                XFEATURE_MASK_SSE,
                XFEATURE_MASK_YMM,
                XFEATURE_MASK_BNDREGS,
                XFEATURE_MASK_BNDCSR,
                XFEATURE_MASK_OPMASK,
                XFEATURE_MASK_ZMM_Hi256,
                XFEATURE_MASK_Hi16_ZMM,
                XFEATURE_MASK_PKRU,
                XFEATURE_MASK_PASID,
                XFEATURE_MASK_CET_USER,
                XFEATURE_MASK_XTILE_CFG,
                XFEATURE_MASK_XTILE_DATA,
        };
        unsigned int idx;

        for (idx = 0; idx < ARRAY_SIZE(masks); idx++)
                print_xstate_feature(masks[idx]);
}

/*
 * This check is important because it is easy to get XSTATE_*
 * confused with XSTATE_BIT_*.
 *
 * Warns when @nr is not an extended xfeature number, i.e. when it is below
 * FIRST_EXTENDED_XFEATURE or not below XFEATURE_MAX. It only warns and
 * does not alter control flow. @nr is evaluated twice, so it must not have
 * side effects; it is parenthesized so that expressions can be passed.
 */
#define CHECK_XFEATURE(nr) do {                \
        WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);        \
        WARN_ON((nr) >= XFEATURE_MAX);        \
} while (0)

/*
 * Log offset and size of every extended xstate component that is set in
 * fpu_kernel_cfg.max_features.
 */
static void __init print_xstate_offset_size(void)
{
        int nr;

        for_each_extended_xfeature(nr, fpu_kernel_cfg.max_features) {
                unsigned int offset, size;

                offset = xfeature_get_offset(fpu_kernel_cfg.max_features, nr);
                size = xstate_sizes[nr];

                pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
                        nr, offset, nr, size);
        }
}

/*
 * This function is called only during boot time when x86 caps are not set
 * up and alternative can not be used yet.
 *
 * Restores @xstate into the registers for the features in
 * fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE, using XRSTORS when
 * XSAVES is enabled and XRSTOR otherwise. A failure is only reported via
 * WARN_ON_FPU().
 */
static __init void os_xrstor_booting(struct xregs_state *xstate)
{
        u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
        /* The instruction takes the feature mask as two 32-bit halves. */
        u32 lmask = mask;
        u32 hmask = mask >> 32;
        int err;

        if (cpu_feature_enabled(X86_FEATURE_XSAVES))
                XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
        else
                XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);

        /*
         * We should never fault when copying from a kernel buffer, and the FPU
         * state we set at boot time should be valid.
         */
        WARN_ON_FPU(err);
}

/*
 * All supported features have either init state all zeros or are
 * handled in setup_init_fpu_buf() individually. This is an explicit
 * feature list and does not use XFEATURE_MASK_*_SUPPORTED to catch
 * newly added supported features at build time and make people
 * actually look at the init state for the new feature.
 *
 * setup_init_fpu_buf() has a BUILD_BUG_ON() which fails the build when
 * this list differs from
 * XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED                \
        (XFEATURE_MASK_FP |                        \
         XFEATURE_MASK_SSE |                        \
         XFEATURE_MASK_YMM |                        \
         XFEATURE_MASK_OPMASK |                        \
         XFEATURE_MASK_ZMM_Hi256 |                \
         XFEATURE_MASK_Hi16_ZMM         |                \
         XFEATURE_MASK_PKRU |                        \
         XFEATURE_MASK_BNDREGS |                \
         XFEATURE_MASK_BNDCSR |                        \
         XFEATURE_MASK_PASID |                        \
         XFEATURE_MASK_CET_USER |                \
         XFEATURE_MASK_XTILE)

/*
 * setup the xstate image representing the init state
 *
 * Does nothing without XSAVE. Otherwise it logs the supported features,
 * initializes xcomp_bv of init_fpstate, loads init_fpstate into the
 * registers and finally stores the legacy (FP/SSE) part back with FXSAVE.
 */
static void __init setup_init_fpu_buf(void)
{
        /* Fails the build when a supported feature is missing from the list. */
        BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
                      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
                     XFEATURES_INIT_FPSTATE_HANDLED);

        if (!boot_cpu_has(X86_FEATURE_XSAVE))
                return;

        print_xstate_features();

        xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);

        /*
         * Init all the features state with header.xfeatures being 0x0
         */
        os_xrstor_booting(&init_fpstate.regs.xsave);

        /*
         * All components are now in init state. Read the state back so
         * that init_fpstate contains all non-zero init state. This only
         * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
         * those use the init optimization which skips writing data for
         * components in init state.
         *
         * XSAVE could be used, but that would require to reshuffle the
         * data when XSAVEC/S is available because XSAVEC/S uses xstate
         * compaction. But doing so is a pointless exercise because most
         * components have an all zeros init state except for the legacy
         * ones (FP and SSE). Those can be saved with FXSAVE into the
         * legacy area. Adding new features requires to ensure that init
         * state is all zeroes or if not to add the necessary handling
         * here.
         */
        fxsave(&init_fpstate.regs.fxsave);
}

/*
 * Query CPUID leaf XSTATE_CPUID, sub-leaf @xfeature_nr, and return its EAX
 * value, which is the size of that xstate component. Warns via
 * CHECK_XFEATURE() when @xfeature_nr is not an extended xfeature number.
 */
int xfeature_size(int xfeature_nr)
{
        u32 size, unused_b, unused_c, unused_d;

        CHECK_XFEATURE(xfeature_nr);

        cpuid_count(XSTATE_CPUID, xfeature_nr,
                    &size, &unused_b, &unused_c, &unused_d);

        return size;
}

/*
 * Sanity-check an xstate header that was supplied by userspace (ptrace or
 * sigreturn) against @fpstate.
 *
 * Returns 0 when the header is acceptable and -EINVAL otherwise.
 */
static int validate_user_xstate_header(const struct xstate_header *hdr,
                                       struct fpstate *fpstate)
{
        /*
         * If 'reserved' is shrunken to add a new field, make sure to validate
         * that new field here!
         */
        BUILD_BUG_ON(sizeof(hdr->reserved) != 48);

        /*
         * Refuse, in this order:
         *  - unknown or supervisor features in xfeatures
         *  - a non-zero xcomp_bv: userspace must use the uncompacted format
         *  - any reserved byte that is not zero
         */
        if ((hdr->xfeatures & ~fpstate->user_xfeatures) ||
            hdr->xcomp_bv ||
            memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
                return -EINVAL;

        return 0;
}

/*
 * Print the raw registers of the XSTATE_CPUID sub-leaves. Only the first
 * call prints anything; later calls return immediately.
 */
static void __init __xstate_dump_leaves(void)
{
        static bool dumped;
        u32 eax, ebx, ecx, edx;
        int leaf;

        if (dumped)
                return;
        dumped = true;

        /*
         * Go a few sub-leaves beyond the ones we support, in case there
         * is something interesting up there.
         */
        for (leaf = 0; leaf < XFEATURE_MAX + 10; leaf++) {
                cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
                pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
                        XSTATE_CPUID, leaf, eax, ebx, ecx, edx);
        }
}

/*
 * WARN_ONCE() with an "XSAVE consistency problem: " prefix. When the
 * warning fires, the XSTATE_CPUID leaves are dumped as well.
 */
#define XSTATE_WARN_ON(x, fmt, ...) do {                                        \
        if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {        \
                __xstate_dump_leaves();                                                \
        }                                                                        \
} while (0)

/*
 * Compare the CPU-reported size @sz of xfeature @nr with the size of
 * @__struct. A mismatch warns once and dumps the XSTATE_CPUID leaves.
 *
 * The expression always evaluates to true: a mismatch is reported, but it
 * is not turned into a failure for the caller.
 */
#define XCHECK_SZ(sz, nr, __struct) ({                                        \
        if (WARN_ONCE(sz != sizeof(__struct),                                \
            "[%s]: struct is %zu bytes, cpu state %d bytes\n",                \
            xfeature_names[nr], sizeof(__struct), sz)) {                \
                __xstate_dump_leaves();                                        \
        }                                                                \
        true;                                                                \
})


/**
 * check_xtile_data_against_struct - Check tile data state size.
 *
 * Calculate the state size by multiplying the single tile size which is
 * recorded in a C struct, and the number of tiles that the CPU informs.
 * Compare the provided size with the calculation.
 *
 * @size:        The tile data state size
 *
 * Returns:        0 on success, -EINVAL on mismatch.
 */
static int __init check_xtile_data_against_struct(int size)
{
        u32 max_palid, palid, state_size;
        u32 eax, ebx, ecx, edx;
        u16 max_tile;

        /*
         * Check the maximum palette id:
         *   eax: the highest numbered palette subleaf.
         */
        cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);

        /*
         * Cross-check each tile size and find the maximum number of
         * supported tiles.
         */
        for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
                u16 tile_size, max;

                /*
                 * Check the tile size info:
                 *   eax[31:16]:  bytes per tile
                 *   ebx[31:16]:  the max names (or max number of tiles)
                 *
                 * ecx and edx are not used, but each output register
                 * gets its own variable.
                 */
                cpuid_count(TILE_CPUID, palid, &eax, &ebx, &ecx, &edx);
                tile_size = eax >> 16;
                max = ebx >> 16;

                if (tile_size != sizeof(struct xtile_data)) {
                        pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
                               __stringify(XFEATURE_XTILE_DATA),
                               sizeof(struct xtile_data), tile_size);
                        __xstate_dump_leaves();
                        return -EINVAL;
                }

                if (max > max_tile)
                        max_tile = max;
        }

        /* The whole state is one struct xtile_data per tile. */
        state_size = sizeof(struct xtile_data) * max_tile;
        if (size != state_size) {
                pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
                       __stringify(XFEATURE_XTILE_DATA), state_size, size);
                __xstate_dump_leaves();
                return -EINVAL;
        }
        return 0;
}

/*
 * Every xstate component has a C struct. Verify that the size the CPU
 * reports for component @nr matches our software representation.
 *
 * Returns false only when there is no structure for @nr. A size mismatch
 * is reported by the individual checks, but still yields true.
 */
static bool __init check_xstate_against_struct(int nr)
{
        /* Size of the component according to the CPU. */
        int cpu_size = xfeature_size(nr);
        bool ok = true;

        switch (nr) {
        case XFEATURE_YMM:
                ok = XCHECK_SZ(cpu_size, nr, struct ymmh_struct);
                break;
        case XFEATURE_BNDREGS:
                ok = XCHECK_SZ(cpu_size, nr, struct mpx_bndreg_state);
                break;
        case XFEATURE_BNDCSR:
                ok = XCHECK_SZ(cpu_size, nr, struct mpx_bndcsr_state);
                break;
        case XFEATURE_OPMASK:
                ok = XCHECK_SZ(cpu_size, nr, struct avx_512_opmask_state);
                break;
        case XFEATURE_ZMM_Hi256:
                ok = XCHECK_SZ(cpu_size, nr, struct avx_512_zmm_uppers_state);
                break;
        case XFEATURE_Hi16_ZMM:
                ok = XCHECK_SZ(cpu_size, nr, struct avx_512_hi16_state);
                break;
        case XFEATURE_PKRU:
                ok = XCHECK_SZ(cpu_size, nr, struct pkru_state);
                break;
        case XFEATURE_PASID:
                ok = XCHECK_SZ(cpu_size, nr, struct ia32_pasid_state);
                break;
        case XFEATURE_XTILE_CFG:
                ok = XCHECK_SZ(cpu_size, nr, struct xtile_cfg);
                break;
        case XFEATURE_CET_USER:
                ok = XCHECK_SZ(cpu_size, nr, struct cet_user_state);
                break;
        case XFEATURE_XTILE_DATA:
                /* The return value is not used here; the check only logs. */
                check_xtile_data_against_struct(cpu_size);
                break;
        default:
                XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
                ok = false;
                break;
        }

        return ok;
}

/*
 * Calculate the size of an XSAVE buffer which holds @xfeatures.
 *
 * @xfeatures:        mask of the features the buffer has to hold
 * @compacted:        true to compute the size for the compacted format, false
 *                for the standard format
 *
 * Returns sizeof(struct xregs_state) when @xfeatures is empty or contains
 * nothing beyond the legacy FP/SSE components. Otherwise returns the
 * offset of the highest-numbered feature plus its size.
 */
static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
{
        /*
         * Signed on purpose: fls64(0) is 0, so an empty mask gives -1,
         * which must be caught by the legacy check below rather than be
         * used as an (enormous) array index.
         */
        int topmost = fls64(xfeatures) - 1;
        unsigned int offset;

        if (topmost <= (int)XFEATURE_SSE)
                return sizeof(struct xregs_state);

        /* Only index the cached tables once topmost is known to be sane. */
        if (compacted)
                offset = xfeature_get_offset(xfeatures, topmost);
        else
                offset = xstate_offsets[topmost];

        return offset + xstate_sizes[topmost];
}

/*
 * Double-check the XSAVE buffer size the CPU reported by recalculating it
 * from the cached per-feature layout.
 *
 * Independent XSAVE features allocate their own buffers and are not
 * covered here. Only the size of the buffer for task->fpu is verified.
 *
 * Returns true when every enabled feature passed its checks and the
 * recalculated size equals @kernel_size.
 */
static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
{
        bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
        bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
        unsigned int expected;
        int nr;

        for_each_extended_xfeature(nr, fpu_kernel_cfg.max_features) {
                if (!check_xstate_against_struct(nr))
                        return false;

                /* Only XSAVES can manage supervisor state components. */
                if (xsaves || !xfeature_is_supervisor(nr))
                        continue;

                XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", nr);
                return false;
        }

        expected = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
        XSTATE_WARN_ON(expected != kernel_size,
                       "size %u != kernel_size %u\n", expected, kernel_size);

        return expected == kernel_size;
}

/*
 * Get the total size of the xstates which are enabled in XCR0 | IA32_XSS.
 *
 * Mind the SDM's wording: "sub-function 0" only enumerates the size of
 * the *user* states. Sizing a buffer with it and then running 'XSAVES' on
 * that buffer could overflow it, because 'XSAVES' saves system states as
 * well.
 *
 * Compaction is taken into account, so the result is valid for XSAVEC
 * too.
 */
static unsigned int __init get_compacted_size(void)
{
        unsigned int unused_a, xsaves_size, unused_c, unused_d;

        /*
         * CPUID function 0DH, sub-function 1:
         *   EBX enumerates the size (in bytes) required by the XSAVES
         *   instruction for an XSAVE area containing all the state
         *   components corresponding to bits currently set in
         *   XCR0 | IA32_XSS.
         *
         * With XSAVEC but without XSAVES (virt) there are no supervisor
         * states, but XSAVEC still uses the compacted format.
         */
        cpuid_count(XSTATE_CPUID, 1,
                    &unused_a, &xsaves_size, &unused_c, &unused_d);

        return xsaves_size;
}

/*
 * Get the total size of the enabled xstates, leaving out the independent
 * supervisor features.
 */
static unsigned int __init get_xsave_compacted_size(void)
{
        u64 independent = xfeatures_mask_independent();
        unsigned int task_size;

        if (independent) {
                /* Take the independent features out of IA32_XSS ... */
                wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());

                /*
                 * ... so that the hardware reports the size which the
                 * task->fpu buffer requires ...
                 */
                task_size = get_compacted_size();

                /* ... and put them back so XSAVES works on them again. */
                wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | independent);
        } else {
                task_size = get_compacted_size();
        }

        return task_size;
}

/* Get the size of an XSAVE area that holds the enabled *user* states. */
static unsigned int __init get_xsave_size_user(void)
{
        unsigned int unused_a, xsave_size, unused_c, unused_d;

        /*
         * CPUID function 0DH, sub-function 0:
         *   EBX enumerates the size (in bytes) required by the XSAVE
         *   instruction for an XSAVE area containing all the *user* state
         *   components corresponding to bits currently set in XCR0.
         */
        cpuid_count(XSTATE_CPUID, 0,
                    &unused_a, &xsave_size, &unused_c, &unused_d);

        return xsave_size;
}

/*
 * Recompute the context sizes for the enabled features and store them in
 * fpu_kernel_cfg and fpu_user_cfg.
 *
 * Returns 0 on success, or -EINVAL (leaving both configs untouched) when
 * the paranoid size check fails.
 */
static int __init init_xstate_size(void)
{
        bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
        /* User space always sees the uncompacted size. */
        unsigned int user_size = get_xsave_size_user();
        unsigned int kernel_size, kernel_default_size;

        /*
         * The XSAVES kernel size covers supervisor states and uses the
         * compacted format. XSAVEC is compacted too, but saves no
         * supervisor states.
         *
         * XSAVE[OPT] cannot handle supervisor states, so the kernel size
         * is the same as the user size.
         */
        kernel_size = compacted ? get_xsave_compacted_size() : user_size;

        kernel_default_size =
                xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);

        if (!paranoid_xstate_size_valid(kernel_size))
                return -EINVAL;

        fpu_kernel_cfg.max_size = kernel_size;
        fpu_kernel_cfg.default_size = kernel_default_size;

        fpu_user_cfg.max_size = user_size;
        fpu_user_cfg.default_size =
                xstate_calculate_size(fpu_user_cfg.default_features, false);

        return 0;
}

/*
 * We enabled the XSAVE hardware, but something went wrong and
 * we can not use it.  Disable it.
 *
 * @legacy_size: size which is written back into the max and default sizes
 *               of both fpu_kernel_cfg and fpu_user_cfg
 *
 * Clears the enabled kernel feature mask, CR4.OSXSAVE and the XSAVE CPU
 * capability, zeroes init_fpstate.xfd and resets the FPU state of the
 * current task.
 */
static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
{
        fpu_kernel_cfg.max_features = 0;
        cr4_clear_bits(X86_CR4_OSXSAVE);
        setup_clear_cpu_cap(X86_FEATURE_XSAVE);

        /* Restore the legacy size.*/
        fpu_kernel_cfg.max_size = legacy_size;
        fpu_kernel_cfg.default_size = legacy_size;
        fpu_user_cfg.max_size = legacy_size;
        fpu_user_cfg.default_size = legacy_size;

        /*
         * Prevent enabling the static branch which enables writes to the
         * XFD MSR.
         */
        init_fpstate.xfd = 0;

        fpstate_reset(&current->thread.fpu);
}

/*
 * Enable and initialize the xsave feature.
 * Called once per system bootup.
 *
 * @legacy_size: size handed to fpu__init_disable_system_xstate() when the
 *               XSAVE setup has to be abandoned
 *
 * Returns early, without touching the configuration, when the CPU has no
 * FPU, no XSAVE, or a CPUID level below XSTATE_CPUID. Every later failure
 * jumps to out_disable, which switches XSAVE support off again.
 */
void __init fpu__init_system_xstate(unsigned int legacy_size)
{
        unsigned int eax, ebx, ecx, edx;
        u64 xfeatures;
        int err;
        int i;

        if (!boot_cpu_has(X86_FEATURE_FPU)) {
                pr_info("x86/fpu: No FPU detected\n");
                return;
        }

        if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
                pr_info("x86/fpu: x87 FPU will use %s\n",
                        boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
                return;
        }

        if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
                WARN_ON_FPU(1);
                return;
        }

        /*
         * Find user xstates supported by the processor.
         */
        cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
        fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);

        /*
         * Find supervisor xstates supported by the processor.
         */
        cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
        fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);

        if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
                /*
                 * This indicates that something really unexpected happened
                 * with the enumeration.  Disable XSAVE and try to continue
                 * booting without it.  This is too early to BUG().
                 */
                pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
                       fpu_kernel_cfg.max_features);
                goto out_disable;
        }

        /*
         * Clear XSAVE features that are disabled in the normal CPUID.
         */
        for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
                unsigned short cid = xsave_cpuid_features[i];

                /* Careful: X86_FEATURE_FPU is 0! */
                if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
                        fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
        }

        /* Without XFD the dynamically enabled user features are not usable. */
        if (!cpu_feature_enabled(X86_FEATURE_XFD))
                fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;

        /* Supervisor features are only kept when XSAVES is available. */
        if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
                fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
        else
                fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
                                        XFEATURE_MASK_SUPERVISOR_SUPPORTED;

        fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
        fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;

        /* Clean out dynamic features from default */
        fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
        fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

        fpu_user_cfg.default_features = fpu_user_cfg.max_features;
        fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

        /* Store it for paranoia check at the end */
        xfeatures = fpu_kernel_cfg.max_features;

        /*
         * Initialize the default XFD state in init_fpstate and enable the
         * dynamic sizing mechanism if dynamic states are available.  The
         * static key cannot be enabled here because this runs before
         * jump_label_init(). This is delayed to an initcall.
         */
        init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;

        /* Set up compaction feature bit */
        if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
            cpu_feature_enabled(X86_FEATURE_XSAVES))
                setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);

        /* Enable xstate instructions to be able to continue with initialization: */
        fpu__init_cpu_xstate();

        /* Cache size, offset and flags for initialization */
        setup_xstate_cache();

        err = init_xstate_size();
        if (err)
                goto out_disable;

        /* Reset the state for the current task */
        fpstate_reset(&current->thread.fpu);

        /*
         * Update info used for ptrace frames; use standard-format size and no
         * supervisor xstates:
         */
        update_regset_xstate_info(fpu_user_cfg.max_size,
                                  fpu_user_cfg.max_features);

        /*
         * init_fpstate excludes dynamic states as they are large but init
         * state is zero.
         */
        init_fpstate.size                = fpu_kernel_cfg.default_size;
        init_fpstate.xfeatures                = fpu_kernel_cfg.default_features;

        if (init_fpstate.size > sizeof(init_fpstate.regs)) {
                pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
                        sizeof(init_fpstate.regs), init_fpstate.size);
                goto out_disable;
        }

        setup_init_fpu_buf();

        /*
         * Paranoia check whether something in the setup modified the
         * xfeatures mask.
         */
        if (xfeatures != fpu_kernel_cfg.max_features) {
                pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
                       xfeatures, fpu_kernel_cfg.max_features);
                goto out_disable;
        }

        /*
         * CPU capabilities initialization runs before FPU init. So
         * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
         * functional, set the feature bit so depending code works.
         */
        setup_force_cpu_cap(X86_FEATURE_OSXSAVE);

        print_xstate_offset_size();
        pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
                fpu_kernel_cfg.max_features,
                fpu_kernel_cfg.max_size,
                boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
        return;

out_disable:
        /* something went wrong, try to boot without any XSAVE support */
        fpu__init_disable_system_xstate(legacy_size);
}

/*
 * Restore minimal FPU state after suspend:
 *
 * Rewrites XCR0, IA32_XSS and IA32_XFD, each only when the corresponding
 * feature (XSAVE, XSAVES, dynamic state sizing) is in use.
 */
void fpu__resume_cpu(void)
{
        /*
         * Restore XCR0 on xsave capable CPUs:
         */
        if (cpu_feature_enabled(X86_FEATURE_XSAVE))
                xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);

        /*
         * Restore IA32_XSS. The same CPUID bit enumerates support
         * of XSAVES and MSR_IA32_XSS.
         */
        if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
                wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
                                     xfeatures_mask_independent());
        }

        /* Restore XFD from the fpstate of the current task. */
        if (fpu_state_size_dynamic())
                wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
}

/*
 * Compute the location of state component @xfeature_nr inside the xsave
 * buffer @xsave. The caller is responsible for handing in a valid buffer.
 *
 * Returns NULL (after a one-time warning) when the feature is not enabled
 * or, on XCOMPACTED CPUs, when it is not listed in the buffer's XCOMP_BV.
 */
static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
        u64 xcomp_bv = xsave->header.xcomp_bv;

        if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
                return NULL;

        /* The XCOMP_BV membership check only applies to compacted format. */
        if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED) &&
            WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
                return NULL;

        return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
}

/*
 * Look up the address of one state component inside an xsave area.
 *
 * This is the API used to locate xstate data regardless of whether the
 * xsave area is in standard format or in compacted format.
 *
 * Inputs:
 *        xsave: the thread's storage area for all FPU data
 *        xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
 *        XFEATURE_SSE, etc...)
 * Output:
 *        address of the state in the xsave area, or NULL if the
 *        field is not present in the xsave buffer.
 */
void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
{
        /* Without XSAVE there is no xsave state to point into. */
        if (!boot_cpu_has(X86_FEATURE_XSAVE))
                return NULL;

        /* Features which were never enabled must not be requested. */
        if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
                return NULL;

        /*
         * The component is only handed out when the buffer header marks it
         * as present. This relies on the last 'xsave*' instruction having
         * been asked to save 'xfeature_nr'; otherwise the buffer may hold
         * an old value of the field. The bit can be clear either because
         * the last 'xsave' did not request the feature (unlikely) or
         * because the "init optimization" skipped saving it.
         */
        return (xsave->header.xfeatures & BIT_ULL(xfeature_nr)) ?
                __raw_xsave_addr(xsave, xfeature_nr) : NULL;
}
EXPORT_SYMBOL_GPL(get_xsave_addr);

#ifdef CONFIG_ARCH_HAS_PKEYS

/*
 * Modify the PKRU register so that the access rights for @pkey
 * become @init_val.
 *
 * @tsk:      not referenced by this implementation
 * @pkey:     protection key index, must be in [0, arch_max_pkey())
 * @init_val: any combination of PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE
 *
 * Returns 0 on success, -EINVAL when OSPKE is not enabled or when @pkey
 * is out of range.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                              unsigned long init_val)
{
        u32 old_pkru, new_pkru_bits = 0;
        int pkey_shift;

        /*
         * This check implies XSAVE support.  OSPKE only gets
         * set if we enable XSAVE and we enable PKU in XCR0.
         */
        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return -EINVAL;

        /*
         * This code should only be called with valid 'pkey'
         * values originating from in-kernel users.  Complain
         * if a bad value is observed.
         *
         * @pkey is signed: a negative value would otherwise produce a
         * negative shift count below, which is undefined behaviour.
         */
        if (WARN_ON_ONCE(pkey < 0 || pkey >= arch_max_pkey()))
                return -EINVAL;

        /* Set the bits we need in PKRU:  */
        if (init_val & PKEY_DISABLE_ACCESS)
                new_pkru_bits |= PKRU_AD_BIT;
        if (init_val & PKEY_DISABLE_WRITE)
                new_pkru_bits |= PKRU_WD_BIT;

        /* Shift the bits in to the correct place in PKRU for pkey: */
        pkey_shift = pkey * PKRU_BITS_PER_PKEY;
        new_pkru_bits <<= pkey_shift;

        /* Get old PKRU and mask off any old bits in place: */
        old_pkru = read_pkru();
        old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);

        /* Write old part along with new part: */
        write_pkru(old_pkru | new_pkru_bits);

        return 0;
}
#endif /* CONFIG_ARCH_HAS_PKEYS */

/*
 * Append @size bytes to @to, taken from @xstate when @from_xstate is true
 * and from @init_xstate otherwise.
 */
static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
                         void *init_xstate, unsigned int size)
{
        void *src = init_xstate;

        if (from_xstate)
                src = xstate;

        membuf_write(to, src, size);
}

/**
 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:                membuf descriptor
 * @fpstate:        The fpstate buffer from which to copy
 * @xfeatures:        The mask of xfeatures to save (XSAVE mode only)
 * @pkru_val:        The PKRU value to store in the PKRU component
 * @copy_mode:        The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
 *
 * It supports partial copy but @to.pos always starts from zero.
 *
 * Whatever space is left in @to after the copy is zero filled.
 */
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
                               u64 xfeatures, u32 pkru_val,
                               enum xstate_copy_mode copy_mode)
{
        const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
        struct xregs_state *xinit = &init_fpstate.regs.xsave;
        struct xregs_state *xsave = &fpstate->regs.xsave;
        struct xstate_header header;
        unsigned int zerofrom;
        u64 mask;
        int i;

        /*
         * Build the header on the stack: everything is zero except
         * xfeatures, which starts out as the kernel buffer's value and is
         * narrowed below.
         */
        memset(&header, 0, sizeof(header));
        header.xfeatures = xsave->header.xfeatures;

        /* Mask out the feature bits depending on copy mode */
        switch (copy_mode) {
        case XSTATE_COPY_FP:
                header.xfeatures &= XFEATURE_MASK_FP;
                break;

        case XSTATE_COPY_FX:
                header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
                break;

        case XSTATE_COPY_XSAVE:
                header.xfeatures &= fpstate->user_xfeatures & xfeatures;
                break;
        }

        /* Copy FP state up to MXCSR */
        copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
                     &xinit->i387, off_mxcsr);

        /* Copy MXCSR when SSE or YMM are set in the feature mask */
        copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
                     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
                     MXCSR_AND_FLAGS_SIZE);

        /* Copy the remaining FP state */
        copy_feature(header.xfeatures & XFEATURE_MASK_FP,
                     &to, &xsave->i387.st_space, &xinit->i387.st_space,
                     sizeof(xsave->i387.st_space));

        /* Copy the SSE state - shared with YMM, but independently managed */
        copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
                     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
                     sizeof(xsave->i387.xmm_space));

        /* The FP and FX copy modes end after the legacy area */
        if (copy_mode != XSTATE_COPY_XSAVE)
                goto out;

        /* Zero the padding area */
        membuf_zero(&to, sizeof(xsave->i387.padding));

        /* Copy xsave->i387.sw_reserved */
        membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));

        /* Copy the user space relevant state of @xsave->header */
        membuf_write(&to, &header, sizeof(header));

        zerofrom = offsetof(struct xregs_state, extended_state_area);

        /*
         * This 'mask' indicates which states to copy from fpstate.
         * Those extended states that are not present in fpstate are
         * either disabled or initialized:
         *
         * In non-compacted format, disabled features still occupy
         * state space but there is no state to copy from in the
         * compacted init_fpstate. The gap tracking will zero these
         * states.
         *
         * The extended features have an all zeroes init state. Thus,
         * remove them from 'mask' to zero those features in the user
         * buffer instead of retrieving them from init_fpstate.
         */
        mask = header.xfeatures;

        for_each_extended_xfeature(i, mask) {
                /*
                 * If there was a feature or alignment gap, zero the space
                 * in the destination buffer.
                 */
                if (zerofrom < xstate_offsets[i])
                        membuf_zero(&to, xstate_offsets[i] - zerofrom);

                if (i == XFEATURE_PKRU) {
                        struct pkru_state pkru = {0};
                        /*
                         * PKRU is not necessarily up to date in the
                         * XSAVE buffer. Use the provided value.
                         */
                        pkru.pkru = pkru_val;
                        membuf_write(&to, &pkru, sizeof(pkru));
                } else {
                        membuf_write(&to,
                                     __raw_xsave_addr(xsave, i),
                                     xstate_sizes[i]);
                }
                /*
                 * Keep track of the last copied state in the non-compacted
                 * target buffer for gap zeroing.
                 */
                zerofrom = xstate_offsets[i] + xstate_sizes[i];
        }

out:
        /* Zero whatever remains of the destination buffer */
        if (to.left)
                membuf_zero(&to, to.left);
}

/**
 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:                membuf descriptor
 * @tsk:        The task from which to copy the saved xstate
 * @copy_mode:        The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @mode. UABI XSTATE is always uncompacted!
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
                             enum xstate_copy_mode copy_mode)
{
        __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
                                  tsk->thread.fpu.fpstate->user_xfeatures,
                                  tsk->thread.pkru, copy_mode);
}

/*
 * Fetch @size bytes located @offset bytes into the source buffer and store
 * them at @dst. The source is @kbuf when that is non-NULL, otherwise the
 * user space buffer @ubuf.
 *
 * Returns 0 on success, -EFAULT if the user space copy failed.
 */
static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
                            const void *kbuf, const void __user *ubuf)
{
        if (!kbuf)
                return copy_from_user(dst, ubuf + offset, size) ? -EFAULT : 0;

        memcpy(dst, kbuf + offset, size);
        return 0;
}


/**
 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
 * @fpstate:        The fpstate buffer to copy to
 * @kbuf:        The UABI format buffer, if it comes from the kernel
 * @ubuf:        The UABI format buffer, if it comes from userspace
 * @pkru:        The location to write the PKRU value to
 *
 * Converts from the UABI format into the kernel internal hardware
 * dependent format.
 *
 * This function ultimately has three different callers with distinct PKRU
 * behavior.
 * 1.        When called from sigreturn the PKRU register will be restored from
 *        @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
 *        @fpstate is sufficient to cover this case, but the caller will also
 *        pass a pointer to the thread_struct's pkru field in @pkru and updating
 *        it is harmless.
 * 2.        When called from ptrace the PKRU register will be restored from the
 *        thread_struct's pkru field. A pointer to that is passed in @pkru.
 *        The kernel will restore it manually, so the XRSTOR behavior that resets
 *        the PKRU register to the hardware init value (0) if the corresponding
 *        xfeatures bit is not set is emulated here.
 * 3.        When called from KVM the PKRU register will be restored from the vcpu's
 *        pkru field. A pointer to that is passed in @pkru. KVM hasn't used
 *        XRSTOR and hasn't had the PKRU resetting behavior described above. To
 *        preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
 *        bit is not set.
 */
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
                               const void __user *ubuf, u32 *pkru)
{
        struct xregs_state *xsave = &fpstate->regs.xsave;
        unsigned int offset, size;
        struct xstate_header hdr;
        u64 mask;
        int i;

        /* Fetch the header first: it selects which components get copied */
        offset = offsetof(struct xregs_state, header);
        if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
                return -EFAULT;

        if (validate_user_xstate_header(&hdr, fpstate))
                return -EINVAL;

        /* Validate MXCSR when any of the related features is in use */
        mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
        if (hdr.xfeatures & mask) {
                /* mxcsr[0] is MXCSR, mxcsr[1] is MXCSR_MASK (see below) */
                u32 mxcsr[2];

                offset = offsetof(struct fxregs_state, mxcsr);
                if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
                        return -EFAULT;

                /* Reserved bits in MXCSR must be zero. */
                if (mxcsr[0] & ~mxcsr_feature_mask)
                        return -EINVAL;

                /* SSE and YMM require MXCSR even when FP is not in use. */
                if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
                        xsave->i387.mxcsr = mxcsr[0];
                        xsave->i387.mxcsr_mask = mxcsr[1];
                }
        }

        /*
         * Copy every component flagged in the incoming header from its
         * UABI offset to its location in the kernel buffer.
         */
        for (i = 0; i < XFEATURE_MAX; i++) {
                mask = BIT_ULL(i);

                if (hdr.xfeatures & mask) {
                        void *dst = __raw_xsave_addr(xsave, i);

                        offset = xstate_offsets[i];
                        size = xstate_sizes[i];

                        if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
                                return -EFAULT;
                }
        }

        if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
                struct pkru_state *xpkru;

                /*
                 * NOTE(review): @pkru is dereferenced without a NULL check
                 * in this branch; callers are expected to pass a valid
                 * pointer whenever the PKRU bit is set in the header.
                 */
                xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
                *pkru = xpkru->pkru;
        } else {
                /*
                 * KVM may pass NULL here to indicate that it does not need
                 * PKRU updated.
                 */
                if (pkru)
                        *pkru = 0;
        }

        /*
         * The state that came in from userspace was user-state only.
         * Mask all the user states out of 'xfeatures':
         */
        xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;

        /*
         * Add back in the features that came in from userspace:
         */
        xsave->header.xfeatures |= hdr.xfeatures;

        return 0;
}

/*
 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
 * format and copy to the target thread. Used by ptrace and KVM.
 *
 * Thin wrapper around copy_uabi_to_xstate() which supplies @kbuf as the
 * source and no user space buffer; the return value is passed through.
 */
int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
{
        return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
}

/*
 * Convert from a sigreturn standard-format user-space buffer to kernel
 * XSAVE[S] format and copy to the target thread. This is called from the
 * sigreturn() and rt_sigreturn() system calls.
 *
 * Thin wrapper around copy_uabi_to_xstate() which operates on @tsk's
 * fpstate, reads from @ubuf and lets the PKRU value be written to
 * @tsk->thread.pkru; the return value is passed through.
 */
int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
                                      const void __user *ubuf)
{
        return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
}

/*
 * Sanity check @mask for xsaves()/xrstors(): XSAVES must be enabled and
 * @mask must be a non-empty subset of the independent features. Violations
 * trigger a warning and make the function return false.
 */
static bool validate_independent_components(u64 mask)
{
        u64 xchk;

        if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
                return false;

        /* Every bit which is NOT an independent feature is forbidden */
        xchk = ~xfeatures_mask_independent();

        return !WARN_ON_ONCE(!mask || mask & xchk);
}

/**
 * xsaves - Save selected components to a kernel xstate buffer
 * @xstate:        Pointer to the buffer
 * @mask:        Feature mask to select the components to save
 *
 * XSAVES does not write the full xstate header, so the @xstate buffer has
 * to be 64 byte aligned and correctly initialized. Zero the buffer before
 * first use, otherwise a consecutive XRSTORS from that buffer can #GP.
 *
 * The feature mask must be a subset of the independent features. Nothing
 * is saved when the mask fails that validation.
 */
void xsaves(struct xregs_state *xstate, u64 mask)
{
        u32 lo, hi;
        int err;

        if (!validate_independent_components(mask))
                return;

        /* The instruction takes the mask as two 32-bit halves */
        lo = (u32)mask;
        hi = (u32)(mask >> 32);

        XSTATE_OP(XSAVES, xstate, lo, hi, err);
        WARN_ON_ONCE(err);
}

/**
 * xrstors - Restore selected components from a kernel xstate buffer
 * @xstate:        Pointer to the buffer
 * @mask:        Feature mask to select the components to restore
 *
 * The @xstate buffer has to be 64 byte aligned and correctly initialized,
 * otherwise XRSTORS from that buffer can #GP.
 *
 * The intended use is restoring state which was previously saved into
 * @xstate with xsaves().
 *
 * The feature mask must be a subset of the independent features. Nothing
 * is restored when the mask fails that validation.
 */
void xrstors(struct xregs_state *xstate, u64 mask)
{
        u32 lo, hi;
        int err;

        if (!validate_independent_components(mask))
                return;

        /* The instruction takes the mask as two 32-bit halves */
        lo = (u32)mask;
        hi = (u32)(mask >> 32);

        XSTATE_OP(XRSTORS, xstate, lo, hi, err);
        WARN_ON_ONCE(err);
}

#if IS_ENABLED(CONFIG_KVM)
/*
 * Zero the storage of component @xfeature in @fps. Nothing happens when
 * get_xsave_addr() reports the component as not present.
 */
void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
{
        void *component = get_xsave_addr(&fps->regs.xsave, xfeature);

        if (!component)
                return;

        memset(component, 0, xstate_sizes[xfeature]);
}
EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
#endif

#ifdef CONFIG_X86_64

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Check whether a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
 * can safely operate on the @fpstate buffer. @rstor selects the restore
 * flavour of the check.
 */
static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
{
        u64 xfd = __this_cpu_read(xfd_state);

        /* This CPU's xfd_state agrees with @fpstate: always fine */
        if (fpstate->xfd == xfd)
                return true;

        /*
         * The XFD MSR does not match fpstate->xfd. That's invalid when
         * the passed in fpstate is current's fpstate.
         */
        if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
                return false;

        /*
         * XRSTOR(S) from init_fpstate are always correct as it will just
         * bring all components into init state and not read from the
         * buffer. XSAVE(S) raises #PF after init.
         */
        if (fpstate == &init_fpstate)
                return rstor;

        /*
         * What is left:
         *   XSAVE(S): clone(), fpu_swap_kvm_fpstate()
         *   XRSTOR(S): fpu_swap_kvm_fpstate()
         *
         * Two groups of components are harmless and get dropped from @mask:
         *  - XFD-disabled ones: no XSAVE/XRSTOR instruction (except XSAVE
         *    itself) touches their buffer area.
         *  - Ones which are valid in fpstate: they have space allocated.
         *
         * Any component still remaining might be written by XSAVE/XRSTOR,
         * so validation fails if one is found.
         */
        return !(mask & ~xfd & ~fpstate->xfeatures);
}

/*
 * Debug helper: emit a one-time warning when xstate_op_valid() rejects the
 * combination of @fpstate, @mask and @rstor.
 */
void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
{
        WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
}
#endif /* CONFIG_X86_DEBUG_FPU */

/*
 * Initcall which switches on the __fpu_state_size_dynamic static branch
 * when dynamic features exist. Always returns 0.
 */
static int __init xfd_update_static_branch(void)
{
        /*
         * Bits set in init_fpstate.xfd mean that dynamic features are
         * available, which in turn requires dynamic sizing to be enabled.
         * No bits set: leave the static branch alone.
         */
        if (!init_fpstate.xfd)
                return 0;

        static_branch_enable(&__fpu_state_size_dynamic);
        return 0;
}
arch_initcall(xfd_update_static_branch)

/*
 * Release the fpstate buffer attached to @fpu with vfree(), unless there
 * is none or it is the fpstate embedded in @fpu itself (&fpu->__fpstate).
 */
void fpstate_free(struct fpu *fpu)
{
        struct fpstate *fps = fpu->fpstate;

        if (!fps || fps == &fpu->__fpstate)
                return;

        vfree(fps);
}

/**
 * fpstate_realloc - Reallocate struct fpstate for the requested new features
 *
 * @xfeatures:        A bitmap of xstate features which extend the enabled features
 *                of that task
 * @ksize:        The required size for the kernel buffer
 * @usize:        The required size for user space buffers
 * @guest_fpu:        Pointer to a guest FPU container. NULL for host allocations
 *
 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
 * with large states are likely to live longer.
 *
 * Returns: 0 on success, -ENOMEM on allocation error.
 */
static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
                           unsigned int usize, struct fpu_guest *guest_fpu)
{
        struct fpu *fpu = &current->thread.fpu;
        struct fpstate *curfps, *newfps = NULL;
        unsigned int fpsize;
        bool in_use;

        /*
         * Allocation size: the register state (@ksize) plus the part of
         * struct fpstate in front of 'regs', rounded up to 64 bytes.
         */
        fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);

        newfps = vzalloc(fpsize);
        if (!newfps)
                return -ENOMEM;
        newfps->size = ksize;
        newfps->user_size = usize;
        /* Marks the buffer as vmalloc'ed so it gets vfree()'d later (see below) */
        newfps->is_valloc = true;

        /*
         * When a guest FPU is supplied, use @guest_fpu->fpstate
         * as reference independent whether it is in use or not.
         */
        curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;

        /* Determine whether @curfps is the active fpstate */
        in_use = fpu->fpstate == curfps;

        if (guest_fpu) {
                /* Carry the guest attributes over and widen the guest's limits */
                newfps->is_guest = true;
                newfps->is_confidential = curfps->is_confidential;
                newfps->in_use = curfps->in_use;
                guest_fpu->xfeatures |= xfeatures;
                guest_fpu->uabi_size = usize;
        }

        fpregs_lock();
        /*
         * If @curfps is in use, ensure that the current state is in the
         * registers before swapping fpstate as that might invalidate it
         * due to layout changes.
         */
        if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
                fpregs_restore_userregs();

        /* New feature sets: old ones plus @xfeatures; XFD no longer covers them */
        newfps->xfeatures = curfps->xfeatures | xfeatures;
        newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
        newfps->xfd = curfps->xfd & ~xfeatures;

        /* Do the final updates within the locked region */
        xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);

        if (guest_fpu) {
                guest_fpu->fpstate = newfps;
                /* If curfps is active, update the FPU fpstate pointer */
                if (in_use)
                        fpu->fpstate = newfps;
        } else {
                fpu->fpstate = newfps;
        }

        /* Only sync XFD state when the replaced fpstate was the active one */
        if (in_use)
                xfd_update_state(fpu->fpstate);
        fpregs_unlock();

        /* Only free valloc'ed state */
        if (curfps && curfps->is_valloc)
                vfree(curfps);

        return 0;
}

/*
 * Check whether every thread of the current thread group which has an
 * alternate signal stack configured (non-zero sas_ss_size) can still hold
 * a signal frame when the user xstate size becomes @usize.
 *
 * Must be called with current's siglock held.
 *
 * Returns 0 if all stacks are large enough, -ENOSPC otherwise.
 */
static int validate_sigaltstack(unsigned int usize)
{
        struct task_struct *leader = current->group_leader;
        struct task_struct *t;
        unsigned long needed;

        lockdep_assert_held(&current->sighand->siglock);

        /*
         * get_sigframe_size() is based on fpu_user_cfg.max_size, so swap
         * that portion for the prospective @usize.
         */
        needed = get_sigframe_size();
        needed = needed - fpu_user_cfg.max_size + usize;

        for_each_thread(leader, t) {
                /* No alternate stack registered for this thread */
                if (!t->sas_ss_size)
                        continue;

                if (t->sas_ss_size < needed)
                        return -ENOSPC;
        }

        return 0;
}

/*
 * Extend the permission set @permitted by @requested for the host (@guest
 * false) or guest (@guest true) side of the current thread group and
 * record the resulting kernel and user buffer sizes.
 *
 * Returns 0 on success, or the validate_sigaltstack() error for host
 * requests whose signal frame would not fit.
 */
static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
{
        /*
         * This deliberately does not exclude !XSAVES as we still might
         * decide to optionally context switch XCR0 or talk the silicon
         * vendors into extending XFD for the pre AMX states, especially
         * AVX512.
         */
        bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
        struct fpu *fpu = &current->group_leader->thread.fpu;
        struct fpu_state_perm *perm;
        unsigned int ksize, usize;
        u64 mask;

        /* Nothing to do when everything requested is already permitted */
        if (!(requested & ~permitted))
                return 0;

        /*
         * Kernel state size: based on the union of old and new permissions;
         * on the host the supervisor states are accounted as well.
         */
        mask = permitted | requested;
        if (!guest)
                mask |= xfeatures_mask_supervisor();
        ksize = xstate_calculate_size(mask, compacted);

        /* User state size: user supported features, never compacted */
        mask &= XFEATURE_MASK_USER_SUPPORTED;
        usize = xstate_calculate_size(mask, false);

        if (guest) {
                perm = &fpu->guest_perm;
        } else {
                int err = validate_sigaltstack(usize);

                if (err)
                        return err;
                perm = &fpu->perm;
        }

        /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
        WRITE_ONCE(perm->__state_perm, mask);
        /* Protected by sighand lock */
        perm->__state_size = ksize;
        perm->__user_state_size = usize;

        return 0;
}

/*
 * Permissions array to map facilities with more than one component.
 *
 * Indexed by the facility number; the value is the feature mask to
 * request. Entries not listed here are zero.
 */
static const u64 xstate_prctl_req[XFEATURE_MAX] = {
        [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
};

/*
 * Request permission for facility number @idx on behalf of the host
 * (@guest false) or guest (@guest true) side of the current thread group.
 *
 * Returns 0 if the permission is (already) granted, -EINVAL for an out of
 * range @idx, -EOPNOTSUPP for unknown or unsupported facilities, -EBUSY if
 * the guest permissions are locked, or the __xstate_request_perm() error.
 */
static int xstate_request_perm(unsigned long idx, bool guest)
{
        u64 permitted, requested;
        int ret = -EBUSY;

        if (idx >= XFEATURE_MAX)
                return -EINVAL;

        /*
         * Look up the facility mask which can require more than
         * one xstate component.
         */
        idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
        requested = xstate_prctl_req[idx];

        /* Unknown facility, or one this system does not fully support */
        if (!requested || (fpu_user_cfg.max_features & requested) != requested)
                return -EOPNOTSUPP;

        /* Lockless quick check */
        permitted = xstate_get_group_perm(guest);
        if (!(requested & ~permitted))
                return 0;

        /* Protect against concurrent modifications */
        spin_lock_irq(&current->sighand->siglock);
        permitted = xstate_get_group_perm(guest);

        /* First vCPU allocation locks the permissions. */
        if (!guest || !(permitted & FPU_GUEST_PERM_LOCKED))
                ret = __xstate_request_perm(permitted, requested, guest);

        spin_unlock_irq(&current->sighand->siglock);
        return ret;
}

/*
 * Enable the dynamic user features flagged in @xfd_err for the current
 * task, or for @guest_fpu when that is non-NULL, by reallocating the
 * fpstate with the sizes recorded in the group permissions.
 *
 * Returns 0 on success or when @xfd_err carries no dynamic user feature,
 * -EPERM when the feature is not permitted, -EFAULT when the fpstate
 * reallocation failed.
 */
int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
{
        u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
        unsigned int ksize = 0, usize = 0;
        bool permitted;

        if (!xfd_event) {
                if (!guest_fpu)
                        pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
                return 0;
        }

        /* Protect against concurrent modifications */
        spin_lock_irq(&current->sighand->siglock);

        permitted = (xstate_get_group_perm(!!guest_fpu) & xfd_event) == xfd_event;
        if (permitted) {
                struct fpu *fpu = &current->group_leader->thread.fpu;
                struct fpu_state_perm *perm;

                if (guest_fpu)
                        perm = &fpu->guest_perm;
                else
                        perm = &fpu->perm;

                ksize = perm->__state_size;
                usize = perm->__user_state_size;
        }

        /*
         * Dropping the lock is safe here even if more features are added
         * from another task: for a permitted feature the retrieved buffer
         * sizes are sufficient and stay valid for the currently requested
         * feature(s).
         */
        spin_unlock_irq(&current->sighand->siglock);

        /* If not permitted let it die */
        if (!permitted)
                return -EPERM;

        /*
         * Try to allocate a new fpstate. If that fails there is no way
         * out.
         */
        return fpstate_realloc(xfd_event, ksize, usize, guest_fpu) ? -EFAULT : 0;
}

/*
 * Host variant of __xfd_enable_feature(): no guest FPU container is
 * supplied. The return value is passed through unchanged.
 */
int xfd_enable_feature(u64 xfd_err)
{
        return __xfd_enable_feature(xfd_err, NULL);
}

#else /* CONFIG_X86_64 */
/*
 * Stub for builds without CONFIG_X86_64: permission requests are always
 * rejected with -EPERM.
 */
static inline int xstate_request_perm(unsigned long idx, bool guest)
{
        return -EPERM;
}
#endif  /* !CONFIG_X86_64 */

/*
 * Exported accessor which returns xstate_get_group_perm() for the guest
 * side (guest == true).
 */
u64 xstate_get_guest_group_perm(void)
{
        return xstate_get_group_perm(true);
}
EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);

/**
 * fpu_xstate_prctl - xstate permission operations
 * @option:        A subfunction of arch_prctl()
 * @arg2:        option argument
 * Return:        0 if successful; otherwise, an error code
 *
 * Option arguments:
 *
 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
 * ARCH_GET_XCOMP_GUEST_PERM: Pointer to user space u64 to store the info
 * ARCH_REQ_XCOMP_PERM: Facility number requested
 * ARCH_REQ_XCOMP_GUEST_PERM: Facility number requested
 *
 * For facilities which require more than one XSTATE component, the request
 * must be the highest state component number related to that facility,
 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
 *
 * Unknown options yield -EINVAL. The request options yield -EOPNOTSUPP
 * when the kernel is not built with CONFIG_X86_64.
 */
long fpu_xstate_prctl(int option, unsigned long arg2)
{
        u64 __user *uptr = (u64 __user *)arg2;
        u64 value;

        switch (option) {
        case ARCH_GET_XCOMP_SUPP:
                value = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
                return put_user(value, uptr);

        case ARCH_GET_XCOMP_PERM:
                /*
                 * Lockless snapshot: taking the lock would not help as the
                 * permissions can change right after dropping it anyway.
                 */
                value = xstate_get_host_group_perm() & XFEATURE_MASK_USER_SUPPORTED;
                return put_user(value, uptr);

        case ARCH_GET_XCOMP_GUEST_PERM:
                value = xstate_get_guest_group_perm() & XFEATURE_MASK_USER_SUPPORTED;
                return put_user(value, uptr);

        case ARCH_REQ_XCOMP_PERM:
        case ARCH_REQ_XCOMP_GUEST_PERM:
                if (!IS_ENABLED(CONFIG_X86_64))
                        return -EOPNOTSUPP;

                /* Here @arg2 is a facility number, not a pointer */
                return xstate_request_perm(arg2,
                                           option == ARCH_REQ_XCOMP_GUEST_PERM);

        default:
                return -EINVAL;
        }
}

#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
 * Emit the "AVX512_elapsed_ms" line for @task into @m: the time in
 * milliseconds since the recorded avx512_timestamp, or -1 when no
 * timestamp has been recorded.
 */
static void avx512_status(struct seq_file *m, struct task_struct *task)
{
        unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
        /* -1 is reported if there was no AVX512 usage */
        long delta = -1;

        if (timestamp) {
                delta = (long)(jiffies - timestamp);

                /* Cap to LONG_MAX if time difference > LONG_MAX */
                if (delta < 0)
                        delta = LONG_MAX;

                delta = jiffies_to_msecs(delta);
        }

        seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
        seq_putc(m, '\n');
}

/*
 * Report architecture specific information for @task into @m. Currently
 * that is the AVX512 status only. Always returns 0.
 */
int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
{
        /*
         * The AVX512 state is only reported if both the processor and
         * the build options support it.
         */
        if (!cpu_feature_enabled(X86_FEATURE_AVX512F))
                return 0;

        avx512_status(m, task);
        return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
















   50 






    2 




   55 







   34 





   34 





   42 












   45 



   45 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
// SPDX-License-Identifier: GPL-2.0
#include <linux/memblock.h>
#include <linux/mmdebug.h>
#include <linux/export.h>
#include <linux/mm.h>

#include <asm/page.h>
#include <linux/vmalloc.h>

#include "physaddr.h"

#ifdef CONFIG_X86_64

#ifdef CONFIG_DEBUG_VIRTUAL
/*
 * Debug variant of the virtual-to-physical translation for x86_64.
 *
 * @x is either an address in the kernel image mapping (at or above
 * __START_KERNEL_map) or an address at or above PAGE_OFFSET. Anything else
 * trips VIRTUAL_BUG_ON().
 */
unsigned long __phys_addr(unsigned long x)
{
        unsigned long off = x - __START_KERNEL_map;
        unsigned long phys;

        /*
         * The subtraction wraps when x < __START_KERNEL_map, which makes
         * off larger than x. So "x > off" means no wrap happened, i.e. x
         * lies in the kernel image mapping.
         */
        if (unlikely(x > off)) {
                VIRTUAL_BUG_ON(off >= KERNEL_IMAGE_SIZE);
                return off + phys_base;
        }

        phys = off + (__START_KERNEL_map - PAGE_OFFSET);

        /*
         * This addition wraps (carry) exactly when the starting x was
         * >= PAGE_OFFSET; without the wrap phys ends up above off.
         */
        VIRTUAL_BUG_ON((phys > off) || !phys_addr_valid(phys));

        return phys;
}
EXPORT_SYMBOL(__phys_addr);

/*
 * Debug variant of the translation for addresses inside the kernel image
 * mapping: returns the offset of @x from __START_KERNEL_map plus phys_base.
 */
unsigned long __phys_addr_symbol(unsigned long x)
{
        unsigned long image_off = x - __START_KERNEL_map;

        /*
         * An x below __START_KERNEL_map wraps to a huge offset, so the
         * upper-bound check alone catches both ends of the range.
         */
        VIRTUAL_BUG_ON(image_off >= KERNEL_IMAGE_SIZE);

        return phys_base + image_off;
}
EXPORT_SYMBOL(__phys_addr_symbol);
#endif

/*
 * Return true when @x translates (using the same arithmetic as
 * __phys_addr()) to a physical address whose page frame passes pfn_valid().
 */
bool __virt_addr_valid(unsigned long x)
{
        unsigned long off = x - __START_KERNEL_map;
        unsigned long phys;

        /*
         * The subtraction wraps when x < __START_KERNEL_map, so "x > off"
         * selects addresses at or above __START_KERNEL_map.
         */
        if (unlikely(x > off)) {
                if (off >= KERNEL_IMAGE_SIZE)
                        return false;

                phys = off + phys_base;
        } else {
                phys = off + (__START_KERNEL_map - PAGE_OFFSET);

                /* the addition wraps only if the starting x was >= PAGE_OFFSET */
                if ((phys > off) || !phys_addr_valid(phys))
                        return false;
        }

        return pfn_valid(phys >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);

#else

#ifdef CONFIG_DEBUG_VIRTUAL
/*
 * Debug variant of the virtual-to-physical translation for the non-64-bit
 * build: returns @x - PAGE_OFFSET after sanity-checking the address.
 */
unsigned long __phys_addr(unsigned long x)
{
        unsigned long pa = x - PAGE_OFFSET;

        /* VMALLOC_* aren't constants, hence the run-time checks */
        VIRTUAL_BUG_ON(x < PAGE_OFFSET);
        VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));

        /* max_low_pfn is set early, but not _that_ early: skip until known */
        if (!max_low_pfn)
                return pa;

        VIRTUAL_BUG_ON((pa >> PAGE_SHIFT) > max_low_pfn);
        BUG_ON(slow_virt_to_phys((void *)x) != pa);

        return pa;
}
EXPORT_SYMBOL(__phys_addr);
#endif

/*
 * Return true when @x is at or above PAGE_OFFSET, below FIXADDR_START, not
 * a vmalloc address (once __vmalloc_start_set is set), and the page frame
 * of @x - PAGE_OFFSET passes pfn_valid().
 */
bool __virt_addr_valid(unsigned long x)
{
        bool out_of_range = x < PAGE_OFFSET ||
                            (__vmalloc_start_set && is_vmalloc_addr((void *) x)) ||
                            x >= FIXADDR_START;

        if (out_of_range)
                return false;

        return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);

#endif        /* CONFIG_X86_64 */


































































   50 

















   24 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Prevent the compiler from merging or refetching reads or writes. The
 * compiler is also forbidden from reordering successive instances of
 * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some
 * particular ordering. One way to make the compiler aware of ordering is to
 * put the two invocations of READ_ONCE or WRITE_ONCE in different C
 * statements.
 *
 * These two macros will also work on aggregate data types like structs or
 * unions.
 *
 * Their two major use cases are: (1) Mediating communication between
 * process-level code and irq/NMI handlers, all running on the same CPU,
 * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
 * mutilate accesses that either do not require ordering or that interact
 * with an explicit memory barrier or atomic instruction that provides the
 * required ordering.
 */
#ifndef __ASM_GENERIC_RWONCE_H
#define __ASM_GENERIC_RWONCE_H

#ifndef __ASSEMBLY__

#include <linux/compiler_types.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>

/*
 * Build-time check used by READ_ONCE() and WRITE_ONCE(): the access is
 * accepted when t is a native word type or exactly sizeof(long long) wide;
 * any other size fails the build with the message below.
 *
 * Yes, this permits 64-bit accesses on 32-bit architectures. These will
 * actually be atomic in some cases (namely Armv7 + LPAE), but for others we
 * rely on the access being split into 2x32-bit accesses for a 32-bit quantity
 * (e.g. a virtual address) and a strong prevailing wind.
 */
#define compiletime_assert_rwonce_type(t)                                        \
        compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),        \
                "Unsupported access size for {READ,WRITE}_ONCE().")

/*
 * Use __READ_ONCE() instead of READ_ONCE() if you do not require any
 * atomicity. Note that this may result in tears!
 *
 * The default definition reads x through a const volatile pointer to its
 * unqualified scalar type and performs no size check. It is only provided
 * when __READ_ONCE has not already been defined before this point.
 */
#ifndef __READ_ONCE
#define __READ_ONCE(x)        (*(const volatile __unqual_scalar_typeof(x) *)&(x))
#endif

/*
 * READ_ONCE(x): check the access size of x at build time with
 * compiletime_assert_rwonce_type(), then evaluate to __READ_ONCE(x).
 * Implemented as a statement expression so it can be used as a value.
 */
#define READ_ONCE(x)                                                        \
({                                                                        \
        compiletime_assert_rwonce_type(x);                                \
        __READ_ONCE(x);                                                        \
})

/*
 * __WRITE_ONCE(x, val): store val into x through a volatile-qualified
 * pointer to typeof(x). No access-size check is performed here; the
 * do/while (0) wrapper makes the macro usable as a single statement.
 */
#define __WRITE_ONCE(x, val)                                                \
do {                                                                        \
        *(volatile typeof(x) *)&(x) = (val);                                \
} while (0)

/*
 * WRITE_ONCE(x, val): check the access size of x at build time with
 * compiletime_assert_rwonce_type(), then perform __WRITE_ONCE(x, val).
 */
#define WRITE_ONCE(x, val)                                                \
do {                                                                        \
        compiletime_assert_rwonce_type(x);                                \
        __WRITE_ONCE(x, val);                                                \
} while (0)

/*
 * Load one unsigned long from @addr using __READ_ONCE(). Helper for
 * READ_ONCE_NOCHECK(); the __no_sanitize_or_inline annotation is applied
 * to the function as a whole.
 */
static __no_sanitize_or_inline
unsigned long __read_once_word_nocheck(const void *addr)
{
        unsigned long *word = (unsigned long *)addr;

        return __READ_ONCE(*word);
}

/*
 * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a
 * word from memory atomically but without telling KASAN/KCSAN. This is
 * usually used by unwinding code when walking the stack of a running process.
 *
 * Only objects exactly sizeof(unsigned long) wide are accepted (checked at
 * build time); the word returned by __read_once_word_nocheck() is cast
 * back to typeof(x).
 */
#define READ_ONCE_NOCHECK(x)                                                \
({                                                                        \
        compiletime_assert(sizeof(x) == sizeof(unsigned long),                \
                "Unsupported access size for READ_ONCE_NOCHECK().");        \
        (typeof(x))__read_once_word_nocheck(&(x));                        \
})

/*
 * Read a whole unsigned long starting at @addr with a plain (non-volatile)
 * load. Only the first byte is passed to kasan_check_read(); the function
 * itself carries the __no_kasan_or_inline annotation.
 */
static __no_kasan_or_inline
unsigned long read_word_at_a_time(const void *addr)
{
        unsigned long *word = (unsigned long *)addr;

        kasan_check_read(addr, 1);
        return *word;
}

#endif /* __ASSEMBLY__ */
#endif        /* __ASM_GENERIC_RWONCE_H */




























    2 
    1 


    1 





















































































    4 


    4 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008 IBM Corporation
 *
 * Authors:
 * Mimi Zohar <zohar@us.ibm.com>
 *
 * File: ima_iint.c
 *        - implements the IMA hook: ima_inode_free
 *        - cache integrity information in the inode security blob
 */
#include <linux/slab.h>

#include "ima.h"

static struct kmem_cache *ima_iint_cache __ro_after_init;

/**
 * ima_iint_find - Return the iint associated with an inode
 * @inode: Pointer to the inode
 *
 * Look up the IMA integrity information (iint) attached to @inode. Inodes
 * that are not flagged as IMA inodes (IS_IMA() is false) have none.
 *
 * Return: Found iint or NULL.
 */
struct ima_iint_cache *ima_iint_find(struct inode *inode)
{
        return IS_IMA(inode) ? ima_inode_get_iint(inode) : NULL;
}

/* Number of distinct lockdep classes: one per possible s_stack_depth value. */
#define IMA_MAX_NESTING (FILESYSTEM_MAX_STACK_DEPTH + 1)

/*
 * It is not clear that IMA should be nested at all, but as long as it measures
 * files both on overlayfs and on underlying fs, we need to annotate the iint
 * mutex to avoid lockdep false positives related to IMA + overlayfs.
 * See ovl_lockdep_annotate_inode_mutex_key() for more details.
 *
 * The lock class for @iint's mutex is picked by the stacking depth of the
 * superblock @inode belongs to. Compiles to an empty function when
 * CONFIG_LOCKDEP is not set.
 */
static inline void ima_iint_lockdep_annotate(struct ima_iint_cache *iint,
                                             struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
        /* One lock class key per stacking depth. */
        static struct lock_class_key ima_iint_mutex_key[IMA_MAX_NESTING];

        int depth = inode->i_sb->s_stack_depth;

        /* Warn once about an out-of-range depth and fall back to class 0. */
        if (WARN_ON_ONCE(depth < 0 || depth >= IMA_MAX_NESTING))
                depth = 0;

        lockdep_set_class(&iint->mutex, &ima_iint_mutex_key[depth]);
#endif
}

/*
 * Put a freshly allocated @iint into its initial state for @inode: no hash,
 * cleared flags and PCR mask, unknown status everywhere, and an initialised
 * mutex carrying the lockdep annotation for @inode.
 */
static void ima_iint_init_always(struct ima_iint_cache *iint,
                                 struct inode *inode)
{
        /* All five *_status fields start out as INTEGRITY_UNKNOWN. */
        iint->ima_file_status = INTEGRITY_UNKNOWN;
        iint->ima_mmap_status = INTEGRITY_UNKNOWN;
        iint->ima_bprm_status = INTEGRITY_UNKNOWN;
        iint->ima_read_status = INTEGRITY_UNKNOWN;
        iint->ima_creds_status = INTEGRITY_UNKNOWN;

        iint->ima_hash = NULL;
        iint->measured_pcrs = 0;
        iint->real_inode.version = 0;
        iint->flags = 0UL;
        iint->atomic_flags = 0UL;

        /* The mutex must exist before its lockdep class can be set. */
        mutex_init(&iint->mutex);
        ima_iint_lockdep_annotate(iint, inode);
}

/*
 * Release everything owned by @iint (its mutex and its hash buffer) and
 * hand the object itself back to ima_iint_cache. @iint must not be NULL.
 */
static void ima_iint_free(struct ima_iint_cache *iint)
{
        mutex_destroy(&iint->mutex);
        kfree(iint->ima_hash);

        kmem_cache_free(ima_iint_cache, iint);
}

/**
 * ima_inode_get - Find or allocate an iint associated with an inode
 * @inode: Pointer to the inode
 *
 * Return the iint already attached to @inode if there is one. Otherwise
 * allocate a new iint from ima_iint_cache (GFP_NOFS), initialise it, mark
 * the inode with S_IMA and attach the iint to it.
 * Caller must lock i_mutex.
 *
 * Return: An iint on success, NULL on error.
 */
struct ima_iint_cache *ima_inode_get(struct inode *inode)
{
        struct ima_iint_cache *iint = ima_iint_find(inode);

        if (!iint) {
                iint = kmem_cache_alloc(ima_iint_cache, GFP_NOFS);
                if (iint) {
                        ima_iint_init_always(iint, inode);

                        inode->i_flags |= S_IMA;
                        ima_inode_set_iint(inode, iint);
                }
        }

        return iint;
}

/**
 * ima_inode_free - Called on inode free
 * @inode: Pointer to the inode
 *
 * Detach and free the iint associated with an inode. Does nothing when
 * ima_iint_find() reports no iint for @inode, which includes every inode
 * that is not flagged with S_IMA.
 */
void ima_inode_free(struct inode *inode)
{
        struct ima_iint_cache *iint = ima_iint_find(inode);

        /*
         * ima_iint_free() dereferences its argument unconditionally, so a
         * missing iint must be filtered out here rather than passed on.
         */
        if (!iint)
                return;

        ima_inode_set_iint(inode, NULL);

        ima_iint_free(iint);
}

/*
 * Constructor passed to kmem_cache_create(): zero the whole object.
 * @foo points to a struct ima_iint_cache.
 */
static void ima_iint_init_once(void *foo)
{
        struct ima_iint_cache *iint = foo;

        memset(iint, 0, sizeof(struct ima_iint_cache));
}

/*
 * Create the slab cache that backs every struct ima_iint_cache, with
 * ima_iint_init_once() as its constructor. SLAB_PANIC is passed, so the
 * result is stored without a NULL check.
 */
void __init ima_iintcache_init(void)
{
        ima_iint_cache =
            kmem_cache_create("ima_iint_cache", sizeof(struct ima_iint_cache),
                              0, SLAB_PANIC, ima_iint_init_once);
}





















    1 





    1 













    1 



























    1 































    1 











































    1 





































































    1 



    1 






    1 










































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
// SPDX-License-Identifier: GPL-2.0
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/utime.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
#include <asm/unistd.h>
#include <linux/filelock.h>

/*
 * Return true when @nsec is an acceptable tv_nsec value: one of the special
 * markers UTIME_OMIT / UTIME_NOW, or a nanosecond count in [0, 999999999].
 */
static bool nsec_valid(long nsec)
{
        switch (nsec) {
        case UTIME_OMIT:
        case UTIME_NOW:
                return true;
        default:
                return 0 <= nsec && nsec < 1000000000;
        }
}

int vfs_utimes(const struct path *path, struct timespec64 *times)
{
        int error;
        struct iattr newattrs;
        struct inode *inode = path->dentry->d_inode;
        struct inode *delegated_inode = NULL;

        if (times) {
                if (!nsec_valid(times[0].tv_nsec) ||
                    !nsec_valid(times[1].tv_nsec))
                        return -EINVAL;
                if (times[0].tv_nsec == UTIME_NOW &&
                    times[1].tv_nsec == UTIME_NOW)
                        times = NULL;
        }

        error = mnt_want_write(path->mnt);
        if (error)
                goto out;

        newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
        if (times) {
                if (times[0].tv_nsec == UTIME_OMIT)
                        newattrs.ia_valid &= ~ATTR_ATIME;
                else if (times[0].tv_nsec != UTIME_NOW) {
                        newattrs.ia_atime = times[0];
                        newattrs.ia_valid |= ATTR_ATIME_SET;
                }

                if (times[1].tv_nsec == UTIME_OMIT)
                        newattrs.ia_valid &= ~ATTR_MTIME;
                else if (times[1].tv_nsec != UTIME_NOW) {
                        newattrs.ia_mtime = times[1];
                        newattrs.ia_valid |= ATTR_MTIME_SET;
                }
                /*
                 * Tell setattr_prepare(), that this is an explicit time
                 * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET
                 * were used.
                 */
                newattrs.ia_valid |= ATTR_TIMES_SET;
        } else {
                newattrs.ia_valid |= ATTR_TOUCH;
        }
retry_deleg:
        inode_lock(inode);
        error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs,
                              &delegated_inode);
        inode_unlock(inode);
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }

        mnt_drop_write(path->mnt);
out:
        return error;
}

/*
 * Path-based half of do_utimes(): resolve @filename relative to @dfd and
 * apply @times to the result.
 *
 * Only AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH are accepted in @flags; any
 * other bit yields -EINVAL. When retry_estale() asks for it, the lookup is
 * repeated with LOOKUP_REVAL added.
 */
static int do_utimes_path(int dfd, const char __user *filename,
                struct timespec64 *times, int flags)
{
        struct path path;
        int lookup_flags = 0, error;

        if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;
        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;

        for (;;) {
                error = user_path_at(dfd, filename, lookup_flags, &path);
                if (error)
                        return error;

                error = vfs_utimes(&path, times);
                path_put(&path);

                if (!retry_estale(error, lookup_flags))
                        return error;

                lookup_flags |= LOOKUP_REVAL;
        }
}

static int do_utimes_fd(int fd, struct timespec64 *times, int flags)
{
        struct fd f;
        int error;

        if (flags)
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;
        error = vfs_utimes(&f.file->f_path, times);
        fdput(f);
        return error;
}

/*
 * do_utimes - change times on filename or file descriptor
 * @dfd: open file descriptor, -1 or AT_FDCWD
 * @filename: path name or NULL
 * @times: new times or NULL
 * @flags: zero or more flags; the path-based variant accepts
 *         AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH, the descriptor-based
 *         variant accepts none
 *
 * If filename is NULL and dfd is not AT_FDCWD, operate on the file that
 * dfd refers to. Otherwise look up filename, possibly using dfd as a
 * starting point.
 *
 * If times==NULL, set access and modification to current time,
 * must be owner or have write permission.
 * Else, update from *times, must be owner or super user.
 */
long do_utimes(int dfd, const char __user *filename, struct timespec64 *times,
               int flags)
{
        bool by_fd = !filename && dfd != AT_FDCWD;

        if (by_fd)
                return do_utimes_fd(dfd, times, flags);

        return do_utimes_path(dfd, filename, times, flags);
}

/*
 * utimensat(2): copy the optional pair of user timespecs into kernel
 * timespec64 values and pass them to do_utimes(). Returns -EFAULT when the
 * copy fails and 0 right away when both entries are UTIME_OMIT.
 */
SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
                struct __kernel_timespec __user *, utimes, int, flags)
{
        struct timespec64 tstimes[2];

        if (!utimes)
                return do_utimes(dfd, filename, NULL, flags);

        if (get_timespec64(&tstimes[0], &utimes[0]) ||
            get_timespec64(&tstimes[1], &utimes[1]))
                return -EFAULT;

        /* Nothing to do, we must not even check the path.  */
        if (tstimes[0].tv_nsec == UTIME_OMIT &&
            tstimes[1].tv_nsec == UTIME_OMIT)
                return 0;

        return do_utimes(dfd, filename, tstimes, flags);
}

#ifdef __ARCH_WANT_SYS_UTIME
/*
 * futimesat(), utimes() and utime() are older versions of utimensat()
 * that are provided for compatibility with traditional C libraries.
 * On modern architectures, we always use libc wrappers around
 * utimensat() instead.
 *
 * do_futimesat() copies an optional pair of microsecond-resolution
 * timevals from user space, converts them to timespec64 and calls
 * do_utimes() with no flags.
 */
static long do_futimesat(int dfd, const char __user *filename,
                         struct __kernel_old_timeval __user *utimes)
{
        struct __kernel_old_timeval times[2];
        struct timespec64 tstimes[2];
        int i;

        if (!utimes)
                return do_utimes(dfd, filename, NULL, 0);

        if (copy_from_user(&times, utimes, sizeof(times)))
                return -EFAULT;

        /*
         * The range check has to happen on the microsecond values: after
         * the multiplication by 1000 an invalid value could be truncated
         * into something do_utimes() would accept. It also rejects
         * UTIME_{NOW,OMIT}, which are only valid for utimensat.
         */
        for (i = 0; i < 2; i++) {
                if (times[i].tv_usec >= 1000000 || times[i].tv_usec < 0)
                        return -EINVAL;
        }

        for (i = 0; i < 2; i++) {
                tstimes[i].tv_sec = times[i].tv_sec;
                tstimes[i].tv_nsec = 1000 * times[i].tv_usec;
        }

        return do_utimes(dfd, filename, tstimes, 0);
}


/*
 * futimesat system call: forwards its arguments unchanged to
 * do_futimesat() and returns its result.
 */
SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
                struct __kernel_old_timeval __user *, utimes)
{
        return do_futimesat(dfd, filename, utimes);
}

/*
 * utimes system call: same as futimesat with AT_FDCWD as the directory
 * descriptor.
 */
SYSCALL_DEFINE2(utimes, char __user *, filename,
                struct __kernel_old_timeval __user *, utimes)
{
        return do_futimesat(AT_FDCWD, filename, utimes);
}

/*
 * utime system call: second-resolution times read from a user utimbuf
 * (actime, modtime); the nanosecond parts are set to zero. A NULL @times
 * is passed through to do_utimes() as NULL. Returns -EFAULT when reading
 * the user buffer fails.
 */
SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
{
        struct timespec64 tv[2];

        if (!times)
                return do_utimes(AT_FDCWD, filename, NULL, 0);

        if (get_user(tv[0].tv_sec, &times->actime) ||
            get_user(tv[1].tv_sec, &times->modtime))
                return -EFAULT;

        tv[0].tv_nsec = 0;
        tv[1].tv_nsec = 0;

        return do_utimes(AT_FDCWD, filename, tv, 0);
}
#endif

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Not all architectures have sys_utime, so implement this in terms
 * of sys_utimes.
 */
#ifdef __ARCH_WANT_SYS_UTIME32
/*
 * utime32 system call: like utime, but the times come from a user
 * old_utimbuf32. Nanosecond parts are set to zero; a NULL @t is passed
 * through to do_utimes() as NULL. Returns -EFAULT when reading the user
 * buffer fails.
 */
SYSCALL_DEFINE2(utime32, const char __user *, filename,
                struct old_utimbuf32 __user *, t)
{
        struct timespec64 tv[2];

        if (!t)
                return do_utimes(AT_FDCWD, filename, NULL, 0);

        if (get_user(tv[0].tv_sec, &t->actime) ||
            get_user(tv[1].tv_sec, &t->modtime))
                return -EFAULT;

        tv[0].tv_nsec = 0;
        tv[1].tv_nsec = 0;

        return do_utimes(AT_FDCWD, filename, tv, 0);
}
#endif

/*
 * utimensat_time32 system call: utimensat for callers passing a pair of
 * old_timespec32 values. Returns -EFAULT when the copy from user space
 * fails and 0 right away when both entries are UTIME_OMIT.
 */
SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags)
{
        struct timespec64 tv[2];

        if (!t)
                return do_utimes(dfd, filename, NULL, flags);

        if (get_old_timespec32(&tv[0], &t[0]) ||
            get_old_timespec32(&tv[1], &t[1]))
                return -EFAULT;

        /* Both times omitted: nothing to do. */
        if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
                return 0;

        return do_utimes(dfd, filename, tv, flags);
}

#ifdef __ARCH_WANT_SYS_UTIME32
/*
 * Shared body of futimesat_time32 and utimes_time32: read an optional pair
 * of old_timeval32 values from user space, validate the microsecond parts,
 * convert them to nanoseconds and call do_utimes() with no flags.
 *
 * Returns -EFAULT when a user read fails and -EINVAL when a microsecond
 * value is outside [0, 999999].
 */
static long do_compat_futimesat(unsigned int dfd, const char __user *filename,
                                struct old_timeval32 __user *t)
{
        struct timespec64 tv[2];
        int i;

        if (!t)
                return do_utimes(dfd, filename, NULL, 0);

        /* tv_nsec temporarily holds the microsecond value read from user. */
        if (get_user(tv[0].tv_sec, &t[0].tv_sec) ||
            get_user(tv[0].tv_nsec, &t[0].tv_usec) ||
            get_user(tv[1].tv_sec, &t[1].tv_sec) ||
            get_user(tv[1].tv_nsec, &t[1].tv_usec))
                return -EFAULT;

        for (i = 0; i < 2; i++) {
                if (tv[i].tv_nsec >= 1000000 || tv[i].tv_nsec < 0)
                        return -EINVAL;
        }

        /* microseconds -> nanoseconds */
        for (i = 0; i < 2; i++)
                tv[i].tv_nsec *= 1000;

        return do_utimes(dfd, filename, tv, 0);
}

/*
 * futimesat_time32 system call: forwards its arguments unchanged to
 * do_compat_futimesat() and returns its result.
 */
SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd,
                       const char __user *, filename,
                       struct old_timeval32 __user *, t)
{
        return do_compat_futimesat(dfd, filename, t);
}

/*
 * utimes_time32 system call: same as futimesat_time32 with AT_FDCWD as the
 * directory descriptor.
 */
SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t)
{
        return do_compat_futimesat(AT_FDCWD, filename, t);
}
#endif
#endif
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   30 












































   12 


























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * security/tomoyo/common.h
 *
 * Header file for TOMOYO.
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#ifndef _SECURITY_TOMOYO_COMMON_H
#define _SECURITY_TOMOYO_COMMON_H

/* Identity definition: pr_fmt() expands to its format argument unchanged. */
#define pr_fmt(fmt) fmt

#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/kmod.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/list.h>
#include <linux/cred.h>
#include <linux/poll.h>
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/un.h>
#include <linux/lsm_hooks.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/udp.h>

/********** Constants definitions. **********/

/*
 * TOMOYO uses this hash only when appending a string to the string
 * table. Strings are appended very rarely, so a large (e.g. 64k) hash
 * size is unnecessary; 256 buckets (1 << TOMOYO_HASH_BITS) are sufficient.
 */
#define TOMOYO_HASH_BITS  8
#define TOMOYO_MAX_HASH (1u<<TOMOYO_HASH_BITS)

/*
 * TOMOYO checks only SOCK_STREAM, SOCK_DGRAM, SOCK_RAW, SOCK_SEQPACKET.
 * Therefore, we don't need SOCK_MAX; this smaller bound is used instead.
 */
#define TOMOYO_SOCK_MAX 6

/*
 * Size in bytes of a temporary buffer.
 * NOTE(review): presumably used while handling execve() requests, judging
 * by the name - confirm against the users of this constant.
 */
#define TOMOYO_EXEC_TMPSIZE     4096

/* Marker value: the garbage collector is trying to kfree() this element. */
#define TOMOYO_GC_IN_PROGRESS -1

/* Profile number is an integer between 0 and 255. */
#define TOMOYO_MAX_PROFILES 256

/* Group number is an integer between 0 and 255. */
#define TOMOYO_MAX_ACL_GROUPS 256

/*
 * Index numbers for "struct tomoyo_condition".
 *
 * The keyword entries carry their numeric value explicitly; these are the
 * same values the compiler assigns by position.
 */
enum tomoyo_conditions_index {
        /* Identifiers of the current task. */
        TOMOYO_TASK_UID = 0, /* current_uid() */
        TOMOYO_TASK_EUID = 1, /* current_euid() */
        TOMOYO_TASK_SUID = 2, /* current_suid() */
        TOMOYO_TASK_FSUID = 3, /* current_fsuid() */
        TOMOYO_TASK_GID = 4, /* current_gid() */
        TOMOYO_TASK_EGID = 5, /* current_egid() */
        TOMOYO_TASK_SGID = 6, /* current_sgid() */
        TOMOYO_TASK_FSGID = 7, /* current_fsgid() */
        TOMOYO_TASK_PID = 8, /* sys_getpid() */
        TOMOYO_TASK_PPID = 9, /* sys_getppid() */
        /* Argument and environment counts of an execve() request. */
        TOMOYO_EXEC_ARGC = 10, /* "struct linux_binprm *"->argc */
        TOMOYO_EXEC_ENVC = 11, /* "struct linux_binprm *"->envc */
        /* File type constants. */
        TOMOYO_TYPE_IS_SOCKET = 12, /* S_IFSOCK */
        TOMOYO_TYPE_IS_SYMLINK = 13, /* S_IFLNK */
        TOMOYO_TYPE_IS_FILE = 14, /* S_IFREG */
        TOMOYO_TYPE_IS_BLOCK_DEV = 15, /* S_IFBLK */
        TOMOYO_TYPE_IS_DIRECTORY = 16, /* S_IFDIR */
        TOMOYO_TYPE_IS_CHAR_DEV = 17, /* S_IFCHR */
        TOMOYO_TYPE_IS_FIFO = 18, /* S_IFIFO */
        /* File mode bit constants. */
        TOMOYO_MODE_SETUID = 19, /* S_ISUID */
        TOMOYO_MODE_SETGID = 20, /* S_ISGID */
        TOMOYO_MODE_STICKY = 21, /* S_ISVTX */
        TOMOYO_MODE_OWNER_READ = 22, /* S_IRUSR */
        TOMOYO_MODE_OWNER_WRITE = 23, /* S_IWUSR */
        TOMOYO_MODE_OWNER_EXECUTE = 24, /* S_IXUSR */
        TOMOYO_MODE_GROUP_READ = 25, /* S_IRGRP */
        TOMOYO_MODE_GROUP_WRITE = 26, /* S_IWGRP */
        TOMOYO_MODE_GROUP_EXECUTE = 27, /* S_IXGRP */
        TOMOYO_MODE_OTHERS_READ = 28, /* S_IROTH */
        TOMOYO_MODE_OTHERS_WRITE = 29, /* S_IWOTH */
        TOMOYO_MODE_OTHERS_EXECUTE = 30, /* S_IXOTH */
        TOMOYO_EXEC_REALPATH = 31,
        TOMOYO_SYMLINK_TARGET = 32,
        /* PATH1 attributes. */
        TOMOYO_PATH1_UID = 33,
        TOMOYO_PATH1_GID = 34,
        TOMOYO_PATH1_INO = 35,
        TOMOYO_PATH1_MAJOR = 36,
        TOMOYO_PATH1_MINOR = 37,
        TOMOYO_PATH1_PERM = 38,
        TOMOYO_PATH1_TYPE = 39,
        TOMOYO_PATH1_DEV_MAJOR = 40,
        TOMOYO_PATH1_DEV_MINOR = 41,
        /* PATH2 attributes. */
        TOMOYO_PATH2_UID = 42,
        TOMOYO_PATH2_GID = 43,
        TOMOYO_PATH2_INO = 44,
        TOMOYO_PATH2_MAJOR = 45,
        TOMOYO_PATH2_MINOR = 46,
        TOMOYO_PATH2_PERM = 47,
        TOMOYO_PATH2_TYPE = 48,
        TOMOYO_PATH2_DEV_MAJOR = 49,
        TOMOYO_PATH2_DEV_MINOR = 50,
        /* PATH1_PARENT attributes. */
        TOMOYO_PATH1_PARENT_UID = 51,
        TOMOYO_PATH1_PARENT_GID = 52,
        TOMOYO_PATH1_PARENT_INO = 53,
        TOMOYO_PATH1_PARENT_PERM = 54,
        /* PATH2_PARENT attributes. */
        TOMOYO_PATH2_PARENT_UID = 55,
        TOMOYO_PATH2_PARENT_GID = 56,
        TOMOYO_PATH2_PARENT_INO = 57,
        TOMOYO_PATH2_PARENT_PERM = 58,
        /* Number of keyword entries listed above. */
        TOMOYO_MAX_CONDITION_KEYWORD,
        /* Remaining entries continue numbering after the sentinel. */
        TOMOYO_NUMBER_UNION,
        TOMOYO_NAME_UNION,
        TOMOYO_ARGV_ENTRY,
        TOMOYO_ENVP_ENTRY,
};


/*
 * Index numbers for stat().
 * The order of the entries must not be changed.
 */
enum tomoyo_path_stat_index {
        TOMOYO_PATH1 = 0,
        TOMOYO_PATH1_PARENT = 1,
        TOMOYO_PATH2 = 2,
        TOMOYO_PATH2_PARENT = 3,
        /* Number of entries above. */
        TOMOYO_MAX_PATH_STAT
};

/* Index numbers for operation mode. */
enum tomoyo_mode_index {
        TOMOYO_CONFIG_DISABLED = 0,
        TOMOYO_CONFIG_LEARNING = 1,
        TOMOYO_CONFIG_PERMISSIVE = 2,
        TOMOYO_CONFIG_ENFORCING = 3,
        /* Number of modes above. */
        TOMOYO_CONFIG_MAX_MODE,
        /* Values outside the mode range: 64, 128 and 255. */
        TOMOYO_CONFIG_WANT_REJECT_LOG = 0x40,
        TOMOYO_CONFIG_WANT_GRANT_LOG = 0x80,
        TOMOYO_CONFIG_USE_DEFAULT = 0xff,
};

/* Index numbers for entry type. */
enum tomoyo_policy_id {
        TOMOYO_ID_GROUP = 0,
        TOMOYO_ID_ADDRESS_GROUP = 1,
        TOMOYO_ID_PATH_GROUP = 2,
        TOMOYO_ID_NUMBER_GROUP = 3,
        TOMOYO_ID_TRANSITION_CONTROL = 4,
        TOMOYO_ID_AGGREGATOR = 5,
        TOMOYO_ID_MANAGER = 6,
        TOMOYO_ID_CONDITION = 7,
        TOMOYO_ID_NAME = 8,
        TOMOYO_ID_ACL = 9,
        TOMOYO_ID_DOMAIN = 10,
        /* Number of entries above. */
        TOMOYO_MAX_POLICY
};

/* Index numbers for domain's attributes. */
enum tomoyo_domain_info_flags_index {
        /* Quota warning flag. */
        TOMOYO_DIF_QUOTA_WARNED = 0,
        /*
         * tomoyo_find_next_domain() could not create a new domain from this
         * domain, either because the name of the domain to be created was
         * too long or because memory could not be allocated.
         * More than one process continued execve() without domain transition.
         */
        TOMOYO_DIF_TRANSITION_FAILED = 1,
        /* Number of flags above. */
        TOMOYO_MAX_DOMAIN_INFO_FLAGS
};

/* Index numbers for audit type. */
enum tomoyo_grant_log {
        TOMOYO_GRANTLOG_AUTO = 0, /* Follow profile's configuration. */
        TOMOYO_GRANTLOG_NO = 1, /* Do not generate grant log. */
        TOMOYO_GRANTLOG_YES = 2, /* Generate grant log. */
};

/* Index numbers for group entries. */
enum tomoyo_group_id {
        TOMOYO_PATH_GROUP = 0,
        TOMOYO_NUMBER_GROUP = 1,
        TOMOYO_ADDRESS_GROUP = 2,
        /* Number of group kinds above. */
        TOMOYO_MAX_GROUP
};

/* Index numbers for type of numeric values. */
enum tomoyo_value_type {
        TOMOYO_VALUE_TYPE_INVALID = 0,
        TOMOYO_VALUE_TYPE_DECIMAL = 1,
        TOMOYO_VALUE_TYPE_OCTAL = 2,
        TOMOYO_VALUE_TYPE_HEXADECIMAL = 3,
};

/*
 * Index numbers for domain transition control keywords.
 * The order of the entries must not be changed.
 */
enum tomoyo_transition_type {
        TOMOYO_TRANSITION_CONTROL_NO_RESET = 0,
        TOMOYO_TRANSITION_CONTROL_RESET = 1,
        TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE = 2,
        TOMOYO_TRANSITION_CONTROL_INITIALIZE = 3,
        TOMOYO_TRANSITION_CONTROL_NO_KEEP = 4,
        TOMOYO_TRANSITION_CONTROL_KEEP = 5,
        /* Number of keywords above. */
        TOMOYO_MAX_TRANSITION_TYPE
};

/* Index numbers for Access Controls. */
enum tomoyo_acl_entry_type_index {
        TOMOYO_TYPE_PATH_ACL = 0,
        TOMOYO_TYPE_PATH2_ACL = 1,
        TOMOYO_TYPE_PATH_NUMBER_ACL = 2,
        TOMOYO_TYPE_MKDEV_ACL = 3,
        TOMOYO_TYPE_MOUNT_ACL = 4,
        TOMOYO_TYPE_INET_ACL = 5,
        TOMOYO_TYPE_UNIX_ACL = 6,
        TOMOYO_TYPE_ENV_ACL = 7,
        TOMOYO_TYPE_MANUAL_TASK_ACL = 8,
};

/* Index numbers for access controls with one pathname. */
enum tomoyo_path_acl_index {
        TOMOYO_TYPE_EXECUTE = 0,
        TOMOYO_TYPE_READ = 1,
        TOMOYO_TYPE_WRITE = 2,
        TOMOYO_TYPE_APPEND = 3,
        TOMOYO_TYPE_UNLINK = 4,
        TOMOYO_TYPE_GETATTR = 5,
        TOMOYO_TYPE_RMDIR = 6,
        TOMOYO_TYPE_TRUNCATE = 7,
        TOMOYO_TYPE_SYMLINK = 8,
        TOMOYO_TYPE_CHROOT = 9,
        TOMOYO_TYPE_UMOUNT = 10,
        /* Number of operations above. */
        TOMOYO_MAX_PATH_OPERATION
};

/* Index numbers for /sys/kernel/security/tomoyo/stat interface. */
enum tomoyo_memory_stat_type {
        TOMOYO_MEMORY_POLICY = 0,
        TOMOYO_MEMORY_AUDIT = 1,
        TOMOYO_MEMORY_QUERY = 2,
        /* Number of entries above. */
        TOMOYO_MAX_MEMORY_STAT
};

/* Index numbers for the mkblock and mkchar operations. */
enum tomoyo_mkdev_acl_index {
        TOMOYO_TYPE_MKBLOCK = 0,
        TOMOYO_TYPE_MKCHAR = 1,
        /* Number of operations above. */
        TOMOYO_MAX_MKDEV_OPERATION
};

/* Index numbers for socket operations. */
enum tomoyo_network_acl_index {
        TOMOYO_NETWORK_BIND = 0, /* bind() operation. */
        TOMOYO_NETWORK_LISTEN = 1, /* listen() operation. */
        TOMOYO_NETWORK_CONNECT = 2, /* connect() operation. */
        TOMOYO_NETWORK_SEND = 3, /* send() operation. */
        /* Number of operations above. */
        TOMOYO_MAX_NETWORK_OPERATION
};

/* Index numbers for access controls with two pathnames. */
enum tomoyo_path2_acl_index {
        TOMOYO_TYPE_LINK = 0,
        TOMOYO_TYPE_RENAME = 1,
        TOMOYO_TYPE_PIVOT_ROOT = 2,
        /* Number of operations above. */
        TOMOYO_MAX_PATH2_OPERATION
};

/* Index numbers for access controls with one pathname and one number. */
enum tomoyo_path_number_acl_index {
        TOMOYO_TYPE_CREATE = 0,
        TOMOYO_TYPE_MKDIR = 1,
        TOMOYO_TYPE_MKFIFO = 2,
        TOMOYO_TYPE_MKSOCK = 3,
        TOMOYO_TYPE_IOCTL = 4,
        TOMOYO_TYPE_CHMOD = 5,
        TOMOYO_TYPE_CHOWN = 6,
        TOMOYO_TYPE_CHGRP = 7,
        /* Number of operations above. */
        TOMOYO_MAX_PATH_NUMBER_OPERATION
};

/* Index numbers for /sys/kernel/security/tomoyo/ interfaces. */
enum tomoyo_securityfs_interface_index {
        TOMOYO_DOMAINPOLICY = 0,
        TOMOYO_EXCEPTIONPOLICY = 1,
        TOMOYO_PROCESS_STATUS = 2,
        TOMOYO_STAT = 3,
        TOMOYO_AUDIT = 4,
        TOMOYO_VERSION = 5,
        TOMOYO_PROFILE = 6,
        TOMOYO_QUERY = 7,
        TOMOYO_MANAGER = 8
};

/* Index numbers for special mount operations. */
enum tomoyo_special_mount {
        TOMOYO_MOUNT_BIND = 0, /* mount --bind /source /dest */
        TOMOYO_MOUNT_MOVE = 1, /* mount --move /old /new */
        TOMOYO_MOUNT_REMOUNT = 2, /* mount -o remount /dir */
        TOMOYO_MOUNT_MAKE_UNBINDABLE = 3, /* mount --make-unbindable /dir */
        TOMOYO_MOUNT_MAKE_PRIVATE = 4, /* mount --make-private /dir */
        TOMOYO_MOUNT_MAKE_SLAVE = 5, /* mount --make-slave /dir */
        TOMOYO_MOUNT_MAKE_SHARED = 6, /* mount --make-shared /dir */
        /* Number of operations above. */
        TOMOYO_MAX_SPECIAL_MOUNT
};

/* Index numbers for functionality. */
enum tomoyo_mac_index {
        /* File entries. */
        TOMOYO_MAC_FILE_EXECUTE = 0,
        TOMOYO_MAC_FILE_OPEN = 1,
        TOMOYO_MAC_FILE_CREATE = 2,
        TOMOYO_MAC_FILE_UNLINK = 3,
        TOMOYO_MAC_FILE_GETATTR = 4,
        TOMOYO_MAC_FILE_MKDIR = 5,
        TOMOYO_MAC_FILE_RMDIR = 6,
        TOMOYO_MAC_FILE_MKFIFO = 7,
        TOMOYO_MAC_FILE_MKSOCK = 8,
        TOMOYO_MAC_FILE_TRUNCATE = 9,
        TOMOYO_MAC_FILE_SYMLINK = 10,
        TOMOYO_MAC_FILE_MKBLOCK = 11,
        TOMOYO_MAC_FILE_MKCHAR = 12,
        TOMOYO_MAC_FILE_LINK = 13,
        TOMOYO_MAC_FILE_RENAME = 14,
        TOMOYO_MAC_FILE_CHMOD = 15,
        TOMOYO_MAC_FILE_CHOWN = 16,
        TOMOYO_MAC_FILE_CHGRP = 17,
        TOMOYO_MAC_FILE_IOCTL = 18,
        TOMOYO_MAC_FILE_CHROOT = 19,
        TOMOYO_MAC_FILE_MOUNT = 20,
        TOMOYO_MAC_FILE_UMOUNT = 21,
        TOMOYO_MAC_FILE_PIVOT_ROOT = 22,
        /* Network (inet) entries. */
        TOMOYO_MAC_NETWORK_INET_STREAM_BIND = 23,
        TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN = 24,
        TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT = 25,
        TOMOYO_MAC_NETWORK_INET_DGRAM_BIND = 26,
        TOMOYO_MAC_NETWORK_INET_DGRAM_SEND = 27,
        TOMOYO_MAC_NETWORK_INET_RAW_BIND = 28,
        TOMOYO_MAC_NETWORK_INET_RAW_SEND = 29,
        /* Network (unix) entries. */
        TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND = 30,
        TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN = 31,
        TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT = 32,
        TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND = 33,
        TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND = 34,
        TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND = 35,
        TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN = 36,
        TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT = 37,
        /* Environment variable entry. */
        TOMOYO_MAC_ENVIRON = 38,
        /* Number of entries above. */
        TOMOYO_MAX_MAC_INDEX
};

/* Index numbers for category of functionality. */
enum tomoyo_mac_category_index {
        TOMOYO_MAC_CATEGORY_FILE = 0,
        TOMOYO_MAC_CATEGORY_NETWORK = 1,
        TOMOYO_MAC_CATEGORY_MISC = 2,
        /* Number of categories above. */
        TOMOYO_MAX_MAC_CATEGORY_INDEX
};

/*
 * "Retry this request" result of tomoyo_supervisor(): a policy violation
 * occurred in enforcing mode and the userspace daemon decided to retry.
 *
 * The value has to be positive so that "retry" cannot be confused with
 * "granted" (0) or "rejected" (a negative value).
 */
#define TOMOYO_RETRY_REQUEST 1

/*
 * Index numbers for /sys/kernel/security/tomoyo/stat interface.
 * The order of the entries must not be changed.
 */
enum tomoyo_policy_stat_type {
        TOMOYO_STAT_POLICY_UPDATES = 0,
        TOMOYO_STAT_POLICY_LEARNING = 1, /* == TOMOYO_CONFIG_LEARNING */
        TOMOYO_STAT_POLICY_PERMISSIVE = 2, /* == TOMOYO_CONFIG_PERMISSIVE */
        TOMOYO_STAT_POLICY_ENFORCING = 3, /* == TOMOYO_CONFIG_ENFORCING */
        /* Number of entries above. */
        TOMOYO_MAX_POLICY_STAT
};

/* Index numbers for profile's PREFERENCE values. */
enum tomoyo_pref_index {
        TOMOYO_PREF_MAX_AUDIT_LOG = 0,
        TOMOYO_PREF_MAX_LEARNING_ENTRY = 1,
        /* Number of preference values above. */
        TOMOYO_MAX_PREF
};

/********** Structure definitions. **********/

/*
 * Common header for holding ACL entries.
 *
 * Several structures in this file (e.g. "struct tomoyo_path_group",
 * "struct tomoyo_aggregator", "struct tomoyo_manager") embed this as their
 * first member, named "head".
 */
struct tomoyo_acl_head {
        /* List linkage of the entry. */
        struct list_head list;
        /* Signed type: TOMOYO_GC_IN_PROGRESS is -1. */
        s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */
} __packed;

/*
 * Common header for shared entries.
 *
 * Embedded as the first member ("head") of "struct tomoyo_name",
 * "struct tomoyo_group" and "struct tomoyo_condition".
 */
struct tomoyo_shared_acl_head {
        /* List linkage of the entry. */
        struct list_head list;
        /*
         * User counter of the shared entry.
         * NOTE(review): presumably a reference count; confirm against the
         * code that increments and decrements it.
         */
        atomic_t users;
} __packed;

/* Forward declaration; the structure itself is defined later in this file. */
struct tomoyo_policy_namespace;

/*
 * Structure for request info.
 *
 * Carries the parameters of one access request; the operation-specific part
 * lives in the @param union.
 */
struct tomoyo_request_info {
        /*
         * For holding parameters specific to operations which deal with
         * files. NULL if not dealing with files.
         */
        struct tomoyo_obj_info *obj;
        /*
         * For holding parameters specific to execve() request.
         * NULL if not dealing with execve().
         */
        struct tomoyo_execve *ee;
        /* Domain associated with this request. */
        struct tomoyo_domain_info *domain;
        /* For holding parameters. The members share storage (union). */
        union {
                /* Parameters for an operation with one pathname. */
                struct {
                        const struct tomoyo_path_info *filename;
                        /* For using wildcards at tomoyo_find_next_domain(). */
                        const struct tomoyo_path_info *matched_path;
                        /* One of values in "enum tomoyo_path_acl_index". */
                        u8 operation;
                } path;
                /* Parameters for an operation with two pathnames. */
                struct {
                        const struct tomoyo_path_info *filename1;
                        const struct tomoyo_path_info *filename2;
                        /* One of values in "enum tomoyo_path2_acl_index". */
                        u8 operation;
                } path2;
                /* Parameters for mkblock/mkchar: pathname, mode, device. */
                struct {
                        const struct tomoyo_path_info *filename;
                        unsigned int mode;
                        unsigned int major;
                        unsigned int minor;
                        /* One of values in "enum tomoyo_mkdev_acl_index". */
                        u8 operation;
                } mkdev;
                /* Parameters for an operation with a pathname and a number. */
                struct {
                        const struct tomoyo_path_info *filename;
                        unsigned long number;
                        /*
                         * One of values in
                         * "enum tomoyo_path_number_acl_index".
                         */
                        u8 operation;
                } path_number;
                /* Parameter for an environment variable check. */
                struct {
                        const struct tomoyo_path_info *name;
                } environ;
                /* Parameters for an inet socket operation. */
                struct {
                        /* Points at __be32 data, i.e. big endian words. */
                        const __be32 *address;
                        u16 port;
                        /* One of values smaller than TOMOYO_SOCK_MAX. */
                        u8 protocol;
                        /* One of values in "enum tomoyo_network_acl_index". */
                        u8 operation;
                        bool is_ipv6;
                } inet_network;
                /* Parameters for a unix socket operation. */
                struct {
                        const struct tomoyo_path_info *address;
                        /* One of values smaller than TOMOYO_SOCK_MAX. */
                        u8 protocol;
                        /* One of values in "enum tomoyo_network_acl_index". */
                        u8 operation;
                } unix_network;
                /* Parameters for a mount operation. */
                struct {
                        const struct tomoyo_path_info *type;
                        const struct tomoyo_path_info *dir;
                        const struct tomoyo_path_info *dev;
                        unsigned long flags;
                        int need_dev;
                } mount;
                /* Parameter for a task (domain name) request. */
                struct {
                        const struct tomoyo_path_info *domainname;
                } task;
        } param;
        /*
         * ACL entry recorded for this request.
         * NOTE(review): when it is set is not visible in this header.
         */
        struct tomoyo_acl_info *matched_acl;
        /*
         * NOTE(review): presumably one of "enum tomoyo_acl_entry_type_index",
         * selecting the active @param member - confirm against the users.
         */
        u8 param_type;
        bool granted;
        u8 retry;
        /* Profile number (u8, see TOMOYO_MAX_PROFILES). */
        u8 profile;
        u8 mode; /* One of tomoyo_mode_index . */
        /*
         * NOTE(review): presumably one of "enum tomoyo_mac_index" - confirm
         * against tomoyo_init_request_info().
         */
        u8 type;
};

/*
 * Structure for holding a token: the string itself plus values derived from
 * it. Each member comment names the expression the member is equal to.
 */
struct tomoyo_path_info {
        /* The token string. */
        const char *name;
        u32 hash;          /* = full_name_hash(name, strlen(name)) */
        u16 const_len;     /* = tomoyo_const_part_length(name)     */
        bool is_dir;       /* = tomoyo_strendswith(name, "/")      */
        bool is_patterned; /* = tomoyo_path_contains_pattern(name) */
};

/*
 * Structure for holding string data: a shared-entry header followed by the
 * token it describes.
 */
struct tomoyo_name {
        /* Shared-entry header (list linkage and user counter). */
        struct tomoyo_shared_acl_head head;
        /* The string and the values derived from it. */
        struct tomoyo_path_info entry;
};

/*
 * Structure for holding a word, given either directly as a name or
 * indirectly as a group.
 */
struct tomoyo_name_union {
        /* Either @filename or @group is NULL. */
        const struct tomoyo_path_info *filename;
        struct tomoyo_group *group;
};

/* Structure for holding a number. */
struct tomoyo_number_union {
        /*
         * Two numeric values.
         * NOTE(review): presumably the lower and upper bound of a range
         * (cf. the min/max parameters of tomoyo_number_matches_group()) -
         * confirm.
         */
        unsigned long values[2];
        struct tomoyo_group *group; /* May be NULL. */
        /* One of values in "enum tomoyo_value_type", one per @values slot. */
        u8 value_type[2];
};

/* Structure for holding an IP address. */
struct tomoyo_ipaddr_union {
        /*
         * Two addresses, stored as "struct in6_addr".
         * NOTE(review): presumably the start and end of an address range -
         * confirm against the parser.
         */
        struct in6_addr ip[2]; /* Big endian. */
        struct tomoyo_group *group; /* Pointer to address group. */
        bool is_ipv6; /* Valid only if @group == NULL. */
};

/* Structure for "path_group"/"number_group"/"address_group" directive. */
struct tomoyo_group {
        /* Shared-entry header (list linkage and user counter). */
        struct tomoyo_shared_acl_head head;
        /* Name of this group. */
        const struct tomoyo_path_info *group_name;
        /*
         * List of the members of this group.
         * NOTE(review): presumably links "struct tomoyo_path_group",
         * "struct tomoyo_number_group" or "struct tomoyo_address_group"
         * entries depending on the group kind - confirm.
         */
        struct list_head member_list;
};

/* Structure for "path_group" directive: one pathname entry. */
struct tomoyo_path_group {
        /* Common ACL header (list linkage and deletion state). */
        struct tomoyo_acl_head head;
        /* The pathname held by this entry. */
        const struct tomoyo_path_info *member_name;
};

/* Structure for "number_group" directive: one number entry. */
struct tomoyo_number_group {
        /* Common ACL header (list linkage and deletion state). */
        struct tomoyo_acl_head head;
        /* The number held by this entry. */
        struct tomoyo_number_union number;
};

/* Structure for "address_group" directive: one address entry. */
struct tomoyo_address_group {
        /* Common ACL header (list linkage and deletion state). */
        struct tomoyo_acl_head head;
        /* Structure for holding an IP address. */
        struct tomoyo_ipaddr_union address;
};

/*
 * Subset of "struct stat". Used by conditional ACL and audit logs.
 * "struct tomoyo_obj_info" keeps one of these per
 * "enum tomoyo_path_stat_index" slot.
 */
struct tomoyo_mini_stat {
        kuid_t uid;   /* Owner user ID. */
        kgid_t gid;   /* Owner group ID. */
        ino_t ino;    /* Inode number. */
        umode_t mode; /* File mode. */
        dev_t dev;    /* Device number. */
        dev_t rdev;   /* Second device number; NOTE(review): presumably as st_rdev. */
};

/* Structure for dumping argv[] and envp[] of "struct linux_binprm". */
struct tomoyo_page_dump {
        struct page *page;    /* Previously dumped page. */
        char *data;           /* Contents of "page". Size is PAGE_SIZE. */
};

/*
 * Structure for attribute checks in addition to pathname checks.
 *
 * @stat_valid[] and @stat[] both have TOMOYO_MAX_PATH_STAT slots, i.e. one
 * per "enum tomoyo_path_stat_index" value.
 */
struct tomoyo_obj_info {
        /*
         * True if tomoyo_get_attributes() was already called, false otherwise.
         */
        bool validate_done;
        /* True if the corresponding @stat[] slot is valid. */
        bool stat_valid[TOMOYO_MAX_PATH_STAT];
        /* First pathname. Initialized with { NULL, NULL } if no path. */
        struct path path1;
        /* Second pathname. Initialized with { NULL, NULL } if no path. */
        struct path path2;
        /*
         * Information on @path1, @path1's parent directory, @path2, @path2's
         * parent directory.
         */
        struct tomoyo_mini_stat stat[TOMOYO_MAX_PATH_STAT];
        /*
         * Content of symbolic link to be created. NULL for operations other
         * than symlink().
         */
        struct tomoyo_path_info *symlink_target;
};

/*
 * Structure for argv[]. Attached to "struct tomoyo_condition" for
 * TOMOYO_ARGV_ENTRY operands.
 */
struct tomoyo_argv {
        /* Position within argv[]. */
        unsigned long index;
        /* Value for that position. */
        const struct tomoyo_path_info *value;
        /* NOTE(review): presumably true when the comparison is negated. */
        bool is_not;
};

/*
 * Structure for envp[]. Attached to "struct tomoyo_condition" for
 * TOMOYO_ENVP_ENTRY operands.
 */
struct tomoyo_envp {
        /* Name of the environment variable. */
        const struct tomoyo_path_info *name;
        /* Value for that variable. */
        const struct tomoyo_path_info *value;
        /* NOTE(review): presumably true when the comparison is negated. */
        bool is_not;
};

/*
 * Structure for execve() operation. Bundles the request info, the object
 * info and the scratch state used while handling one execve().
 */
struct tomoyo_execve {
        /* Request info; "struct tomoyo_request_info" has an @ee back pointer. */
        struct tomoyo_request_info r;
        /* Attribute information for the pathnames involved. */
        struct tomoyo_obj_info obj;
        /* The binprm of the execve() being handled. */
        struct linux_binprm *bprm;
        /* NOTE(review): presumably the domain transition target - confirm. */
        const struct tomoyo_path_info *transition;
        /* For dumping argv[] and envp[]. */
        struct tomoyo_page_dump dump;
        /* For temporary use. */
        char *tmp; /* Size is TOMOYO_EXEC_TMPSIZE bytes */
};

/*
 * Structure for entries which follow "struct tomoyo_condition".
 * @left and @right are u8; the values named in the comments below come from
 * "enum tomoyo_conditions_index".
 */
struct tomoyo_condition_element {
        /*
         * Left hand operand. A "struct tomoyo_argv" for TOMOYO_ARGV_ENTRY, a
         * "struct tomoyo_envp" for TOMOYO_ENVP_ENTRY is attached to the tail
         * of the array of this struct.
         */
        u8 left;
        /*
         * Right hand operand. A "struct tomoyo_number_union" for
         * TOMOYO_NUMBER_UNION, a "struct tomoyo_name_union" for
         * TOMOYO_NAME_UNION is attached to the tail of the array of this
         * struct.
         */
        u8 right;
        /* Equation operator. True if equals or overlaps, false otherwise. */
        bool equals;
};

/*
 * Structure for optional arguments.
 *
 * This is a variable-sized entry: the fixed part below is followed by the
 * arrays listed in the trailing comment, whose lengths are given by the
 * count members.
 */
struct tomoyo_condition {
        /* Shared-entry header (list linkage and user counter). */
        struct tomoyo_shared_acl_head head;
        u32 size; /* Memory size allocated for this entry. */
        u16 condc; /* Number of conditions in this struct. */
        u16 numbers_count; /* Number of "struct tomoyo_number_union" values. */
        u16 names_count; /* Number of "struct tomoyo_name_union" names. */
        u16 argc; /* Number of "struct tomoyo_argv". */
        u16 envc; /* Number of "struct tomoyo_envp". */
        u8 grant_log; /* One of values in "enum tomoyo_grant_log". */
        const struct tomoyo_path_info *transit; /* May be NULL. */
        /*
         * struct tomoyo_condition_element condition[condc];
         * struct tomoyo_number_union values[numbers_count];
         * struct tomoyo_name_union names[names_count];
         * struct tomoyo_argv argv[argc];
         * struct tomoyo_envp envp[envc];
         */
};

/*
 * Common header for individual entries.
 *
 * Embedded as the first member ("head") of the per-type ACL structures
 * (e.g. "struct tomoyo_path_acl"); @type tells which one it is.
 */
struct tomoyo_acl_info {
        /* List linkage of the entry. */
        struct list_head list;
        struct tomoyo_condition *cond; /* May be NULL. */
        s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */
        u8 type; /* One of values in "enum tomoyo_acl_entry_type_index". */
} __packed;

/* Structure for domain information. */
struct tomoyo_domain_info {
        /* List linkage of this domain. */
        struct list_head list;
        /* Head of the list of ACL entries of this domain. */
        struct list_head acl_info_list;
        /* Name of this domain. Never NULL.          */
        const struct tomoyo_path_info *domainname;
        /* Namespace for this domain. Never NULL. */
        struct tomoyo_policy_namespace *ns;
        /*
         * Group numbers to use. Sized as a bitmap with TOMOYO_MAX_ACL_GROUPS
         * bits, i.e. one bit per ACL group number.
         */
        unsigned long group[TOMOYO_MAX_ACL_GROUPS / BITS_PER_LONG];
        u8 profile;        /* Profile number to use. */
        bool is_deleted;   /* Delete flag.           */
        /* One slot per "enum tomoyo_domain_info_flags_index" value. */
        bool flags[TOMOYO_MAX_DOMAIN_INFO_FLAGS];
        atomic_t users; /* Number of referring tasks. */
};

/*
 * Structure for "task manual_domain_transition" directive.
 * Starts with the common "struct tomoyo_acl_info" header.
 */
struct tomoyo_task_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MANUAL_TASK_ACL */
        /* Pointer to domainname. */
        const struct tomoyo_path_info *domainname;
};

/*
 * Structure for "file execute", "file read", "file write", "file append",
 * "file unlink", "file getattr", "file rmdir", "file truncate",
 * "file symlink", "file chroot" and "file unmount" directive.
 *
 * @perm is u16 because "enum tomoyo_path_acl_index" has more than eight
 * operations.
 */
struct tomoyo_path_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_ACL */
        u16 perm; /* Bitmask of values in "enum tomoyo_path_acl_index". */
        /* Pathname (or path group) the directive applies to. */
        struct tomoyo_name_union name;
};

/*
 * Structure for "file create", "file mkdir", "file mkfifo", "file mksock",
 * "file ioctl", "file chmod", "file chown" and "file chgrp" directive.
 */
struct tomoyo_path_number_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_NUMBER_ACL */
        /* Bitmask of values in "enum tomoyo_path_number_acl_index". */
        u8 perm;
        /* Pathname (or path group) the directive applies to. */
        struct tomoyo_name_union name;
        /* Number (or number group) the directive applies to. */
        struct tomoyo_number_union number;
};

/* Structure for "file mkblock" and "file mkchar" directive. */
struct tomoyo_mkdev_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MKDEV_ACL */
        u8 perm; /* Bitmask of values in "enum tomoyo_mkdev_acl_index". */
        /* Pathname (or path group) of the device node. */
        struct tomoyo_name_union name;
        /* Mode, major number and minor number the directive applies to. */
        struct tomoyo_number_union mode;
        struct tomoyo_number_union major;
        struct tomoyo_number_union minor;
};

/*
 * Structure for "file rename", "file link" and "file pivot_root" directive.
 */
struct tomoyo_path2_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH2_ACL */
        u8 perm; /* Bitmask of values in "enum tomoyo_path2_acl_index". */
        /* First and second pathname (or path group) of the directive. */
        struct tomoyo_name_union name1;
        struct tomoyo_name_union name2;
};

/* Structure for "file mount" directive. */
struct tomoyo_mount_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MOUNT_ACL */
        /* Device name, mount point directory and filesystem type. */
        struct tomoyo_name_union dev_name;
        struct tomoyo_name_union dir_name;
        struct tomoyo_name_union fs_type;
        /* Mount flags the directive applies to. */
        struct tomoyo_number_union flags;
};

/*
 * Structure for "misc env" directive in domain policy.
 * Starts with the common "struct tomoyo_acl_info" header.
 */
struct tomoyo_env_acl {
        struct tomoyo_acl_info head;        /* type = TOMOYO_TYPE_ENV_ACL  */
        const struct tomoyo_path_info *env; /* environment variable */
};

/* Structure for "network inet" directive. */
struct tomoyo_inet_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_INET_ACL */
        /* NOTE(review): presumably a value smaller than TOMOYO_SOCK_MAX. */
        u8 protocol;
        u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */
        /* Address (or address group) the directive applies to. */
        struct tomoyo_ipaddr_union address;
        /* Port number (or number group) the directive applies to. */
        struct tomoyo_number_union port;
};

/* Structure for "network unix" directive. */
struct tomoyo_unix_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_UNIX_ACL */
        /* NOTE(review): presumably a value smaller than TOMOYO_SOCK_MAX. */
        u8 protocol;
        u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */
        /* Socket address name (or path group) the directive applies to. */
        struct tomoyo_name_union name;
};

/*
 * Structure for holding a line from /sys/kernel/security/tomoyo/ interface.
 * Passed to the tomoyo_parse_*_union() helpers and tomoyo_read_token().
 */
struct tomoyo_acl_param {
        /* The line being processed. */
        char *data;
        /* NOTE(review): presumably the list the entry is added to/removed from. */
        struct list_head *list;
        /* Policy namespace the line belongs to. */
        struct tomoyo_policy_namespace *ns;
        /* NOTE(review): presumably true when the line requests deletion. */
        bool is_delete;
};

/* Number of slots in the @r.w[] string queue of "struct tomoyo_io_buffer". */
#define TOMOYO_MAX_IO_READ_QUEUE 64

/*
 * Structure for reading/writing policy via /sys/kernel/security/tomoyo
 * interfaces.
 */
struct tomoyo_io_buffer {
        /* Handlers of this interface; each receives this structure. */
        void (*read)(struct tomoyo_io_buffer *head);
        int (*write)(struct tomoyo_io_buffer *head);
        __poll_t (*poll)(struct file *file, poll_table *wait);
        /* Exclusive lock for this structure.   */
        struct mutex io_sem;
        /* Userspace destination buffer and the space left in it. */
        char __user *read_user_buf;
        size_t read_user_buf_avail;
        /*
         * Read-side state.
         * NOTE(review): the list pointers and index/step members look like
         * cursors that let a read resume where the previous one stopped -
         * confirm against the read handlers.
         */
        struct {
                struct list_head *ns;
                struct list_head *domain;
                struct list_head *group;
                struct list_head *acl;
                size_t avail;
                unsigned int step;
                unsigned int query_index;
                u16 index;
                u16 cond_index;
                u8 acl_group_index;
                u8 cond_step;
                u8 bit;
                u8 w_pos;
                bool eof;
                bool print_this_domain_only;
                bool print_transition_related_only;
                bool print_cond_part;
                /* Queue of strings, TOMOYO_MAX_IO_READ_QUEUE slots. */
                const char *w[TOMOYO_MAX_IO_READ_QUEUE];
        } r;
        /* Write-side state. */
        struct {
                struct tomoyo_policy_namespace *ns;
                /* The position currently writing to.   */
                struct tomoyo_domain_info *domain;
                /* Bytes available for writing.         */
                size_t avail;
                bool is_delete;
        } w;
        /* Buffer for reading.                  */
        char *read_buf;
        /* Size of read buffer.                 */
        size_t readbuf_size;
        /* Buffer for writing.                  */
        char *write_buf;
        /* Size of write buffer.                */
        size_t writebuf_size;
        /* Type of this interface.              */
        enum tomoyo_securityfs_interface_index type;
        /* Users counter protected by tomoyo_io_buffer_list_lock. */
        u8 users;
        /* List for telling GC not to kfree() elements. */
        struct list_head list;
};

/*
 * Structure for "initialize_domain"/"no_initialize_domain"/"keep_domain"/
 * "no_keep_domain" keyword.
 */
struct tomoyo_transition_control {
        /* Common ACL header (list linkage and deletion state). */
        struct tomoyo_acl_head head;
        u8 type; /* One of values in "enum tomoyo_transition_type".  */
        /* True if the domainname is tomoyo_get_last_name(). */
        bool is_last_name;
        const struct tomoyo_path_info *domainname; /* May be NULL */
        const struct tomoyo_path_info *program;    /* May be NULL */
};

/*
 * Structure for "aggregator" keyword: pairs an original name with the
 * aggregated name.
 */
struct tomoyo_aggregator {
        /* Common ACL header (list linkage and deletion state). */
        struct tomoyo_acl_head head;
        const struct tomoyo_path_info *original_name;
        const struct tomoyo_path_info *aggregated_name;
};

/* Structure for policy manager. */
struct tomoyo_manager {
        /* Common ACL header (list linkage and deletion state). */
        struct tomoyo_acl_head head;
        /* A path to program or a domainname. */
        const struct tomoyo_path_info *manager;
};

/*
 * Preference values; "struct tomoyo_profile" embeds one instance and holds
 * per-mode pointers to this type.
 */
struct tomoyo_preference {
        /* NOTE(review): presumably the entry limit for learning mode. */
        unsigned int learning_max_entry;
        /* One verbosity switch per operation mode. */
        bool enforcing_verbose;
        bool learning_verbose;
        bool permissive_verbose;
};

/* Structure for /sys/kernel/security/tomoyo/profile interface. */
struct tomoyo_profile {
        const struct tomoyo_path_info *comment;
        /* Per-mode preference pointers. */
        struct tomoyo_preference *learning;
        struct tomoyo_preference *permissive;
        struct tomoyo_preference *enforcing;
        /* Embedded preference storage. */
        struct tomoyo_preference preference;
        /* NOTE(review): presumably holds "enum tomoyo_mode_index" values. */
        u8 default_config;
        /*
         * One slot per "enum tomoyo_mac_index" entry followed by one slot per
         * "enum tomoyo_mac_category_index" entry (see the array size).
         */
        u8 config[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX];
        /* One slot per "enum tomoyo_pref_index" entry. */
        unsigned int pref[TOMOYO_MAX_PREF];
};

/*
 * Structure for representing YYYY/MM/DD hh/mm/ss.
 * The year is 16 bits wide; every other member is 8 bits.
 */
struct tomoyo_time {
        u16 year;
        u8 month;
        u8 day;
        u8 hour;
        u8 min;
        u8 sec;
};

/* Structure for policy namespace. */
struct tomoyo_policy_namespace {
        /* Profile table. Memory is allocated as needed. */
        struct tomoyo_profile *profile_ptr[TOMOYO_MAX_PROFILES];
        /* List of "struct tomoyo_group", one per "enum tomoyo_group_id". */
        struct list_head group_list[TOMOYO_MAX_GROUP];
        /* List of policy, one per "enum tomoyo_policy_id". */
        struct list_head policy_list[TOMOYO_MAX_POLICY];
        /* The global ACL referred by "use_group" keyword. */
        struct list_head acl_group[TOMOYO_MAX_ACL_GROUPS];
        /* List for connecting to tomoyo_namespace_list list. */
        struct list_head namespace_list;
        /* Profile version. Currently only 20150505 is defined. */
        unsigned int profile_version;
        /* Name of this namespace (e.g. "<kernel>", "</usr/sbin/httpd>" ). */
        const char *name;
};

/* Structure for "struct task_struct"->security. */
struct tomoyo_task {
        struct tomoyo_domain_info *domain_info;
        struct tomoyo_domain_info *old_domain_info;
};

/********** Function prototypes. **********/

/*
 * The declarations below are grouped by return type. The definitions are
 * not part of this header.
 */

/* Functions returning bool. */
bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address,
                                  const struct tomoyo_group *group);
bool tomoyo_compare_number_union(const unsigned long value,
                                 const struct tomoyo_number_union *ptr);
bool tomoyo_condition(struct tomoyo_request_info *r,
                      const struct tomoyo_condition *cond);
bool tomoyo_correct_domain(const unsigned char *domainname);
bool tomoyo_correct_path(const char *filename);
bool tomoyo_correct_word(const char *string);
bool tomoyo_domain_def(const unsigned char *buffer);
bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r);
bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
                      struct tomoyo_page_dump *dump);
bool tomoyo_memory_ok(void *ptr);
bool tomoyo_number_matches_group(const unsigned long min,
                                 const unsigned long max,
                                 const struct tomoyo_group *group);
bool tomoyo_parse_ipaddr_union(struct tomoyo_acl_param *param,
                               struct tomoyo_ipaddr_union *ptr);
bool tomoyo_parse_name_union(struct tomoyo_acl_param *param,
                             struct tomoyo_name_union *ptr);
bool tomoyo_parse_number_union(struct tomoyo_acl_param *param,
                               struct tomoyo_number_union *ptr);
bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename,
                                 const struct tomoyo_path_info *pattern);
bool tomoyo_permstr(const char *string, const char *keyword);
bool tomoyo_str_starts(char **src, const char *find);

/* Functions returning a string pointer. */
char *tomoyo_encode(const char *str);
char *tomoyo_encode2(const char *str, int str_len);
/*
 * __printf(3, 0): the third argument (@fmt) is a printf-style format string;
 * the 0 means there is no variadic argument list to check, because the
 * arguments arrive as a va_list.
 */
char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
                      va_list args) __printf(3, 0);
char *tomoyo_read_token(struct tomoyo_acl_param *param);
char *tomoyo_realpath_from_path(const struct path *path);
char *tomoyo_realpath_nofollow(const char *pathname);
const char *tomoyo_get_exe(void);

/* Functions returning a pointer to "struct tomoyo_path_info". */
const struct tomoyo_path_info *tomoyo_compare_name_union
(const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr);
const struct tomoyo_path_info *tomoyo_get_domainname
(struct tomoyo_acl_param *param);
const struct tomoyo_path_info *tomoyo_get_name(const char *name);
const struct tomoyo_path_info *tomoyo_path_matches_group
(const struct tomoyo_path_info *pathname, const struct tomoyo_group *group);

/*
 * Functions returning int, the void tomoyo_close_control(), and the
 * __poll_t poll handlers.
 */
int tomoyo_check_open_permission(struct tomoyo_domain_info *domain,
                                 const struct path *path, const int flag);
void tomoyo_close_control(struct tomoyo_io_buffer *head);
int tomoyo_env_perm(struct tomoyo_request_info *r, const char *env);
int tomoyo_execute_permission(struct tomoyo_request_info *r,
                              const struct tomoyo_path_info *filename);
int tomoyo_find_next_domain(struct linux_binprm *bprm);
int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile,
                    const u8 index);
int tomoyo_init_request_info(struct tomoyo_request_info *r,
                             struct tomoyo_domain_info *domain,
                             const u8 index);
int tomoyo_mkdev_perm(const u8 operation, const struct path *path,
                      const unsigned int mode, unsigned int dev);
int tomoyo_mount_permission(const char *dev_name, const struct path *path,
                            const char *type, unsigned long flags,
                            void *data_page);
int tomoyo_open_control(const u8 type, struct file *file);
int tomoyo_path2_perm(const u8 operation, const struct path *path1,
                      const struct path *path2);
int tomoyo_path_number_perm(const u8 operation, const struct path *path,
                            unsigned long number);
int tomoyo_path_perm(const u8 operation, const struct path *path,
                     const char *target);
__poll_t tomoyo_poll_control(struct file *file, poll_table *wait);
__poll_t tomoyo_poll_log(struct file *file, poll_table *wait);
int tomoyo_socket_bind_permission(struct socket *sock, struct sockaddr *addr,
                                  int addr_len);
int tomoyo_socket_connect_permission(struct socket *sock,
                                     struct sockaddr *addr, int addr_len);
int tomoyo_socket_listen_permission(struct socket *sock);
int tomoyo_socket_sendmsg_permission(struct socket *sock, struct msghdr *msg,
                                     int size);
int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...)
        __printf(2, 3);
int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size,
                         struct tomoyo_acl_param *param,
                         bool (*check_duplicate)
                         (const struct tomoyo_acl_info *,
                          const struct tomoyo_acl_info *),
                         bool (*merge_duplicate)
                         (struct tomoyo_acl_info *, struct tomoyo_acl_info *,
                          const bool));
int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size,
                         struct tomoyo_acl_param *param,
                         bool (*check_duplicate)
                         (const struct tomoyo_acl_head *,
                          const struct tomoyo_acl_head *));
int tomoyo_write_aggregator(struct tomoyo_acl_param *param);
int tomoyo_write_file(struct tomoyo_acl_param *param);
int tomoyo_write_group(struct tomoyo_acl_param *param, const u8 type);
int tomoyo_write_misc(struct tomoyo_acl_param *param);
int tomoyo_write_inet_network(struct tomoyo_acl_param *param);
int tomoyo_write_transition_control(struct tomoyo_acl_param *param,
                                    const u8 type);
int tomoyo_write_unix_network(struct tomoyo_acl_param *param);
ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer,
                            const int buffer_len);
ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
                             const char __user *buffer, const int buffer_len);
struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param);
struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
                                                const bool transit);
struct tomoyo_domain_info *tomoyo_domain(void);
struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname);
struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param,
                                      const u8 idx);
struct tomoyo_policy_namespace *tomoyo_assign_namespace
(const char *domainname);
struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns,
                                      const u8 profile);
u8 tomoyo_parse_ulong(unsigned long *result, char **str);
void *tomoyo_commit_ok(void *data, const unsigned int size);
void __init tomoyo_load_builtin_policy(void);
void __init tomoyo_mm_init(void);
void tomoyo_check_acl(struct tomoyo_request_info *r,
                      bool (*check_entry)(struct tomoyo_request_info *,
                                          const struct tomoyo_acl_info *));
void tomoyo_check_profile(void);
void tomoyo_convert_time(time64_t time, struct tomoyo_time *stamp);
void tomoyo_del_condition(struct list_head *element);
void tomoyo_fill_path_info(struct tomoyo_path_info *ptr);
void tomoyo_get_attributes(struct tomoyo_obj_info *obj);
void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns);
void tomoyo_load_policy(const char *filename);
void tomoyo_normalize_line(unsigned char *buffer);
void tomoyo_notify_gc(struct tomoyo_io_buffer *head, const bool is_register);
void tomoyo_print_ip(char *buf, const unsigned int size,
                     const struct tomoyo_ipaddr_union *ptr);
void tomoyo_print_ulong(char *buffer, const int buffer_len,
                        const unsigned long value, const u8 type);
void tomoyo_put_name_union(struct tomoyo_name_union *ptr);
void tomoyo_put_number_union(struct tomoyo_number_union *ptr);
void tomoyo_read_log(struct tomoyo_io_buffer *head);
void tomoyo_update_stat(const u8 index);
void tomoyo_warn_oom(const char *function);
void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...)
        __printf(2, 3);
void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt,
                       va_list args) __printf(3, 0);

/********** External variable definitions. **********/

extern bool tomoyo_policy_loaded;
extern int tomoyo_enabled;
extern const char * const tomoyo_condition_keyword
[TOMOYO_MAX_CONDITION_KEYWORD];
extern const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS];
extern const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX
                                              + TOMOYO_MAX_MAC_CATEGORY_INDEX];
extern const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE];
extern const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION];
extern const char * const tomoyo_proto_keyword[TOMOYO_SOCK_MAX];
extern const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION];
extern const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX];
extern const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION];
extern const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION];
extern const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION];
extern struct list_head tomoyo_condition_list;
extern struct list_head tomoyo_domain_list;
extern struct list_head tomoyo_name_list[TOMOYO_MAX_HASH];
extern struct list_head tomoyo_namespace_list;
extern struct mutex tomoyo_policy_lock;
extern struct srcu_struct tomoyo_ss;
extern struct tomoyo_domain_info tomoyo_kernel_domain;
extern struct tomoyo_policy_namespace tomoyo_kernel_namespace;
extern unsigned int tomoyo_memory_quota[TOMOYO_MAX_MEMORY_STAT];
extern unsigned int tomoyo_memory_used[TOMOYO_MAX_MEMORY_STAT];
extern struct lsm_blob_sizes tomoyo_blob_sizes;

/********** Inlined functions. **********/

/**
 * tomoyo_read_lock - Enter the SRCU read-side section protecting policy.
 *
 * Calls srcu_read_lock() on tomoyo_ss.
 *
 * Returns the index number that must later be passed to
 * tomoyo_read_unlock().
 */
static inline int tomoyo_read_lock(void)
{
        const int idx = srcu_read_lock(&tomoyo_ss);

        return idx;
}

/**
 * tomoyo_read_unlock - Leave the SRCU read-side section protecting policy.
 *
 * @idx: Index number returned by the matching tomoyo_read_lock() call.
 *
 * Calls srcu_read_unlock() on tomoyo_ss with @idx.
 *
 * Returns nothing.
 */
static inline void tomoyo_read_unlock(int idx)
{
        struct srcu_struct *const ss = &tomoyo_ss;

        srcu_read_unlock(ss, idx);
}

/**
 * tomoyo_sys_getppid - Copy of getppid().
 *
 * Looks up current->real_parent inside an RCU read-side critical section
 * and converts it with task_tgid_vnr().
 *
 * Returns parent process's PID.
 *
 * Alpha does not have getppid() defined. To be able to build this module on
 * Alpha, getppid() was copied here from kernel/timer.c.
 */
static inline pid_t tomoyo_sys_getppid(void)
{
        struct task_struct *parent;
        pid_t ppid;

        rcu_read_lock();
        /* The parent pointer is only dereferenced while RCU is held. */
        parent = rcu_dereference(current->real_parent);
        ppid = task_tgid_vnr(parent);
        rcu_read_unlock();

        return ppid;
}

/**
 * tomoyo_sys_getpid - Copy of getpid().
 *
 * Returns the value of task_tgid_vnr() for the current task.
 *
 * Alpha does not have getpid() defined. To be able to build this module on
 * Alpha, getpid() was copied here from kernel/timer.c.
 */
static inline pid_t tomoyo_sys_getpid(void)
{
        struct task_struct *self = current;

        return task_tgid_vnr(self);
}

/**
 * tomoyo_pathcmp - strcmp() for "struct tomoyo_path_info" structure.
 *
 * @a: Pointer to "struct tomoyo_path_info".
 * @b: Pointer to "struct tomoyo_path_info".
 *
 * Like strcmp(), a zero (false) result means "equal": the hashes are
 * compared first and the names are only compared when the hashes match.
 *
 * Returns true if @a != @b, false if @a and @b have the same hash and name.
 */
static inline bool tomoyo_pathcmp(const struct tomoyo_path_info *a,
                                  const struct tomoyo_path_info *b)
{
        /* Differing hashes settle the question without touching the names. */
        if (a->hash != b->hash)
                return true;
        return strcmp(a->name, b->name) != 0;
}

/**
 * tomoyo_put_name - Drop reference on "struct tomoyo_name".
 *
 * @name: Pointer to "struct tomoyo_path_info". Maybe NULL.
 *
 * Returns nothing.
 */
static inline void tomoyo_put_name(const struct tomoyo_path_info *name)
{
        if (name) {
                struct tomoyo_name *ptr =
                        container_of(name, typeof(*ptr), entry);
                atomic_dec(&ptr->head.users);
        }
}

/**
 * tomoyo_put_condition - Drop reference on "struct tomoyo_condition".
 *
 * @cond: Pointer to "struct tomoyo_condition". Maybe NULL.
 *
 * Decrements @cond->head.users; a NULL @cond is ignored.
 *
 * Returns nothing.
 */
static inline void tomoyo_put_condition(struct tomoyo_condition *cond)
{
        if (!cond)
                return;

        atomic_dec(&cond->head.users);
}

/**
 * tomoyo_put_group - Drop reference on "struct tomoyo_group".
 *
 * @group: Pointer to "struct tomoyo_group". Maybe NULL.
 *
 * Decrements @group->head.users; a NULL @group is ignored.
 *
 * Returns nothing.
 */
static inline void tomoyo_put_group(struct tomoyo_group *group)
{
        if (!group)
                return;

        atomic_dec(&group->head.users);
}

/**
 * tomoyo_task - Get "struct tomoyo_task" for specified thread.
 *
 * @task: Pointer to "struct task_struct".
 *
 * The result is computed by adding tomoyo_blob_sizes.lbs_task to
 * @task->security.
 * NOTE(review): presumably lbs_task is the byte offset of TOMOYO's slice
 * inside a security blob shared with other LSMs - confirm against the LSM
 * blob allocation code.
 *
 * Returns pointer to "struct tomoyo_task" for specified thread.
 */
static inline struct tomoyo_task *tomoyo_task(struct task_struct *task)
{
        return task->security + tomoyo_blob_sizes.lbs_task;
}

/**
 * tomoyo_same_name_union - Check for duplicated "struct tomoyo_name_union" entry.
 *
 * @a: Pointer to "struct tomoyo_name_union".
 * @b: Pointer to "struct tomoyo_name_union".
 *
 * Two entries are the same when both their filename pointers and their
 * group pointers are identical (pointer comparison, not content).
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_name_union(const struct tomoyo_name_union *a,
                                          const struct tomoyo_name_union *b)
{
        if (a->filename != b->filename)
                return false;

        return a->group == b->group;
}

/**
 * tomoyo_same_number_union - Check for duplicated "struct tomoyo_number_union" entry.
 *
 * @a: Pointer to "struct tomoyo_number_union".
 * @b: Pointer to "struct tomoyo_number_union".
 *
 * Compares both values[] slots, the group pointer and both value_type[]
 * slots; every one of them has to match.
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_number_union(const struct tomoyo_number_union *a,
                                            const struct tomoyo_number_union *b)
{
        if (a->values[0] != b->values[0] || a->values[1] != b->values[1])
                return false;
        if (a->group != b->group)
                return false;
        if (a->value_type[0] != b->value_type[0])
                return false;

        return a->value_type[1] == b->value_type[1];
}

/**
 * tomoyo_same_ipaddr_union - Check for duplicated "struct tomoyo_ipaddr_union" entry.
 *
 * @a: Pointer to "struct tomoyo_ipaddr_union".
 * @b: Pointer to "struct tomoyo_ipaddr_union".
 *
 * The whole ip member is compared bytewise with memcmp(); the group
 * pointer and the is_ipv6 flag must match as well.
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_ipaddr_union(const struct tomoyo_ipaddr_union *a,
                                            const struct tomoyo_ipaddr_union *b)
{
        if (memcmp(a->ip, b->ip, sizeof(a->ip)) != 0)
                return false;
        if (a->group != b->group)
                return false;

        return a->is_ipv6 == b->is_ipv6;
}

/**
 * tomoyo_current_namespace - Get "struct tomoyo_policy_namespace" for current thread.
 *
 * Returns pointer to "struct tomoyo_policy_namespace" for current thread.
 */
static inline struct tomoyo_policy_namespace *tomoyo_current_namespace(void)
{
        return tomoyo_domain()->ns;
}

/**
 * list_for_each_cookie - iterate over a list with cookie.
 * @pos:        the &struct list_head to use as a loop cursor. Must be a
 *              modifiable lvalue; it is evaluated several times. When it is
 *              NULL on entry it is first set to (@head)->next, otherwise
 *              iteration resumes from its current value.
 * @head:       the head for your list.
 *
 * The next pointers are loaded with srcu_dereference() against tomoyo_ss.
 *
 * The expansion is an if statement followed by a for statement rather than
 * a single statement, so do not use this macro as the unbraced body of
 * another if/else or loop.
 */
#define list_for_each_cookie(pos, head)                                 \
        if (!(pos))                                                     \
                (pos) = srcu_dereference((head)->next, &tomoyo_ss);     \
        for ( ; (pos) != (head);                                        \
             (pos) = srcu_dereference((pos)->next, &tomoyo_ss))

#endif /* !defined(_SECURITY_TOMOYO_COMMON_H) */








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 


    2 








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
// SPDX-License-Identifier: GPL-2.0
/*
 * drivers/base/core.c - core driver model code (device registration, etc)
 *
 * Copyright (c) 2002-3 Patrick Mochel
 * Copyright (c) 2002-3 Open Source Development Labs
 * Copyright (c) 2006 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2006 Novell, Inc.
 */

#include <linux/acpi.h>
#include <linux/cpufreq.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/fwnode.h>
#include <linux/init.h>
#include <linux/kstrtox.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kdev_t.h>
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/blkdev.h>
#include <linux/mutex.h>
#include <linux/pm_runtime.h>
#include <linux/netdevice.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/string_helpers.h>
#include <linux/swiotlb.h>
#include <linux/sysfs.h>
#include <linux/dma-map-ops.h> /* for dma_default_coherent */

#include "base.h"
#include "physical_location.h"
#include "power/power.h"

/*
 * Device links support.
 *
 * File-scope state and forward declarations used by the fwnode link and
 * device link code below.
 */
static LIST_HEAD(deferred_sync);
static unsigned int defer_sync_state_count = 1;
/* Taken around fwnode link list updates, see fwnode_link_add(). */
static DEFINE_MUTEX(fwnode_link_lock);
static bool fw_devlink_is_permissive(void);
static void __fw_devlink_link_to_consumers(struct device *dev);
static bool fw_devlink_drv_reg_done;
static bool fw_devlink_best_effort;
/* Workqueue that devlink_dev_release() queues link release work on. */
static struct workqueue_struct *device_link_wq;

/**
 * __fwnode_link_add - Link a consumer fwnode_handle to its supplier.
 * @con: fwnode on the consumer side of the link.
 * @sup: fwnode on the supplier side of the link.
 * @flags: Flags to set on the link.
 *
 * Records that the firmware describes @sup as providing a resource to @con.
 *
 * Once the device objects corresponding to @con and @sup exist, the driver
 * core turns the fwnode link into a device link between them and then drops
 * the fwnode link automatically.
 *
 * Links are not reference counted.  If @con is already linked to @sup, no
 * second link is created; @flags are ORed into the existing link instead.
 *
 * Return: 0 on success or if the link already existed, -ENOMEM if a new link
 * could not be allocated.
 */
static int __fwnode_link_add(struct fwnode_handle *con,
                             struct fwnode_handle *sup, u8 flags)
{
        struct fwnode_link *new_link, *pos;

        /* An already existing link only gets the new flags merged in. */
        list_for_each_entry(pos, &sup->consumers, s_hook) {
                if (pos->consumer != con)
                        continue;

                pos->flags |= flags;
                return 0;
        }

        new_link = kzalloc(sizeof(*new_link), GFP_KERNEL);
        if (!new_link)
                return -ENOMEM;

        new_link->flags = flags;
        new_link->consumer = con;
        new_link->supplier = sup;
        INIT_LIST_HEAD(&new_link->c_hook);
        INIT_LIST_HEAD(&new_link->s_hook);

        /* Hook the link into the supplier's and the consumer's lists. */
        list_add(&new_link->s_hook, &sup->consumers);
        list_add(&new_link->c_hook, &con->suppliers);
        pr_debug("%pfwf Linked as a fwnode consumer to %pfwf\n",
                 con, sup);

        return 0;
}

/*
 * Locked variant of __fwnode_link_add(): creates (or updates) the fwnode link
 * from @con to @sup while holding fwnode_link_lock and hands back the helper's
 * return value unchanged.
 */
int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup,
                    u8 flags)
{
        int err;

        mutex_lock(&fwnode_link_lock);
        err = __fwnode_link_add(con, sup, flags);
        mutex_unlock(&fwnode_link_lock);

        return err;
}

/**
 * __fwnode_link_del - Remove and free a single fwnode link.
 * @link: the fwnode_link to get rid of
 *
 * Unhooks @link from the lists it sits on at both ends and frees it.
 *
 * Must be called with fwnode_link_lock held.
 */
static void __fwnode_link_del(struct fwnode_link *link)
{
        pr_debug("%pfwf Dropping the fwnode link to %pfwf\n",
                 link->consumer, link->supplier);

        /* Take the link off both lists before releasing its memory. */
        list_del(&link->c_hook);
        list_del(&link->s_hook);

        kfree(link);
}

/**
 * __fwnode_link_cycle - Flag a fwnode link as belonging to a dependency cycle.
 * @link: the fwnode_link to flag
 *
 * Sets FWLINK_FLAG_CYCLE on @link; all other flags are left as they are.
 *
 * Must be called with fwnode_link_lock held.
 */
static void __fwnode_link_cycle(struct fwnode_link *link)
{
        link->flags |= FWLINK_FLAG_CYCLE;

        pr_debug("%pfwf: cycle: depends on %pfwf\n",
                 link->consumer, link->supplier);
}

/**
 * fwnode_links_purge_suppliers - Delete all supplier links of fwnode_handle.
 * @fwnode: fwnode whose supplier links need to be deleted
 *
 * Deletes all supplier links connecting directly to @fwnode.
 */
static void fwnode_links_purge_suppliers(struct fwnode_handle *fwnode)
{
        struct fwnode_link *link, *tmp;

        mutex_lock(&fwnode_link_lock);
        list_for_each_entry_safe(link, tmp, &fwnode->suppliers, c_hook)
                __fwnode_link_del(link);
        mutex_unlock(&fwnode_link_lock);
}

/**
 * fwnode_links_purge_consumers - Delete all consumer links of fwnode_handle.
 * @fwnode: fwnode whose consumer links need to be deleted
 *
 * Deletes all consumer links connecting directly to @fwnode.
 */
static void fwnode_links_purge_consumers(struct fwnode_handle *fwnode)
{
        struct fwnode_link *link, *tmp;

        mutex_lock(&fwnode_link_lock);
        list_for_each_entry_safe(link, tmp, &fwnode->consumers, s_hook)
                __fwnode_link_del(link);
        mutex_unlock(&fwnode_link_lock);
}

/**
 * fwnode_links_purge - Delete all links connected to a fwnode_handle.
 * @fwnode: fwnode whose links need to be deleted
 *
 * Deletes all links connecting directly to a fwnode, first the links to its
 * suppliers and then the links from its consumers.  Each of the two steps
 * takes and releases fwnode_link_lock on its own.
 */
void fwnode_links_purge(struct fwnode_handle *fwnode)
{
        fwnode_links_purge_suppliers(fwnode);
        fwnode_links_purge_consumers(fwnode);
}

void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode)
{
        struct fwnode_handle *child;

        /* Don't purge consumer links of an added child */
        if (fwnode->dev)
                return;

        fwnode->flags |= FWNODE_FLAG_NOT_DEVICE;
        fwnode_links_purge_consumers(fwnode);

        fwnode_for_each_available_child_node(fwnode, child)
                fw_devlink_purge_absent_suppliers(child);
}
EXPORT_SYMBOL_GPL(fw_devlink_purge_absent_suppliers);

/**
 * __fwnode_links_move_consumers - Re-home the consumers of one fwnode_handle
 * @from: fwnode that currently supplies the consumers
 * @to: fwnode that is to supply them instead
 *
 * For every consumer link of @from, a link with the same consumer and flags
 * is added to @to and the original link is deleted.
 *
 * NOTE(review): the result of __fwnode_link_add() is not checked, so if the
 * allocation of the replacement link fails the original link is still
 * deleted.
 */
static void __fwnode_links_move_consumers(struct fwnode_handle *from,
                                          struct fwnode_handle *to)
{
        struct fwnode_link *pos, *next;

        /* Entries are freed while iterating, hence the _safe variant. */
        list_for_each_entry_safe(pos, next, &from->consumers, s_hook) {
                __fwnode_link_add(pos->consumer, to, pos->flags);
                __fwnode_link_del(pos);
        }
}

/**
 * __fw_devlink_pickup_dangling_consumers - Pick up dangling consumers
 * @fwnode: fwnode from which to pick up dangling consumers
 * @new_sup: fwnode of new supplier
 *
 * If the @fwnode has a corresponding struct device and the device supports
 * probing (that is, added to a bus), then we want to let fw_devlink create
 * MANAGED device links to this device, so leave @fwnode and its descendant's
 * fwnode links alone.
 *
 * Otherwise, move its consumers to the new supplier @new_sup.
 */
static void __fw_devlink_pickup_dangling_consumers(struct fwnode_handle *fwnode,
                                                   struct fwnode_handle *new_sup)
{
        struct fwnode_handle *child;

        if (fwnode->dev && fwnode->dev->bus)
                return;

        fwnode->flags |= FWNODE_FLAG_NOT_DEVICE;
        __fwnode_links_move_consumers(fwnode, new_sup);

        fwnode_for_each_available_child_node(fwnode, child)
                __fw_devlink_pickup_dangling_consumers(child, new_sup);
}

/* Mutex taken by device_links_write_lock()/device_links_write_unlock(). */
static DEFINE_MUTEX(device_links_lock);
/* SRCU domain entered by device_links_read_lock()/device_links_read_unlock(). */
DEFINE_STATIC_SRCU(device_links_srcu);

/* Acquire device_links_lock, the write-side mutex for device links. */
static inline void device_links_write_lock(void)
{
        mutex_lock(&device_links_lock);
}

/* Release device_links_lock taken by device_links_write_lock(). */
static inline void device_links_write_unlock(void)
{
        mutex_unlock(&device_links_lock);
}

/*
 * Enter an SRCU read-side critical section on device_links_srcu.
 *
 * Returns the index handed out by srcu_read_lock(); it has to be passed to
 * device_links_read_unlock() to leave the critical section again.
 */
int device_links_read_lock(void) __acquires(&device_links_srcu)
{
        return srcu_read_lock(&device_links_srcu);
}

/*
 * Leave the SRCU read-side critical section on device_links_srcu.
 * @idx is the value returned by the matching device_links_read_lock() call.
 */
void device_links_read_unlock(int idx) __releases(&device_links_srcu)
{
        srcu_read_unlock(&device_links_srcu, idx);
}

/*
 * Report the result of srcu_read_lock_held() for device_links_srcu, i.e.
 * whether the caller appears to be inside a device links read-side section.
 */
int device_links_read_lock_held(void)
{
        return srcu_read_lock_held(&device_links_srcu);
}

/*
 * Block until the SRCU grace period for device_links_srcu has elapsed, so
 * that readers which entered via device_links_read_lock() before this call
 * have left their critical sections.
 */
static void device_link_synchronize_removal(void)
{
        synchronize_srcu(&device_links_srcu);
}

/*
 * Unlink @link from the two lists it is chained on through its s_node and
 * c_node members.  The RCU-aware list deletion is used; the link itself is
 * not freed here.
 */
static void device_link_remove_from_lists(struct device_link *link)
{
        list_del_rcu(&link->s_node);
        list_del_rcu(&link->c_node);
}

/*
 * Walk up the parent chain of @target and report whether @dev is found on
 * it.  @target itself is not compared against @dev, only its ancestors are.
 */
static bool device_is_ancestor(struct device *dev, struct device *target)
{
        struct device *up;

        for (up = target->parent; up; up = up->parent) {
                if (up == dev)
                        return true;
        }

        return false;
}

/* Flags that are masked out before classifying a link as sync-state-only. */
#define DL_MARKER_FLAGS (DL_FLAG_INFERRED | DL_FLAG_CYCLE | DL_FLAG_MANAGED)

/*
 * Return true if, once the marker flags are ignored, DL_FLAG_SYNC_STATE_ONLY
 * is the one and only flag set in @flags.
 */
static inline bool device_link_flag_is_sync_state_only(u32 flags)
{
        u32 significant = flags & ~DL_MARKER_FLAGS;

        return significant == DL_FLAG_SYNC_STATE_ONLY;
}

/**
 * device_is_dependent - Check if one device depends on another one
 * @dev: Device whose dependents are searched.
 * @target: Device to look for among them.
 *
 * Determine whether @target is @dev itself or depends on it, directly or
 * through any device that depends on @dev (a child, a consumer, and so on,
 * recursively).  Consumer links that are sync-state-only are not followed.
 *
 * Return: non-zero if @target depends on @dev, 0 otherwise.
 */
static int device_is_dependent(struct device *dev, void *target)
{
        struct device_link *link;
        int ret;

        if (dev == target)
                return 1;

        /*
         * The target may not be fully initialized yet and so may still be
         * absent from its parent's list of children; checking its ancestors
         * explicitly covers that case.
         */
        if (device_is_ancestor(dev, target))
                return 1;

        ret = device_for_each_child(dev, target, device_is_dependent);
        if (ret)
                return ret;

        list_for_each_entry(link, &dev->links.consumers, s_node) {
                if (device_link_flag_is_sync_state_only(link->flags))
                        continue;

                if (link->consumer == target)
                        return 1;

                ret = device_is_dependent(link->consumer, target);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * Set the initial status of @link from the current driver-binding states of
 * its @supplier and @consumer:
 *
 *   supplier probing, consumer probing    -> DL_STATE_CONSUMER_PROBE
 *   supplier probing, consumer otherwise  -> DL_STATE_DORMANT
 *   supplier bound,   consumer probing    -> DL_STATE_CONSUMER_PROBE
 *   supplier bound,   consumer bound      -> DL_STATE_ACTIVE
 *   supplier bound,   consumer otherwise  -> DL_STATE_AVAILABLE
 *   supplier unbinding                    -> DL_STATE_SUPPLIER_UNBIND
 *   any other supplier state              -> DL_STATE_DORMANT
 */
static void device_link_init_status(struct device_link *link,
                                    struct device *consumer,
                                    struct device *supplier)
{
        switch (supplier->links.status) {
        case DL_DEV_PROBING:
                /*
                 * A consumer driver may link to a supplier that is still
                 * probing, provided it knows the supplier is functional
                 * already (for instance because it has just obtained some
                 * resources from it).
                 */
                if (consumer->links.status == DL_DEV_PROBING)
                        link->status = DL_STATE_CONSUMER_PROBE;
                else
                        link->status = DL_STATE_DORMANT;
                return;
        case DL_DEV_DRIVER_BOUND:
                if (consumer->links.status == DL_DEV_PROBING)
                        link->status = DL_STATE_CONSUMER_PROBE;
                else if (consumer->links.status == DL_DEV_DRIVER_BOUND)
                        link->status = DL_STATE_ACTIVE;
                else
                        link->status = DL_STATE_AVAILABLE;
                return;
        case DL_DEV_UNBINDING:
                link->status = DL_STATE_SUPPLIER_UNBIND;
                return;
        default:
                link->status = DL_STATE_DORMANT;
                return;
        }
}

/*
 * Move @dev to the tail of the device lists and then do the same, recursively,
 * for its children and for the consumers linked to it (sync-state-only links
 * are skipped).
 *
 * @not_used is ignored; it exists so that the function can be passed to
 * device_for_each_child() as the per-child callback.
 *
 * Locking is left to the caller, see device_pm_move_to_tail().
 *
 * Always returns 0, so the child iteration is never cut short.
 */
static int device_reorder_to_tail(struct device *dev, void *not_used)
{
        struct device_link *link;

        /*
         * Devices that have not been registered yet will be put to the ends
         * of the lists during the registration, so skip them here.
         */
        if (device_is_registered(dev))
                devices_kset_move_last(dev);

        if (device_pm_initialized(dev))
                device_pm_move_last(dev);

        /* Children first, then every consumer reachable via a device link. */
        device_for_each_child(dev, NULL, device_reorder_to_tail);
        list_for_each_entry(link, &dev->links.consumers, s_node) {
                if (device_link_flag_is_sync_state_only(link->flags))
                        continue;
                device_reorder_to_tail(link->consumer, NULL);
        }

        return 0;
}

/**
 * device_pm_move_to_tail - Move a device and its dependents to the list tails
 * @dev: Device to move
 *
 * Locked front end for device_reorder_to_tail(): the device links read lock
 * is taken first and the device PM lock inside it; both are released in the
 * reverse order afterwards.
 *
 * @dev, all of its children and all of its consumers are moved, recursively,
 * to the ends of the device_kset and dpm_list.
 */
void device_pm_move_to_tail(struct device *dev)
{
        int srcu_idx = device_links_read_lock();

        device_pm_lock();
        device_reorder_to_tail(dev, NULL);
        device_pm_unlock();

        device_links_read_unlock(srcu_idx);
}

/* Map a struct device embedded as link_dev back to its struct device_link. */
#define to_devlink(dev)        container_of((dev), struct device_link, link_dev)

static ssize_t status_show(struct device *dev,
                           struct device_attribute *attr, char *buf)
{
        const char *output;

        switch (to_devlink(dev)->status) {
        case DL_STATE_NONE:
                output = "not tracked";
                break;
        case DL_STATE_DORMANT:
                output = "dormant";
                break;
        case DL_STATE_AVAILABLE:
                output = "available";
                break;
        case DL_STATE_CONSUMER_PROBE:
                output = "consumer probing";
                break;
        case DL_STATE_ACTIVE:
                output = "active";
                break;
        case DL_STATE_SUPPLIER_UNBIND:
                output = "supplier unbinding";
                break;
        default:
                output = "unknown";
                break;
        }

        return sysfs_emit(buf, "%s\n", output);
}
static DEVICE_ATTR_RO(status);

/*
 * sysfs "auto_remove_on" attribute: report which unbind event, if any, the
 * link's flags request automatic removal on.  DL_FLAG_AUTOREMOVE_SUPPLIER
 * takes precedence over DL_FLAG_AUTOREMOVE_CONSUMER when both are set.
 */
static ssize_t auto_remove_on_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
{
        struct device_link *link = to_devlink(dev);
        const char *when = "never";

        if (link->flags & DL_FLAG_AUTOREMOVE_SUPPLIER)
                when = "supplier unbind";
        else if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER)
                when = "consumer unbind";

        return sysfs_emit(buf, "%s\n", when);
}
static DEVICE_ATTR_RO(auto_remove_on);

static ssize_t runtime_pm_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct device_link *link = to_devlink(dev);

        return sysfs_emit(buf, "%d\n", !!(link->flags & DL_FLAG_PM_RUNTIME));
}
static DEVICE_ATTR_RO(runtime_pm);

static ssize_t sync_state_only_show(struct device *dev,
                                    struct device_attribute *attr, char *buf)
{
        struct device_link *link = to_devlink(dev);

        return sysfs_emit(buf, "%d\n",
                          !!(link->flags & DL_FLAG_SYNC_STATE_ONLY));
}
static DEVICE_ATTR_RO(sync_state_only);

/*
 * The read-only sysfs attributes defined above, as a NULL-terminated array.
 * ATTRIBUTE_GROUPS() derives devlink_groups from it, which devlink_class
 * installs as its .dev_groups.
 */
static struct attribute *devlink_attrs[] = {
        &dev_attr_status.attr,
        &dev_attr_auto_remove_on.attr,
        &dev_attr_runtime_pm.attr,
        &dev_attr_sync_state_only.attr,
        NULL,
};
ATTRIBUTE_GROUPS(devlink);

/*
 * Work handler doing the final teardown of a device link: waits for SRCU
 * readers, settles the supplier's runtime PM accounting, drops the references
 * held on both devices and frees the link.  Queued by devlink_dev_release().
 */
static void device_link_release_fn(struct work_struct *work)
{
        struct device_link *link;
        struct device *sup;

        link = container_of(work, struct device_link, rm_work);
        sup = link->supplier;

        /* Ensure that all references to the link object have been dropped. */
        device_link_synchronize_removal();

        pm_runtime_release_supplier(link);

        /*
         * A set supplier_preactivated means the link was dropped between the
         * pm_runtime_get_suppliers() and pm_runtime_put_suppliers() calls in
         * __driver_probe_device().  The reference on the supplier's
         * PM-runtime usage counter taken by pm_runtime_get_suppliers() then
         * has to be dropped here.
         */
        if (link->supplier_preactivated)
                pm_runtime_put_noidle(sup);

        pm_request_idle(sup);

        put_device(link->consumer);
        put_device(sup);
        kfree(link);
}

/*
 * .dev_release callback of devlink_class.  The actual teardown of the device
 * link that embeds @dev is not done here but deferred to
 * device_link_release_fn() on device_link_wq.
 */
static void devlink_dev_release(struct device *dev)
{
        struct device_link *link = to_devlink(dev);

        INIT_WORK(&link->rm_work, device_link_release_fn);
        /*
         * It may take a while to complete this work because of the SRCU
         * synchronization in device_link_release_fn() and if the consumer or
         * supplier devices get deleted when it runs, so put it into the
         * dedicated workqueue.
         */
        queue_work(device_link_wq, &link->rm_work);
}

/**
 * device_link_wait_removal - Wait for ongoing devlink removal jobs to terminate
 *
 * Flushes device_link_wq, the workqueue that devlink_dev_release() queues
 * device link release work on.
 */
void device_link_wait_removal(void)
{
        /*
         * devlink removal jobs are queued in the dedicated work queue.
         * To be sure that all removal jobs are terminated, ensure that any
         * scheduled work has run to completion.
         */
        flush_workqueue(device_link_wq);
}
EXPORT_SYMBOL_GPL(device_link_wait_removal);

/*
 * The "devlink" class.  Its devices are exposed with the devlink_groups
 * attributes and released through devlink_dev_release(), which treats each
 * of them as the link_dev member of a struct device_link.
 */
static struct class devlink_class = {
        .name = "devlink",
        .dev_groups = devlink_groups,
        .dev_release = devlink_dev_release,
};

/*
 * Create the sysfs symlinks belonging to the device link behind @dev:
 *
 *   <link>/supplier                  -> supplier device
 *   <link>/consumer                  -> consumer device
 *   <supplier>/consumer:<bus>:<name> -> link
 *   <consumer>/supplier:<bus>:<name> -> link
 *
 * Symlinks created before a failure are removed again.
 *
 * Return: 0 on success, -ENOMEM if the name buffer cannot be allocated,
 * otherwise the error returned by sysfs_create_link().
 */
static int devlink_add_symlinks(struct device *dev)
{
        struct device_link *link = to_devlink(dev);
        struct device *sup = link->supplier;
        struct device *con = link->consumer;
        size_t sup_len, con_len, len;
        char *buf;
        int ret;

        /*
         * One buffer serves both "supplier:<bus>:<name>" and
         * "consumer:<bus>:<name>"; the two prefixes are equally long, so it
         * is sized for the longer bus + device name pair plus the prefix,
         * the ':' separator and the terminating NUL.
         */
        sup_len = strlen(dev_bus_name(sup)) + strlen(dev_name(sup));
        con_len = strlen(dev_bus_name(con)) + strlen(dev_name(con));
        len = max(sup_len, con_len);
        len += strlen(":");
        len += strlen("supplier:") + 1;

        buf = kzalloc(len, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        ret = sysfs_create_link(&link->link_dev.kobj, &sup->kobj, "supplier");
        if (ret)
                goto free_buf;

        ret = sysfs_create_link(&link->link_dev.kobj, &con->kobj, "consumer");
        if (ret)
                goto undo_supplier;

        snprintf(buf, len, "consumer:%s:%s", dev_bus_name(con), dev_name(con));
        ret = sysfs_create_link(&sup->kobj, &link->link_dev.kobj, buf);
        if (ret)
                goto undo_consumer;

        snprintf(buf, len, "supplier:%s:%s", dev_bus_name(sup), dev_name(sup));
        ret = sysfs_create_link(&con->kobj, &link->link_dev.kobj, buf);
        if (!ret)
                goto free_buf;

        /* buf holds the "supplier:" name by now, rebuild the "consumer:" one. */
        snprintf(buf, len, "consumer:%s:%s", dev_bus_name(con), dev_name(con));
        sysfs_remove_link(&sup->kobj, buf);
undo_consumer:
        sysfs_remove_link(&link->link_dev.kobj, "consumer");
undo_supplier:
        sysfs_remove_link(&link->link_dev.kobj, "supplier");
free_buf:
        kfree(buf);
        return ret;
}

/*
 * Remove the sysfs symlinks created by devlink_add_symlinks() for the device
 * link behind @dev.
 *
 * The two symlinks inside the link's own directory go first.  The named
 * symlinks in the supplier's and consumer's directories need a scratch buffer;
 * if that cannot be allocated a warning is issued and they are left behind.
 * The symlink in the consumer's directory is only removed while the consumer
 * is still registered.
 */
static void devlink_remove_symlinks(struct device *dev)
{
        struct device_link *link = to_devlink(dev);
        struct device *sup = link->supplier;
        struct device *con = link->consumer;
        size_t sup_len, con_len, len;
        char *buf;

        sysfs_remove_link(&link->link_dev.kobj, "consumer");
        sysfs_remove_link(&link->link_dev.kobj, "supplier");

        /* Same buffer sizing as in devlink_add_symlinks(). */
        sup_len = strlen(dev_bus_name(sup)) + strlen(dev_name(sup));
        con_len = strlen(dev_bus_name(con)) + strlen(dev_name(con));
        len = max(sup_len, con_len);
        len += strlen(":");
        len += strlen("supplier:") + 1;

        buf = kzalloc(len, GFP_KERNEL);
        if (!buf) {
                WARN(1, "Unable to properly free device link symlinks!\n");
                return;
        }

        if (device_is_registered(con)) {
                snprintf(buf, len, "supplier:%s:%s", dev_bus_name(sup), dev_name(sup));
                sysfs_remove_link(&con->kobj, buf);
        }

        snprintf(buf, len, "consumer:%s:%s", dev_bus_name(con), dev_name(con));
        sysfs_remove_link(&sup->kobj, buf);

        kfree(buf);
}

/*
 * Class interface for devlink_class: its add_dev/remove_dev callbacks create
 * and remove the sysfs symlinks of a device link.
 */
static struct class_interface devlink_class_intf = {
        .class = &devlink_class,
        .add_dev = devlink_add_symlinks,
        .remove_dev = devlink_remove_symlinks,
};

/*
 * Register devlink_class and then its class interface.  If the interface
 * cannot be registered, the class is unregistered again.
 *
 * Return: 0 on success or the error code of the registration that failed.
 */
static int __init devlink_class_init(void)
{
        int err;

        err = class_register(&devlink_class);
        if (err)
                return err;

        err = class_interface_register(&devlink_class_intf);
        if (!err)
                return 0;

        class_unregister(&devlink_class);
        return err;
}
postcore_initcall(devlink_class_init);

/*
 * Flags that device_link_add() refuses to accept together with
 * DL_FLAG_STATELESS.
 */
#define DL_MANAGED_LINK_FLAGS (DL_FLAG_AUTOREMOVE_CONSUMER | \
                               DL_FLAG_AUTOREMOVE_SUPPLIER | \
                               DL_FLAG_AUTOPROBE_CONSUMER  | \
                               DL_FLAG_SYNC_STATE_ONLY | \
                               DL_FLAG_INFERRED | \
                               DL_FLAG_CYCLE)

/*
 * Every flag bit a caller may pass to device_link_add(); any bit outside of
 * this mask makes device_link_add() return NULL.
 */
#define DL_ADD_VALID_FLAGS (DL_MANAGED_LINK_FLAGS | DL_FLAG_STATELESS | \
                            DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE)

/**
 * device_link_add - Create a link between two devices.
 * @consumer: Consumer end of the link.
 * @supplier: Supplier end of the link.
 * @flags: Link flags.
 *
 * The caller is responsible for the proper synchronization of the link creation
 * with runtime PM.  First, setting the DL_FLAG_PM_RUNTIME flag will cause the
 * runtime PM framework to take the link into account.  Second, if the
 * DL_FLAG_RPM_ACTIVE flag is set in addition to it, the supplier devices will
 * be forced into the active meta state and reference-counted upon the creation
 * of the link.  If DL_FLAG_PM_RUNTIME is not set, DL_FLAG_RPM_ACTIVE will be
 * ignored.
 *
 * If DL_FLAG_STATELESS is set in @flags, the caller of this function is
 * expected to release the link returned by it directly with the help of either
 * device_link_del() or device_link_remove().
 *
 * If that flag is not set, however, the caller of this function is handing the
 * management of the link over to the driver core entirely and its return value
 * can only be used to check whether or not the link is present.  In that case,
 * the DL_FLAG_AUTOREMOVE_CONSUMER and DL_FLAG_AUTOREMOVE_SUPPLIER device link
 * flags can be used to indicate to the driver core when the link can be safely
 * deleted.  Namely, setting one of them in @flags indicates to the driver core
 * that the link is not going to be used (by the given caller of this function)
 * after unbinding the consumer or supplier driver, respectively, from its
 * device, so the link can be deleted at that point.  If none of them is set,
 * the link will be maintained until one of the devices pointed to by it (either
 * the consumer or the supplier) is unregistered.
 *
 * Also, if DL_FLAG_STATELESS, DL_FLAG_AUTOREMOVE_CONSUMER and
 * DL_FLAG_AUTOREMOVE_SUPPLIER are not set in @flags (that is, a persistent
 * managed device link is being added), the DL_FLAG_AUTOPROBE_CONSUMER flag can
 * be used to request the driver core to automatically probe for a consumer
 * driver after successfully binding a driver to the supplier device.
 *
 * The combination of DL_FLAG_STATELESS and one of DL_FLAG_AUTOREMOVE_CONSUMER,
 * DL_FLAG_AUTOREMOVE_SUPPLIER, or DL_FLAG_AUTOPROBE_CONSUMER set in @flags at
 * the same time is invalid and will cause NULL to be returned upfront.
 * However, if a device link between the given @consumer and @supplier pair
 * exists already when this function is called for them, the existing link will
 * be returned regardless of its current type and status (the link's flags may
 * be modified then).  The caller of this function is then expected to treat
 * the link as though it has just been created, so (in particular) if
 * DL_FLAG_STATELESS was passed in @flags, the link needs to be released
 * explicitly when not needed any more (as stated above).
 *
 * A side effect of the link creation is re-ordering of dpm_list and the
 * devices_kset list by moving the consumer device and all devices depending
 * on it to the ends of these lists (that does not happen to devices that have
 * not been registered when this function is called).
 *
 * The supplier device is required to be registered when this function is called
 * and NULL will be returned if that is not the case.  The consumer device need
 * not be registered, however.
 *
 * All checks that depend on @flags alone are carried out before the supplier
 * is resumed for DL_FLAG_RPM_ACTIVE, so rejecting invalid @flags never leaves
 * a runtime PM usage count reference behind.
 *
 * Return: Pointer to the new or already existing link, or NULL on failure.
 */
struct device_link *device_link_add(struct device *consumer,
                                    struct device *supplier, u32 flags)
{
        struct device_link *link;

        if (!consumer || !supplier || consumer == supplier ||
            flags & ~DL_ADD_VALID_FLAGS ||
            (flags & DL_FLAG_STATELESS && flags & DL_MANAGED_LINK_FLAGS) ||
            (flags & DL_FLAG_AUTOPROBE_CONSUMER &&
             flags & (DL_FLAG_AUTOREMOVE_CONSUMER |
                      DL_FLAG_AUTOREMOVE_SUPPLIER)))
                return NULL;

        if (!(flags & DL_FLAG_STATELESS))
                flags |= DL_FLAG_MANAGED;

        /*
         * Finish validating @flags before touching runtime PM.  This bail-out
         * does not go through the "out" label, so taking the supplier's
         * usage count reference first would leak it here.
         */
        if (flags & DL_FLAG_SYNC_STATE_ONLY &&
            !device_link_flag_is_sync_state_only(flags))
                return NULL;

        /*
         * From here on, every failure path reaches "out", which drops the
         * reference taken below when no link is returned.
         */
        if (flags & DL_FLAG_PM_RUNTIME && flags & DL_FLAG_RPM_ACTIVE) {
                if (pm_runtime_get_sync(supplier) < 0) {
                        pm_runtime_put_noidle(supplier);
                        return NULL;
                }
        }

        device_links_write_lock();
        device_pm_lock();

        /*
         * If the supplier has not been fully registered yet or there is a
         * reverse (non-SYNC_STATE_ONLY) dependency between the consumer and
         * the supplier already in the graph, return NULL. If the link is a
         * SYNC_STATE_ONLY link, we don't check for reverse dependencies
         * because it only affects sync_state() callbacks.
         */
        if (!device_pm_initialized(supplier)
            || (!(flags & DL_FLAG_SYNC_STATE_ONLY) &&
                  device_is_dependent(consumer, supplier))) {
                link = NULL;
                goto out;
        }

        /*
         * SYNC_STATE_ONLY links are useless once a consumer device has probed.
         * So, only create it if the consumer hasn't probed yet.
         */
        if (flags & DL_FLAG_SYNC_STATE_ONLY &&
            consumer->links.status != DL_DEV_NO_DRIVER &&
            consumer->links.status != DL_DEV_PROBING) {
                link = NULL;
                goto out;
        }

        /*
         * DL_FLAG_AUTOREMOVE_SUPPLIER indicates that the link will be needed
         * longer than for DL_FLAG_AUTOREMOVE_CONSUMER and setting them both
         * together doesn't make sense, so prefer DL_FLAG_AUTOREMOVE_SUPPLIER.
         */
        if (flags & DL_FLAG_AUTOREMOVE_SUPPLIER)
                flags &= ~DL_FLAG_AUTOREMOVE_CONSUMER;

        /* Reuse (and possibly upgrade) an already existing link, if any. */
        list_for_each_entry(link, &supplier->links.consumers, s_node) {
                if (link->consumer != consumer)
                        continue;

                if (link->flags & DL_FLAG_INFERRED &&
                    !(flags & DL_FLAG_INFERRED))
                        link->flags &= ~DL_FLAG_INFERRED;

                if (flags & DL_FLAG_PM_RUNTIME) {
                        if (!(link->flags & DL_FLAG_PM_RUNTIME)) {
                                pm_runtime_new_link(consumer);
                                link->flags |= DL_FLAG_PM_RUNTIME;
                        }
                        if (flags & DL_FLAG_RPM_ACTIVE)
                                refcount_inc(&link->rpm_active);
                }

                if (flags & DL_FLAG_STATELESS) {
                        kref_get(&link->kref);
                        if (link->flags & DL_FLAG_SYNC_STATE_ONLY &&
                            !(link->flags & DL_FLAG_STATELESS)) {
                                link->flags |= DL_FLAG_STATELESS;
                                goto reorder;
                        } else {
                                link->flags |= DL_FLAG_STATELESS;
                                goto out;
                        }
                }

                /*
                 * If the life time of the link following from the new flags is
                 * longer than indicated by the flags of the existing link,
                 * update the existing link to stay around longer.
                 */
                if (flags & DL_FLAG_AUTOREMOVE_SUPPLIER) {
                        if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER) {
                                link->flags &= ~DL_FLAG_AUTOREMOVE_CONSUMER;
                                link->flags |= DL_FLAG_AUTOREMOVE_SUPPLIER;
                        }
                } else if (!(flags & DL_FLAG_AUTOREMOVE_CONSUMER)) {
                        link->flags &= ~(DL_FLAG_AUTOREMOVE_CONSUMER |
                                         DL_FLAG_AUTOREMOVE_SUPPLIER);
                }
                if (!(link->flags & DL_FLAG_MANAGED)) {
                        kref_get(&link->kref);
                        link->flags |= DL_FLAG_MANAGED;
                        device_link_init_status(link, consumer, supplier);
                }
                if (link->flags & DL_FLAG_SYNC_STATE_ONLY &&
                    !(flags & DL_FLAG_SYNC_STATE_ONLY)) {
                        link->flags &= ~DL_FLAG_SYNC_STATE_ONLY;
                        goto reorder;
                }

                goto out;
        }

        /* No link between this pair yet: allocate and set up a new one. */
        link = kzalloc(sizeof(*link), GFP_KERNEL);
        if (!link)
                goto out;

        refcount_set(&link->rpm_active, 1);

        get_device(supplier);
        link->supplier = supplier;
        INIT_LIST_HEAD(&link->s_node);
        get_device(consumer);
        link->consumer = consumer;
        INIT_LIST_HEAD(&link->c_node);
        link->flags = flags;
        kref_init(&link->kref);

        link->link_dev.class = &devlink_class;
        device_set_pm_not_required(&link->link_dev);
        dev_set_name(&link->link_dev, "%s:%s--%s:%s",
                     dev_bus_name(supplier), dev_name(supplier),
                     dev_bus_name(consumer), dev_name(consumer));
        if (device_register(&link->link_dev)) {
                put_device(&link->link_dev);
                link = NULL;
                goto out;
        }

        if (flags & DL_FLAG_PM_RUNTIME) {
                if (flags & DL_FLAG_RPM_ACTIVE)
                        refcount_inc(&link->rpm_active);

                pm_runtime_new_link(consumer);
        }

        /* Determine the initial link state. */
        if (flags & DL_FLAG_STATELESS)
                link->status = DL_STATE_NONE;
        else
                device_link_init_status(link, consumer, supplier);

        /*
         * Some callers expect the link creation during consumer driver probe to
         * resume the supplier even without DL_FLAG_RPM_ACTIVE.
         */
        if (link->status == DL_STATE_CONSUMER_PROBE &&
            flags & DL_FLAG_PM_RUNTIME)
                pm_runtime_resume(supplier);

        list_add_tail_rcu(&link->s_node, &supplier->links.consumers);
        list_add_tail_rcu(&link->c_node, &consumer->links.suppliers);

        if (flags & DL_FLAG_SYNC_STATE_ONLY) {
                dev_dbg(consumer,
                        "Linked as a sync state only consumer to %s\n",
                        dev_name(supplier));
                goto out;
        }

reorder:
        /*
         * Move the consumer and all of the devices depending on it to the end
         * of dpm_list and the devices_kset list.
         *
         * It is necessary to hold dpm_list locked throughout all that or else
         * we may end up suspending with a wrong ordering of it.
         */
        device_reorder_to_tail(consumer, NULL);

        dev_dbg(consumer, "Linked as a consumer to %s\n", dev_name(supplier));

out:
        device_pm_unlock();
        device_links_write_unlock();

        /* Undo the pm_runtime_get_sync() above if no link is handed back. */
        if ((flags & DL_FLAG_PM_RUNTIME && flags & DL_FLAG_RPM_ACTIVE) && !link)
                pm_runtime_put(supplier);

        return link;
}
EXPORT_SYMBOL_GPL(device_link_add);

/*
 * kref release callback for a device link.  Calls pm_runtime_drop_link() for
 * the link, takes it off the device lists and unregisters its class device.
 */
static void __device_link_del(struct kref *kref)
{
        struct device_link *dl;

        dl = container_of(kref, struct device_link, kref);

        dev_dbg(dl->consumer, "Dropping the link to %s\n",
                dev_name(dl->supplier));

        pm_runtime_drop_link(dl);
        device_link_remove_from_lists(dl);
        device_unregister(&dl->link_dev);
}

/*
 * Drop a caller-held reference to @link.
 *
 * A link carrying DL_FLAG_STATELESS gives up one kref.  A link without that
 * flag is deleted outright, but only when its consumer is no longer
 * registered; otherwise a warning is emitted and the link is left alone.
 */
static void device_link_put_kref(struct device_link *link)
{
        if (link->flags & DL_FLAG_STATELESS) {
                kref_put(&link->kref, __device_link_del);
                return;
        }

        if (device_is_registered(link->consumer)) {
                WARN(1, "Unable to drop a managed device link reference\n");
                return;
        }

        __device_link_del(&link->kref);
}

/**
 * device_link_del - Delete a stateless link between two devices.
 * @link: Device link to delete.
 *
 * The caller must ensure proper synchronization of this function with runtime
 * PM.  If the link was added multiple times, it needs to be deleted as often.
 * Care is required for hotplugged devices:  Their links are purged on removal
 * and calling device_link_del() is then no longer allowed.
 *
 * The reference is dropped through device_link_put_kref() with the device
 * links write lock held.
 */
void device_link_del(struct device_link *link)
{
        device_links_write_lock();
        device_link_put_kref(link);
        device_links_write_unlock();
}
EXPORT_SYMBOL_GPL(device_link_del);

/**
 * device_link_remove - Delete a stateless link between two devices.
 * @consumer: Consumer end of the link.
 * @supplier: Supplier end of the link.
 *
 * Looks up the first link on @supplier's list of consumers whose consumer is
 * @consumer and drops a reference to it.  Nothing happens if there is no such
 * link.  Passing the same device as both ends triggers a warning and is
 * otherwise ignored.
 *
 * The caller must ensure proper synchronization of this function with runtime
 * PM.
 */
void device_link_remove(void *consumer, struct device *supplier)
{
        struct device_link *link;

        if (WARN_ON(consumer == supplier))
                return;

        device_links_write_lock();

        list_for_each_entry(link, &supplier->links.consumers, s_node) {
                if (link->consumer != consumer)
                        continue;

                /* At most one link is dropped per call. */
                device_link_put_kref(link);
                break;
        }

        device_links_write_unlock();
}
EXPORT_SYMBOL_GPL(device_link_remove);

/*
 * Take the links from @dev to its suppliers back out of the "consumer probe"
 * state.  Such a link becomes "available" when its supplier has a driver
 * bound and "dormant" otherwise; the latter is only expected for
 * DL_FLAG_SYNC_STATE_ONLY links and triggers a warning for any other link.
 */
static void device_links_missing_supplier(struct device *dev)
{
        struct device_link *link;

        list_for_each_entry(link, &dev->links.suppliers, c_node) {
                if (link->status != DL_STATE_CONSUMER_PROBE)
                        continue;

                if (link->supplier->links.status != DL_DEV_DRIVER_BOUND) {
                        WARN_ON(!(link->flags & DL_FLAG_SYNC_STATE_ONLY));
                        WRITE_ONCE(link->status, DL_STATE_DORMANT);
                        continue;
                }

                WRITE_ONCE(link->status, DL_STATE_AVAILABLE);
        }
}

static bool dev_is_best_effort(struct device *dev)
{
        return (fw_devlink_best_effort && dev->can_match) ||
                (dev->fwnode && (dev->fwnode->flags & FWNODE_FLAG_BEST_EFFORT));
}

static struct fwnode_handle *fwnode_links_check_suppliers(
                                                struct fwnode_handle *fwnode)
{
        struct fwnode_link *link;

        if (!fwnode || fw_devlink_is_permissive())
                return NULL;

        list_for_each_entry(link, &fwnode->suppliers, c_hook)
                if (!(link->flags &
                      (FWLINK_FLAG_CYCLE | FWLINK_FLAG_IGNORE)))
                        return link->supplier;

        return NULL;
}

/**
 * device_links_check_suppliers - Check presence of supplier drivers.
 * @dev: Consumer device.
 *
 * Walk the links from @dev to its suppliers and check that all of them are
 * available.  If one of them is not, return -EPROBE_DEFER.
 *
 * The supplier must not go away once the check has passed.  It can only go
 * away in __device_release_driver(), which looks at the device's links to
 * consumers, so every usable link is marked as "consumer probe in progress"
 * here to make the supplier removal wait for the probe to complete.
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 *
 * Return: 0 if the probe may proceed, -EPROBE_DEFER if a supplier is missing,
 * or -EAGAIN if a supplier is missing but @dev is handled on a best effort
 * basis.
 */
int device_links_check_suppliers(struct device *dev)
{
        struct fwnode_handle *missing_fw;
        struct device_link *link;
        int fwnode_ret = 0;
        int ret = 0;

        /*
         * A device still waiting for a firmware-described supplier to show up
         * is not allowed to probe.
         */
        mutex_lock(&fwnode_link_lock);
        missing_fw = fwnode_links_check_suppliers(dev->fwnode);
        if (missing_fw) {
                if (dev_is_best_effort(dev)) {
                        fwnode_ret = -EAGAIN;
                } else {
                        fwnode_ret = -EPROBE_DEFER;
                        dev_err_probe(dev, -EPROBE_DEFER,
                                    "wait for supplier %pfwf\n", missing_fw);
                }
        }
        mutex_unlock(&fwnode_link_lock);

        if (fwnode_ret == -EPROBE_DEFER)
                return fwnode_ret;

        device_links_write_lock();

        list_for_each_entry(link, &dev->links.suppliers, c_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                /* Usable link: pin the supplier for the probe's duration. */
                if (link->status == DL_STATE_AVAILABLE ||
                    link->flags & DL_FLAG_SYNC_STATE_ONLY) {
                        WRITE_ONCE(link->status, DL_STATE_CONSUMER_PROBE);
                        continue;
                }

                /*
                 * Best effort: an inferred link to a supplier that cannot
                 * match any driver does not block the probe.
                 */
                if (dev_is_best_effort(dev) &&
                    link->flags & DL_FLAG_INFERRED &&
                    !link->supplier->can_match) {
                        ret = -EAGAIN;
                        continue;
                }

                /* Roll back the links already put into "consumer probe". */
                device_links_missing_supplier(dev);
                dev_err_probe(dev, -EPROBE_DEFER,
                              "supplier %s not ready\n",
                              dev_name(link->supplier));
                ret = -EPROBE_DEFER;
                break;
        }
        dev->links.status = DL_DEV_PROBING;

        device_links_write_unlock();

        if (ret)
                return ret;

        return fwnode_ret;
}

/**
 * __device_links_queue_sync_state - Queue a device for sync_state() callback
 * @dev: Device to call sync_state() on
 * @list: List head to queue the @dev on
 *
 * Put @dev on @list so that its sync_state() callback can be invoked later,
 * once the device links write lock has been released.  That lets the
 * sync_state() execution flow use device links APIs.  The caller must hold
 * device_links_write_lock() when calling this function.
 *
 * Nothing is queued if @dev has no sync_state() support, if its state has been
 * synced already, or if any of its managed links to consumers is not active.
 *
 * A reference to @dev is taken with get_device() so that it cannot be freed
 * while it sits on @list.  The caller must therefore call
 * device_links_flush_sync_list() as soon as it has released
 * device_links_write_lock(), both to get sync_state() called in a timely
 * fashion and to have the reference dropped again.
 */
static void __device_links_queue_sync_state(struct device *dev,
                                            struct list_head *list)
{
        struct device_link *link;

        if (!dev_has_sync_state(dev) || dev->state_synced)
                return;

        list_for_each_entry(link, &dev->links.consumers, s_node) {
                if ((link->flags & DL_FLAG_MANAGED) &&
                    link->status != DL_STATE_ACTIVE)
                        return;
        }

        /*
         * Mark the device as synced right away so that it cannot be queued a
         * second time, which could otherwise happen if new consumers get
         * added to it and probed before the list is flushed.
         */
        dev->state_synced = true;

        if (WARN_ON(!list_empty(&dev->links.defer_sync)))
                return;

        get_device(dev);
        list_add_tail(&dev->links.defer_sync, list);
}

/**
 * device_links_flush_sync_list - Call sync_state() on a list of devices
 * @list: List of devices to call sync_state() on
 * @dont_lock_dev: Device for which lock is already held by the caller
 *
 * Counterpart of __device_links_queue_sync_state(): every device queued on
 * @list is taken off the list, gets its sync_state() callback invoked under
 * its device lock, and has the reference taken at queue time dropped.
 * @dont_lock_dev is skipped when taking device locks, which is what a caller
 * that already holds that device's lock needs.
 */
static void device_links_flush_sync_list(struct list_head *list,
                                         struct device *dont_lock_dev)
{
        struct device *dev, *next;

        list_for_each_entry_safe(dev, next, list, links.defer_sync) {
                const bool take_lock = dev != dont_lock_dev;

                list_del_init(&dev->links.defer_sync);

                if (take_lock)
                        device_lock(dev);

                dev_sync_state(dev);

                if (take_lock)
                        device_unlock(dev);

                put_device(dev);
        }
}

/*
 * Pause the queueing of sync_state() callbacks by incrementing
 * defer_sync_state_count under the device links write lock.  Each call is
 * undone by one call to device_links_supplier_sync_state_resume().
 */
void device_links_supplier_sync_state_pause(void)
{
        device_links_write_lock();
        defer_sync_state_count++;
        device_links_write_unlock();
}

/*
 * Undo one device_links_supplier_sync_state_pause() call.
 *
 * When the pause count drops to zero, every device parked on deferred_sync is
 * re-evaluated for sync_state() and, if eligible, gets the callback invoked
 * after the device links write lock has been released.  A call without a
 * matching pause only triggers a warning.
 */
void device_links_supplier_sync_state_resume(void)
{
        struct device *dev, *next;
        LIST_HEAD(sync_list);

        device_links_write_lock();

        if (!defer_sync_state_count) {
                WARN(true, "Unmatched sync_state pause/resume!");
        } else {
                defer_sync_state_count--;

                if (!defer_sync_state_count) {
                        list_for_each_entry_safe(dev, next, &deferred_sync,
                                                 links.defer_sync) {
                                /*
                                 * The defer_sync node is shared between
                                 * deferred_sync and sync_list, so unlink it
                                 * from the former before queueing.
                                 */
                                list_del_init(&dev->links.defer_sync);
                                __device_links_queue_sync_state(dev, &sync_list);
                        }
                }
        }

        device_links_write_unlock();

        device_links_flush_sync_list(&sync_list, NULL);
}

/*
 * Late initcall that calls device_links_supplier_sync_state_resume() once.
 * Always returns 0.
 */
static int sync_state_resume_initcall(void)
{
        device_links_supplier_sync_state_resume();
        return 0;
}
late_initcall(sync_state_resume_initcall);

/*
 * Park @sup on the deferred_sync list, unless its defer_sync node is already
 * on some list or the device has no sync_state() support.
 */
static void __device_links_supplier_defer_sync(struct device *sup)
{
        if (!list_empty(&sup->links.defer_sync))
                return;

        if (!dev_has_sync_state(sup))
                return;

        list_add_tail(&sup->links.defer_sync, &deferred_sync);
}

/*
 * Stop treating @link as a managed link: clear DL_FLAG_MANAGED, reset the
 * status to DL_STATE_NONE and drop one kref.  If that was the last reference,
 * __device_link_del() runs, so the caller must not touch @link afterwards.
 */
static void device_link_drop_managed(struct device_link *link)
{
        link->flags &= ~DL_FLAG_MANAGED;
        WRITE_ONCE(link->status, DL_STATE_NONE);
        kref_put(&link->kref, __device_link_del);
}

static ssize_t waiting_for_supplier_show(struct device *dev,
                                         struct device_attribute *attr,
                                         char *buf)
{
        bool val;

        device_lock(dev);
        mutex_lock(&fwnode_link_lock);
        val = !!fwnode_links_check_suppliers(dev->fwnode);
        mutex_unlock(&fwnode_link_lock);
        device_unlock(dev);
        return sysfs_emit(buf, "%u\n", val);
}
static DEVICE_ATTR_RO(waiting_for_supplier);

/**
 * device_links_force_bind - Prepares device to be force bound
 * @dev: Consumer device.
 *
 * device_bind_driver() binds a device to a driver without calling any driver
 * probe functions, so the consumer is not going to wait for any supplier
 * before it gets bound.  The device link states should still be sensible
 * when that happens.
 *
 * To prepare for device_bind_driver(), walk the links from @dev to its
 * suppliers.  A link in the "available" state is moved to CONSUMER_PROBE; any
 * other managed link is dropped.  Links without the DL_FLAG_MANAGED flag set
 * are ignored.
 */
void device_links_force_bind(struct device *dev)
{
        struct device_link *link, *next;

        device_links_write_lock();

        list_for_each_entry_safe(link, next, &dev->links.suppliers, c_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                if (link->status == DL_STATE_AVAILABLE)
                        WRITE_ONCE(link->status, DL_STATE_CONSUMER_PROBE);
                else
                        device_link_drop_managed(link);
        }
        dev->links.status = DL_DEV_PROBING;

        device_links_write_unlock();
}

/**
 * device_links_driver_bound - Update device links after probing its driver.
 * @dev: Device to update the links for.
 *
 * The probe has been successful, so update links from this device to any
 * consumers by changing their status to "available".
 *
 * Also change the status of @dev's links to suppliers to "active".
 *
 * Devices that became eligible for sync_state() along the way are collected
 * on a local list while the device links write lock is held and get their
 * callbacks invoked after the lock has been released.
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 */
void device_links_driver_bound(struct device *dev)
{
        struct device_link *link, *ln;
        LIST_HEAD(sync_list);

        /*
         * If a device binds successfully, it's expected to have created all
         * the device links it needs to or make new device links as it needs
         * them. So, fw_devlink no longer needs to create device links to any
         * of the device's suppliers.
         *
         * Also, if a child firmware node of this bound device is not added as a
         * device by now, assume it is never going to be added. Make this bound
         * device the fallback supplier to the dangling consumers of the child
         * firmware node because this bound device is probably implementing the
         * child firmware node functionality and we don't want the dangling
         * consumers to defer probe indefinitely waiting for a device for the
         * child firmware node.
         */
        if (dev->fwnode && dev->fwnode->dev == dev) {
                struct fwnode_handle *child;
                fwnode_links_purge_suppliers(dev->fwnode);
                mutex_lock(&fwnode_link_lock);
                fwnode_for_each_available_child_node(dev->fwnode, child)
                        __fw_devlink_pickup_dangling_consumers(child,
                                                               dev->fwnode);
                __fw_devlink_link_to_consumers(dev);
                mutex_unlock(&fwnode_link_lock);
        }
        device_remove_file(dev, &dev_attr_waiting_for_supplier);

        device_links_write_lock();

        /* First pass: links on which @dev is the supplier. */
        list_for_each_entry(link, &dev->links.consumers, s_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                /*
                 * Links created during consumer probe may be in the "consumer
                 * probe" state to start with if the supplier is still probing
                 * when they are created and they may become "active" if the
                 * consumer probe returns first.  Skip them here.
                 */
                if (link->status == DL_STATE_CONSUMER_PROBE ||
                    link->status == DL_STATE_ACTIVE)
                        continue;

                WARN_ON(link->status != DL_STATE_DORMANT);
                WRITE_ONCE(link->status, DL_STATE_AVAILABLE);

                if (link->flags & DL_FLAG_AUTOPROBE_CONSUMER)
                        driver_deferred_probe_add(link->consumer);
        }

        /* While sync_state() is paused, park @dev instead of queueing it. */
        if (defer_sync_state_count)
                __device_links_supplier_defer_sync(dev);
        else
                __device_links_queue_sync_state(dev, &sync_list);

        /* Second pass: links on which @dev is the consumer. */
        list_for_each_entry_safe(link, ln, &dev->links.suppliers, c_node) {
                struct device *supplier;

                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                /* Read the supplier now: the link may be dropped below. */
                supplier = link->supplier;
                if (link->flags & DL_FLAG_SYNC_STATE_ONLY) {
                        /*
                         * When DL_FLAG_SYNC_STATE_ONLY is set, it means no
                         * other DL_MANAGED_LINK_FLAGS have been set. So, it's
                         * safe to drop the managed link completely.
                         */
                        device_link_drop_managed(link);
                } else if (dev_is_best_effort(dev) &&
                           link->flags & DL_FLAG_INFERRED &&
                           link->status != DL_STATE_CONSUMER_PROBE &&
                           !link->supplier->can_match) {
                        /*
                         * When dev_is_best_effort() is true, we ignore device
                         * links to suppliers that don't have a driver.  If the
                         * consumer device still managed to probe, there's no
                         * point in maintaining a device link in a weird state
                         * (consumer probed before supplier). So delete it.
                         */
                        device_link_drop_managed(link);
                } else {
                        WARN_ON(link->status != DL_STATE_CONSUMER_PROBE);
                        WRITE_ONCE(link->status, DL_STATE_ACTIVE);
                }

                /*
                 * This needs to be done even for the deleted
                 * DL_FLAG_SYNC_STATE_ONLY device link in case it was the last
                 * device link that was preventing the supplier from getting a
                 * sync_state() call.
                 */
                if (defer_sync_state_count)
                        __device_links_supplier_defer_sync(supplier);
                else
                        __device_links_queue_sync_state(supplier, &sync_list);
        }

        dev->links.status = DL_DEV_DRIVER_BOUND;

        device_links_write_unlock();

        /* @dev is passed so the flush does not take its device lock. */
        device_links_flush_sync_list(&sync_list, dev);
}

/**
 * __device_links_no_driver - Update links of a device without a driver.
 * @dev: Device without a driver.
 *
 * Delete all non-persistent links from this device to any suppliers.
 *
 * Persistent links stay around.  Those in the "consumer probe" or "active"
 * state become "available" if the supplier has a driver bound and "dormant"
 * otherwise; links in any other state (for instance "supplier unbind in
 * progress") are left untouched.
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 */
static void __device_links_no_driver(struct device *dev)
{
        struct device_link *link, *prev;

        list_for_each_entry_safe_reverse(link, prev, &dev->links.suppliers, c_node) {
                bool probing_or_active;

                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER) {
                        device_link_drop_managed(link);
                        continue;
                }

                probing_or_active = link->status == DL_STATE_CONSUMER_PROBE ||
                                    link->status == DL_STATE_ACTIVE;
                if (!probing_or_active)
                        continue;

                if (link->supplier->links.status != DL_DEV_DRIVER_BOUND) {
                        /* Only expected for sync-state-only links. */
                        WARN_ON(!(link->flags & DL_FLAG_SYNC_STATE_ONLY));
                        WRITE_ONCE(link->status, DL_STATE_DORMANT);
                        continue;
                }

                WRITE_ONCE(link->status, DL_STATE_AVAILABLE);
        }

        dev->links.status = DL_DEV_NO_DRIVER;
}

/**
 * device_links_no_driver - Update links after failing driver probe.
 * @dev: Device whose driver has just failed to probe.
 *
 * Clean up leftover links to consumers for @dev and then call
 * %__device_links_no_driver() to update its links to suppliers as
 * appropriate.
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 */
void device_links_no_driver(struct device *dev)
{
        struct device_link *link;

        device_links_write_lock();

        list_for_each_entry(link, &dev->links.consumers, s_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                if (link->status != DL_STATE_CONSUMER_PROBE &&
                    link->status != DL_STATE_ACTIVE)
                        continue;

                /*
                 * The probe has failed, so a link in the "consumer probe" or
                 * "active" state must have been added by a probing consumer
                 * while this device was still probing.  It still describes a
                 * valid relationship, but not a functionally meaningful one,
                 * so make it "dormant".
                 */
                WRITE_ONCE(link->status, DL_STATE_DORMANT);
        }

        __device_links_no_driver(dev);

        device_links_write_unlock();
}

/**
 * device_links_driver_cleanup - Update links after driver removal.
 * @dev: Device whose driver has just gone away.
 *
 * Update links to consumers for @dev by changing their status to "dormant" and
 * invoke %__device_links_no_driver() to update links to suppliers for it as
 * appropriate.
 *
 * Links to consumers that carry DL_FLAG_AUTOREMOVE_SUPPLIER and have reached
 * the "supplier unbind" state are dropped instead of being made "dormant".
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 */
void device_links_driver_cleanup(struct device *dev)
{
        struct device_link *link, *ln;

        device_links_write_lock();

        list_for_each_entry_safe(link, ln, &dev->links.consumers, s_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                WARN_ON(link->flags & DL_FLAG_AUTOREMOVE_CONSUMER);
                WARN_ON(link->status != DL_STATE_SUPPLIER_UNBIND);

                /*
                 * autoremove the links between this @dev and its consumer
                 * devices that are not active, i.e. where the link state
                 * has moved to DL_STATE_SUPPLIER_UNBIND.
                 */
                if (link->status == DL_STATE_SUPPLIER_UNBIND &&
                    link->flags & DL_FLAG_AUTOREMOVE_SUPPLIER) {
                        /*
                         * device_link_drop_managed() resets the status to
                         * DL_STATE_NONE and drops a reference, which may
                         * delete the link.  Do not touch it afterwards.
                         */
                        device_link_drop_managed(link);
                        continue;
                }

                WRITE_ONCE(link->status, DL_STATE_DORMANT);
        }

        /* Make sure @dev is not left on a deferred sync_state() list. */
        list_del_init(&dev->links.defer_sync);
        __device_links_no_driver(dev);

        device_links_write_unlock();
}

/**
 * device_links_busy - Check if there are any busy links to consumers.
 * @dev: Device to check.
 *
 * Walk the managed links to consumers of @dev.  A link whose status is
 * "consumer probe" or "active" (the consumer is probing right now or has a
 * driver bound) makes the device busy and ends the walk.  Every link visited
 * before that is moved to "supplier unbind" so that its consumer cannot be
 * probed successfully going forward.
 *
 * The status of @dev itself is set to DL_DEV_UNBINDING in either case.
 *
 * Return: 'true' if a probing or active consumer was found, 'false' otherwise.
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 */
bool device_links_busy(struct device *dev)
{
        struct device_link *link;
        bool busy = false;

        device_links_write_lock();

        list_for_each_entry(link, &dev->links.consumers, s_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                if (link->status != DL_STATE_CONSUMER_PROBE &&
                    link->status != DL_STATE_ACTIVE) {
                        /* Idle consumer: block it from probing from now on. */
                        WRITE_ONCE(link->status, DL_STATE_SUPPLIER_UNBIND);
                        continue;
                }

                /* First busy consumer found; the rest are left untouched. */
                busy = true;
                break;
        }

        dev->links.status = DL_DEV_UNBINDING;

        device_links_write_unlock();
        return busy;
}

/**
 * device_links_unbind_consumers - Force unbind consumers of the given device.
 * @dev: Device to unbind the consumers of.
 *
 * Walk the list of links to consumers for @dev and if any of them is in the
 * "consumer probe" state, wait for all device probes in progress to complete
 * and start over.
 *
 * If that's not the case, change the status of the link to "supplier unbind"
 * and check if the link was in the "active" state.  If so, force the consumer
 * driver to unbind and start over (the consumer will not re-probe as we have
 * changed the state of the link already).
 *
 * Links without the DL_FLAG_MANAGED flag set, and links with
 * DL_FLAG_SYNC_STATE_ONLY set, are ignored.
 */
void device_links_unbind_consumers(struct device *dev)
{
        struct device_link *link;

 start:
        device_links_write_lock();

        list_for_each_entry(link, &dev->links.consumers, s_node) {
                enum device_link_state status;

                if (!(link->flags & DL_FLAG_MANAGED) ||
                    link->flags & DL_FLAG_SYNC_STATE_ONLY)
                        continue;

                /* Snapshot the status before it is overwritten below. */
                status = link->status;
                if (status == DL_STATE_CONSUMER_PROBE) {
                        /*
                         * Drop the lock before waiting, then restart the walk
                         * from the top since the lock was released.
                         */
                        device_links_write_unlock();

                        wait_for_device_probe();
                        goto start;
                }
                WRITE_ONCE(link->status, DL_STATE_SUPPLIER_UNBIND);
                if (status == DL_STATE_ACTIVE) {
                        struct device *consumer = link->consumer;

                        /*
                         * Hold a reference on the consumer across the unlocked
                         * driver release; it is put again right after.
                         */
                        get_device(consumer);

                        device_links_write_unlock();

                        device_release_driver_internal(consumer, NULL,
                                                       consumer->parent);
                        put_device(consumer);
                        goto start;
                }
        }

        device_links_write_unlock();
}

/**
 * device_links_purge - Delete existing links to other devices.
 * @dev: Target device.
 *
 * Does nothing for devices that belong to devlink_class.  For any other
 * device, every link on its supplier list and on its consumer list is deleted
 * under the device links write lock.  A warning is emitted for links found in
 * an unexpected state, but they are deleted regardless.
 */
static void device_links_purge(struct device *dev)
{
        struct device_link *link, *ln;

        /* NOTE(review): presumably these are the link devices themselves - confirm. */
        if (dev->class == &devlink_class)
                return;

        /*
         * Delete all of the remaining links from this device to any other
         * devices (either consumers or suppliers).
         */
        device_links_write_lock();

        /* Links where @dev is the consumer: none should still be active. */
        list_for_each_entry_safe_reverse(link, ln, &dev->links.suppliers, c_node) {
                WARN_ON(link->status == DL_STATE_ACTIVE);
                __device_link_del(&link->kref);
        }

        /* Links where @dev is the supplier: expected to be dormant or unused. */
        list_for_each_entry_safe_reverse(link, ln, &dev->links.consumers, s_node) {
                WARN_ON(link->status != DL_STATE_DORMANT &&
                        link->status != DL_STATE_NONE);
                __device_link_del(&link->kref);
        }

        device_links_write_unlock();
}

/* Device link flag sets selected by the "fw_devlink=" boot parameter. */
#define FW_DEVLINK_FLAGS_PERMISSIVE        (DL_FLAG_INFERRED | \
                                         DL_FLAG_SYNC_STATE_ONLY)
#define FW_DEVLINK_FLAGS_ON                (DL_FLAG_INFERRED | \
                                         DL_FLAG_AUTOPROBE_CONSUMER)
#define FW_DEVLINK_FLAGS_RPM                (FW_DEVLINK_FLAGS_ON | \
                                         DL_FLAG_PM_RUNTIME)

/* Flags applied to links created by fw_devlink; 0 means fw_devlink is off. */
static u32 fw_devlink_flags = FW_DEVLINK_FLAGS_RPM;

/*
 * Parse "fw_devlink=<off|permissive|on|rpm>".
 *
 * Returns -EINVAL when no value is given.  An unrecognized value is accepted
 * (0 is returned) and leaves the current setting untouched.
 */
static int __init fw_devlink_setup(char *arg)
{
        if (!arg)
                return -EINVAL;

        if (!strcmp(arg, "off"))
                fw_devlink_flags = 0;
        else if (!strcmp(arg, "permissive"))
                fw_devlink_flags = FW_DEVLINK_FLAGS_PERMISSIVE;
        else if (!strcmp(arg, "on"))
                fw_devlink_flags = FW_DEVLINK_FLAGS_ON;
        else if (!strcmp(arg, "rpm"))
                fw_devlink_flags = FW_DEVLINK_FLAGS_RPM;

        return 0;
}
early_param("fw_devlink", fw_devlink_setup);

/* Set from the "fw_devlink.strict" boot parameter; read by fw_devlink_is_strict(). */
static bool fw_devlink_strict;

/*
 * Parse "fw_devlink.strict=<bool>" into fw_devlink_strict and return whatever
 * kstrtobool() returns for @arg.
 */
static int __init fw_devlink_strict_setup(char *arg)
{
        return kstrtobool(arg, &fw_devlink_strict);
}
early_param("fw_devlink.strict", fw_devlink_strict_setup);

/* Values of fw_devlink_sync_state, selected by "fw_devlink.sync_state=". */
#define FW_DEVLINK_SYNC_STATE_STRICT        0
#define FW_DEVLINK_SYNC_STATE_TIMEOUT        1

/* Defaults to "timeout" only when CONFIG_FW_DEVLINK_SYNC_STATE_TIMEOUT is set. */
#ifndef CONFIG_FW_DEVLINK_SYNC_STATE_TIMEOUT
static int fw_devlink_sync_state;
#else
static int fw_devlink_sync_state = FW_DEVLINK_SYNC_STATE_TIMEOUT;
#endif

/*
 * Parse "fw_devlink.sync_state=<strict|timeout>".
 *
 * Returns 0 on a recognized value and -EINVAL for a missing or unknown one,
 * in which case the current setting is left unchanged.
 */
static int __init fw_devlink_sync_state_setup(char *arg)
{
        int mode;

        if (!arg)
                return -EINVAL;

        if (!strcmp(arg, "strict"))
                mode = FW_DEVLINK_SYNC_STATE_STRICT;
        else if (!strcmp(arg, "timeout"))
                mode = FW_DEVLINK_SYNC_STATE_TIMEOUT;
        else
                return -EINVAL;

        fw_devlink_sync_state = mode;
        return 0;
}
early_param("fw_devlink.sync_state", fw_devlink_sync_state_setup);

/*
 * Pick the device link flags for a fwnode link with the given flags: links
 * marked FWLINK_FLAG_CYCLE get the permissive flag set plus DL_FLAG_CYCLE,
 * everything else gets the globally configured fw_devlink_flags.
 */
static inline u32 fw_devlink_get_flags(u8 fwlink_flags)
{
        const bool in_cycle = fwlink_flags & FWLINK_FLAG_CYCLE;

        if (!in_cycle)
                return fw_devlink_flags;

        return FW_DEVLINK_FLAGS_PERMISSIVE | DL_FLAG_CYCLE;
}

/*
 * Return true when the configured flag set is exactly
 * FW_DEVLINK_FLAGS_PERMISSIVE (the value stored for "fw_devlink=permissive").
 */
static bool fw_devlink_is_permissive(void)
{
        return fw_devlink_flags == FW_DEVLINK_FLAGS_PERMISSIVE;
}

/*
 * Return true when strict mode was requested via "fw_devlink.strict" and
 * fw_devlink is not running in permissive mode.
 */
bool fw_devlink_is_strict(void)
{
        if (!fw_devlink_strict)
                return false;

        return !fw_devlink_is_permissive();
}

/*
 * Invoke the add_links fwnode operation for @fwnode once; the
 * FWNODE_FLAG_LINKS_ADDED flag records that it has already been done.
 */
static void fw_devlink_parse_fwnode(struct fwnode_handle *fwnode)
{
        if (!(fwnode->flags & FWNODE_FLAG_LINKS_ADDED)) {
                fwnode_call_int_op(fwnode, add_links);
                fwnode->flags |= FWNODE_FLAG_LINKS_ADDED;
        }
}

/*
 * Parse @fwnode and then, recursively, each of its available child nodes
 * (depth first, parent before children).
 */
static void fw_devlink_parse_fwtree(struct fwnode_handle *fwnode)
{
        struct fwnode_handle *child;

        fw_devlink_parse_fwnode(fwnode);

        for (child = fwnode_get_next_available_child_node(fwnode, NULL);
             child;
             child = fwnode_get_next_available_child_node(fwnode, child))
                fw_devlink_parse_fwtree(child);
}

/*
 * Downgrade an inferred device link to a managed, permissive
 * (SYNC_STATE_ONLY) link.  Links that were not inferred, or that already are
 * sync-state-only, are left alone.
 */
static void fw_devlink_relax_link(struct device_link *link)
{
        if (!(link->flags & DL_FLAG_INFERRED) ||
            device_link_flag_is_sync_state_only(link->flags))
                return;

        /* Drop the runtime PM side of the link before its flags are replaced. */
        pm_runtime_drop_link(link);
        link->flags = DL_FLAG_MANAGED | FW_DEVLINK_FLAGS_PERMISSIVE;
        dev_dbg(link->consumer, "Relaxing link with %s\n",
                dev_name(link->supplier));
}

/*
 * class_for_each_device() callback: relax the link behind @dev when its
 * supplier has can_match unset.  @data is unused.  Always returns 0 so that
 * the iteration continues.
 */
static int fw_devlink_no_driver(struct device *dev, void *data)
{
        struct device_link *link = to_devlink(dev);

        if (link->supplier->can_match)
                return 0;

        fw_devlink_relax_link(link);
        return 0;
}

/*
 * Record that driver registration is done (fw_devlink_drv_reg_done) and, under
 * the device links write lock, run fw_devlink_no_driver() on every device of
 * devlink_class.
 */
void fw_devlink_drivers_done(void)
{
        fw_devlink_drv_reg_done = true;
        device_links_write_lock();
        class_for_each_device(&devlink_class, NULL, NULL,
                              fw_devlink_no_driver);
        device_links_write_unlock();
}

/*
 * class_for_each_device() callback used by fw_devlink_probing_done().
 *
 * @dev:  devlink class device wrapping the device link to examine.
 * @data: list head onto which suppliers needing a forced sync_state() are
 *        queued (through their links.defer_sync node).
 *
 * For a managed, non-active link whose supplier has a sync_state() callback
 * that has not run yet: in strict mode only warn; otherwise mark the supplier
 * as synced, take a reference on it and queue it on @data, unless it is
 * already on a deferred sync list.  Always returns 0.
 */
static int fw_devlink_dev_sync_state(struct device *dev, void *data)
{
        struct device_link *link = to_devlink(dev);
        struct device *sup = link->supplier;

        if (!(link->flags & DL_FLAG_MANAGED))
                return 0;

        if (link->status == DL_STATE_ACTIVE || sup->state_synced)
                return 0;

        if (!dev_has_sync_state(sup))
                return 0;

        if (fw_devlink_sync_state == FW_DEVLINK_SYNC_STATE_STRICT) {
                dev_warn(sup, "sync_state() pending due to %s\n",
                         dev_name(link->consumer));
                return 0;
        }

        if (list_empty(&sup->links.defer_sync)) {
                dev_warn(sup, "Timed out. Forcing sync_state()\n");
                sup->state_synced = true;
                get_device(sup);
                list_add_tail(&sup->links.defer_sync, data);
        }

        return 0;
}

/*
 * Run fw_devlink_dev_sync_state() on every device of devlink_class while
 * holding the device links write lock, collecting suppliers on a local list,
 * then flush that list after the lock has been released.
 */
void fw_devlink_probing_done(void)
{
        LIST_HEAD(sync_list);

        device_links_write_lock();
        class_for_each_device(&devlink_class, NULL, &sync_list,
                              fw_devlink_dev_sync_state);
        device_links_write_unlock();
        /* Flushed outside the lock. */
        device_links_flush_sync_list(&sync_list, NULL);
}

/**
 * wait_for_init_devices_probe - Try to probe any device needed for init
 *
 * Some devices might need to be probed and bound successfully before the kernel
 * boot sequence can finish and move on to init/userspace. For example, a
 * network interface might need to be bound to be able to mount a NFS rootfs.
 *
 * With fw_devlink=on by default, some of these devices might be blocked from
 * probing because they are waiting on an optional supplier that doesn't have a
 * driver. While fw_devlink will eventually identify such devices and unblock
 * the probing automatically, it might be too late by the time it unblocks the
 * probing of devices. For example, the IP4 autoconfig might time out before
 * fw_devlink unblocks probing of the network interface.
 *
 * This function is available to temporarily try and probe all devices that have
 * a driver even if some of their suppliers haven't been added or don't have
 * drivers.
 *
 * The drivers can then decide which of the suppliers are optional vs mandatory
 * and probe the device if possible. By the time this function returns, all such
 * "best effort" probes are guaranteed to be completed. If a device successfully
 * probes in this mode, we delete all fw_devlink discovered dependencies of that
 * device where the supplier hasn't yet probed successfully because they have to
 * be optional dependencies.
 *
 * Any devices that didn't successfully probe go back to being treated as if
 * this function was never called.
 *
 * This also means that some devices that aren't needed for init and could have
 * waited for their optional supplier to probe (when the supplier's module is
 * loaded later on) would end up probing prematurely with limited functionality.
 * So call this function only when boot would fail without it.
 */
void __init wait_for_init_devices_probe(void)
{
        /* Nothing to do when fw_devlink is off or permissive. */
        if (!fw_devlink_flags || fw_devlink_is_permissive())
                return;

        /*
         * Wait for all ongoing probes to finish so that the "best effort" is
         * only applied to devices that can't probe otherwise.
         */
        wait_for_device_probe();

        pr_info("Trying to probe devices needed for running init ...\n");
        /* Best-effort mode is on only between the two waits. */
        fw_devlink_best_effort = true;
        driver_deferred_probe_trigger();

        /*
         * Wait for all "best effort" probes to finish before going back to
         * normal enforcement.
         */
        wait_for_device_probe();
        fw_devlink_best_effort = false;
}

/*
 * Relax every link to a consumer of @dev (see fw_devlink_relax_link()).
 * Does nothing when fw_devlink is off or permissive.
 */
static void fw_devlink_unblock_consumers(struct device *dev)
{
        struct device_link *link;

        if (fw_devlink_flags && !fw_devlink_is_permissive()) {
                device_links_write_lock();
                list_for_each_entry(link, &dev->links.consumers, s_node)
                        fw_devlink_relax_link(link);
                device_links_write_unlock();
        }
}

/*
 * Take a reference, via get_device(), on the struct device attached to
 * @fwnode.  Callers in this file treat a NULL result as "no device" and
 * release the reference with put_device().
 */
#define get_dev_from_fwnode(fwnode)        get_device((fwnode)->dev)

/*
 * Return true if @fwnode is flagged FWNODE_FLAG_INITIALIZED and either has no
 * struct device attached or its device is in the DL_DEV_NO_DRIVER state.
 */
static bool fwnode_init_without_drv(struct fwnode_handle *fwnode)
{
        struct device *dev;
        bool no_drv = true;

        if (!(fwnode->flags & FWNODE_FLAG_INITIALIZED))
                return false;

        dev = get_dev_from_fwnode(fwnode);
        if (dev)
                no_drv = dev->links.status == DL_DEV_NO_DRIVER;
        put_device(dev);

        return no_drv;
}

/*
 * Return true if any ancestor of @fwnode satisfies fwnode_init_without_drv().
 * @fwnode itself is not checked.
 */
static bool fwnode_ancestor_init_without_drv(struct fwnode_handle *fwnode)
{
        struct fwnode_handle *ancestor;

        fwnode_for_each_parent_node(fwnode, ancestor) {
                if (!fwnode_init_without_drv(ancestor))
                        continue;

                /* Leaving the walk early: drop the reference held on the node. */
                fwnode_handle_put(ancestor);
                return true;
        }

        return false;
}

/**
 * fwnode_is_ancestor_of - Test if @ancestor is ancestor of @child
 * @ancestor: Firmware node which is tested for being an ancestor
 * @child: Firmware node which is tested for being the child
 *
 * A node is considered an ancestor of itself too.  A NULL or error-pointer
 * @ancestor is never an ancestor of anything.
 *
 * Return: true if @ancestor is an ancestor of @child. Otherwise, returns false.
 */
static bool fwnode_is_ancestor_of(const struct fwnode_handle *ancestor,
                                  const struct fwnode_handle *child)
{
        struct fwnode_handle *node;

        if (IS_ERR_OR_NULL(ancestor))
                return false;

        if (child == ancestor)
                return true;

        fwnode_for_each_parent_node(child, node) {
                if (node != ancestor)
                        continue;

                /* Leaving the walk early: drop the reference held on the node. */
                fwnode_handle_put(node);
                return true;
        }

        return false;
}

/**
 * fwnode_get_next_parent_dev - Find device of closest ancestor fwnode
 * @fwnode: firmware node
 *
 * Walk up from @fwnode's parent and return the struct device of the first
 * ancestor firmware node that has one.
 *
 * A reference is taken on the returned device; the caller is responsible for
 * calling put_device() on it.
 *
 * Return: a pointer to the device of the @fwnode's closest ancestor, or NULL
 * if no ancestor has a device.
 */
static struct device *fwnode_get_next_parent_dev(const struct fwnode_handle *fwnode)
{
        struct fwnode_handle *ancestor;
        struct device *found;

        fwnode_for_each_parent_node(fwnode, ancestor) {
                found = get_dev_from_fwnode(ancestor);
                if (!found)
                        continue;

                /* Leaving the walk early: drop the reference held on the node. */
                fwnode_handle_put(ancestor);
                return found;
        }

        return NULL;
}

/**
 * __fw_devlink_relax_cycles - Relax and mark dependency cycles.
 * @con: Potential consumer device.
 * @sup_handle: Potential supplier's fwnode.
 *
 * Needs to be called with fwnode_lock and device link lock held.
 *
 * Check if @sup_handle or any of its ancestors or suppliers directly or
 * indirectly depend on @con. This function can detect multiple cycles between
 * @sup_handle and @con. When such dependency cycles are found, convert all
 * device links created solely by fw_devlink into SYNC_STATE_ONLY device links.
 * Also, mark all fwnode links in the cycle with FWLINK_FLAG_CYCLE so that when
 * they are converted into a device link in the future, they are created as
 * SYNC_STATE_ONLY device links. This is the equivalent of doing
 * fw_devlink=permissive just between the devices in the cycle. We need to do
 * this because, at this point, fw_devlink can't tell which of these
 * dependencies is not a real dependency.
 *
 * Return true if one or more cycles were found. Otherwise, return false.
 */
static bool __fw_devlink_relax_cycles(struct device *con,
                                 struct fwnode_handle *sup_handle)
{
        struct device *sup_dev = NULL, *par_dev = NULL;
        struct fwnode_link *link;
        struct device_link *dev_link;
        bool ret = false;

        if (!sup_handle)
                return false;

        /*
         * We aren't trying to find all cycles. Just a cycle between con and
         * sup_handle.
         */
        if (sup_handle->flags & FWNODE_FLAG_VISITED)
                return false;

        /* Mark the node as being on the current path; cleared again at "out". */
        sup_handle->flags |= FWNODE_FLAG_VISITED;

        sup_dev = get_dev_from_fwnode(sup_handle);

        /* Termination condition. */
        if (sup_dev == con) {
                pr_debug("----- cycle: start -----\n");
                ret = true;
                goto out;
        }

        /*
         * If sup_dev is bound to a driver and @con hasn't started binding to a
         * driver, sup_dev can't be a consumer of @con. So, no need to check
         * further.
         */
        if (sup_dev && sup_dev->links.status ==  DL_DEV_DRIVER_BOUND &&
            con->links.status == DL_DEV_NO_DRIVER) {
                ret = false;
                goto out;
        }

        /* 1. Follow the fwnode links to the suppliers of @sup_handle. */
        list_for_each_entry(link, &sup_handle->suppliers, c_hook) {
                if (link->flags & FWLINK_FLAG_IGNORE)
                        continue;

                if (__fw_devlink_relax_cycles(con, link->supplier)) {
                        __fwnode_link_cycle(link);
                        ret = true;
                }
        }

        /*
         * 2. Follow the parent.
         *
         * Give priority to device parent over fwnode parent to account for any
         * quirks in how fwnodes are converted to devices.
         */
        if (sup_dev)
                par_dev = get_device(sup_dev->parent);
        else
                par_dev = fwnode_get_next_parent_dev(sup_handle);

        if (par_dev && __fw_devlink_relax_cycles(con, par_dev->fwnode)) {
                pr_debug("%pfwf: cycle: child of %pfwf\n", sup_handle,
                         par_dev->fwnode);
                ret = true;
        }

        /* Without a device there are no device links left to follow. */
        if (!sup_dev)
                goto out;

        /* 3. Follow the existing device links to the suppliers of sup_dev. */
        list_for_each_entry(dev_link, &sup_dev->links.suppliers, c_node) {
                /*
                 * Ignore a SYNC_STATE_ONLY flag only if it wasn't marked as
                 * such due to a cycle.
                 */
                if (device_link_flag_is_sync_state_only(dev_link->flags) &&
                    !(dev_link->flags & DL_FLAG_CYCLE))
                        continue;

                if (__fw_devlink_relax_cycles(con,
                                              dev_link->supplier->fwnode)) {
                        pr_debug("%pfwf: cycle: depends on %pfwf\n", sup_handle,
                                 dev_link->supplier->fwnode);
                        fw_devlink_relax_link(dev_link);
                        dev_link->flags |= DL_FLAG_CYCLE;
                        ret = true;
                }
        }

out:
        /* Common exit: unmark the node and drop the references taken above. */
        sup_handle->flags &= ~FWNODE_FLAG_VISITED;
        put_device(sup_dev);
        put_device(par_dev);
        return ret;
}

/**
 * fw_devlink_create_devlink - Create a device link from a consumer to fwnode
 * @con: consumer device for the device link
 * @sup_handle: fwnode handle of supplier
 * @link: fwnode link that's being converted to a device link
 *
 * This function will try to create a device link between the consumer device
 * @con and the supplier device represented by @sup_handle.
 *
 * The supplier has to be provided as a fwnode because incorrect cycles in
 * fwnode links can sometimes cause the supplier device to never be created.
 * This function detects such cases and returns an error if it cannot create a
 * device link from the consumer to a missing supplier.
 *
 * Returns,
 * 0 on successfully creating a device link (also when @link is flagged
 *  FWLINK_FLAG_IGNORE or when the supplier device is @con itself)
 * -EINVAL if the device link cannot be created as expected
 * -EAGAIN if the device link cannot be created right now, but it may be
 *  possible to do that in the future
 */
static int fw_devlink_create_devlink(struct device *con,
                                     struct fwnode_handle *sup_handle,
                                     struct fwnode_link *link)
{
        struct device *sup_dev;
        int ret = 0;
        u32 flags;

        if (link->flags & FWLINK_FLAG_IGNORE)
                return 0;

        /*
         * Only a link whose consumer fwnode is @con's own fwnode gets the
         * configured flags; any other link gets the permissive flag set.
         */
        if (con->fwnode == link->consumer)
                flags = fw_devlink_get_flags(link->flags);
        else
                flags = FW_DEVLINK_FLAGS_PERMISSIVE;

        /*
         * In some cases, a device P might also be a supplier to its child node
         * C. However, this would defer the probe of C until the probe of P
         * completes successfully. This is perfectly fine in the device driver
         * model. device_add() doesn't guarantee probe completion of the device
         * by the time it returns.
         *
         * However, there are a few drivers that assume C will finish probing
         * as soon as it's added and before P finishes probing. So, we provide
         * a flag to let fw_devlink know not to delay the probe of C until the
         * probe of P completes successfully.
         *
         * When such a flag is set, we can't create device links where P is the
         * supplier of C as that would delay the probe of C.
         */
        if (sup_handle->flags & FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD &&
            fwnode_is_ancestor_of(sup_handle, con->fwnode))
                return -EINVAL;

        /*
         * SYNC_STATE_ONLY device links don't block probing and support cycles.
         * So, one might expect that cycle detection isn't necessary for them.
         * However, if the device link was marked as SYNC_STATE_ONLY because
         * it's part of a cycle, then we still need to do cycle detection. This
         * is because the consumer and supplier might be part of multiple cycles
         * and we need to detect all those cycles.
         */
        if (!device_link_flag_is_sync_state_only(flags) ||
            flags & DL_FLAG_CYCLE) {
                device_links_write_lock();
                if (__fw_devlink_relax_cycles(con, sup_handle)) {
                        __fwnode_link_cycle(link);
                        /* Recompute the flags from the updated link flags. */
                        flags = fw_devlink_get_flags(link->flags);
                        pr_debug("----- cycle: end -----\n");
                        dev_info(con, "Fixed dependency cycle(s) with %pfwf\n",
                                 sup_handle);
                }
                device_links_write_unlock();
        }

        /*
         * A fwnode flagged as never becoming a device is represented by the
         * device of its closest ancestor instead.
         */
        if (sup_handle->flags & FWNODE_FLAG_NOT_DEVICE)
                sup_dev = fwnode_get_next_parent_dev(sup_handle);
        else
                sup_dev = get_dev_from_fwnode(sup_handle);

        if (sup_dev) {
                /*
                 * If it's one of those drivers that don't actually bind to
                 * their device using driver core, then don't wait on this
                 * supplier device indefinitely.
                 */
                if (sup_dev->links.status == DL_DEV_NO_DRIVER &&
                    sup_handle->flags & FWNODE_FLAG_INITIALIZED) {
                        dev_dbg(con,
                                "Not linking %pfwf - dev might never probe\n",
                                sup_handle);
                        ret = -EINVAL;
                        goto out;
                }

                /* A device is never linked to itself; that case returns 0. */
                if (con != sup_dev && !device_link_add(con, sup_dev, flags)) {
                        dev_err(con, "Failed to create device link (0x%x) with %s\n",
                                flags, dev_name(sup_dev));
                        ret = -EINVAL;
                }

                goto out;
        }

        /*
         * Supplier or supplier's ancestor already initialized without a struct
         * device or being probed by a driver.
         */
        if (fwnode_init_without_drv(sup_handle) ||
            fwnode_ancestor_init_without_drv(sup_handle)) {
                dev_dbg(con, "Not linking %pfwf - might never become dev\n",
                        sup_handle);
                /* sup_dev is NULL on this path, so there is nothing to put. */
                return -EINVAL;
        }

        /* No supplier device yet; the caller may retry later. */
        ret = -EAGAIN;
out:
        put_device(sup_dev);
        return ret;
}

/**
 * __fw_devlink_link_to_consumers - Create device links to consumers of a device
 * @dev: Device that needs to be linked to its consumers
 *
 * This function looks at all the consumer fwnodes of @dev and creates device
 * links between the consumer device and @dev (supplier).
 *
 * If the consumer device has not been added yet, then this function creates a
 * SYNC_STATE_ONLY link between @dev (supplier) and the closest ancestor device
 * of the consumer fwnode. This is necessary to make sure @dev doesn't get a
 * sync_state() callback before the real consumer device gets to be added and
 * then probed.
 *
 * Once device links are created from the real consumer to @dev (supplier), the
 * fwnode links are deleted.
 */
static void __fw_devlink_link_to_consumers(struct device *dev)
{
        struct fwnode_handle *fwnode = dev->fwnode;
        struct fwnode_link *link, *tmp;

        list_for_each_entry_safe(link, tmp, &fwnode->consumers, s_hook) {
                struct device *con_dev;
                /* True when con_dev is the real consumer, not a proxy ancestor. */
                bool own_link = true;
                int ret;

                con_dev = get_dev_from_fwnode(link->consumer);
                /*
                 * If consumer device is not available yet, make a "proxy"
                 * SYNC_STATE_ONLY link from the consumer's parent device to
                 * the supplier device. This is necessary to make sure the
                 * supplier doesn't get a sync_state() callback before the real
                 * consumer can create a device link to the supplier.
                 *
                 * This proxy link step is needed to handle the case where the
                 * consumer's parent device is added before the supplier.
                 */
                if (!con_dev) {
                        con_dev = fwnode_get_next_parent_dev(link->consumer);
                        /*
                         * However, if the consumer's parent device is also the
                         * parent of the supplier, don't create a
                         * consumer-supplier link from the parent to its child
                         * device. Such a dependency is impossible.
                         */
                        if (con_dev &&
                            fwnode_is_ancestor_of(con_dev->fwnode, fwnode)) {
                                put_device(con_dev);
                                con_dev = NULL;
                        } else {
                                own_link = false;
                        }
                }

                if (!con_dev)
                        continue;

                ret = fw_devlink_create_devlink(con_dev, fwnode, link);
                put_device(con_dev);
                /*
                 * Keep the fwnode link when only a proxy link was attempted or
                 * when the attempt should be retried later; otherwise it is
                 * deleted, whether or not a device link was created.
                 */
                if (!own_link || ret == -EAGAIN)
                        continue;

                __fwnode_link_del(link);
        }
}

/**
 * __fw_devlink_link_to_suppliers - Create device links to suppliers of a device
 * @dev: The consumer device that needs to be linked to its suppliers
 * @fwnode: Root of the fwnode tree that is used to create device links
 *
 * This function looks at all the supplier fwnodes of the fwnode tree rooted at
 * @fwnode and creates device links between @dev (consumer) and all the
 * supplier devices of the entire fwnode tree at @fwnode.
 *
 * The function creates normal (non-SYNC_STATE_ONLY) device links between @dev
 * and the real suppliers of @dev. Once these device links are created, the
 * fwnode links are deleted.
 *
 * In addition, it also looks at all the suppliers of the entire fwnode tree
 * because some of the child devices of @dev that have not been added yet
 * (because @dev hasn't probed) might already have their suppliers added to
 * driver core. So, this function creates SYNC_STATE_ONLY device links between
 * @dev (consumer) and these suppliers to make sure they don't execute their
 * sync_state() callbacks before these child devices have a chance to create
 * their device links. The fwnode links that correspond to the child devices
 * aren't deleted because they are needed later to create the device links
 * between the real consumer and supplier devices.
 */
static void __fw_devlink_link_to_suppliers(struct device *dev,
                                           struct fwnode_handle *fwnode)
{
        /* Only links hanging off @dev's own fwnode may be deleted below. */
        bool own_link = (dev->fwnode == fwnode);
        struct fwnode_link *link, *next;
        struct fwnode_handle *child;

        list_for_each_entry_safe(link, next, &fwnode->suppliers, c_hook) {
                int ret = fw_devlink_create_devlink(dev, link->supplier, link);

                /* Keep proxy links and links that should be retried later. */
                if (own_link && ret != -EAGAIN)
                        __fwnode_link_del(link);
        }

        /*
         * Make "proxy" SYNC_STATE_ONLY device links to represent the needs of
         * all the descendants. This proxy link step is needed to handle the
         * case where the supplier is added before the consumer's parent device
         * (@dev).
         */
        for (child = fwnode_get_next_available_child_node(fwnode, NULL);
             child;
             child = fwnode_get_next_available_child_node(fwnode, child))
                __fw_devlink_link_to_suppliers(dev, child);
}

/*
 * Parse the fwnode tree of @dev and then, under fwnode_link_lock, create
 * device links first to its consumers and then to its suppliers.  Does
 * nothing when fw_devlink is off.
 */
static void fw_devlink_link_device(struct device *dev)
{
        struct fwnode_handle *fwnode;

        if (!fw_devlink_flags)
                return;

        fwnode = dev->fwnode;
        fw_devlink_parse_fwtree(fwnode);

        mutex_lock(&fwnode_link_lock);
        __fw_devlink_link_to_consumers(dev);
        __fw_devlink_link_to_suppliers(dev, fwnode);
        mutex_unlock(&fwnode_link_lock);
}

/* Device links support end. */

/* NOTE(review): presumably the /sys/dev kobject, parent of the two below - confirm. */
static struct kobject *dev_kobj;

/* /sys/dev/char */
static struct kobject *sysfs_dev_char_kobj;

/* /sys/dev/block */
static struct kobject *sysfs_dev_block_kobj;

/* Taken and released through the lock_device_hotplug*() helpers below. */
static DEFINE_MUTEX(device_hotplug_lock);

/* Acquire device_hotplug_lock, sleeping until it becomes available. */
void lock_device_hotplug(void)
{
        mutex_lock(&device_hotplug_lock);
}

/* Release device_hotplug_lock. */
void unlock_device_hotplug(void)
{
        mutex_unlock(&device_hotplug_lock);
}

/*
 * Try to take device_hotplug_lock without blocking on it.
 *
 * Returns 0 with the lock held on success.  If the lock is contended, sleep
 * briefly and return the value of restart_syscall(); the lock is NOT held in
 * that case.
 */
int lock_device_hotplug_sysfs(void)
{
        if (!mutex_trylock(&device_hotplug_lock)) {
                /* Avoid busy looping (5 ms of sleep should do). */
                msleep(5);
                return restart_syscall();
        }

        return 0;
}

/*
 * Return non-zero unless @dev has the partition device type.  Without
 * CONFIG_BLOCK the check is skipped and the result is always 1.
 */
#ifdef CONFIG_BLOCK
static inline int device_is_not_partition(struct device *dev)
{
        return dev->type != &part_type;
}
#else
static inline int device_is_not_partition(struct device *dev)
{
        return 1;
}
#endif

/* Notify the ACPI layer and then the software node layer about @dev. */
static void device_platform_notify(struct device *dev)
{
        acpi_device_notify(dev);

        software_node_notify(dev);
}

/*
 * Removal counterpart of device_platform_notify(): the two layers are
 * notified in the reverse order (software nodes first, then ACPI).
 */
static void device_platform_notify_remove(struct device *dev)
{
        software_node_notify_remove(dev);

        acpi_device_notify_remove(dev);
}

/**
 * dev_driver_string - Return a device's driver name, if at all possible
 * @dev: struct device to get the name of
 *
 * Returns the name of the driver bound to @dev.  If no driver is bound, the
 * result of dev_bus_name() for @dev is returned instead: the name of the bus
 * the device is attached to, or an empty string if it is not attached to a
 * bus either.
 */
const char *dev_driver_string(const struct device *dev)
{
        /*
         * dev->driver can become NULL underneath us because of unbinding, so
         * read it exactly once and only use that snapshot.  dev->bus and
         * dev->class should never change once they are set, so they don't
         * need special care.
         */
        struct device_driver *drv = READ_ONCE(dev->driver);

        if (drv)
                return drv->name;

        return dev_bus_name(dev);
}
EXPORT_SYMBOL(dev_driver_string);

/* Map a struct attribute pointer to the device_attribute embedding it as ->attr. */
#define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr)

/*
 * sysfs ->show hook for device attributes.
 *
 * Forwards the read to the attribute's own show() method and returns its
 * result; returns -EIO when the attribute has no show() method.  A result
 * of PAGE_SIZE or more is reported with a printk but still returned as is.
 */
static ssize_t dev_attr_show(struct kobject *kobj, struct attribute *attr,
                             char *buf)
{
        struct device *dev = kobj_to_dev(kobj);
        struct device_attribute *dev_attr = to_dev_attr(attr);
        ssize_t ret;

        if (!dev_attr->show)
                return -EIO;

        ret = dev_attr->show(dev, dev_attr, buf);
        if (ret >= (ssize_t)PAGE_SIZE)
                printk("dev_attr_show: %pS returned bad count\n",
                       dev_attr->show);

        return ret;
}

/*
 * sysfs ->store hook for device attributes.
 *
 * Forwards the write to the attribute's own store() method and returns its
 * result; returns -EIO when the attribute has no store() method.
 */
static ssize_t dev_attr_store(struct kobject *kobj, struct attribute *attr,
                              const char *buf, size_t count)
{
        struct device *dev = kobj_to_dev(kobj);
        struct device_attribute *dev_attr = to_dev_attr(attr);

        if (!dev_attr->store)
                return -EIO;

        return dev_attr->store(dev, dev_attr, buf, count);
}

/*
 * sysfs operations used by device_ktype: reads and writes are dispatched
 * through dev_attr_show() and dev_attr_store().
 */
static const struct sysfs_ops dev_sysfs_ops = {
        .show        = dev_attr_show,
        .store        = dev_attr_store,
};

/* Map a device_attribute pointer to the dev_ext_attribute embedding it as ->attr. */
#define to_ext_attr(x) container_of(x, struct dev_ext_attribute, attr)

/**
 * device_store_ulong - store() helper for an unsigned long backed attribute
 * @dev: device the attribute belongs to (unused here)
 * @attr: attribute embedded in a struct dev_ext_attribute
 * @buf: text written by the user
 * @size: number of bytes written
 *
 * Parses @buf with kstrtoul() (base 0) and writes the value through the
 * extended attribute's ->var pointer, treated as an unsigned long.
 *
 * Return: @size on success, or the kstrtoul() error code.
 */
ssize_t device_store_ulong(struct device *dev,
                           struct device_attribute *attr,
                           const char *buf, size_t size)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);
        unsigned long value;
        int err;

        err = kstrtoul(buf, 0, &value);
        if (err)
                return err;

        *(unsigned long *)ea->var = value;

        /* Always return full write size even if we didn't consume all */
        return size;
}
EXPORT_SYMBOL_GPL(device_store_ulong);

/**
 * device_show_ulong - show() helper for an unsigned long backed attribute
 * @dev: device the attribute belongs to (unused here)
 * @attr: attribute embedded in a struct dev_ext_attribute
 * @buf: sysfs output buffer
 *
 * Emits the unsigned long behind the extended attribute's ->var pointer
 * using the "%lx" format, i.e. hexadecimal without a prefix.
 *
 * Return: the value returned by sysfs_emit().
 */
ssize_t device_show_ulong(struct device *dev,
                          struct device_attribute *attr,
                          char *buf)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);
        unsigned long value = *(unsigned long *)ea->var;

        return sysfs_emit(buf, "%lx\n", value);
}
EXPORT_SYMBOL_GPL(device_show_ulong);

/**
 * device_store_int - store() helper for an int backed attribute
 * @dev: device the attribute belongs to (unused here)
 * @attr: attribute embedded in a struct dev_ext_attribute
 * @buf: text written by the user
 * @size: number of bytes written
 *
 * Parses @buf with kstrtol() (base 0), rejects values that do not fit in an
 * int and writes the result through the extended attribute's ->var pointer.
 *
 * Return: @size on success, the kstrtol() error code on a parse failure, or
 * -EINVAL when the value lies outside [INT_MIN, INT_MAX].
 */
ssize_t device_store_int(struct device *dev,
                         struct device_attribute *attr,
                         const char *buf, size_t size)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);
        long value;
        int err;

        err = kstrtol(buf, 0, &value);
        if (err)
                return err;

        if (value < INT_MIN || value > INT_MAX)
                return -EINVAL;

        *(int *)ea->var = value;

        /* Always return full write size even if we didn't consume all */
        return size;
}
EXPORT_SYMBOL_GPL(device_store_int);

/**
 * device_show_int - show() helper for an int backed attribute
 * @dev: device the attribute belongs to (unused here)
 * @attr: attribute embedded in a struct dev_ext_attribute
 * @buf: sysfs output buffer
 *
 * Emits the int behind the extended attribute's ->var pointer in decimal.
 *
 * Return: the value returned by sysfs_emit().
 */
ssize_t device_show_int(struct device *dev,
                        struct device_attribute *attr,
                        char *buf)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);
        int value = *(int *)ea->var;

        return sysfs_emit(buf, "%d\n", value);
}
EXPORT_SYMBOL_GPL(device_show_int);

/**
 * device_store_bool - store() helper for a bool backed attribute
 * @dev: device the attribute belongs to (unused here)
 * @attr: attribute embedded in a struct dev_ext_attribute
 * @buf: text written by the user
 * @size: number of bytes written
 *
 * Hands @buf and the extended attribute's ->var pointer to kstrtobool().
 *
 * Return: @size on success, -EINVAL when kstrtobool() reports an error.
 */
ssize_t device_store_bool(struct device *dev, struct device_attribute *attr,
                          const char *buf, size_t size)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);
        int err;

        err = kstrtobool(buf, ea->var);
        if (err < 0)
                return -EINVAL;

        return size;
}
EXPORT_SYMBOL_GPL(device_store_bool);

ssize_t device_show_bool(struct device *dev, struct device_attribute *attr,
                         char *buf)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);

        return sysfs_emit(buf, "%d\n", *(bool *)(ea->var));
}
EXPORT_SYMBOL_GPL(device_show_bool);

/**
 * device_show_string - show() helper for a string backed attribute
 * @dev: device the attribute belongs to (unused here)
 * @attr: attribute embedded in a struct dev_ext_attribute
 * @buf: sysfs output buffer
 *
 * Emits the extended attribute's ->var pointer as a string followed by a
 * newline.
 *
 * Return: the value returned by sysfs_emit().
 */
ssize_t device_show_string(struct device *dev,
                           struct device_attribute *attr, char *buf)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);
        char *text = (char *)ea->var;

        return sysfs_emit(buf, "%s\n", text);
}
EXPORT_SYMBOL_GPL(device_show_string);

/**
 * device_release - free device structure.
 * @kobj: device's kobject.
 *
 * This is called once the reference count for the object
 * reaches 0. We forward the call to the device's release
 * method, which should handle actually freeing the structure.
 *
 * The release method is looked up on the device itself first, then on its
 * type, then on its class; a warning is raised when none of them provides
 * one.  The device's private data is freed last.
 */
static void device_release(struct kobject *kobj)
{
        struct device *dev = kobj_to_dev(kobj);
        /*
         * Fetch the private pointer up front: the release method called
         * below is expected to free *dev, so dev->p must not be read after.
         */
        struct device_private *p = dev->p;

        /*
         * Some platform devices are driven without driver attached
         * and managed resources may have been acquired.  Make sure
         * all resources are released.
         *
         * Drivers still can add resources into device after device
         * is deleted but alive, so release devres here to avoid
         * possible memory leak.
         */
        devres_release_all(dev);

        kfree(dev->dma_range_map);

        /* First release method found wins: device, then type, then class. */
        if (dev->release)
                dev->release(dev);
        else if (dev->type && dev->type->release)
                dev->type->release(dev);
        else if (dev->class && dev->class->dev_release)
                dev->class->dev_release(dev);
        else
                WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n",
                        dev_name(dev));
        kfree(p);
}

/*
 * kobj_type ->namespace hook for devices.
 *
 * Returns the value of the class's namespace() method when the device has a
 * class whose ns_type is set, and NULL otherwise.
 */
static const void *device_namespace(const struct kobject *kobj)
{
        const struct device *dev = kobj_to_dev(kobj);

        if (!dev->class || !dev->class->ns_type)
                return NULL;

        return dev->class->namespace(dev);
}

/*
 * kobj_type ->get_ownership hook for devices.
 *
 * Delegates to the class's get_ownership() method when the device has a
 * class providing one; otherwise @uid and @gid are left untouched.
 */
static void device_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
        const struct device *dev = kobj_to_dev(kobj);
        const struct class *class = dev->class;

        if (!class || !class->get_ownership)
                return;

        class->get_ownership(dev, uid, gid);
}

/*
 * kobj_type given to every device kobject by device_initialize(); also used
 * by dev_uevent_filter() to recognise device kobjects.
 */
static const struct kobj_type device_ktype = {
        .release        = device_release,
        .sysfs_ops        = &dev_sysfs_ops,
        .namespace        = device_namespace,
        .get_ownership        = device_get_ownership,
};


/*
 * kset ->filter hook: returns 1 only for kobjects of type device_ktype whose
 * device has a bus or a class, and 0 for everything else.
 */
static int dev_uevent_filter(const struct kobject *kobj)
{
        const struct device *dev;

        if (get_ktype(kobj) != &device_ktype)
                return 0;

        dev = kobj_to_dev(kobj);
        return dev->bus || dev->class;
}

/*
 * kset ->name hook: returns the bus name when the device has a bus, else the
 * class name when it has a class, else NULL.
 */
static const char *dev_uevent_name(const struct kobject *kobj)
{
        const struct device *dev = kobj_to_dev(kobj);

        if (dev->bus)
                return dev->bus->name;

        return dev->class ? dev->class->name : NULL;
}

static int dev_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
        const struct device *dev = kobj_to_dev(kobj);
        int retval = 0;

        /* add device node properties if present */
        if (MAJOR(dev->devt)) {
                const char *tmp;
                const char *name;
                umode_t mode = 0;
                kuid_t uid = GLOBAL_ROOT_UID;
                kgid_t gid = GLOBAL_ROOT_GID;

                add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt));
                add_uevent_var(env, "MINOR=%u", MINOR(dev->devt));
                name = device_get_devnode(dev, &mode, &uid, &gid, &tmp);
                if (name) {
                        add_uevent_var(env, "DEVNAME=%s", name);
                        if (mode)
                                add_uevent_var(env, "DEVMODE=%#o", mode & 0777);
                        if (!uid_eq(uid, GLOBAL_ROOT_UID))
                                add_uevent_var(env, "DEVUID=%u", from_kuid(&init_user_ns, uid));
                        if (!gid_eq(gid, GLOBAL_ROOT_GID))
                                add_uevent_var(env, "DEVGID=%u", from_kgid(&init_user_ns, gid));
                        kfree(tmp);
                }
        }

        if (dev->type && dev->type->name)
                add_uevent_var(env, "DEVTYPE=%s", dev->type->name);

        if (dev->driver)
                add_uevent_var(env, "DRIVER=%s", dev->driver->name);

        /* Add common DT information about the device */
        of_device_uevent(dev, env);

        /* have the bus specific function add its stuff */
        if (dev->bus && dev->bus->uevent) {
                retval = dev->bus->uevent(dev, env);
                if (retval)
                        pr_debug("device: '%s': %s: bus uevent() returned %d\n",
                                 dev_name(dev), __func__, retval);
        }

        /* have the class specific function add its stuff */
        if (dev->class && dev->class->dev_uevent) {
                retval = dev->class->dev_uevent(dev, env);
                if (retval)
                        pr_debug("device: '%s': %s: class uevent() "
                                 "returned %d\n", dev_name(dev),
                                 __func__, retval);
        }

        /* have the device type specific function add its stuff */
        if (dev->type && dev->type->uevent) {
                retval = dev->type->uevent(dev, env);
                if (retval)
                        pr_debug("device: '%s': %s: dev_type uevent() "
                                 "returned %d\n", dev_name(dev),
                                 __func__, retval);
        }

        return retval;
}

/* uevent callbacks for devices: filter, subsystem name and environment. */
static const struct kset_uevent_ops device_uevent_ops = {
        .filter =        dev_uevent_filter,
        .name =                dev_uevent_name,
        .uevent =        dev_uevent,
};

static ssize_t uevent_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
{
        struct kobject *top_kobj;
        struct kset *kset;
        struct kobj_uevent_env *env = NULL;
        int i;
        int len = 0;
        int retval;

        /* search the kset, the device belongs to */
        top_kobj = &dev->kobj;
        while (!top_kobj->kset && top_kobj->parent)
                top_kobj = top_kobj->parent;
        if (!top_kobj->kset)
                goto out;

        kset = top_kobj->kset;
        if (!kset->uevent_ops || !kset->uevent_ops->uevent)
                goto out;

        /* respect filter */
        if (kset->uevent_ops && kset->uevent_ops->filter)
                if (!kset->uevent_ops->filter(&dev->kobj))
                        goto out;

        env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
        if (!env)
                return -ENOMEM;

        /* Synchronize with really_probe() */
        device_lock(dev);
        /* let the kset specific function add its keys */
        retval = kset->uevent_ops->uevent(&dev->kobj, env);
        device_unlock(dev);
        if (retval)
                goto out;

        /* copy keys to file */
        for (i = 0; i < env->envp_idx; i++)
                len += sysfs_emit_at(buf, len, "%s\n", env->envp[i]);
out:
        kfree(env);
        return len;
}

/*
 * sysfs "uevent" write: pass the written text to kobject_synth_uevent() for
 * the device's kobject.
 *
 * Returns @count on success; on failure the error is logged with dev_err()
 * and returned.
 */
static ssize_t uevent_store(struct device *dev, struct device_attribute *attr,
                            const char *buf, size_t count)
{
        int rc = kobject_synth_uevent(&dev->kobj, buf, count);

        if (!rc)
                return count;

        dev_err(dev, "uevent: failed to send synthetic uevent: %d\n", rc);
        return rc;
}
static DEVICE_ATTR_RW(uevent);

/*
 * sysfs "online" read: emit 1 when dev->offline is clear and 0 when it is
 * set.  The flag is sampled with the device lock held.
 */
static ssize_t online_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
{
        bool online;

        device_lock(dev);
        online = !dev->offline;
        device_unlock(dev);

        return sysfs_emit(buf, "%u\n", online);
}

/*
 * sysfs "online" write: parse a boolean and bring the device online (true)
 * or offline (false) while holding the device hotplug lock.
 *
 * Returns @count on success, or the error from kstrtobool(),
 * lock_device_hotplug_sysfs(), device_online() or device_offline().
 */
static ssize_t online_store(struct device *dev, struct device_attribute *attr,
                            const char *buf, size_t count)
{
        bool want_online;
        int ret;

        ret = kstrtobool(buf, &want_online);
        if (ret < 0)
                return ret;

        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;

        if (want_online)
                ret = device_online(dev);
        else
                ret = device_offline(dev);
        unlock_device_hotplug();

        if (ret < 0)
                return ret;

        return count;
}
static DEVICE_ATTR_RW(online);

/*
 * sysfs "removable" read: emit "removable" for DEVICE_REMOVABLE, "fixed" for
 * DEVICE_FIXED and "unknown" for any other value of dev->removable.
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
                              char *buf)
{
        const char *state = "unknown";

        if (dev->removable == DEVICE_REMOVABLE)
                state = "removable";
        else if (dev->removable == DEVICE_FIXED)
                state = "fixed";

        return sysfs_emit(buf, "%s\n", state);
}
static DEVICE_ATTR_RO(removable);

/**
 * device_add_groups - create attribute groups on a device's kobject
 * @dev: device to add the groups to
 * @groups: attribute groups, passed straight to sysfs_create_groups()
 *
 * Return: the value returned by sysfs_create_groups().
 */
int device_add_groups(struct device *dev, const struct attribute_group **groups)
{
        return sysfs_create_groups(&dev->kobj, groups);
}
EXPORT_SYMBOL_GPL(device_add_groups);

/**
 * device_remove_groups - remove attribute groups from a device's kobject
 * @dev: device to remove the groups from
 * @groups: attribute groups, passed straight to sysfs_remove_groups()
 */
void device_remove_groups(struct device *dev,
                          const struct attribute_group **groups)
{
        sysfs_remove_groups(&dev->kobj, groups);
}
EXPORT_SYMBOL_GPL(device_remove_groups);

/*
 * devres payload for managed attribute groups.  devm_device_add_group()
 * stores a single group in ->group and devm_attr_group_remove() reads it
 * back; ->groups shares the same storage for an array of groups.
 */
union device_attr_group_devres {
        const struct attribute_group *group;
        const struct attribute_group **groups;
};

static void devm_attr_group_remove(struct device *dev, void *res)
{
        union device_attr_group_devres *devres = res;
        const struct attribute_group *group = devres->group;

        dev_dbg(dev, "%s: removing group %p\n", __func__, group);
        sysfs_remove_group(&dev->kobj, group);
}

/**
 * devm_device_add_group - given a device, create a managed attribute group
 * @dev:        The device to create the group for
 * @grp:        The attribute group to create
 *
 * Creates @grp on @dev's kobject and registers a devres entry whose release
 * callback removes the group again.  The devres entry is allocated before
 * the group is created, so an allocation failure leaves sysfs untouched.
 *
 * Returns 0 on success, -ENOMEM if the devres entry cannot be allocated, or
 * the error code from sysfs_create_group().
 */
int devm_device_add_group(struct device *dev, const struct attribute_group *grp)
{
        union device_attr_group_devres *devres;
        int error;

        devres = devres_alloc(devm_attr_group_remove,
                              sizeof(*devres), GFP_KERNEL);
        if (!devres)
                return -ENOMEM;

        error = sysfs_create_group(&dev->kobj, grp);
        if (!error) {
                devres->group = grp;
                devres_add(dev, devres);
                return 0;
        }

        devres_free(devres);
        return error;
}
EXPORT_SYMBOL_GPL(devm_device_add_group);

/*
 * Create the sysfs attributes of a device: the class groups, the type
 * groups, the device's own groups, and then the optional "online",
 * "waiting_for_supplier", "removable" and physical-location entries.
 *
 * Returns 0 on success.  On failure everything created so far is removed
 * again, in reverse order, and the error code is returned.
 */
static int device_add_attrs(struct device *dev)
{
        const struct class *class = dev->class;
        const struct device_type *type = dev->type;
        int error;

        if (class) {
                error = device_add_groups(dev, class->dev_groups);
                if (error)
                        return error;
        }

        if (type) {
                error = device_add_groups(dev, type->groups);
                if (error)
                        goto err_remove_class_groups;
        }

        error = device_add_groups(dev, dev->groups);
        if (error)
                goto err_remove_type_groups;

        if (device_supports_offline(dev) && !dev->offline_disabled) {
                error = device_create_file(dev, &dev_attr_online);
                if (error)
                        goto err_remove_dev_groups;
        }

        if (fw_devlink_flags && !fw_devlink_is_permissive() && dev->fwnode) {
                error = device_create_file(dev, &dev_attr_waiting_for_supplier);
                if (error)
                        goto err_remove_dev_online;
        }

        if (dev_removable_is_valid(dev)) {
                error = device_create_file(dev, &dev_attr_removable);
                if (error)
                        goto err_remove_dev_waiting_for_supplier;
        }

        if (dev_add_physical_location(dev)) {
                error = device_add_group(dev,
                        &dev_attr_physical_location_group);
                if (error)
                        goto err_remove_dev_removable;
        }

        return 0;

        /*
         * Unwind ladder.  The optional files are removed unconditionally
         * here, i.e. also when the matching condition above was false and
         * the file was never created.
         */
 err_remove_dev_removable:
        device_remove_file(dev, &dev_attr_removable);
 err_remove_dev_waiting_for_supplier:
        device_remove_file(dev, &dev_attr_waiting_for_supplier);
 err_remove_dev_online:
        device_remove_file(dev, &dev_attr_online);
 err_remove_dev_groups:
        device_remove_groups(dev, dev->groups);
 err_remove_type_groups:
        if (type)
                device_remove_groups(dev, type->groups);
 err_remove_class_groups:
        if (class)
                device_remove_groups(dev, class->dev_groups);

        return error;
}

/*
 * Remove everything device_add_attrs() may have created, in the reverse
 * order of creation.  Also frees dev->physical_location when it is set.
 */
static void device_remove_attrs(struct device *dev)
{
        const struct class *class = dev->class;
        const struct device_type *type = dev->type;

        if (dev->physical_location) {
                device_remove_group(dev, &dev_attr_physical_location_group);
                kfree(dev->physical_location);
        }

        /* Removed unconditionally, whether or not they were created. */
        device_remove_file(dev, &dev_attr_removable);
        device_remove_file(dev, &dev_attr_waiting_for_supplier);
        device_remove_file(dev, &dev_attr_online);
        device_remove_groups(dev, dev->groups);

        if (type)
                device_remove_groups(dev, type->groups);

        if (class)
                device_remove_groups(dev, class->dev_groups);
}

/* sysfs "dev" read: format dev->devt into @buf with print_dev_t(). */
static ssize_t dev_show(struct device *dev, struct device_attribute *attr,
                        char *buf)
{
        return print_dev_t(buf, dev->devt);
}
static DEVICE_ATTR_RO(dev);

/*
 * /sys/devices/
 *
 * kset assigned to every device kobject by device_initialize(); the
 * devices_kset_move_*() helpers reorder its list.
 */
struct kset *devices_kset;

/**
 * devices_kset_move_before - Move device in the devices_kset's list.
 * @deva: Device to move.
 * @devb: Device @deva should come before.
 *
 * Does nothing when devices_kset is not set.  The list is modified with the
 * kset's list_lock held.
 */
static void devices_kset_move_before(struct device *deva, struct device *devb)
{
        struct list_head *moved = &deva->kobj.entry;
        struct list_head *anchor = &devb->kobj.entry;

        if (!devices_kset)
                return;

        pr_debug("devices_kset: Moving %s before %s\n",
                 dev_name(deva), dev_name(devb));

        spin_lock(&devices_kset->list_lock);
        list_move_tail(moved, anchor);
        spin_unlock(&devices_kset->list_lock);
}

/**
 * devices_kset_move_after - Move device in the devices_kset's list.
 * @deva: Device to move
 * @devb: Device @deva should come after.
 *
 * Does nothing when devices_kset is not set.  The list is modified with the
 * kset's list_lock held.
 */
static void devices_kset_move_after(struct device *deva, struct device *devb)
{
        struct list_head *moved = &deva->kobj.entry;
        struct list_head *anchor = &devb->kobj.entry;

        if (!devices_kset)
                return;

        pr_debug("devices_kset: Moving %s after %s\n",
                 dev_name(deva), dev_name(devb));

        spin_lock(&devices_kset->list_lock);
        list_move(moved, anchor);
        spin_unlock(&devices_kset->list_lock);
}

/**
 * devices_kset_move_last - move the device to the end of devices_kset's list.
 * @dev: device to move
 *
 * Does nothing when devices_kset is not set.  The list is modified with the
 * kset's list_lock held.
 */
void devices_kset_move_last(struct device *dev)
{
        struct list_head *moved = &dev->kobj.entry;

        if (!devices_kset)
                return;

        pr_debug("devices_kset: Moving %s to end of list\n", dev_name(dev));

        spin_lock(&devices_kset->list_lock);
        list_move_tail(moved, &devices_kset->list);
        spin_unlock(&devices_kset->list_lock);
}

/**
 * device_create_file - create sysfs attribute file for device.
 * @dev: device.
 * @attr: device attribute descriptor.
 *
 * Warns when @attr grants write permission without a store() method or read
 * permission without a show() method; the file is created regardless.
 *
 * Return: the result of sysfs_create_file(), or 0 when @dev is NULL (note
 * that device_create_bin_file() returns -EINVAL in that case).
 */
int device_create_file(struct device *dev,
                       const struct device_attribute *attr)
{
        if (!dev)
                return 0;

        WARN(((attr->attr.mode & S_IWUGO) && !attr->store),
                "Attribute %s: write permission without 'store'\n",
                attr->attr.name);
        WARN(((attr->attr.mode & S_IRUGO) && !attr->show),
                "Attribute %s: read permission without 'show'\n",
                attr->attr.name);

        return sysfs_create_file(&dev->kobj, &attr->attr);
}
EXPORT_SYMBOL_GPL(device_create_file);

/**
 * device_remove_file - remove sysfs attribute file.
 * @dev: device.
 * @attr: device attribute descriptor.
 *
 * A NULL @dev is accepted and ignored.
 */
void device_remove_file(struct device *dev,
                        const struct device_attribute *attr)
{
        if (!dev)
                return;

        sysfs_remove_file(&dev->kobj, &attr->attr);
}
EXPORT_SYMBOL_GPL(device_remove_file);

/**
 * device_remove_file_self - remove sysfs attribute file from its own method.
 * @dev: device.
 * @attr: device attribute descriptor.
 *
 * See kernfs_remove_self() for details.
 *
 * Return: the result of sysfs_remove_file_self(), or false when @dev is
 * NULL.
 */
bool device_remove_file_self(struct device *dev,
                             const struct device_attribute *attr)
{
        if (!dev)
                return false;

        return sysfs_remove_file_self(&dev->kobj, &attr->attr);
}
EXPORT_SYMBOL_GPL(device_remove_file_self);

/**
 * device_create_bin_file - create sysfs binary attribute file for device.
 * @dev: device.
 * @attr: device binary attribute descriptor.
 *
 * Return: the result of sysfs_create_bin_file(), or -EINVAL when @dev is
 * NULL.
 */
int device_create_bin_file(struct device *dev,
                           const struct bin_attribute *attr)
{
        if (!dev)
                return -EINVAL;

        return sysfs_create_bin_file(&dev->kobj, attr);
}
EXPORT_SYMBOL_GPL(device_create_bin_file);

/**
 * device_remove_bin_file - remove sysfs binary attribute file
 * @dev: device.
 * @attr: device binary attribute descriptor.
 *
 * A NULL @dev is accepted and ignored.
 */
void device_remove_bin_file(struct device *dev,
                            const struct bin_attribute *attr)
{
        if (!dev)
                return;

        sysfs_remove_bin_file(&dev->kobj, attr);
}
EXPORT_SYMBOL_GPL(device_remove_bin_file);

static void klist_children_get(struct klist_node *n)
{
        struct device_private *p = to_device_private_parent(n);
        struct device *dev = p->device;

        get_device(dev);
}

static void klist_children_put(struct klist_node *n)
{
        struct device_private *p = to_device_private_parent(n);
        struct device *dev = p->device;

        put_device(dev);
}

/**
 * device_initialize - init device structure.
 * @dev: device.
 *
 * This prepares the device for use by other layers by initializing
 * its fields.
 * It is the first half of device_register(), if called by
 * that function, though it can also be called separately, so one
 * may use @dev's fields. In particular, get_device()/put_device()
 * may be used for reference counting of @dev after calling this
 * function.
 *
 * All fields in @dev must be initialized by the caller to 0, except
 * for those explicitly set to some other value.  The simplest
 * approach is to use kzalloc() to allocate the structure containing
 * @dev.
 *
 * NOTE: Use put_device() to give up your reference instead of freeing
 * @dev directly once you have called this function.
 */
void device_initialize(struct device *dev)
{
        /* Every device kobject belongs to devices_kset and uses device_ktype. */
        dev->kobj.kset = devices_kset;
        kobject_init(&dev->kobj, &device_ktype);
        INIT_LIST_HEAD(&dev->dma_pools);
        mutex_init(&dev->mutex);
        lockdep_set_novalidate_class(&dev->mutex);
        /* Managed-resource (devres) bookkeeping. */
        spin_lock_init(&dev->devres_lock);
        INIT_LIST_HEAD(&dev->devres_head);
        device_pm_init(dev);
        set_dev_node(dev, NUMA_NO_NODE);
        /* Device link lists start empty; no driver is bound yet. */
        INIT_LIST_HEAD(&dev->links.consumers);
        INIT_LIST_HEAD(&dev->links.suppliers);
        INIT_LIST_HEAD(&dev->links.defer_sync);
        dev->links.status = DL_DEV_NO_DRIVER;
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
        dev->dma_coherent = dma_default_coherent;
#endif
        swiotlb_dev_init(dev);
}
EXPORT_SYMBOL_GPL(device_initialize);

/*
 * Return the "virtual" kobject below devices_kset, creating it on first use.
 *
 * The result is cached in a function-local static.  When creation fails the
 * cache stays NULL, NULL is returned, and the next call tries again.  @dev
 * is not used.
 */
struct kobject *virtual_device_parent(struct device *dev)
{
        static struct kobject *virtual_dir;

        if (virtual_dir)
                return virtual_dir;

        virtual_dir = kobject_create_and_add("virtual", &devices_kset->kobj);
        return virtual_dir;
}

/*
 * A class "glue" directory: a kobject plus the class it was created for
 * (see class_dir_create_and_add()).
 */
struct class_dir {
        struct kobject kobj;
        const struct class *class;
};

/* Map a kobject pointer to the class_dir embedding it as ->kobj. */
#define to_class_dir(obj) container_of(obj, struct class_dir, kobj)

/* kobj_type ->release hook: free the class_dir that embeds @kobj. */
static void class_dir_release(struct kobject *kobj)
{
        kfree(to_class_dir(kobj));
}

static const
struct kobj_ns_type_operations *class_dir_child_ns_type(const struct kobject *kobj)
{
        const struct class_dir *dir = to_class_dir(kobj);
        return dir->class->ns_type;
}

/* kobj_type of the class directories created by class_dir_create_and_add(). */
static const struct kobj_type class_dir_ktype = {
        .release        = class_dir_release,
        .sysfs_ops        = &kobj_sysfs_ops,
        .child_ns_type        = class_dir_child_ns_type
};

/*
 * Allocate a class directory for @sp's class, make it a member of
 * @sp->glue_dirs and add it below @parent_kobj under the class's name.
 *
 * Returns the new kobject, ERR_PTR(-ENOMEM) when the allocation fails, or
 * ERR_PTR() of the kobject_add() error.  In the last case the reference is
 * dropped with kobject_put(), which disposes of the directory.
 */
static struct kobject *class_dir_create_and_add(struct subsys_private *sp,
                                                struct kobject *parent_kobj)
{
        struct class_dir *dir = kzalloc(sizeof(*dir), GFP_KERNEL);
        int retval;

        if (!dir)
                return ERR_PTR(-ENOMEM);

        dir->class = sp->class;
        kobject_init(&dir->kobj, &class_dir_ktype);
        dir->kobj.kset = &sp->glue_dirs;

        retval = kobject_add(&dir->kobj, parent_kobj, "%s", sp->class->name);
        if (retval >= 0)
                return &dir->kobj;

        kobject_put(&dir->kobj);
        return ERR_PTR(retval);
}

/*
 * Serialises the lookup/creation of glue directories in get_device_parent()
 * against their removal in cleanup_glue_dir().
 */
static DEFINE_MUTEX(gdp_mutex);

/*
 * Pick the kobject under which @dev's sysfs directory is to be created.
 *
 * For a device whose class resolves to a subsys_private:
 *  - without a @parent, the lookup below happens under
 *    virtual_device_parent();
 *  - with a @parent that has a class, while dev->class has no ns_type,
 *    @parent's kobject is returned directly;
 *  - otherwise the lookup happens under @parent's kobject.
 * The class's glue_dirs kset is then searched, under gdp_mutex, for a
 * directory whose parent is the chosen kobject.  A match is returned with
 * an extra reference taken by kobject_get(); otherwise the result of
 * class_dir_create_and_add() - possibly an ERR_PTR() - is returned as is.
 *
 * For a device without such a class: the kobject of the bus's dev_root when
 * there is no @parent and the bus provides one, else @parent's kobject,
 * else NULL.
 */
static struct kobject *get_device_parent(struct device *dev,
                                         struct device *parent)
{
        struct subsys_private *sp = class_to_subsys(dev->class);
        struct kobject *kobj = NULL;

        if (sp) {
                struct kobject *parent_kobj;
                struct kobject *k;

                /*
                 * If we have no parent, we live in "virtual".
                 * Class-devices with a non class-device as parent, live
                 * in a "glue" directory to prevent namespace collisions.
                 */
                if (parent == NULL)
                        parent_kobj = virtual_device_parent(dev);
                else if (parent->class && !dev->class->ns_type) {
                        subsys_put(sp);
                        return &parent->kobj;
                } else {
                        parent_kobj = &parent->kobj;
                }

                mutex_lock(&gdp_mutex);

                /* find our class-directory at the parent and reference it */
                spin_lock(&sp->glue_dirs.list_lock);
                list_for_each_entry(k, &sp->glue_dirs.list, entry)
                        if (k->parent == parent_kobj) {
                                kobj = kobject_get(k);
                                break;
                        }
                spin_unlock(&sp->glue_dirs.list_lock);
                if (kobj) {
                        mutex_unlock(&gdp_mutex);
                        subsys_put(sp);
                        return kobj;
                }

                /* or create a new class-directory at the parent device */
                k = class_dir_create_and_add(sp, parent_kobj);
                /* do not emit an uevent for this simple "glue" directory */
                mutex_unlock(&gdp_mutex);
                subsys_put(sp);
                return k;
        }

        /* subsystems can specify a default root directory for their devices */
        if (!parent && dev->bus) {
                struct device *dev_root = bus_get_dev_root(dev->bus);

                if (dev_root) {
                        /* the dev_root reference is dropped before returning its kobject */
                        kobj = &dev_root->kobj;
                        put_device(dev_root);
                        return kobj;
                }
        }

        if (parent)
                return &parent->kobj;
        return NULL;
}

/*
 * Tell whether @kobj is one of the glue directories of @dev's class, i.e.
 * whether its kset is the class's glue_dirs kset.
 *
 * Returns false when @kobj is NULL, @dev has no class, or the class has no
 * subsys_private.
 */
static inline bool live_in_glue_dir(struct kobject *kobj,
                                    struct device *dev)
{
        struct subsys_private *sp;
        bool in_glue;

        if (!kobj || !dev->class)
                return false;

        sp = class_to_subsys(dev->class);
        if (!sp)
                return false;

        in_glue = (kobj->kset == &sp->glue_dirs);
        subsys_put(sp);

        return in_glue;
}

/*
 * Return the parent of @dev's kobject.  Whether that parent really is a
 * glue directory is for the caller to check (see live_in_glue_dir()).
 */
static inline struct kobject *get_glue_dir(struct device *dev)
{
        return dev->kobj.parent;
}

/**
 * kobject_has_children - Returns whether a kobject has children.
 * @kobj: the object to test
 *
 * This will return whether a kobject has other kobjects as children.
 *
 * It does NOT account for the presence of attribute files, only sub
 * directories. It also assumes there is no concurrent addition or
 * removal of such children, and thus relies on external locking.
 */
static inline bool kobject_has_children(struct kobject *kobj)
{
        WARN_ON_ONCE(kref_read(&kobj->kref) == 0);

        return kobj->sd && kobj->sd->dir.subdirs;
}

/*
 * Drop @dev's reference on @glue_dir and delete the directory when it is no
 * longer used.
 *
 * Cleaning up the directory must be the last step, and the .release handler
 * of the kobject must run with the global lock (gdp_mutex) held.
 *
 * Nothing is done when @glue_dir is not a glue directory of @dev's class.
 */
static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
{
        unsigned int ref;

        /* see if we live in a "glue" directory */
        if (!live_in_glue_dir(glue_dir, dev))
                return;

        mutex_lock(&gdp_mutex);
        /*
         * There is a race condition between removing glue directory
         * and adding a new device under the glue directory.
         *
         * CPU1:                                         CPU2:
         *
         * device_add()
         *   get_device_parent()
         *     class_dir_create_and_add()
         *       kobject_add_internal()
         *         create_dir()    // create glue_dir
         *
         *                                               device_add()
         *                                                 get_device_parent()
         *                                                   kobject_get() // get glue_dir
         *
         * device_del()
         *   cleanup_glue_dir()
         *     kobject_del(glue_dir)
         *
         *                                               kobject_add()
         *                                                 kobject_add_internal()
         *                                                   create_dir() // in glue_dir
         *                                                     sysfs_create_dir_ns()
         *                                                       kernfs_create_dir_ns(sd)
         *
         *       sysfs_remove_dir() // glue_dir->sd=NULL
         *       sysfs_put()        // free glue_dir->sd
         *
         *                                                         // sd is freed
         *                                                         kernfs_new_node(sd)
         *                                                           kernfs_get(glue_dir)
         *                                                           kernfs_add_one()
         *                                                           kernfs_put()
         *
         * If CPU2 adds a new device under the glue dir before CPU1 removes
         * the last child device, kobject_get(k) raises the glue_dir kobject
         * reference count to 2 and CPU2 goes on to call
         * kernfs_create_dir_ns().  Meanwhile CPU1 calls sysfs_remove_dir()
         * and sysfs_put(), which frees glue_dir->sd.
         *
         * CPU2 then sees a stale, "empty" but still potentially used glue
         * dir in kernfs_new_node().
         *
         * To avoid this, CPU1 must only release the kernfs_node of glue_dir
         * when the reference count of the glue_dir kobject is 1.
         */
        ref = kref_read(&glue_dir->kref);
        /*
         * ref is a local snapshot: !--ref is true only when the snapshot
         * was 1, i.e. ours is the last reference.
         */
        if (!kobject_has_children(glue_dir) && !--ref)
                kobject_del(glue_dir);
        kobject_put(glue_dir);
        mutex_unlock(&gdp_mutex);
}

static int device_add_class_symlinks(struct device *dev)
{
        struct device_node *of_node = dev_of_node(dev);
        struct subsys_private *sp;
        int error;

        if (of_node) {
                error = sysfs_create_link(&dev->kobj, of_node_kobj(of_node), "of_node");
                if (error)
                        dev_warn(dev, "Error %d creating of_node link\n",error);
                /* An error here doesn't warrant bringing down the device */
        }

        sp = class_to_subsys(dev->class);
        if (!sp)
                return 0;

        error = sysfs_create_link(&dev->kobj, &sp->subsys.kobj, "subsystem");
        if (error)
                goto out_devnode;

        if (dev->parent && device_is_not_partition(dev)) {
                error = sysfs_create_link(&dev->kobj, &dev->parent->kobj,
                                          "device");
                if (error)
                        goto out_subsys;
        }

        /* link in the class directory pointing to the device */
        error = sysfs_create_link(&sp->subsys.kobj, &dev->kobj, dev_name(dev));
        if (error)
                goto out_device;
        goto exit;

out_device:
        sysfs_remove_link(&dev->kobj, "device");
out_subsys:
        sysfs_remove_link(&dev->kobj, "subsystem");
out_devnode:
        sysfs_remove_link(&dev->kobj, "of_node");
exit:
        subsys_put(sp);
        return error;
}

/*
 * Remove the symlinks created by device_add_class_symlinks(): "of_node"
 * when the device has a firmware node, and the class-related links when
 * the device's class has a subsystem.
 */
static void device_remove_class_symlinks(struct device *dev)
{
        struct subsys_private *sp;

        sp = class_to_subsys(dev->class);

        if (dev_of_node(dev))
                sysfs_remove_link(&dev->kobj, "of_node");

        if (sp) {
                if (dev->parent && device_is_not_partition(dev))
                        sysfs_remove_link(&dev->kobj, "device");
                sysfs_remove_link(&dev->kobj, "subsystem");
                sysfs_delete_link(&sp->subsys.kobj, &dev->kobj, dev_name(dev));
                subsys_put(sp);
        }
}

/**
 * dev_set_name - set a device name
 * @dev: device
 * @fmt: printf-style format string for the device's name
 *
 * The formatted name is stored on the device's embedded kobject.
 *
 * Return: the value returned by kobject_set_name_vargs().
 */
int dev_set_name(struct device *dev, const char *fmt, ...)
{
        va_list ap;
        int ret;

        va_start(ap, fmt);
        ret = kobject_set_name_vargs(&dev->kobj, fmt, ap);
        va_end(ap);

        return ret;
}
EXPORT_SYMBOL_GPL(dev_set_name);

/* Pick the /sys/dev/ directory (block or char) that @dev belongs under. */
static struct kobject *device_to_dev_kobj(struct device *dev)
{
        return is_blockdev(dev) ? sysfs_dev_block_kobj : sysfs_dev_char_kobj;
}

/*
 * Create the /sys/dev/{block,char}/<devt> symlink pointing at @dev.
 * Returns 0 when there is no target directory, otherwise the result of
 * sysfs_create_link().
 */
static int device_create_sys_dev_entry(struct device *dev)
{
        struct kobject *dir = device_to_dev_kobj(dev);
        char devt_str[15];

        if (!dir)
                return 0;

        format_dev_t(devt_str, dev->devt);
        return sysfs_create_link(dir, &dev->kobj, devt_str);
}

/* Remove the /sys/dev/{block,char}/<devt> symlink that refers to @dev. */
static void device_remove_sys_dev_entry(struct device *dev)
{
        struct kobject *dir = device_to_dev_kobj(dev);
        char devt_str[15];

        if (!dir)
                return;

        format_dev_t(devt_str, dev->devt);
        sysfs_remove_link(dir, devt_str);
}

/*
 * Allocate and initialise the driver-core private data of @dev: the
 * back-pointer to the device, the list of children and the deferred-probe
 * list head.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int device_private_init(struct device *dev)
{
        dev->p = kzalloc(sizeof(*dev->p), GFP_KERNEL);
        if (dev->p) {
                dev->p->device = dev;
                klist_init(&dev->p->klist_children, klist_children_get,
                           klist_children_put);
                INIT_LIST_HEAD(&dev->p->deferred_probe);
                return 0;
        }

        return -ENOMEM;
}

/**
 * device_add - add device to device hierarchy.
 * @dev: device.
 *
 * This is part 2 of device_register(), though may be called
 * separately _iff_ device_initialize() has been called separately.
 *
 * This adds @dev to the kobject hierarchy via kobject_add(), adds it
 * to the global and sibling lists for the device, then
 * adds it to the other relevant subsystems of the driver model.
 *
 * Do not call this routine or device_register() more than once for
 * any device structure.  The driver model core is not designed to work
 * with devices that get unregistered and then spring back to life.
 * (Among other things, it's very hard to guarantee that all references
 * to the previous incarnation of @dev have been dropped.)  Allocate
 * and register a fresh new struct device instead.
 *
 * NOTE: _Never_ directly free @dev after calling this function, even
 * if it returned an error! Always use put_device() to give up your
 * reference instead.
 *
 * Rule of thumb is: if device_add() succeeds, you should call
 * device_del() when you want to get rid of it. If device_add() has
 * *not* succeeded, use *only* put_device() to drop the reference
 * count.
 *
 * Return: 0 on success, or a non-zero error code on failure (-EINVAL when
 * @dev is NULL or no name could be determined for it).
 */
int device_add(struct device *dev)
{
        struct subsys_private *sp;
        struct device *parent;
        struct kobject *kobj;
        struct class_interface *class_intf;
        int error = -EINVAL;
        struct kobject *glue_dir = NULL;

        /* Hold a reference for the duration of the call; dropped at "done". */
        dev = get_device(dev);
        if (!dev)
                goto done;

        /* Allocate the driver-core private data unless it already exists. */
        if (!dev->p) {
                error = device_private_init(dev);
                if (error)
                        goto done;
        }

        /*
         * for statically allocated devices, which should all be converted
         * some day, we need to initialize the name. We prevent reading back
         * the name, and force the use of dev_name()
         */
        if (dev->init_name) {
                error = dev_set_name(dev, "%s", dev->init_name);
                dev->init_name = NULL;
        }

        /*
         * A device must have a name by now: either one already set, or one
         * derived from the bus' dev_name prefix plus the device id.
         */
        if (dev_name(dev))
                error = 0;
        /* subsystems can specify simple device enumeration */
        else if (dev->bus && dev->bus->dev_name)
                error = dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id);
        else
                error = -EINVAL;
        if (error)
                goto name_error;

        pr_debug("device: '%s': %s\n", dev_name(dev), __func__);

        /* Reference on the parent; released via put_device(parent) on error. */
        parent = get_device(dev->parent);
        kobj = get_device_parent(dev, parent);
        if (IS_ERR(kobj)) {
                error = PTR_ERR(kobj);
                goto parent_error;
        }
        if (kobj)
                dev->kobj.parent = kobj;

        /* use parent numa_node */
        if (parent && (dev_to_node(dev) == NUMA_NO_NODE))
                set_dev_node(dev, dev_to_node(parent));

        /* first, register with generic layer. */
        /* we require the name to be set before, and pass NULL */
        error = kobject_add(&dev->kobj, dev->kobj.parent, NULL);
        if (error) {
                /* Hand the parent kobject to cleanup_glue_dir() at "Error". */
                glue_dir = kobj;
                goto Error;
        }

        /* notify platform of device entry */
        device_platform_notify(dev);

        /*
         * Each step below has a matching label in the unwind ladder at the
         * end of the function; a failure jumps to the label that undoes
         * everything set up before the failing step.
         */
        error = device_create_file(dev, &dev_attr_uevent);
        if (error)
                goto attrError;

        error = device_add_class_symlinks(dev);
        if (error)
                goto SymlinkError;
        error = device_add_attrs(dev);
        if (error)
                goto AttrsError;
        error = bus_add_device(dev);
        if (error)
                goto BusError;
        error = dpm_sysfs_add(dev);
        if (error)
                goto DPMError;
        device_pm_add(dev);

        /* Only devices with a non-zero major get a "dev" file and /sys/dev link. */
        if (MAJOR(dev->devt)) {
                error = device_create_file(dev, &dev_attr_dev);
                if (error)
                        goto DevAttrError;

                error = device_create_sys_dev_entry(dev);
                if (error)
                        goto SysEntryError;

                devtmpfs_create_node(dev);
        }

        /* Notify clients of device addition.  This call must come
         * after dpm_sysfs_add() and before kobject_uevent().
         */
        bus_notify(dev, BUS_NOTIFY_ADD_DEVICE);
        kobject_uevent(&dev->kobj, KOBJ_ADD);

        /*
         * Check if any of the other devices (consumers) have been waiting for
         * this device (supplier) to be added so that they can create a device
         * link to it.
         *
         * This needs to happen after device_pm_add() because device_link_add()
         * requires the supplier be registered before it's called.
         *
         * But this also needs to happen before bus_probe_device() to make sure
         * waiting consumers can link to it before the driver is bound to the
         * device and the driver sync_state callback is called for this device.
         */
        if (dev->fwnode && !dev->fwnode->dev) {
                dev->fwnode->dev = dev;
                fw_devlink_link_device(dev);
        }

        bus_probe_device(dev);

        /*
         * If all driver registration is done and a newly added device doesn't
         * match with any driver, don't block its consumers from probing in
         * case the consumer device is able to operate without this supplier.
         */
        if (dev->fwnode && fw_devlink_drv_reg_done && !dev->can_match)
                fw_devlink_unblock_consumers(dev);

        /* Link the device into its parent's list of children. */
        if (parent)
                klist_add_tail(&dev->p->knode_parent,
                               &parent->p->klist_children);

        sp = class_to_subsys(dev->class);
        if (sp) {
                mutex_lock(&sp->mutex);
                /* tie the class to the device */
                klist_add_tail(&dev->p->knode_class, &sp->klist_devices);

                /* notify any interfaces that the device is here */
                list_for_each_entry(class_intf, &sp->interfaces, node)
                        if (class_intf->add_dev)
                                class_intf->add_dev(dev);
                mutex_unlock(&sp->mutex);
                subsys_put(sp);
        }
done:
        /* Common exit for success and failure: drop the reference taken above. */
        put_device(dev);
        return error;
        /*
         * Error unwinding: the labels below undo the setup steps in reverse
         * order and fall through into each other.
         */
 SysEntryError:
        if (MAJOR(dev->devt))
                device_remove_file(dev, &dev_attr_dev);
 DevAttrError:
        device_pm_remove(dev);
        dpm_sysfs_remove(dev);
 DPMError:
        dev->driver = NULL;
        bus_remove_device(dev);
 BusError:
        device_remove_attrs(dev);
 AttrsError:
        device_remove_class_symlinks(dev);
 SymlinkError:
        device_remove_file(dev, &dev_attr_uevent);
 attrError:
        device_platform_notify_remove(dev);
        kobject_uevent(&dev->kobj, KOBJ_REMOVE);
        /* Look up the glue dir before the device kobject is deleted. */
        glue_dir = get_glue_dir(dev);
        kobject_del(&dev->kobj);
 Error:
        cleanup_glue_dir(dev, glue_dir);
parent_error:
        put_device(parent);
name_error:
        /*
         * NOTE(review): this frees dev->p even when it was already set on
         * entry rather than allocated by this call - confirm no caller
         * relies on a pre-existing dev->p surviving a failed device_add().
         */
        kfree(dev->p);
        dev->p = NULL;
        goto done;
}
EXPORT_SYMBOL_GPL(device_add);

/**
 * device_register - register a device with the system.
 * @dev: pointer to the device structure
 *
 * Registration is two separate steps: device_initialize() followed by
 * device_add(). This helper runs both and is the easiest and most common
 * way to register a device. Call the two steps individually only if you
 * have a clearly defined need to use and refcount the device before it
 * is added to the hierarchy.
 *
 * For more information, see the kerneldoc for device_initialize()
 * and device_add().
 *
 * NOTE: _Never_ directly free @dev after calling this function, even
 * if it returned an error! Always use put_device() to give up the
 * reference initialized in this function instead.
 *
 * Return: the value returned by device_add().
 */
int device_register(struct device *dev)
{
        int ret;

        device_initialize(dev);
        ret = device_add(dev);

        return ret;
}
EXPORT_SYMBOL_GPL(device_register);

/**
 * get_device - increment reference count for device.
 * @dev: device, may be NULL.
 *
 * Thin wrapper around kobject_get() on the embedded kobject that also
 * tolerates a NULL @dev.
 *
 * Return: the device that owns the kobject returned by kobject_get(),
 * or NULL when @dev is NULL.
 */
struct device *get_device(struct device *dev)
{
        if (!dev)
                return NULL;

        return kobj_to_dev(kobject_get(&dev->kobj));
}
EXPORT_SYMBOL_GPL(get_device);

/**
 * put_device - decrement reference count.
 * @dev: device in question, may be NULL.
 *
 * Drops one reference on the embedded kobject. A NULL @dev is a no-op.
 */
void put_device(struct device *dev)
{
        /* might_sleep(); */
        if (!dev)
                return;

        kobject_put(&dev->kobj);
}
EXPORT_SYMBOL_GPL(put_device);

/*
 * Mark @dev as dead. Returns true if this call set the flag, false if the
 * device had already been marked dead.
 */
bool kill_device(struct device *dev)
{
        /*
         * The device lock must be held: it keeps updates of the "dead" flag
         * consistent with the other bitfields near it, and it ensures no
         * asynchronous probe routine can run while the bus/class/sysfs
         * state is being torn out from underneath the device.
         */
        device_lock_assert(dev);

        if (!dev->p->dead) {
                dev->p->dead = true;
                return true;
        }

        return false;
}
EXPORT_SYMBOL_GPL(kill_device);

/**
 * device_del - delete device from system.
 * @dev: device.
 *
 * This is the first part of the device unregistration
 * sequence. This removes the device from the lists we control
 * from here, has it removed from the other driver model
 * subsystems it was added to in device_add(), and removes it
 * from the kobject hierarchy.
 *
 * This does not drop the caller's own reference on @dev; see
 * device_unregister() for the variant that also calls put_device(@dev).
 *
 * NOTE: this should be called manually _iff_ device_add() was
 * also called manually.
 */
void device_del(struct device *dev)
{
        struct subsys_private *sp;
        struct device *parent = dev->parent;
        struct kobject *glue_dir = NULL;
        struct class_interface *class_intf;
        unsigned int noio_flag;

        /* Mark the device dead; kill_device() requires the device lock. */
        device_lock(dev);
        kill_device(dev);
        device_unlock(dev);

        /* Clear the fwnode's back-pointer if it still refers to this device. */
        if (dev->fwnode && dev->fwnode->dev == dev)
                dev->fwnode->dev = NULL;

        /* Notify clients of device removal.  This call must come
         * before dpm_sysfs_remove().
         */
        /*
         * Everything up to memalloc_noio_restore() below runs inside the
         * memalloc_noio scope opened here.
         */
        noio_flag = memalloc_noio_save();
        bus_notify(dev, BUS_NOTIFY_DEL_DEVICE);

        dpm_sysfs_remove(dev);
        /* Unlink from the parent's list of children. */
        if (parent)
                klist_del(&dev->p->knode_parent);
        /* Mirror of the MAJOR(dev->devt) setup done in device_add(). */
        if (MAJOR(dev->devt)) {
                devtmpfs_delete_node(dev);
                device_remove_sys_dev_entry(dev);
                device_remove_file(dev, &dev_attr_dev);
        }

        sp = class_to_subsys(dev->class);
        if (sp) {
                device_remove_class_symlinks(dev);

                mutex_lock(&sp->mutex);
                /* notify any interfaces that the device is now gone */
                list_for_each_entry(class_intf, &sp->interfaces, node)
                        if (class_intf->remove_dev)
                                class_intf->remove_dev(dev);
                /* remove the device from the class list */
                klist_del(&dev->p->knode_class);
                mutex_unlock(&sp->mutex);
                subsys_put(sp);
        }
        device_remove_file(dev, &dev_attr_uevent);
        device_remove_attrs(dev);
        bus_remove_device(dev);
        device_pm_remove(dev);
        driver_deferred_probe_del(dev);
        device_platform_notify_remove(dev);
        device_links_purge(dev);

        /*
         * If a device does not have a driver attached, we need to clean
         * up any managed resources. We do this in device_release(), but
         * it's never called (and we leak the device) if a managed
         * resource holds a reference to the device. So release all
         * managed resources here, like we do in driver_detach(). We
         * still need to do so again in device_release() in case someone
         * adds a new resource after this point, though.
         */
        devres_release_all(dev);

        bus_notify(dev, BUS_NOTIFY_REMOVED_DEVICE);
        kobject_uevent(&dev->kobj, KOBJ_REMOVE);
        /* Look up the glue dir before the device kobject is deleted. */
        glue_dir = get_glue_dir(dev);
        kobject_del(&dev->kobj);
        cleanup_glue_dir(dev, glue_dir);
        memalloc_noio_restore(noio_flag);
        /*
         * Presumably balances the get_device(dev->parent) done in
         * device_add() - TODO confirm.
         */
        put_device(parent);
}
EXPORT_SYMBOL_GPL(device_del);

/**
 * device_unregister - unregister device from system.
 * @dev: device going away.
 *
 * Like device_register(), this is done in two parts. First
 * device_del() removes the device from all the subsystems, then
 * put_device() decrements the reference count. If that was the
 * final reference, the device is cleaned up via device_release();
 * otherwise the structure sticks around until the final reference
 * to the device is dropped.
 */
void device_unregister(struct device *dev)
{
        pr_debug("device: '%s': %s\n", dev_name(dev), __func__);

        /* Step 1: take the device out of the driver model. */
        device_del(dev);

        /* Step 2: give up our reference. */
        put_device(dev);
}
EXPORT_SYMBOL_GPL(device_unregister);

static struct device *prev_device(struct klist_iter *i)
{
        struct klist_node *n = klist_prev(i);
        struct device *dev = NULL;
        struct device_private *p;

        if (n) {
                p = to_device_private_parent(n);
                dev = p->device;
        }
        return dev;
}

static struct device *next_device(struct klist_iter *i)
{
        struct klist_node *n = klist_next(i);
        struct device *dev = NULL;
        struct device_private *p;

        if (n) {
                p = to_device_private_parent(n);
                dev = p->device;
        }
        return dev;
}

/**
 * device_get_devnode - path of device node file
 * @dev: device
 * @mode: returned file access mode
 * @uid: returned file owner
 * @gid: returned file group
 * @tmp: possibly allocated string
 *
 * Return the relative path of a possible device node.
 * The device type's devnode() callback is asked first, then the class'
 * devnode() callback, and finally the device name is used, with every
 * '!' replaced by '/'.
 *
 * Non-default names may need to allocate memory to compose a name.
 * That memory is returned in @tmp and needs to be freed by the caller;
 * @tmp is NULL when the returned string was not allocated here.
 *
 * Return: the node path, or NULL if duplicating the name failed.
 */
const char *device_get_devnode(const struct device *dev,
                               umode_t *mode, kuid_t *uid, kgid_t *gid,
                               const char **tmp)
{
        const char *name;
        char *path;

        *tmp = NULL;

        /* the device type may provide a specific name */
        if (dev->type && dev->type->devnode)
                *tmp = dev->type->devnode(dev, mode, uid, gid);
        if (*tmp)
                return *tmp;

        /* the class may provide a specific name */
        if (dev->class && dev->class->devnode)
                *tmp = dev->class->devnode(dev, mode);
        if (*tmp)
                return *tmp;

        /* plain name: hand it back without allocating, tmp stays NULL */
        name = dev_name(dev);
        if (!strchr(name, '!'))
                return name;

        /* the name contains '!': build a copy with '/' in its place */
        path = kstrdup_and_replace(name, '!', '/', GFP_KERNEL);
        if (!path)
                return NULL;

        *tmp = path;
        return path;
}

/**
 * device_for_each_child - device child iterator.
 * @parent: parent struct device, may be NULL.
 * @data: data for the callback.
 * @fn: function to be called for each device.
 *
 * Iterate over @parent's child devices, and call @fn for each,
 * passing it @data.
 *
 * We check the return of @fn each time. If it returns anything
 * other than 0, we break out and return that value.
 *
 * Return: 0 if @parent is NULL, has no private data (it was never
 * successfully added) or @fn returned 0 for every child; otherwise the
 * first non-zero value returned by @fn.
 */
int device_for_each_child(struct device *parent, void *data,
                          int (*fn)(struct device *dev, void *data))
{
        struct klist_iter i;
        struct device *child;
        int error = 0;

        /*
         * Same parameter check as the device_find_child() family: a NULL
         * parent, or one without private data, simply has no children.
         */
        if (!parent || !parent->p)
                return 0;

        klist_iter_init(&parent->p->klist_children, &i);
        while (!error && (child = next_device(&i)))
                error = fn(child, data);
        klist_iter_exit(&i);
        return error;
}
EXPORT_SYMBOL_GPL(device_for_each_child);

/**
 * device_for_each_child_reverse - device child iterator in reversed order.
 * @parent: parent struct device.
 * @fn: function to be called for each device.
 * @data: data for the callback.
 *
 * Iterate over @parent's child devices, and call @fn for each,
 * passing it @data.
 *
 * We check the return of @fn each time. If it returns anything
 * other than 0, we break out and return that value.
 */
int device_for_each_child_reverse(struct device *parent, void *data,
                                  int (*fn)(struct device *dev, void *data))
{
        struct klist_iter i;
        struct device *child;
        int error = 0;

        if (!parent->p)
                return 0;

        klist_iter_init(&parent->p->klist_children, &i);
        while ((child = prev_device(&i)) && !error)
                error = fn(child, data);
        klist_iter_exit(&i);
        return error;
}
EXPORT_SYMBOL_GPL(device_for_each_child_reverse);

/**
 * device_find_child - device iterator for locating a particular device.
 * @parent: parent struct device, may be NULL
 * @data: Data to pass to match function
 * @match: Callback function to check device
 *
 * This is similar to the device_for_each_child() function above, but it
 * returns a reference to a device that is 'found' for later use, as
 * determined by the @match callback.
 *
 * The callback should return 0 if the device doesn't match and non-zero
 * if it does.  If the callback returns non-zero and a reference to the
 * current device can be obtained, this function will return to the caller
 * and not iterate over any more devices.
 *
 * NOTE: you will need to drop the reference with put_device() after use.
 *
 * Return: the matching child with its reference count raised, or NULL if
 * @parent is NULL, has no private data (it was never successfully added)
 * or no child matched.
 */
struct device *device_find_child(struct device *parent, void *data,
                                 int (*match)(struct device *dev, void *data))
{
        struct klist_iter i;
        struct device *child;

        /*
         * parent->p stays NULL until device_add() has set it up and is
         * reset to NULL when device_add() fails, so it must be checked
         * before the child list is touched.
         */
        if (!parent || !parent->p)
                return NULL;

        klist_iter_init(&parent->p->klist_children, &i);
        while ((child = next_device(&i)))
                if (match(child, data) && get_device(child))
                        break;
        klist_iter_exit(&i);
        return child;
}
EXPORT_SYMBOL_GPL(device_find_child);

/**
 * device_find_child_by_name - device iterator for locating a child device.
 * @parent: parent struct device, may be NULL
 * @name: name of the child device
 *
 * This is similar to the device_find_child() function above, but it
 * returns a reference to a device that has the name @name.
 *
 * NOTE: you will need to drop the reference with put_device() after use.
 *
 * Return: the child named @name with its reference count raised, or NULL
 * if @parent is NULL, has no private data (it was never successfully
 * added) or no child has that name.
 */
struct device *device_find_child_by_name(struct device *parent,
                                         const char *name)
{
        struct klist_iter i;
        struct device *child;

        /*
         * parent->p stays NULL until device_add() has set it up and is
         * reset to NULL when device_add() fails, so it must be checked
         * before the child list is touched.
         */
        if (!parent || !parent->p)
                return NULL;

        klist_iter_init(&parent->p->klist_children, &i);
        while ((child = next_device(&i)))
                if (sysfs_streq(dev_name(child), name) && get_device(child))
                        break;
        klist_iter_exit(&i);
        return child;
}
EXPORT_SYMBOL_GPL(device_find_child_by_name);

/*
 * Match callback that accepts every device; both arguments are ignored.
 * Used by device_find_any_child() to pick the first available child.
 */
static int match_any(struct device *dev, void *unused)
{
        return 1;
}

/**
 * device_find_any_child - device iterator for locating a child device, if any.
 * @parent: parent struct device
 *
 * Behaves like device_find_child() with a match callback that accepts
 * every device, so it hands back a reference to a child device, if any.
 *
 * NOTE: you will need to drop the reference with put_device() after use.
 *
 * Return: whatever device_find_child() returns for the accept-all match.
 */
struct device *device_find_any_child(struct device *parent)
{
        struct device *child;

        child = device_find_child(parent, NULL, match_any);

        return child;
}
EXPORT_SYMBOL_GPL(device_find_any_child);

/*
 * Create the top-level driver-core objects: the "devices" kset, the "dev"
 * kobject with its "block" and "char" children, and the device-link
 * workqueue.
 *
 * Returns 0 on success. If any step fails, the objects created so far are
 * released in reverse order and -ENOMEM is returned.
 */
int __init devices_init(void)
{
        devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
        if (!devices_kset)
                return -ENOMEM;

        dev_kobj = kobject_create_and_add("dev", NULL);
        if (!dev_kobj)
                goto err_unregister_kset;

        sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj);
        if (!sysfs_dev_block_kobj)
                goto err_put_dev;

        sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj);
        if (!sysfs_dev_char_kobj)
                goto err_put_block;

        device_link_wq = alloc_workqueue("device_link_wq", 0, 0);
        if (device_link_wq)
                return 0;

        /* Workqueue allocation failed: unwind everything created above. */
        kobject_put(sysfs_dev_char_kobj);
err_put_block:
        kobject_put(sysfs_dev_block_kobj);
err_put_dev:
        kobject_put(dev_kobj);
err_unregister_kset:
        kset_unregister(devices_kset);
        return -ENOMEM;
}

/*
 * device_for_each_child() callback: recursively check that @dev and all of
 * its descendants are offline. Returns -EBUSY for the first device that
 * supports offlining but is still online, 0 otherwise.
 */
static int device_check_offline(struct device *dev, void *not_used)
{
        int ret = device_for_each_child(dev, NULL, device_check_offline);

        if (ret)
                return ret;

        if (device_supports_offline(dev) && !dev->offline)
                return -EBUSY;

        return 0;
}

/**
 * device_offline - Prepare the device for hot-removal.
 * @dev: Device to be put offline.
 *
 * Execute the device bus type's .offline() callback, if present, to prepare
 * the device for a subsequent hot-removal.  If that succeeds, the device must
 * not be used until either it is removed or its bus type's .online() callback
 * is executed.
 *
 * Call under device_hotplug_lock.
 *
 * Return: -EPERM if offlining is disabled for @dev, the error reported by
 * the check of its children, 1 if @dev was already offline, 0 if @dev does
 * not support offlining, otherwise the result of the bus .offline()
 * callback (0 on success).
 */
int device_offline(struct device *dev)
{
        int ret;

        if (dev->offline_disabled)
                return -EPERM;

        /* Every descendant that supports offlining must be offline already. */
        ret = device_for_each_child(dev, NULL, device_check_offline);
        if (ret)
                return ret;

        device_lock(dev);
        if (!device_supports_offline(dev))
                goto unlock;

        if (dev->offline) {
                ret = 1;
                goto unlock;
        }

        ret = dev->bus->offline(dev);
        if (!ret) {
                kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
                dev->offline = true;
        }
unlock:
        device_unlock(dev);

        return ret;
}

/**
 * device_online - Put the device back online after successful device_offline().
 * @dev: Device to be put back online.
 *
 * If device_offline() has been successfully executed for @dev, but the device
 * has not been removed subsequently, execute its bus type's .online() callback
 * to indicate that the device can be used again.
 *
 * Call under device_hotplug_lock.
 *
 * Return: 1 if @dev was already online, 0 if @dev does not support
 * offlining, otherwise the result of the bus .online() callback
 * (0 on success).
 */
int device_online(struct device *dev)
{
        int ret = 0;

        device_lock(dev);
        if (!device_supports_offline(dev))
                goto unlock;

        if (!dev->offline) {
                ret = 1;
                goto unlock;
        }

        ret = dev->bus->online(dev);
        if (!ret) {
                kobject_uevent(&dev->kobj, KOBJ_ONLINE);
                dev->offline = false;
        }
unlock:
        device_unlock(dev);

        return ret;
}

/*
 * A root device: a struct device wrapped together with the module that
 * owns it. Allocated by __root_device_register() and freed by
 * root_device_release().
 */
struct root_device {
        /* embedded device; to_root_device() maps it back to this wrapper */
        struct device dev;
        /* owning module; set only once the "module" sysfs link exists */
        struct module *owner;
};

/* Map an embedded struct device back to its enclosing struct root_device. */
static inline struct root_device *to_root_device(struct device *d)
{
        return container_of(d, struct root_device, dev);
}

/* Release callback for root devices: free the enclosing struct root_device. */
static void root_device_release(struct device *dev)
{
        struct root_device *root = to_root_device(dev);

        kfree(root);
}

/**
 * __root_device_register - allocate and register a root device
 * @name: root device name
 * @owner: owner module of the root device, usually THIS_MODULE
 *
 * This function allocates a root device and registers it
 * using device_register(). In order to free the returned
 * device, use root_device_unregister().
 *
 * Root devices are dummy devices which allow other devices
 * to be grouped under /sys/devices. Use this function to
 * allocate a root device and then use it as the parent of
 * any device which should appear under /sys/devices/{name}
 *
 * The /sys/devices/{name} directory will also contain a
 * 'module' symlink which points to the @owner directory
 * in sysfs.
 *
 * Returns &struct device pointer on success, or ERR_PTR() on error.
 *
 * Note: You probably want to use root_device_register().
 */
struct device *__root_device_register(const char *name, struct module *owner)
{
        struct root_device *root;
        int err;

        root = kzalloc(sizeof(*root), GFP_KERNEL);
        if (!root)
                return ERR_PTR(-ENOMEM);

        err = dev_set_name(&root->dev, "%s", name);
        if (err) {
                /* Not registered yet, so a plain kfree() is used here. */
                kfree(root);
                return ERR_PTR(err);
        }

        root->dev.release = root_device_release;

        err = device_register(&root->dev);
        if (err) {
                /* After device_register() only put_device() may be used. */
                put_device(&root->dev);
                return ERR_PTR(err);
        }

#ifdef CONFIG_MODULES        /* gotta find a "cleaner" way to do this */
        if (owner) {
                struct module_kobject *mk = &owner->mkobj;

                err = sysfs_create_link(&root->dev.kobj, &mk->kobj, "module");
                if (err) {
                        device_unregister(&root->dev);
                        return ERR_PTR(err);
                }
                /* Recorded so root_device_unregister() removes the link. */
                root->owner = owner;
        }
#endif

        return &root->dev;
}
EXPORT_SYMBOL_GPL(__root_device_register);

/**
 * root_device_unregister - unregister and free a root device
 * @dev: device going away
 *
 * This function unregisters and cleans up a device that was created by
 * root_device_register().
 */
void root_device_unregister(struct device *dev)
{
        struct root_device *root = to_root_device(dev);

        if (root->owner)
                sysfs_remove_link(&root->dev.kobj, "module");

        device_unregister(dev);
}
EXPORT_SYMBOL_GPL(root_device_unregister);


/*
 * Release callback installed by device_create_groups_vargs(): frees the
 * struct device that function allocated.
 */
static void device_create_release(struct device *dev)
{
        /* Log before freeing: dev_name() reads from @dev. */
        pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
        kfree(dev);
}

/*
 * Common worker behind device_create() and device_create_with_groups():
 * allocate a struct device, fill it in from the arguments, name it from
 * @fmt/@args and add it to the driver model.
 *
 * Returns the new device, or ERR_PTR(): -ENODEV for a NULL or error-valued
 * @class, -ENOMEM if the allocation fails, otherwise the error from naming
 * or adding the device.
 */
static __printf(6, 0) struct device *
device_create_groups_vargs(const struct class *class, struct device *parent,
                           dev_t devt, void *drvdata,
                           const struct attribute_group **groups,
                           const char *fmt, va_list args)
{
        struct device *dev;
        int retval;

        if (IS_ERR_OR_NULL(class))
                return ERR_PTR(-ENODEV);

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        device_initialize(dev);
        dev->devt = devt;
        dev->class = class;
        dev->parent = parent;
        dev->groups = groups;
        dev->release = device_create_release;
        dev_set_drvdata(dev, drvdata);

        retval = kobject_set_name_vargs(&dev->kobj, fmt, args);
        if (!retval)
                retval = device_add(dev);

        if (retval) {
                /* The device is initialized: drop it with put_device(). */
                put_device(dev);
                return ERR_PTR(retval);
        }

        return dev;
}

/**
 * device_create - creates a device and registers it with sysfs
 * @class: pointer to the struct class that this device should be registered to
 * @parent: pointer to the parent struct device of this new device, if any
 * @devt: the dev_t for the char device to be added
 * @drvdata: the data to be added to the device for callbacks
 * @fmt: string for the device's name
 *
 * This function can be used by char device classes.  A struct device
 * will be created in sysfs, registered to the specified class.
 *
 * A "dev" file will be created, showing the dev_t for the device, if
 * the dev_t is not 0,0.
 * If a pointer to a parent struct device is passed in, the newly created
 * struct device will be a child of that device in sysfs.
 * The pointer to the struct device will be returned from the call.
 * Any further sysfs files that might be required can be created using this
 * pointer.
 *
 * Returns &struct device pointer on success, or ERR_PTR() on error.
 */
struct device *device_create(const struct class *class, struct device *parent,
                             dev_t devt, void *drvdata, const char *fmt, ...)
{
        struct device *dev;
        va_list ap;

        /* No attribute groups: pass NULL to the common worker. */
        va_start(ap, fmt);
        dev = device_create_groups_vargs(class, parent, devt, drvdata, NULL,
                                         fmt, ap);
        va_end(ap);

        return dev;
}
EXPORT_SYMBOL_GPL(device_create);

/**
 * device_create_with_groups - creates a device and registers it with sysfs
 * @class: pointer to the struct class that this device should be registered to
 * @parent: pointer to the parent struct device of this new device, if any
 * @devt: the dev_t for the char device to be added
 * @drvdata: the data to be added to the device for callbacks
 * @groups: NULL-terminated list of attribute groups to be created
 * @fmt: string for the device's name
 *
 * This function can be used by char device classes.  A struct device
 * will be created in sysfs, registered to the specified class.
 * Additional attributes specified in the groups parameter will also
 * be created automatically.
 *
 * A "dev" file will be created, showing the dev_t for the device, if
 * the dev_t is not 0,0.
 * If a pointer to a parent struct device is passed in, the newly created
 * struct device will be a child of that device in sysfs.
 * The pointer to the struct device will be returned from the call.
 * Any further sysfs files that might be required can be created using this
 * pointer.
 *
 * Returns &struct device pointer on success, or ERR_PTR() on error.
 */
struct device *device_create_with_groups(const struct class *class,
                                         struct device *parent, dev_t devt,
                                         void *drvdata,
                                         const struct attribute_group **groups,
                                         const char *fmt, ...)
{
        struct device *dev;
        va_list ap;

        va_start(ap, fmt);
        dev = device_create_groups_vargs(class, parent, devt, drvdata, groups,
                                         fmt, ap);
        va_end(ap);

        return dev;
}
EXPORT_SYMBOL_GPL(device_create_with_groups);

/**
 * device_destroy - removes a device that was created with device_create()
 * @class: pointer to the struct class that this device was registered with
 * @devt: the dev_t of the device that was previously registered
 *
 * Looks the device up in @class by its dev_t and, if one is found,
 * unregisters it and cleans it up.  Nothing happens when no device matches.
 */
void device_destroy(const struct class *class, dev_t devt)
{
        struct device *found = class_find_device_by_devt(class, devt);

        if (!found)
                return;

        /*
         * Release the reference first (presumably the one taken by the
         * lookup above), then tear the device down.
         */
        put_device(found);
        device_unregister(found);
}
EXPORT_SYMBOL_GPL(device_destroy);

/**
 * device_rename - renames a device
 * @dev: the pointer to the struct device to be renamed
 * @new_name: the new name of the device
 *
 * It is the responsibility of the caller to provide mutual
 * exclusion between two different calls of device_rename
 * on the same device to ensure that new_name is valid and
 * won't conflict with other devices.
 *
 * Note: given that some subsystems (networking and infiniband) use this
 * function, with no immediate plans for this to change, we cannot assume or
 * require that this function not be called at all.
 *
 * However, if you're writing new code, do not call this function. The following
 * text from Kay Sievers offers some insight:
 *
 * Renaming devices is racy at many levels, symlinks and other stuff are not
 * replaced atomically, and you get a "move" uevent, but it's not easy to
 * connect the event to the old and new device. Device nodes are not renamed at
 * all, there isn't even support for that in the kernel now.
 *
 * In the meantime, during renaming, your target name might be taken by another
 * driver, creating conflicts. Or the old name is taken directly after you
 * renamed it -- then you get events for the same DEVPATH, before you even see
 * the "move" event. It's just a mess, and nothing new should ever rely on
 * kernel device renaming. Besides that, it's not even implemented now for
 * other things than (driver-core wise very simple) network devices.
 *
 * Make up a "real" name in the driver before you register anything, or add
 * some other attributes for userspace to find the device, or use udev to add
 * symlinks -- but never rename kernel devices later, it's a complete mess. We
 * don't even want to get into that and try to implement the missing pieces in
 * the core. We really have other pieces to fix in the driver core mess. :)
 */
int device_rename(struct device *dev, const char *new_name)
{
        struct kobject *kobj = &dev->kobj;
        char *old_device_name = NULL;
        int error;

        dev = get_device(dev);
        if (!dev)
                return -EINVAL;

        dev_dbg(dev, "renaming to %s\n", new_name);

        old_device_name = kstrdup(dev_name(dev), GFP_KERNEL);
        if (!old_device_name) {
                error = -ENOMEM;
                goto out;
        }

        if (dev->class) {
                struct subsys_private *sp = class_to_subsys(dev->class);

                if (!sp) {
                        error = -EINVAL;
                        goto out;
                }

                error = sysfs_rename_link_ns(&sp->subsys.kobj, kobj, old_device_name,
                                             new_name, kobject_namespace(kobj));
                subsys_put(sp);
                if (error)
                        goto out;
        }

        error = kobject_rename(kobj, new_name);
        if (error)
                goto out;

out:
        put_device(dev);

        kfree(old_device_name);

        return error;
}
EXPORT_SYMBOL_GPL(device_rename);

/*
 * Re-point the "device" symlink of @dev from @old_parent to @new_parent.
 * Either parent may be NULL, in which case the corresponding step is skipped.
 * Returns 0, or the error from sysfs_create_link().
 */
static int device_move_class_links(struct device *dev,
                                   struct device *old_parent,
                                   struct device *new_parent)
{
        /* A link only exists if there was a parent to point at. */
        if (old_parent)
                sysfs_remove_link(&dev->kobj, "device");

        /* No new parent: nothing to link to. */
        if (!new_parent)
                return 0;

        return sysfs_create_link(&dev->kobj, &new_parent->kobj, "device");
}

/**
 * device_move - moves a device to a new parent
 * @dev: the pointer to the struct device to be moved
 * @new_parent: the new parent of the device (can be NULL)
 * @dpm_order: how to reorder the dpm_list
 *
 * Moves the kobject of @dev below the kobject selected by
 * get_device_parent() for @new_parent, updates dev->parent and the parent's
 * children klist, re-creates the class "device" link if @dev has a class and
 * finally reorders the PM list and the devices kset according to @dpm_order.
 * The whole operation runs under device_pm_lock().
 *
 * Returns 0 on success, -EINVAL if no reference to @dev could be taken, or
 * the error from get_device_parent(), kobject_move() or
 * device_move_class_links().
 */
int device_move(struct device *dev, struct device *new_parent,
                enum dpm_order dpm_order)
{
        int error;
        struct device *old_parent;
        struct kobject *new_parent_kobj;

        dev = get_device(dev);
        if (!dev)
                return -EINVAL;

        device_pm_lock();
        /* This reference is kept on success and dropped on every error path. */
        new_parent = get_device(new_parent);
        new_parent_kobj = get_device_parent(dev, new_parent);
        if (IS_ERR(new_parent_kobj)) {
                error = PTR_ERR(new_parent_kobj);
                put_device(new_parent);
                goto out;
        }

        pr_debug("device: '%s': %s: moving to '%s'\n", dev_name(dev),
                 __func__, new_parent ? dev_name(new_parent) : "<NULL>");
        error = kobject_move(&dev->kobj, new_parent_kobj);
        if (error) {
                cleanup_glue_dir(dev, new_parent_kobj);
                put_device(new_parent);
                goto out;
        }
        /* The kobject has moved; now switch the driver-core parent links. */
        old_parent = dev->parent;
        dev->parent = new_parent;
        if (old_parent)
                klist_remove(&dev->p->knode_parent);
        if (new_parent) {
                klist_add_tail(&dev->p->knode_parent,
                               &new_parent->p->klist_children);
                set_dev_node(dev, dev_to_node(new_parent));
        }

        if (dev->class) {
                error = device_move_class_links(dev, old_parent, new_parent);
                if (error) {
                        /* We ignore errors on cleanup since we're hosed anyway... */
                        device_move_class_links(dev, new_parent, old_parent);
                        /*
                         * Try to move the kobject back; the parent links are
                         * only restored if that succeeds.
                         * NOTE(review): old_parent may be NULL here (it is
                         * NULL-checked just below), so &old_parent->kobj is
                         * computed from a NULL pointer - confirm this is
                         * intended.
                         */
                        if (!kobject_move(&dev->kobj, &old_parent->kobj)) {
                                if (new_parent)
                                        klist_remove(&dev->p->knode_parent);
                                dev->parent = old_parent;
                                if (old_parent) {
                                        klist_add_tail(&dev->p->knode_parent,
                                                       &old_parent->p->klist_children);
                                        set_dev_node(dev, dev_to_node(old_parent));
                                }
                        }
                        cleanup_glue_dir(dev, new_parent_kobj);
                        put_device(new_parent);
                        goto out;
                }
        }
        /* Reorder the PM list and the devices kset as requested. */
        switch (dpm_order) {
        case DPM_ORDER_NONE:
                break;
        case DPM_ORDER_DEV_AFTER_PARENT:
                device_pm_move_after(dev, new_parent);
                devices_kset_move_after(dev, new_parent);
                break;
        case DPM_ORDER_PARENT_BEFORE_DEV:
                device_pm_move_before(new_parent, dev);
                devices_kset_move_before(new_parent, dev);
                break;
        case DPM_ORDER_DEV_LAST:
                device_pm_move_last(dev);
                devices_kset_move_last(dev);
                break;
        }

        /* Success: @dev no longer refers to its previous parent. */
        put_device(old_parent);
out:
        device_pm_unlock();
        put_device(dev);
        return error;
}
EXPORT_SYMBOL_GPL(device_move);

/*
 * Hand the attribute groups of @dev (class groups, type groups, the device's
 * own groups) and, where present, its "online" attribute over to
 * @kuid/@kgid.  Stops at the first failure and returns that error;
 * returns 0 when everything succeeded.
 */
static int device_attrs_change_owner(struct device *dev, kuid_t kuid,
                                     kgid_t kgid)
{
        const struct device_type *devtype = dev->type;
        const struct class *cls = dev->class;
        struct kobject *kobj = &dev->kobj;
        int ret;

        /* Groups contributed by the device class. */
        if (cls) {
                ret = sysfs_groups_change_owner(kobj, cls->dev_groups, kuid,
                                                kgid);
                if (ret)
                        return ret;
        }

        /* Groups contributed by the device type. */
        if (devtype) {
                ret = sysfs_groups_change_owner(kobj, devtype->groups, kuid,
                                                kgid);
                if (ret)
                        return ret;
        }

        /* Groups attached directly to the device. */
        ret = sysfs_groups_change_owner(kobj, dev->groups, kuid, kgid);
        if (ret)
                return ret;

        /* The "online" attribute is only handled for offline-capable devices. */
        if (!device_supports_offline(dev) || dev->offline_disabled)
                return 0;

        return sysfs_file_change_owner(kobj, dev_attr_online.attr.name,
                                       kuid, kgid);
}

/**
 * device_change_owner - change the owner of an existing device.
 * @dev: device.
 * @kuid: new owner's kuid
 * @kgid: new owner's kgid
 *
 * Transfers @dev and its sysfs entries to @kuid/@kgid, following the same
 * sequence of steps the driver core used when @dev was added.
 *
 * Returns 0 on success or error code on failure.
 */
int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid)
{
        struct subsys_private *subsys;
        struct kobject *dev_kobj;
        int ret;

        dev = get_device(dev);
        if (!dev)
                return -EINVAL;

        dev_kobj = &dev->kobj;

        /* The kobject itself plus the default attributes/groups of its ktype. */
        ret = sysfs_change_owner(dev_kobj, kuid, kgid);
        if (ret)
                goto put;

        /*
         * The uevent file was created in a separate step when @dev got
         * added, so it gets its own step here as well.
         */
        ret = sysfs_file_change_owner(dev_kobj, dev_attr_uevent.attr.name,
                                      kuid, kgid);
        if (ret)
                goto put;

        /* Device groups, class device groups and device type groups. */
        ret = device_attrs_change_owner(dev, kuid, kgid);
        if (ret)
                goto put;

        ret = dpm_sysfs_change_owner(dev, kuid, kgid);
        if (ret)
                goto put;

        /*
         * Finally the symlink in the class directory that points at the
         * directory entry of @dev, so that the link shows the same
         * permissions as its target.
         */
        subsys = class_to_subsys(dev->class);
        if (subsys) {
                ret = sysfs_link_change_owner(&subsys->subsys.kobj, dev_kobj,
                                              dev_name(dev), kuid, kgid);
                subsys_put(subsys);
        } else {
                ret = -EINVAL;
        }

put:
        put_device(dev);
        return ret;
}
EXPORT_SYMBOL_GPL(device_change_owner);

/**
 * device_shutdown - call ->shutdown() on each device to shutdown.
 *
 * Waits for outstanding probes, blocks further probing and suspends cpufreq,
 * then repeatedly takes the last entry of the devices kset list, removes it
 * from the list and invokes the class ->shutdown_pre() callback (if any)
 * followed by either the bus ->shutdown() or, when the bus has none, the
 * driver ->shutdown() callback.
 */
void device_shutdown(void)
{
        struct device *dev, *parent;

        wait_for_device_probe();
        device_block_probing();

        cpufreq_suspend();

        spin_lock(&devices_kset->list_lock);
        /*
         * Walk the devices list backward, shutting down each in turn.
         * Beware that device unplug events may also start pulling
         * devices offline, even as the system is shutting down.
         */
        while (!list_empty(&devices_kset->list)) {
                dev = list_entry(devices_kset->list.prev, struct device,
                                kobj.entry);

                /*
                 * hold reference count of device's parent to
                 * prevent it from being freed because parent's
                 * lock is to be held
                 */
                parent = get_device(dev->parent);
                get_device(dev);
                /*
                 * Make sure the device is off the kset list, in the
                 * event that dev->*->shutdown() doesn't remove it.
                 */
                list_del_init(&dev->kobj.entry);
                /* The list lock is not held while the callbacks below run. */
                spin_unlock(&devices_kset->list_lock);

                /* hold lock to avoid race with probe/release */
                if (parent)
                        device_lock(parent);
                device_lock(dev);

                /* Don't allow any more runtime suspends */
                pm_runtime_get_noresume(dev);
                pm_runtime_barrier(dev);

                if (dev->class && dev->class->shutdown_pre) {
                        if (initcall_debug)
                                dev_info(dev, "shutdown_pre\n");
                        dev->class->shutdown_pre(dev);
                }
                /* The bus callback takes precedence over the driver's. */
                if (dev->bus && dev->bus->shutdown) {
                        if (initcall_debug)
                                dev_info(dev, "shutdown\n");
                        dev->bus->shutdown(dev);
                } else if (dev->driver && dev->driver->shutdown) {
                        if (initcall_debug)
                                dev_info(dev, "shutdown\n");
                        dev->driver->shutdown(dev);
                }

                /* Unlock in the reverse order of locking. */
                device_unlock(dev);
                if (parent)
                        device_unlock(parent);

                put_device(dev);
                put_device(parent);

                /* Re-take the list lock before looking at the list again. */
                spin_lock(&devices_kset->list_lock);
        }
        spin_unlock(&devices_kset->list_lock);
}

/*
 * Device logging functions
 */

#ifdef CONFIG_PRINTK
/*
 * Fill @dev_info with the subsystem name and a device identifier for @dev.
 * @dev_info is zeroed first and stays all-zero when @dev has neither a class
 * nor a bus.
 */
static void
set_dev_info(const struct device *dev, struct dev_printk_info *dev_info)
{
        const char *subsys;

        memset(dev_info, 0, sizeof(*dev_info));

        /* Without a class or a bus there is no subsystem to report. */
        if (!dev->class && !dev->bus)
                return;

        /* The class name is preferred over the bus name. */
        subsys = dev->class ? dev->class->name : dev->bus->name;

        strscpy(dev_info->subsystem, subsys, sizeof(dev_info->subsystem));

        /*
         * Add device identifier DEVICE=:
         *   b12:8         block dev_t
         *   c127:3        char dev_t
         *   n8            netdev ifindex
         *   +sound:card0  subsystem:devname
         */
        if (MAJOR(dev->devt)) {
                const char kind = strcmp(subsys, "block") ? 'c' : 'b';

                snprintf(dev_info->device, sizeof(dev_info->device),
                         "%c%u:%u", kind, MAJOR(dev->devt), MINOR(dev->devt));
                return;
        }

        if (strcmp(subsys, "net") == 0) {
                struct net_device *net = to_net_dev(dev);

                snprintf(dev_info->device, sizeof(dev_info->device),
                         "n%u", net->ifindex);
                return;
        }

        snprintf(dev_info->device, sizeof(dev_info->device),
                 "+%s:%s", subsys, dev_name(dev));
}

/*
 * va_list flavour of dev_printk_emit(): describe @dev in a
 * struct dev_printk_info and pass the message on to vprintk_emit().
 * Returns whatever vprintk_emit() returns.
 */
int dev_vprintk_emit(int level, const struct device *dev,
                     const char *fmt, va_list args)
{
        struct dev_printk_info info;

        set_dev_info(dev, &info);

        return vprintk_emit(0, level, &info, fmt, args);
}
EXPORT_SYMBOL(dev_vprintk_emit);

/*
 * Variadic wrapper around dev_vprintk_emit(); returns its result unchanged.
 */
int dev_printk_emit(int level, const struct device *dev, const char *fmt, ...)
{
        va_list ap;
        int ret;

        va_start(ap, fmt);
        ret = dev_vprintk_emit(level, dev, fmt, ap);
        va_end(ap);

        return ret;
}
EXPORT_SYMBOL(dev_printk_emit);

/*
 * Common back end of the dev_*() printers.  With a device, the message is
 * prefixed with the driver string and device name; without one it goes
 * through plain printk() with a "(NULL device *)" marker.
 */
static void __dev_printk(const char *level, const struct device *dev,
                        struct va_format *vaf)
{
        if (!dev) {
                printk("%s(NULL device *): %pV", level, vaf);
                return;
        }

        /* The numeric log level is the digit stored at level[1]. */
        dev_printk_emit(level[1] - '0', dev, "%s %s: %pV",
                        dev_driver_string(dev), dev_name(dev), vaf);
}

void _dev_printk(const char *level, const struct device *dev,
                 const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);

        vaf.fmt = fmt;
        vaf.va = &args;

        __dev_printk(level, dev, &vaf);

        va_end(args);
}
EXPORT_SYMBOL(_dev_printk);

/*
 * define_dev_printk_level - generate one exported per-level print function.
 * @func: name of the function to define and export
 * @kern_level: KERN_* level string passed on to __dev_printk()
 *
 * The generated function has the signature
 *   void func(const struct device *dev, const char *fmt, ...)
 * and wraps its format and arguments in a struct va_format before calling
 * __dev_printk() with @kern_level.
 */
#define define_dev_printk_level(func, kern_level)                \
void func(const struct device *dev, const char *fmt, ...)        \
{                                                                \
        struct va_format vaf;                                        \
        va_list args;                                                \
                                                                \
        va_start(args, fmt);                                        \
                                                                \
        vaf.fmt = fmt;                                                \
        vaf.va = &args;                                                \
                                                                \
        __dev_printk(kern_level, dev, &vaf);                        \
                                                                \
        va_end(args);                                                \
}                                                                \
EXPORT_SYMBOL(func);

/* One printer per log level, from KERN_EMERG down to KERN_INFO. */
define_dev_printk_level(_dev_emerg, KERN_EMERG);
define_dev_printk_level(_dev_alert, KERN_ALERT);
define_dev_printk_level(_dev_crit, KERN_CRIT);
define_dev_printk_level(_dev_err, KERN_ERR);
define_dev_printk_level(_dev_warn, KERN_WARNING);
define_dev_printk_level(_dev_notice, KERN_NOTICE);
define_dev_printk_level(_dev_info, KERN_INFO);

#endif

/**
 * dev_err_probe - probe error check and log helper
 * @dev: the pointer to the struct device
 * @err: error value to test
 * @fmt: printf-style format string
 * @...: arguments as specified in the format string
 *
 * This helper implements common pattern present in probe functions for error
 * checking: print debug or error message depending if the error value is
 * -EPROBE_DEFER and propagate error upwards.
 * In case of -EPROBE_DEFER it sets also defer probe reason, which can be
 * checked later by reading devices_deferred debugfs attribute.
 * It replaces code sequence::
 *
 *         if (err != -EPROBE_DEFER)
 *                 dev_err(dev, ...);
 *         else
 *                 dev_dbg(dev, ...);
 *         return err;
 *
 * with::
 *
 *         return dev_err_probe(dev, err, ...);
 *
 * Using this helper in your probe function is totally fine even if @err is
 * known to never be -EPROBE_DEFER.
 * The benefit compared to a normal dev_err() is the standardized format
 * of the error code, it being emitted symbolically (i.e. you get "EAGAIN"
 * instead of "-35") and the fact that the error code is returned which allows
 * more compact error paths.
 *
 * Returns @err.
 */
int dev_err_probe(const struct device *dev, int err, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;

        if (err != -EPROBE_DEFER) {
                dev_err(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
        } else {
                device_set_deferred_probe_reason(dev, &vaf);
                dev_dbg(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
        }

        va_end(args);

        return err;
}
EXPORT_SYMBOL_GPL(dev_err_probe);

static inline bool fwnode_is_primary(struct fwnode_handle *fwnode)
{
        return fwnode && !IS_ERR(fwnode->secondary);
}

/**
 * set_primary_fwnode - Change the primary firmware node of a given device.
 * @dev: Device to handle.
 * @fwnode: New primary firmware node of the device.
 *
 * Make @fwnode the firmware node of @dev while keeping a secondary firmware
 * node, if the device has one.  Passing NULL removes the primary node.
 *
 * Valid fwnode cases are:
 *  - primary --> secondary --> -ENODEV
 *  - primary --> NULL
 *  - secondary --> -ENODEV
 *  - NULL
 */
void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode)
{
        struct fwnode_handle *cur = dev->fwnode;
        struct device *parent = dev->parent;

        if (fwnode) {
                /*
                 * Installing a primary: whatever hangs behind the current
                 * primary (or the current node itself, if it is not a
                 * primary) becomes the secondary of the new node.
                 */
                struct fwnode_handle *sec =
                        fwnode_is_primary(cur) ? cur->secondary : cur;

                if (sec) {
                        WARN_ON(fwnode->secondary);
                        fwnode->secondary = sec;
                }
                dev->fwnode = fwnode;
                return;
        }

        /* Removing the primary: nothing to preserve if there is none. */
        if (!fwnode_is_primary(cur)) {
                dev->fwnode = NULL;
                return;
        }

        dev->fwnode = cur->secondary;

        /* Skip nullifying cur->secondary if the primary is shared */
        if (parent && cur == parent->fwnode)
                return;

        /* Set cur->secondary = NULL, so cur remains the primary fwnode */
        cur->secondary = NULL;
}
EXPORT_SYMBOL_GPL(set_primary_fwnode);

/**
 * set_secondary_fwnode - Change the secondary firmware node of a given device.
 * @dev: Device to handle.
 * @fwnode: New secondary firmware node of the device.
 *
 * When the device has a primary firmware node, @fwnode is hooked up as that
 * node's secondary.  Otherwise @fwnode becomes the device's firmware node.
 */
void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode)
{
        /* Mark @fwnode as a secondary node. */
        if (fwnode)
                fwnode->secondary = ERR_PTR(-ENODEV);

        if (!fwnode_is_primary(dev->fwnode)) {
                dev->fwnode = fwnode;
                return;
        }

        dev->fwnode->secondary = fwnode;
}
EXPORT_SYMBOL_GPL(set_secondary_fwnode);

/**
 * device_set_of_node_from_dev - reuse device-tree node of another device
 * @dev: device whose device-tree node is being set
 * @dev2: device whose device-tree node is being reused
 *
 * The reference held to the previous node of @dev is dropped first, then a
 * new reference to the node of @dev2 is taken and stored in @dev, which is
 * also flagged as reusing a device-tree node.
 */
void device_set_of_node_from_dev(struct device *dev, const struct device *dev2)
{
        struct device_node *old_node = dev->of_node;

        of_node_put(old_node);

        dev->of_node = of_node_get(dev2->of_node);
        dev->of_node_reused = true;
}
EXPORT_SYMBOL_GPL(device_set_of_node_from_dev);

/*
 * Point both the firmware node and the device-tree node of @dev at @fwnode;
 * the device-tree node is derived from @fwnode with to_of_node().
 */
void device_set_node(struct device *dev, struct fwnode_handle *fwnode)
{
        struct device_node *np = to_of_node(fwnode);

        dev->fwnode = fwnode;
        dev->of_node = np;
}
EXPORT_SYMBOL_GPL(device_set_node);

/* Match callback: non-zero when dev_name(@dev) equals @name per sysfs_streq(). */
int device_match_name(struct device *dev, const void *name)
{
        const char *wanted = name;

        return sysfs_streq(dev_name(dev), wanted);
}
EXPORT_SYMBOL_GPL(device_match_name);

/* Match callback: non-zero when @dev uses the device-tree node @np. */
int device_match_of_node(struct device *dev, const void *np)
{
        const struct device_node *wanted = np;

        return dev->of_node == wanted;
}
EXPORT_SYMBOL_GPL(device_match_of_node);

/* Match callback: non-zero when dev_fwnode(@dev) is @fwnode. */
int device_match_fwnode(struct device *dev, const void *fwnode)
{
        const struct fwnode_handle *wanted = fwnode;

        return dev_fwnode(dev) == wanted;
}
EXPORT_SYMBOL_GPL(device_match_fwnode);

int device_match_devt(struct device *dev, const void *pdevt)
{
        return dev->devt == *(dev_t *)pdevt;
}
EXPORT_SYMBOL_GPL(device_match_devt);

/* Match callback: non-zero when the ACPI companion of @dev is @adev. */
int device_match_acpi_dev(struct device *dev, const void *adev)
{
        const void *companion = ACPI_COMPANION(dev);

        return companion == adev;
}
EXPORT_SYMBOL(device_match_acpi_dev);

/* Match callback: non-zero when the ACPI handle of @dev is @handle. */
int device_match_acpi_handle(struct device *dev, const void *handle)
{
        const void *dev_handle = ACPI_HANDLE(dev);

        return dev_handle == handle;
}
EXPORT_SYMBOL(device_match_acpi_handle);

/* Match callback that accepts every device: both arguments are ignored. */
int device_match_any(struct device *dev, const void *unused)
{
        return 1;
}
EXPORT_SYMBOL_GPL(device_match_any);











































































































































    3 



    3 
































    3 





    3 


    3 
    2 





























































    3 







    3 






































    3 
    3 























    3 





    3 














    3 
    3 

    3 
























    3 



    3 












    3 
    2 




























    3 


    3 

















    3 

    3 
    3 
    3 





















    3 






    3 









    3 














    3 
    3 
















































































































    3 


    3 








    3 



















    3 








    3 



















    3 

    3 


    3 






    3 









    1 

    3 
    3 







    3 









    2 





    3 



    3 
    3 



    3 




    2 



























    3 



    3 




    3 







































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
// SPDX-License-Identifier: GPL-2.0
/*
 *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
 *  for the blk-mq scheduling framework
 *
 *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/sbitmap.h>

#include <trace/events/block.h>

#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"

/*
 * Default values of the scheduler tunables, expressed in terms of HZ where
 * they are time limits.
 *
 * See Documentation/block/deadline-iosched.rst
 */
/* Max time before a read is submitted. */
static const int read_expire = HZ / 2;
/* Same for writes. Both expiry limits are SOFT! */
static const int write_expire = 5 * HZ;
/*
 * Time after which to dispatch lower priority requests even if higher
 * priority requests are pending.
 */
static const int prio_aging_expire = 10 * HZ;
/* Max number of times reads can starve a write. */
static const int writes_starved = 2;
/*
 * Number of sequential requests treated as one by the above parameters.
 * For throughput.
 */
static const int fifo_batch = 16;

/*
 * Data direction of a request as used by this scheduler. The values are
 * those of READ and WRITE, and are used to index the per-direction arrays
 * sized by DD_DIR_COUNT.
 */
enum dd_data_dir {
        DD_READ                = READ,
        DD_WRITE        = WRITE,
};

/* Number of data directions; size of the per-direction arrays. */
enum { DD_DIR_COUNT = 2 };

/*
 * Scheduler-internal priority levels. I/O priority classes are translated
 * to these through ioprio_class_to_prio[].
 */
enum dd_prio {
        DD_RT_PRIO        = 0,
        DD_BE_PRIO        = 1,
        DD_IDLE_PRIO        = 2,
        /* Highest valid value of this enum (same as DD_IDLE_PRIO). */
        DD_PRIO_MAX        = 2,
};

/* Number of priority levels; size of the per-priority arrays. */
enum { DD_PRIO_COUNT = 3 };

/*
 * I/O statistics per I/O priority. It is fine if these counters overflow.
 * What matters is that these counters are at least as wide as
 * log2(max_outstanding_requests).
 *
 * NOTE(review): going by the field names, each counter tracks the number of
 * requests that reached the named stage - confirm against the users.
 */
struct io_stats_per_prio {
        uint32_t inserted;
        uint32_t merged;
        uint32_t dispatched;
        /*
         * The only atomic counter of the four.
         * NOTE(review): presumably updated without the scheduler lock held -
         * confirm against the completion path.
         */
        atomic_t completed;
};

/*
 * Deadline scheduler data per I/O priority (enum dd_prio). Requests are
 * present on both sort_list[] and fifo_list[].
 */
struct dd_per_prio {
        /*
         * Requests inserted with BLK_MQ_INSERT_AT_HEAD. These are examined
         * by __dd_dispatch_request() before the sorted and FIFO lists.
         */
        struct list_head dispatch;
        /* Per data direction: rbtree of requests, looked up by sector. */
        struct rb_root sort_list[DD_DIR_COUNT];
        /* Per data direction: requests in arrival order, oldest first. */
        struct list_head fifo_list[DD_DIR_COUNT];
        /* Position of the most recently dispatched request. */
        sector_t latest_pos[DD_DIR_COUNT];
        /* Insert/merge/dispatch/completion counters for this priority. */
        struct io_stats_per_prio stats;
};

/*
 * Scheduler state that is stored in elevator_queue.elevator_data. It is
 * allocated by dd_init_sched() and freed by dd_exit_sched().
 */
struct deadline_data {
        /*
         * run time data
         */

        /* One set of lists and statistics per priority level. */
        struct dd_per_prio per_prio[DD_PRIO_COUNT];

        /* Data direction of latest dispatched request. */
        enum dd_data_dir last_dir;
        unsigned int batching;                /* number of sequential requests made */
        unsigned int starved;                /* times reads have starved writes */

        /*
         * settings that change how the i/o scheduler behaves
         */
        /* Per data direction expiry time in jiffies (sysfs uses ms). */
        int fifo_expire[DD_DIR_COUNT];
        /* Maximum number of requests dispatched as one batch. */
        int fifo_batch;
        /* Number of times reads may be preferred while writes are pending. */
        int writes_starved;
        /* If zero, dd_request_merge() does not look for front merges. */
        int front_merges;
        /* Value stored in shallow_depth by dd_limit_depth(). */
        u32 async_depth;
        /* Age in jiffies after which lower priority requests are dispatched. */
        int prio_aging_expire;

        /* Held while requests are inserted, merged or dispatched. */
        spinlock_t lock;
};

/*
 * Maps an I/O priority class to a deadline scheduler priority. Requests
 * without an I/O priority class (IOPRIO_CLASS_NONE) are handled as
 * best-effort requests.
 */
static const enum dd_prio ioprio_class_to_prio[] = {
        [IOPRIO_CLASS_NONE]        = DD_BE_PRIO,
        [IOPRIO_CLASS_RT]        = DD_RT_PRIO,
        [IOPRIO_CLASS_BE]        = DD_BE_PRIO,
        [IOPRIO_CLASS_IDLE]        = DD_IDLE_PRIO,
};

static inline struct rb_root *
deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
{
        return &per_prio->sort_list[rq_data_dir(rq)];
}

/*
 * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
 * request.
 */
static u8 dd_rq_ioclass(struct request *rq)
{
        return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
}

/*
 * Look up the request with the lowest start sector that is not below @pos
 * in the rbtree for @data_dir. Returns NULL if the tree is empty or if all
 * requests start before @pos.
 */
static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
                                enum dd_data_dir data_dir, sector_t pos)
{
        struct rb_node *cur = per_prio->sort_list[data_dir].rb_node;
        struct request *best = NULL;

        while (cur) {
                struct request *candidate = rb_entry_rq(cur);

                if (blk_rq_pos(candidate) < pos) {
                        /* Too low: only the right subtree can match. */
                        cur = cur->rb_right;
                        continue;
                }

                /* Match: remember it and look for a lower one on the left. */
                best = candidate;
                cur = cur->rb_left;
        }

        return best;
}

/*
 * Insert @rq into the rbtree of @per_prio for the data direction of @rq.
 */
static void
deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
        elv_rb_add(deadline_rb_root(per_prio, rq), rq);
}

/*
 * Erase @rq from the rbtree of @per_prio for the data direction of @rq.
 */
static inline void
deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
        struct rb_root *root = deadline_rb_root(per_prio, rq);

        elv_rb_del(root, rq);
}

/*
 * Unlink @rq from the list it is queued on, from the rbtree (if it is on
 * one) and from the merge hash, and make sure that the queue no longer
 * remembers it as the last merge candidate.
 */
static void deadline_remove_request(struct request_queue *q,
                                    struct dd_per_prio *per_prio,
                                    struct request *rq)
{
        list_del_init(&rq->queuelist);

        /* During an insert merge the request is not on the rbtree. */
        if (!RB_EMPTY_NODE(&rq->rb_node))
                elv_rb_del(deadline_rb_root(per_prio, rq), rq);

        elv_rqhash_del(q, rq);

        if (rq == q->last_merge)
                q->last_merge = NULL;
}

/*
 * Callback that is invoked after a bio has been merged into @req. A front
 * merge changes the start sector of @req, hence @req has to be removed from
 * and reinserted into the rbtree. Other merge types need no action.
 */
static void dd_request_merged(struct request_queue *q, struct request *req,
                              enum elv_merge type)
{
        struct deadline_data *dd = q->elevator->elevator_data;
        struct dd_per_prio *per_prio;
        struct rb_root *root;

        if (type != ELEVATOR_FRONT_MERGE)
                return;

        per_prio = &dd->per_prio[ioprio_class_to_prio[dd_rq_ioclass(req)]];
        root = deadline_rb_root(per_prio, req);

        elv_rb_del(root, req);
        elv_rb_add(root, req);
}

/*
 * Callback function that is invoked after @next has been merged into @req.
 */
static void dd_merged_requests(struct request_queue *q, struct request *req,
                               struct request *next)
{
        struct deadline_data *dd = q->elevator->elevator_data;
        const u8 ioprio_class = dd_rq_ioclass(next);
        const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];

        lockdep_assert_held(&dd->lock);

        dd->per_prio[prio].stats.merged++;

        /*
         * if next expires before rq, assign its expire time to rq
         * and move into next position (next will be deleted) in fifo
         */
        if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
                if (time_before((unsigned long)next->fifo_time,
                                (unsigned long)req->fifo_time)) {
                        list_move(&req->queuelist, &next->queuelist);
                        req->fifo_time = next->fifo_time;
                }
        }

        /*
         * kill knowledge of next, this one is a goner
         */
        deadline_remove_request(q, &dd->per_prio[prio], next);
}

/*
 * Take @rq out of the scheduler data structures because it is about to be
 * dispatched. @dd is not used.
 */
static void
deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
                      struct request *rq)
{
        struct request_queue *q = rq->q;

        /* Unlink @rq from the FIFO list, the rbtree and the merge hash. */
        deadline_remove_request(q, per_prio, rq);
}

/*
 * Number of requests of priority @prio that have been inserted but that have
 * not yet completed. Must be called with dd->lock held.
 */
static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
{
        const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
        u32 nr_completed;

        lockdep_assert_held(&dd->lock);

        nr_completed = atomic_read(&stats->completed);

        /* Unsigned arithmetic: the difference stays valid across wraparound. */
        return stats->inserted - nr_completed;
}

/*
 * deadline_check_fifo returns true if and only if there are expired requests
 * in the FIFO list. Requires !list_empty(&dd->fifo_list[data_dir]).
 */
static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
                                       enum dd_data_dir data_dir)
{
        struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);

        return time_is_before_eq_jiffies((unsigned long)rq->fifo_time);
}

/*
 * For the specified data direction, return the next request to
 * dispatch using arrival ordered lists.
 */
static struct request *
deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
                      enum dd_data_dir data_dir)
{
        if (list_empty(&per_prio->fifo_list[data_dir]))
                return NULL;

        return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
}

/*
 * For the specified data direction, return the next request to
 * dispatch using sector position sorted lists.
 */
static struct request *
deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
                      enum dd_data_dir data_dir)
{
        return deadline_from_pos(per_prio, data_dir,
                                 per_prio->latest_pos[data_dir]);
}

/*
 * Returns true if and only if @rq started after @latest_start, where
 * @latest_start is in jiffies. The start time is derived by subtracting the
 * expiry interval for the data direction of @rq from its FIFO deadline.
 */
static bool started_after(struct deadline_data *dd, struct request *rq,
                          unsigned long latest_start)
{
        unsigned long expire = dd->fifo_expire[rq_data_dir(rq)];
        unsigned long start_time = (unsigned long)rq->fifo_time - expire;

        return time_after(start_time, latest_start);
}

/*
 * __dd_dispatch_request selects the best request of @per_prio according to
 * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
 *
 * Candidates are considered in this order:
 * 1. the first request on per_prio->dispatch;
 * 2. the next request in sector order for dd->last_dir, as long as fewer
 *    than dd->fifo_batch requests have been dispatched in the current batch;
 * 3. a request that starts a new batch: reads are preferred unless writes
 *    are pending and have been passed over dd->writes_starved times.
 *
 * Returns the selected request after it has been removed from the scheduler
 * lists, or NULL if there is no candidate or if the candidate started after
 * @latest_start. Must be called with dd->lock held.
 */
static struct request *__dd_dispatch_request(struct deadline_data *dd,
                                             struct dd_per_prio *per_prio,
                                             unsigned long latest_start)
{
        struct request *rq, *next_rq;
        enum dd_data_dir data_dir;
        enum dd_prio prio;
        u8 ioprio_class;

        lockdep_assert_held(&dd->lock);

        /* Requests that were inserted at the head are served first. */
        if (!list_empty(&per_prio->dispatch)) {
                rq = list_first_entry(&per_prio->dispatch, struct request,
                                      queuelist);
                if (started_after(dd, rq, latest_start))
                        return NULL;
                list_del_init(&rq->queuelist);
                data_dir = rq_data_dir(rq);
                goto done;
        }

        /*
         * batches are currently reads XOR writes
         */
        rq = deadline_next_request(dd, per_prio, dd->last_dir);
        if (rq && dd->batching < dd->fifo_batch) {
                /* we have a next request and are still entitled to batch */
                data_dir = rq_data_dir(rq);
                goto dispatch_request;
        }

        /*
         * at this point we are not running a batch. select the appropriate
         * data direction (read / write)
         */

        if (!list_empty(&per_prio->fifo_list[DD_READ])) {
                BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ]));

                /*
                 * dd->starved is only incremented if writes are pending
                 * because of the short-circuit evaluation of &&.
                 */
                if (deadline_fifo_request(dd, per_prio, DD_WRITE) &&
                    (dd->starved++ >= dd->writes_starved))
                        goto dispatch_writes;

                data_dir = DD_READ;

                goto dispatch_find_request;
        }

        /*
         * there are either no reads or writes have been starved
         */

        if (!list_empty(&per_prio->fifo_list[DD_WRITE])) {
dispatch_writes:
                BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE]));

                dd->starved = 0;

                data_dir = DD_WRITE;

                goto dispatch_find_request;
        }

        /* Neither reads nor writes are queued for this priority level. */
        return NULL;

dispatch_find_request:
        /*
         * we are not running a batch, find best request for selected data_dir
         */
        next_rq = deadline_next_request(dd, per_prio, data_dir);
        if (deadline_check_fifo(per_prio, data_dir) || !next_rq) {
                /*
                 * A deadline has expired, the last request was in the other
                 * direction, or we have run out of higher-sectored requests.
                 * Start again from the request with the earliest expiry time.
                 */
                rq = deadline_fifo_request(dd, per_prio, data_dir);
        } else {
                /*
                 * The last req was the same dir and we have a next request in
                 * sort order. No expired requests so continue on from here.
                 */
                rq = next_rq;
        }

        if (!rq)
                return NULL;

        /* A new batch starts with the request that has been selected. */
        dd->last_dir = data_dir;
        dd->batching = 0;

dispatch_request:
        if (started_after(dd, rq, latest_start))
                return NULL;

        /*
         * rq is the selected appropriate request.
         */
        dd->batching++;
        deadline_move_request(dd, per_prio, rq);
done:
        /*
         * Record the position of @rq and account the dispatch against the
         * priority level that is derived from the I/O priority of @rq.
         */
        ioprio_class = dd_rq_ioclass(rq);
        prio = ioprio_class_to_prio[ioprio_class];
        dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
        dd->per_prio[prio].stats.dispatched++;
        rq->rq_flags |= RQF_STARTED;
        return rq;
}

/*
 * Dispatch a request with a priority other than DD_RT_PRIO that was inserted
 * more than prio_aging_expire jiffies before @now, if there is one. Nothing
 * is dispatched unless requests are queued for at least two priority levels.
 * Must be called with dd->lock held.
 */
static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
                                                      unsigned long now)
{
        enum dd_prio prio;
        int nr_busy_prios = 0;

        lockdep_assert_held(&dd->lock);

        /* Count the priority levels that have outstanding requests. */
        for (prio = DD_RT_PRIO; prio <= DD_PRIO_MAX; prio++)
                if (dd_queued(dd, prio))
                        nr_busy_prios++;

        if (nr_busy_prios < 2)
                return NULL;

        for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
                struct request *aged;

                aged = __dd_dispatch_request(dd, &dd->per_prio[prio],
                                             now - dd->prio_aging_expire);
                if (aged)
                        return aged;
        }

        return NULL;
}

/*
 * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
 *
 * Although this function is called for one particular hardware queue, the
 * request it returns may be for a different hardware queue: the sorted
 * lists, FIFOs, etc. of mq-deadline are shared by all hardware queues.
 *
 * Returns the request to dispatch or NULL if there is none.
 */
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
        const unsigned long now = jiffies;
        struct request *rq;
        enum dd_prio prio;

        spin_lock(&dd->lock);

        /* Requests that have been waiting for too long come first. */
        rq = dd_dispatch_prio_aged_requests(dd, now);
        if (!rq) {
                /*
                 * Otherwise go through the priority levels from high to low
                 * and stop at the first level that yields a request or that
                 * still has requests pending.
                 */
                for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
                        rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
                                                   now);
                        if (rq || dd_queued(dd, prio))
                                break;
                }
        }

        spin_unlock(&dd->lock);

        return rq;
}

/*
 * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
 * function is used by __blk_mq_get_tag().
 *
 * Synchronous reads are not throttled. For all other requests, that is
 * asynchronous requests and writes, shallow_depth is set to dd->async_depth
 * such that these do not block the allocation of synchronous requests.
 */
static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
        struct deadline_data *dd = data->q->elevator->elevator_data;
        const bool sync_read = op_is_sync(opf) && !op_is_write(opf);

        if (!sync_read)
                data->shallow_depth = dd->async_depth;
}

/*
 * Called by blk_mq_update_nr_requests(). Sets dd->async_depth to three
 * quarters of the number of requests of the queue, with a minimum of one,
 * and passes that value on as minimum shallow depth of the scheduler tags.
 */
static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;
        struct deadline_data *dd = q->elevator->elevator_data;
        unsigned long depth = 3 * q->nr_requests / 4;

        if (!depth)
                depth = 1;
        dd->async_depth = depth;

        sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
                                        dd->async_depth);
}

/*
 * Called by blk_mq_init_hctx() and blk_mq_init_sched(). Derives the initial
 * async_depth from the queue depth. @hctx_idx is not used. Always returns 0.
 */
static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        dd_depth_updated(hctx);

        return 0;
}

/*
 * Free the scheduler private data of @e. Warns if a FIFO list is not empty
 * or if the statistics indicate that requests are still queued.
 */
static void dd_exit_sched(struct elevator_queue *e)
{
        struct deadline_data *dd = e->elevator_data;
        enum dd_prio prio;

        for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
                struct dd_per_prio *pp = &dd->per_prio[prio];
                uint32_t pending;

                WARN_ON_ONCE(!list_empty(&pp->fifo_list[DD_READ]));
                WARN_ON_ONCE(!list_empty(&pp->fifo_list[DD_WRITE]));

                /* dd_queued() requires that dd->lock is held. */
                spin_lock(&dd->lock);
                pending = dd_queued(dd, prio);
                spin_unlock(&dd->lock);

                WARN_ONCE(pending != 0,
                          "statistics for priority %d: i %u m %u d %u c %u\n",
                          prio, pp->stats.inserted, pp->stats.merged,
                          pp->stats.dispatched,
                          atomic_read(&pp->stats.completed));
        }

        kfree(dd);
}

/*
 * Allocate and initialize the elevator queue and the elevator private data
 * (deadline_data) for @q. Returns 0 upon success and -ENOMEM if an
 * allocation failed.
 */
static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
{
        struct elevator_queue *eq;
        struct deadline_data *dd;
        enum dd_prio prio;

        eq = elevator_alloc(q, e);
        if (!eq)
                return -ENOMEM;

        dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
        if (!dd) {
                /* Drop the reference obtained by elevator_alloc(). */
                kobject_put(&eq->kobj);
                return -ENOMEM;
        }

        spin_lock_init(&dd->lock);

        /* Start from the default values of the tunables. */
        dd->fifo_expire[DD_READ] = read_expire;
        dd->fifo_expire[DD_WRITE] = write_expire;
        dd->prio_aging_expire = prio_aging_expire;
        dd->writes_starved = writes_starved;
        dd->fifo_batch = fifo_batch;
        dd->front_merges = 1;
        dd->last_dir = DD_WRITE;

        for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
                struct dd_per_prio *pp = &dd->per_prio[prio];

                INIT_LIST_HEAD(&pp->dispatch);
                INIT_LIST_HEAD(&pp->fifo_list[DD_READ]);
                INIT_LIST_HEAD(&pp->fifo_list[DD_WRITE]);
                pp->sort_list[DD_READ] = RB_ROOT;
                pp->sort_list[DD_WRITE] = RB_ROOT;
        }

        eq->elevator_data = dd;

        /* We dispatch from request queue wide instead of hw queue */
        blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);

        q->elevator = eq;

        return 0;
}

/*
 * Look for a queued request into which @bio can be front merged, that is a
 * request with the same priority and data direction that starts at the
 * sector where @bio ends. If such a request is found and merging is allowed,
 * store it in *@rq and return ELEVATOR_DISCARD_MERGE or
 * ELEVATOR_FRONT_MERGE. Return ELEVATOR_NO_MERGE otherwise, including when
 * front merges have been disabled.
 */
static int dd_request_merge(struct request_queue *q, struct request **rq,
                            struct bio *bio)
{
        struct deadline_data *dd = q->elevator->elevator_data;
        struct dd_per_prio *per_prio;
        struct request *candidate;
        sector_t end_sector;
        u8 ioprio_class;

        if (!dd->front_merges)
                return ELEVATOR_NO_MERGE;

        ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
        per_prio = &dd->per_prio[ioprio_class_to_prio[ioprio_class]];
        end_sector = bio_end_sector(bio);

        candidate = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)],
                                end_sector);
        if (!candidate)
                return ELEVATOR_NO_MERGE;

        BUG_ON(end_sector != blk_rq_pos(candidate));

        if (!elv_bio_merge_ok(candidate, bio))
                return ELEVATOR_NO_MERGE;

        *rq = candidate;

        return blk_discard_mergable(candidate) ? ELEVATOR_DISCARD_MERGE :
                                                 ELEVATOR_FRONT_MERGE;
}

/*
 * Try to merge @bio into a request that has already been queued. This
 * function is called before @bio is associated with a request. A request
 * that became superfluous because of the merge is freed after dd->lock has
 * been dropped. Returns true if and only if @bio has been merged.
 */
static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs)
{
        struct deadline_data *dd = q->elevator->elevator_data;
        struct request *to_free = NULL;
        bool merged;

        spin_lock(&dd->lock);
        merged = blk_mq_sched_try_merge(q, bio, nr_segs, &to_free);
        spin_unlock(&dd->lock);

        if (to_free)
                blk_mq_free_request(to_free);

        return merged;
}

/*
 * Hand @rq over to the scheduler. The request is either merged into a
 * request that is already queued, added to the dispatch list (if
 * BLK_MQ_INSERT_AT_HEAD has been set in @flags) or added to the rbtree and
 * the FIFO list for its data direction. Requests that must be freed because
 * of a merge are collected on @free. Must be called with dd->lock held.
 */
static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                              blk_insert_t flags, struct list_head *free)
{
        struct request_queue *q = hctx->queue;
        struct deadline_data *dd = q->elevator->elevator_data;
        const enum dd_data_dir data_dir = rq_data_dir(rq);
        const enum dd_prio prio = ioprio_class_to_prio[dd_rq_ioclass(rq)];
        struct dd_per_prio *per_prio = &dd->per_prio[prio];

        lockdep_assert_held(&dd->lock);

        /*
         * Count every request only once and mark it as seen by the
         * scheduler. dd_finish_request() checks this marker.
         */
        if (!rq->elv.priv[0]) {
                per_prio->stats.inserted++;
                rq->elv.priv[0] = (void *)(uintptr_t)1;
        }

        if (blk_mq_sched_try_insert_merge(q, rq, free))
                return;

        trace_block_rq_insert(rq);

        if (flags & BLK_MQ_INSERT_AT_HEAD) {
                list_add(&rq->queuelist, &per_prio->dispatch);
                rq->fifo_time = jiffies;
                return;
        }

        deadline_add_rq_rb(per_prio, rq);

        if (rq_mergeable(rq)) {
                elv_rqhash_add(q, rq);
                if (!q->last_merge)
                        q->last_merge = rq;
        }

        /* Set the expiry time and append the request to the FIFO list. */
        rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
        list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
}

/*
 * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list().
 *
 * Moves all requests from @list into the scheduler while holding dd->lock.
 * Requests that became superfluous because of a merge are freed after the
 * lock has been dropped.
 */
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
                               struct list_head *list,
                               blk_insert_t flags)
{
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
        LIST_HEAD(to_free);

        spin_lock(&dd->lock);
        while (!list_empty(list)) {
                struct request *rq = list_first_entry(list, struct request,
                                                      queuelist);

                list_del_init(&rq->queuelist);
                dd_insert_request(hctx, rq, flags, &to_free);
        }
        spin_unlock(&dd->lock);

        blk_mq_free_requests(&to_free);
}

/*
 * Callback from inside blk_mq_rq_ctx_init(). Clears the marker that
 * dd_insert_request() sets once a request has been counted as inserted.
 */
static void dd_prepare_request(struct request *rq)
{
        rq->elv.priv[0] = NULL;
}

/*
 * Callback from inside blk_mq_free_request().
 */
static void dd_finish_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct deadline_data *dd = q->elevator->elevator_data;
        const u8 ioprio_class = dd_rq_ioclass(rq);
        const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
        struct dd_per_prio *per_prio = &dd->per_prio[prio];

        /*
         * The block layer core may call dd_finish_request() without having
         * called dd_insert_requests(). Skip requests that bypassed I/O
         * scheduling. See also blk_mq_request_bypass_insert().
         */
        if (rq->elv.priv[0])
                atomic_inc(&per_prio->stats.completed);
}

static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
{
        return !list_empty_careful(&per_prio->dispatch) ||
                !list_empty_careful(&per_prio->fifo_list[DD_READ]) ||
                !list_empty_careful(&per_prio->fifo_list[DD_WRITE]);
}

static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
{
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
        enum dd_prio prio;

        for (prio = 0; prio <= DD_PRIO_MAX; prio++)
                if (dd_has_work_for_prio(&dd->per_prio[prio]))
                        return true;

        return false;
}

/*
 * sysfs parts below
 */
/*
 * SHOW_INT() defines the sysfs show callback __FUNC, which emits the value
 * of the expression __VAR as a decimal number followed by a newline. __VAR
 * may refer to the local variable 'dd' (the deadline_data of the elevator).
 */
#define SHOW_INT(__FUNC, __VAR)                                                \
static ssize_t __FUNC(struct elevator_queue *e, char *page)                \
{                                                                        \
        struct deadline_data *dd = e->elevator_data;                        \
                                                                        \
        return sysfs_emit(page, "%d\n", __VAR);                                \
}
/* Same as SHOW_INT() but converts __VAR from jiffies into milliseconds. */
#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
SHOW_INT(deadline_front_merges_show, dd->front_merges);
SHOW_INT(deadline_async_depth_show, dd->async_depth);
SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
#undef SHOW_INT
#undef SHOW_JIFFIES

/*
 * STORE_FUNCTION() defines the sysfs store callback __FUNC. The callback
 * parses an integer from @page with kstrtoint() (base 0, i.e. the base is
 * derived from the prefix), clamps it to the range [MIN, MAX], applies
 * __CONV to the result and stores that in *__PTR. It returns @count upon
 * success and the negative error code from kstrtoint() if parsing failed.
 * __PTR may refer to the local variable 'dd'.
 */
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)                        \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)        \
{                                                                        \
        struct deadline_data *dd = e->elevator_data;                        \
        int __data, __ret;                                                \
                                                                        \
        __ret = kstrtoint(page, 0, &__data);                                \
        if (__ret < 0)                                                        \
                return __ret;                                                \
        if (__data < (MIN))                                                \
                __data = (MIN);                                                \
        else if (__data > (MAX))                                        \
                __data = (MAX);                                                \
        *(__PTR) = __CONV(__data);                                        \
        return count;                                                        \
}
/* Store the clamped value without conversion. */
#define STORE_INT(__FUNC, __PTR, MIN, MAX)                                \
        STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, )
/* The value written by the user is in milliseconds; store it in jiffies. */
#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX)                                \
        STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
#undef STORE_FUNCTION
#undef STORE_INT
#undef STORE_JIFFIES

/*
 * Define a sysfs attribute called 'name' with mode 0644 that is backed by
 * the deadline_<name>_show() and deadline_<name>_store() callbacks.
 */
#define DD_ATTR(name) \
        __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)

/* The scheduler tunables exported through sysfs; terminated by __ATTR_NULL. */
static struct elv_fs_entry deadline_attrs[] = {
        DD_ATTR(read_expire),
        DD_ATTR(write_expire),
        DD_ATTR(writes_starved),
        DD_ATTR(front_merges),
        DD_ATTR(async_depth),
        DD_ATTR(fifo_batch),
        DD_ATTR(prio_aging_expire),
        __ATTR_NULL
};

#ifdef CONFIG_BLK_DEBUG_FS
/*
 * Generate the debugfs helpers for one (priority, data direction) pair:
 *
 * - deadline_<name>_fifo_start(), deadline_<name>_fifo_next(),
 *   deadline_<name>_fifo_stop() and deadline_<name>_fifo_seq_ops iterate
 *   over the FIFO list. dd->lock is acquired by the start callback and
 *   released by the stop callback.
 * - deadline_<name>_next_rq_show() shows the next request in sector order,
 *   that is the first request at or beyond latest_pos[data_dir]. The rbtree
 *   and latest_pos[] are modified with dd->lock held, hence the lookup and
 *   the printing of the request happen with dd->lock held such that the
 *   tree cannot change while it is being walked.
 */
#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name)                \
static void *deadline_##name##_fifo_start(struct seq_file *m,                \
                                          loff_t *pos)                        \
        __acquires(&dd->lock)                                                \
{                                                                        \
        struct request_queue *q = m->private;                                \
        struct deadline_data *dd = q->elevator->elevator_data;                \
        struct dd_per_prio *per_prio = &dd->per_prio[prio];                \
                                                                        \
        spin_lock(&dd->lock);                                                \
        return seq_list_start(&per_prio->fifo_list[data_dir], *pos);        \
}                                                                        \
                                                                        \
static void *deadline_##name##_fifo_next(struct seq_file *m, void *v,        \
                                         loff_t *pos)                        \
{                                                                        \
        struct request_queue *q = m->private;                                \
        struct deadline_data *dd = q->elevator->elevator_data;                \
        struct dd_per_prio *per_prio = &dd->per_prio[prio];                \
                                                                        \
        return seq_list_next(v, &per_prio->fifo_list[data_dir], pos);        \
}                                                                        \
                                                                        \
static void deadline_##name##_fifo_stop(struct seq_file *m, void *v)        \
        __releases(&dd->lock)                                                \
{                                                                        \
        struct request_queue *q = m->private;                                \
        struct deadline_data *dd = q->elevator->elevator_data;                \
                                                                        \
        spin_unlock(&dd->lock);                                                \
}                                                                        \
                                                                        \
static const struct seq_operations deadline_##name##_fifo_seq_ops = {        \
        .start        = deadline_##name##_fifo_start,                                \
        .next        = deadline_##name##_fifo_next,                                \
        .stop        = deadline_##name##_fifo_stop,                                \
        .show        = blk_mq_debugfs_rq_show,                                \
};                                                                        \
                                                                        \
static int deadline_##name##_next_rq_show(void *data,                        \
                                          struct seq_file *m)                \
{                                                                        \
        struct request_queue *q = data;                                        \
        struct deadline_data *dd = q->elevator->elevator_data;                \
        struct dd_per_prio *per_prio = &dd->per_prio[prio];                \
        struct request *rq;                                                \
                                                                        \
        spin_lock(&dd->lock);                                                \
        rq = deadline_from_pos(per_prio, data_dir,                        \
                               per_prio->latest_pos[data_dir]);                \
        if (rq)                                                                \
                __blk_mq_debugfs_rq_show(m, rq);                        \
        spin_unlock(&dd->lock);                                                \
        return 0;                                                        \
}

DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2);
#undef DEADLINE_DEBUGFS_DDIR_ATTRS

static int deadline_batching_show(void *data, struct seq_file *m)
{
        struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;

        seq_printf(m, "%u\n", dd->batching);
        return 0;
}

static int deadline_starved_show(void *data, struct seq_file *m)
{
        struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;

        seq_printf(m, "%u\n", dd->starved);
        return 0;
}

static int dd_async_depth_show(void *data, struct seq_file *m)
{
        struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;

        seq_printf(m, "%u\n", dd->async_depth);
        return 0;
}

static int dd_queued_show(void *data, struct seq_file *m)
{
        struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;
        u32 rt, be, idle;

        spin_lock(&dd->lock);
        rt = dd_queued(dd, DD_RT_PRIO);
        be = dd_queued(dd, DD_BE_PRIO);
        idle = dd_queued(dd, DD_IDLE_PRIO);
        spin_unlock(&dd->lock);

        seq_printf(m, "%u %u %u\n", rt, be, idle);

        return 0;
}

/*
 * Number of requests of priority @prio that are owned by the block driver:
 * the requests that have been dispatched or merged minus the requests that
 * have completed. Must be called with dd->lock held.
 */
static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
{
        const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
        u32 left_scheduler, nr_completed;

        lockdep_assert_held(&dd->lock);

        left_scheduler = stats->dispatched + stats->merged;
        nr_completed = atomic_read(&stats->completed);

        return left_scheduler - nr_completed;
}

/*
 * debugfs "owned_by_driver" attribute: print dd_owned_by_driver() for the RT,
 * BE and idle priority levels, in that order, space separated on one line.
 */
static int dd_owned_by_driver_show(void *data, struct seq_file *m)
{
        struct request_queue *queue = data;
        struct deadline_data *sched = queue->elevator->elevator_data;
        u32 nr_rt, nr_be, nr_idle;

        /* dd_owned_by_driver() asserts that the lock is held. */
        spin_lock(&sched->lock);
        nr_rt = dd_owned_by_driver(sched, DD_RT_PRIO);
        nr_be = dd_owned_by_driver(sched, DD_BE_PRIO);
        nr_idle = dd_owned_by_driver(sched, DD_IDLE_PRIO);
        spin_unlock(&sched->lock);

        seq_printf(m, "%u %u %u\n", nr_rt, nr_be, nr_idle);

        return 0;
}

/*
 * DEADLINE_DISPATCH_ATTR(prio) - generate a seq_file iterator over the
 * dispatch list of priority level @prio.
 *
 * For each @prio this emits deadline_dispatch<prio>_{start,next,stop} and the
 * seq_operations table deadline_dispatch<prio>_seq_ops that ties them together
 * with blk_mq_debugfs_rq_show() as the ->show callback.
 *
 * Locking: _start takes dd->lock before positioning on the list and _stop
 * releases it, so the lock is held from _start until the matching _stop.
 * @prio is used directly as an index into dd->per_prio[].
 */
#define DEADLINE_DISPATCH_ATTR(prio)                                        \
static void *deadline_dispatch##prio##_start(struct seq_file *m,        \
                                             loff_t *pos)                \
        __acquires(&dd->lock)                                                \
{                                                                        \
        struct request_queue *q = m->private;                                \
        struct deadline_data *dd = q->elevator->elevator_data;                \
        struct dd_per_prio *per_prio = &dd->per_prio[prio];                \
                                                                        \
        spin_lock(&dd->lock);                                                \
        return seq_list_start(&per_prio->dispatch, *pos);                \
}                                                                        \
                                                                        \
static void *deadline_dispatch##prio##_next(struct seq_file *m,                \
                                            void *v, loff_t *pos)        \
{                                                                        \
        struct request_queue *q = m->private;                                \
        struct deadline_data *dd = q->elevator->elevator_data;                \
        struct dd_per_prio *per_prio = &dd->per_prio[prio];                \
                                                                        \
        return seq_list_next(v, &per_prio->dispatch, pos);                \
}                                                                        \
                                                                        \
static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v)        \
        __releases(&dd->lock)                                                \
{                                                                        \
        struct request_queue *q = m->private;                                \
        struct deadline_data *dd = q->elevator->elevator_data;                \
                                                                        \
        spin_unlock(&dd->lock);                                                \
}                                                                        \
                                                                        \
static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \
        .start        = deadline_dispatch##prio##_start,                        \
        .next        = deadline_dispatch##prio##_next,                        \
        .stop        = deadline_dispatch##prio##_stop,                        \
        .show        = blk_mq_debugfs_rq_show,                                \
}

/* One iterator per priority level index used by the debugfs table. */
DEADLINE_DISPATCH_ATTR(0);
DEADLINE_DISPATCH_ATTR(1);
DEADLINE_DISPATCH_ATTR(2);
#undef DEADLINE_DISPATCH_ATTR

/*
 * Per-queue debugfs attribute table for mq-deadline.
 *
 * DEADLINE_QUEUE_DDIR_ATTRS(name) yields a "<name>_fifo_list" entry backed by
 * the seq_operations deadline_<name>_fifo_seq_ops, and
 * DEADLINE_NEXT_RQ_ATTR(name) yields a "<name>_next_rq" entry backed by the
 * show callback deadline_<name>_next_rq_show.  All entries are created with
 * mode 0400 and the table ends with an empty sentinel entry.
 *
 * NOTE(review): only DEADLINE_QUEUE_DDIR_ATTRS is #undef'ed after the table;
 * DEADLINE_NEXT_RQ_ATTR stays defined for the rest of the file.
 */
#define DEADLINE_QUEUE_DDIR_ATTRS(name)                                        \
        {#name "_fifo_list", 0400,                                        \
                        .seq_ops = &deadline_##name##_fifo_seq_ops}
#define DEADLINE_NEXT_RQ_ATTR(name)                                        \
        {#name "_next_rq", 0400, deadline_##name##_next_rq_show}
static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
        DEADLINE_QUEUE_DDIR_ATTRS(read0),
        DEADLINE_QUEUE_DDIR_ATTRS(write0),
        DEADLINE_QUEUE_DDIR_ATTRS(read1),
        DEADLINE_QUEUE_DDIR_ATTRS(write1),
        DEADLINE_QUEUE_DDIR_ATTRS(read2),
        DEADLINE_QUEUE_DDIR_ATTRS(write2),
        DEADLINE_NEXT_RQ_ATTR(read0),
        DEADLINE_NEXT_RQ_ATTR(write0),
        DEADLINE_NEXT_RQ_ATTR(read1),
        DEADLINE_NEXT_RQ_ATTR(write1),
        DEADLINE_NEXT_RQ_ATTR(read2),
        DEADLINE_NEXT_RQ_ATTR(write2),
        {"batching", 0400, deadline_batching_show},
        {"starved", 0400, deadline_starved_show},
        {"async_depth", 0400, dd_async_depth_show},
        {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops},
        {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops},
        {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops},
        {"owned_by_driver", 0400, dd_owned_by_driver_show},
        {"queued", 0400, dd_queued_show},
        {},
};
#undef DEADLINE_QUEUE_DDIR_ATTRS
#endif

/*
 * Elevator type descriptor for the mq-deadline scheduler.  It wires the dd_*
 * callbacks into the elevator operations table, attaches the attribute table
 * (and, with CONFIG_BLK_DEBUG_FS, the debugfs attribute table) and names the
 * scheduler "mq-deadline" with the alias "deadline".  It is handed to
 * elv_register()/elv_unregister() by the module init/exit functions.
 */
static struct elevator_type mq_deadline = {
        .ops = {
                .depth_updated                = dd_depth_updated,
                .limit_depth                = dd_limit_depth,
                .insert_requests        = dd_insert_requests,
                .dispatch_request        = dd_dispatch_request,
                .prepare_request        = dd_prepare_request,
                .finish_request                = dd_finish_request,
                .next_request                = elv_rb_latter_request,
                .former_request                = elv_rb_former_request,
                .bio_merge                = dd_bio_merge,
                .request_merge                = dd_request_merge,
                .requests_merged        = dd_merged_requests,
                .request_merged                = dd_request_merged,
                .has_work                = dd_has_work,
                .init_sched                = dd_init_sched,
                .exit_sched                = dd_exit_sched,
                .init_hctx                = dd_init_hctx,
        },

#ifdef CONFIG_BLK_DEBUG_FS
        .queue_debugfs_attrs = deadline_queue_debugfs_attrs,
#endif
        .elevator_attrs = deadline_attrs,
        .elevator_name = "mq-deadline",
        .elevator_alias = "deadline",
        .elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");

/*
 * Module entry point: register the mq-deadline elevator type and hand the
 * registration result back to the module loader.
 */
static int __init deadline_init(void)
{
        int ret = elv_register(&mq_deadline);

        return ret;
}

/* Module exit point: unregister the mq-deadline elevator type. */
static void __exit deadline_exit(void)
{
        elv_unregister(&mq_deadline);
}

/* Hook the init/exit functions into the module loader and declare metadata. */
module_init(deadline_init);
module_exit(deadline_exit);

MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ deadline IO scheduler");

































    6 




























    1 



    3 


    6 






















































































































































































































































































































































    6 





    7 







































































































































































































































































































































































































































































































































































































































    3 
    3 

















    2 
































    2 












    2 


























































    2 





    2 







































































































































































































    2 

    2 










    3 













    1 


    1 


    1 













1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
// SPDX-License-Identifier: GPL-2.0-only
/*
  File: fs/xattr.c

  Extended attribute handling.

  Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
  Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
  Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fsnotify.h>
#include <linux/audit.h>
#include <linux/vmalloc.h>
#include <linux/posix_acl_xattr.h>

#include <linux/uaccess.h>

#include "internal.h"

/*
 * If @a starts with @a_prefix, return a pointer to the first character of @a
 * after the prefix (which may be the terminating NUL); otherwise return NULL.
 * An empty @a_prefix matches every string and returns @a itself.
 */
static const char *
strcmp_prefix(const char *a, const char *a_prefix)
{
        for (; *a_prefix != '\0'; a++, a_prefix++) {
                /* Covers both a differing character and @a ending early. */
                if (*a != *a_prefix)
                        return NULL;
        }

        return a;
}

/*
 * In order to implement different sets of xattr operations for each xattr
 * prefix, a filesystem should create a null-terminated array of struct
 * xattr_handler (one for each prefix) and hang a pointer to it off of the
 * s_xattr field of the superblock.
 *
 * for_each_xattr_handler() walks such an array, assigning each entry to
 * @handler until the NULL terminator is reached; a NULL @handlers pointer
 * yields no iterations.  The macro post-increments @handlers itself, so the
 * caller's pointer is consumed by the walk.  It expands to an unbraced
 * "if (...) for (...)", so it must not be followed by an "else".
 */
#define for_each_xattr_handler(handlers, handler)                \
        if (handlers)                                                \
                for ((handler) = *(handlers)++;                        \
                        (handler) != NULL;                        \
                        (handler) = *(handlers)++)

/*
 * Find the xattr_handler responsible for *@name on @inode.
 *
 * On success the handler is returned and *@name is advanced past the part of
 * the name the handler matched.  A handler with a NULL ->prefix only matches
 * when nothing is left after its match; a handler with a prefix requires a
 * non-empty remainder, and a name consisting of just such a prefix is
 * rejected with -EINVAL.
 *
 * Returns ERR_PTR(-EIO) for a bad inode without IOP_XATTR,
 * ERR_PTR(-EOPNOTSUPP) when xattrs are unsupported or no handler matches.
 */
static const struct xattr_handler *
xattr_resolve_name(struct inode *inode, const char **name)
{
        const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
        const struct xattr_handler *handler;

        if (!(inode->i_opflags & IOP_XATTR)) {
                if (unlikely(is_bad_inode(inode)))
                        return ERR_PTR(-EIO);
                return ERR_PTR(-EOPNOTSUPP);
        }

        for_each_xattr_handler(handlers, handler) {
                const char *rest = strcmp_prefix(*name, xattr_prefix(handler));

                if (!rest)
                        continue;

                /* Prefix-less handler <=> empty remainder, else mismatch. */
                if (!handler->prefix != !*rest) {
                        if (*rest)
                                continue;
                        return ERR_PTR(-EINVAL);
                }

                *name = rest;
                return handler;
        }

        return ERR_PTR(-EOPNOTSUPP);
}

/**
 * may_write_xattr - check whether inode allows writing xattr
 * @idmap: idmap of the mount the inode was found from
 * @inode: the inode on which to set an xattr
 *
 * Extended attributes can never be set or removed on an immutable or
 * append-only inode.  The inode must also have a mapping in the mount so
 * that invalid i_{g,u}id values are not written back.
 *
 * Return: On success zero is returned. On error a negative errno is returned
 * (-EPERM for each of the conditions above).
 */
int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode)
{
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode) ||
            HAS_UNMAPPED_ID(idmap, inode))
                return -EPERM;

        return 0;
}

/*
 * Check permissions for extended attribute access.  The rules depend on the
 * namespace the attribute name falls into:
 *
 *  - security.* and system.*: no VFS restriction; the decision is left to
 *    the underlying filesystem / security module.
 *  - trusted.*: CAP_SYS_ADMIN only.
 *  - user.*: regular files and directories only; on sticky directories
 *    writes additionally require the owner or a privileged user.
 *
 * Writes are first vetted by may_write_xattr().  Anything not decided above
 * falls through to inode_permission().
 */
static int
xattr_permission(struct mnt_idmap *idmap, struct inode *inode,
                 const char *name, int mask)
{
        const int want_write = mask & MAY_WRITE;

        if (want_write) {
                int err = may_write_xattr(idmap, inode);

                if (err)
                        return err;
        }

        if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
                return 0;
        if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
                return 0;

        if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
                if (capable(CAP_SYS_ADMIN))
                        return 0;
                /* Unprivileged readers see "no such attribute". */
                return want_write ? -EPERM : -ENODATA;
        }

        if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
                if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
                        return want_write ? -EPERM : -ENODATA;

                if (want_write && S_ISDIR(inode->i_mode) &&
                    (inode->i_mode & S_ISVTX) &&
                    !inode_owner_or_capable(idmap, inode))
                        return -EPERM;
        }

        return inode_permission(idmap, inode, mask);
}

/*
 * Look for any handler whose prefix begins with the user.* namespace prefix.
 *
 * Returns 0 if the filesystem of @inode has such a handler, -EIO if @inode
 * lacks IOP_XATTR and is a bad inode, and -EOPNOTSUPP otherwise.
 */
int
xattr_supports_user_prefix(struct inode *inode)
{
        const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
        const struct xattr_handler *handler;

        if (!(inode->i_opflags & IOP_XATTR))
                return unlikely(is_bad_inode(inode)) ? -EIO : -EOPNOTSUPP;

        for_each_xattr_handler(handlers, handler) {
                const char *prefix = xattr_prefix(handler);

                if (strncmp(prefix, XATTR_USER_PREFIX,
                            XATTR_USER_PREFIX_LEN) == 0)
                        return 0;
        }

        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(xattr_supports_user_prefix);

/*
 * Set xattr @name on @inode through the matching handler's ->set() method.
 *
 * POSIX ACL names are refused with -EOPNOTSUPP, as are handlers without a
 * ->set() method.  A zero @size is passed on with an empty, non-NULL value so
 * that it stores an empty attribute instead of removing it.
 */
int
__vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
               struct inode *inode, const char *name, const void *value,
               size_t size, int flags)
{
        const struct xattr_handler *h;

        if (is_posix_acl_xattr(name))
                return -EOPNOTSUPP;

        h = xattr_resolve_name(inode, &name);
        if (IS_ERR(h))
                return PTR_ERR(h);
        if (!h->set)
                return -EOPNOTSUPP;

        if (!size)
                value = "";  /* empty EA, do not remove */

        return h->set(h, idmap, dentry, inode, name, value, size, flags);
}
EXPORT_SYMBOL(__vfs_setxattr);

/**
 *  __vfs_setxattr_noperm - perform setxattr operation without performing
 *  permission checks.
 *
 *  @idmap: idmap of the mount the inode was found from
 *  @dentry: object to perform setxattr on
 *  @name: xattr name to set
 *  @value: value to set @name to
 *  @size: size of @value
 *  @flags: flags to pass into filesystem operations
 *
 *  returns the result of the internal setxattr or setsecurity operations.
 *
 *  If the inode has no xattr support (or the filesystem operation itself
 *  returns -EAGAIN), security.* names fall back to
 *  security_inode_setsecurity(); all other names yield -EOPNOTSUPP.
 *
 *  This function requires the caller to lock the inode's i_mutex before it
 *  is executed. It also assumes that the caller will make the appropriate
 *  permission checks.
 */
int __vfs_setxattr_noperm(struct mnt_idmap *idmap,
                          struct dentry *dentry, const char *name,
                          const void *value, size_t size, int flags)
{
        struct inode *inode = dentry->d_inode;
        const int is_security = !strncmp(name, XATTR_SECURITY_PREFIX,
                                         XATTR_SECURITY_PREFIX_LEN);
        int error = -EAGAIN;

        if (is_security)
                inode->i_flags &= ~S_NOSEC;

        if (!(inode->i_opflags & IOP_XATTR)) {
                if (unlikely(is_bad_inode(inode)))
                        return -EIO;
        } else {
                error = __vfs_setxattr(idmap, dentry, inode, name, value,
                                       size, flags);
                if (error == 0) {
                        fsnotify_xattr(dentry);
                        security_inode_post_setxattr(dentry, name, value,
                                                     size, flags);
                }
        }

        /* Anything but -EAGAIN is final. */
        if (error != -EAGAIN)
                return error;

        if (!is_security)
                return -EOPNOTSUPP;

        /* Hand the name without its "security." prefix to the LSM. */
        error = security_inode_setsecurity(inode,
                                           name + XATTR_SECURITY_PREFIX_LEN,
                                           value, size, flags);
        if (error == 0)
                fsnotify_xattr(dentry);

        return error;
}

/**
 * __vfs_setxattr_locked - set an extended attribute while holding the inode
 * lock
 *
 *  @idmap: idmap of the mount of the target inode
 *  @dentry: object to perform setxattr on
 *  @name: xattr name to set
 *  @value: value to set @name to
 *  @size: size of @value
 *  @flags: flags to pass into filesystem operations
 *  @delegated_inode: on return, will contain an inode pointer that
 *  a delegation was broken on, NULL if none.
 *
 *  Runs, in order, the VFS permission check, the security hook and the
 *  delegation break, and only then performs the actual set.  The first
 *  failing step determines the return value.
 */
int
__vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry,
                      const char *name, const void *value, size_t size,
                      int flags, struct inode **delegated_inode)
{
        struct inode *inode = dentry->d_inode;
        int ret;

        ret = xattr_permission(idmap, inode, name, MAY_WRITE);
        if (ret)
                return ret;

        ret = security_inode_setxattr(idmap, dentry, name, value, size,
                                      flags);
        if (ret)
                return ret;

        ret = try_break_deleg(inode, delegated_inode);
        if (ret)
                return ret;

        return __vfs_setxattr_noperm(idmap, dentry, name, value, size, flags);
}
EXPORT_SYMBOL_GPL(__vfs_setxattr_locked);

/*
 * Set xattr @name on @dentry, taking the inode lock and retrying after a
 * delegation break.
 *
 * For a non-empty XATTR_NAME_CAPS value, cap_convert_nscap() may replace
 * @value with a converted buffer and yields the new size; such a replacement
 * buffer is freed here before returning.
 */
int
vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
             const char *name, const void *value, size_t size, int flags)
{
        struct inode *inode = dentry->d_inode;
        struct inode *delegated_inode = NULL;
        const void *caller_value = value;
        int error;

        if (size && !strcmp(name, XATTR_NAME_CAPS)) {
                error = cap_convert_nscap(idmap, dentry, &value, size);
                if (error < 0)
                        return error;
                size = error;
        }

        for (;;) {
                inode_lock(inode);
                error = __vfs_setxattr_locked(idmap, dentry, name, value,
                                              size, flags, &delegated_inode);
                inode_unlock(inode);

                if (!delegated_inode)
                        break;

                /* Wait for the delegation break, then try again. */
                error = break_deleg_wait(&delegated_inode);
                if (error)
                        break;
        }

        if (value != caller_value)
                kfree(value);

        return error;
}
EXPORT_SYMBOL_GPL(vfs_setxattr);

/*
 * Fetch security attribute @name (already stripped of its namespace prefix
 * by the caller) from the security module.
 *
 * With no destination buffer (@value NULL or @size zero) the security module
 * is queried without allocation and its result is returned directly.
 * Otherwise the value is copied into @value; -ERANGE is returned if it does
 * not fit in @size bytes.
 */
static ssize_t
xattr_getsecurity(struct mnt_idmap *idmap, struct inode *inode,
                  const char *name, void *value, size_t size)
{
        void *buffer = NULL;
        ssize_t len;

        if (!value || !size)
                return security_inode_getsecurity(idmap, inode, name,
                                                  &buffer, false);

        len = security_inode_getsecurity(idmap, inode, name, &buffer,
                                         true);
        if (len < 0)
                return len;

        if (size < len)
                len = -ERANGE;
        else
                memcpy(value, buffer, len);

        kfree(buffer);
        return len;
}

/*
 * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr
 *
 * Probe the attribute's size first; if *@xattr_value is NULL or @xattr_size
 * is too small, (re)allocate a zeroed buffer one byte larger than the value
 * before retrieving it.  The xattr value buffer should always be freed by
 * the caller, even on error.
 *
 * Returns the result of alloc, if failed, or the getxattr operation.
 */
int
vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry,
                   const char *name, char **xattr_value, size_t xattr_size,
                   gfp_t flags)
{
        const struct xattr_handler *handler;
        struct inode *inode = dentry->d_inode;
        char *buf = *xattr_value;
        int len, error;

        error = xattr_permission(idmap, inode, name, MAY_READ);
        if (error)
                return error;

        handler = xattr_resolve_name(inode, &name);
        if (IS_ERR(handler))
                return PTR_ERR(handler);
        if (!handler->get)
                return -EOPNOTSUPP;

        /* Size query: no buffer, zero length. */
        len = handler->get(handler, dentry, inode, name, NULL, 0);
        if (len < 0)
                return len;

        if (!buf || (len > xattr_size)) {
                buf = krealloc(*xattr_value, len + 1, flags);
                if (!buf)
                        return -ENOMEM;
                memset(buf, 0, len + 1);
        }

        error = handler->get(handler, dentry, inode, name, buf, len);
        *xattr_value = buf;
        return error;
}

/*
 * Read xattr @name of @inode through the matching handler's ->get() method.
 * POSIX ACL names and handlers without ->get() yield -EOPNOTSUPP.
 */
ssize_t
__vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name,
               void *value, size_t size)
{
        const struct xattr_handler *h;

        if (is_posix_acl_xattr(name))
                return -EOPNOTSUPP;

        h = xattr_resolve_name(inode, &name);
        if (IS_ERR(h))
                return PTR_ERR(h);

        if (!h->get)
                return -EOPNOTSUPP;

        return h->get(h, dentry, inode, name, value, size);
}
EXPORT_SYMBOL(__vfs_getxattr);

/*
 * Read xattr @name of @dentry after the VFS permission check and the
 * security hook.  security.* names are offered to the security module first;
 * the filesystem handler is used for all other names, and for security.*
 * names when the security module answers -EOPNOTSUPP.
 */
ssize_t
vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry,
             const char *name, void *value, size_t size)
{
        struct inode *inode = dentry->d_inode;
        int error;

        error = xattr_permission(idmap, inode, name, MAY_READ);
        if (error)
                return error;

        error = security_inode_getxattr(dentry);
        if (error)
                return error;

        if (!strncmp(name, XATTR_SECURITY_PREFIX,
                     XATTR_SECURITY_PREFIX_LEN)) {
                int ret = xattr_getsecurity(idmap, inode,
                                            name + XATTR_SECURITY_PREFIX_LEN,
                                            value, size);

                /*
                 * Only overwrite the return value if a security module
                 * is actually active.
                 */
                if (ret != -EOPNOTSUPP)
                        return ret;
        }

        return __vfs_getxattr(dentry, inode, name, value, size);
}
EXPORT_SYMBOL_GPL(vfs_getxattr);

/**
 * vfs_listxattr - retrieve \0 separated list of xattr names
 * @dentry: the dentry from whose inode the xattr names are retrieved
 * @list: buffer to store xattr names into
 * @size: size of the buffer
 *
 * This function returns the names of all xattrs associated with the
 * inode of @dentry.
 *
 * Note, for legacy reasons the vfs_listxattr() function lists POSIX
 * ACLs as well. Since POSIX ACLs are decoupled from IOP_XATTR the
 * vfs_listxattr() function doesn't check for this flag since a
 * filesystem could implement POSIX ACLs without implementing any other
 * xattrs.
 *
 * This is safe because all codepaths that remove IOP_XATTR also assign
 * inode operations that either don't implement ->listxattr() or implement
 * it as a stub.
 *
 * If the inode has no ->listxattr() operation, the list is obtained from
 * the security module instead, and -ERANGE is returned when a non-zero
 * @size is too small for it.
 *
 * Return: On success, the size of the buffer that was used. On error a
 *         negative error code.
 */
ssize_t
vfs_listxattr(struct dentry *dentry, char *list, size_t size)
{
        struct inode *inode = d_inode(dentry);
        ssize_t error;

        error = security_inode_listxattr(dentry);
        if (error)
                return error;

        if (inode->i_op->listxattr) {
                error = inode->i_op->listxattr(dentry, list, size);
        } else {
                error = security_inode_listsecurity(inode, list, size);
                /*
                 * Only a positive length can overflow the buffer.  Check
                 * the sign first: comparing a negative ssize_t with the
                 * unsigned @size would convert it to a huge value and
                 * replace the real error code with -ERANGE.
                 */
                if (size && error > 0 && (size_t)error > size)
                        error = -ERANGE;
        }
        return error;
}
EXPORT_SYMBOL_GPL(vfs_listxattr);

/*
 * Remove xattr @name from @dentry's inode.  Removal is expressed as a call
 * to the handler's ->set() method with a NULL value, zero size and
 * XATTR_REPLACE.  POSIX ACL names and handlers without ->set() yield
 * -EOPNOTSUPP.
 */
int
__vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry,
                  const char *name)
{
        struct inode *inode = d_inode(dentry);
        const struct xattr_handler *h;

        if (is_posix_acl_xattr(name))
                return -EOPNOTSUPP;

        h = xattr_resolve_name(inode, &name);
        if (IS_ERR(h))
                return PTR_ERR(h);

        if (!h->set)
                return -EOPNOTSUPP;

        return h->set(h, idmap, dentry, inode, name, NULL, 0, XATTR_REPLACE);
}
EXPORT_SYMBOL(__vfs_removexattr);

/**
 * __vfs_removexattr_locked - remove an extended attribute while holding the
 * inode lock
 *
 *  @idmap: idmap of the mount of the target inode
 *  @dentry: object to perform removexattr on
 *  @name: name of xattr to remove
 *  @delegated_inode: on return, will contain an inode pointer that
 *  a delegation was broken on, NULL if none.
 *
 *  Runs, in order, the VFS permission check, the security hook and the
 *  delegation break before removing the attribute.  On success the change
 *  is reported via fsnotify and the post-removexattr security hook.
 */
int
__vfs_removexattr_locked(struct mnt_idmap *idmap,
                         struct dentry *dentry, const char *name,
                         struct inode **delegated_inode)
{
        struct inode *inode = dentry->d_inode;
        int ret;

        ret = xattr_permission(idmap, inode, name, MAY_WRITE);
        if (ret)
                return ret;

        ret = security_inode_removexattr(idmap, dentry, name);
        if (ret)
                return ret;

        ret = try_break_deleg(inode, delegated_inode);
        if (ret)
                return ret;

        ret = __vfs_removexattr(idmap, dentry, name);
        if (ret)
                return ret;

        fsnotify_xattr(dentry);
        security_inode_post_removexattr(dentry, name);

        return 0;
}
EXPORT_SYMBOL_GPL(__vfs_removexattr_locked);

/*
 * Remove xattr @name from @dentry, taking the inode lock and retrying the
 * operation after a delegation break has completed successfully.
 */
int
vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry,
                const char *name)
{
        struct inode *inode = dentry->d_inode;
        struct inode *delegated_inode = NULL;
        int error;

        for (;;) {
                inode_lock(inode);
                error = __vfs_removexattr_locked(idmap, dentry,
                                                 name, &delegated_inode);
                inode_unlock(inode);

                if (!delegated_inode)
                        break;

                /* Wait for the delegation break, then try again. */
                error = break_deleg_wait(&delegated_inode);
                if (error)
                        break;
        }

        return error;
}
EXPORT_SYMBOL_GPL(vfs_removexattr);

/*
 * Extended attribute SET operations
 */

/*
 * Validate @ctx->flags, then copy the attribute name and (when @ctx->size is
 * non-zero) the attribute value from user space into @ctx.
 *
 * Returns 0 on success, -EINVAL for unknown flag bits, -ERANGE for an empty
 * name or one that fills the whole name buffer, -E2BIG for a value larger
 * than XATTR_SIZE_MAX, or the error reported by the user-copy helpers.
 * @ctx->kvalue is reset to NULL when duplicating the value fails.
 */
int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
{
        int len;

        if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
                return -EINVAL;

        len = strncpy_from_user(ctx->kname->name, name,
                                sizeof(ctx->kname->name));
        if (len < 0)
                return len;
        if (len == 0 || len == sizeof(ctx->kname->name))
                return -ERANGE;

        /* Nothing more to copy when no value was supplied. */
        if (!ctx->size)
                return 0;

        if (ctx->size > XATTR_SIZE_MAX)
                return -E2BIG;

        ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size);
        if (!IS_ERR(ctx->kvalue))
                return 0;

        len = PTR_ERR(ctx->kvalue);
        ctx->kvalue = NULL;
        return len;
}

/*
 * Apply the name/value pair prepared in @ctx to @dentry. Names recognised by
 * is_posix_acl_xattr() are handed to do_set_acl(); everything else goes
 * through vfs_setxattr() together with @ctx->flags.
 */
int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
                struct xattr_ctx *ctx)
{
        const char *name = ctx->kname->name;

        if (!is_posix_acl_xattr(name))
                return vfs_setxattr(idmap, dentry, name, ctx->kvalue,
                                    ctx->size, ctx->flags);

        return do_set_acl(idmap, dentry, name, ctx->kvalue, ctx->size);
}

/*
 * Common body of the setxattr syscalls: copy the name and value in from user
 * space, set the attribute on @d, and release the kernel copy of the value.
 */
static long
setxattr(struct mnt_idmap *idmap, struct dentry *d,
        const char __user *name, const void __user *value, size_t size,
        int flags)
{
        struct xattr_name kname;
        struct xattr_ctx ctx = {
                .kname    = &kname,
                .cvalue   = value,
                .kvalue   = NULL,
                .size     = size,
                .flags    = flags,
        };
        int error = setxattr_copy(name, &ctx);

        /* On a failed copy there is nothing to apply and nothing to free. */
        if (!error) {
                error = do_setxattr(idmap, d, &ctx);
                kvfree(ctx.kvalue);
        }

        return error;
}

/*
 * Resolve @pathname relative to AT_FDCWD and set the xattr on the result
 * while holding write access to the mount. The whole sequence is repeated
 * with LOOKUP_REVAL added whenever retry_estale() asks for it.
 */
static int path_setxattr(const char __user *pathname,
                         const char __user *name, const void __user *value,
                         size_t size, int flags, unsigned int lookup_flags)
{
        struct path path;
        int error;

        for (;;) {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (error)
                        break;

                error = mnt_want_write(path.mnt);
                if (error == 0) {
                        error = setxattr(mnt_idmap(path.mnt), path.dentry,
                                         name, value, size, flags);
                        mnt_drop_write(path.mnt);
                }
                path_put(&path);

                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

/*
 * setxattr syscall: path-based entry point. Forwards all arguments to
 * path_setxattr() with LOOKUP_FOLLOW as the lookup flags.
 */
SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
                const char __user *, name, const void __user *, value,
                size_t, size, int, flags)
{
        return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW);
}

/*
 * lsetxattr syscall: same as setxattr but calls path_setxattr() with lookup
 * flags of 0, i.e. without LOOKUP_FOLLOW.
 */
SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
                const char __user *, name, const void __user *, value,
                size_t, size, int, flags)
{
        return path_setxattr(pathname, name, value, size, flags, 0);
}

/*
 * fsetxattr syscall: set an xattr on the file referred to by @fd. Returns
 * -EBADF when @fd does not resolve to a file; otherwise the file is audited,
 * write access to its mount is taken around the update, and the fd reference
 * is dropped before returning.
 */
SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
                const void __user *,value, size_t, size, int, flags)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        audit_file(f.file);

        error = mnt_want_write_file(f.file);
        if (error)
                goto out_put;

        error = setxattr(file_mnt_idmap(f.file), f.file->f_path.dentry,
                         name, value, size, flags);
        mnt_drop_write_file(f.file);

out_put:
        fdput(f);
        return error;
}

/*
 * Extended attribute GET operations
 */
ssize_t
do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
        struct xattr_ctx *ctx)
{
        ssize_t error;
        char *kname = ctx->kname->name;

        if (ctx->size) {
                if (ctx->size > XATTR_SIZE_MAX)
                        ctx->size = XATTR_SIZE_MAX;
                ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL);
                if (!ctx->kvalue)
                        return -ENOMEM;
        }

        if (is_posix_acl_xattr(ctx->kname->name))
                error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size);
        else
                error = vfs_getxattr(idmap, d, kname, ctx->kvalue, ctx->size);
        if (error > 0) {
                if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
                        error = -EFAULT;
        } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
                /* The file system tried to returned a value bigger
                   than XATTR_SIZE_MAX bytes. Not possible. */
                error = -E2BIG;
        }

        return error;
}

/*
 * Common body of the getxattr syscalls: copy the attribute name in from user
 * space, fetch the value via do_getxattr(), then free the kernel buffer.
 * An empty name, or one that fills the whole name buffer, yields -ERANGE.
 */
static ssize_t
getxattr(struct mnt_idmap *idmap, struct dentry *d,
         const char __user *name, void __user *value, size_t size)
{
        struct xattr_name kname;
        struct xattr_ctx ctx = {
                .kname    = &kname,
                .value    = value,
                .kvalue   = NULL,
                .size     = size,
                .flags    = 0,
        };
        ssize_t len, ret;

        len = strncpy_from_user(kname.name, name, sizeof(kname.name));
        if (len < 0)
                return len;
        if (len == 0 || len == sizeof(kname.name))
                return -ERANGE;

        ret = do_getxattr(idmap, d, &ctx);

        kvfree(ctx.kvalue);
        return ret;
}

/*
 * Resolve @pathname relative to AT_FDCWD and read the xattr from the result.
 * The lookup and read are repeated with LOOKUP_REVAL added whenever
 * retry_estale() asks for it.
 */
static ssize_t path_getxattr(const char __user *pathname,
                             const char __user *name, void __user *value,
                             size_t size, unsigned int lookup_flags)
{
        struct path path;
        ssize_t error;

        for (;;) {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (error)
                        break;

                error = getxattr(mnt_idmap(path.mnt), path.dentry, name,
                                 value, size);
                path_put(&path);

                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

/*
 * getxattr syscall: path-based entry point. Forwards all arguments to
 * path_getxattr() with LOOKUP_FOLLOW as the lookup flags.
 */
SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
                const char __user *, name, void __user *, value, size_t, size)
{
        return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW);
}

/*
 * lgetxattr syscall: same as getxattr but calls path_getxattr() with lookup
 * flags of 0, i.e. without LOOKUP_FOLLOW.
 */
SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
                const char __user *, name, void __user *, value, size_t, size)
{
        return path_getxattr(pathname, name, value, size, 0);
}

/*
 * fgetxattr syscall: read an xattr from the file referred to by @fd.
 * Returns -EBADF when @fd does not resolve to a file.
 */
SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
                void __user *, value, size_t, size)
{
        struct fd f = fdget(fd);
        ssize_t error;

        if (!f.file)
                return -EBADF;

        audit_file(f.file);
        error = getxattr(file_mnt_idmap(f.file), f.file->f_path.dentry,
                         name, value, size);

        fdput(f);
        return error;
}

/*
 * Extended attribute LIST operations
 */
/*
 * Common body of the listxattr syscalls: collect the attribute names of @d
 * in a kernel buffer and copy them out to @list.
 *
 * A non-zero @size is clamped to XATTR_LIST_MAX before the buffer is
 * allocated; with a zero @size no buffer is used and nothing is copied out.
 */
static ssize_t
listxattr(struct dentry *d, char __user *list, size_t size)
{
        char *klist = NULL;
        ssize_t ret;

        if (size) {
                if (size > XATTR_LIST_MAX)
                        size = XATTR_LIST_MAX;

                klist = kvmalloc(size, GFP_KERNEL);
                if (!klist)
                        return -ENOMEM;
        }

        ret = vfs_listxattr(d, klist, size);
        if (ret > 0) {
                if (size && copy_to_user(list, klist, ret))
                        ret = -EFAULT;
        } else if (ret == -ERANGE && size >= XATTR_LIST_MAX) {
                /*
                 * The file system tried to return a list bigger than
                 * XATTR_LIST_MAX bytes. Not possible.
                 */
                ret = -E2BIG;
        }

        kvfree(klist);
        return ret;
}

/*
 * Resolve @pathname relative to AT_FDCWD and list the xattr names of the
 * result. The lookup and listing are repeated with LOOKUP_REVAL added
 * whenever retry_estale() asks for it.
 */
static ssize_t path_listxattr(const char __user *pathname, char __user *list,
                              size_t size, unsigned int lookup_flags)
{
        struct path path;
        ssize_t error;

        for (;;) {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (error)
                        break;

                error = listxattr(path.dentry, list, size);
                path_put(&path);

                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

/*
 * listxattr syscall: path-based entry point. Forwards all arguments to
 * path_listxattr() with LOOKUP_FOLLOW as the lookup flags.
 */
SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
                size_t, size)
{
        return path_listxattr(pathname, list, size, LOOKUP_FOLLOW);
}

/*
 * llistxattr syscall: same as listxattr but calls path_listxattr() with
 * lookup flags of 0, i.e. without LOOKUP_FOLLOW.
 */
SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
                size_t, size)
{
        return path_listxattr(pathname, list, size, 0);
}

/*
 * flistxattr syscall: list the xattr names of the file referred to by @fd.
 * Returns -EBADF when @fd does not resolve to a file.
 */
SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
{
        struct fd f = fdget(fd);
        ssize_t error;

        if (!f.file)
                return -EBADF;

        audit_file(f.file);
        error = listxattr(f.file->f_path.dentry, list, size);

        fdput(f);
        return error;
}

/*
 * Extended attribute REMOVE operations
 */
/*
 * Common body of the removexattr syscalls: copy the attribute name in from
 * user space and remove it from @d. Names recognised by is_posix_acl_xattr()
 * are removed through vfs_remove_acl(), all others through vfs_removexattr().
 * An empty name, or one that fills the whole name buffer, yields -ERANGE.
 */
static long
removexattr(struct mnt_idmap *idmap, struct dentry *d,
            const char __user *name)
{
        char kname[XATTR_NAME_MAX + 1];
        int len;

        len = strncpy_from_user(kname, name, sizeof(kname));
        if (len < 0)
                return len;
        if (len == 0 || len == sizeof(kname))
                return -ERANGE;

        if (!is_posix_acl_xattr(kname))
                return vfs_removexattr(idmap, d, kname);

        return vfs_remove_acl(idmap, d, kname);
}

/*
 * Resolve @pathname relative to AT_FDCWD and remove the xattr from the
 * result while holding write access to the mount. The whole sequence is
 * repeated with LOOKUP_REVAL added whenever retry_estale() asks for it.
 */
static int path_removexattr(const char __user *pathname,
                            const char __user *name, unsigned int lookup_flags)
{
        struct path path;
        int error;

        for (;;) {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (error)
                        break;

                error = mnt_want_write(path.mnt);
                if (error == 0) {
                        error = removexattr(mnt_idmap(path.mnt), path.dentry,
                                            name);
                        mnt_drop_write(path.mnt);
                }
                path_put(&path);

                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        return error;
}

/*
 * removexattr syscall: path-based entry point. Forwards its arguments to
 * path_removexattr() with LOOKUP_FOLLOW as the lookup flags.
 */
SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
                const char __user *, name)
{
        return path_removexattr(pathname, name, LOOKUP_FOLLOW);
}

/*
 * lremovexattr syscall: same as removexattr but calls path_removexattr()
 * with lookup flags of 0, i.e. without LOOKUP_FOLLOW.
 */
SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
                const char __user *, name)
{
        return path_removexattr(pathname, name, 0);
}

/*
 * fremovexattr syscall: remove an xattr from the file referred to by @fd.
 * Returns -EBADF when @fd does not resolve to a file; otherwise write access
 * to the file's mount is taken around the removal and the fd reference is
 * dropped before returning.
 */
SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        audit_file(f.file);

        error = mnt_want_write_file(f.file);
        if (error)
                goto out_put;

        error = removexattr(file_mnt_idmap(f.file), f.file->f_path.dentry,
                            name);
        mnt_drop_write_file(f.file);

out_put:
        fdput(f);
        return error;
}

/*
 * Account for one attribute name (including its terminating NUL) in a
 * listxattr-style result.
 *
 * When *@buffer is non-NULL the name is copied there and *@buffer is moved
 * past it; -ERANGE is returned, with nothing modified, if fewer than
 * strlen(@name) + 1 bytes remain. When *@buffer is NULL nothing is copied.
 * On success *@remaining_size is reduced by the name's size in both cases,
 * and 0 is returned.
 */
int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
{
        const size_t need = strlen(name) + 1;
        char *dst = *buffer;

        if (dst) {
                if (*remaining_size < need)
                        return -ERANGE;

                memcpy(dst, name, need);
                *buffer = dst + need;
        }

        *remaining_size -= need;
        return 0;
}

/**
 * generic_listxattr - run through a dentry's xattr list() operations
 * @dentry: dentry to list the xattrs
 * @buffer: result buffer
 * @buffer_size: size of @buffer
 *
 * Combine the results of the list() operation from every xattr_handler in the
 * xattr_handler stack.
 *
 * Note that this will not include the entries for POSIX ACLs.
 */
ssize_t
generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
        const struct xattr_handler *h, * const *table = dentry->d_sb->s_xattr;
        ssize_t remaining_size = buffer_size;

        for_each_xattr_handler(table, h) {
                int err;

                /* Handlers without a name contribute no entry. */
                if (!h->name)
                        continue;
                /* A handler with a list() hook is skipped when the hook returns false. */
                if (h->list && !h->list(dentry))
                        continue;

                err = xattr_list_one(&buffer, &remaining_size, h->name);
                if (err)
                        return err;
        }

        /* Every failure returned above, so only the byte count is left. */
        return buffer_size - remaining_size;
}
EXPORT_SYMBOL(generic_listxattr);

/**
 * xattr_full_name  -  Compute full attribute name from suffix
 *
 * @handler:        handler of the xattr_handler operation
 * @name:        name passed to the xattr_handler operation
 *
 * The get and set xattr handler operations are called with the remainder of
 * the attribute name after skipping the handler's prefix: for example, "foo"
 * is passed to the get operation of a handler with prefix "user." to get
 * attribute "user.foo".  The full name is still "there" in the name though.
 *
 * Note: the list xattr handler operation when called from the vfs is passed a
 * NULL name; some file systems use this operation internally, with varying
 * semantics.
 */
const char *xattr_full_name(const struct xattr_handler *handler,
                            const char *name)
{
        /* Step back over the handler's prefix, which precedes @name in memory. */
        return name - strlen(xattr_prefix(handler));
}
EXPORT_SYMBOL(xattr_full_name);

/**
 * simple_xattr_space - estimate the memory used by a simple xattr
 * @name: the full name of the xattr
 * @size: the size of its value
 *
 * This takes no account of how much larger the two slab objects actually are:
 * that would depend on the slab implementation, when what is required is a
 * deterministic number, which grows with name length and size and quantity.
 *
 * Return: The approximate number of bytes of memory used by such an xattr.
 */
size_t simple_xattr_space(const char *name, size_t size)
{
        /*
         * A fixed 40 bytes stands in for sizeof(struct simple_xattr), so the
         * estimate is the same on 32-bit and 64-bit builds and stays put
         * even if simple_xattr grows.
         */
        const size_t fixed_overhead = 40;

        return fixed_overhead + strlen(name) + size;
}

/**
 * simple_xattr_free - free an xattr object
 * @xattr: the xattr object
 *
 * Free the xattr object. Can handle @xattr being NULL.
 */
void simple_xattr_free(struct simple_xattr *xattr)
{
        /* The name is freed on its own, and only when there is an object. */
        if (xattr)
                kfree(xattr->name);
        /* Called unconditionally, so it also sees a NULL @xattr. */
        kvfree(xattr);
}

/**
 * simple_xattr_alloc - allocate new xattr object
 * @value: value of the xattr object
 * @size: size of @value
 *
 * Allocate a new xattr object and initialize respective members. The caller is
 * responsible for handling the name of the xattr.
 *
 * Return: On success a new xattr object is returned. On failure NULL is
 * returned.
 */
struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
{
        const size_t header = sizeof(struct simple_xattr);
        const size_t total = header + size;
        struct simple_xattr *xattr;

        /* header + size wrapped around: refuse the request. */
        if (total < header)
                return NULL;

        xattr = kvmalloc(total, GFP_KERNEL_ACCOUNT);
        if (xattr) {
                /* The value is stored inline, right after the header. */
                memcpy(xattr->value, value, size);
                xattr->size = size;
        }

        return xattr;
}

/**
 * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry
 * @key: xattr name
 * @node: current node
 *
 * Compare the xattr name with the xattr name attached to @node in the rbtree.
 *
 * Return: Negative value if continuing left, positive if continuing right, 0
 * if the xattr attached to @node matches @key.
 */
static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node)
{
        const struct simple_xattr *entry =
                rb_entry(node, struct simple_xattr, rb_node);

        /* Operand order matters: the node's name is compared against @key. */
        return strcmp(entry->name, key);
}

/**
 * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes
 * @new_node: new node
 * @node: current node
 *
 * Compare the xattr attached to @new_node with the xattr attached to @node.
 *
 * Return: Negative value if continuing left, positive if continuing right, 0
 * if the xattr attached to @new_node matches the xattr attached to @node.
 */
static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node,
                                        const struct rb_node *node)
{
        struct simple_xattr *xattr;
        xattr = rb_entry(new_node, struct simple_xattr, rb_node);
        return rbtree_simple_xattr_cmp(xattr->name, node);
}

/**
 * simple_xattr_get - get an xattr object
 * @xattrs: the header of the xattr object
 * @name: the name of the xattr to retrieve
 * @buffer: the buffer to store the value into
 * @size: the size of @buffer
 *
 * Try to find and retrieve the xattr object associated with @name.
 * If @buffer is provided store the value of @xattr in @buffer
 * otherwise just return the length. The size of @buffer is limited
 * to XATTR_SIZE_MAX which currently is 65536.
 *
 * Return: On success the length of the xattr value is returned. On error a
 * negative error code is returned.
 */
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
                     void *buffer, size_t size)
{
        const struct simple_xattr *found;
        struct rb_node *node;
        int ret;

        read_lock(&xattrs->lock);

        node = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp);
        if (!node) {
                ret = -ENODATA;
                goto unlock;
        }

        found = rb_entry(node, struct simple_xattr, rb_node);
        if (buffer && size < found->size) {
                ret = -ERANGE;
                goto unlock;
        }

        /* A NULL @buffer is a pure length query: nothing is copied. */
        if (buffer)
                memcpy(buffer, found->value, found->size);
        ret = found->size;

unlock:
        read_unlock(&xattrs->lock);
        return ret;
}

/**
 * simple_xattr_set - set an xattr object
 * @xattrs: the header of the xattr object
 * @name: the name of the xattr to set
 * @value: the value to store along the xattr
 * @size: the size of @value
 * @flags: the flags determining how to set the xattr
 *
 * Set a new xattr object.
 * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE
 * is specified in @flags a matching xattr object for @name must already exist.
 * If it does it will be replaced with the new xattr object. If it doesn't we
 * fail. If XATTR_CREATE is specified and a matching xattr does already exist
 * we fail. If it doesn't we create a new xattr. If @flags is zero we simply
 * insert the new xattr replacing any existing one.
 *
 * If @value is empty and a matching xattr object is found we delete it if
 * XATTR_REPLACE is specified in @flags or @flags is zero.
 *
 * If @value is empty and no matching xattr object for @name is found we do
 * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For
 * XATTR_REPLACE we fail as mentioned above.
 *
 * Return: On success, the removed or replaced xattr is returned, to be freed
 * by the caller; or NULL if none. On failure a negative error code is returned.
 */
struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs,
                                      const char *name, const void *value,
                                      size_t size, int flags)
{
        struct simple_xattr *old_xattr = NULL, *new_xattr = NULL;
        struct rb_node *parent = NULL, **rbp;
        int err = 0, ret;

        /* value == NULL means remove */
        if (value) {
                /* Both allocations happen before the lock is taken. */
                new_xattr = simple_xattr_alloc(value, size);
                if (!new_xattr)
                        return ERR_PTR(-ENOMEM);

                new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
                if (!new_xattr->name) {
                        simple_xattr_free(new_xattr);
                        return ERR_PTR(-ENOMEM);
                }
        }

        write_lock(&xattrs->lock);
        /*
         * Walk down from the root looking for @name. @parent and @rbp track
         * the last node visited and the child link followed, so that on a
         * miss they describe where a new node would be linked in.
         */
        rbp = &xattrs->rb_root.rb_node;
        while (*rbp) {
                parent = *rbp;
                ret = rbtree_simple_xattr_cmp(name, *rbp);
                if (ret < 0)
                        rbp = &(*rbp)->rb_left;
                else if (ret > 0)
                        rbp = &(*rbp)->rb_right;
                else
                        old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node);
                if (old_xattr)
                        break;
        }

        if (old_xattr) {
                /* Fail if XATTR_CREATE is requested and the xattr exists. */
                if (flags & XATTR_CREATE) {
                        err = -EEXIST;
                        goto out_unlock;
                }

                /* Swap in the new node, or drop the old one when removing. */
                if (new_xattr)
                        rb_replace_node(&old_xattr->rb_node,
                                        &new_xattr->rb_node, &xattrs->rb_root);
                else
                        rb_erase(&old_xattr->rb_node, &xattrs->rb_root);
        } else {
                /* Fail if XATTR_REPLACE is requested but no xattr is found. */
                if (flags & XATTR_REPLACE) {
                        err = -ENODATA;
                        goto out_unlock;
                }

                /*
                 * If XATTR_CREATE or no flags are specified together with a
                 * new value simply insert it.
                 */
                if (new_xattr) {
                        rb_link_node(&new_xattr->rb_node, parent, rbp);
                        rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root);
                }

                /*
                 * If XATTR_CREATE or no flags are specified and neither an
                 * old or new xattr exist then we don't need to do anything.
                 */
        }

out_unlock:
        write_unlock(&xattrs->lock);
        /* Success: hand the displaced xattr (or NULL) back to the caller. */
        if (!err)
                return old_xattr;
        /* Failure: the new xattr was never linked in, so drop it (may be NULL). */
        simple_xattr_free(new_xattr);
        return ERR_PTR(err);
}

/*
 * Compare the first XATTR_TRUSTED_PREFIX_LEN bytes of @name against
 * XATTR_TRUSTED_PREFIX; true when they match.
 */
static bool xattr_is_trusted(const char *name)
{
        return strncmp(name, XATTR_TRUSTED_PREFIX,
                       XATTR_TRUSTED_PREFIX_LEN) == 0;
}

/**
 * simple_xattr_list - list all xattr objects
 * @inode: inode from which to get the xattrs
 * @xattrs: the header of the xattr object
 * @buffer: the buffer to store all xattrs into
 * @size: the size of @buffer
 *
 * List all xattrs associated with @inode. If @buffer is NULL we return
 * the required size of the buffer. If @buffer is provided we store the
 * xattr names into it provided it is big enough.
 *
 * Note, the number of xattr names that can be listed with listxattr(2) is
 * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed
 * then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names
 * are found it will return -E2BIG.
 *
 * Return: On success the required size or the size of the copied xattrs is
 * returned. On error a negative error code is returned.
 */
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
                          char *buffer, size_t size)
{
        bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
        ssize_t remaining_size = size;
        struct rb_node *node;
        int err;

        /* The POSIX ACL entries are listed first. */
        err = posix_acl_listxattr(inode, &buffer, &remaining_size);
        if (err)
                return err;

        read_lock(&xattrs->lock);
        for (node = rb_first(&xattrs->rb_root); node; node = rb_next(node)) {
                const struct simple_xattr *xattr =
                        rb_entry(node, struct simple_xattr, rb_node);

                /* "trusted." attributes are only listed for privileged callers */
                if (trusted || !xattr_is_trusted(xattr->name)) {
                        err = xattr_list_one(&buffer, &remaining_size,
                                             xattr->name);
                        if (err)
                                break;
                }
        }
        read_unlock(&xattrs->lock);

        if (err)
                return err;

        return size - remaining_size;
}

/**
 * rbtree_simple_xattr_less - compare two xattr rbtree nodes
 * @new_node: new node
 * @node: current node
 *
 * Compare the xattr attached to @new_node with the xattr attached to @node.
 * Note that this function technically tolerates duplicate entries.
 *
 * Return: True if insertion point in the rbtree is found.
 */
static bool rbtree_simple_xattr_less(struct rb_node *new_node,
                                     const struct rb_node *node)
{
        int order = rbtree_simple_xattr_node_cmp(new_node, node);

        /* Only a strictly negative comparison counts as "less". */
        return order < 0;
}

/**
 * simple_xattr_add - add xattr objects
 * @xattrs: the header of the xattr object
 * @new_xattr: the xattr object to add
 *
 * Add an xattr object to @xattrs. This assumes no replacement or removal
 * of matching xattrs is wanted. Should only be called during inode
 * initialization when a few distinct initial xattrs are supposed to be set.
 */
void simple_xattr_add(struct simple_xattrs *xattrs,
                      struct simple_xattr *new_xattr)
{
        /* Insert under the write lock, ordered by rbtree_simple_xattr_less(). */
        write_lock(&xattrs->lock);
        rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less);
        write_unlock(&xattrs->lock);
}

/**
 * simple_xattrs_init - initialize new xattr header
 * @xattrs: header to initialize
 *
 * Initialize relevant fields of an xattr header.
 */
void simple_xattrs_init(struct simple_xattrs *xattrs)
{
        /* Start with an empty tree and a freshly initialised rwlock. */
        xattrs->rb_root = RB_ROOT;
        rwlock_init(&xattrs->lock);
}

/**
 * simple_xattrs_free - free xattrs
 * @xattrs: xattr header whose xattrs to destroy
 * @freed_space: approximate number of bytes of memory freed from @xattrs
 *
 * Destroy all xattrs in @xattrs. When this is called no one can hold a
 * reference to any of the xattrs anymore.
 */
void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space)
{
        struct rb_node *node, *next;

        if (freed_space)
                *freed_space = 0;

        for (node = rb_first(&xattrs->rb_root); node; node = next) {
                struct simple_xattr *xattr =
                        rb_entry(node, struct simple_xattr, rb_node);

                /* Fetch the successor before this node leaves the tree. */
                next = rb_next(node);
                rb_erase(&xattr->rb_node, &xattrs->rb_root);

                if (freed_space)
                        *freed_space += simple_xattr_space(xattr->name,
                                                           xattr->size);
                simple_xattr_free(xattr);
        }
}
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 





























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
// SPDX-License-Identifier: GPL-2.0-only
/*
   Copyright (c) 2013-2014 Intel Corp.

*/

#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/module.h>
#include <linux/debugfs.h>

#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/pkt_sched.h>

#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>

#include <net/6lowpan.h> /* for the compression support */

#define VERSION "0.1"

static struct dentry *lowpan_enable_debugfs;
static struct dentry *lowpan_control_debugfs;

#define IFACE_NAME_TEMPLATE "bt%d"

/* Per-packet state this driver keeps in the skb control buffer (skb->cb). */
struct skb_cb {
        /* Only read by a BT_DBG() in bt_xmit() in the code visible here. */
        struct in6_addr addr;
        /* Next hop saved by peer_lookup_dst() when the skb has a route. */
        struct in6_addr gw;
        /* Channel picked by setup_header(); NULL for multicast packets. */
        struct l2cap_chan *chan;
};
/* View the control buffer of @skb as a struct skb_cb. */
#define lowpan_cb(skb) ((struct skb_cb *)((skb)->cb))

/* The devices list contains those devices for which we are acting
 * as a proxy. The BT 6LoWPAN device is a virtual device that
 * connects to the Bluetooth LE device. The real connection to
 * the BT device is done via the L2CAP layer. There is one
 * virtual device per BT 6LoWPAN network (= hciX device).
 * The list contains struct lowpan_btle_dev elements.
 */
static LIST_HEAD(bt_6lowpan_devices);
/* Taken around updates of bt_6lowpan_devices and of the per-device
 * peer lists in this file; readers walk the lists under RCU.
 */
static DEFINE_SPINLOCK(devices_lock);

/* Checked by is_bt_6lowpan() and bt_6lowpan_listen(); written by
 * do_enable_set().
 */
static bool enable_6lowpan;

/* We are listening incoming connections via this channel
 */
static struct l2cap_chan *listen_chan;
/* Held while listen_chan is closed and replaced. */
static DEFINE_MUTEX(set_lock);

/* One BT LE peer attached to a 6LoWPAN network device. */
struct lowpan_peer {
        struct list_head list;          /* entry in lowpan_btle_dev.peers */
        struct rcu_head rcu;            /* used by kfree_rcu() on removal */
        struct l2cap_chan *chan;        /* L2CAP channel to this peer */

        /* peer addresses in various formats */
        /* filled by baswap() from chan->dst in add_peer_chan() */
        unsigned char lladdr[ETH_ALEN];
        /* filled by lowpan_iphc_uncompress_eui48_lladdr() from lladdr */
        struct in6_addr peer_addr;
};

/* Driver-private state of one BT 6LoWPAN network device; it lives in the
 * priv area of the netdev's struct lowpan_dev (see lowpan_btle_dev()).
 */
struct lowpan_btle_dev {
        struct list_head list;          /* entry in bt_6lowpan_devices */

        struct hci_dev *hdev;           /* controller this netdev belongs to */
        struct net_device *netdev;      /* back pointer to the owning netdev */
        struct list_head peers;         /* list of struct lowpan_peer */
        atomic_t peer_count; /* number of items in peers list */

        struct work_struct delete_netdev;       /* runs delete_netdev() */
        struct delayed_work notify_peers;       /* runs do_notify_peers() */
};

/* Return the BT LE specific private data stored in the priv area of the
 * 6LoWPAN private data of @netdev.
 */
static inline struct lowpan_btle_dev *
lowpan_btle_dev(const struct net_device *netdev)
{
        struct lowpan_btle_dev *btle;

        btle = (struct lowpan_btle_dev *)lowpan_dev(netdev)->priv;

        return btle;
}

/* Link @peer into the RCU-protected peer list of @dev and account for it
 * in dev->peer_count. The callers in this file hold devices_lock.
 */
static inline void peer_add(struct lowpan_btle_dev *dev,
                            struct lowpan_peer *peer)
{
        struct list_head *head = &dev->peers;

        list_add_rcu(&peer->list, head);
        atomic_inc(&dev->peer_count);
}

/* Unlink @peer from @dev, free it after an RCU grace period and drop one
 * module reference.
 *
 * Returns true when the removed peer was the last one of @dev.
 */
static inline bool peer_del(struct lowpan_btle_dev *dev,
                            struct lowpan_peer *peer)
{
        bool was_last;

        list_del_rcu(&peer->list);
        kfree_rcu(peer, rcu);

        module_put(THIS_MODULE);

        was_last = atomic_dec_and_test(&dev->peer_count);
        if (was_last)
                BT_DBG("last peer");

        return was_last;
}

static inline struct lowpan_peer *
__peer_lookup_chan(struct lowpan_btle_dev *dev, struct l2cap_chan *chan)
{
        struct lowpan_peer *peer;

        list_for_each_entry_rcu(peer, &dev->peers, list) {
                if (peer->chan == chan)
                        return peer;
        }

        return NULL;
}

static inline struct lowpan_peer *
__peer_lookup_conn(struct lowpan_btle_dev *dev, struct l2cap_conn *conn)
{
        struct lowpan_peer *peer;

        list_for_each_entry_rcu(peer, &dev->peers, list) {
                if (peer->chan->conn == conn)
                        return peer;
        }

        return NULL;
}

/* Find the peer of @dev that should receive @skb addressed to @daddr.
 *
 * The next hop is taken from the route attached to @skb when there is one
 * (and is then saved in lowpan_cb(skb)->gw); otherwise it is the gateway
 * already stored in lowpan_cb(skb)->gw, or @daddr itself when no gateway
 * is stored. Peers are matched first on their IPv6 address and then, via
 * the neighbour cache, on their link-layer address.
 *
 * Returns the matching peer or NULL. The RCU read lock is released before
 * returning, so the returned pointer is not RCU-protected in the caller.
 */
static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev,
                                                  struct in6_addr *daddr,
                                                  struct sk_buff *skb)
{
        struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
        int count = atomic_read(&dev->peer_count);
        const struct in6_addr *nexthop;
        struct lowpan_peer *peer;
        struct neighbour *neigh;

        BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt);

        if (!rt) {
                if (ipv6_addr_any(&lowpan_cb(skb)->gw)) {
                        /* There is neither route nor gateway,
                         * probably the destination is a direct peer.
                         */
                        nexthop = daddr;
                } else {
                        /* There is a known gateway
                         */
                        nexthop = &lowpan_cb(skb)->gw;
                }
        } else {
                nexthop = rt6_nexthop(rt, daddr);

                /* We need to remember the address because it is needed
                 * by bt_xmit() when sending the packet. In bt_xmit(), the
                 * destination routing info is not set.
                 */
                memcpy(&lowpan_cb(skb)->gw, nexthop, sizeof(struct in6_addr));
        }

        BT_DBG("gw %pI6c", nexthop);

        rcu_read_lock();

        /* First pass: exact match on the peer's IPv6 address. */
        list_for_each_entry_rcu(peer, &dev->peers, list) {
                BT_DBG("dst addr %pMR dst type %u ip %pI6c",
                       &peer->chan->dst, peer->chan->dst_type,
                       &peer->peer_addr);

                if (!ipv6_addr_cmp(&peer->peer_addr, nexthop)) {
                        rcu_read_unlock();
                        return peer;
                }
        }

        /* use the neighbour cache for matching addresses assigned by SLAAC */
        neigh = __ipv6_neigh_lookup(dev->netdev, nexthop);
        if (neigh) {
                /* Second pass: match on the link-layer address. */
                list_for_each_entry_rcu(peer, &dev->peers, list) {
                        if (!memcmp(neigh->ha, peer->lladdr, ETH_ALEN)) {
                                /* Release the neighbour on every exit. */
                                neigh_release(neigh);
                                rcu_read_unlock();
                                return peer;
                        }
                }
                neigh_release(neigh);
        }

        rcu_read_unlock();

        return NULL;
}

/* Search every registered 6LoWPAN device for a peer whose channel belongs
 * to @conn.
 *
 * Returns the peer or NULL. The RCU read lock only covers the search; it
 * is released before the pointer is returned.
 */
static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn)
{
        struct lowpan_peer *match = NULL;
        struct lowpan_btle_dev *cur;

        rcu_read_lock();

        list_for_each_entry_rcu(cur, &bt_6lowpan_devices, list) {
                match = __peer_lookup_conn(cur, conn);
                if (match)
                        break;
        }

        rcu_read_unlock();

        return match;
}

/* Return the 6LoWPAN device bound to the HCI device that @conn runs on,
 * or NULL when no such device is registered. The RCU read lock only
 * covers the search; it is released before the pointer is returned.
 */
static struct lowpan_btle_dev *lookup_dev(struct l2cap_conn *conn)
{
        struct lowpan_btle_dev *found = NULL;
        struct lowpan_btle_dev *cur;

        rcu_read_lock();

        list_for_each_entry_rcu(cur, &bt_6lowpan_devices, list) {
                if (cur->hdev != conn->hcon->hdev)
                        continue;

                found = cur;
                break;
        }

        rcu_read_unlock();

        return found;
}

/* Pass a private copy of @skb to the network stack with netif_rx().
 *
 * @skb itself is left untouched and stays owned by the caller; @dev is
 * not used. Returns NET_RX_DROP when the copy cannot be allocated,
 * otherwise the netif_rx() result.
 */
static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev)
{
        struct sk_buff *copy = skb_copy(skb, GFP_ATOMIC);

        return copy ? netif_rx(copy) : NET_RX_DROP;
}

/* Decompress the IPHC header of @skb, using the netdev address as the
 * link-layer destination and the peer's lladdr as the link-layer source.
 *
 * Returns the lowpan_header_decompress() result.
 */
static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev,
                           struct lowpan_peer *peer)
{
        return lowpan_header_decompress(skb, netdev, netdev->dev_addr,
                                        peer->lladdr);
}

/* Deliver a packet received from @peer to the network stack through @dev.
 *
 * Two encodings are accepted, selected by the first byte of the payload:
 * uncompressed IPv6 behind a one byte dispatch, and IPHC compressed IPv6.
 * Anything else is dropped.
 *
 * Returns NET_RX_SUCCESS when the packet was handed over, in which case
 * @skb has been consumed. Returns NET_RX_DROP after incrementing
 * dev->stats.rx_dropped; the drop label itself does not free @skb.
 */
static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
                    struct lowpan_peer *peer)
{
        struct sk_buff *local_skb;
        int ret;

        if (!netif_running(dev))
                goto drop;

        if (dev->type != ARPHRD_6LOWPAN || !skb->len)
                goto drop;

        skb_reset_network_header(skb);

        /* From here on skb may be a different buffer than the one passed in. */
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb)
                goto drop;

        /* check that it's our buffer */
        if (lowpan_is_ipv6(*skb_network_header(skb))) {
                /* Pull off the 1-byte of 6lowpan header. */
                skb_pull(skb, 1);

                /* Copy the packet so that the IPv6 header is
                 * properly aligned.
                 */
                local_skb = skb_copy_expand(skb, NET_SKB_PAD - 1,
                                            skb_tailroom(skb), GFP_ATOMIC);
                if (!local_skb)
                        goto drop;

                local_skb->protocol = htons(ETH_P_IPV6);
                local_skb->pkt_type = PACKET_HOST;
                local_skb->dev = dev;

                skb_set_transport_header(local_skb, sizeof(struct ipv6hdr));

                /* give_skb_to_upper() queues its own copy, so local_skb
                 * still has to be released on both outcomes.
                 */
                if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) {
                        kfree_skb(local_skb);
                        goto drop;
                }

                dev->stats.rx_bytes += skb->len;
                dev->stats.rx_packets++;

                consume_skb(local_skb);
                consume_skb(skb);
        } else if (lowpan_is_iphc(*skb_network_header(skb))) {
                local_skb = skb_clone(skb, GFP_ATOMIC);
                if (!local_skb)
                        goto drop;

                local_skb->dev = dev;

                ret = iphc_decompress(local_skb, dev, peer);
                if (ret < 0) {
                        BT_DBG("iphc_decompress failed: %d", ret);
                        kfree_skb(local_skb);
                        goto drop;
                }

                local_skb->protocol = htons(ETH_P_IPV6);
                local_skb->pkt_type = PACKET_HOST;

                if (give_skb_to_upper(local_skb, dev)
                                != NET_RX_SUCCESS) {
                        kfree_skb(local_skb);
                        goto drop;
                }

                /* Counted with the length of the received (compressed) skb. */
                dev->stats.rx_bytes += skb->len;
                dev->stats.rx_packets++;

                consume_skb(local_skb);
                consume_skb(skb);
        } else {
                BT_DBG("unknown packet type");
                goto drop;
        }

        return NET_RX_SUCCESS;

drop:
        dev->stats.rx_dropped++;
        return NET_RX_DROP;
}

/* Packet from BT LE device
 *
 * L2CAP recv callback: resolve the peer and the network device from the
 * channel's connection and feed the packet to recv_pkt().
 *
 * Returns 0 on success, -ENOENT when no peer or network device matches
 * the connection, and -EAGAIN when recv_pkt() did not accept the packet.
 */
static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
{
        struct lowpan_btle_dev *dev;
        struct lowpan_peer *peer;
        int err;

        peer = lookup_peer(chan->conn);
        if (!peer)
                return -ENOENT;

        dev = lookup_dev(chan->conn);
        if (!dev || !dev->netdev)
                return -ENOENT;

        err = recv_pkt(skb, dev->netdev, peer);
        if (!err)
                return 0;

        BT_DBG("recv pkt %d", err);

        return -EAGAIN;
}

/* Prepare @skb for transmission on @netdev: pick the destination peer,
 * compress the IPv6 header and build the link-layer header.
 *
 * For a unicast destination the peer's BT address and address type are
 * stored in @peer_addr and @peer_addr_type, and the peer's channel in
 * lowpan_cb(skb)->chan. For multicast the channel is set to NULL and the
 * two output parameters are left untouched.
 *
 * Returns 1 for unicast, 0 for multicast, -ENOENT when no peer matches
 * the destination, or the negative dev_hard_header() error.
 */
static int setup_header(struct sk_buff *skb, struct net_device *netdev,
                        bdaddr_t *peer_addr, u8 *peer_addr_type)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct lowpan_btle_dev *dev = lowpan_btle_dev(netdev);
        struct in6_addr ipv6_daddr;
        struct lowpan_peer *peer;
        u8 *daddr = NULL;
        int status = 0;
        int err;

        memcpy(&ipv6_daddr, &hdr->daddr, sizeof(ipv6_daddr));

        if (!ipv6_addr_is_multicast(&ipv6_daddr)) {
                BT_DBG("dest IP %pI6c", &ipv6_daddr);

                /* The packet might be sent to 6lowpan interface
                 * because of routing (either via default route
                 * or user set route) so get peer according to
                 * the destination address.
                 */
                peer = peer_lookup_dst(dev, &ipv6_daddr, skb);
                if (!peer) {
                        BT_DBG("no such peer");
                        return -ENOENT;
                }

                daddr = peer->lladdr;
                *peer_addr = peer->chan->dst;
                *peer_addr_type = peer->chan->dst_type;
                lowpan_cb(skb)->chan = peer->chan;

                status = 1;
        } else {
                lowpan_cb(skb)->chan = NULL;
        }

        lowpan_header_compress(skb, netdev, daddr, dev->netdev->dev_addr);

        err = dev_hard_header(skb, netdev, ETH_P_IPV6, NULL, NULL, 0);

        return err < 0 ? err : status;
}

/* header_ops->create callback. No link-layer header is written; the
 * function only checks that the protocol is IPv6.
 *
 * Returns 0 for ETH_P_IPV6 and -EINVAL for every other @type.
 */
static int header_create(struct sk_buff *skb, struct net_device *netdev,
                         unsigned short type, const void *_daddr,
                         const void *_saddr, unsigned int len)
{
        return type == ETH_P_IPV6 ? 0 : -EINVAL;
}

/* Packet to BT LE device */
static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb,
                    struct net_device *netdev)
{
        struct msghdr msg;
        struct kvec iv;
        int err;

        /* Remember the skb so that we can send EAGAIN to the caller if
         * we run out of credits.
         */
        chan->data = skb;

        iv.iov_base = skb->data;
        iv.iov_len = skb->len;

        memset(&msg, 0, sizeof(msg));
        iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, skb->len);

        err = l2cap_chan_send(chan, &msg, skb->len);
        if (err > 0) {
                netdev->stats.tx_bytes += err;
                netdev->stats.tx_packets++;
                return 0;
        }

        if (err < 0)
                netdev->stats.tx_errors++;

        return err;
}

/* Send a clone of @skb to every peer behind @netdev.
 *
 * @skb itself is not consumed. Delivery is best effort: a failure for one
 * peer does not stop delivery to the remaining peers.
 *
 * Returns 0 when every peer was served, otherwise the last negative error
 * seen (-ENOMEM when a clone could not be allocated, or the send_pkt()
 * error).
 */
static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev)
{
        struct sk_buff *local_skb;
        struct lowpan_btle_dev *entry;
        int err = 0;

        rcu_read_lock();

        list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
                struct lowpan_peer *pentry;
                struct lowpan_btle_dev *dev;

                if (entry->netdev != netdev)
                        continue;

                dev = lowpan_btle_dev(entry->netdev);

                list_for_each_entry_rcu(pentry, &dev->peers, list) {
                        int ret;

                        local_skb = skb_clone(skb, GFP_ATOMIC);
                        if (!local_skb) {
                                /* send_pkt() dereferences the skb, so a
                                 * failed atomic allocation must not reach
                                 * it. Skip this peer and try the others.
                                 */
                                err = -ENOMEM;
                                continue;
                        }

                        BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p",
                               netdev->name,
                               &pentry->chan->dst, pentry->chan->dst_type,
                               &pentry->peer_addr, pentry->chan);
                        ret = send_pkt(pentry->chan, local_skb, netdev);
                        if (ret < 0)
                                err = ret;

                        kfree_skb(local_skb);
                }
        }

        rcu_read_unlock();

        return err;
}

/* ndo_start_xmit callback: compress the IPv6 header and send the packet
 * either to the single matching peer (unicast) or to every peer behind
 * @netdev (multicast).
 *
 * The skb is always released here. Returns NET_XMIT_DROP on any failure,
 * otherwise the non-negative result of the send helper.
 */
static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev)
{
        struct l2cap_chan *chan;
        bdaddr_t addr;
        u8 addr_type;
        int err;

        /* We must take a copy of the skb before we modify/replace the ipv6
         * header as the header could be used elsewhere
         */
        skb = skb_unshare(skb, GFP_ATOMIC);
        if (!skb)
                return NET_XMIT_DROP;

        /* Return values from setup_header()
         *  <0 - error, packet is dropped
         *   0 - this is a multicast packet
         *   1 - this is unicast packet
         */
        err = setup_header(skb, netdev, &addr, &addr_type);
        if (err < 0) {
                kfree_skb(skb);
                return NET_XMIT_DROP;
        }

        if (!err) {
                /* We need to send the packet to every device behind this
                 * interface.
                 */
                err = send_mcast_pkt(skb, netdev);
        } else {
                chan = lowpan_cb(skb)->chan;
                if (!chan) {
                        err = -ENOENT;
                } else {
                        BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p",
                               netdev->name, &addr, addr_type,
                               &lowpan_cb(skb)->addr, chan);
                        err = send_pkt(chan, skb, netdev);
                }
        }

        dev_kfree_skb(skb);

        if (err)
                BT_DBG("ERROR: xmit failed (%d)", err);

        return err < 0 ? NET_XMIT_DROP : err;
}

/* ndo_init callback: only sets the lockdep classes of @dev.
 * Always returns 0.
 */
static int bt_dev_init(struct net_device *dev)
{
        netdev_lockdep_set_classes(dev);

        return 0;
}

/* Network device operations of the BT 6LoWPAN interface; installed by
 * netdev_setup() and again by setup_netdev().
 */
static const struct net_device_ops netdev_ops = {
        .ndo_init                = bt_dev_init,
        .ndo_start_xmit                = bt_xmit,
};

/* Link-layer header operations; only .create is provided, and
 * header_create() merely validates the protocol type.
 */
static const struct header_ops header_ops = {
        .create        = header_create,
};

/* Setup callback passed to alloc_netdev(): fills in the static properties
 * and the operation tables of a new BT 6LoWPAN interface.
 */
static void netdev_setup(struct net_device *dev)
{
        /* operation tables and destructor behaviour */
        dev->netdev_ops = &netdev_ops;
        dev->header_ops = &header_ops;
        dev->needs_free_netdev = true;

        /* link properties */
        dev->flags = IFF_RUNNING | IFF_MULTICAST;
        dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
        dev->hard_header_len = 0;
        dev->needed_tailroom = 0;
        dev->watchdog_timeo = 0;
}

/* Device type attached to the netdev by setup_netdev() through
 * SET_NETDEV_DEVTYPE().
 */
static const struct device_type bt_type = {
        .name        = "bluetooth",
};

/* Bring @netdev up under the RTNL lock. A failure is only logged. */
static void ifup(struct net_device *netdev)
{
        int rc;

        rtnl_lock();

        rc = dev_open(netdev, NULL);
        if (rc < 0)
                BT_INFO("iface %s cannot be opened (%d)", netdev->name, rc);

        rtnl_unlock();
}

/* Take @netdev down under the RTNL lock. */
static void ifdown(struct net_device *netdev)
{
        rtnl_lock();
        dev_close(netdev);
        rtnl_unlock();
}

/* Delayed work handler scheduled by add_peer_chan(): announce the
 * interface to its neighbours.
 */
static void do_notify_peers(struct work_struct *work)
{
        struct lowpan_btle_dev *dev;

        dev = container_of(work, struct lowpan_btle_dev, notify_peers.work);

        /* send neighbour adv at startup */
        netdev_notify_peers(dev->netdev);
}

/* Return true when @hcon is an LE link and 6LoWPAN is currently enabled. */
static bool is_bt_6lowpan(struct hci_conn *hcon)
{
        return hcon->type == LE_LINK && enable_6lowpan;
}

/* Allocate an L2CAP channel configured for 6LoWPAN use: connection
 * oriented, LE flow control mode, incoming MTU of 1280.
 *
 * Returns the channel, or NULL when it cannot be allocated.
 */
static struct l2cap_chan *chan_create(void)
{
        struct l2cap_chan *chan = l2cap_chan_create();

        if (chan) {
                l2cap_chan_set_defaults(chan);

                chan->chan_type = L2CAP_CHAN_CONN_ORIENTED;
                chan->mode = L2CAP_MODE_LE_FLOWCTL;
                chan->imtu = 1280;
        }

        return chan;
}

/* Create a peer for @chan, add it to @dev and schedule the delayed
 * neighbour notification. When @new_netdev is true the delayed work of
 * @dev is initialised first.
 *
 * Returns @chan on success, or NULL when the peer cannot be allocated;
 * in that case nothing is added and no work is scheduled.
 */
static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan,
                                        struct lowpan_btle_dev *dev,
                                        bool new_netdev)
{
        struct lowpan_peer *peer = kzalloc(sizeof(*peer), GFP_ATOMIC);

        if (!peer)
                return NULL;

        peer->chan = chan;
        INIT_LIST_HEAD(&peer->list);

        /* Derive both the link-layer and the IPv6 address of the peer
         * from the BT destination address of the channel.
         */
        baswap((void *)peer->lladdr, &chan->dst);
        lowpan_iphc_uncompress_eui48_lladdr(&peer->peer_addr, peer->lladdr);

        spin_lock(&devices_lock);
        peer_add(dev, peer);
        spin_unlock(&devices_lock);

        /* Notifying peers about us needs to be done without locks held */
        if (new_netdev) {
                INIT_DELAYED_WORK(&dev->notify_peers, do_notify_peers);
        }
        schedule_delayed_work(&dev->notify_peers, msecs_to_jiffies(100));

        return chan;
}

/* Allocate and register the 6LoWPAN network device for the HCI device
 * that @chan runs on, and add it to bt_6lowpan_devices.
 *
 * On success *@dev points at the private data of the new netdev and 0 is
 * returned. Returns -ENOMEM when the netdev cannot be allocated, or the
 * lowpan_register_netdev() error; on a registration failure the netdev
 * has been freed, so *@dev must not be used.
 */
static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev)
{
        struct lowpan_btle_dev *btle;
        struct net_device *netdev;
        bdaddr_t addr;
        int err;

        netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)),
                              IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN,
                              netdev_setup);
        if (!netdev)
                return -ENOMEM;

        /* The device address is the local BT address passed through baswap(). */
        netdev->addr_assign_type = NET_ADDR_PERM;
        baswap(&addr, &chan->src);
        __dev_addr_set(netdev, &addr, sizeof(addr));

        netdev->netdev_ops = &netdev_ops;
        SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev);
        SET_NETDEV_DEVTYPE(netdev, &bt_type);

        btle = lowpan_btle_dev(netdev);
        *dev = btle;
        btle->netdev = netdev;
        btle->hdev = chan->conn->hcon->hdev;
        INIT_LIST_HEAD(&btle->peers);

        spin_lock(&devices_lock);
        INIT_LIST_HEAD(&btle->list);
        list_add_rcu(&btle->list, &bt_6lowpan_devices);
        spin_unlock(&devices_lock);

        err = lowpan_register_netdev(netdev, LOWPAN_LLTYPE_BTLE);
        if (err < 0) {
                BT_INFO("register_netdev failed %d", err);

                /* Undo the list insertion before releasing the netdev. */
                spin_lock(&devices_lock);
                list_del_rcu(&btle->list);
                spin_unlock(&devices_lock);

                free_netdev(netdev);
                return err;
        }

        BT_DBG("ifindex %d peer bdaddr %pMR type %d my addr %pMR type %d",
               netdev->ifindex, &chan->dst, chan->dst_type,
               &chan->src, chan->src_type);
        set_bit(__LINK_STATE_PRESENT, &netdev->state);

        return 0;
}

/* L2CAP ready callback: attach the now connected @chan as a peer of the
 * 6LoWPAN device of its HCI device, creating that device on first use,
 * and bring the interface up.
 *
 * Each peer holds one module reference, which peer_del() drops. When the
 * peer cannot be created, the reference is released here and the
 * interface is left as it is.
 */
static inline void chan_ready_cb(struct l2cap_chan *chan)
{
        struct lowpan_btle_dev *dev;
        bool new_netdev = false;

        dev = lookup_dev(chan->conn);

        BT_DBG("chan %p conn %p dev %p", chan, chan->conn, dev);

        if (!dev) {
                if (setup_netdev(chan, &dev) < 0) {
                        l2cap_chan_del(chan, -ENOENT);
                        return;
                }
                new_netdev = true;
        }

        if (!try_module_get(THIS_MODULE))
                return;

        if (!add_peer_chan(chan, dev, new_netdev)) {
                /* No peer was added, so peer_del() will never run for
                 * this channel and the reference taken above would leak.
                 */
                module_put(THIS_MODULE);
                return;
        }

        ifup(dev->netdev);
}

/* L2CAP new_connection callback: create the channel for an incoming
 * connection, inheriting the callbacks of the parent channel @pchan.
 *
 * Returns the new channel, or NULL when it cannot be allocated.
 */
static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan)
{
        struct l2cap_chan *chan = chan_create();

        if (chan) {
                chan->ops = pchan->ops;

                BT_DBG("chan %p pchan %p", chan, pchan);
        }

        return chan;
}

/* Work handler scheduled by chan_close_cb(): unregister the netdev that
 * embeds this work item.
 */
static void delete_netdev(struct work_struct *work)
{
        struct lowpan_btle_dev *entry;

        entry = container_of(work, struct lowpan_btle_dev, delete_netdev);

        /* The entry pointer is deleted by the netdev destructor. */
        lowpan_unregister_netdev(entry->netdev);
}

/* L2CAP close callback: remove the peer that uses @chan from its device.
 *
 * When the removed peer was the last one of the device, the pending
 * neighbour notification is cancelled and the interface is taken down.
 * The netdev is additionally scheduled for unregistration, but only when
 * @chan has no connection with an hcon any more.
 *
 * Nothing is done when the channel's connection is not an enabled
 * 6LoWPAN LE link, or when no peer uses @chan.
 */
static void chan_close_cb(struct l2cap_chan *chan)
{
        struct lowpan_btle_dev *entry;
        struct lowpan_btle_dev *dev = NULL;
        struct lowpan_peer *peer;
        int err = -ENOENT;
        bool last = false, remove = true;

        BT_DBG("chan %p conn %p", chan, chan->conn);

        if (chan->conn && chan->conn->hcon) {
                if (!is_bt_6lowpan(chan->conn->hcon))
                        return;

                /* If conn is set, then the netdev is also there and we should
                 * not remove it.
                 */
                remove = false;
        }

        spin_lock(&devices_lock);

        list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
                dev = lowpan_btle_dev(entry->netdev);
                peer = __peer_lookup_chan(dev, chan);
                if (peer) {
                        last = peer_del(dev, peer);
                        /* err == 0 marks that a peer was found, so that
                         * dev and entry are meaningful after the loop.
                         */
                        err = 0;

                        BT_DBG("dev %p removing %speer %p", dev,
                               last ? "last " : "1 ", peer);
                        BT_DBG("chan %p orig refcnt %u", chan,
                               kref_read(&chan->kref));

                        l2cap_chan_put(chan);
                        break;
                }
        }

        if (!err && last && dev && !atomic_read(&dev->peer_count)) {
                /* Drop the spinlock before the calls below. */
                spin_unlock(&devices_lock);

                cancel_delayed_work_sync(&dev->notify_peers);

                ifdown(dev->netdev);

                if (remove) {
                        /* entry is the list element the loop stopped at. */
                        INIT_WORK(&entry->delete_netdev, delete_netdev);
                        schedule_work(&entry->delete_netdev);
                }
        } else {
                spin_unlock(&devices_lock);
        }
}

/* L2CAP state_change callback: only logs the new state of @chan. */
static void chan_state_change_cb(struct l2cap_chan *chan, int state, int err)
{
        BT_DBG("chan %p conn %p state %s err %d", chan, chan->conn,
               state_to_string(state), err);
}

/* L2CAP alloc_skb callback: allocate a buffer large enough for @hdr_len
 * plus @len bytes. @chan and @nb are not used.
 *
 * Returns the bt_skb_alloc() result.
 */
static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan,
                                         unsigned long hdr_len,
                                         unsigned long len, int nb)
{
        unsigned long total = hdr_len + len;

        /* Note that we must allocate using GFP_ATOMIC here as
         * this function is called originally from netdev hard xmit
         * function in atomic context.
         */
        return bt_skb_alloc(total, GFP_ATOMIC);
}

/* L2CAP suspend callback: stop the TX queue of the network device that
 * belongs to the connection of @chan, if there is one.
 */
static void chan_suspend_cb(struct l2cap_chan *chan)
{
        struct lowpan_btle_dev *dev;

        BT_DBG("chan %p suspend", chan);

        dev = lookup_dev(chan->conn);
        if (dev && dev->netdev)
                netif_stop_queue(dev->netdev);
}

/* L2CAP resume callback: wake the TX queue of the network device that
 * belongs to the connection of @chan, if there is one.
 */
static void chan_resume_cb(struct l2cap_chan *chan)
{
        struct lowpan_btle_dev *dev;

        BT_DBG("chan %p resume", chan);

        dev = lookup_dev(chan->conn);
        if (dev && dev->netdev)
                netif_wake_queue(dev->netdev);
}

/* L2CAP get_sndtimeo callback: every channel uses L2CAP_CONN_TIMEOUT. */
static long chan_get_sndtimeo_cb(struct l2cap_chan *chan)
{
        return L2CAP_CONN_TIMEOUT;
}

/* L2CAP callbacks shared by the listening channel, the outgoing channels
 * and (through chan_new_conn_cb()) the accepted channels.
 */
static const struct l2cap_ops bt_6lowpan_chan_ops = {
        .name                        = "L2CAP 6LoWPAN channel",
        .new_connection                = chan_new_conn_cb,
        .recv                        = chan_recv_cb,
        .close                        = chan_close_cb,
        .state_change                = chan_state_change_cb,
        .ready                        = chan_ready_cb,
        .resume                        = chan_resume_cb,
        .suspend                = chan_suspend_cb,
        .get_sndtimeo                = chan_get_sndtimeo_cb,
        .alloc_skb                = chan_alloc_skb_cb,

        /* Callbacks served by the stock L2CAP helpers. */
        .teardown                = l2cap_chan_no_teardown,
        .defer                        = l2cap_chan_no_defer,
        .set_shutdown                = l2cap_chan_no_set_shutdown,
};

/* Start an outgoing 6LoWPAN (IPSP) L2CAP connection to @addr.
 *
 * Returns the l2cap_chan_connect() result, or -EINVAL when the channel
 * cannot be created. The channel reference is dropped again when the
 * connect call fails.
 */
static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type)
{
        struct l2cap_chan *chan = chan_create();
        int err;

        if (!chan)
                return -EINVAL;

        chan->ops = &bt_6lowpan_chan_ops;

        err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0,
                                 addr, dst_type, L2CAP_CONN_TIMEOUT);

        BT_DBG("chan %p err %d", chan, err);

        if (err < 0)
                l2cap_chan_put(chan);

        return err;
}

/* Close the 6LoWPAN channel of the peer connected through @conn.
 * @dst_type is only used for logging.
 *
 * Returns 0 when a peer was found and its channel closed, -ENOENT
 * otherwise.
 *
 * NOTE(review): l2cap_chan_close() is called without l2cap_chan_lock()
 * here - confirm whether the L2CAP core expects the channel lock to be
 * held by the caller.
 */
static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type)
{
        struct lowpan_peer *peer;

        BT_DBG("conn %p dst type %u", conn, dst_type);

        peer = lookup_peer(conn);
        if (peer) {
                BT_DBG("peer %p chan %p", peer, peer->chan);

                l2cap_chan_close(peer->chan, ENOENT);
                return 0;
        }

        return -ENOENT;
}

/* Create the channel that listens for incoming 6LoWPAN (IPSP)
 * connections on any local address.
 *
 * Returns the listening channel, or NULL when 6LoWPAN is disabled, the
 * channel cannot be created, or the PSM cannot be registered.
 */
static struct l2cap_chan *bt_6lowpan_listen(void)
{
        bdaddr_t *any = BDADDR_ANY;
        struct l2cap_chan *chan;
        int err;

        if (!enable_6lowpan)
                return NULL;

        chan = chan_create();
        if (!chan)
                return NULL;

        chan->ops = &bt_6lowpan_chan_ops;
        chan->state = BT_LISTEN;
        chan->src_type = BDADDR_LE_PUBLIC;

        atomic_set(&chan->nesting, L2CAP_NESTING_PARENT);

        BT_DBG("chan %p src type %u", chan, chan->src_type);

        err = l2cap_add_psm(chan, any, cpu_to_le16(L2CAP_PSM_IPSP));
        if (!err)
                return chan;

        l2cap_chan_put(chan);
        BT_ERR("psm cannot be added err %d", err);

        return NULL;
}

/* Parse "XX:XX:XX:XX:XX:XX <type>" from @buf into @addr and @addr_type
 * and look up the L2CAP connection of the matching LE link.
 *
 * Returns 0 with *@conn set to the hcon's l2cap_data, -EINVAL when @buf
 * does not contain all seven fields, and -ENOENT when no route or no LE
 * connection exists. *@conn is only written on success.
 *
 * NOTE(review): the parsed @addr_type is used here for
 * hci_conn_hash_lookup_le() and by the "connect" command for
 * l2cap_chan_connect() - confirm that both expect the same address type
 * numbering.
 */
static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
                          struct l2cap_conn **conn)
{
        struct hci_conn *hcon;
        struct hci_dev *hdev;
        int fields;

        /* The address is written most significant byte first, so the
         * first field lands in b[5].
         */
        fields = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu",
                        &addr->b[5], &addr->b[4], &addr->b[3],
                        &addr->b[2], &addr->b[1], &addr->b[0],
                        addr_type);
        if (fields < 7)
                return -EINVAL;

        /* The LE_PUBLIC address type is ignored because of BDADDR_ANY */
        hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC);
        if (!hdev)
                return -ENOENT;

        hci_dev_lock(hdev);
        hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type);
        hci_dev_unlock(hdev);

        hci_dev_put(hdev);

        if (!hcon)
                return -ENOENT;

        *conn = (struct l2cap_conn *)hcon->l2cap_data;

        BT_DBG("conn %p dst %pMR type %u", *conn, &hcon->dst, hcon->dst_type);

        return 0;
}

/* Close the L2CAP channel of every peer of every 6LoWPAN device.
 *
 * The channels are first collected into a private list under the RCU
 * read lock and closed afterwards. Collection is best effort: when an
 * entry cannot be allocated, the remaining peers of that device are
 * skipped.
 */
static void disconnect_all_peers(void)
{
        struct lowpan_btle_dev *entry;
        struct lowpan_peer *peer, *tmp_peer, *new_peer;
        struct list_head peers;

        INIT_LIST_HEAD(&peers);

        /* We make a separate list of peers as the close_cb() will
         * modify the device peers list so it is better not to mess
         * with the same list at the same time.
         */

        rcu_read_lock();

        list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
                list_for_each_entry_rcu(peer, &entry->peers, list) {
                        new_peer = kmalloc(sizeof(*new_peer), GFP_ATOMIC);
                        if (!new_peer)
                                break;

                        new_peer->chan = peer->chan;
                        INIT_LIST_HEAD(&new_peer->list);

                        list_add(&new_peer->list, &peers);
                }
        }

        rcu_read_unlock();

        /* The copies live on a list that only this function can see, so
         * neither devices_lock nor RCU is needed to walk and free them.
         * devices_lock must not be held across l2cap_chan_close():
         * chan_close_cb() takes that lock itself.
         */
        list_for_each_entry_safe(peer, tmp_peer, &peers, list) {
                l2cap_chan_close(peer->chan, ENOENT);

                list_del(&peer->list);
                kfree(peer);
        }
}

/* Request to change enable_6lowpan, handed from lowpan_enable_set() to
 * the do_enable_set() work handler, which frees it.
 */
struct set_enable {
        struct work_struct work;        /* runs do_enable_set() */
        bool flag;                      /* requested enable state */
};

static void do_enable_set(struct work_struct *work)
{
        struct set_enable *set_enable = container_of(work,
                                                     struct set_enable, work);

        if (!set_enable->flag || enable_6lowpan != set_enable->flag)
                /* Disconnect existing connections if 6lowpan is
                 * disabled
                 */
                disconnect_all_peers();

        enable_6lowpan = set_enable->flag;

        mutex_lock(&set_lock);
        if (listen_chan) {
                l2cap_chan_close(listen_chan, 0);
                l2cap_chan_put(listen_chan);
        }

        listen_chan = bt_6lowpan_listen();
        mutex_unlock(&set_lock);

        kfree(set_enable);
}

/* debugfs setter for the enable attribute: queue a work item that
 * applies the new state (any non-zero @val enables). @data is not used.
 *
 * Returns 0 once the work is queued, or -ENOMEM when the request cannot
 * be allocated.
 */
static int lowpan_enable_set(void *data, u64 val)
{
        struct set_enable *req = kzalloc(sizeof(*req), GFP_KERNEL);

        if (!req)
                return -ENOMEM;

        req->flag = !!val;

        /* The request is freed by do_enable_set(). */
        INIT_WORK(&req->work, do_enable_set);
        schedule_work(&req->work);

        return 0;
}

/* debugfs getter for the enable attribute: report the current value of
 * enable_6lowpan in *@val. @data is not used. Always returns 0.
 */
static int lowpan_enable_get(void *data, u64 *val)
{
        *val = enable_6lowpan;
        return 0;
}

/* File operations of the debugfs enable attribute, formatted as an
 * unsigned decimal followed by a newline.
 */
DEFINE_DEBUGFS_ATTRIBUTE(lowpan_enable_fops, lowpan_enable_get,
                         lowpan_enable_set, "%llu\n");

/* Write handler for the 6LoWPAN control debugfs file.
 *
 * @fp:          file being written (unused)
 * @user_buffer: command text supplied by user space
 * @count:       number of bytes the caller wants to write
 * @position:    file position (unused)
 *
 * At most sizeof(buf) - 1 bytes of the input are examined. Two commands are
 * recognised by prefix, and the remainder of the buffer is passed to
 * get_l2cap_conn() for parsing:
 *
 *   "connect "    - close the listening channel, then start a 6LoWPAN
 *                   connection to the parsed address
 *   "disconnect " - disconnect the connection to the parsed address
 *
 * Any other input is accepted and ignored.
 *
 * Returns @count on success (also for ignored input), -EFAULT if the user
 * buffer cannot be copied, -EINVAL/-EALREADY for the checks below, or the
 * negative error returned by the helper that failed.
 */
static ssize_t lowpan_control_write(struct file *fp,
                                    const char __user *user_buffer,
                                    size_t count,
                                    loff_t *position)
{
        char buf[32];
        size_t buf_size = min(count, sizeof(buf) - 1);
        int ret;
        bdaddr_t addr;
        u8 addr_type;
        struct l2cap_conn *conn = NULL;

        if (copy_from_user(buf, user_buffer, buf_size))
                return -EFAULT;

        buf[buf_size] = '\0';

        /* strncmp() stops at the terminator, so a write shorter than the
         * keyword never causes the uninitialised tail of buf to be read.
         */
        if (strncmp(buf, "connect ", 8) == 0) {
                /* Only -EINVAL aborts here; other errors fall through and
                 * the connect attempt is still made.
                 */
                ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn);
                if (ret == -EINVAL)
                        return ret;

                mutex_lock(&set_lock);
                if (listen_chan) {
                        l2cap_chan_close(listen_chan, 0);
                        l2cap_chan_put(listen_chan);
                        listen_chan = NULL;
                }
                mutex_unlock(&set_lock);

                if (conn) {
                        struct lowpan_peer *peer;

                        if (!is_bt_6lowpan(conn->hcon))
                                return -EINVAL;

                        peer = lookup_peer(conn);
                        if (peer) {
                                BT_DBG("6LoWPAN connection already exists");
                                return -EALREADY;
                        }

                        BT_DBG("conn %p dst %pMR type %d user %u", conn,
                               &conn->hcon->dst, conn->hcon->dst_type,
                               addr_type);
                }

                ret = bt_6lowpan_connect(&addr, addr_type);
                if (ret < 0)
                        return ret;

                return count;
        }

        if (strncmp(buf, "disconnect ", 11) == 0) {
                ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn);
                if (ret < 0)
                        return ret;

                ret = bt_6lowpan_disconnect(conn, addr_type);
                if (ret < 0)
                        return ret;

                return count;
        }

        /* Unknown commands are silently accepted. */
        return count;
}

/* seq_file show callback for the control file.
 *
 * @f:   seq_file to print into
 * @ptr: unused
 *
 * Prints one line per peer of every device on bt_6lowpan_devices, giving the
 * peer channel's destination address and address type. The lists are walked
 * with devices_lock held. Always returns 0.
 */
static int lowpan_control_show(struct seq_file *f, void *ptr)
{
        struct lowpan_btle_dev *dev;
        struct lowpan_peer *p;

        spin_lock(&devices_lock);
        list_for_each_entry(dev, &bt_6lowpan_devices, list) {
                list_for_each_entry(p, &dev->peers, list) {
                        seq_printf(f, "%pMR (type %u)\n",
                                   &p->chan->dst, p->chan->dst_type);
                }
        }
        spin_unlock(&devices_lock);

        return 0;
}

/* Open callback for the control file: sets up a single-record seq_file whose
 * contents are produced by lowpan_control_show(), passing inode->i_private
 * through as its private data. Returns whatever single_open() returns.
 */
static int lowpan_control_open(struct inode *inode, struct file *file)
{
        return single_open(file, lowpan_control_show, inode->i_private);
}

/* File operations for the control file: reading goes through the seq_file
 * helpers and lowpan_control_show(); writing is handled by
 * lowpan_control_write().
 */
static const struct file_operations lowpan_control_fops = {
        .open                = lowpan_control_open,
        .read                = seq_read,
        .write                = lowpan_control_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};

static void disconnect_devices(void)
{
        struct lowpan_btle_dev *entry, *tmp, *new_dev;
        struct list_head devices;

        INIT_LIST_HEAD(&devices);

        /* We make a separate list of devices because the unregister_netdev()
         * will call device_event() which will also want to modify the same
         * devices list.
         */

        rcu_read_lock();

        list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
                new_dev = kmalloc(sizeof(*new_dev), GFP_ATOMIC);
                if (!new_dev)
                        break;

                new_dev->netdev = entry->netdev;
                INIT_LIST_HEAD(&new_dev->list);

                list_add_rcu(&new_dev->list, &devices);
        }

        rcu_read_unlock();

        list_for_each_entry_safe(entry, tmp, &devices, list) {
                ifdown(entry->netdev);
                BT_DBG("Unregistering netdev %s %p",
                       entry->netdev->name, entry->netdev);
                lowpan_unregister_netdev(entry->netdev);
                kfree(entry);
        }
}

/* Netdevice notifier callback.
 *
 * @unused: the notifier block (not used)
 * @event:  netdevice event code
 * @ptr:    notifier info from which the affected net_device is obtained
 *
 * Only ARPHRD_6LOWPAN devices are considered. On NETDEV_UNREGISTER the
 * matching entry is unlinked from bt_6lowpan_devices under devices_lock.
 * All other events are ignored.
 *
 * Always returns NOTIFY_DONE.
 */
static int device_event(struct notifier_block *unused,
                        unsigned long event, void *ptr)
{
        struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
        struct lowpan_btle_dev *entry;

        if (netdev->type != ARPHRD_6LOWPAN)
                return NOTIFY_DONE;

        switch (event) {
        case NETDEV_UNREGISTER:
                spin_lock(&devices_lock);
                list_for_each_entry(entry, &bt_6lowpan_devices, list) {
                        if (entry->netdev == netdev) {
                                BT_DBG("Unregistered netdev %s %p",
                                       netdev->name, netdev);
                                /* The list is also walked by RCU readers
                                 * that do not take devices_lock, so the
                                 * entry's forward pointer must stay valid
                                 * after unlinking.
                                 */
                                list_del_rcu(&entry->list);
                                break;
                        }
                }
                spin_unlock(&devices_lock);
                break;
        default:
                break;
        }

        return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to device_event(); registered
 * in bt_6lowpan_init() and unregistered in bt_6lowpan_exit().
 */
static struct notifier_block bt_6lowpan_dev_notifier = {
        .notifier_call = device_event,
};

/* Module init: create the "6lowpan_enable" and "6lowpan_control" debugfs
 * files under bt_debugfs and register the netdevice notifier.
 *
 * Returns 0 on success or the error from register_netdevice_notifier(). On
 * failure the debugfs files are removed again, so that no file referring to
 * this module's file operations outlives a failed load.
 */
static int __init bt_6lowpan_init(void)
{
        int err;

        lowpan_enable_debugfs = debugfs_create_file_unsafe("6lowpan_enable",
                                                           0644, bt_debugfs,
                                                           NULL,
                                                           &lowpan_enable_fops);
        lowpan_control_debugfs = debugfs_create_file("6lowpan_control", 0644,
                                                     bt_debugfs, NULL,
                                                     &lowpan_control_fops);

        err = register_netdevice_notifier(&bt_6lowpan_dev_notifier);
        if (err) {
                debugfs_remove(lowpan_control_debugfs);
                debugfs_remove(lowpan_enable_debugfs);
        }

        return err;
}

/* Module exit: undo bt_6lowpan_init() and tear down remaining state.
 *
 * The debugfs files are removed first, then the listening channel (if any)
 * is closed and released, then all devices are unregistered. The netdevice
 * notifier is unregistered only after disconnect_devices(), so
 * device_event() still runs for the devices unregistered here.
 */
static void __exit bt_6lowpan_exit(void)
{
        debugfs_remove(lowpan_enable_debugfs);
        debugfs_remove(lowpan_control_debugfs);

        if (listen_chan) {
                l2cap_chan_close(listen_chan, 0);
                l2cap_chan_put(listen_chan);
        }

        disconnect_devices();

        unregister_netdevice_notifier(&bt_6lowpan_dev_notifier);
}

/* Module entry and exit points, followed by the module metadata. */
module_init(bt_6lowpan_init);
module_exit(bt_6lowpan_exit);

MODULE_AUTHOR("Jukka Rissanen <jukka.rissanen@linux.intel.com>");
MODULE_DESCRIPTION("Bluetooth 6LoWPAN");
MODULE_VERSION(VERSION);
MODULE_LICENSE("GPL");


















   30 



   34 




   36 








   34 













   31 




   30 















   29 


   31 








   33 
    9 















   26 


   30 
































   29 
   14 
















   31 


   34 











































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <asm/page.h>

/*
 * Refresh the protected-usage bookkeeping of @c for a new @usage value.
 *
 * For both the min and the low setting, the protected amount is
 * min(usage, setting). When it differs from the value last recorded in
 * c->min_usage / c->low_usage, the new value is swapped in and the
 * difference is added to the parent's children_min_usage /
 * children_low_usage. Counters without a parent are left untouched.
 */
static void propagate_protected_usage(struct page_counter *c,
                                      unsigned long usage)
{
        struct page_counter *parent = c->parent;
        unsigned long prot, prev;
        long diff;

        if (!parent)
                return;

        /* min protection */
        prot = min(usage, READ_ONCE(c->min));
        prev = atomic_long_read(&c->min_usage);
        if (prot != prev) {
                prev = atomic_long_xchg(&c->min_usage, prot);
                diff = prot - prev;
                if (diff)
                        atomic_long_add(diff, &parent->children_min_usage);
        }

        /* low protection */
        prot = min(usage, READ_ONCE(c->low));
        prev = atomic_long_read(&c->low_usage);
        if (prot != prev) {
                prev = atomic_long_xchg(&c->low_usage, prot);
                diff = prot - prev;
                if (diff)
                        atomic_long_add(diff, &parent->children_low_usage);
        }
}

/**
 * page_counter_cancel - take pages out of the local counter
 * @counter: counter
 * @nr_pages: number of pages to cancel
 *
 * Only @counter itself is adjusted; ancestors are not walked. If the
 * subtraction drives the usage below zero, a one-time warning is emitted
 * and the usage is reset to zero.
 */
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
        long remaining = atomic_long_sub_return(nr_pages, &counter->usage);

        /* More uncharges than charges? */
        if (WARN_ONCE(remaining < 0,
                      "page_counter underflow: %ld nr_pages=%lu\n",
                      remaining, nr_pages)) {
                remaining = 0;
                atomic_long_set(&counter->usage, remaining);
        }

        propagate_protected_usage(counter, remaining);
}

/**
 * page_counter_charge - hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 *
 * Adds @nr_pages to @counter and to each of its ancestors, updating the
 * protected usage and the watermark of every level on the way up.
 *
 * NOTE: This does not consider any configured counter limits.
 */
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *c = counter;

        while (c) {
                long usage = atomic_long_add_return(nr_pages, &c->usage);

                propagate_protected_usage(c, usage);

                /*
                 * The watermark update is racy, but some inaccuracy in
                 * the watermark is acceptable.
                 */
                if (usage > READ_ONCE(c->watermark))
                        WRITE_ONCE(c->watermark, usage);

                c = c->parent;
        }
}

/**
 * page_counter_try_charge - try to hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 * @fail: set to the first counter that hit its limit, if any
 *
 * Walks from @counter up through its ancestors, charging each level. If a
 * level would exceed its max, the charge on that level is undone, its
 * failcnt is bumped, and the charges already applied to the levels below it
 * are cancelled.
 *
 * Returns %true on success, or %false and @fail if the counter or one
 * of its ancestors has hit its configured limit.
 */
bool page_counter_try_charge(struct page_counter *counter,
                             unsigned long nr_pages,
                             struct page_counter **fail)
{
        struct page_counter *c;

        for (c = counter; c; c = c->parent) {
                long new;
                /*
                 * Charge speculatively to avoid an expensive CAS.  If
                 * a bigger charge fails, it might falsely lock out a
                 * racing smaller charge and send it into reclaim
                 * early, but the error is limited to the difference
                 * between the two sizes, which is less than 2M/4M in
                 * case of a THP locking out a regular page charge.
                 *
                 * The atomic_long_add_return() implies a full memory
                 * barrier between incrementing the count and reading
                 * the limit.  When racing with page_counter_set_max(),
                 * we either see the new limit or the setter sees the
                 * counter has changed and retries.
                 */
                new = atomic_long_add_return(nr_pages, &c->usage);
                if (new > c->max) {
                        /* Undo the speculative charge on this level. */
                        atomic_long_sub(nr_pages, &c->usage);
                        /*
                         * This is racy, but we can live with some
                         * inaccuracy in the failcnt which is only used
                         * to report stats.
                         */
                        data_race(c->failcnt++);
                        *fail = c;
                        goto failed;
                }
                propagate_protected_usage(c, new);
                /*
                 * Just like with failcnt, we can live with some
                 * inaccuracy in the watermark.
                 */
                if (new > READ_ONCE(c->watermark))
                        WRITE_ONCE(c->watermark, new);
        }
        return true;

failed:
        /* Roll back the levels below the one that failed. */
        for (c = counter; c != *fail; c = c->parent)
                page_counter_cancel(c, nr_pages);

        return false;
}

/**
 * page_counter_uncharge - hierarchically uncharge pages
 * @counter: counter
 * @nr_pages: number of pages to uncharge
 *
 * Cancels @nr_pages on @counter and on each of its ancestors.
 */
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *c = counter;

        while (c) {
                page_counter_cancel(c, nr_pages);
                c = c->parent;
        }
}

/**
 * page_counter_set_max - set the maximum number of pages allowed
 * @counter: counter
 * @nr_pages: limit to set
 *
 * Returns 0 on success, -EBUSY if the current number of pages on the
 * counter already exceeds the specified limit.
 *
 * The caller must serialize invocations on the same counter.
 */
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
        for (;;) {
                unsigned long old;
                long usage;

                /*
                 * Update the limit while making sure that it's not
                 * below the concurrently-changing counter value.
                 *
                 * The xchg implies two full memory barriers before
                 * and after, so the read-swap-read is ordered and
                 * ensures coherency with page_counter_try_charge():
                 * that function modifies the count before checking
                 * the limit, so if it sees the old limit, we see the
                 * modified counter and retry.
                 */
                usage = page_counter_read(counter);

                if (usage > nr_pages)
                        return -EBUSY;

                old = xchg(&counter->max, nr_pages);

                /*
                 * Done if the usage did not grow past the sampled value
                 * meanwhile, or if the limit was not lowered.
                 */
                if (page_counter_read(counter) <= usage || nr_pages >= old)
                        return 0;

                /* Usage grew while lowering: restore the old limit, retry. */
                counter->max = old;
                cond_resched();
        }
}

/**
 * page_counter_set_min - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * Stores the new min value, then re-evaluates the protected usage of
 * @counter and of each of its ancestors against their current usage.
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *c = counter;

        WRITE_ONCE(counter->min, nr_pages);

        while (c) {
                propagate_protected_usage(c, atomic_long_read(&c->usage));
                c = c->parent;
        }
}

/**
 * page_counter_set_low - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * Stores the new low value, then re-evaluates the protected usage of
 * @counter and of each of its ancestors against their current usage.
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *c = counter;

        WRITE_ONCE(counter->low, nr_pages);

        while (c) {
                propagate_protected_usage(c, atomic_long_read(&c->usage));
                c = c->parent;
        }
}

/**
 * page_counter_memparse - memparse() for page counter limits
 * @buf: string to parse
 * @max: string meaning maximum possible value
 * @nr_pages: returns the result in number of pages
 *
 * A @buf equal to @max yields %PAGE_COUNTER_MAX. Otherwise @buf is parsed
 * with memparse() as a byte count and converted to whole pages; trailing
 * characters after the parsed value are rejected.
 *
 * Returns -EINVAL, or 0 and @nr_pages on success.  @nr_pages will be
 * limited to %PAGE_COUNTER_MAX.
 */
int page_counter_memparse(const char *buf, const char *max,
                          unsigned long *nr_pages)
{
        char *rest;
        u64 bytes, pages;

        if (strcmp(buf, max) == 0) {
                *nr_pages = PAGE_COUNTER_MAX;
                return 0;
        }

        bytes = memparse(buf, &rest);
        if (*rest != '\0')
                return -EINVAL;

        pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
        *nr_pages = pages;

        return 0;
}




























































































































    2 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Kernel Electric-Fence (KFENCE). For more info please see
 * Documentation/dev-tools/kfence.rst.
 *
 * Copyright (C) 2020, Google LLC.
 */

#ifndef MM_KFENCE_KFENCE_H
#define MM_KFENCE_KFENCE_H

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#include "../slab.h" /* for struct kmem_cache */

/*
 * Get the canary byte pattern for @addr. Use a pattern that varies based on the
 * lower 3 bits of the address, to detect memory corruptions with higher
 * probability, where similar constants are used.
 *
 * The result is 0xaa XORed with (addr & 0x7); for example, an address whose
 * low three bits are 0b011 yields 0xa9.
 */
#define KFENCE_CANARY_PATTERN_U8(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))

/*
 * Define a continuous 8-byte canary starting from a multiple of 8. The canary
 * of each byte is only related to the lowest three bits of its address, so the
 * canary of every 8 bytes is the same. 64-bit memory can be filled and checked
 * at a time instead of byte by byte to improve performance.
 *
 * le64_to_cpu() makes the in-memory byte at offset i equal to 0xaa ^ i on
 * both little- and big-endian CPUs, matching KFENCE_CANARY_PATTERN_U8() for
 * an 8-byte-aligned start address.
 */
#define KFENCE_CANARY_PATTERN_U64 ((u64)0xaaaaaaaaaaaaaaaa ^ (u64)(le64_to_cpu(0x0706050403020100)))

/*
 * Maximum stack depth for reports; also the capacity of the stack_entries
 * array in struct kfence_track.
 */
#define KFENCE_STACK_DEPTH 64

/* KFENCE object states; stored in the state field of struct kfence_metadata. */
enum kfence_object_state {
        KFENCE_OBJECT_UNUSED,                /* Object is unused. */
        KFENCE_OBJECT_ALLOCATED,        /* Object is currently allocated. */
        KFENCE_OBJECT_FREED,                /* Object was allocated, and then freed. */
};

/*
 * Alloc/free tracking information. One instance each is kept for the
 * allocation and the free in struct kfence_metadata.
 *
 * NOTE(review): the field meanings below are read off the names; confirm
 * against the code that fills this structure in.
 */
struct kfence_track {
        pid_t pid;                        /* presumably the task that did the alloc/free */
        int cpu;                        /* presumably the CPU it ran on */
        u64 ts_nsec;                        /* presumably a timestamp in nanoseconds */
        int num_stack_entries;                /* presumably the count of valid stack_entries[] */
        unsigned long stack_entries[KFENCE_STACK_DEPTH];        /* stack trace storage */
};

/*
 * KFENCE metadata per guarded allocation. The metadata array is indexed by
 * object; see addr_to_metadata() for the mapping from a pool address.
 */
struct kfence_metadata {
        struct list_head list;                /* Freelist node; access under kfence_freelist_lock. */
        struct rcu_head rcu_head;        /* For delayed freeing. */

        /*
         * Lock protecting below data; to ensure consistency of the below data,
         * since the following may execute concurrently: __kfence_alloc(),
         * __kfence_free(), kfence_handle_page_fault(). However, note that we
         * cannot grab the same metadata off the freelist twice, and multiple
         * __kfence_alloc() cannot run concurrently on the same metadata.
         */
        raw_spinlock_t lock;

        /* The current state of the object; see above. */
        enum kfence_object_state state;

        /*
         * Allocated object address; cannot be calculated from size, because of
         * alignment requirements.
         *
         * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant.
         */
        unsigned long addr;

        /*
         * The size of the original allocation.
         */
        size_t size;

        /*
         * The kmem_cache cache of the last allocation; NULL if never allocated
         * or the cache has already been destroyed.
         */
        struct kmem_cache *cache;

        /*
         * In case of an invalid access, the page that was unprotected; we
         * optimistically only store one address.
         */
        unsigned long unprotected_page;

        /* Allocation and free stack information. */
        struct kfence_track alloc_track;
        struct kfence_track free_track;
        /* For updating alloc_covered on frees. */
        u32 alloc_stack_hash;
#ifdef CONFIG_MEMCG_KMEM
        /* Only present with CONFIG_MEMCG_KMEM; purpose not visible here - TODO confirm. */
        struct slabobj_ext obj_exts;
#endif
};

/*
 * Size in bytes of an array of CONFIG_KFENCE_NUM_OBJECTS metadata entries,
 * rounded up to a whole number of pages.
 */
#define KFENCE_METADATA_SIZE PAGE_ALIGN(sizeof(struct kfence_metadata) * \
                                        CONFIG_KFENCE_NUM_OBJECTS)

/* Metadata array, indexed by object; defined outside this header. */
extern struct kfence_metadata *kfence_metadata;

/*
 * Map an address to the metadata entry of the object it belongs to.
 *
 * Returns NULL if @addr is not a KFENCE address, or if the computed object
 * index lies outside [0, CONFIG_KFENCE_NUM_OBJECTS). Each object accounts
 * for two pages of the pool, and the first two-page slot maps to no object
 * (hence the "- 1").
 */
static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
{
        unsigned long offset;
        long idx;

        /* The checks do not affect performance; only called from slow-paths. */

        if (!is_kfence_address((void *)addr))
                return NULL;

        /*
         * The index may be invalid for an address at the edge of
         * __kfence_pool; the caller would then report an "invalid access"
         * error.
         */
        offset = addr - (unsigned long)__kfence_pool;
        idx = offset / (PAGE_SIZE * 2) - 1;
        if (idx >= 0 && idx < CONFIG_KFENCE_NUM_OBJECTS)
                return &kfence_metadata[idx];

        return NULL;
}

/* KFENCE error types for report generation; passed to kfence_report_error(). */
enum kfence_error_type {
        KFENCE_ERROR_OOB,                /* Detected an out-of-bounds access. */
        KFENCE_ERROR_UAF,                /* Detected a use-after-free access. */
        KFENCE_ERROR_CORRUPTION,        /* Detected a memory corruption on free. */
        KFENCE_ERROR_INVALID,                /* Invalid access of unknown type. */
        KFENCE_ERROR_INVALID_FREE,        /* Invalid free. */
};

/*
 * Report an error of the given @type for @address. Defined outside this
 * header.
 * NOTE(review): @meta presumably may be NULL when no object matches - confirm.
 */
void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
                         const struct kfence_metadata *meta, enum kfence_error_type type);

/* Print a description of the object described by @meta into @seq. */
void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);

#endif /* MM_KFENCE_KFENCE_H */
























   29 























































   13 
    3 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_BL_H
#define _LINUX_RCULIST_BL_H

/*
 * RCU-protected bl list version. See include/linux/list_bl.h.
 */
#include <linux/list_bl.h>
#include <linux/rcupdate.h>

/*
 * Publish @n as the first node of @h while keeping the lock bit set.
 *
 * @n must not have the lock bit set in its address, and @h must currently
 * have the lock bit set in h->first; both conditions are asserted. The
 * store goes through rcu_assign_pointer().
 */
static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
                                        struct hlist_bl_node *n)
{
        struct hlist_bl_node *tagged;

        LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
        LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
                                                        LIST_BL_LOCKMASK);

        tagged = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK);
        rcu_assign_pointer(h->first, tagged);
}

/*
 * Return the first node of @h with the lock bit masked off. h->first is read
 * with rcu_dereference_check(), which also accepts the bit lock being held.
 */
static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
{
        unsigned long raw;

        raw = (unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h));

        return (struct hlist_bl_node *)(raw & ~LIST_BL_LOCKMASK);
}

/**
 * hlist_bl_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_bl_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry_rcu().
 */
static inline void hlist_bl_del_rcu(struct hlist_bl_node *n)
{
        __hlist_bl_del(n);
        /* Poison only the back pointer here. */
        n->pprev = LIST_POISON2;
}

/**
 * hlist_bl_add_head_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the head of the specified hlist_bl,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
                                        struct hlist_bl_head *h)
{
        /* Plain read is enough: the caller holds the list lock. */
        struct hlist_bl_node *old_head = hlist_bl_first(h);

        /* Fully link the new node before it becomes reachable. */
        n->next = old_head;
        if (old_head)
                old_head->pprev = &n->next;
        n->pprev = &h->first;

        /* Publish with the _rcu setter: lock-free readers may be running. */
        hlist_bl_set_first_rcu(h, n);
}
/**
 * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * The first node is fetched with hlist_bl_first_rcu() and each following
 * node with rcu_dereference_raw(). @tpos is set from @pos at the start of
 * every iteration, and the loop ends when @pos becomes NULL.
 */
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member)                \
        for (pos = hlist_bl_first_rcu(head);                                \
                pos &&                                                        \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
                pos = rcu_dereference_raw(pos->next))

#endif

































































































































































































































































































































































































   10 





   12 
   12 










   11 































































































    1 






































    1 

















   13 


   12 




    1 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/mempool.c
 *
 *  memory buffer pool support. Such pools are mostly used
 *  for guaranteed, deadlock-free memory allocations during
 *  extreme VM load.
 *
 *  started by Ingo Molnar, Copyright (C) 2001
 *  debugging by David Rientjes, Copyright (C) 2015
 */

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kasan.h>
#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/writeback.h>
#include "slab.h"

#ifdef CONFIG_SLUB_DEBUG_ON
/*
 * Report a poison mismatch found in a pooled element.
 *
 * @pool:    pool the element belongs to
 * @element: start of the checked memory
 * @size:    number of bytes that were checked
 * @byte:    offset of the first mismatching byte
 *
 * Prints the pool, the element and the bytes within BITS_PER_LONG / 8 of the
 * mismatch (clamped to the element), then dumps the stack.
 */
static void poison_error(mempool_t *pool, void *element, size_t size,
                         size_t byte)
{
        const int nr = pool->curr_nr;
        const int first = max_t(int, byte - (BITS_PER_LONG / 8), 0);
        const int last = min_t(int, byte + (BITS_PER_LONG / 8), size);
        const u8 *bytes = element;
        int pos;

        pr_err("BUG: mempool element poison mismatch\n");
        pr_err("Mempool %p size %zu\n", pool, size);
        pr_err(" nr=%d @ %p: %s0x", nr, element, first > 0 ? "... " : "");
        for (pos = first; pos < last; pos++)
                pr_cont("%x ", bytes[pos]);
        pr_cont("%s\n", last < size ? "..." : "");
        dump_stack();
}

/*
 * Verify that @size bytes at @element still carry the free poison: every
 * byte must be POISON_FREE except the last, which must be POISON_END.
 *
 * On the first mismatch the error is reported and the memory is left as is.
 * If everything matches, the whole range is overwritten with POISON_INUSE.
 */
static void __check_element(mempool_t *pool, void *element, size_t size)
{
        const size_t last = size - 1;
        u8 *bytes = element;
        size_t pos;

        for (pos = 0; pos < size; pos++) {
                u8 expected = (pos == last) ? POISON_END : POISON_FREE;

                if (bytes[pos] == expected)
                        continue;

                poison_error(pool, element, size, pos);
                return;
        }

        memset(bytes, POISON_INUSE, size);
}

/*
 * Check the poison of @element according to how @pool is backed, which is
 * determined from the pool's free callback. Pools using any other free
 * callback are not checked.
 */
static void check_element(mempool_t *pool, void *element)
{
        /* Skip checking: KASAN might save its metadata in the element. */
        if (kasan_enabled())
                return;

        /* kmalloc-backed pool: pool_data carries the element size. */
        if (pool->free == mempool_kfree) {
                __check_element(pool, element, (size_t)pool->pool_data);
                return;
        }

        /* Slab-cache-backed pool: the size comes from the cache. */
        if (pool->free == mempool_free_slab) {
                __check_element(pool, element, kmem_cache_size(pool->pool_data));
                return;
        }

        /* Page-allocator-backed pool: pool_data carries the order. */
        if (pool->free == mempool_free_pages) {
                int order = (int)(long)pool->pool_data;
                void *addr = kmap_local_page((struct page *)element);

                __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
                kunmap_local(addr);
        }
}

/*
 * Fill @size bytes at @element with the free poison: POISON_FREE in every
 * byte but the last, which is set to POISON_END.
 */
static void __poison_element(void *element, size_t size)
{
        const size_t last = size - 1;
        u8 *bytes = element;

        memset(bytes, POISON_FREE, last);
        bytes[last] = POISON_END;
}

/*
 * Poison @element according to the backing allocator, which is identified
 * by the pool's alloc callback.  Pools using any other alloc callback are
 * left unpoisoned.
 */
static void poison_element(mempool_t *pool, void *element)
{
        /* Skip poisoning: KASAN might save its metadata in the element. */
        if (kasan_enabled())
                return;

        /* kmalloc-backed pool: pool_data holds the object size. */
        if (pool->alloc == mempool_kmalloc) {
                __poison_element(element, (size_t)pool->pool_data);
                return;
        }

        /* Slab-cache-backed pool: the size comes from the cache. */
        if (pool->alloc == mempool_alloc_slab) {
                __poison_element(element, kmem_cache_size(pool->pool_data));
                return;
        }

        /* Page-allocator-backed pool: pool_data holds the page order. */
        if (pool->alloc == mempool_alloc_pages) {
                int order = (int)(long)pool->pool_data;
                void *mapped = kmap_local_page((struct page *)element);

                __poison_element(mapped, 1UL << (PAGE_SHIFT + order));
                kunmap_local(mapped);
        }
}
#else /* CONFIG_SLUB_DEBUG_ON */
/* No-op variant of check_element() used in this branch of the enclosing #if. */
static inline void check_element(mempool_t *pool, void *element)
{
}
/* No-op variant of poison_element() used in this branch of the enclosing #if. */
static inline void poison_element(mempool_t *pool, void *element)
{
}
#endif /* CONFIG_SLUB_DEBUG_ON */

/*
 * Hand @element to the KASAN mempool poisoning hook that matches the pool's
 * alloc callback and return that hook's result.  Pools with any other alloc
 * callback are not poisoned and true is returned.
 */
static __always_inline bool kasan_poison_element(mempool_t *pool, void *element)
{
        /* Page-backed pool: pool_data is passed on as the page order. */
        if (pool->alloc == mempool_alloc_pages)
                return kasan_mempool_poison_pages(element,
                                                (unsigned long)pool->pool_data);

        /* Slab- or kmalloc-backed pool. */
        if (pool->alloc == mempool_kmalloc || pool->alloc == mempool_alloc_slab)
                return kasan_mempool_poison_object(element);

        return true;
}

/*
 * Undo kasan_poison_element(): call the KASAN mempool unpoisoning hook that
 * matches the pool's alloc callback.  Pools with any other alloc callback
 * are left alone.
 */
static void kasan_unpoison_element(mempool_t *pool, void *element)
{
        /* kmalloc-backed pool: pool_data holds the object size. */
        if (pool->alloc == mempool_kmalloc) {
                kasan_mempool_unpoison_object(element, (size_t)pool->pool_data);
                return;
        }

        /* Slab-cache-backed pool: the size comes from the cache. */
        if (pool->alloc == mempool_alloc_slab) {
                kasan_mempool_unpoison_object(element,
                                              kmem_cache_size(pool->pool_data));
                return;
        }

        /* Page-backed pool: pool_data is passed on as the page order. */
        if (pool->alloc == mempool_alloc_pages)
                kasan_mempool_unpoison_pages(element,
                                             (unsigned long)pool->pool_data);
}

/*
 * Put @element into the pool's reserve array.  The pool must not already
 * hold min_nr elements.  The element is poisoned first; it is only stored
 * (and curr_nr only advanced) when kasan_poison_element() returns true.
 */
static __always_inline void add_element(mempool_t *pool, void *element)
{
        BUG_ON(pool->curr_nr >= pool->min_nr);
        poison_element(pool, element);

        if (!kasan_poison_element(pool, element))
                return;

        pool->elements[pool->curr_nr] = element;
        pool->curr_nr++;
}

/*
 * Take the most recently added element out of the pool's reserve array.
 *
 * The element is unpoisoned for KASAN and its poison pattern is checked
 * before it is returned.  The pool must hold at least one element; an empty
 * (or corrupted, negative) curr_nr triggers BUG().
 */
static void *remove_element(mempool_t *pool)
{
        void *element;

        /*
         * Validate before indexing: checking only after the decrement would
         * already have read pool->elements[-1] for an empty pool.
         */
        BUG_ON(pool->curr_nr <= 0);
        element = pool->elements[--pool->curr_nr];

        kasan_unpoison_element(pool, element);
        check_element(pool, element);
        return element;
}

/**
 * mempool_exit - exit a mempool initialized with mempool_init()
 * @pool:      pointer to the memory pool which was initialized with
 *             mempool_init().
 *
 * Free all reserved elements in @pool and the array that tracks them; @pool
 * itself is not freed.  This function only sleeps if the free_fn() function
 * sleeps.
 *
 * May be called on a zeroed but uninitialized mempool (i.e. allocated with
 * kzalloc()).
 */
void mempool_exit(mempool_t *pool)
{
        /* Drain the reserve, returning each element to the backing allocator. */
        for (;;) {
                void *elem;

                if (!pool->curr_nr)
                        break;
                elem = remove_element(pool);
                pool->free(elem, pool->pool_data);
        }

        /* Release the pointer array and clear it so it cannot be reused. */
        kfree(pool->elements);
        pool->elements = NULL;
}
EXPORT_SYMBOL(mempool_exit);

/**
 * mempool_destroy - deallocate a memory pool
 * @pool:      pointer to the memory pool which was allocated via
 *             mempool_create().
 *
 * Free all reserved elements in @pool and @pool itself.  This function
 * only sleeps if the free_fn() function sleeps.
 */
void mempool_destroy(mempool_t *pool)
{
        /* A NULL pool is accepted and ignored. */
        if (likely(pool)) {
                mempool_exit(pool);
                kfree(pool);
        }
}
EXPORT_SYMBOL(mempool_destroy);

/*
 * Initialize @pool in place: record the callbacks and pool_data, allocate
 * the element pointer array with kmalloc_array_node() using @gfp_mask and
 * @node_id, and pre-fill the reserve up to @min_nr elements via @alloc_fn.
 *
 * curr_nr is not reset here, so the caller presumably passes a zeroed pool
 * (TODO confirm against callers).
 *
 * Returns 0 on success or -ENOMEM if the array or any element cannot be
 * allocated; in the latter case the partially filled pool is torn down with
 * mempool_exit().
 */
int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
                      mempool_free_t *free_fn, void *pool_data,
                      gfp_t gfp_mask, int node_id)
{
        spin_lock_init(&pool->lock);
        pool->min_nr = min_nr;
        pool->pool_data = pool_data;
        pool->alloc = alloc_fn;
        pool->free = free_fn;
        init_waitqueue_head(&pool->wait);

        pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
                                            gfp_mask, node_id);
        if (pool->elements == NULL)
                return -ENOMEM;

        /*
         * First pre-allocate the guaranteed number of buffers.
         */
        while (pool->curr_nr < pool->min_nr) {
                void *elem = pool->alloc(gfp_mask, pool->pool_data);

                if (unlikely(elem == NULL))
                        goto err_exit;
                add_element(pool, elem);
        }

        return 0;

err_exit:
        mempool_exit(pool);
        return -ENOMEM;
}
EXPORT_SYMBOL(mempool_init_node);

/**
 * mempool_init - initialize a memory pool
 * @pool:      pointer to the memory pool that should be initialized
 * @min_nr:    the minimum number of elements guaranteed to be
 *             allocated for this pool.
 * @alloc_fn:  user-defined element-allocation function.
 * @free_fn:   user-defined element-freeing function.
 * @pool_data: optional private data available to the user-defined functions.
 *
 * Like mempool_create(), but initializes the pool in place (i.e. embedded in
 * another structure).
 *
 * Return: %0 on success, negative error code otherwise.
 */
int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
                        mempool_free_t *free_fn, void *pool_data)
{
        int ret;

        /* Same as mempool_init_node() with GFP_KERNEL and no NUMA preference. */
        ret = mempool_init_node(pool, min_nr, alloc_fn, free_fn,
                                pool_data, GFP_KERNEL, NUMA_NO_NODE);
        return ret;
}
EXPORT_SYMBOL(mempool_init_noprof);

/**
 * mempool_create_node - create a memory pool
 * @min_nr:    the minimum number of elements guaranteed to be
 *             allocated for this pool.
 * @alloc_fn:  user-defined element-allocation function.
 * @free_fn:   user-defined element-freeing function.
 * @pool_data: optional private data available to the user-defined functions.
 * @gfp_mask:  memory allocation flags
 * @node_id:   numa node to allocate on
 *
 * this function creates and allocates a guaranteed size, preallocated
 * memory pool. The pool can be used from the mempool_alloc() and mempool_free()
 * functions. This function might sleep. Both the alloc_fn() and the free_fn()
 * functions might sleep - as long as the mempool_alloc() function is not called
 * from IRQ contexts.
 *
 * Return: pointer to the created memory pool object or %NULL on error.
 */
mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn,
                                      mempool_free_t *free_fn, void *pool_data,
                                      gfp_t gfp_mask, int node_id)
{
        mempool_t *pool;
        int err;

        /* The descriptor is zeroed so that mempool_init_node() starts clean. */
        pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
        if (pool == NULL)
                return NULL;

        err = mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data,
                                gfp_mask, node_id);
        if (!err)
                return pool;

        /* Initialization failed: only the descriptor remains to be freed. */
        kfree(pool);
        return NULL;
}
EXPORT_SYMBOL(mempool_create_node_noprof);

/**
 * mempool_resize - resize an existing memory pool
 * @pool:       pointer to the memory pool which was allocated via
 *              mempool_create().
 * @new_min_nr: the new minimum number of elements guaranteed to be
 *              allocated for this pool.
 *
 * This function shrinks/grows the pool. In the case of growing,
 * it cannot be guaranteed that the pool will be grown to the new
 * size immediately, but new mempool_free() calls will refill it.
 * This function may sleep.
 *
 * Note, the caller must guarantee that no mempool_destroy is called
 * while this function is running. mempool_alloc() & mempool_free()
 * might be called (eg. from IRQ contexts) while this function executes.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int mempool_resize(mempool_t *pool, int new_min_nr)
{
        void *element;
        void **new_elements;
        unsigned long flags;

        BUG_ON(new_min_nr <= 0);
        might_sleep();

        spin_lock_irqsave(&pool->lock, flags);
        if (new_min_nr <= pool->min_nr) {
                /*
                 * Shrinking (or same size): release surplus reserved
                 * elements one at a time.  The lock is dropped around each
                 * free_fn() call and retaken before curr_nr is re-read.
                 */
                while (new_min_nr < pool->curr_nr) {
                        element = remove_element(pool);
                        spin_unlock_irqrestore(&pool->lock, flags);
                        pool->free(element, pool->pool_data);
                        spin_lock_irqsave(&pool->lock, flags);
                }
                pool->min_nr = new_min_nr;
                goto out_unlock;
        }
        spin_unlock_irqrestore(&pool->lock, flags);

        /* Grow the pool */
        /* The new pointer array is allocated with the lock dropped. */
        new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
                                     GFP_KERNEL);
        if (!new_elements)
                return -ENOMEM;

        spin_lock_irqsave(&pool->lock, flags);
        /* min_nr may have changed while the lock was not held; re-check. */
        if (unlikely(new_min_nr <= pool->min_nr)) {
                /* Raced, other resize will do our work */
                spin_unlock_irqrestore(&pool->lock, flags);
                kfree(new_elements);
                goto out;
        }
        /* Carry the currently reserved elements over to the larger array. */
        memcpy(new_elements, pool->elements,
                        pool->curr_nr * sizeof(*new_elements));
        kfree(pool->elements);
        pool->elements = new_elements;
        pool->min_nr = new_min_nr;

        /*
         * Top the reserve up to the new minimum.  Each element is allocated
         * with the lock dropped, so the fill level is re-checked before the
         * element is added.
         */
        while (pool->curr_nr < pool->min_nr) {
                spin_unlock_irqrestore(&pool->lock, flags);
                element = pool->alloc(GFP_KERNEL, pool->pool_data);
                /*
                 * Allocation failure is not reported as an error: the lock is
                 * not held here, so jump past the unlock and return 0.
                 */
                if (!element)
                        goto out;
                spin_lock_irqsave(&pool->lock, flags);
                if (pool->curr_nr < pool->min_nr) {
                        add_element(pool, element);
                } else {
                        spin_unlock_irqrestore(&pool->lock, flags);
                        pool->free(element, pool->pool_data);        /* Raced */
                        goto out;
                }
        }
out_unlock:
        spin_unlock_irqrestore(&pool->lock, flags);
out:
        return 0;
}
EXPORT_SYMBOL(mempool_resize);

/**
 * mempool_alloc - allocate an element from a specific memory pool
 * @pool:      pointer to the memory pool which was allocated via
 *             mempool_create().
 * @gfp_mask:  the usual allocation bitmask.
 *
 * this function only sleeps if the alloc_fn() function sleeps or
 * returns NULL. Note that due to preallocation, this function
 * *never* fails when called from process contexts. (it might
 * fail if called from an IRQ context.)
 * Note: using __GFP_ZERO is not supported.
 *
 * Return: pointer to the allocated element or %NULL on error.
 */
void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask)
{
        void *element;
        unsigned long flags;
        wait_queue_entry_t wait;
        gfp_t gfp_temp;

        VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
        might_alloc(gfp_mask);

        gfp_mask |= __GFP_NOMEMALLOC;        /* don't allocate emergency reserves */
        gfp_mask |= __GFP_NORETRY;        /* don't loop in __alloc_pages */
        gfp_mask |= __GFP_NOWARN;        /* failures are OK */

        /* First round: same mask with direct reclaim and IO stripped. */
        gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);

repeat_alloc:

        /* Always try the underlying allocator before touching the reserve. */
        element = pool->alloc(gfp_temp, pool->pool_data);
        if (likely(element != NULL))
                return element;

        /* Underlying allocation failed: fall back to the reserved elements. */
        spin_lock_irqsave(&pool->lock, flags);
        if (likely(pool->curr_nr)) {
                element = remove_element(pool);
                spin_unlock_irqrestore(&pool->lock, flags);
                /* paired with rmb in mempool_free(), read comment there */
                smp_wmb();
                /*
                 * Update the allocation stack trace as this is more useful
                 * for debugging.
                 */
                kmemleak_update_trace(element);
                return element;
        }

        /*
         * We use gfp mask w/o direct reclaim or IO for the first round.  If
         * alloc failed with that and @pool was empty, retry immediately.
         */
        if (gfp_temp != gfp_mask) {
                spin_unlock_irqrestore(&pool->lock, flags);
                gfp_temp = gfp_mask;
                goto repeat_alloc;
        }

        /* We must not sleep if !__GFP_DIRECT_RECLAIM */
        if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
                spin_unlock_irqrestore(&pool->lock, flags);
                return NULL;
        }

        /* Let's wait for someone else to return an element to @pool */
        /* The waiter is queued while pool->lock is still held. */
        init_wait(&wait);
        prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);

        spin_unlock_irqrestore(&pool->lock, flags);

        /*
         * FIXME: this should be io_schedule().  The timeout is there as a
         * workaround for some DM problems in 2.6.18.
         */
        io_schedule_timeout(5*HZ);

        /* Woken or timed out: dequeue and start over from the allocator. */
        finish_wait(&pool->wait, &wait);
        goto repeat_alloc;
}
EXPORT_SYMBOL(mempool_alloc_noprof);

/**
 * mempool_alloc_preallocated - allocate an element from preallocated elements
 *                              belonging to a specific memory pool
 * @pool:      pointer to the memory pool which was allocated via
 *             mempool_create().
 *
 * This function is similar to mempool_alloc, but it only attempts allocating
 * an element from the preallocated elements. It does not sleep and immediately
 * returns if no preallocated elements are available.
 *
 * Return: pointer to the allocated element or %NULL if no elements are
 * available.
 */
void *mempool_alloc_preallocated(mempool_t *pool)
{
        unsigned long flags;
        void *elem;

        spin_lock_irqsave(&pool->lock, flags);

        /* Nothing reserved: give up immediately without sleeping. */
        if (unlikely(!pool->curr_nr)) {
                spin_unlock_irqrestore(&pool->lock, flags);
                return NULL;
        }

        elem = remove_element(pool);
        spin_unlock_irqrestore(&pool->lock, flags);

        /* paired with rmb in mempool_free(), read comment there */
        smp_wmb();

        /*
         * Refresh the allocation stack trace: the point where the element
         * leaves the pool is more useful for debugging.
         */
        kmemleak_update_trace(elem);
        return elem;
}
EXPORT_SYMBOL(mempool_alloc_preallocated);

/**
 * mempool_free - return an element to the pool.
 * @element:   pool element pointer.
 * @pool:      pointer to the memory pool which was allocated via
 *             mempool_create().
 *
 * this function only sleeps if the free_fn() function sleeps.
 */
void mempool_free(void *element, mempool_t *pool)
{
        unsigned long flags;

        /* Freeing NULL is a no-op. */
        if (unlikely(element == NULL))
                return;

        /*
         * Paired with the wmb in mempool_alloc().  The preceding read is
         * for @element and the following @pool->curr_nr.  This ensures
         * that the visible value of @pool->curr_nr is from after the
         * allocation of @element.  This is necessary for fringe cases
         * where @element was passed to this task without going through
         * barriers.
         *
         * For example, assume @p is %NULL at the beginning and one task
         * performs "p = mempool_alloc(...);" while another task is doing
         * "while (!p) cpu_relax(); mempool_free(p, ...);".  This function
         * may end up using curr_nr value which is from before allocation
         * of @p without the following rmb.
         */
        smp_rmb();

        /*
         * For correctness, we need a test which is guaranteed to trigger
         * if curr_nr + #allocated == min_nr.  Testing curr_nr < min_nr
         * without locking achieves that and refilling as soon as possible
         * is desirable.
         *
         * Because curr_nr visible here is always a value after the
         * allocation of @element, any task which decremented curr_nr below
         * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets
         * incremented to min_nr afterwards.  If curr_nr gets incremented
         * to min_nr after the allocation of @element, the elements
         * allocated after that are subject to the same guarantee.
         *
         * Waiters happen iff curr_nr is 0 and the above guarantee also
         * ensures that there will be frees which return elements to the
         * pool waking up the waiters.
         */
        if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
                spin_lock_irqsave(&pool->lock, flags);
                /* Re-check under the lock; the lockless read may be stale. */
                if (likely(pool->curr_nr < pool->min_nr)) {
                        add_element(pool, element);
                        spin_unlock_irqrestore(&pool->lock, flags);
                        /* Wake tasks sleeping on pool->wait in mempool_alloc(). */
                        wake_up(&pool->wait);
                        return;
                }
                spin_unlock_irqrestore(&pool->lock, flags);
        }
        /* Reserve is full: hand the element back to the underlying allocator. */
        pool->free(element, pool->pool_data);
}
EXPORT_SYMBOL(mempool_free);

/*
 * A commonly used alloc and free fn.
 */
/* Allocate one object from the kmem_cache passed as @pool_data. */
void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
{
        struct kmem_cache *cache = pool_data;

        /* Caches with a constructor are rejected. */
        VM_BUG_ON(cache->ctor);

        return kmem_cache_alloc_noprof(cache, gfp_mask);
}
EXPORT_SYMBOL(mempool_alloc_slab);

/* Return @element to the kmem_cache passed as @pool_data. */
void mempool_free_slab(void *element, void *pool_data)
{
        struct kmem_cache *cache = pool_data;

        kmem_cache_free(cache, element);
}
EXPORT_SYMBOL(mempool_free_slab);

/*
 * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
 * specified by pool_data
 */
/* kmalloc an element; @pool_data carries the byte size as a pointer value. */
void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
{
        size_t nbytes = (size_t)pool_data;

        return kmalloc_noprof(nbytes, gfp_mask);
}
EXPORT_SYMBOL(mempool_kmalloc);

/* Free an element with kfree(); @pool_data is not used. */
void mempool_kfree(void *element, void *pool_data)
{
        kfree(element);
}
EXPORT_SYMBOL(mempool_kfree);

/* kvmalloc an element; @pool_data carries the byte size as a pointer value. */
void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data)
{
        size_t nbytes = (size_t)pool_data;

        return kvmalloc(nbytes, gfp_mask);
}
EXPORT_SYMBOL(mempool_kvmalloc);

/* Free an element with kvfree(); @pool_data is not used. */
void mempool_kvfree(void *element, void *pool_data)
{
        kvfree(element);
}
EXPORT_SYMBOL(mempool_kvfree);

/*
 * A simple mempool-backed page allocator that allocates pages
 * of the order specified by pool_data.
 */
/* Allocate pages; @pool_data carries the allocation order as a pointer value. */
void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
{
        int page_order = (int)(long)pool_data;

        return alloc_pages_noprof(gfp_mask, page_order);
}
EXPORT_SYMBOL(mempool_alloc_pages);

/* Free pages; @pool_data carries the allocation order as a pointer value. */
void mempool_free_pages(void *element, void *pool_data)
{
        int page_order = (int)(long)pool_data;

        __free_pages(element, page_order);
}
EXPORT_SYMBOL(mempool_free_pages);


















































    1 












    1 











































































    1 













    1 






















    1 











    1 



































































































    1 













    1 


    1 


    1 


















































    1 






    1 












































































    1 










    1 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/minix/dir.c
 *
 *  Copyright (C) 1991, 1992 Linus Torvalds
 *
 *  minix directory handling functions
 *
 *  Updated to filesystem version 3 by Daniel Aragones
 */

#include "minix.h"
#include <linux/buffer_head.h>
#include <linux/highmem.h>
#include <linux/swap.h>

typedef struct minix_dir_entry minix_dirent;
typedef struct minix3_dir_entry minix3_dirent;

static int minix_readdir(struct file *, struct dir_context *);

/*
 * Operations table exported as minix_dir_operations: iteration is handled by
 * minix_readdir(); seek, read and fsync use the generic helpers.
 */
const struct file_operations minix_dir_operations = {
        .read = generic_read_dir,
        .llseek = generic_file_llseek,
        .fsync = generic_file_fsync,
        .iterate_shared = minix_readdir,
};

/*
 * Return the offset into page `page_nr' of the last valid
 * byte in that page, plus one.
 */
static unsigned
minix_last_byte(struct inode *inode, unsigned long page_nr)
{
        /* Every page other than the one containing i_size is fully valid. */
        if (page_nr != (inode->i_size >> PAGE_SHIFT))
                return PAGE_SIZE;

        /* Page containing i_size: valid up to the in-page offset of i_size. */
        return inode->i_size & (PAGE_SIZE - 1);
}

/*
 * Finish a write of @len bytes at file offset @pos within @page: complete
 * the block write, grow the directory's i_size if the write went past it
 * (marking the inode dirty), and unlock the page.
 */
static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
{
        struct address_space *mapping = page->mapping;
        struct inode *dir = mapping->host;
        loff_t end = pos + len;

        block_write_end(NULL, mapping, pos, len, len, page, NULL);

        if (end > dir->i_size) {
                i_size_write(dir, end);
                mark_inode_dirty(dir);
        }

        unlock_page(page);
}

/*
 * Write out and wait on the directory's page cache, then sync its inode
 * metadata.  Returns the first error encountered, or 0.
 */
static int minix_handle_dirsync(struct inode *dir)
{
        int ret = filemap_write_and_wait(dir->i_mapping);

        if (ret)
                return ret;
        return sync_inode_metadata(dir, 1);
}

/*
 * Read page @n of directory @dir and map it.  On success the page is stored
 * in *@p and the mapped address is returned; on failure the read error is
 * returned as an ERR_PTR and *@p is left untouched.
 */
static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p)
{
        struct page *pg = read_mapping_page(dir->i_mapping, n, NULL);

        if (IS_ERR(pg))
                return ERR_CAST(pg);

        *p = pg;
        return kmap_local_page(pg);
}

/* Step from directory entry @de to the one s_dirsize bytes further on. */
static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
{
        char *cur = de;

        return cur + sbi->s_dirsize;
}

/*
 * Emit the entries of a directory to @ctx, starting at ctx->pos rounded up
 * to a whole entry.  Entries with a zero inode number are skipped.  Always
 * returns 0; pages that cannot be read are skipped without reporting an
 * error.
 */
static int minix_readdir(struct file *file, struct dir_context *ctx)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        struct minix_sb_info *sbi = minix_sb(sb);
        unsigned chunk_size = sbi->s_dirsize;
        unsigned long npages = dir_pages(inode);
        unsigned long pos = ctx->pos;
        unsigned offset;
        unsigned long n;

        /* Round the position up to an entry boundary and publish it. */
        ctx->pos = pos = ALIGN(pos, chunk_size);
        if (pos >= inode->i_size)
                return 0;

        /* Split the position into a page index and an offset in that page. */
        offset = pos & ~PAGE_MASK;
        n = pos >> PAGE_SHIFT;

        /* Only the first page starts mid-page; later pages start at 0. */
        for ( ; n < npages; n++, offset = 0) {
                char *p, *kaddr, *limit;
                struct page *page;

                kaddr = dir_get_page(inode, n, &page);
                if (IS_ERR(kaddr))
                        continue;
                p = kaddr+offset;
                /* Last address at which a whole entry still fits. */
                limit = kaddr + minix_last_byte(inode, n) - chunk_size;
                for ( ; p <= limit; p = minix_next_entry(p, sbi)) {
                        const char *name;
                        __u32 inumber;
                        /* The entry layout depends on the filesystem version. */
                        if (sbi->s_version == MINIX_V3) {
                                minix3_dirent *de3 = (minix3_dirent *)p;
                                name = de3->name;
                                inumber = de3->inode;
                         } else {
                                minix_dirent *de = (minix_dirent *)p;
                                name = de->name;
                                inumber = de->inode;
                        }
                        if (inumber) {
                                /* Names are bounded by s_namelen. */
                                unsigned l = strnlen(name, sbi->s_namelen);
                                if (!dir_emit(ctx, name, l,
                                              inumber, DT_UNKNOWN)) {
                                        /*
                                         * Emit refused: stop with ctx->pos
                                         * still pointing at this entry.
                                         */
                                        unmap_and_put_page(page, p);
                                        return 0;
                                }
                        }
                        ctx->pos += chunk_size;
                }
                unmap_and_put_page(page, kaddr);
        }
        return 0;
}

/*
 * Compare a lookup name of @len bytes with the name stored in a directory
 * slot of @maxlen bytes.  Returns 1 on a match and 0 otherwise.
 *
 * A stored name shorter than the slot is NUL-terminated, so when @len is
 * less than @maxlen the byte following the first @len bytes of @buffer must
 * be NUL; otherwise the stored name is longer and cannot match.
 */
static inline int namecompare(int len, int maxlen,
        const char * name, const char * buffer)
{
        int shorter_than_slot = len < maxlen;

        /* buffer[len] is only inspected when it lies inside the slot. */
        if (shorter_than_slot && buffer[len] != '\0')
                return 0;

        return memcmp(name, buffer, len) == 0;
}

/*
 *        minix_find_entry()
 *
 * finds an entry in the specified directory with the wanted name. It
 * returns the entry itself, and the page in which the entry was found
 * (through the res_page parameter). It does NOT read the inode of the
 * entry - you'll have to do that yourself if you want to.
 */
/*
 * Search the parent directory of @dentry for an entry whose name matches
 * the dentry name.  Returns a pointer to the entry inside the mapped page
 * and stores that page in *@res_page, or returns NULL with *@res_page set
 * to NULL when nothing matches.  Pages that cannot be read are skipped.
 */
minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page)
{
        const char * name = dentry->d_name.name;
        int namelen = dentry->d_name.len;
        struct inode * dir = d_inode(dentry->d_parent);
        struct super_block * sb = dir->i_sb;
        struct minix_sb_info * sbi = minix_sb(sb);
        unsigned long n;
        unsigned long npages = dir_pages(dir);
        struct page *page = NULL;
        char *p;

        char *namx;
        __u32 inumber;
        *res_page = NULL;

        for (n = 0; n < npages; n++) {
                char *kaddr, *limit;

                kaddr = dir_get_page(dir, n, &page);
                if (IS_ERR(kaddr))
                        continue;

                /* Last address at which a whole entry still fits. */
                limit = kaddr + minix_last_byte(dir, n) - sbi->s_dirsize;
                for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
                        /* The entry layout depends on the filesystem version. */
                        if (sbi->s_version == MINIX_V3) {
                                minix3_dirent *de3 = (minix3_dirent *)p;
                                namx = de3->name;
                                inumber = de3->inode;
                         } else {
                                minix_dirent *de = (minix_dirent *)p;
                                namx = de->name;
                                inumber = de->inode;
                        }
                        /* A zero inode number marks an unused slot. */
                        if (!inumber)
                                continue;
                        if (namecompare(namelen, sbi->s_namelen, name, namx))
                                goto found;
                }
                unmap_and_put_page(page, kaddr);
        }
        return NULL;

found:
        /*
         * The page is deliberately left mapped and referenced here;
         * presumably the caller releases it (TODO confirm against callers).
         */
        *res_page = page;
        return (minix_dirent *)p;
}

/*
 * Add an entry named after @dentry and pointing at @inode to the parent
 * directory of @dentry.  The first unused slot is reused, or the directory
 * is extended at i_size.
 *
 * Returns 0 on success, -EEXIST if an entry with the same name is already
 * present, or the error from dir_get_page(), minix_prepare_chunk() or
 * minix_handle_dirsync().
 */
int minix_add_link(struct dentry *dentry, struct inode *inode)
{
        struct inode *dir = d_inode(dentry->d_parent);
        const char * name = dentry->d_name.name;
        int namelen = dentry->d_name.len;
        struct super_block * sb = dir->i_sb;
        struct minix_sb_info * sbi = minix_sb(sb);
        struct page *page = NULL;
        unsigned long npages = dir_pages(dir);
        unsigned long n;
        char *kaddr, *p;
        minix_dirent *de;
        minix3_dirent *de3;
        loff_t pos;
        int err;
        char *namx = NULL;
        __u32 inumber;

        /*
         * We take care of directory expansion in the same loop
         * This code plays outside i_size, so it locks the page
         * to protect that region.
         */
        /* Note "n <= npages": one page past the current end is visited too. */
        for (n = 0; n <= npages; n++) {
                char *limit, *dir_end;

                kaddr = dir_get_page(dir, n, &page);
                if (IS_ERR(kaddr))
                        return PTR_ERR(kaddr);
                lock_page(page);
                /* dir_end: first byte past the valid data in this page. */
                dir_end = kaddr + minix_last_byte(dir, n);
                /* The scan covers the whole page, not only the valid part. */
                limit = kaddr + PAGE_SIZE - sbi->s_dirsize;
                for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
                        /* Both views alias the same slot; only one is used. */
                        de = (minix_dirent *)p;
                        de3 = (minix3_dirent *)p;
                        if (sbi->s_version == MINIX_V3) {
                                namx = de3->name;
                                inumber = de3->inode;
                         } else {
                                  namx = de->name;
                                inumber = de->inode;
                        }
                        if (p == dir_end) {
                                /* We hit i_size */
                                if (sbi->s_version == MINIX_V3)
                                        de3->inode = 0;
                                 else
                                        de->inode = 0;
                                goto got_it;
                        }
                        /* A zero inode number marks a reusable slot. */
                        if (!inumber)
                                goto got_it;
                        err = -EEXIST;
                        if (namecompare(namelen, sbi->s_namelen, name, namx))
                                goto out_unlock;
                }
                unlock_page(page);
                unmap_and_put_page(page, kaddr);
        }
        BUG();
        return -EINVAL;

got_it:
        /* Reached with the page locked and mapped; p/namx point at the slot. */
        pos = page_offset(page) + offset_in_page(p);
        err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
        if (err)
                goto out_unlock;
        memcpy (namx, name, namelen);
        /*
         * Zero the rest of the name field; 4 resp. 2 is subtracted from the
         * entry size to exclude the inode field.
         */
        if (sbi->s_version == MINIX_V3) {
                memset (namx + namelen, 0, sbi->s_dirsize - namelen - 4);
                de3->inode = inode->i_ino;
        } else {
                memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2);
                de->inode = inode->i_ino;
        }
        /* dir_commit_chunk() unlocks the page. */
        dir_commit_chunk(page, pos, sbi->s_dirsize);
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        mark_inode_dirty(dir);
        err = minix_handle_dirsync(dir);
out_put:
        unmap_and_put_page(page, kaddr);
        return err;
out_unlock:
        /* Error paths that still hold the page lock come through here. */
        unlock_page(page);
        goto out_put;
}

/*
 * Mark directory entry @de, located in @page, as unused by zeroing its inode
 * number, then update the directory's timestamps and sync it.
 *
 * Returns 0 on success or the error from minix_prepare_chunk() /
 * minix_handle_dirsync().  The page lock taken here is released on every
 * path.
 */
int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
{
        struct inode *dir = page->mapping->host;
        struct minix_sb_info *sbi = minix_sb(dir->i_sb);
        loff_t pos = page_offset(page) + offset_in_page(de);
        unsigned chunk = sbi->s_dirsize;
        int ret;

        lock_page(page);
        ret = minix_prepare_chunk(page, pos, chunk);
        if (ret != 0) {
                unlock_page(page);
                return ret;
        }

        /* The inode field layout depends on the filesystem version. */
        if (sbi->s_version != MINIX_V3)
                de->inode = 0;
        else
                ((minix3_dirent *)de)->inode = 0;

        /* dir_commit_chunk() unlocks the page. */
        dir_commit_chunk(page, pos, chunk);
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        mark_inode_dirty(dir);

        return minix_handle_dirsync(dir);
}

int minix_make_empty(struct inode *inode, struct inode *dir)
{
        struct page *page = grab_cache_page(inode->i_mapping, 0);
        struct minix_sb_info *sbi = minix_sb(inode->i_sb);
        char *kaddr;
        int err;

        if (!page)
                return -ENOMEM;
        err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize);
        if (err) {
                unlock_page(page);
                goto fail;
        }

        kaddr = kmap_local_page(page);
        memset(kaddr, 0, PAGE_SIZE);

        if (sbi->s_version == MINIX_V3) {
                minix3_dirent *de3 = (minix3_dirent *)kaddr;

                de3->inode = inode->i_ino;
                strcpy(de3->name, ".");
                de3 = minix_next_entry(de3, sbi);
                de3->inode = dir->i_ino;
                strcpy(de3->name, "..");
        } else {
                minix_dirent *de = (minix_dirent *)kaddr;

                de->inode = inode->i_ino;
                strcpy(de->name, ".");
                de = minix_next_entry(de, sbi);
                de->inode = dir->i_ino;
                strcpy(de->name, "..");
        }
        kunmap_local(kaddr);

        dir_commit_chunk(page, 0, 2 * sbi->s_dirsize);
        err = minix_handle_dirsync(inode);
fail:
        put_page(page);
        return err;
}

/*
 * Check whether a directory holds nothing but "." and ".." (for rmdir).
 *
 * Returns 1 when every in-use entry is either "." referring to @inode
 * itself or "..", and 0 as soon as any other in-use entry is found.
 * Pages for which dir_get_page() returns an error are skipped.
 */
int minix_empty_dir(struct inode * inode)
{
        struct page *page = NULL;
        unsigned long n, npages = dir_pages(inode);
        struct minix_sb_info *sbi = minix_sb(inode->i_sb);
        char *kaddr;

        for (n = 0; n < npages; n++) {
                char *ent, *last;

                kaddr = dir_get_page(inode, n, &page);
                if (IS_ERR(kaddr))
                        continue;

                /* Highest address at which a whole entry still fits. */
                last = kaddr + minix_last_byte(inode, n) - sbi->s_dirsize;
                for (ent = kaddr; ent <= last; ent = minix_next_entry(ent, sbi)) {
                        const char *name;
                        __u32 inumber;

                        if (sbi->s_version != MINIX_V3) {
                                minix_dirent *de = (minix_dirent *)ent;

                                name = de->name;
                                inumber = de->inode;
                        } else {
                                minix3_dirent *de3 = (minix3_dirent *)ent;

                                name = de3->name;
                                inumber = de3->inode;
                        }

                        /* A zero inode number marks an unused slot. */
                        if (!inumber)
                                continue;

                        if (name[0] != '.')
                                goto not_empty;
                        if (!name[1]) {
                                /* "." must refer to this directory. */
                                if (inumber != inode->i_ino)
                                        goto not_empty;
                                continue;
                        }
                        /* Anything other than exactly ".." is a real entry. */
                        if (name[1] != '.' || name[2])
                                goto not_empty;
                }
                unmap_and_put_page(page, kaddr);
        }
        return 1;

not_empty:
        unmap_and_put_page(page, kaddr);
        return 0;
}

/*
 * Point the existing directory entry @de (located in @page) at @inode and
 * refresh the mtime/ctime of the directory that owns the page.
 *
 * Returns 0 on success, or the error from minix_prepare_chunk() or
 * minix_handle_dirsync().
 *
 * NOTE(review): an older comment here read "Releases the page", yet this
 * function contains no put/unmap of @page and its error path only unlocks
 * it -- confirm who drops the page reference.
 */
int minix_set_link(struct minix_dir_entry *de, struct page *page,
                struct inode *inode)
{
        struct inode *dir = page->mapping->host;
        struct minix_sb_info *sbi = minix_sb(dir->i_sb);
        loff_t pos = page_offset(page) + offset_in_page(de);
        int err;

        lock_page(page);
        err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
        if (err)
                goto out_unlock;

        /* V3 entries use a different on-disk layout for the inode field. */
        if (sbi->s_version != MINIX_V3)
                de->inode = inode->i_ino;
        else
                ((minix3_dirent *)de)->inode = inode->i_ino;

        dir_commit_chunk(page, pos, sbi->s_dirsize);
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        mark_inode_dirty(dir);
        return minix_handle_dirsync(dir);

out_unlock:
        unlock_page(page);
        return err;
}

/*
 * Return the second entry of page 0 of @dir (the slot that
 * minix_make_empty() fills with ".."), storing the page in *@p.
 * Returns NULL when dir_get_page() fails.
 */
struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p)
{
        struct minix_sb_info *sbi = minix_sb(dir->i_sb);
        struct minix_dir_entry *first = dir_get_page(dir, 0, p);

        if (IS_ERR(first))
                return NULL;

        /* Step over the first slot to reach the second one. */
        return minix_next_entry(first, sbi);
}

/*
 * Look up @dentry via minix_find_entry() and return the inode number
 * stored in the matching directory entry, or 0 when no entry is found.
 */
ino_t minix_inode_by_name(struct dentry *dentry)
{
        struct page *page;
        struct minix_dir_entry *de = minix_find_entry(dentry, &page);
        struct minix_sb_info *sbi;
        ino_t ino;

        if (!de)
                return 0;

        sbi = minix_sb(page->mapping->host->i_sb);
        /* V3 entries use a different on-disk layout for the inode field. */
        ino = (sbi->s_version == MINIX_V3) ? ((minix3_dirent *) de)->inode
                                           : de->inode;
        unmap_and_put_page(page, de);
        return ino;
}





































































































    1 


    1 



























































































































    4 



    3 
    1 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
// SPDX-License-Identifier: GPL-2.0
/*
 * Supplementary group IDs
 */
#include <linux/cred.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/sort.h>
#include <linux/syscalls.h>
#include <linux/user_namespace.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>

/*
 * Allocate a group_info with room for @gidsetsize gids.  The usage count
 * starts at 1 and ngroups is set to @gidsetsize; the gid array itself is
 * left for the caller to fill in.  Returns NULL on allocation failure.
 */
struct group_info *groups_alloc(int gidsetsize)
{
        struct group_info *gi;

        gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT);
        if (gi) {
                refcount_set(&gi->usage, 1);
                gi->ngroups = gidsetsize;
        }

        return gi;
}

EXPORT_SYMBOL(groups_alloc);

/*
 * Release the memory of @group_info with kvfree(), matching the kvmalloc()
 * used by groups_alloc().  The usage count is not consulted here.
 */
void groups_free(struct group_info *group_info)
{
        kvfree(group_info);
}

EXPORT_SYMBOL(groups_free);

/*
 * Copy every gid of @group_info to the user-space array @grouplist,
 * translating each one into the caller's user namespace first.
 * Returns 0 on success or -EFAULT if a store to user space fails.
 */
static int groups_to_user(gid_t __user *grouplist,
                          const struct group_info *group_info)
{
        struct user_namespace *ns = current_user_ns();
        const unsigned int total = group_info->ngroups;
        unsigned int idx;

        for (idx = 0; idx < total; idx++) {
                gid_t gid = from_kgid_munged(ns, group_info->gid[idx]);

                if (put_user(gid, grouplist + idx))
                        return -EFAULT;
        }

        return 0;
}

/*
 * Fill the already-allocated @group_info from the user-space array
 * @grouplist; group_info->ngroups says how many gids to read.
 * Returns 0 on success, -EFAULT if a read from user space fails, or
 * -EINVAL if a gid has no valid mapping in the caller's user namespace.
 */
static int groups_from_user(struct group_info *group_info,
    gid_t __user *grouplist)
{
        struct user_namespace *ns = current_user_ns();
        const unsigned int total = group_info->ngroups;
        unsigned int idx;

        for (idx = 0; idx < total; idx++) {
                gid_t raw;
                kgid_t mapped;

                if (get_user(raw, grouplist + idx))
                        return -EFAULT;

                mapped = make_kgid(ns, raw);
                if (!gid_valid(mapped))
                        return -EINVAL;

                group_info->gid[idx] = mapped;
        }

        return 0;
}

/* Comparator for sort(): orders two kgid_t values, yielding 1, 0 or -1. */
static int gid_cmp(const void *_a, const void *_b)
{
        const kgid_t lhs = *(const kgid_t *)_a;
        const kgid_t rhs = *(const kgid_t *)_b;

        if (gid_gt(lhs, rhs))
                return 1;
        if (gid_lt(lhs, rhs))
                return -1;
        return 0;
}

void groups_sort(struct group_info *group_info)
{
        sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
             gid_cmp, NULL);
}
EXPORT_SYMBOL(groups_sort);

/*
 * Binary search for @grp in the gid array of @group_info.
 * Returns 1 when found, 0 when absent or when @group_info is NULL.
 * The halving only works on an ordered array, cf. groups_sort().
 */
int groups_search(const struct group_info *group_info, kgid_t grp)
{
        unsigned int lo, hi;

        if (!group_info)
                return 0;

        /* Search the half-open window [lo, hi). */
        for (lo = 0, hi = group_info->ngroups; lo < hi; ) {
                const unsigned int mid = (lo + hi) / 2;

                if (gid_gt(grp, group_info->gid[mid])) {
                        lo = mid + 1;
                        continue;
                }
                if (gid_lt(grp, group_info->gid[mid])) {
                        hi = mid;
                        continue;
                }
                return 1;
        }

        return 0;
}

/**
 * set_groups - Change a group subscription in a set of credentials
 * @new: The newly prepared set of credentials to alter
 * @group_info: The group list to install
 *
 * Calls put_group_info() on the list @new currently points at and
 * get_group_info() on @group_info before storing it in @new, so the caller
 * keeps whatever reference it already holds on @group_info (setgroups()
 * in this file drops its own reference afterwards).
 */
void set_groups(struct cred *new, struct group_info *group_info)
{
        /*
         * NOTE(review): the old list is put before the new one is gotten;
         * if both are the same object this presumably relies on the caller
         * holding a reference of its own -- confirm.
         */
        put_group_info(new->group_info);
        get_group_info(group_info);
        new->group_info = group_info;
}

EXPORT_SYMBOL(set_groups);

/**
 * set_current_groups - Change current's group subscription
 * @group_info: The group list to impose
 *
 * Prepare a new set of credentials, install @group_info in it, let the
 * security layer check the change and then commit it to the current task.
 *
 * Return: -ENOMEM if no credentials could be prepared, the negative value
 * from security_task_fix_setgroups() if it objects (the prepared
 * credentials are aborted in that case), otherwise the result of
 * commit_creds().
 */
int set_current_groups(struct group_info *group_info)
{
        const struct cred *old;
        struct cred *new;
        int retval;

        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        old = current_cred();
        set_groups(new, group_info);

        retval = security_task_fix_setgroups(new, old);
        if (retval < 0) {
                /* Throw the prepared credentials away on refusal. */
                abort_creds(new);
                return retval;
        }

        return commit_creds(new);
}

EXPORT_SYMBOL(set_current_groups);

/*
 * getgroups(2): return the number of supplementary groups of the current
 * task and, when @gidsetsize is non-zero, copy them to @grouplist.
 *
 * Returns -EINVAL for a negative @gidsetsize or one smaller than the
 * number of groups, -EFAULT if the copy to user space fails.
 */
SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
{
        const struct cred *cred = current_cred();
        int ngroups;

        if (gidsetsize < 0)
                return -EINVAL;

        /* no need to grab task_lock here; it cannot change */
        ngroups = cred->group_info->ngroups;

        /* A size of zero only asks for the count. */
        if (!gidsetsize)
                return ngroups;

        if (ngroups > gidsetsize)
                return -EINVAL;
        if (groups_to_user(grouplist, cred->group_info))
                return -EFAULT;

        return ngroups;
}

bool may_setgroups(void)
{
        struct user_namespace *user_ns = current_user_ns();

        return ns_capable_setid(user_ns, CAP_SETGID) &&
                userns_may_setgroups(user_ns);
}

/*
 *        SMP: Our groups are copy-on-write. We can set them safely
 *        without another task interfering.
 *
 * setgroups(2): replace the supplementary group list of the current task
 * with the @gidsetsize gids read from @grouplist.
 *
 * Returns -EPERM if may_setgroups() refuses, -EINVAL if @gidsetsize
 * exceeds NGROUPS_MAX, -ENOMEM on allocation failure, otherwise the error
 * from groups_from_user() or the result of set_current_groups().
 */

SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
{
        struct group_info *group_info;
        int retval;

        if (!may_setgroups())
                return -EPERM;
        /* The unsigned cast also rejects negative sizes. */
        if ((unsigned)gidsetsize > NGROUPS_MAX)
                return -EINVAL;

        group_info = groups_alloc(gidsetsize);
        if (!group_info)
                return -ENOMEM;

        retval = groups_from_user(group_info, grouplist);
        if (!retval) {
                groups_sort(group_info);
                retval = set_current_groups(group_info);
        }

        /* Drop our own reference on both the success and failure path. */
        put_group_info(group_info);
        return retval;
}

/*
 * Check whether @grp is the current task's fsgid or one of its
 * supplementary groups.  Returns 1 if so, 0 otherwise.
 */
int in_group_p(kgid_t grp)
{
        const struct cred *cred = current_cred();

        if (gid_eq(grp, cred->fsgid))
                return 1;

        return groups_search(cred->group_info, grp);
}

EXPORT_SYMBOL(in_group_p);

/*
 * Check whether @grp is the current task's egid or one of its
 * supplementary groups.  Returns 1 if so, 0 otherwise.
 */
int in_egroup_p(kgid_t grp)
{
        const struct cred *cred = current_cred();

        if (gid_eq(grp, cred->egid))
                return 1;

        return groups_search(cred->group_info, grp);
}

EXPORT_SYMBOL(in_egroup_p);














































































































































































































    1 








    1 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* delayacct.h - per-task delay accounting
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 */

#ifndef _LINUX_DELAYACCT_H
#define _LINUX_DELAYACCT_H

#include <uapi/linux/taskstats.h>

#ifdef CONFIG_TASK_DELAY_ACCT
/*
 * Per-task delay accounting state: for each tracked kind of wait, an
 * accumulated delay and an event count, plus a start field for all kinds
 * except irq.
 */
struct task_delay_info {
        raw_spinlock_t        lock;

        /* For each stat XXX, add following, aligned appropriately
         *
         * u64 XXX_start;
         * u64 XXX_delay;
         * u32 XXX_count;
         *
         * (XXX_start presumably records when the wait in progress began --
         * confirm in the accounting implementation; irq has no start field.)
         *
         * Atomicity of updates to XXX_delay, XXX_count protected by
         * single lock above (split into XXX_lock if contention is an issue).
         */

        /*
         * XXX_count is incremented on every XXX operation, the delay
         * associated with the operation is added to XXX_delay.
         * XXX_delay contains the accumulated delay time in nanoseconds.
         */
        u64 blkio_start;
        u64 blkio_delay;        /* wait for sync block io completion */
        u64 swapin_start;
        u64 swapin_delay;        /* wait for swapin */
        u32 blkio_count;        /* total count of the number of sync block */
                                /* io operations performed */
        u32 swapin_count;        /* total count of swapin */

        u64 freepages_start;
        u64 freepages_delay;        /* wait for memory reclaim */

        u64 thrashing_start;
        u64 thrashing_delay;        /* wait for thrashing page */

        u64 compact_start;
        u64 compact_delay;        /* wait for memory compact */

        u64 wpcopy_start;
        u64 wpcopy_delay;        /* wait for write-protect copy */

        u64 irq_delay;        /* wait for IRQ/SOFTIRQ */

        /* 32-bit counters are grouped together at the end of the struct. */
        u32 freepages_count;        /* total count of memory reclaim */
        u32 thrashing_count;        /* total count of thrash waits */
        u32 compact_count;        /* total count of memory compact */
        u32 wpcopy_count;        /* total count of write-protect copy */
        u32 irq_count;        /* total count of IRQ/SOFTIRQ */
};
#endif

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/jump_label.h>

#ifdef CONFIG_TASK_DELAY_ACCT
/*
 * Global state and out-of-line entry points of delay accounting.  The
 * __delayacct_*() functions are not meant to be called directly: the
 * static inline wrappers in this header first test delayacct_key (or
 * delayacct_on) and the task's ->delays pointer.
 */
DECLARE_STATIC_KEY_FALSE(delayacct_key);
extern int delayacct_on;        /* Delay accounting turned on/off */
extern struct kmem_cache *delayacct_cache;
extern void delayacct_init(void);

extern void __delayacct_tsk_init(struct task_struct *);
extern void __delayacct_tsk_exit(struct task_struct *);
extern void __delayacct_blkio_start(void);
extern void __delayacct_blkio_end(struct task_struct *);
extern int delayacct_add_tsk(struct taskstats *, struct task_struct *);
extern __u64 __delayacct_blkio_ticks(struct task_struct *);
extern void __delayacct_freepages_start(void);
extern void __delayacct_freepages_end(void);
extern void __delayacct_thrashing_start(bool *in_thrashing);
extern void __delayacct_thrashing_end(bool *in_thrashing);
extern void __delayacct_swapin_start(void);
extern void __delayacct_swapin_end(void);
extern void __delayacct_compact_start(void);
extern void __delayacct_compact_end(void);
extern void __delayacct_wpcopy_start(void);
extern void __delayacct_wpcopy_end(void);
extern void __delayacct_irq(struct task_struct *task, u32 delta);

/*
 * Reset @tsk's delay-accounting pointer and, if accounting is switched on,
 * let __delayacct_tsk_init() set the task up.
 */
static inline void delayacct_tsk_init(struct task_struct *tsk)
{
        /* A non-NULL pointer may have been copied over from the parent. */
        tsk->delays = NULL;

        if (!delayacct_on)
                return;

        __delayacct_tsk_init(tsk);
}

/*
 * Give tsk->delays back to delayacct_cache and clear the pointer.
 * Called from bad fork and __put_task_struct, where there is no risk of
 * tsk->delays being accessed elsewhere.
 */
static inline void delayacct_tsk_free(struct task_struct *tsk)
{
        struct task_delay_info *info = tsk->delays;

        if (info)
                kmem_cache_free(delayacct_cache, info);
        tsk->delays = NULL;
}

/*
 * Hook for the start of a block I/O wait of the current task; does nothing
 * unless delayacct_key is enabled and current has a delays structure.
 */
static inline void delayacct_blkio_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_blkio_start();
}

/*
 * Hook for the end of a block I/O wait of task @p; does nothing unless
 * delayacct_key is enabled and @p has a delays structure.
 */
static inline void delayacct_blkio_end(struct task_struct *p)
{
        if (static_branch_unlikely(&delayacct_key) && p->delays)
                __delayacct_blkio_end(p);
}

/*
 * Return __delayacct_blkio_ticks() for @tsk, or 0 when the task has no
 * delays structure.  Unlike the other hooks this does not test
 * delayacct_key.
 */
static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
{
        if (!tsk->delays)
                return 0;

        return __delayacct_blkio_ticks(tsk);
}

/*
 * Hook for the start of a memory-reclaim wait of the current task; does
 * nothing unless delayacct_key is enabled and current has a delays
 * structure.
 */
static inline void delayacct_freepages_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_freepages_start();
}

/*
 * Hook for the end of a memory-reclaim wait of the current task; does
 * nothing unless delayacct_key is enabled and current has a delays
 * structure.
 */
static inline void delayacct_freepages_end(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_freepages_end();
}

/*
 * Hook for the start of a thrashing wait of the current task; forwards
 * @in_thrashing to __delayacct_thrashing_start().  Does nothing unless
 * delayacct_key is enabled and current has a delays structure.
 */
static inline void delayacct_thrashing_start(bool *in_thrashing)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_thrashing_start(in_thrashing);
}

/*
 * Hook for the end of a thrashing wait of the current task; forwards
 * @in_thrashing to __delayacct_thrashing_end().  Does nothing unless
 * delayacct_key is enabled and current has a delays structure.
 */
static inline void delayacct_thrashing_end(bool *in_thrashing)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_thrashing_end(in_thrashing);
}

/*
 * Hook for the start of a swap-in wait of the current task; does nothing
 * unless delayacct_key is enabled and current has a delays structure.
 */
static inline void delayacct_swapin_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_swapin_start();
}

/*
 * Hook for the end of a swap-in wait of the current task; does nothing
 * unless delayacct_key is enabled and current has a delays structure.
 */
static inline void delayacct_swapin_end(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_swapin_end();
}

/*
 * Hook for the start of a memory-compaction wait of the current task; does
 * nothing unless delayacct_key is enabled and current has a delays
 * structure.
 */
static inline void delayacct_compact_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_compact_start();
}

/*
 * Hook for the end of a memory-compaction wait of the current task; does
 * nothing unless delayacct_key is enabled and current has a delays
 * structure.
 */
static inline void delayacct_compact_end(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_compact_end();
}

/*
 * Hook for the start of a write-protect-copy wait of the current task;
 * does nothing unless delayacct_key is enabled and current has a delays
 * structure.
 */
static inline void delayacct_wpcopy_start(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_wpcopy_start();
}

/*
 * Hook for the end of a write-protect-copy wait of the current task; does
 * nothing unless delayacct_key is enabled and current has a delays
 * structure.
 */
static inline void delayacct_wpcopy_end(void)
{
        if (static_branch_unlikely(&delayacct_key) && current->delays)
                __delayacct_wpcopy_end();
}

/*
 * Forward @delta for @task to __delayacct_irq(); does nothing unless
 * delayacct_key is enabled and @task has a delays structure.
 */
static inline void delayacct_irq(struct task_struct *task, u32 delta)
{
        if (static_branch_unlikely(&delayacct_key) && task->delays)
                __delayacct_irq(task, delta);
}

#else
/*
 * CONFIG_TASK_DELAY_ACCT is off: every hook is an empty inline (or returns
 * 0), so call sites need no #ifdef of their own.
 *
 * NOTE(review): delayacct_is_task_waiting_on_io() is stubbed here but has
 * no counterpart in the enabled branch of this header -- possibly a
 * leftover; confirm whether any caller remains.
 */
static inline void delayacct_init(void)
{}
static inline void delayacct_tsk_init(struct task_struct *tsk)
{}
static inline void delayacct_tsk_free(struct task_struct *tsk)
{}
static inline void delayacct_blkio_start(void)
{}
static inline void delayacct_blkio_end(struct task_struct *p)
{}
static inline int delayacct_add_tsk(struct taskstats *d,
                                        struct task_struct *tsk)
{ return 0; }
static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
{ return 0; }
static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
{ return 0; }
static inline void delayacct_freepages_start(void)
{}
static inline void delayacct_freepages_end(void)
{}
static inline void delayacct_thrashing_start(bool *in_thrashing)
{}
static inline void delayacct_thrashing_end(bool *in_thrashing)
{}
static inline void delayacct_swapin_start(void)
{}
static inline void delayacct_swapin_end(void)
{}
static inline void delayacct_compact_start(void)
{}
static inline void delayacct_compact_end(void)
{}
static inline void delayacct_wpcopy_start(void)
{}
static inline void delayacct_wpcopy_end(void)
{}
static inline void delayacct_irq(struct task_struct *task, u32 delta)
{}

#endif /* CONFIG_TASK_DELAY_ACCT */

#endif































































































    2 
    2 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
// SPDX-License-Identifier: GPL-2.0
/*
 *        linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"
#include "swap.h"

/*
 * Private data bundle for madvise page-table walks.
 * NOTE(review): the users of this struct are outside the visible chunk;
 * the field notes below are assumptions to confirm against the walk
 * callbacks.
 */
struct madvise_walk_private {
        struct mmu_gather *tlb;        /* presumably the TLB gather state of the walk */
        bool pageout;                /* presumably selects pageout handling -- confirm */
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 *
 * Returns 0 for the behaviours listed below and 1 for every other value.
 */
static int madvise_need_mmap_write(int behavior)
{
        /* be safe, default to 1. list exceptions explicitly */
        int need_write = 1;

        switch (behavior) {
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
        case MADV_COLD:
        case MADV_PAGEOUT:
        case MADV_FREE:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
        case MADV_COLLAPSE:
                need_write = 0;
                break;
        default:
                break;
        }

        return need_write;
}

#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
        struct anon_vma_name *anon_name;
        size_t count;

        /* Add 1 for NUL terminator at the end of the anon_name->name */
        count = strlen(name) + 1;
        anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
        if (anon_name) {
                kref_init(&anon_name->kref);
                memcpy(anon_name->name, name, count);
        }

        return anon_name;
}

/*
 * kref release callback: free the anon_vma_name that embeds @kref.
 */
void anon_vma_name_free(struct kref *kref)
{
        kfree(container_of(kref, struct anon_vma_name, kref));
}

/*
 * Return the anon name attached to @vma (may be NULL).
 * Asserts that the mmap_lock of the owning mm is held.
 */
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
        struct mm_struct *owner = vma->vm_mm;

        mmap_assert_locked(owner);

        return vma->anon_name;
}

/*
 * Install @anon_name on @vma, dropping the vma's reference on its previous
 * name.  A NULL @anon_name clears the name.  If the vma already carries an
 * equal name nothing is changed.
 *
 * mmap_lock should be write-locked.  Always returns 0.
 */
static int replace_anon_vma_name(struct vm_area_struct *vma,
                                 struct anon_vma_name *anon_name)
{
        struct anon_vma_name *old_name = anon_vma_name(vma);
        struct anon_vma_name *new_name = NULL;

        if (anon_name) {
                /* Same name already in place: keep the existing reference. */
                if (anon_vma_name_eq(old_name, anon_name))
                        return 0;

                new_name = anon_vma_name_reuse(anon_name);
        }

        vma->anon_name = new_name;
        anon_vma_name_put(old_name);

        return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
/*
 * Stub used when CONFIG_ANON_VMA_NAME is disabled: a request to set a name
 * is rejected with -EINVAL, clearing (NULL) is a successful no-op.
 */
static int replace_anon_vma_name(struct vm_area_struct *vma,
                                 struct anon_vma_name *anon_name)
{
        return anon_name ? -EINVAL : 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * Apply @new_flags and @anon_name to the [start, end) region of a vma,
 * splitting or merging it as necessary via vma_modify_flags_name().
 *
 * Must be called with mmap_lock held for writing.  The caller should keep
 * @anon_name stable by raising its refcount, even when the name belongs to
 * a valid vma, because this function might free that vma.
 *
 * *prev is set to the vma left unchanged (nothing to do) or to the vma
 * returned by vma_modify_flags_name().
 *
 * Returns 0 on success, the error encoded by vma_modify_flags_name(), or
 * the error reported by replace_anon_vma_name().
 */
static int madvise_update_vma(struct vm_area_struct *vma,
                              struct vm_area_struct **prev, unsigned long start,
                              unsigned long end, unsigned long new_flags,
                              struct anon_vma_name *anon_name)
{
        VMA_ITERATOR(vmi, vma->vm_mm, start);

        /* Nothing to do when both the flags and the name already match. */
        if (vma->vm_flags == new_flags &&
            anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
                *prev = vma;
                return 0;
        }

        vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
                                    anon_name);
        if (IS_ERR(vma))
                return PTR_ERR(vma);

        *prev = vma;

        /* vm_flags is protected by the mmap_lock held in write mode. */
        vma_start_write(vma);
        vm_flags_reset(vma, new_flags);

        /* The name is only replaced on non-file vmas and anonymous shmem. */
        if (vma->vm_file && !vma_is_anon_shmem(vma))
                return 0;

        return replace_anon_vma_name(vma, anon_name);
}

#ifdef CONFIG_SWAP
/*
 * pmd_entry callback used by madvise_willneed() for vmas without a backing
 * file: for every real swap PTE in [start, end) call
 * read_swap_cache_async() to start bringing the page into the swap cache.
 *
 * walk->private is the vma being advised.  Always returns 0.
 */
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
                unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->private;
        struct swap_iocb *splug = NULL;
        pte_t *ptep = NULL;
        spinlock_t *ptl;
        unsigned long addr;

        for (addr = start; addr < end; addr += PAGE_SIZE) {
                pte_t pte;
                swp_entry_t entry;
                struct folio *folio;

                /*
                 * ptep is NULL on the first iteration and after every
                 * swap-in below (where the PTE lock was dropped): map and
                 * lock the page table again at addr.  Otherwise the
                 * post-increment just steps to the next PTE.
                 */
                if (!ptep++) {
                        ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
                        /* No page table could be mapped: stop walking this pmd. */
                        if (!ptep)
                                break;
                }

                pte = ptep_get(ptep);
                if (!is_swap_pte(pte))
                        continue;
                entry = pte_to_swp_entry(pte);
                /* Skip non-swap entries encoded in swap PTE format. */
                if (unlikely(non_swap_entry(entry)))
                        continue;

                /*
                 * Release the PTE lock before issuing the read; clearing
                 * ptep makes the next iteration re-take it.
                 */
                pte_unmap_unlock(ptep, ptl);
                ptep = NULL;

                folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
                                             vma, addr, &splug);
                /* Only the read is wanted; drop the returned reference. */
                if (folio)
                        folio_put(folio);
        }

        /* Still holding the PTE lock if the last PTE was not swapped in. */
        if (ptep)
                pte_unmap_unlock(ptep, ptl);
        swap_read_unplug(splug);
        cond_resched();

        return 0;
}

/*
 * Page-walk operations for the MADV_WILLNEED swap-in pass over vmas that
 * have no backing file; the walk is requested with PGWALK_RDLOCK.
 */
static const struct mm_walk_ops swapin_walk_ops = {
        .pmd_entry                = swapin_walk_pmd_entry,
        .walk_lock                = PGWALK_RDLOCK,
};

/*
 * Start swap-in for the swapped-out pages of a shmem mapping that back the
 * [start, end) part of @vma.
 *
 * The mapping's i_pages xarray is scanned over the corresponding page
 * indices; every value entry that decodes to a real swap entry is passed
 * to read_swap_cache_async().
 */
static void shmem_swapin_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct address_space *mapping)
{
        XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
        /* end is exclusive, so the last index to visit is one before it. */
        pgoff_t end_index = linear_page_index(vma, end) - 1;
        struct folio *folio;
        struct swap_iocb *splug = NULL;

        rcu_read_lock();
        xas_for_each(&xas, folio, end_index) {
                unsigned long addr;
                swp_entry_t entry;

                /* Only value entries are decoded as swap entries below. */
                if (!xa_is_value(folio))
                        continue;
                entry = radix_to_swp_entry(folio);
                /* There might be swapin error entries in shmem mapping. */
                if (non_swap_entry(entry))
                        continue;

                /* Convert the xarray index back into an address within the vma. */
                addr = vma->vm_start +
                        ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
                /*
                 * Pause the iteration and leave the RCU read section around
                 * the swap read; the walk resumes once RCU is re-entered.
                 */
                xas_pause(&xas);
                rcu_read_unlock();

                folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
                                             vma, addr, &splug);
                /* Only the read is wanted; drop the returned reference. */
                if (folio)
                        folio_put(folio);

                rcu_read_lock();
        }
        rcu_read_unlock();
        swap_read_unplug(splug);
}
#endif                /* CONFIG_SWAP */

/*
 * MADV_WILLNEED: schedule all required I/O operations without waiting for
 * them to complete.
 *
 * With CONFIG_SWAP, vmas without a file and shmem-backed vmas get their
 * swapped-out pages read in.  Without CONFIG_SWAP a vma that has no file
 * yields -EBADF.  DAX files are silently ignored.  For every other file the
 * advice is forwarded to vfs_fadvise(POSIX_FADV_WILLNEED); mmap_lock is
 * dropped around that call and *prev is set to NULL to tell the caller so.
 * In all other cases *prev is set to @vma.
 */
static long madvise_willneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        struct file *backing = vma->vm_file;
        struct mm_struct *mm = vma->vm_mm;
        loff_t pos;

        *prev = vma;
#ifdef CONFIG_SWAP
        if (!backing || shmem_mapping(backing->f_mapping)) {
                if (backing)
                        shmem_swapin_range(vma, start, end,
                                           backing->f_mapping);
                else
                        walk_page_range(mm, start, end, &swapin_walk_ops,
                                        vma);
                lru_add_drain(); /* Push any new pages onto the LRU now */
                return 0;
        }
#else
        if (!backing)
                return -EBADF;
#endif

        /* no bad return value, but ignore advice */
        if (IS_DAX(file_inode(backing)))
                return 0;

        /*
         * Filesystem's fadvise may need to take various locks.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_lock.  The file offset is likewise computed while the vma
         * is still guaranteed to exist.
         */
        *prev = NULL;        /* tell sys_madvise we drop mmap_lock */
        get_file(backing);
        pos = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
        mmap_read_unlock(mm);
        vfs_fadvise(backing, pos, end - start, POSIX_FADV_WILLNEED);
        fput(backing);
        mmap_read_lock(mm);

        return 0;
}

/*
 * May the caller page out page cache belonging to @vma's file?
 *
 * Paging out pagecache is only allowed for non-anonymous mappings of files
 * that the calling process owns (or is capable over) or could open for
 * writing; otherwise we'd be including shared non-exclusive mappings, which
 * opens a side channel.  A vma without a file never qualifies.
 */
static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;

        return file &&
               (inode_owner_or_capable(&nop_mnt_idmap, file_inode(file)) ||
                file_permission(file, MAY_WRITE) == 0);
}

/*
 * Thin wrapper around folio_pte_batch() for the madvise walkers.
 *
 * The batch is limited to the PTEs between @addr and @end, and differences
 * in the dirty and soft-dirty bits are ignored when batching.  @any_young
 * and @any_dirty are passed through unchanged.  Returns whatever
 * folio_pte_batch() returns.
 */
static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
                                          struct folio *folio, pte_t *ptep,
                                          pte_t pte, bool *any_young,
                                          bool *any_dirty)
{
        return folio_pte_batch(folio, addr, ptep, pte,
                               (end - addr) / PAGE_SIZE,
                               FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY,
                               NULL, any_young, any_dirty);
}

/*
 * pmd_entry callback shared by MADV_COLD and MADV_PAGEOUT.
 *
 * walk->private points at a struct madvise_walk_private whose ->pageout
 * flag selects the behaviour.  For MADV_COLD, eligible folios get their
 * young bits cleared and are deactivated.  For MADV_PAGEOUT, eligible
 * folios are isolated from the LRU, collected on a local list and handed
 * to reclaim_pages().
 *
 * Returns -EINTR if a fatal signal is pending on entry, otherwise 0.
 */
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
{
        struct madvise_walk_private *private = walk->private;
        struct mmu_gather *tlb = private->tlb;
        bool pageout = private->pageout;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        pte_t *start_pte, *pte, ptent;
        spinlock_t *ptl;
        struct folio *folio = NULL;
        LIST_HEAD(folio_list);
        bool pageout_anon_only_filter;
        unsigned int batch_count = 0;
        int nr;

        if (fatal_signal_pending(current))
                return -EINTR;

        /*
         * Paging out from a file-backed vma whose file the caller may not
         * page out (see can_do_file_pageout()): restrict the operation to
         * the anonymous folios found in that vma.
         */
        pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
                                        !can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(*pmd)) {
                pmd_t orig_pmd;
                unsigned long next = pmd_addr_end(addr, end);

                tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
                ptl = pmd_trans_huge_lock(pmd, vma);
                if (!ptl)
                        return 0;

                orig_pmd = *pmd;
                if (is_huge_zero_pmd(orig_pmd))
                        goto huge_unlock;

                if (unlikely(!pmd_present(orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
                                        !is_pmd_migration_entry(orig_pmd));
                        goto huge_unlock;
                }

                folio = pmd_folio(orig_pmd);

                /* Do not interfere with other mappings of this folio */
                if (folio_likely_mapped_shared(folio))
                        goto huge_unlock;

                if (pageout_anon_only_filter && !folio_test_anon(folio))
                        goto huge_unlock;

                /*
                 * The range covers only part of the huge pmd: split the
                 * folio and, if that succeeds, process it through the PTE
                 * path below.  A failed split leaves it untouched.
                 */
                if (next - addr != HPAGE_PMD_SIZE) {
                        int err;

                        folio_get(folio);
                        spin_unlock(ptl);
                        folio_lock(folio);
                        err = split_folio(folio);
                        folio_unlock(folio);
                        folio_put(folio);
                        if (!err)
                                goto regular_folio;
                        return 0;
                }

                /* MADV_COLD only: clear the young bit in the huge pmd. */
                if (!pageout && pmd_young(orig_pmd)) {
                        pmdp_invalidate(vma, addr, pmd);
                        orig_pmd = pmd_mkold(orig_pmd);

                        set_pmd_at(mm, addr, pmd, orig_pmd);
                        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
                }

                folio_clear_referenced(folio);
                folio_test_clear_young(folio);
                if (folio_test_active(folio))
                        folio_set_workingset(folio);
                if (pageout) {
                        /* Unevictable folios go straight back to the LRU. */
                        if (folio_isolate_lru(folio)) {
                                if (folio_test_unevictable(folio))
                                        folio_putback_lru(folio);
                                else
                                        list_add(&folio->lru, &folio_list);
                        }
                } else
                        folio_deactivate(folio);
huge_unlock:
                spin_unlock(ptl);
                if (pageout)
                        reclaim_pages(&folio_list);
                return 0;
        }

regular_folio:
#endif
        tlb_change_page_size(tlb, PAGE_SIZE);
restart:
        /* (Re)map and lock the PTE table at the current addr. */
        start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (!start_pte)
                return 0;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        /* nr is the number of PTEs consumed by the current iteration. */
        for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
                nr = 1;
                ptent = ptep_get(pte);

                /*
                 * Every SWAP_CLUSTER_MAX PTEs, drop the PTE lock and
                 * reschedule if needed, then resume from the current addr.
                 */
                if (++batch_count == SWAP_CLUSTER_MAX) {
                        batch_count = 0;
                        if (need_resched()) {
                                arch_leave_lazy_mmu_mode();
                                pte_unmap_unlock(start_pte, ptl);
                                cond_resched();
                                goto restart;
                        }
                }

                if (pte_none(ptent))
                        continue;

                if (!pte_present(ptent))
                        continue;

                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;

                /*
                 * If we encounter a large folio, only split it if it is not
                 * fully mapped within the range we are operating on. Otherwise
                 * leave it as is so that it can be swapped out whole. If we
                 * fail to split a folio, leave it in place and advance to the
                 * next pte in the range.
                 */
                if (folio_test_large(folio)) {
                        bool any_young;

                        nr = madvise_folio_pte_batch(addr, end, folio, pte,
                                                     ptent, &any_young, NULL);
                        /* Treat the whole batch as young if any PTE in it is. */
                        if (any_young)
                                ptent = pte_mkyoung(ptent);

                        if (nr < folio_nr_pages(folio)) {
                                int err;

                                if (folio_likely_mapped_shared(folio))
                                        continue;
                                if (pageout_anon_only_filter && !folio_test_anon(folio))
                                        continue;
                                if (!folio_trylock(folio))
                                        continue;
                                /*
                                 * The split runs without the PTE lock: pin
                                 * the folio, drop the lock, split, then map
                                 * and lock the page table again.
                                 */
                                folio_get(folio);
                                arch_leave_lazy_mmu_mode();
                                pte_unmap_unlock(start_pte, ptl);
                                start_pte = NULL;
                                err = split_folio(folio);
                                folio_unlock(folio);
                                folio_put(folio);
                                start_pte = pte =
                                        pte_offset_map_lock(mm, pmd, addr, &ptl);
                                if (!start_pte)
                                        break;
                                arch_enter_lazy_mmu_mode();
                                /*
                                 * Split succeeded: nr = 0 makes the loop
                                 * re-examine the same address.  Otherwise
                                 * the nr batched PTEs are skipped.
                                 */
                                if (!err)
                                        nr = 0;
                                continue;
                        }
                }

                /*
                 * Do not interfere with other mappings of this folio and
                 * non-LRU folio. If we have a large folio at this point, we
                 * know it is fully mapped so if its mapcount is the same as its
                 * number of pages, it must be exclusive.
                 */
                if (!folio_test_lru(folio) ||
                    folio_mapcount(folio) != folio_nr_pages(folio))
                        continue;

                if (pageout_anon_only_filter && !folio_test_anon(folio))
                        continue;

                /* MADV_COLD only: clear the young bit on the batched PTEs. */
                if (!pageout && pte_young(ptent)) {
                        clear_young_dirty_ptes(vma, addr, pte, nr,
                                               CYDP_CLEAR_YOUNG);
                        tlb_remove_tlb_entries(tlb, pte, nr, addr);
                }

                /*
                 * We are deactivating a folio to accelerate its reclaim.
                 * The VM cannot reclaim the folio unless we clear PG_young.
                 * As a side effect, this confuses idle-page tracking, which
                 * will miss the recent reference history.
                 */
                folio_clear_referenced(folio);
                folio_test_clear_young(folio);
                if (folio_test_active(folio))
                        folio_set_workingset(folio);
                if (pageout) {
                        /* Unevictable folios go straight back to the LRU. */
                        if (folio_isolate_lru(folio)) {
                                if (folio_test_unevictable(folio))
                                        folio_putback_lru(folio);
                                else
                                        list_add(&folio->lru, &folio_list);
                        }
                } else
                        folio_deactivate(folio);
        }

        /* start_pte is NULL if re-mapping after a split attempt failed. */
        if (start_pte) {
                arch_leave_lazy_mmu_mode();
                pte_unmap_unlock(start_pte, ptl);
        }
        /* Reclaim the isolated folios only after the PTE lock is released. */
        if (pageout)
                reclaim_pages(&folio_list);
        cond_resched();

        return 0;
}

/*
 * Page-walk operations shared by MADV_COLD and MADV_PAGEOUT; the walk is
 * requested with PGWALK_RDLOCK.
 */
static const struct mm_walk_ops cold_walk_ops = {
        .pmd_entry = madvise_cold_or_pageout_pte_range,
        .walk_lock = PGWALK_RDLOCK,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end)
{
        struct madvise_walk_private walk_private = {
                .pageout = false,
                .tlb = tlb,
        };

        tlb_start_vma(tlb, vma);
        walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
        tlb_end_vma(tlb, vma);
}

/*
 * MADV_COLD and MADV_PAGEOUT refuse vmas that are locked, PFN-mapped or
 * hugetlb; return whether @vma carries none of those flags.
 */
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
        const unsigned long excluded = VM_LOCKED | VM_PFNMAP | VM_HUGETLB;

        return (vma->vm_flags & excluded) == 0;
}

static long madvise_cold(struct vm_area_struct *vma,
                        struct vm_area_struct **prev,
                        unsigned long start_addr, unsigned long end_addr)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;

        *prev = vma;
        if (!can_madv_lru_vma(vma))
                return -EINVAL;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm);
        madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
        tlb_finish_mmu(&tlb);

        return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end)
{
        struct madvise_walk_private walk_private = {
                .pageout = true,
                .tlb = tlb,
        };

        tlb_start_vma(tlb, vma);
        walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
        tlb_end_vma(tlb, vma);
}

static long madvise_pageout(struct vm_area_struct *vma,
                        struct vm_area_struct **prev,
                        unsigned long start_addr, unsigned long end_addr)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;

        *prev = vma;
        if (!can_madv_lru_vma(vma))
                return -EINVAL;

        /*
         * If the VMA belongs to a private file mapping, there can be private
         * dirty pages which can be paged out if even this process is neither
         * owner nor write capable of the file. We allow private file mappings
         * further to pageout dirty anon pages.
         */
        if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
                                (vma->vm_flags & VM_MAYSHARE)))
                return 0;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm);
        madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
        tlb_finish_mmu(&tlb);

        return 0;
}

/*
 * pmd_entry callback for MADV_FREE.  walk->private is the mmu_gather used
 * to batch TLB invalidations.
 *
 * Real swap entries in the range are freed and their PTEs cleared, and
 * hwpoison/poisoned marker PTEs are cleared as well.  Present folios have
 * the young/dirty bits of their PTEs cleared and are marked lazyfree.
 * Folios that are dirty or in the swap cache are only processed when they
 * can be locked, their mapcount equals their number of pages, and any swap
 * cache entry can be freed.
 *
 * Always returns 0.
 */
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)

{
        const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
        struct mmu_gather *tlb = walk->private;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
        pte_t *start_pte, *pte, ptent;
        struct folio *folio;
        /* Freed swap entries, kept as a negative delta for MM_SWAPENTS. */
        int nr_swap = 0;
        unsigned long next;
        int nr, max_nr;

        next = pmd_addr_end(addr, end);
        /*
         * A non-zero return from madvise_free_huge_pmd() ends the work for
         * this pmd (presumably the huge mapping was handled there); on zero
         * fall through to the PTE path.
         */
        if (pmd_trans_huge(*pmd))
                if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
                        return 0;

        tlb_change_page_size(tlb, PAGE_SIZE);
        start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        if (!start_pte)
                return 0;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        /* nr is the number of PTEs consumed by the current iteration. */
        for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
                nr = 1;
                ptent = ptep_get(pte);

                if (pte_none(ptent))
                        continue;
                /*
                 * If the pte has swp_entry, just clear page table to
                 * prevent swap-in which is more expensive rather than
                 * (page allocation + zeroing).
                 */
                if (!pte_present(ptent)) {
                        swp_entry_t entry;

                        entry = pte_to_swp_entry(ptent);
                        if (!non_swap_entry(entry)) {
                                /* Free a whole batch of swap PTEs at once. */
                                max_nr = (end - addr) / PAGE_SIZE;
                                nr = swap_pte_batch(pte, max_nr, ptent);
                                nr_swap -= nr;
                                free_swap_and_cache_nr(entry, nr);
                                clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                        } else if (is_hwpoison_entry(entry) ||
                                   is_poisoned_swp_entry(entry)) {
                                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        }
                        continue;
                }

                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;

                /*
                 * If we encounter a large folio, only split it if it is not
                 * fully mapped within the range we are operating on. Otherwise
                 * leave it as is so that it can be marked as lazyfree. If we
                 * fail to split a folio, leave it in place and advance to the
                 * next pte in the range.
                 */
                if (folio_test_large(folio)) {
                        bool any_young, any_dirty;

                        nr = madvise_folio_pte_batch(addr, end, folio, pte,
                                                     ptent, &any_young, &any_dirty);

                        if (nr < folio_nr_pages(folio)) {
                                int err;

                                if (folio_likely_mapped_shared(folio))
                                        continue;
                                if (!folio_trylock(folio))
                                        continue;
                                /*
                                 * The split runs without the PTE lock: pin
                                 * the folio, drop the lock, split, then map
                                 * and lock the page table again.
                                 */
                                folio_get(folio);
                                arch_leave_lazy_mmu_mode();
                                pte_unmap_unlock(start_pte, ptl);
                                start_pte = NULL;
                                err = split_folio(folio);
                                folio_unlock(folio);
                                folio_put(folio);
                                pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                                start_pte = pte;
                                if (!start_pte)
                                        break;
                                arch_enter_lazy_mmu_mode();
                                /*
                                 * Split succeeded: nr = 0 makes the loop
                                 * re-examine the same address.  Otherwise
                                 * the nr batched PTEs are skipped.
                                 */
                                if (!err)
                                        nr = 0;
                                continue;
                        }

                        /* Fold the batch-wide young/dirty state into ptent. */
                        if (any_young)
                                ptent = pte_mkyoung(ptent);
                        if (any_dirty)
                                ptent = pte_mkdirty(ptent);
                }

                if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
                        if (!folio_trylock(folio))
                                continue;
                        /*
                         * If we have a large folio at this point, we know it is
                         * fully mapped so if its mapcount is the same as its
                         * number of pages, it must be exclusive.
                         */
                        if (folio_mapcount(folio) != folio_nr_pages(folio)) {
                                folio_unlock(folio);
                                continue;
                        }

                        /* Leave the folio alone if its swap cannot be freed. */
                        if (folio_test_swapcache(folio) &&
                            !folio_free_swap(folio)) {
                                folio_unlock(folio);
                                continue;
                        }

                        folio_clear_dirty(folio);
                        folio_unlock(folio);
                }

                /* Clear the young and dirty bits of the batched PTEs. */
                if (pte_young(ptent) || pte_dirty(ptent)) {
                        clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
                        tlb_remove_tlb_entries(tlb, pte, nr, addr);
                }
                folio_mark_lazyfree(folio);
        }

        /* Apply the (negative) count of freed swap entries in one go. */
        if (nr_swap)
                add_mm_counter(mm, MM_SWAPENTS, nr_swap);
        /* start_pte is NULL if re-mapping after a split attempt failed. */
        if (start_pte) {
                arch_leave_lazy_mmu_mode();
                pte_unmap_unlock(start_pte, ptl);
        }
        cond_resched();

        return 0;
}

/* Page-walk operations for MADV_FREE; the walk is requested with PGWALK_RDLOCK. */
static const struct mm_walk_ops madvise_free_walk_ops = {
        .pmd_entry                = madvise_free_pte_range,
        .walk_lock                = PGWALK_RDLOCK,
};

/*
 * MADV_FREE on a single vma.
 *
 * The requested [start_addr, end_addr) is clamped to the vma.  The page
 * walk runs inside an MMU_NOTIFY_CLEAR notifier range and a TLB gather.
 *
 * Returns -EINVAL if the vma is not anonymous or the clamped range lies
 * outside the vma, otherwise 0.
 */
static int madvise_free_single_vma(struct vm_area_struct *vma,
                        unsigned long start_addr, unsigned long end_addr)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;
        unsigned long from, to;

        /* MADV_FREE works for only anon vma at the moment */
        if (!vma_is_anonymous(vma))
                return -EINVAL;

        /* Clamp the request to the vma and reject anything outside it. */
        from = max(vma->vm_start, start_addr);
        to = min(vma->vm_end, end_addr);
        if (from >= vma->vm_end || to <= vma->vm_start)
                return -EINVAL;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, from, to);

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm);
        update_hiwater_rss(mm);

        mmu_notifier_invalidate_range_start(&range);
        tlb_start_vma(&tlb, vma);
        walk_page_range(mm, from, to, &madvise_free_walk_ops, &tlb);
        tlb_end_vma(&tlb, vma);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb);

        return 0;
}

/*
 * MADV_DONTNEED on a single vma: the application no longer needs these
 * pages.  Dirty pages may simply be thrown away; the application is
 * expected to be more careful about data it wants to keep.  Swap resources
 * are to be freed too.  The zap_page_range_single() call sets things up for
 * shrink_active_list to actually free these pages later if no one else has
 * touched them in the meantime, although we could add these pages to a
 * global reuse list for shrink_active_list to pick up before reclaiming
 * other pages.
 *
 * NB: This interface discards data rather than pushing it out to swap, as
 * some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing the data that was kept in them
 * to backing store.  There is no reason to write this data out to the swap
 * area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush dirty
 * pages is already available as msync(MS_INVALIDATE).
 *
 * Always returns 0.
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
                                        unsigned long start, unsigned long end)
{
        const unsigned long len = end - start;

        zap_page_range_single(vma, start, len, NULL);

        return 0;
}

/*
 * Check whether @behavior may be applied to @vma starting at @start.
 *
 * hugetlb vmas only accept MADV_DONTNEED and MADV_DONTNEED_LOCKED, require
 * @start to be huge-page aligned, and get *end rounded down to a huge-page
 * boundary.  Other vmas are rejected when they are VM_PFNMAP, or VM_LOCKED
 * unless the behaviour is MADV_DONTNEED_LOCKED; *end is left untouched.
 *
 * Returns true if the operation is allowed.
 */
static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
                                            unsigned long start,
                                            unsigned long *end,
                                            int behavior)
{
        unsigned int forbidden;

        if (is_vm_hugetlb_page(vma)) {
                if (behavior != MADV_DONTNEED &&
                    behavior != MADV_DONTNEED_LOCKED)
                        return false;
                if (start & ~huge_page_mask(hstate_vma(vma)))
                        return false;

                /*
                 * Madvise callers expect the length to be rounded up to
                 * PAGE_SIZE boundaries, and may be unaware that this VMA
                 * uses huge pages.  Avoid unexpected data loss by rounding
                 * down the number of huge pages freed.
                 */
                *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));

                return true;
        }

        /* Only MADV_DONTNEED_LOCKED may operate on locked vmas. */
        forbidden = (behavior == MADV_DONTNEED_LOCKED) ?
                        VM_PFNMAP : (VM_PFNMAP | VM_LOCKED);

        return (vma->vm_flags & forbidden) == 0;
}

/*
 * Common handler for MADV_DONTNEED, MADV_DONTNEED_LOCKED and MADV_FREE on
 * the [start, end) part of a single vma.
 *
 * *prev is set to @vma.  If userfaultfd_remove() had to drop mmap_lock,
 * *prev is reset to NULL, the lock is re-taken for reading, and the vma is
 * looked up and validated again before the operation proceeds.
 *
 * Returns 0 on success or when the range turns out to be empty, -EINVAL if
 * the vma or the behaviour is not acceptable, -ENOMEM if no vma covers
 * @start after mmap_lock was re-taken, or the result of the per-behaviour
 * helper.
 */
static long madvise_dontneed_free(struct vm_area_struct *vma,
                                  struct vm_area_struct **prev,
                                  unsigned long start, unsigned long end,
                                  int behavior)
{
        struct mm_struct *mm = vma->vm_mm;

        *prev = vma;
        if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
                return -EINVAL;

        /* The hugetlb adjustment above may have emptied the range. */
        if (start == end)
                return 0;

        if (!userfaultfd_remove(vma, start, end)) {
                *prev = NULL; /* mmap_lock has been dropped, prev is stale */

                mmap_read_lock(mm);
                vma = vma_lookup(mm, start);
                if (!vma)
                        return -ENOMEM;
                /*
                 * Potential end adjustment for hugetlb vma is OK as
                 * the check below keeps end within vma.
                 */
                if (!madvise_dontneed_free_valid_vma(vma, start, &end,
                                                     behavior))
                        return -EINVAL;
                if (end > vma->vm_end) {
                        /*
                         * Don't fail if end > vma->vm_end. If the old
                         * vma was split while the mmap_lock was
                         * released the effect of the concurrent
                         * operation may not cause madvise() to
                         * have an undefined result. There may be an
                         * adjacent next vma that we'll walk
                         * next. userfaultfd_remove() will generate an
                         * UFFD_EVENT_REMOVE repetition on the
                         * end-vma->vm_end range, but the manager can
                         * handle a repetition fine.
                         */
                        end = vma->vm_end;
                }
                /*
                 * If the memory region between start and end was
                 * originally backed by base pages and then remapped to
                 * be backed by huge pages while mmap_lock was dropped,
                 * the adjustment for hugetlb vma above may have rounded
                 * end down to the start address.  That is an empty
                 * range, not a bug: there is nothing to do.
                 */
                if (start == end)
                        return 0;
                VM_WARN_ON(start > end);
        }

        if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
                return madvise_dontneed_single_vma(vma, start, end);
        else if (behavior == MADV_FREE)
                return madvise_free_single_vma(vma, start, end);
        else
                return -EINVAL;
}

/*
 * Prefault the page tables for [start, end): readable for
 * MADV_POPULATE_READ, writable for MADV_POPULATE_WRITE.
 *
 * Returns 0 once the whole range has been walked, otherwise a negative
 * errno.  Error codes from faultin_page_range() that are not recognised
 * here are logged once and reported as -ENOMEM.
 */
static long madvise_populate(struct mm_struct *mm, unsigned long start,
                unsigned long end, int behavior)
{
        const bool write = behavior == MADV_POPULATE_WRITE;
        int locked = 1;
        long pages;

        for (; start < end; start += pages * PAGE_SIZE) {
                /* Populate (prefault) page tables readable/writable. */
                pages = faultin_page_range(mm, start, end, write, &locked);

                /* "locked" was cleared by the callee: retake mmap_lock. */
                if (!locked) {
                        mmap_read_lock(mm);
                        locked = 1;
                }

                /* Progress was made; advance by the populated pages. */
                if (pages >= 0)
                        continue;

                switch (pages) {
                case -EINTR:
                case -EINVAL:           /* Incompatible mappings / permissions. */
                case -EHWPOISON:
                case -EFAULT:           /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
                        return pages;
                case -ENOMEM:           /* No VMA or out of memory. */
                        break;
                default:
                        pr_warn_once("%s: unhandled return value: %ld\n",
                                     __func__, pages);
                        break;
                }
                return -ENOMEM;
        }
        return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end)
{
        loff_t offset;
        int error;
        struct file *f;
        struct mm_struct *mm = vma->vm_mm;

        *prev = NULL;        /* tell sys_madvise we drop mmap_lock */

        if (vma->vm_flags & VM_LOCKED)
                return -EINVAL;

        f = vma->vm_file;

        if (!f || !f->f_mapping || !f->f_mapping->host) {
                        return -EINVAL;
        }

        if (!vma_is_shared_maywrite(vma))
                return -EACCES;

        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        /*
         * Filesystem's fallocate may need to take i_rwsem.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_lock.
         */
        get_file(f);
        if (userfaultfd_remove(vma, start, end)) {
                /* mmap_lock was not released by userfaultfd_remove() */
                mmap_read_unlock(mm);
        }
        error = vfs_fallocate(f,
                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                                offset, end - start);
        fput(f);
        mmap_read_lock(mm);
        return error;
}

/*
 * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end,
                                unsigned long behavior)
{
        int error;
        struct anon_vma_name *anon_name;
        unsigned long new_flags = vma->vm_flags;

        switch (behavior) {
        case MADV_REMOVE:
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_COLD:
                return madvise_cold(vma, prev, start, end);
        case MADV_PAGEOUT:
                return madvise_pageout(vma, prev, start, end);
        case MADV_FREE:
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
                return madvise_dontneed_free(vma, prev, start, end, behavior);
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
        case MADV_SEQUENTIAL:
                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
        case MADV_DONTFORK:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
                if (vma->vm_flags & VM_IO)
                        return -EINVAL;
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_WIPEONFORK:
                /* MADV_WIPEONFORK is only supported on anonymous memory. */
                if (vma->vm_file || vma->vm_flags & VM_SHARED)
                        return -EINVAL;
                new_flags |= VM_WIPEONFORK;
                break;
        case MADV_KEEPONFORK:
                new_flags &= ~VM_WIPEONFORK;
                break;
        case MADV_DONTDUMP:
                new_flags |= VM_DONTDUMP;
                break;
        case MADV_DODUMP:
                if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
                        return -EINVAL;
                new_flags &= ~VM_DONTDUMP;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
                error = ksm_madvise(vma, start, end, behavior, &new_flags);
                if (error)
                        goto out;
                break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
                error = hugepage_madvise(vma, &new_flags, behavior);
                if (error)
                        goto out;
                break;
        case MADV_COLLAPSE:
                return madvise_collapse(vma, prev, start, end);
        }

        anon_name = anon_vma_name(vma);
        anon_vma_name_get(anon_name);
        error = madvise_update_vma(vma, prev, start, end, new_flags,
                                   anon_name);
        anon_vma_name_put(anon_name);

out:
        /*
         * madvise() returns EAGAIN if kernel resources, such as
         * slab, are temporarily unavailable.
         */
        if (error == -ENOMEM)
                error = -EAGAIN;
        return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 *
 * Requires CAP_SYS_ADMIN.  Walks [start, end) one page (or compound page)
 * at a time and either soft-offlines it (MADV_SOFT_OFFLINE) or injects a
 * simulated memory failure.  Stops at the first error and returns it.
 */
static int madvise_inject_error(int behavior,
                unsigned long start, unsigned long end)
{
        unsigned long size;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        while (start < end) {
                struct page *page;
                unsigned long pfn;
                int ret;

                ret = get_user_pages_fast(start, 1, 0, &page);
                if (ret != 1)
                        return ret;
                pfn = page_to_pfn(page);

                /*
                 * Compute the step before acting on the page: when soft
                 * offlining hugepages, the page is dissolved after being
                 * migrated, so on the next iteration "page" will no longer
                 * be a compound page.
                 */
                size = page_size(compound_head(page));

                if (behavior != MADV_SOFT_OFFLINE) {
                        pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
                                 pfn, start);
                        ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
                        /* -EOPNOTSUPP is not treated as a failure here. */
                        if (ret == -EOPNOTSUPP)
                                ret = 0;
                } else {
                        pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
                                 pfn, start);
                        ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
                }

                if (ret)
                        return ret;

                start += size;
        }

        return 0;
}
#endif

/*
 * Tell whether @behavior is an advice value accepted by this build.
 * Some values are only accepted when the matching config option
 * (KSM, transparent hugepages, memory failure) is enabled.
 */
static bool madvise_behavior_valid(int behavior)
{
        switch (behavior) {
        /* Access pattern hints. */
        case MADV_NORMAL:
        case MADV_RANDOM:
        case MADV_SEQUENTIAL:
        case MADV_WILLNEED:
        /* Discarding, removing or aging pages. */
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
        case MADV_FREE:
        case MADV_REMOVE:
        case MADV_COLD:
        case MADV_PAGEOUT:
        /* Prefaulting page tables. */
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
        /* fork() and core dump handling. */
        case MADV_DONTFORK:
        case MADV_DOFORK:
        case MADV_WIPEONFORK:
        case MADV_KEEPONFORK:
        case MADV_DONTDUMP:
        case MADV_DODUMP:
#ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
        case MADV_COLLAPSE:
#endif
#ifdef CONFIG_MEMORY_FAILURE
        case MADV_SOFT_OFFLINE:
        case MADV_HWPOISON:
#endif
                return true;
        }

        return false;
}

/*
 * Tell whether @behavior may be requested through process_madvise():
 * only MADV_COLD, MADV_PAGEOUT, MADV_WILLNEED and MADV_COLLAPSE are.
 */
static bool process_madvise_behavior_valid(int behavior)
{
        return behavior == MADV_COLD ||
               behavior == MADV_PAGEOUT ||
               behavior == MADV_WILLNEED ||
               behavior == MADV_COLLAPSE;
}

/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range.  Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 *
 * @arg is handed to the visit function unchanged.  A non-zero return value
 * from the visit function stops the walk and is returned as is.  The visit
 * function receives a pointer to the walker's "prev" and may rewrite it;
 * the walker uses the updated value to find the next vma.
 */
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
                      unsigned long end, unsigned long arg,
                      int (*visit)(struct vm_area_struct *vma,
                                   struct vm_area_struct **prev, unsigned long start,
                                   unsigned long end, unsigned long arg))
{
        struct vm_area_struct *vma;
        struct vm_area_struct *prev;
        /* End of the slice passed to visit(): min(vma->vm_end, end). */
        unsigned long tmp;
        /* Set to -ENOMEM once a hole in the range has been skipped. */
        int unmapped_error = 0;

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         * - different from the way of handling in mlock etc.
         */
        vma = find_vma_prev(mm, start, &prev);
        /* start lies strictly inside vma: use vma itself as prev. */
        if (vma && start > vma->vm_start)
                prev = vma;

        for (;;) {
                int error;

                /* Still start < end. */
                if (!vma)
                        return -ENOMEM;

                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        /* Hole before this vma: note it and skip ahead. */
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
                                break;
                }

                /* Here vma->vm_start <= start < (end|vma->vm_end) */
                tmp = vma->vm_end;
                if (end < tmp)
                        tmp = end;

                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = visit(vma, &prev, start, tmp, arg);
                if (error)
                        return error;
                start = tmp;
                /* Never resume before the end of the prev left by visit(). */
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                if (start >= end)
                        break;
                if (prev)
                        vma = find_vma(mm, prev->vm_end);
                else        /* madvise_remove dropped mmap_lock */
                        vma = find_vma(mm, start);
        }

        return unmapped_error;
}

#ifdef CONFIG_ANON_VMA_NAME
static int madvise_vma_anon_name(struct vm_area_struct *vma,
                                 struct vm_area_struct **prev,
                                 unsigned long start, unsigned long end,
                                 unsigned long anon_name)
{
        int error;

        /* Only anonymous mappings can be named */
        if (vma->vm_file && !vma_is_anon_shmem(vma))
                return -EBADF;

        error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
                                   (struct anon_vma_name *)anon_name);

        /*
         * madvise() returns EAGAIN if kernel resources, such as
         * slab, are temporarily unavailable.
         */
        if (error == -ENOMEM)
                error = -EAGAIN;
        return error;
}

/*
 * Apply @anon_name to every vma overlapping [start, start + len_in),
 * with the length rounded up to a whole number of pages.
 *
 * Returns -EINVAL if @start is not page aligned or the range wraps,
 * 0 for an empty range, otherwise the result of the vma walk.
 */
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
                          unsigned long len_in, struct anon_vma_name *anon_name)
{
        /* Round the length up to a page boundary (may wrap to zero). */
        const unsigned long len = (len_in + ~PAGE_MASK) & PAGE_MASK;
        const unsigned long end = start + len;

        /*
         * Reject an unaligned start, a len that was rounded up from
         * small -ve to zero, and a range that wraps around.
         */
        if ((start & ~PAGE_MASK) || (len_in && !len) || end < start)
                return -EINVAL;

        /* Nothing to do for an empty range. */
        if (end == start)
                return 0;

        return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
                                 madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *                results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *                on any access, since it is unlikely that the appli-
 *                cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *                once, so they can be aggressively read ahead, and
 *                can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *                some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *                so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *                where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *                pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *                typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *              range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *                were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *                this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *                huge pages in the future. Existing pages might be coalesced and
 *                new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *                transparent huge pages so the existing pages will not be
 *                coalesced into THP and new pages will not be allocated as THP.
 *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *                from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *                deactivate pages in this range so that they can be reclaimed
 *                easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *                page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *                triggering read faults if required
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *                triggering write faults if required
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *                "behavior" is not a valid value, or application
 *                is attempting to release locked or shared pages,
 *                or the specified address range includes file, Huge TLB,
 *                MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *                mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 *  -EPERM  - memory is sealed.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
        struct blk_plug plug;
        unsigned long end;
        size_t len;
        int need_write;
        int error;

        /* Unknown advice and unaligned start addresses are both invalid. */
        if (!madvise_behavior_valid(behavior) || !PAGE_ALIGNED(start))
                return -EINVAL;

        len = PAGE_ALIGN(len_in);

        /* Check to see whether len was rounded up from small -ve to zero */
        if (!len && len_in)
                return -EINVAL;

        end = start + len;
        if (end < start)
                return -EINVAL;

        /* An empty range trivially succeeds. */
        if (end == start)
                return 0;

#ifdef CONFIG_MEMORY_FAILURE
        /* Error injection is handled before any mmap_lock is taken here. */
        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
                return madvise_inject_error(behavior, start, start + len_in);
#endif

        /* Take mmap_lock for write or read, as the behavior requires. */
        need_write = madvise_need_mmap_write(behavior);
        if (!need_write)
                mmap_read_lock(mm);
        else if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* Recompute the range from the untagged start address. */
        start = untagged_addr_remote(mm, start);
        end = start + len;

        /*
         * Check if the address range is sealed for do_madvise().
         * can_modify_mm_madv assumes we have acquired the lock on MM.
         */
        if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) {
                error = -EPERM;
                goto out;
        }

        blk_start_plug(&plug);
        if (behavior == MADV_POPULATE_READ || behavior == MADV_POPULATE_WRITE)
                error = madvise_populate(mm, start, end, behavior);
        else
                error = madvise_walk_vmas(mm, start, end, behavior,
                                          madvise_vma_behavior);
        blk_finish_plug(&plug);

out:
        /* Release mmap_lock in the mode it was taken. */
        if (need_write)
                mmap_write_unlock(mm);
        else
                mmap_read_unlock(mm);

        return error;
}

/* madvise(2) entry point: apply @behavior to the calling task's own mm. */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
        struct mm_struct *mm = current->mm;

        return do_madvise(mm, start, len_in, behavior);
}

/*
 * process_madvise(2) entry point: apply @behavior to each range of the
 * user-supplied iovec in the address space of the task behind @pidfd.
 *
 * Returns the number of bytes advised if any progress was made, otherwise
 * the error (or 0) from the last step.  @flags must be zero.
 */
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
                size_t, vlen, int, behavior, unsigned int, flags)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        struct task_struct *task;
        struct mm_struct *mm;
        unsigned int f_flags;
        size_t total_len;
        size_t advised;
        ssize_t ret;

        /* Nothing to release yet, so these two failures return directly. */
        if (flags != 0)
                return -EINVAL;

        ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
        if (ret < 0)
                return ret;

        task = pidfd_get_task(pidfd, &f_flags);
        if (IS_ERR(task)) {
                ret = PTR_ERR(task);
                goto free_iov;
        }

        if (!process_madvise_behavior_valid(behavior)) {
                ret = -EINVAL;
                goto release_task;
        }

        /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
        mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
        if (!mm) {
                ret = -ESRCH;
                goto release_task;
        }
        if (IS_ERR(mm)) {
                ret = PTR_ERR(mm);
                goto release_task;
        }

        /*
         * Require CAP_SYS_NICE for influencing process performance. Note that
         * only non-destructive hints are currently supported.
         */
        if (!capable(CAP_SYS_NICE)) {
                ret = -EPERM;
                goto release_mm;
        }

        total_len = iov_iter_count(&iter);

        /* Advise one iovec segment at a time, stopping at the first error. */
        while (iov_iter_count(&iter)) {
                size_t seg_len = iter_iov_len(&iter);

                ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
                                 seg_len, behavior);
                if (ret < 0)
                        break;
                iov_iter_advance(&iter, seg_len);
        }

        /* Partial progress takes precedence over the last return value. */
        advised = total_len - iov_iter_count(&iter);
        if (advised)
                ret = advised;

release_mm:
        mmput(mm);
release_task:
        put_task_struct(task);
free_iov:
        kfree(iov);
        return ret;
}





































































































































































































































































    5 












    1 






























































   26 















































































































































































   12 




































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_MM_H
#define _LINUX_SCHED_MM_H

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/gfp.h>
#include <linux/sync_core.h>
#include <linux/sched/coredump.h>

/*
 * Routines for handling mm_structs
 */
extern struct mm_struct *mm_alloc(void);

/**
 * mmgrab() - Pin a &struct mm_struct.
 * @mm: The &struct mm_struct to pin.
 *
 * Make sure that @mm will not get freed even after the owning task
 * exits. This doesn't guarantee that the associated address space
 * will still exist later on and mmget_not_zero() has to be used before
 * accessing it.
 *
 * This is a preferred way to pin @mm for a longer/unbounded amount
 * of time.
 *
 * Use mmdrop() to release the reference acquired by mmgrab().
 *
 * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmgrab(struct mm_struct *mm)
{
        /* The pin is a reference on mm_count, not on mm_users. */
        atomic_inc(&mm->mm_count);
}

/*
 * Memory barrier for use after mmgrab(), going by the name.  Implemented
 * as smp_mb__after_atomic().
 */
static inline void smp_mb__after_mmgrab(void)
{
        smp_mb__after_atomic();
}

extern void __mmdrop(struct mm_struct *mm);

/*
 * Drop one mm_count reference on @mm and hand the mm to __mmdrop() when
 * that was the last one.
 */
static inline void mmdrop(struct mm_struct *mm)
{
        /*
         * The implicit full barrier implied by atomic_dec_and_test() is
         * required by the membarrier system call before returning to
         * user-space, after storing to rq->curr.
         */
        const bool last_ref = atomic_dec_and_test(&mm->mm_count);

        if (unlikely(last_ref))
                __mmdrop(mm);
}

#ifdef CONFIG_PREEMPT_RT
/*
 * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is
 * by far the least expensive way to do that.
 *
 * @rhp is the delayed_drop member embedded in the mm_struct to drop.
 */
static inline void __mmdrop_delayed(struct rcu_head *rhp)
{
        __mmdrop(container_of(rhp, struct mm_struct, delayed_drop));
}

/*
 * Invoked from finish_task_switch(). Delegates the heavy lifting on RT
 * kernels via RCU: the final drop is queued with call_rcu() instead of
 * being done inline.
 */
static inline void mmdrop_sched(struct mm_struct *mm)
{
        /* Provides a full memory barrier. See mmdrop() */
        if (!atomic_dec_and_test(&mm->mm_count))
                return;

        call_rcu(&mm->delayed_drop, __mmdrop_delayed);
}
#else
/* Without CONFIG_PREEMPT_RT this is a plain mmdrop(). */
static inline void mmdrop_sched(struct mm_struct *mm)
{
        mmdrop(mm);
}
#endif

/* Helpers for lazy TLB mm refcounting */
/* mmgrab() @mm, but only when lazy TLB mm refcounting is configured. */
static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
                return;

        mmgrab(mm);
}

/*
 * Counterpart of mmgrab_lazy_tlb(): mmdrop() @mm when lazy TLB mm
 * refcounting is configured, otherwise only issue a full barrier.
 */
static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
                /*
                 * mmdrop_lazy_tlb must provide a full memory barrier, see the
                 * membarrier comment finish_task_switch which relies on this.
                 */
                smp_mb();
                return;
        }

        mmdrop(mm);
}

/*
 * Same as mmdrop_lazy_tlb(), except that the reference is dropped through
 * mmdrop_sched().
 */
static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
                /* A full barrier is still required, as in mmdrop_lazy_tlb(). */
                smp_mb();
                return;
        }

        mmdrop_sched(mm);
}

/**
 * mmget() - Pin the address space associated with a &struct mm_struct.
 * @mm: The address space to pin.
 *
 * Make sure that the address space of the given &struct mm_struct doesn't
 * go away. This does not protect against parts of the address space being
 * modified or freed, however.
 *
 * Never use this function to pin this address space for an
 * unbounded/indefinite amount of time.
 *
 * Use mmput() to release the reference acquired by mmget().
 *
 * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmget(struct mm_struct *mm)
{
        /* The pin is a reference on mm_users, not on mm_count. */
        atomic_inc(&mm->mm_users);
}

/*
 * Conditional variant of mmget(): returns whatever atomic_inc_not_zero()
 * reports for mm_users, i.e. whether a reference was taken.
 */
static inline bool mmget_not_zero(struct mm_struct *mm)
{
        return atomic_inc_not_zero(&mm->mm_users);
}

/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
#ifdef CONFIG_MMU
/* same as above but performs the slow path from the async context. Can
 * be called from the atomic context as well
 */
void mmput_async(struct mm_struct *);
#endif

/* Grab a reference to a task's mm, if it is not already going away */
extern struct mm_struct *get_task_mm(struct task_struct *task);
/*
 * Grab a reference to a task's mm, if it is not already going away
 * and ptrace_may_access with the mode parameter passed to it
 * succeeds.
 */
extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
/* Remove the current task's stale references to the old mm_struct on exit() */
extern void exit_mm_release(struct task_struct *, struct mm_struct *);
/* Remove the current task's stale references to the old mm_struct on exec() */
extern void exec_mm_release(struct task_struct *, struct mm_struct *);

#ifdef CONFIG_MEMCG
extern void mm_update_next_owner(struct mm_struct *mm);
#else
/* Empty stub used when CONFIG_MEMCG is not set. */
static inline void mm_update_next_owner(struct mm_struct *mm)
{
}
#endif /* CONFIG_MEMCG */

#ifdef CONFIG_MMU
#ifndef arch_get_mmap_end
#define arch_get_mmap_end(addr, len, flags)        (TASK_SIZE)
#endif

#ifndef arch_get_mmap_base
#define arch_get_mmap_base(addr, base) (base)
#endif

extern void arch_pick_mmap_layout(struct mm_struct *mm,
                                  struct rlimit *rlim_stack);
extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
                       unsigned long, unsigned long);
extern unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags);

unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
                                   unsigned long addr, unsigned long len,
                                   unsigned long pgoff, unsigned long flags);

unsigned long
arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
                               unsigned long len, unsigned long pgoff,
                               unsigned long flags, vm_flags_t vm_flags);
unsigned long
arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr,
                                       unsigned long len, unsigned long pgoff,
                                       unsigned long flags, vm_flags_t);

unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
                                           struct file *filp,
                                           unsigned long addr,
                                           unsigned long len,
                                           unsigned long pgoff,
                                           unsigned long flags,
                                           vm_flags_t vm_flags);

unsigned long
generic_get_unmapped_area(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags);
unsigned long
generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                                  unsigned long len, unsigned long pgoff,
                                  unsigned long flags);
#else
static inline void arch_pick_mmap_layout(struct mm_struct *mm,
                                         struct rlimit *rlim_stack) {}
#endif

static inline bool in_vfork(struct task_struct *tsk)
{
        bool ret;

        /*
         * need RCU to access ->real_parent if CLONE_VM was used along with
         * CLONE_PARENT.
         *
         * We check real_parent->mm == tsk->mm because CLONE_VFORK does not
         * imply CLONE_VM
         *
         * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
         * ->real_parent is not necessarily the task doing vfork(), so in
         * theory we can't rely on task_lock() if we want to dereference it.
         *
         * And in this case we can't trust the real_parent->mm == tsk->mm
         * check, it can be false negative. But we do not care, if init or
         * another oom-unkillable task does this it should blame itself.
         */
        rcu_read_lock();
        ret = tsk->vfork_done &&
                        rcu_dereference(tsk->real_parent)->mm == tsk->mm;
        rcu_read_unlock();

        return ret;
}

/*
 * Applies the per-task gfp context (current->flags) to the given
 * allocation flags and returns the adjusted mask:
 *
 * PF_MEMALLOC_NORECLAIM clears __GFP_DIRECT_RECLAIM
 * PF_MEMALLOC_NOIO      implies GFP_NOIO (clears __GFP_IO and __GFP_FS)
 * PF_MEMALLOC_NOFS      implies GFP_NOFS (clears __GFP_FS)
 * PF_MEMALLOC_NOWARN    adds __GFP_NOWARN
 * PF_MEMALLOC_PIN       implies !GFP_MOVABLE (clears __GFP_MOVABLE)
 *
 * Of NORECLAIM/NOIO/NOFS only the first one found set, in that order, is
 * applied. @flags is returned untouched when none of the task flags above
 * is set.
 */
static inline gfp_t current_gfp_context(gfp_t flags)
{
        const unsigned int scope_mask = PF_MEMALLOC_NOIO |
                                        PF_MEMALLOC_NOFS |
                                        PF_MEMALLOC_NORECLAIM |
                                        PF_MEMALLOC_NOWARN |
                                        PF_MEMALLOC_PIN;
        unsigned int task_flags = READ_ONCE(current->flags);

        /* Expected case: the task is not inside any allocation scope. */
        if (likely(!(task_flags & scope_mask)))
                return flags;

        /*
         * Stronger flags before weaker flags:
         * NORECLAIM implies NOIO, which in turn implies NOFS
         */
        if (task_flags & PF_MEMALLOC_NORECLAIM)
                flags &= ~__GFP_DIRECT_RECLAIM;
        else if (task_flags & PF_MEMALLOC_NOIO)
                flags &= ~(__GFP_IO | __GFP_FS);
        else if (task_flags & PF_MEMALLOC_NOFS)
                flags &= ~__GFP_FS;

        /* The remaining two scopes are independent of the chain above. */
        if (task_flags & PF_MEMALLOC_NOWARN)
                flags |= __GFP_NOWARN;

        if (task_flags & PF_MEMALLOC_PIN)
                flags &= ~__GFP_MOVABLE;

        return flags;
}

/*
 * fs_reclaim annotation helpers.
 *
 * With CONFIG_LOCKDEP the four functions are only declared here and are
 * defined out of line. Without CONFIG_LOCKDEP they are empty inline stubs,
 * so callers need no #ifdef of their own.
 */
#ifdef CONFIG_LOCKDEP
extern void __fs_reclaim_acquire(unsigned long ip);
extern void __fs_reclaim_release(unsigned long ip);
extern void fs_reclaim_acquire(gfp_t gfp_mask);
extern void fs_reclaim_release(gfp_t gfp_mask);
#else
static inline void __fs_reclaim_acquire(unsigned long ip) { }
static inline void __fs_reclaim_release(unsigned long ip) { }
static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
static inline void fs_reclaim_release(gfp_t gfp_mask) { }
#endif

/*
 * Every loop that retries a failed memory allocation should call
 * memalloc_retry_wait() between attempts, passing the gfp flags of the
 * most constrained allocation attempt that might have failed.
 * Besides documenting where such loops are, this provides one central
 * place to fine tune the waiting as the MM implementation changes.
 *
 * The task sleeps in TASK_UNINTERRUPTIBLE for 1 jiffy when the effective
 * flags allow blocking and do not contain __GFP_NORETRY, and for HZ/50
 * jiffies otherwise.
 */
static inline void memalloc_retry_wait(gfp_t gfp_flags)
{
        long delay;

        /*
         * io_schedule_timeout() is used because waiting for memory
         * typically involves waiting for dirty pages to be written out,
         * which requires IO.
         */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        gfp_flags = current_gfp_context(gfp_flags);

        if (!gfpflags_allow_blocking(gfp_flags) ||
            (gfp_flags & __GFP_NORETRY)) {
                /*
                 * The allocation probably didn't wait, and the caller has
                 * now released a lock, so now is a good time to wait.
                 */
                delay = HZ/50;
        } else {
                /* Probably waited already, no need for much more. */
                delay = 1;
        }

        io_schedule_timeout(delay);
}

/**
 * might_alloc - Mark possible allocation sites
 * @gfp_mask: gfp_t flags that would be used to allocate
 *
 * Annotation for functions that might allocate but often don't, in the
 * same spirit as might_sleep(). It performs an fs_reclaim acquire/release
 * pair (empty stubs without CONFIG_LOCKDEP) followed by a conditional
 * might_sleep() when @gfp_mask allows blocking.
 */
static inline void might_alloc(gfp_t gfp_mask)
{
        bool may_block;

        fs_reclaim_acquire(gfp_mask);
        fs_reclaim_release(gfp_mask);

        may_block = gfpflags_allow_blocking(gfp_mask);
        might_sleep_if(may_block);
}

/**
 * memalloc_flags_save - Add a PF_* flag to current->flags, save old value
 * @flags: PF_* bits to set on the current task
 *
 * This allows PF_* flags to be conveniently added, irrespective of current
 * value, and then the old version restored with memalloc_flags_restore().
 *
 * Return: the subset of @flags that was not yet set in current->flags
 * before the call. Handing this value to memalloc_flags_restore() clears
 * exactly those bits, so bits that were already set beforehand stay set.
 */
static inline unsigned memalloc_flags_save(unsigned flags)
{
        /* Remember only the bits this call is about to turn on. */
        unsigned newly_set = flags & ~current->flags;

        current->flags |= flags;

        return newly_set;
}

/**
 * memalloc_flags_restore - Clear PF_* bits from current->flags
 * @flags: bits to clear, normally the return value of the pairing
 *         memalloc_flags_save() call
 *
 * Every bit set in @flags is cleared in current->flags; all other bits are
 * left as they are.
 */
static inline void memalloc_flags_restore(unsigned flags)
{
        current->flags &= ~flags;
}

/**
 * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
 *
 * This function marks the beginning of the GFP_NOIO allocation scope by
 * setting PF_MEMALLOC_NOIO on the current task.
 * All further allocations will implicitly drop the __GFP_IO flag and so
 * they are safe for the IO critical section from the allocation recursion
 * point of view. Use memalloc_noio_restore to end the scope with flags
 * returned by this function.
 *
 * Context: This function is safe to be used from any context.
 * Return: The saved flags to be passed to memalloc_noio_restore.
 */
static inline unsigned int memalloc_noio_save(void)
{
        return memalloc_flags_save(PF_MEMALLOC_NOIO);
}

/**
 * memalloc_noio_restore - Ends the implicit GFP_NOIO scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit GFP_NOIO scope started by the memalloc_noio_save
 * function. Always make sure that the given flags is the return value from
 * the pairing memalloc_noio_save call.
 *
 * Only the bits present in @flags are cleared from current->flags.
 */
static inline void memalloc_noio_restore(unsigned int flags)
{
        memalloc_flags_restore(flags);
}

/**
 * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope.
 *
 * This function marks the beginning of the GFP_NOFS allocation scope by
 * setting PF_MEMALLOC_NOFS on the current task.
 * All further allocations will implicitly drop the __GFP_FS flag and so
 * they are safe for the FS critical section from the allocation recursion
 * point of view. Use memalloc_nofs_restore to end the scope with flags
 * returned by this function.
 *
 * Context: This function is safe to be used from any context.
 * Return: The saved flags to be passed to memalloc_nofs_restore.
 */
static inline unsigned int memalloc_nofs_save(void)
{
        return memalloc_flags_save(PF_MEMALLOC_NOFS);
}

/**
 * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit GFP_NOFS scope started by the memalloc_nofs_save
 * function. Always make sure that the given flags is the return value from
 * the pairing memalloc_nofs_save call.
 *
 * Only the bits present in @flags are cleared from current->flags.
 */
static inline void memalloc_nofs_restore(unsigned int flags)
{
        memalloc_flags_restore(flags);
}

/**
 * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope.
 *
 * This function marks the beginning of the __GFP_MEMALLOC allocation scope
 * by setting PF_MEMALLOC on the current task.
 * All further allocations will implicitly add the __GFP_MEMALLOC flag, which
 * prevents entering reclaim and allows access to all memory reserves. This
 * should only be used when the caller guarantees the allocation will allow more
 * memory to be freed very shortly, i.e. it needs to allocate some memory in
 * the process of freeing memory, and cannot reclaim due to potential recursion.
 *
 * Users of this scope have to be extremely careful to not deplete the reserves
 * completely and implement a throttling mechanism which controls the
 * consumption of the reserve based on the amount of freed memory. Usage of a
 * pre-allocated pool (e.g. mempool) should be always considered before using
 * this scope.
 *
 * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC.
 *
 * Context: This function should not be used in an interrupt context as that one
 *          does not give PF_MEMALLOC access to reserves.
 *          See __gfp_pfmemalloc_flags().
 * Return: The saved flags to be passed to memalloc_noreclaim_restore.
 */
static inline unsigned int memalloc_noreclaim_save(void)
{
        return memalloc_flags_save(PF_MEMALLOC);
}

/**
 * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit __GFP_MEMALLOC scope started by the
 * memalloc_noreclaim_save function. Always make sure that the given flags
 * is the return value from the pairing memalloc_noreclaim_save call.
 *
 * Only the bits present in @flags are cleared from current->flags.
 */
static inline void memalloc_noreclaim_restore(unsigned int flags)
{
        memalloc_flags_restore(flags);
}

/**
 * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope.
 *
 * This function marks the beginning of the ~__GFP_MOVABLE allocation scope
 * by setting PF_MEMALLOC_PIN on the current task.
 * All further allocations will implicitly remove the __GFP_MOVABLE flag, which
 * will constrain the allocations to zones that allow long term pinning, i.e.
 * not ZONE_MOVABLE zones.
 *
 * Return: The saved flags to be passed to memalloc_pin_restore.
 */
static inline unsigned int memalloc_pin_save(void)
{
        return memalloc_flags_save(PF_MEMALLOC_PIN);
}

/**
 * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit ~__GFP_MOVABLE scope started by the memalloc_pin_save
 * function. Always make sure that the given flags is the return value from
 * the pairing memalloc_pin_save call.
 *
 * Only the bits present in @flags are cleared from current->flags.
 */
static inline void memalloc_pin_restore(unsigned int flags)
{
        memalloc_flags_restore(flags);
}

#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/**
 * set_active_memcg - Starts the remote memcg charging scope.
 * @memcg: memcg to charge.
 *
 * Marks the beginning of the remote memcg charging scope: every
 * __GFP_ACCOUNT allocation made until the scope ends is charged to @memcg.
 *
 * In task context the value is kept in current->active_memcg; in any other
 * context it is kept in the per-cpu variable int_active_memcg.
 *
 * The caller must hold a reference to @memcg so that its lifetime is
 * guaranteed to exceed the scope between two set_active_memcg() calls.
 *
 * NOTE: This function can nest. Users must save the return value and
 * reset the previous value after their own charging scope is over.
 *
 * Return: the memcg that was active before this call.
 */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        struct mem_cgroup *prev;

        if (in_task()) {
                prev = current->active_memcg;
                current->active_memcg = memcg;
                return prev;
        }

        /* Not in task context: use the per-cpu slot instead. */
        prev = this_cpu_read(int_active_memcg);
        this_cpu_write(int_active_memcg, memcg);

        return prev;
}
#else
/*
 * !CONFIG_MEMCG variant: nothing is recorded. @memcg is ignored and NULL is
 * returned as the "previous" value.
 */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        return NULL;
}
#endif

#ifdef CONFIG_MEMBARRIER
/*
 * Single-bit state flags. MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE is
 * tested against mm->membarrier_state in
 * membarrier_mm_sync_core_before_usermode(); presumably the other bits live
 * in the same field.
 * NOTE(review): each *_READY bit presumably marks that registration for the
 * matching command has completed - confirm against the membarrier
 * implementation.
 */
enum {
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY                = (1U << 0),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED                        = (1U << 1),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY                        = (1U << 2),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED                        = (1U << 3),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY        = (1U << 4),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE                = (1U << 5),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY                = (1U << 6),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ                        = (1U << 7),
};

/*
 * Single-bit flags.
 * NOTE(review): presumably these select the SYNC_CORE / RSEQ variants when
 * a private expedited membarrier command is handled - confirm at the users.
 */
enum {
        MEMBARRIER_FLAG_SYNC_CORE        = (1U << 0),
        MEMBARRIER_FLAG_RSEQ                = (1U << 1),
};

#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
#include <asm/membarrier.h>
#endif

/*
 * Call sync_core_before_usermode() when @mm is the current task's mm and
 * MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE is set in its
 * membarrier_state. Does nothing in every other case.
 */
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
        int state;

        /* Only the mm of the running task is of interest. */
        if (current->mm != mm)
                return;

        state = atomic_read(&mm->membarrier_state);
        if (unlikely(state & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))
                sync_core_before_usermode();
}

extern void membarrier_exec_mmap(struct mm_struct *mm);

extern void membarrier_update_current_mm(struct mm_struct *next_mm);

#else
/*
 * !CONFIG_MEMBARRIER: empty stub, provided only when the architecture
 * selects CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
                                             struct mm_struct *next,
                                             struct task_struct *tsk)
{
}
#endif
/* !CONFIG_MEMBARRIER: empty stub so callers need no #ifdef. */
static inline void membarrier_exec_mmap(struct mm_struct *mm)
{
}
/* !CONFIG_MEMBARRIER: empty stub, no core synchronisation is requested. */
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
}
/* !CONFIG_MEMBARRIER: empty stub so callers need no #ifdef. */
static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
{
}
#endif

#endif /* _LINUX_SCHED_MM_H */
































    1 





























    1 










    1 








    1 









    1 

    1 
    1 







    1 







    1 

    1 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// SPDX-License-Identifier: GPL-2.0
/*
 *        linux/mm/msync.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * The msync() system call.
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/sched.h>

/*
 * msync(start, len, flags): walk the VMAs covering [start, start + len).
 *
 * MS_SYNC writes back, via vfs_fsync_range(), the file range backing each
 * shared file mapping found in the interval.
 *
 * MS_ASYNC does not start I/O (it used to, up to 2.5.67).
 * Nor does it mark the relevant pages dirty (it used to up to 2.6.17).
 * Now it doesn't do anything, since dirty pages are properly tracked.
 *
 * The application may now run fsync() to
 * write out the dirty pages and wait on the writeout and check the result.
 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
 * async writeout immediately.
 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
 * applications.
 *
 * Return values:
 *   0        success (also when the page-rounded length is zero)
 *   -EINVAL  unknown bits in @flags, @start not page aligned, or both
 *            MS_ASYNC and MS_SYNC given
 *   -ENOMEM  start + len wraps around, or part of the interval is unmapped
 *   -EBUSY   MS_INVALIDATE was requested on a VM_LOCKED mapping
 *   other    whatever error vfs_fsync_range() reported
 */
SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
{
        unsigned long end;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        /* Set to -ENOMEM once a hole has been skipped; reported at the end. */
        int unmapped_error = 0;
        int error = -EINVAL;

        start = untagged_addr(start);

        /* Argument validation; all three failures return -EINVAL. */
        if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
                goto out;
        if (offset_in_page(start))
                goto out;
        if ((flags & MS_ASYNC) && (flags & MS_SYNC))
                goto out;
        error = -ENOMEM;
        /* Round len up to a whole number of pages. */
        len = (len + ~PAGE_MASK) & PAGE_MASK;
        end = start + len;
        /* start + len wrapped around the address space. */
        if (end < start)
                goto out;
        error = 0;
        /* Empty interval: nothing to do. */
        if (end == start)
                goto out;
        /*
         * If the interval [start,end) covers some unmapped address ranges,
         * just ignore them, but return -ENOMEM at the end. Besides, if the
         * flag is MS_ASYNC (w/o MS_INVALIDATE) the result would be -ENOMEM
         * anyway and there is nothing left to do, so return immediately.
         */
        mmap_read_lock(mm);
        vma = find_vma(mm, start);
        for (;;) {
                struct file *file;
                loff_t fstart, fend;

                /* Still start < end. */
                error = -ENOMEM;
                if (!vma)
                        goto out_unlock;
                /* Here start < vma->vm_end. */
                if (start < vma->vm_start) {
                        /* [start, vma->vm_start) is a hole. */
                        if (flags == MS_ASYNC)
                                goto out_unlock;
                        start = vma->vm_start;
                        if (start >= end)
                                goto out_unlock;
                        unmapped_error = -ENOMEM;
                }
                /* Here vma->vm_start <= start < vma->vm_end. */
                if ((flags & MS_INVALIDATE) &&
                                (vma->vm_flags & VM_LOCKED)) {
                        error = -EBUSY;
                        goto out_unlock;
                }
                file = vma->vm_file;
                /*
                 * File offsets matching the part of this VMA that lies
                 * inside the interval; fend is the offset of its last byte
                 * (hence the - 1).
                 */
                fstart = (start - vma->vm_start) +
                         ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
                fend = fstart + (min(end, vma->vm_end) - start) - 1;
                /*
                 * Advance past this VMA now, while the lock is still held:
                 * the MS_SYNC branch below drops the lock and afterwards
                 * only uses 'start', never 'vma'.
                 */
                start = vma->vm_end;
                if ((flags & MS_SYNC) && file &&
                                (vma->vm_flags & VM_SHARED)) {
                        /*
                         * Hold a file reference across the fsync, which runs
                         * without the mmap lock; fput() drops it again.
                         */
                        get_file(file);
                        mmap_read_unlock(mm);
                        error = vfs_fsync_range(file, fstart, fend, 1);
                        fput(file);
                        /* Lock is not held here, so skip out_unlock. */
                        if (error || start >= end)
                                goto out;
                        /* Retake the lock and look the next VMA up afresh. */
                        mmap_read_lock(mm);
                        vma = find_vma(mm, start);
                } else {
                        if (start >= end) {
                                error = 0;
                                goto out_unlock;
                        }
                        vma = find_vma(mm, vma->vm_end);
                }
        }
out_unlock:
        mmap_read_unlock(mm);
out:
        /* A real error wins; otherwise report a skipped hole, if any. */
        return error ? : unmapped_error;
}
























































































































    5 



    5 






















































































































































































































































































































































































    1 











































































































    1 


























































































































































































    1 




    1 












    1 



























    1 



    1 


    1 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 















    1 




































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "bio.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"
#include "compression.h"
#include "tree-checker.h"
#include "ref-verify.h"
#include "block-group.h"
#include "discard.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "uuid-tree.h"
#include "relocation.h"
#include "scrub.h"
#include "super.h"

/*
 * Bitmask built by OR-ing the header flags (WRITTEN, RELOC) with the
 * superblock flags (ERROR, SEEDING, METADUMP, METADUMP_V2) listed below.
 *
 * NOTE(review): the name suggests this is the set of superblock flags the
 * implementation accepts as "supported" - confirm against the users of the
 * macro, none of which are visible in this part of the file.
 */
#define BTRFS_SUPER_FLAG_SUPP        (BTRFS_HEADER_FLAG_WRITTEN |\
                                 BTRFS_HEADER_FLAG_RELOC |\
                                 BTRFS_SUPER_FLAG_ERROR |\
                                 BTRFS_SUPER_FLAG_SEEDING |\
                                 BTRFS_SUPER_FLAG_METADUMP |\
                                 BTRFS_SUPER_FLAG_METADUMP_V2)

/* Forward declarations of file-local (static) helpers. */
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);

/*
 * Release the checksum shash handle stored in @fs_info->csum_shash.
 * Does nothing when no handle is set.
 */
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
        /* No handle stored, nothing to release. */
        if (!fs_info->csum_shash)
                return;

        crypto_free_shash(fs_info->csum_shash);
}

/*
 * Calculate the checksum of the btree block @buf and place it in @result.
 *
 * The leading BTRFS_CSUM_SIZE bytes of the block are left out of the hash.
 * @result must provide room for BTRFS_CSUM_SIZE bytes; it is zeroed in full
 * before the digest is stored into it.
 */
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
        struct btrfs_fs_info *fs_info = buf->fs_info;
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
        /* Defaults cover the contiguous case: one chunk of nodesize bytes. */
        char *data = buf->addr;
        u32 head_len = fs_info->nodesize;
        int nr_pages = 1;
        int idx;

        shash->tfm = fs_info->csum_shash;
        crypto_shash_init(shash);

        if (!data) {
                /* No contiguous mapping, start with the first folio only. */
                data = folio_address(buf->folios[0]);
                head_len = min_t(u32, PAGE_SIZE, fs_info->nodesize);
                nr_pages = num_extent_pages(buf);
        }

        /* Hash everything in the first chunk that follows the csum area. */
        crypto_shash_update(shash, data + BTRFS_CSUM_SIZE,
                            head_len - BTRFS_CSUM_SIZE);

        /*
         * Only the "several single-page folios" layout has anything left
         * at this point.  The nodesize <= PAGE_SIZE and the large folio
         * layouts were fully consumed by the update above.
         */
        idx = 1;
        while (INLINE_EXTENT_BUFFER_PAGES > 1 && idx < nr_pages) {
                data = folio_address(buf->folios[idx]);
                crypto_shash_update(shash, data, PAGE_SIZE);
                idx++;
        }

        memset(result, 0, BTRFS_CSUM_SIZE);
        crypto_shash_final(shash, result);
}

/*
 * A block only counts as up to date when the transid stored in its header
 * equals the transid recorded in the parent node's pointer.  That is how
 * blocks that were never written, or were written to the wrong place, get
 * detected.
 *
 * @eb:             extent buffer to examine
 * @parent_transid: generation expected by the parent, 0 to skip the check
 * @atomic:         non-zero when a generation mismatch must be reported
 *                  with -EAGAIN instead of being handled here
 *
 * Return: 1 if the buffer is up to date, 0 if it is not (a generation
 * mismatch also clears the uptodate state), -EAGAIN on a generation
 * mismatch while @atomic is set.
 */
int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic)
{
        if (!extent_buffer_uptodate(eb))
                return 0;

        /* No expectation from the parent means nothing more to verify. */
        if (parent_transid == 0)
                return 1;
        if (btrfs_header_generation(eb) == parent_transid)
                return 1;

        if (atomic)
                return -EAGAIN;

        /* Look again before declaring the buffer stale. */
        if (extent_buffer_uptodate(eb) &&
            btrfs_header_generation(eb) == parent_transid)
                return 1;

        btrfs_err_rl(eb->fs_info,
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
                     eb->start, eb->read_mirror,
                     parent_transid, btrfs_header_generation(eb));
        clear_extent_buffer_uptodate(eb);
        return 0;
}

/*
 * Tell whether @csum_type is one of the checksum types handled here:
 * CRC32, XXHASH, SHA256 or BLAKE2.
 */
static bool btrfs_supported_super_csum(u16 csum_type)
{
        return csum_type == BTRFS_CSUM_TYPE_CRC32 ||
               csum_type == BTRFS_CSUM_TYPE_XXHASH ||
               csum_type == BTRFS_CSUM_TYPE_SHA256 ||
               csum_type == BTRFS_CSUM_TYPE_BLAKE2;
}

/*
 * Verify the checksum stored in a raw on-disk superblock.
 *
 * @fs_info: provides the checksum algorithm handle and the checksum size
 * @disk_sb: raw superblock data as read from disk
 *
 * Return: 0 when the stored checksum equals the one computed here,
 * 1 when they differ.
 */
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
                           const struct btrfs_super_block *disk_sb)
{
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
        /* The hashed range begins right behind the checksum field. */
        const u8 *payload = (const u8 *)disk_sb + BTRFS_CSUM_SIZE;
        char result[BTRFS_CSUM_SIZE];

        shash->tfm = fs_info->csum_shash;

        /*
         * struct btrfs_super_block is smaller than BTRFS_SUPER_INFO_SIZE.
         * The remaining space is expected to hold zeros and it is part of
         * the checksummed range.
         */
        crypto_shash_digest(shash, payload,
                            BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);

        return memcmp(disk_sb->csum, result, fs_info->csum_size) ? 1 : 0;
}

static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
                                      int mirror_num)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        int num_folios = num_extent_folios(eb);
        int ret = 0;

        if (sb_rdonly(fs_info->sb))
                return -EROFS;

        for (int i = 0; i < num_folios; i++) {
                struct folio *folio = eb->folios[i];
                u64 start = max_t(u64, eb->start, folio_pos(folio));
                u64 end = min_t(u64, eb->start + eb->len,
                                folio_pos(folio) + eb->folio_size);
                u32 len = end - start;

                ret = btrfs_repair_io_failure(fs_info, 0, start, len,
                                              start, folio, offset_in_folio(folio, start),
                                              mirror_num);
                if (ret)
                        break;
        }

        return ret;
}

/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 *
 * @check:                expected tree parentness check, see the comments of the
 *                        structure for details.
 */
int btrfs_read_extent_buffer(struct extent_buffer *eb,
                             struct btrfs_tree_parent_check *check)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        int failed = 0;
        int ret;
        int num_copies = 0;
        int mirror_num = 0;
        int failed_mirror = 0;

        ASSERT(check);

        while (1) {
                clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
                ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check);
                if (!ret)
                        break;

                num_copies = btrfs_num_copies(fs_info,
                                              eb->start, eb->len);
                if (num_copies == 1)
                        break;

                if (!failed_mirror) {
                        failed = 1;
                        failed_mirror = eb->read_mirror;
                }

                mirror_num++;
                if (mirror_num == failed_mirror)
                        mirror_num++;

                if (mirror_num > num_copies)
                        break;
        }

        if (failed && !ret && failed_mirror)
                btrfs_repair_eb_io_failure(eb, failed_mirror);

        return ret;
}

/*
 * Checksum a dirty tree block before IO.
 *
 * The extent buffer is taken from @bbio->private.  The checksum is computed
 * into a local buffer first and only copied into the block (at offset 0)
 * after the node/leaf check and the generation check have both passed.
 *
 * Return: BLK_STS_OK on success or when the buffer was zeroed because of
 * EXTENT_BUFFER_ZONED_ZEROOUT, BLK_STS_IOERR when one of the sanity checks
 * trips, or the error from the tree/generation checks converted with
 * errno_to_blk_status().
 */
blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
{
        struct extent_buffer *eb = bbio->private;
        struct btrfs_fs_info *fs_info = eb->fs_info;
        u64 found_start = btrfs_header_bytenr(eb);
        u64 last_trans;
        u8 result[BTRFS_CSUM_SIZE];
        int ret;

        /* Btree blocks are always contiguous on disk. */
        if (WARN_ON_ONCE(bbio->file_offset != eb->start))
                return BLK_STS_IOERR;
        if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
                return BLK_STS_IOERR;

        /*
         * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
         * checksum it but zero-out its content. This is done to preserve
         * ordering of I/O without unnecessarily writing out data.
         */
        if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
                memzero_extent_buffer(eb, 0, eb->len);
                return BLK_STS_OK;
        }

        /* The bytenr stored in the header must match the buffer's start. */
        if (WARN_ON_ONCE(found_start != eb->start))
                return BLK_STS_IOERR;
        /* The buffer's range must be marked uptodate before it is written. */
        if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
                                               eb->start, eb->len)))
                return BLK_STS_IOERR;

        /* The fsid in the header is expected to equal metadata_uuid. */
        ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
                                    offsetof(struct btrfs_header, fsid),
                                    BTRFS_FSID_SIZE) == 0);
        /* Computed now, stored into the block only once all checks pass. */
        csum_tree_block(eb, result);

        /* A non-zero level gets the node check, level 0 the leaf check. */
        if (btrfs_header_level(eb))
                ret = btrfs_check_node(eb);
        else
                ret = btrfs_check_leaf(eb);

        if (ret < 0)
                goto error;

        /*
         * Also check the generation, the eb reached here must be newer than
         * last committed. Or something seriously wrong happened.
         */
        last_trans = btrfs_get_last_trans_committed(fs_info);
        if (unlikely(btrfs_header_generation(eb) <= last_trans)) {
                ret = -EUCLEAN;
                btrfs_err(fs_info,
                        "block=%llu bad generation, have %llu expect > %llu",
                          eb->start, btrfs_header_generation(eb), last_trans);
                goto error;
        }
        /* All checks passed: place the checksum at the start of the block. */
        write_extent_buffer(eb, result, 0, fs_info->csum_size);
        return BLK_STS_OK;

error:
        /* Dump the offending block to help with diagnosis. */
        btrfs_print_tree(eb, 0);
        btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
                  eb->start);
        /*
         * Be noisy if this is an extent buffer from a log tree. We don't abort
         * a transaction in case there's a bad log tree extent buffer, we just
         * fallback to a transaction commit. Still we want to know when there is
         * a bad log tree extent buffer, as that may signal a bug somewhere.
         */
        WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
                btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
        return errno_to_blk_status(ret);
}

static bool check_tree_block_fsid(struct extent_buffer *eb)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
        u8 fsid[BTRFS_FSID_SIZE];

        read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
                           BTRFS_FSID_SIZE);

        /*
         * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid.
         * This is then overwritten by metadata_uuid if it is present in the
         * device_list_add(). The same true for a seed device as well. So use of
         * fs_devices::metadata_uuid is appropriate here.
         */
        if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0)
                return false;

        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
                if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
                        return false;

        return true;
}

/*
 * Do basic extent buffer checks at read time.
 *
 * The checks run in this order and the first failure ends the function:
 * header bytenr, fsid, level range, checksum, level against @check,
 * generation against @check (when check->transid is set), first key
 * (when check->has_first_key is set), owner (when check->owner_root is
 * set), and finally the leaf or node check.
 *
 * @eb:    extent buffer that was just read
 * @check: expectations to verify against, must not be NULL
 *
 * Return: 0 if all checks pass.  -EUCLEAN for a checksum or first key
 * mismatch, the negative value returned by btrfs_check_eb_owner() for an
 * owner failure, -EIO for the other failures.
 */
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
                                 struct btrfs_tree_parent_check *check)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        u64 found_start;
        const u32 csum_size = fs_info->csum_size;
        u8 found_level;
        u8 result[BTRFS_CSUM_SIZE];
        const u8 *header_csum;
        int ret = 0;

        ASSERT(check);

        /* The bytenr recorded in the header must be where we read from. */
        found_start = btrfs_header_bytenr(eb);
        if (found_start != eb->start) {
                btrfs_err_rl(fs_info,
                        "bad tree block start, mirror %u want %llu have %llu",
                             eb->read_mirror, eb->start, found_start);
                ret = -EIO;
                goto out;
        }
        /* check_tree_block_fsid() returns true when no known fsid matches. */
        if (check_tree_block_fsid(eb)) {
                btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
                             eb->start, eb->read_mirror);
                ret = -EIO;
                goto out;
        }
        found_level = btrfs_header_level(eb);
        if (found_level >= BTRFS_MAX_LEVEL) {
                btrfs_err(fs_info,
                        "bad tree block level, mirror %u level %d on logical %llu",
                        eb->read_mirror, btrfs_header_level(eb), eb->start);
                ret = -EIO;
                goto out;
        }

        /* Compare the computed checksum with the one stored in the header. */
        csum_tree_block(eb, result);
        header_csum = folio_address(eb->folios[0]) +
                get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));

        if (memcmp(result, header_csum, csum_size) != 0) {
                btrfs_warn_rl(fs_info,
"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
                              eb->start, eb->read_mirror,
                              CSUM_FMT_VALUE(csum_size, header_csum),
                              CSUM_FMT_VALUE(csum_size, result),
                              btrfs_header_level(eb));
                ret = -EUCLEAN;
                goto out;
        }

        /* From here on the block is compared with what the caller expects. */
        if (found_level != check->level) {
                btrfs_err(fs_info,
                "level verify failed on logical %llu mirror %u wanted %u found %u",
                          eb->start, eb->read_mirror, check->level, found_level);
                ret = -EIO;
                goto out;
        }
        /* A zero check->transid skips the generation comparison. */
        if (unlikely(check->transid &&
                     btrfs_header_generation(eb) != check->transid)) {
                btrfs_err_rl(eb->fs_info,
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
                                eb->start, eb->read_mirror, check->transid,
                                btrfs_header_generation(eb));
                ret = -EIO;
                goto out;
        }
        if (check->has_first_key) {
                struct btrfs_key *expect_key = &check->first_key;
                struct btrfs_key found_key;

                /* Nodes and leaves need different accessors for key 0. */
                if (found_level)
                        btrfs_node_key_to_cpu(eb, &found_key, 0);
                else
                        btrfs_item_key_to_cpu(eb, &found_key, 0);
                if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) {
                        btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
                                  eb->start, check->transid,
                                  expect_key->objectid,
                                  expect_key->type, expect_key->offset,
                                  found_key.objectid, found_key.type,
                                  found_key.offset);
                        ret = -EUCLEAN;
                        goto out;
                }
        }
        /* A zero check->owner_root skips the owner check. */
        if (check->owner_root) {
                ret = btrfs_check_eb_owner(eb, check->owner_root);
                if (ret < 0)
                        goto out;
        }

        /*
         * If this is a leaf block and it is corrupt, set the corrupt bit so
         * that we don't try and read the other copies of this block, just
         * return -EIO.
         */
        if (found_level == 0 && btrfs_check_leaf(eb)) {
                set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
                ret = -EIO;
        }

        /* A failing node check also gives -EIO, without the corrupt bit. */
        if (found_level > 0 && btrfs_check_node(eb))
                ret = -EIO;

        if (ret)
                btrfs_err(fs_info,
                "read time tree block corruption detected on logical %llu mirror %u",
                          eb->start, eb->read_mirror);
out:
        return ret;
}

#ifdef CONFIG_MIGRATION
/*
 * Migrate a btree folio from @src to @dst.
 *
 * Return: -EAGAIN when @src is dirty, or when it has private data that
 * filemap_release_folio() does not release; otherwise the result of
 * migrate_folio().
 */
static int btree_migrate_folio(struct address_space *mapping,
                struct folio *dst, struct folio *src, enum migrate_mode mode)
{
        bool busy;

        /*
         * A dirty btree page can't be written safely from here, the
         * locking hook hasn't been done.
         *
         * Buffers may be managed in a filesystem specific way, so there
         * must either be none attached or they must be dropped.
         */
        busy = folio_test_dirty(src) ||
               (folio_get_private(src) &&
                !filemap_release_folio(src, GFP_KERNEL));
        if (busy)
                return -EAGAIN;

        return migrate_folio(mapping, dst, src, mode);
}
#else
#define btree_migrate_folio NULL
#endif

/*
 * Write back dirty btree pages of @mapping.
 *
 * For WB_SYNC_NONE writeback nothing is written (and 0 is returned) when
 * the request is a kupdate one, or when the dirty metadata counter
 * compares below BTRFS_DIRTY_METADATA_THRESH.  In every other case the
 * result of btree_write_cache_pages() is returned.
 */
static int btree_writepages(struct address_space *mapping,
                            struct writeback_control *wbc)
{
        struct btrfs_fs_info *fs_info;

        if (wbc->sync_mode != WB_SYNC_NONE)
                return btree_write_cache_pages(mapping, wbc);

        if (wbc->for_kupdate)
                return 0;

        fs_info = inode_to_fs_info(mapping->host);
        /* this is a bit racy, but that's ok */
        if (__percpu_counter_compare(&fs_info->dirty_metadata_bytes,
                                     BTRFS_DIRTY_METADATA_THRESH,
                                     fs_info->dirty_metadata_batch) < 0)
                return 0;

        return btree_write_cache_pages(mapping, wbc);
}

/*
 * release_folio hook of the btree inode.
 *
 * A folio that is under writeback or dirty is never released; otherwise the
 * decision is left to try_release_extent_buffer().
 */
static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
{
        if (!folio_test_writeback(folio) && !folio_test_dirty(folio))
                return try_release_extent_buffer(&folio->page);

        return false;
}

/*
 * invalidate_folio hook of the btree inode.
 *
 * Invalidates the folio in the inode's io tree, tries to release it and, if
 * private data is still attached afterwards, warns and detaches it.
 */
static void btree_invalidate_folio(struct folio *folio, size_t offset,
                                 size_t length)
{
        struct extent_io_tree *io_tree = &folio_to_inode(folio)->io_tree;

        extent_invalidate_folio(io_tree, folio, offset);
        btree_release_folio(folio, GFP_NOFS);

        /* Nothing left attached, we are done. */
        if (!folio_get_private(folio))
                return;

        btrfs_warn(folio_to_fs_info(folio),
                   "folio private not zero on folio %llu",
                   (unsigned long long)folio_pos(folio));
        folio_detach_private(folio);
}

#ifdef DEBUG
static bool btree_dirty_folio(struct address_space *mapping,
                struct folio *folio)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
        struct btrfs_subpage_info *spi = fs_info->subpage_info;
        struct btrfs_subpage *subpage;
        struct extent_buffer *eb;
        int cur_bit = 0;
        u64 page_start = folio_pos(folio);

        if (fs_info->sectorsize == PAGE_SIZE) {
                eb = folio_get_private(folio);
                BUG_ON(!eb);
                BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
                BUG_ON(!atomic_read(&eb->refs));
                btrfs_assert_tree_write_locked(eb);
                return filemap_dirty_folio(mapping, folio);
        }

        ASSERT(spi);
        subpage = folio_get_private(folio);

        for (cur_bit = spi->dirty_offset;
             cur_bit < spi->dirty_offset + spi->bitmap_nr_bits;
             cur_bit++) {
                unsigned long flags;
                u64 cur;

                spin_lock_irqsave(&subpage->lock, flags);
                if (!test_bit(cur_bit, subpage->bitmaps)) {
                        spin_unlock_irqrestore(&subpage->lock, flags);
                        continue;
                }
                spin_unlock_irqrestore(&subpage->lock, flags);
                cur = page_start + cur_bit * fs_info->sectorsize;

                eb = find_extent_buffer(fs_info, cur);
                ASSERT(eb);
                ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
                ASSERT(atomic_read(&eb->refs));
                btrfs_assert_tree_write_locked(eb);
                free_extent_buffer(eb);

                cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1;
        }
        return filemap_dirty_folio(mapping, folio);
}
#else
#define btree_dirty_folio filemap_dirty_folio
#endif

/* Address space operations used for the btree inode's mapping. */
static const struct address_space_operations btree_aops = {
        .dirty_folio      = btree_dirty_folio,
        .invalidate_folio = btree_invalidate_folio,
        .migrate_folio    = btree_migrate_folio,
        .release_folio    = btree_release_folio,
        .writepages       = btree_writepages,
};

/*
 * Get the extent buffer for the tree block at @bytenr.
 *
 * A testing fs_info gets its buffer from alloc_test_extent_buffer(), every
 * other fs_info from alloc_extent_buffer().
 */
struct extent_buffer *btrfs_find_create_tree_block(
                                                struct btrfs_fs_info *fs_info,
                                                u64 bytenr, u64 owner_root,
                                                int level)
{
        if (!btrfs_is_testing(fs_info))
                return alloc_extent_buffer(fs_info, bytenr, owner_root, level);

        return alloc_test_extent_buffer(fs_info, bytenr);
}

/*
 * Read the tree block at logical address @bytenr and run the basic but
 * critical verification on it.
 *
 * @check:        expected tree parentness check, see the comments of the
 *                structure for details. Must not be NULL.
 *
 * Returns the extent buffer on success.  On failure the buffer is dropped and
 * an ERR_PTR() is returned: the allocation or read error as is, or -EUCLEAN
 * when the owner check against check->owner_root fails.
 */
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
                                      struct btrfs_tree_parent_check *check)
{
        struct extent_buffer *eb;
        int err;

        ASSERT(check);

        eb = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
                                          check->level);
        if (IS_ERR(eb))
                return eb;

        err = btrfs_read_extent_buffer(eb, check);
        /* The owner is only checked once the block was read successfully. */
        if (!err && btrfs_check_eb_owner(eb, check->owner_root))
                err = -EUCLEAN;

        if (err) {
                free_extent_buffer_stale(eb);
                return ERR_PTR(err);
        }

        return eb;
}

/*
 * Bring a freshly allocated root into its initial in-memory state.
 *
 * @fs_info and @objectid are recorded in the root, the reference count starts
 * at one and log_transid_committed at -1; the remaining fields touched here
 * are reset to their empty/zero state.  The two log related io trees are only
 * initialized when @fs_info is not a testing fs_info.
 */
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
                         u64 objectid)
{
        const bool is_test_fs = btrfs_is_testing(fs_info);

        /* Identity and plain scalar state. */
        memset(&root->root_key, 0, sizeof(root->root_key));
        memset(&root->root_item, 0, sizeof(root->root_item));
        memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
        root->fs_info = fs_info;
        root->root_key.objectid = objectid;
        root->node = NULL;
        root->commit_root = NULL;
        root->state = 0;
        root->last_trans = 0;
        root->free_objectid = 0;
        root->nr_delalloc_inodes = 0;
        root->nr_ordered_extents = 0;
        root->anon_dev = 0;
        refcount_set(&root->refs, 1);

        /* Trees and other containers. */
        RB_CLEAR_NODE(&root->rb_node);
        root->inode_tree = RB_ROOT;
        xa_init(&root->delayed_nodes);

        btrfs_init_root_block_rsv(root);

        /* List heads. */
        INIT_LIST_HEAD(&root->dirty_list);
        INIT_LIST_HEAD(&root->root_list);
        INIT_LIST_HEAD(&root->delalloc_inodes);
        INIT_LIST_HEAD(&root->delalloc_root);
        INIT_LIST_HEAD(&root->ordered_extents);
        INIT_LIST_HEAD(&root->ordered_root);
        INIT_LIST_HEAD(&root->reloc_dirty_list);
        INIT_LIST_HEAD(&root->log_ctxs[0]);
        INIT_LIST_HEAD(&root->log_ctxs[1]);

        /* Spinlocks. */
        spin_lock_init(&root->inode_lock);
        spin_lock_init(&root->delalloc_lock);
        spin_lock_init(&root->ordered_extent_lock);
        spin_lock_init(&root->accounting_lock);
        spin_lock_init(&root->qgroup_meta_rsv_lock);
        spin_lock_init(&root->root_item_lock);

        /* Mutexes. */
        mutex_init(&root->objectid_mutex);
        mutex_init(&root->log_mutex);
        mutex_init(&root->ordered_extent_mutex);
        mutex_init(&root->delalloc_mutex);

        /* Wait queues. */
        init_waitqueue_head(&root->qgroup_flush_wait);
        init_waitqueue_head(&root->log_writer_wait);
        init_waitqueue_head(&root->log_commit_wait[0]);
        init_waitqueue_head(&root->log_commit_wait[1]);

        /* Atomic counters. */
        atomic_set(&root->log_commit[0], 0);
        atomic_set(&root->log_commit[1], 0);
        atomic_set(&root->log_writers, 0);
        atomic_set(&root->log_batch, 0);
        atomic_set(&root->snapshot_force_cow, 0);
        atomic_set(&root->nr_swapfiles, 0);

        /* Log transaction bookkeeping. */
        btrfs_set_root_log_transid(root, 0);
        root->log_transid_committed = -1;
        btrfs_set_root_last_log_commit(root, 0);

        if (!is_test_fs) {
                extent_io_tree_init(fs_info, &root->dirty_log_pages,
                                    IO_TREE_ROOT_DIRTY_LOG_PAGES);
                extent_io_tree_init(fs_info, &root->log_csum_range,
                                    IO_TREE_LOG_CSUM_RANGE);
        }

        btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);

#ifdef CONFIG_BTRFS_DEBUG
        /* Track the root on fs_info->allocated_roots for leak detection. */
        INIT_LIST_HEAD(&root->leak_list);
        spin_lock(&fs_info->fs_roots_radix_lock);
        list_add_tail(&root->leak_list, &fs_info->allocated_roots);
        spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
}

/*
 * Allocate a zeroed root with the given allocation @flags and initialize it
 * through __setup_root().
 *
 * Returns NULL if the allocation fails.
 */
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
                                           u64 objectid, gfp_t flags)
{
        struct btrfs_root *new_root;

        new_root = kzalloc(sizeof(*new_root), flags);
        if (!new_root)
                return NULL;

        __setup_root(new_root, fs_info, objectid);
        return new_root;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Allocate a dummy root with BTRFS_ROOT_TREE_OBJECTID as its objectid.
 * Should only be used by the testing infrastructure.
 *
 * Returns ERR_PTR(-EINVAL) without an fs_info and ERR_PTR(-ENOMEM) when the
 * allocation fails.
 */
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *dummy;

        if (!fs_info)
                return ERR_PTR(-EINVAL);

        dummy = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
        if (!dummy)
                return ERR_PTR(-ENOMEM);

        /* Explicitly start the dummy root with a zero alloc_bytenr. */
        dummy->alloc_bytenr = 0;

        return dummy;
}
#endif

/* rbtree insertion comparator: orders two global roots by their root keys. */
static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
{
        const struct btrfs_root *lhs;
        const struct btrfs_root *rhs;

        lhs = rb_entry(a_node, struct btrfs_root, rb_node);
        rhs = rb_entry(b_node, struct btrfs_root, rb_node);

        return btrfs_comp_cpu_keys(&lhs->root_key, &rhs->root_key);
}

/* rbtree lookup comparator: compares a search key with a global root's key. */
static int global_root_key_cmp(const void *k, const struct rb_node *node)
{
        const struct btrfs_key *wanted = k;
        const struct btrfs_root *candidate;

        candidate = rb_entry(node, struct btrfs_root, rb_node);

        return btrfs_comp_cpu_keys(wanted, &candidate->root_key);
}

/*
 * Add @root to the global root tree of its fs_info.
 *
 * Returns 0 on success, or -EEXIST (after warning) if a root with an equal
 * key is already in the tree.
 */
int btrfs_global_root_insert(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct rb_node *existing;

        write_lock(&fs_info->global_root_lock);
        existing = rb_find_add(&root->rb_node, &fs_info->global_root_tree,
                               global_root_cmp);
        write_unlock(&fs_info->global_root_lock);

        if (!existing)
                return 0;

        btrfs_warn(fs_info, "global root %llu %llu already exists",
                   btrfs_root_id(root), root->root_key.offset);
        return -EEXIST;
}

/* Remove @root from the global root tree, under the global root write lock. */
void btrfs_global_root_delete(struct btrfs_root *root)
{
        struct btrfs_fs_info *info = root->fs_info;

        write_lock(&info->global_root_lock);
        rb_erase(&root->rb_node, &info->global_root_tree);
        write_unlock(&info->global_root_lock);
}

/*
 * Look up the global root matching @key.
 *
 * Returns the root, or NULL if there is no match.  No reference is taken on
 * the returned root here.
 */
struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
                                     struct btrfs_key *key)
{
        struct btrfs_root *found = NULL;
        struct rb_node *hit;

        read_lock(&fs_info->global_root_lock);
        hit = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
        if (hit)
                found = rb_entry(hit, struct btrfs_root, rb_node);
        read_unlock(&fs_info->global_root_lock);

        return found;
}

/*
 * Return the global root id of the block group found for @bytenr.
 *
 * Without the EXTENT_TREE_V2 incompat feature the id is always 0.  A @bytenr
 * of 0 selects the first block group.  0 is also returned if no block group
 * is found.
 */
static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
{
        struct btrfs_block_group *bg;
        u64 root_id;

        if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
                return 0;

        bg = bytenr ? btrfs_lookup_block_group(fs_info, bytenr) :
                      btrfs_lookup_first_block_group(fs_info, bytenr);
        ASSERT(bg);
        if (!bg)
                return 0;

        root_id = bg->global_root_id;
        btrfs_put_block_group(bg);

        return root_id;
}

/*
 * Return the csum root responsible for @bytenr, i.e. the global root keyed by
 * (BTRFS_CSUM_TREE_OBJECTID, BTRFS_ROOT_ITEM_KEY, global root id of @bytenr).
 */
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
        struct btrfs_key key;

        key.objectid = BTRFS_CSUM_TREE_OBJECTID;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = btrfs_global_root_id(fs_info, bytenr);

        return btrfs_global_root(fs_info, &key);
}

/*
 * Return the extent root responsible for @bytenr, i.e. the global root keyed
 * by (BTRFS_EXTENT_TREE_OBJECTID, BTRFS_ROOT_ITEM_KEY, global root id of
 * @bytenr).
 */
struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
        struct btrfs_key key;

        key.objectid = BTRFS_EXTENT_TREE_OBJECTID;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = btrfs_global_root_id(fs_info, bytenr);

        return btrfs_global_root(fs_info, &key);
}

/*
 * Return the root holding the block group items: the dedicated block group
 * root when the BLOCK_GROUP_TREE compat_ro feature is set, otherwise the
 * extent root looked up for bytenr 0.
 */
struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
        if (!btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
                return btrfs_extent_root(fs_info, 0);

        return fs_info->block_group_root;
}

/*
 * Create a new tree with the given @objectid.
 *
 * Allocates the in-memory root and a level 0 block for it, fills in the root
 * item and inserts that item into the tree root.
 *
 * Returns the new root, or an ERR_PTR() on failure, in which case the
 * reference on the partially set up root is dropped.
 */
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
                                     u64 objectid)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root_item *item;
        struct btrfs_root *root;
        struct extent_buffer *leaf;
        struct btrfs_key key = {
                .objectid = objectid,
                .type = BTRFS_ROOT_ITEM_KEY,
                .offset = 0,
        };
        unsigned int nofs_flag;
        int ret;

        /*
         * A transaction handle is held, so allocate in a NOFS context to
         * avoid a deadlock if reclaim happens.
         */
        nofs_flag = memalloc_nofs_save();
        root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
        memalloc_nofs_restore(nofs_flag);
        if (!root)
                return ERR_PTR(-ENOMEM);

        root->root_key = key;

        leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
                                      0, BTRFS_NESTING_NORMAL);
        if (IS_ERR(leaf)) {
                ret = PTR_ERR(leaf);
                goto fail;
        }

        root->node = leaf;
        btrfs_mark_buffer_dirty(trans, leaf);

        root->commit_root = btrfs_root_node(root);
        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);

        /* Describe the new single-leaf tree in the root item. */
        item = &root->root_item;
        btrfs_set_root_flags(item, 0);
        btrfs_set_root_limit(item, 0);
        btrfs_set_root_bytenr(item, leaf->start);
        btrfs_set_root_generation(item, trans->transid);
        btrfs_set_root_level(item, 0);
        btrfs_set_root_refs(item, 1);
        btrfs_set_root_used(item, leaf->len);
        btrfs_set_root_last_snapshot(item, 0);
        btrfs_set_root_dirid(item, 0);
        /* Only fs trees get a random uuid, all others the null guid. */
        if (is_fstree(objectid))
                generate_random_guid(item->uuid);
        else
                export_guid(item->uuid, &guid_null);
        btrfs_set_root_drop_level(item, 0);

        btrfs_tree_unlock(leaf);

        ret = btrfs_insert_root(trans, tree_root, &key, item);
        if (!ret)
                return root;

fail:
        btrfs_put_root(root);

        return ERR_PTR(ret);
}

/*
 * Allocate the in-memory root of a log tree.
 *
 * The root key is (BTRFS_TREE_LOG_OBJECTID, BTRFS_ROOT_ITEM_KEY,
 * BTRFS_TREE_LOG_OBJECTID).  No tree block is allocated here.
 *
 * Returns the root or ERR_PTR(-ENOMEM).
 */
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *log;

        log = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
        if (!log)
                return ERR_PTR(-ENOMEM);

        log->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
        log->root_key.type = BTRFS_ROOT_ITEM_KEY;
        log->root_key.offset = BTRFS_TREE_LOG_OBJECTID;

        return log;
}

/*
 * Allocate the root node (a level 0 block) of the log tree @root, mark it
 * dirty and unlock it.
 *
 * Returns 0 on success or the error from the block allocation.
 */
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root)
{
        struct extent_buffer *node;

        /*
         * The SHAREABLE bit must NOT be set for log trees.
         *
         * Log trees are not exposed to user space, so they can't be
         * snapshotted, and they go away before a real commit is actually
         * done.
         *
         * They do store pointers to file data extents, and those reference
         * counts still get updated (along with back refs to the log tree).
         */
        node = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
                        NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL);
        if (IS_ERR(node))
                return PTR_ERR(node);

        root->node = node;

        btrfs_mark_buffer_dirty(trans, node);
        btrfs_tree_unlock(node);

        return 0;
}

/*
 * Create the log root tree and publish it in fs_info->log_root_tree.
 *
 * On a zoned filesystem the root node is not allocated here.
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *log_root;
        int ret = 0;

        log_root = alloc_log_tree(trans, fs_info);
        if (IS_ERR(log_root))
                return PTR_ERR(log_root);

        if (!btrfs_is_zoned(fs_info))
                ret = btrfs_alloc_log_tree_node(trans, log_root);
        if (ret) {
                btrfs_put_root(log_root);
                return ret;
        }

        WARN_ON(fs_info->log_root_tree);
        fs_info->log_root_tree = log_root;
        return 0;
}

/*
 * Create a log tree for @root and attach it as root->log_root.
 *
 * The log root gets its root node allocated, its key offset set to the id of
 * @root and its embedded inode item filled in.  The log transaction counters
 * of @root are reset as well.
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_inode_item *log_inode;
        struct btrfs_root *log;
        int err;

        log = alloc_log_tree(trans, fs_info);
        if (IS_ERR(log))
                return PTR_ERR(log);

        err = btrfs_alloc_log_tree_node(trans, log);
        if (err) {
                btrfs_put_root(log);
                return err;
        }

        log->last_trans = trans->transid;
        log->root_key.offset = btrfs_root_id(root);

        /* Fill in the inode item embedded in the log root item. */
        log_inode = &log->root_item.inode;
        btrfs_set_stack_inode_generation(log_inode, 1);
        btrfs_set_stack_inode_size(log_inode, 3);
        btrfs_set_stack_inode_nlink(log_inode, 1);
        btrfs_set_stack_inode_nbytes(log_inode, fs_info->nodesize);
        btrfs_set_stack_inode_mode(log_inode, S_IFDIR | 0755);

        btrfs_set_root_node(&log->root_item, log->node);

        WARN_ON(root->log_root);
        root->log_root = log;

        /* Start the log transaction bookkeeping of @root from scratch. */
        btrfs_set_root_log_transid(root, 0);
        root->log_transid_committed = -1;
        btrfs_set_root_last_log_commit(root, 0);

        return 0;
}

/*
 * Read the root described by @key from @tree_root, using @path for the
 * search.
 *
 * Allocates the in-memory root, loads its root item, reads the root node and
 * verifies it.
 *
 * Returns the root, or an ERR_PTR(): -ENOMEM, -ENOENT if no root item was
 * found, -EIO if the node is not uptodate for the expected generation,
 * -EUCLEAN on an owner mismatch, or the error of the failing helper.
 */
static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
                                              struct btrfs_path *path,
                                              struct btrfs_key *key)
{
        struct btrfs_fs_info *fs_info = tree_root->fs_info;
        struct btrfs_tree_parent_check check = { 0 };
        struct btrfs_root *root;
        u64 generation;
        int ret;

        root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
        if (!root)
                return ERR_PTR(-ENOMEM);

        ret = btrfs_find_root(tree_root, key, path,
                              &root->root_item, &root->root_key);
        /* A positive return means no matching root item was found. */
        if (ret > 0)
                ret = -ENOENT;
        if (ret)
                goto fail;

        generation = btrfs_root_generation(&root->root_item);
        check.level = btrfs_root_level(&root->root_item);
        check.transid = generation;
        check.owner_root = key->objectid;

        root->node = read_tree_block(fs_info,
                                     btrfs_root_bytenr(&root->root_item),
                                     &check);
        if (IS_ERR(root->node)) {
                ret = PTR_ERR(root->node);
                /* Don't leave an error pointer behind for btrfs_put_root(). */
                root->node = NULL;
                goto fail;
        }

        if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
                ret = -EIO;
                goto fail;
        }

        /*
         * On a real fs the owner of the root node has to match the root,
         * with the exception of log and reloc trees.
         */
        if (!btrfs_is_testing(fs_info) &&
            btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
            btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
            btrfs_root_id(root) != btrfs_header_owner(root->node)) {
                btrfs_crit(fs_info,
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
                           btrfs_root_id(root), root->node->start,
                           btrfs_header_owner(root->node),
                           btrfs_root_id(root));
                ret = -EUCLEAN;
                goto fail;
        }

        root->commit_root = btrfs_root_node(root);
        return root;

fail:
        btrfs_put_root(root);
        return ERR_PTR(ret);
}

/*
 * Read the root described by @key from @tree_root with a temporary path.
 *
 * Returns the root or an ERR_PTR(), -ENOMEM if the path can't be allocated.
 */
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
                                        struct btrfs_key *key)
{
        struct btrfs_path *tmp_path;
        struct btrfs_root *result;

        tmp_path = btrfs_alloc_path();
        if (!tmp_path)
                return ERR_PTR(-ENOMEM);

        result = read_tree_root_path(tree_root, tmp_path, key);
        btrfs_free_path(tmp_path);

        return result;
}

/*
 * Initialize the in-memory structure of a subvolume root.
 *
 * @anon_dev:        anonymous device to attach to the root, if zero, allocate
 *                a new one
 *
 * Returns 0 on success or a negative errno.  On failure the caller is
 * responsible to call btrfs_free_fs_root.
 */
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
        int ret;

        btrfs_drew_lock_init(&root->snapshot_lock);

        if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
            !btrfs_is_data_reloc_root(root) &&
            is_fstree(btrfs_root_id(root))) {
                set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
                btrfs_check_and_init_root_item(&root->root_item);
        }

        /*
         * Roots that are not exposed to userspace get no anonymous block
         * device, the id pool is limited to 1M.
         */
        if (is_fstree(btrfs_root_id(root)) &&
            btrfs_root_refs(&root->root_item) > 0) {
                if (anon_dev) {
                        root->anon_dev = anon_dev;
                } else {
                        ret = get_anon_bdev(&root->anon_dev);
                        if (ret)
                                return ret;
                }
        }

        mutex_lock(&root->objectid_mutex);
        ret = btrfs_init_root_free_objectid(root);
        if (!ret)
                ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
        mutex_unlock(&root->objectid_mutex);

        return ret;
}

/*
 * Look up @root_id in the fs roots radix tree and grab the hit through
 * btrfs_grab_root(), all under fs_roots_radix_lock.
 *
 * Returns whatever btrfs_grab_root() returns for the lookup result.
 */
static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
                                               u64 root_id)
{
        struct btrfs_root *cached;

        spin_lock(&fs_info->fs_roots_radix_lock);
        cached = btrfs_grab_root(radix_tree_lookup(&fs_info->fs_roots_radix,
                                                   (unsigned long)root_id));
        spin_unlock(&fs_info->fs_roots_radix_lock);

        return cached;
}

/*
 * Grab one of the roots that are reachable from fs_info directly.
 *
 * Extent, csum and free space roots are looked up in the global root tree
 * with a key offset of 0, the others are taken from their fs_info pointer.
 *
 * Returns the result of btrfs_grab_root() for the selected root, or NULL for
 * any other @objectid.
 */
static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
                                                u64 objectid)
{
        struct btrfs_key key = {
                .objectid = objectid,
                .type = BTRFS_ROOT_ITEM_KEY,
                .offset = 0,
        };
        struct btrfs_root *root;

        switch (objectid) {
        case BTRFS_EXTENT_TREE_OBJECTID:
        case BTRFS_CSUM_TREE_OBJECTID:
        case BTRFS_FREE_SPACE_TREE_OBJECTID:
                root = btrfs_global_root(fs_info, &key);
                break;
        case BTRFS_ROOT_TREE_OBJECTID:
                root = fs_info->tree_root;
                break;
        case BTRFS_CHUNK_TREE_OBJECTID:
                root = fs_info->chunk_root;
                break;
        case BTRFS_DEV_TREE_OBJECTID:
                root = fs_info->dev_root;
                break;
        case BTRFS_QUOTA_TREE_OBJECTID:
                root = fs_info->quota_root;
                break;
        case BTRFS_UUID_TREE_OBJECTID:
                root = fs_info->uuid_root;
                break;
        case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
                root = fs_info->block_group_root;
                break;
        case BTRFS_RAID_STRIPE_TREE_OBJECTID:
                root = fs_info->stripe_root;
                break;
        default:
                return NULL;
        }

        return btrfs_grab_root(root);
}

/*
 * Insert @root into the fs roots radix tree, indexed by its root id.
 *
 * On success a reference is grabbed on the root and BTRFS_ROOT_IN_RADIX is
 * set in its state.
 *
 * Returns 0 on success, or the error of the preload or the insertion.
 */
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
                         struct btrfs_root *root)
{
        int err;

        err = radix_tree_preload(GFP_NOFS);
        if (err)
                return err;

        spin_lock(&fs_info->fs_roots_radix_lock);
        err = radix_tree_insert(&fs_info->fs_roots_radix,
                                (unsigned long)btrfs_root_id(root),
                                root);
        if (!err) {
                btrfs_grab_root(root);
                set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
        radix_tree_preload_end();

        return err;
}

/*
 * Report every root still on fs_info->allocated_roots as leaked and drop all
 * of its references.  Does nothing without CONFIG_BTRFS_DEBUG.
 */
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
        struct list_head *leaked = &fs_info->allocated_roots;

        while (!list_empty(leaked)) {
                struct btrfs_root *root;
                char buf[BTRFS_ROOT_NAME_BUF_LEN];

                root = list_first_entry(leaked, struct btrfs_root, leak_list);
                btrfs_err(fs_info, "leaked root %s refcount %d",
                          btrfs_root_name(&root->root_key, buf),
                          refcount_read(&root->refs));
                WARN_ON_ONCE(1);

                /*
                 * Drop the references down to one, then drop the last one.
                 * NOTE(review): the final put presumably unlinks the root
                 * from the list, otherwise this loop would not terminate.
                 */
                while (refcount_read(&root->refs) > 1)
                        btrfs_put_root(root);
                btrfs_put_root(root);
        }
#endif
}

/*
 * Empty the global root tree: erase each root from the tree and drop a
 * reference on it.
 */
static void free_global_roots(struct btrfs_fs_info *fs_info)
{
        struct rb_root *tree = &fs_info->global_root_tree;
        struct rb_node *node;

        for (node = rb_first_postorder(tree); node != NULL;
             node = rb_first_postorder(tree)) {
                struct btrfs_root *victim;

                victim = rb_entry(node, struct btrfs_root, rb_node);
                rb_erase(&victim->rb_node, tree);
                btrfs_put_root(victim);
        }
}

/*
 * Tear down @fs_info: destroy its percpu counters, free the auxiliary
 * structures, drop the references on the roots it points to, run the leak
 * checks and finally free the fs_info itself.
 */
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
        /* Percpu counters. */
        percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
        percpu_counter_destroy(&fs_info->delalloc_bytes);
        percpu_counter_destroy(&fs_info->ordered_bytes);
        /* No evictable extent map may be accounted at this point. */
        if (percpu_counter_initialized(&fs_info->evictable_extent_maps))
                ASSERT(percpu_counter_sum_positive(&fs_info->evictable_extent_maps) == 0);
        percpu_counter_destroy(&fs_info->evictable_extent_maps);
        percpu_counter_destroy(&fs_info->dev_replace.bio_counter);

        /* Auxiliary structures. */
        btrfs_free_csum_hash(fs_info);
        btrfs_free_stripe_hash_table(fs_info);
        btrfs_free_ref_cache(fs_info);
        kfree(fs_info->balance_ctl);
        kfree(fs_info->delayed_root);

        /* Roots referenced by the fs_info. */
        free_global_roots(fs_info);
        btrfs_put_root(fs_info->tree_root);
        btrfs_put_root(fs_info->chunk_root);
        btrfs_put_root(fs_info->dev_root);
        btrfs_put_root(fs_info->quota_root);
        btrfs_put_root(fs_info->uuid_root);
        btrfs_put_root(fs_info->fs_root);
        btrfs_put_root(fs_info->data_reloc_root);
        btrfs_put_root(fs_info->block_group_root);
        btrfs_put_root(fs_info->stripe_root);

        /* Leak checks, run after all references above were dropped. */
        btrfs_check_leaked_roots(fs_info);
        btrfs_extent_buffer_leak_debug_check(fs_info);

        kfree(fs_info->super_copy);
        kfree(fs_info->super_for_commit);
        kfree(fs_info->subpage_info);
        kvfree(fs_info);
}


/*
 * Get an in-memory reference of a root structure.
 *
 * For essential trees like root/extent tree, we grab it from fs_info directly.
 * For subvolume trees, we check the cached filesystem roots first. If not
 * found, then read it from disk and add it to cached fs roots.
 *
 * Caller should release the root by calling btrfs_put_root() after the usage.
 *
 * NOTE: Reloc and log trees can't be read by this function as they share the
 *         same root objectid.
 *
 * @objectid:        root id
 * @anon_dev:        preallocated anonymous block device number for new roots,
 *                pass NULL for a new allocation. If a cached root is found
 *                instead, the preallocated device is freed and *anon_dev is
 *                set to 0. On failure the device is left to the caller.
 * @check_ref:        whether to check root item references, If true, return -ENOENT
 *                for orphan roots
 *
 * Returns the root or an ERR_PTR().
 */
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
                                             u64 objectid, dev_t *anon_dev,
                                             bool check_ref)
{
        struct btrfs_root *root;
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        root = btrfs_get_global_root(fs_info, objectid);
        if (root)
                return root;

        /*
         * If we're called for non-subvolume trees, and above function didn't
         * find one, do not try to read it from disk.
         *
         * This is namely for free-space-tree and quota tree, which can change
         * at runtime and should only be grabbed from fs_info.
         */
        if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
                return ERR_PTR(-ENOENT);
again:
        root = btrfs_lookup_fs_root(fs_info, objectid);
        if (root) {
                /*
                 * Some other caller may have read out the newly inserted
                 * subvolume already (for things like backref walk etc).  Not
                 * that common but still possible.  In that case, we just need
                 * to free the anon_dev.
                 */
                if (unlikely(anon_dev && *anon_dev)) {
                        free_anon_bdev(*anon_dev);
                        *anon_dev = 0;
                }

                if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
                        btrfs_put_root(root);
                        return ERR_PTR(-ENOENT);
                }
                return root;
        }

        key.objectid = objectid;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = (u64)-1;
        root = btrfs_read_tree_root(fs_info->tree_root, &key);
        if (IS_ERR(root))
                return root;

        if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
                ret = -ENOENT;
                goto fail;
        }

        ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0);
        if (ret)
                goto fail;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto fail;
        }
        /* Remember whether an orphan item exists for this root. */
        key.objectid = BTRFS_ORPHAN_OBJECTID;
        key.type = BTRFS_ORPHAN_ITEM_KEY;
        key.offset = objectid;

        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
        btrfs_free_path(path);
        if (ret < 0)
                goto fail;
        if (ret == 0)
                set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);

        ret = btrfs_insert_fs_root(fs_info, root);
        if (ret) {
                if (ret == -EEXIST) {
                        /*
                         * Somebody else inserted the root first, drop our
                         * copy and retry the lookup.  A caller provided
                         * anonymous device must be detached from our copy
                         * before that, otherwise btrfs_put_root() frees it
                         * and the retry would free (or reuse) it again.
                         */
                        if (anon_dev && *anon_dev)
                                root->anon_dev = 0;
                        btrfs_put_root(root);
                        goto again;
                }
                goto fail;
        }
        return root;
fail:
        /*
         * If our caller provided us an anonymous device, then it's his
         * responsibility to free it in case we fail. So we have to set our
         * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
         * and once again by our caller.
         */
        if (anon_dev && *anon_dev)
                root->anon_dev = 0;
        btrfs_put_root(root);
        return ERR_PTR(ret);
}

/*
 * Get an in-memory reference of a root structure, without a preallocated
 * anonymous block device.
 *
 * @objectid:        tree objectid
 * @check_ref:        if set, verify that the tree exists and the item has at least
 *                one reference
 */
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
                                     u64 objectid, bool check_ref)
{
        dev_t *no_anon_dev = NULL;

        return btrfs_get_root_ref(fs_info, objectid, no_anon_dev, check_ref);
}

/*
 * Get an in-memory reference of a newly created root structure, optionally
 * passing the anonymous block device id to use.  The root item references
 * are always checked.
 *
 * @objectid:        tree objectid
 * @anon_dev:        if NULL, a new anonymous block device is allocated,
 *                otherwise the value it points to is used
 */
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
                                         u64 objectid, dev_t *anon_dev)
{
        const bool check_ref = true;

        return btrfs_get_root_ref(fs_info, objectid, anon_dev, check_ref);
}

/*
 * Return a root for the given objectid, for backref walking only.
 *
 * @fs_info:        the fs_info
 * @path:        path used when the root has to be read from the tree root; it
 *                must have search_commit_root and skip_locking set and is
 *                released after that read
 * @objectid:        the objectid we need to lookup
 *
 * This exists specifically because of how qgroups does lookups.  Qgroups will
 * do a backref lookup at delayed ref creation time, which means we may have to
 * read the tree_root in order to look up a fs root that is not in memory.  In
 * that case the tree root commit root is read and the fs root is looked up
 * from there.  Such a root is temporary: it is not inserted into the radix
 * tree as it doesn't have the most uptodate information, and it is simply
 * discarded once the backref code is finished using it.
 */
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
                                                 struct btrfs_path *path,
                                                 u64 objectid)
{
        struct btrfs_root *root;
        struct btrfs_key key = {
                .objectid = objectid,
                .type = BTRFS_ROOT_ITEM_KEY,
                .offset = (u64)-1,
        };

        ASSERT(path->search_commit_root && path->skip_locking);

        /*
         * Try the roots we already have in memory first: the global roots,
         * then the cached fs roots.
         *
         * The global root lookup can return -ENOENT if we ask for a root that
         * doesn't exist, but since we are called via the backref walking code
         * we won't be looking up a root that doesn't exist, unless there's
         * corruption.  So any non-NULL result is handed back as is.
         */
        root = btrfs_get_global_root(fs_info, objectid);
        if (!root)
                root = btrfs_lookup_fs_root(fs_info, objectid);
        if (root)
                return root;

        /* Not in memory, read it through the caller's commit root path. */
        root = read_tree_root_path(fs_info->tree_root, path, &key);
        btrfs_release_path(path);

        return root;
}

/*
 * Main loop of the cleaner kernel thread.
 *
 * @arg:        the struct btrfs_fs_info this thread works on
 *
 * Each pass marks the cleaner as running (BTRFS_FS_CLEANER_RUNNING) and, unless
 * one of the early checks sends it straight to sleep, does the following:
 *
 *  - under fs_info->cleaner_mutex: update sysfs features if they changed, run
 *    delayed iputs and clean one deleted snapshot
 *  - without the mutex: run defrag on queued inodes, delete unused block
 *    groups and reclaim block groups
 *
 * The running bit is then cleared (waking up anybody waiting on it), park and
 * stop requests are honoured, and the thread sleeps until it is woken up
 * again, unless btrfs_clean_one_deleted_snapshot() returned nonzero, in which
 * case the next pass starts immediately.
 *
 * Returns 0 once kthread_should_stop() is true.
 */
static int cleaner_kthread(void *arg)
{
        struct btrfs_fs_info *fs_info = arg;
        int again;

        while (1) {
                /*
                 * Reset on every pass, so every early "goto sleep" below ends
                 * in a real sleep at the bottom of the loop.
                 */
                again = 0;

                set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);

                /* Make the cleaner go to sleep early. */
                if (btrfs_need_cleaner_sleep(fs_info))
                        goto sleep;

                /*
                 * Do not do anything if we might cause open_ctree() to block
                 * before we have finished mounting the filesystem.
                 */
                if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
                        goto sleep;

                /* Never block on the mutex, just try again on the next wake up. */
                if (!mutex_trylock(&fs_info->cleaner_mutex))
                        goto sleep;

                /*
                 * Avoid the problem that we change the status of the fs
                 * during the above check and trylock.
                 */
                if (btrfs_need_cleaner_sleep(fs_info)) {
                        mutex_unlock(&fs_info->cleaner_mutex);
                        goto sleep;
                }

                if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
                        btrfs_sysfs_feature_update(fs_info);

                btrfs_run_delayed_iputs(fs_info);

                /* A nonzero result skips the sleep at the end of this pass. */
                again = btrfs_clean_one_deleted_snapshot(fs_info);
                mutex_unlock(&fs_info->cleaner_mutex);

                /*
                 * The defragger has dealt with the R/O remount and umount,
                 * needn't do anything special here.
                 */
                btrfs_run_defrag_inodes(fs_info);

                /*
                 * Acquires fs_info->reclaim_bgs_lock to avoid racing
                 * with relocation (btrfs_relocate_chunk) and relocation
                 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
                 * after acquiring fs_info->reclaim_bgs_lock. So we
                 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
                 * unused block groups.
                 */
                btrfs_delete_unused_bgs(fs_info);

                /*
                 * Reclaim block groups in the reclaim_bgs list after we deleted
                 * all unused block_groups. This possibly gives us some more free
                 * space.
                 */
                btrfs_reclaim_bgs(fs_info);
sleep:
                /* Every path through the loop, early exit or not, ends up here. */
                clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
                if (kthread_should_park())
                        kthread_parkme();
                if (kthread_should_stop())
                        return 0;
                if (!again) {
                        /*
                         * Sleep without a timeout; somebody has to wake us up
                         * (transaction_kthread() does so on each of its passes).
                         */
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule();
                        __set_current_state(TASK_RUNNING);
                }
        }
}

/*
 * Main loop of the transaction kernel thread, which commits the running
 * transaction periodically.
 *
 * @arg:        a struct btrfs_root; its fs_info is the filesystem worked on and
 *                the root itself is used to attach to the running transaction
 *
 * On each pass, under fs_info->transaction_kthread_mutex:
 *
 *  - if there is no running transaction, nothing is committed
 *  - if no commit was requested through BTRFS_FS_COMMIT_TRANS, the transaction
 *    has not reached TRANS_STATE_COMMIT_PREP and it is younger than
 *    fs_info->commit_interval seconds, only the sleep time is adjusted
 *  - otherwise the thread attaches to the running transaction and commits it,
 *    provided it is still the transaction that was sampled under trans_lock
 *
 * After that the cleaner thread is woken up, an errored filesystem gets its
 * transactions cleaned up, and the thread sleeps for up to the computed delay.
 *
 * Returns 0 once kthread_should_stop() is true.
 */
static int transaction_kthread(void *arg)
{
        struct btrfs_root *root = arg;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        struct btrfs_transaction *cur;
        u64 transid;
        time64_t delta;
        unsigned long delay;
        bool cannot_commit;

        do {
                cannot_commit = false;
                /* Default sleep time: one full commit interval, in jiffies. */
                delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
                mutex_lock(&fs_info->transaction_kthread_mutex);

                spin_lock(&fs_info->trans_lock);
                cur = fs_info->running_transaction;
                if (!cur) {
                        spin_unlock(&fs_info->trans_lock);
                        goto sleep;
                }

                /* Age of the running transaction in seconds. */
                delta = ktime_get_seconds() - cur->start_time;
                if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
                    cur->state < TRANS_STATE_COMMIT_PREP &&
                    delta < fs_info->commit_interval) {
                        spin_unlock(&fs_info->trans_lock);
                        /*
                         * Too young to commit: shorten the sleep by the time
                         * the transaction has already been running.  The min()
                         * caps the result at one full commit interval.
                         *
                         * NOTE(review): for delta == 0 the value passed to
                         * msecs_to_jiffies() is negative; presumably the cap
                         * below is what keeps delay sane then - confirm.
                         */
                        delay -= msecs_to_jiffies((delta - 1) * 1000);
                        delay = min(delay,
                                    msecs_to_jiffies(fs_info->commit_interval * 1000));
                        goto sleep;
                }
                /* Remember which transaction we decided to commit. */
                transid = cur->transid;
                spin_unlock(&fs_info->trans_lock);

                /* If the file system is aborted, this will always fail. */
                trans = btrfs_attach_transaction(root);
                if (IS_ERR(trans)) {
                        /*
                         * Any failure other than -ENOENT makes the sleep below
                         * unconditional.
                         */
                        if (PTR_ERR(trans) != -ENOENT)
                                cannot_commit = true;
                        goto sleep;
                }
                /*
                 * Only commit if we attached to the transaction sampled above,
                 * otherwise just drop the handle again.
                 */
                if (transid == trans->transid) {
                        btrfs_commit_transaction(trans);
                } else {
                        btrfs_end_transaction(trans);
                }
sleep:
                wake_up_process(fs_info->cleaner_kthread);
                mutex_unlock(&fs_info->transaction_kthread_mutex);

                if (BTRFS_FS_ERROR(fs_info))
                        btrfs_cleanup_transaction(fs_info);
                /*
                 * Skip the sleep when asked to stop, or when the transaction is
                 * blocked and attaching did not fail with a hard error.
                 */
                if (!kthread_should_stop() &&
                                (!btrfs_transaction_blocked(fs_info) ||
                                 cannot_commit))
                        schedule_timeout_interruptible(delay);
        } while (!kthread_should_stop());
        return 0;
}

/*
 * This will find the highest generation in the array of root backups.  The
 * index of the highest array is returned, or -EINVAL if we can't find
 * anything.
 *
 * We check to make sure the array is valid by comparing the
 * generation of the latest  root in the array with the generation
 * in the super block.  If they don't match we pitch it.
 */
static int find_newest_super_backup(struct btrfs_fs_info *info)
{
        const u64 newest_gen = btrfs_super_generation(info->super_copy);
        u64 cur;
        struct btrfs_root_backup *root_backup;
        int i;

        for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
                root_backup = info->super_copy->super_roots + i;
                cur = btrfs_backup_tree_root_gen(root_backup);
                if (cur == newest_gen)
                        return i;
        }

        return -EINVAL;
}

/*
 * Record the current root pointers in the next slot of the super block's
 * backup root array.
 *
 * @info:        filesystem whose roots are recorded
 *
 * The slot at info->backup_root_index of super_for_commit is zeroed and then
 * filled with the start, generation and level of the tree, chunk, extent,
 * csum, fs and dev roots, plus the total bytes, bytes used and device count
 * taken from super_copy.  The backup index is advanced by one, wrapping at
 * BTRFS_NUM_BACKUP_ROOTS, and the whole backup array is copied back into
 * super_copy.
 */
static void backup_super_roots(struct btrfs_fs_info *info)
{
        const int slot = info->backup_root_index;
        struct btrfs_root_backup *backup;
        struct btrfs_root *tree_root = info->tree_root;
        struct btrfs_root *chunk_root = info->chunk_root;
        struct btrfs_root *fs_root = info->fs_root;
        struct btrfs_root *dev_root = info->dev_root;

        backup = &info->super_for_commit->super_roots[slot];

        /*
         * Zero fill first, so padding and the fields we do not use today
         * never carry stale data.
         */
        memset(backup, 0, sizeof(*backup));

        info->backup_root_index = (slot + 1) % BTRFS_NUM_BACKUP_ROOTS;

        /* Tree root. */
        btrfs_set_backup_tree_root(backup, tree_root->node->start);
        btrfs_set_backup_tree_root_gen(backup,
                                       btrfs_header_generation(tree_root->node));
        btrfs_set_backup_tree_root_level(backup,
                                         btrfs_header_level(tree_root->node));

        /* Chunk root. */
        btrfs_set_backup_chunk_root(backup, chunk_root->node->start);
        btrfs_set_backup_chunk_root_gen(backup,
                                        btrfs_header_generation(chunk_root->node));
        btrfs_set_backup_chunk_root_level(backup,
                                          btrfs_header_level(chunk_root->node));

        /* Extent and csum roots, skipped when the block group tree is in use. */
        if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
                struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
                struct btrfs_root *csum_root = btrfs_csum_root(info, 0);

                btrfs_set_backup_extent_root(backup, extent_root->node->start);
                btrfs_set_backup_extent_root_gen(backup,
                                btrfs_header_generation(extent_root->node));
                btrfs_set_backup_extent_root_level(backup,
                                btrfs_header_level(extent_root->node));

                btrfs_set_backup_csum_root(backup, csum_root->node->start);
                btrfs_set_backup_csum_root_gen(backup,
                                btrfs_header_generation(csum_root->node));
                btrfs_set_backup_csum_root_level(backup,
                                btrfs_header_level(csum_root->node));
        }

        /*
         * We might commit during log recovery, which happens before the
         * fs_root is set up, so only record it once it is valid.
         */
        if (fs_root && fs_root->node) {
                btrfs_set_backup_fs_root(backup, fs_root->node->start);
                btrfs_set_backup_fs_root_gen(backup,
                                             btrfs_header_generation(fs_root->node));
                btrfs_set_backup_fs_root_level(backup,
                                               btrfs_header_level(fs_root->node));
        }

        /* Device root. */
        btrfs_set_backup_dev_root(backup, dev_root->node->start);
        btrfs_set_backup_dev_root_gen(backup,
                                      btrfs_header_generation(dev_root->node));
        btrfs_set_backup_dev_root_level(backup,
                                        btrfs_header_level(dev_root->node));

        /* Space and device accounting as currently stored in super_copy. */
        btrfs_set_backup_total_bytes(backup,
                                     btrfs_super_total_bytes(info->super_copy));
        btrfs_set_backup_bytes_used(backup,
                                    btrfs_super_bytes_used(info->super_copy));
        btrfs_set_backup_num_devices(backup,
                                     btrfs_super_num_devices(info->super_copy));

        /*
         * Mirror the whole array into super_copy, otherwise it would not be
         * remembered for the next commit.
         */
        memcpy(&info->super_copy->super_roots,
               &info->super_for_commit->super_roots,
               sizeof(*backup) * BTRFS_NUM_BACKUP_ROOTS);
}

/*
 * Select a backup root by priority and load it into the super block copy.
 *
 * @fs_info:  filesystem whose backup roots need to be read
 * @priority: priority of backup root required; 0 is the newest, 1/2/3 are the
 *            2nd newest/3rd newest/4th (oldest) backup roots
 *
 * For priority 0 only the index of the newest backup is returned and the super
 * block copy is left untouched.  For the older backups the generation, root,
 * root level, bytes used, total bytes and device count of the super block copy
 * are overwritten with the values stored in the selected backup.
 *
 * Returns backup root index on success and -EINVAL otherwise, that is when
 * @priority is out of range or no newest backup could be found.
 */
static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
{
        int newest = find_newest_super_backup(fs_info);
        struct btrfs_super_block *super = fs_info->super_copy;
        struct btrfs_root_backup *backup;
        int index;

        if (priority >= BTRFS_NUM_BACKUP_ROOTS || newest < 0)
                return -EINVAL;

        if (priority == 0)
                return newest;

        /* Walk @priority slots backwards from the newest one, wrapping around. */
        index = (newest + BTRFS_NUM_BACKUP_ROOTS - priority) %
                BTRFS_NUM_BACKUP_ROOTS;
        backup = &super->super_roots[index];

        btrfs_set_super_generation(super, btrfs_backup_tree_root_gen(backup));
        btrfs_set_super_root(super, btrfs_backup_tree_root(backup));
        btrfs_set_super_root_level(super, btrfs_backup_tree_root_level(backup));
        btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(backup));

        /*
         * Fixme: the total bytes and num_devices need to match or we should
         * need a fsck
         */
        btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(backup));
        btrfs_set_super_num_devices(super, btrfs_backup_num_devices(backup));

        return index;
}

/*
 * Helper to cleanup workers: destroy every workqueue referenced by @fs_info.
 *
 * Queues torn down with destroy_workqueue() are checked for NULL first, the
 * ones torn down with btrfs_destroy_workqueue() are passed in unconditionally
 * (presumably that helper tolerates NULL - confirm).
 *
 * The order matters: the metadata endio queue goes last, see below.
 */
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
        btrfs_destroy_workqueue(fs_info->fixup_workers);
        btrfs_destroy_workqueue(fs_info->delalloc_workers);
        btrfs_destroy_workqueue(fs_info->workers);
        if (fs_info->endio_workers)
                destroy_workqueue(fs_info->endio_workers);
        if (fs_info->rmw_workers)
                destroy_workqueue(fs_info->rmw_workers);
        if (fs_info->compressed_write_workers)
                destroy_workqueue(fs_info->compressed_write_workers);
        btrfs_destroy_workqueue(fs_info->endio_write_workers);
        btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
        btrfs_destroy_workqueue(fs_info->delayed_workers);
        btrfs_destroy_workqueue(fs_info->caching_workers);
        btrfs_destroy_workqueue(fs_info->flush_workers);
        btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
        if (fs_info->discard_ctl.discard_workers)
                destroy_workqueue(fs_info->discard_ctl.discard_workers);
        /*
         * Now that all other work queues are destroyed, we can safely destroy
         * the queues used for metadata I/O, since tasks from those other work
         * queues can do metadata I/O operations.
         */
        if (fs_info->endio_meta_workers)
                destroy_workqueue(fs_info->endio_meta_workers);
}

/*
 * Drop the extent buffers a root holds for its node and its commit root, and
 * clear both pointers.
 *
 * @root:        root to release the buffers of, may be NULL (nothing is done then)
 */
static void free_root_extent_buffers(struct btrfs_root *root)
{
        if (!root)
                return;

        free_extent_buffer(root->node);
        free_extent_buffer(root->commit_root);

        /* Do not leave pointers to the released buffers behind. */
        root->node = NULL;
        root->commit_root = NULL;
}

static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *root, *tmp;

        rbtree_postorder_for_each_entry_safe(root, tmp,
                                             &fs_info->global_root_tree,
                                             rb_node)
                free_root_extent_buffers(root);
}

/*
 * Helper to cleanup tree roots: release the extent buffers of the tree root,
 * all global roots and the other per filesystem roots.
 *
 * @info:                filesystem whose roots are cleaned up
 * @free_chunk_root:        also release the buffers of the chunk root
 */
static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
        free_root_extent_buffers(info->tree_root);

        free_global_root_pointers(info);

        free_root_extent_buffers(info->dev_root);
        free_root_extent_buffers(info->quota_root);
        free_root_extent_buffers(info->uuid_root);
        free_root_extent_buffers(info->fs_root);
        free_root_extent_buffers(info->data_reloc_root);
        free_root_extent_buffers(info->block_group_root);
        free_root_extent_buffers(info->stripe_root);

        /* The chunk root is only touched on request. */
        if (!free_chunk_root)
                return;

        free_root_extent_buffers(info->chunk_root);
}

/*
 * Drop one reference on a root and free the root when it was the last one.
 *
 * @root:        root to put, may be NULL (nothing is done then)
 *
 * Freeing a root warns if it still has inodes in its inode tree or is still
 * flagged as a dead reloc tree, gives back its anonymous block device if it
 * has one, and releases its extent buffers.
 */
void btrfs_put_root(struct btrfs_root *root)
{
        if (!root)
                return;

        /* Somebody else still holds a reference, nothing more to do. */
        if (!refcount_dec_and_test(&root->refs))
                return;

        WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
        WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));

        if (root->anon_dev)
                free_anon_bdev(root->anon_dev);

        free_root_extent_buffers(root);

#ifdef CONFIG_BTRFS_DEBUG
        /* Debug builds keep roots on a leak list, take this one off it. */
        spin_lock(&root->fs_info->fs_roots_radix_lock);
        list_del_init(&root->leak_list);
        spin_unlock(&root->fs_info->fs_roots_radix_lock);
#endif

        kfree(root);
}

/*
 * Drop all fs roots of a filesystem: first the ones queued on the dead roots
 * list, then everything that is left in the fs roots radix tree.
 */
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *gang[8];
        struct btrfs_root *dead;
        int found;
        int i;

        /* Empty the dead roots list, one root at a time from its head. */
        while (!list_empty(&fs_info->dead_roots)) {
                dead = list_entry(fs_info->dead_roots.next,
                                  struct btrfs_root, root_list);
                list_del(&dead->root_list);

                if (test_bit(BTRFS_ROOT_IN_RADIX, &dead->state))
                        btrfs_drop_and_free_fs_root(fs_info, dead);
                btrfs_put_root(dead);
        }

        /*
         * Look up batches of roots starting from index 0 until the radix tree
         * returns nothing (presumably btrfs_drop_and_free_fs_root() removes
         * each root from the tree - confirm).
         */
        do {
                found = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
                                               (void **)gang, 0,
                                               ARRAY_SIZE(gang));
                for (i = 0; i < found; i++)
                        btrfs_drop_and_free_fs_root(fs_info, gang[i]);
        } while (found);
}

/*
 * Initialize the scrub related fields of @fs_info: the lock, the wait queue,
 * and all counters, which start out at zero.
 */
static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
{
        /* Synchronization primitives. */
        mutex_init(&fs_info->scrub_lock);
        init_waitqueue_head(&fs_info->scrub_pause_wait);

        /* Nothing is running, paused or requested yet. */
        atomic_set(&fs_info->scrubs_running, 0);
        atomic_set(&fs_info->scrubs_paused, 0);
        atomic_set(&fs_info->scrub_pause_req, 0);
        atomic_set(&fs_info->scrub_cancel_req, 0);

        refcount_set(&fs_info->scrub_workers_refcnt, 0);
}

/*
 * Initialize the balance related fields of @fs_info: locks, the wait queue,
 * the request counters (all zero) and the balance control pointer (NULL).
 */
static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
{
        /* Synchronization primitives. */
        spin_lock_init(&fs_info->balance_lock);
        mutex_init(&fs_info->balance_mutex);
        init_waitqueue_head(&fs_info->balance_wait_q);

        /* No balance is set up and nothing has been requested yet. */
        fs_info->balance_ctl = NULL;
        atomic_set(&fs_info->balance_pause_req, 0);
        atomic_set(&fs_info->balance_cancel_req, 0);
        atomic_set(&fs_info->reloc_cancel_req, 0);
}

static int btrfs_init_btree_inode(struct super_block *sb)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
                                              fs_info->tree_root);
        struct inode *inode;

        inode = new_inode(sb);
        if (!inode)
                return -ENOMEM;

        inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
        set_nlink(inode, 1);
        /*
         * we set the i_size on the btree inode to the max possible int.
         * the real end of the address space is determined by all of
         * the devices in the system
         */
        inode->i_size = OFFSET_MAX;
        inode->i_mapping->a_ops = &btree_aops;
        mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);

        RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
        extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
                            IO_TREE_BTREE_INODE_IO);
        extent_map_tree_init(&BTRFS_I(inode)->extent_tree);

        BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
        BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
        BTRFS_I(inode)->location.type = 0;
        BTRFS_I(inode)->location.offset = 0;
        set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
        __insert_inode_hash(inode, hash);
        fs_info->btree_inode = inode;

        return 0;
}

/*
 * Initialize the synchronization primitives embedded in fs_info->dev_replace:
 * the rw semaphore, the finishing/cancel/unmount mutex and the wait queue.
 */
static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
{
        init_rwsem(&fs_info->dev_replace.rwsem);
        mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
        init_waitqueue_head(&fs_info->dev_replace.replace_wait);
}

/*
 * Initialize the qgroup related fields of @fs_info: the locks, an empty
 * qgroup tree and dirty list, and the default values of the remaining state.
 */
static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
{
        /* Locks. */
        spin_lock_init(&fs_info->qgroup_lock);
        mutex_init(&fs_info->qgroup_ioctl_lock);
        mutex_init(&fs_info->qgroup_rescan_lock);

        /* Empty containers. */
        fs_info->qgroup_tree = RB_ROOT;
        INIT_LIST_HEAD(&fs_info->dirty_qgroups);
        fs_info->qgroup_ulist = NULL;

        /* Initial state. */
        fs_info->qgroup_seq = 1;
        fs_info->qgroup_rescan_running = false;
        fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
}

/*
 * Allocate all workqueues of a filesystem.
 *
 * @fs_info:        filesystem to allocate the workqueues for; its
 *                thread_pool_size is used as the max_active limit
 *
 * Every allocation is attempted, the results are only checked at the end.
 *
 * Returns 0 if all workqueues were allocated and -ENOMEM if at least one
 * allocation failed.  Nothing is freed here in the failure case (presumably
 * the caller cleans up, e.g. through btrfs_stop_all_workers() - confirm).
 */
static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
{
        const u32 nr_active = fs_info->thread_pool_size;
        const unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
        const unsigned int ordered_wq_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE;

        fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", wq_flags,
                                                 nr_active, 16);
        fs_info->delalloc_workers = btrfs_alloc_workqueue(fs_info, "delalloc",
                                                          wq_flags, nr_active,
                                                          2);
        fs_info->flush_workers = btrfs_alloc_workqueue(fs_info,
                                                       "flush_delalloc",
                                                       wq_flags, nr_active, 0);
        fs_info->caching_workers = btrfs_alloc_workqueue(fs_info, "cache",
                                                         wq_flags, nr_active,
                                                         0);
        fs_info->fixup_workers = btrfs_alloc_ordered_workqueue(fs_info, "fixup",
                                                               ordered_wq_flags);
        fs_info->endio_workers = alloc_workqueue("btrfs-endio", wq_flags,
                                                 nr_active);
        fs_info->endio_meta_workers = alloc_workqueue("btrfs-endio-meta",
                                                      wq_flags, nr_active);
        fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", wq_flags,
                                               nr_active);
        fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info,
                                                             "endio-write",
                                                             wq_flags,
                                                             nr_active, 2);
        fs_info->compressed_write_workers =
                alloc_workqueue("btrfs-compressed-write", wq_flags, nr_active);
        fs_info->endio_freespace_worker = btrfs_alloc_workqueue(fs_info,
                                                                "freespace-write",
                                                                wq_flags,
                                                                nr_active, 0);
        fs_info->delayed_workers = btrfs_alloc_workqueue(fs_info,
                                                         "delayed-meta",
                                                         wq_flags, nr_active,
                                                         0);
        fs_info->qgroup_rescan_workers =
                btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan",
                                              ordered_wq_flags);
        fs_info->discard_ctl.discard_workers =
                alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE);

        /* A single missing workqueue fails the whole setup. */
        if (!fs_info->workers ||
            !fs_info->delalloc_workers ||
            !fs_info->flush_workers ||
            !fs_info->endio_workers ||
            !fs_info->endio_meta_workers ||
            !fs_info->compressed_write_workers ||
            !fs_info->endio_write_workers ||
            !fs_info->endio_freespace_worker ||
            !fs_info->rmw_workers ||
            !fs_info->caching_workers ||
            !fs_info->fixup_workers ||
            !fs_info->delayed_workers ||
            !fs_info->qgroup_rescan_workers ||
            !fs_info->discard_ctl.discard_workers)
                return -ENOMEM;

        return 0;
}

/*
 * Allocate the checksum hash for a filesystem and store it in
 * fs_info->csum_shash.
 *
 * @fs_info:        filesystem the hash is set up for
 * @csum_type:        checksum type, used to pick the crypto driver and name
 *
 * Also sets BTRFS_FS_CSUM_IMPL_FAST when the implementation is considered a
 * fast one and logs which algorithm and driver are in use.
 *
 * Returns 0 on success or the error from crypto_alloc_shash().
 */
static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
{
        const char *csum_driver = btrfs_super_csum_driver(csum_type);
        struct crypto_shash *tfm;
        bool fast;

        tfm = crypto_alloc_shash(csum_driver, 0, 0);
        if (IS_ERR(tfm)) {
                btrfs_err(fs_info, "error allocating %s hash for checksum",
                          csum_driver);
                return PTR_ERR(tfm);
        }
        fs_info->csum_shash = tfm;

        /*
         * Check if the checksum implementation is a fast accelerated one.
         * As-is this is a bit of a hack and should be replaced once the csum
         * implementations provide that information themselves.
         *
         * CRC32 counts as fast unless the driver name contains "generic",
         * XXHASH always counts as fast, every other type never does.
         */
        if (csum_type == BTRFS_CSUM_TYPE_CRC32)
                fast = !strstr(crypto_shash_driver_name(tfm), "generic");
        else
                fast = (csum_type == BTRFS_CSUM_TYPE_XXHASH);

        if (fast)
                set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);

        btrfs_info(fs_info, "using %s (%s) checksum algorithm",
                        btrfs_super_csum_name(csum_type),
                        crypto_shash_driver_name(tfm));
        return 0;
}

/*
 * Read the log tree root recorded in the super block and replay the log.
 *
 * @fs_info:        filesystem to replay the log on
 * @fs_devices:        devices of the filesystem, only used to check that at
 *                least one of them is writable
 *
 * If the super block is read-only, the super is committed after a successful
 * replay.
 *
 * Returns 0 on success, -EIO if there is no writable device or the log tree
 * root block is not uptodate, -ENOMEM if the log root could not be allocated,
 * or the error from reading the block, recovering the log trees or committing
 * the super.
 */
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
                            struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_super_block *disk_super = fs_info->super_copy;
        const u64 bytenr = btrfs_super_log_root(disk_super);
        const int level = btrfs_super_log_root_level(disk_super);
        struct btrfs_tree_parent_check check = { 0 };
        struct btrfs_root *log_tree_root;
        int ret;

        if (fs_devices->rw_devices == 0) {
                btrfs_warn(fs_info, "log replay required on RO media");
                return -EIO;
        }

        log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
                                         GFP_KERNEL);
        if (!log_tree_root)
                return -ENOMEM;

        /* What the block found at bytenr has to look like. */
        check.level = level;
        check.transid = fs_info->generation + 1;
        check.owner_root = BTRFS_TREE_LOG_OBJECTID;

        log_tree_root->node = read_tree_block(fs_info, bytenr, &check);
        if (IS_ERR(log_tree_root->node)) {
                btrfs_warn(fs_info, "failed to read log tree");
                ret = PTR_ERR(log_tree_root->node);
                /* Do not let the put below see the error pointer. */
                log_tree_root->node = NULL;
                goto put_root;
        }
        if (!extent_buffer_uptodate(log_tree_root->node)) {
                btrfs_err(fs_info, "failed to read log tree");
                ret = -EIO;
                goto put_root;
        }

        /* returns with log_tree_root freed on success */
        ret = btrfs_recover_log_trees(log_tree_root);
        if (ret) {
                btrfs_handle_fs_error(fs_info, ret,
                                      "Failed to recover log tree");
                goto put_root;
        }

        if (sb_rdonly(fs_info->sb))
                return btrfs_commit_super(fs_info);

        return 0;

put_root:
        btrfs_put_root(log_tree_root);
        return ret;
}

/*
 * Load every root with the given objectid from the tree root and insert each
 * of them through btrfs_global_root_insert().
 *
 * @tree_root:        root that is searched for the root items
 * @path:        path used for the searches, released before returning
 * @objectid:        objectid of the roots to load
 * @name:        name of the root, only used in the error message
 *
 * The search starts at key (objectid, BTRFS_ROOT_ITEM_KEY, 0) and continues
 * one offset past each loaded root, until an item with a different objectid
 * is found or the tree ends.
 *
 * Special cases:
 *  - for the csum tree with IGNOREDATACSUMS set nothing is loaded,
 *    BTRFS_FS_STATE_NO_CSUMS is set and 0 is returned
 *  - for the extent tree fs_info->nr_global_roots is set to the highest key
 *    offset seen plus one
 *
 * Returns 0 on success.  If no root was found or an error occurred, an error
 * is logged and the error is returned (-ENOENT if nothing was found), unless
 * IGNOREBADROOTS is set, in which case 0 is returned.
 */
static int load_global_roots_objectid(struct btrfs_root *tree_root,
                                      struct btrfs_path *path, u64 objectid,
                                      const char *name)
{
        struct btrfs_fs_info *fs_info = tree_root->fs_info;
        struct btrfs_root *root;
        u64 max_global_id = 0;
        int ret;
        struct btrfs_key key = {
                .objectid = objectid,
                .type = BTRFS_ROOT_ITEM_KEY,
                .offset = 0,
        };
        bool found = false;

        /* If we have IGNOREDATACSUMS skip loading these roots. */
        if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
            btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
                set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
                return 0;
        }

        while (1) {
                ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
                if (ret < 0)
                        break;

                /* The slot is past the last item of this leaf, try the next one. */
                if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                        ret = btrfs_next_leaf(tree_root, path);
                        if (ret) {
                                /* A positive value is not treated as an error. */
                                if (ret > 0)
                                        ret = 0;
                                break;
                        }
                }
                /*
                 * A positive result of the search (presumably "no exact
                 * match") is not an error here.
                 */
                ret = 0;

                /* Continue with the key actually found at this slot. */
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                if (key.objectid != objectid)
                        break;
                btrfs_release_path(path);

                /*
                 * Just worry about this for extent tree, it'll be the same for
                 * everybody.
                 */
                if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
                        max_global_id = max(max_global_id, key.offset);

                /* Set before the read, so a failing read still counts as found. */
                found = true;
                root = read_tree_root_path(tree_root, path, &key);
                if (IS_ERR(root)) {
                        /* With IGNOREBADROOTS ret stays 0, but loading stops. */
                        if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
                                ret = PTR_ERR(root);
                        break;
                }
                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
                ret = btrfs_global_root_insert(root);
                if (ret) {
                        btrfs_put_root(root);
                        break;
                }
                /* Look for the next root with the same objectid. */
                key.offset++;
        }
        btrfs_release_path(path);

        if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
                fs_info->nr_global_roots = max_global_id + 1;

        if (!found || ret) {
                if (objectid == BTRFS_CSUM_TREE_OBJECTID)
                        set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);

                /* Finding nothing at all is reported as -ENOENT. */
                if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
                        ret = ret ? ret : -ENOENT;
                else
                        ret = 0;
                /* Logged even when IGNOREBADROOTS turned the result into 0. */
                btrfs_err(fs_info, "failed to load root %s", name);
        }
        return ret;
}

/*
 * Load the global roots: the extent roots, the csum roots and, if the
 * FREE_SPACE_TREE compat_ro feature is set, the free space roots.
 *
 * @tree_root:        root the global roots are looked up in
 *
 * Loading stops at the first failure.
 *
 * Returns 0 on success, -ENOMEM if the path could not be allocated, or the
 * error from load_global_roots_objectid().
 */
static int load_global_roots(struct btrfs_root *tree_root)
{
        struct btrfs_fs_info *fs_info = tree_root->fs_info;
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = load_global_roots_objectid(tree_root, path,
                                         BTRFS_EXTENT_TREE_OBJECTID, "extent");
        if (!ret)
                ret = load_global_roots_objectid(tree_root, path,
                                                 BTRFS_CSUM_TREE_OBJECTID,
                                                 "csum");
        /* The free space roots only exist with the free space tree feature. */
        if (!ret && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
                ret = load_global_roots_objectid(tree_root, path,
                                                 BTRFS_FREE_SPACE_TREE_OBJECTID,
                                                 "free space");

        btrfs_free_path(path);
        return ret;
}

/*
 * Read the roots referenced from the tree root and cache them in @fs_info.
 *
 * The global roots are loaded first, then (in this order) the block group
 * tree (only with the BLOCK_GROUP_TREE compat_ro bit), the device tree, the
 * data relocation tree, the quota tree, the UUID tree and the raid stripe
 * tree (only with the RAID_STRIPE_TREE incompat bit).  Every root that is
 * read successfully gets BTRFS_ROOT_TRACK_DIRTY set.
 *
 * With the IGNOREBADROOTS mount option a failed read leaves the matching
 * fs_info pointer untouched and processing continues.  A failure to read the
 * quota tree is always ignored and a missing (-ENOENT) UUID tree is
 * tolerated.
 *
 * Returns 0 on success or a negative errno.  Failures that reach the "out"
 * label are logged together with the objectid stored in @location.
 */
static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_root *root;
        struct btrfs_key location;
        int ret;

        ASSERT(fs_info->tree_root);

        ret = load_global_roots(tree_root);
        if (ret)
                return ret;

        location.type = BTRFS_ROOT_ITEM_KEY;
        location.offset = 0;

        if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
                location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
                root = btrfs_read_tree_root(tree_root, &location);
                if (IS_ERR(root)) {
                        if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
                                ret = PTR_ERR(root);
                                goto out;
                        }
                } else {
                        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
                        fs_info->block_group_root = root;
                }
        }

        location.objectid = BTRFS_DEV_TREE_OBJECTID;
        root = btrfs_read_tree_root(tree_root, &location);
        if (IS_ERR(root)) {
                if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
                        ret = PTR_ERR(root);
                        goto out;
                }
        } else {
                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
                fs_info->dev_root = root;
        }
        /* Initialize fs_info for all devices in any case */
        ret = btrfs_init_devices_late(fs_info);
        if (ret)
                goto out;

        /*
         * This tree can share blocks with some other fs tree during relocation
         * and we need a proper setup by btrfs_get_fs_root
         *
         * The lookup itself does not use @location, the objectid is only
         * recorded so that the warning at "out" names the data reloc tree
         * instead of the previously read device tree.
         */
        location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
        root = btrfs_get_fs_root(tree_root->fs_info,
                                 BTRFS_DATA_RELOC_TREE_OBJECTID, true);
        if (IS_ERR(root)) {
                if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
                        ret = PTR_ERR(root);
                        goto out;
                }
        } else {
                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
                fs_info->data_reloc_root = root;
        }

        /* A quota tree that cannot be read is not an error. */
        location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
        root = btrfs_read_tree_root(tree_root, &location);
        if (!IS_ERR(root)) {
                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
                fs_info->quota_root = root;
        }

        location.objectid = BTRFS_UUID_TREE_OBJECTID;
        root = btrfs_read_tree_root(tree_root, &location);
        if (IS_ERR(root)) {
                if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
                        ret = PTR_ERR(root);
                        /* A missing UUID tree is tolerated. */
                        if (ret != -ENOENT)
                                goto out;
                }
        } else {
                set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
                fs_info->uuid_root = root;
        }

        if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
                location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
                root = btrfs_read_tree_root(tree_root, &location);
                if (IS_ERR(root)) {
                        if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
                                ret = PTR_ERR(root);
                                goto out;
                        }
                } else {
                        set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
                        fs_info->stripe_root = root;
                }
        }

        return 0;
out:
        btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
                   location.objectid, ret);
        return ret;
}

/*
 * Real super block validation
 * NOTE: super csum type and incompat features will not be checked here.
 *
 * Every check is run even after an earlier one failed, so all problems found
 * in one super block are reported in a single pass.
 *
 * @fs_info:        filesystem the super block belongs to, used for logging and
 *                for the fsid/metadata_uuid/feature comparisons
 * @sb:                super block to check
 * @mirror_num:        the super block number to check its bytenr:
 *                 0        the primary (1st) sb
 *                 1, 2        2nd and 3rd backup copy
 *                -1        skip bytenr check
 *
 * Return: 0 if all checks passed, -EINVAL if at least one of them failed.
 * Conditions that are only logged with btrfs_warn() and do not set an error
 * (device count above 2^31, generation ordering) do not fail the validation.
 */
int btrfs_validate_super(struct btrfs_fs_info *fs_info,
                         struct btrfs_super_block *sb, int mirror_num)
{
        u64 nodesize = btrfs_super_nodesize(sb);
        u64 sectorsize = btrfs_super_sectorsize(sb);
        int ret = 0;

        if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
                btrfs_err(fs_info, "no valid FS found");
                ret = -EINVAL;
        }
        if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
                btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
                                btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
                ret = -EINVAL;
        }
        if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
                btrfs_err(fs_info, "tree_root level too big: %d >= %d",
                                btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
                ret = -EINVAL;
        }
        if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
                btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
                                btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
                ret = -EINVAL;
        }
        if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
                btrfs_err(fs_info, "log_root level too big: %d >= %d",
                                btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
                ret = -EINVAL;
        }

        /*
         * Check sectorsize and nodesize first, other check will need it.
         * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
         */
        if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
            sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
                btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
                ret = -EINVAL;
        }

        /*
         * We only support at most two sectorsizes: 4K and PAGE_SIZE.
         *
         * We can support 16K sectorsize with 64K page size without problem,
         * but such sectorsize/pagesize combination doesn't make much sense.
         * 4K will be our future standard, PAGE_SIZE is supported from the very
         * beginning.
         */
        if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
                btrfs_err(fs_info,
                        "sectorsize %llu not yet supported for page size %lu",
                        sectorsize, PAGE_SIZE);
                ret = -EINVAL;
        }

        if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
            nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
                btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
                ret = -EINVAL;
        }
        if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
                btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
                          le32_to_cpu(sb->__unused_leafsize), nodesize);
                ret = -EINVAL;
        }

        /* Root alignment check */
        if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
                btrfs_warn(fs_info, "tree_root block unaligned: %llu",
                           btrfs_super_root(sb));
                ret = -EINVAL;
        }
        if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
                btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
                           btrfs_super_chunk_root(sb));
                ret = -EINVAL;
        }
        if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
                btrfs_warn(fs_info, "log_root block unaligned: %llu",
                           btrfs_super_log_root(sb));
                ret = -EINVAL;
        }

        /* The fsid comparison is skipped when fs_devices uses a temp fsid. */
        if (!fs_info->fs_devices->temp_fsid &&
            memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
                btrfs_err(fs_info,
                "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
                          sb->fsid, fs_info->fs_devices->fsid);
                ret = -EINVAL;
        }

        if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb),
                   BTRFS_FSID_SIZE) != 0) {
                btrfs_err(fs_info,
"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
                          btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid);
                ret = -EINVAL;
        }

        if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
                   BTRFS_FSID_SIZE) != 0) {
                btrfs_err(fs_info,
                        "dev_item UUID does not match metadata fsid: %pU != %pU",
                        fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
                ret = -EINVAL;
        }

        /*
         * Artificial requirement for block-group-tree to force newer features
         * (free-space-tree, no-holes) so the test matrix is smaller.
         */
        if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
            (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
             !btrfs_fs_incompat(fs_info, NO_HOLES))) {
                btrfs_err(fs_info,
                "block-group-tree feature requires free-space-tree and no-holes");
                ret = -EINVAL;
        }

        /*
         * Hint to catch really bogus numbers, bitflips or so, more exact checks are
         * done later
         */
        if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
                btrfs_err(fs_info, "bytes_used is too small %llu",
                          btrfs_super_bytes_used(sb));
                ret = -EINVAL;
        }
        if (!is_power_of_2(btrfs_super_stripesize(sb))) {
                btrfs_err(fs_info, "invalid stripesize %u",
                          btrfs_super_stripesize(sb));
                ret = -EINVAL;
        }
        /* A huge device count is only suspicious, a zero count is fatal. */
        if (btrfs_super_num_devices(sb) > (1UL << 31))
                btrfs_warn(fs_info, "suspicious number of devices: %llu",
                           btrfs_super_num_devices(sb));
        if (btrfs_super_num_devices(sb) == 0) {
                btrfs_err(fs_info, "number of devices is 0");
                ret = -EINVAL;
        }

        /*
         * Report the offset the bytenr was actually compared against, which
         * differs from the primary offset for the backup mirrors.
         */
        if (mirror_num >= 0 &&
            btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
                btrfs_err(fs_info, "super offset mismatch %llu != %llu",
                          btrfs_super_bytenr(sb), btrfs_sb_offset(mirror_num));
                ret = -EINVAL;
        }

        /*
         * Obvious sys_chunk_array corruptions, it must hold at least one key
         * and one chunk
         */
        if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
                btrfs_err(fs_info, "system chunk array too big %u > %u",
                          btrfs_super_sys_array_size(sb),
                          BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
                ret = -EINVAL;
        }
        if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
                        + sizeof(struct btrfs_chunk)) {
                btrfs_err(fs_info, "system chunk array too small %u < %zu",
                          btrfs_super_sys_array_size(sb),
                          sizeof(struct btrfs_disk_key)
                          + sizeof(struct btrfs_chunk));
                ret = -EINVAL;
        }

        /*
         * The generation is a global counter, we'll trust it more than the others
         * but it's still possible that it's the one that's wrong.
         */
        if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
                btrfs_warn(fs_info,
                        "suspicious: generation < chunk_root_generation: %llu < %llu",
                        btrfs_super_generation(sb),
                        btrfs_super_chunk_root_generation(sb));
        if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
            && btrfs_super_cache_generation(sb) != (u64)-1)
                btrfs_warn(fs_info,
                        "suspicious: generation < cache_generation: %llu < %llu",
                        btrfs_super_generation(sb),
                        btrfs_super_cache_generation(sb));

        return ret;
}

/*
 * Mount time super block validation.
 *
 * Runs btrfs_validate_super() on the cached super block copy as mirror 0,
 * i.e. the primary super block including its bytenr check.  Checks that are
 * already done early at mount time, like csum type and incompat flags, are
 * not repeated here.
 */
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
{
        struct btrfs_super_block *sb = fs_info->super_copy;

        return btrfs_validate_super(fs_info, sb, 0);
}

/*
 * Write time super block validation.
 *
 * The bytenr check is skipped (mirror number -1) because that value is going
 * to be overwritten soon.  On top of the common checks the csum type and the
 * incompat flags are verified here; a violation of either yields -EUCLEAN.
 * Any negative result is additionally reported as a corruption detected
 * before the super block is written to disk.
 */
static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
                                      struct btrfs_super_block *sb)
{
        int ret = btrfs_validate_super(fs_info, sb, -1);

        /* The extra checks only run when the common validation passed. */
        if (ret >= 0) {
                if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
                        btrfs_err(fs_info, "invalid csum type, has %u want %u",
                                  btrfs_super_csum_type(sb),
                                  BTRFS_CSUM_TYPE_CRC32);
                        ret = -EUCLEAN;
                } else if (btrfs_super_incompat_flags(sb) &
                           ~BTRFS_FEATURE_INCOMPAT_SUPP) {
                        btrfs_err(fs_info,
                        "invalid incompat flags, has 0x%llx valid mask 0x%llx",
                                  btrfs_super_incompat_flags(sb),
                                  (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
                        ret = -EUCLEAN;
                }
        }

        if (ret < 0)
                btrfs_err(fs_info,
                "super block corruption detected before writing it to disk");
        return ret;
}

/*
 * Read the tree block at @bytenr and install it as the node of @root.
 *
 * The block is read with a parent check built from @level, @gen (expected
 * transid) and the id of @root (expected owner).  On success the root item
 * is pointed at the new node, the commit root is set and the root refs are
 * set to 1.
 *
 * Returns 0 on success, the read_tree_block() error, or -EIO when the block
 * was read but is not uptodate.  root->node is NULL after any failure.
 */
static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
{
        struct btrfs_tree_parent_check check = { 0 };
        int err;

        check.level = level;
        check.transid = gen;
        check.owner_root = btrfs_root_id(root);

        root->node = read_tree_block(root->fs_info, bytenr, &check);
        if (IS_ERR(root->node)) {
                err = PTR_ERR(root->node);
                root->node = NULL;
                return err;
        }

        if (!extent_buffer_uptodate(root->node)) {
                free_extent_buffer(root->node);
                root->node = NULL;
                return -EIO;
        }

        btrfs_set_root_node(&root->root_item, root->node);
        root->commit_root = btrfs_root_node(root);
        btrfs_set_root_refs(&root->root_item, 1);

        return 0;
}

/*
 * Load the tree root from the location (bytenr, generation, level) recorded
 * in the cached super block copy.
 *
 * Returns 0 on success or the load_super_root() error after logging a
 * warning.
 */
static int load_important_roots(struct btrfs_fs_info *fs_info)
{
        struct btrfs_super_block *sb = fs_info->super_copy;
        int ret;

        ret = load_super_root(fs_info->tree_root, btrfs_super_root(sb),
                              btrfs_super_generation(sb),
                              btrfs_super_root_level(sb));
        if (ret)
                btrfs_warn(fs_info, "couldn't read tree root");

        return ret;
}

/*
 * Load the tree root and the roots read from it, retrying with backup roots
 * when a step fails and the USEBACKUPROOT mount option is set.
 *
 * Each loop iteration runs load_important_roots(),
 * btrfs_init_root_free_objectid() and btrfs_read_roots().  If any of them
 * fails, the next iteration first drops the tree root node and then, only
 * with USEBACKUPROOT, frees the root pointers, clears the log root in the
 * super block copy and calls read_backup_root() for slot @i before trying
 * again.  At most BTRFS_NUM_BACKUP_ROOTS iterations are made.
 *
 * On success fs_info->generation, the last committed transaction and
 * fs_info->backup_root_index are set up.
 *
 * Returns 0 on success, otherwise the error of the last failed step or of
 * read_backup_root().
 */
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
{
        int backup_index = find_newest_super_backup(fs_info);
        struct btrfs_super_block *sb = fs_info->super_copy;
        struct btrfs_root *tree_root = fs_info->tree_root;
        bool handle_error = false;
        int ret = 0;
        int i;

        for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
                /* Set by a failed step of the previous iteration. */
                if (handle_error) {
                        if (!IS_ERR(tree_root->node))
                                free_extent_buffer(tree_root->node);
                        tree_root->node = NULL;

                        /* No fallback requested, give up with the last error. */
                        if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
                                break;

                        free_root_pointers(fs_info, 0);

                        /*
                         * Don't use the log in recovery mode, it won't be
                         * valid
                         */
                        btrfs_set_super_log_root(sb, 0);

                        btrfs_warn(fs_info, "try to load backup roots slot %d", i);
                        ret = read_backup_root(fs_info, i);
                        /*
                         * A non-negative return value is kept as the backup
                         * index in use, a negative one is an error.
                         */
                        backup_index = ret;
                        if (ret < 0)
                                return ret;
                }

                ret = load_important_roots(fs_info);
                if (ret) {
                        handle_error = true;
                        continue;
                }

                /*
                 * No need to hold btrfs_root::objectid_mutex since the fs
                 * hasn't been fully initialised and we are the only user
                 */
                ret = btrfs_init_root_free_objectid(tree_root);
                if (ret < 0) {
                        handle_error = true;
                        continue;
                }

                ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);

                ret = btrfs_read_roots(fs_info);
                if (ret < 0) {
                        handle_error = true;
                        continue;
                }

                /* All successful */
                fs_info->generation = btrfs_header_generation(tree_root->node);
                btrfs_set_last_trans_committed(fs_info, fs_info->generation);
                fs_info->last_reloc_trans = 0;

                /* Always begin writing backup roots after the one being used */
                if (backup_index < 0) {
                        fs_info->backup_root_index = 0;
                } else {
                        /* Wrap around to slot 0 after the last backup slot. */
                        fs_info->backup_root_index = backup_index + 1;
                        fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
                }
                break;
        }

        return ret;
}

/*
 * Initialize the in-memory members of a freshly allocated @fs_info: radix
 * trees, lists, locks, lockdep maps, rb-tree roots, block reserves, atomic
 * counters, wait queues and default tunables.
 *
 * Nothing is read from disk here; the size related fields only get
 * placeholder values until the real ones are cached from the superblock.
 * The btrfs_init_*() helpers called from here are defined elsewhere.
 */
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
        /* Radix trees, lists and the locks of the core structures. */
        INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
        INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
        INIT_LIST_HEAD(&fs_info->trans_list);
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->delayed_iputs);
        INIT_LIST_HEAD(&fs_info->delalloc_roots);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
        spin_lock_init(&fs_info->delalloc_root_lock);
        spin_lock_init(&fs_info->trans_lock);
        spin_lock_init(&fs_info->fs_roots_radix_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
        spin_lock_init(&fs_info->defrag_inodes_lock);
        spin_lock_init(&fs_info->super_lock);
        spin_lock_init(&fs_info->buffer_lock);
        spin_lock_init(&fs_info->unused_bgs_lock);
        spin_lock_init(&fs_info->treelog_bg_lock);
        spin_lock_init(&fs_info->zone_active_bgs_lock);
        spin_lock_init(&fs_info->relocation_bg_lock);
        rwlock_init(&fs_info->tree_mod_log_lock);
        rwlock_init(&fs_info->global_root_lock);
        mutex_init(&fs_info->unused_bg_unpin_mutex);
        mutex_init(&fs_info->reclaim_bgs_lock);
        mutex_init(&fs_info->reloc_mutex);
        mutex_init(&fs_info->delalloc_root_mutex);
        mutex_init(&fs_info->zoned_meta_io_lock);
        mutex_init(&fs_info->zoned_data_reloc_io_lock);
        seqlock_init(&fs_info->profiles_lock);

        /* Lockdep maps for the transaction and ordered extent annotations. */
        btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
        btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
        btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
        btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
        btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep,
                                     BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
        btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
                                     BTRFS_LOCKDEP_TRANS_UNBLOCKED);
        btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
                                     BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
        btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
                                     BTRFS_LOCKDEP_TRANS_COMPLETED);

        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
        INIT_LIST_HEAD(&fs_info->space_info);
        INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
        INIT_LIST_HEAD(&fs_info->unused_bgs);
        INIT_LIST_HEAD(&fs_info->reclaim_bgs);
        INIT_LIST_HEAD(&fs_info->zone_active_bgs);
        /* These members are only initialized on CONFIG_BTRFS_DEBUG builds. */
#ifdef CONFIG_BTRFS_DEBUG
        INIT_LIST_HEAD(&fs_info->allocated_roots);
        INIT_LIST_HEAD(&fs_info->allocated_ebs);
        spin_lock_init(&fs_info->eb_leak_lock);
#endif
        fs_info->mapping_tree = RB_ROOT_CACHED;
        rwlock_init(&fs_info->mapping_tree_lock);
        /* One block reserve per reservation type. */
        btrfs_init_block_rsv(&fs_info->global_block_rsv,
                             BTRFS_BLOCK_RSV_GLOBAL);
        btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
        btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
        btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
        btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
                             BTRFS_BLOCK_RSV_DELOPS);
        btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
                             BTRFS_BLOCK_RSV_DELREFS);

        /* Counters start at zero, trees start empty, tunables get defaults. */
        atomic_set(&fs_info->async_delalloc_pages, 0);
        atomic_set(&fs_info->defrag_running, 0);
        atomic_set(&fs_info->nr_delayed_iputs, 0);
        atomic64_set(&fs_info->tree_mod_seq, 0);
        fs_info->global_root_tree = RB_ROOT;
        fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
        fs_info->metadata_ratio = 0;
        fs_info->defrag_inodes = RB_ROOT;
        atomic64_set(&fs_info->free_chunk_space, 0);
        fs_info->tree_mod_log = RB_ROOT;
        fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
        btrfs_init_ref_verify(fs_info);

        /* Number of online CPUs plus two, capped at 8. */
        fs_info->thread_pool_size = min_t(unsigned long,
                                          num_online_cpus() + 2, 8);

        INIT_LIST_HEAD(&fs_info->ordered_roots);
        spin_lock_init(&fs_info->ordered_root_lock);

        btrfs_init_scrub(fs_info);
        btrfs_init_balance(fs_info);
        btrfs_init_async_reclaim_work(fs_info);

        rwlock_init(&fs_info->block_group_cache_lock);
        fs_info->block_group_cache_tree = RB_ROOT_CACHED;

        extent_io_tree_init(fs_info, &fs_info->excluded_extents,
                            IO_TREE_FS_EXCLUDED_EXTENTS);

        mutex_init(&fs_info->ordered_operations_mutex);
        mutex_init(&fs_info->tree_log_mutex);
        mutex_init(&fs_info->chunk_mutex);
        mutex_init(&fs_info->transaction_kthread_mutex);
        mutex_init(&fs_info->cleaner_mutex);
        mutex_init(&fs_info->ro_block_group_mutex);
        init_rwsem(&fs_info->commit_root_sem);
        init_rwsem(&fs_info->cleanup_work_sem);
        init_rwsem(&fs_info->subvol_sem);
        /* Counting semaphore with an initial count of 1. */
        sema_init(&fs_info->uuid_tree_rescan_sem, 1);

        btrfs_init_dev_replace_locks(fs_info);
        btrfs_init_qgroup(fs_info);
        btrfs_discard_init(fs_info);

        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
        btrfs_init_free_cluster(&fs_info->data_alloc_cluster);

        init_waitqueue_head(&fs_info->transaction_throttle);
        init_waitqueue_head(&fs_info->transaction_wait);
        init_waitqueue_head(&fs_info->transaction_blocked_wait);
        init_waitqueue_head(&fs_info->async_submit_wait);
        init_waitqueue_head(&fs_info->delayed_iputs_wait);

        /* Usable values until the real ones are cached from the superblock */
        fs_info->nodesize = 4096;
        fs_info->sectorsize = 4096;
        fs_info->sectorsize_bits = ilog2(4096);
        fs_info->stripesize = 4096;

        /* Default compress algorithm when user does -o compress */
        fs_info->compress_type = BTRFS_COMPRESS_ZLIB;

        fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;

        spin_lock_init(&fs_info->swapfile_pins_lock);
        fs_info->swapfile_pins = RB_ROOT;

        fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
        /* Block group reclaim is run from a work item. */
        INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
}

/*
 * Mount time setup of @fs_info that needs the VFS super block @sb or can
 * fail: link @sb, initialize the percpu counters, allocate and initialize
 * the delayed root, record a read-only mount in the fs state and allocate
 * the stripe hash table.
 *
 * Returns 0 on success or a negative errno.  Nothing set up before a failure
 * is released here.
 */
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
{
        int err;

        fs_info->sb = sb;
        /* Placeholder block size, in use until the superblock has been read. */
        sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
        sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);

        err = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
        if (!err)
                err = percpu_counter_init(&fs_info->evictable_extent_maps, 0,
                                          GFP_KERNEL);
        if (!err)
                err = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0,
                                          GFP_KERNEL);
        if (err)
                return err;

        fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids));

        err = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
        if (!err)
                err = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
                                          GFP_KERNEL);
        if (err)
                return err;

        fs_info->delayed_root = kmalloc(sizeof(*fs_info->delayed_root),
                                        GFP_KERNEL);
        if (!fs_info->delayed_root)
                return -ENOMEM;
        btrfs_init_delayed_root(fs_info->delayed_root);

        /* Mirror a read-only mount into the filesystem state bits. */
        if (sb_rdonly(sb))
                set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);

        return btrfs_alloc_stripe_hash_table(fs_info);
}

/*
 * Thread function rescanning the UUID tree, @data is the btrfs_fs_info.
 *
 * Step 1 iterates through the existing UUID tree and deletes all entries
 * that contain outdated data.  Step 2 adds all missing entries by handing
 * over to btrfs_uuid_scan_kthread().
 *
 * When step 1 fails the rescan semaphore is released here and the error is
 * returned; -EINTR is not logged.
 */
static int btrfs_uuid_rescan_kthread(void *data)
{
        struct btrfs_fs_info *fs_info = data;
        int ret = btrfs_uuid_tree_iterate(fs_info);

        if (ret >= 0)
                return btrfs_uuid_scan_kthread(data);

        if (ret != -EINTR)
                btrfs_warn(fs_info, "iterating uuid_tree failed %d",
                           ret);
        up(&fs_info->uuid_tree_rescan_sem);
        return ret;
}

/*
 * Start the "btrfs-uuid" kthread that rescans the UUID tree.
 *
 * The rescan semaphore is taken before the thread is started.  It is
 * released here only if the thread cannot be created, in which case the
 * kthread_run() error is returned.  Returns 0 once the thread is running.
 */
static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
{
        struct task_struct *task;

        down(&fs_info->uuid_tree_rescan_sem);
        task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
        if (!IS_ERR(task))
                return 0;

        /* fs_info->update_uuid_tree_gen remains 0 in all error case */
        btrfs_warn(fs_info, "failed to start uuid_rescan task");
        up(&fs_info->uuid_tree_rescan_sem);
        return PTR_ERR(task);
}

/*
 * Run btrfs_orphan_cleanup() on the roots found in fs_info->fs_roots_radix.
 *
 * The radix tree is walked in batches of up to ARRAY_SIZE(gang) roots.  Each
 * batch is looked up under fs_roots_radix_lock; roots whose root item has
 * zero refs are skipped and the others are grabbed so that they can be used
 * after the lock is dropped.
 *
 * Returns 0 on success or the first error returned by
 * btrfs_orphan_cleanup().  On error the references still held on the rest of
 * the current batch are dropped before returning.
 */
static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
        u64 root_objectid = 0;
        struct btrfs_root *gang[8];
        unsigned int i = 0;
        int err = 0;
        unsigned int ret = 0;

        while (1) {
                spin_lock(&fs_info->fs_roots_radix_lock);
                ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
                                             (void **)gang, root_objectid,
                                             ARRAY_SIZE(gang));
                if (!ret) {
                        spin_unlock(&fs_info->fs_roots_radix_lock);
                        break;
                }
                /*
                 * Resume the next lookup right after the last entry of this
                 * batch.  This is the only place the cursor is advanced:
                 * bumping it again after the batch would skip the root with
                 * the following id whenever the whole batch consisted of
                 * skipped roots.
                 */
                root_objectid = btrfs_root_id(gang[ret - 1]) + 1;

                for (i = 0; i < ret; i++) {
                        /* Avoid to grab roots in dead_roots. */
                        if (btrfs_root_refs(&gang[i]->root_item) == 0) {
                                gang[i] = NULL;
                                continue;
                        }
                        /* Grab all the search result for later use. */
                        gang[i] = btrfs_grab_root(gang[i]);
                }
                spin_unlock(&fs_info->fs_roots_radix_lock);

                for (i = 0; i < ret; i++) {
                        if (!gang[i])
                                continue;
                        err = btrfs_orphan_cleanup(gang[i]);
                        if (err)
                                goto out;
                        btrfs_put_root(gang[i]);
                }
        }
out:
        /*
         * Release the uncleaned roots due to error.  @i still indexes the
         * root that failed; after a normal exit @ret is 0 and nothing is
         * left to release.
         */
        for (; i < ret; i++) {
                if (gang[i])
                        btrfs_put_root(gang[i]);
        }
        return err;
}

/*
 * Read-write specific part of mounting.  Used by open_ctree and by
 * btrfs_remount when switching a read-only mount to read-write.
 *
 * Return 0 when every step succeeded, otherwise the result of the step that
 * stopped the sequence.
 */
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
{
        const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
        bool rebuild_free_space_tree = false;
        int ret;

        /* Decide whether the free space tree has to be rebuilt first. */
        if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
            btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
                if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                        rebuild_free_space_tree = true;
                } else {
                        btrfs_warn(fs_info,
                                   "'clear_cache' option is ignored with extent tree v2");
                }
        } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
                   !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
                btrfs_warn(fs_info, "free space tree is invalid");
                rebuild_free_space_tree = true;
        }

        if (rebuild_free_space_tree) {
                btrfs_info(fs_info, "rebuilding free space tree");
                ret = btrfs_rebuild_free_space_tree(fs_info);
                if (ret) {
                        btrfs_warn(fs_info,
                                   "failed to rebuild free space tree: %d", ret);
                        return ret;
                }
        }

        /* The tree exists on disk but the mount options no longer want it. */
        if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
            !btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
                btrfs_info(fs_info, "disabling free space tree");
                ret = btrfs_delete_free_space_tree(fs_info);
                if (ret) {
                        btrfs_warn(fs_info,
                                   "failed to disable free space tree: %d", ret);
                        return ret;
                }
        }

        /*
         * Dead roots (0 refs) must be found, flagged with BTRFS_ROOT_DEAD_TREE
         * and loaded into fs_info->fs_roots_radix by btrfs_find_orphan_roots()
         * before btrfs_orphan_cleanup() runs on the tree root.  Otherwise the
         * orphan item of a dead root would be deleted before the root's tree
         * is, and after an unmount or crash in between nothing would tell the
         * next mount that a deletion is still pending.
         */
        ret = btrfs_find_orphan_roots(fs_info);
        if (ret)
                return ret;

        ret = btrfs_cleanup_fs_roots(fs_info);
        if (ret)
                return ret;

        /* fs root first, the tree root only if the fs root went fine. */
        down_read(&fs_info->cleanup_work_sem);
        ret = btrfs_orphan_cleanup(fs_info->fs_root);
        if (!ret)
                ret = btrfs_orphan_cleanup(fs_info->tree_root);
        up_read(&fs_info->cleanup_work_sem);
        if (ret)
                return ret;

        mutex_lock(&fs_info->cleaner_mutex);
        ret = btrfs_recover_relocation(fs_info);
        mutex_unlock(&fs_info->cleaner_mutex);
        if (ret < 0) {
                btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
                return ret;
        }

        /* The mount options want a free space tree that is not there yet. */
        if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
            !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
                btrfs_info(fs_info, "creating free space tree");
                ret = btrfs_create_free_space_tree(fs_info);
                if (ret) {
                        btrfs_warn(fs_info,
                                "failed to create free space tree: %d", ret);
                        return ret;
                }
        }

        /* Bring the v1 space cache state in line with the mount option. */
        if (btrfs_free_space_cache_v1_active(fs_info) != cache_opt) {
                ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
                if (ret)
                        return ret;
        }

        ret = btrfs_resume_balance_async(fs_info);
        if (ret)
                return ret;

        ret = btrfs_resume_dev_replace_async(fs_info);
        if (ret) {
                btrfs_warn(fs_info, "failed to resume dev_replace");
                return ret;
        }

        btrfs_qgroup_rescan_resume(fs_info);

        if (fs_info->uuid_root)
                return 0;

        btrfs_info(fs_info, "creating UUID tree");
        ret = btrfs_create_uuid_tree(fs_info);
        if (ret) {
                btrfs_warn(fs_info,
                           "failed to create the UUID tree %d", ret);
                return ret;
        }

        return 0;
}

/*
 * Sanity and dependency checks between features.
 *
 * @fs_info:            Filesystem whose super_copy and mount options are checked.
 * @is_rw_mount:        If the mount is read-write.
 *
 * Only the less strict checks live here (subpage limitations, artificial
 * feature dependencies).  Strict checks and corruption detection belong to
 * btrfs_validate_super().
 *
 * Call this after btrfs_parse_options(): the space cache related mount
 * options can change on-disk format (free space tree) and thereby break
 * feature dependencies.
 *
 * Return 0 after storing the updated incompat flags in the super block copy,
 * -EINVAL if any check fails (the flags are left untouched in that case).
 */
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
{
        struct btrfs_super_block *disk_super = fs_info->super_copy;
        u64 incompat = btrfs_super_incompat_flags(disk_super);
        const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
        const bool unknown_compat_ro =
                (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP) != 0;

        if ((incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) != 0) {
                btrfs_err(fs_info,
                "cannot mount because of unknown incompat features (0x%llx)",
                    incompat);
                return -EINVAL;
        }

        /* Runtime limitation for mixed block groups. */
        if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) != 0 &&
            fs_info->nodesize != fs_info->sectorsize) {
                btrfs_err(fs_info,
"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
                        fs_info->nodesize, fs_info->sectorsize);
                return -EINVAL;
        }

        if (unknown_compat_ro) {
                if (is_rw_mount) {
                        btrfs_err(fs_info,
        "cannot mount read-write because of unknown compat_ro features (0x%llx)",
                               compat_ro);
                        return -EINVAL;
                }

                /*
                 * Read-only mount with unsupported RO compat features: no
                 * metadata write may happen, and that includes log replay,
                 * or we could break whatever the new feature requires.
                 */
                if (btrfs_super_log_root(disk_super) &&
                    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
                        btrfs_err(fs_info,
"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
                                  compat_ro);
                        return -EINVAL;
                }
        }

        /*
         * Artificial limitation: block-group-tree is only accepted together
         * with no-holes and free-space-tree.
         */
        if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
                if (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
                    !btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
                        btrfs_err(fs_info,
"block-group-tree feature requires no-holes and free-space-tree features");
                        return -EINVAL;
                }
        }

        /*
         * Subpage runtime limitation on v1 cache.
         *
         * The v1 space cache still has some hard coded PAGE_SIZE usage.  As
         * v2 is already the default and v1 is going to be deprecated, it is
         * not worth fixing.
         */
        if (btrfs_test_opt(fs_info, SPACE_CACHE) &&
            fs_info->sectorsize < PAGE_SIZE) {
                btrfs_warn(fs_info,
        "v1 space cache is not supported for page size %lu with sectorsize %u",
                           PAGE_SIZE, fs_info->sectorsize);
                return -EINVAL;
        }

        /* Mixed backref is an always-enabled feature. */
        incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;

        /* Set compression related flags just in case. */
        switch (fs_info->compress_type) {
        case BTRFS_COMPRESS_LZO:
                incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
                break;
        case BTRFS_COMPRESS_ZSTD:
                incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
                break;
        default:
                break;
        }

        /*
         * An ancient flag which should really be marked deprecated, a runtime
         * limitation like this does not need an incompat flag.
         */
        if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
                incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;

        /* Remount can get here too, so the super block needs protection. */
        spin_lock(&fs_info->super_lock);
        btrfs_set_super_incompat_flags(disk_super, incompat);
        spin_unlock(&fs_info->super_lock);

        return 0;
}

/*
 * Set up @sb's btrfs_fs_info from the devices in @fs_devices.
 *
 * The super block is read from fs_devices->latest_dev, checked (checksum type,
 * checksum, mount-time validation) and copied into fs_info->super_copy and
 * fs_info->super_for_commit.  After that the workqueues, chunk tree, tree
 * roots, device state, sysfs entries, space info, block groups and the
 * cleaner/transaction kthreads are initialized in that order, the log is
 * replayed if needed, and for read-write mounts btrfs_start_pre_rw_mount()
 * is run.
 *
 * @sb:         super block being mounted, its btrfs_fs_info comes from btrfs_sb()
 * @fs_devices: devices of the filesystem
 * @options:    not referenced in this function body
 *
 * Return 0 on success.  Failures before the read-write specific part unwind
 * through the fail_* labels at the end (asserting ret < 0); later failures
 * call close_ctree() and return the error.
 */
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
                      char *options)
{
        u32 sectorsize;
        u32 nodesize;
        u32 stripesize;
        u64 generation;
        u16 csum_type;
        struct btrfs_super_block *disk_super;
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_root *tree_root;
        struct btrfs_root *chunk_root;
        int ret;
        int level;

        ret = init_mount_fs_info(fs_info, sb);
        if (ret)
                goto fail;

        /* These need to be init'ed before we start creating inodes and such. */
        tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
                                     GFP_KERNEL);
        fs_info->tree_root = tree_root;
        chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
                                      GFP_KERNEL);
        fs_info->chunk_root = chunk_root;
        if (!tree_root || !chunk_root) {
                ret = -ENOMEM;
                goto fail;
        }

        ret = btrfs_init_btree_inode(sb);
        if (ret)
                goto fail;

        invalidate_bdev(fs_devices->latest_dev->bdev);

        /*
         * Read super block and check the signature bytes only
         */
        disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
        if (IS_ERR(disk_super)) {
                ret = PTR_ERR(disk_super);
                goto fail_alloc;
        }

        /*
         * Until it is copied into fs_info->super_copy below, disk_super is the
         * buffer returned by btrfs_read_dev_super() and every error exit
         * releases it with btrfs_release_disk_super() first.
         */
        btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid);
        /*
         * Verify the type first, if that or the checksum value are
         * corrupted, we'll find out
         */
        csum_type = btrfs_super_csum_type(disk_super);
        if (!btrfs_supported_super_csum(csum_type)) {
                btrfs_err(fs_info, "unsupported checksum algorithm: %u",
                          csum_type);
                ret = -EINVAL;
                btrfs_release_disk_super(disk_super);
                goto fail_alloc;
        }

        fs_info->csum_size = btrfs_super_csum_size(disk_super);

        ret = btrfs_init_csum_hash(fs_info, csum_type);
        if (ret) {
                btrfs_release_disk_super(disk_super);
                goto fail_alloc;
        }

        /*
         * We want to check superblock checksum, the type is stored inside.
         * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
         */
        if (btrfs_check_super_csum(fs_info, disk_super)) {
                btrfs_err(fs_info, "superblock checksum mismatch");
                ret = -EINVAL;
                btrfs_release_disk_super(disk_super);
                goto fail_alloc;
        }

        /*
         * super_copy is zeroed at allocation time and we never touch the
         * following bytes up to INFO_SIZE, the checksum is calculated from
         * the whole block of INFO_SIZE
         */
        memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
        btrfs_release_disk_super(disk_super);

        /* From here on disk_super refers to the in-memory copy. */
        disk_super = fs_info->super_copy;

        memcpy(fs_info->super_for_commit, fs_info->super_copy,
               sizeof(*fs_info->super_for_commit));

        /* Any validation failure is reported to the caller as -EINVAL. */
        ret = btrfs_validate_mount_super(fs_info);
        if (ret) {
                btrfs_err(fs_info, "superblock contains fatal errors");
                ret = -EINVAL;
                goto fail_alloc;
        }

        if (!btrfs_super_root(disk_super)) {
                btrfs_err(fs_info, "invalid superblock tree root bytenr");
                ret = -EINVAL;
                goto fail_alloc;
        }

        /* check FS state, whether FS is broken. */
        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
                WRITE_ONCE(fs_info->fs_error, -EUCLEAN);

        /* Set up fs_info before parsing mount options */
        nodesize = btrfs_super_nodesize(disk_super);
        sectorsize = btrfs_super_sectorsize(disk_super);
        stripesize = sectorsize;
        fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
        fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));

        fs_info->nodesize = nodesize;
        fs_info->sectorsize = sectorsize;
        fs_info->sectorsize_bits = ilog2(sectorsize);
        fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
        fs_info->stripesize = stripesize;

        /*
         * Handle the space caching options appropriately now that we have the
         * super block loaded and validated.
         */
        btrfs_set_free_space_cache_settings(fs_info);

        if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
                ret = -EINVAL;
                goto fail_alloc;
        }

        ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
        if (ret < 0)
                goto fail_alloc;

        /*
         * At this point our mount options are validated, if we set ->max_inline
         * to something non-standard make sure we truncate it to sectorsize.
         */
        fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);

        /* Sector size smaller than the page size needs the subpage info. */
        if (sectorsize < PAGE_SIZE) {
                struct btrfs_subpage_info *subpage_info;

                btrfs_warn(fs_info,
                "read-write for sector size %u with page size %lu is experimental",
                           sectorsize, PAGE_SIZE);
                subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
                if (!subpage_info) {
                        ret = -ENOMEM;
                        goto fail_alloc;
                }
                btrfs_init_subpage_info(subpage_info, sectorsize);
                fs_info->subpage_info = subpage_info;
        }

        ret = btrfs_init_workqueues(fs_info);
        if (ret)
                goto fail_sb_buffer;

        /*
         * Scale readahead by the number of devices recorded in the super
         * block, with a lower bound of SZ_4M worth of pages.
         */
        sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
        sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);

        /* Update the values for the current filesystem. */
        sb->s_blocksize = sectorsize;
        sb->s_blocksize_bits = blksize_bits(sectorsize);
        memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);

        mutex_lock(&fs_info->chunk_mutex);
        ret = btrfs_read_sys_array(fs_info);
        mutex_unlock(&fs_info->chunk_mutex);
        if (ret) {
                btrfs_err(fs_info, "failed to read the system array: %d", ret);
                goto fail_sb_buffer;
        }

        /* Load the chunk root at the location recorded in the super block. */
        generation = btrfs_super_chunk_root_generation(disk_super);
        level = btrfs_super_chunk_root_level(disk_super);
        ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
                              generation, level);
        if (ret) {
                btrfs_err(fs_info, "failed to read chunk root");
                goto fail_tree_roots;
        }

        /* Copy the chunk tree UUID out of the chunk root node's header. */
        read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
                           offsetof(struct btrfs_header, chunk_tree_uuid),
                           BTRFS_UUID_SIZE);

        ret = btrfs_read_chunk_tree(fs_info);
        if (ret) {
                btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
                goto fail_tree_roots;
        }

        /*
         * At this point we know all the devices that make this filesystem,
         * including the seed devices but we don't know yet if the replace
         * target is required. So free devices that are not part of this
         * filesystem but skip the replace target device which is checked
         * below in btrfs_init_dev_replace().
         */
        btrfs_free_extra_devids(fs_devices);
        if (!fs_devices->latest_dev->bdev) {
                btrfs_err(fs_info, "failed to read devices");
                ret = -EIO;
                goto fail_tree_roots;
        }

        ret = init_tree_roots(fs_info);
        if (ret)
                goto fail_tree_roots;

        /*
         * Get zone type information of zoned block devices. This will also
         * handle emulation of a zoned filesystem if a regular device has the
         * zoned incompat feature flag set.
         */
        ret = btrfs_get_dev_zone_info_all_devices(fs_info);
        if (ret) {
                btrfs_err(fs_info,
                          "zoned: failed to read device zone info: %d", ret);
                goto fail_block_groups;
        }

        /*
         * If we have a uuid root and we're not being told to rescan we need to
         * check the generation here so we can set the
         * BTRFS_FS_UPDATE_UUID_TREE_GEN bit.  Otherwise we could commit the
         * transaction during a balance or the log replay without updating the
         * uuid generation, and then if we crash we would rescan the uuid tree,
         * even though it was perfectly fine.
         */
        if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
            fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
                set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);

        ret = btrfs_verify_dev_extents(fs_info);
        if (ret) {
                btrfs_err(fs_info,
                          "failed to verify dev extents against chunks: %d",
                          ret);
                goto fail_block_groups;
        }
        ret = btrfs_recover_balance(fs_info);
        if (ret) {
                btrfs_err(fs_info, "failed to recover balance: %d", ret);
                goto fail_block_groups;
        }

        ret = btrfs_init_dev_stats(fs_info);
        if (ret) {
                btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
                goto fail_block_groups;
        }

        ret = btrfs_init_dev_replace(fs_info);
        if (ret) {
                btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
                goto fail_block_groups;
        }

        ret = btrfs_check_zoned_mode(fs_info);
        if (ret) {
                btrfs_err(fs_info, "failed to initialize zoned mode: %d",
                          ret);
                goto fail_block_groups;
        }

        ret = btrfs_sysfs_add_fsid(fs_devices);
        if (ret) {
                btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
                                ret);
                goto fail_block_groups;
        }

        ret = btrfs_sysfs_add_mounted(fs_info);
        if (ret) {
                btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
                goto fail_fsdev_sysfs;
        }

        ret = btrfs_init_space_info(fs_info);
        if (ret) {
                btrfs_err(fs_info, "failed to initialize space info: %d", ret);
                goto fail_sysfs;
        }

        ret = btrfs_read_block_groups(fs_info);
        if (ret) {
                btrfs_err(fs_info, "failed to read block groups: %d", ret);
                goto fail_sysfs;
        }

        btrfs_free_zone_cache(fs_info);

        btrfs_check_active_zone_reservation(fs_info);

        /* A writable mount with missing devices must still be degradable. */
        if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
            !btrfs_check_rw_degradable(fs_info, NULL)) {
                btrfs_warn(fs_info,
                "writable mount is not allowed due to too many missing devices");
                ret = -EINVAL;
                goto fail_sysfs;
        }

        fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
                                               "btrfs-cleaner");
        if (IS_ERR(fs_info->cleaner_kthread)) {
                ret = PTR_ERR(fs_info->cleaner_kthread);
                goto fail_sysfs;
        }

        fs_info->transaction_kthread = kthread_run(transaction_kthread,
                                                   tree_root,
                                                   "btrfs-transaction");
        if (IS_ERR(fs_info->transaction_kthread)) {
                ret = PTR_ERR(fs_info->transaction_kthread);
                goto fail_cleaner;
        }

        ret = btrfs_read_qgroup_config(fs_info);
        if (ret)
                goto fail_trans_kthread;

        /* A failure here is only logged, the mount continues. */
        if (btrfs_build_ref_tree(fs_info))
                btrfs_err(fs_info, "couldn't build ref tree");

        /* do not make disk changes in broken FS or nologreplay is given */
        if (btrfs_super_log_root(disk_super) != 0 &&
            !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
                btrfs_info(fs_info, "start tree-log replay");
                ret = btrfs_replay_log(fs_info, fs_devices);
                if (ret)
                        goto fail_qgroup;
        }

        fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
        if (IS_ERR(fs_info->fs_root)) {
                ret = PTR_ERR(fs_info->fs_root);
                btrfs_warn(fs_info, "failed to read fs tree: %d", ret);
                /* Do not leave an error pointer behind for the unwind code. */
                fs_info->fs_root = NULL;
                goto fail_qgroup;
        }

        /*
         * Read-only mounts are done here, everything below (including setting
         * BTRFS_FS_OPEN) only happens for read-write mounts.
         */
        if (sb_rdonly(sb))
                return 0;

        /* From here on failures are unwound with close_ctree(). */
        ret = btrfs_start_pre_rw_mount(fs_info);
        if (ret) {
                close_ctree(fs_info);
                return ret;
        }
        btrfs_discard_resume(fs_info);

        if (fs_info->uuid_root &&
            (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
             fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
                btrfs_info(fs_info, "checking UUID tree");
                ret = btrfs_check_uuid_tree(fs_info);
                if (ret) {
                        btrfs_warn(fs_info,
                                "failed to check the UUID tree: %d", ret);
                        close_ctree(fs_info);
                        return ret;
                }
        }

        set_bit(BTRFS_FS_OPEN, &fs_info->flags);

        /* Kick the cleaner thread so it'll start deleting snapshots. */
        if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
                wake_up_process(fs_info->cleaner_kthread);

        return 0;

        /*
         * Error unwinding.  Each label falls through into all the labels
         * below it.
         */
fail_qgroup:
        btrfs_free_qgroup_config(fs_info);
fail_trans_kthread:
        kthread_stop(fs_info->transaction_kthread);
        btrfs_cleanup_transaction(fs_info);
        btrfs_free_fs_roots(fs_info);
fail_cleaner:
        kthread_stop(fs_info->cleaner_kthread);

        /*
         * make sure we're done with the btree inode before we stop our
         * kthreads
         */
        filemap_write_and_wait(fs_info->btree_inode->i_mapping);

fail_sysfs:
        btrfs_sysfs_remove_mounted(fs_info);

fail_fsdev_sysfs:
        btrfs_sysfs_remove_fsid(fs_info->fs_devices);

fail_block_groups:
        btrfs_put_block_group_cache(fs_info);

fail_tree_roots:
        if (fs_info->data_reloc_root)
                btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
        free_root_pointers(fs_info, true);
        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);

fail_sb_buffer:
        btrfs_stop_all_workers(fs_info);
        btrfs_free_block_groups(fs_info);
fail_alloc:
        btrfs_mapping_tree_free(fs_info);

        iput(fs_info->btree_inode);
fail:
        btrfs_close_devices(fs_info->fs_devices);
        /* Every path reaching the labels is expected to carry an error. */
        ASSERT(ret < 0);
        return ret;
}
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);

static void btrfs_end_super_write(struct bio *bio)
{
        struct btrfs_device *device = bio->bi_private;
        struct folio_iter fi;

        bio_for_each_folio_all(fi, bio) {
                if (bio->bi_status) {
                        btrfs_warn_rl_in_rcu(device->fs_info,
                                "lost super block write due to IO error on %s (%d)",
                                btrfs_dev_name(device),
                                blk_status_to_errno(bio->bi_status));
                        btrfs_dev_stat_inc_and_print(device,
                                                     BTRFS_DEV_STAT_WRITE_ERRS);
                        /* Ensure failure if the primary sb fails. */
                        if (bio->bi_opf & REQ_FUA)
                                atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR,
                                           &device->sb_write_errors);
                        else
                                atomic_inc(&device->sb_write_errors);
                }
                folio_unlock(fi.folio);
                folio_put(fi.folio);
        }

        bio_put(bio);
}

/*
 * Read super block copy @copy_num from @bdev through the page cache.
 *
 * @bdev:       block device to read from
 * @copy_num:   index of the super block copy
 * @drop_cache: invalidate the cached pages covering the super block first so
 *              the read goes to the device; only valid for copy 0
 *
 * Return the super block on success.  It must be released with
 * btrfs_release_disk_super().  On failure an ERR_PTR is returned:
 * -EINVAL if no location exists for the copy, it does not fit the device or
 * its recorded bytenr does not match the copy's offset, -ENODATA if the magic
 * is wrong, otherwise the error of the failing helper.
 */
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
                                                   int copy_num, bool drop_cache)
{
        struct address_space *mapping = bdev->bd_mapping;
        const u64 expected_bytenr = btrfs_sb_offset(copy_num);
        struct btrfs_super_block *sb;
        struct page *page;
        u64 bytenr;
        int ret;

        ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
        if (ret)
                return ERR_PTR(ret == -ENOENT ? -EINVAL : ret);

        if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
                return ERR_PTR(-EINVAL);

        if (drop_cache) {
                const pgoff_t first = bytenr >> PAGE_SHIFT;
                const pgoff_t last = (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT;

                /* This should only be called with the primary sb. */
                ASSERT(copy_num == 0);

                /*
                 * Get rid of the cached primary super block so that the read
                 * below always hits the device.
                 */
                invalidate_inode_pages2_range(mapping, first, last);
        }

        page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
        if (IS_ERR(page))
                return ERR_CAST(page);

        sb = page_address(page);

        if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
                ret = -ENODATA;
        } else if (btrfs_super_bytenr(sb) != expected_bytenr) {
                ret = -EINVAL;
        } else {
                return sb;
        }

        btrfs_release_disk_super(sb);
        return ERR_PTR(ret);
}


/*
 * Read the super block of @bdev and return the copy with the highest
 * generation among the copies that are scanned (currently only copy 0).
 *
 * Return the super block on success; the caller releases it with
 * btrfs_release_disk_super().  If no copy could be read, the ERR_PTR of the
 * first failed read is returned.
 */
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
{
        struct btrfs_super_block *latest = NULL;
        struct btrfs_super_block *first_err = NULL;
        u64 transid = 0;
        int i;

        /* we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        for (i = 0; i < 1; i++) {
                struct btrfs_super_block *super;

                super = btrfs_read_dev_one_super(bdev, i, false);
                if (IS_ERR(super)) {
                        /* Remember the first failure in case nothing is readable. */
                        if (!first_err)
                                first_err = super;
                        continue;
                }

                if (latest && btrfs_super_generation(super) <= transid) {
                        /* Not newer than the copy we already hold, drop it. */
                        btrfs_release_disk_super(super);
                        continue;
                }

                /*
                 * Release the copy being replaced, not the one just read:
                 * the new one is kept and handed to the caller.
                 */
                if (latest)
                        btrfs_release_disk_super(latest);

                latest = super;
                transid = btrfs_super_generation(super);
        }

        return latest ? latest : first_err;
}

/*
 * Submit writes of superblock @sb to @device without waiting for completion.
 * Every folio used for a write stays locked until its bio completes.
 *
 * @max_mirrors is the number of copies to write, 0 meaning the default, i.e.
 * the copies that fit the device size expected at commit time.  The write and
 * the wait phase must use the same @max_mirrors.
 *
 * Errors (no location for a mirror, folio not found, failure to advance the
 * super block log) are counted in device->sb_write_errors.  Return 0 if that
 * count is below the number of mirrors iterated, -1 otherwise.
 */
static int write_dev_supers(struct btrfs_device *device,
                            struct btrfs_super_block *sb, int max_mirrors)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct address_space *mapping = device->bdev->bd_mapping;
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
        const int nr_mirrors = max_mirrors ? max_mirrors : BTRFS_SUPER_MIRROR_MAX;
        int mirror;

        atomic_set(&device->sb_write_errors, 0);
        shash->tfm = fs_info->csum_shash;

        for (mirror = 0; mirror < nr_mirrors; mirror++) {
                const u64 nominal_bytenr = btrfs_sb_offset(mirror);
                struct btrfs_super_block *disk_super;
                struct folio *folio;
                struct bio *bio;
                size_t offset;
                u64 bytenr;
                int ret;

                ret = btrfs_sb_log_location(device, mirror, WRITE, &bytenr);
                if (ret == -ENOENT)
                        continue;
                if (ret < 0) {
                        btrfs_err(fs_info,
                                "couldn't get super block location for mirror %d",
                                mirror);
                        atomic_inc(&device->sb_write_errors);
                        continue;
                }

                /* This copy and all following ones do not fit the device. */
                if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes)
                        break;

                /*
                 * Each copy records its own nominal offset, so the checksum
                 * has to be recomputed per mirror.
                 */
                btrfs_set_super_bytenr(sb, nominal_bytenr);
                crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
                                    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
                                    sb->csum);

                folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT,
                                            FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                            GFP_NOFS);
                if (IS_ERR(folio)) {
                        btrfs_err(fs_info,
                            "couldn't get super block page for bytenr %llu",
                            bytenr);
                        atomic_inc(&device->sb_write_errors);
                        continue;
                }
                ASSERT(folio_order(folio) == 0);

                offset = offset_in_folio(folio, bytenr);
                disk_super = folio_address(folio) + offset;
                memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);

                /*
                 * Build the bio ourselves rather than letting the page cache
                 * do the I/O, this keeps integrity checking possible.
                 */
                bio = bio_alloc(device->bdev, 1,
                                REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
                                GFP_NOFS);
                bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
                bio->bi_private = device;
                bio->bi_end_io = btrfs_end_super_write;
                bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset);

                /*
                 * Only the first super block is written with FUA.  The other
                 * copies may go down lazily, leaving a short window in which
                 * they still hold the older version on disk.
                 */
                if (mirror == 0 && !btrfs_test_opt(fs_info, NOBARRIER))
                        bio->bi_opf |= REQ_FUA;
                submit_bio(bio);

                if (btrfs_advance_sb_log(device, mirror))
                        atomic_inc(&device->sb_write_errors);
        }

        if (atomic_read(&device->sb_write_errors) < mirror)
                return 0;
        return -1;
}

/*
 * Wait until the super block writes started by write_dev_supers() have
 * completed.  @max_mirrors must match the value used for the write phase,
 * 0 selects BTRFS_SUPER_MIRROR_MAX.
 *
 * Return -1 if the primary super block write failed or if the number of
 * errors is not below the number of mirrors iterated, 0 otherwise.
 */
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
{
        const int nr_mirrors = max_mirrors ? max_mirrors : BTRFS_SUPER_MIRROR_MAX;
        bool primary_failed = false;
        int nr_errors = 0;
        int mirror;

        for (mirror = 0; mirror < nr_mirrors; mirror++) {
                struct folio *folio;
                u64 bytenr;
                int ret;

                ret = btrfs_sb_log_location(device, mirror, READ, &bytenr);
                if (ret == -ENOENT)
                        break;
                if (ret < 0) {
                        nr_errors++;
                        primary_failed = primary_failed || mirror == 0;
                        continue;
                }

                if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes)
                        break;

                folio = filemap_get_folio(device->bdev->bd_mapping,
                                          bytenr >> PAGE_SHIFT);
                /* A folio that is gone means its write already completed. */
                if (IS_ERR(folio))
                        continue;
                ASSERT(folio_order(folio) == 0);

                /* The write completion unlocks the folio. */
                folio_wait_locked(folio);
                folio_put(folio);
        }

        /*
         * A failed primary write adds BTRFS_SUPER_PRIMARY_WRITE_ERROR to the
         * device counter, so reaching that value means the primary failed.
         */
        nr_errors += atomic_read(&device->sb_write_errors);
        if (primary_failed || nr_errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR) {
                btrfs_err(device->fs_info, "error writing primary super block to device %llu",
                          device->devid);
                return -1;
        }

        if (nr_errors >= mirror)
                return -1;
        return 0;
}

/*
 * Completion handler for the flush bio submitted by write_dev_flush(): tear
 * the bio down and wake whoever is waiting on the completion stored in
 * bi_private.
 */
static void btrfs_end_empty_barrier(struct bio *bio)
{
        struct completion *waiter;

        bio_uninit(bio);
        waiter = bio->bi_private;
        complete(waiter);
}

/*
 * Send an empty PREFLUSH write to @device using its embedded flush bio.
 * Nothing is waited for or checked here; wait_dev_flush() does the error
 * handling.
 */
static void write_dev_flush(struct btrfs_device *device)
{
        struct bio *flush = &device->flush_bio;

        /* Start from a clean state for this round. */
        device->last_flush_error = BLK_STS_OK;
        init_completion(&device->flush_wait);

        bio_init(flush, device->bdev, NULL, 0,
                 REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
        flush->bi_private = &device->flush_wait;
        flush->bi_end_io = btrfs_end_empty_barrier;

        submit_bio(flush);
        /* Tells wait_dev_flush() that there is something to wait for. */
        set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}

/*
 * Wait for the flush bio of @device, if write_dev_flush() submitted one.
 *
 * Return true if the flush finished with an error, false if it succeeded or
 * if no flush had been sent.
 */
static bool wait_dev_flush(struct btrfs_device *device)
{
        struct bio *flush = &device->flush_bio;

        /* No flush in flight for this device, nothing to wait for. */
        if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
                return false;

        wait_for_completion_io(&device->flush_wait);

        if (!flush->bi_status)
                return false;

        /* Remember the failure on the device and account it in the stats. */
        device->last_flush_error = flush->bi_status;
        btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS);
        return true;
}

/*
 * send an empty flush down to each device in parallel,
 * then wait for them
 */
static int barrier_all_devices(struct btrfs_fs_info *info)
{
        struct list_head *head;
        struct btrfs_device *dev;
        int errors_wait = 0;

        lockdep_assert_held(&info->fs_devices->device_list_mutex);
        /* send down all the barriers */
        head = &info->fs_devices->devices;
        list_for_each_entry(dev, head, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
                        continue;
                if (!dev->bdev)
                        continue;
                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
                        continue;

                write_dev_flush(dev);
        }

        /* wait for all the barriers */
        list_for_each_entry(dev, head, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
                        continue;
                if (!dev->bdev) {
                        errors_wait++;
                        continue;
                }
                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
                        continue;

                if (wait_dev_flush(dev))
                        errors_wait++;
        }

        /*
         * Checks last_flush_error of disks in order to determine the device
         * state.
         */
        if (errors_wait && !btrfs_check_rw_degradable(info, NULL))
                return -EIO;

        return 0;
}

/*
 * Return the smallest tolerated_failures value among all RAID profiles that
 * are set in @flags. If no known profile matches, warn and return 0.
 */
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
{
        int lowest = INT_MAX;
        int idx;

        /* No profile bit at all, or the explicit SINGLE bit, selects SINGLE. */
        if (!(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) ||
            (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
                lowest = min_t(int, lowest,
                               btrfs_raid_array[BTRFS_RAID_SINGLE].tolerated_failures);

        for (idx = 0; idx < BTRFS_NR_RAID_TYPES; idx++) {
                /* SINGLE was handled above; only look at profiles that are set. */
                if (idx != BTRFS_RAID_SINGLE &&
                    (flags & btrfs_raid_array[idx].bg_flag))
                        lowest = min_t(int, lowest,
                                       btrfs_raid_array[idx].tolerated_failures);
        }

        if (lowest != INT_MAX)
                return lowest;

        pr_warn("BTRFS: unknown raid flag: %llu", flags);
        return 0;
}

/*
 * Write fs_info->super_for_commit to all devices of the filesystem.
 *
 * The work is done in three passes under fs_devices->device_list_mutex:
 * optionally flush all devices (unless NOBARRIER is set), submit the super
 * block writes for every eligible device, then wait for those writes.
 *
 * @max_mirrors is passed through to write_dev_supers()/wait_dev_supers();
 * 0 additionally triggers backup_super_roots() (see the comment below).
 *
 * Up to (number of devices in super_copy - 1) per-device failures are
 * tolerated in each of the write and wait passes.
 *
 * Return 0 on success, the error from barrier_all_devices(), -EUCLEAN if the
 * super block fails validation, or -EIO if too many devices failed. On every
 * error path btrfs_handle_fs_error() is called after the mutex is dropped.
 */
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
{
        struct list_head *head;
        struct btrfs_device *dev;
        struct btrfs_super_block *sb;
        struct btrfs_dev_item *dev_item;
        int ret;
        int do_barriers;
        int max_errors;
        int total_errors = 0;
        u64 flags;

        do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);

        /*
         * max_mirrors == 0 indicates we're from commit_transaction,
         * not from fsync where the tree roots in fs_info have not
         * been consistent on disk.
         */
        if (max_mirrors == 0)
                backup_super_roots(fs_info);

        sb = fs_info->super_for_commit;
        dev_item = &sb->dev_item;

        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        head = &fs_info->fs_devices->devices;
        /* All but one device may fail before the whole operation fails. */
        max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;

        if (do_barriers) {
                ret = barrier_all_devices(fs_info);
                if (ret) {
                        mutex_unlock(
                                &fs_info->fs_devices->device_list_mutex);
                        btrfs_handle_fs_error(fs_info, ret,
                                              "errors while submitting device barriers.");
                        return ret;
                }
        }

        /* Pass 1: fill in the per-device item and submit the writes. */
        list_for_each_entry(dev, head, dev_list) {
                /* A device without a block device counts as a failed write. */
                if (!dev->bdev) {
                        total_errors++;
                        continue;
                }
                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
                        continue;

                /* The dev_item embedded in the super block describes this device. */
                btrfs_set_stack_device_generation(dev_item, 0);
                btrfs_set_stack_device_type(dev_item, dev->type);
                btrfs_set_stack_device_id(dev_item, dev->devid);
                btrfs_set_stack_device_total_bytes(dev_item,
                                                   dev->commit_total_bytes);
                btrfs_set_stack_device_bytes_used(dev_item,
                                                  dev->commit_bytes_used);
                btrfs_set_stack_device_io_align(dev_item, dev->io_align);
                btrfs_set_stack_device_io_width(dev_item, dev->io_width);
                btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
                memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
                memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
                       BTRFS_FSID_SIZE);

                flags = btrfs_super_flags(sb);
                btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);

                /* Refuse to write a super block that does not validate. */
                ret = btrfs_validate_write_super(fs_info, sb);
                if (ret < 0) {
                        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                        btrfs_handle_fs_error(fs_info, -EUCLEAN,
                                "unexpected superblock corruption detected");
                        return -EUCLEAN;
                }

                ret = write_dev_supers(dev, sb, max_mirrors);
                if (ret)
                        total_errors++;
        }
        if (total_errors > max_errors) {
                btrfs_err(fs_info, "%d errors while writing supers",
                          total_errors);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);

                /* FUA is masked off if unsupported and can't be the reason */
                btrfs_handle_fs_error(fs_info, -EIO,
                                      "%d errors while writing supers",
                                      total_errors);
                return -EIO;
        }

        /* Pass 2: wait for the submitted writes; errors are counted afresh. */
        total_errors = 0;
        list_for_each_entry(dev, head, dev_list) {
                if (!dev->bdev)
                        continue;
                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
                        continue;

                ret = wait_dev_supers(dev, max_mirrors);
                if (ret)
                        total_errors++;
        }
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
        if (total_errors > max_errors) {
                btrfs_handle_fs_error(fs_info, -EIO,
                                      "%d errors while writing supers",
                                      total_errors);
                return -EIO;
        }
        return 0;
}

/*
 * Remove @root from the fs roots radix tree and drop the reference the tree
 * held on it, if it was marked as being in the tree.
 */
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
                                  struct btrfs_root *root)
{
        bool was_in_radix;

        spin_lock(&fs_info->fs_roots_radix_lock);
        radix_tree_delete(&fs_info->fs_roots_radix,
                          (unsigned long)btrfs_root_id(root));
        was_in_radix = test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state);
        spin_unlock(&fs_info->fs_roots_radix_lock);

        /* On a filesystem in error state also release the reloc root. */
        if (BTRFS_FS_ERROR(fs_info)) {
                ASSERT(root->log_root == NULL);
                if (root->reloc_root) {
                        btrfs_put_root(root->reloc_root);
                        root->reloc_root = NULL;
                }
        }

        if (!was_in_radix)
                return;

        btrfs_put_root(root);
}

/*
 * Run pending delayed iputs, wait for ongoing cleanup work and then commit a
 * transaction on the tree root.
 *
 * Return the result of btrfs_commit_transaction(), or the error from joining
 * the transaction.
 */
int btrfs_commit_super(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_trans_handle *handle;

        mutex_lock(&fs_info->cleaner_mutex);
        btrfs_run_delayed_iputs(fs_info);
        mutex_unlock(&fs_info->cleaner_mutex);
        wake_up_process(fs_info->cleaner_kthread);

        /* Taking and dropping the semaphore waits for ongoing cleanup work. */
        down_write(&fs_info->cleanup_work_sem);
        up_write(&fs_info->cleanup_work_sem);

        handle = btrfs_join_transaction(tree_root);
        if (!IS_ERR(handle))
                return btrfs_commit_transaction(handle);

        return PTR_ERR(handle);
}

/*
 * Report and clean up every transaction still on fs_info->trans_list.
 * Finding any such transaction is unexpected and trips the final ASSERT.
 */
static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
{
        struct btrfs_transaction *cur_trans;
        struct btrfs_transaction *next_trans;
        bool found = false;

        /*
         * We are only called at the very end of close_ctree(), so there is no
         * other running transaction and trans_lock is not needed.
         */
        ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
        list_for_each_entry_safe(cur_trans, next_trans, &fs_info->trans_list, list) {
                struct extent_state *cached = NULL;
                u64 dirty_bytes = 0;
                u64 found_start;
                u64 found_end;
                u64 cur;

                found = true;

                /* Add up the length of every dirty range of this transaction. */
                for (cur = 0;
                     find_first_extent_bit(&cur_trans->dirty_pages, cur,
                                           &found_start, &found_end,
                                           EXTENT_DIRTY, &cached);
                     cur = found_end + 1)
                        dirty_bytes += found_end + 1 - found_start;

                btrfs_warn(fs_info,
        "transaction %llu (with %llu dirty metadata bytes) is not committed",
                           cur_trans->transid, dirty_bytes);
                btrfs_cleanup_one_transaction(cur_trans, fs_info);

                if (cur_trans == fs_info->running_transaction)
                        fs_info->running_transaction = NULL;
                list_del_init(&cur_trans->list);

                btrfs_put_transaction(cur_trans);
                trace_btrfs_transaction_commit(fs_info);
        }
        ASSERT(!found);
}

/*
 * Tear down a mounted filesystem.
 *
 * Stops background work (reclaim, cleaner, qgroup rescan, uuid scan, balance,
 * dev-replace, scrub, defrag, discard), does a final commit when the
 * filesystem is not read-only, stops the kthreads and workers, and finally
 * frees roots, block groups, the btree inode, the mapping tree and the
 * devices. The order of the steps matters; see the comments on each step.
 */
void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
        int ret;

        set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);

        /*
         * If we had UNFINISHED_DROPS we could still be processing them, so
         * clear that bit and wake up relocation so it can stop.
         * We must do this before stopping the block group reclaim task, because
         * at btrfs_relocate_block_group() we wait for this bit, and after the
         * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
         * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
         * return 1.
         */
        btrfs_wake_unfinished_drop(fs_info);

        /*
         * We may have the reclaim task running and relocating a data block group,
         * in which case it may create delayed iputs. So stop it before we park
         * the cleaner kthread otherwise we can get new delayed iputs after
         * parking the cleaner, and that can make the async reclaim task to hang
         * if it's waiting for delayed iputs to complete, since the cleaner is
         * parked and can not run delayed iputs - this will make us hang when
         * trying to stop the async reclaim task.
         */
        cancel_work_sync(&fs_info->reclaim_bgs_work);
        /*
         * We don't want the cleaner to start new transactions, add more delayed
         * iputs, etc. while we're closing. We can't use kthread_stop() yet
         * because that frees the task_struct, and the transaction kthread might
         * still try to wake up the cleaner.
         */
        kthread_park(fs_info->cleaner_kthread);

        /* wait for the qgroup rescan worker to stop */
        btrfs_qgroup_wait_for_completion(fs_info, false);

        /* wait for the uuid_scan task to finish */
        down(&fs_info->uuid_tree_rescan_sem);
        /* avoid complaints from lockdep et al., set sem back to initial state */
        up(&fs_info->uuid_tree_rescan_sem);

        /* pause restriper - we want to resume on mount */
        btrfs_pause_balance(fs_info);

        btrfs_dev_replace_suspend_for_unmount(fs_info);

        btrfs_scrub_cancel(fs_info);

        /* wait for any defraggers to finish */
        wait_event(fs_info->transaction_wait,
                   (atomic_read(&fs_info->defrag_running) == 0));

        /* clear out the rbtree of defraggable inodes */
        btrfs_cleanup_defrag_inodes(fs_info);

        /*
         * After we parked the cleaner kthread, ordered extents may have
         * completed and created new delayed iputs. If one of the async reclaim
         * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
         * can hang forever trying to stop it, because if a delayed iput is
         * added after it ran btrfs_run_delayed_iputs() and before it called
         * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
         * no one else to run iputs.
         *
         * So wait for all ongoing ordered extents to complete and then run
         * delayed iputs. This works because once we reach this point no one
         * can either create new ordered extents nor create delayed iputs
         * through some other means.
         *
         * Also note that btrfs_wait_ordered_roots() is not safe here, because
         * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
         * but the delayed iput for the respective inode is made only when doing
         * the final btrfs_put_ordered_extent() (which must happen at
         * btrfs_finish_ordered_io() when we are unmounting).
         */
        btrfs_flush_workqueue(fs_info->endio_write_workers);
        /* Ordered extents for free space inodes. */
        btrfs_flush_workqueue(fs_info->endio_freespace_worker);
        btrfs_run_delayed_iputs(fs_info);

        /* Stop the three async space reclaim works. */
        cancel_work_sync(&fs_info->async_reclaim_work);
        cancel_work_sync(&fs_info->async_data_reclaim_work);
        cancel_work_sync(&fs_info->preempt_reclaim_work);

        /* Cancel or finish ongoing discard work */
        btrfs_discard_cleanup(fs_info);

        if (!sb_rdonly(fs_info->sb)) {
                /*
                 * The cleaner kthread is stopped, so do one final pass over
                 * unused block groups.
                 */
                btrfs_delete_unused_bgs(fs_info);

                /*
                 * There might be existing delayed inode workers still running
                 * and holding an empty delayed inode item. We must wait for
                 * them to complete first because they can create a transaction.
                 * This happens when someone calls btrfs_balance_delayed_items()
                 * and then a transaction commit runs the same delayed nodes
                 * before any delayed worker has done something with the nodes.
                 * We must wait for any worker here and not at transaction
                 * commit time since that could cause a deadlock.
                 * This is a very rare case.
                 */
                btrfs_flush_workqueue(fs_info->delayed_workers);

                /* A failed final commit is only logged; teardown continues. */
                ret = btrfs_commit_super(fs_info);
                if (ret)
                        btrfs_err(fs_info, "commit super ret %d", ret);
        }

        /* A filesystem in error state gets the error cleanup path as well. */
        if (BTRFS_FS_ERROR(fs_info))
                btrfs_error_commit_super(fs_info);

        kthread_stop(fs_info->transaction_kthread);
        kthread_stop(fs_info->cleaner_kthread);

        ASSERT(list_empty(&fs_info->delayed_iputs));
        set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);

        if (btrfs_check_quota_leak(fs_info)) {
                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
                btrfs_err(fs_info, "qgroup reserved space leaked");
        }

        btrfs_free_qgroup_config(fs_info);
        ASSERT(list_empty(&fs_info->delalloc_roots));

        /* Report counters that should have dropped to zero by now. */
        if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
                btrfs_info(fs_info, "at unmount delalloc count %lld",
                       percpu_counter_sum(&fs_info->delalloc_bytes));
        }

        if (percpu_counter_sum(&fs_info->ordered_bytes))
                btrfs_info(fs_info, "at unmount dio bytes count %lld",
                           percpu_counter_sum(&fs_info->ordered_bytes));

        btrfs_sysfs_remove_mounted(fs_info);
        btrfs_sysfs_remove_fsid(fs_info->fs_devices);

        btrfs_put_block_group_cache(fs_info);

        /*
         * We must make sure there are no read requests left to submit after
         * we stop all the workers.
         */
        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
        btrfs_stop_all_workers(fs_info);

        /* We shouldn't have any transaction open at this point */
        warn_about_uncommitted_trans(fs_info);

        clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
        free_root_pointers(fs_info, true);
        btrfs_free_fs_roots(fs_info);

        /*
         * We must free the block groups after dropping the fs_roots as we could
         * have had an IO error and have left over tree log blocks that aren't
         * cleaned up until the fs roots are freed.  This makes the block group
         * accounting appear to be wrong because there's pending reserved bytes,
         * so make sure we do the block group cleanup afterwards.
         */
        btrfs_free_block_groups(fs_info);

        iput(fs_info->btree_inode);

        btrfs_mapping_tree_free(fs_info);
        btrfs_close_devices(fs_info->fs_devices);
}

/*
 * Mark the extent buffer @buf dirty on behalf of @trans.
 *
 * @buf must be write locked and its header generation is expected to match
 * the current filesystem generation. On a mismatch the transaction is aborted
 * with -EUCLEAN and a critical message is logged; the buffer is still marked
 * dirty afterwards.
 */
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
                             struct extent_buffer *buf)
{
        struct btrfs_fs_info *fs_info = buf->fs_info;
        const u64 eb_generation = btrfs_header_generation(buf);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
        /*
         * Fast path: this check only exists when sanity tests are built in.
         * Outside of those tests nobody should be dirtying unmapped buffers.
         */
        if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
                return;
#endif
        /* The transaction must be active (its state < TRANS_STATE_UNBLOCKED). */
        ASSERT(trans->transid == fs_info->generation);
        btrfs_assert_tree_write_locked(buf);

        if (unlikely(eb_generation != fs_info->generation)) {
                btrfs_abort_transaction(trans, -EUCLEAN);
                btrfs_crit(fs_info,
"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu",
                           buf->start, eb_generation, fs_info->generation);
        }

        set_extent_buffer_dirty(buf);
}

/*
 * Throttle the caller when too much btree metadata is dirty.
 *
 * If @flush_delayed is non-zero, delayed items are balanced first. Tasks
 * running with PF_MEMALLOC are never throttled here.
 */
static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
                                        int flush_delayed)
{
        /*
         * Historical note: older kernels could apparently get stuck in
         * balance_dirty_pages forever with this code.
         */
        if (current->flags & PF_MEMALLOC)
                return;

        if (flush_delayed)
                btrfs_balance_delayed_items(fs_info);

        /* Nothing to do while dirty metadata is at or below the threshold. */
        if (__percpu_counter_compare(&fs_info->dirty_metadata_bytes,
                                     BTRFS_DIRTY_METADATA_THRESH,
                                     fs_info->dirty_metadata_batch) <= 0)
                return;

        balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
}

/* Balance dirty btree pages, balancing delayed items first. */
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
{
        const int flush_delayed = 1;

        __btrfs_btree_balance_dirty(fs_info, flush_delayed);
}

/* Balance dirty btree pages without touching delayed items. */
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
{
        const int flush_delayed = 0;

        __btrfs_btree_balance_dirty(fs_info, flush_delayed);
}

/*
 * Cleanup path used by close_ctree() when the filesystem is in error state:
 * clean up the transactions instead of committing, run the delayed iputs and
 * wait for ongoing cleanup work.
 */
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
        /* cleanup FS via transaction */
        btrfs_cleanup_transaction(fs_info);

        /* Delayed iputs are run under the cleaner mutex. */
        mutex_lock(&fs_info->cleaner_mutex);
        btrfs_run_delayed_iputs(fs_info);
        mutex_unlock(&fs_info->cleaner_mutex);

        /* Taking and dropping the semaphore waits for ongoing cleanup work. */
        down_write(&fs_info->cleanup_work_sem);
        up_write(&fs_info->cleanup_work_sem);
}

/*
 * Free the log of every root found in the fs roots radix tree, then free the
 * log root tree itself. Roots are looked up in batches so that the radix lock
 * is not held while the logs are freed.
 */
static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *batch[8];
        u64 next_id = 0;

        spin_lock(&fs_info->fs_roots_radix_lock);
        for (;;) {
                int nr;
                int i;

                nr = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
                                            (void **)batch, next_id,
                                            ARRAY_SIZE(batch));
                if (nr == 0)
                        break;

                /* Grab the roots while the lock is still held. */
                for (i = 0; i < nr; i++)
                        batch[i] = btrfs_grab_root(batch[i]);
                spin_unlock(&fs_info->fs_roots_radix_lock);

                for (i = 0; i < nr; i++) {
                        struct btrfs_root *root = batch[i];

                        /* Entries that could not be grabbed are skipped. */
                        if (root) {
                                next_id = btrfs_root_id(root);
                                btrfs_free_log(NULL, root);
                                btrfs_put_root(root);
                        }
                }

                /* Continue the lookup after the last root that was handled. */
                next_id++;
                spin_lock(&fs_info->fs_roots_radix_lock);
        }
        spin_unlock(&fs_info->fs_roots_radix_lock);

        btrfs_free_log_root_tree(NULL, fs_info);
}

static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
        struct btrfs_ordered_extent *ordered;

        spin_lock(&root->ordered_extent_lock);
        /*
         * This will just short circuit the ordered completion stuff which will
         * make sure the ordered extent gets properly cleaned up.
         */
        list_for_each_entry(ordered, &root->ordered_extents,
                            root_extent_list)
                set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
        spin_unlock(&root->ordered_extent_lock);
}

static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *root;
        LIST_HEAD(splice);

        spin_lock(&fs_info->ordered_root_lock);
        list_splice_init(&fs_info->ordered_roots, &splice);
        while (!list_empty(&splice)) {
                root = list_first_entry(&splice, struct btrfs_root,
                                        ordered_root);
                list_move_tail(&root->ordered_root,
                               &fs_info->ordered_roots);

                spin_unlock(&fs_info->ordered_root_lock);
                btrfs_destroy_ordered_extents(root);

                cond_resched();
                spin_lock(&fs_info->ordered_root_lock);
        }
        spin_unlock(&fs_info->ordered_root_lock);

        /*
         * We need this here because if we've been flipped read-only we won't
         * get sync() from the umount, so we need to make sure any ordered
         * extents that haven't had their dirty pages IO start writeout yet
         * actually get run and error out properly.
         */
        btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}

/*
 * Throw away all delayed refs of @trans without running them.
 *
 * Every ref head in trans->delayed_refs is emptied, removed and released.
 * For heads with must_insert_reserved set, the reserved bytes are moved to
 * pinned in the block group and space info, and the range is then unpinned
 * through btrfs_error_unpin_extent_range().
 */
static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
                                       struct btrfs_fs_info *fs_info)
{
        struct rb_node *node;
        struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
        struct btrfs_delayed_ref_node *ref;

        spin_lock(&delayed_refs->lock);
        /* Always take the first head; each iteration removes it from the tree. */
        while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
                struct btrfs_delayed_ref_head *head;
                struct rb_node *n;
                bool pin_bytes = false;

                head = rb_entry(node, struct btrfs_delayed_ref_head,
                                href_node);
                /* Could not lock this head, look up the first head again. */
                if (btrfs_delayed_ref_lock(delayed_refs, head))
                        continue;

                spin_lock(&head->lock);
                /* Drop every ref queued on this head. */
                while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
                        ref = rb_entry(n, struct btrfs_delayed_ref_node,
                                       ref_node);
                        rb_erase_cached(&ref->ref_node, &head->ref_tree);
                        RB_CLEAR_NODE(&ref->ref_node);
                        if (!list_empty(&ref->add_list))
                                list_del(&ref->add_list);
                        atomic_dec(&delayed_refs->num_entries);
                        btrfs_put_delayed_ref(ref);
                        btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
                }
                if (head->must_insert_reserved)
                        pin_bytes = true;
                btrfs_free_delayed_extent_op(head->extent_op);
                btrfs_delete_ref_head(delayed_refs, head);
                spin_unlock(&head->lock);
                spin_unlock(&delayed_refs->lock);
                /* NOTE(review): presumably taken by btrfs_delayed_ref_lock() - confirm. */
                mutex_unlock(&head->mutex);

                if (pin_bytes) {
                        struct btrfs_block_group *cache;

                        cache = btrfs_lookup_block_group(fs_info, head->bytenr);
                        BUG_ON(!cache);

                        /* Move the head's bytes from reserved to pinned. */
                        spin_lock(&cache->space_info->lock);
                        spin_lock(&cache->lock);
                        cache->pinned += head->num_bytes;
                        btrfs_space_info_update_bytes_pinned(fs_info,
                                cache->space_info, head->num_bytes);
                        cache->reserved -= head->num_bytes;
                        cache->space_info->bytes_reserved -= head->num_bytes;
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);

                        btrfs_put_block_group(cache);

                        btrfs_error_unpin_extent_range(fs_info, head->bytenr,
                                head->bytenr + head->num_bytes - 1);
                }
                btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
                btrfs_put_delayed_ref_head(head);
                cond_resched();
                /* Retake the lock for the next lookup in the loop condition. */
                spin_lock(&delayed_refs->lock);
        }
        btrfs_qgroup_destroy_extent_records(trans);

        spin_unlock(&delayed_refs->lock);
}

static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
        struct btrfs_inode *btrfs_inode;
        LIST_HEAD(splice);

        spin_lock(&root->delalloc_lock);
        list_splice_init(&root->delalloc_inodes, &splice);

        while (!list_empty(&splice)) {
                struct inode *inode = NULL;
                btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
                                               delalloc_inodes);
                btrfs_del_delalloc_inode(btrfs_inode);
                spin_unlock(&root->delalloc_lock);

                /*
                 * Make sure we get a live inode and that it'll not disappear
                 * meanwhile.
                 */
                inode = igrab(&btrfs_inode->vfs_inode);
                if (inode) {
                        unsigned int nofs_flag;

                        nofs_flag = memalloc_nofs_save();
                        invalidate_inode_pages2(inode->i_mapping);
                        memalloc_nofs_restore(nofs_flag);
                        iput(inode);
                }
                spin_lock(&root->delalloc_lock);
        }
        spin_unlock(&root->delalloc_lock);
}

static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *root;
        LIST_HEAD(splice);

        spin_lock(&fs_info->delalloc_root_lock);
        list_splice_init(&fs_info->delalloc_roots, &splice);
        while (!list_empty(&splice)) {
                root = list_first_entry(&splice, struct btrfs_root,
                                         delalloc_root);
                root = btrfs_grab_root(root);
                BUG_ON(!root);
                spin_unlock(&fs_info->delalloc_root_lock);

                btrfs_destroy_delalloc_inodes(root);
                btrfs_put_root(root);

                spin_lock(&fs_info->delalloc_root_lock);
        }
        spin_unlock(&fs_info->delalloc_root_lock);
}

/*
 * Clear @mark from every range in @dirty_pages that carries it and drop the
 * extent buffers found inside those ranges.
 *
 * Each range is walked in steps of fs_info->nodesize.  A buffer found at a
 * step is locked, waited on for writeback, has its dirty state cleared, is
 * unlocked and is then released with free_extent_buffer_stale().
 */
static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
                                         struct extent_io_tree *dirty_pages,
                                         int mark)
{
        u64 cur = 0;
        u64 last;

        /* The search resumes from wherever the previous range walk stopped. */
        while (find_first_extent_bit(dirty_pages, cur, &cur, &last,
                                     mark, NULL)) {
                clear_extent_bits(dirty_pages, cur, last, mark);

                while (cur <= last) {
                        struct extent_buffer *buf;

                        buf = find_extent_buffer(fs_info, cur);
                        cur += fs_info->nodesize;
                        if (buf) {
                                btrfs_tree_lock(buf);
                                wait_on_extent_buffer_writeback(buf);
                                btrfs_clear_buffer_dirty(NULL, buf);
                                btrfs_tree_unlock(buf);

                                free_extent_buffer_stale(buf);
                        }
                }
        }
}

static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
                                        struct extent_io_tree *unpin)
{
        u64 start;
        u64 end;

        while (1) {
                struct extent_state *cached_state = NULL;

                /*
                 * The btrfs_finish_extent_commit() may get the same range as
                 * ours between find_first_extent_bit and clear_extent_dirty.
                 * Hence, hold the unused_bg_unpin_mutex to avoid double unpin
                 * the same extent range.
                 */
                mutex_lock(&fs_info->unused_bg_unpin_mutex);
                if (!find_first_extent_bit(unpin, 0, &start, &end,
                                           EXTENT_DIRTY, &cached_state)) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        break;
                }

                clear_extent_dirty(unpin, start, end, &cached_state);
                free_extent_state(cached_state);
                btrfs_error_unpin_extent_range(fs_info, start, end);
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                cond_resched();
        }
}

/*
 * Detach the io_ctl inode from @cache, if one is set, and drop one block
 * group reference.
 *
 * When an inode is attached its page cache is invalidated, its btrfs
 * generation is reset to 0, the io_ctl pointer is cleared and the inode
 * reference is released with iput().
 */
static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
{
        struct inode *io_inode = cache->io_ctl.inode;

        if (io_inode) {
                unsigned int nofs_flag = memalloc_nofs_save();

                invalidate_inode_pages2(io_inode->i_mapping);
                memalloc_nofs_restore(nofs_flag);

                BTRFS_I(io_inode)->generation = 0;
                cache->io_ctl.inode = NULL;
                iput(io_inode);
        }

        ASSERT(cache->io_ctl.pages == NULL);
        btrfs_put_block_group(cache);
}

/*
 * Empty the dirty_bgs and io_bgs lists of @cur_trans.
 *
 * Every block group taken off either list has its disk_cache_state set to
 * BTRFS_DC_ERROR under cache->lock.  Block groups that are on an io list are
 * also passed through btrfs_cleanup_bg_io().  For each dirty_bgs entry one
 * block group reference is dropped and
 * btrfs_dec_delayed_refs_rsv_bg_updates() is called.
 */
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
                             struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_group *cache;

        spin_lock(&cur_trans->dirty_bgs_lock);
        while (!list_empty(&cur_trans->dirty_bgs)) {
                cache = list_first_entry(&cur_trans->dirty_bgs,
                                         struct btrfs_block_group,
                                         dirty_list);

                /*
                 * The block group is also queued for io: unlink it from that
                 * list and clean up its io state with dirty_bgs_lock dropped.
                 */
                if (!list_empty(&cache->io_list)) {
                        spin_unlock(&cur_trans->dirty_bgs_lock);
                        list_del_init(&cache->io_list);
                        btrfs_cleanup_bg_io(cache);
                        spin_lock(&cur_trans->dirty_bgs_lock);
                }

                list_del_init(&cache->dirty_list);
                spin_lock(&cache->lock);
                cache->disk_cache_state = BTRFS_DC_ERROR;
                spin_unlock(&cache->lock);

                /* The put and the rsv update run without dirty_bgs_lock. */
                spin_unlock(&cur_trans->dirty_bgs_lock);
                btrfs_put_block_group(cache);
                btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
                spin_lock(&cur_trans->dirty_bgs_lock);
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);

        /*
         * Refer to the definition of io_bgs member for details why it's safe
         * to use it without any locking
         */
        while (!list_empty(&cur_trans->io_bgs)) {
                cache = list_first_entry(&cur_trans->io_bgs,
                                         struct btrfs_block_group,
                                         io_list);

                list_del_init(&cache->io_list);
                spin_lock(&cache->lock);
                cache->disk_cache_state = BTRFS_DC_ERROR;
                spin_unlock(&cache->lock);
                btrfs_cleanup_bg_io(cache);
        }
}

/*
 * Release the per-transaction qgroup metadata of every root tagged
 * BTRFS_ROOT_TRANS_TAG in fs_info->fs_roots_radix, clearing the tag on each
 * root as it is handled.
 *
 * Roots are looked up in batches of up to ARRAY_SIZE(batch), always starting
 * from index 0, until a lookup finds no tagged root.  The whole walk runs
 * under fs_roots_radix_lock.
 */
static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *batch[8];
        int found;

        spin_lock(&fs_info->fs_roots_radix_lock);
        while ((found = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
                                                   (void **)batch, 0,
                                                   ARRAY_SIZE(batch),
                                                   BTRFS_ROOT_TRANS_TAG)) != 0) {
                int idx;

                for (idx = 0; idx < found; idx++) {
                        struct btrfs_root *tagged = batch[idx];

                        btrfs_qgroup_free_meta_all_pertrans(tagged);
                        radix_tree_tag_clear(&fs_info->fs_roots_radix,
                                        (unsigned long)btrfs_root_id(tagged),
                                        BTRFS_ROOT_TRANS_TAG);
                }
        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
}

void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
                                   struct btrfs_fs_info *fs_info)
{
        struct btrfs_device *dev, *tmp;

        btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
        ASSERT(list_empty(&cur_trans->dirty_bgs));
        ASSERT(list_empty(&cur_trans->io_bgs));

        list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
                                 post_commit_list) {
                list_del_init(&dev->post_commit_list);
        }

        btrfs_destroy_delayed_refs(cur_trans, fs_info);

        cur_trans->state = TRANS_STATE_COMMIT_START;
        wake_up(&fs_info->transaction_blocked_wait);

        cur_trans->state = TRANS_STATE_UNBLOCKED;
        wake_up(&fs_info->transaction_wait);

        btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
                                     EXTENT_DIRTY);
        btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);

        cur_trans->state =TRANS_STATE_COMPLETED;
        wake_up(&cur_trans->commit_wait);
}

/*
 * Tear down every transaction on fs_info->trans_list, then drop the remaining
 * ordered extents, delayed inodes, delalloc inodes, logs and per-transaction
 * qgroup reservations.
 *
 * The whole function runs under transaction_kthread_mutex.  trans_lock
 * protects the list walk and is dropped around any call that does the actual
 * cleanup or waits.
 *
 * Always returns 0.
 */
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
{
        struct btrfs_transaction *t;

        mutex_lock(&fs_info->transaction_kthread_mutex);

        spin_lock(&fs_info->trans_lock);
        while (!list_empty(&fs_info->trans_list)) {
                t = list_first_entry(&fs_info->trans_list,
                                     struct btrfs_transaction, list);
                /*
                 * A transaction at or past COMMIT_PREP is not cleaned up
                 * here: hold a reference, wait for its commit, then look at
                 * the list head again.
                 */
                if (t->state >= TRANS_STATE_COMMIT_PREP) {
                        refcount_inc(&t->use_count);
                        spin_unlock(&fs_info->trans_lock);
                        btrfs_wait_for_commit(fs_info, t->transid);
                        btrfs_put_transaction(t);
                        spin_lock(&fs_info->trans_lock);
                        continue;
                }
                if (t == fs_info->running_transaction) {
                        t->state = TRANS_STATE_COMMIT_DOING;
                        spin_unlock(&fs_info->trans_lock);
                        /*
                         * We wait for 0 num_writers since we don't hold a trans
                         * handle open currently for this transaction.
                         */
                        wait_event(t->writer_wait,
                                   atomic_read(&t->num_writers) == 0);
                } else {
                        spin_unlock(&fs_info->trans_lock);
                }
                btrfs_cleanup_one_transaction(t, fs_info);

                /* Unlink the transaction before dropping a reference to it. */
                spin_lock(&fs_info->trans_lock);
                if (t == fs_info->running_transaction)
                        fs_info->running_transaction = NULL;
                list_del_init(&t->list);
                spin_unlock(&fs_info->trans_lock);

                btrfs_put_transaction(t);
                trace_btrfs_transaction_commit(fs_info);
                spin_lock(&fs_info->trans_lock);
        }
        spin_unlock(&fs_info->trans_lock);
        /* Filesystem-wide leftovers, handled once the list is empty. */
        btrfs_destroy_all_ordered_extents(fs_info);
        btrfs_destroy_delayed_inodes(fs_info);
        btrfs_assert_delayed_root_empty(fs_info);
        btrfs_destroy_all_delalloc_inodes(fs_info);
        btrfs_drop_all_logs(fs_info);
        btrfs_free_all_qgroup_pertrans(fs_info);
        mutex_unlock(&fs_info->transaction_kthread_mutex);

        return 0;
}

int btrfs_init_root_free_objectid(struct btrfs_root *root)
{
        struct btrfs_path *path;
        int ret;
        struct extent_buffer *l;
        struct btrfs_key search_key;
        struct btrfs_key found_key;
        int slot;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
        search_key.type = -1;
        search_key.offset = (u64)-1;
        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
        if (ret < 0)
                goto error;
        if (ret == 0) {
                /*
                 * Key with offset -1 found, there would have to exist a root
                 * with such id, but this is out of valid range.
                 */
                ret = -EUCLEAN;
                goto error;
        }
        if (path->slots[0] > 0) {
                slot = path->slots[0] - 1;
                l = path->nodes[0];
                btrfs_item_key_to_cpu(l, &found_key, slot);
                root->free_objectid = max_t(u64, found_key.objectid + 1,
                                            BTRFS_FIRST_FREE_OBJECTID);
        } else {
                root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

/*
 * Hand out the next free objectid of @root.
 *
 * Under root->objectid_mutex the current root->free_objectid is stored in
 * *@objectid and the counter is advanced by one.
 *
 * Returns 0 on success.  Returns -ENOSPC, after logging a warning and without
 * writing *@objectid, once the counter has reached BTRFS_LAST_FREE_OBJECTID.
 */
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
{
        int ret = 0;

        mutex_lock(&root->objectid_mutex);
        if (likely(root->free_objectid < BTRFS_LAST_FREE_OBJECTID)) {
                *objectid = root->free_objectid++;
        } else {
                btrfs_warn(root->fs_info,
                           "the objectid of root %llu reaches its highest value",
                           btrfs_root_id(root));
                ret = -ENOSPC;
        }
        mutex_unlock(&root->objectid_mutex);

        return ret;
}





































   12 












































































   32 
























   32 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/kasan-checks.h>
#include <linux/thread_info.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>

#include <asm/byteorder.h>
#include <asm/word-at-a-time.h>

/*
 * IS_UNALIGNED(src, dst) - non-zero when word-sized accesses must be avoided.
 *
 * With CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the answer is always 0.
 * Otherwise it is non-zero when either pointer has any of its low
 * (sizeof(long) - 1) address bits set.
 *
 * Both arguments are parenthesized so that an expression argument such as
 * "p + 1" is cast as a whole rather than having the cast bind to "p" alone.
 */
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
#define IS_UNALIGNED(src, dst)        0
#else
#define IS_UNALIGNED(src, dst)        \
        (((long) (dst) | (long) (src)) & (sizeof(long) - 1))
#endif

/*
 * Do a strncpy, return length of string without final '\0'.
 * 'count' is the user-supplied count (return 'count' if we
 * hit it), 'max' is the address space maximum (and we return
 * -EFAULT if we hit it).
 *
 * The copy runs a word at a time while at least sizeof(unsigned long) bytes
 * of 'max' remain, then finishes a byte at a time.  The byte loop is also
 * used from the start when IS_UNALIGNED() is non-zero, and is resumed at the
 * current offset when a word read faults.
 */
static __always_inline long do_strncpy_from_user(char *dst, const char __user *src,
                                        unsigned long count, unsigned long max)
{
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
        unsigned long res = 0;

        if (IS_UNALIGNED(src, dst))
                goto byte_at_a_time;

        while (max >= sizeof(unsigned long)) {
                unsigned long c, data, mask;

                /* Fall back to byte-at-a-time if we get a page fault */
                unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time);

                /*
                 * Note that we mask out the bytes following the NUL. This is
                 * important to do because string oblivious code may read past
                 * the NUL. For those routines, we don't want to give them
                 * potentially random bytes after the NUL in `src`.
                 *
                 * One example of such code is BPF map keys. BPF treats map keys
                 * as an opaque set of bytes. Without the post-NUL mask, any BPF
                 * maps keyed by strings returned from strncpy_from_user() may
                 * have multiple entries for semantically identical strings.
                 */
                if (has_zero(c, &data, &constants)) {
                        data = prep_zero_mask(c, data, &constants);
                        data = create_zero_mask(data);
                        mask = zero_bytemask(data);
                        *(unsigned long *)(dst+res) = c & mask;
                        return res + find_zero(data);
                }

                /* No NUL in this word: store it whole and advance. */
                *(unsigned long *)(dst+res) = c;

                res += sizeof(unsigned long);
                max -= sizeof(unsigned long);
        }

byte_at_a_time:
        while (max) {
                char c;

                /* A fault on a single byte is a hard failure. */
                unsafe_get_user(c,src+res, efault);
                dst[res] = c;
                if (!c)
                        return res;
                res++;
                max--;
        }

        /*
         * Uhhuh. We hit 'max'. But was that the user-specified maximum
         * too? If so, that's ok - we got as much as the user asked for.
         */
        if (res >= count)
                return res;

        /*
         * Nope: we hit the address space limit, and we still had more
         * characters the caller would have wanted. That's an EFAULT.
         */
efault:
        return -EFAULT;
}

/**
 * strncpy_from_user() - Copy a NUL terminated string from userspace.
 * @dst:   Destination address, in kernel space.  This buffer must be at
 *         least @count bytes long.
 * @src:   Source address, in user space.
 * @count: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Copies a NUL-terminated string from userspace to kernel space.
 *
 * Return: the length of the string (not including the trailing NUL) on
 * success; @count if the string is longer than @count bytes, in which case
 * @count bytes are copied; 0 if @count is not positive; -EFAULT if access to
 * userspace fails (some data may have been copied).
 */
long strncpy_from_user(char *dst, const char __user *src, long count)
{
        unsigned long limit, start, span;
        long copied;

        might_fault();
        if (should_fail_usercopy())
                return -EFAULT;
        if (unlikely(count <= 0))
                return 0;

        limit = TASK_SIZE_MAX;
        start = (unsigned long)untagged_addr(src);
        if (unlikely(start >= limit))
                return -EFAULT;

        /*
         * Clamp the readable span to the caller's limit so the copy loop
         * only has a single bound to check.
         */
        span = limit - start;
        if (span > count)
                span = count;

        kasan_check_write(dst, count);
        check_object_size(dst, count, false);
        if (!user_read_access_begin(src, span))
                return -EFAULT;

        copied = do_strncpy_from_user(dst, src, count, span);
        user_read_access_end();
        return copied;
}
EXPORT_SYMBOL(strncpy_from_user);















































































    3 




















































































    4 






    3 



    1 



    2 



    2 












    2 


















    2 




    2 
























    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
// SPDX-License-Identifier: GPL-2.0
#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/smp.h>
#include <linux/sem.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <linux/stat.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/uaccess.h>
#include <linux/elf.h>

#include <asm/elf.h>
#include <asm/ia32.h>

/*
 * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
 *
 * Returns va_align.mask when alignment is enabled for the current syscall
 * width and the task has PF_RANDOMIZE set, otherwise 0.
 */
static unsigned long get_align_mask(void)
{
        /*
         * (2 - mmap_is_ia32()) selects the 32- or 64-bit flag bit, so one
         * test covers both cases.
         */
        if (va_align.flags >= 0 &&
            (va_align.flags & (2 - mmap_is_ia32())) &&
            (current->flags & PF_RANDOMIZE))
                return va_align.mask;

        return 0;
}

/*
 * To avoid aliasing in the I$ on AMD F15h, the bits defined by the
 * va_align.bits, [12:upper_bit), are set to a random value instead of
 * zeroing them. This random value is computed once per boot. This form
 * of ASLR is known as "per-boot ASLR".
 *
 * To achieve this, the random value is added to the info.align_offset
 * value before calling vm_unmapped_area() or ORed directly to the
 * address.
 *
 * The value returned here is va_align.bits restricted to the mask from
 * get_align_mask(), so it is 0 whenever that mask is 0.
 */
static unsigned long get_align_bits(void)
{
        unsigned long active_mask = get_align_mask();

        return active_mask & va_align.bits;
}

/*
 * Parse the "align_va_addr=" boot parameter.
 *
 * Accepted values are "32", "64", "on" and "off".  Any other non-empty value
 * is reported with a warning and leaves va_align.flags unchanged.  Nothing is
 * done when va_align.flags is negative or the value is empty.
 *
 * Always returns 1.
 */
static int __init control_va_addr_alignment(char *str)
{
        /*
         * A negative flags value guards against enabling this on other CPU
         * families; an empty value carries nothing to parse.
         */
        if (va_align.flags < 0 || *str == 0)
                return 1;

        if (strcmp(str, "32") == 0)
                va_align.flags = ALIGN_VA_32;
        else if (strcmp(str, "64") == 0)
                va_align.flags = ALIGN_VA_64;
        else if (strcmp(str, "off") == 0)
                va_align.flags = 0;
        else if (strcmp(str, "on") == 0)
                va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
        else
                pr_warn("invalid option value: 'align_va_addr=%s'\n", str);

        return 1;
}
__setup("align_va_addr=", control_va_addr_alignment);

/*
 * mmap system call entry point.
 *
 * @off is a byte offset.  It is rejected with -EINVAL if it has any bit set
 * outside PAGE_MASK; otherwise it is shifted right by PAGE_SHIFT and the
 * request is forwarded to ksys_mmap_pgoff().
 */
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
                unsigned long, fd, unsigned long, off)
{
        if (off & ~PAGE_MASK)
                return -EINVAL;

        return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}

/*
 * Compute the [*begin, *end) search window for a bottom-up mapping request.
 *
 * MAP_32BIT from a non-32-bit syscall gets a fixed window from 0x40000000 to
 * 0x80000000, with the start randomized when the task has PF_RANDOMIZE set.
 * Every other request starts at get_mmap_base(1) and ends at the task size
 * for the syscall width.
 */
static void find_start_end(unsigned long addr, unsigned long flags,
                unsigned long *begin, unsigned long *end)
{
        if (!in_32bit_syscall() && (flags & MAP_32BIT)) {
                /*
                 * This is usually needed to map code in the small model, so
                 * it has to live in the first 31 bits. Limit it to that.
                 * This means we need to move the unmapped base down for this
                 * case. This can give conflicts with the heap, but we assume
                 * that glibc malloc knows how to fall back to mmap. Give it
                 * 1GB of playground for now. -AK
                 */
                *begin = 0x40000000;
                *end = 0x80000000;
                if (current->flags & PF_RANDOMIZE)
                        *begin = randomize_page(*begin, 0x02000000);
                return;
        }

        *begin = get_mmap_base(1);
        *end = in_32bit_syscall() ? task_size_32bit()
                                  : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
}

/*
 * Size of the start gap to request for a mapping: PAGE_SIZE when @vm_flags
 * has VM_SHADOW_STACK set, 0 otherwise.
 */
static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
{
        return (vm_flags & VM_SHADOW_STACK) ? PAGE_SIZE : 0;
}

/*
 * Bottom-up search for an unmapped address range of @len bytes.
 *
 * MAP_FIXED requests get @addr back unchanged.  A non-zero @addr is treated
 * as a hint: it is page aligned and returned if the range fits below the
 * window end and does not run into the next VMA's start gap.  Otherwise the
 * range is picked by vm_unmapped_area() inside the window computed by
 * find_start_end().  File-backed requests also get the alignment mask and
 * offset bits from get_align_mask() and get_align_bits().
 *
 * Returns the chosen address, -ENOMEM if @len exceeds the window end, or
 * whatever vm_unmapped_area() returns.
 */
unsigned long
arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len,
                       unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
        struct vm_unmapped_area_info info = {};
        unsigned long low, high;

        if (flags & MAP_FIXED)
                return addr;

        find_start_end(addr, flags, &low, &high);
        if (len > high)
                return -ENOMEM;

        if (addr) {
                struct vm_area_struct *next_vma;

                addr = PAGE_ALIGN(addr);
                next_vma = find_vma(current->mm, addr);
                if (high - len >= addr &&
                    (!next_vma || addr + len <= vm_start_gap(next_vma)))
                        return addr;
        }

        info.length = len;
        info.low_limit = low;
        info.high_limit = high;
        info.start_gap = stack_guard_placement(vm_flags);
        info.align_offset = pgoff << PAGE_SHIFT;
        if (filp) {
                info.align_mask = get_align_mask();
                info.align_offset += get_align_bits();
        }

        return vm_unmapped_area(&info);
}

/*
 * Top-down search for an unmapped address range of @len bytes.
 *
 * MAP_FIXED requests get @addr0 back unchanged.  MAP_32BIT requests from a
 * non-32-bit syscall go straight to the bottom-up allocator.  A non-zero hint
 * is page aligned and returned if it passes mmap_address_hint_valid() and
 * does not run into the next VMA's start gap.  Otherwise the range is picked
 * by vm_unmapped_area() searching downwards from get_mmap_base(0); if that
 * fails the bottom-up allocator is tried with the original hint.
 *
 * Returns the chosen address, -ENOMEM if @len exceeds TASK_SIZE, or the
 * result of arch_get_unmapped_area() on fallback.
 */
unsigned long
arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr0,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags, vm_flags_t vm_flags)
{
        struct vm_area_struct *vma;
        struct mm_struct *mm = current->mm;
        unsigned long addr = addr0;
        struct vm_unmapped_area_info info = {};

        /* requested length too big for entire address space */
        if (len > TASK_SIZE)
                return -ENOMEM;

        /* No address checking. See comment at mmap_address_hint_valid() */
        if (flags & MAP_FIXED)
                return addr;

        /* for MAP_32BIT mappings we force the legacy mmap base */
        if (!in_32bit_syscall() && (flags & MAP_32BIT))
                goto bottomup;

        /* requesting a specific address */
        if (addr) {
                addr &= PAGE_MASK;
                if (!mmap_address_hint_valid(addr, len))
                        goto get_unmapped_area;

                vma = find_vma(mm, addr);
                if (!vma || addr + len <= vm_start_gap(vma))
                        return addr;
        }
get_unmapped_area:

        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.length = len;
        /* MAP_ABOVE4G from a non-32-bit syscall raises the floor to 4G. */
        if (!in_32bit_syscall() && (flags & MAP_ABOVE4G))
                info.low_limit = SZ_4G;
        else
                info.low_limit = PAGE_SIZE;

        info.high_limit = get_mmap_base(0);
        info.start_gap = stack_guard_placement(vm_flags);

        /*
         * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
         * in the full address space.
         *
         * !in_32bit_syscall() check to avoid high addresses for x32
         * (and make it no op on native i386).
         */
        if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
                info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;

        info.align_offset = pgoff << PAGE_SHIFT;
        if (filp) {
                info.align_mask = get_align_mask();
                info.align_offset += get_align_bits();
        }
        addr = vm_unmapped_area(&info);
        /*
         * A page-aligned result is returned as the address; anything else is
         * expected to be -ENOMEM.
         */
        if (!(addr & ~PAGE_MASK))
                return addr;
        VM_BUG_ON(addr != -ENOMEM);

bottomup:
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
        return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
}

/*
 * Bottom-up unmapped-area search without VM flags: forwards to
 * arch_get_unmapped_area_vmflags() with vm_flags set to 0.
 */
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}

/*
 * Top-down unmapped-area search without VM flags: forwards to
 * arch_get_unmapped_area_topdown_vmflags() with vm_flags set to 0.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
                          const unsigned long len, const unsigned long pgoff,
                          const unsigned long flags)
{
        return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff, flags, 0);
}




























































































































































































































   14 














   14 






























   11 













   11 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Credentials management - see Documentation/security/credentials.rst
 *
 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_CRED_H
#define _LINUX_CRED_H

#include <linux/capability.h>
#include <linux/init.h>
#include <linux/key.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/uidgid.h>
#include <linux/sched.h>
#include <linux/sched/user.h>

struct cred;
struct inode;

/*
 * COW Supplementary groups list
 *
 * @usage is the reference count taken by get_group_info() and dropped by
 * put_group_info().  @gid is a flexible array member holding the group IDs;
 * @ngroups is presumably the number of entries in it (confirm against
 * groups_alloc()).
 */
struct group_info {
        refcount_t        usage;
        int                ngroups;
        kgid_t                gid[];
} __randomize_layout;

/**
 * get_group_info - Get a reference to a group info structure
 * @gi: The group info to reference
 *
 * This gets a reference to a set of supplementary groups by incrementing
 * the usage count of @gi.
 *
 * If the caller is accessing a task's credentials, they must hold the RCU read
 * lock when reading.
 *
 * Return: @gi, unchanged.
 */
static inline struct group_info *get_group_info(struct group_info *gi)
{
        refcount_inc(&gi->usage);
        return gi;
}

/**
 * put_group_info - Release a reference to a group info structure
 * @group_info: The group info to release
 *
 * Decrements the usage count and calls groups_free() when it reaches zero.
 *
 * This is a macro and @group_info appears twice in its expansion, so the
 * argument should not have side effects.
 */
#define put_group_info(group_info)                        \
do {                                                        \
        if (refcount_dec_and_test(&(group_info)->usage))        \
                groups_free(group_info);                \
} while (0)

/*
 * Supplementary group helpers.
 *
 * With CONFIG_MULTIUSER they are declared here and defined elsewhere.
 * Without it only the stubs below exist: groups_free() does nothing and the
 * membership tests and groups_search() always return 1.
 */
#ifdef CONFIG_MULTIUSER
extern struct group_info *groups_alloc(int);
extern void groups_free(struct group_info *);

extern int in_group_p(kgid_t);
extern int in_egroup_p(kgid_t);
extern int groups_search(const struct group_info *, kgid_t);

extern int set_current_groups(struct group_info *);
extern void set_groups(struct cred *, struct group_info *);
extern bool may_setgroups(void);
extern void groups_sort(struct group_info *);
#else
/* No-op stub. */
static inline void groups_free(struct group_info *group_info)
{
}

/* Stub: always returns 1. */
static inline int in_group_p(kgid_t grp)
{
        return 1;
}
/* Stub: always returns 1. */
static inline int in_egroup_p(kgid_t grp)
{
        return 1;
}
/* Stub: always returns 1. */
static inline int groups_search(const struct group_info *group_info, kgid_t grp)
{
        return 1;
}
#endif

/*
 * The security context of a task
 *
 * The parts of the context break down into two categories:
 *
 *  (1) The objective context of a task.  These parts are used when some other
 *        task is attempting to affect this one.
 *
 *  (2) The subjective context.  These details are used when the task is acting
 *        upon another object, be that a file, a task, a key or whatever.
 *
 * Note that some members of this structure belong to both categories - the
 * LSM security pointer for instance.
 *
 * A task has two security pointers.  task->real_cred points to the objective
 * context that defines that task's actual details.  The objective part of this
 * context is used whenever that task is acted upon.
 *
 * task->cred points to the subjective context that defines the details of how
 * that task is going to act upon another object.  This may be overridden
 * temporarily to point to another security context, but normally points to the
 * same context as task->real_cred.
 *
 * The keyring members exist only with CONFIG_KEYS and the LSM pointer only
 * with CONFIG_SECURITY.  The trailing anonymous union overlays the non_rcu
 * flag with the rcu_head used for deferred freeing.
 */
struct cred {
        atomic_long_t        usage;
        kuid_t                uid;                /* real UID of the task */
        kgid_t                gid;                /* real GID of the task */
        kuid_t                suid;                /* saved UID of the task */
        kgid_t                sgid;                /* saved GID of the task */
        kuid_t                euid;                /* effective UID of the task */
        kgid_t                egid;                /* effective GID of the task */
        kuid_t                fsuid;                /* UID for VFS ops */
        kgid_t                fsgid;                /* GID for VFS ops */
        unsigned        securebits;        /* SUID-less security management */
        kernel_cap_t        cap_inheritable; /* caps our children can inherit */
        kernel_cap_t        cap_permitted;        /* caps we're permitted */
        kernel_cap_t        cap_effective;        /* caps we can actually use */
        kernel_cap_t        cap_bset;        /* capability bounding set */
        kernel_cap_t        cap_ambient;        /* Ambient capability set */
#ifdef CONFIG_KEYS
        unsigned char        jit_keyring;        /* default keyring to attach requested
                                         * keys to */
        struct key        *session_keyring; /* keyring inherited over fork */
        struct key        *process_keyring; /* keyring private to this process */
        struct key        *thread_keyring; /* keyring private to this thread */
        struct key        *request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
        void                *security;        /* LSM security */
#endif
        struct user_struct *user;        /* real user ID subscription */
        struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
        struct ucounts *ucounts;
        struct group_info *group_info;        /* supplementary groups for euid/fsgid */
        /* RCU deletion */
        union {
                int non_rcu;                        /* Can we skip RCU deletion? */
                struct rcu_head        rcu;                /* RCU deletion hook */
        };
} __randomize_layout;

/*
 * Prototypes for the credential routines that are not defined inline in the
 * visible part of this header.  Of these, only __put_cred() is called from
 * the inline helpers below: put_cred_many() invokes it once the usage count
 * drops to zero.
 */
extern void __put_cred(struct cred *);
extern void exit_creds(struct task_struct *);
extern int copy_creds(struct task_struct *, unsigned long);
extern const struct cred *get_task_cred(struct task_struct *);
extern struct cred *cred_alloc_blank(void);
extern struct cred *prepare_creds(void);
extern struct cred *prepare_exec_creds(void);
extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *);
extern const struct cred *override_creds(const struct cred *);
extern void revert_creds(const struct cred *);
extern struct cred *prepare_kernel_cred(struct task_struct *);
extern int set_security_override(struct cred *, u32);
extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *);
extern int cred_fscmp(const struct cred *, const struct cred *);
extern void __init cred_init(void);
extern int set_cred_ucounts(struct cred *);

static inline bool cap_ambient_invariant_ok(const struct cred *cred)
{
        return cap_issubset(cred->cap_ambient,
                            cap_intersect(cred->cap_permitted,
                                          cred->cap_inheritable));
}

/**
 * get_new_cred_many - Get references on a new set of credentials
 * @cred: The new credentials to reference
 * @nr: Number of references to acquire
 *
 * Get references on the specified set of new credentials.  The caller must
 * release all acquired references.
 */
static inline struct cred *get_new_cred_many(struct cred *cred, int nr)
{
        atomic_long_add(nr, &cred->usage);
        return cred;
}

/**
 * get_new_cred - Take one reference on a new set of credentials
 * @cred: The new credentials to reference
 *
 * Single-reference form of get_new_cred_many().  Returns @cred; the caller
 * owns the reference and must release it.
 */
static inline struct cred *get_new_cred(struct cred *cred)
{
        /* One reference is simply the nr == 1 case of the bulk helper. */
        return get_new_cred_many(cred, 1);
}

/**
 * get_cred_many - Get references on a set of credentials
 * @cred: The credentials to reference
 * @nr: Number of references to acquire
 *
 * Get references on the specified set of credentials.  The caller must release
 * all acquired reference.  If %NULL is passed, it is returned with no action.
 *
 * This is used to deal with a committed set of credentials.  Although the
 * pointer is const, this will temporarily discard the const and increment the
 * usage count.  The purpose of this is to attempt to catch at compile time the
 * accidental alteration of a set of credentials that should be considered
 * immutable.
 */
static inline const struct cred *get_cred_many(const struct cred *cred, int nr)
{
        struct cred *nonconst_cred = (struct cred *) cred;
        if (!cred)
                return cred;
        nonconst_cred->non_rcu = 0;
        return get_new_cred_many(nonconst_cred, nr);
}

/**
 * get_cred - Take one reference on a set of credentials
 * @cred: The credentials to reference
 *
 * Single-reference form of get_cred_many(), intended for committed
 * credentials.  Returns @cred; the caller must release the reference.
 * A %NULL @cred is returned with no action taken.
 */
static inline const struct cred *get_cred(const struct cred *cred)
{
        /* Delegate to the bulk helper, asking for exactly one reference. */
        return get_cred_many(cred, 1);
}

/*
 * get_cred_rcu - Try to take a reference on a set of credentials
 * @cred: The credentials to reference (may be NULL)
 *
 * Takes a reference via atomic_long_inc_not_zero() on the usage count.
 * Returns @cred with non_rcu cleared when the increment succeeded, or NULL
 * when @cred is NULL or the increment was refused.
 */
static inline const struct cred *get_cred_rcu(const struct cred *cred)
{
        struct cred *writable = (struct cred *)cred;

        /* The NULL test short-circuits, so writable is only touched if set. */
        if (!cred || !atomic_long_inc_not_zero(&writable->usage))
                return NULL;

        writable->non_rcu = 0;

        return cred;
}

/**
 * put_cred_many - Release several references to a set of credentials
 * @_cred: The credentials to release
 * @nr: Number of references to release
 *
 * Subtracts @nr from the usage count and calls __put_cred() when the count
 * reaches zero.  If %NULL is passed, nothing is done.
 *
 * The parameter is a const pointer because task_struct holds its credentials
 * through const pointers, which guards otherwise immutable credential sets
 * against accidental alteration.
 */
static inline void put_cred_many(const struct cred *_cred, int nr)
{
        struct cred *writable = (struct cred *)_cred;

        if (!writable)
                return;

        /* Only the caller that drops the final reference frees the creds. */
        if (atomic_long_sub_and_test(nr, &writable->usage))
                __put_cred(writable);
}

/**
 * put_cred - Release one reference to a set of credentials
 * @cred: The credentials to release
 *
 * Single-reference form of put_cred_many(): the credentials are deleted when
 * the last reference goes away.  If %NULL is passed, nothing is done.
 */
static inline void put_cred(const struct cred *cred)
{
        /* Drop exactly one reference through the bulk helper. */
        put_cred_many(cred, 1);
}

/**
 * current_cred - Access the current task's subjective credentials
 *
 * Access the subjective credentials of the current task.  RCU-safe,
 * since nobody else can modify it.
 *
 * Expands to rcu_dereference_protected() on current->cred with the constant
 * 1 passed as the condition argument.  No reference is taken; use
 * get_current_cred() to pin the result.
 */
#define current_cred() \
        rcu_dereference_protected(current->cred, 1)

/**
 * current_real_cred - Access the current task's objective credentials
 *
 * Access the objective credentials of the current task.  RCU-safe,
 * since nobody else can modify it.
 *
 * Expands to rcu_dereference_protected() on current->real_cred with the
 * constant 1 passed as the condition argument.  No reference is taken.
 */
#define current_real_cred() \
        rcu_dereference_protected(current->real_cred, 1)

/**
 * __task_cred - Access a task's objective credentials
 * @task: The task to query
 *
 * Access the objective credentials of a task.  The caller must hold the RCU
 * readlock.
 *
 * Expands to rcu_dereference() of (task)->real_cred; no reference is taken.
 *
 * The result of this function should not be passed directly to get_cred();
 * rather get_task_cred() should be used instead.
 */
#define __task_cred(task)        \
        rcu_dereference((task)->real_cred)

/**
 * get_current_cred - Get the current task's subjective credentials
 *
 * Get the subjective credentials of the current task, pinning them so that
 * they can't go away.  Accessing the current task's credentials directly is
 * not permitted.
 *
 * Expands to get_cred(current_cred()), so the caller ends up holding one
 * reference that it must release.
 */
#define get_current_cred()                                \
        (get_cred(current_cred()))

/**
 * get_current_user - Get the current task's user_struct
 *
 * Looks up the user record in the current task's subjective credentials and
 * passes it through get_uid(), pinning it so that it can't go away.  The
 * macro evaluates to the resulting struct user_struct pointer.
 */
#define get_current_user()                                        \
({                                                                \
        const struct cred *__cred = current_cred();                \
        struct user_struct *__u = get_uid(__cred->user);        \
                                                                \
        __u;                                                        \
})

/**
 * get_current_groups - Get the current task's supplementary group list
 *
 * Looks up the supplementary group list in the current task's subjective
 * credentials and passes it through get_group_info(), pinning it so that it
 * can't go away.  The macro evaluates to the resulting struct group_info
 * pointer.
 */
#define get_current_groups()                                                \
({                                                                        \
        const struct cred *__cred = current_cred();                        \
        struct group_info *__groups = get_group_info(__cred->group_info); \
                                                                        \
        __groups;                                                        \
})

/*
 * task_cred_xxx - Read one member of a task's objective credentials
 * @task: The task to query
 * @xxx: Name of the struct cred member to read
 *
 * Takes the RCU read lock, copies __task_cred(task)->xxx into a local whose
 * type is that of the member, drops the lock and evaluates to the copy.
 * Only the value is copied: for pointer members this macro takes no
 * reference on the object pointed to.
 */
#define task_cred_xxx(task, xxx)                        \
({                                                        \
        __typeof__(((struct cred *)NULL)->xxx) ___val;        \
        rcu_read_lock();                                \
        ___val = __task_cred((task))->xxx;                \
        rcu_read_unlock();                                \
        ___val;                                                \
})

/* Convenience wrappers around task_cred_xxx() for individual members. */
#define task_uid(task)                (task_cred_xxx((task), uid))
#define task_euid(task)                (task_cred_xxx((task), euid))
#define task_ucounts(task)        (task_cred_xxx((task), ucounts))

/*
 * current_cred_xxx - Read one member of the current task's subjective creds
 * @xxx: Name of the struct cred member to read
 *
 * Evaluates to current_cred()->xxx.  No lock is taken and no reference is
 * acquired by this macro.
 */
#define current_cred_xxx(xxx)                        \
({                                                \
        current_cred()->xxx;                        \
})

/* Convenience wrappers around current_cred_xxx() for individual members. */
#define current_uid()                (current_cred_xxx(uid))
#define current_gid()                (current_cred_xxx(gid))
#define current_euid()                (current_cred_xxx(euid))
#define current_egid()                (current_cred_xxx(egid))
#define current_suid()                (current_cred_xxx(suid))
#define current_sgid()                (current_cred_xxx(sgid))
#define current_fsuid()         (current_cred_xxx(fsuid))
#define current_fsgid()         (current_cred_xxx(fsgid))
#define current_cap()                (current_cred_xxx(cap_effective))
#define current_user()                (current_cred_xxx(user))
#define current_ucounts()        (current_cred_xxx(ucounts))

/*
 * current_user_ns - Get the user namespace of the current task
 *
 * With CONFIG_USER_NS this reads the user_ns member of the current task's
 * subjective credentials.  Without it, the address of init_user_ns is always
 * returned.
 */
extern struct user_namespace init_user_ns;
#ifdef CONFIG_USER_NS
#define current_user_ns()        (current_cred_xxx(user_ns))
#else
static inline struct user_namespace *current_user_ns(void)
{
        return &init_user_ns;
}
#endif


/*
 * current_uid_gid - Fetch the current task's real UID and GID together
 * @_uid: Where to store the uid member
 * @_gid: Where to store the gid member
 *
 * Both values are read from a single current_cred() lookup.
 */
#define current_uid_gid(_uid, _gid)                        \
do {                                                        \
        const struct cred *__cred = current_cred();        \
                                                        \
        *(_uid) = __cred->uid;                                \
        *(_gid) = __cred->gid;                                \
} while (0)

/*
 * current_euid_egid - Fetch the current task's effective UID and GID together
 * @_euid: Where to store the euid member
 * @_egid: Where to store the egid member
 *
 * Both values are read from a single current_cred() lookup.
 */
#define current_euid_egid(_euid, _egid)                        \
do {                                                        \
        const struct cred *__cred = current_cred();        \
                                                        \
        *(_euid) = __cred->euid;                        \
        *(_egid) = __cred->egid;                        \
} while (0)

/*
 * current_fsuid_fsgid - Fetch the current task's VFS UID and GID together
 * @_fsuid: Where to store the fsuid member
 * @_fsgid: Where to store the fsgid member
 *
 * Both values are read from a single current_cred() lookup.
 */
#define current_fsuid_fsgid(_fsuid, _fsgid)                \
do {                                                        \
        const struct cred *__cred = current_cred();        \
                                                        \
        *(_fsuid) = __cred->fsuid;                        \
        *(_fsgid) = __cred->fsgid;                        \
} while (0)

#endif /* _LINUX_CRED_H */






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 




    2 





    2 




    2 








    2 






























































    2 














    2 












    1 



    1 


    1 




    1 




























    1 









































































    2 


    2 


    1 










    2 


    2 

    2 




    2 

    2 

















































































































































































    1 




    2 



























    2 















































    2 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 














    2 















    2 




    1 



    2 



    1 

    2 





    2 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat.  All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "subpage.h"
#include "inode-item.h"
#include "accessors.h"
#include "file-item.h"
#include "file.h"
#include "super.h"

/* Number of bits in one bitmap; a bitmap occupies exactly one page. */
#define BITS_PER_BITMAP                (PAGE_SIZE * 8UL)
/*
 * Memory budget per 1GiB of block group space, used by
 * recalculate_thresholds() to derive the extent entry threshold.
 */
#define MAX_CACHE_BYTES_PER_GIG        SZ_64K
/*
 * Size threshold referenced by recalculate_thresholds(): extents at least
 * this large are pulled out of bitmaps.
 */
#define FORCE_EXTENT_THRESHOLD        SZ_1M

/* Slab cache used for struct btrfs_free_space entries. */
static struct kmem_cache *btrfs_free_space_cachep;
/* Slab cache used for the bitmap buffer attached to a bitmap entry. */
static struct kmem_cache *btrfs_free_space_bitmap_cachep;

/*
 * A (start, length) byte range that can be linked into a list.
 *
 * NOTE(review): presumably describes a range that is currently being trimmed;
 * the users of this struct are outside this part of the file - confirm.
 */
struct btrfs_trim_range {
        /* First byte of the range. */
        u64 start;
        /* Length of the range in bytes. */
        u64 bytes;
        /* Linkage into the list that holds the range. */
        struct list_head list;
};

static int link_free_space(struct btrfs_free_space_ctl *ctl,
                           struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
                              struct btrfs_free_space *info, bool update_stat);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
                         struct btrfs_free_space *bitmap_info, u64 *offset,
                         u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
                        struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
                              struct btrfs_free_space *info, u64 offset,
                              u64 bytes, bool update_stats);

/*
 * Finish a running crc32c: invert the accumulated value and store it at
 * @result as a 32-bit little-endian number.  @result may be unaligned.
 */
static void btrfs_crc32c_final(u32 crc, u8 *result)
{
        const u32 inverted = ~crc;

        put_unaligned_le32(inverted, result);
}

/*
 * Tear down every entry in @ctl's offset-indexed tree, starting from the
 * last node.  Extent entries are unlinked and returned to the slab cache,
 * bitmap entries are handed to free_bitmap().
 *
 * cond_resched_lock() is called on ctl->tree_lock after every entry, so the
 * caller is expected to hold that lock.
 */
static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
        struct rb_node *last;

        for (last = rb_last(&ctl->free_space_offset); last != NULL;
             last = rb_last(&ctl->free_space_offset)) {
                struct btrfs_free_space *entry;

                entry = rb_entry(last, struct btrfs_free_space, offset_index);
                if (entry->bitmap) {
                        free_bitmap(ctl, entry);
                } else {
                        unlink_free_space(ctl, entry, true);
                        kmem_cache_free(btrfs_free_space_cachep, entry);
                }

                cond_resched_lock(&ctl->tree_lock);
        }
}

/*
 * Find the inode that backs a free space cache.
 *
 * Searches @root for the free space header item keyed by
 * (BTRFS_FREE_SPACE_OBJECTID, 0, @offset), reads the inode location stored in
 * that header and grabs the inode through btrfs_iget_path().
 *
 * Returns the inode on success, ERR_PTR(-ENOENT) when no header item exists,
 * or another ERR_PTR() when the search or the inode lookup fails.  On a
 * successful lookup the mapping's gfp mask is constrained to exclude
 * __GFP_FS and __GFP_HIGHMEM.
 */
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
                                               struct btrfs_path *path,
                                               u64 offset)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key search_key = {
                .objectid = BTRFS_FREE_SPACE_OBJECTID,
                .type = 0,
                .offset = offset,
        };
        struct btrfs_free_space_header *hdr;
        struct btrfs_disk_key raw_location;
        struct btrfs_key inode_location;
        struct extent_buffer *eb;
        struct inode *inode;
        unsigned nofs_flag;
        int ret;

        ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
        if (ret < 0)
                return ERR_PTR(ret);
        if (ret > 0) {
                btrfs_release_path(path);
                return ERR_PTR(-ENOENT);
        }

        /* Copy the inode's key out of the header before dropping the path. */
        eb = path->nodes[0];
        hdr = btrfs_item_ptr(eb, path->slots[0],
                             struct btrfs_free_space_header);
        btrfs_free_space_key(eb, hdr, &raw_location);
        btrfs_disk_key_to_cpu(&inode_location, &raw_location);
        btrfs_release_path(path);

        /*
         * This is frequently reached with a transaction handle held, so force
         * NOFS allocations to avoid deadlocking.
         */
        nofs_flag = memalloc_nofs_save();
        inode = btrfs_iget_path(fs_info->sb, inode_location.objectid, root,
                                path);
        btrfs_release_path(path);
        memalloc_nofs_restore(nofs_flag);

        if (!IS_ERR(inode))
                mapping_set_gfp_mask(inode->i_mapping,
                                mapping_gfp_constraint(inode->i_mapping,
                                ~(__GFP_FS | __GFP_HIGHMEM)));

        return inode;
}

/*
 * Return a referenced free space cache inode for @block_group.
 *
 * If the block group already caches an inode pointer, a reference obtained
 * with igrab() is returned.  Otherwise the inode is looked up in the tree
 * root; if BLOCK_GROUP_FLAG_IREF was not yet set, the block group also takes
 * its own reference and remembers the inode.
 *
 * An inode lacking NODATASUM or NODATACOW gets both flags set and the block
 * group's disk cache state switched to BTRFS_DC_CLEAR.
 *
 * Returns the inode or an ERR_PTR() from the lookup.
 */
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
                struct btrfs_path *path)
{
        const u32 required = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct inode *inode = NULL;

        /* Fast path: reuse the inode the block group already points at. */
        spin_lock(&block_group->lock);
        if (block_group->inode)
                inode = igrab(block_group->inode);
        spin_unlock(&block_group->lock);
        if (inode)
                return inode;

        inode = __lookup_free_space_inode(fs_info->tree_root, path,
                                          block_group->start);
        if (IS_ERR(inode))
                return inode;

        spin_lock(&block_group->lock);
        if ((BTRFS_I(inode)->flags & required) != required) {
                btrfs_info(fs_info, "Old style space inode found, converting.");
                BTRFS_I(inode)->flags |= required;
                block_group->disk_cache_state = BTRFS_DC_CLEAR;
        }

        /* The first lookup gives the block group its own inode reference. */
        if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
                block_group->inode = igrab(inode);
        spin_unlock(&block_group->lock);

        return inode;
}

/*
 * Create the on-disk items for a free space cache inode.
 *
 * Two items are inserted into @root:
 *  - an inode item for @ino (regular file, mode 0600, nlink 1, size 0) with
 *    the NOCOMPRESS, PREALLOC, NODATASUM and NODATACOW flags set;
 *  - a zeroed free space header keyed by (BTRFS_FREE_SPACE_OBJECTID, 0,
 *    @offset) that records the key of the inode item.
 *
 * Returns 0 on success or the error from the failing insertion.
 */
static int __create_free_space_inode(struct btrfs_root *root,
                                     struct btrfs_trans_handle *trans,
                                     struct btrfs_path *path,
                                     u64 ino, u64 offset)
{
        /* We inline CRCs for the free disk space cache */
        const u64 inode_flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
                                BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
        struct btrfs_key hdr_key = {
                .objectid = BTRFS_FREE_SPACE_OBJECTID,
                .type = 0,
                .offset = offset,
        };
        struct btrfs_disk_key inode_disk_key;
        struct btrfs_free_space_header *hdr;
        struct btrfs_inode_item *ii;
        struct extent_buffer *eb;
        int ret;

        ret = btrfs_insert_empty_inode(trans, root, path, ino);
        if (ret)
                return ret;

        eb = path->nodes[0];
        ii = btrfs_item_ptr(eb, path->slots[0], struct btrfs_inode_item);
        /* Remember the inode item's key; the header stores it below. */
        btrfs_item_key(eb, &inode_disk_key, path->slots[0]);
        memzero_extent_buffer(eb, (unsigned long)ii, sizeof(*ii));
        btrfs_set_inode_generation(eb, ii, trans->transid);
        btrfs_set_inode_size(eb, ii, 0);
        btrfs_set_inode_nbytes(eb, ii, 0);
        btrfs_set_inode_uid(eb, ii, 0);
        btrfs_set_inode_gid(eb, ii, 0);
        btrfs_set_inode_mode(eb, ii, S_IFREG | 0600);
        btrfs_set_inode_flags(eb, ii, inode_flags);
        btrfs_set_inode_nlink(eb, ii, 1);
        btrfs_set_inode_transid(eb, ii, trans->transid);
        btrfs_set_inode_block_group(eb, ii, offset);
        btrfs_mark_buffer_dirty(trans, eb);
        btrfs_release_path(path);

        ret = btrfs_insert_empty_item(trans, root, path, &hdr_key,
                                      sizeof(struct btrfs_free_space_header));
        if (ret < 0) {
                btrfs_release_path(path);
                return ret;
        }

        eb = path->nodes[0];
        hdr = btrfs_item_ptr(eb, path->slots[0],
                             struct btrfs_free_space_header);
        memzero_extent_buffer(eb, (unsigned long)hdr, sizeof(*hdr));
        btrfs_set_free_space_key(eb, hdr, &inode_disk_key);
        btrfs_mark_buffer_dirty(trans, eb);
        btrfs_release_path(path);

        return 0;
}

/*
 * Allocate a fresh object id in the tree root and create the free space
 * cache inode for @block_group under it.
 *
 * Returns 0 on success or a negative errno.
 */
int create_free_space_inode(struct btrfs_trans_handle *trans,
                            struct btrfs_block_group *block_group,
                            struct btrfs_path *path)
{
        struct btrfs_root *tree_root = trans->fs_info->tree_root;
        u64 objectid;
        int ret;

        ret = btrfs_get_free_objectid(tree_root, &objectid);
        if (ret < 0)
                return ret;

        return __create_free_space_inode(tree_root, trans, path, objectid,
                                         block_group->start);
}

/*
 * Remove the free space cache inode of @block_group and its header item.
 *
 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 * handles lookup, otherwise it takes ownership and iputs the inode.
 * Don't reuse an inode pointer after passing it into this function.
 *
 * The inode is added to the orphan items and its link count cleared; the
 * free space header item keyed by the block group start is then deleted from
 * the tree root.
 *
 * Returns 0 on success (including when the inode or the header item does not
 * exist) or a negative errno.
 */
int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
                                  struct inode *inode,
                                  struct btrfs_block_group *block_group)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        if (!inode)
                inode = lookup_free_space_inode(block_group, path);
        if (IS_ERR(inode)) {
                /* A missing inode is not an error: ret stays 0. */
                if (PTR_ERR(inode) != -ENOENT)
                        ret = PTR_ERR(inode);
                goto out;
        }
        ret = btrfs_orphan_add(trans, BTRFS_I(inode));
        if (ret) {
                /* Still drop the reference we own before bailing out. */
                btrfs_add_delayed_iput(BTRFS_I(inode));
                goto out;
        }
        clear_nlink(inode);
        /* One for the block groups ref */
        spin_lock(&block_group->lock);
        if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
                block_group->inode = NULL;
                spin_unlock(&block_group->lock);
                iput(inode);
        } else {
                spin_unlock(&block_group->lock);
        }
        /* One for the lookup ref */
        btrfs_add_delayed_iput(BTRFS_I(inode));

        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
        key.type = 0;
        key.offset = block_group->start;
        ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
                                -1, 1);
        if (ret) {
                /* ret > 0 means the header item is not there: nothing to delete. */
                if (ret > 0)
                        ret = 0;
                goto out;
        }
        ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Truncate a free space cache inode to size zero.
 *
 * @trans:       running transaction
 * @block_group: optional block group owning the cache; when given, the
 *               transaction's cache_write_mutex is held for the whole
 *               operation, any pending cache IO for the group is waited for,
 *               and its disk cache state is set to BTRFS_DC_CLEAR
 * @vfs_inode:   the cache inode to truncate
 *
 * Returns 0 on success.  On any failure the transaction is aborted and the
 * negative errno is returned.
 */
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
                                    struct btrfs_block_group *block_group,
                                    struct inode *vfs_inode)
{
        struct btrfs_truncate_control control = {
                .inode = BTRFS_I(vfs_inode),
                .new_size = 0,
                .ino = btrfs_ino(BTRFS_I(vfs_inode)),
                .min_type = BTRFS_EXTENT_DATA_KEY,
                .clear_extent_range = true,
        };
        struct btrfs_inode *inode = BTRFS_I(vfs_inode);
        struct btrfs_root *root = inode->root;
        struct extent_state *cached_state = NULL;
        int ret = 0;
        /* True once cache_write_mutex has been taken; released at "fail". */
        bool locked = false;

        if (block_group) {
                struct btrfs_path *path = btrfs_alloc_path();

                if (!path) {
                        ret = -ENOMEM;
                        goto fail;
                }
                locked = true;
                mutex_lock(&trans->transaction->cache_write_mutex);
                if (!list_empty(&block_group->io_list)) {
                        /*
                         * The group is queued on an io list: take it off,
                         * wait for its cache IO and drop a block group
                         * reference.
                         */
                        list_del_init(&block_group->io_list);

                        btrfs_wait_cache_io(trans, block_group, path);
                        btrfs_put_block_group(block_group);
                }

                /*
                 * now that we've truncated the cache away, its no longer
                 * setup or written
                 */
                spin_lock(&block_group->lock);
                block_group->disk_cache_state = BTRFS_DC_CLEAR;
                spin_unlock(&block_group->lock);
                btrfs_free_path(path);
        }

        btrfs_i_size_write(inode, 0);
        truncate_pagecache(vfs_inode, 0);

        /* Lock the whole file range while extent maps and items are dropped. */
        lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
        btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);

        /*
         * We skip the throttling logic for free space cache inodes, so we don't
         * need to check for -EAGAIN.
         */
        ret = btrfs_truncate_inode_items(trans, root, &control);

        /* Account what was truncated even if the call above failed. */
        inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
        btrfs_inode_safe_disk_i_size_write(inode, control.last_size);

        unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
        if (ret)
                goto fail;

        ret = btrfs_update_inode(trans, inode);

fail:
        if (locked)
                mutex_unlock(&trans->transaction->cache_write_mutex);
        if (ret)
                btrfs_abort_transaction(trans, ret);

        return ret;
}

/*
 * Start synchronous readahead on @inode's mapping from page index 0, using
 * a temporary on-stack readahead state.  The request size passed to
 * page_cache_sync_readahead() is the index of the last page covered by
 * i_size.
 */
static void readahead_cache(struct inode *inode)
{
        const unsigned long last_index =
                (i_size_read(inode) - 1) >> PAGE_SHIFT;
        struct file_ra_state ra;

        file_ra_state_init(&ra, inode->i_mapping);
        page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}

static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
                       int write)
{
        int num_pages;

        num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

        /* Make sure we can fit our crcs and generation into the first page */
        if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
                return -ENOSPC;

        memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));

        io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
        if (!io_ctl->pages)
                return -ENOMEM;

        io_ctl->num_pages = num_pages;
        io_ctl->fs_info = inode_to_fs_info(inode);
        io_ctl->inode = inode;

        return 0;
}
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);

/* Release the page pointer array of @io_ctl and reset the pointer to NULL. */
static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
        void *pages = io_ctl->pages;

        io_ctl->pages = NULL;
        kfree(pages);
}

/*
 * Forget the currently mapped page: clear both the cursor and the page
 * start pointer.  Does nothing when no page is mapped.
 */
static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
        if (!io_ctl->cur)
                return;

        io_ctl->cur = NULL;
        io_ctl->orig = NULL;
}

/*
 * Map the next page of @io_ctl: take pages[index], advance the index, point
 * both the cursor and the page start at the page's address and reset the
 * remaining size to PAGE_SIZE.  When @clear is non-zero the page is zeroed.
 */
static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
        void *kaddr;

        ASSERT(io_ctl->index < io_ctl->num_pages);

        io_ctl->page = io_ctl->pages[io_ctl->index];
        io_ctl->index++;

        kaddr = page_address(io_ctl->page);
        io_ctl->cur = kaddr;
        io_ctl->orig = kaddr;
        io_ctl->size = PAGE_SIZE;

        if (clear)
                clear_page(kaddr);
}

/*
 * Unmap the current page, then for every page held in @io_ctl clear its
 * checked state, unlock it and drop the page reference.  Empty slots in the
 * array are skipped; the array itself is left as is.
 */
static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
        int idx;

        io_ctl_unmap_page(io_ctl);

        for (idx = 0; idx < io_ctl->num_pages; idx++) {
                struct page *page = io_ctl->pages[idx];

                if (!page)
                        continue;

                btrfs_folio_clear_checked(io_ctl->fs_info, page_folio(page),
                                          page_offset(page), PAGE_SIZE);
                unlock_page(page);
                put_page(page);
        }
}

/*
 * Find or create every page of the cache file and store it in @io_ctl.
 *
 * When @uptodate is true, pages that are not yet uptodate are read in and
 * verified to still belong to the inode's mapping.  Once all pages are in
 * place their dirty bit is cleared for IO.
 *
 * Returns 0 on success.  On failure all pages collected so far are dropped
 * and -ENOMEM, -EIO or the error from set_page_extent_mapped() is returned.
 */
static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
        struct inode *inode = io_ctl->inode;
        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
        int ret;
        int idx;

        for (idx = 0; idx < io_ctl->num_pages; idx++) {
                struct page *page;

                page = find_or_create_page(inode->i_mapping, idx, mask);
                if (!page) {
                        ret = -ENOMEM;
                        goto drop;
                }

                ret = set_page_extent_mapped(page);
                if (ret < 0) {
                        /* Not recorded in the array yet, release it by hand. */
                        unlock_page(page);
                        put_page(page);
                        goto drop;
                }

                io_ctl->pages[idx] = page;
                if (!uptodate || PageUptodate(page))
                        continue;

                btrfs_read_folio(NULL, page_folio(page));
                lock_page(page);
                if (page->mapping != inode->i_mapping) {
                        btrfs_err(BTRFS_I(inode)->root->fs_info,
                                  "free space cache page truncated");
                        ret = -EIO;
                        goto drop;
                }
                if (!PageUptodate(page)) {
                        btrfs_err(BTRFS_I(inode)->root->fs_info,
                                   "error reading free space cache");
                        ret = -EIO;
                        goto drop;
                }
        }

        for (idx = 0; idx < io_ctl->num_pages; idx++)
                clear_page_dirty_for_io(io_ctl->pages[idx]);

        return 0;

drop:
        io_ctl_drop_pages(io_ctl);
        return ret;
}

/*
 * Map (and zero) the next page, skip the area reserved for the per-page
 * crcs, and write @generation as a little-endian u64 right behind it.  The
 * cursor ends up just past the generation.
 */
static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
        io_ctl_map_page(io_ctl, 1);

        /*
         * The crc area (one u32 per page) comes first; without crcs there
         * would only be the 64bit generation at the front of the page.
         */
        io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
        io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);

        put_unaligned_le64(generation, io_ctl->cur);
        io_ctl->cur += sizeof(u64);
}

/*
 * Skip the crc area of the currently mapped page and compare the stored
 * little-endian generation with @generation.
 *
 * Returns 0 and advances the cursor past the generation on a match.  On a
 * mismatch a rate-limited error is logged, the page is unmapped and -EIO is
 * returned.
 */
static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
        u64 on_disk_gen;

        /*
         * The crc area (one u32 per page) comes first; without crcs there
         * would only be the 64bit generation at the front of the page.
         */
        io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
        io_ctl->cur += sizeof(u32) * io_ctl->num_pages;

        on_disk_gen = get_unaligned_le64(io_ctl->cur);
        if (on_disk_gen == generation) {
                io_ctl->cur += sizeof(u64);
                return 0;
        }

        btrfs_err_rl(io_ctl->fs_info,
                "space cache generation (%llu) does not match inode (%llu)",
                        on_disk_gen, generation);
        io_ctl_unmap_page(io_ctl);
        return -EIO;
}

/*
 * Checksum the currently mapped page and store the result in slot @index of
 * the crc array at the start of page 0.  For page 0 itself the crc array is
 * excluded from the checksummed range.  The page is unmapped afterwards.
 */
static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
        u32 *crc_slots;
        u32 crc = ~(u32)0;
        unsigned skip = 0;

        /* Page 0 starts with the crc array, which is not checksummed. */
        if (index == 0)
                skip = sizeof(u32) * io_ctl->num_pages;

        crc = crc32c(crc, io_ctl->orig + skip, PAGE_SIZE - skip);
        btrfs_crc32c_final(crc, (u8 *)&crc);
        io_ctl_unmap_page(io_ctl);

        crc_slots = page_address(io_ctl->pages[0]);
        crc_slots[index] = crc;
}

/*
 * Map the next page and verify its checksum against slot @index of the crc
 * array stored at the start of page 0.  For page 0 itself the crc array is
 * excluded from the checksummed range.
 *
 * Returns 0 with the page left mapped on success.  On a mismatch a
 * rate-limited error is logged, the page is unmapped and -EIO is returned.
 */
static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
        u32 *crc_slots;
        u32 expected;
        u32 crc = ~(u32)0;
        unsigned skip = 0;

        /* Page 0 starts with the crc array, which is not checksummed. */
        if (index == 0)
                skip = sizeof(u32) * io_ctl->num_pages;

        crc_slots = page_address(io_ctl->pages[0]);
        expected = crc_slots[index];

        io_ctl_map_page(io_ctl, 0);
        crc = crc32c(crc, io_ctl->orig + skip, PAGE_SIZE - skip);
        btrfs_crc32c_final(crc, (u8 *)&crc);
        if (expected == crc)
                return 0;

        btrfs_err_rl(io_ctl->fs_info,
                "csum mismatch on free space cache");
        io_ctl_unmap_page(io_ctl);
        return -EIO;
}

/*
 * Append one free space entry (@offset, @bytes) at the cursor.  The entry
 * type is BITMAP when @bitmap is non-NULL and EXTENT otherwise.
 *
 * When the mapped page cannot hold another entry, its crc is stored and the
 * next page, if there is one, is mapped and zeroed.
 *
 * Returns 0 on success or -ENOSPC when no page is currently mapped.
 */
static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
                            void *bitmap)
{
        const size_t entry_size = sizeof(struct btrfs_free_space_entry);
        struct btrfs_free_space_entry *slot;

        if (!io_ctl->cur)
                return -ENOSPC;

        slot = io_ctl->cur;
        put_unaligned_le64(offset, &slot->offset);
        put_unaligned_le64(bytes, &slot->bytes);
        if (bitmap)
                slot->type = BTRFS_FREE_SPACE_BITMAP;
        else
                slot->type = BTRFS_FREE_SPACE_EXTENT;
        io_ctl->cur += entry_size;
        io_ctl->size -= entry_size;

        /* Still room for at least one more entry in this page. */
        if (io_ctl->size >= entry_size)
                return 0;

        io_ctl_set_crc(io_ctl, io_ctl->index - 1);

        /* Map the next page, unless this was the last one. */
        if (io_ctl->index < io_ctl->num_pages)
                io_ctl_map_page(io_ctl, 1);

        return 0;
}

static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
        if (!io_ctl->cur)
                return -ENOSPC;

        /*
         * If we aren't at the start of the current page, unmap this one and
         * map the next one if there is any left.
         */
        if (io_ctl->cur != io_ctl->orig) {
                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
                if (io_ctl->index >= io_ctl->num_pages)
                        return -ENOSPC;
                io_ctl_map_page(io_ctl, 0);
        }

        copy_page(io_ctl->cur, bitmap);
        io_ctl_set_crc(io_ctl, io_ctl->index - 1);
        if (io_ctl->index < io_ctl->num_pages)
                io_ctl_map_page(io_ctl, 0);
        return 0;
}

/*
 * Finish the cache file: close the current page, then map, zero and
 * checksum every page that has not been used yet.
 */
static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
        /*
         * A cursor sitting at the page start means nothing was written to
         * this page, so it only needs unmapping.  Otherwise the page was
         * modified and its crc has to be stored.
         */
        if (io_ctl->cur == io_ctl->orig)
                io_ctl_unmap_page(io_ctl);
        else
                io_ctl_set_crc(io_ctl, io_ctl->index - 1);

        for (; io_ctl->index < io_ctl->num_pages; ) {
                io_ctl_map_page(io_ctl, 1);
                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
        }
}

/*
 * Read the next on-disk entry into @entry (offset and bytes) and @type.
 *
 * If no page is mapped, the next page is mapped and crc-checked first.  Once
 * the mapped page cannot hold another entry it is unmapped, so the following
 * call moves on to the next page.
 *
 * Returns 0 on success or the error from the crc check.
 */
static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
                            struct btrfs_free_space *entry, u8 *type)
{
        const size_t entry_size = sizeof(struct btrfs_free_space_entry);
        struct btrfs_free_space_entry *disk;

        if (!io_ctl->cur) {
                int ret = io_ctl_check_crc(io_ctl, io_ctl->index);

                if (ret)
                        return ret;
        }

        disk = io_ctl->cur;
        entry->offset = get_unaligned_le64(&disk->offset);
        entry->bytes = get_unaligned_le64(&disk->bytes);
        *type = disk->type;
        io_ctl->cur += entry_size;
        io_ctl->size -= entry_size;

        if (io_ctl->size < entry_size)
                io_ctl_unmap_page(io_ctl);

        return 0;
}

/*
 * Map and crc-check the next page, copy it into entry->bitmap and unmap it.
 *
 * Returns 0 on success or the error from the crc check, in which case
 * nothing is copied.
 */
static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
                              struct btrfs_free_space *entry)
{
        int ret = io_ctl_check_crc(io_ctl, io_ctl->index);

        if (!ret) {
                copy_page(entry->bitmap, io_ctl->cur);
                io_ctl_unmap_page(io_ctl);
        }

        return ret;
}

/*
 * Recompute ctl->extents_thresh, the number of extent entries allowed for
 * the block group behind @ctl.
 *
 * The memory budget is MAX_CACHE_BYTES_PER_GIG per full GiB of block group
 * length (a single unit for groups below 1GiB).  The share left after
 * subtracting total_bitmaps * unit, capped at half the budget, is divided by
 * the size of one struct btrfs_free_space.
 *
 * Also logs an error and asserts if total_bitmaps exceeds the number of
 * bitmaps needed to cover the block group.
 */
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
        struct btrfs_block_group *block_group = ctl->block_group;
        const u64 bg_len = block_group->length;
        const u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
        u64 max_bitmaps = div64_u64(bg_len + bytes_per_bg - 1, bytes_per_bg);
        u64 budget;
        u64 bitmap_bytes;
        u64 extent_bytes;

        /* Always allow at least one bitmap. */
        if (!max_bitmaps)
                max_bitmaps = 1;

        if (ctl->total_bitmaps > max_bitmaps)
                btrfs_err(block_group->fs_info,
"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
                          block_group->start, block_group->length,
                          ctl->total_bitmaps, ctl->unit, max_bitmaps,
                          bytes_per_bg);
        ASSERT(ctl->total_bitmaps <= max_bitmaps);

        /*
         * The goal is to keep memory use at MAX_CACHE_BYTES_PER_GIG per 1GiB
         * of space.  Because extents >= FORCE_EXTENT_THRESHOLD are pulled out
         * of bitmaps, actual usage may end up above this.
         */
        if (bg_len < SZ_1G)
                budget = MAX_CACHE_BYTES_PER_GIG;
        else
                budget = MAX_CACHE_BYTES_PER_GIG * div_u64(bg_len, SZ_1G);

        bitmap_bytes = ctl->total_bitmaps * ctl->unit;

        /*
         * Extent entries get what is left of the budget, but never more than
         * half of it.
         */
        extent_bytes = budget - bitmap_bytes;
        if (extent_bytes > (budget >> 1))
                extent_bytes = budget >> 1;

        ctl->extents_thresh =
                div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}

/*
 * Load the on-disk free space cache stored in @inode into @ctl.
 *
 * @root:   root holding the free space header item
 * @inode:  the cache inode
 * @ctl:    free space control that receives the entries
 * @path:   scratch path, released before returning
 * @offset: key offset of the free space header item
 *
 * The header item supplies the entry count, bitmap count and generation.
 * Page 0 of the file holds the per-page crcs and the generation; the entries
 * follow, and the bitmap pages come after all entries.
 *
 * Returns 1 when the cache was loaded, 0 when there is nothing usable to
 * load (empty inode, missing header, zero or mismatching generation, no
 * entries), and a non-zero error otherwise.  On errors after the pages were
 * prepared, everything already linked into @ctl is removed again.
 */
static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
                                   struct btrfs_free_space_ctl *ctl,
                                   struct btrfs_path *path, u64 offset)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_free_space_header *header;
        struct extent_buffer *leaf;
        struct btrfs_io_ctl io_ctl;
        struct btrfs_key key;
        struct btrfs_free_space *e, *n;
        LIST_HEAD(bitmaps);
        u64 num_entries;
        u64 num_bitmaps;
        u64 generation;
        u8 type;
        int ret = 0;

        /* Nothing in the space cache, goodbye */
        if (!i_size_read(inode))
                return 0;

        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
        key.offset = offset;
        key.type = 0;

        /* A failed or empty search just means there is no cache to load. */
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                return 0;
        else if (ret > 0) {
                btrfs_release_path(path);
                return 0;
        }

        leaf = path->nodes[0];
        header = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_free_space_header);
        num_entries = btrfs_free_space_entries(leaf, header);
        num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
        generation = btrfs_free_space_generation(leaf, header);
        btrfs_release_path(path);

        if (!BTRFS_I(inode)->generation) {
                btrfs_info(fs_info,
                           "the free space cache file (%llu) is invalid, skip it",
                           offset);
                return 0;
        }

        if (BTRFS_I(inode)->generation != generation) {
                btrfs_err(fs_info,
                          "free space inode generation (%llu) did not match free space cache generation (%llu)",
                          BTRFS_I(inode)->generation, generation);
                return 0;
        }

        if (!num_entries)
                return 0;

        ret = io_ctl_init(&io_ctl, inode, 0);
        if (ret)
                return ret;

        readahead_cache(inode);

        ret = io_ctl_prepare_pages(&io_ctl, true);
        if (ret)
                goto out;

        ret = io_ctl_check_crc(&io_ctl, 0);
        if (ret)
                goto free_cache;

        ret = io_ctl_check_generation(&io_ctl, generation);
        if (ret)
                goto free_cache;

        while (num_entries) {
                e = kmem_cache_zalloc(btrfs_free_space_cachep,
                                      GFP_NOFS);
                if (!e) {
                        ret = -ENOMEM;
                        goto free_cache;
                }

                ret = io_ctl_read_entry(&io_ctl, e, &type);
                if (ret) {
                        kmem_cache_free(btrfs_free_space_cachep, e);
                        goto free_cache;
                }

                /* A zero-length entry means the cache is corrupted. */
                if (!e->bytes) {
                        ret = -1;
                        kmem_cache_free(btrfs_free_space_cachep, e);
                        goto free_cache;
                }

                if (type == BTRFS_FREE_SPACE_EXTENT) {
                        spin_lock(&ctl->tree_lock);
                        ret = link_free_space(ctl, e);
                        spin_unlock(&ctl->tree_lock);
                        if (ret) {
                                btrfs_err(fs_info,
                                        "Duplicate entries in free space cache, dumping");
                                kmem_cache_free(btrfs_free_space_cachep, e);
                                goto free_cache;
                        }
                } else {
                        ASSERT(num_bitmaps);
                        num_bitmaps--;
                        e->bitmap = kmem_cache_zalloc(
                                        btrfs_free_space_bitmap_cachep, GFP_NOFS);
                        if (!e->bitmap) {
                                ret = -ENOMEM;
                                kmem_cache_free(
                                        btrfs_free_space_cachep, e);
                                goto free_cache;
                        }
                        spin_lock(&ctl->tree_lock);
                        ret = link_free_space(ctl, e);
                        if (ret) {
                                spin_unlock(&ctl->tree_lock);
                                btrfs_err(fs_info,
                                        "Duplicate entries in free space cache, dumping");
                                /*
                                 * The entry never made it into the tree, so
                                 * the cleanup at free_cache cannot see it:
                                 * free the bitmap here as well, otherwise it
                                 * is leaked.
                                 */
                                kmem_cache_free(btrfs_free_space_bitmap_cachep,
                                                e->bitmap);
                                kmem_cache_free(btrfs_free_space_cachep, e);
                                goto free_cache;
                        }
                        ctl->total_bitmaps++;
                        recalculate_thresholds(ctl);
                        spin_unlock(&ctl->tree_lock);
                        /* The bitmap contents are read after all entries. */
                        list_add_tail(&e->list, &bitmaps);
                }

                num_entries--;
        }

        io_ctl_unmap_page(&io_ctl);

        /*
         * The bitmap pages follow the entries in the file, in the same order
         * in which the bitmap entries were added to the list above.
         */
        list_for_each_entry_safe(e, n, &bitmaps, list) {
                list_del_init(&e->list);
                ret = io_ctl_read_bitmap(&io_ctl, e);
                if (ret)
                        goto free_cache;
        }

        io_ctl_drop_pages(&io_ctl);
        ret = 1;
out:
        io_ctl_free(&io_ctl);
        return ret;
free_cache:
        io_ctl_drop_pages(&io_ctl);

        /* Throw away everything that was linked into the tree so far. */
        spin_lock(&ctl->tree_lock);
        __btrfs_remove_free_space_cache(ctl);
        spin_unlock(&ctl->tree_lock);
        goto out;
}

/*
 * Drain @ctl into @block_group's free space cache, one entry at a time.
 *
 * The function is entered and left with ctl->tree_lock held (the caller in
 * load_free_space_cache() takes it around the call).  The lock is dropped
 * around every btrfs_add_free_space() call and re-taken afterwards.
 *
 * Returns 0 once @ctl's offset tree is empty, otherwise the non-zero value
 * returned by the btrfs_add_free_space() call that stopped the loop.
 */
static int copy_free_space_cache(struct btrfs_block_group *block_group,
                                 struct btrfs_free_space_ctl *ctl)
{
        struct btrfs_free_space *info;
        struct rb_node *n;
        int ret = 0;

        /*
         * Always restart from the first node: every pass either unlinks the
         * entry, clears bits from it, or frees it.
         */
        while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
                info = rb_entry(n, struct btrfs_free_space, offset_index);
                if (!info->bitmap) {
                        /* Copy the range out; the entry is freed before the add. */
                        const u64 offset = info->offset;
                        const u64 bytes = info->bytes;

                        unlink_free_space(ctl, info, true);
                        spin_unlock(&ctl->tree_lock);
                        kmem_cache_free(btrfs_free_space_cachep, info);
                        ret = btrfs_add_free_space(block_group, offset, bytes);
                        spin_lock(&ctl->tree_lock);
                } else {
                        /*
                         * offset/bytes are passed by pointer, so search_bitmap()
                         * may rewrite them; the values it leaves behind are the
                         * range moved below.
                         */
                        u64 offset = info->offset;
                        u64 bytes = ctl->unit;

                        ret = search_bitmap(ctl, info, &offset, &bytes, false);
                        if (ret == 0) {
                                bitmap_clear_bits(ctl, info, offset, bytes, true);
                                spin_unlock(&ctl->tree_lock);
                                ret = btrfs_add_free_space(block_group, offset,
                                                           bytes);
                                spin_lock(&ctl->tree_lock);
                        } else {
                                /*
                                 * Search failed - presumably nothing is left in
                                 * this bitmap.  Free it and clear ret so the
                                 * loop keeps going.
                                 */
                                free_bitmap(ctl, info);
                                ret = 0;
                        }
                }
                /* Give other tasks a chance between entries. */
                cond_resched_lock(&ctl->tree_lock);
        }
        return ret;
}

/* Lockdep class assigned below to the invalidate_lock of free space inodes. */
static struct lock_class_key btrfs_free_space_inode_key;

/*
 * Try to populate @block_group's free space ctl from its on-disk cache.
 *
 * The cache is read into a temporary ctl first.  Its free_space total is then
 * compared with (length - used - bytes_super) of the block group, and only on
 * a match are the entries copied into the block group.
 *
 * Returns 1 when the cache was loaded and copied.  Returns 0 when the cache
 * was not used: disk_cache_state is not BTRFS_DC_WRITTEN, the path
 * allocation or inode lookup failed, or the loaded data was rejected.
 * Negative results never reach the caller; they set disk_cache_state to
 * BTRFS_DC_CLEAR, log a warning and are reported as 0.
 */
int load_free_space_cache(struct btrfs_block_group *block_group)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space_ctl tmp_ctl = {};
        struct inode *inode;
        struct btrfs_path *path;
        int ret = 0;
        bool matched;
        /* Sampled once here, before any lock is taken. */
        u64 used = block_group->used;

        /*
         * Because we could potentially discard our loaded free space, we want
         * to load everything into a temporary structure first, and then if it's
         * valid copy it all into the actual free space ctl.
         */
        btrfs_init_free_space_ctl(block_group, &tmp_ctl);

        /*
         * If this block group has been marked to be cleared for one reason or
         * another then we can't trust the on disk cache, so just return.
         */
        spin_lock(&block_group->lock);
        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
                spin_unlock(&block_group->lock);
                return 0;
        }
        spin_unlock(&block_group->lock);

        path = btrfs_alloc_path();
        if (!path)
                return 0;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        /*
         * We must pass a path with search_commit_root set to btrfs_iget in
         * order to avoid a deadlock when allocating extents for the tree root.
         *
         * When we are COWing an extent buffer from the tree root, when looking
         * for a free extent, at extent-tree.c:find_free_extent(), we can find
         * block group without its free space cache loaded. When we find one
         * we must load its space cache which requires reading its free space
         * cache's inode item from the root tree. If this inode item is located
         * in the same leaf that we started COWing before, then we end up in
         * deadlock on the extent buffer (trying to read lock it when we
         * previously write locked it).
         *
         * It's safe to read the inode item using the commit root because
         * block groups, once loaded, stay in memory forever (until they are
         * removed) as well as their space caches once loaded. New block groups
         * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
         * we will never try to read their inode item while the fs is mounted.
         */
        inode = lookup_free_space_inode(block_group, path);
        if (IS_ERR(inode)) {
                btrfs_free_path(path);
                return 0;
        }

        /* We may have converted the inode and made the cache invalid. */
        spin_lock(&block_group->lock);
        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
                spin_unlock(&block_group->lock);
                btrfs_free_path(path);
                /* ret is still 0 here; out only needs to drop the inode. */
                goto out;
        }
        spin_unlock(&block_group->lock);

        /*
         * Reinitialize the class of struct inode's mapping->invalidate_lock for
         * free space inodes to prevent false positives related to locks for normal
         * inodes.
         */
        lockdep_set_class(&(&inode->i_data)->invalidate_lock,
                          &btrfs_free_space_inode_key);

        ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
                                      path, block_group->start);
        btrfs_free_path(path);
        /* Only a positive return carries on to validation. */
        if (ret <= 0)
                goto out;

        matched = (tmp_ctl.free_space == (block_group->length - used -
                                          block_group->bytes_super));

        if (matched) {
                spin_lock(&tmp_ctl.tree_lock);
                ret = copy_free_space_cache(block_group, &tmp_ctl);
                spin_unlock(&tmp_ctl.tree_lock);
                /*
                 * ret == 1 means we successfully loaded the free space cache,
                 * so we need to re-set it here.
                 */
                if (ret == 0)
                        ret = 1;
        } else {
                /*
                 * We need to call the _locked variant so we don't try to update
                 * the discard counters.
                 */
                spin_lock(&tmp_ctl.tree_lock);
                __btrfs_remove_free_space_cache(&tmp_ctl);
                spin_unlock(&tmp_ctl.tree_lock);
                btrfs_warn(fs_info,
                           "block group %llu has wrong amount of free space",
                           block_group->start);
                ret = -1;
        }
out:
        if (ret < 0) {
                /* This cache is bogus, make sure it gets cleared */
                spin_lock(&block_group->lock);
                block_group->disk_cache_state = BTRFS_DC_CLEAR;
                spin_unlock(&block_group->lock);
                ret = 0;

                btrfs_warn(fs_info,
                           "failed to load free space cache for block group %llu, rebuilding it now",
                           block_group->start);
        }

        /* Refresh the discardable accounting under the real ctl's lock. */
        spin_lock(&ctl->tree_lock);
        btrfs_discard_update_discardable(block_group);
        spin_unlock(&ctl->tree_lock);
        iput(inode);
        return ret;
}

/*
 * Add one cache entry to @io_ctl for every node in @ctl's offset tree, then
 * for every node of the first cluster on @block_group's cluster_list (if
 * there is one), and finally for every range on ctl->trimming_ranges.
 *
 * @entries is bumped once per entry and @bitmaps once per bitmap entry.
 * Bitmap entries are also appended to @bitmap_list so their bitmap data can
 * be written later.
 *
 * The caller in __btrfs_write_out_cache() holds ctl->tree_lock.  The
 * cluster's lock is taken here when the walk reaches the cluster tree and is
 * released before returning on every path.
 *
 * Returns 0 on success, or -ENOSPC whenever io_ctl_add_entry() fails
 * (regardless of the value it returned).
 */
static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
                              struct btrfs_free_space_ctl *ctl,
                              struct btrfs_block_group *block_group,
                              int *entries, int *bitmaps,
                              struct list_head *bitmap_list)
{
        int ret;
        struct btrfs_free_cluster *cluster = NULL;
        /* Non-NULL exactly while we hold that cluster's lock. */
        struct btrfs_free_cluster *cluster_locked = NULL;
        struct rb_node *node = rb_first(&ctl->free_space_offset);
        struct btrfs_trim_range *trim_entry;

        /* Get the cluster for this block_group if it exists */
        if (block_group && !list_empty(&block_group->cluster_list)) {
                cluster = list_entry(block_group->cluster_list.next,
                                     struct btrfs_free_cluster,
                                     block_group_list);
        }

        /* The main tree is empty: start directly with the cluster's tree. */
        if (!node && cluster) {
                cluster_locked = cluster;
                spin_lock(&cluster_locked->lock);
                node = rb_first(&cluster->root);
                cluster = NULL;
        }

        /* Write out the extent entries */
        while (node) {
                struct btrfs_free_space *e;

                e = rb_entry(node, struct btrfs_free_space, offset_index);
                /* Counted before the add, so a failed add is still counted. */
                *entries += 1;

                ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
                                       e->bitmap);
                if (ret)
                        goto fail;

                if (e->bitmap) {
                        list_add_tail(&e->list, bitmap_list);
                        *bitmaps += 1;
                }
                node = rb_next(node);
                /*
                 * Main tree exhausted: continue with the cluster's tree.
                 * Clearing cluster ensures this switch happens only once.
                 */
                if (!node && cluster) {
                        node = rb_first(&cluster->root);
                        cluster_locked = cluster;
                        spin_lock(&cluster_locked->lock);
                        cluster = NULL;
                }
        }
        if (cluster_locked) {
                spin_unlock(&cluster_locked->lock);
                cluster_locked = NULL;
        }

        /*
         * Make sure we don't miss any range that was removed from our rbtree
         * because trimming is running. Otherwise after a umount+mount (or crash
         * after committing the transaction) we would leak free space and get
         * an inconsistent free space cache report from fsck.
         */
        list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
                ret = io_ctl_add_entry(io_ctl, trim_entry->start,
                                       trim_entry->bytes, NULL);
                if (ret)
                        goto fail;
                *entries += 1;
        }

        return 0;
fail:
        /* cluster_locked is already NULL if the failure was in the trim loop. */
        if (cluster_locked)
                spin_unlock(&cluster_locked->lock);
        return -ENOSPC;
}

/*
 * Record @entries, @bitmaps and the transaction id in the free space header
 * item keyed by (BTRFS_FREE_SPACE_OBJECTID, 0, @offset) in @root.
 *
 * On success the inode's in-memory generation is set to trans->transid, the
 * leaf is marked dirty and @path is released.  If the search fails or the
 * item is not found, EXTENT_DELALLOC is cleared over the whole cache file.
 *
 * Returns 0 on success, -1 on any failure.
 */
static noinline_for_stack int
update_cache_item(struct btrfs_trans_handle *trans,
                  struct btrfs_root *root,
                  struct inode *inode,
                  struct btrfs_path *path, u64 offset,
                  int entries, int bitmaps)
{
        struct btrfs_free_space_header *hdr;
        struct extent_buffer *eb;
        struct btrfs_key key;
        int ret;

        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
        key.offset = offset;
        key.type = 0;

        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        if (ret < 0)
                goto clear_delalloc;

        eb = path->nodes[0];
        if (ret > 0) {
                /* No exact match: the item we want must be the previous slot. */
                struct btrfs_key prev_key;

                ASSERT(path->slots[0]);
                path->slots[0]--;
                btrfs_item_key_to_cpu(eb, &prev_key, path->slots[0]);
                if (prev_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
                    prev_key.offset != offset) {
                        clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
                                         inode->i_size - 1, EXTENT_DELALLOC,
                                         NULL);
                        btrfs_release_path(path);
                        return -1;
                }
        }

        BTRFS_I(inode)->generation = trans->transid;
        hdr = btrfs_item_ptr(eb, path->slots[0],
                             struct btrfs_free_space_header);
        btrfs_set_free_space_entries(eb, hdr, entries);
        btrfs_set_free_space_bitmaps(eb, hdr, bitmaps);
        btrfs_set_free_space_generation(eb, hdr, trans->transid);
        btrfs_mark_buffer_dirty(trans, eb);
        btrfs_release_path(path);

        return 0;

clear_delalloc:
        /* Search error: the path is left as btrfs_search_slot() returned it. */
        clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
                         EXTENT_DELALLOC, NULL);
        return -1;
}

/*
 * Add an extent entry to @io_ctl for every EXTENT_DIRTY range of the running
 * transaction's pinned_extents tree that overlaps @block_group, clamped to
 * the block group's bounds.  @entries is incremented for each range.
 *
 * Does nothing when @block_group is NULL.
 *
 * Returns 0 on success, -ENOSPC if io_ctl_add_entry() fails.
 */
static noinline_for_stack int write_pinned_extent_entries(
                            struct btrfs_trans_handle *trans,
                            struct btrfs_block_group *block_group,
                            struct btrfs_io_ctl *io_ctl,
                            int *entries)
{
        struct extent_io_tree *pinned;
        u64 cursor, found_start, found_end;

        if (!block_group)
                return 0;

        /*
         * Pinned extents must end up in the free space cache or the space
         * would leak.  The pinned extents are not expected to have been
         * switched yet, so the running transaction's tree is the right one.
         */
        pinned = &trans->transaction->pinned_extents;

        for (cursor = block_group->start;
             cursor < block_group->start + block_group->length;
             cursor = found_end) {
                u64 len;

                if (!find_first_extent_bit(pinned, cursor,
                                           &found_start, &found_end,
                                           EXTENT_DIRTY, NULL))
                        break;

                /* The next pinned extent lies past this block group. */
                if (found_start >= block_group->start + block_group->length)
                        break;

                /* Clamp to [cursor, end of block group); found_end is inclusive. */
                found_start = max(found_start, cursor);
                found_end = min(block_group->start + block_group->length,
                                found_end + 1);
                len = found_end - found_start;

                *entries += 1;
                if (io_ctl_add_entry(io_ctl, found_start, len, NULL))
                        return -ENOSPC;
        }

        return 0;
}

/*
 * Write the bitmap of every entry on @bitmap_list to @io_ctl, unlinking each
 * entry from the list once its bitmap has been added.
 *
 * Returns 0 on success.  On the first io_ctl_add_bitmap() failure returns
 * -ENOSPC, leaving the failing entry and all later ones on the list.
 */
static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
        struct btrfs_free_space *cur, *tmp;

        list_for_each_entry_safe(cur, tmp, bitmap_list, list) {
                if (io_ctl_add_bitmap(io_ctl, cur->bitmap))
                        return -ENOSPC;
                list_del_init(&cur->list);
        }

        return 0;
}

/*
 * Wait for all ordered IO on the cache @inode.  If the wait fails, clear
 * EXTENT_DELALLOC over the whole file before reporting the error.
 *
 * Returns the result of btrfs_wait_ordered_range().
 */
static int flush_dirty_cache(struct inode *inode)
{
        const int err = btrfs_wait_ordered_range(inode, 0, (u64)-1);

        if (!err)
                return 0;

        clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
                         EXTENT_DELALLOC, NULL);
        return err;
}

/*
 * Empty @bitmap_list, re-initialising the list head of every entry that was
 * on it.  The entries themselves are not freed.
 */
static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
        while (!list_empty(bitmap_list)) {
                struct btrfs_free_space *head;

                head = list_first_entry(bitmap_list, struct btrfs_free_space,
                                        list);
                list_del_init(&head->list);
        }
}

/*
 * Error unwind for a cache write: drop the pages held by @io_ctl, then
 * unlock the extent range [0, i_size - 1] of @inode's io_tree.
 */
static void noinline_for_stack
cleanup_write_cache_enospc(struct inode *inode,
                           struct btrfs_io_ctl *io_ctl,
                           struct extent_state **cached_state)
{
        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
        u64 last_byte;

        io_ctl_drop_pages(io_ctl);

        /* Read the size only after the pages are dropped, as before. */
        last_byte = i_size_read(inode) - 1;
        unlock_extent(tree, 0, last_byte, cached_state);
}

/*
 * Second half of a cache write: wait for the IO on io_ctl->inode and then
 * update the free space header item for @offset in @root.
 *
 * Returns 0 at once when io_ctl->inode is NULL.  Otherwise:
 *  - on failure the inode's page cache is invalidated and its in-memory
 *    generation is reset to 0;
 *  - the inode item is updated in either case (the result of
 *    btrfs_update_inode() is not checked);
 *  - when @block_group is set, its disk_cache_state is updated,
 *    io_ctl->inode is cleared and the inode reference is dropped.
 *
 * Returns 0 on success, or the non-zero result of flush_dirty_cache() or
 * update_cache_item().
 */
static int __btrfs_wait_cache_io(struct btrfs_root *root,
                                 struct btrfs_trans_handle *trans,
                                 struct btrfs_block_group *block_group,
                                 struct btrfs_io_ctl *io_ctl,
                                 struct btrfs_path *path, u64 offset)
{
        int ret;
        struct inode *inode = io_ctl->inode;

        /* No inode recorded in the io_ctl: nothing to wait for. */
        if (!inode)
                return 0;

        /* Flush the dirty pages in the cache file. */
        ret = flush_dirty_cache(inode);
        if (ret)
                goto out;

        /* Update the cache item to tell everyone this cache file is valid. */
        ret = update_cache_item(trans, root, inode, path, offset,
                                io_ctl->entries, io_ctl->bitmaps);
out:
        if (ret) {
                /* Generation 0 is written out by btrfs_update_inode() below. */
                invalidate_inode_pages2(inode->i_mapping);
                BTRFS_I(inode)->generation = 0;
                if (block_group)
                        btrfs_debug(root->fs_info,
          "failed to write free space cache for block group %llu error %d",
                                  block_group->start, ret);
        }
        btrfs_update_inode(trans, BTRFS_I(inode));

        if (block_group) {
                /* the dirty list is protected by the dirty_bgs_lock */
                spin_lock(&trans->transaction->dirty_bgs_lock);

                /* the disk_cache_state is protected by the block group lock */
                spin_lock(&block_group->lock);

                /*
                 * only mark this as written if we didn't get put back on
                 * the dirty list while waiting for IO.   Otherwise our
                 * cache state won't be right, and we won't get written again
                 */
                if (!ret && list_empty(&block_group->dirty_list))
                        block_group->disk_cache_state = BTRFS_DC_WRITTEN;
                else if (ret)
                        block_group->disk_cache_state = BTRFS_DC_ERROR;
                /* Success but back on the dirty list: state is left untouched. */

                spin_unlock(&block_group->lock);
                spin_unlock(&trans->transaction->dirty_bgs_lock);
                io_ctl->inode = NULL;
                iput(inode);
        }

        return ret;

}

/*
 * Wait for the cache IO of @block_group and finalise its cache item, using
 * the block group's own io_ctl, the fs tree root and the block group start
 * as the item offset.
 *
 * Returns whatever __btrfs_wait_cache_io() returns.
 */
int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
                        struct btrfs_block_group *block_group,
                        struct btrfs_path *path)
{
        struct btrfs_root *tree_root = block_group->fs_info->tree_root;
        struct btrfs_io_ctl *io_ctl = &block_group->io_ctl;

        return __btrfs_wait_cache_io(tree_root, trans, block_group, io_ctl,
                                     path, block_group->start);
}

/*
 * Write out cached info to an inode.
 *
 * @inode:       freespace inode we are writing out
 * @ctl:         free space cache we are going to write out
 * @block_group: block_group for this cache if it belongs to a block_group
 * @io_ctl:      holds context for the io
 * @trans:       the trans handle
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount.  This will return 0 if it was successful in writing the cache out,
 * or an errno if it was not.
 *
 * On the successful path the entry and bitmap counts are stored in @io_ctl
 * and writeback is started; waiting for it is left to the caller.
 *
 * Every path through the "out" label clears io_ctl->inode.  This includes the
 * case where a data block group still has delalloc_bytes: nothing is written,
 * the inode reference is dropped here and 0 is returned.
 */
static int __btrfs_write_out_cache(struct inode *inode,
                                   struct btrfs_free_space_ctl *ctl,
                                   struct btrfs_block_group *block_group,
                                   struct btrfs_io_ctl *io_ctl,
                                   struct btrfs_trans_handle *trans)
{
        struct extent_state *cached_state = NULL;
        LIST_HEAD(bitmap_list);
        int entries = 0;
        int bitmaps = 0;
        int ret;
        /* Set only on the delalloc bail-out, where we drop the inode ourselves. */
        int must_iput = 0;

        /* A zero-sized cache file cannot hold anything. */
        if (!i_size_read(inode))
                return -EIO;

        WARN_ON(io_ctl->pages);
        ret = io_ctl_init(io_ctl, inode, 1);
        if (ret)
                return ret;

        /*
         * For data block groups data_rwsem is held for writing from here
         * until the pages have been dirtied (or until an error unwinds).
         */
        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
                down_write(&block_group->data_rwsem);
                spin_lock(&block_group->lock);
                if (block_group->delalloc_bytes) {
                        /*
                         * Delalloc is still outstanding: skip the write.  The
                         * state is set to WRITTEN while the generation is
                         * zeroed.
                         */
                        block_group->disk_cache_state = BTRFS_DC_WRITTEN;
                        spin_unlock(&block_group->lock);
                        up_write(&block_group->data_rwsem);
                        BTRFS_I(inode)->generation = 0;
                        ret = 0;
                        must_iput = 1;
                        goto out;
                }
                spin_unlock(&block_group->lock);
        }

        /* Lock all pages first so we can lock the extent safely. */
        ret = io_ctl_prepare_pages(io_ctl, false);
        if (ret)
                goto out_unlock;

        lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
                    &cached_state);

        io_ctl_set_generation(io_ctl, trans->transid);

        mutex_lock(&ctl->cache_writeout_mutex);
        /* Write out the extent entries in the free space cache */
        spin_lock(&ctl->tree_lock);
        ret = write_cache_extent_entries(io_ctl, ctl,
                                         block_group, &entries, &bitmaps,
                                         &bitmap_list);
        if (ret)
                goto out_nospc_locked;

        /*
         * Some spaces that are freed in the current transaction are pinned,
         * they will be added into free space cache after the transaction is
         * committed, we shouldn't lose them.
         *
         * If this changes while we are working we'll get added back to
         * the dirty list and redo it.  No locking needed
         */
        ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
        if (ret)
                goto out_nospc_locked;

        /*
         * At last, we write out all the bitmaps and keep cache_writeout_mutex
         * locked while doing it because a concurrent trim can be manipulating
         * or freeing the bitmap.
         */
        ret = write_bitmap_entries(io_ctl, &bitmap_list);
        /* Both locks are released before ret is examined. */
        spin_unlock(&ctl->tree_lock);
        mutex_unlock(&ctl->cache_writeout_mutex);
        if (ret)
                goto out_nospc;

        /* Zero out the rest of the pages just to make sure */
        io_ctl_zero_remaining_pages(io_ctl);

        /* Everything is written out, now we dirty the pages in the file. */
        ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
                                io_ctl->num_pages, 0, i_size_read(inode),
                                &cached_state, false);
        if (ret)
                goto out_nospc;

        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
                up_write(&block_group->data_rwsem);
        /*
         * Release the pages and unlock the extent, we will flush
         * them out later
         */
        io_ctl_drop_pages(io_ctl);
        io_ctl_free(io_ctl);

        unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
                      &cached_state);

        /*
         * at this point the pages are under IO and we're happy,
         * The caller is responsible for waiting on them and updating
         * the cache and the inode
         */
        io_ctl->entries = entries;
        io_ctl->bitmaps = bitmaps;

        ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
        /*
         * NOTE(review): on failure io_ctl_free() runs a second time via
         * "out"; presumably it tolerates that - confirm.
         */
        if (ret)
                goto out;

        return 0;

        /* Failure with tree_lock and cache_writeout_mutex still held. */
out_nospc_locked:
        cleanup_bitmap_list(&bitmap_list);
        spin_unlock(&ctl->tree_lock);
        mutex_unlock(&ctl->cache_writeout_mutex);

        /* Failure with pages prepared and the extent range locked. */
out_nospc:
        cleanup_write_cache_enospc(inode, io_ctl, &cached_state);

        /* Failure with (for data block groups) data_rwsem still held. */
out_unlock:
        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
                up_write(&block_group->data_rwsem);

out:
        io_ctl->inode = NULL;
        io_ctl_free(io_ctl);
        if (ret) {
                invalidate_inode_pages2(inode->i_mapping);
                BTRFS_I(inode)->generation = 0;
        }
        /* Updates the inode item; the result is not checked. */
        btrfs_update_inode(trans, BTRFS_I(inode));
        if (must_iput)
                iput(inode);
        return ret;
}

int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
                          struct btrfs_block_group *block_group,
                          struct btrfs_path *path)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct inode *inode;
        int ret = 0;

        spin_lock(&block_group->lock);
        if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
                spin_unlock(&block_group->lock);
                return 0;
        }
        spin_unlock(&block_group->lock);

        inode = lookup_free_space_inode(block_group, path);
        if (IS_ERR(inode))
                return 0;

        ret = __btrfs_write_out_cache(inode, ctl, block_group,
                                      &block_group->io_ctl, trans);
        if (ret) {
                btrfs_debug(fs_info,
          "failed to write free space cache for block group %llu error %d",
                          block_group->start, ret);
                spin_lock(&block_group->lock);
                block_group->disk_cache_state = BTRFS_DC_ERROR;
                spin_unlock(&block_group->lock);

                block_group->io_ctl.inode = NULL;
                iput(inode);
        }

        /*
         * if ret == 0 the caller is expected to call btrfs_wait_cache_io
         * to wait for IO and put the inode
         */

        return ret;
}

/*
 * Convert a byte @offset into a bit index relative to @bitmap_start, where
 * each bit covers @unit bytes.  @offset must not be below @bitmap_start.
 */
static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
                                          u64 offset)
{
        u64 delta;

        ASSERT(offset >= bitmap_start);
        delta = offset - bitmap_start;

        return (unsigned long)div_u64(delta, unit);
}

/* Number of whole @unit-sized bits that fit in @bytes (rounds down). */
static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
        const u64 nr_bits = div_u64(bytes, unit);

        return (unsigned long)nr_bits;
}

/*
 * Round @offset down to the start of the bitmap-sized window that contains
 * it.  Windows are BITS_PER_BITMAP * ctl->unit bytes wide and are laid out
 * back to back starting at ctl->start.
 */
static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
                                   u64 offset)
{
        const u64 window_bytes = BITS_PER_BITMAP * ctl->unit;
        const u64 window_index = div64_u64(offset - ctl->start, window_bytes);

        return ctl->start + window_index * window_bytes;
}

/*
 * Link @new_entry into an offset-sorted rbtree: @cluster's tree when
 * @cluster is given, otherwise @ctl's free_space_offset tree.
 *
 * ctl->tree_lock must be held, and cluster->lock too when @cluster is used.
 *
 * Returns 0 on success, or -EEXIST (after a one-time warning) when an entry
 * of the same kind already exists at the same offset.
 */
static int tree_insert_offset(struct btrfs_free_space_ctl *ctl,
                              struct btrfs_free_cluster *cluster,
                              struct btrfs_free_space *new_entry)
{
        struct rb_root *root;
        struct rb_node **link;
        struct rb_node *parent = NULL;

        lockdep_assert_held(&ctl->tree_lock);

        if (!cluster) {
                root = &ctl->free_space_offset;
        } else {
                lockdep_assert_held(&cluster->lock);
                root = &cluster->root;
        }

        for (link = &root->rb_node; *link; ) {
                struct btrfs_free_space *cur;

                parent = *link;
                cur = rb_entry(parent, struct btrfs_free_space, offset_index);

                if (new_entry->offset < cur->offset) {
                        link = &parent->rb_left;
                        continue;
                }
                if (new_entry->offset > cur->offset) {
                        link = &parent->rb_right;
                        continue;
                }

                /*
                 * Same offset.  A bitmap entry and an extent entry may share
                 * an offset, and the extent must come first in a linear walk
                 * because allocating from an extent is faster than from a
                 * bitmap.  So a new bitmap goes to the right of an existing
                 * extent and a new extent goes to the left of an existing
                 * bitmap.  Two entries of the same kind are a duplicate.
                 */
                if (new_entry->bitmap) {
                        if (cur->bitmap) {
                                WARN_ON_ONCE(1);
                                return -EEXIST;
                        }
                        link = &parent->rb_right;
                } else {
                        if (!cur->bitmap) {
                                WARN_ON_ONCE(1);
                                return -EEXIST;
                        }
                        link = &parent->rb_left;
                }
        }

        rb_link_node(&new_entry->offset_index, parent, link);
        rb_insert_color(&new_entry->offset_index, root);

        return 0;
}

/*
 * This is a little subtle.  We *only* have ->max_extent_size set if we actually
 * searched through the bitmap and figured out the largest ->max_extent_size,
 * otherwise it's 0.  In the case that it's 0 we don't want to tell the
 * allocator the wrong thing, we want to use the actual real max_extent_size
 * we've found already if it's larger, or we want to use ->bytes.
 *
 * This matters because find_free_space() will skip entries whose ->bytes is
 * less than the required bytes.  So if we didn't search down this bitmap, we
 * may pick some previous entry that has a smaller ->max_extent_size than we
 * have.  For example, assume we have two entries, one that has
 * ->max_extent_size set to 4K and ->bytes set to 1M.  A second entry hasn't set
 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous.  We will
 *  call into find_free_space(), and return with max_extent_size == 4K, because
 *  that first bitmap entry had ->max_extent_size set, but the second one did
 *  not.  If instead we returned 8K we'd come in searching for 8K, and find the
 *  8K contiguous range.
 *
 *  Consider the other case, we have 2 8K chunks in that second entry and still
 *  don't have ->max_extent_size set.  We'll return 16K, and the next time the
 *  allocator comes in it'll fully search our second bitmap, and this time it'll
 *  get an uptodate value of 8K as the maximum chunk size.  Then we'll get the
 *  right allocation the next loop through.
 */
static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
{
        if (entry->bitmap && entry->max_extent_size)
                return entry->max_extent_size;
        return entry->bytes;
}

/*
 * Ordering callback for the bytes-indexed tree.  The comparison is inverted
 * on purpose so that the entry with the largest get_max_extent_size() ends
 * up leftmost: @node sorts before @parent when it is strictly larger.
 */
static bool entry_less(struct rb_node *node, const struct rb_node *parent)
{
        const struct btrfs_free_space *incoming =
                rb_entry(node, struct btrfs_free_space, bytes_index);
        const struct btrfs_free_space *resident =
                rb_entry(parent, struct btrfs_free_space, bytes_index);

        return get_max_extent_size(incoming) > get_max_extent_size(resident);
}

/*
 * Search the offset-indexed tree of @ctl for the entry related to @offset.
 *
 * @ctl:         free space ctl to search; ->tree_lock must be held
 * @offset:      offset to look up
 * @bitmap_only: if set, only return a bitmap entry whose ->offset equals
 *               @offset exactly
 * @fuzzy:       if set and no entry covers @offset, return the next entry
 *               whose end lies beyond @offset instead of NULL.  This is for
 *               when we are trying to make an allocation and just want a
 *               section that comes at or after the given offset.
 *
 * Returns the entry found, or NULL.
 */
static struct btrfs_free_space *
tree_search_offset(struct btrfs_free_space_ctl *ctl,
                   u64 offset, int bitmap_only, int fuzzy)
{
        struct rb_node *n = ctl->free_space_offset.rb_node;
        struct btrfs_free_space *entry = NULL, *prev = NULL;

        lockdep_assert_held(&ctl->tree_lock);

        /*
         * find entry that is closest to the 'offset'.  On an exact match the
         * loop breaks with 'entry' (and 'n') pointing at it; otherwise 'entry'
         * ends up NULL and 'prev' is the last node visited.
         */
        while (n) {
                entry = rb_entry(n, struct btrfs_free_space, offset_index);
                prev = entry;

                if (offset < entry->offset)
                        n = n->rb_left;
                else if (offset > entry->offset)
                        n = n->rb_right;
                else
                        break;

                entry = NULL;
        }

        if (bitmap_only) {
                /* Bitmap lookups require an exact offset match. */
                if (!entry)
                        return NULL;
                if (entry->bitmap)
                        return entry;

                /*
                 * bitmap entry and extent entry may share same offset,
                 * in that case, bitmap entry comes after extent entry.
                 */
                n = rb_next(n);
                if (!n)
                        return NULL;
                entry = rb_entry(n, struct btrfs_free_space, offset_index);
                if (entry->offset != offset)
                        return NULL;

                WARN_ON(!entry->bitmap);
                return entry;
        } else if (entry) {
                /* Exact match on the offset, not restricted to bitmaps. */
                if (entry->bitmap) {
                        /*
                         * if previous extent entry covers the offset,
                         * we should return it instead of the bitmap entry
                         */
                        n = rb_prev(&entry->offset_index);
                        if (n) {
                                prev = rb_entry(n, struct btrfs_free_space,
                                                offset_index);
                                if (!prev->bitmap &&
                                    prev->offset + prev->bytes > offset)
                                        entry = prev;
                        }
                }
                return entry;
        }

        /* No exact match and the tree was empty. */
        if (!prev)
                return NULL;

        /* find last entry before the 'offset' */
        entry = prev;
        if (entry->offset > offset) {
                n = rb_prev(&entry->offset_index);
                if (n) {
                        entry = rb_entry(n, struct btrfs_free_space,
                                        offset_index);
                        ASSERT(entry->offset <= offset);
                } else {
                        /*
                         * Every entry starts after 'offset'; only a fuzzy
                         * search accepts the first one.
                         */
                        if (fuzzy)
                                return entry;
                        else
                                return NULL;
                }
        }

        /* 'entry' now starts at or before 'offset'; check whether it covers it. */
        if (entry->bitmap) {
                /* An extent entry just before the bitmap takes precedence. */
                n = rb_prev(&entry->offset_index);
                if (n) {
                        prev = rb_entry(n, struct btrfs_free_space,
                                        offset_index);
                        if (!prev->bitmap &&
                            prev->offset + prev->bytes > offset)
                                return prev;
                }
                if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
                        return entry;
        } else if (entry->offset + entry->bytes > offset)
                return entry;

        if (!fuzzy)
                return NULL;

        /* Fuzzy search: walk forward to the first entry ending past 'offset'. */
        while (1) {
                n = rb_next(&entry->offset_index);
                if (!n)
                        return NULL;
                entry = rb_entry(n, struct btrfs_free_space, offset_index);
                if (entry->bitmap) {
                        if (entry->offset + BITS_PER_BITMAP *
                            ctl->unit > offset)
                                break;
                } else {
                        if (entry->offset + entry->bytes > offset)
                                break;
                }
        }
        return entry;
}

/*
 * Remove @info from both the offset index and the bytes index of @ctl and
 * update the extent counters.  ctl->free_space is only reduced when
 * @update_stat is true.
 */
static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
                                     struct btrfs_free_space *info,
                                     bool update_stat)
{
        const u64 removed_bytes = info->bytes;

        lockdep_assert_held(&ctl->tree_lock);

        rb_erase(&info->offset_index, &ctl->free_space_offset);
        rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
        ctl->free_extents -= 1;

        /* Only untrimmed extent entries are dropped from the discardable stats here. */
        if (!(info->bitmap || btrfs_free_space_trimmed(info))) {
                ctl->discardable_extents[BTRFS_STAT_CURR] -= 1;
                ctl->discardable_bytes[BTRFS_STAT_CURR] -= removed_bytes;
        }

        if (!update_stat)
                return;

        ctl->free_space -= removed_bytes;
}

/*
 * Insert @info into the offset index and the bytes index of @ctl and account
 * for its space.
 *
 * Returns 0 on success, or the error from tree_insert_offset(), in which case
 * the bytes index and the counters are left untouched.
 */
static int link_free_space(struct btrfs_free_space_ctl *ctl,
                           struct btrfs_free_space *info)
{
        int err;

        lockdep_assert_held(&ctl->tree_lock);

        ASSERT(info->bytes || info->bitmap);

        err = tree_insert_offset(ctl, NULL, info);
        if (err)
                return err;

        rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);

        /* Only untrimmed extent entries are added to the discardable stats here. */
        if (!(info->bitmap || btrfs_free_space_trimmed(info))) {
                ctl->discardable_extents[BTRFS_STAT_CURR] += 1;
                ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
        }

        ctl->free_extents += 1;
        ctl->free_space += info->bytes;

        return 0;
}

/*
 * Re-sort the bitmap entry @info in the bytes index of @ctl by erasing and
 * re-adding its ->bytes_index node.
 */
static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
                                struct btrfs_free_space *info)
{
        struct rb_node *bytes_node = &info->bytes_index;

        ASSERT(info->bitmap);

        /*
         * An empty node means the entry is on a cluster, and such entries must
         * not be re-linked into the ctl bytes index.
         */
        if (!RB_EMPTY_NODE(bytes_node)) {
                lockdep_assert_held(&ctl->tree_lock);

                rb_erase_cached(bytes_node, &ctl->free_space_bytes);
                rb_add_cached(bytes_node, &ctl->free_space_bytes, entry_less);
        }
}

/*
 * Clear the bits covering [@offset, @offset + @bytes) in the bitmap entry
 * @info and update the byte and extent accounting.  ctl->free_space is only
 * reduced when @update_stat is true.
 */
static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
                                     struct btrfs_free_space *info,
                                     u64 offset, u64 bytes, bool update_stat)
{
        const unsigned long first_bit = offset_to_bit(info->offset, ctl->unit,
                                                      offset);
        const unsigned long nr_bits = bytes_to_bits(bytes, ctl->unit);
        const unsigned long end = first_bit + nr_bits;
        int delta;

        ASSERT(end <= BITS_PER_BITMAP);

        bitmap_clear(info->bitmap, first_bit, nr_bits);

        info->bytes -= bytes;
        /* Forget a cached max extent size that is larger than one unit. */
        if (info->max_extent_size > ctl->unit)
                info->max_extent_size = 0;

        relink_bitmap_entry(ctl, info);

        /*
         * Start from "one extent removed" and add one back for every side of
         * the cleared range whose neighbouring bit is still set.
         */
        delta = -1;
        if (first_bit && test_bit(first_bit - 1, info->bitmap))
                delta += 1;
        if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
                delta += 1;

        info->bitmap_extents += delta;
        if (!btrfs_free_space_trimmed(info)) {
                ctl->discardable_extents[BTRFS_STAT_CURR] += delta;
                ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
        }

        if (!update_stat)
                return;

        ctl->free_space -= bytes;
}

/*
 * Set the bits covering [@offset, @offset + @bytes) in the bitmap entry @info
 * and add the space to the accounting of both @info and @ctl.
 */
static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
                                  struct btrfs_free_space *info, u64 offset,
                                  u64 bytes)
{
        const unsigned long first_bit = offset_to_bit(info->offset, ctl->unit,
                                                      offset);
        const unsigned long nr_bits = bytes_to_bits(bytes, ctl->unit);
        const unsigned long end = first_bit + nr_bits;
        int delta = 1;

        ASSERT(end <= BITS_PER_BITMAP);

        bitmap_set(info->bitmap, first_bit, nr_bits);

        /* New bits were set, so the cached max extent size is unknown again. */
        info->max_extent_size = 0;
        info->bytes += bytes;
        ctl->free_space += bytes;

        relink_bitmap_entry(ctl, info);

        /*
         * Start from "one extent added" and take one off for every side of the
         * new range whose neighbouring bit is already set.
         */
        if (first_bit && test_bit(first_bit - 1, info->bitmap))
                delta -= 1;
        if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
                delta -= 1;

        info->bitmap_extents += delta;
        if (btrfs_free_space_trimmed(info))
                return;

        ctl->discardable_extents[BTRFS_STAT_CURR] += delta;
        ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
}

/*
 * Search @bitmap_info for a run of set bits.
 *
 * @ctl:         free space ctl the bitmap belongs to (provides ->unit)
 * @bitmap_info: bitmap entry to search
 * @offset:      in: offset to start searching from (clamped to the start of
 *               the bitmap); out on success: start of the run found
 * @bytes:       in: number of bytes wanted; out: see below
 * @for_alloc:   the search is for an allocation, which allows skipping the
 *               scan based on the cached ->max_extent_size and accepting the
 *               first set bit when a single unit is requested
 *
 * Returns 0 if a suitable run was found, with *@bytes set to the length
 * reported for it.  Returns -1 if we can not find a suitable extent, in which
 * case *@bytes records the size of the max extent; when the bitmap was
 * actually scanned that size is also cached in ->max_extent_size and the entry
 * is re-sorted in the bytes index.
 */
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
                         struct btrfs_free_space *bitmap_info, u64 *offset,
                         u64 *bytes, bool for_alloc)
{
        unsigned long found_bits = 0;
        unsigned long max_bits = 0;
        unsigned long bits, i;
        unsigned long next_zero;
        unsigned long extent_bits;

        /*
         * Skip searching the bitmap if we don't have a contiguous section that
         * is large enough for this allocation.
         */
        if (for_alloc &&
            bitmap_info->max_extent_size &&
            bitmap_info->max_extent_size < *bytes) {
                *bytes = bitmap_info->max_extent_size;
                return -1;
        }

        i = offset_to_bit(bitmap_info->offset, ctl->unit,
                          max_t(u64, *offset, bitmap_info->offset));
        bits = bytes_to_bits(*bytes, ctl->unit);

        for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
                /* Any set bit satisfies a single-unit allocation. */
                if (for_alloc && bits == 1) {
                        found_bits = 1;
                        break;
                }
                next_zero = find_next_zero_bit(bitmap_info->bitmap,
                                               BITS_PER_BITMAP, i);
                extent_bits = next_zero - i;
                if (extent_bits >= bits) {
                        found_bits = extent_bits;
                        break;
                } else if (extent_bits > max_bits) {
                        max_bits = extent_bits;
                }
                /* Continue the scan after this run. */
                i = next_zero;
        }

        if (found_bits) {
                /*
                 * Widen to u64 before multiplying so the product is never
                 * computed (and possibly truncated) in unsigned long.
                 */
                *offset = (u64)i * ctl->unit + bitmap_info->offset;
                *bytes = (u64)found_bits * ctl->unit;
                return 0;
        }

        *bytes = (u64)max_bits * ctl->unit;
        bitmap_info->max_extent_size = *bytes;
        relink_bitmap_entry(ctl, bitmap_info);
        return -1;
}

/*
 * Find an entry that can satisfy an allocation of *@bytes.
 *
 * @ctl:             free space ctl to search
 * @offset:          in: offset to start searching from (only used with the
 *                   offset index); out: start of the space found
 * @bytes:           in: number of bytes wanted; out: size of the space found
 * @align:           alignment, relative to ctl->start, applied to the start of
 *                   the space when *@bytes >= @align
 * @max_extent_size: raised to the largest get_max_extent_size() among the
 *                   entries that were rejected, i.e. caches the size of the
 *                   max extent in bytes
 * @use_bytes_index: walk the bytes index from its first (largest) entry instead
 *                   of walking the offset index from *@offset
 *
 * Returns the entry the space was found in, or NULL if nothing fits.
 */
static struct btrfs_free_space *
find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
                unsigned long align, u64 *max_extent_size, bool use_bytes_index)
{
        struct btrfs_free_space *entry;
        struct rb_node *node;
        u64 tmp;
        u64 align_off;
        int ret;

        /* Nothing to search in an empty tree. */
        if (!ctl->free_space_offset.rb_node)
                goto out;
again:
        if (use_bytes_index) {
                node = rb_first_cached(&ctl->free_space_bytes);
        } else {
                entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
                                           0, 1);
                if (!entry)
                        goto out;
                node = &entry->offset_index;
        }

        for (; node; node = rb_next(node)) {
                if (use_bytes_index)
                        entry = rb_entry(node, struct btrfs_free_space,
                                         bytes_index);
                else
                        entry = rb_entry(node, struct btrfs_free_space,
                                         offset_index);

                /*
                 * If we are using the bytes index then all subsequent entries
                 * in this tree are going to be < bytes, so simply set the max
                 * extent size and exit the loop.
                 *
                 * If we're using the offset index then we need to keep going
                 * through the rest of the tree.
                 */
                if (entry->bytes < *bytes) {
                        *max_extent_size = max(get_max_extent_size(entry),
                                               *max_extent_size);
                        if (use_bytes_index)
                                break;
                        continue;
                }

                /* make sure the space returned is big enough
                 * to match our requested alignment
                 */
                if (*bytes >= align) {
                        /* Round entry->offset up to @align relative to ctl->start. */
                        tmp = entry->offset - ctl->start + align - 1;
                        tmp = div64_u64(tmp, align);
                        tmp = tmp * align + ctl->start;
                        align_off = tmp - entry->offset;
                } else {
                        align_off = 0;
                        tmp = entry->offset;
                }

                /*
                 * We don't break here if we're using the bytes index because we
                 * may have another entry that has the correct alignment that is
                 * the right size, so we don't want to miss that possibility.
                 * At worst this adds another loop through the logic, but if we
                 * broke here we could prematurely ENOSPC.
                 */
                if (entry->bytes < *bytes + align_off) {
                        *max_extent_size = max(get_max_extent_size(entry),
                                               *max_extent_size);
                        continue;
                }

                if (entry->bitmap) {
                        /* Remember the successor to detect a re-sort below. */
                        struct rb_node *old_next = rb_next(node);
                        u64 size = *bytes;

                        ret = search_bitmap(ctl, entry, &tmp, &size, true);
                        if (!ret) {
                                *offset = tmp;
                                *bytes = size;
                                return entry;
                        } else {
                                *max_extent_size =
                                        max(get_max_extent_size(entry),
                                            *max_extent_size);
                        }

                        /*
                         * The bitmap may have gotten re-arranged in the space
                         * index here because the max_extent_size may have been
                         * updated.  Start from the beginning again if this
                         * happened.
                         */
                        if (use_bytes_index && old_next != rb_next(node))
                                goto again;
                        continue;
                }

                /* Extent entry: hand out everything from the aligned start. */
                *offset = tmp;
                *bytes = entry->bytes - align_off;
                return entry;
        }
out:
        return NULL;
}

/*
 * Initialise @info as an empty entry at the bitmap-aligned offset for @offset,
 * link it into @ctl, bump the bitmap count and recalculate the thresholds.
 */
static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
                           struct btrfs_free_space *info, u64 offset)
{
        INIT_LIST_HEAD(&info->list);
        info->bitmap_extents = 0;
        info->bytes = 0;
        info->offset = offset_to_bitmap(ctl, offset);

        link_free_space(ctl, info);

        ctl->total_bitmaps += 1;
        recalculate_thresholds(ctl);
}

static void free_bitmap(struct btrfs_free_space_ctl *ctl,
                        struct btrfs_free_space *bitmap_info)
{
        /*
         * Normally when this is called, the bitmap is completely empty. However,
         * if we are blowing up the free space cache for one reason or another
         * via __btrfs_remove_free_space_cache(), then it may not be freed and
         * we may leave stats on the table.
         */
        if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
                ctl->discardable_extents[BTRFS_STAT_CURR] -=
                        bitmap_info->bitmap_extents;
                ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;

        }
        unlink_free_space(ctl, bitmap_info, true);
        kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
        kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
        ctl->total_bitmaps--;
        recalculate_thresholds(ctl);
}

/*
 * Clear the range [*@offset, *@offset + *@bytes) from @bitmap_info, continuing
 * into the following bitmap entries as needed.  *@offset and *@bytes are
 * advanced past whatever was cleared, and bitmaps that become empty are freed.
 *
 * Returns 0 when the whole range was removed, -EINVAL if the range is not set
 * at *@offset (or no entry follows while bytes remain), or -EAGAIN if the
 * remainder is not in the next bitmap and the caller has to try again.
 */
static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
                              struct btrfs_free_space *bitmap_info,
                              u64 *offset, u64 *bytes)
{
        u64 end;
        u64 search_start, search_bytes;
        int ret;

again:
        /* Last byte covered by this bitmap. */
        end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;

        /*
         * We need to search for bits in this bitmap.  We could only cover some
         * of the extent in this bitmap thanks to how we add space, so we need
         * to search for as much of it as we can and clear that amount, and then
         * go searching for the next bit.
         */
        search_start = *offset;
        search_bytes = ctl->unit;
        search_bytes = min(search_bytes, end - search_start + 1);
        ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
                            false);
        /* The run found must begin exactly at the offset being removed. */
        if (ret < 0 || search_start != *offset)
                return -EINVAL;

        /* We may have found more bits than what we need */
        search_bytes = min(search_bytes, *bytes);

        /* Cannot clear past the end of the bitmap */
        search_bytes = min(search_bytes, end - search_start + 1);

        bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
        *offset += search_bytes;
        *bytes -= search_bytes;

        if (*bytes) {
                /* Fetch the successor before this bitmap may be freed. */
                struct rb_node *next = rb_next(&bitmap_info->offset_index);
                if (!bitmap_info->bytes)
                        free_bitmap(ctl, bitmap_info);

                /*
                 * no entry after this bitmap, but we still have bytes to
                 * remove, so something has gone wrong.
                 */
                if (!next)
                        return -EINVAL;

                bitmap_info = rb_entry(next, struct btrfs_free_space,
                                       offset_index);

                /*
                 * if the next entry isn't a bitmap we need to return to let the
                 * extent stuff do its work.
                 */
                if (!bitmap_info->bitmap)
                        return -EAGAIN;

                /*
                 * Ok the next item is a bitmap, but it may not actually hold
                 * the information for the rest of this free space stuff, so
                 * look for it, and if we don't find it return so we can try
                 * everything over again.
                 */
                search_start = *offset;
                search_bytes = ctl->unit;
                ret = search_bitmap(ctl, bitmap_info, &search_start,
                                    &search_bytes, false);
                if (ret < 0 || search_start != *offset)
                        return -EAGAIN;

                goto again;
        } else if (!bitmap_info->bytes)
                free_bitmap(ctl, bitmap_info);

        return 0;
}

/*
 * Add as much of [@offset, @offset + @bytes) to the bitmap entry @info as fits
 * before the end of the bitmap.
 *
 * @ctl:        free space ctl the bitmap belongs to
 * @info:       bitmap entry to add the space to
 * @offset:     start of the space to add
 * @bytes:      size of the space to add
 * @trim_state: trim state of the space being added
 *
 * Returns the number of bytes that were actually set in the bitmap, which is
 * less than @bytes when the range runs past the end of the bitmap.
 */
static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
                               struct btrfs_free_space *info, u64 offset,
                               u64 bytes, enum btrfs_trim_state trim_state)
{
        u64 bytes_to_set;
        u64 end;

        /*
         * This is a tradeoff to make bitmap trim state minimal.  We mark the
         * whole bitmap untrimmed if at any point we add untrimmed regions.
         */
        if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
                /*
                 * A bitmap that was trimmed so far becomes discardable as a
                 * whole, so add its current contents to the stats.
                 */
                if (btrfs_free_space_trimmed(info)) {
                        ctl->discardable_extents[BTRFS_STAT_CURR] +=
                                info->bitmap_extents;
                        ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
                }
                info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
        }

        /*
         * Widen to u64 before multiplying so the size of the bitmap window is
         * never computed (and possibly truncated) in unsigned long.
         */
        end = info->offset + (u64)BITS_PER_BITMAP * ctl->unit;

        bytes_to_set = min(end - offset, bytes);

        btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set);

        return bytes_to_set;
}

/*
 * Decide whether the free space described by @info should be stored in a
 * bitmap (true) or as a regular extent entry (false).
 */
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
                      struct btrfs_free_space *info)
{
        struct btrfs_block_group *block_group = ctl->block_group;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        bool forced = false;

#ifdef CONFIG_BTRFS_DEBUG
        if (btrfs_should_fragment_free_space(block_group))
                forced = true;
#endif

        if (!forced) {
                /* This is a way to reclaim large regions from the bitmaps. */
                if (info->bytes >= FORCE_EXTENT_THRESHOLD)
                        return false;

                /*
                 * Below the extents threshold the space can be added as an
                 * extent and we don't have to deal with the bitmap.
                 *
                 * Small extents are the exception: we don't want them to use
                 * up all of the free slots in the cache, those are reserved
                 * for larger extents.  If there is still plenty of cache left
                 * then go ahead and add them as extents anyway, no sense in
                 * adding the overhead of a bitmap if we don't have to.
                 */
                if (ctl->free_extents < ctl->extents_thresh) {
                        if (info->bytes > fs_info->sectorsize * 8)
                                return false;
                        if (ctl->free_extents * 3 <= ctl->extents_thresh)
                                return false;
                }
        }

        /*
         * The original block groups from mkfs can be really small, like 8
         * megabytes, so don't bother with a bitmap for those entries.  However
         * some block groups can be smaller than what a bitmap would cover but
         * are still large enough that they could overflow the 32k memory limit,
         * so those block groups are still allowed to have a bitmap entry.
         */
        return ((BITS_PER_BITMAP * ctl->unit) >> 1) <= block_group->length;
}

/*
 * Operations table that routes ->use_bitmap to use_bitmap().
 * insert_into_bitmap() compares ctl->op against this table before it looks at
 * the block group's cluster list.
 */
static const struct btrfs_free_space_op free_space_op = {
        .use_bitmap                = use_bitmap,
};

/*
 * Try to store the free space described by @info in bitmap entries.
 *
 * Returns 0 if ctl->op->use_bitmap() decided against a bitmap, in which case
 * @info is left untouched.  Returns 1 if all of the space was added to
 * bitmaps, or -ENOMEM if an allocation failed; in both of these cases @info
 * has either been linked into @ctl as a new bitmap entry or been freed.
 *
 * NOTE: ctl->tree_lock is dropped and re-taken around the allocations, so the
 * tree may change while a new bitmap is being set up.
 * NOTE(review): presumably called with ctl->tree_lock held - confirm against
 * the callers.
 */
static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
                              struct btrfs_free_space *info)
{
        struct btrfs_free_space *bitmap_info;
        struct btrfs_block_group *block_group = NULL;
        int added = 0;
        u64 bytes, offset, bytes_added;
        enum btrfs_trim_state trim_state;
        int ret;

        /* Take a copy of the range; @info may be consumed further down. */
        bytes = info->bytes;
        offset = info->offset;
        trim_state = info->trim_state;

        if (!ctl->op->use_bitmap(ctl, info))
                return 0;

        /* The cluster list is only looked at for ctls using free_space_op. */
        if (ctl->op == &free_space_op)
                block_group = ctl->block_group;
again:
        /*
         * Since we link bitmaps right into the cluster we need to see if we
         * have a cluster here, and if so and it has our bitmap we need to add
         * the free space to that bitmap.
         */
        if (block_group && !list_empty(&block_group->cluster_list)) {
                struct btrfs_free_cluster *cluster;
                struct rb_node *node;
                struct btrfs_free_space *entry;

                cluster = list_entry(block_group->cluster_list.next,
                                     struct btrfs_free_cluster,
                                     block_group_list);
                spin_lock(&cluster->lock);
                node = rb_first(&cluster->root);
                if (!node) {
                        spin_unlock(&cluster->lock);
                        goto no_cluster_bitmap;
                }

                /* Only the first entry of the cluster is checked. */
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
                if (!entry->bitmap) {
                        spin_unlock(&cluster->lock);
                        goto no_cluster_bitmap;
                }

                if (entry->offset == offset_to_bitmap(ctl, offset)) {
                        bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
                                                          bytes, trim_state);
                        bytes -= bytes_added;
                        offset += bytes_added;
                }
                spin_unlock(&cluster->lock);
                if (!bytes) {
                        ret = 1;
                        goto out;
                }
        }

no_cluster_bitmap:
        /* Look for an existing bitmap covering the current offset. */
        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
                                         1, 0);
        if (!bitmap_info) {
                ASSERT(added == 0);
                goto new_bitmap;
        }

        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
                                          trim_state);
        bytes -= bytes_added;
        offset += bytes_added;
        added = 0;

        /* Anything left over belongs to the next bitmap; go around again. */
        if (!bytes) {
                ret = 1;
                goto out;
        } else
                goto again;

new_bitmap:
        if (info && info->bitmap) {
                /* @info is now owned by the tree; don't free it at 'out'. */
                add_new_bitmap(ctl, info, offset);
                added = 1;
                info = NULL;
                goto again;
        } else {
                spin_unlock(&ctl->tree_lock);

                /* no pre-allocated info, allocate a new one */
                if (!info) {
                        info = kmem_cache_zalloc(btrfs_free_space_cachep,
                                                 GFP_NOFS);
                        if (!info) {
                                spin_lock(&ctl->tree_lock);
                                ret = -ENOMEM;
                                goto out;
                        }
                }

                /* allocate the bitmap */
                info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
                                                 GFP_NOFS);
                info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
                spin_lock(&ctl->tree_lock);
                if (!info->bitmap) {
                        ret = -ENOMEM;
                        goto out;
                }
                goto again;
        }

out:
        /* Free an info (and its bitmap) that was not linked into the tree. */
        if (info) {
                if (info->bitmap)
                        kmem_cache_free(btrfs_free_space_bitmap_cachep,
                                        info->bitmap);
                kmem_cache_free(btrfs_free_space_cachep, info);
        }

        return ret;
}

/*
 * Free space merging rules:
 *  1) Merge trimmed areas together
 *  2) Let untrimmed areas coalesce with trimmed areas
 *  3) Always pull neighboring regions from bitmaps
 *
 * The above rules are for when we merge free space based on btrfs_trim_state.
 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
 * same reason: to promote larger extent regions which makes life easier for
 * find_free_extent().  Rule 2 enables coalescing based on the common path
 * being returning free space from btrfs_finish_extent_commit().  So when free
 * space is trimmed, it will prevent aggregating trimmed new region and
 * untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
 * and provide find_free_extent() with the largest extents possible hoping for
 * the reuse path.
 */
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
                          struct btrfs_free_space *info, bool update_stat)
{
        const u64 start = info->offset;
        const u64 end = info->offset + info->bytes;
        const bool is_trimmed = btrfs_free_space_trimmed(info);
        struct btrfs_free_space *left_info = NULL;
        struct btrfs_free_space *right_info;
        bool merged = false;

        /*
         * Look up both neighbours of the range being added before anything is
         * unlinked.  When an entry is found at the end of the range its tree
         * predecessor is the left candidate; without one the left candidate
         * is searched for directly.
         */
        right_info = tree_search_offset(ctl, end, 0, 0);
        if (right_info) {
                struct rb_node *prev_node = rb_prev(&right_info->offset_index);

                if (prev_node)
                        left_info = rb_entry(prev_node, struct btrfs_free_space,
                                             offset_index);
        } else {
                left_info = tree_search_offset(ctl, start - 1, 0, 0);
        }

        /*
         * Absorb the right neighbour if it is an extent entry.  A trimmed
         * @info only merges with a trimmed neighbour.
         */
        if (right_info && !right_info->bitmap) {
                if (!is_trimmed || btrfs_free_space_trimmed(right_info)) {
                        unlink_free_space(ctl, right_info, update_stat);
                        info->bytes += right_info->bytes;
                        kmem_cache_free(btrfs_free_space_cachep, right_info);
                        merged = true;
                }
        }

        /*
         * Same for the left neighbour, which additionally has to end exactly
         * where the range being added started.
         */
        if (left_info && !left_info->bitmap &&
            left_info->offset + left_info->bytes == start) {
                if (!is_trimmed || btrfs_free_space_trimmed(left_info)) {
                        unlink_free_space(ctl, left_info, update_stat);
                        info->offset = left_info->offset;
                        info->bytes += left_info->bytes;
                        kmem_cache_free(btrfs_free_space_cachep, left_info);
                        merged = true;
                }
        }

        return merged;
}

/*
 * Extend @info at its end with the run of set bits that starts right at the
 * end of @info in the bitmap covering that offset, clearing those bits from
 * the bitmap.
 *
 * @ctl:         free space ctl that @info and the bitmap belong to
 * @info:        extent entry to grow; only ->bytes and ->trim_state change
 * @update_stat: passed on to bitmap_clear_bits()
 *
 * Returns true if any space was moved from a bitmap into @info.
 */
static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
                                     struct btrfs_free_space *info,
                                     bool update_stat)
{
        struct btrfs_free_space *bitmap;
        unsigned long i;
        unsigned long j;
        const u64 end = info->offset + info->bytes;
        const u64 bitmap_offset = offset_to_bitmap(ctl, end);
        u64 bytes;

        bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
        if (!bitmap)
                return false;

        /* [i, j) is the run of set bits starting at the end of @info. */
        i = offset_to_bit(bitmap->offset, ctl->unit, end);
        j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
        if (j == i)
                return false;
        /*
         * Widen to u64 before multiplying so the size is never computed (and
         * possibly truncated) in unsigned long.
         */
        bytes = (u64)(j - i) * ctl->unit;
        info->bytes += bytes;

        /* See try_merge_free_space() comment. */
        if (!btrfs_free_space_trimmed(bitmap))
                info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

        bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);

        if (!bitmap->bytes)
                free_bitmap(ctl, bitmap);

        return true;
}

/*
 * Extend @info at its front with the run of set bits that ends right before
 * the start of @info in the bitmap covering that offset, clearing those bits
 * from the bitmap.
 *
 * @ctl:         free space ctl that @info and the bitmap belong to
 * @info:        extent entry to grow; ->offset, ->bytes and ->trim_state may
 *               change
 * @update_stat: passed on to bitmap_clear_bits()
 *
 * Returns true if any space was moved from a bitmap into @info.
 */
static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
                                       struct btrfs_free_space *info,
                                       bool update_stat)
{
        struct btrfs_free_space *bitmap;
        u64 bitmap_offset;
        unsigned long i;
        unsigned long j;
        unsigned long prev_j;
        u64 bytes;

        bitmap_offset = offset_to_bitmap(ctl, info->offset);
        /* If we're on a boundary, try the previous logical bitmap. */
        if (bitmap_offset == info->offset) {
                if (info->offset == 0)
                        return false;
                bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
        }

        bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
        if (!bitmap)
                return false;

        /*
         * i is the bit right before the start of @info.  Find the last clear
         * bit at or before i; (unsigned long)-1 means there is none.
         */
        i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
        j = 0;
        prev_j = (unsigned long)-1;
        for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
                if (j > i)
                        break;
                prev_j = j;
        }
        /* The bit right before @info is clear, nothing to steal. */
        if (prev_j == i)
                return false;

        /*
         * The run of set bits is (prev_j, i], or [0, i] if no clear bit was
         * found.  Widen to u64 before multiplying so the size is never
         * computed (and possibly truncated) in unsigned long.
         */
        if (prev_j == (unsigned long)-1)
                bytes = (u64)(i + 1) * ctl->unit;
        else
                bytes = (u64)(i - prev_j) * ctl->unit;

        info->offset -= bytes;
        info->bytes += bytes;

        /* See try_merge_free_space() comment. */
        if (!btrfs_free_space_trimmed(bitmap))
                info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

        bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);

        if (!bitmap->bytes)
                free_bitmap(ctl, bitmap);

        return true;
}

/*
 * Extent entries are preferred over bitmap entries for every allocation
 * request, clustered or not. So before a new extent entry is added, look for
 * free space in bitmap entries that is adjacent to it and move that space out
 * of the bitmaps and into the extent.
 * This raises the odds of satisfying an allocation, because a request is
 * always served from one single cache entry and never from two or more - not
 * even when those entries describe one contiguous free region (e.g. an extent
 * entry immediately followed by a bitmap entry that starts where it ends).
 */
static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
                              struct btrfs_free_space *info,
                              bool update_stat)
{
        bool grew_back;
        bool grew_front = false;

        /*
         * The offset of @info may be rewritten below, so it has to be an
         * extent entry that is not linked into the offset tree right now.
         */
        ASSERT(!info->bitmap);
        ASSERT(RB_EMPTY_NODE(&info->offset_index));

        /* Without any bitmap there is nothing to steal from. */
        if (!(ctl->total_bitmaps > 0))
                return;

        grew_back = steal_from_bitmap_to_end(ctl, info, update_stat);

        /*
         * Re-read the bitmap count before looking in front of the entry.
         * NOTE(review): presumably the call above can release the last
         * bitmap - confirm against steal_from_bitmap_to_end().
         */
        if (ctl->total_bitmaps > 0)
                grew_front = steal_from_bitmap_to_front(ctl, info,
                                                        update_stat);

        /* The entry changed size, so it may now touch a neighbouring extent. */
        if (grew_back || grew_front)
                try_merge_free_space(ctl, info, update_stat);
}

/*
 * Add the range [@offset, @offset + @bytes) to the free space cache of a
 * non-zoned block group, recording it with @trim_state.
 *
 * Takes and releases ctl->tree_lock internally.
 *
 * Returns 0 on success, -ENOMEM when no cache entry could be allocated, or the
 * error reported by insert_into_bitmap() / link_free_space().
 */
static int __btrfs_add_free_space(struct btrfs_block_group *block_group,
                           u64 offset, u64 bytes,
                           enum btrfs_trim_state trim_state)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_free_space *info;
        u64 filter_bytes = bytes;
        int ret = 0;

        ASSERT(!btrfs_is_zoned(fs_info));

        info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
        if (!info)
                return -ENOMEM;

        RB_CLEAR_NODE(&info->offset_index);
        RB_CLEAR_NODE(&info->bytes_index);
        info->trim_state = trim_state;
        info->offset = offset;
        info->bytes = bytes;

        spin_lock(&ctl->tree_lock);

        if (!try_merge_free_space(ctl, info, true)) {
                /*
                 * No extent sits directly to the left or right of the new
                 * range, so a brand new extent entry would be needed. Before
                 * creating one, see whether the range should be dropped into
                 * a bitmap instead.
                 */
                ret = insert_into_bitmap(ctl, info);
                if (ret) {
                        /*
                         * Negative: propagate the error. Positive: report
                         * success without linking the entry.
                         * NOTE(review): presumably a positive value means the
                         * range was absorbed by a bitmap and @info was
                         * consumed - confirm in insert_into_bitmap().
                         */
                        if (ret > 0)
                                ret = 0;
                        goto out;
                }
        }

        /*
         * Stealing from adjacent bitmaps is only worth it once we know the
         * range ends up as an extent entry; had it gone into a bitmap the
         * work would simply have been undone again.
         */
        steal_from_bitmap(ctl, info, true);

        /* Merging/stealing may have grown the entry; filter on the larger size. */
        filter_bytes = max(filter_bytes, info->bytes);

        ret = link_free_space(ctl, info);
        if (ret)
                kmem_cache_free(btrfs_free_space_cachep, info);
out:
        btrfs_discard_update_discardable(block_group);
        spin_unlock(&ctl->tree_lock);

        if (ret) {
                btrfs_crit(fs_info, "unable to add free space :%d", ret);
                ASSERT(ret != -EEXIST);
        }

        /* Anything not already trimmed is handed to the discard machinery. */
        if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
                btrfs_discard_check_filter(block_group, filter_bytes);
                btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
        }

        return ret;
}

/*
 * Zoned counterpart of __btrfs_add_free_space(): no cache entries are created,
 * only the block group's accounting is adjusted for the range starting at
 * @bytenr with length @size.
 *
 * @used == false: the whole range is counted as free and the allocation
 *                 offset is moved back by @size.
 * @used == true:  the part of the range below the allocation offset is counted
 *                 as zone_unusable (unless the block group is read-only), the
 *                 rest as free. A @size equal to the block group length is
 *                 treated as the initial population and frees zone_capacity.
 *
 * Afterwards the block group is marked unused or queued for reclaim when the
 * unusable space crosses the respective limit. Always returns 0.
 */
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
                                        u64 bytenr, u64 size, bool used)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_space_info *sinfo = block_group->space_info;
        const u64 offset = bytenr - block_group->start;
        const u64 end = offset + size;
        const bool initial = (size == block_group->length);
        int bg_reclaim_threshold = 0;
        u64 reclaimable_unusable;
        u64 to_unusable;
        u64 to_free;

        WARN_ON(!initial && end > block_group->zone_capacity);

        /* The reclaim threshold is deliberately left at 0 for the initial add. */
        if (!initial)
                bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);

        spin_lock(&ctl->tree_lock);

        /* Default: everything is free; narrowed down below for used ranges. */
        to_free = size;
        if (used) {
                if (initial) {
                        to_free = block_group->zone_capacity;
                } else if (offset < block_group->alloc_offset) {
                        /* Only what lies past the allocation offset is free. */
                        if (end <= block_group->alloc_offset)
                                to_free = 0;
                        else
                                to_free = end - block_group->alloc_offset;
                }
        }
        to_unusable = size - to_free;

        ctl->free_space += to_free;
        /*
         * For a read-only block group the freed space is to be accounted into
         * bytes_readonly, so zone_unusable is left alone in that case.
         */
        if (!block_group->ro)
                block_group->zone_unusable += to_unusable;
        spin_unlock(&ctl->tree_lock);

        if (!used) {
                spin_lock(&block_group->lock);
                block_group->alloc_offset -= size;
                spin_unlock(&block_group->lock);
        }

        /* Unusable bytes, not counting the gap between length and capacity. */
        reclaimable_unusable = block_group->zone_unusable -
                               (block_group->length - block_group->zone_capacity);

        if (block_group->zone_unusable == block_group->length) {
                /* The whole region is unusable now: mark it unused and reclaim. */
                btrfs_mark_bg_unused(block_group);
        } else if (bg_reclaim_threshold &&
                   reclaimable_unusable >=
                   mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) {
                btrfs_mark_bg_to_reclaim(block_group);
        }

        return 0;
}

/*
 * Return the range starting at @bytenr with length @size to @block_group's
 * free space.
 *
 * Zoned filesystems go through the zoned accounting path with the range
 * flagged as used. Otherwise the range is added as trimmed when the
 * DISCARD_SYNC mount option is set and as untrimmed when it is not.
 *
 * Returns whatever the selected helper returns.
 */
int btrfs_add_free_space(struct btrfs_block_group *block_group,
                         u64 bytenr, u64 size)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        enum btrfs_trim_state trim_state;

        if (btrfs_is_zoned(fs_info))
                return __btrfs_add_free_space_zoned(block_group, bytenr, size,
                                                    true);

        trim_state = btrfs_test_opt(fs_info, DISCARD_SYNC) ?
                     BTRFS_TRIM_STATE_TRIMMED : BTRFS_TRIM_STATE_UNTRIMMED;

        return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
}

/*
 * Like btrfs_add_free_space(), except that on a zoned filesystem the range is
 * passed to the zoned accounting path flagged as not used. On a non-zoned
 * filesystem this is exactly btrfs_add_free_space().
 */
int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
                                u64 bytenr, u64 size)
{
        if (!btrfs_is_zoned(block_group->fs_info))
                return btrfs_add_free_space(block_group, bytenr, size);

        return __btrfs_add_free_space_zoned(block_group, bytenr, size, false);
}

/*
 * The difference to btrfs_add_free_space() is subtle: free space that is
 * handed back in general should be added as untrimmed when async discard is
 * in use, but free space that is added while a block group is being loaded
 * should be considered trimmed. Hence this variant treats both DISCARD_SYNC
 * and DISCARD_ASYNC as "already trimmed".
 */
int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
                                       u64 bytenr, u64 size)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        bool discard_enabled;

        if (btrfs_is_zoned(fs_info))
                return __btrfs_add_free_space_zoned(block_group, bytenr, size,
                                                    true);

        discard_enabled = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
                          btrfs_test_opt(fs_info, DISCARD_ASYNC);

        return __btrfs_add_free_space(block_group, bytenr, size,
                                      discard_enabled ?
                                      BTRFS_TRIM_STATE_TRIMMED :
                                      BTRFS_TRIM_STATE_UNTRIMMED);
}

/*
 * Remove the range [@offset, @offset + @bytes) from the free space cache of
 * @block_group.
 *
 * On a zoned filesystem nothing is removed from a cache; the allocation
 * offset is merely advanced past the range when it lies behind it.
 *
 * Otherwise the range is carved out of whatever extent and bitmap entries
 * cover it, one entry per pass of the "again" loop. When the range sits in the
 * middle of an extent entry, the entry is trimmed to the part in front of the
 * range and the part behind it is re-added as a new entry.
 *
 * Returns 0 when the walk finishes (also when no entry covers the remaining
 * range), or a non-zero value from link_free_space(), remove_from_bitmap() or
 * __btrfs_add_free_space().
 */
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
                            u64 offset, u64 bytes)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *info;
        int ret;
        bool re_search = false;

        if (btrfs_is_zoned(block_group->fs_info)) {
                /*
                 * This can happen with conventional zones when replaying log.
                 * Since the allocation info of tree-log nodes are not recorded
                 * to the extent-tree, calculate_alloc_pointer() failed to
                 * advance the allocation pointer after last allocated tree log
                 * node blocks.
                 *
                 * This function is called from
                 * btrfs_pin_extent_for_log_replay() when replaying the log.
                 * Advance the pointer not to overwrite the tree-log nodes.
                 */
                if (block_group->start + block_group->alloc_offset <
                    offset + bytes) {
                        block_group->alloc_offset =
                                offset + bytes - block_group->start;
                }
                return 0;
        }

        spin_lock(&ctl->tree_lock);

again:
        ret = 0;
        /* Nothing (left) to remove. */
        if (!bytes)
                goto out_lock;

        info = tree_search_offset(ctl, offset, 0, 0);
        if (!info) {
                /*
                 * oops didn't find an extent that matched the space we wanted
                 * to remove, look for a bitmap instead
                 */
                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
                                          1, 0);
                if (!info) {
                        /*
                         * If we found a partial bit of our free space in a
                         * bitmap but then couldn't find the other part this may
                         * be a problem, so WARN about it.
                         */
                        WARN_ON(re_search);
                        goto out_lock;
                }
        }

        re_search = false;
        if (!info->bitmap) {
                /* Extent entry: take it out of the trees while it is resized. */
                unlink_free_space(ctl, info, true);
                if (offset == info->offset) {
                        /* Range starts at the entry start: chop off the front. */
                        u64 to_free = min(bytes, info->bytes);

                        info->bytes -= to_free;
                        info->offset += to_free;
                        if (info->bytes) {
                                ret = link_free_space(ctl, info);
                                WARN_ON(ret);
                        } else {
                                kmem_cache_free(btrfs_free_space_cachep, info);
                        }

                        offset += to_free;
                        bytes -= to_free;
                        goto again;
                } else {
                        /* Range starts inside the entry: keep the front part. */
                        u64 old_end = info->bytes + info->offset;
                        enum btrfs_trim_state trim_state;

                        info->bytes = offset - info->offset;
                        ret = link_free_space(ctl, info);
                        WARN_ON(ret);
                        if (ret)
                                goto out_lock;

                        /* Not enough bytes in this entry to satisfy us */
                        if (old_end < offset + bytes) {
                                bytes -= old_end - offset;
                                offset = old_end;
                                goto again;
                        } else if (old_end == offset + bytes) {
                                /* all done */
                                goto out_lock;
                        }

                        /*
                         * @info is linked into the tree again, so once the
                         * lock is dropped it may be merged or freed by someone
                         * else. Snapshot its trim state while still locked
                         * instead of dereferencing it afterwards.
                         */
                        trim_state = info->trim_state;
                        spin_unlock(&ctl->tree_lock);

                        /* Re-add the tail of the old entry behind the range. */
                        ret = __btrfs_add_free_space(block_group,
                                                     offset + bytes,
                                                     old_end - (offset + bytes),
                                                     trim_state);
                        WARN_ON(ret);
                        goto out;
                }
        }

        /* Bitmap entry: -EAGAIN asks for another lookup with updated range. */
        ret = remove_from_bitmap(ctl, info, &offset, &bytes);
        if (ret == -EAGAIN) {
                re_search = true;
                goto again;
        }
out_lock:
        btrfs_discard_update_discardable(block_group);
        spin_unlock(&ctl->tree_lock);
out:
        return ret;
}

/*
 * Log the free space entries of @block_group for debugging.
 *
 * Every entry is printed with its offset, size and whether it is a bitmap,
 * followed by whether the block group has a cluster attached and by the
 * number of entries of at least @bytes bytes (entries are only counted while
 * the block group is not read-only).
 */
void btrfs_dump_free_space(struct btrfs_block_group *block_group,
                           u64 bytes)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct rb_node *node;
        int nr_big_enough = 0;

        /*
         * Zoned btrfs does not use free space tree and cluster. Just print
         * out the free space after the allocation offset.
         */
        if (btrfs_is_zoned(fs_info)) {
                btrfs_info(fs_info, "free space %llu active %d",
                           block_group->zone_capacity - block_group->alloc_offset,
                           test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
                                    &block_group->runtime_flags));
                return;
        }

        spin_lock(&ctl->tree_lock);
        node = rb_first(&ctl->free_space_offset);
        while (node) {
                struct btrfs_free_space *entry;

                entry = rb_entry(node, struct btrfs_free_space, offset_index);
                if (!block_group->ro && entry->bytes >= bytes)
                        nr_big_enough++;
                btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
                           entry->offset, entry->bytes,
                           entry->bitmap ? "yes" : "no");
                node = rb_next(node);
        }
        spin_unlock(&ctl->tree_lock);

        btrfs_info(fs_info, "block group has cluster?: %s",
                   list_empty(&block_group->cluster_list) ? "no" : "yes");
        btrfs_info(fs_info,
                   "%d free space entries at or bigger than %llu bytes",
                   nr_big_enough, bytes);
}

/*
 * Set up @ctl as the free space control structure of @block_group: wire up
 * the back pointer, start offset, allocation unit and ops, and initialise the
 * lock, mutex, list and bytes-indexed tree. Fields not assigned here are left
 * untouched.
 */
void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
                               struct btrfs_free_space_ctl *ctl)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;

        /* Identity of the cache. */
        ctl->block_group = block_group;
        ctl->start = block_group->start;
        ctl->unit = fs_info->sectorsize;
        ctl->op = &free_space_op;

        /* Locks and containers. */
        spin_lock_init(&ctl->tree_lock);
        mutex_init(&ctl->cache_writeout_mutex);
        INIT_LIST_HEAD(&ctl->trimming_ranges);
        ctl->free_space_bytes = RB_ROOT_CACHED;

        /*
         * Free space tracking is meant to use no more than 32k of RAM per
         * block group. Once half of that is taken by extent entries, things
         * should start being converted over to bitmaps.
         */
        ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
}

/*
 * Put every entry of @cluster back into the free space cache of @block_group.
 * If @block_group is not the block group the cluster points to, someone else
 * raced in and already freed the cluster; in that case return without
 * changing anything.
 *
 * The caller must hold ctl->tree_lock; cluster->lock is taken here.
 */
static void __btrfs_return_cluster_to_free_space(
                             struct btrfs_block_group *block_group,
                             struct btrfs_free_cluster *cluster)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct rb_node *cur;
        struct rb_node *next;

        lockdep_assert_held(&ctl->tree_lock);

        spin_lock(&cluster->lock);
        if (cluster->block_group != block_group) {
                spin_unlock(&cluster->lock);
                return;
        }

        /* Detach the cluster from the block group. */
        list_del_init(&cluster->block_group_list);
        cluster->window_start = 0;
        cluster->block_group = NULL;

        for (cur = rb_first(&cluster->root); cur; cur = next) {
                struct btrfs_free_space *entry;

                entry = rb_entry(cur, struct btrfs_free_space, offset_index);
                /* Fetch the successor first; the current node is erased next. */
                next = rb_next(cur);
                rb_erase(cur, &cluster->root);
                RB_CLEAR_NODE(cur);

                if (!entry->bitmap) {
                        /*
                         * Merging treats extents as if they were new, so take
                         * this one out of the discardable statistics first...
                         */
                        if (!btrfs_free_space_trimmed(entry)) {
                                ctl->discardable_extents[BTRFS_STAT_CURR]--;
                                ctl->discardable_bytes[BTRFS_STAT_CURR] -=
                                        entry->bytes;
                        }

                        try_merge_free_space(ctl, entry, false);
                        steal_from_bitmap(ctl, entry, false);

                        /*
                         * ...and account it again afterwards, since the entry
                         * is inserted directly below.
                         */
                        if (!btrfs_free_space_trimmed(entry)) {
                                ctl->discardable_extents[BTRFS_STAT_CURR]++;
                                ctl->discardable_bytes[BTRFS_STAT_CURR] +=
                                        entry->bytes;
                        }
                }

                /* Back into both indexes of the regular cache. */
                tree_insert_offset(ctl, NULL, entry);
                rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
                              entry_less);
        }
        cluster->root = RB_ROOT;
        spin_unlock(&cluster->lock);

        /* NOTE(review): presumably the reference the cluster held - confirm. */
        btrfs_put_block_group(block_group);
}

void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_cluster *cluster;
        struct list_head *head;

        spin_lock(&ctl->tree_lock);
        while ((head = block_group->cluster_list.next) !=
               &block_group->cluster_list) {
                cluster = list_entry(head, struct btrfs_free_cluster,
                                     block_group_list);

                WARN_ON(cluster->block_group != block_group);
                __btrfs_return_cluster_to_free_space(block_group, cluster);

                cond_resched_lock(&ctl->tree_lock);
        }
        __btrfs_remove_free_space_cache(ctl);
        btrfs_discard_update_discardable(block_group);
        spin_unlock(&ctl->tree_lock);

}

/*
 * Walk the offset-indexed free space rb_tree of @block_group under the tree
 * lock and report whether every entry in it is trimmed.
 *
 * Returns true if no untrimmed entry was found (also for an empty tree),
 * false as soon as one is seen.
 */
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct rb_node *node;
        bool all_trimmed = true;

        spin_lock(&ctl->tree_lock);
        for (node = rb_first(&ctl->free_space_offset); node;
             node = rb_next(node)) {
                struct btrfs_free_space *entry;

                entry = rb_entry(node, struct btrfs_free_space, offset_index);
                if (btrfs_free_space_trimmed(entry))
                        continue;

                /* One untrimmed entry settles it. */
                all_trimmed = false;
                break;
        }
        spin_unlock(&ctl->tree_lock);

        return all_trimmed;
}

/*
 * Find and reserve @bytes of free space in a non-zoned @block_group.
 *
 * The search starts at @offset and looks for @bytes + @empty_size; only
 * @bytes are actually taken out of the entry that was found.
 * @max_extent_size is handed to find_free_space() as an output hint.
 *
 * When the found start lies past the beginning of an extent entry, the gap in
 * front of it is given back to the cache after the lock is dropped.
 *
 * Returns the start of the reserved range, or 0 if nothing was found.
 */
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
                               u64 offset, u64 bytes, u64 empty_size,
                               u64 *max_extent_size)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_discard_ctl *discard_ctl =
                                        &block_group->fs_info->discard_ctl;
        /* Search by size only when starting at the block group start. */
        const bool use_bytes_index = (offset == block_group->start);
        enum btrfs_trim_state gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
        struct btrfs_free_space *found;
        u64 wanted = bytes + empty_size;
        u64 gap_start = 0;
        u64 gap_len = 0;
        u64 ret = 0;

        ASSERT(!btrfs_is_zoned(block_group->fs_info));

        spin_lock(&ctl->tree_lock);
        found = find_free_space(ctl, &offset, &wanted,
                                block_group->full_stripe_len, max_extent_size,
                                use_bytes_index);
        if (found) {
                ret = offset;

                if (found->bitmap) {
                        bitmap_clear_bits(ctl, found, offset, bytes, true);

                        if (!btrfs_free_space_trimmed(found))
                                atomic64_add(bytes,
                                             &discard_ctl->discard_bytes_saved);

                        if (!found->bytes)
                                free_bitmap(ctl, found);
                } else {
                        unlink_free_space(ctl, found, true);

                        /* Remember the unused front part of the extent. */
                        gap_len = offset - found->offset;
                        gap_start = found->offset;
                        gap_trim_state = found->trim_state;

                        if (!btrfs_free_space_trimmed(found))
                                atomic64_add(bytes,
                                             &discard_ctl->discard_bytes_saved);

                        /* What remains of the entry starts after our range. */
                        found->offset = offset + bytes;
                        WARN_ON(found->bytes < bytes + gap_len);

                        found->bytes -= bytes + gap_len;
                        if (found->bytes)
                                link_free_space(ctl, found);
                        else
                                kmem_cache_free(btrfs_free_space_cachep, found);
                }
        }

        btrfs_discard_update_discardable(block_group);
        spin_unlock(&ctl->tree_lock);

        /* Hand the alignment gap back, keeping the trim state it had. */
        if (gap_len)
                __btrfs_add_free_space(block_group, gap_start, gap_len,
                                       gap_trim_state);
        return ret;
}

/*
 * Put all extents of @cluster back into the free space cache.
 *
 * With a non-NULL @block_group the cluster is only freed if it belongs to
 * exactly that block group. With a NULL @block_group the block group the
 * cluster currently points to is used, and nothing happens if there is none.
 *
 * A reference on the block group is held for the duration of the call.
 */
void btrfs_return_cluster_to_free_space(
                               struct btrfs_block_group *block_group,
                               struct btrfs_free_cluster *cluster)
{
        struct btrfs_block_group *owner;
        struct btrfs_free_space_ctl *ctl;

        /* Pin down the block group while the cluster cannot change under us. */
        spin_lock(&cluster->lock);
        owner = cluster->block_group;
        if (!owner || (block_group && owner != block_group)) {
                /*
                 * Either the cluster has no block group, or it is not the
                 * one asked for: someone else already freed it, so don't
                 * redo their work.
                 */
                spin_unlock(&cluster->lock);
                return;
        }
        block_group = owner;
        btrfs_get_block_group(block_group);
        spin_unlock(&cluster->lock);

        ctl = block_group->free_space_ctl;

        /* Give back whatever extents the cluster was holding. */
        spin_lock(&ctl->tree_lock);
        __btrfs_return_cluster_to_free_space(block_group, cluster);
        spin_unlock(&ctl->tree_lock);

        btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);

        /* Drop the reference taken above. */
        btrfs_put_block_group(block_group);
}

/*
 * Try to carve @bytes out of the bitmap entry @entry, searching from
 * @min_start onwards.
 *
 * @cluster is not used here; it is kept so the signature stays unchanged for
 * callers.
 *
 * On failure *@max_extent_size is raised to the largest extent size known for
 * @entry (if that is bigger) and 0 is returned. On success the bits are
 * cleared in the bitmap and the start of the allocated range is returned.
 */
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
                                   struct btrfs_free_cluster *cluster,
                                   struct btrfs_free_space *entry,
                                   u64 bytes, u64 min_start,
                                   u64 *max_extent_size)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        /* search_bitmap() updates both of these in place. */
        u64 search_start = min_start;
        u64 search_bytes = bytes;
        int err;

        err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
        if (err) {
                *max_extent_size = max(get_max_extent_size(entry),
                                       *max_extent_size);
                return 0;
        }

        /* Only @bytes are consumed, whatever size the search reported back. */
        bitmap_clear_bits(ctl, entry, search_start, bytes, false);

        return search_start;
}

/*
 * Given a cluster, try to allocate @bytes from it.
 *
 * Returns 0 if the cluster does not belong to @block_group or holds nothing
 * suitably large, or a logical disk offset if things worked out. While
 * scanning, *@max_extent_size is raised for entries that turned out to be too
 * small.
 */
u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
                             struct btrfs_free_cluster *cluster, u64 bytes,
                             u64 min_start, u64 *max_extent_size)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_discard_ctl *discard_ctl =
                                        &block_group->fs_info->discard_ctl;
        struct btrfs_free_space *entry = NULL;
        struct rb_node *node;
        u64 ret = 0;

        ASSERT(!btrfs_is_zoned(block_group->fs_info));

        spin_lock(&cluster->lock);
        if (bytes > cluster->max_size || cluster->block_group != block_group)
                goto out;

        for (node = rb_first(&cluster->root); node; node = rb_next(node)) {
                entry = rb_entry(node, struct btrfs_free_space, offset_index);

                /* Too small: remember how big it could have been, move on. */
                if (entry->bytes < bytes) {
                        *max_extent_size = max(get_max_extent_size(entry),
                                               *max_extent_size);
                        continue;
                }

                /* Extent entries in front of @min_start are skipped. */
                if (!entry->bitmap && entry->offset < min_start)
                        continue;

                if (!entry->bitmap) {
                        /* Take the range off the front of the extent. */
                        ret = entry->offset;

                        entry->offset += bytes;
                        entry->bytes -= bytes;
                        break;
                }

                ret = btrfs_alloc_from_bitmap(block_group,
                                              cluster, entry, bytes,
                                              cluster->window_start,
                                              max_extent_size);
                if (ret) {
                        cluster->window_start += bytes;
                        break;
                }
                /* This bitmap had no fitting run; try the next entry. */
        }
out:
        spin_unlock(&cluster->lock);

        if (!ret)
                return 0;

        /* Something was allocated from @entry: fix up the ctl accounting. */
        spin_lock(&ctl->tree_lock);

        if (!btrfs_free_space_trimmed(entry))
                atomic64_add(bytes, &discard_ctl->discard_bytes_saved);

        ctl->free_space -= bytes;
        if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
                ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;

        spin_lock(&cluster->lock);
        if (entry->bytes == 0) {
                /* The entry is used up: drop it from the cluster and free it. */
                rb_erase(&entry->offset_index, &cluster->root);
                ctl->free_extents--;
                if (entry->bitmap) {
                        kmem_cache_free(btrfs_free_space_bitmap_cachep,
                                        entry->bitmap);
                        ctl->total_bitmaps--;
                        recalculate_thresholds(ctl);
                } else if (!btrfs_free_space_trimmed(entry)) {
                        ctl->discardable_extents[BTRFS_STAT_CURR]--;
                }
                kmem_cache_free(btrfs_free_space_cachep, entry);
        }

        spin_unlock(&cluster->lock);
        spin_unlock(&ctl->tree_lock);

        return ret;
}

/*
 * Try to build a cluster out of the single bitmap entry @entry.
 *
 * Runs of set bits of at least @min_bytes are collected, starting at @offset
 * (or at the start of the bitmap if @offset lies before it), until at least
 * @bytes have been found in total and the largest run covers @cont1_bytes.
 * On success the entry is taken out of the ctl's offset and bytes trees and
 * inserted via tree_insert_offset() with @cluster.
 *
 * The caller must hold ctl->tree_lock.
 *
 * Returns 0 on success or -ENOSPC if the bitmap cannot satisfy the request.
 * Note that cluster->max_size may already have been modified when -ENOSPC is
 * returned after a first run was found.
 */
static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
                                struct btrfs_free_space *entry,
                                struct btrfs_free_cluster *cluster,
                                u64 offset, u64 bytes,
                                u64 cont1_bytes, u64 min_bytes)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        unsigned long next_zero;
        unsigned long i;
        unsigned long want_bits;
        unsigned long min_bits;
        unsigned long found_bits;
        unsigned long max_bits = 0;
        unsigned long start = 0;
        unsigned long total_found = 0;
        int ret;

        lockdep_assert_held(&ctl->tree_lock);

        /* First bit to look at; never before the start of this bitmap. */
        i = offset_to_bit(entry->offset, ctl->unit,
                          max_t(u64, offset, entry->offset));
        want_bits = bytes_to_bits(bytes, ctl->unit);
        min_bits = bytes_to_bits(min_bytes, ctl->unit);

        /*
         * Don't bother looking for a cluster in this bitmap if it's heavily
         * fragmented.
         */
        if (entry->max_extent_size &&
            entry->max_extent_size < cont1_bytes)
                return -ENOSPC;
again:
        /* Each pass looks for the next run of set bits of at least min_bits. */
        found_bits = 0;
        for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
                next_zero = find_next_zero_bit(entry->bitmap,
                                               BITS_PER_BITMAP, i);
                if (next_zero - i >= min_bits) {
                        found_bits = next_zero - i;
                        if (found_bits > max_bits)
                                max_bits = found_bits;
                        break;
                }
                /* Run too short: remember its length and skip past it. */
                if (next_zero - i > max_bits)
                        max_bits = next_zero - i;
                i = next_zero;
        }

        if (!found_bits) {
                /* Cache the longest run seen so later calls can bail out early. */
                entry->max_extent_size = (u64)max_bits * ctl->unit;
                return -ENOSPC;
        }

        /* The first usable run defines where the cluster window starts. */
        if (!total_found) {
                start = i;
                cluster->max_size = 0;
        }

        total_found += found_bits;

        /* cluster->max_size tracks the largest single run, in bytes. */
        if (cluster->max_size < found_bits * ctl->unit)
                cluster->max_size = found_bits * ctl->unit;

        /* Keep collecting until both the total and the largest run suffice. */
        if (total_found < want_bits || cluster->max_size < cont1_bytes) {
                i = next_zero + 1;
                goto again;
        }

        cluster->window_start = start * ctl->unit + entry->offset;
        /* Unhook the bitmap entry from both trees of the regular cache. */
        rb_erase(&entry->offset_index, &ctl->free_space_offset);
        rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);

        /*
         * We need to know if we're currently on the normal space index when we
         * manipulate the bitmap so that we know we need to remove and re-insert
         * it into the space_index tree.  Clear the bytes_index node here so the
         * bitmap manipulation helpers know not to mess with the space_index
         * until this bitmap entry is added back into the normal cache.
         */
        RB_CLEAR_NODE(&entry->bytes_index);

        /* NOTE(review): presumably links the entry into cluster->root - confirm. */
        ret = tree_insert_offset(ctl, cluster, entry);
        ASSERT(!ret); /* -EEXIST; Logic error */

        trace_btrfs_setup_cluster(block_group, cluster,
                                  total_found * ctl->unit, 1);
        return 0;
}

/*
 * This searches the block group for just extents to fill the cluster with.
 * Try to find a cluster with at least bytes total bytes, at least one
 * extent of cont1_bytes, and other extents of at least min_bytes.
 */
static noinline int
setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
                        struct btrfs_free_cluster *cluster,
                        struct list_head *bitmaps, u64 offset, u64 bytes,
                        u64 cont1_bytes, u64 min_bytes)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *first = NULL;
        struct btrfs_free_space *entry = NULL;
        struct btrfs_free_space *last;
        struct rb_node *node;
        u64 window_free;
        u64 max_extent;
        u64 total_size = 0;

        lockdep_assert_held(&ctl->tree_lock);

        entry = tree_search_offset(ctl, offset, 0, 1);
        if (!entry)
                return -ENOSPC;

        /*
         * We don't want bitmaps, so just move along until we find a normal
         * extent entry.
         */
        while (entry->bitmap || entry->bytes < min_bytes) {
                if (entry->bitmap && list_empty(&entry->list))
                        list_add_tail(&entry->list, bitmaps);
                node = rb_next(&entry->offset_index);
                if (!node)
                        return -ENOSPC;
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
        }

        window_free = entry->bytes;
        max_extent = entry->bytes;
        first = entry;
        last = entry;

        for (node = rb_next(&entry->offset_index); node;
             node = rb_next(&entry->offset_index)) {
                entry = rb_entry(node, struct btrfs_free_space, offset_index);

                if (entry->bitmap) {
                        if (list_empty(&entry->list))
                                list_add_tail(&entry->list, bitmaps);
                        continue;
                }

                if (entry->bytes < min_bytes)
                        continue;

                last = entry;
                window_free += entry->bytes;
                if (entry->bytes > max_extent)
                        max_extent = entry->bytes;
        }

        if (window_free < bytes || max_extent < cont1_bytes)
                return -ENOSPC;

        cluster->window_start = first->offset;

        node = &first->offset_index;

        /*
         * now we've found our entries, pull them out of the free space
         * cache and put them into the cluster rbtree
         */
        do {
                int ret;

                entry = rb_entry(node, struct btrfs_free_space, offset_index);
                node = rb_next(&entry->offset_index);
                if (entry->bitmap || entry->bytes < min_bytes)
                        continue;

                rb_erase(&entry->offset_index, &ctl->free_space_offset);
                rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
                ret = tree_insert_offset(ctl, cluster, entry);
                total_size += entry->bytes;
                ASSERT(!ret); /* -EEXIST; Logic error */
        } while (node && entry != last);

        cluster->max_size = max_extent;
        trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
        return 0;
}

/*
 * Second-chance cluster setup that only considers bitmap entries.  It is
 * meant to run after the extent-only search has failed and has filled
 * @bitmaps with the bitmap entries it walked past.
 *
 * Returns 0 as soon as btrfs_bitmap_cluster() succeeds on one of the listed
 * bitmaps, -ENOSPC otherwise.
 */
static noinline int
setup_cluster_bitmap(struct btrfs_block_group *block_group,
                     struct btrfs_free_cluster *cluster,
                     struct list_head *bitmaps, u64 offset, u64 bytes,
                     u64 cont1_bytes, u64 min_bytes)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        u64 bitmap_offset = offset_to_bitmap(ctl, offset);
        struct btrfs_free_space *head = NULL;
        struct btrfs_free_space *candidate;

        if (!ctl->total_bitmaps)
                return -ENOSPC;

        /*
         * The bitmap covering @offset is only on the list already when
         * @offset happens to be that bitmap's own start offset, so look at
         * the list head and fall back to a tree lookup when it is missing.
         */
        if (!list_empty(bitmaps))
                head = list_first_entry(bitmaps, struct btrfs_free_space, list);

        if (!head || head->offset != bitmap_offset) {
                struct btrfs_free_space *covering;

                covering = tree_search_offset(ctl, bitmap_offset, 1, 0);
                if (covering && list_empty(&covering->list))
                        list_add(&covering->list, bitmaps);
        }

        list_for_each_entry(candidate, bitmaps, list) {
                /* A bitmap with less free space than requested cannot work. */
                if (candidate->bytes < bytes)
                        continue;
                if (!btrfs_bitmap_cluster(block_group, candidate, cluster,
                                          offset, bytes, cont1_bytes,
                                          min_bytes))
                        return 0;
        }

        /*
         * Every bitmap recording free space after @offset is on the list, so
         * there is nothing else left to search.
         */
        return -ENOSPC;
}

/*
 * Try to set up @cluster with free space from @block_group.
 *
 * The target is @bytes + @empty_size of free space, which does not have to
 * be one contiguous range.  Extent entries are tried first, bitmap entries
 * second.
 *
 * Returns 0 if the cluster was set up here, or if it already points at a
 * block group; otherwise -ENOSPC.
 */
int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
                             struct btrfs_free_cluster *cluster,
                             u64 offset, u64 bytes, u64 empty_size)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *cur, *next;
        u64 wanted = bytes + empty_size;
        LIST_HEAD(bitmaps);
        u64 min_bytes;
        u64 cont1_bytes;
        int ret = 0;

        /*
         * Pick the smallest extent we accept into the cluster and the size
         * of the one contiguous extent we insist on:
         *  - SSD_SPREAD: no fragmentation at all,
         *  - metadata:   small extents are fine,
         *  - data:       keep it dense.
         */
        if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
                cont1_bytes = wanted;
                min_bytes = wanted;
        } else {
                min_bytes = fs_info->sectorsize;
                if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA)
                        cont1_bytes = bytes;
                else
                        cont1_bytes = max(bytes, wanted >> 2);
        }

        spin_lock(&ctl->tree_lock);

        /* Not enough free space overall: skip the search entirely. */
        if (ctl->free_space < bytes) {
                spin_unlock(&ctl->tree_lock);
                return -ENOSPC;
        }

        spin_lock(&cluster->lock);

        /* Only do the work if nobody attached a block group already. */
        if (!cluster->block_group) {
                trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
                                         min_bytes);

                ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps,
                                              offset, wanted, cont1_bytes,
                                              min_bytes);
                if (ret)
                        ret = setup_cluster_bitmap(block_group, cluster,
                                                   &bitmaps, offset, wanted,
                                                   cont1_bytes, min_bytes);

                /* Empty the temporary bitmap list again. */
                list_for_each_entry_safe(cur, next, &bitmaps, list)
                        list_del_init(&cur->list);

                if (ret) {
                        trace_btrfs_failed_cluster_setup(block_group);
                } else {
                        btrfs_get_block_group(block_group);
                        list_add_tail(&cluster->block_group_list,
                                      &block_group->cluster_list);
                        cluster->block_group = block_group;
                }
        }

        spin_unlock(&cluster->lock);
        spin_unlock(&ctl->tree_lock);

        return ret;
}

/*
 * Bring a free cluster into its empty state: both locks initialised, an
 * empty extent tree, an empty block group list link and no owning block
 * group.
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
        spin_lock_init(&cluster->lock);
        spin_lock_init(&cluster->refill_lock);

        /* Not attached to any block group yet. */
        INIT_LIST_HEAD(&cluster->block_group_list);
        cluster->block_group = NULL;

        /* No extents held. */
        cluster->root = RB_ROOT;
        cluster->fragmented = false;
        cluster->max_size = 0;
}

/*
 * Discard [@start, @start + @bytes) and put the whole reserved range
 * [@reserved_start, @reserved_start + @reserved_bytes) back into the free
 * space cache.
 *
 * While the discard runs, @reserved_bytes is accounted as reserved in both
 * the block group and its space_info, unless the block group is read-only at
 * that point.  The parts of the reserved range outside the discarded range
 * are re-added with @reserved_trim_state; the discarded range itself is
 * re-added as trimmed on success and untrimmed on failure.
 *
 * @total_trimmed is advanced by the amount btrfs_discard_extent() reports,
 * on success only.  @trim_entry is unlinked from its list under
 * ctl->cache_writeout_mutex.
 *
 * Returns the result of btrfs_discard_extent().
 */
static int do_trimming(struct btrfs_block_group *block_group,
                       u64 *total_trimmed, u64 start, u64 bytes,
                       u64 reserved_start, u64 reserved_bytes,
                       enum btrfs_trim_state reserved_trim_state,
                       struct btrfs_trim_range *trim_entry)
{
        struct btrfs_space_info *space_info = block_group->space_info;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        const u64 trim_end = start + bytes;
        const u64 reserved_end = reserved_start + reserved_bytes;
        enum btrfs_trim_state new_state;
        bool accounted;
        u64 discarded = 0;
        int ret;

        spin_lock(&space_info->lock);
        spin_lock(&block_group->lock);
        accounted = !block_group->ro;
        if (accounted) {
                block_group->reserved += reserved_bytes;
                space_info->bytes_reserved += reserved_bytes;
        }
        spin_unlock(&block_group->lock);
        spin_unlock(&space_info->lock);

        ret = btrfs_discard_extent(fs_info, start, bytes, &discarded);
        if (ret) {
                new_state = BTRFS_TRIM_STATE_UNTRIMMED;
        } else {
                new_state = BTRFS_TRIM_STATE_TRIMMED;
                *total_trimmed += discarded;
        }

        mutex_lock(&ctl->cache_writeout_mutex);
        /* Head of the reserved range that was not part of the discard. */
        if (reserved_start < start)
                __btrfs_add_free_space(block_group, reserved_start,
                                       start - reserved_start,
                                       reserved_trim_state);
        /* Tail of the reserved range that was not part of the discard. */
        if (trim_end < reserved_end)
                __btrfs_add_free_space(block_group, trim_end,
                                       reserved_end - trim_end,
                                       reserved_trim_state);
        __btrfs_add_free_space(block_group, start, bytes, new_state);
        list_del(&trim_entry->list);
        mutex_unlock(&ctl->cache_writeout_mutex);

        if (!accounted)
                return ret;

        /* Undo the temporary reservation taken above. */
        spin_lock(&space_info->lock);
        spin_lock(&block_group->lock);
        /* The block group went read-only while we were discarding. */
        if (block_group->ro)
                space_info->bytes_readonly += reserved_bytes;
        block_group->reserved -= reserved_bytes;
        space_info->bytes_reserved -= reserved_bytes;
        spin_unlock(&block_group->lock);
        spin_unlock(&space_info->lock);

        return ret;
}

/*
 * Trim free space held in extent (non-bitmap) entries of @block_group.
 *
 * The walk starts at @start and stops once the next candidate entry begins
 * at or beyond @end.  Each pass removes one extent entry from the free space
 * tree, records the range on ctl->trimming_ranges and passes it to
 * do_trimming().  Candidates shorter than @minlen are skipped.
 *
 * block_group->discard_cursor follows the walk and is set to the end of the
 * block group when no further candidate exists.
 *
 * If @async is set, then we will trim 1 region and return: already trimmed
 * entries are skipped and the loop exits once *@total_trimmed is non-zero.
 *
 * Returns 0, the error returned by do_trimming(), or -ERESTARTSYS when a
 * fatal signal is pending.
 */
static int trim_no_bitmap(struct btrfs_block_group *block_group,
                          u64 *total_trimmed, u64 start, u64 end, u64 minlen,
                          bool async)
{
        struct btrfs_discard_ctl *discard_ctl =
                                        &block_group->fs_info->discard_ctl;
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *entry;
        struct rb_node *node;
        int ret = 0;
        u64 extent_start;
        u64 extent_bytes;
        enum btrfs_trim_state extent_trim_state;
        u64 bytes;
        /* Read once so the limit is stable for the whole call. */
        const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);

        while (start < end) {
                /* Lives on this stack frame; do_trimming() unlinks it. */
                struct btrfs_trim_range trim_entry;

                mutex_lock(&ctl->cache_writeout_mutex);
                spin_lock(&ctl->tree_lock);

                /* Less free space than @minlen in total: nothing to trim. */
                if (ctl->free_space < minlen)
                        goto out_unlock;

                entry = tree_search_offset(ctl, start, 0, 1);
                if (!entry)
                        goto out_unlock;

                /* Skip bitmaps and if async, already trimmed entries */
                while (entry->bitmap ||
                       (async && btrfs_free_space_trimmed(entry))) {
                        node = rb_next(&entry->offset_index);
                        if (!node)
                                goto out_unlock;
                        entry = rb_entry(node, struct btrfs_free_space,
                                         offset_index);
                }

                if (entry->offset >= end)
                        goto out_unlock;

                /* Snapshot the entry; it is freed or modified below. */
                extent_start = entry->offset;
                extent_bytes = entry->bytes;
                extent_trim_state = entry->trim_state;
                if (async) {
                        /* Async trims whole entries, not clipped to @end. */
                        start = entry->offset;
                        bytes = entry->bytes;
                        if (bytes < minlen) {
                                /* Too small: leave it in the tree, move on. */
                                spin_unlock(&ctl->tree_lock);
                                mutex_unlock(&ctl->cache_writeout_mutex);
                                goto next;
                        }
                        unlink_free_space(ctl, entry, true);
                        /*
                         * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
                         * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
                         * X when we come back around.  So trim it now.
                         */
                        if (max_discard_size &&
                            bytes >= (max_discard_size +
                                      BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
                                /*
                                 * Trim only the first max_discard_size bytes
                                 * and re-link the shrunken remainder.
                                 */
                                bytes = max_discard_size;
                                extent_bytes = max_discard_size;
                                entry->offset += max_discard_size;
                                entry->bytes -= max_discard_size;
                                link_free_space(ctl, entry);
                        } else {
                                kmem_cache_free(btrfs_free_space_cachep, entry);
                        }
                } else {
                        /* Clip the range to trim to [start, end). */
                        start = max(start, extent_start);
                        bytes = min(extent_start + extent_bytes, end) - start;
                        if (bytes < minlen) {
                                spin_unlock(&ctl->tree_lock);
                                mutex_unlock(&ctl->cache_writeout_mutex);
                                goto next;
                        }

                        /*
                         * The whole entry is removed even if only part of it
                         * is trimmed; do_trimming() re-adds the rest.
                         */
                        unlink_free_space(ctl, entry, true);
                        kmem_cache_free(btrfs_free_space_cachep, entry);
                }

                /*
                 * Publish the in-flight range while cache_writeout_mutex is
                 * still held, then drop it before the discard itself.
                 */
                spin_unlock(&ctl->tree_lock);
                trim_entry.start = extent_start;
                trim_entry.bytes = extent_bytes;
                list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
                mutex_unlock(&ctl->cache_writeout_mutex);

                ret = do_trimming(block_group, total_trimmed, start, bytes,
                                  extent_start, extent_bytes, extent_trim_state,
                                  &trim_entry);
                if (ret) {
                        block_group->discard_cursor = start + bytes;
                        break;
                }
next:
                /* Reached with both locks already released. */
                start += bytes;
                block_group->discard_cursor = start;
                if (async && *total_trimmed)
                        break;

                if (fatal_signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }

                cond_resched();
        }

        return ret;

out_unlock:
        /* Nothing more to trim: park the cursor at the block group end. */
        block_group->discard_cursor = btrfs_block_group_end(block_group);
        spin_unlock(&ctl->tree_lock);
        mutex_unlock(&ctl->cache_writeout_mutex);

        return ret;
}

/*
 * Put the bitmap found at @offset back into the untrimmed state.  This is
 * needed when trimming of a bitmap stops before the bitmap has been fully
 * scanned.  If the bitmap had already been marked trimmed, its extents and
 * bytes are added back to the current discardable counters.
 *
 * In a rather contrived case, two trimmers can race on the same bitmap, which
 * is why a bitmap that is already marked trimmed is handled here as well:
 *
 * start = start of bitmap
 * end = near end of bitmap
 *
 * Thread 1:                        Thread 2:
 * trim_bitmaps(start)
 *                                trim_bitmaps(end)
 *                                end_trimming_bitmap()
 * reset_trimming_bitmap()
 */
static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
{
        struct btrfs_free_space *bitmap;

        spin_lock(&ctl->tree_lock);

        bitmap = tree_search_offset(ctl, offset, 1, 0);
        if (!bitmap)
                goto unlock;

        /* Undo the accounting done when the bitmap was marked trimmed. */
        if (btrfs_free_space_trimmed(bitmap)) {
                ctl->discardable_extents[BTRFS_STAT_CURR] +=
                        bitmap->bitmap_extents;
                ctl->discardable_bytes[BTRFS_STAT_CURR] += bitmap->bytes;
        }
        bitmap->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

unlock:
        spin_unlock(&ctl->tree_lock);
}

/*
 * Finish trimming of a bitmap entry: if btrfs_free_space_trimming_bitmap()
 * holds for @entry, mark it trimmed and remove its extents and bytes from the
 * current discardable counters.  Any other entry is left untouched.
 */
static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
                                struct btrfs_free_space *entry)
{
        if (!btrfs_free_space_trimming_bitmap(entry))
                return;

        entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
        ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
        ctl->discardable_extents[BTRFS_STAT_CURR] -= entry->bitmap_extents;
}

/*
 * Trim free space recorded in the bitmap entries of @block_group, working
 * bitmap by bitmap from the one containing @start until @end.
 *
 * @minlen: free regions shorter than this are not trimmed
 * @maxlen: when @async and non-zero, free regions longer than this are
 *          skipped
 *
 * A region to trim is cleared from its bitmap, recorded on
 * ctl->trimming_ranges and passed to do_trimming().  A bitmap that is scanned
 * from its very start is marked TRIMMING and, at the end of the scan, either
 * marked trimmed or reset to untrimmed.  block_group->discard_cursor follows
 * the scan.
 *
 * If @async is set, then we will trim 1 region and return: the function
 * leaves once *@total_trimmed is non-zero and another region has been found.
 *
 * Returns 0, the error returned by do_trimming(), or -ERESTARTSYS when a
 * fatal signal is pending.
 */
static int trim_bitmaps(struct btrfs_block_group *block_group,
                        u64 *total_trimmed, u64 start, u64 end, u64 minlen,
                        u64 maxlen, bool async)
{
        struct btrfs_discard_ctl *discard_ctl =
                                        &block_group->fs_info->discard_ctl;
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *entry;
        int ret = 0;
        int ret2;
        u64 bytes;
        /* Start offset of the bitmap currently being scanned. */
        u64 offset = offset_to_bitmap(ctl, start);
        /* Read once so the limit is stable for the whole call. */
        const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);

        while (offset < end) {
                bool next_bitmap = false;
                /* Lives on this stack frame; do_trimming() unlinks it. */
                struct btrfs_trim_range trim_entry;

                mutex_lock(&ctl->cache_writeout_mutex);
                spin_lock(&ctl->tree_lock);

                if (ctl->free_space < minlen) {
                        block_group->discard_cursor =
                                btrfs_block_group_end(block_group);
                        spin_unlock(&ctl->tree_lock);
                        mutex_unlock(&ctl->cache_writeout_mutex);
                        break;
                }

                entry = tree_search_offset(ctl, offset, 1, 0);
                /*
                 * Bitmaps are marked trimmed lossily now to prevent constant
                 * discarding of the same bitmap (the reason why we are bound
                 * by the filters).  So, retrim the block group bitmaps when we
                 * are preparing to punt to the unused_bgs list.  This uses
                 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
                 * which is the only discard index which sets minlen to 0.
                 */
                if (!entry || (async && minlen && start == offset &&
                               btrfs_free_space_trimmed(entry))) {
                        spin_unlock(&ctl->tree_lock);
                        mutex_unlock(&ctl->cache_writeout_mutex);
                        next_bitmap = true;
                        goto next;
                }

                /*
                 * Async discard bitmap trimming begins by setting the start
                 * to be key.objectid and the offset_to_bitmap() aligns to the
                 * start of the bitmap.  This lets us know we are fully
                 * scanning the bitmap rather than only some portion of it.
                 */
                if (start == offset)
                        entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;

                /*
                 * NOTE(review): search_bitmap() is defined outside this view;
                 * presumably it looks for a free run of at least @bytes at or
                 * after @start and updates both on success - confirm.
                 */
                bytes = minlen;
                ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
                if (ret2 || start >= end) {
                        /*
                         * We lossily consider a bitmap trimmed if we only skip
                         * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
                         */
                        if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
                                end_trimming_bitmap(ctl, entry);
                        else
                                entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
                        spin_unlock(&ctl->tree_lock);
                        mutex_unlock(&ctl->cache_writeout_mutex);
                        next_bitmap = true;
                        goto next;
                }

                /*
                 * We already trimmed a region, but are using the locking above
                 * to reset the trim_state.
                 */
                if (async && *total_trimmed) {
                        spin_unlock(&ctl->tree_lock);
                        mutex_unlock(&ctl->cache_writeout_mutex);
                        /* Skips the discard_cursor update after the loop. */
                        goto out;
                }

                /* Never trim past @end. */
                bytes = min(bytes, end - start);
                if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
                        /* Region is filtered out; step over it. */
                        spin_unlock(&ctl->tree_lock);
                        mutex_unlock(&ctl->cache_writeout_mutex);
                        goto next;
                }

                /*
                 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
                 * If X < @minlen, we won't trim X when we come back around.
                 * So trim it now.  We differ here from trimming extents as we
                 * don't keep individual state per bit.
                 */
                if (async &&
                    max_discard_size &&
                    bytes > (max_discard_size + minlen))
                        bytes = max_discard_size;

                /* Take the region out of the bitmap while it is discarded. */
                bitmap_clear_bits(ctl, entry, start, bytes, true);
                if (entry->bytes == 0)
                        free_bitmap(ctl, entry);

                /*
                 * Publish the in-flight range while cache_writeout_mutex is
                 * still held, then drop it before the discard itself.
                 */
                spin_unlock(&ctl->tree_lock);
                trim_entry.start = start;
                trim_entry.bytes = bytes;
                list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
                mutex_unlock(&ctl->cache_writeout_mutex);

                /*
                 * The reserved range equals the trimmed range here, so
                 * do_trimming() has no head or tail to re-add and the
                 * reserved trim state argument (0) goes unused.
                 */
                ret = do_trimming(block_group, total_trimmed, start, bytes,
                                  start, bytes, 0, &trim_entry);
                if (ret) {
                        reset_trimming_bitmap(ctl, offset);
                        block_group->discard_cursor =
                                btrfs_block_group_end(block_group);
                        break;
                }
next:
                /* Reached with both locks already released. */
                if (next_bitmap) {
                        offset += BITS_PER_BITMAP * ctl->unit;
                        start = offset;
                } else {
                        start += bytes;
                }
                block_group->discard_cursor = start;

                if (fatal_signal_pending(current)) {
                        /* Stopped mid-bitmap: it must not stay TRIMMING. */
                        if (start != offset)
                                reset_trimming_bitmap(ctl, offset);
                        ret = -ERESTARTSYS;
                        break;
                }

                cond_resched();
        }

        if (offset >= end)
                block_group->discard_cursor = end;

out:
        return ret;
}

/*
 * Synchronously trim both the extent entries and the bitmap entries of
 * @block_group within [@start, @end), ignoring regions shorter than @minlen.
 *
 * *@trimmed is reset to 0 and then accumulates what was discarded.  A block
 * group that is already flagged as removed is left alone.  The block group
 * is frozen for the duration of the trim.
 *
 * Returns 0 for a removed block group, otherwise the result of the extent
 * pass if it failed, else the result of the bitmap pass.
 */
int btrfs_trim_block_group(struct btrfs_block_group *block_group,
                           u64 *trimmed, u64 start, u64 end, u64 minlen)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        u64 tail = 0;
        bool removed;
        int ret;

        ASSERT(!btrfs_is_zoned(block_group->fs_info));

        *trimmed = 0;

        spin_lock(&block_group->lock);
        removed = test_bit(BLOCK_GROUP_FLAG_REMOVED,
                           &block_group->runtime_flags);
        if (!removed)
                btrfs_freeze_block_group(block_group);
        spin_unlock(&block_group->lock);

        if (removed)
                return 0;

        ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
        if (!ret) {
                ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0,
                                   false);
                /*
                 * @end falling inside a bitmap means that bitmap was only
                 * partially scanned, so its trimming flag has to be reset.
                 */
                div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &tail);
                if (tail)
                        reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
        }

        btrfs_unfreeze_block_group(block_group);
        return ret;
}

/*
 * Trim only the extent (non-bitmap) entries of @block_group.  Thin wrapper
 * around trim_no_bitmap() that resets *@trimmed, skips block groups flagged
 * as removed and keeps the block group frozen while trimming.
 *
 * Returns 0 for a removed block group, otherwise what trim_no_bitmap()
 * returned.
 */
int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
                                   u64 *trimmed, u64 start, u64 end, u64 minlen,
                                   bool async)
{
        bool removed;
        int ret;

        *trimmed = 0;

        spin_lock(&block_group->lock);
        removed = test_bit(BLOCK_GROUP_FLAG_REMOVED,
                           &block_group->runtime_flags);
        if (!removed)
                btrfs_freeze_block_group(block_group);
        spin_unlock(&block_group->lock);

        if (removed)
                return 0;

        ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
        btrfs_unfreeze_block_group(block_group);

        return ret;
}

/*
 * Trim only the bitmap entries of @block_group.  Thin wrapper around
 * trim_bitmaps() that resets *@trimmed, skips block groups flagged as
 * removed and keeps the block group frozen while trimming.
 *
 * Returns 0 for a removed block group, otherwise what trim_bitmaps()
 * returned.
 */
int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
                                   u64 *trimmed, u64 start, u64 end, u64 minlen,
                                   u64 maxlen, bool async)
{
        bool removed;
        int ret;

        *trimmed = 0;

        spin_lock(&block_group->lock);
        removed = test_bit(BLOCK_GROUP_FLAG_REMOVED,
                           &block_group->runtime_flags);
        if (!removed)
                btrfs_freeze_block_group(block_group);
        spin_unlock(&block_group->lock);

        if (removed)
                return 0;

        ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
                           async);
        btrfs_unfreeze_block_group(block_group);

        return ret;
}

/*
 * Report whether free space cache v1 is in use, i.e. whether the cache
 * generation stored in the super block copy is non-zero.
 */
bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
        return btrfs_super_cache_generation(fs_info->super_copy) != 0;
}

/*
 * Remove the free space inode of every block group in the block group cache
 * tree, as part of @trans.
 *
 * Returns 0 on success, or the first error reported by
 * btrfs_remove_free_space_inode(); the walk stops at that point.
 */
static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
                                       struct btrfs_trans_handle *trans)
{
        struct rb_node *node;

        btrfs_info(fs_info, "cleaning free space cache v1");

        for (node = rb_first_cached(&fs_info->block_group_cache_tree); node;
             node = rb_next(node)) {
                struct btrfs_block_group *bg;
                int ret;

                bg = rb_entry(node, struct btrfs_block_group, cache_node);
                ret = btrfs_remove_free_space_inode(trans, NULL, bg);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * Switch free space cache v1 on (@active) or off (!@active).
 *
 * Turning it off removes all free space inodes first.  Either way a
 * transaction is committed; if the removal fails the transaction is aborted
 * and ended instead.
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
        struct btrfs_trans_handle *trans;
        int ret;

        /*
         * update_super_roots will appropriately set or unset
         * super_copy->cache_generation based on SPACE_CACHE and
         * BTRFS_FS_CLEANUP_SPACE_CACHE_V1.  That is why a transaction commit
         * is needed in both directions: when enabling v1 with nothing else
         * to do, and when disabling it and removing the free space inodes.
         */
        trans = btrfs_start_transaction(fs_info->tree_root, 0);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        if (active) {
                ret = btrfs_commit_transaction(trans);
        } else {
                set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
                ret = cleanup_free_space_cache_v1(fs_info, trans);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        btrfs_end_transaction(trans);
                } else {
                        ret = btrfs_commit_transaction(trans);
                }
        }

        /* The cleanup flag is always cleared again on the way out. */
        clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);

        return ret;
}

/*
 * Create the two slab caches used by this file: one for struct
 * btrfs_free_space entries and one for page-sized, page-aligned bitmaps.
 *
 * Returns 0 on success or -ENOMEM; on failure no cache is left behind.
 */
int __init btrfs_free_space_init(void)
{
        btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0);
        if (!btrfs_free_space_cachep)
                return -ENOMEM;

        btrfs_free_space_bitmap_cachep =
                kmem_cache_create("btrfs_free_space_bitmap",
                                  PAGE_SIZE, PAGE_SIZE, 0, NULL);
        if (btrfs_free_space_bitmap_cachep)
                return 0;

        /* Second cache failed: undo the first one. */
        kmem_cache_destroy(btrfs_free_space_cachep);
        return -ENOMEM;
}

/*
 * Destroy the slab caches for free space entries and for free space bitmaps.
 */
void __cold btrfs_free_space_exit(void)
{
        kmem_cache_destroy(btrfs_free_space_cachep);
        kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity-test helper: add free space as exactly the kind of entry requested.
 *
 * Use this if you need to make a bitmap or extent entry specifically.  It
 * doesn't do any of the merging that add_free_space does; it acts a lot like
 * the free space cache loading code, so you can get really weird
 * configurations.
 *
 * @bitmap selects a bitmap entry (true) or an extent entry (false) for
 * [@offset, @offset + @bytes).
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, or the error from
 * link_free_space() for an extent entry.
 */
int test_add_free_space_entry(struct btrfs_block_group *cache,
                              u64 offset, u64 bytes, bool bitmap)
{
        struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
        struct btrfs_free_space *info = NULL, *bitmap_info;
        void *map = NULL;
        enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
        u64 bytes_added;
        int ret;

again:
        /* Allocate a spare entry unless one is left over from the last pass. */
        if (!info) {
                info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
                if (!info)
                        return -ENOMEM;
        }

        /* Extent entry: link it as is and finish. */
        if (!bitmap) {
                spin_lock(&ctl->tree_lock);
                info->offset = offset;
                info->bytes = bytes;
                info->max_extent_size = 0;
                ret = link_free_space(ctl, info);
                spin_unlock(&ctl->tree_lock);
                if (ret)
                        kmem_cache_free(btrfs_free_space_cachep, info);
                return ret;
        }

        /* Allocate a spare bitmap unless one is left over from the last pass. */
        if (!map) {
                map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
                if (!map) {
                        kmem_cache_free(btrfs_free_space_cachep, info);
                        return -ENOMEM;
                }
        }

        spin_lock(&ctl->tree_lock);
        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
                                         1, 0);
        if (!bitmap_info) {
                /*
                 * No bitmap covers @offset yet: turn the spare entry and map
                 * into one.  Both are cleared here so they are neither reused
                 * nor freed below.
                 */
                info->bitmap = map;
                map = NULL;
                add_new_bitmap(ctl, info, offset);
                bitmap_info = info;
                info = NULL;
        }

        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
                                          trim_state);

        /* Whatever was not added in this pass is retried at 'again'. */
        bytes -= bytes_added;
        offset += bytes_added;
        spin_unlock(&ctl->tree_lock);

        if (bytes)
                goto again;

        /* Release the spares that were not consumed. */
        if (info)
                kmem_cache_free(btrfs_free_space_cachep, info);
        if (map)
                kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
        return 0;
}

/*
 * Checks to see if the given range is in the free space cache.  This is really
 * just used to check the absence of space, so if there is free space in the
 * range at all we will return 1.
 *
 * @cache:  block group whose free_space_ctl is searched
 * @offset: start of the range to look for
 * @bytes:  length of the range
 *
 * The whole lookup runs with ctl->tree_lock held.
 *
 * Returns 1 if one of the checks below found free space, 0 otherwise.
 */
int test_check_exists(struct btrfs_block_group *cache,
                      u64 offset, u64 bytes)
{
        struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
        struct btrfs_free_space *info;
        int ret = 0;

        spin_lock(&ctl->tree_lock);
        /*
         * Look for an entry at @offset first.  If there is none, retry at
         * offset_to_bitmap(ctl, offset) with the third argument set to 1
         * (presumably "bitmap entries only" -- confirm against
         * tree_search_offset()).
         */
        info = tree_search_offset(ctl, offset, 0, 0);
        if (!info) {
                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
                                          1, 0);
                if (!info)
                        goto out;
        }

        /*
         * Re-entered via goto each time a neighbouring entry that may overlap
         * the range is selected below.
         */
have_info:
        if (info->bitmap) {
                u64 bit_off, bit_bytes;
                struct rb_node *n;
                struct btrfs_free_space *tmp;

                /*
                 * search_bitmap() is given @offset and a length of one
                 * ctl->unit by reference and may update both; a zero return
                 * is treated as "found" (presumably a free run starting at
                 * the updated bit_off -- confirm against search_bitmap()).
                 */
                bit_off = offset;
                bit_bytes = ctl->unit;
                ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
                if (!ret) {
                        /* Hit if the result starts at @offset or inside the range. */
                        if (bit_off == offset) {
                                ret = 1;
                                goto out;
                        } else if (bit_off > offset &&
                                   offset + bytes > bit_off) {
                                ret = 1;
                                goto out;
                        }
                }

                /*
                 * Walk the entries before this one.  Stop at the first entry
                 * that ends before @offset, skip entries that start beyond
                 * the end of the range, and re-run the checks on anything
                 * else.
                 */
                n = rb_prev(&info->offset_index);
                while (n) {
                        tmp = rb_entry(n, struct btrfs_free_space,
                                       offset_index);
                        if (tmp->offset + tmp->bytes < offset)
                                break;
                        if (offset + bytes < tmp->offset) {
                                n = rb_prev(&tmp->offset_index);
                                continue;
                        }
                        info = tmp;
                        goto have_info;
                }

                /*
                 * Same walk in the forward direction: stop at the first entry
                 * that starts beyond the end of the range, skip entries that
                 * end before @offset.
                 */
                n = rb_next(&info->offset_index);
                while (n) {
                        tmp = rb_entry(n, struct btrfs_free_space,
                                       offset_index);
                        if (offset + bytes < tmp->offset)
                                break;
                        if (tmp->offset + tmp->bytes < offset) {
                                n = rb_next(&tmp->offset_index);
                                continue;
                        }
                        info = tmp;
                        goto have_info;
                }

                /* ret may still hold a non-zero search_bitmap() result. */
                ret = 0;
                goto out;
        }

        /*
         * Entry without a bitmap: a hit if it starts exactly at @offset or
         * if @offset lies strictly inside it.
         */
        if (info->offset == offset) {
                ret = 1;
                goto out;
        }

        if (offset > info->offset && offset < info->offset + info->bytes)
                ret = 1;
out:
        spin_unlock(&ctl->tree_lock);
        return ret;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */





























    1 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
// SPDX-License-Identifier: GPL-2.0

/*
 * Ties a task to an io_ring_ctx.
 * NOTE(review): which list ctx_node is linked on is not visible in this
 * header -- confirm at the users of this struct.
 */
struct io_tctx_node {
        /* list linkage for this node */
        struct list_head        ctx_node;
        /* task this node belongs to */
        struct task_struct        *task;
        /* ring context this node refers to */
        struct io_ring_ctx        *ctx;
};

/*
 * Task context (tctx) helpers.  These are declarations only; the
 * definitions are not part of this header.
 */
int io_uring_alloc_task_context(struct task_struct *task,
                                struct io_ring_ctx *ctx);
void io_uring_del_tctx_node(unsigned long index);
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx);
/* Called by io_uring_add_tctx_node() below when its fast path does not apply. */
int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx);
void io_uring_clean_tctx(struct io_uring_task *tctx);

/*
 * Ring fd registration.  @__arg is a user-space pointer (__user);
 * presumably @nr_args is the number of entries it refers to -- confirm
 * against the definitions.
 */
void io_uring_unreg_ringfd(void);
int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
                       unsigned nr_args);
int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
                         unsigned nr_args);

/*
 * Note that the current task has used io_uring through @ctx; this is used
 * for cancelation purposes.
 *
 * Fast path: when current already has an io_uring task context whose ->last
 * is @ctx, there is nothing to do and 0 is returned.  In every other case
 * the result of __io_uring_add_tctx_node_from_submit() is returned.
 */
static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
        struct io_uring_task *cur_tctx = current->io_uring;

        if (unlikely(!cur_tctx || cur_tctx->last != ctx))
                return __io_uring_add_tctx_node_from_submit(ctx);

        return 0;
}














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    2 

























































    1 
    1 



























































































































































































































































































































































































































































































































































    4 



    4 





































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
// SPDX-License-Identifier: GPL-2.0+
/*
 * User-space Probes (UProbes)
 *
 * Copyright (C) IBM Corporation, 2008-2012
 * Authors:
 *        Srikar Dronamraju
 *        Jim Keniston
 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
 */

#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>        /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h>                /* anon_vma_prepare */
#include <linux/mmu_notifier.h>
#include <linux/swap.h>                /* folio_free_swap */
#include <linux/ptrace.h>        /* user_enable_single_step */
#include <linux/kdebug.h>        /* notifier mechanism */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>

#include <linux/uprobes.h>

/* Number of UPROBE_XOL_SLOT_BYTES sized instruction slots in one page. */
#define UINSNS_PER_PAGE                        (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
/* Slot limit: exactly one page worth of slots. */
#define MAX_UPROBE_XOL_SLOTS                UINSNS_PER_PAGE

/* Root of the uprobes rb-tree; no_uprobe_events() tests it for emptiness. */
static struct rb_root uprobes_tree = RB_ROOT;
/*
 * allows us to skip the uprobe_mmap if there are no uprobe events active
 * at this time.  Probably a fine grained per inode count is better?
 */
#define no_uprobe_events()        RB_EMPTY_ROOT(&uprobes_tree)

static DEFINE_RWLOCK(uprobes_treelock);        /* serialize rbtree access */

/* Number of mutexes in the uprobes_mmap_mutex[] hash table. */
#define UPROBES_HASH_SZ        13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
/* Select the mutex for @v: its value as unsigned long, modulo the table size. */
#define uprobes_mmap_hash(v)        (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])

DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);

/*
 * Have a copy of original instruction
 * NOTE(review): presumably a bit number used with uprobe->flags -- confirm
 * at the users.
 */
#define UPROBE_COPY_INSN        0

/*
 * One probe point, identified by an inode and an offset within it.
 */
struct uprobe {
        struct rb_node                rb_node;        /* node in the rb tree */
        /* reference count of this uprobe */
        refcount_t                ref;
        /*
         * NOTE(review): the users of the two semaphores below are not
         * visible near this definition; presumably register_rwsem
         * serializes (un)registration and consumer_rwsem guards
         * ->consumers -- confirm.
         */
        struct rw_semaphore        register_rwsem;
        struct rw_semaphore        consumer_rwsem;
        /* list linkage, serialized by the uprobes_mmap_mutex[] hash */
        struct list_head        pending_list;
        struct uprobe_consumer        *consumers;
        struct inode                *inode;                /* Also hold a ref to inode */
        /* offset of the probed instruction within @inode */
        loff_t                        offset;
        /* offset of the reference counter within @inode; 0 means none */
        loff_t                        ref_ctr_offset;
        /* presumably bit flags such as UPROBE_COPY_INSN -- confirm */
        unsigned long                flags;

        /*
         * The generic code assumes that it has two members of unknown type
         * owned by the arch-specific code:
         *
         *         insn -        copy_insn() saves the original instruction here for
         *                arch_uprobe_analyze_insn().
         *
         *        ixol -        potentially modified instruction to execute out of
         *                line, copied to xol_area by xol_get_insn_slot().
         */
        struct arch_uprobe        arch;
};

/*
 * A (uprobe, mm) pair queued on delayed_uprobe_list.  update_ref_ctr() adds
 * an entry when a reference counter increment is requested but no suitable
 * vma for the counter exists in @mm, and removes matching entries on
 * decrement.
 */
struct delayed_uprobe {
        struct list_head list;
        struct uprobe *uprobe;
        struct mm_struct *mm;
};

/* Held by update_ref_ctr() around additions to and removals from the list. */
static DEFINE_MUTEX(delayed_uprobe_lock);
/* All currently queued struct delayed_uprobe entries. */
static LIST_HEAD(delayed_uprobe_list);

/*
 * Execute out of line area: anonymous executable mapping installed
 * by the probed task to execute the copy of the original instruction
 * mangled by set_swbp().
 *
 * On a breakpoint hit, thread contests for a slot.  It frees the
 * slot after singlestep. Currently a fixed number of slots are
 * allocated.
 */
struct xol_area {
        wait_queue_head_t                 wq;                /* if all slots are busy */
        atomic_t                         slot_count;        /* number of in-use slots */
        unsigned long                         *bitmap;        /* 0 = free slot */

        /*
         * NOTE(review): presumably the special mapping and the page(s)
         * backing the area; the setup code is not next to this definition
         * -- confirm there.
         */
        struct vm_special_mapping        xol_mapping;
        struct page                         *pages[2];
        /*
         * We keep the vma's vm_start rather than a pointer to the vma
         * itself.  The probed process or a naughty kernel module could make
         * the vma go away, and we must handle that reasonably gracefully.
         */
        unsigned long                         vaddr;                /* Page(s) of instruction slots */
};

/*
 * valid_vma - check whether a vma may hold a uprobe.
 * @vma:         vma to test.
 * @is_register: true when called while registering a probe.
 *
 * The vma has to be file backed, must have VM_MAYEXEC set and must have
 * neither VM_HUGETLB nor VM_MAYSHARE set.  While registering, VM_WRITE has
 * to be clear as well; this is relaxed on unregister because vm_flags might
 * have changed after the breakpoint was inserted.
 *
 * Returns true if @vma passes these checks.
 */
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
        vm_flags_t mask;

        if (!vma->vm_file)
                return false;

        mask = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
        if (is_register)
                mask |= VM_WRITE;

        /* Of all bits in the mask, only VM_MAYEXEC may be set. */
        return (vma->vm_flags & mask) == VM_MAYEXEC;
}

/*
 * Translate file offset @offset into the virtual address it has in @vma.
 * The result is not checked against the bounds of @vma.
 */
static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
{
        /* File offset that is mapped at vma->vm_start. */
        loff_t vma_file_start = (loff_t)vma->vm_pgoff << PAGE_SHIFT;

        return vma->vm_start + offset - vma_file_start;
}

static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
{
        return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}

/**
 * __replace_page - replace page in vma by new page.
 * based on replace_page in mm/ksm.c
 *
 * @vma:      vma that holds the pte pointing to page
 * @addr:     address the old @page is mapped at
 * @old_page: the page we are replacing by new_page
 * @new_page: the modified page we replace page by
 *
 * If @new_page is NULL, only unmap @old_page.
 *
 * On success one reference on the folio of @old_page is dropped with
 * folio_put().
 *
 * Returns 0 on success, negative error code otherwise: the error from
 * mem_cgroup_charge() if charging @new_page fails, or -EAGAIN if @old_page
 * is not found mapped at @addr.
 */
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
                                struct page *old_page, struct page *new_page)
{
        struct folio *old_folio = page_folio(old_page);
        struct folio *new_folio;
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
        int err;
        struct mmu_notifier_range range;

        /* The notifier range covers exactly the one page at @addr. */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
                                addr + PAGE_SIZE);

        /* Charge the new page first; on failure nothing has been locked yet. */
        if (new_page) {
                new_folio = page_folio(new_page);
                err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
                if (err)
                        return err;
        }

        /* For folio_free_swap() below */
        folio_lock(old_folio);

        mmu_notifier_invalidate_range_start(&range);
        /* -EAGAIN is reported when the walk does not find old_page mapped. */
        err = -EAGAIN;
        if (!page_vma_mapped_walk(&pvmw))
                goto unlock;
        VM_BUG_ON_PAGE(addr != pvmw.address, old_page);

        if (new_page) {
                /* Take a reference and set the new folio up as an anonymous page. */
                folio_get(new_folio);
                folio_add_new_anon_rmap(new_folio, vma, addr);
                folio_add_lru_vma(new_folio, vma);
        } else
                /* no new page, just dec_mm_counter for old_page */
                dec_mm_counter(mm, MM_ANONPAGES);

        /*
         * A non-anonymous old folio was accounted as a file page: move that
         * accounting to MM_ANONPAGES.  Together with the decrement above
         * this leaves MM_ANONPAGES unchanged when there is no new page.
         */
        if (!folio_test_anon(old_folio)) {
                dec_mm_counter(mm, mm_counter_file(old_folio));
                inc_mm_counter(mm, MM_ANONPAGES);
        }

        /* Clear the old pte, then install the new one if there is a new page. */
        flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
        ptep_clear_flush(vma, addr, pvmw.pte);
        if (new_page)
                set_pte_at(mm, addr, pvmw.pte,
                           mk_pte(new_page, vma->vm_page_prot));

        /* Drop the old mapping; free its swap once nothing maps the folio. */
        folio_remove_rmap_pte(old_folio, old_page, vma);
        if (!folio_mapped(old_folio))
                folio_free_swap(old_folio);
        page_vma_mapped_walk_done(&pvmw);
        folio_put(old_folio);

        err = 0;
 unlock:
        mmu_notifier_invalidate_range_end(&range);
        folio_unlock(old_folio);
        return err;
}

/**
 * is_swbp_insn - test whether an instruction is the uprobes breakpoint.
 * @insn: pointer to the instruction to test.
 *
 * Weak default implementation: compares the instruction against
 * UPROBE_SWBP_INSN.
 *
 * Returns true if @insn is a breakpoint instruction.
 */
bool __weak is_swbp_insn(uprobe_opcode_t *insn)
{
        uprobe_opcode_t opcode = *insn;

        return opcode == UPROBE_SWBP_INSN;
}

/**
 * is_trap_insn - check if instruction is breakpoint instruction.
 * @insn: instruction to be checked.
 * Default implementation of is_trap_insn
 * Returns true if @insn is a breakpoint instruction.
 *
 * This function is needed for the case where an architecture has multiple
 * trap instructions (like powerpc).
 *
 * The weak default below simply defers to is_swbp_insn().
 */
bool __weak is_trap_insn(uprobe_opcode_t *insn)
{
        return is_swbp_insn(insn);
}

/*
 * Copy @len bytes out of @page into @dst.  The source starts at the offset
 * that @vaddr has within its page; @page is mapped with kmap_atomic() for
 * the duration of the copy.
 */
static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
        unsigned long page_off = vaddr & ~PAGE_MASK;
        void *mapped;

        mapped = kmap_atomic(page);
        memcpy(dst, mapped + page_off, len);
        kunmap_atomic(mapped);
}

/*
 * Copy @len bytes from @src into @page.  The destination starts at the
 * offset that @vaddr has within its page; @page is mapped with kmap_atomic()
 * for the duration of the copy.
 */
static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
{
        unsigned long page_off = vaddr & ~PAGE_MASK;
        void *mapped;

        mapped = kmap_atomic(page);
        memcpy(mapped + page_off, src, len);
        kunmap_atomic(mapped);
}

/*
 * verify_opcode - decide whether writing @new_opcode at @vaddr is needed.
 * @page:       page containing the instruction.
 * @vaddr:      virtual address of the instruction.
 * @new_opcode: opcode the caller intends to write.
 *
 * Returns 0 when the page is already in the requested state: either a
 * breakpoint is to be installed and one is present (register: already
 * installed), or a breakpoint is to be removed and none is present
 * (unregister: not changed by us).  Returns 1 when the write is needed.
 */
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
{
        uprobe_opcode_t cur_opcode;
        bool have_swbp, want_swbp;

        /*
         * Note: the current contents are only tested for UPROBE_SWBP_INSN.
         * Other 'trap variants', such as the conditional trap instructions
         * powerpc supports, are deliberately not checked for.
         *
         * It does not matter whether the underlying instruction is a trap
         * variant; uprobes always wins over any other (gdb) breakpoint.
         */
        copy_from_page(page, vaddr, &cur_opcode, UPROBE_SWBP_INSN_SIZE);
        have_swbp = is_swbp_insn(&cur_opcode);
        want_swbp = is_swbp_insn(new_opcode);

        /* Present and requested state agree: nothing to write. */
        if (have_swbp == want_swbp)
                return 0;

        return 1;
}

static struct delayed_uprobe *
delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct delayed_uprobe *du;

        list_for_each_entry(du, &delayed_uprobe_list, list)
                if (du->uprobe == uprobe && du->mm == mm)
                        return du;
        return NULL;
}

/*
 * Queue the pair (@uprobe, @mm) on delayed_uprobe_list unless it is already
 * queued.
 *
 * Returns 0 if the pair is on the list afterwards, -ENOMEM if a new entry
 * could not be allocated.
 */
static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct delayed_uprobe *entry;

        if (!delayed_uprobe_check(uprobe, mm)) {
                entry = kzalloc(sizeof(*entry), GFP_KERNEL);
                if (!entry)
                        return -ENOMEM;

                entry->mm = mm;
                entry->uprobe = uprobe;
                list_add(&entry->list, &delayed_uprobe_list);
        }

        return 0;
}

/*
 * Unlink @du from the list it is on and free it.  A NULL @du triggers a
 * warning and is otherwise ignored.
 */
static void delayed_uprobe_delete(struct delayed_uprobe *du)
{
        if (!WARN_ON(!du)) {
                list_del(&du->list);
                kfree(du);
        }
}

/*
 * Delete every entry on delayed_uprobe_list that matches @uprobe and @mm.
 * A NULL @uprobe or a NULL @mm matches any value of that field; when both
 * are NULL nothing is removed.
 */
static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
{
        struct delayed_uprobe *du;
        struct list_head *cur, *next;

        if (!(uprobe || mm))
                return;

        /* Entries are freed while walking, hence the _safe iterator. */
        list_for_each_safe(cur, next, &delayed_uprobe_list) {
                du = list_entry(cur, struct delayed_uprobe, list);

                if ((!uprobe || du->uprobe == uprobe) &&
                    (!mm || du->mm == mm))
                        delayed_uprobe_delete(du);
        }
}

/*
 * Check whether @vma is a mapping that contains the reference counter of
 * @uprobe.
 *
 * This requires that the uprobe has a non-zero ref_ctr_offset, that @vma
 * maps the uprobe's inode, that it has VM_WRITE set and VM_SHARED clear,
 * and that the counter's address falls inside [vm_start, vm_end).
 */
static bool valid_ref_ctr_vma(struct uprobe *uprobe,
                              struct vm_area_struct *vma)
{
        unsigned long ctr_vaddr;

        if (!uprobe->ref_ctr_offset)
                return false;

        if (!vma->vm_file || file_inode(vma->vm_file) != uprobe->inode)
                return false;

        if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) != VM_WRITE)
                return false;

        ctr_vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);

        return ctr_vaddr >= vma->vm_start && ctr_vaddr < vma->vm_end;
}

static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
        VMA_ITERATOR(vmi, mm, 0);
        struct vm_area_struct *tmp;

        for_each_vma(vmi, tmp)
                if (valid_ref_ctr_vma(uprobe, tmp))
                        return tmp;

        return NULL;
}

/*
 * Add @d to the short counter located at user address @vaddr in @mm.
 *
 * Returns 0 on success; -EINVAL if @vaddr or @d is zero, or if the update
 * would make the counter negative (a warning is logged in that case);
 * -EBUSY if get_user_pages_remote() returned 0; otherwise the negative
 * error returned by get_user_pages_remote().
 */
static int
__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
{
        struct page *page;
        void *mapped;
        short *ctr;
        int ret;

        if (!vaddr || !d)
                return -EINVAL;

        ret = get_user_pages_remote(mm, vaddr, 1,
                                    FOLL_WRITE, &page, NULL);
        /*
         * Exactly one page was requested.  A return of 0 means that nothing
         * was pinned and has to be turned into an error as well.
         */
        if (unlikely(ret <= 0))
                return ret ? ret : -EBUSY;

        mapped = kmap_atomic(page);
        ctr = mapped + (vaddr & ~PAGE_MASK);

        if (likely(*ctr + d >= 0)) {
                *ctr += d;
                ret = 0;
        } else {
                pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
                        "curr val: %d, delta: %d\n", vaddr, *ctr, d);
                ret = -EINVAL;
        }

        kunmap_atomic(mapped);
        put_page(page);
        return ret;
}

/*
 * Log a warning that updating the reference counter of @uprobe in @mm
 * failed.  A positive @d is reported as "increment", anything else as
 * "decrement".
 */
static void update_ref_ctr_warn(struct uprobe *uprobe,
                                struct mm_struct *mm, short d)
{
        const char *op = (d > 0) ? "increment" : "decrement";
        unsigned long long probe_off = uprobe->offset;
        unsigned long long ctr_off = uprobe->ref_ctr_offset;

        pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
                "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
                op, uprobe->inode->i_ino, probe_off, ctr_off, mm);
}

/*
 * Apply the reference counter delta @d for @uprobe in @mm.
 *
 * If @mm has a vma that holds the counter, the counter is updated right
 * away and a warning is logged on failure.  For an increment (@d > 0) the
 * result of that update is returned directly.
 *
 * Otherwise, under delayed_uprobe_lock: an increment that found no such vma
 * is recorded with delayed_uprobe_add() and its result is returned; for any
 * other @d the entries queued for (@uprobe, @mm) are removed and the result
 * of the counter update is returned, or 0 if no update was attempted.
 */
static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
                          short d)
{
        struct vm_area_struct *ctr_vma = find_ref_ctr_vma(uprobe, mm);
        const bool incrementing = d > 0;
        int err = 0;

        if (ctr_vma) {
                unsigned long ctr_vaddr =
                        offset_to_vaddr(ctr_vma, uprobe->ref_ctr_offset);

                err = __update_ref_ctr(mm, ctr_vaddr, d);
                if (err)
                        update_ref_ctr_warn(uprobe, mm, d);

                if (incrementing)
                        return err;
        }

        mutex_lock(&delayed_uprobe_lock);
        if (incrementing)
                err = delayed_uprobe_add(uprobe, mm);
        else
                delayed_uprobe_remove(uprobe, mm);
        mutex_unlock(&delayed_uprobe_lock);

        return err;
}

/*
 * NOTE:
 * Expect the breakpoint instruction to be the smallest size instruction for
 * the architecture. If an arch has variable length instruction and the
 * breakpoint instruction is not of the smallest length instruction
 * supported by that architecture then we need to modify is_trap_at_addr and
 * uprobe_write_opcode accordingly. This would never be a problem for archs
 * that have fixed length instructions.
 *
 * uprobe_write_opcode - write the opcode at a given virtual address.
 * @auprobe: arch specific probepoint information.
 * @mm: the probed process address space.
 * @vaddr: the virtual address to store the opcode.
 * @opcode: opcode to be written at @vaddr.
 *
 * Called with mm->mmap_lock held for write.
 * Return 0 (success) or a negative errno.
 */
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
                        unsigned long vaddr, uprobe_opcode_t opcode)
{
        struct uprobe *uprobe;
        struct page *old_page, *new_page;
        struct vm_area_struct *vma;
        int ret, is_register, ref_ctr_updated = 0;
        bool orig_page_huge = false;
        unsigned int gup_flags = FOLL_FORCE;

        /*
         * Writing the breakpoint opcode is treated as "register"; writing
         * any other opcode is treated as "unregister" (restore).
         */
        is_register = is_swbp_insn(&opcode);
        uprobe = container_of(auprobe, struct uprobe, arch);

retry:
        if (is_register)
                gup_flags |= FOLL_SPLIT_PMD;
        /* Read the page with vaddr into memory */
        old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
        if (IS_ERR(old_page))
                return PTR_ERR(old_page);

        /*
         * A non-positive result ends the operation here and is returned
         * as-is. verify_opcode() is defined outside this chunk; presumably
         * 0 means the opcode is already in the requested state - confirm.
         */
        ret = verify_opcode(old_page, vaddr, &opcode);
        if (ret <= 0)
                goto put_old;

        if (WARN(!is_register && PageCompound(old_page),
                 "uprobe unregister should never work on compound page\n")) {
                ret = -EINVAL;
                goto put_old;
        }

        /* We are going to replace instruction, update ref_ctr. */
        if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
                ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
                if (ret)
                        goto put_old;

                /* Remember it so a retry does not adjust the counter twice. */
                ref_ctr_updated = 1;
        }

        /* Unregister on a non-anonymous page: nothing to replace, succeed. */
        ret = 0;
        if (!is_register && !PageAnon(old_page))
                goto put_old;

        ret = anon_vma_prepare(vma);
        if (ret)
                goto put_old;

        ret = -ENOMEM;
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
        if (!new_page)
                goto put_old;

        /* Build the replacement: a copy of the old page with @opcode patched in. */
        __SetPageUptodate(new_page);
        copy_highpage(new_page, old_page);
        copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);

        if (!is_register) {
                struct page *orig_page;
                pgoff_t index;

                VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);

                /*
                 * If the restored contents are identical to the page-cache
                 * page of the backing file, drop new_page and pass NULL to
                 * __replace_page() instead.
                 */
                index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
                orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
                                          index);

                if (orig_page) {
                        if (PageUptodate(orig_page) &&
                            pages_identical(new_page, orig_page)) {
                                /* let go new_page */
                                put_page(new_page);
                                new_page = NULL;

                                if (PageCompound(orig_page))
                                        orig_page_huge = true;
                        }
                        put_page(orig_page);
                }
        }

        ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
        if (new_page)
                put_page(new_page);
put_old:
        put_page(old_page);

        /* -EAGAIN from any step above restarts the whole sequence. */
        if (unlikely(ret == -EAGAIN))
                goto retry;

        /* Revert back reference counter if instruction update failed. */
        if (ret && is_register && ref_ctr_updated)
                update_ref_ctr(uprobe, mm, -1);

        /* try collapse pmd for compound page */
        if (!ret && orig_page_huge)
                collapse_pte_mapped_thp(mm, vaddr, false);

        return ret;
}

/**
 * set_swbp - store breakpoint at a given address.
 * @auprobe: arch specific probepoint information.
 * @mm: the probed process address space.
 * @vaddr: the virtual address to insert the opcode.
 *
 * For mm @mm, store the breakpoint instruction at @vaddr.
 * Return 0 (success) or a negative errno.
 */
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
        /* Default implementation: write the generic breakpoint opcode. */
        uprobe_opcode_t swbp = UPROBE_SWBP_INSN;

        return uprobe_write_opcode(auprobe, mm, vaddr, swbp);
}

/**
 * set_orig_insn - Restore the original instruction.
 * @mm: the probed process address space.
 * @auprobe: arch specific probepoint information.
 * @vaddr: the virtual address to insert the opcode.
 *
 * For mm @mm, restore the original opcode (opcode) at @vaddr.
 * Return 0 (success) or a negative errno.
 */
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
        /* The opcode to restore is read from the start of the saved insn. */
        uprobe_opcode_t *saved = (uprobe_opcode_t *)&auprobe->insn;

        return uprobe_write_opcode(auprobe, mm, vaddr, *saved);
}

/* Take one more reference on @uprobe and return the same pointer. */
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
        struct uprobe *held = uprobe;

        refcount_inc(&held->ref);
        return held;
}

/*
 * Drop one reference on @uprobe. When the last reference goes away, remove
 * any delayed_uprobe entries that still refer to it and free it.
 */
static void put_uprobe(struct uprobe *uprobe)
{
        if (!refcount_dec_and_test(&uprobe->ref))
                return;

        /*
         * An application may munmap(exec_vma) before uprobe_unregister()
         * runs; in that case remove_breakpoint() never gets the chance to
         * take the uprobe off delayed_uprobe_list, so clean it up here.
         */
        mutex_lock(&delayed_uprobe_lock);
        delayed_uprobe_remove(uprobe, NULL);
        mutex_unlock(&delayed_uprobe_lock);

        kfree(uprobe);
}

/*
 * Three-way comparison of the key (@l_inode, @l_offset) against uprobe @r.
 * Ordering is by inode pointer first, then by offset.
 * Returns -1, 0 or 1.
 */
static __always_inline
int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
               const struct uprobe *r)
{
        if (l_inode != r->inode)
                return l_inode < r->inode ? -1 : 1;

        if (l_offset != r->offset)
                return l_offset < r->offset ? -1 : 1;

        return 0;
}

/* Map a pointer to an embedded rb_node back to its containing struct uprobe. */
#define __node_2_uprobe(node) \
        rb_entry((node), struct uprobe, rb_node)

/*
 * Lookup key for uprobes_tree: __find_uprobe() fills it in and
 * __uprobe_cmp_key() compares it against tree nodes by inode, then offset.
 */
struct __uprobe_key {
        struct inode *inode;
        loff_t offset;
};

/* rb_find() comparator: compare a struct __uprobe_key against tree node @b. */
static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b)
{
        const struct __uprobe_key *k = key;
        const struct uprobe *node_uprobe = __node_2_uprobe(b);

        return uprobe_cmp(k->inode, k->offset, node_uprobe);
}

static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
{
        struct uprobe *u = __node_2_uprobe(a);
        return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
}

/*
 * Look up the uprobe for @inode:@offset in uprobes_tree. On a hit, a
 * reference is taken and the uprobe returned; otherwise NULL.
 * This function takes no lock itself; find_uprobe() wraps it with
 * uprobes_treelock.
 */
static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
{
        struct __uprobe_key key = { .inode = inode, .offset = offset };
        struct rb_node *match;

        match = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
        if (!match)
                return NULL;

        return get_uprobe(__node_2_uprobe(match));
}

/*
 * Find a uprobe corresponding to a given inode:offset
 * Acquires uprobes_treelock
 */
static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
        struct uprobe *found;

        /* The tree is only read here, so the read side of the lock suffices. */
        read_lock(&uprobes_treelock);
        found = __find_uprobe(inode, offset);
        read_unlock(&uprobes_treelock);

        return found;
}

/*
 * Try to link @uprobe into uprobes_tree.
 *
 * If an equal node already exists, take a reference on that uprobe and
 * return it. Otherwise @uprobe has been inserted: set its refcount to 2
 * (access + creation) and return NULL.
 */
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
        struct rb_node *dup;

        dup = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
        if (!dup) {
                /* get access + creation ref */
                refcount_set(&uprobe->ref, 2);
                return NULL;
        }

        return get_uprobe(__node_2_uprobe(dup));
}

/*
 * Acquire uprobes_treelock.
 * Matching uprobe already exists in rbtree;
 *        increment (access refcount) and return the matching uprobe.
 *
 * No matching uprobe; insert the uprobe in rb_tree;
 *        get a double refcount (access + creation) and return NULL.
 */
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
        struct uprobe *existing;

        /* Insertion may modify the tree, so take the write side. */
        write_lock(&uprobes_treelock);
        existing = __insert_uprobe(uprobe);
        write_unlock(&uprobes_treelock);

        return existing;
}

/*
 * Warn that a uprobe being registered (@uprobe) carries a ref_ctr_offset
 * that differs from the one of the already existing uprobe (@cur_uprobe)
 * for the same inode:offset.
 */
static void
ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
{
        unsigned long long probe_off = (unsigned long long) uprobe->offset;
        unsigned long long old_ctr = (unsigned long long) cur_uprobe->ref_ctr_offset;
        unsigned long long new_ctr = (unsigned long long) uprobe->ref_ctr_offset;

        pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
                "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
                uprobe->inode->i_ino, probe_off, old_ctr, new_ctr);
}

/*
 * Allocate a uprobe for @inode:@offset and insert it into uprobes_tree.
 *
 * Returns:
 *   - the new uprobe if none existed for this inode:offset,
 *   - the already existing uprobe (with a reference taken) if its
 *     ref_ctr_offset matches; the fresh allocation is freed,
 *   - ERR_PTR(-EINVAL) if an existing uprobe has a different
 *     ref_ctr_offset,
 *   - NULL if the allocation failed.
 */
static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
                                   loff_t ref_ctr_offset)
{
        struct uprobe *uprobe, *existing;

        uprobe = kzalloc(sizeof(*uprobe), GFP_KERNEL);
        if (!uprobe)
                return NULL;

        uprobe->inode = inode;
        uprobe->offset = offset;
        uprobe->ref_ctr_offset = ref_ctr_offset;
        init_rwsem(&uprobe->register_rwsem);
        init_rwsem(&uprobe->consumer_rwsem);

        /* add to uprobes_tree, sorted on inode:offset */
        existing = insert_uprobe(uprobe);
        if (!existing)
                return uprobe;

        /* a uprobe exists for this inode:offset combination */
        if (existing->ref_ctr_offset != uprobe->ref_ctr_offset) {
                ref_ctr_mismatch_warn(existing, uprobe);
                put_uprobe(existing);
                kfree(uprobe);
                return ERR_PTR(-EINVAL);
        }

        kfree(uprobe);
        return existing;
}

/* Push @uc onto the head of @uprobe's singly linked consumer list. */
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
        struct uprobe_consumer **head = &uprobe->consumers;

        down_write(&uprobe->consumer_rwsem);
        uc->next = *head;
        *head = uc;
        up_write(&uprobe->consumer_rwsem);
}

/*
 * For uprobe @uprobe, delete the consumer @uc.
 * Return true if the @uc is deleted successfully
 * or return false.
 */
static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
        struct uprobe_consumer **link;
        bool found = false;

        down_write(&uprobe->consumer_rwsem);
        /* Walk the links rather than the nodes so unlinking is one store. */
        link = &uprobe->consumers;
        while (*link) {
                if (*link == uc) {
                        *link = uc->next;
                        found = true;
                        break;
                }
                link = &(*link)->next;
        }
        up_write(&uprobe->consumer_rwsem);

        return found;
}

/*
 * Copy @nbytes from file offset @offset of @mapping into @insn.
 * Returns 0 on success or the error from reading the page.
 */
static int __copy_insn(struct address_space *mapping, struct file *filp,
                        void *insn, int nbytes, loff_t offset)
{
        struct page *page;

        /*
         * Make sure the page holding the original instruction is populated
         * and in the page cache. A mapping without ->read_folio must be a
         * shmem mapping, see uprobe_register().
         */
        page = mapping->a_ops->read_folio ?
                read_mapping_page(mapping, offset >> PAGE_SHIFT, filp) :
                shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
        if (IS_ERR(page))
                return PTR_ERR(page);

        copy_from_page(page, offset, insn, nbytes);
        put_page(page);

        return 0;
}

/*
 * Fill uprobe->arch.insn from the probed file, one page-bounded piece at a
 * time. Stops early at end of file. Returns 0 if at least one piece was
 * copied and no read failed, -EIO if nothing could be read, or the error
 * from __copy_insn().
 */
static int copy_insn(struct uprobe *uprobe, struct file *filp)
{
        struct address_space *mapping = uprobe->inode->i_mapping;
        void *dst = &uprobe->arch.insn;
        int remaining = sizeof(uprobe->arch.insn);
        loff_t pos = uprobe->offset;
        int err = -EIO;
        int chunk;

        /* Copy only available bytes, -EIO if nothing was read */
        for (;;) {
                if (pos >= i_size_read(uprobe->inode))
                        break;

                /* never read across the end of the current page */
                chunk = min_t(int, remaining, PAGE_SIZE - (pos & ~PAGE_MASK));
                err = __copy_insn(mapping, filp, dst, chunk, pos);
                if (err)
                        break;

                dst += chunk;
                pos += chunk;
                remaining -= chunk;
                if (!remaining)
                        break;
        }

        return err;
}

/*
 * One-time preparation of @uprobe: copy the original instruction from
 * @file and let the architecture analyze it. UPROBE_COPY_INSN in
 * uprobe->flags marks completion; once set, this returns 0 immediately.
 *
 * Returns 0 on success, -ENOTSUPP if the original instruction is itself a
 * trap instruction, or the error from copy_insn() /
 * arch_uprobe_analyze_insn().
 */
static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
                                struct mm_struct *mm, unsigned long vaddr)
{
        int err;

        /* Fast path without taking the semaphore. */
        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
                return 0;

        /* TODO: move this into _register, until then we abuse this sem. */
        down_write(&uprobe->consumer_rwsem);

        /* Re-check under the semaphore; someone else may have finished. */
        err = 0;
        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
                goto unlock;

        err = copy_insn(uprobe, file);
        if (err)
                goto unlock;

        if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) {
                err = -ENOTSUPP;
                goto unlock;
        }

        err = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
        if (err)
                goto unlock;

        smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
        set_bit(UPROBE_COPY_INSN, &uprobe->flags);

 unlock:
        up_write(&uprobe->consumer_rwsem);

        return err;
}

static inline bool consumer_filter(struct uprobe_consumer *uc,
                                   enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
        return !uc->filter || uc->filter(uc, ctx, mm);
}

/*
 * Return true if at least one consumer of @uprobe is interested in @mm
 * for context @ctx; false if none is (or there are no consumers).
 */
static bool filter_chain(struct uprobe *uprobe,
                         enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
        struct uprobe_consumer *uc;
        bool wanted = false;

        down_read(&uprobe->consumer_rwsem);
        /* stop at the first consumer that wants this mm */
        for (uc = uprobe->consumers; uc && !wanted; uc = uc->next)
                wanted = consumer_filter(uc, ctx, mm);
        up_read(&uprobe->consumer_rwsem);

        return wanted;
}

/*
 * Prepare @uprobe (if not done yet) and write the breakpoint at @vaddr in
 * @mm. Maintains the MMF_HAS_UPROBES / MMF_RECALC_UPROBES bits of @mm.
 * Returns 0 on success or a negative errno.
 */
static int
install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long vaddr)
{
        bool had_uprobes;
        int err;

        err = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
        if (err)
                return err;

        /*
         * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
         * the task can hit this breakpoint right after __replace_page().
         */
        had_uprobes = test_bit(MMF_HAS_UPROBES, &mm->flags);
        if (!had_uprobes)
                set_bit(MMF_HAS_UPROBES, &mm->flags);

        err = set_swbp(&uprobe->arch, mm, vaddr);
        if (err) {
                /* undo the early flag only if this call was the one to set it */
                if (!had_uprobes)
                        clear_bit(MMF_HAS_UPROBES, &mm->flags);
                return err;
        }

        clear_bit(MMF_RECALC_UPROBES, &mm->flags);
        return 0;
}

/*
 * Restore the original instruction at @vaddr in @mm. MMF_RECALC_UPROBES is
 * set on @mm before the restore is attempted.
 */
static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
        int err;

        set_bit(MMF_RECALC_UPROBES, &mm->flags);
        err = set_orig_insn(&uprobe->arch, mm, vaddr);

        return err;
}

static inline bool uprobe_is_active(struct uprobe *uprobe)
{
        return !RB_EMPTY_NODE(&uprobe->rb_node);
}
/*
 * There could be threads that have already hit the breakpoint. They
 * will recheck the current insn and restart if find_uprobe() fails.
 * See find_active_uprobe().
 */
static void delete_uprobe(struct uprobe *uprobe)
{
        struct rb_node *node = &uprobe->rb_node;

        /* Deleting a uprobe that is not in the tree is a caller bug. */
        if (WARN_ON(!uprobe_is_active(uprobe)))
                return;

        write_lock(&uprobes_treelock);
        rb_erase(node, &uprobes_tree);
        write_unlock(&uprobes_treelock);

        /* mark the node empty so uprobe_is_active() reports false */
        RB_CLEAR_NODE(node);
        put_uprobe(uprobe);
}

/*
 * One element of the singly linked list built by build_map_info(): an
 * address space (@mm, pinned with mmget_not_zero() by the builder) and the
 * virtual address in it that corresponds to the probed file offset.
 */
struct map_info {
        struct map_info *next;
        struct mm_struct *mm;
        unsigned long vaddr;
};

/* Free @info and return the element that followed it in the list. */
static inline struct map_info *free_map_info(struct map_info *info)
{
        struct map_info *rest;

        rest = info->next;
        kfree(info);

        return rest;
}

/*
 * Build a list of (mm, vaddr) pairs for every valid vma that maps file
 * offset @offset of @mapping.
 *
 * Each returned entry holds an mm reference taken with mmget_not_zero().
 * Returns the list head (NULL if no mapping was found) or
 * ERR_PTR(-ENOMEM) if list elements could not be allocated.
 *
 * Allocation strategy: list elements cannot be allocated with GFP_KERNEL
 * while i_mmap is locked, so the walk consumes pre-allocated elements from
 * @prev and counts in @more how many were missing. If any were missing,
 * the lock is dropped, the mm references are released, @more additional
 * elements are allocated and the walk is restarted.
 */
static struct map_info *
build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
{
        unsigned long pgoff = offset >> PAGE_SHIFT;
        struct vm_area_struct *vma;
        struct map_info *curr = NULL;
        struct map_info *prev = NULL;
        struct map_info *info;
        int more = 0;

 again:
        i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                if (!valid_vma(vma, is_register))
                        continue;

                if (!prev && !more) {
                        /*
                         * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
                         * reclaim. This is optimistic, no harm done if it fails.
                         */
                        prev = kmalloc(sizeof(struct map_info),
                                        GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
                        if (prev)
                                prev->next = NULL;
                }
                /* No spare element: just count how many are still needed. */
                if (!prev) {
                        more++;
                        continue;
                }

                /* Skip address spaces whose user count already dropped to zero. */
                if (!mmget_not_zero(vma->vm_mm))
                        continue;

                /* Move one element from the spare list to the result list. */
                info = prev;
                prev = prev->next;
                info->next = curr;
                curr = info;

                info->mm = vma->vm_mm;
                info->vaddr = offset_to_vaddr(vma, offset);
        }
        i_mmap_unlock_read(mapping);

        if (!more)
                goto out;

        /*
         * Not enough elements: recycle the whole result list as spares,
         * dropping the mm references taken during this pass.
         */
        prev = curr;
        while (curr) {
                mmput(curr->mm);
                curr = curr->next;
        }

        /* Allocate the missing elements; the loop leaves @more at zero. */
        do {
                info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
                if (!info) {
                        curr = ERR_PTR(-ENOMEM);
                        goto out;
                }
                info->next = prev;
                prev = info;
        } while (--more);

        goto again;
 out:
        /* Free any spare elements that were not used. */
        while (prev)
                prev = free_map_info(prev);
        return curr;
}

/*
 * Install (@new != NULL) or remove (@new == NULL) the breakpoint of
 * @uprobe in every address space that currently maps the probed offset.
 *
 * Register: only the new consumer's filter is consulted; after the first
 * failure the remaining entries are only released, not processed.
 * Unregister: a breakpoint is removed from an mm only if no remaining
 * consumer still wants it; all entries are processed even after a failure.
 *
 * Returns 0 on success or a negative errno. When several removals fail,
 * the first error is reported. (Errors are deliberately not OR-ed
 * together: the bitwise OR of two negative errnos is a different,
 * unrelated errno.)
 */
static int
register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
{
        bool is_register = !!new;
        struct map_info *info;
        int err = 0;

        percpu_down_write(&dup_mmap_sem);
        info = build_map_info(uprobe->inode->i_mapping,
                                        uprobe->offset, is_register);
        if (IS_ERR(info)) {
                err = PTR_ERR(info);
                goto out;
        }

        while (info) {
                struct mm_struct *mm = info->mm;
                struct vm_area_struct *vma;

                /* A failed registration stops further installs; just clean up. */
                if (err && is_register)
                        goto free;

                mmap_write_lock(mm);
                /*
                 * build_map_info() ran without mmap_lock, so re-validate
                 * that the address still maps this inode at this offset.
                 */
                vma = find_vma(mm, info->vaddr);
                if (!vma || !valid_vma(vma, is_register) ||
                    file_inode(vma->vm_file) != uprobe->inode)
                        goto unlock;

                if (vma->vm_start > info->vaddr ||
                    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
                        goto unlock;

                if (is_register) {
                        /* consult only the "caller", new consumer. */
                        if (consumer_filter(new,
                                        UPROBE_FILTER_REGISTER, mm))
                                err = install_breakpoint(uprobe, mm, vma, info->vaddr);
                } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
                        if (!filter_chain(uprobe,
                                        UPROBE_FILTER_UNREGISTER, mm)) {
                                int ret = remove_breakpoint(uprobe, mm, info->vaddr);

                                /* keep the first failure as a valid errno */
                                if (ret && !err)
                                        err = ret;
                        }
                }

 unlock:
                mmap_write_unlock(mm);
 free:
                mmput(mm);
                info = free_map_info(info);
        }
 out:
        percpu_up_write(&dup_mmap_sem);
        return err;
}

/*
 * Detach consumer @uc from @uprobe and remove breakpoints that are no
 * longer wanted. If that succeeded and no consumer is left, the uprobe is
 * deleted from the tree.
 */
static void
__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
        int err;

        if (WARN_ON(!consumer_del(uprobe, uc)))
                return;

        err = register_for_each_vma(uprobe, NULL);
        /* TODO : cant unregister? schedule a worker thread */
        if (err)
                return;

        if (uprobe->consumers)
                return;

        delete_uprobe(uprobe);
}

/*
 * uprobe_unregister - unregister an already registered probe.
 * @inode: the file in which the probe has to be removed.
 * @offset: offset from the start of the file.
 * @uc: identify which probe if multiple probes are colocated.
 */
void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{
        struct uprobe *probe = find_uprobe(inode, offset);

        /* Unregistering a probe that was never registered is a caller bug. */
        if (WARN_ON(!probe))
                return;

        down_write(&probe->register_rwsem);
        __uprobe_unregister(probe, uc);
        up_write(&probe->register_rwsem);

        /* drop the reference taken by find_uprobe() */
        put_uprobe(probe);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);

/*
 * __uprobe_register - register a probe
 * @inode: the file in which the probe has to be placed.
 * @offset: offset from the start of the file.
 * @uc: information on how to handle the probe.
 *
 * Apart from the access refcount, __uprobe_register() takes a creation
 * refcount (thro alloc_uprobe) if and only if this @uprobe is getting
 * inserted into the rbtree (i.e first consumer for a @inode:@offset
 * tuple).  Creation refcount stops uprobe_unregister from freeing the
 * @uprobe even before the register operation is complete. Creation
 * refcount is released when the last @uc for the @uprobe
 * unregisters. Caller of __uprobe_register() is required to keep @inode
 * (and the containing mount) referenced.
 *
 * Return errno if it cannot successfully install probes
 * else return 0 (success)
 */
static int __uprobe_register(struct inode *inode, loff_t offset,
                             loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
        struct uprobe *uprobe;
        int ret;

        /* Uprobe must have at least one set consumer */
        if (!uc->handler && !uc->ret_handler)
                return -EINVAL;

        /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
        if (!inode->i_mapping->a_ops->read_folio &&
            !shmem_mapping(inode->i_mapping))
                return -EIO;
        /* Racy, just to catch the obvious mistakes */
        if (offset > i_size_read(inode))
                return -EINVAL;

        /*
         * This ensures that copy_from_page(), copy_to_page() and
         * __update_ref_ctr() can't cross page boundary.
         */
        if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
                return -EINVAL;
        if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
                return -EINVAL;

 retry:
        /*
         * Returns either a freshly inserted uprobe or the existing one for
         * this inode:offset; NULL means allocation failure, an ERR_PTR
         * means the ref_ctr_offset conflicts with the existing uprobe.
         */
        uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
        if (!uprobe)
                return -ENOMEM;
        if (IS_ERR(uprobe))
                return PTR_ERR(uprobe);

        /*
         * We can race with uprobe_unregister()->delete_uprobe().
         * Check uprobe_is_active() and retry if it is false.
         */
        down_write(&uprobe->register_rwsem);
        ret = -EAGAIN;
        if (likely(uprobe_is_active(uprobe))) {
                consumer_add(uprobe, uc);
                ret = register_for_each_vma(uprobe, uc);
                /* Installation failed: undo the consumer and any breakpoints. */
                if (ret)
                        __uprobe_unregister(uprobe, uc);
        }
        up_write(&uprobe->register_rwsem);
        /* Drop the access reference obtained through alloc_uprobe(). */
        put_uprobe(uprobe);

        if (unlikely(ret == -EAGAIN))
                goto retry;
        return ret;
}

/*
 * Register consumer @uc for a probe at @inode:@offset without a reference
 * counter (ref_ctr_offset of 0). See __uprobe_register() for return values.
 */
int uprobe_register(struct inode *inode, loff_t offset,
                    struct uprobe_consumer *uc)
{
        const loff_t no_ref_ctr = 0;

        return __uprobe_register(inode, offset, no_ref_ctr, uc);
}
EXPORT_SYMBOL_GPL(uprobe_register);

/*
 * Register consumer @uc for a probe at @inode:@offset whose reference
 * counter lives at file offset @ref_ctr_offset. See __uprobe_register()
 * for return values.
 */
int uprobe_register_refctr(struct inode *inode, loff_t offset,
                           loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
        int ret;

        ret = __uprobe_register(inode, offset, ref_ctr_offset, uc);

        return ret;
}
EXPORT_SYMBOL_GPL(uprobe_register_refctr);

/*
 * uprobe_apply - add or remove breakpoints for an already registered probe.
 * @inode: the file in which the probe is placed.
 * @offset: offset from the start of the file.
 * @uc: consumer which wants to add more or remove some breakpoints
 * @add: add or remove the breakpoints
 */
int uprobe_apply(struct inode *inode, loff_t offset,
                        struct uprobe_consumer *uc, bool add)
{
        struct uprobe_consumer *con;
        struct uprobe *uprobe;
        int ret = -ENOENT;

        uprobe = find_uprobe(inode, offset);
        if (WARN_ON(!uprobe))
                return ret;

        down_write(&uprobe->register_rwsem);

        /* Only act if @uc really is a consumer of this uprobe. */
        con = uprobe->consumers;
        while (con && con != uc)
                con = con->next;

        if (con) {
                struct uprobe_consumer *target = add ? uc : NULL;

                ret = register_for_each_vma(uprobe, target);
        }

        up_write(&uprobe->register_rwsem);
        put_uprobe(uprobe);

        return ret;
}

/*
 * Remove the breakpoint of @uprobe from every vma of @mm that maps the
 * probed file offset.
 *
 * All matching vmas are processed even if one removal fails. Returns 0 on
 * success, otherwise the first error encountered. (Errors are not OR-ed
 * together: the bitwise OR of two negative errnos is a different,
 * unrelated errno.)
 */
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
        VMA_ITERATOR(vmi, mm, 0);
        struct vm_area_struct *vma;
        int err = 0;

        mmap_read_lock(mm);
        for_each_vma(vmi, vma) {
                unsigned long vaddr;
                loff_t offset;
                int ret;

                if (!valid_vma(vma, false) ||
                    file_inode(vma->vm_file) != uprobe->inode)
                        continue;

                /* skip vmas whose file range does not cover the probe */
                offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
                if (uprobe->offset <  offset ||
                    uprobe->offset >= offset + vma->vm_end - vma->vm_start)
                        continue;

                vaddr = offset_to_vaddr(vma, uprobe->offset);
                ret = remove_breakpoint(uprobe, mm, vaddr);
                /* keep the first failure as a valid errno */
                if (ret && !err)
                        err = ret;
        }
        mmap_read_unlock(mm);

        return err;
}

/*
 * Search uprobes_tree for any node belonging to @inode whose offset lies
 * within [@min, @max]. Returns that node, or NULL if there is none.
 * This function takes no lock itself; the callers in this file hold
 * uprobes_treelock.
 */
static struct rb_node *
find_node_in_range(struct inode *inode, loff_t min, loff_t max)
{
        struct rb_node *n = uprobes_tree.rb_node;

        while (n) {
                struct uprobe *u = rb_entry(n, struct uprobe, rb_node);

                if (inode < u->inode)
                        n = n->rb_left;
                else if (inode > u->inode)
                        n = n->rb_right;
                else if (max < u->offset)
                        n = n->rb_left;
                else if (min > u->offset)
                        n = n->rb_right;
                else
                        break;
        }

        return n;
}

/*
 * For a given range in vma, build a list of probes that need to be inserted.
 */
static void build_probe_list(struct inode *inode,
                                struct vm_area_struct *vma,
                                unsigned long start, unsigned long end,
                                struct list_head *head)
{
        struct rb_node *hit, *t;
        struct uprobe *u;
        loff_t min, max;

        INIT_LIST_HEAD(head);
        min = vaddr_to_offset(vma, start);
        max = min + (end - start) - 1;

        read_lock(&uprobes_treelock);
        hit = find_node_in_range(inode, min, max);
        if (!hit)
                goto unlock;

        /* Collect the hit itself and everything before it that is in range. */
        t = hit;
        while (t) {
                u = rb_entry(t, struct uprobe, rb_node);
                if (u->inode != inode || u->offset < min)
                        break;
                list_add(&u->pending_list, head);
                get_uprobe(u);
                t = rb_prev(t);
        }

        /* Then everything after the hit that is still in range. */
        t = rb_next(hit);
        while (t) {
                u = rb_entry(t, struct uprobe, rb_node);
                if (u->inode != inode || u->offset > max)
                        break;
                list_add(&u->pending_list, head);
                get_uprobe(u);
                t = rb_next(t);
        }

 unlock:
        read_unlock(&uprobes_treelock);
}

/* @vma contains reference counter, not the probed instruction. */
static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
{
        struct list_head *pos, *q;
        struct delayed_uprobe *du;
        unsigned long vaddr;
        int ret = 0, err = 0;

        mutex_lock(&delayed_uprobe_lock);
        list_for_each_safe(pos, q, &delayed_uprobe_list) {
                du = list_entry(pos, struct delayed_uprobe, list);

                if (du->mm != vma->vm_mm ||
                    !valid_ref_ctr_vma(du->uprobe, vma))
                        continue;

                vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
                ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
                if (ret) {
                        update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
                        if (!err)
                                err = ret;
                }
                delayed_uprobe_delete(du);
        }
        mutex_unlock(&delayed_uprobe_lock);
        return err;
}

/*
 * Called from mmap_region/vma_merge with mm->mmap_lock acquired.
 *
 * Currently we ignore all errors and always return 0, the callers
 * can't handle the failure anyway.
 */
int uprobe_mmap(struct vm_area_struct *vma)
{
        struct list_head tmp_list;
        struct uprobe *uprobe, *u;
        struct inode *inode;

        if (no_uprobe_events())
                return 0;

        if (vma->vm_file &&
            (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
            test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
                delayed_ref_ctr_inc(vma);

        if (!valid_vma(vma, true))
                return 0;

        inode = file_inode(vma->vm_file);
        if (!inode)
                return 0;

        mutex_lock(uprobes_mmap_hash(inode));
        build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
        /*
         * We can race with uprobe_unregister(), this uprobe can be already
         * removed. But in this case filter_chain() must return false, all
         * consumers have gone away.
         */
        list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
                if (!fatal_signal_pending(current) &&
                    filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
                        unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
                        install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
                }
                put_uprobe(uprobe);
        }
        mutex_unlock(uprobes_mmap_hash(inode));

        return 0;
}

static bool
vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
        loff_t min, max;
        struct inode *inode;
        struct rb_node *n;

        inode = file_inode(vma->vm_file);

        min = vaddr_to_offset(vma, start);
        max = min + (end - start) - 1;

        read_lock(&uprobes_treelock);
        n = find_node_in_range(inode, min, max);
        read_unlock(&uprobes_treelock);

        return !!n;
}

/*
 * Called in context of a munmap of a vma.
 */
void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
        if (no_uprobe_events() || !valid_vma(vma, false))
                return;

        if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
                return;

        if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
             test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
                return;

        if (vma_has_uprobes(vma, start, end))
                set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}

/* Slot allocation for XOL */
/*
 * Map @area as a one-page special mapping into @mm and publish it in
 * mm->uprobes_state.xol_area.
 *
 * Returns 0 on success, -EINTR if taking mmap_lock was interrupted,
 * -EALREADY if @mm already has an xol area, or the error from
 * get_unmapped_area() / _install_special_mapping().
 */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
        struct vm_area_struct *vma;
        int ret;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = -EALREADY;
        if (mm->uprobes_state.xol_area)
                goto unlock;

        if (!area->vaddr) {
                /* Try to map as high as possible, this is only a hint. */
                area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
                                                PAGE_SIZE, 0, 0);
                if (IS_ERR_VALUE(area->vaddr)) {
                        ret = area->vaddr;
                        goto unlock;
                }
        }

        vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
                                VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
                                &area->xol_mapping);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto unlock;
        }

        /* pairs with get_xol_area() */
        smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
        ret = 0;
 unlock:
        mmap_write_unlock(mm);

        return ret;
}

/*
 * __create_xol_area - allocate an xol_area and map it into current->mm.
 * @vaddr: address stored in area->vaddr; when 0, xol_add_vma() picks one.
 *
 * Allocates the area, its slot bitmap and a single page, marks slot 0 as
 * used and copies UPROBE_SWBP_INSN into offset 0 of the page, then hands
 * the area to xol_add_vma().
 *
 * Returns the new area, or NULL if any allocation or xol_add_vma() failed;
 * in that case everything allocated here has been released.
 */
static struct xol_area *__create_xol_area(unsigned long vaddr)
{
        uprobe_opcode_t insn = UPROBE_SWBP_INSN;
        struct mm_struct *mm = current->mm;
        struct xol_area *area;

        area = kmalloc(sizeof(*area), GFP_KERNEL);
        if (unlikely(!area))
                return NULL;

        area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
                               GFP_KERNEL);
        if (!area->bitmap)
                goto err_free_area;

        area->pages[0] = alloc_page(GFP_HIGHUSER);
        if (!area->pages[0])
                goto err_free_bitmap;
        area->pages[1] = NULL;

        /* Describe the special mapping backed by the page array above. */
        area->xol_mapping.name = "[uprobes]";
        area->xol_mapping.fault = NULL;
        area->xol_mapping.pages = area->pages;

        area->vaddr = vaddr;
        init_waitqueue_head(&area->wq);
        /* Reserve the 1st slot for get_trampoline_vaddr() */
        set_bit(0, area->bitmap);
        atomic_set(&area->slot_count, 1);
        arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);

        if (xol_add_vma(mm, area) == 0)
                return area;

        /* Mapping failed: unwind in reverse order of allocation. */
        __free_page(area->pages[0]);
 err_free_bitmap:
        kfree(area->bitmap);
 err_free_area:
        kfree(area);
        return NULL;
}

/*
 * get_xol_area - return current->mm's xol_area, creating it if necessary.
 * The area is used for storing instructions for execution out of line.
 *
 * The return value of __create_xol_area() is deliberately ignored: whatever
 * ends up published in mm->uprobes_state.xol_area is what gets returned.
 *
 * Returns the area or NULL.
 */
static struct xol_area *get_xol_area(void)
{
        struct mm_struct *mm = current->mm;

        if (!mm->uprobes_state.xol_area)
                __create_xol_area(0);

        /* Pairs with xol_add_vma() smp_store_release() */
        return READ_ONCE(mm->uprobes_state.xol_area);
}

/*
 * uprobe_clear_state - drop the uprobe state attached to @mm.
 *
 * Removes the delayed uprobes recorded for @mm (under delayed_uprobe_lock)
 * and, if the mm has an xol_area, releases its page, bitmap and the area
 * itself.
 */
void uprobe_clear_state(struct mm_struct *mm)
{
        struct xol_area *area = mm->uprobes_state.xol_area;

        mutex_lock(&delayed_uprobe_lock);
        delayed_uprobe_remove(NULL, mm);
        mutex_unlock(&delayed_uprobe_lock);

        if (area) {
                put_page(area->pages[0]);
                kfree(area->bitmap);
                kfree(area);
        }
}

/*
 * uprobe_start_dup_mmap - take dup_mmap_sem for reading.
 *
 * Released by uprobe_end_dup_mmap().
 * NOTE(review): presumably brackets the mm duplication done at fork time;
 * confirm against the caller, which is outside this file section.
 */
void uprobe_start_dup_mmap(void)
{
        percpu_down_read(&dup_mmap_sem);
}

/*
 * uprobe_end_dup_mmap - drop the read side of dup_mmap_sem taken by
 * uprobe_start_dup_mmap().
 */
void uprobe_end_dup_mmap(void)
{
        percpu_up_read(&dup_mmap_sem);
}

/*
 * uprobe_dup_mmap - propagate the uprobe flags from @oldmm to @newmm.
 *
 * If @oldmm has MMF_HAS_UPROBES, @newmm gets both MMF_HAS_UPROBES and
 * MMF_RECALC_UPROBES; otherwise @newmm is left untouched.
 */
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
        if (!test_bit(MMF_HAS_UPROBES, &oldmm->flags))
                return;

        set_bit(MMF_HAS_UPROBES, &newmm->flags);
        /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
        set_bit(MMF_RECALC_UPROBES, &newmm->flags);
}

/*
 * xol_take_insn_slot - claim a free slot in @area, sleeping until one is
 * available.
 *
 * Returns the user virtual address of the claimed slot
 * (area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES).
 */
static unsigned long xol_take_insn_slot(struct xol_area *area)
{
        unsigned long slot_addr;
        int slot_nr;

        do {
                slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
                if (slot_nr < UINSNS_PER_PAGE) {
                        /* The bit was still clear when we set it: the slot is ours. */
                        if (!test_and_set_bit(slot_nr, area->bitmap))
                                break;

                        /*
                         * The bit got set between the scan and the
                         * test_and_set_bit().  Make the loop condition true
                         * so that "continue" rescans the bitmap.
                         */
                        slot_nr = UINSNS_PER_PAGE;
                        continue;
                }
                /* Bitmap is full: wait until slot_count drops below the limit. */
                wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
        } while (slot_nr >= UINSNS_PER_PAGE);

        slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
        atomic_inc(&area->slot_count);

        return slot_addr;
}

/*
 * xol_get_insn_slot - allocate a slot for xol and fill it.
 *
 * Obtains the xol_area of the current mm, takes a slot in it and copies
 * uprobe->arch.ixol into that slot.
 *
 * Returns the allocated slot address or 0.
 */
static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
{
        struct xol_area *area = get_xol_area();
        unsigned long slot;

        if (!area)
                return 0;

        slot = xol_take_insn_slot(area);
        if (unlikely(!slot))
                return 0;

        arch_uprobe_copy_ixol(area->pages[0], slot,
                              &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
        return slot;
}

/*
 * xol_free_insn_slot - release the slot recorded in tsk->utask->xol_vaddr.
 *
 * If the slot was earlier allocated by xol_get_insn_slot(), its bitmap bit
 * is cleared, slot_count is decremented, waiters on the area are woken and
 * utask->xol_vaddr is reset to 0.  Nothing happens when the task has no mm,
 * no xol_area, no utask, no recorded slot, or when the recorded address
 * does not fall inside the area.
 */
static void xol_free_insn_slot(struct task_struct *tsk)
{
        struct xol_area *area;
        unsigned long slot_addr, offset;
        int slot_nr;

        if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
                return;

        slot_addr = tsk->utask->xol_vaddr;
        if (unlikely(!slot_addr))
                return;

        area = tsk->mm->uprobes_state.xol_area;
        /* Only addresses within [vaddr, vaddr + PAGE_SIZE) belong to the area. */
        if (slot_addr < area->vaddr || slot_addr >= area->vaddr + PAGE_SIZE)
                return;

        offset = slot_addr - area->vaddr;
        slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
        if (slot_nr >= UINSNS_PER_PAGE)
                return;

        clear_bit(slot_nr, area->bitmap);
        atomic_dec(&area->slot_count);
        smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
        if (waitqueue_active(&area->wq))
                wake_up(&area->wq);

        tsk->utask->xol_vaddr = 0;
}

/*
 * arch_uprobe_copy_ixol - weak default for writing an instruction into the
 * XOL page.
 * @page:  page to write to
 * @vaddr: address passed through to copy_to_page() to locate the slot
 * @src:   bytes to copy
 * @len:   number of bytes to copy
 *
 * Copies @len bytes from @src with copy_to_page() and then calls
 * flush_dcache_page() on @page.
 */
void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
                                  void *src, unsigned long len)
{
        /* Initialize the slot */
        copy_to_page(page, vaddr, src, len);

        /*
         * We probably need flush_icache_user_page() but it needs vma.
         * This should work on most of architectures by default. If
         * architecture needs to do something different it can define
         * its own version of the function.
         */
        flush_dcache_page(page);
}

/**
 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
 * @regs: Reflects the saved state of the task after it has hit a breakpoint
 * instruction.
 *
 * Weak default: the instruction pointer in @regs minus
 * UPROBE_SWBP_INSN_SIZE.
 *
 * Return the address of the breakpoint instruction.
 */
unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
{
        unsigned long ip = instruction_pointer(regs);

        return ip - UPROBE_SWBP_INSN_SIZE;
}

/*
 * uprobe_get_trap_addr - address to report for the current trap.
 *
 * While the current task has an active uprobe the address saved in
 * utask->vaddr is returned; otherwise the instruction pointer from @regs.
 */
unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
{
        struct uprobe_task *utask = current->utask;

        return unlikely(utask && utask->active_uprobe) ?
                utask->vaddr : instruction_pointer(regs);
}

/*
 * free_ret_instance - drop the uprobe reference held by @ri and free it.
 *
 * Returns the entry that followed @ri in the list (read before freeing).
 */
static struct return_instance *free_ret_instance(struct return_instance *ri)
{
        struct return_instance *following = ri->next;

        put_uprobe(ri->uprobe);
        kfree(ri);

        return following;
}

/*
 * uprobe_free_utask - release everything hanging off t->utask.
 *
 * Called with no locks held.
 * Called in context of an exiting or an exec-ing thread.
 *
 * Drops the reference on the active uprobe (if any), frees all return
 * instances, releases the XOL slot and finally frees the utask itself,
 * leaving t->utask NULL.
 */
void uprobe_free_utask(struct task_struct *t)
{
        struct uprobe_task *utask = t->utask;
        struct return_instance *ri;

        if (!utask)
                return;

        if (utask->active_uprobe)
                put_uprobe(utask->active_uprobe);

        /* free_ret_instance() hands back the next entry to process. */
        for (ri = utask->return_instances; ri; )
                ri = free_ret_instance(ri);

        xol_free_insn_slot(t);
        kfree(utask);
        t->utask = NULL;
}

/*
 * Allocate a uprobe_task object for the task if necessary.
 * Called when the thread hits a breakpoint.
 *
 * Returns:
 * - pointer to new uprobe_task on success
 * - NULL otherwise
 */
static struct uprobe_task *get_utask(void)
{
        if (!current->utask)
                current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
        return current->utask;
}

/*
 * dup_utask - give @t a fresh utask holding a copy of @o_utask's return
 * instances.
 *
 * Each copied entry takes its own reference on the uprobe and the copies
 * are linked in the same order as the originals; depth counts the copies.
 *
 * Returns 0 on success or -ENOMEM.  The new utask is attached to @t before
 * the list is copied, so on a mid-list allocation failure t->utask keeps
 * the entries copied so far.
 */
static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
{
        struct uprobe_task *n_utask;
        struct return_instance **tail, *src, *copy;

        n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
        if (!n_utask)
                return -ENOMEM;
        t->utask = n_utask;

        tail = &n_utask->return_instances;
        src = o_utask->return_instances;
        while (src) {
                copy = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
                if (!copy)
                        return -ENOMEM;

                *copy = *src;
                get_uprobe(copy->uprobe);
                copy->next = NULL;

                /* Append at the tail to preserve the original order. */
                *tail = copy;
                tail = &copy->next;
                n_utask->depth++;

                src = src->next;
        }

        return 0;
}

/*
 * uprobe_warn - log "uprobe: <comm>:<pid> failed to <msg>" at warning level.
 * @t:   not used by the body
 * @msg: what failed; inserted after "failed to"
 *
 * NOTE(review): the comm/pid printed are always those of current, even
 * when a different task is passed as @t (as uprobe_copy_process() does) --
 * confirm this is intended.
 */
static void uprobe_warn(struct task_struct *t, const char *msg)
{
        pr_warn("uprobe: %s:%d failed to %s\n",
                        current->comm, current->pid, msg);
}

/*
 * dup_xol_work - task work queued by uprobe_copy_process().
 *
 * Creates the xol_area of the current task at the address saved in
 * utask->dup_xol_addr.  Does nothing for an exiting task.  A failure is
 * reported through uprobe_warn() unless a fatal signal is pending.
 */
static void dup_xol_work(struct callback_head *work)
{
        if (current->flags & PF_EXITING)
                return;

        if (__create_xol_area(current->utask->dup_xol_addr))
                return;

        if (!fatal_signal_pending(current))
                uprobe_warn(current, "dup xol area");
}

/*
 * Called in context of a new clone/fork from copy_process.
 */
void uprobe_copy_process(struct task_struct *t, unsigned long flags)
{
        struct uprobe_task *utask = current->utask;
        struct mm_struct *mm = current->mm;
        struct xol_area *area;

        t->utask = NULL;

        if (!utask || !utask->return_instances)
                return;

        if (mm == t->mm && !(flags & CLONE_VFORK))
                return;

        if (dup_utask(t, utask))
                return uprobe_warn(t, "dup ret instances");

        /* The task can fork() after dup_xol_work() fails */
        area = mm->uprobes_state.xol_area;
        if (!area)
                return uprobe_warn(t, "dup xol area");

        if (mm == t->mm)
                return;

        t->utask->dup_xol_addr = area->vaddr;
        init_task_work(&t->utask->dup_xol_work, dup_xol_work);
        task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
}

/*
 * get_trampoline_vaddr - address of the return trampoline of current->mm.
 *
 * Current area->vaddr notion assume the trampoline address is always
 * equal area->vaddr.
 *
 * Returns -1 in case the xol_area is not allocated.
 */
static unsigned long get_trampoline_vaddr(void)
{
        struct xol_area *area;

        /* Pairs with xol_add_vma() smp_store_release() */
        area = READ_ONCE(current->mm->uprobes_state.xol_area);
        if (!area)
                return -1;

        return area->vaddr;
}

/*
 * cleanup_return_instances - pop return instances that are no longer alive.
 *
 * Starting at the head of utask->return_instances, frees every entry for
 * which arch_uretprobe_is_alive() is false, stopping at the first live one.
 * @chained selects RP_CHECK_CHAIN_CALL instead of RP_CHECK_CALL as the
 * check context.  utask->depth is decremented once per freed entry.
 */
static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
                                        struct pt_regs *regs)
{
        struct return_instance *ri = utask->return_instances;
        enum rp_check ctx;

        if (chained)
                ctx = RP_CHECK_CHAIN_CALL;
        else
                ctx = RP_CHECK_CALL;

        while (ri) {
                if (arch_uretprobe_is_alive(ri, ctx, regs))
                        break;

                ri = free_ret_instance(ri);
                utask->depth--;
        }

        utask->return_instances = ri;
}

/*
 * prepare_uretprobe - arrange for the probed function's return to be caught.
 *
 * Replaces the return address with the trampoline address (through
 * arch_uretprobe_hijack_return_addr()) and pushes a return_instance that
 * remembers the original return address onto utask->return_instances.
 *
 * Silently gives up if the xol_area or utask cannot be obtained, the
 * nesting limit MAX_URETPROBE_DEPTH is reached, the allocation fails, or
 * the return address cannot be hijacked.
 */
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
{
        unsigned long orig_ret_vaddr, trampoline_vaddr;
        struct uprobe_task *utask;
        struct return_instance *ri;
        bool chained;

        if (!get_xol_area())
                return;

        utask = get_utask();
        if (!utask)
                return;

        if (utask->depth >= MAX_URETPROBE_DEPTH) {
                printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
                                " nestedness limit pid/tgid=%d/%d\n",
                                current->pid, current->tgid);
                return;
        }

        ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
        if (!ri)
                return;

        trampoline_vaddr = get_trampoline_vaddr();
        orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
        if (orig_ret_vaddr == -1)
                goto free_ri;

        /* drop the entries invalidated by longjmp() */
        chained = (orig_ret_vaddr == trampoline_vaddr);
        cleanup_return_instances(utask, chained, regs);

        /*
         * This situation is not possible. Likely we have an
         * attack from user-space.
         */
        if (chained && !utask->return_instances) {
                uprobe_warn(current, "handle tail call");
                goto free_ri;
        }

        /*
         * We don't want to keep trampoline address in stack, rather keep the
         * original return address of first caller thru all the consequent
         * instances. This also makes breakpoint unwrapping easier.
         */
        if (chained)
                orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;

        ri->uprobe = get_uprobe(uprobe);
        ri->func = instruction_pointer(regs);
        ri->stack = user_stack_pointer(regs);
        ri->orig_ret_vaddr = orig_ret_vaddr;
        ri->chained = chained;

        /* Push the new instance at the head of the list. */
        ri->next = utask->return_instances;
        utask->return_instances = ri;
        utask->depth++;

        return;
 free_ri:
        kfree(ri);
}

/*
 * Prepare to single-step probed instruction out of line.
 *
 * Takes an XOL slot holding the probed instruction, records the slot and
 * @bp_vaddr in the utask and lets the architecture prepare the registers.
 *
 * Returns 0 on success (utask->active_uprobe is set and the state becomes
 * UTASK_SSTEP), -ENOMEM if the utask or a slot could not be obtained, or
 * the error from arch_uprobe_pre_xol() (the slot is released in that case).
 */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
        struct uprobe_task *utask = get_utask();
        unsigned long slot;
        int err;

        if (!utask)
                return -ENOMEM;

        slot = xol_get_insn_slot(uprobe);
        if (!slot)
                return -ENOMEM;

        utask->xol_vaddr = slot;
        utask->vaddr = bp_vaddr;

        err = arch_uprobe_pre_xol(&uprobe->arch, regs);
        if (unlikely(err)) {
                xol_free_insn_slot(current);
                return err;
        }

        utask->active_uprobe = uprobe;
        utask->state = UTASK_SSTEP;

        return 0;
}

/*
 * uprobe_deny_signal - hold back signal delivery while single-stepping.
 *
 * If we are singlestepping, then ensure this thread is not connected to
 * non-fatal signals until completion of singlestep.  When xol insn itself
 * triggers the signal,  restart the original insn even if the task is
 * already SIGKILL'ed (since coredump should report the correct ip).  This
 * is even more important if the task has a handler for SIGSEGV/etc, The
 * _same_ instruction should be repeated again after return from the signal
 * handler, and SSTEP can never finish in this case.
 *
 * Returns false when the current task has no active uprobe, true
 * otherwise.
 */
bool uprobe_deny_signal(void)
{
        struct task_struct *t = current;
        struct uprobe_task *utask = t->utask;

        if (likely(!utask || !utask->active_uprobe))
                return false;

        WARN_ON_ONCE(utask->state != UTASK_SSTEP);

        if (!task_sigpending(t))
                return true;

        spin_lock_irq(&t->sighand->siglock);
        clear_tsk_thread_flag(t, TIF_SIGPENDING);
        spin_unlock_irq(&t->sighand->siglock);

        /* Fatal signal or a trap in the xol insn: abort the single-step. */
        if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
                utask->state = UTASK_SSTEP_TRAPPED;
                set_tsk_thread_flag(t, TIF_UPROBE);
        }

        return true;
}

/*
 * mmf_recalc_uprobes - clear MMF_HAS_UPROBES if no vma of @mm has uprobes.
 *
 * Walks every vma of @mm; as soon as a valid vma reports uprobes the flag
 * is left alone.
 */
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
        VMA_ITERATOR(vmi, mm, 0);
        struct vm_area_struct *vma;

        for_each_vma(vmi, vma) {
                /*
                 * This is not strictly accurate, we can race with
                 * uprobe_unregister() and see the already removed
                 * uprobe if delete_uprobe() was not yet called.
                 * Or this uprobe can be filtered out.
                 */
                if (valid_vma(vma, false) &&
                    vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
                        return;
        }

        clear_bit(MMF_HAS_UPROBES, &mm->flags);
}

/*
 * is_trap_at_addr - check whether the instruction at @vaddr is a trap insn.
 *
 * First tries a non-faulting __get_user(); if that fails the page is
 * fetched with get_user_pages_remote() and the opcode is read from it.
 *
 * Returns the result of is_trap_insn() on the opcode, -EINVAL if @vaddr is
 * not aligned to UPROBE_SWBP_INSN_SIZE, or the negative error from
 * get_user_pages_remote().
 */
static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
{
        struct page *page;
        uprobe_opcode_t opcode;
        int result;

        if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
                return -EINVAL;

        pagefault_disable();
        result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
        pagefault_enable();

        if (!likely(result == 0)) {
                /*
                 * The NULL 'tsk' here ensures that any faults that occur here
                 * will not be accounted to the task.  'mm' *is* current->mm,
                 * but we treat this as a 'remote' access since it is
                 * essentially a kernel access to the memory.
                 */
                result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL);
                if (result < 0)
                        return result;

                copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
                put_page(page);
        }

        /* This needs to return true for any variant of the trap insn */
        return is_trap_insn(&opcode);
}

/*
 * find_active_uprobe - look up the uprobe registered at @bp_vaddr.
 * @bp_vaddr: address of the breakpoint in current->mm
 * @is_swbp:  set only when no uprobe is found: -EFAULT if no vma covers
 *            @bp_vaddr, otherwise the result of is_trap_at_addr()
 *
 * Runs under the mmap read lock.  When nothing is found and
 * MMF_RECALC_UPROBES was set, the flag is cleared and MMF_HAS_UPROBES is
 * re-evaluated.
 *
 * Returns the uprobe found by find_uprobe(), or NULL.
 */
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
{
        struct mm_struct *mm = current->mm;
        struct uprobe *uprobe = NULL;
        struct vm_area_struct *vma;

        mmap_read_lock(mm);

        vma = vma_lookup(mm, bp_vaddr);
        if (!vma) {
                *is_swbp = -EFAULT;
        } else {
                if (valid_vma(vma, false)) {
                        struct inode *inode = file_inode(vma->vm_file);
                        loff_t offset = vaddr_to_offset(vma, bp_vaddr);

                        uprobe = find_uprobe(inode, offset);
                }

                if (!uprobe)
                        *is_swbp = is_trap_at_addr(mm, bp_vaddr);
        }

        if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
                mmf_recalc_uprobes(mm);

        mmap_read_unlock(mm);

        return uprobe;
}

/*
 * handler_chain - run every consumer's ->handler for a breakpoint hit.
 *
 * Holds uprobe->register_rwsem for reading while walking the consumers.
 * The return codes are ANDed together starting from UPROBE_HANDLER_REMOVE
 * (a consumer without ->handler contributes 0).  If the combined result is
 * zero and some consumer has a ->ret_handler, the return probe is
 * prepared; if it is non-zero and there are consumers, the breakpoint is
 * removed from current->mm.
 */
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
        bool need_prep = false; /* prepare return uprobe, when needed */
        int remove = UPROBE_HANDLER_REMOVE;
        struct uprobe_consumer *uc;

        down_read(&uprobe->register_rwsem);

        uc = uprobe->consumers;
        while (uc) {
                int rc = 0;

                if (uc->handler) {
                        rc = uc->handler(uc, regs);
                        WARN(rc & ~UPROBE_HANDLER_MASK,
                                "bad rc=0x%x from %ps()\n", rc, uc->handler);
                }

                if (uc->ret_handler)
                        need_prep = true;

                remove &= rc;
                uc = uc->next;
        }

        if (!remove) {
                if (need_prep)
                        prepare_uretprobe(uprobe, regs); /* put bp at return */
        } else if (uprobe->consumers) {
                WARN_ON(!uprobe_is_active(uprobe));
                unapply_uprobe(uprobe, current->mm);
        }

        up_read(&uprobe->register_rwsem);
}

/*
 * handle_uretprobe_chain - run every consumer's ->ret_handler for @ri.
 *
 * The consumers of ri->uprobe are walked with register_rwsem held for
 * reading; each ->ret_handler receives the probed function address
 * recorded in ri->func.
 */
static void
handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
{
        struct uprobe *uprobe = ri->uprobe;
        struct uprobe_consumer *uc;

        down_read(&uprobe->register_rwsem);

        uc = uprobe->consumers;
        while (uc) {
                if (uc->ret_handler)
                        uc->ret_handler(uc, ri->func, regs);
                uc = uc->next;
        }

        up_read(&uprobe->register_rwsem);
}

/*
 * find_next_ret_chain - skip over the chain that starts at @ri.
 *
 * Follows ->next until an entry whose ->chained flag is false has been
 * passed, and returns the entry after it (possibly NULL).
 */
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
{
        for (;;) {
                bool chained = ri->chained;

                ri = ri->next;        /* can't be NULL if chained */
                if (!chained)
                        return ri;
        }
}

/*
 * handle_trampoline - process a hit of the return trampoline.
 *
 * Pops return instances chain by chain.  For each chain the instruction
 * pointer is set to the chain's original return address and, if the chain
 * is considered valid, the return handlers of its instances are run.  All
 * popped instances are freed.  Processing stops after the first valid
 * chain.
 *
 * If the task has no utask or no return instances, a warning is logged and
 * SIGILL is forced on the current task.
 */
static void handle_trampoline(struct pt_regs *regs)
{
        struct uprobe_task *utask;
        struct return_instance *ri, *next;
        bool valid;

        utask = current->utask;
        if (!utask)
                goto sigill;

        ri = utask->return_instances;
        if (!ri)
                goto sigill;

        do {
                /*
                 * We should throw out the frames invalidated by longjmp().
                 * If this chain is valid, then the next one should be alive
                 * or NULL; the latter case means that nobody but ri->func
                 * could hit this trampoline on return. TODO: sigaltstack().
                 */
                next = find_next_ret_chain(ri);
                valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);

                instruction_pointer_set(regs, ri->orig_ret_vaddr);
                /* Consume every instance of this chain, i.e. up to 'next'. */
                do {
                        if (valid)
                                handle_uretprobe_chain(ri, regs);
                        ri = free_ret_instance(ri);
                        utask->depth--;
                } while (ri != next);
        } while (!valid);

        /* 'ri' is now the first instance that was not consumed (or NULL). */
        utask->return_instances = ri;
        return;

 sigill:
        uprobe_warn(current, "handle uretprobe, sending SIGILL.");
        force_sig(SIGILL);

}

/*
 * arch_uprobe_ignore - weak default: never ignore a probe hit.
 *
 * handle_swbp() skips the handlers and the single-step when this returns
 * true.  Being __weak, an architecture may supply its own definition.
 */
bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
{
        return false;
}

/*
 * arch_uretprobe_is_alive - weak default: every return instance is
 * considered alive, whatever the check context @ctx.
 *
 * Being __weak, an architecture may supply its own definition.
 */
bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
                                        struct pt_regs *regs)
{
        return true;
}

/*
 * handle_swbp - handle a breakpoint hit on the way back to user space.
 *
 * Run handler and ask thread to singlestep.
 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
 *
 * A hit on the return trampoline is handed to handle_trampoline().  If no
 * uprobe is registered at the address, SIGTRAP is forced when a trap insn
 * is really there; otherwise the instruction is simply restarted.
 */
static void handle_swbp(struct pt_regs *regs)
{
        unsigned long bp_vaddr = uprobe_get_swbp_addr(regs);
        struct uprobe *uprobe;
        int is_swbp;

        if (bp_vaddr == get_trampoline_vaddr()) {
                handle_trampoline(regs);
                return;
        }

        uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
        if (!uprobe) {
                /*
                 * is_swbp > 0: no matching uprobe; signal SIGTRAP.
                 *
                 * Otherwise either we raced with uprobe_unregister() or we
                 * can't access this memory. The latter is only possible if
                 * another thread plays with our ->mm. In both cases
                 * we can simply restart. If this vma was unmapped we
                 * can pretend this insn was not executed yet and get
                 * the (correct) SIGSEGV after restart.
                 */
                if (is_swbp > 0)
                        force_sig(SIGTRAP);
                else
                        instruction_pointer_set(regs, bp_vaddr);
                return;
        }

        /* change it in advance for ->handler() and restart */
        instruction_pointer_set(regs, bp_vaddr);

        /*
         * TODO: move copy_insn/etc into _register and remove this hack.
         * After we hit the bp, _unregister + _register can install the
         * new and not-yet-analyzed uprobe at the same address, restart.
         */
        if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
                goto put;

        /*
         * Pairs with the smp_wmb() in prepare_uprobe().
         *
         * Guarantees that if we see the UPROBE_COPY_INSN bit set, then
         * we must also see the stores to &uprobe->arch performed by the
         * prepare_uprobe() call.
         */
        smp_rmb();

        /* Tracing handlers use ->utask to communicate with fetch methods */
        if (!get_utask())
                goto put;

        if (arch_uprobe_ignore(&uprobe->arch, regs))
                goto put;

        handler_chain(uprobe, regs);

        if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
                goto put;

        /* On success the reference is kept in utask->active_uprobe. */
        if (!pre_ssout(uprobe, regs, bp_vaddr))
                return;

        /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
put:
        put_uprobe(uprobe);
}

/*
 * handle_singlestep - finish an out-of-line single-step.
 *
 * Perform required fix-ups and disable singlestep.
 * Allow pending signals to take effect.
 *
 * Depending on utask->state the architecture either completes
 * (UTASK_SSTEP_ACK) or aborts (UTASK_SSTEP_TRAPPED) the step.  The active
 * uprobe reference and the XOL slot are released and the signal-pending
 * state is recalculated.  If the post-step fix-up failed, SIGILL is forced.
 */
static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
{
        struct uprobe *uprobe = utask->active_uprobe;
        int err = 0;

        switch (utask->state) {
        case UTASK_SSTEP_ACK:
                err = arch_uprobe_post_xol(&uprobe->arch, regs);
                break;
        case UTASK_SSTEP_TRAPPED:
                arch_uprobe_abort_xol(&uprobe->arch, regs);
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        put_uprobe(uprobe);
        utask->active_uprobe = NULL;
        utask->state = UTASK_RUNNING;
        xol_free_insn_slot(current);

        spin_lock_irq(&current->sighand->siglock);
        recalc_sigpending(); /* see uprobe_deny_signal() */
        spin_unlock_irq(&current->sighand->siglock);

        if (unlikely(err)) {
                uprobe_warn(current, "execute the probed insn, sending SIGILL.");
                force_sig(SIGILL);
        }
}

/*
 * uprobe_notify_resume - TIF_UPROBE work done before returning to user space.
 *
 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
 * allows the thread to return from interrupt. After that handle_swbp()
 * sets utask->active_uprobe.
 *
 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
 * and allows the thread to return from interrupt.
 *
 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
 * uprobe_notify_resume().
 */
void uprobe_notify_resume(struct pt_regs *regs)
{
        struct uprobe_task *utask;

        clear_thread_flag(TIF_UPROBE);

        utask = current->utask;
        if (!utask || !utask->active_uprobe) {
                /* No single-step in flight: this is a fresh breakpoint hit. */
                handle_swbp(regs);
                return;
        }

        handle_singlestep(utask, regs);
}

/*
 * uprobe_pre_sstep_notifier gets called from interrupt context as part of
 * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
 */
int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
        if (!current->mm)
                return 0;

        if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
            (!current->utask || !current->utask->return_instances))
                return 0;

        set_thread_flag(TIF_UPROBE);
        return 1;
}

/*
 * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
 * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
 *
 * Returns 0 when the task is currently not uprobed (no mm, no utask or no
 * active uprobe); otherwise marks the step as UTASK_SSTEP_ACK and returns 1.
 */
int uprobe_post_sstep_notifier(struct pt_regs *regs)
{
        struct uprobe_task *utask = current->utask;

        /* task is currently not uprobed */
        if (!current->mm)
                return 0;
        if (!utask || !utask->active_uprobe)
                return 0;

        utask->state = UTASK_SSTEP_ACK;
        set_thread_flag(TIF_UPROBE);
        return 1;
}

/*
 * Notifier block registered on the die notifier chain by uprobes_init();
 * exceptions are passed to arch_uprobe_exception_notify().
 */
static struct notifier_block uprobe_exception_nb = {
        .notifier_call                = arch_uprobe_exception_notify,
        .priority                = INT_MAX-1,        /* notified after kprobes, kgdb */
};

/*
 * uprobes_init - boot-time initialisation of the uprobes subsystem.
 *
 * Initialises the UPROBES_HASH_SZ mutexes in uprobes_mmap_mutex[] and
 * registers uprobe_exception_nb as a die notifier; a registration failure
 * trips BUG_ON().
 */
void __init uprobes_init(void)
{
        int i;

        for (i = 0; i < UPROBES_HASH_SZ; i++)
                mutex_init(&uprobes_mmap_mutex[i]);

        BUG_ON(register_die_notifier(&uprobe_exception_nb));
}


























































































































































































































    1 








    1 
    1 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
/*
 * linux/fs/nls/nls_iso8859-6.c
 *
 * Charset iso8859-6 translation tables.
 * Generated automatically from the Unicode and charset
 * tables from the Unicode Organization (www.unicode.org).
 * The Unicode to charset table has only exact mappings.
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/nls.h>
#include <linux/errno.h>

/*
 * Charset byte -> Unicode table, indexed by the byte value (0x00-0xff).
 * Each group of 16 entries is introduced by a comment giving its first index.
 *
 * Many entries in the 0xa0-0xff range are 0x0000.
 * NOTE(review): presumably 0x0000 means "no mapping" for every index other
 * than 0 -- confirm against the lookup routine, which is outside this
 * section.
 *
 * NOTE(review): bytes 0x30-0x39 map to 0x0660-0x0669 rather than to
 * 0x0030-0x0039; page06[] has the matching reverse entries and page00[]
 * leaves 0x30-0x39 unmapped, so the tables are consistent with each other.
 */
static const wchar_t charset2uni[256] = {
        /* 0x00*/
        0x0000, 0x0001, 0x0002, 0x0003,
        0x0004, 0x0005, 0x0006, 0x0007,
        0x0008, 0x0009, 0x000a, 0x000b,
        0x000c, 0x000d, 0x000e, 0x000f,
        /* 0x10*/
        0x0010, 0x0011, 0x0012, 0x0013,
        0x0014, 0x0015, 0x0016, 0x0017,
        0x0018, 0x0019, 0x001a, 0x001b,
        0x001c, 0x001d, 0x001e, 0x001f,
        /* 0x20*/
        0x0020, 0x0021, 0x0022, 0x0023,
        0x0024, 0x0025, 0x0026, 0x0027,
        0x0028, 0x0029, 0x002a, 0x002b,
        0x002c, 0x002d, 0x002e, 0x002f,
        /* 0x30*/
        0x0660, 0x0661, 0x0662, 0x0663,
        0x0664, 0x0665, 0x0666, 0x0667,
        0x0668, 0x0669, 0x003a, 0x003b,
        0x003c, 0x003d, 0x003e, 0x003f,
        /* 0x40*/
        0x0040, 0x0041, 0x0042, 0x0043,
        0x0044, 0x0045, 0x0046, 0x0047,
        0x0048, 0x0049, 0x004a, 0x004b,
        0x004c, 0x004d, 0x004e, 0x004f,
        /* 0x50*/
        0x0050, 0x0051, 0x0052, 0x0053,
        0x0054, 0x0055, 0x0056, 0x0057,
        0x0058, 0x0059, 0x005a, 0x005b,
        0x005c, 0x005d, 0x005e, 0x005f,
        /* 0x60*/
        0x0060, 0x0061, 0x0062, 0x0063,
        0x0064, 0x0065, 0x0066, 0x0067,
        0x0068, 0x0069, 0x006a, 0x006b,
        0x006c, 0x006d, 0x006e, 0x006f,
        /* 0x70*/
        0x0070, 0x0071, 0x0072, 0x0073,
        0x0074, 0x0075, 0x0076, 0x0077,
        0x0078, 0x0079, 0x007a, 0x007b,
        0x007c, 0x007d, 0x007e, 0x007f,
        /* 0x80*/
        0x0080, 0x0081, 0x0082, 0x0083,
        0x0084, 0x0085, 0x0086, 0x0087,
        0x0088, 0x0089, 0x008a, 0x008b,
        0x008c, 0x008d, 0x008e, 0x008f,
        /* 0x90*/
        0x0090, 0x0091, 0x0092, 0x0093,
        0x0094, 0x0095, 0x0096, 0x0097,
        0x0098, 0x0099, 0x009a, 0x009b,
        0x009c, 0x009d, 0x009e, 0x009f,
        /* 0xa0*/
        0x00a0, 0x0000, 0x0000, 0x0000,
        0x00a4, 0x0000, 0x0000, 0x0000,
        0x0000, 0x0000, 0x0000, 0x0000,
        0x060c, 0x00ad, 0x0000, 0x0000,
        /* 0xb0*/
        0x0000, 0x0000, 0x0000, 0x0000,
        0x0000, 0x0000, 0x0000, 0x0000,
        0x0000, 0x0000, 0x0000, 0x061b,
        0x0000, 0x0000, 0x0000, 0x061f,
        /* 0xc0*/
        0x0000, 0x0621, 0x0622, 0x0623,
        0x0624, 0x0625, 0x0626, 0x0627,
        0x0628, 0x0629, 0x062a, 0x062b,
        0x062c, 0x062d, 0x062e, 0x062f,
        /* 0xd0*/
        0x0630, 0x0631, 0x0632, 0x0633,
        0x0634, 0x0635, 0x0636, 0x0637,
        0x0638, 0x0639, 0x063a, 0x0000,
        0x0000, 0x0000, 0x0000, 0x0000,
        /* 0xe0*/
        0x0640, 0x0641, 0x0642, 0x0643,
        0x0644, 0x0645, 0x0646, 0x0647,
        0x0648, 0x0649, 0x064a, 0x064b,
        0x064c, 0x064d, 0x064e, 0x064f,
        /* 0xf0*/
        0x0650, 0x0651, 0x0652, 0x0000,
        0x0000, 0x0000, 0x0000, 0x0000,
        0x0000, 0x0000, 0x0000, 0x0000,
        0x0000, 0x0000, 0x0000, 0x0000,
};

/*
 * Reverse table for Unicode page 0x00 (slot 0 of page_uni2charset[]): the
 * entry at index N is the charset byte for code point 0x00NN.  The trailing
 * comment on each row gives the index range it covers.
 *
 * Only indices 0x00-0xaf are listed; the remaining elements of the
 * 256-entry array are zero-initialised.
 * NOTE(review): a 0x00 entry at a non-zero index presumably means "no
 * exact mapping" -- confirm against the lookup routine.
 */
static const unsigned char page00[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
        0x00, 0x00, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
        0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */
};

/*
 * Unicode page 0x06 (U+0600..U+06FF) -> charset byte, indexed by the low
 * byte of the code point; a 0x00 entry means "no mapping". Code points
 * U+0660..U+0669 map to bytes 0x30-0x39. Only indexes 0x00-0x6f are listed;
 * the rest of the 256-entry array is implicitly zero.
 */
static const unsigned char page06[256] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
        0x00, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, /* 0x08-0x0f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
        0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0x18-0x1f */
        0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */
        0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */
        0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x60-0x67 */
        0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
};

/*
 * Per-page lookup tables, indexed by the high byte of a 16-bit code point.
 * Only pages 0x00 and 0x06 are populated; the NULLs listed here and every
 * element not listed (implicitly NULL) mark pages without any mapping.
 */
static const unsigned char *const page_uni2charset[256] = {
        page00, NULL,   NULL,   NULL,   NULL,   NULL,   page06, NULL,   
};

/*
 * Lower-casing table indexed by charset byte, published through the
 * nls_table below as .charset2lower. Bytes 0x41-0x5a map to 0x61-0x7a;
 * every other non-zero entry maps to itself. Zero entries in the upper
 * half correspond to bytes that have no entry in the uni->charset pages.
 */
static const unsigned char charset2lower[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
        0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
        0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
        0x00, 0x00, 0x00, 0x00, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
        0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */
        0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */
};

/*
 * Upper-casing table indexed by charset byte, published through the
 * nls_table below as .charset2upper. Bytes 0x61-0x7a map to 0x41-0x5a;
 * every other non-zero entry maps to itself. The final row (0xf8-0xff) is
 * not spelled out and is therefore implicitly zero.
 */
static const unsigned char charset2upper[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
        0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
        0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
        0x00, 0x00, 0x00, 0x00, 0xac, 0xad, 0x00, 0x00, /* 0xa8-0xaf */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
        0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbf, /* 0xb8-0xbf */
        0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
};

/*
 * uni2char - encode one Unicode code point as a single charset byte
 * @uni:      code point to encode; only its low 16 bits are consulted
 * @out:      destination buffer; out[0] receives the encoded byte
 * @boundlen: space available in @out
 *
 * Returns 1 (the number of bytes stored) on success, -ENAMETOOLONG when
 * @boundlen is not positive, or -EINVAL when the page tables hold no
 * mapping for @uni.
 */
static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
{
        const unsigned char hi = (uni & 0xff00) >> 8;
        const unsigned char lo = uni & 0x00ff;
        const unsigned char *page;
        unsigned char byte;

        if (boundlen <= 0)
                return -ENAMETOOLONG;

        /* A missing page means nothing in that 256-code-point block maps. */
        page = page_uni2charset[hi];
        if (!page)
                return -EINVAL;

        /* Within a page, a zero entry marks an unmapped code point. */
        byte = page[lo];
        if (!byte)
                return -EINVAL;

        out[0] = byte;
        return 1;
}

/*
 * char2uni - decode one charset byte into a Unicode code point
 * @rawstring: input bytes; only the first byte is read
 * @boundlen:  length of @rawstring (not consulted by this implementation)
 * @uni:       receives the decoded code point, even when it is 0x0000
 *
 * Returns 1 (the number of bytes consumed) on success, or -EINVAL when
 * the byte has no entry in charset2uni.
 */
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
{
        wchar_t decoded = charset2uni[*rawstring];

        *uni = decoded;
        return decoded == 0x0000 ? -EINVAL : 1;
}

/*
 * NLS descriptor for the "iso8859-6" charset: ties together the two
 * conversion callbacks and the case-mapping tables defined in this file.
 * It is the object handed to register_nls()/unregister_nls() below.
 */
static struct nls_table table = {
        .charset        = "iso8859-6",
        .uni2char        = uni2char,
        .char2uni        = char2uni,
        .charset2lower        = charset2lower,
        .charset2upper        = charset2upper,
};

/*
 * Module init: register the iso8859-6 table with the NLS core and pass
 * register_nls()'s result straight back to the caller.
 */
static int __init init_nls_iso8859_6(void)
{
        int err = register_nls(&table);

        return err;
}

/* Module exit: remove the iso8859-6 table registered at init time. */
static void __exit exit_nls_iso8859_6(void)
{
        unregister_nls(&table);
}

/* Hook the init/exit routines into module load and unload. */
module_init(init_nls_iso8859_6)
module_exit(exit_nls_iso8859_6)

MODULE_LICENSE("Dual BSD/GPL");



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 










    1 




































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * KVM paravirt_ops implementation
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <aliguori@us.ibm.com>
 */

#define pr_fmt(fmt) "kvm-guest: " fmt

#include <linux/context_tracking.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <linux/nmi.h>
#include <linux/swait.h>
#include <linux/syscore_ops.h>
#include <linux/cc_platform.h>
#include <linux/efi.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>
#include <asm/ptrace.h>
#include <asm/reboot.h>
#include <asm/svm.h>
#include <asm/e820/api.h>

/*
 * Static key for async page fault support, defined in the "false" state.
 * kvm_guest_cpu_init() warns once if it is still off when async PF is
 * being enabled on a CPU.
 */
DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);

/* Non-zero (the default) permits kvm_guest_cpu_init() to enable async PF. */
static int kvmapf = 1;

/*
 * Handler for the "no-kvmapf" early parameter: clears kvmapf.
 * @arg is ignored. Always returns 0.
 */
static int __init parse_no_kvmapf(char *arg)
{
        kvmapf = 0;
        return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);

/* Defaults to 1; cleared by the "no-steal-acc" early parameter. */
static int steal_acc = 1;
/*
 * Handler for the "no-steal-acc" early parameter: clears steal_acc.
 * @arg is ignored. Always returns 0.
 */
static int __init parse_no_stealacc(char *arg)
{
        steal_acc = 0;
        return 0;
}

early_param("no-steal-acc", parse_no_stealacc);

/* Per-CPU flag: true while async PF is enabled on that CPU (see kvm_guest_cpu_init()). */
static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
/* Per-CPU async PF data area; its physical address is written to MSR_KVM_ASYNC_PF_EN. */
static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
/* Per-CPU steal time area; its physical address is written to MSR_KVM_STEAL_TIME. */
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
/* Non-zero when steal time is in use; gates the steal-time MSR writes in this file. */
static int has_steal_clock = 0;

/* NOTE(review): presumably tracks guest halt-polling support; its users are not visible in this part of the file. */
static int has_guest_poll = 0;
/*
 * No need for any "IO delay" on KVM.
 *
 * Intentionally empty; paravirt_ops_setup() installs it as
 * pv_ops.cpu.io_delay when the host advertises KVM_FEATURE_NOP_IO_DELAY.
 */
static void kvm_io_delay(void)
{
}

/* The async PF sleeper hash table has 2^KVM_TASK_SLEEP_HASHBITS buckets. */
#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

/*
 * One entry in the async PF sleeper hash table: either a task waiting for
 * a token, or a heap-allocated "dummy" recording a wakeup that arrived
 * before the corresponding wait was queued.
 */
struct kvm_task_sleep_node {
        struct hlist_node link;         /* linkage in the hash bucket list */
        struct swait_queue_head wq;     /* wait queue the waiting task sleeps on */
        u32 token;                      /* async PF token identifying this entry */
        int cpu;                        /* CPU that queued the entry */
};

/*
 * Hash table of sleeper nodes, indexed by hash_32(token). Each bucket has
 * its own raw spinlock, taken around every list lookup and modification
 * in this file.
 */
static struct kvm_task_sleep_head {
        raw_spinlock_t lock;
        struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

/*
 * Look up the sleeper node carrying @token in bucket @b.
 *
 * Returns the matching node, or NULL when the bucket holds none. The
 * callers in this file invoke it with b->lock held.
 */
static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
                                                  u32 token)
{
        struct kvm_task_sleep_node *node;

        hlist_for_each_entry(node, &b->list, link) {
                if (node->token == token)
                        return node;
        }

        return NULL;
}

/*
 * Queue sleeper node @n for @token in the hash table.
 *
 * If an entry for @token already exists, the wakeup was delivered ahead of
 * the page fault: that dummy entry is unlinked and freed, and false is
 * returned so the caller does not sleep. Otherwise @n is initialised,
 * added to the bucket and true is returned.
 */
static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
{
        struct kvm_task_sleep_head *bucket =
                &async_pf_sleepers[hash_32(token, KVM_TASK_SLEEP_HASHBITS)];
        struct kvm_task_sleep_node *early;
        bool queued;

        raw_spin_lock(&bucket->lock);
        early = _find_apf_task(bucket, token);
        if (early) {
                /* dummy entry exist -> wake up was delivered ahead of PF */
                hlist_del(&early->link);
                queued = false;
        } else {
                n->token = token;
                n->cpu = smp_processor_id();
                init_swait_queue_head(&n->wq);
                hlist_add_head(&n->link, &bucket->list);
                queued = true;
        }
        raw_spin_unlock(&bucket->lock);

        /* Free outside the raw lock; a NULL pointer makes this a no-op. */
        kfree(early);
        return queued;
}

/*
 * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
 * @token:        Token to identify the sleep node entry
 *
 * Invoked from the async pagefault handling code or from the VM exit page
 * fault handler. In both cases RCU is watching.
 *
 * Must be entered with interrupts disabled (asserted through lockdep) and
 * returns with them disabled. The sleep node lives on this function's
 * stack. If a wakeup for @token was already recorded,
 * kvm_async_pf_queue_task() returns false and the function returns
 * without sleeping.
 */
void kvm_async_pf_task_wait_schedule(u32 token)
{
        struct kvm_task_sleep_node n;
        DECLARE_SWAITQUEUE(wait);

        lockdep_assert_irqs_disabled();

        if (!kvm_async_pf_queue_task(token, &n))
                return;

        for (;;) {
                prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
                /* The waker unhashes the node (hlist_del_init) before waking us. */
                if (hlist_unhashed(&n.link))
                        break;

                /* Sleep with interrupts enabled, then restore the disabled state. */
                local_irq_enable();
                schedule();
                local_irq_disable();
        }
        finish_swait(&n.wq, &wait);
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);

/*
 * Unlink @n from its bucket and wake the task sleeping on it, if any.
 * hlist_del_init() leaves the node "unhashed", which is the condition the
 * waiter in kvm_async_pf_task_wait_schedule() checks before sleeping.
 * The callers in this file hold the bucket lock.
 */
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
        hlist_del_init(&n->link);
        if (swq_has_sleeper(&n->wq))
                swake_up_one(&n->wq);
}

static void apf_task_wake_all(void)
{
        int i;

        for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
                struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
                struct kvm_task_sleep_node *n;
                struct hlist_node *p, *next;

                raw_spin_lock(&b->lock);
                hlist_for_each_safe(p, next, &b->list) {
                        n = hlist_entry(p, typeof(*n), link);
                        if (n->cpu == smp_processor_id())
                                apf_task_wake_one(n);
                }
                raw_spin_unlock(&b->lock);
        }
}

/*
 * kvm_async_pf_task_wake - wake the task waiting on an async PF token
 * @token: token to wake; ~0 wakes every sleeper queued by the current CPU
 *
 * If a sleeper node for @token exists it is unlinked and woken. If not,
 * the wakeup arrived before the wait was queued, so a dummy node is
 * inserted for kvm_async_pf_queue_task() to find later.
 */
void kvm_async_pf_task_wake(u32 token)
{
        u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
        struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
        struct kvm_task_sleep_node *n, *dummy = NULL;

        if (token == ~0) {
                apf_task_wake_all();
                return;
        }

again:
        raw_spin_lock(&b->lock);
        n = _find_apf_task(b, token);
        if (!n) {
                /*
                 * Async #PF not yet handled, add a dummy entry for the token.
                 * Allocating the token must be done outside of the raw lock
                 * as the allocator is preemptible on PREEMPT_RT kernels.
                 */
                if (!dummy) {
                        raw_spin_unlock(&b->lock);
                        dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);

                        /*
                         * Continue looping on allocation failure, eventually
                         * the async #PF will be handled and allocating a new
                         * node will be unnecessary.
                         */
                        if (!dummy)
                                cpu_relax();

                        /*
                         * Recheck for async #PF completion before enqueueing
                         * the dummy token to avoid duplicate list entries.
                         */
                        goto again;
                }
                dummy->token = token;
                dummy->cpu = smp_processor_id();
                init_swait_queue_head(&dummy->wq);
                hlist_add_head(&dummy->link, &b->list);
                /* Ownership moved to the list; keep the kfree() below from freeing it. */
                dummy = NULL;
        } else {
                apf_task_wake_one(n);
        }
        raw_spin_unlock(&b->lock);

        /* A dummy token might be allocated and ultimately not used.  */
        kfree(dummy);
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);

/*
 * Fetch this CPU's pending async PF flags and clear them.
 *
 * Returns 0 without touching apf_reason when async PF is not enabled on
 * this CPU; otherwise returns the value of apf_reason.flags that was
 * present before it was reset to 0.
 */
noinstr u32 kvm_read_and_reset_apf_flags(void)
{
        u32 pending;

        if (!__this_cpu_read(async_pf_enabled))
                return 0;

        pending = __this_cpu_read(apf_reason.flags);
        __this_cpu_write(apf_reason.flags, 0);

        return pending;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);

/*
 * __kvm_handle_async_pf - handle a page fault that may be a KVM async #PF
 * @regs:  register state at the time of the fault
 * @token: async PF token used to identify the sleep node
 *
 * Returns false, without doing anything else, when no async PF flags are
 * pending on this CPU. Returns true once the event has been consumed.
 * Panics when the fault hit an interrupt-disabled region, or when a
 * "page not present" event arrives for kernel mode.
 */
noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
        u32 flags = kvm_read_and_reset_apf_flags();
        irqentry_state_t state;

        if (!flags)
                return false;

        state = irqentry_enter(regs);
        instrumentation_begin();

        /*
         * If the host managed to inject an async #PF into an interrupt
         * disabled region, then die hard as this is not going to end well
         * and the host side is seriously broken.
         */
        if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
                panic("Host injected async #PF in interrupt disabled region\n");

        if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
                if (unlikely(!(user_mode(regs))))
                        panic("Host injected async #PF in kernel mode\n");
                /* Page is swapped out by the host. */
                kvm_async_pf_task_wait_schedule(token);
        } else {
                WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
        }

        instrumentation_end();
        irqentry_exit(regs, state);
        return true;
}

/*
 * Async PF interrupt handler. After the EOI and the statistics update, if
 * async PF is enabled on this CPU, it wakes the waiter for the token found
 * in apf_reason, clears the token and acknowledges it to the host through
 * MSR_KVM_ASYNC_PF_ACK.
 */
DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
{
        struct pt_regs *old_regs = set_irq_regs(regs);
        u32 token;

        apic_eoi();

        inc_irq_stat(irq_hv_callback_count);

        if (__this_cpu_read(async_pf_enabled)) {
                token = __this_cpu_read(apf_reason.token);
                kvm_async_pf_task_wake(token);
                /* Clear the slot before acknowledging it to the host. */
                __this_cpu_write(apf_reason.token, 0);
                wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
        }

        set_irq_regs(old_regs);
}

/*
 * Boot-time paravirt setup: names the platform "KVM", replaces the I/O
 * delay hook with an empty function when the host advertises
 * KVM_FEATURE_NOP_IO_DELAY, and sets no_timer_check on IO-APIC builds.
 */
static void __init paravirt_ops_setup(void)
{
        pv_info.name = "KVM";

        if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
                pv_ops.cpu.io_delay = kvm_io_delay;

#ifdef CONFIG_X86_IO_APIC
        no_timer_check = 1;
#endif
}

/*
 * Register the current CPU's steal_time area with the host by writing its
 * physical address, with KVM_MSR_ENABLED set, to MSR_KVM_STEAL_TIME.
 * Does nothing when steal time is not in use (has_steal_clock == 0).
 */
static void kvm_register_steal_time(void)
{
        int cpu = smp_processor_id();
        struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

        if (!has_steal_clock)
                return;

        wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
        pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
                (unsigned long long) slow_virt_to_phys(st));
}

/*
 * Per-CPU PV EOI word, initialised to KVM_PV_EOI_DISABLED. Its physical
 * address is handed to the host through MSR_KVM_PV_EOI_EN in
 * kvm_guest_cpu_init().
 */
static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;

/*
 * EOI write with the PV EOI shortcut: if KVM_PV_EOI_BIT is set in this
 * CPU's kvm_apic_eoi word, clearing it is all that is done and the native
 * EOI is skipped; otherwise fall back to apic_native_eoi().
 */
static notrace __maybe_unused void kvm_guest_apic_eoi_write(void)
{
        /**
         * This relies on __test_and_clear_bit to modify the memory
         * in a way that is atomic with respect to the local CPU.
         * The hypervisor only accesses this memory from the local CPU so
         * there's no need for lock or memory barriers.
         * An optimization barrier is implied in apic write.
         */
        if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
                return;
        apic_native_eoi();
}

/*
 * Enable the KVM paravirt features on the current CPU:
 *  - async page faults, when the host offers KVM_FEATURE_ASYNC_PF_INT and
 *    "no-kvmapf" was not given;
 *  - PV EOI, when the host offers KVM_FEATURE_PV_EOI;
 *  - steal time, when has_steal_clock is set.
 */
static void kvm_guest_cpu_init(void)
{
        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
                u64 pa;

                WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));

                /* The enable/delivery flags are OR-ed into the address of this CPU's apf_reason. */
                pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
                pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;

                if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
                        pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;

                /* Program the notification vector before enabling async PF. */
                wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);

                wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
                __this_cpu_write(async_pf_enabled, true);
                pr_debug("setup async PF for cpu %d\n", smp_processor_id());
        }

        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
                unsigned long pa;

                /* Size alignment is implied but just to make it explicit. */
                BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
                /* Zero the PV EOI word before handing its address to the host. */
                __this_cpu_write(kvm_apic_eoi, 0);
                pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
                        | KVM_MSR_ENABLED;
                wrmsrl(MSR_KVM_PV_EOI_EN, pa);
        }

        if (has_steal_clock)
                kvm_register_steal_time();
}

/*
 * Turn async PF off on the current CPU: write 0 to MSR_KVM_ASYNC_PF_EN
 * and clear the per-CPU enabled flag. A no-op when async PF is not
 * enabled on this CPU.
 */
static void kvm_pv_disable_apf(void)
{
        if (__this_cpu_read(async_pf_enabled)) {
                wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
                __this_cpu_write(async_pf_enabled, false);

                pr_debug("disable async PF for cpu %d\n", smp_processor_id());
        }
}

/*
 * Stop steal time reporting for the current CPU by zeroing
 * MSR_KVM_STEAL_TIME. Does nothing when steal time is not in use.
 */
static void kvm_disable_steal_time(void)
{
        if (has_steal_clock)
                wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}

/*
 * kvm_steal_clock - read the steal time value published for @cpu
 * @cpu: CPU whose steal_time area is read
 *
 * Returns the ->steal field of that CPU's steal_time area. The read is
 * retried while ->version is odd or changed between the two samples, so
 * the returned value comes from a stable snapshot.
 * NOTE(review): presumably the host makes ->version odd while it updates
 * the area; confirm against the KVM steal time documentation.
 */
static u64 kvm_steal_clock(int cpu)
{
        u64 steal;
        struct kvm_steal_time *src;
        int version;

        src = &per_cpu(steal_time, cpu);
        do {
                version = src->version;
                /* Order the version read before the payload read. */
                virt_rmb();
                steal = src->steal;
                /* Order the payload read before the version re-check. */
                virt_rmb();
        } while ((version & 1) || (version != src->version));

        return steal;
}

/*
 * Mark @size bytes starting at @ptr as decrypted; a thin wrapper that
 * casts the pointer for early_set_memory_decrypted().
 */
static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
{
        early_set_memory_decrypted((unsigned long) ptr, size);
}

/*
 * Map the per-CPU apf_reason, steal_time and kvm_apic_eoi areas of every
 * possible CPU as decrypted in one pass.
 *
 * All possible CPUs are covered, not only the online ones, so that a CPU
 * hotplugged later already has its per-cpu variables mapped as decrypted.
 *
 * Does nothing unless the CC vendor is AMD and guest memory encryption is
 * active.
 */
static void __init sev_map_percpu_data(void)
{
        int cpu;

        if (cc_vendor != CC_VENDOR_AMD)
                return;

        if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
                return;

        for_each_possible_cpu(cpu) {
                __set_percpu_decrypted(&per_cpu(apf_reason, cpu),
                                       sizeof(apf_reason));
                __set_percpu_decrypted(&per_cpu(steal_time, cpu),
                                       sizeof(steal_time));
                __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu),
                                       sizeof(kvm_apic_eoi));
        }
}

/*
 * kvm_guest_cpu_offline - undo the per-CPU paravirt setup on this CPU
 * @shutdown: when true, sleepers queued by this CPU are NOT woken
 *
 * Disables steal time, PV EOI and migration control (the latter two only
 * when the host offers the feature), turns async PF off, optionally wakes
 * this CPU's async PF sleepers, and finally disables kvmclock.
 */
static void kvm_guest_cpu_offline(bool shutdown)
{
        kvm_disable_steal_time();
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                wrmsrl(MSR_KVM_PV_EOI_EN, 0);
        if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
                wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0);
        kvm_pv_disable_apf();
        if (!shutdown)
                apf_task_wake_all();
        kvmclock_disable();
}

/*
 * kvm_cpu_online - run the per-CPU paravirt setup
 * @cpu: not used by the body; the setup acts on the CPU executing it
 *
 * Runs kvm_guest_cpu_init() with local interrupts disabled.
 * Always returns 0.
 */
static int kvm_cpu_online(unsigned int cpu)
{
        unsigned long flags;

        local_irq_save(flags);
        kvm_guest_cpu_init();
        local_irq_restore(flags);
        return 0;
}

#ifdef CONFIG_SMP

/*
 * Per-CPU scratch cpumask used by the PV IPI and PV TLB flush paths to
 * build a modified copy of the caller's mask.
 */
static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);

static bool pv_tlb_flush_supported(void)
{
        return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
                !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
                kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
                !boot_cpu_has(X86_FEATURE_MWAIT) &&
                (num_possible_cpus() != 1));
}

static bool pv_ipi_supported(void)
{
        return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
               (num_possible_cpus() != 1));
}

static bool pv_sched_yield_supported(void)
{
        return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
                !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
            kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
            !boot_cpu_has(X86_FEATURE_MWAIT) &&
            (num_possible_cpus() != 1));
}

/* Number of APIC IDs that one KVM_HC_SEND_IPI hypercall bitmap can cover. */
#define KVM_IPI_CLUSTER_SIZE        (2 * BITS_PER_LONG)

/*
 * __send_ipi_mask - send @vector to every CPU in @mask via hypercall
 * @mask:   destination CPUs; an empty mask returns immediately
 * @vector: interrupt vector; NMI_VECTOR selects NMI delivery mode
 *
 * APIC IDs are gathered into a bitmap whose bit 0 corresponds to APIC ID
 * "min". Whenever the next APIC ID does not fit in the current
 * KVM_IPI_CLUSTER_SIZE-wide window, the bitmap gathered so far is sent
 * with KVM_HC_SEND_IPI and a new window is started. Runs with local
 * interrupts disabled for the duration of the loop.
 */
static void __send_ipi_mask(const struct cpumask *mask, int vector)
{
        unsigned long flags;
        int cpu, min = 0, max = 0;
#ifdef CONFIG_X86_64
        __uint128_t ipi_bitmap = 0;
#else
        u64 ipi_bitmap = 0;
#endif
        u32 apic_id, icr;
        long ret;

        if (cpumask_empty(mask))
                return;

        local_irq_save(flags);

        switch (vector) {
        default:
                icr = APIC_DM_FIXED | vector;
                break;
        case NMI_VECTOR:
                icr = APIC_DM_NMI;
                break;
        }

        for_each_cpu(cpu, mask) {
                apic_id = per_cpu(x86_cpu_to_apicid, cpu);
                if (!ipi_bitmap) {
                        /* Empty bitmap: start a new window at this APIC ID. */
                        min = max = apic_id;
                } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
                        /* Below the window but still in range: shift the bitmap and lower the base. */
                        ipi_bitmap <<= min - apic_id;
                        min = apic_id;
                } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
                        /* Inside the window: only the upper bound may need raising. */
                        max = apic_id < max ? max : apic_id;
                } else {
                        /* Does not fit: send what was gathered and restart the window here. */
                        ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
                                (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
                        WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
                                  ret);
                        min = max = apic_id;
                        ipi_bitmap = 0;
                }
                __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
        }

        /* Send whatever remains in the last window. */
        if (ipi_bitmap) {
                ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
                        (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
                WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
                          ret);
        }

        local_irq_restore(flags);
}

/*
 * send_IPI_mask callback (installed by kvm_setup_pv_ipi()): forwards
 * @mask and @vector unchanged to __send_ipi_mask().
 */
static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
{
        __send_ipi_mask(mask, vector);
}

/*
 * send_IPI_mask_allbutself callback: send @vector to every CPU in @mask
 * except the one executing this function. @mask itself is not modified;
 * the reduced mask is built in this CPU's __pv_cpu_mask scratch cpumask.
 */
static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
{
        unsigned int self = smp_processor_id();
        struct cpumask *others = this_cpu_cpumask_var_ptr(__pv_cpu_mask);

        cpumask_copy(others, mask);
        cpumask_clear_cpu(self, others);

        __send_ipi_mask(others, vector);
}

/*
 * Late initcall: read the "SevLiveMigrationEnabled" EFI variable and, if
 * it is set, tell the host the guest is ready for migration by writing
 * KVM_MIGRATION_READY to MSR_KVM_MIGRATION_CONTROL.
 *
 * Returns 1 after the MSR has been written. Returns 0 in every other
 * case: no guest memory encryption, no migration control feature, EFI
 * boot or runtime services unavailable, variable missing or unreadable,
 * or variable set to 0.
 */
static int __init setup_efi_kvm_sev_migration(void)
{
        efi_char16_t var_name[] = L"SevLiveMigrationEnabled";
        efi_guid_t var_guid = AMD_SEV_MEM_ENCRYPT_GUID;
        efi_status_t status;
        bool enabled;
        unsigned long size = sizeof(enabled);

        if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
                return 0;

        if (!kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
                return 0;

        if (!efi_enabled(EFI_BOOT))
                return 0;

        if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
                pr_info("%s : EFI runtime services are not enabled\n", __func__);
                return 0;
        }

        /* Fetch the variable's contents into 'enabled'. */
        status = efi.get_variable(var_name, &var_guid, NULL, &size, &enabled);

        switch (status) {
        case EFI_SUCCESS:
                break;
        case EFI_NOT_FOUND:
                pr_info("%s : EFI live migration variable not found\n", __func__);
                return 0;
        default:
                pr_info("%s : EFI variable retrieval failed\n", __func__);
                return 0;
        }

        if (!enabled) {
                pr_info("%s: live migration disabled in EFI\n", __func__);
                return 0;
        }

        pr_info("%s : live migration enabled in EFI\n", __func__);
        wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);

        return 1;
}

late_initcall(setup_efi_kvm_sev_migration);

/*
 * Set the IPI entry points
 *
 * Replaces the APIC send_IPI_mask and send_IPI_mask_allbutself callbacks
 * with the hypercall-based implementations from this file and logs that
 * PV IPIs are in use.
 */
static __init void kvm_setup_pv_ipi(void)
{
        apic_update_callback(send_IPI_mask, kvm_send_ipi_mask);
        apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself);
        pr_info("setup PV IPIs\n");
}

/*
 * Send the call-function IPI natively, then issue one KVM_HC_SCHED_YIELD
 * hypercall for the first CPU in @mask that is not idle and is reported as
 * preempted.
 */
static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
{
        int target;

        native_send_call_func_ipi(mask);

        /* Give a preempted, busy target vCPU a chance to run. */
        for_each_cpu(target, mask) {
                if (idle_cpu(target) || !vcpu_is_preempted(target))
                        continue;

                kvm_hypercall1(KVM_HC_SCHED_YIELD,
                               per_cpu(x86_cpu_to_apicid, target));
                break;
        }
}

/*
 * Paravirtual remote TLB flush.
 *
 * Works on a per-CPU scratch copy of @cpumask. Every vCPU whose steal_time
 * reports KVM_VCPU_PREEMPTED gets KVM_VCPU_FLUSH_TLB set atomically and, if
 * that update succeeds, is dropped from the scratch mask. The remaining CPUs
 * are flushed through native_flush_tlb_multi().
 */
static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
                        const struct flush_tlb_info *info)
{
        struct cpumask *pending = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
        int vcpu;

        cpumask_copy(pending, cpumask);

        for_each_cpu(vcpu, pending) {
                /*
                 * No special case for the local vCPU: it is never preempted
                 * here, so it is never removed from the mask.
                 */
                struct kvm_steal_time *st = &per_cpu(steal_time, vcpu);
                u8 seen = READ_ONCE(st->preempted);

                if (!(seen & KVM_VCPU_PREEMPTED))
                        continue;

                /* Only skip the flush if the flag was queued successfully. */
                if (try_cmpxchg(&st->preempted, &seen,
                                seen | KVM_VCPU_FLUSH_TLB))
                        __cpumask_clear_cpu(vcpu, pending);
        }

        native_flush_tlb_multi(pending, info);
}

static __init int kvm_alloc_cpumask(void)
{
        int cpu;

        if (!kvm_para_available() || nopv)
                return 0;

        if (pv_tlb_flush_supported() || pv_ipi_supported())
                for_each_possible_cpu(cpu) {
                        zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
                                GFP_KERNEL, cpu_to_node(cpu));
                }

        return 0;
}
arch_initcall(kvm_alloc_cpumask);

/*
 * Boot-CPU preparation hook; kvm_guest_init() installs it as
 * smp_ops.smp_prepare_boot_cpu. Runs the KVM guest per-CPU init, the native
 * boot-CPU preparation and the PV spinlock setup, in that order.
 */
static void __init kvm_smp_prepare_boot_cpu(void)
{
        /*
         * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
         * shares the guest physical address with the hypervisor.
         */
        sev_map_percpu_data();

        kvm_guest_cpu_init();
        native_smp_prepare_boot_cpu();
        kvm_spinlock_init();
}

/*
 * CPU hotplug teardown callback: call kvm_guest_cpu_offline(false) with
 * local interrupts disabled. Always returns 0.
 */
static int kvm_cpu_down_prepare(unsigned int cpu)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        kvm_guest_cpu_offline(false);
        local_irq_restore(irq_state);

        return 0;
}

#endif

/*
 * Syscore suspend hook: call kvm_guest_cpu_offline(false) and, with haltpoll
 * support built in, record in has_guest_poll whether bit 0 of
 * MSR_KVM_POLL_CONTROL is clear. Always returns 0.
 */
static int kvm_suspend(void)
{
        u64 poll_ctrl = 0;

        kvm_guest_cpu_offline(false);

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
        /* Without the feature the MSR is not read and poll_ctrl stays 0. */
        if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
                rdmsrl(MSR_KVM_POLL_CONTROL, poll_ctrl);
        has_guest_poll = (poll_ctrl & 1) == 0;
#endif
        return 0;
}

/*
 * Syscore resume hook: call kvm_cpu_online() for the current CPU and, if
 * has_guest_poll was set at suspend time, write 0 to MSR_KVM_POLL_CONTROL
 * again.
 */
static void kvm_resume(void)
{
        kvm_cpu_online(raw_smp_processor_id());

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
        if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
                return;

        if (has_guest_poll)
                wrmsrl(MSR_KVM_POLL_CONTROL, 0);
#endif
}

/* Suspend/resume hooks; registered by kvm_guest_init() via register_syscore_ops(). */
static struct syscore_ops kvm_syscore_ops = {
        .suspend        = kvm_suspend,
        .resume                = kvm_resume,
};

/*
 * Per-CPU callback run through on_each_cpu() from kvm_pv_reboot_notify();
 * @unused is ignored.
 */
static void kvm_pv_guest_cpu_reboot(void *unused)
{
        kvm_guest_cpu_offline(true);
}

/*
 * Reboot notifier callback: on SYS_RESTART, run kvm_pv_guest_cpu_reboot() on
 * every CPU and wait for completion. Always returns NOTIFY_DONE.
 */
static int kvm_pv_reboot_notify(struct notifier_block *nb,
                                unsigned long code, void *unused)
{
        switch (code) {
        case SYS_RESTART:
                on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
                break;
        default:
                break;
        }

        return NOTIFY_DONE;
}

/* Reboot notifier block; registered by kvm_guest_init(). */
static struct notifier_block kvm_pv_reboot_nb = {
        .notifier_call = kvm_pv_reboot_notify,
};

/*
 * After a PV feature is registered, the host keeps writing to the
 * registered memory location. If the guest shuts down, that memory is no
 * longer valid. In cases like kexec, where a new kernel is installed, this
 * means the host would keep writing to what is by then an arbitrary memory
 * location.
 */
#ifdef CONFIG_CRASH_DUMP
/*
 * Crash-shutdown hook; kvm_guest_init() installs it as
 * machine_ops.crash_shutdown. Calls kvm_guest_cpu_offline(true) before
 * handing over to the native crash shutdown.
 */
static void kvm_crash_shutdown(struct pt_regs *regs)
{
        kvm_guest_cpu_offline(true);
        native_machine_crash_shutdown(regs);
}
#endif

#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
bool __kvm_vcpu_is_preempted(long cpu);

/*
 * C implementation for 32-bit or non-SMP builds: report whether @cpu's
 * steal_time has KVM_VCPU_PREEMPTED set.
 */
__visible bool __kvm_vcpu_is_preempted(long cpu)
{
        struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

        return (st->preempted & KVM_VCPU_PREEMPTED) != 0;
}
PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);

#else

#include <asm/asm-offsets.h>

extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);

/*
 * Hand-optimized version for x86-64 that avoids saving and restoring eight
 * 64-bit registers to/from the stack.
 *
 * The sequence loads __per_cpu_offset[%rdi] into %rax, compares the
 * "preempted" byte of that CPU's steal_time with zero, and sets %al when the
 * byte is non-zero. Note that it tests the whole byte, whereas the C variant
 * masks with KVM_VCPU_PREEMPTED.
 */
#define PV_VCPU_PREEMPTED_ASM                                                     \
 "movq   __per_cpu_offset(,%rdi,8), %rax\n\t"                                     \
 "cmpb   $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
 "setne  %al\n\t"

DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
                PV_VCPU_PREEMPTED_ASM, .text);
#endif

/*
 * Guest late-init hook (x86_hyper_kvm.init.guest_late_init).
 *
 * Registers the reboot notifier and syscore ops, initializes the async-PF
 * sleeper locks, and enables each paravirtual feature the host advertises:
 * steal time, PV EOI, interrupt-based async PF and, on SMP, PV TLB flush,
 * PV sched yield and the CPU hotplug callbacks.
 */
static void __init kvm_guest_init(void)
{
        int i;

        paravirt_ops_setup();
        register_reboot_notifier(&kvm_pv_reboot_nb);
        for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
                raw_spin_lock_init(&async_pf_sleepers[i].lock);

        /* Steal time: also switch vcpu_is_preempted to the KVM variant. */
        if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
                has_steal_clock = 1;
                static_call_update(pv_steal_clock, kvm_steal_clock);

                pv_ops.lock.vcpu_is_preempted =
                        PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
        }

        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                apic_update_callback(eoi, kvm_guest_apic_eoi_write);

        /* Interrupt-based async page faults, unless disabled through kvmapf. */
        if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
                static_branch_enable(&kvm_async_pf_enabled);
                sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
        }

#ifdef CONFIG_SMP
        if (pv_tlb_flush_supported()) {
                pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
                pv_ops.mmu.tlb_remove_table = tlb_remove_table;
                pr_info("KVM setup pv remote TLB flush\n");
        }

        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
        if (pv_sched_yield_supported()) {
                smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
                pr_info("setup PV sched yield\n");
        }
        /* A failure to install the hotplug callbacks is only logged. */
        if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
                                      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
                pr_err("failed to install cpu hotplug callbacks\n");
#else
        /* UP build: do here what kvm_smp_prepare_boot_cpu() does on SMP. */
        sev_map_percpu_data();
        kvm_guest_cpu_init();
#endif

#ifdef CONFIG_CRASH_DUMP
        machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif

        register_syscore_ops(&kvm_syscore_ops);

        /*
         * Hard lockup detection is enabled by default. Disable it, as guests
         * can get false positives too easily, for example if the host is
         * overcommitted.
         */
        hardlockup_detector_disable();
}

/*
 * Look up the CPUID base for the KVM signature. Returns 0 when the boot CPU
 * has no usable CPUID level or does not set X86_FEATURE_HYPERVISOR.
 */
static noinline uint32_t __kvm_cpuid_base(void)
{
        /* So we don't blow up on old processors */
        if (boot_cpu_data.cpuid_level < 0)
                return 0;

        if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
                return 0;

        return hypervisor_cpuid_base(KVM_SIGNATURE, 0);
}

/*
 * Cached wrapper around __kvm_cpuid_base(): the lookup runs while the cached
 * value is still the -1 sentinel; afterwards the stored result is returned.
 */
static inline uint32_t kvm_cpuid_base(void)
{
        static int cached_base = -1;

        if (cached_base != -1)
                return cached_base;

        cached_base = __kvm_cpuid_base();
        return cached_base;
}

/* Returns true when kvm_cpuid_base() yields a non-zero CPUID base. */
bool kvm_para_available(void)
{
        return kvm_cpuid_base() != 0;
}
EXPORT_SYMBOL_GPL(kvm_para_available);

unsigned int kvm_arch_para_features(void)
{
        return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}

unsigned int kvm_arch_para_hints(void)
{
        return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}
EXPORT_SYMBOL_GPL(kvm_arch_para_hints);

/*
 * Detection callback (x86_hyper_kvm.detect): returns the value of
 * kvm_cpuid_base(), which is 0 when no KVM signature was found.
 */
static uint32_t __init kvm_detect(void)
{
        return kvm_cpuid_base();
}

/*
 * APIC post-init hook; kvm_init_platform() installs it as
 * x86_platform.apic_post_init. On SMP builds it switches to PV IPIs when
 * they are supported; otherwise it does nothing.
 */
static void __init kvm_apic_init(void)
{
#ifdef CONFIG_SMP
        if (!pv_ipi_supported())
                return;

        kvm_setup_pv_ipi();
#endif
}

/*
 * x86_hyper_kvm.init.msi_ext_dest_id callback: reports whether the host
 * advertises KVM_FEATURE_MSI_EXT_DEST_ID.
 */
static bool __init kvm_msi_ext_dest_id(void)
{
        return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
}

/*
 * Issue a KVM_HC_MAP_GPA_RANGE hypercall for @npages 4K pages starting at
 * page frame @pfn, with the encryption-status attribute derived from @enc.
 * kvm_init_platform() installs this as
 * pv_ops.mmu.notify_page_enc_status_changed.
 */
static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
{
        kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages,
                           KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
}

/*
 * Platform init hook (x86_hyper_kvm.init.init_platform).
 *
 * For a memory-encrypted guest whose host offers migration control, this
 * installs the page-encryption-status notifier, reports every E820 RAM range
 * as encrypted, marks the bss_decrypted section as decrypted, and, when not
 * booted through EFI, signals migration readiness. It then always initializes
 * kvmclock and installs the APIC post-init hook.
 */
static void __init kvm_init_platform(void)
{
        if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
            kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
                unsigned long nr_pages;
                int i;

                pv_ops.mmu.notify_page_enc_status_changed =
                        kvm_sev_hc_page_enc_status;

                /*
                 * Reset the host's shared pages list related to kernel
                 * specific page encryption status settings before we load a
                 * new kernel by kexec. Reset the page encryption status
                 * during early boot instead of just before kexec to avoid SMP
                 * races during kvm_pv_guest_cpu_reboot().
                 * NOTE: We cannot reset the complete shared pages list
                 * here as we need to retain the UEFI/OVMF firmware
                 * specific settings.
                 */

                for (i = 0; i < e820_table->nr_entries; i++) {
                        struct e820_entry *entry = &e820_table->entries[i];

                        /* Only RAM ranges are reported to the host. */
                        if (entry->type != E820_TYPE_RAM)
                                continue;

                        /* Round up so a partial trailing page is included. */
                        nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);

                        kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
                                       nr_pages,
                                       KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
                }

                /*
                 * Ensure that _bss_decrypted section is marked as decrypted in the
                 * shared pages list.
                 */
                early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
                                                __end_bss_decrypted - __start_bss_decrypted, 0);

                /*
                 * If not booted using EFI, enable Live migration support.
                 * (The EFI case is handled by setup_efi_kvm_sev_migration().)
                 */
                if (!efi_enabled(EFI_BOOT))
                        wrmsrl(MSR_KVM_MIGRATION_CONTROL,
                               KVM_MIGRATION_READY);
        }
        kvmclock_init();
        x86_platform.apic_post_init = kvm_apic_init;
}

#if defined(CONFIG_AMD_MEM_ENCRYPT)
/*
 * x86_hyper_kvm.runtime.sev_es_hcall_prepare callback: copy the RBX, RCX,
 * RDX and RSI values from @regs into @ghcb.
 */
static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
{
        /* RAX and CPL are already in the GHCB */
        ghcb_set_rbx(ghcb, regs->bx);
        ghcb_set_rcx(ghcb, regs->cx);
        ghcb_set_rdx(ghcb, regs->dx);
        ghcb_set_rsi(ghcb, regs->si);
}

/*
 * x86_hyper_kvm.runtime.sev_es_hcall_finish callback: unconditionally
 * returns true; neither @ghcb nor @regs is inspected.
 */
static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
{
        /* No checking of the return state needed */
        return true;
}
#endif

/*
 * Hypervisor descriptor for KVM: detection callback, init-time hooks and,
 * with CONFIG_AMD_MEM_ENCRYPT, the SEV-ES hypercall hooks.
 */
const __initconst struct hypervisor_x86 x86_hyper_kvm = {
        .name                                = "KVM",
        .detect                                = kvm_detect,
        .type                                = X86_HYPER_KVM,
        .init.guest_late_init                = kvm_guest_init,
        .init.x2apic_available                = kvm_para_available,
        .init.msi_ext_dest_id                = kvm_msi_ext_dest_id,
        .init.init_platform                = kvm_init_platform,
#if defined(CONFIG_AMD_MEM_ENCRYPT)
        .runtime.sev_es_hcall_prepare        = kvm_sev_es_hcall_prepare,
        .runtime.sev_es_hcall_finish        = kvm_sev_es_hcall_finish,
#endif
};

/*
 * Arch initcall: when the steal clock is in use, enable the
 * paravirt_steal_enabled static key and, if steal_acc is set, the
 * paravirt_steal_rq_enabled key as well. Always returns 0.
 */
static __init int activate_jump_labels(void)
{
        if (!has_steal_clock)
                return 0;

        static_key_slow_inc(&paravirt_steal_enabled);
        if (steal_acc)
                static_key_slow_inc(&paravirt_steal_rq_enabled);

        return 0;
}
arch_initcall(activate_jump_labels);

#ifdef CONFIG_PARAVIRT_SPINLOCKS

/*
 * Kick @cpu, identified to the host by its APIC ID, with a KVM_HC_KICK_CPU
 * hypercall. Used to wake up a halted vcpu.
 */
static void kvm_kick_cpu(int cpu)
{
        const unsigned long kick_flags = 0;
        u32 target_apicid = per_cpu(x86_cpu_to_apicid, cpu);

        kvm_hypercall2(KVM_HC_KICK_CPU, kick_flags, target_apicid);
}

#include <asm/qspinlock.h>

/*
 * PV spinlock wait callback (pv_ops.lock.wait): halt the vCPU while *@ptr
 * still equals @val. Returns immediately when called from NMI context.
 */
static void kvm_wait(u8 *ptr, u8 val)
{
        if (in_nmi())
                return;

        /*
         * halt until it's our turn and kicked. Note that we do safe halt
         * for irq enabled case to avoid hang when lock info is overwritten
         * in irq spinlock slowpath and no spurious interrupt occur to save us.
         */
        if (irqs_disabled()) {
                /* IRQs stay disabled across the halt on this path. */
                if (READ_ONCE(*ptr) == val)
                        halt();
        } else {
                /* Disable IRQs around the re-check of *ptr. */
                local_irq_disable();

                /* safe_halt() will enable IRQ */
                if (READ_ONCE(*ptr) == val)
                        safe_halt();
                else
                        local_irq_enable();
        }
}

/*
 * Set up the pv_ops.lock callbacks to exploit KVM_FEATURE_PV_UNHALT if
 * present.
 *
 * Without host support the function returns early and leaves
 * virt_spin_lock_key untouched. In every other case (PV spinlocks enabled, or
 * disabled because of the realtime hint, a single possible CPU, or the
 * "nopvspin" parameter) virt_spin_lock_key is disabled on the way out.
 */
void __init kvm_spinlock_init(void)
{
        /*
         * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
         * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
         * preferred over native qspinlock when vCPU is preempted.
         */
        if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
                pr_info("PV spinlocks disabled, no host support\n");
                return;
        }

        /*
         * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
         * are available.
         */
        if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
                pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
                goto out;
        }

        if (num_possible_cpus() == 1) {
                pr_info("PV spinlocks disabled, single CPU\n");
                goto out;
        }

        if (nopvspin) {
                pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
                goto out;
        }

        pr_info("PV spinlocks enabled\n");

        /* Install the PV queued-spinlock slowpath, unlock, wait and kick hooks. */
        __pv_init_lock_hash();
        pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
        pv_ops.lock.queued_spin_unlock =
                PV_CALLEE_SAVE(__pv_queued_spin_unlock);
        pv_ops.lock.wait = kvm_wait;
        pv_ops.lock.kick = kvm_kick_cpu;

        /*
         * When PV spinlock is enabled which is preferred over
         * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
         * Just disable it anyway.
         */
out:
        static_branch_disable(&virt_spin_lock_key);
}

#endif        /* CONFIG_PARAVIRT_SPINLOCKS */

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL

/*
 * Cross-call target used by arch_haltpoll_enable(): write 0 to
 * MSR_KVM_POLL_CONTROL on the CPU it runs on. @i is unused.
 */
static void kvm_disable_host_haltpoll(void *i)
{
        wrmsrl(MSR_KVM_POLL_CONTROL, 0);
}

/*
 * Cross-call target used by arch_haltpoll_disable(): write 1 to
 * MSR_KVM_POLL_CONTROL on the CPU it runs on. @i is unused.
 */
static void kvm_enable_host_haltpoll(void *i)
{
        wrmsrl(MSR_KVM_POLL_CONTROL, 1);
}

/*
 * Enable guest halt polling for @cpu by disabling host halt polling on it.
 * If the host lacks KVM_FEATURE_POLL_CONTROL, an error is logged once and
 * nothing else happens.
 */
void arch_haltpoll_enable(unsigned int cpu)
{
        if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
                /* Enable guest halt poll disables host halt poll */
                smp_call_function_single(cpu, kvm_disable_host_haltpoll,
                                         NULL, 1);
                return;
        }

        pr_err_once("host does not support poll control\n");
        pr_err_once("host upgrade recommended\n");
}
EXPORT_SYMBOL_GPL(arch_haltpoll_enable);

/*
 * Disable guest halt polling for @cpu by re-enabling host halt polling on
 * it. Does nothing when the host lacks KVM_FEATURE_POLL_CONTROL.
 */
void arch_haltpoll_disable(unsigned int cpu)
{
        /* Disable guest halt poll enables host halt poll */
        if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
                smp_call_function_single(cpu, kvm_enable_host_haltpoll,
                                         NULL, 1);
}
EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
#endif














































































































































































































































































































































   29 






   28 


























   30 




   30 










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/audit.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/slab.h>

/**
 * tomoyo_print_bprm - Print "struct linux_binprm" for auditing.
 *
 * @bprm: Pointer to "struct linux_binprm".
 * @dump: Pointer to "struct tomoyo_page_dump".
 *
 * Returns the contents of @bprm on success, NULL otherwise.
 *
 * This function uses kzalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
static char *tomoyo_print_bprm(struct linux_binprm *bprm,
                               struct tomoyo_page_dump *dump)
{
        /* Output buffer size; the last 32 bytes are kept free (see below). */
        static const int tomoyo_buffer_len = 4096 * 2;
        char *buffer = kzalloc(tomoyo_buffer_len, GFP_NOFS);
        char *cp;               /* Write cursor into buffer. */
        char *last_start;       /* Where the string currently being emitted began. */
        int len;
        unsigned long pos = bprm->p;
        int offset = pos % PAGE_SIZE;   /* Byte offset inside the current page. */
        int argv_count = bprm->argc;    /* argv strings still to be consumed. */
        int envp_count = bprm->envc;    /* envp strings still to be consumed. */
        bool truncated = false;

        if (!buffer)
                return NULL;
        len = snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ ");
        cp = buffer + len;
        /* With no arguments, close argv[] at once and open envp[]. */
        if (!argv_count) {
                memmove(cp, "} envp[]={ ", 11);
                cp += 11;
        }
        last_start = cp;
        while (argv_count || envp_count) {
                /* On a page dump failure fall back to a placeholder line. */
                if (!tomoyo_dump_page(bprm, pos, dump))
                        goto out;
                /* Advance pos to the start of the next page. */
                pos += PAGE_SIZE - offset;
                /* Read. */
                while (offset < PAGE_SIZE) {
                        const char *kaddr = dump->data;
                        const unsigned char c = kaddr[offset++];

                        /* Open the quote at the start of each new string. */
                        if (cp == last_start)
                                *cp++ = '"';
                        if (cp >= buffer + tomoyo_buffer_len - 32) {
                                /* Reserve some room for "..." string. */
                                truncated = true;
                        } else if (c == '\\') {
                                /* A backslash is written doubled. */
                                *cp++ = '\\';
                                *cp++ = '\\';
                        } else if (c > ' ' && c < 127) {
                                *cp++ = c;
                        } else if (!c) {
                                /* End of string: close the quote, add a separator. */
                                *cp++ = '"';
                                *cp++ = ' ';
                                last_start = cp;
                        } else {
                                /* Any other byte is written as a 3-digit octal escape. */
                                *cp++ = '\\';
                                *cp++ = (c >> 6) + '0';
                                *cp++ = ((c >> 3) & 7) + '0';
                                *cp++ = (c & 7) + '0';
                        }
                        /* Only a NUL byte completes a string and updates the counters. */
                        if (c)
                                continue;
                        if (argv_count) {
                                if (--argv_count == 0) {
                                        /*
                                         * If output was cut short, rewind to
                                         * the string that did not fit and
                                         * replace it with "... ".
                                         */
                                        if (truncated) {
                                                cp = last_start;
                                                memmove(cp, "... ", 4);
                                                cp += 4;
                                        }
                                        memmove(cp, "} envp[]={ ", 11);
                                        cp += 11;
                                        last_start = cp;
                                        truncated = false;
                                }
                        } else if (envp_count) {
                                if (--envp_count == 0) {
                                        /* Same truncation marker for envp[]. */
                                        if (truncated) {
                                                cp = last_start;
                                                memmove(cp, "... ", 4);
                                                cp += 4;
                                        }
                                }
                        }
                        if (!argv_count && !envp_count)
                                break;
                }
                /* Subsequent pages are read from their first byte. */
                offset = 0;
        }
        *cp++ = '}';
        *cp = '\0';
        return buffer;
out:
        snprintf(buffer, tomoyo_buffer_len - 1,
                 "argv[]={ ... } envp[]= { ... }");
        return buffer;
}

/**
 * tomoyo_filetype - Get string representation of file type.
 *
 * @mode: Mode value for stat().
 *
 * Returns file type string.
 */
static inline const char *tomoyo_filetype(const umode_t mode)
{
        const umode_t type = mode & S_IFMT;

        /* A mode with no type bits set is reported like a regular file. */
        switch (type) {
        case 0:
        case S_IFREG:
                return tomoyo_condition_keyword[TOMOYO_TYPE_IS_FILE];
        case S_IFDIR:
                return tomoyo_condition_keyword[TOMOYO_TYPE_IS_DIRECTORY];
        case S_IFLNK:
                return tomoyo_condition_keyword[TOMOYO_TYPE_IS_SYMLINK];
        case S_IFIFO:
                return tomoyo_condition_keyword[TOMOYO_TYPE_IS_FIFO];
        case S_IFSOCK:
                return tomoyo_condition_keyword[TOMOYO_TYPE_IS_SOCKET];
        case S_IFBLK:
                return tomoyo_condition_keyword[TOMOYO_TYPE_IS_BLOCK_DEV];
        case S_IFCHR:
                return tomoyo_condition_keyword[TOMOYO_TYPE_IS_CHAR_DEV];
        default:
                /* This should not happen. */
                return "unknown";
        }
}

/**
 * tomoyo_print_header - Get header line of audit log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Returns string representation.
 *
 * This function uses kmalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
static char *tomoyo_print_header(struct tomoyo_request_info *r)
{
        struct tomoyo_time stamp;
        const pid_t gpid = task_pid_nr(current);
        struct tomoyo_obj_info *obj = r->obj;
        static const int tomoyo_buffer_len = 4096;
        char *buffer = kmalloc(tomoyo_buffer_len, GFP_NOFS);
        int pos;        /* Running total of what snprintf() reported as written. */
        u8 i;

        if (!buffer)
                return NULL;

        tomoyo_convert_time(ktime_get_real_seconds(), &stamp);

        /* Timestamp, profile/mode/result and the current task's credentials. */
        pos = snprintf(buffer, tomoyo_buffer_len - 1,
                       "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s granted=%s (global-pid=%u) task={ pid=%u ppid=%u uid=%u gid=%u euid=%u egid=%u suid=%u sgid=%u fsuid=%u fsgid=%u }",
                       stamp.year, stamp.month, stamp.day, stamp.hour,
                       stamp.min, stamp.sec, r->profile, tomoyo_mode[r->mode],
                       str_yes_no(r->granted), gpid, tomoyo_sys_getpid(),
                       tomoyo_sys_getppid(),
                       from_kuid(&init_user_ns, current_uid()),
                       from_kgid(&init_user_ns, current_gid()),
                       from_kuid(&init_user_ns, current_euid()),
                       from_kgid(&init_user_ns, current_egid()),
                       from_kuid(&init_user_ns, current_suid()),
                       from_kgid(&init_user_ns, current_sgid()),
                       from_kuid(&init_user_ns, current_fsuid()),
                       from_kgid(&init_user_ns, current_fsgid()));
        if (!obj)
                goto no_obj_info;
        /* Fetch the object attributes once per tomoyo_obj_info. */
        if (!obj->validate_done) {
                tomoyo_get_attributes(obj);
                obj->validate_done = true;
        }
        for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) {
                struct tomoyo_mini_stat *stat;
                unsigned int dev;
                umode_t mode;

                if (!obj->stat_valid[i])
                        continue;
                stat = &obj->stat[i];
                dev = stat->dev;
                mode = stat->mode;
                /* Odd slots are printed as "pathN.parent", even slots as "pathN". */
                if (i & 1) {
                        pos += snprintf(buffer + pos,
                                        tomoyo_buffer_len - 1 - pos,
                                        " path%u.parent={ uid=%u gid=%u ino=%lu perm=0%o }",
                                        (i >> 1) + 1,
                                        from_kuid(&init_user_ns, stat->uid),
                                        from_kgid(&init_user_ns, stat->gid),
                                        (unsigned long)stat->ino,
                                        stat->mode & S_IALLUGO);
                        continue;
                }
                pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos,
                                " path%u={ uid=%u gid=%u ino=%lu major=%u minor=%u perm=0%o type=%s",
                                (i >> 1) + 1,
                                from_kuid(&init_user_ns, stat->uid),
                                from_kgid(&init_user_ns, stat->gid),
                                (unsigned long)stat->ino,
                                MAJOR(dev), MINOR(dev),
                                mode & S_IALLUGO, tomoyo_filetype(mode));
                /* Device nodes additionally report the device number from rdev. */
                if (S_ISCHR(mode) || S_ISBLK(mode)) {
                        dev = stat->rdev;
                        pos += snprintf(buffer + pos,
                                        tomoyo_buffer_len - 1 - pos,
                                        " dev_major=%u dev_minor=%u",
                                        MAJOR(dev), MINOR(dev));
                }
                pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos,
                                " }");
        }
no_obj_info:
        /*
         * pos reaching the size limit means the output did not fit; the
         * buffer is then discarded and NULL is returned.
         */
        if (pos < tomoyo_buffer_len - 1)
                return buffer;
        kfree(buffer);
        return NULL;
}

/**
 * tomoyo_init_log - Allocate buffer for audit logs.
 *
 * @r:    Pointer to "struct tomoyo_request_info".
 * @len:  Buffer size needed for @fmt and @args.
 * @fmt:  The printf()'s format string.
 * @args: va_list structure for @fmt.
 *
 * Returns pointer to allocated memory.
 *
 * This function uses kzalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
                      va_list args)
{
        char *buf = NULL;
        char *bprm_info = NULL;
        const char *header = NULL;
        char *realpath = NULL;
        const char *symlink = NULL;
        int pos;
        const char *domainname = r->domain->domainname->name;

        header = tomoyo_print_header(r);
        if (!header)
                return NULL;
        /* +10 is for '\n' etc. and '\0'. */
        len += strlen(domainname) + strlen(header) + 10;
        /* An exec request adds realpath/argv/envp; otherwise maybe a symlink target. */
        if (r->ee) {
                struct file *file = r->ee->bprm->file;

                realpath = tomoyo_realpath_from_path(&file->f_path);
                bprm_info = tomoyo_print_bprm(r->ee->bprm, &r->ee->dump);
                /* Either helper failing aborts the log; buf is still NULL here. */
                if (!realpath || !bprm_info)
                        goto out;
                /* +80 is for " exec={ realpath=\"%s\" argc=%d envc=%d %s }" */
                len += strlen(realpath) + 80 + strlen(bprm_info);
        } else if (r->obj && r->obj->symlink_target) {
                symlink = r->obj->symlink_target->name;
                /* +18 is for " symlink.target=\"%s\"" */
                len += 18 + strlen(symlink);
        }
        /* Round len up with kmalloc_size_roundup() before allocating. */
        len = kmalloc_size_roundup(len);
        buf = kzalloc(len, GFP_NOFS);
        if (!buf)
                goto out;
        /* Keep the final byte of the zeroed buffer as the terminator. */
        len--;
        pos = snprintf(buf, len, "%s", header);
        if (realpath) {
                struct linux_binprm *bprm = r->ee->bprm;

                pos += snprintf(buf + pos, len - pos,
                                " exec={ realpath=\"%s\" argc=%d envc=%d %s }",
                                realpath, bprm->argc, bprm->envc, bprm_info);
        } else if (symlink)
                pos += snprintf(buf + pos, len - pos, " symlink.target=\"%s\"",
                                symlink);
        pos += snprintf(buf + pos, len - pos, "\n%s\n", domainname);
        vsnprintf(buf + pos, len - pos, fmt, args);
out:
        /* The temporaries are freed on every path; only buf goes to the caller. */
        kfree(realpath);
        kfree(bprm_info);
        kfree(header);
        return buf;
}

/* Wait queue for /sys/kernel/security/tomoyo/audit. */
static DECLARE_WAIT_QUEUE_HEAD(tomoyo_log_wait);

/* Structure for one queued audit log entry. */
struct tomoyo_log {
        struct list_head list;  /* Link in the tomoyo_log list. */
        char *log;              /* Log text built by tomoyo_init_log(). */
        int size;               /* Bytes charged against the audit memory quota. */
};

/* The list for "struct tomoyo_log". */
static LIST_HEAD(tomoyo_log);

/* Lock for "struct list_head tomoyo_log". */
static DEFINE_SPINLOCK(tomoyo_log_lock);

/* Length of "struct list_head tomoyo_log". */
static unsigned int tomoyo_log_count;

/**
 * tomoyo_get_audit - Get audit mode.
 *
 * @ns:          Pointer to "struct tomoyo_policy_namespace".
 * @profile:     Profile number.
 * @index:       Index number of functionality.
 * @matched_acl: Pointer to "struct tomoyo_acl_info".
 * @is_granted:  True if granted log, false otherwise.
 *
 * Returns true if this request should be audited, false otherwise.
 */
static bool tomoyo_get_audit(const struct tomoyo_policy_namespace *ns,
                             const u8 profile, const u8 index,
                             const struct tomoyo_acl_info *matched_acl,
                             const bool is_granted)
{
        const u8 category = tomoyo_index2category[index] +
                TOMOYO_MAX_MAC_INDEX;
        struct tomoyo_profile *prof;
        u8 mode;

        /* Nothing is audited before the policy has been loaded. */
        if (!tomoyo_policy_loaded)
                return false;

        /* Stop once the profile's limit on queued entries is reached. */
        prof = tomoyo_profile(ns, profile);
        if (tomoyo_log_count >= prof->pref[TOMOYO_PREF_MAX_AUDIT_LOG])
                return false;

        /* An explicit grant_log setting on the matched ACL's condition wins. */
        if (is_granted && matched_acl && matched_acl->cond) {
                if (matched_acl->cond->grant_log != TOMOYO_GRANTLOG_AUTO)
                        return matched_acl->cond->grant_log ==
                                TOMOYO_GRANTLOG_YES;
        }

        /* Resolve the mode: per-index, then per-category, then profile default. */
        mode = prof->config[index];
        if (mode == TOMOYO_CONFIG_USE_DEFAULT)
                mode = prof->config[category];
        if (mode == TOMOYO_CONFIG_USE_DEFAULT)
                mode = prof->default_config;

        return mode & (is_granted ? TOMOYO_CONFIG_WANT_GRANT_LOG :
                                    TOMOYO_CONFIG_WANT_REJECT_LOG);
}

/**
 * tomoyo_write_log2 - Write an audit log.
 *
 * @r:    Pointer to "struct tomoyo_request_info".
 * @len:  Buffer size needed for @fmt and @args.
 * @fmt:  The printf()'s format string.
 * @args: va_list structure for @fmt.
 *
 * Builds the log text, wraps it in a "struct tomoyo_log" and appends it to
 * the audit log list unless auditing is not wanted for @r or the audit
 * memory quota would be reached. Allocation failures silently drop the log.
 *
 * Returns nothing.
 */
void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt,
                       va_list args)
{
        struct tomoyo_log *entry;
        char *text;
        bool queued;

        if (!tomoyo_get_audit(r->domain->ns, r->profile, r->type,
                              r->matched_acl, r->granted))
                return;

        text = tomoyo_init_log(r, len, fmt, args);
        if (!text)
                return;

        entry = kzalloc(sizeof(*entry), GFP_NOFS);
        if (!entry) {
                kfree(text);
                return;
        }
        entry->log = text;

        /*
         * entry->size is what gets charged against the audit memory quota:
         * the rounded-up allocation size of the text (measured with
         * strlen(), not the requested buffer length) plus that of the
         * entry itself.
         */
        len = kmalloc_size_roundup(strlen(text) + 1);
        entry->size = len + kmalloc_size_roundup(sizeof(*entry));

        spin_lock(&tomoyo_log_lock);
        /* A quota of 0 means "no limit"; otherwise stay strictly below it. */
        queued = !tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT] ||
                 tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] + entry->size <
                 tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT];
        if (queued) {
                tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] += entry->size;
                list_add_tail(&entry->list, &tomoyo_log);
                tomoyo_log_count++;
        }
        spin_unlock(&tomoyo_log_lock);

        if (!queued) {
                kfree(text);
                kfree(entry);
                return;
        }

        /* Let readers sleeping in poll() know a log is available. */
        wake_up(&tomoyo_log_wait);
}

/**
 * tomoyo_write_log - Write an audit log.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @fmt: The printf()'s format string, followed by parameters.
 *
 * Measures the formatted length first, then hands the format and a fresh
 * va_list to tomoyo_write_log2().
 *
 * Returns nothing.
 */
void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...)
{
        va_list ap;
        int needed;

        /* Dry run: vsnprintf() with a zero-sized buffer only reports length. */
        va_start(ap, fmt);
        needed = vsnprintf(NULL, 0, fmt, ap);
        va_end(ap);

        /* The first va_list was consumed above, so start over for real. */
        va_start(ap, fmt);
        tomoyo_write_log2(r, needed + 1, fmt, ap);
        va_end(ap);
}

/**
 * tomoyo_read_log - Read an audit log.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Does nothing while @head still has pending output (head->r.w_pos != 0).
 * Otherwise frees the previous read buffer, detaches the oldest entry from
 * the audit log list and makes its text the new read buffer.
 *
 * Returns nothing.
 */
void tomoyo_read_log(struct tomoyo_io_buffer *head)
{
        struct tomoyo_log *entry = NULL;

        if (head->r.w_pos)
                return;

        kfree(head->read_buf);
        head->read_buf = NULL;

        spin_lock(&tomoyo_log_lock);
        if (!list_empty(&tomoyo_log)) {
                entry = list_entry(tomoyo_log.next, typeof(*entry), list);
                list_del(&entry->list);
                tomoyo_log_count--;
                tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] -= entry->size;
        }
        spin_unlock(&tomoyo_log_lock);

        if (!entry)
                return;

        /* The text now belongs to @head; only the wrapper is freed here. */
        head->read_buf = entry->log;
        head->r.w[head->r.w_pos++] = head->read_buf;
        kfree(entry);
}

/**
 * tomoyo_poll_log - Wait for an audit log.
 *
 * @file: Pointer to "struct file".
 * @wait: Pointer to "poll_table". Maybe NULL.
 *
 * Returns EPOLLIN | EPOLLRDNORM when ready to read an audit log,
 * 0 otherwise.
 */
__poll_t tomoyo_poll_log(struct file *file, poll_table *wait)
{
        if (!tomoyo_log_count) {
                poll_wait(file, &tomoyo_log_wait, wait);
                /* Check again: a log may have been queued in the meantime. */
                if (!tomoyo_log_count)
                        return 0;
        }

        return EPOLLIN | EPOLLRDNORM;
}




































    1 






    1 






    1 

























































































































    1 






























































































































































































































































































































































    1 



    1 















    1 

















































































































    1 


    1 
    1 




    1 
    1 




















































    1 



























    1 




    1 











    1 








    1 
    1 










    1 





    1 



















































































    1 


















    1 
















    1 


    1 




    1 











































































    1 


    1 







    1 





    1 






    1 


















    1 




    1 
    1 





    1 








    1 








    1 

































    1 











    1 









    1 


    1 














    1 





    1 







    1 
    1 


















    1 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 */

#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/kernel.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"

/*
 * Attribute names of the index streams, one entry per index type.
 *
 * Indexed as s_index_names[indx->type] by the bitmap helpers in this file
 * to find the ATTR_BITMAP attribute that belongs to an index.
 * name_len is the element count of the name array (ARRAY_SIZE), not bytes.
 */
static const struct INDEX_NAMES {
        const __le16 *name;
        u8 name_len;
} s_index_names[INDEX_MUTEX_TOTAL] = {
        { I30_NAME, ARRAY_SIZE(I30_NAME) }, { SII_NAME, ARRAY_SIZE(SII_NAME) },
        { SDH_NAME, ARRAY_SIZE(SDH_NAME) }, { SO_NAME, ARRAY_SIZE(SO_NAME) },
        { SQ_NAME, ARRAY_SIZE(SQ_NAME) },   { SR_NAME, ARRAY_SIZE(SR_NAME) },
};

/*
 * cmp_fnames - Compare two names in index.
 *
 * @key2 is always an on-disk ATTR_FILE_NAME of @l2 bytes; -1 is returned if
 * @l2 is too short to hold the header or the full name.
 *
 * if l1 != 0
 *   @key1 is an on-disk (little endian) ATTR_FILE_NAME as well.
 * else
 *   @key1 is a cpu_str.
 *
 * @data is the "struct ntfs_sb_info" providing the upcase table and the
 * nocase option.
 */
static int cmp_fnames(const void *key1, size_t l1, const void *key2, size_t l2,
                      const void *data)
{
        const struct ntfs_sb_info *sbi = data;
        const struct ATTR_FILE_NAME *f2 = key2;
        u16 fsize2;
        bool both_case;

        /* The fixed part must be present before name_len can be trusted. */
        if (l2 <= offsetof(struct ATTR_FILE_NAME, name))
                return -1;

        fsize2 = fname_full_size(f2);
        if (l2 < fsize2)
                return -1;

        /*
         * both_case: names that are equal when compared case insensitive
         * are compared case sensitive as well. Not done for DOS names or
         * when the "nocase" option is set.
         */
        both_case = !(f2->type == FILE_NAME_DOS || sbi->options->nocase);

        if (l1) {
                const struct ATTR_FILE_NAME *f1 = key1;

                return ntfs_cmp_names(f1->name, f1->name_len, f2->name,
                                      f2->name_len, sbi->upcase, both_case);
        }

        return ntfs_cmp_names_cpu(key1, (struct le_str *)&f2->name_len,
                                  sbi->upcase, both_case);
}

/*
 * cmp_uint - $SII of $Secure and $Q of Quota
 *
 * Compares the first 32-bit value of @key1 and @key2.
 * @l1 and @data are not used.
 *
 * NOTE(review): unlike cmp_sdh()/cmp_uints() no le32_to_cpu() is applied,
 * the values are compared exactly as stored - confirm this is intended
 * for big-endian hosts.
 *
 * Return: -1 if @l2 is shorter than a u32 or key1 < key2,
 *         1 if key1 > key2, 0 if equal.
 */
static int cmp_uint(const void *key1, size_t l1, const void *key2, size_t l2,
                    const void *data)
{
        u32 lhs, rhs;

        if (l2 < sizeof(u32))
                return -1;

        lhs = *(const u32 *)key1;
        rhs = *(const u32 *)key2;

        if (lhs == rhs)
                return 0;

        return lhs < rhs ? -1 : 1;
}

/*
 * cmp_sdh - $SDH of $Secure
 *
 * Orders two SECURITY_KEY values by hash. If @data is non-NULL, equal
 * hashes are further ordered by security id. @l1 is not used.
 *
 * Return: -1 if @l2 is too short for a SECURITY_KEY or key1 < key2,
 *         1 if key1 > key2, 0 if equal.
 */
static int cmp_sdh(const void *key1, size_t l1, const void *key2, size_t l2,
                   const void *data)
{
        const struct SECURITY_KEY *lhs = key1;
        const struct SECURITY_KEY *rhs = key2;
        u32 a, b;

        if (l2 < sizeof(struct SECURITY_KEY))
                return -1;

        /* Primary key: the hash value itself. */
        a = le32_to_cpu(lhs->hash);
        b = le32_to_cpu(rhs->hash);
        if (a != b)
                return a < b ? -1 : 1;

        if (!data)
                return 0;

        /* Secondary key: the security id. */
        a = le32_to_cpu(lhs->sec_id);
        b = le32_to_cpu(rhs->sec_id);
        if (a != b)
                return a < b ? -1 : 1;

        return 0;
}

/*
 * cmp_uints - $O of ObjId and "$R" for Reparse.
 *
 * Compares two keys as sequences of little endian 32-bit values. Only the
 * common whole words are compared; if they are all equal the longer key
 * is the greater one.
 *
 * (size_t)@data == 1 selects the mode used by
 * ni_delete_all -> ntfs_remove_reparse (delete all with this reference):
 * both keys are REPARSE_KEYs and the leading ReparseTag word is skipped.
 */
static int cmp_uints(const void *key1, size_t l1, const void *key2, size_t l2,
                     const void *data)
{
        const __le32 *lhs = key1;
        const __le32 *rhs = key2;
        size_t words, i;

        if ((size_t)data == 1) {
                /* Each key needs something after REPARSE_KEY.ReparseTag. */
                if (l2 <= sizeof(int))
                        return -1;
                if (l1 <= sizeof(int))
                        return 1;

                lhs++;
                rhs++;
                l1 -= sizeof(int);
                l2 -= sizeof(int);
        }

        if (l2 < sizeof(int))
                return -1;

        words = min(l1, l2) >> 2;
        for (i = 0; i < words; i++) {
                u32 a = le32_to_cpu(lhs[i]);
                u32 b = le32_to_cpu(rhs[i]);

                if (a != b)
                        return a > b ? 1 : -1;
        }

        if (l1 != l2)
                return l1 > l2 ? 1 : -1;

        return 0;
}

/*
 * get_cmp_func - Select the key comparison routine for an index root.
 *
 * The choice depends on the indexed attribute type and the collation rule
 * stored in @root.
 *
 * Return: the comparison function, or NULL for an unsupported combination.
 */
static inline NTFS_CMP_FUNC get_cmp_func(const struct INDEX_ROOT *root)
{
        if (root->type == ATTR_NAME) {
                if (root->rule == NTFS_COLLATION_TYPE_FILENAME)
                        return &cmp_fnames;
                return NULL;
        }

        if (root->type != ATTR_ZERO)
                return NULL;

        if (root->rule == NTFS_COLLATION_TYPE_UINT)
                return &cmp_uint;
        if (root->rule == NTFS_COLLATION_TYPE_SECURITY_HASH)
                return &cmp_sdh;
        if (root->rule == NTFS_COLLATION_TYPE_UINTS)
                return &cmp_uints;

        return NULL;
}

/*
 * struct bmp_buf - Window onto a piece of an index bitmap.
 *
 * Filled by bmp_buf_get() and released by bmp_buf_put().
 */
struct bmp_buf {
        struct ATTRIB *b;       /* The ATTR_BITMAP attribute (NULL if not found). */
        struct mft_inode *mi;   /* Set by ni_find_attr(); marked dirty on put. */
        struct buffer_head *bh; /* Locked block for non-resident bitmap, else NULL. */
        ulong *buf;             /* Bitmap data: resident data or bh->b_data. */
        size_t bit;             /* Number of the first bit held in @buf. */
        u32 nbits;              /* Number of bits held in @buf. */
        u64 new_valid;          /* If non-zero: valid_size to store on dirty put. */
};

/*
 * bmp_buf_get - Map the part of the index bitmap that contains @bit.
 *
 * Looks up the ATTR_BITMAP attribute named after the index type and fills
 * @bbuf so that bit @bit can be accessed as (@bit - bbuf->bit) in bbuf->buf.
 *
 * Resident bitmap: bbuf->buf points at the resident data, bbuf->bh is NULL.
 * Non-resident bitmap: the block holding the bit is read with
 * ntfs_bread_run(), locked, and everything at or beyond valid_size is
 * zeroed in the buffer.
 *
 * A successfully filled @bbuf is released with bmp_buf_put().
 *
 * Return: 0 on success, -EINVAL if the attribute is missing or @bit lies
 * outside the bitmap data, -EIO or the error from ntfs_bread_run() if the
 * block cannot be read.
 */
static int bmp_buf_get(struct ntfs_index *indx, struct ntfs_inode *ni,
                       size_t bit, struct bmp_buf *bbuf)
{
        struct ATTRIB *b;
        /* off: byte offset inside the bitmap of the byte holding @bit. */
        size_t data_size, valid_size, vbo, off = bit >> 3;
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        CLST vcn = off >> sbi->cluster_bits;
        struct ATTR_LIST_ENTRY *le = NULL;
        struct buffer_head *bh;
        struct super_block *sb;
        u32 blocksize;
        const struct INDEX_NAMES *in = &s_index_names[indx->type];

        /* bh == NULL tells bmp_buf_put() there is no buffer to release. */
        bbuf->bh = NULL;

        b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len,
                         &vcn, &bbuf->mi);
        bbuf->b = b;
        if (!b)
                return -EINVAL;

        if (!b->non_res) {
                data_size = le32_to_cpu(b->res.data_size);

                if (off >= data_size)
                        return -EINVAL;

                /* The whole resident bitmap is exposed, starting at bit 0. */
                bbuf->buf = (ulong *)resident_data(b);
                bbuf->bit = 0;
                bbuf->nbits = data_size * 8;

                return 0;
        }

        data_size = le64_to_cpu(b->nres.data_size);
        if (WARN_ON(off >= data_size)) {
                /* Looks like filesystem error. */
                return -EINVAL;
        }

        valid_size = le64_to_cpu(b->nres.valid_size);

        bh = ntfs_bread_run(sbi, &indx->bitmap_run, off);
        if (!bh)
                return -EIO;

        if (IS_ERR(bh))
                return PTR_ERR(bh);

        bbuf->bh = bh;

        if (buffer_locked(bh))
                __wait_on_buffer(bh);

        /* Stays locked until bmp_buf_put(). */
        lock_buffer(bh);

        sb = sbi->sb;
        blocksize = sb->s_blocksize;

        /* vbo: byte offset of the start of the block that contains @off. */
        vbo = off & ~(size_t)sbi->block_mask;

        /*
         * new_valid: valid_size to record if this block gets modified.
         * 0 when the block already lies entirely below valid_size,
         * otherwise the end of the block clamped to data_size.
         */
        bbuf->new_valid = vbo + blocksize;
        if (bbuf->new_valid <= valid_size)
                bbuf->new_valid = 0;
        else if (bbuf->new_valid > data_size)
                bbuf->new_valid = data_size;

        /* Bytes at or beyond valid_size are presented as zeros. */
        if (vbo >= valid_size) {
                memset(bh->b_data, 0, blocksize);
        } else if (vbo + blocksize > valid_size) {
                u32 voff = valid_size & sbi->block_mask;

                memset(bh->b_data + voff, 0, blocksize - voff);
        }

        bbuf->buf = (ulong *)bh->b_data;
        bbuf->bit = 8 * (off & ~(size_t)sbi->block_mask);
        bbuf->nbits = 8 * blocksize;

        return 0;
}

/*
 * bmp_buf_put - Release a bitmap window obtained with bmp_buf_get().
 *
 * @dirty: true if the bitmap data was modified.
 *
 * Without a buffer head, a modified resident bitmap only marks the owning
 * mft record dirty. With a buffer head, a modified block is marked
 * uptodate and dirty (storing the pending valid_size first, if any), and
 * the buffer is always unlocked and its reference dropped.
 */
static void bmp_buf_put(struct bmp_buf *bbuf, bool dirty)
{
        struct ATTRIB *attr = bbuf->b;
        struct buffer_head *bh = bbuf->bh;

        if (!bh) {
                if (dirty && attr && !attr->non_res)
                        bbuf->mi->dirty = true;
                return;
        }

        if (dirty) {
                if (bbuf->new_valid) {
                        attr->nres.valid_size = cpu_to_le64(bbuf->new_valid);
                        bbuf->mi->dirty = true;
                }

                set_buffer_uptodate(bh);
                mark_buffer_dirty(bh);
        }

        unlock_buffer(bh);
        put_bh(bh);
}

/*
 * indx_mark_used - Mark the bit @bit as used.
 */
static int indx_mark_used(struct ntfs_index *indx, struct ntfs_inode *ni,
                          size_t bit)
{
        int err;
        struct bmp_buf bbuf;

        err = bmp_buf_get(indx, ni, bit, &bbuf);
        if (err)
                return err;

        __set_bit_le(bit - bbuf.bit, bbuf.buf);

        bmp_buf_put(&bbuf, true);

        return 0;
}

/*
 * indx_mark_free - Mark the bit @bit as free.
 */
static int indx_mark_free(struct ntfs_index *indx, struct ntfs_inode *ni,
                          size_t bit)
{
        int err;
        struct bmp_buf bbuf;

        err = bmp_buf_get(indx, ni, bit, &bbuf);
        if (err)
                return err;

        __clear_bit_le(bit - bbuf.bit, bbuf.buf);

        bmp_buf_put(&bbuf, true);

        return 0;
}

/*
 * scan_nres_bitmap - Scan a non-resident index bitmap block by block.
 *
 * Starting at bit @from, each block of @bitmap is read and handed to @fn
 * together with the first bit to look at and the number of valid bits in
 * the block. @fn returns true and stores the block-relative position in
 * *@ret when it finds what it is looking for.
 *
 * On return *@ret is the absolute bit number of the match, or MINUS_ONE_T
 * if @from is beyond the bitmap or nothing was found.
 *
 * If ntfs_readdir calls this function (indx_used_bit -> scan_nres_bitmap),
 * inode is shared locked and no ni_lock.
 * Use rw_semaphore for read/write access to bitmap_run.
 *
 * Return: 0 on success (match or not), -EIO if a block cannot be read,
 * -EINVAL if a run is still missing after loading, or the error from
 * attr_load_runs_vcn().
 */
static int scan_nres_bitmap(struct ntfs_inode *ni, struct ATTRIB *bitmap,
                            struct ntfs_index *indx, size_t from,
                            bool (*fn)(const ulong *buf, u32 bit, u32 bits,
                                       size_t *ret),
                            size_t *ret)
{
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        struct super_block *sb = sbi->sb;
        struct runs_tree *run = &indx->bitmap_run;
        struct rw_semaphore *lock = &indx->run_lock;
        u32 nbits = sb->s_blocksize * 8;
        u32 blocksize = sb->s_blocksize;
        u64 valid_size = le64_to_cpu(bitmap->nres.valid_size);
        u64 data_size = le64_to_cpu(bitmap->nres.data_size);
        /* eblock: number of blocks covering data_size. */
        sector_t eblock = bytes_to_block(sb, data_size);
        /* vbo: byte offset of the byte that holds bit @from. */
        size_t vbo = from >> 3;
        /* blk: block index inside the cluster; vblock: block index in bitmap. */
        sector_t blk = (vbo & sbi->cluster_mask) >> sb->s_blocksize_bits;
        sector_t vblock = vbo >> sb->s_blocksize_bits;
        sector_t blen, block;
        CLST lcn, clen, vcn, vcn_next;
        size_t idx;
        struct buffer_head *bh;
        bool ok;

        *ret = MINUS_ONE_T;

        if (vblock >= eblock)
                return 0;

        /* From here on @from is relative to the start of its block. */
        from &= nbits - 1;
        vcn = vbo >> sbi->cluster_bits;

        down_read(lock);
        ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx);
        up_read(lock);

next_run:
        if (!ok) {
                /* Run for @vcn is not in memory yet: load it, then retry. */
                int err;
                const struct INDEX_NAMES *name = &s_index_names[indx->type];

                down_write(lock);
                err = attr_load_runs_vcn(ni, ATTR_BITMAP, name->name,
                                         name->name_len, run, vcn);
                up_write(lock);
                if (err)
                        return err;
                down_read(lock);
                ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx);
                up_read(lock);
                if (!ok)
                        return -EINVAL;
        }

        /* Length and start of the current run, in blocks. */
        blen = (sector_t)clen * sbi->blocks_per_cluster;
        block = (sector_t)lcn * sbi->blocks_per_cluster;

        /* Only the first block scanned starts at @from; later ones at bit 0. */
        for (; blk < blen; blk++, from = 0) {
                bh = ntfs_bread(sb, block + blk);
                if (!bh)
                        return -EIO;

                /* Bytes at or beyond valid_size are treated as zeros. */
                vbo = (u64)vblock << sb->s_blocksize_bits;
                if (vbo >= valid_size) {
                        memset(bh->b_data, 0, blocksize);
                } else if (vbo + blocksize > valid_size) {
                        u32 voff = valid_size & sbi->block_mask;

                        memset(bh->b_data + voff, 0, blocksize - voff);
                }

                /* Last block: only the bits below data_size are scanned. */
                if (vbo + blocksize > data_size)
                        nbits = 8 * (data_size - vbo);

                ok = nbits > from ?
                             (*fn)((ulong *)bh->b_data, from, nbits, ret) :
                             false;
                put_bh(bh);

                if (ok) {
                        /* Convert block-relative position to absolute bit. */
                        *ret += 8 * vbo;
                        return 0;
                }

                if (++vblock >= eblock) {
                        *ret = MINUS_ONE_T;
                        return 0;
                }
        }
        /*
         * Run exhausted. Use the next cached run only if it continues right
         * at vcn_next; otherwise go back to next_run with ok == false so
         * that the runs for vcn_next get loaded.
         */
        blk = 0;
        vcn_next = vcn + clen;
        down_read(lock);
        ok = run_get_entry(run, ++idx, &vcn, &lcn, &clen) && vcn == vcn_next;
        if (!ok)
                vcn = vcn_next;
        up_read(lock);
        goto next_run;
}

/*
 * scan_for_free - scan_nres_bitmap() callback: find the first zero bit.
 *
 * Searches bits [@bit, @bits) of @buf. On success stores the position in
 * *@ret and returns true; otherwise returns false and leaves *@ret alone.
 */
static bool scan_for_free(const ulong *buf, u32 bit, u32 bits, size_t *ret)
{
        const size_t found = find_next_zero_bit_le(buf, bits, bit);

        if (found < bits) {
                *ret = found;
                return true;
        }

        return false;
}

/*
 * indx_find_free - Look for free bit.
 *
 * Finds the index's ATTR_BITMAP attribute, returns it in *@bitmap and
 * searches it from bit 0 for a zero bit.
 *
 * Return: 0 on success, with *@bit set to the free bit number or to
 * MINUS_ONE_T if there are no free bits; -ENOENT if the bitmap attribute
 * does not exist; otherwise the error from scan_nres_bitmap().
 */
static int indx_find_free(struct ntfs_index *indx, struct ntfs_inode *ni,
                          size_t *bit, struct ATTRIB **bitmap)
{
        const struct INDEX_NAMES *names = &s_index_names[indx->type];
        struct ATTR_LIST_ENTRY *le = NULL;
        struct ATTRIB *attr;
        u32 nbits;
        size_t pos;

        attr = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, names->name,
                            names->name_len, NULL, NULL);
        if (!attr)
                return -ENOENT;

        *bitmap = attr;
        *bit = MINUS_ONE_T;

        if (attr->non_res)
                return scan_nres_bitmap(ni, attr, indx, 0, &scan_for_free,
                                        bit);

        /* Resident bitmap: search the in-record data directly. */
        nbits = 8 * le32_to_cpu(attr->res.data_size);
        pos = find_next_zero_bit_le(resident_data(attr), nbits, 0);
        if (pos < nbits)
                *bit = pos;

        return 0;
}

/*
 * scan_for_used - Bitmap scan callback: locate the first set bit.
 *
 * Searches bits [bit, bits) of the little-endian bitmap @buf. On a hit the
 * position is stored in @ret and true is returned; otherwise @ret is left
 * untouched and false is returned.
 */
static bool scan_for_used(const ulong *buf, u32 bit, u32 bits, size_t *ret)
{
        size_t hit = find_next_bit_le(buf, bits, bit);
        bool found = hit < bits;

        if (found)
                *ret = hit;

        return found;
}

/*
 * indx_used_bit - Look for the next used (set) bit in the index bitmap.
 *
 * On entry *bit is the position to start searching from. On success *bit is
 * the position of the used bit that was found, or MINUS_ONE_T if there is
 * none.
 *
 * Return: 0 on success, -ENOENT if the bitmap attribute is not found, or the
 * error reported by scan_nres_bitmap().
 */
int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit)
{
        const struct INDEX_NAMES *in = &s_index_names[indx->type];
        struct ATTR_LIST_ENTRY *le = NULL;
        const size_t start = *bit;
        struct ATTRIB *b;
        size_t pos;
        u32 nbits;

        b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len,
                         NULL, NULL);
        if (!b)
                return -ENOENT;

        *bit = MINUS_ONE_T;

        /* Non-resident bitmap: let the block-wise scanner do the search. */
        if (b->non_res)
                return scan_nres_bitmap(ni, b, indx, start, &scan_for_used,
                                        bit);

        /* Resident bitmap: search the attribute data in place. */
        nbits = le32_to_cpu(b->res.data_size) * 8;
        pos = find_next_bit_le(resident_data(b), nbits, start);
        if (pos < nbits)
                *bit = pos;

        return 0;
}

/*
 * hdr_find_split
 *
 * Find a point at which the index allocation buffer would like to be split.
 * NOTE: This function should never return 'END' entry NULL returns on error.
 */
static const struct NTFS_DE *hdr_find_split(const struct INDEX_HDR *hdr)
{
        size_t o;
        const struct NTFS_DE *e = hdr_first_de(hdr);
        u32 used_2 = le32_to_cpu(hdr->used) >> 1;
        u16 esize;

        if (!e || de_is_last(e))
                return NULL;

        esize = le16_to_cpu(e->size);
        for (o = le32_to_cpu(hdr->de_off) + esize; o < used_2; o += esize) {
                const struct NTFS_DE *p = e;

                e = Add2Ptr(hdr, o);

                /* We must not return END entry. */
                if (de_is_last(e))
                        return p;

                esize = le16_to_cpu(e->size);
        }

        return e;
}

/*
 * hdr_insert_head - Insert some entries at the beginning of the buffer.
 *
 * It is used to insert entries into a newly-created buffer. The existing
 * entries are shifted up by @ins_bytes, @ins is copied in front of them and
 * hdr->used grows accordingly. No capacity check is made here.
 *
 * Return: pointer to the first entry (where @ins now starts), or NULL if the
 * header has no first entry.
 */
static const struct NTFS_DE *hdr_insert_head(struct INDEX_HDR *hdr,
                                             const void *ins, u32 ins_bytes)
{
        struct NTFS_DE *first = hdr_first_de(hdr);
        u32 old_used, tail;

        if (!first)
                return NULL;

        old_used = le32_to_cpu(hdr->used);
        /* Bytes currently occupied by entries, i.e. what has to move. */
        tail = old_used - le32_to_cpu(hdr->de_off);

        memmove(Add2Ptr(first, ins_bytes), first, tail);
        memcpy(first, ins, ins_bytes);
        hdr->used = cpu_to_le32(old_used + ins_bytes);

        return first;
}

/*
 * index_hdr_check
 *
 * @hdr:   index header to validate.
 * @bytes: number of bytes available starting at @hdr.
 *
 * The header is accepted when de_off is 8-byte aligned, total <= @bytes,
 * used <= total, and at least one struct NTFS_DE fits between de_off and
 * used.
 *
 * Return: true if INDEX_HDR is valid.
 */
static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes)
{
        u32 end = le32_to_cpu(hdr->used);
        u32 tot = le32_to_cpu(hdr->total);
        u32 off = le32_to_cpu(hdr->de_off);

        /*
         * 'off + sizeof(struct NTFS_DE) > end' is written without the
         * addition so that a huge 'off' cannot wrap around where size_t is
         * 32 bits wide.
         */
        if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot ||
            end < sizeof(struct NTFS_DE) ||
            off > end - sizeof(struct NTFS_DE)) {
                /* incorrect index buffer. */
                return false;
        }

        return true;
}

/*
 * index_buf_check
 *
 * @ib:    index buffer to validate.
 * @bytes: size of the buffer in bytes.
 * @vbn:   optional; when given, ib->vbn must match *vbn.
 *
 * Return: true if INDEX_BUFFER seems to be valid.
 */
static bool index_buf_check(const struct INDEX_BUFFER *ib, u32 bytes,
                            const CLST *vbn)
{
        const struct NTFS_RECORD_HEADER *rec = &ib->rhdr;
        u16 fix_off = le16_to_cpu(rec->fix_off);
        u16 fix_cnt = le16_to_cpu(rec->fix_num);

        /* The buffer must reach beyond the start of the index header. */
        if (bytes <= offsetof(struct INDEX_BUFFER, ihdr))
                return false;

        if (rec->sign != NTFS_INDX_SIGNATURE)
                return false;

        /* The fixup array may not overlap the fixed part of the buffer. */
        if (fix_off < sizeof(struct INDEX_BUFFER))
                return false;

        /* Check index buffer vbn. */
        if (vbn && *vbn != le64_to_cpu(ib->vbn))
                return false;

        /* The fixup array is 2-byte aligned and lies inside the buffer. */
        if (fix_off % sizeof(short))
                return false;

        if (fix_off + fix_cnt * sizeof(short) >= bytes)
                return false;

        /* Expected count: one fixup per sector plus one. */
        if (fix_cnt != ((bytes >> SECTOR_SHIFT) + 1))
                return false;

        return index_hdr_check(&ib->ihdr,
                               bytes - offsetof(struct INDEX_BUFFER, ihdr));
}

/*
 * fnd_clear - Reset a finder: release every stacked node, top level first,
 * then forget the stack depth and the cached root entry.
 */
void fnd_clear(struct ntfs_fnd *fnd)
{
        int lvl = fnd->level;

        while (--lvl >= 0) {
                struct indx_node *node = fnd->nodes[lvl];

                if (node) {
                        put_indx_node(node);
                        fnd->nodes[lvl] = NULL;
                }
        }

        fnd->root_de = NULL;
        fnd->level = 0;
}

/*
 * fnd_push - Record node @n and its current entry @e as the next level of
 * the finder stack.
 *
 * Return: 0 on success, -EINVAL if the current level is outside the
 * fnd->nodes array.
 */
static int fnd_push(struct ntfs_fnd *fnd, struct indx_node *n,
                    struct NTFS_DE *e)
{
        int slot = fnd->level;

        if (slot >= 0 && slot < ARRAY_SIZE(fnd->nodes)) {
                fnd->de[slot] = e;
                fnd->nodes[slot] = n;
                fnd->level = slot + 1;
                return 0;
        }

        return -EINVAL;
}

/*
 * fnd_pop - Remove the top level from the finder stack.
 *
 * The caller must ensure fnd->level is at least 1; no check is made here.
 *
 * Return: the node that was stored at the removed level.
 */
static struct indx_node *fnd_pop(struct ntfs_fnd *fnd)
{
        int top = fnd->level - 1;
        struct indx_node *node = fnd->nodes[top];

        fnd->level = top;
        fnd->nodes[top] = NULL;

        return node;
}

/*
 * fnd_is_empty - True when the finder holds no current entry: the entry of
 * the top stacked level if there is one, otherwise the cached root entry.
 */
static bool fnd_is_empty(struct ntfs_fnd *fnd)
{
        const struct NTFS_DE *cur;

        if (fnd->level)
                cur = fnd->de[fnd->level - 1];
        else
                cur = fnd->root_de;

        return !cur;
}

/*
 * hdr_find_e - Locate an entry in the index buffer.
 *
 * @indx:    index whose compare callback (indx->cmp) orders the keys.
 * @hdr:     header of the buffer to search.
 * @key:     search key.
 * @key_len: length of @key, passed through to the compare callback.
 * @ctx:     opaque context passed through to the compare callback.
 * @diff:    out: 0 on an exact match, -1 when the returned entry compared
 *           greater than @key. Not written when NULL is returned.
 *
 * Entry offsets are collected into a small on-stack table (8 slots at first,
 * doubled up to ARRAY_SIZE(offs) on every refill). The key is compared with
 * the entry that ended the fill; if the key is greater, the next stretch of
 * entries is loaded, otherwise the table is binary searched.
 *
 * Return: the matching entry, or the last entry seen that compared greater
 * than @key; NULL if there is no compare callback, an entry fails the bounds
 * checks, or @key compares greater than the 'end' entry.
 */
static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx,
                                  const struct INDEX_HDR *hdr, const void *key,
                                  size_t key_len, const void *ctx, int *diff)
{
        struct NTFS_DE *e, *found = NULL;
        NTFS_CMP_FUNC cmp = indx->cmp;
        int min_idx = 0, mid_idx, max_idx = 0;
        int diff2;
        int table_size = 8;
        u32 e_size, e_key_len;
        u32 end = le32_to_cpu(hdr->used);
        u32 off = le32_to_cpu(hdr->de_off);
        u32 total = le32_to_cpu(hdr->total);
        u16 offs[128];

        if (unlikely(!cmp))
                return NULL;

        /*
         * Walk entries from 'off', validating each one, and remember their
         * offsets until the table is full or the 'end' entry is reached.
         */
fill_table:
        if (end > total)
                return NULL;

        if (off + sizeof(struct NTFS_DE) > end)
                return NULL;

        e = Add2Ptr(hdr, off);
        e_size = le16_to_cpu(e->size);

        if (e_size < sizeof(struct NTFS_DE) || off + e_size > end)
                return NULL;

        if (!de_is_last(e)) {
                offs[max_idx] = off;
                off += e_size;

                max_idx++;
                if (max_idx < table_size)
                        goto fill_table;

                /* Table is full: make max_idx the index of its last slot. */
                max_idx--;
        }

        /*
         * 'e' is either the last entry stored in the table or the 'end'
         * entry (which is not stored in the table).
         */
binary_search:
        e_key_len = le16_to_cpu(e->key_size);

        diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx);
        if (diff2 > 0) {
                if (found) {
                        /* Already bisecting: continue in the upper half. */
                        min_idx = mid_idx + 1;
                } else {
                        /* Key is beyond everything loaded so far. */
                        if (de_is_last(e))
                                return NULL;

                        /* Load the next, larger stretch of entries. */
                        max_idx = 0;
                        table_size = min(table_size * 2, (int)ARRAY_SIZE(offs));
                        goto fill_table;
                }
        } else if (diff2 < 0) {
                if (found)
                        max_idx = mid_idx - 1;
                else
                        max_idx--;

                /* Remember the entry that compared greater than the key. */
                found = e;
        } else {
                *diff = 0;
                return e;
        }

        /* Search range exhausted: report the remembered entry. */
        if (min_idx > max_idx) {
                *diff = -1;
                return found;
        }

        mid_idx = (min_idx + max_idx) >> 1;
        e = Add2Ptr(hdr, offs[mid_idx]);

        goto binary_search;
}

/*
 * hdr_insert_de - Insert an index entry into the buffer.
 *
 * @before, when given, should be a pointer previously returned from
 * hdr_find_e; it is checked to lie inside the index. When it is NULL the
 * insertion point is looked up with hdr_find_e using the key of @de.
 *
 * Return: pointer to the inserted entry, or NULL if there is not enough room,
 * @before is outside the index, or no insertion point could be found.
 */
static struct NTFS_DE *hdr_insert_de(const struct ntfs_index *indx,
                                     struct INDEX_HDR *hdr,
                                     const struct NTFS_DE *de,
                                     struct NTFS_DE *before, const void *ctx)
{
        u32 total = le32_to_cpu(hdr->total);
        u32 used = le32_to_cpu(hdr->used);
        u16 de_size = le16_to_cpu(de->size);
        size_t off;
        int diff;

        /* Refuse the insert if the new entry does not fit. */
        if (used + de_size > total)
                return NULL;

        if (!before) {
                /* No insert point is applied. Get it manually. */
                before = hdr_find_e(indx, hdr, de + 1,
                                    le16_to_cpu(de->key_size), ctx, &diff);
                if (!before)
                        return NULL;

                off = PtrOffset(hdr, before);
        } else {
                /* Check that before is inside Index. */
                off = PtrOffset(hdr, before);
                if (off >= used || off < le32_to_cpu(hdr->de_off) ||
                    off + le16_to_cpu(before->size) > total)
                        return NULL;
        }

        /* Shift the tail up by the size of the new entry and copy it in. */
        memmove(Add2Ptr(before, de_size), before, used - off);
        hdr->used = cpu_to_le32(used + de_size);
        memcpy(before, de, de_size);

        return before;
}

/*
 * hdr_delete_de - Remove an entry from the index buffer.
 *
 * @hdr: index header that owns the entry.
 * @re:  entry to remove.
 *
 * The entries that follow @re are moved down over it and hdr->used shrinks by
 * the size of @re.
 *
 * Return: @re (which now addresses the data that followed the removed entry),
 * or NULL if the header fails check_index_header(), @re does not lie fully
 * inside the used area, or less than one struct NTFS_DE would remain after it.
 */
static inline struct NTFS_DE *hdr_delete_de(struct INDEX_HDR *hdr,
                                            struct NTFS_DE *re)
{
        u32 used = le32_to_cpu(hdr->used);
        u16 esize = le16_to_cpu(re->size);
        u32 off = PtrOffset(hdr, re);
        u32 bytes;

        /* check INDEX_HDR valid before using INDEX_HDR */
        if (!check_index_header(hdr, le32_to_cpu(hdr->total)))
                return NULL;

        /*
         * The entry must start and end inside the used area. The tail length
         * is computed in unsigned arithmetic only after this is known: a
         * signed length compared against sizeof() is converted to unsigned,
         * so a negative value would pass the check and reach memmove().
         */
        if (off >= used || esize < sizeof(struct NTFS_DE) ||
            esize > used - off)
                return NULL;

        bytes = used - off - esize;
        if (bytes < sizeof(struct NTFS_DE))
                return NULL;

        hdr->used = cpu_to_le32(used - esize);
        memmove(re, Add2Ptr(re, esize), bytes);

        return re;
}

/*
 * indx_clear - Close the two run trees kept by the index: the allocation run
 * and the bitmap run.
 */
void indx_clear(struct ntfs_index *indx)
{
        run_close(&indx->alloc_run);
        run_close(&indx->bitmap_run);
}

/*
 * indx_init - Set up @indx from a resident index root attribute.
 *
 * The root header and the index record geometry are validated, then the
 * type, the shift values, the run lock and the compare callback of @indx are
 * filled in.
 *
 * Return: 0 on success; -EINVAL (after the volume state is updated with
 * NTFS_DIRTY_DIRTY) if the root is inconsistent or no compare function
 * exists for it.
 */
int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi,
              const struct ATTRIB *attr, enum index_mutex_classed type)
{
        const struct INDEX_ROOT *root = resident_data(attr);
        u32 attr_bytes = le32_to_cpu(attr->res.data_size);
        u32 rec_bytes;

        /* The attribute must hold a valid index header. */
        if (attr_bytes <= offsetof(struct INDEX_ROOT, ihdr))
                goto corrupted;

        if (!index_hdr_check(&root->ihdr,
                             attr_bytes - offsetof(struct INDEX_ROOT, ihdr)))
                goto corrupted;

        /* Check root fields. */
        if (!root->index_block_clst)
                goto corrupted;

        indx->type = type;
        indx->idx2vbn_bits = __ffs(root->index_block_clst);

        rec_bytes = le32_to_cpu(root->index_block_size);
        indx->index_bits = blksize_bits(rec_bytes);

        /* Check index record size. */
        if (rec_bytes >= sbi->cluster_size) {
                /* Index record must be a multiple of cluster size. */
                if (rec_bytes != root->index_block_clst << sbi->cluster_bits)
                        goto corrupted;

                indx->vbn2vbo_bits = sbi->cluster_bits;
        } else {
                /* Index record is smaller than a cluster, use 512 blocks. */
                if (rec_bytes != root->index_block_clst * SECTOR_SIZE)
                        goto corrupted;

                /* Check alignment to a cluster. */
                if ((sbi->cluster_size >> SECTOR_SHIFT) &
                    (root->index_block_clst - 1))
                        goto corrupted;

                indx->vbn2vbo_bits = SECTOR_SHIFT;
        }

        init_rwsem(&indx->run_lock);

        indx->cmp = get_cmp_func(root);
        if (!indx->cmp)
                goto corrupted;

        return 0;

corrupted:
        ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
        return -EINVAL;
}

/*
 * indx_new - Allocate and format an empty in-memory index buffer for @vbn.
 *
 * The new buffer contains only the END entry. When @sub_vbn is given, that
 * entry is flagged as having a subnode and carries *sub_vbn.
 *
 * Return: the new node, or ERR_PTR(-ENOMEM) / ERR_PTR(error from
 * ntfs_get_bh()) on failure.
 */
static struct indx_node *indx_new(struct ntfs_index *indx,
                                  struct ntfs_inode *ni, CLST vbn,
                                  const __le64 *sub_vbn)
{
        const u32 bytes = 1u << indx->index_bits;
        const u64 vbo = (u64)vbn << indx->vbn2vbo_bits;
        struct indx_node *node;
        struct INDEX_BUFFER *ib;
        struct INDEX_HDR *hdr;
        struct NTFS_DE *end;
        u32 de_off, end_size;
        u16 fix_cnt;
        int err;

        node = kzalloc(sizeof(struct indx_node), GFP_NOFS);
        if (!node)
                return ERR_PTR(-ENOMEM);

        ib = kzalloc(bytes, GFP_NOFS);
        if (!ib) {
                err = -ENOMEM;
                goto free_node;
        }

        err = ntfs_get_bh(ni->mi.sbi, &indx->alloc_run, vbo, bytes, &node->nb);
        if (err)
                goto free_ib;

        /* Record header: signature, fixup array location/count, own vbn. */
        ib->rhdr.sign = NTFS_INDX_SIGNATURE;
        ib->rhdr.fix_off = cpu_to_le16(sizeof(struct INDEX_BUFFER));
        fix_cnt = (bytes >> SECTOR_SHIFT) + 1;
        ib->rhdr.fix_num = cpu_to_le16(fix_cnt);
        ib->vbn = cpu_to_le64(vbn);

        /* Index header: entries start past the fixup array, 8-byte aligned. */
        hdr = &ib->ihdr;
        de_off = ALIGN(sizeof(struct INDEX_BUFFER) + fix_cnt * sizeof(short),
                       8);
        hdr->de_off = cpu_to_le32(de_off);
        hdr->total = cpu_to_le32(bytes - offsetof(struct INDEX_BUFFER, ihdr));

        /* The only entry is the END entry, optionally with a sub-node vbn. */
        end = Add2Ptr(hdr, de_off);
        if (sub_vbn) {
                end_size = sizeof(struct NTFS_DE) + sizeof(u64);
                end->flags = NTFS_IE_LAST | NTFS_IE_HAS_SUBNODES;
        } else {
                end_size = sizeof(struct NTFS_DE);
                end->flags = NTFS_IE_LAST;
        }
        end->size = cpu_to_le16(end_size);
        hdr->used = cpu_to_le32(de_off + end_size);

        if (sub_vbn) {
                /* Size is set above, before the vbn is stored in the entry. */
                de_set_vbn_le(end, *sub_vbn);
                hdr->flags = 1;
        }

        node->index = ib;
        return node;

free_ib:
        kfree(ib);
free_node:
        kfree(node);
        return ERR_PTR(err);
}

/*
 * indx_get_root - Find the index root attribute of @ni for this index type.
 *
 * @attr: optional out: the root attribute that was found.
 * @mi:   passed through to ni_find_attr().
 *
 * Return: pointer to the resident INDEX_ROOT, or NULL if the attribute is not
 * found, its resident data is unavailable, or the used size recorded in the
 * index header exceeds the attribute data size.
 */
struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni,
                                 struct ATTRIB **attr, struct mft_inode **mi)
{
        const struct INDEX_NAMES *in = &s_index_names[indx->type];
        struct ATTR_LIST_ENTRY *le = NULL;
        struct INDEX_ROOT *root;
        struct ATTRIB *a;

        a = ni_find_attr(ni, NULL, &le, ATTR_ROOT, in->name, in->name_len, NULL,
                         mi);
        if (!a)
                return NULL;

        if (attr)
                *attr = a;

        root = resident_data_ex(a, sizeof(struct INDEX_ROOT));
        if (!root)
                return NULL;

        /* length check */
        if (offsetof(struct INDEX_ROOT, ihdr) + le32_to_cpu(root->ihdr.used) >
            le32_to_cpu(a->res.data_size))
                return NULL;

        return root;
}

/*
 * indx_write - Write an index node through ntfs_write_bh().
 *
 * The record header of node->index and the node's buffer set (node->nb) are
 * handed to ntfs_write_bh() together with @sync; its result is returned
 * unchanged. @indx is not used here.
 */
static int indx_write(struct ntfs_index *indx, struct ntfs_inode *ni,
                      struct indx_node *node, int sync)
{
        struct INDEX_BUFFER *ib = node->index;

        return ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &node->nb, sync);
}

/*
 * indx_read - Read and validate the index buffer at @vbn.
 *
 * @node: in/out. When *node is NULL a new node is allocated and stored there
 *        on success. When *node is set, that node and its buffer (if it has
 *        one) are reused; its previous buffer set is released with nb_put()
 *        first.
 *
 * If ntfs_readdir calls this function
 * inode is shared locked and no ni_lock.
 * Use rw_semaphore for read/write access to alloc_run.
 *
 * Return: 0 on success, -ENOMEM, -EINVAL when the buffer fails validation, or
 * the error from reading / loading runs.
 */
int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn,
              struct indx_node **node)
{
        int err;
        struct INDEX_BUFFER *ib;
        struct runs_tree *run = &indx->alloc_run;
        struct rw_semaphore *lock = &indx->run_lock;
        u64 vbo = (u64)vbn << indx->vbn2vbo_bits;
        u32 bytes = 1u << indx->index_bits;
        struct indx_node *in = *node;
        const struct INDEX_NAMES *name;

        if (!in) {
                in = kzalloc(sizeof(struct indx_node), GFP_NOFS);
                if (!in)
                        return -ENOMEM;
        } else {
                /* Reusing the caller's node: drop its old buffers. */
                nb_put(&in->nb);
        }

        ib = in->index;
        if (!ib) {
                ib = kmalloc(bytes, GFP_NOFS);
                if (!ib) {
                        err = -ENOMEM;
                        goto out;
                }
        }

        /* First attempt, with whatever runs are already loaded. */
        down_read(lock);
        err = ntfs_read_bh(ni->mi.sbi, run, vbo, &ib->rhdr, bytes, &in->nb);
        up_read(lock);
        if (!err)
                goto ok;

        /* Fixup problem: still validate the buffer; it is rewritten below. */
        if (err == -E_NTFS_FIXUP)
                goto ok;

        if (err != -ENOENT)
                goto out;

        /* -ENOENT: load the runs covering this buffer, then read again. */
        name = &s_index_names[indx->type];
        down_write(lock);
        err = attr_load_runs_range(ni, ATTR_ALLOC, name->name, name->name_len,
                                   run, vbo, vbo + bytes);
        up_write(lock);
        if (err)
                goto out;

        down_read(lock);
        err = ntfs_read_bh(ni->mi.sbi, run, vbo, &ib->rhdr, bytes, &in->nb);
        up_read(lock);
        if (err == -E_NTFS_FIXUP)
                goto ok;

        if (err)
                goto out;

ok:
        if (!index_buf_check(ib, bytes, &vbn)) {
                ntfs_inode_err(&ni->vfs_inode, "directory corrupted");
                ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
                err = -EINVAL;
                goto out;
        }

        if (err == -E_NTFS_FIXUP) {
                /* Write the buffer back; the write result is not checked. */
                ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &in->nb, 0);
                err = 0;
        }

        /* check for index header length */
        if (offsetof(struct INDEX_BUFFER, ihdr) + le32_to_cpu(ib->ihdr.used) >
            bytes) {
                err = -EINVAL;
                goto out;
        }

        /* Success: hand the buffer to the node and the node to the caller. */
        in->index = ib;
        *node = in;

out:
        if (err == -E_NTFS_CORRUPT) {
                ntfs_inode_err(&ni->vfs_inode, "directory corrupted");
                ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
                err = -EINVAL;
        }

        /* Free the buffer only if it was allocated here and not attached. */
        if (ib != in->index)
                kfree(ib);

        /* Free the node only if it was allocated here and not returned. */
        if (*node != in) {
                nb_put(&in->nb);
                kfree(in);
        }

        return err;
}

/*
 * indx_find - Scan NTFS directory for given entry.
 *
 * @root:    index root; looked up with indx_get_root() when NULL.
 * @key:     search key.
 * @key_len: length of @key.
 * @ctx:     opaque context for the compare callback.
 * @diff:    out: comparison result as reported by hdr_find_e().
 * @entry:   out: the entry the search stopped at.
 * @fnd:     finder; holds the path of nodes visited. If its current entry
 *           already matches @key it is returned without a new search.
 *
 * Return: 0 on success, -EINVAL if the root is missing, an entry lookup
 * fails or the tree is deeper than the finder can record, or the error
 * returned by indx_read().
 */
int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni,
              const struct INDEX_ROOT *root, const void *key, size_t key_len,
              const void *ctx, int *diff, struct NTFS_DE **entry,
              struct ntfs_fnd *fnd)
{
        int err;
        struct NTFS_DE *e;
        struct indx_node *node;

        /* NOTE(review): uses &ni->dir rather than @indx - confirm intended. */
        if (!root)
                root = indx_get_root(&ni->dir, ni, NULL, NULL);

        if (!root) {
                /* Should not happen. */
                return -EINVAL;
        }

        /* Check cache. */
        e = fnd->level ? fnd->de[fnd->level - 1] : fnd->root_de;
        if (e && !de_is_last(e) &&
            !(*indx->cmp)(key, key_len, e + 1, le16_to_cpu(e->key_size), ctx)) {
                *entry = e;
                *diff = 0;
                return 0;
        }

        /* Soft finder reset. */
        fnd_clear(fnd);

        /* Lookup entry that is <= to the search value. */
        e = hdr_find_e(indx, &root->ihdr, key, key_len, ctx, diff);
        if (!e)
                return -EINVAL;

        fnd->root_de = e;

        for (;;) {
                node = NULL;
                /* Stop on an exact match or when there is no sub-node. */
                if (*diff >= 0 || !de_has_vcn_ex(e))
                        break;

                /* Read next level. */
                err = indx_read(indx, ni, de_get_vbn(e), &node);
                if (err) {
                        /* io error? */
                        return err;
                }

                /* Lookup entry that is <= to the search value. */
                e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx,
                               diff);
                if (!e) {
                        put_indx_node(node);
                        return -EINVAL;
                }

                /*
                 * The finder stack is bounded. If the node cannot be
                 * recorded, nothing else would release it and the descent
                 * would have no depth limit, so release it and fail.
                 */
                err = fnd_push(fnd, node, e);
                if (err) {
                        put_indx_node(node);
                        return err;
                }
        }

        *entry = e;
        return 0;
}

/*
 * indx_find_sort - Step to the next entry, descending into sub-nodes.
 *
 * @entry: in/out. NULL on input starts the enumeration at the first root
 *         entry; otherwise the position stored in @fnd is advanced. On
 *         return it is the next entry carrying data, or NULL when the
 *         enumeration is finished.
 * @fnd:   finder holding the current path (root entry plus stacked nodes).
 *
 * Return: 0 on success (including end of enumeration), -EINVAL on a
 * malformed entry or after 1000 iterations, or the error from indx_read().
 */
int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni,
                   const struct INDEX_ROOT *root, struct NTFS_DE **entry,
                   struct ntfs_fnd *fnd)
{
        int err;
        struct indx_node *n = NULL;
        struct NTFS_DE *e;
        size_t iter = 0;
        int level = fnd->level;

        if (!*entry) {
                /* Start find. */
                e = hdr_first_de(&root->ihdr);
                if (!e)
                        return 0;
                fnd_clear(fnd);
                fnd->root_de = e;
        } else if (!level) {
                /* Currently in the root: advance to the next root entry. */
                if (de_is_last(fnd->root_de)) {
                        *entry = NULL;
                        return 0;
                }

                e = hdr_next_de(&root->ihdr, fnd->root_de);
                if (!e)
                        return -EINVAL;
                fnd->root_de = e;
        } else {
                /* Currently in a stacked node: advance inside that node. */
                n = fnd->nodes[level - 1];
                e = fnd->de[level - 1];

                if (de_is_last(e))
                        goto pop_level;

                e = hdr_next_de(&n->index->ihdr, e);
                if (!e)
                        return -EINVAL;

                fnd->de[level - 1] = e;
        }

        /* Just to avoid tree cycle. */
next_iter:
        if (iter++ >= 1000)
                return -EINVAL;

        /* Descend while the current entry points to a sub-node. */
        while (de_has_vcn_ex(e)) {
                /* An entry with a sub-node must have room for the vbn. */
                if (le16_to_cpu(e->size) <
                    sizeof(struct NTFS_DE) + sizeof(u64)) {
                        if (n) {
                                fnd_pop(fnd);
                                /*
                                 * NOTE(review): only the node struct is
                                 * freed here; indx_find_raw() releases
                                 * popped nodes with put_indx_node() -
                                 * confirm nothing else needs releasing.
                                 */
                                kfree(n);
                        }
                        return -EINVAL;
                }

                /* Read next level. */
                err = indx_read(indx, ni, de_get_vbn(e), &n);
                if (err)
                        return err;

                /* Try next level. */
                e = hdr_first_de(&n->index->ihdr);
                if (!e) {
                        kfree(n);
                        return -EINVAL;
                }

                /* NOTE(review): the result of fnd_push() is not checked. */
                fnd_push(fnd, n, e);
        }

        /* An entry larger than the bare header carries data: return it. */
        if (le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) {
                *entry = e;
                return 0;
        }

pop_level:
        for (;;) {
                if (!de_is_last(e))
                        goto next_iter;

                /* Pop one level. */
                if (n) {
                        fnd_pop(fnd);
                        kfree(n);
                }

                level = fnd->level;

                if (level) {
                        n = fnd->nodes[level - 1];
                        e = fnd->de[level - 1];
                } else if (fnd->root_de) {
                        /* Back in the root; root_de is restored below. */
                        n = NULL;
                        e = fnd->root_de;
                        fnd->root_de = NULL;
                } else {
                        /* Nothing left to pop: enumeration is finished. */
                        *entry = NULL;
                        return 0;
                }

                if (le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) {
                        *entry = e;
                        if (!fnd->root_de)
                                fnd->root_de = e;
                        return 0;
                }
        }
}

/*
 * indx_find_raw - Step to the next entry in storage order (root first, then
 * every used index buffer in bitmap order).
 *
 * @entry: in/out. NULL on input starts (or, with a non-zero *off, resumes)
 *         the enumeration; on return it is the next entry carrying data, or
 *         NULL when there are no more used index buffers.
 * @off:   in/out position cookie. Values below the volume record size refer
 *         to the root; otherwise the value is the byte offset of the index
 *         buffer plus the record size plus the offset of the entry in it.
 * @fnd:   finder holding the current position.
 *
 * Return: 0 on success (including end of enumeration), -EINVAL on an
 * inconsistent state, or the error from indx_read().
 */
int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni,
                  const struct INDEX_ROOT *root, struct NTFS_DE **entry,
                  size_t *off, struct ntfs_fnd *fnd)
{
        int err;
        struct indx_node *n = NULL;
        struct NTFS_DE *e = NULL;
        struct NTFS_DE *e2;
        size_t bit;
        CLST next_used_vbn;
        CLST next_vbn;
        u32 record_size = ni->mi.sbi->record_size;

        /* Use non sorted algorithm. */
        if (!*entry) {
                /* This is the first call. */
                e = hdr_first_de(&root->ihdr);
                if (!e)
                        return 0;
                fnd_clear(fnd);
                fnd->root_de = e;

                /* The first call with setup of initial element. */
                if (*off >= record_size) {
                        /* Translate the cookie back to a buffer vbn. */
                        next_vbn = (((*off - record_size) >> indx->index_bits))
                                   << indx->idx2vbn_bits;
                        /* Jump inside cycle 'for'. */
                        goto next;
                }

                /* Start enumeration from root. */
                *off = 0;
        } else if (!fnd->root_de)
                return -EINVAL;

        for (;;) {
                /* Check if current entry can be used. */
                if (e && le16_to_cpu(e->size) > sizeof(struct NTFS_DE))
                        goto ok;

                if (!fnd->level) {
                        /* Continue to enumerate root. */
                        if (!de_is_last(fnd->root_de)) {
                                e = hdr_next_de(&root->ihdr, fnd->root_de);
                                if (!e)
                                        return -EINVAL;
                                fnd->root_de = e;
                                continue;
                        }

                        /* Start to enumerate indexes from 0. */
                        next_vbn = 0;
                } else {
                        /* Continue to enumerate indexes. */
                        e2 = fnd->de[fnd->level - 1];

                        n = fnd->nodes[fnd->level - 1];

                        if (!de_is_last(e2)) {
                                e = hdr_next_de(&n->index->ihdr, e2);
                                if (!e)
                                        return -EINVAL;
                                fnd->de[fnd->level - 1] = e;
                                continue;
                        }

                        /* Continue with next index. */
                        next_vbn = le64_to_cpu(n->index->vbn) +
                                   root->index_block_clst;
                }

next:
                /* Release current index. */
                if (n) {
                        fnd_pop(fnd);
                        put_indx_node(n);
                        n = NULL;
                }

                /* Skip all free indexes. */
                bit = next_vbn >> indx->idx2vbn_bits;
                err = indx_used_bit(indx, ni, &bit);
                /*
                 * NOTE(review): an error other than -ENOENT is not examined
                 * here; only 'bit' is - confirm this is intended.
                 */
                if (err == -ENOENT || bit == MINUS_ONE_T) {
                        /* No used indexes. */
                        *entry = NULL;
                        return 0;
                }

                next_used_vbn = bit << indx->idx2vbn_bits;

                /* Read buffer into memory. */
                err = indx_read(indx, ni, next_used_vbn, &n);
                if (err)
                        return err;

                /*
                 * The node is pushed before 'e' is checked, so it stays on
                 * the finder stack even when -EINVAL is returned below.
                 * NOTE(review): the result of fnd_push() is not checked.
                 */
                e = hdr_first_de(&n->index->ihdr);
                fnd_push(fnd, n, e);
                if (!e)
                        return -EINVAL;
        }

ok:
        /* Return offset to restore enumerator if necessary. */
        if (!n) {
                /* 'e' points in root, */
                *off = PtrOffset(&root->ihdr, e);
        } else {
                /* 'e' points in index, */
                *off = (le64_to_cpu(n->index->vbn) << indx->vbn2vbo_bits) +
                       record_size + PtrOffset(&n->index->ihdr, e);
        }

        *entry = e;
        return 0;
}

/*
 * indx_create_allocate - Create "Allocation + Bitmap" attributes.
 *
 * Clusters for one index record are allocated, a non-resident allocation
 * attribute and a resident bitmap attribute are inserted, and the new run is
 * copied into indx->alloc_run.
 *
 * @vbn: out: set to 0 on success.
 *
 * Return: 0 on success, or the error of the step that failed; the steps
 * completed before it are undone.
 */
static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
                                CLST *vbn)
{
        int err;
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        struct ATTRIB *bitmap;
        struct ATTRIB *alloc;
        u32 data_size = 1u << indx->index_bits;
        u32 alloc_size = ntfs_up_cluster(sbi, data_size);
        CLST len = alloc_size >> sbi->cluster_bits;
        const struct INDEX_NAMES *in = &s_index_names[indx->type];
        CLST alen;
        struct runs_tree run;

        run_init(&run);

        err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, ALLOCATE_DEF,
                                     &alen, 0, NULL, NULL);
        if (err)
                goto out;

        err = ni_insert_nonresident(ni, ATTR_ALLOC, in->name, in->name_len,
                                    &run, 0, len, 0, &alloc, NULL, NULL);
        if (err)
                goto out1;

        alloc->nres.valid_size = alloc->nres.data_size = cpu_to_le64(data_size);

        err = ni_insert_resident(ni, ntfs3_bitmap_size(1), ATTR_BITMAP,
                                 in->name, in->name_len, &bitmap, NULL, NULL);
        if (err)
                goto out2;

        /* Only for the I30_NAME index: update the VFS inode sizes. */
        if (in->name == I30_NAME) {
                i_size_write(&ni->vfs_inode, data_size);
                inode_set_bytes(&ni->vfs_inode, alloc_size);
        }

        /* The index takes over the contents of the local run tree. */
        memcpy(&indx->alloc_run, &run, sizeof(run));

        *vbn = 0;

        return 0;

out2:
        /* Bitmap insertion failed: drop the allocation attribute again. */
        mi_remove_attr(NULL, &ni->mi, alloc);

out1:
        /* Give the allocated clusters back. */
        run_deallocate(sbi, &run, false);

out:
        return err;
}

/*
 * indx_add_allocate - Add clusters to index.
 *
 * A free bit is looked up in the index bitmap. If there is none, the bitmap
 * is grown and the first bit past its old end is chosen. The allocation
 * attribute is then grown if it does not yet cover the chosen index record.
 *
 * @vbn: out: vbn of the chosen index record.
 *
 * Return: 0 on success, -EINVAL if the allocation attribute is missing, or
 * the error from the bitmap lookup / attr_set_size().
 */
static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
                             CLST *vbn)
{
        int err;
        size_t bit;
        u64 data_size;
        u64 bmp_size, bmp_size_v;
        struct ATTRIB *bmp, *alloc;
        struct mft_inode *mi;
        const struct INDEX_NAMES *in = &s_index_names[indx->type];

        err = indx_find_free(indx, ni, &bit, &bmp);
        if (err)
                goto out1;

        if (bit != MINUS_ONE_T) {
                /* A free bit exists; the bitmap does not have to grow. */
                bmp = NULL;
        } else {
                /* Remember the bitmap sizes; 'out2' restores them. */
                if (bmp->non_res) {
                        bmp_size = le64_to_cpu(bmp->nres.data_size);
                        bmp_size_v = le64_to_cpu(bmp->nres.valid_size);
                } else {
                        bmp_size = bmp_size_v = le32_to_cpu(bmp->res.data_size);
                }

                /* First bit past the current end of the bitmap. */
                bit = bmp_size << 3;
        }

        /* Allocation size needed to hold index record number 'bit'. */
        data_size = (u64)(bit + 1) << indx->index_bits;

        if (bmp) {
                /* Increase bitmap. */
                err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len,
                                    &indx->bitmap_run,
                                    ntfs3_bitmap_size(bit + 1), NULL, true,
                                    NULL);
                if (err)
                        goto out1;
        }

        alloc = ni_find_attr(ni, NULL, NULL, ATTR_ALLOC, in->name, in->name_len,
                             NULL, &mi);
        if (!alloc) {
                err = -EINVAL;
                if (bmp)
                        goto out2;
                goto out1;
        }

        if (data_size <= le64_to_cpu(alloc->nres.data_size)) {
                /* Reuse index. */
                goto out;
        }

        /* Increase allocation. */
        err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len,
                            &indx->alloc_run, data_size, &data_size, true,
                            NULL);
        if (err) {
                if (bmp)
                        goto out2;
                goto out1;
        }

        /* Only for the I30_NAME index: update the VFS inode size. */
        if (in->name == I30_NAME)
                i_size_write(&ni->vfs_inode, data_size);

out:
        *vbn = bit << indx->idx2vbn_bits;

        return 0;

out2:
        /*
         * Ops. No space? Reached only after the bitmap was grown above, so
         * bmp_size and bmp_size_v are set; shrink the bitmap back. The
         * result of this rollback is not checked.
         */
        attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len,
                      &indx->bitmap_run, bmp_size, &bmp_size_v, false, NULL);

out1:
        return err;
}

/*
 * indx_insert_into_root - Attempt to insert an entry into the index root.
 *
 * @indx:    Index being modified.
 * @ni:      Inode that owns the index.
 * @new_de:  Entry to insert.
 * @root_de: Passed through to hdr_insert_de() for the in-root insert
 *           (presumably the position found by a previous search - confirm).
 * @ctx:     Opaque context handed to hdr_insert_de()/indx_insert_entry().
 * @fnd:     Search path; updated to describe where the entry ended up.
 * @undo:    True if we are undoing a previous remove; skips the
 *           max_bytes_per_attr check for the in-root insert.
 *
 * If the entry cannot be placed in the root, all non-end root entries are
 * moved into a newly allocated index buffer and the root is rewritten to
 * hold a single end entry that points at that buffer.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni,
                                 const struct NTFS_DE *new_de,
                                 struct NTFS_DE *root_de, const void *ctx,
                                 struct ntfs_fnd *fnd, bool undo)
{
        int err = 0;
        struct NTFS_DE *e, *e0, *re;
        struct mft_inode *mi;
        struct ATTRIB *attr;
        struct INDEX_HDR *hdr;
        struct indx_node *n;
        CLST new_vbn;
        __le64 *sub_vbn, t_vbn;
        u16 new_de_size;
        u32 hdr_used, hdr_total, asize, to_move;
        u32 root_size, new_root_size;
        struct ntfs_sb_info *sbi;
        int ds_root;
        struct INDEX_ROOT *root, *a_root;

        /* Get the record this root is placed in. */
        root = indx_get_root(indx, ni, &attr, &mi);
        if (!root)
                return -EINVAL;

        /*
         * Try the easy case:
         * hdr_insert_de() will succeed if there is
         * room in the root for the new entry.
         */
        hdr = &root->ihdr;
        sbi = ni->mi.sbi;
        new_de_size = le16_to_cpu(new_de->size);
        hdr_used = le32_to_cpu(hdr->used);
        hdr_total = le32_to_cpu(hdr->total);
        asize = le32_to_cpu(attr->size);
        root_size = le32_to_cpu(attr->res.data_size);

        /* Number of bytes the root attribute must grow to hold new_de. */
        ds_root = new_de_size + hdr_used - hdr_total;

        /* If 'undo' is set then reduce requirements. */
        if ((undo || asize + ds_root < sbi->max_bytes_per_attr) &&
            mi_resize_attr(mi, attr, ds_root)) {
                hdr->total = cpu_to_le32(hdr_total + ds_root);
                e = hdr_insert_de(indx, hdr, new_de, root_de, ctx);
                WARN_ON(!e);
                fnd_clear(fnd);
                fnd->root_de = e;

                return 0;
        }

        /*
         * Make a copy of the root attribute to restore on error.
         * Note: the copy is of the whole attribute ('attr', 'asize' bytes),
         * even though the pointer is typed as struct INDEX_ROOT.
         */
        a_root = kmemdup(attr, asize, GFP_NOFS);
        if (!a_root)
                return -ENOMEM;

        /*
         * Copy all the non-end entries from
         * the index root to the new buffer.
         */
        to_move = 0;
        e0 = hdr_first_de(hdr);

        /* Calculate the size to copy; a NULL entry means a broken header. */
        for (e = e0;; e = hdr_next_de(hdr, e)) {
                if (!e) {
                        err = -EINVAL;
                        goto out_free_root;
                }

                if (de_is_last(e))
                        break;
                to_move += le16_to_cpu(e->size);
        }

        /* 're' holds the entries that will be moved (NULL if there are none). */
        if (!to_move) {
                re = NULL;
        } else {
                re = kmemdup(e0, to_move, GFP_NOFS);
                if (!re) {
                        err = -ENOMEM;
                        goto out_free_root;
                }
        }

        /*
         * 'e' is now the root's end entry. If it carries a sub-node pointer,
         * remember it so it can be handed to indx_new() for the new buffer.
         */
        sub_vbn = NULL;
        if (de_has_vcn(e)) {
                t_vbn = de_get_vbn_le(e);
                sub_vbn = &t_vbn;
        }

        /* The new root holds only one end entry with a sub-node pointer. */
        new_root_size = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE) +
                        sizeof(u64);
        ds_root = new_root_size - root_size;

        if (ds_root > 0 && asize + ds_root > sbi->max_bytes_per_attr) {
                /* Making the root external is not supported. */
                err = -EOPNOTSUPP;
                goto out_free_re;
        }

        /*
         * NOTE(review): the result of mi_resize_attr() is not checked here,
         * unlike the other call sites in this function - confirm it cannot
         * fail at this point.
         */
        if (ds_root)
                mi_resize_attr(mi, attr, ds_root);

        /* Fill first entry (vcn will be set later). */
        e = (struct NTFS_DE *)(root + 1);
        memset(e, 0, sizeof(struct NTFS_DE));
        e->size = cpu_to_le16(sizeof(struct NTFS_DE) + sizeof(u64));
        e->flags = NTFS_IE_HAS_SUBNODES | NTFS_IE_LAST;

        /* Presumably 1 is the "has sub-nodes" header flag - confirm. */
        hdr->flags = 1;
        hdr->used = hdr->total =
                cpu_to_le32(new_root_size - offsetof(struct INDEX_ROOT, ihdr));

        fnd->root_de = hdr_first_de(hdr);
        mi->dirty = true;

        /* Create alloc and bitmap attributes (if they do not exist yet). */
        err = run_is_empty(&indx->alloc_run) ?
                      indx_create_allocate(indx, ni, &new_vbn) :
                      indx_add_allocate(indx, ni, &new_vbn);

        /* Layout of record may be changed, so rescan root. */
        root = indx_get_root(indx, ni, &attr, &mi);
        if (!root) {
                /* Bug? */
                ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
                err = -EINVAL;
                goto out_free_re;
        }

        if (err) {
                /* Allocation failed: restore the root from the saved copy. */
                if (mi_resize_attr(mi, attr, -ds_root)) {
                        memcpy(attr, a_root, asize);
                } else {
                        /* Bug? */
                        ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
                }
                goto out_free_re;
        }

        /* Store the new buffer's vbn right after the root's end entry. */
        e = (struct NTFS_DE *)(root + 1);
        *(__le64 *)(e + 1) = cpu_to_le64(new_vbn);
        mi->dirty = true;

        /* Now we can create/format the new buffer and copy the entries into it. */
        n = indx_new(indx, ni, new_vbn, sub_vbn);
        if (IS_ERR(n)) {
                err = PTR_ERR(n);
                goto out_free_re;
        }

        /* Sample used/total of the fresh buffer before the entries are moved in. */
        hdr = &n->index->ihdr;
        hdr_used = le32_to_cpu(hdr->used);
        hdr_total = le32_to_cpu(hdr->total);

        /* Copy root entries into new buffer. */
        hdr_insert_head(hdr, re, to_move);

        /* Update bitmap attribute. */
        indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits);

        /* Check if we can insert the new entry into the new index buffer. */
        if (hdr_used + new_de_size > hdr_total) {
                /*
                 * This occurs if the MFT record is the same size as or bigger
                 * than the index buffer: all root entries were moved into the
                 * new buffer and there is no space left for the new entry.
                 * In the classic case (1K MFT record, 4K index buffer) this
                 * should not occur.
                 *
                 * Write the buffer out and retry the insert from the top with
                 * a cleared search path.
                 */
                kfree(re);
                indx_write(indx, ni, n, 0);

                put_indx_node(n);
                fnd_clear(fnd);
                err = indx_insert_entry(indx, ni, new_de, ctx, fnd, undo);
                /* 're' and 'n' are already released; only a_root remains. */
                goto out_free_root;
        }

        /*
         * Now root is a parent for the new index buffer.
         * Insert the new entry into the new buffer.
         */
        e = hdr_insert_de(indx, hdr, new_de, NULL, ctx);
        if (!e) {
                err = -EINVAL;
                goto out_put_n;
        }
        fnd_push(fnd, n, e);

        /* Just write the updated index buffer to disk. */
        indx_write(indx, ni, n, 0);

        /* 'n' was pushed onto 'fnd'; do not put it below. */
        n = NULL;

out_put_n:
        put_indx_node(n);
out_free_re:
        kfree(re);
out_free_root:
        kfree(a_root);
        return err;
}

/*
 * indx_insert_into_buffer
 *
 * Attempt to insert an entry into an Index Allocation Buffer.
 * If necessary, it will split the buffer.
 *
 * @indx:   Index being modified.
 * @ni:     Inode that owns the index.
 * @root:   Index root; only forwarded to the recursive call.
 * @new_de: Entry to insert.
 * @ctx:    Opaque context handed to the compare/insert helpers.
 * @level:  Index into fnd->nodes[] of the buffer to insert into.
 * @fnd:    Search path describing the route from the root to the buffer.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int
indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
                        struct INDEX_ROOT *root, const struct NTFS_DE *new_de,
                        const void *ctx, int level, struct ntfs_fnd *fnd)
{
        int err;
        const struct NTFS_DE *sp;
        struct NTFS_DE *e, *de_t, *up_e;
        struct indx_node *n2;
        struct indx_node *n1 = fnd->nodes[level];
        struct INDEX_HDR *hdr1 = &n1->index->ihdr;
        struct INDEX_HDR *hdr2;
        u32 to_copy, used, used1;
        CLST new_vbn;
        __le64 t_vbn, *sub_vbn;
        u16 sp_size;
        void *hdr1_saved = NULL;

        /*
         * Try the easiest case first. The cached position fnd->de[level] is
         * only passed on when 'level' is the deepest level of the path.
         */
        e = fnd->level - 1 == level ? fnd->de[level] : NULL;
        e = hdr_insert_de(indx, hdr1, new_de, e, ctx);
        fnd->de[level] = e;
        if (e) {
                /* Just write the updated index buffer to disk. */
                indx_write(indx, ni, n1, 0);
                return 0;
        }

        /*
         * No space to insert into buffer. Split it.
         * To split we:
         * - Save the split point (because index buffers will be changed)
         * - Allocate a new buffer and copy all entries before sp into it
         * - Remove all entries up to and including sp from the target buffer
         * - Insert the new entry into the left or right buffer (depending on
         *   sp <=> new entry)
         * - Insert sp into the parent buffer (or root)
         * - Make sp a parent for the new buffer
         */
        sp = hdr_find_split(hdr1);
        if (!sp)
                return -EINVAL;

        /* Private copy of sp with room for a sub-node pointer appended. */
        sp_size = le16_to_cpu(sp->size);
        up_e = kmalloc(sp_size + sizeof(u64), GFP_NOFS);
        if (!up_e)
                return -ENOMEM;
        memcpy(up_e, sp, sp_size);

        /* Snapshot of the used part of hdr1, restored if the parent insert fails. */
        used1 = le32_to_cpu(hdr1->used);
        hdr1_saved = kmemdup(hdr1, used1, GFP_NOFS);
        if (!hdr1_saved) {
                err = -ENOMEM;
                goto out;
        }

        if (!hdr1->flags) {
                /*
                 * Header flags are zero (presumably a leaf buffer - confirm):
                 * the promoted entry gains a sub-node pointer slot.
                 */
                up_e->flags |= NTFS_IE_HAS_SUBNODES;
                up_e->size = cpu_to_le16(sp_size + sizeof(u64));
                sub_vbn = NULL;
        } else {
                /* sp already has a sub-node; hand its vbn to the new buffer. */
                t_vbn = de_get_vbn_le(up_e);
                sub_vbn = &t_vbn;
        }

        /* Allocate on disk a new index allocation buffer. */
        err = indx_add_allocate(indx, ni, &new_vbn);
        if (err)
                goto out;

        /* Allocate and format in memory a new index buffer. */
        n2 = indx_new(indx, ni, new_vbn, sub_vbn);
        if (IS_ERR(n2)) {
                err = PTR_ERR(n2);
                goto out;
        }

        hdr2 = &n2->index->ihdr;

        /* Make sp a parent for new buffer. */
        de_set_vbn(up_e, new_vbn);

        /* Copy all the entries that precede sp into the new buffer. */
        de_t = hdr_first_de(hdr1);
        to_copy = PtrOffset(de_t, sp);
        hdr_insert_head(hdr2, de_t, to_copy);

        /* Remove all entries up to and including sp from hdr1. */
        used = used1 - to_copy - sp_size;
        memmove(de_t, Add2Ptr(sp, sp_size), used - le32_to_cpu(hdr1->de_off));
        hdr1->used = cpu_to_le32(used);

        /*
         * Insert new entry into left or right buffer
         * (depending on sp <=> new_de): keys that compare less than the
         * promoted entry go into the new buffer (hdr2), others stay in hdr1.
         * The result of hdr_insert_de() is not checked here.
         */
        hdr_insert_de(indx,
                      (*indx->cmp)(new_de + 1, le16_to_cpu(new_de->key_size),
                                   up_e + 1, le16_to_cpu(up_e->key_size),
                                   ctx) < 0 ?
                              hdr2 :
                              hdr1,
                      new_de, NULL, ctx);

        indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits);

        indx_write(indx, ni, n1, 0);
        indx_write(indx, ni, n2, 0);

        put_indx_node(n2);

        /*
         * We've finished splitting everybody, so we are ready to
         * insert the promoted entry into the parent.
         */
        if (!level) {
                /* Insert in root. */
                err = indx_insert_into_root(indx, ni, up_e, NULL, ctx, fnd, 0);
        } else {
                /*
                 * The target buffer's parent is another index buffer.
                 * TODO: Remove recursion.
                 */
                err = indx_insert_into_buffer(indx, ni, root, up_e, ctx,
                                              level - 1, fnd);
        }

        if (err) {
                /*
                 * Undo critical operations: release the new buffer's bit,
                 * restore hdr1 from the snapshot and write it back.
                 */
                indx_mark_free(indx, ni, new_vbn >> indx->idx2vbn_bits);
                memcpy(hdr1, hdr1_saved, used1);
                indx_write(indx, ni, n1, 0);
        }

out:
        kfree(up_e);
        kfree(hdr1_saved);

        return err;
}

/*
 * indx_insert_entry - Insert new entry into index.
 *
 * @indx:   Index to insert into.
 * @ni:     Inode that owns the index.
 * @new_de: Entry to insert; its key follows the entry header.
 * @ctx:    Opaque context handed to the lookup/insert helpers.
 * @fnd:    Optional search path. When NULL a temporary one is obtained and
 *          released before returning. When it is empty, the insert position
 *          is looked up first.
 * @undo:   True if we are undoing a previous remove.
 *
 * Return: 0 on success, -EEXIST if the lookup finds an equal key,
 * -ENOMEM/-EINVAL or another negative errno on failure.
 */
int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
                      const struct NTFS_DE *new_de, const void *ctx,
                      struct ntfs_fnd *fnd, bool undo)
{
        struct ntfs_fnd *own_fnd = NULL;
        struct INDEX_ROOT *root;
        struct NTFS_DE *hit;
        int cmp_res;
        int err;

        /* Caller supplied no path: work on a private one. */
        if (!fnd) {
                own_fnd = fnd_get();
                if (!own_fnd)
                        return -ENOMEM;
                fnd = own_fnd;
        }

        root = indx_get_root(indx, ni, NULL, NULL);
        if (!root) {
                err = -EINVAL;
                goto put_fnd;
        }

        if (fnd_is_empty(fnd)) {
                /* Locate the spot in the tree where the entry belongs. */
                err = indx_find(indx, ni, root, new_de + 1,
                                le16_to_cpu(new_de->key_size), ctx, &cmp_res,
                                &hit, fnd);
                if (err)
                        goto put_fnd;

                /* Zero difference: the lookup found an equal key. */
                if (!cmp_res) {
                        err = -EEXIST;
                        goto put_fnd;
                }
        }

        if (fnd->level) {
                /* The path ends in an index buffer: insert there. */
                err = indx_insert_into_buffer(indx, ni, root, new_de, ctx,
                                              fnd->level - 1, fnd);
        } else {
                /* The path has no buffers: insert into the root itself. */
                err = indx_insert_into_root(indx, ni, new_de, fnd->root_de, ctx,
                                            fnd, undo);
        }

put_fnd:
        fnd_put(own_fnd);
        return err;
}

/*
 * indx_find_buffer - Locate a buffer from the tree.
 *
 * Searches for the node that contains an entry whose sub-node pointer
 * equals @vbn. The level described by @n (or by @root when @n is NULL) is
 * scanned first; after that every sub-node of that level is read and
 * searched recursively.
 *
 * Return: the node holding the matching entry (this is @n itself, i.e. NULL,
 * when the match is in the root), NULL if nothing matched, or ERR_PTR() on
 * error.
 */
static struct indx_node *indx_find_buffer(struct ntfs_index *indx,
                                          struct ntfs_inode *ni,
                                          const struct INDEX_ROOT *root,
                                          __le64 vbn, struct indx_node *n)
{
        const struct INDEX_HDR *hdr;
        const struct NTFS_DE *e;
        struct indx_node *found;
        int err;

        hdr = n ? &n->index->ihdr : &root->ihdr;

        /* Pass 1: look for the pointer among the entries of this level. */
        e = hdr_first_de(hdr);
        for (;;) {
                if (!e)
                        return ERR_PTR(-EINVAL);

                if (de_has_vcn(e) && de_get_vbn_le(e) == vbn)
                        return n;

                if (de_is_last(e))
                        break;

                e = hdr_next_de(hdr, e);
        }

        /*
         * Pass 2: descend into each sub-node.
         * NOTE(review): 'n' is handed to indx_read() as the output node, so
         * the node this level is being iterated from may be reused - confirm
         * against indx_read().
         */
        for (e = Add2Ptr(hdr, le32_to_cpu(hdr->de_off));;
             e = Add2Ptr(e, le16_to_cpu(e->size))) {
                if (de_has_vcn_ex(e)) {
                        err = indx_read(indx, ni, de_get_vbn(e), &n);
                        if (err)
                                return ERR_PTR(err);

                        found = indx_find_buffer(indx, ni, root, vbn, n);
                        if (found)
                                return found;
                }

                if (de_is_last(e))
                        break;
        }

        return NULL;
}

/*
 * indx_shrink - Deallocate unused tail indexes.
 *
 * @indx: Index whose allocation may be truncated.
 * @ni:   Inode that owns the index.
 * @bit:  First bitmap bit that is expected to be unused.
 *
 * If no bit at or after @bit is set in the index bitmap, the allocation
 * attribute is cut down to @bit index buffers and the bitmap attribute to
 * ntfs3_bitmap_size(@bit) bytes. Nothing is changed when @bit lies beyond
 * the bitmap or a later bit is still in use.
 *
 * Return: 0 on success or when there is nothing to do, -ENOENT if the
 * bitmap attribute is missing, or another negative errno.
 */
static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni,
                       size_t bit)
{
        const struct INDEX_NAMES *in = &s_index_names[indx->type];
        struct ATTR_LIST_ENTRY *le = NULL;
        struct ATTRIB *bmp;
        size_t total_bits;
        u64 alloc_bytes, bmp_bytes;
        int err;

        bmp = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len,
                           NULL, NULL);
        if (!bmp)
                return -ENOENT;

        if (bmp->non_res) {
                /* Stays MINUS_ONE_T unless the scan reports a used bit. */
                size_t first_used = MINUS_ONE_T;

                total_bits = le64_to_cpu(bmp->nres.data_size) * 8;
                if (bit >= total_bits)
                        return 0;

                err = scan_nres_bitmap(ni, bmp, indx, bit, &scan_for_used,
                                       &first_used);
                if (err)
                        return err;

                if (first_used != MINUS_ONE_T)
                        return 0;
        } else {
                const unsigned long *map = resident_data(bmp);

                total_bits = (size_t)le32_to_cpu(bmp->res.data_size) * 8;
                if (bit >= total_bits)
                        return 0;

                /* A set bit in [bit, total_bits) means the tail is in use. */
                if (find_next_bit_le(map, total_bits, bit) < total_bits)
                        return 0;
        }

        /* Truncate the allocation down to 'bit' index buffers. */
        alloc_bytes = (u64)bit << indx->index_bits;
        err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len,
                            &indx->alloc_run, alloc_bytes, &alloc_bytes, false,
                            NULL);
        if (err)
                return err;

        if (in->name == I30_NAME)
                i_size_write(&ni->vfs_inode, alloc_bytes);

        /* Truncate the bitmap too, unless it already has the right size. */
        bmp_bytes = ntfs3_bitmap_size(bit);
        if (bmp_bytes * 8 == total_bits)
                return 0;

        return attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len,
                             &indx->bitmap_run, bmp_bytes, &bmp_bytes, false,
                             NULL);
}

/*
 * indx_free_children - Free the index buffer referenced by an entry,
 * together with all buffers below it.
 *
 * @indx: Index being modified.
 * @ni:   Inode that owns the index.
 * @e:    Entry whose sub-node pointer names the buffer to free.
 * @trim: When true, also try to shrink the allocation and bitmap after the
 *        freed buffer.
 *
 * Errors from the recursive calls and from indx_shrink() are ignored.
 *
 * Return: 0, or the error returned by indx_read() for the top buffer.
 */
static int indx_free_children(struct ntfs_index *indx, struct ntfs_inode *ni,
                              const struct NTFS_DE *e, bool trim)
{
        struct indx_node *node = NULL;
        struct INDEX_HDR *hdr;
        CLST vbn = de_get_vbn(e);
        size_t idx;
        int err;

        err = indx_read(indx, ni, vbn, &node);
        if (err)
                return err;

        hdr = &node->index->ihdr;

        /* Children first: walk every entry and free what hangs below it. */
        if (hdr_has_subnode(hdr)) {
                e = hdr_first_de(hdr);
                while (e) {
                        indx_free_children(indx, ni, e, false);
                        if (de_is_last(e))
                                break;
                        e = hdr_next_de(hdr, e);
                }
        }

        put_indx_node(node);

        /* The children are gone; now mark this buffer itself as free. */
        idx = vbn >> indx->idx2vbn_bits;
        indx_mark_free(indx, ni, idx);

        /*
         * If no index buffer after the freed one is in use, the allocation
         * and the bitmap can be truncated; indx_shrink() consults the bitmap
         * to decide.
         */
        if (trim)
                indx_shrink(indx, ni, idx + 1);

        return 0;
}

/*
 * indx_get_entry_to_replace
 *
 * Find a replacement entry for a deleted entry.
 * Always returns a node entry:
 * NTFS_IE_HAS_SUBNODES is set the flags and the size includes the sub_vcn.
 */
static int indx_get_entry_to_replace(struct ntfs_index *indx,
                                     struct ntfs_inode *ni,
                                     const struct NTFS_DE *de_next,
                                     struct NTFS_DE **de_to_replace,
                                     struct ntfs_fnd *fnd)
{
        int err;
        int level = -1;
        CLST vbn;
        struct NTFS_DE *e, *te, *re;
        struct indx_node *n;
        struct INDEX_BUFFER *ib;

        *de_to_replace = NULL;

        /* Find first leaf entry down from de_next. */
        vbn = de_get_vbn(de_next);
        for (;;) {
                n = NULL;
                err = indx_read(indx, ni, vbn, &n);
                if (err)
                        goto out;

                e = hdr_first_de(&n->index->ihdr);
                fnd_push(fnd, n, e);

                if (!de_is_last(e)) {
                        /*
                         * This buffer is non-empty, so its first entry
                         * could be used as the replacement entry.
                         */
                        level = fnd->level - 1;
                }

                if (!de_has_vcn(e))
                        break;

                /* This buffer is a node. Continue to go down. */
                vbn = de_get_vbn(e);
        }

        if (level == -1)
                goto out;

        n = fnd->nodes[level];
        te = hdr_first_de(&n->index->ihdr);
        /* Copy the candidate entry into the replacement entry buffer. */
        re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS);
        if (!re) {
                err = -ENOMEM;
                goto out;
        }

        *de_to_replace = re;
        memcpy(re, te, le16_to_cpu(te->size));

        if (!de_has_vcn(re)) {
                /*
                 * The replacement entry we found doesn't have a sub_vcn.
                 * increase its size to hold one.
                 */
                le16_add_cpu(&re->size, sizeof(u64));
                re->flags |= NTFS_IE_HAS_SUBNODES;
        } else {
                /*
                 * The replacement entry we found was a node entry, which
                 * means that all its child buffers are empty. Return them
                 * to the free pool.
                 */
                indx_free_children(indx, ni, te, true);
        }

        /*
         * Expunge the replacement entry from its former location,
         * and then write that buffer.
         */
        ib = n->index;
        e = hdr_delete_de(&ib->ihdr, te);

        fnd->de[level] = e;
        indx_write(indx, ni, n, 0);

        if (ib_is_leaf(ib) && ib_is_empty(ib)) {
                /* An empty leaf. */
                return 0;
        }

out:
        fnd_clear(fnd);
        return err;
}

/*
 * indx_delete_entry - Delete an entry from the index.
 *
 * @indx:    Index to modify.
 * @ni:      Inode that owns the index.
 * @key:     Key of the entry to delete.
 * @key_len: Length of @key, as expected by indx_find().
 * @ctx:     Opaque context handed to the lookup/insert helpers.
 *
 * The entry is looked up and removed. If it had a sub-node, a replacement
 * entry is pulled up from below. If that leaves an empty buffer, the empty
 * branch is unlinked from its parent, the displaced parent entry is
 * re-inserted, and the freed buffers are released. When the whole tree
 * becomes empty the root is reset to an empty leaf and the allocation and
 * bitmap attributes are removed.
 *
 * Return: 0 on success, -ENOENT if the key is not found, or another
 * negative errno on failure.
 */
int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
                      const void *key, u32 key_len, const void *ctx)
{
        int err, diff;
        struct INDEX_ROOT *root;
        struct INDEX_HDR *hdr;
        struct ntfs_fnd *fnd, *fnd2;
        struct INDEX_BUFFER *ib;
        struct NTFS_DE *e, *re, *next, *prev, *me;
        struct indx_node *n, *n2d = NULL;
        __le64 sub_vbn;
        int level, level2;
        struct ATTRIB *attr;
        struct mft_inode *mi;
        u32 e_size, root_size, new_root_size;
        size_t trim_bit;
        const struct INDEX_NAMES *in;

        /* fnd: path to the entry being deleted. */
        fnd = fnd_get();
        if (!fnd) {
                err = -ENOMEM;
                goto out2;
        }

        /* fnd2: path to a buffer that became empty, if any. */
        fnd2 = fnd_get();
        if (!fnd2) {
                err = -ENOMEM;
                goto out1;
        }

        root = indx_get_root(indx, ni, &attr, &mi);
        if (!root) {
                err = -EINVAL;
                goto out;
        }

        /* Locate the entry to remove. */
        err = indx_find(indx, ni, root, key, key_len, ctx, &diff, &e, fnd);
        if (err)
                goto out;

        if (!e || diff) {
                err = -ENOENT;
                goto out;
        }

        level = fnd->level;

        if (level) {
                /* The entry lives in an index buffer. */
                n = fnd->nodes[level - 1];
                e = fnd->de[level - 1];
                ib = n->index;
                hdr = &ib->ihdr;
        } else {
                /* The entry lives in the root; 'ib' is not set on this path. */
                hdr = &root->ihdr;
                e = fnd->root_de;
                n = NULL;
        }

        e_size = le16_to_cpu(e->size);

        if (!de_has_vcn_ex(e)) {
                /* The entry to delete is a leaf, so we can just rip it out. */
                hdr_delete_de(hdr, e);

                if (!level) {
                        hdr->total = hdr->used;

                        /* Shrink resident root attribute. */
                        mi_resize_attr(mi, attr, 0 - e_size);
                        goto out;
                }

                indx_write(indx, ni, n, 0);

                /*
                 * Check to see if removing that entry made
                 * the leaf empty. If so, move the leaf from 'fnd' to
                 * 'fnd2' so that the branch is deleted below.
                 */
                if (ib_is_leaf(ib) && ib_is_empty(ib)) {
                        fnd_pop(fnd);
                        fnd_push(fnd2, n, e);
                }
        } else {
                /*
                 * The entry we wish to delete is a node buffer, so we
                 * have to find a replacement for it.
                 */
                next = de_get_next(e);

                err = indx_get_entry_to_replace(indx, ni, next, &re, fnd2);
                if (err)
                        goto out;

                if (re) {
                        /*
                         * The replacement inherits the sub-node pointer of
                         * the deleted entry and is inserted in its place.
                         */
                        de_set_vbn_le(re, de_get_vbn_le(e));
                        hdr_delete_de(hdr, e);

                        err = level ? indx_insert_into_buffer(indx, ni, root,
                                                              re, ctx,
                                                              fnd->level - 1,
                                                              fnd) :
                                      indx_insert_into_root(indx, ni, re, e,
                                                            ctx, fnd, 0);
                        kfree(re);

                        if (err)
                                goto out;
                } else {
                        /*
                         * There is no replacement for the current entry.
                         * This means that the subtree rooted at its node
                         * is empty, and can be deleted, which in turn means
                         * that the node can just inherit the deleted
                         * entry sub_vcn.
                         */
                        indx_free_children(indx, ni, next, true);

                        de_set_vbn_le(next, de_get_vbn_le(e));
                        hdr_delete_de(hdr, e);
                        if (level) {
                                indx_write(indx, ni, n, 0);
                        } else {
                                hdr->total = hdr->used;

                                /* Shrink resident root attribute. */
                                mi_resize_attr(mi, attr, 0 - e_size);
                        }
                }
        }

        /*
         * Delete a branch of the tree. Nothing to do unless 'fnd2' holds an
         * emptied buffer. ('fnd2' itself cannot be NULL here; it was checked
         * right after fnd_get().)
         */
        if (!fnd2 || !fnd2->level)
                goto out;

        /* Reinit root because it may have been changed. */
        root = indx_get_root(indx, ni, &attr, &mi);
        if (!root) {
                err = -EINVAL;
                goto out;
        }

        n2d = NULL;
        sub_vbn = fnd2->nodes[0]->index->vbn;
        level2 = 0;
        level = fnd->level;

        hdr = level ? &fnd->nodes[level - 1]->index->ihdr : &root->ihdr;

        /*
         * Scan the current level for the entry that points at the first
         * buffer of 'fnd2'.
         */
        for (e = hdr_first_de(hdr);; e = hdr_next_de(hdr, e)) {
                if (!e) {
                        err = -EINVAL;
                        goto out;
                }

                if (de_has_vcn(e) && sub_vbn == de_get_vbn_le(e))
                        break;

                if (de_is_last(e)) {
                        e = NULL;
                        break;
                }
        }

        if (!e) {
                /*
                 * Do slow search from root.
                 * Note: this local 'in' shadows the INDEX_NAMES pointer
                 * declared at function scope.
                 */
                struct indx_node *in;

                fnd_clear(fnd);

                in = indx_find_buffer(indx, ni, root, sub_vbn, NULL);
                if (IS_ERR(in)) {
                        err = PTR_ERR(in);
                        goto out;
                }

                if (in)
                        fnd_push(fnd, in, NULL);
        }

        /* Merge fnd2 -> fnd; node ownership moves over to 'fnd'. */
        for (level = 0; level < fnd2->level; level++) {
                fnd_push(fnd, fnd2->nodes[level], fnd2->de[level]);
                fnd2->nodes[level] = NULL;
        }
        fnd2->level = 0;

        /*
         * Walk the merged path from the deepest buffer upwards and stop at
         * the first non-empty one: that is the parent to fix up (n2d).
         * While passing empty buffers, 'sub_vbn' tracks the topmost empty one.
         */
        hdr = NULL;
        for (level = fnd->level; level; level--) {
                struct indx_node *in = fnd->nodes[level - 1];

                ib = in->index;
                if (ib_is_empty(ib)) {
                        sub_vbn = ib->vbn;
                } else {
                        hdr = &ib->ihdr;
                        n2d = in;
                        level2 = level;
                        break;
                }
        }

        /* Every buffer on the path is empty: the parent is the root. */
        if (!hdr)
                hdr = &root->ihdr;

        e = hdr_first_de(hdr);
        if (!e) {
                err = -EINVAL;
                goto out;
        }

        if (hdr != &root->ihdr || !de_is_last(e)) {
                /* Find the parent entry that points at the empty branch. */
                prev = NULL;
                while (!de_is_last(e)) {
                        if (de_has_vcn(e) && sub_vbn == de_get_vbn_le(e))
                                break;
                        prev = e;
                        e = hdr_next_de(hdr, e);
                        if (!e) {
                                err = -EINVAL;
                                goto out;
                        }
                }

                if (sub_vbn != de_get_vbn_le(e)) {
                        /*
                         * Didn't find the parent entry, although this buffer
                         * is the parent trail. Something is corrupt.
                         */
                        err = -EINVAL;
                        goto out;
                }

                if (de_is_last(e)) {
                        /*
                         * Since we can't remove the end entry, we'll remove
                         * its predecessor instead. This means we have to
                         * transfer the predecessor's sub_vcn to the end entry.
                         * Note: This index block is not empty, so the
                         * predecessor must exist.
                         */
                        if (!prev) {
                                err = -EINVAL;
                                goto out;
                        }

                        if (de_has_vcn(prev)) {
                                de_set_vbn_le(e, de_get_vbn_le(prev));
                        } else if (de_has_vcn(e)) {
                                /*
                                 * The predecessor has no sub-node, so the end
                                 * entry loses its sub-node pointer.
                                 */
                                le16_sub_cpu(&e->size, sizeof(u64));
                                e->flags &= ~NTFS_IE_HAS_SUBNODES;
                                le32_sub_cpu(&hdr->used, sizeof(u64));
                        }
                        e = prev;
                }

                /*
                 * Copy the current entry into a temporary buffer (stripping
                 * off its down-pointer, if any) and delete it from the current
                 * buffer or root, as appropriate.
                 */
                e_size = le16_to_cpu(e->size);
                me = kmemdup(e, e_size, GFP_NOFS);
                if (!me) {
                        err = -ENOMEM;
                        goto out;
                }

                if (de_has_vcn(me)) {
                        me->flags &= ~NTFS_IE_HAS_SUBNODES;
                        le16_sub_cpu(&me->size, sizeof(u64));
                }

                hdr_delete_de(hdr, e);

                if (hdr == &root->ihdr) {
                        level = 0;
                        hdr->total = hdr->used;

                        /* Shrink resident root attribute. */
                        mi_resize_attr(mi, attr, 0 - e_size);
                } else {
                        indx_write(indx, ni, n2d, 0);
                        level = level2;
                }

                /*
                 * Mark unused buffers as free and remember the lowest freed
                 * bit; (size_t)-1 means "nothing freed".
                 */
                trim_bit = -1;
                for (; level < fnd->level; level++) {
                        ib = fnd->nodes[level]->index;
                        if (ib_is_empty(ib)) {
                                size_t k = le64_to_cpu(ib->vbn) >>
                                           indx->idx2vbn_bits;

                                indx_mark_free(indx, ni, k);
                                if (k < trim_bit)
                                        trim_bit = k;
                        }
                }

                fnd_clear(fnd);
                /*fnd->root_de = NULL;*/

                /*
                 * Re-insert the entry into the tree.
                 * Find the spot in the tree where we want to insert the new
                 * entry.
                 */
                err = indx_insert_entry(indx, ni, me, ctx, fnd, 0);
                kfree(me);
                if (err)
                        goto out;

                /* Try to give back the tail of the allocation. */
                if (trim_bit != -1)
                        indx_shrink(indx, ni, trim_bit);
        } else {
                /*
                 * This tree needs to be collapsed down to an empty root.
                 * Recreate the index root as an empty leaf and free all
                 * the bits in the index allocation bitmap.
                 */
                fnd_clear(fnd);
                fnd_clear(fnd2);

                in = &s_index_names[indx->type];

                /*
                 * Each call below overwrites 'err'; only the result of the
                 * last ni_remove_attr() can reach the return statement.
                 */
                err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len,
                                    &indx->alloc_run, 0, NULL, false, NULL);
                if (in->name == I30_NAME)
                        i_size_write(&ni->vfs_inode, 0);

                err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len,
                                     false, NULL);
                run_close(&indx->alloc_run);

                err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len,
                                    &indx->bitmap_run, 0, NULL, false, NULL);
                err = ni_remove_attr(ni, ATTR_BITMAP, in->name, in->name_len,
                                     false, NULL);
                run_close(&indx->bitmap_run);

                /* Attributes were removed, so look the root up again. */
                root = indx_get_root(indx, ni, &attr, &mi);
                if (!root) {
                        err = -EINVAL;
                        goto out;
                }

                /* An empty root holds a single end entry without sub-node. */
                root_size = le32_to_cpu(attr->res.data_size);
                new_root_size =
                        sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE);

                if (new_root_size != root_size &&
                    !mi_resize_attr(mi, attr, new_root_size - root_size)) {
                        err = -EINVAL;
                        goto out;
                }

                /* Fill first entry. */
                e = (struct NTFS_DE *)(root + 1);
                e->ref.low = 0;
                e->ref.high = 0;
                e->ref.seq = 0;
                e->size = cpu_to_le16(sizeof(struct NTFS_DE));
                e->flags = NTFS_IE_LAST; // 0x02
                e->key_size = 0;
                e->res = 0;

                hdr = &root->ihdr;
                hdr->flags = 0;
                hdr->used = hdr->total = cpu_to_le32(
                        new_root_size - offsetof(struct INDEX_ROOT, ihdr));
                mi->dirty = true;
        }

out:
        fnd_put(fnd2);
out1:
        fnd_put(fnd);
out2:
        return err;
}

/*
 * indx_update_dup - Update duplicated information in a directory entry.
 *
 * @ni:    Directory inode; its 'dir' index is searched.
 * @sbi:   Superblock info, handed to indx_find() as the compare context.
 * @fname: File name that identifies the directory entry.
 * @dup:   New duplicated info (taken from the MFT record).
 * @sync:  Non-zero to write the change out immediately.
 *
 * Return: 0 on success (also when the entry already holds @dup), -EINVAL if
 * the entry is not found, or another negative errno on failure.
 */
int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi,
                    const struct ATTR_FILE_NAME *fname,
                    const struct NTFS_DUP_INFO *dup, int sync)
{
        struct ntfs_index *indx = &ni->dir;
        struct ATTR_FILE_NAME *found_name;
        struct NTFS_DE *de = NULL;
        struct INDEX_ROOT *root;
        struct ntfs_fnd *fnd;
        struct mft_inode *mi;
        int cmp_res;
        int err;

        fnd = fnd_get();
        if (!fnd)
                return -ENOMEM;

        root = indx_get_root(indx, ni, NULL, &mi);
        if (!root) {
                err = -EINVAL;
                goto out;
        }

        /* Look the name up in the directory index. */
        err = indx_find(indx, ni, root, fname, fname_full_size(fname), sbi,
                        &cmp_res, &de, fnd);
        if (err)
                goto out;

        /* No entry at all, or only a non-equal neighbour: not found. */
        if (!de || cmp_res) {
                err = -EINVAL;
                goto out;
        }

        /* The file name attribute follows the entry header. */
        found_name = (struct ATTR_FILE_NAME *)(de + 1);

        /* Nothing to update in index! Callers should try to avoid this. */
        if (memcmp(&found_name->dup, dup, sizeof(*dup)) == 0)
                goto out;

        memcpy(&found_name->dup, dup, sizeof(*dup));

        if (fnd->level) {
                /* The entry lives in an index buffer: write that buffer. */
                err = indx_write(indx, ni, fnd->nodes[fnd->level - 1], sync);
                goto out;
        }

        /* The entry lives in the directory's MFT record. */
        mi->dirty = true;
        if (sync)
                err = mi_write(mi, 1);
        else
                mark_inode_dirty(&ni->vfs_inode);

out:
        fnd_put(fnd);
        return err;
}










































































































































































































































































































































































































   21 













































   20 
   24 

   21 























   10 

    9 




















































































   24 




























































































































































    1 







    5 




























    9 










   10 














   10 
   10 






































































































   12 











































































    3 





    3 



























































































   11 

   11 













   11 





   11 
   11 

   11 
   11 



























































































































































































































































































































































































































































































































































   10 

   10 























   12 




















   13 
   15 


























   10 
    1 
    9 
















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* memcontrol.h - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 */

#ifndef _LINUX_MEMCONTROL_H
#define _LINUX_MEMCONTROL_H
#include <linux/cgroup.h>
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/page_counter.h>
#include <linux/vmpressure.h>
#include <linux/eventfd.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/writeback.h>
#include <linux/page-flags.h>
#include <linux/shrinker.h>

struct mem_cgroup;
struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;

/*
 * Cgroup-specific page state, on top of universal node page state.
 *
 * Numbering starts at NR_VM_NODE_STAT_ITEMS, so these values follow
 * directly after the node stat item range. MEMCG_NR_STAT is the last
 * enumerator and is therefore one past the highest item defined here.
 */
enum memcg_stat_item {
        MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
        MEMCG_SOCK,
        MEMCG_PERCPU_B,
        MEMCG_VMALLOC,
        MEMCG_KMEM,
        MEMCG_ZSWAP_B,
        MEMCG_ZSWAPPED,
        MEMCG_NR_STAT,
};

/*
 * Kinds of memory events counted per memcg.
 *
 * MEMCG_NR_MEMORY_EVENTS is the last enumerator, i.e. the number of event
 * kinds; it sizes the memory_events[] and memory_events_local[] arrays in
 * struct mem_cgroup.
 */
enum memcg_memory_event {
        MEMCG_LOW,
        MEMCG_HIGH,
        MEMCG_MAX,
        MEMCG_OOM,
        MEMCG_OOM_KILL,
        MEMCG_OOM_GROUP_KILL,
        MEMCG_SWAP_HIGH,
        MEMCG_SWAP_MAX,
        MEMCG_SWAP_FAIL,
        MEMCG_NR_MEMORY_EVENTS,
};

/*
 * Pairs a node (@pgdat) with a generation number.
 * NOTE(review): presumably @generation is matched against
 * mem_cgroup_reclaim_iter.generation by the hierarchy iterator - confirm
 * against the users of this cookie.
 */
struct mem_cgroup_reclaim_cookie {
        pg_data_t *pgdat;
        unsigned int generation;
};

#ifdef CONFIG_MEMCG

/*
 * NOTE(review): presumably the bit width of the memcg ID space - confirm
 * against the users of this macro.
 */
#define MEM_CGROUP_ID_SHIFT        16

/* A memcg ID together with the reference count held on it. */
struct mem_cgroup_id {
        int id;
        refcount_t ref;
};

/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used
 * to trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg event.
 *
 * MEM_CGROUP_NTARGETS is the last enumerator, i.e. the number of targets.
 */
enum mem_cgroup_events_target {
        MEM_CGROUP_TARGET_THRESH,
        MEM_CGROUP_TARGET_SOFTLIMIT,
        MEM_CGROUP_NTARGETS,
};

struct memcg_vmstats_percpu;
struct memcg_vmstats;
struct lruvec_stats_percpu;
struct lruvec_stats;

/*
 * Iteration state embedded in each struct mem_cgroup_per_node (field 'iter').
 */
struct mem_cgroup_reclaim_iter {
        /* memcg the iteration is currently positioned at */
        struct mem_cgroup *position;
        /* scan generation, increased every round-trip */
        unsigned int generation;
};

/*
 * per-node information in memory controller.
 *
 * struct mem_cgroup holds an array of pointers to these (nodeinfo[]).
 */
struct mem_cgroup_per_node {
        struct lruvec                lruvec;

        /* Per-CPU and aggregated lruvec statistics, respectively */
        struct lruvec_stats_percpu __percpu        *lruvec_stats_percpu;
        struct lruvec_stats                        *lruvec_stats;

        /* Indexed by [zone][LRU list] */
        unsigned long                lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];

        struct mem_cgroup_reclaim_iter        iter;

        /* RCU-annotated pointer */
        struct shrinker_info __rcu        *shrinker_info;

        struct rb_node                tree_node;        /* RB tree node */
        unsigned long                usage_in_excess;/* Set to the value by which */
                                                /* the soft limit is exceeded*/
        /* NOTE(review): presumably true while tree_node is linked - confirm */
        bool                        on_tree;
        struct mem_cgroup        *memcg;                /* Back pointer, we cannot */
                                                /* use container_of           */
};

/*
 * One threshold entry: an eventfd context paired with a threshold value.
 * Stored in the entries[] array of struct mem_cgroup_threshold_ary.
 */
struct mem_cgroup_threshold {
        struct eventfd_ctx *eventfd;
        unsigned long threshold;
};

/*
 * For threshold: a variable-length array of thresholds plus the index of
 * the one currently in effect. entries[] is a flexible array member whose
 * element count is given by @size.
 */
struct mem_cgroup_threshold_ary {
        /* An array index points to threshold just below or equal to usage. */
        int current_threshold;
        /* Size of entries[] */
        unsigned int size;
        /* Array of thresholds */
        struct mem_cgroup_threshold entries[] __counted_by(size);
};

/*
 * A primary threshold array together with a pre-allocated spare one.
 * struct mem_cgroup embeds two of these: 'thresholds' and
 * 'memsw_thresholds'.
 */
struct mem_cgroup_thresholds {
        /* Primary thresholds array */
        struct mem_cgroup_threshold_ary *primary;
        /*
         * Spare threshold array.
         * This is needed to make mem_cgroup_unregister_event() "never fail".
         * It must be able to store at least primary->size - 1 entries.
         */
        struct mem_cgroup_threshold_ary *spare;
};

/*
 * Remember four most recent foreign writebacks with dirty pages in this
 * cgroup.  Inode sharing is expected to be uncommon and, even if we miss
 * one in a given round, we're likely to catch it later if it keeps
 * foreign-dirtying, so a fairly low count should be enough.
 *
 * See mem_cgroup_track_foreign_dirty_slowpath() for details.
 */
#define MEMCG_CGWB_FRN_CNT        4

/*
 * One remembered foreign writeback. struct mem_cgroup keeps
 * MEMCG_CGWB_FRN_CNT of these in cgwb_frn[].
 */
struct memcg_cgwb_frn {
        u64 bdi_id;                        /* bdi->id of the foreign inode */
        int memcg_id;                        /* memcg->css.id of foreign inode */
        u64 at;                                /* jiffies_64 at the time of dirtying */
        struct wb_completion done;        /* tracks in-flight foreign writebacks */
};

/*
 * Bucket for arbitrarily byte-sized objects charged to a memory
 * cgroup. The bucket can be reparented in one piece when the cgroup
 * is destroyed, without having to round up the individual references
 * of all live memory objects in the wild.
 */
struct obj_cgroup {
        struct percpu_ref refcnt;
        /* Owning memcg; read with READ_ONCE() via obj_cgroup_memcg() */
        struct mem_cgroup *memcg;
        atomic_t nr_charged_bytes;
        /* 'list' and 'rcu' share storage, so only one is valid at a time */
        union {
                struct list_head list; /* protected by objcg_lock */
                struct rcu_head rcu;
        };
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * The structure ends in a flexible array (nodeinfo[]), so its allocation
 * size depends on how many per-node pointers are stored.
 */
struct mem_cgroup {
        struct cgroup_subsys_state css;

        /* Private memcg ID. Used to ID objects that outlive the cgroup */
        struct mem_cgroup_id id;

        /* Accounted resources */
        struct page_counter memory;                /* Both v1 & v2 */

        /* swap (v2) and memsw (v1) share storage */
        union {
                struct page_counter swap;        /* v2 only */
                struct page_counter memsw;        /* v1 only */
        };

        /* Legacy consumer-oriented counters */
        struct page_counter kmem;                /* v1 only */
        struct page_counter tcpmem;                /* v1 only */

        /* Range enforcement for interrupt charges */
        struct work_struct high_work;

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
        unsigned long zswap_max;

        /*
         * Prevent pages from this memcg from being written back from zswap to
         * swap, and from being swapped out on zswap store failures.
         */
        bool zswap_writeback;
#endif

        unsigned long soft_limit;

        /* vmpressure notifications */
        struct vmpressure vmpressure;

        /*
         * Should the OOM killer kill all belonging tasks, had it kill one?
         */
        bool oom_group;

        /* protected by memcg_oom_lock */
        bool                oom_lock;
        int                under_oom;

        int        swappiness;
        /* OOM-Killer disable */
        int                oom_kill_disable;

        /* memory.events and memory.events.local */
        struct cgroup_file events_file;
        struct cgroup_file events_local_file;

        /* handle for "memory.swap.events" */
        struct cgroup_file swap_events_file;

        /* protect arrays of thresholds */
        struct mutex thresholds_lock;

        /* thresholds for memory usage. RCU-protected */
        struct mem_cgroup_thresholds thresholds;

        /* thresholds for mem+swap usage. RCU-protected */
        struct mem_cgroup_thresholds memsw_thresholds;

        /* For oom notifier event fd */
        struct list_head oom_notify;

        /*
         * Should we move charges of a task when a task is moved into this
         * mem_cgroup ? And what type of charges should we move ?
         */
        unsigned long move_charge_at_immigrate;
        /* taken only while moving_account > 0 */
        spinlock_t                move_lock;
        unsigned long                move_lock_flags;

        CACHELINE_PADDING(_pad1_);

        /* memory.stat */
        struct memcg_vmstats        *vmstats;

        /* memory.events */
        atomic_long_t                memory_events[MEMCG_NR_MEMORY_EVENTS];
        atomic_long_t                memory_events_local[MEMCG_NR_MEMORY_EVENTS];

        /*
         * Hint of reclaim pressure for socket memory management. Note
         * that this indicator should NOT be used in legacy cgroup mode
         * where socket memory is accounted/charged separately.
         */
        unsigned long                socket_pressure;

        /* Legacy tcp memory accounting */
        bool                        tcpmem_active;
        int                        tcpmem_pressure;

#ifdef CONFIG_MEMCG_KMEM
        int kmemcg_id;
        /*
         * memcg->objcg is wiped out as a part of the objcg reparenting
         * process. memcg->orig_objcg preserves a pointer (and a reference)
         * to the original objcg until the end of life of memcg.
         */
        struct obj_cgroup __rcu        *objcg;
        struct obj_cgroup        *orig_objcg;
        /* list of inherited objcgs, protected by objcg_lock */
        struct list_head objcg_list;
#endif

        CACHELINE_PADDING(_pad2_);

        /*
         * set > 0 if pages under this cgroup are moving to other cgroup.
         */
        atomic_t                moving_account;
        struct task_struct        *move_lock_task;

        struct memcg_vmstats_percpu __percpu *vmstats_percpu;

#ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head cgwb_list;
        struct wb_domain cgwb_domain;
        /* Most recent foreign writebacks, see struct memcg_cgwb_frn */
        struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif

        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct deferred_split deferred_split_queue;
#endif

#ifdef CONFIG_LRU_GEN_WALKS_MMU
        /* per-memcg mm_struct list */
        struct lru_gen_mm_list mm_list;
#endif

        /* Per-node data; flexible array member, must stay last */
        struct mem_cgroup_per_node *nodeinfo[];
};

/*
 * Size of the first charge attempt.
 * TODO: it may be necessary to use a bigger number on big iron, or to make
 * this dynamic based on the workload.
 */
#define MEMCG_CHARGE_BATCH 64U

extern struct mem_cgroup *root_mem_cgroup;

/*
 * Flag bits stored in the low bits of page->memcg_data.
 *
 * __NR_MEMCG_DATA_FLAGS is not a flag itself: it is the first bit not used
 * by the flags above it and serves as __FIRST_OBJEXT_FLAG when CONFIG_MEMCG
 * is enabled.
 */
enum page_memcg_data_flags {
        /* page->memcg_data is a pointer to an slabobj_ext vector */
        MEMCG_DATA_OBJEXTS = (1UL << 0),
        /* page has been accounted as a non-slab kernel page */
        MEMCG_DATA_KMEM = (1UL << 1),
        /* the next bit after the last actual flag */
        __NR_MEMCG_DATA_FLAGS  = (1UL << 2),
};

#define __FIRST_OBJEXT_FLAG        __NR_MEMCG_DATA_FLAGS

#else /* CONFIG_MEMCG */

#define __FIRST_OBJEXT_FLAG        (1UL << 0)

#endif /* CONFIG_MEMCG */

/*
 * Flags for the slabobj_ext vector pointer. They start at
 * __FIRST_OBJEXT_FLAG, which is defined differently depending on whether
 * CONFIG_MEMCG is enabled.
 */
enum objext_flags {
        /* slabobj_ext vector failed to allocate */
        OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG,
        /* the next bit after the last actual flag */
        __NR_OBJEXTS_FLAGS  = (__FIRST_OBJEXT_FLAG << 1),
};

/* All bits below __NR_OBJEXTS_FLAGS; masked off to recover the pointer */
#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1)

#ifdef CONFIG_MEMCG

static inline bool folio_memcg_kmem(struct folio *folio);

/*
 * After the initialization objcg->memcg is always pointing at
 * a valid memcg, but can be atomically swapped to the parent memcg.
 *
 * The caller must ensure that the returned memcg won't be released:
 * e.g. acquire the rcu_read_lock or css_set_lock.
 */
static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
{
        /* Single READ_ONCE() load of the owner pointer. */
        struct mem_cgroup *owner = READ_ONCE(objcg->memcg);

        return owner;
}

/*
 * __folio_memcg - Get the memory cgroup associated with a non-kmem folio
 * @folio: Pointer to the folio.
 *
 * Returns a pointer to the memory cgroup associated with the folio,
 * or NULL. This function assumes that the folio is known to have a
 * proper memory cgroup pointer. It's not safe to call this function
 * against some type of folios, e.g. slab folios or ex-slab folios or
 * kmem folios.
 */
static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
{
        unsigned long memcg_data = folio->memcg_data;
        struct mem_cgroup *memcg;

        /* Slab, objext-vector and kmem folios must not get here. */
        VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
        VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
        VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);

        /* Mask off the flag bits to obtain the memcg pointer. */
        memcg = (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);

        return memcg;
}

/*
 * __folio_objcg - get the object cgroup associated with a kmem folio.
 * @folio: Pointer to the folio.
 *
 * Returns a pointer to the object cgroup associated with the folio,
 * or NULL. This function assumes that the folio is known to have a
 * proper object cgroup pointer. It's not safe to call this function
 * against some type of folios, e.g. slab folios or ex-slab folios or
 * LRU folios.
 */
static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
{
        unsigned long memcg_data = folio->memcg_data;
        struct obj_cgroup *objcg;

        /* Only kmem folios (not slab, not objext vectors) are valid here. */
        VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
        VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
        VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);

        /* Mask off the flag bits to obtain the objcg pointer. */
        objcg = (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);

        return objcg;
}

/*
 * folio_memcg - Get the memory cgroup associated with a folio.
 * @folio: Pointer to the folio.
 *
 * Returns a pointer to the memory cgroup associated with the folio,
 * or NULL. This function assumes that the folio is known to have a
 * proper memory cgroup pointer. It's not safe to call this function
 * against some type of folios, e.g. slab folios or ex-slab folios.
 *
 * For a non-kmem folio any of the following ensures folio and memcg binding
 * stability:
 *
 * - the folio lock
 * - LRU isolation
 * - folio_memcg_lock()
 * - exclusive reference
 * - mem_cgroup_trylock_pages()
 *
 * For a kmem folio a caller should hold an rcu read lock to protect memcg
 * associated with a kmem folio from being released.
 */
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
        /* Non-kmem folios carry the memcg pointer directly. */
        if (!folio_memcg_kmem(folio))
                return __folio_memcg(folio);

        /* Kmem folios carry an objcg; go through it to reach the memcg. */
        return obj_cgroup_memcg(__folio_objcg(folio));
}

/* Page flavour of folio_memcg(): resolves @page to its folio first. */
static inline struct mem_cgroup *page_memcg(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_memcg(folio);
}

/**
 * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio.
 * @folio: Pointer to the folio.
 *
 * This function assumes that the folio is known to have a
 * proper memory cgroup pointer. It's not safe to call this function
 * against some type of folios, e.g. slab folios or ex-slab folios.
 *
 * Return: A pointer to the memory cgroup associated with the folio,
 * or NULL.
 */
static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
{
        unsigned long memcg_data = READ_ONCE(folio->memcg_data);
        unsigned long owner;

        VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
        WARN_ON_ONCE(!rcu_read_lock_held());

        /* Strip the flag bits; what remains is the owner pointer. */
        owner = memcg_data & ~OBJEXTS_FLAGS_MASK;

        /* Without the kmem flag the pointer is the memcg itself. */
        if (!(memcg_data & MEMCG_DATA_KMEM))
                return (struct mem_cgroup *)owner;

        /* With the kmem flag it is an objcg; resolve it to its memcg. */
        return obj_cgroup_memcg((struct obj_cgroup *)owner);
}

/*
 * folio_memcg_check - Get the memory cgroup associated with a folio.
 * @folio: Pointer to the folio.
 *
 * Returns a pointer to the memory cgroup associated with the folio,
 * or NULL. This function unlike folio_memcg() can take any folio
 * as an argument. It has to be used in cases when it's not known if a folio
 * has an associated memory cgroup pointer or an object cgroups vector or
 * an object cgroup.
 *
 * For a non-kmem folio any of the following ensures folio and memcg binding
 * stability:
 *
 * - the folio lock
 * - LRU isolation
 * - folio_memcg_lock()
 * - exclusive reference
 * - mem_cgroup_trylock_pages()
 *
 * For a kmem folio a caller should hold an rcu read lock to protect memcg
 * associated with a kmem folio from being released.
 */
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
        /*
         * folio->memcg_data might be changed asynchronously for slabs,
         * hence the READ_ONCE() snapshot.
         */
        unsigned long data = READ_ONCE(folio->memcg_data);
        unsigned long owner;

        /* An objext vector pointer is not a memcg: report none. */
        if (data & MEMCG_DATA_OBJEXTS)
                return NULL;

        owner = data & ~OBJEXTS_FLAGS_MASK;

        /* Kmem folios store an objcg; resolve it to its memcg. */
        if (data & MEMCG_DATA_KMEM)
                return obj_cgroup_memcg((struct obj_cgroup *)owner);

        return (struct mem_cgroup *)owner;
}

/*
 * Page flavour of folio_memcg_check(). Tail pages yield NULL; any other
 * page is reinterpreted as a folio and checked.
 */
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
        return PageTail(page) ? NULL : folio_memcg_check((struct folio *)page);
}

/*
 * Return the memcg that @objcg currently points at, with a css reference
 * taken on it. The lookup is repeated under rcu_read_lock() until
 * css_tryget() succeeds.
 */
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
        struct mem_cgroup *memcg;

        rcu_read_lock();
        do {
                /* Re-read the owner on every attempt. */
                memcg = obj_cgroup_memcg(objcg);
        } while (unlikely(!css_tryget(&memcg->css)));
        rcu_read_unlock();

        return memcg;
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set.
 * @folio: Pointer to the folio.
 *
 * Checks if the folio has MemcgKmem flag set. The caller must ensure
 * that the folio has an associated memory cgroup. It's not safe to call
 * this function against some types of folios, e.g. slab folios.
 */
static inline bool folio_memcg_kmem(struct folio *folio)
{
        bool is_kmem;

        VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page);
        VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio);

        /* Non-zero flag bit converts to true. */
        is_kmem = folio->memcg_data & MEMCG_DATA_KMEM;

        return is_kmem;
}


#else
/* Variant for builds without CONFIG_MEMCG_KMEM: always reports false. */
static inline bool folio_memcg_kmem(struct folio *folio)
{
        return false;
}

#endif

/* Page flavour of folio_memcg_kmem(): resolves @page to its folio first. */
static inline bool PageMemcgKmem(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_memcg_kmem(folio);
}

/* True when @memcg is the root memory cgroup (root_mem_cgroup). */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
        return memcg == root_mem_cgroup;
}

static inline bool mem_cgroup_disabled(void)
{
        return !cgroup_subsys_enabled(memory_cgrp_subsys);
}

/*
 * Report the effective min/low protection of @memcg for a reclaim rooted
 * at @root. Both outputs are 0 when the controller is disabled or when
 * @memcg is the reclaim root itself; otherwise they are the current
 * memory.emin / memory.elow values.
 */
static inline void mem_cgroup_protection(struct mem_cgroup *root,
                                         struct mem_cgroup *memcg,
                                         unsigned long *min,
                                         unsigned long *low)
{
        *low = 0;
        *min = 0;

        /*
         * Besides the disabled case, no reclaim protection is applied to a
         * targeted reclaim (root == memcg). This is special-cased here
         * because mem_cgroup_calculate_protection is not robust enough to
         * keep the protection invariant for calculated effective values for
         * parallel reclaimers with different reclaim targets. This is
         * especially a problem for tail memcgs (as they have pages on LRU)
         * which would want to have effective values 0 for targeted reclaim
         * but a different value for external reclaim.
         *
         * Example
         * Let's have global and A's reclaim in parallel:
         *  |
         *  A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
         *  |\
         *  | C (low = 1G, usage = 2.5G)
         *  B (low = 1G, usage = 0.5G)
         *
         * For the global reclaim
         * A.elow = A.low
         * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
         * C.elow = min(C.usage, C.low)
         *
         * With the effective values resetting we have A reclaim
         * A.elow = 0
         * B.elow = B.low
         * C.elow = C.low
         *
         * If the global reclaim races with A's reclaim then
         * B.elow = C.elow = 0 (because children_low_usage > A.elow)
         * is possible and reclaiming B would be violating the protection.
         */
        if (mem_cgroup_disabled() || root == memcg)
                return;

        *min = READ_ONCE(memcg->memory.emin);
        *low = READ_ONCE(memcg->memory.elow);
}

void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                                     struct mem_cgroup *memcg);

static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
                                          struct mem_cgroup *memcg)
{
        /*
         * The root memcg doesn't account charges, and doesn't support
         * protection. The target memcg's protection is ignored, see
         * mem_cgroup_calculate_protection() and mem_cgroup_protection()
         */
        return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) ||
                memcg == target;
}

/*
 * True when @memcg is protected relative to @target and its current
 * page counter reading does not exceed its effective low value (elow).
 */
static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        return !mem_cgroup_unprotected(target, memcg) &&
                READ_ONCE(memcg->memory.elow) >=
                        page_counter_read(&memcg->memory);
}

/*
 * True when @memcg is protected relative to @target and its current
 * page counter reading does not exceed its effective min value (emin).
 */
static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        return !mem_cgroup_unprotected(target, memcg) &&
                READ_ONCE(memcg->memory.emin) >=
                        page_counter_read(&memcg->memory);
}

void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg);

int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);

/**
 * mem_cgroup_charge - Charge a newly allocated folio to a cgroup.
 * @folio: Folio to charge.
 * @mm: mm context of the allocating task.
 * @gfp: Reclaim mode.
 *
 * Attempt to charge @folio to the memcg @mm belongs to; pages may be
 * reclaimed according to @gfp if that is needed.  With a NULL @mm the
 * charge goes to the active memcg.  When the memory controller is
 * disabled nothing is charged and success is reported.
 *
 * Do not use this for folios allocated for swapin.
 *
 * Return: 0 on success. Otherwise, an error code is returned.
 */
static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
                                    gfp_t gfp)
{
        return mem_cgroup_disabled() ? 0 : __mem_cgroup_charge(folio, mm, gfp);
}

int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
                long nr_pages);

int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
                                  gfp_t gfp, swp_entry_t entry);
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);

void __mem_cgroup_uncharge(struct folio *folio);

/**
 * mem_cgroup_uncharge - Uncharge a folio.
 * @folio: Folio to uncharge.
 *
 * Undo a charge made earlier with mem_cgroup_charge().  Does nothing
 * when the memory controller is disabled.
 */
static inline void mem_cgroup_uncharge(struct folio *folio)
{
        if (!mem_cgroup_disabled())
                __mem_cgroup_uncharge(folio);
}

void __mem_cgroup_uncharge_folios(struct folio_batch *folios);
/*
 * Batch variant of mem_cgroup_uncharge(): hands @folios to
 * __mem_cgroup_uncharge_folios() unless the memory controller is disabled.
 */
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
        if (!mem_cgroup_disabled())
                __mem_cgroup_uncharge_folios(folios);
}

void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
void mem_cgroup_migrate(struct folio *old, struct folio *new);

/**
 * mem_cgroup_lruvec - get the lru list vector for a memcg & node
 * @memcg: memcg of the wanted lruvec
 * @pgdat: pglist_data
 *
 * Returns the lru list vector holding pages for a given @memcg &
 * @pgdat combination. If the memory controller is disabled this is
 * the node lruvec. A NULL @memcg selects root_mem_cgroup.
 */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
                                               struct pglist_data *pgdat)
{
        struct lruvec *lruvec;

        if (mem_cgroup_disabled()) {
                lruvec = &pgdat->__lruvec;
        } else {
                struct mem_cgroup *owner = memcg ? memcg : root_mem_cgroup;

                lruvec = &owner->nodeinfo[pgdat->node_id]->lruvec;
        }

        /*
         * A node may come online after the mem_cgroup was created, so
         * lruvec->pgdat might still need initializing here; a node that
         * was offlined and then reonlined needs it set again.
         */
        if (unlikely(lruvec->pgdat != pgdat))
                lruvec->pgdat = pgdat;

        return lruvec;
}

/**
 * folio_lruvec - return lruvec for isolating/putting an LRU folio
 * @folio: Pointer to the folio.
 *
 * The folio's memcg binding must be stable while this is used.  A debug
 * warning fires if the folio has no memcg although the controller is
 * enabled.
 */
static inline struct lruvec *folio_lruvec(struct folio *folio)
{
        struct mem_cgroup *owner = folio_memcg(folio);

        VM_WARN_ON_ONCE_FOLIO(!owner && !mem_cgroup_disabled(), folio);

        return mem_cgroup_lruvec(owner, folio_pgdat(folio));
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);

struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);

struct mem_cgroup *get_mem_cgroup_from_current(void);

struct lruvec *folio_lruvec_lock(struct folio *folio);
struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
                                                unsigned long *flags);

#ifdef CONFIG_DEBUG_VM
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio);
#else
/* Without CONFIG_DEBUG_VM the lruvec/folio debug check is a no-op. */
static inline
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
}
#endif

/*
 * Convert a cgroup_subsys_state pointer into the mem_cgroup that embeds
 * it.  A NULL @css yields NULL.
 */
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct mem_cgroup, css);
}

/*
 * Try to take a reference on @objcg via its percpu refcount; returns the
 * result of percpu_ref_tryget().  @objcg must not be NULL.
 */
static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
{
        return percpu_ref_tryget(&objcg->refcnt);
}

/* Take one reference on @objcg.  @objcg must not be NULL. */
static inline void obj_cgroup_get(struct obj_cgroup *objcg)
{
        percpu_ref_get(&objcg->refcnt);
}

/* Take @nr references on @objcg at once.  @objcg must not be NULL. */
static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
                                       unsigned long nr)
{
        percpu_ref_get_many(&objcg->refcnt, nr);
}

/* Drop one reference on @objcg; a NULL @objcg is silently ignored. */
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
        if (!objcg)
                return;

        percpu_ref_put(&objcg->refcnt);
}

static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
        return !memcg || css_tryget(&memcg->css);
}

static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
{
        return !memcg || css_tryget_online(&memcg->css);
}

/* Drop a css reference on @memcg; a NULL @memcg is silently ignored. */
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
        if (!memcg)
                return;

        css_put(&memcg->css);
}

/*
 * Map a pointer to the counter embedded as @member in struct mem_cgroup
 * back to the enclosing mem_cgroup.
 */
#define mem_cgroup_from_counter(counter, member)        \
        container_of(counter, struct mem_cgroup, member)

struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                   struct mem_cgroup *,
                                   struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                           int (*)(struct task_struct *, void *), void *arg);

/*
 * Return the id stored in @memcg, or 0 when the memory controller is
 * disabled (in which case @memcg is not dereferenced).
 */
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
        return mem_cgroup_disabled() ? 0 : memcg->id.id;
}
struct mem_cgroup *mem_cgroup_from_id(unsigned short id);

#ifdef CONFIG_SHRINKER_DEBUG
/* Inode number of the cgroup backing @memcg, or 0 for a NULL @memcg. */
static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
{
        if (!memcg)
                return 0;

        return cgroup_ino(memcg->css.cgroup);
}

struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino);
#endif

/*
 * Resolve the mem_cgroup behind seq_file @m: seq_css() supplies the css,
 * mem_cgroup_from_css() converts it (NULL css gives NULL).
 */
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
        return mem_cgroup_from_css(seq_css(m));
}

/*
 * Return the memcg recorded in the mem_cgroup_per_node that embeds
 * @lruvec, or NULL when the memory controller is disabled.
 */
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
        if (mem_cgroup_disabled())
                return NULL;

        return container_of(lruvec, struct mem_cgroup_per_node, lruvec)->memcg;
}

/**
 * parent_mem_cgroup - find the accounting parent of a memcg
 * @memcg: memcg whose parent to find
 *
 * Looks at the parent css of @memcg and converts it back to a memcg.
 *
 * Returns the parent memcg, or NULL if this is the root.
 */
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
        struct cgroup_subsys_state *parent_css = memcg->css.parent;

        return mem_cgroup_from_css(parent_css);
}

/*
 * True when @memcg is @root itself or when cgroup_is_descendant() reports
 * @memcg's cgroup as a descendant of @root's cgroup.
 */
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
                              struct mem_cgroup *root)
{
        return root == memcg ||
               cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
}

/*
 * Check, under rcu_read_lock(), whether the memcg of @mm's owner task is
 * @memcg or one of its descendants.  Returns false when the owner has no
 * memcg.
 */
static inline bool mm_match_cgroup(struct mm_struct *mm,
                                   struct mem_cgroup *memcg)
{
        struct mem_cgroup *owner_memcg;
        bool ret;

        rcu_read_lock();
        owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        ret = owner_memcg && mem_cgroup_is_descendant(owner_memcg, memcg);
        rcu_read_unlock();

        return ret;
}

struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio);
ino_t page_cgroup_ino(struct page *page);

/*
 * True when the memory controller is disabled, or when @memcg's css has
 * the CSS_ONLINE flag set.
 */
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
        return mem_cgroup_disabled() || (memcg->css.flags & CSS_ONLINE);
}

void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
                int zid, int nr_pages);

/*
 * Read lru_zone_size[zone_idx][lru] from the mem_cgroup_per_node that
 * embeds @lruvec, using READ_ONCE().
 */
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
{
        struct mem_cgroup_per_node *pn =
                container_of(lruvec, struct mem_cgroup_per_node, lruvec);

        return READ_ONCE(pn->lru_zone_size[zone_idx][lru]);
}

void mem_cgroup_handle_over_high(gfp_t gfp_mask);

unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);

unsigned long mem_cgroup_size(struct mem_cgroup *memcg);

void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
                                struct task_struct *p);

void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg);

static inline void mem_cgroup_enter_user_fault(void)
{
        WARN_ON(current->in_user_fault);
        current->in_user_fault = 1;
}

static inline void mem_cgroup_exit_user_fault(void)
{
        WARN_ON(!current->in_user_fault);
        current->in_user_fault = 0;
}

/* True when @p->memcg_in_oom is set (converted to bool). */
static inline bool task_in_memcg_oom(struct task_struct *p)
{
        return p->memcg_in_oom;
}

bool mem_cgroup_oom_synchronize(bool wait);
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
                                            struct mem_cgroup *oom_domain);
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);

void folio_memcg_lock(struct folio *folio);
void folio_memcg_unlock(struct folio *folio);

void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
                       int val);

/*
 * Try to stabilize folio_memcg() for all the pages in a memcg.
 *
 * Enters an RCU read-side section.  On success (controller disabled, or
 * @memcg->moving_account reads zero) returns true with the RCU read lock
 * still held; the caller releases it with mem_cgroup_unlock_pages().
 * Otherwise the RCU read lock is dropped again and false is returned.
 */
static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
{
        rcu_read_lock();

        if (!mem_cgroup_disabled() && atomic_read(&memcg->moving_account)) {
                rcu_read_unlock();
                return false;
        }

        return true;
}

/*
 * Leave the RCU read-side section that a successful
 * mem_cgroup_trylock_pages() left held.
 */
static inline void mem_cgroup_unlock_pages(void)
{
        rcu_read_unlock();
}

/*
 * Interrupt-safe wrapper around __mod_memcg_state(): local interrupts are
 * disabled around the update and restored afterwards.
 *
 * idx can be of type enum memcg_stat_item or node_stat_item.
 */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   enum memcg_stat_item idx, int val)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        __mod_memcg_state(memcg, idx, val);
        local_irq_restore(irq_state);
}

static inline void mod_memcg_page_state(struct page *page,
                                        enum memcg_stat_item idx, int val)
{
        struct mem_cgroup *memcg;

        if (mem_cgroup_disabled())
                return;

        rcu_read_lock();
        memcg = page_memcg(page);
        if (memcg)
                mod_memcg_state(memcg, idx, val);
        rcu_read_unlock();
}

unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                      enum node_stat_item idx);

void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);

void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);

/*
 * Interrupt-safe wrapper around __mod_lruvec_kmem_state(): local
 * interrupts are disabled around the update and restored afterwards.
 */
static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
                                         int val)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        __mod_lruvec_kmem_state(p, idx, val);
        local_irq_restore(irq_state);
}

void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                          unsigned long count);

/*
 * Interrupt-safe wrapper around __count_memcg_events(): local interrupts
 * are disabled around the update and restored afterwards.
 */
static inline void count_memcg_events(struct mem_cgroup *memcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
        unsigned long irq_state;

        local_irq_save(irq_state);
        __count_memcg_events(memcg, idx, count);
        local_irq_restore(irq_state);
}

/*
 * Count @nr events of type @idx against the memcg of @folio.  Folios
 * without a memcg are skipped.
 */
static inline void count_memcg_folio_events(struct folio *folio,
                enum vm_event_item idx, unsigned long nr)
{
        struct mem_cgroup *owner = folio_memcg(folio);

        if (!owner)
                return;

        count_memcg_events(owner, idx, nr);
}

/*
 * Count a single event of type @idx against the memcg of @mm's owner
 * task.  The lookup runs under rcu_read_lock().  Nothing happens when
 * the controller is disabled or no memcg is found.
 */
static inline void count_memcg_event_mm(struct mm_struct *mm,
                                        enum vm_event_item idx)
{
        struct mem_cgroup *owner_memcg;

        if (mem_cgroup_disabled())
                return;

        rcu_read_lock();
        owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        if (likely(owner_memcg))
                count_memcg_events(owner_memcg, idx, 1);
        rcu_read_unlock();
}

static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
{
        bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
                          event == MEMCG_SWAP_FAIL;

        atomic_long_inc(&memcg->memory_events_local[event]);
        if (!swap_event)
                cgroup_file_notify(&memcg->events_local_file);

        do {
                atomic_long_inc(&memcg->memory_events[event]);
                if (swap_event)
                        cgroup_file_notify(&memcg->swap_events_file);
                else
                        cgroup_file_notify(&memcg->events_file);

                if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                        break;
                if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                        break;
        } while ((memcg = parent_mem_cgroup(memcg)) &&
                 !mem_cgroup_is_root(memcg));
}

/*
 * Record memory event @event against the memcg of @mm's owner task.  The
 * lookup runs under rcu_read_lock().  Nothing happens when the
 * controller is disabled or no memcg is found.
 */
static inline void memcg_memory_event_mm(struct mm_struct *mm,
                                         enum memcg_memory_event event)
{
        struct mem_cgroup *owner_memcg;

        if (mem_cgroup_disabled())
                return;

        rcu_read_lock();
        owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        if (likely(owner_memcg))
                memcg_memory_event(owner_memcg, event);
        rcu_read_unlock();
}

void split_page_memcg(struct page *head, int old_order, int new_order);

unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                                gfp_t gfp_mask,
                                                unsigned long *total_scanned);

#else /* CONFIG_MEMCG */

/* Value of MEM_CGROUP_ID_SHIFT when CONFIG_MEMCG is not set. */
#define MEM_CGROUP_ID_SHIFT        0

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *page_memcg(struct page *page)
{
        return NULL;
}

/*
 * CONFIG_MEMCG=n stub: always returns NULL, but still warns (once) when
 * rcu_read_lock_held() is false.
 */
static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
{
        WARN_ON_ONCE(!rcu_read_lock_held());
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL; no reference is taken. */
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool folio_memcg_kmem(struct folio *folio)
{
        return false;
}

/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool PageMemcgKmem(struct page *page)
{
        return false;
}

/* CONFIG_MEMCG=n stub: every memcg pointer is reported as the root. */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
        return true;
}

/* CONFIG_MEMCG=n stub: the memory controller is always reported disabled. */
static inline bool mem_cgroup_disabled(void)
{
        return true;
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void memcg_memory_event_mm(struct mm_struct *mm,
                                         enum memcg_memory_event event)
{
}

/* CONFIG_MEMCG=n stub: reports zero for both *min and *low. */
static inline void mem_cgroup_protection(struct mem_cgroup *root,
                                         struct mem_cgroup *memcg,
                                         unsigned long *min,
                                         unsigned long *low)
{
        *min = *low = 0;
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                                                   struct mem_cgroup *memcg)
{
}

/* CONFIG_MEMCG=n stub: everything is reported as unprotected. */
static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
                                          struct mem_cgroup *memcg)
{
        return true;
}
/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        return false;
}

/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        return false;
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_commit_charge(struct folio *folio,
                struct mem_cgroup *memcg)
{
}

/* CONFIG_MEMCG=n stub: nothing is charged; always returns 0. */
static inline int mem_cgroup_charge(struct folio *folio,
                struct mm_struct *mm, gfp_t gfp)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: nothing is charged; always returns 0. */
static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
                gfp_t gfp, long nr_pages)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: nothing is charged; always returns 0. */
static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
                        struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_uncharge(struct folio *folio)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
                unsigned int nr_pages)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_replace_folio(struct folio *old,
                struct folio *new)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
}

/* CONFIG_MEMCG=n stub: @memcg is ignored; returns the node's own lruvec. */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
                                               struct pglist_data *pgdat)
{
        return &pgdat->__lruvec;
}

static inline struct lruvec *folio_lruvec(struct folio *folio)
{
        struct pglist_data *pgdat = folio_pgdat(folio);
        return &pgdat->__lruvec;
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: every mm is reported as matching. */
static inline bool mm_match_cgroup(struct mm_struct *mm,
                struct mem_cgroup *memcg)
{
        return true;
}

/* CONFIG_MEMCG=n stub: always returns NULL; no reference is taken. */
static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL; no reference is taken. */
static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
}

/* CONFIG_MEMCG=n stub: always reports success; no reference is taken. */
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
        return true;
}

/* CONFIG_MEMCG=n stub: always reports success; no reference is taken. */
static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
{
        return true;
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}

static inline struct lruvec *folio_lruvec_lock(struct folio *folio)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        spin_lock(&pgdat->__lruvec.lru_lock);
        return &pgdat->__lruvec;
}

static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        spin_lock_irq(&pgdat->__lruvec.lru_lock);
        return &pgdat->__lruvec;
}

static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
                unsigned long *flagsp)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
        return &pgdat->__lruvec;
}

/* CONFIG_MEMCG=n stub: there is nothing to iterate; always returns NULL. */
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
                struct mem_cgroup *prev,
                struct mem_cgroup_reclaim_cookie *reclaim)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
                                         struct mem_cgroup *prev)
{
}

/* CONFIG_MEMCG=n stub: no-op; @fn is never called. */
static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                int (*fn)(struct task_struct *, void *), void *arg)
{
}

/* CONFIG_MEMCG=n stub: always returns 0. */
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
        return 0;
}

/*
 * CONFIG_MEMCG=n stub: always returns NULL.  Warns (once) when called
 * with a non-zero @id.
 */
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
        WARN_ON_ONCE(id);
        /* XXX: This should always return root_mem_cgroup */
        return NULL;
}

#ifdef CONFIG_SHRINKER_DEBUG
/* CONFIG_MEMCG=n stub: always returns 0. */
static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
{
        return NULL;
}
#endif

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: always returns true. */
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
        return true;
}

/* CONFIG_MEMCG=n stub: always returns 0. */
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: always returns 0. */
static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: always returns 0. */
static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
        return 0;
}

/* CONFIG_MEMCG=n stub: prints nothing. */
static inline void
mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
}

/* CONFIG_MEMCG=n stub: prints nothing. */
static inline void
mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void folio_memcg_lock(struct folio *folio)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void folio_memcg_unlock(struct folio *folio)
{
}

/*
 * CONFIG_MEMCG=n variant: always succeeds.  The RCU read lock is still
 * taken and is left held on return; mem_cgroup_unlock_pages() drops it.
 */
static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
{
        /* to match folio_memcg_rcu() */
        rcu_read_lock();
        return true;
}

/* CONFIG_MEMCG=n variant: drops the RCU read lock taken by mem_cgroup_trylock_pages(). */
static inline void mem_cgroup_unlock_pages(void)
{
        rcu_read_unlock();
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_enter_user_fault(void)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_exit_user_fault(void)
{
}

/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool task_in_memcg_oom(struct task_struct *p)
{
        return false;
}

/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool mem_cgroup_oom_synchronize(bool wait)
{
        return false;
}

/* CONFIG_MEMCG=n stub: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_get_oom_group(
        struct task_struct *victim, struct mem_cgroup *oom_domain)
{
        return NULL;
}

/* CONFIG_MEMCG=n stub: prints nothing. */
static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
                                     enum memcg_stat_item idx,
                                     int nr)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   enum memcg_stat_item idx,
                                   int nr)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mod_memcg_page_state(struct page *page,
                                        enum memcg_stat_item idx, int val)
{
}

/* CONFIG_MEMCG=n stub: always returns 0. */
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
        return 0;
}

/*
 * CONFIG_MEMCG=n variant: returns node_page_state() for @idx on the node
 * that lruvec_pgdat() reports for @lruvec.
 */
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
{
        return node_page_state(lruvec_pgdat(lruvec), idx);
}

/*
 * CONFIG_MEMCG=n variant: identical to lruvec_page_state() here; returns
 * node_page_state() for @idx on the node of @lruvec.
 */
static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                                    enum node_stat_item idx)
{
        return node_page_state(lruvec_pgdat(lruvec), idx);
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
}

/*
 * CONFIG_MEMCG=n variant: apply @val to node stat @idx of the node that
 * holds the head page of address @p, via __mod_node_page_state().
 */
static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
                                           int val)
{
        struct page *head = virt_to_head_page(p);

        __mod_node_page_state(page_pgdat(head), idx, val);
}

/*
 * CONFIG_MEMCG=n variant: apply @val to node stat @idx of the node that
 * holds the head page of address @p, via mod_node_page_state().
 */
static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
                                         int val)
{
        struct page *head = virt_to_head_page(p);

        mod_node_page_state(page_pgdat(head), idx, val);
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void count_memcg_events(struct mem_cgroup *memcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void __count_memcg_events(struct mem_cgroup *memcg,
                                        enum vm_event_item idx,
                                        unsigned long count)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void count_memcg_folio_events(struct folio *folio,
                enum vm_event_item idx, unsigned long nr)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void split_page_memcg(struct page *head, int old_order, int new_order)
{
}

/* CONFIG_MEMCG=n stub: always returns 0; *total_scanned is left untouched. */
static inline
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                            gfp_t gfp_mask,
                                            unsigned long *total_scanned)
{
        return 0;
}
#endif /* CONFIG_MEMCG */

/*
 * Extended information for slab objects stored as an array in page->memcg_data
 * if MEMCG_DATA_OBJEXTS is set.
 *
 * Which members exist depends on the configuration: @objcg is present only
 * with CONFIG_MEMCG_KMEM and @ref only with CONFIG_MEM_ALLOC_PROFILING.
 * The structure is aligned to 8 bytes.
 */
struct slabobj_ext {
#ifdef CONFIG_MEMCG_KMEM
        /* obj_cgroup associated with the slab object */
        struct obj_cgroup *objcg;
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
        /* code tag reference for the slab object */
        union codetag_ref ref;
#endif
} __aligned(8);

/* Add 1 to stat @idx for the object at @p via __mod_lruvec_kmem_state(). */
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
{
        __mod_lruvec_kmem_state(p, idx, 1);
}

/* Subtract 1 from stat @idx for the object at @p via __mod_lruvec_kmem_state(). */
static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
{
        __mod_lruvec_kmem_state(p, idx, -1);
}

/*
 * Return the lruvec of the parent memcg of @lruvec's memcg on the same
 * node, or NULL when @lruvec has no memcg or that memcg has no parent.
 */
static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
{
        struct mem_cgroup *self = lruvec_memcg(lruvec);
        struct mem_cgroup *parent = self ? parent_mem_cgroup(self) : NULL;

        if (!parent)
                return NULL;

        return mem_cgroup_lruvec(parent, lruvec_pgdat(lruvec));
}

/* Release @lruvec->lru_lock with spin_unlock(). */
static inline void unlock_page_lruvec(struct lruvec *lruvec)
{
        spin_unlock(&lruvec->lru_lock);
}

/* Release @lruvec->lru_lock with spin_unlock_irq(). */
static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
{
        spin_unlock_irq(&lruvec->lru_lock);
}

/* Release @lruvec->lru_lock with spin_unlock_irqrestore(), restoring @flags. */
static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
                unsigned long flags)
{
        spin_unlock_irqrestore(&lruvec->lru_lock, flags);
}

/* Test requires a stable page->memcg binding, see page_memcg() */
static inline bool folio_matches_lruvec(struct folio *folio,
                struct lruvec *lruvec)
{
        return lruvec_pgdat(lruvec) == folio_pgdat(folio) &&
               lruvec_memcg(lruvec) == folio_memcg(folio);
}

/*
 * Don't lock again iff folio's lruvec locked.
 *
 * If @locked_lruvec already matches @folio it is returned as is.
 * Otherwise any held lruvec is unlocked (irq variant) and the folio's own
 * lruvec is locked with folio_lruvec_lock_irq() and returned.
 */
static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
                struct lruvec *locked_lruvec)
{
        if (!locked_lruvec)
                return folio_lruvec_lock_irq(folio);

        if (folio_matches_lruvec(folio, locked_lruvec))
                return locked_lruvec;

        unlock_page_lruvec_irq(locked_lruvec);

        return folio_lruvec_lock_irq(folio);
}

/*
 * Don't lock again iff folio's lruvec locked.
 *
 * If *@lruvecp already matches @folio nothing changes.  Otherwise any
 * held lruvec is unlocked (restoring *@flags) and *@lruvecp is replaced
 * by the folio's own lruvec, locked with folio_lruvec_lock_irqsave().
 */
static inline void folio_lruvec_relock_irqsave(struct folio *folio,
                struct lruvec **lruvecp, unsigned long *flags)
{
        struct lruvec *held = *lruvecp;

        if (held) {
                if (folio_matches_lruvec(folio, held))
                        return;

                unlock_page_lruvec_irqrestore(held, *flags);
        }

        *lruvecp = folio_lruvec_lock_irqsave(folio, flags);
}

#ifdef CONFIG_CGROUP_WRITEBACK

struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
                         unsigned long *pheadroom, unsigned long *pdirty,
                         unsigned long *pwriteback);

void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
                                             struct bdi_writeback *wb);

/*
 * Fast-path check for foreign dirtying: when @folio has a memcg whose css
 * differs from @wb->memcg_css, hand over to
 * mem_cgroup_track_foreign_dirty_slowpath().  Does nothing when the
 * memory controller is disabled or the folio has no memcg.
 */
static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                                                  struct bdi_writeback *wb)
{
        struct mem_cgroup *owner;

        if (mem_cgroup_disabled())
                return;

        owner = folio_memcg(folio);
        if (!owner)
                return;

        if (unlikely(&owner->css != wb->memcg_css))
                mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
}

void mem_cgroup_flush_foreign(struct bdi_writeback *wb);

#else        /* CONFIG_CGROUP_WRITEBACK */

/* CONFIG_CGROUP_WRITEBACK=n stub: always returns NULL. */
static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
{
        return NULL;
}

/* CONFIG_CGROUP_WRITEBACK=n stub: no-op; none of the outputs is written. */
static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
                                       unsigned long *pfilepages,
                                       unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
{
}

/* CONFIG_CGROUP_WRITEBACK=n stub: no-op. */
static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                                                  struct bdi_writeback *wb)
{
}

/* CONFIG_CGROUP_WRITEBACK=n stub: no-op. */
static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

struct sock;
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
                             gfp_t gfp_mask);
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
#ifdef CONFIG_MEMCG
extern struct static_key_false memcg_sockets_enabled_key;
/* Static-branch test (hinted unlikely) of memcg_sockets_enabled_key. */
#define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
void mem_cgroup_sk_alloc(struct sock *sk);
void mem_cgroup_sk_free(struct sock *sk);
/*
 * Report whether @memcg is under socket memory pressure.
 *
 * When the memory controller is not on the default hierarchy this is
 * simply @memcg->tcpmem_pressure.  On the default hierarchy, @memcg and
 * each of its ancestors are checked in turn; pressure is reported as soon
 * as one of them has a socket_pressure timestamp still ahead of jiffies.
 */
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
                for (;;) {
                        if (time_before(jiffies,
                                        READ_ONCE(memcg->socket_pressure)))
                                return true;

                        memcg = parent_mem_cgroup(memcg);
                        if (!memcg)
                                return false;
                }
        }

        return !!memcg->tcpmem_pressure;
}

int alloc_shrinker_info(struct mem_cgroup *memcg);
void free_shrinker_info(struct mem_cgroup *memcg);
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
void reparent_shrinker_deferred(struct mem_cgroup *memcg);
#else
/* Without CONFIG_MEMCG, socket memory accounting is constantly off. */
#define mem_cgroup_sockets_enabled 0
/* CONFIG_MEMCG=n stub: no-op (stray ';' after the body removed). */
static inline void mem_cgroup_sk_alloc(struct sock *sk) { }
/* CONFIG_MEMCG=n stub: no-op (stray ';' after the body removed). */
static inline void mem_cgroup_sk_free(struct sock *sk) { }
/* CONFIG_MEMCG=n stub: always returns false. */
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
        return false;
}

/* CONFIG_MEMCG=n stub: no-op. */
static inline void set_shrinker_bit(struct mem_cgroup *memcg,
                                    int nid, int shrinker_id)
{
}
#endif

#ifdef CONFIG_MEMCG_KMEM
bool mem_cgroup_kmem_disabled(void);
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);

/*
 * The returned objcg pointer is safe to use without additional
 * protection within a scope. The scope is defined either by
 * the current task (similar to the "current" global variable)
 * or by set_active_memcg() pair.
 * Please, use obj_cgroup_get() to get a reference if the pointer
 * needs to be used outside of the local scope.
 */
struct obj_cgroup *current_obj_cgroup(void);
struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);

/*
 * Look up the objcg returned by current_obj_cgroup() and, when there is one,
 * take a reference on it with obj_cgroup_get() before handing it back.
 *
 * Returns the objcg, or NULL when current_obj_cgroup() returned NULL.
 */
static inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
        struct obj_cgroup *cur = current_obj_cgroup();

        if (!cur)
                return cur;

        obj_cgroup_get(cur);
        return cur;
}

int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);

extern struct static_key_false memcg_bpf_enabled_key;
/*
 * Test the memcg_bpf_enabled_key static key; the branch is annotated as
 * likely to be taken.
 */
static inline bool memcg_bpf_enabled(void)
{
        return static_branch_likely(&memcg_bpf_enabled_key);
}

extern struct static_key_false memcg_kmem_online_key;

/*
 * Test the memcg_kmem_online_key static key; the branch is annotated as
 * likely to be taken.  The kmem charge/uncharge helpers below use this as
 * their gate.
 */
static inline bool memcg_kmem_online(void)
{
        return static_branch_likely(&memcg_kmem_online_key);
}

/*
 * Charge @page of the given @order through __memcg_kmem_charge_page(), but
 * only while memcg_kmem_online() is true.
 *
 * Returns 0 when kmem accounting is not online, otherwise whatever
 * __memcg_kmem_charge_page() returns.
 */
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                         int order)
{
        if (!memcg_kmem_online())
                return 0;

        return __memcg_kmem_charge_page(page, gfp, order);
}

/*
 * Uncharge @page of the given @order through __memcg_kmem_uncharge_page();
 * a no-op while memcg_kmem_online() is false.
 */
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
        if (!memcg_kmem_online())
                return;

        __memcg_kmem_uncharge_page(page, order);
}

/*
 * Accessor for a memcg's kmem_id, which is used to find the corresponding
 * LRU lists.  A NULL @memcg yields -1.
 */
static inline int memcg_kmem_id(struct mem_cgroup *memcg)
{
        if (!memcg)
                return -1;

        return memcg->kmemcg_id;
}

struct mem_cgroup *mem_cgroup_from_obj(void *p);
struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);

/*
 * Count one occurrence of vm event @idx against the memory cgroup that
 * @objcg resolves to.  Does nothing unless memcg_kmem_online() is true.
 */
static inline void count_objcg_event(struct obj_cgroup *objcg,
                                     enum vm_event_item idx)
{
        if (memcg_kmem_online()) {
                /* Both the objcg -> memcg lookup and its use stay under RCU. */
                rcu_read_lock();
                count_memcg_events(obj_cgroup_memcg(objcg), idx, 1);
                rcu_read_unlock();
        }
}

#else
/* Stub used when CONFIG_MEMCG_KMEM is not set: kmem is always disabled. */
static inline bool mem_cgroup_kmem_disabled(void)
{
        return true;
}

/*
 * Stub used when CONFIG_MEMCG_KMEM is not set: nothing is charged and 0 is
 * returned.
 */
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                         int order)
{
        return 0;
}

/* Stub used when CONFIG_MEMCG_KMEM is not set: does nothing. */
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
}

/*
 * Stub used when CONFIG_MEMCG_KMEM is not set: nothing is charged and 0 is
 * returned.
 */
static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                           int order)
{
        return 0;
}

/* Stub used when CONFIG_MEMCG_KMEM is not set: does nothing. */
static inline void __memcg_kmem_uncharge_page(struct page *page, int order)
{
}

/* Stub used when CONFIG_MEMCG_KMEM is not set: always returns NULL. */
static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
{
        return NULL;
}

/* Stub used when CONFIG_MEMCG_KMEM is not set: always returns false. */
static inline bool memcg_bpf_enabled(void)
{
        return false;
}

/* Stub used when CONFIG_MEMCG_KMEM is not set: kmem is never online. */
static inline bool memcg_kmem_online(void)
{
        return false;
}

/*
 * Stub used when CONFIG_MEMCG_KMEM is not set: always returns -1, the same
 * value the CONFIG_MEMCG_KMEM variant returns for a NULL memcg.
 */
static inline int memcg_kmem_id(struct mem_cgroup *memcg)
{
        return -1;
}

/* Stub used when CONFIG_MEMCG_KMEM is not set: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
{
        return NULL;
}

/* Stub used when CONFIG_MEMCG_KMEM is not set: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
{
        return NULL;
}

/* Stub used when CONFIG_MEMCG_KMEM is not set: no event is counted. */
static inline void count_objcg_event(struct obj_cgroup *objcg,
                                     enum vm_event_item idx)
{
}

#endif /* CONFIG_MEMCG_KMEM */

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg);
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size);
void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size);
bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg);
#else
/*
 * Stub used unless both CONFIG_MEMCG_KMEM and CONFIG_ZSWAP are set: always
 * returns true.
 */
static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
        return true;
}
/*
 * Stub used unless both CONFIG_MEMCG_KMEM and CONFIG_ZSWAP are set: does
 * nothing.
 */
static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg,
                                           size_t size)
{
}
/*
 * Stub used unless both CONFIG_MEMCG_KMEM and CONFIG_ZSWAP are set: does
 * nothing.
 */
static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg,
                                             size_t size)
{
}
/*
 * Stub used unless both CONFIG_MEMCG_KMEM and CONFIG_ZSWAP are set: always
 * returns true.
 */
static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
{
        /* if zswap is disabled, do not block pages going to the swapping device */
        return true;
}
#endif

#endif /* _LINUX_MEMCONTROL_H */











    9 
    3 


    2 


    4 


    2 



    4 










    2 














    3 













   17 










   15 






   16 










   17 



   18 



    5 







    3 
    2 
   15 




   17 





    3 














   14 






























    3 
    1 



    2 












































    3 













   16 






   13 





    3 
   14 



















    1 




    1 


















    1 
















    1 













    4 


    2 








    2 










   18 




   18 
   15 
   18 



   13 



























    2 


    2 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
// SPDX-License-Identifier: GPL-2.0
/*
 * Interface between ext4 and JBD
 */

#include "ext4_jbd2.h"

#include <trace/events/ext4.h>

/*
 * Work out which data journalling mode applies to @inode and return it as
 * one of the EXT4_INODE_*_DATA_MODE values.  BUG()s if the mount's
 * DATA_FLAGS match none of the known modes.
 */
int ext4_inode_journal_mode(struct inode *inode)
{
        /* No journal attached: writeback */
        if (!EXT4_JOURNAL(inode))
                return EXT4_INODE_WRITEBACK_DATA_MODE;

        /* We do not support data journalling with delayed allocation */
        if (!S_ISREG(inode->i_mode) ||
            ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
            test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
            (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
             !test_opt(inode->i_sb, DELALLOC))) {
                /*
                 * We do not support data journalling for encrypted data:
                 * encrypted regular files get ordered mode, everything
                 * else here gets journalled data.
                 */
                return (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) ?
                        EXT4_INODE_ORDERED_DATA_MODE :
                        EXT4_INODE_JOURNAL_DATA_MODE;
        }

        /* Otherwise the mode follows the mount option. */
        switch (test_opt(inode->i_sb, DATA_FLAGS)) {
        case EXT4_MOUNT_ORDERED_DATA:
                return EXT4_INODE_ORDERED_DATA_MODE;
        case EXT4_MOUNT_WRITEBACK_DATA:
                return EXT4_INODE_WRITEBACK_DATA_MODE;
        default:
                break;
        }
        BUG();
}

/* Just increment the non-pointer handle value */
static handle_t *ext4_get_nojournal(void)
{
        handle_t *handle = current->journal_info;
        unsigned long ref_cnt = (unsigned long)handle;

        BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);

        ref_cnt++;
        handle = (handle_t *)ref_cnt;

        current->journal_info = handle;
        return handle;
}


/*
 * Counterpart of ext4_get_nojournal(): treat @handle as a counter, drop it
 * by one and store the result in current->journal_info.  BUG()s if the
 * counter is already zero.
 */
static void ext4_put_nojournal(handle_t *handle)
{
        unsigned long nesting = (unsigned long)handle;

        BUG_ON(!nesting);

        current->journal_info = (handle_t *)(nesting - 1);
}

/*
 * Wrappers for jbd2_journal_start/end.
 */
/*
 * Common sanity checks run before a handle is started on @sb.
 *
 * Returns 0 when it is fine to proceed, -EIO when the filesystem has been
 * forcibly shut down, and -EROFS when the superblock is read-only or the
 * journal is found to be aborted.
 */
static int ext4_journal_check_start(struct super_block *sb)
{
        journal_t *jnl;

        might_sleep();

        if (unlikely(ext4_forced_shutdown(sb)))
                return -EIO;

        if (WARN_ON_ONCE(sb_rdonly(sb)))
                return -EROFS;

        WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);

        jnl = EXT4_SB(sb)->s_journal;
        if (!jnl || !is_journal_aborted(jnl))
                return 0;

        /*
         * Special case here: if the journal has aborted behind our
         * backs (eg. EIO in the commit thread), then we still need to
         * take the FS itself readonly cleanly.
         */
        ext4_abort(sb, -jnl->j_errno, "Detected aborted journal");
        return -EROFS;
}

/*
 * Start a handle on @sb, emitting the inode or superblock flavour of the
 * journal-start tracepoint depending on whether @inode was supplied.
 *
 * Returns an ERR_PTR() when ext4_journal_check_start() fails, a no-journal
 * pseudo handle when there is no journal or EXT4_FC_REPLAY is set in
 * s_mount_state, and otherwise the result of jbd2__journal_start().
 */
handle_t *__ext4_journal_start_sb(struct inode *inode,
                                  struct super_block *sb, unsigned int line,
                                  int type, int blocks, int rsv_blocks,
                                  int revoke_creds)
{
        journal_t *jnl;
        int rc;

        if (inode)
                trace_ext4_journal_start_inode(inode, blocks, rsv_blocks,
                                               revoke_creds, type, _RET_IP_);
        else
                trace_ext4_journal_start_sb(sb, blocks, rsv_blocks,
                                            revoke_creds, type, _RET_IP_);

        rc = ext4_journal_check_start(sb);
        if (rc < 0)
                return ERR_PTR(rc);

        jnl = EXT4_SB(sb)->s_journal;
        if (jnl && !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
                return jbd2__journal_start(jnl, blocks, rsv_blocks,
                                           revoke_creds, GFP_NOFS, type,
                                           line);

        return ext4_get_nojournal();
}

/*
 * Stop @handle.
 *
 * A no-journal pseudo handle is just released and 0 is returned.  For a
 * real handle the error already recorded in h_err takes precedence over the
 * result of jbd2_journal_stop().  When the handle was still attached to a
 * transaction, a non-zero result is also reported through
 * __ext4_std_error() with the caller location @where:@line.
 */
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
{
        struct super_block *sb;
        int result;
        int stop_rc;

        if (!ext4_handle_valid(handle)) {
                ext4_put_nojournal(handle);
                return 0;
        }

        result = handle->h_err;

        if (!handle->h_transaction) {
                stop_rc = jbd2_journal_stop(handle);
                return result ? result : stop_rc;
        }

        /* Fetch the superblock before the handle is stopped. */
        sb = handle->h_transaction->t_journal->j_private;
        stop_rc = jbd2_journal_stop(handle);

        result = result ? result : stop_rc;
        if (result)
                __ext4_std_error(sb, where, line, result);

        return result;
}

/*
 * Start a previously reserved @handle.
 *
 * A no-journal pseudo handle simply gets another reference.  Otherwise the
 * usual start checks are run; if they fail the reservation is freed and an
 * ERR_PTR() is returned.  An ERR_PTR() is also returned when
 * jbd2_journal_start_reserved() fails.  On success @handle is returned.
 */
handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
                                        int type)
{
        struct super_block *sb;
        int rc;

        if (!ext4_handle_valid(handle))
                return ext4_get_nojournal();

        sb = handle->h_journal->j_private;
        trace_ext4_journal_start_reserved(sb,
                                jbd2_handle_buffer_credits(handle), _RET_IP_);

        rc = ext4_journal_check_start(sb);
        if (rc < 0) {
                jbd2_journal_free_reserved(handle);
                return ERR_PTR(rc);
        }

        rc = jbd2_journal_start_reserved(handle, type, line);
        return rc < 0 ? ERR_PTR(rc) : handle;
}

/*
 * Make sure @handle has at least @check_cred buffer credits and
 * @revoke_cred revoke credits, extending it when it does not.
 *
 * Returns 0 for a no-journal handle or when the credits already suffice,
 * -EROFS when the handle is aborted, and otherwise the result of
 * ext4_journal_extend().  The extension asked for is the shortfall against
 * @extend_cred and @revoke_cred, never less than zero.
 */
int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
                                  int extend_cred, int revoke_cred)
{
        bool enough;

        if (!ext4_handle_valid(handle))
                return 0;
        if (is_handle_aborted(handle))
                return -EROFS;

        enough = jbd2_handle_buffer_credits(handle) >= check_cred &&
                 handle->h_revoke_credits >= revoke_cred;
        if (enough)
                return 0;

        extend_cred -= jbd2_handle_buffer_credits(handle);
        if (extend_cred < 0)
                extend_cred = 0;

        revoke_cred -= handle->h_revoke_credits;
        if (revoke_cred < 0)
                revoke_cred = 0;

        return ext4_journal_extend(handle, extend_cred, revoke_cred);
}

/*
 * Record @err on @handle and abort the handle, logging which journalling
 * call (@err_fn) failed and where it was made from (@caller:@line).
 *
 * An error already stored in handle->h_err is kept.  When the handle is
 * already aborted nothing is logged and no further abort is issued.
 * @handle must be a valid journal handle.
 */
static void ext4_journal_abort_handle(const char *caller, unsigned int line,
                                      const char *err_fn,
                                      struct buffer_head *bh,
                                      handle_t *handle, int err)
{
        char scratch[16];
        const char *msg = ext4_decode_error(NULL, err, scratch);

        BUG_ON(!ext4_handle_valid(handle));

        if (bh)
                BUFFER_TRACE(bh, "abort");

        if (handle->h_err == 0)
                handle->h_err = err;

        if (!is_handle_aborted(handle)) {
                printk(KERN_ERR
                       "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
                       caller, line, msg, err_fn);

                jbd2_journal_abort_handle(handle);
        }
}

/*
 * Check the block device mapping of @sb for a writeback error that has not
 * been seen yet and, if there is one, raise an ext4 error for it.
 */
static void ext4_check_bdev_write_error(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct address_space *bdev_mapping = sb->s_bdev->bd_mapping;
        int wb_err;

        /*
         * If the block device has write error flag, it may have failed to
         * async write out metadata buffers in the background. In this case,
         * we could read old data from disk and write it out again, which
         * may lead to on-disk filesystem inconsistency.
         */
        if (!errseq_check(&bdev_mapping->wb_err,
                          READ_ONCE(sbi->s_bdev_wb_err)))
                return;

        /* The recorded sequence value is advanced under s_bdev_wb_lock. */
        spin_lock(&sbi->s_bdev_wb_lock);
        wb_err = errseq_check_and_advance(&bdev_mapping->wb_err,
                                          &sbi->s_bdev_wb_err);
        spin_unlock(&sbi->s_bdev_wb_lock);

        if (wb_err)
                ext4_error_err(sb, -wb_err,
                               "Error while async write back metadata");
}

/*
 * Get write access to @bh under @handle.
 *
 * With a valid journal handle this goes through
 * jbd2_journal_get_write_access(); on failure the handle is aborted and the
 * error returned.  Without a journal, the block device is checked for
 * pending writeback errors instead.  Finally, unless @trigger_type is
 * EXT4_JTR_NONE or the filesystem has no metadata checksums, the matching
 * journal triggers are attached to @bh.
 *
 * Returns 0 on success or the jbd2 error.
 */
int __ext4_journal_get_write_access(const char *where, unsigned int line,
                                    handle_t *handle, struct super_block *sb,
                                    struct buffer_head *bh,
                                    enum ext4_journal_trigger_type trigger_type)
{
        int err;

        might_sleep();

        if (!ext4_handle_valid(handle)) {
                ext4_check_bdev_write_error(sb);
        } else {
                err = jbd2_journal_get_write_access(handle, bh);
                if (err) {
                        ext4_journal_abort_handle(where, line, __func__, bh,
                                                  handle, err);
                        return err;
                }
        }

        if (trigger_type != EXT4_JTR_NONE && ext4_has_metadata_csum(sb)) {
                BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
                jbd2_journal_set_triggers(bh,
                        &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers);
        }
        return 0;
}

/*
 * The ext4 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (eg. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 *
 * @where, @line: caller location, forwarded to the error reporting helpers
 * @handle:       handle the forget/revoke is issued under; when it is not a
 *                valid journal handle the buffer is simply bforget()'ed
 * @is_metadata:  non-zero when the block being freed holds metadata
 * @inode:        inode the block belongs to
 * @bh:           buffer for the block, see above
 * @blocknr:      block number handed to jbd2_journal_revoke()
 *
 * Returns 0 on success, or the error from jbd2_journal_forget() or
 * jbd2_journal_revoke() after ext4_journal_abort_handle() has been called
 * on the handle.
 */
int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
                  int is_metadata, struct inode *inode,
                  struct buffer_head *bh, ext4_fsblk_t blocknr)
{
        int err;

        might_sleep();

        trace_ext4_forget(inode, is_metadata, blocknr);
        BUFFER_TRACE(bh, "enter");

        ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n",
                  bh, is_metadata, inode->i_mode,
                  test_opt(inode->i_sb, DATA_FLAGS));

        /* In the no journal case, we can just do a bforget and return */
        if (!ext4_handle_valid(handle)) {
                bforget(bh);
                return 0;
        }

        /* Never use the revoke function if we are doing full data
         * journaling: there is no need to, and a V1 superblock won't
         * support it.  Otherwise, only skip the revoke on un-journaled
         * data blocks. */

        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
            (!is_metadata && !ext4_should_journal_data(inode))) {
                /* Forget path: with no buffer there is nothing to do. */
                if (bh) {
                        BUFFER_TRACE(bh, "call jbd2_journal_forget");
                        err = jbd2_journal_forget(handle, bh);
                        if (err)
                                ext4_journal_abort_handle(where, line, __func__,
                                                          bh, handle, err);
                        return err;
                }
                return 0;
        }

        /*
         * data!=journal && (is_metadata || should_journal_data(inode))
         */
        BUFFER_TRACE(bh, "call jbd2_journal_revoke");
        err = jbd2_journal_revoke(handle, blocknr, bh);
        if (err) {
                /* Abort the handle first, then raise a filesystem error. */
                ext4_journal_abort_handle(where, line, __func__,
                                          bh, handle, err);
                __ext4_error(inode->i_sb, where, line, true, -err, 0,
                             "error %d when attempting revoke", err);
        }
        BUFFER_TRACE(bh, "exit");
        return err;
}

/*
 * Get create access to @bh under @handle.
 *
 * Nothing is done for a no-journal handle.  Otherwise
 * jbd2_journal_get_create_access() is called; on failure the handle is
 * aborted and the error returned.  On success, unless @trigger_type is
 * EXT4_JTR_NONE or the filesystem has no metadata checksums, the matching
 * journal triggers are attached to @bh.
 *
 * Returns 0 on success or the jbd2 error.
 */
int __ext4_journal_get_create_access(const char *where, unsigned int line,
                                handle_t *handle, struct super_block *sb,
                                struct buffer_head *bh,
                                enum ext4_journal_trigger_type trigger_type)
{
        int err;

        if (!ext4_handle_valid(handle))
                return 0;

        err = jbd2_journal_get_create_access(handle, bh);
        if (err) {
                ext4_journal_abort_handle(where, line, __func__, bh, handle,
                                          err);
                return err;
        }

        if (trigger_type != EXT4_JTR_NONE && ext4_has_metadata_csum(sb)) {
                BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
                jbd2_journal_set_triggers(bh,
                        &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers);
        }
        return 0;
}

/*
 * Mark the metadata buffer @bh dirty, through the journal when @handle is a
 * valid journal handle and directly in the buffer cache otherwise.
 *
 * @where, @line: caller location, forwarded to the error reporting helpers
 * @handle:       handle the buffer is dirtied under
 * @inode:        inode the buffer belongs to, may be NULL
 * @bh:           buffer to dirty; it is flagged meta, prio and uptodate first
 *
 * Returns 0 on success, the error from jbd2_journal_dirty_metadata(), or
 * -EIO when the synchronous write-out done for an inode needing sync fails
 * in the no-journal case.
 */
int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                                 handle_t *handle, struct inode *inode,
                                 struct buffer_head *bh)
{
        int err = 0;

        might_sleep();

        set_buffer_meta(bh);
        set_buffer_prio(bh);
        set_buffer_uptodate(bh);
        if (ext4_handle_valid(handle)) {
                err = jbd2_journal_dirty_metadata(handle, bh);
                /* Errors can only happen due to aborted journal or a nasty bug */
                if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) {
                        ext4_journal_abort_handle(where, line, __func__, bh,
                                                  handle, err);
                        /* Without an inode only a plain log message is emitted. */
                        if (inode == NULL) {
                                pr_err("EXT4: jbd2_journal_dirty_metadata "
                                       "failed: handle type %u started at "
                                       "line %u, credits %u/%u, errcode %d",
                                       handle->h_type,
                                       handle->h_line_no,
                                       handle->h_requested_credits,
                                       jbd2_handle_buffer_credits(handle), err);
                                return err;
                        }
                        ext4_error_inode(inode, where, line,
                                         bh->b_blocknr,
                                         "journal_dirty_metadata failed: "
                                         "handle type %u started at line %u, "
                                         "credits %u/%u, errcode %d",
                                         handle->h_type,
                                         handle->h_line_no,
                                         handle->h_requested_credits,
                                         jbd2_handle_buffer_credits(handle),
                                         err);
                }
        } else {
                /* No journal: dirty the buffer directly. */
                if (inode)
                        mark_buffer_dirty_inode(bh, inode);
                else
                        mark_buffer_dirty(bh);
                /* Write the buffer out right away when the inode needs sync. */
                if (inode && inode_needs_sync(inode)) {
                        sync_dirty_buffer(bh);
                        if (buffer_req(bh) && !buffer_uptodate(bh)) {
                                ext4_error_inode_err(inode, where, line,
                                                     bh->b_blocknr, EIO,
                                        "IO error syncing itable block");
                                err = -EIO;
                        }
                }
        }
        return err;
}




























































    1 




























































































    1 




    1 



















































    1 













    1 



































    3 







    1 



    3 


    3 



    3 







    3 

















    3 




    1 


    1 




    1 

    1 






    1 










    1 
    1 

















    1 


















    1 





    1 










    1 







































    1 







    1 
    1 






































    1 




    1 











































    1 



    1 
















    1 




















    1 















    1 







    1 

    1 






















































































    3 










    1 

    3 



















































































































































    1 












    1 





    1 
    1 






    1 









    1 













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/file.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 fs regular file handling primitives
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *        (jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

/*
 * Returns %true if the given DIO request should be attempted with DIO, or
 * %false if it should fall back to buffered I/O.
 *
 * DIO isn't well specified; when it's unsupported (either due to the request
 * being misaligned, or due to the file not supporting DIO at all), filesystems
 * either fall back to buffered I/O or return EINVAL.  For files that don't use
 * any special features like encryption or verity, ext4 has traditionally
 * returned EINVAL for misaligned DIO.  iomap_dio_rw() uses this convention too.
 * In this case, we should attempt the DIO, *not* fall back to buffered I/O.
 *
 * In contrast, in cases where DIO is unsupported due to ext4 features, ext4
 * traditionally falls back to buffered I/O.
 *
 * This function implements the traditional ext4 behavior in all these cases.
 */
static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
{
        u32 align = ext4_dio_alignment(file_inode(iocb->ki_filp));

        switch (align) {
        case 0:
                /* Caller falls back to buffered I/O. */
                return false;
        case 1:
                /* Nothing to check, attempt the DIO. */
                return true;
        default:
                break;
        }

        /* Both the file position and the iterator must honour the alignment. */
        return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), align);
}

/*
 * Direct I/O read path.  The inode lock is taken shared (only tried, with
 * -EAGAIN on contention, for IOCB_NOWAIT requests) and the read is handed to
 * iomap_dio_rw(), unless ext4_should_use_dio() asks for the buffered path.
 */
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t nread;

        if (!(iocb->ki_flags & IOCB_NOWAIT))
                inode_lock_shared(inode);
        else if (!inode_trylock_shared(inode))
                return -EAGAIN;

        if (ext4_should_use_dio(iocb, to)) {
                nread = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0,
                                     NULL, 0);
                inode_unlock_shared(inode);

                file_accessed(iocb->ki_filp);
                return nread;
        }

        inode_unlock_shared(inode);
        /*
         * Fallback to buffered I/O if the operation being performed on
         * the inode is not supported by direct I/O. The IOCB_DIRECT
         * flag needs to be cleared here in order to ensure that the
         * direct I/O path within generic_file_read_iter() is not
         * taken.
         */
        iocb->ki_flags &= ~IOCB_DIRECT;
        return generic_file_read_iter(iocb, to);
}

#ifdef CONFIG_FS_DAX
/*
 * DAX read path.  The inode lock is taken shared (only tried, with -EAGAIN
 * on contention, for IOCB_NOWAIT requests) and the read is handed to
 * dax_iomap_rw(), unless the inode turns out not to be DAX once locked.
 */
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t nread;

        if (!(iocb->ki_flags & IOCB_NOWAIT))
                inode_lock_shared(inode);
        else if (!inode_trylock_shared(inode))
                return -EAGAIN;

        /*
         * Recheck under inode lock - at this point we are sure it cannot
         * change anymore
         */
        if (IS_DAX(inode)) {
                nread = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
                inode_unlock_shared(inode);

                file_accessed(iocb->ki_filp);
                return nread;
        }

        inode_unlock_shared(inode);
        /* Fallback to buffered IO in case we cannot support DAX */
        return generic_file_read_iter(iocb, to);
}
#endif

/*
 * Read entry point: fails with -EIO after a forced shutdown, returns 0 for
 * an empty request, and otherwise dispatches to the DAX, direct I/O or
 * generic buffered read path.
 */
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        /* Nothing to read: skip atime */
        if (iov_iter_count(to) == 0)
                return 0;

#ifdef CONFIG_FS_DAX
        if (IS_DAX(inode))
                return ext4_dax_read_iter(iocb, to);
#endif

        return (iocb->ki_flags & IOCB_DIRECT) ?
                ext4_dio_read_iter(iocb, to) :
                generic_file_read_iter(iocb, to);
}

/*
 * Splice-read entry point: fails with -EIO after a forced shutdown and
 * otherwise defers to filemap_splice_read().
 */
static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos,
                                     struct pipe_inode_info *pipe,
                                     size_t len, unsigned int flags)
{
        if (unlikely(ext4_forced_shutdown(file_inode(in)->i_sb)))
                return -EIO;

        return filemap_splice_read(in, ppos, pipe, len, flags);
}

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
        if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
                ext4_alloc_da_blocks(inode);
                ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
        }
        /* if we are the last writer on the inode, drop the block reservation */
        if ((filp->f_mode & FMODE_WRITE) &&
                        (atomic_read(&inode->i_writecount) == 1) &&
                        !EXT4_I(inode)->i_reserved_data_blocks) {
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
        if (is_dx(inode) && filp->private_data)
                ext4_htree_free_dir_info(filp->private_data);

        return 0;
}

/*
 * Report whether the IO is NOT block-aligned, i.e. whether the file position
 * or the iterator's alignment has any bit set below the block size.
 *
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and
 * converts them to written only after the IO is complete.  Until then the
 * blocks appear as holes, so dio_zero_block() will assume that it needs to
 * zero out portions of the start and/or end block.  Two AIO threads working
 * on the same unwritten block must therefore be synchronized, or one thread
 * will zero the other's data, causing corruption.
 */
static bool
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
        unsigned long blockmask = inode->i_sb->s_blocksize - 1;

        return ((pos | iov_iter_alignment(from)) & blockmask) != 0;
}

/*
 * Return true when the range [offset, offset + len) ends beyond either the
 * in-core i_size or ext4's i_disksize.
 */
static bool
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
{
        return offset + len > i_size_read(inode) ||
               offset + len > EXT4_I(inode)->i_disksize;
}

/*
 * Is IO overwriting allocated or initialized blocks?
 *
 * Returns true only when the range lies within i_size and a lookup-only
 * ext4_map_blocks() call covers every requested block in one mapping.
 * *unwritten is written only on a true return; callers must initialize it.
 */
static bool ext4_overwrite_io(struct inode *inode,
                              loff_t pos, loff_t len, bool *unwritten)
{
        struct ext4_map_blocks map;
        unsigned int blkbits = inode->i_blkbits;
        int nr_wanted, nr_mapped;

        if (pos + len > i_size_read(inode))
                return false;

        map.m_lblk = pos >> blkbits;
        map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
        nr_wanted = map.m_len;

        nr_mapped = ext4_map_blocks(NULL, inode, &map, 0);
        if (nr_mapped == nr_wanted) {
                /*
                 * Every block is already allocated, whether initialized or
                 * not; m_flags tells the unwritten extents apart.
                 */
                *unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
                return true;
        }

        return false;
}

/*
 * Common pre-write validation.
 *
 * Rejects immutable inodes with -EPERM, runs generic_write_checks(), and for
 * files without the EXTENTS flag enforces the smaller s_bitmap_maxbytes
 * limit (-EFBIG at or past the limit, otherwise @from is truncated to fit).
 *
 * Returns the number of bytes left in @from, or a value <= 0 passed through
 * from generic_write_checks() / one of the errors above.
 */
static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
                                         struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        struct ext4_sb_info *sbi;
        ssize_t ret;

        if (unlikely(IS_IMMUTABLE(inode)))
                return -EPERM;

        ret = generic_write_checks(iocb, from);
        if (ret <= 0)
                return ret;

        /* Extent-mapped files are bounded by s_maxbytes; nothing more to do. */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return iov_iter_count(from);

        /*
         * A bitmap-format file has a size limit that is smaller than
         * s_maxbytes.
         */
        sbi = EXT4_SB(inode->i_sb);
        if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
                return -EFBIG;
        iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);

        return iov_iter_count(from);
}

/*
 * ext4_generic_write_checks() followed by file_modified().
 *
 * Returns the byte count from the generic checks on success, that count
 * unchanged when it is <= 0, or the error reported by file_modified().
 */
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
        ssize_t count = ext4_generic_write_checks(iocb, from);
        ssize_t err;

        if (count <= 0)
                return count;

        err = file_modified(iocb->ki_filp);
        return err ? err : count;
}

/*
 * Buffered write path.
 *
 * IOCB_NOWAIT is not supported here (-EOPNOTSUPP).  The write checks and
 * generic_perform_write() run under the exclusive inode lock; after the lock
 * is dropped a positive result is passed through generic_write_sync().
 *
 * Returns the result of generic_write_sync() for a positive byte count,
 * otherwise the zero/negative value from the checks or the write itself.
 */
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
                                        struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t written;

        if (iocb->ki_flags & IOCB_NOWAIT)
                return -EOPNOTSUPP;

        inode_lock(inode);
        written = ext4_write_checks(iocb, from);
        if (written > 0)
                written = generic_perform_write(iocb, from);
        inode_unlock(inode);

        if (likely(written > 0))
                return generic_write_sync(iocb, written);

        return written;
}

/*
 * Update the inode size after a write ending at @offset + @count and, when
 * the inode still has links, take it off the orphan list.  All of this is
 * done inside one journal handle.
 *
 * Caller must hold i_rwsem for writing.
 *
 * Returns @count on success, or the error from starting the handle or from
 * marking the inode dirty.
 */
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
                                           ssize_t count)
{
        handle_t *handle;
        ssize_t ret = count;

        lockdep_assert_held_write(&inode->i_rwsem);
        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        /* Only dirty the inode when the size update asked for it. */
        if (ext4_update_inode_size(inode, offset + count)) {
                int err = ext4_mark_inode_dirty(handle, inode);

                if (unlikely(err)) {
                        /* Bail out without touching the orphan list. */
                        ret = err;
                        goto stop;
                }
        }

        if (inode->i_nlink)
                ext4_orphan_del(handle, inode);
stop:
        ext4_journal_stop(handle);

        return ret;
}

/*
 * Clean up the inode after a DIO or DAX extending write has completed and
 * the inode size has been updated using ext4_handle_inode_extension().
 *
 * A negative @count means the write failed: the failed write is truncated
 * away.  Otherwise the only work left is to make sure the inode is no longer
 * on the orphan list.  Caller must hold i_rwsem for writing.
 */
static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count)
{
        handle_t *handle;

        lockdep_assert_held_write(&inode->i_rwsem);

        if (count < 0) {
                ext4_truncate_failed_write(inode);
                /*
                 * If the truncate failed early the inode may still sit on
                 * the orphan list; try to remove it from the in-memory
                 * linked list in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
                return;
        }

        /* Nothing to do unless a still-linked inode is on the orphan list. */
        if (list_empty(&EXT4_I(inode)->i_orphan) || !inode->i_nlink)
                return;

        /*
         * i_disksize may have been extended while the DIO was running
         * (writeback of delalloc blocks, or an extending truncate), in which
         * case ext4_handle_inode_extension() did not get to clean up the
         * orphan list.  Do it now.
         */
        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
        if (IS_ERR(handle)) {
                /*
                 * The write itself completed successfully and there is not
                 * much to do with this error, so just clean up the orphan
                 * list and hope for the best.
                 */
                ext4_orphan_del(NULL, inode);
                return;
        }

        ext4_orphan_del(handle, inode);
        ext4_journal_stop(handle);
}

/*
 * ->end_io hook for ext4 direct I/O writes (installed in ext4_dio_write_ops).
 *
 * Converts unwritten extents covering the completed range when the
 * completion carries IOMAP_DIO_UNWRITTEN, then extends the inode size if the
 * write ended past i_disksize or i_size.
 *
 * Returns @error (or the conversion error) on failure, @size when no size
 * update is needed, otherwise the result of ext4_handle_inode_extension().
 */
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
                                 int error, unsigned int flags)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        loff_t pos = iocb->ki_pos;

        if (!error && size && (flags & IOMAP_DIO_UNWRITTEN))
                error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
        if (error)
                return error;

        /*
         * EXT4_I(inode)->i_disksize can get extended up to inode->i_size
         * while the I/O was running, due to writeback of delalloc blocks.
         * ext4_iomap_alloc() is careful to use zeroed/unwritten extents
         * where possible, so no uninitialized blocks are left in the file
         * even if less was written than intended.  We can also race with
         * truncate or with a write expanding the file, hence the careful
         * double check below.
         */
        if (pos + size > READ_ONCE(EXT4_I(inode)->i_disksize) ||
            pos + size > i_size_read(inode))
                return ext4_handle_inode_extension(inode, pos, size);

        return size;
}

/*
 * iomap direct I/O callbacks for writes; passed to iomap_dio_rw() by
 * ext4_dio_write_iter().
 */
static const struct iomap_dio_ops ext4_dio_write_ops = {
        .end_io = ext4_dio_write_end_io,
};

/*
 * The intention here is to start with shared lock acquired then see if any
 * condition requires an exclusive inode lock. If yes, then we restart the
 * whole operation by releasing the shared lock and acquiring exclusive lock.
 *
 * - For unaligned_io to unwritten blocks we do not keep the shared lock, as
 *   it may cause data corruption when two unaligned IOs try to modify the
 *   same block e.g. while zeroing. Unaligned pure overwrites of written
 *   blocks may stay under the shared lock.
 *
 * - For extending writes case we don't take the shared lock, since it requires
 *   updating inode i_disksize and/or orphan handling with exclusive lock.
 *
 * - shared locking will only be true mostly with overwrites, including
 *   initialized blocks and unwritten blocks. For overwrite unwritten blocks
 *   we protect splitting extents by i_data_sem in ext4_inode_info, so we can
 *   also release exclusive i_rwsem lock.
 *
 * - Otherwise we will switch to exclusive i_rwsem lock.
 *
 * Locking contract: called with i_rwsem held, shared when *ilock_shared is
 * true and exclusive otherwise. On success the byte count (> 0) is returned
 * with the lock still held, and *ilock_shared reflects the lock mode now in
 * effect. On any return value <= 0 the lock has already been dropped here.
 *
 * Outputs: *extend is recomputed on every pass; *unwritten is only written
 * by ext4_overwrite_io() when it reports an overwrite, so the caller must
 * initialize it; *dio_flags is set to IOMAP_DIO_FORCE_WAIT for exclusive
 * unaligned or extending I/O and left untouched otherwise.
 */
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
                                     bool *ilock_shared, bool *extend,
                                     bool *unwritten, int *dio_flags)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        loff_t offset;
        size_t count;
        ssize_t ret;
        bool overwrite, unaligned_io;

        /* Re-entered after a lock upgrade: every check is redone from scratch. */
restart:
        ret = ext4_generic_write_checks(iocb, from);
        if (ret <= 0)
                goto out;

        offset = iocb->ki_pos;
        count = ret;

        unaligned_io = ext4_unaligned_io(inode, from, offset);
        *extend = ext4_extending_io(inode, offset, count);
        overwrite = ext4_overwrite_io(inode, offset, count, unwritten);

        /*
         * Determine whether we need to upgrade to an exclusive lock. This is
         * required to change security info in file_modified(), for extending
         * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten
         * extents (as partial block zeroing may be required).
         *
         * Note that unaligned writes are allowed under shared lock so long as
         * they are pure overwrites. Otherwise, concurrent unaligned writes risk
         * data corruption due to partial block zeroing in the dio layer, and so
         * the I/O must occur exclusively.
         */
        if (*ilock_shared &&
            ((!IS_NOSEC(inode) || *extend || !overwrite ||
             (unaligned_io && *unwritten)))) {
                /* Upgrading means blocking on the lock; not allowed for NOWAIT. */
                if (iocb->ki_flags & IOCB_NOWAIT) {
                        ret = -EAGAIN;
                        goto out;
                }
                inode_unlock_shared(inode);
                *ilock_shared = false;
                inode_lock(inode);
                goto restart;
        }

        /*
         * Now that locking is settled, determine dio flags and exclusivity
         * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce
         * behavior already. The inode lock is already held exclusive if the
         * write is non-overwrite or extending, so drain all outstanding dio and
         * set the force wait dio flag.
         */
        if (!*ilock_shared && (unaligned_io || *extend)) {
                if (iocb->ki_flags & IOCB_NOWAIT) {
                        ret = -EAGAIN;
                        goto out;
                }
                if (unaligned_io && (!overwrite || *unwritten))
                        inode_dio_wait(inode);
                *dio_flags = IOMAP_DIO_FORCE_WAIT;
        }

        ret = file_modified(file);
        if (ret < 0)
                goto out;

        /* Success: return with the inode lock still held for the caller. */
        return count;
out:
        /* Failure (or nothing to write): drop whichever lock mode we hold. */
        if (*ilock_shared)
                inode_unlock_shared(inode);
        else
                inode_unlock(inode);
        return ret;
}

/*
 * Direct I/O write path.
 *
 * Takes i_rwsem (shared unless the unlocked size check below suggests an
 * extending write; trylock variants for IOCB_NOWAIT), falls back to
 * ext4_buffered_write_iter() when ext4_should_use_dio() rejects the request,
 * and otherwise submits the write through iomap_dio_rw(). For extending
 * writes the inode is put on the orphan list before the I/O and cleaned up
 * afterwards. Bytes still left in @from after a non-negative result are
 * written through the buffered path, then written back and invalidated from
 * the page cache.
 *
 * Returns the number of bytes written or a negative error code.
 */
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        ssize_t ret;
        handle_t *handle;
        struct inode *inode = file_inode(iocb->ki_filp);
        loff_t offset = iocb->ki_pos;
        size_t count = iov_iter_count(from);
        const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
        bool extend = false, unwritten = false;
        bool ilock_shared = true;
        int dio_flags = 0;

        /*
         * Quick check here without any i_rwsem lock to see if it is extending
         * IO. A more reliable check is done in ext4_dio_write_checks() with
         * proper locking in place.
         */
        if (offset + count > i_size_read(inode))
                ilock_shared = false;

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (ilock_shared) {
                        if (!inode_trylock_shared(inode))
                                return -EAGAIN;
                } else {
                        if (!inode_trylock(inode))
                                return -EAGAIN;
                }
        } else {
                if (ilock_shared)
                        inode_lock_shared(inode);
                else
                        inode_lock(inode);
        }

        /* Fallback to buffered I/O if the inode does not support direct I/O. */
        if (!ext4_should_use_dio(iocb, from)) {
                if (ilock_shared)
                        inode_unlock_shared(inode);
                else
                        inode_unlock(inode);
                return ext4_buffered_write_iter(iocb, from);
        }

        /*
         * Prevent inline data from being created since we are going to allocate
         * blocks for DIO. We know the inode does not currently have inline data
         * because ext4_should_use_dio() checked for it, but we have to clear
         * the state flag before the write checks because a lock cycle could
         * introduce races with other writers.
         */
        ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);

        ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
                                    &unwritten, &dio_flags);
        /* On failure ext4_dio_write_checks() has already dropped the lock. */
        if (ret <= 0)
                return ret;

        offset = iocb->ki_pos;
        count = ret;

        if (extend) {
                /* Put the inode on the orphan list before the extending I/O. */
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
                }

                ret = ext4_orphan_add(handle, inode);
                if (ret) {
                        ext4_journal_stop(handle);
                        goto out;
                }

                ext4_journal_stop(handle);
        }

        /* Shared lock and no unwritten blocks: use the overwrite-only ops. */
        if (ilock_shared && !unwritten)
                iomap_ops = &ext4_iomap_overwrite_ops;
        ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
                           dio_flags, NULL, 0);
        /*
         * -ENOTBLK is reported as zero bytes written so that whatever remains
         * in @from is handled by the buffered fallback below.
         */
        if (ret == -ENOTBLK)
                ret = 0;
        if (extend) {
                /*
                 * We always perform extending DIO write synchronously so by
                 * now the IO is completed and ext4_handle_inode_extension()
                 * was called. Cleanup the inode in case of error or race with
                 * writeback of delalloc blocks.
                 */
                WARN_ON_ONCE(ret == -EIOCBQUEUED);
                ext4_inode_extension_cleanup(inode, ret);
        }

out:
        if (ilock_shared)
                inode_unlock_shared(inode);
        else
                inode_unlock(inode);

        if (ret >= 0 && iov_iter_count(from)) {
                ssize_t err;
                loff_t endbyte;

                offset = iocb->ki_pos;
                err = ext4_buffered_write_iter(iocb, from);
                if (err < 0)
                        return err;

                /*
                 * We need to ensure that the pages within the page cache for
                 * the range covered by this I/O are written to disk and
                 * invalidated. This is in attempt to preserve the expected
                 * direct I/O semantics in the case we fallback to buffered I/O
                 * to complete off the I/O request.
                 */
                ret += err;
                endbyte = offset + err - 1;
                err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
                                                   offset, endbyte);
                /* A writeback error is not returned; it only skips invalidation. */
                if (!err)
                        invalidate_mapping_pages(iocb->ki_filp->f_mapping,
                                                 offset >> PAGE_SHIFT,
                                                 endbyte >> PAGE_SHIFT);
        }

        return ret;
}

#ifdef CONFIG_FS_DAX
/*
 * DAX write path.
 *
 * Runs entirely under the exclusive inode lock (trylock for IOCB_NOWAIT).
 * When the write ends beyond i_disksize the inode is first added to the
 * orphan list; after dax_iomap_rw() the size is updated through
 * ext4_handle_inode_extension() and ext4_inode_extension_cleanup() finishes
 * the orphan handling. A positive result goes through generic_write_sync()
 * after the lock is dropped.
 *
 * Returns the number of bytes written or a negative error code.
 */
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        ssize_t ret;
        size_t count;
        loff_t offset;
        handle_t *handle;
        bool extend = false;
        struct inode *inode = file_inode(iocb->ki_filp);

        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!inode_trylock(inode))
                        return -EAGAIN;
        } else {
                inode_lock(inode);
        }

        ret = ext4_write_checks(iocb, from);
        if (ret <= 0)
                goto out;

        offset = iocb->ki_pos;
        count = iov_iter_count(from);

        if (offset + count > EXT4_I(inode)->i_disksize) {
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
                }

                ret = ext4_orphan_add(handle, inode);
                if (ret) {
                        ext4_journal_stop(handle);
                        goto out;
                }

                /* Only set once the inode is actually on the orphan list. */
                extend = true;
                ext4_journal_stop(handle);
        }

        ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);

        if (extend) {
                /*
                 * NOTE(review): the cleanup only truncates when ret is
                 * negative. If dax_iomap_rw() can return a short,
                 * non-negative count, confirm whether blocks mapped beyond
                 * the bytes actually written need to be trimmed here too.
                 */
                ret = ext4_handle_inode_extension(inode, offset, ret);
                ext4_inode_extension_cleanup(inode, ret);
        }
out:
        inode_unlock(inode);
        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
        return ret;
}
#endif

/*
 * ->write_iter() entry point.  After the forced-shutdown check (-EIO) the
 * request is routed to the DAX path (when built in and the inode is DAX),
 * to the direct I/O path for IOCB_DIRECT, or to the buffered path.
 */
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

#ifdef CONFIG_FS_DAX
        if (IS_DAX(inode))
                return ext4_dax_write_iter(iocb, from);
#endif
        if (!(iocb->ki_flags & IOCB_DIRECT))
                return ext4_buffered_write_iter(iocb, from);

        return ext4_dio_write_iter(iocb, from);
}

#ifdef CONFIG_FS_DAX
/*
 * Fault handler for DAX mappings; @order is passed straight through to
 * dax_iomap_fault() and dax_finish_sync_fault().
 *
 * Write faults on shared mappings are bracketed by sb_start_pagefault() /
 * sb_end_pagefault(), update the file time, and run the fault inside a
 * journal handle. All faults hold the mapping's invalidate lock in shared
 * mode around dax_iomap_fault().
 *
 * Returns the vm_fault_t from dax_iomap_fault() (or from
 * dax_finish_sync_fault() for synchronous faults), or VM_FAULT_SIGBUS when
 * the journal handle cannot be started.
 */
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
{
        int error = 0;
        vm_fault_t result;
        int retries = 0;
        handle_t *handle = NULL;
        struct inode *inode = file_inode(vmf->vma->vm_file);
        struct super_block *sb = inode->i_sb;

        /*
         * We have to distinguish real writes from writes which will result in a
         * COW page; COW writes should *not* poke the journal (the file will not
         * be changed). Doing so would cause unintended failures when mounted
         * read-only.
         *
         * We check for VM_SHARED rather than vmf->cow_page since the latter is
         * unset for order != 0 (i.e. only in do_cow_fault); for
         * other sizes, dax_iomap_fault will handle splitting / fallback so that
         * we eventually come back with a COW page.
         */
        bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
                (vmf->vma->vm_flags & VM_SHARED);
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        pfn_t pfn;

        if (write) {
                sb_start_pagefault(sb);
                file_update_time(vmf->vma->vm_file);
                filemap_invalidate_lock_shared(mapping);
                /*
                 * The retry label sits after the lock: an -ENOSPC retry starts
                 * a fresh handle while the invalidate lock stays held.
                 */
retry:
                handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                               EXT4_DATA_TRANS_BLOCKS(sb));
                if (IS_ERR(handle)) {
                        filemap_invalidate_unlock_shared(mapping);
                        sb_end_pagefault(sb);
                        return VM_FAULT_SIGBUS;
                }
        } else {
                filemap_invalidate_lock_shared(mapping);
        }
        result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
        if (write) {
                ext4_journal_stop(handle);

                /* Out of space: retry if ext4_should_retry_alloc() allows it. */
                if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
                    ext4_should_retry_alloc(sb, &retries))
                        goto retry;
                /* Handling synchronous page fault? */
                if (result & VM_FAULT_NEEDDSYNC)
                        result = dax_finish_sync_fault(vmf, order, pfn);
                filemap_invalidate_unlock_shared(mapping);
                sb_end_pagefault(sb);
        } else {
                filemap_invalidate_unlock_shared(mapping);
        }

        return result;
}

/* Order-0 wrapper around ext4_dax_huge_fault(). */
static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
        return ext4_dax_huge_fault(vmf, 0);
}

/*
 * VM operations installed by ext4_file_mmap() for DAX inodes.  Every hook
 * ends up in ext4_dax_huge_fault(): .huge_fault directly, the others through
 * the order-0 wrapper ext4_dax_fault().
 */
static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault = ext4_dax_fault,
        .huge_fault = ext4_dax_huge_fault,
        .page_mkwrite = ext4_dax_fault,
        .pfn_mkwrite = ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops        ext4_file_vm_ops
#endif

/*
 * VM operations installed by ext4_file_mmap() for non-DAX inodes: the generic
 * filemap fault handlers plus ext4's own page_mkwrite.
 */
static const struct vm_operations_struct ext4_file_vm_ops = {
        .fault = filemap_fault,
        .map_pages = filemap_map_pages,
        .page_mkwrite = ext4_page_mkwrite,
};

/*
 * ->mmap() handler.
 *
 * Returns -EIO after a forced shutdown and -EOPNOTSUPP when
 * daxdev_mapping_supported() rejects the mapping; otherwise marks the file
 * accessed, installs the DAX or regular vm_ops, and returns 0.
 */
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file->f_mapping->host;
        struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        /*
         * Synchronous mappings are not supported for non-DAX files, nor for
         * DAX files whose underlying dax_device is not synchronous.
         */
        if (!daxdev_mapping_supported(vma, dax_dev))
                return -EOPNOTSUPP;

        file_accessed(file);

        if (!IS_DAX(file_inode(file))) {
                vma->vm_ops = &ext4_file_vm_ops;
                return 0;
        }

        vma->vm_ops = &ext4_dax_vm_ops;
        vm_flags_set(vma, VM_HUGEPAGE);
        return 0;
}

/*
 * Record the path of @mnt's root in the superblock's s_last_mounted field.
 *
 * Does nothing (returns 0) when EXT4_MF_MNTDIR_SAMPLED is already set, when
 * the superblock is read-only, or when sb_start_intwrite_trylock() fails.
 * The flag is set before the update is attempted, so a failed attempt is not
 * repeated by later calls.
 *
 * Returns 0 on success or when skipped (including when d_path() fails),
 * otherwise the error from starting the journal handle or from getting
 * write access to the superblock buffer.
 */
static int ext4_sample_last_mounted(struct super_block *sb,
                                    struct vfsmount *mnt)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct path path;
        char buf[64], *cp;
        handle_t *handle;
        int err;

        if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
                return 0;

        if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
                return 0;

        ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
        /*
         * Sample where the filesystem has been mounted and
         * store it in the superblock for sysadmin convenience
         * when trying to sort through large numbers of block
         * devices or filesystem images.
         */
        memset(buf, 0, sizeof(buf));
        path.mnt = mnt;
        path.dentry = mnt->mnt_root;
        cp = d_path(&path, buf, sizeof(buf));
        /* A d_path() failure is not reported: nothing is recorded, return 0. */
        err = 0;
        if (IS_ERR(cp))
                goto out;

        handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
        /* Only meaningful if IS_ERR(handle); overwritten below otherwise. */
        err = PTR_ERR(handle);
        if (IS_ERR(handle))
                goto out;
        BUFFER_TRACE(sbi->s_sbh, "get_write_access");
        err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
                                            EXT4_JTR_NONE);
        if (err)
                goto out_journal;
        /* Update the field and the checksum under the buffer lock. */
        lock_buffer(sbi->s_sbh);
        strtomem_pad(sbi->s_es->s_last_mounted, cp, 0);
        ext4_superblock_csum_set(sb);
        unlock_buffer(sbi->s_sbh);
        /* The return value of ext4_handle_dirty_metadata() is not checked. */
        ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
out_journal:
        ext4_journal_stop(handle);
out:
        sb_end_intwrite(sb);
        return err;
}

/*
 * ->open() handler.
 *
 * Fails with -EIO after a forced shutdown.  Otherwise it samples the mount
 * point, runs the fscrypt and fsverity open hooks, attaches a jbd2 inode for
 * writers, advertises FMODE_NOWAIT and FMODE_CAN_ODIRECT, and finishes with
 * dquot_file_open().  The first failing step's error is returned.
 */
static int ext4_file_open(struct inode *inode, struct file *filp)
{
        int err;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        /* Each step runs only if all the previous ones succeeded. */
        err = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
        if (!err)
                err = fscrypt_file_open(inode, filp);
        if (!err)
                err = fsverity_file_open(inode, filp);
        if (err)
                return err;

        /*
         * Opening for write: set up the jbd2_inode (needed when a journal
         * is present).
         */
        if (filp->f_mode & FMODE_WRITE) {
                err = ext4_inode_attach_jinode(inode);
                if (err < 0)
                        return err;
        }

        filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
        return dquot_file_open(inode, filp);
}

/*
 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values:
 * files without the EXTENTS flag are limited to s_bitmap_maxbytes, all others
 * to the superblock's s_maxbytes.
 *
 * SEEK_HOLE and SEEK_DATA are answered through iomap under the shared inode
 * lock; every other @whence goes to generic_file_llseek_size().
 *
 * Returns the new file position or a negative error code.
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file->f_mapping->host;
        loff_t maxbytes = inode->i_sb->s_maxbytes;

        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;

        if (whence != SEEK_HOLE && whence != SEEK_DATA)
                return generic_file_llseek_size(file, offset, whence,
                                                maxbytes, i_size_read(inode));

        inode_lock_shared(inode);
        if (whence == SEEK_HOLE)
                offset = iomap_seek_hole(inode, offset,
                                         &ext4_iomap_report_ops);
        else
                offset = iomap_seek_data(inode, offset,
                                         &ext4_iomap_report_ops);
        inode_unlock_shared(inode);

        if (offset < 0)
                return offset;

        return vfs_setpos(file, offset, maxbytes);
}

/*
 * file_operations table for ext4 files.  It wires up the handlers defined in
 * this file (llseek, write_iter, mmap, open, release, splice_read) together
 * with handlers that are defined elsewhere.
 */
const struct file_operations ext4_file_operations = {
        .llseek = ext4_llseek,
        .read_iter = ext4_file_read_iter,
        .write_iter = ext4_file_write_iter,
        .iopoll = iocb_bio_iopoll,
        .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = ext4_compat_ioctl,
#endif
        .mmap = ext4_file_mmap,
        .open = ext4_file_open,
        .release = ext4_release_file,
        .fsync = ext4_sync_file,
        .get_unmapped_area = thp_get_unmapped_area,
        .splice_read = ext4_file_splice_read,
        .splice_write = iter_file_splice_write,
        .fallocate = ext4_fallocate,
        .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
                     FOP_DIO_PARALLEL_WRITE,
};

/*
 * inode_operations table paired with ext4_file_operations; all handlers are
 * defined outside this file.
 */
const struct inode_operations ext4_file_inode_operations = {
        .setattr = ext4_setattr,
        .getattr = ext4_file_getattr,
        .listxattr = ext4_listxattr,
        .get_inode_acl = ext4_get_acl,
        .set_acl = ext4_set_acl,
        .fiemap = ext4_fiemap,
        .fileattr_get = ext4_fileattr_get,
        .fileattr_set = ext4_fileattr_set,
};






































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
#ifndef IOU_CORE_H
#define IOU_CORE_H

#include <linux/errno.h>
#include <linux/lockdep.h>
#include <linux/resume_user_mode.h>
#include <linux/kasan.h>
#include <linux/poll.h>
#include <linux/io_uring_types.h>
#include <uapi/linux/eventpoll.h>
#include "io-wq.h"
#include "slist.h"
#include "filetable.h"

#ifndef CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
#endif

/*
 * Internal IOU_* result codes. Apart from IOU_OK they are negative values;
 * two of them alias existing errno values and one is a private sentinel.
 */
enum {
        IOU_OK                        = 0,
        IOU_ISSUE_SKIP_COMPLETE        = -EIOCBQUEUED,

        /*
         * Requeue the task_work to restart operations on this request. The
         * actual value isn't important, should just be not an otherwise
         * valid error code, yet less than -MAX_ERRNO and valid internally.
         */
        IOU_REQUEUE                = -3072,

        /*
         * Intended only when IO_URING_F_MULTISHOT is passed
         * to indicate to the poll runner that multishot should be
         * removed and the result is set on req->cqe.res.
         */
        IOU_STOP_MULTISHOT        = -ECANCELED,
};

/*
 * Per-waiter state. io_should_wake() reads ctx, cq_tail and nr_timeouts from
 * it to decide whether the waiter should be woken.
 */
struct io_wait_queue {
        struct wait_queue_entry wq;
        struct io_ring_ctx *ctx;
        /* CQ tail value that io_should_wake() compares the ring's cq.tail against */
        unsigned cq_tail;
        /* compared against ctx->cq_timeouts by io_should_wake() */
        unsigned nr_timeouts;
        ktime_t timeout;

        /* NAPI busy-poll settings; only present with CONFIG_NET_RX_BUSY_POLL */
#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned int napi_busy_poll_to;
        bool napi_prefer_busy_poll;
#endif
};

static inline bool io_should_wake(struct io_wait_queue *iowq)
{
        struct io_ring_ctx *ctx = iowq->ctx;
        int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;

        /*
         * Wake up if we have enough events, or if a timeout occurred since we
         * started waiting. For timeouts, we always want to return to userspace,
         * regardless of event count.
         */
        return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}

bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);

struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
                               unsigned issue_flags);

void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_queue(struct io_kiocb *req);
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts);
struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries);
struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count);
void tctx_task_work(struct callback_head *cb);
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
int io_uring_alloc_task_context(struct task_struct *task,
                                struct io_ring_ctx *ctx);

int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
                                     int start, int end);

int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void __io_submit_flush_completions(struct io_ring_ctx *ctx);

struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
void io_wq_submit_work(struct io_wq_work *work);

void io_free_req(struct io_kiocb *req);
void io_queue_next(struct io_kiocb *req);
void io_task_refs_refill(struct io_uring_task *tctx);
bool __io_alloc_req_refill(struct io_ring_ctx *ctx);

bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
                        bool cancel_all);

/*
 * Bit numbers for eventfd operations.
 * NOTE(review): the user of these bits is not visible in this header;
 * presumably they index an ops bitmask processed by io_eventfd_ops() --
 * confirm against the eventfd implementation.
 */
enum {
        IO_EVENTFD_OP_SIGNAL_BIT,
        IO_EVENTFD_OP_FREE_BIT,
};

void io_eventfd_ops(struct rcu_head *rcu);
void io_activate_pollwq(struct io_ring_ctx *ctx);

/*
 * Debug-only check that the caller is in a context allowed to touch the CQ.
 * The body is compiled out unless CONFIG_PROVE_LOCKING is defined.
 *
 * Always asserts in_task(), then selects a check by ring configuration:
 *  - IORING_SETUP_IOPOLL: ->uring_lock must be held
 *  - ->task_complete not set: ->completion_lock must be held
 *  - otherwise, if ->submitter_task is set: current must be the submitter
 *    task, or current_work() must be non-NULL if that task is exiting
 */
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
#if defined(CONFIG_PROVE_LOCKING)
        lockdep_assert(in_task());

        if (ctx->flags & IORING_SETUP_IOPOLL) {
                lockdep_assert_held(&ctx->uring_lock);
        } else if (!ctx->task_complete) {
                lockdep_assert_held(&ctx->completion_lock);
        } else if (ctx->submitter_task) {
                /*
                 * ->submitter_task may be NULL and we can still post a CQE,
                 * if the ring has been setup with IORING_SETUP_R_DISABLED.
                 * Not from an SQE, as those cannot be submitted, but via
                 * updating tagged resources.
                 */
                if (ctx->submitter_task->flags & PF_EXITING)
                        lockdep_assert(current_work());
                else
                        lockdep_assert(current == ctx->submitter_task);
        }
#endif
}

/* Queue @req's task_work with no flags set (flags == 0). */
static inline void io_req_task_work_add(struct io_kiocb *req)
{
        __io_req_task_work_add(req, 0);
}

static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
        if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
            ctx->submit_state.cq_flush)
                __io_submit_flush_completions(ctx);
}

/*
 * Iterate @pos over a chain starting at @head, following ->link until a NULL
 * link is reached. Both arguments are parenthesised so that expressions such
 * as "*pp" bind correctly in "(pos)->link".
 *
 * @pos is evaluated several times per iteration and must be an lvalue
 * without side effects.
 */
#define io_for_each_link(pos, head) \
        for ((pos) = (head); (pos); (pos) = (pos)->link)

/*
 * Hand out the next cached CQE slot in *@ret.
 *
 * If the cache is exhausted (cqe_cached has reached cqe_sentinel), try to
 * refill it first, passing @overflow through to io_cqe_cache_refill().
 * Returns false, leaving *@ret untouched, if the refill fails; true otherwise.
 */
static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
                                        struct io_uring_cqe **ret,
                                        bool overflow)
{
        io_lockdep_assert_cq_locked(ctx);

        if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel) &&
            unlikely(!io_cqe_cache_refill(ctx, overflow)))
                return false;

        *ret = ctx->cqe_cached;
        ctx->cached_cq_tail++;
        /* With IORING_SETUP_CQE32 the cache pointer advances by two entries. */
        ctx->cqe_cached += (ctx->flags & IORING_SETUP_CQE32) ? 2 : 1;
        return true;
}

/* Same as io_get_cqe_overflow() with @overflow fixed to false. */
static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
{
        return io_get_cqe_overflow(ctx, ret, false);
}

/*
 * Copy @req's completion into the next CQE slot of @ctx.
 *
 * Returns false if no CQE slot could be obtained, true once the CQE has been
 * written. For IORING_SETUP_CQE32 rings the request's big_cqe payload is
 * copied as well and then zeroed in the request.
 */
static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
                                            struct io_kiocb *req)
{
        struct io_uring_cqe *cqe;

        /*
         * If we can't get a cq entry, report that to the caller. Nothing is
         * accounted here; handling the overflow is left to the caller.
         */
        if (unlikely(!io_get_cqe(ctx, &cqe)))
                return false;

        if (trace_io_uring_complete_enabled())
                trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
                                        req->cqe.res, req->cqe.flags,
                                        req->big_cqe.extra1, req->big_cqe.extra2);

        memcpy(cqe, &req->cqe, sizeof(*cqe));
        if (ctx->flags & IORING_SETUP_CQE32) {
                /*
                 * NOTE(review): the copy length is sizeof(*cqe), not
                 * sizeof(req->big_cqe); this assumes both have the same
                 * size -- confirm against the struct definitions.
                 */
                memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
                memset(&req->big_cqe, 0, sizeof(req->big_cqe));
        }
        return true;
}

/*
 * Mark @req as failed. If the request had REQ_F_CQE_SKIP set, that flag is
 * cleared and replaced by REQ_F_SKIP_LINK_CQES.
 */
static inline void req_set_fail(struct io_kiocb *req)
{
        req->flags |= REQ_F_FAIL;
        if (!(req->flags & REQ_F_CQE_SKIP))
                return;

        req->flags &= ~REQ_F_CQE_SKIP;
        req->flags |= REQ_F_SKIP_LINK_CQES;
}

/* Store the completion result @res and CQE flags @cflags in @req->cqe. */
static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
{
        req->cqe.res = res;
        req->cqe.flags = cflags;
}

/* True if @req has REQ_F_ASYNC_DATA set in its flags. */
static inline bool req_has_async_data(struct io_kiocb *req)
{
        return (req->flags & REQ_F_ASYNC_DATA) != 0;
}

/*
 * Drop @req's file reference with fput(), unless the request uses a fixed
 * file (REQ_F_FIXED_FILE) or has no file at all.
 */
static inline void io_put_file(struct io_kiocb *req)
{
        if (req->flags & REQ_F_FIXED_FILE)
                return;
        if (req->file)
                fput(req->file);
}

/*
 * Counterpart of io_ring_submit_lock(): ->uring_lock must be held on entry
 * and is released only when @issue_flags carries IO_URING_F_UNLOCKED.
 */
static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
                                         unsigned issue_flags)
{
        lockdep_assert_held(&ctx->uring_lock);

        if (likely(!(issue_flags & IO_URING_F_UNLOCKED)))
                return;

        mutex_unlock(&ctx->uring_lock);
}

/*
 * Make sure ->uring_lock is held on return: take it if @issue_flags has
 * IO_URING_F_UNLOCKED set, otherwise only assert that it is already held.
 */
static inline void io_ring_submit_lock(struct io_ring_ctx *ctx,
                                       unsigned issue_flags)
{
        /*
         * "Normal" inline submissions always hold the uring_lock, since we
         * grab it from the system call. Same is true for the SQPOLL offload.
         * The only exception is when we've detached the request and issue it
         * from an async worker thread, grab the lock for that case.
         */
        if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
                mutex_lock(&ctx->uring_lock);
        lockdep_assert_held(&ctx->uring_lock);
}

/*
 * Publish the kernel's cached CQ tail to the shared ring. The release store
 * orders the preceding CQE writes before the tail update.
 */
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
        /* order cqe stores with ring update */
        smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}

/*
 * Wake waiters on ctx->poll_wq, if there are any, using
 * EPOLL_URING_WAKE | EPOLLIN as the wakeup key.
 */
static inline void io_poll_wq_wake(struct io_ring_ctx *ctx)
{
        if (!wq_has_sleeper(&ctx->poll_wq))
                return;

        __wake_up(&ctx->poll_wq, TASK_NORMAL, 0,
                  poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}

/*
 * Run the waitqueue handlers of everything sleeping on ctx->cq_wait.
 *
 * This does not necessarily wake every task; io_should_wake() makes that
 * decision per waiter.
 */
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
        if (!wq_has_sleeper(&ctx->cq_wait))
                return;

        /*
         * The wakeup key is EPOLLIN|EPOLL_URING_WAKE. EPOLL_URING_WAKE is
         * set in the mask so that if we recurse back into our own poll
         * waitqueue handlers, we know we have a dependency between eventfd
         * or epoll and should terminate multishot poll at that point.
         */
        __wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
                  poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}

/*
 * True when the distance between the shared SQ tail and the kernel's cached
 * SQ head equals the number of SQ entries.
 */
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
        return READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head ==
               ctx->sq_entries;
}

/*
 * Number of SQ entries between the kernel's cached head and the shared tail,
 * clamped to ctx->sq_entries.
 */
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
        struct io_rings *rings = ctx->rings;
        unsigned int entries;

        /* make sure SQ entry isn't read before tail */
        entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
        /* clamp to the ring size so the result never exceeds sq_entries */
        return min(entries, ctx->sq_entries);
}

/*
 * Run pending task_work for the current task.
 *
 * Returns nonzero if task_work_run() was called or if tctx_task_work_run()
 * reported a non-zero count, and 0 otherwise. The result is tracked as a
 * bool internally although the function is declared to return int.
 */
static inline int io_run_task_work(void)
{
        bool ret = false;

        /*
         * Always check-and-clear the task_work notification signal. With how
         * signaling works for task_work, we can find it set with nothing to
         * run. We need to clear it for that case, like get_signal() does.
         */
        if (test_thread_flag(TIF_NOTIFY_SIGNAL))
                clear_notify_signal();
        /*
         * PF_IO_WORKER never returns to userspace, so check here if we have
         * notify work that needs processing.
         */
        if (current->flags & PF_IO_WORKER) {
                if (test_thread_flag(TIF_NOTIFY_RESUME)) {
                        __set_current_state(TASK_RUNNING);
                        resume_user_mode_work(NULL);
                }
                if (current->io_uring) {
                        unsigned int count = 0;

                        /* no cap on entries: UINT_MAX is passed as the limit */
                        tctx_task_work_run(current->io_uring, UINT_MAX, &count);
                        if (count)
                                ret = true;
                }
        }
        if (task_work_pending(current)) {
                __set_current_state(TASK_RUNNING);
                task_work_run();
                ret = true;
        }

        return ret;
}

static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
{
        return task_work_pending(current) || !llist_empty(&ctx->work_llist);
}

/*
 * Despite the name, this takes no lock: it only asserts (under lockdep) that
 * ->uring_lock is already held. @ts is unused here.
 */
static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts)
{
        lockdep_assert_held(&ctx->uring_lock);
}

/*
 * Queue @req on the context's deferred-completion list (compl_reqs) instead
 * of completing it right away.
 *
 * The list is protected by ->uring_lock, so this may only be used with
 * IO_URING_F_COMPLETE_DEFER or from a tw handler that holds the mutex.
 */
static inline void io_req_complete_defer(struct io_kiocb *req)
        __must_hold(&req->ctx->uring_lock)
{
        struct io_ring_ctx *ctx = req->ctx;

        lockdep_assert_held(&ctx->uring_lock);

        wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs);
}

/*
 * Call the out-of-line __io_commit_cqring_flush() only if one of
 * off_timeout_used, drain_active, has_evfd or poll_activated is set; the
 * common case with none of them set is a no-op.
 */
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
        if (likely(!(ctx->off_timeout_used || ctx->drain_active ||
                     ctx->has_evfd || ctx->poll_activated)))
                return;

        __io_commit_cqring_flush(ctx);
}

/*
 * Consume @nr references from the current task's cached io_uring refs
 * (current->io_uring->cached_refs). If the cache goes negative, it is
 * refilled through io_task_refs_refill().
 */
static inline void io_get_task_refs(int nr)
{
        struct io_uring_task *tctx = current->io_uring;

        tctx->cached_refs -= nr;
        if (unlikely(tctx->cached_refs < 0))
                io_task_refs_refill(tctx);
}

static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
{
        return !ctx->submit_state.free_list.next;
}

extern struct kmem_cache *req_cachep;
extern struct kmem_cache *io_buf_cachep;

/*
 * Pop the first request off ctx->submit_state.free_list and return it.
 * No emptiness check is done here; the list must be non-empty on entry.
 */
static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
{
        struct io_kiocb *req;

        /* the list node is embedded in the request as ->comp_list */
        req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list);
        wq_stack_extract(&ctx->submit_state.free_list);
        return req;
}

/*
 * Obtain a request from the per-ctx cache and store it in *@req.
 *
 * An empty cache is refilled first; returns false, without touching *@req,
 * if that refill fails, true otherwise.
 */
static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req)
{
        if (unlikely(io_req_cache_empty(ctx)) && !__io_alloc_req_refill(ctx))
                return false;

        *req = io_extract_req(ctx);
        return true;
}

/* True only when the current task is the ring's submitter task. */
static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx)
{
        return likely(ctx->submitter_task == current);
}

/*
 * True when the ring was not set up with IORING_SETUP_DEFER_TASKRUN, or when
 * the current task is the ring's submitter task.
 */
static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
{
        bool defer_taskrun = ctx->flags & IORING_SETUP_DEFER_TASKRUN;

        return likely(!defer_taskrun || ctx->submitter_task == current);
}

/*
 * Record @res (with zero CQE flags) as @req's result and queue the request
 * to be completed from task_work via io_req_task_complete().
 */
static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
{
        io_req_set_res(req, res, 0);
        req->io_task_work.func = io_req_task_complete;
        io_req_task_work_add(req);
}

/*
 * Size in bytes of one SQ slot for @ctx: sizeof(struct io_uring_sqe),
 * doubled for contexts set up with IORING_SETUP_SQE128.
 */
static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
{
        size_t sqe_size = sizeof(struct io_uring_sqe);

        if (ctx->flags & IORING_SETUP_SQE128)
                sqe_size *= 2;
        return sqe_size;
}

/*
 * True if @req's file supports polling. A positive answer is cached in
 * req->flags as REQ_F_CAN_POLL so file_can_poll() is not called again.
 */
static inline bool io_file_can_poll(struct io_kiocb *req)
{
        if (req->flags & REQ_F_CAN_POLL)
                return true;

        if (!req->file || !file_can_poll(req->file))
                return false;

        req->flags |= REQ_F_CAN_POLL;
        return true;
}

/*
 * Bit numbers within ctx->check_cq (io_has_work() tests
 * IO_CHECK_CQ_OVERFLOW_BIT there).
 * NOTE(review): where IO_CHECK_CQ_DROPPED_BIT is set or tested is not visible
 * in this header -- confirm.
 */
enum {
        IO_CHECK_CQ_OVERFLOW_BIT,
        IO_CHECK_CQ_DROPPED_BIT,
};

static inline bool io_has_work(struct io_ring_ctx *ctx)
{
        return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
               !llist_empty(&ctx->work_llist);
}
#endif






















































































































    5 








    4 










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_TASK_H
#define _LINUX_SCHED_TASK_H

/*
 * Interface between the scheduler and various task lifetime (fork()/exit())
 * functionality:
 */

#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

struct task_struct;
struct rusage;
union thread_union;
struct css_set;

/* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL

/*
 * Argument bundle for creating a new task; it is the parameter type of
 * kernel_clone(), copy_process(), copy_thread() and sched_cgroup_fork()
 * declared in this header.
 * NOTE(review): per-field semantics are defined by the fork code, which is
 * not visible here; only what the declarations show is noted below.
 */
struct kernel_clone_args {
        u64 flags;
        int __user *pidfd;
        int __user *child_tid;
        int __user *parent_tid;
        const char *name;
        int exit_signal;
        /* one-bit flags packed into a single u32 */
        u32 kthread:1;
        u32 io_thread:1;
        u32 user_worker:1;
        u32 no_files:1;
        unsigned long stack;
        unsigned long stack_size;
        unsigned long tls;
        pid_t *set_tid;
        /* Number of elements in *set_tid */
        size_t set_tid_size;
        int cgroup;
        int idle;
        /* function pointer and its opaque argument */
        int (*fn)(void *);
        void *fn_arg;
        struct cgroup *cgrp;
        struct css_set *cset;
};

/*
 * This serializes "schedule()" and also protects
 * the run-queue from deletions/modifications (but
 * _adding_ to the beginning of the run-queue has
 * a separate lock).
 */
extern rwlock_t tasklist_lock;
extern spinlock_t mmlist_lock;

extern union thread_union init_thread_union;
extern struct task_struct init_task;

extern int lockdep_tasklist_lock_is_held(void);

extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);

extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);

void __noreturn do_task_dead(void);
void __noreturn make_task_dead(int signr);

extern void mm_cache_init(void);
extern void proc_caches_init(void);

extern void fork_init(void);

extern void release_task(struct task_struct * p);

extern int copy_thread(struct task_struct *, const struct kernel_clone_args *);

extern void flush_thread(void);

#ifdef CONFIG_HAVE_EXIT_THREAD
extern void exit_thread(struct task_struct *tsk);
#else
/* No-op stub used when CONFIG_HAVE_EXIT_THREAD is not defined. */
static inline void exit_thread(struct task_struct *tsk)
{
}
#endif
extern __noreturn void do_group_exit(int);

extern void exit_files(struct task_struct *);
extern void exit_itimers(struct task_struct *);

extern pid_t kernel_clone(struct kernel_clone_args *kargs);
struct task_struct *copy_process(struct pid *pid, int trace, int node,
                                 struct kernel_clone_args *args);
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
                            unsigned long flags);
extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
int kernel_wait(pid_t pid, int *stat);

extern void free_task(struct task_struct *tsk);

/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
extern void sched_exec(void);
#else
/*
 * !SMP stub. It expands to a single statement that consumes the trailing
 * semicolon, so "sched_exec();" is safe as the body of an unbraced if/else
 * (a bare "{}" would leave a stray ";" and break the else).
 */
#define sched_exec()   do { } while (0)
#endif

/*
 * Take an additional reference on @t by incrementing t->usage.
 * Returns @t so the call can be used inline in an expression.
 */
static inline struct task_struct *get_task_struct(struct task_struct *t)
{
        refcount_inc(&t->usage);
        return t;
}

extern void __put_task_struct(struct task_struct *t);
extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);

/*
 * Drop one reference on @t. When t->usage reaches zero the task struct is
 * released: directly through __put_task_struct() when that is safe, or
 * deferred through call_rcu() otherwise (PREEMPT_RT in non-preemptible
 * context).
 */
static inline void put_task_struct(struct task_struct *t)
{
        if (!refcount_dec_and_test(&t->usage))
                return;

        /*
         * In !RT, it is always safe to call __put_task_struct().
         * Under RT, we can only call it in preemptible context.
         */
        if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
                static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP);

                lock_map_acquire_try(&put_task_map);
                __put_task_struct(t);
                lock_map_release(&put_task_map);
                return;
        }

        /*
         * under PREEMPT_RT, we can't call put_task_struct
         * in atomic context because it will indirectly
         * acquire sleeping locks.
         *
         * call_rcu() will schedule __put_task_struct_rcu_cb()
         * to be called in process context.
         *
         * __put_task_struct() is called when
         * refcount_dec_and_test(&t->usage) succeeds.
         *
         * This means that it can't "conflict" with
         * put_task_struct_rcu_user() which abuses ->rcu the same
         * way; rcu_users has a reference so task->usage can't be
         * zero after rcu_users 1 -> 0 transition.
         *
         * delayed_free_task() also uses ->rcu, but it is only called
         * when it fails to fork a process. Therefore, there is no
         * way it can conflict with put_task_struct().
         */
        call_rcu(&t->rcu, __put_task_struct_rcu_cb);
}

DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T))

/*
 * Drop @nr references on @t at once; releases the task struct through
 * __put_task_struct() if that brings t->usage to zero.
 *
 * NOTE(review): unlike put_task_struct(), this always calls
 * __put_task_struct() directly and has no PREEMPT_RT deferral via
 * call_rcu() -- confirm callers are never in atomic context on RT.
 */
static inline void put_task_struct_many(struct task_struct *t, int nr)
{
        if (refcount_sub_and_test(nr, &t->usage))
                __put_task_struct(t);
}

void put_task_struct_rcu_user(struct task_struct *task);

/* Free all architecture-specific resources held by a thread. */
void release_thread(struct task_struct *dead_task);

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
#else
# define arch_task_struct_size (sizeof(struct task_struct))
#endif

#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST
/*
 * If an architecture has not declared a thread_struct whitelist we
 * must assume something there may need to be copied to userspace.
 */
/*
 * Default whitelist: report the whole thread_struct.
 * @offset: set to 0
 * @size:   set to the number of bytes from the start of task_struct.thread
 *          to the end of the (possibly dynamically sized) task_struct
 */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
                                                unsigned long *size)
{
        *offset = 0;
        /* Handle dynamically sized thread_struct. */
        *size = arch_task_struct_size - offsetof(struct task_struct, thread);
}
#endif

/*
 * task_stack_vm_area() returns t->stack_vm_area when CONFIG_VMAP_STACK is
 * defined, and NULL otherwise.
 */
#ifdef CONFIG_VMAP_STACK
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
        return t->stack_vm_area;
}
#else
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
        return NULL;
}
#endif

/*
 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
 * subscriptions and synchronises with wait4().  Also used in procfs.  Also
 * pins the final release of task.io_context.  Also protects ->cpuset and
 * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
 *
 * Nests both inside and outside of read_lock(&tasklist_lock).
 * It must not be nested with write_lock_irq(&tasklist_lock),
 * neither inside nor outside.
 */
/* Acquire @p's ->alloc_lock spinlock; pair with task_unlock(). */
static inline void task_lock(struct task_struct *p)
{
        spin_lock(&p->alloc_lock);
}

/* Release @p's ->alloc_lock spinlock taken by task_lock(). */
static inline void task_unlock(struct task_struct *p)
{
        spin_unlock(&p->alloc_lock);
}

DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))

#endif /* _LINUX_SCHED_TASK_H */






















































































































































































































































































    1 
















    1 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_EXPORTFS_H
#define LINUX_EXPORTFS_H 1

#include <linux/types.h>

struct dentry;
struct iattr;
struct inode;
struct iomap;
struct super_block;
struct vfsmount;

/* limit the handle size to NFSv4 handle size now */
#define MAX_HANDLE_SZ 128

/*
 * The fileid_type identifies how the file within the filesystem is encoded.
 * In theory this is freely set and parsed by the filesystem, but we try to
 * stick to conventions so we can share some generic code and don't confuse
 * sniffers like ethereal/wireshark.
 *
 * The filesystem must not use the value '0' or '0xff'.
 */
enum fid_type {
        /*
         * The root, or export point, of the filesystem.
         * (Never actually passed down to the filesystem.)
         */
        FILEID_ROOT = 0,

        /*
         * 32bit inode number, 32 bit generation number.
         */
        FILEID_INO32_GEN = 1,

        /*
         * 32bit inode number, 32 bit generation number,
         * 32 bit parent directory inode number.
         */
        FILEID_INO32_GEN_PARENT = 2,

        /*
         * 64 bit object ID, 64 bit root object ID,
         * 32 bit generation number.
         */
        FILEID_BTRFS_WITHOUT_PARENT = 0x4d,

        /*
         * 64 bit object ID, 64 bit root object ID,
         * 32 bit generation number,
         * 64 bit parent object ID, 32 bit parent generation.
         */
        FILEID_BTRFS_WITH_PARENT = 0x4e,

        /*
         * 64 bit object ID, 64 bit root object ID,
         * 32 bit generation number,
         * 64 bit parent object ID, 32 bit parent generation,
         * 64 bit parent root object ID.
         */
        FILEID_BTRFS_WITH_PARENT_ROOT = 0x4f,

        /*
         * 32 bit block number, 16 bit partition reference,
         * 16 bit unused, 32 bit generation number.
         */
        FILEID_UDF_WITHOUT_PARENT = 0x51,

        /*
         * 32 bit block number, 16 bit partition reference,
         * 16 bit unused, 32 bit generation number,
         * 32 bit parent block number, 32 bit parent generation number
         */
        FILEID_UDF_WITH_PARENT = 0x52,

        /*
         * 64 bit checkpoint number, 64 bit inode number,
         * 32 bit generation number.
         */
        FILEID_NILFS_WITHOUT_PARENT = 0x61,

        /*
         * 64 bit checkpoint number, 64 bit inode number,
         * 32 bit generation number, 32 bit parent generation.
         * 64 bit parent inode number.
         */
        FILEID_NILFS_WITH_PARENT = 0x62,

        /*
         * 32 bit generation number, 40 bit i_pos.
         */
        FILEID_FAT_WITHOUT_PARENT = 0x71,

        /*
         * 32 bit generation number, 40 bit i_pos,
         * 32 bit parent generation number, 40 bit parent i_pos
         */
        FILEID_FAT_WITH_PARENT = 0x72,

        /*
         * 64 bit inode number, 32 bit generation number.
         */
        FILEID_INO64_GEN = 0x81,

        /*
         * 64 bit inode number, 32 bit generation number,
         * 64 bit parent inode number, 32 bit parent generation.
         */
        FILEID_INO64_GEN_PARENT = 0x82,

        /*
         * 128 bit child FID (struct lu_fid)
         * 128 bit parent FID (struct lu_fid)
         */
        FILEID_LUSTRE = 0x97,

        /*
         * 64 bit inode number, 32 bit subvolume, 32 bit generation number:
         */
        FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1,
        FILEID_BCACHEFS_WITH_PARENT = 0xb2,

        /*
         * 64 bit unique kernfs id
         */
        FILEID_KERNFS = 0xfe,

        /*
         * Filesystems must not use 0xff file ID.
         */
        FILEID_INVALID = 0xff,
};

/*
 * File identifier as carried in a file handle: an anonymous union of several
 * layouts over the same storage, plus @raw for access as an array of __u32
 * words.
 * NOTE(review): which fid_type selects which member is decided by the
 * encoding/decoding code, not visible here.
 */
struct fid {
        union {
                /* 32-bit inode/generation pair, plus the parent's pair */
                struct {
                        u32 ino;
                        u32 gen;
                        u32 parent_ino;
                        u32 parent_gen;
                } i32;
                /* 64-bit inode number with 32-bit generation; packed */
                struct {
                        u64 ino;
                        u32 gen;
                } __packed i64;
                /* UDF layout: block/partref/generation for object and parent */
                struct {
                         u32 block;
                         u16 partref;
                         u16 parent_partref;
                         u32 generation;
                         u32 parent_block;
                         u32 parent_generation;
                 } udf;
                DECLARE_FLEX_ARRAY(__u32, raw);
        };
};

#define EXPORT_FH_CONNECTABLE        0x1 /* Encode file handle with parent */
#define EXPORT_FH_FID                0x2 /* File handle may be non-decodeable */

/**
 * struct export_operations - for nfsd to communicate with file systems
 * @encode_fh:      encode a file handle fragment from a dentry
 * @fh_to_dentry:   find the implied object and get a dentry for it
 * @fh_to_parent:   find the implied object's parent and get a dentry for it
 * @get_name:       find the name for a given inode in a given directory
 * @get_parent:     find the parent of a given directory
 * @commit_metadata: commit metadata changes to stable storage
 *
 * See Documentation/filesystems/nfs/exporting.rst for details on how to use
 * this interface correctly.
 *
 * encode_fh:
 *    @encode_fh should store in the file handle fragment @fh (using at most
 *    @max_len bytes) information that can be used by @decode_fh to recover the
 *    file referred to by the &struct dentry @de.  If @flag has CONNECTABLE bit
 *    set, the encode_fh() should store sufficient information so that a good
 *    attempt can be made to find not only the file but also its place in the
 *    filesystem.   This typically means storing a reference to de->d_parent in
 *    the filehandle fragment.  encode_fh() should return the fileid_type on
 *    success and on error returns 255 (if the space needed to encode fh is
 *    greater than @max_len*4 bytes). On error @max_len contains the minimum
 *    size (in 4 byte units) needed to encode the file handle.
 *
 * fh_to_dentry:
 *    @fh_to_dentry is given a &struct super_block (@sb) and a file handle
 *    fragment (@fh, @fh_len). It should return a &struct dentry which refers
 *    to the same file that the file handle fragment refers to.  If it cannot,
 *    it should return a %NULL pointer if the file cannot be found, or an
 *    %ERR_PTR error code of %ENOMEM if a memory allocation failure occurred.
 *    Any other error code is treated like %NULL, and will cause an %ESTALE error
 *    for callers of exportfs_decode_fh().
 *    Any suitable dentry can be returned including, if necessary, a new dentry
 *    created with d_alloc_root.  The caller can then find any other extant
 *    dentries by following the d_alias links.
 *
 * fh_to_parent:
 *    Same as @fh_to_dentry, except that it returns a pointer to the parent
 *    dentry if it was encoded into the filehandle fragment by @encode_fh.
 *
 * get_name:
 *    @get_name should find a name for the given @child in the given @parent
 *    directory.  The name should be stored in the @name (with the
 *    understanding that it is already pointing to a %NAME_MAX+1 sized
 *    buffer).  get_name() should return %0 on success, a negative error code
 *    on error.  @get_name will be called without @parent->i_mutex held.
 *
 * get_parent:
 *    @get_parent should find the parent directory for the given @child which
 *    is also a directory.  In the event that it cannot be found, or storage
 *    space cannot be allocated, a %ERR_PTR should be returned.
 *
 * commit_metadata:
 *    @commit_metadata should commit metadata changes to stable storage.
 *
 * Locking rules:
 *    get_parent is called with child->d_inode->i_mutex down
 *    get_name is not (which is possibly inconsistent)
 */

struct export_operations {
        int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
                        struct inode *parent);
        struct dentry * (*fh_to_dentry)(struct super_block *sb, struct fid *fid,
                        int fh_len, int fh_type);
        struct dentry * (*fh_to_parent)(struct super_block *sb, struct fid *fid,
                        int fh_len, int fh_type);
        int (*get_name)(struct dentry *parent, char *name,
                        struct dentry *child);
        struct dentry * (*get_parent)(struct dentry *child);
        int (*commit_metadata)(struct inode *inode);

        /*
         * NOTE(review): the three callbacks below are not described in the
         * kernel-doc for this struct; their contracts are not visible in
         * this header.
         */
        int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
        int (*map_blocks)(struct inode *inode, loff_t offset,
                          u64 len, struct iomap *iomap,
                          bool write, u32 *device_generation);
        int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
                             int nr_iomaps, struct iattr *iattr);
        /* Bit values for the @flags member below. */
#define        EXPORT_OP_NOWCC                        (0x1) /* don't collect v3 wcc data */
#define        EXPORT_OP_NOSUBTREECHK                (0x2) /* no subtree checking */
#define        EXPORT_OP_CLOSE_BEFORE_UNLINK        (0x4) /* close files before unlink */
#define EXPORT_OP_REMOTE_FS                (0x8) /* Filesystem is remote */
#define EXPORT_OP_NOATOMIC_ATTR                (0x10) /* Filesystem cannot supply
                                                  atomic attribute updates
                                                */
#define EXPORT_OP_FLUSH_ON_CLOSE        (0x20) /* fs flushes file data on close */
#define EXPORT_OP_ASYNC_LOCK                (0x40) /* fs can do async lock request */
        unsigned long        flags;
};

/**
 * exportfs_lock_op_is_async() - check whether async lock requests are supported
 * @export_ops:        the nfs export operations to check
 *
 * Returns true if EXPORT_OP_ASYNC_LOCK is set in the flags of @export_ops,
 * false otherwise.
 */
static inline bool
exportfs_lock_op_is_async(const struct export_operations *export_ops)
{
        return (export_ops->flags & EXPORT_OP_ASYNC_LOCK) != 0;
}

extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
                                    int *max_len, struct inode *parent,
                                    int flags);
extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid,
                              int *max_len, int flags);

static inline bool exportfs_can_encode_fid(const struct export_operations *nop)
{
        return !nop || nop->encode_fh;
}

/*
 * Report whether file handles can be decoded: this needs export operations
 * that provide an fh_to_dentry method.
 */
static inline bool exportfs_can_decode_fh(const struct export_operations *nop)
{
        if (!nop)
                return false;

        return nop->fh_to_dentry != NULL;
}

/*
 * Report whether a file handle of the kind selected by @fh_flags can be
 * encoded with the given export operations.
 *
 * A non-decodeable handle (EXPORT_FH_FID) only requires that the filesystem
 * did not opt out of encoding a fid.  A decodeable handle additionally
 * requires that the filesystem is able to decode file handles.
 */
static inline bool exportfs_can_encode_fh(const struct export_operations *nop,
                                          int fh_flags)
{
        return (fh_flags & EXPORT_FH_FID) ? exportfs_can_encode_fid(nop)
                                          : exportfs_can_decode_fh(nop);
}

/*
 * Encode a non-decodeable file id for @inode: a thin wrapper around
 * exportfs_encode_inode_fh() with no parent inode and EXPORT_FH_FID set.
 */
static inline int exportfs_encode_fid(struct inode *inode, struct fid *fid,
                                      int *max_len)
{
        struct inode *const no_parent = NULL;

        return exportfs_encode_inode_fh(inode, fid, max_len, no_parent,
                                        EXPORT_FH_FID);
}

extern struct dentry *exportfs_decode_fh_raw(struct vfsmount *mnt,
                                             struct fid *fid, int fh_len,
                                             int fileid_type,
                                             int (*acceptable)(void *, struct dentry *),
                                             void *context);
extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
        int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *),
        void *context);

/*
 * Generic helpers for filesystems.
 */
int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
                            struct inode *parent);
struct dentry *generic_fh_to_dentry(struct super_block *sb,
        struct fid *fid, int fh_len, int fh_type,
        struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
struct dentry *generic_fh_to_parent(struct super_block *sb,
        struct fid *fid, int fh_len, int fh_type,
        struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));

#endif /* LINUX_EXPORTFS_H */


































































    1 








    1 












    1 


    1 











































































    1 











































































































































































































    1 





    1 

    1 






    1 
    1 





























    1 


































    1 
    1 


    1 
    1 
    1 




    1 













    1 
    1 
    1 









































    1 







































    1 




    1 

    1 








    1 


    1 

    1 





















    1 


    1 

    1 

    1 






    1 














    1 








    1 
    1 
    1 

    1 



    1 



















    1 

    1 
    1 







































































































































    1 






















































    1 




















    1 







    1 







    1 

    1 
















    1 










    1 






































    1 

















    1 




    1 


    1 



    1 




























    1 



















    1 








    1 













    1 











    1 
    1 


    1 











    1 

    1 


















































    1 

    1 










    1 
    1 

    1 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include "messages.h"
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
#include "tree-mod-log.h"
#include "fs.h"

struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_ref_node_cachep;
struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
 * delayed back reference update tracking.  For subvolume trees
 * we queue up extent allocations and backref maintenance for
 * delayed processing.   This avoids deep call chains where we
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
 */

/*
 * Check whether the delayed refs reserve has outgrown the space backing it.
 *
 * Returns true when the size of the delayed refs rsv is at least the sum of
 * the bytes currently reserved in the global rsv and in the delayed refs rsv.
 */
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
        u64 available;
        bool exceeded;

        /* Sample the global reserve under its own lock. */
        spin_lock(&global_rsv->lock);
        available = global_rsv->reserved;
        spin_unlock(&global_rsv->lock);

        /*
         * The global reserve is not something we want to depend on to bail
         * us out, so once the delayed refs rsv size reaches what is reserved
         * in both reserves combined, it is time to consider backing off.
         */
        spin_lock(&delayed_refs_rsv->lock);
        available += delayed_refs_rsv->reserved;
        exceeded = (delayed_refs_rsv->size >= available);
        spin_unlock(&delayed_refs_rsv->lock);

        return exceeded;
}

/*
 * Release the reservation held for delayed refs and csum deletions.
 *
 * @fs_info:  the filesystem
 * @nr_refs:  number of delayed refs to drop
 * @nr_csums: number of csum items to drop
 *
 * Computes the bytes accounted for @nr_refs delayed refs plus @nr_csums csum
 * items, releases that amount from the delayed refs rsv and emits a trace
 * event if anything was actually released.
 */
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums)
{
        const u64 ref_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs);
        const u64 csum_bytes = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);
        u64 freed;

        freed = btrfs_block_rsv_release(fs_info, &fs_info->delayed_refs_rsv,
                                        ref_bytes + csum_bytes, NULL);
        if (freed == 0)
                return;

        trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
                                      0, freed, 0);
}

/*
 * Grow the delayed refs rsv by what this transaction handle has queued.
 *
 * Call this whenever trans->delayed_ref_updates or
 * trans->delayed_ref_csum_deletions may have changed.  The extra size is
 * computed from both counters and added to the delayed_refs_rsv, and both
 * counters are reset to zero afterwards.
 */
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_rsv *shared_rsv = &fs_info->delayed_refs_rsv;
        struct btrfs_block_rsv *trans_rsv = &trans->delayed_rsv;
        u64 wanted;
        u64 moved;

        wanted = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
        wanted += btrfs_calc_delayed_ref_csum_bytes(fs_info,
                                                    trans->delayed_ref_csum_deletions);
        if (!wanted)
                return;

        /*
         * Take as much of the wanted amount as possible from the handle's
         * local delayed reserve.  Whatever it cannot cover stays as a
         * deficit in the delayed refs reserve: it gets refilled by a later
         * btrfs_delayed_refs_rsv_refill() call, or, if a transaction commit
         * comes first, the global block reserve is used.  Relying on the
         * global block reserve should be kept to a minimum for things we can
         * account for up front, so that it is not exhausted and we do not
         * hit -ENOSPC during a transaction commit.
         */
        spin_lock(&trans_rsv->lock);
        moved = trans_rsv->reserved;
        if (moved > wanted)
                moved = wanted;
        trans_rsv->reserved -= moved;
        trans_rsv->full = (trans_rsv->reserved >= trans_rsv->size);
        spin_unlock(&trans_rsv->lock);

        /* The size grows by the full amount, the reservation by what we got. */
        spin_lock(&shared_rsv->lock);
        shared_rsv->size += wanted;
        shared_rsv->reserved += moved;
        shared_rsv->full = (shared_rsv->reserved >= shared_rsv->size);
        spin_unlock(&shared_rsv->lock);

        trans->delayed_ref_updates = 0;
        trans->delayed_ref_csum_deletions = 0;
}

/*
 * Adjust the size of the delayed refs block reserve for 1 block group item
 * insertion, used after allocating a block group.
 */
void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;

        spin_lock(&delayed_rsv->lock);
        /*
         * Inserting a block group item does not require changing the free space
         * tree, only the extent tree or the block group tree, so this is all we
         * need.
         */
        delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1);
        delayed_rsv->full = false;
        spin_unlock(&delayed_rsv->lock);
}

/*
 * Shrink the delayed refs block reserve by the space that was accounted for
 * one block group item insertion, tracing whatever ends up being released.
 */
void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
{
        u64 freed;

        freed = btrfs_block_rsv_release(fs_info, &fs_info->delayed_refs_rsv,
                                        btrfs_calc_insert_metadata_size(fs_info, 1),
                                        NULL);
        if (freed == 0)
                return;

        trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
                                      0, freed, 0);
}

/*
 * Adjust the size of the delayed refs block reserve for 1 block group item
 * update.
 */
void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;

        spin_lock(&delayed_rsv->lock);
        /*
         * Updating a block group item does not result in new nodes/leaves and
         * does not require changing the free space tree, only the extent tree
         * or the block group tree, so this is all we need.
         */
        delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1);
        delayed_rsv->full = false;
        spin_unlock(&delayed_rsv->lock);
}

/*
 * Shrink the delayed refs block reserve by the space that was accounted for
 * one block group item update, tracing whatever ends up being released.
 */
void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
{
        u64 freed;

        freed = btrfs_block_rsv_release(fs_info, &fs_info->delayed_refs_rsv,
                                        btrfs_calc_metadata_size(fs_info, 1),
                                        NULL);
        if (freed == 0)
                return;

        trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
                                      0, freed, 0);
}

/*
 * Move previously reserved bytes into the delayed refs rsv.
 *
 * @fs_info:   the filesystem
 * @num_bytes: number of bytes to transfer
 *
 * At most @num_bytes are added to the delayed_refs_rsv, and never more than
 * the reserve is currently short of its size.  The remainder is handed back
 * to the space info.
 */
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
                                       u64 num_bytes)
{
        struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
        u64 shortfall = 0;
        u64 to_add;
        u64 to_free;

        spin_lock(&delayed_refs_rsv->lock);
        if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
                shortfall = delayed_refs_rsv->size - delayed_refs_rsv->reserved;

        /* Keep only what the reserve is missing, the rest is surplus. */
        to_add = (num_bytes > shortfall) ? shortfall : num_bytes;
        to_free = num_bytes - to_add;

        delayed_refs_rsv->reserved += to_add;
        if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
                delayed_refs_rsv->full = true;
        spin_unlock(&delayed_refs_rsv->lock);

        if (to_add)
                trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
                                              0, to_add, 1);
        if (to_free)
                btrfs_space_info_free_bytes_may_use(fs_info,
                                delayed_refs_rsv->space_info, to_free);
}

/*
 * Refill the delayed refs rsv based on its current deficit.
 *
 * @fs_info: the filesystem
 * @flush:   control how we can flush for this reservation.
 *
 * Refills the delayed block_rsv by at most one delayed ref item's worth of
 * space.  Returns 0 if nothing was needed or the reservation succeeded,
 * otherwise the error returned by btrfs_reserve_metadata_bytes().
 */
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
                                  enum btrfs_reserve_flush_enum flush)
{
        struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
        struct btrfs_space_info *space_info = block_rsv->space_info;
        const u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
        u64 num_bytes = 0;
        u64 shortfall = 0;
        u64 refilled_bytes;
        u64 to_free;
        int ret;

        /* Work out how much is missing, capped at one item's worth. */
        spin_lock(&block_rsv->lock);
        if (block_rsv->reserved < block_rsv->size) {
                shortfall = block_rsv->size - block_rsv->reserved;
                num_bytes = (shortfall < limit) ? shortfall : limit;
        }
        spin_unlock(&block_rsv->lock);

        if (num_bytes == 0)
                return 0;

        ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush);
        if (ret)
                return ret;

        /*
         * Someone else may have refilled the reserve while we were not
         * holding the lock, so recompute the deficit and give back whatever
         * turns out to be excess.
         */
        spin_lock(&block_rsv->lock);
        shortfall = 0;
        if (block_rsv->reserved < block_rsv->size)
                shortfall = block_rsv->size - block_rsv->reserved;

        refilled_bytes = (num_bytes < shortfall) ? num_bytes : shortfall;
        to_free = num_bytes - refilled_bytes;
        block_rsv->reserved += refilled_bytes;
        /* Only mark the reserve full when we closed an actual deficit. */
        if (shortfall > 0 && refilled_bytes == shortfall)
                block_rsv->full = true;
        spin_unlock(&block_rsv->lock);

        if (to_free > 0)
                btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);

        if (refilled_bytes > 0)
                trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
                                              refilled_bytes, 1);
        return 0;
}

/*
 * Compare two delayed data backrefs that share bytenr and type.
 *
 * Orders by data_ref.objectid first and data_ref.offset second.  Returns -1,
 * 0 or 1 when ref1 sorts before, equal to or after ref2.
 */
static int comp_data_refs(struct btrfs_delayed_ref_node *ref1,
                          struct btrfs_delayed_ref_node *ref2)
{
        if (ref1->data_ref.objectid != ref2->data_ref.objectid)
                return (ref1->data_ref.objectid < ref2->data_ref.objectid) ? -1 : 1;

        if (ref1->data_ref.offset != ref2->data_ref.offset)
                return (ref1->data_ref.offset < ref2->data_ref.offset) ? -1 : 1;

        return 0;
}

/*
 * Compare two delayed refs to order them inside a head's ref tree.
 *
 * Refs are ordered by type first.  Shared refs (BTRFS_SHARED_BLOCK_REF_KEY
 * and BTRFS_SHARED_DATA_REF_KEY) are then ordered by parent; every other type
 * is ordered by ref_root and, for BTRFS_EXTENT_DATA_REF_KEY, additionally by
 * the data ref's objectid and offset.  When @check_seq is true, seq is used
 * as the final tie breaker.
 *
 * Returns -1 if ref1 sorts before ref2, 1 if it sorts after, 0 if they are
 * equal.  The result must be antisymmetric, since it drives rb-tree
 * insertion in tree_insert() and the merge check in merge_ref().
 */
static int comp_refs(struct btrfs_delayed_ref_node *ref1,
                     struct btrfs_delayed_ref_node *ref2,
                     bool check_seq)
{
        int ret = 0;

        if (ref1->type < ref2->type)
                return -1;
        if (ref1->type > ref2->type)
                return 1;
        if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY ||
            ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
                if (ref1->parent < ref2->parent)
                        return -1;
                if (ref1->parent > ref2->parent)
                        return 1;
        } else {
                if (ref1->ref_root < ref2->ref_root)
                        return -1;
                /* A larger ref_root sorts after, so this must be 1, not -1. */
                if (ref1->ref_root > ref2->ref_root)
                        return 1;
                if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY)
                        ret = comp_data_refs(ref1, ref2);
        }
        if (ret)
                return ret;
        if (check_seq) {
                if (ref1->seq < ref2->seq)
                        return -1;
                if (ref1->seq > ref2->seq)
                        return 1;
        }
        return 0;
}

/*
 * Insert a new ref head into the head ref rbtree, keyed by bytenr.
 *
 * Returns NULL if @node was linked into the tree, or the already present
 * head with the same bytenr, in which case nothing is inserted.
 */
static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
                                                   struct rb_node *node)
{
        struct btrfs_delayed_ref_head *new_head =
                rb_entry(node, struct btrfs_delayed_ref_head, href_node);
        const u64 bytenr = new_head->bytenr;
        struct rb_node **link = &root->rb_root.rb_node;
        struct rb_node *parent = NULL;
        bool leftmost = true;

        while (*link) {
                struct btrfs_delayed_ref_head *cur;

                parent = *link;
                cur = rb_entry(parent, struct btrfs_delayed_ref_head,
                               href_node);

                if (bytenr == cur->bytenr)
                        return cur;

                if (bytenr < cur->bytenr) {
                        link = &parent->rb_left;
                } else {
                        /* Going right even once means we are not leftmost. */
                        link = &parent->rb_right;
                        leftmost = false;
                }
        }

        rb_link_node(node, parent, link);
        rb_insert_color_cached(node, root, leftmost);
        return NULL;
}

/*
 * Insert a delayed ref into a head's ref tree, ordered by comp_refs() with
 * the sequence number taken into account.
 *
 * Returns NULL if @ins was linked into the tree, or the existing ref that
 * compares equal to it, in which case nothing is inserted.
 */
static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
                struct btrfs_delayed_ref_node *ins)
{
        struct rb_node **link = &root->rb_root.rb_node;
        struct rb_node *parent = NULL;
        bool leftmost = true;

        while (*link) {
                struct btrfs_delayed_ref_node *cur;
                int cmp;

                parent = *link;
                cur = rb_entry(parent, struct btrfs_delayed_ref_node,
                               ref_node);
                cmp = comp_refs(ins, cur, true);
                if (cmp == 0)
                        return cur;

                if (cmp < 0) {
                        link = &parent->rb_left;
                } else {
                        /* Going right even once means we are not leftmost. */
                        link = &parent->rb_right;
                        leftmost = false;
                }
        }

        rb_link_node(&ins->ref_node, parent, link);
        rb_insert_color_cached(&ins->ref_node, root, leftmost);
        return NULL;
}

/*
 * Return the first (leftmost) delayed ref head in the href tree, or NULL if
 * the tree is empty.
 */
static struct btrfs_delayed_ref_head *find_first_ref_head(
                struct btrfs_delayed_ref_root *dr)
{
        struct rb_node *first = rb_first_cached(&dr->href_root);

        return first ? rb_entry(first, struct btrfs_delayed_ref_head, href_node)
                     : NULL;
}

/*
 * Look up a delayed ref head by bytenr.
 *
 * Returns the head whose bytenr matches exactly.  Without an exact match the
 * result is NULL, unless @return_bigger is set, in which case the next head
 * with a bigger bytenr is returned (or NULL if there is none).
 */
static struct btrfs_delayed_ref_head *find_ref_head(
                struct btrfs_delayed_ref_root *dr, u64 bytenr,
                bool return_bigger)
{
        struct btrfs_delayed_ref_head *last = NULL;
        struct rb_node *cur = dr->href_root.rb_root.rb_node;

        while (cur) {
                last = rb_entry(cur, struct btrfs_delayed_ref_head, href_node);

                if (bytenr == last->bytenr)
                        return last;

                cur = (bytenr < last->bytenr) ? cur->rb_left : cur->rb_right;
        }

        /* No exact match: only continue if the caller accepts a successor. */
        if (!last || !return_bigger)
                return NULL;

        /* The last head visited is already the next bigger one. */
        if (bytenr <= last->bytenr)
                return last;

        /* Otherwise the in-order successor of the last head is the answer. */
        cur = rb_next(&last->href_node);
        if (!cur)
                return NULL;

        return rb_entry(cur, struct btrfs_delayed_ref_head, href_node);
}

/*
 * Lock a delayed ref head's mutex.
 *
 * The caller must hold delayed_refs->lock.  If the mutex cannot be taken
 * without blocking, delayed_refs->lock is dropped while waiting for it and
 * retaken afterwards; a reference on the head is taken before the unlock and
 * dropped again once the spinlock has been reacquired.
 *
 * Returns 0 with head->mutex held, or -EAGAIN with head->mutex released if,
 * after waiting, the head's href_node is found cleared (as is done by
 * btrfs_delete_ref_head()).  delayed_refs->lock is held on return in both
 * cases.
 */
int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
                           struct btrfs_delayed_ref_head *head)
{
        lockdep_assert_held(&delayed_refs->lock);
        /* Fast path: uncontended mutex, the spinlock never gets dropped. */
        if (mutex_trylock(&head->mutex))
                return 0;

        /* Take a reference on the head before dropping the spinlock. */
        refcount_inc(&head->refs);
        spin_unlock(&delayed_refs->lock);

        mutex_lock(&head->mutex);
        spin_lock(&delayed_refs->lock);
        /* The head's node was cleared while we slept: tell the caller to retry. */
        if (RB_EMPTY_NODE(&head->href_node)) {
                mutex_unlock(&head->mutex);
                btrfs_put_delayed_ref_head(head);
                return -EAGAIN;
        }
        btrfs_put_delayed_ref_head(head);
        return 0;
}

/*
 * Remove a delayed ref from its head and undo the accounting done for it.
 *
 * The caller must hold head->lock.  The ref is erased from head->ref_tree,
 * unlinked from the add list if it is on one, and one reference on it is
 * put.  The delayed refs entry counter is decremented and one delayed ref's
 * worth of space is released from the delayed refs rsv.
 */
static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
                                    struct btrfs_delayed_ref_root *delayed_refs,
                                    struct btrfs_delayed_ref_head *head,
                                    struct btrfs_delayed_ref_node *ref)
{
        lockdep_assert_held(&head->lock);
        rb_erase_cached(&ref->ref_node, &head->ref_tree);
        /* Mark the node as no longer linked into any tree. */
        RB_CLEAR_NODE(&ref->ref_node);
        if (!list_empty(&ref->add_list))
                list_del(&ref->add_list);
        btrfs_put_delayed_ref(ref);
        atomic_dec(&delayed_refs->num_entries);
        /* One ref, no csum items. */
        btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}

/*
 * Try to merge the refs following @ref in the head's ref tree into @ref.
 *
 * Walks the successors of @ref for as long as they compare equal to it
 * (ignoring seq) and, when @seq is non-zero, have a seq below @seq.  Each
 * such successor is folded into the surviving ref and dropped: its ref_mod
 * is added when the actions match and subtracted when they differ.  If the
 * resulting ref_mod is zero the surviving ref is dropped as well.
 *
 * Returns true if the ref passed in may no longer be in the tree (the
 * survivor was swapped with a successor, or it was dropped); the caller
 * restarts its iteration in that case.  Returns false otherwise.
 */
static bool merge_ref(struct btrfs_fs_info *fs_info,
                      struct btrfs_delayed_ref_root *delayed_refs,
                      struct btrfs_delayed_ref_head *head,
                      struct btrfs_delayed_ref_node *ref,
                      u64 seq)
{
        struct btrfs_delayed_ref_node *next;
        struct rb_node *node = rb_next(&ref->ref_node);
        bool done = false;

        while (!done && node) {
                int mod;

                next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
                /* Advance first: 'next' may be dropped from the tree below. */
                node = rb_next(node);
                if (seq && next->seq >= seq)
                        break;
                /* Stop at the first successor that is a different ref. */
                if (comp_refs(ref, next, false))
                        break;

                if (ref->action == next->action) {
                        mod = next->ref_mod;
                } else {
                        /*
                         * Opposite actions cancel out.  Keep the ref with the
                         * larger ref_mod as the survivor so the result stays
                         * positive; after a swap the original ref is the one
                         * that gets dropped, so stop once this round is done.
                         */
                        if (ref->ref_mod < next->ref_mod) {
                                swap(ref, next);
                                done = true;
                        }
                        mod = -next->ref_mod;
                }

                drop_delayed_ref(fs_info, delayed_refs, head, next);
                ref->ref_mod += mod;
                if (ref->ref_mod == 0) {
                        /* Fully cancelled out: the survivor goes away too. */
                        drop_delayed_ref(fs_info, delayed_refs, head, ref);
                        done = true;
                } else {
                        /*
                         * Can't have multiples of the same ref on a tree block.
                         */
                        WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
                                ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
                }
        }

        return done;
}

/*
 * Merge the delayed refs of a head that can be combined.
 *
 * The caller must hold head->lock.  Nothing is done for heads with an empty
 * ref tree or for data heads.  Refs whose seq is at or above the lowest tree
 * mod log sequence (when that is non-zero) are left alone.
 */
void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
                              struct btrfs_delayed_ref_root *delayed_refs,
                              struct btrfs_delayed_ref_head *head)
{
        struct rb_node *cur;
        u64 seq;

        lockdep_assert_held(&head->lock);

        /* Data heads do not have too many refs to merge, so skip them. */
        if (RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->is_data)
                return;

        seq = btrfs_tree_mod_log_lowest_seq(fs_info);

        cur = rb_first_cached(&head->ref_tree);
        while (cur) {
                struct btrfs_delayed_ref_node *ref;

                ref = rb_entry(cur, struct btrfs_delayed_ref_node, ref_node);
                if ((!seq || ref->seq < seq) &&
                    merge_ref(fs_info, delayed_refs, head, ref, seq)) {
                        /* The tree changed under us: start over. */
                        cur = rb_first_cached(&head->ref_tree);
                        continue;
                }
                cur = rb_next(cur);
        }
}

/*
 * Check whether a delayed ref with sequence number @seq must be held back.
 *
 * Returns 1 (and logs a debug message) when the lowest tree mod log sequence
 * is non-zero and @seq is not below it, 0 otherwise.
 */
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
        const u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);

        if (min_seq == 0 || seq < min_seq)
                return 0;

        btrfs_debug(fs_info,
                    "holding back delayed_ref %llu, lowest is %llu",
                    seq, min_seq);
        return 1;
}

/*
 * Pick the next delayed ref head to process.
 *
 * The caller must hold delayed_refs->lock.  The search starts at
 * delayed_refs->run_delayed_start and wraps around to the beginning of the
 * tree once.  Heads already marked as processing are skipped.
 *
 * On success the head is marked as processing, num_heads_ready is
 * decremented and run_delayed_start is moved just past the selected head's
 * extent.  Returns the selected head, or NULL if there is none available.
 */
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
                struct btrfs_delayed_ref_root *delayed_refs)
{
        struct btrfs_delayed_ref_head *head;

        lockdep_assert_held(&delayed_refs->lock);
again:
        /* First head at or after the current starting point. */
        head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
                             true);
        /* Nothing at or after the start: wrap around to the first head. */
        if (!head && delayed_refs->run_delayed_start != 0) {
                delayed_refs->run_delayed_start = 0;
                head = find_first_ref_head(delayed_refs);
        }
        if (!head)
                return NULL;

        /* Skip over heads that are already being processed. */
        while (head->processing) {
                struct rb_node *node;

                node = rb_next(&head->href_node);
                if (!node) {
                        /*
                         * Hit the end of the tree.  If the search already
                         * began at 0 every head is busy; otherwise restart
                         * from the beginning.
                         */
                        if (delayed_refs->run_delayed_start == 0)
                                return NULL;
                        delayed_refs->run_delayed_start = 0;
                        goto again;
                }
                head = rb_entry(node, struct btrfs_delayed_ref_head,
                                href_node);
        }

        head->processing = true;
        WARN_ON(delayed_refs->num_heads_ready == 0);
        delayed_refs->num_heads_ready--;
        /* The next search resumes right after this head's extent. */
        delayed_refs->run_delayed_start = head->bytenr +
                head->num_bytes;
        return head;
}

/*
 * Remove a delayed ref head from the href tree and update the counters.
 *
 * The caller must hold both delayed_refs->lock and head->lock.  The ready
 * counter is only decremented for heads not marked as processing.
 */
void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
                           struct btrfs_delayed_ref_head *head)
{
        lockdep_assert_held(&delayed_refs->lock);
        lockdep_assert_held(&head->lock);

        /* Unlink the head and mark its node as not being in any tree. */
        rb_erase_cached(&head->href_node, &delayed_refs->href_root);
        RB_CLEAR_NODE(&head->href_node);

        atomic_dec(&delayed_refs->num_entries);
        delayed_refs->num_heads--;

        if (head->processing)
                return;

        delayed_refs->num_heads_ready--;
}

/*
 * Helper to insert a ref node into a head's ref tree, or merge it into an
 * equal ref that is already there.
 *
 * Takes href->lock for the duration of the operation.  A newly inserted ref
 * bumps the delayed refs entry counter and trans->delayed_ref_updates, and
 * is put on the head's ref_add_list when its action is
 * BTRFS_ADD_DELAYED_REF.
 *
 * Return false if the ref was inserted.
 * Return true if the ref was merged into an existing one (and therefore can be
 * freed by the caller).
 */
static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
                               struct btrfs_delayed_ref_head *href,
                               struct btrfs_delayed_ref_node *ref)
{
        struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs;
        struct btrfs_delayed_ref_node *exist;
        int mod;

        spin_lock(&href->lock);
        /* tree_insert() returns the equal ref already in the tree, if any. */
        exist = tree_insert(&href->ref_tree, ref);
        if (!exist) {
                if (ref->action == BTRFS_ADD_DELAYED_REF)
                        list_add_tail(&ref->add_list, &href->ref_add_list);
                atomic_inc(&root->num_entries);
                spin_unlock(&href->lock);
                trans->delayed_ref_updates++;
                return false;
        }

        /* Now we are sure we can merge */
        if (exist->action == ref->action) {
                mod = ref->ref_mod;
        } else {
                /* Need to change action */
                if (exist->ref_mod < ref->ref_mod) {
                        /*
                         * The new ref outweighs the existing one: the
                         * existing node takes over the new action and
                         * ref_mod, and the old ref_mod is subtracted below.
                         */
                        exist->action = ref->action;
                        mod = -exist->ref_mod;
                        exist->ref_mod = ref->ref_mod;
                        /* Keep ref_add_list membership in sync with the action. */
                        if (ref->action == BTRFS_ADD_DELAYED_REF)
                                list_add_tail(&exist->add_list,
                                              &href->ref_add_list);
                        else if (ref->action == BTRFS_DROP_DELAYED_REF) {
                                ASSERT(!list_empty(&exist->add_list));
                                list_del(&exist->add_list);
                        } else {
                                ASSERT(0);
                        }
                } else
                        mod = -ref->ref_mod;
        }
        exist->ref_mod += mod;

        /* remove the existing ref if its ref_mod dropped to zero */
        if (exist->ref_mod == 0)
                drop_delayed_ref(trans->fs_info, root, href, exist);
        spin_unlock(&href->lock);
        return true;
}

/*
 * Helper to fold a freshly built head ref (@update) into the head ref that
 * is already in the tree (@existing).  Both must describe the same bytenr.
 *
 * @trans:     transaction handle, supplies the delayed ref root and fs_info
 * @existing:  head ref already in the tree; modified under existing->lock
 * @update:    new head ref whose state is merged into @existing.  It is not
 *             freed here, but its extent_op is either handed over to
 *             @existing or merged into the existing one and freed.
 */
static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_delayed_ref_head *existing,
                         struct btrfs_delayed_ref_head *update)
{
        struct btrfs_delayed_ref_root *delayed_refs =
                &trans->transaction->delayed_refs;
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int old_ref_mod;

        /* A head cannot switch between data and metadata. */
        BUG_ON(existing->is_data != update->is_data);

        spin_lock(&existing->lock);

        /*
         * When freeing an extent, we may not know the owning root when we
         * first create the head_ref. However, some deref before the last deref
         * will know it, so we just need to update the head_ref accordingly.
         */
        if (!existing->owning_root)
                existing->owning_root = update->owning_root;

        if (update->must_insert_reserved) {
                /*
                 * If the extent was freed and then reallocated before the
                 * delayed ref entries were processed, we can end up with an
                 * existing head ref without the must_insert_reserved flag
                 * set.  Set it again here.
                 */
                existing->must_insert_reserved = update->must_insert_reserved;
                existing->owning_root = update->owning_root;

                /*
                 * Update the num_bytes so we make sure the accounting is done
                 * correctly.
                 */
                existing->num_bytes = update->num_bytes;

        }

        if (update->extent_op) {
                if (!existing->extent_op) {
                        /* No pending extent op yet, take over the new one. */
                        existing->extent_op = update->extent_op;
                } else {
                        /*
                         * Merge the new extent op into the pending one: a new
                         * key replaces the old one, flags are OR-ed together.
                         * The new extent op is no longer needed afterwards.
                         */
                        if (update->extent_op->update_key) {
                                memcpy(&existing->extent_op->key,
                                       &update->extent_op->key,
                                       sizeof(update->extent_op->key));
                                existing->extent_op->update_key = true;
                        }
                        if (update->extent_op->update_flags) {
                                existing->extent_op->flags_to_set |=
                                        update->extent_op->flags_to_set;
                                existing->extent_op->update_flags = true;
                        }
                        btrfs_free_delayed_extent_op(update->extent_op);
                }
        }
        /*
         * Update the reference mod on the head to reflect this new operation.
         * We only need the lock for this case because we could be processing
         * it currently; for refs we just added we know we're a-ok.
         */
        old_ref_mod = existing->total_ref_mod;
        existing->ref_mod += update->ref_mod;
        existing->total_ref_mod += update->ref_mod;

        /*
         * If we are going from a positive ref mod to a negative one or vice
         * versa we need to make sure to adjust pending_csums accordingly.
         * We reserve bytes for csum deletion when adding or updating a ref
         * head, see add_delayed_ref_head() for more details.
         */
        if (existing->is_data) {
                u64 csum_leaves =
                        btrfs_csum_bytes_to_leaves(fs_info,
                                                   existing->num_bytes);

                /* Was negative, no longer is: give the csum accounting back. */
                if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
                        delayed_refs->pending_csums -= existing->num_bytes;
                        btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves);
                }
                /* Just became negative: account for the csum deletion. */
                if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
                        delayed_refs->pending_csums += existing->num_bytes;
                        trans->delayed_ref_csum_deletions += csum_leaves;
                }
        }

        spin_unlock(&existing->lock);
}

/*
 * Fill in a newly allocated head ref from a generic ref description and,
 * if given, prime the matching qgroup extent record.
 *
 * @head_ref:     head ref to initialize
 * @generic_ref:  description of the modification (action, type, bytenr, ...)
 * @qrecord:      optional qgroup extent record to fill in, may be NULL
 * @reserved:     reserved bytes to record, only allowed for data refs
 */
static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
                                  struct btrfs_ref *generic_ref,
                                  struct btrfs_qgroup_extent_record *qrecord,
                                  u64 reserved)
{
        const int action = generic_ref->action;
        bool must_insert_reserved = false;
        int count_mod;

        /* If reserved is provided, it must be a data extent. */
        BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved);

        /*
         * The head stores the sum of all the mods: a head-only update does
         * not change it, dropping a ref lowers it by one and everything else
         * raises it by one.
         */
        if (action == BTRFS_UPDATE_DELAYED_HEAD) {
                count_mod = 0;
        } else if (action == BTRFS_DROP_DELAYED_REF) {
                count_mod = -1;
        } else {
                count_mod = 1;
                /*
                 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the
                 * reserved accounting when the extent is finally added, or
                 * if a later modification deletes the delayed ref without
                 * ever inserting the extent into the extent allocation tree.
                 * must_insert_reserved is the flag used to record that those
                 * accounting mods are required.
                 */
                if (action == BTRFS_ADD_DELAYED_EXTENT)
                        must_insert_reserved = true;
        }

        refcount_set(&head_ref->refs, 1);
        spin_lock_init(&head_ref->lock);
        mutex_init(&head_ref->mutex);

        /* Not linked anywhere yet and with no refs attached. */
        RB_CLEAR_NODE(&head_ref->href_node);
        head_ref->ref_tree = RB_ROOT_CACHED;
        INIT_LIST_HEAD(&head_ref->ref_add_list);
        head_ref->processing = false;

        head_ref->bytenr = generic_ref->bytenr;
        head_ref->num_bytes = generic_ref->num_bytes;
        head_ref->owning_root = generic_ref->owning_root;
        head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA);
        head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID);

        head_ref->ref_mod = count_mod;
        head_ref->total_ref_mod = count_mod;
        head_ref->reserved_bytes = reserved;
        head_ref->must_insert_reserved = must_insert_reserved;

        if (!qrecord)
                return;

        /* Only remember the data reservation when we know who owns it. */
        if (generic_ref->ref_root && reserved) {
                qrecord->data_rsv = reserved;
                qrecord->data_rsv_refroot = generic_ref->ref_root;
        }
        qrecord->bytenr = generic_ref->bytenr;
        qrecord->num_bytes = generic_ref->num_bytes;
        qrecord->old_roots = NULL;
}

/*
 * Helper that actually inserts a head node into the head ref rbtree and
 * keeps the overall modification counts correct.
 *
 * @trans:                 transaction handle
 * @head_ref:              new head ref; freed here if a head for the same
 *                         extent already exists in the tree
 * @qrecord:               optional qgroup extent record; freed here if
 *                         btrfs_qgroup_trace_extent_nolock() returns non-zero
 * @action:                only used for the tracepoint
 * @qrecord_inserted_ret:  optional, set to whether @qrecord was inserted
 *
 * Both callers in this file invoke this with delayed_refs->lock held.
 *
 * Return the head ref that is in the tree afterwards: either @head_ref or
 * the pre-existing head it was merged into.
 */
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_trans_handle *trans,
                     struct btrfs_delayed_ref_head *head_ref,
                     struct btrfs_qgroup_extent_record *qrecord,
                     int action, bool *qrecord_inserted_ret)
{
        struct btrfs_delayed_ref_root *delayed_refs =
                &trans->transaction->delayed_refs;
        struct btrfs_delayed_ref_head *existing;
        bool qrecord_inserted = false;

        /* Record qgroup extent info if provided */
        if (qrecord) {
                qrecord_inserted =
                        !btrfs_qgroup_trace_extent_nolock(trans->fs_info,
                                                          delayed_refs,
                                                          qrecord);
                if (!qrecord_inserted)
                        kfree(qrecord);
        }

        trace_add_delayed_ref_head(trans->fs_info, head_ref, action);

        existing = htree_insert(&delayed_refs->href_root,
                                &head_ref->href_node);
        if (!existing) {
                /*
                 * We reserve the amount of bytes needed to delete csums when
                 * adding the ref head and not when adding individual drop refs
                 * since the csum items are deleted only after running the last
                 * delayed drop ref (the data extent's ref count drops to 0).
                 */
                if (head_ref->is_data && head_ref->ref_mod < 0) {
                        delayed_refs->pending_csums += head_ref->num_bytes;
                        trans->delayed_ref_csum_deletions +=
                                btrfs_csum_bytes_to_leaves(trans->fs_info,
                                                           head_ref->num_bytes);
                }
                delayed_refs->num_heads++;
                delayed_refs->num_heads_ready++;
                atomic_inc(&delayed_refs->num_entries);
        } else {
                /*
                 * A head for this extent is already there: fold the new one
                 * into it and free the newly allocated head.
                 */
                update_existing_head_ref(trans, existing, head_ref);
                kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
                head_ref = existing;
        }

        if (qrecord_inserted_ret)
                *qrecord_inserted_ret = qrecord_inserted;

        return head_ref;
}

/*
 * Initialize the structure which represents a modification to an extent.
 *
 * @fs_info:      Filesystem the ref belongs to; its tree_mod_seq is sampled
 *                when the ref originates from an fs tree.
 *
 * @ref:          The delayed ref node which is going to be initialized.
 *
 * @generic_ref:  Description of the modification.  bytenr, num_bytes,
 *                ref_root, parent and the data/tree specific part are copied
 *                from it and the ref type is derived with btrfs_ref_type().
 *                Its action is stored as is, except that
 *                BTRFS_ADD_DELAYED_EXTENT is recorded as
 *                BTRFS_ADD_DELAYED_REF.
 *
 * The node starts with a reference count of 1, a ref_mod of 1 and is not
 * linked into any tree or list.
 */
static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
                                    struct btrfs_delayed_ref_node *ref,
                                    struct btrfs_ref *generic_ref)
{
        refcount_set(&ref->refs, 1);
        RB_CLEAR_NODE(&ref->ref_node);
        INIT_LIST_HEAD(&ref->add_list);

        ref->bytenr = generic_ref->bytenr;
        ref->num_bytes = generic_ref->num_bytes;
        ref->ref_root = generic_ref->ref_root;
        ref->parent = generic_ref->parent;
        ref->type = btrfs_ref_type(generic_ref);
        ref->ref_mod = 1;

        /* At the node level adding an extent is just adding a ref. */
        if (generic_ref->action == BTRFS_ADD_DELAYED_EXTENT)
                ref->action = BTRFS_ADD_DELAYED_REF;
        else
                ref->action = generic_ref->action;

        /* Only refs coming from fs trees carry a tree mod sequence number. */
        if (is_fstree(generic_ref->ref_root))
                ref->seq = atomic64_read(&fs_info->tree_mod_seq);
        else
                ref->seq = 0;

        if (generic_ref->type == BTRFS_REF_DATA)
                ref->data_ref = generic_ref->data_ref;
        else
                ref->tree_ref = generic_ref->tree_ref;
}

/*
 * Turn @generic_ref into a metadata ref for a tree block at @level.
 *
 * @generic_ref:  ref to set up, its ref_root must already be filled in
 * @level:        level of the tree block
 * @mod_root:     root doing the modification, 0 if not known
 * @skip_qgroup:  force qgroup accounting to be skipped for this ref
 *
 * Qgroup accounting is also skipped unless ref_root is an fs tree and
 * @mod_root is either 0 or an fs tree as well.
 */
void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
                         bool skip_qgroup)
{
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
        /* If @mod_root is not set, use ref_root as fallback */
        generic_ref->real_root = mod_root ? mod_root : generic_ref->ref_root;
#endif
        generic_ref->type = BTRFS_REF_METADATA;
        generic_ref->tree_ref.level = level;
        generic_ref->skip_qgroup = skip_qgroup ||
                !(is_fstree(generic_ref->ref_root) &&
                  (!mod_root || is_fstree(mod_root)));
}

/*
 * Turn @generic_ref into a data ref for the file extent of inode @ino at
 * @offset.
 *
 * @generic_ref:  ref to set up, its ref_root must already be filled in
 * @ino:          stored as the data ref's objectid
 * @offset:       stored as the data ref's offset
 * @mod_root:     root doing the modification, 0 if not known
 * @skip_qgroup:  force qgroup accounting to be skipped for this ref
 *
 * Qgroup accounting is also skipped unless ref_root is an fs tree and
 * @mod_root is either 0 or an fs tree as well.
 */
void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
                         u64 mod_root, bool skip_qgroup)
{
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
        /* If @mod_root is not set, use ref_root as fallback */
        generic_ref->real_root = mod_root ? mod_root : generic_ref->ref_root;
#endif
        generic_ref->type = BTRFS_REF_DATA;
        generic_ref->data_ref.objectid = ino;
        generic_ref->data_ref.offset = offset;
        generic_ref->skip_qgroup = skip_qgroup ||
                !(is_fstree(generic_ref->ref_root) &&
                  (!mod_root || is_fstree(mod_root)));
}

/*
 * Common worker for adding a delayed tree or data ref.
 *
 * Allocates the ref node, a head ref and, when full qgroup accounting is
 * enabled and not skipped for this ref, a qgroup extent record.  The head
 * and the node are then inserted (or merged) under delayed_refs->lock.
 *
 * @trans:        transaction handle
 * @generic_ref:  description of the modification
 * @extent_op:    optional extent op to attach to the head, may be NULL
 * @reserved:     reserved bytes to record in the head
 *
 * Return 0 on success, -ENOMEM if any allocation fails, or whatever
 * btrfs_qgroup_trace_extent_post() returns when a qgroup record was inserted.
 */
static int add_delayed_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_ref *generic_ref,
                           struct btrfs_delayed_extent_op *extent_op,
                           u64 reserved)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_qgroup_extent_record *record = NULL;
        struct btrfs_delayed_ref_head *head_ref;
        struct btrfs_delayed_ref_node *node;
        int action = generic_ref->action;
        bool qrecord_inserted;
        bool merged;

        node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS);
        if (!node)
                return -ENOMEM;

        head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
        if (!head_ref)
                goto free_node;

        if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
                record = kzalloc(sizeof(*record), GFP_NOFS);
                if (!record)
                        goto free_head;
        }

        init_delayed_ref_common(fs_info, node, generic_ref);
        init_delayed_ref_head(head_ref, generic_ref, record, reserved);
        head_ref->extent_op = extent_op;

        delayed_refs = &trans->transaction->delayed_refs;

        /*
         * Insert both the head node and the new ref without dropping the
         * spin lock in between.
         */
        spin_lock(&delayed_refs->lock);
        head_ref = add_delayed_ref_head(trans, head_ref, record,
                                        action, &qrecord_inserted);
        merged = insert_delayed_ref(trans, head_ref, node);
        spin_unlock(&delayed_refs->lock);

        /*
         * Need to update the delayed_refs_rsv with any changes we may have
         * made.
         */
        btrfs_update_delayed_refs_rsv(trans);

        if (generic_ref->type == BTRFS_REF_DATA)
                trace_add_delayed_data_ref(fs_info, node);
        else
                trace_add_delayed_tree_ref(fs_info, node);

        /* A merged node was not linked anywhere, so it is ours to free. */
        if (merged)
                kmem_cache_free(btrfs_delayed_ref_node_cachep, node);

        if (!qrecord_inserted)
                return 0;
        return btrfs_qgroup_trace_extent_post(trans, record);

free_head:
        kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
free_node:
        kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
        return -ENOMEM;
}

/*
 * Add a delayed tree ref. This does all of the accounting required to make sure
 * the delayed ref is eventually processed before this transaction commits.
 *
 * @trans:        transaction handle
 * @generic_ref:  description of the modification; must be a metadata ref
 *                with a non-zero action
 * @extent_op:    optional extent op to attach to the ref head
 *
 * Return whatever add_delayed_ref() returns: 0 on success or a negative
 * errno such as -ENOMEM.
 */
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
                               struct btrfs_ref *generic_ref,
                               struct btrfs_delayed_extent_op *extent_op)
{
        ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
        /* Tree refs never carry a data reservation, hence reserved == 0. */
        return add_delayed_ref(trans, generic_ref, extent_op, 0);
}

/*
 * Add a delayed data ref. It's similar to btrfs_add_delayed_tree_ref(), but
 * no extent op is attached and @reserved is recorded in the ref head.
 *
 * @trans:        transaction handle
 * @generic_ref:  description of the modification; must be a data ref with a
 *                non-zero action
 * @reserved:     reserved bytes passed through to add_delayed_ref()
 *
 * Return whatever add_delayed_ref() returns: 0 on success or a negative
 * errno such as -ENOMEM.
 */
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
                               struct btrfs_ref *generic_ref,
                               u64 reserved)
{
        ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action);
        return add_delayed_ref(trans, generic_ref, NULL, reserved);
}

/*
 * Queue an extent op for the extent at @bytenr without adding a ref node.
 *
 * A head ref with action BTRFS_UPDATE_DELAYED_HEAD is built for the extent
 * and inserted, or merged into the head that is already there, under
 * delayed_refs->lock.
 *
 * @trans:      transaction handle
 * @bytenr:     logical address of the extent
 * @num_bytes:  size of the extent
 * @extent_op:  extent op to attach to the head
 *
 * Return 0 on success or -ENOMEM if the head ref cannot be allocated.
 */
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
                                struct btrfs_delayed_extent_op *extent_op)
{
        struct btrfs_delayed_ref_root *refs_root =
                &trans->transaction->delayed_refs;
        struct btrfs_delayed_ref_head *head;
        struct btrfs_ref generic_ref = {
                .type = BTRFS_REF_METADATA,
                .action = BTRFS_UPDATE_DELAYED_HEAD,
                .bytenr = bytenr,
                .num_bytes = num_bytes,
        };

        head = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
        if (!head)
                return -ENOMEM;

        init_delayed_ref_head(head, &generic_ref, NULL, 0);
        head->extent_op = extent_op;

        spin_lock(&refs_root->lock);
        add_delayed_ref_head(trans, head, NULL, BTRFS_UPDATE_DELAYED_HEAD,
                             NULL);
        spin_unlock(&refs_root->lock);

        /*
         * Need to update the delayed_refs_rsv with any changes we may have
         * made.
         */
        btrfs_update_delayed_refs_rsv(trans);

        return 0;
}

/*
 * Drop one reference on @ref and free it once the last reference is gone.
 *
 * The node is expected to be unlinked from its ref tree by then; a warning
 * is emitted if it is not.
 */
void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
        if (!refcount_dec_and_test(&ref->refs))
                return;

        WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
        kmem_cache_free(btrfs_delayed_ref_node_cachep, ref);
}

/*
 * This does a simple search for the head node for a given extent.  Returns the
 * head node if found, or NULL if not.
 *
 * @delayed_refs:  delayed ref root to search, its lock must be held
 * @bytenr:        logical address of the extent to look up
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
        lockdep_assert_held(&delayed_refs->lock);

        /*
         * NOTE(review): the 'false' presumably asks find_ref_head() for an
         * exact match only -- confirm against its definition.
         */
        return find_ref_head(delayed_refs, bytenr, false);
}

/*
 * Destroy the slab caches used for delayed ref heads, delayed ref nodes and
 * delayed extent ops.
 *
 * Also used as the error path of btrfs_delayed_ref_init(), so it can run
 * when only some of the caches were created.
 */
void __cold btrfs_delayed_ref_exit(void)
{
        kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
        kmem_cache_destroy(btrfs_delayed_ref_node_cachep);
        kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}

/*
 * Create the slab caches for delayed ref heads, delayed ref nodes and
 * delayed extent ops.
 *
 * Creation stops at the first failure and whatever was set up so far is torn
 * down again through btrfs_delayed_ref_exit().
 * NOTE(review): that relies on kmem_cache_destroy() tolerating the cache
 * pointers that were never set -- confirm they start out as NULL.
 *
 * Return 0 on success, -ENOMEM if any cache cannot be created.
 */
int __init btrfs_delayed_ref_init(void)
{
        btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0);
        if (!btrfs_delayed_ref_head_cachep)
                goto destroy_caches;

        btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0);
        if (!btrfs_delayed_ref_node_cachep)
                goto destroy_caches;

        btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0);
        if (!btrfs_delayed_extent_op_cachep)
                goto destroy_caches;

        return 0;

destroy_caches:
        btrfs_delayed_ref_exit();
        return -ENOMEM;
}
























































    3 




    3 




    3 



























    3 


































    8 




    7 





























    8 


    1 




    8 





    8 











































































































































































































    6 














    1 


    1 




















    7 








    6 






    7 
    7 






















































































    1 






















    1 
    1 
































































    1 




































































































































































































    1 















    1 



























    1 



    1 




























    1 





















    1 


























    1 











































    1 













































    1 













































    1 




    1 











    1 


    1 













    1 
    1 

















    1 





    1 


    1 


    1 


























    1 


    1 















    1 





















    1 

    1 


























































































    8 






















    1 


    1 































    1 












    1 




    1 







































































    8 




    7 





































    7 


    7 









    8 




































    8 


    7 






    4 





    1 












    1 

























    8 


















    8 









    8 









    8 
    1 





    1 





    8 




    8 



















    8 







    8 








    7 
    1 









    7 
    8 
    8 



















    8 
    7 






    8 








    3 










    4 



















    8 



























    1 
    1 



    1 






    1 














    1 




    8 







    8 



    8 






























    8 

















































    1 

    1 



























    1 

    1 
    1 
    1 

    1 
















    1 













    1 

























    1 





    1 



    1 













    1 











    1 

    1 






    1 










    1 










    1 
















    7 

    7 






    7 





    6 









    7 








    7 



    7 





    7 




    7 


    7 




















    7 




    7 


    1 


    6 

























    7 




    6 











    1 













    1 







































    1 










    1 












    1 




    1 




    1 






    1 
























    1 






    1 

    1 
    1 















    6 











    7 









    7 








    6 









    7 
















    7 













































    7 




























































































































































































    3 







    3 





    3 

































    3 


    3 







    3 















    3 























    4 










    1 












    3 





    2 
    1 





















    2 








    2 












    2 


    2 
    2 

    2 

























































































    2 













    2 






    2 







    2 


    2 












    2 


    2 





    2 

    2 





    2 




    2 






    2 






























    2 

    2 





    2 




    2 

    2 








    1 





    1 







































    1 
    1 



















    1 

    1 




    1 



















    1 















    1 







    1 





    1 
















    1 











    1 






    1 












































































































































    1 

























    1 






















    1 



















    1 












    1 

    1 





    2 



    2 





    1 
    1 


    2 








    2 








    2 






    2 






    2 



    2 







    2 




















































































































































































    1 




    1 


    1 








    3 










    1 





    2 





    2 
    1 
















































































    3 





    3 
















    3 



























    3 





    3 


    3 



    3 












    3 
    1 

    3 





















    3 

    3 















    3 







    3 



















    3 





    3 

































    3 

    3 










    3 





















    3 


    3 
    3 
    3 





















































































































































    3 








    3 









    3 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/namei.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *  Directory entry file type support and forward compatibility hooks
 *        for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
 *  Hash Tree Directory indexing (c)
 *        Daniel Phillips, 2001
 *  Hash Tree Directory indexing porting
 *        Christopher Li, 2002
 *  Hash Tree Directory indexing cleanup
 *        Theodore Ts'o, 2002
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include "ext4.h"
#include "ext4_jbd2.h"

#include "xattr.h"
#include "acl.h"

#include <trace/events/ext4.h>
/*
 * define how far ahead to read directories while searching them.
 *
 * The readahead amount is expressed as NAMEI_RA_CHUNKS groups of
 * NAMEI_RA_BLOCKS blocks each; NAMEI_RA_SIZE is their product (8).
 */
#define NAMEI_RA_CHUNKS  2
#define NAMEI_RA_BLOCKS  4
#define NAMEI_RA_SIZE             (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)

/*
 * ext4_append() - grow directory @inode by one block.
 *
 * @handle: journal handle handed to the allocation and dirtying helpers
 * @inode:  directory being extended
 * @block:  out: logical number of the block being appended.  It is written
 *          as soon as the size-limit check passes, even if a later step
 *          fails.
 *
 * On success i_size and i_disksize have been advanced by one block, the
 * inode has been marked dirty and journal write access to the new buffer
 * has been obtained; the buffer_head is returned.
 *
 * On failure an ERR_PTR() is returned: -ENOSPC when the directory has
 * reached the s_max_dir_size_kb limit, -EFSCORRUPTED when the block just
 * past i_size turns out to be mapped already, otherwise whatever error
 * the helpers reported.
 */
static struct buffer_head *ext4_append(handle_t *handle,
                                        struct inode *inode,
                                        ext4_lblk_t *block)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_map_blocks map;
        struct buffer_head *bh;
        int err;

        /* A zero s_max_dir_size_kb means "no limit". */
        if (unlikely(sbi->s_max_dir_size_kb &&
                     (inode->i_size >> 10) >= sbi->s_max_dir_size_kb))
                return ERR_PTR(-ENOSPC);

        /* The new block sits immediately past the current end of the dir. */
        *block = inode->i_size >> sb->s_blocksize_bits;
        map.m_len = 1;
        map.m_lblk = *block;

        /*
         * We're appending new directory block. Make sure the block is not
         * allocated yet, otherwise we will end up corrupting the
         * directory.
         */
        err = ext4_map_blocks(NULL, inode, &map, 0);
        if (err < 0)
                return ERR_PTR(err);
        if (err > 0) {
                EXT4_ERROR_INODE(inode, "Logical block already allocated");
                return ERR_PTR(-EFSCORRUPTED);
        }

        bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
        if (IS_ERR(bh))
                return bh;

        inode->i_size += sb->s_blocksize;
        EXT4_I(inode)->i_disksize = inode->i_size;

        err = ext4_mark_inode_dirty(handle, inode);
        if (!err) {
                BUFFER_TRACE(bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, sb, bh,
                                                    EXT4_JTR_NONE);
                if (!err)
                        return bh;
        }

        /* Either step failed: drop our buffer reference and report. */
        brelse(bh);
        ext4_std_error(sb, err);
        return ERR_PTR(err);
}

/* Forward declaration: defined further down, used by __ext4_read_dirblock(). */
static int ext4_dx_csum_verify(struct inode *inode,
                               struct ext4_dir_entry *dirent);

/*
 * Hints to ext4_read_dirblock regarding whether we expect a directory
 * block being read to be an index block, or a block containing
 * directory entries (and if the latter, whether it was found via a
 * logical block in an htree index block).  This is used to control
 * what sort of sanity checking ext4_read_dirblock() will do on the
 * directory block read from the storage device.  EITHER means
 * the caller doesn't know what kind of directory block will be read,
 * so no specific verification will be done.
 */
typedef enum {
        EITHER,         /* caller does not know what kind of block this is */
        INDEX,          /* caller expects an htree index block */
        DIRENT,         /* caller expects a block of directory entries */
        DIRENT_HTREE    /* directory-entry block reached via an htree index */
} dirblock_type_t;

/*
 * Wrapper around __ext4_read_dirblock() that passes the calling function
 * name and line number, which that function forwards to the error and
 * warning helpers it calls.
 */
#define ext4_read_dirblock(inode, block, type) \
        __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__)

/*
 * __ext4_read_dirblock() - read and sanity-check one directory block.
 *
 * @inode: directory being read
 * @block: logical block number within the directory
 * @type:  what the caller expects the block to be (dirblock_type_t hint)
 * @func:  calling function name, forwarded to the error/warning helpers
 * @line:  calling line number, forwarded to the error/warning helpers
 *
 * Returns the buffer_head on success, NULL when ext4_bread() returns NULL
 * (a hole) and @type is neither INDEX nor DIRENT_HTREE, or an ERR_PTR():
 *   -EFSCORRUPTED  @block lies past i_size, an htree block is a hole, or a
 *                  leaf block was found where INDEX was requested
 *   -EFSBADCRC     checksum verification failed (or failure was simulated)
 *   other          whatever ext4_bread() reported (or a simulated -EIO)
 *
 * On every error path taken after the buffer was obtained, the buffer
 * reference is dropped before returning.
 */
static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
                                                ext4_lblk_t block,
                                                dirblock_type_t type,
                                                const char *func,
                                                unsigned int line)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_dir_entry *dirent;
        struct buffer_head *bh;
        int is_dx_block;

        if (block >= inode->i_size >> inode->i_blkbits) {
                ext4_error_inode(inode, func, line, block,
                       "Attempting to read directory block (%u) that is past i_size (%llu)",
                       block, inode->i_size);
                return ERR_PTR(-EFSCORRUPTED);
        }

        /* Fault injection may replace the real read with -EIO. */
        bh = ext4_simulate_fail(sb, EXT4_SIM_DIRBLOCK_EIO) ?
                ERR_PTR(-EIO) : ext4_bread(NULL, inode, block, 0);
        if (IS_ERR(bh)) {
                __ext4_warning(sb, func, line,
                               "inode #%lu: lblock %lu: comm %s: "
                               "error %ld reading directory block",
                               inode->i_ino, (unsigned long)block,
                               current->comm, PTR_ERR(bh));

                return bh;
        }

        if (!bh) {
                /* A hole is tolerated only where no htree block is expected. */
                if (type != INDEX && type != DIRENT_HTREE)
                        return NULL;

                ext4_error_inode(inode, func, line, block,
                                 "Directory hole found for htree %s block",
                                 (type == INDEX) ? "index" : "leaf");
                return ERR_PTR(-EFSCORRUPTED);
        }

        dirent = (struct ext4_dir_entry *) bh->b_data;

        /*
         * Determine whether or not we have an index block: in an indexed
         * directory, block 0 and any block whose first record spans the
         * whole block are treated as index blocks.
         */
        is_dx_block = is_dx(inode) &&
                      (block == 0 ||
                       ext4_rec_len_from_disk(dirent->rec_len,
                                              sb->s_blocksize) ==
                       sb->s_blocksize);

        if (type == INDEX && !is_dx_block) {
                ext4_error_inode(inode, func, line, block,
                       "directory leaf block found instead of index block");
                brelse(bh);
                return ERR_PTR(-EFSCORRUPTED);
        }

        /* No checksums on this fs, or this buffer was verified earlier. */
        if (!ext4_has_metadata_csum(sb) || buffer_verified(bh))
                return bh;

        if (is_dx_block) {
                /*
                 * An empty leaf block can get mistaken for an index block;
                 * for this reason, we can only check the index checksum
                 * when the caller is sure it should be an index block.
                 */
                if (type != INDEX)
                        return bh;

                if (!ext4_dx_csum_verify(inode, dirent) ||
                    ext4_simulate_fail(sb, EXT4_SIM_DIRBLOCK_CRC)) {
                        ext4_error_inode_err(inode, func, line, block,
                                             EFSBADCRC,
                                             "Directory index failed checksum");
                        brelse(bh);
                        return ERR_PTR(-EFSBADCRC);
                }
        } else {
                if (!ext4_dirblock_csum_verify(inode, bh) ||
                    ext4_simulate_fail(sb, EXT4_SIM_DIRBLOCK_CRC)) {
                        ext4_error_inode_err(inode, func, line, block,
                                             EFSBADCRC,
                                             "Directory block failed checksum");
                        brelse(bh);
                        return ERR_PTR(-EFSBADCRC);
                }
        }

        /* Remember the successful check so later reads can skip it. */
        set_buffer_verified(bh);
        return bh;
}

/*
 * dxtrace(command) expands to @command when DX_DEBUG is defined and to
 * nothing otherwise.
 */
#ifdef DX_DEBUG
#define dxtrace(command) command
#else
#define dxtrace(command)
#endif

/*
 * Eight-byte, dirent-shaped header (inode, rec_len, name_len, file_type)
 * embedded at the start of struct dx_root and struct dx_node.
 */
struct fake_dirent {
        __le32 inode;
        __le16 rec_len;
        u8 name_len;
        u8 file_type;
};

/*
 * Entry-count header of an htree block, as located by get_dx_countlimit()
 * at byte offset 8 (node) or 32 (root).  The checksum helpers treat @limit
 * as the number of dx_entry-sized slots before the dx_tail and @count as
 * the number of entries covered by the checksum.
 */
struct dx_countlimit {
        __le16 limit;
        __le16 count;
};

/* One htree index entry: a hash value paired with a block number. */
struct dx_entry {
        __le32 hash;
        __le32 block;
};

/*
 * dx_root_info is laid out so that if it should somehow get overlaid by a
 * dirent the two low bits of the hash version will be zero.  Therefore, the
 * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
 */
struct dx_root_info {
        __le32 reserved_zero;
        u8 hash_version;
        u8 info_length; /* 8 */
        u8 indirect_levels;
        u8 unused_flags;
};

/*
 * Layout of an htree root block: fake "." and ".." entries (an 8-byte
 * header plus a 4-byte name each), the 8-byte dx_root_info header, then the
 * index entries starting at byte offset 32.
 */
struct dx_root {
        struct fake_dirent dot;
        char dot_name[4];
        struct fake_dirent dotdot;
        char dotdot_name[4];
        struct dx_root_info info;
        struct dx_entry        entries[];
};

/*
 * Layout of a non-root htree index block: one 8-byte fake dirent header
 * followed directly by the index entries.
 */
struct dx_node {
        struct fake_dirent fake;
        struct dx_entry        entries[];
};


/*
 * In-memory handle on one htree index block.
 * NOTE(review): presumably @bh is the buffer holding the block, @entries
 * its dx_entry array and @at the entry currently selected during a
 * lookup -- confirm against dx_probe().
 */
struct dx_frame {
        struct buffer_head *bh;
        struct dx_entry *entries;
        struct dx_entry *at;
};

/*
 * In-memory (CPU-endian) record handled by dx_make_map(), dx_sort_map()
 * and dx_move_dirents().
 * NOTE(review): presumably the hash, in-block offset and size of one
 * directory entry -- confirm against those helpers.
 */
struct dx_map_entry {
        u32 hash;
        u16 offs;
        u16 size;
};

/*
 * This goes at the end of each htree block.
 *
 * The checksum helpers in this file locate it immediately after the
 * block's `limit` dx_entry-sized slots, counted from the dx_countlimit.
 */
struct dx_tail {
        u32 dt_reserved;
        __le32 dt_checksum;        /* crc32c(uuid+inum+dirblock) */
};

/*
 * Forward declarations of the htree (dx_*) helpers.  All of them are
 * static, so their definitions live elsewhere in this translation unit.
 */
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
static inline unsigned dx_get_hash(struct dx_entry *entry);
static void dx_set_hash(struct dx_entry *entry, unsigned value);
static unsigned dx_get_count(struct dx_entry *entries);
static unsigned dx_get_limit(struct dx_entry *entries);
static void dx_set_count(struct dx_entry *entries, unsigned value);
static void dx_set_limit(struct dx_entry *entries, unsigned value);
static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
static unsigned dx_node_limit(struct inode *dir);
static struct dx_frame *dx_probe(struct ext4_filename *fname,
                                 struct inode *dir,
                                 struct dx_hash_info *hinfo,
                                 struct dx_frame *frame);
static void dx_release(struct dx_frame *frames);
static int dx_make_map(struct inode *dir, struct buffer_head *bh,
                       struct dx_hash_info *hinfo,
                       struct dx_map_entry *map_tail);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from,
                                        char *to, struct dx_map_entry *offsets,
                                        int count, unsigned int blocksize);
static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base,
                                                unsigned int blocksize);
static void dx_insert_block(struct dx_frame *frame,
                                        u32 hash, ext4_lblk_t block);
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
                                 struct dx_frame *frame,
                                 struct dx_frame *frames,
                                 __u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
                struct ext4_filename *fname,
                struct ext4_dir_entry_2 **res_dir);
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
                             struct inode *dir, struct inode *inode);

/* checksumming functions */
/*
 * ext4_initialize_dirent_tail() - set up the checksum tail of a dirent block.
 * @bh:        buffer holding the directory block
 * @blocksize: size of the block in bytes
 *
 * Zeroes the ext4_dir_entry_tail located by EXT4_DIRENT_TAIL() and fills in
 * its record length and the EXT4_FT_DIR_CSUM marker.  The remaining fields,
 * including the checksum, are left at zero.
 */
void ext4_initialize_dirent_tail(struct buffer_head *bh,
                                 unsigned int blocksize)
{
        struct ext4_dir_entry_tail *tail;

        tail = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
        memset(tail, 0, sizeof(*tail));
        tail->det_reserved_ft = EXT4_FT_DIR_CSUM;
        tail->det_rec_len = ext4_rec_len_to_disk(sizeof(*tail), blocksize);
}

/*
 * get_dirent_tail() - find the checksum "dirent" at the tail of a block.
 * @inode: directory the block belongs to (supplies the block size)
 * @bh:    buffer holding the directory block
 *
 * With PARANOID defined, the block is walked record by record and the walk
 * must land exactly on the tail position; otherwise the tail position is
 * computed directly with EXT4_DIRENT_TAIL().
 *
 * Returns the tail, or NULL if what is found there does not carry the
 * expected zero fields, record length and EXT4_FT_DIR_CSUM marker.
 */
static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
                                                   struct buffer_head *bh)
{
        int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
        struct ext4_dir_entry_tail *tail;

#ifdef PARANOID
        struct ext4_dir_entry *de = (struct ext4_dir_entry *)bh->b_data;
        struct ext4_dir_entry *end = (struct ext4_dir_entry *)(bh->b_data +
                (blocksize - sizeof(struct ext4_dir_entry_tail)));

        /* Follow rec_len links until the tail position or a zero length. */
        while (de < end) {
                unsigned int len = ext4_rec_len_from_disk(de->rec_len,
                                                          blocksize);

                if (!len)
                        break;
                de = (struct ext4_dir_entry *)(((void *)de) + len);
        }

        if (de != end)
                return NULL;

        tail = (struct ext4_dir_entry_tail *)de;
#else
        tail = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
#endif

        if (tail->det_reserved_zero1 || tail->det_reserved_zero2)
                return NULL;
        if (tail->det_reserved_ft != EXT4_FT_DIR_CSUM)
                return NULL;
        if (ext4_rec_len_from_disk(tail->det_rec_len, blocksize) !=
            sizeof(struct ext4_dir_entry_tail))
                return NULL;

        return tail;
}

static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
        __u32 csum;

        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
        return cpu_to_le32(csum);
}

/*
 * Emit an inode-scoped warning that a directory block has no room for its
 * checksum.  @func and @line identify the reporting call site.
 */
static void __warn_no_space_for_csum(struct inode *inode, const char *func,
                                     unsigned int line)
{
        __ext4_warning_inode(inode, func, line,
                "No space for directory leaf checksum. Please run e2fsck -D.");
}

/* Call-site wrapper that supplies the current function name and line. */
#define warn_no_space_for_csum(inode) \
        __warn_no_space_for_csum((inode), __func__, __LINE__)

int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh)
{
        struct ext4_dir_entry_tail *t;

        if (!ext4_has_metadata_csum(inode->i_sb))
                return 1;

        t = get_dirent_tail(inode, bh);
        if (!t) {
                warn_no_space_for_csum(inode);
                return 0;
        }

        if (t->det_checksum != ext4_dirblock_csum(inode, bh->b_data,
                                                  (char *)t - bh->b_data))
                return 0;

        return 1;
}

/*
 * ext4_dirblock_csum_set() - store a fresh checksum in a dirent block.
 *
 * Does nothing when the filesystem has no metadata checksums.  When the
 * block has no usable checksum tail, a warning is logged and the block is
 * left unchanged.
 */
static void ext4_dirblock_csum_set(struct inode *inode,
                                 struct buffer_head *bh)
{
        struct ext4_dir_entry_tail *tail;

        if (!ext4_has_metadata_csum(inode->i_sb))
                return;

        tail = get_dirent_tail(inode, bh);
        if (tail) {
                /* The checksum covers everything in the block before the tail. */
                tail->det_checksum = ext4_dirblock_csum(inode, bh->b_data,
                                                (char *)tail - bh->b_data);
                return;
        }

        warn_no_space_for_csum(inode);
}

/*
 * ext4_handle_dirty_dirblock() - checksum a dirent block and mark it dirty.
 *
 * Refreshes the block's checksum, then hands the buffer to
 * ext4_handle_dirty_metadata() and returns that call's result.
 */
int ext4_handle_dirty_dirblock(handle_t *handle,
                               struct inode *inode,
                               struct buffer_head *bh)
{
        int err;

        ext4_dirblock_csum_set(inode, bh);
        err = ext4_handle_dirty_metadata(handle, inode, bh);

        return err;
}

/*
 * get_dx_countlimit() - locate the dx_countlimit header in an htree block.
 * @inode:  directory the block belongs to (supplies the block size)
 * @dirent: start of the block
 * @offset: optional out: byte offset of the header within the block
 *
 * Two layouts are recognised from the first record length:
 *   - rec_len == blocksize: header at offset 8, right after the single
 *     8-byte fake dirent (the struct dx_node layout);
 *   - rec_len == 12: a second record must span the remaining
 *     blocksize - 12 bytes and be followed by a plausible dx_root_info;
 *     header at offset 32 (the struct dx_root layout).
 *
 * Returns a pointer to the header, or NULL when neither layout matches.
 */
static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
                                               struct ext4_dir_entry *dirent,
                                               int *offset)
{
        int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
        unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize);
        int count_offset;

        if (rlen == blocksize) {
                count_offset = 8;
        } else {
                struct ext4_dir_entry *second;
                struct dx_root_info *info;

                if (rlen != 12)
                        return NULL;

                /* The second record must cover the rest of the block. */
                second = (struct ext4_dir_entry *)(((void *)dirent) + 12);
                if (ext4_rec_len_from_disk(second->rec_len, blocksize) !=
                    blocksize - 12)
                        return NULL;

                info = (struct dx_root_info *)(((void *)second + 12));
                if (info->reserved_zero)
                        return NULL;
                if (info->info_length != sizeof(struct dx_root_info))
                        return NULL;

                count_offset = 32;
        }

        if (offset)
                *offset = count_offset;

        return (struct dx_countlimit *)(((void *)dirent) + count_offset);
}

static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
                           int count_offset, int count, struct dx_tail *t)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
        __u32 csum;
        int size;
        __u32 dummy_csum = 0;
        int offset = offsetof(struct dx_tail, dt_checksum);

        size = count_offset + (count * sizeof(struct dx_entry));
        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
        csum = ext4_chksum(sbi, csum, (__u8 *)t, offset);
        csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));

        return cpu_to_le32(csum);
}

/*
 * Verify the checksum stored in the dx_tail of an htree index block.
 *
 * Returns 1 if the block is acceptable (metadata checksums are disabled, or
 * the stored checksum matches the computed one) and 0 otherwise.  A block
 * whose count/limit header cannot be located, has no room for the tail, or
 * claims more used entries than its limit is reported as failing.
 */
static int ext4_dx_csum_verify(struct inode *inode,
                               struct ext4_dir_entry *dirent)
{
        struct dx_countlimit *c;
        struct dx_tail *t;
        int count_offset, limit, count;

        if (!ext4_has_metadata_csum(inode->i_sb))
                return 1;

        c = get_dx_countlimit(inode, dirent, &count_offset);
        if (!c) {
                EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
                return 0;
        }
        limit = le16_to_cpu(c->limit);
        count = le16_to_cpu(c->count);
        /* The dx_tail lives right after the last possible entry. */
        if (count_offset + (limit * sizeof(struct dx_entry)) >
            EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
                warn_no_space_for_csum(inode);
                return 0;
        }
        /*
         * count is read straight from the block and ext4_dx_csum()
         * checksums count entries.  Only limit has been bounded against
         * the block size above, so a count larger than limit would make
         * the checksum run past the validated area.  Treat such a block
         * as failing verification.
         */
        if (count > limit)
                return 0;
        t = (struct dx_tail *)(((struct dx_entry *)c) + limit);

        if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
                                            count, t))
                return 0;
        return 1;
}

static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
{
        struct dx_countlimit *c;
        struct dx_tail *t;
        int count_offset, limit, count;

        if (!ext4_has_metadata_csum(inode->i_sb))
                return;

        c = get_dx_countlimit(inode, dirent, &count_offset);
        if (!c) {
                EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
                return;
        }
        limit = le16_to_cpu(c->limit);
        count = le16_to_cpu(c->count);
        if (count_offset + (limit * sizeof(struct dx_entry)) >
            EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
                warn_no_space_for_csum(inode);
                return;
        }
        t = (struct dx_tail *)(((struct dx_entry *)c) + limit);

        t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
}

/*
 * Refresh the checksum of the index block held in @bh, then hand the buffer
 * to ext4_handle_dirty_metadata() and return its result.
 */
static inline int ext4_handle_dirty_dx_node(handle_t *handle,
                                            struct inode *inode,
                                            struct buffer_head *bh)
{
        struct ext4_dir_entry *first = (struct ext4_dir_entry *)bh->b_data;

        ext4_dx_csum_set(inode, first);
        return ext4_handle_dirty_metadata(handle, inode, bh);
}

/*
 * p is at least 6 bytes before the end of page
 */
/* Step from @p to the entry that follows it, using p's decoded rec_len. */
static inline struct ext4_dir_entry_2 *
ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
{
        unsigned int step = ext4_rec_len_from_disk(p->rec_len, blocksize);
        char *next = (char *)p + step;

        return (struct ext4_dir_entry_2 *)next;
}

/*
 * Future: use high four bits of block for coalesce-on-delete flags
 * Mask them off for now.
 */

/* Return the block number of an index entry with the top four bits masked off. */
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
{
        ext4_lblk_t raw = le32_to_cpu(entry->block);

        return raw & 0x0fffffff;
}

/* Store @value, converted to little-endian, as the block of an index entry. */
static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
{
        __le32 disk_block = cpu_to_le32(value);

        entry->block = disk_block;
}

/* Return the hash of an index entry in CPU byte order. */
static inline unsigned dx_get_hash(struct dx_entry *entry)
{
        unsigned cpu_hash = le32_to_cpu(entry->hash);

        return cpu_hash;
}

static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
{
        entry->hash = cpu_to_le32(value);
}

/*
 * Read the count field of the dx_countlimit header that overlays the first
 * slot of @entries.
 */
static inline unsigned dx_get_count(struct dx_entry *entries)
{
        struct dx_countlimit *hdr = (struct dx_countlimit *) entries;

        return le16_to_cpu(hdr->count);
}

/*
 * Read the limit field of the dx_countlimit header that overlays the first
 * slot of @entries.
 */
static inline unsigned dx_get_limit(struct dx_entry *entries)
{
        struct dx_countlimit *hdr = (struct dx_countlimit *) entries;

        return le16_to_cpu(hdr->limit);
}

/*
 * Write the count field of the dx_countlimit header that overlays the first
 * slot of @entries.
 */
static inline void dx_set_count(struct dx_entry *entries, unsigned value)
{
        struct dx_countlimit *hdr = (struct dx_countlimit *) entries;

        hdr->count = cpu_to_le16(value);
}

/*
 * Write the limit field of the dx_countlimit header that overlays the first
 * slot of @entries.
 */
static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
{
        struct dx_countlimit *hdr = (struct dx_countlimit *) entries;

        hdr->limit = cpu_to_le16(value);
}

/*
 * Number of dx_entry slots that fit in the root index block of @dir.
 *
 * The usable space is the block size minus the record lengths of entries
 * with name lengths 1 and 2, minus @infosize, and minus a dx_tail when
 * metadata checksums are enabled.
 */
static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
{
        unsigned int avail = dir->i_sb->s_blocksize;

        avail -= ext4_dir_rec_len(1, NULL);
        avail -= ext4_dir_rec_len(2, NULL);
        avail -= infosize;
        if (ext4_has_metadata_csum(dir->i_sb))
                avail -= sizeof(struct dx_tail);

        return avail / sizeof(struct dx_entry);
}

/*
 * Number of dx_entry slots that fit in a non-root index block of @dir.
 *
 * The usable space is the block size minus the record length of an entry
 * with an empty name, and minus a dx_tail when metadata checksums are
 * enabled.
 */
static inline unsigned dx_node_limit(struct inode *dir)
{
        unsigned int avail = dir->i_sb->s_blocksize;

        avail -= ext4_dir_rec_len(0, dir);
        if (ext4_has_metadata_csum(dir->i_sb))
                avail -= sizeof(struct dx_tail);

        return avail / sizeof(struct dx_entry);
}

/*
 * Debug
 */
#ifdef DX_DEBUG
static void dx_show_index(char * label, struct dx_entry *entries)
{
        int i, n = dx_get_count (entries);
        printk(KERN_DEBUG "%s index", label);
        for (i = 0; i < n; i++) {
                printk(KERN_CONT " %x->%lu",
                       i ? dx_get_hash(entries + i) : 0,
                       (unsigned long)dx_get_block(entries + i));
        }
        printk(KERN_CONT "\n");
}

/* Totals gathered by the dx_show_* debug helpers. */
struct stats
{
        unsigned names;         /* number of in-use directory entries seen */
        unsigned space;         /* sum of ext4_dir_rec_len() over those entries */
        unsigned bcount;        /* number of leaf blocks visited */
};

/*
 * Debug helper: walk the directory entries in one leaf block and print a
 * summary line.
 *
 * @dir:        directory the block belongs to
 * @hinfo:      hash parameters; copied locally, the caller's copy is not
 *              modified
 * @de:         first entry of the block
 * @size:       number of bytes to walk, also used as the block size when
 *              decoding rec_len
 * @show_names: when non-zero, print each name with its hash and offset
 *
 * Entries with a zero inode are skipped.  Returns the number of names, the
 * sum of their record lengths, and a block count of 1.
 *
 * Names in a directory entry are not NUL-terminated, so they are printed
 * with "%.*s" (precision taken from the argument list) to bound the output
 * by the name length.
 */
static struct stats dx_show_leaf(struct inode *dir,
                                struct dx_hash_info *hinfo,
                                struct ext4_dir_entry_2 *de,
                                int size, int show_names)
{
        unsigned names = 0, space = 0;
        char *base = (char *) de;
        struct dx_hash_info h = *hinfo;

        printk("names: ");
        while ((char *) de < base + size)
        {
                if (de->inode)
                {
                        if (show_names)
                        {
#ifdef CONFIG_FS_ENCRYPTION
                                int len;
                                char *name;
                                struct fscrypt_str fname_crypto_str =
                                        FSTR_INIT(NULL, 0);
                                int res = 0;

                                name  = de->name;
                                len = de->name_len;
                                if (!IS_ENCRYPTED(dir)) {
                                        /* Directory is not encrypted */
                                        (void) ext4fs_dirhash(dir, de->name,
                                                de->name_len, &h);
                                        printk("%.*s:(U)%x.%u ", len,
                                               name, h.hash,
                                               (unsigned) ((char *) de
                                                           - base));
                                } else {
                                        struct fscrypt_str de_name =
                                                FSTR_INIT(name, len);

                                        /* Directory is encrypted */
                                        res = fscrypt_fname_alloc_buffer(
                                                len, &fname_crypto_str);
                                        /*
                                         * NOTE(review): an allocation failure
                                         * is only reported; the conversion
                                         * below is still attempted.
                                         */
                                        if (res)
                                                printk(KERN_WARNING "Error "
                                                        "allocating crypto "
                                                        "buffer--skipping "
                                                        "crypto\n");
                                        res = fscrypt_fname_disk_to_usr(dir,
                                                0, 0, &de_name,
                                                &fname_crypto_str);
                                        if (res) {
                                                printk(KERN_WARNING "Error "
                                                        "converting filename "
                                                        "from disk to usr"
                                                        "\n");
                                                name = "??";
                                                len = 2;
                                        } else {
                                                name = fname_crypto_str.name;
                                                len = fname_crypto_str.len;
                                        }
                                        /* Casefolded: use the hash stored in the entry. */
                                        if (IS_CASEFOLDED(dir))
                                                h.hash = EXT4_DIRENT_HASH(de);
                                        else
                                                (void) ext4fs_dirhash(dir,
                                                        de->name,
                                                        de->name_len, &h);
                                        printk("%.*s:(E)%x.%u ", len, name,
                                               h.hash, (unsigned) ((char *) de
                                                                   - base));
                                        fscrypt_fname_free_buffer(
                                                        &fname_crypto_str);
                                }
#else
                                int len = de->name_len;
                                char *name = de->name;
                                (void) ext4fs_dirhash(dir, de->name,
                                                      de->name_len, &h);
                                printk("%.*s:%x.%u ", len, name, h.hash,
                                       (unsigned) ((char *) de - base));
#endif
                        }
                        space += ext4_dir_rec_len(de->name_len, dir);
                        names++;
                }
                de = ext4_next_entry(de, size);
        }
        printk(KERN_CONT "(%i)\n", names);
        return (struct stats) { names, space, 1 };
}

struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
                             struct dx_entry *entries, int levels)
{
        unsigned blocksize = dir->i_sb->s_blocksize;
        unsigned count = dx_get_count(entries), names = 0, space = 0, i;
        unsigned bcount = 0;
        struct buffer_head *bh;
        printk("%i indexed blocks...\n", count);
        for (i = 0; i < count; i++, entries++)
        {
                ext4_lblk_t block = dx_get_block(entries);
                ext4_lblk_t hash  = i ? dx_get_hash(entries): 0;
                u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
                struct stats stats;
                printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
                bh = ext4_bread(NULL,dir, block, 0);
                if (!bh || IS_ERR(bh))
                        continue;
                stats = levels?
                   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
                   dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *)
                        bh->b_data, blocksize, 0);
                names += stats.names;
                space += stats.space;
                bcount += stats.bcount;
                brelse(bh);
        }
        if (bcount)
                printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
                       levels ? "" : "   ", names, space/bcount,
                       (space/bcount)*100/blocksize);
        return (struct stats) { names, space, bcount};
}

/*
 * Linear search cross check
 */
/*
 * Debug cross-check: scan forward linearly from @at over at most @n entries,
 * stopping just before the first entry whose hash exceeds @hash, and assert
 * that the scan ends on the entry immediately preceding @target.
 */
static inline void htree_rep_invariant_check(struct dx_entry *at,
                                             struct dx_entry *target,
                                             u32 hash, unsigned int n)
{
        unsigned int remaining;

        for (remaining = n; remaining != 0; remaining--) {
                dxtrace(printk(KERN_CONT ","));
                at++;
                if (dx_get_hash(at) > hash) {
                        /* Overshot: step back onto the last entry <= hash. */
                        at--;
                        break;
                }
        }
        ASSERT(at == target - 1);
}
#else /* DX_DEBUG */
/* No-op stand-in used when DX_DEBUG is not defined. */
static inline void htree_rep_invariant_check(struct dx_entry *at,
                                             struct dx_entry *target,
                                             u32 hash, unsigned int n)
{
}
#endif /* DX_DEBUG */

/*
 * Probe for a directory leaf block to search.
 *
 * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
 * error in the directory index, and the caller should fall back to
 * searching the directory normally.  The callers of dx_probe **MUST**
 * check for this error code, and make sure it never gets reflected
 * back to userspace.
 */
static struct dx_frame *
dx_probe(struct ext4_filename *fname, struct inode *dir,
         struct dx_hash_info *hinfo, struct dx_frame *frame_in)
{
        unsigned count, indirect, level, i;
        struct dx_entry *at, *entries, *p, *q, *m;
        struct dx_root *root;
        struct dx_frame *frame = frame_in;
        struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
        u32 hash;
        ext4_lblk_t block;
        ext4_lblk_t blocks[EXT4_HTREE_LEVEL];

        memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
        frame->bh = ext4_read_dirblock(dir, 0, INDEX);
        if (IS_ERR(frame->bh))
                return (struct dx_frame *) frame->bh;

        root = (struct dx_root *) frame->bh->b_data;
        if (root->info.hash_version != DX_HASH_TEA &&
            root->info.hash_version != DX_HASH_HALF_MD4 &&
            root->info.hash_version != DX_HASH_LEGACY &&
            root->info.hash_version != DX_HASH_SIPHASH) {
                ext4_warning_inode(dir, "Unrecognised inode hash code %u",
                                   root->info.hash_version);
                goto fail;
        }
        if (ext4_hash_in_dirent(dir)) {
                if (root->info.hash_version != DX_HASH_SIPHASH) {
                        ext4_warning_inode(dir,
                                "Hash in dirent, but hash is not SIPHASH");
                        goto fail;
                }
        } else {
                if (root->info.hash_version == DX_HASH_SIPHASH) {
                        ext4_warning_inode(dir,
                                "Hash code is SIPHASH, but hash not in dirent");
                        goto fail;
                }
        }
        if (fname)
                hinfo = &fname->hinfo;
        hinfo->hash_version = root->info.hash_version;
        if (hinfo->hash_version <= DX_HASH_TEA)
                hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
        hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
        /* hash is already computed for encrypted casefolded directory */
        if (fname && fname_name(fname) &&
            !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) {
                int ret = ext4fs_dirhash(dir, fname_name(fname),
                                         fname_len(fname), hinfo);
                if (ret < 0) {
                        ret_err = ERR_PTR(ret);
                        goto fail;
                }
        }
        hash = hinfo->hash;

        if (root->info.unused_flags & 1) {
                ext4_warning_inode(dir, "Unimplemented hash flags: %#06x",
                                   root->info.unused_flags);
                goto fail;
        }

        indirect = root->info.indirect_levels;
        if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
                ext4_warning(dir->i_sb,
                             "Directory (ino: %lu) htree depth %#06x exceed"
                             "supported value", dir->i_ino,
                             ext4_dir_htree_level(dir->i_sb));
                if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
                        ext4_warning(dir->i_sb, "Enable large directory "
                                                "feature to access it");
                }
                goto fail;
        }

        entries = (struct dx_entry *)(((char *)&root->info) +
                                      root->info.info_length);

        if (dx_get_limit(entries) != dx_root_limit(dir,
                                                   root->info.info_length)) {
                ext4_warning_inode(dir, "dx entry: limit %u != root limit %u",
                                   dx_get_limit(entries),
                                   dx_root_limit(dir, root->info.info_length));
                goto fail;
        }

        dxtrace(printk("Look up %x", hash));
        level = 0;
        blocks[0] = 0;
        while (1) {
                count = dx_get_count(entries);
                if (!count || count > dx_get_limit(entries)) {
                        ext4_warning_inode(dir,
                                           "dx entry: count %u beyond limit %u",
                                           count, dx_get_limit(entries));
                        goto fail;
                }

                p = entries + 1;
                q = entries + count - 1;
                while (p <= q) {
                        m = p + (q - p) / 2;
                        dxtrace(printk(KERN_CONT "."));
                        if (dx_get_hash(m) > hash)
                                q = m - 1;
                        else
                                p = m + 1;
                }

                htree_rep_invariant_check(entries, p, hash, count - 1);

                at = p - 1;
                dxtrace(printk(KERN_CONT " %x->%u\n",
                               at == entries ? 0 : dx_get_hash(at),
                               dx_get_block(at)));
                frame->entries = entries;
                frame->at = at;

                block = dx_get_block(at);
                for (i = 0; i <= level; i++) {
                        if (blocks[i] == block) {
                                ext4_warning_inode(dir,
                                        "dx entry: tree cycle block %u points back to block %u",
                                        blocks[level], block);
                                goto fail;
                        }
                }
                if (++level > indirect)
                        return frame;
                blocks[level] = block;
                frame++;
                frame->bh = ext4_read_dirblock(dir, block, INDEX);
                if (IS_ERR(frame->bh)) {
                        ret_err = (struct dx_frame *) frame->bh;
                        frame->bh = NULL;
                        goto fail;
                }

                entries = ((struct dx_node *) frame->bh->b_data)->entries;

                if (dx_get_limit(entries) != dx_node_limit(dir)) {
                        ext4_warning_inode(dir,
                                "dx entry: limit %u != node limit %u",
                                dx_get_limit(entries), dx_node_limit(dir));
                        goto fail;
                }
        }
fail:
        while (frame >= frame_in) {
                brelse(frame->bh);
                frame--;
        }

        if (ret_err == ERR_PTR(ERR_BAD_DX_DIR))
                ext4_warning_inode(dir,
                        "Corrupt directory, running e2fsck is recommended");
        return ret_err;
}

static void dx_release(struct dx_frame *frames)
{
        struct dx_root_info *info;
        int i;
        unsigned int indirect_levels;

        if (frames[0].bh == NULL)
                return;

        info = &((struct dx_root *)frames[0].bh->b_data)->info;
        /* save local copy, "info" may be freed after brelse() */
        indirect_levels = info->indirect_levels;
        for (i = 0; i <= indirect_levels; i++) {
                if (frames[i].bh == NULL)
                        break;
                brelse(frames[i].bh);
                frames[i].bh = NULL;
        }
}

/*
 * This function increments the frame pointer to search the next leaf
 * block, and reads in the necessary intervening nodes if the search
 * should be necessary.  Whether or not the search is necessary is
 * controlled by the hash parameter.  If the hash value is even, then
 * the search is only continued if the next block starts with that
 * hash value.  This is used if we are searching for a specific file.
 *
 * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
 *
 * This function returns 1 if the caller should continue to search,
 * or 0 if it should not.  If there is an error reading one of the
 * index blocks, it will return a negative error code.
 *
 * If start_hash is non-null, it will be filled in with the starting
 * hash of the next page.
 */
/*
 * Advance the frame stack to the next leaf block in hash order.
 *
 * @dir:        directory being walked
 * @hash:       controls whether to continue (see the checks below)
 * @frame:      deepest frame of the current position
 * @frames:     first (root) frame of the stack
 * @start_hash: if non-NULL, receives the hash stored in the entry that was
 *              advanced to
 *
 * Returns 1 if the caller should go on to the next block, 0 if not, or a
 * negative error code if an index block could not be read.
 */
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
                                 struct dx_frame *frame,
                                 struct dx_frame *frames,
                                 __u32 *start_hash)
{
        struct dx_frame *p;
        struct buffer_head *bh;
        int num_frames = 0;
        __u32 bhash;

        p = frame;
        /*
         * Find the next leaf page by incrementing the frame pointer.
         * If we run out of entries in the interior node, loop around and
         * increment pointer in the parent node.  When we break out of
         * this loop, num_frames indicates the number of interior
         * nodes that need to be read.
         */
        while (1) {
                /* Note: p->at is advanced as a side effect of this test. */
                if (++(p->at) < p->entries + dx_get_count(p->entries))
                        break;
                /* Ran off the end of the root node: nothing left to visit. */
                if (p == frames)
                        return 0;
                num_frames++;
                p--;
        }

        /*
         * If the low bit of hash is set, continue whatever the next
         * block's starting hash is.  Otherwise continue only when that
         * starting hash, with its low bit masked off, equals hash; if it
         * does not, return without reading any further index pages.
         */
        bhash = dx_get_hash(p->at);
        if (start_hash)
                *start_hash = bhash;
        if ((hash & 1) == 0) {
                if ((bhash & ~1) != hash)
                        return 0;
        }
        /*
         * If the hash is HASH_NB_ALWAYS, we always go to the next
         * block so no check is necessary
         */
        /*
         * Walk back down: read the index block for each level that was
         * exhausted above, replace that frame's buffer, and position the
         * frame on the first entry of the new block.
         */
        while (num_frames--) {
                bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
                if (IS_ERR(bh))
                        return PTR_ERR(bh);
                p++;
                brelse(p->bh);
                p->bh = bh;
                p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
        }
        return 1;
}


/*
 * This function fills a red-black tree with information from a
 * directory block.  It returns the number of directory entries loaded
 * into the tree, or a negative error code on failure.
 */
/*
 * Load the entries of one directory block into the readdir tree of
 * @dir_file via ext4_htree_store_dirent().
 *
 * @dir_file:         open directory file that receives the entries
 * @dir:              directory inode
 * @block:            logical block to read
 * @hinfo:            hash parameters; hash and minor_hash are overwritten
 *                    for every entry examined
 * @start_hash:       entries ordered before (start_hash, start_minor_hash)
 * @start_minor_hash: are skipped
 *
 * Returns the number of entries stored, or a negative error code.
 */
static int htree_dirblock_to_tree(struct file *dir_file,
                                  struct inode *dir, ext4_lblk_t block,
                                  struct dx_hash_info *hinfo,
                                  __u32 start_hash, __u32 start_minor_hash)
{
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de, *top;
        int err = 0, count = 0;
        struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str;
        int csum = ext4_has_metadata_csum(dir->i_sb);

        dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
                                                        (unsigned long)block));
        bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
        if (IS_ERR(bh))
                return PTR_ERR(bh);

        de = (struct ext4_dir_entry_2 *) bh->b_data;
        /* csum entries are not larger in the casefolded encrypted case */
        /* top marks where the walk stops: one minimal record before the block end. */
        top = (struct ext4_dir_entry_2 *) ((char *) de +
                                           dir->i_sb->s_blocksize -
                                           ext4_dir_rec_len(0,
                                                           csum ? NULL : dir));
        /* Check if the directory is encrypted */
        if (IS_ENCRYPTED(dir)) {
                err = fscrypt_prepare_readdir(dir);
                if (err < 0) {
                        brelse(bh);
                        return err;
                }
                /* Scratch buffer reused for every converted name below. */
                err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN,
                                                 &fname_crypto_str);
                if (err < 0) {
                        brelse(bh);
                        return err;
                }
        }

        for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
                if (ext4_check_dir_entry(dir, NULL, de, bh,
                                bh->b_data, bh->b_size,
                                (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
                                         + ((char *)de - bh->b_data))) {
                        /* silently ignore the rest of the block */
                        break;
                }
                /* Take the hash from the entry itself when stored there, else compute it. */
                if (ext4_hash_in_dirent(dir)) {
                        if (de->name_len && de->inode) {
                                hinfo->hash = EXT4_DIRENT_HASH(de);
                                hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
                        } else {
                                hinfo->hash = 0;
                                hinfo->minor_hash = 0;
                        }
                } else {
                        err = ext4fs_dirhash(dir, de->name,
                                             de->name_len, hinfo);
                        if (err < 0) {
                                count = err;
                                goto errout;
                        }
                }
                /* Skip entries ordered before the requested starting point. */
                if ((hinfo->hash < start_hash) ||
                    ((hinfo->hash == start_hash) &&
                     (hinfo->minor_hash < start_minor_hash)))
                        continue;
                /* Skip unused entries. */
                if (de->inode == 0)
                        continue;
                if (!IS_ENCRYPTED(dir)) {
                        tmp_str.name = de->name;
                        tmp_str.len = de->name_len;
                        err = ext4_htree_store_dirent(dir_file,
                                   hinfo->hash, hinfo->minor_hash, de,
                                   &tmp_str);
                } else {
                        /* The length is saved here and restored after the entry is stored. */
                        int save_len = fname_crypto_str.len;
                        struct fscrypt_str de_name = FSTR_INIT(de->name,
                                                                de->name_len);

                        /* Directory is encrypted */
                        err = fscrypt_fname_disk_to_usr(dir, hinfo->hash,
                                        hinfo->minor_hash, &de_name,
                                        &fname_crypto_str);
                        if (err) {
                                count = err;
                                goto errout;
                        }
                        err = ext4_htree_store_dirent(dir_file,
                                   hinfo->hash, hinfo->minor_hash, de,
                                        &fname_crypto_str);
                        fname_crypto_str.len = save_len;
                }
                if (err != 0) {
                        count = err;
                        goto errout;
                }
                count++;
        }
errout:
        /* Shared exit: count holds either the entry total or a negative error. */
        brelse(bh);
        fscrypt_fname_free_buffer(&fname_crypto_str);
        return count;
}


/*
 * This function fills a red-black tree with information from a
 * directory.  We start scanning the directory in hash order, starting
 * at start_hash and start_minor_hash.
 *
 * This function returns the number of entries inserted into the tree,
 * or a negative error code.
 */
/*
 * @dir_file:         open directory file whose tree is filled
 * @start_hash:       major hash to start scanning from
 * @start_minor_hash: minor hash to start scanning from
 * @next_hash:        receives the hash at which a later call should resume
 *                    (~0 when the non-indexed path below is taken)
 *
 * Directories without EXT4_INODE_INDEX are handled by loading inline data
 * or block 0 directly; indexed directories are walked leaf by leaf.
 */
int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
                         __u32 start_minor_hash, __u32 *next_hash)
{
        struct dx_hash_info hinfo;
        struct ext4_dir_entry_2 *de;
        struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
        struct inode *dir;
        ext4_lblk_t block;
        int count = 0;
        int ret, err;
        __u32 hashval;
        struct fscrypt_str tmp_str;

        dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
                       start_hash, start_minor_hash));
        dir = file_inode(dir_file);
        if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
                /* No index: set up the hash parameters by hand. */
                if (ext4_hash_in_dirent(dir))
                        hinfo.hash_version = DX_HASH_SIPHASH;
                else
                        hinfo.hash_version =
                                        EXT4_SB(dir->i_sb)->s_def_hash_version;
                if (hinfo.hash_version <= DX_HASH_TEA)
                        hinfo.hash_version +=
                                EXT4_SB(dir->i_sb)->s_hash_unsigned;
                hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
                if (ext4_has_inline_data(dir)) {
                        int has_inline_data = 1;
                        count = ext4_inlinedir_to_tree(dir_file, dir, 0,
                                                       &hinfo, start_hash,
                                                       start_minor_hash,
                                                       &has_inline_data);
                        /*
                         * If the helper cleared has_inline_data, fall
                         * through to the block-based path below.
                         */
                        if (has_inline_data) {
                                *next_hash = ~0;
                                return count;
                        }
                }
                count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
                                               start_hash, start_minor_hash);
                *next_hash = ~0;
                return count;
        }
        /* Indexed directory: locate the leaf that covers start_hash. */
        hinfo.hash = start_hash;
        hinfo.minor_hash = 0;
        frame = dx_probe(NULL, dir, &hinfo, frames);
        if (IS_ERR(frame))
                return PTR_ERR(frame);

        /* Add '.' and '..' from the htree header */
        if (!start_hash && !start_minor_hash) {
                /* First entry of the root block, stored under hash 0. */
                de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
                tmp_str.name = de->name;
                tmp_str.len = de->name_len;
                err = ext4_htree_store_dirent(dir_file, 0, 0,
                                              de, &tmp_str);
                if (err != 0)
                        goto errout;
                count++;
        }
        if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
                /* Second entry of the root block, stored under hash 2. */
                de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
                de = ext4_next_entry(de, dir->i_sb->s_blocksize);
                tmp_str.name = de->name;
                tmp_str.len = de->name_len;
                err = ext4_htree_store_dirent(dir_file, 2, 0,
                                              de, &tmp_str);
                if (err != 0)
                        goto errout;
                count++;
        }

        while (1) {
                /* Allow a fatal signal to interrupt a long scan. */
                if (fatal_signal_pending(current)) {
                        err = -ERESTARTSYS;
                        goto errout;
                }
                cond_resched();
                block = dx_get_block(frame->at);
                ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
                                             start_hash, start_minor_hash);
                if (ret < 0) {
                        err = ret;
                        goto errout;
                }
                count += ret;
                /* Default reported when the helper does not set a start hash. */
                hashval = ~0;
                ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
                                            frame, frames, &hashval);
                *next_hash = hashval;
                if (ret < 0) {
                        err = ret;
                        goto errout;
                }
                /*
                 * Stop if:  (a) there are no more entries, or
                 * (b) we have inserted at least one entry and the
                 * next hash value is not a continuation
                 */
                if ((ret == 0) ||
                    (count && ((hashval & 1) == 0)))
                        break;
        }
        dx_release(frames);
        dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
                       "next hash: %x\n", count, *next_hash));
        return count;
errout:
        /* Error exit: drop the frame buffers and report err instead of count. */
        dx_release(frames);
        return (err);
}

/*
 * Search one whole directory block for @fname.
 *
 * Thin wrapper around ext4_search_dir(): the buffer's data is searched
 * over one filesystem block (s_blocksize bytes).  The return value and
 * *res_dir are exactly those of ext4_search_dir().
 */
static inline int search_dirblock(struct buffer_head *bh,
                                  struct inode *dir,
                                  struct ext4_filename *fname,
                                  unsigned int offset,
                                  struct ext4_dir_entry_2 **res_dir)
{
        char *block_data = bh->b_data;
        int block_len = dir->i_sb->s_blocksize;

        return ext4_search_dir(bh, block_data, block_len, dir, fname,
                               offset, res_dir);
}

/*
 * Directory block splitting, compacting
 */

/*
 * Build a map of (hash, offset, size) for every live entry of a leaf block.
 *
 * The map is filled downwards: each live entry (non-zero inode and
 * name_len) is stored just below the previous one, starting right below
 * @map_tail.  Offsets are stored in units of 4 bytes.  When the block
 * carries a metadata checksum, the trailing ext4_dir_entry_tail is not
 * walked.
 *
 * Returns the number of entries mapped, -EFSCORRUPTED if an entry fails
 * ext4_check_dir_entry(), or the negative value returned by
 * ext4fs_dirhash().
 */
static int dx_make_map(struct inode *dir, struct buffer_head *bh,
                       struct dx_hash_info *hinfo,
                       struct dx_map_entry *map_tail)
{
        char *base = bh->b_data;
        unsigned int buflen = bh->b_size;
        int blocksize = EXT4_BLOCK_SIZE(dir->i_sb);
        struct dx_hash_info h = *hinfo;
        struct ext4_dir_entry_2 *de;
        int count = 0;

        if (ext4_has_metadata_csum(dir->i_sb))
                buflen -= sizeof(struct ext4_dir_entry_tail);

        for (de = (struct ext4_dir_entry_2 *)base;
             (char *)de < base + buflen;
             de = ext4_next_entry(de, blocksize)) {
                int err;

                /* Every entry is validated, including unused ones. */
                if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen,
                                         (char *)de - base))
                        return -EFSCORRUPTED;

                /* Unused slots do not get a map entry. */
                if (!de->name_len || !de->inode)
                        continue;

                if (ext4_hash_in_dirent(dir)) {
                        /* The hash is stored alongside the entry. */
                        h.hash = EXT4_DIRENT_HASH(de);
                } else {
                        err = ext4fs_dirhash(dir, de->name, de->name_len, &h);
                        if (err < 0)
                                return err;
                }

                map_tail--;
                map_tail->hash = h.hash;
                map_tail->offs = ((char *)de - base) >> 2;
                map_tail->size = ext4_rec_len_from_disk(de->rec_len,
                                                        blocksize);
                count++;
                cond_resched();
        }

        return count;
}

/*
 * Sort map by hash value, in place and in ascending order.
 *
 * @map:   first of @count contiguous map entries
 * @count: number of entries; also reused below as the comb-sort gap
 *
 * Runs comb-sort passes with a shrinking gap, then finishes with a plain
 * bubble sort (gap 1) that repeats until a pass performs no swap.
 */
static void dx_sort_map (struct dx_map_entry *map, unsigned count)
{
        /* top is the last entry; it is fixed before count becomes the gap */
        struct dx_map_entry *p, *q, *top = map + count - 1;
        int more;
        /* Combsort until bubble sort doesn't suck */
        while (count > 2) {
                /* shrink the gap by a factor of 10/13 on every pass */
                count = count*10/13;
                /*
                 * count is unsigned, so for gaps below 9 the subtraction
                 * wraps to a huge value; only gaps 9 and 10 pass this test.
                 */
                if (count - 9 < 2) /* 9, 10 -> 11 */
                        count = 11;
                /* walk pairs (q, p) that are "gap" apart, from the top down */
                for (p = top, q = p - count; q >= map; p--, q--)
                        if (p->hash < q->hash)
                                swap(*p, *q);
        }
        /* Garden variety bubble sort */
        do {
                more = 0;
                q = top;
                /* q is decremented before the body: compares q[0] and q[1] */
                while (q-- > map) {
                        if (q[1].hash >= q[0].hash)
                                continue;
                        swap(*(q+1), *q);
                        more = 1;
                }
        } while(more);
}

/*
 * Insert a new index entry (@hash -> @block) right after frame->at.
 *
 * All entries that follow frame->at are shifted up by one slot and the
 * entry count stored in the node is incremented.  The node must still
 * have a free slot; this is asserted, not checked.
 */
static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
{
        struct dx_entry *entries = frame->entries;
        struct dx_entry *old = frame->at;
        struct dx_entry *slot = old + 1;
        int count = dx_get_count(entries);
        size_t tail_bytes;

        ASSERT(count < dx_get_limit(entries));
        ASSERT(old < entries + count);

        /* Bytes occupied by the entries from the insertion slot onwards. */
        tail_bytes = (char *)(entries + count) - (char *)slot;
        memmove(slot + 1, slot, tail_bytes);

        dx_set_hash(slot, hash);
        dx_set_block(slot, block);
        dx_set_count(entries, count + 1);
}

#if IS_ENABLED(CONFIG_UNICODE)
/*
 * Compare a lookup name against an on-disk name in a casefolded directory.
 *
 * @name is the name being looked up.  When @quick is true it is taken to
 * be casefolded already and utf8_strncasecmp_folded() is used, otherwise
 * utf8_strncasecmp() is used.  For an encrypted @parent the on-disk name
 * is first converted with fscrypt_fname_disk_to_usr() into a temporary
 * buffer that is freed before returning.
 *
 * If the UTF-8 comparison itself fails, a filesystem with strict encoding
 * reports -EINVAL; otherwise both names are compared as opaque byte
 * sequences.
 *
 * Returns: 0 if the directory entry matches, more than 0 if it
 * doesn't match or less than zero on error.
 */
static int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
                           u8 *de_name, size_t de_name_len, bool quick)
{
        const struct super_block *sb = parent->i_sb;
        const struct unicode_map *um = sb->s_encoding;
        struct fscrypt_str plain = FSTR_INIT(NULL, de_name_len);
        struct qstr on_disk = QSTR_INIT(de_name, de_name_len);
        int res;

        if (IS_ENCRYPTED(parent)) {
                const struct fscrypt_str cipher =
                                FSTR_INIT(de_name, de_name_len);

                plain.name = kmalloc(de_name_len, GFP_KERNEL);
                if (!plain.name)
                        return -ENOMEM;

                res = fscrypt_fname_disk_to_usr(parent, 0, 0, &cipher, &plain);
                if (res < 0)
                        goto out;

                /* From here on compare against the decrypted name. */
                on_disk.name = plain.name;
                on_disk.len = plain.len;
        }

        res = quick ? utf8_strncasecmp_folded(um, name, &on_disk)
                    : utf8_strncasecmp(um, name, &on_disk);
        if (res >= 0)
                goto out;

        /*
         * Invalid character sequence: either an error, or fall back to
         * treating both names as opaque byte sequences.
         */
        if (sb_has_strict_encoding(sb))
                res = -EINVAL;
        else if (name->len != on_disk.len)
                res = 1;
        else
                res = !!memcmp(name->name, on_disk.name, on_disk.len);

out:
        /* plain.name is still NULL when the directory is not encrypted. */
        kfree(plain.name);
        return res;
}

int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
                                  struct ext4_filename *name)
{
        struct fscrypt_str *cf_name = &name->cf_name;
        struct dx_hash_info *hinfo = &name->hinfo;
        int len;

        if (!IS_CASEFOLDED(dir) ||
            (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) {
                cf_name->name = NULL;
                return 0;
        }

        cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
        if (!cf_name->name)
                return -ENOMEM;

        len = utf8_casefold(dir->i_sb->s_encoding,
                            iname, cf_name->name,
                            EXT4_NAME_LEN);
        if (len <= 0) {
                kfree(cf_name->name);
                cf_name->name = NULL;
        }
        cf_name->len = (unsigned) len;
        if (!IS_ENCRYPTED(dir))
                return 0;

        hinfo->hash_version = DX_HASH_SIPHASH;
        hinfo->seed = NULL;
        if (cf_name->name)
                return ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo);
        else
                return ext4fs_dirhash(dir, iname->name, iname->len, hinfo);
}
#endif

/*
 * Test whether a directory entry matches the filename being searched for.
 *
 * Unused entries (inode == 0) never match.  In a casefolded directory
 * whose names are usable (not encrypted, or encrypted with the key
 * present) the comparison is case-insensitive via ext4_ci_compare(); any
 * non-zero result from it, including an error, counts as "no match".
 * For an encrypted casefolded directory, the hash and minor hash stored
 * with the entry must also equal those in fname->hinfo.  In every other
 * case the decision is left to fscrypt_match_name().
 *
 * Return: %true if the directory entry matches, otherwise %false.
 */
static bool ext4_match(struct inode *parent,
                              const struct ext4_filename *fname,
                              struct ext4_dir_entry_2 *de)
{
        struct fscrypt_name f;

        if (!de->inode)
                return false;

#if IS_ENABLED(CONFIG_UNICODE)
        if (IS_CASEFOLDED(parent) &&
            (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) {
                struct qstr folded = {.name = fname->cf_name.name,
                                      .len = fname->cf_name.len};

                /* No pre-folded name: let the comparison do the folding. */
                if (!fname->cf_name.name)
                        return !ext4_ci_compare(parent, fname->usr_fname,
                                                de->name, de->name_len,
                                                false);

                /* Cheap rejection: the stored hashes must agree first. */
                if (IS_ENCRYPTED(parent) &&
                    (fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
                     fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de)))
                        return false;

                return !ext4_ci_compare(parent, &folded, de->name,
                                        de->name_len, true);
        }
#endif

        f.usr_fname = fname->usr_fname;
        f.disk_name = fname->disk_name;
#ifdef CONFIG_FS_ENCRYPTION
        f.crypto_buf = fname->crypto_buf;
#endif

        return fscrypt_match_name(&f, de->name, de->name_len);
}

/*
 * Scan @buf_size bytes of directory data at @search_buf for @fname.
 *
 * @offset is the offset of @search_buf within the directory; it is
 * advanced along with the scan and handed to ext4_check_dir_entry().
 * On a match the entry is fully validated and stored in *res_dir.
 *
 * Returns 0 if not found, -1 on failure, and 1 on success
 */
int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
                    struct inode *dir, struct ext4_filename *fname,
                    unsigned int offset, struct ext4_dir_entry_2 **res_dir)
{
        char *const dlimit = search_buf + buf_size;
        char *pos = search_buf;

        while (pos < dlimit - EXT4_BASE_DIR_LEN) {
                struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)pos;
                int step;

                /*
                 * This loop runs very often, so only a minimal bounds
                 * check on the name is done before trying to match.
                 */
                if (de->name + de->name_len <= dlimit &&
                    ext4_match(dir, fname, de)) {
                        /* A candidate: now validate the entry properly. */
                        if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf,
                                                 buf_size, offset))
                                return -1;
                        *res_dir = de;
                        return 1;
                }

                /* A non-positive record length would loop forever. */
                step = ext4_rec_len_from_disk(de->rec_len,
                                              dir->i_sb->s_blocksize);
                if (step <= 0)
                        return -1;

                offset += step;
                pos += step;
        }

        return 0;
}

/*
 * Decide whether directory block @block is an htree index node.
 *
 * @de points at the first entry of the block.  In an indexed directory,
 * block 0 is always treated as an index node; any other block is treated
 * as one when its first entry is unused (inode == 0) and its record
 * length spans the whole block.
 *
 * Returns 1 for an index node, 0 otherwise (always 0 for a directory
 * that is not indexed).
 */
static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
                               struct ext4_dir_entry *de)
{
        struct super_block *sb = dir->i_sb;

        if (!is_dx(dir))
                return 0;

        if (block == 0)
                return 1;

        if (de->inode != 0)
                return 0;

        return ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) ==
                        sb->s_blocksize;
}

/*
 *        __ext4_find_entry()
 *
 * finds an entry in the specified directory with the wanted name. It
 * returns the cache buffer in which the entry was found, and the entry
 * itself (as a parameter - res_dir). It does NOT read the inode of the
 * entry - you'll have to do that yourself if you want to.
 *
 * The returned buffer_head has ->b_count elevated.  The caller is expected
 * to brelse() it when appropriate.
 *
 * @inlined may be NULL; when it is not and the directory has inline data,
 * it receives the has_inline_data value reported by
 * ext4_find_inline_entry().
 *
 * Returns the buffer holding the entry, NULL when nothing was found (also
 * when the name is longer than EXT4_NAME_LEN or the directory has no
 * blocks), or an ERR_PTR() on read, checksum or index errors.
 */
static struct buffer_head *__ext4_find_entry(struct inode *dir,
                                             struct ext4_filename *fname,
                                             struct ext4_dir_entry_2 **res_dir,
                                             int *inlined)
{
        struct super_block *sb;
        struct buffer_head *bh_use[NAMEI_RA_SIZE];
        struct buffer_head *bh, *ret = NULL;
        ext4_lblk_t start, block;
        const u8 *name = fname->usr_fname->name;
        size_t ra_max = 0;        /* Number of bh's in the readahead
                                   buffer, bh_use[] */
        size_t ra_ptr = 0;        /* Current index into readahead
                                   buffer */
        ext4_lblk_t  nblocks;
        int i, namelen, retval;

        *res_dir = NULL;
        sb = dir->i_sb;
        namelen = fname->usr_fname->len;
        if (namelen > EXT4_NAME_LEN)
                return NULL;

        if (ext4_has_inline_data(dir)) {
                int has_inline_data = 1;
                ret = ext4_find_inline_entry(dir, fname, res_dir,
                                             &has_inline_data);
                if (inlined)
                        *inlined = has_inline_data;
                /*
                 * If the helper cleared the flag, fall through to the
                 * block-based search below.
                 */
                if (has_inline_data)
                        goto cleanup_and_exit;
        }

        if ((namelen <= 2) && (name[0] == '.') &&
            (name[1] == '.' || name[1] == '\0')) {
                /*
                 * "." or ".." will only be in the first block
                 * NFS may look up ".."; "." should be handled by the VFS
                 */
                block = start = 0;
                nblocks = 1;
                goto restart;
        }
        if (is_dx(dir)) {
                ret = ext4_dx_find_entry(dir, fname, res_dir);
                /*
                 * On success, or if the error was file not found,
                 * return.  Otherwise, fall back to doing a search the
                 * old fashioned way.
                 */
                if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR)
                        goto cleanup_and_exit;
                dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
                               "falling back\n"));
                ret = NULL;
        }
        /* Linear search: number of whole blocks covered by i_size. */
        nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
        if (!nblocks) {
                ret = NULL;
                goto cleanup_and_exit;
        }
        /* Begin at the block where the previous lookup found its entry. */
        start = EXT4_I(dir)->i_dir_start_lookup;
        if (start >= nblocks)
                start = 0;
        block = start;
restart:
        do {
                /*
                 * We deal with the read-ahead logic here.
                 */
                cond_resched();
                if (ra_ptr >= ra_max) {
                        /* Refill the readahead buffer */
                        ra_ptr = 0;
                        /*
                         * Batch up to the block where the search wraps
                         * (end of directory) or stops (start), capped by
                         * the size of bh_use[].
                         */
                        if (block < start)
                                ra_max = start - block;
                        else
                                ra_max = nblocks - block;
                        ra_max = min(ra_max, ARRAY_SIZE(bh_use));
                        retval = ext4_bread_batch(dir, block, ra_max,
                                                  false /* wait */, bh_use);
                        if (retval) {
                                ret = ERR_PTR(retval);
                                /* Keep the cleanup loop from touching bh_use[]. */
                                ra_max = 0;
                                goto cleanup_and_exit;
                        }
                }
                /* A NULL slot is skipped (presumably an unmapped block). */
                if ((bh = bh_use[ra_ptr++]) == NULL)
                        goto next;
                /* The batch was issued without waiting; wait here. */
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
                        EXT4_ERROR_INODE_ERR(dir, EIO,
                                             "reading directory lblock %lu",
                                             (unsigned long) block);
                        brelse(bh);
                        ret = ERR_PTR(-EIO);
                        goto cleanup_and_exit;
                }
                /*
                 * The checksum is verified only for buffers not already
                 * marked verified and not recognised as htree index nodes.
                 */
                if (!buffer_verified(bh) &&
                    !is_dx_internal_node(dir, block,
                                         (struct ext4_dir_entry *)bh->b_data) &&
                    !ext4_dirblock_csum_verify(dir, bh)) {
                        EXT4_ERROR_INODE_ERR(dir, EFSBADCRC,
                                             "checksumming directory "
                                             "block %lu", (unsigned long)block);
                        brelse(bh);
                        ret = ERR_PTR(-EFSBADCRC);
                        goto cleanup_and_exit;
                }
                set_buffer_verified(bh);
                i = search_dirblock(bh, dir, fname,
                            block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
                if (i == 1) {
                        /* Found: remember the block for the next lookup. */
                        EXT4_I(dir)->i_dir_start_lookup = block;
                        ret = bh;
                        goto cleanup_and_exit;
                } else {
                        brelse(bh);
                        /* A search failure ends the scan; ret is left as is. */
                        if (i < 0)
                                goto cleanup_and_exit;
                }
        next:
                /* Advance, wrapping around to the first block. */
                if (++block >= nblocks)
                        block = 0;
        } while (block != start);

        /*
         * If the directory has grown while we were searching, then
         * search the last part of the directory before giving up.
         */
        block = nblocks;
        nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
        if (block < nblocks) {
                /* Scan from the old end until the block index wraps to 0. */
                start = 0;
                goto restart;
        }

cleanup_and_exit:
        /* Clean up the read-ahead blocks */
        for (; ra_ptr < ra_max; ra_ptr++)
                brelse(bh_use[ra_ptr]);
        return ret;
}

/*
 * Look up @d_name in @dir.
 *
 * Builds an ext4_filename with ext4_fname_setup_filename(), runs
 * __ext4_find_entry() on it and frees the filename again.  A setup
 * failure of -ENOENT is reported as "not found" (NULL); any other setup
 * failure is returned as an ERR_PTR().  Otherwise the result is that of
 * __ext4_find_entry().
 */
static struct buffer_head *ext4_find_entry(struct inode *dir,
                                           const struct qstr *d_name,
                                           struct ext4_dir_entry_2 **res_dir,
                                           int *inlined)
{
        struct ext4_filename fname;
        struct buffer_head *found;
        int err;

        err = ext4_fname_setup_filename(dir, d_name, 1, &fname);
        if (err)
                return err == -ENOENT ? NULL : ERR_PTR(err);

        found = __ext4_find_entry(dir, &fname, res_dir, inlined);
        ext4_fname_free_filename(&fname);

        return found;
}

/*
 * Look up the name of @dentry in @dir.
 *
 * Same as ext4_find_entry(), except that the filename is prepared with
 * ext4_fname_prepare_lookup() from the dentry and no inline-data status
 * is reported.  -ENOENT from the preparation step yields NULL; other
 * preparation failures are returned as an ERR_PTR().
 */
static struct buffer_head *ext4_lookup_entry(struct inode *dir,
                                             struct dentry *dentry,
                                             struct ext4_dir_entry_2 **res_dir)
{
        struct ext4_filename fname;
        struct buffer_head *found;
        int err;

        err = ext4_fname_prepare_lookup(dir, dentry, &fname);
        if (err)
                return err == -ENOENT ? NULL : ERR_PTR(err);

        found = __ext4_find_entry(dir, &fname, res_dir, NULL);
        ext4_fname_free_filename(&fname);

        return found;
}

/*
 * Look up @fname in an indexed directory through the htree.
 *
 * dx_probe() selects the first candidate leaf block; each candidate is
 * read and searched, and ext4_htree_next_block() decides whether a
 * further leaf has to be examined.
 *
 * Returns the buffer containing the entry (with *res_dir pointing into
 * it), NULL if the name was not found, or an ERR_PTR():
 *  - whatever dx_probe() or ext4_read_dirblock() returned,
 *  - ERR_BAD_DX_DIR when searching a leaf block failed,
 *  - the negative result of ext4_htree_next_block().
 */
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
                        struct ext4_filename *fname,
                        struct ext4_dir_entry_2 **res_dir)
{
        struct super_block * sb = dir->i_sb;
        struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
        struct buffer_head *bh;
        ext4_lblk_t block;
        int retval;

#ifdef CONFIG_FS_ENCRYPTION
        *res_dir = NULL;
#endif
        frame = dx_probe(fname, dir, NULL, frames);
        /* On failure nothing is released here; the error is passed on. */
        if (IS_ERR(frame))
                return (struct buffer_head *) frame;
        do {
                block = dx_get_block(frame->at);
                bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
                /* bh already carries the error to return. */
                if (IS_ERR(bh))
                        goto errout;

                retval = search_dirblock(bh, dir, fname,
                                         block << EXT4_BLOCK_SIZE_BITS(sb),
                                         res_dir);
                /* Found: bh is returned with its reference held. */
                if (retval == 1)
                        goto success;
                brelse(bh);
                if (retval == -1) {
                        bh = ERR_PTR(ERR_BAD_DX_DIR);
                        goto errout;
                }

                /* Check to see if we should continue to search */
                retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame,
                                               frames, NULL);
                if (retval < 0) {
                        ext4_warning_inode(dir,
                                "error %d reading directory index block",
                                retval);
                        bh = ERR_PTR(retval);
                        goto errout;
                }
        } while (retval == 1);

        /* No candidate block is left: the name is not in the directory. */
        bh = NULL;
errout:
        /* Reached for errors and, by fall-through, for "not found". */
        dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name));
success:
        /* Every exit after a successful dx_probe() releases the frames. */
        dx_release(frames);
        return bh;
}

/*
 * Look up @dentry in @dir and attach the resulting inode to it.
 *
 * Returns the result of d_splice_alias() for the inode found (or for a
 * NULL inode when there is no such entry), or an ERR_PTR():
 *  -ENAMETOOLONG  the name exceeds EXT4_NAME_LEN
 *  -EFSCORRUPTED  the entry holds an invalid inode number, refers to the
 *                 directory itself, or refers to a deleted inode
 *  -EPERM         @dir is encrypted and the directory or symlink found
 *                 has an encryption context that is not permitted
 * Lookup errors from ext4_lookup_entry() are passed through.  With
 * CONFIG_UNICODE, a miss in a casefolded directory returns NULL so that
 * no negative dentry gets cached.
 */
static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        struct ext4_dir_entry_2 *found_de;
        struct buffer_head *entry_bh;
        struct inode *target = NULL;

        if (dentry->d_name.len > EXT4_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);

        entry_bh = ext4_lookup_entry(dir, dentry, &found_de);
        if (IS_ERR(entry_bh))
                return ERR_CAST(entry_bh);

        if (entry_bh != NULL) {
                /* Only the inode number is needed from the block. */
                __u32 inum = le32_to_cpu(found_de->inode);

                brelse(entry_bh);

                if (!ext4_valid_inum(dir->i_sb, inum)) {
                        EXT4_ERROR_INODE(dir, "bad inode number: %u", inum);
                        return ERR_PTR(-EFSCORRUPTED);
                }

                if (unlikely(inum == dir->i_ino)) {
                        EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
                                         dentry);
                        return ERR_PTR(-EFSCORRUPTED);
                }

                target = ext4_iget(dir->i_sb, inum, EXT4_IGET_NORMAL);
                if (target == ERR_PTR(-ESTALE)) {
                        EXT4_ERROR_INODE(dir,
                                         "deleted inode referenced: %u",
                                         inum);
                        return ERR_PTR(-EFSCORRUPTED);
                }

                /* Other ext4_iget() errors are left to d_splice_alias(). */
                if (!IS_ERR(target) && IS_ENCRYPTED(dir) &&
                    (S_ISDIR(target->i_mode) || S_ISLNK(target->i_mode)) &&
                    !fscrypt_has_permitted_context(dir, target)) {
                        ext4_warning(target->i_sb,
                                     "Inconsistent encryption contexts: %lu/%lu",
                                     dir->i_ino, target->i_ino);
                        iput(target);
                        return ERR_PTR(-EPERM);
                }
        }

#if IS_ENABLED(CONFIG_UNICODE)
        if (!target && IS_CASEFOLDED(dir)) {
                /*
                 * Eventually d_add_ci(dentry, NULL) should be used for
                 * negative dentries in the encoding case as well.  Until
                 * then, keep the negative dentry from being cached.
                 */
                return NULL;
        }
#endif

        return d_splice_alias(target, dentry);
}


struct dentry *ext4_get_parent(struct dentry *child)
{
        __u32 ino;
        struct ext4_dir_entry_2 * de;
        struct buffer_head *bh;

        bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL);
        if (IS_ERR(bh))
                return ERR_CAST(bh);
        if (!bh)
                return ERR_PTR(-ENOENT);
        ino = le32_to_cpu(de->inode);
        brelse(bh);

        if (!ext4_valid_inum(child->d_sb, ino)) {
                EXT4_ERROR_INODE(d_inode(child),
                                 "bad parent inode number: %u", ino);
                return ERR_PTR(-EFSCORRUPTED);
        }

        return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL));
}

/*
 * Move @count directory entries, selected by consecutive @map entries,
 * from block @from to the area starting at @to.
 *
 * Each entry is copied with its minimal record length and packed right
 * behind the previous one.  The source entry is then wiped (inode and
 * everything from name_len onwards), keeping only its rec_len so that
 * the source block can still be walked.
 *
 * Returns pointer to last entry moved (or @to itself if @count is 0).
 */
static struct ext4_dir_entry_2 *
dx_move_dirents(struct inode *dir, char *from, char *to,
                struct dx_map_entry *map, int count,
                unsigned blocksize)
{
        char *last = to;

        for (; count--; map++) {
                /* Map offsets are stored in units of 4 bytes. */
                struct ext4_dir_entry_2 *src =
                        (struct ext4_dir_entry_2 *)(from + (map->offs << 2));
                struct ext4_dir_entry_2 *dst = (struct ext4_dir_entry_2 *)to;
                unsigned int min_len = ext4_dir_rec_len(src->name_len, dir);
                unsigned int old_len;

                memcpy(dst, src, min_len);
                dst->rec_len = ext4_rec_len_to_disk(min_len, blocksize);

                /* Wipe the source entry, excluding its rec_len field. */
                src->inode = 0;
                old_len = ext4_rec_len_from_disk(src->rec_len, blocksize);
                memset(&src->name_len, 0,
                       old_len - offsetof(struct ext4_dir_entry_2, name_len));

                last = to;
                to += min_len;
        }

        return (struct ext4_dir_entry_2 *)last;
}

/*
 * Compact the live entries of the block at @base towards its start.
 *
 * Every entry with a non-zero inode and name_len is moved down behind
 * the previously kept entry and given its minimal rec_len; all other
 * entries are dropped.
 *
 * Returns pointer to the last entry kept (or @base if none was kept).
 */
static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base,
                                                        unsigned int blocksize)
{
        struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) base;
        struct ext4_dir_entry_2 *to = de;
        struct ext4_dir_entry_2 *prev = de;
        struct ext4_dir_entry_2 *next;
        char *end = base + blocksize;
        unsigned int rec_len;

        for (; (char *)de < end; de = next) {
                /* Read the successor first: the move may overwrite de. */
                next = ext4_next_entry(de, blocksize);

                if (!de->inode || !de->name_len)
                        continue;

                rec_len = ext4_dir_rec_len(de->name_len, dir);
                if (de > to)
                        memmove(to, de, rec_len);
                to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);

                prev = to;
                to = (struct ext4_dir_entry_2 *)((char *)to + rec_len);
        }

        return prev;
}

/*
 * Split a full leaf block to make room for a new dir entry.
 * Allocate a new block, and move entries so that they are approx. equally full.
 * Returns pointer to de in block into which the new entry will be inserted.
 *
 * @handle: journal handle the modifications are made under
 * @dir:    directory being modified
 * @bh:     in: the full leaf block; out: the block that should receive the
 *          new entry (this may be the newly appended block)
 * @frame:  index frame whose ->at entry points at the leaf being split
 * @hinfo:  hash of the name that is about to be inserted
 *
 * The entries with the highest hash values are moved to the new block and
 * an index entry for that block is inserted after frame->at.
 *
 * On failure an ERR_PTR() is returned, *bh has been released and set to
 * NULL.
 */
static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
                        struct buffer_head **bh,struct dx_frame *frame,
                        struct dx_hash_info *hinfo)
{
        unsigned blocksize = dir->i_sb->s_blocksize;
        unsigned continued;
        int count;
        struct buffer_head *bh2;
        ext4_lblk_t newblock;
        u32 hash2;
        struct dx_map_entry *map;
        char *data1 = (*bh)->b_data, *data2;
        unsigned split, move, size;
        struct ext4_dir_entry_2 *de = NULL, *de2;
        int        csum_size = 0;
        int        err = 0, i;

        if (ext4_has_metadata_csum(dir->i_sb))
                csum_size = sizeof(struct ext4_dir_entry_tail);

        bh2 = ext4_append(handle, dir, &newblock);
        if (IS_ERR(bh2)) {
                brelse(*bh);
                *bh = NULL;
                return (struct ext4_dir_entry_2 *) bh2;
        }

        BUFFER_TRACE(*bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, dir->i_sb, *bh,
                                            EXT4_JTR_NONE);
        if (err)
                goto journal_error;

        BUFFER_TRACE(frame->bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh,
                                            EXT4_JTR_NONE);
        if (err)
                goto journal_error;

        data2 = bh2->b_data;

        /* create map in the end of data2 block */
        map = (struct dx_map_entry *) (data2 + blocksize);
        count = dx_make_map(dir, *bh, hinfo, map);
        if (count < 0) {
                err = count;
                goto journal_error;
        }
        /* dx_make_map() filled the map downwards; point at its first entry */
        map -= count;
        dx_sort_map(map, count);
        /* Ensure that neither split block is over half full */
        size = 0;
        move = 0;
        for (i = count-1; i >= 0; i--) {
                /* is more than half of this entry in 2nd half of the block? */
                if (size + map[i].size/2 > blocksize/2)
                        break;
                size += map[i].size;
                move++;
        }
        /*
         * map index at which we will split
         *
         * The loop above leaves i == -1 only when it ran to completion,
         * i.e. when the sum of active entries didn't exceed half the block
         * size.  In that case just split it in half by count; each
         * resulting block will have at least half the space free.
         *
         * Any i >= 0 means the loop stopped at a real split point, and that
         * includes i == 0.  Testing "i > 0" here would discard a split
         * point found at the very first map entry and fall back to
         * count/2, which can leave the block that receives the new entry
         * without enough room for it.
         */
        if (i >= 0)
                split = count - move;
        else
                split = count/2;

        hash2 = map[split].hash;
        /* does the run of equal hashes continue across the split point? */
        continued = hash2 == map[split - 1].hash;
        dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
                        (unsigned long)dx_get_block(frame->at),
                                        hash2, split, count-split));

        /* Fancy dance to stay within two buffers */
        de2 = dx_move_dirents(dir, data1, data2, map + split, count - split,
                              blocksize);
        de = dx_pack_dirents(dir, data1, blocksize);
        /* the last entry of each block absorbs the remaining free space */
        de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
                                           (char *) de,
                                           blocksize);
        de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
                                            (char *) de2,
                                            blocksize);
        if (csum_size) {
                ext4_initialize_dirent_tail(*bh, blocksize);
                ext4_initialize_dirent_tail(bh2, blocksize);
        }

        dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1,
                        blocksize, 1));
        dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2,
                        blocksize, 1));

        /* Which block gets the new entry? */
        if (hinfo->hash >= hash2) {
                swap(*bh, bh2);
                de = de2;
        }
        /* the low bit of the stored hash carries the "continued" flag */
        dx_insert_block(frame, hash2 + continued, newblock);
        /* bh2 is now the block that does NOT receive the new entry */
        err = ext4_handle_dirty_dirblock(handle, dir, bh2);
        if (err)
                goto journal_error;
        err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
        if (err)
                goto journal_error;
        brelse(bh2);
        dxtrace(dx_show_index("frame", frame->entries));
        return de;

journal_error:
        brelse(*bh);
        brelse(bh2);
        *bh = NULL;
        ext4_std_error(dir->i_sb, err);
        return ERR_PTR(err);
}

/*
 * Find a slot in a directory buffer that can take a new entry for @fname.
 *
 * Walks the entries in @buf (@buf_size bytes).  A slot is suitable when
 * it is unused and at least as long as the new record, or in use but with
 * enough slack behind its own minimal record length.  @inode is not used
 * by this function.
 *
 * Returns 0 and stores the slot in *dest_de, or
 *  -EFSCORRUPTED  an entry failed ext4_check_dir_entry()
 *  -EEXIST        an entry matching @fname already exists
 *  -ENOSPC        no suitable slot was found
 */
int ext4_find_dest_de(struct inode *dir, struct inode *inode,
                      struct buffer_head *bh,
                      void *buf, int buf_size,
                      struct ext4_filename *fname,
                      struct ext4_dir_entry_2 **dest_de)
{
        unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir);
        /* Last position at which a record of reclen bytes still fits. */
        char *top = (char *)buf + buf_size - reclen;
        struct ext4_dir_entry_2 *de = buf;
        unsigned int offset = 0;

        while ((char *)de <= top) {
                int nlen, rlen, avail;

                if (ext4_check_dir_entry(dir, NULL, de, bh,
                                         buf, buf_size, offset))
                        return -EFSCORRUPTED;
                if (ext4_match(dir, fname, de))
                        return -EEXIST;

                nlen = ext4_dir_rec_len(de->name_len, dir);
                rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);

                /* A live entry only offers the slack behind its name. */
                avail = de->inode ? rlen - nlen : rlen;
                if (avail >= reclen) {
                        *dest_de = de;
                        return 0;
                }

                de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
                offset += rlen;
        }

        return -ENOSPC;
}

/*
 * Write a new directory entry for @inode, named by @fname, into the slot
 * at @de.
 *
 * If @de is in use (non-zero inode), it is trimmed to its minimal record
 * length and the new entry is placed in the space behind it, taking over
 * the remainder of the old record.  Otherwise @de itself is filled in.
 * When the directory stores hashes in its entries, the hash and minor
 * hash from fname->hinfo are written as well.
 *
 * The caller must have made sure that the slot is large enough.
 */
void ext4_insert_dentry(struct inode *dir,
                        struct inode *inode,
                        struct ext4_dir_entry_2 *de,
                        int buf_size,
                        struct ext4_filename *fname)
{
        int used_len = ext4_dir_rec_len(de->name_len, dir);
        int slot_len = ext4_rec_len_from_disk(de->rec_len, buf_size);

        if (de->inode) {
                /* Split the record: keep the old entry, use the tail. */
                struct ext4_dir_entry_2 *tail =
                        (struct ext4_dir_entry_2 *)((char *)de + used_len);

                tail->rec_len = ext4_rec_len_to_disk(slot_len - used_len,
                                                     buf_size);
                de->rec_len = ext4_rec_len_to_disk(used_len, buf_size);
                de = tail;
        }

        de->file_type = EXT4_FT_UNKNOWN;
        de->inode = cpu_to_le32(inode->i_ino);
        ext4_set_de_type(inode->i_sb, de, inode->i_mode);
        de->name_len = fname_len(fname);
        memcpy(de->name, fname_name(fname), fname_len(fname));

        if (ext4_hash_in_dirent(dir)) {
                struct dx_hash_info *hinfo = &fname->hinfo;

                EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash);
                EXT4_DIRENT_HASHES(de)->minor_hash =
                                                cpu_to_le32(hinfo->minor_hash);
        }
}

/*
 * Add a new entry into a directory (leaf) block.  If de is non-NULL,
 * it points to a directory entry which is guaranteed to be large
 * enough for the new directory entry.  If de is NULL, then
 * add_dirent_to_buf will attempt to search the directory block for
 * space.  It will return -ENOSPC if no space is available, and -EIO
 * and -EEXIST if the directory entry already exists.
 */
static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
                             struct inode *dir,
                             struct inode *inode, struct ext4_dir_entry_2 *de,
                             struct buffer_head *bh)
{
        unsigned int        blocksize = dir->i_sb->s_blocksize;
        int                csum_size = 0;
        int                err, err2;

        if (ext4_has_metadata_csum(inode->i_sb))
                csum_size = sizeof(struct ext4_dir_entry_tail);

        if (!de) {
                err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
                                        blocksize - csum_size, fname, &de);
                if (err)
                        return err;
        }
        BUFFER_TRACE(bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
                                            EXT4_JTR_NONE);
        if (err) {
                ext4_std_error(dir->i_sb, err);
                return err;
        }

        /* By now the buffer is marked for journaling */
        ext4_insert_dentry(dir, inode, de, blocksize, fname);

        /*
         * XXX shouldn't update any times until successful
         * completion of syscall, but too many callers depend
         * on this.
         *
         * XXX similarly, too many callers depend on
         * ext4_new_inode() setting the times, but error
         * recovery deletes the inode, so the worst that can
         * happen is that the times are slightly out of date
         * and/or different from the directory change time.
         */
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        ext4_update_dx_flag(dir);
        inode_inc_iversion(dir);
        err2 = ext4_mark_inode_dirty(handle, dir);
        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_dirblock(handle, dir, bh);
        if (err)
                ext4_std_error(dir->i_sb, err);
        return err ? err : err2;
}

/*
 * This converts a one block unindexed directory to a 3 block indexed
 * directory, and adds the dentry to the indexed directory.
 *
 * @handle: journal handle the modifications are made under
 * @fname:  name being added; its hinfo is (re)initialised here
 * @dir:    directory being converted
 * @inode:  inode the new entry refers to
 * @bh:     buffer of the directory's only block (block 0).  The reference
 *          is consumed: early error paths brelse() it directly, later
 *          paths store it in frames[0] and hand it to dx_release().
 *
 * Returns 0 on success or a negative error code.
 */
static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
                            struct inode *dir,
                            struct inode *inode, struct buffer_head *bh)
{
        struct buffer_head *bh2;
        struct dx_root        *root;
        struct dx_frame        frames[EXT4_HTREE_LEVEL], *frame;
        struct dx_entry *entries;
        struct ext4_dir_entry_2        *de, *de2;
        char                *data2, *top;
        unsigned        len;
        int                retval;
        unsigned        blocksize;
        ext4_lblk_t  block;
        struct fake_dirent *fde;
        int csum_size = 0;

        /* Space at the end of each leaf block reserved for the checksum tail */
        if (ext4_has_metadata_csum(inode->i_sb))
                csum_size = sizeof(struct ext4_dir_entry_tail);

        blocksize =  dir->i_sb->s_blocksize;
        dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
        BUFFER_TRACE(bh, "get_write_access");
        retval = ext4_journal_get_write_access(handle, dir->i_sb, bh,
                                               EXT4_JTR_NONE);
        if (retval) {
                ext4_std_error(dir->i_sb, retval);
                brelse(bh);
                return retval;
        }
        root = (struct dx_root *) bh->b_data;

        /* The 0th block becomes the root, move the dirents out */
        fde = &root->dotdot;
        /* de = first entry after '..', located via the on-disk rec_len of '..' */
        de = (struct ext4_dir_entry_2 *)((char *)fde +
                ext4_rec_len_from_disk(fde->rec_len, blocksize));
        /* A rec_len that points at or past the end of the block is corruption */
        if ((char *) de >= (((char *) root) + blocksize)) {
                EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
                brelse(bh);
                return -EFSCORRUPTED;
        }
        /* Number of bytes of entries to move (everything up to the csum tail) */
        len = ((char *) root) + (blocksize - csum_size) - (char *) de;

        /* Allocate new block for the 0th block's dirents */
        bh2 = ext4_append(handle, dir, &block);
        if (IS_ERR(bh2)) {
                brelse(bh);
                return PTR_ERR(bh2);
        }
        ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
        data2 = bh2->b_data;

        memcpy(data2, de, len);
        memset(de, 0, len); /* wipe old data */
        /* Walk the copied entries, validating each one; stop on the last */
        de = (struct ext4_dir_entry_2 *) data2;
        top = data2 + len;
        while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
                if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
                                        (char *)de - data2)) {
                        brelse(bh2);
                        brelse(bh);
                        return -EFSCORRUPTED;
                }
                de = de2;
        }
        /* Stretch the last entry so it covers the rest of the new block */
        de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
                                           (char *) de, blocksize);

        if (csum_size)
                ext4_initialize_dirent_tail(bh2, blocksize);

        /* Initialize the root; the dot dirents already exist */
        de = (struct ext4_dir_entry_2 *) (&root->dotdot);
        /* '..' now spans the whole block except the space taken by '.' */
        de->rec_len = ext4_rec_len_to_disk(
                        blocksize - ext4_dir_rec_len(2, NULL), blocksize);
        memset (&root->info, 0, sizeof(root->info));
        root->info.info_length = sizeof(root->info);
        if (ext4_hash_in_dirent(dir))
                root->info.hash_version = DX_HASH_SIPHASH;
        else
                root->info.hash_version =
                                EXT4_SB(dir->i_sb)->s_def_hash_version;

        /* The root starts with a single index entry pointing at block 1 */
        entries = root->entries;
        dx_set_block(entries, 1);
        dx_set_count(entries, 1);
        dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));

        /* Initialize as for dx_probe */
        fname->hinfo.hash_version = root->info.hash_version;
        if (fname->hinfo.hash_version <= DX_HASH_TEA)
                fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
        fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;

        /* casefolded encrypted hashes are computed on fname setup */
        if (!ext4_hash_in_dirent(dir)) {
                int err = ext4fs_dirhash(dir, fname_name(fname),
                                         fname_len(fname), &fname->hinfo);
                if (err < 0) {
                        brelse(bh2);
                        brelse(bh);
                        return err;
                }
        }
        /*
         * Build a one-level frame path by hand.  From here on bh is owned
         * by frames[0] and is handed to dx_release() at out_frames.
         */
        memset(frames, 0, sizeof(frames));
        frame = frames;
        frame->entries = entries;
        frame->at = entries;
        frame->bh = bh;

        retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
        if (retval)
                goto out_frames;
        retval = ext4_handle_dirty_dirblock(handle, dir, bh2);
        if (retval)
                goto out_frames;

        /* Split the moved entries; do_split() may replace bh2 */
        de = do_split(handle,dir, &bh2, frame, &fname->hinfo);
        if (IS_ERR(de)) {
                retval = PTR_ERR(de);
                goto out_frames;
        }

        retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2);
out_frames:
        /*
         * Even if the block split failed, we have to properly write
         * out all the changes we did so far. Otherwise we can end up
         * with corrupted filesystem.
         */
        if (retval)
                ext4_mark_inode_dirty(handle, dir);
        dx_release(frames);
        brelse(bh2);
        return retval;
}

/*
 *        ext4_add_entry()
 *
 * Adds an entry for @inode, named by @dentry, to the parent directory of
 * @dentry.
 *
 * The strategies are tried in this order: inline data (if the directory
 * has it), the htree index (if the directory is indexed), a linear scan
 * of the existing blocks, conversion of a full single-block directory
 * into an indexed one, and finally appending a fresh block.
 *
 * Returns 0 on success or a negative error code.  On success the inode
 * is flagged with EXT4_STATE_NEWENTRY.  The new entry is completely
 * filled in (including its inode number) by add_dirent_to_buf() in the
 * paths visible here.
 */
static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                          struct inode *inode)
{
        struct inode *dir = d_inode(dentry->d_parent);
        struct buffer_head *bh = NULL;
        struct ext4_dir_entry_2 *de;
        struct super_block *sb;
        struct ext4_filename fname;
        int        retval;
        int        dx_fallback=0;
        unsigned blocksize;
        ext4_lblk_t block, blocks;
        int        csum_size = 0;

        if (ext4_has_metadata_csum(inode->i_sb))
                csum_size = sizeof(struct ext4_dir_entry_tail);

        sb = dir->i_sb;
        blocksize = sb->s_blocksize;

        /* Refuse to create an entry from a no-key (encrypted) name */
        if (fscrypt_is_nokey_name(dentry))
                return -ENOKEY;

#if IS_ENABLED(CONFIG_UNICODE)
        /* Strict-encoding casefolded directories reject invalid UTF-8 names */
        if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
            utf8_validate(sb->s_encoding, &dentry->d_name))
                return -EINVAL;
#endif

        /* fname must be released with ext4_fname_free_filename() (see out:) */
        retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
        if (retval)
                return retval;

        if (ext4_has_inline_data(dir)) {
                retval = ext4_try_add_inline_entry(handle, &fname, dir, inode);
                if (retval < 0)
                        goto out;
                /* 1 means the entry was added inline; report success */
                if (retval == 1) {
                        retval = 0;
                        goto out;
                }
        }

        if (is_dx(dir)) {
                retval = ext4_dx_add_entry(handle, &fname, dir, inode);
                /* Anything but a bad index (success or other error) is final */
                if (!retval || (retval != ERR_BAD_DX_DIR))
                        goto out;
                /* Can we just ignore htree data? */
                if (ext4_has_metadata_csum(sb)) {
                        EXT4_ERROR_INODE(dir,
                                "Directory has corrupted htree index.");
                        retval = -EFSCORRUPTED;
                        goto out;
                }
                /* Drop the index flag and fall back to a linear directory */
                ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
                dx_fallback++;
                retval = ext4_mark_inode_dirty(handle, dir);
                if (unlikely(retval))
                        goto out;
        }
        /* Linear scan: try every existing block for free space */
        blocks = dir->i_size >> sb->s_blocksize_bits;
        for (block = 0; block < blocks; block++) {
                bh = ext4_read_dirblock(dir, block, DIRENT);
                if (bh == NULL) {
                        /* No buffer for this block: create it and fill it below */
                        bh = ext4_bread(handle, dir, block,
                                        EXT4_GET_BLOCKS_CREATE);
                        goto add_to_new_block;
                }
                if (IS_ERR(bh)) {
                        retval = PTR_ERR(bh);
                        bh = NULL;
                        goto out;
                }
                retval = add_dirent_to_buf(handle, &fname, dir, inode,
                                           NULL, bh);
                /* Only a full block lets us continue with the next one */
                if (retval != -ENOSPC)
                        goto out;

                /*
                 * A full single-block directory is converted to an indexed
                 * one, unless we just fell back from a broken index or the
                 * filesystem lacks the dir_index feature.
                 */
                if (blocks == 1 && !dx_fallback &&
                    ext4_has_feature_dir_index(sb)) {
                        retval = make_indexed_dir(handle, &fname, dir,
                                                  inode, bh);
                        bh = NULL; /* make_indexed_dir releases bh */
                        goto out;
                }
                brelse(bh);
        }
        /* Every existing block is full: grow the directory by one block */
        bh = ext4_append(handle, dir, &block);
add_to_new_block:
        if (IS_ERR(bh)) {
                retval = PTR_ERR(bh);
                bh = NULL;
                goto out;
        }
        /* Format the block as one empty entry covering all usable space */
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        de->inode = 0;
        de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);

        if (csum_size)
                ext4_initialize_dirent_tail(bh, blocksize);

        retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh);
out:
        ext4_fname_free_filename(&fname);
        brelse(bh);
        if (retval == 0)
                ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
        return retval;
}

/*
 * Add an entry for @inode, named by @fname, to the htree-indexed
 * directory @dir.
 *
 * The leaf block selected by dx_probe() is tried first.  If it is full
 * it is split with do_split().  If the index block that would receive
 * the new leaf pointer is itself full, that index block is split (or a
 * new index level is added) first and the lookup is restarted, because
 * the path stored in @frames is no longer valid.
 *
 * The buffer returned by ext4_append() for a new index block (bh2) is
 * released on every path, including the error paths.
 *
 * Returns 0 for success, or a negative error value
 */
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
                             struct inode *dir, struct inode *inode)
{
        struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
        struct dx_entry *entries, *at;
        struct buffer_head *bh;
        struct super_block *sb = dir->i_sb;
        struct ext4_dir_entry_2 *de;
        int restart;
        int err;

again:
        restart = 0;
        frame = dx_probe(fname, dir, NULL, frames);
        if (IS_ERR(frame))
                return PTR_ERR(frame);
        entries = frame->entries;
        at = frame->at;
        bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT_HTREE);
        if (IS_ERR(bh)) {
                err = PTR_ERR(bh);
                bh = NULL;
                goto cleanup;
        }

        BUFFER_TRACE(bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE);
        if (err)
                goto journal_error;

        /* Fast path: the leaf still has room for the new entry */
        err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh);
        if (err != -ENOSPC)
                goto cleanup;

        err = 0;
        /* Block full, should compress but for now just split */
        dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
                       dx_get_count(entries), dx_get_limit(entries)));
        /* Need to split index? */
        if (dx_get_count(entries) == dx_get_limit(entries)) {
                ext4_lblk_t newblock;
                int levels = frame - frames + 1;
                unsigned int icount;
                int add_level = 1;
                struct dx_entry *entries2;
                struct dx_node *node2;
                struct buffer_head *bh2;

                /*
                 * Walk up until an index block with a free slot is found.
                 * If there is none, a new level has to be added.
                 */
                while (frame > frames) {
                        if (dx_get_count((frame - 1)->entries) <
                            dx_get_limit((frame - 1)->entries)) {
                                add_level = 0;
                                break;
                        }
                        frame--; /* split higher index block */
                        at = frame->at;
                        entries = frame->entries;
                        restart = 1;
                }
                if (add_level && levels == ext4_dir_htree_level(sb)) {
                        ext4_warning(sb, "Directory (ino: %lu) index full, "
                                         "reach max htree level :%d",
                                         dir->i_ino, levels);
                        if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
                                ext4_warning(sb, "Large directory feature is "
                                                 "not enabled on this "
                                                 "filesystem");
                        }
                        err = -ENOSPC;
                        goto cleanup;
                }
                icount = dx_get_count(entries);
                bh2 = ext4_append(handle, dir, &newblock);
                if (IS_ERR(bh2)) {
                        err = PTR_ERR(bh2);
                        goto cleanup;
                }
                /* The new index block looks like one empty dirent to old code */
                node2 = (struct dx_node *)(bh2->b_data);
                entries2 = node2->entries;
                memset(&node2->fake, 0, sizeof(struct fake_dirent));
                node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
                                                           sb->s_blocksize);
                BUFFER_TRACE(frame->bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, sb, frame->bh,
                                                    EXT4_JTR_NONE);
                if (err) {
                        /* bh2 is not tracked in frames[]; drop it here */
                        brelse(bh2);
                        goto journal_error;
                }
                if (!add_level) {
                        unsigned icount1 = icount/2, icount2 = icount - icount1;
                        unsigned hash2 = dx_get_hash(entries + icount1);
                        dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
                                       icount1, icount2));

                        BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
                        err = ext4_journal_get_write_access(handle, sb,
                                                            (frame - 1)->bh,
                                                            EXT4_JTR_NONE);
                        if (err) {
                                brelse(bh2);
                                goto journal_error;
                        }

                        /* Move the upper half of the entries to the new block */
                        memcpy((char *) entries2, (char *) (entries + icount1),
                               icount2 * sizeof(struct dx_entry));
                        dx_set_count(entries, icount1);
                        dx_set_count(entries2, icount2);
                        dx_set_limit(entries2, dx_node_limit(dir));

                        /* Which index block gets the new entry? */
                        if (at - entries >= icount1) {
                                frame->at = at - entries - icount1 + entries2;
                                frame->entries = entries = entries2;
                                /* frames[] now owns the new block, bh2 the old */
                                swap(frame->bh, bh2);
                        }
                        dx_insert_block((frame - 1), hash2, newblock);
                        dxtrace(dx_show_index("node", frame->entries));
                        dxtrace(dx_show_index("node",
                               ((struct dx_node *) bh2->b_data)->entries));
                        err = ext4_handle_dirty_dx_node(handle, dir, bh2);
                        /* Release bh2 whether or not dirtying succeeded */
                        brelse(bh2);
                        if (err)
                                goto journal_error;
                        err = ext4_handle_dirty_dx_node(handle, dir,
                                                   (frame - 1)->bh);
                        if (err)
                                goto journal_error;
                        err = ext4_handle_dirty_dx_node(handle, dir,
                                                        frame->bh);
                        if (restart || err)
                                goto journal_error;
                } else {
                        struct dx_root *dxroot;
                        /* Push all root entries down into the new block */
                        memcpy((char *) entries2, (char *) entries,
                               icount * sizeof(struct dx_entry));
                        dx_set_limit(entries2, dx_node_limit(dir));

                        /* Set up root */
                        dx_set_count(entries, 1);
                        dx_set_block(entries + 0, newblock);
                        dxroot = (struct dx_root *)frames[0].bh->b_data;
                        dxroot->info.indirect_levels += 1;
                        dxtrace(printk(KERN_DEBUG
                                       "Creating %d level index...\n",
                                       dxroot->info.indirect_levels));
                        err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
                        if (err) {
                                brelse(bh2);
                                goto journal_error;
                        }
                        err = ext4_handle_dirty_dx_node(handle, dir, bh2);
                        brelse(bh2);
                        restart = 1;
                        goto journal_error;
                }
        }
        de = do_split(handle, dir, &bh, frame, &fname->hinfo);
        if (IS_ERR(de)) {
                err = PTR_ERR(de);
                goto cleanup;
        }
        err = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
        goto cleanup;

journal_error:
        ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
cleanup:
        brelse(bh);
        dx_release(frames);
        /* @restart is true means htree-path has been changed, we need to
         * repeat dx_probe() to find out valid htree-path
         */
        if (restart && err == 0)
                goto again;
        return err;
}

/*
 * ext4_generic_delete_entry removes @de_del from the entries stored in
 * @entry_buf.
 *
 * The buffer is walked from the start, validating every entry on the way.
 * When @de_del is reached it is merged into the entry in front of it and
 * wiped; if it is the first entry of the buffer it is only marked unused
 * (inode set to 0) and everything after its rec_len field is cleared.
 *
 * Returns 0 on success, -EFSCORRUPTED if an invalid entry is encountered
 * and -ENOENT if @de_del is not found within @buf_size - @csum_size bytes.
 */
int ext4_generic_delete_entry(struct inode *dir,
                              struct ext4_dir_entry_2 *de_del,
                              struct buffer_head *bh,
                              void *entry_buf,
                              int buf_size,
                              int csum_size)
{
        unsigned int blocksize = dir->i_sb->s_blocksize;
        struct ext4_dir_entry_2 *prev = NULL;
        struct ext4_dir_entry_2 *cur = entry_buf;
        int offset = 0;

        while (offset < buf_size - csum_size) {
                unsigned int cur_len;

                if (ext4_check_dir_entry(dir, NULL, cur, bh,
                                         entry_buf, buf_size, offset))
                        return -EFSCORRUPTED;

                cur_len = ext4_rec_len_from_disk(cur->rec_len, blocksize);

                if (cur != de_del) {
                        /* Not the victim: remember it and step forward. */
                        offset += cur_len;
                        prev = cur;
                        cur = ext4_next_entry(cur, blocksize);
                        continue;
                }

                if (prev) {
                        /* Let the previous entry swallow the deleted one. */
                        prev->rec_len = ext4_rec_len_to_disk(
                                ext4_rec_len_from_disk(prev->rec_len,
                                                       blocksize) +
                                cur_len,
                                blocksize);

                        /* wipe entire dir_entry */
                        memset(cur, 0, cur_len);
                } else {
                        /* wipe dir_entry excluding the rec_len field */
                        cur->inode = 0;
                        memset(&cur->name_len, 0,
                               cur_len - offsetof(struct ext4_dir_entry_2,
                                                  name_len));
                }

                inode_inc_iversion(dir);
                return 0;
        }

        return -ENOENT;
}

/*
 * Remove the entry @de_del, located in @bh, from directory @dir under
 * the journal handle @handle.
 *
 * Inline-data directories are handed to ext4_delete_inline_entry() first;
 * the block-based path below is only used when that helper reports that
 * the directory no longer has inline data.
 *
 * Returns 0 on success or a negative error.  Every error except -ENOENT
 * is reported through ext4_std_error().
 */
static int ext4_delete_entry(handle_t *handle,
                             struct inode *dir,
                             struct ext4_dir_entry_2 *de_del,
                             struct buffer_head *bh)
{
        int csum_size = 0;
        int err;

        if (ext4_has_inline_data(dir)) {
                int still_inline = 1;

                err = ext4_delete_inline_entry(handle, dir, de_del, bh,
                                               &still_inline);
                if (still_inline)
                        return err;
        }

        if (ext4_has_metadata_csum(dir->i_sb))
                csum_size = sizeof(struct ext4_dir_entry_tail);

        BUFFER_TRACE(bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, dir->i_sb, bh,
                                            EXT4_JTR_NONE);
        if (unlikely(err))
                goto report;

        err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data,
                                        dir->i_sb->s_blocksize, csum_size);
        if (err)
                goto report;

        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_dirblock(handle, dir, bh);
        if (likely(!err))
                return 0;

report:
        /* A missing entry is not worth a filesystem error report. */
        if (err != -ENOENT)
                ext4_std_error(dir->i_sb, err);
        return err;
}

/*
 * Increment the link count of @inode.
 *
 * For indexed directories the count is pinned to 1 once it would exceed
 * EXT4_LINK_MAX, or when it becomes 2 (which means it was 1 before, i.e.
 * already pinned).  This avoids overflowing the 16-bit i_links_count
 * field on disk.  A directory with i_nlink == 1 is one whose subdirectory
 * link count is not being maintained accurately.
 *
 * The caller has already checked for i_nlink overflow and returned
 * -EMLINK when the DIR_LINK feature is not enabled.  The is_dx() test is
 * a proxy for S_ISDIR(inode), since the INODE_INDEX flag is never set on
 * regular files, and it also avoids creating huge, slow non-HTREE
 * directories.
 */
static void ext4_inc_count(struct inode *inode)
{
        inc_nlink(inode);

        if (!is_dx(inode))
                return;

        if (inode->i_nlink == 2 || inode->i_nlink > EXT4_LINK_MAX)
                set_nlink(inode, 1);
}

/*
 * Drop one link from @inode, except for a directory whose link count is
 * already 2 or less.  A directory with nlink == 1 has more than
 * EXT4_LINK_MAX subdirectories and has to stay at 1.
 */
static void ext4_dec_count(struct inode *inode)
{
        if (S_ISDIR(inode->i_mode) && inode->i_nlink <= 2)
                return;

        drop_nlink(inode);
}


/*
 * Add a non-directory inode to a directory.
 *
 * On success the inode reference is consumed by the dentry instantiation,
 * which is signalled to the caller by setting *inodep to NULL.  On failure
 * the link count is dropped, the inode is put on the orphan list and
 * unlocked; the caller is then responsible for dropping the inode
 * reference in a safe context.
 *
 * Returns 0 or a negative error.  After a successful ext4_add_entry() the
 * result of ext4_mark_inode_dirty() is returned, but the dentry is
 * instantiated regardless.
 */
static int ext4_add_nondir(handle_t *handle,
                struct dentry *dentry, struct inode **inodep)
{
        struct inode *parent = d_inode(dentry->d_parent);
        struct inode *inode = *inodep;
        int err;

        err = ext4_add_entry(handle, dentry, inode);
        if (err) {
                /* Undo: the inode is left for orphan processing. */
                drop_nlink(inode);
                ext4_mark_inode_dirty(handle, inode);
                ext4_orphan_add(handle, inode);
                unlock_new_inode(inode);
                return err;
        }

        err = ext4_mark_inode_dirty(handle, inode);
        if (IS_DIRSYNC(parent))
                ext4_handle_sync(handle);
        d_instantiate_new(dentry, inode);
        *inodep = NULL;
        return err;
}

/*
 * Create a regular file.
 *
 * By the time this is called the directory cache entry for the new file
 * already exists, but it is still negative - it has no inode.  If the
 * create succeeds, ext4_add_nondir() attaches the new inode to it.
 *
 * The whole operation is repeated while it fails with -ENOSPC and
 * ext4_should_retry_alloc() says a retry is worthwhile.
 */
static int ext4_create(struct mnt_idmap *idmap, struct inode *dir,
                       struct dentry *dentry, umode_t mode, bool excl)
{
        struct inode *inode;
        handle_t *handle;
        int retries = 0;
        int credits;
        int err;

        err = dquot_initialize(dir);
        if (err)
                return err;

        credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);

        do {
                inode = ext4_new_inode_start_handle(idmap, dir, mode,
                                                    &dentry->d_name, 0, NULL,
                                                    EXT4_HT_DIR, credits);
                handle = ext4_journal_current_handle();
                if (IS_ERR(inode)) {
                        err = PTR_ERR(inode);
                } else {
                        inode->i_op = &ext4_file_inode_operations;
                        inode->i_fop = &ext4_file_operations;
                        ext4_set_aops(inode);
                        err = ext4_add_nondir(handle, dentry, &inode);
                        if (!err)
                                ext4_fc_track_create(handle, dentry);
                }

                if (handle)
                        ext4_journal_stop(handle);
                /* inode is NULL once the dentry has taken over the reference */
                if (!IS_ERR_OR_NULL(inode))
                        iput(inode);
        } while (err == -ENOSPC &&
                 ext4_should_retry_alloc(dir->i_sb, &retries));

        return err;
}

/*
 * Create a special file (device node, FIFO, socket) named by @dentry in
 * @dir.  @rdev is passed to init_special_inode().
 *
 * The whole operation is repeated while it fails with -ENOSPC and
 * ext4_should_retry_alloc() says a retry is worthwhile.
 */
static int ext4_mknod(struct mnt_idmap *idmap, struct inode *dir,
                      struct dentry *dentry, umode_t mode, dev_t rdev)
{
        struct inode *inode;
        handle_t *handle;
        int retries = 0;
        int credits;
        int err;

        err = dquot_initialize(dir);
        if (err)
                return err;

        credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);

        do {
                inode = ext4_new_inode_start_handle(idmap, dir, mode,
                                                    &dentry->d_name, 0, NULL,
                                                    EXT4_HT_DIR, credits);
                handle = ext4_journal_current_handle();
                if (IS_ERR(inode)) {
                        err = PTR_ERR(inode);
                } else {
                        init_special_inode(inode, inode->i_mode, rdev);
                        inode->i_op = &ext4_special_inode_operations;
                        err = ext4_add_nondir(handle, dentry, &inode);
                        if (!err)
                                ext4_fc_track_create(handle, dentry);
                }

                if (handle)
                        ext4_journal_stop(handle);
                /* inode is NULL once the dentry has taken over the reference */
                if (!IS_ERR_OR_NULL(inode))
                        iput(inode);
        } while (err == -ENOSPC &&
                 ext4_should_retry_alloc(dir->i_sb, &retries));

        return err;
}

/*
 * Create an unnamed temporary file in @dir and attach it to @file.
 *
 * The new inode is bound to @file with d_tmpfile() and then added to the
 * orphan list.  If adding it to the orphan list fails, the handle is
 * stopped, the inode unlocked and the error returned directly.  In all
 * other cases the result goes through finish_open_simple().
 *
 * Inode allocation is repeated while it fails with -ENOSPC and
 * ext4_should_retry_alloc() says a retry is worthwhile.
 */
static int ext4_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
                        struct file *file, umode_t mode)
{
        struct inode *inode;
        handle_t *handle;
        int retries = 0;
        int err;

        err = dquot_initialize(dir);
        if (err)
                return err;

        do {
                inode = ext4_new_inode_start_handle(idmap, dir, mode,
                                NULL, 0, NULL, EXT4_HT_DIR,
                                EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) +
                                  4 + EXT4_XATTR_TRANS_BLOCKS);
                handle = ext4_journal_current_handle();
                if (IS_ERR(inode)) {
                        err = PTR_ERR(inode);
                } else {
                        inode->i_op = &ext4_file_inode_operations;
                        inode->i_fop = &ext4_file_operations;
                        ext4_set_aops(inode);
                        d_tmpfile(file, inode);
                        err = ext4_orphan_add(handle, inode);
                        if (err) {
                                ext4_journal_stop(handle);
                                unlock_new_inode(inode);
                                return err;
                        }
                        mark_inode_dirty(inode);
                        unlock_new_inode(inode);
                }

                if (handle)
                        ext4_journal_stop(handle);
        } while (err == -ENOSPC &&
                 ext4_should_retry_alloc(dir->i_sb, &retries));

        return finish_open_simple(file, err);
}

/*
 * Write the "." and ".." entries of directory @inode starting at @de.
 *
 * @inode:           directory the entries belong to ("." points at it)
 * @de:              location of the first entry
 * @blocksize:       passed to the rec_len conversion helpers
 * @csum_size:       bytes reserved at the end of the block; only used
 *                   when @dotdot_real_len is 0
 * @parent_ino:      inode number stored in ".."
 * @dotdot_real_len: if 0, ".." is sized to span the rest of the block
 *                   (minus @csum_size); otherwise ".." only gets the
 *                   record length its two-character name needs
 *
 * Returns the position following "..", as computed by ext4_next_entry().
 */
struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
                          struct ext4_dir_entry_2 *de,
                          int blocksize, int csum_size,
                          unsigned int parent_ino, int dotdot_real_len)
{
        de->inode = cpu_to_le32(inode->i_ino);
        de->name_len = 1;
        de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL),
                                           blocksize);
        /*
         * strcpy() is deprecated in the kernel; copy the same bytes
         * (name plus terminating NUL) with an explicit length instead.
         */
        memcpy(de->name, ".", 2);
        ext4_set_de_type(inode->i_sb, de, S_IFDIR);

        de = ext4_next_entry(de, blocksize);
        de->inode = cpu_to_le32(parent_ino);
        de->name_len = 2;
        if (!dotdot_real_len)
                /* ".." takes everything "." and the reserved tail left over */
                de->rec_len = ext4_rec_len_to_disk(blocksize -
                                        (csum_size + ext4_dir_rec_len(1, NULL)),
                                        blocksize);
        else
                de->rec_len = ext4_rec_len_to_disk(
                                        ext4_dir_rec_len(de->name_len, NULL),
                                        blocksize);
        memcpy(de->name, "..", 3);
        ext4_set_de_type(inode->i_sb, de, S_IFDIR);

        return ext4_next_entry(de, blocksize);
}

/*
 * Set up the contents of the freshly created directory @inode, whose
 * parent is @dir.
 *
 * If the inode may use inline data, an inline directory is attempted
 * first.  A result of 0 or any negative error other than -ENOSPC from
 * that attempt is returned as is.  Otherwise a first block is appended
 * and initialised with "." and "..", the link count is set to 2 and the
 * block is marked dirty.
 *
 * Returns 0 on success or a negative error code.
 */
int ext4_init_new_dir(handle_t *handle, struct inode *dir,
                             struct inode *inode)
{
        unsigned int blocksize = dir->i_sb->s_blocksize;
        struct buffer_head *dir_block = NULL;
        ext4_lblk_t block = 0;
        int tail_size;
        int err;

        tail_size = ext4_has_metadata_csum(dir->i_sb) ?
                        sizeof(struct ext4_dir_entry_tail) : 0;

        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                err = ext4_try_create_inline_dir(handle, dir, inode);
                /* Only -ENOSPC or a positive result falls back to a block. */
                if (err <= 0 && err != -ENOSPC)
                        goto out;
        }

        inode->i_size = 0;
        dir_block = ext4_append(handle, inode, &block);
        if (IS_ERR(dir_block))
                return PTR_ERR(dir_block);

        ext4_init_dot_dotdot(inode,
                             (struct ext4_dir_entry_2 *)dir_block->b_data,
                             blocksize, tail_size, dir->i_ino, 0);
        set_nlink(inode, 2);
        if (tail_size)
                ext4_initialize_dirent_tail(dir_block, blocksize);

        BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
        if (!err)
                set_buffer_verified(dir_block);
out:
        brelse(dir_block);
        return err;
}

/*
 * Create a new directory named by @dentry inside @dir.
 *
 * @idmap:  mount idmap, passed through to ext4_new_inode_start_handle()
 * @dir:    parent directory
 * @dentry: dentry of the directory being created; instantiated on success
 * @mode:   mode bits from the caller; S_IFDIR is OR-ed in here
 *
 * The sequence is: allocate the inode, write its initial contents with
 * ext4_init_new_dir(), add the name to @dir, bump the parent's link count
 * and dirty the parent.  The whole sequence is re-run from "retry" while the
 * result is -ENOSPC and ext4_should_retry_alloc() returns true.
 *
 * Returns 0 on success or a negative errno.
 */
static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                      struct dentry *dentry, umode_t mode)
{
        handle_t *handle;
        struct inode *inode;
        int err, err2 = 0, credits, retries = 0;

        /*
         * EXT4_DIR_LINK_MAX() is defined outside this view; presumably it is
         * true when @dir cannot take another subdirectory link.
         */
        if (EXT4_DIR_LINK_MAX(dir))
                return -EMLINK;

        err = dquot_initialize(dir);
        if (err)
                return err;

        credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
        inode = ext4_new_inode_start_handle(idmap, dir, S_IFDIR | mode,
                                            &dentry->d_name,
                                            0, NULL, EXT4_HT_DIR, credits);
        /*
         * Pick up the current journal handle (may be NULL, see out_stop) so
         * that every exit path below can stop it.
         */
        handle = ext4_journal_current_handle();
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_stop;

        inode->i_op = &ext4_dir_inode_operations;
        inode->i_fop = &ext4_dir_operations;
        err = ext4_init_new_dir(handle, dir, inode);
        if (err)
                goto out_clear_inode;
        err = ext4_mark_inode_dirty(handle, inode);
        if (!err)
                err = ext4_add_entry(handle, dentry, inode);
        if (err) {
                /*
                 * Shared failure path.  It is reached by falling into this
                 * branch and by the two "goto out_clear_inode" statements
                 * (one above, one below).  The new inode gets a zero link
                 * count and is put on the orphan list before it is released.
                 * An error from dirtying the inode here replaces the
                 * original error code.
                 */
out_clear_inode:
                clear_nlink(inode);
                ext4_orphan_add(handle, inode);
                unlock_new_inode(inode);
                err2 = ext4_mark_inode_dirty(handle, inode);
                if (unlikely(err2))
                        err = err2;
                ext4_journal_stop(handle);
                iput(inode);
                goto out_retry;
        }
        ext4_inc_count(dir);

        ext4_update_dx_flag(dir);
        err = ext4_mark_inode_dirty(handle, dir);
        /*
         * NOTE(review): when this fails the code jumps to out_clear_inode,
         * which does not visibly undo ext4_inc_count(dir) or remove the
         * entry added above -- confirm that this is intended.
         */
        if (err)
                goto out_clear_inode;
        d_instantiate_new(dentry, inode);
        ext4_fc_track_create(handle, dentry);
        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);

out_stop:
        if (handle)
                ext4_journal_stop(handle);
out_retry:
        if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
        return err;
}

/*
 * Check whether the specified directory is empty (used by rmdir).
 *
 * @inode: the directory to inspect
 *
 * A directory counts as empty when its first block starts with valid "."
 * and ".." entries and every following entry up to i_size has an inode
 * number of zero.
 *
 * Returns true if the directory is empty.  Returns false if a live entry is
 * found, and also if the directory is too small, a block cannot be read, or
 * an entry fails ext4_check_dir_entry().
 */
bool ext4_empty_dir(struct inode *inode)
{
        unsigned int offset;
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de;
        struct super_block *sb;

        if (ext4_has_inline_data(inode)) {
                int has_inline_data = 1;
                int ret;

                /*
                 * empty_inline_dir() receives a pointer to has_inline_data;
                 * its answer is used only if the flag is still set
                 * afterwards, otherwise the block-based scan below runs.
                 */
                ret = empty_inline_dir(inode, &has_inline_data);
                if (has_inline_data)
                        return ret;
        }

        sb = inode->i_sb;
        /* The directory must at least have room for "." and "..". */
        if (inode->i_size < ext4_dir_rec_len(1, NULL) +
                                        ext4_dir_rec_len(2, NULL)) {
                EXT4_ERROR_INODE(inode, "invalid size");
                return false;
        }
        /* The first directory block must not be a hole,
         * so treat it as DIRENT_HTREE
         */
        bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
        if (IS_ERR(bh))
                return false;

        /* First entry: must be "." and must refer to this directory. */
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
                                 0) ||
            le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
                ext4_warning_inode(inode, "directory missing '.'");
                brelse(bh);
                return false;
        }
        /* Second entry: must be ".." with a non-zero inode number. */
        offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
        de = ext4_next_entry(de, sb->s_blocksize);
        if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
                                 offset) ||
            le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
                ext4_warning_inode(inode, "directory missing '..'");
                brelse(bh);
                return false;
        }
        offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
        /* Walk the remaining entries; any entry in use means "not empty". */
        while (offset < inode->i_size) {
                /* offset sits on a block boundary: move to the next block */
                if (!(offset & (sb->s_blocksize - 1))) {
                        unsigned int lblock;
                        brelse(bh);
                        lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
                        bh = ext4_read_dirblock(inode, lblock, EITHER);
                        /*
                         * A NULL buffer (presumably a hole in the directory)
                         * is skipped one whole block at a time.
                         */
                        if (bh == NULL) {
                                offset += sb->s_blocksize;
                                continue;
                        }
                        if (IS_ERR(bh))
                                return false;
                }
                de = (struct ext4_dir_entry_2 *) (bh->b_data +
                                        (offset & (sb->s_blocksize - 1)));
                if (ext4_check_dir_entry(inode, NULL, de, bh,
                                         bh->b_data, bh->b_size, offset) ||
                    le32_to_cpu(de->inode)) {
                        brelse(bh);
                        return false;
                }
                offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
        }
        brelse(bh);
        return true;
}

/*
 * Remove the directory referred to by @dentry from @dir.
 *
 * @dir:    parent directory
 * @dentry: dentry of the directory to remove
 *
 * Returns 0 on success, -EIO after a forced shutdown, -ENOENT when no
 * entry for the name is found, -EFSCORRUPTED when the entry's inode number
 * does not match the dentry's inode, -ENOTEMPTY when ext4_empty_dir()
 * returns false, or the error of whichever helper failed.
 */
static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
{
        int retval;
        struct inode *inode;
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de;
        handle_t *handle = NULL;

        if (unlikely(ext4_forced_shutdown(dir->i_sb)))
                return -EIO;

        /* Initialize quotas before so that eventual writes go in
         * separate transaction */
        retval = dquot_initialize(dir);
        if (retval)
                return retval;
        retval = dquot_initialize(d_inode(dentry));
        if (retval)
                return retval;

        /*
         * From here on retval is pre-loaded with the error that the next
         * check reports if it fails.
         */
        retval = -ENOENT;
        bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
        if (IS_ERR(bh))
                return PTR_ERR(bh);
        if (!bh)
                goto end_rmdir;

        inode = d_inode(dentry);

        retval = -EFSCORRUPTED;
        if (le32_to_cpu(de->inode) != inode->i_ino)
                goto end_rmdir;

        retval = -ENOTEMPTY;
        if (!ext4_empty_dir(inode))
                goto end_rmdir;

        handle = ext4_journal_start(dir, EXT4_HT_DIR,
                                    EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
        if (IS_ERR(handle)) {
                retval = PTR_ERR(handle);
                /* keep end_rmdir from stopping an error pointer */
                handle = NULL;
                goto end_rmdir;
        }

        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);

        retval = ext4_delete_entry(handle, dir, de, bh);
        if (retval)
                goto end_rmdir;
        /* Only a warning: the removal proceeds regardless. */
        if (!EXT4_DIR_LINK_EMPTY(inode))
                ext4_warning_inode(inode,
                             "empty directory '%.*s' has too many links (%u)",
                             dentry->d_name.len, dentry->d_name.name,
                             inode->i_nlink);
        inode_inc_iversion(inode);
        clear_nlink(inode);
        /* There's no need to set i_disksize: the fact that i_nlink is
         * zero will ensure that the right thing happens during any
         * recovery. */
        inode->i_size = 0;
        ext4_orphan_add(handle, inode);
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        inode_set_ctime_current(inode);
        retval = ext4_mark_inode_dirty(handle, inode);
        if (retval)
                goto end_rmdir;
        /* The removed directory no longer contributes a link to its parent. */
        ext4_dec_count(dir);
        ext4_update_dx_flag(dir);
        ext4_fc_track_unlink(handle, dentry);
        retval = ext4_mark_inode_dirty(handle, dir);

#if IS_ENABLED(CONFIG_UNICODE)
        /* VFS negative dentries are incompatible with Encoding and
         * Case-insensitiveness. Eventually we'll want avoid
         * invalidating the dentries here, alongside with returning the
         * negative dentries at ext4_lookup(), when it is better
         * supported by the VFS for the CI case.
         */
        if (IS_CASEFOLDED(dir))
                d_invalidate(dentry);
#endif

end_rmdir:
        /* bh is NULL here when the name was not found */
        brelse(bh);
        if (handle)
                ext4_journal_stop(handle);
        return retval;
}

/*
 * Remove the name @d_name from @dir and drop one link from @inode.
 *
 * @dir:    directory containing the name
 * @d_name: name to remove
 * @inode:  inode the name is expected to refer to
 * @dentry: dentry for fast-commit tracking, or NULL (see parameter comment)
 *
 * If the directory entry refers to a different inode number, the call
 * fails with -ENOENT, except when the EXT4_FC_REPLAY bit is set in
 * s_mount_state: then the entry is left alone and only @inode's link count
 * is adjusted.
 *
 * Returns 0 on success or a negative errno.
 */
int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
                  struct inode *inode,
                  struct dentry *dentry /* NULL during fast_commit recovery */)
{
        int retval = -ENOENT;
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de;
        handle_t *handle;
        int skip_remove_dentry = 0;

        /*
         * Keep this outside the transaction; it may have to set up the
         * directory's encryption key, which isn't GFP_NOFS-safe.
         */
        bh = ext4_find_entry(dir, d_name, &de, NULL);
        if (IS_ERR(bh))
                return PTR_ERR(bh);

        if (!bh)
                return -ENOENT;

        if (le32_to_cpu(de->inode) != inode->i_ino) {
                /*
                 * It's okay if we don't find a dentry which matches
                 * the inode. That's because it might have gotten
                 * renamed to a different inode number
                 */
                if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                        skip_remove_dentry = 1;
                else
                        goto out_bh;
        }

        handle = ext4_journal_start(dir, EXT4_HT_DIR,
                                    EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
        if (IS_ERR(handle)) {
                retval = PTR_ERR(handle);
                goto out_bh;
        }

        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);

        if (!skip_remove_dentry) {
                retval = ext4_delete_entry(handle, dir, de, bh);
                if (retval)
                        goto out_handle;
                inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
                ext4_update_dx_flag(dir);
                retval = ext4_mark_inode_dirty(handle, dir);
                if (retval)
                        goto out_handle;
        } else {
                retval = 0;
        }
        /* An inode that already has no links is only warned about. */
        if (inode->i_nlink == 0)
                ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
                                   d_name->len, d_name->name);
        else
                drop_nlink(inode);
        /* Once the link count is zero the inode goes on the orphan list. */
        if (!inode->i_nlink)
                ext4_orphan_add(handle, inode);
        inode_set_ctime_current(inode);
        retval = ext4_mark_inode_dirty(handle, inode);
        if (dentry && !retval)
                ext4_fc_track_unlink(handle, dentry);
out_handle:
        ext4_journal_stop(handle);
out_bh:
        brelse(bh);
        return retval;
}

/*
 * Unlink entry point: wraps __ext4_unlink() with the shutdown check, the
 * enter/exit tracepoints and quota initialisation for both inodes.
 *
 * Returns -EIO after a forced shutdown, otherwise the first error from
 * dquot_initialize() or the result of __ext4_unlink().
 */
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
        int rc;

        if (unlikely(ext4_forced_shutdown(dir->i_sb)))
                return -EIO;

        trace_ext4_unlink_enter(dir, dentry);

        /*
         * Quotas are initialised up front so that any writes this causes
         * land in a separate transaction.
         */
        rc = dquot_initialize(dir);
        if (!rc)
                rc = dquot_initialize(d_inode(dentry));

        if (!rc) {
                rc = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry),
                                   dentry);
#if IS_ENABLED(CONFIG_UNICODE)
                /*
                 * VFS negative dentries do not work with encoding and
                 * case-insensitive lookups, so the dentry is invalidated
                 * here.  This can go away, together with returning negative
                 * dentries from ext4_lookup(), once the VFS handles the
                 * case-insensitive case better.
                 */
                if (IS_CASEFOLDED(dir))
                        d_invalidate(dentry);
#endif
        }

        trace_ext4_unlink_exit(dentry, rc);
        return rc;
}

/*
 * Store a symlink target in logical block 0 of @inode.
 *
 * @handle:    journal handle covering the modification
 * @inode:     the symlink inode
 * @disk_link: target bytes; ->len bytes are copied and the inode size is
 *             set to ->len - 1
 *
 * Returns 0 on success or the error from the failing helper.
 */
static int ext4_init_symlink_block(handle_t *handle, struct inode *inode,
                                   struct fscrypt_str *disk_link)
{
        struct buffer_head *blk_bh;
        int rc;

        blk_bh = ext4_bread(handle, inode, 0, EXT4_GET_BLOCKS_CREATE);
        if (IS_ERR(blk_bh))
                return PTR_ERR(blk_bh);

        BUFFER_TRACE(blk_bh, "get_write_access");
        rc = ext4_journal_get_write_access(handle, inode->i_sb, blk_bh,
                                           EXT4_JTR_NONE);
        if (!rc) {
                memcpy(blk_bh->b_data, disk_link->name, disk_link->len);
                inode->i_size = disk_link->len - 1;
                EXT4_I(inode)->i_disksize = inode->i_size;
                rc = ext4_handle_dirty_metadata(handle, inode, blk_bh);
        }

        brelse(blk_bh);
        return rc;
}

/*
 * Create a symbolic link named by @dentry in @dir pointing at @symname.
 *
 * @idmap:   mount idmap, passed through to ext4_new_inode_start_handle()
 * @dir:     parent directory
 * @dentry:  dentry of the new symlink
 * @symname: NUL-terminated link target
 *
 * Targets whose on-disk length exceeds EXT4_N_BLOCKS * 4 bytes are written
 * to a data block through ext4_init_symlink_block(); shorter ones are
 * copied straight into the inode's i_data area ("fast symlink").
 * The inode allocation and everything after it is retried while the result
 * is -ENOSPC and ext4_should_retry_alloc() returns true.
 *
 * Returns 0 on success or a negative errno.
 */
static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
                        struct dentry *dentry, const char *symname)
{
        handle_t *handle;
        struct inode *inode;
        int err, len = strlen(symname);
        int credits;
        struct fscrypt_str disk_link;
        int retries = 0;

        if (unlikely(ext4_forced_shutdown(dir->i_sb)))
                return -EIO;

        err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize,
                                      &disk_link);
        if (err)
                return err;

        err = dquot_initialize(dir);
        if (err)
                return err;

        /*
         * EXT4_INDEX_EXTRA_TRANS_BLOCKS for addition of entry into the
         * directory. +3 for inode, inode bitmap, group descriptor allocation.
         * EXT4_DATA_TRANS_BLOCKS for the data block allocation and
         * modification.
         */
        credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
retry:
        inode = ext4_new_inode_start_handle(idmap, dir, S_IFLNK|S_IRWXUGO,
                                            &dentry->d_name, 0, NULL,
                                            EXT4_HT_DIR, credits);
        handle = ext4_journal_current_handle();
        if (IS_ERR(inode)) {
                if (handle)
                        ext4_journal_stop(handle);
                err = PTR_ERR(inode);
                goto out_retry;
        }

        /* Pick the inode operations that match how the target is stored. */
        if (IS_ENCRYPTED(inode)) {
                err = fscrypt_encrypt_symlink(inode, symname, len, &disk_link);
                if (err)
                        goto err_drop_inode;
                inode->i_op = &ext4_encrypted_symlink_inode_operations;
        } else {
                if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
                        inode->i_op = &ext4_symlink_inode_operations;
                } else {
                        inode->i_op = &ext4_fast_symlink_inode_operations;
                        inode->i_link = (char *)&EXT4_I(inode)->i_data;
                }
        }

        if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
                /* alloc symlink block and fill it */
                err = ext4_init_symlink_block(handle, inode, &disk_link);
                if (err)
                        goto err_drop_inode;
        } else {
                /* clear the extent format for fast symlink */
                ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
                memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
                       disk_link.len);
                /*
                 * The stored size is one less than disk_link.len --
                 * presumably because len counts the trailing NUL; confirm
                 * against fscrypt_prepare_symlink().
                 */
                inode->i_size = disk_link.len - 1;
                EXT4_I(inode)->i_disksize = inode->i_size;
        }
        /* Success and failure of ext4_add_nondir() share this teardown. */
        err = ext4_add_nondir(handle, dentry, &inode);
        if (handle)
                ext4_journal_stop(handle);
        iput(inode);
        goto out_retry;

err_drop_inode:
        /*
         * Failure after the inode was allocated: give it a zero link count
         * and put it on the orphan list before releasing it.
         */
        clear_nlink(inode);
        ext4_mark_inode_dirty(handle, inode);
        ext4_orphan_add(handle, inode);
        unlock_new_inode(inode);
        if (handle)
                ext4_journal_stop(handle);
        iput(inode);
out_retry:
        if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
        /* Free the name only if it is not the caller's @symname buffer. */
        if (disk_link.name != (unsigned char *)symname)
                kfree(disk_link.name);
        return err;
}

/*
 * Add a hard link to @inode under the name @dentry in @dir.
 *
 * @dir:    directory receiving the new entry
 * @inode:  inode being linked; its link count and reference count are
 *          raised before the entry is added and dropped again on failure
 * @dentry: dentry for the new name; instantiated on success
 *
 * The operation is repeated while the result is -ENOSPC and
 * ext4_should_retry_alloc() returns true.
 *
 * Returns 0 on success or a negative errno.
 */
int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry)
{
        handle_t *handle;
        int attempts = 0;
        int credits;
        int err;

        do {
                credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
                          EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1;
                handle = ext4_journal_start(dir, EXT4_HT_DIR, credits);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);

                if (IS_DIRSYNC(dir))
                        ext4_handle_sync(handle);

                inode_set_ctime_current(inode);
                ext4_inc_count(inode);
                ihold(inode);

                err = ext4_add_entry(handle, dentry, inode);
                if (err) {
                        /* undo the link and reference taken above */
                        drop_nlink(inode);
                        iput(inode);
                } else {
                        err = ext4_mark_inode_dirty(handle, inode);
                        /*
                         * A link count of exactly one here only occurs when
                         * a tmpfile is linked for the first time.
                         */
                        if (inode->i_nlink == 1)
                                ext4_orphan_del(handle, inode);
                        d_instantiate(dentry, inode);
                        ext4_fc_track_link(handle, dentry);
                }
                ext4_journal_stop(handle);
        } while (err == -ENOSPC &&
                 ext4_should_retry_alloc(dir->i_sb, &attempts));

        return err;
}

/*
 * Hard-link entry point: validates the request, then calls __ext4_link().
 *
 * Returns -EMLINK when the source inode already has EXT4_LINK_MAX links,
 * -EXDEV when @dir has EXT4_INODE_PROJINHERIT set and its project id
 * differs from the source inode's, the error from fscrypt_prepare_link()
 * or dquot_initialize(), or else the result of __ext4_link().
 */
static int ext4_link(struct dentry *old_dentry,
                     struct inode *dir, struct dentry *dentry)
{
        struct inode *target = d_inode(old_dentry);
        int rc;

        if (target->i_nlink >= EXT4_LINK_MAX)
                return -EMLINK;

        rc = fscrypt_prepare_link(old_dentry, dir, dentry);
        if (rc)
                return rc;

        if (ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) {
                if (!projid_eq(EXT4_I(dir)->i_projid,
                               EXT4_I(old_dentry->d_inode)->i_projid))
                        return -EXDEV;
        }

        rc = dquot_initialize(dir);
        return rc ? rc : __ext4_link(dir, target, dentry);
}

/*
 * Find the buffer head that holds the ".." (parent) entry of a directory.
 * That is the inode block if the directory is inlined, or the first block
 * if it is a normal directory.
 *
 * @handle:    not used directly in this function
 * @inode:     the directory to inspect
 * @retval:    receives the error code when NULL is returned from the
 *             non-inline path; also passed to ext4_get_first_inline_block()
 * @parent_de: receives a pointer to the ".." entry
 * @inlined:   set to 1 when the inline-data path is taken; left untouched
 *             otherwise
 *
 * Returns the buffer head containing the entry, which the caller is
 * presumably expected to brelse(), or NULL on failure.
 */
static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
                                        struct inode *inode,
                                        int *retval,
                                        struct ext4_dir_entry_2 **parent_de,
                                        int *inlined)
{
        struct buffer_head *bh;

        if (!ext4_has_inline_data(inode)) {
                struct ext4_dir_entry_2 *de;
                unsigned int offset;

                /* The first directory block must not be a hole, so
                 * treat it as DIRENT_HTREE
                 */
                bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
                if (IS_ERR(bh)) {
                        *retval = PTR_ERR(bh);
                        return NULL;
                }

                /* First entry: must be "." and refer to @inode itself. */
                de = (struct ext4_dir_entry_2 *) bh->b_data;
                if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
                                         bh->b_size, 0) ||
                    le32_to_cpu(de->inode) != inode->i_ino ||
                    strcmp(".", de->name)) {
                        EXT4_ERROR_INODE(inode, "directory missing '.'");
                        brelse(bh);
                        *retval = -EFSCORRUPTED;
                        return NULL;
                }
                /* Second entry: must be ".." with a non-zero inode number. */
                offset = ext4_rec_len_from_disk(de->rec_len,
                                                inode->i_sb->s_blocksize);
                de = ext4_next_entry(de, inode->i_sb->s_blocksize);
                if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
                                         bh->b_size, offset) ||
                    le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
                        EXT4_ERROR_INODE(inode, "directory missing '..'");
                        brelse(bh);
                        *retval = -EFSCORRUPTED;
                        return NULL;
                }
                *parent_de = de;

                return bh;
        }

        *inlined = 1;
        return ext4_get_first_inline_block(inode, parent_de, retval);
}

/*
 * State for one side (source or destination) of a rename.  ext4_rename()
 * keeps one instance for the old name and one for the new name.
 */
struct ext4_renament {
        /* directory containing the name */
        struct inode *dir;
        /* dentry of the name being renamed from/to */
        struct dentry *dentry;
        /* inode the dentry refers to */
        struct inode *inode;
        /* set to true by ext4_rename_dir_prepare() */
        bool is_dir;
        /*
         * pending change to dir's link count, applied by
         * ext4_update_dir_count(): -1 decrements, any other non-zero
         * value increments, 0 means no change
         */
        int dir_nlink_delta;

        /* entry for "dentry" */
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de;
        /* filled in by ext4_find_entry(); non-zero skips dirblock dirtying in ext4_setent() */
        int inlined;

        /* entry for ".." in inode if it's a directory */
        struct buffer_head *dir_bh;
        struct ext4_dir_entry_2 *parent_de;
        /* set to 1 by ext4_get_first_dir_block() when inode has inline data */
        int dir_inlined;
};

/*
 * Prepare @ent for renaming a directory.
 *
 * Always marks @ent as a directory.  When @is_cross is true, it also looks
 * up the ".." entry of ent->inode, checks that it points at ent->dir and
 * takes journal write access to the buffer holding it.
 *
 * Returns 0 on success, -EFSCORRUPTED when ".." does not refer to ent->dir,
 * or the error from the lookup / write-access step.  ent->dir_bh may be left
 * set on failure.
 */
static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross)
{
        struct buffer_head *first_bh;
        int err;

        ent->is_dir = true;
        if (!is_cross)
                return 0;

        first_bh = ext4_get_first_dir_block(handle, ent->inode, &err,
                                            &ent->parent_de,
                                            &ent->dir_inlined);
        ent->dir_bh = first_bh;
        if (first_bh == NULL)
                return err;

        if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
                return -EFSCORRUPTED;

        BUFFER_TRACE(ent->dir_bh, "get_write_access");
        return ext4_journal_get_write_access(handle, ent->dir->i_sb,
                                             ent->dir_bh, EXT4_JTR_NONE);
}

/*
 * Point the ".." entry of a renamed directory at its new parent.
 *
 * @handle:  journal handle covering the modification
 * @ent:     rename state; nothing is done when ent->dir_bh is NULL
 * @dir_ino: inode number to store in the ".." entry
 *
 * The change is made durable by dirtying either the inode (inline
 * directory), the dx node (indexed directory) or the directory block.
 *
 * Returns 0 on success or the error from the dirtying helper, which is also
 * reported through ext4_std_error().
 */
static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
                                  unsigned dir_ino)
{
        int err;

        if (!ent->dir_bh)
                return 0;

        ent->parent_de->inode = cpu_to_le32(dir_ino);
        BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");

        if (ent->dir_inlined)
                err = ext4_mark_inode_dirty(handle, ent->inode);
        else if (is_dx(ent->inode))
                err = ext4_handle_dirty_dx_node(handle, ent->inode,
                                                ent->dir_bh);
        else
                err = ext4_handle_dirty_dirblock(handle, ent->inode,
                                                 ent->dir_bh);

        if (err) {
                ext4_std_error(ent->dir->i_sb, err);
        }
        return err;
}

/*
 * Rewrite the directory entry described by @ent so that it refers to inode
 * number @ino, updating the file type as well when the filetype feature is
 * enabled, and refresh the directory's version and timestamps.
 *
 * Returns the write-access error if that step fails.  Otherwise an error
 * from dirtying the directory block takes precedence (and is reported via
 * ext4_std_error()); failing that, the result of dirtying the directory
 * inode is returned.
 */
static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
                       unsigned ino, unsigned file_type)
{
        struct inode *parent = ent->dir;
        int err, blk_err;

        BUFFER_TRACE(ent->bh, "get write access");
        err = ext4_journal_get_write_access(handle, parent->i_sb, ent->bh,
                                            EXT4_JTR_NONE);
        if (err)
                return err;

        ent->de->inode = cpu_to_le32(ino);
        if (ext4_has_feature_filetype(parent->i_sb))
                ent->de->file_type = file_type;

        inode_inc_iversion(parent);
        inode_set_mtime_to_ts(parent, inode_set_ctime_current(parent));
        err = ext4_mark_inode_dirty(handle, parent);

        BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
        /* an inline entry has no separate directory block to dirty */
        if (ent->inlined)
                return err;

        blk_err = ext4_handle_dirty_dirblock(handle, parent, ent->bh);
        if (unlikely(blk_err)) {
                ext4_std_error(parent->i_sb, blk_err);
                return blk_err;
        }
        return err;
}

/*
 * Restore a directory entry to @ino / @file_type after a failed rename.
 *
 * The entry is looked up again by name on a private copy of @ent rather than
 * trusting ent->de: the entry may have moved while the directory was being
 * converted to an indexed one, so the cached pointer may no longer be valid.
 * A failed lookup is reported with ext4_std_error() and nothing is changed.
 */
static void ext4_resetent(handle_t *handle, struct ext4_renament *ent,
                          unsigned ino, unsigned file_type)
{
        struct ext4_renament fresh = *ent;
        int err;

        fresh.bh = ext4_find_entry(fresh.dir, &fresh.dentry->d_name,
                                   &fresh.de, &fresh.inlined);
        if (!fresh.bh)
                err = -ENOENT;
        else if (IS_ERR(fresh.bh))
                err = PTR_ERR(fresh.bh);
        else
                err = 0;

        if (err) {
                ext4_std_error(fresh.dir->i_sb, err);
                return;
        }

        ext4_setent(handle, &fresh, ino, file_type);
        brelse(fresh.bh);
}

/*
 * Look up @d_name in @dir and delete the entry that is found.
 *
 * Returns the lookup error if ext4_find_entry() fails, -ENOENT if the name
 * does not exist, otherwise the result of ext4_delete_entry().
 */
static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
                                  const struct qstr *d_name)
{
        struct ext4_dir_entry_2 *victim;
        struct buffer_head *bh;
        int err;

        bh = ext4_find_entry(dir, d_name, &victim, NULL);
        if (IS_ERR(bh))
                return PTR_ERR(bh);
        if (!bh)
                return -ENOENT;

        err = ext4_delete_entry(handle, dir, victim, bh);
        brelse(bh);
        return err;
}

/*
 * Delete the old directory entry of a rename.
 *
 * @handle:       journal handle covering the modification
 * @ent:          rename state for the old name
 * @force_reread: non-zero forces a fresh lookup by name
 *
 * Failures are only logged with ext4_warning_inode(); nothing is returned.
 */
static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
                               int force_reread)
{
        const struct qstr *name = &ent->dentry->d_name;
        bool stale;
        int err = 0;

        /*
         * ent->de may have moved during an htree split, and it may even point
         * at a stale entry in the unused part of ent->bh.  Comparing only the
         * inode number and the name is therefore not sufficient, so the name
         * length is checked as well before the cached entry is trusted.
         */
        stale = le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
                ent->de->name_len != name->len ||
                strncmp(ent->de->name, name->name, ent->de->name_len) ||
                force_reread;

        if (!stale)
                err = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);

        /* look the name up again if the cached entry was unusable or gone */
        if (stale || err == -ENOENT)
                err = ext4_find_delete_entry(handle, ent->dir, name);

        if (err) {
                ext4_warning_inode(ent->dir,
                                   "Deleting old file: nlink %d, error=%d",
                                   ent->dir->i_nlink, err);
        }
}

/*
 * Apply the pending link-count change recorded in ent->dir_nlink_delta to
 * ent->dir and dirty the directory inode.  A delta of -1 decrements the
 * count, any other non-zero delta increments it, and zero does nothing.
 * The result of dirtying the inode is not checked.
 */
static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
{
        if (!ent->dir_nlink_delta)
                return;

        if (ent->dir_nlink_delta == -1)
                ext4_dec_count(ent->dir);
        else
                ext4_inc_count(ent->dir);

        ext4_mark_inode_dirty(handle, ent->dir);
}

/*
 * Allocate the whiteout inode used by a rename.
 *
 * @idmap:   mount idmap, passed through to ext4_new_inode_start_handle()
 * @ent:     rename state of the entry the whiteout is created for
 * @credits: journal credits requested by the caller; extra credits for the
 *           inode allocation are added here
 * @h:       receives the journal handle on success; untouched on failure
 *
 * The allocation is repeated while it fails with -ENOSPC and
 * ext4_should_retry_alloc() returns true.
 *
 * Returns the new inode, set up as a character special inode with
 * WHITEOUT_DEV, or an ERR_PTR() value.
 */
static struct inode *ext4_whiteout_for_rename(struct mnt_idmap *idmap,
                                              struct ext4_renament *ent,
                                              int credits, handle_t **h)
{
        struct super_block *sb = ent->dir->i_sb;
        struct inode *wh;
        handle_t *handle;
        int attempts = 0;

        /*
         * for inode block, sb block, group summaries,
         * and inode bitmap
         */
        credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) +
                    EXT4_XATTR_TRANS_BLOCKS + 4);

        for (;;) {
                wh = ext4_new_inode_start_handle(idmap, ent->dir,
                                                 S_IFCHR | WHITEOUT_MODE,
                                                 &ent->dentry->d_name, 0, NULL,
                                                 EXT4_HT_DIR, credits);
                handle = ext4_journal_current_handle();
                if (!IS_ERR(wh))
                        break;

                if (handle)
                        ext4_journal_stop(handle);
                if (PTR_ERR(wh) != -ENOSPC ||
                    !ext4_should_retry_alloc(sb, &attempts))
                        return wh;
        }

        *h = handle;
        init_special_inode(wh, wh->i_mode, WHITEOUT_DEV);
        wh->i_op = &ext4_special_inode_operations;
        return wh;
}

/*
 * Anybody can rename anything with this: the permission checks are left to the
 * higher-level routines.
 *
 * n.b.  old_{dentry,inode} refers to the source dentry/inode
 * while new_{dentry,inode} refers to the destination dentry/inode
 * This comes from rename(const char *oldpath, const char *newpath)
 */
static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
                       struct dentry *old_dentry, struct inode *new_dir,
                       struct dentry *new_dentry, unsigned int flags)
{
        handle_t *handle = NULL;
        struct ext4_renament old = {
                .dir = old_dir,
                .dentry = old_dentry,
                .inode = d_inode(old_dentry),
        };
        struct ext4_renament new = {
                .dir = new_dir,
                .dentry = new_dentry,
                .inode = d_inode(new_dentry),
        };
        int force_reread;
        int retval;
        struct inode *whiteout = NULL;
        int credits;
        u8 old_file_type;

        if (new.inode && new.inode->i_nlink == 0) {
                EXT4_ERROR_INODE(new.inode,
                                 "target of rename is already freed");
                return -EFSCORRUPTED;
        }

        if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) &&
            (!projid_eq(EXT4_I(new_dir)->i_projid,
                        EXT4_I(old_dentry->d_inode)->i_projid)))
                return -EXDEV;

        retval = dquot_initialize(old.dir);
        if (retval)
                return retval;
        retval = dquot_initialize(old.inode);
        if (retval)
                return retval;
        retval = dquot_initialize(new.dir);
        if (retval)
                return retval;

        /* Initialize quotas before so that eventual writes go
         * in separate transaction */
        if (new.inode) {
                retval = dquot_initialize(new.inode);
                if (retval)
                        return retval;
        }

        old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de,
                                 &old.inlined);
        if (IS_ERR(old.bh))
                return PTR_ERR(old.bh);

        /*
         *  Check for inode number is _not_ due to possible IO errors.
         *  We might rmdir the source, keep it as pwd of some process
         *  and merrily kill the link to whatever was created under the
         *  same name. Goodbye sticky bit ;-<
         */
        retval = -ENOENT;
        if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
                goto release_bh;

        new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
                                 &new.de, &new.inlined);
        if (IS_ERR(new.bh)) {
                retval = PTR_ERR(new.bh);
                new.bh = NULL;
                goto release_bh;
        }
        if (new.bh) {
                if (!new.inode) {
                        brelse(new.bh);
                        new.bh = NULL;
                }
        }
        if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
                ext4_alloc_da_blocks(old.inode);

        credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
                   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
        if (!(flags & RENAME_WHITEOUT)) {
                handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
                if (IS_ERR(handle)) {
                        retval = PTR_ERR(handle);
                        goto release_bh;
                }
        } else {
                whiteout = ext4_whiteout_for_rename(idmap, &old, credits, &handle);
                if (IS_ERR(whiteout)) {
                        retval = PTR_ERR(whiteout);
                        goto release_bh;
                }
        }

        old_file_type = old.de->file_type;
        if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
                ext4_handle_sync(handle);

        if (S_ISDIR(old.inode->i_mode)) {
                if (new.inode) {
                        retval = -ENOTEMPTY;
                        if (!ext4_empty_dir(new.inode))
                                goto end_rename;
                } else {
                        retval = -EMLINK;
                        if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
                                goto end_rename;
                }
                retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir);
                if (retval)
                        goto end_rename;
        }
        /*
         * If we're renaming a file within an inline_data dir and adding or
         * setting the new dirent causes a conversion from inline_data to
         * extents/blockmap, we need to force the dirent delete code to
         * re-read the directory, or else we end up trying to delete a dirent
         * from what is now the extent tree root (or a block map).
         */
        force_reread = (new.dir->i_ino == old.dir->i_ino &&
                        ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));

        if (whiteout) {
                /*
                 * Do this before adding a new entry, so the old entry is sure
                 * to be still pointing to the valid old entry.
                 */
                retval = ext4_setent(handle, &old, whiteout->i_ino,
                                     EXT4_FT_CHRDEV);
                if (retval)
                        goto end_rename;
                retval = ext4_mark_inode_dirty(handle, whiteout);
                if (unlikely(retval))
                        goto end_rename;

        }
        if (!new.bh) {
                retval = ext4_add_entry(handle, new.dentry, old.inode);
                if (retval)
                        goto end_rename;
        } else {
                retval = ext4_setent(handle, &new,
                                     old.inode->i_ino, old_file_type);
                if (retval)
                        goto end_rename;
        }
        if (force_reread)
                force_reread = !ext4_test_inode_flag(new.dir,
                                                     EXT4_INODE_INLINE_DATA);

        /*
         * Like most other Unix systems, set the ctime for inodes on a
         * rename.
         */
        inode_set_ctime_current(old.inode);
        retval = ext4_mark_inode_dirty(handle, old.inode);
        if (unlikely(retval))
                goto end_rename;

        if (!whiteout) {
                /*
                 * ok, that's it
                 */
                ext4_rename_delete(handle, &old, force_reread);
        }

        if (new.inode) {
                ext4_dec_count(new.inode);
                inode_set_ctime_current(new.inode);
        }
        inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir));
        ext4_update_dx_flag(old.dir);
        if (old.is_dir) {
                retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
                if (retval)
                        goto end_rename;

                ext4_dec_count(old.dir);
                if (new.inode) {
                        /* checked ext4_empty_dir above, can't have another
                         * parent, ext4_dec_count() won't work for many-linked
                         * dirs */
                        clear_nlink(new.inode);
                } else {
                        ext4_inc_count(new.dir);
                        ext4_update_dx_flag(new.dir);
                        retval = ext4_mark_inode_dirty(handle, new.dir);
                        if (unlikely(retval))
                                goto end_rename;
                }
        }
        retval = ext4_mark_inode_dirty(handle, old.dir);
        if (unlikely(retval))
                goto end_rename;

        if (old.is_dir) {
                /*
                 * We disable fast commits here that's because the
                 * replay code is not yet capable of changing dot dot
                 * dirents in directories.
                 */
                ext4_fc_mark_ineligible(old.inode->i_sb,
                        EXT4_FC_REASON_RENAME_DIR, handle);
        } else {
                struct super_block *sb = old.inode->i_sb;

                if (new.inode)
                        ext4_fc_track_unlink(handle, new.dentry);
                if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
                    !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
                    !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))) {
                        __ext4_fc_track_link(handle, old.inode, new.dentry);
                        __ext4_fc_track_unlink(handle, old.inode, old.dentry);
                        if (whiteout)
                                __ext4_fc_track_create(handle, whiteout,
                                                       old.dentry);
                }
        }

        if (new.inode) {
                retval = ext4_mark_inode_dirty(handle, new.inode);
                if (unlikely(retval))
                        goto end_rename;
                if (!new.inode->i_nlink)
                        ext4_orphan_add(handle, new.inode);
        }
        retval = 0;

end_rename:
        if (whiteout) {
                if (retval) {
                        ext4_resetent(handle, &old,
                                      old.inode->i_ino, old_file_type);
                        drop_nlink(whiteout);
                        ext4_mark_inode_dirty(handle, whiteout);
                        ext4_orphan_add(handle, whiteout);
                }
                unlock_new_inode(whiteout);
                ext4_journal_stop(handle);
                iput(whiteout);
        } else {
                ext4_journal_stop(handle);
        }
release_bh:
        brelse(old.dir_bh);
        brelse(old.bh);
        brelse(new.bh);

        return retval;
}

/*
 * Exchange two existing directory entries (called from ext4_rename2() for
 * RENAME_EXCHANGE): the entry for @old_dentry ends up referring to
 * @new_dentry's inode and vice versa.
 *
 * Both entries must exist and still match their dentries' inodes, otherwise
 * -ENOENT is returned.  Returns 0 on success or a negative errno.
 */
static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry)
{
        handle_t *handle = NULL;
        struct ext4_renament old = {
                .dir = old_dir,
                .dentry = old_dentry,
                .inode = d_inode(old_dentry),
        };
        struct ext4_renament new = {
                .dir = new_dir,
                .dentry = new_dentry,
                .inode = d_inode(new_dentry),
        };
        u8 new_file_type;
        int retval;

        /*
         * Each inode moves into the other's directory, so the project-id
         * inheritance check is applied in both directions.
         */
        if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
             !projid_eq(EXT4_I(new_dir)->i_projid,
                        EXT4_I(old_dentry->d_inode)->i_projid)) ||
            (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) &&
             !projid_eq(EXT4_I(old_dir)->i_projid,
                        EXT4_I(new_dentry->d_inode)->i_projid)))
                return -EXDEV;

        retval = dquot_initialize(old.dir);
        if (retval)
                return retval;
        retval = dquot_initialize(new.dir);
        if (retval)
                return retval;

        old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
                                 &old.de, &old.inlined);
        if (IS_ERR(old.bh))
                return PTR_ERR(old.bh);
        /*
         *  Check for inode number is _not_ due to possible IO errors.
         *  We might rmdir the source, keep it as pwd of some process
         *  and merrily kill the link to whatever was created under the
         *  same name. Goodbye sticky bit ;-<
         */
        retval = -ENOENT;
        if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
                goto end_rename;

        new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
                                 &new.de, &new.inlined);
        if (IS_ERR(new.bh)) {
                retval = PTR_ERR(new.bh);
                /* Keep the brelse() at end_rename away from the ERR_PTR. */
                new.bh = NULL;
                goto end_rename;
        }

        /* RENAME_EXCHANGE case: old *and* new must both exist */
        /* retval is still -ENOENT from above. */
        if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
                goto end_rename;

        handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
                (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
                 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
        if (IS_ERR(handle)) {
                retval = PTR_ERR(handle);
                /* end_rename only stops a non-NULL handle. */
                handle = NULL;
                goto end_rename;
        }

        if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
                ext4_handle_sync(handle);

        if (S_ISDIR(old.inode->i_mode)) {
                retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir);
                if (retval)
                        goto end_rename;
        }
        if (S_ISDIR(new.inode->i_mode)) {
                retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir);
                if (retval)
                        goto end_rename;
        }

        /*
         * Other than the special case of overwriting a directory, parents'
         * nlink only needs to be modified if this is a cross directory rename.
         */
        if (old.dir != new.dir && old.is_dir != new.is_dir) {
                /* The parent losing a subdirectory gets -1, the other +1. */
                old.dir_nlink_delta = old.is_dir ? -1 : 1;
                new.dir_nlink_delta = -old.dir_nlink_delta;
                retval = -EMLINK;
                if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) ||
                    (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir)))
                        goto end_rename;
        }

        /* Saved first: the new dirent is overwritten before the old one. */
        new_file_type = new.de->file_type;
        retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type);
        if (retval)
                goto end_rename;

        retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type);
        if (retval)
                goto end_rename;

        /*
         * Like most other Unix systems, set the ctime for inodes on a
         * rename.
         */
        inode_set_ctime_current(old.inode);
        inode_set_ctime_current(new.inode);
        retval = ext4_mark_inode_dirty(handle, old.inode);
        if (unlikely(retval))
                goto end_rename;
        retval = ext4_mark_inode_dirty(handle, new.inode);
        if (unlikely(retval))
                goto end_rename;
        ext4_fc_mark_ineligible(new.inode->i_sb,
                                EXT4_FC_REASON_CROSS_RENAME, handle);
        /* Each side is finished with the other side's directory as target. */
        if (old.dir_bh) {
                retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
                if (retval)
                        goto end_rename;
        }
        if (new.dir_bh) {
                retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino);
                if (retval)
                        goto end_rename;
        }
        ext4_update_dir_count(handle, &old);
        ext4_update_dir_count(handle, &new);
        retval = 0;

end_rename:
        /* Common exit: release all buffers and stop the handle if one runs. */
        brelse(old.dir_bh);
        brelse(new.dir_bh);
        brelse(old.bh);
        brelse(new.bh);
        if (handle)
                ext4_journal_stop(handle);
        return retval;
}

/*
 * ->rename entry point for ext4 directories.
 *
 * Rejects the call with -EIO on a shut-down filesystem and with -EINVAL for
 * any flag other than RENAME_NOREPLACE, RENAME_EXCHANGE and RENAME_WHITEOUT,
 * runs the fscrypt rename preparation, and then dispatches to
 * ext4_cross_rename() for RENAME_EXCHANGE or to ext4_rename() otherwise.
 *
 * Returns 0 on success or a negative errno.
 */
static int ext4_rename2(struct mnt_idmap *idmap,
                        struct inode *old_dir, struct dentry *old_dentry,
                        struct inode *new_dir, struct dentry *new_dentry,
                        unsigned int flags)
{
        const unsigned int supported = RENAME_NOREPLACE | RENAME_EXCHANGE |
                                       RENAME_WHITEOUT;
        int err;

        if (unlikely(ext4_forced_shutdown(old_dir->i_sb)))
                return -EIO;

        if (flags & ~supported)
                return -EINVAL;

        err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry,
                                     flags);
        if (err)
                return err;

        if (!(flags & RENAME_EXCHANGE))
                return ext4_rename(idmap, old_dir, old_dentry,
                                   new_dir, new_dentry, flags);

        return ext4_cross_rename(old_dir, old_dentry, new_dir, new_dentry);
}

/*
 * directories can handle most operations...
 */
const struct inode_operations ext4_dir_inode_operations = {
        /* name-space operations */
        .create         = ext4_create,
        .lookup         = ext4_lookup,
        .link           = ext4_link,
        .unlink         = ext4_unlink,
        .symlink        = ext4_symlink,
        .mkdir          = ext4_mkdir,
        .rmdir          = ext4_rmdir,
        .mknod          = ext4_mknod,
        .tmpfile        = ext4_tmpfile,
        .rename         = ext4_rename2,

        /* attributes, xattrs and ACLs */
        .setattr        = ext4_setattr,
        .getattr        = ext4_getattr,
        .listxattr      = ext4_listxattr,
        .get_inode_acl  = ext4_get_acl,
        .set_acl        = ext4_set_acl,

        /* extent mapping and file attributes */
        .fiemap         = ext4_fiemap,
        .fileattr_get   = ext4_fileattr_get,
        .fileattr_set   = ext4_fileattr_set,
};

/*
 * Inode operations for special inodes; ext4_whiteout_for_rename() installs
 * this table on the whiteout inodes it creates.  Only attribute, xattr
 * listing and ACL handling are provided.
 */
const struct inode_operations ext4_special_inode_operations = {
        .setattr        = ext4_setattr,
        .getattr        = ext4_getattr,
        .listxattr      = ext4_listxattr,
        .get_inode_acl  = ext4_get_acl,
        .set_acl        = ext4_set_acl,
};







   53 




1
2
3
4
5
6
7
8
9
10
11
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/processor.h>

/*
 * Return non-zero when @addr is an acceptable physical address.
 *
 * With CONFIG_PHYS_ADDR_T_64BIT the address is accepted only if no bit at or
 * above boot_cpu_data.x86_phys_bits is set; without it every address is
 * accepted.
 */
static inline int phys_addr_valid(resource_size_t addr)
{
#ifndef CONFIG_PHYS_ADDR_T_64BIT
        return 1;
#else
        return (addr >> boot_cpu_data.x86_phys_bits) == 0;
#endif
}





































































































































































    4 
    4 



























    4 






































    4 

































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>

#include <asm/ucontext.h>
#include <asm/fpu/signal.h>
#include <asm/sighandling.h>

#include <asm/syscall.h>
#include <asm/sigframe.h>
#include <asm/signal.h>

/*
 * If regs->ss will cause an IRET fault, change it.  Otherwise leave it
 * alone.  Using this generally makes no sense unless
 * user_64bit_mode(regs) would return true.
 *
 * The access rights of the selector in regs->ss are read with LAR; if they
 * do not describe a present, DPL 3, read-write data segment, regs->ss is
 * replaced with __USER_DS.
 */
static void force_valid_ss(struct pt_regs *regs)
{
        u32 ar;
        /*
         * LAR sets ZF when it could load the access rights; when it could
         * not, the jump is not taken and ar is zeroed so the check below
         * fails.
         */
        asm volatile ("lar %[old_ss], %[ar]\n\t"
                      "jz 1f\n\t"                /* If invalid: */
                      "xorl %[ar], %[ar]\n\t"        /* set ar = 0 */
                      "1:"
                      : [ar] "=r" (ar)
                      : [old_ss] "rm" ((u16)regs->ss));

        /*
         * For a valid 64-bit user context, we need DPL 3, type
         * read-write data or read-write exp-down data, and S and P
         * set.  We can't use VERW because VERW doesn't check the
         * P bit.
         */
        ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
        if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
            ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
                regs->ss = __USER_DS;
}

/* Copy the register named @reg from the local sigcontext copy into regs. */
#define SC_TO_REGS(reg)        (regs->reg = sc.reg)

/*
 * Reload the user register state saved in the sigcontext at @usc into @regs
 * and restore the FPU state it points to.
 *
 * @uc_flags are the uc_flags read from the signal frame; unless
 * UC_STRICT_RESTORE_SS is set, an SS value that is not usable in 64-bit
 * mode is replaced.
 *
 * Returns false if the sigcontext cannot be read from user memory,
 * otherwise the result of fpu__restore_sig().
 */
static bool restore_sigcontext(struct pt_regs *regs,
                               struct sigcontext __user *usc,
                               unsigned long uc_flags)
{
        const bool lenient_ss = !(uc_flags & UC_STRICT_RESTORE_SS);
        struct sigcontext sc;

        /* Always make any pending restarted system calls return -EINTR */
        current->restart_block.fn = do_no_restart_syscall;

        /* Only the part of the structure in front of reserved1 is fetched. */
        if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1)) != 0)
                return false;

        /* General purpose registers, stack pointer and instruction pointer. */
        SC_TO_REGS(bx);
        SC_TO_REGS(cx);
        SC_TO_REGS(dx);
        SC_TO_REGS(si);
        SC_TO_REGS(di);
        SC_TO_REGS(bp);
        SC_TO_REGS(ax);
        SC_TO_REGS(sp);
        SC_TO_REGS(ip);
        SC_TO_REGS(r8);
        SC_TO_REGS(r9);
        SC_TO_REGS(r10);
        SC_TO_REGS(r11);
        SC_TO_REGS(r12);
        SC_TO_REGS(r13);
        SC_TO_REGS(r14);
        SC_TO_REGS(r15);

        /* Get CS/SS and force CPL3 */
        regs->cs = sc.cs | 0x03;
        regs->ss = sc.ss | 0x03;

        /* Only the FIX_EFLAGS bits are taken from the user-supplied flags. */
        regs->flags = (sc.flags & FIX_EFLAGS) | (regs->flags & ~FIX_EFLAGS);

        /* disable syscall checks */
        regs->orig_ax = -1;

        /*
         * Fix up SS if needed for the benefit of old DOSEMU and
         * CRIU.
         */
        if (unlikely(lenient_ss && user_64bit_mode(regs)))
                force_valid_ss(regs);

        return fpu__restore_sig((void __user *)sc.fpstate, 0);
}

#undef SC_TO_REGS

/*
 * Write the register state in @regs, the current thread's trap number,
 * error code and cr2, the FPU state pointer @fpstate and the signal mask
 * word @mask into the user sigcontext @sc.
 *
 * Every store uses unsafe_put_user(); the callers in this file invoke it
 * (through unsafe_put_sigcontext()) between user_access_begin() and
 * user_access_end().
 *
 * Returns 0 on success, -EFAULT if any store faults.
 */
static __always_inline int
__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
                     struct pt_regs *regs, unsigned long mask)
{
        unsafe_put_user(regs->di, &sc->di, Efault);
        unsafe_put_user(regs->si, &sc->si, Efault);
        unsafe_put_user(regs->bp, &sc->bp, Efault);
        unsafe_put_user(regs->sp, &sc->sp, Efault);
        unsafe_put_user(regs->bx, &sc->bx, Efault);
        unsafe_put_user(regs->dx, &sc->dx, Efault);
        unsafe_put_user(regs->cx, &sc->cx, Efault);
        unsafe_put_user(regs->ax, &sc->ax, Efault);
        unsafe_put_user(regs->r8, &sc->r8, Efault);
        unsafe_put_user(regs->r9, &sc->r9, Efault);
        unsafe_put_user(regs->r10, &sc->r10, Efault);
        unsafe_put_user(regs->r11, &sc->r11, Efault);
        unsafe_put_user(regs->r12, &sc->r12, Efault);
        unsafe_put_user(regs->r13, &sc->r13, Efault);
        unsafe_put_user(regs->r14, &sc->r14, Efault);
        unsafe_put_user(regs->r15, &sc->r15, Efault);

        /* Fault details of the current thread, then ip/flags/segments. */
        unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault);
        unsafe_put_user(current->thread.error_code, &sc->err, Efault);
        unsafe_put_user(regs->ip, &sc->ip, Efault);
        unsafe_put_user(regs->flags, &sc->flags, Efault);
        unsafe_put_user(regs->cs, &sc->cs, Efault);
        /* The gs and fs slots are always written as zero. */
        unsafe_put_user(0, &sc->gs, Efault);
        unsafe_put_user(0, &sc->fs, Efault);
        unsafe_put_user(regs->ss, &sc->ss, Efault);

        unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault);

        /* non-iBCS2 extensions.. */
        unsafe_put_user(mask, &sc->oldmask, Efault);
        unsafe_put_user(current->thread.cr2, &sc->cr2, Efault);
        return 0;
Efault:
        /* Target of every unsafe_put_user() above when the store faults. */
        return -EFAULT;
}

/*
 * Fill the user sigcontext @sc from @regs, FPU state pointer @fp and the
 * first word of the signal set @set; jumps to @label if any store fails.
 *
 * The expansion deliberately has no trailing semicolon so that the macro
 * behaves like a single statement (safe in an unbraced if/else), and @set
 * is parenthesized so that any pointer expression may be passed.
 */
#define unsafe_put_sigcontext(sc, fp, regs, set, label)                        \
do {                                                                        \
        if (__unsafe_setup_sigcontext(sc, fp, regs, (set)->sig[0]))        \
                goto label;                                                \
} while (0)

/*
 * Store the first 64 bits of the signal set @set into @frame's
 * uc.uc_sigmask; jumps to @label if the store fails.
 */
#define unsafe_put_sigmask(set, frame, label) \
        unsafe_put_user(*(__u64 *)(set), \
                        (__u64 __user *)&(frame)->uc.uc_sigmask, \
                        label)

/*
 * Compute the uc_flags value stored in a new signal frame.
 *
 * UC_SIGCONTEXT_SS is always set; UC_FP_XSTATE is added when the CPU has
 * XSAVE, and UC_STRICT_RESTORE_SS when @regs describes 64-bit user mode.
 */
static unsigned long frame_uc_flags(struct pt_regs *regs)
{
        unsigned long flags = UC_SIGCONTEXT_SS;

        if (boot_cpu_has(X86_FEATURE_XSAVE))
                flags |= UC_FP_XSTATE;

        if (likely(user_64bit_mode(regs)))
                flags |= UC_STRICT_RESTORE_SS;

        return flags;
}

/*
 * Build a 64-bit rt signal frame on the user stack for @ksig and point
 * @regs at the signal handler.
 *
 * The frame receives the ucontext (flags, altstack, sigcontext, signal
 * mask), the restorer address as return address and, for SA_SIGINFO
 * handlers, the siginfo.
 *
 * Returns 0 on success, -EFAULT if SA_RESTORER is missing, if the frame
 * cannot be written, or if setup_signal_shadow_stack() fails.
 */
int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
        sigset_t *set = sigmask_to_save();
        struct rt_sigframe __user *frame;
        void __user *fp = NULL;
        unsigned long uc_flags;

        /* x86-64 should always use SA_RESTORER. */
        if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
                return -EFAULT;

        frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp);
        uc_flags = frame_uc_flags(regs);

        /* Everything up to user_access_end() uses the unsafe_* accessors. */
        if (!user_access_begin(frame, sizeof(*frame)))
                return -EFAULT;

        /* Create the ucontext.  */
        unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
        unsafe_put_user(0, &frame->uc.uc_link, Efault);
        unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);

        /* Set up to return from userspace.  If provided, use a stub
           already in userspace.  */
        unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault);
        unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
        unsafe_put_sigmask(set, frame, Efault);
        user_access_end();

        if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
                if (copy_siginfo_to_user(&frame->info, &ksig->info))
                        return -EFAULT;
        }

        if (setup_signal_shadow_stack(ksig))
                return -EFAULT;

        /* Set up registers for signal handler */
        regs->di = ksig->sig;
        /* In case the signal handler was declared without prototypes */
        regs->ax = 0;

        /* This also works for non SA_SIGINFO handlers because they expect the
           next argument after the signal number on the stack. */
        regs->si = (unsigned long)&frame->info;
        regs->dx = (unsigned long)&frame->uc;
        regs->ip = (unsigned long) ksig->ka.sa.sa_handler;

        regs->sp = (unsigned long)frame;

        /*
         * Set up the CS and SS registers to run signal handlers in
         * 64-bit mode, even if the handler happens to be interrupting
         * 32-bit or 16-bit code.
         *
         * SS is subtle.  In 64-bit mode, we don't need any particular
         * SS descriptor, but we do need SS to be valid.  It's possible
         * that the old SS is entirely bogus -- this can happen if the
         * signal we're trying to deliver is #GP or #SS caused by a bad
         * SS value.  We also have a compatibility issue here: DOSEMU
         * relies on the contents of the SS register indicating the
         * SS value at the time of the signal, even though that code in
         * DOSEMU predates sigreturn's ability to restore SS.  (DOSEMU
         * avoids relying on sigreturn to restore SS; instead it uses
         * a trampoline.)  So we do our best: if the old SS was valid,
         * we keep it.  Otherwise we replace it.
         */
        regs->cs = __USER_CS;

        if (unlikely(regs->ss != __USER_DS))
                force_valid_ss(regs);

        return 0;

Efault:
        /* Fault inside the user-access region: close it before returning. */
        user_access_end();
        return -EFAULT;
}

/*
 * rt_sigreturn: return from a 64-bit signal handler by unwinding the signal
 * frame found just below the user stack pointer.
 *
 * The blocked signal mask, the register and FPU state, the shadow stack and
 * the alternate stack settings are restored from the frame.  On success the
 * restored ax is returned so that it survives the syscall return path; on any
 * failure signal_fault() is invoked and 0 is returned.
 */
SYSCALL_DEFINE0(rt_sigreturn)
{
        struct pt_regs *regs = current_pt_regs();
        struct rt_sigframe __user *frame;
        unsigned long uc_flags;
        sigset_t set;

        frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));

        /* Validate the frame, then fetch the saved mask and uc_flags. */
        if (!access_ok(frame, sizeof(*frame)) ||
            __get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask) ||
            __get_user(uc_flags, &frame->uc.uc_flags))
                goto badframe;

        set_current_blocked(&set);

        /* Registers first, then shadow stack, then the alternate stack. */
        if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags) ||
            restore_signal_shadow_stack() ||
            restore_altstack(&frame->uc.uc_stack))
                goto badframe;

        return regs->ax;

badframe:
        signal_fault(regs, frame, "rt_sigreturn");
        return 0;
}

#ifdef CONFIG_X86_X32_ABI
/*
 * Convert @from to the compat siginfo layout and copy it to user space at
 * @to.  For SIGCHLD the utime/stime values are additionally stored through
 * the x32-specific _sigchld_x32 member.
 *
 * Returns 0 on success or -EFAULT if the copy to user space fails.
 */
static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to,
                const struct kernel_siginfo *from)
{
        struct compat_siginfo info32;

        copy_siginfo_to_external32(&info32, from);

        if (from->si_signo == SIGCHLD) {
                info32._sifields._sigchld_x32._utime = from->si_utime;
                info32._sifields._sigchld_x32._stime = from->si_stime;
        }

        return copy_to_user(to, &info32, sizeof(info32)) ? -EFAULT : 0;
}

/*
 * Copy @from to the compat siginfo at @to, using the x32 variant when the
 * current syscall is an x32 one and the generic compat copy otherwise.
 */
int copy_siginfo_to_user32(struct compat_siginfo __user *to,
                           const struct kernel_siginfo *from)
{
        if (!in_x32_syscall())
                return __copy_siginfo_to_user32(to, from);

        return x32_copy_siginfo_to_user(to, from);
}

/*
 * Build an x32 rt signal frame on the user stack for @ksig and point @regs
 * at the signal handler.
 *
 * Returns 0 on success, -EFAULT if SA_RESTORER is missing, if
 * setup_signal_shadow_stack() fails, or if the frame cannot be written.
 */
int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
        compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save();
        struct rt_sigframe_x32 __user *frame;
        unsigned long uc_flags;
        void __user *restorer;
        void __user *fp = NULL;

        /* As in the 64-bit case, a restorer is mandatory. */
        if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
                return -EFAULT;

        frame = get_sigframe(ksig, regs, sizeof(*frame), &fp);

        uc_flags = frame_uc_flags(regs);

        if (setup_signal_shadow_stack(ksig))
                return -EFAULT;

        /* Everything up to user_access_end() uses the unsafe_* accessors. */
        if (!user_access_begin(frame, sizeof(*frame)))
                return -EFAULT;

        /* Create the ucontext.  */
        unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
        unsafe_put_user(0, &frame->uc.uc_link, Efault);
        unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
        unsafe_put_user(0, &frame->uc.uc__pad0, Efault);
        /* The restorer address is stored as the handler's return address. */
        restorer = ksig->ka.sa.sa_restorer;
        unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault);
        unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
        unsafe_put_sigmask(set, frame, Efault);
        user_access_end();

        if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
                if (x32_copy_siginfo_to_user(&frame->info, &ksig->info))
                        return -EFAULT;
        }

        /* Set up registers for signal handler */
        regs->sp = (unsigned long) frame;
        regs->ip = (unsigned long) ksig->ka.sa.sa_handler;

        /* We use the x32 calling convention here... */
        regs->di = ksig->sig;
        regs->si = (unsigned long) &frame->info;
        regs->dx = (unsigned long) &frame->uc;

        loadsegment(ds, __USER_DS);
        loadsegment(es, __USER_DS);

        /* Unlike the 64-bit path, SS is unconditionally set to __USER_DS. */
        regs->cs = __USER_CS;
        regs->ss = __USER_DS;

        return 0;

Efault:
        /* Fault inside the user-access region: close it before returning. */
        user_access_end();
        return -EFAULT;
}

/*
 * x32 rt_sigreturn: return from an x32 signal handler by unwinding the
 * rt_sigframe_x32 found 8 bytes below the user stack pointer.
 *
 * Restores the blocked signal mask, the register and FPU state, the shadow
 * stack and the compat alternate stack settings.  Returns the restored ax on
 * success; on any failure signal_fault() is invoked and 0 is returned.
 */
COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
{
        struct pt_regs *regs = current_pt_regs();
        struct rt_sigframe_x32 __user *frame;
        unsigned long uc_flags;
        sigset_t set;

        frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);

        /* Validate the frame, then fetch the saved mask and uc_flags. */
        if (!access_ok(frame, sizeof(*frame)) ||
            __get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask) ||
            __get_user(uc_flags, &frame->uc.uc_flags))
                goto badframe;

        set_current_blocked(&set);

        /* Registers first, then shadow stack, then the alternate stack. */
        if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags) ||
            restore_signal_shadow_stack() ||
            compat_restore_altstack(&frame->uc.uc_stack))
                goto badframe;

        return regs->ax;

badframe:
        signal_fault(regs, frame, "x32 rt_sigreturn");
        return 0;
}
#endif /* CONFIG_X86_X32_ABI */

#ifdef CONFIG_COMPAT
/*
 * Tag a new signal action with the ABI of the syscall that installs it:
 * SA_IA32_ABI when in an ia32 syscall, SA_X32_ABI when in an x32 syscall.
 * Nothing is done when @act is NULL; @oact is not used.
 */
void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
{
        if (act) {
                if (in_ia32_syscall())
                        act->sa.sa_flags |= SA_IA32_ABI;
                if (in_x32_syscall())
                        act->sa.sa_flags |= SA_X32_ABI;
        }
}
#endif /* CONFIG_COMPAT */

/*
 * If adding a new si_code, there is probably new data in
 * the siginfo.  Make sure folks bumping the si_code
 * limits also have to look at this code.  Make sure any
 * new fields are handled in copy_siginfo_to_user32()!
 *
 * Each assertion below pins one NSIG* limit to its current value, so
 * changing a limit fails the build until this file has been revisited.
 */
static_assert(NSIGILL  == 11);
static_assert(NSIGFPE  == 15);
static_assert(NSIGSEGV == 10);
static_assert(NSIGBUS  == 5);
static_assert(NSIGTRAP == 6);
static_assert(NSIGCHLD == 6);
static_assert(NSIGSYS  == 2);

/* This is part of the ABI and can never change in size: */
static_assert(sizeof(siginfo_t) == 128);

/* This is a part of the ABI and can never change in alignment */
static_assert(__alignof__(siginfo_t) == 8);

/*
 * The offsets of all the (unioned) si_fields are fixed
 * in the ABI, of course.  Make sure none of them ever
 * move and are always at the beginning:
 */
static_assert(offsetof(siginfo_t, si_signo) == 0);
static_assert(offsetof(siginfo_t, si_errno) == 4);
static_assert(offsetof(siginfo_t, si_code)  == 8);

/*
 * Ensure that the size of each si_field never changes.
 * If it does, it is a sign that the
 * copy_siginfo_to_user32() code below needs to be updated
 * along with the size in the CHECK_SI_SIZE().
 *
 * We repeat this check for both the generic and compat
 * siginfos.
 *
 * Note: it is OK for these to grow as long as the whole
 * structure stays within the padding size (checked
 * above).
 */

/*
 * CHECK_SI_OFFSET(name): build-time check that _sifields.name starts at the
 * same offset as _sifields itself within siginfo_t.
 * CHECK_SI_SIZE(name, size): build-time check that _sifields.name occupies
 * exactly @size bytes.
 */
#define CHECK_SI_OFFSET(name)                                                \
        static_assert(offsetof(siginfo_t, _sifields) ==                 \
                      offsetof(siginfo_t, _sifields.name))
#define CHECK_SI_SIZE(name, size)                                        \
        static_assert(sizeof_field(siginfo_t, _sifields.name) == size)

/* _kill: 2 ints; si_pid and si_uid at 0x10 and 0x14. */
CHECK_SI_OFFSET(_kill);
CHECK_SI_SIZE  (_kill, 2*sizeof(int));
static_assert(offsetof(siginfo_t, si_pid) == 0x10);
static_assert(offsetof(siginfo_t, si_uid) == 0x14);

/* _timer: 6 ints; si_tid, si_overrun and si_value at 0x10, 0x14 and 0x18. */
CHECK_SI_OFFSET(_timer);
CHECK_SI_SIZE  (_timer, 6*sizeof(int));
static_assert(offsetof(siginfo_t, si_tid)     == 0x10);
static_assert(offsetof(siginfo_t, si_overrun) == 0x14);
static_assert(offsetof(siginfo_t, si_value)   == 0x18);

/* _rt: 4 ints; the si_pid/si_uid/si_value offsets are re-checked here. */
CHECK_SI_OFFSET(_rt);
CHECK_SI_SIZE  (_rt, 4*sizeof(int));
static_assert(offsetof(siginfo_t, si_pid)   == 0x10);
static_assert(offsetof(siginfo_t, si_uid)   == 0x14);
static_assert(offsetof(siginfo_t, si_value) == 0x18);

/* _sigchld: 8 ints; si_status at 0x18, si_utime and si_stime at 0x20 and 0x28. */
CHECK_SI_OFFSET(_sigchld);
CHECK_SI_SIZE  (_sigchld, 8*sizeof(int));
static_assert(offsetof(siginfo_t, si_pid)    == 0x10);
static_assert(offsetof(siginfo_t, si_uid)    == 0x14);
static_assert(offsetof(siginfo_t, si_status) == 0x18);
static_assert(offsetof(siginfo_t, si_utime)  == 0x20);
static_assert(offsetof(siginfo_t, si_stime)  == 0x28);

#ifdef CONFIG_X86_X32_ABI
/*
 * no _sigchld_x32 in the generic siginfo_t
 *
 * The x32 variant is checked against compat_siginfo_t instead: 7 ints, with
 * _utime and _stime at 0x18 and 0x20 rather than the 0x20 and 0x28 asserted
 * for siginfo_t above.
 */
static_assert(sizeof_field(compat_siginfo_t, _sifields._sigchld_x32) ==
              7*sizeof(int));
static_assert(offsetof(compat_siginfo_t, _sifields) ==
              offsetof(compat_siginfo_t, _sifields._sigchld_x32));
static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime)  == 0x18);
static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime)  == 0x20);
#endif

/*
 * _sigfault: 8 ints.  After si_addr at 0x10, several of the fields checked
 * below share an offset (0x18 or 0x20), i.e. they overlap one another.
 */
CHECK_SI_OFFSET(_sigfault);
CHECK_SI_SIZE  (_sigfault, 8*sizeof(int));
static_assert(offsetof(siginfo_t, si_addr)        == 0x10);

static_assert(offsetof(siginfo_t, si_trapno)        == 0x18);

static_assert(offsetof(siginfo_t, si_addr_lsb)        == 0x18);

static_assert(offsetof(siginfo_t, si_lower)        == 0x20);
static_assert(offsetof(siginfo_t, si_upper)        == 0x28);

static_assert(offsetof(siginfo_t, si_pkey)        == 0x20);

static_assert(offsetof(siginfo_t, si_perf_data)         == 0x18);
static_assert(offsetof(siginfo_t, si_perf_type)         == 0x20);
static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24);

/* _sigpoll: 4 ints; si_band at 0x10 and si_fd at 0x18. */
CHECK_SI_OFFSET(_sigpoll);
CHECK_SI_SIZE  (_sigpoll, 4*sizeof(int));
static_assert(offsetof(siginfo_t, si_band) == 0x10);
static_assert(offsetof(siginfo_t, si_fd)   == 0x18);

/* _sigsys: 4 ints; si_call_addr, si_syscall and si_arch at 0x10, 0x18 and 0x1C. */
CHECK_SI_OFFSET(_sigsys);
CHECK_SI_SIZE  (_sigsys, 4*sizeof(int));
static_assert(offsetof(siginfo_t, si_call_addr) == 0x10);
static_assert(offsetof(siginfo_t, si_syscall)   == 0x18);
static_assert(offsetof(siginfo_t, si_arch)      == 0x1C);

/* any new si_fields should be added here */















































































































































































































































































































































































    1 






    1 





















    1 







    1 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_ZONED_H
#define BTRFS_ZONED_H

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/blkzoned.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include "messages.h"
#include "volumes.h"
#include "disk-io.h"
#include "block-group.h"
#include "btrfs_inode.h"
#include "fs.h"

struct block_device;
struct extent_buffer;
struct btrfs_bio;
struct btrfs_ordered_extent;
struct btrfs_fs_info;
struct btrfs_space_info;
struct btrfs_eb_write_context;
struct btrfs_fs_devices;

#define BTRFS_DEFAULT_RECLAIM_THRESH                                   (75)

/* Per-device zone state, reached through btrfs_device::zone_info. */
struct btrfs_zoned_device_info {
        /*
         * Number of zones, zone size and types of zones if bdev is a
         * zoned block device.
         */
        /* btrfs_can_zone_reset() requires start and length aligned to this. */
        u64 zone_size;
        /* A position shifted right by this yields its zone number. */
        u8  zone_size_shift;
        u32 nr_zones;
        unsigned int max_active_zones;
        /*
         * Reserved active zones for one metadata and one system block group.
         * It can vary per-device depending on the allocation status.
         */
        int reserved_active_zones;
        atomic_t active_zones_left;
        /* Bitmap indexed by zone number, tested by btrfs_dev_is_sequential(). */
        unsigned long *seq_zones;
        /*
         * Bitmap indexed by zone number, tested by btrfs_dev_is_empty_zone()
         * and updated by btrfs_dev_set_empty_zone_bit().
         */
        unsigned long *empty_zones;
        /* NOTE(review): presumably a per-zone bitmap like the two above - confirm. */
        unsigned long *active_zones;
        struct blk_zone *zone_cache;
        /* Two entries per superblock mirror. */
        struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
};

void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered);

#ifdef CONFIG_BLK_DEV_ZONED
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
                       struct blk_zone *zone);
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt);
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
                               u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
                          u64 *bytenr_ret);
int btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
                                 u64 hole_end, u64 num_bytes);
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
                            u64 length, u64 *bytes);
int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
bool btrfs_use_zone_append(struct btrfs_bio *bbio);
void btrfs_record_physical_zoned(struct btrfs_bio *bbio);
int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
                                   struct btrfs_eb_write_context *ctx);
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length);
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
                                  u64 physical_start, u64 physical_pos);
bool btrfs_zone_activate(struct btrfs_block_group *block_group);
int btrfs_zone_finish(struct btrfs_block_group *block_group);
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
                             u64 length);
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
                                   struct extent_buffer *eb);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
                                       u64 length);
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
                                struct btrfs_space_info *space_info, bool do_finish);
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
/* !CONFIG_BLK_DEV_ZONED stub: returns 0 and leaves @zone untouched. */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
                                     struct blk_zone *zone)
{
        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: nothing to collect, always returns 0. */
static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: both arguments are ignored, always returns 0. */
static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
                                          bool populate_cache)
{
        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device) { }

/*
 * !CONFIG_BLK_DEV_ZONED stub.
 *
 * In case the kernel is compiled without CONFIG_BLK_DEV_ZONED we'll never call
 * into btrfs_clone_dev_zone_info() so it's safe to return NULL here.
 */
static inline struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(
                                                 struct btrfs_device *orig_dev)
{
        return NULL;
}

/*
 * !CONFIG_BLK_DEV_ZONED variant: a filesystem that btrfs_is_zoned() reports
 * as zoned cannot be handled, so log an error and return -EOPNOTSUPP.
 * Any other filesystem passes the check with 0.
 */
static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
{
        if (btrfs_is_zoned(fs_info)) {
                btrfs_err(fs_info, "zoned block devices support is not enabled");
                return -EOPNOTSUPP;
        }

        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: accepts any mount options, always returns 0. */
static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info,
                                              unsigned long *mount_opt)
{
        return 0;
}

/*
 * !CONFIG_BLK_DEV_ZONED stub: stores btrfs_sb_offset(@mirror) in *@bytenr_ret
 * and returns 0.  @bdev and @rw are ignored.
 */
static inline int btrfs_sb_log_location_bdev(struct block_device *bdev,
                                             int mirror, int rw, u64 *bytenr_ret)
{
        *bytenr_ret = btrfs_sb_offset(mirror);
        return 0;
}

/*
 * !CONFIG_BLK_DEV_ZONED stub: stores btrfs_sb_offset(@mirror) in *@bytenr_ret
 * and returns 0.  @device and @rw are ignored.
 */
static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror,
                                        int rw, u64 *bytenr_ret)
{
        *bytenr_ret = btrfs_sb_offset(mirror);
        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: nothing to advance, always returns 0. */
static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: nothing to reset, always returns 0. */
static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: returns @hole_start unchanged. */
static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device,
                                               u64 hole_start, u64 hole_end,
                                               u64 num_bytes)
{
        return hole_start;
}

/*
 * !CONFIG_BLK_DEV_ZONED stub: sets *@bytes to 0 and returns 0.
 * @bytes is dereferenced unconditionally, so it must not be NULL.
 */
static inline int btrfs_reset_device_zone(struct btrfs_device *device,
                                          u64 physical, u64 length, u64 *bytes)
{
        *bytes = 0;
        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: always returns 0. */
static inline int btrfs_ensure_empty_zones(struct btrfs_device *device,
                                           u64 start, u64 size)
{
        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: no zone information to load, always returns 0. */
static inline int btrfs_load_block_group_zone_info(
                struct btrfs_block_group *cache, bool new)
{
        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { }

/* !CONFIG_BLK_DEV_ZONED stub: always returns false. */
static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
        return false;
}

/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
{
}

/* !CONFIG_BLK_DEV_ZONED stub: no write pointer to check, always returns 0. */
static inline int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
                                                 struct btrfs_eb_write_context *ctx)
{
        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: unsupported, always returns -EOPNOTSUPP. */
static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device,
                                            u64 physical, u64 length)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_BLK_DEV_ZONED stub: unsupported, always returns -EOPNOTSUPP. */
static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev,
                                                u64 logical, u64 physical_start,
                                                u64 physical_pos)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_BLK_DEV_ZONED stub: always reports success (true). */
static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
        return true;
}

/* !CONFIG_BLK_DEV_ZONED stub: nothing to finish, always returns 0. */
static inline int btrfs_zone_finish(struct btrfs_block_group *block_group)
{
        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: always returns true. */
static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
                                           u64 flags)
{
        return true;
}

/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
                                           u64 logical, u64 length) { }

/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
                                                 struct extent_buffer *eb) { }

/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }

/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }

/* !CONFIG_BLK_DEV_ZONED stub: always returns false. */
static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
        return false;
}

/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
                                                     u64 logical, u64 length) { }

/*
 * !CONFIG_BLK_DEV_ZONED stub: always returns 1.
 * NOTE(review): what 1 means to callers is defined by the real implementation,
 * which is not visible in this header - confirm.
 */
static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
{
        return 1;
}

/* !CONFIG_BLK_DEV_ZONED stub: all arguments are ignored, always returns 0. */
static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
                                              struct btrfs_space_info *space_info,
                                              bool do_finish)
{
        /* Consider all block groups to be active. */
        return 0;
}

/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { }

#endif

/*
 * Report whether the zone containing @pos is marked in the device's seq_zones
 * bitmap.  A device without zone_info is never sequential.
 */
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;

        return zinfo && test_bit(pos >> zinfo->zone_size_shift, zinfo->seq_zones);
}

/*
 * Report whether the zone containing @pos is marked in the device's
 * empty_zones bitmap.  A device without zone_info always reports true.
 */
static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;

        return !zinfo || test_bit(pos >> zinfo->zone_size_shift, zinfo->empty_zones);
}

/*
 * Set (@set == true) or clear (@set == false) the empty_zones bit of the zone
 * containing @pos.  Does nothing for a device without zone_info.
 */
static inline void btrfs_dev_set_empty_zone_bit(struct btrfs_device *device,
                                                u64 pos, bool set)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        unsigned int zone_idx;

        if (zinfo) {
                zone_idx = pos >> zinfo->zone_size_shift;
                if (!set)
                        clear_bit(zone_idx, zinfo->empty_zones);
                else
                        set_bit(zone_idx, zinfo->empty_zones);
        }
}

/* Mark the zone containing @pos as empty in the device's empty_zones bitmap. */
static inline void btrfs_dev_set_zone_empty(struct btrfs_device *device, u64 pos)
{
        btrfs_dev_set_empty_zone_bit(device, pos, true);
}

/* Clear the empty_zones bit of the zone containing @pos. */
static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 pos)
{
        btrfs_dev_set_empty_zone_bit(device, pos, false);
}

static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info,
                                                struct block_device *bdev)
{
        if (btrfs_is_zoned(fs_info)) {
                /*
                 * We can allow a regular device on a zoned filesystem, because
                 * we will emulate the zoned capabilities.
                 */
                if (!bdev_is_zoned(bdev))
                        return true;

                return fs_info->zone_size ==
                        (bdev_zone_sectors(bdev) << SECTOR_SHIFT);
        }

        /* Do not allow Host Managed zoned device. */
        return !bdev_is_zoned(bdev);
}

/*
 * Report whether a superblock may be placed at @pos on @device.
 *
 * Any address is acceptable on a device without zone_info.  Otherwise the
 * address is acceptable only if its zone is not flagged as sequential.
 */
static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
{
        if (!device->zone_info)
                return true;

        return !btrfs_dev_is_sequential(device, pos);
}

/*
 * Report whether the range [@physical, @physical + @length) qualifies for a
 * zone reset: @physical must lie in a sequential zone, and both @physical and
 * @length must be aligned to the device's zone size.
 */
static inline bool btrfs_can_zone_reset(struct btrfs_device *device,
                                        u64 physical, u64 length)
{
        u64 zsize;

        /* This also filters out devices without zone_info. */
        if (!btrfs_dev_is_sequential(device, physical))
                return false;

        zsize = device->zone_info->zone_size;

        return IS_ALIGNED(physical, zsize) && IS_ALIGNED(length, zsize);
}

/* Take fs_info->zoned_meta_io_lock, but only on a zoned filesystem. */
static inline void btrfs_zoned_meta_io_lock(struct btrfs_fs_info *fs_info)
{
        if (btrfs_is_zoned(fs_info))
                mutex_lock(&fs_info->zoned_meta_io_lock);
}

/* Release fs_info->zoned_meta_io_lock, but only on a zoned filesystem. */
static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info)
{
        if (btrfs_is_zoned(fs_info))
                mutex_unlock(&fs_info->zoned_meta_io_lock);
}

/*
 * On a zoned filesystem, reset fs_info->treelog_bg to 0 if it currently equals
 * the start of @bg.  The compare-and-clear runs under treelog_bg_lock.
 */
static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg)
{
        struct btrfs_fs_info *info = bg->fs_info;

        if (btrfs_is_zoned(info)) {
                spin_lock(&info->treelog_bg_lock);
                if (info->treelog_bg == bg->start)
                        info->treelog_bg = 0;
                spin_unlock(&info->treelog_bg_lock);
        }
}

/*
 * Take zoned_data_reloc_io_lock when @inode belongs to the data relocation
 * root of a zoned filesystem; otherwise do nothing.
 */
static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
{
        struct btrfs_root *root = inode->root;

        if (!btrfs_is_data_reloc_root(root))
                return;
        if (!btrfs_is_zoned(root->fs_info))
                return;

        mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
}

/*
 * Release zoned_data_reloc_io_lock when @inode belongs to the data relocation
 * root of a zoned filesystem; otherwise do nothing.
 */
static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
{
        struct btrfs_root *root = inode->root;

        if (!btrfs_is_data_reloc_root(root))
                return;
        if (!btrfs_is_zoned(root->fs_info))
                return;

        mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
}

/*
 * Report whether the allocation offset of @bg has reached its zone capacity.
 * Asserts that the filesystem is zoned.
 */
static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg)
{
        ASSERT(btrfs_is_zoned(bg->fs_info));

        return bg->zone_capacity == bg->alloc_offset;
}

#endif
















































    2 



































    2 








    2 
    2 


















    2 








    2 





    2 









    2 



    1 
    2 



    2 
    1 








































































    2 
















    2 











    1 



















    2 


































































































    2 





    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 




    4 


























































































































































































































































































































































































































































































































































































    1 





    1 

































































































































































































































































































































    1 


































    1 





    1 





    1 















    1 









    1 
    1 

    1 











    1 






    1 
    1 




















    2 











    1 









    2 

    1 
    1 





    2 



































































































































































































































































































































































































































































































































































































































































































































































































    2 




























































    2 


    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
// SPDX-License-Identifier: GPL-2.0

#include <linux/sizes.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
#include "space-info.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"

#ifdef CONFIG_BTRFS_DEBUG
/*
 * Debug-only helper: report whether the free space of @block_group should be
 * fragmented.
 *
 * Returns 1 when the FRAGMENT_METADATA option is set and the block group has
 * the METADATA flag, or when the FRAGMENT_DATA option is set and the block
 * group has the DATA flag.  Returns 0 otherwise.
 */
int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;

        if (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
            (block_group->flags & BTRFS_BLOCK_GROUP_METADATA))
                return 1;

        if (btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
            (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
                return 1;

        return 0;
}
#endif

/*
 * Return target flags in extended format or 0 if restripe for this chunk_type
 * is not in progress
 *
 * Should be called with balance_lock held
 */
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
        struct btrfs_balance_control *bctl = fs_info->balance_ctl;

        /* Without a balance control there is no conversion to report. */
        if (!bctl)
                return 0;

        /*
         * The block group types are checked in a fixed order (data, system,
         * metadata); the first type that is both requested in @flags and being
         * converted determines the result.
         */
        if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
            (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT))
                return BTRFS_BLOCK_GROUP_DATA | bctl->data.target;

        if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
            (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT))
                return BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;

        if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
            (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT))
                return BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;

        return 0;
}

/*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Return reduced profile in chunk format.  If profile changing is in progress
 * (either running or paused) picks the target profile (if it's already
 * available), otherwise falls back to plain reducing.
 */
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
        /* Profiles in the order they are preferred; the first match wins. */
        static const u64 profile_preference[] = {
                BTRFS_BLOCK_GROUP_RAID1C4,
                BTRFS_BLOCK_GROUP_RAID6,
                BTRFS_BLOCK_GROUP_RAID1C3,
                BTRFS_BLOCK_GROUP_RAID5,
                BTRFS_BLOCK_GROUP_RAID10,
                BTRFS_BLOCK_GROUP_RAID1,
                BTRFS_BLOCK_GROUP_DUP,
                BTRFS_BLOCK_GROUP_RAID0,
        };
        const size_t nr_preferences =
                sizeof(profile_preference) / sizeof(profile_preference[0]);
        u64 num_devices = fs_info->fs_devices->rw_devices;
        u64 restripe_target;
        u64 usable = 0;
        u64 type;
        size_t i;

        /*
         * If a restripe for this chunk type is in progress, its target
         * profile takes precedence over any reduction done below.
         */
        spin_lock(&fs_info->balance_lock);
        restripe_target = get_restripe_target(fs_info, flags);
        spin_unlock(&fs_info->balance_lock);
        if (restripe_target)
                return extended_to_chunk(restripe_target);

        /* Collect the profiles whose minimum device count is satisfied... */
        for (type = 0; type < BTRFS_NR_RAID_TYPES; type++) {
                if (num_devices >= btrfs_raid_array[type].devs_min)
                        usable |= btrfs_raid_array[type].bg_flag;
        }
        /* ...and keep only those that were actually requested. */
        usable &= flags;

        /*
         * Collapse to a single profile.  If none of the listed profiles is
         * set, the masked value is left as it is.
         */
        for (i = 0; i < nr_preferences; i++) {
                if (usable & profile_preference[i]) {
                        usable = profile_preference[i];
                        break;
                }
        }

        /* Replace all profile bits in @flags with the chosen one. */
        flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;

        return extended_to_chunk(flags | usable);
}

/*
 * Compute the allocation profile for the block group type in @orig_flags.
 *
 * The available allocation bits of the matching type (data, system or
 * metadata, checked in that order) are read under the profiles_lock seqlock
 * and OR-ed into @orig_flags.  The result is then passed through
 * btrfs_reduce_alloc_profile().
 */
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
        unsigned int seq;
        u64 avail_bits;

        /* Retry the read if a writer updated the bits while we sampled them. */
        do {
                seq = read_seqbegin(&fs_info->profiles_lock);

                if (orig_flags & BTRFS_BLOCK_GROUP_DATA)
                        avail_bits = fs_info->avail_data_alloc_bits;
                else if (orig_flags & BTRFS_BLOCK_GROUP_SYSTEM)
                        avail_bits = fs_info->avail_system_alloc_bits;
                else if (orig_flags & BTRFS_BLOCK_GROUP_METADATA)
                        avail_bits = fs_info->avail_metadata_alloc_bits;
                else
                        avail_bits = 0;
        } while (read_seqretry(&fs_info->profiles_lock, seq));

        return btrfs_reduce_alloc_profile(fs_info, orig_flags | avail_bits);
}

/*
 * Take an additional reference on @cache by incrementing its refcount.
 * The reference is dropped with btrfs_put_block_group().
 */
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
        refcount_inc(&cache->refs);
}

/*
 * Drop a reference on @cache.  When the last reference goes away, sanity
 * check the accounting state and free the block group together with its
 * free space control and chunk map.
 */
void btrfs_put_block_group(struct btrfs_block_group *cache)
{
        /* Somebody else still holds a reference, nothing to tear down. */
        if (!refcount_dec_and_test(&cache->refs))
                return;

        WARN_ON(cache->pinned > 0);

        /*
         * A failed log tree cleanup (most likely caused by an IO error while
         * writing back its extent buffers) prevents the proper, cheap
         * unaccounting of reserved space.  Leftover reserved bytes are thus
         * expected for metadata block groups in that situation, so only warn
         * about reserved > 0 in every other case.
         */
        if (!((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
              BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info)))
                WARN_ON(cache->reserved > 0);

        /*
         * The block group must no longer sit on the discard_list here.  If it
         * still does, warn and cancel the discard work so the stale entry
         * cannot cause a NULL pointer dereference later.
         */
        if (WARN_ON(!list_empty(&cache->discard_list)))
                btrfs_discard_cancel_work(&cache->fs_info->discard_ctl, cache);

        kfree(cache->free_space_ctl);
        btrfs_free_chunk_map(cache->physical_map);
        kfree(cache);
}

/*
 * This adds the block group to the fs_info rb tree for the block group cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
                                       struct btrfs_block_group *block_group)
{
        struct rb_node **link;
        struct rb_node *parent = NULL;
        bool leftmost = true;
        int ret = 0;

        ASSERT(block_group->length != 0);

        write_lock(&info->block_group_cache_lock);
        link = &info->block_group_cache_tree.rb_root.rb_node;

        /* Walk down to the insertion point, keyed by the start offset. */
        while (*link) {
                struct btrfs_block_group *entry;

                parent = *link;
                entry = rb_entry(parent, struct btrfs_block_group, cache_node);

                /* A block group with the same start is already in the tree. */
                if (block_group->start == entry->start) {
                        ret = -EEXIST;
                        goto unlock;
                }

                if (block_group->start < entry->start) {
                        link = &parent->rb_left;
                } else {
                        link = &parent->rb_right;
                        /* Went right at least once: not the new minimum. */
                        leftmost = false;
                }
        }

        rb_link_node(&block_group->cache_node, parent, link);
        rb_insert_color_cached(&block_group->cache_node,
                               &info->block_group_cache_tree, leftmost);

unlock:
        write_unlock(&info->block_group_cache_lock);

        return ret;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group *block_group_cache_tree_search(
                struct btrfs_fs_info *info, u64 bytenr, int contains)
{
        struct btrfs_block_group *found = NULL;
        struct rb_node *node;

        read_lock(&info->block_group_cache_lock);
        node = info->block_group_cache_tree.rb_root.rb_node;

        while (node) {
                struct btrfs_block_group *bg =
                        rb_entry(node, struct btrfs_block_group, cache_node);
                const u64 first = bg->start;
                const u64 last = bg->start + bg->length - 1;

                /* An exact hit on the start offset satisfies both modes. */
                if (bytenr == first) {
                        found = bg;
                        break;
                }

                if (bytenr < first) {
                        /*
                         * This block group starts after bytenr.  In the
                         * "at or after" mode remember it if it has the lowest
                         * start seen so far, then keep looking to the left.
                         */
                        if (!contains && (!found || first < found->start))
                                found = bg;
                        node = node->rb_left;
                        continue;
                }

                /* Here bytenr > first: it may fall inside this block group. */
                if (contains && bytenr <= last) {
                        found = bg;
                        break;
                }
                node = node->rb_right;
        }

        /* Take the reference for the caller while still under the tree lock. */
        if (found)
                btrfs_get_block_group(found);
        read_unlock(&info->block_group_cache_lock);

        return found;
}

/*
 * Return the block group that starts at or after bytenr
 */
struct btrfs_block_group *btrfs_lookup_first_block_group(
                struct btrfs_fs_info *info, u64 bytenr)
{
        /*
         * contains == 0 selects the "at or after bytenr" search mode.  A
         * non-NULL result carries a reference taken by the search, which the
         * caller is expected to drop with btrfs_put_block_group().
         */
        return block_group_cache_tree_search(info, bytenr, 0);
}

/*
 * Return the block group that contains the given bytenr
 */
struct btrfs_block_group *btrfs_lookup_block_group(
                struct btrfs_fs_info *info, u64 bytenr)
{
        /*
         * contains == 1 selects the "block group containing bytenr" search
         * mode.  A non-NULL result carries a reference taken by the search,
         * which the caller is expected to drop with btrfs_put_block_group().
         */
        return block_group_cache_tree_search(info, bytenr, 1);
}

/*
 * Step from @cache to the block group that follows it in the block group
 * cache tree.
 *
 * The caller's reference on @cache is always dropped.  The returned block
 * group, if any, carries a new reference.  Returns NULL when there is no
 * following block group.
 */
struct btrfs_block_group *btrfs_next_block_group(
                struct btrfs_block_group *cache)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_block_group *next = NULL;
        struct rb_node *node;

        read_lock(&fs_info->block_group_cache_lock);

        /*
         * Our block group was removed from the tree, so its rb node cannot be
         * used to find a successor.  Fall back to a full search starting
         * right after its end.
         */
        if (RB_EMPTY_NODE(&cache->cache_node)) {
                const u64 search_from = cache->start + cache->length;

                read_unlock(&fs_info->block_group_cache_lock);
                btrfs_put_block_group(cache);
                return btrfs_lookup_first_block_group(fs_info, search_from);
        }

        /* Find the successor before giving up our reference on @cache. */
        node = rb_next(&cache->cache_node);
        btrfs_put_block_group(cache);
        if (node) {
                next = rb_entry(node, struct btrfs_block_group, cache_node);
                btrfs_get_block_group(next);
        }
        read_unlock(&fs_info->block_group_cache_lock);

        return next;
}

/*
 * Check if we can do a NOCOW write for a given extent.
 *
 * @fs_info:       The filesystem information object.
 * @bytenr:        Logical start address of the extent.
 *
 * Check if we can do a NOCOW write for the given extent, and increments the
 * number of NOCOW writers in the block group that contains the extent, as long
 * as the block group exists and it's currently not in read-only mode.
 *
 * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
 *          is responsible for calling btrfs_dec_nocow_writers() later.
 *
 *          Or NULL if we can not do a NOCOW write
 */
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
                                                  u64 bytenr)
{
        struct btrfs_block_group *bg = btrfs_lookup_block_group(fs_info, bytenr);

        /* No block group contains this extent. */
        if (!bg)
                return NULL;

        /* The read-only check and the counter bump happen under bg->lock. */
        spin_lock(&bg->lock);
        if (bg->ro) {
                spin_unlock(&bg->lock);
                /* NOCOW is not possible, release the lookup reference. */
                btrfs_put_block_group(bg);
                return NULL;
        }
        atomic_inc(&bg->nocow_writers);
        spin_unlock(&bg->lock);

        /*
         * The lookup reference is kept on purpose, it is released by
         * btrfs_dec_nocow_writers().
         */
        return bg;
}

/*
 * Decrement the number of NOCOW writers in a block group.
 *
 * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
 * and on the block group returned by that call. Typically this is called after
 * creating an ordered extent for a NOCOW write, to prevent races with scrub and
 * relocation.
 *
 * After this call, the caller should not use the block group anymore. If it wants
 * to use it, then it should get a reference on it before calling this function.
 */
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
        /*
         * When the last NOCOW writer goes away, wake up tasks sleeping on the
         * counter (see btrfs_wait_nocow_writers()).
         */
        if (atomic_dec_and_test(&bg->nocow_writers))
                wake_up_var(&bg->nocow_writers);

        /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
        btrfs_put_block_group(bg);
}

/*
 * Sleep until the nocow_writers counter of @bg reads zero.  The matching
 * wake up is issued by btrfs_dec_nocow_writers() when the counter drops to
 * zero.
 */
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
{
        wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}

/*
 * Drop one reservation from the block group that contains @start.
 *
 * @fs_info:  The filesystem information object.
 * @start:    Logical address used to look up the block group.
 *
 * Decrements the block group's reservations counter and, when it reaches
 * zero, wakes up tasks sleeping on it (see
 * btrfs_wait_block_group_reservations()).
 */
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
                                        const u64 start)
{
        struct btrfs_block_group *bg;

        bg = btrfs_lookup_block_group(fs_info, start);
        /*
         * NOTE(review): bg is dereferenced right after the ASSERT(); callers
         * presumably guarantee that the block group exists - confirm.
         */
        ASSERT(bg);
        if (atomic_dec_and_test(&bg->reservations))
                wake_up_var(&bg->reservations);
        /* Release the reference taken by the lookup above. */
        btrfs_put_block_group(bg);
}

/*
 * Wait until the reservations counter of a read-only block group drops to
 * zero.
 *
 * The block group is expected to be read-only already (asserted).  Block
 * groups without the DATA flag are not waited on at all.
 */
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
{
        struct btrfs_space_info *space_info = bg->space_info;

        ASSERT(bg->ro);

        /* Only data block groups are waited on. */
        if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
                return;

        /*
         * Our block group is read only but before we set it to read only,
         * some task might have had allocated an extent from it already, but it
         * has not yet created a respective ordered extent (and added it to a
         * root's list of ordered extents).
         * Therefore wait for any task currently allocating extents, since the
         * block group's reservations counter is incremented while a read lock
         * on the groups' semaphore is held and decremented after releasing
         * the read access on that semaphore and creating the ordered extent.
         */
        /*
         * Taking the semaphore for writing and releasing it right away only
         * serves to wait for all current readers to leave.
         */
        down_write(&space_info->groups_sem);
        up_write(&space_info->groups_sem);

        /* Woken up by btrfs_dec_block_group_reservations(). */
        wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}

/*
 * Return the caching control attached to @cache with an extra reference, or
 * NULL if there is none.  The pointer is sampled and the reference taken
 * under cache->lock.  Drop the reference with btrfs_put_caching_control().
 */
struct btrfs_caching_control *btrfs_get_caching_control(
                struct btrfs_block_group *cache)
{
        struct btrfs_caching_control *caching_ctl;

        spin_lock(&cache->lock);
        caching_ctl = cache->caching_ctl;
        if (caching_ctl)
                refcount_inc(&caching_ctl->count);
        spin_unlock(&cache->lock);

        return caching_ctl;
}

/*
 * Drop a reference on @ctl and free it once the last reference is gone.
 */
static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
        if (refcount_dec_and_test(&ctl->count))
                kfree(ctl);
}

/*
 * When we wait for progress in the block group caching, it's because our
 * allocation attempt failed at least once.  So, we must sleep and let some
 * progress happen before we try again.
 *
 * This function will sleep at least once waiting for new free space to show
 * up, and then it will check the block group free space numbers for our min
 * num_bytes.  Another option is to have it go ahead and look in the rbtree for
 * a free extent of a given size, but this is a good start.
 *
 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 * any of the information in this block group.
 */
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
                                           u64 num_bytes)
{
        struct btrfs_caching_control *caching_ctl;
        int progress;

        /* Without a caching control there is nothing to wait on. */
        caching_ctl = btrfs_get_caching_control(cache);
        if (!caching_ctl)
                return;

        /*
         * We've already failed to allocate from this block group, so even if
         * there's enough space in the block group it isn't contiguous enough to
         * allow for an allocation, so wait for at least the next wakeup tick,
         * or for the thing to be done.
         */
        progress = atomic_read(&caching_ctl->progress);

        /*
         * Stop waiting once btrfs_block_group_done() is true, or once the
         * progress counter differs from the value sampled above and the
         * block group's free space is at least num_bytes.
         */
        wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
                   (progress != atomic_read(&caching_ctl->progress) &&
                    (cache->free_space_ctl->free_space >= num_bytes)));

        /* Drop the reference taken by btrfs_get_caching_control(). */
        btrfs_put_caching_control(caching_ctl);
}

/*
 * Sleep on @caching_ctl's wait queue until btrfs_block_group_done() is true
 * for @cache, then report the outcome.
 *
 * Returns 0 on success or -EIO if the block group is in the
 * BTRFS_CACHE_ERROR state.
 */
static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
                                       struct btrfs_caching_control *caching_ctl)
{
        wait_event(caching_ctl->wait, btrfs_block_group_done(cache));

        if (cache->cached == BTRFS_CACHE_ERROR)
                return -EIO;

        return 0;
}

/*
 * Wait until the caching of a block group is done.
 *
 * If the block group has no caching control there is nothing to wait for and
 * only the current cache state is reported.
 *
 * Returns -EIO if the block group is in the BTRFS_CACHE_ERROR state,
 * 0 otherwise.
 */
static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
        struct btrfs_caching_control *ctl = btrfs_get_caching_control(cache);
        int err;

        if (ctl) {
                err = btrfs_caching_ctl_wait_done(cache, ctl);
                btrfs_put_caching_control(ctl);
                return err;
        }

        if (cache->cached == BTRFS_CACHE_ERROR)
                return -EIO;

        return 0;
}

#ifdef CONFIG_BTRFS_DEBUG
/*
 * Debug helper that artificially fragments the free space of a block group.
 *
 * Walking the block group from its start, one unit of free space is removed
 * out of every two units.  The unit is the node size for metadata block groups
 * and the sector size otherwise.  The return value of
 * btrfs_remove_free_space() is not checked.
 */
static void fragment_free_space(struct btrfs_block_group *block_group)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        u64 offset = block_group->start;
        u64 remaining = block_group->length;
        u64 unit;
        u64 stride;

        if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA)
                unit = fs_info->nodesize;
        else
                unit = fs_info->sectorsize;
        stride = unit << 1;

        while (remaining > unit) {
                btrfs_remove_free_space(block_group, offset, unit);
                offset += stride;
                /* Clamp at zero instead of letting the subtraction wrap. */
                remaining = (remaining < stride) ? 0 : remaining - stride;
        }
}
#endif

/*
 * Add the free parts of a range to the in memory free space cache of a block
 * group.
 *
 * Parts of the range recorded in fs_info->excluded_extents (super block
 * locations) are skipped and never added to the free space cache.
 *
 * @block_group:      The target block group.
 * @start:            Start offset of the range.
 * @end:              End offset of the range (exclusive).
 * @total_added_ret:  Optional pointer to return the total amount of space
 *                    added to the block group's free space cache.  It is
 *                    reset to 0 on entry and holds the partial total if an
 *                    error is returned.
 *
 * Returns 0 on success or < 0 on error.
 */
int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
                             u64 end, u64 *total_added_ret)
{
        struct btrfs_fs_info *info = block_group->fs_info;
        u64 excl_start, excl_end;
        u64 len;
        int ret;

        if (total_added_ret)
                *total_added_ret = 0;

        while (start < end) {
                /* No excluded extent at or after 'start', the rest is free. */
                if (!find_first_extent_bit(&info->excluded_extents, start,
                                           &excl_start, &excl_end,
                                           EXTENT_DIRTY | EXTENT_UPTODATE,
                                           NULL))
                        break;

                /* The next excluded extent begins past the end of our range. */
                if (excl_start >= end)
                        break;

                /* Free space in front of the excluded extent, if there is any. */
                if (excl_start > start) {
                        len = excl_start - start;
                        ret = btrfs_add_free_space_async_trimmed(block_group,
                                                                 start, len);
                        if (ret)
                                return ret;
                        if (total_added_ret)
                                *total_added_ret += len;
                }

                /* Resume right after the excluded extent (its end is inclusive). */
                start = excl_end + 1;
        }

        /* Whatever is left between the last excluded extent and 'end'. */
        if (start < end) {
                len = end - start;
                ret = btrfs_add_free_space_async_trimmed(block_group, start,
                                                         len);
                if (ret)
                        return ret;
                if (total_added_ret)
                        *total_added_ret += len;
        }

        return 0;
}

/*
 * Get an arbitrary extent item index / max_index through the block group
 *
 * @caching_ctl:  the caching control of the block group; its mutex must be held
 * @block_group:  the block group to sample from
 * @index:        the integral step through the block group to grab from
 * @max_index:    the granularity of the sampling
 * @found_key:    return value parameter for the key of the item we find
 *
 * The search uses the commit root without locking, so fs_info->commit_root_sem
 * must be held for reading.
 *
 * Pre-conditions on indices:
 * 0 <= index <= max_index
 * 0 < max_index
 *
 * Returns: 0 on success, 1 if the search didn't yield a useful item, negative
 * error code on error.
 */
static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
                                          struct btrfs_block_group *block_group,
                                          int index, int max_index,
                                          struct btrfs_key *found_key)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        const u64 bg_start = block_group->start;
        const u64 bg_end = bg_start + block_group->length;
        struct btrfs_root *extent_root;
        struct btrfs_path *path;
        struct btrfs_key search_key;
        u64 sample_offset;
        int ret = 0;

        ASSERT(index >= 0);
        ASSERT(index <= max_index);
        ASSERT(max_index > 0);
        lockdep_assert_held(&caching_ctl->mutex);
        lockdep_assert_held_read(&fs_info->commit_root_sem);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        extent_root = btrfs_extent_root(fs_info, max_t(u64, bg_start,
                                                       BTRFS_SUPER_INFO_OFFSET));

        /* Lockless, read-ahead search of the commit root. */
        path->reada = READA_FORWARD;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        /* Start looking index/max_index of the way into the block group. */
        sample_offset = index * div_u64(block_group->length, max_index);
        search_key.objectid = bg_start + sample_offset;
        search_key.type = BTRFS_EXTENT_ITEM_KEY;
        search_key.offset = 0;

        btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
                /* Success; sampled an extent item in the block group */
                if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
                    found_key->objectid >= bg_start &&
                    found_key->objectid + found_key->offset <= bg_end)
                        break;

                /* We can't possibly find a valid extent item anymore */
                if (found_key->objectid >= bg_end) {
                        ret = 1;
                        break;
                }
        }

        lockdep_assert_held(&caching_ctl->mutex);
        lockdep_assert_held_read(&fs_info->commit_root_sem);
        btrfs_free_path(path);
        return ret;
}

/*
 * Best effort attempt to compute a block group's size class while caching it.
 *
 * @caching_ctl: the caching control of the block group; its mutex must be held
 * @block_group: the block group we are caching
 *
 * fs_info->commit_root_sem must be held for reading.
 *
 * We cannot infer the size class while adding free space extents, because that
 * logic doesn't care about contiguous file extents (it doesn't differentiate
 * between a 100M extent and 100 contiguous 1M extents). So we need to read the
 * file extent items. Reading all of them is quite wasteful, because usually
 * only a handful are enough to give a good answer. Therefore, we just grab 5 of
 * them at even steps through the block group and pick the smallest size class
 * we see. Since size class is best effort, and not guaranteed in general,
 * inaccuracy is acceptable.
 *
 * To be more explicit about why this algorithm makes sense:
 *
 * If we are caching in a block group from disk, then there are three major cases
 * to consider:
 * 1. the block group is well behaved and all extents in it are the same size
 *    class.
 * 2. the block group is mostly one size class with rare exceptions for last
 *    ditch allocations
 * 3. the block group was populated before size classes and can have a totally
 *    arbitrary mix of size classes.
 *
 * In case 1, looking at any extent in the block group will yield the correct
 * result. For the mixed cases, taking the minimum size class seems like a good
 * approximation, since gaps from frees will be usable to the size class. For
 * 2., a small handful of file extents is likely to yield the right answer. For
 * 3, we can either read every file extent, or admit that this is best effort
 * anyway and try to stay fast.
 *
 * A sample point that yields no usable extent item is skipped and is not an
 * error. If no sample yields an item, the block group's size class is left
 * untouched.
 *
 * Returns: 0 on success, negative error code on error.
 */
static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
                                       struct btrfs_block_group *block_group)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_key key;
        /* Number of evenly spaced sample points taken through the block group. */
        const int nr_samples = 5;
        int i;
        u64 min_size = block_group->length;
        enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
        int ret;

        if (!btrfs_block_group_should_use_size_class(block_group))
                return 0;

        lockdep_assert_held(&caching_ctl->mutex);
        lockdep_assert_held_read(&fs_info->commit_root_sem);
        for (i = 0; i < nr_samples; ++i) {
                ret = sample_block_group_extent_item(caching_ctl, block_group,
                                                     i, nr_samples, &key);
                if (ret < 0)
                        return ret;
                /* Nothing useful at this sample point, try the next one. */
                if (ret > 0)
                        continue;
                min_size = min_t(u64, min_size, key.offset);
                size_class = btrfs_calc_block_group_size_class(min_size);
        }
        if (size_class != BTRFS_BG_SZ_NONE) {
                spin_lock(&block_group->lock);
                block_group->size_class = size_class;
                spin_unlock(&block_group->lock);
        }

        /*
         * Do not leak the positive "no item found" code of the last sample to
         * the caller: a missing sample is not a failure.
         */
        return 0;
}

/*
 * Populate a block group's in memory free space cache by walking the commit
 * root of the extent tree.
 *
 * Every gap between the extent/metadata items found inside the block group is
 * passed to btrfs_add_new_free_space().  Each time more than
 * CACHING_CTL_WAKE_UP bytes have been added since the previous wakeup, the
 * progress counter is bumped and waiters on caching_ctl->wait are woken
 * (unless free space fragmentation debugging is enabled for the block group).
 *
 * This function temporarily drops and re-takes caching_ctl->mutex and
 * fs_info->commit_root_sem (read) when it needs to reschedule, so both are
 * expected to be held by the caller, as caching_thread() does.
 *
 * Returns 0 on success or < 0 on error.
 */
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
        struct btrfs_block_group *block_group = caching_ctl->block_group;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_root *extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
        int ret;
        bool wakeup = true;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * 'last' tracks the end of the last extent we accounted for, i.e. the
         * start of the next potential free range.
         */
        last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
        extent_root = btrfs_extent_root(fs_info, last);

#ifdef CONFIG_BTRFS_DEBUG
        /*
         * If we're fragmenting we don't want to make anybody think we can
         * allocate from this block group until we've had a chance to fragment
         * the free space.
         */
        if (btrfs_should_fragment_free_space(block_group))
                wakeup = false;
#endif
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
         * root to add free space.  So we skip locking and search the commit
         * root, since it's read-only
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
        path->reada = READA_FORWARD;

        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;

next:
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);

        while (1) {
                if (btrfs_fs_closing(fs_info) > 1) {
                        /*
                         * With 'last' past any possible end offset, the final
                         * btrfs_add_new_free_space() call below adds nothing.
                         */
                        last = (u64)-1;
                        break;
                }

                if (path->slots[0] < nritems) {
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                } else {
                        /* Ran off the end of this leaf, look for the next key. */
                        ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
                        if (ret)
                                break;

                        /*
                         * Give way if we should reschedule or someone is
                         * waiting on commit_root_sem: release everything, then
                         * re-take the locks and search again from 'key'.
                         */
                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
                                cond_resched();
                                mutex_lock(&caching_ctl->mutex);
                                down_read(&fs_info->commit_root_sem);
                                goto next;
                        }

                        ret = btrfs_next_leaf(extent_root, path);
                        if (ret < 0)
                                goto out;
                        if (ret)
                                break;
                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        continue;
                }

                /*
                 * The key lies before the point we have already accounted up
                 * to, so restart the search from 'last'.
                 */
                if (key.objectid < last) {
                        key.objectid = last;
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
                        btrfs_release_path(path);
                        goto next;
                }

                /* Not inside our block group yet, skip the item. */
                if (key.objectid < block_group->start) {
                        path->slots[0]++;
                        continue;
                }

                /* Past the end of our block group, we are done walking. */
                if (key.objectid >= block_group->start + block_group->length)
                        break;

                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
                    key.type == BTRFS_METADATA_ITEM_KEY) {
                        u64 space_added;

                        /* The gap between 'last' and this extent is free. */
                        ret = btrfs_add_new_free_space(block_group, last,
                                                       key.objectid, &space_added);
                        if (ret)
                                goto out;
                        total_found += space_added;
                        /*
                         * For metadata items the extent length is the node
                         * size; for extent items it is the key offset.
                         */
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
                                last = key.objectid +
                                        fs_info->nodesize;
                        else
                                last = key.objectid + key.offset;

                        if (total_found > CACHING_CTL_WAKE_UP) {
                                total_found = 0;
                                if (wakeup) {
                                        atomic_inc(&caching_ctl->progress);
                                        wake_up(&caching_ctl->wait);
                                }
                        }
                }
                path->slots[0]++;
        }

        /* Whatever remains between 'last' and the end of the block group. */
        ret = btrfs_add_new_free_space(block_group, last,
                                       block_group->start + block_group->length,
                                       NULL);
out:
        btrfs_free_path(path);
        return ret;
}

static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
{
        clear_extent_bits(&bg->fs_info->excluded_extents, bg->start,
                          bg->start + bg->length - 1, EXTENT_UPTODATE);
}

/*
 * Work function that loads the free space information of one block group.
 *
 * With caching_ctl->mutex held and fs_info->commit_root_sem held for reading,
 * it tries the following sources in order:
 *   1. the free space cache, if the SPACE_CACHE mount option is set;
 *   2. the free space tree, if the filesystem has it and it is trusted;
 *   3. otherwise a walk of the extent tree (load_extent_tree_free()).
 *
 * When done, the block group is detached from its caching control and marked
 * BTRFS_CACHE_FINISHED or BTRFS_CACHE_ERROR, waiters are woken up, and one
 * reference is dropped on both the caching control and the block group
 * (presumably the ones taken for this work in btrfs_cache_block_group()).
 */
static noinline void caching_thread(struct btrfs_work *work)
{
        struct btrfs_block_group *block_group;
        struct btrfs_fs_info *fs_info;
        struct btrfs_caching_control *caching_ctl;
        int ret;

        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;

        mutex_lock(&caching_ctl->mutex);
        down_read(&fs_info->commit_root_sem);

        /* Best effort only: the return value is deliberately not checked. */
        load_block_group_size_class(caching_ctl, block_group);
        if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
                ret = load_free_space_cache(block_group);
                /* A return value of 1 means the space cache was loaded. */
                if (ret == 1) {
                        ret = 0;
                        goto done;
                }

                /*
                 * We failed to load the space cache, set ourselves to
                 * CACHE_STARTED and carry on.
                 */
                spin_lock(&block_group->lock);
                block_group->cached = BTRFS_CACHE_STARTED;
                spin_unlock(&block_group->lock);
                wake_up(&caching_ctl->wait);
        }

        /*
         * If we are in the transaction that populated the free space tree we
         * can't actually cache from the free space tree as our commit root and
         * real root are the same, so we could change the contents of the blocks
         * while caching.  Instead do the slow caching in this case, and after
         * the transaction has committed we will be safe.
         */
        if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
            !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
                ret = load_free_space_tree(caching_ctl);
        else
                ret = load_extent_tree_free(caching_ctl);
done:
        /* Publish the final cache state and detach the caching control. */
        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
        if (btrfs_should_fragment_free_space(block_group)) {
                u64 bytes_used;

                /*
                 * Account half of the block group's unused bytes as used in
                 * the space info before fragmenting the free space.
                 */
                spin_lock(&block_group->space_info->lock);
                spin_lock(&block_group->lock);
                bytes_used = block_group->length - block_group->used;
                block_group->space_info->bytes_used += bytes_used >> 1;
                spin_unlock(&block_group->lock);
                spin_unlock(&block_group->space_info->lock);
                fragment_free_space(block_group);
        }
#endif

        up_read(&fs_info->commit_root_sem);
        btrfs_free_excluded_extents(block_group);
        mutex_unlock(&caching_ctl->mutex);

        /* Let everybody waiting for the caching to finish know we are done. */
        wake_up(&caching_ctl->wait);

        btrfs_put_caching_control(caching_ctl);
        btrfs_put_block_group(block_group);
}

/*
 * Start caching the free space of a block group, unless caching was already
 * started, and optionally wait for it to finish.
 *
 * @cache: the block group to cache
 * @wait:  if true, wait until the caching is done before returning
 *
 * Nothing is done for zoned filesystems.
 *
 * Returns 0 on success, -ENOMEM if the caching control could not be allocated,
 * or -EIO if @wait is true and the block group ended up in the
 * BTRFS_CACHE_ERROR state.
 */
int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_caching_control *caching_ctl = NULL;
        int ret = 0;

        /* Allocator for zoned filesystems does not use the cache at all */
        if (btrfs_is_zoned(fs_info))
                return 0;

        /*
         * Allocate before taking cache->lock; the allocation is thrown away
         * below if it turns out caching was already started.
         */
        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
        if (!caching_ctl)
                return -ENOMEM;

        INIT_LIST_HEAD(&caching_ctl->list);
        mutex_init(&caching_ctl->mutex);
        init_waitqueue_head(&caching_ctl->wait);
        caching_ctl->block_group = cache;
        /*
         * Two references: one is dropped at the end of this function and,
         * presumably, the other one by caching_thread() when it finishes.
         */
        refcount_set(&caching_ctl->count, 2);
        atomic_set(&caching_ctl->progress, 0);
        btrfs_init_work(&caching_ctl->work, caching_thread, NULL);

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_NO) {
                /*
                 * Caching was already started (or is finished).  Use the
                 * existing caching control, if there still is one, so that we
                 * can wait on it below.
                 */
                kfree(caching_ctl);

                caching_ctl = cache->caching_ctl;
                if (caching_ctl)
                        refcount_inc(&caching_ctl->count);
                spin_unlock(&cache->lock);
                goto out;
        }
        WARN_ON(cache->caching_ctl);
        cache->caching_ctl = caching_ctl;
        cache->cached = BTRFS_CACHE_STARTED;
        spin_unlock(&cache->lock);

        /* Extra reference for the fs_info->caching_block_groups list. */
        write_lock(&fs_info->block_group_cache_lock);
        refcount_inc(&caching_ctl->count);
        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
        write_unlock(&fs_info->block_group_cache_lock);

        /* Block group reference for the worker, put by caching_thread(). */
        btrfs_get_block_group(cache);

        btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
        if (wait && caching_ctl)
                ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
        if (caching_ctl)
                btrfs_put_caching_control(caching_ctl);

        return ret;
}

/*
 * Clear the extended profile bits derived from @flags from the available
 * allocation bits of every block group type (data, metadata, system) that is
 * set in @flags.  The update is done under fs_info->profiles_lock.
 */
static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
        const u64 profile_bits = chunk_to_extended(flags) &
                                 BTRFS_EXTENDED_PROFILE_MASK;
        const u64 keep_mask = ~profile_bits;

        write_seqlock(&fs_info->profiles_lock);

        if (flags & BTRFS_BLOCK_GROUP_DATA)
                fs_info->avail_data_alloc_bits &= keep_mask;

        if (flags & BTRFS_BLOCK_GROUP_METADATA)
                fs_info->avail_metadata_alloc_bits &= keep_mask;

        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
                fs_info->avail_system_alloc_bits &= keep_mask;

        write_sequnlock(&fs_info->profiles_lock);
}

/*
 * Clear incompat bits for the following feature(s):
 *
 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 *            in the whole filesystem
 *
 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 *
 * Nothing is done unless @flags has one of the RAID56, RAID1C3 or RAID1C4
 * profile bits set.
 */
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
        struct btrfs_space_info *sinfo;
        bool have_raid56 = false;
        bool have_raid1c34 = false;

        /* Only these profiles have an incompat bit that may need clearing. */
        if (!((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
              (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
              (flags & BTRFS_BLOCK_GROUP_RAID1C4)))
                return;

        /* Look for any remaining block group using one of the profiles. */
        list_for_each_entry_rcu(sinfo, &fs_info->space_info, list) {
                down_read(&sinfo->groups_sem);
                if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]) ||
                    !list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
                        have_raid56 = true;
                if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]) ||
                    !list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
                        have_raid1c34 = true;
                up_read(&sinfo->groups_sem);
        }

        if (!have_raid56)
                btrfs_clear_fs_incompat(fs_info, RAID56);
        if (!have_raid1c34)
                btrfs_clear_fs_incompat(fs_info, RAID1C34);
}

static int remove_block_group_item(struct btrfs_trans_handle *trans,
                                   struct btrfs_path *path,
                                   struct btrfs_block_group *block_group)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root;
        struct btrfs_key key;
        int ret;

        root = btrfs_block_group_root(fs_info);
        key.objectid = block_group->start;
        key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
        key.offset = block_group->length;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
        if (ret < 0)
                return ret;

        ret = btrfs_del_item(trans, root, path);
        return ret;
}

/*
 * Remove a block group: detach it from the in memory structures of the
 * filesystem and delete its free space and block group items.
 *
 * @trans: the transaction handle to use
 * @map:   the chunk map of the block group; map->start is used to look up the
 *         block group, and the map itself is removed at the end if the block
 *         group is not frozen
 *
 * The block group must already be read-only (BUG_ON otherwise).
 *
 * Returns 0 on success, -ENOENT if there is no block group at map->start,
 * -ENOMEM if a path could not be allocated, or another negative error code
 * returned by one of the removal steps.
 */
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_chunk_map *map)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_path *path;
        struct btrfs_block_group *block_group;
        struct btrfs_free_cluster *cluster;
        struct inode *inode;
        struct kobject *kobj = NULL;
        int ret;
        int index;
        int factor;
        struct btrfs_caching_control *caching_ctl = NULL;
        bool remove_map;
        bool remove_rsv = false;

        block_group = btrfs_lookup_block_group(fs_info, map->start);
        if (!block_group)
                return -ENOENT;

        BUG_ON(!block_group->ro);

        trace_btrfs_remove_block_group(block_group);
        /*
         * Free the reserved super bytes from this block group before
         * remove it.
         */
        btrfs_free_excluded_extents(block_group);
        btrfs_free_ref_tree_range(fs_info, block_group->start,
                                  block_group->length);

        index = btrfs_bg_flags_to_raid_index(block_group->flags);
        factor = btrfs_bg_type_to_factor(block_group->flags);

        /* make sure this block group isn't part of an allocation cluster */
        cluster = &fs_info->data_alloc_cluster;
        spin_lock(&cluster->refill_lock);
        btrfs_return_cluster_to_free_space(block_group, cluster);
        spin_unlock(&cluster->refill_lock);

        /*
         * make sure this block group isn't part of a metadata
         * allocation cluster
         */
        cluster = &fs_info->meta_alloc_cluster;
        spin_lock(&cluster->refill_lock);
        btrfs_return_cluster_to_free_space(block_group, cluster);
        spin_unlock(&cluster->refill_lock);

        btrfs_clear_treelog_bg(block_group);
        btrfs_clear_data_reloc_bg(block_group);

        /* No 'goto out' may happen before this point: 'path' is not set yet. */
        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * get the inode first so any iput calls done for the io_list
         * aren't the final iput (no unlinks allowed now)
         */
        inode = lookup_free_space_inode(block_group, path);

        mutex_lock(&trans->transaction->cache_write_mutex);
        /*
         * Make sure our free space cache IO is done before removing the
         * free space inode
         */
        spin_lock(&trans->transaction->dirty_bgs_lock);
        if (!list_empty(&block_group->io_list)) {
                list_del_init(&block_group->io_list);

                WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

                /* Drop the spinlock while waiting for the cache IO. */
                spin_unlock(&trans->transaction->dirty_bgs_lock);
                btrfs_wait_cache_io(trans, block_group, path);
                btrfs_put_block_group(block_group);
                spin_lock(&trans->transaction->dirty_bgs_lock);
        }

        if (!list_empty(&block_group->dirty_list)) {
                list_del_init(&block_group->dirty_list);
                /* Remember to release the delayed refs reservation at 'out'. */
                remove_rsv = true;
                btrfs_put_block_group(block_group);
        }
        spin_unlock(&trans->transaction->dirty_bgs_lock);
        mutex_unlock(&trans->transaction->cache_write_mutex);

        ret = btrfs_remove_free_space_inode(trans, inode, block_group);
        if (ret)
                goto out;

        write_lock(&fs_info->block_group_cache_lock);
        rb_erase_cached(&block_group->cache_node,
                        &fs_info->block_group_cache_tree);
        RB_CLEAR_NODE(&block_group->cache_node);

        /* Once for the block groups rbtree */
        btrfs_put_block_group(block_group);

        write_unlock(&fs_info->block_group_cache_lock);

        down_write(&block_group->space_info->groups_sem);
        /*
         * we must use list_del_init so people can check to see if they
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
        if (list_empty(&block_group->space_info->block_groups[index])) {
                /*
                 * That was the last block group of this profile in the space
                 * info: detach its kobject (deleted below, after dropping the
                 * semaphore) and clear the profile's available bits.
                 */
                kobj = block_group->space_info->block_group_kobjs[index];
                block_group->space_info->block_group_kobjs[index] = NULL;
                clear_avail_alloc_bits(fs_info, block_group->flags);
        }
        up_write(&block_group->space_info->groups_sem);
        clear_incompat_bg_bits(fs_info, block_group->flags);
        if (kobj) {
                kobject_del(kobj);
                kobject_put(kobj);
        }

        /* Let any caching still in progress finish first. */
        if (block_group->cached == BTRFS_CACHE_STARTED)
                btrfs_wait_block_group_cache_done(block_group);

        write_lock(&fs_info->block_group_cache_lock);
        caching_ctl = btrfs_get_caching_control(block_group);
        if (!caching_ctl) {
                struct btrfs_caching_control *ctl;

                /*
                 * The block group no longer points to a caching control, but
                 * one for it may still sit on the caching block groups list.
                 */
                list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
                        if (ctl->block_group == block_group) {
                                caching_ctl = ctl;
                                refcount_inc(&caching_ctl->count);
                                break;
                        }
                }
        }
        if (caching_ctl)
                list_del_init(&caching_ctl->list);
        write_unlock(&fs_info->block_group_cache_lock);

        if (caching_ctl) {
                /* Once for the caching bgs list and once for us. */
                btrfs_put_caching_control(caching_ctl);
                btrfs_put_caching_control(caching_ctl);
        }

        spin_lock(&trans->transaction->dirty_bgs_lock);
        WARN_ON(!list_empty(&block_group->dirty_list));
        WARN_ON(!list_empty(&block_group->io_list));
        spin_unlock(&trans->transaction->dirty_bgs_lock);

        btrfs_remove_free_space_cache(block_group);

        spin_lock(&block_group->space_info->lock);
        list_del_init(&block_group->ro_list);

        /* Sanity check the counters before subtracting from them below. */
        if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
                WARN_ON(block_group->space_info->total_bytes
                        < block_group->length);
                WARN_ON(block_group->space_info->bytes_readonly
                        < block_group->length - block_group->zone_unusable);
                WARN_ON(block_group->space_info->bytes_zone_unusable
                        < block_group->zone_unusable);
                WARN_ON(block_group->space_info->disk_total
                        < block_group->length * factor);
        }
        block_group->space_info->total_bytes -= block_group->length;
        block_group->space_info->bytes_readonly -=
                (block_group->length - block_group->zone_unusable);
        block_group->space_info->bytes_zone_unusable -=
                block_group->zone_unusable;
        block_group->space_info->disk_total -= block_group->length * factor;

        spin_unlock(&block_group->space_info->lock);

        /*
         * Remove the free space for the block group from the free space tree
         * and the block group's item from the extent tree before marking the
         * block group as removed. This is to prevent races with tasks that
         * freeze and unfreeze a block group, this task and another task
         * allocating a new block group - the unfreeze task ends up removing
         * the block group's extent map before the task calling this function
         * deletes the block group item from the extent tree, allowing for
         * another task to attempt to create another block group with the same
         * item key (and failing with -EEXIST and a transaction abort).
         */
        ret = remove_block_group_free_space(trans, block_group);
        if (ret)
                goto out;

        ret = remove_block_group_item(trans, path, block_group);
        if (ret < 0)
                goto out;

        spin_lock(&block_group->lock);
        set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);

        /*
         * At this point trimming or scrub can't start on this block group,
         * because we removed the block group from the rbtree
         * fs_info->block_group_cache_tree so no one can find it anymore and
         * even if someone already got this block group before we removed it
         * from the rbtree, they have already incremented block_group->frozen -
         * if they didn't, for the trimming case they won't find any free space
         * entries because we already removed them all when we called
         * btrfs_remove_free_space_cache().
         *
         * And we must not remove the chunk map from the fs_info->mapping_tree
         * to prevent the same logical address range and physical device space
         * ranges from being reused for a new block group. This is needed to
         * avoid races with trimming and scrub.
         *
         * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
         * completely transactionless, so while it is trimming a range the
         * currently running transaction might finish and a new one start,
         * allowing for new block groups to be created that can reuse the same
         * physical device locations unless we take this special care.
         *
         * There may also be an implicit trim operation if the file system
         * is mounted with -odiscard. The same protections must remain
         * in place until the extents have been discarded completely when
         * the transaction commit has completed.
         */
        remove_map = (atomic_read(&block_group->frozen) == 0);
        spin_unlock(&block_group->lock);

        if (remove_map)
                btrfs_remove_chunk_map(fs_info, map);

out:
        /* Once for the lookup reference */
        btrfs_put_block_group(block_group);
        if (remove_rsv)
                btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
        btrfs_free_path(path);
        return ret;
}

/*
 * Start a transaction with enough metadata units reserved to remove the block
 * group whose chunk starts at @chunk_offset.
 *
 * The chunk map is looked up only to learn the number of stripes and is
 * released again before the transaction is started. Returns the result of
 * btrfs_start_transaction_fallback_global_rsv() on the block group root.
 */
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
                struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
        struct btrfs_root *bg_root = btrfs_block_group_root(fs_info);
        struct btrfs_chunk_map *map;
        unsigned int units;

        map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
        ASSERT(map != NULL);
        ASSERT(map->start == chunk_offset);

        /*
         * Removing a block group (see btrfs_remove_chunk() and
         * btrfs_remove_block_group()) needs 3 + N units from the metadata
         * space info:
         *
         * - 1 unit to add the free space inode's orphan item (tree of tree
         *   roots);
         * - 1 unit to delete the block group item (extent tree);
         * - 1 unit to delete the free space item (tree of tree roots);
         * - N units to delete the N device extent items, one per stripe
         *   (device tree).
         *
         * Units from the system space info are needed as well, to update the
         * chunk tree (update one or more device items and remove one chunk
         * item), but those are reserved by btrfs_remove_chunk() through a
         * call to check_system_chunk().
         */
        units = map->num_stripes + 3;
        btrfs_free_chunk_map(map);

        return btrfs_start_transaction_fallback_global_rsv(bg_root, units);
}

/*
 * Mark block group @cache read-only, so later write won't happen to block
 * group @cache.
 *
 * If @force is not set, this function will only mark the block group readonly
 * if we have enough free space (1M) in other metadata/system block groups.
 * If @force is set, this function will mark the block group readonly
 * without checking free space.
 *
 * NOTE: This function doesn't care if other block groups can contain all the
 * data in this block group. That check should be done by relocation routine,
 * not this function.
 */
static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
        struct btrfs_space_info *space_info = cache->space_info;
        u64 unused_bytes;
        int ret = -ENOSPC;

        spin_lock(&space_info->lock);
        spin_lock(&cache->lock);

        /* A non-zero swap_extents count refuses the switch with -ETXTBSY. */
        if (cache->swap_extents) {
                ret = -ETXTBSY;
                goto out;
        }

        /* Already read-only: only the RO counter needs to be bumped. */
        if (cache->ro) {
                cache->ro++;
                ret = 0;
                goto out;
        }

        /* Bytes of the block group that are not accounted anywhere else. */
        unused_bytes = cache->length - cache->reserved - cache->pinned -
                       cache->bytes_super - cache->zone_unusable - cache->used;

        if (force) {
                /* The caller asked us to skip the free space check. */
                ret = 0;
        } else if (!(space_info->flags & BTRFS_BLOCK_GROUP_DATA)) {
                /*
                 * Metadata is overcommitted, so ask btrfs_can_overcommit()
                 * and pass BTRFS_RESERVE_NO_FLUSH to get the most leeway for
                 * marking this block group read only.
                 */
                if (btrfs_can_overcommit(cache->fs_info, space_info,
                                         unused_bytes, BTRFS_RESERVE_NO_FLUSH))
                        ret = 0;
        } else {
                /*
                 * Data never overcommits, even in mixed mode, so a straight
                 * check against the allocated total is enough: after taking
                 * this block group's unused bytes away we must still fit.
                 */
                const u64 in_use = btrfs_space_info_used(space_info, true);

                if (in_use + unused_bytes <= space_info->total_bytes)
                        ret = 0;
        }

        if (ret)
                goto out;

        space_info->bytes_readonly += unused_bytes;
        if (btrfs_is_zoned(cache->fs_info)) {
                /* Migrate zone_unusable bytes to readonly */
                space_info->bytes_readonly += cache->zone_unusable;
                space_info->bytes_zone_unusable -= cache->zone_unusable;
                cache->zone_unusable = 0;
        }
        cache->ro++;
        list_add_tail(&cache->ro_list, &space_info->ro_bgs);

out:
        spin_unlock(&cache->lock);
        spin_unlock(&space_info->lock);

        /* Only report ENOSPC failures, and only when ENOSPC_DEBUG is set. */
        if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
                btrfs_info(cache->fs_info,
                        "unable to make block group %llu ro", cache->start);
                btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
        }
        return ret;
}

/*
 * Clear EXTENT_DIRTY for the whole range of @bg from the pinned_extents of the
 * transaction behind @trans and, when there is one, of the transaction that
 * precedes it in fs_info->trans_list.
 *
 * Returns true if every clear_extent_bits() call returned 0, false otherwise.
 */
static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
                                 struct btrfs_block_group *bg)
{
        struct btrfs_fs_info *fs_info = bg->fs_info;
        struct btrfs_transaction *older = NULL;
        const u64 first = bg->start;
        const u64 last = first + bg->length - 1;
        int ret = 0;

        /* Grab a reference on the previous transaction, if any. */
        spin_lock(&fs_info->trans_lock);
        if (trans->transaction->list.prev != &fs_info->trans_list) {
                older = list_last_entry(&trans->transaction->list,
                                        struct btrfs_transaction, list);
                refcount_inc(&older->use_count);
        }
        spin_unlock(&fs_info->trans_lock);

        /*
         * unused_bg_unpin_mutex keeps us from racing with
         * btrfs_finish_extent_commit(). While we are at transaction N, another
         * task may be running finish_extent_commit() for transaction N - 1 and
         * may already have seen a range of this block group in pinned_extents
         * before we cleared the whole block group range. Without the mutex
         * that task could look up the block group after we unpinned and
         * removed it, which ends in an error at unpin_extent_range().
         */
        mutex_lock(&fs_info->unused_bg_unpin_mutex);
        if (older)
                ret = clear_extent_bits(&older->pinned_extents, first, last,
                                        EXTENT_DIRTY);
        /* Only touch the current transaction if the previous step was fine. */
        if (!ret)
                ret = clear_extent_bits(&trans->transaction->pinned_extents,
                                        first, last, EXTENT_DIRTY);
        mutex_unlock(&fs_info->unused_bg_unpin_mutex);

        if (older)
                btrfs_put_transaction(older);

        return !ret;
}

/*
 * Process the unused_bgs list and remove any block groups that don't have any
 * allocated space inside of them.
 *
 * Does nothing if the filesystem is not open, is closing, or if
 * fs_info->reclaim_bgs_lock cannot be taken without blocking. Block groups
 * that cannot be deleted right now because outstanding reservations might
 * still need them are put back on fs_info->unused_bgs before returning, so
 * that they are retried on a later call.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
        LIST_HEAD(retry_list);
        struct btrfs_block_group *block_group;
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
        int ret = 0;

        if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
                return;

        if (btrfs_fs_closing(fs_info))
                return;

        /*
         * Long running balances can keep us blocked here for eternity, so
         * simply skip deletion if we're unable to get the mutex.
         */
        if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
                return;

        spin_lock(&fs_info->unused_bgs_lock);
        while (!list_empty(&fs_info->unused_bgs)) {
                u64 used;
                int trimming;

                block_group = list_first_entry(&fs_info->unused_bgs,
                                               struct btrfs_block_group,
                                               bg_list);
                list_del_init(&block_group->bg_list);

                space_info = block_group->space_info;

                /*
                 * After an error (ret != 0) or for mixed space infos, just
                 * drain the list: drop the list's reference and move on.
                 */
                if (ret || btrfs_mixed_space_info(space_info)) {
                        btrfs_put_block_group(block_group);
                        continue;
                }
                spin_unlock(&fs_info->unused_bgs_lock);

                btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);

                /* Don't want to race with allocators so take the groups_sem */
                down_write(&space_info->groups_sem);

                /*
                 * Async discard moves the final block group discard to be prior
                 * to the unused_bgs code path.  Therefore, if it's not fully
                 * trimmed, punt it back to the async discard lists.
                 */
                if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
                    !btrfs_is_free_space_trimmed(block_group)) {
                        trace_btrfs_skip_unused_block_group(block_group);
                        up_write(&space_info->groups_sem);
                        /* Requeue if we failed because of async discard */
                        btrfs_discard_queue_work(&fs_info->discard_ctl,
                                                 block_group);
                        goto next;
                }

                spin_lock(&space_info->lock);
                spin_lock(&block_group->lock);
                if (btrfs_is_block_group_used(block_group) || block_group->ro ||
                    list_is_singular(&block_group->list)) {
                        /*
                         * We want to bail if we made new allocations or have
                         * outstanding allocations in this block group.  We do
                         * the ro check in case balance is currently acting on
                         * this block group.
                         *
                         * Also bail out if this is the only block group for its
                         * type, because otherwise we would lose profile
                         * information from fs_info->avail_*_alloc_bits and the
                         * next block group of this type would be created with a
                         * "single" profile (even if we're in a raid fs) because
                         * fs_info->avail_*_alloc_bits would be 0.
                         */
                        trace_btrfs_skip_unused_block_group(block_group);
                        spin_unlock(&block_group->lock);
                        spin_unlock(&space_info->lock);
                        up_write(&space_info->groups_sem);
                        goto next;
                }

                /*
                 * The block group may be unused but there may be space reserved
                 * accounting with the existence of that block group, that is,
                 * space_info->bytes_may_use was incremented by a task but no
                 * space was yet allocated from the block group by the task.
                 * That space may or may not be allocated, as we are generally
                 * pessimistic about space reservation for metadata as well as
                 * for data when using compression (as we reserve space based on
                 * the worst case, when data can't be compressed, and before
                 * actually attempting compression, before starting writeback).
                 *
                 * So check if the total space of the space_info minus the size
                 * of this block group is less than the used space of the
                 * space_info - if that's the case, then it means we have tasks
                 * that might be relying on the block group in order to allocate
                 * extents, and add back the block group to the unused list when
                 * we finish, so that we retry later in case no tasks ended up
                 * needing to allocate extents from the block group.
                 */
                used = btrfs_space_info_used(space_info, true);
                if (space_info->total_bytes - block_group->length < used &&
                    block_group->zone_unusable < block_group->length) {
                        trace_btrfs_skip_unused_block_group(block_group);
                        spin_unlock(&block_group->lock);
                        spin_unlock(&space_info->lock);
                        up_write(&space_info->groups_sem);

                        /*
                         * Queue the block group for a retry. Since we dropped
                         * fs_info->unused_bgs_lock after unlinking it, another
                         * task may already have linked bg_list into one of the
                         * fs_info lists again (taking its own reference), so
                         * only link it if it is still unlinked, and do so
                         * under the lock that protects bg_list. The reference
                         * taken here is for the list and compensates for the
                         * ref drop under the "next" label.
                         */
                        spin_lock(&fs_info->unused_bgs_lock);
                        if (list_empty(&block_group->bg_list)) {
                                btrfs_get_block_group(block_group);
                                list_add_tail(&block_group->bg_list,
                                              &retry_list);
                        }
                        spin_unlock(&fs_info->unused_bgs_lock);
                        goto next;
                }

                spin_unlock(&block_group->lock);
                spin_unlock(&space_info->lock);

                /* We don't want to force the issue, only flip if it's ok. */
                ret = inc_block_group_ro(block_group, 0);
                up_write(&space_info->groups_sem);
                if (ret < 0) {
                        /* Not being able to go RO is not fatal, keep going. */
                        ret = 0;
                        goto next;
                }

                ret = btrfs_zone_finish(block_group);
                if (ret < 0) {
                        btrfs_dec_block_group_ro(block_group);
                        /* -EAGAIN is not treated as an error for the loop. */
                        if (ret == -EAGAIN)
                                ret = 0;
                        goto next;
                }

                /*
                 * Want to do this before we do anything else so we can recover
                 * properly if we fail to join the transaction.
                 */
                trans = btrfs_start_trans_remove_block_group(fs_info,
                                                     block_group->start);
                if (IS_ERR(trans)) {
                        btrfs_dec_block_group_ro(block_group);
                        ret = PTR_ERR(trans);
                        goto next;
                }

                /*
                 * We could have pending pinned extents for this block group,
                 * just delete them, we don't care about them anymore.
                 */
                if (!clean_pinned_extents(trans, block_group)) {
                        btrfs_dec_block_group_ro(block_group);
                        goto end_trans;
                }

                /*
                 * At this point, the block_group is read only and should fail
                 * new allocations.  However, btrfs_finish_extent_commit() can
                 * cause this block_group to be placed back on the discard
                 * lists because now the block_group isn't fully discarded.
                 * Bail here and try again later after discarding everything.
                 */
                spin_lock(&fs_info->discard_ctl.lock);
                if (!list_empty(&block_group->discard_list)) {
                        spin_unlock(&fs_info->discard_ctl.lock);
                        btrfs_dec_block_group_ro(block_group);
                        btrfs_discard_queue_work(&fs_info->discard_ctl,
                                                 block_group);
                        goto end_trans;
                }
                spin_unlock(&fs_info->discard_ctl.lock);

                /* Reset pinned so btrfs_put_block_group doesn't complain */
                spin_lock(&space_info->lock);
                spin_lock(&block_group->lock);

                btrfs_space_info_update_bytes_pinned(fs_info, space_info,
                                                     -block_group->pinned);
                space_info->bytes_readonly += block_group->pinned;
                block_group->pinned = 0;

                spin_unlock(&block_group->lock);
                spin_unlock(&space_info->lock);

                /*
                 * The normal path here is an unused block group is passed here,
                 * then trimming is handled in the transaction commit path.
                 * Async discard interposes before this to do the trimming
                 * before coming down the unused block group path as trimming
                 * will no longer be done later in the transaction commit path.
                 */
                if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
                        goto flip_async;

                /*
                 * DISCARD can flip during remount. On zoned filesystems, we
                 * need to reset sequential-required zones.
                 */
                trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
                                btrfs_is_zoned(fs_info);

                /* Implicit trim during transaction commit. */
                if (trimming)
                        btrfs_freeze_block_group(block_group);

                /*
                 * Btrfs_remove_chunk will abort the transaction if things go
                 * horribly wrong.
                 */
                ret = btrfs_remove_chunk(trans, block_group->start);

                if (ret) {
                        if (trimming)
                                btrfs_unfreeze_block_group(block_group);
                        goto end_trans;
                }

                /*
                 * If we're not mounted with -odiscard, we can just forget
                 * about this block group. Otherwise we'll need to wait
                 * until transaction commit to do the actual discard.
                 */
                if (trimming) {
                        spin_lock(&fs_info->unused_bgs_lock);
                        /*
                         * A concurrent scrub might have added us to the list
                         * fs_info->unused_bgs, so use a list_move operation
                         * to add the block group to the deleted_bgs list.
                         */
                        list_move(&block_group->bg_list,
                                  &trans->transaction->deleted_bgs);
                        spin_unlock(&fs_info->unused_bgs_lock);
                        btrfs_get_block_group(block_group);
                }
end_trans:
                btrfs_end_transaction(trans);
next:
                /* Drop the reference that was held by the unused_bgs list. */
                btrfs_put_block_group(block_group);
                spin_lock(&fs_info->unused_bgs_lock);
        }
        /* Put the block groups we want to retry back on the unused list. */
        list_splice_tail(&retry_list, &fs_info->unused_bgs);
        spin_unlock(&fs_info->unused_bgs_lock);
        mutex_unlock(&fs_info->reclaim_bgs_lock);
        return;

flip_async:
        /*
         * Async discard got enabled while we were running: stop here and hand
         * the remaining unused block groups over to the discard code.
         */
        btrfs_end_transaction(trans);
        spin_lock(&fs_info->unused_bgs_lock);
        list_splice_tail(&retry_list, &fs_info->unused_bgs);
        spin_unlock(&fs_info->unused_bgs_lock);
        mutex_unlock(&fs_info->reclaim_bgs_lock);
        btrfs_put_block_group(block_group);
        btrfs_discard_punt_unused_bgs_list(fs_info);
}

/*
 * Queue @bg on fs_info->unused_bgs.
 *
 * If @bg is not linked into any bg_list based list yet, a reference is taken
 * for the list and @bg is appended. If it is already linked and does not have
 * BLOCK_GROUP_FLAG_NEW set, it is moved to the tail of unused_bgs without
 * taking another reference. Otherwise it is left where it is.
 */
void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{
        struct btrfs_fs_info *fs_info = bg->fs_info;
        struct list_head *unused = &fs_info->unused_bgs;

        spin_lock(&fs_info->unused_bgs_lock);
        if (!list_empty(&bg->bg_list)) {
                if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
                        /* Pull out the block group from the reclaim_bgs list. */
                        trace_btrfs_add_unused_block_group(bg);
                        list_move_tail(&bg->bg_list, unused);
                }
        } else {
                btrfs_get_block_group(bg);
                trace_btrfs_add_unused_block_group(bg);
                list_add_tail(&bg->bg_list, unused);
        }
        spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * We want block groups with a low number of used bytes to be in the beginning
 * of the list, so they will get reclaimed first.
 */
static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
                           const struct list_head *b)
{
        const struct btrfs_block_group *lhs =
                list_entry(a, struct btrfs_block_group, bg_list);
        const struct btrfs_block_group *rhs =
                list_entry(b, struct btrfs_block_group, bg_list);

        /* 1 when @a has more used bytes than @b (sorts after it), else 0. */
        if (lhs->used > rhs->used)
                return 1;
        return 0;
}

/*
 * Tell whether block group reclaim should run at all: zoned filesystems defer
 * to btrfs_zoned_should_reclaim(), every other filesystem always says yes.
 */
static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
{
        if (!btrfs_is_zoned(fs_info))
                return true;

        return btrfs_zoned_should_reclaim(fs_info);
}

/*
 * Check whether freeing @bytes_freed bytes took @bg from at or above its
 * space_info's reclaim threshold to below it.
 *
 * The threshold is bg_reclaim_threshold percent of the block group length; a
 * threshold of 0 never triggers reclaim.
 */
static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
{
        const int pct = READ_ONCE(bg->space_info->bg_reclaim_threshold);
        const u64 used_after = bg->used;
        const u64 used_before = used_after + bytes_freed;
        u64 limit;

        if (!pct)
                return false;

        limit = mult_perc(bg->length, pct);

        /*
         * A block group that was already below the threshold before the free
         * is likely brand new, and we don't want to relocate new block groups.
         * One that is still at or above the threshold is not a candidate yet.
         */
        return used_before >= limit && used_after < limit;
}

void btrfs_reclaim_bgs_work(struct work_struct *work)
{
        struct btrfs_fs_info *fs_info =
                container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
        struct btrfs_block_group *bg;
        struct btrfs_space_info *space_info;
        LIST_HEAD(retry_list);

        if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
                return;

        if (btrfs_fs_closing(fs_info))
                return;

        if (!btrfs_should_reclaim(fs_info))
                return;

        sb_start_write(fs_info->sb);

        if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
                sb_end_write(fs_info->sb);
                return;
        }

        /*
         * Long running balances can keep us blocked here for eternity, so
         * simply skip reclaim if we're unable to get the mutex.
         */
        if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
                btrfs_exclop_finish(fs_info);
                sb_end_write(fs_info->sb);
                return;
        }

        spin_lock(&fs_info->unused_bgs_lock);
        /*
         * Sort happens under lock because we can't simply splice it and sort.
         * The block groups might still be in use and reachable via bg_list,
         * and their presence in the reclaim_bgs list must be preserved.
         */
        list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
        while (!list_empty(&fs_info->reclaim_bgs)) {
                u64 zone_unusable;
                int ret = 0;

                bg = list_first_entry(&fs_info->reclaim_bgs,
                                      struct btrfs_block_group,
                                      bg_list);
                list_del_init(&bg->bg_list);

                space_info = bg->space_info;
                spin_unlock(&fs_info->unused_bgs_lock);

                /* Don't race with allocators so take the groups_sem */
                down_write(&space_info->groups_sem);

                spin_lock(&bg->lock);
                if (bg->reserved || bg->pinned || bg->ro) {
                        /*
                         * We want to bail if we made new allocations or have
                         * outstanding allocations in this block group.  We do
                         * the ro check in case balance is currently acting on
                         * this block group.
                         */
                        spin_unlock(&bg->lock);
                        up_write(&space_info->groups_sem);
                        goto next;
                }
                if (bg->used == 0) {
                        /*
                         * It is possible that we trigger relocation on a block
                         * group as its extents are deleted and it first goes
                         * below the threshold, then shortly after goes empty.
                         *
                         * In this case, relocating it does delete it, but has
                         * some overhead in relocation specific metadata, looking
                         * for the non-existent extents and running some extra
                         * transactions, which we can avoid by using one of the
                         * other mechanisms for dealing with empty block groups.
                         */
                        if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
                                btrfs_mark_bg_unused(bg);
                        spin_unlock(&bg->lock);
                        up_write(&space_info->groups_sem);
                        goto next;

                }
                /*
                 * The block group might no longer meet the reclaim condition by
                 * the time we get around to reclaiming it, so to avoid
                 * reclaiming overly full block_groups, skip reclaiming them.
                 *
                 * Since the decision making process also depends on the amount
                 * being freed, pass in a fake giant value to skip that extra
                 * check, which is more meaningful when adding to the list in
                 * the first place.
                 */
                if (!should_reclaim_block_group(bg, bg->length)) {
                        spin_unlock(&bg->lock);
                        up_write(&space_info->groups_sem);
                        goto next;
                }
                spin_unlock(&bg->lock);

                /*
                 * Get out fast, in case we're read-only or unmounting the
                 * filesystem. It is OK to drop block groups from the list even
                 * for the read-only case. As we did sb_start_write(),
                 * "mount -o remount,ro" won't happen and read-only filesystem
                 * means it is forced read-only due to a fatal error. So, it
                 * never gets back to read-write to let us reclaim again.
                 */
                if (btrfs_need_cleaner_sleep(fs_info)) {
                        up_write(&space_info->groups_sem);
                        goto next;
                }

                /*
                 * Cache the zone_unusable value before turning the block group
                 * to read only. As soon as the blog group is read only it's
                 * zone_unusable value gets moved to the block group's read-only
                 * bytes and isn't available for calculations anymore.
                 */
                zone_unusable = bg->zone_unusable;
                ret = inc_block_group_ro(bg, 0);
                up_write(&space_info->groups_sem);
                if (ret < 0)
                        goto next;

                btrfs_info(fs_info,
                        "reclaiming chunk %llu with %llu%% used %llu%% unusable",
                                bg->start,
                                div64_u64(bg->used * 100, bg->length),
                                div64_u64(zone_unusable * 100, bg->length));
                trace_btrfs_reclaim_block_group(bg);
                ret = btrfs_relocate_chunk(fs_info, bg->start);
                if (ret) {
                        btrfs_dec_block_group_ro(bg);
                        btrfs_err(fs_info, "error relocating chunk %llu",
                                  bg->start);
                }

next:
                if (ret) {
                        /* Refcount held by the reclaim_bgs list after splice. */
                        btrfs_get_block_group(bg);
                        list_add_tail(&bg->bg_list, &retry_list);
                }
                btrfs_put_block_group(bg);

                mutex_unlock(&fs_info->reclaim_bgs_lock);
                /*
                 * Reclaiming all the block groups in the list can take really
                 * long.  Prioritize cleaning up unused block groups.
                 */
                btrfs_delete_unused_bgs(fs_info);
                /*
                 * If we are interrupted by a balance, we can just bail out. The
                 * cleaner thread restart again if necessary.
                 */
                if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
                        goto end;
                spin_lock(&fs_info->unused_bgs_lock);
        }
        spin_unlock(&fs_info->unused_bgs_lock);
        mutex_unlock(&fs_info->reclaim_bgs_lock);
end:
        spin_lock(&fs_info->unused_bgs_lock);
        list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
        spin_unlock(&fs_info->unused_bgs_lock);
        btrfs_exclop_finish(fs_info);
        sb_end_write(fs_info->sb);
}

/*
 * Queue the reclaim work on system_unbound_wq if at least one block group is
 * waiting on fs_info->reclaim_bgs.
 */
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
{
        spin_lock(&fs_info->unused_bgs_lock);
        if (list_empty(&fs_info->reclaim_bgs))
                goto unlock;

        queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
unlock:
        spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * Append @bg to fs_info->reclaim_bgs, taking a reference for the list, unless
 * its bg_list node is already linked into some list.
 */
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
{
        struct btrfs_fs_info *fs_info = bg->fs_info;

        spin_lock(&fs_info->unused_bgs_lock);
        if (!list_empty(&bg->bg_list))
                goto out_unlock;

        btrfs_get_block_group(bg);
        trace_btrfs_add_reclaim_block_group(bg);
        list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
out_unlock:
        spin_unlock(&fs_info->unused_bgs_lock);
}

static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
                           struct btrfs_path *path)
{
        struct btrfs_chunk_map *map;
        struct btrfs_block_group_item bg;
        struct extent_buffer *leaf;
        int slot;
        u64 flags;
        int ret = 0;

        slot = path->slots[0];
        leaf = path->nodes[0];

        map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
        if (!map) {
                btrfs_err(fs_info,
                          "logical %llu len %llu found bg but no related chunk",
                          key->objectid, key->offset);
                return -ENOENT;
        }

        if (map->start != key->objectid || map->chunk_len != key->offset) {
                btrfs_err(fs_info,
                        "block group %llu len %llu mismatch with chunk %llu len %llu",
                          key->objectid, key->offset, map->start, map->chunk_len);
                ret = -EUCLEAN;
                goto out_free_map;
        }

        read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
                           sizeof(bg));
        flags = btrfs_stack_block_group_flags(&bg) &
                BTRFS_BLOCK_GROUP_TYPE_MASK;

        if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
                btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
                          key->objectid, key->offset, flags,
                          (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
                ret = -EUCLEAN;
        }

out_free_map:
        btrfs_free_chunk_map(map);
        return ret;
}

static int find_first_block_group(struct btrfs_fs_info *fs_info,
                                  struct btrfs_path *path,
                                  struct btrfs_key *key)
{
        struct btrfs_root *root = btrfs_block_group_root(fs_info);
        int ret;
        struct btrfs_key found_key;

        btrfs_for_each_slot(root, key, &found_key, path, ret) {
                if (found_key.objectid >= key->objectid &&
                    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        return read_bg_from_eb(fs_info, &found_key, path);
                }
        }
        return ret;
}

/*
 * Merge the extended profile bits derived from @flags into the per-type
 * "available allocation bits" of @fs_info.
 *
 * Each of the data, metadata and system masks is updated only when the
 * matching block group type bit is set in @flags.  All updates happen under
 * the write side of fs_info->profiles_lock.
 */
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
        const u64 profile_bits = chunk_to_extended(flags) &
                                 BTRFS_EXTENDED_PROFILE_MASK;

        write_seqlock(&fs_info->profiles_lock);

        if (flags & BTRFS_BLOCK_GROUP_DATA)
                fs_info->avail_data_alloc_bits |= profile_bits;

        if (flags & BTRFS_BLOCK_GROUP_METADATA)
                fs_info->avail_metadata_alloc_bits |= profile_bits;

        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
                fs_info->avail_system_alloc_bits |= profile_bits;

        write_sequnlock(&fs_info->profiles_lock);
}

/*
 * Map a physical disk address to a list of logical addresses.
 *
 * @fs_info:       the filesystem
 * @chunk_start:   logical address of block group
 * @physical:      physical address to map to logical addresses
 * @logical:       return array of logical addresses which map to @physical
 * @naddrs:        length of @logical
 * @stripe_len:    size of IO stripe for the given block group
 *
 * Maps a particular @physical disk address to a list of @logical addresses.
 * Used primarily to exclude those portions of a block group that contain super
 * block copies.
 *
 * Return: 0 on success, -EIO if no chunk map could be obtained for
 * @chunk_start, -ENOMEM if the result array could not be allocated.
 *
 * The output parameters are written only on success.  On success @logical
 * points to a kcalloc()'ed array with room for one entry per stripe of the
 * chunk (of which @naddrs are valid); the caller is responsible for
 * kfree()'ing it, even when @naddrs is 0.
 */
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
                     u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
        struct btrfs_chunk_map *map;
        u64 *buf;
        u64 bytenr;
        u64 data_stripe_length;
        u64 io_stripe_size;
        int i, nr = 0;
        int ret = 0;

        map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
        /* Any lookup failure is reported as -EIO, not as PTR_ERR(map). */
        if (IS_ERR(map))
                return -EIO;

        data_stripe_length = map->stripe_size;
        io_stripe_size = BTRFS_STRIPE_LEN;
        /* From here on work with the start of the chunk that was found. */
        chunk_start = map->start;

        /* For RAID5/6 adjust to a full IO stripe length */
        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map));

        /* Worst case: every stripe yields a distinct logical address. */
        buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
        if (!buf) {
                ret = -ENOMEM;
                goto out;
        }

        for (i = 0; i < map->num_stripes; i++) {
                bool already_inserted = false;
                u32 stripe_nr;
                u32 offset;
                int j;

                /* Only stripes whose physical range contains @physical matter. */
                if (!in_range(physical, map->stripes[i].physical,
                              data_stripe_length))
                        continue;

                /*
                 * Split the distance from the start of this stripe into a
                 * stripe number (shift) and an offset within it (mask).
                 */
                stripe_nr = (physical - map->stripes[i].physical) >>
                            BTRFS_STRIPE_LEN_SHIFT;
                offset = (physical - map->stripes[i].physical) &
                         BTRFS_STRIPE_LEN_MASK;

                if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                                 BTRFS_BLOCK_GROUP_RAID10))
                        stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
                                            map->sub_stripes);
                /*
                 * RAID56 needs no adjustment of stripe_nr here: io_stripe_size
                 * was already set from nr_data_stripes() above, so the
                 * multiplication below uses the RAID56-specific stripe size.
                 */
                bytenr = chunk_start + stripe_nr * io_stripe_size + offset;

                /* Ensure we don't add duplicate addresses */
                for (j = 0; j < nr; j++) {
                        if (buf[j] == bytenr) {
                                already_inserted = true;
                                break;
                        }
                }

                if (!already_inserted)
                        buf[nr++] = bytenr;
        }

        *logical = buf;
        *naddrs = nr;
        *stripe_len = io_stripe_size;
out:
        btrfs_free_chunk_map(map);
        return ret;
}

/*
 * Exclude from @cache every range that holds a super block copy and add the
 * size of each excluded range to cache->bytes_super.
 *
 * Excluded ranges are recorded as EXTENT_UPTODATE in
 * fs_info->excluded_extents.  On a zoned filesystem, finding any super block
 * copy inside the block group is reported as -EUCLEAN.
 *
 * Returns 0 on success or a negative errno.  Ranges excluded before a failure
 * are left in place.
 */
static int exclude_super_stripes(struct btrfs_block_group *cache)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        const bool zoned = btrfs_is_zoned(fs_info);
        u64 *addrs;
        int stripe_len;
        int nr_addrs;
        int mirror;
        int ret;

        /* The range from the block group start up to the first super block. */
        if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
                cache->bytes_super += stripe_len;
                ret = set_extent_bit(&fs_info->excluded_extents, cache->start,
                                     cache->start + stripe_len - 1,
                                     EXTENT_UPTODATE, NULL);
                if (ret)
                        return ret;
        }

        for (mirror = 0; mirror < BTRFS_SUPER_MIRROR_MAX; mirror++) {
                int idx;

                ret = btrfs_rmap_block(fs_info, cache->start,
                                       btrfs_sb_offset(mirror), &addrs,
                                       &nr_addrs, &stripe_len);
                if (ret)
                        return ret;

                /* Shouldn't have super stripes in sequential zones */
                if (zoned && nr_addrs) {
                        kfree(addrs);
                        btrfs_err(fs_info,
                        "zoned: block group %llu must not contain super block",
                                  cache->start);
                        return -EUCLEAN;
                }

                /* Walk the returned addresses from the last one to the first. */
                for (idx = nr_addrs - 1; idx >= 0; idx--) {
                        /* Never exclude past the end of the block group. */
                        u64 len = min_t(u64, stripe_len,
                                cache->start + cache->length - addrs[idx]);

                        cache->bytes_super += len;
                        ret = set_extent_bit(&fs_info->excluded_extents,
                                             addrs[idx],
                                             addrs[idx] + len - 1,
                                             EXTENT_UPTODATE, NULL);
                        if (ret)
                                break;
                }

                kfree(addrs);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * Allocate a block group structure for the logical address @start together
 * with its free space control structure, and initialise the reference count
 * (to 1), locks, list heads and free space control.
 *
 * Returns the new block group, or NULL if either allocation fails.
 */
static struct btrfs_block_group *btrfs_create_block_group_cache(
                struct btrfs_fs_info *fs_info, u64 start)
{
        struct btrfs_block_group *bg;

        bg = kzalloc(sizeof(*bg), GFP_NOFS);
        if (!bg)
                return NULL;

        bg->free_space_ctl = kzalloc(sizeof(*bg->free_space_ctl), GFP_NOFS);
        if (!bg->free_space_ctl)
                goto free_bg;

        /* Identity of the block group. */
        bg->start = start;
        bg->fs_info = fs_info;
        bg->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
        bg->discard_index = BTRFS_DISCARD_INDEX_UNUSED;

        /* Reference count and locks. */
        refcount_set(&bg->refs, 1);
        spin_lock_init(&bg->lock);
        init_rwsem(&bg->data_rwsem);

        /* The block group starts out on none of its lists. */
        INIT_LIST_HEAD(&bg->list);
        INIT_LIST_HEAD(&bg->cluster_list);
        INIT_LIST_HEAD(&bg->bg_list);
        INIT_LIST_HEAD(&bg->ro_list);
        INIT_LIST_HEAD(&bg->discard_list);
        INIT_LIST_HEAD(&bg->dirty_list);
        INIT_LIST_HEAD(&bg->io_list);
        INIT_LIST_HEAD(&bg->active_bg_list);

        btrfs_init_free_space_ctl(bg, bg->free_space_ctl);
        atomic_set(&bg->frozen, 0);
        mutex_init(&bg->free_space_lock);

        return bg;

free_bg:
        kfree(bg);
        return NULL;
}

/*
 * Walk every chunk map in ascending logical order and make sure a block group
 * with the same start, length and type flags exists for it.
 *
 * Returns 0 if all chunks match, -EUCLEAN on the first chunk that has no
 * block group or whose block group disagrees with it.
 */
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{
        u64 cursor = 0;
        int ret = 0;

        for (;;) {
                struct btrfs_chunk_map *map;
                struct btrfs_block_group *bg;

                /*
                 * A length of 1 is sufficient: the lookup returns the first
                 * chunk map that intersects the given range.
                 */
                map = btrfs_find_chunk_map(fs_info, cursor, 1);
                if (!map)
                        break;

                bg = btrfs_lookup_block_group(fs_info, map->start);
                if (!bg) {
                        btrfs_err(fs_info,
        "chunk start=%llu len=%llu doesn't have corresponding block group",
                                     map->start, map->chunk_len);
                        ret = -EUCLEAN;
                } else if (bg->start != map->start ||
                           bg->length != map->chunk_len ||
                           (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
                           (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
                        btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
                                map->start, map->chunk_len,
                                map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
                                bg->start, bg->length,
                                bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
                        ret = -EUCLEAN;
                } else {
                        /* Continue right after the chunk just verified. */
                        cursor = map->start + map->chunk_len;
                }

                /* Drop the references taken by the two lookups above. */
                btrfs_free_chunk_map(map);
                if (bg)
                        btrfs_put_block_group(bg);

                if (ret)
                        break;
        }

        return ret;
}

/*
 * Build the in-memory block group for one on-disk block group item and
 * register it with the filesystem.
 *
 * @info:        the filesystem
 * @bgi:         copy of the on-disk block group item
 * @key:         key of the item; objectid is the block group start and offset
 *               its length
 * @need_clear:  non-zero if the on-disk free space cache must be cleared
 *
 * Returns 0 on success or a negative errno.  On failure the reference on the
 * newly created block group is dropped before returning.
 */
static int read_one_block_group(struct btrfs_fs_info *info,
                                struct btrfs_block_group_item *bgi,
                                const struct btrfs_key *key,
                                int need_clear)
{
        struct btrfs_block_group *cache;
        const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
        int ret;

        ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);

        cache = btrfs_create_block_group_cache(info, key->objectid);
        if (!cache)
                return -ENOMEM;

        /* Populate the in-memory block group from the key and the item. */
        cache->length = key->offset;
        cache->used = btrfs_stack_block_group_used(bgi);
        cache->commit_used = cache->used;
        cache->flags = btrfs_stack_block_group_flags(bgi);
        cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);

        set_free_space_tree_thresholds(cache);

        if (need_clear) {
                /*
                 * When we mount with old space cache, we need to
                 * set BTRFS_DC_CLEAR and set dirty flag.
                 *
                 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
                 *    truncate the old free space cache inode and
                 *    setup a new one.
                 * b) Setting 'dirty flag' makes sure that we flush
                 *    the new space cache info onto disk.
                 *
                 * NOTE(review): only BTRFS_DC_CLEAR is set in this function;
                 * no dirty flag is set here - confirm where (b) happens.
                 */
                if (btrfs_test_opt(info, SPACE_CACHE))
                        cache->disk_cache_state = BTRFS_DC_CLEAR;
        }
        /*
         * A block group carrying both DATA and METADATA is only valid when the
         * MIXED_GROUPS incompat feature is enabled.
         */
        if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
            (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
                        btrfs_err(info,
"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
                                  cache->start);
                        ret = -EINVAL;
                        goto error;
        }

        ret = btrfs_load_block_group_zone_info(cache, false);
        if (ret) {
                btrfs_err(info, "zoned: failed to load zone info of bg %llu",
                          cache->start);
                goto error;
        }

        /*
         * We need to exclude the super stripes now so that the space info has
         * super bytes accounted for, otherwise we'll think we have more space
         * than we actually do.
         */
        ret = exclude_super_stripes(cache);
        if (ret) {
                /* We may have excluded something, so call this just in case. */
                btrfs_free_excluded_extents(cache);
                goto error;
        }

        /*
         * For zoned filesystem, space after the allocation offset is the only
         * free space for a block group. So, we don't need any caching work.
         * btrfs_calc_zone_unusable() will set the amount of free space and
         * zone_unusable space.
         *
         * For regular filesystem, check for two cases, either we are full, and
         * therefore don't need to bother with the caching work since we won't
         * find any space, or we are empty, and we can just add all the space
         * in and be done with it.  This saves us _a_lot_ of time, particularly
         * in the full case.
         */
        if (btrfs_is_zoned(info)) {
                btrfs_calc_zone_unusable(cache);
                /* Should not have any excluded extents. Just in case, though. */
                btrfs_free_excluded_extents(cache);
        } else if (cache->length == cache->used) {
                cache->cached = BTRFS_CACHE_FINISHED;
                btrfs_free_excluded_extents(cache);
        } else if (cache->used == 0) {
                cache->cached = BTRFS_CACHE_FINISHED;
                ret = btrfs_add_new_free_space(cache, cache->start,
                                               cache->start + cache->length, NULL);
                /* The excluded extents are released whether or not the add failed. */
                btrfs_free_excluded_extents(cache);
                if (ret)
                        goto error;
        }

        ret = btrfs_add_block_group_cache(info, cache);
        if (ret) {
                btrfs_remove_free_space_cache(cache);
                goto error;
        }
        trace_btrfs_add_block_group(info, cache, 0);
        btrfs_add_bg_to_space_info(info, cache);

        set_avail_alloc_bits(info, cache->flags);
        if (btrfs_chunk_writeable(info, cache->start)) {
                /*
                 * An empty, writeable block group is either queued for async
                 * discard or marked unused, depending on the mount option.
                 */
                if (cache->used == 0) {
                        ASSERT(list_empty(&cache->bg_list));
                        if (btrfs_test_opt(info, DISCARD_ASYNC))
                                btrfs_discard_queue_work(&info->discard_ctl, cache);
                        else
                                btrfs_mark_bg_unused(cache);
                }
        } else {
                /* The chunk is not writeable, so mark the block group read-only. */
                inc_block_group_ro(cache, 1);
        }

        return 0;
error:
        btrfs_put_block_group(cache);
        return ret;
}

/*
 * Create a placeholder block group for every chunk in the mapping tree.
 *
 * Each placeholder copies the chunk's start, length and type and is marked as
 * completely used and fully cached.  Chunks that already have a block group
 * registered (-EEXIST from btrfs_add_block_group_cache()) are skipped.
 *
 * Returns 0 on success, in which case the global block reserve is initialised
 * as well, or a negative errno on the first failure.
 */
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
        struct rb_node *node;
        int ret = 0;

        for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
                struct btrfs_chunk_map *map;
                struct btrfs_block_group *bg;

                map = rb_entry(node, struct btrfs_chunk_map, rb_node);
                bg = btrfs_create_block_group_cache(fs_info, map->start);
                if (!bg) {
                        ret = -ENOMEM;
                        break;
                }

                /* Fill dummy cache as FULL */
                bg->length = map->chunk_len;
                bg->flags = map->type;
                bg->cached = BTRFS_CACHE_FINISHED;
                bg->used = map->chunk_len;
                ret = btrfs_add_block_group_cache(fs_info, bg);
                /*
                 * We may have some valid block group cache added already, in
                 * that case we skip to the next one.
                 */
                if (ret == -EEXIST) {
                        ret = 0;
                        btrfs_put_block_group(bg);
                        continue;
                }

                if (ret) {
                        btrfs_remove_free_space_cache(bg);
                        btrfs_put_block_group(bg);
                        break;
                }

                btrfs_add_bg_to_space_info(fs_info, bg);

                set_avail_alloc_bits(fs_info, bg->flags);
        }
        if (!ret)
                btrfs_init_global_block_rsv(fs_info);
        return ret;
}

/*
 * Read all block group items and set up the in-memory block groups.
 *
 * When there is no block group root, or the super block carries unsupported
 * compat_ro flags, dummy block groups are created instead.  Otherwise every
 * block group item is loaded, per-type sysfs entries are added, un-mirrored
 * block groups are set read-only when a redundant profile is in use, the
 * global block reserve is initialised and the chunk <-> block group mappings
 * are verified.
 *
 * Returns 0 on success or a negative errno.  If an error occurs and the
 * IGNOREBADROOTS option is set, the result of fill_dummy_bgs() is returned
 * instead.
 */
int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
        struct btrfs_root *root = btrfs_block_group_root(info);
        struct btrfs_path *path;
        int ret;
        struct btrfs_block_group *cache;
        struct btrfs_space_info *space_info;
        struct btrfs_key key;
        int need_clear = 0;
        u64 cache_gen;

        /*
         * Either no extent root (with ibadroots rescue option) or we have
         * unsupported RO options. The fs can never be mounted read-write, so no
         * need to waste time searching block group items.
         *
         * This also allows new extent tree related changes to be RO compat,
         * no need for a full incompat flag.
         */
        if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
                      ~BTRFS_FEATURE_COMPAT_RO_SUPP))
                return fill_dummy_bgs(info);

        key.objectid = 0;
        key.offset = 0;
        key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * The space cache has to be cleared if it is in use but its recorded
         * generation differs from the super block generation, or if clearing
         * was requested explicitly.
         */
        cache_gen = btrfs_super_cache_generation(info->super_copy);
        if (btrfs_test_opt(info, SPACE_CACHE) &&
            btrfs_super_generation(info->super_copy) != cache_gen)
                need_clear = 1;
        if (btrfs_test_opt(info, CLEAR_CACHE))
                need_clear = 1;

        while (1) {
                struct btrfs_block_group_item bgi;
                struct extent_buffer *leaf;
                int slot;

                /* A positive return ends the scan; a negative one is an error. */
                ret = find_first_block_group(info, path, &key);
                if (ret > 0)
                        break;
                if (ret != 0)
                        goto error;

                leaf = path->nodes[0];
                slot = path->slots[0];

                /* Copy item and key out of the leaf before releasing the path. */
                read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
                                   sizeof(bgi));

                btrfs_item_key_to_cpu(leaf, &key, slot);
                btrfs_release_path(path);
                ret = read_one_block_group(info, &bgi, &key, need_clear);
                if (ret < 0)
                        goto error;
                /* Resume the search at the end of the block group just read. */
                key.objectid += key.offset;
                key.offset = 0;
        }
        btrfs_release_path(path);

        list_for_each_entry(space_info, &info->space_info, list) {
                int i;

                /* Add one sysfs entry per RAID type that has block groups. */
                for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
                        if (list_empty(&space_info->block_groups[i]))
                                continue;
                        cache = list_first_entry(&space_info->block_groups[i],
                                                 struct btrfs_block_group,
                                                 list);
                        btrfs_sysfs_add_block_group_type(cache);
                }

                /* Nothing more to do unless a redundant profile is in use. */
                if (!(btrfs_get_alloc_profile(info, space_info->flags) &
                      (BTRFS_BLOCK_GROUP_RAID10 |
                       BTRFS_BLOCK_GROUP_RAID1_MASK |
                       BTRFS_BLOCK_GROUP_RAID56_MASK |
                       BTRFS_BLOCK_GROUP_DUP)))
                        continue;
                /*
                 * Avoid allocating from un-mirrored block group if there are
                 * mirrored block groups.
                 */
                list_for_each_entry(cache,
                                &space_info->block_groups[BTRFS_RAID_RAID0],
                                list)
                        inc_block_group_ro(cache, 1);
                list_for_each_entry(cache,
                                &space_info->block_groups[BTRFS_RAID_SINGLE],
                                list)
                        inc_block_group_ro(cache, 1);
        }

        btrfs_init_global_block_rsv(info);
        ret = check_chunk_block_group_mappings(info);
error:
        /* Reached on success as well; ret is 0 in that case. */
        btrfs_free_path(path);
        /*
         * We've hit some error while reading the extent tree, and have
         * rescue=ibadroots mount option.
         * Try to fill the tree using dummy block groups so that the user can
         * continue to mount and grab their data.
         */
        if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
                ret = fill_dummy_bgs(info);
        return ret;
}

/*
 * This function, insert_block_group_item(), belongs to the phase 2 of chunk
 * allocation.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
static int insert_block_group_item(struct btrfs_trans_handle *trans,
                                   struct btrfs_block_group *block_group)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_item bgi;
        struct btrfs_root *root = btrfs_block_group_root(fs_info);
        struct btrfs_key key;
        u64 old_commit_used;
        int ret;

        spin_lock(&block_group->lock);
        btrfs_set_stack_block_group_used(&bgi, block_group->used);
        btrfs_set_stack_block_group_chunk_objectid(&bgi,
                                                   block_group->global_root_id);
        btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
        old_commit_used = block_group->commit_used;
        block_group->commit_used = block_group->used;
        key.objectid = block_group->start;
        key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
        key.offset = block_group->length;
        spin_unlock(&block_group->lock);

        ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
        if (ret < 0) {
                spin_lock(&block_group->lock);
                block_group->commit_used = old_commit_used;
                spin_unlock(&block_group->lock);
        }

        return ret;
}

/*
 * Insert one device extent item into the device root.
 *
 * The item is keyed by (device->devid, BTRFS_DEV_EXTENT_KEY, @start) and
 * records @chunk_offset and @num_bytes.
 *
 * Warns if @device is not flagged as being in the filesystem metadata or if
 * it is a replace target.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the error
 * from btrfs_insert_empty_item().
 */
static int insert_dev_extent(struct btrfs_trans_handle *trans,
                            struct btrfs_device *device, u64 chunk_offset,
                            u64 start, u64 num_bytes)
{
        struct btrfs_root *dev_root = device->fs_info->dev_root;
        struct btrfs_dev_extent *dext;
        struct extent_buffer *eb;
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
        WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.type = BTRFS_DEV_EXTENT_KEY;
        key.offset = start;

        ret = btrfs_insert_empty_item(trans, dev_root, path, &key,
                                      sizeof(*dext));
        if (!ret) {
                /* Fill in the freshly inserted item and dirty its leaf. */
                eb = path->nodes[0];
                dext = btrfs_item_ptr(eb, path->slots[0],
                                      struct btrfs_dev_extent);

                btrfs_set_dev_extent_chunk_tree(eb, dext,
                                                BTRFS_CHUNK_TREE_OBJECTID);
                btrfs_set_dev_extent_chunk_objectid(eb, dext,
                                            BTRFS_FIRST_CHUNK_TREE_OBJECTID);
                btrfs_set_dev_extent_chunk_offset(eb, dext, chunk_offset);
                btrfs_set_dev_extent_length(eb, dext, num_bytes);

                btrfs_mark_buffer_dirty(trans, eb);
        }

        btrfs_free_path(path);
        return ret;
}

/*
 * Insert a device extent item for every stripe of the chunk at @chunk_offset.
 *
 * Part of phase 2 of chunk allocation; the phases are described in the
 * comment at btrfs_chunk_alloc().
 *
 * Returns 0 on success, the error from the chunk map lookup, or the error of
 * the first device extent insertion that fails.
 */
static int insert_dev_extents(struct btrfs_trans_handle *trans,
                                   u64 chunk_offset, u64 chunk_size)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_chunk_map *map;
        int ret = 0;
        int i;

        map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
        if (IS_ERR(map))
                return PTR_ERR(map);

        /*
         * The device list mutex keeps the final phase of a device replace
         * (dev-replace.c:btrfs_dev_replace_finishing()) from running while we
         * walk the stripes.  That phase swaps the device object behind the
         * map's stripes and may change a device's id at any point, so without
         * the mutex we could pick up the replaced device, observe it with an
         * id of BTRFS_DEV_REPLACE_DEVID and persist a device extent item
         * carrying that id.
         */
        mutex_lock(&fs_info->fs_devices->device_list_mutex);

        /* Stop at the first stripe whose insertion fails. */
        for (i = 0; i < map->num_stripes && !ret; i++)
                ret = insert_dev_extent(trans, map->stripes[i].dev,
                                        chunk_offset,
                                        map->stripes[i].physical,
                                        map->stripe_size);

        mutex_unlock(&fs_info->fs_devices->device_list_mutex);

        btrfs_free_chunk_map(map);
        return ret;
}

/*
 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
 * chunk allocation.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 *
 * Drains trans->new_bgs: for every pending block group the block group item,
 * the chunk item (unless already inserted) and the device extents are
 * inserted, aborting the transaction on failure.  Once a failure has been
 * recorded in ret, the remaining block groups are only removed from the list
 * and have their NEW flag cleared.
 */
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group *block_group;
        int ret = 0;

        while (!list_empty(&trans->new_bgs)) {
                int index;

                block_group = list_first_entry(&trans->new_bgs,
                                               struct btrfs_block_group,
                                               bg_list);
                /* After a failure only do the list/flag bookkeeping below. */
                if (ret)
                        goto next;

                index = btrfs_bg_flags_to_raid_index(block_group->flags);

                /*
                 * NOTE(review): ret is reassigned by each of the steps below,
                 * so an earlier failure can be overwritten by a later success.
                 * The transaction has been aborted by then - confirm that this
                 * is intended.
                 */
                ret = insert_block_group_item(trans, block_group);
                if (ret)
                        btrfs_abort_transaction(trans, ret);
                if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
                              &block_group->runtime_flags)) {
                        mutex_lock(&fs_info->chunk_mutex);
                        ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
                        mutex_unlock(&fs_info->chunk_mutex);
                        if (ret)
                                btrfs_abort_transaction(trans, ret);
                }
                ret = insert_dev_extents(trans, block_group->start,
                                         block_group->length);
                if (ret)
                        btrfs_abort_transaction(trans, ret);
                add_block_group_free_space(trans, block_group);

                /*
                 * If we restriped during balance, we may have added a new raid
                 * type, so now add the sysfs entries when it is safe to do so.
                 * We don't have to worry about locking here as it's handled in
                 * btrfs_sysfs_add_block_group_type.
                 */
                if (block_group->space_info->block_group_kobjs[index] == NULL)
                        btrfs_sysfs_add_block_group_type(block_group);

                /* Already aborted the transaction if it failed. */
next:
                btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
                list_del_init(&block_group->bg_list);
                clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);

                /*
                 * If the block group is still unused, add it to the list of
                 * unused block groups. The block group may have been created in
                 * order to satisfy a space reservation, in which case the
                 * extent allocation only happens later. But often we don't
                 * actually need to allocate space that we previously reserved,
                 * so the block group may become unused for a long time. For
                 * example for metadata we generally reserve space for a worst
                 * possible scenario, but then don't end up allocating all that
                 * space or none at all (due to no need to COW, extent buffers
                 * were already COWed in the current transaction and still
                 * unwritten, tree heights lower than the maximum possible
                 * height, etc). For data we generally reserve the exact amount
                 * of space we are going to allocate later, the exception is
                 * when using compression, as we must reserve space based on the
                 * uncompressed data size, because the compression is only done
                 * when writeback is triggered and we don't know how much space
                 * we are actually going to need, so we reserve the uncompressed
                 * size because the data may be incompressible in the worst case.
                 */
                if (ret == 0) {
                        bool used;

                        spin_lock(&block_group->lock);
                        used = btrfs_is_block_group_used(block_group);
                        spin_unlock(&block_group->lock);

                        if (!used)
                                btrfs_mark_bg_unused(block_group);
                }
        }
        btrfs_trans_release_chunk_metadata(trans);
}

/*
 * Pick the global root id for a block group starting at @offset.
 *
 * Without the EXTENT_TREE_V2 incompat feature the id is always
 * BTRFS_FIRST_CHUNK_TREE_OBJECTID.  With it, @offset is divided by 1GiB (or by
 * 128MiB when the filesystem is at most 10GiB in total) and the result is
 * taken modulo fs_info->nr_global_roots.
 *
 * The value ends up in the chunk_objectid field of the block group item.
 */
static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
{
        u64 granularity;
        u64 root_idx;

        if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
                return BTRFS_FIRST_CHUNK_TREE_OBJECTID;

        /* Small filesystems are indexed in 128MiB units instead of 1GiB. */
        if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
                granularity = SZ_128M;
        else
                granularity = SZ_1G;

        div64_u64_rem(div64_u64(offset, granularity), fs_info->nr_global_roots,
                      &root_idx);
        return root_idx;
}

/*
 * Create the in-memory block group for a newly allocated chunk and queue it
 * on trans->new_bgs.
 *
 * @trans:         the running transaction
 * @type:          block group flags for the new block group
 * @chunk_offset:  logical start of the chunk / block group
 * @size:          length of the block group
 *
 * Returns the new block group, or an ERR_PTR() on failure.  On failure the
 * reference on the partially set up block group is dropped.
 */
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
                                                 u64 type,
                                                 u64 chunk_offset, u64 size)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group *cache;
        int ret;

        btrfs_set_log_full_commit(trans);

        cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
        if (!cache)
                return ERR_PTR(-ENOMEM);

        /*
         * Mark it as new before adding it to the rbtree of block groups or any
         * list, so that no other task finds it and calls btrfs_mark_bg_unused()
         * before the new flag is set.
         */
        set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);

        cache->length = size;
        set_free_space_tree_thresholds(cache);
        cache->flags = type;
        /* A new block group needs no caching work. */
        cache->cached = BTRFS_CACHE_FINISHED;
        cache->global_root_id = calculate_global_root_id(fs_info, cache->start);

        if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
                set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);

        ret = btrfs_load_block_group_zone_info(cache, true);
        if (ret) {
                btrfs_put_block_group(cache);
                return ERR_PTR(ret);
        }

        ret = exclude_super_stripes(cache);
        if (ret) {
                /* We may have excluded something, so call this just in case */
                btrfs_free_excluded_extents(cache);
                btrfs_put_block_group(cache);
                return ERR_PTR(ret);
        }

        /*
         * Add the whole range as free space; the excluded extents are released
         * afterwards whether or not the add succeeded.
         */
        ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
        btrfs_free_excluded_extents(cache);
        if (ret) {
                btrfs_put_block_group(cache);
                return ERR_PTR(ret);
        }

        /*
         * Ensure the corresponding space_info object is created and
         * assigned to our block group. We want our bg to be added to the rbtree
         * with its ->space_info set.
         */
        cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
        ASSERT(cache->space_info);

        ret = btrfs_add_block_group_cache(fs_info, cache);
        if (ret) {
                btrfs_remove_free_space_cache(cache);
                btrfs_put_block_group(cache);
                return ERR_PTR(ret);
        }

        /*
         * Now that our block group has its ->space_info set and is inserted in
         * the rbtree, update the space info's counters.
         */
        trace_btrfs_add_block_group(fs_info, cache, 1);
        btrfs_add_bg_to_space_info(fs_info, cache);
        btrfs_update_global_block_rsv(fs_info);

#ifdef CONFIG_BTRFS_DEBUG
        /* Debug builds only: account half of the block group as used. */
        if (btrfs_should_fragment_free_space(cache)) {
                cache->space_info->bytes_used += size >> 1;
                fragment_free_space(cache);
        }
#endif

        /* Queue the block group so that phase 2 inserts its items later. */
        list_add_tail(&cache->bg_list, &trans->new_bgs);
        btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);

        set_avail_alloc_bits(fs_info, type);
        return cache;
}

/*
 * Take a read-only reference on a block group.  It is fine to call this more
 * than once for the same block group.
 *
 * @cache:                block group to mark read-only
 * @do_chunk_alloc:        when true, and the allocation profile computed for
 *                         the block group's flags differs from those flags,
 *                         a chunk with that profile is allocated up front so
 *                         that some free space remains once this block group
 *                         is read-only
 *
 * On a read-only mount no transaction is joined and no chunk is allocated.
 *
 * Return: 0 on success, otherwise the error code of the step that failed.
 */
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
                             bool do_chunk_alloc)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_root *root = btrfs_block_group_root(fs_info);
        struct btrfs_trans_handle *trans;
        u64 alloc_flags;
        int ret;

        /*
         * Read-only scrub on a read-only mount is the only way to get here
         * with a read-only super block.  Starting a transaction is not
         * allowed in that case, so just try to flip the block group and
         * never allocate chunks.
         */
        if (sb_rdonly(fs_info->sb)) {
                mutex_lock(&fs_info->ro_block_group_mutex);
                ret = inc_block_group_ro(cache, 0);
                mutex_unlock(&fs_info->ro_block_group_mutex);
                return ret;
        }

        /*
         * Block groups must not be set read-only once the dirty block group
         * cache writeout of a transaction has started.  Keep joining
         * transactions until we hold one where that has not happened yet;
         * the loop is left with ro_block_group_mutex held.
         */
        for (;;) {
                u64 transid;

                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans))
                        return PTR_ERR(trans);

                mutex_lock(&fs_info->ro_block_group_mutex);
                if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags))
                        break;

                /* Too late for this transaction, let it commit and retry. */
                transid = trans->transid;
                mutex_unlock(&fs_info->ro_block_group_mutex);
                btrfs_end_transaction(trans);

                ret = btrfs_wait_for_commit(fs_info, transid);
                if (ret)
                        return ret;
        }

        if (do_chunk_alloc) {
                /*
                 * A raid level change may be in progress: try to get a block
                 * group with the new profile allocated first.
                 */
                alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
                if (alloc_flags != cache->flags) {
                        ret = btrfs_chunk_alloc(trans, alloc_flags,
                                                CHUNK_ALLOC_FORCE);
                        /*
                         * -ENOSPC is tolerated, there may already be enough
                         * space allocated with the new profile to carry on.
                         */
                        if (ret < 0 && ret != -ENOSPC)
                                goto out;
                }
        }

        ret = inc_block_group_ro(cache, 0);
        if (ret == -ETXTBSY)
                goto unlock_out;
        if (ret == 0)
                goto out;

        /*
         * For a SYSTEM block group do not allocate a chunk unless the caller
         * asked for it, otherwise a storm of system chunk allocations could
         * exhaust the system chunk array.  In every other case keep trying
         * hard to get the block group read-only.
         */
        if (ret == -ENOSPC && !do_chunk_alloc &&
            (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
                goto unlock_out;

        alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
        ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
        if (ret < 0)
                goto out;

        /*
         * On a zoned filesystem the freshly allocated chunk also has to be
         * activated so that metadata tickets can be granted.
         */
        ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
        if (ret < 0)
                goto out;

        /* Second and last attempt, now that a new chunk is available. */
        ret = inc_block_group_ro(cache, 0);
        if (ret == -ETXTBSY)
                goto unlock_out;
out:
        if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
                alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
                mutex_lock(&fs_info->chunk_mutex);
                check_system_chunk(trans, alloc_flags);
                mutex_unlock(&fs_info->chunk_mutex);
        }
unlock_out:
        mutex_unlock(&fs_info->ro_block_group_mutex);

        btrfs_end_transaction(trans);
        return ret;
}

/*
 * Drop one read-only reference from a block group.
 *
 * When the last reference is dropped, the bytes that were accounted as
 * read-only are subtracted from the space_info's bytes_readonly again and the
 * block group is removed from its ro_list.  Calling this on a block group
 * that is not read-only triggers BUG_ON().
 */
void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
{
        struct btrfs_space_info *space_info = cache->space_info;

        BUG_ON(!cache->ro);

        spin_lock(&space_info->lock);
        spin_lock(&cache->lock);

        cache->ro--;
        if (cache->ro == 0) {
                u64 ro_bytes;

                if (btrfs_is_zoned(cache->fs_info)) {
                        /*
                         * Move the zone-unusable bytes from the read-only
                         * counter back to the zone_unusable counter.
                         */
                        const u64 unusable =
                                (cache->alloc_offset - cache->used) +
                                (cache->length - cache->zone_capacity);

                        cache->zone_unusable = unusable;
                        space_info->bytes_zone_unusable += unusable;
                        space_info->bytes_readonly -= unusable;
                }

                /* Whatever is not otherwise accounted for was read-only. */
                ro_bytes = cache->length - cache->reserved -
                           cache->pinned - cache->bytes_super -
                           cache->zone_unusable - cache->used;
                space_info->bytes_readonly -= ro_bytes;
                list_del_init(&cache->ro_list);
        }

        spin_unlock(&cache->lock);
        spin_unlock(&space_info->lock);
}

/*
 * Write the current used bytes, global root id and flags of a block group
 * into its block group item.
 *
 * @trans:        transaction handle
 * @path:        path used for the tree search, released before returning
 * @cache:        block group whose item is updated
 *
 * Nothing is written when the used bytes equal the value recorded in
 * ->commit_used.
 *
 * Return: 0 on success or when there was nothing to update, -ENOENT when the
 * block group item was not found, or another negative errno from the search.
 */
static int update_block_group_item(struct btrfs_trans_handle *trans,
                                   struct btrfs_path *path,
                                   struct btrfs_block_group *cache)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root = btrfs_block_group_root(fs_info);
        struct btrfs_block_group_item bgi;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long item_offset;
        u64 prev_commit_used;
        u64 used;
        int ret;

        /*
         * This may run outside of the transaction commit critical section,
         * so ->used can change under us.  Take a snapshot under the lock and
         * work with that snapshot only.
         */
        spin_lock(&cache->lock);
        prev_commit_used = cache->commit_used;
        used = cache->used;
        if (prev_commit_used == used) {
                /* Used bytes did not change, the item is up to date. */
                spin_unlock(&cache->lock);
                return 0;
        }
        cache->commit_used = used;
        spin_unlock(&cache->lock);

        key.objectid = cache->start;
        key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
        key.offset = cache->length;

        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (ret == 0) {
                leaf = path->nodes[0];
                item_offset = btrfs_item_ptr_offset(leaf, path->slots[0]);
                btrfs_set_stack_block_group_used(&bgi, used);
                btrfs_set_stack_block_group_chunk_objectid(&bgi,
                                                   cache->global_root_id);
                btrfs_set_stack_block_group_flags(&bgi, cache->flags);
                write_extent_buffer(leaf, &bgi, item_offset, sizeof(bgi));
                btrfs_mark_buffer_dirty(trans, leaf);
        }

        btrfs_release_path(path);

        /*
         * The item was not updated, so ->commit_used has to be restored,
         * except when the item simply does not exist yet.  In that case
         * insert_block_group_item() may have inserted the item, and set
         * ->commit_used to a value greater than 0, right after our lookup.
         * Resetting it to 0 here would make us wrongly skip the update if
         * the block group later ends up with 0 used bytes.
         */
        if (ret < 0 && ret != -ENOENT) {
                spin_lock(&cache->lock);
                cache->commit_used = prev_commit_used;
                spin_unlock(&cache->lock);
        }

        return ret;
}

/*
 * Prepare the free space cache inode of a block group for being written out
 * in this transaction.
 *
 * @block_group:        block group whose space cache is set up
 * @trans:                transaction handle
 * @path:                path used for lookups, released before returning
 *
 * Looks up (creating it if it does not exist yet) the free space inode,
 * resets its generation, truncates stale contents and preallocates room for
 * the new cache.  Except for the early returns at the top, the outcome is
 * recorded in block_group->disk_cache_state, and ->cache_generation is set to
 * the transaction id when the state ends up as BTRFS_DC_SETUP.
 *
 * Return: 0 or a negative errno.  A return value of 0 does not imply that the
 * cache was set up; callers have to check ->disk_cache_state for that.
 */
static int cache_save_setup(struct btrfs_block_group *block_group,
                            struct btrfs_trans_handle *trans,
                            struct btrfs_path *path)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct inode *inode = NULL;
        struct extent_changeset *data_reserved = NULL;
        u64 alloc_hint = 0;
        int dcs = BTRFS_DC_ERROR;
        u64 cache_size = 0;
        int retries = 0;
        int ret = 0;

        if (!btrfs_test_opt(fs_info, SPACE_CACHE))
                return 0;

        /*
         * If this block group is smaller than 100 megs don't bother caching the
         * block group.
         */
        if (block_group->length < (100 * SZ_1M)) {
                spin_lock(&block_group->lock);
                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
                spin_unlock(&block_group->lock);
                return 0;
        }

        if (TRANS_ABORTED(trans))
                return 0;
again:
        inode = lookup_free_space_inode(block_group, path);
        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
                ret = PTR_ERR(inode);
                btrfs_release_path(path);
                goto out;
        }

        if (IS_ERR(inode)) {
                /* The inode is missing: create it and look it up once more. */
                BUG_ON(retries);
                retries++;

                if (block_group->ro)
                        goto out_free;

                ret = create_free_space_inode(trans, block_group, path);
                if (ret)
                        goto out_free;
                goto again;
        }

        /*
         * We want to set the generation to 0, that way if anything goes wrong
         * from here on out we know not to trust this cache when we load up next
         * time.
         */
        BTRFS_I(inode)->generation = 0;
        ret = btrfs_update_inode(trans, BTRFS_I(inode));
        if (ret) {
                /*
                 * So theoretically we could recover from this, simply set the
                 * super cache generation to 0 so we know to invalidate the
                 * cache, but then we'd have to keep track of the block groups
                 * that fail this way so we know we _have_ to reset this cache
                 * before the next commit or risk reading stale cache.  So to
                 * limit our exposure to horrible edge cases lets just abort the
                 * transaction, this only happens in really bad situations
                 * anyway.
                 */
                btrfs_abort_transaction(trans, ret);
                goto out_put;
        }

        /* We've already setup this transaction, go ahead and exit */
        if (block_group->cache_generation == trans->transid &&
            i_size_read(inode)) {
                dcs = BTRFS_DC_SETUP;
                goto out_put;
        }

        if (i_size_read(inode) > 0) {
                ret = btrfs_check_trunc_cache_free_space(fs_info,
                                        &fs_info->global_block_rsv);
                if (ret)
                        goto out_put;

                ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
                if (ret)
                        goto out_put;
        }

        spin_lock(&block_group->lock);
        if (block_group->cached != BTRFS_CACHE_FINISHED ||
            !btrfs_test_opt(fs_info, SPACE_CACHE)) {
                /*
                 * don't bother trying to write stuff out _if_
                 * a) we're not cached,
                 * b) we're with nospace_cache mount option,
                 * c) we're with v2 space_cache (FREE_SPACE_TREE).
                 */
                dcs = BTRFS_DC_WRITTEN;
                spin_unlock(&block_group->lock);
                goto out_put;
        }
        spin_unlock(&block_group->lock);

        /*
         * We hit an ENOSPC when setting up the cache in this transaction, just
         * skip doing the setup, we've already cleared the cache so we're safe.
         */
        if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
                ret = -ENOSPC;
                goto out_put;
        }

        /*
         * Try to preallocate enough space based on how big the block group is.
         * Keep in mind this has to include any pinned space which could end up
         * taking up quite a bit since it's not folded into the other space
         * cache.
         *
         * The size is 16 sectors for every 256M of block group length, with a
         * minimum of 16 sectors.
         */
        cache_size = div_u64(block_group->length, SZ_256M);
        if (!cache_size)
                cache_size = 1;

        cache_size *= 16;
        cache_size *= fs_info->sectorsize;

        ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
                                          cache_size, false);
        if (ret)
                goto out_put;

        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
                                              cache_size, cache_size,
                                              &alloc_hint);
        /*
         * Our cache requires contiguous chunks so that we don't modify a bunch
         * of metadata or split extents when writing the cache out, which means
         * we can enospc if we are heavily fragmented in addition to just normal
         * out of space conditions.  So if we hit this just skip setting up any
         * other block groups for this transaction, maybe we'll unpin enough
         * space the next time around.
         */
        if (!ret)
                dcs = BTRFS_DC_SETUP;
        else if (ret == -ENOSPC)
                set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);

out_put:
        iput(inode);
out_free:
        btrfs_release_path(path);
out:
        /* Publish the resulting cache state under the block group lock. */
        spin_lock(&block_group->lock);
        if (!ret && dcs == BTRFS_DC_SETUP)
                block_group->cache_generation = trans->transid;
        block_group->disk_cache_state = dcs;
        spin_unlock(&block_group->lock);

        extent_changeset_free(data_reserved);
        return ret;
}

/*
 * Run cache_save_setup() on every block group of the transaction's dirty list
 * whose disk cache state is BTRFS_DC_CLEAR.  The result of each individual
 * setup is not checked here.
 *
 * Return: 0, or -ENOMEM when no path could be allocated.
 */
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
{
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group *bg, *next;
        struct btrfs_path *path;

        /* Nothing to do without dirty block groups or without space cache. */
        if (list_empty(&cur_trans->dirty_bgs))
                return 0;
        if (!btrfs_test_opt(fs_info, SPACE_CACHE))
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* New block groups may get added meanwhile, so iterate with _safe. */
        list_for_each_entry_safe(bg, next, &cur_trans->dirty_bgs,
                                 dirty_list) {
                if (bg->disk_cache_state != BTRFS_DC_CLEAR)
                        continue;
                cache_save_setup(bg, trans, path);
        }

        btrfs_free_path(path);
        return 0;
}

/*
 * Transaction commit does final block group cache writeback during a critical
 * section where nothing is allowed to change the FS.  This is required in
 * order for the cache to actually match the block group, but can introduce a
 * lot of latency into the commit.
 *
 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
 * There's a chance we'll have to redo some of it if the block group changes
 * again during the commit, but it greatly reduces the commit latency by
 * getting rid of the easy block groups while we're still allowing others to
 * join the commit.
 *
 * The transaction's dirty list is processed at most twice: once for the block
 * groups that are dirty on entry and once more for those that became dirty
 * while the first pass and the delayed refs run were in progress.
 *
 * Return: 0 on success (including when there was nothing to do), -ENOMEM if
 * no path could be allocated, or another negative errno.  On a negative
 * return the block groups not yet processed are put back on the transaction's
 * dirty list and btrfs_cleanup_dirty_bgs() is called.
 */
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
        int should_put;
        struct btrfs_path *path = NULL;
        LIST_HEAD(dirty);
        struct list_head *io = &cur_trans->io_bgs;
        int loops = 0;

        /*
         * Move the whole dirty list onto our private list so it can be walked
         * without holding dirty_bgs_lock.
         */
        spin_lock(&cur_trans->dirty_bgs_lock);
        if (list_empty(&cur_trans->dirty_bgs)) {
                spin_unlock(&cur_trans->dirty_bgs_lock);
                return 0;
        }
        list_splice_init(&cur_trans->dirty_bgs, &dirty);
        spin_unlock(&cur_trans->dirty_bgs_lock);

again:
        /* Make sure all the block groups on our dirty list actually exist */
        btrfs_create_pending_block_groups(trans);

        /* The path is allocated once and reused by the second pass. */
        if (!path) {
                path = btrfs_alloc_path();
                if (!path) {
                        ret = -ENOMEM;
                        goto out;
                }
        }

        /*
         * cache_write_mutex is here only to save us from balance or automatic
         * removal of empty block groups deleting this block group while we are
         * writing out the cache
         */
        mutex_lock(&trans->transaction->cache_write_mutex);
        while (!list_empty(&dirty)) {
                /*
                 * Cleared only when the block group is put back on the
                 * transaction's dirty list below.
                 */
                bool drop_reserve = true;

                cache = list_first_entry(&dirty, struct btrfs_block_group,
                                         dirty_list);
                /*
                 * This can happen if something re-dirties a block group that
                 * is already under IO.  Just wait for it to finish and then do
                 * it all again
                 */
                if (!list_empty(&cache->io_list)) {
                        list_del_init(&cache->io_list);
                        btrfs_wait_cache_io(trans, cache, path);
                        btrfs_put_block_group(cache);
                }


                /*
                 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
                 * it should update the cache_state.  Don't delete until after
                 * we wait.
                 *
                 * Since we're not running in the commit critical section
                 * we need the dirty_bgs_lock to protect from update_block_group
                 */
                spin_lock(&cur_trans->dirty_bgs_lock);
                list_del_init(&cache->dirty_list);
                spin_unlock(&cur_trans->dirty_bgs_lock);

                /* Stays set unless the block group is moved to the io list. */
                should_put = 1;

                cache_save_setup(cache, trans, path);

                if (cache->disk_cache_state == BTRFS_DC_SETUP) {
                        cache->io_ctl.inode = NULL;
                        ret = btrfs_write_out_cache(trans, cache, path);
                        if (ret == 0 && cache->io_ctl.inode) {
                                should_put = 0;

                                /*
                                 * The cache_write_mutex is protecting the
                                 * io_list, also refer to the definition of
                                 * btrfs_transaction::io_bgs for more details
                                 */
                                list_add_tail(&cache->io_list, io);
                        } else {
                                /*
                                 * If we failed to write the cache, the
                                 * generation will be bad and life goes on
                                 */
                                ret = 0;
                        }
                }
                if (!ret) {
                        ret = update_block_group_item(trans, path, cache);
                        /*
                         * Our block group might still be attached to the list
                         * of new block groups in the transaction handle of some
                         * other task (struct btrfs_trans_handle->new_bgs). This
                         * means its block group item isn't yet in the extent
                         * tree. If this happens ignore the error, as we will
                         * try again later in the critical section of the
                         * transaction commit.
                         */
                        if (ret == -ENOENT) {
                                ret = 0;
                                spin_lock(&cur_trans->dirty_bgs_lock);
                                if (list_empty(&cache->dirty_list)) {
                                        list_add_tail(&cache->dirty_list,
                                                      &cur_trans->dirty_bgs);
                                        btrfs_get_block_group(cache);
                                        drop_reserve = false;
                                }
                                spin_unlock(&cur_trans->dirty_bgs_lock);
                        } else if (ret) {
                                btrfs_abort_transaction(trans, ret);
                        }
                }

                /* If it's not on the io list, we need to put the block group */
                if (should_put)
                        btrfs_put_block_group(cache);
                if (drop_reserve)
                        btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
                /*
                 * Avoid blocking other tasks for too long. It might even save
                 * us from writing caches for block groups that are going to be
                 * removed.
                 */
                mutex_unlock(&trans->transaction->cache_write_mutex);
                if (ret)
                        goto out;
                mutex_lock(&trans->transaction->cache_write_mutex);
        }
        mutex_unlock(&trans->transaction->cache_write_mutex);

        /*
         * Go through delayed refs for all the stuff we've just kicked off
         * and then loop back (just once)
         */
        if (!ret)
                ret = btrfs_run_delayed_refs(trans, 0);
        if (!ret && loops == 0) {
                loops++;
                spin_lock(&cur_trans->dirty_bgs_lock);
                list_splice_init(&cur_trans->dirty_bgs, &dirty);
                /*
                 * dirty_bgs_lock protects us from concurrent block group
                 * deletes too (not just cache_write_mutex).
                 */
                if (!list_empty(&dirty)) {
                        spin_unlock(&cur_trans->dirty_bgs_lock);
                        goto again;
                }
                spin_unlock(&cur_trans->dirty_bgs_lock);
        }
out:
        /*
         * On failure hand the block groups we did not get to back to the
         * transaction's dirty list before cleaning that list up.
         */
        if (ret < 0) {
                spin_lock(&cur_trans->dirty_bgs_lock);
                list_splice_init(&dirty, &cur_trans->dirty_bgs);
                spin_unlock(&cur_trans->dirty_bgs_lock);
                btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
        }

        btrfs_free_path(path);
        return ret;
}

/*
 * Write out every block group on the transaction's dirty list and then wait
 * for all block group cache IO queued on the transaction's io_bgs list.
 *
 * For each dirty block group the space cache is set up, delayed refs are run,
 * the cache is written out if it reached the BTRFS_DC_SETUP state and the
 * block group item is updated.  Once an error is recorded in 'ret' those
 * steps are skipped for the remaining block groups, but the list is still
 * drained so that references and reservations get released.
 *
 * Return: 0 on success, -ENOMEM if no path could be allocated, or the first
 * error hit while running delayed refs or updating a block group item (the
 * transaction is aborted in the latter case).
 */
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
        int should_put;
        struct btrfs_path *path;
        struct list_head *io = &cur_trans->io_bgs;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * Even though we are in the critical section of the transaction commit,
         * we can still have concurrent tasks adding elements to this
         * transaction's list of dirty block groups. These tasks correspond to
         * endio free space workers started when writeback finishes for a
         * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
         * allocate new block groups as a result of COWing nodes of the root
         * tree when updating the free space inode. The writeback for the space
         * caches is triggered by an earlier call to
         * btrfs_start_dirty_block_groups() and iterations of the following
         * loop.
         * Also we want to do the cache_save_setup first and then run the
         * delayed refs to make sure we have the best chance at doing this all
         * in one shot.
         */
        spin_lock(&cur_trans->dirty_bgs_lock);
        while (!list_empty(&cur_trans->dirty_bgs)) {
                cache = list_first_entry(&cur_trans->dirty_bgs,
                                         struct btrfs_block_group,
                                         dirty_list);

                /*
                 * This can happen if cache_save_setup re-dirties a block group
                 * that is already under IO.  Just wait for it to finish and
                 * then do it all again
                 */
                if (!list_empty(&cache->io_list)) {
                        /* The lock is dropped while waiting for the IO. */
                        spin_unlock(&cur_trans->dirty_bgs_lock);
                        list_del_init(&cache->io_list);
                        btrfs_wait_cache_io(trans, cache, path);
                        btrfs_put_block_group(cache);
                        spin_lock(&cur_trans->dirty_bgs_lock);
                }

                /*
                 * Don't remove from the dirty list until after we've waited on
                 * any pending IO
                 */
                list_del_init(&cache->dirty_list);
                spin_unlock(&cur_trans->dirty_bgs_lock);
                /* Stays set unless the block group is moved to the io list. */
                should_put = 1;

                cache_save_setup(cache, trans, path);

                if (!ret)
                        ret = btrfs_run_delayed_refs(trans, U64_MAX);

                if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
                        cache->io_ctl.inode = NULL;
                        ret = btrfs_write_out_cache(trans, cache, path);
                        if (ret == 0 && cache->io_ctl.inode) {
                                should_put = 0;
                                list_add_tail(&cache->io_list, io);
                        } else {
                                /*
                                 * If we failed to write the cache, the
                                 * generation will be bad and life goes on
                                 */
                                ret = 0;
                        }
                }
                if (!ret) {
                        ret = update_block_group_item(trans, path, cache);
                        /*
                         * One of the free space endio workers might have
                         * created a new block group while updating a free space
                         * cache's inode (at inode.c:btrfs_finish_ordered_io())
                         * and hasn't released its transaction handle yet, in
                         * which case the new block group is still attached to
                         * its transaction handle and its creation has not
                         * finished yet (no block group item in the extent tree
                         * yet, etc). If this is the case, wait for all free
                         * space endio workers to finish and retry. This is a
                         * very rare case so no need for a more efficient and
                         * complex approach.
                         */
                        if (ret == -ENOENT) {
                                wait_event(cur_trans->writer_wait,
                                   atomic_read(&cur_trans->num_writers) == 1);
                                ret = update_block_group_item(trans, path, cache);
                        }
                        if (ret)
                                btrfs_abort_transaction(trans, ret);
                }

                /* If it's not on the io list, we need to put the block group */
                if (should_put)
                        btrfs_put_block_group(cache);
                btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
                /* Retake the lock for the next list_empty() check. */
                spin_lock(&cur_trans->dirty_bgs_lock);
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);

        /*
         * Refer to the definition of io_bgs member for details why it's safe
         * to use it without any locking
         */
        while (!list_empty(io)) {
                cache = list_first_entry(io, struct btrfs_block_group,
                                         io_list);
                list_del_init(&cache->io_list);
                btrfs_wait_cache_io(trans, cache, path);
                btrfs_put_block_group(cache);
        }

        btrfs_free_path(path);
        return ret;
}

/*
 * Account an extent allocation or release in the super block, the block group
 * containing @bytenr and that block group's space_info.
 *
 * @trans:        transaction handle
 * @bytenr:        start of the extent
 * @num_bytes:        length of the extent
 * @alloc:        true when the extent gets allocated, false when it is freed
 *
 * On allocation the bytes move from reserved to used.  On release they move
 * from used to pinned and the range is marked EXTENT_DIRTY in the
 * transaction's pinned_extents.  The block group is added to the
 * transaction's dirty list if it is not on a dirty list already.
 *
 * Return: 0 on success, -ENOENT if no block group contains @bytenr.
 */
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
                             u64 bytenr, u64 num_bytes, bool alloc)
{
        struct btrfs_fs_info *info = trans->fs_info;
        struct btrfs_space_info *space_info;
        struct btrfs_block_group *cache;
        bool newly_dirtied = false;
        bool reclaim = false;
        u64 super_used;
        u64 bg_used;
        int factor;

        /* Keep the super block's count of used bytes in sync. */
        spin_lock(&info->delalloc_root_lock);
        super_used = btrfs_super_bytes_used(info->super_copy);
        super_used = alloc ? super_used + num_bytes : super_used - num_bytes;
        btrfs_set_super_bytes_used(info->super_copy, super_used);
        spin_unlock(&info->delalloc_root_lock);

        cache = btrfs_lookup_block_group(info, bytenr);
        if (!cache)
                return -ENOENT;

        /* A single extent never crosses a block group boundary. */
        ASSERT(bytenr + num_bytes <= cache->start + cache->length);

        space_info = cache->space_info;
        factor = btrfs_bg_type_to_factor(cache->flags);

        /*
         * When space is being removed from a block group whose free space
         * cache has been written out, the cache must get loaded.  The
         * unpinning stage needs it to hand the space back to the block group,
         * otherwise the space would leak.
         */
        if (!alloc && !btrfs_block_group_done(cache))
                btrfs_cache_block_group(cache, true);

        spin_lock(&space_info->lock);
        spin_lock(&cache->lock);

        if (btrfs_test_opt(info, SPACE_CACHE) &&
            cache->disk_cache_state < BTRFS_DC_CLEAR)
                cache->disk_cache_state = BTRFS_DC_CLEAR;

        bg_used = cache->used;
        if (alloc) {
                /* reserved -> used */
                bg_used += num_bytes;
                cache->used = bg_used;
                cache->reserved -= num_bytes;
                space_info->bytes_reserved -= num_bytes;
                space_info->bytes_used += num_bytes;
                space_info->disk_used += num_bytes * factor;
        } else {
                /* used -> pinned */
                bg_used -= num_bytes;
                cache->used = bg_used;
                cache->pinned += num_bytes;
                btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
                space_info->bytes_used -= num_bytes;
                space_info->disk_used -= num_bytes * factor;

                reclaim = should_reclaim_block_group(cache, num_bytes);
        }

        spin_unlock(&cache->lock);
        spin_unlock(&space_info->lock);

        /* Record the freed range as pinned, outside of the spin locks. */
        if (!alloc)
                set_extent_bit(&trans->transaction->pinned_extents, bytenr,
                               bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);

        spin_lock(&trans->transaction->dirty_bgs_lock);
        if (list_empty(&cache->dirty_list)) {
                list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs);
                newly_dirtied = true;
                btrfs_get_block_group(cache);
        }
        spin_unlock(&trans->transaction->dirty_bgs_lock);

        /*
         * A block group left without used bytes is queued for deletion.  This
         * is done only after it went on the dirty list, to avoid races
         * between the cleaner kthread and space cache writeout.
         */
        if (!alloc) {
                if (bg_used == 0) {
                        if (!btrfs_test_opt(info, DISCARD_ASYNC))
                                btrfs_mark_bg_unused(cache);
                } else if (reclaim) {
                        btrfs_mark_bg_to_reclaim(cache);
                }
        }

        btrfs_put_block_group(cache);

        /* The delayed_refs_rsv accounts for every modified block group. */
        if (newly_dirtied)
                btrfs_inc_delayed_refs_rsv_bg_updates(info);

        return 0;
}

/*
 * Account a new reservation in a block group and its space_info.
 *
 * @cache:                        block group the space is reserved from
 * @ram_bytes:                        number of bytes of file content; equals
 *                                @num_bytes except on the compression path
 * @num_bytes:                        number of bytes being reserved
 * @delalloc:                        non-zero if the reservation is for a
 *                                delalloc write
 * @force_wrong_size_class:        handed to btrfs_use_block_group_size_class()
 *                                when the block group uses size classes
 *
 * Called by the allocator when it reserves space.  @ram_bytes is taken off
 * the space_info's bytes_may_use while @num_bytes is added to the reserved
 * counters.
 *
 * Return: 0 on success, -EAGAIN if the block group is read-only, or the
 * error returned by btrfs_use_block_group_size_class().
 */
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
                             u64 ram_bytes, u64 num_bytes, int delalloc,
                             bool force_wrong_size_class)
{
        struct btrfs_space_info *space_info = cache->space_info;
        int ret = 0;

        spin_lock(&space_info->lock);
        spin_lock(&cache->lock);

        if (cache->ro)
                ret = -EAGAIN;
        else if (btrfs_block_group_should_use_size_class(cache))
                ret = btrfs_use_block_group_size_class(cache,
                                btrfs_calc_block_group_size_class(num_bytes),
                                force_wrong_size_class);

        if (!ret) {
                cache->reserved += num_bytes;
                space_info->bytes_reserved += num_bytes;
                trace_btrfs_space_reservation(cache->fs_info, "space_info",
                                              space_info->flags, num_bytes, 1);
                btrfs_space_info_update_bytes_may_use(cache->fs_info,
                                                      space_info, -ram_bytes);
                if (delalloc)
                        cache->delalloc_bytes += num_bytes;

                /*
                 * With compression less space may be needed than what was
                 * reserved up front; in that case try to wake up tickets.
                 */
                if (ram_bytes > num_bytes)
                        btrfs_try_granting_tickets(cache->fs_info, space_info);
        }

        spin_unlock(&cache->lock);
        spin_unlock(&space_info->lock);
        return ret;
}

/*
 * Release a reservation from a block group and from its space_info.
 *
 * @cache:      The block group the reservation was made in
 * @num_bytes:  The number of reserved bytes to release
 * @delalloc:   Non-zero if the reservation was made for a delalloc write
 *
 * This is called by somebody who is freeing space that was never actually used
 * on disk.  For example if you reserve some space for a new leaf in transaction
 * A and before transaction A commits you free that leaf, you call this in order
 * to clear the reservation.
 */
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
                               u64 num_bytes, int delalloc)
{
        struct btrfs_space_info *sinfo = cache->space_info;

        spin_lock(&sinfo->lock);
        spin_lock(&cache->lock);

        /* Block group counters. */
        cache->reserved -= num_bytes;
        if (delalloc)
                cache->delalloc_bytes -= num_bytes;

        /* space_info counters; bytes released in a RO block group become readonly. */
        sinfo->bytes_reserved -= num_bytes;
        if (cache->ro)
                sinfo->bytes_readonly += num_bytes;
        sinfo->max_extent_size = 0;

        spin_unlock(&cache->lock);

        /* Only the space_info lock is held while trying to grant tickets. */
        btrfs_try_granting_tickets(cache->fs_info, sinfo);
        spin_unlock(&sinfo->lock);
}

/*
 * Set force_alloc to CHUNK_ALLOC_FORCE on every space_info of @info that has
 * the BTRFS_BLOCK_GROUP_METADATA flag set.
 */
static void force_metadata_allocation(struct btrfs_fs_info *info)
{
        struct btrfs_space_info *sinfo;

        list_for_each_entry(sinfo, &info->space_info, list) {
                if (!(sinfo->flags & BTRFS_BLOCK_GROUP_METADATA))
                        continue;

                sinfo->force_alloc = CHUNK_ALLOC_FORCE;
        }
}

/*
 * Decide whether a new chunk should be allocated for @sinfo.
 *
 * Returns 1 if:
 *   - @force is CHUNK_ALLOC_FORCE, or
 *   - @force is CHUNK_ALLOC_LIMITED and the unused part of @sinfo is below
 *     max(SZ_64M, mult_perc(super total bytes, 1)), or
 *   - the used bytes plus SZ_2M are not below mult_perc(total_bytes, 80).
 * Returns 0 otherwise.
 */
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
                              struct btrfs_space_info *sinfo, int force)
{
        const u64 used = btrfs_space_info_used(sinfo, false);

        if (force == CHUNK_ALLOC_FORCE)
                return 1;

        if (force == CHUNK_ALLOC_LIMITED) {
                /*
                 * In limited mode we want to keep some free space, up to
                 * about 1% of the FS size.
                 */
                const u64 fs_bytes = btrfs_super_total_bytes(fs_info->super_copy);
                const u64 min_free = max_t(u64, SZ_64M, mult_perc(fs_bytes, 1));

                if (sinfo->total_bytes - used < min_free)
                        return 1;
        }

        return used + SZ_2M >= mult_perc(sinfo->total_bytes, 80);
}

/*
 * Force a chunk allocation for @type, using the allocation profile that
 * btrfs_get_alloc_profile() returns for it.  The return value is that of
 * btrfs_chunk_alloc() called with CHUNK_ALLOC_FORCE.
 */
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
{
        return btrfs_chunk_alloc(trans,
                                 btrfs_get_alloc_profile(trans->fs_info, type),
                                 CHUNK_ALLOC_FORCE);
}

/*
 * Create a new chunk for @flags and add its chunk item.
 *
 * System space is reserved first through check_system_chunk().  If adding the
 * chunk item fails with -ENOSPC, a new system chunk is created, its chunk item
 * is added, and adding the chunk item of the new block group is retried once.
 * Every failure that happens after the block group was created aborts the
 * transaction; a failure of the initial btrfs_create_chunk() does not.
 *
 * btrfs_trans_release_chunk_metadata() is called on every exit path.
 *
 * Returns the new block group with an extra reference taken through
 * btrfs_get_block_group() (to be dropped by the caller), or an ERR_PTR() on
 * failure.
 */
static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
{
        struct btrfs_block_group *bg;
        int ret;

        /*
         * Check if we have enough space in the system space info because we
         * will need to update device items in the chunk btree and insert a new
         * chunk item in the chunk btree as well. This will allocate a new
         * system block group if needed.
         */
        check_system_chunk(trans, flags);

        bg = btrfs_create_chunk(trans, flags);
        if (IS_ERR(bg)) {
                /* Nothing was created, so there is nothing to abort here. */
                ret = PTR_ERR(bg);
                goto out;
        }

        ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
        /*
         * Normally we are not expected to fail with -ENOSPC here, since we have
         * previously reserved space in the system space_info and allocated one
         * new system chunk if necessary. However there are three exceptions:
         *
         * 1) We may have enough free space in the system space_info but all the
         *    existing system block groups have a profile which can not be used
         *    for extent allocation.
         *
         *    This happens when mounting in degraded mode. For example we have a
         *    RAID1 filesystem with 2 devices, lose one device and mount the fs
         *    using the other device in degraded mode. If we then allocate a chunk,
         *    we may have enough free space in the existing system space_info, but
         *    none of the block groups can be used for extent allocation since they
         *    have a RAID1 profile, and because we are in degraded mode with a
         *    single device, we are forced to allocate a new system chunk with a
         *    SINGLE profile. Making check_system_chunk() iterate over all system
         *    block groups and check if they have a usable profile and enough space
         *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
         *    try again after forcing allocation of a new system chunk. Like this
         *    we avoid paying the cost of that search in normal circumstances, when
         *    we were not mounted in degraded mode;
         *
         * 2) We had enough free space in the system space_info, and one suitable
         *    block group to allocate from when we called check_system_chunk()
         *    above. However right after we called it, the only system block group
         *    with enough free space got turned into RO mode by a running scrub,
         *    and in this case we have to allocate a new one and retry. We only
         *    need to do this allocate and retry once, since we have a transaction
         *    handle and scrub uses the commit root to search for block groups;
         *
         * 3) We had one system block group with enough free space when we called
         *    check_system_chunk(), but after that, right before we tried to
         *    allocate the last extent buffer we needed, a discard operation came
         *    in and it temporarily removed the last free space entry from the
         *    block group (discard removes a free space entry, discards it, and
         *    then adds back the entry to the block group cache).
         */
        if (ret == -ENOSPC) {
                const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
                struct btrfs_block_group *sys_bg;

                /* Force a new system chunk and add its chunk item first. */
                sys_bg = btrfs_create_chunk(trans, sys_flags);
                if (IS_ERR(sys_bg)) {
                        ret = PTR_ERR(sys_bg);
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }

                ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }

                /* Single retry for the chunk item that failed above. */
                ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
        } else if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }
out:
        btrfs_trans_release_chunk_metadata(trans);

        if (ret)
                return ERR_PTR(ret);

        /* Reference for the caller. */
        btrfs_get_block_group(bg);
        return bg;
}

/*
 * Chunk allocation is done in 2 phases:
 *
 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
 *    the chunk, the chunk mapping, create its block group and add the items
 *    that belong in the chunk btree to it - more specifically, we need to
 *    update device items in the chunk btree and add a new chunk item to it.
 *
 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
 *    group item to the extent btree and the device extent items to the devices
 *    btree.
 *
 * This is done to prevent deadlocks. For example when COWing a node from the
 * extent btree we are holding a write lock on the node's parent and if we
 * trigger chunk allocation and attempted to insert the new block group item
 * in the extent btree right away, we could deadlock because the path for the
 * insertion can include that parent node. At first glance it seems impossible
 * to trigger chunk allocation after starting a transaction since tasks should
 * reserve enough transaction units (metadata space), however while that is true
 * most of the time, chunk allocation may still be triggered for several reasons:
 *
 * 1) When reserving metadata, we check if there is enough free space in the
 *    metadata space_info and therefore don't trigger allocation of a new chunk.
 *    However later when the task actually tries to COW an extent buffer from
 *    the extent btree or from the device btree for example, it is forced to
 *    allocate a new block group (chunk) because the only one that had enough
 *    free space was just turned to RO mode by a running scrub for example (or
 *    device replace, block group reclaim thread, etc), so we can not use it
 *    for allocating an extent and end up being forced to allocate a new one;
 *
 * 2) Because we only check that the metadata space_info has enough free bytes,
 *    we end up not allocating a new metadata chunk in that case. However if
 *    the filesystem was mounted in degraded mode, none of the existing block
 *    groups might be suitable for extent allocation due to their incompatible
 *    profile (for e.g. mounting a 2 devices filesystem, where all block groups
 *    use a RAID1 profile, in degraded mode using a single device). In this case
 *    when the task attempts to COW some extent buffer of the extent btree for
 *    example, it will trigger allocation of a new metadata block group with a
 *    suitable profile (SINGLE profile in the example of the degraded mount of
 *    the RAID1 filesystem);
 *
 * 3) The task has reserved enough transaction units / metadata space, but when
 *    it attempts to COW an extent buffer from the extent or device btree for
 *    example, it does not find any free extent in any metadata block group,
 *    and is therefore forced to try to allocate a new metadata block group.
 *    This is because some other task allocated all available extents in the
 *    meanwhile - this typically happens with tasks that don't reserve space
 *    properly, either intentionally or as a bug. One example where this is
 *    done intentionally is fsync, as it does not reserve any transaction units
 *    and ends up allocating a variable number of metadata extents for log
 *    tree extent buffers;
 *
 * 4) The task has reserved enough transaction units / metadata space, but right
 *    before it tries to allocate the last extent buffer it needs, a discard
 *    operation comes in and, temporarily, removes the last free space entry from
 *    the only metadata block group that had free space (discard starts by
 *    removing a free space entry from a block group, then does the discard
 *    operation and, once it's done, it adds back the free space entry to the
 *    block group).
 *
 * We also need this 2 phase setup when adding a device to a filesystem with
 * a seed device - we must create new metadata and system chunks without adding
 * any of the block group items to the chunk, extent and device btrees. If we
 * did not do it this way, we would get ENOSPC when attempting to update those
 * btrees, since all the chunks from the seed device are read-only.
 *
 * Phase 1 does the updates and insertions to the chunk btree because if we had
 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
 * parallel, we risk having too many system chunks allocated by many tasks if
 * many tasks reach phase 1 without the previous ones completing phase 2. In the
 * extreme case this leads to exhaustion of the system chunk array in the
 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
 * and with RAID filesystems (so we have more device items in the chunk btree).
 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
 * the system chunk array due to concurrent allocations") provides more details.
 *
 * Allocation of system chunks does not happen through this function. A task that
 * needs to update the chunk btree (the only btree that uses system chunks), must
 * preallocate chunk space by calling either check_system_chunk() or
 * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
 * metadata chunk or when removing a chunk, while the latter is used before doing
 * a modification to the chunk btree - use cases for the latter are adding,
 * removing and resizing a device as well as relocation of a system chunk.
 * See the comment below for more details.
 *
 * The reservation of system space, done through check_system_chunk(), as well
 * as all the updates and insertions into the chunk btree must be done while
 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
 * an extent buffer from the chunks btree we never trigger allocation of a new
 * system chunk, which would result in a deadlock (trying to lock twice an
 * extent buffer of the chunk btree, first time before triggering the chunk
 * allocation and the second time during chunk allocation while attempting to
 * update the chunks btree). The system chunk array is also updated while holding
 * that mutex. The same logic applies to removing chunks - we must reserve system
 * space, update the chunk btree and the system chunk array in the superblock
 * while holding fs_info->chunk_mutex.
 *
 * This function, btrfs_chunk_alloc(), belongs to phase 1.
 *
 * If @force is CHUNK_ALLOC_FORCE:
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 * If @force is NOT CHUNK_ALLOC_FORCE:
 *    - return 0 if it doesn't need to allocate a new chunk,
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 */
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
                      enum btrfs_chunk_alloc_enum force)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_space_info *space_info;
        struct btrfs_block_group *ret_bg;
        bool wait_for_alloc = false;
        bool should_alloc = false;
        bool from_extent_allocation = false;
        int ret = 0;

        /*
         * CHUNK_ALLOC_FORCE_FOR_EXTENT is handled as CHUNK_ALLOC_FORCE, it is
         * only remembered so that a new data block group can be activated
         * after the allocation below.
         */
        if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
                from_extent_allocation = true;
                force = CHUNK_ALLOC_FORCE;
        }

        /* Don't re-enter if we're already allocating a chunk */
        if (trans->allocating_chunk)
                return -ENOSPC;
        /*
         * Allocation of system chunks can not happen through this path, as we
         * could end up in a deadlock if we are allocating a data or metadata
         * chunk and there is another task modifying the chunk btree.
         *
         * This is because while we are holding the chunk mutex, we will attempt
         * to add the new chunk item to the chunk btree or update an existing
         * device item in the chunk btree, while the other task that is modifying
         * the chunk btree is attempting to COW an extent buffer while holding a
         * lock on it and on its parent - if the COW operation triggers a system
         * chunk allocation, then we can deadlock because we are holding the
         * chunk mutex and we may need to access that extent buffer or its parent
         * in order to add the chunk item or update a device item.
         *
         * Tasks that want to modify the chunk tree should reserve system space
         * before updating the chunk btree, by calling either
         * btrfs_reserve_chunk_metadata() or check_system_chunk().
         * It's possible that after a task reserves the space, it still ends up
         * here - this happens in the cases described above at do_chunk_alloc().
         * The task will have to either retry or fail.
         */
        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
                return -ENOSPC;

        space_info = btrfs_find_space_info(fs_info, flags);
        ASSERT(space_info);

        do {
                spin_lock(&space_info->lock);
                /* A stronger force level stored in the space_info wins. */
                if (force < space_info->force_alloc)
                        force = space_info->force_alloc;
                should_alloc = should_alloc_chunk(fs_info, space_info, force);
                if (space_info->full) {
                        /* No more free physical space */
                        if (should_alloc)
                                ret = -ENOSPC;
                        else
                                ret = 0;
                        spin_unlock(&space_info->lock);
                        return ret;
                } else if (!should_alloc) {
                        spin_unlock(&space_info->lock);
                        return 0;
                } else if (space_info->chunk_alloc) {
                        /*
                         * Someone is already allocating, so we need to block
                         * until this someone is finished and then loop to
                         * recheck if we should continue with our allocation
                         * attempt.
                         *
                         * The allocating task holds chunk_mutex from before
                         * do_chunk_alloc() until after it clears chunk_alloc
                         * (see below), so locking and unlocking the mutex is
                         * only used to wait for it. The recheck is then done
                         * without forcing.
                         */
                        wait_for_alloc = true;
                        force = CHUNK_ALLOC_NO_FORCE;
                        spin_unlock(&space_info->lock);
                        mutex_lock(&fs_info->chunk_mutex);
                        mutex_unlock(&fs_info->chunk_mutex);
                } else {
                        /* Proceed with allocation */
                        space_info->chunk_alloc = 1;
                        wait_for_alloc = false;
                        spin_unlock(&space_info->lock);
                }

                cond_resched();
        } while (wait_for_alloc);

        mutex_lock(&fs_info->chunk_mutex);
        /* Makes a nested call on this transaction handle return -ENOSPC. */
        trans->allocating_chunk = true;

        /*
         * If we have mixed data/metadata chunks we want to make sure we keep
         * allocating mixed chunks instead of individual chunks.
         */
        if (btrfs_mixed_space_info(space_info))
                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);

        /*
         * if we're doing a data chunk, go ahead and make sure that
         * we keep a reasonable number of metadata chunks allocated in the
         * FS as well.
         */
        if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
                fs_info->data_chunk_allocations++;
                /* Every metadata_ratio-th data chunk forces metadata allocation. */
                if (!(fs_info->data_chunk_allocations %
                      fs_info->metadata_ratio))
                        force_metadata_allocation(fs_info);
        }

        ret_bg = do_chunk_alloc(trans, flags);
        trans->allocating_chunk = false;

        if (IS_ERR(ret_bg)) {
                ret = PTR_ERR(ret_bg);
        } else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) {
                /*
                 * New block group is likely to be used soon. Try to activate
                 * it now. Failure is OK for now.
                 */
                btrfs_zone_activate(ret_bg);
        }

        /* On success, drop the reference taken by do_chunk_alloc(). */
        if (!ret)
                btrfs_put_block_group(ret_bg);

        spin_lock(&space_info->lock);
        if (ret < 0) {
                /*
                 * -ENOSPC marks the space_info as full. Any other error skips
                 * the reset of force_alloc below.
                 */
                if (ret == -ENOSPC)
                        space_info->full = 1;
                else
                        goto out;
        } else {
                /* A chunk was allocated. */
                ret = 1;
                space_info->max_extent_size = 0;
        }

        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
        space_info->chunk_alloc = 0;
        spin_unlock(&space_info->lock);
        mutex_unlock(&fs_info->chunk_mutex);

        return ret;
}

/*
 * Return devs_max of the btrfs_raid_array entry selected by @type, or the
 * number of rw devices of the filesystem when that devs_max is 0.
 */
static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
{
        const u64 max_devs =
                btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;

        if (max_devs)
                return max_devs;

        return fs_info->fs_devices->rw_devices;
}

/*
 * Reserve @bytes in fs_info->chunk_block_rsv, creating a new system chunk
 * first if the system space_info has less than @bytes left.
 *
 * @trans: transaction handle; its chunk_bytes_reserved is increased by @bytes
 *         when the reservation succeeds
 * @bytes: number of bytes to reserve
 * @type:  only used in the message printed with the ENOSPC_DEBUG mount option
 *
 * Failures are not reported to the caller. Must be called with
 * fs_info->chunk_mutex held.
 */
static void reserve_chunk_space(struct btrfs_trans_handle *trans,
                                u64 bytes,
                                u64 type)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_space_info *info;
        u64 left;
        int ret = 0;

        /*
         * Needed because we can end up allocating a system chunk and for an
         * atomic and race free space reservation in the chunk block reserve.
         */
        lockdep_assert_held(&fs_info->chunk_mutex);

        info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
        /* Sample the free system space under the space_info lock. */
        spin_lock(&info->lock);
        left = info->total_bytes - btrfs_space_info_used(info, true);
        spin_unlock(&info->lock);

        if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
                btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
                           left, bytes, type);
                btrfs_dump_space_info(fs_info, info, 0, 0);
        }

        if (left < bytes) {
                u64 flags = btrfs_system_alloc_profile(fs_info);
                struct btrfs_block_group *bg;

                /*
                 * Ignore failure to create system chunk. We might end up not
                 * needing it, as we might not need to COW all nodes/leafs from
                 * the paths we visit in the chunk tree (they were already COWed
                 * or created in the current transaction for example).
                 */
                bg = btrfs_create_chunk(trans, flags);
                if (IS_ERR(bg)) {
                        /* A non-zero ret skips the reservation below. */
                        ret = PTR_ERR(bg);
                } else {
                        /*
                         * We have a new chunk. We also need to activate it for
                         * zoned filesystem.
                         */
                        ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
                        if (ret < 0)
                                return;

                        /*
                         * If we fail to add the chunk item here, we end up
                         * trying again at phase 2 of chunk allocation, at
                         * btrfs_create_pending_block_groups(). So ignore
                         * any error here. An ENOSPC here could happen, due to
                         * the cases described at do_chunk_alloc() - the system
                         * block group we just created was just turned into RO
                         * mode by a scrub for example, or a running discard
                         * temporarily removed its free space entries, etc.
                         */
                        btrfs_chunk_alloc_add_chunk_item(trans, bg);
                }
        }

        /*
         * NOTE(review): at this point ret may still hold the return value of
         * btrfs_zoned_activate_one_bg(). If that function can return a
         * positive value, the reservation below is skipped even though the
         * chunk was created - confirm that this is intended.
         */
        if (!ret) {
                ret = btrfs_block_rsv_add(fs_info,
                                          &fs_info->chunk_block_rsv,
                                          bytes, BTRFS_RESERVE_NO_FLUSH);
                if (!ret)
                        trans->chunk_bytes_reserved += bytes;
        }
}

/*
 * Reserve space in the system space for allocating or removing a chunk of the
 * given @type.
 *
 * The caller must be holding fs_info->chunk_mutex.
 */
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        u64 bytes;

        /* One device item to update per device of the profile ... */
        bytes = btrfs_calc_metadata_size(fs_info,
                                         get_profile_num_devs(fs_info, type));
        /* ... plus 1 chunk item to add or remove. */
        bytes += btrfs_calc_insert_metadata_size(fs_info, 1);

        reserve_chunk_space(trans, bytes, type);
}

/*
 * Reserve space in the system space, if needed, for doing one modification to
 * the chunk btree.
 *
 * @trans:              A transaction handle.
 * @is_item_insertion:  True if the modification inserts a new item in the
 *                      chunk btree, false if it deletes or updates an existing
 *                      item.
 *
 * This is used in a context where we need to update the chunk btree outside
 * block group allocation and removal, to avoid a deadlock with a concurrent
 * task that is allocating a metadata or data block group and therefore needs to
 * update the chunk btree while holding the chunk mutex. After the update to the
 * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
 *
 * The chunk mutex is taken and released by this function.
 */
void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
                                  bool is_item_insertion)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        const u64 bytes = is_item_insertion ?
                          btrfs_calc_insert_metadata_size(fs_info, 1) :
                          btrfs_calc_metadata_size(fs_info, 1);

        mutex_lock(&fs_info->chunk_mutex);
        reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
        mutex_unlock(&fs_info->chunk_mutex);
}

/*
 * Walk the block groups of @info, from btrfs_lookup_first_block_group(info, 0)
 * through btrfs_next_block_group(), and for each one that has
 * BLOCK_GROUP_FLAG_IREF set clear the flag, detach its inode and iput() it.
 *
 * btrfs_wait_block_group_cache_done() is called on each block group before it
 * is inspected.
 */
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
        struct btrfs_block_group *block_group;

        for (block_group = btrfs_lookup_first_block_group(info, 0);
             block_group;
             block_group = btrfs_next_block_group(block_group)) {
                struct inode *inode;

                btrfs_wait_block_group_cache_done(block_group);

                spin_lock(&block_group->lock);
                if (!test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
                                        &block_group->runtime_flags)) {
                        /* No inode reference recorded for this block group. */
                        spin_unlock(&block_group->lock);
                        continue;
                }

                /* Detach the inode under the lock, drop it after unlocking. */
                inode = block_group->inode;
                block_group->inode = NULL;
                spin_unlock(&block_group->lock);

                ASSERT(block_group->io_ctl.inode == NULL);
                iput(inode);
        }
}

/*
 * Tear down the block group and space_info state of @info.
 *
 * In order, this drops the zoned active metadata/system block group
 * references, empties the caching, unused, reclaim and zone-active lists,
 * removes every block group from the block group cache tree, releases the
 * global block reserve and finally removes every space_info.
 *
 * Must be called only after stopping all workers, since we could have block
 * group caching kthreads running, and therefore they could race with us if we
 * freed the block groups before stopping them.
 *
 * Always returns 0.
 */
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
        struct btrfs_block_group *block_group;
        struct btrfs_space_info *space_info;
        struct btrfs_caching_control *caching_ctl;
        struct rb_node *n;

        /* Drop the references held through the active_*_bg pointers. */
        if (btrfs_is_zoned(info)) {
                if (info->active_meta_bg) {
                        btrfs_put_block_group(info->active_meta_bg);
                        info->active_meta_bg = NULL;
                }
                if (info->active_system_bg) {
                        btrfs_put_block_group(info->active_system_bg);
                        info->active_system_bg = NULL;
                }
        }

        /* Drop every caching control that is still on the caching list. */
        write_lock(&info->block_group_cache_lock);
        while (!list_empty(&info->caching_block_groups)) {
                caching_ctl = list_entry(info->caching_block_groups.next,
                                         struct btrfs_caching_control, list);
                list_del(&caching_ctl->list);
                btrfs_put_caching_control(caching_ctl);
        }
        write_unlock(&info->block_group_cache_lock);

        /*
         * The unused and reclaim lists are both linked through bg_list and
         * both emptied under unused_bgs_lock, dropping one block group
         * reference per entry.
         */
        spin_lock(&info->unused_bgs_lock);
        while (!list_empty(&info->unused_bgs)) {
                block_group = list_first_entry(&info->unused_bgs,
                                               struct btrfs_block_group,
                                               bg_list);
                list_del_init(&block_group->bg_list);
                btrfs_put_block_group(block_group);
        }

        while (!list_empty(&info->reclaim_bgs)) {
                block_group = list_first_entry(&info->reclaim_bgs,
                                               struct btrfs_block_group,
                                               bg_list);
                list_del_init(&block_group->bg_list);
                btrfs_put_block_group(block_group);
        }
        spin_unlock(&info->unused_bgs_lock);

        /* Same for the list of zone active block groups. */
        spin_lock(&info->zone_active_bgs_lock);
        while (!list_empty(&info->zone_active_bgs)) {
                block_group = list_first_entry(&info->zone_active_bgs,
                                               struct btrfs_block_group,
                                               active_bg_list);
                list_del_init(&block_group->active_bg_list);
                btrfs_put_block_group(block_group);
        }
        spin_unlock(&info->zone_active_bgs_lock);

        /*
         * Remove the block groups from the cache tree one by one, starting
         * from the last node. The tree lock is dropped while a block group is
         * torn down and taken again before looking up the next node.
         */
        write_lock(&info->block_group_cache_lock);
        while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
                block_group = rb_entry(n, struct btrfs_block_group,
                                       cache_node);
                rb_erase_cached(&block_group->cache_node,
                                &info->block_group_cache_tree);
                RB_CLEAR_NODE(&block_group->cache_node);
                write_unlock(&info->block_group_cache_lock);

                /* Unlink from the space_info's list of block groups. */
                down_write(&block_group->space_info->groups_sem);
                list_del(&block_group->list);
                up_write(&block_group->space_info->groups_sem);

                /*
                 * We haven't cached this block group, which means we could
                 * possibly have excluded extents on this block group.
                 */
                if (block_group->cached == BTRFS_CACHE_NO ||
                    block_group->cached == BTRFS_CACHE_ERROR)
                        btrfs_free_excluded_extents(block_group);

                btrfs_remove_free_space_cache(block_group);
                /*
                 * Sanity checks: the block group must be off every list and
                 * only the reference dropped right below may remain.
                 */
                ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
                ASSERT(list_empty(&block_group->dirty_list));
                ASSERT(list_empty(&block_group->io_list));
                ASSERT(list_empty(&block_group->bg_list));
                ASSERT(refcount_read(&block_group->refs) == 1);
                ASSERT(block_group->swap_extents == 0);
                btrfs_put_block_group(block_group);

                write_lock(&info->block_group_cache_lock);
        }
        write_unlock(&info->block_group_cache_lock);

        btrfs_release_global_block_rsv(info);

        /* Finally remove every space_info, warning about leftover accounting. */
        while (!list_empty(&info->space_info)) {
                space_info = list_entry(info->space_info.next,
                                        struct btrfs_space_info,
                                        list);

                /*
                 * Do not hide this behind enospc_debug, this is actually
                 * important and indicates a real bug if this happens.
                 */
                if (WARN_ON(space_info->bytes_pinned > 0 ||
                            space_info->bytes_may_use > 0))
                        btrfs_dump_space_info(info, space_info, 0, 0);

                /*
                 * If there was a failure to cleanup a log tree, very likely due
                 * to an IO failure on a writeback attempt of one or more of its
                 * extent buffers, we could not do proper (and cheap) unaccounting
                 * of their reserved space, so don't warn on bytes_reserved > 0 in
                 * that case.
                 */
                if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
                    !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
                        if (WARN_ON(space_info->bytes_reserved > 0))
                                btrfs_dump_space_info(info, space_info, 0, 0);
                }

                WARN_ON(space_info->reclaim_size > 0);
                list_del(&space_info->list);
                btrfs_sysfs_remove_space_info(space_info);
        }
        return 0;
}

/*
 * Take a freeze reference on a block group by atomically incrementing its
 * ->frozen counter. The reference is dropped again by
 * btrfs_unfreeze_block_group().
 */
void btrfs_freeze_block_group(struct btrfs_block_group *cache)
{
        atomic_inc(&cache->frozen);
}

void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        bool cleanup;

        spin_lock(&block_group->lock);
        cleanup = (atomic_dec_and_test(&block_group->frozen) &&
                   test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
        spin_unlock(&block_group->lock);

        if (cleanup) {
                struct btrfs_chunk_map *map;

                map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
                /* Logic error, can't happen. */
                ASSERT(map);

                btrfs_remove_chunk_map(fs_info, map);

                /* Once for our lookup reference. */
                btrfs_free_chunk_map(map);

                /*
                 * We may have left one free space entry and other possible
                 * tasks trimming this block group have left 1 entry each one.
                 * Free them if any.
                 */
                btrfs_remove_free_space_cache(block_group);
        }
}

/*
 * Account one more swap extent in @bg, unless the block group is read-only.
 *
 * Returns true if the counter was incremented, false if @bg is read-only
 * (in which case nothing is changed).
 */
bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
{
        bool allowed;

        spin_lock(&bg->lock);
        allowed = !bg->ro;
        if (allowed)
                bg->swap_extents++;
        spin_unlock(&bg->lock);

        return allowed;
}

/*
 * Drop @amount swap extents from the block group's counter, under bg->lock.
 *
 * The block group is asserted to be not read-only and to hold at least
 * @amount swap extents.
 */
void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
{
        spin_lock(&bg->lock);
        ASSERT(!bg->ro);
        ASSERT(bg->swap_extents >= amount);
        bg->swap_extents -= amount;
        spin_unlock(&bg->lock);
}

/*
 * Map an allocation size to its size class: up to 128K is small, up to 8M is
 * medium, anything bigger is large.
 */
enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
{
        if (size > SZ_8M)
                return BTRFS_BG_SZ_LARGE;
        if (size > SZ_128K)
                return BTRFS_BG_SZ_MEDIUM;
        return BTRFS_BG_SZ_SMALL;
}

/*
 * Record that an extent of a given size class is being allocated from a
 * block group.
 *
 * @bg:                                The block group we allocated in.
 * @size_class:                        The size class of the allocation.
 * @force_wrong_size_class:        Whether we are desperate enough to allow
 *                                mismatched size classes.
 *
 * Returns: 0 if the allocation may proceed in this block group, -EAGAIN if
 * the block group already has a different size class and
 * force_wrong_size_class is not set.
 *
 * find_free_extent skips block groups whose size class does not match until
 * it really needs to avoid ENOSPC, at which point it sets
 * force_wrong_size_class. A freshly allocated block group has no size class
 * yet, so two allocations of different sizes can race to use it. The one
 * that loses the race is detected here and has to retry.
 */
int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
                                     enum btrfs_block_group_size_class size_class,
                                     bool force_wrong_size_class)
{
        ASSERT(size_class != BTRFS_BG_SZ_NONE);

        /* Matching size class: nothing to do. */
        if (bg->size_class == size_class)
                return 0;

        /*
         * First allocation in a new block group: it decides the size class
         * of the block group.
         */
        if (bg->size_class == BTRFS_BG_SZ_NONE) {
                bg->size_class = size_class;
                return 0;
        }

        /*
         * The block group already has a different size class. Either:
         *
         * 1. Two tasks in find_free_extent for different size classes raced
         *    on the same empty block group; the loser has to try again.
         * 2. find_free_extent got desperate enough to set
         *    force_wrong_size_class; leave the size class alone but let the
         *    allocation go ahead.
         */
        return force_wrong_size_class ? 0 : -EAGAIN;
}

/*
 * Size classes are only used for data-only block groups on non-zoned
 * filesystems.
 */
bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
{
        return !btrfs_is_zoned(bg->fs_info) &&
               btrfs_is_block_group_data_only(bg);
}




















































    7 





    1 



























    1 








    1 


    1 










































    1 


    1 




    1 



    1 








    1 






    1 
    1 
    1 
    1 







    1 





    1 
















    1 
    1 










    1 
    1 

    2 






















































































































































































































































































    1 

















    1 




















    1 







    1 

















    1 







    1 


    1 














    1 



    1 











    1 








































































    1 



    1 











    1 

    1 





    1 

































    2 

















    2 

















    2 




    2 


















    2 























    2 

    1 










    1 















































    2 







    1 


    2 












    2 





    1 

    1 










    1 














    1 

    1 








    2 


















    1 



















    1 













    1 









    1 








































































































































    2 



















    2 









































    1 



    1 




    1 





























    1 

    1 
































    1 





    1 
    1 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright (C) 2016 - 2020 Christoph Hellwig
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/stat.h>
#include "../fs/internal.h"
#include "blk.h"

/*
 * Should we allow writing to mounted block devices? The initial value is
 * taken from the CONFIG_BLK_DEV_WRITE_MOUNTED build option.
 */
static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);

/*
 * A block device and its VFS inode, embedded in a single object so that
 * either one can be reached from the other with container_of() (see
 * BDEV_I() and BD_INODE()).
 */
struct bdev_inode {
        struct block_device bdev;
        struct inode vfs_inode;
};

/* Return the bdev_inode that embeds @inode as its vfs_inode member. */
static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
        return container_of(inode, struct bdev_inode, vfs_inode);
}

static inline struct inode *BD_INODE(struct block_device *bdev)
{
        return &container_of(bdev, struct bdev_inode, bdev)->vfs_inode;
}

struct block_device *I_BDEV(struct inode *inode)
{
        return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

/* Return the block device behind the host inode of @bdev_file's mapping. */
struct block_device *file_bdev(struct file *bdev_file)
{
        struct inode *host = bdev_file->f_mapping->host;

        return I_BDEV(host);
}
EXPORT_SYMBOL(file_bdev);

/*
 * Write back the inode of @bdev for as long as it is marked dirty.
 *
 * i_state is checked under i_lock; the lock is dropped around each
 * write_inode_now() call. Writeback failures are only logged
 * (rate-limited), and the loop re-checks the dirty state afterwards.
 */
static void bdev_write_inode(struct block_device *bdev)
{
        struct inode *inode = BD_INODE(bdev);

        spin_lock(&inode->i_lock);
        for (;;) {
                int err;

                if (!(inode->i_state & I_DIRTY))
                        break;

                spin_unlock(&inode->i_lock);
                err = write_inode_now(inode, true);
                if (err)
                        pr_warn_ratelimited(
        "VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
                                bdev, err);
                spin_lock(&inode->i_lock);
        }
        spin_unlock(&inode->i_lock);
}

/* Drop _all_ buffers and page cache of @bdev, whether dirty or not. */
static void kill_bdev(struct block_device *bdev)
{
        struct address_space *mapping = bdev->bd_mapping;

        /* Skip the work when the mapping is already empty. */
        if (!mapping_empty(mapping)) {
                invalidate_bh_lrus();
                truncate_inode_pages(mapping, 0);
        }
}

/* Invalidate the clean, unused buffers and page cache of @bdev. */
void invalidate_bdev(struct block_device *bdev)
{
        struct address_space *mapping = bdev->bd_mapping;

        /* Nothing cached, nothing to invalidate. */
        if (!mapping->nrpages)
                return;

        invalidate_bh_lrus();
        /* Make sure all lru add caches are flushed. */
        lru_add_drain_all();
        invalidate_mapping_pages(mapping, 0, -1);
}
EXPORT_SYMBOL(invalidate_bdev);

/*
 * Drop all buffers and page cache for the given byte range of @bdev.
 *
 * If the caller did not open the device exclusively and somebody else
 * (such as a filesystem) holds it, the range is only invalidated and the
 * result of that invalidation is returned. Otherwise returns 0.
 */
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
                        loff_t lstart, loff_t lend)
{
        const bool need_claim = !(mode & BLK_OPEN_EXCL);

        /*
         * Without an exclusive handle, temporarily claim the device while
         * the buffer cache is discarded, so that buffers are not dropped
         * under a live filesystem.
         */
        if (need_claim &&
            bd_prepare_to_claim(bdev, truncate_bdev_range, NULL)) {
                /*
                 * Someone else has the device exclusively open. Try
                 * invalidating instead. The 'end' argument is inclusive so
                 * the rounding is safe.
                 */
                return invalidate_inode_pages2_range(bdev->bd_mapping,
                                                     lstart >> PAGE_SHIFT,
                                                     lend >> PAGE_SHIFT);
        }

        truncate_inode_pages_range(bdev->bd_mapping, lstart, lend);
        if (need_claim)
                bd_abort_claiming(bdev, truncate_bdev_range);
        return 0;
}

static void set_init_blocksize(struct block_device *bdev)
{
        unsigned int bsize = bdev_logical_block_size(bdev);
        loff_t size = i_size_read(BD_INODE(bdev));

        while (bsize < PAGE_SIZE) {
                if (size & bsize)
                        break;
                bsize <<= 1;
        }
        BD_INODE(bdev)->i_blkbits = blksize_bits(bsize);
}

/*
 * Set the block size of the block device opened through @file.
 *
 * Returns -EINVAL if @size is not a power of two in [512, PAGE_SIZE], is
 * smaller than the device's logical block size, or if @file has no
 * private_data. Returns 0 otherwise. When the block size actually changes,
 * the device is synced first and its cached buffers and pages are dropped
 * afterwards.
 */
int set_blocksize(struct file *file, int size)
{
        struct inode *inode = file->f_mapping->host;
        struct block_device *bdev = I_BDEV(inode);

        /* Size must be a power of two, and between 512 and PAGE_SIZE */
        if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
                return -EINVAL;

        /* Size cannot be smaller than the size supported by the device */
        if (size < bdev_logical_block_size(bdev))
                return -EINVAL;

        if (!file->private_data)
                return -EINVAL;

        /* Nothing to do when the block size is unchanged. */
        if (inode->i_blkbits == blksize_bits(size))
                return 0;

        sync_blockdev(bdev);
        inode->i_blkbits = blksize_bits(size);
        kill_bdev(bdev);
        return 0;
}

EXPORT_SYMBOL(set_blocksize);

/*
 * Set the block size of a superblock and of its backing block device.
 *
 * Returns the new block size, or 0 if set_blocksize() rejected @size.
 */
int sb_set_blocksize(struct super_block *sb, int size)
{
        int err = set_blocksize(sb->s_bdev_file, size);

        if (err)
                return 0;

        /*
         * set_blocksize() succeeded, so size is a power of two and its
         * value is between 512 and PAGE_SIZE.
         */
        sb->s_blocksize = size;
        sb->s_blocksize_bits = blksize_bits(size);
        return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

/*
 * Like sb_set_blocksize(), but raise @size to the logical block size of the
 * superblock's device first if it is smaller than that.
 */
int sb_min_blocksize(struct super_block *sb, int size)
{
        int minsize = bdev_logical_block_size(sb->s_bdev);

        return sb_set_blocksize(sb, size < minsize ? minsize : size);
}

EXPORT_SYMBOL(sb_min_blocksize);

/*
 * Start writeback of @bdev's mapping via filemap_flush().
 * A NULL @bdev is accepted and yields 0.
 */
int sync_blockdev_nowait(struct block_device *bdev)
{
        return bdev ? filemap_flush(bdev->bd_mapping) : 0;
}
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);

/*
 * Write out all dirty data of a block device through its mapping and wait
 * for it to complete.  Does not take the superblock lock.
 * A NULL @bdev is accepted and yields 0.
 */
int sync_blockdev(struct block_device *bdev)
{
        return bdev ? filemap_write_and_wait(bdev->bd_mapping) : 0;
}
EXPORT_SYMBOL(sync_blockdev);

/*
 * Write out and wait upon the dirty data of @bdev's mapping in the byte
 * range [@lstart, @lend]. Unlike sync_blockdev(), @bdev is dereferenced
 * unconditionally and must not be NULL.
 */
int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
{
        return filemap_write_and_wait_range(bdev->bd_mapping,
                        lstart, lend);
}
EXPORT_SYMBOL(sync_blockdev_range);

/**
 * bdev_freeze - lock a filesystem and force it into a consistent state
 * @bdev:        blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously. It counts up in bdev_freeze() and
 * counts down in bdev_thaw(). When it becomes 0, bdev_thaw() will actually
 * unfreeze.
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_freeze(struct block_device *bdev)
{
        int error = 0;

        mutex_lock(&bdev->bd_fsfreeze_mutex);

        /* Already frozen by someone else: only the count is bumped. */
        if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
                mutex_unlock(&bdev->bd_fsfreeze_mutex);
                return 0;
        }

        mutex_lock(&bdev->bd_holder_lock);
        if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
                error = bdev->bd_holder_ops->freeze(bdev);
                /* The ->freeze() callback must have dropped bd_holder_lock. */
                lockdep_assert_not_held(&bdev->bd_holder_lock);
        } else {
                /* No holder freeze op: just write out the device's dirty data. */
                mutex_unlock(&bdev->bd_holder_lock);
                error = sync_blockdev(bdev);
        }

        /* Undo our increment if the freeze did not succeed. */
        if (error)
                atomic_dec(&bdev->bd_fsfreeze_count);

        mutex_unlock(&bdev->bd_fsfreeze_mutex);
        return error;
}
EXPORT_SYMBOL(bdev_freeze);

/**
 * bdev_thaw - unlock filesystem
 * @bdev:        blockdevice to unlock
 *
 * Unlocks the filesystem and marks it writeable again after bdev_freeze().
 * Only the call that drops bd_fsfreeze_count to zero invokes the holder's
 * ->thaw() operation; earlier calls just decrement the count.
 *
 * Return: On success zero is returned, negative error code on failure
 * (-EINVAL if the device was not frozen).
 */
int bdev_thaw(struct block_device *bdev)
{
        int error = -EINVAL, nr_freeze;

        mutex_lock(&bdev->bd_fsfreeze_mutex);

        /*
         * If this returns < 0 it means that @bd_fsfreeze_count was
         * already 0 and no decrement was performed.
         */
        nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
        if (nr_freeze < 0)
                goto out;

        error = 0;
        /* Other freezers remain: nothing more to do for this call. */
        if (nr_freeze > 0)
                goto out;

        mutex_lock(&bdev->bd_holder_lock);
        if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
                error = bdev->bd_holder_ops->thaw(bdev);
                /* The ->thaw() callback must have dropped bd_holder_lock. */
                lockdep_assert_not_held(&bdev->bd_holder_lock);
        } else {
                mutex_unlock(&bdev->bd_holder_lock);
        }

        /* Thaw failed: restore the count so the device stays frozen. */
        if (error)
                atomic_inc(&bdev->bd_fsfreeze_count);
out:
        mutex_unlock(&bdev->bd_fsfreeze_mutex);
        return error;
}
EXPORT_SYMBOL(bdev_thaw);

/*
 * pseudo-fs
 */

/*
 * bdev_lock: held while the claiming state (bd_claiming, bd_holder,
 * bd_holders) is inspected or changed by the bd_*claim* helpers in this file.
 * bdev_cachep: slab cache for struct bdev_inode, created in
 * bdev_cache_init().
 */
static  __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
static struct kmem_cache *bdev_cachep __ro_after_init;

/*
 * ->alloc_inode() for the bdev pseudo-fs: allocate a bdev_inode from
 * bdev_cachep, zero its embedded block_device and hand back the embedded
 * VFS inode. Returns NULL if the allocation fails.
 */
static struct inode *bdev_alloc_inode(struct super_block *sb)
{
        struct bdev_inode *bi;

        bi = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);
        if (bi) {
                memset(&bi->bdev, 0, sizeof(bi->bdev));
                return &bi->vfs_inode;
        }
        return NULL;
}

static void bdev_free_inode(struct inode *inode)
{
        struct block_device *bdev = I_BDEV(inode);

        free_percpu(bdev->bd_stats);
        kfree(bdev->bd_meta_info);

        if (!bdev_is_partition(bdev)) {
                if (bdev->bd_disk && bdev->bd_disk->bdi)
                        bdi_put(bdev->bd_disk->bdi);
                kfree(bdev->bd_disk);
        }

        if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
                blk_free_ext_minor(MINOR(bdev->bd_dev));

        kmem_cache_free(bdev_cachep, BDEV_I(inode));
}

/*
 * Constructor passed to kmem_cache_create() for bdev_cachep: runs
 * inode_init_once() on the VFS inode embedded in the bdev_inode at @data.
 */
static void init_once(void *data)
{
        struct bdev_inode *ei = data;

        inode_init_once(&ei->vfs_inode);
}

/*
 * ->evict_inode() for the bdev pseudo-fs: drop the remaining page cache
 * pages and buffers of @inode, then clear it.
 */
static void bdev_evict_inode(struct inode *inode)
{
        truncate_inode_pages_final(&inode->i_data);
        invalidate_inode_buffers(inode); /* is it needed here? */
        clear_inode(inode);
}

/*
 * Superblock operations of the bdev pseudo-fs; installed by
 * bd_init_fs_context().
 */
static const struct super_operations bdev_sops = {
        .statfs = simple_statfs,
        .alloc_inode = bdev_alloc_inode,
        .free_inode = bdev_free_inode,
        .drop_inode = generic_delete_inode,
        .evict_inode = bdev_evict_inode,
};

static int bd_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
        if (!ctx)
                return -ENOMEM;
        fc->s_iflags |= SB_I_CGROUPWB;
        ctx->ops = &bdev_sops;
        return 0;
}

/*
 * The "bdev" pseudo filesystem type; registered and mounted by
 * bdev_cache_init().
 */
static struct file_system_type bd_type = {
        .name                = "bdev",
        .init_fs_context = bd_init_fs_context,
        .kill_sb        = kill_anon_super,
};

/*
 * Superblock and mount of the bdev pseudo-fs. Both are set once in
 * bdev_cache_init().
 */
struct super_block *blockdev_superblock __ro_after_init;
struct vfsmount *blockdev_mnt __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);

/*
 * Boot-time setup of the bdev pseudo-fs: create the bdev_inode slab cache,
 * register the filesystem type and mount it. Failure to register or mount
 * panics.
 */
void __init bdev_cache_init(void)
{
        bdev_cachep = kmem_cache_create("bdev_cache",
                                        sizeof(struct bdev_inode), 0,
                                        (SLAB_HWCACHE_ALIGN |
                                         SLAB_RECLAIM_ACCOUNT |
                                         SLAB_ACCOUNT | SLAB_PANIC),
                                        init_once);

        if (register_filesystem(&bd_type))
                panic("Cannot register bdev pseudo-fs");

        blockdev_mnt = kern_mount(&bd_type);
        if (IS_ERR(blockdev_mnt))
                panic("Cannot create bdev pseudo-fs");

        /* For writeback */
        blockdev_superblock = blockdev_mnt->mnt_sb;
}

/*
 * Allocate and initialise a block device for partition number @partno of
 * @disk, backed by a new inode on the bdev pseudo-fs.
 *
 * Returns the new block device, or NULL if the inode or the per-cpu
 * statistics cannot be allocated.
 */
struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
        struct block_device *bdev;
        struct inode *inode;

        inode = new_inode(blockdev_superblock);
        if (!inode)
                return NULL;
        inode->i_mode = S_IFBLK;
        inode->i_rdev = 0;
        inode->i_data.a_ops = &def_blk_aops;
        mapping_set_gfp_mask(&inode->i_data, GFP_USER);

        bdev = I_BDEV(inode);
        mutex_init(&bdev->bd_fsfreeze_mutex);
        spin_lock_init(&bdev->bd_size_lock);
        mutex_init(&bdev->bd_holder_lock);
        /* The flags word starts out holding just the partition number. */
        atomic_set(&bdev->__bd_flags, partno);
        bdev->bd_mapping = &inode->i_data;
        bdev->bd_queue = disk->queue;
        /* Partitions inherit BD_HAS_SUBMIT_BIO from the whole device. */
        if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO))
                bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO);
        bdev->bd_stats = alloc_percpu(struct disk_stats);
        if (!bdev->bd_stats) {
                iput(inode);
                return NULL;
        }
        /*
         * NOTE(review): bd_disk is only set once nothing can fail any more;
         * presumably so that the iput() above never reaches
         * bdev_free_inode() with a bd_disk it would kfree() - confirm.
         */
        bdev->bd_disk = disk;
        return bdev;
}

/*
 * Set the size of @bdev to @sectors sectors, updating both the inode size
 * (in bytes) and bd_nr_sectors under bd_size_lock.
 */
void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
        struct inode *inode = BD_INODE(bdev);
        loff_t bytes = (loff_t)sectors << SECTOR_SHIFT;

        spin_lock(&bdev->bd_size_lock);
        i_size_write(inode, bytes);
        bdev->bd_nr_sectors = sectors;
        spin_unlock(&bdev->bd_size_lock);
}

/*
 * Give @bdev the device number @dev and insert its inode into the inode
 * hash. The device number is also used as the inode number.
 */
void bdev_add(struct block_device *bdev, dev_t dev)
{
        struct inode *inode = BD_INODE(bdev);
        /* Propagate the stable-writes requirement to the mapping. */
        if (bdev_stable_writes(bdev))
                mapping_set_stable_writes(bdev->bd_mapping);
        bdev->bd_dev = dev;
        inode->i_rdev = dev;
        inode->i_ino = dev;
        insert_inode_hash(inode);
}

/* Remove the inode of @bdev from the inode hash (counterpart of bdev_add()). */
void bdev_unhash(struct block_device *bdev)
{
        remove_inode_hash(BD_INODE(bdev));
}

/* Drop a reference on the inode backing @bdev. */
void bdev_drop(struct block_device *bdev)
{
        iput(BD_INODE(bdev));
}

long nr_blockdev_pages(void)
{
        struct inode *inode;
        long ret = 0;

        spin_lock(&blockdev_superblock->s_inode_list_lock);
        list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
                ret += inode->i_mapping->nrpages;
        spin_unlock(&blockdev_superblock->s_inode_list_lock);

        return ret;
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops
 *
 * Test whether @bdev can be claimed by @holder. Must be called with
 * bdev_lock held.
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, void *holder,
                const struct blk_holder_ops *hops)
{
        struct block_device *whole = bdev_whole(bdev);

        lockdep_assert_held(&bdev_lock);

        if (bdev->bd_holder) {
                /* Already held by somebody else. */
                if (bdev->bd_holder != holder)
                        return false;
                /*
                 * The same holder can always re-claim, but it must do so
                 * with the same holder ops.
                 */
                return !WARN_ON_ONCE(bdev->bd_holder_ops != hops);
        }

        /* An unclaimed whole device can always be claimed. */
        if (whole == bdev)
                return true;

        /*
         * @bdev is an unclaimed partition. If the whole device's holder is
         * set to bd_may_claim, some partition on the device is claimed but
         * the whole device itself is not; any other holder blocks the claim.
         */
        if (!whole->bd_holder)
                return true;
        return whole->bd_holder == bd_may_claim;
}

/**
 * bd_prepare_to_claim - claim a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops.
 *
 * Claim @bdev.  This function fails if @bdev is already claimed by another
 * holder and waits if another claiming is in progress. On successful return,
 * the caller has ownership of bd_claiming and bd_holder[s].
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY if it is claimed by another holder,
 * -EINVAL if @holder is NULL.
 */
int bd_prepare_to_claim(struct block_device *bdev, void *holder,
                const struct blk_holder_ops *hops)
{
        struct block_device *whole = bdev_whole(bdev);

        if (WARN_ON_ONCE(!holder))
                return -EINVAL;
retry:
        mutex_lock(&bdev_lock);
        /* if someone else claimed, fail */
        if (!bd_may_claim(bdev, holder, hops)) {
                mutex_unlock(&bdev_lock);
                return -EBUSY;
        }

        /* if claiming is already in progress, wait for it to finish */
        if (whole->bd_claiming) {
                wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
                DEFINE_WAIT(wait);

                /*
                 * Queue ourselves before dropping bdev_lock; the wakeup
                 * comes from bd_clear_claiming(). Everything is re-checked
                 * from scratch afterwards.
                 */
                prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
                mutex_unlock(&bdev_lock);
                schedule();
                finish_wait(wq, &wait);
                goto retry;
        }

        /* yay, all mine */
        whole->bd_claiming = holder;
        mutex_unlock(&bdev_lock);
        return 0;
}
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */

/*
 * End the claiming phase that @holder started on @whole and wake up anybody
 * waiting for it in bd_prepare_to_claim(). Must be called with bdev_lock
 * held; it is a BUG if @holder is not the one currently claiming.
 */
static void bd_clear_claiming(struct block_device *whole, void *holder)
{
        lockdep_assert_held(&bdev_lock);
        /* tell others that we're done */
        BUG_ON(whole->bd_claiming != holder);
        whole->bd_claiming = NULL;
        wake_up_bit(&whole->bd_claiming, 0);
}

/**
 * bd_finish_claiming - finish claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 * @hops: block device holder operations
 *
 * Finish exclusive open of a block device. Mark the device as exclusively
 * open by the holder and wake up all waiters for exclusive open to finish.
 */
static void bd_finish_claiming(struct block_device *bdev, void *holder,
                const struct blk_holder_ops *hops)
{
        struct block_device *whole = bdev_whole(bdev);

        mutex_lock(&bdev_lock);
        BUG_ON(!bd_may_claim(bdev, holder, hops));
        /*
         * Note that for a whole device bd_holders will be incremented twice,
         * and bd_holder will be set to bd_may_claim before being set to holder
         */
        whole->bd_holders++;
        whole->bd_holder = bd_may_claim;
        bdev->bd_holders++;
        /* The holder and its ops are published together under bd_holder_lock. */
        mutex_lock(&bdev->bd_holder_lock);
        bdev->bd_holder = holder;
        bdev->bd_holder_ops = hops;
        mutex_unlock(&bdev->bd_holder_lock);
        bd_clear_claiming(whole, holder);
        mutex_unlock(&bdev_lock);
}

/**
 * bd_abort_claiming - abort claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Undo bd_prepare_to_claim() when the exclusive open failed. It can also be
 * used when no exclusive open is actually wanted and the caller only needed
 * to keep other exclusive openers away for a while.
 */
void bd_abort_claiming(struct block_device *bdev, void *holder)
{
        struct block_device *whole = bdev_whole(bdev);

        mutex_lock(&bdev_lock);
        bd_clear_claiming(whole, holder);
        mutex_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);

/*
 * Release one claim that @holder has on @bdev: drop the holder counts of
 * @bdev and of its whole device, and clear the holder fields of whichever
 * of them has no holders left.
 */
static void bd_end_claim(struct block_device *bdev, void *holder)
{
        struct block_device *whole = bdev_whole(bdev);
        bool unblock = false;

        /*
         * Release a claim on the device.  The holder fields are protected with
         * bdev_lock.  open_mutex is used to synchronize disk_holder unlinking.
         */
        mutex_lock(&bdev_lock);
        WARN_ON_ONCE(bdev->bd_holder != holder);
        /* Note: the decrements happen inside the WARN_ON_ONCE() arguments. */
        WARN_ON_ONCE(--bdev->bd_holders < 0);
        WARN_ON_ONCE(--whole->bd_holders < 0);
        if (!bdev->bd_holders) {
                mutex_lock(&bdev->bd_holder_lock);
                bdev->bd_holder = NULL;
                bdev->bd_holder_ops = NULL;
                mutex_unlock(&bdev->bd_holder_lock);
                if (bdev_test_flag(bdev, BD_WRITE_HOLDER))
                        unblock = true;
        }
        if (!whole->bd_holders)
                whole->bd_holder = NULL;
        mutex_unlock(&bdev_lock);

        /*
         * If this was the last claim, remove holder link and unblock evpoll if
         * it was a write holder.
         */
        if (unblock) {
                disk_unblock_events(bdev->bd_disk);
                bdev_clear_flag(bdev, BD_WRITE_HOLDER);
        }
}

/*
 * Write out the dirty data of @bdev, drop all of its buffers and page cache,
 * then write back its inode. Warns (once) if @bdev still has holders.
 */
static void blkdev_flush_mapping(struct block_device *bdev)
{
        WARN_ON_ONCE(bdev->bd_holders);
        sync_blockdev(bdev);
        kill_bdev(bdev);
        bdev_write_inode(bdev);
}

static void blkdev_put_whole(struct block_device *bdev)
{
        if (atomic_dec_and_test(&bdev->bd_openers))
                blkdev_flush_mapping(bdev);
        if (bdev->bd_disk->fops->release)
                bdev->bd_disk->fops->release(bdev->bd_disk);
}

/*
 * Open the whole-device @bdev with @mode: call the driver's optional
 * ->open(), account the new opener and rescan partitions if the disk asks
 * for it.
 *
 * Returns 0 on success or a negative errno.  A partition scan failure is only
 * reported when @mode contains BLK_OPEN_STRICT_SCAN.
 */
static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
{
        struct gendisk *disk = bdev->bd_disk;
        int err;

        if (disk->fops->open) {
                err = disk->fops->open(disk, mode);
                /* avoid ghost partitions on a removed medium */
                if (err == -ENOMEDIUM &&
                    test_bit(GD_NEED_PART_SCAN, &disk->state))
                        bdev_disk_changed(disk, true);
                if (err)
                        return err;
        }

        /* The first opener sets up the initial block size. */
        if (atomic_read(&bdev->bd_openers) == 0)
                set_init_blocksize(bdev);
        atomic_inc(&bdev->bd_openers);

        if (!test_bit(GD_NEED_PART_SCAN, &disk->state))
                return 0;

        /*
         * Only return scanning errors if we are called from contexts
         * that explicitly want them, e.g. the BLKRRPART ioctl.
         */
        err = bdev_disk_changed(disk, false);
        if (!err || !(mode & BLK_OPEN_STRICT_SCAN))
                return 0;

        /* Strict scan failed: undo the opener accounting done above. */
        blkdev_put_whole(bdev);
        return err;
}

/*
 * Open partition @part with @mode.  The whole device is opened first; a
 * partition with zero sectors is refused with -ENXIO and the whole-device
 * open is undone again.
 *
 * Returns 0 on success or a negative errno.
 */
static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
{
        struct block_device *whole = bdev_whole(part);
        struct gendisk *disk = part->bd_disk;
        int err;

        err = blkdev_get_whole(whole, mode);
        if (err)
                return err;

        if (!bdev_nr_sectors(part)) {
                blkdev_put_whole(whole);
                return -ENXIO;
        }

        /* First opener of this partition: count it and set the block size. */
        if (atomic_read(&part->bd_openers) == 0) {
                disk->open_partitions++;
                set_init_blocksize(part);
        }
        atomic_inc(&part->bd_openers);
        return 0;
}

/*
 * Check whether opening block device @dev with @mode and @holder is allowed.
 *
 * Returns 0 when permitted, the error from the device cgroup check when that
 * check refuses access, or -EINVAL for an invalid @mode/@holder combination.
 */
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
{
        int access = 0;
        int err;

        /* Translate the requested open mode into device cgroup access bits. */
        if (mode & BLK_OPEN_READ)
                access |= DEVCG_ACC_READ;
        if (mode & BLK_OPEN_WRITE)
                access |= DEVCG_ACC_WRITE;

        err = devcgroup_check_permission(DEVCG_DEV_BLOCK,
                        MAJOR(dev), MINOR(dev), access);
        if (err)
                return err;

        /* Blocking writes requires exclusive opener */
        if (!holder && (mode & BLK_OPEN_RESTRICT_WRITES))
                return -EINVAL;

        /*
         * Error pointers stored in place of the holder are how ->release()
         * learns that the open failed, so they can never be accepted as a
         * real holder.
         */
        if (WARN_ON_ONCE(IS_ERR(holder)))
                return -EINVAL;

        return 0;
}

static void blkdev_put_part(struct block_device *part)
{
        struct block_device *whole = bdev_whole(part);

        if (atomic_dec_and_test(&part->bd_openers)) {
                blkdev_flush_mapping(part);
                whole->bd_disk->open_partitions--;
        }
        blkdev_put_whole(whole);
}

/*
 * Look up the block device for @dev without opening it.
 *
 * Returns the block device with a reference on its bd_device held, or NULL if
 * no inode exists for @dev or the device reference could not be obtained.
 * With CONFIG_BLOCK_LEGACY_AUTOLOAD, a failed lookup is retried once after
 * requesting the module for @dev.
 */
struct block_device *blkdev_get_no_open(dev_t dev)
{
        struct inode *inode = ilookup(blockdev_superblock, dev);
        struct block_device *bdev = NULL;

        if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
                blk_request_module(dev);
                inode = ilookup(blockdev_superblock, dev);
                if (inode)
                        pr_warn_ratelimited(
"block device autoloading is deprecated and will be removed.\n");
        }

        if (inode) {
                struct block_device *found = &BDEV_I(inode)->bdev;

                /* switch from the inode reference to a device model one: */
                if (kobject_get_unless_zero(&found->bd_device.kobj))
                        bdev = found;
                iput(inode);
        }
        return bdev;
}

/*
 * Drop a reference on @bdev's embedded device (bd_device), such as the one
 * blkdev_get_no_open() returns with.
 */
void blkdev_put_no_open(struct block_device *bdev)
{
        put_device(&bdev->bd_device);
}

/* Writes to @bdev count as blocked while its bd_writers counter is negative. */
static bool bdev_writes_blocked(struct block_device *bdev)
{
        return bdev->bd_writers < 0;
}

/*
 * Register one write-blocking opener by decrementing bd_writers; this is the
 * counterpart of bdev_unblock_writes().
 */
static void bdev_block_writes(struct block_device *bdev)
{
        bdev->bd_writers--;
}

/*
 * Undo one bdev_block_writes() by incrementing bd_writers back towards zero.
 */
static void bdev_unblock_writes(struct block_device *bdev)
{
        bdev->bd_writers++;
}

/*
 * Decide whether @bdev may be opened with @mode given its current write
 * accounting.  Always allowed when bdev_allow_write_mounted is set.
 */
static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
{
        if (bdev_allow_write_mounted)
                return true;

        /* A writer cannot come in while writes are blocked ... */
        if ((mode & BLK_OPEN_WRITE) && bdev_writes_blocked(bdev))
                return false;

        /* ... and writes cannot be restricted while writers exist. */
        return !((mode & BLK_OPEN_RESTRICT_WRITES) && bdev->bd_writers > 0);
}

/*
 * Account the write access implied by @mode on @bdev: a write-restricting
 * open blocks writes, a plain writable open is counted as a writer.  Nothing
 * is tracked when bdev_allow_write_mounted is set.
 */
static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
{
        if (bdev_allow_write_mounted)
                return;

        /* Restricting writes takes precedence over plain write access. */
        if (mode & BLK_OPEN_RESTRICT_WRITES) {
                bdev_block_writes(bdev);
                return;
        }

        if (mode & BLK_OPEN_WRITE)
                bdev->bd_writers++;
}

/*
 * True if @bdev_file's private_data holds the sentinel value
 * BDEV_I(f_mapping->host), which bdev_fput() stores once the claim and write
 * access have already been given up.
 */
static inline bool bdev_unclaimed(const struct file *bdev_file)
{
        return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host);
}

/*
 * Give back the write access that @bdev_file accounted when it was opened.
 * Nothing is done when write tracking is disabled through
 * bdev_allow_write_mounted or when the file has already yielded its claim.
 */
static void bdev_yield_write_access(struct file *bdev_file)
{
        struct block_device *bdev;

        if (bdev_allow_write_mounted || bdev_unclaimed(bdev_file))
                return;

        bdev = file_bdev(bdev_file);

        /* Mirror of the accounting done in bdev_claim_write_access(). */
        if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED) {
                bdev_unblock_writes(bdev);
                return;
        }

        if (bdev_file->f_mode & FMODE_WRITE)
                bdev->bd_writers--;
}

/**
 * bdev_open - open a block device
 * @bdev: block device to open
 * @mode: open mode (BLK_OPEN_*)
 * @holder: exclusive holder identifier
 * @hops: holder operations
 * @bdev_file: file for the block device
 *
 * Open the block device. If @holder is not %NULL, the block device is opened
 * with exclusive access.  Exclusive opens may nest for the same @holder.
 *
 * On success @bdev_file is set up for the device: its mapping, writeback
 * error sample, mode flags and private_data (set to @holder) are filled in.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * zero on success, -errno on failure.
 */
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
              const struct blk_holder_ops *hops, struct file *bdev_file)
{
        bool unblock_events = true;
        struct gendisk *disk = bdev->bd_disk;
        int ret;

        /*
         * A non-NULL holder always means an exclusive open.  The claim is
         * started here and either finished or aborted below.
         */
        if (holder) {
                mode |= BLK_OPEN_EXCL;
                ret = bd_prepare_to_claim(bdev, holder, hops);
                if (ret)
                        return ret;
        } else {
                /* BLK_OPEN_EXCL without a holder is a caller bug. */
                if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
                        return -EIO;
        }

        /*
         * Events stay blocked while we open.  They are unblocked again on
         * every exit path except when a write holder keeps them blocked
         * (unblock_events == false below).
         */
        disk_block_events(disk);

        mutex_lock(&disk->open_mutex);
        ret = -ENXIO;
        if (!disk_live(disk))
                goto abort_claiming;
        if (!try_module_get(disk->fops->owner))
                goto abort_claiming;
        ret = -EBUSY;
        if (!bdev_may_open(bdev, mode))
                goto put_module;
        if (bdev_is_partition(bdev))
                ret = blkdev_get_part(bdev, mode);
        else
                ret = blkdev_get_whole(bdev, mode);
        if (ret)
                goto put_module;
        bdev_claim_write_access(bdev, mode);
        if (holder) {
                bd_finish_claiming(bdev, holder, hops);

                /*
                 * Block event polling for write claims if requested.  Any write
                 * holder makes the write_holder state stick until all are
                 * released.  This is good enough and tracking individual
                 * writeable reference is too fragile given the way @mode is
                 * used in blkdev_get/put().
                 */
                if ((mode & BLK_OPEN_WRITE) &&
                    !bdev_test_flag(bdev, BD_WRITE_HOLDER) &&
                    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
                        bdev_set_flag(bdev, BD_WRITE_HOLDER);
                        unblock_events = false;
                }
        }
        mutex_unlock(&disk->open_mutex);

        if (unblock_events)
                disk_unblock_events(disk);

        /* The device is open; publish the result in @bdev_file. */
        bdev_file->f_flags |= O_LARGEFILE;
        bdev_file->f_mode |= FMODE_CAN_ODIRECT;
        if (bdev_nowait(bdev))
                bdev_file->f_mode |= FMODE_NOWAIT;
        if (mode & BLK_OPEN_RESTRICT_WRITES)
                bdev_file->f_mode |= FMODE_WRITE_RESTRICTED;
        bdev_file->f_mapping = bdev->bd_mapping;
        bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
        bdev_file->private_data = holder;

        return 0;
put_module:
        /* Drop the module reference taken by try_module_get() above. */
        module_put(disk->fops->owner);
abort_claiming:
        /* Undo bd_prepare_to_claim() for exclusive opens. */
        if (holder)
                bd_abort_claiming(bdev, holder);
        mutex_unlock(&disk->open_mutex);
        disk_unblock_events(disk);
        return ret;
}

/*
 * Translate a BLK_OPEN_* @mode into the matching O_* open flags.
 *
 * BLK_OPEN_WRITE_IOCTL is a historical quirk of the floppy driver, which
 * allows ioctls on a file opened for writing without allowing reads or
 * writes.  That quirk has to show up in the returned flags as
 * O_RDWR | O_WRONLY.
 *
 * It can also happen if a block device is opened as O_RDWR | O_WRONLY.
 */
static unsigned blk_to_file_flags(blk_mode_t mode)
{
        const blk_mode_t rdwr = BLK_OPEN_READ | BLK_OPEN_WRITE;
        unsigned int flags;

        if ((mode & rdwr) == rdwr) {
                flags = O_RDWR;
        } else if (mode & BLK_OPEN_WRITE_IOCTL) {
                flags = O_RDWR | O_WRONLY;
        } else if (mode & BLK_OPEN_WRITE) {
                flags = O_WRONLY;
        } else if (mode & BLK_OPEN_READ) {
                flags = O_RDONLY; /* homeopathic, because O_RDONLY is 0 */
        } else {
                /* No access mode bit at all is unexpected. */
                WARN_ON_ONCE(true);
                flags = 0;
        }

        if (mode & BLK_OPEN_NDELAY)
                flags |= O_NDELAY;

        return flags;
}

/*
 * Open the block device identified by @dev and return a file for it.
 *
 * @mode, @holder and @hops are passed on to bdev_permission() and
 * bdev_open().  Returns the opened file or an ERR_PTR(): the error from the
 * permission check, -ENXIO if the device cannot be found, the file allocation
 * error, or the error from bdev_open().
 */
struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
                                   const struct blk_holder_ops *hops)
{
        struct block_device *bdev;
        struct file *bdev_file;
        int err;

        err = bdev_permission(dev, mode, holder);
        if (err)
                return ERR_PTR(err);

        bdev = blkdev_get_no_open(dev);
        if (!bdev)
                return ERR_PTR(-ENXIO);

        bdev_file = alloc_file_pseudo_noaccount(BD_INODE(bdev),
                        blockdev_mnt, "",
                        blk_to_file_flags(mode) | O_LARGEFILE, &def_blk_fops);
        if (IS_ERR(bdev_file)) {
                blkdev_put_no_open(bdev);
                return bdev_file;
        }
        ihold(BD_INODE(bdev));

        err = bdev_open(bdev, mode, holder, hops, bdev_file);
        if (!err)
                return bdev_file;

        /* We failed to open the block device. Let ->release() know. */
        bdev_file->private_data = ERR_PTR(err);
        fput(bdev_file);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(bdev_file_open_by_dev);

/*
 * Open the block device found at @path and return a file for it.
 *
 * Works like bdev_file_open_by_dev() once @path has been resolved to a
 * dev_t.  In addition, a writable open (BLK_OPEN_WRITE) of a read-only
 * device is refused with -EACCES.
 */
struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
                                    void *holder,
                                    const struct blk_holder_ops *hops)
{
        struct file *file;
        dev_t dev;
        int err;

        err = lookup_bdev(path, &dev);
        if (err)
                return ERR_PTR(err);

        file = bdev_file_open_by_dev(dev, mode, holder, hops);
        if (IS_ERR(file) || !(mode & BLK_OPEN_WRITE))
                return file;

        if (!bdev_read_only(file_bdev(file)))
                return file;

        /* Asked for write access on a read-only device: close it again. */
        fput(file);
        return ERR_PTR(-EACCES);
}
EXPORT_SYMBOL(bdev_file_open_by_path);

/*
 * End the claim that @bdev_file holds on its block device, unless it has
 * already been yielded.  The disk's open_mutex must be held.  A NULL or
 * error-pointer holder is unexpected here and only triggers a warning.
 */
static inline void bd_yield_claim(struct file *bdev_file)
{
        struct block_device *bdev = file_bdev(bdev_file);
        void *holder = bdev_file->private_data;

        lockdep_assert_held(&bdev->bd_disk->open_mutex);

        if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder)))
                return;

        /* Nothing left to release once the claim has been given up. */
        if (bdev_unclaimed(bdev_file))
                return;

        bd_end_claim(bdev, holder);
}

/*
 * bdev_release - undo the open of a block device file
 * @bdev_file: file that is being released
 *
 * If the open had failed (private_data holds an error pointer), only the
 * device reference is dropped.  Otherwise write access and any claim are
 * yielded, the opener accounting is dropped under open_mutex, and the module
 * and device references are released.
 */
void bdev_release(struct file *bdev_file)
{
        struct block_device *bdev = file_bdev(bdev_file);
        void *holder = bdev_file->private_data;
        struct gendisk *disk = bdev->bd_disk;

        /* We failed to open that block device. */
        if (IS_ERR(holder))
                goto put_no_open;

        /*
         * Sync early if it looks like we're the last one.  If someone else
         * opens the block device between now and the decrement of bd_openers
         * then we did a sync that we didn't need to, but that's not the end
         * of the world and we want to avoid long (could be several minute)
         * syncs while holding the mutex.
         */
        if (atomic_read(&bdev->bd_openers) == 1)
                sync_blockdev(bdev);

        mutex_lock(&disk->open_mutex);
        bdev_yield_write_access(bdev_file);

        /* A NULL holder means the open was not exclusive; nothing to yield. */
        if (holder)
                bd_yield_claim(bdev_file);

        /*
         * Trigger event checking and tell drivers to flush MEDIA_CHANGE
         * event.  This is to ensure detection of media removal commanded
         * from userland - e.g. eject(1).
         */
        disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);

        if (bdev_is_partition(bdev))
                blkdev_put_part(bdev);
        else
                blkdev_put_whole(bdev);
        mutex_unlock(&disk->open_mutex);

        /* Pairs with the try_module_get() done when the device was opened. */
        module_put(disk->fops->owner);
put_no_open:
        blkdev_put_no_open(bdev);
}

/**
 * bdev_fput - yield claim to the block device and put the file
 * @bdev_file: open block device
 *
 * Yield claim on the block device and put the file. Ensure that the
 * block device can be reclaimed before the file is closed which is a
 * deferred operation.
 *
 * A file whose f_op is not def_blk_fops is rejected with a one-time warning
 * and is not put.
 */
void bdev_fput(struct file *bdev_file)
{
        if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
                return;

        /* Only files with a non-NULL private_data have anything to yield. */
        if (bdev_file->private_data) {
                struct block_device *bdev = file_bdev(bdev_file);
                struct gendisk *disk = bdev->bd_disk;

                mutex_lock(&disk->open_mutex);
                bdev_yield_write_access(bdev_file);
                bd_yield_claim(bdev_file);
                /*
                 * Tell release we already gave up our hold on the
                 * device and if write restrictions are available that
                 * we already gave up write access to the device.
                 */
                bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
                mutex_unlock(&disk->open_mutex);
        }

        fput(bdev_file);
}
EXPORT_SYMBOL(bdev_fput);

/**
 * lookup_bdev() - Look up a struct block_device by name.
 * @pathname: Name of the block device in the filesystem.
 * @dev: Pointer to the block device's dev_t, if found.
 *
 * Resolve @pathname (following symlinks) in the current namespace and, if
 * it names a block device node that may be opened, store its device number
 * in @dev.  @dev is left untouched on failure.
 *
 * Context: May sleep.
 * Return: 0 if succeeded, negative errno otherwise: -EINVAL for a NULL or
 * empty @pathname, the path lookup error, -ENOTBLK if the path is not a
 * block device node, or -EACCES if device nodes may not be opened there.
 */
int lookup_bdev(const char *pathname, dev_t *dev)
{
        struct inode *inode;
        struct path path;
        int error;

        if (!pathname || !*pathname)
                return -EINVAL;

        error = kern_path(pathname, LOOKUP_FOLLOW, &path);
        if (error)
                return error;

        /* error is 0 from here on unless one of the checks fails. */
        inode = d_backing_inode(path.dentry);
        if (!S_ISBLK(inode->i_mode))
                error = -ENOTBLK;
        else if (!may_open_dev(&path))
                error = -EACCES;
        else
                *dev = inode->i_rdev;

        path_put(&path);
        return error;
}
EXPORT_SYMBOL(lookup_bdev);

/**
 * bdev_mark_dead - mark a block device as dead
 * @bdev: block device to operate on
 * @surprise: indicate a surprise removal
 *
 * Tell the file system that this device or media is dead.  If @surprise is set
 * to %true the device or media is already gone, if not we are preparing for an
 * orderly removal.
 *
 * This calls into the file system, which then typically syncs out all dirty
 * data and writes back inodes and then invalidates any cached data in the
 * inodes on the file system.  In addition we also invalidate the block device
 * mapping.
 *
 * If the holder provides no ->mark_dead() operation, the block device is
 * synced here instead.
 */
void bdev_mark_dead(struct block_device *bdev, bool surprise)
{
        mutex_lock(&bdev->bd_holder_lock);
        /*
         * NOTE(review): bd_holder_lock is only dropped here in the else
         * branch; the ->mark_dead() callback is presumably expected to
         * release it itself - confirm against the blk_holder_ops contract.
         */
        if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
                bdev->bd_holder_ops->mark_dead(bdev, surprise);
        else {
                mutex_unlock(&bdev->bd_holder_lock);
                sync_blockdev(bdev);
        }

        invalidate_bdev(bdev);
}
/*
 * New drivers should not use this directly.  There are some drivers however
 * that need this for historical reasons. For example, the DASD driver has
 * historically had a shutdown to offline mode that doesn't actually remove the
 * gendisk that otherwise looks a lot like a safe device removal.
 */
EXPORT_SYMBOL_GPL(bdev_mark_dead);

/*
 * sync_bdevs - write back or wait on the page cache of all open block devices
 * @wait: if true, wait on each mapping with filemap_fdatawait_keep_errors();
 *        if false, call filemap_fdatawrite() on each mapping instead
 *
 * Walks all inodes of blockdev_superblock.  Inodes that are new or being
 * freed, inodes without cached pages and block devices without openers are
 * skipped.
 */
void sync_bdevs(bool wait)
{
        struct inode *inode, *old_inode = NULL;

        spin_lock(&blockdev_superblock->s_inode_list_lock);
        list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
                struct address_space *mapping = inode->i_mapping;
                struct block_device *bdev;

                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
                    mapping->nrpages == 0) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                /* Pin the inode so the list lock can be dropped below. */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&blockdev_superblock->s_inode_list_lock);
                /*
                 * We hold a reference to 'inode' so it couldn't have been
                 * removed from s_inodes list while we dropped the
                 * s_inode_list_lock.  We cannot iput the inode now as we can
                 * be holding the last reference and we cannot iput it under
                 * s_inode_list_lock. So we keep the reference and iput it
                 * later.
                 */
                iput(old_inode);
                old_inode = inode;
                bdev = I_BDEV(inode);

                mutex_lock(&bdev->bd_disk->open_mutex);
                if (!atomic_read(&bdev->bd_openers)) {
                        ; /* skip */
                } else if (wait) {
                        /*
                         * We keep the error status of individual mapping so
                         * that applications can catch the writeback error using
                         * fsync(2). See filemap_fdatawait_keep_errors() for
                         * details.
                         */
                        filemap_fdatawait_keep_errors(inode->i_mapping);
                } else {
                        filemap_fdatawrite(inode->i_mapping);
                }
                mutex_unlock(&bdev->bd_disk->open_mutex);

                /* Retake the list lock before advancing to the next inode. */
                spin_lock(&blockdev_superblock->s_inode_list_lock);
        }
        spin_unlock(&blockdev_superblock->s_inode_list_lock);
        /* Drop the reference kept on the last inode that was processed. */
        iput(old_inode);
}

/*
 * Fill in the STATX_DIOALIGN fields of @stat for a block device node.
 *
 * @inode is the inode of the device node file, not the block device's own
 * internal inode, so I_BDEV() must *not* be used on it; the block device is
 * looked up through i_rdev instead.  If no block device can be found, @stat
 * is left unchanged.
 */
void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
{
        struct block_device *bdev = blkdev_get_no_open(inode->i_rdev);

        if (!bdev)
                return;

        stat->dio_offset_align = bdev_logical_block_size(bdev);
        stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
        stat->result_mask |= STATX_DIOALIGN;

        /* Drop the reference taken by the lookup above. */
        blkdev_put_no_open(bdev);
}

/*
 * A disk is considered live while the inode of its part0 block device is
 * still hashed.
 */
bool disk_live(struct gendisk *disk)
{
        return !inode_unhashed(BD_INODE(disk->part0));
}
EXPORT_SYMBOL_GPL(disk_live);

/*
 * Return the block size of @bdev in bytes, computed as two to the power of
 * the i_blkbits value of the block device's inode.
 */
unsigned int block_size(struct block_device *bdev)
{
        return 1 << BD_INODE(bdev)->i_blkbits;
}
EXPORT_SYMBOL_GPL(block_size);

/*
 * Parse the "bdev_allow_write_mounted=" kernel command line option as a
 * boolean into bdev_allow_write_mounted.  A value that cannot be parsed is
 * reported with a warning; 1 is returned in either case.
 */
static int __init setup_bdev_allow_write_mounted(char *str)
{
        int err = kstrtobool(str, &bdev_allow_write_mounted);

        if (err)
                pr_warn("Invalid option string for bdev_allow_write_mounted:"
                        " '%s'\n", str);
        return 1;
}
__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);





























































































































































































































































































   30 















    3 


















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Sleepable Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright (C) IBM Corporation, 2006
 * Copyright (C) Fujitsu, 2012
 *
 * Author: Paul McKenney <paulmck@linux.ibm.com>
 *           Lai Jiangshan <laijs@cn.fujitsu.com>
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *                Documentation/RCU/ *.txt
 *
 */

#ifndef _LINUX_SRCU_H
#define _LINUX_SRCU_H

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/rcu_segcblist.h>

struct srcu_struct;

#ifdef CONFIG_DEBUG_LOCK_ALLOC

int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
                       struct lock_class_key *key);

/*
 * With CONFIG_DEBUG_LOCK_ALLOC, init_srcu_struct() is a macro so that every
 * call site gets its own static lock_class_key; the stringified argument is
 * passed along as the name.  It evaluates to the return value of
 * __init_srcu_struct().
 */
#define init_srcu_struct(ssp) \
({ \
        static struct lock_class_key __srcu_key; \
        \
        __init_srcu_struct((ssp), #ssp, &__srcu_key); \
})

/* Initializer fragment that names the dep_map of a srcu_struct. */
#define __SRCU_DEP_MAP_INIT(srcu_name)        .dep_map = { .name = #srcu_name },
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

int init_srcu_struct(struct srcu_struct *ssp);

/* No dep_map initializer is emitted without CONFIG_DEBUG_LOCK_ALLOC. */
#define __SRCU_DEP_MAP_INIT(srcu_name)
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

#ifdef CONFIG_TINY_SRCU
#include <linux/srcutiny.h>
#elif defined(CONFIG_TREE_SRCU)
#include <linux/srcutree.h>
#else
#error "Unknown SRCU implementation specified to kernel configuration"
#endif

void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
                void (*func)(struct rcu_head *head));
void cleanup_srcu_struct(struct srcu_struct *ssp);
int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
void synchronize_srcu(struct srcu_struct *ssp);
unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);

/*
 * NMI-safe read-side primitives.  They are separate out-of-line functions
 * only with CONFIG_NEED_SRCU_NMI_SAFE; otherwise they simply forward to
 * __srcu_read_lock() and __srcu_read_unlock().
 */
#ifdef CONFIG_NEED_SRCU_NMI_SAFE
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp);
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp);
#else
static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
{
        return __srcu_read_lock(ssp);
}
static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
{
        __srcu_read_unlock(ssp, idx);
}
#endif /* CONFIG_NEED_SRCU_NMI_SAFE */

void srcu_init(void);

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/**
 * srcu_read_lock_held - might we be in SRCU read-side critical section?
 * @ssp: The srcu_struct structure to check
 *
 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU
 * read-side critical section.  In the absence of CONFIG_DEBUG_LOCK_ALLOC,
 * this assumes we are in an SRCU read-side critical section unless it can
 * prove otherwise.
 *
 * debug_lockdep_rcu_enabled() is consulted first, so that no false positives
 * are reported during boot or while lockdep is disabled; in that case 1 is
 * returned.
 *
 * Note that SRCU is based on its own state machine and does not rely on
 * normal RCU, so this can be called from a CPU that is in the idle loop
 * from an RCU point of view, or that is offline.
 */
static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
{
        return debug_lockdep_rcu_enabled() ? lock_is_held(&ssp->dep_map) : 1;
}

/*
 * Annotations provide deadlock detection for SRCU.
 *
 * Similar to other lockdep annotations, except there is an additional
 * srcu_lock_sync(), which is basically an empty *write*-side critical section,
 * see lock_sync() for more information.
 */

/*
 * Annotates a srcu_read_lock(): reports a read acquisition of @map to
 * lockdep through lock_map_acquire_read().
 */
static inline void srcu_lock_acquire(struct lockdep_map *map)
{
        lock_map_acquire_read(map);
}

/* Annotates a srcu_read_unlock() */
static inline void srcu_lock_release(struct lockdep_map *map)
{
        lock_map_release(map);
}

/* Annotates a synchronize_srcu() by calling lock_map_sync() on @map. */
static inline void srcu_lock_sync(struct lockdep_map *map)
{
        lock_map_sync(map);
}

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/*
 * Without CONFIG_DEBUG_LOCK_ALLOC there is no lockdep state to consult, so
 * always claim to be inside an SRCU read-side critical section.
 */
static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
{
        return 1;
}

/* The lockdep annotations are no-ops without CONFIG_DEBUG_LOCK_ALLOC. */
#define srcu_lock_acquire(m) do { } while (0)
#define srcu_lock_release(m) do { } while (0)
#define srcu_lock_sync(m) do { } while (0)

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/*
 * NOTE(review): presumably the NMI-safety states that
 * srcu_check_nmi_safety() tracks for a srcu_struct - confirm against the
 * Tree SRCU implementation.
 */
#define SRCU_NMI_UNKNOWN        0x0
#define SRCU_NMI_UNSAFE                0x1
#define SRCU_NMI_SAFE                0x2

/*
 * srcu_check_nmi_safety() is only a real function when both CONFIG_PROVE_RCU
 * and CONFIG_TREE_SRCU are enabled; otherwise it is an empty inline stub.
 */
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU)
void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe);
#else
static inline void srcu_check_nmi_safety(struct srcu_struct *ssp,
                                         bool nmi_safe) { }
#endif


/**
 * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 * @c: condition to check for update-side use
 *
 * If PROVE_RCU is enabled, invoking this outside of an SRCU read-side
 * critical section will result in an RCU-lockdep splat, unless @c evaluates
 * to true.  The @c argument will normally be a logical expression containing
 * lockdep_is_held() calls.
 *
 * The condition handed to __rcu_dereference_check() is
 * "@c || srcu_read_lock_held(@ssp)".
 */
#define srcu_dereference_check(p, ssp, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || srcu_read_lock_held(ssp), __rcu)

/**
 * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 *
 * Makes srcu_dereference_check() do the dirty work, with a condition of 0 so
 * that only srcu_read_lock_held(@ssp) can satisfy the check.  If PROVE_RCU
 * is enabled, invoking this outside of an SRCU read-side critical
 * section will result in an RCU-lockdep splat.
 */
#define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0)

/**
 * srcu_dereference_notrace - no tracing and no lockdep calls from here
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 *
 * Passes a constant-true condition to srcu_dereference_check(), so the
 * "(c) || srcu_read_lock_held(ssp)" test short-circuits and
 * srcu_read_lock_held() is never evaluated.
 */
#define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1)

/**
 * srcu_read_lock - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter an SRCU read-side critical section and return the index that must
 * later be handed to the matching srcu_read_unlock().
 *
 * SRCU read-side critical sections may be nested.  It is, however, illegal
 * to call anything that waits on an SRCU grace period for the same
 * srcu_struct, whether directly or indirectly.  One indirect way of waiting
 * on an SRCU grace period is to acquire a mutex that is held elsewhere
 * while calling synchronize_srcu() or synchronize_srcu_expedited().
 *
 * srcu_read_lock() and the matching srcu_read_unlock() must occur in the
 * same context: for example, it is illegal to invoke srcu_read_unlock() in
 * an irq handler if the matching srcu_read_lock() was invoked in process
 * context.
 */
static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
{
        int idx;

        srcu_check_nmi_safety(ssp, false);
        idx = __srcu_read_lock(ssp);
        /* Tell lockdep only after the reader has actually been registered. */
        srcu_lock_acquire(&ssp->dep_map);
        return idx;
}

/**
 * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * NMI-safe counterpart of srcu_read_lock(): enter an SRCU read-side critical
 * section and return the index to pass to srcu_read_unlock_nmisafe().
 * See srcu_read_lock() for more information.
 */
static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp)
{
        int idx;

        srcu_check_nmi_safety(ssp, true);
        idx = __srcu_read_lock_nmisafe(ssp);
        rcu_try_lock_acquire(&ssp->dep_map);
        return idx;
}

/*
 * Variant of srcu_read_lock() for use by tracing: it cannot be traced and
 * makes no lockdep annotation.  Returns the index for the matching unlock.
 */
static inline notrace int
srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
{
        srcu_check_nmi_safety(ssp, false);
        return __srcu_read_lock(ssp);
}

/**
 * srcu_down_read - register a new reader for an SRCU-protected structure.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter a semaphore-like SRCU read-side critical section.  Note that
 * SRCU read-side critical sections may be nested.  However, it is
 * illegal to call anything that waits on an SRCU grace period for the
 * same srcu_struct, whether directly or indirectly.  Please note that
 * one way to indirectly wait on an SRCU grace period is to acquire
 * a mutex that is held elsewhere while calling synchronize_srcu() or
 * synchronize_srcu_expedited().  But if you want lockdep to help you
 * keep this stuff straight, you should instead use srcu_read_lock().
 *
 * The semaphore-like nature of srcu_down_read() means that the matching
 * srcu_up_read() can be invoked from some other context, for example,
 * from some other task or from an irq handler.  However, neither
 * srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler.
 *
 * Calls to srcu_down_read() may be nested, similar to the manner in
 * which calls to down_read() may be nested.
 *
 * Returns the index to pass to the matching srcu_up_read().
 */
static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp)
{
        /* Use from NMI context is not allowed; warn once if it happens. */
        WARN_ON_ONCE(in_nmi());
        srcu_check_nmi_safety(ssp, false);
        /* Unlike srcu_read_lock(), no lockdep annotation is made here. */
        return __srcu_read_lock(ssp);
}

/**
 * srcu_read_unlock - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock().
 *
 * Exit an SRCU read-side critical section.
 */
static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* A valid index is either 0 or 1; anything else is warned about. */
        WARN_ON_ONCE(idx & ~0x1);
        srcu_check_nmi_safety(ssp, false);
        srcu_lock_release(&ssp->dep_map);
        __srcu_read_unlock(ssp, idx);
}

/**
 * srcu_read_unlock_nmisafe - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock_nmisafe().
 *
 * Exit an SRCU read-side critical section, but in an NMI-safe manner.
 */
static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Warn once if @idx has any bit other than bit 0 set. */
        WARN_ON_ONCE(idx & ~0x1);
        /* Second argument is true here, unlike the non-NMI-safe variants. */
        srcu_check_nmi_safety(ssp, true);
        rcu_lock_release(&ssp->dep_map);
        __srcu_read_unlock_nmisafe(ssp, idx);
}

/*
 * Used by tracing, cannot be traced and cannot call lockdep.
 *
 * Exits the critical section identified by @idx via __srcu_read_unlock()
 * without issuing a lockdep release annotation.  Unlike srcu_read_unlock(),
 * @idx is not range-checked here.
 */
static inline notrace void
srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp)
{
        srcu_check_nmi_safety(ssp, false);
        __srcu_read_unlock(ssp, idx);
}

/**
 * srcu_up_read - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_down_read().
 *
 * Exit an SRCU read-side critical section, but not necessarily from
 * the same context as the matching srcu_down_read().
 */
static inline void srcu_up_read(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Warn once if @idx has any bit other than bit 0 set. */
        WARN_ON_ONCE(idx & ~0x1);
        /* Warn once when called from NMI context. */
        WARN_ON_ONCE(in_nmi());
        srcu_check_nmi_safety(ssp, false);
        __srcu_read_unlock(ssp, idx);
}

/**
 * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock
 *
 * Converts the preceding srcu_read_unlock into a two-way memory barrier.
 *
 * Call this after srcu_read_unlock, to guarantee that all memory operations
 * that occur after smp_mb__after_srcu_read_unlock will appear to happen after
 * the preceding srcu_read_unlock.
 *
 * The body is intentionally empty; see the comment inside.
 */
static inline void smp_mb__after_srcu_read_unlock(void)
{
        /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
}

/*
 * Lock guard named "srcu" for struct srcu_struct: the lock expression stores
 * the return value of srcu_read_lock() in the guard's idx member, and the
 * unlock expression passes that idx back to srcu_read_unlock().
 */
DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
                    _T->idx = srcu_read_lock(_T->lock),
                    srcu_read_unlock(_T->lock, _T->idx),
                    int idx)

#endif



















































































































































































































































































    3 
    3 




































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2013 Politecnico di Torino, Italy
 *                    TORSEC group -- https://security.polito.it
 *
 * Author: Roberto Sassu <roberto.sassu@polito.it>
 *
 * File: ima_template.c
 *      Helpers to manage template descriptors.
 */

#include <linux/rculist.h>
#include "ima.h"
#include "ima_template_lib.h"

/*
 * Indices of the per-entry header fields parsed by
 * ima_restore_measurement_list(); HDR__LAST is the number of fields and
 * sizes the header array and its bitmap there.
 */
enum header_fields { HDR_PCR, HDR_DIGEST, HDR_TEMPLATE_NAME,
                     HDR_TEMPLATE_DATA, HDR__LAST };

/*
 * Built-in template descriptors.  Each entry pairs a template name with its
 * format string, a '|'-separated list of field identifiers.  The last entry
 * must stay last: ima_template_fmt_setup() stores a custom format string in
 * the final array slot.
 */
static struct ima_template_desc builtin_templates[] = {
        {.name = IMA_TEMPLATE_IMA_NAME, .fmt = IMA_TEMPLATE_IMA_FMT},
        {.name = "ima-ng", .fmt = "d-ng|n-ng"},
        {.name = "ima-sig", .fmt = "d-ng|n-ng|sig"},
        {.name = "ima-ngv2", .fmt = "d-ngv2|n-ng"},
        {.name = "ima-sigv2", .fmt = "d-ngv2|n-ng|sig"},
        {.name = "ima-buf", .fmt = "d-ng|n-ng|buf"},
        {.name = "ima-modsig", .fmt = "d-ng|n-ng|sig|d-modsig|modsig"},
        {.name = "evm-sig",
         .fmt = "d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode"},
        {.name = "", .fmt = ""},        /* placeholder for a custom format */
};

/* List of known template descriptors; searched by lookup_template_desc(). */
static LIST_HEAD(defined_templates);
/* Held while adding descriptors to defined_templates. */
static DEFINE_SPINLOCK(template_list);
/* Set once a boot parameter has selected a template; later ones are ignored. */
static int template_setup_done;

/*
 * Every field identifier that may appear in a template format string,
 * together with its init and show callbacks.  lookup_template_field()
 * searches this table by field_id.
 */
static const struct ima_template_field supported_fields[] = {
        {.field_id = "d", .field_init = ima_eventdigest_init,
         .field_show = ima_show_template_digest},
        {.field_id = "n", .field_init = ima_eventname_init,
         .field_show = ima_show_template_string},
        {.field_id = "d-ng", .field_init = ima_eventdigest_ng_init,
         .field_show = ima_show_template_digest_ng},
        {.field_id = "d-ngv2", .field_init = ima_eventdigest_ngv2_init,
         .field_show = ima_show_template_digest_ngv2},
        {.field_id = "n-ng", .field_init = ima_eventname_ng_init,
         .field_show = ima_show_template_string},
        {.field_id = "sig", .field_init = ima_eventsig_init,
         .field_show = ima_show_template_sig},
        {.field_id = "buf", .field_init = ima_eventbuf_init,
         .field_show = ima_show_template_buf},
        {.field_id = "d-modsig", .field_init = ima_eventdigest_modsig_init,
         .field_show = ima_show_template_digest_ng},
        {.field_id = "modsig", .field_init = ima_eventmodsig_init,
         .field_show = ima_show_template_sig},
        {.field_id = "evmsig", .field_init = ima_eventevmsig_init,
         .field_show = ima_show_template_sig},
        {.field_id = "iuid", .field_init = ima_eventinodeuid_init,
         .field_show = ima_show_template_uint},
        {.field_id = "igid", .field_init = ima_eventinodegid_init,
         .field_show = ima_show_template_uint},
        {.field_id = "imode", .field_init = ima_eventinodemode_init,
         .field_show = ima_show_template_uint},
        {.field_id = "xattrnames",
         .field_init = ima_eventinodexattrnames_init,
         .field_show = ima_show_template_string},
        {.field_id = "xattrlengths",
         .field_init = ima_eventinodexattrlengths_init,
         .field_show = ima_show_template_sig},
        {.field_id = "xattrvalues",
         .field_init = ima_eventinodexattrvalues_init,
         .field_show = ima_show_template_sig},
};

/*
 * Used when restoring measurements carried over from a kexec. 'd' and 'n' don't
 * need to be accounted for since they shouldn't be defined in the same template
 * description as 'd-ng' and 'n-ng' respectively.
 *
 * The value is the sizeof() of a string literal, so it counts the trailing
 * NUL as well; ima_restore_measurement_list() uses it both as the size of its
 * name buffer and as the exclusive upper bound on a restored name length.
 */
#define MAX_TEMPLATE_NAME_LEN \
        sizeof("d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode")

static struct ima_template_desc *ima_template;
static struct ima_template_desc *ima_buf_template;

/**
 * ima_template_has_modsig - Check whether template has modsig-related fields.
 * @ima_template: IMA template to check.
 *
 * Walks the template's fields and reports whether any of them is "modsig"
 * or "d-modsig", i.e. refers to a file's appended signature.
 *
 * Return: true if such a field is present, false otherwise.
 */
bool ima_template_has_modsig(const struct ima_template_desc *ima_template)
{
        int idx = 0;

        while (idx < ima_template->num_fields) {
                const char *id = ima_template->fields[idx]->field_id;

                if (strcmp(id, "modsig") == 0 || strcmp(id, "d-modsig") == 0)
                        return true;
                idx++;
        }

        return false;
}

/*
 * Handle the "ima_template=" boot parameter: select the template descriptor
 * whose name or format matches @str, unless a template has already been
 * chosen on the command line.  Unknown names, and the 'ima' template combined
 * with a hash algorithm other than SHA1 or MD5, are logged and ignored.
 * Always returns 1.
 */
static int __init ima_template_setup(char *str)
{
        struct ima_template_desc *desc;
        int name_len = strlen(str);
        bool wants_ima_template;

        if (template_setup_done)
                return 1;

        if (!ima_template)
                ima_init_template_list();

        /* The requested template must be one of the defined templates. */
        desc = lookup_template_desc(str);
        if (desc == NULL) {
                pr_err("template %s not found, using %s\n",
                       str, CONFIG_IMA_DEFAULT_TEMPLATE);
                return 1;
        }

        /* The 'ima' template is only accepted with SHA1 or MD5. */
        wants_ima_template = name_len == 3 &&
                             !strcmp(str, IMA_TEMPLATE_IMA_NAME);
        if (wants_ima_template &&
            !(ima_hash_algo == HASH_ALGO_SHA1 ||
              ima_hash_algo == HASH_ALGO_MD5)) {
                pr_err("template does not support hash alg\n");
                return 1;
        }

        ima_template = desc;
        template_setup_done = 1;
        return 1;
}
__setup("ima_template=", ima_template_setup);

/*
 * Handle the "ima_template_fmt=" boot parameter: validate the custom format
 * string @str and, if valid, store it in the last (placeholder) slot of
 * builtin_templates and make that slot the current template.  Ignored when
 * a template was already chosen on the command line.  Always returns 1.
 */
static int __init ima_template_fmt_setup(char *str)
{
        struct ima_template_desc *custom =
                &builtin_templates[ARRAY_SIZE(builtin_templates) - 1];

        if (template_setup_done)
                return 1;

        /* Validation only: no field array is requested here. */
        if (template_desc_init_fields(str, NULL, NULL) < 0) {
                pr_err("format string '%s' not valid, using template %s\n",
                       str, CONFIG_IMA_DEFAULT_TEMPLATE);
                return 1;
        }

        custom->fmt = str;
        ima_template = custom;
        template_setup_done = 1;

        return 1;
}
__setup("ima_template_fmt=", ima_template_fmt_setup);

/*
 * Find the template descriptor whose name or format string equals @name.
 * The defined_templates list is walked under rcu_read_lock().
 *
 * Returns the matching descriptor, or NULL if none matches.
 */
struct ima_template_desc *lookup_template_desc(const char *name)
{
        struct ima_template_desc *cursor;
        struct ima_template_desc *match = NULL;

        rcu_read_lock();
        list_for_each_entry_rcu(cursor, &defined_templates, list) {
                if (!strcmp(cursor->name, name) ||
                    !strcmp(cursor->fmt, name)) {
                        match = cursor;
                        break;
                }
        }
        rcu_read_unlock();

        return match;
}

static const struct ima_template_field *
lookup_template_field(const char *field_id)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(supported_fields); i++)
                if (strncmp(supported_fields[i].field_id, field_id,
                            IMA_TEMPLATE_FIELD_ID_MAX_LEN) == 0)
                        return &supported_fields[i];
        return NULL;
}

/*
 * Count the fields in a template format string: one more than the number
 * of '|' separators it contains.  An empty string therefore counts as one
 * field.
 */
static int template_fmt_size(const char *template_fmt)
{
        const char *pos;
        int num_fields = 1;

        for (pos = template_fmt; *pos != '\0'; pos++) {
                if (*pos == '|')
                        num_fields++;
        }

        return num_fields;
}

/*
 * Parse a '|'-separated template format string and resolve every field
 * identifier against supported_fields.
 *
 * @template_fmt: format string to parse.
 * @fields:       if non-NULL (together with @num_fields), receives a newly
 *                allocated array of the resolved field pointers.
 * @num_fields:   if it already holds a positive count the descriptor is
 *                treated as initialized and nothing is done; otherwise it
 *                receives the number of fields.
 *
 * With @fields or @num_fields NULL the string is only validated.
 *
 * Returns 0 on success, -EINVAL for too many fields or a field identifier
 * that is empty or too long, -ENOENT for an unknown field, -ENOMEM if the
 * array cannot be allocated.
 */
int template_desc_init_fields(const char *template_fmt,
                              const struct ima_template_field ***fields,
                              int *num_fields)
{
        const struct ima_template_field *resolved[IMA_TEMPLATE_NUM_FIELDS_MAX];
        const char *cursor = template_fmt;
        int expected;
        int count = 0;

        if (num_fields && *num_fields > 0) /* already initialized? */
                return 0;

        expected = template_fmt_size(template_fmt);
        if (expected > IMA_TEMPLATE_NUM_FIELDS_MAX) {
                pr_err("format string '%s' contains too many fields\n",
                       template_fmt);
                return -EINVAL;
        }

        while (count < expected) {
                char id[IMA_TEMPLATE_FIELD_ID_MAX_LEN + 1];
                int id_len = strchrnul(cursor, '|') - cursor;

                if (id_len == 0 || id_len > IMA_TEMPLATE_FIELD_ID_MAX_LEN) {
                        pr_err("Invalid field with length %d\n", id_len);
                        return -EINVAL;
                }

                /* Copy out one identifier so it can be NUL-terminated. */
                memcpy(id, cursor, id_len);
                id[id_len] = '\0';

                resolved[count] = lookup_template_field(id);
                if (resolved[count] == NULL) {
                        pr_err("field '%s' not found\n", id);
                        return -ENOENT;
                }

                /* Step over the identifier and the separator after it. */
                cursor += id_len + 1;
                count++;
        }

        /* Validation-only callers stop here. */
        if (!fields || !num_fields)
                return 0;

        *fields = kmalloc_array(count, sizeof(**fields), GFP_KERNEL);
        if (!*fields)
                return -ENOMEM;

        memcpy(*fields, resolved, count * sizeof(**fields));
        *num_fields = count;

        return 0;
}

void ima_init_template_list(void)
{
        int i;

        if (!list_empty(&defined_templates))
                return;

        spin_lock(&template_list);
        for (i = 0; i < ARRAY_SIZE(builtin_templates); i++) {
                list_add_tail_rcu(&builtin_templates[i].list,
                                  &defined_templates);
        }
        spin_unlock(&template_list);
}

/*
 * Return the current template descriptor.  If none has been selected yet,
 * make sure the template list is populated and select the descriptor that
 * matches CONFIG_IMA_DEFAULT_TEMPLATE.
 */
struct ima_template_desc *ima_template_desc_current(void)
{
        if (ima_template)
                return ima_template;

        ima_init_template_list();
        ima_template = lookup_template_desc(CONFIG_IMA_DEFAULT_TEMPLATE);

        return ima_template;
}

/*
 * Return the "ima-buf" template descriptor, looking it up (and populating
 * the template list if needed) on first use.  The result of the lookup is
 * cached in ima_buf_template.
 */
struct ima_template_desc *ima_template_desc_buf(void)
{
        if (ima_buf_template)
                return ima_buf_template;

        ima_init_template_list();
        ima_buf_template = lookup_template_desc("ima-buf");

        return ima_buf_template;
}

/*
 * Initialize the field arrays of the current template and of the "ima-buf"
 * template.
 *
 * Returns the result of the last template_desc_init_fields() call made:
 * negative on failure.  Returns -EINVAL if the "ima-buf" template cannot be
 * found.
 */
int __init ima_init_template(void)
{
        struct ima_template_desc *desc;
        int rc;

        desc = ima_template_desc_current();
        rc = template_desc_init_fields(desc->fmt, &desc->fields,
                                       &desc->num_fields);
        if (rc < 0) {
                /* Custom formats have an empty name: print the format. */
                pr_err("template %s init failed, result: %d\n",
                       desc->name[0] != '\0' ? desc->name : desc->fmt, rc);
                return rc;
        }

        desc = ima_template_desc_buf();
        if (desc == NULL) {
                pr_err("Failed to get ima-buf template\n");
                return -EINVAL;
        }

        rc = template_desc_init_fields(desc->fmt, &desc->fields,
                                       &desc->num_fields);
        if (rc < 0) {
                pr_err("template %s init failed, result: %d\n",
                       desc->name[0] != '\0' ? desc->name : desc->fmt, rc);
        }

        return rc;
}

static struct ima_template_desc *restore_template_fmt(char *template_name)
{
        struct ima_template_desc *template_desc = NULL;
        int ret;

        ret = template_desc_init_fields(template_name, NULL, NULL);
        if (ret < 0) {
                pr_err("attempting to initialize the template \"%s\" failed\n",
                        template_name);
                goto out;
        }

        template_desc = kzalloc(sizeof(*template_desc), GFP_KERNEL);
        if (!template_desc)
                goto out;

        template_desc->name = "";
        template_desc->fmt = kstrdup(template_name, GFP_KERNEL);
        if (!template_desc->fmt) {
                kfree(template_desc);
                template_desc = NULL;
                goto out;
        }

        spin_lock(&template_list);
        list_add_tail_rcu(&template_desc->list, &defined_templates);
        spin_unlock(&template_list);
out:
        return template_desc;
}

static int ima_restore_template_data(struct ima_template_desc *template_desc,
                                     void *template_data,
                                     int template_data_size,
                                     struct ima_template_entry **entry)
{
        struct tpm_digest *digests;
        int ret = 0;
        int i;

        *entry = kzalloc(struct_size(*entry, template_data,
                                     template_desc->num_fields), GFP_NOFS);
        if (!*entry)
                return -ENOMEM;

        digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots,
                          sizeof(*digests), GFP_NOFS);
        if (!digests) {
                kfree(*entry);
                return -ENOMEM;
        }

        (*entry)->digests = digests;

        ret = ima_parse_buf(template_data, template_data + template_data_size,
                            NULL, template_desc->num_fields,
                            (*entry)->template_data, NULL, NULL,
                            ENFORCE_FIELDS | ENFORCE_BUFEND, "template data");
        if (ret < 0) {
                kfree((*entry)->digests);
                kfree(*entry);
                return ret;
        }

        (*entry)->template_desc = template_desc;
        for (i = 0; i < template_desc->num_fields; i++) {
                struct ima_field_data *field_data = &(*entry)->template_data[i];
                u8 *data = field_data->data;

                (*entry)->template_data[i].data =
                        kzalloc(field_data->len + 1, GFP_KERNEL);
                if (!(*entry)->template_data[i].data) {
                        ret = -ENOMEM;
                        break;
                }
                memcpy((*entry)->template_data[i].data, data, field_data->len);
                (*entry)->template_data_len += sizeof(field_data->len);
                (*entry)->template_data_len += field_data->len;
        }

        if (ret < 0) {
                ima_free_template_entry(*entry);
                *entry = NULL;
        }

        return ret;
}

/**
 * ima_restore_measurement_list - restore a serialized binary measurement list
 * @size: number of bytes available at @buf.
 * @buf: buffer holding a struct ima_kexec_hdr followed by the entries.
 *
 * Restore the serialized binary measurement list without extending PCRs.
 * Each entry consists of pcr, digest, template name and template data; for
 * every entry the template descriptor is looked up (or created from the
 * format string), the template data is rebuilt into an entry, and the entry
 * is passed to ima_restore_measurement_entry().
 *
 * Return: 0 if @buf is NULL or too small to hold the header, a negative
 * errno on the first failure, otherwise the result of the last step.
 */
int ima_restore_measurement_list(loff_t size, void *buf)
{
        char template_name[MAX_TEMPLATE_NAME_LEN];
        unsigned char zero[TPM_DIGEST_SIZE] = { 0 };

        struct ima_kexec_hdr *khdr = buf;
        struct ima_field_data hdr[HDR__LAST] = {
                [HDR_PCR] = {.len = sizeof(u32)},
                [HDR_DIGEST] = {.len = TPM_DIGEST_SIZE},
        };

        void *bufp = buf + sizeof(*khdr);
        void *bufendp;
        struct ima_template_entry *entry;
        struct ima_template_desc *template_desc;
        DECLARE_BITMAP(hdr_mask, HDR__LAST);
        unsigned long count = 0;
        int ret = 0;

        if (!buf || size < sizeof(*khdr))
                return 0;

        /* In canonical format the header fields are little-endian. */
        if (ima_canonical_fmt) {
                khdr->version = le16_to_cpu((__force __le16)khdr->version);
                khdr->count = le64_to_cpu((__force __le64)khdr->count);
                khdr->buffer_size = le64_to_cpu((__force __le64)khdr->buffer_size);
        }

        if (khdr->version != 1) {
                pr_err("attempting to restore a incompatible measurement list\n");
                return -EINVAL;
        }

        if (khdr->count > ULONG_MAX - 1) {
                pr_err("attempting to restore too many measurements\n");
                return -EINVAL;
        }

        /* PCR and digest have fixed lengths preset in hdr[] above. */
        bitmap_zero(hdr_mask, HDR__LAST);
        bitmap_set(hdr_mask, HDR_PCR, 1);
        bitmap_set(hdr_mask, HDR_DIGEST, 1);

        /*
         * ima kexec buffer prefix: version, buffer size, count
         * v1 format: pcr, digest, template-name-len, template-name,
         *              template-data-size, template-data
         */
        bufendp = buf + khdr->buffer_size;
        while ((bufp < bufendp) && (count++ < khdr->count)) {
                int enforce_mask = ENFORCE_FIELDS;

                /* The last entry must end exactly at the end of the buffer. */
                enforce_mask |= (count == khdr->count) ? ENFORCE_BUFEND : 0;
                ret = ima_parse_buf(bufp, bufendp, &bufp, HDR__LAST, hdr, NULL,
                                    hdr_mask, enforce_mask, "entry header");
                if (ret < 0)
                        break;

                if (hdr[HDR_TEMPLATE_NAME].len >= MAX_TEMPLATE_NAME_LEN) {
                        pr_err("attempting to restore a template name that is too long\n");
                        ret = -EINVAL;
                        break;
                }

                /* template name is not null terminated */
                memcpy(template_name, hdr[HDR_TEMPLATE_NAME].data,
                       hdr[HDR_TEMPLATE_NAME].len);
                template_name[hdr[HDR_TEMPLATE_NAME].len] = 0;

                if (strcmp(template_name, "ima") == 0) {
                        pr_err("attempting to restore an unsupported template \"%s\" failed\n",
                               template_name);
                        ret = -EINVAL;
                        break;
                }

                template_desc = lookup_template_desc(template_name);
                if (!template_desc) {
                        template_desc = restore_template_fmt(template_name);
                        if (!template_desc) {
                                /*
                                 * ret is still non-negative here; report
                                 * the failure instead of returning success.
                                 */
                                ret = -EINVAL;
                                break;
                        }
                }

                /*
                 * Only the running system's template format is initialized
                 * on boot.  As needed, initialize the other template formats.
                 */
                ret = template_desc_init_fields(template_desc->fmt,
                                                &(template_desc->fields),
                                                &(template_desc->num_fields));
                if (ret < 0) {
                        pr_err("attempting to restore the template fmt \"%s\" failed\n",
                               template_desc->fmt);
                        ret = -EINVAL;
                        break;
                }

                ret = ima_restore_template_data(template_desc,
                                                hdr[HDR_TEMPLATE_DATA].data,
                                                hdr[HDR_TEMPLATE_DATA].len,
                                                &entry);
                if (ret < 0)
                        break;

                /* The digest is recalculated only when the stored one is non-zero. */
                if (memcmp(hdr[HDR_DIGEST].data, zero, sizeof(zero))) {
                        ret = ima_calc_field_array_hash(
                                                &entry->template_data[0],
                                                entry);
                        if (ret < 0) {
                                pr_err("cannot calculate template digest\n");
                                /* The entry was never queued: release it. */
                                ima_free_template_entry(entry);
                                ret = -EINVAL;
                                break;
                        }
                }

                entry->pcr = !ima_canonical_fmt ? *(u32 *)(hdr[HDR_PCR].data) :
                             le32_to_cpu(*(__le32 *)(hdr[HDR_PCR].data));
                ret = ima_restore_measurement_entry(entry);
                /*
                 * NOTE(review): on failure the entry is not freed here;
                 * confirm whether ima_restore_measurement_entry() keeps
                 * ownership of it in that case.
                 */
                if (ret < 0)
                        break;

        }
        return ret;
}


























































    4 







    4 

    4 

    4 







































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2019 Facebook
 * Copyright 2020 Google LLC.
 */

#include <linux/rculist.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/bpf.h>
#include <linux/bpf_local_storage.h>
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>

/*
 * Storage cache instance for inode storage maps; passed to
 * bpf_local_storage_map_alloc() and bpf_local_storage_map_free() below.
 */
DEFINE_BPF_STORAGE_CACHE(inode_cache);

/*
 * Return the address of the local-storage pointer kept in the inode's BPF
 * blob, or NULL when bpf_inode() yields no blob for @owner.
 * @owner is a struct inode passed as void *.
 */
static struct bpf_local_storage __rcu **
inode_storage_ptr(void *owner)
{
        struct bpf_storage_blob *blob = bpf_inode((struct inode *)owner);

        return blob ? &blob->storage : NULL;
}

/*
 * Look up the storage data that @map holds for @inode.
 *
 * Returns NULL when the inode has no BPF blob or no local storage attached;
 * otherwise returns whatever bpf_local_storage_lookup() finds, passing
 * @cacheit_lockit through unchanged.
 */
static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
                                                           struct bpf_map *map,
                                                           bool cacheit_lockit)
{
        struct bpf_storage_blob *blob = bpf_inode(inode);
        struct bpf_local_storage *storage;

        if (blob == NULL)
                return NULL;

        storage = rcu_dereference_check(blob->storage, bpf_rcu_lock_held());
        if (storage == NULL)
                return NULL;

        return bpf_local_storage_lookup(storage,
                                        (struct bpf_local_storage_map *)map,
                                        cacheit_lockit);
}

void bpf_inode_storage_free(struct inode *inode)
{
        struct bpf_local_storage *local_storage;
        struct bpf_storage_blob *bsb;

        bsb = bpf_inode(inode);
        if (!bsb)
                return;

        rcu_read_lock();

        local_storage = rcu_dereference(bsb->storage);
        if (!local_storage) {
                rcu_read_unlock();
                return;
        }

        bpf_local_storage_destroy(local_storage);
        rcu_read_unlock();
}

/*
 * map_lookup_elem callback: @key points to a file descriptor number.
 *
 * Returns the data stored in @map for the inode behind that descriptor,
 * NULL if there is none, or ERR_PTR(-EBADF) if the descriptor is not valid.
 */
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
{
        int fd = *(int *)key;
        struct fd f = fdget_raw(fd);
        struct bpf_local_storage_data *sdata;

        if (!f.file)
                return ERR_PTR(-EBADF);

        sdata = inode_storage_lookup(file_inode(f.file), map, true);
        fdput(f);

        if (sdata == NULL)
                return NULL;

        return sdata->data;
}

/*
 * map_update_elem callback: @key points to a file descriptor number.
 *
 * Stores @value in @map for the inode behind that descriptor using
 * bpf_local_storage_update() with GFP_ATOMIC.  Returns -EBADF when the
 * descriptor is invalid or the inode has no storage pointer, otherwise 0 or
 * the error carried by the pointer that bpf_local_storage_update() returns.
 */
static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
                                             void *value, u64 map_flags)
{
        struct fd f = fdget_raw(*(int *)key);
        struct bpf_local_storage_data *sdata;
        struct inode *inode;
        long err;

        if (!f.file)
                return -EBADF;

        inode = file_inode(f.file);
        if (inode_storage_ptr(inode)) {
                sdata = bpf_local_storage_update(inode,
                                                 (struct bpf_local_storage_map *)map,
                                                 value, map_flags, GFP_ATOMIC);
                err = PTR_ERR_OR_ZERO(sdata);
        } else {
                err = -EBADF;
        }

        fdput(f);
        return err;
}

/*
 * Remove the element that @map stores for @inode.
 *
 * Returns 0 after unlinking the element, or -ENOENT if there is none.
 */
static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
{
        struct bpf_local_storage_data *found;

        found = inode_storage_lookup(inode, map, false);
        if (found) {
                bpf_selem_unlink(SELEM(found), false);
                return 0;
        }

        return -ENOENT;
}

/*
 * map_delete_elem callback: @key points to a file descriptor number.
 *
 * Returns -EBADF if the descriptor is invalid, otherwise the result of
 * inode_storage_delete() for the inode behind it.
 */
static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
{
        int fd = *(int *)key;
        struct fd f = fdget_raw(fd);
        int rc;

        if (!f.file)
                return -EBADF;

        rc = inode_storage_delete(file_inode(f.file), map);
        fdput(f);

        return rc;
}

/*
 * Helper returning the storage data that @map holds for @inode, optionally
 * creating it from @value when BPF_LOCAL_STORAGE_GET_F_CREATE is set in
 * @flags.  Returns NULL (as unsigned long) for unsupported flags, a NULL
 * inode, an inode without a storage pointer, a missing element without the
 * create flag, or a failed creation.
 *
 * *gfp_flags* is a hidden argument provided by the verifier
 */
BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
           void *, value, u64, flags, gfp_t, gfp_flags)
{
        struct bpf_local_storage_data *sdata;

        WARN_ON_ONCE(!bpf_rcu_lock_held());
        if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
                return (unsigned long)NULL;

        /* inode_storage_lookup() also returns NULL when the storage pointer
         * is missing, and bpf_local_storage_update() expects the owner to
         * have a valid one, so check for it explicitly first.
         */
        if (!inode || !inode_storage_ptr(inode))
                return (unsigned long)NULL;

        sdata = inode_storage_lookup(inode, map, true);
        if (sdata)
                return (unsigned long)sdata->data;

        if (!(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
                return (unsigned long)NULL;

        /* This helper must only called from where the inode is guaranteed
         * to have a refcount and cannot be freed.
         */
        sdata = bpf_local_storage_update(inode,
                                         (struct bpf_local_storage_map *)map,
                                         value, BPF_NOEXIST, gfp_flags);
        if (IS_ERR(sdata))
                return (unsigned long)NULL;

        return (unsigned long)sdata->data;
}

/*
 * Helper deleting the element that @map stores for @inode.  Returns -EINVAL
 * for a NULL inode, otherwise the result of inode_storage_delete().
 */
BPF_CALL_2(bpf_inode_storage_delete,
           struct bpf_map *, map, struct inode *, inode)
{
        WARN_ON_ONCE(!bpf_rcu_lock_held());

        /* This helper must only called from where the inode is guaranteed
         * to have a refcount and cannot be freed.
         */
        return inode ? inode_storage_delete(inode, map) : -EINVAL;
}

/*
 * map_get_next_key callback: key iteration is not supported for this map
 * type, so every call fails with -ENOTSUPP and all arguments are ignored.
 */
static int notsupp_get_next_key(struct bpf_map *map, void *key,
                                void *next_key)
{
        return -ENOTSUPP;
}

/*
 * map_alloc callback: allocate the map through the generic local-storage
 * allocator, using inode_cache and passing false as its last argument.
 */
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
        return bpf_local_storage_map_alloc(attr, &inode_cache, false);
}

/*
 * map_free callback: release the map through the generic local-storage
 * code, using inode_cache and passing NULL as its last argument.
 */
static void inode_storage_map_free(struct bpf_map *map)
{
        bpf_local_storage_map_free(map, &inode_cache, NULL);
}

/*
 * Map operations for inode storage maps.  Lookup, update and delete go
 * through the fd-keyed callbacks defined above; key iteration is not
 * supported; the remaining callbacks are the shared bpf_local_storage ones.
 */
const struct bpf_map_ops inode_storage_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc_check = bpf_local_storage_map_alloc_check,
        .map_alloc = inode_storage_map_alloc,
        .map_free = inode_storage_map_free,
        .map_get_next_key = notsupp_get_next_key,
        .map_lookup_elem = bpf_fd_inode_storage_lookup_elem,
        .map_update_elem = bpf_fd_inode_storage_update_elem,
        .map_delete_elem = bpf_fd_inode_storage_delete_elem,
        .map_check_btf = bpf_local_storage_map_check_btf,
        .map_mem_usage = bpf_local_storage_map_mem_usage,
        .map_btf_id = &bpf_local_storage_map_btf_id[0],
        .map_owner_storage_ptr = inode_storage_ptr,
};

/* BTF ID of struct inode, referenced as arg2_btf_id by both helper protos below. */
BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode)

/*
 * Prototype descriptor for bpf_inode_storage_get():
 *   arg1 - constant map pointer
 *   arg2 - BTF pointer to struct inode, may be NULL
 *   arg3 - map value pointer, may be NULL
 *   arg4 - unchecked scalar (ARG_ANYTHING)
 * Returns a map value pointer or NULL. Not restricted to GPL programs.
 */
const struct bpf_func_proto bpf_inode_storage_get_proto = {
        .func                = bpf_inode_storage_get,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_MAP_VALUE_OR_NULL,
        .arg1_type        = ARG_CONST_MAP_PTR,
        .arg2_type        = ARG_PTR_TO_BTF_ID_OR_NULL,
        .arg2_btf_id        = &bpf_inode_storage_btf_ids[0],
        .arg3_type        = ARG_PTR_TO_MAP_VALUE_OR_NULL,
        .arg4_type        = ARG_ANYTHING,
};

/*
 * Prototype descriptor for bpf_inode_storage_delete():
 *   arg1 - constant map pointer
 *   arg2 - BTF pointer to struct inode, may be NULL
 * Returns an integer. Not restricted to GPL programs.
 */
const struct bpf_func_proto bpf_inode_storage_delete_proto = {
        .func                = bpf_inode_storage_delete,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_CONST_MAP_PTR,
        .arg2_type        = ARG_PTR_TO_BTF_ID_OR_NULL,
        .arg2_btf_id        = &bpf_inode_storage_btf_ids[0],
};













































    1 












































































































    1 











    1 




    1 








    1 






















































































    1 
    1 
    1 










































    1 







    1 























    1 






    1 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
// SPDX-License-Identifier: GPL-2.0
/* Generic part */

/*
 * One step of a block-pointer lookup chain.
 *   p   - address of the block-number slot that was read
 *   key - value found in *p at the time it was recorded (see add_chain())
 *   bh  - buffer head recorded together with the slot; get_branch() stores
 *         NULL here for the first chain entry
 */
typedef struct {
        block_t        *p;
        block_t        key;
        struct buffer_head *bh;
} Indirect;

/*
 * Guards block-pointer slots: get_branch() re-validates chains under the read
 * side; splice_branch() and find_shared() modify slots under the write side.
 */
static DEFINE_RWLOCK(pointers_lock);

/*
 * Fill chain entry @p: remember slot address @v, the block number currently
 * stored in that slot, and the buffer head @bh that goes with it.
 */
static inline void add_chain(Indirect *p, struct buffer_head *bh, block_t *v)
{
        p->p = v;
        p->key = *v;
        p->bh = bh;
}

/*
 * Check that every entry in [@from, @to] still matches its slot, i.e. the
 * recorded key equals the value currently stored at the recorded address.
 * Returns 1 when all entries match (or the range is empty), 0 otherwise.
 */
static inline int verify_chain(Indirect *from, Indirect *to)
{
        Indirect *cur;

        for (cur = from; cur <= to; cur++) {
                if (cur->key != *cur->p)
                        return 0;
        }
        return 1;
}

/* Address one byte past the data of @bh (b_data + b_size), typed as a block_t pointer. */
static inline block_t *block_end(struct buffer_head *bh)
{
        char *first_byte = (char *)bh->b_data;

        return (block_t *)(first_byte + bh->b_size);
}

/*
 * Read the chain of block pointers leading towards a block.
 *
 * @inode:   inode whose block map is walked
 * @depth:   number of chain entries to fill
 * @offsets: slot index to use at each level; offsets[0] indexes i_data(inode)
 * @chain:   filled in place, one entry per level actually read
 * @err:     set to 0, -EIO or -EAGAIN (see below)
 *
 * Returns NULL when all @depth levels were read and every key was non-zero.
 * Otherwise returns the last chain entry that was filled:
 *   - *err == 0:       that entry's key is zero (nothing mapped there);
 *   - *err == -EIO:    sb_bread() of the block named by that entry failed;
 *   - *err == -EAGAIN: verify_chain() found that an already recorded slot no
 *                      longer holds its recorded key.
 * Buffers recorded in the chain entries are not released here.
 */
static inline Indirect *get_branch(struct inode *inode,
                                        int depth,
                                        int *offsets,
                                        Indirect chain[DEPTH],
                                        int *err)
{
        struct super_block *sb = inode->i_sb;
        Indirect *p = chain;
        struct buffer_head *bh;

        *err = 0;
        /* i_data is not going away, no lock needed */
        add_chain (chain, NULL, i_data(inode) + *offsets);
        if (!p->key)
                goto no_block;
        while (--depth) {
                bh = sb_bread(sb, block_to_cpu(p->key));
                if (!bh)
                        goto failure;
                /* Re-check the path read so far before trusting the new block. */
                read_lock(&pointers_lock);
                if (!verify_chain(chain, p))
                        goto changed;
                add_chain(++p, bh, (block_t *)bh->b_data + *++offsets);
                read_unlock(&pointers_lock);
                if (!p->key)
                        goto no_block;
        }
        return NULL;

changed:
        /* The buffer just read was not added to the chain; drop it here. */
        read_unlock(&pointers_lock);
        brelse(bh);
        *err = -EAGAIN;
        goto no_block;
failure:
        *err = -EIO;
no_block:
        return p;
}

/*
 * Allocate @num new blocks and link them into a not-yet-attached branch.
 *
 * @inode:   inode the blocks are allocated for
 * @num:     number of blocks to allocate
 * @offsets: offsets[n] is the slot index used inside block n-1 for block n
 * @branch:  branch[0].key receives the first block number; for n >= 1,
 *           branch[n] records the buffer of the parent block, the slot inside
 *           it and the new block number stored there
 *
 * Returns 0 when all @num blocks were allocated. On failure returns -ENOSPC
 * (minix_new_block() returned 0) or -ENOMEM (sb_getblk() failed), after
 * forgetting the buffers and freeing the blocks recorded so far.
 */
static int alloc_branch(struct inode *inode,
                             int num,
                             int *offsets,
                             Indirect *branch)
{
        int n = 0;
        int i;
        int parent = minix_new_block(inode);
        int err = -ENOSPC;

        branch[0].key = cpu_to_block(parent);
        if (parent) for (n = 1; n < num; n++) {
                struct buffer_head *bh;
                /* Allocate the next block */
                int nr = minix_new_block(inode);
                if (!nr)
                        break;
                branch[n].key = cpu_to_block(nr);
                bh = sb_getblk(inode->i_sb, parent);
                if (!bh) {
                        /* nr is not counted in the cleanup loops below (i < n). */
                        minix_free_block(inode, nr);
                        err = -ENOMEM;
                        break;
                }
                /* Zero the parent block and store the child pointer in it. */
                lock_buffer(bh);
                memset(bh->b_data, 0, bh->b_size);
                branch[n].bh = bh;
                branch[n].p = (block_t*) bh->b_data + offsets[n];
                *branch[n].p = branch[n].key;
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
                mark_buffer_dirty_inode(bh, inode);
                parent = nr;
        }
        if (n == num)
                return 0;

        /* Allocation failed, free what we already allocated */
        for (i = 1; i < n; i++)
                bforget(branch[i].bh);
        for (i = 0; i < n; i++)
                minix_free_block(inode, block_to_cpu(branch[i].key));
        return err;
}

/*
 * Attach a branch to the block map by storing where->key into *where->p.
 *
 * @inode: inode being modified
 * @chain: start of the lookup chain that led to @where
 * @where: first entry of the new branch; its slot must still be empty
 * @num:   number of entries in the new branch (used for cleanup on failure)
 *
 * Under the write side of pointers_lock the chain up to @where - 1 is
 * re-verified and the target slot is checked to still be zero. On success the
 * ctime is updated, where->bh (if any) and the inode are marked dirty, and 0
 * is returned. Otherwise the buffers where[1..num-1].bh are forgotten, the
 * blocks where[0..num-1].key are freed and -EAGAIN is returned.
 */
static inline int splice_branch(struct inode *inode,
                                     Indirect chain[DEPTH],
                                     Indirect *where,
                                     int num)
{
        int i;

        write_lock(&pointers_lock);

        /* Verify that place we are splicing to is still there and vacant */
        if (!verify_chain(chain, where-1) || *where->p)
                goto changed;

        *where->p = where->key;

        write_unlock(&pointers_lock);

        /* We are done with atomic stuff, now do the rest of housekeeping */

        inode_set_ctime_current(inode);

        /* had we spliced it onto indirect block? */
        if (where->bh)
                mark_buffer_dirty_inode(where->bh, inode);

        mark_inode_dirty(inode);
        return 0;

changed:
        write_unlock(&pointers_lock);
        for (i = 1; i < num; i++)
                bforget(where[i].bh);
        for (i = 0; i < num; i++)
                minix_free_block(inode, block_to_cpu(where[i].key));
        return -EAGAIN;
}

/*
 * Map file block @block of @inode into @bh, allocating it when @create is set.
 *
 * Returns -EIO when block_to_path() yields depth 0 or an indirect block could
 * not be read. For a plain lookup (@create == 0) of an unmapped block the
 * error value left by get_branch() is returned and @bh is not mapped. When
 * the chain changed underneath (-EAGAIN from get_branch() or splice_branch())
 * the buffers are released and the lookup restarts from "reread".
 * Newly allocated blocks are flagged with set_buffer_new().
 */
static int get_block(struct inode * inode, sector_t block,
                        struct buffer_head *bh, int create)
{
        int err = -EIO;
        int offsets[DEPTH];
        Indirect chain[DEPTH];
        Indirect *partial;
        int left;
        int depth = block_to_path(inode, block, offsets);

        if (depth == 0)
                goto out;

reread:
        partial = get_branch(inode, depth, offsets, chain, &err);

        /* Simplest case - block found, no allocation needed */
        if (!partial) {
got_it:
                map_bh(bh, inode->i_sb, block_to_cpu(chain[depth-1].key));
                /* Clean up and exit */
                partial = chain+depth-1; /* the whole chain */
                goto cleanup;
        }

        /* Next simple case - plain lookup or failed read of indirect block */
        if (!create || err == -EIO) {
cleanup:
                /* chain[0] has no buffer (bh is NULL there), so stop above it. */
                while (partial > chain) {
                        brelse(partial->bh);
                        partial--;
                }
out:
                return err;
        }

        /*
         * Indirect block might be removed by truncate while we were
         * reading it. Handling of that case (forget what we've got and
         * reread) is taken out of the main path.
         */
        if (err == -EAGAIN)
                goto changed;

        /* Number of levels still missing, counted from the first empty slot. */
        left = (chain + depth) - partial;
        err = alloc_branch(inode, left, offsets+(partial-chain), partial);
        if (err)
                goto cleanup;

        if (splice_branch(inode, chain, partial, left) < 0)
                goto changed;

        set_buffer_new(bh);
        goto got_it;

changed:
        while (partial > chain) {
                brelse(partial->bh);
                partial--;
        }
        goto reread;
}

/* Return 1 when every block number in [@p, @q) is zero (or the range is empty), else 0. */
static inline int all_zeroes(block_t *p, block_t *q)
{
        block_t *cur;

        for (cur = p; cur < q; cur++) {
                if (*cur)
                        return 0;
        }
        return 1;
}

/*
 * Locate the point in the lookup chain below which truncate() may free
 * whole subtrees.
 *
 * @inode:   inode being truncated
 * @depth:   path depth as returned by block_to_path()
 * @offsets: slot index per level
 * @chain:   scratch chain filled by get_branch()
 * @top:     set to 0 on entry; receives the block number detached from the
 *           map when a slot is cleared here
 *
 * Trailing zero offsets are dropped from the path first. After the chain is
 * read, and under the write side of pointers_lock, the chain is walked back
 * while all slots preceding the recorded one in that entry's buffer are zero.
 * If the resulting entry is the deepest one read and is not chain[0], its
 * slot pointer is stepped back by one; otherwise the slot's value is moved
 * to *@top and the slot is zeroed. Buffers of chain entries deeper than the
 * returned one are released.
 *
 * Returns the chain entry found. If the last slot was empty when read but is
 * non-zero now, returns early with *@top == 0 and nothing released.
 */
static Indirect *find_shared(struct inode *inode,
                                int depth,
                                int offsets[DEPTH],
                                Indirect chain[DEPTH],
                                block_t *top)
{
        Indirect *partial, *p;
        int k, err;

        *top = 0;
        for (k = depth; k > 1 && !offsets[k-1]; k--)
                ;
        partial = get_branch(inode, k, offsets, chain, &err);

        write_lock(&pointers_lock);
        /* NULL from get_branch() means the whole chain of k entries was read. */
        if (!partial)
                partial = chain + k-1;
        if (!partial->key && *partial->p) {
                write_unlock(&pointers_lock);
                goto no_top;
        }
        for (p=partial;p>chain && all_zeroes((block_t*)p->bh->b_data,p->p);p--)
                ;
        if (p == chain + k - 1 && p > chain) {
                p->p--;
        } else {
                *top = *p->p;
                *p->p = 0;
        }
        write_unlock(&pointers_lock);

        while(partial > p)
        {
                brelse(partial->bh);
                partial--;
        }
no_top:
        return partial;
}

/*
 * Release the blocks named by the slots in [@p, @q): each non-zero slot is
 * zeroed and its block number handed to minix_free_block().
 */
static inline void free_data(struct inode *inode, block_t *p, block_t *q)
{
        block_t *slot;

        for (slot = p; slot < q; slot++) {
                unsigned long blk = block_to_cpu(*slot);

                if (!blk)
                        continue;
                *slot = 0;
                minix_free_block(inode, blk);
        }
}

/*
 * Free the subtrees rooted at the slots in [@p, @q).
 *
 * @depth is the number of indirection levels below these slots. With
 * @depth == 0 the slots name data blocks and are handed to free_data().
 * Otherwise each non-zero slot is zeroed, the block it named is read, its
 * contents are freed recursively with @depth - 1, and then the block itself
 * is forgotten and freed. If the block cannot be read, the slot stays zeroed
 * but neither the block nor anything below it is freed.
 */
static void free_branches(struct inode *inode, block_t *p, block_t *q, int depth)
{
        struct buffer_head * bh;
        unsigned long nr;

        if (depth--) {
                for ( ; p < q ; p++) {
                        nr = block_to_cpu(*p);
                        if (!nr)
                                continue;
                        *p = 0;
                        bh = sb_bread(inode->i_sb, nr);
                        if (!bh)
                                continue;
                        free_branches(inode, (block_t*)bh->b_data,
                                      block_end(bh), depth);
                        bforget(bh);
                        minix_free_block(inode, nr);
                        mark_inode_dirty(inode);
                }
        } else
                free_data(inode, p, q);
}

/*
 * Free every block of @inode that lies beyond inode->i_size.
 *
 * Steps, in order:
 *   1. Call block_truncate_page() for the page containing i_size.
 *   2. Translate the first block past i_size (size rounded up to a block)
 *      into a path; a zero-length path means there is nothing to do.
 *   3. For a path of length 1, free the remaining direct slots. Otherwise use
 *      find_shared() to detach the partially kept branch and free the slots
 *      following the kept one at each level of it.
 *   4. Free the remaining indirect trees that lie wholly past the cut.
 *   5. Set mtime and ctime to the current time and mark the inode dirty.
 */
static inline void truncate (struct inode * inode)
{
        struct super_block *sb = inode->i_sb;
        block_t *idata = i_data(inode);
        int offsets[DEPTH];
        Indirect chain[DEPTH];
        Indirect *partial;
        block_t nr = 0;
        int n;
        int first_whole;
        long iblock;

        iblock = (inode->i_size + sb->s_blocksize -1) >> sb->s_blocksize_bits;
        block_truncate_page(inode->i_mapping, inode->i_size, get_block);

        n = block_to_path(inode, iblock, offsets);
        if (!n)
                return;

        if (n == 1) {
                free_data(inode, idata+offsets[0], idata + DIRECT);
                first_whole = 0;
                goto do_indirects;
        }

        /* Index (relative to DIRECT) of the first indirect tree to free whole. */
        first_whole = offsets[0] + 1 - DIRECT;
        partial = find_shared(inode, n, offsets, chain, &nr);
        if (nr) {
                /* find_shared() zeroed a slot: dirty whatever holds that slot. */
                if (partial == chain)
                        mark_inode_dirty(inode);
                else
                        mark_buffer_dirty_inode(partial->bh, inode);
                free_branches(inode, &nr, &nr+1, (chain+n-1) - partial);
        }
        /* Clear the ends of indirect blocks on the shared branch */
        while (partial > chain) {
                free_branches(inode, partial->p + 1, block_end(partial->bh),
                                (chain+n-1) - partial);
                mark_buffer_dirty_inode(partial->bh, inode);
                brelse (partial->bh);
                partial--;
        }
do_indirects:
        /* Kill the remaining (whole) subtrees */
        while (first_whole < DEPTH-1) {
                nr = idata[DIRECT+first_whole];
                if (nr) {
                        idata[DIRECT+first_whole] = 0;
                        mark_inode_dirty(inode);
                        free_branches(inode, &nr, &nr+1, first_whole+1);
                }
                first_whole++;
        }
        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        mark_inode_dirty(inode);
}

/*
 * Block count for a file of @size bytes on @sb.
 *
 * Starts from the size rounded up to whole blocks. Then, for up to DEPTH - 1
 * levels and while the current count exceeds the number of slots available
 * directly (DIRECT at the first level, 1 afterwards), adds the number of
 * pointer blocks needed to reference the excess.
 */
static inline unsigned nblocks(loff_t size, struct super_block *sb)
{
        int k = sb->s_blocksize_bits - 10;
        unsigned blocks, total, direct, level;

        blocks = (size + sb->s_blocksize - 1) >> (BLOCK_SIZE_BITS + k);
        total = blocks;
        direct = DIRECT;

        for (level = DEPTH - 1; level && blocks > direct; level--) {
                /* Excess entries, rounded up to whole pointer blocks. */
                blocks -= direct;
                blocks += sb->s_blocksize/sizeof(block_t) - 1;
                blocks /= sb->s_blocksize/sizeof(block_t);
                total += blocks;
                direct = 1;
        }

        return total;
}



































   11 















   12 




































































































    6 

























    6 



    5 











   10 



    1 
    6 
   13 





   12 
















   10 






   13 









   11 







   11 
   13 
   12 











    3 






    3 







    3 
    3 
    3 






    5 


    3 
    2 





















































































































































































   12 


























































    3 
    3 













































    1 



    1 


















    1 






































    1 











    1 









    1 






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2016 Facebook
 * Copyright (C) 2013-2014 Jens Axboe
 */

#include <linux/sched.h>
#include <linux/random.h>
#include <linux/sbitmap.h>
#include <linux/seq_file.h>

/*
 * Allocate the per-CPU allocation hints of @sb.
 *
 * Unless the bitmap is empty or round-robin, every possible CPU's hint is
 * seeded with a random value below sb->depth. Returns 0 on success or
 * -ENOMEM when the per-CPU allocation fails.
 */
static int init_alloc_hint(struct sbitmap *sb, gfp_t flags)
{
        unsigned int nr_bits = sb->depth;
        int cpu;

        sb->alloc_hint = alloc_percpu_gfp(unsigned int, flags);
        if (!sb->alloc_hint)
                return -ENOMEM;

        /* No random seeding for empty or round-robin bitmaps. */
        if (!nr_bits || sb->round_robin)
                return 0;

        for_each_possible_cpu(cpu)
                *per_cpu_ptr(sb->alloc_hint, cpu) = get_random_u32_below(nr_bits);

        return 0;
}

/*
 * Fetch this CPU's allocation hint for a bitmap of @depth bits.
 *
 * A hint that is not below @depth is replaced (and written back) by a random
 * value below @depth, or by 0 when @depth is 0. Returns the hint to use.
 */
static inline unsigned update_alloc_hint_before_get(struct sbitmap *sb,
                                                    unsigned int depth)
{
        unsigned int cached = this_cpu_read(*sb->alloc_hint);

        if (unlikely(cached >= depth)) {
                if (depth)
                        cached = get_random_u32_below(depth);
                else
                        cached = 0;
                this_cpu_write(*sb->alloc_hint, cached);
        }

        return cached;
}

/*
 * Update this CPU's allocation hint after an allocation attempt.
 *
 * @depth: bitmap depth used for the attempt
 * @hint:  hint the attempt started from
 * @nr:    bit that was obtained, or -1 when none was
 *
 * A failed attempt resets the hint to 0. Otherwise the hint moves to @nr + 1
 * (reset to 0 once it reaches @depth - 1), but only if the hinted bit itself
 * was obtained or the bitmap is round-robin.
 */
static inline void update_alloc_hint_after_get(struct sbitmap *sb,
                                               unsigned int depth,
                                               unsigned int hint,
                                               unsigned int nr)
{
        unsigned int next;

        if (nr == -1) {
                /* If the map is full, a hint won't do us much good. */
                this_cpu_write(*sb->alloc_hint, 0);
                return;
        }

        /* Only update the hint if we used it. */
        if (nr != hint && !unlikely(sb->round_robin))
                return;

        next = nr + 1;
        if (next >= depth - 1)
                next = 0;
        this_cpu_write(*sb->alloc_hint, next);
}

/*
 * See if we have deferred clears that we can batch move
 *
 * Returns false when map->cleared is observed as zero. Otherwise atomically
 * takes the whole cleared mask (leaving 0 behind), clears those bits in
 * map->word and returns true.
 */
static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
{
        unsigned long mask;

        if (!READ_ONCE(map->cleared))
                return false;

        /*
         * First get a stable cleared mask, setting the old mask to 0.
         */
        mask = xchg(&map->cleared, 0);

        /*
         * Now clear the masked bits in our free word
         */
        atomic_long_andnot(mask, (atomic_long_t *)&map->word);
        /* The cast above is only valid if both types have the same size. */
        BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(map->word));
        return true;
}

/*
 * Initialise @sb for @depth bits.
 *
 * @sb:          bitmap to initialise
 * @depth:       number of bits; 0 yields a bitmap without a word array
 * @shift:       log2 of the bits used per word; a negative value lets
 *               sbitmap_calculate_shift() choose
 * @flags:       allocation flags
 * @node:        memory node for the word array
 * @round_robin: stored in sb->round_robin
 * @alloc_hint:  whether to allocate per-CPU allocation hints
 *
 * Returns 0 on success, -EINVAL when the resulting bits per word exceed
 * BITS_PER_LONG, or -ENOMEM when an allocation fails.
 *
 * sb->alloc_hint is never left indeterminate or dangling on return: it is
 * NULL for an empty bitmap and after a failed word-array allocation, so a
 * later free_percpu() on it (e.g. from a teardown helper) is harmless.
 */
int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
                      gfp_t flags, int node, bool round_robin,
                      bool alloc_hint)
{
        unsigned int bits_per_word;

        if (shift < 0)
                shift = sbitmap_calculate_shift(depth);

        bits_per_word = 1U << shift;
        if (bits_per_word > BITS_PER_LONG)
                return -EINVAL;

        sb->shift = shift;
        sb->depth = depth;
        sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
        sb->round_robin = round_robin;

        if (depth == 0) {
                /* Nothing is allocated; make both pointers well defined. */
                sb->alloc_hint = NULL;
                sb->map = NULL;
                return 0;
        }

        if (alloc_hint) {
                if (init_alloc_hint(sb, flags))
                        return -ENOMEM;
        } else {
                sb->alloc_hint = NULL;
        }

        sb->map = kvzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node);
        if (!sb->map) {
                free_percpu(sb->alloc_hint);
                /* Do not leave a pointer to freed per-CPU memory behind. */
                sb->alloc_hint = NULL;
                return -ENOMEM;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(sbitmap_init_node);

/*
 * Change the depth of @sb to @depth.
 *
 * Deferred clears of every current word are folded in first; then the depth
 * and the derived word count are updated. No memory is (re)allocated here.
 */
void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
{
        unsigned int bits_per_word = 1U << sb->shift;
        unsigned int idx;

        for (idx = 0; idx < sb->map_nr; idx++) {
                struct sbitmap_word *cur = &sb->map[idx];

                sbitmap_deferred_clear(cur);
        }

        sb->depth = depth;
        sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
}
EXPORT_SYMBOL_GPL(sbitmap_resize);

/*
 * Claim one zero bit in @word.
 *
 * @word:  bitmap word to allocate from
 * @depth: number of usable bits in @word
 * @hint:  bit position at which the search starts
 * @wrap:  whether the search may restart from bit 0 after running off the
 *         end (ignored when @hint is 0)
 *
 * Returns the number of the bit that was set by this call, or -1 when no
 * zero bit could be found.
 */
static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
                              unsigned int hint, bool wrap)
{
        int nr;

        /* don't wrap if starting from 0 */
        wrap = wrap && hint;

        while (1) {
                nr = find_next_zero_bit(word, depth, hint);
                if (unlikely(nr >= depth)) {
                        /*
                         * We started with an offset, and we didn't reset the
                         * offset to 0 in a failure case, so start from 0 to
                         * exhaust the map.
                         */
                        if (hint && wrap) {
                                hint = 0;
                                continue;
                        }
                        return -1;
                }

                /* Bit was still clear when we set it: it is ours. */
                if (!test_and_set_bit_lock(nr, word))
                        break;

                /* Bit was already set by the time we tried; continue after it. */
                hint = nr + 1;
                if (hint >= depth - 1)
                        hint = 0;
        }

        return nr;
}

/*
 * Claim a bit in @map, retrying for as long as folding in deferred clears
 * makes progress. Returns the bit number within the word, or -1 when the
 * word is full and there was nothing left to fold in.
 */
static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
                                    unsigned int depth,
                                    unsigned int alloc_hint,
                                    bool wrap)
{
        for (;;) {
                int bit = __sbitmap_get_word(&map->word, depth,
                                             alloc_hint, wrap);

                /* Done on success, or when no deferred clears were pending. */
                if (bit != -1 || !sbitmap_deferred_clear(map))
                        return bit;
        }
}

/*
 * Claim a bit somewhere in @sb, visiting each word at most once.
 *
 * @depth:      per-word cap on usable bits (further limited by the word's
 *              own depth)
 * @index:      word to start with; the walk wraps around to word 0
 * @alloc_hint: bit to start from in the first word only
 * @wrap:       forwarded to the per-word search
 *
 * Returns the bit number within the whole bitmap, or -1 when none is free.
 */
static int sbitmap_find_bit(struct sbitmap *sb,
                            unsigned int depth,
                            unsigned int index,
                            unsigned int alloc_hint,
                            bool wrap)
{
        unsigned int tried;

        for (tried = 0; tried < sb->map_nr; tried++) {
                unsigned int word_depth = min_t(unsigned int,
                                                __map_depth(sb, index),
                                                depth);
                int bit = sbitmap_find_bit_in_word(&sb->map[index],
                                                   word_depth,
                                                   alloc_hint, wrap);

                if (bit != -1)
                        return bit + (index << sb->shift);

                /* Jump to next index. */
                alloc_hint = 0;
                index++;
                if (index >= sb->map_nr)
                        index = 0;
        }

        return -1;
}

/*
 * Claim a bit in @sb starting from the word that contains @alloc_hint.
 *
 * Round-robin bitmaps also start at the hinted bit inside that word and do
 * not wrap within a word; all others start each word at bit 0, because the
 * hint is only needed to select the word and searching the word twice would
 * be pointless.
 */
static int __sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint)
{
        unsigned int word_idx = SB_NR_TO_INDEX(sb, alloc_hint);
        unsigned int bit_hint;

        bit_hint = sb->round_robin ? SB_NR_TO_BIT(sb, alloc_hint) : 0;

        return sbitmap_find_bit(sb, UINT_MAX, word_idx, bit_hint,
                                !sb->round_robin);
}

/*
 * Allocate one bit from @sb using this CPU's allocation hint.
 *
 * Returns the allocated bit number, or -1 when no bit is free or (with a
 * one-time warning) when @sb has no allocation hints.
 */
int sbitmap_get(struct sbitmap *sb)
{
        unsigned int cur_depth, start;
        int bit;

        if (WARN_ON_ONCE(unlikely(!sb->alloc_hint)))
                return -1;

        cur_depth = READ_ONCE(sb->depth);
        start = update_alloc_hint_before_get(sb, cur_depth);

        bit = __sbitmap_get(sb, start);

        update_alloc_hint_after_get(sb, cur_depth, start, bit);
        return bit;
}
EXPORT_SYMBOL_GPL(sbitmap_get);

/*
 * Claim a bit in @sb while using at most @shallow_depth bits of each word.
 * The search starts at the word and bit selected by @alloc_hint and may wrap
 * inside a word.
 */
static int __sbitmap_get_shallow(struct sbitmap *sb,
                                 unsigned int alloc_hint,
                                 unsigned long shallow_depth)
{
        unsigned int word_idx = SB_NR_TO_INDEX(sb, alloc_hint);
        unsigned int bit_hint = SB_NR_TO_BIT(sb, alloc_hint);

        return sbitmap_find_bit(sb, shallow_depth, word_idx, bit_hint, true);
}

/*
 * Allocate one bit from @sb, limiting every word to @shallow_depth bits.
 *
 * Returns the allocated bit number, or -1 when no bit is available within
 * that limit or (with a one-time warning) when @sb has no allocation hints.
 */
int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
{
        unsigned int cur_depth, start;
        int bit;

        if (WARN_ON_ONCE(unlikely(!sb->alloc_hint)))
                return -1;

        cur_depth = READ_ONCE(sb->depth);
        start = update_alloc_hint_before_get(sb, cur_depth);

        bit = __sbitmap_get_shallow(sb, start, shallow_depth);

        update_alloc_hint_after_get(sb, cur_depth, start, bit);
        return bit;
}
EXPORT_SYMBOL_GPL(sbitmap_get_shallow);

bool sbitmap_any_bit_set(const struct sbitmap *sb)
{
        unsigned int i;

        for (i = 0; i < sb->map_nr; i++) {
                if (sb->map[i].word & ~sb->map[i].cleared)
                        return true;
        }
        return false;
}
EXPORT_SYMBOL_GPL(sbitmap_any_bit_set);

/*
 * Count bits over all words of @sb, restricted to each word's depth.
 * With @set true the allocation words are counted, otherwise the
 * deferred-clear masks.
 */
static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
{
        unsigned int idx;
        unsigned int total = 0;

        for (idx = 0; idx < sb->map_nr; idx++) {
                const struct sbitmap_word *cur = &sb->map[idx];
                unsigned int nbits = __map_depth(sb, idx);
                const unsigned long *bits = set ? &cur->word : &cur->cleared;

                total += bitmap_weight(bits, nbits);
        }

        return total;
}

/* Number of bits of @sb that are recorded in the deferred-clear masks. */
static unsigned int sbitmap_cleared(const struct sbitmap *sb)
{
        unsigned int pending = __sbitmap_weight(sb, false);

        return pending;
}

/*
 * Number of bits in use in @sb: bits set in the allocation words minus the
 * bits recorded in the deferred-clear masks.
 */
unsigned int sbitmap_weight(const struct sbitmap *sb)
{
        unsigned int set_bits = __sbitmap_weight(sb, true);
        unsigned int pending = sbitmap_cleared(sb);

        return set_bits - pending;
}
EXPORT_SYMBOL_GPL(sbitmap_weight);

/*
 * Write a summary of @sb to @m, one "name=value" line each for depth, busy,
 * cleared, bits_per_word and map_nr.
 */
void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
{
        unsigned int value;

        value = sb->depth;
        seq_printf(m, "depth=%u\n", value);

        value = sbitmap_weight(sb);
        seq_printf(m, "busy=%u\n", value);

        value = sbitmap_cleared(sb);
        seq_printf(m, "cleared=%u\n", value);

        value = 1U << sb->shift;
        seq_printf(m, "bits_per_word=%u\n", value);

        value = sb->map_nr;
        seq_printf(m, "map_nr=%u\n", value);
}
EXPORT_SYMBOL_GPL(sbitmap_show);

/*
 * Append one byte of a hex dump to @m.
 *
 * Every 16th offset starts a new line (preceded by a newline unless it is
 * offset 0) with the offset as eight hex digits and a colon. A space is
 * written before every even offset, then the byte as two hex digits.
 */
static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte)
{
        bool line_start = (offset % 16) == 0;
        bool pair_start = (offset % 2) == 0;

        if (line_start) {
                if (offset)
                        seq_putc(m, '\n');
                seq_printf(m, "%08x:", offset);
        }

        if (pair_start)
                seq_putc(m, ' ');

        seq_printf(m, "%02x", byte);
}

/*
 * Write a hex dump of the bits in use in @sb to @m.
 *
 * For each word only the first __map_depth() bits are taken, with bits that
 * have a pending deferred clear masked out. The bits of consecutive words
 * are packed back to back into bytes, lowest bit first, and each complete
 * byte is passed to emit_byte(). A trailing partial byte is emitted as is,
 * and a final newline is written if anything was emitted.
 */
void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m)
{
        u8 byte = 0;
        unsigned int byte_bits = 0;
        unsigned int offset = 0;
        int i;

        for (i = 0; i < sb->map_nr; i++) {
                unsigned long word = READ_ONCE(sb->map[i].word);
                unsigned long cleared = READ_ONCE(sb->map[i].cleared);
                unsigned int word_bits = __map_depth(sb, i);

                word &= ~cleared;

                while (word_bits > 0) {
                        /* Take as many bits as still fit into the current byte. */
                        unsigned int bits = min(8 - byte_bits, word_bits);

                        byte |= (word & (BIT(bits) - 1)) << byte_bits;
                        byte_bits += bits;
                        if (byte_bits == 8) {
                                emit_byte(m, offset, byte);
                                byte = 0;
                                byte_bits = 0;
                                offset++;
                        }
                        word >>= bits;
                        word_bits -= bits;
                }
        }
        if (byte_bits) {
                emit_byte(m, offset, byte);
                offset++;
        }
        if (offset)
                seq_putc(m, '\n');
}
EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);

/*
 * Compute the wake batch for a queue of @depth bits.
 *
 * The bitmap consists of depth >> shift full words of (1 << shift) bits plus
 * a partial word of depth & ((1 << shift) - 1) bits. Each word contributes
 * at most sbq->min_shallow_depth bits. The usable total is divided by
 * SBQ_WAIT_QUEUES and clamped to the range [1, SBQ_WAKE_BATCH].
 */
static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq,
                                        unsigned int depth)
{
        unsigned int bits_per_word = 1U << sbq->sb.shift;
        unsigned int per_word_cap;
        unsigned int full_words;
        unsigned int tail_bits;
        unsigned int usable;

        per_word_cap = min(bits_per_word, sbq->min_shallow_depth);
        full_words = depth >> sbq->sb.shift;
        tail_bits = depth & (bits_per_word - 1);

        usable = full_words * per_word_cap + min(tail_bits, per_word_cap);

        return clamp_t(unsigned int, usable / SBQ_WAIT_QUEUES, 1,
                       SBQ_WAKE_BATCH);
}

/*
 * Initialise @sbq: the embedded bitmap (always with allocation hints), the
 * wake-up accounting and the SBQ_WAIT_QUEUES wait queues.
 *
 * Returns 0 on success, the error from sbitmap_init_node(), or -ENOMEM when
 * the wait queue array cannot be allocated (the bitmap is freed again in
 * that case).
 */
int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
                            int shift, bool round_robin, gfp_t flags, int node)
{
        int err;
        int q;

        err = sbitmap_init_node(&sbq->sb, depth, shift, flags, node,
                                round_robin, true);
        if (err)
                return err;

        /* min_shallow_depth feeds the wake batch computation, so set it first. */
        sbq->min_shallow_depth = UINT_MAX;
        sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
        atomic_set(&sbq->wake_index, 0);
        atomic_set(&sbq->ws_active, 0);
        atomic_set(&sbq->completion_cnt, 0);
        atomic_set(&sbq->wakeup_cnt, 0);

        sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
        if (sbq->ws) {
                for (q = 0; q < SBQ_WAIT_QUEUES; q++)
                        init_waitqueue_head(&sbq->ws[q].wait);
                return 0;
        }

        sbitmap_free(&sbq->sb);
        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);

/*
 * Recompute the wake batch of @sbq for @depth and publish it with
 * WRITE_ONCE(), but only if it differs from the current value.
 */
static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
                                            unsigned int depth)
{
        unsigned int batch = sbq_calc_wake_batch(sbq, depth);

        if (sbq->wake_batch == batch)
                return;

        WRITE_ONCE(sbq->wake_batch, batch);
}

/*
 * Set the wake batch of @sbq from the depth available per user: the bitmap
 * depth divided by @users (rounded up), divided by SBQ_WAIT_QUEUES and
 * clamped to [1, SBQ_WAKE_BATCH]. @users must not be 0.
 */
void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
                                            unsigned int users)
{
        unsigned int per_user = DIV_ROUND_UP(sbq->sb.depth, users);
        unsigned int batch;

        batch = clamp_val(per_user / SBQ_WAIT_QUEUES,
                        1, SBQ_WAKE_BATCH);

        WRITE_ONCE(sbq->wake_batch, batch);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);

/*
 * Resize the queue to @depth: the wake batch is refreshed for the new depth
 * first, then the underlying bitmap is resized.
 */
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
{
        struct sbitmap *sb = &sbq->sb;

        sbitmap_queue_update_wake_batch(sbq, depth);
        sbitmap_resize(sb, depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_resize);

/*
 * Allocate one bit from the bitmap embedded in @sbq.
 * Returns whatever sbitmap_get() returns for that bitmap.
 */
int __sbitmap_queue_get(struct sbitmap_queue *sbq)
{
        struct sbitmap *sb = &sbq->sb;

        return sbitmap_get(sb);
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get);

/*
 * Try to claim a run of @nr_tags bits inside a single word of the bitmap,
 * starting at that word's first zero bit.
 *
 * Returns 0 if the bitmap is in round-robin mode or if no word yielded any
 * bit. Otherwise returns a mask of the bits this call actually claimed, with
 * bit 0 of the mask corresponding to bitmap position *@offset, which is
 * written before returning. Bits inside the run that were already set are
 * not part of the returned mask, so fewer than @nr_tags bits may be returned.
 */
unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
                                        unsigned int *offset)
{
        struct sbitmap *sb = &sbq->sb;
        unsigned int hint, depth;
        unsigned long index, nr;
        int i;

        /* Batched allocation is not attempted for round-robin bitmaps. */
        if (unlikely(sb->round_robin))
                return 0;

        depth = READ_ONCE(sb->depth);
        hint = update_alloc_hint_before_get(sb, depth);

        /* Start scanning at the word that contains the hint. */
        index = SB_NR_TO_INDEX(sb, hint);

        /* Visit every word once, wrapping around at the end of the map. */
        for (i = 0; i < sb->map_nr; i++) {
                struct sbitmap_word *map = &sb->map[index];
                unsigned long get_mask;
                unsigned int map_depth = __map_depth(sb, index);
                unsigned long val;

                sbitmap_deferred_clear(map);
                val = READ_ONCE(map->word);
                /*
                 * NOTE(review): this skips the word when exactly the low
                 * (map_depth - 1) bits are set, i.e. it is not a test for
                 * "all map_depth bits set". A completely full word is still
                 * rejected below because find_first_zero_bit() then returns
                 * map_depth. Confirm whether the "- 1" is intentional.
                 */
                if (val == (1UL << (map_depth - 1)) - 1)
                        goto next;

                nr = find_first_zero_bit(&val, map_depth);
                /* Only proceed if the whole run fits inside this word. */
                if (nr + nr_tags <= map_depth) {
                        atomic_long_t *ptr = (atomic_long_t *) &map->word;

                        get_mask = ((1UL << nr_tags) - 1) << nr;
                        /*
                         * Retry until the OR of get_mask is installed. On
                         * failure try_cmpxchg reloads @val with the current
                         * word, so after the loop @val is the word as it was
                         * immediately before our update.
                         */
                        while (!atomic_long_try_cmpxchg(ptr, &val,
                                                          get_mask | val))
                                ;
                        /*
                         * Keep only the bits that were clear before the
                         * update (those are the ones claimed here) and
                         * rebase them so bit 0 corresponds to @nr.
                         */
                        get_mask = (get_mask & ~val) >> nr;
                        if (get_mask) {
                                *offset = nr + (index << sb->shift);
                                update_alloc_hint_after_get(sb, depth, hint,
                                                        *offset + nr_tags - 1);
                                return get_mask;
                        }
                }
next:
                /* Jump to next index. */
                if (++index >= sb->map_nr)
                        index = 0;
        }

        return 0;
}

/*
 * Allocate one bit while limiting the search to @shallow_depth, as done by
 * sbitmap_get_shallow(). Warns once if @shallow_depth is below the minimum
 * recorded in @sbq; the allocation is attempted regardless.
 */
int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
                              unsigned int shallow_depth)
{
        struct sbitmap *sb = &sbq->sb;

        WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth);

        return sbitmap_get_shallow(sb, shallow_depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_get_shallow);

/*
 * Record @min_shallow_depth in @sbq and refresh the wake batch for the
 * bitmap's current depth.
 */
void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
                                     unsigned int min_shallow_depth)
{
        unsigned int depth;

        sbq->min_shallow_depth = min_shallow_depth;

        depth = sbq->sb.depth;
        sbitmap_queue_update_wake_batch(sbq, depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);

/*
 * Wake up to @nr waiters, visiting each of the SBQ_WAIT_QUEUES wait queues at
 * most once and starting from the queue selected by sbq->wake_index. The
 * index reached at the end of the walk is stored back into sbq->wake_index.
 *
 * Does nothing when no waiter is registered (ws_active == 0).
 */
static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
{
        int i, wake_index, woken;

        if (!atomic_read(&sbq->ws_active))
                return;

        wake_index = atomic_read(&sbq->wake_index);
        for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
                struct sbq_wait_state *ws = &sbq->ws[wake_index];

                /*
                 * Advance the index before checking the current queue.
                 * It improves fairness, by ensuring the queue doesn't
                 * need to be fully emptied before trying to wake up
                 * from the next one.
                 */
                wake_index = sbq_index_inc(wake_index);

                if (waitqueue_active(&ws->wait)) {
                        /*
                         * The return value of wake_up_nr() is used as the
                         * number of waiters woken from this queue.
                         */
                        woken = wake_up_nr(&ws->wait, nr);
                        /* Whole request satisfied by this queue: stop. */
                        if (woken == nr)
                                break;
                        /* Carry the remainder over to the next queue. */
                        nr -= woken;
                }
        }

        /* Avoid the store when the index did not move. */
        if (wake_index != atomic_read(&sbq->wake_index))
                atomic_set(&sbq->wake_index, wake_index);
}

/*
 * Account @nr completions and, once at least one full wake batch of
 * completions has accumulated beyond what has already been handed out as
 * wakeups, claim one batch and wake that many waiters.
 *
 * Returns early without accounting anything when no waiter is registered.
 */
void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
{
        unsigned int wake_batch = READ_ONCE(sbq->wake_batch);
        unsigned int wakeups;

        if (!atomic_read(&sbq->ws_active))
                return;

        atomic_add(nr, &sbq->completion_cnt);
        wakeups = atomic_read(&sbq->wakeup_cnt);

        /*
         * Try to advance wakeup_cnt by one batch. Each pass first re-checks
         * that a full batch of completions is still outstanding relative to
         * @wakeups; a failed try_cmpxchg reloads @wakeups with the current
         * counter value before the next pass. Only the caller whose cmpxchg
         * succeeds goes on to issue the wakeups for that batch.
         */
        do {
                if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch)
                        return;
        } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt,
                                     &wakeups, wakeups + wake_batch));

        __sbitmap_queue_wake_up(sbq, wake_batch);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);

/*
 * Store @tag as the allocation hint of @cpu, unless the bitmap is in
 * round-robin mode or @tag is not below the bitmap depth.
 */
static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag)
{
        if (likely(!sb->round_robin && tag < sb->depth)) {
                unsigned int *hint = per_cpu_ptr(sb->alloc_hint, cpu);

                /* Racy store, explicitly annotated as such. */
                data_race(*hint = tag);
        }
}

/*
 * Clear @nr_tags bits in one go. Bit numbers are tags[i] - @offset. Bits are
 * cleared directly in the bitmap words; consecutive tags that fall into the
 * same word are merged into one mask and cleared with a single atomic
 * and-not. Afterwards waiters are woken for @nr_tags completions and the
 * last tag is recorded as the allocation hint of the current CPU.
 *
 * tags[nr_tags - 1] is read unconditionally, so @nr_tags must be at least 1.
 */
void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
                                int *tags, int nr_tags)
{
        struct sbitmap *sb = &sbq->sb;
        unsigned long *addr = NULL;
        unsigned long mask = 0;
        int i;

        smp_mb__before_atomic();
        for (i = 0; i < nr_tags; i++) {
                const int tag = tags[i] - offset;
                unsigned long *this_addr;

                /* since we're clearing a batch, skip the deferred map */
                this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
                if (!addr) {
                        /* First tag: start accumulating for its word. */
                        addr = this_addr;
                } else if (addr != this_addr) {
                        /* Word changed: flush the accumulated mask. */
                        atomic_long_andnot(mask, (atomic_long_t *) addr);
                        mask = 0;
                        addr = this_addr;
                }
                mask |= (1UL << SB_NR_TO_BIT(sb, tag));
        }

        /* Flush whatever is still pending for the last word. */
        if (mask)
                atomic_long_andnot(mask, (atomic_long_t *) addr);

        smp_mb__after_atomic();
        sbitmap_queue_wake_up(sbq, nr_tags);
        sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
                                        tags[nr_tags - 1] - offset);
}

/*
 * Release bit @nr: mark it through the deferred-clear path, account one
 * completion for waiter wake-up, and record @nr as the allocation hint for
 * @cpu.
 */
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
                         unsigned int cpu)
{
        /*
         * Once the clear bit is set, the bit may be allocated out.
         *
         * Orders READ/WRITE on the associated instance(such as request
         * of blk_mq) by this bit for avoiding race with re-allocation,
         * and its pair is the memory barrier implied in __sbitmap_get_word.
         *
         * One invariant is that the clear bit has to be zero when the bit
         * is in use.
         */
        smp_mb__before_atomic();
        sbitmap_deferred_clear_bit(&sbq->sb, nr);

        /*
         * Pairs with the memory barrier in set_current_state() to ensure the
         * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
         * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
         * waiter. See the comment on waitqueue_active().
         */
        smp_mb__after_atomic();
        sbitmap_queue_wake_up(sbq, 1);
        sbitmap_update_cpu_hint(&sbq->sb, cpu, nr);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_clear);

/*
 * Issue a wake_up() on every wait queue of @sbq that currently has waiters,
 * visiting the queues in order starting from sbq->wake_index. The stored
 * wake index itself is not modified.
 */
void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
{
        int idx, left;

        /*
         * Pairs with the memory barrier in set_current_state() like in
         * sbitmap_queue_wake_up().
         */
        smp_mb();

        idx = atomic_read(&sbq->wake_index);
        for (left = SBQ_WAIT_QUEUES; left > 0; left--) {
                struct sbq_wait_state *ws = &sbq->ws[idx];

                idx = sbq_index_inc(idx);

                if (!waitqueue_active(&ws->wait))
                        continue;

                wake_up(&ws->wait);
        }
}
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);

void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
{
        bool first;
        int i;

        sbitmap_show(&sbq->sb, m);

        seq_puts(m, "alloc_hint={");
        first = true;
        for_each_possible_cpu(i) {
                if (!first)
                        seq_puts(m, ", ");
                first = false;
                seq_printf(m, "%u", *per_cpu_ptr(sbq->sb.alloc_hint, i));
        }
        seq_puts(m, "}\n");

        seq_printf(m, "wake_batch=%u\n", sbq->wake_batch);
        seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index));
        seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active));

        seq_puts(m, "ws={\n");
        for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
                struct sbq_wait_state *ws = &sbq->ws[i];
                seq_printf(m, "\t{.wait=%s},\n",
                           waitqueue_active(&ws->wait) ? "active" : "inactive");
        }
        seq_puts(m, "}\n");

        seq_printf(m, "round_robin=%d\n", sbq->sb.round_robin);
        seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_show);

/*
 * Attach @sbq_wait to wait queue @ws of @sbq and count it as an active
 * waiter. A wait entry that is already bound to a queue is left untouched.
 */
void sbitmap_add_wait_queue(struct sbitmap_queue *sbq,
                            struct sbq_wait_state *ws,
                            struct sbq_wait *sbq_wait)
{
        /* Already bound: nothing to do. */
        if (sbq_wait->sbq)
                return;

        sbq_wait->sbq = sbq;
        atomic_inc(&sbq->ws_active);
        add_wait_queue(&ws->wait, &sbq_wait->wait);
}
EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue);

/*
 * Unlink @sbq_wait from its wait list and, if it was bound to a queue, drop
 * that queue's active-waiter count and clear the binding.
 */
void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait)
{
        list_del_init(&sbq_wait->wait.entry);

        /* Never bound (or already released): only the unlink was needed. */
        if (!sbq_wait->sbq)
                return;

        atomic_dec(&sbq_wait->sbq->ws_active);
        sbq_wait->sbq = NULL;
}
EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue);

/*
 * Queue @sbq_wait exclusively on @ws in task state @state. The first time a
 * wait entry is used it is bound to @sbq and counted as an active waiter;
 * later calls with the same bound entry skip that accounting.
 */
void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
                             struct sbq_wait_state *ws,
                             struct sbq_wait *sbq_wait, int state)
{
        struct sbitmap_queue *bound = sbq_wait->sbq;

        if (!bound) {
                atomic_inc(&sbq->ws_active);
                sbq_wait->sbq = sbq;
        }

        prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state);
}
EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait);

/*
 * Finish waiting on @ws and, if @sbq_wait was bound to a queue, drop the
 * active-waiter count of @sbq and clear the binding.
 */
void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
                         struct sbq_wait *sbq_wait)
{
        finish_wait(&ws->wait, &sbq_wait->wait);

        /* Entry was never accounted as active: nothing to undo. */
        if (!sbq_wait->sbq)
                return;

        atomic_dec(&sbq->ws_active);
        sbq_wait->sbq = NULL;
}
EXPORT_SYMBOL_GPL(sbitmap_finish_wait);

















































    5 
    3 








    4 












































































































    1 
    1 































    3 



    2 








    1 

    1 
    1 



    1 


    1 




    1 



    1 


    1 










    2 
















    1 
   10 









































































    1 


    1 



































































































































































   11 








    9 
































   10 

    1 




   11 













    4 
    3 







    5 

   10 

    1 














































































































   10 














   11 







































































































    1 
    1 







    1 




    1 








    1 



    1 

    1 





















































































    4 



    1 



    1 


    1 





















































































































































   10 
   10 
   11 



















    6 







    4 
    3 







    6 





























    4 


    5 






















































    1 
    1 






    1 

















































    1 


















    1 









    1 









    1 






    1 


























    1 












    1 

    1 
    1 






















    2 







    2 


























    1 
    1 


    1 
    1 























































































































































    1 


























    1 

    1 

    1 


    1 

    1 

    1 




















    1 



















    1 















    1 


    1 


    1 


































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
#include <linux/iocontext.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/highmem.h>
#include <linux/blk-crypto.h>
#include <linux/xarray.h>

#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"
#include "blk-cgroup.h"

#define ALLOC_CACHE_THRESHOLD        16
#define ALLOC_CACHE_MAX                256

/*
 * Cache of free bios, kept as two lists with one counter each.
 *
 * bio_alloc_irq_cache_splice() moves the whole free_list_irq list (and its
 * count) over to free_list/nr with local interrupts disabled, and only does
 * so when free_list is empty.
 * NOTE(review): presumably the *_irq pair is the one filled from interrupt
 * context - confirm against the code that frees bios into this cache.
 */
struct bio_alloc_cache {
        struct bio                *free_list;        /* bios available for reuse */
        struct bio                *free_list_irq;        /* list spliced into free_list */
        unsigned int                nr;                /* count paired with free_list */
        unsigned int                nr_irq;                /* count paired with free_list_irq */
};

/*
 * Size classes for separately allocated bio_vec arrays. Each entry gives the
 * number of vectors in the class, the name of its slab cache and the cache
 * itself (not set by this initializer). biovec_slab() maps a requested
 * vector count to one of these entries.
 */
static struct biovec_slab {
        int nr_vecs;
        char *name;
        struct kmem_cache *slab;
} bvec_slabs[] __read_mostly = {
        { .nr_vecs = 16, .name = "biovec-16" },
        { .nr_vecs = 64, .name = "biovec-64" },
        { .nr_vecs = 128, .name = "biovec-128" },
        { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" },
};

/*
 * Map a vector count to the bvec_slabs[] size class that can hold it.
 * Counts outside 5..BIO_MAX_VECS are a caller bug and hit BUG().
 */
static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
{
        /* smaller bios use inline vecs */
        if (nr_vecs >= 5 && nr_vecs <= 16)
                return &bvec_slabs[0];
        if (nr_vecs >= 17 && nr_vecs <= 64)
                return &bvec_slabs[1];
        if (nr_vecs >= 65 && nr_vecs <= 128)
                return &bvec_slabs[2];
        if (nr_vecs >= 129 && nr_vecs <= BIO_MAX_VECS)
                return &bvec_slabs[3];

        BUG();
        return NULL;
}

/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
struct bio_set fs_bio_set;
EXPORT_SYMBOL(fs_bio_set);

/*
 * Our slab pool management
 */
/*
 * One slab cache shared by every bio_set with the same total bio size.
 * Entries are stored in the bio_slabs xarray, indexed by that size.
 */
struct bio_slab {
        struct kmem_cache *slab;        /* the backing slab cache */
        unsigned int slab_ref;                /* number of bio_sets using it */
        unsigned int slab_size;                /* object size, also the xarray index */
        /*
         * Holds "bio-<size>". Sized for "bio-" plus the ten digits of the
         * largest unsigned int plus the terminating NUL, so the name is
         * never truncated (8 bytes only fit sizes up to 999).
         */
        char name[16];
};
static DEFINE_MUTEX(bio_slab_lock);
static DEFINE_XARRAY(bio_slabs);

/*
 * Allocate a bio_slab for objects of @size bytes, create its slab cache
 * (named "bio-<size>") and publish it in the bio_slabs xarray under index
 * @size with an initial reference count of 1.
 *
 * Returns the new bio_slab, or NULL if any allocation or the xarray store
 * fails; everything set up so far is released in that case.
 */
static struct bio_slab *create_bio_slab(unsigned int size)
{
        struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL);

        if (!bslab)
                return NULL;

        /* @size is unsigned, so format it with %u rather than %d. */
        snprintf(bslab->name, sizeof(bslab->name), "bio-%u", size);
        bslab->slab = kmem_cache_create(bslab->name, size,
                        ARCH_KMALLOC_MINALIGN,
                        SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
        if (!bslab->slab)
                goto fail_alloc_slab;

        bslab->slab_ref = 1;
        bslab->slab_size = size;

        if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL)))
                return bslab;

        /* Publishing failed: tear the cache down again. */
        kmem_cache_destroy(bslab->slab);

fail_alloc_slab:
        kfree(bslab);
        return NULL;
}

/*
 * Total object size for bios of @bs: the bio itself plus the front and back
 * padding configured in the bio_set.
 */
static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
{
        unsigned int size = sizeof(struct bio);

        size += bs->front_pad;
        size += bs->back_pad;

        return size;
}

/*
 * Return the slab cache for bios of @bs's size. An existing entry gets an
 * extra reference; otherwise a new one is created. Lookup and creation are
 * serialized by bio_slab_lock. Returns NULL if creation fails.
 */
static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
{
        unsigned int size = bs_bio_slab_size(bs);
        struct bio_slab *bslab;

        mutex_lock(&bio_slab_lock);
        bslab = xa_load(&bio_slabs, size);
        if (!bslab)
                bslab = create_bio_slab(size);
        else
                bslab->slab_ref++;
        mutex_unlock(&bio_slab_lock);

        return bslab ? bslab->slab : NULL;
}

/*
 * Drop @bs's reference on its bio slab. When the last reference goes away
 * the entry is removed from bio_slabs and both the slab cache and the
 * bookkeeping structure are freed. Runs under bio_slab_lock.
 */
static void bio_put_slab(struct bio_set *bs)
{
        unsigned int key = bs_bio_slab_size(bs);
        struct bio_slab *bslab;

        mutex_lock(&bio_slab_lock);

        bslab = xa_load(&bio_slabs, key);
        if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
                goto unlock;

        /* Sanity checks: right cache, and a reference left to drop. */
        WARN_ON_ONCE(bslab->slab != bs->bio_slab);
        WARN_ON(!bslab->slab_ref);

        bslab->slab_ref--;
        if (bslab->slab_ref == 0) {
                xa_erase(&bio_slabs, key);

                kmem_cache_destroy(bslab->slab);
                kfree(bslab);
        }

unlock:
        mutex_unlock(&bio_slab_lock);
}

/*
 * Free a bio_vec array of @nr_vecs entries. Arrays of BIO_MAX_VECS entries
 * go back to the mempool @pool, arrays larger than BIO_INLINE_VECS go back
 * to their size-class slab, and anything smaller is left alone.
 */
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{
        BUG_ON(nr_vecs > BIO_MAX_VECS);

        if (nr_vecs == BIO_MAX_VECS) {
                mempool_free(bv, pool);
                return;
        }

        /* Up to BIO_INLINE_VECS entries: nothing is freed here. */
        if (nr_vecs <= BIO_INLINE_VECS)
                return;

        kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
}

/*
 * Make the first allocation restricted and don't dump info on allocation
 * failures, since we'll fall back to the mempool in case of failure.
 */
static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
{
        /* Strip direct reclaim and I/O from the caller's mask ... */
        gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
        /* ... and make the attempt quiet, non-retrying and reserve-free. */
        gfp |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;

        return gfp;
}

/*
 * Allocate a bio_vec array able to hold at least *@nr_vecs entries.
 * *@nr_vecs is rounded up to the capacity of the array actually returned.
 * Returns the array, or NULL on failure.
 */
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
                gfp_t gfp_mask)
{
        struct biovec_slab *bvs = biovec_slab(*nr_vecs);
        struct bio_vec *bvl;

        if (WARN_ON_ONCE(!bvs))
                return NULL;

        /*
         * Upgrade the nr_vecs request to take full advantage of the allocation.
         * We also rely on this in the bvec_free path.
         */
        *nr_vecs = bvs->nr_vecs;

        /* The largest class is served by the mempool directly. */
        if (*nr_vecs >= BIO_MAX_VECS)
                return mempool_alloc(pool, gfp_mask);

        /*
         * Try a slab allocation first for all smaller allocations.  If that
         * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
         * The mempool is sized to handle up to BIO_MAX_VECS entries.
         */
        bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
        if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
                return bvl;

        *nr_vecs = BIO_MAX_VECS;
        return mempool_alloc(pool, gfp_mask);
}

/*
 * Release what is attached to @bio: the blkcg group reference (when
 * CONFIG_BLK_CGROUP is enabled), the integrity payload if one is present,
 * and the crypto context. The bio itself and its bio_vec array are not
 * freed here.
 */
void bio_uninit(struct bio *bio)
{
#ifdef CONFIG_BLK_CGROUP
        /* Drop the blkg reference and clear the pointer. */
        if (bio->bi_blkg) {
                blkg_put(bio->bi_blkg);
                bio->bi_blkg = NULL;
        }
#endif
        if (bio_integrity(bio))
                bio_integrity_free(bio);

        bio_crypt_free_ctx(bio);
}
EXPORT_SYMBOL(bio_uninit);

/*
 * Free a bio that was allocated from a bio_set: release its attached
 * resources, give back its bio_vec array, then return the allocation
 * (which starts front_pad bytes before the bio) to the bio_set's mempool.
 */
static void bio_free(struct bio *bio)
{
        struct bio_set *bs = bio->bi_pool;
        void *base;

        WARN_ON_ONCE(!bs);

        bio_uninit(bio);
        bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);

        /* The pool object begins at the front padding, not at the bio. */
        base = (void *)bio - bs->front_pad;
        mempool_free(base, &bs->bio_pool);
}

/*
 * Users of this function have their own bio allocation. Subsequently,
 * they must remember to pair any call to bio_init() with bio_uninit()
 * when IO has completed, or when the bio is released.
 */
void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
              unsigned short max_vecs, blk_opf_t opf)
{
        /* Start from a clean state: target device, operation, no status. */
        bio->bi_next = NULL;
        bio->bi_bdev = bdev;
        bio->bi_opf = opf;
        bio->bi_flags = 0;
        bio->bi_ioprio = 0;
        bio->bi_write_hint = 0;
        bio->bi_status = 0;
        /* Empty iterator: no sector, no data, positioned at the start. */
        bio->bi_iter.bi_sector = 0;
        bio->bi_iter.bi_size = 0;
        bio->bi_iter.bi_idx = 0;
        bio->bi_iter.bi_bvec_done = 0;
        bio->bi_end_io = NULL;
        bio->bi_private = NULL;
#ifdef CONFIG_BLK_CGROUP
        bio->bi_blkg = NULL;
        bio->bi_issue.value = 0;
        /* A blkg association is only set up when a device is given. */
        if (bdev)
                bio_associate_blkg(bio);
#ifdef CONFIG_BLK_CGROUP_IOCOST
        bio->bi_iocost_cost = 0;
#endif
#endif
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        bio->bi_crypt_context = NULL;
#endif
#ifdef CONFIG_BLK_DEV_INTEGRITY
        bio->bi_integrity = NULL;
#endif
        bio->bi_vcnt = 0;

        /* Both counters start at 1. */
        atomic_set(&bio->__bi_remaining, 1);
        atomic_set(&bio->__bi_cnt, 1);
        bio->bi_cookie = BLK_QC_T_NONE;

        /* Caller-provided vector table and its capacity. */
        bio->bi_max_vecs = max_vecs;
        bio->bi_io_vec = table;
        /* Not owned by any bio_set. */
        bio->bi_pool = NULL;
}
EXPORT_SYMBOL(bio_init);

/**
 * bio_reset - reinitialize a bio
 * @bio:        bio to reset
 * @bdev:        block device to use the bio for
 * @opf:        operation and flags for bio
 *
 * Description:
 *   After calling bio_reset(), @bio will be in the same state as a freshly
 *   allocated bio returned by bio_alloc_bioset() - the only fields that are
 *   preserved are the ones that are initialized by bio_alloc_bioset(). See
 *   comment in struct bio.
 */
void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf)
{
        /* Release attached resources before wiping the fields. */
        bio_uninit(bio);

        /* Zero the first BIO_RESET_BYTES bytes of the bio. */
        memset(bio, 0, BIO_RESET_BYTES);
        atomic_set(&bio->__bi_remaining, 1);

        bio->bi_bdev = bdev;
        if (bdev)
                bio_associate_blkg(bio);
        bio->bi_opf = opf;
}
EXPORT_SYMBOL(bio_reset);

/*
 * Completion step for a chained bio: pass its error status up to the parent
 * (stored in bi_private) unless the parent already carries one, drop the
 * reference on @bio, and return the parent.
 */
static struct bio *__bio_chain_endio(struct bio *bio)
{
        struct bio *parent = bio->bi_private;

        if (bio->bi_status) {
                /* The first recorded error wins. */
                if (!parent->bi_status)
                        parent->bi_status = bio->bi_status;
        }

        bio_put(bio);
        return parent;
}

/* bi_end_io handler of a chained bio: finish it, then end the parent. */
static void bio_chain_endio(struct bio *bio)
{
        struct bio *parent = __bio_chain_endio(bio);

        bio_endio(parent);
}

/**
 * bio_chain - chain bio completions
 * @bio: the target bio
 * @parent: the parent bio of @bio
 *
 * The caller won't have a bi_end_io called when @bio completes - instead,
 * @parent's bi_end_io won't be called until both @parent and @bio have
 * completed; the chained bio will also be freed when it completes.
 *
 * The caller must not set bi_private or bi_end_io in @bio.
 */
void bio_chain(struct bio *bio, struct bio *parent)
{
        /* Both fields are taken over below, so they must still be unset. */
        BUG_ON(bio->bi_end_io || bio->bi_private);

        /* Completion of @bio is routed to @parent through bi_private. */
        bio->bi_private = parent;
        bio->bi_end_io = bio_chain_endio;

        /* @parent now has one more completion to wait for. */
        bio_inc_remaining(parent);
}
EXPORT_SYMBOL(bio_chain);

/**
 * bio_chain_and_submit - submit a bio after chaining it to another one
 * @prev: bio to chain and submit
 * @new: bio to chain to
 *
 * If @prev is non-NULL, chain it to @new and submit it.
 *
 * Return: @new.
 */
struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new)
{
        /* Nothing to chain: just hand back @new. */
        if (!prev)
                return new;

        bio_chain(prev, new);
        submit_bio(prev);

        return new;
}

/*
 * Allocate a new bio for @bdev with room for @nr_pages vectors and, if @bio
 * is non-NULL, chain @bio to it and submit @bio. Returns the new bio as
 * produced by bio_alloc().
 */
struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
                unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
{
        struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);

        return bio_chain_and_submit(bio, new);
}
EXPORT_SYMBOL_GPL(blk_next_bio);

/*
 * Work handler of a bio_set's rescuer: resubmit every bio queued on the
 * rescue list, taking them off one at a time under rescue_lock.
 */
static void bio_alloc_rescue(struct work_struct *work)
{
        struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
        struct bio *bio;

        for (;;) {
                spin_lock(&bs->rescue_lock);
                bio = bio_list_pop(&bs->rescue_list);
                spin_unlock(&bs->rescue_lock);

                /* List drained. */
                if (!bio)
                        return;

                /* Submission happens with the lock released. */
                submit_bio_noacct(bio);
        }
}

static void punt_bios_to_rescuer(struct bio_set *bs)
{
        struct bio_list punt, nopunt;
        struct bio *bio;

        if (WARN_ON_ONCE(!bs->rescue_workqueue))
                return;
        /*
         * In order to guarantee forward progress we must punt only bios that
         * were allocated from this bio_set; otherwise, if there was a bio on
         * there for a stacking driver higher up in the stack, processing it
         * could require allocating bios from this bio_set, and doing that from
         * our own rescuer would be bad.
         *
         * Since bio lists are singly linked, pop them all instead of trying to
         * remove from the middle of the list:
         */

        bio_list_init(&punt);
        bio_list_init(&nopunt);

        while ((bio = bio_list_pop(&current->bio_list[0])))
                bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
        current->bio_list[0] = nopunt;

        bio_list_init(&nopunt);
        while ((bio = bio_list_pop(&current->bio_list[1])))
                bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
        current->bio_list[1] = nopunt;

        spin_lock(&bs->rescue_lock);
        bio_list_merge(&bs->rescue_list, &punt);
        spin_unlock(&bs->rescue_lock);

        queue_work(bs->rescue_workqueue, &bs->rescue_work);
}

/*
 * Hand the bios collected on @cache->free_list_irq over to @cache->free_list
 * and fold the nr_irq count into nr.  The move itself runs with local
 * interrupts disabled.
 */
static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache)
{
        unsigned long flags;

        /*
         * cache->free_list must be empty: the assignment below replaces it
         * wholesale, so any bios still on it would be lost.
         */
        if (WARN_ON_ONCE(cache->free_list))
                return;

        local_irq_save(flags);
        cache->free_list = cache->free_list_irq;
        cache->free_list_irq = NULL;
        cache->nr += cache->nr_irq;
        cache->nr_irq = 0;
        local_irq_restore(flags);
}

/*
 * Try to take a bio from this CPU's allocation cache in @bs.
 *
 * If the regular free list is empty and the irq-side list holds at least
 * ALLOC_CACHE_THRESHOLD bios, that list is spliced over first.  Returns a
 * bio initialised with bio_init() and owned by @bs, or NULL when the cache
 * has nothing to offer.  @gfp is not used here.
 */
static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
                unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
                struct bio_set *bs)
{
        struct bio_alloc_cache *cache;
        struct bio *bio;

        /* get_cpu()/put_cpu() bracket every access to the per-CPU cache. */
        cache = per_cpu_ptr(bs->cache, get_cpu());
        if (!cache->free_list &&
            READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD)
                bio_alloc_irq_cache_splice(cache);

        bio = cache->free_list;
        if (!bio) {
                put_cpu();
                return NULL;
        }

        /* Unlink the head of the free list. */
        cache->free_list = bio->bi_next;
        cache->nr--;
        put_cpu();

        bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf);
        bio->bi_pool = bs;
        return bio;
}

/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @bdev:        block device to allocate the bio for (can be %NULL)
 * @nr_vecs:        number of bvecs to pre-allocate
 * @opf:        operation and flags for bio
 * @gfp_mask:   the GFP_* mask given to the slab allocator
 * @bs:                the bio_set to allocate from.
 *
 * Allocate a bio from the mempools in @bs.
 *
 * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to
 * allocate a bio.  This is due to the mempool guarantees.  To make this work,
 * callers must never allocate more than 1 bio at a time from the general pool.
 * Callers that need to allocate more than 1 bio must always submit the
 * previously allocated bio for IO before attempting to allocate a new one.
 * Failure to do so can cause deadlocks under memory pressure.
 *
 * Note that when running under submit_bio_noacct() (i.e. any block driver),
 * bios are not submitted until after you return - see the code in
 * submit_bio_noacct() that converts recursion into iteration, to prevent
 * stack overflows.
 *
 * This would normally mean allocating multiple bios under submit_bio_noacct()
 * would be susceptible to deadlocks, but we have
 * deadlock avoidance code that resubmits any blocked bios from a rescuer
 * thread.
 *
 * However, we do not guarantee forward progress for allocations from other
 * mempools. Doing multiple allocations from the same mempool under
 * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
 * for per bio allocations.
 *
 * Returns: Pointer to new bio on success, NULL on failure.
 */
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
                             blk_opf_t opf, gfp_t gfp_mask,
                             struct bio_set *bs)
{
        /* Remember the caller's mask; gfp_mask may be weakened below. */
        gfp_t saved_gfp = gfp_mask;
        struct bio *bio;
        void *p;

        /* should not use nobvec bioset for nr_vecs > 0 */
        if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0))
                return NULL;

        if (opf & REQ_ALLOC_CACHE) {
                if (bs->cache && nr_vecs <= BIO_INLINE_VECS) {
                        bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf,
                                                     gfp_mask, bs);
                        if (bio)
                                return bio;
                        /*
                         * No cached bio available, bio returned below marked with
                         * REQ_ALLOC_CACHE to participate in per-cpu alloc cache.
                         */
                } else {
                        /*
                         * No per-cpu cache in this bio_set, or more bvecs
                         * requested than fit inline: the cache cannot be
                         * used for this bio, so drop the flag.
                         */
                        opf &= ~REQ_ALLOC_CACHE;
                }
        }

        /*
         * submit_bio_noacct() converts recursion to iteration; this means if
         * we're running beneath it, any bios we allocate and submit will not be
         * submitted (and thus freed) until after we return.
         *
         * This exposes us to a potential deadlock if we allocate multiple bios
         * from the same bio_set() while running underneath submit_bio_noacct().
         * If we were to allocate multiple bios (say a stacking block driver
         * that was splitting bios), we would deadlock if we exhausted the
         * mempool's reserve.
         *
         * We solve this, and guarantee forward progress, with a rescuer
         * workqueue per bio_set. If we go to allocate and there are bios on
         * current->bio_list, we first try the allocation without
         * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
         * blocking to the rescuer workqueue before we retry with the original
         * gfp_flags.
         */
        if (current->bio_list &&
            (!bio_list_empty(&current->bio_list[0]) ||
             !bio_list_empty(&current->bio_list[1])) &&
            bs->rescue_workqueue)
                gfp_mask &= ~__GFP_DIRECT_RECLAIM;

        p = mempool_alloc(&bs->bio_pool, gfp_mask);
        /* gfp_mask != saved_gfp means the first attempt ran without reclaim. */
        if (!p && gfp_mask != saved_gfp) {
                punt_bios_to_rescuer(bs);
                gfp_mask = saved_gfp;
                p = mempool_alloc(&bs->bio_pool, gfp_mask);
        }
        if (unlikely(!p))
                return NULL;
        /*
         * With REQ_ALLOC_CACHE cleared, bio_put() releases this bio through
         * bio_free() instead of parking it in the per-cpu cache.
         * NOTE(review): presumably "not saturated" means the mempool reserve
         * is not full - confirm against mempool_is_saturated().
         */
        if (!mempool_is_saturated(&bs->bio_pool))
                opf &= ~REQ_ALLOC_CACHE;

        /* The bio itself starts front_pad bytes into the pool element. */
        bio = p + bs->front_pad;
        if (nr_vecs > BIO_INLINE_VECS) {
                struct bio_vec *bvl = NULL;

                /* Same two-step scheme as for the bio: weak try, punt, retry. */
                bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
                if (!bvl && gfp_mask != saved_gfp) {
                        punt_bios_to_rescuer(bs);
                        gfp_mask = saved_gfp;
                        bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
                }
                if (unlikely(!bvl))
                        goto err_free;

                bio_init(bio, bdev, bvl, nr_vecs, opf);
        } else if (nr_vecs) {
                /* Inline bvecs: capacity is BIO_INLINE_VECS, not nr_vecs. */
                bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf);
        } else {
                bio_init(bio, bdev, NULL, 0, opf);
        }

        bio->bi_pool = bs;
        return bio;

err_free:
        /* bvec allocation failed: give the pool element back. */
        mempool_free(p, &bs->bio_pool);
        return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);

/**
 * bio_kmalloc - kmalloc a bio
 * @nr_vecs:        number of bio_vecs to allocate
 * @gfp_mask:   the GFP_* mask given to the slab allocator
 *
 * Use kmalloc to allocate a bio (including bvecs).  The bio must be initialized
 * using bio_init() before use.  To free a bio returned from this function use
 * kfree() after calling bio_uninit().  A bio returned from this function can
 * be reused by calling bio_uninit() before calling bio_init() again.
 *
 * Note that unlike bio_alloc() or bio_alloc_bioset(), allocations from this
 * function are not backed by a mempool and can fail.  Do not use this
 * function for allocations in the file system I/O path.
 *
 * Requests for more than UIO_MAXIOV bvecs are rejected.
 *
 * Returns: Pointer to new bio on success, NULL on failure.
 */
struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
{
        struct bio *bio;
        size_t bytes;

        if (nr_vecs > UIO_MAXIOV)
                return NULL;

        /* @bio only supplies the type to struct_size(); it is never read. */
        bytes = struct_size(bio, bi_inline_vecs, nr_vecs);
        return kmalloc(bytes, gfp_mask);
}
EXPORT_SYMBOL(bio_kmalloc);

void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
{
        struct bio_vec bv;
        struct bvec_iter iter;

        __bio_for_each_segment(bv, bio, iter, start)
                memzero_bvec(&bv);
}
EXPORT_SYMBOL(zero_fill_bio_iter);

/**
 * bio_truncate - shrink the bio to @new_size bytes
 * @bio:        the bio to be truncated
 * @new_size:        new size for truncating the bio
 *
 * Description:
 *   Nothing happens when @new_size is not smaller than the bio's current
 *   size.  Otherwise the bio is cut down to @new_size and, if bio_op(bio) is
 *   REQ_OP_READ, the data beyond @new_size is zeroed.  This function should
 *   only be used for handling corner cases, such as bio eod.
 */
static void bio_truncate(struct bio *bio, unsigned new_size)
{
        struct bvec_iter iter;
        struct bio_vec bv;
        unsigned int seen = 0;
        bool past_cut = false;

        if (new_size >= bio->bi_iter.bi_size)
                return;

        if (bio_op(bio) == REQ_OP_READ) {
                bio_for_each_segment(bv, bio, iter) {
                        if (seen + bv.bv_len > new_size) {
                                /*
                                 * The first segment reaching past the cut
                                 * keeps its leading bytes; every later one
                                 * is zeroed in full.
                                 */
                                unsigned int skip =
                                        past_cut ? 0 : new_size - seen;

                                zero_user(bv.bv_page, bv.bv_offset + skip,
                                          bv.bv_len - skip);
                                past_cut = true;
                        }
                        seen += bv.bv_len;
                }
        }

        /*
         * Don't touch bvec table here and make it really immutable, since
         * fs bio user has to retrieve all pages via bio_for_each_segment_all
         * in its .end_bio() callback.
         *
         * It is enough to truncate bio by updating .bi_size since we can make
         * correct bvec with the updated .bi_size for drivers.
         */
        bio->bi_iter.bi_size = new_size;
}

/**
 * guard_bio_eod - truncate a BIO to fit the block device
 * @bio:        bio to truncate
 *
 * This allows us to do IO even on the odd last sectors of a device, even if the
 * block size is some multiple of the physical sector size.
 *
 * A bio that starts inside the device but runs past its end is truncated to
 * the size of the device via bio_truncate().  Truly out-of-range accesses
 * will turn into actual I/O errors, this only handles the "we need to be able
 * to do I/O at the final sector" case.  Nothing is done when the device
 * reports zero sectors.
 */
void guard_bio_eod(struct bio *bio)
{
        sector_t dev_sectors = bdev_nr_sectors(bio->bi_bdev);
        sector_t room;

        if (!dev_sectors)
                return;

        /*
         * If the *whole* IO is past the end of the device,
         * let it through, and the IO layer will turn it into
         * an EIO.
         */
        if (unlikely(bio->bi_iter.bi_sector >= dev_sectors))
                return;

        /* Sectors available between the start of the bio and end of device. */
        room = dev_sectors - bio->bi_iter.bi_sector;
        if (likely((bio->bi_iter.bi_size >> 9) <= room))
                return;

        bio_truncate(bio, room << 9);
}

/*
 * Free up to @nr bios from the head of @cache->free_list, keeping @cache->nr
 * in step with the list.
 *
 * Returns the number of bios actually freed; this is smaller than @nr only
 * when the list ran out first.  A budget of zero frees nothing.
 */
static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache,
                                   unsigned int nr)
{
        unsigned int i = 0;
        struct bio *bio;

        /*
         * Test the budget before taking a bio off the list.  Checking it only
         * after a bio has been freed would let nr == 0 drain the entire list,
         * e.g. when a caller passes on a budget that is already used up.
         */
        while (i < nr && (bio = cache->free_list) != NULL) {
                cache->free_list = bio->bi_next;
                cache->nr--;
                bio_free(bio);
                i++;
        }
        return i;
}

/*
 * Prune @cache with a budget of @nr bios.  The regular free list is pruned
 * first; if it is empty afterwards, the irq-side list is spliced over and
 * pruned with whatever is left of the budget.
 */
static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
                                  unsigned int nr)
{
        unsigned int remaining = nr - __bio_alloc_cache_prune(cache, nr);

        if (READ_ONCE(cache->free_list))
                return;

        bio_alloc_irq_cache_splice(cache);
        __bio_alloc_cache_prune(cache, remaining);
}

/*
 * CPU hotplug callback for a bio_set registered through its cpuhp_dead node:
 * prune the allocation cache that belongs to @cpu with the largest possible
 * budget.  Always returns 0.
 */
static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
        struct bio_set *bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead);

        if (!bs->cache)
                return 0;

        bio_alloc_cache_prune(per_cpu_ptr(bs->cache, cpu), -1U);
        return 0;
}

static void bio_alloc_cache_destroy(struct bio_set *bs)
{
        int cpu;

        if (!bs->cache)
                return;

        cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
        for_each_possible_cpu(cpu) {
                struct bio_alloc_cache *cache;

                cache = per_cpu_ptr(bs->cache, cpu);
                bio_alloc_cache_prune(cache, -1U);
        }
        free_percpu(bs->cache);
        bs->cache = NULL;
}

/*
 * Release @bio into the current CPU's allocation cache of its bio_set, or
 * free it with bio_free() when the cache is over its limit or the calling
 * context is neither task nor hardirq.
 */
static inline void bio_put_percpu_cache(struct bio *bio)
{
        struct bio_alloc_cache *cache;

        cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
        /* Both lists count towards the ALLOC_CACHE_MAX limit. */
        if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX)
                goto out_free;

        if (in_task()) {
                /* Task context: push onto the regular free list. */
                bio_uninit(bio);
                bio->bi_next = cache->free_list;
                /* Not necessary but helps not to iopoll already freed bios */
                bio->bi_bdev = NULL;
                cache->free_list = bio;
                cache->nr++;
        } else if (in_hardirq()) {
                /* Hardirq context: push onto the separate irq-side list. */
                lockdep_assert_irqs_disabled();

                bio_uninit(bio);
                bio->bi_next = cache->free_list_irq;
                cache->free_list_irq = bio;
                cache->nr_irq++;
        } else {
                /* Any other context: do not cache, just free. */
                goto out_free;
        }
        put_cpu();
        return;
out_free:
        put_cpu();
        bio_free(bio);
}

/**
 * bio_put - release a reference to a bio
 * @bio:   bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it:
 *   bios flagged REQ_ALLOC_CACHE go back through the per-cpu cache path,
 *   all others are released with bio_free().
 */
void bio_put(struct bio *bio)
{
        /* Only BIO_REFFED bios use the __bi_cnt reference count. */
        if (unlikely(bio_flagged(bio, BIO_REFFED))) {
                BUG_ON(!atomic_read(&bio->__bi_cnt));
                if (!atomic_dec_and_test(&bio->__bi_cnt))
                        return;
        }

        if (!(bio->bi_opf & REQ_ALLOC_CACHE)) {
                bio_free(bio);
                return;
        }
        bio_put_percpu_cache(bio);
}
EXPORT_SYMBOL(bio_put);

/*
 * Copy the clonable state of @bio_src into @bio: mark @bio BIO_CLONED, copy
 * the iterator, write hint and I/O priority and, when @bio has a block
 * device, carry over BIO_REMAPPED (same device only) and the blkg
 * association.  Crypt and integrity contexts are cloned last.
 *
 * Returns 0 on success or -ENOMEM if cloning the crypt or integrity context
 * fails.
 */
static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
{
        bio_set_flag(bio, BIO_CLONED);
        bio->bi_iter = bio_src->bi_iter;
        bio->bi_write_hint = bio_src->bi_write_hint;
        bio->bi_ioprio = bio_src->bi_ioprio;

        if (bio->bi_bdev) {
                bool same_dev = bio->bi_bdev == bio_src->bi_bdev;

                if (same_dev && bio_flagged(bio_src, BIO_REMAPPED))
                        bio_set_flag(bio, BIO_REMAPPED);
                bio_clone_blkg_association(bio, bio_src);
        }

        if (bio_crypt_clone(bio, bio_src, gfp) < 0)
                return -ENOMEM;

        if (!bio_integrity(bio_src))
                return 0;
        if (bio_integrity_clone(bio, bio_src, gfp) < 0)
                return -ENOMEM;
        return 0;
}

/**
 * bio_alloc_clone - clone a bio that shares the original bio's biovec
 * @bdev: block_device to clone onto
 * @bio_src: bio to clone from
 * @gfp: allocation priority
 * @bs: bio_set to allocate from
 *
 * Allocate a new bio that is a clone of @bio_src. The caller owns the returned
 * bio, but not the actual data it points to.
 *
 * The clone is allocated without bvecs of its own and points at @bio_src's
 * bi_io_vec, so that bvec array has to stay valid for as long as the clone
 * is in use.
 *
 * Return: the clone, or %NULL if the allocation or the cloning step failed.
 */
struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
                gfp_t gfp, struct bio_set *bs)
{
        struct bio *clone = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs);

        if (!clone)
                return NULL;

        if (__bio_clone(clone, bio_src, gfp) < 0) {
                bio_put(clone);
                return NULL;
        }

        /* Share, rather than copy, the source's bvec array. */
        clone->bi_io_vec = bio_src->bi_io_vec;
        return clone;
}
EXPORT_SYMBOL(bio_alloc_clone);

/**
 * bio_init_clone - clone a bio that shares the original bio's biovec
 * @bdev: block_device to clone onto
 * @bio: bio to clone into
 * @bio_src: bio to clone from
 * @gfp: allocation priority
 *
 * Initialize a new bio in caller provided memory that is a clone of @bio_src.
 * The caller owns the returned bio, but not the actual data it points to.
 *
 * The caller must ensure that @bio_src is not freed before @bio.
 *
 * Return: 0 on success; otherwise the error from the cloning step, in which
 * case @bio has already been passed to bio_uninit().
 */
int bio_init_clone(struct block_device *bdev, struct bio *bio,
                struct bio *bio_src, gfp_t gfp)
{
        int err;

        bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf);

        err = __bio_clone(bio, bio_src, gfp);
        if (!err)
                return 0;

        bio_uninit(bio);
        return err;
}
EXPORT_SYMBOL(bio_init_clone);

/**
 * bio_full - check if the bio is full
 * @bio:        bio to check
 * @len:        length of one segment to be added
 *
 * Return true if @bio is full and one segment with @len bytes can't be
 * added to the bio, otherwise return false.  The bio counts as full when
 * all of its bvec slots are used or when adding @len bytes would push
 * bi_size past UINT_MAX.
 */
static inline bool bio_full(struct bio *bio, unsigned len)
{
        return bio->bi_vcnt >= bio->bi_max_vecs ||
               bio->bi_iter.bi_size > UINT_MAX - len;
}

/*
 * Try to grow @bv by the @len bytes found at @page + @off.
 *
 * The merge needs the new data to start at the physical address right after
 * the end of @bv, and must pass the Xen and zone-device pgmap checks.  Once
 * those pass, *@same_page reports whether the new data starts in the page
 * holding the last byte of @bv; it is written even if the merge is refused
 * by the cross-page checks that follow.
 *
 * Returns true, with bv->bv_len extended, if the merge was done.
 */
static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
                unsigned int len, unsigned int off, bool *same_page)
{
        size_t bv_end = bv->bv_offset + bv->bv_len;
        phys_addr_t last_byte = page_to_phys(bv->bv_page) + bv_end - 1;
        phys_addr_t new_page_addr = page_to_phys(page);
        bool same;

        if (last_byte + 1 != new_page_addr + off)
                return false;
        if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
                return false;
        if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
                return false;

        same = (last_byte & PAGE_MASK) == new_page_addr;
        *same_page = same;

        /*
         * Crossing into another page: never allowed with KMSAN, and
         * otherwise only if the struct pages line up as well.
         */
        if (!same &&
            (IS_ENABLED(CONFIG_KMSAN) ||
             bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE))
                return false;

        bv->bv_len += len;
        return true;
}

/*
 * Try to merge a page into a segment, while obeying the hardware segment
 * size limit.  This is not for normal read/write bios, but for passthrough
 * or Zone Append operations that we can't split.
 *
 * The merge is refused if the merged segment would straddle the queue's
 * segment boundary or exceed its maximum segment size; otherwise the
 * decision is left to bvec_try_merge_page().
 */
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
                struct page *page, unsigned len, unsigned offset,
                bool *same_page)
{
        unsigned long boundary = queue_segment_boundary(q);
        phys_addr_t first = page_to_phys(bv->bv_page) + bv->bv_offset;
        phys_addr_t last = page_to_phys(page) + offset + len - 1;

        if ((first | boundary) != (last | boundary) ||
            len > queue_max_segment_size(q) - bv->bv_len)
                return false;

        return bvec_try_merge_page(bv, page, len, offset, same_page);
}

/**
 * bio_add_hw_page - attempt to add a page to a bio with hw constraints
 * @q: the target queue
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 * @max_sectors: maximum number of sectors that can be added
 * @same_page: return if the segment has been merged inside the same page
 *
 * Add a page to a bio while respecting the hardware max_sectors, max_segment
 * and gap limitations.  @len is first clamped to @max_sectors and to the
 * queue's maximum segment size.
 *
 * Return: the number of bytes added (the clamped length), or 0 if nothing
 * could be added.
 */
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
                struct page *page, unsigned int len, unsigned int offset,
                unsigned int max_sectors, bool *same_page)
{
        unsigned int max_size = max_sectors << SECTOR_SHIFT;

        if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
                return 0;

        len = min3(len, max_size, queue_max_segment_size(q));
        if (len > max_size - bio->bi_iter.bi_size)
                return 0;

        if (bio->bi_vcnt > 0) {
                struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1];

                /* Cheapest outcome: the data extends the last bvec. */
                if (bvec_try_merge_hw_page(q, last, page, len, offset,
                                same_page))
                        goto account;

                if (bio->bi_vcnt >=
                    min(bio->bi_max_vecs, queue_max_segments(q)))
                        return 0;

                /*
                 * If the queue doesn't support SG gaps and adding this segment
                 * would create a gap, disallow it.
                 */
                if (bvec_gap_to_prev(&q->limits, last, offset))
                        return 0;
        }

        /* No merge: the data goes into a bvec of its own. */
        bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset);
        bio->bi_vcnt++;
account:
        bio->bi_iter.bi_size += len;
        return len;
}

/**
 * bio_add_pc_page        - attempt to add page to passthrough bio
 * @q: the target queue
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist. This can fail for a
 * number of reasons, such as the bio being full or target block device
 * limitations. The target block device must allow bio's up to PAGE_SIZE,
 * so it is always possible to add a single page to an empty bio.
 *
 * This should only be used by passthrough bios.
 *
 * Return: what bio_add_hw_page() returns when limited to the queue's
 * max_hw_sectors.
 */
int bio_add_pc_page(struct request_queue *q, struct bio *bio,
                struct page *page, unsigned int len, unsigned int offset)
{
        unsigned int max_sectors = queue_max_hw_sectors(q);
        bool same_page = false;

        return bio_add_hw_page(q, bio, page, len, offset, max_sectors,
                        &same_page);
}
EXPORT_SYMBOL(bio_add_pc_page);

/**
 * bio_add_zone_append_page - attempt to add page to zone-append bio
 * @bio: destination bio
 * @page: page to add
 * @len: vec entry length
 * @offset: vec entry offset
 *
 * Attempt to add a page to the bio_vec maplist of a bio that will be submitted
 * for a zone-append request. This can fail for a number of reasons, such as the
 * bio being full or the target block device is not a zoned block device or
 * other limitations of the target block device. The target block device must
 * allow bio's up to PAGE_SIZE, so it is always possible to add a single page
 * to an empty bio.
 *
 * A bio that is not REQ_OP_ZONE_APPEND, or whose device is not zoned,
 * triggers a one-time warning and is rejected.
 *
 * Returns: number of bytes added to the bio, or 0 in case of a failure.
 */
int bio_add_zone_append_page(struct bio *bio, struct page *page,
                             unsigned int len, unsigned int offset)
{
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        bool same_page = false;
        unsigned int max_sectors;

        if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
                return 0;

        if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev)))
                return 0;

        max_sectors = queue_max_zone_append_sectors(q);
        return bio_add_hw_page(q, bio, page, len, offset, max_sectors,
                               &same_page);
}
EXPORT_SYMBOL_GPL(bio_add_zone_append_page);

/**
 * __bio_add_page - add page(s) to a bio in a new segment
 * @bio: destination bio
 * @page: start page to add
 * @len: length of the data to add, may cross pages
 * @off: offset of the data relative to @page, may cross pages
 *
 * Add the data at @page + @off to @bio as a new bvec.  The caller must ensure
 * that @bio has space for another bvec; a cloned or full bio only triggers a
 * one-time warning here, the bvec is still written.
 */
void __bio_add_page(struct bio *bio, struct page *page,
                unsigned int len, unsigned int off)
{
        struct bio_vec *slot;

        WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
        WARN_ON_ONCE(bio_full(bio, len));

        /* Fill the first unused bvec slot, then account for it. */
        slot = &bio->bi_io_vec[bio->bi_vcnt];
        bvec_set_page(slot, page, len, off);
        bio->bi_vcnt++;
        bio->bi_iter.bi_size += len;
}
EXPORT_SYMBOL_GPL(__bio_add_page);

/**
 *        bio_add_page        -        attempt to add page(s) to bio
 *        @bio: destination bio
 *        @page: start page to add
 *        @len: vec entry length, may cross pages
 *        @offset: vec entry offset relative to @page, may cross pages
 *
 *        Attempt to add page(s) to the bio_vec maplist.  The data is merged
 *        into the last bvec when possible and otherwise placed in a new one.
 *        This fails if it's a cloned bio, if adding @len bytes would push
 *        bi_size past UINT_MAX, or if the data cannot be merged and
 *        bio->bi_vcnt has reached bio->bi_max_vecs.
 *
 *        Return: @len on success, 0 on failure.
 */
int bio_add_page(struct bio *bio, struct page *page,
                 unsigned int len, unsigned int offset)
{
        bool same_page = false;

        if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
                return 0;
        if (bio->bi_iter.bi_size > UINT_MAX - len)
                return 0;

        if (bio->bi_vcnt > 0) {
                struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1];

                if (bvec_try_merge_page(last, page, len, offset, &same_page)) {
                        bio->bi_iter.bi_size += len;
                        return len;
                }
        }

        if (bio->bi_vcnt >= bio->bi_max_vecs)
                return 0;

        __bio_add_page(bio, page, len, offset);
        return len;
}
EXPORT_SYMBOL(bio_add_page);

/*
 * Add @len bytes starting at byte @off of @folio to @bio as a new bvec, via
 * __bio_add_page().  There is no failure return: a @len or @off larger than
 * UINT_MAX only triggers a one-time warning.
 */
void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
                          size_t off)
{
        struct page *head = &folio->page;

        WARN_ON_ONCE(len > UINT_MAX);
        WARN_ON_ONCE(off > UINT_MAX);

        __bio_add_page(bio, head, len, off);
}
EXPORT_SYMBOL_GPL(bio_add_folio_nofail);

/**
 * bio_add_folio - Attempt to add part of a folio to a bio.
 * @bio: BIO to add to.
 * @folio: Folio to add.
 * @len: How many bytes from the folio to add.
 * @off: First byte in this folio to add.
 *
 * Filesystems that use folios can call this function instead of calling
 * bio_add_page() for each page in the folio.  If @off is bigger than
 * PAGE_SIZE, this function can create a bio_vec that starts in a page
 * after the bv_page.  BIOs do not support folios that are 4GiB or larger:
 * a @len or @off above UINT_MAX is rejected.
 *
 * Return: Whether the addition was successful.
 */
bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
                   size_t off)
{
        int added;

        if (len > UINT_MAX || off > UINT_MAX)
                return false;

        added = bio_add_page(bio, &folio->page, len, off);
        return added > 0;
}
EXPORT_SYMBOL(bio_add_folio);

/*
 * Release every page referenced by @bio, folio by folio.  With @mark_dirty
 * set, each folio is locked, marked dirty and unlocked before its pages are
 * released.
 */
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{
        struct folio_iter fi;

        bio_for_each_folio_all(fi, bio) {
                /* Index of the first and last page of the folio in use. */
                size_t first = fi.offset / PAGE_SIZE;
                size_t last = (fi.offset + fi.length - 1) / PAGE_SIZE;
                size_t remaining = last - first + 1;
                struct page *page;

                if (mark_dirty) {
                        folio_lock(fi.folio);
                        folio_mark_dirty(fi.folio);
                        folio_unlock(fi.folio);
                }

                page = folio_page(fi.folio, first);
                do {
                        bio_release_page(bio, page++);
                } while (--remaining != 0);
        }
}
EXPORT_SYMBOL_GPL(__bio_release_pages);

/*
 * Point @bio straight at the bvec array of @iter instead of copying it.  The
 * bio takes the iterator's segment count, offset into the first segment and
 * byte count (capped at the zone-append limit for REQ_OP_ZONE_APPEND bios)
 * and is flagged BIO_CLONED.  A bio that has bvec slots of its own triggers
 * a one-time warning.
 */
void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
        size_t size = iov_iter_count(iter);

        WARN_ON_ONCE(bio->bi_max_vecs);

        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                struct request_queue *q = bdev_get_queue(bio->bi_bdev);
                size_t limit = queue_max_zone_append_sectors(q);

                limit <<= SECTOR_SHIFT;
                if (size > limit)
                        size = limit;
        }

        bio->bi_io_vec = (struct bio_vec *)iter->bvec;
        bio->bi_vcnt = iter->nr_segs;
        bio->bi_iter.bi_size = size;
        bio->bi_iter.bi_bvec_done = iter->iov_offset;
        bio_set_flag(bio, BIO_CLONED);
}

/*
 * Add one extracted page to @bio, merging it into the last bvec if possible
 * and appending a new bvec otherwise.  When the data was merged within the
 * same page, the extra page reference is dropped via bio_release_page().
 *
 * Returns 0, or -EIO (with a one-time warning) if adding @len bytes would
 * push bi_size past UINT_MAX.
 */
static int bio_iov_add_page(struct bio *bio, struct page *page,
                unsigned int len, unsigned int offset)
{
        bool same_page = false;

        if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len))
                return -EIO;

        if (bio->bi_vcnt > 0) {
                struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1];

                if (bvec_try_merge_page(last, page, len, offset, &same_page)) {
                        bio->bi_iter.bi_size += len;
                        if (same_page)
                                bio_release_page(bio, page);
                        return 0;
                }
        }

        __bio_add_page(bio, page, len, offset);
        return 0;
}

/*
 * Add one extracted page to a zone-append @bio under the queue's zone-append
 * size limit.  Anything short of adding all @len bytes counts as a failure.
 * When the data was merged within the same page, the extra page reference is
 * dropped via bio_release_page().
 *
 * Returns 0 on success, -EINVAL otherwise.
 */
static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
                unsigned int len, unsigned int offset)
{
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        bool same_page = false;
        int added;

        added = bio_add_hw_page(q, bio, page, len, offset,
                        queue_max_zone_append_sectors(q), &same_page);
        if (added != len)
                return -EINVAL;

        if (same_page)
                bio_release_page(bio, page);
        return 0;
}

/* Number of page pointers that fit into the memory of one struct bio_vec. */
#define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))

/**
 * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
 * @bio: bio to add pages to
 * @iter: iov iterator describing the region to be mapped
 *
 * Extracts pages from *iter and appends them to @bio's bvec array.  The pages
 * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag.
 * For a multi-segment *iter, this function only adds pages from the next
 * non-empty segment of the iov iterator.
 *
 * Return: 0 on success; the negative value reported by the page extraction,
 * or -EFAULT if it yielded nothing or nothing was left after trimming to the
 * logical block size; or the error from adding a page to a zone-append bio.
 */
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
        iov_iter_extraction_t extraction_flags = 0;
        /* Both start out as the number of unused bvec slots in @bio. */
        unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
        unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
        struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
        /* The temporary page array lives in the unused bvec slots. */
        struct page **pages = (struct page **)bv;
        ssize_t size, left;
        unsigned len, i = 0;
        size_t offset;
        int ret = 0;

        /*
         * Move page array up in the allocated memory for the bio vecs as far as
         * possible so that we can start filling biovecs from the beginning
         * without overwriting the temporary page array.
         */
        BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
        pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);

        if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
                extraction_flags |= ITER_ALLOW_P2PDMA;

        /*
         * Each segment in the iov is required to be a block size multiple.
         * However, we may not be able to get the entire segment if it spans
         * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
         * result to ensure the bio's total size is correct. The remainder of
         * the iov data will be picked up in the next bio iteration.
         */
        size = iov_iter_extract_pages(iter, &pages,
                                      UINT_MAX - bio->bi_iter.bi_size,
                                      nr_pages, extraction_flags, &offset);
        if (unlikely(size <= 0))
                return size ? size : -EFAULT;

        /* From here on nr_pages is the number of pages actually extracted. */
        nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);

        if (bio->bi_bdev) {
                /* Give back the tail that is not a whole logical block. */
                size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
                iov_iter_revert(iter, trim);
                size -= trim;
        }

        if (unlikely(!size)) {
                ret = -EFAULT;
                goto out;
        }

        /* Add the extracted data page by page; only the first has an offset. */
        for (left = size, i = 0; left > 0; left -= len, i++) {
                struct page *page = pages[i];

                len = min_t(size_t, PAGE_SIZE - offset, left);
                if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                        ret = bio_iov_add_zone_append_page(bio, page, len,
                                        offset);
                        if (ret)
                                break;
                } else
                        bio_iov_add_page(bio, page, len, offset);

                offset = 0;
        }

        /* Hand back to the iterator whatever did not make it into the bio. */
        iov_iter_revert(iter, left);
out:
        /* Release the extracted pages the loop above did not get through. */
        while (i < nr_pages)
                bio_release_page(bio, pages[i++]);

        return ret;
}

/**
 * bio_iov_iter_get_pages - add user or kernel pages to a bio
 * @bio: bio to add pages to
 * @iter: iov iterator describing the region to be added
 *
 * This takes either an iterator pointing to user memory, or one pointing to
 * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
 * map them into the kernel. On IO completion, the caller should put those
 * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided
 * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs
 * to ensure the bvecs and pages stay referenced until the submitted I/O is
 * completed by a call to ->ki_complete() or returns with an error other than
 * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF
 * on IO completion. If it isn't, then pages should be released.
 *
 * The function tries, but does not guarantee, to pin as many pages as
 * fit into the bio, or are requested in @iter, whatever is smaller. If
 * MM encounters an error pinning the requested pages, it stops. Error
 * is returned only if 0 pages could be pinned.
 */
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
        int err = 0;

        /* Cloned bios are rejected outright */
        if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
                return -EIO;

        /* BVEC iterators are attached as-is; just consume what was taken */
        if (iov_iter_is_bvec(iter)) {
                bio_iov_bvec_set(bio, iter);
                iov_iter_advance(iter, bio->bi_iter.bi_size);
                return 0;
        }

        if (iov_iter_extract_will_pin(iter))
                bio_set_flag(bio, BIO_PAGE_PINNED);

        /*
         * Keep adding pages until a batch fails, the iterator is drained
         * or the bio has no room left.
         */
        for (;;) {
                err = __bio_iov_iter_get_pages(bio, iter);
                if (err || !iov_iter_count(iter) || bio_full(bio, 0))
                        break;
        }

        /* A partially filled bio counts as success */
        if (bio->bi_vcnt)
                return 0;
        return err;
}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);

static void submit_bio_wait_endio(struct bio *bio)
{
        /* bi_private holds the completion installed by submit_bio_wait() */
        struct completion *waiter = bio->bi_private;

        complete(waiter);
}

/**
 * submit_bio_wait - submit a bio, and wait until it completes
 * @bio: The &struct bio which describes the I/O
 *
 * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
 * bio_endio() on failure.
 *
 * WARNING: Unlike the usual submit_bio() usage, this function does not
 * consume the bio reference. The caller must drop the reference on its
 * own.
 */
int submit_bio_wait(struct bio *bio)
{
        DECLARE_COMPLETION_ONSTACK_MAP(done,
                        bio->bi_bdev->bd_disk->lockdep_map);

        bio->bi_private = &done;
        bio->bi_end_io = submit_bio_wait_endio;
        bio->bi_opf |= REQ_SYNC;
        submit_bio(bio);
        blk_wait_io(&done);

        return blk_status_to_errno(bio->bi_status);
}
EXPORT_SYMBOL(submit_bio_wait);

static void bio_wait_end_io(struct bio *bio)
{
        struct completion *waiter = bio->bi_private;

        /* Signal the waiter first, then drop a reference on the bio */
        complete(waiter);
        bio_put(bio);
}

/*
 * bio_await_chain - ends @bio and waits for every chained bio to complete
 */
void bio_await_chain(struct bio *bio)
{
        DECLARE_COMPLETION_ONSTACK_MAP(done,
                        bio->bi_bdev->bd_disk->lockdep_map);

        bio->bi_private = &done;
        bio->bi_end_io = bio_wait_end_io;
        bio_endio(bio);
        blk_wait_io(&done);
}

void __bio_advance(struct bio *bio, unsigned bytes)
{
        struct bvec_iter *pos = &bio->bi_iter;

        /* Integrity metadata, if attached, is advanced first */
        if (bio_integrity(bio))
                bio_integrity_advance(bio, bytes);

        /* Then the crypt state, and finally the data iterator itself */
        bio_crypt_advance(bio, bytes);
        bio_advance_iter(bio, pos, bytes);
}
EXPORT_SYMBOL(__bio_advance);

void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
                        struct bio *src, struct bvec_iter *src_iter)
{
        while (src_iter->bi_size && dst_iter->bi_size) {
                struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
                struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
                unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
                void *src_buf = bvec_kmap_local(&src_bv);
                void *dst_buf = bvec_kmap_local(&dst_bv);

                memcpy(dst_buf, src_buf, bytes);

                kunmap_local(dst_buf);
                kunmap_local(src_buf);

                bio_advance_iter_single(src, src_iter, bytes);
                bio_advance_iter_single(dst, dst_iter, bytes);
        }
}
EXPORT_SYMBOL(bio_copy_data_iter);

/**
 * bio_copy_data - copy contents of data buffers from one bio to another
 * @src: source bio
 * @dst: destination bio
 *
 * Stops when it reaches the end of either @src or @dst - that is, copies
 * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
 */
void bio_copy_data(struct bio *dst, struct bio *src)
{
        struct bvec_iter src_iter = src->bi_iter;
        struct bvec_iter dst_iter = dst->bi_iter;

        bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
}
EXPORT_SYMBOL(bio_copy_data);

void bio_free_pages(struct bio *bio)
{
        struct bio_vec *bvec;
        struct bvec_iter_all iter_all;

        bio_for_each_segment_all(bvec, bio, iter_all)
                __free_page(bvec->bv_page);
}
EXPORT_SYMBOL(bio_free_pages);

/*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
 *
 * The problem is that we cannot run folio_mark_dirty() from interrupt context
 * because the required locks are not interrupt-safe.  So what we can do is to
 * mark the pages dirty _before_ performing IO.  And in interrupt context,
 * check that the pages are still dirty.   If so, fine.  If not, redirty them
 * in process context.
 *
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages().  This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
 * But other code (eg, flusher threads) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
 * deferred bio dirtying paths.
 */

/*
 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
 */
void bio_set_pages_dirty(struct bio *bio)
{
        struct folio_iter fi;

        bio_for_each_folio_all(fi, bio) {
                folio_lock(fi.folio);
                folio_mark_dirty(fi.folio);
                folio_unlock(fi.folio);
        }
}
EXPORT_SYMBOL_GPL(bio_set_pages_dirty);

/*
 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
 * If they are, then fine.  If, however, some pages are clean then they must
 * have been written out during the direct-IO read.  So we take another ref on
 * the BIO and re-dirty the pages in process context.
 *
 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
 * here on.  It will unpin each page and will run one bio_put() against the
 * BIO.
 */

static void bio_dirty_fn(struct work_struct *work);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;

/*
 * This runs in process context
 */
static void bio_dirty_fn(struct work_struct *work)
{
        struct bio *pending;

        /* Detach the whole deferred list while holding the lock */
        spin_lock_irq(&bio_dirty_lock);
        pending = bio_dirty_list;
        bio_dirty_list = NULL;
        spin_unlock_irq(&bio_dirty_lock);

        /* The list is threaded through bi_private */
        while (pending) {
                struct bio *bio = pending;

                /* Fetch the link before the bio is put */
                pending = bio->bi_private;

                bio_release_pages(bio, true);
                bio_put(bio);
        }
}

void bio_check_pages_dirty(struct bio *bio)
{
        struct folio_iter fi;
        unsigned long flags;

        bio_for_each_folio_all(fi, bio) {
                if (!folio_test_dirty(fi.folio))
                        goto defer;
        }

        bio_release_pages(bio, false);
        bio_put(bio);
        return;
defer:
        spin_lock_irqsave(&bio_dirty_lock, flags);
        bio->bi_private = bio_dirty_list;
        bio_dirty_list = bio;
        spin_unlock_irqrestore(&bio_dirty_lock, flags);
        schedule_work(&bio_dirty_work);
}
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);

static inline bool bio_remaining_done(struct bio *bio)
{
        /*
         * Without chaining ->__bi_remaining is always 1, so the very
         * first invocation ends the I/O.
         */
        if (!bio_flagged(bio, BIO_CHAIN))
                return true;

        BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);

        /* Other parts of the chain are still outstanding */
        if (!atomic_dec_and_test(&bio->__bi_remaining))
                return false;

        /* Counter reached zero: the bio is no longer chained */
        bio_clear_flag(bio, BIO_CHAIN);
        return true;
}

/**
 * bio_endio - end I/O on a bio
 * @bio:        bio
 *
 * Description:
 *   bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
 *   way to end I/O on a bio. No one should call bi_end_io() directly on a
 *   bio unless they own it and thus know that it has an end_io function.
 *
 *   bio_endio() can be called several times on a bio that has been chained
 *   using bio_chain().  The ->bi_end_io() function will only be called the
 *   last time.
 **/
void bio_endio(struct bio *bio)
{
        for (;;) {
                if (!bio_remaining_done(bio))
                        return;
                if (!bio_integrity_endio(bio))
                        return;

                blk_zone_bio_endio(bio);

                rq_qos_done_bio(bio);

                if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
                        trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
                                                 bio);
                        bio_clear_flag(bio, BIO_TRACE_COMPLETION);
                }

                /*
                 * Need to have a real endio function for chained bios,
                 * otherwise various corner cases will break (like stacking
                 * block devices that save/restore bi_end_io) - however, we
                 * want to avoid unbounded recursion and blowing the stack.
                 * Tail call optimization would handle this, but compiling
                 * with frame pointers also disables gcc's sibling call
                 * optimization.  So chained bios are walked iteratively.
                 */
                if (bio->bi_end_io != bio_chain_endio)
                        break;

                bio = __bio_chain_endio(bio);
        }

        /* release cgroup info */
        bio_uninit(bio);
        if (bio->bi_end_io)
                bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);

/**
 * bio_split - split a bio
 * @bio:        bio to split
 * @sectors:        number of sectors to split from the front of @bio
 * @gfp:        gfp mask
 * @bs:                bio set to allocate from
 *
 * Allocates and returns a new bio which represents @sectors from the start of
 * @bio, and updates @bio to represent the remaining sectors.
 *
 * Unless this is a discard request the newly allocated bio will point
 * to @bio's bi_io_vec. It is the caller's responsibility to ensure that
 * neither @bio nor @bs are freed before the split bio.
 */
struct bio *bio_split(struct bio *bio, int sectors,
                      gfp_t gfp, struct bio_set *bs)
{
        struct bio *front;

        /* The split point must fall strictly inside the bio */
        BUG_ON(sectors <= 0);
        BUG_ON(sectors >= bio_sectors(bio));

        /* Zone append commands cannot be split */
        if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
                return NULL;

        front = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
        if (front == NULL)
                return NULL;

        /* The clone covers only the leading @sectors (512-byte units) */
        front->bi_iter.bi_size = sectors << 9;

        if (bio_integrity(front))
                bio_integrity_trim(front);

        /* The original bio now begins where the front part ends */
        bio_advance(bio, front->bi_iter.bi_size);

        if (bio_flagged(bio, BIO_TRACE_COMPLETION))
                bio_set_flag(front, BIO_TRACE_COMPLETION);

        return front;
}
EXPORT_SYMBOL(bio_split);

/**
 * bio_trim - trim a bio
 * @bio:        bio to trim
 * @offset:        number of sectors to trim from the front of @bio
 * @size:        size we want to trim @bio to, in sectors
 *
 * This function is typically used for bios that are cloned and submitted
 * to the underlying device in parts.
 */
void bio_trim(struct bio *bio, sector_t offset, sector_t size)
{
        sector_t nbytes;

        /* Reject ranges that are out of bounds for this bio */
        if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
                         offset + size > bio_sectors(bio)))
                return;

        /* Convert the requested size from sectors to bytes */
        nbytes = size << 9;

        /* Nothing to do when the bio already has the requested extent */
        if (offset == 0 && nbytes == bio->bi_iter.bi_size)
                return;

        bio_advance(bio, offset << 9);
        bio->bi_iter.bi_size = nbytes;

        if (bio_integrity(bio))
                bio_integrity_trim(bio);
}
EXPORT_SYMBOL_GPL(bio_trim);

/*
 * create memory pools for biovec's in a bio_set.
 * use the global biovec slabs created for general use.
 */
int biovec_init_pool(mempool_t *pool, int pool_entries)
{
        /* The pool is backed by the last entry of the bvec_slabs table */
        struct biovec_slab *last = &bvec_slabs[ARRAY_SIZE(bvec_slabs) - 1];

        return mempool_init_slab_pool(pool, pool_entries, last->slab);
}

/*
 * bioset_exit - exit a bioset initialized with bioset_init()
 *
 * May be called on a zeroed but uninitialized bioset (i.e. allocated with
 * kzalloc()).
 */
void bioset_exit(struct bio_set *bs)
{
        bio_alloc_cache_destroy(bs);

        /* Tear down the rescuer workqueue, if one was created */
        if (bs->rescue_workqueue) {
                destroy_workqueue(bs->rescue_workqueue);
                bs->rescue_workqueue = NULL;
        }

        mempool_exit(&bs->bio_pool);
        mempool_exit(&bs->bvec_pool);

        bioset_integrity_free(bs);

        /* Drop the slab and clear the pointer so a repeat call skips it */
        if (bs->bio_slab) {
                bio_put_slab(bs);
                bs->bio_slab = NULL;
        }
}
EXPORT_SYMBOL(bioset_exit);

/**
 * bioset_init - Initialize a bio_set
 * @bs:                pool to initialize
 * @pool_size:        Number of bio and bio_vecs to cache in the mempool
 * @front_pad:        Number of bytes to allocate in front of the returned bio
 * @flags:        Flags to modify behavior, currently %BIOSET_NEED_BVECS
 *              and %BIOSET_NEED_RESCUER
 *
 * Description:
 *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
 *    to ask for a number of bytes to be allocated in front of the bio.
 *    Front pad allocation is useful for embedding the bio inside
 *    another structure, to avoid allocating extra data to go with the bio.
 *    Note that the bio must be embedded at the END of that structure always,
 *    or things will break badly.
 *    If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
 *    for allocating iovecs.  This pool is not needed e.g. for bio_init_clone().
 *    If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used
 *    to dispatch queued requests when the mempool runs out of space.
 *
 */
int bioset_init(struct bio_set *bs,
                unsigned int pool_size,
                unsigned int front_pad,
                int flags)
{
        bs->front_pad = front_pad;
        if (flags & BIOSET_NEED_BVECS)
                bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
        else
                bs->back_pad = 0;

        spin_lock_init(&bs->rescue_lock);
        bio_list_init(&bs->rescue_list);
        INIT_WORK(&bs->rescue_work, bio_alloc_rescue);

        bs->bio_slab = bio_find_or_create_slab(bs);
        if (!bs->bio_slab)
                return -ENOMEM;

        if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
                goto bad;

        if ((flags & BIOSET_NEED_BVECS) &&
            biovec_init_pool(&bs->bvec_pool, pool_size))
                goto bad;

        if (flags & BIOSET_NEED_RESCUER) {
                bs->rescue_workqueue = alloc_workqueue("bioset",
                                                        WQ_MEM_RECLAIM, 0);
                if (!bs->rescue_workqueue)
                        goto bad;
        }
        if (flags & BIOSET_PERCPU_CACHE) {
                bs->cache = alloc_percpu(struct bio_alloc_cache);
                if (!bs->cache)
                        goto bad;
                cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
        }

        return 0;
bad:
        bioset_exit(bs);
        return -ENOMEM;
}
EXPORT_SYMBOL(bioset_init);

static int __init init_bio(void)
{
        int idx;

        /* Every bio flag must fit into the bi_flags field */
        BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags));

        bio_integrity_init();

        /* Create one slab cache per entry of the bvec_slabs table */
        for (idx = 0; idx < ARRAY_SIZE(bvec_slabs); idx++) {
                struct biovec_slab *entry = &bvec_slabs[idx];

                entry->slab = kmem_cache_create(entry->name,
                                entry->nr_vecs * sizeof(struct bio_vec), 0,
                                SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
        }

        cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
                                        bio_cpu_dead);

        /* Failure to set up fs_bio_set is fatal */
        if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0,
                        BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE))
                panic("bio: can't allocate bios\n");

        if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
                panic("bio: can't create integrity pool\n");

        return 0;
}
subsys_initcall(init_bio);











































































    2 




















































































































































































































































































































































































































































































































































































   10 




   10 



















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011 IBM Corporation
 *
 * Author:
 * Mimi Zohar <zohar@us.ibm.com>
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/xattr.h>
#include <linux/magic.h>
#include <linux/ima.h>
#include <linux/evm.h>
#include <linux/fsverity.h>
#include <keys/system_keyring.h>
#include <uapi/linux/fsverity.h>

#include "ima.h"

#ifdef CONFIG_IMA_APPRAISE_BOOTPARAM
static char *ima_appraise_cmdline_default __initdata;
core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0);

/*
 * ima_appraise_parse_cmdline - apply the "ima_appraise=" boot parameter
 *
 * Maps the option string to an appraisal state.  Matching is by prefix
 * ("off", "log", "fix", "enforce"); anything else is reported as invalid
 * and leaves the state as it was.  When secure boot is enabled the
 * current ima_appraise value is never changed; a notice is printed if the
 * requested state would not have been enforcing.
 */
void __init ima_appraise_parse_cmdline(void)
{
        const char *str = ima_appraise_cmdline_default;
        bool sb_state = arch_ima_get_secureboot();
        int appraisal_state = ima_appraise;

        /* No option given on the command line */
        if (!str)
                return;

        if (strncmp(str, "off", 3) == 0)
                appraisal_state = 0;
        else if (strncmp(str, "log", 3) == 0)
                appraisal_state = IMA_APPRAISE_LOG;
        else if (strncmp(str, "fix", 3) == 0)
                appraisal_state = IMA_APPRAISE_FIX;
        else if (strncmp(str, "enforce", 7) == 0)
                appraisal_state = IMA_APPRAISE_ENFORCE;
        else
                /* Terminate the message so it is emitted as its own line */
                pr_err("invalid \"%s\" appraise option\n", str);

        /* If appraisal state was changed, but secure boot is enabled,
         * keep its default */
        if (sb_state) {
                if (!(appraisal_state & IMA_APPRAISE_ENFORCE))
                        pr_info("Secure boot enabled: ignoring ima_appraise=%s option\n",
                                str);
        } else {
                ima_appraise = appraisal_state;
        }
}
#endif

/*
 * is_ima_appraise_enabled - return appraise status
 *
 * Only return enabled, if not in ima_appraise="fix" or "log" modes.
 */
bool is_ima_appraise_enabled(void)
{
        /* Only the enforce bit counts as "enabled" */
        return (ima_appraise & IMA_APPRAISE_ENFORCE) != 0;
}

/*
 * ima_must_appraise - set appraise flag
 *
 * Return 1 to appraise or hash
 */
int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode,
                      int mask, enum ima_hooks func)
{
        u32 subj_secid;

        /* Appraisal is switched off entirely */
        if (ima_appraise == 0)
                return 0;

        /* Ask the policy whether this access needs appraisal or hashing */
        security_current_getsecid_subj(&subj_secid);

        return ima_match_policy(idmap, inode, current_cred(), subj_secid,
                                func, mask, IMA_APPRAISE | IMA_HASH, NULL,
                                NULL, NULL, NULL);
}

/*
 * Write the collected hash in iint->ima_hash to the file's security.ima
 * xattr.  Returns the result of __vfs_setxattr_noperm().
 */
static int ima_fix_xattr(struct dentry *dentry, struct ima_iint_cache *iint)
{
        u8 algo = iint->ima_hash->algo;
        int skip;

        if (algo > HASH_ALGO_SHA1) {
                /* "ng" layout: type byte followed by the algorithm id */
                skip = 0;
                iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG;
                iint->ima_hash->xattr.ng.algo = algo;
        } else {
                /* Legacy layout: the value starts one byte into the data */
                skip = 1;
                iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST;
        }

        return __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA,
                                     &iint->ima_hash->xattr.data[skip],
                                     (sizeof(iint->ima_hash->xattr) - skip) +
                                     iint->ima_hash->length, 0);
}

/* Return specific func appraised cached result */
enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint,
                                           enum ima_hooks func)
{
        switch (func) {
        case MMAP_CHECK:
        case MMAP_CHECK_REQPROT:
                return iint->ima_mmap_status;
        case BPRM_CHECK:
                return iint->ima_bprm_status;
        case CREDS_CHECK:
                return iint->ima_creds_status;
        case FILE_CHECK:
        case POST_SETATTR:
                return iint->ima_file_status;
        case MODULE_CHECK ... MAX_CHECK - 1:
        default:
                return iint->ima_read_status;
        }
}

/* Store the appraisal result in the status field that belongs to @func */
static void ima_set_cache_status(struct ima_iint_cache *iint,
                                 enum ima_hooks func,
                                 enum integrity_status status)
{
        if (func == MMAP_CHECK || func == MMAP_CHECK_REQPROT) {
                iint->ima_mmap_status = status;
        } else if (func == BPRM_CHECK) {
                iint->ima_bprm_status = status;
        } else if (func == CREDS_CHECK) {
                iint->ima_creds_status = status;
        } else if (func == FILE_CHECK || func == POST_SETATTR) {
                iint->ima_file_status = status;
        } else {
                /* MODULE_CHECK .. MAX_CHECK - 1 and any other value */
                iint->ima_read_status = status;
        }
}

/* Set IMA_APPRAISED plus the hook-specific *_APPRAISED flag for @func */
static void ima_cache_flags(struct ima_iint_cache *iint, enum ima_hooks func)
{
        if (func == MMAP_CHECK || func == MMAP_CHECK_REQPROT) {
                iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED);
        } else if (func == BPRM_CHECK) {
                iint->flags |= (IMA_BPRM_APPRAISED | IMA_APPRAISED);
        } else if (func == CREDS_CHECK) {
                iint->flags |= (IMA_CREDS_APPRAISED | IMA_APPRAISED);
        } else if (func == FILE_CHECK || func == POST_SETATTR) {
                iint->flags |= (IMA_FILE_APPRAISED | IMA_APPRAISED);
        } else {
                /* MODULE_CHECK .. MAX_CHECK - 1 and any other value */
                iint->flags |= (IMA_READ_APPRAISED | IMA_APPRAISED);
        }
}

enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value,
                                 int xattr_len)
{
        struct signature_v2_hdr *sig;
        enum hash_algo ret;

        if (!xattr_value || xattr_len < 2)
                /* return default hash algo */
                return ima_hash_algo;

        switch (xattr_value->type) {
        case IMA_VERITY_DIGSIG:
                sig = (typeof(sig))xattr_value;
                if (sig->version != 3 || xattr_len <= sizeof(*sig) ||
                    sig->hash_algo >= HASH_ALGO__LAST)
                        return ima_hash_algo;
                return sig->hash_algo;
        case EVM_IMA_XATTR_DIGSIG:
                sig = (typeof(sig))xattr_value;
                if (sig->version != 2 || xattr_len <= sizeof(*sig)
                    || sig->hash_algo >= HASH_ALGO__LAST)
                        return ima_hash_algo;
                return sig->hash_algo;
        case IMA_XATTR_DIGEST_NG:
                /* first byte contains algorithm id */
                ret = xattr_value->data[0];
                if (ret < HASH_ALGO__LAST)
                        return ret;
                break;
        case IMA_XATTR_DIGEST:
                /* this is for backward compatibility */
                if (xattr_len == 21) {
                        unsigned int zero = 0;
                        if (!memcmp(&xattr_value->data[16], &zero, 4))
                                return HASH_ALGO_MD5;
                        else
                                return HASH_ALGO_SHA1;
                } else if (xattr_len == 17)
                        return HASH_ALGO_MD5;
                break;
        }

        /* return default hash algo */
        return ima_hash_algo;
}

/*
 * Read the security.ima xattr of @dentry into *@xattr_value.  Returns what
 * vfs_getxattr_alloc() returned, except that -EOPNOTSUPP is reported as 0.
 */
int ima_read_xattr(struct dentry *dentry,
                   struct evm_ima_xattr_data **xattr_value, int xattr_len)
{
        int len = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA,
                                     (char **)xattr_value, xattr_len,
                                     GFP_NOFS);

        return len == -EOPNOTSUPP ? 0 : len;
}

/*
 * calc_file_id_hash - calculate the hash of the ima_file_id struct data
 * @type: xattr type [enum evm_ima_xattr_type]
 * @algo: hash algorithm [enum hash_algo]
 * @digest: pointer to the digest to be hashed
 * @hash: (out) pointer to the hash
 *
 * IMA signature version 3 disambiguates the data that is signed by
 * indirectly signing the hash of the ima_file_id structure data.
 *
 * Signing the ima_file_id struct is currently only supported for
 * IMA_VERITY_DIGSIG type xattrs.
 *
 * Return 0 on success, error code otherwise.
 */
static int calc_file_id_hash(enum evm_ima_xattr_type type,
                             enum hash_algo algo, const u8 *digest,
                             struct ima_digest_data *hash)
{
        const int digest_len = hash_digest_size[algo];
        /* Bytes of file_id.hash left unused by this algorithm's digest */
        const unsigned int tail_pad = HASH_MAX_DIGESTSIZE - digest_len;
        struct ima_file_id file_id = {
                .hash_type = IMA_VERITY_DIGSIG,
                .hash_algorithm = algo,
        };

        /* Only verity signatures sign the ima_file_id structure */
        if (type != IMA_VERITY_DIGSIG)
                return -EINVAL;

        memcpy(file_id.hash, digest, digest_len);

        hash->algo = algo;
        hash->length = digest_len;

        /* Hash the structure without the unused tail of its digest field */
        return ima_calc_buffer_hash(&file_id, sizeof(file_id) - tail_pad,
                                    hash);
}

/*
 * xattr_verify - verify xattr digest or signature
 *
 * Verify whether the hash or signature matches the file contents.
 *
 * Return 0 on success, error code otherwise.
 */
static int xattr_verify(enum ima_hooks func, struct ima_iint_cache *iint,
                        struct evm_ima_xattr_data *xattr_value, int xattr_len,
                        enum integrity_status *status, const char **cause)
{
        struct ima_max_digest_data hash;
        struct signature_v2_hdr *sig;
        int rc = -EINVAL, hash_start = 0;
        int digest_len;
        int mask;

        switch (xattr_value->type) {
        case IMA_XATTR_DIGEST_NG:
                /* first byte contains algorithm id */
                hash_start = 1;
                fallthrough;
        case IMA_XATTR_DIGEST:
                if (*status != INTEGRITY_PASS_IMMUTABLE) {
                        /* A plain hash cannot satisfy a signature rule */
                        if (iint->flags & IMA_DIGSIG_REQUIRED) {
                                if (iint->flags & IMA_VERITY_REQUIRED)
                                        *cause = "verity-signature-required";
                                else
                                        *cause = "IMA-signature-required";
                                *status = INTEGRITY_FAIL;
                                break;
                        }
                        clear_bit(IMA_DIGSIG, &iint->atomic_flags);
                } else {
                        set_bit(IMA_DIGSIG, &iint->atomic_flags);
                }
                /*
                 * Number of digest bytes actually present in the xattr.
                 * Computed in signed arithmetic: mixing in the size_t
                 * from sizeof() would turn a too-short xattr (e.g. a
                 * 1-byte IMA_XATTR_DIGEST_NG value) into a huge unsigned
                 * length that passes the check and lets memcmp() read
                 * beyond the xattr buffer.
                 */
                digest_len = xattr_len - (int)sizeof(xattr_value->type) -
                             hash_start;
                if (digest_len >= (int)iint->ima_hash->length)
                        /*
                         * xattr length may be longer. md5 hash in previous
                         * version occupied 20 bytes in xattr, instead of 16
                         */
                        rc = memcmp(&xattr_value->data[hash_start],
                                    iint->ima_hash->digest,
                                    iint->ima_hash->length);
                else
                        rc = -EINVAL;
                if (rc) {
                        *cause = "invalid-hash";
                        *status = INTEGRITY_FAIL;
                        break;
                }
                *status = INTEGRITY_PASS;
                break;
        case EVM_IMA_XATTR_DIGSIG:
                set_bit(IMA_DIGSIG, &iint->atomic_flags);

                /* A verity signature was required, this is a plain one */
                mask = IMA_DIGSIG_REQUIRED | IMA_VERITY_REQUIRED;
                if ((iint->flags & mask) == mask) {
                        *cause = "verity-signature-required";
                        *status = INTEGRITY_FAIL;
                        break;
                }

                sig = (typeof(sig))xattr_value;
                if (sig->version >= 3) {
                        *cause = "invalid-signature-version";
                        *status = INTEGRITY_FAIL;
                        break;
                }
                rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA,
                                             (const char *)xattr_value,
                                             xattr_len,
                                             iint->ima_hash->digest,
                                             iint->ima_hash->length);
                if (rc == -EOPNOTSUPP) {
                        *status = INTEGRITY_UNKNOWN;
                        break;
                }
                /* For kexec kernel images, retry with the platform keyring */
                if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc &&
                    func == KEXEC_KERNEL_CHECK)
                        rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM,
                                                     (const char *)xattr_value,
                                                     xattr_len,
                                                     iint->ima_hash->digest,
                                                     iint->ima_hash->length);
                if (rc) {
                        *cause = "invalid-signature";
                        *status = INTEGRITY_FAIL;
                } else {
                        *status = INTEGRITY_PASS;
                }
                break;
        case IMA_VERITY_DIGSIG:
                set_bit(IMA_DIGSIG, &iint->atomic_flags);

                /* A plain IMA signature was required, this is a verity one */
                if (iint->flags & IMA_DIGSIG_REQUIRED) {
                        if (!(iint->flags & IMA_VERITY_REQUIRED)) {
                                *cause = "IMA-signature-required";
                                *status = INTEGRITY_FAIL;
                                break;
                        }
                }

                sig = (typeof(sig))xattr_value;
                if (sig->version != 3) {
                        *cause = "invalid-signature-version";
                        *status = INTEGRITY_FAIL;
                        break;
                }

                /* v3 signatures cover the hash of the ima_file_id struct */
                rc = calc_file_id_hash(IMA_VERITY_DIGSIG, iint->ima_hash->algo,
                                       iint->ima_hash->digest,
                                       container_of(&hash.hdr,
                                               struct ima_digest_data, hdr));
                if (rc) {
                        *cause = "sigv3-hashing-error";
                        *status = INTEGRITY_FAIL;
                        break;
                }

                rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA,
                                             (const char *)xattr_value,
                                             xattr_len, hash.digest,
                                             hash.hdr.length);
                if (rc) {
                        *cause = "invalid-verity-signature";
                        *status = INTEGRITY_FAIL;
                } else {
                        *status = INTEGRITY_PASS;
                }

                break;
        default:
                *status = INTEGRITY_UNKNOWN;
                *cause = "unknown-ima-data";
                break;
        }

        return rc;
}

/*
 * modsig_verify - verify modsig signature
 *
 * Check the modsig against the IMA keyring first.  If that fails for the
 * KEXEC_KERNEL_CHECK hook and CONFIG_INTEGRITY_PLATFORM_KEYRING is enabled,
 * retry against the platform keyring.
 *
 * *status is set to INTEGRITY_PASS or INTEGRITY_FAIL; *cause is only written
 * on failure.
 *
 * Return 0 on success, error code otherwise.
 */
static int modsig_verify(enum ima_hooks func, const struct modsig *modsig,
                         enum integrity_status *status, const char **cause)
{
        int ret = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig);

        /* Second chance for kexec kernel images: the platform keyring. */
        if (ret && func == KEXEC_KERNEL_CHECK &&
            IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING))
                ret = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM,
                                              modsig);

        if (!ret) {
                *status = INTEGRITY_PASS;
                return ret;
        }

        *cause = "invalid-signature";
        *status = INTEGRITY_FAIL;
        return ret;
}

/*
 * ima_check_blacklist - determine if the binary is blacklisted.
 *
 * Nothing is checked unless IMA_CHECK_BLACKLIST is set in iint->flags.
 * The digest that gets checked is either the modsig digest (when a modsig is
 * present and allowed) or, when a signature is required, the collected file
 * hash in iint->ima_hash.
 *
 * Add the hash of the blacklisted binary to the measurement list, based
 * on policy (IMA_MEASURE set in iint->flags).
 *
 * Returns -EPERM if the hash is blacklisted.
 */
int ima_check_blacklist(struct ima_iint_cache *iint,
                        const struct modsig *modsig, int pcr)
{
        enum hash_algo hash_algo;
        const u8 *digest = NULL;
        u32 digestsize = 0;
        int rc = 0;

        if (!(iint->flags & IMA_CHECK_BLACKLIST))
                return 0;

        if (iint->flags & IMA_MODSIG_ALLOWED && modsig) {
                ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize);

                rc = is_binary_blacklisted(digest, digestsize);
        } else if (iint->flags & IMA_DIGSIG_REQUIRED && iint->ima_hash) {
                /*
                 * Remember which digest was checked so that the measurement
                 * below records the blacklisted hash rather than an empty
                 * buffer.
                 */
                digest = iint->ima_hash->digest;
                digestsize = iint->ima_hash->length;

                rc = is_binary_blacklisted(digest, digestsize);
        }

        if ((rc == -EPERM) && (iint->flags & IMA_MEASURE))
                process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize,
                                           "blacklisted-hash", NONE,
                                           pcr, NULL, false, NULL, 0);

        return rc;
}

/*
 * ima_appraise_measurement - appraise file measurement
 *
 * Call evm_verifyxattr() to verify the integrity of 'security.ima'.
 * Assuming success, compare the xattr hash with the collected measurement
 * (xattr_verify()), falling back to the modsig when one is usable.
 *
 * @xattr_len doubles as the result of reading the xattr: a value <= 0 means
 * no usable xattr was read (0 / -ENODATA: none present, other negative
 * values: read error).
 *
 * Except for the early exit taken when the inode has no xattr support and
 * no modsig can be tried, the final status is recorded with
 * ima_set_cache_status() before returning.
 *
 * Returns the resulting integrity status (INTEGRITY_PASS on success), not an
 * errno; the errno-style value kept in rc is only reported through the audit
 * message.
 */
int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint,
                             struct file *file, const unsigned char *filename,
                             struct evm_ima_xattr_data *xattr_value,
                             int xattr_len, const struct modsig *modsig)
{
        static const char op[] = "appraise_data";
        const char *cause = "unknown";
        struct dentry *dentry = file_dentry(file);
        struct inode *inode = d_backing_inode(dentry);
        enum integrity_status status = INTEGRITY_UNKNOWN;
        int rc = xattr_len;
        bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig;

        /* If not appraising a modsig, we need an xattr. */
        if (!(inode->i_opflags & IOP_XATTR) && !try_modsig)
                return INTEGRITY_UNKNOWN;        /* nothing cached or audited */

        /* If reading the xattr failed and there's no modsig, error out. */
        if (rc <= 0 && !try_modsig) {
                /* Genuine read error: leave status at INTEGRITY_UNKNOWN. */
                if (rc && rc != -ENODATA)
                        goto out;

                if (iint->flags & IMA_DIGSIG_REQUIRED) {
                        if (iint->flags & IMA_VERITY_REQUIRED)
                                cause = "verity-signature-required";
                        else
                                cause = "IMA-signature-required";
                } else {
                        cause = "missing-hash";
                }

                status = INTEGRITY_NOLABEL;
                if (file->f_mode & FMODE_CREATED)
                        iint->flags |= IMA_NEW_FILE;
                /*
                 * A new file without a label passes, unless a signature is
                 * required and the file already has data.
                 */
                if ((iint->flags & IMA_NEW_FILE) &&
                    (!(iint->flags & IMA_DIGSIG_REQUIRED) ||
                     (inode->i_size == 0)))
                        status = INTEGRITY_PASS;
                goto out;
        }

        /* A failed xattr read (rc < 0) is passed to EVM as a zero length. */
        status = evm_verifyxattr(dentry, XATTR_NAME_IMA, xattr_value,
                                 rc < 0 ? 0 : rc);
        switch (status) {
        case INTEGRITY_PASS:
        case INTEGRITY_PASS_IMMUTABLE:
        case INTEGRITY_UNKNOWN:
                break;
        case INTEGRITY_NOXATTRS:        /* No EVM protected xattrs. */
                /* It's fine not to have xattrs when using a modsig. */
                if (try_modsig)
                        break;
                fallthrough;
        case INTEGRITY_NOLABEL:                /* No security.evm xattr. */
                cause = "missing-HMAC";
                goto out;
        case INTEGRITY_FAIL_IMMUTABLE:
                set_bit(IMA_DIGSIG, &iint->atomic_flags);
                cause = "invalid-fail-immutable";
                goto out;
        case INTEGRITY_FAIL:                /* Invalid HMAC/signature. */
                cause = "invalid-HMAC";
                goto out;
        default:
                /* Warn once, then carry on with the xattr/modsig checks. */
                WARN_ONCE(true, "Unexpected integrity status %d\n", status);
        }

        if (xattr_value)
                rc = xattr_verify(func, iint, xattr_value, xattr_len, &status,
                                  &cause);

        /*
         * If we have a modsig and either no imasig or the imasig's key isn't
         * known, then try verifying the modsig.
         */
        if (try_modsig &&
            (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG ||
             rc == -ENOKEY))
                rc = modsig_verify(func, modsig, &status, &cause);

out:
        /*
         * File signatures on some filesystems can not be properly verified.
         * When such filesystems are mounted by an untrusted mounter or on a
         * system not willing to accept such a risk, fail the file signature
         * verification.
         */
        if ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) &&
            ((inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) ||
             (iint->flags & IMA_FAIL_UNVERIFIABLE_SIGS))) {
                status = INTEGRITY_FAIL;
                cause = "unverifiable-signature";
                integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, filename,
                                    op, cause, rc, 0);
        } else if (status != INTEGRITY_PASS) {
                /* Fix mode, but don't replace file signatures. */
                if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig &&
                    (!xattr_value ||
                     xattr_value->type != EVM_IMA_XATTR_DIGSIG)) {
                        if (!ima_fix_xattr(dentry, iint))
                                status = INTEGRITY_PASS;
                }

                /*
                 * Permit new files with file/EVM portable signatures, but
                 * without data.
                 */
                if (inode->i_size == 0 && iint->flags & IMA_NEW_FILE &&
                    test_bit(IMA_DIGSIG, &iint->atomic_flags)) {
                        status = INTEGRITY_PASS;
                }

                /* Audited even when one of the two overrides above passed. */
                integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, filename,
                                    op, cause, rc, 0);
        } else {
                ima_cache_flags(iint, func);
        }

        ima_set_cache_status(iint, func, status);
        return status;
}

/*
 * ima_update_xattr - update 'security.ima' hash value
 *
 * Re-collect the file measurement and write it back with ima_fix_xattr()
 * while holding the inode lock.  Nothing is done for files flagged
 * IMA_DIGSIG, nor when the last appraisal status is not INTEGRITY_PASS and
 * IMA_HASH is not set.
 */
void ima_update_xattr(struct ima_iint_cache *iint, struct file *file)
{
        struct dentry *target = file_dentry(file);
        struct inode *inode;

        /* do not collect and update hash for digital signatures */
        if (test_bit(IMA_DIGSIG, &iint->atomic_flags))
                return;

        if (iint->ima_file_status != INTEGRITY_PASS &&
            !(iint->flags & IMA_HASH))
                return;

        /* A failed collection leaves the existing xattr untouched. */
        if (ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo,
                                    NULL) < 0)
                return;

        inode = file_inode(file);
        inode_lock(inode);
        ima_fix_xattr(target, iint);
        inode_unlock(inode);
}

/**
 * ima_inode_post_setattr - reflect file metadata changes
 * @idmap:  idmap of the mount the inode was found from
 * @dentry: pointer to the affected dentry
 * @ia_valid: for the UID and GID status
 *
 * Changes to a dentry's metadata might result in needing to appraise.
 * Only regular files on xattr-capable inodes are considered, and only while
 * an appraise policy is active.  If the inode has an integrity cache entry,
 * it is flagged IMA_CHANGE_ATTR; IMA_UPDATE_XATTR is cleared when the policy
 * no longer asks for appraisal.
 *
 * This function is called from notify_change(), which expects the caller
 * to lock the inode's i_mutex.
 */
static void ima_inode_post_setattr(struct mnt_idmap *idmap,
                                   struct dentry *dentry, int ia_valid)
{
        struct inode *inode = d_backing_inode(dentry);
        struct ima_iint_cache *iint;
        int must_appraise;

        if (!(ima_policy_flag & IMA_APPRAISE))
                return;
        if (!S_ISREG(inode->i_mode))
                return;
        if (!(inode->i_opflags & IOP_XATTR))
                return;

        must_appraise = ima_must_appraise(idmap, inode, MAY_ACCESS,
                                          POST_SETATTR);
        iint = ima_iint_find(inode);
        if (!iint)
                return;

        set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags);
        if (!must_appraise)
                clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);
}

/*
 * ima_protect_xattr - protect 'security.ima'
 *
 * Ensure that not just anyone can modify or remove 'security.ima'.
 *
 * Returns 0 when @xattr_name is not 'security.ima', 1 when it is and the
 * caller has CAP_SYS_ADMIN, and -EPERM when it is and the caller does not.
 */
static int ima_protect_xattr(struct dentry *dentry, const char *xattr_name,
                             const void *xattr_value, size_t xattr_value_len)
{
        /* Any other xattr is none of our business. */
        if (strcmp(xattr_name, XATTR_NAME_IMA) != 0)
                return 0;

        return capable(CAP_SYS_ADMIN) ? 1 : -EPERM;
}

/*
 * ima_reset_appraise_flags - mark a cached inode as having changed xattrs
 *
 * For a regular file with an integrity cache entry (and an active appraise
 * policy), clear the measured PCR mask, set IMA_CHANGE_XATTR and make the
 * IMA_DIGSIG bit mirror @digsig.
 */
static void ima_reset_appraise_flags(struct inode *inode, int digsig)
{
        struct ima_iint_cache *iint;

        if (!(ima_policy_flag & IMA_APPRAISE))
                return;
        if (!S_ISREG(inode->i_mode))
                return;

        iint = ima_iint_find(inode);
        if (!iint)
                return;

        iint->measured_pcrs = 0;
        set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags);

        if (!digsig)
                clear_bit(IMA_DIGSIG, &iint->atomic_flags);
        else
                set_bit(IMA_DIGSIG, &iint->atomic_flags);
}

/**
 * validate_hash_algo() - Block setxattr with unsupported hash algorithms
 * @dentry: object of the setxattr()
 * @xattr_value: userland supplied xattr value
 * @xattr_value_len: length of xattr_value
 *
 * The xattr value is mapped to its hash algorithm.  When the policy provides
 * a set of allowed algorithms, the algorithm must be in that set; otherwise
 * it must be the default IMA algorithm or one built in the kernel.
 *
 * Emit an audit message when the algorithm is invalid.
 *
 * Return: 0 on success, -EACCES otherwise (including when the buffer for the
 * audit path cannot be allocated).
 */
static int validate_hash_algo(struct dentry *dentry,
                              const struct evm_ima_xattr_data *xattr_value,
                              size_t xattr_value_len)
{
        char *name = NULL, *buf = NULL;
        enum hash_algo algo;
        const char *reason;
        unsigned int policy_mask;

        algo = ima_get_hash_algo(xattr_value, xattr_value_len);
        policy_mask = atomic_read(&ima_setxattr_allowed_hash_algorithms);

        if (!policy_mask) {
                /* No policy restriction: the default algorithm is fine ... */
                if (likely(algo == ima_hash_algo))
                        return 0;

                /* ... and so is any algorithm built in the kernel. */
                if (crypto_has_alg(hash_algo_name[algo], 0, 0))
                        return 0;

                reason = "unavailable-hash-algorithm";
        } else {
                /* success if the algorithm is allowed in the ima policy */
                if (policy_mask & (1U << algo))
                        return 0;

                /*
                 * Denial by a policy rule gets its own audit message, to
                 * tell it apart from an algorithm missing in the kernel.
                 */
                reason = "denied-hash-algorithm";
        }

        buf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!buf)
                return -EACCES;

        name = dentry_path(dentry, buf, PATH_MAX);
        integrity_audit_msg(AUDIT_INTEGRITY_DATA, d_inode(dentry), name,
                            "set_data", reason, -EACCES, 0);
        kfree(buf);

        return -EACCES;
}

/*
 * ima_inode_setxattr - inode_setxattr hook
 *
 * For 'security.ima' written by a CAP_SYS_ADMIN caller: reject empty values
 * and unknown xattr types (-EINVAL), validate the hash algorithm, then reset
 * the cached appraisal flags and return 0.
 *
 * For any other outcome of ima_protect_xattr() (0 or -EPERM) that value is
 * returned; the cached flags are reset only if evm_revalidate_status() says
 * the xattr matters.  A non-empty 'security.evm' value of type
 * EVM_XATTR_PORTABLE_DIGSIG counts as a digital signature for that reset.
 */
static int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
                              const char *xattr_name, const void *xattr_value,
                              size_t xattr_value_len, int flags)
{
        const struct evm_ima_xattr_data *xvalue = xattr_value;
        int digsig = 0;
        int result;
        int err;

        result = ima_protect_xattr(dentry, xattr_name, xattr_value,
                                   xattr_value_len);
        if (result == 1) {
                if (xattr_value_len == 0)
                        return -EINVAL;
                if (xvalue->type >= IMA_XATTR_LAST)
                        return -EINVAL;

                err = validate_hash_algo(dentry, xvalue, xattr_value_len);
                if (err)
                        return err;

                digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG);
                ima_reset_appraise_flags(d_backing_inode(dentry), digsig);
                return 0;
        }

        if (strcmp(xattr_name, XATTR_NAME_EVM) == 0 && xattr_value_len > 0)
                digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG);

        if (evm_revalidate_status(xattr_name))
                ima_reset_appraise_flags(d_backing_inode(dentry), digsig);

        return result;
}

/*
 * ima_inode_set_acl - inode_set_acl hook
 *
 * Reset the cached appraisal flags (as a non-signature change) when
 * evm_revalidate_status() reports that @acl_name matters.  Always returns 0.
 */
static int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                             const char *acl_name, struct posix_acl *kacl)
{
        if (!evm_revalidate_status(acl_name))
                return 0;

        ima_reset_appraise_flags(d_backing_inode(dentry), 0);
        return 0;
}

/*
 * ima_inode_removexattr - inode_removexattr hook
 *
 * Removing 'security.ima' with CAP_SYS_ADMIN resets the cached appraisal
 * flags and returns 0.  Otherwise the ima_protect_xattr() result (0 or
 * -EPERM) is returned, and the flags are reset only if
 * evm_revalidate_status() reports that the xattr matters.
 */
static int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry,
                                 const char *xattr_name)
{
        int rc = ima_protect_xattr(dentry, xattr_name, NULL, 0);

        if (rc == 1) {
                ima_reset_appraise_flags(d_backing_inode(dentry), 0);
                return 0;
        }

        if (evm_revalidate_status(xattr_name))
                ima_reset_appraise_flags(d_backing_inode(dentry), 0);

        return rc;
}

/*
 * ima_inode_remove_acl - inode_remove_acl hook
 *
 * Removing an ACL is handled exactly like setting one, with a NULL ACL.
 */
static int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                                const char *acl_name)
{
        return ima_inode_set_acl(idmap, dentry, acl_name, NULL);
}

/*
 * LSM hooks implemented by IMA appraisal: they track attribute, xattr and
 * ACL changes on inodes.  Registered by init_ima_appraise_lsm().
 */
static struct security_hook_list ima_appraise_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(inode_post_setattr, ima_inode_post_setattr),
        LSM_HOOK_INIT(inode_setxattr, ima_inode_setxattr),
        LSM_HOOK_INIT(inode_set_acl, ima_inode_set_acl),
        LSM_HOOK_INIT(inode_removexattr, ima_inode_removexattr),
        LSM_HOOK_INIT(inode_remove_acl, ima_inode_remove_acl),
};

/*
 * init_ima_appraise_lsm - register the IMA appraisal hooks
 * @lsmid: LSM identity the hooks are registered under
 */
void __init init_ima_appraise_lsm(const struct lsm_id *lsmid)
{
        security_add_hooks(ima_appraise_hooks, ARRAY_SIZE(ima_appraise_hooks),
                           lsmid);
}

































































































































































































































































   12 

   11 



   11 














   10 





   11 






































    2 
    2 










   12 
   12 








    1 




































   11 


















































































    1 














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BLK_CGROUP_PRIVATE_H
#define _BLK_CGROUP_PRIVATE_H
/*
 * block cgroup private header
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                       Nauman Rafique <nauman@google.com>
 */

#include <linux/blk-cgroup.h>
#include <linux/cgroup.h>
#include <linux/kthread.h>
#include <linux/blk-mq.h>
#include <linux/llist.h>
#include "blk.h"

/* Forward declarations; both structures are defined further down. */
struct blkcg_gq;
struct blkg_policy_data;


/*
 * percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter.
 * The batch is half of INT_MAX.
 */
#define BLKG_STAT_CPU_BATCH        (INT_MAX / 2)

#ifdef CONFIG_BLK_CGROUP

/* I/O categories that blkg statistics are kept for. */
enum blkg_iostat_type {
        BLKG_IOSTAT_READ,
        BLKG_IOSTAT_WRITE,
        BLKG_IOSTAT_DISCARD,

        /* number of categories; sizes the arrays in struct blkg_iostat */
        BLKG_IOSTAT_NR,
};

/* Byte and I/O counters, one slot per enum blkg_iostat_type value. */
struct blkg_iostat {
        u64                                bytes[BLKG_IOSTAT_NR];
        u64                                ios[BLKG_IOSTAT_NR];
};

/*
 * A set of I/O statistics together with the bookkeeping needed to update it.
 * Used both per-cpu (blkcg_gq::iostat_cpu) and per-blkg (blkcg_gq::iostat).
 */
struct blkg_iostat_set {
        struct u64_stats_sync                sync;
        struct blkcg_gq                       *blkg;
        struct llist_node                lnode;
        int                                lqueued;        /* queued in llist */
        /* NOTE(review): presumably current totals vs. the values seen at the
         * last flush; confirm against the flush code. */
        struct blkg_iostat                cur;
        struct blkg_iostat                last;
};

/* association between a blk cgroup and a request queue */
struct blkcg_gq {
        /* Pointer to the associated request_queue */
        struct request_queue                *q;
        struct list_head                q_node;
        struct hlist_node                blkcg_node;
        struct blkcg                        *blkcg;

        /* all non-root blkcg_gq's are guaranteed to have access to parent */
        struct blkcg_gq                        *parent;

        /* reference count */
        struct percpu_ref                refcnt;

        /* is this blkg online? protected by both blkcg and q locks */
        bool                                online;

        /* per-cpu statistics and the blkg-wide set */
        struct blkg_iostat_set __percpu        *iostat_cpu;
        struct blkg_iostat_set                iostat;

        /* per-policy data, indexed by policy id (see blkg_to_pd()) */
        struct blkg_policy_data                *pd[BLKCG_MAX_POLS];
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        spinlock_t                        async_bio_lock;
        struct bio_list                        async_bios;
#endif
        /* the two work items share storage */
        union {
                struct work_struct        async_bio_work;
                struct work_struct        free_work;
        };

        /*
         * Delay state.  use_delay is a use count managed by
         * blkcg_[un]use_delay(), or -1 while a delay set with
         * blkcg_set_delay() is in effect.
         */
        atomic_t                        use_delay;
        atomic64_t                        delay_nsec;
        atomic64_t                        delay_start;
        u64                                last_delay;
        int                                last_use;

        struct rcu_head                        rcu_head;
};

/* Per-cgroup state of the block controller; embeds the css. */
struct blkcg {
        struct cgroup_subsys_state        css;
        spinlock_t                        lock;
        refcount_t                        online_pin;

        /* blkgs of this blkcg: looked up by queue id, with a last-hit hint */
        struct radix_tree_root                blkg_tree;
        struct blkcg_gq        __rcu                *blkg_hint;
        struct hlist_head                blkg_list;

        /* per-policy data, indexed by policy id (see blkcg_to_cpd()) */
        struct blkcg_policy_data        *cpd[BLKCG_MAX_POLS];

        struct list_head                all_blkcgs_node;

        /*
         * List of updated percpu blkg_iostat_set's since the last flush.
         */
        struct llist_head __percpu        *lhead;

#ifdef CONFIG_BLK_CGROUP_FC_APPID
        char                            fc_app_id[FC_APPID_LEN];
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head                cgwb_list;
#endif
};

/* Map a css to the blkcg that embeds it; a NULL @css yields NULL. */
static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct blkcg, css);
}

/*
 * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
 * request_queue (q).  This is used by blkcg policies which need to track
 * information per blkcg - q pair.
 *
 * There can be multiple active blkcg policies and each blkg:policy pair is
 * represented by a blkg_policy_data which is allocated and freed by each
 * policy's pd_alloc/free_fn() methods.  A policy can allocate private data
 * area by allocating larger data structure which embeds blkg_policy_data
 * at the beginning.
 */
struct blkg_policy_data {
        /* the blkg and policy id this per-policy data belongs to */
        struct blkcg_gq                        *blkg;
        int                                plid;
        /* NOTE(review): presumably tracks pd_online_fn/pd_offline_fn state;
         * confirm against the policy activation code. */
        bool                                online;
};

/*
 * Policies that need to keep per-blkcg data which is independent from any
 * request_queue associated to it should implement cpd_alloc/free_fn()
 * methods.  A policy can allocate private data area by allocating larger
 * data structure which embeds blkcg_policy_data at the beginning.
 * cpd_init() is invoked to let each policy handle per-blkcg data.
 *
 * NOTE(review): struct blkcg_policy in this header only has cpd_alloc_fn and
 * cpd_free_fn members; confirm whether the cpd_init() remark is still
 * current.
 */
struct blkcg_policy_data {
        /* the blkcg and policy id this per-policy data belongs to */
        struct blkcg                        *blkcg;
        int                                plid;
};

/*
 * Policy callback types.  These are function types, not pointer types;
 * struct blkcg_policy holds pointers to them.  The *_cpd_* types operate on
 * per-blkcg data, the *_pd_* types on per-blkg data.
 */
typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk,
                struct blkcg *blkcg, gfp_t gfp);
typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd,
                                struct seq_file *s);

/*
 * Description of a blkcg policy: its id, its cgroup interface files and the
 * callbacks managing its per-blkcg (cpd) and per-blkg (pd) data.
 */
struct blkcg_policy {
        /* policy id; indexes blkcg_gq::pd[] and blkcg::cpd[] */
        int                                plid;
        /* cgroup files for the policy */
        struct cftype                        *dfl_cftypes;
        struct cftype                        *legacy_cftypes;

        /* operations */
        blkcg_pol_alloc_cpd_fn                *cpd_alloc_fn;
        blkcg_pol_free_cpd_fn                *cpd_free_fn;

        blkcg_pol_alloc_pd_fn                *pd_alloc_fn;
        blkcg_pol_init_pd_fn                *pd_init_fn;
        blkcg_pol_online_pd_fn                *pd_online_fn;
        blkcg_pol_offline_pd_fn                *pd_offline_fn;
        blkcg_pol_free_pd_fn                *pd_free_fn;
        blkcg_pol_reset_pd_stats_fn        *pd_reset_stats_fn;
        blkcg_pol_stat_pd_fn                *pd_stat_fn;
};

/* The root blkcg; blkg_lookup() short-cuts lookups for it. */
extern struct blkcg blkcg_root;
extern bool blkcg_debug_stats;

/* Per-queue / per-disk setup and teardown (defined outside this header). */
void blkg_init_queue(struct request_queue *q);
int blkcg_init_disk(struct gendisk *disk);
void blkcg_exit_disk(struct gendisk *disk);

/* Blkio controller policy registration */
int blkcg_policy_register(struct blkcg_policy *pol);
void blkcg_policy_unregister(struct blkcg_policy *pol);
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol);
void blkcg_deactivate_policy(struct gendisk *disk,
                             const struct blkcg_policy *pol);

/* Helpers for printing per-blkg values into a seq_file. */
const char *blkg_dev_name(struct blkcg_gq *blkg);
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total);
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);

/*
 * Context passed between the blkg_conf_*() helpers below.
 * NOTE(review): field roles are inferred from their names (input string,
 * remaining body, resolved block device and blkg); confirm against the
 * blkg_conf_*() implementations.
 */
struct blkg_conf_ctx {
        char                                *input;
        char                                *body;
        struct block_device                *bdev;
        struct blkcg_gq                        *blkg;
};

/* Implemented outside this header. */
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   struct blkg_conf_ctx *ctx);
void blkg_conf_exit(struct blkg_conf_ctx *ctx);

/**
 * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
 * @bio: bio of interest
 *
 * In order to avoid priority inversions we sometimes need to issue a bio as if
 * it were attached to the root blkg, and then backcharge to the actual owning
 * blkg.  The idea is we do bio_blkcg_css() to look up the actual context for
 * the bio and attach the appropriate blkg to the bio.  Then we call this helper
 * and if it is true run with the root blkg for that queue and then do any
 * backcharging to the originating cgroup once the io is complete.
 *
 * Return: true if this bio needs to be submitted with the root blkg context,
 * i.e. when REQ_META or REQ_SWAP is set in its bi_opf.
 */
static inline bool bio_issue_as_root_blkg(struct bio *bio)
{
        return !!(bio->bi_opf & (REQ_META | REQ_SWAP));
}

/**
 * blkg_lookup - lookup blkg for the specified blkcg - q pair
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  The root blkcg resolves directly to
 * @q's root blkg; otherwise the lookup hint is tried before the radix tree.
 *
 * Must be called in a RCU critical section.
 *
 * Return: the matching blkg, or NULL if there is none.
 */
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
                                           struct request_queue *q)
{
        struct blkcg_gq *found;

        if (blkcg == &blkcg_root)
                return q->root_blkg;

        /* Fast path: the hinted blkg, if it belongs to this queue. */
        found = rcu_dereference_check(blkcg->blkg_hint,
                        lockdep_is_held(&q->queue_lock));
        if (found && found->q == q)
                return found;

        /* Slow path: index by queue id and double-check the owner. */
        found = radix_tree_lookup(&blkcg->blkg_tree, q->id);
        if (!found || found->q != q)
                return NULL;

        return found;
}

/**
 * blkg_to_pd - get policy private data
 * @blkg: blkg of interest
 * @pol: policy of interest
 *
 * Return pointer to private data associated with the @blkg-@pol pair, or
 * NULL when @blkg is NULL.
 */
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                  struct blkcg_policy *pol)
{
        if (!blkg)
                return NULL;

        return blkg->pd[pol->plid];
}

/*
 * Return the per-blkcg private data of the @blkcg-@pol pair, or NULL when
 * @blkcg is NULL.
 */
static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
                                                     struct blkcg_policy *pol)
{
        if (!blkcg)
                return NULL;

        return blkcg->cpd[pol->plid];
}

/**
 * pd_to_blkg - get blkg associated with policy private data
 * @pd: policy private data of interest
 *
 * @pd is policy private data.  Determine the blkg it's associated with.
 * A NULL @pd yields NULL.
 */
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
{
        if (!pd)
                return NULL;

        return pd->blkg;
}

/* Return the blkcg that @cpd belongs to; a NULL @cpd yields NULL. */
static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
{
        if (!cpd)
                return NULL;

        return cpd->blkcg;
}

/**
 * blkg_path - format cgroup path of blkg
 * @blkg: blkg of interest
 * @buf: target buffer
 * @buflen: target buffer length
 *
 * Format the path of the cgroup of @blkg into @buf.
 *
 * Return: whatever cgroup_path() returns for the blkg's cgroup.
 */
static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
{
        return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
}

/**
 * blkg_get - get a blkg reference
 * @blkg: blkg to get
 *
 * The caller should be holding an existing reference.  Use blkg_tryget()
 * when that is not guaranteed.
 */
static inline void blkg_get(struct blkcg_gq *blkg)
{
        percpu_ref_get(&blkg->refcnt);
}

/**
 * blkg_tryget - try and get a blkg reference
 * @blkg: blkg to get, may be NULL
 *
 * This is for use when doing an RCU lookup of the blkg.  We may be in the midst
 * of freeing this blkg, so we can only use it if the refcnt is not zero.
 *
 * Return: true if a reference was taken; false if @blkg is NULL or
 * percpu_ref_tryget() failed.
 */
static inline bool blkg_tryget(struct blkcg_gq *blkg)
{
        return blkg && percpu_ref_tryget(&blkg->refcnt);
}

/**
 * blkg_put - put a blkg reference
 * @blkg: blkg to put
 *
 * Drops a reference taken with blkg_get() or blkg_tryget().
 */
static inline void blkg_put(struct blkcg_gq *blkg)
{
        percpu_ref_put(&blkg->refcnt);
}

/**
 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Walk @d_blkg through the descendants of @p_blkg.  Must be used with RCU
 * read locked.  If called under either blkcg or queue lock, the iteration
 * is guaranteed to include all and only online blkgs.  The caller may
 * update @pos_css by calling css_rightmost_descendant() to skip subtree.
 * @p_blkg is included in the iteration and the first node to be visited.
 *
 * Descendant cgroups without a blkg on @p_blkg's queue are skipped.  The
 * expansion ends in an if statement, so do not follow the loop body with an
 * else.
 */
#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)                \
        css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)        \
                if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),        \
                                            (p_blkg)->q)))

/**
 * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Similar to blkg_for_each_descendant_pre() but performs post-order
 * traversal instead.  Synchronization rules are the same.  @p_blkg is
 * included in the iteration and the last node to be visited.
 *
 * Descendant cgroups without a blkg on @p_blkg's queue are skipped.  The
 * expansion ends in an if statement, so do not follow the loop body with an
 * else.
 */
#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)                \
        css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)        \
                if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),        \
                                            (p_blkg)->q)))

/* Initialize @bio's bi_issue record from the bio's size in sectors. */
static inline void blkcg_bio_issue_init(struct bio *bio)
{
        bio_issue_init(&bio->bi_issue, bio_sectors(bio));
}

/*
 * Take one use of the delay mechanism on @blkg.  The first user (count going
 * from 0 to 1) also bumps the congestion count of the blkg's cgroup.
 */
static inline void blkcg_use_delay(struct blkcg_gq *blkg)
{
        /* A negative count is unexpected here: warn once and do nothing. */
        if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
                return;

        if (atomic_add_return(1, &blkg->use_delay) != 1)
                return;

        atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
}

/**
 * blkcg_unuse_delay - drop one reference on a blkg's use_delay counter
 * @blkg: target blkg
 *
 * Counterpart of blkcg_use_delay().  The caller that moves the counter
 * from 1 to 0 also decrements congestion_count of the cgroup that owns
 * @blkg's blkcg.
 *
 * Return: 1 if the counter was decremented, 0 if it was zero, or if it
 * was negative on entry (which also triggers a one-time warning).
 */
static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
{
        int old = atomic_read(&blkg->use_delay);

        if (WARN_ON_ONCE(old < 0))
                return 0;
        if (old == 0)
                return 0;

        /*
         * We do this song and dance because we can race with somebody else
         * adding or removing delay.  If we just did an atomic_dec we'd end up
         * negative and we'd already be in trouble.  We need to subtract 1 and
         * then check to see if we were the last delay so we can drop the
         * congestion count on the cgroup.
         */
        while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1))
                ;

        /* The counter reached zero while we were retrying: nothing to drop. */
        if (old == 0)
                return 0;
        /* old is the value we replaced; 1 means we removed the last user. */
        if (old == 1)
                atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
        return 1;
}

/**
 * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount
 * @blkg: target blkg
 * @delay: delay duration in nsecs
 *
 * A delay enabled through this function is not decayed; it stays in effect
 * until blkcg_clear_delay() is called.  Do not combine this interface with
 * blkcg_[un]use_delay() or blkcg_add_delay() on the same blkg.
 *
 * use_delay is switched from 0 to -1 and @delay is stored in
 * @blkg->delay_nsec unconditionally.
 */
static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
{
        int cur = atomic_read(&blkg->use_delay);

        /*
         * Only the caller that wins the 0 -> -1 transition accounts this
         * blkg in the cgroup's congestion count.
         */
        if (cur == 0) {
                if (atomic_try_cmpxchg(&blkg->use_delay, &cur, -1))
                        atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
        }

        atomic64_set(&blkg->delay_nsec, delay);
}

/**
 * blkcg_clear_delay - Disable allocator delay mechanism
 * @blkg: target blkg
 *
 * Turns the use_delay mechanism off again by resetting @blkg->use_delay to
 * zero.  See blkcg_set_delay().
 */
static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
{
        int cur = atomic_read(&blkg->use_delay);

        if (cur == 0)
                return;

        /*
         * Only the caller that actually resets the counter drops the
         * cgroup's congestion count.
         */
        if (atomic_try_cmpxchg(&blkg->use_delay, &cur, 0))
                atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
}

/**
 * blk_cgroup_mergeable - Determine whether to allow or disallow merges
 * @rq: request to merge into
 * @bio: bio to merge
 *
 * A merge is only allowed when @bio and the first bio of @rq are attached
 * to the same blkg and agree on issue_as_root.  The second condition keeps
 * e.g. a metadata update from being throttled merely because it sits next
 * to regular IO.
 *
 * Return: %true if the merge is permitted, %false otherwise.
 */
static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
{
        if (rq->bio->bi_blkg != bio->bi_blkg)
                return false;

        return bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
}

/*
 * Out-of-line helpers, only declared when CONFIG_BLK_CGROUP is enabled.
 * blk_cgroup_bio_start() has an empty inline replacement in the
 * !CONFIG_BLK_CGROUP branch below; blkcg_add_delay() has none.
 */
void blk_cgroup_bio_start(struct bio *bio);
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
#else        /* CONFIG_BLK_CGROUP */

/*
 * Empty placeholder types for builds without CONFIG_BLK_CGROUP.  They
 * presumably exist so that code naming these structures still compiles;
 * none of them carries any state.
 */
struct blkg_policy_data {
};

struct blkcg_policy_data {
};

struct blkcg_policy {
};

struct blkcg {
};

/*
 * Inline replacements used when CONFIG_BLK_CGROUP is disabled.  Lookups
 * return NULL, functions with an int result return 0, void functions do
 * nothing and blk_cgroup_mergeable() always allows the merge.
 */

/* blkg lookup and queue/disk lifetime hooks */
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key)
{
        return NULL;
}

static inline void blkg_init_queue(struct request_queue *q)
{
}

static inline int blkcg_init_disk(struct gendisk *disk)
{
        return 0;
}

static inline void blkcg_exit_disk(struct gendisk *disk)
{
}

/* policy registration and activation */
static inline int blkcg_policy_register(struct blkcg_policy *pol)
{
        return 0;
}

static inline void blkcg_policy_unregister(struct blkcg_policy *pol)
{
}

static inline int blkcg_activate_policy(struct gendisk *disk,
                                        const struct blkcg_policy *pol)
{
        return 0;
}

static inline void blkcg_deactivate_policy(struct gendisk *disk,
                                           const struct blkcg_policy *pol)
{
}

/* blkg <-> policy data conversion, naming and reference counting */
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                  struct blkcg_policy *pol)
{
        return NULL;
}

static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
{
        return NULL;
}

static inline char *blkg_path(struct blkcg_gq *blkg)
{
        return NULL;
}

static inline void blkg_get(struct blkcg_gq *blkg)
{
}

static inline void blkg_put(struct blkcg_gq *blkg)
{
}

/* bio hooks */
static inline void blkcg_bio_issue_init(struct bio *bio)
{
}

static inline void blk_cgroup_bio_start(struct bio *bio)
{
}

static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
{
        return true;
}

/*
 * Single-iteration loop for the !CONFIG_BLK_CGROUP case: the body runs
 * once with @rl pointing at (q)->root_rl, after which @rl is set to NULL
 * and the loop ends.
 */
#define blk_queue_for_each_rl(rl, q)        \
        for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)

#endif        /* CONFIG_BLK_CGROUP */

#endif /* _BLK_CGROUP_PRIVATE_H */




































































































































































































































































































































































































































































    1 











































    2 
    2 









    2 
































































    8 





    9 



























































































































































































































































    2 

















    2 


    2 


    2 


















    1 










    2 



















    1 
























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  This file contains the interface functions for the various time related
 *  system calls: time, stime, gettimeofday, settimeofday, adjtime
 *
 * Modification history:
 *
 * 1993-09-02    Philip Gladstone
 *      Created file with time related functions from sched/core.c and adjtimex()
 * 1993-10-08    Torsten Duwe
 *      adjtime interface update and CMOS clock write code
 * 1995-08-13    Torsten Duwe
 *      kernel PLL updated to 1994-12-13 specs (rfc-1589)
 * 1999-01-16    Ulrich Windl
 *        Introduced error checking for many cases in adjtimex().
 *        Updated NTP code according to technical memorandum Jan '96
 *        "A Kernel Model for Precision Timekeeping" by Dave Mills
 *        Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
 *        (Even though the technical memorandum forbids it)
 * 2004-07-14         Christoph Lameter
 *        Added getnstimeofday to allow the posix timer functions to return
 *        with nanosecond accuracy
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/timex.h>
#include <linux/capability.h>
#include <linux/timekeeper_internal.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/fs.h>
#include <linux/math64.h>
#include <linux/ptrace.h>

#include <linux/uaccess.h>
#include <linux/compat.h>
#include <asm/unistd.h>

#include <generated/timeconst.h>
#include "timekeeping.h"

/*
 * The timezone where the local system is located.  Used as a default by some
 * programs that obtain this value by using gettimeofday.
 *
 * Written by do_sys_settimeofday64() and copied out to user space by the
 * gettimeofday system calls below.
 */
struct timezone sys_tz;

EXPORT_SYMBOL(sys_tz);

#ifdef __ARCH_WANT_SYS_TIME

/*
 * sys_time() could equally be built in user space on top of
 * sys_gettimeofday().  If it only exists for backwards compatibility it
 * might be better moved into the arch directories of the architectures
 * that still need it.
 *
 * Returns the seconds value of ktime_get_real_seconds() converted to
 * __kernel_old_time_t and, when @tloc is not NULL, also stores it there.
 * Fails with -EFAULT if that store faults.
 */
SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc)
{
        __kernel_old_time_t now = (__kernel_old_time_t)ktime_get_real_seconds();

        if (tloc && put_user(now, tloc))
                return -EFAULT;

        force_successful_syscall_return();
        return now;
}

/*
 * sys_stime() can be implemented in user-level using
 * sys_settimeofday().  Is this for backwards compatibility?  If so,
 * why not move it into the appropriate arch directory (for those
 * architectures that need it).
 *
 * Sets the clock to the number of seconds read from @tptr, with the
 * nanosecond part forced to zero.
 *
 * Return: 0 on success, -EFAULT if @tptr cannot be read, the error from
 * security_settime64() if the caller is not allowed to set the time, or
 * the error from do_settimeofday64() if the requested time is rejected.
 */

SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr)
{
        struct timespec64 tv;
        int err;

        if (get_user(tv.tv_sec, tptr))
                return -EFAULT;

        tv.tv_nsec = 0;

        err = security_settime64(&tv, NULL);
        if (err)
                return err;

        /*
         * do_settimeofday64() can refuse the new time; hand its result
         * back instead of claiming success when the clock was not set.
         */
        return do_settimeofday64(&tv);
}

#endif /* __ARCH_WANT_SYS_TIME */

#ifdef CONFIG_COMPAT_32BIT_TIME
#ifdef __ARCH_WANT_SYS_TIME32

/*
 * 32-bit time_t flavour of sys_time(): old_time32_t is a 32 bit "long", so
 * the seconds value from ktime_get_real_seconds() is converted to that type
 * before it is stored to @tloc (if not NULL) and returned.  Fails with
 * -EFAULT if the store faults.
 */
SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc)
{
        old_time32_t now = (old_time32_t)ktime_get_real_seconds();

        if (tloc && put_user(now, tloc))
                return -EFAULT;

        force_successful_syscall_return();
        return now;
}

/*
 * 32-bit time_t flavour of sys_stime(): sets the clock to the number of
 * seconds read from @tptr, with the nanosecond part forced to zero.
 *
 * Return: 0 on success, -EFAULT if @tptr cannot be read, the error from
 * security_settime64() if the caller is not allowed to set the time, or
 * the error from do_settimeofday64() if the requested time is rejected.
 */
SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr)
{
        struct timespec64 tv;
        int err;

        if (get_user(tv.tv_sec, tptr))
                return -EFAULT;

        tv.tv_nsec = 0;

        err = security_settime64(&tv, NULL);
        if (err)
                return err;

        /*
         * do_settimeofday64() can refuse the new time; hand its result
         * back instead of claiming success when the clock was not set.
         */
        return do_settimeofday64(&tv);
}

#endif /* __ARCH_WANT_SYS_TIME32 */
#endif

/*
 * Store the time from ktime_get_real_ts64() in @tv (seconds, plus the
 * nanoseconds divided by 1000 as microseconds) and a copy of sys_tz in
 * @tz.  Either pointer may be NULL, in which case that part is skipped.
 * Returns 0, or -EFAULT if a user-space store fails.
 */
SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv,
                struct timezone __user *, tz)
{
        if (likely(tv != NULL)) {
                struct timespec64 now;

                ktime_get_real_ts64(&now);
                if (put_user(now.tv_sec, &tv->tv_sec))
                        return -EFAULT;
                if (put_user(now.tv_nsec / 1000, &tv->tv_usec))
                        return -EFAULT;
        }

        if (unlikely(tz != NULL) && copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
                return -EFAULT;

        return 0;
}

/*
 * The CMOS clock may, for whatever reason, have been running in local time
 * rather than UTC.  The first time a timezone is set (without a new time
 * being supplied in the same call) the clock is therefore warped so that
 * it ticks UTC from then on.  The assumption is that whoever sets the
 * timezone runs in an environment whose programs understand timezones.
 * This should happen at boot time in the /etc/rc script, as early as
 * possible, so that the clock is right from the start; programs get
 * confused when the clock is warped underneath them.
 *
 * @tv and @tz may each be NULL.  Returns -EINVAL for a @tv rejected by
 * timespec64_valid_settod() or a tz_minuteswest outside +-15 hours, the
 * error from security_settime64() if that denies the request, otherwise
 * the result of do_settimeofday64() (or 0 when @tv is NULL).
 */

int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz)
{
        static int first_tz_update = 1;
        int error;

        if (tv && !timespec64_valid_settod(tv))
                return -EINVAL;

        error = security_settime64(tv, tz);
        if (error)
                return error;

        if (tz) {
                /* Verify we're within the +-15 hrs range */
                if (tz->tz_minuteswest < -15*60 || tz->tz_minuteswest > 15*60)
                        return -EINVAL;

                sys_tz = *tz;
                update_vsyscall_tz();

                if (first_tz_update) {
                        first_tz_update = 0;
                        if (!tv)
                                timekeeping_warp_clock();
                }
        }

        return tv ? do_settimeofday64(tv) : 0;
}

/*
 * Read an optional new time from @tv and an optional new timezone from @tz
 * and hand both to do_sys_settimeofday64().  The microseconds field is
 * range-checked and converted to nanoseconds first.
 *
 * Return: -EFAULT if a user pointer cannot be read, -EINVAL if tv_usec is
 * outside [0, USEC_PER_SEC - 1], otherwise the result of
 * do_sys_settimeofday64().
 */
SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv,
                struct timezone __user *, tz)
{
        struct timespec64 new_ts;
        struct timezone new_tz;

        if (tv) {
                if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
                    get_user(new_ts.tv_nsec, &tv->tv_usec))
                        return -EFAULT;

                /*
                 * tv_nsec still holds microseconds at this point.  A value
                 * of exactly USEC_PER_SEC is already a full second and is
                 * out of range, so the upper bound is exclusive.
                 */
                if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
                        return -EINVAL;

                new_ts.tv_nsec *= NSEC_PER_USEC;
        }
        if (tz) {
                if (copy_from_user(&new_tz, tz, sizeof(*tz)))
                        return -EFAULT;
        }

        return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}

#ifdef CONFIG_COMPAT
/*
 * Compat variant of gettimeofday() for struct old_timeval32: stores the
 * time from ktime_get_real_ts64() in @tv (nanoseconds divided by 1000 as
 * microseconds) and a copy of sys_tz in @tz.  NULL pointers are skipped.
 * Returns 0, or -EFAULT if a user-space store fails.
 */
COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv,
                       struct timezone __user *, tz)
{
        if (tv) {
                struct timespec64 now;

                ktime_get_real_ts64(&now);
                if (put_user(now.tv_sec, &tv->tv_sec))
                        return -EFAULT;
                if (put_user(now.tv_nsec / 1000, &tv->tv_usec))
                        return -EFAULT;
        }

        if (tz && copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
                return -EFAULT;

        return 0;
}

/*
 * Compat variant of settimeofday() for struct old_timeval32: reads an
 * optional new time from @tv and an optional new timezone from @tz and
 * hands both to do_sys_settimeofday64().
 *
 * Return: -EFAULT if a user pointer cannot be read, -EINVAL if tv_usec is
 * outside [0, USEC_PER_SEC - 1], otherwise the result of
 * do_sys_settimeofday64().
 */
COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
                       struct timezone __user *, tz)
{
        struct timespec64 new_ts;
        struct timezone new_tz;

        if (tv) {
                if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
                    get_user(new_ts.tv_nsec, &tv->tv_usec))
                        return -EFAULT;

                /*
                 * tv_nsec still holds microseconds at this point.  A value
                 * of exactly USEC_PER_SEC is already a full second and is
                 * out of range, so the upper bound is exclusive.
                 */
                if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
                        return -EINVAL;

                new_ts.tv_nsec *= NSEC_PER_USEC;
        }
        if (tz) {
                if (copy_from_user(&new_tz, tz, sizeof(*tz)))
                        return -EFAULT;
        }

        return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
#endif

#ifdef CONFIG_64BIT
/*
 * Copy the user's struct __kernel_timex in, run do_adjtimex() on the
 * kernel copy and copy the (possibly updated) structure back out.
 * Returns -EFAULT if either copy fails, otherwise whatever do_adjtimex()
 * returned.
 */
SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p)
{
        struct __kernel_timex txc;                /* Local copy of parameter */
        int ret;

        /*
         * Work on a kernel-side copy of the user structure, bearing in
         * mind that the structures may change.
         */
        if (copy_from_user(&txc, txc_p, sizeof(txc)))
                return -EFAULT;

        ret = do_adjtimex(&txc);

        if (copy_to_user(txc_p, &txc, sizeof(txc)))
                return -EFAULT;

        return ret;
}
#endif

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Fill the kernel-side @txc from the 32-bit layout at user address @utp.
 * @txc is zeroed first, so fields that are not copied below (and the
 * whole structure if the user copy fails) read as zero.
 * Returns 0 on success or -EFAULT if @utp cannot be read.
 */
int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp)
{
        struct old_timex32 tx32;

        memset(txc, 0, sizeof(*txc));
        if (copy_from_user(&tx32, utp, sizeof(tx32)))
                return -EFAULT;

        /* control and clock state */
        txc->modes        = tx32.modes;
        txc->offset       = tx32.offset;
        txc->freq         = tx32.freq;
        txc->maxerror     = tx32.maxerror;
        txc->esterror     = tx32.esterror;
        txc->status       = tx32.status;
        txc->constant     = tx32.constant;
        txc->precision    = tx32.precision;
        txc->tolerance    = tx32.tolerance;

        /* time stamp and tick length */
        txc->time.tv_sec  = tx32.time.tv_sec;
        txc->time.tv_usec = tx32.time.tv_usec;
        txc->tick         = tx32.tick;

        /* remaining fields (pps*, jitter, shift, stabil and the counters) */
        txc->ppsfreq      = tx32.ppsfreq;
        txc->jitter       = tx32.jitter;
        txc->shift        = tx32.shift;
        txc->stabil       = tx32.stabil;
        txc->jitcnt       = tx32.jitcnt;
        txc->calcnt       = tx32.calcnt;
        txc->errcnt       = tx32.errcnt;
        txc->stbcnt       = tx32.stbcnt;

        return 0;
}

/*
 * Convert the kernel-side @txc to the 32-bit layout and copy it to user
 * address @utp.  The temporary is zeroed before it is filled, so nothing
 * but the fields assigned below reaches user space.
 * Returns 0 on success or -EFAULT if @utp cannot be written.
 */
int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc)
{
        struct old_timex32 tx32;

        memset(&tx32, 0, sizeof(tx32));

        /* control and clock state */
        tx32.modes        = txc->modes;
        tx32.offset       = txc->offset;
        tx32.freq         = txc->freq;
        tx32.maxerror     = txc->maxerror;
        tx32.esterror     = txc->esterror;
        tx32.status       = txc->status;
        tx32.constant     = txc->constant;
        tx32.precision    = txc->precision;
        tx32.tolerance    = txc->tolerance;

        /* time stamp and tick length */
        tx32.time.tv_sec  = txc->time.tv_sec;
        tx32.time.tv_usec = txc->time.tv_usec;
        tx32.tick         = txc->tick;

        /* remaining fields (pps*, jitter, shift, stabil, counters, tai) */
        tx32.ppsfreq      = txc->ppsfreq;
        tx32.jitter       = txc->jitter;
        tx32.shift        = txc->shift;
        tx32.stabil       = txc->stabil;
        tx32.jitcnt       = txc->jitcnt;
        tx32.calcnt       = txc->calcnt;
        tx32.errcnt       = txc->errcnt;
        tx32.stbcnt       = txc->stbcnt;
        tx32.tai          = txc->tai;

        return copy_to_user(utp, &tx32, sizeof(tx32)) ? -EFAULT : 0;
}

/*
 * adjtimex() for the 32-bit struct old_timex32 layout: convert the user
 * structure with get_old_timex32(), run do_adjtimex() and convert the
 * result back with put_old_timex32().  A conversion error takes precedence
 * over the do_adjtimex() return value.
 */
SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
{
        struct __kernel_timex txc;
        int ret, err;

        err = get_old_timex32(&txc, utp);
        if (err)
                return err;

        ret = do_adjtimex(&txc);

        err = put_old_timex32(utp, &txc);
        return err ? err : ret;
}
#endif

/**
 * jiffies_to_msecs - Convert jiffies to milliseconds
 * @j: jiffies value
 *
 * The two most common HZ configurations get dedicated branches so that
 * no unnecessary multiplication or division is performed for them; the
 * remaining configurations use the precomputed HZ_TO_MSEC_* constants.
 *
 * Return: milliseconds value
 */
unsigned int jiffies_to_msecs(const unsigned long j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
        /* HZ divides MSEC_PER_SEC: a single multiplication is enough */
        return (MSEC_PER_SEC / HZ) * j;
#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
        /* HZ is a multiple of MSEC_PER_SEC: divide, rounding up */
        return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
#elif BITS_PER_LONG == 32
        /* generic case, 32-bit long: multiply and shift, rounding up */
        return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >>
               HZ_TO_MSEC_SHR32;
#else
        /* generic case, wider long: rounded-up division */
        return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
#endif
}
EXPORT_SYMBOL(jiffies_to_msecs);

/**
 * jiffies_to_usecs - Convert jiffies to microseconds
 * @j: jiffies value
 *
 * Uses a plain multiplication when HZ divides USEC_PER_SEC and the
 * precomputed HZ_TO_USEC_* constants otherwise.
 *
 * Return: microseconds value
 */
unsigned int jiffies_to_usecs(const unsigned long j)
{
        /*
         * HZ normally stays close to MSEC_PER_SEC at most; both
         * jiffies_to_usecs() and usecs_to_jiffies() rely on HZ not
         * exceeding USEC_PER_SEC.
         */
        BUILD_BUG_ON(HZ > USEC_PER_SEC);

#if !(USEC_PER_SEC % HZ)
        return (USEC_PER_SEC / HZ) * j;
#elif BITS_PER_LONG == 32
        return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
#else
        return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
#endif
}
EXPORT_SYMBOL(jiffies_to_usecs);

/**
 * mktime64 - Converts date to seconds.
 * @year0: year to convert
 * @mon0: month to convert
 * @day: day to convert
 * @hour: hour to convert
 * @min: minute to convert
 * @sec: second to convert
 *
 * Turns a Gregorian date into the number of seconds elapsed since
 * 1970-01-01 00:00:00.  The input is expected in normal date notation,
 * i.e. 1980-12-31 23:59:59 is passed as year=1980, mon=12, day=31,
 * hour=23, min=59, sec=59.
 *
 * [For the Julian calendar (used in Russia before 1917, in Britain and
 * its colonies before 1752, everywhere else before 1582, and still in use
 * by some communities) drop the -year/100+year/400 terms and add 10.]
 *
 * This algorithm was first published by Gauss (I think).
 *
 * A leap second may be expressed by passing sec as 60 (allowable under
 * ISO 8601).  Since leap seconds do not exist in UNIX time it yields the
 * same result as the second that follows it.
 *
 * Midnight at the end of a day written as 24:00:00 - i.e. midnight
 * tomorrow - (allowable under ISO 8601) is supported as well.
 *
 * Return: seconds since the epoch time for the given input date
 */
time64_t mktime64(const unsigned int year0, const unsigned int mon0,
                const unsigned int day, const unsigned int hour,
                const unsigned int min, const unsigned int sec)
{
        unsigned int mon = mon0, year = year0;
        time64_t days, hours, minutes;

        /* 1..12 -> 11,12,1..10 */
        mon -= 2;
        if ((int) mon <= 0) {
                mon += 12;        /* Puts Feb last since it has leap day */
                year -= 1;
        }

        days = (time64_t)
                (year/4 - year/100 + year/400 + 367*mon/12 + day) +
                year*365 - 719499;

        /* hour == 24 (midnight tomorrow) simply carries into the next day */
        hours = days*24 + hour;
        minutes = hours*60 + min;

        return minutes*60 + sec;
}
EXPORT_SYMBOL(mktime64);

/**
 * ns_to_kernel_old_timeval - Convert nanoseconds to __kernel_old_timeval
 * @nsec: the nanoseconds value to be converted
 *
 * The value is first split with ns_to_timespec64(); the nanosecond part
 * is then divided by 1000 to obtain microseconds.
 *
 * Return: the __kernel_old_timeval representation of the nsec parameter.
 */
struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec)
{
        const struct timespec64 split = ns_to_timespec64(nsec);
        struct __kernel_old_timeval tv;

        tv.tv_usec = (suseconds_t)split.tv_nsec / 1000;
        tv.tv_sec = split.tv_sec;

        return tv;
}
EXPORT_SYMBOL(ns_to_kernel_old_timeval);

/**
 * set_normalized_timespec64 - set timespec sec and nsec parts and normalize
 *
 * @ts:                pointer to timespec variable to be set
 * @sec:        seconds to set
 * @nsec:        nanoseconds to set
 *
 * Set seconds and nanoseconds field of a timespec variable and
 * normalize to the timespec storage format
 *
 * Normalization is done by repeatedly moving whole seconds between @nsec
 * and @sec, one NSEC_PER_SEC step per loop iteration.
 *
 * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC.
 * For negative values only the tv_sec field is negative !
 */
void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
{
        /* Too many nanoseconds: carry whole seconds into sec. */
        while (nsec >= NSEC_PER_SEC) {
                /*
                 * The following asm() prevents the compiler from
                 * optimising this loop into a modulo operation. See
                 * also __iter_div_u64_rem() in include/linux/time.h
                 */
                asm("" : "+rm"(nsec));
                nsec -= NSEC_PER_SEC;
                ++sec;
        }
        /* Negative nanoseconds: borrow whole seconds from sec. */
        while (nsec < 0) {
                /* Same empty asm() as in the loop above. */
                asm("" : "+rm"(nsec));
                nsec += NSEC_PER_SEC;
                --sec;
        }
        ts->tv_sec = sec;
        ts->tv_nsec = nsec;
}
EXPORT_SYMBOL(set_normalized_timespec64);

/**
 * ns_to_timespec64 - Convert nanoseconds to timespec64
 * @nsec:       the nanoseconds value to be converted
 *
 * Zero maps to { 0, 0 }.  For negative input tv_sec is rounded towards
 * minus infinity so that tv_nsec always ends up in
 * 0 <= tv_nsec < NSEC_PER_SEC.
 *
 * Return: the timespec64 representation of the nsec parameter.
 */
struct timespec64 ns_to_timespec64(s64 nsec)
{
        struct timespec64 ts = { 0, 0 };
        /* unsigned remainder, as used with div_u64_rem() elsewhere in this file */
        u32 rem;

        if (likely(nsec > 0)) {
                ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
                ts.tv_nsec = rem;
        } else if (nsec < 0) {
                /*
                 * With negative times, tv_sec points to the earlier
                 * second, and tv_nsec counts the nanoseconds since
                 * then, so tv_nsec is always a positive number.
                 *
                 * -(nsec + 1) equals -nsec - 1 but cannot overflow, not
                 * even for the most negative s64 value.
                 */
                ts.tv_sec = -div_u64_rem(-(nsec + 1), NSEC_PER_SEC, &rem) - 1;
                ts.tv_nsec = NSEC_PER_SEC - rem - 1;
        }

        return ts;
}
EXPORT_SYMBOL(ns_to_timespec64);

/**
 * __msecs_to_jiffies: - convert milliseconds to jiffies
 * @m:        time in milliseconds
 *
 * The conversion works as follows:
 *
 * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
 *
 * - 'too large' values [that would result in larger than
 *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
 *
 * - every other value is converted to jiffies by either multiplying it
 *   by a factor or dividing it by a factor, with any 32-bit overflow
 *   handled; see _msecs_to_jiffies() for the details.
 *
 * This out-of-line function is the run-time path: it is meant to be
 * called when the value passed does not allow constant folding, so the
 * conversion really has to be carried out at run time.  (The constant
 * check via __builtin_constant_p() is not done here; presumably it lives
 * in the inline wrapper in include/linux/jiffies.h.)
 * The _msecs_to_jiffies helpers are the HZ dependent conversion
 * routines found in include/linux/jiffies.h
 *
 * Return: jiffies value
 */
unsigned long __msecs_to_jiffies(const unsigned int m)
{
        /* A value that is negative when read as int means infinite timeout. */
        return ((int)m < 0) ? MAX_JIFFY_OFFSET : _msecs_to_jiffies(m);
}
EXPORT_SYMBOL(__msecs_to_jiffies);

/**
 * __usecs_to_jiffies: - convert microseconds to jiffies
 * @u:        time in microseconds
 *
 * Values larger than jiffies_to_usecs(MAX_JIFFY_OFFSET) are clamped to
 * MAX_JIFFY_OFFSET; everything else is converted by _usecs_to_jiffies().
 *
 * Return: jiffies value
 */
unsigned long __usecs_to_jiffies(const unsigned int u)
{
        return (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) ?
                MAX_JIFFY_OFFSET : _usecs_to_jiffies(u);
}
EXPORT_SYMBOL(__usecs_to_jiffies);

/**
 * timespec64_to_jiffies - convert a timespec64 value to jiffies
 * @value: pointer to &struct timespec64
 *
 * Adding TICK_NSEC - 1 rounds the value up to the next resolution step.
 * Subtracting a remainder would not be correct here because the
 * resolution values do not fall on second boundaries, i.e.
 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
 * Because of the small error in the multiplier, this rounding is wrong
 * for sufficiently large values of tv_nsec, but well formed timespecs
 * have tv_nsec < NSEC_PER_SEC, so we're OK.
 *
 * Rather, we just shift the bits off the right.
 *
 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
 * value to a scaled second value.
 *
 * Seconds values of MAX_SEC_IN_JIFFIES or more are clamped to
 * MAX_SEC_IN_JIFFIES, with the nanosecond part dropped.
 *
 * Return: jiffies value
 */
unsigned long
timespec64_to_jiffies(const struct timespec64 *value)
{
        u64 sec = value->tv_sec;
        long nsec = value->tv_nsec + TICK_NSEC - 1;
        u64 scaled_sec, scaled_nsec;

        if (sec >= MAX_SEC_IN_JIFFIES) {
                sec = MAX_SEC_IN_JIFFIES;
                nsec = 0;
        }

        scaled_sec = sec * SEC_CONVERSION;
        scaled_nsec = ((u64)nsec * NSEC_CONVERSION) >>
                      (NSEC_JIFFIE_SC - SEC_JIFFIE_SC);

        return (scaled_sec + scaled_nsec) >> SEC_JIFFIE_SC;
}
EXPORT_SYMBOL(timespec64_to_jiffies);

/**
 * jiffies_to_timespec64 - convert jiffies value to &struct timespec64
 * @jiffies: jiffies value
 * @value: pointer to &struct timespec64
 */
void
jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
        /*
         * Convert jiffies to nanoseconds and separate with
         * one divide.
         */
        u32 rem;
        value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
                                    NSEC_PER_SEC, &rem);
        value->tv_nsec = rem;
}
EXPORT_SYMBOL(jiffies_to_timespec64);

/*
 * Convert jiffies/jiffies_64 to clock_t and back.
 */

/**
 * jiffies_to_clock_t - Convert jiffies to clock_t
 * @x: jiffies value
 *
 * When TICK_NSEC is a whole multiple of the USER_HZ period the conversion
 * is a plain multiplication or division by the HZ/USER_HZ ratio;
 * otherwise it goes through nanoseconds.
 *
 * Return: jiffies converted to clock_t (CLOCKS_PER_SEC)
 */
clock_t jiffies_to_clock_t(unsigned long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) != 0
        return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
#elif HZ < USER_HZ
        return x * (USER_HZ / HZ);
#else
        return x / (HZ / USER_HZ);
#endif
}
EXPORT_SYMBOL(jiffies_to_clock_t);

/**
 * clock_t_to_jiffies - Convert clock_t to jiffies
 * @x: clock_t value
 *
 * Inputs that reach the overflow threshold checked below are reported
 * as ~0UL.
 *
 * Return: clock_t value converted to jiffies
 */
unsigned long clock_t_to_jiffies(unsigned long x)
{
#if (HZ % USER_HZ) != 0
        /* Don't worry about loss of precision here .. */
        if (x >= ~0UL / HZ * USER_HZ)
                return ~0UL;

        /* .. but do try to contain it here */
        return div_u64((u64)x * HZ, USER_HZ);
#else
        /* HZ is a whole multiple of USER_HZ: scale by the ratio */
        if (x >= ~0UL / (HZ / USER_HZ))
                return ~0UL;
        return x * (HZ / USER_HZ);
#endif
}
EXPORT_SYMBOL(clock_t_to_jiffies);

/**
 * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t
 * @x: jiffies_64 value
 *
 * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
 */
u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) != 0
        /*
         * There are better ways that don't overflow early,
         * but even this doesn't overflow in hundreds of years
         * in 64 bits, so..
         */
        return div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
#elif HZ < USER_HZ
        return div_u64(x * USER_HZ, HZ);
#elif HZ > USER_HZ
        return div_u64(x, HZ / USER_HZ);
#else
        /* HZ == USER_HZ: nothing to do */
        return x;
#endif
}
EXPORT_SYMBOL(jiffies_64_to_clock_t);

/**
 * nsec_to_clock_t - Convert nsec value to clock_t
 * @x: nsec value
 *
 * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
 */
u64 nsec_to_clock_t(u64 x)
{
        u64 ticks;

#if (NSEC_PER_SEC % USER_HZ) == 0
        ticks = div_u64(x, NSEC_PER_SEC / USER_HZ);
#elif (USER_HZ % 512) == 0
        ticks = div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
#else
        /*
         * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
         * overflow after 64.99 years.
         * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
         */
        ticks = div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
#endif
        return ticks;
}

/**
 * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds
 * @j: jiffies64 value
 *
 * Return: nanoseconds value
 */
u64 jiffies64_to_nsecs(u64 j)
{
#if !(NSEC_PER_SEC % HZ)
        /* A jiffy is a whole number of nanoseconds: multiply directly */
        j = (NSEC_PER_SEC / HZ) * j;
# else
        /* Otherwise scale by the HZ_TO_NSEC_NUM / HZ_TO_NSEC_DEN ratio */
        j = div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN);
#endif
        return j;
}
EXPORT_SYMBOL(jiffies64_to_nsecs);

/**
 * jiffies64_to_msecs - Convert jiffies64 to milliseconds
 * @j: jiffies64 value
 *
 * Return: milliseconds value
 */
u64 jiffies64_to_msecs(const u64 j)
{
        u64 msecs;

#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
        /* A jiffy is a whole number of milliseconds: multiply directly */
        msecs = (MSEC_PER_SEC / HZ) * j;
#else
        /* Otherwise scale by the HZ_TO_MSEC_NUM / HZ_TO_MSEC_DEN ratio */
        msecs = div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
#endif
        return msecs;
}
EXPORT_SYMBOL(jiffies64_to_msecs);

/**
 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
 *
 * @n:        nsecs in u64
 *
 * In contrast to {m,u}secs_to_jiffies, the input is a u64 rather than an
 * unsigned int.  The result is never clamped to MAX_JIFFY_OFFSET: this
 * helper is designed for the scheduler, not for device drivers computing
 * timeout values.
 *
 * note:
 *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
 *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
 *
 * Return: nsecs converted to jiffies64 value
 */
u64 nsecs_to_jiffies64(u64 n)
{
#if (NSEC_PER_SEC % HZ) == 0
        /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
        n = div_u64(n, NSEC_PER_SEC / HZ);
#elif (HZ % 512) == 0
        /* overflow after 292 years if HZ = 1024 */
        n = div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
#else
        /*
         * Generic case - optimized for cases where HZ is a multiple of 3.
         * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
         */
        n = div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
#endif
        return n;
}
EXPORT_SYMBOL(nsecs_to_jiffies64);

/**
 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
 *
 * @n:        nsecs in u64
 *
 * In contrast to {m,u}secs_to_jiffies, the input is a u64 rather than an
 * unsigned int.  The result is never clamped to MAX_JIFFY_OFFSET: this
 * helper is designed for the scheduler, not for device drivers computing
 * timeout values.
 *
 * note:
 *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
 *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
 *
 * Return: nsecs converted to jiffies value
 */
unsigned long nsecs_to_jiffies(u64 n)
{
        u64 j64 = nsecs_to_jiffies64(n);

        /* Narrow the 64-bit result to the native jiffies width */
        return (unsigned long)j64;
}
EXPORT_SYMBOL_GPL(nsecs_to_jiffies);

/**
 * timespec64_add_safe - Add two timespec64 values and do a safety check
 * for overflow.
 * @lhs: first (left) timespec64 to add
 * @rhs: second (right) timespec64 to add
 *
 * It's assumed that both values are valid (>= 0).
 * And, each timespec64 is in normalized form.
 *
 * Return: sum of @lhs + @rhs
 */
struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
                                const struct timespec64 rhs)
{
        struct timespec64 res;

        set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec,
                        lhs.tv_nsec + rhs.tv_nsec);

        if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
                res.tv_sec = TIME64_MAX;
                res.tv_nsec = 0;
        }

        return res;
}

/**
 * get_timespec64 - get user's time value into kernel space
 * @ts: destination &struct timespec64
 * @uts: user's time value as &struct __kernel_timespec
 *
 * Handles compat or 32-bit modes.
 *
 * Return: %0 on success or negative errno on error
 */
int get_timespec64(struct timespec64 *ts,
                   const struct __kernel_timespec __user *uts)
{
        struct __kernel_timespec kts;

        if (copy_from_user(&kts, uts, sizeof(kts)))
                return -EFAULT;

        /* Zero out the padding in compat mode */
        if (in_compat_syscall())
                kts.tv_nsec &= 0xFFFFFFFFUL;

        ts->tv_sec = kts.tv_sec;
        /* In 32-bit mode, this drops the padding */
        ts->tv_nsec = kts.tv_nsec;

        return 0;
}
EXPORT_SYMBOL_GPL(get_timespec64);

/**
 * put_timespec64 - convert timespec64 value to __kernel_timespec format and
 *                     copy the latter to userspace
 * @ts: input &struct timespec64
 * @uts: user's &struct __kernel_timespec
 *
 * Return: %0 on success or negative errno on error
 */
int put_timespec64(const struct timespec64 *ts,
                   struct __kernel_timespec __user *uts)
{
        /* Stage the value in the user-visible layout before copying out */
        struct __kernel_timespec kts = {
                .tv_sec = ts->tv_sec,
                .tv_nsec = ts->tv_nsec
        };

        if (copy_to_user(uts, &kts, sizeof(kts)))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(put_timespec64);

/*
 * Copy a &struct old_timespec32 in from user space and widen it into @ts64.
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
static int __get_old_timespec32(struct timespec64 *ts64,
                                   const struct old_timespec32 __user *cts)
{
        struct old_timespec32 ts;

        if (copy_from_user(&ts, cts, sizeof(ts)))
                return -EFAULT;

        ts64->tv_nsec = ts.tv_nsec;
        ts64->tv_sec = ts.tv_sec;

        return 0;
}

/*
 * Convert @ts64 to a &struct old_timespec32 and copy it out to user space.
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
static int __put_old_timespec32(const struct timespec64 *ts64,
                                   struct old_timespec32 __user *cts)
{
        struct old_timespec32 ts = {
                .tv_sec = ts64->tv_sec,
                .tv_nsec = ts64->tv_nsec
        };

        if (copy_to_user(cts, &ts, sizeof(ts)))
                return -EFAULT;

        return 0;
}

/**
 * get_old_timespec32 - get user's old-format time value into kernel space
 * @ts: destination &struct timespec64
 * @uts: user's old-format time value (&struct old_timespec32)
 *
 * Handles X86_X32_ABI compatibility conversion.
 *
 * Return: %0 on success or negative errno on error
 */
int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
{
        /* Ordinary compat tasks use the 32-bit layout and need conversion */
        if (!COMPAT_USE_64BIT_TIME)
                return __get_old_timespec32(ts, uts);

        /* Otherwise the user buffer is copied straight into @ts */
        if (copy_from_user(ts, uts, sizeof(*ts)))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(get_old_timespec32);

/**
 * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and
 *                         copy the latter to userspace
 * @ts: input &struct timespec64
 * @uts: user's &struct old_timespec32
 *
 * Handles X86_X32_ABI compatibility conversion.
 *
 * Return: %0 on success or negative errno on error
 */
int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
{
        /* Ordinary compat tasks use the 32-bit layout and need conversion */
        if (!COMPAT_USE_64BIT_TIME)
                return __put_old_timespec32(ts, uts);

        /* Otherwise @ts is copied out to the user buffer unchanged */
        if (copy_to_user(uts, ts, sizeof(*ts)))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(put_old_timespec32);

/**
 * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space
 * @it: destination &struct itimerspec64
 * @uit: user's &struct __kernel_itimerspec
 *
 * Return: %0 on success or negative errno on error
 */
int get_itimerspec64(struct itimerspec64 *it,
                        const struct __kernel_itimerspec __user *uit)
{
        int ret = get_timespec64(&it->it_interval, &uit->it_interval);

        /* Only fetch it_value once it_interval was read successfully */
        if (!ret)
                ret = get_timespec64(&it->it_value, &uit->it_value);

        return ret;
}
EXPORT_SYMBOL_GPL(get_itimerspec64);

/**
 * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format
 *                       and copy the latter to userspace
 * @it: input &struct itimerspec64
 * @uit: user's &struct __kernel_itimerspec
 *
 * Return: %0 on success or negative errno on error
 */
int put_itimerspec64(const struct itimerspec64 *it,
                        struct __kernel_itimerspec __user *uit)
{
        int ret = put_timespec64(&it->it_interval, &uit->it_interval);

        /* Only store it_value once it_interval was written successfully */
        if (!ret)
                ret = put_timespec64(&it->it_value, &uit->it_value);

        return ret;
}
EXPORT_SYMBOL_GPL(put_itimerspec64);

/**
 * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space
 * @its: destination &struct itimerspec64
 * @uits: user's &struct old_itimerspec32
 *
 * Return: %0 on success or negative errno on error
 */
int get_old_itimerspec32(struct itimerspec64 *its,
                        const struct old_itimerspec32 __user *uits)
{
        /* Interval first; the value is only read if that succeeded */
        if (__get_old_timespec32(&its->it_interval, &uits->it_interval))
                return -EFAULT;

        if (__get_old_timespec32(&its->it_value, &uits->it_value))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(get_old_itimerspec32);

/**
 * put_old_itimerspec32 - convert &struct itimerspec64 to &struct
 *                          old_itimerspec32 and copy the latter to userspace
 * @its: input &struct itimerspec64
 * @uits: user's &struct old_itimerspec32
 *
 * Return: %0 on success or negative errno on error
 */
int put_old_itimerspec32(const struct itimerspec64 *its,
                        struct old_itimerspec32 __user *uits)
{
        /* Interval first; the value is only written if that succeeded */
        if (__put_old_timespec32(&its->it_interval, &uits->it_interval))
                return -EFAULT;

        if (__put_old_timespec32(&its->it_value, &uits->it_value))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(put_old_itimerspec32);





























































    2 







    2 
    2 


    2 




















































































    2 














    3 
    3 


    1 
    2 






    2 
    3 



    1 



    2 































































































































    5 









    3 





















    2 


    2 
    2 

    2 
    2 
    2 





    2 











    2 


    2 








    1 




    2 






    1 
    1 







    1 

    1 



    1 


















    1 














    3 









    3 











































































































    1 





    1 













    1 



















    1 













    1 
    1 




    1 






    1 














    1 


    1 



    1 

    1 







    1 









    1 





































    2 
































    2 



    1 

























    1 
    1 



    1 

    1 


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002        Andrew Morton
 *                Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include "internal.h"

/*
 * Regular page slots are stabilized by the page lock even without the tree
 * itself locked.  Shadow entries have no such protection, so the slot is
 * re-read here and only cleared if it still holds @entry.  Both callers in
 * this file invoke this with the i_pages lock held.
 */
static inline void __clear_shadow_entry(struct address_space *mapping,
                                pgoff_t index, void *entry)
{
        XA_STATE(xas, &mapping->i_pages, index);

        xas_set_update(&xas, workingset_update_node);

        /* Store NULL only when the slot was not changed behind our back */
        if (xas_load(&xas) == entry)
                xas_store(&xas, NULL);
}

/*
 * Locked wrapper around __clear_shadow_entry(): takes the host inode's
 * i_lock and the i_pages lock, clears the entry, and puts the inode on the
 * LRU if the mapping reports itself shrinkable afterwards.
 */
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
                               void *entry)
{
        struct inode *host = mapping->host;

        spin_lock(&host->i_lock);

        xa_lock_irq(&mapping->i_pages);
        __clear_shadow_entry(mapping, index, entry);
        xa_unlock_irq(&mapping->i_pages);

        /* Checked with i_lock still held, after the tree lock is dropped */
        if (mapping_shrinkable(mapping))
                inode_add_lru(host);

        spin_unlock(&host->i_lock);
}

/*
 * Unconditionally remove exceptional entries. Usually called from truncate
 * path. Note that the folio_batch may be altered by this function by removing
 * exceptional entries similar to what folio_batch_remove_exceptionals() does.
 *
 * @mapping: address_space the batch was looked up in
 * @fbatch:  batch of folios and/or value (exceptional) entries; on return it
 *           holds only the real folios, compacted to the front
 * @indices: page cache index of each entry in @fbatch, in the same order
 *
 * shmem mappings are left untouched (the batch is not modified for them).
 */
static void truncate_folio_batch_exceptionals(struct address_space *mapping,
                                struct folio_batch *fbatch, pgoff_t *indices)
{
        int i, j;
        bool dax;

        /* Handled by shmem itself */
        if (shmem_mapping(mapping))
                return;

        /* Locate the first value entry; everything before it stays in place */
        for (j = 0; j < folio_batch_count(fbatch); j++)
                if (xa_is_value(fbatch->folios[j]))
                        break;

        /* No value entries at all: nothing to remove, no locks needed */
        if (j == folio_batch_count(fbatch))
                return;

        /*
         * For non-DAX mappings the inode lock and the i_pages lock are taken
         * once for the whole batch; the DAX path takes neither here.
         */
        dax = dax_mapping(mapping);
        if (!dax) {
                spin_lock(&mapping->host->i_lock);
                xa_lock_irq(&mapping->i_pages);
        }

        /* i scans the remaining entries, j is the next slot for a kept folio */
        for (i = j; i < folio_batch_count(fbatch); i++) {
                struct folio *folio = fbatch->folios[i];
                pgoff_t index = indices[i];

                /* Real folio: keep it, sliding it down over removed entries */
                if (!xa_is_value(folio)) {
                        fbatch->folios[j++] = folio;
                        continue;
                }

                if (unlikely(dax)) {
                        dax_delete_mapping_entry(mapping, index);
                        continue;
                }

                /* Shadow entry: clear it if the slot still holds this value */
                __clear_shadow_entry(mapping, index, folio);
        }

        if (!dax) {
                xa_unlock_irq(&mapping->i_pages);
                if (mapping_shrinkable(mapping))
                        inode_add_lru(mapping->host);
                spin_unlock(&mapping->host->i_lock);
        }
        /* Shrink the batch to the folios that were kept */
        fbatch->nr = j;
}

/*
 * Invalidate exceptional entry if easily possible. This handles exceptional
 * entries for invalidate_inode_pages().
 *
 * Always returns 1, whether or not an entry was actually cleared.
 */
static int invalidate_exceptional_entry(struct address_space *mapping,
                                        pgoff_t index, void *entry)
{
        /* shmem handles its own entries; DAX entries are left alone */
        if (!shmem_mapping(mapping) && !dax_mapping(mapping))
                clear_shadow_entry(mapping, index, entry);

        return 1;
}

/*
 * Invalidate exceptional entry if clean. This handles exceptional entries for
 * invalidate_inode_pages2() so for DAX it evicts only clean entries.
 *
 * Returns 1 for shmem and regular shadow entries; for DAX the result of
 * dax_invalidate_mapping_entry_sync() is passed through.
 */
static int invalidate_exceptional_entry2(struct address_space *mapping,
                                         pgoff_t index, void *entry)
{
        /* Handled by shmem itself */
        if (shmem_mapping(mapping))
                return 1;

        if (!dax_mapping(mapping)) {
                clear_shadow_entry(mapping, index, entry);
                return 1;
        }

        return dax_invalidate_mapping_entry_sync(mapping, index);
}

/**
 * folio_invalidate - Invalidate part or all of a folio.
 * @folio: The folio which is affected.
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * Called when a truncate operation has invalidated all or part of @folio.
 * The work is delegated to the mapping's ->invalidate_folio() operation;
 * if the mapping does not provide one, nothing is done.
 *
 * The operation need not release every buffer, but it must make sure that
 * no dirty buffer is left outside @offset and that no I/O is underway
 * against any of the blocks which are outside the truncation point,
 * because the caller is about to free (and possibly reuse) those blocks
 * on-disk.
 */
void folio_invalidate(struct folio *folio, size_t offset, size_t length)
{
        const struct address_space_operations *ops = folio->mapping->a_ops;

        if (!ops->invalidate_folio)
                return;

        ops->invalidate_folio(folio, offset, length);
}
EXPORT_SYMBOL_GPL(folio_invalidate);

/*
 * Prepare a folio for removal from the page cache: unmap it if it is mapped,
 * invalidate its private data over the whole folio, then cancel its dirty
 * state and clear its mapped-to-disk flag.
 *
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping.  This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 * (That mapping check is not done in this function; truncate_inode_folio()
 * performs it before calling here.)
 */
static void truncate_cleanup_folio(struct folio *folio)
{
        /* Tear down any mappings of the folio first */
        if (folio_mapped(folio))
                unmap_mapping_folio(folio);

        /* Let the filesystem drop its private data for the entire folio */
        if (folio_has_private(folio))
                folio_invalidate(folio, 0, folio_size(folio));

        /*
         * Some filesystems seem to re-dirty the page even after
         * the VM has canceled the dirty bit (eg ext3 journaling).
         * Hence dirty accounting check is placed after invalidation.
         */
        folio_cancel_dirty(folio);
        folio_clear_mappedtodisk(folio);
}

/*
 * Remove @folio from @mapping's page cache after cleaning it up.
 * Returns 0 on success, or -EIO if @folio no longer belongs to @mapping.
 */
int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
{
        if (folio->mapping == mapping) {
                truncate_cleanup_folio(folio);
                filemap_remove_folio(folio);
                return 0;
        }

        /* The folio was moved or removed by someone else in the meantime */
        return -EIO;
}

/*
 * Handle partial folios.  The folio may be entirely within the
 * range if a split has raced with us.  If not, we zero the part of the
 * folio that's within the [start, end] range, and then split the folio if
 * it's large.  split_page_range() will discard pages which now lie beyond
 * i_size, and we rely on the caller to discard pages which lie within a
 * newly created hole.
 *
 * @folio: folio overlapping the truncated range
 * @start: first byte of the range
 * @end:   last byte of the range (inclusive)
 *
 * Returns false only if splitting failed and the folio is dirty, so the
 * caller can avoid discarding the entire folio which is stubbornly unsplit.
 * If splitting failed but the folio is not dirty, the whole folio is
 * truncated and true is returned.
 */
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
        loff_t pos = folio_pos(folio);
        unsigned int offset, length;

        /* offset: first byte within the folio that falls inside the range */
        if (pos < start)
                offset = start - pos;
        else
                offset = 0;
        /* length: number of bytes from offset that fall inside the range */
        length = folio_size(folio);
        if (pos + length <= (u64)end)
                length = length - offset;
        else
                length = end + 1 - pos - offset;

        folio_wait_writeback(folio);
        /* The range covers the whole folio: drop it instead of zeroing */
        if (length == folio_size(folio)) {
                truncate_inode_folio(folio->mapping, folio);
                return true;
        }

        /*
         * We may be zeroing pages we're about to discard, but it avoids
         * doing a complex calculation here, and then doing the zeroing
         * anyway if the page split fails.
         */
        folio_zero_range(folio, offset, length);

        if (folio_has_private(folio))
                folio_invalidate(folio, offset, length);
        /* A small folio needs no split; zeroing was all there was to do */
        if (!folio_test_large(folio))
                return true;
        if (split_folio(folio) == 0)
                return true;
        /* Split failed: a dirty folio is kept and the failure reported */
        if (folio_test_dirty(folio))
                return false;
        /* Split failed on a non-dirty folio: remove the whole folio */
        truncate_inode_folio(folio->mapping, folio);
        return true;
}

/*
 * Used to get rid of pages on hardware memory corruption.
 *
 * Returns -EINVAL when @mapping is NULL, -EIO when the mapping's host is
 * not a regular file, and otherwise the result of truncate_inode_folio().
 */
int generic_error_remove_folio(struct address_space *mapping,
                struct folio *folio)
{
        int err;

        if (!mapping)
                err = -EINVAL;
        /*
         * Only punch for normal data pages for now.
         * Handling other types like directories would need more auditing.
         */
        else if (!S_ISREG(mapping->host->i_mode))
                err = -EIO;
        else
                err = truncate_inode_folio(mapping, folio);

        return err;
}
EXPORT_SYMBOL(generic_error_remove_folio);

/**
 * mapping_evict_folio() - Remove an unused folio from the page-cache.
 * @mapping: The mapping this folio belongs to.
 * @folio: The folio to remove.
 *
 * Safely remove one folio from the page cache.
 * Only clean folios that are not under writeback and carry no unexpected
 * references are dropped.
 *
 * Context: Folio must be locked.
 * Return: The number of pages successfully removed.
 */
long mapping_evict_folio(struct address_space *mapping, struct folio *folio)
{
        /* The page may have been truncated before it was locked */
        if (!mapping)
                return 0;

        if (folio_test_dirty(folio))
                return 0;
        if (folio_test_writeback(folio))
                return 0;

        /* The refcount will be elevated if any page in the folio is mapped */
        if (folio_ref_count(folio) >
                        folio_nr_pages(folio) + folio_has_private(folio) + 1)
                return 0;

        /* Only remove the folio once its private data has been released */
        return filemap_release_folio(folio, 0) ?
                        remove_mapping(mapping, folio) : 0;
}

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate (inclusive)
 *
 * Truncate the page cache, removing the pages that are between
 * specified offsets (and zeroing out partial pages
 * if lstart or lend + 1 is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * We pass down the cache-hot hint to the page freeing code.  Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 *
 * Note that since ->invalidate_folio() accepts range to invalidate
 * truncate_inode_pages_range is able to handle cases where lend + 1 is not
 * page aligned properly.
 *
 * A @lend of -1 means "truncate to end of file".
 */
void truncate_inode_pages_range(struct address_space *mapping,
                                loff_t lstart, loff_t lend)
{
        pgoff_t                start;                /* inclusive */
        pgoff_t                end;                /* exclusive */
        struct folio_batch fbatch;
        pgoff_t                indices[PAGEVEC_SIZE];
        pgoff_t                index;
        int                i;
        struct folio        *folio;
        bool                same_folio;

        if (mapping_empty(mapping))
                return;

        /*
         * 'start' and 'end' always covers the range of pages to be fully
         * truncated. Partial folios at either edge of the range are handled
         * separately below by truncate_inode_partial_folio().
         * Note that 'end' is exclusive while 'lend' is inclusive.
         */
        start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (lend == -1)
                /*
                 * lend == -1 indicates end-of-file so we have to set 'end'
                 * to the highest possible pgoff_t and since the type is
                 * unsigned we're using -1.
                 */
                end = -1;
        else
                end = (lend + 1) >> PAGE_SHIFT;

        /*
         * First pass: process batches returned by find_lock_entries().
         * Exceptional entries are stripped from each batch, the remaining
         * folios are cleaned up, deleted from the page cache as a batch,
         * and then unlocked and released.
         */
        folio_batch_init(&fbatch);
        index = start;
        while (index < end && find_lock_entries(mapping, &index, end - 1,
                        &fbatch, indices)) {
                truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
                for (i = 0; i < folio_batch_count(&fbatch); i++)
                        truncate_cleanup_folio(fbatch.folios[i]);
                delete_from_page_cache_batch(mapping, &fbatch);
                for (i = 0; i < folio_batch_count(&fbatch); i++)
                        folio_unlock(fbatch.folios[i]);
                folio_batch_release(&fbatch);
                cond_resched();
        }

        /*
         * Handle the folio containing lstart.  same_folio starts as a
         * page-granular guess and is refined once the folio's real extent is
         * known.  If the partial truncate reports failure (false), the range
         * for the second pass is narrowed so that this folio is skipped.
         */
        same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
        folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
        if (!IS_ERR(folio)) {
                same_folio = lend < folio_pos(folio) + folio_size(folio);
                if (!truncate_inode_partial_folio(folio, lstart, lend)) {
                        start = folio_next_index(folio);
                        if (same_folio)
                                end = folio->index;
                }
                folio_unlock(folio);
                folio_put(folio);
                folio = NULL;
        }

        /* Handle the folio containing lend, unless it was the one above */
        if (!same_folio) {
                folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
                                                FGP_LOCK, 0);
                if (!IS_ERR(folio)) {
                        if (!truncate_inode_partial_folio(folio, lstart, lend))
                                end = folio->index;
                        folio_unlock(folio);
                        folio_put(folio);
                }
        }

        /*
         * Second pass: each remaining folio is locked with folio_lock(),
         * writeback is waited for, and the folio is truncated.  The scan
         * restarts from 'start' until a lookup from 'start' finds nothing.
         */
        index = start;
        while (index < end) {
                cond_resched();
                if (!find_get_entries(mapping, &index, end - 1, &fbatch,
                                indices)) {
                        /* If all gone from start onwards, we're done */
                        if (index == start)
                                break;
                        /* Otherwise restart to make sure all gone */
                        index = start;
                        continue;
                }

                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];

                        /* We rely upon deletion not changing page->index */

                        /* Value entries are removed after this loop */
                        if (xa_is_value(folio))
                                continue;

                        folio_lock(folio);
                        VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
                        folio_wait_writeback(folio);
                        truncate_inode_folio(mapping, folio);
                        folio_unlock(folio);
                }
                truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
                folio_batch_release(&fbatch);
        }
}
EXPORT_SYMBOL(truncate_inode_pages_range);

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_rwsem and
 * mapping->invalidate_lock.
 *
 * Note: a page may still be in the process of deletion (inside
 * __filemap_remove_folio()) in the specified range when this function
 * returns.  Consequently mapping->nrpages can be non-zero on return even
 * after truncation of the whole mapping.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
        /* An end offset of -1 asks the range variant to go to end-of-file */
        const loff_t to_eof = (loff_t)-1;

        truncate_inode_pages_range(mapping, lstart, to_eof);
}
EXPORT_SYMBOL(truncate_inode_pages);

/**
 * truncate_inode_pages_final - truncate *all* pages before inode dies
 * @mapping: mapping to truncate
 *
 * Called under (and serialized by) inode->i_rwsem.
 *
 * Filesystems have to use this in the .evict_inode path to inform the
 * VM that this is the final truncate and the inode is going away.
 *
 * The mapping is marked as exiting first, then the i_pages lock is cycled
 * if the mapping is not empty, and finally everything from offset 0 is
 * truncated.
 */
void truncate_inode_pages_final(struct address_space *mapping)
{
        /*
         * Page reclaim can not participate in regular inode lifetime
         * management (can't call iput()) and thus can race with the
         * inode teardown.  Tell it when the address space is exiting,
         * so that it does not install eviction information after the
         * final truncate has begun.
         */
        mapping_set_exiting(mapping);

        if (!mapping_empty(mapping)) {
                /*
                 * As truncation uses a lockless tree lookup, cycle
                 * the tree lock to make sure any ongoing tree
                 * modification that does not see AS_EXITING is
                 * completed before starting the final truncate.
                 */
                xa_lock_irq(&mapping->i_pages);
                xa_unlock_irq(&mapping->i_pages);
        }

        /* Truncate the whole mapping, starting at offset 0 */
        truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);

/**
 * mapping_try_invalidate - Invalidate all the evictable folios of one inode
 * @mapping: the address_space which holds the folios to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 * @nr_failed: How many folio invalidations failed
 *
 * This function is similar to invalidate_mapping_pages(), except that it
 * returns the number of folios which could not be evicted in @nr_failed.
 */
unsigned long mapping_try_invalidate(struct address_space *mapping,
                pgoff_t start, pgoff_t end, unsigned long *nr_failed)
{
        pgoff_t indices[PAGEVEC_SIZE];
        struct folio_batch fbatch;
        unsigned long nr_invalidated = 0;
        unsigned long evicted;
        pgoff_t next = start;
        int slot;

        folio_batch_init(&fbatch);
        while (find_lock_entries(mapping, &next, end, &fbatch, indices)) {
                for (slot = 0; slot < folio_batch_count(&fbatch); slot++) {
                        struct folio *folio = fbatch.folios[slot];

                        /* We rely upon deletion not changing folio->index */

                        /* Value entries are not folios; handle them apart. */
                        if (xa_is_value(folio)) {
                                nr_invalidated += invalidate_exceptional_entry(
                                                mapping, indices[slot], folio);
                                continue;
                        }

                        evicted = mapping_evict_folio(mapping, folio);
                        folio_unlock(folio);
                        if (evicted) {
                                nr_invalidated += evicted;
                                continue;
                        }

                        /*
                         * Eviction failed.  Invalidation still signals that
                         * the folio is no longer of interest, so try to
                         * speed up its reclaim.
                         */
                        deactivate_file_folio(folio);
                        /* Likely in the lru cache of a remote CPU */
                        if (nr_failed)
                                (*nr_failed)++;
                }
                folio_batch_remove_exceptionals(&fbatch);
                folio_batch_release(&fbatch);
                cond_resched();
        }

        return nr_invalidated;
}

/**
 * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode
 * @mapping: the address_space which holds the cache to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function removes pages that are clean, unmapped and unlocked,
 * as well as shadow entries. It will not block on IO activity.
 *
 * If you want to remove all the pages of one inode, regardless of
 * their use and writeback state, use truncate_inode_pages().
 *
 * Return: The number of indices that had their contents invalidated
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t end)
{
        unsigned long nr_invalidated;

        /* The caller does not want the failure count, so pass NULL. */
        nr_invalidated = mapping_try_invalidate(mapping, start, end, NULL);

        return nr_invalidated;
}
EXPORT_SYMBOL(invalidate_mapping_pages);

/*
 * This is like mapping_evict_folio(), except it ignores the folio's
 * refcount.  We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave folios behind because
 * shrink_page_list() has a temp ref on them, or because they're transiently
 * sitting in the folio_add_lru() caches.
 */
static int invalidate_complete_folio2(struct address_space *mapping,
                                        struct folio *folio)
{
        if (folio->mapping != mapping)
                return 0;

        if (!filemap_release_folio(folio, GFP_KERNEL))
                return 0;

        spin_lock(&mapping->host->i_lock);
        xa_lock_irq(&mapping->i_pages);
        if (folio_test_dirty(folio))
                goto failed;

        BUG_ON(folio_has_private(folio));
        __filemap_remove_folio(folio, NULL);
        xa_unlock_irq(&mapping->i_pages);
        if (mapping_shrinkable(mapping))
                inode_add_lru(mapping->host);
        spin_unlock(&mapping->host->i_lock);

        filemap_free_folio(mapping, folio);
        return 1;
failed:
        xa_unlock_irq(&mapping->i_pages);
        spin_unlock(&mapping->host->i_lock);
        return 0;
}

/*
 * Call the mapping's ->launder_folio() on @folio, but only when the folio
 * is dirty, still belongs to @mapping, and the operation is implemented.
 * Returns 0 when there was nothing to do, otherwise the callback's result.
 */
static int folio_launder(struct address_space *mapping, struct folio *folio)
{
        if (folio_test_dirty(folio) &&
            folio->mapping == mapping &&
            mapping->a_ops->launder_folio != NULL)
                return mapping->a_ops->launder_folio(folio);

        return 0;
}

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Return: -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
                                  pgoff_t start, pgoff_t end)
{
        pgoff_t indices[PAGEVEC_SIZE];
        struct folio_batch fbatch;
        pgoff_t index;
        int i;
        int ret = 0;            /* sticky result: last error seen, else 0 */
        int ret2 = 0;           /* per-folio result */
        int did_range_unmap = 0;        /* bulk unmap is done at most once */

        /* Nothing cached, nothing to invalidate. */
        if (mapping_empty(mapping))
                return 0;

        folio_batch_init(&fbatch);
        index = start;
        /* Walk [start, end] one batch of entries at a time. */
        while (find_get_entries(mapping, &index, end, &fbatch, indices)) {
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];

                        /* We rely upon deletion not changing folio->index */

                        /* Value entries are not folios; handle them apart. */
                        if (xa_is_value(folio)) {
                                if (!invalidate_exceptional_entry2(mapping,
                                                indices[i], folio))
                                        ret = -EBUSY;
                                continue;
                        }

                        if (!did_range_unmap && folio_mapped(folio)) {
                                /*
                                 * If folio is mapped, before taking its lock,
                                 * zap the rest of the file in one hit.
                                 */
                                unmap_mapping_pages(mapping, indices[i],
                                                (1 + end - indices[i]), false);
                                did_range_unmap = 1;
                        }

                        folio_lock(folio);
                        /* Skip folios that left this mapping meanwhile. */
                        if (unlikely(folio->mapping != mapping)) {
                                folio_unlock(folio);
                                continue;
                        }
                        VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
                        folio_wait_writeback(folio);

                        /* The folio must be unmapped before invalidation. */
                        if (folio_mapped(folio))
                                unmap_mapping_folio(folio);
                        BUG_ON(folio_mapped(folio));

                        /*
                         * Launder first; only when that reports 0 attempt the
                         * actual removal, which maps failure to -EBUSY.
                         */
                        ret2 = folio_launder(mapping, folio);
                        if (ret2 == 0) {
                                if (!invalidate_complete_folio2(mapping, folio))
                                        ret2 = -EBUSY;
                        }
                        /* Remember the error but keep processing the range. */
                        if (ret2 < 0)
                                ret = ret2;
                        folio_unlock(folio);
                }
                folio_batch_remove_exceptionals(&fbatch);
                folio_batch_release(&fbatch);
                cond_resched();
        }
        /*
         * For DAX we invalidate page tables after invalidating page cache.  We
         * could invalidate page tables while invalidating each entry however
         * that would be expensive. And doing range unmapping before doesn't
         * work as we have no cheap way to find whether page cache entry didn't
         * get remapped later.
         */
        if (dax_mapping(mapping)) {
                unmap_mapping_pages(mapping, start, end - start + 1, false);
        }
        return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Return: -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
        return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);

/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @newsize: new file size
 *
 * inode's new i_size must already be written before truncate_pagecache
 * is called.
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t newsize)
{
        /* The hole to unmap starts at the first page boundary >= newsize. */
        loff_t hole_start = round_up(newsize, PAGE_SIZE);
        struct address_space *mapping = inode->i_mapping;

        /*
         * The mapping is unmapped twice.  The first pass is purely an
         * optimisation so that truncate_inode_pages has fewer single-page
         * unmaps to do.  Between that pass and the end of
         * truncate_inode_pages, private pages can still be COWed and would
         * survive the truncation, so the second pass is required for
         * correctness.
         */
        unmap_mapping_range(mapping, hole_start, 0, 1);
        truncate_inode_pages(mapping, newsize);
        unmap_mapping_range(mapping, hole_start, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * truncate_setsize updates i_size and performs pagecache truncation (if
 * necessary) to @newsize. It will typically be called from the filesystem's
 * setattr function when ATTR_SIZE is passed in.
 *
 * Must be called with a lock serializing truncates and writes (generally
 * i_rwsem but e.g. xfs uses a different lock) and before all filesystem
 * specific block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
        /* Sample the old size before it is overwritten below. */
        const loff_t prev_size = inode->i_size;

        i_size_write(inode, newsize);

        /* Only a growing file needs the isize-extension handling. */
        if (prev_size < newsize)
                pagecache_isize_extended(inode, prev_size, newsize);

        truncate_pagecache(inode, newsize);
}
EXPORT_SYMBOL(truncate_setsize);

/**
 * pagecache_isize_extended - update pagecache after extension of i_size
 * @inode:        inode for which i_size was extended
 * @from:        original inode size
 * @to:                new inode size
 *
 * Handle extension of inode size either caused by extending truncate or
 * by write starting after current i_size.  We mark the page straddling
 * current i_size RO so that page_mkwrite() is called on the first
 * write access to the page.  The filesystem will update its per-block
 * information before user writes to the page via mmap after the i_size
 * has been changed.
 *
 * The function must be called after i_size is updated so that page fault
 * coming after we unlock the folio will already see the new i_size.
 * The function must be called while we still hold i_rwsem - this not only
 * makes sure i_size is stable but also that userspace cannot observe new
 * i_size value before we are prepared to store mmap writes at new inode size.
 */
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
{
        int blocksize = i_blocksize(inode);
        struct folio *folio;
        loff_t block_end;

        WARN_ON(to > inode->i_size);

        /* No extension, or blocks are at least page sized: nothing to do. */
        if (from >= to || blocksize >= PAGE_SIZE)
                return;

        /* Page straddling @from will not have any hole block created? */
        block_end = round_up(from, blocksize);
        if (to <= block_end)
                return;
        if ((block_end & (PAGE_SIZE - 1)) == 0)
                return;

        folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE);
        if (!IS_ERR(folio)) {
                /*
                 * See folio_clear_dirty_for_io() for details why
                 * folio_mark_dirty() is needed.
                 */
                if (folio_mkclean(folio))
                        folio_mark_dirty(folio);
                folio_unlock(folio);
                folio_put(folio);
        }
        /* else: folio not cached, nothing to do */
}
EXPORT_SYMBOL(pagecache_isize_extended);

/**
 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
 * @inode: inode
 * @lstart: offset of beginning of hole
 * @lend: offset of last byte of hole
 *
 * This function should typically be called before the filesystem
 * releases resources associated with the freed range (eg. deallocates
 * blocks). This way, pagecache will always stay logically coherent
 * with on-disk format, and the filesystem would not have to deal with
 * situations such as writepage being called for a page that has already
 * had its underlying blocks deallocated.
 */
void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
{
        /*
         * Shrink the hole inwards to whole pages.  This rounding is currently
         * just for example: unmap_mapping_range expands its hole outwards,
         * whereas we want it contracted, but existing callers already do
         * their own page rounding.  Note that unmap_mapping_range allows
         * holelen 0 for all, and we allow lend -1 for end of file.
         */
        loff_t first_byte = round_up(lstart, PAGE_SIZE);
        loff_t last_byte = round_down(1 + lend, PAGE_SIZE) - 1;
        struct address_space *mapping = inode->i_mapping;

        /*
         * In contrast to truncate_pagecache, the range is unmapped only once,
         * before the pagecache is truncated, and without the "even_cows"
         * flag: hole-punching should not remove private COWed pages from the
         * hole.
         */
        if ((u64)last_byte > (u64)first_byte) {
                loff_t hole_len = 1 + last_byte - first_byte;

                unmap_mapping_range(mapping, first_byte, hole_len, 0);
        }

        truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL(truncate_pagecache_range);
































    1 




























    1 










    1 
    1 



























































































































    1 



    1 





    1 




    1 




















    1 


    1 






    1 










    1 





    1 











    1 










    1 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
#include <asm/shmparam.h>

#include "memmap.h"
#include "kbuf.h"

/*
 * Try to back @size bytes with one physically contiguous allocation and
 * record each of its @nr_pages constituent pages in @pages.
 *
 * Returns the kernel address of the first page, or ERR_PTR(-ENOMEM) when
 * the required order exceeds MAX_PAGE_ORDER or the allocation fails.
 */
static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
                                   size_t size, gfp_t gfp)
{
        int order = get_order(size);
        struct page *head;
        int idx;

        if (order > MAX_PAGE_ORDER)
                return ERR_PTR(-ENOMEM);

        /* Multi-page allocations are requested as compound pages. */
        if (order)
                gfp |= __GFP_COMP;

        head = alloc_pages(gfp, order);
        if (!head)
                return ERR_PTR(-ENOMEM);

        for (idx = 0; idx < nr_pages; idx++)
                pages[idx] = &head[idx];

        return page_address(head);
}

/*
 * Allocate @nr_pages individual pages into @pages and stitch them together
 * with vmap().
 *
 * Returns the vmap()ed address, or ERR_PTR(-ENOMEM) after dropping every
 * page allocated so far if a page allocation or the vmap() fails.
 */
static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
                                 gfp_t gfp)
{
        int nr_allocated;
        void *vaddr;

        for (nr_allocated = 0; nr_allocated < nr_pages; nr_allocated++) {
                pages[nr_allocated] = alloc_page(gfp);
                if (!pages[nr_allocated])
                        goto free_pages;
        }

        vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
        if (vaddr)
                return vaddr;

free_pages:
        /* Release in reverse order exactly the pages that were obtained. */
        while (nr_allocated--)
                put_page(pages[nr_allocated]);
        return ERR_PTR(-ENOMEM);
}

/*
 * Allocate zeroed, accounted kernel memory covering @size bytes, together
 * with the array of pages backing it.
 *
 * A single contiguous allocation is tried first; if that fails the pages
 * are allocated one by one and vmap()ed.  On success the page array and
 * its length are stored in @out_pages and @npages and the kernel address
 * is returned.  On failure both outputs are cleared and an ERR_PTR is
 * returned.
 */
void *io_pages_map(struct page ***out_pages, unsigned short *npages,
                   size_t size)
{
        gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
        int nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        struct page **pages;
        void *addr;

        pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
        if (!pages)
                return ERR_PTR(-ENOMEM);

        addr = io_mem_alloc_compound(pages, nr_pages, size, gfp);
        if (IS_ERR(addr))
                addr = io_mem_alloc_single(pages, nr_pages, size, gfp);

        if (IS_ERR(addr)) {
                /* Both strategies failed: leave the outputs empty. */
                kvfree(pages);
                *out_pages = NULL;
                *npages = 0;
                return addr;
        }

        *out_pages = pages;
        *npages = nr_pages;
        return addr;
}

/*
 * Tear down a mapping described by @ptr, @pages and @npages.
 *
 * When @put_pages is set the page references are dropped as well, and a
 * vmap()ed mapping is vunmap()ed.  The page array is always freed and the
 * outputs are reset.  A NULL @ptr makes the call a no-op.
 */
void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
                    bool put_pages)
{
        bool vmapped = false;

        if (!ptr)
                return;

        if (put_pages && *npages) {
                struct page **to_free = *pages;
                unsigned short nr_to_put = *npages;
                int idx;

                /*
                 * Only did vmap for the non-compound multiple page case.
                 * For the compound page, we just need to put the head.
                 */
                if (PageCompound(to_free[0]))
                        nr_to_put = 1;
                else if (nr_to_put > 1)
                        vmapped = true;

                for (idx = 0; idx < nr_to_put; idx++)
                        put_page(to_free[idx]);
        }

        if (vmapped)
                vunmap(ptr);

        kvfree(*pages);
        *pages = NULL;
        *npages = 0;
}

/*
 * Unpin the @npages user pages held in *@pages, free the array itself and
 * clear the caller's pointer.  Does nothing when *@pages is already NULL.
 */
void io_pages_free(struct page ***pages, int npages)
{
        struct page **pinned = *pages;

        if (pinned) {
                unpin_user_pages(pinned, npages);
                kvfree(pinned);
                *pages = NULL;
        }
}

struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
{
        unsigned long start, end, nr_pages;
        struct page **pages;
        int ret;

        end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        start = uaddr >> PAGE_SHIFT;
        nr_pages = end - start;
        if (WARN_ON_ONCE(!nr_pages))
                return ERR_PTR(-EINVAL);

        pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
        if (!pages)
                return ERR_PTR(-ENOMEM);

        ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
                                        pages);
        /* success, mapped all pages */
        if (ret == nr_pages) {
                *npages = nr_pages;
                return pages;
        }

        /* partial map, or didn't map anything */
        if (ret >= 0) {
                /* if we did partial map, release any pages we did get */
                if (ret)
                        unpin_user_pages(pages, ret);
                ret = -EFAULT;
        }
        kvfree(pages);
        return ERR_PTR(ret);
}

/*
 * __io_uaddr_map - pin a user memory range and map it into the kernel
 * @pages:  on success, set to the array of pinned pages
 * @npages: always reset to 0 first; on success, set to the array length
 * @uaddr:  page-aligned user address of the range
 * @size:   non-zero length of the range in bytes
 *
 * Return: the vmap()ed kernel address on success.  Otherwise an ERR_PTR:
 * -EINVAL for an unaligned @uaddr or zero @size, the error from
 * io_pin_pages(), -EOVERFLOW if the page count does not fit in *@npages,
 * or -ENOMEM if vmap() fails.  Pinned pages are released on failure.
 */
void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
                     unsigned long uaddr, size_t size)
{
        struct page **page_array;
        /* io_pin_pages() reports the count through an int pointer. */
        int nr_pages = 0;
        void *page_addr;

        *npages = 0;

        if (uaddr & (PAGE_SIZE - 1) || !size)
                return ERR_PTR(-EINVAL);

        page_array = io_pin_pages(uaddr, size, &nr_pages);
        if (IS_ERR(page_array))
                return page_array;

        /*
         * *npages is an unsigned short.  Storing a larger count would
         * silently truncate it, and a later teardown would then work on
         * the wrong number of pages.
         */
        if (nr_pages > USHRT_MAX) {
                io_pages_free(&page_array, nr_pages);
                return ERR_PTR(-EOVERFLOW);
        }

        page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
        if (page_addr) {
                *pages = page_array;
                *npages = nr_pages;
                return page_addr;
        }

        io_pages_free(&page_array, nr_pages);
        return ERR_PTR(-ENOMEM);
}

/*
 * Translate an mmap page offset on an io_uring file into the kernel
 * address of the region it selects.
 *
 * Returns ctx->rings for the SQ/CQ ring offsets, ctx->sq_sqes for the SQE
 * offset, or the buffer ring of the buffer group encoded in the offset.
 * Returns ERR_PTR(-EINVAL) for an unknown offset, or for a ring/SQE offset
 * when the ring was set up with IORING_SETUP_NO_MMAP; a failed buffer list
 * lookup is passed through as returned.
 */
static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
                                            size_t sz)
{
        struct io_ring_ctx *ctx = file->private_data;
        loff_t offset = pgoff << PAGE_SHIFT;
        /* Don't allow mmap of the rings if the ring was setup without it */
        bool no_mmap = ctx->flags & IORING_SETUP_NO_MMAP;

        switch (offset & IORING_OFF_MMAP_MASK) {
        case IORING_OFF_SQ_RING:
        case IORING_OFF_CQ_RING:
                return no_mmap ? ERR_PTR(-EINVAL) : ctx->rings;
        case IORING_OFF_SQES:
                return no_mmap ? ERR_PTR(-EINVAL) : ctx->sq_sqes;
        case IORING_OFF_PBUF_RING: {
                /* The buffer group id lives in the bits above the mask. */
                unsigned int bgid = (offset & ~IORING_OFF_MMAP_MASK) >>
                                        IORING_OFF_PBUF_SHIFT;
                struct io_buffer_list *bl = io_pbuf_get_bl(ctx, bgid);
                void *ring;

                if (IS_ERR(bl))
                        return bl;
                /* Read the address before dropping the list reference. */
                ring = bl->buf_ring;
                io_put_bl(ctx, bl);
                return ring;
                }
        default:
                return ERR_PTR(-EINVAL);
        }
}

/*
 * Insert @npages pages from @pages into @vma starting at vma->vm_start,
 * after marking the VMA VM_DONTEXPAND.  Returns the vm_insert_pages()
 * result.
 */
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
                        struct page **pages, int npages)
{
        /* vm_insert_pages() takes the count by reference. */
        unsigned long count = npages;
        int err;

        vm_flags_set(vma, VM_DONTEXPAND);
        err = vm_insert_pages(vma, vma->vm_start, pages, &count);

        return err;
}

#ifdef CONFIG_MMU

/*
 * mmap handler for io_uring files (MMU build).  The request is validated
 * first, then the pages of the region selected by the VMA's page offset
 * are inserted into the VMA.  Returns 0 or a negative errno.
 */
__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct io_ring_ctx *ctx = file->private_data;
        size_t sz = vma->vm_end - vma->vm_start;
        long offset = vma->vm_pgoff << PAGE_SHIFT;
        unsigned int npages;
        void *ptr;

        ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
        if (IS_ERR(ptr))
                return PTR_ERR(ptr);

        switch (offset & IORING_OFF_MMAP_MASK) {
        case IORING_OFF_SQ_RING:
        case IORING_OFF_CQ_RING:
                /* Map no more ring pages than the VMA can hold. */
                npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT);
                return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages);
        case IORING_OFF_SQES:
                return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
                                                ctx->n_sqe_pages);
        case IORING_OFF_PBUF_RING:
                return io_pbuf_mmap(file, vma);
        default:
                return -EINVAL;
        }
}

/*
 * get_unmapped_area handler for io_uring files (MMU build).  Rejects a
 * caller-supplied address, validates the request, and asks
 * mm_get_unmapped_area() for a suitable shared mapping address.
 */
unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
                                         unsigned long len, unsigned long pgoff,
                                         unsigned long flags)
{
        void *ptr;

        /*
         * Do not allow to map to user-provided address to avoid breaking the
         * aliasing rules. Userspace is not able to guess the offset address of
         * kernel kmalloc()ed memory area.
         */
        if (addr)
                return -EINVAL;

        ptr = io_uring_validate_mmap_request(filp, pgoff, len);
        if (IS_ERR(ptr))
                return -ENOMEM;

        /*
         * Some architectures have strong cache aliasing requirements.
         * For such architectures we need a coherent mapping which aliases
         * kernel memory *and* userspace memory. To achieve that:
         * - use a NULL file pointer to reference physical memory, and
         * - use the kernel virtual address of the shared io_uring context
         *   (instead of the userspace-provided address, which has to be 0UL
         *   anyway).
         * - use the same pgoff which the get_unmapped_area() uses to
         *   calculate the page colouring.
         * For architectures without such aliasing requirements, the
         * architecture will return any suitable mapping because addr is 0.
         */
#ifdef SHM_COLOUR
        addr = (uintptr_t) ptr;
        pgoff = addr >> PAGE_SHIFT;
#else
        addr = 0UL;
        pgoff = 0;        /* has been translated to ptr above */
#endif
        return mm_get_unmapped_area(current->mm, NULL, addr, len, pgoff,
                                    flags | MAP_SHARED);
}

#else /* !CONFIG_MMU */

/*
 * mmap handler for io_uring files (!CONFIG_MMU build): accept the request
 * only when is_nommu_shared_mapping() approves the VMA flags.
 */
int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
        if (is_nommu_shared_mapping(vma->vm_flags))
                return 0;

        return -EINVAL;
}

/*
 * Report the mmap capabilities of an io_uring file on !CONFIG_MMU builds:
 * direct mapping with read and write access.
 */
unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
        unsigned int caps = NOMMU_MAP_DIRECT;

        caps |= NOMMU_MAP_READ;
        caps |= NOMMU_MAP_WRITE;

        return caps;
}

/*
 * get_unmapped_area handler for io_uring files (!CONFIG_MMU build): the
 * validated kernel address of the region is returned directly, or the
 * validation error converted to unsigned long.
 */
unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
                                         unsigned long len, unsigned long pgoff,
                                         unsigned long flags)
{
        void *region = io_uring_validate_mmap_request(file, pgoff, len);

        return IS_ERR(region) ? PTR_ERR(region) : (unsigned long) region;
}

#endif /* !CONFIG_MMU */

































    2 
























































   10 





   11 
   10 

    7 
    4 










   11 


   10 






























































































































































































































    7 

























































































































    6 




   24 








   23 












    7 
    6 



    7 









    7 









    7 




















    7 


















    3 




    2 











    3 

    2 





    2 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
 * Authors: David Chinner and Glauber Costa
 *
 * Generic LRU infrastructure
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_lru.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/memcontrol.h>
#include "slab.h"
#include "internal.h"

#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(memcg_list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);

/* Report whether @lru is memcg aware, as recorded in lru->memcg_aware. */
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
        return lru->memcg_aware;
}

/*
 * Add a memcg-aware @lru to the global memcg_list_lrus list, under
 * list_lrus_mutex.  LRUs that are not memcg aware are not tracked.
 */
static void list_lru_register(struct list_lru *lru)
{
        if (list_lru_memcg_aware(lru)) {
                mutex_lock(&list_lrus_mutex);
                list_add(&lru->list, &memcg_list_lrus);
                mutex_unlock(&list_lrus_mutex);
        }
}

/*
 * Remove a memcg-aware @lru from the global list it was registered on,
 * under list_lrus_mutex.  LRUs that are not memcg aware were never added.
 */
static void list_lru_unregister(struct list_lru *lru)
{
        if (list_lru_memcg_aware(lru)) {
                mutex_lock(&list_lrus_mutex);
                list_del(&lru->list);
                mutex_unlock(&list_lrus_mutex);
        }
}

/* Return the shrinker id stored in @lru (lru->shrinker_id). */
static int lru_shrinker_id(struct list_lru *lru)
{
        return lru->shrinker_id;
}

/*
 * Pick the list for node @nid and memcg index @idx.
 *
 * For a memcg-aware @lru and a non-negative @idx this is the per-memcg
 * list stored in lru->xa, or NULL when nothing is stored at @idx.
 * Otherwise it is the node's own list.
 */
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
        struct list_lru_memcg *mlru;

        if (!list_lru_memcg_aware(lru) || idx < 0)
                return &lru->node[nid].lru;

        mlru = xa_load(&lru->xa, idx);
        if (!mlru)
                return NULL;

        return &mlru->node[nid];
}
#else
/* !CONFIG_MEMCG_KMEM: LRUs are not tracked on a global list; no-op. */
static void list_lru_register(struct list_lru *lru)
{
}

/* !CONFIG_MEMCG_KMEM: nothing was registered, so nothing to undo; no-op. */
static void list_lru_unregister(struct list_lru *lru)
{
}

/* !CONFIG_MEMCG_KMEM: there is no shrinker id to report; always -1. */
static int lru_shrinker_id(struct list_lru *lru)
{
        const int no_shrinker_id = -1;

        return no_shrinker_id;
}

/* !CONFIG_MEMCG_KMEM: no list_lru is ever memcg aware. */
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
        const bool memcg_aware = false;

        return memcg_aware;
}

/*
 * !CONFIG_MEMCG_KMEM: @idx is ignored and the list embedded in the
 * per-node structure for @nid is always returned.
 */
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
        return &lru->node[nid].lru;
}
#endif /* CONFIG_MEMCG_KMEM */

/*
 * Add @item to the tail of the list selected by @nid and @memcg, under the
 * node's lock.
 *
 * Returns true if the item was added, false if it was already linked on
 * some list (its list_head was not empty).
 */
bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
                    struct mem_cgroup *memcg)
{
        struct list_lru_node *nlru = &lru->node[nid];
        struct list_lru_one *l;

        spin_lock(&nlru->lock);
        if (!list_empty(item)) {
                spin_unlock(&nlru->lock);
                return false;
        }

        l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
        list_add_tail(item, &l->list);
        /* Set shrinker bit if the first element was added */
        if (!l->nr_items++)
                set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
        nlru->nr_items++;
        spin_unlock(&nlru->lock);

        return true;
}
EXPORT_SYMBOL_GPL(list_lru_add);

/*
 * Convenience wrapper around list_lru_add(): the node id is derived from the
 * page backing @item and, for a memcg-aware lru, the memcg is looked up with
 * mem_cgroup_from_slab_obj(). Otherwise a NULL memcg is passed.
 */
bool list_lru_add_obj(struct list_lru *lru, struct list_head *item)
{
        int nid = page_to_nid(virt_to_page(item));
        struct mem_cgroup *memcg = NULL;

        if (list_lru_memcg_aware(lru))
                memcg = mem_cgroup_from_slab_obj(item);

        return list_lru_add(lru, item, nid, memcg);
}
EXPORT_SYMBOL_GPL(list_lru_add_obj);

/*
 * Unlink @item from the list selected by @nid and @memcg if it is currently
 * linked (!list_empty(item)), adjusting both the per-list and the per-node
 * item counts. The work is done under lru->node[nid].lock.
 *
 * Return: true if the item was removed, false if it was not on a list.
 */
bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
                    struct mem_cgroup *memcg)
{
        struct list_lru_node *nlru = &lru->node[nid];
        struct list_lru_one *l;
        bool removed = false;

        spin_lock(&nlru->lock);
        if (list_empty(item))
                goto unlock;

        l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
        list_del_init(item);
        l->nr_items--;
        nlru->nr_items--;
        removed = true;
unlock:
        spin_unlock(&nlru->lock);
        return removed;
}
EXPORT_SYMBOL_GPL(list_lru_del);

/*
 * Convenience wrapper around list_lru_del(): the node id is derived from the
 * page backing @item and, for a memcg-aware lru, the memcg is looked up with
 * mem_cgroup_from_slab_obj(). Otherwise a NULL memcg is passed.
 */
bool list_lru_del_obj(struct list_lru *lru, struct list_head *item)
{
        int nid = page_to_nid(virt_to_page(item));
        struct mem_cgroup *memcg = NULL;

        if (list_lru_memcg_aware(lru))
                memcg = mem_cgroup_from_slab_obj(item);

        return list_lru_del(lru, item, nid, memcg);
}
EXPORT_SYMBOL_GPL(list_lru_del_obj);

/*
 * Unlink @item (leaving it re-initialised) and drop the item count of @list
 * by one. No locking is done here.
 */
void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
{
        list_del_init(item);
        list->nr_items -= 1;
}
EXPORT_SYMBOL_GPL(list_lru_isolate);

/*
 * Move @item onto @head and drop the item count of @list by one. No locking
 * is done here.
 */
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
                           struct list_head *head)
{
        list_move(item, head);
        list->nr_items -= 1;
}
EXPORT_SYMBOL_GPL(list_lru_isolate_move);

/*
 * Report the number of items on the list selected by @nid and @memcg.
 *
 * The count is sampled with READ_ONCE() inside an RCU read-side section and
 * without taking the node lock. A missing list counts as 0, and a negative
 * sample is reported as 0 as well.
 */
unsigned long list_lru_count_one(struct list_lru *lru,
                                 int nid, struct mem_cgroup *memcg)
{
        struct list_lru_one *l;
        long count = 0;

        rcu_read_lock();
        l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
        if (l)
                count = READ_ONCE(l->nr_items);
        rcu_read_unlock();

        /* Never hand a negative value back to the caller. */
        return unlikely(count < 0) ? 0 : count;
}
EXPORT_SYMBOL_GPL(list_lru_count_one);

/* Report the per-node item total kept in lru->node[nid]; no lock is taken. */
unsigned long list_lru_count_node(struct list_lru *lru, int nid)
{
        return lru->node[nid].nr_items;
}
EXPORT_SYMBOL_GPL(list_lru_count_node);

/*
 * Walk the list selected by @nid and @memcg_idx, calling @isolate on each
 * item until *@nr_to_walk reaches zero, the list is exhausted, or the
 * callback returns LRU_STOP. The walk ends immediately if no list exists for
 * @memcg_idx.
 *
 * The callers in this file take lru->node[nid].lock around the call. The
 * callback is handed that lock; judging by the asserts below it must hold
 * the lock again on return whenever it reports a result that implies the
 * lock was dropped.
 *
 * *@nr_to_walk is decremented once per callback invocation.
 *
 * Return: number of items for which the callback reported LRU_REMOVED or
 * LRU_REMOVED_RETRY.
 */
static unsigned long
__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
                    list_lru_walk_cb isolate, void *cb_arg,
                    unsigned long *nr_to_walk)
{
        struct list_lru_node *nlru = &lru->node[nid];
        struct list_lru_one *l;
        struct list_head *item, *n;
        unsigned long isolated = 0;

restart:
        /* Looked up again on every restart, not cached across lock drops. */
        l = list_lru_from_memcg_idx(lru, nid, memcg_idx);
        if (!l)
                goto out;

        list_for_each_safe(item, n, &l->list) {
                enum lru_status ret;

                /*
                 * decrement nr_to_walk first so that we don't livelock if we
                 * get stuck on large numbers of LRU_RETRY items
                 */
                if (!*nr_to_walk)
                        break;
                --*nr_to_walk;

                ret = isolate(item, l, &nlru->lock, cb_arg);
                switch (ret) {
                case LRU_REMOVED_RETRY:
                        assert_spin_locked(&nlru->lock);
                        fallthrough;
                case LRU_REMOVED:
                        isolated++;
                        /*
                         * Only the per-node total is adjusted here.
                         * NOTE(review): l->nr_items is presumably dropped by
                         * the callback (e.g. via list_lru_isolate()) - confirm
                         * against the callers.
                         */
                        nlru->nr_items--;
                        /*
                         * If the lru lock has been dropped, our list
                         * traversal is now invalid and so we have to
                         * restart from scratch.
                         */
                        if (ret == LRU_REMOVED_RETRY)
                                goto restart;
                        break;
                case LRU_ROTATE:
                        /* Keep the item but move it to the tail of the list. */
                        list_move_tail(item, &l->list);
                        break;
                case LRU_SKIP:
                        break;
                case LRU_RETRY:
                        /*
                         * The lru lock has been dropped, our list traversal is
                         * now invalid and so we have to restart from scratch.
                         */
                        assert_spin_locked(&nlru->lock);
                        goto restart;
                case LRU_STOP:
                        assert_spin_locked(&nlru->lock);
                        goto out;
                default:
                        /* Any other return value is treated as a bug. */
                        BUG();
                }
        }
out:
        return isolated;
}

/*
 * Walk the list selected by @nid and @memcg with lru->node[nid].lock held
 * (plain spin_lock()). See __list_lru_walk_one() for the walk itself.
 *
 * Return: the value returned by __list_lru_walk_one().
 */
unsigned long
list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
                  list_lru_walk_cb isolate, void *cb_arg,
                  unsigned long *nr_to_walk)
{
        struct list_lru_node *nlru = &lru->node[nid];
        unsigned long nr_isolated;

        spin_lock(&nlru->lock);
        nr_isolated = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg),
                                          isolate, cb_arg, nr_to_walk);
        spin_unlock(&nlru->lock);

        return nr_isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_one);

/*
 * Same as list_lru_walk_one(), but the node lock is taken with
 * spin_lock_irq() and released with spin_unlock_irq().
 *
 * Return: the value returned by __list_lru_walk_one().
 */
unsigned long
list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
                      list_lru_walk_cb isolate, void *cb_arg,
                      unsigned long *nr_to_walk)
{
        struct list_lru_node *nlru = &lru->node[nid];
        unsigned long nr_isolated;

        spin_lock_irq(&nlru->lock);
        nr_isolated = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg),
                                          isolate, cb_arg, nr_to_walk);
        spin_unlock_irq(&nlru->lock);

        return nr_isolated;
}

/*
 * Walk node @nid of @lru: first with a NULL memcg, then, when
 * CONFIG_MEMCG_KMEM is set, the lru is memcg aware and budget is left,
 * every entry stored in lru->xa. The walk stops as soon as *@nr_to_walk
 * reaches zero.
 *
 * Return: the accumulated number of isolated items.
 */
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
                                 list_lru_walk_cb isolate, void *cb_arg,
                                 unsigned long *nr_to_walk)
{
        long isolated;

        isolated = list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
                                     nr_to_walk);

#ifdef CONFIG_MEMCG_KMEM
        if (*nr_to_walk && list_lru_memcg_aware(lru)) {
                struct list_lru_node *nlru = &lru->node[nid];
                struct list_lru_memcg *mlru;
                unsigned long index;

                xa_for_each(&lru->xa, index, mlru) {
                        /* The node lock is taken and dropped per entry. */
                        spin_lock(&nlru->lock);
                        isolated += __list_lru_walk_one(lru, nid, index,
                                                        isolate, cb_arg,
                                                        nr_to_walk);
                        spin_unlock(&nlru->lock);

                        if (!*nr_to_walk)
                                break;
                }
        }
#endif

        return isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);

/* Put @l into its empty state: zero item count and an empty list head. */
static void init_one_lru(struct list_lru_one *l)
{
        l->nr_items = 0;
        INIT_LIST_HEAD(&l->list);
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * Allocate a list_lru_memcg with nr_node_ids trailing per-node lists and
 * initialise the list of every node.
 *
 * Return: the new object, or NULL if the allocation failed.
 */
static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
{
        struct list_lru_memcg *mlru;
        int nid;

        mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp);
        if (mlru) {
                for_each_node(nid)
                        init_one_lru(&mlru->node[nid]);
        }

        return mlru;
}

/*
 * Remove the entry stored at @src_idx from lru->xa and free it, if there
 * was one.
 */
static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
{
        struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx);

        if (!mlru)
                return;

        /*
         * __list_lru_walk_one() may still be walking a list of this entry,
         * so the memory has to be released with kvfree_rcu(). The walk runs
         * under lru->node[nid]->lock, which can serve as an RCU read-side
         * critical section.
         */
        kvfree_rcu(mlru, rcu);
}

/*
 * Record whether @lru is memcg aware. For a memcg-aware lru the xarray is
 * initialised first, with XA_FLAGS_LOCK_IRQ.
 */
static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
        if (memcg_aware) {
                /* lru->xa is only set up when it is going to be used. */
                xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ);
        }
        lru->memcg_aware = memcg_aware;
}

/*
 * Free every entry still stored in lru->xa and clear its slot. Nothing is
 * done for an lru that is not memcg aware.
 */
static void memcg_destroy_list_lru(struct list_lru *lru)
{
        XA_STATE(xas, &lru->xa, 0);
        struct list_lru_memcg *mlru;

        if (list_lru_memcg_aware(lru)) {
                xas_lock_irq(&xas);
                xas_for_each(&xas, mlru, ULONG_MAX) {
                        kfree(mlru);
                        xas_store(&xas, NULL);
                }
                xas_unlock_irq(&xas);
        }
}

/*
 * On node @nid, splice the list stored for @src_idx onto the list of
 * @dst_memcg and transfer its item count. Nothing is moved when no list
 * exists for @src_idx.
 */
static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
                                         int src_idx, struct mem_cgroup *dst_memcg)
{
        struct list_lru_node *nlru = &lru->node[nid];
        int dst_idx = dst_memcg->kmemcg_id;
        struct list_lru_one *src, *dst;

        /*
         * list_lru_{add,del} may be called under an IRQ-safe lock, so the
         * IRQ-safe lock primitives are required here to avoid deadlock.
         */
        spin_lock_irq(&nlru->lock);

        src = list_lru_from_memcg_idx(lru, nid, src_idx);
        if (src) {
                dst = list_lru_from_memcg_idx(lru, nid, dst_idx);

                list_splice_init(&src->list, &dst->list);

                if (src->nr_items) {
                        dst->nr_items += src->nr_items;
                        set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
                        src->nr_items = 0;
                }
        }

        spin_unlock_irq(&nlru->lock);
}

/*
 * Reparent the lists of @src_idx to @dst_memcg on every node, then free the
 * entry that was stored for @src_idx.
 */
static void memcg_reparent_list_lru(struct list_lru *lru,
                                    int src_idx, struct mem_cgroup *dst_memcg)
{
        int nid;

        for_each_node(nid)
                memcg_reparent_list_lru_node(lru, nid, src_idx, dst_memcg);

        memcg_list_lru_free(lru, src_idx);
}

/*
 * Hand all list_lru entries of @memcg over to @parent, for every lru on
 * memcg_list_lrus.
 */
void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
        int src_idx = memcg->kmemcg_id;
        struct cgroup_subsys_state *css;
        struct list_lru *lru;

        /*
         * First switch the kmemcg_id of this cgroup and of all its
         * descendants to the parent's id, then move all entries from this
         * cgroup's list_lrus to those of the parent.
         *
         * Once that is done, all list_lrus corresponding to this cgroup are
         * guaranteed to remain empty, so this cgroup's list lrus can safely
         * be freed in memcg_list_lru_free().
         *
         * Pointing ->kmemcg_id at the parent also keeps
         * memcg_list_lru_alloc() from allocating list lrus for this cgroup
         * after the memcg_list_lru_free() call.
         */
        rcu_read_lock();
        css_for_each_descendant_pre(css, &memcg->css) {
                struct mem_cgroup *child = mem_cgroup_from_css(css);

                WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id);
        }
        rcu_read_unlock();

        /* src_idx was sampled before the ids were rewritten above. */
        mutex_lock(&list_lrus_mutex);
        list_for_each_entry(lru, &memcg_list_lrus, list)
                memcg_reparent_list_lru(lru, src_idx, parent);
        mutex_unlock(&list_lrus_mutex);
}

static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
                                            struct list_lru *lru)
{
        int idx = memcg->kmemcg_id;

        return idx < 0 || xa_load(&lru->xa, idx);
}

/*
 * Make sure @lru has a list_lru_memcg entry for @memcg and for each of its
 * ancestors, walking up until one is found that already has an entry.
 *
 * The entries are allocated up front without the xarray lock and are then
 * stored into lru->xa under xas_lock_irqsave().
 *
 * Return: 0 if nothing had to be done, -ENOMEM if the temporary table or
 * one of the entries could not be allocated, otherwise the final
 * xas_error() of the store loop.
 */
int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
                         gfp_t gfp)
{
        int i;
        unsigned long flags;
        /* One slot per memcg that still needs an entry, filled child first. */
        struct list_lru_memcg_table {
                struct list_lru_memcg *mlru;
                struct mem_cgroup *memcg;
        } *table;
        XA_STATE(xas, &lru->xa, 0);

        if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
                return 0;

        gfp &= GFP_RECLAIM_MASK;
        table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp);
        if (!table)
                return -ENOMEM;

        /*
         * Because the list_lru can be reparented to the parent cgroup's
         * list_lru, we should make sure that this cgroup and all its
         * ancestors have allocated list_lru_memcg.
         */
        for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) {
                if (memcg_list_lru_allocated(memcg, lru))
                        break;

                table[i].memcg = memcg;
                table[i].mlru = memcg_init_list_lru_one(gfp);
                if (!table[i].mlru) {
                        /* Undo: free the entries allocated so far. */
                        while (i--)
                                kfree(table[i].mlru);
                        kfree(table);
                        return -ENOMEM;
                }
        }

        xas_lock_irqsave(&xas, flags);
        /* Store in reverse order of allocation, i.e. topmost ancestor first. */
        while (i--) {
                int index = READ_ONCE(table[i].memcg->kmemcg_id);
                struct list_lru_memcg *mlru = table[i].mlru;

                xas_set(&xas, index);
retry:
                /*
                 * Negative id, a pending xas error, or a slot that is
                 * already populated: our copy is not needed, drop it.
                 */
                if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
                        kfree(mlru);
                } else {
                        xas_store(&xas, mlru);
                        if (xas_error(&xas) == -ENOMEM) {
                                /*
                                 * Call xas_nomem() with the lock dropped; if
                                 * it reports success, clear the error so the
                                 * store is attempted again.
                                 */
                                xas_unlock_irqrestore(&xas, flags);
                                if (xas_nomem(&xas, gfp))
                                        xas_set_err(&xas, 0);
                                xas_lock_irqsave(&xas, flags);
                                /*
                                 * The xas lock has been released, this memcg
                                 * can be reparented before us. So reload
                                 * memcg id. More details see the comments
                                 * in memcg_reparent_list_lrus().
                                 */
                                index = READ_ONCE(table[i].memcg->kmemcg_id);
                                if (index < 0)
                                        xas_set_err(&xas, 0);
                                else if (!xas_error(&xas) && index != xas.xa_index)
                                        xas_set(&xas, index);
                                goto retry;
                        }
                }
        }
        /* xas_nomem() is used to free memory instead of memory allocation. */
        if (xas.xa_alloc)
                xas_nomem(&xas, gfp);
        xas_unlock_irqrestore(&xas, flags);
        kfree(table);

        return xas_error(&xas);
}
#else
/* Stub used when CONFIG_MEMCG_KMEM is not set: nothing to initialise. */
static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
}

/* Stub used when CONFIG_MEMCG_KMEM is not set: nothing to tear down. */
static void memcg_destroy_list_lru(struct list_lru *lru)
{
}
#endif /* CONFIG_MEMCG_KMEM */

/*
 * Initialise @lru: allocate and set up one list_lru_node per possible node
 * id, initialise the memcg state and register the lru.
 *
 * @memcg_aware: requested memcg awareness; forced to false when
 *               mem_cgroup_kmem_disabled() says so (CONFIG_MEMCG_KMEM only).
 * @key:         optional lockdep class applied to every node lock.
 * @shrinker:    optional shrinker whose id is recorded in the lru
 *               (CONFIG_MEMCG_KMEM only; -1 is recorded when NULL).
 *
 * Return: 0 on success, -ENOMEM if the node array cannot be allocated.
 */
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
                    struct lock_class_key *key, struct shrinker *shrinker)
{
        int i;

#ifdef CONFIG_MEMCG_KMEM
        lru->shrinker_id = shrinker ? shrinker->id : -1;

        if (mem_cgroup_kmem_disabled())
                memcg_aware = false;
#endif

        lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
        if (!lru->node)
                return -ENOMEM;

        for_each_node(i) {
                spin_lock_init(&lru->node[i].lock);
                if (key)
                        lockdep_set_class(&lru->node[i].lock, key);
                init_one_lru(&lru->node[i].lru);
        }

        memcg_init_list_lru(lru, memcg_aware);
        list_lru_register(lru);

        return 0;
}
EXPORT_SYMBOL_GPL(__list_lru_init);

/*
 * Tear down @lru: unregister it, free the memcg entries and the node array.
 *
 * lru->node is reset to NULL afterwards, which makes a second call (or a
 * call on an lru whose node pointer is still NULL) a no-op.
 */
void list_lru_destroy(struct list_lru *lru)
{
        /* A NULL node array means there is nothing to destroy. */
        if (lru->node == NULL)
                return;

        list_lru_unregister(lru);
        memcg_destroy_list_lru(lru);

        kfree(lru->node);
        lru->node = NULL;

#ifdef CONFIG_MEMCG_KMEM
        lru->shrinker_id = -1;
#endif
}
EXPORT_SYMBOL_GPL(list_lru_destroy);


























































































































































































































































   37 


   33 








































































































































































































































































   36 


   35 
   38 
   35 


















   35 



   35 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/page_ext.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate.h>
#include <linux/pgalloc_tag.h>

/*
 * struct page extension
 *
 * This is the feature to manage memory for extended data per page.
 *
 * Until now, we had to modify struct page itself to store extra data per
 * page. This requires rebuilding the kernel, which is a really time-consuming
 * process. Sometimes a rebuild is even impossible due to third-party module
 * dependencies. Finally, enlarging struct page could cause unwanted changes
 * in system behaviour.
 *
 * This feature is intended to overcome above mentioned problems. This feature
 * allocates memory for extended data per page in certain place rather than
 * the struct page itself. This memory can be accessed by the accessor
 * functions provided by this code. During the boot process, it checks whether
 * allocation of huge chunk of memory is needed or not. If not, it avoids
 * allocating memory at all. With this advantage, we can include this feature
 * into the kernel in default and can avoid rebuild and solve related problems.
 *
 * To help these things to work well, there are two callbacks for clients. One
 * is the need callback which is mandatory if user wants to avoid useless
 * memory allocation at boot-time. The other is optional, init callback, which
 * is used to do proper initialization after memory is allocated.
 *
 * The need callback is used to decide whether extended memory allocation is
 * needed or not. Sometimes users want to deactivate some features in this
 * boot and extra memory would be unnecessary. In this case, to avoid
 * allocating a huge chunk of memory, each client expresses its need for
 * extra memory through the need callback. If one of the need callbacks
 * returns true, it means that someone needs extra memory, so the
 * page extension core should allocate memory for page extension. If
 * none of the need callbacks returns true, memory isn't needed at all in this
 * boot and the page extension core can skip allocating memory. As a result,
 * no memory is wasted.
 *
 * When need callback returns true, page_ext checks if there is a request for
 * extra memory through size in struct page_ext_operations. If it is non-zero,
 * extra space is allocated for each page_ext entry and offset is returned to
 * user through offset in struct page_ext_operations.
 *
 * The init callback is used to do proper initialization after page extension
 * is completely initialized. In sparse memory system, extra memory is
 * allocated some time later than memmap is allocated. In other words, lifetime
 * of memory for page extension isn't same with memmap for struct page.
 * Therefore, clients can't store extra data until page extension is
 * initialized, even if pages are allocated and used freely. This could
 * cause inadequate state of extra data per page, so, to prevent it, client
 * can utilize this callback to initialize the state of it correctly.
 */

#ifdef CONFIG_SPARSEMEM
/*
 * Value added to a section's page_ext pointer by __invalidate_page_ext() to
 * mark it invalid; page_ext_invalid() tests for it.
 */
#define PAGE_EXT_INVALID       (0x1)
#endif

#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
/* need callback of page_idle_ops: always asks for page_ext. */
static bool need_page_idle(void)
{
        return true;
}
/*
 * page_ext client compiled in for CONFIG_PAGE_IDLE_FLAG on !CONFIG_64BIT.
 * It requests no extra size of its own, only the shared struct page_ext
 * (need_shared_flags), see invoke_need_callbacks().
 */
static struct page_ext_operations page_idle_ops __initdata = {
        .need = need_page_idle,
        .need_shared_flags = true,
};
#endif

/*
 * All page_ext clients. Each entry is compiled in only when its config
 * option is enabled. invoke_need_callbacks() hands out offsets in array
 * order, and invoke_init_callbacks() runs the init hooks in the same order.
 */
static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_OWNER
        &page_owner_ops,
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
        &page_idle_ops,
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
        &page_alloc_tagging_ops,
#endif
#ifdef CONFIG_PAGE_TABLE_CHECK
        &page_table_check_ops,
#endif
};

/* Size in bytes of one page_ext entry; computed by invoke_need_callbacks(). */
unsigned long page_ext_size;

/* Running total, in bytes, of the page_ext tables allocated in this file. */
static unsigned long total_usage;

#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
/*
 * To ensure correct allocation tagging for pages, page_ext should be available
 * before the first page allocation. Otherwise early task stacks will be
 * allocated before page_ext initialization and missing tags will be flagged.
 */
bool early_page_ext __meminitdata = true;
#else
/* Defaults to false; set by the "early_page_ext" boot parameter. */
bool early_page_ext __meminitdata;
#endif
/*
 * Handler for the "early_page_ext" boot parameter: sets early_page_ext.
 * The parameter's value string (@str) is not looked at.
 */
static int __init setup_early_page_ext(char *str)
{
        early_page_ext = true;
        return 0;
}
early_param("early_page_ext", setup_early_page_ext);

/*
 * Ask every client whether it needs page_ext and lay out the entry.
 *
 * Pass 1: if any client that needs page_ext also wants the shared flags,
 * the entry starts with a struct page_ext. Pass 2: each client that needs
 * page_ext gets the current size as its offset and the entry grows by the
 * client's size.
 *
 * Return: true if at least one client needs page_ext.
 */
static bool __init invoke_need_callbacks(void)
{
        int entries = ARRAY_SIZE(page_ext_ops);
        bool need = false;
        int i;

        for (i = 0; i < entries; i++) {
                struct page_ext_operations *ops = page_ext_ops[i];

                if (ops->need() && ops->need_shared_flags) {
                        page_ext_size = sizeof(struct page_ext);
                        break;
                }
        }

        for (i = 0; i < entries; i++) {
                struct page_ext_operations *ops = page_ext_ops[i];

                if (!ops->need())
                        continue;

                ops->offset = page_ext_size;
                page_ext_size += ops->size;
                need = true;
        }

        return need;
}

/* Run the optional init hook of every client, in page_ext_ops[] order. */
static void __init invoke_init_callbacks(void)
{
        int entries = ARRAY_SIZE(page_ext_ops);
        int i;

        for (i = 0; i < entries; i++) {
                struct page_ext_operations *ops = page_ext_ops[i];

                if (!ops->init)
                        continue;
                ops->init();
        }
}

/* Return the entry that lies @index * page_ext_size bytes past @base. */
static inline struct page_ext *get_entry(void *base, unsigned long index)
{
        unsigned long offset = page_ext_size * index;

        return base + offset;
}

#ifndef CONFIG_SPARSEMEM
/* !CONFIG_SPARSEMEM: run the clients' init callbacks. */
void __init page_ext_init_flatmem_late(void)
{
        invoke_init_callbacks();
}

/* !CONFIG_SPARSEMEM: start @pgdat out with no page_ext table. */
void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
        pgdat->node_page_ext = NULL;
}

/*
 * !CONFIG_SPARSEMEM: find the page_ext entry of @page in its node's table.
 *
 * Warns once if called outside an RCU read-side section.
 *
 * Return: the entry, or NULL if the node has no table (yet).
 */
static struct page_ext *lookup_page_ext(const struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        unsigned long first_pfn;
        struct page_ext *base;

        WARN_ON_ONCE(!rcu_read_lock_held());
        base = NODE_DATA(page_to_nid(page))->node_page_ext;
        /*
         * The page allocator's sanity checks on freeing a page can get
         * here before the page_ext arrays exist, when a range of pages is
         * fed to the allocator for the first time during bootup or memory
         * hotplug.
         */
        if (unlikely(!base))
                return NULL;

        /* The table starts at the node start rounded down to MAX_ORDER. */
        first_pfn = round_down(node_start_pfn(page_to_nid(page)),
                               MAX_ORDER_NR_PAGES);
        index_done:
        return get_entry(base, pfn - first_pfn);
}

/*
 * !CONFIG_SPARSEMEM: allocate the page_ext table of node @nid from memblock
 * and account its size in total_usage. A node that spans no pages gets no
 * table.
 *
 * Return: 0 on success (or nothing to do), -ENOMEM if the allocation failed.
 */
static int __init alloc_node_page_ext(int nid)
{
        unsigned long nr_pages = NODE_DATA(nid)->node_spanned_pages;
        unsigned long table_size;
        struct page_ext *base;

        if (!nr_pages)
                return 0;

        /*
         * A node range that is not aligned to MAX_ORDER_NR_PAGES needs
         * extra room: when the buddy allocator checks a buddy's status it
         * may look outside the exact node range.
         */
        if (!(IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) &&
              IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)))
                nr_pages += MAX_ORDER_NR_PAGES;

        table_size = page_ext_size * nr_pages;

        base = memblock_alloc_try_nid(table_size, PAGE_SIZE,
                                      __pa(MAX_DMA_ADDRESS),
                                      MEMBLOCK_ALLOC_ACCESSIBLE, nid);
        if (!base)
                return -ENOMEM;

        NODE_DATA(nid)->node_page_ext = base;
        total_usage += table_size;
        return 0;
}

/*
 * !CONFIG_SPARSEMEM: allocate the page_ext table of every online node.
 *
 * Does nothing when no client needs page_ext. A failed allocation is
 * fatal: it is reported and the kernel panics.
 */
void __init page_ext_init_flatmem(void)
{
        int nid;

        if (!invoke_need_callbacks())
                return;

        for_each_online_node(nid) {
                if (alloc_node_page_ext(nid))
                        goto fail;
        }
        /* total_usage is an unsigned long, so it is printed with %lu. */
        pr_info("allocated %lu bytes of page_ext\n", total_usage);
        return;

fail:
        pr_crit("allocation of page_ext failed.\n");
        panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */
static bool page_ext_invalid(struct page_ext *page_ext)
{
        return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
}

/*
 * CONFIG_SPARSEMEM: find the page_ext entry of @page through its memory
 * section. The section's pointer is sampled once with READ_ONCE().
 *
 * Warns once if called outside an RCU read-side section.
 *
 * Return: the entry, or NULL if the section's page_ext is missing or marked
 * invalid.
 */
static struct page_ext *lookup_page_ext(const struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *ms = __pfn_to_section(pfn);
        struct page_ext *ext = READ_ONCE(ms->page_ext);

        WARN_ON_ONCE(!rcu_read_lock_held());
        /*
         * The page allocator's sanity checks on freeing a page can get
         * here before the page_ext arrays exist, when a range of pages is
         * fed to the allocator for the first time during bootup or memory
         * hotplug.
         */
        if (page_ext_invalid(ext))
                return NULL;

        return get_entry(ext, pfn);
}

/*
 * Allocate @size zeroed bytes for a page_ext table on node @nid.
 *
 * alloc_pages_exact_nid() is tried first (and the result registered with
 * kmemleak); if that fails, vzalloc_node() is used instead.
 *
 * Return: the buffer, or NULL if both attempts failed.
 */
static void *__meminit alloc_page_ext(size_t size, int nid)
{
        gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
        void *addr;

        addr = alloc_pages_exact_nid(nid, size, flags);
        if (!addr)
                return vzalloc_node(size, nid);

        kmemleak_alloc(addr, size, 1, flags);
        return addr;
}

/*
 * Allocate the page_ext table for the memory section containing @pfn, unless
 * the section already has one.
 *
 * Return: 0 on success or if the section was already set up, -ENOMEM if the
 * table could not be allocated.
 */
static int __meminit init_section_page_ext(unsigned long pfn, int nid)
{
        struct mem_section *section = __pfn_to_section(pfn);
        unsigned long table_size;
        struct page_ext *base;

        if (section->page_ext)
                return 0;

        table_size = page_ext_size * PAGES_PER_SECTION;
        base = alloc_page_ext(table_size, nid);

        /*
         * section->page_ext holds (base - pfn), which does not point into
         * the block allocated above; without this kmemleak would report a
         * false positive.
         */
        kmemleak_not_leak(base);

        if (!base) {
                pr_err("page ext allocation failure\n");
                return -ENOMEM;
        }

        /*
         * The caller's "pfn" need not be section aligned, so mask it before
         * it is used in the calculation.
         */
        pfn &= PAGE_SECTION_MASK;
        section->page_ext = (void *)base - page_ext_size * pfn;
        total_usage += table_size;

        return 0;
}

/*
 * Release a section's page_ext table, using the free routine that matches
 * how alloc_page_ext() obtained it: vfree() for a vmalloc address, otherwise
 * kmemleak_free() followed by free_pages_exact().
 */
static void free_page_ext(void *addr)
{
        struct page *page;
        size_t table_size;

        if (is_vmalloc_addr(addr)) {
                vfree(addr);
                return;
        }

        page = virt_to_page(addr);
        table_size = page_ext_size * PAGES_PER_SECTION;

        BUG_ON(PageReserved(page));
        kmemleak_free(addr);
        free_pages_exact(addr, table_size);
}

/*
 * Clear the page_ext pointer of the section containing @pfn and free the
 * table behind it. Does nothing if the section has no page_ext.
 */
static void __free_page_ext(unsigned long pfn)
{
        struct mem_section *ms = __pfn_to_section(pfn);
        struct page_ext *base;

        if (!ms || !ms->page_ext)
                return;

        base = READ_ONCE(ms->page_ext);
        /*
         * Strip the invalid marker if it is set. The pointer can also still
         * be valid here, namely during the rollback in online_page_ext().
         */
        if (page_ext_invalid(base))
                base = (void *)base - PAGE_EXT_INVALID;
        WRITE_ONCE(ms->page_ext, NULL);

        /* Undo the pfn bias to get back the address that was allocated. */
        free_page_ext(get_entry(base, pfn));
}

/*
 * Mark the page_ext pointer of the section containing @pfn as invalid by
 * adding PAGE_EXT_INVALID to it. Does nothing if the section has no
 * page_ext.
 */
static void __invalidate_page_ext(unsigned long pfn)
{
        struct mem_section *ms = __pfn_to_section(pfn);
        void *val;

        if (!ms || !ms->page_ext)
                return;

        val = (void *)ms->page_ext + PAGE_EXT_INVALID;
        WRITE_ONCE(ms->page_ext, val);
}

/*
 * Set up page_ext for every section overlapping
 * [@start_pfn, @start_pfn + @nr_pages). If a section fails, the sections
 * before it in the range are freed again.
 *
 * Return: 0 on success, -ENOMEM on failure.
 */
static int __meminit online_page_ext(unsigned long start_pfn,
                                unsigned long nr_pages,
                                int nid)
{
        unsigned long start = SECTION_ALIGN_DOWN(start_pfn);
        unsigned long end = SECTION_ALIGN_UP(start_pfn + nr_pages);
        unsigned long pfn, failed_pfn;

        if (nid == NUMA_NO_NODE) {
                /*
                 * In this case, "nid" already exists and contains valid
                 * memory. The "start_pfn" we were given is the pfn passed to
                 * online_pages(), so it should exist.
                 */
                nid = pfn_to_nid(start_pfn);
                VM_BUG_ON(!node_online(nid));
        }

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
                if (init_section_page_ext(pfn, nid))
                        goto rollback;
        }
        return 0;

rollback:
        /* pfn is the section that failed: free everything before it. */
        failed_pfn = pfn;
        for (pfn = start; pfn < failed_pfn; pfn += PAGES_PER_SECTION)
                __free_page_ext(pfn);

        return -ENOMEM;
}

/*
 * Tear down page_ext for every section overlapping
 * [@start_pfn, @start_pfn + @nr_pages).
 */
static void __meminit offline_page_ext(unsigned long start_pfn,
                                unsigned long nr_pages)
{
        unsigned long first = SECTION_ALIGN_DOWN(start_pfn);
        unsigned long last = SECTION_ALIGN_UP(start_pfn + nr_pages);
        unsigned long pfn;

        /*
         * To avoid a use-after-free, page_ext is freed in three steps:
         * 1) Walk all the sections and mark their page_ext as invalid.
         * 2) Wait until every page_ext user that started before the
         *    invalidation has finished.
         * 3) Free the page_ext.
         */
        for (pfn = first; pfn < last; pfn += PAGES_PER_SECTION)
                __invalidate_page_ext(pfn);

        synchronize_rcu();

        for (pfn = first; pfn < last; pfn += PAGES_PER_SECTION)
                __free_page_ext(pfn);
}

/*
 * Memory hotplug notifier for page_ext.
 *
 * MEM_GOING_ONLINE allocates page_ext for the range and reports any failure
 * through the notifier return value.  MEM_OFFLINE and MEM_CANCEL_ONLINE both
 * release the page_ext of the range.  Every other action is ignored.
 */
static int __meminit page_ext_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;
        int ret = 0;

        if (action == MEM_GOING_ONLINE)
                ret = online_page_ext(mn->start_pfn, mn->nr_pages,
                                      mn->status_change_nid);
        else if (action == MEM_OFFLINE || action == MEM_CANCEL_ONLINE)
                offline_page_ext(mn->start_pfn, mn->nr_pages);

        return notifier_from_errno(ret);
}

/*
 * Boot-time setup of page_ext for all memory present on nodes in the
 * N_MEMORY state.
 *
 * Returns immediately when invoke_need_callbacks() reports that nothing
 * needs page_ext.  A section that cannot be initialised is fatal and
 * panics.  On success the hotplug notifier is registered and the init
 * callbacks are run.
 */
void __init page_ext_init(void)
{
        int nid;

        if (!invoke_need_callbacks())
                return;

        for_each_node_state(nid, N_MEMORY) {
                const unsigned long start_pfn = node_start_pfn(nid);
                const unsigned long end_pfn = node_end_pfn(nid);
                unsigned long pfn = start_pfn;

                /*
                 * The node boundaries need not be section aligned, and
                 * page->flags of pages outside the node are not
                 * initialised.  So start at start_pfn itself and then step
                 * to each following section boundary below end_pfn.
                 */
                while (pfn < end_pfn) {
                        /*
                         * Node pfn ranges can interleave; some
                         * architectures lay nodes out like
                         * -------------pfn-------------->
                         * N0 | N1 | N2 | N0 | N1 | N2|....
                         * so only handle sections that belong to this node.
                         */
                        if (pfn_valid(pfn) && pfn_to_nid(pfn) == nid) {
                                if (init_section_page_ext(pfn, nid))
                                        panic("Out of memory");
                                cond_resched();
                        }
                        pfn = ALIGN(pfn + 1, PAGES_PER_SECTION);
                }
        }

        hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI);
        pr_info("allocated %ld bytes of page_ext\n", total_usage);
        invoke_init_callbacks();
}

/*
 * Per-node page_ext initialisation hook.  Deliberately empty in this
 * preprocessor branch: nothing is done with @pgdat here.
 */
void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
}

#endif

/**
 * page_ext_get() - Look up and pin the extended information of a page.
 * @page: The page whose page_ext is wanted.
 *
 * Enters an RCU read-side critical section and looks the page_ext up.  When
 * one is found the critical section is left open, keeping the page_ext valid
 * until the caller hands it back through page_ext_put().  When none is found
 * the critical section is closed again before returning.
 *
 * Return: The page_ext of @page, or NULL if no page_ext exists for it.
 * Context: Any context.  Caller may not sleep until they have called
 * page_ext_put().
 */
struct page_ext *page_ext_get(const struct page *page)
{
        struct page_ext *ext;

        rcu_read_lock();
        ext = lookup_page_ext(page);
        if (ext)
                return ext;

        /* Nothing to pin: do not leave the read-side section open. */
        rcu_read_unlock();
        return NULL;
}

/**
 * page_ext_put() - Finish working with page extended information.
 * @page_ext: Page extended information received from page_ext_get();
 *            may be NULL, in which case nothing is done.
 *
 * Leaves the RCU read-side critical section that page_ext_get() left open.
 * The page extended information may no longer be valid once this function
 * has been called.
 *
 * Return: None.
 * Context: Any context in which the corresponding page_ext_get() was called.
 */
void page_ext_put(struct page_ext *page_ext)
{
        /*
         * page_ext_get() already dropped the RCU read lock on its NULL
         * return path, so there is nothing to release here.
         */
        if (unlikely(!page_ext))
                return;

        rcu_read_unlock();
}
















    2 











    2 
















    2 










    2 



















    2 








    2 
    1 





















    2 







    2 










    2 











    2 






















    2 







    2 



































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/hfs/bfind.c
 *
 * Copyright (C) 2001
 * Brad Boyer (flar@allandria.com)
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 *
 * Search routines for btrees
 */

#include <linux/slab.h>
#include "btree.h"

/*
 * hfs_find_init() - prepare a search context and lock the B-tree.
 * @tree: B-tree that will be searched
 * @fd:   search context to initialise
 *
 * Allocates one buffer that holds both the search key and the scratch key
 * (each sized from tree->max_key_len plus two bytes), points fd->search_key
 * and fd->key into it, and takes tree->tree_lock with the lockdep subclass
 * matching the tree's CNID.
 *
 * Return: 0 with tree->tree_lock held; -ENOMEM if the key buffer cannot be
 * allocated; -EINVAL if the tree's CNID is not one of the known trees.  On
 * failure no lock is held and no key buffer is left allocated.
 */
int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
{
        void *ptr;

        fd->tree = tree;
        fd->bnode = NULL;
        ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
        if (!ptr)
                return -ENOMEM;
        fd->search_key = ptr;
        /* The scratch key lives in the second half of the same buffer. */
        fd->key = ptr + tree->max_key_len + 2;
        hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
                tree->cnid, __builtin_return_address(0));
        switch (tree->cnid) {
        case HFS_CAT_CNID:
                mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX);
                break;
        case HFS_EXT_CNID:
                mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX);
                break;
        case HFS_ATTR_CNID:
                mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX);
                break;
        default:
                /*
                 * Unknown tree: no lock was taken, so the caller has no
                 * reason to call hfs_find_exit().  Release the key buffer
                 * here instead of leaking it, and do not leave dangling
                 * pointers to it in the search context.
                 */
                fd->search_key = NULL;
                fd->key = NULL;
                kfree(ptr);
                return -EINVAL;
        }
        return 0;
}

void hfs_find_exit(struct hfs_find_data *fd)
{
        hfs_bnode_put(fd->bnode);
        kfree(fd->search_key);
        hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n",
                fd->tree->cnid, __builtin_return_address(0));
        mutex_unlock(&fd->tree->tree_lock);
        fd->tree = NULL;
}

/*
 * Binary-search @bnode for the record that best matches fd->search_key,
 * i.e. the last record whose key is not greater than the search key.
 *
 * Each probed key is read into fd->key and compared with the tree's keycmp
 * callback (a negative result is treated as "record key sorts first").
 *
 * Return: 0 on an exact match, -ENOENT when there is none, -EINVAL when a
 * probed record reports a key length of zero.  Except for -EINVAL, the
 * record index and the key/entry offsets and lengths are stored in @fd;
 * fd->record is -1 when every probed key compared greater.
 */
int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
{
        u16 off, len, keylen;
        int lo = 0;
        int hi = bnode->num_recs - 1;
        int mid, order;
        int res = -ENOENT;

        for (;;) {
                mid = (hi + lo) / 2;
                len = hfs_brec_lenoff(bnode, mid, &off);
                keylen = hfs_brec_keylen(bnode, mid);
                if (keylen == 0)
                        return -EINVAL;
                hfs_bnode_read(bnode, fd->key, off, keylen);

                order = bnode->tree->keycmp(fd->key, fd->search_key);
                if (order == 0) {
                        /* Exact hit: fd->key already holds this record. */
                        hi = mid;
                        res = 0;
                        break;
                }
                if (order < 0)
                        lo = mid + 1;
                else
                        hi = mid - 1;

                if (lo > hi)
                        break;
        }

        /*
         * Without an exact hit the best record is "hi".  If that is not the
         * record probed last, load its geometry and key now.
         */
        if (mid != hi && hi >= 0) {
                len = hfs_brec_lenoff(bnode, hi, &off);
                keylen = hfs_brec_keylen(bnode, hi);
                if (keylen == 0)
                        return -EINVAL;
                hfs_bnode_read(bnode, fd->key, off, keylen);
        }

        fd->record = hi;
        fd->keyoffset = off;
        fd->keylength = keylen;
        fd->entryoffset = off + keylen;
        fd->entrylength = len - keylen;
        return res;
}

/*
 * Walk the B*Tree of @fd from the root down to a leaf, choosing at every
 * level the record that best fits fd->search_key.
 *
 * Any node previously held in fd->bnode is released first.  When a leaf is
 * reached it is left referenced in fd->bnode and the result of the leaf
 * level search is returned, with the record position stored in @fd.
 *
 * Return: the leaf search result (0 or a negative errno); -ENOENT if the
 * tree has no root or an index node has no usable record; -EIO if a node's
 * height or type does not match its level; or the error from looking up a
 * node.  fd->bnode is NULL on every path that does not reach a leaf.
 */
int hfs_brec_find(struct hfs_find_data *fd)
{
        struct hfs_btree *tree = fd->tree;
        struct hfs_bnode *bnode;
        u32 nidx, parent = 0;
        __be32 data;
        int height, res;

        if (fd->bnode)
                hfs_bnode_put(fd->bnode);
        fd->bnode = NULL;

        nidx = tree->root;
        if (!nidx)
                return -ENOENT;
        height = tree->depth;

        for (;;) {
                bnode = hfs_bnode_find(tree, nidx);
                if (IS_ERR(bnode)) {
                        fd->bnode = NULL;
                        return PTR_ERR(bnode);
                }

                /* The node must sit at the level the walk expects ... */
                if (bnode->height != height)
                        goto invalid;
                /* ... and only the bottom level may be a leaf. */
                height--;
                if (bnode->type != (height ? HFS_NODE_INDEX : HFS_NODE_LEAF))
                        goto invalid;
                bnode->parent = parent;

                res = __hfs_brec_find(bnode, fd);
                if (height == 0) {
                        /* Leaf reached: hand the node over to the caller. */
                        fd->bnode = bnode;
                        return res;
                }
                if (fd->record < 0)
                        goto release;

                /* The chosen index record stores the child node number. */
                parent = nidx;
                hfs_bnode_read(bnode, &data, fd->entryoffset, 4);
                nidx = be32_to_cpu(data);
                hfs_bnode_put(bnode);
        }

invalid:
        pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n",
               height, bnode->height, bnode->type, nidx, parent);
        res = -EIO;
release:
        hfs_bnode_put(bnode);
        return res;
}

/*
 * Find the record matching fd->search_key and copy its data into @rec.
 *
 * Return: 0 on success, the error from hfs_brec_find() if the lookup fails,
 * or -EINVAL if the record's data is longer than @rec_len.
 */
int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len)
{
        int err = hfs_brec_find(fd);

        if (err)
                return err;

        /* Never copy more than the caller's buffer can hold. */
        if (fd->entrylength > rec_len)
                return -EINVAL;

        hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength);
        return 0;
}

/*
 * Move the position stored in @fd by @cnt records: backwards when @cnt is
 * negative, forwards otherwise.  When the move runs off the current node the
 * walk continues in the node named by bnode->prev or bnode->next.
 *
 * On success the key/entry offsets and lengths of the new record are stored
 * in @fd and its key is read into fd->key.
 *
 * Return: 0 on success; -ENOENT if a sibling link of 0 is reached before
 * @cnt records were skipped; the error from hfs_bnode_find() if a sibling
 * cannot be looked up; -EINVAL if the target record reports a key length
 * of zero.  fd->bnode is always updated to the node the walk ended on
 * (NULL if the sibling lookup failed).  Note that fd->record has already
 * been modified when one of the error paths inside the loops is taken.
 */
int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
{
        struct hfs_btree *tree;
        struct hfs_bnode *bnode;
        int idx, res = 0;
        u16 off, len, keylen;

        bnode = fd->bnode;
        tree = bnode->tree;

        if (cnt < 0) {
                /* Backwards: consume whole nodes while cnt exceeds the index. */
                cnt = -cnt;
                while (cnt > fd->record) {
                        cnt -= fd->record + 1;
                        fd->record = bnode->num_recs - 1;
                        idx = bnode->prev;
                        if (!idx) {
                                res = -ENOENT;
                                goto out;
                        }
                        /* Swap the reference on this node for its predecessor. */
                        hfs_bnode_put(bnode);
                        bnode = hfs_bnode_find(tree, idx);
                        if (IS_ERR(bnode)) {
                                res = PTR_ERR(bnode);
                                bnode = NULL;
                                goto out;
                        }
                }
                fd->record -= cnt;
        } else {
                /* Forwards: consume whole nodes while cnt reaches past the end. */
                while (cnt >= bnode->num_recs - fd->record) {
                        cnt -= bnode->num_recs - fd->record;
                        fd->record = 0;
                        idx = bnode->next;
                        if (!idx) {
                                res = -ENOENT;
                                goto out;
                        }
                        /* Swap the reference on this node for its successor. */
                        hfs_bnode_put(bnode);
                        bnode = hfs_bnode_find(tree, idx);
                        if (IS_ERR(bnode)) {
                                res = PTR_ERR(bnode);
                                bnode = NULL;
                                goto out;
                        }
                }
                fd->record += cnt;
        }

        /* Describe the record the position now refers to. */
        len = hfs_brec_lenoff(bnode, fd->record, &off);
        keylen = hfs_brec_keylen(bnode, fd->record);
        if (keylen == 0) {
                res = -EINVAL;
                goto out;
        }
        fd->keyoffset = off;
        fd->keylength = keylen;
        fd->entryoffset = off + keylen;
        fd->entrylength = len - keylen;
        hfs_bnode_read(bnode, fd->key, off, keylen);
out:
        fd->bnode = bnode;
        return res;
}




































    2 

    2 



























































































































    1 

    2 









































































































































    2 

































    2 






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/extable.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
#include <linux/bitfield.h>
#include <xen/xen.h>

#include <asm/fpu/api.h>
#include <asm/fred.h>
#include <asm/sev.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
#include <asm/insn-eval.h>
#include <asm/sgx.h>

/*
 * Return a pointer to the saved register number @nr inside @regs.
 *
 * If pt_regs_offset() reports a negative offset for @nr, warn once and
 * return a pointer to a static scratch word so that callers can still read
 * or write through the result.
 */
static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr)
{
        static unsigned long dummy;
        int reg_offset = pt_regs_offset(regs, nr);

        if (!WARN_ON_ONCE(reg_offset < 0))
                return (unsigned long *)((unsigned long)regs + reg_offset);

        return &dummy;
}

/*
 * Compute the fixup address of an exception table entry: the fixup field
 * holds an offset relative to the address of the field itself.
 */
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
        const unsigned long field_addr = (unsigned long)&x->fixup;

        return field_addr + x->fixup;
}

/*
 * Default fixup: resume execution at the entry's fixup address, first
 * zeroing AX and/or DX when the entry's data flags ask for it.
 *
 * Always returns true.
 */
static bool ex_handler_default(const struct exception_table_entry *e,
                               struct pt_regs *regs)
{
        regs->ip = ex_fixup_addr(e);

        if (e->data & EX_FLAG_CLEAR_AX)
                regs->ax = 0;

        if (e->data & EX_FLAG_CLEAR_DX)
                regs->dx = 0;

        return true;
}

/*
 * Fixup for the *very* rare "load_unaligned_zeropad()" that crosses into a
 * page which does not exist.
 *
 * This occurs when a pathname is loaded optimistically a word at a time,
 * the name is shorter than a full word and the following page is not
 * mapped.  Typically that only happens with CONFIG_DEBUG_PAGEALLOC.
 *
 * NOTE! The faulting instruction is always a 'mov mem,reg' of size 'long',
 * and the exception fixup must point to right after that instruction.
 *
 * The handler re-decodes the instruction, checks that it has exactly that
 * shape and that the fault hit the word following the aligned load address,
 * then loads the aligned word, shifts the wanted bytes down into the
 * destination register and resumes at the fixup.  Returns false, leaving
 * the fault unhandled, if any of the checks fails.
 */
static bool ex_handler_zeropad(const struct exception_table_entry *e,
                               struct pt_regs *regs,
                               unsigned long fault_addr)
{
        const unsigned long word_mask = sizeof(long) - 1;
        const unsigned long resume_ip = ex_fixup_addr(e);
        const unsigned long insn_len = resume_ip - regs->ip;
        unsigned long src, word_base, byte_off;
        unsigned long *dst;
        struct insn insn;

        /* The fixup has to sit directly behind one single instruction. */
        if (insn_len > MAX_INSN_SIZE)
                return false;

        if (insn_decode(&insn, (void *) regs->ip, insn_len, INSN_MODE_KERN) ||
            insn.length != insn_len)
                return false;

        /* Opcode 0x8b with a long-sized operand is the only accepted form. */
        if (insn.opcode.bytes[0] != 0x8b || insn.opnd_bytes != sizeof(long))
                return false;

        src = (unsigned long) insn_get_addr_ref(&insn, regs);
        if (src == ~0ul)
                return false;

        /* The fault must have hit the word right after the aligned one. */
        byte_off = src & word_mask;
        word_base = src & ~word_mask;
        if (fault_addr != word_base + sizeof(long))
                return false;

        dst = insn_get_modrm_reg_ptr(&insn, regs);
        if (!dst)
                return false;

        /* The shift fills the bytes from the missing page with zeroes. */
        *dst = *(unsigned long *)word_base >> (byte_off * 8);
        return ex_handler_default(e, regs);
}

/*
 * Fixup that reports the trap number to the faulting code: @trapnr is
 * stored in AX and execution then resumes through the default fixup.
 */
static bool ex_handler_fault(const struct exception_table_entry *fixup,
                             struct pt_regs *regs, int trapnr)
{
        regs->ax = trapnr;
        return ex_handler_default(fixup, regs);
}

/*
 * SGX variant of the fault fixup: AX receives @trapnr with
 * SGX_ENCLS_FAULT_FLAG or-ed in, then execution resumes through the default
 * fixup.
 */
static bool ex_handler_sgx(const struct exception_table_entry *fixup,
                           struct pt_regs *regs, int trapnr)
{
        regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG;
        return ex_handler_default(fixup, regs);
}

/*
 * Handler for when we fail to restore a task's FPU state.  We should never get
 * here because the FPU state of a task using the FPU (task->thread.fpu.state)
 * should always be valid.  However, past bugs have allowed userspace to set
 * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
 * These caused XRSTOR to fail when switching to the task, leaking the FPU
 * registers of the task previously executing on the CPU.  Mitigate this class
 * of vulnerability by restoring from the initial state (essentially, zeroing
 * out all the FPU registers) if we can't restore from the task's FPU state.
 *
 * Always returns true, with execution resuming at the fixup address.
 */
static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
                                 struct pt_regs *regs)
{
        /*
         * NOTE(review): regs->ip is redirected before the warning below is
         * formatted.  If instruction_pointer() reads regs->ip, the address
         * printed is the fixup target rather than the faulting instruction
         * - confirm that this is intended.
         */
        regs->ip = ex_fixup_addr(fixup);

        WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
                  (void *)instruction_pointer(regs));

        fpu_reset_from_exception_fixup();
        return true;
}

/*
 * On x86-64 'access_ok()' is deliberately imprecise: non-canonical user
 * addresses are let through so that the range comparisons stay simple and
 * LAM does not need special treatment.
 *
 * In fact up to one page of "slop" is allowed at the sign boundary, which
 * lets the common small-size access_ok() check just the sign of the
 * pointer.
 *
 * Returns true if @fault_address lies in the "user space" part of the
 * non-canonical space or within one page above it.  Always returns false
 * when CONFIG_X86_64 is not set.
 */
static bool gp_fault_address_ok(unsigned long fault_address)
{
#ifdef CONFIG_X86_64
        /* Either the address itself or the one a page below must be valid. */
        if (valid_user_address(fault_address) ||
            valid_user_address(fault_address - PAGE_SIZE))
                return true;
#endif
        return false;
}

/*
 * Fixup for faults in user-access code.  Warns once if the fault was a
 * general protection fault at an address that gp_fault_address_ok() does
 * not accept, then resumes through the default fixup in every case.
 */
static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
                               struct pt_regs *regs, int trapnr,
                               unsigned long fault_address)
{
        WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address),
                "General protection fault in user access. Non-canonical address?");
        return ex_handler_default(fixup, regs);
}

/*
 * Fixup for a faulting MSR access.
 * @wrmsr: true for a WRMSR, false for an RDMSR
 * @safe:  true for the "_safe" variants, which get an error code instead of
 *         a warning
 * @reg:   register number that receives -EIO for the "_safe" variants
 *
 * Unchecked (non-safe) accesses are reported, with separate once-only
 * guards for writes and reads.  A failed read is made to look as if it
 * returned 0 in AX and DX.  Execution resumes through the default fixup.
 */
static bool ex_handler_msr(const struct exception_table_entry *fixup,
                           struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
        /* CX holds the MSR index, DX:AX the value that was being written. */
        if (__ONCE_LITE_IF(!safe && wrmsr)) {
                pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
                        (unsigned int)regs->cx, (unsigned int)regs->dx,
                        (unsigned int)regs->ax,  regs->ip, (void *)regs->ip);
                show_stack_regs(regs);
        }

        if (__ONCE_LITE_IF(!safe && !wrmsr)) {
                pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
                        (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
                show_stack_regs(regs);
        }

        if (!wrmsr) {
                /* Pretend that the read succeeded and returned 0. */
                regs->ax = 0;
                regs->dx = 0;
        }

        /*
         * Stored after AX/DX were cleared above, so should @reg designate
         * one of them the error code is what remains in it.
         */
        if (safe)
                *pt_regs_nr(regs, reg) = -EIO;

        return ex_handler_default(fixup, regs);
}

/*
 * Fixup that loads the null selector into %fs and then resumes through the
 * default fixup.
 *
 * On CPUs flagged with X86_BUG_NULL_SEG, %fs is first loaded with
 * __USER_DS before the null selector is written.
 * NOTE(review): presumably because a null load alone does not fully reset
 * the segment on those CPUs - confirm against the X86_BUG_NULL_SEG
 * description.
 */
static bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
                                struct pt_regs *regs)
{
        if (static_cpu_has(X86_BUG_NULL_SEG))
                asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
        asm volatile ("mov %0, %%fs" : : "rm" (0));
        return ex_handler_default(fixup, regs);
}

/*
 * Fixup that stores the immediate @imm from the exception table entry into
 * saved register number @reg and then resumes through the default fixup.
 */
static bool ex_handler_imm_reg(const struct exception_table_entry *fixup,
                               struct pt_regs *regs, int reg, int imm)
{
        unsigned long *slot = pt_regs_nr(regs, reg);

        /* Widen through long so that a negative immediate is sign-extended. */
        *slot = (long)imm;
        return ex_handler_default(fixup, regs);
}

/*
 * Fixup for user-copy loops: recompute the count kept in CX as
 * @imm * CX + (saved register number @reg), then continue with the regular
 * user-access fixup.
 */
static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
                                  struct pt_regs *regs, int trapnr,
                                  unsigned long fault_address,
                                  int reg, int imm)
{
        const unsigned long extra = *pt_regs_nr(regs, reg);

        regs->cx = imm * regs->cx + extra;
        return ex_handler_uaccess(fixup, regs, trapnr, fault_address);
}

#ifdef CONFIG_X86_FRED
/*
 * Fixup for a fault raised by the ERETU instruction.
 *
 * The ERETU return frame is located from regs->sp, which is treated as
 * pointing at the orig_ax slot of that frame.  The event information of the
 * current fault is copied into the return frame while the frame's own SS
 * and CS selectors are kept, @error_code is stored as its orig_ax, and
 * execution resumes through the default fixup.
 */
static bool ex_handler_eretu(const struct exception_table_entry *fixup,
                             struct pt_regs *regs, unsigned long error_code)
{
        struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax));
        /* Saved up front: the ssx/csx writes below overwrite the frame. */
        unsigned short ss = uregs->ss;
        unsigned short cs = uregs->cs;

        /*
         * Move the NMI bit from the invalid stack frame, which caused ERETU
         * to fault, to the fault handler's stack frame, thus to unblock NMI
         * with the fault handler's ERETS instruction ASAP if NMI is blocked.
         */
        regs->fred_ss.nmi = uregs->fred_ss.nmi;

        /*
         * Sync event information to uregs, i.e., the ERETU return frame, but
         * is it safe to write to the ERETU return frame which is just above
         * current event stack frame?
         *
         * The RSP used by FRED to push a stack frame is not the value in %rsp,
         * it is calculated from %rsp with the following 2 steps:
         * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0)        // Reserve N*64 bytes
         * 2) RSP = RSP & ~0x3f                // Align to a 64-byte cache line
         * when an event delivery doesn't trigger a stack level change.
         *
         * Here is an example with N*64 (N=1) bytes reserved:
         *
         *  64-byte cache line ==>  ______________
         *                         |___Reserved___|
         *                         |__Event_data__|
         *                         |_____SS_______|
         *                         |_____RSP______|
         *                         |_____FLAGS____|
         *                         |_____CS_______|
         *                         |_____IP_______|
         *  64-byte cache line ==> |__Error_code__| <== ERETU return frame
         *                         |______________|
         *                         |______________|
         *                         |______________|
         *                         |______________|
         *                         |______________|
         *                         |______________|
         *                         |______________|
         *  64-byte cache line ==> |______________| <== RSP after step 1) and 2)
         *                         |___Reserved___|
         *                         |__Event_data__|
         *                         |_____SS_______|
         *                         |_____RSP______|
         *                         |_____FLAGS____|
         *                         |_____CS_______|
         *                         |_____IP_______|
         *  64-byte cache line ==> |__Error_code__| <== ERETS return frame
         *
         * Thus a new FRED stack frame will always be pushed below a previous
         * FRED stack frame ((N*64) bytes may be reserved between), and it is
         * safe to write to a previous FRED stack frame as they never overlap.
         */
        fred_info(uregs)->edata = fred_event_data(regs);
        uregs->ssx = regs->ssx;
        uregs->fred_ss.ss = ss;
        /* The NMI bit was moved away above */
        uregs->fred_ss.nmi = 0;
        uregs->csx = regs->csx;
        uregs->fred_cs.sl = 0;
        uregs->fred_cs.wfe = 0;
        uregs->cs = cs;
        uregs->orig_ax = error_code;

        return ex_handler_default(fixup, regs);
}
#endif

/*
 * Return the fixup type recorded in the exception table for the instruction
 * at @ip, or EX_TYPE_NONE if no entry covers that address.
 */
int ex_get_fixup_type(unsigned long ip)
{
        const struct exception_table_entry *e = search_exception_tables(ip);

        if (!e)
                return EX_TYPE_NONE;

        return FIELD_GET(EX_DATA_TYPE_MASK, e->data);
}

/*
 * Try to recover from an exception through the exception table.
 * @regs:       register state at the time of the exception
 * @trapnr:     trap number, forwarded to handlers that report it
 * @error_code: hardware error code, forwarded to the ERETU handler
 * @fault_addr: faulting address, forwarded to handlers that check it
 *
 * Looks up the entry covering regs->ip and dispatches on the type encoded
 * in its data field; the register number and immediate encoded alongside
 * are passed to the handlers that use them.
 *
 * Return: 0 if no entry covers regs->ip, otherwise the result of the
 * selected handler (handlers return false to decline the fixup).  Types
 * that do not return from within the switch end up in BUG().
 */
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
                    unsigned long fault_addr)
{
        const struct exception_table_entry *e;
        int type, reg, imm;

#ifdef CONFIG_PNPBIOS
        /*
         * Fault while running PNP BIOS code: switch to the recorded recovery
         * stack and jump to the recorded recovery address.
         */
        if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
                extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
                extern u32 pnp_bios_is_utter_crap;
                pnp_bios_is_utter_crap = 1;
                printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
                __asm__ volatile(
                        "movl %0, %%esp\n\t"
                        "jmp *%1\n\t"
                        : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
                panic("do_trap: can't hit this");
        }
#endif

        e = search_exception_tables(regs->ip);
        if (!e)
                return 0;

        /* The entry's data word packs the handler type, a register and an immediate. */
        type = FIELD_GET(EX_DATA_TYPE_MASK, e->data);
        reg  = FIELD_GET(EX_DATA_REG_MASK,  e->data);
        imm  = FIELD_GET(EX_DATA_IMM_MASK,  e->data);

        switch (type) {
        case EX_TYPE_DEFAULT:
        case EX_TYPE_DEFAULT_MCE_SAFE:
                return ex_handler_default(e, regs);
        case EX_TYPE_FAULT:
        case EX_TYPE_FAULT_MCE_SAFE:
                return ex_handler_fault(e, regs, trapnr);
        case EX_TYPE_UACCESS:
                return ex_handler_uaccess(e, regs, trapnr, fault_addr);
        case EX_TYPE_CLEAR_FS:
                return ex_handler_clear_fs(e, regs);
        case EX_TYPE_FPU_RESTORE:
                return ex_handler_fprestore(e, regs);
        case EX_TYPE_BPF:
                return ex_handler_bpf(e, regs);
        case EX_TYPE_WRMSR:
                return ex_handler_msr(e, regs, true, false, reg);
        case EX_TYPE_RDMSR:
                return ex_handler_msr(e, regs, false, false, reg);
        case EX_TYPE_WRMSR_SAFE:
                return ex_handler_msr(e, regs, true, true, reg);
        case EX_TYPE_RDMSR_SAFE:
                return ex_handler_msr(e, regs, false, true, reg);
        /*
         * The two *_IN_MCE cases break out to the BUG() below.
         * NOTE(review): presumably ex_handler_msr_mce() does not return -
         * confirm against its definition.
         */
        case EX_TYPE_WRMSR_IN_MCE:
                ex_handler_msr_mce(regs, true);
                break;
        case EX_TYPE_RDMSR_IN_MCE:
                ex_handler_msr_mce(regs, false);
                break;
        case EX_TYPE_POP_REG:
                /* Drop one word from the saved stack, then act as IMM_REG. */
                regs->sp += sizeof(long);
                fallthrough;
        case EX_TYPE_IMM_REG:
                return ex_handler_imm_reg(e, regs, reg, imm);
        case EX_TYPE_FAULT_SGX:
                return ex_handler_sgx(e, regs, trapnr);
        case EX_TYPE_UCOPY_LEN:
                return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm);
        case EX_TYPE_ZEROPAD:
                return ex_handler_zeropad(e, regs, fault_addr);
#ifdef CONFIG_X86_FRED
        case EX_TYPE_ERETU:
                return ex_handler_eretu(e, regs, error_code);
#endif
        }
        /* Reached for a type without a case above, or after a break. */
        BUG();
}

extern unsigned int early_recursion_flag;

/*
 * Restricted version used during very early boot.
 *
 * Early NMIs are ignored.  Any other exception is first offered to
 * fixup_exception(); a WARN-type #UD is reported and skipped.  Everything
 * else prints a panic line and the registers, then halts forever.  When
 * early_recursion_flag exceeds 2 the CPU is halted without printing.
 */
void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
{
        /* Ignore early NMIs. */
        if (trapnr == X86_TRAP_NMI)
                return;

        if (early_recursion_flag > 2)
                goto halt_loop;

        /*
         * Old CPUs leave the high bits of CS on the stack undefined.  It is
         * not clear which CPUs do this, but at least the 486 DX works this
         * way.  Xen pv domains are not using the default __KERNEL_CS.
         */
        if (!xen_pv_domain() && regs->cs != __KERNEL_CS)
                goto fail;

        /*
         * The full exception fixup machinery is available as soon as the
         * early IDT is loaded.  It is therefore up to extable users either
         * to work correctly when their handlers run this early, or to avoid
         * raising exceptions before they are ready to handle them.
         *
         * That is preferable to filtering which handlers may be used here:
         * refusing to call a handler is guaranteed to end in a
         * hard-to-debug panic.
         *
         * Keep in mind that not all vectors actually get here.  Early page
         * faults, for example, are special.
         */
        if (fixup_exception(regs, trapnr, regs->orig_ax, 0))
                return;

        /*
         * A WARN is reported and its ud2 skipped.  If this was a BUG and
         * report_bug() returns, or if it was just a normal #UD, continue
         * onward and crash.
         */
        if (trapnr == X86_TRAP_UD &&
            report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
                regs->ip += LEN_UD2;
                return;
        }

fail:
        early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
                     (unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
                     regs->orig_ax, read_cr2());

        show_regs(regs);

halt_loop:
        for (;;)
                halt();
}



















































































































   12 























































































































































   12 




   11 


























































































































































































































































































































































































































































    2 















































































































































































































































































































































































































































































































































    1 










    1 

































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGTABLE_H
#define _LINUX_PGTABLE_H

#include <linux/pfn.h>
#include <asm/pgtable.h>

/*
 * Difference between the PMD/PUD shift and PAGE_SHIFT, i.e. log2 of the
 * number of PAGE_SIZE pages covered by one PMD-sized / PUD-sized region.
 */
#define PMD_ORDER        (PMD_SHIFT - PAGE_SHIFT)
#define PUD_ORDER        (PUD_SHIFT - PAGE_SHIFT)

#ifndef __ASSEMBLY__
#ifdef CONFIG_MMU

#include <linux/mm_types.h>
#include <linux/bug.h>
#include <linux/errno.h>
#include <asm-generic/pgtable_uffd.h>
#include <linux/page_table_check.h>

#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
        defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
#endif

/*
 * On almost all architectures and configurations, 0 can be used as the
 * upper ceiling to free_pgtables(): on many architectures it has the same
 * effect as using TASK_SIZE.  However, there is one configuration which
 * must impose a more careful limit, to avoid freeing kernel pgtables.
 */
#ifndef USER_PGTABLES_CEILING
#define USER_PGTABLES_CEILING        0UL
#endif

/*
 * This defines the first usable user address. Platforms
 * can override its value with custom FIRST_USER_ADDRESS
 * defined in their respective <asm/pgtable.h>.
 */
#ifndef FIRST_USER_ADDRESS
#define FIRST_USER_ADDRESS        0UL
#endif

/*
 * This defines the generic helper for accessing PMD page
 * table page. Although platforms can still override this
 * via their respective <asm/pgtable.h>.
 */
#ifndef pmd_pgtable
#define pmd_pgtable(pmd) pmd_page(pmd)
#endif

/* Folio of the page that @pmd refers to: page_folio() applied to pmd_page(). */
#define pmd_folio(pmd) page_folio(pmd_page(pmd))

/*
 * A page table page can be thought of as an array like this: pXd_t[PTRS_PER_PxD]
 *
 * The pXx_index() functions return the index of the entry in the page
 * table page which would control the given virtual address
 *
 * As these functions may be used by the same code for different levels of
 * the page table folding, they are always available, regardless of
 * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0
 * because in such cases PTRS_PER_PxD equals 1.
 */

/* Index, within a PTE table page, of the entry that covers @address. */
static inline unsigned long pte_index(unsigned long address)
{
        unsigned long page_nr = address >> PAGE_SHIFT;

        /* Keep only the bits that select a slot inside one PTE table. */
        return page_nr & (PTRS_PER_PTE - 1);
}

#ifndef pmd_index
/* Index, within a PMD table page, of the entry that covers @address. */
static inline unsigned long pmd_index(unsigned long address)
{
        unsigned long pmd_nr = address >> PMD_SHIFT;

        /* Keep only the bits that select a slot inside one PMD table. */
        return pmd_nr & (PTRS_PER_PMD - 1);
}
#define pmd_index pmd_index
#endif

#ifndef pud_index
/* Index, within a PUD table page, of the entry that covers @address. */
static inline unsigned long pud_index(unsigned long address)
{
        unsigned long pud_nr = address >> PUD_SHIFT;

        /* Keep only the bits that select a slot inside one PUD table. */
        return pud_nr & (PTRS_PER_PUD - 1);
}
#define pud_index pud_index
#endif

#ifndef pgd_index
/* Must be a compile-time constant, so implement it as a macro */
#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#endif

#ifndef pte_offset_kernel
/*
 * Return a pointer to the PTE for @address inside the PTE table that *@pmd
 * points to, using the table's kernel virtual address (pmd_page_vaddr()).
 */
static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
{
        pte_t *pte_table = (pte_t *)pmd_page_vaddr(*pmd);

        return pte_table + pte_index(address);
}
#define pte_offset_kernel pte_offset_kernel
#endif

#ifdef CONFIG_HIGHPTE
/*
 * CONFIG_HIGHPTE variants.
 *
 * __pte_map() temporarily maps the page referenced by *@pmd with
 * kmap_local_page() and returns a pointer to the PTE for @address inside it.
 * pte_unmap() undoes that mapping with kunmap_local() and then calls
 * rcu_read_unlock().
 */
#define __pte_map(pmd, address) \
        ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address)))
#define pte_unmap(pte)        do {        \
        kunmap_local((pte));        \
        rcu_read_unlock();        \
} while (0)
#else
/*
 * !CONFIG_HIGHPTE variant: no temporary mapping is set up, the PTE is
 * located directly via pte_offset_kernel().
 */
static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address)
{
        pte_t *ptep = pte_offset_kernel(pmd, address);

        return ptep;
}
/*
 * !CONFIG_HIGHPTE variant: there is no kmap to undo, so @pte is unused and
 * only rcu_read_unlock() is called.
 * NOTE(review): the matching rcu_read_lock() is not visible in this header;
 * presumably it is taken by the PTE mapping path -- confirm against callers.
 */
static inline void pte_unmap(pte_t *pte)
{
        rcu_read_unlock();
}
#endif

void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);

/* Find an entry in the second-level page table. */
#ifndef pmd_offset
/*
 * Return a pointer to the PMD entry for @address inside the PMD table that
 * *@pud points to.
 */
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
{
        pmd_t *pmd_table = pud_pgtable(*pud);

        return pmd_table + pmd_index(address);
}
#define pmd_offset pmd_offset
#endif

#ifndef pud_offset
/*
 * Return a pointer to the PUD entry for @address inside the PUD table that
 * *@p4d points to.
 */
static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
        pud_t *pud_table = p4d_pgtable(*p4d);

        return pud_table + pud_index(address);
}
#define pud_offset pud_offset
#endif

/*
 * Return a pointer to the PGD entry for @address within the PGD table that
 * starts at @pgd.
 */
static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
{
        return (pgd + pgd_index(address));
}

/*
 * a shortcut to get a pgd_t in a given mm
 */
#ifndef pgd_offset
#define pgd_offset(mm, address)                pgd_offset_pgd((mm)->pgd, (address))
#endif

/*
 * a shortcut which implies the use of the kernel's pgd, instead
 * of a process's
 */
#define pgd_offset_k(address)                pgd_offset(&init_mm, (address))

/*
 * In many cases it is known that a virtual address is mapped at PMD or PTE
 * level, so instead of traversing all the page table levels, we can get a
 * pointer to the PMD entry in user or kernel page table or translate a virtual
 * address to the pointer in the PTE in the kernel page tables with simple
 * helpers.
 */
static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va)
{
        return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va);
}

static inline pmd_t *pmd_off_k(unsigned long va)
{
        return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
}

/*
 * Translate kernel virtual address @vaddr to a pointer to its PTE in the
 * kernel page tables.  Returns NULL when the covering PMD entry is none.
 */
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
        pmd_t *kpmd = pmd_off_k(vaddr);

        if (pmd_none(*kpmd))
                return NULL;

        return pte_offset_kernel(kpmd, vaddr);
}

#ifndef pmd_young
/*
 * Fallback used when the architecture does not define pmd_young: @pmd is
 * ignored and 0 is always returned.
 */
static inline int pmd_young(pmd_t pmd)
{
        return 0;
}
#endif

#ifndef pmd_dirty
/*
 * Fallback used when the architecture does not define pmd_dirty: @pmd is
 * ignored and 0 is always returned.
 */
static inline int pmd_dirty(pmd_t pmd)
{
        return 0;
}
#endif

/*
 * A facility to provide lazy MMU batching.  This allows PTE updates and
 * page invalidations to be delayed until a call to leave lazy MMU mode
 * is issued.  Some architectures may benefit from doing this, and it is
 * beneficial for both shadow and direct mode hypervisors, which may batch
 * the PTE updates which happen during this window.  Note that using this
 * interface requires that read hazards be removed from the code.  A read
 * hazard could result in the direct mode hypervisor case, since the actual
 * write to the page tables may not yet have taken place, so reads through
 * a raw PTE pointer after it has been modified are not guaranteed to be
 * up to date.  This mode can only be entered and left under the protection of
 * the page table locks for all page tables which may be modified.  In the UP
 * case, this is required so that preemption is disabled, and in the SMP case,
 * it must synchronize the delayed page table writes properly on other CPUs.
 */
#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
#define arch_enter_lazy_mmu_mode()        do {} while (0)
#define arch_leave_lazy_mmu_mode()        do {} while (0)
#define arch_flush_lazy_mmu_mode()        do {} while (0)
#endif

#ifndef pte_batch_hint
/**
 * pte_batch_hint - Number of pages that can be added to batch without scanning.
 * @ptep: Page table pointer for the entry.
 * @pte: Page table entry.
 *
 * Some architectures know that a set of contiguous ptes all map the same
 * contiguous memory with the same permissions. In this case, it can provide a
 * hint to aid pte batching without the core code needing to scan every pte.
 *
 * An architecture implementation may ignore the PTE accessed state. Further,
 * the dirty state must apply atomically to all the PTEs described by the hint.
 *
 * May be overridden by the architecture, else pte_batch_hint is always 1.
 *
 * Return: The batch hint; this generic version ignores both arguments and
 * returns 1.
 */
static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
{
        return 1;
}
#endif

#ifndef pte_advance_pfn
/*
 * Return a copy of @pte whose raw value has been advanced by @nr page
 * frames, i.e. with (@nr << PFN_PTE_SHIFT) added to pte_val().
 */
static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
        unsigned long pfn_delta = nr << PFN_PTE_SHIFT;

        return __pte(pte_val(pte) + pfn_delta);
}
#endif

/* Advance @pte by exactly one page frame. */
#define pte_next_pfn(pte) pte_advance_pfn(pte, 1)

#ifndef set_ptes
/**
 * set_ptes - Map consecutive pages to a contiguous range of addresses.
 * @mm: Address space to map the pages into.
 * @addr: Address to map the first page at.
 * @ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @nr: Number of pages to map.
 *
 * With nr == 1 the entry may be present or not present both before and
 * after the call.  With nr > 1 every entry must be not present beforehand
 * and the new state must be present.
 *
 * An architecture may override this function, or it can simply provide
 * set_pte() and PFN_PTE_SHIFT and use this generic version.
 *
 * Context: The caller holds the page table lock.  The pages all belong
 * to the same folio.  The PTEs are all in the same PMD.
 */
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, pte_t pte, unsigned int nr)
{
        page_table_check_ptes_set(mm, ptep, pte, nr);

        arch_enter_lazy_mmu_mode();

        /* The first entry is always written; the rest follow one PFN apart. */
        set_pte(ptep, pte);
        while (--nr != 0) {
                ptep++;
                pte = pte_next_pfn(pte);
                set_pte(ptep, pte);
        }

        arch_leave_lazy_mmu_mode();
}
#endif
/* Single-entry convenience wrapper: set_ptes() with nr == 1. */
#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1)

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
                                 pte_t entry, int dirty);
#endif

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pud_t *pudp,
                                 pud_t entry, int dirty);
#else
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub.  It is not meant to be reachable:
 * BUILD_BUG() is expected to break the build if a call to it survives
 * compilation.  The return value only satisfies the signature.
 */
static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmdp,
                                        pmd_t entry, int dirty)
{
        BUILD_BUG();
        return 0;
}
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub.  It is not meant to be reachable:
 * BUILD_BUG() is expected to break the build if a call to it survives
 * compilation.  The return value only satisfies the signature.
 */
static inline int pudp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pud_t *pudp,
                                        pud_t entry, int dirty)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef ptep_get
/* Read the PTE at @ptep with a single READ_ONCE() access and return it. */
static inline pte_t ptep_get(pte_t *ptep)
{
        pte_t entry = READ_ONCE(*ptep);

        return entry;
}
#endif

#ifndef pmdp_get
/* Read the PMD entry at @pmdp with a single READ_ONCE() access and return it. */
static inline pmd_t pmdp_get(pmd_t *pmdp)
{
        pmd_t entry = READ_ONCE(*pmdp);

        return entry;
}
#endif

#ifndef pudp_get
/* Read the PUD entry at @pudp with a single READ_ONCE() access and return it. */
static inline pud_t pudp_get(pud_t *pudp)
{
        pud_t entry = READ_ONCE(*pudp);

        return entry;
}
#endif

#ifndef p4dp_get
/* Read the P4D entry at @p4dp with a single READ_ONCE() access and return it. */
static inline p4d_t p4dp_get(p4d_t *p4dp)
{
        p4d_t entry = READ_ONCE(*p4dp);

        return entry;
}
#endif

#ifndef pgdp_get
/* Read the PGD entry at @pgdp with a single READ_ONCE() access and return it. */
static inline pgd_t pgdp_get(pgd_t *pgdp)
{
        pgd_t entry = READ_ONCE(*pgdp);

        return entry;
}
#endif

#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
/*
 * Generic test-and-clear of the young (accessed) state of a PTE.
 *
 * Returns 1 if the PTE at @ptep was young, in which case it is rewritten
 * with pte_mkold() applied; returns 0 and leaves the PTE untouched
 * otherwise.  The read and the write-back are separate steps.
 */
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pte_t *ptep)
{
        pte_t cur = ptep_get(ptep);

        if (!pte_young(cur))
                return 0;

        set_pte_at(vma->vm_mm, address, ptep, pte_mkold(cur));
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
/*
 * Generic test-and-clear of the young (accessed) state of a PMD entry.
 *
 * Returns 1 if the entry at @pmdp was young, in which case it is rewritten
 * with pmd_mkold() applied; returns 0 and leaves the entry untouched
 * otherwise.  The read and the write-back are separate steps.
 */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        pmd_t cur = *pmdp;

        if (!pmd_young(cur))
                return 0;

        set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(cur));
        return 1;
}
#else
/*
 * Stub for configurations with neither CONFIG_TRANSPARENT_HUGEPAGE nor
 * CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.  It is not meant to be reachable:
 * BUILD_BUG() is expected to break the build if a call to it survives
 * compilation.  The return value only satisfies the signature.
 */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);
#else
/*
 * Despite being relevant to THP only, this API is called from generic rmap code
 * under PageTransHuge(), hence needs a dummy implementation for !THP
 */
static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                         unsigned long address, pmd_t *pmdp)
{
        /*
         * Expected to break the build if a call to this !THP stub is not
         * optimized away; the return value only satisfies the signature.
         */
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef arch_has_hw_nonleaf_pmd_young
/*
 * Return whether the accessed bit in non-leaf PMD entries is supported on the
 * local CPU.
 *
 * This generic version simply reports whether
 * CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG is enabled in the build.
 */
static inline bool arch_has_hw_nonleaf_pmd_young(void)
{
        return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
}
#endif

#ifndef arch_has_hw_pte_young
/*
 * Return whether the accessed bit is supported on the local CPU.
 *
 * This stub assumes accessing through an old PTE triggers a page fault.
 * Architectures that automatically set the access bit should override it.
 *
 * This generic version simply reports whether CONFIG_ARCH_HAS_HW_PTE_YOUNG
 * is enabled in the build.
 */
static inline bool arch_has_hw_pte_young(void)
{
        return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG);
}
#endif

#ifndef arch_check_zapped_pte
/*
 * Architecture hook taking a VMA and a PTE value; this generic version is
 * an empty no-op used when the architecture does not define its own.
 */
static inline void arch_check_zapped_pte(struct vm_area_struct *vma,
                                         pte_t pte)
{
}
#endif

#ifndef arch_check_zapped_pmd
/*
 * Architecture hook taking a VMA and a PMD value; this generic version is
 * an empty no-op used when the architecture does not define its own.
 */
static inline void arch_check_zapped_pmd(struct vm_area_struct *vma,
                                         pmd_t pmd)
{
}
#endif

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * Generic get-and-clear: read the PTE at @ptep, clear it with pte_clear(),
 * report the cleared value to the page table checker, and return the value
 * that was read.  The read and the clear are separate steps.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
                                       unsigned long address,
                                       pte_t *ptep)
{
        pte_t old_pte = ptep_get(ptep);

        pte_clear(mm, address, ptep);
        page_table_check_pte_clear(mm, old_pte);

        return old_pte;
}
#endif

#ifndef clear_young_dirty_ptes
/**
 * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the
 *                same folio as old/clean.
 * @vma: VMA whose address space (vma->vm_mm) the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to mark old/clean.
 * @flags: Flags to modify the PTE batch semantics.
 *
 * May be overridden by the architecture.  This generic version handles each
 * PTE in turn: when @flags is exactly CYDP_CLEAR_YOUNG it uses
 * ptep_test_and_clear_young(); otherwise it does get_and_clear/modify/set.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
                                          unsigned long addr, pte_t *ptep,
                                          unsigned int nr, cydp_t flags)
{
        /* ptep/addr only advance when another entry remains to be handled. */
        for (;; ptep++, addr += PAGE_SIZE) {
                if (flags == CYDP_CLEAR_YOUNG) {
                        ptep_test_and_clear_young(vma, addr, ptep);
                } else {
                        pte_t entry;

                        entry = ptep_get_and_clear(vma->vm_mm, addr, ptep);
                        if (flags & CYDP_CLEAR_YOUNG)
                                entry = pte_mkold(entry);
                        if (flags & CYDP_CLEAR_DIRTY)
                                entry = pte_mkclean(entry);
                        set_pte_at(vma->vm_mm, addr, ptep, entry);
                }

                if (--nr == 0)
                        break;
        }
}
#endif

/* Clear the PTE at @ptep via ptep_get_and_clear(), discarding the old value. */
static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
{
        (void)ptep_get_and_clear(mm, addr, ptep);
}

#ifdef CONFIG_GUP_GET_PXX_LOW_HIGH
/*
 * For walking the pagetables without holding any locks.  Some architectures
 * (eg x86-32 PAE) cannot load the entries atomically without using expensive
 * instructions.  We are guaranteed that a PTE will only either go from not
 * present to present, or present to not present -- it will not switch to a
 * completely different present page without a TLB flush in between; which we
 * are blocking by holding interrupts off.
 *
 * Setting ptes from not present to present goes:
 *
 *   ptep->pte_high = h;
 *   smp_wmb();
 *   ptep->pte_low = l;
 *
 * And present to not present goes:
 *
 *   ptep->pte_low = 0;
 *   smp_wmb();
 *   ptep->pte_high = 0;
 *
 * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
 * We load pte_high *after* loading pte_low, which ensures we don't see an older
 * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
 * picked up a changed pte high. We might have gotten rubbish values from
 * pte_low and pte_high, but we are guaranteed that pte_low will not have the
 * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
 * operates on present ptes we're safe.
 */
/*
 * Lockless read of a PTE that is stored as two halves (pte_low / pte_high).
 * Returns a snapshot whose low half was the same before and after the high
 * half was read.
 */
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
        pte_t pte;

        do {
                /* Low half first ... */
                pte.pte_low = ptep->pte_low;
                smp_rmb();
                /* ... then the high half, ordered after the low read. */
                pte.pte_high = ptep->pte_high;
                smp_rmb();
                /* Retry if the low half changed while we were reading. */
        } while (unlikely(pte.pte_low != ptep->pte_low));

        return pte;
}
#define ptep_get_lockless ptep_get_lockless

#if CONFIG_PGTABLE_LEVELS > 2
/*
 * Lockless read of a PMD entry that is stored as two halves (pmd_low /
 * pmd_high).  Returns a snapshot whose low half was the same before and
 * after the high half was read.
 */
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
        pmd_t pmd;

        do {
                /* Low half first ... */
                pmd.pmd_low = pmdp->pmd_low;
                smp_rmb();
                /* ... then the high half, ordered after the low read. */
                pmd.pmd_high = pmdp->pmd_high;
                smp_rmb();
                /* Retry if the low half changed while we were reading. */
        } while (unlikely(pmd.pmd_low != pmdp->pmd_low));

        return pmd;
}
#define pmdp_get_lockless pmdp_get_lockless
#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */

/*
 * We require that the PTE can be read atomically.
 */
#ifndef ptep_get_lockless
/* Generic lockless PTE read: nothing more than ptep_get(). */
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
        pte_t entry = ptep_get(ptep);

        return entry;
}
#endif

#ifndef pmdp_get_lockless
/* Generic lockless PMD read: nothing more than pmdp_get(). */
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
        pmd_t entry = pmdp_get(pmdp);

        return entry;
}
/*
 * No-op companion of the generic pmdp_get_lockless() above; both are only
 * defined when no other pmdp_get_lockless has been provided.
 */
static inline void pmdp_get_lockless_sync(void)
{
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
/*
 * Generic get-and-clear for a PMD entry: read *@pmdp, clear it with
 * pmd_clear(), report the cleared value to the page table checker, and
 * return the value that was read.  @address is unused here.
 */
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        pmd_t old_pmd = *pmdp;

        pmd_clear(pmdp);
        page_table_check_pmd_clear(mm, old_pmd);
        return old_pmd;
}
#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
/*
 * Generic get-and-clear for a PUD entry: read *@pudp, clear it with
 * pud_clear(), report the cleared value to the page table checker, and
 * return the value that was read.  @address is unused here.
 */
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address,
                                            pud_t *pudp)
{
        pud_t old_pud = *pudp;

        pud_clear(pudp);
        page_table_check_pud_clear(mm, old_pud);
        return old_pud;
}
#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
/*
 * Generic "full" variant: @full is ignored and the call is forwarded to
 * pmdp_huge_get_and_clear() on @vma's mm.
 */
static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                            unsigned long address, pmd_t *pmdp,
                                            int full)
{
        struct mm_struct *mm = vma->vm_mm;

        return pmdp_huge_get_and_clear(mm, address, pmdp);
}
#endif

#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
/*
 * Generic "full" variant: @full is ignored and the call is forwarded to
 * pudp_huge_get_and_clear() on @vma's mm.
 */
static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                            unsigned long address, pud_t *pudp,
                                            int full)
{
        struct mm_struct *mm = vma->vm_mm;

        return pudp_huge_get_and_clear(mm, address, pudp);
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
/*
 * Generic "full" variant: @full is ignored and the call is forwarded to
 * ptep_get_and_clear().
 */
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long address, pte_t *ptep,
                                            int full)
{
        pte_t old_pte = ptep_get_and_clear(mm, address, ptep);

        return old_pte;
}
#endif

#ifndef get_and_clear_full_ptes
/**
 * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of
 *                             the same folio, collecting dirty/accessed bits.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * May be overridden by the architecture.  This generic version calls
 * ptep_get_and_clear_full() on each entry and folds the dirty and accessed
 * state of the later entries into the value of the first one.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 *
 * Return: The first PTE of the range, marked dirty and/or young if any
 * later PTE in the range was.
 */
static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
                unsigned long addr, pte_t *ptep, unsigned int nr, int full)
{
        pte_t first = ptep_get_and_clear_full(mm, addr, ptep, full);

        for (--nr; nr != 0; --nr) {
                pte_t cur;

                ptep++;
                addr += PAGE_SIZE;
                cur = ptep_get_and_clear_full(mm, addr, ptep, full);

                /* Accumulate dirty/accessed state into the returned PTE. */
                if (pte_dirty(cur))
                        first = pte_mkdirty(first);
                if (pte_young(cur))
                        first = pte_mkyoung(first);
        }

        return first;
}
#endif

#ifndef clear_full_ptes
/**
 * clear_full_ptes - Clear present PTEs that map consecutive pages of the same
 *                     folio.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * May be overridden by the architecture.  This generic version calls
 * ptep_get_and_clear_full() on each entry and discards the returned values.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, unsigned int nr, int full)
{
        /* ptep/addr only advance when another entry remains to be cleared. */
        for (;; ptep++, addr += PAGE_SIZE) {
                ptep_get_and_clear_full(mm, addr, ptep, full);

                if (--nr == 0)
                        break;
        }
}
#endif

/*
 * If two threads concurrently fault at the same page, the thread that
 * won the race updates the PTE and its local TLB/Cache. The other thread
 * gives up, simply does nothing, and continues; on architectures where
 * software can update the TLB, the local TLB can be updated here to avoid
 * the next page fault. This function updates the TLB only and does nothing
 * with the cache or anything else; that is how it differs from
 * update_mmu_cache().
 */
#ifndef __HAVE_ARCH_UPDATE_MMU_TLB
/* Generic fallback: empty body; none of the arguments are used. */
static inline void update_mmu_tlb(struct vm_area_struct *vma,
                                unsigned long address, pte_t *ptep)
{
}
#define __HAVE_ARCH_UPDATE_MMU_TLB
#endif

/*
 * Some architectures may be able to avoid expensive synchronization
 * primitives when modifications are made to PTE's which are already
 * not present, or in the process of an address space destruction.
 */
#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
/*
 * Generic fallback: forwards to pte_clear(). @full is accepted but not
 * used by this version.
 */
static inline void pte_clear_not_present_full(struct mm_struct *mm,
                                              unsigned long address,
                                              pte_t *ptep,
                                              int full)
{
        pte_clear(mm, address, ptep);
}
#endif

#ifndef clear_not_present_full_ptes
/**
 * clear_not_present_full_ptes - Clear multiple not present PTEs which are
 *                                 consecutive in the pgtable.
 * @mm: Address space the ptes represent.
 * @addr: Address of the first pte.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over pte_clear_not_present_full().
 *
 * Context: The caller holds the page table lock.  The PTEs are all not present.
 * The PTEs are all in the same PMD.
 */
static inline void clear_not_present_full_ptes(struct mm_struct *mm,
                unsigned long addr, pte_t *ptep, unsigned int nr, int full)
{
        /*
         * Clear-then-test: an entry is always cleared before @nr is
         * examined, and the cursor only advances while entries remain.
         */
        while (1) {
                pte_clear_not_present_full(mm, addr, ptep, full);

                nr--;
                if (!nr)
                        return;

                addr += PAGE_SIZE;
                ptep++;
        }
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pmd_t *pmdp);
extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pud_t *pudp);
#endif

#ifndef pte_mkwrite
/* Generic fallback: @vma is not consulted; defers to pte_mkwrite_novma(). */
static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        return pte_mkwrite_novma(pte);
}
#endif

#if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite)
/* Generic fallback: @vma is not consulted; defers to pmd_mkwrite_novma(). */
static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        return pmd_mkwrite_novma(pmd);
}
#endif

#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
struct mm_struct;
/*
 * Generic fallback: read the entry, strip write permission from the copy and
 * store it back with set_pte_at(). The read and the write are separate steps.
 */
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
        pte_t entry = ptep_get(ptep);

        entry = pte_wrprotect(entry);
        set_pte_at(mm, address, ptep, entry);
}
#endif

#ifndef wrprotect_ptes
/**
 * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
 *                    folio.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to write-protect.
 *
 * May be overridden by the architecture; otherwise, implemented as a simple
 * loop over ptep_set_wrprotect().
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, unsigned int nr)
{
        /*
         * Protect-then-test: an entry is always write-protected before @nr
         * is examined, and the cursor only advances while entries remain.
         */
        while (1) {
                ptep_set_wrprotect(mm, addr, ptep);

                nr--;
                if (!nr)
                        return;

                addr += PAGE_SIZE;
                ptep++;
        }
}
#endif

/*
 * On some architectures the hardware does not set the page access bit when a
 * memory page is accessed; it is the responsibility of software to set this
 * bit. Tracking the page access bit this way incurs an extra page fault
 * penalty. As an optimization, the access bit can be set in all page fault
 * paths on these architectures. To differentiate it from pte_mkyoung, this
 * macro is used on platforms where software maintains the page access bit.
 */
#ifndef pte_sw_mkyoung
/* Generic fallback: returns @pte unchanged. */
static inline pte_t pte_sw_mkyoung(pte_t pte)
{
        return pte;
}
#define pte_sw_mkyoung        pte_sw_mkyoung
#endif

#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Generic fallback: plain read of the PMD, strip write permission from the
 * copy and store it back with set_pmd_at().
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
{
        pmd_t entry = *pmdp;

        entry = pmd_wrprotect(entry);
        set_pmd_at(mm, address, pmdp, entry);
}
#else
/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE the body is just BUILD_BUG(), which is
 * expected to fail the build if a call to this stub is not compiled out.
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
{
        BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Generic fallback: plain read of the PUD, strip write permission from the
 * copy and store it back with set_pud_at().
 */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pud_t *pudp)
{
        pud_t entry = *pudp;

        entry = pud_wrprotect(entry);
        set_pud_at(mm, address, pudp, entry);
}
#else
/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE the body is just BUILD_BUG(), which is
 * expected to fail the build if a call to this stub is not compiled out.
 */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pud_t *pudp)
{
        BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#endif

#ifndef pmdp_collapse_flush
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* With THP enabled only the prototype is provided here. */
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
#else
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE stub: BUILD_BUG() comes first and is expected
 * to fail the build if a call is not compiled out; the return statement
 * presumably only exists to satisfy the function's return type.
 */
static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pmd_t *pmdp)
{
        BUILD_BUG();
        return *pmdp;
}
#define pmdp_collapse_flush pmdp_collapse_flush
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                       pgtable_t pgtable);
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
#endif

#ifndef arch_needs_pgtable_deposit
#define arch_needs_pgtable_deposit() (false)
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * This is an implementation of pmdp_establish() that is only suitable for an
 * architecture that doesn't have hardware dirty/accessed bits. In this case we
 * can't race with CPU which sets these bits and non-atomic approach is fine.
 */
static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        /* Snapshot the entry being replaced so it can be handed back. */
        pmd_t previous = *pmdp;

        /* Separate read and write: this is the non-atomic approach. */
        set_pmd_at(vma->vm_mm, address, pmdp, pmd);

        return previous;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp);
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD

/*
 * pmdp_invalidate_ad() invalidates the PMD while changing a transparent
 * hugepage mapping in the page tables. This function is similar to
 * pmdp_invalidate(), but should only be used if the access and dirty bits would
 * not be cleared by the software in the new PMD value. The function ensures
 * that hardware updates of the access and dirty bits are not lost.
 *
 * Doing so allows certain architectures to avoid a TLB flush in most
 * cases. Yet, another TLB flush might be necessary later if the PMD update
 * itself requires such a flush (e.g., if protection was set to be stricter).
 * Even when a TLB flush is needed because of the update, the caller may be
 * able to batch these TLB flushing operations, so fewer TLB flush operations
 * are needed.
 */
extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
                                unsigned long address, pmd_t *pmdp);
#endif

#ifndef __HAVE_ARCH_PTE_SAME
/* Generic fallback: non-zero iff the pte_val() of both entries is equal. */
static inline int pte_same(pte_t pte_a, pte_t pte_b)
{
        return pte_val(pte_a) == pte_val(pte_b);
}
#endif

#ifndef __HAVE_ARCH_PTE_UNUSED
/*
 * Some architectures provide facilities to virtualization guests
 * so that they can flag allocated pages as unused. This allows the
 * host to transparently reclaim unused pages. This function returns
 * whether the pte's page is unused.
 */
static inline int pte_unused(pte_t pte)
{
        /* Generic fallback: @pte is not inspected; always answers "no". */
        return 0;
}
#endif

/*
 * Generic p?d_access_permitted() fallbacks. Each one is true when the entry
 * is present and, if @write is non-zero, the entry is also writable according
 * to the matching p?d_write(). These are macros: the entry argument can be
 * evaluated twice.
 */
#ifndef pte_access_permitted
#define pte_access_permitted(pte, write) \
        (pte_present(pte) && (!(write) || pte_write(pte)))
#endif

#ifndef pmd_access_permitted
#define pmd_access_permitted(pmd, write) \
        (pmd_present(pmd) && (!(write) || pmd_write(pmd)))
#endif

#ifndef pud_access_permitted
#define pud_access_permitted(pud, write) \
        (pud_present(pud) && (!(write) || pud_write(pud)))
#endif

#ifndef p4d_access_permitted
#define p4d_access_permitted(p4d, write) \
        (p4d_present(p4d) && (!(write) || p4d_write(p4d)))
#endif

#ifndef pgd_access_permitted
#define pgd_access_permitted(pgd, write) \
        (pgd_present(pgd) && (!(write) || pgd_write(pgd)))
#endif

#ifndef __HAVE_ARCH_PMD_SAME
/* Generic fallback: non-zero iff the pmd_val() of both entries is equal. */
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
        return pmd_val(pmd_a) == pmd_val(pmd_b);
}
#endif

#ifndef pud_same
/* Generic fallback: non-zero iff the pud_val() of both entries is equal. */
static inline int pud_same(pud_t pud_a, pud_t pud_b)
{
        return pud_val(pud_a) == pud_val(pud_b);
}
#define pud_same pud_same
#endif

#ifndef __HAVE_ARCH_P4D_SAME
/* Generic fallback: non-zero iff the p4d_val() of both entries is equal. */
static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b)
{
        return p4d_val(p4d_a) == p4d_val(p4d_b);
}
#endif

#ifndef __HAVE_ARCH_PGD_SAME
/* Generic fallback: non-zero iff the pgd_val() of both entries is equal. */
static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
{
        return pgd_val(pgd_a) == pgd_val(pgd_b);
}
#endif

/*
 * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
 * TLB flush will be required as a result of the "set". For example, use
 * in scenarios where it is known ahead of time that the routine is
 * setting non-present entries, or re-setting an existing entry to the
 * same value. Otherwise, use the typical "set" helpers and flush the
 * TLB.
 */
/*
 * set_pte_safe - store @pte at @ptep, warning once if that replaces a present
 * entry with a different value.
 *
 * The arguments are parenthesised in the expansion so that a pointer
 * expression such as "base + i" is dereferenced as a whole. They are still
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define set_pte_safe(ptep, pte) \
({ \
        WARN_ON_ONCE(pte_present(*(ptep)) && !pte_same(*(ptep), (pte))); \
        set_pte((ptep), (pte)); \
})

/*
 * set_pmd_safe - store @pmd at @pmdp, warning once if that replaces a present
 * entry with a different value.
 *
 * The arguments are parenthesised in the expansion so that a pointer
 * expression such as "base + i" is dereferenced as a whole. They are still
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define set_pmd_safe(pmdp, pmd) \
({ \
        WARN_ON_ONCE(pmd_present(*(pmdp)) && !pmd_same(*(pmdp), (pmd))); \
        set_pmd((pmdp), (pmd)); \
})

/*
 * set_pud_safe - store @pud at @pudp, warning once if that replaces a present
 * entry with a different value.
 *
 * The arguments are parenthesised in the expansion so that a pointer
 * expression such as "base + i" is dereferenced as a whole. They are still
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define set_pud_safe(pudp, pud) \
({ \
        WARN_ON_ONCE(pud_present(*(pudp)) && !pud_same(*(pudp), (pud))); \
        set_pud((pudp), (pud)); \
})

/*
 * set_p4d_safe - store @p4d at @p4dp, warning once if that replaces a present
 * entry with a different value.
 *
 * The arguments are parenthesised in the expansion so that a pointer
 * expression such as "base + i" is dereferenced as a whole. They are still
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define set_p4d_safe(p4dp, p4d) \
({ \
        WARN_ON_ONCE(p4d_present(*(p4dp)) && !p4d_same(*(p4dp), (p4d))); \
        set_p4d((p4dp), (p4d)); \
})

/*
 * set_pgd_safe - store @pgd at @pgdp, warning once if that replaces a present
 * entry with a different value.
 *
 * The arguments are parenthesised in the expansion so that a pointer
 * expression such as "base + i" is dereferenced as a whole. They are still
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define set_pgd_safe(pgdp, pgd) \
({ \
        WARN_ON_ONCE(pgd_present(*(pgdp)) && !pgd_same(*(pgdp), (pgd))); \
        set_pgd((pgdp), (pgd)); \
})

#ifndef __HAVE_ARCH_DO_SWAP_PAGE
/*
 * Some architectures support metadata associated with a page. When a
 * page is being swapped out, this metadata must be saved so it can be
 * restored when the page is swapped back in. SPARC M7 and newer
 * processors support an ADI (Application Data Integrity) tag for the
 * page as metadata for the page. arch_do_swap_page() can restore this
 * metadata when a page is swapped back in.
 */
static inline void arch_do_swap_page(struct mm_struct *mm,
                                     struct vm_area_struct *vma,
                                     unsigned long addr,
                                     pte_t pte, pte_t oldpte)
{
        /* Generic fallback: intentionally empty; no argument is used. */

}
#endif

#ifndef __HAVE_ARCH_UNMAP_ONE
/*
 * Some architectures support metadata associated with a page. When a
 * page is being swapped out, this metadata must be saved so it can be
 * restored when the page is swapped back in. SPARC M7 and newer
 * processors support an ADI (Application Data Integrity) tag for the
 * page as metadata for the page. arch_unmap_one() can save this
 * metadata on a swap-out of a page.
 */
static inline int arch_unmap_one(struct mm_struct *mm,
                                  struct vm_area_struct *vma,
                                  unsigned long addr,
                                  pte_t orig_pte)
{
        /* Generic fallback: nothing is saved; always returns 0. */
        return 0;
}
#endif

/*
 * Allow architectures to preserve additional metadata associated with
 * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function
 * prototypes must be defined in the arch-specific asm/pgtable.h file.
 */
#ifndef __HAVE_ARCH_PREPARE_TO_SWAP
/* Generic fallback: @folio is not touched; always returns 0. */
static inline int arch_prepare_to_swap(struct folio *folio)
{
        return 0;
}
#endif

#ifndef __HAVE_ARCH_SWAP_INVALIDATE
/* Generic fallback: empty body; @type and @offset are ignored. */
static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
{
}

/* Generic fallback: empty body; @type is ignored. */
static inline void arch_swap_invalidate_area(int type)
{
}
#endif

#ifndef __HAVE_ARCH_SWAP_RESTORE
/* Generic fallback: empty body; @entry and @folio are ignored. */
static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
{
}
#endif

/* Generic fallback: identical to pgd_offset(). */
#ifndef __HAVE_ARCH_PGD_OFFSET_GATE
#define pgd_offset_gate(mm, addr)        pgd_offset(mm, addr)
#endif

/* Generic fallback: yields @pte unchanged; the two addresses are ignored. */
#ifndef __HAVE_ARCH_MOVE_PTE
#define move_pte(pte, old_addr, new_addr)        (pte)
#endif

/*
 * Generic fallback: always 1. @pte is evaluated and discarded through the
 * (void) cast; @mm is not evaluated.
 */
#ifndef pte_accessible
# define pte_accessible(mm, pte)        ((void)(pte), 1)
#endif

/* Generic fallback: flush the single page via flush_tlb_page(); @ptep is unused. */
#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
#endif

/*
 * When walking page tables, get the address of the next boundary,
 * or the end address of the range if that comes earlier.  Although no
 * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
 */

/*
 * Next PGDIR boundary after @addr, or @end if that comes first. Both sides of
 * the comparison are reduced by one so that a boundary which wrapped to 0
 * compares as the largest value and @end is chosen.
 */
#define pgd_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;  \
                                                                        \
        (__boundary - 1 >= (end) - 1) ? (end) : __boundary;             \
})

#ifndef p4d_addr_end
/*
 * Next P4D boundary after @addr, or @end if that comes first. Both sides of
 * the comparison are reduced by one so that a boundary which wrapped to 0
 * compares as the largest value and @end is chosen.
 */
#define p4d_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK;      \
                                                                        \
        (__boundary - 1 >= (end) - 1) ? (end) : __boundary;             \
})
#endif

#ifndef pud_addr_end
/*
 * Next PUD boundary after @addr, or @end if that comes first. Both sides of
 * the comparison are reduced by one so that a boundary which wrapped to 0
 * compares as the largest value and @end is chosen.
 */
#define pud_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK;      \
                                                                        \
        (__boundary - 1 >= (end) - 1) ? (end) : __boundary;             \
})
#endif

#ifndef pmd_addr_end
/*
 * Next PMD boundary after @addr, or @end if that comes first. Both sides of
 * the comparison are reduced by one so that a boundary which wrapped to 0
 * compares as the largest value and @end is chosen.
 */
#define pmd_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;      \
                                                                        \
        (__boundary - 1 >= (end) - 1) ? (end) : __boundary;             \
})
#endif

/*
 * When walking page tables, we usually want to skip any p?d_none entries;
 * and any p?d_bad entries - reporting the error before resetting to none.
 * Do the tests inline, but report and clear the bad entry in mm/memory.c.
 */
void pgd_clear_bad(pgd_t *);

#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *);
#else
/* P4D level folded: expands to an empty statement; the argument is not evaluated. */
#define p4d_clear_bad(p4d)        do { } while (0)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *);
#else
/* PUD level folded: expands to an empty statement; the argument is not evaluated. */
#define pud_clear_bad(pud)        do { } while (0)
#endif

void pmd_clear_bad(pmd_t *);

/*
 * Returns 1 when the entry should be skipped (it is none, or it was bad and
 * has been passed to pgd_clear_bad()), 0 when it is usable.
 */
static inline int pgd_none_or_clear_bad(pgd_t *pgd)
{
        /* Nothing populated here. */
        if (pgd_none(*pgd))
                return 1;

        /* Expected case: the entry is not bad. */
        if (!unlikely(pgd_bad(*pgd)))
                return 0;

        pgd_clear_bad(pgd);
        return 1;
}

/*
 * Returns 1 when the entry should be skipped (it is none, or it was bad and
 * has been passed to p4d_clear_bad()), 0 when it is usable.
 */
static inline int p4d_none_or_clear_bad(p4d_t *p4d)
{
        /* Nothing populated here. */
        if (p4d_none(*p4d))
                return 1;

        /* Expected case: the entry is not bad. */
        if (!unlikely(p4d_bad(*p4d)))
                return 0;

        p4d_clear_bad(p4d);
        return 1;
}

/*
 * Returns 1 when the entry should be skipped (it is none, or it was bad and
 * has been passed to pud_clear_bad()), 0 when it is usable.
 */
static inline int pud_none_or_clear_bad(pud_t *pud)
{
        /* Nothing populated here. */
        if (pud_none(*pud))
                return 1;

        /* Expected case: the entry is not bad. */
        if (!unlikely(pud_bad(*pud)))
                return 0;

        pud_clear_bad(pud);
        return 1;
}

/*
 * Returns 1 when the entry should be skipped (it is none, or it was bad and
 * has been passed to pmd_clear_bad()), 0 when it is usable.
 */
static inline int pmd_none_or_clear_bad(pmd_t *pmd)
{
        /* Nothing populated here. */
        if (pmd_none(*pmd))
                return 1;

        /* Expected case: the entry is not bad. */
        if (!unlikely(pmd_bad(*pmd)))
                return 0;

        pmd_clear_bad(pmd);
        return 1;
}

/*
 * Clears the PTE at @ptep in @vma's mm through ptep_get_and_clear() and
 * returns whatever that call hands back.
 */
static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
                                             unsigned long addr,
                                             pte_t *ptep)
{
        /*
         * Get the current pte state, but zero it out to make it
         * non-present, preventing the hardware from asynchronously
         * updating it.
         */
        return ptep_get_and_clear(vma->vm_mm, addr, ptep);
}

/* Writes @pte to @ptep in @vma's mm with set_pte_at(). */
static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
                                             unsigned long addr,
                                             pte_t *ptep, pte_t pte)
{
        /*
         * The pte is non-present, so there's no hardware state to
         * preserve.
         */
        set_pte_at(vma->vm_mm, addr, ptep, pte);
}

#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
/*
 * Start a pte protection read-modify-write transaction, which
 * protects against asynchronous hardware modifications to the pte.
 * The intention is not to prevent the hardware from making pte
 * updates, but to prevent any updates it may make from being lost.
 *
 * This does not protect against other software modifications of the
 * pte; the appropriate pte lock must be held over the transaction.
 *
 * Note that this interface is intended to be batchable, meaning that
 * ptep_modify_prot_commit may not actually update the pte, but merely
 * queue the update to be done at some later time.  The update must be
 * actually committed before the pte lock is released, however.
 */
static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           pte_t *ptep)
{
        /* Generic version: a straight pass-through to the __ helper. */
        return __ptep_modify_prot_start(vma, addr, ptep);
}

/*
 * Commit an update to a pte, leaving any hardware-controlled bits in
 * the PTE unmodified.
 */
static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           pte_t *ptep, pte_t old_pte, pte_t pte)
{
        /* Generic version: @old_pte is not used; only @pte is written. */
        __ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
#endif /* CONFIG_MMU */

/*
 * No-op macros that just return the current protection value. Defined here
 * because these macros can be used even if CONFIG_MMU is not defined.
 */

/* Fallback: identity. */
#ifndef pgprot_nx
#define pgprot_nx(prot)        (prot)
#endif

/* Fallback: identity. */
#ifndef pgprot_noncached
#define pgprot_noncached(prot)        (prot)
#endif

/* Fallback: alias of pgprot_noncached. */
#ifndef pgprot_writecombine
#define pgprot_writecombine pgprot_noncached
#endif

/* Fallback: alias of pgprot_noncached. */
#ifndef pgprot_writethrough
#define pgprot_writethrough pgprot_noncached
#endif

/* Fallback: alias of pgprot_noncached. */
#ifndef pgprot_device
#define pgprot_device pgprot_noncached
#endif

/* Fallback: identity. */
#ifndef pgprot_mhp
#define pgprot_mhp(prot)        (prot)
#endif

#ifdef CONFIG_MMU
#ifndef pgprot_modify
#define pgprot_modify pgprot_modify
/*
 * Generic pgprot_modify(): returns @newprot after passing it through each of
 * pgprot_noncached(), pgprot_writecombine() and pgprot_device() that leaves
 * the value of @oldprot unchanged (presumably meaning @oldprot already has
 * that attribute). The three checks run in sequence, so more than one helper
 * may be applied to @newprot.
 */
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
        if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot)))
                newprot = pgprot_noncached(newprot);
        if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot)))
                newprot = pgprot_writecombine(newprot);
        if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot)))
                newprot = pgprot_device(newprot);
        return newprot;
}
#endif
#endif /* CONFIG_MMU */

/* Fallback: identity. */
#ifndef pgprot_encrypted
#define pgprot_encrypted(prot)        (prot)
#endif

/* Fallback: identity. */
#ifndef pgprot_decrypted
#define pgprot_decrypted(prot)        (prot)
#endif

/*
 * A facility to provide batching of the reload of page tables and
 * other process state with the actual context switch code for
 * paravirtualized guests.  By convention, only one of the batched
 * update (lazy) modes (CPU, MMU) should be active at any given time,
 * entry should never be nested, and entry and exits should always be
 * paired.  This is for sanity of maintaining and reasoning about the
 * kernel code.  In this case, the exit (end of the context switch) is
 * in architecture-specific code, and so doesn't need a generic
 * definition.
 */
#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
/* Generic fallback: an empty statement; @prev is not evaluated. */
#define arch_start_context_switch(prev)        do {} while (0)
#endif

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
/*
 * Soft-dirty is available but THP migration is not: the pmd swap-entry
 * helpers are no-ops. The mk/clear helpers return @pmd unchanged and the
 * test helper returns 0.
 */
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}
#endif
#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
/*
 * No arch soft-dirty support: every helper below is a no-op. The test
 * helpers (*_soft_dirty) return 0; the mk/clear helpers return their
 * argument unchanged.
 */
static inline int pte_soft_dirty(pte_t pte)
{
        return 0;
}

static inline int pmd_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pte_t pte_mksoft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
        return pte;
}

static inline int pte_swp_soft_dirty(pte_t pte)
{
        return 0;
}

static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}
#endif

#ifndef __HAVE_PFNMAP_TRACKING
/*
 * Interfaces that can be used by architecture code to keep track of
 * memory type of pfn mappings specified by the remap_pfn_range,
 * vmf_insert_pfn.
 */

/*
 * track_pfn_remap is called when a _new_ pfn mapping is being established
 * by remap_pfn_range() for physical range indicated by pfn and size.
 */
static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                                  unsigned long pfn, unsigned long addr,
                                  unsigned long size)
{
        /* !__HAVE_PFNMAP_TRACKING stub: @prot is left untouched; returns 0. */
        return 0;
}

/*
 * track_pfn_insert is called when a _new_ single pfn is established
 * by vmf_insert_pfn().
 */
static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                                    pfn_t pfn)
{
        /* !__HAVE_PFNMAP_TRACKING stub: empty; @prot is left untouched. */
}

/*
 * track_pfn_copy is called when vma that is covering the pfnmap gets
 * copied through copy_page_range().
 */
static inline int track_pfn_copy(struct vm_area_struct *vma)
{
        /* !__HAVE_PFNMAP_TRACKING stub: nothing to copy; returns 0. */
        return 0;
}

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn, size are zero).
 */
static inline void untrack_pfn(struct vm_area_struct *vma,
                               unsigned long pfn, unsigned long size,
                               bool mm_wr_locked)
{
        /* !__HAVE_PFNMAP_TRACKING stub: empty; all arguments are ignored. */
}

/*
 * untrack_pfn_clear is called while mremapping a pfnmap for a new region
 * or fails to copy pgtable during duplicate vm area.
 */
static inline void untrack_pfn_clear(struct vm_area_struct *vma)
{
        /* !__HAVE_PFNMAP_TRACKING stub: empty; @vma is ignored. */
}
#else
extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                           unsigned long pfn, unsigned long addr,
                           unsigned long size);
extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                             pfn_t pfn);
extern int track_pfn_copy(struct vm_area_struct *vma);
extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
                        unsigned long size, bool mm_wr_locked);
extern void untrack_pfn_clear(struct vm_area_struct *vma);
#endif

#ifdef CONFIG_MMU
#ifdef __HAVE_COLOR_ZERO_PAGE
/*
 * Coloured zero pages: @pfn matches when it lies within
 * (zero_page_mask >> PAGE_SHIFT) pages at or above zero_pfn. The subtraction
 * is unsigned, so a @pfn below zero_pfn wraps to a huge value and fails.
 */
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;

        return (pfn - zero_pfn) <= (zero_page_mask >> PAGE_SHIFT);
}

/* The zero page pfn is derived from ZERO_PAGE(@addr). */
#define my_zero_pfn(addr)        page_to_pfn(ZERO_PAGE(addr))

#else /* !__HAVE_COLOR_ZERO_PAGE */
/* Single zero page: a plain comparison against zero_pfn. */
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;

        return zero_pfn == pfn;
}

/* Single zero page: @addr is irrelevant, zero_pfn is returned as is. */
static inline unsigned long my_zero_pfn(unsigned long addr)
{
        extern unsigned long zero_pfn;

        return zero_pfn;
}
#endif /* __HAVE_COLOR_ZERO_PAGE */
#else /* !CONFIG_MMU */
/* !CONFIG_MMU: no pfn is ever reported as the zero pfn. */
static inline int is_zero_pfn(unsigned long pfn)
{
        return 0;
}

/* !CONFIG_MMU: always 0. */
static inline unsigned long my_zero_pfn(unsigned long addr)
{
        return 0;
}
#endif /* CONFIG_MMU */

#ifdef CONFIG_MMU

#ifndef CONFIG_TRANSPARENT_HUGEPAGE
/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always 0. */
static inline int pmd_trans_huge(pmd_t pmd)
{
        return 0;
}
#ifndef pmd_write
/*
 * Fallback when THP is off and no pmd_write is provided: calls BUG()
 * unconditionally. The return after it presumably only satisfies the
 * function's return type.
 */
static inline int pmd_write(pmd_t pmd)
{
        BUG();
        return 0;
}
#endif /* pmd_write */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef pud_write
/*
 * Fallback when no pud_write is provided: calls BUG() unconditionally. The
 * return after it presumably only satisfies the function's return type.
 */
static inline int pud_write(pud_t pud)
{
        BUG();
        return 0;
}
#endif /* pud_write */

#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * Stubs used unless both CONFIG_ARCH_HAS_PTE_DEVMAP and
 * CONFIG_TRANSPARENT_HUGEPAGE are set: each always returns 0.
 */
static inline int pmd_devmap(pmd_t pmd)
{
        return 0;
}
static inline int pud_devmap(pud_t pud)
{
        return 0;
}
static inline int pgd_devmap(pgd_t pgd)
{
        return 0;
}
#endif

#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
        !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/* Stub used when THP or arch PUD-level THP support is missing: always 0. */
static inline int pud_trans_huge(pud_t pud)
{
        return 0;
}
#endif

/*
 * Returns 1 when *@pud is none, a transparent huge or devmap entry, or bad
 * (in which case pud_clear_bad() is called first); 0 otherwise. Without
 * PUD-level THP support the checks are compiled out and 0 is returned.
 */
static inline int pud_trans_unstable(pud_t *pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        /* One READ_ONCE() snapshot so every test below sees the same value. */
        pud_t snapshot = READ_ONCE(*pud);

        if (pud_none(snapshot))
                return 1;
        if (pud_trans_huge(snapshot))
                return 1;
        if (pud_devmap(snapshot))
                return 1;

        if (unlikely(pud_bad(snapshot))) {
                pud_clear_bad(pud);
                return 1;
        }
#endif
        return 0;
}

#ifndef CONFIG_NUMA_BALANCING
/*
 * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is
 * perfectly valid to indicate "no" in that case, which is why our default
 * implementation defaults to "always no".
 *
 * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE
 * page protection due to NUMA hinting. NUMA hinting faults only apply in
 * accessible VMAs.
 *
 * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault,
 * looking at the VMA accessibility is sufficient.
 */
static inline int pte_protnone(pte_t pte)
{
        /* !CONFIG_NUMA_BALANCING default: "always no". */
        return 0;
}

static inline int pmd_protnone(pmd_t pmd)
{
        /* !CONFIG_NUMA_BALANCING default: "always no". */
        return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

#endif /* CONFIG_MMU */

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP

#ifndef __PAGETABLE_P4D_FOLDED
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot);
void p4d_clear_huge(p4d_t *p4d);
#else
/* P4D level folded: setting a huge P4D returns 0 and clearing is a no-op. */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline void p4d_clear_huge(p4d_t *p4d) { }
#endif /* !__PAGETABLE_P4D_FOLDED */

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pud_clear_huge(pud_t *pud);
int pmd_clear_huge(pmd_t *pmd);
int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
int pud_free_pmd_page(pud_t *pud, unsigned long addr);
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
#else        /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
/*
 * No arch huge-vmap support: every helper below is a stub. The int-returning
 * ones always return 0 and p4d_clear_huge() is empty; no argument is used.
 */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline void p4d_clear_huge(p4d_t *p4d) { }
static inline int pud_clear_huge(pud_t *pud)
{
        return 0;
}
static inline int pmd_clear_huge(pmd_t *pmd)
{
        return 0;
}
static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
        return 0;
}
static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        return 0;
}
static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        return 0;
}
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * ARCHes with special requirements for evicting THP backing TLB entries can
 * implement this. Otherwise also, it can help optimize normal TLB flush in
 * THP regime. Stock flush_tlb_range() typically has optimization to nuke the
 * entire TLB if flush span is greater than a threshold, which will
 * likely be true for a single huge page. Thus a single THP flush will
 * invalidate the entire TLB which is not desirable.
 * e.g. see arch/arc: flush_pmd_tlb_range
 */
/* Generic fallbacks: both simply map to flush_tlb_range(). */
#define flush_pmd_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#define flush_pud_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#else
/* Without THP, any surviving use expands to BUILD_BUG(). */
#define flush_pmd_tlb_range(vma, addr, end)        BUILD_BUG()
#define flush_pud_tlb_range(vma, addr, end)        BUILD_BUG()
#endif
#endif

struct file;
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
                        unsigned long size, pgprot_t *vma_prot);

#ifndef CONFIG_X86_ESPFIX64
/* Empty stub used when CONFIG_X86_ESPFIX64 is not set. */
static inline void init_espfix_bsp(void) { }
#endif

extern void __init pgtable_cache_init(void);

#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
/* Generic fallback: every @pfn/@prot combination is allowed. */
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
        return true;
}

/* Generic fallback: always false. */
static inline bool arch_has_pfn_modify_check(void)
{
        return false;
}
#endif /* !__HAVE_ARCH_PFN_MODIFY_ALLOWED */

/*
 * Architecture PAGE_KERNEL_* fallbacks
 *
 * Some architectures don't define certain PAGE_KERNEL_* flags. This is either
 * because they really don't support them, or the port needs to be updated to
 * reflect the required functionality. Below are a set of relatively safe
 * fallbacks, as best effort, which we can count on in lieu of the architectures
 * not defining them on their own yet.
 */

#ifndef PAGE_KERNEL_RO
# define PAGE_KERNEL_RO PAGE_KERNEL
#endif

#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif

/*
 * Page Table Modification bits for pgtbl_mod_mask.
 *
 * These are used by the p?d_alloc_track*() set of functions and in the
 * generic vmalloc/ioremap code to track at which page-table levels entries
 * have been modified. Based on that the code can better decide when vmalloc
 * and ioremap mapping changes need to be synchronized to other page-tables in
 * the system.
 */
/* Bit numbers, one per page-table level. */
#define                __PGTBL_PGD_MODIFIED        0
#define                __PGTBL_P4D_MODIFIED        1
#define                __PGTBL_PUD_MODIFIED        2
#define                __PGTBL_PMD_MODIFIED        3
#define                __PGTBL_PTE_MODIFIED        4

/* The corresponding single-bit masks, built with BIT() from the numbers above. */
#define                PGTBL_PGD_MODIFIED        BIT(__PGTBL_PGD_MODIFIED)
#define                PGTBL_P4D_MODIFIED        BIT(__PGTBL_P4D_MODIFIED)
#define                PGTBL_PUD_MODIFIED        BIT(__PGTBL_PUD_MODIFIED)
#define                PGTBL_PMD_MODIFIED        BIT(__PGTBL_PMD_MODIFIED)
#define                PGTBL_PTE_MODIFIED        BIT(__PGTBL_PTE_MODIFIED)

/* Page-Table Modification Mask */
typedef unsigned int pgtbl_mod_mask;

#endif /* !__ASSEMBLY__ */

/*
 * Only relevant for non-64-bit builds where the architecture has not
 * already provided MAX_POSSIBLE_PHYSMEM_BITS.
 */
#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
#ifdef CONFIG_PHYS_ADDR_T_64BIT
/*
 * ZSMALLOC needs to know the highest PFN on 32-bit architectures
 * with physical address space extension, but falls back to
 * BITS_PER_LONG otherwise.
 *
 * With a 64-bit phys_addr_t there is no safe default, so the build is
 * stopped and the architecture must supply the value.
 */
#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition
#else
/* No physical address extension: default to 32 bits. */
#define MAX_POSSIBLE_PHYSMEM_BITS 32
#endif
#endif

/*
 * Fallback when the architecture provides no has_transparent_hugepage():
 * evaluates IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE).
 */
#ifndef has_transparent_hugepage
#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
#endif

/*
 * Same fallback for PUD-sized huge pages, keyed on
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD.
 */
#ifndef has_transparent_pud_hugepage
#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
#endif
/*
 * On some architectures it depends on the mm if the p4d/pud or pmd
 * layer of the page table hierarchy is folded or not.
 */
/*
 * Generic fallbacks: the @mm argument is ignored and the answer depends
 * only on whether the matching __PAGETABLE_*_FOLDED macro is defined.
 */
#ifndef mm_p4d_folded
#define mm_p4d_folded(mm)        __is_defined(__PAGETABLE_P4D_FOLDED)
#endif

#ifndef mm_pud_folded
#define mm_pud_folded(mm)        __is_defined(__PAGETABLE_PUD_FOLDED)
#endif

#ifndef mm_pmd_folded
#define mm_pmd_folded(mm)        __is_defined(__PAGETABLE_PMD_FOLDED)
#endif

/*
 * Generic fallbacks for the *_offset_lockless() helpers.  The first
 * argument (the pointer to the upper-level entry) is ignored; the lookup is
 * done through the address of the entry value passed as the second
 * argument, using the regular p4d_offset()/pud_offset()/pmd_offset().
 */
#ifndef p4d_offset_lockless
#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
#endif
#ifndef pud_offset_lockless
#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
#endif
#ifndef pmd_offset_lockless
#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
#endif

/*
 * pXd_leaf() is the API to check whether a pgtable entry is a huge page
 * mapping.  It should work globally across all archs, without any
 * dependency on CONFIG_* options.  For architectures that do not support
 * huge mappings on specific levels, below fallbacks will be used.
 *
 * A leaf pgtable entry should always imply the following:
 *
 * - It is a "present" entry.  IOW, before using this API, please check it
 *   with pXd_present() first. NOTE: it may not always mean the "present
 *   bit" is set.  For example, PROT_NONE entries are always "present".
 *
 * - It should _never_ be a swap entry of any type.  Above "present" check
 *   should have guarded this, but let's be crystal clear on this.
 *
 * - It should contain a huge PFN, which points to a huge page larger than
 *   PAGE_SIZE of the platform.  The PFN format isn't important here.
 *
 * - It should cover all kinds of huge mappings (e.g., pXd_trans_huge(),
 *   pXd_devmap(), or hugetlb mappings).
 */
/*
 * Fallbacks for levels at which the architecture defines no leaf check:
 * the entry is never reported as a leaf.  The argument is not evaluated.
 */
#ifndef pgd_leaf
#define pgd_leaf(x)        false
#endif
#ifndef p4d_leaf
#define p4d_leaf(x)        false
#endif
#ifndef pud_leaf
#define pud_leaf(x)        false
#endif
#ifndef pmd_leaf
#define pmd_leaf(x)        false
#endif

/*
 * Fallback leaf sizes: each level reports the full size mapped by one entry
 * at that level (PAGE_SIZE for a pte).  The argument is not evaluated.
 */
#ifndef pgd_leaf_size
#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT)
#endif
#ifndef p4d_leaf_size
#define p4d_leaf_size(x) P4D_SIZE
#endif
#ifndef pud_leaf_size
#define pud_leaf_size(x) PUD_SIZE
#endif
#ifndef pmd_leaf_size
#define pmd_leaf_size(x) PMD_SIZE
#endif
#ifndef pte_leaf_size
#define pte_leaf_size(x) PAGE_SIZE
#endif

/*
 * We always define pmd_pfn for all archs as it's used in lots of generic
 * code.  Now it happens too for pud_pfn (and can happen for larger
 * mappings too in the future; we're not there yet).  Instead of defining
 * it for all archs (like pmd_pfn), provide a fallback.
 *
 * Note that returning 0 here means any arch that didn't define this can
 * get severely wrong when it hits a real pud leaf.  It's arch's
 * responsibility to properly define it when a huge pud is possible.
 */
/* Placeholder fallback: always 0, the argument is not evaluated. */
#ifndef pud_pfn
#define pud_pfn(x) 0
#endif

/*
 * Some architectures have MMUs that are configurable or selectable at boot
 * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
 * helps to have a static maximum value.
 */

/*
 * When the architecture does not override them, the static maxima are
 * simply the corresponding PTRS_PER_* values.
 */
#ifndef MAX_PTRS_PER_PTE
#define MAX_PTRS_PER_PTE PTRS_PER_PTE
#endif

#ifndef MAX_PTRS_PER_PMD
#define MAX_PTRS_PER_PMD PTRS_PER_PMD
#endif

#ifndef MAX_PTRS_PER_PUD
#define MAX_PTRS_PER_PUD PTRS_PER_PUD
#endif

#ifndef MAX_PTRS_PER_P4D
#define MAX_PTRS_PER_P4D PTRS_PER_P4D
#endif

/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type        prot
 *                PROT_NONE        PROT_READ        PROT_WRITE        PROT_EXEC
 * MAP_SHARED        r: (no) no        r: (yes) yes        r: (no) yes        r: (no) yes
 *                w: (no) no        w: (no) no        w: (yes) yes        w: (no) no
 *                x: (no) no        x: (no) yes        x: (no) yes        x: (yes) yes
 *
 * MAP_PRIVATE        r: (no) no        r: (yes) yes        r: (no) yes        r: (no) yes
 *                w: (no) no        w: (no) no        w: (copy) copy        w: (no) no
 *                x: (no) no        x: (no) yes        x: (no) yes        x: (yes) yes
 *
 * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
 * MAP_PRIVATE (with Enhanced PAN supported):
 *                                                                r: (no) no
 *                                                                w: (no) no
 *                                                                x: (yes) yes
 */
/*
 * DECLARE_VM_GET_PAGE_PROT expands to the full definition of
 * vm_get_page_prot(): it masks vm_flags down to the VM_READ, VM_WRITE,
 * VM_EXEC and VM_SHARED bits, uses the result as an index into
 * protection_map[], and exports the symbol.  The expansion site must have
 * protection_map[] in scope.
 */
#define DECLARE_VM_GET_PAGE_PROT                                        \
pgprot_t vm_get_page_prot(unsigned long vm_flags)                        \
{                                                                        \
                return protection_map[vm_flags &                        \
                        (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];        \
}                                                                        \
EXPORT_SYMBOL(vm_get_page_prot);

#endif /* _LINUX_PGTABLE_H */


























































































    3 













    3 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  scsi.c Copyright (C) 1992 Drew Eckhardt
 *         Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale
 *         Copyright (C) 2002, 2003 Christoph Hellwig
 *
 *  generic mid-level SCSI driver
 *      Initial versions: Drew Eckhardt
 *      Subsequent revisions: Eric Youngdale
 *
 *  <drew@colorado.edu>
 *
 *  Bug correction thanks go to :
 *      Rik Faith <faith@cs.unc.edu>
 *      Tommy Thorn <tthorn>
 *      Thomas Wuensche <tw@fgb1.fgb.mw.tu-muenchen.de>
 *
 *  Modified by Eric Youngdale eric@andante.org or ericy@gnu.ai.mit.edu to
 *  add scatter-gather, multiple outstanding request, and other
 *  enhancements.
 *
 *  Native multichannel, wide scsi, /proc/scsi and hot plugging
 *  support added by Michael Neuffer <mike@i-connect.net>
 *
 *  Added request_module("scsi_hostadapter") for kerneld:
 *  (Put an "alias scsi_hostadapter your_hostadapter" in /etc/modprobe.conf)
 *  Bjorn Ekwall  <bj0rn@blox.se>
 *  (changed to kmod)
 *
 *  Major improvements to the timeout, abort, and reset processing,
 *  as well as performance modifications for large queue depths by
 *  Leonard N. Zubkoff <lnz@dandelion.com>
 *
 *  Converted cli() code to spinlocks, Ingo Molnar
 *
 *  Jiffies wrap fixes (host->resetting), 3 Dec 1998 Andrea Arcangeli
 *
 *  out_of_space hacks, D. Gilbert (dpg) 990608
 */

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/unistd.h>
#include <linux/spinlock.h>
#include <linux/kmod.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <asm/unaligned.h>

#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_dbg.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_driver.h>
#include <scsi/scsi_eh.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_tcq.h>

#include "scsi_priv.h"
#include "scsi_logging.h"

#define CREATE_TRACE_POINTS
#include <trace/events/scsi.h>

/*
 * Definitions and constants.
 */

/*
 * Note - the initial logging level can be set here to log events at boot time.
 * After the system is up, you may enable logging via the /proc interface.
 */
/*
 * Global logging level word.  Zero-initialised, so logging is off unless
 * a value is written here.  The symbol is exported only when
 * CONFIG_SCSI_LOGGING is enabled.
 */
unsigned int scsi_logging_level;
#if defined(CONFIG_SCSI_LOGGING)
EXPORT_SYMBOL(scsi_logging_level);
#endif

#ifdef CONFIG_SCSI_LOGGING
void scsi_log_send(struct scsi_cmnd *cmd)
{
        unsigned int mlqueue_level;

        /* Common case: no logging configured at all. */
        if (!unlikely(scsi_logging_level))
                return;

        /*
         * ML QUEUE log level:
         *
         * 0, 1: nothing (level 1 matches completion logging)
         *
         * 2 and above: log the command address followed by the
         *              opcode + command bytes of every command
         */
        mlqueue_level = SCSI_LOG_LEVEL(SCSI_LOG_MLQUEUE_SHIFT,
                                       SCSI_LOG_MLQUEUE_BITS);
        if (mlqueue_level <= 1)
                return;

        scmd_printk(KERN_INFO, cmd, "Send: scmd 0x%p\n", cmd);
        scsi_print_command(cmd);
}

void scsi_log_completion(struct scsi_cmnd *cmd, int disposition)
{
        unsigned int level;

        /*
         * If ML COMPLETE log level is greater than or equal to:
         *
         * 1: log disposition, result, opcode + command, and conditionally
         * sense data for failures or non SUCCESS dispositions.
         *
         * 2: same as 1 but for all command completions.
         *
         * 3: same as 2
         *
         * 4: same as 3 plus dump extra junk
         */
        if (unlikely(scsi_logging_level)) {
                level = SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT,
                                       SCSI_LOG_MLCOMPLETE_BITS);
                if (((level > 0) && (cmd->result || disposition != SUCCESS)) ||
                    (level > 1)) {
                        scsi_print_result(cmd, "Done", disposition);
                        scsi_print_command(cmd);
                        if (scsi_status_is_check_condition(cmd->result))
                                scsi_print_sense(cmd);
                        if (level > 3)
                                scmd_printk(KERN_INFO, cmd,
                                            "scsi host busy %d failed %d\n",
                                            scsi_host_busy(cmd->device->host),
                                            cmd->device->host->host_failed);
                }
        }
}
#endif

/**
 * scsi_finish_command - cleanup and pass command back to upper layer
 * @cmd: the command
 *
 * Description: Pass command off to upper layer for finishing of I/O
 *              request, waking processes that are waiting on results,
 *              etc.
 */
void scsi_finish_command(struct scsi_cmnd *cmd)
{
        struct scsi_device *sdev = cmd->device;
        struct scsi_target *starget = scsi_target(sdev);
        struct Scsi_Host *shost = sdev->host;
        struct scsi_driver *drv;
        unsigned int good_bytes;

        scsi_device_unbusy(sdev, cmd);

        /*
         * Clear the flags that say that the device/target/host is no longer
         * capable of accepting new commands.  Each counter is only written
         * when it is currently non-zero.
         */
        if (atomic_read(&shost->host_blocked))
                atomic_set(&shost->host_blocked, 0);
        if (atomic_read(&starget->target_blocked))
                atomic_set(&starget->target_blocked, 0);
        if (atomic_read(&sdev->device_blocked))
                atomic_set(&sdev->device_blocked, 0);

        SCSI_LOG_MLCOMPLETE(4, sdev_printk(KERN_INFO, sdev,
                                "Notifying upper driver of completion "
                                "(result %x)\n", cmd->result));

        /* Start from the full transfer length of the command. */
        good_bytes = scsi_bufflen(cmd);
        if (!blk_rq_is_passthrough(scsi_cmd_to_rq(cmd))) {
                /*
                 * Kept in the same unsigned type as good_bytes so the
                 * "did ->done() change anything" comparison below does not
                 * mix signed and unsigned operands.
                 */
                unsigned int old_good_bytes = good_bytes;

                /* Let the upper-level driver adjust the completed length. */
                drv = scsi_cmd_to_driver(cmd);
                if (drv->done)
                        good_bytes = drv->done(cmd);
                /*
                 * USB may not give sense identifying bad sector and
                 * simply return a residue instead, so subtract off the
                 * residue if drv->done() error processing indicates no
                 * change to the completion length.
                 */
                if (good_bytes == old_good_bytes)
                        good_bytes -= scsi_get_resid(cmd);
        }
        scsi_io_completion(cmd, good_bytes);
}


/*
 * 4096 is big enough for saturating fast SCSI LUNs.
 */
int scsi_device_max_queue_depth(struct scsi_device *sdev)
{
        return min_t(int, sdev->host->can_queue, 4096);
}

/**
 * scsi_change_queue_depth - change a device's queue depth
 * @sdev: SCSI Device in question
 * @depth: number of commands allowed to be queued to the driver
 *
 * Sets the device queue depth and returns the new value.
 */
int scsi_change_queue_depth(struct scsi_device *sdev, int depth)
{
        int max_depth = scsi_device_max_queue_depth(sdev);

        /* Never exceed what the device/host combination allows. */
        if (depth > max_depth)
                depth = max_depth;

        /* A non-positive request leaves the stored queue depth untouched. */
        if (depth > 0) {
                sdev->queue_depth = depth;
                /* NOTE(review): the pairing read barrier is not visible here. */
                wmb();
        }

        /* The block layer is told the (clamped) requested value as-is. */
        if (sdev->request_queue)
                blk_set_queue_depth(sdev->request_queue, depth);

        /* The budget map always follows the depth actually stored. */
        sbitmap_resize(&sdev->budget_map, sdev->queue_depth);

        return sdev->queue_depth;
}
EXPORT_SYMBOL(scsi_change_queue_depth);

/**
 * scsi_track_queue_full - track QUEUE_FULL events to adjust queue depth
 * @sdev: SCSI Device in question
 * @depth: Current number of outstanding SCSI commands on this device,
 *         not counting the one returned as QUEUE_FULL.
 *
 * Description:        Counts successive QUEUE_FULL events reported at the same
 *                 @depth on a specific SCSI device.  Once more than ten
 *                 such events have been counted, the queue depth of the
 *                 device is changed to @depth.  Events falling into the
 *                 same 16-jiffy window as the previously recorded one are
 *                 ignored, and a change of @depth restarts the count.
 *
 * Returns:        0 - No change made, otherwise the value returned by
 *                 scsi_change_queue_depth() for the new depth.
 *
 * Lock Status:        None held on entry
 *
 * Notes:        Low level drivers may call this at any time and we will do
 *                 "The Right Thing."  We are interrupt context safe.
 */
int scsi_track_queue_full(struct scsi_device *sdev, int depth)
{
        /*
         * QUEUE_FULLs arriving in the same 16-jiffy window as the last
         * recorded one are not counted, they could all be from the same
         * event.
         */
        if ((jiffies >> 4) == (sdev->last_queue_full_time >> 4))
                return 0;

        sdev->last_queue_full_time = jiffies;

        if (sdev->last_queue_full_depth == depth) {
                sdev->last_queue_full_count++;
        } else {
                /* Different depth than last time: start a new streak. */
                sdev->last_queue_full_count = 1;
                sdev->last_queue_full_depth = depth;
        }

        /* Only act once the streak is longer than ten events. */
        if (sdev->last_queue_full_count > 10)
                return scsi_change_queue_depth(sdev, depth);

        return 0;
}
EXPORT_SYMBOL(scsi_track_queue_full);

/**
 * scsi_vpd_inquiry - Request a device provide us with a VPD page
 * @sdev: The device to ask
 * @buffer: Where to put the result
 * @page: Which Vital Product Data to return
 * @len: The length of the buffer
 *
 * This is an internal helper function.  You probably want to use
 * scsi_get_vpd_page instead.
 *
 * Returns size of the vpd page on success or a negative error number.
 */
static int scsi_vpd_inquiry(struct scsi_device *sdev, unsigned char *buffer,
                                                        u8 page, unsigned len)
{
        unsigned char cmd[16];
        int page_len;

        /* The 4-byte page header must fit in the buffer. */
        if (len < 4)
                return -EINVAL;

        /* Build the 6-byte INQUIRY CDB; only bytes 0-5 are filled in. */
        cmd[0] = INQUIRY;
        cmd[1] = 1;                /* EVPD */
        cmd[2] = page;
        cmd[3] = len >> 8;         /* allocation length, high byte */
        cmd[4] = len & 0xff;       /* allocation length, low byte */
        cmd[5] = 0;                /* Control byte */

        /*
         * I'm not convinced we need to try quite this hard to get VPD, but
         * all the existing users tried this hard.
         */
        if (scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, len,
                             30 * HZ, 3, NULL))
                return -EIO;

        /* The device must answer with the page that was requested ... */
        if (buffer[1] != page)
                return -EIO;

        /* ... and with a non-zero page length. */
        page_len = get_unaligned_be16(&buffer[2]);
        if (page_len == 0)
                return -EIO;

        /* Reported length excludes the 4-byte header; add it back. */
        return page_len + 4;
}

/* Sizes used when probing VPD pages in scsi_get_vpd_size(). */
enum scsi_vpd_parameters {
        /* Bytes requested to read just the header of a VPD page. */
        SCSI_VPD_HEADER_SIZE = 4,
        /* Size of the on-stack buffer holding the page 0 (supported pages) reply. */
        SCSI_VPD_LIST_SIZE = 36,
};

/*
 * Work out how many bytes to request for VPD page @page of @sdev.
 *
 * Returns SCSI_DEFAULT_VPD_LEN for devices flagged no_vpd_size, the total
 * page size (header included) reported by the device otherwise, or 0 when
 * the page is not listed in page 0 or cannot be probed.
 */
static int scsi_get_vpd_size(struct scsi_device *sdev, u8 page)
{
        unsigned char vpd[SCSI_VPD_LIST_SIZE] __aligned(4);
        int len;

        if (sdev->no_vpd_size)
                return SCSI_DEFAULT_VPD_LEN;

        /*
         * For any page other than page 0 itself, first read the supported
         * pages list (page 0) and make sure the requested page number
         * appears in it.
         */
        if (page != 0) {
                len = scsi_vpd_inquiry(sdev, vpd, 0, sizeof(vpd));
                if (len < SCSI_VPD_HEADER_SIZE)
                        return 0;

                /* Only what fits in the local buffer can be searched. */
                if (len > sizeof(vpd)) {
                        dev_warn_once(&sdev->sdev_gendev,
                                      "%s: long VPD page 0 length: %d bytes\n",
                                      __func__, len);
                        len = sizeof(vpd);
                }

                if (!memchr(&vpd[SCSI_VPD_HEADER_SIZE], page,
                            len - SCSI_VPD_HEADER_SIZE))
                        return 0;
        }

        /*
         * Ask for the header of the page only, to learn how big the page
         * is. This is done to prevent problems on legacy devices which can
         * not handle allocation lengths as large as potentially requested
         * by the caller.
         */
        len = scsi_vpd_inquiry(sdev, vpd, page, SCSI_VPD_HEADER_SIZE);
        if (len < 0)
                return 0;

        if (len < SCSI_VPD_HEADER_SIZE) {
                dev_warn_once(&sdev->sdev_gendev,
                              "%s: short VPD page 0x%02x length: %d bytes\n",
                              __func__, page, len);
                return 0;
        }

        return len;
}

/**
 * scsi_get_vpd_page - Get Vital Product Data from a SCSI device
 * @sdev: The device to ask
 * @page: Which Vital Product Data to return
 * @buf: where to store the VPD
 * @buf_len: number of bytes in the VPD buffer area
 *
 * SCSI devices may optionally supply Vital Product Data.  Each 'page'
 * of VPD is defined in the appropriate SCSI document (eg SPC, SBC).
 * If the device supports this VPD page, this routine fills @buf
 * with the data from that page and return 0. If the VPD page is not
 * supported or its content cannot be retrieved, -EINVAL is returned.
 */
int scsi_get_vpd_page(struct scsi_device *sdev, u8 page, unsigned char *buf,
                      int buf_len)
{
        int xfer_len, page_len;

        if (!scsi_device_supports_vpd(sdev))
                return -EINVAL;

        xfer_len = scsi_get_vpd_size(sdev, page);
        if (xfer_len <= 0)
                return -EINVAL;

        /* Never transfer more than the caller's buffer can hold. */
        if (xfer_len > buf_len)
                xfer_len = buf_len;

        /*
         * Fetch the actual page. Since the appropriate size was reported
         * by the device it is now safe to ask for something bigger.
         * The whole caller buffer is cleared first, so bytes beyond the
         * transferred data read as zero.
         */
        memset(buf, 0, buf_len);
        page_len = scsi_vpd_inquiry(sdev, buf, page, xfer_len);
        if (page_len < 0)
                return -EINVAL;

        /* A page longer than what was transferred is only warned about. */
        if (page_len > xfer_len)
                dev_warn_once(&sdev->sdev_gendev,
                              "%s: VPD page 0x%02x result %d > %d bytes\n",
                              __func__, page, page_len, xfer_len);

        return 0;
}
EXPORT_SYMBOL_GPL(scsi_get_vpd_page);

/**
 * scsi_get_vpd_buf - Get Vital Product Data from a SCSI device
 * @sdev: The device to ask
 * @page: Which Vital Product Data to return
 *
 * Returns %NULL upon failure.
 */
static struct scsi_vpd *scsi_get_vpd_buf(struct scsi_device *sdev, u8 page)
{
        struct scsi_vpd *vpd;
        int alloc_len, page_len;

        alloc_len = scsi_get_vpd_size(sdev, page);
        if (alloc_len <= 0)
                return NULL;

        /*
         * Fetch the actual page. Since the appropriate size was reported
         * by the device it is now safe to ask for something bigger.
         *
         * If the device then reports a page larger than the buffer, the
         * buffer is dropped and the fetch is repeated with the newly
         * reported size.
         */
        for (;;) {
                vpd = kmalloc(sizeof(*vpd) + alloc_len, GFP_KERNEL);
                if (!vpd)
                        return NULL;

                page_len = scsi_vpd_inquiry(sdev, vpd->data, page, alloc_len);
                if (page_len < 0) {
                        kfree(vpd);
                        return NULL;
                }

                /* The whole page fits: done. */
                if (page_len <= alloc_len)
                        break;

                dev_warn_once(&sdev->sdev_gendev,
                              "%s: VPD page 0x%02x result %d > %d bytes\n",
                              __func__, page, page_len, alloc_len);
                alloc_len = page_len;
                kfree(vpd);
        }

        vpd->len = page_len;

        return vpd;
}

/*
 * Read VPD page @page from @sdev and publish it through @sdev_vpd_buf,
 * replacing whatever buffer was published before.  Nothing changes when
 * the page cannot be read.
 */
static void scsi_update_vpd_page(struct scsi_device *sdev, u8 page,
                                 struct scsi_vpd __rcu **sdev_vpd_buf)
{
        struct scsi_vpd *fresh, *stale;

        fresh = scsi_get_vpd_buf(sdev, page);
        if (!fresh)
                return;

        /* Swap the pointers under the inquiry mutex. */
        mutex_lock(&sdev->inquiry_mutex);
        stale = rcu_replace_pointer(*sdev_vpd_buf, fresh,
                                    lockdep_is_held(&sdev->inquiry_mutex));
        mutex_unlock(&sdev->inquiry_mutex);

        /* The previous buffer, if any, is freed through kfree_rcu(). */
        if (stale)
                kfree_rcu(stale, rcu);
}

/**
 * scsi_attach_vpd - Attach Vital Product Data to a SCSI device structure
 * @sdev: The device to ask
 *
 * Reads the list of supported VPD pages (page 0x00) and, for every listed
 * page that the SCSI device structure has a slot for, fetches the page and
 * attaches it: 0x00, 0x80 ('Unit Serial Number'), 0x83 ('Device
 * Identification'), 0x89, 0xb0, 0xb1, 0xb2 and 0xb7.  Among other things
 * this information can be used to identify the device uniquely.
 *
 * Does nothing when the device does not support VPD or the page list
 * cannot be read.
 */
void scsi_attach_vpd(struct scsi_device *sdev)
{
        struct scsi_vpd *supported;
        int idx;

        if (!scsi_device_supports_vpd(sdev))
                return;

        /* Ask for all the pages supported by this device */
        supported = scsi_get_vpd_buf(sdev, 0);
        if (!supported)
                return;

        /* Page codes start right after the 4-byte page header. */
        for (idx = 4; idx < supported->len; idx++) {
                switch (supported->data[idx]) {
                case 0x0:
                        scsi_update_vpd_page(sdev, 0x0, &sdev->vpd_pg0);
                        break;
                case 0x80:
                        scsi_update_vpd_page(sdev, 0x80, &sdev->vpd_pg80);
                        break;
                case 0x83:
                        scsi_update_vpd_page(sdev, 0x83, &sdev->vpd_pg83);
                        break;
                case 0x89:
                        scsi_update_vpd_page(sdev, 0x89, &sdev->vpd_pg89);
                        break;
                case 0xb0:
                        scsi_update_vpd_page(sdev, 0xb0, &sdev->vpd_pgb0);
                        break;
                case 0xb1:
                        scsi_update_vpd_page(sdev, 0xb1, &sdev->vpd_pgb1);
                        break;
                case 0xb2:
                        scsi_update_vpd_page(sdev, 0xb2, &sdev->vpd_pgb2);
                        break;
                case 0xb7:
                        scsi_update_vpd_page(sdev, 0xb7, &sdev->vpd_pgb7);
                        break;
                default:
                        /* Page not cached in the device structure. */
                        break;
                }
        }
        kfree(supported);
}

/**
 * scsi_report_opcode - Find out if a given command is supported
 * @sdev:        scsi device to query
 * @buffer:        scratch buffer (must be at least 20 bytes long)
 * @len:        length of buffer
 * @opcode:        opcode for the command to look up
 * @sa:                service action for the command to look up
 *
 * Uses the REPORT SUPPORTED OPERATION CODES to check support for the
 * command identified with @opcode and @sa. If the command does not
 * have a service action, @sa must be 0. Returns -EINVAL if RSOC fails,
 * 0 if the command is not supported and 1 if the device claims to
 * support the command.
 */
int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer,
                       unsigned int len, unsigned char opcode,
                       unsigned short sa)
{
        unsigned char cmd[16];
        struct scsi_sense_hdr sshdr;
        /*
         * Unsigned like @len: it is compared against @len, printed with %u
         * and handed on as a transfer length.
         */
        unsigned int request_len;
        int result;
        const struct scsi_exec_args exec_args = {
                .sshdr = &sshdr,
        };

        if (sdev->no_report_opcodes || sdev->scsi_level < SCSI_SPC_3)
                return -EINVAL;

        /* RSOC header + size of command we are asking about */
        request_len = 4 + COMMAND_SIZE(opcode);
        if (request_len > len) {
                dev_warn_once(&sdev->sdev_gendev,
                              "%s: len %u bytes, opcode 0x%02x needs %u\n",
                              __func__, len, opcode, request_len);
                return -EINVAL;
        }

        /* Build the MAINTENANCE IN / RSOC CDB. */
        memset(cmd, 0, sizeof(cmd));
        cmd[0] = MAINTENANCE_IN;
        cmd[1] = MI_REPORT_SUPPORTED_OPERATION_CODES;
        cmd[3] = opcode;
        if (!sa) {
                cmd[2] = 1;        /* One command format */
        } else {
                cmd[2] = 3;        /* One command format with service action */
                put_unaligned_be16(sa, &cmd[4]);
        }
        put_unaligned_be32(request_len, &cmd[6]);

        /* Clear the whole scratch buffer before the device writes to it. */
        memset(buffer, 0, len);

        result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer,
                                  request_len, 30 * HZ, 3, &exec_args);
        if (result < 0)
                return result;

        /*
         * ILLEGAL REQUEST with ASC 0x20 or 0x24 (ASCQ 0x00) is reported as
         * an RSOC failure.  Any other non-zero status falls through to the
         * buffer check below.
         */
        if (result && scsi_sense_valid(&sshdr) &&
            sshdr.sense_key == ILLEGAL_REQUEST &&
            (sshdr.asc == 0x20 || sshdr.asc == 0x24) && sshdr.ascq == 0x00)
                return -EINVAL;

        if ((buffer[1] & 3) == 3) /* Command supported */
                return 1;

        return 0;
}
EXPORT_SYMBOL(scsi_report_opcode);

/* Size of the scratch buffer used for the CDL support probes below. */
#define SCSI_CDL_CHECK_BUF_LEN        64

/*
 * Check whether the command identified by @opcode / @sa is supported by
 * @sdev with command duration limits.  @buf is a scratch buffer of
 * SCSI_CDL_CHECK_BUF_LEN bytes that receives the RSOC reply.
 */
static bool scsi_cdl_check_cmd(struct scsi_device *sdev, u8 opcode, u16 sa,
                               unsigned char *buf)
{
        u8 cdlp;

        /* Check operation code */
        if (scsi_report_opcode(sdev, buf, SCSI_CDL_CHECK_BUF_LEN,
                               opcode, sa) <= 0)
                return false;

        if ((buf[1] & 0x03) != 0x03)
                return false;

        /*
         * See SPC-6, One_command parameter data format for
         * REPORT SUPPORTED OPERATION CODES. The meaning of cdlp (bits 4:3
         * of buf[1]) depends on rwcdlp (buf[0] & 0x01):
         *  - rwcdlp == 0: cdlp == 1 indicates support for the A mode page
         *                 and cdlp == 2 for the B mode page.
         *  - rwcdlp == 1: cdlp == 1 indicates support for the T2A mode page
         *                 and cdlp == 2 for the T2B mode page.
         * Either way, command duration limits are supported exactly when
         * cdlp is 1 or 2, so rwcdlp does not need to be looked at.
         */
        cdlp = (buf[1] >> 3) & 0x03;

        switch (cdlp) {
        case 0x01:
        case 0x02:
                return true;
        default:
                return false;
        }
}

/**
 * scsi_cdl_check - Check if a SCSI device supports Command Duration Limits
 * @sdev: The device to check
 *
 * Updates sdev->cdl_supported. When support is found, 16-byte read/write
 * commands are forced and scsi_cdl_enable() is called with the current
 * sdev->cdl_enable value.
 */
void scsi_cdl_check(struct scsi_device *sdev)
{
        unsigned char *rsoc_buf;
        bool found;

        /*
         * CDL was introduced with SPC-5, so devices reporting an older SPC
         * version are skipped. This also keeps old drives from choking on
         * MAINTENANCE_IN / MI_REPORT_SUPPORTED_OPERATION_CODES issued with a
         * service action, which is what scsi_cdl_check_cmd() does.
         */
        if (sdev->scsi_level < SCSI_SPC_5)
                goto unsupported;

        rsoc_buf = kmalloc(SCSI_CDL_CHECK_BUF_LEN, GFP_KERNEL);
        if (!rsoc_buf)
                goto unsupported;

        /*
         * Probe READ_16, WRITE_16, READ_32 and WRITE_32 in that order and
         * stop at the first command that reports CDL support.
         */
        found = scsi_cdl_check_cmd(sdev, READ_16, 0, rsoc_buf);
        if (!found)
                found = scsi_cdl_check_cmd(sdev, WRITE_16, 0, rsoc_buf);
        if (!found)
                found = scsi_cdl_check_cmd(sdev, VARIABLE_LENGTH_CMD, READ_32,
                                           rsoc_buf);
        if (!found)
                found = scsi_cdl_check_cmd(sdev, VARIABLE_LENGTH_CMD, WRITE_32,
                                           rsoc_buf);

        if (!found) {
                sdev->cdl_supported = 0;
                goto free_buf;
        }

        /*
         * CDL is available: force READ16/WRITE16. READ32 and WRITE32 will be
         * used for devices that support the T10_PI_TYPE2_PROTECTION
         * protection type.
         */
        sdev->use_16_for_rw = 1;
        sdev->use_10_for_rw = 0;

        sdev->cdl_supported = 1;

        /*
         * Bring the drive feature status in line with the user controlled
         * cdl_enable state.
         */
        scsi_cdl_enable(sdev, sdev->cdl_enable);

free_buf:
        kfree(rsoc_buf);
        return;

unsupported:
        sdev->cdl_supported = 0;
}

/**
 * scsi_cdl_enable - Enable or disable Command Duration Limits support on a
 *                   SCSI device
 * @sdev: The target device
 * @enable: the target state
 *
 * For a device that has VPD page 89h attached (treated as an ATA device), the
 * ATA feature mode page is read with scsi_mode_sense(), byte 4 of the page is
 * set to 0x02 (enable) or 0 (disable) and the page is written back with
 * scsi_mode_select(). For any other device only sdev->cdl_enable is updated.
 *
 * Return: 0 on success, -EOPNOTSUPP if @sdev does not support CDL, -EINVAL if
 * the mode page cannot be read or the returned data does not fit the layout
 * expected here, or the non-zero scsi_mode_select() result if writing the page
 * back fails.
 */
int scsi_cdl_enable(struct scsi_device *sdev, bool enable)
{
        struct scsi_mode_data data;
        struct scsi_sense_hdr sshdr;
        struct scsi_vpd *vpd;
        bool is_ata = false;
        char buf[64];
        int ret;

        if (!sdev->cdl_supported)
                return -EOPNOTSUPP;

        /* Only the presence of the VPD page matters, not its content */
        rcu_read_lock();
        vpd = rcu_dereference(sdev->vpd_pg89);
        if (vpd)
                is_ata = true;
        rcu_read_unlock();

        /*
         * For ATA devices, CDL needs to be enabled with a SET FEATURES command.
         */
        if (is_ata) {
                size_t page_off;
                char *buf_data;
                int len;

                ret = scsi_mode_sense(sdev, 0x08, 0x0a, 0xf2, buf, sizeof(buf),
                                      5 * HZ, 3, &data, NULL);
                if (ret)
                        return -EINVAL;

                /*
                 * The header and block descriptor lengths as well as the
                 * total length are reported by the device. Make sure that
                 * byte 4 of the page, which is modified below, lies both
                 * inside buf and inside the reported data before touching
                 * it, so that a bogus response cannot cause an access
                 * outside of the on-stack buffer.
                 */
                page_off = data.header_length + data.block_descriptor_length;
                if (page_off + 5 > sizeof(buf) ||
                    data.length < page_off + 5)
                        return -EINVAL;

                /*
                 * Enable or disable CDL using the ATA feature page. Never
                 * send more bytes than what remains in buf after the page
                 * start.
                 */
                len = min_t(size_t, sizeof(buf) - page_off,
                            data.length - page_off);
                buf_data = buf + page_off;
                if (enable)
                        buf_data[4] = 0x02;
                else
                        buf_data[4] = 0;

                ret = scsi_mode_select(sdev, 1, 0, buf_data, len, 5 * HZ, 3,
                                       &data, &sshdr);
                if (ret) {
                        /* Positive result with valid sense: log the sense */
                        if (ret > 0 && scsi_sense_valid(&sshdr))
                                scsi_print_sense_hdr(sdev,
                                        dev_name(&sdev->sdev_gendev), &sshdr);
                        return ret;
                }
        }

        sdev->cdl_enable = enable;

        return 0;
}

/**
 * scsi_device_get  -  get an additional reference to a scsi_device
 * @sdev:        device to get a reference to
 *
 * Description: Takes a reference on the scsi_device and bumps the use count
 * of the underlying LLDD module.  The caller must hold host_lock of the
 * parent Scsi_Host or already own a reference.
 *
 * Return: 0 on success, -ENXIO if the device is in state SDEV_DEL or
 * SDEV_CANCEL, if the LLD module reference cannot be taken, or if
 * get_device() fails.
 */
int scsi_device_get(struct scsi_device *sdev)
{
        struct module *owner;

        if (sdev->sdev_state == SDEV_DEL || sdev->sdev_state == SDEV_CANCEL)
                return -ENXIO;

        owner = sdev->host->hostt->module;
        if (!try_module_get(owner))
                return -ENXIO;

        if (get_device(&sdev->sdev_gendev))
                return 0;

        /* No device reference obtained: give the module reference back */
        module_put(owner);
        return -ENXIO;
}
EXPORT_SYMBOL(scsi_device_get);

/**
 * scsi_device_put  -  release a reference to a scsi_device
 * @sdev:        device to release a reference on.
 *
 * Description: Drops a reference on the scsi_device and decrements the use
 * count of the underlying LLDD module.  The device is freed once the last
 * user vanishes.
 */
void scsi_device_put(struct scsi_device *sdev)
{
        struct module *lld_module;

        /*
         * Fetch the module pointer first: after put_device() the device may
         * already be gone, so @sdev must not be dereferenced any more.
         */
        lld_module = sdev->host->hostt->module;

        put_device(&sdev->sdev_gendev);
        module_put(lld_module);
}
EXPORT_SYMBOL(scsi_device_put);

/*
 * Helper for shost_for_each_device, see that for documentation.
 *
 * Returns the first device after @prev (or from the start of the host's
 * device list when @prev is NULL) on which scsi_device_get() succeeds, or
 * NULL when the end of the list is reached. The reference held on @prev, if
 * any, is dropped before returning.
 */
struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *shost,
                                           struct scsi_device *prev)
{
        struct list_head *head = &shost->__devices;
        struct scsi_device *found = NULL;
        struct list_head *pos;
        unsigned long flags;

        spin_lock_irqsave(shost->host_lock, flags);
        for (pos = prev ? prev->siblings.next : head->next; pos != head;
             pos = pos->next) {
                struct scsi_device *cand;

                cand = list_entry(pos, struct scsi_device, siblings);
                /* skip devices that we can't get a reference to */
                if (scsi_device_get(cand) == 0) {
                        found = cand;
                        break;
                }
        }
        spin_unlock_irqrestore(shost->host_lock, flags);

        if (prev)
                scsi_device_put(prev);
        return found;
}
EXPORT_SYMBOL(__scsi_iterate_devices);

/**
 * starget_for_each_device  -  helper to walk all devices of a target
 * @starget:        target whose devices we want to iterate over.
 * @data:        Opaque passed to each function call.
 * @fn:                Function to call on each device
 *
 * Walks every device of the host @starget belongs to with
 * shost_for_each_device() and calls @fn for each device whose channel and id
 * match those of @starget.
 */
void starget_for_each_device(struct scsi_target *starget, void *data,
                     void (*fn)(struct scsi_device *, void *))
{
        struct Scsi_Host *host = dev_to_shost(starget->dev.parent);
        struct scsi_device *cur;

        shost_for_each_device(cur, host) {
                /* Not on this target: skip */
                if (cur->channel != starget->channel)
                        continue;
                if (cur->id != starget->id)
                        continue;

                fn(cur, data);
        }
}
EXPORT_SYMBOL(starget_for_each_device);

/**
 * __starget_for_each_device - helper to walk all devices of a target (UNLOCKED)
 * @starget:        target whose devices we want to iterate over.
 * @data:        parameter for callback @fn()
 * @fn:                callback function that is invoked for each device
 *
 * Calls @fn for every device of @starget.  No reference is taken on the
 * scsi_device, so the caller must protect the whole loop with
 * shost->host_lock.
 *
 * Note:  Drivers should only need this when the device list has to be
 * walked in irq context.  In every other case use starget_for_each_device
 * instead.
 **/
void __starget_for_each_device(struct scsi_target *starget, void *data,
                               void (*fn)(struct scsi_device *, void *))
{
        struct Scsi_Host *host = dev_to_shost(starget->dev.parent);
        struct scsi_device *cur;

        __shost_for_each_device(cur, host) {
                /* Not on this target: skip */
                if (cur->channel != starget->channel)
                        continue;
                if (cur->id != starget->id)
                        continue;

                fn(cur, data);
        }
}
EXPORT_SYMBOL(__starget_for_each_device);

/**
 * __scsi_device_lookup_by_target - find a device given the target (UNLOCKED)
 * @starget:        SCSI target pointer
 * @lun:        SCSI Logical Unit Number
 *
 * Description: Searches the devices of @starget for the one with the given
 * @lun, ignoring devices in state SDEV_DEL.  No additional reference is taken
 * on the returned scsi_device: the caller must hold the host's host_lock
 * across this call and across any access to the result.
 *
 * Note:  Drivers should only need this when the device list has to be
 * accessed in irq context.  In every other case use
 * scsi_device_lookup_by_target instead.
 *
 * Return: the matching scsi_device, or NULL if there is none.
 **/
struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *starget,
                                                   u64 lun)
{
        struct scsi_device *cur;

        list_for_each_entry(cur, &starget->devices, same_target_siblings) {
                if (cur->sdev_state != SDEV_DEL && cur->lun == lun)
                        return cur;
        }

        return NULL;
}
EXPORT_SYMBOL(__scsi_device_lookup_by_target);

/**
 * scsi_device_lookup_by_target - find a device given the target
 * @starget:        SCSI target pointer
 * @lun:        SCSI Logical Unit Number
 *
 * Description: Searches the devices of @starget for the one with the given
 * @lun, under the host's host_lock.  On success the returned scsi_device
 * carries an additional reference that the caller must drop with
 * scsi_device_put.
 *
 * Return: the referenced scsi_device, or NULL if no device matches or no
 * reference could be taken on it.
 **/
struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *starget,
                                                 u64 lun)
{
        struct Scsi_Host *host = dev_to_shost(starget->dev.parent);
        struct scsi_device *result = NULL;
        struct scsi_device *match;
        unsigned long flags;

        spin_lock_irqsave(host->host_lock, flags);
        match = __scsi_device_lookup_by_target(starget, lun);
        /* Only hand out the device if a reference could be taken on it */
        if (match && scsi_device_get(match) == 0)
                result = match;
        spin_unlock_irqrestore(host->host_lock, flags);

        return result;
}
EXPORT_SYMBOL(scsi_device_lookup_by_target);

/**
 * __scsi_device_lookup - find a device given the host (UNLOCKED)
 * @shost:        SCSI host pointer
 * @channel:        SCSI channel (zero if only one channel)
 * @id:                SCSI target number (physical unit number)
 * @lun:        SCSI Logical Unit Number
 *
 * Description: Searches the device list of @shost for the device matching
 * @channel, @id and @lun, ignoring devices in state SDEV_DEL.  No additional
 * reference is taken on the returned scsi_device: the caller must hold the
 * host's host_lock across this call and across any access to the result.
 *
 * Note:  Drivers should only need this when the device list has to be
 * accessed in irq context.  In every other case use scsi_device_lookup
 * instead.
 *
 * Return: the matching scsi_device, or NULL if there is none.
 **/
struct scsi_device *__scsi_device_lookup(struct Scsi_Host *shost,
                uint channel, uint id, u64 lun)
{
        struct scsi_device *cur;

        list_for_each_entry(cur, &shost->__devices, siblings) {
                if (cur->sdev_state == SDEV_DEL)
                        continue;
                if (cur->channel != channel || cur->id != id ||
                    cur->lun != lun)
                        continue;

                return cur;
        }

        return NULL;
}
EXPORT_SYMBOL(__scsi_device_lookup);

/**
 * scsi_device_lookup - find a device given the host
 * @shost:        SCSI host pointer
 * @channel:        SCSI channel (zero if only one channel)
 * @id:                SCSI target number (physical unit number)
 * @lun:        SCSI Logical Unit Number
 *
 * Description: Searches the device list of @shost for the device matching
 * @channel, @id and @lun, under the host's host_lock.  On success the
 * returned scsi_device carries an additional reference that the caller must
 * drop with scsi_device_put.
 *
 * Return: the referenced scsi_device, or NULL if no device matches or no
 * reference could be taken on it.
 **/
struct scsi_device *scsi_device_lookup(struct Scsi_Host *shost,
                uint channel, uint id, u64 lun)
{
        struct scsi_device *result = NULL;
        struct scsi_device *match;
        unsigned long flags;

        spin_lock_irqsave(shost->host_lock, flags);
        match = __scsi_device_lookup(shost, channel, id, lun);
        /* Only hand out the device if a reference could be taken on it */
        if (match && scsi_device_get(match) == 0)
                result = match;
        spin_unlock_irqrestore(shost->host_lock, flags);

        return result;
}
EXPORT_SYMBOL(scsi_device_lookup);

MODULE_DESCRIPTION("SCSI core");
MODULE_LICENSE("GPL");

module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels");

/*
 * Bring up the SCSI core pieces in order: procfs, devinfo, hosts, sysctl,
 * sysfs, then netlink. If a step fails, the steps already completed are torn
 * down in reverse order and the error is returned.
 *
 * NOTE(review): the unwind path always ends with scsi_exit_queue() although
 * no matching init call is made in this function - presumably the resources
 * it releases are set up elsewhere; confirm.
 */
static int __init init_scsi(void)
{
        int ret;

        ret = scsi_init_procfs();
        if (ret)
                goto undo_queue;

        ret = scsi_init_devinfo();
        if (ret)
                goto undo_procfs;

        ret = scsi_init_hosts();
        if (ret)
                goto undo_devinfo;

        ret = scsi_init_sysctl();
        if (ret)
                goto undo_hosts;

        ret = scsi_sysfs_register();
        if (ret)
                goto undo_sysctl;

        scsi_netlink_init();

        printk(KERN_NOTICE "SCSI subsystem initialized\n");
        return 0;

        /* Error unwind: each label undoes the step that succeeded before it */
undo_sysctl:
        scsi_exit_sysctl();
undo_hosts:
        scsi_exit_hosts();
undo_devinfo:
        scsi_exit_devinfo();
undo_procfs:
        scsi_exit_procfs();
undo_queue:
        scsi_exit_queue();
        printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n",
               -ret);
        return ret;
}

/*
 * Module exit: tear down the SCSI core pieces in the reverse order of their
 * setup in init_scsi() (netlink, sysfs, sysctl, hosts, devinfo, procfs), then
 * call scsi_exit_queue().
 */
static void __exit exit_scsi(void)
{
        scsi_netlink_exit();
        scsi_sysfs_unregister();
        scsi_exit_sysctl();
        scsi_exit_hosts();
        scsi_exit_devinfo();
        scsi_exit_procfs();
        scsi_exit_queue();
}

subsys_initcall(init_scsi);
module_exit(exit_scsi);












































   14 

    1 




    2 








    1 

    3 



    1 


    3 

    2 
    1 


    1 

    3 




    1 








    1 
    2 

    2 
    4 

    2 

    1 
    2 

    2 

    4 


    3 
    4 
    1 





   14 
    3 
    6 


    1 

    4 



    2 

    1 
    2 

    2 

    2 


    3 

    2 
    2 

   32 
    9 

    8 

    6 





















    5 














   15 
   13 
    5 
    8 
    1 



    3 
    3 



    2 
    1 

    1 



    8 
    6 
    1 




























    9 
















    1 





























   17 












































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * Linux Security Module Hook declarations.
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2001 James Morris <jmorris@intercode.com.au>
 * Copyright (C) 2001 Silicon Graphics, Inc. (Trust Technology Group)
 * Copyright (C) 2015 Intel Corporation.
 * Copyright (C) 2015 Casey Schaufler <casey@schaufler-ca.com>
 * Copyright (C) 2016 Mellanox Techonologies
 * Copyright (C) 2020 Google LLC.
 */

/*
 * The macro LSM_HOOK is used to define the data structures required by
 * the LSM framework using the pattern:
 *
 *        LSM_HOOK(<return_type>, <default_value>, <hook_name>, args...)
 *
 * struct security_hook_heads {
 *   #define LSM_HOOK(RET, DEFAULT, NAME, ...) struct hlist_head NAME;
 *   #include <linux/lsm_hook_defs.h>
 *   #undef LSM_HOOK
 * };
 */
LSM_HOOK(int, 0, binder_set_context_mgr, const struct cred *mgr)
LSM_HOOK(int, 0, binder_transaction, const struct cred *from,
         const struct cred *to)
LSM_HOOK(int, 0, binder_transfer_binder, const struct cred *from,
         const struct cred *to)
LSM_HOOK(int, 0, binder_transfer_file, const struct cred *from,
         const struct cred *to, const struct file *file)
LSM_HOOK(int, 0, ptrace_access_check, struct task_struct *child,
         unsigned int mode)
LSM_HOOK(int, 0, ptrace_traceme, struct task_struct *parent)
LSM_HOOK(int, 0, capget, const struct task_struct *target, kernel_cap_t *effective,
         kernel_cap_t *inheritable, kernel_cap_t *permitted)
LSM_HOOK(int, 0, capset, struct cred *new, const struct cred *old,
         const kernel_cap_t *effective, const kernel_cap_t *inheritable,
         const kernel_cap_t *permitted)
LSM_HOOK(int, 0, capable, const struct cred *cred, struct user_namespace *ns,
         int cap, unsigned int opts)
LSM_HOOK(int, 0, quotactl, int cmds, int type, int id, const struct super_block *sb)
LSM_HOOK(int, 0, quota_on, struct dentry *dentry)
LSM_HOOK(int, 0, syslog, int type)
LSM_HOOK(int, 0, settime, const struct timespec64 *ts,
         const struct timezone *tz)
LSM_HOOK(int, 1, vm_enough_memory, struct mm_struct *mm, long pages)
LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm)
LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file)
LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, const struct linux_binprm *bprm)
LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, const struct linux_binprm *bprm)
LSM_HOOK(int, 0, fs_context_submount, struct fs_context *fc, struct super_block *reference)
LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc,
         struct fs_context *src_sc)
LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc,
         struct fs_parameter *param)
LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb)
LSM_HOOK(void, LSM_RET_VOID, sb_delete, struct super_block *sb)
LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb)
LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts)
LSM_HOOK(int, 0, sb_mnt_opts_compat, struct super_block *sb, void *mnt_opts)
LSM_HOOK(int, 0, sb_remount, struct super_block *sb, void *mnt_opts)
LSM_HOOK(int, 0, sb_kern_mount, const struct super_block *sb)
LSM_HOOK(int, 0, sb_show_options, struct seq_file *m, struct super_block *sb)
LSM_HOOK(int, 0, sb_statfs, struct dentry *dentry)
LSM_HOOK(int, 0, sb_mount, const char *dev_name, const struct path *path,
         const char *type, unsigned long flags, void *data)
LSM_HOOK(int, 0, sb_umount, struct vfsmount *mnt, int flags)
LSM_HOOK(int, 0, sb_pivotroot, const struct path *old_path,
         const struct path *new_path)
LSM_HOOK(int, 0, sb_set_mnt_opts, struct super_block *sb, void *mnt_opts,
         unsigned long kern_flags, unsigned long *set_kern_flags)
LSM_HOOK(int, 0, sb_clone_mnt_opts, const struct super_block *oldsb,
         struct super_block *newsb, unsigned long kern_flags,
         unsigned long *set_kern_flags)
LSM_HOOK(int, 0, move_mount, const struct path *from_path,
         const struct path *to_path)
LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry,
         int mode, const struct qstr *name, const char **xattr_name,
         void **ctx, u32 *ctxlen)
LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode,
         struct qstr *name, const struct cred *old, struct cred *new)

/*
 * Hooks below are only declared when CONFIG_SECURITY_PATH is defined. Every
 * hook in this group returns int with a default value of 0, except
 * path_post_mknod, which returns void.
 */
#ifdef CONFIG_SECURITY_PATH
LSM_HOOK(int, 0, path_unlink, const struct path *dir, struct dentry *dentry)
LSM_HOOK(int, 0, path_mkdir, const struct path *dir, struct dentry *dentry,
         umode_t mode)
LSM_HOOK(int, 0, path_rmdir, const struct path *dir, struct dentry *dentry)
LSM_HOOK(int, 0, path_mknod, const struct path *dir, struct dentry *dentry,
         umode_t mode, unsigned int dev)
LSM_HOOK(void, LSM_RET_VOID, path_post_mknod, struct mnt_idmap *idmap,
         struct dentry *dentry)
LSM_HOOK(int, 0, path_truncate, const struct path *path)
LSM_HOOK(int, 0, path_symlink, const struct path *dir, struct dentry *dentry,
         const char *old_name)
LSM_HOOK(int, 0, path_link, struct dentry *old_dentry,
         const struct path *new_dir, struct dentry *new_dentry)
LSM_HOOK(int, 0, path_rename, const struct path *old_dir,
         struct dentry *old_dentry, const struct path *new_dir,
         struct dentry *new_dentry, unsigned int flags)
LSM_HOOK(int, 0, path_chmod, const struct path *path, umode_t mode)
LSM_HOOK(int, 0, path_chown, const struct path *path, kuid_t uid, kgid_t gid)
LSM_HOOK(int, 0, path_chroot, const struct path *path)
#endif /* CONFIG_SECURITY_PATH */

/* Needed for inode based security check */
LSM_HOOK(int, 0, path_notify, const struct path *path, u64 mask,
         unsigned int obj_type)
LSM_HOOK(int, 0, inode_alloc_security, struct inode *inode)
LSM_HOOK(void, LSM_RET_VOID, inode_free_security, struct inode *inode)
LSM_HOOK(int, -EOPNOTSUPP, inode_init_security, struct inode *inode,
         struct inode *dir, const struct qstr *qstr, struct xattr *xattrs,
         int *xattr_count)
LSM_HOOK(int, 0, inode_init_security_anon, struct inode *inode,
         const struct qstr *name, const struct inode *context_inode)
LSM_HOOK(int, 0, inode_create, struct inode *dir, struct dentry *dentry,
         umode_t mode)
LSM_HOOK(void, LSM_RET_VOID, inode_post_create_tmpfile, struct mnt_idmap *idmap,
         struct inode *inode)
LSM_HOOK(int, 0, inode_link, struct dentry *old_dentry, struct inode *dir,
         struct dentry *new_dentry)
LSM_HOOK(int, 0, inode_unlink, struct inode *dir, struct dentry *dentry)
LSM_HOOK(int, 0, inode_symlink, struct inode *dir, struct dentry *dentry,
         const char *old_name)
LSM_HOOK(int, 0, inode_mkdir, struct inode *dir, struct dentry *dentry,
         umode_t mode)
LSM_HOOK(int, 0, inode_rmdir, struct inode *dir, struct dentry *dentry)
LSM_HOOK(int, 0, inode_mknod, struct inode *dir, struct dentry *dentry,
         umode_t mode, dev_t dev)
LSM_HOOK(int, 0, inode_rename, struct inode *old_dir, struct dentry *old_dentry,
         struct inode *new_dir, struct dentry *new_dentry)
LSM_HOOK(int, 0, inode_readlink, struct dentry *dentry)
LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode,
         bool rcu)
LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask)
LSM_HOOK(int, 0, inode_setattr, struct mnt_idmap *idmap, struct dentry *dentry,
         struct iattr *attr)
LSM_HOOK(void, LSM_RET_VOID, inode_post_setattr, struct mnt_idmap *idmap,
         struct dentry *dentry, int ia_valid)
LSM_HOOK(int, 0, inode_getattr, const struct path *path)
LSM_HOOK(int, 0, inode_setxattr, struct mnt_idmap *idmap,
         struct dentry *dentry, const char *name, const void *value,
         size_t size, int flags)
LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry,
         const char *name, const void *value, size_t size, int flags)
LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name)
LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry)
LSM_HOOK(int, 0, inode_removexattr, struct mnt_idmap *idmap,
         struct dentry *dentry, const char *name)
LSM_HOOK(void, LSM_RET_VOID, inode_post_removexattr, struct dentry *dentry,
         const char *name)
LSM_HOOK(int, 0, inode_set_acl, struct mnt_idmap *idmap,
         struct dentry *dentry, const char *acl_name, struct posix_acl *kacl)
LSM_HOOK(void, LSM_RET_VOID, inode_post_set_acl, struct dentry *dentry,
         const char *acl_name, struct posix_acl *kacl)
LSM_HOOK(int, 0, inode_get_acl, struct mnt_idmap *idmap,
         struct dentry *dentry, const char *acl_name)
LSM_HOOK(int, 0, inode_remove_acl, struct mnt_idmap *idmap,
         struct dentry *dentry, const char *acl_name)
LSM_HOOK(void, LSM_RET_VOID, inode_post_remove_acl, struct mnt_idmap *idmap,
         struct dentry *dentry, const char *acl_name)
LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry)
LSM_HOOK(int, 0, inode_killpriv, struct mnt_idmap *idmap,
         struct dentry *dentry)
LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct mnt_idmap *idmap,
         struct inode *inode, const char *name, void **buffer, bool alloc)
LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode,
         const char *name, const void *value, size_t size, int flags)
LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer,
         size_t buffer_size)
LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid)
LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new)
LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, struct dentry *src,
         const char *name)
LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir,
         struct kernfs_node *kn)
LSM_HOOK(int, 0, file_permission, struct file *file, int mask)
LSM_HOOK(int, 0, file_alloc_security, struct file *file)
LSM_HOOK(void, LSM_RET_VOID, file_release, struct file *file)
LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file)
LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd,
         unsigned long arg)
LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd,
         unsigned long arg)
LSM_HOOK(int, 0, mmap_addr, unsigned long addr)
LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot,
         unsigned long prot, unsigned long flags)
LSM_HOOK(int, 0, file_mprotect, struct vm_area_struct *vma,
         unsigned long reqprot, unsigned long prot)
LSM_HOOK(int, 0, file_lock, struct file *file, unsigned int cmd)
LSM_HOOK(int, 0, file_fcntl, struct file *file, unsigned int cmd,
         unsigned long arg)
LSM_HOOK(void, LSM_RET_VOID, file_set_fowner, struct file *file)
LSM_HOOK(int, 0, file_send_sigiotask, struct task_struct *tsk,
         struct fown_struct *fown, int sig)
LSM_HOOK(int, 0, file_receive, struct file *file)
LSM_HOOK(int, 0, file_open, struct file *file)
LSM_HOOK(int, 0, file_post_open, struct file *file, int mask)
LSM_HOOK(int, 0, file_truncate, struct file *file)
LSM_HOOK(int, 0, task_alloc, struct task_struct *task,
         unsigned long clone_flags)
LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task)
LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp)
LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred)
LSM_HOOK(int, 0, cred_prepare, struct cred *new, const struct cred *old,
         gfp_t gfp)
LSM_HOOK(void, LSM_RET_VOID, cred_transfer, struct cred *new,
         const struct cred *old)
LSM_HOOK(void, LSM_RET_VOID, cred_getsecid, const struct cred *c, u32 *secid)
LSM_HOOK(int, 0, kernel_act_as, struct cred *new, u32 secid)
LSM_HOOK(int, 0, kernel_create_files_as, struct cred *new, struct inode *inode)
LSM_HOOK(int, 0, kernel_module_request, char *kmod_name)
LSM_HOOK(int, 0, kernel_load_data, enum kernel_load_data_id id, bool contents)
LSM_HOOK(int, 0, kernel_post_load_data, char *buf, loff_t size,
         enum kernel_load_data_id id, char *description)
LSM_HOOK(int, 0, kernel_read_file, struct file *file,
         enum kernel_read_file_id id, bool contents)
LSM_HOOK(int, 0, kernel_post_read_file, struct file *file, char *buf,
         loff_t size, enum kernel_read_file_id id)
LSM_HOOK(int, 0, task_fix_setuid, struct cred *new, const struct cred *old,
         int flags)
LSM_HOOK(int, 0, task_fix_setgid, struct cred *new, const struct cred * old,
         int flags)
LSM_HOOK(int, 0, task_fix_setgroups, struct cred *new, const struct cred * old)
LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid)
LSM_HOOK(int, 0, task_getpgid, struct task_struct *p)
LSM_HOOK(int, 0, task_getsid, struct task_struct *p)
LSM_HOOK(void, LSM_RET_VOID, current_getsecid_subj, u32 *secid)
LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj,
         struct task_struct *p, u32 *secid)
LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice)
LSM_HOOK(int, 0, task_setioprio, struct task_struct *p, int ioprio)
LSM_HOOK(int, 0, task_getioprio, struct task_struct *p)
LSM_HOOK(int, 0, task_prlimit, const struct cred *cred,
         const struct cred *tcred, unsigned int flags)
LSM_HOOK(int, 0, task_setrlimit, struct task_struct *p, unsigned int resource,
         struct rlimit *new_rlim)
LSM_HOOK(int, 0, task_setscheduler, struct task_struct *p)
LSM_HOOK(int, 0, task_getscheduler, struct task_struct *p)
LSM_HOOK(int, 0, task_movememory, struct task_struct *p)
LSM_HOOK(int, 0, task_kill, struct task_struct *p, struct kernel_siginfo *info,
         int sig, const struct cred *cred)
LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2,
         unsigned long arg3, unsigned long arg4, unsigned long arg5)
LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p,
         struct inode *inode)
LSM_HOOK(int, 0, userns_create, const struct cred *cred)
LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag)
LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp,
         u32 *secid)
LSM_HOOK(int, 0, msg_msg_alloc_security, struct msg_msg *msg)
LSM_HOOK(void, LSM_RET_VOID, msg_msg_free_security, struct msg_msg *msg)
LSM_HOOK(int, 0, msg_queue_alloc_security, struct kern_ipc_perm *perm)
LSM_HOOK(void, LSM_RET_VOID, msg_queue_free_security,
         struct kern_ipc_perm *perm)
LSM_HOOK(int, 0, msg_queue_associate, struct kern_ipc_perm *perm, int msqflg)
LSM_HOOK(int, 0, msg_queue_msgctl, struct kern_ipc_perm *perm, int cmd)
LSM_HOOK(int, 0, msg_queue_msgsnd, struct kern_ipc_perm *perm,
         struct msg_msg *msg, int msqflg)
LSM_HOOK(int, 0, msg_queue_msgrcv, struct kern_ipc_perm *perm,
         struct msg_msg *msg, struct task_struct *target, long type, int mode)
LSM_HOOK(int, 0, shm_alloc_security, struct kern_ipc_perm *perm)
LSM_HOOK(void, LSM_RET_VOID, shm_free_security, struct kern_ipc_perm *perm)
LSM_HOOK(int, 0, shm_associate, struct kern_ipc_perm *perm, int shmflg)
LSM_HOOK(int, 0, shm_shmctl, struct kern_ipc_perm *perm, int cmd)
LSM_HOOK(int, 0, shm_shmat, struct kern_ipc_perm *perm, char __user *shmaddr,
         int shmflg)
LSM_HOOK(int, 0, sem_alloc_security, struct kern_ipc_perm *perm)
LSM_HOOK(void, LSM_RET_VOID, sem_free_security, struct kern_ipc_perm *perm)
LSM_HOOK(int, 0, sem_associate, struct kern_ipc_perm *perm, int semflg)
LSM_HOOK(int, 0, sem_semctl, struct kern_ipc_perm *perm, int cmd)
LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops,
         unsigned nsops, int alter)
LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb)
LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry,
         struct inode *inode)
LSM_HOOK(int, -EOPNOTSUPP, getselfattr, unsigned int attr,
         struct lsm_ctx __user *ctx, u32 *size, u32 flags)
LSM_HOOK(int, -EOPNOTSUPP, setselfattr, unsigned int attr,
         struct lsm_ctx *ctx, u32 size, u32 flags)
LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name,
         char **value)
LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size)
LSM_HOOK(int, 0, ismaclabel, const char *name)
LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata,
         u32 *seclen)
LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid)
LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen)
LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode)
LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen)
LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen)
LSM_HOOK(int, -EOPNOTSUPP, inode_getsecctx, struct inode *inode, void **ctx,
         u32 *ctxlen)

/* Watch-queue notification hook; present only with CONFIG_WATCH_QUEUE. */
#if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE)
LSM_HOOK(int, 0, post_notification, const struct cred *w_cred,
         const struct cred *cred, struct watch_notification *n)
#endif /* CONFIG_SECURITY && CONFIG_WATCH_QUEUE */

/* Key watch hook; present only with CONFIG_KEY_NOTIFICATIONS. */
#if defined(CONFIG_SECURITY) && defined(CONFIG_KEY_NOTIFICATIONS)
LSM_HOOK(int, 0, watch_key, struct key *key)
#endif /* CONFIG_SECURITY && CONFIG_KEY_NOTIFICATIONS */

/*
 * Networking hooks, compiled in only with CONFIG_SECURITY_NETWORK: unix
 * sockets, generic socket operations, struct sock lifetime, inet connection
 * setup, secmark, TUN devices, SCTP and MPTCP.
 */
#ifdef CONFIG_SECURITY_NETWORK
LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other,
         struct sock *newsk)
LSM_HOOK(int, 0, unix_may_send, struct socket *sock, struct socket *other)
LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern)
LSM_HOOK(int, 0, socket_post_create, struct socket *sock, int family, int type,
         int protocol, int kern)
LSM_HOOK(int, 0, socket_socketpair, struct socket *socka, struct socket *sockb)
LSM_HOOK(int, 0, socket_bind, struct socket *sock, struct sockaddr *address,
         int addrlen)
LSM_HOOK(int, 0, socket_connect, struct socket *sock, struct sockaddr *address,
         int addrlen)
LSM_HOOK(int, 0, socket_listen, struct socket *sock, int backlog)
LSM_HOOK(int, 0, socket_accept, struct socket *sock, struct socket *newsock)
LSM_HOOK(int, 0, socket_sendmsg, struct socket *sock, struct msghdr *msg,
         int size)
LSM_HOOK(int, 0, socket_recvmsg, struct socket *sock, struct msghdr *msg,
         int size, int flags)
LSM_HOOK(int, 0, socket_getsockname, struct socket *sock)
LSM_HOOK(int, 0, socket_getpeername, struct socket *sock)
LSM_HOOK(int, 0, socket_getsockopt, struct socket *sock, int level, int optname)
LSM_HOOK(int, 0, socket_setsockopt, struct socket *sock, int level, int optname)
LSM_HOOK(int, 0, socket_shutdown, struct socket *sock, int how)
LSM_HOOK(int, 0, socket_sock_rcv_skb, struct sock *sk, struct sk_buff *skb)
/* The two getpeersec hooks use -ENOPROTOOPT as their second argument. */
LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_stream, struct socket *sock,
         sockptr_t optval, sockptr_t optlen, unsigned int len)
LSM_HOOK(int, -ENOPROTOOPT, socket_getpeersec_dgram, struct socket *sock,
         struct sk_buff *skb, u32 *secid)
LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority)
LSM_HOOK(void, LSM_RET_VOID, sk_free_security, struct sock *sk)
LSM_HOOK(void, LSM_RET_VOID, sk_clone_security, const struct sock *sk,
         struct sock *newsk)
LSM_HOOK(void, LSM_RET_VOID, sk_getsecid, const struct sock *sk, u32 *secid)
LSM_HOOK(void, LSM_RET_VOID, sock_graft, struct sock *sk, struct socket *parent)
LSM_HOOK(int, 0, inet_conn_request, const struct sock *sk, struct sk_buff *skb,
         struct request_sock *req)
LSM_HOOK(void, LSM_RET_VOID, inet_csk_clone, struct sock *newsk,
         const struct request_sock *req)
LSM_HOOK(void, LSM_RET_VOID, inet_conn_established, struct sock *sk,
         struct sk_buff *skb)
LSM_HOOK(int, 0, secmark_relabel_packet, u32 secid)
LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_inc, void)
LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_dec, void)
LSM_HOOK(void, LSM_RET_VOID, req_classify_flow, const struct request_sock *req,
         struct flowi_common *flic)
LSM_HOOK(int, 0, tun_dev_alloc_security, void **security)
LSM_HOOK(void, LSM_RET_VOID, tun_dev_free_security, void *security)
LSM_HOOK(int, 0, tun_dev_create, void)
LSM_HOOK(int, 0, tun_dev_attach_queue, void *security)
LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security)
LSM_HOOK(int, 0, tun_dev_open, void *security)
LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_association *asoc,
         struct sk_buff *skb)
LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname,
         struct sockaddr *address, int addrlen)
LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc,
         struct sock *sk, struct sock *newsk)
LSM_HOOK(int, 0, sctp_assoc_established, struct sctp_association *asoc,
         struct sk_buff *skb)
LSM_HOOK(int, 0, mptcp_add_subflow, struct sock *sk, struct sock *ssk)
#endif /* CONFIG_SECURITY_NETWORK */

/*
 * InfiniBand hooks, compiled in only with CONFIG_SECURITY_INFINIBAND. The
 * security blob is passed around as an opaque void pointer.
 */
#ifdef CONFIG_SECURITY_INFINIBAND
LSM_HOOK(int, 0, ib_pkey_access, void *sec, u64 subnet_prefix, u16 pkey)
LSM_HOOK(int, 0, ib_endport_manage_subnet, void *sec, const char *dev_name,
         u8 port_num)
LSM_HOOK(int, 0, ib_alloc_security, void **sec)
LSM_HOOK(void, LSM_RET_VOID, ib_free_security, void *sec)
#endif /* CONFIG_SECURITY_INFINIBAND */

/*
 * XFRM policy/state hooks, compiled in only with CONFIG_SECURITY_NETWORK_XFRM.
 * Note that xfrm_state_pol_flow_match is the one entry in this group whose
 * second argument is 1 rather than 0.
 */
#ifdef CONFIG_SECURITY_NETWORK_XFRM
LSM_HOOK(int, 0, xfrm_policy_alloc_security, struct xfrm_sec_ctx **ctxp,
         struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp)
LSM_HOOK(int, 0, xfrm_policy_clone_security, struct xfrm_sec_ctx *old_ctx,
         struct xfrm_sec_ctx **new_ctx)
LSM_HOOK(void, LSM_RET_VOID, xfrm_policy_free_security,
         struct xfrm_sec_ctx *ctx)
LSM_HOOK(int, 0, xfrm_policy_delete_security, struct xfrm_sec_ctx *ctx)
LSM_HOOK(int, 0, xfrm_state_alloc, struct xfrm_state *x,
         struct xfrm_user_sec_ctx *sec_ctx)
LSM_HOOK(int, 0, xfrm_state_alloc_acquire, struct xfrm_state *x,
         struct xfrm_sec_ctx *polsec, u32 secid)
LSM_HOOK(void, LSM_RET_VOID, xfrm_state_free_security, struct xfrm_state *x)
LSM_HOOK(int, 0, xfrm_state_delete_security, struct xfrm_state *x)
LSM_HOOK(int, 0, xfrm_policy_lookup, struct xfrm_sec_ctx *ctx, u32 fl_secid)
LSM_HOOK(int, 1, xfrm_state_pol_flow_match, struct xfrm_state *x,
         struct xfrm_policy *xp, const struct flowi_common *flic)
LSM_HOOK(int, 0, xfrm_decode_session, struct sk_buff *skb, u32 *secid,
         int ckall)
#endif /* CONFIG_SECURITY_NETWORK_XFRM */

/*
 * Key management security hooks, compiled in only with CONFIG_KEYS. They
 * cover key allocation/free, permission checks, reading a key's security
 * label, and a notification after a key is created or updated.
 */
#ifdef CONFIG_KEYS
LSM_HOOK(int, 0, key_alloc, struct key *key, const struct cred *cred,
         unsigned long flags)
LSM_HOOK(void, LSM_RET_VOID, key_free, struct key *key)
LSM_HOOK(int, 0, key_permission, key_ref_t key_ref, const struct cred *cred,
         enum key_need_perm need_perm)
LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **buffer)
LSM_HOOK(void, LSM_RET_VOID, key_post_create_or_update, struct key *keyring,
         struct key *key, const void *payload, size_t payload_len,
         unsigned long flags, bool create)
#endif /* CONFIG_KEYS */

/*
 * Audit rule hooks, compiled in only with CONFIG_AUDIT. The LSM-specific rule
 * is carried as an opaque void pointer (lsmrule) between init, match and free.
 */
#ifdef CONFIG_AUDIT
LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr,
         void **lsmrule, gfp_t gfp)
LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule)
LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule)
LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule)
#endif /* CONFIG_AUDIT */

/*
 * BPF hooks, compiled in only with CONFIG_BPF_SYSCALL: the bpf() command
 * itself, map and program access, and create/load/free hooks for maps,
 * programs and tokens.
 */
#ifdef CONFIG_BPF_SYSCALL
LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size)
LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode)
LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog)
LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr,
         struct bpf_token *token)
LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map)
LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr,
         struct bpf_token *token)
LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog)
LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr,
         struct path *path)
LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token)
LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd)
LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap)
#endif /* CONFIG_BPF_SYSCALL */

/* Lockdown query hook; not wrapped in any conditional visible here. */
LSM_HOOK(int, 0, locked_down, enum lockdown_reason what)

/* perf event hooks, compiled in only with CONFIG_PERF_EVENTS. */
#ifdef CONFIG_PERF_EVENTS
LSM_HOOK(int, 0, perf_event_open, struct perf_event_attr *attr, int type)
LSM_HOOK(int, 0, perf_event_alloc, struct perf_event *event)
LSM_HOOK(void, LSM_RET_VOID, perf_event_free, struct perf_event *event)
LSM_HOOK(int, 0, perf_event_read, struct perf_event *event)
LSM_HOOK(int, 0, perf_event_write, struct perf_event *event)
#endif /* CONFIG_PERF_EVENTS */

/* io_uring hooks, compiled in only with CONFIG_IO_URING. */
#ifdef CONFIG_IO_URING
LSM_HOOK(int, 0, uring_override_creds, const struct cred *new)
LSM_HOOK(int, 0, uring_sqpoll, void)
LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd)
#endif /* CONFIG_IO_URING */











































































































   12 
    3 
   10 



    3 

   10 


















   12 















   12 




   11 


































































   12 




    1 




    1 






























































































































































































































































































































































































































































    3 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
// SPDX-License-Identifier: GPL-2.0
/*
 * Tag allocation using scalable bitmaps. Uses active queue tracking to support
 * fairer distribution of tags between multiple submitters when a shared tag map
 * is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"

/*
 * Recompute the wakeup batch of both the normal and the reserved tag bitmap
 * for @users active sharers. A zero @users leaves both bitmaps untouched.
 */
static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
                unsigned int users)
{
        if (users) {
                sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, users);
                sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
                                                     users);
        }
}

/*
 * If a previously inactive queue goes active, bump the active user count.
 * We need to do this before try to allocate driver tag, then even if fail
 * to get tag when first time, the other shared-tag users could reserve
 * budget for it.
 */
void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
        unsigned int users;
        struct blk_mq_tags *tags = hctx->tags;

        /*
         * calling test_bit() prior to test_and_set_bit() is intentional,
         * it avoids dirtying the cacheline if the queue is already active.
         */
        if (blk_mq_is_shared_tags(hctx->flags)) {
                struct request_queue *q = hctx->queue;

                if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
                    test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
                        return;
        } else {
                if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
                    test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                        return;
        }

        spin_lock_irq(&tags->lock);
        users = tags->active_queues + 1;
        WRITE_ONCE(tags->active_queues, users);
        blk_mq_update_wake_batch(tags, users);
        spin_unlock_irq(&tags->lock);
}

/*
 * Wake every waiter on the normal tag bitmap and, when @include_reserve is
 * true, every waiter on the reserved tag bitmap as well.
 */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
        sbitmap_queue_wake_all(&tags->bitmap_tags);
        if (!include_reserve)
                return;
        sbitmap_queue_wake_all(&tags->breserved_tags);
}

/*
 * If a previously busy queue goes inactive, potential waiters could now
 * be allowed to queue. Wake them up and check.
 */
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
        struct blk_mq_tags *tags = hctx->tags;
        unsigned int users;

        if (blk_mq_is_shared_tags(hctx->flags)) {
                struct request_queue *q = hctx->queue;

                if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
                                        &q->queue_flags))
                        return;
        } else {
                if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                        return;
        }

        spin_lock_irq(&tags->lock);
        users = tags->active_queues - 1;
        WRITE_ONCE(tags->active_queues, users);
        blk_mq_update_wake_batch(tags, users);
        spin_unlock_irq(&tags->lock);

        blk_mq_tag_wakeup_all(tags, false);
}

static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                            struct sbitmap_queue *bt)
{
        if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
                        !hctx_may_queue(data->hctx, bt))
                return BLK_MQ_NO_TAG;

        if (data->shallow_depth)
                return sbitmap_queue_get_shallow(bt, data->shallow_depth);
        else
                return __sbitmap_queue_get(bt);
}

/*
 * Try to grab up to @nr_tags tags from the normal bitmap in one go.
 *
 * Batch allocation is not attempted (0 is returned and *@offset is left
 * alone) when a shallow depth is in effect, when a reserved tag is requested,
 * or when the hctx has BLK_MQ_F_TAG_QUEUE_SHARED set. Otherwise the value
 * returned by __sbitmap_queue_get_batch() is passed through and *@offset is
 * advanced by the number of reserved tags.
 */
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
                              unsigned int *offset)
{
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        unsigned long mask;

        if (data->shallow_depth)
                return 0;
        if (data->flags & BLK_MQ_REQ_RESERVED)
                return 0;
        if (data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
                return 0;

        mask = __sbitmap_queue_get_batch(&tags->bitmap_tags, nr_tags, offset);
        *offset += tags->nr_reserved_tags;
        return mask;
}

/*
 * Allocate a single tag for @data.
 *
 * Reserved requests (BLK_MQ_REQ_RESERVED) draw from ->breserved_tags and the
 * bit number is returned as-is; all other requests draw from ->bitmap_tags
 * and the bit number is returned offset by ->nr_reserved_tags. If no tag is
 * free and BLK_MQ_REQ_NOWAIT is not set, the caller sleeps in
 * TASK_UNINTERRUPTIBLE until a tag can be obtained.
 *
 * Returns the tag, or BLK_MQ_NO_TAG when no tag was available and the caller
 * asked not to wait, when a reserved tag is requested from a map that has no
 * reserved tags, or when the hardware queue is found to be inactive.
 */
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct sbitmap_queue *bt;
        struct sbq_wait_state *ws;
        DEFINE_SBQ_WAIT(wait);
        unsigned int tag_offset;
        int tag;

        /* Pick the bitmap to allocate from and the offset of its bit 0. */
        if (data->flags & BLK_MQ_REQ_RESERVED) {
                if (unlikely(!tags->nr_reserved_tags)) {
                        WARN_ON_ONCE(1);
                        return BLK_MQ_NO_TAG;
                }
                bt = &tags->breserved_tags;
                tag_offset = 0;
        } else {
                bt = &tags->bitmap_tags;
                tag_offset = tags->nr_reserved_tags;
        }

        /* Fast path: a tag is available right away. */
        tag = __blk_mq_get_tag(data, bt);
        if (tag != BLK_MQ_NO_TAG)
                goto found_tag;

        if (data->flags & BLK_MQ_REQ_NOWAIT)
                return BLK_MQ_NO_TAG;

        ws = bt_wait_ptr(bt, data->hctx);
        do {
                struct sbitmap_queue *bt_prev;

                /*
                 * We're out of tags on this hardware queue, kick any
                 * pending IO submits before going to sleep waiting for
                 * some to complete.
                 */
                blk_mq_run_hw_queue(data->hctx, false);

                /*
                 * Retry tag allocation after running the hardware queue,
                 * as running the queue may also have found completions.
                 */
                tag = __blk_mq_get_tag(data, bt);
                if (tag != BLK_MQ_NO_TAG)
                        break;

                sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);

                /* Try once more now that we are registered as a waiter. */
                tag = __blk_mq_get_tag(data, bt);
                if (tag != BLK_MQ_NO_TAG)
                        break;

                bt_prev = bt;
                io_schedule();

                sbitmap_finish_wait(bt, ws, &wait);

                /*
                 * Look up ctx, hctx, tags and the bitmap again after
                 * sleeping; they may differ from the ones used before.
                 */
                data->ctx = blk_mq_get_ctx(data->q);
                data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
                                                data->ctx);
                tags = blk_mq_tags_from_data(data);
                if (data->flags & BLK_MQ_REQ_RESERVED)
                        bt = &tags->breserved_tags;
                else
                        bt = &tags->bitmap_tags;

                /*
                 * If destination hw queue is changed, fake wake up on
                 * previous queue for compensating the wake up miss, so
                 * other allocations on previous queue won't be starved.
                 */
                if (bt != bt_prev)
                        sbitmap_queue_wake_up(bt_prev, 1);

                ws = bt_wait_ptr(bt, data->hctx);
        } while (1);

        /* Reached only via break, i.e. with a tag in hand. */
        sbitmap_finish_wait(bt, ws, &wait);

found_tag:
        /*
         * Give up this allocation if the hctx is inactive.  The caller will
         * retry on an active hctx.
         */
        if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
                blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
                return BLK_MQ_NO_TAG;
        }
        return tag + tag_offset;
}

/*
 * Release @tag back to the bitmap it came from. Reserved tags are cleared in
 * ->breserved_tags using @tag directly; all other tags are first converted
 * back to a bit number by subtracting ->nr_reserved_tags.
 */
void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
                    unsigned int tag)
{
        if (blk_mq_tag_is_reserved(tags, tag)) {
                sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
        } else {
                const int bit = tag - tags->nr_reserved_tags;

                BUG_ON(bit >= tags->nr_tags);
                sbitmap_queue_clear(&tags->bitmap_tags, bit, ctx->cpu);
        }
}

/*
 * Release the @nr_tags tags listed in @tag_array back to the normal bitmap in
 * one batch; ->nr_reserved_tags is handed to the sbitmap layer as the offset.
 */
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
{
        struct sbitmap_queue *bt = &tags->bitmap_tags;

        sbitmap_queue_clear_batch(bt, tags->nr_reserved_tags, tag_array,
                                  nr_tags);
}

/* Per-walk context handed to bt_iter() through sbitmap_for_each_set(). */
struct bt_iter_data {
        /* When non-NULL, only requests whose ->mq_hctx matches are visited. */
        struct blk_mq_hw_ctx *hctx;
        /* Only requests whose ->q matches are visited. */
        struct request_queue *q;
        /* Callback invoked as fn(rq, data) for each matching request. */
        busy_tag_iter_fn *fn;
        /* Opaque pointer passed through to fn. */
        void *data;
        /* True when walking the reserved bitmap (bit numbers are not offset). */
        bool reserved;
};

/*
 * Look up the request stored in tags->rqs[@bitnr] under tags->lock and take a
 * reference on it. Returns NULL when the slot is empty, when the request's
 * ->tag does not equal @bitnr, or when no reference could be obtained.
 */
static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
                unsigned int bitnr)
{
        struct request *found = NULL;
        struct request *candidate;
        unsigned long flags;

        spin_lock_irqsave(&tags->lock, flags);
        candidate = tags->rqs[bitnr];
        if (candidate && candidate->tag == bitnr &&
            req_ref_inc_not_zero(candidate))
                found = candidate;
        spin_unlock_irqrestore(&tags->lock, flags);

        return found;
}

/*
 * sbitmap_for_each_set() callback used by bt_for_each(). Translates @bitnr
 * into a tag, grabs a reference on the request behind it and calls the user
 * callback if the request belongs to the queue (and, when one was given, the
 * hardware queue) recorded in the bt_iter_data passed as @data.
 *
 * Returns the user callback's result, or true if it was not called.
 */
static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
        struct bt_iter_data *iter_data = data;
        struct blk_mq_hw_ctx *hctx = iter_data->hctx;
        struct request_queue *q = iter_data->q;
        struct blk_mq_tags *tags;
        struct request *rq;
        bool keep_going = true;

        /* hctx may be NULL for shared tags, so only touch it otherwise. */
        tags = blk_mq_is_shared_tags(q->tag_set->flags) ?
                        q->tag_set->shared_tags : hctx->tags;

        if (!iter_data->reserved)
                bitnr += tags->nr_reserved_tags;

        /*
         * A NULL request is possible here: the tagging code sets the bit
         * before it assigns ->rqs[].
         */
        rq = blk_mq_find_and_get_req(tags, bitnr);
        if (!rq)
                return true;

        if (rq->q == q) {
                if (!hctx || rq->mq_hctx == hctx)
                        keep_going = iter_data->fn(rq, iter_data->data);
        }
        blk_mq_put_rq_ref(rq);

        return keep_going;
}

/**
 * bt_for_each - iterate over the requests associated with a hardware queue
 * @hctx:        Hardware queue to examine; NULL disables the per-hctx filter.
 * @q:                Request queue to examine.
 * @bt:                sbitmap to examine. This is either the breserved_tags member
 *                or the bitmap_tags member of struct blk_mq_tags.
 * @fn:                Callback invoked as @fn(rq, @data) for each request found via
 *                a set bit in @bt that belongs to @q (and to @hctx, if given).
 *                Its return value is handed back to sbitmap_for_each_set():
 *                true to continue iterating tags, false to stop.
 * @data:        Will be passed as second argument to @fn.
 * @reserved:        Indicates whether @bt is the breserved_tags member or the
 *                bitmap_tags member of struct blk_mq_tags.
 */
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q,
                        struct sbitmap_queue *bt, busy_tag_iter_fn *fn,
                        void *data, bool reserved)
{
        struct bt_iter_data walk = {
                .q = q,
                .hctx = hctx,
                .reserved = reserved,
                .fn = fn,
                .data = data,
        };

        sbitmap_for_each_set(&bt->sb, bt_iter, &walk);
}

/* Per-walk context handed to bt_tags_iter() through sbitmap_for_each_set(). */
struct bt_tags_iter_data {
        /* Tag map whose requests are being visited. */
        struct blk_mq_tags *tags;
        /* Callback invoked as fn(rq, data). */
        busy_tag_iter_fn *fn;
        /* Opaque pointer passed through to fn. */
        void *data;
        /* Combination of the BT_TAG_ITER_* bits below. */
        unsigned int flags;
};

/* The bitmap being walked is the reserved one: bit numbers are not offset. */
#define BT_TAG_ITER_RESERVED                (1 << 0)
/* Only call fn for requests for which blk_mq_request_started() is true. */
#define BT_TAG_ITER_STARTED                (1 << 1)
/* Read requests from ->static_rqs[] without taking a reference. */
#define BT_TAG_ITER_STATIC_RQS                (1 << 2)

/*
 * sbitmap_for_each_set() callback used by bt_tags_for_each(). Resolves
 * @bitnr to a request - from ->static_rqs[] without a reference when
 * BT_TAG_ITER_STATIC_RQS is set, otherwise from ->rqs[] with a reference
 * held - and calls the user callback on it, subject to BT_TAG_ITER_STARTED.
 *
 * Returns the user callback's result, or true if it was not called.
 */
static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
        struct bt_tags_iter_data *iter_data = data;
        struct blk_mq_tags *tags = iter_data->tags;
        const unsigned int flags = iter_data->flags;
        const bool use_static = flags & BT_TAG_ITER_STATIC_RQS;
        struct request *rq;
        bool keep_going = true;

        if (!(flags & BT_TAG_ITER_RESERVED))
                bitnr += tags->nr_reserved_tags;

        /*
         * A NULL request is possible here: the tagging code sets the bit
         * before it assigns ->rqs[].
         */
        rq = use_static ? tags->static_rqs[bitnr] :
                          blk_mq_find_and_get_req(tags, bitnr);
        if (!rq)
                return true;

        if (!(flags & BT_TAG_ITER_STARTED) || blk_mq_request_started(rq))
                keep_going = iter_data->fn(rq, iter_data->data);

        /* Only the ->rqs[] lookup took a reference that must be dropped. */
        if (!use_static)
                blk_mq_put_rq_ref(rq);

        return keep_going;
}

/**
 * bt_tags_for_each - iterate over the requests in a tag map
 * @tags:        Tag map to iterate over.
 * @bt:                sbitmap to examine. This is either the breserved_tags member
 *                or the bitmap_tags member of struct blk_mq_tags.
 * @fn:                Callback invoked as @fn(rq, @data) for each request selected
 *                according to @flags. Its return value is handed back to
 *                sbitmap_for_each_set(): true to continue iterating tags,
 *                false to stop.
 * @data:        Will be passed as second argument to @fn.
 * @flags:        BT_TAG_ITER_*
 *
 * Nothing is iterated when @tags->rqs is NULL.
 */
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
                             busy_tag_iter_fn *fn, void *data, unsigned int flags)
{
        struct bt_tags_iter_data walk = {
                .tags = tags,
                .flags = flags,
                .fn = fn,
                .data = data,
        };

        if (!tags->rqs)
                return;

        sbitmap_for_each_set(&bt->sb, bt_tags_iter, &walk);
}

/*
 * Walk the reserved bitmap (if the map has reserved tags) and then the normal
 * bitmap of @tags, calling @fn as described by @flags. Callers must not pass
 * BT_TAG_ITER_RESERVED themselves; it is added here for the reserved walk.
 */
static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
                busy_tag_iter_fn *fn, void *priv, unsigned int flags)
{
        struct sbitmap_queue *reserved_bt = &tags->breserved_tags;
        struct sbitmap_queue *normal_bt = &tags->bitmap_tags;

        WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);

        if (tags->nr_reserved_tags)
                bt_tags_for_each(tags, reserved_bt, fn, priv,
                                 flags | BT_TAG_ITER_RESERVED);

        bt_tags_for_each(tags, normal_bt, fn, priv, flags);
}

/**
 * blk_mq_all_tag_iter - iterate over all requests in a tag map
 * @tags:        Tag map to iterate over.
 * @fn:                Callback invoked as @fn(rq, @priv) for each request. Return
 *                true to continue iterating tags, false to stop.
 * @priv:        Will be passed as second argument to @fn.
 *
 * Requests are taken from the static request array of @tags
 * (BT_TAG_ITER_STATIC_RQS). Caller has to pass the tag map from which
 * requests are allocated.
 */
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
                void *priv)
{
        const unsigned int iter_flags = BT_TAG_ITER_STATIC_RQS;

        __blk_mq_all_tag_iter(tags, fn, priv, iter_flags);
}

/**
 * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
 * @tagset:        Tag set to iterate over.
 * @fn:                Callback invoked as @fn(rq, @priv) for each started request.
 *                Return true to continue iterating tags, false to stop.
 * @priv:        Will be passed as second argument to @fn.
 *
 * With shared tags only the first tag map is walked; otherwise one map per
 * hardware queue is walked. Missing maps are skipped.
 *
 * We grab one request reference before calling @fn and release it after
 * @fn returns.
 */
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                busy_tag_iter_fn *fn, void *priv)
{
        const int nr_maps = blk_mq_is_shared_tags(tagset->flags) ?
                                1 : tagset->nr_hw_queues;
        int i;

        for (i = 0; i < nr_maps; i++) {
                if (!tagset->tags || !tagset->tags[i])
                        continue;

                __blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
                                      BT_TAG_ITER_STARTED);
        }
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);

/*
 * busy_tag_iter_fn that bumps the unsigned counter behind @data for every
 * request blk_mq_request_completed() reports as completed. Always returns
 * true so that iteration continues.
 */
static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data)
{
        unsigned *completed = data;

        if (!blk_mq_request_completed(rq))
                return true;

        ++*completed;
        return true;
}

/**
 * blk_mq_tagset_wait_completed_request - Wait until all scheduled request
 * completions have finished.
 * @tagset:        Tag set to drain completed request
 *
 * Polls the tag set every 5 ms until no started request is reported as
 * completed any more.
 *
 * Note: This function has to be run after all IO queues are shutdown
 */
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
{
        unsigned completed;

        for (;;) {
                completed = 0;
                blk_mq_tagset_busy_iter(tagset,
                                blk_mq_tagset_count_completed_rqs, &completed);
                if (!completed)
                        return;
                msleep(5);
        }
}
EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);

/**
 * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
 * @q:                Request queue to examine.
 * @fn:                Callback invoked as @fn(rq, @priv) for each request found.
 *                Return true to continue iterating tags, false to stop.
 * @priv:        Will be passed as second argument to @fn.
 *
 * Reserved tags are walked before normal tags. The per-bit callback
 * bt_iter() only calls @fn for requests whose ->q is @q. Nothing is iterated
 * if a reference on @q->q_usage_counter cannot be obtained.
 */
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
                void *priv)
{
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_tags *tags;
        unsigned long i;

        /*
         * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table
         * while the queue is frozen. So we can use q_usage_counter to avoid
         * racing with it.
         */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;

        if (!blk_mq_is_shared_tags(q->tag_set->flags)) {
                queue_for_each_hw_ctx(q, hctx, i) {
                        /*
                         * If no software queues are currently mapped to this
                         * hardware queue, there's nothing to check
                         */
                        if (!blk_mq_hw_queue_mapped(hctx))
                                continue;

                        tags = hctx->tags;
                        if (tags->nr_reserved_tags)
                                bt_for_each(hctx, q, &tags->breserved_tags,
                                            fn, priv, true);
                        bt_for_each(hctx, q, &tags->bitmap_tags, fn, priv,
                                    false);
                }
        } else {
                /* One tag map serves every hctx: walk it once, unfiltered. */
                tags = q->tag_set->shared_tags;
                if (tags->nr_reserved_tags)
                        bt_for_each(NULL, q, &tags->breserved_tags, fn, priv,
                                    true);
                bt_for_each(NULL, q, &tags->bitmap_tags, fn, priv, false);
        }

        blk_queue_exit(q);
}

/*
 * Initialise @bt with @depth bits on NUMA node @node using GFP_KERNEL.
 * Returns whatever sbitmap_queue_init_node() returns.
 */
static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
                    bool round_robin, int node)
{
        int rc;

        rc = sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
                                     node);
        return rc;
}

/*
 * Allocate the normal and reserved tag bitmaps. The normal bitmap gets
 * @queue_depth - @reserved bits, the reserved bitmap gets @reserved bits;
 * round-robin allocation is used when @alloc_policy is BLK_TAG_ALLOC_RR.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails, in which case
 * no bitmap is left allocated.
 */
int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
                        struct sbitmap_queue *breserved_tags,
                        unsigned int queue_depth, unsigned int reserved,
                        int node, int alloc_policy)
{
        const bool round_robin = (alloc_policy == BLK_TAG_ALLOC_RR);
        const unsigned int depth = queue_depth - reserved;

        if (bt_alloc(bitmap_tags, depth, round_robin, node))
                return -ENOMEM;

        if (!bt_alloc(breserved_tags, reserved, round_robin, node))
                return 0;

        /* Second allocation failed: undo the first one. */
        sbitmap_queue_free(bitmap_tags);
        return -ENOMEM;
}

/*
 * Allocate a zeroed struct blk_mq_tags on @node and initialise its lock and
 * its two tag bitmaps.
 *
 * Returns the new object, or NULL when @total_tags exceeds BLK_MQ_TAG_MAX,
 * when the allocation fails, or when the bitmaps cannot be initialised.
 */
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
                                     unsigned int reserved_tags,
                                     int node, int alloc_policy)
{
        struct blk_mq_tags *tags;
        int err;

        if (total_tags > BLK_MQ_TAG_MAX) {
                pr_err("blk-mq: tag depth too large\n");
                return NULL;
        }

        tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
        if (!tags)
                return NULL;

        spin_lock_init(&tags->lock);
        tags->nr_reserved_tags = reserved_tags;
        tags->nr_tags = total_tags;

        err = blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
                                  total_tags, reserved_tags, node,
                                  alloc_policy);
        if (err >= 0)
                return tags;

        /* Bitmap setup failed: release the container as well. */
        kfree(tags);
        return NULL;
}

/*
 * Release both tag bitmaps (regular first, then reserved) and finally the
 * struct blk_mq_tags itself.
 */
void blk_mq_free_tags(struct blk_mq_tags *tags)
{
        struct sbitmap_queue *normal = &tags->bitmap_tags;
        struct sbitmap_queue *reserved = &tags->breserved_tags;

        sbitmap_queue_free(normal);
        sbitmap_queue_free(reserved);
        kfree(tags);
}

/*
 * Change the tag depth of *@tagsptr to @tdepth.
 *
 * Shrinking (or keeping) the depth only resizes the regular bitmap. Growing
 * beyond the current nr_tags requires @can_grow and replaces *@tagsptr with
 * a freshly allocated map, except for shared tags where nothing is done
 * here.
 *
 * Returns 0 on success, -EINVAL if @tdepth does not exceed the number of
 * reserved tags, if growing is not permitted, or if @tdepth is above
 * MAX_SCHED_RQ, and -ENOMEM if the new map cannot be allocated.
 */
int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                            struct blk_mq_tags **tagsptr, unsigned int tdepth,
                            bool can_grow)
{
        struct blk_mq_tags *tags = *tagsptr;
        struct blk_mq_tag_set *set;
        struct blk_mq_tags *new;

        if (tdepth <= tags->nr_reserved_tags)
                return -EINVAL;

        if (tdepth <= tags->nr_tags) {
                /*
                 * Don't need (or can't) update reserved tags here, they
                 * remain static and should never need resizing.
                 */
                sbitmap_queue_resize(&tags->bitmap_tags,
                                     tdepth - tags->nr_reserved_tags);
                return 0;
        }

        /*
         * From here on we are growing beyond the original size: a new set
         * of tags is allocated before the old one is freed.
         */
        set = hctx->queue->tag_set;

        /*
         * Growing must be allowed, and we need some sort of upper limit;
         * it is set high enough that no valid use cases should require more.
         */
        if (!can_grow || tdepth > MAX_SCHED_RQ)
                return -EINVAL;

        /*
         * Only the sbitmap needs resizing since we allocated the max
         * initially.
         */
        if (blk_mq_is_shared_tags(set->flags))
                return 0;

        new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
        if (!new)
                return -ENOMEM;

        blk_mq_free_map_and_rqs(set, tags, hctx->queue_num);
        *tagsptr = new;

        return 0;
}

/*
 * Resize the regular bitmap of the tag set's shared tags to @size minus the
 * set's reserved tag count.
 */
void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
{
        struct sbitmap_queue *bt = &set->shared_tags->bitmap_tags;

        sbitmap_queue_resize(bt, size - set->reserved_tags);
}

/*
 * Resize the regular bitmap of the queue's scheduler shared tags to
 * q->nr_requests minus the tag set's reserved tag count.
 */
void blk_mq_tag_update_sched_shared_tags(struct request_queue *q)
{
        struct sbitmap_queue *bt = &q->sched_shared_tags->bitmap_tags;

        sbitmap_queue_resize(bt, q->nr_requests - q->tag_set->reserved_tags);
}

/**
 * blk_mq_unique_tag() - return a tag that is unique queue-wide
 * @rq: request for which to compute a unique tag
 *
 * A request's tag is only unique within its hardware queue. This helper
 * builds a value that is unique across all hardware queues by placing the
 * hardware context index (queue_num) in the bits above
 * BLK_MQ_UNIQUE_TAG_BITS and the per hardware queue tag, masked with
 * BLK_MQ_UNIQUE_TAG_MASK, in the lower bits.
 *
 * Note: When called for a request that is queued on a non-multiqueue request
 * queue, the hardware context index is set to zero.
 */
u32 blk_mq_unique_tag(struct request *rq)
{
        const u32 hwq_bits = rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS;
        const u32 tag_bits = rq->tag & BLK_MQ_UNIQUE_TAG_MASK;

        return hwq_bits | tag_bits;
}
EXPORT_SYMBOL(blk_mq_unique_tag);



























    1 























    1 























    3 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H

#include "elevator.h"
#include "blk-mq.h"

/* Limit defined as 16 times BLKDEV_DEFAULT_RQ. */
#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)

/*
 * Declarations only; the function bodies are not part of this header.
 * NOTE(review): the grouping below follows the function names (merge
 * helpers, restart helpers, dispatch, scheduler init/exit) - confirm against
 * the implementations.
 */
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs, struct request **merged_request);
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
                                   struct list_head *free);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
/* Called by blk_mq_sched_restart() below once the restart bit is seen set. */
void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);

int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_rqs(struct request_queue *q);

/*
 * Call __blk_mq_sched_restart() for @hctx, but only if the
 * BLK_MQ_S_SCHED_RESTART bit is set in its state.
 */
static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
        if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                return;

        __blk_mq_sched_restart(hctx);
}

/* True when none of the REQ_NOMERGE_FLAGS bits are set in @bio's bi_opf. */
static inline bool bio_mergeable(struct bio *bio)
{
        if (bio->bi_opf & REQ_NOMERGE_FLAGS)
                return false;

        return true;
}

/*
 * Ask the elevator whether @bio may be merged into @rq.
 *
 * The elevator is only consulted for requests flagged RQF_USE_SCHED and only
 * if it provides an allow_merge hook; in every other case the merge is
 * allowed.
 */
static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
                         struct bio *bio)
{
        struct elevator_queue *e;

        if (!(rq->rq_flags & RQF_USE_SCHED))
                return true;

        e = q->elevator;
        if (!e->type->ops.allow_merge)
                return true;

        return e->type->ops.allow_merge(q, rq, bio);
}

/*
 * Forward a completion to the elevator's completed_request hook, passing
 * @now through. Nothing happens unless @rq is flagged RQF_USE_SCHED and the
 * hook exists.
 */
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{
        struct elevator_queue *e;

        if (!(rq->rq_flags & RQF_USE_SCHED))
                return;

        e = rq->q->elevator;
        if (!e->type->ops.completed_request)
                return;

        e->type->ops.completed_request(rq, now);
}

/*
 * Forward a requeue to the elevator's requeue_request hook. Nothing happens
 * unless @rq is flagged RQF_USE_SCHED and the hook exists.
 */
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
        struct elevator_queue *e;

        if (!(rq->rq_flags & RQF_USE_SCHED))
                return;

        e = rq->q->elevator;
        if (!e->type->ops.requeue_request)
                return;

        e->type->ops.requeue_request(rq);
}

/*
 * Return the result of the elevator's has_work hook for @hctx, or false when
 * the queue has no elevator or the elevator has no such hook.
 */
static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
{
        struct elevator_queue *e = hctx->queue->elevator;

        if (!e || !e->type->ops.has_work)
                return false;

        return e->type->ops.has_work(hctx);
}

/* Report whether BLK_MQ_S_SCHED_RESTART is currently set in @hctx's state. */
static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
{
        const bool restart_set = test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

        return restart_set;
}

#endif

















































    1 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H

#include <linux/build_bug.h>
#include <linux/compiler.h>

#ifndef __ASSEMBLY__

#include <linux/cache.h>
#include <asm/percpu.h>

struct task_struct;

/*
 * Per-CPU data grouped into a single 64-byte object (declared per-CPU via
 * DECLARE_PER_CPU_ALIGNED in this header).
 *
 * The fields live in an anonymous struct that is overlaid with a 64-byte pad
 * array, so the union is at least 64 bytes; the static_assert after the
 * definition checks that it is exactly 64 bytes.
 */
struct pcpu_hot {
        union {
                struct {
                        /* Read by get_current() to implement 'current'. */
                        struct task_struct        *current_task;
                        int                        preempt_count;
                        int                        cpu_number;
#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
                        /* Only present with call depth tracking configured. */
                        u64                        call_depth;
#endif
                        unsigned long                top_of_stack;
                        void                        *hardirq_stack_ptr;
                        u16                        softirq_pending;
#ifdef CONFIG_X86_64
                        /* 64-bit builds carry this flag ... */
                        bool                        hardirq_stack_inuse;
#else
                        /* ... other builds carry a softirq stack pointer. */
                        void                        *softirq_stack_ptr;
#endif
                };
                /* Fixes the minimum size of the union at 64 bytes. */
                u8        pad[64];
        };
};
/* Build-time check: the fields above must not outgrow the 64-byte pad. */
static_assert(sizeof(struct pcpu_hot) == 64);

DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);

/* const-qualified alias to pcpu_hot, aliased by linker. */
DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
                        const_pcpu_hot);

/*
 * Return this CPU's current_task pointer.
 *
 * With CONFIG_USE_X86_SEG_SUPPORT enabled the value is read through the
 * const-qualified const_pcpu_hot alias using this_cpu_read_const();
 * otherwise it is read from pcpu_hot using this_cpu_read_stable().
 */
static __always_inline struct task_struct *get_current(void)
{
        if (!IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
                return this_cpu_read_stable(pcpu_hot.current_task);

        return this_cpu_read_const(const_pcpu_hot.current_task);
}

#define current get_current()

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_CURRENT_H */























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 






    1 




































































































































































































    1 




    1 











































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/atomic.h>
#include <linux/vmalloc.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "dev-replace.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "bio.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES   4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV ((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL ((u64)-2)

/*
 * Location of the first zone of superblock logging zone pairs.
 *
 * - primary superblock:    0B (zone 0)
 * - first copy:          512G (zone starting at that offset)
 * - second copy:           4T (zone starting at that offset)
 */
#define BTRFS_SB_LOG_PRIMARY_OFFSET        (0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET        (512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET        (4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT        const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT        const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2

/*
 * Minimum of active zones we need:
 *
 * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
 * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
 * - 1 zone for tree-log dedicated block group
 * - 1 zone for relocation
 */
#define BTRFS_MIN_ACTIVE_ZONES                (BTRFS_SUPER_MIRROR_MAX + 5)

/*
 * Minimum / maximum supported zone size. Currently, SMR disks have a zone
 * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
 * We do not expect the zone size to become larger than 8GiB or smaller than
 * 4MiB in the near future.
 */
#define BTRFS_MAX_ZONE_SIZE                SZ_8G
#define BTRFS_MIN_ZONE_SIZE                SZ_4M

#define SUPER_INFO_SECTORS        ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)

static void wait_eb_writebacks(struct btrfs_block_group *block_group);
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written);

/*
 * A superblock log zone counts as full when its condition is
 * BLK_ZONE_COND_FULL, or when fewer than SUPER_INFO_SECTORS sectors remain
 * between the write pointer and the end of the zone's capacity.
 */
static inline bool sb_zone_is_full(const struct blk_zone *zone)
{
        if (zone->cond == BLK_ZONE_COND_FULL)
                return true;

        return zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity;
}

/*
 * Zone report callback: copy the reported @zone into slot @idx of the
 * struct blk_zone array passed as @data. Always returns 0.
 */
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
        struct blk_zone *slot = (struct blk_zone *)data + idx;

        memcpy(slot, zone, sizeof(struct blk_zone));

        return 0;
}

/*
 * Determine the byte position to use within a pair of superblock log zones.
 *
 * @bdev:   block device; its page cache mapping is used to read superblocks
 * @zones:  the BTRFS_NR_SB_LOG_ZONES zones forming the log pair
 * @wp_ret: receives the resulting position in bytes
 *
 * Returns 0 on success, -ENOENT when both zones are empty (@wp_ret is then
 * the start of zones[0]), -EUCLEAN for an invalid combination of zone
 * states, or the error from reading a superblock page.
 */
static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
                            u64 *wp_ret)
{
        bool empty[BTRFS_NR_SB_LOG_ZONES];
        bool full[BTRFS_NR_SB_LOG_ZONES];
        sector_t sector;
        int i;

        for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
                ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
                full[i] = sb_zone_is_full(&zones[i]);
                empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
        }

        /*
         * Possible states of log buffer zones
         *
         *           Empty[0]  In use[0]  Full[0]
         * Empty[1]         *          0        1
         * In use[1]        x          x        1
         * Full[1]          0          0        C
         *
         * Log position:
         *   *: Special case, no superblock is written
         *   0: Use write pointer of zones[0]
         *   1: Use write pointer of zones[1]
         *   C: Compare super blocks from zones[0] and zones[1], use the latest
         *      one determined by generation
         *   x: Invalid state
         */

        /* '*': nothing written yet, lets the caller tell there is no superblock. */
        if (empty[0] && empty[1]) {
                *wp_ret = zones[0].start << SECTOR_SHIFT;
                return -ENOENT;
        }

        if (full[0] && full[1]) {
                /* 'C': read the last superblock of each zone and compare. */
                struct address_space *mapping = bdev->bd_mapping;
                struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];

                for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
                        const u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
                        const u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
                                           BTRFS_SUPER_INFO_SIZE;
                        struct page *page;

                        page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT,
                                                   GFP_NOFS);
                        if (IS_ERR(page)) {
                                /* Drop the first superblock if the second read failed. */
                                if (i == 1)
                                        btrfs_release_disk_super(super[0]);
                                return PTR_ERR(page);
                        }
                        super[i] = page_address(page);
                }

                sector = (btrfs_super_generation(super[0]) >
                          btrfs_super_generation(super[1])) ?
                                zones[1].start : zones[0].start;

                for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
                        btrfs_release_disk_super(super[i]);
        } else if (full[0]) {
                /* '1': zones[0] is full while zones[1] is not. */
                sector = zones[1].wp;
        } else if (empty[1] || full[1]) {
                /* '0': zones[0] still has room and zones[1] is empty or full. */
                sector = zones[0].wp;
        } else {
                /* 'x': zones[1] in use while zones[0] is not full. */
                return -EUCLEAN;
        }

        *wp_ret = sector << SECTOR_SHIFT;
        return 0;
}

/*
 * Return the number of the first zone used by superblock mirror @mirror,
 * given a zone size of 2^@shift bytes.
 *
 * Mirror 0 starts at zone 0; mirrors 1 and 2 start at the zones containing
 * BTRFS_SB_LOG_FIRST_OFFSET and BTRFS_SB_LOG_SECOND_OFFSET respectively.
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
        u64 zone = U64_MAX;

        ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);

        if (mirror == 0)
                zone = 0;
        else if (mirror == 1)
                zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift);
        else if (mirror == 2)
                zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift);

        ASSERT(zone <= U32_MAX);

        return (u32)zone;
}

/*
 * Convert a zone number to the zone's first sector by shifting it left by
 * log2 of the device's zone size in sectors.
 */
static inline sector_t zone_start_sector(u32 zone_number,
                                         struct block_device *bdev)
{
        sector_t start = zone_number;

        start <<= ilog2(bdev_zone_sectors(bdev));
        return start;
}

/*
 * Convert a zone number to a u64 offset by shifting it left by the
 * zone_size_shift recorded in @zone_info.
 */
static inline u64 zone_start_physical(u32 zone_number,
                                      struct btrfs_zoned_device_info *zone_info)
{
        u64 physical = zone_number;

        physical <<= zone_info->zone_size_shift;
        return physical;
}

/*
 * Stand-in for blkdev_report_zones() on a non-zoned device: the device is
 * cut into equally sized chunks of fs_info->zone_size and each chunk is
 * reported as a conventional zone.
 *
 * Filling starts at byte offset @pos and stops after @nr_zones entries or
 * after the first zone whose end reaches the device size, whichever comes
 * first. Returns the number of entries written to @zones.
 */
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
                                struct blk_zone *zones, unsigned int nr_zones)
{
        const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
        sector_t bdev_size = bdev_nr_sectors(device->bdev);
        const u64 first_sector = pos >> SECTOR_SHIFT;
        unsigned int filled = 0;

        while (filled < nr_zones) {
                struct blk_zone *z = &zones[filled];

                z->start = filled * zone_sectors + first_sector;
                z->len = zone_sectors;
                z->capacity = zone_sectors;
                z->wp = z->start + zone_sectors;
                z->type = BLK_ZONE_TYPE_CONVENTIONAL;
                z->cond = BLK_ZONE_COND_NOT_WP;

                /* The zone just written counts, even if it is the last one. */
                filled++;

                if (z->wp >= bdev_size)
                        break;
        }

        return filled;
}

/*
 * Fetch zone descriptors of @device starting at byte offset @pos.
 *
 * On entry *@nr_zones is the number of entries @zones can hold; on success it
 * is updated to the number of entries actually stored. Non-zoned devices get
 * emulated zones. For zoned devices the per-device zone cache is consulted
 * first (when present) and refreshed after a real report.
 *
 * Returns 0 on success, a negative errno from blkdev_report_zones(), or -EIO
 * when the device reports no zone at all.
 */
static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
                               struct blk_zone *zones, unsigned int *nr_zones)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        int ret;

        if (*nr_zones == 0)
                return 0;

        if (!bdev_is_zoned(device->bdev)) {
                *nr_zones = emulate_report_zones(device, pos, zones, *nr_zones);
                return 0;
        }

        /* Try to serve the request from the cache. */
        if (zinfo->zone_cache) {
                const struct blk_zone *cached;
                unsigned int nr_cached = 0;
                u32 zno;

                ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
                zno = pos >> zinfo->zone_size_shift;
                /*
                 * Zones past the end of the device cannot be reported, so
                 * clamping *nr_zones to the remaining zones is fine.
                 */
                *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);

                /* An entry with a zero length has not been filled in yet. */
                cached = zinfo->zone_cache + zno;
                while (nr_cached < *nr_zones && cached[nr_cached].len)
                        nr_cached++;

                if (nr_cached == *nr_zones) {
                        /* Every requested zone is cached. */
                        memcpy(zones, cached, sizeof(*cached) * nr_cached);
                        return 0;
                }
        }

        ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
                                  copy_zone_info_cb, zones);
        if (ret < 0) {
                btrfs_err_in_rcu(device->fs_info,
                                 "zoned: failed to read zone %llu on %s (devid %llu)",
                                 pos, rcu_str_deref(device->name),
                                 device->devid);
                return ret;
        }

        *nr_zones = ret;
        if (ret == 0)
                return -EIO;

        /* Remember what the device just told us. */
        if (zinfo->zone_cache) {
                u32 first = pos >> zinfo->zone_size_shift;

                memcpy(&zinfo->zone_cache[first], zones,
                       sizeof(*zinfo->zone_cache) * *nr_zones);
        }

        return 0;
}

/* The emulated zone size is determined from the size of device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
        struct btrfs_path *path;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_dev_extent *dext;
        int ret = 0;

        key.objectid = 1;
        key.type = BTRFS_DEV_EXTENT_KEY;
        key.offset = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                ret = btrfs_next_leaf(root, path);
                if (ret < 0)
                        goto out;
                /* No dev extents at all? Not good */
                if (ret > 0) {
                        ret = -EUCLEAN;
                        goto out;
                }
        }

        leaf = path->nodes[0];
        dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
        fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
        ret = 0;

out:
        btrfs_free_path(path);

        return ret;
}

/*
 * Load zone information for every present device of the filesystem, with the
 * zone cache enabled. Stops at, and returns, the first error; returns 0 when
 * all devices succeed or the filesystem is not zoned.
 */
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *dev;
        int err = 0;

        /* fs_info->zone_size might not be set yet. Use the incompat flag here. */
        if (!btrfs_fs_incompat(fs_info, ZONED))
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(dev, &fs_devices->devices, dev_list) {
                /* A missing device has no bdev, so there is nothing to read. */
                if (dev->bdev) {
                        err = btrfs_get_dev_zone_info(dev, true);
                        if (err)
                                break;
                }
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        return err;
}

/*
 * Build device->zone_info for @device.
 *
 * Determines the zone size (real or emulated), validates it and the
 * max-active-zones limit, allocates the per-zone bitmaps, walks all zones of
 * the device to fill them in, and finally loads and validates the superblock
 * log zones.
 *
 * @device:         device to inspect; device->bdev is used throughout
 * @populate_cache: when true and the device is zoned, also allocate
 *                  zone_info->zone_cache so reported zones get cached
 *
 * Returns 0 on success, and also when the filesystem lacks the ZONED incompat
 * flag or the device already has zone_info. On failure a negative errno is
 * returned and device->zone_info is torn down again via
 * btrfs_destroy_dev_zone_info().
 */
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_zoned_device_info *zone_info = NULL;
        struct block_device *bdev = device->bdev;
        unsigned int max_active_zones;
        unsigned int nactive;
        sector_t nr_sectors;
        sector_t sector = 0;
        struct blk_zone *zones = NULL;
        unsigned int i, nreported = 0, nr_zones;
        sector_t zone_sectors;
        char *model, *emulated;
        int ret;

        /*
         * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
         * yet be set.
         */
        if (!btrfs_fs_incompat(fs_info, ZONED))
                return 0;

        /* Zone info was already loaded for this device. */
        if (device->zone_info)
                return 0;

        zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
        if (!zone_info)
                return -ENOMEM;

        /*
         * Publish early: btrfs_get_dev_zones() below reads device->zone_info,
         * and the error path frees it through btrfs_destroy_dev_zone_info().
         */
        device->zone_info = zone_info;

        if (!bdev_is_zoned(bdev)) {
                /* Emulated zones: size comes from the filesystem, not the bdev. */
                if (!fs_info->zone_size) {
                        ret = calculate_emulated_zone_size(fs_info);
                        if (ret)
                                goto out;
                }

                ASSERT(fs_info->zone_size);
                zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
        } else {
                zone_sectors = bdev_zone_sectors(bdev);
        }

        ASSERT(is_power_of_two_u64(zone_sectors));
        zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

        /* Reject zone sizes outside [BTRFS_MIN_ZONE_SIZE, BTRFS_MAX_ZONE_SIZE] */
        if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
                btrfs_err_in_rcu(fs_info,
                "zoned: %s: zone size %llu larger than supported maximum %llu",
                                 rcu_str_deref(device->name),
                                 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
                ret = -EINVAL;
                goto out;
        } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
                btrfs_err_in_rcu(fs_info,
                "zoned: %s: zone size %llu smaller than supported minimum %u",
                                 rcu_str_deref(device->name),
                                 zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
                ret = -EINVAL;
                goto out;
        }

        nr_sectors = bdev_nr_sectors(bdev);
        zone_info->zone_size_shift = ilog2(zone_info->zone_size);
        /* Round the zone count up so a trailing partial zone is counted too. */
        zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
        if (!IS_ALIGNED(nr_sectors, zone_sectors))
                zone_info->nr_zones++;

        /* A limit of 0 is treated as "no limit" here and further below. */
        max_active_zones = bdev_max_active_zones(bdev);
        if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
                btrfs_err_in_rcu(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
                                 rcu_str_deref(device->name), max_active_zones,
                                 BTRFS_MIN_ACTIVE_ZONES);
                ret = -EINVAL;
                goto out;
        }
        zone_info->max_active_zones = max_active_zones;

        /* One bit per zone in each of the three bitmaps. */
        zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
        if (!zone_info->seq_zones) {
                ret = -ENOMEM;
                goto out;
        }

        zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
        if (!zone_info->empty_zones) {
                ret = -ENOMEM;
                goto out;
        }

        zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
        if (!zone_info->active_zones) {
                ret = -ENOMEM;
                goto out;
        }

        /* Scratch buffer for one batch of zone reports; freed before return. */
        zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
        if (!zones) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * Enable zone cache only for a zoned device. On a non-zoned device, we
         * fill the zone info with emulated CONVENTIONAL zones, so no need to
         * use the cache.
         */
        if (populate_cache && bdev_is_zoned(device->bdev)) {
                zone_info->zone_cache = vcalloc(zone_info->nr_zones,
                                                sizeof(struct blk_zone));
                if (!zone_info->zone_cache) {
                        btrfs_err_in_rcu(device->fs_info,
                                "zoned: failed to allocate zone cache for %s",
                                rcu_str_deref(device->name));
                        ret = -ENOMEM;
                        goto out;
                }
        }

        /* Get zones type */
        nactive = 0;
        while (sector < nr_sectors) {
                /* Request a full batch; the callee stores the real count. */
                nr_zones = BTRFS_REPORT_NR_ZONES;
                ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
                                          &nr_zones);
                if (ret)
                        goto out;

                for (i = 0; i < nr_zones; i++) {
                        /* nreported doubles as the index of the current zone. */
                        if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
                                __set_bit(nreported, zone_info->seq_zones);
                        switch (zones[i].cond) {
                        case BLK_ZONE_COND_EMPTY:
                                __set_bit(nreported, zone_info->empty_zones);
                                break;
                        case BLK_ZONE_COND_IMP_OPEN:
                        case BLK_ZONE_COND_EXP_OPEN:
                        case BLK_ZONE_COND_CLOSED:
                                __set_bit(nreported, zone_info->active_zones);
                                nactive++;
                                break;
                        }
                        nreported++;
                }
                /* Continue right after the last zone of this batch. */
                sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
        }

        if (nreported != zone_info->nr_zones) {
                btrfs_err_in_rcu(device->fs_info,
                                 "inconsistent number of zones on %s (%u/%u)",
                                 rcu_str_deref(device->name), nreported,
                                 zone_info->nr_zones);
                ret = -EIO;
                goto out;
        }

        if (max_active_zones) {
                if (nactive > max_active_zones) {
                        btrfs_err_in_rcu(device->fs_info,
                        "zoned: %u active zones on %s exceeds max_active_zones %u",
                                         nactive, rcu_str_deref(device->name),
                                         max_active_zones);
                        ret = -EIO;
                        goto out;
                }
                /* Remaining budget of zones that may still be activated. */
                atomic_set(&zone_info->active_zones_left,
                           max_active_zones - nactive);
                set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
        }

        /* Validate superblock log */
        nr_zones = BTRFS_NR_SB_LOG_ZONES;
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                u32 sb_zone;
                u64 sb_wp;
                int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

                /* Skip mirrors whose log zones do not fit on this device. */
                sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
                if (sb_zone + 1 >= zone_info->nr_zones)
                        continue;

                ret = btrfs_get_dev_zones(device,
                                          zone_start_physical(sb_zone, zone_info),
                                          &zone_info->sb_zones[sb_pos],
                                          &nr_zones);
                if (ret)
                        goto out;

                if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
                        btrfs_err_in_rcu(device->fs_info,
        "zoned: failed to read super block log zone info at devid %llu zone %u",
                                         device->devid, sb_zone);
                        ret = -EUCLEAN;
                        goto out;
                }

                /*
                 * If zones[0] is conventional, always use the beginning of the
                 * zone to record superblock. No need to validate in that case.
                 */
                if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
                    BLK_ZONE_TYPE_CONVENTIONAL)
                        continue;

                /* -ENOENT is tolerated here; any other failure is corruption. */
                ret = sb_write_pointer(device->bdev,
                                       &zone_info->sb_zones[sb_pos], &sb_wp);
                if (ret != -ENOENT && ret) {
                        btrfs_err_in_rcu(device->fs_info,
                        "zoned: super block log zone corrupted devid %llu zone %u",
                                         device->devid, sb_zone);
                        ret = -EUCLEAN;
                        goto out;
                }
        }


        kvfree(zones);

        if (bdev_is_zoned(bdev)) {
                model = "host-managed zoned";
                emulated = "";
        } else {
                model = "regular";
                emulated = "emulated ";
        }

        btrfs_info_in_rcu(fs_info,
                "%s block device %s, %u %szones of %llu bytes",
                model, rcu_str_deref(device->name), zone_info->nr_zones,
                emulated, zone_info->zone_size);

        return 0;

out:
        /* Shared failure path: drop the scratch buffer and all zone info. */
        kvfree(zones);
        btrfs_destroy_dev_zone_info(device);
        return ret;
}

/*
 * Release the zone information attached to @device (bitmaps, zone cache and
 * the structure itself) and clear device->zone_info. Does nothing when the
 * device has no zone information.
 */
void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;

        if (zinfo) {
                bitmap_free(zinfo->active_zones);
                bitmap_free(zinfo->seq_zones);
                bitmap_free(zinfo->empty_zones);
                vfree(zinfo->zone_cache);
                kfree(zinfo);
                device->zone_info = NULL;
        }
}

/*
 * Create an independent copy of @orig_dev's zone information.
 *
 * The scalar fields are duplicated and the seq/empty/active bitmaps are deep
 * copied. The zone cache is not cloned; the copy starts without one.
 *
 * Returns the new structure, or NULL if any allocation fails. The caller owns
 * the result.
 */
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
{
        struct btrfs_zoned_device_info *zone_info;

        zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
        if (!zone_info)
                return NULL;

        /*
         * kmemdup() made a shallow copy, so the pointer members still refer
         * to memory owned by the original device. Clear them before anything
         * can fail: otherwise the error path below would bitmap_free() the
         * original's bitmaps and leave it with dangling pointers.
         */
        zone_info->seq_zones = NULL;
        zone_info->empty_zones = NULL;
        zone_info->active_zones = NULL;
        zone_info->zone_cache = NULL;

        zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
        if (!zone_info->seq_zones)
                goto out;

        bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
                    zone_info->nr_zones);

        zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
        if (!zone_info->empty_zones)
                goto out;

        bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
                    zone_info->nr_zones);

        zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
        if (!zone_info->active_zones)
                goto out;

        bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
                    zone_info->nr_zones);

        return zone_info;

out:
        /* Only bitmaps allocated above are non-NULL at this point. */
        bitmap_free(zone_info->seq_zones);
        bitmap_free(zone_info->empty_zones);
        bitmap_free(zone_info->active_zones);
        kfree(zone_info);
        return NULL;
}

/*
 * Fetch the single zone descriptor at byte offset @pos of @device into @zone.
 *
 * Returns 0 on success, the error from btrfs_get_dev_zones(), or -EIO when
 * that call succeeds without reporting any zone.
 */
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
                       struct blk_zone *zone)
{
        unsigned int nr_reported = 1;
        int ret;

        ret = btrfs_get_dev_zones(device, pos, zone, &nr_reported);
        if (ret)
                return ret;
        if (nr_reported == 0)
                return -EIO;

        return 0;
}

/*
 * Used when the filesystem is not in zoned mode: fail with -EINVAL as soon as
 * any present device turns out to be a zoned block device. Returns 0 when no
 * such device exists.
 */
static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, &fs_info->fs_devices->devices, dev_list) {
                /* Devices without a bdev and regular devices are fine. */
                if (!dev->bdev || !bdev_is_zoned(dev->bdev))
                        continue;

                btrfs_err(fs_info,
                        "zoned: mode not enabled but zoned device found: %pg",
                        dev->bdev);
                return -EINVAL;
        }

        return 0;
}

int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
        struct queue_limits *lim = &fs_info->limits;
        struct btrfs_device *device;
        u64 zone_size = 0;
        int ret;

        /*
         * Host-Managed devices can't be used without the ZONED flag.  With the
         * ZONED all devices can be used, using zone emulation if required.
         */
        if (!btrfs_fs_incompat(fs_info, ZONED))
                return btrfs_check_for_zoned_device(fs_info);

        blk_set_stacking_limits(lim);

        list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
                struct btrfs_zoned_device_info *zone_info = device->zone_info;

                if (!device->bdev)
                        continue;

                if (!zone_size) {
                        zone_size = zone_info->zone_size;
                } else if (zone_info->zone_size != zone_size) {
                        btrfs_err(fs_info,
                "zoned: unequal block device zone sizes: have %llu found %llu",
                                  zone_info->zone_size, zone_size);
                        return -EINVAL;
                }

                /*
                 * With the zoned emulation, we can have non-zoned device on the
                 * zoned mode. In this case, we don't have a valid max zone
                 * append size.
                 */
                if (bdev_is_zoned(device->bdev)) {
                        blk_stack_limits(lim,
                                         &bdev_get_queue(device->bdev)->limits,
                                         0);
                }
        }

        /*
         * stripe_size is always aligned to BTRFS_STRIPE_LEN in
         * btrfs_create_chunk(). Since we want stripe_len == zone_size,
         * check the alignment here.
         */
        if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
                btrfs_err(fs_info,
                          "zoned: zone size %llu not aligned to stripe %u",
                          zone_size, BTRFS_STRIPE_LEN);
                return -EINVAL;
        }

        if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
                btrfs_err(fs_info, "zoned: mixed block groups not supported");
                return -EINVAL;
        }

        fs_info->zone_size = zone_size;
        /*
         * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
         * Technically, we can have multiple pages per segment. But, since
         * we add the pages one by one to a bio, and cannot increase the
         * metadata reservation even if it increases the number of extents, it
         * is safe to stick with the limit.
         */
        fs_info->max_zone_append_size = ALIGN_DOWN(
                min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
                     (u64)lim->max_sectors << SECTOR_SHIFT,
                     (u64)lim->max_segments << PAGE_SHIFT),
                fs_info->sectorsize);
        fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
        if (fs_info->max_zone_append_size < fs_info->max_extent_size)
                fs_info->max_extent_size = fs_info->max_zone_append_size;

        /*
         * Check mount options here, because we might change fs_info->zoned
         * from fs_info->zone_size.
         */
        ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);
        if (ret)
                return ret;

        btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
        return 0;
}

/*
 * Check @mount_opt against the restrictions of zoned mode.
 *
 * Space cache v1 and NODATACOW are rejected with -EINVAL. Async discard is
 * not an error: it is reported and cleared from *@mount_opt. Returns 0
 * otherwise, including when the filesystem is not zoned.
 */
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt)
{
        if (!btrfs_is_zoned(info))
                return 0;

        /*
         * Space cache writes are not COWed, which would cause write errors in
         * sequential zones.
         */
        if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
                btrfs_err(info, "zoned: space cache v1 is not supported");
                return -EINVAL;
        }

        if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
                btrfs_err(info, "zoned: NODATACOW not supported");
                return -EINVAL;
        }

        if (!btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC))
                return 0;

        btrfs_info(info,
                   "zoned: async discard ignored and disabled for zoned mode");
        btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);

        return 0;
}

/*
 * Compute the byte offset at which the superblock is to be read or written
 * within a pair of superblock log zones.
 *
 * @bdev:       block device holding the zones
 * @zones:      the log zones; zones[0] and zones[1] are examined
 * @rw:         READ or WRITE
 * @bytenr_ret: receives the resulting byte offset
 *
 * If zones[0] is conventional the start of zones[0] is used unconditionally.
 * Otherwise the position is derived from sb_write_pointer(): for WRITE it is
 * the write pointer itself (resetting the target zone first when the pointer
 * sits at the head of a non-empty zone); for READ it is the superblock just
 * before the write pointer.
 *
 * Returns 0 on success, or a negative errno from sb_write_pointer() (other
 * than -ENOENT, which is tolerated) or from blkdev_zone_mgmt().
 */
static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
                           int rw, u64 *bytenr_ret)
{
        u64 wp;
        int ret;

        if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
                *bytenr_ret = zones[0].start << SECTOR_SHIFT;
                return 0;
        }

        /*
         * NOTE(review): the code below uses wp even when this returns
         * -ENOENT, so it relies on sb_write_pointer() storing a value in that
         * case too - confirm against its definition.
         */
        ret = sb_write_pointer(bdev, zones, &wp);
        if (ret != -ENOENT && ret < 0)
                return ret;

        if (rw == WRITE) {
                struct blk_zone *reset = NULL;

                /* A write pointer at the head of a zone targets that zone. */
                if (wp == zones[0].start << SECTOR_SHIFT)
                        reset = &zones[0];
                else if (wp == zones[1].start << SECTOR_SHIFT)
                        reset = &zones[1];

                /* The target zone still holds old data: reset it before use. */
                if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
                        unsigned int nofs_flags;

                        ASSERT(sb_zone_is_full(reset));

                        nofs_flags = memalloc_nofs_save();
                        ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
                                               reset->start, reset->len);
                        memalloc_nofs_restore(nofs_flags);
                        if (ret)
                                return ret;

                        /* Keep the in-memory zone state in sync with the reset. */
                        reset->cond = BLK_ZONE_COND_EMPTY;
                        reset->wp = reset->start;
                }
        } else if (ret != -ENOENT) {
                /*
                 * For READ, we want the previous one. Move write pointer to
                 * the end of a zone, if it is at the head of a zone.
                 */
                u64 zone_end = 0;

                /* At the head of one zone, the previous entry is in the other. */
                if (wp == zones[0].start << SECTOR_SHIFT)
                        zone_end = zones[1].start + zones[1].capacity;
                else if (wp == zones[1].start << SECTOR_SHIFT)
                        zone_end = zones[0].start + zones[0].capacity;
                if (zone_end)
                        wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
                                        BTRFS_SUPER_INFO_SIZE);

                /* Step back one superblock to the most recently written copy. */
                wp -= BTRFS_SUPER_INFO_SIZE;
        }

        *bytenr_ret = wp;
        return 0;

}

/*
 * Find the superblock location of @mirror directly from a block device,
 * without needing any btrfs_device state.
 *
 * A non-zoned device simply yields btrfs_sb_offset(@mirror). For a zoned
 * device the two log zones are read from the device and handed to
 * sb_log_location().
 *
 * Returns 0 with *@bytenr_ret set, -EINVAL for a non power-of-two zone size,
 * -ENOENT if the device is too small for this mirror, -EIO on a short zone
 * report, or a negative errno from the zone report / sb_log_location().
 */
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
                               u64 *bytenr_ret)
{
        struct blk_zone sb_zones[BTRFS_NR_SB_LOG_ZONES];
        sector_t zone_sectors;
        u32 first_sb_zone;
        u32 total_zones;
        u8 sectors_shift;
        int nr_reported;

        if (!bdev_is_zoned(bdev)) {
                *bytenr_ret = btrfs_sb_offset(mirror);
                return 0;
        }

        ASSERT(rw == READ || rw == WRITE);

        zone_sectors = bdev_zone_sectors(bdev);
        if (!is_power_of_2(zone_sectors))
                return -EINVAL;

        sectors_shift = ilog2(zone_sectors);
        total_zones = bdev_nr_sectors(bdev) >> sectors_shift;

        /* Both log zones of the mirror have to exist on the device. */
        first_sb_zone = sb_zone_number(sectors_shift + SECTOR_SHIFT, mirror);
        if (first_sb_zone + 1 >= total_zones)
                return -ENOENT;

        nr_reported = blkdev_report_zones(bdev,
                                          zone_start_sector(first_sb_zone, bdev),
                                          BTRFS_NR_SB_LOG_ZONES,
                                          copy_zone_info_cb, sb_zones);
        if (nr_reported < 0)
                return nr_reported;
        if (nr_reported != BTRFS_NR_SB_LOG_ZONES)
                return -EIO;

        return sb_log_location(bdev, sb_zones, rw, bytenr_ret);
}

/*
 * Find the superblock location of @mirror on @device, using the superblock
 * log zones stored in device->zone_info.
 *
 * Returns 0 with *@bytenr_ret set, -ENOENT if the device is too small for
 * this mirror, or the error from sb_log_location().
 */
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
                          u64 *bytenr_ret)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        struct blk_zone *sb_zones;
        u32 first_sb_zone;

        /*
         * A zoned filesystem on a non-zoned block device keeps its super
         * blocks at the regular locations. That way the super block can
         * always be found and the zoned flag of the volume read from it.
         */
        if (!bdev_is_zoned(device->bdev)) {
                *bytenr_ret = btrfs_sb_offset(mirror);
                return 0;
        }

        first_sb_zone = sb_zone_number(zinfo->zone_size_shift, mirror);
        if (first_sb_zone + 1 >= zinfo->nr_zones)
                return -ENOENT;

        sb_zones = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];

        return sb_log_location(device->bdev, sb_zones, rw, bytenr_ret);
}

/*
 * Tell whether superblock mirror @mirror is kept in log zones on the device
 * described by @zinfo: zone info must exist, both log zones must fit on the
 * device, and the first one must be marked sequential.
 */
static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
                                  int mirror)
{
        u32 first_sb_zone;

        if (!zinfo)
                return false;

        first_sb_zone = sb_zone_number(zinfo->zone_size_shift, mirror);

        return first_sb_zone + 1 < zinfo->nr_zones &&
               test_bit(first_sb_zone, zinfo->seq_zones);
}

/*
 * Advance the in-memory write pointer of the superblock log of @mirror on
 * @device by one superblock (SUPER_INFO_SECTORS).
 *
 * The first log zone that is not FULL is advanced. When that zone becomes
 * full as a result, it is finished on the device (unless the write pointer
 * already sits exactly at the zone capacity) and marked FULL in memory.
 *
 * Returns 0 on success or when @mirror is not kept in log zones on this
 * device, the error from blkdev_zone_mgmt() if finishing the zone fails, or
 * -EIO if every log zone is already FULL.
 */
int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        struct blk_zone *zone;
        int i;

        /* Nothing to track unless this mirror lives in sequential log zones. */
        if (!is_sb_log_zone(zinfo, mirror))
                return 0;

        zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
        for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
                /* Advance the next zone */
                if (zone->cond == BLK_ZONE_COND_FULL) {
                        zone++;
                        continue;
                }

                /* The first write into an empty zone opens it implicitly. */
                if (zone->cond == BLK_ZONE_COND_EMPTY)
                        zone->cond = BLK_ZONE_COND_IMP_OPEN;

                zone->wp += SUPER_INFO_SECTORS;

                if (sb_zone_is_full(zone)) {
                        /*
                         * No room left to write new superblock. Since
                         * superblock is written with REQ_SYNC, it is safe to
                         * finish the zone now.
                         *
                         * If the write pointer is exactly at the capacity,
                         * explicit ZONE_FINISH is not necessary.
                         */
                        if (zone->wp != zone->start + zone->capacity) {
                                unsigned int nofs_flags;
                                int ret;

                                nofs_flags = memalloc_nofs_save();
                                ret = blkdev_zone_mgmt(device->bdev,
                                                REQ_OP_ZONE_FINISH, zone->start,
                                                zone->len);
                                memalloc_nofs_restore(nofs_flags);
                                if (ret)
                                        return ret;
                        }

                        /* Record the finished state: wp at the end of the zone. */
                        zone->wp = zone->start + zone->len;
                        zone->cond = BLK_ZONE_COND_FULL;
                }
                return 0;
        }

        /* All the zones are FULL. Should not reach here. */
        ASSERT(0);
        return -EIO;
}

int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
        unsigned int nofs_flags;
        sector_t zone_sectors;
        sector_t nr_sectors;
        u8 zone_sectors_shift;
        u32 sb_zone;
        u32 nr_zones;
        int ret;

        zone_sectors = bdev_zone_sectors(bdev);
        zone_sectors_shift = ilog2(zone_sectors);
        nr_sectors = bdev_nr_sectors(bdev);
        nr_zones = nr_sectors >> zone_sectors_shift;

        sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
        if (sb_zone + 1 >= nr_zones)
                return -ENOENT;

        nofs_flags = memalloc_nofs_save();
        ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
                               zone_start_sector(sb_zone, bdev),
                               zone_sectors * BTRFS_NR_SB_LOG_ZONES);
        memalloc_nofs_restore(nofs_flags);
        return ret;
}

/*
 * Find allocatable zones within a given region.
 *
 * @device:        the device to allocate a region on
 * @hole_start: the position of the hole to allocate the region
 * @hole_end:        the end of the hole
 * @num_bytes:        size of wanted region
 * @return:        position of allocatable zones, or @hole_end when the wanted
 *                region would extend past the last zone of the device
 *
 * Allocatable region should not contain any superblock locations.
 *
 * @hole_start and @num_bytes must be aligned to the zone size. The search
 * loop stops once the candidate position reaches @hole_end, so the returned
 * position can be at or beyond @hole_end when nothing suitable was found.
 */
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
                                 u64 hole_end, u64 num_bytes)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        const u8 shift = zinfo->zone_size_shift;
        u64 nzones = num_bytes >> shift;
        u64 pos = hole_start;
        u64 begin, end;
        bool have_sb;
        int i;

        ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
        ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

        while (pos < hole_end) {
                /* Candidate region expressed as the zone range [begin, end). */
                begin = pos >> shift;
                end = begin + nzones;

                if (end > zinfo->nr_zones)
                        return hole_end;

                /* Check if zones in the region are all empty */
                if (btrfs_dev_is_sequential(device, pos) &&
                    !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
                        /* Not all empty: retry one zone further on. */
                        pos += zinfo->zone_size;
                        continue;
                }

                have_sb = false;
                for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                        u32 sb_zone;
                        u64 sb_pos;

                        /*
                         * The region overlaps the log zones of mirror i: move
                         * the candidate to just after those zones.
                         */
                        sb_zone = sb_zone_number(shift, i);
                        if (!(end <= sb_zone ||
                              sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
                                have_sb = true;
                                pos = zone_start_physical(
                                        sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
                                break;
                        }

                        /* We also need to exclude regular superblock positions */
                        sb_pos = btrfs_sb_offset(i);
                        if (!(pos + num_bytes <= sb_pos ||
                              sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
                                have_sb = true;
                                /* Resume at the next zone boundary after it. */
                                pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
                                            zinfo->zone_size);
                                break;
                        }
                }
                /* No superblock location in the way: this position is usable. */
                if (!have_sb)
                        break;
        }

        return pos;
}

/*
 * Mark the zone containing byte offset @pos of @device as active, consuming
 * one slot of the active-zone budget if the zone was not active yet.
 *
 * Returns false only when the zone is not active and no budget is left.
 */
static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        unsigned int zone_idx = (pos >> zinfo->zone_size_shift);

        /* No limit on active zones: nothing to account. */
        if (zinfo->max_active_zones == 0)
                return true;

        /* Already active, so no extra slot is needed. */
        if (test_bit(zone_idx, zinfo->active_zones))
                return true;

        /* Take a slot, failing if none is left. */
        if (atomic_dec_if_positive(&zinfo->active_zones_left) < 0)
                return false;

        /* Someone else set the bit in the meantime: give the slot back. */
        if (test_and_set_bit(zone_idx, zinfo->active_zones))
                atomic_inc(&zinfo->active_zones_left);

        return true;
}

/*
 * Drop the active accounting of the zone containing @pos on @device and give
 * its slot back to the count of available active zones.
 */
static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        const unsigned int zone_nr = pos >> zinfo->zone_size_shift;

        /* Nothing is tracked when any number of zones may be active. */
        if (!zinfo->max_active_zones)
                return;

        /* Only return a slot if this zone was actually holding one. */
        if (!test_and_clear_bit(zone_nr, zinfo->active_zones))
                return;

        atomic_inc(&zinfo->active_zones_left);
}

/*
 * Reset the zones covering [@physical, @physical + @length) on @device.
 *
 * On success *@bytes is set to @length and every zone in the range is marked
 * empty and no longer active. On failure *@bytes is 0 and the error from
 * blkdev_zone_mgmt() is returned.
 */
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
                            u64 length, u64 *bytes)
{
        unsigned int nofs_flags;
        u64 zone_pos;
        u64 remaining;
        int ret;

        *bytes = 0;

        nofs_flags = memalloc_nofs_save();
        ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
                               physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
        memalloc_nofs_restore(nofs_flags);
        if (ret != 0)
                return ret;

        *bytes = length;

        /* Update the in-memory state of every zone that was just reset. */
        for (zone_pos = physical, remaining = length; remaining != 0;
             zone_pos += device->zone_info->zone_size,
             remaining -= device->zone_info->zone_size) {
                btrfs_dev_set_zone_empty(device, zone_pos);
                btrfs_dev_clear_active_zone(device, zone_pos);
        }

        return 0;
}

/*
 * Make sure all sequential zones in [@start, @start + @size) on @device are
 * empty, resetting those that are not.
 *
 * Return 0 on success, -ERANGE if the range goes past the last zone of the
 * device, or the error returned by btrfs_reset_device_zone().
 */
int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        const u8 shift = zinfo->zone_size_shift;
        const unsigned long first_zone = start >> shift;
        const unsigned long zone_count = size >> shift;
        const u64 end = start + size;
        u64 pos;

        ASSERT(IS_ALIGNED(start, zinfo->zone_size));
        ASSERT(IS_ALIGNED(size, zinfo->zone_size));

        if (first_zone + zone_count > zinfo->nr_zones)
                return -ERANGE;

        /* Nothing to do when the range only has conventional zones. */
        if (bitmap_test_range_all_zero(zinfo->seq_zones, first_zone, zone_count))
                return 0;

        /* Nothing to do when every zone is sequential and already empty. */
        if (bitmap_test_range_all_set(zinfo->seq_zones, first_zone, zone_count) &&
            bitmap_test_range_all_set(zinfo->empty_zones, first_zone, zone_count))
                return 0;

        for (pos = start; pos < end; pos += zinfo->zone_size) {
                u64 reset_bytes;
                int ret;

                /* Only non-empty sequential zones need a reset. */
                if (!btrfs_dev_is_sequential(device, pos))
                        continue;
                if (btrfs_dev_is_empty_zone(device, pos))
                        continue;

                /* A free region is expected to be empty, so complain. */
                btrfs_warn_in_rcu(
                        device->fs_info,
                "zoned: resetting device %s (devid %llu) zone %llu for allocation",
                        rcu_str_deref(device->name), device->devid, pos >> shift);
                WARN_ON_ONCE(1);

                ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
                                              &reset_bytes);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * Derive the allocation offset of a block group made of conventional zones
 * from the extent tree. The offset points to the end of the highest addressed
 * extent item inside the block group, relative to the block group start.
 *
 * Return 0 and set *@offset_ret on success, a negative errno otherwise.
 */
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
                                   u64 *offset_ret, bool new)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        const u64 bg_end = cache->start + cache->length;
        struct btrfs_root *root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_key found_key;
        u64 extent_len;
        int ret;

        /*
         * A new block group needs no tree lookup, its offset must always
         * be 0.
         *
         * Skipping the lookup also matters for locking: there is a lock chain
         * of extent buffer lock -> chunk mutex. For a new block group this
         * function is called from btrfs_make_block_group(), which already
         * holds the chunk mutex, so taking extent buffer locks here could
         * deadlock.
         */
        if (new) {
                *offset_ret = 0;
                return 0;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Start the search right past the end of the block group. */
        key.objectid = bg_end;
        key.type = 0;
        key.offset = 0;

        root = btrfs_extent_root(fs_info, key.objectid);
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret == 0) {
                /* An exact match for this key is not expected to exist. */
                ret = -EUCLEAN;
                goto out;
        }

        ret = btrfs_previous_extent_item(root, path, cache->start);
        if (ret == 1) {
                /* Treated as "nothing allocated in this block group yet". */
                *offset_ret = 0;
                ret = 0;
                goto out;
        }
        if (ret)
                goto out;

        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

        /* Items other than EXTENT_ITEM are accounted as one node. */
        extent_len = (found_key.type == BTRFS_EXTENT_ITEM_KEY) ?
                     found_key.offset : fs_info->nodesize;

        /* The extent found must lie completely inside the block group. */
        if (found_key.objectid < cache->start ||
            found_key.objectid + extent_len > bg_end) {
                ret = -EUCLEAN;
                goto out;
        }

        *offset_ret = found_key.objectid + extent_len - cache->start;
        ret = 0;

out:
        btrfs_free_path(path);
        return ret;
}

/* Per-stripe zone state, filled in by btrfs_load_zone_info(). */
struct zone_info {
        /* Physical start of the stripe, copied from the chunk map stripe. */
        u64 physical;
        /* Zone capacity, computed as zone.capacity << SECTOR_SHIFT. */
        u64 capacity;
        /*
         * Allocation offset inside the zone, or one of the WP_MISSING_DEV /
         * WP_CONVENTIONAL markers when no write pointer is available.
         */
        u64 alloc_offset;
};

/*
 * Load the state of the zone backing stripe @zone_idx of chunk @map.
 *
 * @info->physical is always set. @info->alloc_offset is set to:
 *   - WP_MISSING_DEV if the device has no bdev, if the zone report failed
 *     with -EIO or -EOPNOTSUPP, or if the zone is offline/read-only;
 *   - WP_CONVENTIONAL if the stripe is not on a sequential zone;
 *   - otherwise the allocation offset derived from the zone condition and
 *     its write pointer.
 * @info->capacity is only set when the zone report succeeded.
 *
 * Bit @zone_idx of @active is set when the device does not limit the number
 * of active zones, or when the zone is partially written.
 *
 * Return 0 on success, -EIO if the device reports a conventional zone where a
 * sequential one was expected, or any other error from btrfs_get_dev_zone().
 */
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
                                struct zone_info *info, unsigned long *active,
                                struct btrfs_chunk_map *map)
{
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        struct btrfs_device *device;
        int dev_replace_is_ongoing = 0;
        unsigned int nofs_flag;
        struct blk_zone zone;
        int ret;

        info->physical = map->stripes[zone_idx].physical;

        /* dev_replace->rwsem is held until we are done with @device. */
        down_read(&dev_replace->rwsem);
        device = map->stripes[zone_idx].dev;

        if (!device->bdev) {
                up_read(&dev_replace->rwsem);
                info->alloc_offset = WP_MISSING_DEV;
                return 0;
        }

        /* Consider a zone as active if we can allow any number of active zones. */
        if (!device->zone_info->max_active_zones)
                __set_bit(zone_idx, active);

        if (!btrfs_dev_is_sequential(device, info->physical)) {
                up_read(&dev_replace->rwsem);
                info->alloc_offset = WP_CONVENTIONAL;
                return 0;
        }

        /* This zone will be used for allocation, so mark this zone non-empty. */
        btrfs_dev_clear_zone_empty(device, info->physical);

        /* Keep the dev-replace target, if any, in sync with the source. */
        dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
                btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);

        /*
         * The group is mapped to a sequential zone. Get the zone write pointer
         * to determine the allocation offset within the zone.
         */
        WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
        nofs_flag = memalloc_nofs_save();
        ret = btrfs_get_dev_zone(device, info->physical, &zone);
        memalloc_nofs_restore(nofs_flag);
        if (ret) {
                up_read(&dev_replace->rwsem);
                /* -EIO and -EOPNOTSUPP are handled like a missing device. */
                if (ret != -EIO && ret != -EOPNOTSUPP)
                        return ret;
                info->alloc_offset = WP_MISSING_DEV;
                return 0;
        }

        if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
                btrfs_err_in_rcu(fs_info,
                "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
                        zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
                        device->devid);
                up_read(&dev_replace->rwsem);
                return -EIO;
        }

        info->capacity = (zone.capacity << SECTOR_SHIFT);

        switch (zone.cond) {
        case BLK_ZONE_COND_OFFLINE:
        case BLK_ZONE_COND_READONLY:
                /*
                 * The device name is read with rcu_str_deref(), so print it
                 * with the _in_rcu helper like the message above does.
                 */
                btrfs_err_in_rcu(fs_info,
                "zoned: offline/readonly zone %llu on device %s (devid %llu)",
                          (info->physical >> device->zone_info->zone_size_shift),
                          rcu_str_deref(device->name), device->devid);
                info->alloc_offset = WP_MISSING_DEV;
                break;
        case BLK_ZONE_COND_EMPTY:
                info->alloc_offset = 0;
                break;
        case BLK_ZONE_COND_FULL:
                info->alloc_offset = info->capacity;
                break;
        default:
                /* Partially used zone. */
                info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
                __set_bit(zone_idx, active);
                break;
        }

        up_read(&dev_replace->rwsem);

        return 0;
}

/*
 * Set up the allocation offset, zone capacity and active flag of a SINGLE
 * profile block group from the state of its only zone.
 *
 * Return 0 on success, -EIO if the write pointer of the zone is unknown.
 */
static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
                                         struct zone_info *info,
                                         unsigned long *active)
{
        if (info->alloc_offset != WP_MISSING_DEV) {
                bg->alloc_offset = info->alloc_offset;
                bg->zone_capacity = info->capacity;
                if (test_bit(0, active))
                        set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
                                &bg->runtime_flags);
                return 0;
        }

        btrfs_err(bg->fs_info,
                "zoned: cannot recover write pointer for zone %llu",
                info->physical);
        return -EIO;
}

static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
                                      struct btrfs_chunk_map *map,
                                      struct zone_info *zone_info,
                                      unsigned long *active)
{
        struct btrfs_fs_info *fs_info = bg->fs_info;

        if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
                btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
                return -EINVAL;
        }

        if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
                btrfs_err(bg->fs_info,
                          "zoned: cannot recover write pointer for zone %llu",
                          zone_info[0].physical);
                return -EIO;
        }
        if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
                btrfs_err(bg->fs_info,
                          "zoned: cannot recover write pointer for zone %llu",
                          zone_info[1].physical);
                return -EIO;
        }
        if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
                btrfs_err(bg->fs_info,
                          "zoned: write pointer offset mismatch of zones in DUP profile");
                return -EIO;
        }

        if (test_bit(0, active) != test_bit(1, active)) {
                if (!btrfs_zone_activate(bg))
                        return -EIO;
        } else if (test_bit(0, active)) {
                set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
        }

        bg->alloc_offset = zone_info[0].alloc_offset;
        bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity);
        return 0;
}

static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
                                        struct btrfs_chunk_map *map,
                                        struct zone_info *zone_info,
                                        unsigned long *active)
{
        struct btrfs_fs_info *fs_info = bg->fs_info;
        int i;

        if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
                btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
                          btrfs_bg_type_to_raid_name(map->type));
                return -EINVAL;
        }

        for (i = 0; i < map->num_stripes; i++) {
                if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
                    zone_info[i].alloc_offset == WP_CONVENTIONAL)
                        continue;

                if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
                    !btrfs_test_opt(fs_info, DEGRADED)) {
                        btrfs_err(fs_info,
                        "zoned: write pointer offset mismatch of zones in %s profile",
                                  btrfs_bg_type_to_raid_name(map->type));
                        return -EIO;
                }
                if (test_bit(0, active) != test_bit(i, active)) {
                        if (!btrfs_test_opt(fs_info, DEGRADED) &&
                            !btrfs_zone_activate(bg)) {
                                return -EIO;
                        }
                } else {
                        if (test_bit(0, active))
                                set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
                }
                /* In case a device is missing we have a cap of 0, so don't use it. */
                bg->zone_capacity = min_not_zero(zone_info[0].capacity,
                                                 zone_info[1].capacity);
        }

        if (zone_info[0].alloc_offset != WP_MISSING_DEV)
                bg->alloc_offset = zone_info[0].alloc_offset;
        else
                bg->alloc_offset = zone_info[i - 1].alloc_offset;

        return 0;
}

static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
                                        struct btrfs_chunk_map *map,
                                        struct zone_info *zone_info,
                                        unsigned long *active)
{
        struct btrfs_fs_info *fs_info = bg->fs_info;

        if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
                btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
                          btrfs_bg_type_to_raid_name(map->type));
                return -EINVAL;
        }

        for (int i = 0; i < map->num_stripes; i++) {
                if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
                    zone_info[i].alloc_offset == WP_CONVENTIONAL)
                        continue;

                if (test_bit(0, active) != test_bit(i, active)) {
                        if (!btrfs_zone_activate(bg))
                                return -EIO;
                } else {
                        if (test_bit(0, active))
                                set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
                }
                bg->zone_capacity += zone_info[i].capacity;
                bg->alloc_offset += zone_info[i].alloc_offset;
        }

        return 0;
}

static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
                                         struct btrfs_chunk_map *map,
                                         struct zone_info *zone_info,
                                         unsigned long *active)
{
        struct btrfs_fs_info *fs_info = bg->fs_info;

        if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
                btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
                          btrfs_bg_type_to_raid_name(map->type));
                return -EINVAL;
        }

        for (int i = 0; i < map->num_stripes; i++) {
                if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
                    zone_info[i].alloc_offset == WP_CONVENTIONAL)
                        continue;

                if (test_bit(0, active) != test_bit(i, active)) {
                        if (!btrfs_zone_activate(bg))
                                return -EIO;
                } else {
                        if (test_bit(0, active))
                                set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
                }

                if ((i % map->sub_stripes) == 0) {
                        bg->zone_capacity += zone_info[i].capacity;
                        bg->alloc_offset += zone_info[i].alloc_offset;
                }
        }

        return 0;
}

/*
 * Load the zone information of block group @cache and derive its allocation
 * offset, zone capacity and active state from the zones backing its stripes.
 *
 * @cache: block group to set up
 * @new:   passed on to calculate_alloc_pointer(), which skips the extent tree
 *         lookup when it is true
 *
 * Return 0 on success (and right away on a non-zoned filesystem), a negative
 * errno otherwise. Once the chunk map has been looked up, every failure goes
 * through the common cleanup below, which drops the chunk map reference
 * stored in cache->physical_map and frees the temporary allocations.
 */
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_chunk_map *map;
        u64 logical = cache->start;
        u64 length = cache->length;
        struct zone_info *zone_info = NULL;
        int ret;
        int i;
        unsigned long *active = NULL;
        u64 last_alloc = 0;
        u32 num_sequential = 0, num_conventional = 0;

        if (!btrfs_is_zoned(fs_info))
                return 0;

        /* Sanity check */
        if (!IS_ALIGNED(length, fs_info->zone_size)) {
                btrfs_err(fs_info,
                "zoned: block group %llu len %llu unaligned to zone size %llu",
                          logical, length, fs_info->zone_size);
                return -EIO;
        }

        map = btrfs_find_chunk_map(fs_info, logical, length);
        if (!map)
                return -EINVAL;

        cache->physical_map = map;

        /* One zone_info entry and one "active" bit per stripe. */
        zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
        if (!zone_info) {
                ret = -ENOMEM;
                goto out;
        }

        active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
        if (!active) {
                ret = -ENOMEM;
                goto out;
        }

        for (i = 0; i < map->num_stripes; i++) {
                ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
                if (ret)
                        goto out;

                if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
                        num_conventional++;
                else
                        num_sequential++;
        }

        if (num_sequential > 0)
                set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);

        if (num_conventional > 0) {
                /* Zone capacity is always zone size in emulation */
                cache->zone_capacity = cache->length;
                ret = calculate_alloc_pointer(cache, &last_alloc, new);
                if (ret) {
                        btrfs_err(fs_info,
                        "zoned: failed to determine allocation offset of bg %llu",
                                  cache->start);
                        goto out;
                } else if (map->num_stripes == num_conventional) {
                        /* Only conventional zones: no write pointer to load. */
                        cache->alloc_offset = last_alloc;
                        set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
                        goto out;
                }
        }

        switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
        case 0: /* single */
                ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
                break;
        case BTRFS_BLOCK_GROUP_DUP:
                ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
                break;
        case BTRFS_BLOCK_GROUP_RAID1:
        case BTRFS_BLOCK_GROUP_RAID1C3:
        case BTRFS_BLOCK_GROUP_RAID1C4:
                ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
                break;
        case BTRFS_BLOCK_GROUP_RAID0:
                ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
                break;
        case BTRFS_BLOCK_GROUP_RAID10:
                ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
                break;
        case BTRFS_BLOCK_GROUP_RAID5:
        case BTRFS_BLOCK_GROUP_RAID6:
        default:
                btrfs_err(fs_info, "zoned: profile %s not yet supported",
                          btrfs_bg_type_to_raid_name(map->type));
                ret = -EINVAL;
                goto out;
        }

out:
        /*
         * Reject non SINGLE data profiles without RST.
         *
         * Do not return from here: set the error and fall through so that
         * the chunk map reference, @active and @zone_info are released by the
         * common cleanup below. The remaining checks are skipped so that the
         * caller still sees -EINVAL.
         */
        if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
            (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
            !fs_info->stripe_root) {
                btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
                          btrfs_bg_type_to_raid_name(map->type));
                ret = -EINVAL;
        } else if (cache->alloc_offset > cache->zone_capacity) {
                btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
                          cache->alloc_offset, cache->zone_capacity,
                          cache->start);
                ret = -EIO;
        }

        /* An extent is allocated after the write pointer */
        if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
                btrfs_err(fs_info,
                          "zoned: got wrong write pointer in BG %llu: %llu > %llu",
                          logical, last_alloc, cache->alloc_offset);
                ret = -EIO;
        }

        if (!ret) {
                cache->meta_write_pointer = cache->alloc_offset + cache->start;
                if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
                        /* The active list holds its own block group reference. */
                        btrfs_get_block_group(cache);
                        spin_lock(&fs_info->zone_active_bgs_lock);
                        list_add_tail(&cache->active_bg_list,
                                      &fs_info->zone_active_bgs);
                        spin_unlock(&fs_info->zone_active_bgs_lock);
                }
        } else {
                btrfs_free_chunk_map(cache->physical_map);
                cache->physical_map = NULL;
        }
        bitmap_free(active);
        kfree(zone_info);

        return ret;
}

/*
 * Compute the free and unusable byte counts of zoned block group @cache from
 * its allocation offset, used bytes and zone capacity, and mark the block
 * group as fully cached. Does nothing on a non-zoned filesystem.
 */
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
{
        u64 allocated_not_used;
        u64 beyond_capacity;

        if (!btrfs_is_zoned(cache->fs_info))
                return;

        WARN_ON(cache->bytes_super != 0);

        /* Space before the allocation offset that holds no used data. */
        allocated_not_used = cache->alloc_offset - cache->used;
        /* Space between the zone capacity and the block group length. */
        beyond_capacity = cache->length - cache->zone_capacity;

        cache->cached = BTRFS_CACHE_FINISHED;
        /* We only need ->free_space in ALLOC_SEQ block groups */
        cache->free_space_ctl->free_space = cache->zone_capacity -
                                            cache->alloc_offset;
        cache->zone_unusable = allocated_not_used + beyond_capacity;
}

/*
 * Decide whether @bbio should be submitted as a zone append write.
 *
 * Return true only for data inode writes on a zoned filesystem, outside of
 * the data relocation root, that target a block group flagged as sequential.
 */
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
        const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
        struct btrfs_fs_info *fs_info = bbio->fs_info;
        struct btrfs_inode *inode = bbio->inode;
        struct btrfs_block_group *bg;
        bool sequential;

        if (!btrfs_is_zoned(fs_info))
                return false;

        if (!inode || !is_data_inode(&inode->vfs_inode))
                return false;

        if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
                return false;

        /*
         * Using REQ_OP_ZONE_APPEND for relocation can break assumptions the
         * relocation code has about the extent layout.
         * Furthermore a dedicated block group is set aside from which only
         * the relocation "process" can allocate, and only one process at a
         * time can add pages to an extent that gets relocated, so it is safe
         * to use a regular REQ_OP_WRITE for this special case.
         */
        if (btrfs_is_data_reloc_root(inode->root))
                return false;

        bg = btrfs_lookup_block_group(fs_info, logical);
        ASSERT(bg);
        if (!bg)
                return false;

        sequential = test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
                              &bg->runtime_flags);
        btrfs_put_block_group(bg);

        return sequential;
}

/*
 * Shift the logical address recorded in @bbio->sums by the distance between
 * the physical position currently held in the bio and @bbio->orig_physical.
 */
void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
{
        struct btrfs_ordered_sum *sum = bbio->sums;
        const u64 cur_physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
        const u64 orig_physical = bbio->orig_physical;

        if (cur_physical >= orig_physical)
                sum->logical += cur_physical - orig_physical;
        else
                sum->logical -= orig_physical - cur_physical;
}

/*
 * Point @ordered and the extent map covering its file range to the new
 * logical address @logical.
 */
static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
                                        u64 logical)
{
        struct btrfs_inode *inode = BTRFS_I(ordered->inode);
        struct extent_map_tree *tree = &inode->extent_tree;
        struct extent_map *map;

        ordered->disk_bytenr = logical;

        /* The extent map is looked up and updated under the tree lock. */
        write_lock(&tree->lock);
        map = search_extent_mapping(tree, ordered->file_offset,
                                    ordered->num_bytes);
        map->block_start = logical;
        free_extent_map(map);
        write_unlock(&tree->lock);
}

/*
 * Split the first @len bytes off @ordered, place them at @logical and finish
 * the split-off ordered extent right away.
 *
 * Return true on success, false if splitting the extent map or the ordered
 * extent failed.
 */
static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
                                      u64 logical, u64 len)
{
        struct btrfs_ordered_extent *head;

        /* The extent map is only split for non-NOCOW ordered extents. */
        if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
                if (split_extent_map(BTRFS_I(ordered->inode),
                                     ordered->file_offset, ordered->num_bytes,
                                     len, logical))
                        return false;
        }

        head = btrfs_split_ordered_extent(ordered, len);
        if (IS_ERR(head))
                return false;

        head->disk_bytenr = logical;
        btrfs_finish_one_ordered(head);
        return true;
}

/*
 * Fix up @ordered after its writes completed on a zoned filesystem.
 *
 * Walk the recorded btrfs_ordered_sum entries, split the ordered extent
 * wherever the recorded logical addresses are not contiguous, and rewrite
 * the logical address of the remaining part if it differs from
 * ordered->disk_bytenr.
 */
void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
{
        struct btrfs_inode *inode = BTRFS_I(ordered->inode);
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_ordered_sum *cur;
        u64 run_start, run_len;

        /*
         * Writes to a pre-allocated region are for data relocation and use
         * a regular WRITE operation, so no split/rewrite is necessary.
         */
        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
                return;

        /* The ordered->list can only be empty in the pre-alloc case above. */
        ASSERT(!list_empty(&ordered->list));
        cur = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list);
        run_start = cur->logical;
        run_len = cur->len;

        while (run_len < ordered->disk_num_bytes) {
                cur = list_next_entry(cur, list);

                if (cur->logical != run_start + run_len) {
                        /* Discontiguous: split off the run collected so far. */
                        if (!btrfs_zoned_split_ordered(ordered, run_start,
                                                       run_len)) {
                                set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
                                btrfs_err(fs_info, "failed to split ordered extent");
                                goto out;
                        }
                        run_start = cur->logical;
                        run_len = cur->len;
                } else {
                        /* Contiguous with the current run: extend it. */
                        run_len += cur->len;
                }
        }

        if (ordered->disk_bytenr != run_start)
                btrfs_rewrite_logical_zoned(ordered, run_start);

out:
        /*
         * For nodatasum I/O the btrfs_ordered_sum structures were allocated
         * by btrfs_alloc_dummy_sum only to record the logical addresses and
         * hold no actual checksums. Free them here so that nobody attempts
         * to log the csums later.
         */
        if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
            !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
                return;

        while (!list_empty(&ordered->list)) {
                cur = list_first_entry(&ordered->list, typeof(*cur), list);
                list_del(&cur->list);
                kfree(cur);
        }
}

/*
 * Make sure the block group in @ctx->zoned_bg is active before metadata is
 * written to it.
 *
 * @ctx:       extent buffer write context; provides the block group and wbc
 * @active_bg: slot tracking the currently active block group of this kind
 *             (the caller passes fs_info->active_meta_bg or
 *             fs_info->active_system_bg)
 *
 * Return true if the block group is already active or could be activated,
 * false otherwise.
 */
static bool check_bg_is_active(struct btrfs_eb_write_context *ctx,
                               struct btrfs_block_group **active_bg)
{
        const struct writeback_control *wbc = ctx->wbc;
        struct btrfs_block_group *block_group = ctx->zoned_bg;
        struct btrfs_fs_info *fs_info = block_group->fs_info;

        if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
                return true;

        if (fs_info->treelog_bg == block_group->start) {
                /*
                 * Tree-log block group: if activation fails, finish one
                 * block group and retry once.
                 */
                if (!btrfs_zone_activate(block_group)) {
                        int ret_fin = btrfs_zone_finish_one_bg(fs_info);

                        if (ret_fin != 1 || !btrfs_zone_activate(block_group))
                                return false;
                }
        } else if (*active_bg != block_group) {
                struct btrfs_block_group *tgt = *active_bg;

                /* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */
                lockdep_assert_held(&fs_info->zoned_meta_io_lock);

                if (tgt) {
                        /*
                         * If there is an unsent IO left in the allocated area,
                         * we cannot wait for them as it may cause a deadlock.
                         */
                        if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) {
                                if (wbc->sync_mode == WB_SYNC_NONE ||
                                    (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync))
                                        return false;
                        }

                        /* Pivot active metadata/system block group. */
                        /*
                         * The lock is dropped while waiting for writeback and
                         * finishing the old block group, so *active_bg is
                         * re-checked once the lock is taken again.
                         */
                        btrfs_zoned_meta_io_unlock(fs_info);
                        wait_eb_writebacks(tgt);
                        do_zone_finish(tgt, true);
                        btrfs_zoned_meta_io_lock(fs_info);
                        if (*active_bg == tgt) {
                                /* Drop the reference the slot was holding. */
                                btrfs_put_block_group(tgt);
                                *active_bg = NULL;
                        }
                }
                if (!btrfs_zone_activate(block_group))
                        return false;
                if (*active_bg != block_group) {
                        ASSERT(*active_bg == NULL);
                        /* The slot takes its own reference on the new group. */
                        *active_bg = block_group;
                        btrfs_get_block_group(block_group);
                }
        }

        return true;
}

/*
 * Check whether @ctx->eb sits exactly at the metadata write pointer of its
 * block group. The block group is cached in @ctx->zoned_bg across calls.
 *
 * Return:
 *   0:        @ctx->eb is at the write pointer and can be written. Also
 *             returned on a non-zoned filesystem or when no block group
 *             covers the extent buffer.
 *   -EAGAIN:  There is a hole. The caller should handle the case.
 *   -EBUSY:   There is a hole, but the caller can just bail out.
 */
int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
                                   struct btrfs_eb_write_context *ctx)
{
        const struct writeback_control *wbc = ctx->wbc;
        const struct extent_buffer *eb = ctx->eb;
        struct btrfs_block_group *bg = ctx->zoned_bg;

        if (!btrfs_is_zoned(fs_info))
                return 0;

        /* Drop the cached block group if it does not cover @eb. */
        if (bg && (eb->start < bg->start ||
                   eb->start >= bg->start + bg->length)) {
                btrfs_put_block_group(bg);
                bg = NULL;
                ctx->zoned_bg = NULL;
        }

        if (!bg) {
                bg = btrfs_lookup_block_group(fs_info, eb->start);
                if (!bg)
                        return 0;
                ctx->zoned_bg = bg;
        }

        if (bg->meta_write_pointer == eb->start) {
                struct btrfs_block_group **active_slot;

                if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
                        return 0;

                active_slot = (bg->flags & BTRFS_BLOCK_GROUP_SYSTEM) ?
                              &fs_info->active_system_bg :
                              &fs_info->active_meta_bg;
                if (check_bg_is_active(ctx, active_slot))
                        return 0;
        }

        /*
         * Since fs_info->zoned_meta_io_lock may have been released, someone
         * can already have started writing this eb. In that case, just bail
         * out.
         */
        if (bg->meta_write_pointer > eb->start)
                return -EBUSY;

        /* If for_sync, this hole will be filled by the transaction commit. */
        if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
                return -EAGAIN;

        return -EBUSY;
}

/*
 * Zero out @length bytes starting at byte offset @physical on @device.
 *
 * Return: -EOPNOTSUPP when btrfs_dev_is_sequential() rejects @physical,
 * otherwise the result of blkdev_issue_zeroout().
 */
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
{
        u64 first_sector;
        u64 nr_sectors;

        if (!btrfs_dev_is_sequential(device, physical))
                return -EOPNOTSUPP;

        /* The block layer call takes sectors, not bytes. */
        first_sector = physical >> SECTOR_SHIFT;
        nr_sectors = length >> SECTOR_SHIFT;

        return blkdev_issue_zeroout(device->bdev, first_sector, nr_sectors,
                                    GFP_NOFS, 0);
}

/*
 * Read the zone descriptor for the zone backing @logical into @zone.
 *
 * The logical address is mapped to its read mirrors and each mirror is tried
 * in turn; missing devices and devices failing with -EIO or -EOPNOTSUPP are
 * skipped.
 *
 * Return: 0 when @zone was filled in, -EINVAL for RAID5/6 mappings, -EIO when
 * the mapping fails or no mirror could be read, or another error reported by
 * btrfs_get_dev_zone(). @zone is only valid when 0 is returned.
 */
static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
                          struct blk_zone *zone)
{
        struct btrfs_io_context *bioc = NULL;
        u64 mapped_length = PAGE_SIZE;
        unsigned int nofs_flag;
        int nmirrors;
        int i, ret;

        ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
                              &mapped_length, &bioc, NULL, NULL);
        if (ret || !bioc || mapped_length < PAGE_SIZE) {
                ret = -EIO;
                goto out_put_bioc;
        }

        if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                ret = -EINVAL;
                goto out_put_bioc;
        }

        /*
         * Start from failure. If every mirror is skipped as a missing device
         * the loop never assigns ret, and returning the 0 left over from
         * btrfs_map_block() would make the caller consume an uninitialized
         * @zone.
         */
        ret = -EIO;

        nofs_flag = memalloc_nofs_save();
        nmirrors = (int)bioc->num_stripes;
        for (i = 0; i < nmirrors; i++) {
                u64 physical = bioc->stripes[i].physical;
                struct btrfs_device *dev = bioc->stripes[i].dev;

                /* Missing device */
                if (!dev->bdev)
                        continue;

                ret = btrfs_get_dev_zone(dev, physical, zone);
                /* Failing device */
                if (ret == -EIO || ret == -EOPNOTSUPP)
                        continue;
                break;
        }
        memalloc_nofs_restore(nofs_flag);
out_put_bioc:
        btrfs_put_bioc(bioc);
        return ret;
}

/*
 * Bring the zone at @physical_start on @tgt_dev up to the write pointer of
 * the dev-replace source device, by zero-filling from @physical_pos up to
 * that write pointer.
 *
 * Return: 0 when nothing has to be written (non-sequential position, or
 * @physical_pos already equals the write pointer), -EUCLEAN when
 * @physical_pos is already past the write pointer, otherwise the result of
 * read_zone_info() or btrfs_zoned_issue_zeroout().
 */
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
                                    u64 physical_start, u64 physical_pos)
{
        struct blk_zone zone;
        u64 src_wp;
        int ret;

        if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
                return 0;

        ret = read_zone_info(tgt_dev->fs_info, logical, &zone);
        if (ret)
                return ret;

        /*
         * zone.wp and zone.start are in sectors; convert the distance between
         * them to bytes and rebase it onto the target zone.
         */
        src_wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);

        if (physical_pos > src_wp)
                return -EUCLEAN;
        if (physical_pos == src_wp)
                return 0;

        return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos,
                                         src_wp - physical_pos);
}

/*
 * Mark a block group active and activate the device zone of each stripe.
 *
 * @block_group: the block group to activate
 *
 * On success the block group is added to fs_info->zone_active_bgs, holding
 * its own reference. Devices with max_active_zones == 0 are skipped.
 *
 * Return: true if the block group is (or already was) active or the
 * filesystem is not zoned, false if it is full or a zone could not be
 * activated.
 */
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
        struct btrfs_chunk_map *map;
        bool activated = false;
        int idx;

        if (!btrfs_is_zoned(fs_info))
                return true;

        map = block_group->physical_map;

        spin_lock(&fs_info->zone_active_bgs_lock);
        spin_lock(&block_group->lock);

        /* Already active: report success without touching the list. */
        if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
                activated = true;
                goto unlock_both;
        }

        /* No space left */
        if (btrfs_zoned_bg_is_full(block_group))
                goto unlock_both;

        for (idx = 0; idx < map->num_stripes; idx++) {
                struct btrfs_device *dev = map->stripes[idx].dev;
                struct btrfs_zoned_device_info *zinfo = dev->zone_info;
                int reserved = 0;

                if (zinfo->max_active_zones == 0)
                        continue;

                /*
                 * For the data block group, leave active zones for one
                 * metadata block group and one system block group.
                 */
                if (is_data)
                        reserved = zinfo->reserved_active_zones;
                if (atomic_read(&zinfo->active_zones_left) <= reserved)
                        goto unlock_both;

                /* Cannot activate the zone */
                if (!btrfs_dev_set_active_zone(dev, map->stripes[idx].physical))
                        goto unlock_both;

                /* A non-data block group consumes one reserved slot. */
                if (!is_data)
                        zinfo->reserved_active_zones--;
        }

        /* Successfully activated all the zones */
        set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
        spin_unlock(&block_group->lock);

        /* For the active block group list */
        btrfs_get_block_group(block_group);
        list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
        spin_unlock(&fs_info->zone_active_bgs_lock);

        return true;

unlock_both:
        spin_unlock(&block_group->lock);
        spin_unlock(&fs_info->zone_active_bgs_lock);
        return activated;
}

/*
 * Wait for writeback of every extent buffer found in fs_info->buffer_radix
 * whose start lies inside @block_group.
 */
static void wait_eb_writebacks(struct btrfs_block_group *block_group)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        const u64 end = block_group->start + block_group->length;
        struct radix_tree_iter iter;
        struct extent_buffer *eb;
        void __rcu **slot;

        rcu_read_lock();
        /* Begin the walk at the tree index derived from the block group start. */
        radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
                                 block_group->start >> fs_info->sectorsize_bits) {
                eb = radix_tree_deref_slot(slot);
                if (!eb)
                        continue;
                if (radix_tree_deref_retry(eb)) {
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }

                /* Skip buffers before the block group; stop once past its end. */
                if (eb->start < block_group->start)
                        continue;
                if (eb->start >= end)
                        break;

                /*
                 * Save the iterator position, then leave the RCU read section
                 * around the wait and re-enter it before continuing the walk.
                 * NOTE(review): no reference on eb is taken in this function
                 * before the RCU read lock is dropped - confirm the eb cannot
                 * go away while we wait on it.
                 */
                slot = radix_tree_iter_resume(slot, &iter);
                rcu_read_unlock();
                wait_on_extent_buffer_writeback(eb);
                rcu_read_lock();
        }
        rcu_read_unlock();
}

/*
 * Deactivate @block_group: mark it as fully allocated, issue
 * REQ_OP_ZONE_FINISH for each of its stripes and remove it from
 * fs_info->zone_active_bgs.
 *
 * @block_group:   the block group to finish
 * @fully_written: true when the caller knows no more writes will arrive; the
 *                 read-only transition and the waits for in-flight IO are
 *                 then skipped
 *
 * Return: 0 on success or when the block group is not active, -EAGAIN when
 * the block group cannot be finished yet (unwritten allocated metadata, a
 * data relocation block group, or remaining reservations), or the error from
 * btrfs_inc_block_group_ro() or blkdev_zone_mgmt().
 */
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_chunk_map *map;
        const bool is_metadata = (block_group->flags &
                        (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        int ret = 0;
        int i;

        spin_lock(&block_group->lock);
        /* Nothing to do for a block group that is not active. */
        if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
                spin_unlock(&block_group->lock);
                return 0;
        }

        /* Check if we have unwritten allocated space */
        if (is_metadata &&
            block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
                spin_unlock(&block_group->lock);
                return -EAGAIN;
        }

        /*
         * If we are sure that the block group is full (= no more room left for
         * new allocation) and the IO for the last usable block is completed, we
         * don't need to wait for the other IOs. This holds because we ensure
         * the sequential IO submissions using the ZONE_APPEND command for data
         * and block_group->meta_write_pointer for metadata.
         */
        if (!fully_written) {
                if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
                        spin_unlock(&block_group->lock);
                        return -EAGAIN;
                }
                spin_unlock(&block_group->lock);

                /*
                 * Paired with btrfs_dec_block_group_ro() on the bail-out
                 * paths below and after the zone finish loop.
                 */
                ret = btrfs_inc_block_group_ro(block_group, false);
                if (ret)
                        return ret;

                /* Ensure all writes in this block group finish */
                btrfs_wait_block_group_reservations(block_group);
                /* No need to wait for NOCOW writers. Zoned mode does not allow that */
                btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
                                         block_group->length);
                /* Wait for extent buffers to be written. */
                if (is_metadata)
                        wait_eb_writebacks(block_group);

                /* The lock was dropped above, so the state is re-checked. */
                spin_lock(&block_group->lock);

                /*
                 * Bail out if someone already deactivated the block group, or
                 * allocated space is left in the block group.
                 */
                if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
                              &block_group->runtime_flags)) {
                        spin_unlock(&block_group->lock);
                        btrfs_dec_block_group_ro(block_group);
                        return 0;
                }

                if (block_group->reserved ||
                    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
                             &block_group->runtime_flags)) {
                        spin_unlock(&block_group->lock);
                        btrfs_dec_block_group_ro(block_group);
                        return -EAGAIN;
                }
        }

        /*
         * Mark the block group inactive and fully allocated: the allocation
         * offset (and, for metadata/system, the write pointer) is moved to
         * the zone capacity and the free space counter is zeroed.
         */
        clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
        block_group->alloc_offset = block_group->zone_capacity;
        if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
                block_group->meta_write_pointer = block_group->start +
                                                  block_group->zone_capacity;
        block_group->free_space_ctl->free_space = 0;
        btrfs_clear_treelog_bg(block_group);
        btrfs_clear_data_reloc_bg(block_group);
        spin_unlock(&block_group->lock);

        down_read(&dev_replace->rwsem);
        map = block_group->physical_map;
        for (i = 0; i < map->num_stripes; i++) {
                struct btrfs_device *device = map->stripes[i].dev;
                const u64 physical = map->stripes[i].physical;
                struct btrfs_zoned_device_info *zinfo = device->zone_info;
                unsigned int nofs_flags;

                /* Devices with max_active_zones == 0 are skipped. */
                if (zinfo->max_active_zones == 0)
                        continue;

                nofs_flags = memalloc_nofs_save();
                ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
                                       physical >> SECTOR_SHIFT,
                                       zinfo->zone_size >> SECTOR_SHIFT);
                memalloc_nofs_restore(nofs_flags);

                /*
                 * NOTE(review): on this early return the active flag is
                 * already cleared, the block group is still on
                 * zone_active_bgs, and for !fully_written the read-only count
                 * taken above is not dropped - confirm this is intended.
                 */
                if (ret) {
                        up_read(&dev_replace->rwsem);
                        return ret;
                }

                /* A finished non-data zone gives its reserved slot back. */
                if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
                        zinfo->reserved_active_zones++;
                btrfs_dev_clear_active_zone(device, physical);
        }
        up_read(&dev_replace->rwsem);

        if (!fully_written)
                btrfs_dec_block_group_ro(block_group);

        spin_lock(&fs_info->zone_active_bgs_lock);
        ASSERT(!list_empty(&block_group->active_bg_list));
        list_del_init(&block_group->active_bg_list);
        spin_unlock(&fs_info->zone_active_bgs_lock);

        /* For active_bg_list */
        btrfs_put_block_group(block_group);

        /* Clear the flag and wake up anyone waiting on it. */
        clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

        return 0;
}

/*
 * Finish @block_group without assuming it has been fully written.
 *
 * Return: 0 on a non-zoned filesystem, otherwise the result of
 * do_zone_finish().
 */
int btrfs_zone_finish(struct btrfs_block_group *block_group)
{
        if (btrfs_is_zoned(block_group->fs_info))
                return do_zone_finish(block_group, false);

        return 0;
}

/*
 * Check whether some device on the allocation list has enough active zones
 * left for a new block group of type @flags.
 *
 * A SINGLE profile needs one zone and DUP needs two; for data block groups
 * the device's reserved active zones are kept aside on top of that. Other
 * profiles never match a device that limits active zones. When the answer is
 * no, BTRFS_FS_NEED_ZONE_FINISH is set in fs_info->flags.
 *
 * Return: true on a non-zoned filesystem or when a suitable device exists.
 */
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
        struct btrfs_fs_info *fs_info = fs_devices->fs_info;
        const u64 profile = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
        struct btrfs_device *device;
        bool can_activate = false;
        int zones_needed = 0;

        if (!btrfs_is_zoned(fs_info))
                return true;

        if (profile == 0)                       /* single */
                zones_needed = 1;
        else if (profile == BTRFS_BLOCK_GROUP_DUP)
                zones_needed = 2;

        /* Check if there is a device with active zones left */
        mutex_lock(&fs_info->chunk_mutex);
        spin_lock(&fs_info->zone_active_bgs_lock);
        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
                struct btrfs_zoned_device_info *zinfo = device->zone_info;
                int reserved = 0;

                if (!device->bdev)
                        continue;

                /* A device without an active zone limit always qualifies. */
                if (!zinfo->max_active_zones) {
                        can_activate = true;
                        break;
                }

                /* Profiles other than single and DUP cannot be satisfied. */
                if (!zones_needed)
                        continue;

                if (flags & BTRFS_BLOCK_GROUP_DATA)
                        reserved = zinfo->reserved_active_zones;

                if (atomic_read(&zinfo->active_zones_left) >=
                    (zones_needed + reserved)) {
                        can_activate = true;
                        break;
                }
        }
        spin_unlock(&fs_info->zone_active_bgs_lock);
        mutex_unlock(&fs_info->chunk_mutex);

        if (!can_activate)
                set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

        return can_activate;
}

/*
 * Finish the block group containing @logical once the write ending at
 * @logical + @length leaves no room for another minimal allocation.
 *
 * The minimal allocation is one sector for data block groups and one node
 * for everything else. The result of do_zone_finish() is not propagated
 * because this function returns void.
 */
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
        struct btrfs_block_group *block_group;
        u64 min_alloc_bytes;

        if (!btrfs_is_zoned(fs_info))
                return;

        block_group = btrfs_lookup_block_group(fs_info, logical);
        ASSERT(block_group);
        /*
         * ASSERT() may be compiled out, so do not rely on it alone: bail out
         * instead of dereferencing a NULL block group below.
         */
        if (!block_group)
                return;

        /* No MIXED_BG on zoned btrfs. */
        if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
                min_alloc_bytes = fs_info->sectorsize;
        else
                min_alloc_bytes = fs_info->nodesize;

        /* Bail out if we can allocate more data from this block group. */
        if (logical + length + min_alloc_bytes <=
            block_group->start + block_group->zone_capacity)
                goto out;

        do_zone_finish(block_group, true);

out:
        /* Drop the reference taken by btrfs_lookup_block_group(). */
        btrfs_put_block_group(block_group);
}

/*
 * Work function queued by btrfs_schedule_zone_finish_bg(): wait for the
 * recorded last extent buffer to finish writeback, then finish the whole
 * block group and drop the references taken when the work was scheduled.
 */
static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
        struct btrfs_block_group *bg;
        struct extent_buffer *last_eb;

        bg = container_of(work, struct btrfs_block_group, zone_finish_work);
        last_eb = bg->last_eb;

        wait_on_extent_buffer_writeback(last_eb);
        free_extent_buffer(last_eb);

        /* Pass the full block group range so the finish is not skipped. */
        btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
        btrfs_put_block_group(bg);
}

/*
 * Queue work to finish @bg after @eb has been written, if @eb is the last
 * extent buffer that fits below the zone capacity of a sequential-zone
 * block group.
 *
 * The work holds a reference on both @bg and @eb. Scheduling twice for the
 * same block group is reported and ignored.
 */
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
                                   struct extent_buffer *eb)
{
        if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags))
                return;

        /* Room for another buffer of the same size after @eb: not the last. */
        if (eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
                return;

        if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
                btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
                          bg->start);
                return;
        }

        /* For the work */
        btrfs_get_block_group(bg);
        atomic_inc(&eb->refs);
        bg->last_eb = eb;

        INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
        queue_work(system_unbound_wq, &bg->zone_finish_work);
}

void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
        struct btrfs_fs_info *fs_info = bg->fs_info;

        spin_lock(&fs_info->relocation_bg_lock);
        if (fs_info->data_reloc_bg == bg->start)
                fs_info->data_reloc_bg = 0;
        spin_unlock(&fs_info->relocation_bg_lock);
}

/*
 * Free the zone cache of every device that has zone info, leaving the
 * pointer NULL. Does nothing on a non-zoned filesystem.
 */
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;

        if (!btrfs_is_zoned(fs_info))
                return;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                struct btrfs_zoned_device_info *zinfo = device->zone_info;

                if (!zinfo)
                        continue;

                vfree(zinfo->zone_cache);
                zinfo->zone_cache = NULL;
        }
        mutex_unlock(&fs_devices->device_list_mutex);
}

/*
 * Decide whether block group reclaim should run, based on how full the
 * devices are.
 *
 * The used percentage is computed over all devices that have a bdev and is
 * compared against fs_info->bg_reclaim_threshold.
 *
 * Return: true when the used percentage has reached the threshold; false
 * when the threshold is 0 or there is no capacity to measure against.
 */
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
        u64 used = 0;
        u64 total = 0;
        u64 factor;

        ASSERT(btrfs_is_zoned(fs_info));

        if (fs_info->bg_reclaim_threshold == 0)
                return false;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* Missing devices contribute nothing to either sum. */
                if (!device->bdev)
                        continue;

                total += device->disk_total_bytes;
                used += device->bytes_used;
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        /*
         * Every device may have been skipped above; never hand a zero
         * divisor to div64_u64().
         */
        if (total == 0)
                return false;

        factor = div64_u64(used * 100, total);
        return factor >= fs_info->bg_reclaim_threshold;
}

/*
 * Clear BLOCK_GROUP_FLAG_ZONED_DATA_RELOC on the block group containing
 * @logical once the write ending at @logical + @length has reached the block
 * group's allocation offset, i.e. all relocation extents are written.
 */
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
                                       u64 length)
{
        struct btrfs_block_group *block_group;

        if (!btrfs_is_zoned(fs_info))
                return;

        block_group = btrfs_lookup_block_group(fs_info, logical);
        /* It should be called on a previous data relocation block group. */
        ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
        /*
         * ASSERT() may be compiled out, so do not rely on it alone: bail out
         * instead of locking through a NULL block group below.
         */
        if (!block_group)
                return;

        spin_lock(&block_group->lock);
        if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
                goto out;

        /* All relocation extents are written. */
        if (block_group->start + block_group->alloc_offset == logical + length) {
                /*
                 * Now, release this block group for further allocations and
                 * zone finish.
                 */
                clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
                          &block_group->runtime_flags);
        }

out:
        spin_unlock(&block_group->lock);
        /* Drop the reference taken by btrfs_lookup_block_group(). */
        btrfs_put_block_group(block_group);
}

/*
 * Pick the active block group with the least space left below its zone
 * capacity and finish it.
 *
 * Block groups with reservations, with nothing allocated yet, of SYSTEM type
 * or used for zoned data relocation are not considered.
 *
 * Return: 0 if no candidate was found, 1 if a block group was finished, or
 * the negative error from btrfs_zone_finish().
 */
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_group *cur;
        struct btrfs_block_group *victim = NULL;
        u64 least_avail = U64_MAX;
        int ret;

        spin_lock(&fs_info->zone_active_bgs_lock);
        list_for_each_entry(cur, &fs_info->zone_active_bgs, active_bg_list) {
                bool eligible;

                spin_lock(&cur->lock);
                eligible = !cur->reserved && cur->alloc_offset != 0 &&
                           !(cur->flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
                           !test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
                                     &cur->runtime_flags);
                if (eligible) {
                        const u64 avail = cur->zone_capacity - cur->alloc_offset;

                        if (avail < least_avail) {
                                /* Swap the held reference to the new best. */
                                if (victim)
                                        btrfs_put_block_group(victim);
                                victim = cur;
                                least_avail = avail;
                                btrfs_get_block_group(victim);
                        }
                }
                spin_unlock(&cur->lock);
        }
        spin_unlock(&fs_info->zone_active_bgs_lock);

        if (!victim)
                return 0;

        ret = btrfs_zone_finish(victim);
        btrfs_put_block_group(victim);

        if (ret < 0)
                return ret;
        return 1;
}

/*
 * Try to activate one inactive, non-full block group of @space_info.
 *
 * When activation fails and @do_finish is set, another active block group
 * is finished through btrfs_zone_finish_one_bg() and the scan is repeated.
 *
 * Return: 1 if a block group was activated, 0 if none was (including the
 * non-zoned and data space_info cases), or the negative error from
 * btrfs_zone_finish_one_bg().
 */
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
                                struct btrfs_space_info *space_info,
                                bool do_finish)
{
        struct btrfs_block_group *bg;
        int raid;

        if (!btrfs_is_zoned(fs_info))
                return 0;
        if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
                return 0;

        while (true) {
                bool need_finish = false;
                int ret;

                down_read(&space_info->groups_sem);
                for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) {
                        list_for_each_entry(bg, &space_info->block_groups[raid],
                                            list) {
                                bool skip;

                                /* Do not wait on a contended block group. */
                                if (!spin_trylock(&bg->lock))
                                        continue;
                                skip = btrfs_zoned_bg_is_full(bg) ||
                                       test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
                                                &bg->runtime_flags);
                                spin_unlock(&bg->lock);
                                if (skip)
                                        continue;

                                if (btrfs_zone_activate(bg)) {
                                        up_read(&space_info->groups_sem);
                                        return 1;
                                }

                                /* A candidate exists but could not be activated. */
                                need_finish = true;
                        }
                }
                up_read(&space_info->groups_sem);

                if (!do_finish || !need_finish)
                        return 0;

                /* 0: nothing could be finished; negative: error. */
                ret = btrfs_zone_finish_one_bg(fs_info);
                if (ret <= 0)
                        return ret;
        }
}

/*
 * Set up the per-device count of reserved active zones: room for one
 * metadata block group, one tree-log block group, and one system block
 * group, minus what the currently active metadata/system block groups
 * already occupy.
 *
 * Does nothing unless BTRFS_FS_ACTIVE_ZONE_TRACKING is set.
 */
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_block_group *bg;
        struct btrfs_device *device;
        unsigned int total_reserve;

        if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
                return;

        /*
         * SINGLE profile: two zones for the normal metadata and tree-log
         * block groups plus one zone for the system block group.
         */
        total_reserve = 2 + 1;

        /*
         * This function is called from the mount context. So, there is no
         * parallel process touching the bits. No need for read_seqretry().
         *
         * DUP doubles the respective share: metadata 2 -> 4, system 1 -> 2.
         */
        if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
                total_reserve += 2;
        if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
                total_reserve += 1;

        /* Apply the reservation on all the devices. */
        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (device->bdev)
                        device->zone_info->reserved_active_zones = total_reserve;
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        /* Release reservation for currently active block groups. */
        spin_lock(&fs_info->zone_active_bgs_lock);
        list_for_each_entry(bg, &fs_info->zone_active_bgs, active_bg_list) {
                const bool is_meta_or_sys = bg->flags &
                        (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM);
                struct btrfs_chunk_map *map = bg->physical_map;

                if (!is_meta_or_sys)
                        continue;

                for (int i = 0; i < map->num_stripes; i++) {
                        struct btrfs_zoned_device_info *zinfo =
                                map->stripes[i].dev->zone_info;

                        zinfo->reserved_active_zones--;
                }
        }
        spin_unlock(&fs_info->zone_active_bgs_lock);
}































































































































































































































































































































    2 

    4 

    1 

    1 



    9 












    4 
    4 




    4 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   15 










































































   13 

    1 


























































    6 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * linux/include/linux/jbd2.h
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>
 *
 * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved
 *
 * Definitions for transaction data structures for the buffer cache
 * filesystem journaling support.
 */

#ifndef _LINUX_JBD2_H
#define _LINUX_JBD2_H

/* Allow this file to be included directly into e2fsprogs */
#ifndef __KERNEL__
#include "jfs_compat.h"
#define JBD2_DEBUG
#else

#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/journal-head.h>
#include <linux/stddef.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/slab.h>
#include <linux/bit_spinlock.h>
#include <linux/blkdev.h>
#include <crypto/hash.h>
#endif

#define journal_oom_retry 1

/*
 * Define JBD2_PARANOID_IOFAIL to cause a kernel BUG() if ext4 finds
 * certain classes of error which can occur due to failed IOs.  Under
 * normal use we want ext4 to continue after such errors, because
 * hardware _can_ fail, but for debugging purposes when running tests on
 * known-good hardware we may want to trap these errors.
 */
#undef JBD2_PARANOID_IOFAIL

/*
 * The default maximum commit age, in seconds.
 */
#define JBD2_DEFAULT_MAX_COMMIT_AGE 5

#ifdef CONFIG_JBD2_DEBUG
/*
 * Define JBD2_EXPENSIVE_CHECKING to enable more expensive internal
 * consistency checks.  By default we don't do this unless
 * CONFIG_JBD2_DEBUG is on.
 */
#define JBD2_EXPENSIVE_CHECKING
/*
 * Back end of jbd2_debug(); declared here, defined outside this header.
 * Receives the debug level, the call site (file, function, line) and a
 * printf-style format with its arguments.
 */
void __jbd2_debug(int level, const char *file, const char *func,
                  unsigned int line, const char *fmt, ...);

/*
 * jbd2_debug(n, fmt, ...): forward a level-n debug message to
 * __jbd2_debug(), filling in __FILE__, __func__ and __LINE__ of the caller.
 */
#define jbd2_debug(n, fmt, a...) \
        __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
#else
/*
 * Without CONFIG_JBD2_DEBUG the level argument is dropped and the format
 * and arguments are handed to no_printk().
 * NOTE(review): no_printk() is defined elsewhere; presumably it emits
 * nothing while keeping format checking - confirm.
 */
#define jbd2_debug(n, fmt, a...)  no_printk(fmt, ##a)
#endif

extern void *jbd2_alloc(size_t size, gfp_t flags);
extern void jbd2_free(void *ptr, size_t size);

#define JBD2_MIN_JOURNAL_BLOCKS 1024
#define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256

#ifdef __KERNEL__

/**
 * typedef handle_t - The handle_t type represents a single atomic update being performed by some process.
 *
 * All filesystem modifications made by the process go
 * through this handle.  Recursive operations (such as quota operations)
 * are gathered into a single update.
 *
 * The buffer credits field is used to account for journaled buffers
 * being modified by the running process.  To ensure that there is
 * enough log space for all outstanding operations, we need to limit the
 * number of outstanding buffers possible at any time.  When the
 * operation completes, any buffer credits not used are credited back to
 * the transaction, so that at all times we know how many buffers the
 * outstanding updates on a transaction might possibly touch.
 *
 * This is an opaque datatype.
 **/
typedef struct jbd2_journal_handle handle_t;        /* Atomic operation type */


/**
 * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem.
 *
 * journal_t is linked to from the fs superblock structure.
 *
 * We use the journal_t to keep track of all outstanding transaction
 * activity on the filesystem, and to manage the state of the log
 * writing process.
 *
 * This is an opaque datatype.
 **/
typedef struct journal_s        journal_t;        /* Journal control structure */
#endif

/*
 * Internal structures used by the logging mechanism:
 */

#define JBD2_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */

/*
 * On-disk structures
 */

/*
 * Descriptor block types:
 */

#define JBD2_DESCRIPTOR_BLOCK        1
#define JBD2_COMMIT_BLOCK        2
#define JBD2_SUPERBLOCK_V1        3
#define JBD2_SUPERBLOCK_V2        4
#define JBD2_REVOKE_BLOCK        5

/*
 * Standard header for all descriptor blocks.
 *
 * Three 32-bit __be32 fields.  The journal superblock embeds this header
 * at offset 0x0000 and its next field is annotated at 0x000C, i.e. the
 * header occupies 12 bytes on disk.
 */
typedef struct journal_header_s
{
        __be32                h_magic;        /* presumably JBD2_MAGIC_NUMBER - confirm */
        __be32                h_blocktype;        /* presumably one of the block types defined above - confirm */
        __be32                h_sequence;
} journal_header_t;

/*
 * Checksum types.
 */
#define JBD2_CRC32_CHKSUM   1
#define JBD2_MD5_CHKSUM     2
#define JBD2_SHA1_CHKSUM    3
#define JBD2_CRC32C_CHKSUM  4

#define JBD2_CRC32_CHKSUM_SIZE 4

#define JBD2_CHECKSUM_BYTES (32 / sizeof(u32))
/*
 * Commit block header for storing transactional checksums:
 *
 * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum*
 * fields are used to store a checksum of the descriptor and data blocks.
 *
 * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum
 * field is used to store crc32c(uuid+commit_block).  Each journal metadata
 * block gets its own checksum, and data block checksums are stored in
 * journal_block_tag (in the descriptor).  The other h_chksum* fields are
 * not used.
 *
 * If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses
 * journal_block_tag3_t to store a full 32-bit checksum.  Everything else
 * is the same as v2.
 *
 * Checksum v1, v2, and v3 are mutually exclusive features.
 */
/*
 * The first three fields have the same names, types and order as
 * journal_header_t.  h_chksum holds JBD2_CHECKSUM_BYTES 32-bit words,
 * i.e. 32 / sizeof(u32) of them.
 */
struct commit_header {
        __be32                h_magic;
        __be32          h_blocktype;
        __be32          h_sequence;
        unsigned char   h_chksum_type;        /* presumably one of the JBD2_*_CHKSUM types - confirm */
        unsigned char   h_chksum_size;
        unsigned char         h_padding[2];
        __be32                 h_chksum[JBD2_CHECKSUM_BYTES];
        __be64                h_commit_sec;        /* presumably commit time, seconds part - confirm */
        __be32                h_commit_nsec;        /* presumably commit time, nanoseconds part - confirm */
};

/*
 * The block tag: used to describe a single buffer in the journal.
 * t_blocknr_high is only used if INCOMPAT_64BIT is set, so this
 * raw struct shouldn't be used for pointer math or sizeof() - use
 * journal_tag_bytes(journal) instead to compute this.
 */
/*
 * Tag layout carrying a full 32-bit t_checksum.  Per the commit block
 * header notes in this file, descriptor blocks use journal_block_tag3_t
 * when FEATURE_INCOMPAT_CSUM_V3 is set.
 */
typedef struct journal_block_tag3_s
{
        __be32                t_blocknr;        /* The on-disk block number */
        __be32                t_flags;        /* See below */
        __be32                t_blocknr_high; /* most-significant high 32bits. */
        __be32                t_checksum;        /* crc32c(uuid+seq+block) */
} journal_block_tag3_t;

/*
 * Tag layout with 16-bit t_checksum and t_flags fields; the checksum
 * stored here is a truncated crc32c, unlike the 32-bit field in
 * journal_block_tag3_t.
 */
typedef struct journal_block_tag_s
{
        __be32                t_blocknr;        /* The on-disk block number */
        __be16                t_checksum;        /* truncated crc32c(uuid+seq+block) */
        __be16                t_flags;        /* See below */
        __be32                t_blocknr_high; /* most-significant high 32bits. */
} journal_block_tag_t;

/*
 * Tail of descriptor or revoke block, for checksumming.
 * Consists of a single 32-bit __be32 checksum field.
 */
struct jbd2_journal_block_tail {
        __be32                t_checksum;        /* crc32c(uuid+descr_block) */
};

/*
 * The revoke descriptor: used on disk to describe a series of blocks to
 * be revoked from the log.
 *
 * Starts with the standard journal_header_t, followed by a byte count.
 */
typedef struct jbd2_journal_revoke_header_s
{
        journal_header_t r_header;
        __be32                 r_count;        /* Count of bytes used in the block */
} jbd2_journal_revoke_header_t;

/* Definitions for the journal tag flags word: */
#define JBD2_FLAG_ESCAPE                1        /* on-disk block is escaped */
#define JBD2_FLAG_SAME_UUID        2        /* block has same uuid as previous */
#define JBD2_FLAG_DELETED        4        /* block deleted by this transaction */
#define JBD2_FLAG_LAST_TAG        8        /* last tag in this descriptor block */


/*
 * The journal superblock.  All fields are in big-endian byte order.
 *
 * The hexadecimal comments inside the structure give the byte offset of
 * the field that follows them; the final annotation (0x0400) is the
 * offset just past s_users, i.e. the end of the structure.
 */
typedef struct journal_superblock_s
{
/* 0x0000 */
        journal_header_t s_header;

/* 0x000C */
        /* Static information describing the journal */
        __be32        s_blocksize;                /* journal device blocksize */
        __be32        s_maxlen;                /* total blocks in journal file */
        __be32        s_first;                /* first block of log information */

/* 0x0018 */
        /* Dynamic information describing the current state of the log */
        __be32        s_sequence;                /* first commit ID expected in log */
        __be32        s_start;                /* blocknr of start of log */

/* 0x0020 */
        /* Error value, as set by jbd2_journal_abort(). */
        __be32        s_errno;

/* 0x0024 */
        /* Remaining fields are only valid in a version-2 superblock */
        __be32        s_feature_compat;        /* compatible feature set */
        __be32        s_feature_incompat;        /* incompatible feature set */
        __be32        s_feature_ro_compat;        /* readonly-compatible feature set */
/* 0x0030 */
        __u8        s_uuid[16];                /* 128-bit uuid for journal */

/* 0x0040 */
        __be32        s_nr_users;                /* Nr of filesystems sharing log */

        __be32        s_dynsuper;                /* Blocknr of dynamic superblock copy*/

/* 0x0048 */
        __be32        s_max_transaction;        /* Limit of journal blocks per trans.*/
        __be32        s_max_trans_data;        /* Limit of data blocks per trans. */

/* 0x0050 */
        __u8        s_checksum_type;        /* checksum type */
        __u8        s_padding2[3];                /* pads s_checksum_type out to 0x0054 */
/* 0x0054 */
        __be32        s_num_fc_blks;                /* Number of fast commit blocks */
        __be32        s_head;                        /* blocknr of head of log, only uptodate
                                         * while the filesystem is clean */
/* 0x005C */
        __u32        s_padding[40];                /* 160 bytes, up to s_checksum at 0x00FC */
        __be32        s_checksum;                /* crc32c(superblock) */

/* 0x0100 */
        __u8        s_users[16*48];                /* ids of all fs'es sharing the log */
/* 0x0400 */
} journal_superblock_t;

#define JBD2_FEATURE_COMPAT_CHECKSUM                0x00000001

#define JBD2_FEATURE_INCOMPAT_REVOKE                0x00000001
#define JBD2_FEATURE_INCOMPAT_64BIT                0x00000002
#define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT        0x00000004
#define JBD2_FEATURE_INCOMPAT_CSUM_V2                0x00000008
#define JBD2_FEATURE_INCOMPAT_CSUM_V3                0x00000010
#define JBD2_FEATURE_INCOMPAT_FAST_COMMIT        0x00000020

/* See "journal feature predicate functions" below */

/* Features known to this kernel version: */
#define JBD2_KNOWN_COMPAT_FEATURES        JBD2_FEATURE_COMPAT_CHECKSUM
#define JBD2_KNOWN_ROCOMPAT_FEATURES        0
#define JBD2_KNOWN_INCOMPAT_FEATURES        (JBD2_FEATURE_INCOMPAT_REVOKE | \
                                        JBD2_FEATURE_INCOMPAT_64BIT | \
                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \
                                        JBD2_FEATURE_INCOMPAT_CSUM_V2 | \
                                        JBD2_FEATURE_INCOMPAT_CSUM_V3 | \
                                        JBD2_FEATURE_INCOMPAT_FAST_COMMIT)

#ifdef __KERNEL__

#include <linux/fs.h>
#include <linux/sched.h>

/*
 * Buffer state bit numbers used by jbd2.  Numbering starts at
 * BH_PrivateStart and continues consecutively; BH_JBDPrivateStart is the
 * first bit number left over for the filesystem.  Do not reorder: each
 * enumerator's value depends on its position.
 */
enum jbd_state_bits {
        BH_JBD                        /* Has an attached ext3 journal_head */
          = BH_PrivateStart,
        BH_JWrite,                /* Being written to log (@@@ DEBUGGING) */
        BH_Freed,                /* Has been freed (truncated) */
        BH_Revoked,                /* Has been revoked from the log */
        BH_RevokeValid,                /* Revoked flag is valid */
        BH_JBDDirty,                /* Is dirty but journaled */
        BH_JournalHead,                /* Pins bh->b_private and jh->b_bh */
        BH_Shadow,                /* IO on shadow buffer is running */
        BH_Verified,                /* Metadata block has been verified ok */
        BH_JBDPrivateStart,        /* First bit available for private use by FS */
};

/*
 * Instantiate helper functions for the jbd2 state bits.  The first macro
 * argument is the BH_* suffix, the second the lower-case name used in the
 * generated identifiers.
 * NOTE(review): BUFFER_FNS/TAS_BUFFER_FNS are defined outside this file
 * (presumably linux/buffer_head.h); presumably they generate the
 * test/set/clear and test-and-set/clear helpers respectively - confirm.
 * BH_JournalHead has no generated helpers here; it is used directly as a
 * bit spinlock below.
 */
BUFFER_FNS(JBD, jbd)
BUFFER_FNS(JWrite, jwrite)
BUFFER_FNS(JBDDirty, jbddirty)
TAS_BUFFER_FNS(JBDDirty, jbddirty)
BUFFER_FNS(Revoked, revoked)
TAS_BUFFER_FNS(Revoked, revoked)
BUFFER_FNS(RevokeValid, revokevalid)
TAS_BUFFER_FNS(RevokeValid, revokevalid)
BUFFER_FNS(Freed, freed)
BUFFER_FNS(Shadow, shadow)
BUFFER_FNS(Verified, verified)

/*
 * jh2bh - return the buffer_head stored in a journal_head's b_bh field.
 * @jh: journal head to look at; dereferenced unconditionally.
 */
static inline struct buffer_head *jh2bh(struct journal_head *jh)
{
        struct buffer_head *attached_bh = jh->b_bh;

        return attached_bh;
}

/*
 * bh2jh - return the journal_head stored in a buffer_head's b_private
 * field.
 * @bh: buffer head to look at; dereferenced unconditionally.
 */
static inline struct journal_head *bh2jh(struct buffer_head *bh)
{
        struct journal_head *attached_jh = bh->b_private;

        return attached_jh;
}

/*
 * jbd_lock_bh_journal_head - take the BH_JournalHead bit spinlock.
 * @bh: buffer head whose b_state word holds the lock bit.
 *
 * BH_JournalHead is the state bit that pins bh->b_private and jh->b_bh.
 * Release with jbd_unlock_bh_journal_head().
 */
static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
{
        bit_spin_lock(BH_JournalHead, &bh->b_state);
}

/*
 * jbd_unlock_bh_journal_head - release the BH_JournalHead bit spinlock.
 * @bh: buffer head whose b_state word holds the lock bit.
 *
 * Counterpart of jbd_lock_bh_journal_head().
 */
static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
{
        bit_spin_unlock(BH_JournalHead, &bh->b_state);
}

#define J_ASSERT(assert)        BUG_ON(!(assert))

#define J_ASSERT_BH(bh, expr)        J_ASSERT(expr)
#define J_ASSERT_JH(jh, expr)        J_ASSERT(expr)

/*
 * J_EXPECT(), J_EXPECT_BH() and J_EXPECT_JH() check a condition that is
 * expected to hold.
 *
 * With JBD2_PARANOID_IOFAIL defined they are hard assertions (J_ASSERT*).
 * Otherwise they evaluate @expr exactly once, log two KERN_ERR lines (the
 * calling function plus the expression text, then the @why string) when it
 * is false, and yield the int value of @expr so the caller can react.
 *
 * @why must be a single string literal (or be omitted): it is pasted next
 * to "\n" to form the format string of the second printk().
 *
 * The temporary is deliberately given an unlikely name.  A plain name such
 * as "val" would shadow a caller variable of the same name appearing in
 * @expr, so the initialiser would read the uninitialised temporary instead
 * of the caller's variable.
 */
#if defined(JBD2_PARANOID_IOFAIL)
#define J_EXPECT(expr, why...)                J_ASSERT(expr)
#define J_EXPECT_BH(bh, expr, why...)        J_ASSERT_BH(bh, expr)
#define J_EXPECT_JH(jh, expr, why...)        J_ASSERT_JH(jh, expr)
#else
#define __journal_expect(expr, why...)                                             \
        ({                                                                     \
                int __jbd2_expect_val = (expr);                                     \
                if (!__jbd2_expect_val) {                                     \
                        printk(KERN_ERR                                             \
                               "JBD2 unexpected failure: %s: %s;\n",             \
                               __func__, #expr);                             \
                        printk(KERN_ERR why "\n");                             \
                }                                                             \
                __jbd2_expect_val;                                             \
        })
#define J_EXPECT(expr, why...)                __journal_expect(expr, ## why)
#define J_EXPECT_BH(bh, expr, why...)        __journal_expect(expr, ## why)
#define J_EXPECT_JH(jh, expr, why...)        __journal_expect(expr, ## why)
#endif

/* Flags in jbd_inode->i_flags */
#define __JI_COMMIT_RUNNING 0
#define __JI_WRITE_DATA 1
#define __JI_WAIT_DATA 2

/*
 * Commit of the inode data in progress. We use this flag to protect us from
 * concurrent deletion of inode. We cannot use reference to inode for this
 * since we cannot afford doing last iput() on behalf of kjournald
 */
#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
/* Write allocated dirty buffers in this inode before commit */
#define JI_WRITE_DATA (1 << __JI_WRITE_DATA)
/* Wait for outstanding data writes for this inode before commit */
#define JI_WAIT_DATA (1 << __JI_WAIT_DATA)

/**
 * struct jbd2_inode - Links an inode in ordered mode to the transaction it
 * is part of, so that the inode can be synced during commit.
 */
struct jbd2_inode {
        /**
         * @i_transaction:
         *
         * Which transaction does this inode belong to? Either the running
         * transaction or the committing one. [j_list_lock]
         */
        transaction_t *i_transaction;

        /**
         * @i_next_transaction:
         *
         * Pointer to the running transaction modifying inode's data in case
         * there is already a committing transaction touching it. [j_list_lock]
         */
        transaction_t *i_next_transaction;

        /**
         * @i_list: List of inodes in the i_transaction [j_list_lock]
         */
        struct list_head i_list;

        /**
         * @i_vfs_inode:
         *
         * VFS inode this inode belongs to [constant for lifetime of structure]
         */
        struct inode *i_vfs_inode;

        /**
         * @i_flags: Flags of inode, built from the JI_* bits defined above
         * this structure [j_list_lock]
         */
        unsigned long i_flags;

        /**
         * @i_dirty_start:
         *
         * Offset in bytes where the dirty range for this inode starts.
         * [j_list_lock]
         */
        loff_t i_dirty_start;

        /**
         * @i_dirty_end:
         *
         * Inclusive offset in bytes where the dirty range for this inode
         * ends. [j_list_lock]
         */
        loff_t i_dirty_end;
};

struct jbd2_revoke_table_s;

/**
 * struct jbd2_journal_handle - The jbd2_journal_handle type is the concrete
 *     type associated with handle_t.
 * @h_transaction: Which compound transaction is this update a part of?
 * @h_journal: Which journal handle belongs to - used iff h_reserved set.
 *        Shares storage with @h_transaction (anonymous union).
 * @h_rsv_handle: Handle reserved for finishing the logical operation.
 * @h_total_credits: Number of remaining buffers we are allowed to add to
 *        journal. These are dirty buffers and revoke descriptor blocks.
 * @h_revoke_credits: Number of remaining revoke records available for handle
 * @h_ref: Reference count on this handle.
 * @h_err: Field for caller's use to track errors through large fs operations.
 * @h_sync: Flag for sync-on-close.
 * @h_jdata: Flag to force data journaling.
 * @h_reserved: Flag for handle for reserved credits.
 * @h_aborted: Flag indicating fatal error on handle.
 * @h_type: For handle statistics.
 * @h_line_no: For handle statistics.
 * @h_start_jiffies: Handle Start time.
 * @h_requested_credits: Holds @h_total_credits after handle is started.
 * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started.
 * @saved_alloc_context: Saved context while transaction is open.
 **/

/* Docbook can't yet cope with the bit fields, but will leave the documentation
 * in so it can be fixed later.
 */

struct jbd2_journal_handle
{
        /* Only one of the two pointers is meaningful at a time. */
        union {
                transaction_t        *h_transaction;
                /* Which journal handle belongs to - used iff h_reserved set */
                journal_t        *h_journal;
        };

        handle_t                *h_rsv_handle;
        int                        h_total_credits;
        int                        h_revoke_credits;
        int                        h_revoke_credits_requested;
        int                        h_ref;
        int                        h_err;

        /* Flags [no locking] */
        /* Four 1-bit flags, then 8 bits of type and 16 bits of line number. */
        unsigned int        h_sync:                1;
        unsigned int        h_jdata:        1;
        unsigned int        h_reserved:        1;
        unsigned int        h_aborted:        1;
        unsigned int        h_type:                8;
        unsigned int        h_line_no:        16;

        unsigned long                h_start_jiffies;
        unsigned int                h_requested_credits;

        unsigned int                saved_alloc_context;
};


/*
 * Some stats for checkpoint phase.
 *
 * Embedded in struct transaction_s as t_chp_stats, where it is annotated
 * as protected by j_list_lock.
 * NOTE(review): field meanings are inferred from their names only
 * (checkpoint time, forced-to-close / written / dropped counters) -
 * confirm against the checkpoint code.
 */
struct transaction_chp_stats_s {
        unsigned long                cs_chp_time;
        __u32                        cs_forced_to_close;
        __u32                        cs_written;
        __u32                        cs_dropped;
};

/* The transaction_t type is the guts of the journaling mechanism.  It
 * tracks a compound transaction through its various states:
 *
 * RUNNING:        accepting new updates
 * LOCKED:        Updates still running but we don't accept new ones
 * RUNDOWN:        Updates are tidying up but have finished requesting
 *                new buffers to modify (state not used for now)
 * FLUSH:       All updates complete, but we are still writing to disk
 * COMMIT:      All data on disk, writing commit record
 * FINISHED:        We still have to keep the transaction for checkpointing.
 *
 * The transaction keeps track of all of the buffers modified by a
 * running transaction, and all of the buffers committed but not yet
 * flushed to home for finished transactions.
 * (Locking Documentation improved by LockDoc)
 */

/*
 * Lock ranking:
 *
 *    j_list_lock
 *      ->jbd_lock_bh_journal_head()        (This is "innermost")
 *
 *    j_state_lock
 *    ->b_state_lock
 *
 *    b_state_lock
 *    ->j_list_lock
 *
 *    j_state_lock
 *    ->j_list_lock                        (journal_unmap_buffer)
 *
 */

/*
 * struct transaction_s - state of one compound transaction.
 *
 * Each field comment ends with a locking annotation in square brackets
 * naming the lock that guards the field; "[no locking]" and "[none]" are
 * the annotations used where no lock is named.
 * NOTE(review): presumably this is the structure behind transaction_t; the
 * typedef is not visible in this part of the file - confirm.
 */
struct transaction_s
{
        /* Pointer to the journal for this transaction. [no locking] */
        journal_t                *t_journal;

        /* Sequence number for this transaction [no locking] */
        tid_t                        t_tid;

        /*
         * Transaction's current state
         * [no locking - only kjournald2 alters this]
         * [j_list_lock] guards transition of a transaction into T_FINISHED
         * state and subsequent call of __jbd2_journal_drop_transaction()
         * FIXME: needs barriers
         * KLUDGE: [use j_state_lock]
         */
        enum {
                T_RUNNING,
                T_LOCKED,
                T_SWITCH,
                T_FLUSH,
                T_COMMIT,
                T_COMMIT_DFLUSH,
                T_COMMIT_JFLUSH,
                T_COMMIT_CALLBACK,
                T_FINISHED
        }                        t_state;

        /*
         * Where in the log does this transaction's commit start? [no locking]
         */
        unsigned long                t_log_start;

        /*
         * Number of buffers on the t_buffers list [j_list_lock, no locks
         * needed for jbd2 thread]
         */
        int                        t_nr_buffers;

        /*
         * Doubly-linked circular list of all buffers reserved but not yet
         * modified by this transaction [j_list_lock, no locks needed for
         * jbd2 thread]
         */
        struct journal_head        *t_reserved_list;

        /*
         * Doubly-linked circular list of all metadata buffers owned by this
         * transaction [j_list_lock, no locks needed for jbd2 thread]
         */
        struct journal_head        *t_buffers;

        /*
         * Doubly-linked circular list of all forget buffers (superseded
         * buffers which we can un-checkpoint once this transaction commits)
         * [j_list_lock]
         */
        struct journal_head        *t_forget;

        /*
         * Doubly-linked circular list of all buffers still to be flushed before
         * this transaction can be checkpointed. [j_list_lock]
         */
        struct journal_head        *t_checkpoint_list;

        /*
         * Doubly-linked circular list of metadata buffers being
         * shadowed by log IO.  The IO buffers on the iobuf list and
         * the shadow buffers on this list match each other one for
         * one at all times. [j_list_lock, no locks needed for jbd2
         * thread]
         */
        struct journal_head        *t_shadow_list;

        /*
         * List of inodes associated with the transaction; e.g., ext4 uses
         * this to track inodes in data=ordered and data=journal mode that
         * need special handling on transaction commit; also used by ocfs2.
         * [j_list_lock]
         */
        struct list_head        t_inode_list;

        /*
         * Longest time some handle had to wait for running transaction
         */
        unsigned long                t_max_wait;

        /*
         * When transaction started
         */
        unsigned long                t_start;

        /*
         * When commit was requested [j_state_lock]
         */
        unsigned long                t_requested;

        /*
         * Checkpointing stats [j_list_lock]
         */
        struct transaction_chp_stats_s t_chp_stats;

        /*
         * Number of outstanding updates running on this transaction
         * [none]
         */
        atomic_t                t_updates;

        /*
         * Number of blocks reserved for this transaction in the journal.
         * This is including all credits reserved when starting transaction
         * handles as well as all journal descriptor blocks needed for this
         * transaction. [none]
         */
        atomic_t                t_outstanding_credits;

        /*
         * Number of revoke records for this transaction added by already
         * stopped handles. [none]
         */
        atomic_t                t_outstanding_revokes;

        /*
         * How many handles used this transaction? [none]
         */
        atomic_t                t_handle_count;

        /*
         * Forward and backward links for the circular list of all transactions
         * awaiting checkpoint. [j_list_lock]
         */
        transaction_t                *t_cpnext, *t_cpprev;

        /*
         * When will the transaction expire (become due for commit), in jiffies?
         * [no locking]
         */
        unsigned long                t_expires;

        /*
         * When this transaction started, in nanoseconds [no locking]
         */
        ktime_t                        t_start_time;

        /*
         * This transaction is being forced and some process is
         * waiting for it to finish.
         */
        unsigned int t_synchronous_commit:1;

        /* Disk flush needs to be sent to fs partition [no locking] */
        int                        t_need_data_flush;

        /*
         * For use by the filesystem to store fs-specific data
         * structures associated with the transaction
         */
        struct list_head        t_private_list;
};

/*
 * Per-transaction run statistics; embedded in struct transaction_stats_s
 * as its "run" member.
 * NOTE(review): the unsigned long fields are presumably durations in
 * jiffies for the phase named by each field, and the __u32 fields
 * counters - inferred from names only, confirm against the commit code.
 */
struct transaction_run_stats_s {
        unsigned long                rs_wait;
        unsigned long                rs_request_delay;
        unsigned long                rs_running;
        unsigned long                rs_locked;
        unsigned long                rs_flushing;
        unsigned long                rs_logging;

        __u32                        rs_handle_count;
        __u32                        rs_blocks;
        __u32                        rs_blocks_logged;
};

/*
 * Statistics record: two unsigned long values (ts_tid, ts_requested)
 * followed by the embedded run statistics.
 * NOTE(review): presumably ts_tid is the transaction's sequence number and
 * ts_requested a commit-request counter or timestamp - confirm with users.
 */
struct transaction_stats_s {
        unsigned long                ts_tid;
        unsigned long                ts_requested;
        struct transaction_run_stats_s run;
};

/*
 * jbd2_time_diff - ticks elapsed between two readings of a free-running
 * unsigned long counter (callers are expected to pass jiffies values).
 * @start: earlier reading.
 * @end: later reading.
 *
 * Returns @end - @start.  Unsigned subtraction is performed modulo
 * ULONG_MAX + 1, which is the same modulus at which an unsigned long
 * counter wraps, so the result is also correct when the counter wrapped
 * between the two readings (e.g. start == ULONG_MAX, end == 0 yields 1).
 * The previous wrap branch added MAX_JIFFY_OFFSET instead, which is not
 * the wrap modulus and produced a hugely inflated difference.
 */
static inline unsigned long
jbd2_time_diff(unsigned long start, unsigned long end)
{
        return end - start;
}

#define JBD2_NR_BATCH        64

enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};

#define JBD2_FC_REPLAY_STOP        0
#define JBD2_FC_REPLAY_CONTINUE        1

/**
 * struct journal_s - The journal_s type is the concrete type associated with
 *     journal_t.
 *
 * The lock that protects a member, where one is documented, is named in
 * square brackets at the end of that member's description.
 */
struct journal_s
{
        /**
         * @j_flags: General journaling state flags [j_state_lock,
         * no lock for quick racy checks]
         */
        unsigned long                j_flags;

        /**
         * @j_errno:
         *
         * Is there an outstanding uncleared error on the journal (from a prior
         * abort)? [j_state_lock]
         */
        int                        j_errno;

        /**
         * @j_abort_mutex: Lock the whole aborting procedure.
         */
        struct mutex                j_abort_mutex;

        /**
         * @j_sb_buffer: The first part of the superblock buffer.
         */
        struct buffer_head        *j_sb_buffer;

        /**
         * @j_superblock: The second part of the superblock buffer.
         */
        journal_superblock_t        *j_superblock;

        /**
         * @j_state_lock: Protect the various scalars in the journal.
         */
        rwlock_t                j_state_lock;

        /**
         * @j_barrier_count:
         *
         * Number of processes waiting to create a barrier lock [j_state_lock,
         * no lock for quick racy checks]
         */
        int                        j_barrier_count;

        /**
         * @j_barrier: The barrier lock itself.
         */
        struct mutex                j_barrier;

        /**
         * @j_running_transaction:
         *
         * Transactions: The current running transaction...
         * [j_state_lock, no lock for quick racy checks] [caller holding
         * open handle]
         */
        transaction_t                *j_running_transaction;

        /**
         * @j_committing_transaction:
         *
         * the transaction we are pushing to disk
         * [j_state_lock] [caller holding open handle]
         */
        transaction_t                *j_committing_transaction;

        /**
         * @j_checkpoint_transactions:
         *
         * ... and a linked circular list of all transactions waiting for
         * checkpointing. [j_list_lock]
         */
        transaction_t                *j_checkpoint_transactions;

        /**
         * @j_wait_transaction_locked:
         *
         * Wait queue for waiting for a locked transaction to start committing,
         * or for a barrier lock to be released.
         */
        wait_queue_head_t        j_wait_transaction_locked;

        /**
         * @j_wait_done_commit: Wait queue for waiting for commit to complete.
         */
        wait_queue_head_t        j_wait_done_commit;

        /**
         * @j_wait_commit: Wait queue to trigger commit.
         */
        wait_queue_head_t        j_wait_commit;

        /**
         * @j_wait_updates: Wait queue to wait for updates to complete.
         */
        wait_queue_head_t        j_wait_updates;

        /**
         * @j_wait_reserved:
         *
         * Wait queue to wait for reserved buffer credits to drop.
         */
        wait_queue_head_t        j_wait_reserved;

        /**
         * @j_fc_wait:
         *
         * Wait queue to wait for completion of async fast commits.
         */
        wait_queue_head_t        j_fc_wait;

        /**
         * @j_checkpoint_mutex:
         *
         * Mutex for locking against concurrent checkpoints.
         */
        struct mutex                j_checkpoint_mutex;

        /**
         * @j_chkpt_bhs:
         *
         * Array of JBD2_NR_BATCH buffer heads used by the checkpoint
         * routine.  This was moved from jbd2_log_do_checkpoint() to reduce
         * stack usage.  Access to this array is controlled by the
         * @j_checkpoint_mutex.  [j_checkpoint_mutex]
         */
        struct buffer_head        *j_chkpt_bhs[JBD2_NR_BATCH];

        /**
         * @j_shrinker:
         *
         * Journal head shrinker, reclaim buffer's journal head which
         * has been written back.
         */
        struct shrinker                *j_shrinker;

        /**
         * @j_checkpoint_jh_count:
         *
         * Number of journal buffers on the checkpoint list. [j_list_lock]
         */
        struct percpu_counter        j_checkpoint_jh_count;

        /**
         * @j_shrink_transaction:
         *
         * Records the next transaction on the checkpoint list that will be
         * shrunk. [j_list_lock]
         */
        transaction_t                *j_shrink_transaction;

        /**
         * @j_head:
         *
         * Journal head: identifies the first unused block in the journal.
         * [j_state_lock]
         */
        unsigned long                j_head;

        /**
         * @j_tail:
         *
         * Journal tail: identifies the oldest still-used block in the journal.
         * [j_state_lock]
         */
        unsigned long                j_tail;

        /**
         * @j_free:
         *
         * Journal free: how many free blocks are there in the journal?
         * [j_state_lock]
         */
        unsigned long                j_free;

        /**
         * @j_first:
         *
         * The block number of the first usable block in the journal
         * [j_state_lock].
         */
        unsigned long                j_first;

        /**
         * @j_last:
         *
         * The block number one beyond the last usable block in the journal
         * [j_state_lock].
         */
        unsigned long                j_last;

        /**
         * @j_fc_first:
         *
         * The block number of the first fast commit block in the journal
         * [j_state_lock].
         */
        unsigned long                j_fc_first;

        /**
         * @j_fc_off:
         *
         * Number of fast commit blocks currently allocated. Accessed only
         * during fast commit. Currently only one process can do fast commit,
         * so this field is not protected by any lock.
         */
        unsigned long                j_fc_off;

        /**
         * @j_fc_last:
         *
         * The block number one beyond the last fast commit block in the journal
         * [j_state_lock].
         */
        unsigned long                j_fc_last;

        /**
         * @j_dev: Device where we store the journal.
         */
        struct block_device        *j_dev;

        /**
         * @j_blocksize: Block size for the location where we store the journal.
         */
        int                        j_blocksize;

        /**
         * @j_blk_offset:
         *
         * Starting block offset into the device where we store the journal.
         */
        unsigned long long        j_blk_offset;

        /**
         * @j_devname: Journal device name.
         */
        char                        j_devname[BDEVNAME_SIZE+24];

        /**
         * @j_fs_dev:
         *
         * Device which holds the client fs.  For internal journal this will be
         * equal to j_dev.
         */
        struct block_device        *j_fs_dev;

        /**
         * @j_fs_dev_wb_err:
         *
         * Records the errseq of the client fs's backing block device.
         */
        errseq_t                j_fs_dev_wb_err;

        /**
         * @j_total_len: Total maximum capacity of the journal region on disk.
         */
        unsigned int                j_total_len;

        /**
         * @j_reserved_credits:
         *
         * Number of buffers reserved from the running transaction.
         */
        atomic_t                j_reserved_credits;

        /**
         * @j_list_lock: Protects the buffer lists and internal buffer state.
         */
        spinlock_t                j_list_lock;

        /**
         * @j_inode:
         *
         * Optional inode where we store the journal.  If present, all
         * journal block numbers are mapped into this inode via bmap().
         */
        struct inode                *j_inode;

        /**
         * @j_tail_sequence:
         *
         * Sequence number of the oldest transaction in the log [j_state_lock]
         */
        tid_t                        j_tail_sequence;

        /**
         * @j_transaction_sequence:
         *
         * Sequence number of the next transaction to grant [j_state_lock]
         */
        tid_t                        j_transaction_sequence;

        /**
         * @j_commit_sequence:
         *
         * Sequence number of the most recently committed transaction
         * [j_state_lock, no lock for quick racy checks]
         */
        tid_t                        j_commit_sequence;

        /**
         * @j_commit_request:
         *
         * Sequence number of the most recent transaction wanting commit
         * [j_state_lock, no lock for quick racy checks]
         */
        tid_t                        j_commit_request;

        /**
         * @j_uuid:
         *
         * Journal uuid: identifies the object (filesystem, LVM volume etc)
         * backed by this journal.  This will eventually be replaced by an array
         * of uuids, allowing us to index multiple devices within a single
         * journal and to perform atomic updates across them.
         */
        __u8                        j_uuid[16];

        /**
         * @j_task: Pointer to the current commit thread for this journal.
         */
        struct task_struct        *j_task;

        /**
         * @j_max_transaction_buffers:
         *
         * Maximum number of metadata buffers to allow in a single compound
         * commit transaction.
         */
        int                        j_max_transaction_buffers;

        /**
         * @j_revoke_records_per_block:
         *
         * Number of revoke records that fit in one descriptor block.
         */
        int                        j_revoke_records_per_block;

        /**
         * @j_commit_interval:
         *
         * What is the maximum transaction lifetime before we begin a commit?
         */
        unsigned long                j_commit_interval;

        /**
         * @j_commit_timer: The timer used to wakeup the commit thread.
         */
        struct timer_list        j_commit_timer;

        /**
         * @j_revoke_lock: Protect the revoke table.
         */
        spinlock_t                j_revoke_lock;

        /**
         * @j_revoke:
         *
         * The revoke table - maintains the list of revoked blocks in the
         * current transaction.
         */
        struct jbd2_revoke_table_s *j_revoke;

        /**
         * @j_revoke_table: Alternate revoke tables for j_revoke.
         */
        struct jbd2_revoke_table_s *j_revoke_table[2];

        /**
         * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction.
         */
        struct buffer_head        **j_wbuf;

        /**
         * @j_fc_wbuf: Array of fast commit bhs for fast commit. Accessed only
         * during a fast commit. Currently only one process can do fast
         * commit, so this field is not protected by any lock.
         */
        struct buffer_head        **j_fc_wbuf;

        /**
         * @j_wbufsize:
         *
         * Size of @j_wbuf array.
         */
        int                        j_wbufsize;

        /**
         * @j_fc_wbufsize:
         *
         * Size of @j_fc_wbuf array.
         */
        int                        j_fc_wbufsize;

        /**
         * @j_last_sync_writer:
         *
         * The pid of the last person to run a synchronous operation
         * through the journal.
         */
        pid_t                        j_last_sync_writer;

        /**
         * @j_average_commit_time:
         *
         * The average amount of time in nanoseconds it takes to commit a
         * transaction to disk. [j_state_lock]
         */
        u64                        j_average_commit_time;

        /**
         * @j_min_batch_time:
         *
         * Minimum time that we should wait for additional filesystem operations
         * to get batched into a synchronous handle in microseconds.
         */
        u32                        j_min_batch_time;

        /**
         * @j_max_batch_time:
         *
         * Maximum time that we should wait for additional filesystem operations
         * to get batched into a synchronous handle in microseconds.
         */
        u32                        j_max_batch_time;

        /**
         * @j_commit_callback:
         *
         * This function is called when a transaction is closed.
         */
        void                        (*j_commit_callback)(journal_t *,
                                                     transaction_t *);

        /**
         * @j_submit_inode_data_buffers:
         *
         * This function is called for all inodes associated with the
         * committing transaction marked with JI_WRITE_DATA flag
         * before we start to write out the transaction to the journal.
         */
        int                        (*j_submit_inode_data_buffers)
                                        (struct jbd2_inode *);

        /**
         * @j_finish_inode_data_buffers:
         *
         * This function is called for all inodes associated with the
         * committing transaction marked with JI_WAIT_DATA flag
         * after we have written the transaction to the journal
         * but before we write out the commit block.
         */
        int                        (*j_finish_inode_data_buffers)
                                        (struct jbd2_inode *);

        /*
         * Journal statistics
         */

        /**
         * @j_history_lock: Protect the transactions statistics history.
         */
        spinlock_t                j_history_lock;

        /**
         * @j_proc_entry: procfs entry for the jbd statistics directory.
         */
        struct proc_dir_entry        *j_proc_entry;

        /**
         * @j_stats: Overall statistics.
         */
        struct transaction_stats_s j_stats;

        /**
         * @j_failed_commit: Failed journal commit ID.
         */
        unsigned int                j_failed_commit;

        /**
         * @j_private:
         *
         * An opaque pointer to fs-private information.  ext3 puts its
         * superblock pointer here.
         */
        void *j_private;

        /**
         * @j_chksum_driver:
         *
         * Reference to checksum algorithm driver via cryptoapi.
         */
        struct crypto_shash *j_chksum_driver;

        /**
         * @j_csum_seed:
         *
         * Precomputed journal UUID checksum for seeding other checksums.
         */
        __u32 j_csum_seed;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /**
         * @j_trans_commit_map:
         *
         * Lockdep entity to track transaction commit dependencies. Handles
         * hold this "lock" for read, when we wait for commit, we acquire the
         * "lock" for writing. This matches the properties of jbd2 journalling
         * where the running transaction has to wait for all handles to be
         * dropped to commit that transaction and also acquiring a handle may
         * require transaction commit to finish.
         *
         * Only present when CONFIG_DEBUG_LOCK_ALLOC is enabled.
         */
        struct lockdep_map        j_trans_commit_map;
#endif

        /**
         * @j_fc_cleanup_callback:
         *
         * Clean-up after fast commit or full commit. JBD2 calls this function
         * after every commit operation.
         */
        void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid);

        /**
         * @j_fc_replay_callback:
         *
         * File-system specific function that performs replay of a fast
         * commit. JBD2 calls this function for each fast commit block found in
         * the journal. This function should return JBD2_FC_REPLAY_CONTINUE
         * to indicate that the block was processed correctly and more fast
         * commit replay should continue. Return value of JBD2_FC_REPLAY_STOP
         * indicates the end of replay (no more blocks remaining). A negative
         * return value indicates error.
         */
        int (*j_fc_replay_callback)(struct journal_s *journal,
                                    struct buffer_head *bh,
                                    enum passtype pass, int off,
                                    tid_t expected_commit_id);

        /**
         * @j_bmap:
         *
         * Bmap function that should be used instead of the generic
         * VFS bmap function.
         */
        int (*j_bmap)(struct journal_s *journal, sector_t *block);
};

/*
 * jbd2_might_wait_for_commit - lockdep annotation for "may wait for commit"
 * @j: pointer to the journal (journal_t *)
 *
 * Acquires and immediately releases the journal's j_trans_commit_map lockdep
 * map via rwsem_acquire()/rwsem_release(), i.e. the "acquire for writing"
 * side described at the j_trans_commit_map member.  No real lock is taken.
 *
 * @j is parenthesized so that any pointer-valued expression can be passed;
 * note that it is still evaluated twice, so it must be free of side effects.
 *
 * NOTE(review): j_trans_commit_map only exists under
 * CONFIG_DEBUG_LOCK_ALLOC; this presumably relies on rwsem_acquire() and
 * rwsem_release() discarding their arguments otherwise -- confirm.
 */
#define jbd2_might_wait_for_commit(j) \
        do { \
                rwsem_acquire(&(j)->j_trans_commit_map, 0, 0, _THIS_IP_); \
                rwsem_release(&(j)->j_trans_commit_map, _THIS_IP_); \
        } while (0)

/*
 * Extended superblock feature flags are only meaningful when the journal
 * superblock is not a version 1 superblock.  Returns true when the
 * superblock's block type differs from JBD2_SUPERBLOCK_V1, i.e. when any
 * known requested feature can be supported, and false otherwise.
 */
static inline bool jbd2_format_support_feature(journal_t *j)
{
        const bool is_v1_sb = j->j_superblock->s_header.h_blocktype ==
                              cpu_to_be32(JBD2_SUPERBLOCK_V1);

        return !is_v1_sb;
}

/*
 * Journal feature predicate functions.
 *
 * JBD2_FEATURE_COMPAT_FUNCS(name, flagname) emits three inline helpers that
 * operate on bit JBD2_FEATURE_COMPAT_<flagname> of the superblock's
 * s_feature_compat word (kept in big-endian form):
 *
 *   jbd2_has_feature_<name>()   - true if the superblock format supports
 *                                 features and the bit is set
 *   jbd2_set_feature_<name>()   - set the bit
 *   jbd2_clear_feature_<name>() - clear the bit
 */
#define JBD2_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline bool jbd2_has_feature_##name(journal_t *j) \
{ \
        if (!jbd2_format_support_feature(j)) \
                return false; \
        return (j->j_superblock->s_feature_compat & \
                cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname)) != 0; \
} \
static inline void jbd2_set_feature_##name(journal_t *j) \
{ \
        journal_superblock_t *jsb = j->j_superblock; \
        jsb->s_feature_compat |= \
                cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \
} \
static inline void jbd2_clear_feature_##name(journal_t *j) \
{ \
        journal_superblock_t *jsb = j->j_superblock; \
        jsb->s_feature_compat &= \
                ~cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname); \
}

/*
 * JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) emits
 * jbd2_has_feature_<name>(), jbd2_set_feature_<name>() and
 * jbd2_clear_feature_<name>() for bit JBD2_FEATURE_RO_COMPAT_<flagname> of
 * the superblock's big-endian s_feature_ro_compat word.  The "has" helper
 * additionally requires that the superblock format supports features.
 */
#define JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
static inline bool jbd2_has_feature_##name(journal_t *j) \
{ \
        if (!jbd2_format_support_feature(j)) \
                return false; \
        return (j->j_superblock->s_feature_ro_compat & \
                cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname)) != 0; \
} \
static inline void jbd2_set_feature_##name(journal_t *j) \
{ \
        journal_superblock_t *jsb = j->j_superblock; \
        jsb->s_feature_ro_compat |= \
                cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \
} \
static inline void jbd2_clear_feature_##name(journal_t *j) \
{ \
        journal_superblock_t *jsb = j->j_superblock; \
        jsb->s_feature_ro_compat &= \
                ~cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname); \
}

/*
 * JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) emits
 * jbd2_has_feature_<name>(), jbd2_set_feature_<name>() and
 * jbd2_clear_feature_<name>() for bit JBD2_FEATURE_INCOMPAT_<flagname> of
 * the superblock's big-endian s_feature_incompat word.  The "has" helper
 * additionally requires that the superblock format supports features.
 */
#define JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) \
static inline bool jbd2_has_feature_##name(journal_t *j) \
{ \
        if (!jbd2_format_support_feature(j)) \
                return false; \
        return (j->j_superblock->s_feature_incompat & \
                cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname)) != 0; \
} \
static inline void jbd2_set_feature_##name(journal_t *j) \
{ \
        journal_superblock_t *jsb = j->j_superblock; \
        jsb->s_feature_incompat |= \
                cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \
} \
static inline void jbd2_clear_feature_##name(journal_t *j) \
{ \
        journal_superblock_t *jsb = j->j_superblock; \
        jsb->s_feature_incompat &= \
                ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname); \
}

/*
 * Instantiate the jbd2_has_feature_*(), jbd2_set_feature_*() and
 * jbd2_clear_feature_*() helpers: one compat feature (checksum) and six
 * incompat features.  The first argument becomes the function-name suffix,
 * the second selects the JBD2_FEATURE_{COMPAT,INCOMPAT}_* flag constant.
 */
JBD2_FEATURE_COMPAT_FUNCS(checksum,                CHECKSUM)

JBD2_FEATURE_INCOMPAT_FUNCS(revoke,                REVOKE)
JBD2_FEATURE_INCOMPAT_FUNCS(64bit,                64BIT)
JBD2_FEATURE_INCOMPAT_FUNCS(async_commit,        ASYNC_COMMIT)
JBD2_FEATURE_INCOMPAT_FUNCS(csum2,                CSUM_V2)
JBD2_FEATURE_INCOMPAT_FUNCS(csum3,                CSUM_V3)
JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,        FAST_COMMIT)

/* Journal high priority write IO operation flags */
#define JBD2_JOURNAL_REQ_FLAGS                (REQ_META | REQ_SYNC | REQ_IDLE)

/*
 * Journal flag definitions
 *
 * These are bits of journal_s.j_flags (see is_journal_aborted(), which
 * tests JBD2_ABORT against that field).
 */
#define JBD2_UNMOUNT        0x001        /* Journal thread is being destroyed */
#define JBD2_ABORT        0x002        /* Journaling has been aborted for errors. */
#define JBD2_ACK_ERR        0x004        /* The errno in the sb has been acked */
#define JBD2_FLUSHED        0x008        /* The journal superblock has been flushed */
#define JBD2_LOADED        0x010        /* The journal superblock has been loaded */
#define JBD2_BARRIER        0x020        /* Use IDE barriers */
#define JBD2_ABORT_ON_SYNCDATA_ERR        0x040        /* Abort the journal on file
                                                 * data write error in ordered
                                                 * mode */
#define JBD2_CYCLE_RECORD                0x080        /* Journal cycled record log on
                                                 * clean and empty filesystem
                                                 * logging area */
#define JBD2_FAST_COMMIT_ONGOING        0x100        /* Fast commit is ongoing */
#define JBD2_FULL_COMMIT_ONGOING        0x200        /* Full commit is ongoing */
/*
 * Journal flush flags.  These form a separate flag space: their values
 * overlap the j_flags bits above, so they must not be mixed with them.
 * NOTE(review): presumably the @flags argument of jbd2_journal_flush() --
 * confirm against its implementation.
 */
#define JBD2_JOURNAL_FLUSH_DISCARD        0x0001
#define JBD2_JOURNAL_FLUSH_ZEROOUT        0x0002
#define JBD2_JOURNAL_FLUSH_VALID        (JBD2_JOURNAL_FLUSH_DISCARD | \
                                        JBD2_JOURNAL_FLUSH_ZEROOUT)

/*
 * Function declarations for the journaling transaction and buffer
 * management
 */

/* Filing buffers */
extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *);
extern bool __jbd2_journal_refile_buffer(struct journal_head *);
extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *);
extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
extern void jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
/*
 * Append @bh to the tail of the list @head, linking it through the buffer
 * head's b_assoc_buffers list node.
 */
static inline void jbd2_file_log_bh(struct list_head *head, struct buffer_head *bh)
{
        struct list_head *link = &bh->b_assoc_buffers;

        list_add_tail(link, head);
}
/*
 * Unlink @bh from whatever list its b_assoc_buffers node is on and
 * reinitialise that node (list_del_init()).
 */
static inline void jbd2_unfile_log_bh(struct buffer_head *bh)
{
        struct list_head *link = &bh->b_assoc_buffers;

        list_del_init(link);
}

/* Log buffer allocation */
struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int);
void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *);
int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
                              unsigned long *block);
int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);

/* Commit management */
extern void jbd2_journal_commit_transaction(journal_t *);

/* Checkpoint list management */
/*
 * Mode argument of __jbd2_journal_clean_checkpoint_list().
 * NOTE(review): going by the names, JBD2_SHRINK_DESTROY is the teardown
 * mode, while JBD2_SHRINK_BUSY_STOP / JBD2_SHRINK_BUSY_SKIP select whether
 * a busy buffer ends the scan or is skipped -- confirm against the
 * implementation.
 */
enum jbd2_shrink_type {JBD2_SHRINK_DESTROY, JBD2_SHRINK_BUSY_STOP, JBD2_SHRINK_BUSY_SKIP};

void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum jbd2_shrink_type type);
unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
int __jbd2_journal_remove_checkpoint(struct journal_head *);
int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
void jbd2_journal_destroy_checkpoint(journal_t *journal);
void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);


/*
 * Triggers
 */

/*
 * Set of optional callbacks attached to a journalled buffer.  Both hooks
 * receive the trigger type itself as their first argument.
 */
struct jbd2_buffer_trigger_type {
        /*
         * Fired at the moment data to write to the journal are known to be
         * stable - so either at the moment b_frozen_data is created or just
         * before a buffer is written to the journal.  mapped_data is a mapped
         * buffer that is the frozen data for commit.
         */
        void (*t_frozen)(struct jbd2_buffer_trigger_type *type,
                         struct buffer_head *bh, void *mapped_data,
                         size_t size);

        /*
         * Fired during journal abort for dirty buffers that will not be
         * committed.
         */
        void (*t_abort)(struct jbd2_buffer_trigger_type *type,
                        struct buffer_head *bh);
};

extern void jbd2_buffer_frozen_trigger(struct journal_head *jh,
                                       void *mapped_data,
                                       struct jbd2_buffer_trigger_type *triggers);
extern void jbd2_buffer_abort_trigger(struct journal_head *jh,
                                      struct jbd2_buffer_trigger_type *triggers);

/* Buffer IO */
extern int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
                                              struct journal_head *jh_in,
                                              struct buffer_head **bh_out,
                                              sector_t blocknr);

/* Transaction cache support */
extern void jbd2_journal_destroy_transaction_cache(void);
extern int __init jbd2_journal_init_transaction_cache(void);
extern void jbd2_journal_free_transaction(transaction_t *);

/*
 * Journal locking.
 *
 * We need to lock the journal during transaction state changes so that nobody
 * ever tries to take a handle on the running transaction while we are in the
 * middle of moving it to the commit phase.  j_state_lock does this.
 *
 * Note that the locking is completely interrupt unsafe.  We never touch
 * journal structures from interrupts.
 */

/*
 * Return the value stored in the current task's journal_info field,
 * interpreted as a handle_t pointer.
 */
static inline handle_t *journal_current_handle(void)
{
        handle_t *cur_handle = current->journal_info;

        return cur_handle;
}

/* The journaling code user interface:
 *
 * Create and destroy handles
 * Register buffer modifications against the current transaction.
 */

extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks,
                                     int revoke_records, gfp_t gfp_mask,
                                     unsigned int type, unsigned int line_no);
extern int         jbd2_journal_restart(handle_t *, int nblocks);
extern int         jbd2__journal_restart(handle_t *, int nblocks,
                                       int revoke_records, gfp_t gfp_mask);
extern int         jbd2_journal_start_reserved(handle_t *handle,
                                unsigned int type, unsigned int line_no);
extern void         jbd2_journal_free_reserved(handle_t *handle);
extern int         jbd2_journal_extend(handle_t *handle, int nblocks,
                                     int revoke_records);
extern int         jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
extern int         jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
extern int         jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
void                 jbd2_journal_set_triggers(struct buffer_head *,
                                           struct jbd2_buffer_trigger_type *type);
extern int         jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
extern int         jbd2_journal_forget (handle_t *, struct buffer_head *);
int jbd2_journal_invalidate_folio(journal_t *, struct folio *,
                                        size_t offset, size_t length);
bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio);
extern int         jbd2_journal_stop(handle_t *);
extern int         jbd2_journal_flush(journal_t *journal, unsigned int flags);
extern void         jbd2_journal_lock_updates (journal_t *);
extern void         jbd2_journal_unlock_updates (journal_t *);

void jbd2_journal_wait_updates(journal_t *);

extern journal_t * jbd2_journal_init_dev(struct block_device *bdev,
                                struct block_device *fs_dev,
                                unsigned long long start, int len, int bsize);
extern journal_t * jbd2_journal_init_inode (struct inode *);
extern int           jbd2_journal_update_format (journal_t *);
extern int           jbd2_journal_check_used_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
extern int           jbd2_journal_check_available_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
extern int           jbd2_journal_set_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
extern void           jbd2_journal_clear_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
extern int           jbd2_journal_load       (journal_t *journal);
extern int           jbd2_journal_destroy    (journal_t *);
extern int           jbd2_journal_recover    (journal_t *journal);
extern int           jbd2_journal_wipe       (journal_t *, int);
extern int           jbd2_journal_skip_recovery        (journal_t *);
extern void           jbd2_journal_update_sb_errno(journal_t *);
extern int           jbd2_journal_update_sb_log_tail        (journal_t *, tid_t,
                                unsigned long, blk_opf_t);
extern void           jbd2_journal_abort      (journal_t *, int);
extern int           jbd2_journal_errno      (journal_t *);
extern void           jbd2_journal_ack_err    (journal_t *);
extern int           jbd2_journal_clear_err  (journal_t *);
extern int           jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
extern int           jbd2_journal_force_commit(journal_t *);
extern int           jbd2_journal_force_commit_nested(journal_t *);
extern int           jbd2_journal_inode_ranged_write(handle_t *handle,
                        struct jbd2_inode *inode, loff_t start_byte,
                        loff_t length);
extern int           jbd2_journal_inode_ranged_wait(handle_t *handle,
                        struct jbd2_inode *inode, loff_t start_byte,
                        loff_t length);
extern int           jbd2_journal_finish_inode_data_buffers(
                        struct jbd2_inode *jinode);
extern int           jbd2_journal_begin_ordered_truncate(journal_t *journal,
                                struct jbd2_inode *inode, loff_t new_size);
extern void           jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
extern void           jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);

/*
 * journal_head management
 */
struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh);
struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh);
void jbd2_journal_put_journal_head(struct journal_head *jh);

/*
 * handle management
 */
/* Slab cache from which handle_t objects are allocated and freed. */
extern struct kmem_cache *jbd2_handle_cache;

/*
 * Allocate a handle_t from jbd2_handle_cache with kmem_cache_zalloc(),
 * using the given GFP flags.  Evaluates to the new handle_t pointer (the
 * allocator's return value cast to handle_t *); release it with
 * jbd2_free_handle().
 */
#define jbd2_alloc_handle(_gfp_flags)        \
                ((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, _gfp_flags))

/* Give @handle back to the jbd2_handle_cache slab cache. */
static inline void jbd2_free_handle(handle_t *handle)
{
        struct kmem_cache *cache = jbd2_handle_cache;

        kmem_cache_free(cache, handle);
}

/*
 * jbd2_inode management (optional, for those file systems that want to use
 * dynamically allocated jbd2_inode structures)
 */
/* Slab cache from which struct jbd2_inode objects are allocated and freed. */
extern struct kmem_cache *jbd2_inode_cache;

/*
 * Allocate a struct jbd2_inode from jbd2_inode_cache with
 * kmem_cache_alloc(), using the given GFP flags.  Unlike
 * jbd2_alloc_handle() this does not use the zeroing allocator.
 * NOTE(review): callers presumably initialise the object with
 * jbd2_journal_init_jbd_inode() -- confirm.  Release with jbd2_free_inode().
 */
#define jbd2_alloc_inode(_gfp_flags)        \
                ((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, _gfp_flags))

/* Give @jinode back to the jbd2_inode_cache slab cache. */
static inline void jbd2_free_inode(struct jbd2_inode *jinode)
{
        struct kmem_cache *cache = jbd2_inode_cache;

        kmem_cache_free(cache, jinode);
}

/* Primary revoke support */
#define JOURNAL_REVOKE_DEFAULT_HASH 256
extern int           jbd2_journal_init_revoke(journal_t *, int);
extern void           jbd2_journal_destroy_revoke_record_cache(void);
extern void           jbd2_journal_destroy_revoke_table_cache(void);
extern int __init jbd2_journal_init_revoke_record_cache(void);
extern int __init jbd2_journal_init_revoke_table_cache(void);

extern void           jbd2_journal_destroy_revoke(journal_t *);
extern int           jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *);
extern int           jbd2_journal_cancel_revoke(handle_t *, struct journal_head *);
extern void           jbd2_journal_write_revoke_records(transaction_t *transaction,
                                                     struct list_head *log_bufs);

/* Recovery revoke support */
extern int        jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t);
extern int        jbd2_journal_test_revoke(journal_t *, unsigned long long, tid_t);
extern void        jbd2_journal_clear_revoke(journal_t *);
extern void        jbd2_journal_switch_revoke_table(journal_t *journal);
extern void        jbd2_clear_buffer_revoked_flags(journal_t *journal);

/*
 * The log thread user interface:
 *
 * Request space in the current transaction, and force transaction commit
 * transitions on demand.
 */

int jbd2_log_start_commit(journal_t *journal, tid_t tid);
int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
int jbd2_transaction_committed(journal_t *journal, tid_t tid);
int jbd2_complete_transaction(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal);
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);

void __jbd2_log_wait_for_space(journal_t *journal);
extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
extern int jbd2_cleanup_journal_tail(journal_t *);

/* Fast commit related APIs */
int jbd2_fc_begin_commit(journal_t *journal, tid_t tid);
int jbd2_fc_end_commit(journal_t *journal);
int jbd2_fc_end_commit_fallback(journal_t *journal);
int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out);
int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode);
int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode);
int jbd2_fc_wait_bufs(journal_t *journal, int num_blks);
int jbd2_fc_release_bufs(journal_t *journal);

/*
 * Maximum number of buffers for a transaction as computed here: a quarter
 * of j_total_len after j_fc_wbufsize has been subtracted from it.
 * NOTE(review): the units (presumably journal blocks) and the exact meaning
 * of j_fc_wbufsize (presumably the fast-commit area) are not visible in this
 * part of the file - confirm against the journal_t definition.
 */
static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal)
{
        return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
}

/*
 * is_journal_aborted
 *
 * Simple test wrapper function to test the JBD2_ABORT state flag.  This
 * bit, when set, indicates that we have had a fatal error somewhere,
 * either inside the journaling layer or indicated to us by the client
 * (eg. ext3), and that we should not commit any further
 * transactions.
 *
 * Returns the masked flag value: non-zero (not necessarily 1) when the
 * JBD2_ABORT bit is set in j_flags, 0 otherwise.
 */

static inline int is_journal_aborted(journal_t *journal)
{
        return journal->j_flags & JBD2_ABORT;
}

/*
 * Report whether @handle has to be treated as aborted.
 *
 * Returns 1 when the handle's own h_aborted flag is set or when it has no
 * transaction attached.  Otherwise the answer is whatever
 * is_journal_aborted() says about the journal of the handle's transaction.
 */
static inline int is_handle_aborted(handle_t *handle)
{
        /* The handle's own abort flag is checked first. */
        if (handle->h_aborted)
                return 1;

        /* No transaction means there is no journal to ask. */
        if (!handle->h_transaction)
                return 1;

        return is_journal_aborted(handle->h_transaction->t_journal);
}

/*
 * Flag @handle as aborted by setting h_aborted.  Only the handle is touched
 * here; is_handle_aborted() returns 1 for it while the flag stays set.
 */
static inline void jbd2_journal_abort_handle(handle_t *handle)
{
        handle->h_aborted = 1;
}

/*
 * Save the current wb_err value of the client filesystem's block device
 * mapping in journal->j_fs_dev_wb_err.  The saved value can later be used
 * to detect asynchronous metadata write errors of the client filesystem.
 */
static inline void jbd2_init_fs_dev_write_error(journal_t *journal)
{
        errseq_check_and_advance(&journal->j_fs_dev->bd_mapping->wb_err,
                                 &journal->j_fs_dev_wb_err);
}

/*
 * Check the wb_err of the client filesystem's block device mapping against
 * the value saved in journal->j_fs_dev_wb_err (read with READ_ONCE()) and
 * return the result of errseq_check().
 */
static inline int jbd2_check_fs_dev_write_error(journal_t *journal)
{
        return errseq_check(&journal->j_fs_dev->bd_mapping->wb_err,
                            READ_ONCE(journal->j_fs_dev_wb_err));
}

#endif /* __KERNEL__   */

/* Comparison functions for transaction IDs: perform comparisons using
 * modulo arithmetic so that they work over sequence number wraps. */

/*
 * tid_gt - return 1 if @x comes strictly after @y, 0 otherwise.
 *
 * The difference is converted to int and only its sign is examined, which
 * is what keeps the comparison valid across sequence number wraps.
 */
static inline int tid_gt(tid_t x, tid_t y)
{
        return (int)(x - y) > 0;
}

/*
 * tid_geq - return 1 if @x is equal to or comes after @y, 0 otherwise.
 *
 * The difference is converted to int and only its sign is examined, which
 * is what keeps the comparison valid across sequence number wraps.
 */
static inline int tid_geq(tid_t x, tid_t y)
{
        return (int)(x - y) >= 0;
}

extern int jbd2_journal_blocks_per_page(struct inode *inode);
extern size_t journal_tag_bytes(journal_t *journal);

/*
 * True when jbd2_has_feature_csum2() or jbd2_has_feature_csum3() reports
 * true for @j.  The csum3 test is only made when the csum2 test fails.
 */
static inline bool jbd2_journal_has_csum_v2or3_feature(journal_t *j)
{
        if (jbd2_has_feature_csum2(j))
                return true;

        return jbd2_has_feature_csum3(j);
}

/*
 * Tell whether a checksum driver is attached to @journal.
 *
 * Returns 1 if journal->j_chksum_driver is non-NULL and 0 otherwise; the
 * feature flags do not influence the return value.  They are only used for
 * a one-time warning when a csum v2/v3 feature is present but no driver has
 * been set.
 */
static inline int jbd2_journal_has_csum_v2or3(journal_t *journal)
{
        /* Feature advertised without a driver: warn, but only once. */
        WARN_ON_ONCE(jbd2_journal_has_csum_v2or3_feature(journal) &&
                     journal->j_chksum_driver == NULL);

        return journal->j_chksum_driver != NULL;
}

/*
 * Number of fast commit blocks recorded in the journal superblock @jsb.
 *
 * s_num_fc_blks is stored big-endian.  A stored value of zero selects
 * JBD2_DEFAULT_FAST_COMMIT_BLOCKS instead.
 */
static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb)
{
        int configured = be32_to_cpu(jsb->s_num_fc_blks);

        if (configured)
                return configured;

        return JBD2_DEFAULT_FAST_COMMIT_BLOCKS;
}

/*
 * Return number of free blocks in the log. Must be called under j_state_lock.
 *
 * 32 blocks are held back to allow for rounding errors, and the outstanding
 * credits of the committing transaction (if there is one) are deducted too.
 * The result never goes below zero.
 */
static inline unsigned long jbd2_log_space_left(journal_t *journal)
{
        /* Signed on purpose: the deductions may take this below zero. */
        long avail = journal->j_free - 32;

        if (journal->j_committing_transaction)
                avail -= atomic_read(&journal->
                        j_committing_transaction->t_outstanding_credits);

        if (avail < 0)
                return 0;

        return avail;
}

/*
 * Definitions which augment the buffer_head layer
 */

/* journaling buffer types */
#define BJ_None                0        /* Not journaled */
#define BJ_Metadata        1        /* Normal journaled metadata */
#define BJ_Forget        2        /* Buffer superseded by this transaction */
#define BJ_Shadow        3        /* Buffer contents being shadowed to the log */
#define BJ_Reserved        4        /* Buffer is reserved for access by journal */
#define BJ_Types        5        /* Count of the types above (BJ_None..BJ_Reserved) */

/* JBD uses a CRC32 checksum */
#define JBD_MAX_CHECKSUM_SIZE 4

/*
 * Feed @length bytes starting at @address through the journal's checksum
 * driver, starting from the checksum value @crc, and return the new value.
 *
 * The shash descriptor lives on the stack and is followed by a context area
 * of JBD_MAX_CHECKSUM_SIZE bytes.  @crc is stored directly into that context
 * before crypto_shash_update() runs, and the result is read back from the
 * same place afterwards.
 *
 * BUGs if the driver needs a larger descriptor context than
 * JBD_MAX_CHECKSUM_SIZE or if crypto_shash_update() reports an error.
 */
static inline u32 jbd2_chksum(journal_t *journal, u32 crc,
                              const void *address, unsigned int length)
{
        struct {
                struct shash_desc shash;
                char ctx[JBD_MAX_CHECKSUM_SIZE];
        } desc;
        int err;

        /* The on-stack context must be big enough for this driver. */
        BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) >
                JBD_MAX_CHECKSUM_SIZE);

        desc.shash.tfm = journal->j_chksum_driver;
        /* Seed the context with the caller's starting checksum. */
        *(u32 *)desc.ctx = crc;

        err = crypto_shash_update(&desc.shash, address, length);
        BUG_ON(err);

        /* The updated checksum is left in the context by the driver. */
        return *(u32 *)desc.ctx;
}

/*
 * Return most recent uncommitted transaction.
 *
 * Under j_state_lock (taken for reading), this is the tid of the running
 * transaction if there is one, and j_commit_request otherwise.
 */
static inline tid_t jbd2_get_latest_transaction(journal_t *journal)
{
        tid_t latest;

        read_lock(&journal->j_state_lock);
        if (journal->j_running_transaction)
                latest = journal->j_running_transaction->t_tid;
        else
                latest = journal->j_commit_request;
        read_unlock(&journal->j_state_lock);

        return latest;
}

/*
 * Credits of @handle that remain for buffers: h_total_credits minus the
 * requested revoke credits, the latter divided by the journal's
 * j_revoke_records_per_block and rounded up.
 *
 * For a reserved handle the journal is taken from h_journal; otherwise it
 * is reached through the handle's transaction.
 */
static inline int jbd2_handle_buffer_credits(handle_t *handle)
{
        journal_t *j = handle->h_reserved ?
                        handle->h_journal :
                        handle->h_transaction->t_journal;

        return handle->h_total_credits -
                DIV_ROUND_UP(handle->h_revoke_credits_requested,
                             j->j_revoke_records_per_block);
}

#ifdef __KERNEL__

/*
 * Buffer tracing hooks.  Every one of them expands to an empty statement
 * here, so call sites compile to nothing and their arguments are never
 * evaluated.
 */
#define buffer_trace_init(bh)        do {} while (0)
#define print_buffer_fields(bh)        do {} while (0)
#define print_buffer_trace(bh)        do {} while (0)
#define BUFFER_TRACE(bh, info)        do {} while (0)
#define BUFFER_TRACE2(bh, bh2, info)        do {} while (0)
#define JBUFFER_TRACE(jh, info)        do {} while (0)

#endif        /* __KERNEL__ */

#define EFSBADCRC        EBADMSG                /* Bad CRC detected */
#define EFSCORRUPTED        EUCLEAN                /* Filesystem is corrupted */

#endif        /* _LINUX_JBD2_H */
























































































































































































































































































    3 




















































































































































































    3 







    3 















    3 
    3 












































































    1 
















    1 



    1 



    1 








































































































































































































































































































































































































































































































































































































































































    3 







    3 





























































































































































































































    2 


















    3 












    3 






















    7 

    4 











































































































































































































































































































































































































































































































    7 







    2 


    5 















    6 


    6 








    6 
    6 





    6 

    6 


    6 

















    7 






















































































































































































































































































    4 





    5 

    5 



























    3 


    3 




    3 


















    3 








    3 
    2 
    3 

    3 

    2 
    3 



    3 
    3 




    3 































    2 














    3 














    2 

    3 

    2 













    3 







    3 











    3 
































    2 



    3 

    2 































    7 



    3 

    6 
    7 


    6 






    6 
















    6 

    3 







    3 
    3 
    3 
















   11 



   11 

   10 
   10 


   10 







   10 









    1 
    1 

    1 
    1 






















   10 




    8 
   11 


























    1 

    2 
    1 



    1 




    1 























































    5 














    6 











































    2 








    1 

    1 
    1 

    1 


    1 



















    6 



    6 































    5 
    1 








    7 
    6 
    7 
    6 
    6 



    7 





























    1 
    1 










    1 



    1 



    1 



    1 








    1 
    1 







    4 
    5 



    6 











    6 





    6 


    3 
    5 








    6 
    6 
    6 

    6 





    6 
    5 


    5 





















    6 
    6 



    4 









































    4 
    4 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/page-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 *
 * 10Apr2002        Andrew Morton
 *                Initial version
 */

#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>

#include "internal.h"

/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 */
#define MAX_PAUSE                max(HZ/5, 1)

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many pages
 * by raising the pause time to max_pause when the interval falls below it.
 */
#define DIRTY_POLL_THRESH        (128 >> (PAGE_SHIFT - 10))

/*
 * Estimate write bandwidth at 200ms intervals.
 */
#define BANDWIDTH_INTERVAL        max(HZ/5, 1)

#define RATELIMIT_CALC_SHIFT        10

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;

/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via writeback threads) at this percentage
 */
static int dirty_background_ratio = 10;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory
 */
static unsigned long dirty_background_bytes;

/*
 * free highmem will not be subtracted from the total free memory
 * for calculating free ratios if vm_highmem_is_dirtyable is true
 */
static int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage
 */
static int vm_dirty_ratio = 20;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory
 */
static unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

EXPORT_SYMBOL_GPL(dirty_writeback_interval);

/*
 * The longest time for which data is allowed to remain dirty
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 * a full sync is triggered after this time elapses without any disk activity.
 */
int laptop_mode;

EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */

/*
 * The global writeback domain.  With CONFIG_CGROUP_WRITEBACK, GDTC_INIT()
 * and GDTC_INIT_NO_WB point a dirty throttle control's dom at it; without
 * it, dtc_dom() returns it for every dtc.
 */
struct wb_domain global_wb_domain;

/*
 * consolidated parameters for balance_dirty_pages() and its subroutines
 *
 * Instances are set up with the GDTC_INIT(), GDTC_INIT_NO_WB and MDTC_INIT()
 * designated-initializer macros; members they do not name start out zero.
 */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
        /* &global_wb_domain for a global dtc, mem_cgroup_wb_domain(wb) for a memcg dtc */
        struct wb_domain        *dom;
        struct dirty_throttle_control *gdtc;        /* only set in memcg dtc's */
#endif
        /* the bdi_writeback handed to GDTC_INIT()/MDTC_INIT() */
        struct bdi_writeback        *wb;
        /* &wb->completions for a global dtc, &wb->memcg_completions for a memcg dtc */
        struct fprop_local_percpu *wb_completions;

        unsigned long                avail;                /* dirtyable */
        unsigned long                dirty;                /* file_dirty + write + nfs */
        unsigned long                thresh;                /* dirty threshold */
        unsigned long                bg_thresh;        /* dirty background threshold */

        unsigned long                wb_dirty;        /* per-wb counterparts */
        unsigned long                wb_thresh;
        unsigned long                wb_bg_thresh;

        /* NOTE(review): scaling and meaning are set where this is computed, not visible here */
        unsigned long                pos_ratio;
};

/*
 * Length of period for aging writeout fractions of bdis. This is an
 * arbitrarily chosen number. The longer the period, the slower fractions will
 * reflect changes in current writeout rate.
 */
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Designated-initializer fragments for struct dirty_throttle_control.
 *
 * GDTC_INIT: global dtc for @__wb - global domain, the wb's own completions.
 */
#define GDTC_INIT(__wb)                .wb = (__wb),                                \
                                .dom = &global_wb_domain,                \
                                .wb_completions = &(__wb)->completions

/* GDTC_INIT_NO_WB: global dtc that has a domain but no wb attached. */
#define GDTC_INIT_NO_WB                .dom = &global_wb_domain

/*
 * MDTC_INIT: memcg dtc for @__wb - the domain comes from
 * mem_cgroup_wb_domain(), completions are the wb's memcg_completions, and
 * @__gdtc is recorded as the paired global dtc.
 */
#define MDTC_INIT(__wb, __gdtc)        .wb = (__wb),                                \
                                .dom = mem_cgroup_wb_domain(__wb),        \
                                .wb_completions = &(__wb)->memcg_completions, \
                                .gdtc = __gdtc

/*
 * Report whether @dtc has a writeback domain attached, i.e. whether its dom
 * pointer is non-NULL.
 */
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
        return dtc->dom != NULL;
}

/* Writeback domain stored in @dtc (set via GDTC_INIT*() or MDTC_INIT()). */
static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
        return dtc->dom;
}

/* Global dtc recorded in @mdtc; the gdtc member is only set in memcg dtc's. */
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
        return mdtc->gdtc;
}

/*
 * Address of @wb's memcg_completions, the same member MDTC_INIT() stores in
 * a memcg dtc's wb_completions.
 */
static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
        return &wb->memcg_completions;
}

/*
 * Compute the minimum and maximum ratio that apply to @wb and store them
 * in *@minp and *@maxp.
 *
 * The bdi's min_ratio and max_ratio are the starting point.  When @wb's
 * average write bandwidth is strictly below the bdi's total write
 * bandwidth, each limit is scaled by wb bandwidth / total bandwidth.  A zero
 * minimum and a maximum of at least 100 * BDI_RATIO_SCALE are passed
 * through unscaled.
 */
static void wb_min_max_ratio(struct bdi_writeback *wb,
                             unsigned long *minp, unsigned long *maxp)
{
        unsigned long wb_bw = READ_ONCE(wb->avg_write_bandwidth);
        unsigned long bdi_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
        unsigned long long lo = wb->bdi->min_ratio;
        unsigned long long hi = wb->bdi->max_ratio;
        /*
         * @wb may already be clean by the time control reaches here and
         * the total may not include its bw, so scale only when the wb's
         * bandwidth is smaller than the total.
         */
        const bool scale = wb_bw < bdi_bw;

        if (scale && lo)
                lo = div64_ul(lo * wb_bw, bdi_bw);

        if (scale && hi < 100 * BDI_RATIO_SCALE)
                hi = div64_ul(hi * wb_bw, bdi_bw);

        *minp = lo;
        *maxp = hi;
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/*
 * Without CONFIG_CGROUP_WRITEBACK struct dirty_throttle_control has no dom
 * or gdtc member: GDTC_INIT() only sets wb and wb_completions, while
 * GDTC_INIT_NO_WB and MDTC_INIT() expand to nothing.
 */
#define GDTC_INIT(__wb)                .wb = (__wb),                           \
                                .wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)

/* Without cgroup writeback there is never a valid memcg control. */
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
        return false;
}

/* Without cgroup writeback every control belongs to the global domain. */
static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
        return &global_wb_domain;
}

/* Without cgroup writeback no control has a linked global control. */
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
        return NULL;
}

/* Without cgroup writeback a wb has no memcg completion counter. */
static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
        return NULL;
}

static void wb_min_max_ratio(struct bdi_writeback *wb,
                             unsigned long *minp, unsigned long *maxp)
{
        *minp = wb->bdi->min_ratio;
        *maxp = wb->bdi->max_ratio;
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * In a memory zone, there is a certain amount of pages we consider
 * available for the page cache, which is essentially the number of
 * free and reclaimable pages, minus some zone reserves to protect
 * lowmem and the ability to uphold the zone's watermarks without
 * requiring writeback.
 *
 * This number of dirtyable pages is the base value to which the
 * user-configurable dirty ratio is applied to obtain the effective
 * number of pages that are allowed to be actually dirtied.  This is
 * done per individual zone, or globally by using the sum of dirtyable
 * pages over all zones.
 *
 * Because the user is allowed to specify the dirty limit globally as
 * absolute number of bytes, calculating the per-zone dirty limit can
 * require translating the configured limit into a percentage of
 * global dirtyable memory first.
 */

/**
 * node_dirtyable_memory - number of dirtyable pages in a node
 * @pgdat: the node
 *
 * Return: the node's number of pages potentially available for dirty
 * page cache.  This is the base value for the per-node dirty limits.
 */
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{
        unsigned long nr_pages = 0;
        int z;

        for (z = 0; z < MAX_NR_ZONES; z++) {
                struct zone *zone = pgdat->node_zones + z;

                if (!populated_zone(zone))
                        continue;

                nr_pages += zone_page_state(zone, NR_FREE_PAGES);
        }

        /*
         * Pages reserved for the kernel should not be considered
         * dirtyable, to prevent a situation where reclaim has to
         * clean pages in order to balance the zones.
         */
        nr_pages -= min(nr_pages, pgdat->totalreserve_pages);

        nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE);
        nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE);

        return nr_pages;
}

/*
 * Number of dirtyable pages located in highmem zones, capped at @total.
 * Always 0 when the kernel is built without CONFIG_HIGHMEM.
 */
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
        unsigned long highmem_pages = 0;
        int nid;

        for_each_node_state(nid, N_HIGH_MEMORY) {
                int zid;

                for (zid = ZONE_NORMAL + 1; zid < MAX_NR_ZONES; zid++) {
                        struct zone *zone;
                        unsigned long free;

                        if (!is_highmem_idx(zid))
                                continue;

                        zone = &NODE_DATA(nid)->node_zones[zid];
                        if (!populated_zone(zone))
                                continue;

                        free = zone_page_state(zone, NR_FREE_PAGES);
                        /* leave the high watermark alone; never underflow */
                        free -= min(free, high_wmark_pages(zone));

                        highmem_pages += free +
                                zone_page_state(zone, NR_ZONE_INACTIVE_FILE) +
                                zone_page_state(zone, NR_ZONE_ACTIVE_FILE);
                }
        }

        /*
         * The highmem share must never exceed the total dirtyable memory.
         * That should only happen in very strange VM situations, but cap
         * it anyway.
         */
        return min(highmem_pages, total);
#else
        return 0;
#endif
}

/**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
 * Return: the global number of pages potentially available for dirty
 * page cache.  This is the base value for the global dirty limits.
 */
static unsigned long global_dirtyable_memory(void)
{
        unsigned long pages = global_zone_page_state(NR_FREE_PAGES);

        /*
         * Kernel reserves must not count as dirtyable, otherwise reclaim
         * could be forced to clean pages just to balance the zones.
         */
        pages -= min(pages, totalreserve_pages);

        pages += global_node_page_state(NR_INACTIVE_FILE) +
                 global_node_page_state(NR_ACTIVE_FILE);

        /* Optionally exclude highmem from the dirtyable pool. */
        if (!vm_highmem_is_dirtyable)
                pages -= highmem_dirtyable_memory(pages);

        /* The +1 guarantees a non-zero result. */
        return pages + 1;
}

/**
 * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
 * @dtc: dirty_throttle_control of interest
 *
 * Calculate @dtc->thresh and ->bg_thresh considering
 * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
 * must ensure that @dtc->avail is set before calling this function.  The
 * dirty limits will be lifted by 1/4 for real-time tasks.
 */
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
        const unsigned long available_memory = dtc->avail;
        struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
        unsigned long bytes = vm_dirty_bytes;
        unsigned long bg_bytes = dirty_background_bytes;
        /* convert ratios to per-PAGE_SIZE for higher precision */
        unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100;
        unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100;
        unsigned long thresh;
        unsigned long bg_thresh;
        struct task_struct *tsk;

        /* gdtc is !NULL iff @dtc is for memcg domain */
        if (gdtc) {
                unsigned long global_avail = gdtc->avail;

                /*
                 * The byte settings can't be applied directly to memcg
                 * domains.  Convert them to ratios by scaling against
                 * globally available memory.  As the ratios are in
                 * per-PAGE_SIZE, they can be obtained by dividing bytes by
                 * number of pages.
                 */
                if (bytes)
                        ratio = min(DIV_ROUND_UP(bytes, global_avail),
                                    PAGE_SIZE);
                if (bg_bytes)
                        bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail),
                                       PAGE_SIZE);
                bytes = bg_bytes = 0;
        }

        if (bytes)
                thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
        else
                thresh = (ratio * available_memory) / PAGE_SIZE;

        if (bg_bytes)
                bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
        else
                bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;

        /* background writeback must kick in below the throttling limit */
        if (bg_thresh >= thresh)
                bg_thresh = thresh / 2;
        tsk = current;
        if (rt_task(tsk)) {
                /* real-time tasks get their limits lifted */
                bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
                thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
        }

        /*
         * The throttling logic multiplies limits (in pages) by scaled
         * ratios and fixed-point factors and relies on the products fitting
         * into 64 bits, i.e. on the limits themselves fitting into 32 bits.
         * Clamp the threshold accordingly and keep the background threshold
         * strictly below it.  On 32-bit this is a no-op.
         */
        if (thresh > UINT_MAX) {
                thresh = UINT_MAX;
                if (bg_thresh >= thresh)
                        bg_thresh = thresh / 2;
        }

        dtc->thresh = thresh;
        dtc->bg_thresh = bg_thresh;

        /* we should eventually report the domain in the TP */
        if (!gdtc)
                trace_global_dirty_state(bg_thresh, thresh);
}

/**
 * global_dirty_limits - background-writeback and dirty-throttling thresholds
 * @pbackground: out parameter for bg_thresh
 * @pdirty: out parameter for thresh
 *
 * Calculate bg_thresh and thresh for global_wb_domain.  See
 * domain_dirty_limits() for details.
 */
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
        struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };

        gdtc.avail = global_dirtyable_memory();
        domain_dirty_limits(&gdtc);

        *pbackground = gdtc.bg_thresh;
        *pdirty = gdtc.thresh;
}

/**
 * node_dirty_limit - maximum number of dirty pages allowed in a node
 * @pgdat: the node
 *
 * Return: the maximum number of dirty pages allowed in a node, based
 * on the node's dirtyable memory.
 */
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{
        const unsigned long node_memory = node_dirtyable_memory(pgdat);
        struct task_struct *task = current;
        unsigned long limit;

        if (!vm_dirty_bytes) {
                /* ratio-based limit: a percentage of the node's memory */
                limit = vm_dirty_ratio * node_memory / 100;
        } else {
                /*
                 * byte-based limit: give the node the same proportion of
                 * the global page limit as it holds of dirtyable memory
                 */
                limit = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
                        node_memory / global_dirtyable_memory();
        }

        /* real-time tasks are allowed a quarter more */
        if (rt_task(task))
                limit += limit / 4;

        return limit;
}

/**
 * node_dirty_ok - tells whether a node is within its dirty limits
 * @pgdat: the node to check
 *
 * Return: %true when the dirty pages in @pgdat are within the node's
 * dirty limit, %false if the limit is exceeded.
 */
bool node_dirty_ok(struct pglist_data *pgdat)
{
        const unsigned long limit = node_dirty_limit(pgdat);
        /* both dirty pages and pages under writeback count against the limit */
        unsigned long dirty = node_page_state(pgdat, NR_FILE_DIRTY);

        dirty += node_page_state(pgdat, NR_WRITEBACK);

        return !(dirty > limit);
}

#ifdef CONFIG_SYSCTL
/*
 * sysctl handler for the background dirty ratio.  A successful write
 * clears the byte-based background setting.
 */
static int dirty_background_ratio_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        if (!err && write)
                dirty_background_bytes = 0;

        return err;
}

/*
 * sysctl handler for the background dirty byte limit.  A successful write
 * clears the ratio-based background setting.
 */
static int dirty_background_bytes_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        if (!err && write)
                dirty_background_ratio = 0;

        return err;
}

/*
 * sysctl handler for the dirty ratio.  When a write actually changes the
 * ratio, the ratelimit is recomputed and the byte-based setting is cleared.
 */
static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        const int prev_ratio = vm_dirty_ratio;
        int err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        /* nothing more to do on error, on a read, or if the value is unchanged */
        if (err || !write || vm_dirty_ratio == prev_ratio)
                return err;

        writeback_set_ratelimit();
        vm_dirty_bytes = 0;
        return 0;
}

/*
 * sysctl handler for the dirty byte limit.  When a write actually changes
 * the limit, the ratelimit is recomputed and the ratio-based setting is
 * cleared.
 */
static int dirty_bytes_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        const unsigned long prev_bytes = vm_dirty_bytes;
        int err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        /* nothing more to do on error, on a read, or if the value is unchanged */
        if (err || !write || vm_dirty_bytes == prev_bytes)
                return err;

        writeback_set_ratelimit();
        vm_dirty_ratio = 0;
        return 0;
}
#endif

/*
 * Return the time one completions period after @cur_time.  The result is
 * never 0, because a period_time of 0 means "period switching is off".
 */
static unsigned long wp_next_time(unsigned long cur_time)
{
        const unsigned long next = cur_time + VM_COMPLETIONS_PERIOD_LEN;

        return next ? next : 1;
}

/*
 * Account @nr writeout completions to @completions within @dom and make
 * sure the domain's period timer is running.
 */
static void wb_domain_writeout_add(struct wb_domain *dom,
                                   struct fprop_local_percpu *completions,
                                   unsigned int max_prop_frac, long nr)
{
        __fprop_add_percpu_max(&dom->completions, completions,
                               max_prop_frac, nr);

        /* Period switching already active?  Then there is nothing to arm. */
        if (likely(dom->period_time))
                return;

        /*
         * First event after period switching was turned off.  Concurrent
         * callers may race here, which is harmless: the timer expiry and
         * the value stored in period_time end up roughly the same.
         */
        dom->period_time = wp_next_time(jiffies);
        mod_timer(&dom->period_timer, dom->period_time);
}

/*
 * Increment @wb's writeout completion count and the global writeout
 * completion count. Called from __folio_end_writeback().
 */
static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr)
{
        struct wb_domain *memcg_dom;

        wb_stat_mod(wb, WB_WRITTEN, nr);

        /* always account against the global domain ... */
        wb_domain_writeout_add(&global_wb_domain, &wb->completions,
                               wb->bdi->max_prop_frac, nr);

        /* ... and against the wb's memcg domain when it has one */
        memcg_dom = mem_cgroup_wb_domain(wb);
        if (!memcg_dom)
                return;

        wb_domain_writeout_add(memcg_dom, wb_memcg_completions(wb),
                               wb->bdi->max_prop_frac, nr);
}

/*
 * Account a single writeout completion for @wb.  The update is done with
 * local interrupts disabled and the previous interrupt state is restored
 * afterwards.
 */
void wb_writeout_inc(struct bdi_writeback *wb)
{
        unsigned long flags;

        local_irq_save(flags);
        __wb_writeout_add(wb, 1);
        local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);

/*
 * On an idle system we can be called long after we were scheduled, because
 * we use deferrable timers, so account for the periods that were missed.
 */
static void writeout_period(struct timer_list *t)
{
        struct wb_domain *dom = from_timer(dom, t, period_timer);
        /* whole periods that elapsed since the timer was due */
        int missed = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN;

        if (!fprop_new_period(&dom->completions, missed + 1)) {
                /*
                 * Aging has zeroed all fractions; stop wasting CPU on
                 * period updates until the next completion arrives.
                 */
                dom->period_time = 0;
                return;
        }

        /* re-arm one period after the last period boundary we passed */
        dom->period_time = wp_next_time(dom->period_time +
                                        missed * VM_COMPLETIONS_PERIOD_LEN);
        mod_timer(&dom->period_timer, dom->period_time);
}

/*
 * Initialise @dom: zero it, set up its lock and its deferrable period
 * timer (handled by writeout_period()), stamp dirty_limit_tstamp with the
 * current jiffies and initialise the completions state using @gfp.
 *
 * Return: the result of fprop_global_init().
 */
int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
{
        memset(dom, 0, sizeof(*dom));

        spin_lock_init(&dom->lock);

        timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);

        dom->dirty_limit_tstamp = jiffies;

        return fprop_global_init(&dom->completions, gfp);
}

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * Tear down @dom: synchronously stop the period timer first, then destroy
 * the completions state the timer handler operates on.
 */
void wb_domain_exit(struct wb_domain *dom)
{
        del_timer_sync(&dom->period_timer);
        fprop_global_destroy(&dom->completions);
}
#endif

/*
 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
 * registered backing devices, which, for obvious reasons, cannot
 * exceed 100%.
 *
 * The value is kept in the same scaled units as bdi->min_ratio (100% is
 * 100 * BDI_RATIO_SCALE) and is modified under bdi_lock.
 */
static unsigned int bdi_min_ratio;

/*
 * Reject page counts larger than the globally dirtyable memory.
 *
 * Return: 0 if @pages is acceptable, -EINVAL otherwise.
 */
static int bdi_check_pages_limit(unsigned long pages)
{
        return pages > global_dirtyable_memory() ? -EINVAL : 0;
}

/*
 * Convert a number of pages into a scaled ratio (100% is
 * 100 * BDI_RATIO_SCALE) of the current global dirty threshold.
 *
 * Return: the scaled ratio.  If the global dirty threshold is currently 0
 * no ratio can be computed and -EINVAL is returned instead; seen as an
 * unsigned value it is far above 100 * BDI_RATIO_SCALE, so
 * __bdi_set_min_ratio()/__bdi_set_max_ratio() reject it with -EINVAL.
 */
static unsigned long bdi_ratio_from_pages(unsigned long pages)
{
        unsigned long background_thresh;
        unsigned long dirty_thresh;

        global_dirty_limits(&background_thresh, &dirty_thresh);

        /* avoid dividing by zero when the computed threshold is 0 pages */
        if (!dirty_thresh)
                return -EINVAL;

        return div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh);
}

static u64 bdi_get_bytes(unsigned int ratio)
{
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        u64 bytes;

        global_dirty_limits(&background_thresh, &dirty_thresh);
        bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100;

        return bytes;
}

/*
 * Set @bdi's minimum dirty share to @min_ratio (scaled units, 100% is
 * 100 * BDI_RATIO_SCALE) and keep the global bdi_min_ratio sum in sync.
 *
 * Return: 0 on success, -EINVAL if @min_ratio is above 100%, above the
 * bdi's max_ratio, or would bring the sum of all minimums to 100% or more.
 */
static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
        int ret = -EINVAL;

        if (min_ratio > 100 * BDI_RATIO_SCALE)
                return -EINVAL;

        spin_lock_bh(&bdi_lock);

        if (min_ratio > bdi->max_ratio)
                goto unlock;

        if (min_ratio < bdi->min_ratio) {
                /* shrinking always succeeds: give the difference back */
                bdi_min_ratio -= bdi->min_ratio - min_ratio;
        } else {
                unsigned int grow = min_ratio - bdi->min_ratio;

                /* growing must keep the global sum below 100% */
                if (bdi_min_ratio + grow >= 100 * BDI_RATIO_SCALE)
                        goto unlock;
                bdi_min_ratio += grow;
        }
        bdi->min_ratio = min_ratio;
        ret = 0;

unlock:
        spin_unlock_bh(&bdi_lock);

        return ret;
}

/*
 * Set @bdi's maximum dirty share to @max_ratio (scaled units, 100% is
 * 100 * BDI_RATIO_SCALE) and derive max_prop_frac from it.
 *
 * Return: 0 on success, -EINVAL if @max_ratio is above 100% or below the
 * bdi's current min_ratio.
 */
static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio)
{
        int ret = -EINVAL;

        if (max_ratio > 100 * BDI_RATIO_SCALE)
                return -EINVAL;

        spin_lock_bh(&bdi_lock);
        if (bdi->min_ratio <= max_ratio) {
                bdi->max_ratio = max_ratio;
                bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) /
                                                (100 * BDI_RATIO_SCALE);
                ret = 0;
        }
        spin_unlock_bh(&bdi_lock);

        return ret;
}

/*
 * Set @bdi's minimum dirty share; @min_ratio is already in scaled units
 * (100% is 100 * BDI_RATIO_SCALE).  See __bdi_set_min_ratio() for errors.
 */
int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio)
{
        return __bdi_set_min_ratio(bdi, min_ratio);
}

/*
 * Set @bdi's maximum dirty share; @max_ratio is already in scaled units
 * (100% is 100 * BDI_RATIO_SCALE).  See __bdi_set_max_ratio() for errors.
 */
int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio)
{
        return __bdi_set_max_ratio(bdi, max_ratio);
}

/*
 * Set @bdi's minimum dirty share from a whole percentage.
 *
 * Return: 0 on success, -EINVAL if @min_ratio is above 100 or is rejected
 * by __bdi_set_min_ratio().
 */
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
        /*
         * Validate before scaling: the multiplication below is done in
         * unsigned int and would wrap for huge inputs, letting an absurd
         * percentage alias to a small, seemingly valid ratio.
         */
        if (min_ratio > 100)
                return -EINVAL;

        return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE);
}

/*
 * Set @bdi's maximum dirty share from a whole percentage.
 *
 * Return: 0 on success, -EINVAL if @max_ratio is above 100 or is rejected
 * by __bdi_set_max_ratio().
 */
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio)
{
        /*
         * Validate before scaling: the multiplication below is done in
         * unsigned int and would wrap for huge inputs, letting an absurd
         * percentage alias to a small, seemingly valid ratio.
         */
        if (max_ratio > 100)
                return -EINVAL;

        return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE);
}
EXPORT_SYMBOL(bdi_set_max_ratio);

/* Return @bdi's minimum dirty share expressed in bytes. */
u64 bdi_get_min_bytes(struct backing_dev_info *bdi)
{
        return bdi_get_bytes(bdi->min_ratio);
}

/*
 * Set @bdi's minimum dirty share from a byte count.
 *
 * Return: 0 on success, or a negative errno if the page count exceeds the
 * dirtyable memory or the resulting ratio is rejected.
 */
int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes)
{
        const unsigned long nr_pages = min_bytes >> PAGE_SHIFT;
        int err = bdi_check_pages_limit(nr_pages);

        if (err)
                return err;

        return __bdi_set_min_ratio(bdi, bdi_ratio_from_pages(nr_pages));
}

/* Return @bdi's maximum dirty share expressed in bytes. */
u64 bdi_get_max_bytes(struct backing_dev_info *bdi)
{
        return bdi_get_bytes(bdi->max_ratio);
}

/*
 * Set @bdi's maximum dirty share from a byte count.
 *
 * Return: 0 on success, or a negative errno if the page count exceeds the
 * dirtyable memory or the resulting ratio is rejected.
 */
int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes)
{
        const unsigned long nr_pages = max_bytes >> PAGE_SHIFT;
        int err = bdi_check_pages_limit(nr_pages);

        if (err)
                return err;

        return __bdi_set_max_ratio(bdi, bdi_ratio_from_pages(nr_pages));
}

/*
 * Turn the strictlimit capability of @bdi on (1) or off (0).
 *
 * Return: 0 on success, -EINVAL for any other value of @strict_limit.
 */
int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit)
{
        const bool enable = strict_limit;

        if (strict_limit > 1)
                return -EINVAL;

        spin_lock_bh(&bdi_lock);
        if (!enable)
                bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
        else
                bdi->capabilities |= BDI_CAP_STRICTLIMIT;
        spin_unlock_bh(&bdi_lock);

        return 0;
}

/*
 * Upper end of the "freerun" range: the midpoint between the background
 * threshold and the throttling threshold.
 */
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
                                           unsigned long bg_thresh)
{
        const unsigned long sum = thresh + bg_thresh;

        return sum >> 1;
}

/*
 * Return the larger of @thresh and the domain's current dirty_limit, i.e.
 * the limit that is actually enforced for @dom.
 */
static unsigned long hard_dirty_limit(struct wb_domain *dom,
                                      unsigned long thresh)
{
        return max(thresh, dom->dirty_limit);
}

/*
 * Memory which can be further allocated to a memcg domain is capped by
 * system-wide clean memory excluding the amount being used in the domain.
 */
static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
                            unsigned long filepages, unsigned long headroom)
{
        struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
        unsigned long memcg_clean, sys_clean, spare;

        /* clean file pages inside the memcg (saturating subtraction) */
        memcg_clean = filepages - min(filepages, mdtc->dirty);
        /* clean pages system-wide */
        sys_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
        /* clean pages that live outside this memcg */
        spare = sys_clean - min(sys_clean, memcg_clean);

        /* the memcg can grow by its headroom, but not beyond what is spare */
        mdtc->avail = filepages + min(headroom, spare);
}

/**
 * __wb_calc_thresh - @wb's share of dirty threshold
 * @dtc: dirty_throttle_context of interest
 * @thresh: dirty throttling or dirty background threshold of wb_domain in @dtc
 *
 * Note that balance_dirty_pages() will only seriously take dirty throttling
 * threshold as a hard limit when sleeping max_pause per page is not enough
 * to keep the dirty pages under control. For example, when the device is
 * completely stalled due to some error conditions, or when there are 1000
 * dd tasks writing to a slow 10MB/s USB key.
 * In the other normal situations, it acts more gently by throttling the tasks
 * more (rather than completely block them) when the wb dirty pages go high.
 *
 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 * - starving fast devices
 * - piling up dirty pages (that will take long time to sync) on slow devices
 *
 * The wb's share of dirty limit will be adapting to its throughput and
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 *
 * Return: @wb's dirty limit in pages. For dirty throttling limit, the term
 * "dirty" in the context of dirty balancing includes all PG_dirty and
 * PG_writeback pages.
 */
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
                                      unsigned long thresh)
{
        struct wb_domain *dom = dtc_dom(dtc);
        u64 wb_thresh;
        unsigned long numerator, denominator;
        unsigned long wb_min_ratio, wb_max_ratio;

        /*
         * Calculate this wb's share of the thresh ratio.
         */
        fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
                              &numerator, &denominator);

        wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE);
        wb_thresh *= numerator;
        wb_thresh = div64_ul(wb_thresh, denominator);

        wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);

        wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE);
        if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE))
                wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);

        return wb_thresh;
}

/*
 * Return @wb's share of @thresh within the global domain.  See
 * __wb_calc_thresh() for how the share is derived.
 */
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
        struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };

        return __wb_calc_thresh(&gdtc, thresh);
}

/*
 * Return @wb's share of the dirty threshold of its memcg domain.
 *
 * The global control is filled with the global dirtyable memory and the
 * global dirty + writeback page counts, the memcg control with the memcg's
 * statistics; the memcg limits are then computed and @wb's share of the
 * resulting threshold is returned.
 */
unsigned long cgwb_calc_thresh(struct bdi_writeback *wb)
{
        struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
        struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
        unsigned long filepages = 0, headroom = 0, writeback = 0;

        gdtc.avail = global_dirtyable_memory();
        gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
                     global_node_page_state(NR_WRITEBACK);

        mem_cgroup_wb_stats(wb, &filepages, &headroom,
                            &mdtc.dirty, &writeback);
        /* pages under writeback count as dirty for throttling purposes */
        mdtc.dirty += writeback;
        /* must precede domain_dirty_limits(), which reads mdtc.avail */
        mdtc_calc_avail(&mdtc, filepages, headroom);
        domain_dirty_limits(&mdtc);

        return __wb_calc_thresh(&mdtc, mdtc.thresh);
}

/*
 *                           setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * it's a 3rd order polynomial that subjects to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the balance point
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx      <= 0         => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 */
static long long pos_ratio_polynom(unsigned long setpoint,
                                          unsigned long dirty,
                                          unsigned long limit)
{
        long long cube;
        long x;

        /*
         * x = (setpoint - dirty) / (limit - setpoint) in fixed point; the
         * "| 1" keeps the divisor non-zero.
         */
        x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
                      (limit - setpoint) | 1);

        /* x^3, renormalising after each multiplication */
        cube = (long long)x * x >> RATELIMIT_CALC_SHIFT;
        cube = cube * x >> RATELIMIT_CALC_SHIFT;

        /* f = 1.0 + x^3, limited to the range [0, 2.0] */
        cube += 1 << RATELIMIT_CALC_SHIFT;

        return clamp(cube, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}

/*
 * Dirty position control.
 *
 * (o) global/bdi setpoints
 *
 * We want the dirty pages be balanced around the global/wb setpoints.
 * When the number of dirty pages is higher/lower than the setpoint, the
 * dirty position control ratio (and hence task dirty ratelimit) will be
 * decreased/increased to bring the dirty pages back to the setpoint.
 *
 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
 *
 *     if (dirty < setpoint) scale up   pos_ratio
 *     if (dirty > setpoint) scale down pos_ratio
 *
 *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
 *     if (wb_dirty > wb_setpoint) scale down pos_ratio
 *
 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 *
 * (o) global control line
 *
 *     ^ pos_ratio
 *     |
 *     |            |<===== global dirty control scope ======>|
 * 2.0  * * * * * * *
 *     |            .*
 *     |            . *
 *     |            .   *
 *     |            .     *
 *     |            .        *
 *     |            .            *
 * 1.0 ................................*
 *     |            .                  .     *
 *     |            .                  .          *
 *     |            .                  .              *
 *     |            .                  .                 *
 *     |            .                  .                    *
 *   0 +------------.------------------.----------------------*------------->
 *           freerun^          setpoint^                 limit^   dirty pages
 *
 * (o) wb control line
 *
 *     ^ pos_ratio
 *     |
 *     |            *
 *     |              *
 *     |                *
 *     |                  *
 *     |                    * |<=========== span ============>|
 * 1.0 .......................*
 *     |                      . *
 *     |                      .   *
 *     |                      .     *
 *     |                      .       *
 *     |                      .         *
 *     |                      .           *
 *     |                      .             *
 *     |                      .               *
 *     |                      .                 *
 *     |                      .                   *
 *     |                      .                     *
 * 1/4 ...............................................* * * * * * * * * * * *
 *     |                      .                         .
 *     |                      .                           .
 *     |                      .                             .
 *   0 +----------------------.-------------------------------.------------->
 *                wb_setpoint^                    x_intercept^
 *
 * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
 * be smoothly throttled down to normal if it starts high in situations like
 * - start writing to a slow SD card and a fast disk at the same time. The SD
 *   card's wb_dirty may rush to many times higher than wb_setpoint.
 * - the wb dirty thresh drops quickly due to change of JBOD workload
 */
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
        struct bdi_writeback *wb = dtc->wb;
        unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
        unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
        unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
        unsigned long wb_thresh = dtc->wb_thresh;
        unsigned long x_intercept;
        unsigned long setpoint;                /* dirty pages' target balance point */
        unsigned long wb_setpoint;
        unsigned long span;
        long long pos_ratio;                /* for scaling up/down the rate limit */
        long x;

        /*
         * Default result: pos_ratio 0.  Every early return below leaves
         * this value in place unless it stores something else first.
         */
        dtc->pos_ratio = 0;

        /* at or over the hard limit: keep pos_ratio at 0 */
        if (unlikely(dtc->dirty >= limit))
                return;

        /*
         * global setpoint
         *
         * See comment for pos_ratio_polynom().
         */
        setpoint = (freerun + limit) / 2;
        pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);

        /*
         * The strictlimit feature is a tool preventing mistrusted filesystems
         * from growing a large number of dirty pages before throttling. For
         * such filesystems balance_dirty_pages always checks wb counters
         * against wb limits. Even if global "nr_dirty" is under "freerun".
         * This is especially important for fuse which sets bdi->max_ratio to
         * 1% by default. Without strictlimit feature, fuse writeback may
         * consume arbitrary amount of RAM because it is accounted in
         * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
         *
         * Here, in wb_position_ratio(), we calculate pos_ratio based on
         * two values: wb_dirty and wb_thresh. Let's consider an example:
         * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
         * limits are set by default to 10% and 20% (background and throttle).
         * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
         * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
         * about ~6K pages (as the average of background and throttle wb
         * limits). The 3rd order polynomial will provide positive feedback if
         * wb_dirty is under wb_setpoint and vice versa.
         *
         * Note, that we cannot use global counters in these calculations
         * because we want to throttle process writing to a strictlimit wb
         * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
         * in the example above).
         */
        if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
                long long wb_pos_ratio;

                /*
                 * Hardly any dirty pages on this wb: use twice the global
                 * pos_ratio, capped at 2.0 in fixed point.
                 */
                if (dtc->wb_dirty < 8) {
                        dtc->pos_ratio = min_t(long long, pos_ratio * 2,
                                           2 << RATELIMIT_CALC_SHIFT);
                        return;
                }

                /* wb at or over its own threshold: keep pos_ratio at 0 */
                if (dtc->wb_dirty >= wb_thresh)
                        return;

                wb_setpoint = dirty_freerun_ceiling(wb_thresh,
                                                    dtc->wb_bg_thresh);

                /* degenerate wb control range: keep pos_ratio at 0 */
                if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
                        return;

                wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
                                                 wb_thresh);

                /*
                 * Typically, for strictlimit case, wb_setpoint << setpoint
                 * and pos_ratio >> wb_pos_ratio. In the other words global
                 * state ("dirty") is not limiting factor and we have to
                 * make decision based on wb counters. But there is an
                 * important case when global pos_ratio should get precedence:
                 * global limits are exceeded (e.g. due to activities on other
                 * wb's) while given strictlimit wb is below limit.
                 *
                 * "pos_ratio * wb_pos_ratio" would work for the case above,
                 * but it would look too non-natural for the case of all
                 * activity in the system coming from a single strictlimit wb
                 * with bdi->max_ratio == 100%.
                 *
                 * Note that min() below somewhat changes the dynamics of the
                 * control system. Normally, pos_ratio value can be well over 3
                 * (when globally we are at freerun and wb is well below wb
                 * setpoint). Now the maximum pos_ratio in the same situation
                 * is 2. We might want to tweak this if we observe the control
                 * system is too slow to adapt.
                 */
                dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
                return;
        }

        /*
         * We have computed basic pos_ratio above based on global situation. If
         * the wb is over/under its share of dirty pages, we want to scale
         * pos_ratio further down/up. That is done by the following mechanism.
         */

        /*
         * wb setpoint
         *
         *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
         *
         *                        x_intercept - wb_dirty
         *                     := --------------------------
         *                        x_intercept - wb_setpoint
         *
         * The main wb control line is a linear function that subjects to
         *
         * (1) f(wb_setpoint) = 1.0
         * (2) k = - 1 / (8 * write_bw)  (in single wb case)
         *     or equally: x_intercept = wb_setpoint + 8 * write_bw
         *
         * For single wb case, the dirty pages are observed to fluctuate
         * regularly within range
         *        [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
         * for various filesystems, where (2) can yield in a reasonable 12.5%
         * fluctuation range for pos_ratio.
         *
         * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
         * own size, so move the slope over accordingly and choose a slope that
         * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
         */
        if (unlikely(wb_thresh > dtc->thresh))
                wb_thresh = dtc->thresh;
        /*
         * It's very possible that wb_thresh is close to 0 not because the
         * device is slow, but that it has remained inactive for long time.
         * Honour such devices a reasonable good (hopefully IO efficient)
         * threshold, so that the occasional writes won't be blocked and active
         * writes can rampup the threshold quickly.
         */
        wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
        /*
         * scale global setpoint to wb's:
         *        wb_setpoint = setpoint * wb_thresh / thresh
         */
        /* x is wb_thresh / thresh in 16-bit fixed point; "| 1" avoids a zero divisor */
        x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
        wb_setpoint = setpoint * (u64)x >> 16;
        /*
         * Use span=(8*write_bw) in single wb case as indicated by
         * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
         *
         *        wb_thresh                    thresh - wb_thresh
         * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
         *         thresh                           thresh
         */
        span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
        x_intercept = wb_setpoint + span;

        /*
         * Follow the linear control line until it has dropped to 1/4 (at
         * x_intercept - span / 4); beyond that point stay flat at 1/4.
         */
        if (dtc->wb_dirty < x_intercept - span / 4) {
                pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
                                      (x_intercept - wb_setpoint) | 1);
        } else
                pos_ratio /= 4;

        /*
         * wb reserve area, safeguard against dirty pool underrun and disk idle
         * It may push the desired control point of global dirty pages higher
         * than setpoint.
         */
        x_intercept = wb_thresh / 2;
        if (dtc->wb_dirty < x_intercept) {
                /* boost by x_intercept / wb_dirty, but by no more than 8x */
                if (dtc->wb_dirty > x_intercept / 8)
                        pos_ratio = div_u64(pos_ratio * x_intercept,
                                            dtc->wb_dirty);
                else
                        pos_ratio *= 8;
        }

        dtc->pos_ratio = pos_ratio;
}

static void wb_update_write_bandwidth(struct bdi_writeback *wb,
                                      unsigned long elapsed,
                                      unsigned long written)
{
        const unsigned long period = roundup_pow_of_two(3 * HZ);
        unsigned long avg = wb->avg_write_bandwidth;
        unsigned long old = wb->write_bandwidth;
        u64 bw;

        /*
         * bw = written * HZ / elapsed
         *
         *                   bw * elapsed + write_bandwidth * (period - elapsed)
         * write_bandwidth = ---------------------------------------------------
         *                                          period
         *
         * @written may have decreased due to folio_redirty_for_writepage().
         * Avoid underflowing @bw calculation.
         */
        bw = written - min(written, wb->written_stamp);
        bw *= HZ;
        if (unlikely(elapsed > period)) {
                bw = div64_ul(bw, elapsed);
                avg = bw;
                goto out;
        }
        bw += (u64)wb->write_bandwidth * (period - elapsed);
        bw >>= ilog2(period);

        /*
         * one more level of smoothing, for filtering out sudden spikes
         */
        if (avg > old && old >= (unsigned long)bw)
                avg -= (avg - old) >> 3;

        if (avg < old && old <= (unsigned long)bw)
                avg += (old - avg) >> 3;

out:
        /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
        avg = max(avg, 1LU);
        if (wb_has_dirty_io(wb)) {
                long delta = avg - wb->avg_write_bandwidth;
                WARN_ON_ONCE(atomic_long_add_return(delta,
                                        &wb->bdi->tot_write_bandwidth) <= 0);
        }
        wb->write_bandwidth = bw;
        WRITE_ONCE(wb->avg_write_bandwidth, avg);
}

/*
 * Move the domain's dirty_limit towards dtc->thresh: jump up immediately,
 * decay downwards by 1/32 of the remaining distance per call.
 */
static void update_dirty_limit(struct dirty_throttle_control *dtc)
{
        struct wb_domain *dom = dtc_dom(dtc);
        unsigned long cur_limit = dom->dirty_limit;
        unsigned long target = dtc->thresh;

        /* Rising threshold: follow up in one step. */
        if (cur_limit < target) {
                dom->dirty_limit = target;
                return;
        }

        /*
         * Falling threshold: follow down slowly. thresh may drop below the
         * current dirty count, so aim at the higher of the two. This is the
         * very reason dom->dirty_limit exists: it is kept above the dirty
         * pages.
         */
        target = max(target, dtc->dirty);
        if (cur_limit > target)
                dom->dirty_limit = cur_limit - ((cur_limit - target) >> 5);
}

/*
 * Refresh the domain's dirty_limit at most once per BANDWIDTH_INTERVAL.
 *
 * @dtc: throttle control whose domain is updated
 * @now: current time in jiffies
 */
static void domain_update_dirty_limit(struct dirty_throttle_control *dtc,
                                      unsigned long now)
{
        struct wb_domain *dom = dtc_dom(dtc);

        /*
         * Cheap unlocked test first so that the common "not due yet" case
         * never touches the lock; the timestamp is checked again once the
         * lock is held.
         */
        if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
                spin_lock(&dom->lock);
                if (!time_before(now,
                                 dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
                        update_dirty_limit(dtc);
                        dom->dirty_limit_tstamp = now;
                }
                spin_unlock(&dom->lock);
        }
}

/*
 * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
 *
 * Normal wb tasks will be curbed at or below it in long term.
 * Obviously it should be around (write_bw / N) when there are N dd tasks.
 */
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
                                      unsigned long dirtied,
                                      unsigned long elapsed)
{
        struct bdi_writeback *wb = dtc->wb;
        unsigned long dirty = dtc->dirty;
        unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
        unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
        unsigned long setpoint = (freerun + limit) / 2;
        unsigned long write_bw = wb->avg_write_bandwidth;
        unsigned long dirty_ratelimit = wb->dirty_ratelimit;
        unsigned long dirty_rate;
        unsigned long task_ratelimit;
        unsigned long balanced_dirty_ratelimit;
        unsigned long step;
        unsigned long x;
        unsigned long shift;

        /*
         * The dirty rate will match the writeout rate in long term, except
         * when dirty pages are truncated by userspace or re-dirtied by FS.
         */
        dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;

        /*
         * task_ratelimit reflects each dd's dirty rate for the past 200ms.
         */
        task_ratelimit = (u64)dirty_ratelimit *
                                        dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
        task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */

        /*
         * A linear estimation of the "balanced" throttle rate. The theory is,
         * if there are N dd tasks, each throttled at task_ratelimit, the wb's
         * dirty_rate will be measured to be (N * task_ratelimit). So the below
         * formula will yield the balanced rate limit (write_bw / N).
         *
         * Note that the expanded form is not a pure rate feedback:
         *        rate_(i+1) = rate_(i) * (write_bw / dirty_rate)                     (1)
         * but also takes pos_ratio into account:
         *        rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
         *
         * (1) is not realistic because pos_ratio also takes part in balancing
         * the dirty rate.  Consider the state
         *        pos_ratio = 0.5                                                     (3)
         *        rate = 2 * (write_bw / N)                                     (4)
         * If (1) is used, it will stuck in that state! Because each dd will
         * be throttled at
         *        task_ratelimit = pos_ratio * rate = (write_bw / N)             (5)
         * yielding
         *        dirty_rate = N * task_ratelimit = write_bw                     (6)
         * put (6) into (1) we get
         *        rate_(i+1) = rate_(i)                                             (7)
         *
         * So we end up using (2) to always keep
         *        rate_(i+1) ~= (write_bw / N)                                     (8)
         * regardless of the value of pos_ratio. As long as (8) is satisfied,
         * pos_ratio is able to drive itself to 1.0, which is not only where
         * the dirty count meet the setpoint, but also where the slope of
         * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
         */
        balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
                                           dirty_rate | 1);
        /*
         * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
         */
        if (unlikely(balanced_dirty_ratelimit > write_bw))
                balanced_dirty_ratelimit = write_bw;

        /*
         * We could safely do this and return immediately:
         *
         *        wb->dirty_ratelimit = balanced_dirty_ratelimit;
         *
         * However to get a more stable dirty_ratelimit, the below elaborated
         * code makes use of task_ratelimit to filter out singular points and
         * limit the step size.
         *
         * The below code essentially only uses the relative value of
         *
         *        task_ratelimit - dirty_ratelimit
         *        = (pos_ratio - 1) * dirty_ratelimit
         *
         * which reflects the direction and size of dirty position error.
         */

        /*
         * dirty_ratelimit will follow balanced_dirty_ratelimit iff
         * task_ratelimit is on the same side of dirty_ratelimit, too.
         * For example, when
         * - dirty_ratelimit > balanced_dirty_ratelimit
         * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
         * lowering dirty_ratelimit will help meet both the position and rate
         * control targets. Otherwise, don't update dirty_ratelimit if it will
         * only help meet the rate target. After all, what the users ultimately
         * feel and care are stable dirty rate and small position error.
         *
         * |task_ratelimit - dirty_ratelimit| is used to limit the step size
         * and filter out the singular points of balanced_dirty_ratelimit. Which
         * keeps jumping around randomly and can even leap far away at times
         * due to the small 200ms estimation period of dirty_rate (we want to
         * keep that period small to reduce time lags).
         */
        step = 0;

        /*
         * For strictlimit case, calculations above were based on wb counters
         * and limits (starting from pos_ratio = wb_position_ratio() and up to
         * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
         * Hence, to calculate "step" properly, we have to use wb_dirty as
         * "dirty" and wb_setpoint as "setpoint".
         *
         * We rampup dirty_ratelimit forcibly if wb_dirty is low because
         * it's possible that wb_thresh is close to zero due to inactivity
         * of backing device.
         */
        if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
                dirty = dtc->wb_dirty;
                if (dtc->wb_dirty < 8)
                        setpoint = dtc->wb_dirty + 1;
                else
                        setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
        }

        if (dirty < setpoint) {
                x = min3(wb->balanced_dirty_ratelimit,
                         balanced_dirty_ratelimit, task_ratelimit);
                if (dirty_ratelimit < x)
                        step = x - dirty_ratelimit;
        } else {
                x = max3(wb->balanced_dirty_ratelimit,
                         balanced_dirty_ratelimit, task_ratelimit);
                if (dirty_ratelimit > x)
                        step = dirty_ratelimit - x;
        }

        /*
         * Don't pursue 100% rate matching. It's impossible since the balanced
         * rate itself is constantly fluctuating. So decrease the track speed
         * when it gets close to the target. Helps eliminate pointless tremors.
         */
        shift = dirty_ratelimit / (2 * step + 1);
        if (shift < BITS_PER_LONG)
                step = DIV_ROUND_UP(step >> shift, 8);
        else
                step = 0;

        if (dirty_ratelimit < balanced_dirty_ratelimit)
                dirty_ratelimit += step;
        else
                dirty_ratelimit -= step;

        WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL));
        wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;

        trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
}

/*
 * Take a new bandwidth sample for gdtc->wb under wb->list_lock.
 *
 * @gdtc:             global-domain throttle control; supplies the wb
 * @mdtc:             memcg-domain throttle control, or NULL
 * @update_ratelimit: also refresh the domain dirty limits and the wb dirty
 *                    ratelimit, not just the write bandwidth
 */
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
                                  struct dirty_throttle_control *mdtc,
                                  bool update_ratelimit)
{
        struct bdi_writeback *wb = gdtc->wb;
        unsigned long stamp = jiffies;
        unsigned long nr_dirtied, nr_written;
        unsigned long delta;

        spin_lock(&wb->list_lock);

        /*
         * The lockless elapsed-time checks done by callers are racy, and the
         * delayed update after IO completion skips them entirely (so that
         * written pages get accounted reasonably quickly). Force the interval
         * to be at least one jiffy to avoid division errors.
         */
        delta = stamp - wb->bw_time_stamp;
        if (!delta)
                delta = 1;

        nr_dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
        nr_written = percpu_counter_read(&wb->stat[WB_WRITTEN]);

        if (update_ratelimit) {
                domain_update_dirty_limit(gdtc, stamp);
                wb_update_dirty_ratelimit(gdtc, nr_dirtied, delta);

                /*
                 * @mdtc is always NULL if !CGROUP_WRITEBACK but the
                 * compiler has no way to figure that out.  Help it.
                 */
                if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
                        domain_update_dirty_limit(mdtc, stamp);
                        wb_update_dirty_ratelimit(mdtc, nr_dirtied, delta);
                }
        }
        wb_update_write_bandwidth(wb, delta, nr_written);

        /* Remember this sample as the baseline for the next one. */
        wb->dirtied_stamp = nr_dirtied;
        wb->written_stamp = nr_written;
        WRITE_ONCE(wb->bw_time_stamp, stamp);

        spin_unlock(&wb->list_lock);
}

void wb_update_bandwidth(struct bdi_writeback *wb)
{
        struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };

        __wb_update_bandwidth(&gdtc, NULL, false);
}

/*
 * Interval, in jiffies, after which we consider a wb idle and don't estimate
 * its bandwidth. HZ jiffies make up one second.
 */
#define WB_BANDWIDTH_IDLE_JIF (HZ)

/*
 * Restart bandwidth estimation for a wb that has been idle: if the last
 * sample is older than WB_BANDWIDTH_IDLE_JIF and no inodes are under
 * writeback, reset the dirtied/written baselines and the timestamp to now.
 */
static void wb_bandwidth_estimate_start(struct bdi_writeback *wb)
{
        unsigned long stamp = jiffies;
        unsigned long idle = stamp - READ_ONCE(wb->bw_time_stamp);

        /* Sampled recently enough: keep the current baseline. */
        if (idle <= WB_BANDWIDTH_IDLE_JIF)
                return;

        /* Writeback is in flight, so the wb is not idle. */
        if (atomic_read(&wb->writeback_inodes))
                return;

        spin_lock(&wb->list_lock);
        wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED);
        wb->written_stamp = wb_stat(wb, WB_WRITTEN);
        WRITE_ONCE(wb->bw_time_stamp, stamp);
        spin_unlock(&wb->list_lock);
}

/*
 * Number of pages a task may dirty before balance_dirty_pages_ratelimited()
 * checks whether dirty throttling has to start.
 *
 * If the interval is too low, big NUMA machines will call the expensive
 * global_zone_page_state() too often. So scale it near-sqrt to the safety
 * margin, i.e. the number of pages that may still be dirtied without
 * exceeding the dirty limits.
 *
 * Return: 2^(ilog2(thresh - dirty) / 2) while @dirty is below @thresh,
 * otherwise 1.
 */
static unsigned long dirty_poll_interval(unsigned long dirty,
                                         unsigned long thresh)
{
        unsigned long margin;

        /* No headroom left: poll after every page. */
        if (thresh <= dirty)
                return 1;

        margin = thresh - dirty;
        return 1UL << (ilog2(margin) >> 1);
}

/*
 * Upper bound, capped at MAX_PAUSE, on how long a dirtier may be put to
 * sleep, derived from @wb_dirty and the wb's average write bandwidth.
 */
static unsigned long wb_max_pause(struct bdi_writeback *wb,
                                  unsigned long wb_dirty)
{
        const unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
        unsigned long divisor;
        unsigned long pause;

        /*
         * Limit pause time for small memory systems. If a task sleeps for
         * too long, a small pool of dirty/writeback pages may drain and the
         * disk go idle.
         *
         * 8 serves as the safety ratio.
         */
        divisor = 1 + write_bw / roundup_pow_of_two(1 + HZ / 8);
        pause = wb_dirty / divisor + 1;

        return min_t(unsigned long, pause, MAX_PAUSE);
}

/*
 * Compute the minimal pause time and the matching nr_dirtied_pause.
 *
 * @wb:               writeback context (bandwidth and ratelimit are read)
 * @max_pause:        upper bound for any pause
 * @task_ratelimit:   rate limit currently applied to the task
 * @dirty_ratelimit:  the wb's base rate limit
 * @nr_dirtied_pause: output, number of pages to dirty before the next pause
 *
 * Return: the minimal pause time.
 */
static long wb_min_pause(struct bdi_writeback *wb,
                         long max_pause,
                         unsigned long task_ratelimit,
                         unsigned long dirty_ratelimit,
                         int *nr_dirtied_pause)
{
        long bw_order = ilog2(READ_ONCE(wb->avg_write_bandwidth));
        long rate_order = ilog2(READ_ONCE(wb->dirty_ratelimit));
        long target = max(1, HZ / 100);        /* 10ms pause on 1-dd case */
        long est_pause;                        /* estimated next pause */
        int nr_pages;                        /* target nr_dirtied_pause */

        /*
         * Scale up pause time for concurrent dirtiers in order to reduce CPU
         * overheads.
         *
         * (N * 10ms) on 2^N concurrent tasks.
         */
        if (bw_order > rate_order)
                target += (bw_order - rate_order) * (10 * HZ) / 1024;

        /*
         * This is a bit convoluted. The next nr_dirtied_pause should be based
         * on the much more stable dirty_ratelimit, yet the next pause time is
         * computed from task_ratelimit, and the two may drift far apart at
         * times. In particular, if task_ratelimit falls below
         * dirty_ratelimit/2 while the target pause is max_pause, the next
         * pause would be max_pause*2 _trimmed down_ to max_pause. Then
         * task_ratelimit is not executed faithfully, which could eventually
         * drag dirty_ratelimit down.
         *
         * Two rules fix this up:
         * 1) estimate the next pause time and, if necessary, use a lower
         *    nr_dirtied_pause so as not to exceed max_pause. When this
         *    happens, nr_dirtied_pause "dances" with task_ratelimit.
         * 2) cap the target pause at max_pause/2, so that the normal small
         *    fluctuations of task_ratelimit do not trigger rule (1) and
         *    nr_dirtied_pause stays as stable as dirty_ratelimit.
         */
        target = min(target, 1 + max_pause / 2);
        nr_pages = dirty_ratelimit * target / roundup_pow_of_two(HZ);

        /*
         * A tiny nr_dirtied_pause was found to hurt I/O performance in the
         * test case fio-mmap-randwrite-64k, which does 16*{sync read, async
         * write}. When the 16 consecutive reads are often interrupted by a
         * dirty throttling pause during the async writes, cfq goes idle
         * (deadline is fine). So push nr_dirtied_pause as high as possible
         * until it reaches DIRTY_POLL_THRESH=32 pages.
         */
        if (nr_pages < DIRTY_POLL_THRESH) {
                target = max_pause;
                nr_pages = dirty_ratelimit * target / roundup_pow_of_two(HZ);
                if (nr_pages > DIRTY_POLL_THRESH) {
                        nr_pages = DIRTY_POLL_THRESH;
                        target = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
                }
        }

        /* Rule (1): shrink the page budget if the pause would be too long. */
        est_pause = HZ * nr_pages / (task_ratelimit + 1);
        if (est_pause > max_pause) {
                target = max_pause;
                nr_pages = task_ratelimit * target / roundup_pow_of_two(HZ);
        }

        *nr_dirtied_pause = nr_pages;

        /*
         * The minimal pause time will normally be half the target pause time.
         */
        if (nr_pages >= DIRTY_POLL_THRESH)
                return 1 + target / 2;
        return target;
}

static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
        struct bdi_writeback *wb = dtc->wb;
        unsigned long wb_reclaimable;

        /*
         * wb_thresh is not treated as some limiting factor as
         * dirty_thresh, due to reasons
         * - in JBOD setup, wb_thresh can fluctuate a lot
         * - in a system with HDD and USB key, the USB key may somehow
         *   go into state (wb_dirty >> wb_thresh) either because
         *   wb_dirty starts high, or because wb_thresh drops low.
         *   In this case we don't want to hard throttle the USB key
         *   dirtiers for 100 seconds until wb_dirty drops under
         *   wb_thresh. Instead the auxiliary wb control line in
         *   wb_position_ratio() will let the dirtier task progress
         *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
         */
        dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh);
        dtc->wb_bg_thresh = dtc->thresh ?
                div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;

        /*
         * In order to avoid the stacked BDI deadlock we need
         * to ensure we accurately count the 'dirty' pages when
         * the threshold is low.
         *
         * Otherwise it would be possible to get thresh+n pages
         * reported dirty, even though there are thresh-m pages
         * actually dirty; with m+n sitting in the percpu
         * deltas.
         */
        if (dtc->wb_thresh < 2 * wb_stat_error()) {
                wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
                dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
        } else {
                wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
                dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
        }
}

/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
 * If we're over `background_thresh' then the writeback threads are woken to
 * perform some writeout.
 *
 * @wb: writeback context the caller is dirtying pages against
 * @pages_dirtied: number of pages dirtied by the caller; together with the
 *                 task rate limit it determines the pause period
 * @flags: BDP_* flags; with BDP_ASYNC set the function does not sleep
 *
 * Return: 0, or -EAGAIN when BDP_ASYNC is set and the caller would otherwise
 * have been put to sleep.
 */
static int balance_dirty_pages(struct bdi_writeback *wb,
                               unsigned long pages_dirtied, unsigned int flags)
{
        struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
        struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
        struct dirty_throttle_control * const gdtc = &gdtc_stor;
        /* mdtc is NULL unless mdtc_valid() accepts the memcg control */
        struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
                                                     &mdtc_stor : NULL;
        /* the control (global or memcg) selected to throttle by */
        struct dirty_throttle_control *sdtc;
        unsigned long nr_dirty;
        long period;
        long pause;
        long max_pause;
        long min_pause;
        int nr_dirtied_pause;
        bool dirty_exceeded = false;
        unsigned long task_ratelimit;
        unsigned long dirty_ratelimit;
        struct backing_dev_info *bdi = wb->bdi;
        bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
        unsigned long start_time = jiffies;
        int ret = 0;

        for (;;) {
                unsigned long now = jiffies;
                unsigned long dirty, thresh, bg_thresh;
                unsigned long m_dirty = 0;        /* stop bogus uninit warnings */
                unsigned long m_thresh = 0;
                unsigned long m_bg_thresh = 0;

                nr_dirty = global_node_page_state(NR_FILE_DIRTY);
                gdtc->avail = global_dirtyable_memory();
                gdtc->dirty = nr_dirty + global_node_page_state(NR_WRITEBACK);

                domain_dirty_limits(gdtc);

                /*
                 * Pick the counters the freerun check below is based on:
                 * per-wb ones for strictlimit, domain-wide ones otherwise.
                 */
                if (unlikely(strictlimit)) {
                        wb_dirty_limits(gdtc);

                        dirty = gdtc->wb_dirty;
                        thresh = gdtc->wb_thresh;
                        bg_thresh = gdtc->wb_bg_thresh;
                } else {
                        dirty = gdtc->dirty;
                        thresh = gdtc->thresh;
                        bg_thresh = gdtc->bg_thresh;
                }

                if (mdtc) {
                        unsigned long filepages, headroom, writeback;

                        /*
                         * If @wb belongs to !root memcg, repeat the same
                         * basic calculations for the memcg domain.
                         */
                        mem_cgroup_wb_stats(wb, &filepages, &headroom,
                                            &mdtc->dirty, &writeback);
                        mdtc->dirty += writeback;
                        mdtc_calc_avail(mdtc, filepages, headroom);

                        domain_dirty_limits(mdtc);

                        if (unlikely(strictlimit)) {
                                wb_dirty_limits(mdtc);
                                m_dirty = mdtc->wb_dirty;
                                m_thresh = mdtc->wb_thresh;
                                m_bg_thresh = mdtc->wb_bg_thresh;
                        } else {
                                m_dirty = mdtc->dirty;
                                m_thresh = mdtc->thresh;
                                m_bg_thresh = mdtc->bg_thresh;
                        }
                }

                /*
                 * In laptop mode, we wait until hitting the higher threshold
                 * before starting background writeout, and then write out all
                 * the way down to the lower threshold.  So slow writers cause
                 * minimal disk activity.
                 *
                 * In normal mode, we start background writeout at the lower
                 * background_thresh, to keep the amount of dirty memory low.
                 */
                if (!laptop_mode && nr_dirty > gdtc->bg_thresh &&
                    !writeback_in_progress(wb))
                        wb_start_background_writeback(wb);

                /*
                 * Throttle it only when the background writeback cannot
                 * catch-up. This avoids (excessively) small writeouts
                 * when the wb limits are ramping up in case of !strictlimit.
                 *
                 * In strictlimit case make decision based on the wb counters
                 * and limits. Small writeouts when the wb limits are ramping
                 * up are the price we consciously pay for strictlimit-ing.
                 *
                 * If memcg domain is in effect, @dirty should be under
                 * both global and memcg freerun ceilings.
                 */
                if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
                    (!mdtc ||
                     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
                        unsigned long intv;
                        unsigned long m_intv;

                        /*
                         * Also entered via goto from the PF_LOCAL_THROTTLE
                         * checks below: no pause, just reset the task's
                         * accounting and set its next poll interval.
                         */
free_running:
                        intv = dirty_poll_interval(dirty, thresh);
                        m_intv = ULONG_MAX;

                        current->dirty_paused_when = now;
                        current->nr_dirtied = 0;
                        if (mdtc)
                                m_intv = dirty_poll_interval(m_dirty, m_thresh);
                        current->nr_dirtied_pause = min(intv, m_intv);
                        break;
                }

                /* Start writeback even when in laptop mode */
                if (unlikely(!writeback_in_progress(wb)))
                        wb_start_background_writeback(wb);

                mem_cgroup_flush_foreign(wb);

                /*
                 * Calculate global domain's pos_ratio and select the
                 * global dtc by default.
                 */
                if (!strictlimit) {
                        /* not yet computed above in the !strictlimit case */
                        wb_dirty_limits(gdtc);

                        if ((current->flags & PF_LOCAL_THROTTLE) &&
                            gdtc->wb_dirty <
                            dirty_freerun_ceiling(gdtc->wb_thresh,
                                                  gdtc->wb_bg_thresh))
                                /*
                                 * LOCAL_THROTTLE tasks must not be throttled
                                 * when below the per-wb freerun ceiling.
                                 */
                                goto free_running;
                }

                dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
                        ((gdtc->dirty > gdtc->thresh) || strictlimit);

                wb_position_ratio(gdtc);
                sdtc = gdtc;

                if (mdtc) {
                        /*
                         * If memcg domain is in effect, calculate its
                         * pos_ratio.  @wb should satisfy constraints from
                         * both global and memcg domains.  Choose the one
                         * w/ lower pos_ratio.
                         */
                        if (!strictlimit) {
                                wb_dirty_limits(mdtc);

                                if ((current->flags & PF_LOCAL_THROTTLE) &&
                                    mdtc->wb_dirty <
                                    dirty_freerun_ceiling(mdtc->wb_thresh,
                                                          mdtc->wb_bg_thresh))
                                        /*
                                         * LOCAL_THROTTLE tasks must not be
                                         * throttled when below the per-wb
                                         * freerun ceiling.
                                         */
                                        goto free_running;
                        }
                        dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
                                ((mdtc->dirty > mdtc->thresh) || strictlimit);

                        wb_position_ratio(mdtc);
                        if (mdtc->pos_ratio < gdtc->pos_ratio)
                                sdtc = mdtc;
                }

                /* Store the flag only when its value actually changes. */
                if (dirty_exceeded != wb->dirty_exceeded)
                        wb->dirty_exceeded = dirty_exceeded;

                /* Re-sample bandwidth once BANDWIDTH_INTERVAL has passed. */
                if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
                                           BANDWIDTH_INTERVAL))
                        __wb_update_bandwidth(gdtc, mdtc, true);

                /* throttle according to the chosen dtc */
                dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
                task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
                                                        RATELIMIT_CALC_SHIFT;
                max_pause = wb_max_pause(wb, sdtc->wb_dirty);
                min_pause = wb_min_pause(wb, max_pause,
                                         task_ratelimit, dirty_ratelimit,
                                         &nr_dirtied_pause);

                /*
                 * A zero rate cannot be used as the divisor below; sleep for
                 * the maximum pause instead.
                 */
                if (unlikely(task_ratelimit == 0)) {
                        period = max_pause;
                        pause = max_pause;
                        goto pause;
                }
                period = HZ * pages_dirtied / task_ratelimit;
                pause = period;
                /* Deduct the time already spent since the last pause ended. */
                if (current->dirty_paused_when)
                        pause -= now - current->dirty_paused_when;
                /*
                 * For less than 1s think time (ext3/4 may block the dirtier
                 * for up to 800ms from time to time on 1-HDD; so does xfs,
                 * however at much less frequency), try to compensate it in
                 * future periods by updating the virtual time; otherwise just
                 * do a reset, as it may be a light dirtier.
                 */
                if (pause < min_pause) {
                        trace_balance_dirty_pages(wb,
                                                  sdtc->thresh,
                                                  sdtc->bg_thresh,
                                                  sdtc->dirty,
                                                  sdtc->wb_thresh,
                                                  sdtc->wb_dirty,
                                                  dirty_ratelimit,
                                                  task_ratelimit,
                                                  pages_dirtied,
                                                  period,
                                                  min(pause, 0L),
                                                  start_time);
                        if (pause < -HZ) {
                                current->dirty_paused_when = now;
                                current->nr_dirtied = 0;
                        } else if (period) {
                                current->dirty_paused_when += period;
                                current->nr_dirtied = 0;
                        } else if (current->nr_dirtied_pause <= pages_dirtied)
                                current->nr_dirtied_pause += pages_dirtied;
                        break;
                }
                if (unlikely(pause > max_pause)) {
                        /* for occasional dropped task_ratelimit */
                        now += min(pause - max_pause, max_pause);
                        pause = max_pause;
                }

pause:
                trace_balance_dirty_pages(wb,
                                          sdtc->thresh,
                                          sdtc->bg_thresh,
                                          sdtc->dirty,
                                          sdtc->wb_thresh,
                                          sdtc->wb_dirty,
                                          dirty_ratelimit,
                                          task_ratelimit,
                                          pages_dirtied,
                                          period,
                                          pause,
                                          start_time);
                /*
                 * Async callers are not put to sleep here: report -EAGAIN
                 * and leave the waiting to them.
                 */
                if (flags & BDP_ASYNC) {
                        ret = -EAGAIN;
                        break;
                }
                __set_current_state(TASK_KILLABLE);
                bdi->last_bdp_sleep = jiffies;
                io_schedule_timeout(pause);

                current->dirty_paused_when = now + pause;
                current->nr_dirtied = 0;
                current->nr_dirtied_pause = nr_dirtied_pause;

                /*
                 * This is typically equal to (dirty < thresh) and can also
                 * keep "1000+ dd on a slow USB stick" under control.
                 */
                if (task_ratelimit)
                        break;

                /*
                 * In the case of an unresponsive NFS server and the NFS dirty
                 * pages exceeds dirty_thresh, give the other good wb's a pipe
                 * to go through, so that tasks on them still remain responsive.
                 *
                 * In theory 1 page is enough to keep the consumer-producer
                 * pipe going: the flusher cleans 1 page => the task dirties 1
                 * more page. However wb_dirty has accounting errors.  So use
                 * the larger and more IO friendly wb_stat_error.
                 */
                if (sdtc->wb_dirty <= wb_stat_error())
                        break;

                /* The sleep above is killable; stop looping on a fatal signal. */
                if (fatal_signal_pending(current))
                        break;
        }
        return ret;
}

/*
 * Per-CPU count of dirtied pages: folio_account_dirtied() adds to it and
 * balance_dirty_pages_ratelimited_flags() resets it to zero whenever the
 * current task is about to be checked or the count reaches ratelimit_pages.
 */
static DEFINE_PER_CPU(int, bdp_ratelimits);

/*
 * Normal tasks are throttled by
 *        loop {
 *                dirty tsk->nr_dirtied_pause pages;
 *                take a snap in balance_dirty_pages();
 *        }
 * However there is a worst case. If every task exits immediately after
 * dirtying (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will
 * never be called to throttle the page dirtying. The solution is to save the
 * not yet throttled page dirties in dirty_throttle_leaks on task exit and
 * charge them randomly to the running tasks. This works well for the above
 * worst case, as the new task will pick up and accumulate the old task's
 * leaked dirty count and eventually get throttled.
 */
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;

/**
 * balance_dirty_pages_ratelimited_flags - Balance dirty memory state.
 * @mapping: address_space which was dirtied.
 * @flags: BDP flags.
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * See balance_dirty_pages_ratelimited() for details.
 *
 * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to
 * indicate that memory is out of balance and the caller must wait
 * for I/O to complete.  Otherwise, it will return 0 to indicate
 * that either memory was already in balance, or it was able to sleep
 * until the amount of dirty memory returned to balance.
 */
int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
                                        unsigned int flags)
{
        struct inode *inode = mapping->host;
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct bdi_writeback *wb = NULL;
        int *cpu_dirtied;
        int *cpu_leaks;
        int ratelimit;
        int ret = 0;

        /* Nothing to balance on a backing device without writeback support. */
        if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
                return 0;

        /*
         * For cgroup-writeback inodes try the wb handed back by
         * wb_get_create_current(); otherwise, or if that yields nothing,
         * use the wb embedded in the bdi.
         */
        if (inode_cgwb_enabled(inode))
                wb = wb_get_create_current(bdi, GFP_KERNEL);
        if (!wb)
                wb = &bdi->wb;

        /*
         * Start from the task's own pause threshold and, once the wb has
         * exceeded its dirty limit, clamp it to 32KB worth of pages.
         */
        ratelimit = current->nr_dirtied_pause;
        if (wb->dirty_exceeded)
                ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

        preempt_disable();

        /*
         * Keep a single CPU from accumulating too many dirtied pages
         * without calling into balance_dirty_pages().  That can happen when
         * 1000+ tasks all start dirtying pages at exactly the same time and
         * therefore all honour an initial task->nr_dirtied_pause that is
         * too large.
         */
        cpu_dirtied = this_cpu_ptr(&bdp_ratelimits);
        if (unlikely(current->nr_dirtied >= ratelimit)) {
                *cpu_dirtied = 0;
        } else if (unlikely(*cpu_dirtied >= ratelimit_pages)) {
                *cpu_dirtied = 0;
                /* A zero ratelimit makes the check after preempt_enable() fire. */
                ratelimit = 0;
        }

        /*
         * Charge this task with pages dirtied by tasks that have since
         * exited.  Without this, lots of short-lived tasks (e.g. gcc
         * invocations in a kernel build) would escape dirty throttling and
         * livelock other long-running dirtiers.
         */
        cpu_leaks = this_cpu_ptr(&dirty_throttle_leaks);
        if (*cpu_leaks > 0 && current->nr_dirtied < ratelimit) {
                unsigned long charged;

                /* Take no more than what is needed to reach the ratelimit. */
                charged = min(*cpu_leaks, ratelimit - current->nr_dirtied);
                *cpu_leaks -= charged;
                current->nr_dirtied += charged;
        }

        preempt_enable();

        if (unlikely(current->nr_dirtied >= ratelimit))
                ret = balance_dirty_pages(wb, current->nr_dirtied, flags);

        wb_put(wb);
        return ret;
}
EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags);

/**
 * balance_dirty_pages_ratelimited - balance dirty memory state.
 * @mapping: address_space which was dirtied.
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * Once we're over the dirty memory limit we decrease the ratelimiting
 * by a lot, to prevent individual processes from overshooting the limit
 * by (ratelimit_pages) each.
 *
 * This is a thin wrapper around balance_dirty_pages_ratelimited_flags()
 * called with @flags == 0; that function's return value is discarded.
 */
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
        /* No BDP flags are passed; the result is intentionally ignored. */
        balance_dirty_pages_ratelimited_flags(mapping, 0);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);

/*
 * Compare @wb's WB_RECLAIMABLE count against its share of @dtc's background
 * threshold, as computed by __wb_calc_thresh().
 *
 * When that share is below 2 * wb_stat_error() the count is read with
 * wb_stat_sum() instead of wb_stat() (presumably because the error of the
 * cheaper reading would matter against such a small limit - confirm against
 * the wb_stat helpers).
 */
static bool wb_reclaimable_over_bg(struct bdi_writeback *wb,
                                   struct dirty_throttle_control *dtc)
{
        unsigned long limit = __wb_calc_thresh(dtc, dtc->bg_thresh);
        unsigned long nr_reclaimable;

        if (limit < 2 * wb_stat_error())
                nr_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
        else
                nr_reclaimable = wb_stat(wb, WB_RECLAIMABLE);

        return nr_reclaimable > limit;
}

/**
 * wb_over_bg_thresh - does @wb need to be written back?
 * @wb: bdi_writeback of interest
 *
 * Decides whether background writeback should keep writing @wb or whether
 * it is clean enough.  The global domain is checked first and then, when
 * mdtc_valid() accepts it, the memcg domain; in each domain both the
 * domain-wide dirty count and @wb's own reclaimable count are compared
 * against the respective background threshold.
 *
 * Return: %true if writeback should continue.
 */
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{
        struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
        struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
        struct dirty_throttle_control * const gdtc = &gdtc_stor;
        struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
                                                     &mdtc_stor : NULL;
        unsigned long filepages, headroom, writeback;

        /*
         * Similar to balance_dirty_pages(), except that pages already under
         * writeback are ignored: the question here is whether to put more
         * pages under writeback.
         */
        gdtc->avail = global_dirtyable_memory();
        gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
        domain_dirty_limits(gdtc);

        if (gdtc->dirty > gdtc->bg_thresh)
                return true;
        if (wb_reclaimable_over_bg(wb, gdtc))
                return true;

        /* No usable memcg domain: the global verdict stands. */
        if (!mdtc)
                return false;

        /* Same two checks for the memcg domain; writeback is ignored again. */
        mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
                            &writeback);
        mdtc_calc_avail(mdtc, filepages, headroom);
        domain_dirty_limits(mdtc);

        if (mdtc->dirty > mdtc->bg_thresh)
                return true;

        return wb_reclaimable_over_bg(wb, mdtc);
}

#ifdef CONFIG_SYSCTL
/*
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
 */
static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
                void *buffer, size_t *length, loff_t *ppos)
{
        /* Snapshot the interval so a change made by this write can be detected. */
        unsigned int prev_interval = dirty_writeback_interval;
        int ret = proc_dointvec(table, write, buffer, length, ppos);

        /* Reads and failed writes need no follow-up. */
        if (ret || !write)
                return ret;

        /*
         * Writing 0 disables periodic writeback, so there is nothing to
         * wake in that case.  Any other value that differs from the old one
         * wakes the flusher threads so that the new interval takes effect
         * immediately.  wb_wakeup_delayed() would be more appropriate, but
         * iterating over all bdis and wbs is a pain.
         */
        if (dirty_writeback_interval &&
            dirty_writeback_interval != prev_interval)
                wakeup_flusher_threads(WB_REASON_PERIODIC);

        return 0;
}
#endif

/*
 * Timer callback for a bdi's laptop_mode_wb_timer: recover the owning
 * backing_dev_info from the timer and wake its flusher threads with
 * WB_REASON_LAPTOP_TIMER.
 */
void laptop_mode_timer_fn(struct timer_list *t)
{
        struct backing_dev_info *bdi;

        bdi = from_timer(bdi, t, laptop_mode_wb_timer);
        wakeup_flusher_threads_bdi(bdi, WB_REASON_LAPTOP_TIMER);
}

/*
 * We've spun up the disk and we're in laptop mode: schedule writeback
 * of all dirty data a few seconds from now.  If the flush is already scheduled
 * then push it back - the user is still using the disk.
 */
void laptop_io_completion(struct backing_dev_info *info)
{
        /* (Re)arm the writeback timer laptop_mode jiffies from now. */
        unsigned long expires = jiffies + laptop_mode;

        mod_timer(&info->laptop_mode_wb_timer, expires);
}

/*
 * We're in laptop mode and we've just synced. The sync's writes will have
 * caused another writeback to be scheduled by laptop_io_completion.
 * Nothing needs to be written back anymore, so we unschedule the writeback.
 */
void laptop_sync_completion(void)
{
        struct backing_dev_info *bdi;

        rcu_read_lock();

        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
                del_timer(&bdi->laptop_mode_wb_timer);

        rcu_read_unlock();
}

/*
 * If ratelimit_pages is too high then we can get into dirty-data overload
 * if a large number of processes all perform writes at the same time.
 *
 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
 * thresholds.
 */

void writeback_set_ratelimit(void)
{
        unsigned long bg_limit;
        unsigned long limit;

        /* Refresh the global domain's cached dirty limit. */
        global_dirty_limits(&bg_limit, &limit);
        global_wb_domain.dirty_limit = limit;

        /*
         * Give each online CPU 1/32 of an even share of the dirty threshold,
         * but never let the per-CPU ratelimit drop below 16 pages.
         */
        ratelimit_pages = limit / (num_online_cpus() * 32);
        if (ratelimit_pages < 16)
                ratelimit_pages = 16;
}

/*
 * CPU hotplug callback: recompute ratelimit_pages through
 * writeback_set_ratelimit(), whose result depends on num_online_cpus().
 * page_writeback_init() registers it both as the "online" startup callback
 * and as the "dead" teardown callback.  @cpu is unused; always returns 0.
 */
static int page_writeback_cpu_online(unsigned int cpu)
{
        writeback_set_ratelimit();
        return 0;
}

#ifdef CONFIG_SYSCTL

/*
 * this is needed for the proc_doulongvec_minmax of vm_dirty_bytes:
 * it is the value the "dirty_bytes" entry below points .extra1 at
 * (two pages' worth of bytes).
 */
static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;

/*
 * Writeback-related sysctl knobs.  page_writeback_init() registers this
 * table under the "vm" directory.  Where set, .extra1/.extra2 supply the
 * bounds handed to the entry's handler (presumably lower/upper limits, as
 * with the *_minmax proc helpers - confirm against each handler).
 */
static struct ctl_table vm_page_writeback_sysctls[] = {
        /* Background writeback threshold as a ratio; extra1/extra2: 0..100. */
        {
                .procname   = "dirty_background_ratio",
                .data       = &dirty_background_ratio,
                .maxlen     = sizeof(dirty_background_ratio),
                .mode       = 0644,
                .proc_handler   = dirty_background_ratio_handler,
                .extra1     = SYSCTL_ZERO,
                .extra2     = SYSCTL_ONE_HUNDRED,
        },
        /* Background writeback threshold in bytes; extra1: SYSCTL_LONG_ONE. */
        {
                .procname   = "dirty_background_bytes",
                .data       = &dirty_background_bytes,
                .maxlen     = sizeof(dirty_background_bytes),
                .mode       = 0644,
                .proc_handler   = dirty_background_bytes_handler,
                .extra1     = SYSCTL_LONG_ONE,
        },
        /* Dirty threshold as a ratio; extra1/extra2: 0..100. */
        {
                .procname   = "dirty_ratio",
                .data       = &vm_dirty_ratio,
                .maxlen     = sizeof(vm_dirty_ratio),
                .mode       = 0644,
                .proc_handler   = dirty_ratio_handler,
                .extra1     = SYSCTL_ZERO,
                .extra2     = SYSCTL_ONE_HUNDRED,
        },
        /* Dirty threshold in bytes; extra1: dirty_bytes_min. */
        {
                .procname   = "dirty_bytes",
                .data       = &vm_dirty_bytes,
                .maxlen     = sizeof(vm_dirty_bytes),
                .mode       = 0644,
                .proc_handler   = dirty_bytes_handler,
                .extra1     = (void *)&dirty_bytes_min,
        },
        /* Periodic writeback interval; the handler wakes flushers on change. */
        {
                .procname   = "dirty_writeback_centisecs",
                .data       = &dirty_writeback_interval,
                .maxlen     = sizeof(dirty_writeback_interval),
                .mode       = 0644,
                .proc_handler   = dirty_writeback_centisecs_handler,
        },
        /* Dirty data expiry interval; extra1: 0, no upper bound given. */
        {
                .procname   = "dirty_expire_centisecs",
                .data       = &dirty_expire_interval,
                .maxlen     = sizeof(dirty_expire_interval),
                .mode       = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1     = SYSCTL_ZERO,
        },
#ifdef CONFIG_HIGHMEM
        /* Only built with CONFIG_HIGHMEM; extra1/extra2: 0..1. */
        {
                .procname        = "highmem_is_dirtyable",
                .data                = &vm_highmem_is_dirtyable,
                .maxlen                = sizeof(vm_highmem_is_dirtyable),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
#endif
        /* laptop_mode delay, handled by proc_dointvec_jiffies. */
        {
                .procname        = "laptop_mode",
                .data                = &laptop_mode,
                .maxlen                = sizeof(laptop_mode),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
};
#endif

/*
 * Called early on to tune the page writeback dirty limits.
 *
 * We used to scale dirty pages according to how total memory
 * related to pages that could be allocated for buffers.
 *
 * However, that was when we used "dirty_ratio" to scale with
 * all memory, and we don't do that any more. "dirty_ratio"
 * is now applied to total non-HIGHPAGE memory, and as such we can't
 * get into the old insane situation any more where we had
 * large amounts of dirty pages compared to a small amount of
 * non-HIGHMEM memory.
 *
 * But we might still want to scale the dirty_ratio by how
 * much memory the box has..
 */
void __init page_writeback_init(void)
{
        int err = wb_domain_init(&global_wb_domain, GFP_KERNEL);

        /* Failing to set up the global writeback domain is treated as fatal. */
        BUG_ON(err);

        /*
         * Recompute the ratelimit whenever the set of online CPUs changes:
         * once as the startup callback of the dynamic online state and once
         * as the teardown callback of the writeback "dead" state.
         */
        cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
                          page_writeback_cpu_online, NULL);
        cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead",
                          NULL, page_writeback_cpu_online);

#ifdef CONFIG_SYSCTL
        register_sysctl_init("vm", vm_page_writeback_sysctls);
#endif
}

/**
 * tag_pages_for_writeback - tag pages to be written by writeback
 * @mapping: address space structure to write
 * @start: starting page index
 * @end: ending page index (inclusive)
 *
 * This function scans the page range from @start to @end (inclusive) and tags
 * all pages that have DIRTY tag set with a special TOWRITE tag.  The caller
 * can then use the TOWRITE tag to identify pages eligible for writeback.
 * This mechanism is used to avoid livelocking of writeback by a process
 * steadily creating new dirty pages in the file (thus it is important for this
 * function to be quick so that it can tag pages faster than a dirtying process
 * can create them).
 */
void tag_pages_for_writeback(struct address_space *mapping,
                             pgoff_t start, pgoff_t end)
{
        XA_STATE(xas, &mapping->i_pages, start);
        unsigned int nr_tagged = 0;
        void *entry;

        xas_lock_irq(&xas);
        xas_for_each_marked(&xas, entry, end, PAGECACHE_TAG_DIRTY) {
                xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
                nr_tagged++;

                /*
                 * Every XA_CHECK_SCHED entries, pause the walk, drop the
                 * lock and offer to reschedule before carrying on.
                 */
                if (nr_tagged % XA_CHECK_SCHED == 0) {
                        xas_pause(&xas);
                        xas_unlock_irq(&xas);
                        cond_resched();
                        xas_lock_irq(&xas);
                }
        }
        xas_unlock_irq(&xas);
}
EXPORT_SYMBOL(tag_pages_for_writeback);

/*
 * Decide whether the (locked) @folio should be written back as part of @wbc.
 * Returns true when the folio still belongs to @mapping, was dirty, is not
 * under writeback any more and its dirty state was cleared for I/O here.
 */
static bool folio_prepare_writeback(struct address_space *mapping,
                struct writeback_control *wbc, struct folio *folio)
{
        /*
         * The folio was truncated or invalidated.  It can be skipped even
         * for data integrity operations: it disappeared concurrently, so
         * there can be no real expectation of this operation covering it,
         * even if a new dirty folio now sits at the same pagecache index.
         */
        if (unlikely(folio->mapping != mapping))
                return false;

        /* Somebody else already wrote it for us. */
        if (!folio_test_dirty(folio))
                return false;

        /*
         * Already under writeback: WB_SYNC_NONE skips the folio, any other
         * sync mode waits for that writeback to finish first.
         */
        if (folio_test_writeback(folio)) {
                if (wbc->sync_mode != WB_SYNC_NONE)
                        folio_wait_writeback(folio);
                else
                        return false;
        }
        BUG_ON(folio_test_writeback(folio));

        return folio_clear_dirty_for_io(folio);
}

static xa_mark_t wbc_to_tag(struct writeback_control *wbc)
{
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                return PAGECACHE_TAG_TOWRITE;
        return PAGECACHE_TAG_DIRTY;
}

static pgoff_t wbc_end(struct writeback_control *wbc)
{
        if (wbc->range_cyclic)
                return -1;
        return wbc->range_end >> PAGE_SHIFT;
}

/*
 * Return the next folio to write back, locked and prepared by
 * folio_prepare_writeback(), or NULL when the tagged lookup finds nothing
 * more.  Folios that fail preparation are unlocked and skipped.
 */
static struct folio *writeback_get_folio(struct address_space *mapping,
                struct writeback_control *wbc)
{
        struct folio *folio;

        for (;;) {
                folio = folio_batch_next(&wbc->fbatch);
                if (!folio) {
                        /* Batch exhausted: release it and look up the next one. */
                        folio_batch_release(&wbc->fbatch);
                        cond_resched();
                        filemap_get_folios_tag(mapping, &wbc->index,
                                        wbc_end(wbc), wbc_to_tag(wbc),
                                        &wbc->fbatch);
                        folio = folio_batch_next(&wbc->fbatch);
                        if (!folio)
                                return NULL;
                }

                folio_lock(folio);
                if (likely(folio_prepare_writeback(mapping, wbc, folio)))
                        break;

                /* Not eligible after all; drop the lock and try the next one. */
                folio_unlock(folio);
        }

        trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
        return folio;
}

/**
 * writeback_iter - iterate folio of a mapping for writeback
 * @mapping: address space structure to write
 * @wbc: writeback context
 * @folio: previously iterated folio (%NULL to start)
 * @error: in-out pointer for writeback errors (see below)
 *
 * This function returns the next folio for the writeback operation described by
 * @wbc on @mapping and should be called in a while loop in the ->writepages
 * implementation.
 *
 * To start the writeback operation, %NULL is passed in the @folio argument, and
 * for every subsequent iteration the folio returned previously should be passed
 * back in.
 *
 * If there was an error in the per-folio writeback inside the writeback_iter()
 * loop, @error should be set to the error value.
 *
 * Once the writeback described in @wbc has finished, this function will return
 * %NULL and if there was an error in any iteration restore it to @error.
 *
 * Note: callers should not manually break out of the loop using break or goto
 * but must keep calling writeback_iter() until it returns %NULL.
 *
 * Return: the folio to write or %NULL if the loop is done.
 */
struct folio *writeback_iter(struct address_space *mapping,
                struct writeback_control *wbc, struct folio *folio, int *error)
{
        if (!folio) {
                folio_batch_init(&wbc->fbatch);
                wbc->saved_err = *error = 0;

                /*
                 * For range cyclic writeback we remember where we stopped so
                 * that we can continue where we stopped.
                 *
                 * For non-cyclic writeback we always start at the beginning of
                 * the passed in range.
                 */
                if (wbc->range_cyclic)
                        wbc->index = mapping->writeback_index;
                else
                        wbc->index = wbc->range_start >> PAGE_SHIFT;

                /*
                 * To avoid livelocks when other processes dirty new pages, we
                 * first tag pages which should be written back and only then
                 * start writing them.
                 *
                 * For data-integrity writeback we have to be careful so that we
                 * do not miss some pages (e.g., because some other process has
                 * cleared the TOWRITE tag we set).  The rule we follow is that
                 * TOWRITE tag can be cleared only by the process clearing the
                 * DIRTY tag (and submitting the page for I/O).
                 */
                if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                        tag_pages_for_writeback(mapping, wbc->index,
                                        wbc_end(wbc));
        } else {
                wbc->nr_to_write -= folio_nr_pages(folio);

                WARN_ON_ONCE(*error > 0);

                /*
                 * For integrity writeback we have to keep going until we have
                 * written all the folios we tagged for writeback above, even if
                 * we run past wbc->nr_to_write or encounter errors.
                 * We stash away the first error we encounter in wbc->saved_err
                 * so that it can be retrieved when we're done.  This is because
                 * the file system may still have state to clear for each folio.
                 *
                 * For background writeback we exit as soon as we run past
                 * wbc->nr_to_write or encounter the first error.
                 */
                if (wbc->sync_mode == WB_SYNC_ALL) {
                        if (*error && !wbc->saved_err)
                                wbc->saved_err = *error;
                } else {
                        if (*error || wbc->nr_to_write <= 0)
                                goto done;
                }
        }

        folio = writeback_get_folio(mapping, wbc);
        if (!folio) {
                /*
                 * To avoid deadlocks between range_cyclic writeback and callers
                 * that hold pages in PageWriteback to aggregate I/O until
                 * the writeback iteration finishes, we do not loop back to the
                 * start of the file.  Doing so causes a page lock/page
                 * writeback access order inversion - we should only ever lock
                 * multiple pages in ascending page->index order, and looping
                 * back to the start of the file violates that rule and causes
                 * deadlocks.
                 */
                if (wbc->range_cyclic)
                        mapping->writeback_index = 0;

                /*
                 * Return the first error we encountered (if there was any) to
                 * the caller.
                 */
                *error = wbc->saved_err;
        }
        return folio;

done:
        if (wbc->range_cyclic)
                mapping->writeback_index = folio->index + folio_nr_pages(folio);
        folio_batch_release(&wbc->fbatch);
        return NULL;
}
EXPORT_SYMBOL_GPL(writeback_iter);

/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * Return: %0 on success, negative error code otherwise
 *
 * Note: please use writeback_iter() instead.
 */
int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
                      void *data)
{
        struct folio *folio;
        int error;

        /* writeback_iter() zeroes error on its first (NULL folio) call. */
        for (folio = writeback_iter(mapping, wbc, NULL, &error);
             folio;
             folio = writeback_iter(mapping, wbc, folio, &error)) {
                error = writepage(folio, wbc, data);

                /*
                 * AOP_WRITEPAGE_ACTIVATE is not treated as an error: the
                 * folio is unlocked here and the result reset to 0.
                 */
                if (error == AOP_WRITEPAGE_ACTIVATE) {
                        folio_unlock(folio);
                        error = 0;
                }
        }

        return error;
}
EXPORT_SYMBOL(write_cache_pages);

/*
 * Write back @mapping one folio at a time through ->writepage, inside a
 * block plug.  Each per-folio result is recorded on the mapping with
 * mapping_set_error(); the value left by writeback_iter() is returned.
 */
static int writeback_use_writepage(struct address_space *mapping,
                struct writeback_control *wbc)
{
        struct blk_plug plug;
        struct folio *folio = NULL;
        int err;

        blk_start_plug(&plug);
        for (;;) {
                folio = writeback_iter(mapping, wbc, folio, &err);
                if (!folio)
                        break;

                err = mapping->a_ops->writepage(&folio->page, wbc);
                /* AOP_WRITEPAGE_ACTIVATE: unlock here and report success. */
                if (err == AOP_WRITEPAGE_ACTIVATE) {
                        folio_unlock(folio);
                        err = 0;
                }
                mapping_set_error(mapping, err);
        }
        blk_finish_plug(&plug);

        return err;
}

int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
        int ret;
        struct bdi_writeback *wb;

        if (wbc->nr_to_write <= 0)
                return 0;
        wb = inode_to_wb_wbc(mapping->host, wbc);
        wb_bandwidth_estimate_start(wb);
        while (1) {
                if (mapping->a_ops->writepages) {
                        ret = mapping->a_ops->writepages(mapping, wbc);
                } else if (mapping->a_ops->writepage) {
                        ret = writeback_use_writepage(mapping, wbc);
                } else {
                        /* deal with chardevs and other special files */
                        ret = 0;
                }
                if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
                        break;

                /*
                 * Lacking an allocation context or the locality or writeback
                 * state of any of the inode's pages, throttle based on
                 * writeback activity on the local node. It's as good a
                 * guess as any.
                 */
                reclaim_throttle(NODE_DATA(numa_node_id()),
                        VMSCAN_THROTTLE_WRITEBACK);
        }
        /*
         * Usually few pages are written by now from those we've just submitted
         * but if there's constant writeback being submitted, this makes sure
         * writeback bandwidth is updated once in a while.
         */
        if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
                                   BANDWIDTH_INTERVAL))
                wb_update_bandwidth(wb);
        return ret;
}

/*
 * For address_spaces which do not use buffers nor write back.
 */
bool noop_dirty_folio(struct address_space *mapping, struct folio *folio)
{
        /* Already dirty: nothing was newly dirtied. */
        if (folio_test_dirty(folio))
                return false;

        /* True only if this call is the one that set the dirty flag. */
        return !folio_test_set_dirty(folio);
}
EXPORT_SYMBOL(noop_dirty_folio);

/*
 * Helper function for set_page_dirty family.
 *
 * Caller must hold folio_memcg_lock().
 *
 * NOTE: This relies on being atomic wrt interrupts.
 */
static void folio_account_dirtied(struct folio *folio,
                struct address_space *mapping)
{
        struct inode *inode = mapping->host;
        struct bdi_writeback *wb;
        long nr_pages;

        trace_writeback_dirty_folio(folio, mapping);

        /* Mappings that cannot do writeback are traced but not accounted. */
        if (!mapping_can_writeback(mapping))
                return;

        nr_pages = folio_nr_pages(folio);

        inode_attach_wb(inode, folio);
        wb = inode_to_wb(inode);

        /* Memory-management and per-wb statistics. */
        __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr_pages);
        __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr_pages);
        __node_stat_mod_folio(folio, NR_DIRTIED, nr_pages);
        wb_stat_mod(wb, WB_RECLAIMABLE, nr_pages);
        wb_stat_mod(wb, WB_DIRTIED, nr_pages);

        /* Per-task and per-CPU counters that drive dirty throttling. */
        task_io_account_write(nr_pages * PAGE_SIZE);
        current->nr_dirtied += nr_pages;
        __this_cpu_add(bdp_ratelimits, nr_pages);

        mem_cgroup_track_foreign_dirty(folio, wb);
}

/*
 * Helper function for deaccounting dirty page without writeback.
 *
 * Caller must hold folio_memcg_lock().
 */
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
{
        long nr_pages = folio_nr_pages(folio);

        /* Take the folio's pages back out of the dirty statistics ... */
        lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr_pages);
        zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr_pages);
        wb_stat_mod(wb, WB_RECLAIMABLE, -nr_pages);

        /* ... and record the bytes as a cancelled write for the task. */
        task_io_account_cancelled_write(nr_pages * PAGE_SIZE);
}

/*
 * Mark the folio dirty, and set it dirty in the page cache.
 *
 * If warn is true, then emit a warning if the folio is not uptodate and has
 * not been truncated.
 *
 * The caller must hold folio_memcg_lock().  It is the caller's
 * responsibility to prevent the folio from being truncated while
 * this function is in progress, although it may have been truncated
 * before this function is called.  Most callers have the folio locked.
 * A few have the folio blocked from truncation through other means (e.g.
 * zap_vma_pages() has it mapped and is holding the page table lock).
 * When called from mark_buffer_dirty(), the filesystem should hold a
 * reference to the buffer_head that is being marked dirty, which causes
 * try_to_free_buffers() to fail.
 */
void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
                             int warn)
{
        struct xarray *pages = &mapping->i_pages;
        unsigned long irq_flags;

        xa_lock_irqsave(pages, irq_flags);
        /* A NULL folio->mapping means truncation got here first: do nothing. */
        if (folio->mapping) {
                WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
                folio_account_dirtied(folio, mapping);
                __xa_set_mark(pages, folio_index(folio),
                                PAGECACHE_TAG_DIRTY);
        }
        xa_unlock_irqrestore(pages, irq_flags);
}

/**
 * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads.
 * @mapping: Address space this folio belongs to.
 * @folio: Folio to be marked as dirty.
 *
 * Filesystems which do not use buffer heads should call this function
 * from their dirty_folio address space operation.  It ignores the
 * contents of folio_get_private(), so if the filesystem marks individual
 * blocks as dirty, the filesystem should handle that itself.
 *
 * This is also sometimes used by filesystems which use buffer_heads when
 * a single buffer is being dirtied: we want to set the folio dirty in
 * that case, but not all the buffers.  This is a "bottom-up" dirtying,
 * whereas block_dirty_folio() is a "top-down" dirtying.
 *
 * The caller must ensure this doesn't race with truncation.  Most will
 * simply hold the folio lock, but e.g. zap_pte_range() calls with the
 * folio mapped and the pte lock held, which also locks out truncation.
 */
bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
        bool newly_dirtied;

        folio_memcg_lock(folio);
        /* Only the caller that flips the dirty flag does the accounting. */
        newly_dirtied = !folio_test_set_dirty(folio);
        if (newly_dirtied)
                __folio_mark_dirty(folio, mapping,
                                   !folio_test_private(folio));
        folio_memcg_unlock(folio);

        if (!newly_dirtied)
                return false;

        /* !PageAnon && !swapper_space */
        if (mapping->host)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

        return true;
}
EXPORT_SYMBOL(filemap_dirty_folio);

/**
 * folio_redirty_for_writepage - Decline to write a dirty folio.
 * @wbc: The writeback control.
 * @folio: The folio.
 *
 * When a writepage implementation decides that it doesn't want to write
 * @folio for some reason, it should call this function, unlock @folio and
 * return 0.
 *
 * Return: True if we redirtied the folio.  False if someone else dirtied
 * it first.
 */
bool folio_redirty_for_writepage(struct writeback_control *wbc,
                struct folio *folio)
{
        struct address_space *mapping = folio->mapping;
        struct wb_lock_cookie cookie = {};
        struct bdi_writeback *wb;
        struct inode *inode;
        long nr_pages = folio_nr_pages(folio);
        bool redirtied;

        wbc->pages_skipped += nr_pages;
        redirtied = filemap_dirty_folio(mapping, folio);

        /*
         * NOTE(review): mapping has already been handed to
         * filemap_dirty_folio() above, so the NULL test here comes late -
         * confirm whether a NULL mapping can actually reach this function.
         */
        if (!mapping || !mapping_can_writeback(mapping))
                return redirtied;

        /*
         * Subtract the folio from the "dirtied" counters again (the same
         * ones folio_account_dirtied() increments), presumably so that a
         * redirty is not counted as newly dirtied pages.
         */
        inode = mapping->host;
        wb = unlocked_inode_to_wb_begin(inode, &cookie);
        current->nr_dirtied -= nr_pages;
        node_stat_mod_folio(folio, NR_DIRTIED, -nr_pages);
        wb_stat_mod(wb, WB_DIRTIED, -nr_pages);
        unlocked_inode_to_wb_end(inode, &cookie);

        return redirtied;
}
EXPORT_SYMBOL(folio_redirty_for_writepage);

/**
 * folio_mark_dirty - Mark a folio as being modified.
 * @folio: The folio.
 *
 * The folio may not be truncated while this function is running.
 * Holding the folio lock is sufficient to prevent truncation, but some
 * callers cannot acquire a sleeping lock.  These callers instead hold
 * the page table lock for a page table which contains at least one page
 * in this folio.  Truncation will block on the page table lock as it
 * unmaps pages before removing the folio from its mapping.
 *
 * Return: True if the folio was newly dirtied, false if it was already dirty.
 */
bool folio_mark_dirty(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);

        /* Folios without a mapping are handed to noop_dirty_folio(). */
        if (unlikely(!mapping))
                return noop_dirty_folio(mapping, folio);

        /*
         * PG_readahead/PG_reclaim can be left behind by readahead or
         * folio_deactivate after a race with folio_end_writeback.
         * For readahead this is harmless: the flag is reset once the
         * folio is written.  For folio_deactivate the flag is reset when
         * the folio is redirtied, so that is harmless too.  The one
         * visible effect is that a folio used by readahead may confuse
         * readahead into restarting its size ramp-up, which is treated
         * as a trivial problem.
         */
        if (folio_test_reclaim(folio))
                folio_clear_reclaim(folio);

        /* The mapping's own dirty_folio operation does the real work. */
        return mapping->a_ops->dirty_folio(mapping, folio);
}
EXPORT_SYMBOL(folio_mark_dirty);

/*
 * set_page_dirty() is racy if the caller has no reference against
 * page->mapping->host, and if the page is unlocked.  This is because another
 * CPU could truncate the page off the mapping and then free the mapping.
 *
 * Usually, the page _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the page should be locked before running set_page_dirty().
 */
int set_page_dirty_lock(struct page *page)
{
        int dirtied;

        /* Hold the page lock for the duration of the dirtying call. */
        lock_page(page);
        dirtied = set_page_dirty(page);
        unlock_page(page);

        /* Pass set_page_dirty()'s result straight back to the caller. */
        return dirtied;
}
EXPORT_SYMBOL(set_page_dirty_lock);

/*
 * This cancels just the dirty bit on the kernel page itself, it does NOT
 * actually remove dirty bits on any mmap's that may be around. It also
 * leaves the page tagged dirty, so any sync activity will still find it on
 * the dirty lists, and in particular, clear_page_dirty_for_io() will still
 * look at the dirty bits in the VM.
 *
 * Doing this should *normally* only ever be done when a page is truncated,
 * and is not actually mapped anywhere at all. However, fs/buffer.c does
 * this when it notices that somebody has cleaned out all the buffers on a
 * page without actually doing it through the VM. Can you say "ext3 is
 * horribly ugly"? Thought you could.
 */
void __folio_cancel_dirty(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);
        struct wb_lock_cookie cookie = {};
        struct bdi_writeback *wb;
        struct inode *inode;

        /* Mapping cannot do writeback: only the folio flag is cleared. */
        if (!mapping_can_writeback(mapping)) {
                folio_clear_dirty(folio);
                return;
        }

        inode = mapping->host;

        /* The memcg lock is taken outside the wb section and dropped last. */
        folio_memcg_lock(folio);
        wb = unlocked_inode_to_wb_begin(inode, &cookie);

        /* Account the folio as cleaned only if it really was dirty. */
        if (folio_test_clear_dirty(folio))
                folio_account_cleaned(folio, wb);

        unlocked_inode_to_wb_end(inode, &cookie);
        folio_memcg_unlock(folio);
}
EXPORT_SYMBOL(__folio_cancel_dirty);

/*
 * Clear a folio's dirty flag, while caring for dirty memory accounting.
 * Returns true if the folio was previously dirty.
 *
 * This is for preparing to put the folio under writeout.  We leave
 * the folio tagged as dirty in the xarray so that a concurrent
 * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk.
 * The ->writepage implementation will run either folio_start_writeback()
 * or folio_mark_dirty(), at which stage we bring the folio's dirty flag
 * and xarray dirty tag back into sync.
 *
 * This incoherency between the folio's dirty flag and xarray tag is
 * unfortunate, but it only exists while the folio is locked.
 */
bool folio_clear_dirty_for_io(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);
        bool ret = false;

        /* The folio must be locked here; the exclusion below relies on it. */
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        if (mapping && mapping_can_writeback(mapping)) {
                struct inode *inode = mapping->host;
                struct bdi_writeback *wb;
                struct wb_lock_cookie cookie = {};

                /*
                 * Yes, Virginia, this is indeed insane.
                 *
                 * We use this sequence to make sure that
                 *  (a) we account for dirty stats properly
                 *  (b) we tell the low-level filesystem to
                 *      mark the whole folio dirty if it was
                 *      dirty in a pagetable. Only to then
                 *  (c) clean the folio again and return 1 to
                 *      cause the writeback.
                 *
                 * This way we avoid all nasty races with the
                 * dirty bit in multiple places and clearing
                 * them concurrently from different threads.
                 *
                 * Note! Normally the "folio_mark_dirty(folio)"
                 * has no effect on the actual dirty bit - since
                 * that will already usually be set. But we
                 * need the side effects, and it can help us
                 * avoid races.
                 *
                 * We basically use the folio "master dirty bit"
                 * as a serialization point for all the different
                 * threads doing their things.
                 */
                if (folio_mkclean(folio))
                        folio_mark_dirty(folio);
                /*
                 * We carefully synchronise fault handlers against
                 * installing a dirty pte and marking the folio dirty
                 * at this point.  We do this by having them hold the
                 * page lock while dirtying the folio, and folios are
                 * always locked coming in here, so we get the desired
                 * exclusion.
                 */
                wb = unlocked_inode_to_wb_begin(inode, &cookie);
                if (folio_test_clear_dirty(folio)) {
                        long nr = folio_nr_pages(folio);
                        /*
                         * The flag was set: subtract the folio's pages
                         * from the lruvec, zone and wb counters below.
                         */
                        lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
                        zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
                        wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
                        ret = true;
                }
                unlocked_inode_to_wb_end(inode, &cookie);
                return ret;
        }
        /* No writeback-capable mapping: clear the flag, no stat updates. */
        return folio_test_clear_dirty(folio);
}
EXPORT_SYMBOL(folio_clear_dirty_for_io);

/*
 * wb_inode_writeback_start - Increment @wb's writeback_inodes counter.
 * The matching decrement is done by wb_inode_writeback_end().
 */
static void wb_inode_writeback_start(struct bdi_writeback *wb)
{
        atomic_inc(&wb->writeback_inodes);
}

/*
 * wb_inode_writeback_end - Decrement @wb's writeback_inodes counter and
 * queue a delayed bandwidth update if @wb is still registered.
 */
static void wb_inode_writeback_end(struct bdi_writeback *wb)
{
        unsigned long irqflags;

        atomic_dec(&wb->writeback_inodes);

        /*
         * The writeback throughput estimate should be refreshed once
         * writeback has completed.  The refresh is deferred by
         * BANDWIDTH_INTERVAL, the same interval the other bandwidth
         * updates use for batching, so that several inodes finishing
         * writeback around the same time share one bandwidth update.
         */
        spin_lock_irqsave(&wb->work_lock, irqflags);
        if (test_bit(WB_registered, &wb->state))
                queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
        spin_unlock_irqrestore(&wb->work_lock, irqflags);
}

/*
 * __folio_end_writeback - Flip PG_writeback on @folio and update the
 * writeback tags and statistics that go with it.
 *
 * Return: the result of folio_xor_flags_has_waiters() for PG_writeback.
 */
bool __folio_end_writeback(struct folio *folio)
{
        long nr = folio_nr_pages(folio);
        struct address_space *mapping = folio_mapping(folio);
        bool ret;

        folio_memcg_lock(folio);
        if (mapping && mapping_use_writeback_tags(mapping)) {
                struct inode *inode = mapping->host;
                struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;

                /*
                 * The folio flag and the xarray mark are changed together
                 * under the i_pages lock with interrupts saved.
                 */
                xa_lock_irqsave(&mapping->i_pages, flags);
                ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
                __xa_clear_mark(&mapping->i_pages, folio_index(folio),
                                        PAGECACHE_TAG_WRITEBACK);
                if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
                        struct bdi_writeback *wb = inode_to_wb(inode);

                        wb_stat_mod(wb, WB_WRITEBACK, -nr);
                        __wb_writeout_add(wb, nr);
                        /* Nothing in this mapping is tagged any more. */
                        if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
                                wb_inode_writeback_end(wb);
                }

                /* Same "no tagged entries left" test, for the superblock. */
                if (mapping->host && !mapping_tagged(mapping,
                                                     PAGECACHE_TAG_WRITEBACK))
                        sb_clear_inode_writeback(mapping->host);

                xa_unlock_irqrestore(&mapping->i_pages, flags);
        } else {
                /* No writeback tags to maintain: only the flag changes. */
                ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
        }

        /* These counters are updated on both paths, before the unlock. */
        lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
        zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
        node_stat_mod_folio(folio, NR_WRITTEN, nr);
        folio_memcg_unlock(folio);

        return ret;
}

/*
 * __folio_start_writeback - Set PG_writeback on @folio and update the
 * writeback tags and statistics that go with it.
 * @folio: The folio; it must not already have PG_writeback set.
 * @keep_write: When false and the mapping uses writeback tags, the
 *              PAGECACHE_TAG_TOWRITE mark is cleared for @folio.
 */
void __folio_start_writeback(struct folio *folio, bool keep_write)
{
        long nr = folio_nr_pages(folio);
        struct address_space *mapping = folio_mapping(folio);
        int access_ret;

        VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);

        folio_memcg_lock(folio);
        if (mapping && mapping_use_writeback_tags(mapping)) {
                XA_STATE(xas, &mapping->i_pages, folio_index(folio));
                struct inode *inode = mapping->host;
                struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
                bool on_wblist;

                xas_lock_irqsave(&xas, flags);
                /*
                 * NOTE(review): presumably walks xas to the folio's slot so
                 * the mark operations below act on it - confirm.
                 */
                xas_load(&xas);
                folio_test_set_writeback(folio);

                /*
                 * Sampled before this folio's own mark is set, so it tells
                 * whether the mapping already had writeback-tagged entries.
                 */
                on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);

                xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
                if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
                        struct bdi_writeback *wb = inode_to_wb(inode);

                        wb_stat_mod(wb, WB_WRITEBACK, nr);
                        if (!on_wblist)
                                wb_inode_writeback_start(wb);
                }

                /*
                 * We can come through here when swapping anonymous
                 * folios, so we don't necessarily have an inode to
                 * track for sync.
                 */
                if (mapping->host && !on_wblist)
                        sb_mark_inode_writeback(mapping->host);
                /* Drop the dirty mark only if the folio flag is clear. */
                if (!folio_test_dirty(folio))
                        xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
                if (!keep_write)
                        xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
                xas_unlock_irqrestore(&xas, flags);
        } else {
                /* No writeback tags to maintain: only the flag is set. */
                folio_test_set_writeback(folio);
        }

        /* These counters are updated on both paths, before the unlock. */
        lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
        zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
        folio_memcg_unlock(folio);

        access_ret = arch_make_folio_accessible(folio);
        /*
         * If writeback has been triggered on a page that cannot be made
         * accessible, it is too late to recover here.
         */
        VM_BUG_ON_FOLIO(access_ret != 0, folio);
}
EXPORT_SYMBOL(__folio_start_writeback);

/**
 * folio_wait_writeback - Wait for a folio to finish writeback.
 * @folio: The folio to wait for.
 *
 * If the folio is currently being written back to storage, wait for the
 * I/O to complete.
 *
 * Context: Sleeps.  Must be called in process context and with
 * no spinlocks held.  Caller should hold a reference on the folio.
 * If the folio is not locked, writeback may start again after writeback
 * has finished.
 */
void folio_wait_writeback(struct folio *folio)
{
        /* Re-test the flag after every wait; return once it is clear. */
        for (;;) {
                if (!folio_test_writeback(folio))
                        return;

                trace_folio_wait_writeback(folio, folio_mapping(folio));
                folio_wait_bit(folio, PG_writeback);
        }
}
EXPORT_SYMBOL_GPL(folio_wait_writeback);

/**
 * folio_wait_writeback_killable - Wait for a folio to finish writeback.
 * @folio: The folio to wait for.
 *
 * If the folio is currently being written back to storage, wait for the
 * I/O to complete or a fatal signal to arrive.
 *
 * Context: Sleeps.  Must be called in process context and with
 * no spinlocks held.  Caller should hold a reference on the folio.
 * If the folio is not locked, writeback may start again after writeback
 * has finished.
 * Return: 0 on success, -EINTR if we get a fatal signal while waiting.
 */
int folio_wait_writeback_killable(struct folio *folio)
{
        int err = 0;

        /*
         * Keep waiting while PG_writeback is set.  A non-zero result from
         * the killable wait ends the loop at once with -EINTR; the flag is
         * not tested again in that case.
         */
        while (!err && folio_test_writeback(folio)) {
                trace_folio_wait_writeback(folio, folio_mapping(folio));
                if (folio_wait_bit_killable(folio, PG_writeback))
                        err = -EINTR;
        }

        return err;
}
EXPORT_SYMBOL_GPL(folio_wait_writeback_killable);

/**
 * folio_wait_stable() - wait for writeback to finish, if necessary.
 * @folio: The folio to wait on.
 *
 * This function determines if the given folio is related to a backing
 * device that requires folio contents to be held stable during writeback.
 * If so, then it will wait for any pending writeback to complete.
 *
 * Context: Sleeps.  Must be called in process context and with
 * no spinlocks held.  Caller should hold a reference on the folio.
 * If the folio is not locked, writeback may start again after writeback
 * has finished.
 */
void folio_wait_stable(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);

        /* Nothing to wait for unless the mapping asks for stable writes. */
        if (!mapping_stable_writes(mapping))
                return;

        folio_wait_writeback(folio);
}
EXPORT_SYMBOL_GPL(folio_wait_stable);






























































































































































































    1 



    1 


    1 
    1 



















































    1 






    1 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
























    1 







































































    1 


























    1 











    1 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 */

#include <linux/fiemap.h>
#include <linux/fs.h>
#include <linux/minmax.h>
#include <linux/vmalloc.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"
#ifdef CONFIG_NTFS3_LZX_XPRESS
#include "lib/lib.h"
#endif

/*
 * ni_ins_mi - Search a tree of mft_inode by record number, optionally inserting.
 *
 * Descends @tree comparing every visited node's 'rno' against @ino.
 * If an equal node is found it is returned and nothing is inserted.
 * Otherwise, when @ins is not NULL, @ins is linked at the position
 * where the search ended and the tree is rebalanced.
 *
 * Return: The matching (or just inserted) mft_inode, or NULL when nothing
 * matched and @ins is NULL.
 */
static struct mft_inode *ni_ins_mi(struct ntfs_inode *ni, struct rb_root *tree,
                                   CLST ino, struct rb_node *ins)
{
        struct rb_node **link = &tree->rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct mft_inode *cur = rb_entry(*link, struct mft_inode, node);

                if (cur->rno == ino)
                        return cur;

                /* Remember the parent and step to the proper child slot. */
                parent = *link;
                link = cur->rno > ino ? &parent->rb_left : &parent->rb_right;
        }

        if (ins) {
                rb_link_node(ins, parent, link);
                rb_insert_color(ins, tree);
                return rb_entry(ins, struct mft_inode, node);
        }

        /* Pure lookup and nothing found. */
        return NULL;
}

/*
 * ni_find_mi - Find an already attached mft_inode by record number.
 *
 * Return: The mft_inode from ni->mi_tree whose number is @rno, or NULL.
 */
static struct mft_inode *ni_find_mi(struct ntfs_inode *ni, CLST rno)
{
        struct rb_root *tree = &ni->mi_tree;

        /* A NULL node asks ni_ins_mi() to search only, never to insert. */
        return ni_ins_mi(ni, tree, rno, NULL);
}

/*
 * ni_add_mi - Attach a mft_inode to the subrecord tree of ntfs_inode.
 *
 * The result of ni_ins_mi() is ignored: if a record with the same number
 * is already in the tree, @mi is not linked.
 */
static void ni_add_mi(struct ntfs_inode *ni, struct mft_inode *mi)
{
        struct rb_node *node = &mi->node;

        ni_ins_mi(ni, &ni->mi_tree, mi->rno, node);
}

/*
 * ni_remove_mi - Detach a mft_inode from the subrecord tree of ntfs_inode.
 *
 * Only unlinks @mi from ni->mi_tree; releasing @mi is left to the caller.
 */
void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi)
{
        struct rb_root *tree = &ni->mi_tree;

        rb_erase(&mi->node, tree);
}

/*
 * ni_std
 *
 * Return: Pointer to the ATTR_STD_INFO payload of the unnamed ATTR_STD
 * attribute in the primary record, or NULL if the attribute is missing
 * (or resident_data_ex() rejects it for the requested size).
 */
struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni)
{
        const struct ATTRIB *attr =
                mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL);

        if (!attr)
                return NULL;

        return resident_data_ex(attr, sizeof(struct ATTR_STD_INFO));
}

/*
 * ni_std5
 *
 * Same lookup as ni_std() but asks for the larger ATTR_STD_INFO5 layout.
 *
 * Return: Pointer to the ATTR_STD_INFO5 payload of the unnamed ATTR_STD
 * attribute in the primary record, or NULL if the attribute is missing
 * (or resident_data_ex() rejects it for the requested size).
 */
struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni)
{
        const struct ATTRIB *attr =
                mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL);

        if (!attr)
                return NULL;

        return resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5));
}

/*
 * ni_clear - Release everything that ntfs_inode holds.
 *
 * Order of work:
 *  - if the inode has no links left and its primary record is still in use,
 *    delete it via ni_delete_all();
 *  - destroy the attribute list;
 *  - unlink and put every subrecord of ni->mi_tree;
 *  - drop directory index or file run data (selected by NI_FLAG_DIR);
 *  - clear the primary record.
 */
void ni_clear(struct ntfs_inode *ni)
{
        struct rb_node *node;

        if (!ni->vfs_inode.i_nlink && ni->mi.mrec && is_rec_inuse(ni->mi.mrec))
                ni_delete_all(ni);

        al_destroy(ni);

        node = rb_first(&ni->mi_tree);
        while (node) {
                struct mft_inode *sub = rb_entry(node, struct mft_inode, node);
                /* Fetch the successor before the node is erased. */
                struct rb_node *following = rb_next(node);

                rb_erase(node, &ni->mi_tree);
                mi_put(sub);
                node = following;
        }

        /* Bad inode always has mode == S_IFREG. */
        if (ni->ni_flags & NI_FLAG_DIR) {
                indx_clear(&ni->dir);
        } else {
                run_close(&ni->file.run);
#ifdef CONFIG_NTFS3_LZX_XPRESS
                /* The page for offsets is allocated on demand; may be absent. */
                if (ni->file.offs_page) {
                        put_page(ni->file.offs_page);
                        ni->file.offs_page = NULL;
                }
#endif
        }

        mi_clear(&ni->mi);
}

/*
 * ni_load_mi_ex - Get the subrecord with number @rno, loading it if needed.
 *
 * Looks in ni->mi_tree first. If the record is not attached yet it is
 * obtained with mi_get() and added to the tree.
 *
 * @mi may be NULL when the caller only wants the record to be loaded.
 *
 * Return: 0 on success, otherwise the error returned by mi_get().
 */
int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi)
{
        struct mft_inode *rec = ni_find_mi(ni, rno);

        if (!rec) {
                int err = mi_get(ni->mi.sbi, rno, &rec);

                if (err)
                        return err;

                ni_add_mi(ni, rec);
        }

        if (mi)
                *mi = rec;
        return 0;
}

/*
 * ni_load_mi - Load the mft_inode that an attribute list entry refers to.
 *
 * A NULL @le, or an entry that references the primary record itself,
 * resolves to &ni->mi without any loading.
 *
 * Return: 0 on success, otherwise the error from ni_load_mi_ex().
 */
int ni_load_mi(struct ntfs_inode *ni, const struct ATTR_LIST_ENTRY *le,
               struct mft_inode **mi)
{
        CLST rno;

        if (le) {
                rno = ino_get(&le->ref);
                if (rno != ni->mi.rno)
                        return ni_load_mi_ex(ni, rno, mi);
        }

        /* No list entry, or the entry points at the primary record. */
        *mi = &ni->mi;
        return 0;
}

/*
 * ni_find_attr
 *
 * Without an attribute list, or for unnamed ATTR_LIST/ATTR_STD, the search
 * is done in the primary record only (continuing after @attr).
 * Otherwise the attribute list is consulted (starting from *@le_o if given),
 * the referenced record is loaded and the attribute is looked up there by
 * the id stored in the list entry.
 *
 * The found attribute is then checked against @vcn:
 *  - resident attribute: @vcn must be NULL or point to 0;
 *  - non resident, @vcn == NULL: the segment must start at VCN 0;
 *  - non resident, @vcn != NULL: *@vcn must lie within [svcn, evcn].
 * If the attribute is missing or fails the check, an error is logged and
 * the volume is flagged NTFS_DIRTY_ERROR.
 *
 * Return: Attribute and record this attribute belongs to (via @mi),
 * or NULL.
 */
struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr,
                            struct ATTR_LIST_ENTRY **le_o, enum ATTR_TYPE type,
                            const __le16 *name, u8 name_len, const CLST *vcn,
                            struct mft_inode **mi)
{
        struct ATTR_LIST_ENTRY *le;
        struct mft_inode *m;
        bool ok;

        if (!ni->attr_list.size ||
            (!name_len && (type == ATTR_LIST || type == ATTR_STD))) {
                if (le_o)
                        *le_o = NULL;
                if (mi)
                        *mi = &ni->mi;

                /* The required attribute can only be in primary record. */
                return mi_find_attr(&ni->mi, attr, type, name, name_len, NULL);
        }

        /* Start with the list entry of the required type. */
        le = le_o ? *le_o : NULL;
        le = al_find_ex(ni, le, type, name, name_len, vcn);
        if (!le)
                return NULL;

        if (le_o)
                *le_o = le;

        /* Bring in the record the entry refers to. */
        if (ni_load_mi(ni, le, &m))
                return NULL;

        attr = mi_find_attr(m, NULL, type, name, name_len, &le->id);

        /* Validate what was found against the requested VCN. */
        if (!attr)
                ok = false;
        else if (!attr->non_res)
                ok = !(vcn && *vcn);
        else if (!vcn)
                ok = !attr->nres.svcn;
        else
                ok = le64_to_cpu(attr->nres.svcn) <= *vcn &&
                     *vcn <= le64_to_cpu(attr->nres.evcn);

        if (!ok) {
                ntfs_inode_err(&ni->vfs_inode, "failed to parse mft record");
                ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
                return NULL;
        }

        if (mi)
                *mi = m;
        return attr;
}

/*
 * ni_enum_attr_ex - Step to the next attribute of ntfs_inode.
 *
 * Pass @attr == NULL to start the enumeration. @le is the caller's cursor
 * into the attribute list and is always written; @mi (optional) receives
 * the record the returned attribute lives in.
 *
 * Return: Next attribute or NULL when there is none (or its record cannot
 * be loaded).
 */
struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr,
                               struct ATTR_LIST_ENTRY **le,
                               struct mft_inode **mi)
{
        struct mft_inode *rec;
        struct ATTR_LIST_ENTRY *entry;

        if (ni->attr_list.size) {
                /* Advance the list cursor (restart it if @attr is NULL). */
                entry = al_enumerate(ni, attr ? *le : NULL);
                *le = entry;
                if (!entry)
                        return NULL;

                /* The record that holds the attribute must be in memory. */
                if (ni_load_mi(ni, entry, &rec))
                        return NULL;

                if (mi)
                        *mi = rec;

                return rec_find_attr_le(rec, entry);
        }

        /* No attribute list: everything is in the primary record. */
        *le = NULL;
        if (mi)
                *mi = &ni->mi;
        return mi_enum_attr(&ni->mi, attr);
}

/*
 * ni_load_attr - Load the attribute segment that contains the given VCN.
 *
 * @pmi (optional) receives the record that was searched.
 *
 * Return: The attribute, or NULL if there is no such attribute, its record
 * cannot be loaded, or a non resident segment does not cover @vcn.
 */
struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
                            const __le16 *name, u8 name_len, CLST vcn,
                            struct mft_inode **pmi)
{
        struct ATTR_LIST_ENTRY *le;
        struct ATTR_LIST_ENTRY *next;
        struct mft_inode *mi;
        struct ATTRIB *attr;

        if (!ni->attr_list.size) {
                /* Single record inode. */
                if (pmi)
                        *pmi = &ni->mi;
                return mi_find_attr(&ni->mi, NULL, type, name, name_len, NULL);
        }

        le = al_find_ex(ni, NULL, type, name, name_len, NULL);
        if (!le)
                return NULL;

        /*
         * A list entry stores only the first VCN of its segment, so walk
         * forward while the following entry still starts at or before 'vcn'.
         * 'le' ends up at the last such entry.
         */
        if (vcn) {
                for (;;) {
                        next = al_find_ex(ni, le, type, name, name_len, NULL);
                        if (!next)
                                break;
                        if (le64_to_cpu(next->vcn) > vcn)
                                break;
                        le = next;
                }
        }

        if (ni_load_mi(ni, le, &mi))
                return NULL;

        if (pmi)
                *pmi = mi;

        attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id);
        if (!attr)
                return NULL;

        /* A non resident segment must really cover the requested VCN. */
        if (attr->non_res &&
            (vcn < le64_to_cpu(attr->nres.svcn) ||
             vcn > le64_to_cpu(attr->nres.evcn)))
                return NULL;

        return attr;
}

/*
 * ni_load_all_mi - Load every subrecord referenced by the attribute list.
 *
 * Return: 0 on success (also when there is no attribute list), otherwise
 * the first error from ni_load_mi_ex().
 */
int ni_load_all_mi(struct ntfs_inode *ni)
{
        struct ATTR_LIST_ENTRY *le;

        if (!ni->attr_list.size)
                return 0;

        for (le = al_enumerate(ni, NULL); le; le = al_enumerate(ni, le)) {
                CLST rno = ino_get(&le->ref);
                int err;

                /* Entries of the primary record need no loading. */
                if (rno != ni->mi.rno) {
                        err = ni_load_mi_ex(ni, rno, NULL);
                        if (err)
                                return err;
                }
        }

        return 0;
}

/*
 * ni_add_subrecord - Allocate + format + attach a new subrecord.
 *
 * The new record gets number @rno, its parent reference is set to the
 * primary record and it is added to ni->mi_tree.
 *
 * Return: true and the record in *@mi on success; false if the allocation
 * or mi_format_new() fails.
 */
bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi)
{
        struct mft_inode *sub = kzalloc(sizeof(*sub), GFP_NOFS);
        bool is_mft = ni->mi.rno == MFT_REC_MFT;

        if (!sub)
                return false;

        if (mi_format_new(sub, ni->mi.sbi, rno, 0, is_mft)) {
                mi_put(sub);
                return false;
        }

        /* Make the subrecord point back at its base record. */
        mi_get_ref(&ni->mi, &sub->mrec->parent_ref);

        ni_add_mi(ni, sub);
        *mi = sub;
        return true;
}

/*
 * ni_remove_attr - Remove all attributes for the given type/name/id.
 *
 * @base_only: Look only in the primary record, ignore the attribute list.
 * @id:        Optional attribute id; NULL matches any id.
 *
 * With @base_only, for ATTR_LIST, or when the inode has no attribute list,
 * a single attribute is removed from the primary record.
 * Otherwise the attribute list is walked and every matching entry is
 * removed together with the attribute it describes.
 *
 * Return: 0 on success (in the list case also when nothing matched),
 * -ENOENT if the attribute is not found in the record, or the error
 * from ni_load_mi().
 */
int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
                   const __le16 *name, u8 name_len, bool base_only,
                   const __le16 *id)
{
        int err;
        struct ATTRIB *attr;
        struct ATTR_LIST_ENTRY *le;
        struct mft_inode *mi;
        u32 type_in;
        int diff;

        if (base_only || type == ATTR_LIST || !ni->attr_list.size) {
                attr = mi_find_attr(&ni->mi, NULL, type, name, name_len, id);
                if (!attr)
                        return -ENOENT;

                mi_remove_attr(ni, &ni->mi, attr);
                return 0;
        }

        type_in = le32_to_cpu(type);
        le = NULL;

        for (;;) {
                /* 'continue' below lands here: step to the next entry. */
                le = al_enumerate(ni, le);
                if (!le)
                        return 0;

next_le2:
                /*
                 * Stop at the first entry with a greater type.
                 * Presumably the list is sorted by type - TODO confirm.
                 */
                diff = le32_to_cpu(le->type) - type_in;
                if (diff < 0)
                        continue;

                if (diff > 0)
                        return 0;

                if (le->name_len != name_len)
                        continue;

                if (name_len &&
                    memcmp(le_name(le), name, name_len * sizeof(short)))
                        continue;

                if (id && le->id != *id)
                        continue;
                /* Load the record before the entry that names it is removed. */
                err = ni_load_mi(ni, le, &mi);
                if (err)
                        return err;

                al_remove_le(ni, le);

                attr = mi_find_attr(mi, NULL, type, name, name_len, id);
                if (!attr)
                        return -ENOENT;

                mi_remove_attr(ni, mi, attr);

                /*
                 * 'le' is reused without calling al_enumerate() again.
                 * NOTE(review): this assumes al_remove_le() moved the
                 * following entry into the place of the removed one - confirm.
                 * If 'le' is now at/after the end of the list we are done.
                 */
                if (PtrOffset(ni->attr_list.le, le) >= ni->attr_list.size)
                        return 0;
                goto next_le2;
        }
}

/*
 * ni_ins_new_attr - Insert the attribute into record.
 *
 * If the inode has an attribute list and the caller did not supply a list
 * entry (@le == NULL), a new entry is added first (never for ATTR_LIST
 * itself). When the following insert into @mi fails, that freshly added
 * entry is removed again.
 *
 * @ins_le (optional) receives the list entry that describes the attribute
 * (may be NULL).
 *
 * Return: Not full constructed attribute, NULL if not possible to create,
 * or ERR_PTR() with the error from al_add_le().
 */
static struct ATTRIB *
ni_ins_new_attr(struct ntfs_inode *ni, struct mft_inode *mi,
                struct ATTR_LIST_ENTRY *le, enum ATTR_TYPE type,
                const __le16 *name, u8 name_len, u32 asize, u16 name_off,
                CLST svcn, struct ATTR_LIST_ENTRY **ins_le)
{
        struct MFT_REF ref;
        struct ATTRIB *attr;
        bool le_added = false;

        mi_get_ref(mi, &ref);

        if (!le && type != ATTR_LIST && ni->attr_list.size) {
                int err = al_add_le(ni, type, name, name_len, svcn,
                                    cpu_to_le16(-1), &ref, &le);

                /* No memory or no space. */
                if (err)
                        return ERR_PTR(err);

                le_added = true;

                /*
                 * al_add_le -> attr_set_size (list) -> ni_expand_list
                 * may move attributes out of the primary record, so the
                 * caller's 'name' may point into moved memory.
                 * Take the name from the new list entry instead.
                 */
                name = le->name;
        }

        attr = mi_insert_attr(mi, type, name, name_len, asize, name_off);
        if (!attr) {
                /* Undo the list entry added above. */
                if (le_added)
                        al_remove_le(ni, le);
                return NULL;
        }

        /* Attr list itself is not in list entry array. */
        if (le && type != ATTR_LIST) {
                /* Update ATTRIB Id and record reference. */
                le->id = attr->id;
                ni->attr_list.dirty = true;
                le->ref = ref;
        }

        if (ins_le)
                *ins_le = le;
        return attr;
}

/*
 * ni_repack
 *
 * Random write access to a sparse or compressed file may leave the runs
 * packed in a non optimal way (spread over more attribute segments than
 * necessary). This is the place to optimize it.
 *
 * NOTE: The implementation is currently compiled out ('#if 1' below),
 * so the function does nothing and always returns 0.
 *
 * The disabled code enumerates all non resident attribute segments and,
 * for each pair of neighbouring segments, tries to pack the runs of both
 * into the first one, removing or shrinking the second.
 *
 * Return: 0 (disabled code: 0 or a negative error).
 */
static int ni_repack(struct ntfs_inode *ni)
{
#if 1
        return 0;
#else
        int err = 0;
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        struct mft_inode *mi, *mi_p = NULL;
        struct ATTRIB *attr = NULL, *attr_p;
        struct ATTR_LIST_ENTRY *le = NULL, *le_p;
        CLST alloc = 0;
        u8 cluster_bits = sbi->cluster_bits;
        CLST svcn, evcn = 0, svcn_p, evcn_p, next_svcn;
        u32 roff, rs = sbi->record_size;
        struct runs_tree run;

        run_init(&run);

        /* '*_p' variables describe the previous (pending) segment. */
        while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi))) {
                if (!attr->non_res)
                        continue;

                /* List entry and attribute must agree on the start VCN. */
                svcn = le64_to_cpu(attr->nres.svcn);
                if (svcn != le64_to_cpu(le->vcn)) {
                        err = -EINVAL;
                        break;
                }

                if (!svcn) {
                        /* First segment of an attribute: restart. */
                        alloc = le64_to_cpu(attr->nres.alloc_size) >>
                                cluster_bits;
                        mi_p = NULL;
                } else if (svcn != evcn + 1) {
                        /* Segments must follow each other without gaps. */
                        err = -EINVAL;
                        break;
                }

                evcn = le64_to_cpu(attr->nres.evcn);

                if (svcn > evcn + 1) {
                        err = -EINVAL;
                        break;
                }

                if (!mi_p) {
                        /* Do not try if not enough free space. */
                        if (le32_to_cpu(mi->mrec->used) + 8 >= rs)
                                continue;

                        /* Do not try if last attribute segment. */
                        if (evcn + 1 == alloc)
                                continue;
                        run_close(&run);
                }

                roff = le16_to_cpu(attr->nres.run_off);

                if (roff > le32_to_cpu(attr->size)) {
                        err = -EINVAL;
                        break;
                }

                err = run_unpack(&run, sbi, ni->mi.rno, svcn, evcn, svcn,
                                 Add2Ptr(attr, roff),
                                 le32_to_cpu(attr->size) - roff);
                if (err < 0)
                        break;

                if (!mi_p) {
                        /* Remember this segment and look at the next one. */
                        mi_p = mi;
                        attr_p = attr;
                        svcn_p = svcn;
                        evcn_p = evcn;
                        le_p = le;
                        err = 0;
                        continue;
                }

                /*
                 * Run contains data from two records: mi_p and mi
                 * Try to pack in one.
                 */
                err = mi_pack_runs(mi_p, attr_p, &run, evcn + 1 - svcn_p);
                if (err)
                        break;

                next_svcn = le64_to_cpu(attr_p->nres.evcn) + 1;

                if (next_svcn >= evcn + 1) {
                        /* We can remove this attribute segment. */
                        al_remove_le(ni, le);
                        mi_remove_attr(NULL, mi, attr);
                        le = le_p;
                        continue;
                }

                /* Only a part fit: current segment now starts later. */
                attr->nres.svcn = le->vcn = cpu_to_le64(next_svcn);
                mi->dirty = true;
                ni->attr_list.dirty = true;

                if (evcn + 1 == alloc) {
                        /* Last segment: repack the remainder in place. */
                        err = mi_pack_runs(mi, attr, &run,
                                           evcn + 1 - next_svcn);
                        if (err)
                                break;
                        mi_p = NULL;
                } else {
                        /* Current segment becomes the pending one. */
                        mi_p = mi;
                        attr_p = attr;
                        svcn_p = next_svcn;
                        evcn_p = evcn;
                        le_p = le;
                        run_truncate_head(&run, next_svcn);
                }
        }

        if (err) {
                ntfs_inode_warn(&ni->vfs_inode, "repack problem");
                ntfs_set_state(sbi, NTFS_DIRTY_ERROR);

                /* Pack loaded but not packed runs. */
                if (mi_p)
                        mi_pack_runs(mi_p, attr_p, &run, evcn_p + 1 - svcn_p);
        }

        run_close(&run);
        return err;
#endif
}

/*
 * ni_try_remove_attr_list
 *
 * Can we remove attribute list?
 * Check the case when primary record contains enough space for all attributes.
 *
 * Nothing is done unless the attribute list is marked dirty.
 * The work is split into passes over the attribute list:
 *  1. check that every attribute living in a subrecord is loaded, is a
 *     first segment (vcn == 0) and fits into the primary record once the
 *     attribute list itself is gone;
 *  2. remove the attribute list and copy those attributes into the primary
 *     record (a backup of the record allows to roll back);
 *  3. remove the originals from the subrecords and drop the in-memory list.
 *
 * Return: The error from ni_repack(); otherwise always 0, because failing
 * to remove the list is not treated as an error.
 */
static int ni_try_remove_attr_list(struct ntfs_inode *ni)
{
        int err = 0;
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        struct ATTRIB *attr, *attr_list, *attr_ins;
        struct ATTR_LIST_ENTRY *le;
        struct mft_inode *mi;
        u32 asize, free;
        struct MFT_REF ref;
        struct MFT_REC *mrec;
        __le16 id;

        if (!ni->attr_list.dirty)
                return 0;

        err = ni_repack(ni);
        if (err)
                return err;

        attr_list = mi_find_attr(&ni->mi, NULL, ATTR_LIST, NULL, 0, NULL);
        if (!attr_list)
                return 0;

        asize = le32_to_cpu(attr_list->size);

        /* Free space in primary record without attribute list. */
        free = sbi->record_size - le32_to_cpu(ni->mi.mrec->used) + asize;
        /* 'ref' identifies list entries that already are in primary record. */
        mi_get_ref(&ni->mi, &ref);

        /* Pass 1: only check, nothing is modified. */
        le = NULL;
        while ((le = al_enumerate(ni, le))) {
                if (!memcmp(&le->ref, &ref, sizeof(ref)))
                        continue;

                /* Not a first segment: attribute is split, keep the list. */
                if (le->vcn)
                        return 0;

                /* Only already loaded subrecords are considered. */
                mi = ni_find_mi(ni, ino_get(&le->ref));
                if (!mi)
                        return 0;

                attr = mi_find_attr(mi, NULL, le->type, le_name(le),
                                    le->name_len, &le->id);
                if (!attr)
                        return 0;

                asize = le32_to_cpu(attr->size);
                if (asize > free)
                        return 0;

                free -= asize;
        }

        /* Make a copy of primary record to restore if error. */
        mrec = kmemdup(ni->mi.mrec, sbi->record_size, GFP_NOFS);
        if (!mrec)
                return 0; /* Not critical. */

        /* It seems that attribute list can be removed from primary record. */
        mi_remove_attr(NULL, &ni->mi, attr_list);

        /*
         * Pass 2:
         * Repeat the cycle above and copy all attributes to primary record.
         * Do not remove original attributes from subrecords!
         * It should be success!
         */
        le = NULL;
        while ((le = al_enumerate(ni, le))) {
                if (!memcmp(&le->ref, &ref, sizeof(ref)))
                        continue;

                mi = ni_find_mi(ni, ino_get(&le->ref));
                if (!mi) {
                        /* Should never happen, because already checked. */
                        goto out;
                }

                attr = mi_find_attr(mi, NULL, le->type, le_name(le),
                                    le->name_len, &le->id);
                if (!attr) {
                        /* Should never happen, because already checked. */
                        goto out;
                }
                asize = le32_to_cpu(attr->size);

                /* Insert into primary record. */
                attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le),
                                          le->name_len, asize,
                                          le16_to_cpu(attr->name_off));
                if (!attr_ins) {
                        /*
                         * No space in primary record (already checked).
                         */
                        goto out;
                }

                /* Copy all except id: keep the id assigned by the insert. */
                id = attr_ins->id;
                memcpy(attr_ins, attr, asize);
                attr_ins->id = id;
        }

        /*
         * Pass 3:
         * Repeat the cycle above and remove all attributes from subrecords.
         */
        le = NULL;
        while ((le = al_enumerate(ni, le))) {
                if (!memcmp(&le->ref, &ref, sizeof(ref)))
                        continue;

                mi = ni_find_mi(ni, ino_get(&le->ref));
                if (!mi)
                        continue;

                attr = mi_find_attr(mi, NULL, le->type, le_name(le),
                                    le->name_len, &le->id);
                if (!attr)
                        continue;

                /* Remove from original record. */
                mi_remove_attr(NULL, mi, attr);
        }

        /* Release the run of the list and the in-memory copy of the list. */
        run_deallocate(sbi, &ni->attr_list.run, true);
        run_close(&ni->attr_list.run);
        ni->attr_list.size = 0;
        kvfree(ni->attr_list.le);
        ni->attr_list.le = NULL;
        ni->attr_list.dirty = false;

        kfree(mrec);
        return 0;
out:
        /*
         * Restore primary record: the saved copy becomes ni->mi.mrec again
         * and the partially modified buffer is freed.
         */
        swap(mrec, ni->mi.mrec);
        kfree(mrec);
        return 0;
}

/*
 * ni_create_attr_list - Generates an attribute list for this primary record.
 *
 * Builds an in-memory array of ATTR_LIST_ENTRY, one entry per attribute
 * currently stored in the primary record. If the primary record cannot
 * hold the resident ATTR_LIST attribute in addition to its content, some
 * attributes are moved into a newly allocated child record. Finally the
 * resident ATTR_LIST attribute is inserted into the primary record.
 *
 * Unnamed ATTR_STD, unnamed ATTR_LIST and (for $MFT) unnamed ATTR_DATA
 * are never candidates for moving.
 *
 * Return: 0 on success; -ENOMEM if the list buffer cannot be allocated;
 * -EINVAL if not enough space can be freed or an insert/remove fails;
 * or the error from ntfs_look_free_mft().
 * On failure ni->attr_list.le is freed and ni->attr_list.size is reset.
 */
int ni_create_attr_list(struct ntfs_inode *ni)
{
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        int err;
        u32 lsize;
        struct ATTRIB *attr;
        struct ATTRIB *arr_move[7];
        struct ATTR_LIST_ENTRY *le, *le_b[7];
        struct MFT_REC *rec;
        bool is_mft;
        CLST rno = 0;
        struct mft_inode *mi;
        u32 free_b, nb, to_free, rs;
        u16 sz;

        is_mft = ni->mi.rno == MFT_REC_MFT;
        rec = ni->mi.mrec;
        rs = sbi->record_size;

        /*
         * Skip estimating exact memory requirement.
         * Looks like one record_size is always enough.
         *
         * Use a zeroed buffer: the loop below fills only selected fields
         * of each entry, yet 'lsize' bytes of this buffer are later copied
         * into the resident attribute. Any byte that is not written
         * explicitly (e.g. padding, if le_size() rounds the entry size up)
         * must not carry stale heap content.
         */
        le = kzalloc(al_aligned(rs), GFP_NOFS);
        if (!le)
                return -ENOMEM;

        /* The first entry keeps the reference to the primary record. */
        mi_get_ref(&ni->mi, &le->ref);
        ni->attr_list.le = le;

        nb = 0;
        free_b = 0;
        attr = NULL;

        for (; (attr = mi_enum_attr(&ni->mi, attr)); le = Add2Ptr(le, sz)) {
                sz = le_size(attr->name_len);
                le->type = attr->type;
                le->size = cpu_to_le16(sz);
                le->name_len = attr->name_len;
                le->name_off = offsetof(struct ATTR_LIST_ENTRY, name);
                le->vcn = 0;
                if (le != ni->attr_list.le)
                        le->ref = ni->attr_list.le->ref;
                le->id = attr->id;

                /* Named attributes are always candidates for moving. */
                if (attr->name_len)
                        memcpy(le->name, attr_name(attr),
                               sizeof(short) * attr->name_len);
                else if (attr->type == ATTR_STD)
                        continue;
                else if (attr->type == ATTR_LIST)
                        continue;
                else if (is_mft && attr->type == ATTR_DATA)
                        continue;

                /* Remember up to ARRAY_SIZE(arr_move) movable attributes. */
                if (nb < ARRAY_SIZE(arr_move)) {
                        le_b[nb] = le;
                        arr_move[nb++] = attr;
                        free_b += le32_to_cpu(attr->size);
                }
        }

        lsize = PtrOffset(ni->attr_list.le, le);
        ni->attr_list.size = lsize;

        /* How many bytes must leave the primary record to fit the list? */
        to_free = le32_to_cpu(rec->used) + lsize + SIZEOF_RESIDENT;
        if (to_free <= rs) {
                to_free = 0;
        } else {
                to_free -= rs;

                if (to_free > free_b) {
                        err = -EINVAL;
                        goto out;
                }
        }

        /* Allocate child MFT. */
        err = ntfs_look_free_mft(sbi, &rno, is_mft, ni, &mi);
        if (err)
                goto out;

        err = -EINVAL;
        /* Call mi_remove_attr() in reverse order to keep pointers 'arr_move' valid. */
        while (to_free > 0) {
                struct ATTRIB *b = arr_move[--nb];
                u32 asize = le32_to_cpu(b->size);
                u16 name_off = le16_to_cpu(b->name_off);

                attr = mi_insert_attr(mi, b->type, Add2Ptr(b, name_off),
                                      b->name_len, asize, name_off);
                if (!attr)
                        goto out;

                /* The list entry now refers to the child record. */
                mi_get_ref(mi, &le_b[nb]->ref);
                le_b[nb]->id = attr->id;

                /* Copy all except id. */
                memcpy(attr, b, asize);
                attr->id = le_b[nb]->id;

                /* Remove from primary record. */
                if (!mi_remove_attr(NULL, &ni->mi, b))
                        goto out;

                if (to_free <= asize)
                        break;
                to_free -= asize;
                if (!nb)
                        goto out;
        }

        attr = mi_insert_attr(&ni->mi, ATTR_LIST, NULL, 0,
                              lsize + SIZEOF_RESIDENT, SIZEOF_RESIDENT);
        if (!attr)
                goto out;

        /* Fill the resident header; the payload is the list itself. */
        attr->non_res = 0;
        attr->flags = 0;
        attr->res.data_size = cpu_to_le32(lsize);
        attr->res.data_off = SIZEOF_RESIDENT_LE;
        attr->res.flags = 0;
        attr->res.res = 0;

        memcpy(resident_data_ex(attr, lsize), ni->attr_list.le, lsize);

        ni->attr_list.dirty = false;

        mark_inode_dirty(&ni->vfs_inode);
        return 0;

out:
        /*
         * NOTE(review): the child record obtained from ntfs_look_free_mft()
         * and attributes already moved into it are not rolled back here.
         */
        kvfree(ni->attr_list.le);
        ni->attr_list.le = NULL;
        ni->attr_list.size = 0;
        return err;
}

/*
 * ni_ins_attr_ext - Add an external attribute to the ntfs_inode.
 *
 * "External" means the attribute is placed into a subrecord instead of the
 * primary record. An attribute list is created first if the inode does not
 * have one. Unless @force_ext is set, every already existing subrecord is
 * tried before a new child record is allocated.
 *
 * @le:        Existing attribute list entry for the attribute, or NULL.
 * @force_ext: Skip existing subrecords, always allocate a new one.
 * @ins_attr:  Optional, receives the inserted attribute.
 * @ins_mi:    Optional, receives the record the attribute was put in.
 * @ins_le:    Optional, passed through to ni_ins_new_attr().
 *
 * Return: 0 on success; -EINVAL if the attribute is too big, must not be
 * external, or cannot be inserted; otherwise an error from the helpers.
 */
static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le,
                           enum ATTR_TYPE type, const __le16 *name, u8 name_len,
                           u32 asize, CLST svcn, u16 name_off, bool force_ext,
                           struct ATTRIB **ins_attr, struct mft_inode **ins_mi,
                           struct ATTR_LIST_ENTRY **ins_le)
{
        struct ATTRIB *attr;
        struct mft_inode *mi;
        CLST rno;
        u64 vbo;
        struct rb_node *node;
        int err;
        bool is_mft, is_mft_data;
        struct ntfs_sb_info *sbi = ni->mi.sbi;

        is_mft = ni->mi.rno == MFT_REC_MFT;
        /* Unnamed data attribute of $MFT itself. */
        is_mft_data = is_mft && type == ATTR_DATA && !name_len;

        if (asize > sbi->max_bytes_per_attr) {
                err = -EINVAL;
                goto out;
        }

        /*
         * Standard information and attr_list cannot be made external.
         * The Log File cannot have any external attributes.
         */
        if (type == ATTR_STD || type == ATTR_LIST ||
            ni->mi.rno == MFT_REC_LOG) {
                err = -EINVAL;
                goto out;
        }

        /* Create the attribute list if it does not exist yet. */
        if (!ni->attr_list.size) {
                err = ni_create_attr_list(ni);
                if (err)
                        goto out;
        }

        /* Byte offset of 'svcn'; only used for the $MFT data checks below. */
        vbo = is_mft_data ? ((u64)svcn << sbi->cluster_bits) : 0;

        if (force_ext)
                goto insert_ext;

        /* Load all subrecords into memory. */
        err = ni_load_all_mi(ni);
        if (err)
                goto out;

        /* Check each of loaded subrecord. */
        for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) {
                mi = rb_entry(node, struct mft_inode, node);

                /*
                 * For $MFT data the record must be empty and must be
                 * located before 'vbo' inside $MFT.
                 */
                if (is_mft_data &&
                    (mi_enum_attr(mi, NULL) ||
                     vbo <= ((u64)mi->rno << sbi->record_bits))) {
                        /* We can't accept this record 'cause MFT's bootstrapping. */
                        continue;
                }
                if (is_mft &&
                    mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, NULL)) {
                        /*
                         * This child record already has a ATTR_DATA.
                         * So it can't accept any other records.
                         */
                        continue;
                }

                if ((type != ATTR_NAME || name_len) &&
                    mi_find_attr(mi, NULL, type, name, name_len, NULL)) {
                        /* Only indexed attributes can share same record. */
                        continue;
                }

                /*
                 * Do not try to insert this attribute
                 * if there is no room in record.
                 */
                if (le32_to_cpu(mi->mrec->used) + asize > sbi->record_size)
                        continue;

                /* Try to insert attribute into this subrecord. */
                attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize,
                                       name_off, svcn, ins_le);
                if (!attr)
                        continue;
                if (IS_ERR(attr))
                        return PTR_ERR(attr);

                if (ins_attr)
                        *ins_attr = attr;
                if (ins_mi)
                        *ins_mi = mi;
                return 0;
        }

insert_ext:
        /* We have to allocate a new child subrecord. */
        err = ntfs_look_free_mft(sbi, &rno, is_mft_data, ni, &mi);
        if (err)
                goto out;

        /* Same location rule for $MFT data as in the loop above. */
        if (is_mft_data && vbo <= ((u64)rno << sbi->record_bits)) {
                err = -EINVAL;
                goto out1;
        }

        attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize,
                               name_off, svcn, ins_le);
        if (!attr) {
                err = -EINVAL;
                goto out2;
        }

        if (IS_ERR(attr)) {
                err = PTR_ERR(attr);
                goto out2;
        }

        if (ins_attr)
                *ins_attr = attr;
        if (ins_mi)
                *ins_mi = mi;

        return 0;

out2:
        /* Detach and release the just allocated subrecord. */
        ni_remove_mi(ni, mi);
        mi_put(mi);

out1:
        /*
         * NOTE(review): a direct jump to 'out1' skips the removal/put of
         * 'mi' done at 'out2', and the record is freed with 'is_mft' while
         * it was allocated with 'is_mft_data' - confirm both are intended.
         */
        ntfs_mark_rec_free(sbi, rno, is_mft);

out:
        return err;
}

/*
 * ni_insert_attr - Insert an attribute into the file.
 *
 * If the primary record has room, it will just insert the attribute.
 * If not, it may make the attribute external.
 * For $MFT::Data it may make room for the attribute by
 * making other attributes external.
 *
 * @ins_attr, @ins_mi and @ins_le are optional outputs. On success the
 * non-NULL ones among @ins_attr/@ins_mi receive the inserted attribute and
 * the record that holds it; @ins_le is forwarded to the insertion helpers.
 *
 * Return: 0 on success, negative errno on failure.
 *
 * NOTE:
 * The ATTR_LIST and ATTR_STD cannot be made external.
 * This function does not fill new attribute full.
 * It only fills 'size'/'type'/'id'/'name_len' fields.
 */
static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
                          const __le16 *name, u8 name_len, u32 asize,
                          u16 name_off, CLST svcn, struct ATTRIB **ins_attr,
                          struct mft_inode **ins_mi,
                          struct ATTR_LIST_ENTRY **ins_le)
{
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        int err;
        struct ATTRIB *attr, *eattr;
        struct MFT_REC *rec;
        bool is_mft;
        struct ATTR_LIST_ENTRY *le;
        u32 list_reserve, max_free, free, used, t32;
        __le16 id;
        u16 t16;

        is_mft = ni->mi.rno == MFT_REC_MFT;
        rec = ni->mi.mrec;

        list_reserve = SIZEOF_NONRESIDENT + 3 * (1 + 2 * sizeof(u32));
        used = le32_to_cpu(rec->used);
        free = sbi->record_size - used;

        if (is_mft && type != ATTR_LIST) {
                /* Reserve space for the ATTRIB list. */
                if (free < list_reserve)
                        free = 0;
                else
                        free -= list_reserve;
        }

        if (asize <= free) {
                attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len,
                                       asize, name_off, svcn, ins_le);
                if (IS_ERR(attr)) {
                        err = PTR_ERR(attr);
                        goto out;
                }

                if (attr) {
                        if (ins_attr)
                                *ins_attr = attr;
                        if (ins_mi)
                                *ins_mi = &ni->mi;
                        err = 0;
                        goto out;
                }
        }

        if (!is_mft || type != ATTR_DATA || svcn) {
                /* This ATTRIB will be external. */
                err = ni_ins_attr_ext(ni, NULL, type, name, name_len, asize,
                                      svcn, name_off, false, ins_attr, ins_mi,
                                      ins_le);
                goto out;
        }

        /*
         * Here we have: "is_mft && type == ATTR_DATA && !svcn"
         *
         * The first chunk of the $MFT::Data ATTRIB must be the base record.
         * Evict other attributes until the new one fits.
         */
        max_free = free;

        /* Estimate the result of moving all possible attributes away. */
        attr = NULL;

        while ((attr = mi_enum_attr(&ni->mi, attr))) {
                if (attr->type == ATTR_STD)
                        continue;
                if (attr->type == ATTR_LIST)
                        continue;
                max_free += le32_to_cpu(attr->size);
        }

        if (max_free < asize + list_reserve) {
                /* Impossible to insert this attribute into primary record. */
                err = -EINVAL;
                goto out;
        }

        /*
         * Start real attribute moving.
         *
         * Move one attribute per iteration until the primary record can
         * hold the new attribute plus the attribute list reserve.
         */
        while (asize + list_reserve >
               sbi->record_size - le32_to_cpu(rec->used)) {
                /*
                 * After a removal the old 'attr' pointer refers to the
                 * attribute that followed the removed one, so continuing
                 * the enumeration from it would skip that attribute.
                 * Rescan from the start of the record instead.
                 */
                attr = NULL;
                while ((attr = mi_enum_attr(&ni->mi, attr))) {
                        /* Skip attributes that MUST be primary record. */
                        if (attr->type != ATTR_STD && attr->type != ATTR_LIST)
                                break;
                }

                if (!attr) {
                        /*
                         * Nothing left to evict. We should never be here
                         * 'cause the estimate above has checked this case.
                         */
                        err = -EINVAL;
                        goto out;
                }

                le = NULL;
                if (ni->attr_list.size) {
                        le = al_find_le(ni, NULL, attr);
                        if (!le) {
                                /* Really this is a serious bug. */
                                err = -EINVAL;
                                goto out;
                        }
                }

                t32 = le32_to_cpu(attr->size);
                t16 = le16_to_cpu(attr->name_off);
                err = ni_ins_attr_ext(ni, le, attr->type, Add2Ptr(attr, t16),
                                      attr->name_len, t32, attr_svcn(attr), t16,
                                      false, &eattr, NULL, NULL);
                if (err)
                        goto out;

                /* Copy the whole attribute but keep the id given to the copy. */
                id = eattr->id;
                memcpy(eattr, attr, t32);
                eattr->id = id;

                /* Remove from primary record. */
                mi_remove_attr(NULL, &ni->mi, attr);
        }

        /* Enough room now: insert the new attribute into the primary record. */
        attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len, asize,
                               name_off, svcn, ins_le);
        if (!attr) {
                err = -EINVAL;
                goto out;
        }

        if (IS_ERR(attr)) {
                err = PTR_ERR(attr);
                goto out;
        }

        if (ins_attr)
                *ins_attr = attr;
        if (ins_mi)
                *ins_mi = &ni->mi;
        err = 0;

out:
        return err;
}

/*
 * ni_expand_mft_list - Split ATTR_DATA of $MFT.
 *
 * Shrinks the ATTR_DATA attribute found in the primary record so that it
 * covers clusters [0 svcn) only, and inserts a second ATTR_DATA attribute
 * covering [svcn evcn] into a subrecord: either an already linked subrecord
 * that holds no attributes, or a newly allocated one.
 *
 * Return: 0 on success, negative errno on failure, -EOPNOTSUPP when no
 * error occurred but no bytes of the primary record were freed.
 */
static int ni_expand_mft_list(struct ntfs_inode *ni)
{
        int err = 0;
        struct runs_tree *run = &ni->file.run;
        u32 asize, run_size, done = 0;
        struct ATTRIB *attr;
        struct rb_node *node;
        CLST mft_min, mft_new, svcn, evcn, plen;
        struct mft_inode *mi, *mi_min, *mi_new;
        struct ntfs_sb_info *sbi = ni->mi.sbi;

        /* Find the nearest MFT. */
        mft_min = 0;
        mft_new = 0;
        mi_min = NULL;

        /* Pick the first subrecord in the tree that has no attributes. */
        for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) {
                mi = rb_entry(node, struct mft_inode, node);

                attr = mi_enum_attr(mi, NULL);

                if (!attr) {
                        mft_min = mi->rno;
                        mi_min = mi;
                        break;
                }
        }

        /*
         * Also try to allocate a fresh record and keep whichever of the two
         * candidates has the lower record number.
         */
        if (ntfs_look_free_mft(sbi, &mft_new, true, ni, &mi_new)) {
                mft_new = 0;
                /* Really this is not critical. */
        } else if (mft_min > mft_new) {
                mft_min = mft_new;
                mi_min = mi_new;
        } else {
                /*
                 * The new record is not used: give it back.
                 * NOTE(review): unlike other ni_remove_mi() call sites in
                 * this file there is no mi_put(mi_new) here - confirm that
                 * this does not leak the in-memory record.
                 */
                ntfs_mark_rec_free(sbi, mft_new, true);
                mft_new = 0;
                ni_remove_mi(ni, mi_new);
        }

        attr = mi_find_attr(&ni->mi, NULL, ATTR_DATA, NULL, 0, NULL);
        if (!attr) {
                err = -EINVAL;
                goto out;
        }

        asize = le32_to_cpu(attr->size);

        evcn = le64_to_cpu(attr->nres.evcn);
        /* First cluster past the end of record 'mft_min'; the split point. */
        svcn = bytes_to_cluster(sbi, (u64)(mft_min + 1) << sbi->record_bits);
        /*
         * Refuse to split unless the split point lies beyond the current
         * end of the attribute. When no candidate record was found
         * (mft_min == 0, mi_min == NULL) svcn is tiny, so this check is
         * presumably what keeps a NULL mi_min from being used below.
         */
        if (evcn + 1 >= svcn) {
                err = -EINVAL;
                goto out;
        }

        /*
         * Split primary attribute [0 evcn] in two parts [0 svcn) + [svcn evcn].
         *
         * Update first part of ATTR_DATA in the primary MFT record.
         */
        err = run_pack(run, 0, svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT),
                       asize - SIZEOF_NONRESIDENT, &plen);
        if (err < 0)
                goto out;

        /* A non-negative result is the packed size in bytes. */
        run_size = ALIGN(err, 8);
        err = 0;

        /* Fewer clusters packed than requested: the run does not fit. */
        if (plen < svcn) {
                err = -EINVAL;
                goto out;
        }

        attr->nres.evcn = cpu_to_le64(svcn - 1);
        attr->size = cpu_to_le32(run_size + SIZEOF_NONRESIDENT);
        /* 'done' - How many bytes of primary MFT becomes free. */
        done = asize - run_size - SIZEOF_NONRESIDENT;
        le32_sub_cpu(&ni->mi.mrec->used, done);

        /* Estimate packed size (run_buf=NULL). */
        err = run_pack(run, svcn, evcn + 1 - svcn, NULL, sbi->record_size,
                       &plen);
        if (err < 0)
                goto out;

        run_size = ALIGN(err, 8);
        err = 0;

        if (plen < evcn + 1 - svcn) {
                err = -EINVAL;
                goto out;
        }

        /*
         * This function may implicitly call expand attr_list.
         * Insert second part of ATTR_DATA in 'mi_min'.
         */
        attr = ni_ins_new_attr(ni, mi_min, NULL, ATTR_DATA, NULL, 0,
                               SIZEOF_NONRESIDENT + run_size,
                               SIZEOF_NONRESIDENT, svcn, NULL);
        if (!attr) {
                err = -EINVAL;
                goto out;
        }

        if (IS_ERR(attr)) {
                err = PTR_ERR(attr);
                goto out;
        }

        /* Fill in the header of the new unnamed, nonresident attribute. */
        attr->non_res = 1;
        attr->name_off = SIZEOF_NONRESIDENT_LE;
        attr->flags = 0;

        /* This function can't fail - cause already checked above. */
        run_pack(run, svcn, evcn + 1 - svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT),
                 run_size, &plen);

        attr->nres.svcn = cpu_to_le64(svcn);
        attr->nres.evcn = cpu_to_le64(evcn);
        attr->nres.run_off = cpu_to_le16(SIZEOF_NONRESIDENT);

out:
        /*
         * NOTE(review): mft_new stays non-zero when the newly allocated
         * record was selected as 'mi_min', so this cleanup also runs on the
         * success path, right after the second part was inserted into that
         * record - confirm this is intended.
         */
        if (mft_new) {
                ntfs_mark_rec_free(sbi, mft_new, true);
                ni_remove_mi(ni, mi_new);
        }

        return !err && !done ? -EOPNOTSUPP : err;
}

/*
 * ni_expand_list - Move all possible attributes out of primary record.
 */
int ni_expand_list(struct ntfs_inode *ni)
{
        int err = 0;
        u32 asize, done = 0;
        struct ATTRIB *attr, *ins_attr;
        struct ATTR_LIST_ENTRY *le;
        bool is_mft = ni->mi.rno == MFT_REC_MFT;
        struct MFT_REF ref;

        mi_get_ref(&ni->mi, &ref);
        le = NULL;

        while ((le = al_enumerate(ni, le))) {
                if (le->type == ATTR_STD)
                        continue;

                if (memcmp(&ref, &le->ref, sizeof(struct MFT_REF)))
                        continue;

                if (is_mft && le->type == ATTR_DATA)
                        continue;

                /* Find attribute in primary record. */
                attr = rec_find_attr_le(&ni->mi, le);
                if (!attr) {
                        err = -EINVAL;
                        goto out;
                }

                asize = le32_to_cpu(attr->size);

                /* Always insert into new record to avoid collisions (deep recursive). */
                err = ni_ins_attr_ext(ni, le, attr->type, attr_name(attr),
                                      attr->name_len, asize, attr_svcn(attr),
                                      le16_to_cpu(attr->name_off), true,
                                      &ins_attr, NULL, NULL);

                if (err)
                        goto out;

                memcpy(ins_attr, attr, asize);
                ins_attr->id = le->id;
                /* Remove from primary record. */
                mi_remove_attr(NULL, &ni->mi, attr);

                done += asize;
                goto out;
        }

        if (!is_mft) {
                err = -EFBIG; /* Attr list is too big(?) */
                goto out;
        }

        /* Split MFT data as much as possible. */
        err = ni_expand_mft_list(ni);

out:
        return !err && !done ? -EOPNOTSUPP : err;
}

/*
 * ni_insert_nonresident - Insert new nonresident attribute.
 *
 * Packs clusters [svcn, svcn + len) of @run into a new attribute of the
 * given @type/@name and fills in its nonresident header. The extended
 * header layout is used for the first chunk (svcn == 0) of a sparse or
 * compressed attribute.
 *
 * Return: 0 on success, negative errno on failure.
 */
int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
                          const __le16 *name, u8 name_len,
                          const struct runs_tree *run, CLST svcn, CLST len,
                          __le16 flags, struct ATTRIB **new_attr,
                          struct mft_inode **mi, struct ATTR_LIST_ENTRY **le)
{
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        bool is_ext = (flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) &&
                      !svcn;
        u32 hdr_size = is_ext ? SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT;
        u32 runs_off = hdr_size + ALIGN(name_len * sizeof(short), 8);
        u32 runs_size, total;
        struct ATTRIB *attr;
        CLST packed;
        int err;

        /* Estimate packed size (run_buf=NULL). */
        err = run_pack(run, svcn, len, NULL,
                       sbi->max_bytes_per_attr - runs_off, &packed);
        if (err < 0)
                return err;

        runs_size = ALIGN(err, 8);

        /* Not every requested cluster could be packed. */
        if (packed < len)
                return -EINVAL;

        total = runs_off + runs_size;
        if (total > sbi->max_bytes_per_attr)
                return -EINVAL;

        err = ni_insert_attr(ni, type, name, name_len, total, hdr_size, svcn,
                             &attr, mi, le);
        if (err)
                return err;

        attr->non_res = 1;
        attr->name_off = cpu_to_le16(hdr_size);
        attr->flags = flags;

        /* This function can't fail - cause already checked above. */
        run_pack(run, svcn, len, Add2Ptr(attr, runs_off), runs_size, &packed);

        attr->nres.svcn = cpu_to_le64(svcn);
        attr->nres.evcn = cpu_to_le64((u64)svcn + len - 1);

        /*
         * 64-bit store starting at 'run_off': sets the offset and writes the
         * bytes that follow it in the same 8-byte word. Must precede the
         * c_unit assignment below.
         */
        *(__le64 *)&attr->nres.run_off = cpu_to_le64(runs_off);

        /* Only the first chunk carries the sizes. */
        if (svcn)
                attr->nres.alloc_size = 0;
        else
                attr->nres.alloc_size =
                        cpu_to_le64((u64)len << sbi->cluster_bits);
        attr->nres.data_size = attr->nres.alloc_size;
        attr->nres.valid_size = attr->nres.alloc_size;

        if (is_ext) {
                if (flags & ATTR_FLAG_COMPRESSED)
                        attr->nres.c_unit = COMPRESSION_UNIT;
                attr->nres.total_size = attr->nres.alloc_size;
        }

        if (new_attr)
                *new_attr = attr;

        return err;
}

/*
 * ni_insert_resident - Inserts new resident attribute.
 *
 * Reserves room for @data_size bytes of resident data (the caller fills the
 * data itself) and initialises the resident header. Inserting an ATTR_NAME
 * also marks it indexed and bumps the hard link count of the base record.
 *
 * Return: 0 on success, negative errno from ni_insert_attr() on failure.
 */
int ni_insert_resident(struct ntfs_inode *ni, u32 data_size,
                       enum ATTR_TYPE type, const __le16 *name, u8 name_len,
                       struct ATTRIB **new_attr, struct mft_inode **mi,
                       struct ATTR_LIST_ENTRY **le)
{
        u32 name_bytes = ALIGN(name_len * sizeof(short), 8);
        u32 data_off = SIZEOF_RESIDENT + name_bytes;
        u32 total = data_off + ALIGN(data_size, 8);
        struct ATTRIB *res_attr;
        int ret;

        ret = ni_insert_attr(ni, type, name, name_len, total, SIZEOF_RESIDENT,
                             0, &res_attr, mi, le);
        if (ret)
                return ret;

        res_attr->non_res = 0;
        res_attr->flags = 0;

        res_attr->res.data_size = cpu_to_le32(data_size);
        res_attr->res.data_off = cpu_to_le16(data_off);

        if (type == ATTR_NAME) {
                /* File names are indexed: is_attr_indexed(attr) == true. */
                res_attr->res.flags = RESIDENT_FLAG_INDEXED;

                le16_add_cpu(&ni->mi.mrec->hard_links, 1);
                ni->mi.dirty = true;
        }

        res_attr->res.res = 0;

        if (new_attr)
                *new_attr = res_attr;

        return 0;
}

/*
 * ni_remove_attr_le - Remove attribute from record.
 *
 * Drops @attr from @mi and, when @le is given, the matching attribute
 * list entry as well.
 */
void ni_remove_attr_le(struct ntfs_inode *ni, struct ATTRIB *attr,
                       struct mft_inode *mi, struct ATTR_LIST_ENTRY *le)
{
        mi_remove_attr(ni, mi, attr);

        /* No attribute list entry to drop. */
        if (!le)
                return;

        al_remove_le(ni, le);
}

/*
 * ni_delete_all - Remove all attributes and free the allocated space.
 *
 * ntfs_evict_inode->ntfs_clear_inode->ni_delete_all (if no links).
 *
 * Return: result of writing the base record, or -EINVAL if a nonresident
 * attribute has its run offset beyond the attribute size.
 */
int ni_delete_all(struct ntfs_inode *ni)
{
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        const bool nt3 = is_ntfs3(sbi);
        struct ATTR_LIST_ENTRY *le = NULL;
        struct ATTRIB *attr = NULL;
        struct rb_node *node;
        struct MFT_REF ref;
        CLST svcn, evcn;
        u32 asize;
        u16 roff;
        int err;

        while ((attr = ni_enum_attr_ex(ni, attr, &le, NULL))) {
                /* Unnamed reparse/object-id attributes have extra index records. */
                if (nt3 && !attr->name_len) {
                        if (attr->type == ATTR_REPARSE) {
                                mi_get_ref(&ni->mi, &ref);
                                ntfs_remove_reparse(sbi, 0, &ref);
                        } else if (attr->type == ATTR_ID && !attr->non_res &&
                                   le32_to_cpu(attr->res.data_size) >=
                                           sizeof(struct GUID)) {
                                ntfs_objid_remove(sbi, resident_data(attr));
                        }
                }

                /* Resident attributes own no clusters. */
                if (!attr->non_res)
                        continue;

                svcn = le64_to_cpu(attr->nres.svcn);
                evcn = le64_to_cpu(attr->nres.evcn);

                /* Empty range: nothing to release. */
                if (evcn + 1 <= svcn)
                        continue;

                asize = le32_to_cpu(attr->size);
                roff = le16_to_cpu(attr->nres.run_off);

                if (roff > asize)
                        return -EINVAL;

                /* RUN_DEALLOCATE: unpack the runs and deallocate them. */
                run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn,
                              Add2Ptr(attr, roff), asize - roff);
        }

        if (ni->attr_list.size) {
                run_deallocate(sbi, &ni->attr_list.run, true);
                al_destroy(ni);
        }

        /* Free all subrecords. */
        node = rb_first(&ni->mi_tree);
        while (node) {
                struct mft_inode *sub = rb_entry(node, struct mft_inode, node);

                /* Step forward first: 'sub' is unlinked from the tree below. */
                node = rb_next(node);

                clear_rec_inuse(sub->mrec);
                sub->dirty = true;
                mi_write(sub, 0);

                ntfs_mark_rec_free(sbi, sub->rno, false);
                ni_remove_mi(ni, sub);
                mi_put(sub);
        }

        /* Free base record. */
        clear_rec_inuse(ni->mi.mrec);
        ni->mi.dirty = true;
        err = mi_write(&ni->mi, 0);

        ntfs_mark_rec_free(sbi, ni->mi.rno, false);

        return err;
}

/*
 * ni_fname_name - Look up a file name attribute by its value.
 *
 * @uni:      name to match, or NULL to accept any name.
 * @home_dir: parent directory reference to match, or NULL to accept any.
 * @mi, @le:  passed through to ni_find_attr(); '*le' is reset when given.
 *
 * Return: The first matching ATTR_FILE_NAME, or NULL if none matches.
 */
struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni,
                                     const struct le_str *uni,
                                     const struct MFT_REF *home_dir,
                                     struct mft_inode **mi,
                                     struct ATTR_LIST_ENTRY **le)
{
        struct ATTR_FILE_NAME *fname;
        struct ATTRIB *attr = NULL;

        if (le)
                *le = NULL;

        /* Enumerate all names. */
        for (;;) {
                attr = ni_find_attr(ni, attr, le, ATTR_NAME, NULL, 0, NULL, mi);
                if (!attr)
                        return NULL;

                fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
                if (!fname)
                        continue;

                /* Wrong parent directory. */
                if (home_dir &&
                    memcmp(home_dir, &fname->home, sizeof(*home_dir)))
                        continue;

                /* No name filter: first candidate wins. */
                if (!uni)
                        return fname;

                if (uni->len != fname->name_len)
                        continue;

                if (!ntfs_cmp_names(uni->name, uni->len, fname->name, uni->len,
                                    NULL, false))
                        return fname;
        }
}

/*
 * ni_fname_type - Look up a file name attribute by its name type.
 *
 * '*le' is always reset, so @le must not be NULL.
 *
 * Return: The first ATTR_FILE_NAME whose type equals @name_type, or NULL
 * if there is none. FILE_NAME_POSIX is never searched for.
 */
struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type,
                                     struct mft_inode **mi,
                                     struct ATTR_LIST_ENTRY **le)
{
        struct ATTR_FILE_NAME *candidate;
        struct ATTRIB *attr = NULL;

        *le = NULL;

        if (name_type == FILE_NAME_POSIX)
                return NULL;

        /* Enumerate all names. */
        while ((attr = ni_find_attr(ni, attr, le, ATTR_NAME, NULL, 0, NULL,
                                    mi))) {
                candidate = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
                if (candidate && candidate->type == name_type)
                        return candidate;
        }

        return NULL;
}

/*
 * ni_new_attr_flags - Apply sparse/compressed file attributes to ATTR_DATA.
 *
 * Process compressed/sparsed in special way.
 * NOTE: You need to set ni->std_fa = new_fa
 * after this function to keep internal structures in consistency.
 *
 * Return: 0 on success (including "nothing to change"), -EINVAL if the
 * unnamed data attribute is missing, -EOPNOTSUPP if the request cannot be
 * honoured.
 */
int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa)
{
        const __le16 special = ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED;
        struct mft_inode *mi;
        struct ATTRIB *attr;
        __le16 want;
        u32 want_size;

        attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi);
        if (!attr)
                return -EINVAL;

        /* Rebuild the two special bits from the requested file attributes. */
        want = attr->flags & ~special;
        if (new_fa & FILE_ATTRIBUTE_SPARSE_FILE)
                want |= ATTR_FLAG_SPARSED;
        if (new_fa & FILE_ATTRIBUTE_COMPRESSED)
                want |= ATTR_FLAG_COMPRESSED;

        if (want == attr->flags)
                return 0;

        if ((want & special) == special) {
                ntfs_inode_warn(&ni->vfs_inode,
                                "file can't be sparsed and compressed");
                return -EOPNOTSUPP;
        }

        if (attr->non_res) {
                if (attr->nres.data_size) {
                        ntfs_inode_warn(
                                &ni->vfs_inode,
                                "one can change sparsed/compressed only for empty files");
                        return -EOPNOTSUPP;
                }

                /* Resize nonresident empty attribute in-place only. */
                if (want & special)
                        want_size = SIZEOF_NONRESIDENT_EX + 8;
                else
                        want_size = SIZEOF_NONRESIDENT + 8;

                if (!mi_resize_attr(mi, attr,
                                    want_size - le32_to_cpu(attr->size)))
                        return -EOPNOTSUPP;

                if (want & ATTR_FLAG_COMPRESSED) {
                        /* The only allowed: 16 clusters per frame. */
                        attr->name_off = SIZEOF_NONRESIDENT_EX_LE;
                        attr->nres.c_unit = NTFS_LZNT_CUNIT;
                        ni->vfs_inode.i_mapping->a_ops = &ntfs_aops_cmpr;
                } else {
                        /*
                         * Sparse or normal file. For sparse files Windows
                         * uses 16 clusters per frame but supports one
                         * cluster per frame too.
                         */
                        if (want & ATTR_FLAG_SPARSED)
                                attr->name_off = SIZEOF_NONRESIDENT_EX_LE;
                        else
                                attr->name_off = SIZEOF_NONRESIDENT_LE;
                        attr->nres.c_unit = 0;
                        ni->vfs_inode.i_mapping->a_ops = &ntfs_aops;
                }

                /* The attribute is unnamed: runs start right after the header. */
                attr->nres.run_off = attr->name_off;
        }

        attr->flags = want;
        mi->dirty = true;

        return 0;
}

/*
 * ni_parse_reparse
 *
 * buffer - memory for reparse buffer header
 */
enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr,
                                   struct REPARSE_DATA_BUFFER *buffer)
{
        const struct REPARSE_DATA_BUFFER *rp = NULL;
        u8 bits;
        u16 len;
        typeof(rp->CompressReparseBuffer) *cmpr;

        /* Try to estimate reparse point. */
        if (!attr->non_res) {
                rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER));
        } else if (le64_to_cpu(attr->nres.data_size) >=
                   sizeof(struct REPARSE_DATA_BUFFER)) {
                struct runs_tree run;

                run_init(&run);

                if (!attr_load_runs_vcn(ni, ATTR_REPARSE, NULL, 0, &run, 0) &&
                    !ntfs_read_run_nb(ni->mi.sbi, &run, 0, buffer,
                                      sizeof(struct REPARSE_DATA_BUFFER),
                                      NULL)) {
                        rp = buffer;
                }

                run_close(&run);
        }

        if (!rp)
                return REPARSE_NONE;

        len = le16_to_cpu(rp->ReparseDataLength);
        switch (rp->ReparseTag) {
        case (IO_REPARSE_TAG_MICROSOFT | IO_REPARSE_TAG_SYMBOLIC_LINK):
                break; /* Symbolic link. */
        case IO_REPARSE_TAG_MOUNT_POINT:
                break; /* Mount points and junctions. */
        case IO_REPARSE_TAG_SYMLINK:
                break;
        case IO_REPARSE_TAG_COMPRESS:
                /*
                 * WOF - Windows Overlay Filter - Used to compress files with
                 * LZX/Xpress.
                 *
                 * Unlike native NTFS file compression, the Windows
                 * Overlay Filter supports only read operations. This means
                 * that it doesn't need to sector-align each compressed chunk,
                 * so the compressed data can be packed more tightly together.
                 * If you open the file for writing, the WOF just decompresses
                 * the entire file, turning it back into a plain file.
                 *
                 * Ntfs3 driver decompresses the entire file only on write or
                 * change size requests.
                 */

                cmpr = &rp->CompressReparseBuffer;
                if (len < sizeof(*cmpr) ||
                    cmpr->WofVersion != WOF_CURRENT_VERSION ||
                    cmpr->WofProvider != WOF_PROVIDER_SYSTEM ||
                    cmpr->ProviderVer != WOF_PROVIDER_CURRENT_VERSION) {
                        return REPARSE_NONE;
                }

                switch (cmpr->CompressionFormat) {
                case WOF_COMPRESSION_XPRESS4K:
                        bits = 0xc; // 4k
                        break;
                case WOF_COMPRESSION_XPRESS8K:
                        bits = 0xd; // 8k
                        break;
                case WOF_COMPRESSION_XPRESS16K:
                        bits = 0xe; // 16k
                        break;
                case WOF_COMPRESSION_LZX32K:
                        bits = 0xf; // 32k
                        break;
                default:
                        bits = 0x10; // 64k
                        break;
                }
                ni_set_ext_compress_bits(ni, bits);
                return REPARSE_COMPRESSED;

        case IO_REPARSE_TAG_DEDUP:
                ni->ni_flags |= NI_FLAG_DEDUPLICATED;
                return REPARSE_DEDUPLICATED;

        default:
                if (rp->ReparseTag & IO_REPARSE_TAG_NAME_SURROGATE)
                        break;

                return REPARSE_NONE;
        }

        if (buffer != rp)
                memcpy(buffer, rp, sizeof(struct REPARSE_DATA_BUFFER));

        /* Looks like normal symlink. */
        return REPARSE_LINK;
}

/*
 * ni_fiemap - Helper for file_fiemap().
 *
 * Reports the extents that back bytes [vbo, vbo + len) through
 * fiemap_fill_next_extent(). Directories are mapped through their I30
 * ATTR_ALLOC attribute, everything else through the unnamed ATTR_DATA.
 * A resident (or, for directories, missing) attribute is reported as a
 * single inline extent. Sparse runs are skipped, and an extent that
 * straddles ni->i_valid is reported as two pieces with the second one
 * flagged FIEMAP_EXTENT_UNWRITTEN.
 *
 * Return: 0 on success, negative errno on failure; -EOPNOTSUPP for
 * compressed files.
 *
 * Assumed ni_lock.
 * TODO: Less aggressive locks.
 */
int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo,
              __u64 vbo, __u64 len)
{
        int err = 0;
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        u8 cluster_bits = sbi->cluster_bits;
        struct runs_tree *run;
        struct rw_semaphore *run_lock;
        struct ATTRIB *attr;
        CLST vcn = vbo >> cluster_bits;
        CLST lcn, clen;
        u64 valid = ni->i_valid;
        u64 lbo, bytes;
        u64 end, alloc_size;
        /* -1 means "no run looked up yet"; see the loop below. */
        size_t idx = -1;
        u32 flags;
        bool ok;

        /* Select the attribute, its cached runs and the lock guarding them. */
        if (S_ISDIR(ni->vfs_inode.i_mode)) {
                run = &ni->dir.alloc_run;
                attr = ni_find_attr(ni, NULL, NULL, ATTR_ALLOC, I30_NAME,
                                    ARRAY_SIZE(I30_NAME), NULL, NULL);
                run_lock = &ni->dir.run_lock;
        } else {
                run = &ni->file.run;
                attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL,
                                    NULL);
                if (!attr) {
                        err = -EINVAL;
                        goto out;
                }
                if (is_attr_compressed(attr)) {
                        /* Unfortunately cp -r incorrectly treats compressed clusters. */
                        err = -EOPNOTSUPP;
                        ntfs_inode_warn(
                                &ni->vfs_inode,
                                "fiemap is not supported for compressed file (cp -r)");
                        goto out;
                }
                run_lock = &ni->file.run_lock;
        }

        /* No attribute (directories only) or resident data: one inline extent. */
        if (!attr || !attr->non_res) {
                err = fiemap_fill_next_extent(
                        fieinfo, 0, 0,
                        attr ? le32_to_cpu(attr->res.data_size) : 0,
                        FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST |
                                FIEMAP_EXTENT_MERGED);
                goto out;
        }

        /* Clamp the requested range to the allocated size. */
        end = vbo + len;
        alloc_size = le64_to_cpu(attr->nres.alloc_size);
        if (end > alloc_size)
                end = alloc_size;

        down_read(run_lock);

        while (vbo < end) {
                if (idx == -1) {
                        /* First pass: find the run that contains 'vcn'. */
                        ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx);
                } else {
                        CLST vcn_next = vcn;

                        /* Next run must start exactly where the previous one ended. */
                        ok = run_get_entry(run, ++idx, &vcn, &lcn, &clen) &&
                             vcn == vcn_next;
                        if (!ok)
                                vcn = vcn_next;
                }

                if (!ok) {
                        /*
                         * The run is not cached: load it. Loading modifies
                         * 'run', so swap the read lock for the write lock
                         * around it.
                         */
                        up_read(run_lock);
                        down_write(run_lock);

                        err = attr_load_runs_vcn(ni, attr->type,
                                                 attr_name(attr),
                                                 attr->name_len, run, vcn);

                        up_write(run_lock);
                        down_read(run_lock);

                        if (err)
                                break;

                        ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx);

                        if (!ok) {
                                err = -EINVAL;
                                break;
                        }
                }

                /* A zero-length run would never advance the loop. */
                if (!clen) {
                        err = -EINVAL; // ?
                        break;
                }

                /* Holes are not reported: just step over them. */
                if (lcn == SPARSE_LCN) {
                        vcn += clen;
                        vbo = (u64)vcn << cluster_bits;
                        continue;
                }

                flags = FIEMAP_EXTENT_MERGED;
                if (S_ISDIR(ni->vfs_inode.i_mode)) {
                        ;
                } else if (is_attr_compressed(attr)) {
                        /*
                         * NOTE(review): compressed files are rejected with
                         * -EOPNOTSUPP before the loop, so this branch looks
                         * unreachable - confirm.
                         */
                        CLST clst_data;

                        err = attr_is_frame_compressed(
                                ni, attr, vcn >> attr->nres.c_unit, &clst_data);
                        if (err)
                                break;
                        if (clst_data < NTFS_LZNT_CLUSTERS)
                                flags |= FIEMAP_EXTENT_ENCODED;
                } else if (is_attr_encrypted(attr)) {
                        flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
                }

                /* Convert the run from clusters to bytes. */
                vbo = (u64)vcn << cluster_bits;
                bytes = (u64)clen << cluster_bits;
                lbo = (u64)lcn << cluster_bits;

                vcn += clen;

                /* Trim the extent to the end of the requested range. */
                if (vbo + bytes >= end)
                        bytes = end - vbo;

                if (vbo + bytes <= valid) {
                        /* Entirely below the valid size: plain extent. */
                        ;
                } else if (vbo >= valid) {
                        /* Entirely at or above the valid size. */
                        flags |= FIEMAP_EXTENT_UNWRITTEN;
                } else {
                        /* vbo < valid && valid < vbo + bytes */
                        u64 dlen = valid - vbo;

                        if (vbo + dlen >= end)
                                flags |= FIEMAP_EXTENT_LAST;

                        /* Report the valid head of the extent first. */
                        err = fiemap_fill_next_extent(fieinfo, vbo, lbo, dlen,
                                                      flags);
                        if (err < 0)
                                break;
                        /* A return of 1 ends the walk without an error. */
                        if (err == 1) {
                                err = 0;
                                break;
                        }

                        vbo = valid;
                        bytes -= dlen;
                        if (!bytes)
                                continue;

                        /* The rest of the extent lies beyond the valid size. */
                        lbo += dlen;
                        flags |= FIEMAP_EXTENT_UNWRITTEN;
                }

                if (vbo + bytes >= end)
                        flags |= FIEMAP_EXTENT_LAST;

                err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes, flags);
                if (err < 0)
                        break;
                /* A return of 1 ends the walk without an error. */
                if (err == 1) {
                        err = 0;
                        break;
                }

                vbo += bytes;
        }

        up_read(run_lock);

out:
        return err;
}

/*
 * ni_readpage_cmpr
 *
 * When decompressing, we typically obtain more than one page per reference.
 * We inject the additional pages into the page cache.
 */
int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page)
{
        int err;
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        struct address_space *mapping = page->mapping;
        pgoff_t index = page->index;
        u64 frame_vbo, vbo = (u64)index << PAGE_SHIFT;
        struct page **pages = NULL; /* Array of at most 16 pages. stack? */
        u8 frame_bits;
        CLST frame;
        u32 i, idx, frame_size, pages_per_frame;
        gfp_t gfp_mask;
        struct page *pg;

        if (vbo >= i_size_read(&ni->vfs_inode)) {
                SetPageUptodate(page);
                err = 0;
                goto out;
        }

        if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) {
                /* Xpress or LZX. */
                frame_bits = ni_ext_compress_bits(ni);
        } else {
                /* LZNT compression. */
                frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits;
        }
        frame_size = 1u << frame_bits;
        frame = vbo >> frame_bits;
        frame_vbo = (u64)frame << frame_bits;
        idx = (vbo - frame_vbo) >> PAGE_SHIFT;

        pages_per_frame = frame_size >> PAGE_SHIFT;
        pages = kcalloc(pages_per_frame, sizeof(struct page *), GFP_NOFS);
        if (!pages) {
                err = -ENOMEM;
                goto out;
        }

        pages[idx] = page;
        index = frame_vbo >> PAGE_SHIFT;
        gfp_mask = mapping_gfp_mask(mapping);

        for (i = 0; i < pages_per_frame; i++, index++) {
                if (i == idx)
                        continue;

                pg = find_or_create_page(mapping, index, gfp_mask);
                if (!pg) {
                        err = -ENOMEM;
                        goto out1;
                }
                pages[i] = pg;
        }

        err = ni_read_frame(ni, frame_vbo, pages, pages_per_frame);

out1:
        if (err)
                SetPageError(page);

        for (i = 0; i < pages_per_frame; i++) {
                pg = pages[i];
                if (i == idx || !pg)
                        continue;
                unlock_page(pg);
                put_page(pg);
        }

out:
        /* At this point, err contains 0 or -EIO depending on the "critical" page. */
        kfree(pages);
        unlock_page(page);

        return err;
}

#ifdef CONFIG_NTFS3_LZX_XPRESS
/*
 * ni_decompress_file - Decompress LZX/Xpress compressed file.
 *
 * Converts an externally compressed file into a plain one:
 *   1. Every frame is decompressed with ni_read_frame() and written back
 *      through the file's own run (ni->file.run).
 *   2. Clusters of non-resident ATTR_DATA::WofCompressedData and
 *      ATTR_REPARSE are deallocated.
 *   3. ATTR_DATA::WofCompressedData is removed.
 *   4. ATTR_REPARSE is removed.
 *   5. The sparse flag is cleared on the unnamed data attribute.
 *
 * On any failure the inode is passed to _ntfs_bad_inode().
 *
 * Return: 0 on success or a negative errno.
 */
int ni_decompress_file(struct ntfs_inode *ni)
{
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        struct inode *inode = &ni->vfs_inode;
        loff_t i_size = i_size_read(inode);
        struct address_space *mapping = inode->i_mapping;
        gfp_t gfp_mask = mapping_gfp_mask(mapping);
        struct page **pages = NULL;
        struct ATTR_LIST_ENTRY *le;
        struct ATTRIB *attr;
        CLST vcn, cend, lcn, clen, end;
        pgoff_t index;
        u64 vbo;
        u8 frame_bits;
        u32 i, frame_size, pages_per_frame, bytes;
        struct mft_inode *mi;
        int err;

        /* Clusters for decompressed data. */
        cend = bytes_to_cluster(sbi, i_size);

        /* Empty file: there is no data to convert, only attributes to drop. */
        if (!i_size)
                goto remove_wof;

        /*
         * Check in advance.
         * NOTE(review): wnd_zeroes() presumably returns the number of free
         * clusters in the volume bitmap - confirm.
         */
        if (cend > wnd_zeroes(&sbi->used.bitmap)) {
                err = -ENOSPC;
                goto out;
        }

        frame_bits = ni_ext_compress_bits(ni);
        frame_size = 1u << frame_bits;
        pages_per_frame = frame_size >> PAGE_SHIFT;
        pages = kcalloc(pages_per_frame, sizeof(struct page *), GFP_NOFS);
        if (!pages) {
                err = -ENOMEM;
                goto out;
        }

        /*
         * Step 1: Decompress data and copy to new allocated clusters.
         */
        index = 0;
        for (vbo = 0; vbo < i_size; vbo += bytes) {
                u32 nr_pages;
                bool new;

                /* The last frame may be shorter than frame_size. */
                if (vbo + frame_size > i_size) {
                        bytes = i_size - vbo;
                        nr_pages = (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
                } else {
                        nr_pages = pages_per_frame;
                        bytes = frame_size;
                }

                end = bytes_to_cluster(sbi, vbo + bytes);

                /*
                 * Walk the clusters that back this frame.
                 * NOTE(review): attr_data_get_block() is presumably what
                 * allocates the target clusters here - confirm.
                 */
                for (vcn = vbo >> sbi->cluster_bits; vcn < end; vcn += clen) {
                        err = attr_data_get_block(ni, vcn, cend - vcn, &lcn,
                                                  &clen, &new, false);
                        if (err)
                                goto out;
                }

                /* Always grab a full frame of pages; 'index' runs across frames. */
                for (i = 0; i < pages_per_frame; i++, index++) {
                        struct page *pg;

                        pg = find_or_create_page(mapping, index, gfp_mask);
                        if (!pg) {
                                /* Release the pages obtained so far for this frame. */
                                while (i--) {
                                        unlock_page(pages[i]);
                                        put_page(pages[i]);
                                }
                                err = -ENOMEM;
                                goto out;
                        }
                        pages[i] = pg;
                }

                /* Decompress the frame into the page-cache pages. */
                err = ni_read_frame(ni, vbo, pages, pages_per_frame);

                if (!err) {
                        /* Write only the pages/bytes that hold file data. */
                        down_read(&ni->file.run_lock);
                        err = ntfs_bio_pages(sbi, &ni->file.run, pages,
                                             nr_pages, vbo, bytes,
                                             REQ_OP_WRITE);
                        up_read(&ni->file.run_lock);
                }

                for (i = 0; i < pages_per_frame; i++) {
                        unlock_page(pages[i]);
                        put_page(pages[i]);
                }

                if (err)
                        goto out;

                cond_resched();
        }

remove_wof:
        /*
         * Step 2: Deallocate attributes ATTR_DATA::WofCompressedData
         * and ATTR_REPARSE.
         */
        attr = NULL;
        le = NULL;
        while ((attr = ni_enum_attr_ex(ni, attr, &le, NULL))) {
                CLST svcn, evcn;
                u32 asize, roff;

                if (attr->type == ATTR_REPARSE) {
                        struct MFT_REF ref;

                        mi_get_ref(&ni->mi, &ref);
                        ntfs_remove_reparse(sbi, 0, &ref);
                }

                /* Resident attributes own no clusters. */
                if (!attr->non_res)
                        continue;

                /* Only ATTR_REPARSE and ATTR_DATA named WOF_NAME are handled. */
                if (attr->type != ATTR_REPARSE &&
                    (attr->type != ATTR_DATA ||
                     attr->name_len != ARRAY_SIZE(WOF_NAME) ||
                     memcmp(attr_name(attr), WOF_NAME, sizeof(WOF_NAME))))
                        continue;

                svcn = le64_to_cpu(attr->nres.svcn);
                evcn = le64_to_cpu(attr->nres.evcn);

                /* Empty VCN range: nothing mapped by this segment. */
                if (evcn + 1 <= svcn)
                        continue;

                asize = le32_to_cpu(attr->size);
                roff = le16_to_cpu(attr->nres.run_off);

                /* Packed runs must start inside the attribute. */
                if (roff > asize) {
                        err = -EINVAL;
                        goto out;
                }

                /* RUN_DEALLOCATE: unpack the runs and deallocate their clusters. */
                run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn,
                              Add2Ptr(attr, roff), asize - roff);
        }

        /*
         * Step 3: Remove attribute ATTR_DATA::WofCompressedData.
         */
        err = ni_remove_attr(ni, ATTR_DATA, WOF_NAME, ARRAY_SIZE(WOF_NAME),
                             false, NULL);
        if (err)
                goto out;

        /*
         * Step 4: Remove ATTR_REPARSE.
         */
        err = ni_remove_attr(ni, ATTR_REPARSE, NULL, 0, false, NULL);
        if (err)
                goto out;

        /*
         * Step 5: Remove sparse flag from data attribute.
         */
        attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi);
        if (!attr) {
                err = -EINVAL;
                goto out;
        }

        if (attr->non_res && is_attr_sparsed(attr)) {
                /* Sparsed attribute header is 8 bytes bigger than normal. */
                struct MFT_REC *rec = mi->mrec;
                u32 used = le32_to_cpu(rec->used);
                u32 asize = le32_to_cpu(attr->size);
                u16 roff = le16_to_cpu(attr->nres.run_off);
                char *rbuf = Add2Ptr(attr, roff);

                /*
                 * Shift everything from the packed runs to the end of the
                 * used part of the record down by 8 bytes, then fix up the
                 * attribute and record sizes accordingly.
                 */
                memmove(rbuf - 8, rbuf, used - PtrOffset(rec, rbuf));
                attr->size = cpu_to_le32(asize - 8);
                attr->flags &= ~ATTR_FLAG_SPARSED;
                attr->nres.run_off = cpu_to_le16(roff - 8);
                attr->nres.c_unit = 0;
                rec->used = cpu_to_le32(used - 8);
                mi->dirty = true;
                ni->std_fa &= ~(FILE_ATTRIBUTE_SPARSE_FILE |
                                FILE_ATTRIBUTE_REPARSE_POINT);

                mark_inode_dirty(inode);
        }

        /* Clear cached flag. */
        ni->ni_flags &= ~NI_FLAG_COMPRESSED_MASK;
        if (ni->file.offs_page) {
                put_page(ni->file.offs_page);
                ni->file.offs_page = NULL;
        }
        /* From now on the file uses the regular address space operations. */
        mapping->a_ops = &ntfs_aops;

out:
        kfree(pages);
        if (err)
                _ntfs_bad_inode(inode);

        return err;
}

/*
 * decompress_lzx_xpress - External compression LZX/Xpress.
 *
 * A frame whose stored size equals its unpacked size is copied verbatim.
 * Otherwise a frame size of 0x8000 selects LZX and any other size selects
 * Xpress. The per-volume decompressor context is created on first use and
 * kept in @sbi; each codec is serialized by its own mutex.
 *
 * Return: 0 on success, -ENOMEM if the context cannot be allocated,
 * -EINVAL on any decompressor failure.
 */
static int decompress_lzx_xpress(struct ntfs_sb_info *sbi, const char *cmpr,
                                 size_t cmpr_size, void *unc, size_t unc_size,
                                 u32 frame_size)
{
        int err = 0;
        void *ctx;

        /* Frame not compressed. */
        if (cmpr_size == unc_size) {
                memcpy(unc, cmpr, unc_size);
                return 0;
        }

        if (frame_size != 0x8000) {
                /* XPRESS: Frame compressed. */
                mutex_lock(&sbi->compress.mtx_xpress);

                ctx = sbi->compress.xpress;
                if (!ctx) {
                        /* Lazy initialize Xpress decompress context. */
                        ctx = xpress_allocate_decompressor();
                        if (ctx)
                                sbi->compress.xpress = ctx;
                        else
                                err = -ENOMEM;
                }

                /* Treat all decompressor errors as "invalid argument". */
                if (!err &&
                    xpress_decompress(ctx, cmpr, cmpr_size, unc, unc_size))
                        err = -EINVAL;

                mutex_unlock(&sbi->compress.mtx_xpress);
                return err;
        }

        /* LZX: Frame compressed. */
        mutex_lock(&sbi->compress.mtx_lzx);

        ctx = sbi->compress.lzx;
        if (!ctx) {
                /* Lazy initialize LZX decompress context. */
                ctx = lzx_allocate_decompressor();
                if (ctx)
                        sbi->compress.lzx = ctx;
                else
                        err = -ENOMEM;
        }

        /* Treat all decompressor errors as "invalid argument". */
        if (!err && lzx_decompress(ctx, cmpr, cmpr_size, unc, unc_size))
                err = -EINVAL;

        mutex_unlock(&sbi->compress.mtx_lzx);
        return err;
}
#endif

/*
 * ni_read_frame - Read and, if needed, decompress one frame into @pages.
 *
 * @ni:              inode to read from.
 * @frame_vbo:       byte offset of the frame inside the file.
 * @pages:           array of locked pages that receive the frame.
 * @pages_per_frame: number of entries in @pages.
 *
 * The target pages are vmapped into one contiguous buffer. Depending on
 * the data attribute the frame is:
 *   - copied from resident data,
 *   - zero filled (beyond valid size, or sparse LZNT frame),
 *   - read as is (LZNT frame that is stored uncompressed),
 *   - read into temporary pages and decompressed (LZNT, LZX, Xpress).
 *
 * Note that on exit every page in @pages has its error flag cleared and is
 * marked uptodate, whatever the return value is.
 *
 * Return: 0 on success or a negative errno.
 */
int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
                  u32 pages_per_frame)
{
        int err;
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        u8 cluster_bits = sbi->cluster_bits;
        char *frame_ondisk = NULL;
        char *frame_mem = NULL;
        struct page **pages_disk = NULL;
        struct ATTR_LIST_ENTRY *le = NULL;
        struct runs_tree *run = &ni->file.run;
        u64 valid_size = ni->i_valid;
        u64 vbo_disk;
        size_t unc_size;
        u32 frame_size, i, npages_disk, ondisk_size;
        struct page *pg;
        struct ATTRIB *attr;
        CLST frame, clst_data;

        /*
         * To simplify decompress algorithm do vmap for source
         * and target pages.
         */
        for (i = 0; i < pages_per_frame; i++)
                kmap(pages[i]);

        frame_size = pages_per_frame << PAGE_SHIFT;
        frame_mem = vmap(pages, pages_per_frame, VM_MAP, PAGE_KERNEL);
        if (!frame_mem) {
                err = -ENOMEM;
                goto out;
        }

        attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, NULL);
        if (!attr) {
                err = -ENOENT;
                goto out1;
        }

        if (!attr->non_res) {
                /* Resident data: copy what is there, the rest stays zero. */
                u32 data_size = le32_to_cpu(attr->res.data_size);

                memset(frame_mem, 0, frame_size);
                if (frame_vbo < data_size) {
                        ondisk_size = data_size - frame_vbo;
                        memcpy(frame_mem, resident_data(attr) + frame_vbo,
                               min(ondisk_size, frame_size));
                }
                err = 0;
                goto out1;
        }

        /* Frame lies completely beyond valid data: it reads as zeroes. */
        if (frame_vbo >= valid_size) {
                memset(frame_mem, 0, frame_size);
                err = 0;
                goto out1;
        }

        if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) {
#ifndef CONFIG_NTFS3_LZX_XPRESS
                err = -EOPNOTSUPP;
                goto out1;
#else
                loff_t i_size = i_size_read(&ni->vfs_inode);
                u32 frame_bits = ni_ext_compress_bits(ni);
                u64 frame64 = frame_vbo >> frame_bits;
                u64 frames, vbo_data;

                /* The caller must pass exactly one external frame. */
                if (frame_size != (1u << frame_bits)) {
                        err = -EINVAL;
                        goto out1;
                }
                switch (frame_size) {
                case 0x1000:
                case 0x2000:
                case 0x4000:
                case 0x8000:
                        break;
                default:
                        /* Unknown compression. */
                        err = -EOPNOTSUPP;
                        goto out1;
                }

                /* Compressed stream lives in ATTR_DATA::WofCompressedData. */
                attr = ni_find_attr(ni, attr, &le, ATTR_DATA, WOF_NAME,
                                    ARRAY_SIZE(WOF_NAME), NULL, NULL);
                if (!attr) {
                        ntfs_inode_err(
                                &ni->vfs_inode,
                                "external compressed file should contains data attribute \"WofCompressedData\"");
                        err = -EINVAL;
                        goto out1;
                }

                /*
                 * 'run' doubles as a marker: anything other than
                 * &ni->file.run means "external compression" below.
                 */
                if (!attr->non_res) {
                        run = NULL;
                } else {
                        run = run_alloc();
                        if (!run) {
                                err = -ENOMEM;
                                goto out1;
                        }
                }

                /* Index of the last frame of the file. */
                frames = (i_size - 1) >> frame_bits;

                err = attr_wof_frame_info(ni, attr, run, frame64, frames,
                                          frame_bits, &ondisk_size, &vbo_data);
                if (err)
                        goto out2;

                if (frame64 == frames) {
                        /* Last frame: may be partial, takes the stream's tail. */
                        unc_size = 1 + ((i_size - 1) & (frame_size - 1));
                        ondisk_size = attr_size(attr) - vbo_data;
                } else {
                        unc_size = frame_size;
                }

                if (ondisk_size > frame_size) {
                        err = -EINVAL;
                        goto out2;
                }

                if (!attr->non_res) {
                        /* 'run' is NULL here, so skipping out2 leaks nothing. */
                        if (vbo_data + ondisk_size >
                            le32_to_cpu(attr->res.data_size)) {
                                err = -EINVAL;
                                goto out1;
                        }

                        err = decompress_lzx_xpress(
                                sbi, Add2Ptr(resident_data(attr), vbo_data),
                                ondisk_size, frame_mem, unc_size, frame_size);
                        goto out1;
                }
                vbo_disk = vbo_data;
                /* Load all runs to read [vbo_disk-vbo_to). */
                err = attr_load_runs_range(ni, ATTR_DATA, WOF_NAME,
                                           ARRAY_SIZE(WOF_NAME), run, vbo_disk,
                                           vbo_data + ondisk_size);
                if (err)
                        goto out2;
                /* Compressed data need not start on a page boundary. */
                npages_disk = (ondisk_size + (vbo_disk & (PAGE_SIZE - 1)) +
                               PAGE_SIZE - 1) >>
                              PAGE_SHIFT;
#endif
        } else if (is_attr_compressed(attr)) {
                /* LZNT compression. */
                if (sbi->cluster_size > NTFS_LZNT_MAX_CLUSTER) {
                        err = -EOPNOTSUPP;
                        goto out1;
                }

                if (attr->nres.c_unit != NTFS_LZNT_CUNIT) {
                        err = -EOPNOTSUPP;
                        goto out1;
                }

                down_write(&ni->file.run_lock);
                run_truncate_around(run, le64_to_cpu(attr->nres.svcn));
                frame = frame_vbo >> (cluster_bits + NTFS_LZNT_CUNIT);
                err = attr_is_frame_compressed(ni, attr, frame, &clst_data);
                up_write(&ni->file.run_lock);
                if (err)
                        goto out1;

                if (!clst_data) {
                        /* No clusters behind the frame: it reads as zeroes. */
                        memset(frame_mem, 0, frame_size);
                        goto out1;
                }

                frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT;
                ondisk_size = clst_data << cluster_bits;

                if (clst_data >= NTFS_LZNT_CLUSTERS) {
                        /* Frame is not compressed. */
                        down_read(&ni->file.run_lock);
                        err = ntfs_bio_pages(sbi, run, pages, pages_per_frame,
                                             frame_vbo, ondisk_size,
                                             REQ_OP_READ);
                        up_read(&ni->file.run_lock);
                        goto out1;
                }
                vbo_disk = frame_vbo;
                npages_disk = (ondisk_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        } else {
                /*
                 * Neither externally compressed nor LZNT. Fail cleanly:
                 * this must stay a real error path, not an "unreachable"
                 * hint, because reaching such a hint is undefined behavior.
                 */
                err = -EINVAL;
                goto out1;
        }

        pages_disk = kcalloc(npages_disk, sizeof(*pages_disk), GFP_NOFS);
        if (!pages_disk) {
                err = -ENOMEM;
                goto out2;
        }

        /* Temporary pages that receive the raw on-disk bytes. */
        for (i = 0; i < npages_disk; i++) {
                pg = alloc_page(GFP_KERNEL);
                if (!pg) {
                        err = -ENOMEM;
                        goto out3;
                }
                pages_disk[i] = pg;
                lock_page(pg);
                kmap(pg);
        }

        /* Read 'ondisk_size' bytes from disk. */
        down_read(&ni->file.run_lock);
        err = ntfs_bio_pages(sbi, run, pages_disk, npages_disk, vbo_disk,
                             ondisk_size, REQ_OP_READ);
        up_read(&ni->file.run_lock);
        if (err)
                goto out3;

        /*
         * To simplify decompress algorithm do vmap for source and target pages.
         */
        frame_ondisk = vmap(pages_disk, npages_disk, VM_MAP, PAGE_KERNEL_RO);
        if (!frame_ondisk) {
                err = -ENOMEM;
                goto out3;
        }

        /* Decompress: Frame_ondisk -> frame_mem. */
#ifdef CONFIG_NTFS3_LZX_XPRESS
        if (run != &ni->file.run) {
                /* LZX or XPRESS */
                err = decompress_lzx_xpress(
                        sbi, frame_ondisk + (vbo_disk & (PAGE_SIZE - 1)),
                        ondisk_size, frame_mem, unc_size, frame_size);
        } else
#endif
        {
                /* LZNT - Native NTFS compression. */
                unc_size = decompress_lznt(frame_ondisk, ondisk_size, frame_mem,
                                           frame_size);
                if ((ssize_t)unc_size < 0)
                        err = unc_size;
                else if (!unc_size || unc_size > frame_size)
                        err = -EINVAL;
        }
        /* Anything past the valid size must read as zeroes. */
        if (!err && valid_size < frame_vbo + frame_size) {
                size_t ok = valid_size - frame_vbo;

                memset(frame_mem + ok, 0, frame_size - ok);
        }

        vunmap(frame_ondisk);

out3:
        /* Slots past a failed allocation are still NULL (kcalloc). */
        for (i = 0; i < npages_disk; i++) {
                pg = pages_disk[i];
                if (pg) {
                        kunmap(pg);
                        unlock_page(pg);
                        put_page(pg);
                }
        }
        kfree(pages_disk);

out2:
#ifdef CONFIG_NTFS3_LZX_XPRESS
        if (run != &ni->file.run)
                run_free(run);
#endif
out1:
        vunmap(frame_mem);
out:
        for (i = 0; i < pages_per_frame; i++) {
                pg = pages[i];
                kunmap(pg);
                ClearPageError(pg);
                SetPageUptodate(pg);
        }

        return err;
}

/*
 * ni_write_frame - Compress (LZNT) and write one frame.
 *
 * @ni:              inode to write to.
 * @pages:           array of locked pages holding the frame data;
 *                   pages[0]->index gives the frame position.
 * @pages_per_frame: number of entries in @pages.
 *
 * The frame is compressed into temporary pages. Depending on the result
 * the frame is stored compressed (temporary pages are written), stored as
 * is (the original pages are written), or left sparse (nothing is written).
 *
 * Return: 0 on success or a negative errno.
 */
int ni_write_frame(struct ntfs_inode *ni, struct page **pages,
                   u32 pages_per_frame)
{
        int err;
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits;
        u32 frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT;
        u64 frame_vbo = (u64)pages[0]->index << PAGE_SHIFT;
        CLST frame = frame_vbo >> frame_bits;
        char *frame_ondisk = NULL;
        struct page **pages_disk = NULL;
        struct ATTR_LIST_ENTRY *le = NULL;
        char *frame_mem;
        struct ATTRIB *attr;
        struct mft_inode *mi;
        u32 i;
        struct page *pg;
        size_t compr_size, ondisk_size;
        struct lznt *lznt;

        attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, &mi);
        if (!attr) {
                err = -ENOENT;
                goto out;
        }

        if (WARN_ON(!is_attr_compressed(attr))) {
                err = -EINVAL;
                goto out;
        }

        if (sbi->cluster_size > NTFS_LZNT_MAX_CLUSTER) {
                err = -EOPNOTSUPP;
                goto out;
        }

        /* Frames are cluster based, so resident data must be moved out first. */
        if (!attr->non_res) {
                down_write(&ni->file.run_lock);
                err = attr_make_nonresident(ni, attr, le, mi,
                                            le32_to_cpu(attr->res.data_size),
                                            &ni->file.run, &attr, pages[0]);
                up_write(&ni->file.run_lock);
                if (err)
                        goto out;
        }

        if (attr->nres.c_unit != NTFS_LZNT_CUNIT) {
                err = -EOPNOTSUPP;
                goto out;
        }

        pages_disk = kcalloc(pages_per_frame, sizeof(struct page *), GFP_NOFS);
        if (!pages_disk) {
                err = -ENOMEM;
                goto out;
        }

        /* Temporary pages that receive the compressed frame. */
        for (i = 0; i < pages_per_frame; i++) {
                pg = alloc_page(GFP_KERNEL);
                if (!pg) {
                        err = -ENOMEM;
                        goto out1;
                }
                pages_disk[i] = pg;
                lock_page(pg);
                kmap(pg);
        }

        /* To simplify compress algorithm do vmap for source and target pages. */
        frame_ondisk = vmap(pages_disk, pages_per_frame, VM_MAP, PAGE_KERNEL);
        if (!frame_ondisk) {
                err = -ENOMEM;
                goto out1;
        }

        for (i = 0; i < pages_per_frame; i++)
                kmap(pages[i]);

        /* Map in-memory frame for read-only. */
        frame_mem = vmap(pages, pages_per_frame, VM_MAP, PAGE_KERNEL_RO);
        if (!frame_mem) {
                err = -ENOMEM;
                goto out2;
        }

        /*
         * From here on 'frame_mem' is mapped: every exit must go through
         * out3 so that the mapping is released.
         */

        mutex_lock(&sbi->compress.mtx_lznt);
        if (!sbi->compress.lznt) {
                /*
                 * LZNT implements two levels of compression:
                 * 0 - Standard compression
                 * 1 - Best compression, requires a lot of cpu
                 * use mount option?
                 */
                lznt = get_lznt_ctx(0);
                if (!lznt) {
                        mutex_unlock(&sbi->compress.mtx_lznt);
                        err = -ENOMEM;
                        goto out3;
                }

                /* The context is kept in 'sbi' and reused by later calls. */
                sbi->compress.lznt = lznt;
        }

        /* Compress: frame_mem -> frame_ondisk */
        compr_size = compress_lznt(frame_mem, frame_size, frame_ondisk,
                                   frame_size, sbi->compress.lznt);
        mutex_unlock(&sbi->compress.mtx_lznt);

        if (compr_size + sbi->cluster_size > frame_size) {
                /* Frame is not compressed. */
                compr_size = frame_size;
                ondisk_size = frame_size;
        } else if (compr_size) {
                /* Frame is compressed. */
                ondisk_size = ntfs_up_cluster(sbi, compr_size);
                /* Zero the tail up to the cluster boundary. */
                memset(frame_ondisk + compr_size, 0, ondisk_size - compr_size);
        } else {
                /* Frame is sparsed. */
                ondisk_size = 0;
        }

        down_write(&ni->file.run_lock);
        run_truncate_around(&ni->file.run, le64_to_cpu(attr->nres.svcn));
        err = attr_allocate_frame(ni, frame, compr_size, ni->i_valid);
        up_write(&ni->file.run_lock);
        if (err)
                goto out3;

        /* Sparse frame: nothing to write. */
        if (!ondisk_size)
                goto out3;

        /* Write the compressed copy, or the original pages if not compressed. */
        down_read(&ni->file.run_lock);
        err = ntfs_bio_pages(sbi, &ni->file.run,
                             ondisk_size < frame_size ? pages_disk : pages,
                             pages_per_frame, frame_vbo, ondisk_size,
                             REQ_OP_WRITE);
        up_read(&ni->file.run_lock);

out3:
        vunmap(frame_mem);

out2:
        for (i = 0; i < pages_per_frame; i++)
                kunmap(pages[i]);

        vunmap(frame_ondisk);
out1:
        /* Slots past a failed allocation are still NULL (kcalloc). */
        for (i = 0; i < pages_per_frame; i++) {
                pg = pages_disk[i];
                if (pg) {
                        kunmap(pg);
                        unlock_page(pg);
                        put_page(pg);
                }
        }
        kfree(pages_disk);
out:
        return err;
}

/*
 * ni_remove_name - Removes name 'de' from MFT and from directory.
 *
 * If the removed name has a paired name, that one is removed too and a
 * copy of it is stored in '*de2', placed 1024 bytes after 'de'.
 * NOTE(review): this assumes the buffer behind 'de' is large enough for
 * that second entry - confirm against the callers.
 *
 * '*undo_step' tells ni_remove_name_undo() how far the removal got:
 * 0 - nothing removed, 2 - 'de' removed, 4 - paired name removed as well.
 */
int ni_remove_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
                   struct NTFS_DE *de, struct NTFS_DE **de2, int *undo_step)
{
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        struct ATTR_FILE_NAME *key = (struct ATTR_FILE_NAME *)(de + 1);
        struct ATTR_FILE_NAME *found;
        struct ATTR_LIST_ENTRY *le;
        struct mft_inode *mi;
        u16 key_size = le16_to_cpu(de->key_size);
        u16 pair_size;
        u8 pair_type;
        int err;

        *undo_step = 0;

        /* The name is searched in the record together with its parent. */
        mi_get_ref(&dir_ni->mi, &key->home);

        found = ni_fname_name(ni, (struct le_str *)&key->name_len, &key->home,
                              &mi, &le);
        if (!found)
                return -ENOENT;

        /* Keep the duplicated info and remember which name type pairs with it. */
        memcpy(&key->dup, &found->dup, sizeof(struct NTFS_DUP_INFO));
        pair_type = paired_name(found->type);

        /* Mark ntfs as dirty. It will be cleared at umount. */
        ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);

        /* Step 1: directory index. */
        err = indx_delete_entry(&dir_ni->dir, dir_ni, found, key_size, sbi);
        if (err)
                return err;

        /* Step 2: MFT record. */
        ni_remove_attr_le(ni, attr_from_name(found), mi, le);
        *undo_step = 2;

        /* Is there a paired name to take care of? */
        found = ni_fname_type(ni, pair_type, &mi, &le);
        if (!found)
                return 0;

        /* Save the paired name for a possible undo. */
        pair_size = fname_full_size(found);
        *de2 = Add2Ptr(de, 1024);
        (*de2)->key_size = cpu_to_le16(pair_size);
        memcpy(*de2 + 1, found, pair_size);

        /* Step 3: paired name, directory index. */
        err = indx_delete_entry(&dir_ni->dir, dir_ni, found, pair_size, sbi);
        if (err)
                return err;

        /* Step 4: paired name, MFT record. */
        ni_remove_attr_le(ni, attr_from_name(found), mi, le);
        *undo_step = 4;

        return 0;
}

/*
 * ni_remove_name_undo - Paired function for ni_remove_name.
 *
 * Re-inserts the names removed by ni_remove_name(), newest first:
 * for 'undo_step' 4 the paired name 'de2' and then 'de', for 'undo_step' 2
 * only 'de'. Any other value restores nothing.
 *
 * Return: True if ok
 */
bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
                         struct NTFS_DE *de, struct NTFS_DE *de2, int undo_step)
{
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        struct ATTRIB *attr;
        u16 key_size;

        if (undo_step != 2 && undo_step != 4)
                return true;

        if (undo_step == 4) {
                /* Bring the paired name back into the MFT record ... */
                key_size = le16_to_cpu(de2->key_size);
                if (ni_insert_resident(ni, key_size, ATTR_NAME, NULL, 0, &attr,
                                       NULL, NULL))
                        return false;
                memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de2 + 1, key_size);

                /* ... complete the saved entry header ... */
                mi_get_ref(&ni->mi, &de2->ref);
                de2->size = cpu_to_le16(ALIGN(key_size, 8) +
                                        sizeof(struct NTFS_DE));
                de2->flags = 0;
                de2->res = 0;

                /* ... and put it back into the directory. */
                if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL, 1))
                        return false;
        }

        /* Steps 4 and 2 both end with restoring the primary name. */
        key_size = le16_to_cpu(de->key_size);
        if (ni_insert_resident(ni, key_size, ATTR_NAME, NULL, 0, &attr, NULL,
                               NULL))
                return false;

        memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, key_size);
        mi_get_ref(&ni->mi, &de->ref);

        if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1))
                return false;

        return true;
}

/*
 * ni_add_name - Add new name into MFT and into directory.
 */
int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
                struct NTFS_DE *de)
{
        int err;
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        struct ATTRIB *attr;
        struct ATTR_LIST_ENTRY *le;
        struct mft_inode *mi;
        struct ATTR_FILE_NAME *fname;
        struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1);
        u16 de_key_size = le16_to_cpu(de->key_size);

        if (sbi->options->windows_names &&
            !valid_windows_name(sbi, (struct le_str *)&de_name->name_len))
                return -EINVAL;

        /* If option "hide_dot_files" then set hidden attribute for dot files. */
        if (ni->mi.sbi->options->hide_dot_files) {
                if (de_name->name_len > 0 &&
                    le16_to_cpu(de_name->name[0]) == '.')
                        ni->std_fa |= FILE_ATTRIBUTE_HIDDEN;
                else
                        ni->std_fa &= ~FILE_ATTRIBUTE_HIDDEN;
        }

        mi_get_ref(&ni->mi, &de->ref);
        mi_get_ref(&dir_ni->mi, &de_name->home);

        /* Fill duplicate from any ATTR_NAME. */
        fname = ni_fname_name(ni, NULL, NULL, NULL, NULL);
        if (fname)
                memcpy(&de_name->dup, &fname->dup, sizeof(fname->dup));
        de_name->dup.fa = ni->std_fa;

        /* Insert new name into MFT. */
        err = ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, &attr,
                                 &mi, &le);
        if (err)
                return err;

        memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de_name, de_key_size);

        /* Insert new name into directory. */
        err = indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 0);
        if (err)
                ni_remove_attr_le(ni, attr, mi, le);

        return err;
}

/*
 * ni_rename - Remove one name and insert new name.
 *
 * There are two possible ways to rename:
 * 1) Add new name and remove old name.
 * 2) Remove old name and add new name.
 *
 * In most cases (not all!) adding new name into MFT and into directory can
 * allocate additional cluster(s).
 * Second way may result to bad inode if we can't add new name
 * and then can't restore (add) old name.
 *
 * Way 1 is used here. If the old name cannot be removed, the new name is
 * removed again; '*is_bad' is set to true when even that fails.
 * '*is_bad' is never cleared by this function.
 */
int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
              struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de,
              bool *is_bad)
{
        struct NTFS_DE *paired = NULL;
        int step = 0;
        int err;

        /* Add new name first ... */
        err = ni_add_name(new_dir_ni, ni, new_de);
        if (err)
                return err;

        /* ... then drop the old one. */
        err = ni_remove_name(dir_ni, ni, de, &paired, &step);
        if (!err)
                return 0;

        /* Old name is still there: take the new name back. */
        if (ni_remove_name(new_dir_ni, ni, new_de, &paired, &step))
                *is_bad = true;

        return err;
}

/*
 * ni_is_dirty - Return: True if 'ni' requires ni_write_inode.
 */
bool ni_is_dirty(struct inode *inode)
{
        struct ntfs_inode *ni = ntfs_i(inode);
        struct rb_node *node;

        if (ni->mi.dirty || ni->attr_list.dirty ||
            (ni->ni_flags & NI_FLAG_UPDATE_PARENT))
                return true;

        for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) {
                if (rb_entry(node, struct mft_inode, node)->dirty)
                        return true;
        }

        return false;
}

/*
 * ni_update_parent
 *
 * Update duplicate info of ATTR_FILE_NAME in MFT and in parent directories.
 *
 * @ni:   inode whose names are refreshed.
 * @dup:  duplicate info to propagate. The caller supplies the fields not
 *        written here; this function fills in the directory flag in 'fa',
 *        'alloc_size', 'data_size', 'reparse' and 'ea_size'.
 * @sync: forwarded unchanged to indx_update_dup().
 *
 * As a side effect the valid size of a non-resident ATTR_DATA is clamped to
 * its data size and written back into the attribute.
 *
 * Return: true if at least one parent directory could not be locked, i.e.
 * the update is incomplete and has to be retried later.
 */
static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup,
                             int sync)
{
        struct ATTRIB *attr;
        struct mft_inode *mi;
        struct ATTR_LIST_ENTRY *le = NULL;
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        struct super_block *sb = sbi->sb;
        bool re_dirty = false;

        if (ni->mi.mrec->flags & RECORD_FLAG_DIR) {
                /* Directories are reported with zero sizes. */
                dup->fa |= FILE_ATTRIBUTE_DIRECTORY;
                attr = NULL;
                dup->alloc_size = 0;
                dup->data_size = 0;
        } else {
                dup->fa &= ~FILE_ATTRIBUTE_DIRECTORY;

                /* Sizes come from the unnamed ATTR_DATA attribute, if any. */
                attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL,
                                    &mi);
                if (!attr) {
                        dup->alloc_size = dup->data_size = 0;
                } else if (!attr->non_res) {
                        u32 data_size = le32_to_cpu(attr->res.data_size);

                        /* Resident: allocated size is data size rounded up to 8. */
                        dup->alloc_size = cpu_to_le64(ALIGN(data_size, 8));
                        dup->data_size = cpu_to_le64(data_size);
                } else {
                        u64 new_valid = ni->i_valid;
                        u64 data_size = le64_to_cpu(attr->nres.data_size);
                        __le64 valid_le;

                        /* When is_attr_ext() holds, report total_size instead of alloc_size. */
                        dup->alloc_size = is_attr_ext(attr) ?
                                                  attr->nres.total_size :
                                                  attr->nres.alloc_size;
                        dup->data_size = attr->nres.data_size;

                        /* Valid size never exceeds data size. */
                        if (new_valid > data_size)
                                new_valid = data_size;

                        /* Write the in-memory valid size back if it differs. */
                        valid_le = cpu_to_le64(new_valid);
                        if (valid_le != attr->nres.valid_size) {
                                attr->nres.valid_size = valid_le;
                                mi->dirty = true;
                        }
                }
        }

        /* TODO: Fill reparse info. */
        dup->reparse = 0;
        dup->ea_size = 0;

        if (ni->ni_flags & NI_FLAG_EA) {
                /*
                 * NOTE(review): 'attr' and 'le' are carried over from the
                 * ATTR_DATA lookup above - presumably the search resumes
                 * from that position; confirm against ni_find_attr().
                 */
                attr = ni_find_attr(ni, attr, &le, ATTR_EA_INFO, NULL, 0, NULL,
                                    NULL);
                if (attr) {
                        const struct EA_INFO *info;

                        info = resident_data_ex(attr, sizeof(struct EA_INFO));
                        /* If ATTR_EA_INFO exists 'info' can't be NULL. */
                        if (info)
                                dup->ea_size = info->size_pack;
                }
        }

        /* Restart enumeration from the beginning for the name attributes. */
        attr = NULL;
        le = NULL;

        /* Visit every ATTR_NAME of the inode. */
        while ((attr = ni_find_attr(ni, attr, &le, ATTR_NAME, NULL, 0, NULL,
                                    &mi))) {
                struct inode *dir;
                struct ATTR_FILE_NAME *fname;

                /* Skip unusable names and names whose duplicate info is already current. */
                fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
                if (!fname || !memcmp(&fname->dup, dup, sizeof(fname->dup)))
                        continue;

                /* Check simple case when parent inode equals current inode. */
                if (ino_get(&fname->home) == ni->vfs_inode.i_ino) {
                        ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
                        continue;
                }

                /* ntfs_iget5 may sleep. */
                dir = ntfs_iget5(sb, &fname->home, NULL);
                if (IS_ERR(dir)) {
                        ntfs_inode_warn(
                                &ni->vfs_inode,
                                "failed to open parent directory r=%lx to update",
                                (long)ino_get(&fname->home));
                        continue;
                }

                if (!is_bad_inode(dir)) {
                        struct ntfs_inode *dir_ni = ntfs_i(dir);

                        if (!ni_trylock(dir_ni)) {
                                /* Parent is busy: report it so the caller retries. */
                                re_dirty = true;
                        } else {
                                indx_update_dup(dir_ni, sbi, fname, dup, sync);
                                ni_unlock(dir_ni);
                                /*
                                 * Refresh the copy in the MFT only once the
                                 * directory has been processed, so a skipped
                                 * parent is detected again by the memcmp()
                                 * above on the next pass.
                                 */
                                memcpy(&fname->dup, dup, sizeof(fname->dup));
                                mi->dirty = true;
                        }
                }
                /* Drop the reference taken by ntfs_iget5(). */
                iput(dir);
        }

        return re_dirty;
}

/*
 * ni_write_inode - Write MFT base record and all subrecords to disk.
 *
 * @inode: VFS inode to write back.
 * @sync:  forwarded to ni_update_parent(), al_update() and mi_write().
 * @hint:  caller-supplied text used as prefix of the error message.
 *
 * Return: 0 on success and also when nothing is done (bad inode, read-only
 * superblock, inode currently locked, no MFT record loaded); -EIO after a
 * forced shutdown; -EINVAL if the standard information attribute is missing;
 * otherwise the first error reported while updating the attribute list or
 * writing records.
 */
int ni_write_inode(struct inode *inode, int sync, const char *hint)
{
        int err = 0, err2;
        struct ntfs_inode *ni = ntfs_i(inode);
        struct super_block *sb = inode->i_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        bool re_dirty = false;
        struct ATTR_STD_INFO *std;
        struct rb_node *node, *next;
        struct NTFS_DUP_INFO dup;

        /* Nothing to write for bad inodes or on a read-only mount. */
        if (is_bad_inode(inode) || sb_rdonly(sb))
                return 0;

        if (unlikely(ntfs3_forced_shutdown(sb)))
                return -EIO;

        if (!ni_trylock(ni)) {
                /* 'ni' is under modification, skip for now. */
                mark_inode_dirty_sync(inode);
                return 0;
        }

        if (!ni->mi.mrec)
                goto out;

        /*
         * Metadata refresh is done only for records that are in use, while
         * the log is not being replayed, and while the inode still has links.
         */
        if (is_rec_inuse(ni->mi.mrec) &&
            !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) {
                bool modified = false;
                struct timespec64 ts;

                /* Update times in standard attribute. */
                std = ni_std(ni);
                if (!std) {
                        err = -EINVAL;
                        goto out;
                }

                /*
                 * Copy mtime/ctime/atime into 'dup' and, where they differ,
                 * into the standard information attribute.
                 */
                ts = inode_get_mtime(inode);
                dup.m_time = kernel2nt(&ts);
                if (std->m_time != dup.m_time) {
                        std->m_time = dup.m_time;
                        modified = true;
                }

                ts = inode_get_ctime(inode);
                dup.c_time = kernel2nt(&ts);
                if (std->c_time != dup.c_time) {
                        std->c_time = dup.c_time;
                        modified = true;
                }

                ts = inode_get_atime(inode);
                dup.a_time = kernel2nt(&ts);
                if (std->a_time != dup.a_time) {
                        std->a_time = dup.a_time;
                        modified = true;
                }

                /* Same treatment for the file attribute flags. */
                dup.fa = ni->std_fa;
                if (std->fa != dup.fa) {
                        std->fa = dup.fa;
                        modified = true;
                }

                /* std attribute is always in primary MFT record. */
                if (modified)
                        ni->mi.dirty = true;

                if (!ntfs_is_meta_file(sbi, inode->i_ino) &&
                    (modified || (ni->ni_flags & NI_FLAG_UPDATE_PARENT))
                    /* Avoid __wait_on_freeing_inode(inode). */
                    && (sb->s_flags & SB_ACTIVE)) {
                        dup.cr_time = std->cr_time;
                        /* Not critical if this function fail. */
                        re_dirty = ni_update_parent(ni, &dup, sync);

                        /* Remember whether a parent update is still pending. */
                        if (re_dirty)
                                ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
                        else
                                ni->ni_flags &= ~NI_FLAG_UPDATE_PARENT;
                }

                /* Update attribute list. */
                if (ni->attr_list.size && ni->attr_list.dirty) {
                        /* For the $MFT record itself this is attempted only when 'sync' is set. */
                        if (inode->i_ino != MFT_REC_MFT || sync) {
                                err = ni_try_remove_attr_list(ni);
                                if (err)
                                        goto out;
                        }

                        err = al_update(ni, sync);
                        if (err)
                                goto out;
                }
        }

        /*
         * Write every dirty subrecord. 'next' is fetched before the body
         * runs because the current node may be erased from the tree below.
         */
        for (node = rb_first(&ni->mi_tree); node; node = next) {
                struct mft_inode *mi = rb_entry(node, struct mft_inode, node);
                bool is_empty;

                next = rb_next(node);

                if (!mi->dirty)
                        continue;

                /* A record without any attribute is released. */
                is_empty = !mi_enum_attr(mi, NULL);

                if (is_empty)
                        clear_rec_inuse(mi->mrec);

                /* Keep going on failure, but remember the first error. */
                err2 = mi_write(mi, sync);
                if (!err && err2)
                        err = err2;

                if (is_empty) {
                        ntfs_mark_rec_free(sbi, mi->rno, false);
                        rb_erase(node, &ni->mi_tree);
                        mi_put(mi);
                }
        }

        /* The base record is written after all subrecords. */
        if (ni->mi.dirty) {
                err2 = mi_write(&ni->mi, sync);
                if (!err && err2)
                        err = err2;
        }
out:
        ni_unlock(ni);

        if (err) {
                ntfs_inode_err(inode, "%s failed, %d.", hint, err);
                ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
                return err;
        }

        /* A parent directory was busy: keep the inode dirty for another pass. */
        if (re_dirty)
                mark_inode_dirty_sync(inode);

        return 0;
}




















































































    1 





    4 
    1 
    4 




    5 
    1 
    4 





    1 





    1 
    1 




    1 







    1 
    1 

























































    1 
    1 




























































    1 





    1 





























    1 




























    1 








    1 























    1 

    1 























































    1 
    1 



    1 





















    1 












































    1 











































    1 























    1 


























































    1 


    1 

    1 



















































    1 













    1 


    1 










    1 







    1 
































































































    1 


    1 







    1 




    1 




    1 






    1 














































































































































































































































    1 
    1 
    1 
    1 




































































































































































































































































































































    3 


    1 

    4 


































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>
#include <linux/sysctl.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * New pipe buffers will be restricted to this size while the user is exceeding
 * their pipe buffer quota. The general pipe use case needs at least two
 * buffers: one for data yet to be read, and one for new data. If this is less
 * than two, then a write to a non-empty pipe may block even if the pipe is not
 * full. This can occur with GNU make jobserver or similar uses of pipes as
 * semaphores: multiple processes may be waiting to write tokens back to the
 * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
 *
 * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
 * own risk, namely: pipe writes to non-full pipes may block until the pipe is
 * emptied.
 */
#define PIPE_MIN_DEF_BUFFERS 2

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
static unsigned int pipe_max_size = 1048576;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
static unsigned long pipe_user_pages_hard;
static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally.  This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

/*
 * Three-way comparison: evaluates to -1, 0 or 1 when @l is less than, equal
 * to or greater than @r.  Both arguments are parenthesized so that operands
 * containing lower-precedence operators (e.g. "a & b") expand correctly.
 * Each argument is evaluated twice, so it must be free of side effects.
 */
#define cmp_int(l, r)                (((l) > (r)) - ((l) < (r)))

#ifdef CONFIG_PROVE_LOCKING
/*
 * Order two lockdep maps by their addresses.
 *
 * Return: the cmp_int() result for the two addresses (negative, zero or
 * positive when @a is below, equal to or above @b).
 */
static int pipe_lock_cmp_fn(const struct lockdep_map *a,
                            const struct lockdep_map *b)
{
        unsigned long addr_a = (unsigned long) a;
        unsigned long addr_b = (unsigned long) b;

        return cmp_int(addr_a, addr_b);
}
#endif

/*
 * pipe_lock - take the pipe mutex.
 * @pipe: the pipe to lock
 *
 * The mutex is taken only when @pipe->files is non-zero; otherwise the call
 * does nothing.
 */
void pipe_lock(struct pipe_inode_info *pipe)
{
        if (!pipe->files)
                return;

        mutex_lock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_lock);

/*
 * pipe_unlock - release the pipe mutex.
 * @pipe: the pipe to unlock
 *
 * The mutex is released only when @pipe->files is non-zero; otherwise the
 * call does nothing.
 */
void pipe_unlock(struct pipe_inode_info *pipe)
{
        if (!pipe->files)
                return;

        mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

/*
 * pipe_double_lock - lock two distinct pipes.
 * @pipe1: one pipe
 * @pipe2: the other pipe; must differ from @pipe1 (BUG otherwise)
 *
 * The pipe at the lower address is always locked first, so the order in
 * which the caller passes the two pipes does not matter.
 */
void pipe_double_lock(struct pipe_inode_info *pipe1,
                      struct pipe_inode_info *pipe2)
{
        struct pipe_inode_info *lower;
        struct pipe_inode_info *higher;

        BUG_ON(pipe1 == pipe2);

        if (pipe1 > pipe2) {
                lower = pipe2;
                higher = pipe1;
        } else {
                lower = pipe1;
                higher = pipe2;
        }

        pipe_lock(lower);
        pipe_lock(higher);
}

/*
 * Release the page of an anonymous pipe buffer.
 *
 * A page that nobody else references is parked in @pipe->tmp_page as a
 * one-deep allocation cache, provided that slot is still free.  In every
 * other case our reference to the page is simply dropped.
 */
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
                                  struct pipe_buffer *buf)
{
        struct page *pg = buf->page;

        if (page_count(pg) != 1 || pipe->tmp_page) {
                put_page(pg);
                return;
        }

        pipe->tmp_page = pg;
}

/*
 * Try to take ownership of the page of an anonymous pipe buffer.
 *
 * Succeeds only when the page has exactly one reference; the page is then
 * uncharged via memcg_kmem_uncharge_page() and marked locked.
 *
 * Return: true if the page was stolen, false otherwise.
 */
static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct page *pg = buf->page;

        if (page_count(pg) == 1) {
                memcg_kmem_uncharge_page(pg, 0);
                __SetPageLocked(pg);
                return true;
        }

        return false;
}

/**
 * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to attempt to steal
 *
 * Description:
 *        This function attempts to steal the &struct page attached to
 *        @buf. If successful, this function returns true and returns with
 *        the page locked. The caller may then reuse the page for whatever
 *        they wish; the typical use is insertion into a different file
 *        page cache. If the page has any other reference, nothing is
 *        changed and false is returned.
 */
bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct page *pg = buf->page;

        /* More than our own reference: somebody else still uses the page. */
        if (page_count(pg) != 1)
                return false;

        /* We hold the only reference, so the page can be locked and handed over. */
        lock_page(pg);
        return true;
}
EXPORT_SYMBOL(generic_pipe_buf_try_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to get a reference to
 *
 * Description:
 *        This function grabs an extra reference to the page of @buf. It's
 *        used in the tee() system call, when we duplicate the buffers in
 *        one pipe into another. The result of try_get_page() is passed
 *        back to the caller.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
        struct page *pg = buf->page;

        return try_get_page(pg);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to put a reference to
 *
 * Description:
 *        This function drops one reference to the page attached to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
                              struct pipe_buffer *buf)
{
        struct page *pg = buf->page;

        put_page(pg);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

/*
 * Buffer operations for anonymous pipe buffers: taking a reference uses the
 * generic helper, while release and steal use the anon-specific variants.
 */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
        .get = generic_pipe_buf_get,
        .try_steal = anon_pipe_buf_try_steal,
        .release = anon_pipe_buf_release,
};

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
        unsigned int head = READ_ONCE(pipe->head);
        unsigned int tail = READ_ONCE(pipe->tail);
        unsigned int writers = READ_ONCE(pipe->writers);

        return !pipe_empty(head, tail) || !writers;
}

/*
 * Release @buf and advance the pipe tail past it.
 *
 * @pipe: the pipe being read
 * @buf:  the buffer at the current tail
 * @tail: the current tail index
 *
 * Return: the new tail index, which has also been stored in @pipe->tail.
 */
static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
                                            struct pipe_buffer *buf,
                                            unsigned int tail)
{
        unsigned int next = tail + 1;

        pipe_buf_release(pipe, buf);

        /*
         * Without a watch_queue the mutex is sufficient protection, so
         * the tail can be bumped without taking the spinlock.
         */
        if (!pipe_has_watch_queue(pipe)) {
                pipe->tail = next;
                return next;
        }

        /*
         * With a watch_queue, notifications get posted holding only the
         * rd_wait spinlock (no mutex), so the update must happen under
         * that spinlock as well.
         */
        spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
        if (buf->flags & PIPE_BUF_FLAG_LOSS)
                pipe->note_loss = true;
#endif
        pipe->tail = next;
        spin_unlock_irq(&pipe->rd_wait.lock);

        return next;
}

/*
 * pipe_read - read data from a pipe into an iov_iter.
 *
 * @iocb: kiocb of the read; supplies the file and the IOCB_NOWAIT flag
 * @to:   destination iterator; its byte count is the amount requested
 *
 * Copies buffers from the tail of the ring until the request is satisfied
 * or the ring is empty.  Sleeps only if nothing has been read yet, there are
 * still writers, and neither O_NONBLOCK nor IOCB_NOWAIT is set.
 *
 * Return: number of bytes read (0 for a zero-length request, or when the
 * pipe is empty and has no writers), or a negative error when nothing was
 * read: -EAGAIN for a non-blocking read, -ERESTARTSYS if the wait was
 * interrupted, -EFAULT if copying out failed, -ENOBUFS if the destination
 * is too small for a whole-buffer or loss record, or the error from
 * pipe_buf_confirm().
 */
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
        size_t total_len = iov_iter_count(to);
        struct file *filp = iocb->ki_filp;
        struct pipe_inode_info *pipe = filp->private_data;
        bool was_full, wake_next_reader = false;
        ssize_t ret;

        /* Null read succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        ret = 0;
        mutex_lock(&pipe->mutex);

        /*
         * We only wake up writers if the pipe was full when we started
         * reading in order to avoid unnecessary wakeups.
         *
         * But when we do wake up writers, we do so using a sync wakeup
         * (WF_SYNC), because we want them to get going and generate more
         * data for us.
         */
        was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
        for (;;) {
                /* Read ->head with a barrier vs post_one_notification() */
                unsigned int head = smp_load_acquire(&pipe->head);
                unsigned int tail = pipe->tail;
                unsigned int mask = pipe->ring_size - 1;

#ifdef CONFIG_WATCH_QUEUE
                /* A pending loss is reported to the reader as a meta notification. */
                if (pipe->note_loss) {
                        struct watch_notification n;

                        /* NOTE(review): 8 is presumably sizeof(n) - confirm. */
                        if (total_len < 8) {
                                if (ret == 0)
                                        ret = -ENOBUFS;
                                break;
                        }

                        n.type = WATCH_TYPE_META;
                        n.subtype = WATCH_META_LOSS_NOTIFICATION;
                        n.info = watch_sizeof(n);
                        if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
                                if (ret == 0)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += sizeof(n);
                        total_len -= sizeof(n);
                        pipe->note_loss = false;
                }
#endif

                if (!pipe_empty(head, tail)) {
                        struct pipe_buffer *buf = &pipe->bufs[tail & mask];
                        size_t chars = buf->len;
                        size_t written;
                        int error;

                        if (chars > total_len) {
                                /* A WHOLE buffer is never handed out partially. */
                                if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
                                        if (ret == 0)
                                                ret = -ENOBUFS;
                                        break;
                                }
                                chars = total_len;
                        }

                        error = pipe_buf_confirm(pipe, buf);
                        if (error) {
                                if (!ret)
                                        ret = error;
                                break;
                        }

                        /* A short copy is an error only if nothing was read before. */
                        written = copy_page_to_iter(buf->page, buf->offset, chars, to);
                        if (unlikely(written < chars)) {
                                if (!ret)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += chars;
                        buf->offset += chars;
                        buf->len -= chars;

                        /* Was it a packet buffer? Clean up and exit */
                        if (buf->flags & PIPE_BUF_FLAG_PACKET) {
                                /* Makes total_len hit zero below and discards the rest of the packet. */
                                total_len = chars;
                                buf->len = 0;
                        }

                        /* A fully consumed buffer is released and the tail advanced. */
                        if (!buf->len)
                                tail = pipe_update_tail(pipe, buf, tail);
                        total_len -= chars;
                        if (!total_len)
                                break;        /* common path: read succeeded */
                        if (!pipe_empty(head, tail))        /* More to do? */
                                continue;
                }

                /* Ring is empty: no writers means EOF, partial data is returned as is. */
                if (!pipe->writers)
                        break;
                if (ret)
                        break;
                if ((filp->f_flags & O_NONBLOCK) ||
                    (iocb->ki_flags & IOCB_NOWAIT)) {
                        ret = -EAGAIN;
                        break;
                }
                mutex_unlock(&pipe->mutex);

                /*
                 * We only get here if we didn't actually read anything.
                 *
                 * However, we could have seen (and removed) a zero-sized
                 * pipe buffer, and might have made space in the buffers
                 * that way.
                 *
                 * You can't make zero-sized pipe buffers by doing an empty
                 * write (not even in packet mode), but they can happen if
                 * the writer gets an EFAULT when trying to fill a buffer
                 * that already got allocated and inserted in the buffer
                 * array.
                 *
                 * So we still need to wake up any pending writers in the
                 * _very_ unlikely case that the pipe was full, but we got
                 * no data.
                 */
                if (unlikely(was_full))
                        wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

                /*
                 * But because we didn't read anything, at this point we can
                 * just return directly with -ERESTARTSYS if we're interrupted,
                 * since we've done any required wakeups and there's no need
                 * to mark anything accessed. And we've dropped the lock.
                 */
                if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
                        return -ERESTARTSYS;

                /* Woken up: retake the lock and re-sample the "full" state. */
                mutex_lock(&pipe->mutex);
                was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
                wake_next_reader = true;
        }
        /* No point in waking another reader if we left the pipe empty. */
        if (pipe_empty(pipe->head, pipe->tail))
                wake_next_reader = false;
        mutex_unlock(&pipe->mutex);

        /* All wakeups are issued after the mutex has been dropped. */
        if (was_full)
                wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
        if (wake_next_reader)
                wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        if (ret > 0)
                file_accessed(filp);
        return ret;
}

/*
 * Tell whether @file has O_DIRECT set in its flags, which pipes use to
 * select packet mode.  Returns 1 if it is set, 0 otherwise.
 */
static inline int is_packetized(struct file *file)
{
        if (file->f_flags & O_DIRECT)
                return 1;
        return 0;
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
        unsigned int head = READ_ONCE(pipe->head);
        unsigned int tail = READ_ONCE(pipe->tail);
        unsigned int max_usage = READ_ONCE(pipe->max_usage);

        return !pipe_full(head, tail, max_usage) ||
                !READ_ONCE(pipe->readers);
}

/*
 * pipe_write - copy the data described by @from into the pipe behind
 * @iocb->ki_filp.
 *
 * The write proceeds in two phases under pipe->mutex:
 *  1. If the pipe is not empty and the write length is not a multiple of
 *     PAGE_SIZE, the sub-page remainder is appended to the most recently
 *     queued buffer, provided that buffer is flagged PIPE_BUF_FLAG_CAN_MERGE
 *     and the data fits in its page.
 *  2. The rest is copied one page at a time into newly inserted ring slots,
 *     sleeping (with the mutex dropped) whenever the ring is full.
 *
 * Returns the number of bytes written, 0 for a zero-length write, or a
 * negative errno:
 *   -EXDEV        the pipe has a watch queue attached;
 *   -EPIPE        no readers (SIGPIPE is also sent to the caller);
 *   -EFAULT       copying from @from fell short;
 *   -ENOMEM       no page could be allocated;
 *   -EAGAIN       ring full and O_NONBLOCK or IOCB_NOWAIT is set;
 *   -ERESTARTSYS  ring full and a signal is pending.
 * Inside the copy loop these errors are reported only when nothing has been
 * written yet; otherwise the partial byte count is returned.  An error from
 * pipe_buf_confirm() or file_update_time() is returned as-is.
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *filp = iocb->ki_filp;
        struct pipe_inode_info *pipe = filp->private_data;
        unsigned int head;
        ssize_t ret = 0;
        size_t total_len = iov_iter_count(from);
        ssize_t chars;
        bool was_empty = false;
        bool wake_next_writer = false;

        /*
         * Reject writing to watch queue pipes before the point where we lock
         * the pipe.
         * Otherwise, lockdep would be unhappy if the caller already has another
         * pipe locked.
         * If we had to support locking a normal pipe and a notification pipe at
         * the same time, we could set up lockdep annotations for that, but
         * since we don't actually need that, it's simpler to just bail here.
         */
        if (pipe_has_watch_queue(pipe))
                return -EXDEV;

        /* Null write succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        mutex_lock(&pipe->mutex);

        if (!pipe->readers) {
                send_sig(SIGPIPE, current, 0);
                ret = -EPIPE;
                goto out;
        }

        /*
         * If it wasn't empty we try to merge new data into
         * the last buffer.
         *
         * That naturally merges small writes, but it also
         * page-aligns the rest of the writes for large writes
         * spanning multiple pages.
         */
        head = pipe->head;
        was_empty = pipe_empty(head, pipe->tail);
        /* Sub-page remainder of the write: the only part eligible for merging. */
        chars = total_len & (PAGE_SIZE-1);
        if (chars && !was_empty) {
                unsigned int mask = pipe->ring_size - 1;
                /* (head - 1) is the slot that was filled most recently. */
                struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
                int offset = buf->offset + buf->len;

                if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
                    offset + chars <= PAGE_SIZE) {
                        ret = pipe_buf_confirm(pipe, buf);
                        if (ret)
                                goto out;

                        ret = copy_page_from_iter(buf->page, offset, chars, from);
                        if (unlikely(ret < chars)) {
                                ret = -EFAULT;
                                goto out;
                        }

                        buf->len += ret;
                        /* The whole write fitted into the existing buffer. */
                        if (!iov_iter_count(from))
                                goto out;
                }
        }

        for (;;) {
                /* Readers may have gone away while we slept below. */
                if (!pipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }

                head = pipe->head;
                if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
                        unsigned int mask = pipe->ring_size - 1;
                        struct pipe_buffer *buf;
                        /* Reuse the page cached in pipe->tmp_page if there is one. */
                        struct page *page = pipe->tmp_page;
                        int copied;

                        if (!page) {
                                page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
                                if (unlikely(!page)) {
                                        /* Keep any partial byte count. */
                                        ret = ret ? : -ENOMEM;
                                        break;
                                }
                                pipe->tmp_page = page;
                        }

                        /* Allocate a slot in the ring in advance and attach an
                         * empty buffer.  If we fault or otherwise fail to use
                         * it, either the reader will consume it or it'll still
                         * be there for the next write.
                         */
                        pipe->head = head + 1;

                        /* Insert it into the buffer array */
                        buf = &pipe->bufs[head & mask];
                        buf->page = page;
                        buf->ops = &anon_pipe_buf_ops;
                        buf->offset = 0;
                        buf->len = 0;
                        if (is_packetized(filp))
                                buf->flags = PIPE_BUF_FLAG_PACKET;
                        else
                                buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
                        /* The page now belongs to the ring slot. */
                        pipe->tmp_page = NULL;

                        copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
                        /* A short copy with data still pending means the copy failed. */
                        if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
                                if (!ret)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += copied;
                        buf->len = copied;

                        if (!iov_iter_count(from))
                                break;
                }

                /*
                 * NOTE(review): 'head' still holds the value sampled at the
                 * top of this iteration, so this appears to repeat the test
                 * above and to be true exactly when a slot was just filled;
                 * the next iteration re-reads pipe->head.
                 */
                if (!pipe_full(head, pipe->tail, pipe->max_usage))
                        continue;

                /* Wait for buffer space to become available. */
                if ((filp->f_flags & O_NONBLOCK) ||
                    (iocb->ki_flags & IOCB_NOWAIT)) {
                        if (!ret)
                                ret = -EAGAIN;
                        break;
                }
                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }

                /*
                 * We're going to release the pipe lock and wait for more
                 * space. We wake up any readers if necessary, and then
                 * after waiting we need to re-check whether the pipe
                 * became empty while we dropped the lock.
                 */
                mutex_unlock(&pipe->mutex);
                if (was_empty)
                        wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
                mutex_lock(&pipe->mutex);
                was_empty = pipe_empty(pipe->head, pipe->tail);
                /* We slept as an exclusive waiter, so pass the wakeup on at exit. */
                wake_next_writer = true;
        }
out:
        /* No point waking another writer if there is no room for it. */
        if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                wake_next_writer = false;
        mutex_unlock(&pipe->mutex);

        /*
         * If we do do a wakeup event, we do a 'sync' wakeup, because we
         * want the reader to start processing things asap, rather than
         * leave the data pending.
         *
         * This is particularly important for small writes, because of
         * how (for example) the GNU make jobserver uses small writes to
         * wake up pending jobs
         *
         * Epoll nonsensically wants a wakeup whether the pipe
         * was already empty or not.
         */
        if (was_empty || pipe->poll_usage)
                wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        if (wake_next_writer)
                wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
        /*
         * Timestamps are only updated when data was written and the
         * superblock write access could be taken without blocking.
         */
        if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
                int err = file_update_time(filp);
                if (err)
                        ret = err;
                sb_end_write(file_inode(filp)->i_sb);
        }
        return ret;
}

/*
 * ioctl handler for pipes.
 *
 * FIONREAD stores the summed length of all queued buffers to the user
 * integer at @arg.  With CONFIG_WATCH_QUEUE the watch-queue size and filter
 * commands are forwarded to the watch queue code.  Any other command yields
 * -ENOIOCTLCMD.
 */
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        struct pipe_inode_info *pipe = filp->private_data;
        unsigned int count, head, tail, mask;

        switch (cmd) {
        case FIONREAD:
                count = 0;

                /* Walk the occupied slots from tail to head under the lock. */
                mutex_lock(&pipe->mutex);
                mask = pipe->ring_size - 1;
                head = pipe->head;
                for (tail = pipe->tail; tail != head; tail++)
                        count += pipe->bufs[tail & mask].len;
                mutex_unlock(&pipe->mutex);

                return put_user(count, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
        case IOC_WATCH_QUEUE_SET_SIZE: {
                int err;

                mutex_lock(&pipe->mutex);
                err = watch_queue_set_size(pipe, arg);
                mutex_unlock(&pipe->mutex);
                return err;
        }

        case IOC_WATCH_QUEUE_SET_FILTER:
                return watch_queue_set_filter(
                        pipe, (struct watch_notification_filter __user *)arg);
#endif

        default:
                return -ENOIOCTLCMD;
        }
}

/*
 * pipe_poll - ->poll handler for pipes and FIFOs.
 *
 * Registers @filp on the read and/or write wait queue (depending on its
 * open mode) and returns the current readiness mask:
 *   EPOLLIN | EPOLLRDNORM   readable end and the ring is not empty;
 *   EPOLLHUP                readable end, no writers, and f_version differs
 *                           from the pipe's w_counter;
 *   EPOLLOUT | EPOLLWRNORM  writable end and the ring is not full;
 *   EPOLLERR                writable end and no readers.
 *
 * No kernel lock held - fine
 */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
        __poll_t mask;
        struct pipe_inode_info *pipe = filp->private_data;
        unsigned int head, tail;

        /* Epoll has some historical nasty semantics, this enables them */
        WRITE_ONCE(pipe->poll_usage, true);

        /*
         * Reading pipe state only -- no need to take the pipe mutex.
         *
         * But because this is racy, the code has to add the
         * entry to the poll table _first_ ..
         */
        if (filp->f_mode & FMODE_READ)
                poll_wait(filp, &pipe->rd_wait, wait);
        if (filp->f_mode & FMODE_WRITE)
                poll_wait(filp, &pipe->wr_wait, wait);

        /*
         * .. and only then can you do the racy tests. That way,
         * if something changes and you got it wrong, the poll
         * table entry will wake you up and fix it.
         */
        head = READ_ONCE(pipe->head);
        tail = READ_ONCE(pipe->tail);

        mask = 0;
        if (filp->f_mode & FMODE_READ) {
                if (!pipe_empty(head, tail))
                        mask |= EPOLLIN | EPOLLRDNORM;
                /*
                 * fifo_open() sets f_version to w_counter for a non-blocking
                 * reader with no writer, which keeps EPOLLHUP suppressed
                 * until w_counter changes.
                 */
                if (!pipe->writers && filp->f_version != pipe->w_counter)
                        mask |= EPOLLHUP;
        }

        if (filp->f_mode & FMODE_WRITE) {
                if (!pipe_full(head, tail, pipe->max_usage))
                        mask |= EPOLLOUT | EPOLLWRNORM;
                /*
                 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
                 * behave exactly like pipes for poll().
                 */
                if (!pipe->readers)
                        mask |= EPOLLERR;
        }

        return mask;
}

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
        int kill = 0;

        spin_lock(&inode->i_lock);
        if (!--pipe->files) {
                inode->i_pipe = NULL;
                kill = 1;
        }
        spin_unlock(&inode->i_lock);

        if (kill)
                free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
        struct pipe_inode_info *pipe = file->private_data;

        mutex_lock(&pipe->mutex);
        if (file->f_mode & FMODE_READ)
                pipe->readers--;
        if (file->f_mode & FMODE_WRITE)
                pipe->writers--;

        /* Was that the last reader or writer, but not the other side? */
        if (!pipe->readers != !pipe->writers) {
                wake_up_interruptible_all(&pipe->rd_wait);
                wake_up_interruptible_all(&pipe->wr_wait);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        mutex_unlock(&pipe->mutex);

        put_pipe_info(inode, pipe);
        return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
        struct pipe_inode_info *pipe = filp->private_data;
        int retval = 0;

        mutex_lock(&pipe->mutex);
        if (filp->f_mode & FMODE_READ)
                retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
        if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
                retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
                if (retval < 0 && (filp->f_mode & FMODE_READ))
                        /* this can happen only if on == T */
                        fasync_helper(-1, filp, 0, &pipe->fasync_readers);
        }
        mutex_unlock(&pipe->mutex);
        return retval;
}

/*
 * Adjust @user's pipe buffer count by (@new - @old) in one atomic step and
 * return the resulting total.
 */
unsigned long account_pipe_buffers(struct user_struct *user,
                                   unsigned long old, unsigned long new)
{
        long delta = new - old;

        return atomic_long_add_return(delta, &user->pipe_bufs);
}

bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
        unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

        return soft_limit && user_bufs > soft_limit;
}

bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
        unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

        return hard_limit && user_bufs > hard_limit;
}

bool pipe_is_unprivileged_user(void)
{
        return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}

/*
 * Allocate and initialise a pipe_inode_info for the current user.
 *
 * The ring starts with PIPE_DEF_BUFFERS slots.  That is reduced to fit
 * pipe_max_size unless the caller has CAP_SYS_RESOURCE, and reduced to
 * PIPE_MIN_DEF_BUFFERS when an unprivileged user is over the soft limit.
 * An unprivileged user over the hard limit gets no pipe at all.
 *
 * Returns the new pipe, or NULL on allocation failure or when the hard
 * limit refuses the request.  On failure the accounting and the user
 * reference are released again.
 */
struct pipe_inode_info *alloc_pipe_info(void)
{
        struct pipe_inode_info *pipe;
        unsigned long nr_bufs = PIPE_DEF_BUFFERS;
        struct user_struct *user = get_current_user();
        unsigned long accounted;
        unsigned int max_size = READ_ONCE(pipe_max_size);

        pipe = kzalloc(sizeof(*pipe), GFP_KERNEL_ACCOUNT);
        if (!pipe)
                goto out_free_uid;

        if (nr_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
                nr_bufs = max_size >> PAGE_SHIFT;

        accounted = account_pipe_buffers(user, 0, nr_bufs);

        /* Over the soft limit: fall back to the minimum default size. */
        if (too_many_pipe_buffers_soft(accounted) && pipe_is_unprivileged_user()) {
                accounted = account_pipe_buffers(user, nr_bufs, PIPE_MIN_DEF_BUFFERS);
                nr_bufs = PIPE_MIN_DEF_BUFFERS;
        }

        if (too_many_pipe_buffers_hard(accounted) && pipe_is_unprivileged_user())
                goto out_revert_acct;

        pipe->bufs = kcalloc(nr_bufs, sizeof(struct pipe_buffer),
                             GFP_KERNEL_ACCOUNT);
        if (!pipe->bufs)
                goto out_revert_acct;

        init_waitqueue_head(&pipe->rd_wait);
        init_waitqueue_head(&pipe->wr_wait);
        pipe->r_counter = 1;
        pipe->w_counter = 1;
        pipe->max_usage = nr_bufs;
        pipe->ring_size = nr_bufs;
        pipe->nr_accounted = nr_bufs;
        pipe->user = user;
        mutex_init(&pipe->mutex);
        lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
        return pipe;

out_revert_acct:
        (void) account_pipe_buffers(user, nr_bufs, 0);
        kfree(pipe);
out_free_uid:
        free_uid(user);
        return NULL;
}

/*
 * Tear down @pipe: undo its buffer accounting, drop the user reference,
 * release every ring slot that still has buffer ops attached, free the
 * cached spare page if any, and free the ring and the pipe itself.  With
 * CONFIG_WATCH_QUEUE an attached watch queue is cleared first and its
 * reference is dropped after the buffers have been released.
 */
void free_pipe_info(struct pipe_inode_info *pipe)
{
        unsigned int slot;

#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue)
                watch_queue_clear(pipe->watch_queue);
#endif

        (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
        free_uid(pipe->user);

        for (slot = 0; slot < pipe->ring_size; slot++) {
                struct pipe_buffer *buf = &pipe->bufs[slot];

                if (!buf->ops)
                        continue;
                pipe_buf_release(pipe, buf);
        }

#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue)
                put_watch_queue(pipe->watch_queue);
#endif

        if (pipe->tmp_page)
                __free_page(pipe->tmp_page);
        kfree(pipe->bufs);
        kfree(pipe);
}

/*
 * Mount used for anonymous pipes: get_pipe_inode() allocates inodes on its
 * superblock and create_pipe_files() passes it to alloc_file_pseudo().
 * It is assigned outside the code shown here.
 */
static struct vfsmount *pipe_mnt __ro_after_init;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        return dynamic_dname(buffer, buflen, "pipe:[%lu]",
                                d_inode(dentry)->i_ino);
}

/* Dentry operations for pipefs: only the name formatter is provided. */
static const struct dentry_operations pipefs_dentry_operations = {
        .d_dname        = pipefs_dname,
};

/*
 * Create a pipefs inode with a freshly allocated pipe attached to it.
 *
 * The pipe starts with two file references, one reader and one writer.
 * Returns the inode, or NULL if either the inode or the pipe could not be
 * allocated.
 */
static struct inode *get_pipe_inode(void)
{
        struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
        struct pipe_inode_info *pipe;

        if (!inode)
                return NULL;

        inode->i_ino = get_next_ino();

        pipe = alloc_pipe_info();
        if (!pipe) {
                iput(inode);
                return NULL;
        }

        inode->i_pipe = pipe;
        pipe->files = 2;
        pipe->writers = 1;
        pipe->readers = 1;
        inode->i_fop = &pipefifo_fops;

        /*
         * Start out marked dirty so that mark_inode_dirty() considers the
         * inode already on the dirty list and never moves it there.
         */
        inode->i_state = I_DIRTY;
        inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        simple_inode_init_ts(inode);

        return inode;
}

int create_pipe_files(struct file **res, int flags)
{
        struct inode *inode = get_pipe_inode();
        struct file *f;
        int error;

        if (!inode)
                return -ENFILE;

        if (flags & O_NOTIFICATION_PIPE) {
                error = watch_queue_init(inode->i_pipe);
                if (error) {
                        free_pipe_info(inode->i_pipe);
                        iput(inode);
                        return error;
                }
        }

        f = alloc_file_pseudo(inode, pipe_mnt, "",
                                O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
                                &pipefifo_fops);
        if (IS_ERR(f)) {
                free_pipe_info(inode->i_pipe);
                iput(inode);
                return PTR_ERR(f);
        }

        f->private_data = inode->i_pipe;

        res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
                                  &pipefifo_fops);
        if (IS_ERR(res[0])) {
                put_pipe_info(inode, inode->i_pipe);
                fput(f);
                return PTR_ERR(res[0]);
        }
        res[0]->private_data = inode->i_pipe;
        res[1] = f;
        stream_open(inode, res[0]);
        stream_open(inode, res[1]);
        return 0;
}

/*
 * Common helper for pipe creation: validate @flags, create both files and
 * reserve two file descriptors for them.
 *
 * On success @fd[0]/@files[0] describe the read end, @fd[1]/@files[1] the
 * write end, and 0 is returned.  The descriptors are only reserved here;
 * installing the files is left to the caller.  On failure everything
 * acquired here is released and a negative errno is returned (-EINVAL for
 * unsupported flag bits).
 */
static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
        const int allowed = O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE;
        int error;
        int fdw, fdr;

        if (flags & ~allowed)
                return -EINVAL;

        error = create_pipe_files(files, flags);
        if (error)
                return error;

        fdr = get_unused_fd_flags(flags);
        if (fdr < 0) {
                error = fdr;
                goto err_read_pipe;
        }

        fdw = get_unused_fd_flags(flags);
        if (fdw < 0) {
                error = fdw;
                goto err_fdr;
        }

        audit_fd_pair(fdr, fdw);
        fd[0] = fdr;
        fd[1] = fdw;
        /* pipe groks IOCB_NOWAIT */
        files[0]->f_mode |= FMODE_NOWAIT;
        files[1]->f_mode |= FMODE_NOWAIT;
        return 0;

 err_fdr:
        put_unused_fd(fdr);
 err_read_pipe:
        fput(files[0]);
        fput(files[1]);
        return error;
}

/*
 * Create a pipe and install both ends in the current file table.
 * @fd[0] receives the read descriptor and @fd[1] the write descriptor.
 * Returns 0 on success or the error from __do_pipe_flags().
 */
int do_pipe_flags(int *fd, int flags)
{
        struct file *files[2];
        int error;

        error = __do_pipe_flags(fd, files, flags);
        if (error)
                return error;

        fd_install(fd[0], files[0]);
        fd_install(fd[1], files[1]);
        return 0;
}

/*
 * Backend shared by the pipe() and pipe2() system calls, using the normal
 * C calling convention (which is not how Unix traditionally returned the
 * two descriptors).
 *
 * The descriptor pair is copied to @fildes before the files are installed,
 * so a faulting user pointer can still be undone: both files and both
 * reserved descriptors are released and -EFAULT is returned.
 */
static int do_pipe2(int __user *fildes, int flags)
{
        struct file *files[2];
        int fd[2];
        int error;

        error = __do_pipe_flags(fd, files, flags);
        if (error)
                return error;

        if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
                fput(files[0]);
                fput(files[1]);
                put_unused_fd(fd[0]);
                put_unused_fd(fd[1]);
                return -EFAULT;
        }

        fd_install(fd[0], files[0]);
        fd_install(fd[1], files[1]);
        return 0;
}

/* pipe2 system call: create a pipe using the caller-supplied @flags. */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
        return do_pipe2(fildes, flags);
}

/* pipe system call: same as pipe2 with no flags set. */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
        return do_pipe2(fildes, 0);
}

/*
 * This is the stupid "wait for pipe to be readable or writable"
 * model.
 *
 * See pipe_read/write() for the proper kind of exclusive wait,
 * but that requires that we wake up any other readers/writers
 * if we then do not end up reading everything (ie the whole
 * "wake_next_reader/writer" logic in pipe_read/write()).
 *
 * pipe_wait_readable() drops the pipe lock, sleeps on rd_wait until
 * pipe_readable() holds, and retakes the lock before returning.  The
 * result of the wait is discarded, so the caller has to re-check the pipe
 * state (and any pending signal) itself.
 */
void pipe_wait_readable(struct pipe_inode_info *pipe)
{
        pipe_unlock(pipe);
        wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
        pipe_lock(pipe);
}

/*
 * Drop the pipe lock, sleep on wr_wait until pipe_writable() holds, and
 * retake the lock before returning.  The result of the wait is discarded,
 * so the caller has to re-check the pipe state (and any pending signal)
 * itself.
 */
void pipe_wait_writable(struct pipe_inode_info *pipe)
{
        pipe_unlock(pipe);
        wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
        pipe_lock(pipe);
}

/*
 * This depends on both the wait (here) and the wakeup (wake_up_partner)
 * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
 * race with the count check and waitqueue prep.
 *
 * Normally in order to avoid races, you'd do the prepare_to_wait() first,
 * then check the condition you're waiting for, and only then sleep. But
 * because of the pipe lock, we can check the condition before being on
 * the wait queue.
 *
 * We use the 'rd_wait' waitqueue for pipe partner waiting.
 *
 * @pipe: pipe whose lock the caller holds; it is dropped while sleeping
 *        and held again on return.
 * @cnt:  counter to watch (the callers pass &pipe->w_counter or
 *        &pipe->r_counter).
 *
 * Returns 0 once *cnt differs from its value on entry, or -ERESTARTSYS if
 * a signal became pending while *cnt was still unchanged.
 */
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
        DEFINE_WAIT(rdwait);
        /* Snapshot taken under the pipe lock; any change means a partner arrived. */
        int cur = *cnt;

        while (cur == *cnt) {
                prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
                pipe_unlock(pipe);
                schedule();
                finish_wait(&pipe->rd_wait, &rdwait);
                pipe_lock(pipe);
                if (signal_pending(current))
                        break;
        }
        /* A counter change wins over a pending signal. */
        return cur == *cnt ? -ERESTARTSYS : 0;
}

/*
 * Wake every task sleeping interruptibly on rd_wait, the wait queue that
 * wait_for_partner() sleeps on.
 */
static void wake_up_partner(struct pipe_inode_info *pipe)
{
        wake_up_interruptible_all(&pipe->rd_wait);
}

/*
 * fifo_open - ->open handler shared by pipes and FIFOs.
 *
 * Attaches @filp to the inode's pipe_inode_info, allocating one if the
 * inode has none yet, and then updates the reader/writer counts according
 * to the open mode.  For inodes that are not on pipefs, a read-only or
 * write-only open may block until the other side shows up.
 *
 * Returns 0 on success, or:
 *   -ENOMEM       no pipe could be allocated;
 *   -ENXIO        non-blocking write-only open with no readers (non-pipefs);
 *   -EINVAL       file opened for neither reading nor writing;
 *   -ERESTARTSYS  waiting for the other side ended with a signal pending.
 * On every error after the pipe was attached, the reference taken here is
 * dropped again with put_pipe_info().
 */
static int fifo_open(struct inode *inode, struct file *filp)
{
        struct pipe_inode_info *pipe;
        /* True for inodes living on pipefs; those never block in open. */
        bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
        int ret;

        filp->f_version = 0;

        spin_lock(&inode->i_lock);
        if (inode->i_pipe) {
                pipe = inode->i_pipe;
                pipe->files++;
                spin_unlock(&inode->i_lock);
        } else {
                /* Allocate outside the spinlock, then re-check for a racing opener. */
                spin_unlock(&inode->i_lock);
                pipe = alloc_pipe_info();
                if (!pipe)
                        return -ENOMEM;
                pipe->files = 1;
                spin_lock(&inode->i_lock);
                if (unlikely(inode->i_pipe)) {
                        /* Somebody else installed a pipe meanwhile: use theirs. */
                        inode->i_pipe->files++;
                        spin_unlock(&inode->i_lock);
                        free_pipe_info(pipe);
                        pipe = inode->i_pipe;
                } else {
                        inode->i_pipe = pipe;
                        spin_unlock(&inode->i_lock);
                }
        }
        filp->private_data = pipe;
        /* OK, we have a pipe and it's pinned down */

        mutex_lock(&pipe->mutex);

        /* We can only do regular read/write on fifos */
        stream_open(inode, filp);

        switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
        case FMODE_READ:
        /*
         *  O_RDONLY
         *  POSIX.1 says that O_NONBLOCK means return with the FIFO
         *  opened, even when there is no process writing the FIFO.
         */
                pipe->r_counter++;
                /* First reader: release any writer waiting in wait_for_partner(). */
                if (pipe->readers++ == 0)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->writers) {
                        if ((filp->f_flags & O_NONBLOCK)) {
                                /* suppress EPOLLHUP until we have
                                 * seen a writer */
                                filp->f_version = pipe->w_counter;
                        } else {
                                if (wait_for_partner(pipe, &pipe->w_counter))
                                        goto err_rd;
                        }
                }
                break;

        case FMODE_WRITE:
        /*
         *  O_WRONLY
         *  POSIX.1 says that O_NONBLOCK means return -1 with
         *  errno=ENXIO when there is no process reading the FIFO.
         */
                ret = -ENXIO;
                if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
                        goto err;

                pipe->w_counter++;
                /* First writer: release any reader waiting in wait_for_partner(). */
                if (!pipe->writers++)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->readers) {
                        if (wait_for_partner(pipe, &pipe->r_counter))
                                goto err_wr;
                }
                break;

        case FMODE_READ | FMODE_WRITE:
        /*
         *  O_RDWR
         *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
         *  This implementation will NEVER block on a O_RDWR open, since
         *  the process can at least talk to itself.
         */

                pipe->readers++;
                pipe->writers++;
                pipe->r_counter++;
                pipe->w_counter++;
                if (pipe->readers == 1 || pipe->writers == 1)
                        wake_up_partner(pipe);
                break;

        default:
                ret = -EINVAL;
                goto err;
        }

        /* Ok! */
        mutex_unlock(&pipe->mutex);
        return 0;

err_rd:
        /* Undo the reader count; tell writers if that was the last reader. */
        if (!--pipe->readers)
                wake_up_interruptible(&pipe->wr_wait);
        ret = -ERESTARTSYS;
        goto err;

err_wr:
        /* Undo the writer count; tell readers if that was the last writer. */
        if (!--pipe->writers)
                wake_up_interruptible_all(&pipe->rd_wait);
        ret = -ERESTARTSYS;
        goto err;

err:
        mutex_unlock(&pipe->mutex);

        put_pipe_info(inode, pipe);
        return ret;
}

/*
 * File operations shared by pipes and FIFOs.  get_pipe_info() compares a
 * file's f_op against this table to decide whether the file is a pipe.
 */
const struct file_operations pipefifo_fops = {
        .open                = fifo_open,
        .llseek                = no_llseek,
        .read_iter        = pipe_read,
        .write_iter        = pipe_write,
        .poll                = pipe_poll,
        .unlocked_ioctl        = pipe_ioctl,
        .release        = pipe_release,
        .fasync                = pipe_fasync,
        .splice_write        = iter_file_splice_write,
};

/*
 * Round a requested pipe size (in bytes) to what the ring can hold, since
 * the pipe array must contain a power-of-2 number of pages.
 *
 * Sizes below PAGE_SIZE are raised to PAGE_SIZE, the minimum pipe size
 * required by POSIX; anything else is rounded up to the next power of
 * two.  Returns 0 on error, i.e. when @size is larger than 2^31.
 */
unsigned int round_pipe_size(unsigned int size)
{
        unsigned int rounded;

        if (size > (1U << 31))
                rounded = 0;
        else if (size < PAGE_SIZE)
                rounded = PAGE_SIZE;
        else
                rounded = roundup_pow_of_two(size);

        return rounded;
}

/*
 * Resize the pipe ring to a number of slots.
 *
 * Note the pipe can be reduced in capacity, but only if the current
 * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
 * returned instead.
 *
 * @pipe:     pipe whose ring is replaced.
 * @nr_slots: number of slots in the new ring.
 *
 * The new array is allocated up front; the swap itself is done with
 * pipe->rd_wait.lock held and interrupts disabled.  Occupied slots are
 * copied to the start of the new array, so afterwards tail is 0 and head
 * equals the occupancy.
 *
 * Returns 0 on success, -ENOMEM if the new array cannot be allocated, or
 * -EBUSY if the queued buffers do not fit in @nr_slots.
 *
 * NOTE(review): pipe->head/tail are read without taking pipe->mutex here;
 * presumably the caller holds it -- confirm against callers.
 */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
        struct pipe_buffer *bufs;
        unsigned int head, tail, mask, n;

        bufs = kcalloc(nr_slots, sizeof(*bufs),
                       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
        if (unlikely(!bufs))
                return -ENOMEM;

        spin_lock_irq(&pipe->rd_wait.lock);
        mask = pipe->ring_size - 1;
        head = pipe->head;
        tail = pipe->tail;

        n = pipe_occupancy(head, tail);
        if (nr_slots < n) {
                spin_unlock_irq(&pipe->rd_wait.lock);
                kfree(bufs);
                return -EBUSY;
        }

        /*
         * The pipe array wraps around, so just start the new one at zero
         * and adjust the indices.
         */
        if (n > 0) {
                unsigned int h = head & mask;
                unsigned int t = tail & mask;
                if (h > t) {
                        /* Occupied slots are contiguous: [t, h). */
                        memcpy(bufs, pipe->bufs + t,
                               n * sizeof(struct pipe_buffer));
                } else {
                        /*
                         * Occupied slots wrap: [t, ring_size) goes to the
                         * front of the new array, followed by [0, h).
                         */
                        unsigned int tsize = pipe->ring_size - t;
                        if (h > 0)
                                memcpy(bufs + tsize, pipe->bufs,
                                       h * sizeof(struct pipe_buffer));
                        memcpy(bufs, pipe->bufs + t,
                               tsize * sizeof(struct pipe_buffer));
                }
        }

        head = n;
        tail = 0;

        kfree(pipe->bufs);
        pipe->bufs = bufs;
        pipe->ring_size = nr_slots;
        /* Never let max_usage exceed the new ring size. */
        if (pipe->max_usage > nr_slots)
                pipe->max_usage = nr_slots;
        pipe->tail = tail;
        pipe->head = head;

        /*
         * Without a watch queue the usable capacity and the accounted
         * slot count both follow the new ring size.
         */
        if (!pipe_has_watch_queue(pipe)) {
                pipe->max_usage = nr_slots;
                pipe->nr_accounted = nr_slots;
        }

        spin_unlock_irq(&pipe->rd_wait.lock);

        /* This might have made more room for writers */
        wake_up_interruptible(&pipe->wr_wait);
        return 0;
}

/*
 * Change the capacity of @pipe to hold at least @arg bytes by allocating a
 * new array of pipe buffers and copying the contents over.
 *
 * Returns the resulting pipe size in bytes, or a negative errno:
 *   -EBUSY   the pipe has a watch queue (or the resize itself reports it);
 *   -EINVAL  @arg rounds to zero slots;
 *   -EPERM   a growth request exceeds pipe_max_size without
 *            CAP_SYS_RESOURCE, or exceeds the per-user soft/hard limit for
 *            an unprivileged user;
 *   other    whatever pipe_resize_ring() returns.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
{
        unsigned long user_bufs;
        unsigned int nr_slots, size;
        long ret;

        if (pipe_has_watch_queue(pipe))
                return -EBUSY;

        size = round_pipe_size(arg);
        nr_slots = size >> PAGE_SHIFT;
        if (!nr_slots)
                return -EINVAL;

        /*
         * Limits only apply when the capacity is being increased;
         * shrinking is always allowed, even for a user who is currently
         * over a limit.  The pipe_max_size check comes first, the per-user
         * limits follow once the new size has been accounted.
         */
        if (nr_slots > pipe->max_usage && size > pipe_max_size &&
            !capable(CAP_SYS_RESOURCE))
                return -EPERM;

        user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);

        if (nr_slots > pipe->max_usage &&
            (too_many_pipe_buffers_hard(user_bufs) ||
             too_many_pipe_buffers_soft(user_bufs)) &&
            pipe_is_unprivileged_user())
                ret = -EPERM;
        else
                ret = pipe_resize_ring(pipe, nr_slots);

        if (ret < 0) {
                /* Put the accounting back to what it was before. */
                (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
                return ret;
        }

        return pipe->max_usage * PAGE_SIZE;
}

/*
 * Return the pipe behind @file, or NULL if @file is not a pipe.
 *
 * i_pipe and i_cdev share the same location, so a non-NULL ->i_pipe does
 * not prove the inode is a pipe; the file operations are compared
 * instead.  When @for_splice is set, pipes with a watch queue are
 * rejected as well.
 */
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
        struct pipe_inode_info *pipe;

        if (file->f_op != &pipefifo_fops)
                return NULL;

        pipe = file->private_data;
        if (!pipe)
                return NULL;

        if (for_splice && pipe_has_watch_queue(pipe))
                return NULL;

        return pipe;
}

/*
 * fcntl() backend for the pipe size commands.
 *
 * F_SETPIPE_SZ resizes the pipe to @arg bytes via pipe_set_size();
 * F_GETPIPE_SZ reports the current capacity in bytes. Both run under
 * pipe->mutex. Returns -EBADF if @file is not a pipe and -EINVAL for any
 * other @cmd.
 */
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
        struct pipe_inode_info *pipe = get_pipe_info(file, false);
        long ret = -EINVAL;

        if (!pipe)
                return -EBADF;

        mutex_lock(&pipe->mutex);
        if (cmd == F_SETPIPE_SZ)
                ret = pipe_set_size(pipe, arg);
        else if (cmd == F_GETPIPE_SZ)
                ret = pipe->max_usage * PAGE_SIZE;
        mutex_unlock(&pipe->mutex);

        return ret;
}

/* Superblock operations installed by pipefs_init_fs_context(). */
static const struct super_operations pipefs_ops = {
        .statfs = simple_statfs,
        .destroy_inode = free_inode_nonrcu,
};

/*
 * pipefs must _never_ be mounted by userland: that would be a security
 * hassle with no real gain. Consequently no operations are needed on the
 * root directory. A non-trivial d_name is still wanted, though - "pipe:"
 * works nicely and removes the special-casing in procfs.
 *
 * Set up @fc as a pseudo filesystem context for pipefs.
 * Returns 0 on success, -ENOMEM if init_pseudo() fails.
 */

static int pipefs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *pseudo;

        pseudo = init_pseudo(fc, PIPEFS_MAGIC);
        if (!pseudo)
                return -ENOMEM;

        pseudo->dops = &pipefs_dentry_operations;
        pseudo->ops = &pipefs_ops;
        return 0;
}

/* Filesystem type registered and kernel-mounted by init_pipe_fs(). */
static struct file_system_type pipe_fs_type = {
        .name = "pipefs",
        .kill_sb = kill_anon_super,
        .init_fs_context = pipefs_init_fs_context,
};

#ifdef CONFIG_SYSCTL
/*
 * Conversion callback used by proc_dopipe_max_size().
 *
 * On write, the value in *@lvalp is rounded with round_pipe_size() and
 * stored in *@valp; a rounded result of 0 is rejected with -EINVAL.
 * On read, *@valp is widened into *@lvalp. @data is unused.
 */
static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
                                        unsigned int *valp,
                                        int write, void *data)
{
        unsigned int rounded;

        if (!write) {
                *lvalp = (unsigned long) *valp;
                return 0;
        }

        rounded = round_pipe_size(*lvalp);
        if (!rounded)
                return -EINVAL;

        *valp = rounded;
        return 0;
}

/*
 * sysctl handler for "pipe-max-size": delegates to do_proc_douintvec()
 * with do_proc_dopipe_max_size_conv() as the conversion callback and no
 * callback data, and returns its result.
 */
static int proc_dopipe_max_size(struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        return do_proc_douintvec(table, write, buffer, lenp, ppos,
                                 do_proc_dopipe_max_size_conv, NULL);
}

/*
 * Pipe tunables, registered under "fs" by init_pipe_fs(). pipe-max-size
 * goes through proc_dopipe_max_size() so written values get rounded; the
 * two per-user page limits are plain unsigned longs.
 */
static struct ctl_table fs_pipe_sysctls[] = {
        {
                .procname = "pipe-max-size",
                .mode = 0644,
                .data = &pipe_max_size,
                .maxlen = sizeof(pipe_max_size),
                .proc_handler = proc_dopipe_max_size,
        },
        {
                .procname = "pipe-user-pages-hard",
                .mode = 0644,
                .data = &pipe_user_pages_hard,
                .maxlen = sizeof(pipe_user_pages_hard),
                .proc_handler = proc_doulongvec_minmax,
        },
        {
                .procname = "pipe-user-pages-soft",
                .mode = 0644,
                .data = &pipe_user_pages_soft,
                .maxlen = sizeof(pipe_user_pages_soft),
                .proc_handler = proc_doulongvec_minmax,
        },
};
#endif

/*
 * Register pipefs and create its internal kernel mount (pipe_mnt). If the
 * mount fails the filesystem is unregistered again and the mount error is
 * returned. The pipe sysctls are registered regardless of the outcome.
 */
static int __init init_pipe_fs(void)
{
        int err = register_filesystem(&pipe_fs_type);

        if (err)
                goto sysctls;

        pipe_mnt = kern_mount(&pipe_fs_type);
        if (IS_ERR(pipe_mnt)) {
                err = PTR_ERR(pipe_mnt);
                unregister_filesystem(&pipe_fs_type);
        }

sysctls:
#ifdef CONFIG_SYSCTL
        register_sysctl_init("fs", fs_pipe_sysctls);
#endif
        return err;
}

fs_initcall(init_pipe_fs);

































   30 
















   30 






   28 

    2 



















   26 
   30 





















    1 


   15 
   15 




























    1 
    2 


















    3 

    3 




    1 









    2 




    1 




























































   29 


















   29 







   26 

    2 



   27 




   26 
    1 









   29 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/realpath.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/magic.h>
#include <linux/proc_fs.h>

/**
 * tomoyo_encode2 - Encode binary string to ascii string.
 *
 * @str:     String in binary format.
 * @str_len: Size of @str in byte.
 *
 * A backslash becomes "\\", bytes in the range 0x20 < c < 0x7F are copied
 * unchanged and every other byte becomes a "\ooo" octal escape.
 *
 * Returns pointer to @str in ascii format on success, NULL otherwise.
 *
 * The result is allocated with kzalloc(), so the caller must kfree() it
 * if this function didn't return NULL.
 */
char *tomoyo_encode2(const char *str, int str_len)
{
        int idx;
        int need = 1; /* terminating NUL */
        char *out;
        char *wp;

        if (!str)
                return NULL;

        /* First pass: work out how long the encoded form is. */
        for (idx = 0; idx < str_len; idx++) {
                const unsigned char ch = str[idx];

                if (ch == '\\')
                        need += 2;
                else if (ch <= ' ' || ch >= 127)
                        need += 4;
                else
                        need += 1;
        }

        /* Reserve space for appending "/". */
        out = kzalloc(need + 10, GFP_NOFS);
        if (!out)
                return NULL;

        /* Second pass: emit; the zeroed allocation supplies the NUL. */
        wp = out;
        for (idx = 0; idx < str_len; idx++) {
                const unsigned char ch = str[idx];

                if (ch == '\\') {
                        *wp++ = '\\';
                        *wp++ = '\\';
                } else if (ch <= ' ' || ch >= 127) {
                        *wp++ = '\\';
                        *wp++ = (ch >> 6) + '0';
                        *wp++ = ((ch >> 3) & 7) + '0';
                        *wp++ = (ch & 7) + '0';
                } else {
                        *wp++ = ch;
                }
        }
        return out;
}

/**
 * tomoyo_encode - Encode binary string to ascii string.
 *
 * @str: NUL-terminated string in binary format. May be NULL.
 *
 * Returns pointer to @str in ascii format on success, NULL otherwise
 * (including when @str is NULL).
 *
 * The result comes from tomoyo_encode2(), so the caller must kfree() it
 * if this function didn't return NULL.
 */
char *tomoyo_encode(const char *str)
{
        if (!str)
                return NULL;

        return tomoyo_encode2(str, strlen(str));
}

/**
 * tomoyo_get_absolute_path - Get the path of a dentry but ignores chroot'ed root.
 *
 * @path:   Pointer to "struct path".
 * @buffer: Pointer to buffer to return value in.
 * @buflen: Sizeof @buffer.
 *
 * Returns a pointer to the path inside @buffer on success, an ERR_PTR()
 * value otherwise; buffers shorter than 256 bytes yield -ENOMEM.
 *
 * If dentry is a directory, trailing '/' is appended.
 */
static char *tomoyo_get_absolute_path(const struct path *path, char * const buffer,
                                      const int buflen)
{
        struct inode *inode;
        char *pos;

        if (buflen < 256)
                return ERR_PTR(-ENOMEM);

        /* go to whatever namespace root we are under */
        pos = d_absolute_path(path, buffer, buflen - 1);

        /* Errors and the bare "/" are returned as they are. */
        if (IS_ERR(pos) || *pos != '/' || !pos[1])
                return pos;

        inode = d_backing_inode(path->dentry);
        if (inode && S_ISDIR(inode->i_mode)) {
                /* The last byte was kept free above for this '/'. */
                buffer[buflen - 2] = '/';
                buffer[buflen - 1] = '\0';
        }
        return pos;
}

/**
 * tomoyo_get_dentry_path - Get the path of a dentry.
 *
 * @dentry: Pointer to "struct dentry".
 * @buffer: Pointer to buffer to return value in.
 * @buflen: Sizeof @buffer.
 *
 * Returns a pointer to the path inside @buffer on success, an ERR_PTR()
 * value otherwise; buffers shorter than 256 bytes yield -ENOMEM.
 *
 * If dentry is a directory, trailing '/' is appended.
 */
static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer,
                                    const int buflen)
{
        struct inode *inode;
        char *pos;

        if (buflen < 256)
                return ERR_PTR(-ENOMEM);

        pos = dentry_path_raw(dentry, buffer, buflen - 1);

        /* Errors and the bare "/" are returned as they are. */
        if (IS_ERR(pos) || *pos != '/' || !pos[1])
                return pos;

        inode = d_backing_inode(dentry);
        if (inode && S_ISDIR(inode->i_mode)) {
                /* The last byte was kept free above for this '/'. */
                buffer[buflen - 2] = '/';
                buffer[buflen - 1] = '\0';
        }
        return pos;
}

/**
 * tomoyo_get_local_path - Get the path of a dentry.
 *
 * @dentry: Pointer to "struct dentry".
 * @buffer: Pointer to buffer to return value in.
 * @buflen: Sizeof @buffer.
 *
 * Obtains the path via tomoyo_get_dentry_path() and then writes a prefix
 * directly in front of it inside @buffer: "dev(major,minor):" when the
 * superblock has a non-zero device major and its root inode provides a
 * rename() operation, otherwise the filesystem type name followed by ':'.
 * On procfs, a leading "/$PID/" component that equals the current task's
 * tgid in that procfs instance's pid namespace is rewritten to "/self/".
 *
 * Returns a pointer into @buffer where the resulting string starts on
 * success, an ERR_PTR() value otherwise (-ENOMEM when a prefix would start
 * before @buffer).
 */
static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer,
                                   const int buflen)
{
        struct super_block *sb = dentry->d_sb;
        char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen);

        if (IS_ERR(pos))
                return pos;
        /* Convert from $PID to self if $PID is current thread. */
        if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') {
                char *ep;
                /* Parse the decimal number after the leading '/'; ep ends up just past it. */
                const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10);
                struct pid_namespace *proc_pidns = proc_pid_ns(sb);

                if (*ep == '/' && pid && pid ==
                    task_tgid_nr_ns(current, proc_pidns)) {
                        /*
                         * Place the 5 bytes of "/self" so that they end
                         * right before the '/' at ep.
                         */
                        pos = ep - 5;
                        if (pos < buffer)
                                goto out;
                        memmove(pos, "/self", 5);
                }
                /* procfs paths always get the filesystem name prefix. */
                goto prepend_filesystem_name;
        }
        /* Use filesystem name for unnamed devices. */
        if (!MAJOR(sb->s_dev))
                goto prepend_filesystem_name;
        {
                struct inode *inode = d_backing_inode(sb->s_root);

                /*
                 * Use filesystem name if filesystem does not support rename()
                 * operation.
                 */
                if (!inode->i_op->rename)
                        goto prepend_filesystem_name;
        }
        /* Prepend device name. */
        {
                char name[64];
                int name_len;
                const dev_t dev = sb->s_dev;

                /* Terminate explicitly; snprintf() is limited to the bytes before it. */
                name[sizeof(name) - 1] = '\0';
                snprintf(name, sizeof(name) - 1, "dev(%u,%u):", MAJOR(dev),
                         MINOR(dev));
                name_len = strlen(name);
                /* Step back far enough to fit the prefix in front of the path. */
                pos -= name_len;
                if (pos < buffer)
                        goto out;
                memmove(pos, name, name_len);
                return pos;
        }
        /* Prepend filesystem name. */
prepend_filesystem_name:
        {
                const char *name = sb->s_type->name;
                const int name_len = strlen(name);

                /* One extra byte for the ':' separator. */
                pos -= name_len + 1;
                if (pos < buffer)
                        goto out;
                memmove(pos, name, name_len);
                pos[name_len] = ':';
        }
        return pos;
out:
        /* The prefix did not fit in front of the path. */
        return ERR_PTR(-ENOMEM);
}

/**
 * tomoyo_realpath_from_path - Returns realpath(3) of the given pathname but ignores chroot'ed root.
 *
 * @path: Pointer to "struct path".
 *
 * Returns the realpath of the given @path on success, NULL otherwise.
 *
 * If dentry is a directory, trailing '/' is appended.
 * Characters out of 0x20 < c < 0x7F range are converted to
 * \ooo style octal string.
 * Character \ is converted to \\ string.
 *
 * The name is built in a temporary buffer that is reallocated at twice the
 * size whenever name generation reports an error; the loop ends once a name
 * has been produced or the allocation fails. A warning is emitted through
 * tomoyo_warn_oom() when NULL is returned.
 *
 * These functions use kzalloc(), so the caller must call kfree()
 * if these functions didn't return NULL.
 */
char *tomoyo_realpath_from_path(const struct path *path)
{
        char *buf = NULL;
        char *name = NULL;
        /* Doubled before every allocation, so the first buffer is PAGE_SIZE bytes. */
        unsigned int buf_len = PAGE_SIZE / 2;
        struct dentry *dentry = path->dentry;
        struct super_block *sb = dentry->d_sb;

        while (1) {
                char *pos;
                struct inode *inode;

                buf_len <<= 1;
                /* Drop the buffer of the previous attempt (NULL on the first pass). */
                kfree(buf);
                buf = kmalloc(buf_len, GFP_NOFS);
                if (!buf)
                        break;
                /* To make sure that pos is '\0' terminated. */
                buf[buf_len - 1] = '\0';
                /* For "pipe:[\$]" and "socket:[\$]". */
                if (dentry->d_op && dentry->d_op->d_dname) {
                        pos = dentry->d_op->d_dname(dentry, buf, buf_len - 1);
                        goto encode;
                }
                inode = d_backing_inode(sb->s_root);
                /*
                 * Get local name for filesystems without rename() operation
                 */
                if ((!inode->i_op->rename &&
                     !(sb->s_type->fs_flags & FS_REQUIRES_DEV)))
                        pos = tomoyo_get_local_path(path->dentry, buf,
                                                    buf_len - 1);
                /* Get absolute name for the rest. */
                else {
                        pos = tomoyo_get_absolute_path(path, buf, buf_len - 1);
                        /*
                         * Fall back to local name if absolute name is not
                         * available.
                         */
                        if (pos == ERR_PTR(-EINVAL))
                                pos = tomoyo_get_local_path(path->dentry, buf,
                                                            buf_len - 1);
                }
encode:
                /* Any error is answered by retrying with a larger buffer. */
                if (IS_ERR(pos))
                        continue;
                name = tomoyo_encode(pos);
                break;
        }
        kfree(buf);
        if (!name)
                tomoyo_warn_oom(__func__);
        return name;
}

/**
 * tomoyo_realpath_nofollow - Get realpath of a pathname.
 *
 * @pathname: The pathname to solve. May be NULL.
 *
 * Looks @pathname up with kern_path() using lookup flags of 0 and converts
 * the result with tomoyo_realpath_from_path().
 *
 * Returns the realpath of @pathname on success, NULL otherwise.
 */
char *tomoyo_realpath_nofollow(const char *pathname)
{
        struct path path;
        char *result;

        if (!pathname || kern_path(pathname, 0, &path) != 0)
                return NULL;

        result = tomoyo_realpath_from_path(&path);
        path_put(&path);
        return result;
}










    6 










    2 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __VDSO_MATH64_H
#define __VDSO_MATH64_H

/*
 * Divide @dividend by @divisor by repeated subtraction.
 *
 * Returns the number of subtractions performed (the quotient, held in a
 * u32) and stores what is left of @dividend in *@remainder. The loop runs
 * once per unit of the quotient.
 */
static __always_inline u32
__iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
{
        u32 ret = 0;

        while (dividend >= divisor) {
                /* The following asm() prevents the compiler from
                   optimising this loop into a modulo operation.  */
                asm("" : "+rm"(dividend));

                dividend -= divisor;
                ret++;
        }

        *remainder = dividend;

        return ret;
}

#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)

#ifndef mul_u64_u32_add_u64_shr
/*
 * Compute ((a * mul) + b) >> shift using a 128-bit intermediate and
 * return the low 64 bits of the result.
 */
static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift)
{
        unsigned __int128 wide = (unsigned __int128)a * mul;

        wide += b;
        return (u64)(wide >> shift);
}
#endif /* mul_u64_u32_add_u64_shr */

#else

#ifndef mul_u64_u32_add_u64_shr
#ifndef mul_u32_u32
/* Full 64-bit product of two 32-bit values. */
static inline u64 mul_u32_u32(u32 a, u32 b)
{
        u64 wide = a;

        return wide * b;
}
#define mul_u32_u32 mul_u32_u32
#endif
/*
 * Compute ((a * mul) + b) >> shift without a 128-bit type.
 *
 * The low 32 bits of @a are multiplied and added to @b first; a carry out
 * of that 64-bit addition is re-inserted after the shift. The contribution
 * of the high 32 bits of @a is then added at its shifted position.
 *
 * NOTE(review): the (64 - shift) and (32 - shift) terms look like they
 * expect shift <= 32 - confirm against callers.
 */
static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift)
{
        const u32 hi = a >> 32;
        const u32 lo = a;
        u64 acc;
        bool carry = __builtin_add_overflow(mul_u32_u32(lo, mul), b, &acc);

        acc >>= shift;

        /* Bit 64 of the sum lands at bit (64 - shift) after shifting. */
        if (carry && shift)
                acc += 1ULL << (64 - shift);

        if (hi)
                acc += mul_u32_u32(hi, mul) << (32 - shift);

        return acc;
}
#endif /* mul_u64_u32_add_u64_shr */

#endif

#endif /* __VDSO_MATH64_H */












































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#ifndef _LINUX_SCHED_ISOLATION_H
#define _LINUX_SCHED_ISOLATION_H

#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/init.h>
#include <linux/tick.h>

/*
 * Housekeeping categories, passed as the "type" argument of the
 * housekeeping_*() helpers declared in this header.
 */
enum hk_type {
        HK_TYPE_TIMER,
        HK_TYPE_RCU,
        HK_TYPE_MISC,
        HK_TYPE_SCHED,
        HK_TYPE_TICK,
        HK_TYPE_DOMAIN,
        HK_TYPE_WQ,
        HK_TYPE_MANAGED_IRQ,
        HK_TYPE_KTHREAD,
        /* Not a category itself: equals the number of entries above. */
        HK_TYPE_MAX
};

#ifdef CONFIG_CPU_ISOLATION
/*
 * With CONFIG_CPU_ISOLATION the housekeeping helpers are defined outside
 * this header; only their declarations appear here. The static key
 * housekeeping_overridden is what housekeeping_cpu() tests before
 * consulting housekeeping_test_cpu().
 */
DECLARE_STATIC_KEY_FALSE(housekeeping_overridden);
extern int housekeeping_any_cpu(enum hk_type type);
extern const struct cpumask *housekeeping_cpumask(enum hk_type type);
extern bool housekeeping_enabled(enum hk_type type);
extern void housekeeping_affine(struct task_struct *t, enum hk_type type);
extern bool housekeeping_test_cpu(int cpu, enum hk_type type);
extern void __init housekeeping_init(void);

#else

/*
 * Stubs for kernels built without CONFIG_CPU_ISOLATION: no CPU is treated
 * specially, whatever @type is.
 */

/* Stub: returns the CPU the caller is running on. */
static inline int housekeeping_any_cpu(enum hk_type type)
{
        return smp_processor_id();
}

/* Stub: returns cpu_possible_mask. */
static inline const struct cpumask *housekeeping_cpumask(enum hk_type type)
{
        return cpu_possible_mask;
}

/* Stub: always reports false. */
static inline bool housekeeping_enabled(enum hk_type type)
{
        return false;
}

/* Stub: does nothing. */
static inline void housekeeping_affine(struct task_struct *t,
                                       enum hk_type type) { }

/* Stub: always reports true. */
static inline bool housekeeping_test_cpu(int cpu, enum hk_type type)
{
        return true;
}

/* Stub: does nothing. */
static inline void housekeeping_init(void) { }
#endif /* CONFIG_CPU_ISOLATION */

/*
 * Report whether @cpu is a housekeeping CPU for @type.
 *
 * Always true unless CONFIG_CPU_ISOLATION is enabled and the
 * housekeeping_overridden static key is set, in which case the answer
 * comes from housekeeping_test_cpu().
 */
static inline bool housekeeping_cpu(int cpu, enum hk_type type)
{
#ifdef CONFIG_CPU_ISOLATION
        if (!static_branch_unlikely(&housekeeping_overridden))
                return true;
        return housekeeping_test_cpu(cpu, type);
#else
        return true;
#endif
}

/*
 * Report whether @cpu is isolated: true if it fails the housekeeping test
 * for HK_TYPE_DOMAIN or for HK_TYPE_TICK, or if
 * cpuset_cpu_is_isolated() says so. The checks run in that order and stop
 * at the first one that reports isolation.
 */
static inline bool cpu_is_isolated(int cpu)
{
        if (!housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN))
                return true;
        if (!housekeeping_test_cpu(cpu, HK_TYPE_TICK))
                return true;
        return cpuset_cpu_is_isolated(cpu);
}

#endif /* _LINUX_SCHED_ISOLATION_H */
































































    3 














    2 
    2 

    3 




    2 


    2 













   12 














   12 
   11 

   11 




    6 


    7 











   37 



   38 






   35 




   37 

   37 



    3 
   12 



    3 






































   11 





   12 





   11 

   12 
   12 

   12 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright (c) 2021, Google LLC.
 * Pasha Tatashin <pasha.tatashin@soleen.com>
 */
#include <linux/kstrtox.h>
#include <linux/mm.h>
#include <linux/page_table_check.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#undef pr_fmt
#define pr_fmt(fmt)        "page_table_check: " fmt

/*
 * Per-page counters kept in the page extension area (see
 * page_table_check_ops). page_table_check_set() and
 * page_table_check_clear() adjust anon_map_count when PageAnon() is true
 * for the page and file_map_count otherwise, and BUG if the other counter
 * is non-zero at that time.
 */
struct page_table_check {
        atomic_t anon_map_count;
        atomic_t file_map_count;
};

/*
 * Boot-time switch. Defaults to the value of
 * CONFIG_PAGE_TABLE_CHECK_ENFORCED and can be changed with the
 * "page_table_check" early parameter. Lives in __initdata.
 */
static bool __page_table_check_enabled __initdata =
                                IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED);

/*
 * Static key that starts out true; init_page_table_check() disables it
 * when checking was requested at boot.
 */
DEFINE_STATIC_KEY_TRUE(page_table_check_disabled);
EXPORT_SYMBOL(page_table_check_disabled);

/*
 * Handler for the "page_table_check" early parameter: parses @buf as a
 * boolean into __page_table_check_enabled and returns kstrtobool()'s
 * result.
 */
static int __init early_page_table_check_param(char *buf)
{
        return kstrtobool(buf, &__page_table_check_enabled);
}

early_param("page_table_check", early_page_table_check_param);

/*
 * .need callback of page_table_check_ops: reports the boot-time value of
 * __page_table_check_enabled.
 */
static bool __init need_page_table_check(void)
{
        return __page_table_check_enabled;
}

/*
 * .init callback of page_table_check_ops: if checking was requested at
 * boot, switch off the page_table_check_disabled static key.
 */
static void __init init_page_table_check(void)
{
        if (__page_table_check_enabled)
                static_branch_disable(&page_table_check_disabled);
}

/*
 * Page extension descriptor: one struct page_table_check of extension
 * data, with the need/init callbacks defined in this file.
 */
struct page_ext_operations page_table_check_ops = {
        .need_shared_flags = false,
        .size = sizeof(struct page_table_check),
        .init = init_page_table_check,
        .need = need_page_table_check,
};

/*
 * Return the page_table_check data stored in @page_ext.
 * BUGs if @page_ext is NULL.
 */
static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
{
        struct page_table_check *ptc;

        BUG_ON(!page_ext);
        ptc = page_ext_data(page_ext, &page_table_check_ops);
        return ptc;
}

/*
 * Entries covering @pgcnt pages starting at @pfn are being removed from a
 * page table. For each page, decrement the counter that matches the page
 * type and BUG if the other counter is non-zero or the decremented one
 * becomes negative. Nothing is done for an invalid pfn or when no page
 * extension is available.
 */
static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
{
        struct page_ext *page_ext;
        struct page *page;
        unsigned long left;
        bool anon;

        if (!pfn_valid(pfn))
                return;

        page = pfn_to_page(pfn);
        page_ext = page_ext_get(page);
        if (!page_ext)
                return;

        BUG_ON(PageSlab(page));
        /* The type of the first page decides which counter is used for all. */
        anon = PageAnon(page);

        for (left = pgcnt; left; left--) {
                struct page_table_check *ptc = get_page_table_check(page_ext);

                if (!anon) {
                        BUG_ON(atomic_read(&ptc->anon_map_count));
                        BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
                } else {
                        BUG_ON(atomic_read(&ptc->file_map_count));
                        BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
                }
                page_ext = page_ext_next(page_ext);
        }
        page_ext_put(page_ext);
}

/*
 * Entries covering @pgcnt pages starting at @pfn are being added to a page
 * table. For each page, increment the counter that matches the page type
 * and BUG if the other counter is non-zero. For anonymous pages also BUG
 * if the page ends up accounted more than once while @rw is set; for
 * other pages BUG if the counter turns negative. Nothing is done for an
 * invalid pfn or when no page extension is available.
 */
static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
                                 bool rw)
{
        struct page_ext *page_ext;
        struct page *page;
        unsigned long left;
        bool anon;

        if (!pfn_valid(pfn))
                return;

        page = pfn_to_page(pfn);
        page_ext = page_ext_get(page);
        if (!page_ext)
                return;

        BUG_ON(PageSlab(page));
        /* The type of the first page decides which counter is used for all. */
        anon = PageAnon(page);

        for (left = pgcnt; left; left--) {
                struct page_table_check *ptc = get_page_table_check(page_ext);

                if (!anon) {
                        BUG_ON(atomic_read(&ptc->anon_map_count));
                        BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0);
                } else {
                        BUG_ON(atomic_read(&ptc->file_map_count));
                        BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw);
                }
                page_ext = page_ext_next(page_ext);
        }
        page_ext_put(page_ext);
}

/*
 * The 2^@order pages starting at @page are on the free list or are being
 * allocated: verify that both counters are zero for every one of them and
 * crash if they are not. Nothing is checked when no page extension is
 * available.
 */
void __page_table_check_zero(struct page *page, unsigned int order)
{
        const unsigned long nr_pages = 1ul << order;
        struct page_ext *page_ext;
        unsigned long idx;

        BUG_ON(PageSlab(page));

        page_ext = page_ext_get(page);
        if (!page_ext)
                return;

        for (idx = 0; idx < nr_pages; idx++) {
                struct page_table_check *ptc = get_page_table_check(page_ext);

                BUG_ON(atomic_read(&ptc->anon_map_count));
                BUG_ON(atomic_read(&ptc->file_map_count));
                page_ext = page_ext_next(page_ext);
        }
        page_ext_put(page_ext);
}

/*
 * A PTE is being cleared in @mm: drop the accounting for the page it maps
 * if @pte is a user-accessible page. init_mm is not tracked.
 */
void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
        if (mm == &init_mm || !pte_user_accessible_page(pte))
                return;

        page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__page_table_check_pte_clear);

/*
 * A PMD is being cleared in @mm: drop the accounting for the PMD_SIZE
 * worth of pages it maps if @pmd is a user-accessible page. init_mm is
 * not tracked.
 */
void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
        if (mm == &init_mm || !pmd_user_accessible_page(pmd))
                return;

        page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__page_table_check_pmd_clear);

/*
 * A PUD is being cleared in @mm: drop the accounting for the PUD_SIZE
 * worth of pages it maps if @pud is a user-accessible page. init_mm is
 * not tracked.
 */
void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
        if (mm == &init_mm || !pud_user_accessible_page(pud))
                return;

        page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__page_table_check_pud_clear);

/*
 * Whether the swap entry cached writable information: true for writable
 * device-exclusive, writable device-private and writable migration
 * entries.
 */
static inline bool swap_cached_writable(swp_entry_t entry)
{
        if (is_writable_device_exclusive_entry(entry))
                return true;
        if (is_writable_device_private_entry(entry))
                return true;
        return is_writable_migration_entry(entry);
}

/*
 * Sanity-check the uffd-wp state of @pte. A present PTE marked uffd-wp
 * must not be writable, and a swap PTE marked uffd-wp must not carry a
 * swap entry that caches writable information. A violation triggers a
 * one-time warning.
 */
static inline void page_table_check_pte_flags(pte_t pte)
{
        if (pte_present(pte) && pte_uffd_wp(pte)) {
                WARN_ON_ONCE(pte_write(pte));
                return;
        }

        if (is_swap_pte(pte) && pte_swp_uffd_wp(pte))
                WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte)));
}

/*
 * @nr consecutive PTEs starting at @ptep in @mm are being set, beginning
 * with the value @pte. The flags of @pte are sanity-checked, the
 * accounting for whatever the @nr slots currently hold is dropped, and if
 * @pte is a user-accessible page the @nr pages starting at its pfn are
 * accounted as mapped. init_mm is not tracked.
 */
void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
                unsigned int nr)
{
        unsigned int done = 0;

        if (mm == &init_mm)
                return;

        page_table_check_pte_flags(pte);

        while (done < nr) {
                __page_table_check_pte_clear(mm, ptep_get(ptep + done));
                done++;
        }

        if (!pte_user_accessible_page(pte))
                return;

        page_table_check_set(pte_pfn(pte), nr, pte_write(pte));
}
EXPORT_SYMBOL(__page_table_check_ptes_set);

/*
 * Sanity-check the uffd-wp state of @pmd. A present PMD marked uffd-wp
 * must not be writable, and a swap PMD marked uffd-wp must not carry a
 * swap entry that caches writable information. A violation triggers a
 * one-time warning.
 */
static inline void page_table_check_pmd_flags(pmd_t pmd)
{
        if (pmd_present(pmd) && pmd_uffd_wp(pmd)) {
                WARN_ON_ONCE(pmd_write(pmd));
                return;
        }

        if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd))
                WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd)));
}

/*
 * The PMD at @pmdp in @mm is being set to @pmd. The flags of @pmd are
 * sanity-checked, the accounting for the current *@pmdp is dropped, and if
 * @pmd is a user-accessible page the PMD_SIZE worth of pages it maps are
 * accounted as mapped. init_mm is not tracked.
 */
void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd)
{
        if (mm == &init_mm)
                return;

        page_table_check_pmd_flags(pmd);
        __page_table_check_pmd_clear(mm, *pmdp);

        if (!pmd_user_accessible_page(pmd))
                return;

        page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
                             pmd_write(pmd));
}
EXPORT_SYMBOL(__page_table_check_pmd_set);

/*
 * The PUD at @pudp in @mm is being set to @pud. The accounting for the
 * current *@pudp is dropped, and if @pud is a user-accessible page the
 * PUD_SIZE worth of pages it maps are accounted as mapped. init_mm is not
 * tracked.
 */
void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud)
{
        if (mm == &init_mm)
                return;

        __page_table_check_pud_clear(mm, *pudp);

        if (!pud_user_accessible_page(pud))
                return;

        page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT,
                             pud_write(pud));
}
EXPORT_SYMBOL(__page_table_check_pud_set);

/*
 * Drop the accounting for PTRS_PER_PTE consecutive PTEs, starting at the
 * entry that pte_offset_map() returns for @pmd and @addr. Nothing is done
 * for init_mm, or when @pmd is bad or a leaf. Warns and returns if the
 * PTE page cannot be mapped.
 */
void __page_table_check_pte_clear_range(struct mm_struct *mm,
                                        unsigned long addr,
                                        pmd_t pmd)
{
        pte_t *base;
        unsigned long idx;

        if (mm == &init_mm)
                return;

        if (pmd_bad(pmd) || pmd_leaf(pmd))
                return;

        base = pte_offset_map(&pmd, addr);
        if (WARN_ON(!base))
                return;

        for (idx = 0; idx < PTRS_PER_PTE; idx++)
                __page_table_check_pte_clear(mm, ptep_get(base + idx));

        pte_unmap(base);
}

























   37 


    6 













   34 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
// SPDX-License-Identifier: GPL-2.0
#include <linux/fault-inject.h>
#include <linux/mm.h>

/*
 * State of the page allocation fault injector. By default, allocations
 * with __GFP_HIGHMEM or __GFP_DIRECT_RECLAIM set are never failed and only
 * orders of at least 1 are candidates (see __should_fail_alloc_page()).
 */
static struct {
        struct fault_attr attr;

        bool ignore_gfp_highmem;
        bool ignore_gfp_reclaim;
        u32 min_order;
} fail_page_alloc = {
        .attr = FAULT_ATTR_INITIALIZER,
        .ignore_gfp_highmem = true,
        .ignore_gfp_reclaim = true,
        .min_order = 1,
};

/*
 * Handler for the "fail_page_alloc=" boot option: passes the option string
 * to setup_fault_attr() for fail_page_alloc.attr and returns its result.
 */
static int __init setup_fail_page_alloc(char *str)
{
        return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);

/*
 * Decide whether a page allocation of the given @order and @gfp_mask
 * should be failed artificially.
 *
 * Never fails allocations below fail_page_alloc.min_order, allocations
 * with __GFP_NOFAIL, or - when the matching ignore flag is set -
 * allocations with __GFP_HIGHMEM or __GFP_DIRECT_RECLAIM. Everything else
 * is left to should_fail_ex(), with a size of 1 << @order and
 * FAULT_NOWARN passed when __GFP_NOWARN is set.
 */
bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
        /* See comment in __should_failslab() */
        int flags = (gfp_mask & __GFP_NOWARN) ? FAULT_NOWARN : 0;

        if (order < fail_page_alloc.min_order ||
            (gfp_mask & __GFP_NOFAIL))
                return false;

        if ((gfp_mask & __GFP_HIGHMEM) && fail_page_alloc.ignore_gfp_highmem)
                return false;

        if ((gfp_mask & __GFP_DIRECT_RECLAIM) &&
            fail_page_alloc.ignore_gfp_reclaim)
                return false;

        return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/*
 * Create the "fail_page_alloc" debugfs directory through
 * fault_create_debugfs_attr() and add files for the two ignore flags and
 * the minimum order, each as a regular file with mode 0600.
 * Always returns 0.
 */
static int __init fail_page_alloc_debugfs(void)
{
        const umode_t mode = S_IFREG | 0600;
        struct dentry *dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
                                                       &fail_page_alloc.attr);

        debugfs_create_bool("ignore-gfp-wait", mode, dir,
                            &fail_page_alloc.ignore_gfp_reclaim);
        debugfs_create_bool("ignore-gfp-highmem", mode, dir,
                            &fail_page_alloc.ignore_gfp_highmem);
        debugfs_create_u32("min-order", mode, dir,
                           &fail_page_alloc.min_order);

        return 0;
}

late_initcall(fail_page_alloc_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */





































































































































    2 




    1 












    1 

    1 












    2 

































































































































































    1 


















    1 









































    1 
    1 


    1 






























    4 



















    4 
    2 

























    2 
















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/revoke.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
 *
 * Copyright 2000 Red Hat corp --- All Rights Reserved
 *
 * Journal revoke routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 *
 * Revoke is the mechanism used to prevent old log records for deleted
 * metadata from being replayed on top of newer data using the same
 * blocks.  The revoke mechanism is used in two separate places:
 *
 * + Commit: during commit we write the entire list of the current
 *   transaction's revoked blocks to the journal
 *
 * + Recovery: during recovery we record the transaction ID of all
 *   revoked blocks.  If there are multiple revoke records in the log
 *   for a single block, only the last one counts, and if there is a log
 *   entry for a block beyond the last revoke, then that log entry still
 *   gets replayed.
 *
 * We can get interactions between revokes and new log data within a
 * single transaction:
 *
 * Block is revoked and then journaled:
 *   The desired end result is the journaling of the new block, so we
 *   cancel the revoke before the transaction commits.
 *
 * Block is journaled and then revoked:
 *   The revoke must take precedence over the write of the block, so we
 *   need either to cancel the journal entry or to write the revoke
 *   later in the log than the log block.  In this case, we choose the
 *   latter: journaling a block cancels any revoke record for that block
 *   in the current transaction, so any revoke for that block in the
 *   transaction must have happened after the block was journaled and so
 *   the revoke must take precedence.
 *
 * Block is revoked and then written as data:
 *   The data write is allowed to succeed, but the revoke is _not_
 *   cancelled.  We still need to prevent old log records from
 *   overwriting the new data.  We don't even need to clear the revoke
 *   bit here.
 *
 * We cache the revoke status of a buffer in the current transaction in b_state
 * bits.  As the name says, the RevokeValid flag indicates that the cached
 * revoke status of a buffer is valid and we can rely on the cached status.
 *
 * Revoke information on buffers is a tri-state value:
 *
 * RevokeValid clear:        no cached revoke status, need to look it up
 * RevokeValid set, Revoked clear:
 *                        buffer has not been revoked, and cancel_revoke
 *                        need do nothing.
 * RevokeValid set, Revoked set:
 *                        buffer has been revoked.
 *
 * Locking rules:
 * We keep two hash tables of revoke records. One hashtable belongs to the
 * running transaction (is pointed to by journal->j_revoke), the other one
 * belongs to the committing transaction. Accesses to the second hash table
 * happen only from the kjournald and no other thread touches this table.  Also
 * journal_switch_revoke_table() which switches which hashtable belongs to the
 * running and which to the committing transaction is called only from
 * kjournald. Therefore we need no locks when accessing the hashtable belonging
 * to the committing transaction.
 *
 * All users operating on the hash table belonging to the running transaction
 * have a handle to the transaction. Therefore they are safe from kjournald
 * switching hash tables under them. For operations on the lists of entries in
 * the hash table j_revoke_lock is used.
 *
 * Finally, also replay code uses the hash tables but at this moment no one else
 * can touch them (filesystem isn't mounted yet) and hence no locking is
 * needed.
 */

#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/log2.h>
#include <linux/hash.h>
#endif

/* Slab caches backing struct jbd2_revoke_record_s and
   struct jbd2_revoke_table_s allocations respectively.  Each is NULL
   until its init function below has run, and again after its destroy
   function. */
static struct kmem_cache *jbd2_revoke_record_cache;
static struct kmem_cache *jbd2_revoke_table_cache;

/* Each revoke record represents one single revoked block.  During
   journal replay, this involves recording the transaction ID of the
   last transaction to revoke this block.

   Code in this file casts a struct list_head pointer taken from a hash
   chain directly to a record pointer, so "hash" must remain the first
   member. */

struct jbd2_revoke_record_s
{
        struct list_head  hash;        /* Link in a revoke table hash chain */
        tid_t                  sequence;        /* Used for recovery only */
        unsigned long long          blocknr;        /* Revoked block number */
};


/* The revoke table is just a simple hash table of revoke records:
   an array of hash_size list heads, indexed by the value hash()
   computes from a block number and hash_shift. */
struct jbd2_revoke_table_s
{
        /* It is conceivable that we might want a larger hash table
         * for recovery.  Must be a power of two. */
        int                  hash_size;
        /* Number of hash bits; set to log2 of hash_size when the table
         * is created. */
        int                  hash_shift;
        /* Array of hash_size chain heads. */
        struct list_head *hash_table;
};


#ifdef __KERNEL__
/* Forward declarations of the commit-time helpers defined further down:
   write_one_revoke_record() appends one record to a revoke descriptor
   block, flush_descriptor() submits a descriptor block for writing. */
static void write_one_revoke_record(transaction_t *,
                                    struct list_head *,
                                    struct buffer_head **, int *,
                                    struct jbd2_revoke_record_s *);
static void flush_descriptor(journal_t *, struct buffer_head *, int);
#endif

/* Utility functions to maintain the revoke table */

/* Map a block number to a chain index in the journal's current revoke
   table (journal->j_revoke), using hash_64() with that table's
   hash_shift. */
static inline int hash(journal_t *journal, unsigned long long block)
{
        struct jbd2_revoke_table_s *table = journal->j_revoke;

        return hash_64(block, table->hash_shift);
}

/* Allocate a revoke record for blocknr with transaction ID seq and add
   it to the head of the matching chain of the journal's current revoke
   table.  The chain is modified under j_revoke_lock.

   Returns 0 on success or -ENOMEM if the record cannot be allocated. */
static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
                              tid_t seq)
{
        gfp_t gfp_mask = journal_oom_retry ? (GFP_NOFS | __GFP_NOFAIL)
                                           : GFP_NOFS;
        struct jbd2_revoke_record_s *rec;
        struct list_head *chain;

        rec = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask);
        if (!rec)
                return -ENOMEM;

        rec->sequence = seq;
        rec->blocknr = blocknr;

        chain = &journal->j_revoke->hash_table[hash(journal, blocknr)];
        spin_lock(&journal->j_revoke_lock);
        list_add(&rec->hash, chain);
        spin_unlock(&journal->j_revoke_lock);

        return 0;
}

/* Look up the revoke record for blocknr in the journal's current revoke
   table.  The chain is walked under j_revoke_lock; the lock is dropped
   again before returning.  Returns the first matching record, or NULL
   if the chain holds none. */

static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
                                                      unsigned long long blocknr)
{
        struct jbd2_revoke_record_s *found = NULL;
        struct list_head *chain;
        struct list_head *pos;

        chain = &journal->j_revoke->hash_table[hash(journal, blocknr)];

        spin_lock(&journal->j_revoke_lock);
        for (pos = chain->next; pos != chain; pos = pos->next) {
                /* "hash" is the first member, so the link is the record */
                struct jbd2_revoke_record_s *rec =
                        (struct jbd2_revoke_record_s *) pos;

                if (rec->blocknr == blocknr) {
                        found = rec;
                        break;
                }
        }
        spin_unlock(&journal->j_revoke_lock);

        return found;
}

/* Destroy the revoke record slab cache and forget its pointer. */
void jbd2_journal_destroy_revoke_record_cache(void)
{
        struct kmem_cache *cache = jbd2_revoke_record_cache;

        kmem_cache_destroy(cache);
        jbd2_revoke_record_cache = NULL;
}

/* Destroy the revoke table slab cache and forget its pointer. */
void jbd2_journal_destroy_revoke_table_cache(void)
{
        struct kmem_cache *cache = jbd2_revoke_table_cache;

        kmem_cache_destroy(cache);
        jbd2_revoke_table_cache = NULL;
}

/* Create the slab cache used for revoke records.  The cache must not
   exist yet.  Returns 0 on success, -ENOMEM (after logging) on failure. */
int __init jbd2_journal_init_revoke_record_cache(void)
{
        J_ASSERT(!jbd2_revoke_record_cache);

        jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
                                        SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
        if (jbd2_revoke_record_cache)
                return 0;

        pr_emerg("JBD2: failed to create revoke_record cache\n");
        return -ENOMEM;
}

/* Create the slab cache used for revoke table headers.  The cache must
   not exist yet.  Returns 0 on success, -ENOMEM (after logging) on
   failure. */
int __init jbd2_journal_init_revoke_table_cache(void)
{
        J_ASSERT(!jbd2_revoke_table_cache);

        jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
                                             SLAB_TEMPORARY);
        if (jbd2_revoke_table_cache)
                return 0;

        pr_emerg("JBD2: failed to create revoke_table cache\n");
        return -ENOMEM;
}

/* Allocate one revoke table with hash_size empty chains.  hash_shift is
   set to the number of times hash_size can be halved before reaching
   zero (log2 for a power of two).  Returns the new table, or NULL if
   either allocation fails; nothing is leaked in that case. */
static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
{
        struct jbd2_revoke_table_s *tbl;
        int bits = 0;
        int rest;
        int idx;

        tbl = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
        if (!tbl)
                return NULL;

        for (rest = hash_size >> 1; rest != 0; rest >>= 1)
                bits++;

        tbl->hash_size = hash_size;
        tbl->hash_shift = bits;
        tbl->hash_table =
                kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL);
        if (!tbl->hash_table) {
                kmem_cache_free(jbd2_revoke_table_cache, tbl);
                return NULL;
        }

        for (idx = 0; idx < hash_size; idx++)
                INIT_LIST_HEAD(&tbl->hash_table[idx]);

        return tbl;
}

/* Free a revoke table and its chain array.  Every chain is asserted to
   be empty first: records are not freed here. */
static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
{
        int idx;

        for (idx = 0; idx < table->hash_size; idx++)
                J_ASSERT(list_empty(&table->hash_table[idx]));

        kfree(table->hash_table);
        kmem_cache_free(jbd2_revoke_table_cache, table);
}

/* Set up both revoke tables of a journal, each with hash_size chains
   (hash_size must be a power of two), make table 1 the current one and
   initialise j_revoke_lock.  Returns 0 on success or -ENOMEM, in which
   case no table is left allocated. */
int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
{
        J_ASSERT(journal->j_revoke_table[0] == NULL);
        J_ASSERT(is_power_of_2(hash_size));

        journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size);
        if (!journal->j_revoke_table[0])
                return -ENOMEM;

        journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size);
        if (!journal->j_revoke_table[1]) {
                /* Undo the first allocation before reporting failure */
                jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]);
                journal->j_revoke_table[0] = NULL;
                return -ENOMEM;
        }

        journal->j_revoke = journal->j_revoke_table[1];
        spin_lock_init(&journal->j_revoke_lock);

        return 0;
}

/* Destroy a journal's revoke tables.  The tables must already be empty!
 *
 * j_revoke and both j_revoke_table[] slots are reset to NULL once the
 * tables are freed, as the error path of jbd2_journal_init_revoke()
 * already does for slot 0.  The journal is therefore never left holding
 * pointers to freed tables, and calling this function again is a no-op
 * rather than a double free. */
void jbd2_journal_destroy_revoke(journal_t *journal)
{
        journal->j_revoke = NULL;
        if (journal->j_revoke_table[0]) {
                jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]);
                journal->j_revoke_table[0] = NULL;
        }
        if (journal->j_revoke_table[1]) {
                jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]);
                journal->j_revoke_table[1] = NULL;
        }
}


#ifdef __KERNEL__

/*
 * jbd2_journal_revoke: revoke a given buffer_head from the journal.  This
 * prevents the block from being replayed during recovery if we take a
 * crash after this current transaction commits.  Any subsequent
 * metadata writes of the buffer in this transaction cancel the
 * revoke.
 *
 * Note that this call may block --- it is up to the caller to make
 * sure that there are no further calls to journal_write_metadata
 * before the revoke is complete.  In ext3, this implies calling the
 * revoke before clearing the block bitmap when we are deleting
 * metadata.
 *
 * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a
 * parameter, but does _not_ forget the buffer_head if the bh was only
 * found implicitly.
 *
 * bh_in may not be a journalled buffer - it may have come off
 * the hash tables without an attached journal_head.
 *
 * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count
 * by one.
 *
 * Returns -EINVAL if the revoke feature cannot be set on the journal,
 * -EIO if the handle has no revoke credits left or the buffer is already
 * marked revoked, and otherwise the result of insert_revoke_hash()
 * (0 or -ENOMEM).
 */

int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
                   struct buffer_head *bh_in)
{
        struct buffer_head *bh = NULL;
        journal_t *journal;
        struct block_device *bdev;
        int err;

        might_sleep();
        if (bh_in)
                BUFFER_TRACE(bh_in, "enter");

        journal = handle->h_transaction->t_journal;
        if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){
                J_ASSERT (!"Cannot set revoke feature!");
                return -EINVAL;
        }

        bdev = journal->j_fs_dev;
        bh = bh_in;

        /* No buffer supplied by the caller: see whether one exists for
         * this block so that its revoke bits can be updated below. */
        if (!bh) {
                bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
                if (bh)
                        BUFFER_TRACE(bh, "found on hash");
        }
#ifdef JBD2_EXPENSIVE_CHECKING
        else {
                struct buffer_head *bh2;

                /* If there is a different buffer_head lying around in
                 * memory anywhere... */
                bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
                if (bh2) {
                        /* ... and it has RevokeValid status... */
                        if (bh2 != bh && buffer_revokevalid(bh2))
                                /* ...then it better be revoked too,
                                 * since it's illegal to create a revoke
                                 * record against a buffer_head which is
                                 * not marked revoked --- that would
                                 * risk missing a subsequent revoke
                                 * cancel. */
                                J_ASSERT_BH(bh2, buffer_revoked(bh2));
                        put_bh(bh2);
                }
        }
#endif

        /* Each revoke consumes one revoke credit of the handle; refuse
         * (with a one-time warning) if none are left.  A buffer that we
         * looked up ourselves is released before returning. */
        if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) {
                if (!bh_in)
                        brelse(bh);
                return -EIO;
        }
        /* We really ought not ever to revoke twice in a row without
           first having the revoke cancelled: it's illegal to free a
           block twice without allocating it in between! */
        if (bh) {
                if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
                                 "inconsistent data on disk")) {
                        if (!bh_in)
                                brelse(bh);
                        return -EIO;
                }
                /* Cache the "revoked" status on the buffer itself */
                set_buffer_revoked(bh);
                set_buffer_revokevalid(bh);
                if (bh_in) {
                        BUFFER_TRACE(bh_in, "call jbd2_journal_forget");
                        jbd2_journal_forget(handle, bh_in);
                } else {
                        BUFFER_TRACE(bh, "call brelse");
                        __brelse(bh);
                }
        }
        handle->h_revoke_credits--;

        /* Record the revoke in the running transaction's hash table */
        jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
        err = insert_revoke_hash(journal, blocknr,
                                handle->h_transaction->t_tid);
        BUFFER_TRACE(bh_in, "exit");
        return err;
}

/*
 * Cancel an outstanding revoke.  For use only internally by the
 * journaling code (called from jbd2_journal_get_write_access).
 *
 * If the buffer already carries a valid cached revoke status
 * (RevokeValid set), that status is trusted: the revoke table is only
 * searched when the Revoked bit was set.  Otherwise the Revoked bit
 * cannot be trusted --- the buffer may have been revoked, discarded and
 * reallocated within the same transaction, losing the bit while the
 * revoke record is still pending --- so the table is always searched.
 *
 * Returns 1 if a revoke record was found and removed, 0 otherwise.
 */
int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
        struct jbd2_revoke_record_s *record;
        journal_t *journal = handle->h_transaction->t_journal;
        struct buffer_head *bh = jh2bh(jh);
        int must_search;
        int cancelled = 0;        /* akpm: debug */

        jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh);

        /* Mark the cached status valid from now on, and work out from
         * its previous state whether a table lookup is required. */
        if (!test_set_buffer_revokevalid(bh)) {
                must_search = 1;
                clear_buffer_revoked(bh);
        } else {
                must_search = test_clear_buffer_revoked(bh);
        }

        if (must_search) {
                record = find_revoke_record(journal, bh->b_blocknr);
                if (record) {
                        jbd2_debug(4, "cancelled existing revoke on "
                                  "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
                        spin_lock(&journal->j_revoke_lock);
                        list_del(&record->hash);
                        spin_unlock(&journal->j_revoke_lock);
                        kmem_cache_free(jbd2_revoke_record_cache, record);
                        cancelled = 1;
                }
        }

#ifdef JBD2_EXPENSIVE_CHECKING
        /* There better not be one left behind by now! */
        record = find_revoke_record(journal, bh->b_blocknr);
        J_ASSERT_JH(jh, record == NULL);
#endif

        /* Nothing was looked up, so nothing can have been cancelled. */
        if (!must_search)
                return cancelled;

        /* We may have just cleared revoke on an unhashed buffer_head.
         * Make sure any hashed alias for the same block loses its
         * revoked status too, otherwise the revoke state machine will
         * get very upset later on. */
        {
                struct buffer_head *alias;

                alias = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
                if (alias) {
                        if (alias != bh)
                                clear_buffer_revoked(alias);
                        __brelse(alias);
                }
        }

        return cancelled;
}

/*
 * jbd2_clear_buffer_revoked_flags walks every record in the journal's
 * current revoke table and clears the Revoked flag on the matching
 * buffer_head, if one is found, to reflect that there are no revoked
 * buffers in the next transaction which is going to be started.
 */
void jbd2_clear_buffer_revoked_flags(journal_t *journal)
{
        struct jbd2_revoke_table_s *table = journal->j_revoke;
        int idx;

        for (idx = 0; idx < table->hash_size; idx++) {
                struct list_head *chain = &table->hash_table[idx];
                struct list_head *pos;

                for (pos = chain->next; pos != chain; pos = pos->next) {
                        struct jbd2_revoke_record_s *rec =
                                (struct jbd2_revoke_record_s *)pos;
                        struct buffer_head *bh;

                        bh = __find_get_block(journal->j_fs_dev,
                                              rec->blocknr,
                                              journal->j_blocksize);
                        if (!bh)
                                continue;

                        clear_buffer_revoked(bh);
                        __brelse(bh);
                }
        }
}

/* jbd2_journal_switch_revoke_table selects the other revoke table as
 * j_revoke for the next transaction and reinitialises all of its
 * chains.  We do not want to suspend any processing until all revokes
 * are written -bzzz
 */
void jbd2_journal_switch_revoke_table(journal_t *journal)
{
        struct jbd2_revoke_table_s *next;
        int idx;

        next = (journal->j_revoke == journal->j_revoke_table[0]) ?
                journal->j_revoke_table[1] : journal->j_revoke_table[0];
        journal->j_revoke = next;

        for (idx = 0; idx < next->hash_size; idx++)
                INIT_LIST_HEAD(&next->hash_table[idx]);
}

/*
 * Write revoke records to the journal for every entry in the committing
 * transaction's revoke table (the table that is not currently
 * j_revoke), unlinking and freeing each entry as it is written.  Any
 * partially filled descriptor block is flushed at the end.
 */
void jbd2_journal_write_revoke_records(transaction_t *transaction,
                                       struct list_head *log_bufs)
{
        journal_t *journal = transaction->t_journal;
        struct buffer_head *descriptor = NULL;
        struct jbd2_revoke_table_s *revoke;
        int offset = 0;
        int count = 0;
        int idx;

        /* select revoke table for committing transaction */
        if (journal->j_revoke == journal->j_revoke_table[0])
                revoke = journal->j_revoke_table[1];
        else
                revoke = journal->j_revoke_table[0];

        for (idx = 0; idx < revoke->hash_size; idx++) {
                struct list_head *chain = &revoke->hash_table[idx];

                while (!list_empty(chain)) {
                        struct jbd2_revoke_record_s *rec =
                                (struct jbd2_revoke_record_s *)chain->next;

                        write_one_revoke_record(transaction, log_bufs,
                                                &descriptor, &offset, rec);
                        count++;
                        list_del(&rec->hash);
                        kmem_cache_free(jbd2_revoke_record_cache, rec);
                }
        }

        if (descriptor)
                flush_descriptor(journal, descriptor, offset);
        jbd2_debug(1, "Wrote %d revoke records\n", count);
}

/*
 * Write out one revoke record.  We need to create a new descriptor
 * block if the old one is full or if we have not already created one.
 *
 * @transaction: committing transaction the record belongs to
 * @log_bufs:    list on which every new descriptor buffer is filed
 * @descriptorp: in/out, descriptor block currently being filled, or NULL
 * @offsetp:     in/out, byte offset of the next free slot in *descriptorp
 * @record:      revoke record to append
 *
 * Once a full descriptor has been handed to flush_descriptor() the
 * caller's pointer is cleared as well.  If obtaining the replacement
 * buffer then fails, the caller is not left holding a stale descriptor
 * that would be flushed again on the next record and at the end of
 * jbd2_journal_write_revoke_records().
 */

static void write_one_revoke_record(transaction_t *transaction,
                                    struct list_head *log_bufs,
                                    struct buffer_head **descriptorp,
                                    int *offsetp,
                                    struct jbd2_revoke_record_s *record)
{
        journal_t *journal = transaction->t_journal;
        int csum_size = 0;
        struct buffer_head *descriptor;
        int sz, offset;

        /* If we are already aborting, this all becomes a noop.  We
           still need to go round the loop in
           jbd2_journal_write_revoke_records in order to free all of the
           revoke records: only the IO to the journal is omitted. */
        if (is_journal_aborted(journal))
                return;

        descriptor = *descriptorp;
        offset = *offsetp;

        /* Do we need to leave space at the end for a checksum? */
        if (jbd2_journal_has_csum_v2or3(journal))
                csum_size = sizeof(struct jbd2_journal_block_tail);

        /* On-disk size of one record: 64-bit or 32-bit block number */
        if (jbd2_has_feature_64bit(journal))
                sz = 8;
        else
                sz = 4;

        /* Make sure we have a descriptor with space left for the record */
        if (descriptor) {
                if (offset + sz > journal->j_blocksize - csum_size) {
                        flush_descriptor(journal, descriptor, offset);
                        descriptor = NULL;
                        /* The flushed block must not be reused or
                         * flushed again, even if the allocation below
                         * fails. */
                        *descriptorp = NULL;
                }
        }

        if (!descriptor) {
                descriptor = jbd2_journal_get_descriptor_buffer(transaction,
                                                        JBD2_REVOKE_BLOCK);
                if (!descriptor)
                        return;

                /* Record it so that we can wait for IO completion later */
                BUFFER_TRACE(descriptor, "file in log_bufs");
                jbd2_file_log_bh(log_bufs, descriptor);

                /* Records start right after the revoke header */
                offset = sizeof(jbd2_journal_revoke_header_t);
                *descriptorp = descriptor;
        }

        /* Store the block number big-endian at the current offset */
        if (jbd2_has_feature_64bit(journal))
                * ((__be64 *)(&descriptor->b_data[offset])) =
                        cpu_to_be64(record->blocknr);
        else
                * ((__be32 *)(&descriptor->b_data[offset])) =
                        cpu_to_be32(record->blocknr);
        offset += sz;

        *offsetp = offset;
}

/*
 * Flush a revoke descriptor out to the journal.  If we are aborting,
 * this is a noop; otherwise the number of bytes used (offset) is stored
 * big-endian in the header's r_count, the block checksum is set, and
 * the buffer is marked jwrite and dirty and submitted with REQ_SYNC.
 * The buffer needs to be waited for during commit, so it has to be on
 * the appropriate journal buffer list.
 */

static void flush_descriptor(journal_t *journal,
                             struct buffer_head *descriptor,
                             int offset)
{
        jbd2_journal_revoke_header_t *hdr;

        if (is_journal_aborted(journal))
                return;

        /* The revoke header sits at the start of the block data */
        hdr = (jbd2_journal_revoke_header_t *)descriptor->b_data;
        hdr->r_count = cpu_to_be32(offset);
        jbd2_descriptor_block_csum_set(journal, descriptor);

        set_buffer_jwrite(descriptor);
        BUFFER_TRACE(descriptor, "write");
        set_buffer_dirty(descriptor);
        write_dirty_buffer(descriptor, REQ_SYNC);
}
#endif

/*
 * Revoke support for recovery.
 *
 * Recovery needs to be able to:
 *
 *  record all revoke records, including the tid of the latest instance
 *  of each revoke in the journal
 *
 *  check whether a given block in a given transaction should be replayed
 *  (ie. has not been revoked by a revoke record in that or a subsequent
 *  transaction)
 *
 *  empty the revoke table after recovery.
 */

/*
 * First, setting revoke records.  While the log is scanned for
 * recovery, a revoke record is created the first time a block is seen
 * revoked; further revokes of the same block only raise the sequence
 * number stored in the existing record.
 *
 * Returns 0 on success, or the error from insert_revoke_hash().
 */

int jbd2_journal_set_revoke(journal_t *journal,
                       unsigned long long blocknr,
                       tid_t sequence)
{
        struct jbd2_revoke_record_s *existing;

        existing = find_revoke_record(journal, blocknr);
        if (!existing)
                return insert_revoke_hash(journal, blocknr, sequence);

        /* If we have multiple occurrences, only record the
         * latest sequence number in the hashed record */
        if (tid_gt(sequence, existing->sequence))
                existing->sequence = sequence;

        return 0;
}

/*
 * Test revoke records.  For a given block referenced in the log, has
 * that block been revoked?  A revoke record with a given transaction
 * sequence number revokes the block in that transaction and all earlier
 * ones, but later transactions still need to be replayed.
 *
 * Returns 1 if the block is revoked for this sequence, 0 otherwise.
 */

int jbd2_journal_test_revoke(journal_t *journal,
                        unsigned long long blocknr,
                        tid_t sequence)
{
        struct jbd2_revoke_record_s *rec;

        rec = find_revoke_record(journal, blocknr);
        if (rec && !tid_gt(sequence, rec->sequence))
                return 1;

        return 0;
}

/*
 * Finally, once recovery is over, we need to clear the revoke table so
 * that it can be reused by the running filesystem: every record in the
 * journal's current table is unlinked and freed.
 */

void jbd2_journal_clear_revoke(journal_t *journal)
{
        struct jbd2_revoke_table_s *table = journal->j_revoke;
        int idx;

        for (idx = 0; idx < table->hash_size; idx++) {
                struct list_head *chain = &table->hash_table[idx];

                while (!list_empty(chain)) {
                        struct jbd2_revoke_record_s *rec =
                                (struct jbd2_revoke_record_s*) chain->next;

                        list_del(&rec->hash);
                        kmem_cache_free(jbd2_revoke_record_cache, rec);
                }
        }
}
































































    2 































































































































































































































































































































































































    1 
    1 

    1 





    1 





    1 






    1 








    1 



























    1 





    1 

























































































































    1 





















    1 



























    2 










    2 








    2 






    2 
    2 




    2 















    1 


    1 
    1 






    1 






















    2 

    1 

    2 









































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "dev-replace.h"
#include "zoned.h"
#include "file-item.h"
#include "raid-stripe-tree.h"

/* Backs the bios handed out by btrfs_bio_alloc(). */
static struct bio_set btrfs_bioset;
/* Backs the split-off bios created by btrfs_split_bio(). */
static struct bio_set btrfs_clone_bioset;
/* Backs the single-vector read bios allocated by repair_one_sector(). */
static struct bio_set btrfs_repair_bioset;
/* Pool the struct btrfs_failed_bio repair contexts are allocated from. */
static mempool_t btrfs_failed_bio_pool;

/*
 * Repair context for one failed read bbio.  Allocated by repair_one_sector()
 * from btrfs_failed_bio_pool and freed by btrfs_repair_done() when
 * repair_count drops to zero.
 */
struct btrfs_failed_bio {
        /* The read bbio whose bad sectors are being repaired. */
        struct btrfs_bio *bbio;
        /* Number of copies, as returned by btrfs_num_copies(). */
        int num_copies;
        /*
         * One count per repair read in flight, plus the initial count that
         * btrfs_check_read_bio() drops once it has walked the whole bio.
         */
        atomic_t repair_count;
};

/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(struct btrfs_bio *bbio)
{
        return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
}

static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
{
        return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}

/*
 * Set up the btrfs specific part of a btrfs_bio.
 *
 * Everything in front of the embedded bio is zeroed; the bio itself is left
 * alone because the block layer has already initialized it.  The I/O starts
 * out with a single pending reference.
 */
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
                    btrfs_bio_end_io_t end_io, void *private)
{
        const size_t head_len = offsetof(struct btrfs_bio, bio);

        memset(bbio, 0, head_len);

        atomic_set(&bbio->pending_ios, 1);
        bbio->private = private;
        bbio->end_io = end_io;
        bbio->fs_info = fs_info;
}

/*
 * Allocate and initialize a btrfs_bio, the main I/O container for btrfs that
 * is used for all I/O submitted through btrfs_submit_bio.
 *
 * The bio comes from btrfs_bioset with GFP_NOFS.  Just like the underlying
 * bio_alloc_bioset this will not fail as it is backed by a mempool.
 */
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
                                  struct btrfs_fs_info *fs_info,
                                  btrfs_bio_end_io_t end_io, void *private)
{
        struct bio *raw = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS,
                                           &btrfs_bioset);
        struct btrfs_bio *new_bbio = btrfs_bio(raw);

        btrfs_bio_init(new_bbio, fs_info, end_io, private);
        return new_bbio;
}

/*
 * Split @map_length bytes off @orig_bbio into a new btrfs_bio allocated from
 * btrfs_clone_bioset.
 *
 * The new bbio inherits the inode and the current file offset of @orig_bbio,
 * whose file offset is then moved forward by @map_length.  The new bbio points
 * back at @orig_bbio through ->private and holds one of its pending_ios
 * references; for I/O with an ordered extent it also takes an extra reference
 * on that ordered extent.
 */
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
                                         struct btrfs_bio *orig_bbio,
                                         u64 map_length, bool use_append)
{
        struct bio *parent = &orig_bbio->bio;
        struct btrfs_bio *split;
        struct bio *split_bio;

        if (!use_append) {
                split_bio = bio_split(parent, map_length >> SECTOR_SHIFT,
                                      GFP_NOFS, &btrfs_clone_bioset);
        } else {
                unsigned int nr_segs;

                split_bio = bio_split_rw(parent, &fs_info->limits, &nr_segs,
                                         &btrfs_clone_bioset, map_length);
        }

        split = btrfs_bio(split_bio);
        btrfs_bio_init(split, fs_info, NULL, orig_bbio);
        split->inode = orig_bbio->inode;
        split->file_offset = orig_bbio->file_offset;
        orig_bbio->file_offset += map_length;

        if (bbio_has_ordered_extent(split)) {
                refcount_inc(&orig_bbio->ordered->refs);
                split->ordered = orig_bbio->ordered;
        }

        atomic_inc(&orig_bbio->pending_ios);
        return split;
}

/* Free a bio that was never submitted to the underlying device. */
static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
{
        if (bbio_has_ordered_extent(bbio))
                btrfs_put_ordered_extent(bbio->ordered);
        bio_put(&bbio->bio);
}

/*
 * Invoke the end_io callback of @bbio and, for I/O with an ordered extent,
 * drop the ordered extent reference afterwards.
 */
static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
{
        const bool put_ordered = bbio_has_ordered_extent(bbio);
        struct btrfs_ordered_extent *ordered = NULL;

        /*
         * Sample the pointer before the callback runs and do not touch @bbio
         * afterwards (presumably end_io may release it).
         */
        if (put_ordered)
                ordered = bbio->ordered;

        bbio->end_io(bbio);

        if (put_ordered)
                btrfs_put_ordered_extent(ordered);
}

/* Record @status in the embedded bio and complete @bbio right away. */
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
        struct bio *bio = &bbio->bio;

        bio->bi_status = status;
        __btrfs_bio_end_io(bbio);
}

static void btrfs_orig_write_end_io(struct bio *bio);

/*
 * Carry the failure of the split-off @bbio over to @orig_bbio.
 *
 * Anything not completed through btrfs_orig_write_end_io simply has its status
 * copied.  Mirrored writes tolerate nr_mirrors - 1 failures, so copying the
 * status blindly would be wrong for them; instead the error counter of the
 * original I/O context is bumped by max_errors, which guarantees it ends up
 * above the error tolerance.
 */
static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
                                       struct btrfs_bio *orig_bbio)
{
        struct btrfs_io_context *orig_bioc;
        struct btrfs_io_stripe *orig_stripe;

        if (bbio->bio.bi_end_io != &btrfs_orig_write_end_io) {
                orig_bbio->bio.bi_status = bbio->bio.bi_status;
                return;
        }

        orig_stripe = orig_bbio->bio.bi_private;
        orig_bioc = orig_stripe->bioc;
        atomic_add(orig_bioc->max_errors, &orig_bioc->error);
}

/*
 * Drop one pending I/O reference and complete the bbio when it was the last.
 *
 * A bbio allocated from btrfs_clone_bioset is a split chunk: its error, if
 * any, is propagated to the bbio it was split from (found through ->private),
 * the chunk is released, and the reference is dropped on that original bbio
 * instead.
 */
static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
{
        struct btrfs_bio *target = bbio;

        if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
                target = bbio->private;

                if (bbio->bio.bi_status)
                        btrfs_bbio_propagate_error(bbio, target);
                btrfs_cleanup_bio(bbio);
        }

        if (!atomic_dec_and_test(&target->pending_ios))
                return;

        __btrfs_bio_end_io(target);
}

/* Mirror to try after @cur_mirror, wrapping around past fbio->num_copies. */
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
        const int copies = fbio->num_copies;

        return (cur_mirror == copies) ? cur_mirror + 1 - copies
                                      : cur_mirror + 1;
}

/* Mirror preceding @cur_mirror; mirror 1 wraps to fbio->num_copies. */
static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
        return (cur_mirror == 1) ? fbio->num_copies : cur_mirror - 1;
}

/*
 * Drop one repair reference.  Whoever drops the last one completes the failed
 * bbio and returns the context to btrfs_failed_bio_pool.
 */
static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
        if (!atomic_dec_and_test(&fbio->repair_count))
                return;

        btrfs_orig_bbio_end_io(fbio->bbio);
        mempool_free(fbio, &btrfs_failed_bio_pool);
}

/*
 * Handle the completion of a repair read issued by repair_one_sector().
 *
 * @repair_bbio: the single-vector repair read; ->private is its repair context
 * @dev:         passed through to btrfs_data_csum_ok()
 *
 * If the read failed or its checksum does not match, the bio is reset and
 * resubmitted to the next mirror.  Once the next mirror would be the mirror of
 * the originally failed bbio, every copy has been tried and the failed bbio is
 * marked with BLK_STS_IOERR.  If the read is good, the data is written back
 * to the preceding mirrors, walking backwards until the mirror of the failed
 * bbio has been rewritten.
 */
static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
                                 struct btrfs_device *dev)
{
        struct btrfs_failed_bio *fbio = repair_bbio->private;
        struct btrfs_inode *inode = repair_bbio->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
        int mirror = repair_bbio->mirror_num;

        /*
         * We can only trigger this for data bio, which doesn't support larger
         * folios yet.
         */
        ASSERT(folio_order(page_folio(bv->bv_page)) == 0);

        if (repair_bbio->bio.bi_status ||
            !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
                /* Reuse the same bio for the retry, restoring the saved iter. */
                bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
                repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

                mirror = next_repair_mirror(fbio, mirror);
                if (mirror == fbio->bbio->mirror_num) {
                        /* Wrapped around to the mirror that failed first. */
                        btrfs_debug(fs_info, "no mirror left");
                        fbio->bbio->bio.bi_status = BLK_STS_IOERR;
                        goto done;
                }

                /* Completion of the retry re-enters this function. */
                btrfs_submit_bio(repair_bbio, mirror);
                return;
        }

        /*
         * Good copy found: write it back to each mirror tried before this
         * one.  The return value of the repair write is not checked here.
         */
        do {
                mirror = prev_repair_mirror(fbio, mirror);
                btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
                                  repair_bbio->file_offset, fs_info->sectorsize,
                                  repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
                                  page_folio(bv->bv_page), bv->bv_offset, mirror);
        } while (mirror != fbio->bbio->mirror_num);

done:
        /* Drop this repair read's count on the context, then the bio itself. */
        btrfs_repair_done(fbio);
        bio_put(&repair_bbio->bio);
}

/*
 * Try to kick off a repair read to the next available mirror for a bad sector.
 *
 * This primarily tries to recover good data to serve the actual read request,
 * but also tries to write the good data back to the bad mirror(s) when a
 * read succeeded to restore the redundancy.
 *
 * @failed_bbio: the read bbio containing the bad sector
 * @bio_offset:  byte offset of the bad sector inside @failed_bbio
 * @bv:          bio_vec describing the bad sector
 * @fbio:        repair context from an earlier bad sector of the same bbio,
 *               or NULL if none has been allocated yet
 *
 * Returns the repair context to pass in for the next bad sector.  This is
 * @fbio unchanged (possibly still NULL) when there is only one copy, in which
 * case @failed_bbio is marked with BLK_STS_IOERR instead.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
                                                  u32 bio_offset,
                                                  struct bio_vec *bv,
                                                  struct btrfs_failed_bio *fbio)
{
        struct btrfs_inode *inode = failed_bbio->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        const u32 sectorsize = fs_info->sectorsize;
        const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
        struct btrfs_bio *repair_bbio;
        struct bio *repair_bio;
        int num_copies;
        int mirror;

        btrfs_debug(fs_info, "repair read error: read error at %llu",
                    failed_bbio->file_offset + bio_offset);

        num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
        if (num_copies == 1) {
                btrfs_debug(fs_info, "no copy to repair from");
                failed_bbio->bio.bi_status = BLK_STS_IOERR;
                return fbio;
        }

        if (!fbio) {
                fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
                fbio->bbio = failed_bbio;
                fbio->num_copies = num_copies;
                /* Initial count, dropped by btrfs_check_read_bio(). */
                atomic_set(&fbio->repair_count, 1);
        }

        /* One count per repair read, dropped in btrfs_end_repair_bio(). */
        atomic_inc(&fbio->repair_count);

        /* Single-vector read covering just the bad sector. */
        repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
                                      &btrfs_repair_bioset);
        repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
        __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);

        repair_bbio = btrfs_bio(repair_bio);
        btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
        repair_bbio->inode = failed_bbio->inode;
        repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;

        mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
        btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
        btrfs_submit_bio(repair_bbio, mirror);
        return fbio;
}

/*
 * Verify a completed data read sector by sector and start repair for every
 * sector that is bad.
 *
 * @bbio: the completed read; repair bios are handed to btrfs_end_repair_bio()
 * @dev:  passed through to btrfs_data_csum_ok()
 *
 * If the bio as a whole completed with an error, every sector is treated as
 * bad.  The bbio is completed here when no repair context was created,
 * otherwise by whoever drops the last reference in btrfs_repair_done().
 */
static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
{
        struct btrfs_inode *inode = bbio->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        u32 sectorsize = fs_info->sectorsize;
        /* Walks bbio->saved_iter in place rather than a copy of it. */
        struct bvec_iter *iter = &bbio->saved_iter;
        /* Remember the original status, it is cleared in the bio below. */
        blk_status_t status = bbio->bio.bi_status;
        struct btrfs_failed_bio *fbio = NULL;
        u32 offset = 0;

        /* Read-repair requires the inode field to be set by the submitter. */
        ASSERT(inode);

        /*
         * Hand off repair bios to the repair code as there is no upper level
         * submitter for them.
         */
        if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
                btrfs_end_repair_bio(bbio, dev);
                return;
        }

        /* Clear the I/O error. A failed repair will reset it. */
        bbio->bio.bi_status = BLK_STS_OK;

        while (iter->bi_size) {
                struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);

                /* Check and repair one sector at a time. */
                bv.bv_len = min(bv.bv_len, sectorsize);
                if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
                        fbio = repair_one_sector(bbio, offset, &bv, fbio);

                bio_advance_iter_single(&bbio->bio, iter, sectorsize);
                offset += sectorsize;
        }

        /* Free the checksum buffer unless it is the inline one. */
        if (bbio->csum != bbio->csum_inline)
                kfree(bbio->csum);

        /* Drop the initial repair count, or complete directly if no repair. */
        if (fbio)
                btrfs_repair_done(fbio);
        else
                btrfs_orig_bbio_end_io(bbio);
}

static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
        if (!dev || !dev->bdev)
                return;
        if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
                return;

        if (btrfs_op(bio) == BTRFS_MAP_WRITE)
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
        else if (!(bio->bi_opf & REQ_RAHEAD))
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
        if (bio->bi_opf & REQ_PREFLUSH)
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

/* Pick the end I/O workqueue: the metadata one for REQ_META bios. */
static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
                                                struct bio *bio)
{
        const bool is_meta = bio->bi_opf & REQ_META;

        return is_meta ? fs_info->endio_meta_workers : fs_info->endio_workers;
}

/*
 * Work item queued by btrfs_simple_end_io() for completed reads.  Data reads
 * are verified here; metadata reads are checked and repaired by the submitter
 * and are simply completed.
 */
static void btrfs_end_bio_work(struct work_struct *work)
{
        struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

        if (!is_data_bbio(bbio)) {
                btrfs_orig_bbio_end_io(bbio);
                return;
        }

        btrfs_check_read_bio(bbio, bbio->bio.bi_private);
}

/*
 * bi_end_io handler for the single mirror fast path, where bi_private holds
 * the target device.
 *
 * Reads are deferred to a workqueue for completion.  Everything else is
 * completed right here, after recording the physical location of a successful
 * zone append.
 */
static void btrfs_simple_end_io(struct bio *bio)
{
        struct btrfs_bio *bbio = btrfs_bio(bio);
        struct btrfs_fs_info *fs_info = bbio->fs_info;
        struct btrfs_device *dev = bio->bi_private;

        btrfs_bio_counter_dec(fs_info);

        if (bio->bi_status)
                btrfs_log_dev_io_error(bio, dev);

        if (bio_op(bio) != REQ_OP_READ) {
                if (!bio->bi_status && bio_op(bio) == REQ_OP_ZONE_APPEND)
                        btrfs_record_physical_zoned(bbio);
                btrfs_orig_bbio_end_io(bbio);
                return;
        }

        INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
        queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
}

/*
 * bi_end_io handler for parity RAID I/O, where bi_private holds the I/O
 * context.  The mirror number is copied from the context into the bbio, data
 * reads are verified (without a device), and everything else is completed
 * directly.  The I/O context reference is dropped last.
 */
static void btrfs_raid56_end_io(struct bio *bio)
{
        struct btrfs_bio *bbio = btrfs_bio(bio);
        struct btrfs_io_context *bioc = bio->bi_private;
        bool verify;

        btrfs_bio_counter_dec(bioc->fs_info);
        bbio->mirror_num = bioc->mirror_num;

        verify = bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio);
        if (!verify)
                btrfs_orig_bbio_end_io(bbio);
        else
                btrfs_check_read_bio(bbio, NULL);

        btrfs_put_bioc(bioc);
}

static void btrfs_orig_write_end_io(struct bio *bio)
{
        struct btrfs_io_stripe *stripe = bio->bi_private;
        struct btrfs_io_context *bioc = stripe->bioc;
        struct btrfs_bio *bbio = btrfs_bio(bio);

        btrfs_bio_counter_dec(bioc->fs_info);

        if (bio->bi_status) {
                atomic_inc(&bioc->error);
                btrfs_log_dev_io_error(bio, stripe->dev);
        }

        /*
         * Only send an error to the higher layers if it is beyond the tolerance
         * threshold.
         */
        if (atomic_read(&bioc->error) > bioc->max_errors)
                bio->bi_status = BLK_STS_IOERR;
        else
                bio->bi_status = BLK_STS_OK;

        if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
                stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

        btrfs_orig_bbio_end_io(bbio);
        btrfs_put_bioc(bioc);
}

static void btrfs_clone_write_end_io(struct bio *bio)
{
        struct btrfs_io_stripe *stripe = bio->bi_private;

        if (bio->bi_status) {
                atomic_inc(&stripe->bioc->error);
                btrfs_log_dev_io_error(bio, stripe->dev);
        } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
        }

        /* Pass on control to the original bio this one was cloned from */
        bio_endio(stripe->bioc->orig_bio);
        bio_put(bio);
}

/*
 * Send @bio to the block device of @dev.
 *
 * The bio is failed immediately if there is no device or block device, the
 * device is marked missing, or this is a write to a device that is not
 * writeable.  Bios flagged REQ_BTRFS_CGROUP_PUNT go through
 * blkcg_punt_bio_submit(), everything else through submit_bio().
 */
static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
        bool reject;

        reject = !dev || !dev->bdev ||
                 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state);
        if (!reject && btrfs_op(bio) == BTRFS_MAP_WRITE)
                reject = !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state);

        if (reject) {
                bio_io_error(bio);
                return;
        }

        bio_set_dev(bio, dev->bdev);

        /*
         * For zone append writing, bi_sector must point the beginning of the
         * zone
         */
        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                u64 phys = bio->bi_iter.bi_sector << SECTOR_SHIFT;
                u64 zone_begin = round_down(phys, dev->fs_info->zone_size);

                ASSERT(btrfs_dev_is_sequential(dev, phys));
                bio->bi_iter.bi_sector = zone_begin >> SECTOR_SHIFT;
        }

        btrfs_debug_in_rcu(dev->fs_info,
        "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
                __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
                (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
                dev->devid, bio->bi_iter.bi_size);

        if (!(bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)) {
                submit_bio(bio);
                return;
        }

        blkcg_punt_bio_submit(bio);
}

static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
        struct bio *orig_bio = bioc->orig_bio, *bio;

        ASSERT(bio_op(orig_bio) != REQ_OP_READ);

        /* Reuse the bio embedded into the btrfs_bio for the last mirror */
        if (dev_nr == bioc->num_stripes - 1) {
                bio = orig_bio;
                bio->bi_end_io = btrfs_orig_write_end_io;
        } else {
                bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
                bio_inc_remaining(orig_bio);
                bio->bi_end_io = btrfs_clone_write_end_io;
        }

        bio->bi_private = &bioc->stripes[dev_nr];
        bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
        bioc->stripes[dev_nr].bioc = bioc;
        bioc->size = bio->bi_iter.bi_size;
        btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
                               struct btrfs_io_stripe *smap, int mirror_num)
{
        if (!bioc) {
                /* Single mirror read/write fast path. */
                btrfs_bio(bio)->mirror_num = mirror_num;
                bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
                if (bio_op(bio) != REQ_OP_READ)
                        btrfs_bio(bio)->orig_physical = smap->physical;
                bio->bi_private = smap->dev;
                bio->bi_end_io = btrfs_simple_end_io;
                btrfs_submit_dev_bio(smap->dev, bio);
        } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                /* Parity RAID write or read recovery. */
                bio->bi_private = bioc;
                bio->bi_end_io = btrfs_raid56_end_io;
                if (bio_op(bio) == REQ_OP_READ)
                        raid56_parity_recover(bio, bioc, mirror_num);
                else
                        raid56_parity_write(bio, bioc);
        } else {
                /* Write to multiple mirrors. */
                int total_devs = bioc->num_stripes;

                bioc->orig_bio = bio;
                for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
                        btrfs_submit_mirrored_bio(bioc, dev_nr);
        }
}

/* Checksum @bbio with the metadata or the data helper, based on REQ_META. */
static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
{
        const bool is_meta = bbio->bio.bi_opf & REQ_META;

        return is_meta ? btree_csum_one_bio(bbio) : btrfs_csum_one_bio(bbio);
}

/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 *
 * One of these is allocated per offloaded bbio by btrfs_wq_submit_bio() and
 * freed by run_one_async_done() when called with do_free set.
 */
struct async_submit_bio {
        /* The bbio to checksum and then submit. */
        struct btrfs_bio *bbio;
        /* Mapping result handed to __btrfs_submit_bio(); may be NULL. */
        struct btrfs_io_context *bioc;
        /* By-value copy of the caller's stripe mapping. */
        struct btrfs_io_stripe smap;
        /* Mirror number handed to __btrfs_submit_bio(). */
        int mirror_num;
        /* Work item queued on fs_info->workers. */
        struct btrfs_work work;
};

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time.   All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the btree.
 */
static void run_one_async_start(struct btrfs_work *work)
{
        struct async_submit_bio *async =
                container_of(work, struct async_submit_bio, work);
        blk_status_t ret;

        ret = btrfs_bio_csum(async->bbio);
        if (ret)
                async->bbio->bio.bi_status = ret;
}

/*
 * Second stage of an async submission: send the checksummed bio on its way,
 * or complete it if the checksum stage recorded an error.
 *
 * If called with @do_free == true, then it will only free the work struct.
 */
static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
        struct async_submit_bio *async =
                container_of(work, struct async_submit_bio, work);
        struct bio *bio;

        if (do_free) {
                kfree(async);
                return;
        }

        bio = &async->bbio->bio;

        /* If an error occurred we just want to clean up the bio and move on. */
        if (bio->bi_status) {
                btrfs_orig_bbio_end_io(async->bbio);
                return;
        }

        /*
         * All of the bios that pass through here are from async helpers.
         * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
         * context.  This changes nothing when cgroups aren't in use.
         */
        bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
        __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}

static bool should_async_write(struct btrfs_bio *bbio)
{
        bool auto_csum_mode = true;

#ifdef CONFIG_BTRFS_DEBUG
        struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
        enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);

        if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF)
                return false;

        auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO);
#endif

        /* Submit synchronously if the checksum implementation is fast. */
        if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
                return false;

        /*
         * Try to defer the submission to a workqueue to parallelize the
         * checksum calculation unless the I/O is issued synchronously.
         */
        if (op_is_sync(bbio->bio.bi_opf))
                return false;

        /* Zoned devices require I/O to be submitted in order. */
        if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
                return false;

        return true;
}

/*
 * Queue @bbio for asynchronous checksumming and submission.
 *
 * The stripe mapping in @smap is copied, so it does not need to outlive this
 * call.
 *
 * Return true if the work has been successfully queued, false if the tracking
 * structure could not be allocated.
 */
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
                                struct btrfs_io_context *bioc,
                                struct btrfs_io_stripe *smap, int mirror_num)
{
        struct async_submit_bio *async = kmalloc(sizeof(*async), GFP_NOFS);

        if (!async)
                return false;

        async->mirror_num = mirror_num;
        async->smap = *smap;
        async->bioc = bioc;
        async->bbio = bbio;

        btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
        btrfs_queue_work(bbio->fs_info->workers, &async->work);
        return true;
}

/*
 * Map and submit the next chunk of @bbio.
 *
 * The front of @bbio is mapped; if the mapping (or the zone append limit) is
 * shorter than the bio, that part is split off and submitted while @bbio keeps
 * the remainder.
 *
 * Return true if the caller is done with @bbio, either because all of it has
 * been submitted or because an error ended the submission.  Return false if a
 * chunk was split off and the remainder of @bbio still has to be submitted by
 * calling this function again.
 *
 * On error nothing is completed behind the back of chunks that are still in
 * flight: the failing chunk and the never-to-be-submitted remainder each drop
 * their pending_ios reference through btrfs_orig_bbio_end_io(), so the end_io
 * callback of the original bbio only runs once the last user is gone.
 */
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
        struct btrfs_inode *inode = bbio->inode;
        struct btrfs_fs_info *fs_info = bbio->fs_info;
        struct btrfs_bio *orig_bbio = bbio;
        struct bio *bio = &bbio->bio;
        u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
        u64 length = bio->bi_iter.bi_size;
        u64 map_length = length;
        bool use_append = btrfs_use_zone_append(bbio);
        struct btrfs_io_context *bioc = NULL;
        struct btrfs_io_stripe smap;
        blk_status_t ret;
        int error;

        smap.is_scrub = !bbio->inode;

        btrfs_bio_counter_inc_blocked(fs_info);
        error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
                                &bioc, &smap, &mirror_num);
        if (error) {
                ret = errno_to_blk_status(error);
                goto fail;
        }

        map_length = min(map_length, length);
        if (use_append)
                map_length = min(map_length, fs_info->max_zone_append_size);

        if (map_length < length) {
                /* From here on @bbio is the split chunk, not the original. */
                bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
                bio = &bbio->bio;
        }

        /*
         * Save the iter for the end_io handler and preload the checksums for
         * data reads.
         */
        if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
                bbio->saved_iter = bio->bi_iter;
                ret = btrfs_lookup_bio_sums(bbio);
                if (ret)
                        goto fail;
        }

        if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
                if (use_append) {
                        bio->bi_opf &= ~REQ_OP_WRITE;
                        bio->bi_opf |= REQ_OP_ZONE_APPEND;
                }

                if (is_data_bbio(bbio) && bioc &&
                    btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
                        /*
                         * No locking for the list update, as we only add to
                         * the list in the I/O submission path, and list
                         * iteration only happens in the completion path, which
                         * can't happen until after the last submission.
                         */
                        btrfs_get_bioc(bioc);
                        list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
                }

                /*
                 * Csum items for reloc roots have already been cloned at this
                 * point, so they are handled as part of the no-checksum case.
                 */
                if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
                    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
                    !btrfs_is_data_reloc_root(inode->root)) {
                        if (should_async_write(bbio) &&
                            btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
                                goto done;

                        ret = btrfs_bio_csum(bbio);
                        if (ret)
                                goto fail;
                } else if (use_append ||
                           (btrfs_is_zoned(fs_info) && inode &&
                            inode->flags & BTRFS_INODE_NODATASUM)) {
                        ret = btrfs_alloc_dummy_sum(bbio);
                        if (ret)
                                goto fail;
                }
        }

        __btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
        return map_length == length;

fail:
        btrfs_bio_counter_dec(fs_info);
        /*
         * If the original bbio was split for this chunk, both the chunk and
         * the remainder have to be ended, as the remainder will never be
         * submitted.  Go through the pending_ios accounting instead of calling
         * end_io directly: chunks split off earlier may still be in flight and
         * reference the original bbio.
         */
        if (bbio != orig_bbio) {
                ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
                ASSERT(bbio->private == orig_bbio);

                /* Drops the remainder's own reference on the original. */
                orig_bbio->bio.bi_status = ret;
                btrfs_orig_bbio_end_io(orig_bbio);
        }
        /*
         * For a split chunk this propagates the error, releases the chunk and
         * drops the reference taken in btrfs_split_bio().
         */
        bbio->bio.bi_status = ret;
        btrfs_orig_bbio_end_io(bbio);
        /* Do not submit another chunk */
        return true;
}

/*
 * Submit @bbio to the given mirror, one mapped chunk at a time, until
 * btrfs_submit_chunk() reports that nothing is left to submit.
 */
void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
{
        bool finished;

        /* If bbio->inode is not populated, its file_offset must be 0. */
        ASSERT(bbio->inode || bbio->file_offset == 0);

        do {
                finished = btrfs_submit_chunk(bbio, mirror_num);
        } while (!finished);
}

/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
 * RAID setup.  Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 *
 * @ino and @start are only used for the "read error corrected" message.
 * @length bytes of @folio starting at @folio_offset are written to the copy
 * of @logical selected by @mirror_num, which must not be 0.
 *
 * Returns 0 on success or if btrfs_repair_one_zone() reports a non-zero
 * result, -EIO if the target device has no block device or is not writeable,
 * and otherwise the error from the mapping or from the write.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
                            u64 length, u64 logical, struct folio *folio,
                            unsigned int folio_offset, int mirror_num)
{
        struct btrfs_io_stripe smap = { 0 };
        /* The bio and its single vector live on the stack. */
        struct bio_vec bvec;
        struct bio bio;
        int ret = 0;

        ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
        BUG_ON(!mirror_num);

        if (btrfs_repair_one_zone(fs_info, logical))
                return 0;

        /*
         * Avoid races with device replace and make sure our bioc has devices
         * associated to its stripes that don't go away while we are doing the
         * read repair operation.
         */
        btrfs_bio_counter_inc_blocked(fs_info);
        ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
        if (ret < 0)
                goto out_counter_dec;

        if (!smap.dev->bdev ||
            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
                ret = -EIO;
                goto out_counter_dec;
        }

        bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
        bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
        /* ret temporarily holds the result of bio_add_folio() for the ASSERT. */
        ret = bio_add_folio(&bio, folio, length, folio_offset);
        ASSERT(ret);
        ret = submit_bio_wait(&bio);
        if (ret) {
                /* try to remap that extent elsewhere? */
                btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
                goto out_bio_uninit;
        }

        btrfs_info_rl_in_rcu(fs_info,
                "read error corrected: ino %llu off %llu (dev %s sector %llu)",
                             ino, start, btrfs_dev_name(smap.dev),
                             smap.physical >> SECTOR_SHIFT);
        ret = 0;

out_bio_uninit:
        bio_uninit(&bio);
out_counter_dec:
        btrfs_bio_counter_dec(fs_info);
        return ret;
}

/*
 * Submit a btrfs_bio based repair write.
 *
 * If @dev_replace is true, the write would be submitted to dev-replace target.
 */
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
        struct btrfs_fs_info *fs_info = bbio->fs_info;
        struct bio *bio = &bbio->bio;
        u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
        u64 length = bio->bi_iter.bi_size;
        struct btrfs_io_stripe stripe = { 0 };
        int err;

        ASSERT(fs_info);
        ASSERT(mirror_num > 0);
        ASSERT(btrfs_op(bio) == BTRFS_MAP_WRITE);
        ASSERT(!bbio->inode);

        btrfs_bio_counter_inc_blocked(fs_info);
        err = btrfs_map_repair_block(fs_info, &stripe, logical, length, mirror_num);
        if (err < 0) {
                /* Mapping failed: drop the counter and complete with the error. */
                btrfs_bio_counter_dec(fs_info);
                btrfs_bio_end_io(bbio, errno_to_blk_status(err));
                return;
        }

        /* For dev-replace the write is redirected from the source to the target. */
        if (dev_replace) {
                ASSERT(stripe.dev == fs_info->dev_replace.srcdev);
                stripe.dev = fs_info->dev_replace.tgtdev;
        }

        __btrfs_submit_bio(bio, NULL, &stripe, mirror_num);
}

int __init btrfs_bioset_init(void)
{
        /* Front padding: room for the btrfs_bio fields preceding the embedded bio. */
        const unsigned int front_pad = offsetof(struct btrfs_bio, bio);

        if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, front_pad,
                        BIOSET_NEED_BVECS) != 0)
                return -ENOMEM;

        if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, front_pad, 0) != 0)
                goto undo_bioset;

        if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, front_pad,
                        BIOSET_NEED_BVECS) != 0)
                goto undo_clone_bioset;

        if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
                                      sizeof(struct btrfs_failed_bio)) != 0)
                goto undo_repair_bioset;

        return 0;

        /* Unwind in reverse order of setup; every failure reports -ENOMEM. */
undo_repair_bioset:
        bioset_exit(&btrfs_repair_bioset);
undo_clone_bioset:
        bioset_exit(&btrfs_clone_bioset);
undo_bioset:
        bioset_exit(&btrfs_bioset);
        return -ENOMEM;
}

/*
 * Release the failed-bio mempool and the three bio sets, in the reverse
 * order of their setup in btrfs_bioset_init().
 */
void __cold btrfs_bioset_exit(void)
{
        mempool_exit(&btrfs_failed_bio_pool);
        bioset_exit(&btrfs_repair_bioset);
        bioset_exit(&btrfs_clone_bioset);
        bioset_exit(&btrfs_bioset);
}














































    1 















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_KSM_H
#define __LINUX_KSM_H
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork().
 */

#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/sched.h>
#include <linux/sched/coredump.h>

#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                unsigned long end, int advice, unsigned long *vm_flags);

void ksm_add_vma(struct vm_area_struct *vma);
int ksm_enable_merge_any(struct mm_struct *mm);
int ksm_disable_merge_any(struct mm_struct *mm);
int ksm_disable(struct mm_struct *mm);

int __ksm_enter(struct mm_struct *mm);
void __ksm_exit(struct mm_struct *mm);
/*
 * To identify zeropages that were mapped by KSM, we reuse the dirty bit
 * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when
 * deduplicating memory.
 *
 * Note: the macro expands its argument twice, so @pte must be free of
 * side effects.
 */
#define is_ksm_zero_pte(pte)        (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte))

extern atomic_long_t ksm_zero_pages;

/* Bump both the global and the per-mm ksm_zero_pages counter by one. */
static inline void ksm_map_zero_page(struct mm_struct *mm)
{
        atomic_long_inc(&ksm_zero_pages);
        atomic_long_inc(&mm->ksm_zero_pages);
}

static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
{
        /* Only PTEs recognized by is_ksm_zero_pte() are accounted. */
        if (!is_ksm_zero_pte(pte))
                return;

        /* Drop the global and the per-mm zero page count by one each. */
        atomic_long_dec(&ksm_zero_pages);
        atomic_long_dec(&mm->ksm_zero_pages);
}

/* Return the current value of @mm's ksm_zero_pages counter. */
static inline long mm_ksm_zero_pages(struct mm_struct *mm)
{
        return atomic_long_read(&mm->ksm_zero_pages);
}

static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
        /* Nothing to do unless @oldmm carries MMF_VM_MERGEABLE. */
        if (!test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
                return 0;

        /* Otherwise enter @mm as well and pass on the result. */
        return __ksm_enter(mm);
}

static inline int ksm_execve(struct mm_struct *mm)
{
        /* Nothing to do unless @mm carries MMF_VM_MERGE_ANY. */
        if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
                return 0;

        return __ksm_enter(mm);
}

static inline void ksm_exit(struct mm_struct *mm)
{
        /* Only an mm flagged MMF_VM_MERGEABLE needs the __ksm_exit() call. */
        if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
                return;

        __ksm_exit(mm);
}

/*
 * When do_swap_page() first faults in from swap what used to be a KSM page,
 * no problem, it will be assigned to this vma's anon_vma; but thereafter,
 * it might be faulted into a different anon_vma (or perhaps to a different
 * offset in the same anon_vma).  do_swap_page() cannot do all the locking
 * needed to reconstitute a cross-anon_vma KSM page: for now it has to make
 * a copy, and leave remerging the pages to a later pass of ksmd.
 *
 * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
 * but what if the vma was unmerged while the page was swapped out?
 */
struct folio *ksm_might_need_to_copy(struct folio *folio,
                        struct vm_area_struct *vma, unsigned long addr);

void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
void collect_procs_ksm(struct folio *folio, struct page *page,
                struct list_head *to_kill, int force_early);
long ksm_process_profit(struct mm_struct *);

#else  /* !CONFIG_KSM */

/*
 * Stubs for kernels built without CONFIG_KSM: each one does nothing and,
 * where a value is returned, reports 0.
 */

/* !CONFIG_KSM: no-op. */
static inline void ksm_add_vma(struct vm_area_struct *vma)
{
}

/* !CONFIG_KSM: always returns 0. */
static inline int ksm_disable(struct mm_struct *mm)
{
        return 0;
}

/* !CONFIG_KSM: always returns 0. */
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
        return 0;
}

/* !CONFIG_KSM: always returns 0. */
static inline int ksm_execve(struct mm_struct *mm)
{
        return 0;
}

/* !CONFIG_KSM: no-op. */
static inline void ksm_exit(struct mm_struct *mm)
{
}

/* !CONFIG_KSM: no-op, no zero page accounting is done. */
static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
{
}

/* !CONFIG_KSM: no-op, @to_kill is left untouched. */
static inline void collect_procs_ksm(struct folio *folio, struct page *page,
                                     struct list_head *to_kill, int force_early)
{
}

#ifdef CONFIG_MMU
/* !CONFIG_KSM: always returns 0 and leaves @vm_flags untouched. */
static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                unsigned long end, int advice, unsigned long *vm_flags)
{
        return 0;
}

/* !CONFIG_KSM: hands @folio back unchanged. */
static inline struct folio *ksm_might_need_to_copy(struct folio *folio,
                        struct vm_area_struct *vma, unsigned long addr)
{
        return folio;
}

/* !CONFIG_KSM: no-op. */
static inline void rmap_walk_ksm(struct folio *folio,
                        struct rmap_walk_control *rwc)
{
}

/* !CONFIG_KSM: no-op. */
static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old)
{
}
#endif /* CONFIG_MMU */
#endif /* !CONFIG_KSM */

#endif /* __LINUX_KSM_H */
















































































































































































































































































































































































































































































































































































































































































































    3 
    2 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
 *
 *  NOHZ implementation for low and high resolution timers
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 */
#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/nmi.h>
#include <linux/profile.h>
#include <linux/sched/signal.h>
#include <linux/sched/clock.h>
#include <linux/sched/stat.h>
#include <linux/sched/nohz.h>
#include <linux/sched/loadavg.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
#include <linux/mm.h>

#include <asm/irq_regs.h>

#include "tick-internal.h"

#include <trace/events/timer.h>

/*
 * Per-CPU nohz control structure
 */
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);

/* Return the address of @cpu's instance of the per-CPU 'tick_cpu_sched'. */
struct tick_sched *tick_get_tick_sched(int cpu)
{
        return &per_cpu(tick_cpu_sched, cpu);
}

/*
 * The time when the last jiffy update happened. Write access must hold
 * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
 * consistent view of jiffies and last_jiffies_update.
 */
static ktime_t last_jiffies_update;

/*
 * Advance 'jiffies_64', 'last_jiffies_update' and 'tick_next_period' if
 * @now has reached 'tick_next_period', then call calc_global_load() and
 * update_wall_time().
 *
 * A quick check without 'jiffies_lock' filters out the calls which have
 * nothing to do. The update itself is serialized by 'jiffies_lock' and
 * wrapped in a 'jiffies_seq' write section.
 *
 * Must be called with interrupts disabled !
 */
static void tick_do_update_jiffies64(ktime_t now)
{
        unsigned long ticks = 1;
        ktime_t delta, nextp;

        /*
         * 64-bit can do a quick check without holding the jiffies lock and
         * without looking at the sequence count. The smp_load_acquire()
         * pairs with the update done later in this function.
         *
         * 32-bit cannot do that because the store of 'tick_next_period'
         * consists of two 32-bit stores, and the first store could be
         * moved by the CPU to a random point in the future.
         */
        if (IS_ENABLED(CONFIG_64BIT)) {
                if (ktime_before(now, smp_load_acquire(&tick_next_period)))
                        return;
        } else {
                unsigned int seq;

                /*
                 * Avoid contention on 'jiffies_lock' and protect the quick
                 * check with the sequence count.
                 */
                do {
                        seq = read_seqcount_begin(&jiffies_seq);
                        nextp = tick_next_period;
                } while (read_seqcount_retry(&jiffies_seq, seq));

                if (ktime_before(now, nextp))
                        return;
        }

        /* Quick check failed, i.e. update is required. */
        raw_spin_lock(&jiffies_lock);
        /*
         * Re-evaluate with the lock held. Another CPU might have done the
         * update already.
         */
        if (ktime_before(now, tick_next_period)) {
                raw_spin_unlock(&jiffies_lock);
                return;
        }

        write_seqcount_begin(&jiffies_seq);

        /* Time elapsed beyond the period which is due. */
        delta = ktime_sub(now, tick_next_period);
        if (unlikely(delta >= TICK_NSEC)) {
                /* Slow path for long idle sleep times */
                s64 incr = TICK_NSEC;

                /* 'ticks' starts at 1; add the whole periods contained in 'delta'. */
                ticks += ktime_divns(delta, incr);

                last_jiffies_update = ktime_add_ns(last_jiffies_update,
                                                   incr * ticks);
        } else {
                last_jiffies_update = ktime_add_ns(last_jiffies_update,
                                                   TICK_NSEC);
        }

        /* Advance jiffies to complete the 'jiffies_seq' protected job */
        jiffies_64 += ticks;

        /* Keep the tick_next_period variable up to date */
        nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);

        if (IS_ENABLED(CONFIG_64BIT)) {
                /*
                 * Pairs with smp_load_acquire() in the lockless quick
                 * check above, and ensures that the update to 'jiffies_64' is
                 * not reordered vs. the store to 'tick_next_period', neither
                 * by the compiler nor by the CPU.
                 */
                smp_store_release(&tick_next_period, nextp);
        } else {
                /*
                 * A plain store is good enough on 32-bit, as the quick check
                 * above is protected by the sequence count.
                 */
                tick_next_period = nextp;
        }

        /*
         * Release the sequence count. calc_global_load() below is not
         * protected by it, but 'jiffies_lock' needs to be held to prevent
         * concurrent invocations.
         */
        write_seqcount_end(&jiffies_seq);

        calc_global_load();

        raw_spin_unlock(&jiffies_lock);
        /* Called only after 'jiffies_lock' has been dropped. */
        update_wall_time();
}

/*
 * Initialize and retrieve the jiffies update.
 */
static ktime_t tick_init_jiffy_update(void)
{
        ktime_t base;

        raw_spin_lock(&jiffies_lock);
        write_seqcount_begin(&jiffies_seq);

        /* A zero 'last_jiffies_update' means it has not been set up yet. */
        if (!last_jiffies_update) {
                u32 misalign;

                /* Round 'tick_next_period' up to a multiple of TICK_NSEC. */
                div_u64_rem(tick_next_period, TICK_NSEC, &misalign);
                if (misalign != 0)
                        tick_next_period += TICK_NSEC - misalign;

                last_jiffies_update = tick_next_period;
        }
        base = last_jiffies_update;

        write_seqcount_end(&jiffies_seq);
        raw_spin_unlock(&jiffies_lock);

        return base;
}

static inline int tick_sched_flag_test(struct tick_sched *ts,
                                       unsigned long flag)
{
        /* 1 if any bit of @flag is set in ts->flags, 0 otherwise. */
        return (ts->flags & flag) != 0;
}

static inline void tick_sched_flag_set(struct tick_sched *ts,
                                       unsigned long flag)
{
        /* Set the bits of @flag in ts->flags; lockdep checks IRQs are off. */
        lockdep_assert_irqs_disabled();
        ts->flags = ts->flags | flag;
}

static inline void tick_sched_flag_clear(struct tick_sched *ts,
                                         unsigned long flag)
{
        /* Clear the bits of @flag in ts->flags; lockdep checks IRQs are off. */
        lockdep_assert_irqs_disabled();
        ts->flags = ts->flags & ~flag;
}

#define MAX_STALLED_JIFFIES 5

static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
        int cpu = smp_processor_id();
        int tick_cpu;

        /*
         * Find out whether the do_timer duty is currently unassigned.
         * Concurrency does not matter here: the duty is only dropped when
         * the CPU in charge went into a long sleep, and should two CPUs
         * pick it up at the same time, the jiffies update is still
         * serialized by 'jiffies_lock'.
         *
         * With nohz_full enabled this is not expected, as the
         * 'tick_do_timer_cpu' CPU never relinquishes the duty.
         */
        tick_cpu = READ_ONCE(tick_do_timer_cpu);

        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
                WARN_ON_ONCE(tick_nohz_full_running);
#endif
                /* Take over the duty on this CPU. */
                WRITE_ONCE(tick_do_timer_cpu, cpu);
                tick_cpu = cpu;
        }

        /* Only the CPU in charge checks whether jiffies need an update. */
        if (cpu == tick_cpu)
                tick_do_update_jiffies64(now);

        /*
         * Force an update when jiffies did not move for MAX_STALLED_JIFFIES
         * consecutive ticks (timekeeper in stop_machine() or VMEXIT'ed for
         * several msecs).
         */
        if (ts->last_tick_jiffies != jiffies) {
                ts->stalled_jiffies = 0;
                ts->last_tick_jiffies = READ_ONCE(jiffies);
        } else if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
                tick_do_update_jiffies64(now);
                ts->stalled_jiffies = 0;
                ts->last_tick_jiffies = READ_ONCE(jiffies);
        }

        /* Record that a tick was received while TS_FLAG_INIDLE is set. */
        if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
                ts->got_idle_tick = 1;
}

static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
        const int tick_stopped = IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
                                 tick_sched_flag_test(ts, TS_FLAG_STOPPED);

        if (tick_stopped) {
                /*
                 * Idle with the tick stopped: we might not schedule for a
                 * really long time (e.g. a completely idle SMP system
                 * waiting on the login prompt), so touch the watchdog.
                 */
                touch_softlockup_watchdog_sched();

                /*
                 * Advance the "start of idle" jiffy stamp too, so that the
                 * idle accounting adjustment done when going busy again
                 * does not account too many ticks.
                 */
                if (is_idle_task(current))
                        ts->idle_jiffies++;

                /*
                 * The tick may have fired before its expected expiration;
                 * resetting 'next_tick' makes sure the next clock
                 * reprogramming to the same deadline is not bypassed.
                 */
                ts->next_tick = 0;
        }

        update_process_times(user_mode(regs));
        profile_tick(CPU_PROFILING);
}

/*
 * We rearm the timer until we get disabled by the idle code.
 * Called with interrupts disabled.
 */
static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer)
{
        struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer);
        struct pt_regs *regs = get_irq_regs();
        ktime_t tick_time = ktime_get();

        tick_sched_do_timer(ts, tick_time);

        /*
         * Without a valid 'regs' pointer we are not in IRQ context: skip
         * the tick handling and only reset 'next_tick'.
         */
        if (!regs)
                ts->next_tick = 0;
        else
                tick_sched_handle(ts, regs);

        /*
         * In dynticks mode the reprogramming of the tick is deferred:
         * - to the idle task if in dynticks-idle
         * - to IRQ exit if in full-dynticks.
         */
        if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED)))
                return HRTIMER_NORESTART;

        /* Tick still running: move the expiry forward by TICK_NSEC steps. */
        hrtimer_forward(timer, tick_time, TICK_NSEC);

        return HRTIMER_RESTART;
}

static void tick_sched_timer_cancel(struct tick_sched *ts)
{
        /* With TS_FLAG_HIGHRES set, the tick is the 'sched_timer' hrtimer. */
        if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
                hrtimer_cancel(&ts->sched_timer);
                return;
        }

        /* With only TS_FLAG_NOHZ set, program the tick event to KTIME_MAX. */
        if (tick_sched_flag_test(ts, TS_FLAG_NOHZ))
                tick_program_event(KTIME_MAX, 1);
}

#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
bool tick_nohz_full_running;
EXPORT_SYMBOL_GPL(tick_nohz_full_running);
static atomic_t tick_dep_mask;

static bool check_tick_dependency(atomic_t *dep)
{
        /* Dependency masks, in the order in which they are checked. */
        static const int dep_masks[] = {
                TICK_DEP_MASK_POSIX_TIMER,
                TICK_DEP_MASK_PERF_EVENTS,
                TICK_DEP_MASK_SCHED,
                TICK_DEP_MASK_CLOCK_UNSTABLE,
                TICK_DEP_MASK_RCU,
                TICK_DEP_MASK_RCU_EXP,
        };
        int pending = atomic_read(dep);
        unsigned int i;

        /* Trace and report the first dependency that is set in @dep. */
        for (i = 0; i < sizeof(dep_masks) / sizeof(dep_masks[0]); i++) {
                if (pending & dep_masks[i]) {
                        trace_tick_stop(0, dep_masks[i]);
                        return true;
                }
        }

        return false;
}

static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
        lockdep_assert_irqs_disabled();

        /* The tick is never stopped on a CPU which is not online. */
        if (unlikely(!cpu_online(cpu)))
                return false;

        /*
         * The tick can stop only if no dependency is set at any level:
         * global, this tick_sched, the current task and its signal struct.
         * Evaluation stops at the first level which reports one.
         */
        return !(check_tick_dependency(&tick_dep_mask) ||
                 check_tick_dependency(&ts->tick_dep_mask) ||
                 check_tick_dependency(&current->tick_dep_mask) ||
                 check_tick_dependency(&current->signal->tick_dep_mask));
}

/* Callback of 'nohz_full_kick_work'; intentionally does nothing itself. */
static void nohz_full_kick_func(struct irq_work *work)
{
        /* Empty, the tick restart happens on tick_nohz_irq_exit() */
}

/* Per-CPU irq_work which the tick_nohz_full_kick*() helpers queue. */
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
        IRQ_WORK_INIT_HARD(nohz_full_kick_func);

/*
 * Kick this CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
 * is NMI safe.
 */
static void tick_nohz_full_kick(void)
{
        /* Queue the kick work only if this CPU is a full dynticks CPU. */
        if (tick_nohz_full_cpu(smp_processor_id()))
                irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
}

/*
 * Kick the CPU if it's full dynticks in order to force it to
 * re-evaluate its dependency on the tick and restart it if necessary.
 */
void tick_nohz_full_kick_cpu(int cpu)
{
        /* Queue @cpu's kick work on @cpu only if it is a full dynticks CPU. */
        if (tick_nohz_full_cpu(cpu))
                irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}

/*
 * Kick the CPU that @tsk is on so that it re-evaluates its tick dependency.
 * Nothing is done when the task is not on a runqueue or when that CPU is
 * not online.
 */
static void tick_nohz_kick_task(struct task_struct *tsk)
{
        int cpu;

        /*
         * If the task is not running, run_posix_cpu_timers()
         * has nothing to elapse, and an IPI can then be optimized out.
         *
         * activate_task()                      STORE p->tick_dep_mask
         *   STORE p->on_rq
         * __schedule() (switch to task 'p')    smp_mb() (atomic_fetch_or())
         *   LOCK rq->lock                      LOAD p->on_rq
         *   smp_mb__after_spin_lock()
         *   tick_nohz_task_switch()
         *     LOAD p->tick_dep_mask
         */
        if (!sched_task_on_rq(tsk))
                return;

        /*
         * If the task concurrently migrates to another CPU,
         * we guarantee it sees the new tick dependency upon
         * schedule.
         *
         * set_task_cpu(p, cpu);
         *   STORE p->cpu = @cpu
         * __schedule() (switch to task 'p')
         *   LOCK rq->lock
         *   smp_mb__after_spin_lock()          STORE p->tick_dep_mask
         *   tick_nohz_task_switch()            smp_mb() (atomic_fetch_or())
         *      LOAD p->tick_dep_mask           LOAD p->cpu
         */
        cpu = task_cpu(tsk);

        /* The online check and the kick are done with preemption disabled */
        preempt_disable();
        if (cpu_online(cpu))
                tick_nohz_full_kick_cpu(cpu);
        preempt_enable();
}

/*
 * Kick every online full dynticks CPU so that each of them re-evaluates
 * its dependency on the tick and restarts it if necessary. Does nothing
 * when full dynticks is not running.
 */
static void tick_nohz_full_kick_all(void)
{
        int target;

        if (tick_nohz_full_running) {
                preempt_disable();
                for_each_cpu_and(target, tick_nohz_full_mask, cpu_online_mask)
                        tick_nohz_full_kick_cpu(target);
                preempt_enable();
        }
}

/*
 * Set @bit in the dependency mask @dep and kick all full dynticks CPUs if
 * the mask held no dependency at all before.
 */
static void tick_nohz_dep_set_all(atomic_t *dep,
                                  enum tick_dep_bits bit)
{
        /* A non-zero previous value means the kick is not issued again */
        if (atomic_fetch_or(BIT(bit), dep))
                return;

        tick_nohz_full_kick_all();
}

/*
 * Set a global tick dependency. Used by perf events that rely on freq and
 * unstable clocks.
 *
 * The bit is set in the global tick_dep_mask through tick_nohz_dep_set_all(),
 * which kicks the full dynticks CPUs if the mask was previously empty.
 */
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
        tick_nohz_dep_set_all(&tick_dep_mask, bit);
}

/*
 * Clear a global tick dependency. Only the bit in tick_dep_mask is dropped;
 * no CPU is kicked from here.
 */
void tick_nohz_dep_clear(enum tick_dep_bits bit)
{
        atomic_andnot(BIT(bit), &tick_dep_mask);
}

/*
 * Set a per-CPU tick dependency. Used by scheduler and perf events in order
 * to manage event-throttling.
 *
 * @cpu is kicked only when its dependency mask was empty before @bit got set.
 */
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

        if (atomic_fetch_or(BIT(bit), &ts->tick_dep_mask))
                return;

        preempt_disable();
        if (cpu == smp_processor_id()) {
                /* Perf needs local kick that is NMI safe */
                tick_nohz_full_kick();
        } else if (!WARN_ON_ONCE(in_nmi())) {
                /* Remote IRQ work not NMI-safe */
                tick_nohz_full_kick_cpu(cpu);
        }
        preempt_enable();
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);

void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

        atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);

/*
 * Set a per-task tick dependency. RCU needs this. Also posix CPU timers
 * in order to elapse per task timers.
 *
 * The task is kicked only when its mask held no dependency before.
 */
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
        int prev = atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask);

        if (prev)
                return;

        tick_nohz_kick_task(tsk);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);

/*
 * Clear a per-task tick dependency: drop @bit from @tsk's dependency mask.
 * No kick is issued from here.
 */
void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
        atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);

/*
 * Set a per-taskgroup tick dependency. Posix CPU timers need this in order
 * to elapse per process timers.
 *
 * When the mask held no dependency before, every thread of the group is
 * kicked; that walk asserts that @tsk's siglock is held.
 */
void tick_nohz_dep_set_signal(struct task_struct *tsk,
                              enum tick_dep_bits bit)
{
        struct signal_struct *sig = tsk->signal;
        struct task_struct *thread;

        if (atomic_fetch_or(BIT(bit), &sig->tick_dep_mask))
                return;

        lockdep_assert_held(&tsk->sighand->siglock);
        __for_each_thread(sig, thread)
                tick_nohz_kick_task(thread);
}

/*
 * Clear a per-taskgroup tick dependency: drop @bit from @sig's dependency
 * mask. No kick is issued from here.
 */
void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
        atomic_andnot(BIT(bit), &sig->tick_dep_mask);
}

/*
 * Re-evaluate the need for the tick as we switch the current task.
 * The incoming task might need the tick due to per task/process properties:
 * perf events, posix CPU timers, ...
 *
 * Only a full dynticks CPU whose tick is stopped gets kicked, and only if
 * the current task or its process carries a tick dependency.
 */
void __tick_nohz_task_switch(void)
{
        struct tick_sched *ts;

        if (!tick_nohz_full_cpu(smp_processor_id()))
                return;

        ts = this_cpu_ptr(&tick_cpu_sched);
        if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                return;

        if (atomic_read(&current->tick_dep_mask) ||
            atomic_read(&current->signal->tick_dep_mask))
                tick_nohz_full_kick();
}

/*
 * Get the boot-time nohz CPU list from the kernel parameters.
 *
 * Allocates tick_nohz_full_mask with alloc_bootmem_cpumask_var(), copies
 * @cpumask into it and marks full dynticks as running.
 */
void __init tick_nohz_full_setup(cpumask_var_t cpumask)
{
        alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
        cpumask_copy(tick_nohz_full_mask, cpumask);
        tick_nohz_full_running = true;
}

/*
 * Tell whether @cpu may be taken offline as far as full dynticks is
 * concerned.
 */
bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
{
        /*
         * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound
         * timers, workqueues, timekeeping, ...) on behalf of full dynticks
         * CPUs. It must remain online when nohz full is enabled.
         */
        return !(tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu);
}

/*
 * CPU hotplug teardown callback registered by tick_nohz_init(): refuse with
 * -EBUSY when tick_nohz_cpu_hotpluggable() rejects @cpu, else return 0.
 */
static int tick_nohz_cpu_down(unsigned int cpu)
{
        if (!tick_nohz_cpu_hotpluggable(cpu))
                return -EBUSY;

        return 0;
}

/*
 * Boot-time initialization of full dynticks. Does nothing when
 * tick_nohz_full_running is not set.
 */
void __init tick_nohz_init(void)
{
        int cpu, err;

        if (!tick_nohz_full_running)
                return;

        /*
         * Full dynticks uses IRQ work to drive the tick rescheduling on safe
         * locking contexts. That only works if IRQ work can raise its own
         * interrupts, otherwise it would circularly depend on the tick.
         */
        if (!arch_irq_work_has_interrupt()) {
                pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
                cpumask_clear(tick_nohz_full_mask);
                tick_nohz_full_running = false;
                return;
        }

        if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
            !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
                cpu = smp_processor_id();

                /* In this configuration the current CPU leaves the nohz_full range */
                if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
                        pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
                                cpu);
                        cpumask_clear_cpu(cpu, tick_nohz_full_mask);
                }
        }

        for_each_cpu(cpu, tick_nohz_full_mask)
                ct_cpu_track_user(cpu);

        /* Only a teardown callback is installed: tick_nohz_cpu_down() */
        err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                                        "kernel/nohz:predown", NULL,
                                        tick_nohz_cpu_down);
        WARN_ON(err < 0);
        pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
                cpumask_pr_args(tick_nohz_full_mask));
}
#endif /* #ifdef CONFIG_NO_HZ_FULL */

/*
 * NOHZ - aka dynamic tick functionality
 */
#ifdef CONFIG_NO_HZ_COMMON
/*
 * NO HZ enabled? tick_nohz_enabled defaults to true and is written by the
 * "nohz=" boot parameter handler. tick_nohz_active is checked by
 * get_cpu_sleep_time_us() before it reports any sleep time.
 */
bool tick_nohz_enabled __read_mostly  = true;
unsigned long tick_nohz_active  __read_mostly;
/*
 * Enable / Disable tickless mode from the "nohz=" boot parameter.
 *
 * The value is parsed as a boolean into tick_nohz_enabled. Returns 1 when
 * kstrtobool() reported success (0), and 0 otherwise.
 */
static int __init setup_tick_nohz(char *str)
{
        int err = kstrtobool(str, &tick_nohz_enabled);

        return !err;
}

__setup("nohz=", setup_tick_nohz);

bool tick_nohz_tick_stopped(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
}

bool tick_nohz_tick_stopped_cpu(int cpu)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

        return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
}

/**
 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
 * @now: current ktime_t
 *
 * Called from interrupt entry when the CPU was idle
 *
 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
 * must be updated. Otherwise an interrupt handler could use a stale jiffy
 * value. We do this unconditionally on any CPU, as we don't know whether the
 * CPU, which has the update task assigned, is in a long sleep.
 */
static void tick_nohz_update_jiffies(ktime_t now)
{
        unsigned long flags;

        /* Remember @now as the idle wake time of this CPU */
        __this_cpu_write(tick_cpu_sched.idle_waketime, now);

        /* The jiffies update itself runs with local interrupts disabled */
        local_irq_save(flags);
        tick_do_update_jiffies64(now);
        local_irq_restore(flags);

        /*
         * NOTE(review): presumably avoids a spurious softlockup report after
         * a long idle period - confirm against the watchdog code.
         */
        touch_softlockup_watchdog_sched();
}

/*
 * Close the idle period that started at ts->idle_entrytime: add its length
 * to either the iowait or the idle sleep time and clear TS_FLAG_IDLE_ACTIVE.
 * Warns once and bails out if no idle period is active.
 */
static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
        ktime_t elapsed;
        ktime_t *bucket;

        if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
                return;

        elapsed = ktime_sub(now, ts->idle_entrytime);

        /* All updates below sit in one seqcount write section for readers */
        write_seqcount_begin(&ts->idle_sleeptime_seq);

        /* Pending iowait tasks on this CPU turn the period into iowait time */
        if (nr_iowait_cpu(smp_processor_id()) > 0)
                bucket = &ts->iowait_sleeptime;
        else
                bucket = &ts->idle_sleeptime;
        *bucket = ktime_add(*bucket, elapsed);

        ts->idle_entrytime = now;
        tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);

        write_seqcount_end(&ts->idle_sleeptime_seq);

        sched_clock_idle_wakeup_event();
}

/*
 * Start an idle period on @ts: stamp the entry time and set
 * TS_FLAG_IDLE_ACTIVE inside the idle_sleeptime_seq write section, so that
 * readers such as get_cpu_sleep_time_us() observe both updates together.
 */
static void tick_nohz_start_idle(struct tick_sched *ts)
{
        write_seqcount_begin(&ts->idle_sleeptime_seq);
        ts->idle_entrytime = ktime_get();
        tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
        write_seqcount_end(&ts->idle_sleeptime_seq);

        sched_clock_idle_sleep_event();
}

/*
 * Common helper for the idle and iowait time getters.
 *
 * @ts:               tick_sched of the CPU being queried
 * @sleeptime:        accumulated counter to report (a field of @ts)
 * @compute_delta:    when true and an idle period is active, the time since
 *                    ts->idle_entrytime is added on top of @sleeptime
 * @last_update_time: if not NULL, receives the current time in microseconds
 *
 * Returns the sleep time in microseconds, or -1 when NOHZ is not active.
 */
static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
                                 bool compute_delta, u64 *last_update_time)
{
        unsigned int seq;
        ktime_t now, total;

        if (!tick_nohz_active)
                return -1;

        now = ktime_get();
        if (last_update_time)
                *last_update_time = ktime_to_us(now);

        /* Retry until a consistent snapshot of the writer's fields is read */
        do {
                seq = read_seqcount_begin(&ts->idle_sleeptime_seq);

                total = *sleeptime;
                if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta)
                        total = ktime_add(total,
                                          ktime_sub(now, ts->idle_entrytime));
        } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));

        return ktime_to_us(total);
}

/**
 * get_cpu_idle_time_us - get the total idle time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative idle time (since boot) for a given
 * CPU, in microseconds. Note that this is partially broken due to
 * the counter of iowait tasks that can be remotely updated without
 * any synchronization. Therefore it is possible to observe backward
 * values within two consecutive reads.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
 */
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
        /* An ongoing idle period only counts as idle without iowait tasks */
        bool no_iowait = !nr_iowait_cpu(cpu);

        return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, no_iowait,
                                     last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);

/**
 * get_cpu_iowait_time_us - get the total iowait time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative iowait time (since boot) for a given
 * CPU, in microseconds. Note this is partially broken due to
 * the counter of iowait tasks that can be remotely updated without
 * any synchronization. Therefore it is possible to observe backward
 * values within two consecutive reads.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
 */
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
        /* An ongoing idle period only counts as iowait with iowait tasks */
        bool has_iowait = nr_iowait_cpu(cpu);

        return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, has_iowait,
                                     last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);

/*
 * Re-arm the tick timer of @ts: start from the recorded last tick and
 * forward it in TICK_NSEC steps relative to @now.
 */
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
        hrtimer_cancel(&ts->sched_timer);
        hrtimer_set_expires(&ts->sched_timer, ts->last_tick);

        /* Forward the time to expire in the future */
        hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);

        /* Without high resolution mode the event is programmed directly */
        if (!tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
                tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
        else
                hrtimer_start_expires(&ts->sched_timer,
                                      HRTIMER_MODE_ABS_PINNED_HARD);

        /*
         * Forget the cached deadline so that the next tick stop is not
         * fooled by a clock deadline from the past.
         */
        ts->next_tick = 0;
}

/* Tell whether TIMER_SOFTIRQ is among the locally pending softirqs. */
static inline bool local_timer_softirq_pending(void)
{
        return (local_softirq_pending() & BIT(TIMER_SOFTIRQ)) != 0;
}

/*
 * Read jiffies and the time when jiffies were updated last
 */
u64 get_jiffies_update(unsigned long *basej)
{
        unsigned long basejiff;
        unsigned int seq;
        u64 basemono;

        do {
                seq = read_seqcount_begin(&jiffies_seq);
                basemono = last_jiffies_update;
                basejiff = jiffies;
        } while (read_seqcount_retry(&jiffies_seq, seq));
        *basej = basejiff;
        return basemono;
}

/**
 * tick_nohz_next_event() - return the clock monotonic based next event
 * @ts:                pointer to tick_sched struct
 * @cpu:        CPU number
 *
 * Return:
 * *%0                - When the next event is a maximum of TICK_NSEC in the future
 *                  and the tick is not stopped yet
 * *%next_event        - Next event based on clock monotonic
 */
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
        u64 basemono, next_tick, delta, expires;
        unsigned long basejiff;
        int tick_cpu;

        /*
         * Snapshot jiffies and the matching update time. Both are cached in
         * @ts, where tick_nohz_stop_tick() picks them up again.
         */
        basemono = get_jiffies_update(&basejiff);
        ts->last_jiffies = basejiff;
        ts->timer_expires_base = basemono;

        /*
         * Keep the periodic tick, when RCU, architecture or irq_work
         * requests it.
         * Aside from that, check whether the local timer softirq is
         * pending. If so, it's a bad idea to call get_next_timer_interrupt(),
         * because there is an already expired timer, so it will request
         * immediate expiry, which rearms the hardware timer with a
         * minimal delta, which brings us back to this place
         * immediately. Lather, rinse and repeat...
         */
        if (rcu_needs_cpu() || arch_needs_cpu() ||
            irq_work_needs_cpu() || local_timer_softirq_pending()) {
                next_tick = basemono + TICK_NSEC;
        } else {
                /*
                 * Get the next pending timer. If high resolution
                 * timers are enabled this only takes the timer wheel
                 * timers into account. If high resolution timers are
                 * disabled this also looks at the next expiring
                 * hrtimer.
                 */
                next_tick = get_next_timer_interrupt(basejiff, basemono);
                ts->next_timer = next_tick;
        }

        /* Make sure next_tick is never before basemono! */
        if (WARN_ON_ONCE(basemono > next_tick))
                next_tick = basemono;

        /*
         * If the tick is due in the next period, keep it ticking or
         * force prod the timer.
         */
        delta = next_tick - basemono;
        if (delta <= (u64)TICK_NSEC) {
                /*
                 * We've not stopped the tick yet, and there's a timer in the
                 * next period, so no point in stopping it either, bail.
                 */
                if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
                        /* A zero expiry tells the caller to retain the tick */
                        ts->timer_expires = 0;
                        goto out;
                }
        }

        /*
         * If this CPU is the one which had the do_timer() duty last, we limit
         * the sleep time to the timekeeping 'max_deferment' value.
         * Otherwise we can sleep as long as we want.
         */
        delta = timekeeping_max_deferment();
        tick_cpu = READ_ONCE(tick_do_timer_cpu);
        if (tick_cpu != cpu &&
            (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
                delta = KTIME_MAX;

        /* Calculate the next expiry time, saturating at KTIME_MAX */
        if (delta < (KTIME_MAX - basemono))
                expires = basemono + delta;
        else
                expires = KTIME_MAX;

        /* The earlier of the deferment limit and the next timer wins */
        ts->timer_expires = min_t(u64, expires, next_tick);

out:
        return ts->timer_expires;
}

/*
 * Stop the tick on @cpu, or reprogram an already stopped one, based on the
 * values that tick_nohz_next_event() cached in @ts (last_jiffies,
 * timer_expires_base and timer_expires).
 */
static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
{
        struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
        unsigned long basejiff = ts->last_jiffies;
        u64 basemono = ts->timer_expires_base;
        bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
        int tick_cpu;
        u64 expires;

        /* Make sure we won't be trying to stop it twice in a row. */
        ts->timer_expires_base = 0;

        /*
         * Now the tick should be stopped definitely - so the timer base needs
         * to be marked idle as well to not miss a newly queued timer.
         */
        expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle);
        if (expires > ts->timer_expires) {
                /*
                 * This path could only happen when the first timer was removed
                 * between calculating the possible sleep length and now (when
                 * high resolution mode is not active, timer could also be a
                 * hrtimer).
                 *
                 * We have to stick to the original calculated expiry value to
                 * not stop the tick for too long with a shallow C-state (which
                 * was programmed by cpuidle because of an early next expiration
                 * value).
                 */
                expires = ts->timer_expires;
        }

        /* If the timer base is not idle, retain the not yet stopped tick. */
        if (!timer_idle)
                return;

        /*
         * If this CPU is the one which updates jiffies, then give up
         * the assignment and let it be taken by the CPU which runs
         * the tick timer next, which might be this CPU as well. If we
         * don't drop this here, the jiffies might be stale and
         * do_timer() never gets invoked. Keep track of the fact that it
         * was the one which had the do_timer() duty last.
         */
        tick_cpu = READ_ONCE(tick_do_timer_cpu);
        if (tick_cpu == cpu) {
                WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE);
                tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST);
        } else if (tick_cpu != TICK_DO_TIMER_NONE) {
                tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST);
        }

        /* Skip reprogram of event if it's not changed */
        if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) {
                /* Sanity check: make sure clockevent is actually programmed */
                if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
                        return;

                /* Inconsistent state: warn, dump it once and reprogram below */
                WARN_ON_ONCE(1);
                printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
                            basemono, ts->next_tick, dev->next_event,
                            hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer));
        }

        /*
         * tick_nohz_stop_tick() can be called several times before
         * tick_nohz_restart_sched_tick() is called. This happens when
         * interrupts arrive which do not cause a reschedule. In the first
         * call we save the current tick time, so we can restart the
         * scheduler tick in tick_nohz_restart_sched_tick().
         */
        if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
                calc_load_nohz_start();
                quiet_vmstat();

                ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
                tick_sched_flag_set(ts, TS_FLAG_STOPPED);
                trace_tick_stop(1, TICK_DEP_MASK_NONE);
        }

        /* Cache the deadline for the "not changed" check of the next call */
        ts->next_tick = expires;

        /*
         * If the expiration time == KTIME_MAX, then we simply stop
         * the tick timer.
         */
        if (unlikely(expires == KTIME_MAX)) {
                tick_sched_timer_cancel(ts);
                return;
        }

        /* Program the new deadline through the hrtimer or the event device */
        if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
                hrtimer_start(&ts->sched_timer, expires,
                              HRTIMER_MODE_ABS_PINNED_HARD);
        } else {
                hrtimer_set_expires(&ts->sched_timer, expires);
                tick_program_event(expires, 1);
        }
}

/*
 * Keep the tick as it is. The cached expiry base is reset so that
 * tick_nohz_idle_stop_tick() does not reuse a stale ts->timer_expires.
 */
static void tick_nohz_retain_tick(struct tick_sched *ts)
{
        ts->timer_expires_base = 0;
}

#ifdef CONFIG_NO_HZ_FULL
/*
 * Stop the tick when tick_nohz_next_event() reports a non-zero next event,
 * otherwise retain it.
 */
static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu)
{
        if (!tick_nohz_next_event(ts, cpu)) {
                tick_nohz_retain_tick(ts);
                return;
        }

        tick_nohz_stop_tick(ts, cpu);
}
#endif /* CONFIG_NO_HZ_FULL */

/*
 * Restart a stopped tick: bring jiffies up to date, leave the timer idle
 * state, clear TS_FLAG_STOPPED and re-arm the tick timer relative to @now.
 */
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
        /* Update jiffies first */
        tick_do_update_jiffies64(now);

        /*
         * Clear the timer idle flag, so we avoid IPIs on remote queueing and
         * the clock forward checks in the enqueue path:
         */
        timer_clear_idle();

        calc_load_nohz_stop();
        touch_softlockup_watchdog_sched();

        /* Cancel the scheduled timer and restore the tick: */
        tick_sched_flag_clear(ts, TS_FLAG_STOPPED);
        tick_nohz_restart(ts, now);
}

/*
 * Full dynticks tick update: stop (or re-evaluate) the tick when nothing
 * depends on it, otherwise restart it if it was stopped. Compiles to an
 * empty function without CONFIG_NO_HZ_FULL.
 */
static void __tick_nohz_full_update_tick(struct tick_sched *ts,
                                         ktime_t now)
{
#ifdef CONFIG_NO_HZ_FULL
        int cpu = smp_processor_id();

        if (can_stop_full_tick(cpu, ts)) {
                tick_nohz_full_stop_tick(ts, cpu);
                return;
        }

        /* There is a dependency: a stopped tick has to run again */
        if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                tick_nohz_restart_sched_tick(ts, now);
#endif
}

/*
 * Re-evaluate the tick of the current CPU, but only if it is a full
 * dynticks CPU and TS_FLAG_NOHZ is set in @ts.
 */
static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
        if (tick_nohz_full_cpu(smp_processor_id()) &&
            tick_sched_flag_test(ts, TS_FLAG_NOHZ))
                __tick_nohz_full_update_tick(ts, ktime_get());
}

/*
 * A pending softirq outside an IRQ (or softirq disabled section) context
 * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
 * reach this code due to the need_resched() early check in can_stop_idle_tick().
 *
 * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPUHP_TEARDOWN_CPU in
 * the cpu_down() process, softirqs can still be raised while ksoftirqd is
 * parked, triggering the code below, since wakeup_softirqd() is ignored.
 *
 */
static bool report_idle_softirq(void)
{
        static int ratelimit;
        unsigned int pending = local_softirq_pending();

        if (likely(pending == 0))
                return false;

        /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */
        if (!cpu_active(smp_processor_id()))
                pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;

        if (!pending)
                return false;

        /*
         * Stay silent once ten reports were printed. On RT, softirq handling
         * may be waiting on some lock, which is not reported either.
         */
        if (ratelimit >= 10 || local_bh_blocked())
                return false;

        pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
                pending);
        ratelimit++;

        return true;
}

/*
 * Tell whether the idle tick of @cpu may be stopped right now. Warns once
 * if @cpu is offline.
 */
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
        int tick_cpu;

        WARN_ON_ONCE(cpu_is_offline(cpu));

        if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ)))
                return false;

        if (need_resched())
                return false;

        if (unlikely(report_idle_softirq()))
                return false;

        /* The remaining checks only matter with full dynticks CPUs around */
        if (!tick_nohz_full_enabled())
                return true;

        tick_cpu = READ_ONCE(tick_do_timer_cpu);

        /*
         * Keep the tick alive to guarantee timekeeping progression
         * if there are full dynticks CPUs around
         */
        if (tick_cpu == cpu)
                return false;

        /* Should not happen for nohz-full */
        return !WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE);
}

/**
 * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick
 */
void tick_nohz_idle_stop_tick(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        int cpu = smp_processor_id();
        ktime_t expires;

        /*
         * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
         * tick timer expiration time is known already.
         */
        if (ts->timer_expires_base)
                expires = ts->timer_expires;
        else if (can_stop_idle_tick(cpu, ts))
                expires = tick_nohz_next_event(ts, cpu);
        else
                return;

        ts->idle_calls++;

        if (expires > 0LL) {
                int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);

                tick_nohz_stop_tick(ts, cpu);

                ts->idle_sleeps++;
                ts->idle_expires = expires;

                if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
                        ts->idle_jiffies = ts->last_jiffies;
                        nohz_balance_enter_idle(cpu);
                }
        } else {
                tick_nohz_retain_tick(ts);
        }
}

void tick_nohz_idle_retain_tick(void)
{
        tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
}

/**
 * tick_nohz_idle_enter - prepare for entering idle on the current CPU
 *
 * Called when we start the idle loop.
 */
void tick_nohz_idle_enter(void)
{
        struct tick_sched *ts;

        /* Callers must arrive with interrupts enabled */
        lockdep_assert_irqs_enabled();

        /* The per-CPU state below is updated with interrupts off */
        local_irq_disable();

        ts = this_cpu_ptr(&tick_cpu_sched);

        /* No cached expiry is expected to be left over at this point */
        WARN_ON_ONCE(ts->timer_expires_base);

        /* Mark the CPU as being in the idle loop and start idle accounting */
        tick_sched_flag_set(ts, TS_FLAG_INIDLE);
        tick_nohz_start_idle(ts);

        local_irq_enable();
}

/**
 * tick_nohz_irq_exit - Notify the tick about IRQ exit
 *
 * A timer may have been added/modified/deleted either by the current IRQ,
 * or by another place using this IRQ as a notification. This IRQ may have
 * also updated the RCU callback list. These events may require a
 * re-evaluation of the next tick. Depending on the context:
 *
 * 1) If the CPU is idle and no resched is pending, just proceed with idle
 *    time accounting. The next tick will be re-evaluated on the next idle
 *    loop iteration.
 *
 * 2) If the CPU is nohz_full:
 *
 *    2.1) If there is any tick dependency, restart the tick if stopped.
 *
 *    2.2) If there is no tick dependency, (re-)evaluate the next tick and
 *         stop/update it accordingly.
 */
void tick_nohz_irq_exit(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        /* In the idle loop: only resume the idle time accounting */
        if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) {
                tick_nohz_start_idle(ts);
                return;
        }

        /* Otherwise re-evaluate the tick if this is a full dynticks CPU */
        tick_nohz_full_update_tick(ts);
}

/**
 * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
 *
 * Return: %true if the tick handler has run, otherwise %false
 */
bool tick_nohz_idle_got_tick(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        bool got_tick = ts->got_idle_tick;

        /* Reading the indication consumes it */
        if (got_tick)
                ts->got_idle_tick = 0;

        return got_tick;
}

/**
 * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
 * or the tick, whichever expires first. Note that, if the tick has been
 * stopped, it returns the next hrtimer.
 *
 * Called from power state control code with interrupts disabled
 *
 * Return: the next expiration time
 */
ktime_t tick_nohz_get_next_hrtimer(void)
{
        return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
}

/**
 * tick_nohz_get_sleep_length - return the expected length of the current sleep
 * @delta_next: duration until the next event if the tick cannot be stopped
 *
 * Called from power state control code with interrupts disabled.
 *
 * The return value of this function and/or the value returned by it through the
 * @delta_next pointer can be negative which must be taken into account by its
 * callers.
 *
 * Return: the expected length of the current sleep
 */
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
        struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        int cpu = smp_processor_id();
        /*
         * The idle entry time is expected to be a sufficient approximation of
         * the current time at this point.
         */
        ktime_t now = ts->idle_entrytime;
        ktime_t next_event;

        WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));

        *delta_next = ktime_sub(dev->next_event, now);

        if (can_stop_idle_tick(cpu, ts)) {
                next_event = tick_nohz_next_event(ts, cpu);
                if (next_event) {
                        /*
                         * If the next highres timer to expire is earlier than
                         * 'next_event', the idle governor needs to know that.
                         */
                        next_event = min_t(u64, next_event,
                                           hrtimer_next_event_without(&ts->sched_timer));

                        return ktime_sub(next_event, now);
                }
        }

        /* The tick stays: the sleep ends at the device's next event */
        return *delta_next;
}

/**
 * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
 * for a particular CPU.
 * @cpu: target CPU number
 *
 * Called from the schedutil frequency scaling governor in scheduler context.
 *
 * Return: the current idle calls counter value for @cpu
 */
unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
{
        struct tick_sched *ts = tick_get_tick_sched(cpu);

        return ts->idle_calls;
}

/**
 * tick_nohz_get_idle_calls - return the current idle calls counter value
 *
 * Called from the schedutil frequency scaling governor in scheduler context.
 *
 * Return: the current idle calls counter value for the current CPU
 */
unsigned long tick_nohz_get_idle_calls(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        return ts->idle_calls;
}

/*
 * Record the idle exit time and, unless vtime accounting is enabled on this
 * CPU, account the jiffies that elapsed since ts->idle_jiffies as idle ticks.
 */
static void tick_nohz_account_idle_time(struct tick_sched *ts,
                                        ktime_t now)
{
        unsigned long missed;

        ts->idle_exittime = now;

        if (!vtime_accounting_enabled_this_cpu()) {
                /*
                 * The tick was stopped in idle. update_process_times() only
                 * accounts a single tick, so the time slept would be missed.
                 * Make sure it is accounted to idle.
                 */
                missed = jiffies - ts->idle_jiffies;

                /*
                 * We might be one off. Do not randomly account a huge number
                 * of ticks!
                 */
                if (missed != 0 && missed < LONG_MAX)
                        account_idle_ticks(missed);
        }
}

/*
 * If the tick of this CPU is currently stopped, restart it and account the
 * idle time using a single timestamp taken here.
 */
void tick_nohz_idle_restart_tick(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        ktime_t now;

        /* Nothing to do while the tick is still running. */
        if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                return;

        now = ktime_get();
        tick_nohz_restart_sched_tick(ts, now);
        tick_nohz_account_idle_time(ts, now);
}

/*
 * Update the tick on idle exit: a nohz_full CPU re-evaluates its tick, any
 * other CPU simply restarts it. The idle time is accounted in both cases.
 */
static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
{
        int cpu = smp_processor_id();

        if (!tick_nohz_full_cpu(cpu))
                tick_nohz_restart_sched_tick(ts, now);
        else
                __tick_nohz_full_update_tick(ts, now);

        tick_nohz_account_idle_time(ts, now);
}

/**
 * tick_nohz_idle_exit - Update the tick upon idle task exit
 *
 * When the idle task exits, the tick is updated as follows:
 *
 * 1) The CPU is not in nohz_full mode (most cases): restart the tick.
 *
 * 2) The CPU is in nohz_full mode (corner case):
 *   2.1) The tick can be kept stopped (no tick dependencies): re-evaluate
 *        the next tick and try to keep it stopped as long as possible.
 *   2.2) The tick has dependencies: restart the tick.
 *
 * Interrupts are disabled for the duration of the update and enabled again
 * before returning.
 */
void tick_nohz_idle_exit(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        bool was_idle_active, was_stopped;

        local_irq_disable();

        WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
        WARN_ON_ONCE(ts->timer_expires_base);

        /* Leave the idle state, then sample the flags that drive the update. */
        tick_sched_flag_clear(ts, TS_FLAG_INIDLE);
        was_idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
        was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);

        /* The clock is only read when at least one of the steps needs it. */
        if (was_idle_active || was_stopped) {
                ktime_t now = ktime_get();

                if (was_idle_active)
                        tick_nohz_stop_idle(ts, now);

                if (was_stopped)
                        tick_nohz_idle_update_tick(ts, now);
        }

        local_irq_enable();
}

/*
 * Low-resolution mode requires the tick handler to live directly at the
 * clockevent level. An hrtimer cannot be used for this, because in
 * low-resolution mode the hrtimer infrastructure itself relies on the tick
 * as its backend (see hrtimer_run_queues()).
 */
static void tick_nohz_lowres_handler(struct clock_event_device *dev)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        struct hrtimer *tick = &ts->sched_timer;

        dev->next_event = KTIME_MAX;

        /* Reprogram the device only when the handler asks for a restart. */
        if (unlikely(tick_nohz_handler(tick) != HRTIMER_RESTART))
                return;

        tick_program_event(hrtimer_get_expires(tick), 1);
}

/*
 * Flag @ts as running in NOHZ mode. Does nothing when NOHZ is not enabled.
 * timers_update_nohz() is invoked only by the caller that first sets bit 0
 * of tick_nohz_active.
 */
static inline void tick_nohz_activate(struct tick_sched *ts)
{
        if (tick_nohz_enabled) {
                tick_sched_flag_set(ts, TS_FLAG_NOHZ);

                /* One update is enough */
                if (!test_and_set_bit(0, &tick_nohz_active))
                        timers_update_nohz();
        }
}

/**
 * tick_nohz_switch_to_nohz - switch to NOHZ mode
 *
 * Does nothing when NOHZ is not enabled or when switching the tick device to
 * oneshot mode with the low-resolution handler fails (non-zero return).
 */
static void tick_nohz_switch_to_nohz(void)
{
        /*
         * On a successful switch, recycle the hrtimer in 'ts' so the highres
         * code can be shared.
         */
        if (tick_nohz_enabled && !tick_switch_to_oneshot(tick_nohz_lowres_handler))
                tick_setup_sched_timer(false);
}

/*
 * NOHZ part of interrupt entry: only does work while the tick is stopped
 * and/or idle time accounting is active on this CPU.
 */
static inline void tick_nohz_irq_enter(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        if (tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE)) {
                ktime_t now = ktime_get();

                if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))
                        tick_nohz_stop_idle(ts, now);

                /*
                 * If all CPUs are idle we may need to update a stale jiffies
                 * value. Note nohz_full is a special case: a timekeeper is
                 * guaranteed to stay alive but it might be busy looping with
                 * interrupts disabled in some rare case (typically stop
                 * machine). So we must make sure we have a last resort.
                 */
                if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                        tick_nohz_update_jiffies(now);
        }
}

#else

/* Empty stubs for the #else branch of the CONFIG_NO_HZ_COMMON conditional. */
static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
static inline void tick_nohz_activate(struct tick_sched *ts) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Called from irq_enter() to notify about the possible interruption of idle().
 *
 * Runs the oneshot broadcast check for this CPU first and then the NOHZ
 * interrupt entry handling.
 */
void tick_irq_enter(void)
{
        tick_check_oneshot_broadcast_this_cpu();
        tick_nohz_irq_enter();
}

/* Set from the "skew_tick" early parameter; non-zero enables tick skewing. */
static int sched_skew_tick;

/*
 * Early parameter handler for "skew_tick": parses an integer option from @str
 * into sched_skew_tick. The result of get_option() is not checked and the
 * handler always returns 0.
 */
static int __init skew_tick(char *str)
{
        get_option(&str, &sched_skew_tick);

        return 0;
}
early_param("skew_tick", skew_tick);

/**
 * tick_setup_sched_timer - setup the tick emulation timer
 * @hrtimer: whether to use the hrtimer or not
 *
 * The hrtimer is only used when @hrtimer is true and CONFIG_HIGH_RES_TIMERS
 * is enabled; otherwise the expiry is programmed through
 * tick_program_event().
 */
void tick_setup_sched_timer(bool hrtimer)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        struct hrtimer *timer = &ts->sched_timer;
        const bool highres = IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer;

        /* Emulate tick processing via per-CPU hrtimers: */
        hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);

        if (highres) {
                tick_sched_flag_set(ts, TS_FLAG_HIGHRES);
                timer->function = tick_nohz_handler;
        }

        /* Get the next period (per-CPU) */
        hrtimer_set_expires(timer, tick_init_jiffy_update());

        /*
         * Offset the tick to avert 'jiffies_lock' contention: half a tick
         * period is divided by the number of possible CPUs and scaled by
         * this CPU's number.
         */
        if (sched_skew_tick) {
                u64 skew = TICK_NSEC >> 1;

                do_div(skew, num_possible_cpus());
                skew *= smp_processor_id();
                hrtimer_add_expires_ns(timer, skew);
        }

        hrtimer_forward_now(timer, TICK_NSEC);

        if (highres)
                hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
        else
                tick_program_event(hrtimer_get_expires(timer), 1);

        tick_nohz_activate(ts);
}

/*
 * Shut down the tick and make sure the CPU won't try to retake the timekeeping
 * duty before disabling IRQs in idle for the last time.
 *
 * The per-CPU tick_sched of @cpu is wiped, except for the idle statistics
 * (idle_sleeptime, iowait_sleeptime, idle_calls, idle_sleeps), which are
 * carried over.
 */
void tick_sched_timer_dying(int cpu)
{
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
        struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
        struct clock_event_device *dev = td->evtdev;
        unsigned long saved_calls, saved_sleeps;
        ktime_t saved_idle, saved_iowait;

        /* This must happen before hrtimers are migrated! */
        tick_sched_timer_cancel(ts);

        /*
         * If the clockevents doesn't support CLOCK_EVT_STATE_ONESHOT_STOPPED,
         * make sure not to call low-res tick handler.
         */
        if (tick_sched_flag_test(ts, TS_FLAG_NOHZ))
                dev->event_handler = clockevents_handle_noop;

        /* Stash the statistics that have to survive the wipe. */
        saved_idle = ts->idle_sleeptime;
        saved_iowait = ts->iowait_sleeptime;
        saved_calls = ts->idle_calls;
        saved_sleeps = ts->idle_sleeps;

        memset(ts, 0, sizeof(*ts));

        ts->idle_sleeptime = saved_idle;
        ts->iowait_sleeptime = saved_iowait;
        ts->idle_calls = saved_calls;
        ts->idle_sleeps = saved_sleeps;
}

/*
 * Async notification about clocksource changes
 */
void tick_clock_notify(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}

/*
 * Async notification about clock event changes
 */
void tick_oneshot_notify(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        set_bit(0, &ts->check_clocks);
}

/*
 * Check if a change happened, which makes oneshot possible.
 *
 * Called cyclically from the hrtimer softirq (driven by the timer
 * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
 * mode, because high resolution timers are disabled (either compile
 * or runtime). Called with interrupts disabled.
 *
 * Returns 1 only when a change is pending, NOHZ mode is not active yet,
 * oneshot mode is usable and 'allow_nohz' is zero. Returns 0 otherwise,
 * including after attempting the switch to low-res NOHZ mode.
 */
int tick_check_oneshot_change(int allow_nohz)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        /*
         * Bail out when no notification is pending, when NOHZ mode is
         * already active, or when the preconditions for oneshot mode are
         * not met. The pending bit is consumed by the first test.
         */
        if (!test_and_clear_bit(0, &ts->check_clocks) ||
            tick_sched_flag_test(ts, TS_FLAG_NOHZ) ||
            !(timekeeping_valid_for_hres() && tick_is_oneshot_available()))
                return 0;

        if (allow_nohz) {
                tick_nohz_switch_to_nohz();
                return 0;
        }

        return 1;
}





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   10 













































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
/*
 * Performance events:
 *
 *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
 *    Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
 *    Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
 *
 * Data type definitions, declarations, prototypes.
 *
 *    Started by: Thomas Gleixner and Ingo Molnar
 *
 * For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_EVENT_H
#define _LINUX_PERF_EVENT_H

#include <uapi/linux/perf_event.h>
#include <uapi/linux/bpf_perf_event.h>

/*
 * Kernel-internal data types and definitions:
 */

#ifdef CONFIG_PERF_EVENTS
# include <asm/perf_event.h>
# include <asm/local64.h>
#endif

#define PERF_GUEST_ACTIVE        0x01
#define PERF_GUEST_USER        0x02

/*
 * Table of guest information callbacks. Every member is a pointer to a
 * function that takes no arguments.
 *
 * NOTE(review): ->state() presumably returns a mask built from the
 * PERF_GUEST_* bits defined above -- confirm against the implementers.
 */
struct perf_guest_info_callbacks {
        unsigned int                        (*state)(void);
        unsigned long                        (*get_ip)(void);
        unsigned int                        (*handle_intel_pt_intr)(void);
};

#ifdef CONFIG_HAVE_HW_BREAKPOINT
#include <linux/rhashtable-types.h>
#include <asm/hw_breakpoint.h>
#endif

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/irq_work.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
#include <linux/atomic.h>
#include <linux/sysfs.h>
#include <linux/perf_regs.h>
#include <linux/cgroup.h>
#include <linux/refcount.h>
#include <linux/security.h>
#include <linux/static_call.h>
#include <linux/lockdep.h>
#include <asm/local.h>

/*
 * A callchain: a 64-bit count followed by a flexible array of 64-bit
 * instruction pointers.
 *
 * NOTE(review): @nr presumably holds the number of valid entries in @ip[] --
 * confirm against the users of this structure.
 */
struct perf_callchain_entry {
        __u64                                nr;
        __u64                                ip[]; /* /proc/sys/kernel/perf_event_max_stack */
};

/*
 * Bookkeeping kept alongside a struct perf_callchain_entry (referenced
 * through @entry).
 *
 * NOTE(review): @max_stack / @nr presumably bound and count the stored
 * entries, and @contexts / @contexts_maxed the stored context markers --
 * confirm against the callchain code that fills this in.
 */
struct perf_callchain_entry_ctx {
        struct perf_callchain_entry *entry;
        u32                            max_stack;
        u32                            nr;
        short                            contexts;
        bool                            contexts_maxed;
};

/*
 * Copy callback type: takes a destination, a source, an offset and a length,
 * and returns an unsigned long.
 * NOTE(review): the meaning of the return value is not visible here --
 * confirm against the implementations.
 */
typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
                                     unsigned long off, unsigned long len);

/*
 * One fragment of raw sample data.
 *
 * @next and @pad share the same storage through the anonymous union.
 * perf_raw_frag_last() treats a @pad value below sizeof(u64) as the marker
 * for the last fragment, so the field either links to the next fragment or
 * holds such a small value.
 *
 * The structure is declared __packed.
 */
struct perf_raw_frag {
        union {
                struct perf_raw_frag        *next;
                unsigned long                pad;
        };
        perf_copy_f                        copy;
        void                                *data;
        u32                                size;
} __packed;

/*
 * A raw record: an embedded first fragment plus a size.
 * NOTE(review): @size presumably is the total size over all fragments, as
 * opposed to perf_raw_frag::size -- confirm against the users.
 */
struct perf_raw_record {
        struct perf_raw_frag                frag;
        u32                                size;
};

/*
 * Tell whether @frag is the last fragment of a raw record.
 *
 * The 'pad' member shares its storage with the 'next' pointer; a value
 * smaller than sizeof(u64) is taken to mean that no further fragment follows.
 */
static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag)
{
        const unsigned long link = frag->pad;

        return link < sizeof(u64);
}

/*
 * Branch stack layout:
 *  nr:     number of taken branches stored in entries[]
 *  hw_idx: the low-level index of the raw branch record for the most
 *          recent branch; -1ULL means invalid/unknown.
 *
 * Note that nr can vary from sample to sample. Branches (to, from) are
 * stored from most recent to least recent, i.e. entries[0] contains the
 * most recent branch.
 *
 * entries[] is an abstraction of the raw branch records, which may not be
 * stored in age order in hardware (e.g. Intel LBR). hw_idx exposes the
 * low-level index of the raw branch record for the most recent branch,
 * aka entries[0]. It lies between -1 (unknown) and the maximum depth,
 * which can be retrieved from /sys/devices/cpu/caps/branches.
 * For architectures whose raw branch records are already stored in age
 * order, hw_idx should be 0.
 */
struct perf_branch_stack {
        __u64                                nr;
        __u64                                hw_idx;
        struct perf_branch_entry        entries[];
};

struct task_struct;

/*
 * Extra PMU register associated with an event: the value to program, which
 * register it goes to, and the allocation state of that register.
 */
struct hw_perf_event_extra {
        u64                config;        /* register value */
        unsigned int        reg;        /* register address or index */
        int                alloc;        /* extra register already allocated */
        int                idx;        /* index in shared_regs->regs[] */
};

/**
 * hw_perf_event::flags values
 *
 * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific
 * usage.
 *
 * The static_assert() below verifies at build time that
 * PERF_EVENT_FLAG_USER_READ_CNT does not overlap the architecture mask.
 */
#define PERF_EVENT_FLAG_ARCH                        0x000fffff
#define PERF_EVENT_FLAG_USER_READ_CNT                0x80000000

static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0);

/**
 * struct hw_perf_event - performance event hardware details:
 *
 * The structure is empty unless CONFIG_PERF_EVENTS is enabled. The leading
 * anonymous union overlays the per-PMU-kind state (hardware, software,
 * tracepoint, amd_power, breakpoint, amd_iommu), so only one of those
 * variants is meaningful for a given event.
 */
struct hw_perf_event {
#ifdef CONFIG_PERF_EVENTS
        union {
                struct { /* hardware */
                        u64                config;
                        u64                last_tag;
                        unsigned long        config_base;
                        unsigned long        event_base;
                        int                event_base_rdpmc;
                        int                idx;
                        int                last_cpu;
                        int                flags;

                        struct hw_perf_event_extra extra_reg;
                        struct hw_perf_event_extra branch_reg;
                };
                struct { /* software */
                        struct hrtimer        hrtimer;
                };
                struct { /* tracepoint */
                        /* for tp_event->class */
                        struct list_head        tp_list;
                };
                struct { /* amd_power */
                        u64        pwr_acc;
                        u64        ptsc;
                };
#ifdef CONFIG_HAVE_HW_BREAKPOINT
                struct { /* breakpoint */
                        /*
                         * Crufty hack to avoid the chicken and egg
                         * problem hw_breakpoint has with context
                         * creation and event initialization.
                         */
                        struct arch_hw_breakpoint        info;
                        struct rhlist_head                bp_list;
                };
#endif
                struct { /* amd_iommu */
                        u8        iommu_bank;
                        u8        iommu_cntr;
                        u16        padding;
                        u64        conf;
                        u64        conf1;
                };
        };
        /*
         * If the event is a per task event, this will point to the task in
         * question. See the comment in perf_event_alloc().
         */
        struct task_struct                *target;

        /*
         * PMU would store hardware filter configuration
         * here.
         */
        void                                *addr_filters;

        /* Last sync'ed generation of filters */
        unsigned long                        addr_filters_gen;

/*
 * hw_perf_event::state flags; used to track the PERF_EF_* state.
 */
#define PERF_HES_STOPPED        0x01 /* the counter is stopped */
#define PERF_HES_UPTODATE        0x02 /* event->count up-to-date */
#define PERF_HES_ARCH                0x04

        int                                state;

        /*
         * The last observed hardware counter value, updated with a
         * local64_cmpxchg() such that pmu::read() can be called nested.
         */
        local64_t                        prev_count;

        /*
         * The period to start the next sample with.
         */
        u64                                sample_period;

        union {
                struct { /* Sampling */
                        /*
                         * The period we started this sample with.
                         */
                        u64                                last_period;

                        /*
                         * However much is left of the current period;
                         * note that this is a full 64bit value and
                         * allows for generation of periods longer
                         * than hardware might allow.
                         */
                        local64_t                        period_left;
                };
                struct { /* Topdown events counting for context switch */
                        u64                                saved_metric;
                        u64                                saved_slots;
                };
        };

        /*
         * State for throttling the event, see __perf_event_overflow() and
         * perf_adjust_freq_unthr_context().
         */
        u64                             interrupts_seq;
        u64                                interrupts;

        /*
         * State for freq target events, see __perf_event_overflow() and
         * perf_adjust_freq_unthr_context().
         */
        u64                                freq_time_stamp;
        u64                                freq_count_stamp;
#endif
};

struct perf_event;
struct perf_event_pmu_context;

/*
 * Common implementation detail of pmu::{start,commit,cancel}_txn
 */
#define PERF_PMU_TXN_ADD  0x1                /* txn to add/schedule event on PMU */
#define PERF_PMU_TXN_READ 0x2                /* txn to read event group from PMU */

/**
 * pmu::capabilities flags
 *
 * Each flag occupies its own bit, so several of them can be combined in a
 * single capabilities value.
 */
#define PERF_PMU_CAP_NO_INTERRUPT                0x0001
#define PERF_PMU_CAP_NO_NMI                        0x0002
#define PERF_PMU_CAP_AUX_NO_SG                        0x0004
#define PERF_PMU_CAP_EXTENDED_REGS                0x0008
#define PERF_PMU_CAP_EXCLUSIVE                        0x0010
#define PERF_PMU_CAP_ITRACE                        0x0020
#define PERF_PMU_CAP_NO_EXCLUDE                        0x0040
#define PERF_PMU_CAP_AUX_OUTPUT                        0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE                0x0100

struct perf_output_handle;

/*
 * Sentinel pointer value with all bits set.
 * NOTE(review): presumably stored in pmu::dev to mark "no device" -- confirm
 * against the users of this macro.
 */
#define PMU_NULL_DEV        ((void *)(~0UL))

/**
 * struct pmu - generic performance monitoring unit
 */
struct pmu {
        struct list_head                entry;

        struct module                        *module;
        struct device                        *dev;
        struct device                        *parent;
        const struct attribute_group        **attr_groups;
        const struct attribute_group        **attr_update;
        const char                        *name;
        int                                type;

        /*
         * various common per-pmu feature flags
         */
        int                                capabilities;

        int __percpu                        *pmu_disable_count;
        struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
        atomic_t                        exclusive_cnt; /* < 0: cpu; > 0: tsk */
        int                                task_ctx_nr;
        int                                hrtimer_interval_ms;

        /* number of address filters this PMU can do */
        unsigned int                        nr_addr_filters;

        /*
         * Fully disable/enable this PMU, can be used to protect from the PMI
         * as well as for lazy/batch writing of the MSRs.
         */
        void (*pmu_enable)                (struct pmu *pmu); /* optional */
        void (*pmu_disable)                (struct pmu *pmu); /* optional */

        /*
         * Try and initialize the event for this PMU.
         *
         * Returns:
         *  -ENOENT        -- @event is not for this PMU
         *
         *  -ENODEV        -- @event is for this PMU but PMU not present
         *  -EBUSY        -- @event is for this PMU but PMU temporarily unavailable
         *  -EINVAL        -- @event is for this PMU but @event is not valid
         *  -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported
         *  -EACCES        -- @event is for this PMU, @event is valid, but no privileges
         *
         *  0                -- @event is for this PMU and valid
         *
         * Other error return values are allowed.
         */
        int (*event_init)                (struct perf_event *event);

        /*
         * Notification that the event was mapped or unmapped.  Called
         * in the context of the mapping task.
         */
        void (*event_mapped)                (struct perf_event *event, struct mm_struct *mm); /* optional */
        void (*event_unmapped)                (struct perf_event *event, struct mm_struct *mm); /* optional */

        /*
         * Flags for ->add()/->del()/ ->start()/->stop(). There are
         * matching hw_perf_event::state flags.
         */
#define PERF_EF_START        0x01                /* start the counter when adding    */
#define PERF_EF_RELOAD        0x02                /* reload the counter when starting */
#define PERF_EF_UPDATE        0x04                /* update the counter when stopping */

        /*
         * Adds/Removes a counter to/from the PMU, can be done inside a
         * transaction, see the ->*_txn() methods.
         *
         * The add/del callbacks will reserve all hardware resources required
         * to service the event, this includes any counter constraint
         * scheduling etc.
         *
         * Called with IRQs disabled and the PMU disabled on the CPU the event
         * is on.
         *
         * ->add() called without PERF_EF_START should result in the same state
         *  as ->add() followed by ->stop().
         *
         * ->del() must always PERF_EF_UPDATE stop an event. If it calls
         *  ->stop() that must deal with already being stopped without
         *  PERF_EF_UPDATE.
         */
        int  (*add)                        (struct perf_event *event, int flags);
        void (*del)                        (struct perf_event *event, int flags);

        /*
         * Starts/Stops a counter present on the PMU.
         *
         * The PMI handler should stop the counter when perf_event_overflow()
         * returns !0. ->start() will be used to continue.
         *
         * Also used to change the sample period.
         *
         * Called with IRQs disabled and the PMU disabled on the CPU the event
         * is on -- will be called from NMI context when the PMU generates
         * NMIs.
         *
         * ->stop() with PERF_EF_UPDATE will read the counter and update
         *  period/count values like ->read() would.
         *
         * ->start() with PERF_EF_RELOAD will reprogram the counter
         *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
         */
        void (*start)                        (struct perf_event *event, int flags);
        void (*stop)                        (struct perf_event *event, int flags);

        /*
         * Updates the counter value of the event.
         *
         * For sampling capable PMUs this will also update the software period
         * hw_perf_event::period_left field.
         */
        void (*read)                        (struct perf_event *event);

        /*
         * Group events scheduling is treated as a transaction, add
         * group events as a whole and perform one schedulability test.
         * If the test fails, roll back the whole group
         *
         * Start the transaction, after this ->add() doesn't need to
         * do schedulability tests.
         *
         * Optional.
         */
        void (*start_txn)                (struct pmu *pmu, unsigned int txn_flags);
        /*
         * If ->start_txn() disabled the ->add() schedulability test
         * then ->commit_txn() is required to perform one. On success
         * the transaction is closed. On error the transaction is kept
         * open until ->cancel_txn() is called.
         *
         * Optional.
         */
        int  (*commit_txn)                (struct pmu *pmu);
        /*
         * Will cancel the transaction, assumes ->del() is called
         * for each successful ->add() during the transaction.
         *
         * Optional.
         */
        void (*cancel_txn)                (struct pmu *pmu);

        /*
         * Will return the value for perf_event_mmap_page::index for this event,
         * if no implementation is provided it will default to 0 (see
         * perf_event_idx_default).
         */
        int (*event_idx)                (struct perf_event *event); /*optional */

        /*
         * context-switches callback
         */
        void (*sched_task)                (struct perf_event_pmu_context *pmu_ctx,
                                        bool sched_in);

        /*
         * Kmem cache of PMU specific data
         */
        struct kmem_cache                *task_ctx_cache;

        /*
         * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data)
         * can be synchronized using this function. See Intel LBR callstack support
         * implementation and Perf core context switch handling callbacks for usage
         * examples.
         */
        void (*swap_task_ctx)                (struct perf_event_pmu_context *prev_epc,
                                         struct perf_event_pmu_context *next_epc);
                                        /* optional */

        /*
         * Set up pmu-private data structures for an AUX area
         */
        void *(*setup_aux)                (struct perf_event *event, void **pages,
                                         int nr_pages, bool overwrite);
                                        /* optional */

        /*
         * Free pmu-private AUX data structures
         */
        void (*free_aux)                (void *aux); /* optional */

        /*
         * Take a snapshot of the AUX buffer without touching the event
         * state, so that preempting ->start()/->stop() callbacks does
         * not interfere with their logic. Called in PMI context.
         *
         * Returns the size of AUX data copied to the output handle.
         *
         * Optional.
         */
        long (*snapshot_aux)                (struct perf_event *event,
                                         struct perf_output_handle *handle,
                                         unsigned long size);

        /*
         * Validate address range filters: make sure the HW supports the
         * requested configuration and number of filters; return 0 if the
         * supplied filters are valid, -errno otherwise.
         *
         * Runs in the context of the ioctl()ing process and is not serialized
         * with the rest of the PMU callbacks.
         */
        int (*addr_filters_validate)        (struct list_head *filters);
                                        /* optional */

        /*
         * Synchronize address range filter configuration:
         * translate hw-agnostic filters into hardware configuration in
         * event::hw::addr_filters.
         *
         * Runs as a part of filter sync sequence that is done in ->start()
         * callback by calling perf_event_addr_filters_sync().
         *
         * May (and should) traverse event::addr_filters::list, for which its
         * caller provides necessary serialization.
         */
        void (*addr_filters_sync)        (struct perf_event *event);
                                        /* optional */

        /*
         * Check if event can be used for aux_output purposes for
         * events of this PMU.
         *
         * Runs from perf_event_open(). Should return 0 for "no match"
         * or non-zero for "match".
         */
        int (*aux_output_match)                (struct perf_event *event);
                                        /* optional */

        /*
         * Skip programming this PMU on the given CPU. Typically needed for
         * big.LITTLE things.
         */
        bool (*filter)                        (struct pmu *pmu, int cpu); /* optional */

        /*
         * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
         */
        int (*check_period)                (struct perf_event *event, u64 value); /* optional */
};

/*
 * Action attached to an address range filter; this is the type of
 * struct perf_addr_filter::action. STOP is explicitly 0 and the remaining
 * enumerators follow sequentially.
 */
enum perf_addr_filter_action_t {
        PERF_ADDR_FILTER_ACTION_STOP = 0,
        PERF_ADDR_FILTER_ACTION_START,
        PERF_ADDR_FILTER_ACTION_FILTER,
};

/**
 * struct perf_addr_filter - address range filter definition
 * @entry:        event's filter list linkage
 * @path:        object file's path for file-based filters
 * @offset:        filter range offset
 * @size:        filter range size (size==0 means single address trigger)
 * @action:        filter/start/stop, one of enum perf_addr_filter_action_t
 *
 * This is a hardware-agnostic filter configuration as specified by the user.
 */
struct perf_addr_filter {
        struct list_head        entry;
        struct path                path;
        unsigned long                offset;
        unsigned long                size;
        enum perf_addr_filter_action_t        action;
};

/**
 * struct perf_addr_filters_head - container for address range filters
 * @list:        list of filters for this event (struct perf_addr_filter
 *                documents its @entry as the filter list linkage)
 * @lock:        spinlock that serializes accesses to the @list and event's
 *                (and its children's) filter generations.
 * @nr_file_filters:        number of file-based filters
 *
 * A child event will use parent's @list (and therefore @lock), so they are
 * bundled together; see perf_event_addr_filters().
 */
struct perf_addr_filters_head {
        struct list_head        list;
        raw_spinlock_t                lock;
        unsigned int                nr_file_filters;
};

/*
 * An address range given by its @start and @size. struct perf_event keeps
 * an array of these in ->addr_filter_ranges for file-based filters.
 */
struct perf_addr_filter_range {
        unsigned long                start;
        unsigned long                size;
};

/**
 * enum perf_event_state - the states of an event:
 *
 * The numeric values are ordered: ACTIVE is the only positive state,
 * INACTIVE is zero, and OFF, ERROR, EXIT and DEAD are increasingly negative.
 */
enum perf_event_state {
        PERF_EVENT_STATE_DEAD                = -4,
        PERF_EVENT_STATE_EXIT                = -3,
        PERF_EVENT_STATE_ERROR                = -2,
        PERF_EVENT_STATE_OFF                = -1,
        PERF_EVENT_STATE_INACTIVE        =  0,
        PERF_EVENT_STATE_ACTIVE                =  1,
};

struct file;
struct perf_sample_data;

/*
 * Signature of an overflow handler: this is the type of
 * perf_event::overflow_handler and of the callback argument taken by
 * perf_event_create_kernel_counter().
 */
typedef void (*perf_overflow_handler_t)(struct perf_event *,
                                        struct perf_sample_data *,
                                        struct pt_regs *regs);

/*
 * Event capabilities. For event_caps and groups caps.
 *
 * Each capability is a separate bit, so several can be set at once.
 *
 * PERF_EV_CAP_SOFTWARE: Is a software event.
 * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read
 * from any CPU in the package where it is active.
 * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
 * cannot be a group leader. If an event with this flag is detached from the
 * group it is scheduled out and moved into an unrecoverable ERROR state.
 */
#define PERF_EV_CAP_SOFTWARE                BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG        BIT(1)
#define PERF_EV_CAP_SIBLING                BIT(2)

/* The hash list has 2^SWEVENT_HLIST_BITS (i.e. 256) bucket heads. */
#define SWEVENT_HLIST_BITS                8
#define SWEVENT_HLIST_SIZE                (1 << SWEVENT_HLIST_BITS)

/*
 * Array of SWEVENT_HLIST_SIZE hlist heads plus an rcu_head.
 * NOTE(review): the rcu_head presumably lets the table be freed via RCU;
 * confirm at the users.
 */
struct swevent_hlist {
        struct hlist_head                heads[SWEVENT_HLIST_SIZE];
        struct rcu_head                        rcu_head;
};

/*
 * PERF_ATTACH_* are distinct single-bit values, so they can be OR-ed together.
 * NOTE(review): presumably these are the bits kept in
 * perf_event::attach_state; confirm against the users.
 */
#define PERF_ATTACH_CONTEXT        0x01
#define PERF_ATTACH_GROUP        0x02
#define PERF_ATTACH_TASK        0x04
#define PERF_ATTACH_TASK_DATA        0x08
#define PERF_ATTACH_ITRACE        0x10
#define PERF_ATTACH_SCHED_CB        0x20
#define PERF_ATTACH_CHILD        0x40

struct bpf_prog;
struct perf_cgroup;
struct perf_buffer;

/*
 * A list head bundled with a raw spinlock.
 * NOTE(review): presumably @lock serializes access to @list; confirm at the
 * users.
 */
struct pmu_event_list {
        raw_spinlock_t                lock;
        struct list_head        list;
};

/*
 * event->sibling_list is modified while holding both ctx->lock and ctx->mutex
 * as such iteration must hold either lock. However, since ctx->lock is an IRQ
 * safe lock, and is only held by the CPU doing the modification, having IRQs
 * disabled is sufficient since it will hold-off the IPIs.
 *
 * With CONFIG_PROVE_LOCKING the assertion warns (once) when lockdep is
 * enabled, hardirqs are enabled and ctx->mutex is not held. Without
 * CONFIG_PROVE_LOCKING it expands to nothing.
 */
#ifdef CONFIG_PROVE_LOCKING
#define lockdep_assert_event_ctx(event)                                \
        WARN_ON_ONCE(__lockdep_enabled &&                        \
                     (this_cpu_read(hardirqs_enabled) &&        \
                      lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD))
#else
#define lockdep_assert_event_ctx(event)
#endif

/*
 * Iterate @sibling over the entries of @event's ->sibling_list. The loop
 * body only runs when @event is its own group leader; otherwise nothing is
 * iterated.
 *
 * Note that this expands to more than one statement (the lockdep assertion
 * followed by the if/loop), so it must not be used as the unbraced body of
 * another if/else/loop.
 */
#define for_each_sibling_event(sibling, event)                        \
        lockdep_assert_event_ctx(event);                        \
        if ((event)->group_leader == (event))                        \
                list_for_each_entry((sibling), &(event)->sibling_list, sibling_list)

/**
 * struct perf_event - performance event kernel representation:
 *
 * All members are compiled in only when CONFIG_PERF_EVENTS is set; without
 * it the structure has no members.
 */
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
        /*
         * entry onto perf_event_context::event_list;
         *   modifications require ctx->lock
         *   RCU safe iterations.
         */
        struct list_head                event_entry;

        /*
         * Locked for modification by both ctx->mutex and ctx->lock; holding
         * either suffices for read.
         */
        struct list_head                sibling_list;
        struct list_head                active_list;
        /*
         * Node on the pinned or flexible tree located at the event context;
         */
        struct rb_node                        group_node;
        u64                                group_index;
        /*
         * We need storage to track the entries in perf_pmu_migrate_context; we
         * cannot use the event_entry because of RCU and we want to keep the
         * group intact which avoids us using the other two entries.
         */
        struct list_head                migrate_entry;

        struct hlist_node                hlist_entry;
        struct list_head                active_entry;
        int                                nr_siblings;

        /* Not serialized. Only written during event initialization. */
        int                                event_caps;
        /* The cumulative AND of all event_caps for events in this group. */
        int                                group_caps;

        unsigned int                        group_generation;
        struct perf_event                *group_leader;
        /*
         * event->pmu will always point to pmu in which this event belongs.
         * Whereas event->pmu_ctx->pmu may point to other pmu when group of
         * different pmu events is created.
         */
        struct pmu                        *pmu;
        void                                *pmu_private;

        enum perf_event_state                state;
        unsigned int                        attach_state;
        local64_t                        count;
        atomic64_t                        child_count;

        /*
         * These are the total time in nanoseconds that the event
         * has been enabled (i.e. eligible to run, and the task has
         * been scheduled in, if this is a per-task event)
         * and running (scheduled onto the CPU), respectively.
         */
        u64                                total_time_enabled;
        u64                                total_time_running;
        u64                                tstamp;

        struct perf_event_attr                attr;
        u16                                header_size;
        u16                                id_header_size;
        u16                                read_size;
        struct hw_perf_event                hw;

        struct perf_event_context        *ctx;
        /*
         * event->pmu_ctx points to perf_event_pmu_context in which the event
         * is added. This pmu_ctx can be of other pmu for sw event when that
         * sw event is part of a group which also contains non-sw events.
         */
        struct perf_event_pmu_context        *pmu_ctx;
        atomic_long_t                        refcount;

        /*
         * These accumulate total time (in nanoseconds) that children
         * events have been enabled and running, respectively.
         */
        atomic64_t                        child_total_time_enabled;
        atomic64_t                        child_total_time_running;

        /*
         * Protect attach/detach and child_list:
         */
        struct mutex                        child_mutex;
        struct list_head                child_list;
        struct perf_event                *parent;

        int                                oncpu;
        int                                cpu;

        struct list_head                owner_entry;
        struct task_struct                *owner;

        /* mmap bits */
        struct mutex                        mmap_mutex;
        atomic_t                        mmap_count;

        struct perf_buffer                *rb;
        struct list_head                rb_entry;
        unsigned long                        rcu_batches;
        int                                rcu_pending;

        /* poll related */
        wait_queue_head_t                waitq;
        struct fasync_struct                *fasync;

        /* delayed work for NMIs and such */
        unsigned int                        pending_wakeup;
        unsigned int                        pending_kill;
        unsigned int                        pending_disable;
        unsigned int                        pending_sigtrap;
        unsigned long                        pending_addr;        /* SIGTRAP */
        struct irq_work                        pending_irq;
        struct callback_head                pending_task;
        unsigned int                        pending_work;

        atomic_t                        event_limit;

        /* address range filters */
        struct perf_addr_filters_head        addr_filters;
        /* vma address array for file-based filters */
        struct perf_addr_filter_range        *addr_filter_ranges;
        unsigned long                        addr_filters_gen;

        /* for aux_output events */
        struct perf_event                *aux_event;

        void (*destroy)(struct perf_event *);
        struct rcu_head                        rcu_head;

        struct pid_namespace                *ns;
        u64                                id;

        atomic64_t                        lost_samples;

        u64                                (*clock)(void);
        perf_overflow_handler_t                overflow_handler;
        void                                *overflow_handler_context;
        struct bpf_prog                        *prog;
        u64                                bpf_cookie;

#ifdef CONFIG_EVENT_TRACING
        struct trace_event_call                *tp_event;
        struct event_filter                *filter;
#ifdef CONFIG_FUNCTION_TRACER
        struct ftrace_ops               ftrace_ops;
#endif
#endif

#ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup                *cgrp; /* cgroup event is attached to */
#endif

#ifdef CONFIG_SECURITY
        void *security;
#endif
        struct list_head                sb_list;

        /*
         * Certain events gets forwarded to another pmu internally by over-
         * writing kernel copy of event->attr.type without user being aware
         * of it. event->orig_type contains original 'type' requested by
         * user.
         */
        __u32                                orig_type;
#endif /* CONFIG_PERF_EVENTS */
};

/*
 *           ,-----------------------[1:n]------------------------.
 *           V                                                    V
 * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event
 *                                        |                       |
 *                                        `--[n:1]-> pmu <-[1:n]--'
 *
 *
 * struct perf_event_pmu_context  lifetime is refcount based and RCU freed
 * (similar to perf_event_context). Locking is as if it were a member of
 * perf_event_context; specifically:
 *
 *   modification, both: ctx->mutex && ctx->lock
 *   reading, either:    ctx->mutex || ctx->lock
 *
 * There is one exception to this; namely put_pmu_ctx() isn't always called
 * with ctx->mutex held; this means that as long as we can guarantee the epc
 * has events the above rules hold.
 *
 * Specifically, sys_perf_event_open()'s group_leader case depends on
 * ctx->mutex pinning the configuration. Since we hold a reference on
 * group_leader (through the filedesc) it can't go away, therefore its
 * associated pmu_ctx must exist and cannot change due to ctx->mutex.
 *
 * perf_event holds a refcount on perf_event_context
 * perf_event holds a refcount on perf_event_pmu_context
 */
struct perf_event_pmu_context {
        /* the PMU and the event context this epc ties together */
        struct pmu                        *pmu;
        struct perf_event_context       *ctx;

        /* NOTE(review): presumably the linkage on ctx->pmu_ctx_list -- confirm */
        struct list_head                pmu_ctx_entry;

        /* perf_pmu_ctx_is_active() reports true when either list is non-empty */
        struct list_head                pinned_active;
        struct list_head                flexible_active;

        /* Used to avoid freeing per-cpu perf_event_pmu_context */
        unsigned int                        embedded : 1;

        unsigned int                        nr_events;
        unsigned int                        nr_cgroups;
        unsigned int                        nr_freq;

        atomic_t                        refcount; /* event <-> epc */
        struct rcu_head                        rcu_head;

        void                                *task_ctx_data; /* pmu specific data */
        /*
         * Set when one or more (plausibly active) event can't be scheduled
         * due to pmu overcommit or pmu constraints, except tolerant to
         * events not necessary to be active due to scheduling constraints,
         * such as cgroups.
         */
        int                                rotate_necessary;
};

/*
 * Report whether @epc has anything on its active lists: true when either
 * ->flexible_active or ->pinned_active is non-empty.
 */
static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc)
{
        if (!list_empty(&epc->flexible_active))
                return true;

        return !list_empty(&epc->pinned_active);
}

/*
 * An rb-tree of event groups. struct perf_event_context keeps one instance
 * for pinned and one for flexible groups; events link in through
 * perf_event::group_node.
 * NOTE(review): @index presumably feeds perf_event::group_index -- confirm.
 */
struct perf_event_groups {
        struct rb_root        tree;
        u64                index;
};


/**
 * struct perf_event_context - event context structure
 *
 * Used as a container for task events and CPU events as well:
 */
struct perf_event_context {
        /*
         * Protect the states of the events in the list,
         * nr_active, and the list:
         */
        raw_spinlock_t                        lock;
        /*
         * Protect the list of events.  Locking either mutex or lock
         * is sufficient to ensure the list doesn't change; to change
         * the list you need to lock both the mutex and the spinlock.
         */
        struct mutex                        mutex;

        /* NOTE(review): presumably links perf_event_pmu_context::pmu_ctx_entry -- confirm */
        struct list_head                pmu_ctx_list;
        /* the pinned and flexible group trees (see perf_event::group_node) */
        struct perf_event_groups        pinned_groups;
        struct perf_event_groups        flexible_groups;
        /* events link in through perf_event::event_entry */
        struct list_head                event_list;

        int                                nr_events;
        int                                nr_user;
        int                                is_active;

        int                                nr_task_data;
        int                                nr_stat;
        int                                nr_freq;
        int                                rotate_disable;

        refcount_t                        refcount; /* event <-> ctx */
        struct task_struct                *task;

        /*
         * Context clock, runs when context enabled.
         */
        u64                                time;
        u64                                timestamp;
        u64                                timeoffset;

        /*
         * These fields let us detect when two contexts have both
         * been cloned (inherited) from a common ancestor.
         */
        struct perf_event_context        *parent_ctx;
        u64                                parent_gen;
        u64                                generation;
        int                                pin_count;
#ifdef CONFIG_CGROUP_PERF
        int                                nr_cgroups;         /* cgroup evts */
#endif
        struct rcu_head                        rcu_head;

        /*
         * Sum (event->pending_sigtrap + event->pending_work)
         *
         * The SIGTRAP is targeted at ctx->task, as such it won't do changing
         * that until the signal is delivered.
         */
        local_t                                nr_pending;
};

/*
 * Number of contexts where an event can trigger:
 *        task, softirq, hardirq, nmi.
 */
#define PERF_NR_CONTEXTS        4

/*
 * PMU context kept per CPU: struct pmu refers to it through its __percpu
 * ->cpu_pmu_context pointer.
 */
struct perf_cpu_pmu_context {
        /* embedded by value, unlike @task_epc which is only a pointer */
        struct perf_event_pmu_context        epc;
        struct perf_event_pmu_context        *task_epc;

        struct list_head                sched_cb_entry;
        int                                sched_cb_usage;

        int                                active_oncpu;
        int                                exclusive;

        /* NOTE(review): presumably @hrtimer_lock guards the hrtimer fields below -- confirm */
        raw_spinlock_t                        hrtimer_lock;
        struct hrtimer                        hrtimer;
        ktime_t                                hrtimer_interval;
        unsigned int                        hrtimer_active;
};

/**
 * struct perf_cpu_context - per cpu event context structure
 */
struct perf_cpu_context {
        /* embedded by value, unlike @task_ctx which is only a pointer */
        struct perf_event_context        ctx;
        struct perf_event_context        *task_ctx;
        int                                online;

#ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup                *cgrp;
#endif

        /*
         * Per-CPU storage for iterators used in visit_groups_merge. The default
         * storage is of size 2 to hold the CPU and any CPU event iterators.
         */
        int                                heap_size;
        struct perf_event                **heap;
        struct perf_event                *heap_default[2];
};

/*
 * Handle taken by the perf_aux_output_*() helpers, perf_get_aux() and
 * pmu::snapshot_aux(). @addr and @head share storage through the anonymous
 * union, so only one of them is meaningful at a time.
 */
struct perf_output_handle {
        struct perf_event                *event;
        struct perf_buffer                *rb;
        unsigned long                        wakeup;
        unsigned long                        size;
        u64                                aux_flags;
        union {
                void                        *addr;
                unsigned long                head;
        };
        int                                page;
};

/*
 * Bundles pointers to a register set, the sample data and the event.
 * NOTE(review): presumably the kernel-side context handed to BPF programs
 * attached to perf events -- confirm at the users.
 */
struct bpf_perf_event_data_kern {
        bpf_user_pt_regs_t *regs;
        struct perf_sample_data *data;
        struct perf_event *event;
};

#ifdef CONFIG_CGROUP_PERF

/*
 * perf_cgroup_info keeps track of time_enabled for a cgroup.
 * This is a per-cpu dynamically allocated data structure; struct perf_cgroup
 * refers to it through its __percpu ->info pointer.
 */
struct perf_cgroup_info {
        u64                                time;
        u64                                timestamp;
        u64                                timeoffset;
        int                                active;
};

/*
 * perf's per-cgroup state. @css is embedded so that perf_cgroup_from_task()
 * can get back to the perf_cgroup with container_of(); @info points to the
 * per-cpu struct perf_cgroup_info.
 */
struct perf_cgroup {
        struct cgroup_subsys_state        css;
        struct perf_cgroup_info        __percpu *info;
};

/*
 * Look up @task's css for perf_event_cgrp_id and return the perf_cgroup
 * that embeds it.
 *
 * The caller must ensure the cgroup is pinned (css_get) before calling
 * this function. In other words, it cannot be called if there is no
 * cgroup event for the current CPU context.
 *
 * The condition handed to task_css_check() is "ctx->lock is held" when
 * @ctx is non-NULL, and plain true when @ctx is NULL.
 */
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
{
        struct cgroup_subsys_state *state;

        state = task_css_check(task, perf_event_cgrp_id,
                               ctx ? lockdep_is_held(&ctx->lock) : true);

        return container_of(state, struct perf_cgroup, css);
}
#endif /* CONFIG_CGROUP_PERF */

#ifdef CONFIG_PERF_EVENTS

extern struct perf_event_context *perf_cpu_task_ctx(void);

extern void *perf_aux_output_begin(struct perf_output_handle *handle,
                                   struct perf_event *event);
extern void perf_aux_output_end(struct perf_output_handle *handle,
                                unsigned long size);
extern int perf_aux_output_skip(struct perf_output_handle *handle,
                                unsigned long size);
extern void *perf_get_aux(struct perf_output_handle *handle);
extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
extern void perf_event_itrace_started(struct perf_event *event);

extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
extern void perf_pmu_unregister(struct pmu *pmu);

extern void __perf_event_task_sched_in(struct task_struct *prev,
                                       struct task_struct *task);
extern void __perf_event_task_sched_out(struct task_struct *prev,
                                        struct task_struct *next);
extern int perf_event_init_task(struct task_struct *child, u64 clone_flags);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
extern struct file *perf_event_get(unsigned int fd);
extern const struct perf_event *perf_get_event(struct file *file);
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
extern void perf_sched_cb_dec(struct pmu *pmu);
extern void perf_sched_cb_inc(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);

extern void perf_pmu_resched(struct pmu *pmu);

extern int perf_event_refresh(struct perf_event *event, int refresh);
extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);
extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
                                int cpu,
                                struct task_struct *task,
                                perf_overflow_handler_t callback,
                                void *context);
extern void perf_pmu_migrate_context(struct pmu *pmu,
                                int src_cpu, int dst_cpu);
int perf_event_read_local(struct perf_event *event, u64 *value,
                          u64 *enabled, u64 *running);
extern u64 perf_event_read_value(struct perf_event *event,
                                 u64 *enabled, u64 *running);

extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);

/* True when PERF_SAMPLE_BRANCH_NO_FLAGS is set in @event's attr.branch_sample_type. */
static inline bool branch_sample_no_flags(const struct perf_event *event)
{
        return !!(event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS);
}

/* True when PERF_SAMPLE_BRANCH_NO_CYCLES is set in @event's attr.branch_sample_type. */
static inline bool branch_sample_no_cycles(const struct perf_event *event)
{
        return !!(event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES);
}

/* True when PERF_SAMPLE_BRANCH_TYPE_SAVE is set in @event's attr.branch_sample_type. */
static inline bool branch_sample_type(const struct perf_event *event)
{
        return !!(event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE);
}

/* True when PERF_SAMPLE_BRANCH_HW_INDEX is set in @event's attr.branch_sample_type. */
static inline bool branch_sample_hw_index(const struct perf_event *event)
{
        return !!(event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX);
}

/* True when PERF_SAMPLE_BRANCH_PRIV_SAVE is set in @event's attr.branch_sample_type. */
static inline bool branch_sample_priv(const struct perf_event *event)
{
        return !!(event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE);
}

/* True when PERF_SAMPLE_BRANCH_COUNTERS is set in @event's attr.branch_sample_type. */
static inline bool branch_sample_counters(const struct perf_event *event)
{
        return !!(event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS);
}

/* True when PERF_SAMPLE_BRANCH_CALL_STACK is set in @event's attr.branch_sample_type. */
static inline bool branch_sample_call_stack(const struct perf_event *event)
{
        return !!(event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK);
}

/*
 * Data collected for one sample. @sample_flags records which PERF_SAMPLE_*
 * pieces have been filled in so far: perf_sample_data_init() and the
 * perf_sample_save_*() helpers OR their bit in as they store a field.
 * @dyn_size accumulates the size of the variable-length pieces those
 * helpers add.
 */
struct perf_sample_data {
        /*
         * Fields set by perf_sample_data_init() unconditionally,
         * group so as to minimize the cachelines touched.
         */
        u64                                sample_flags;
        u64                                period;
        u64                                dyn_size;

        /*
         * Fields commonly set by __perf_event_header__init_id(),
         * group so as to minimize the cachelines touched.
         */
        u64                                type;
        struct {
                u32        pid;
                u32        tid;
        }                                tid_entry;
        u64                                time;
        u64                                id;
        struct {
                u32        cpu;
                u32        reserved;
        }                                cpu_entry;

        /*
         * The other fields, optionally {set,used} by
         * perf_{prepare,output}_sample().
         */
        u64                                ip;
        struct perf_callchain_entry        *callchain;
        struct perf_raw_record                *raw;
        struct perf_branch_stack        *br_stack;
        u64                                *br_stack_cntr;
        union perf_sample_weight        weight;
        union  perf_mem_data_src        data_src;
        u64                                txn;

        struct perf_regs                regs_user;
        struct perf_regs                regs_intr;
        u64                                stack_user_size;

        u64                                stream_id;
        u64                                cgroup;
        u64                                addr;
        u64                                phys_addr;
        u64                                data_page_size;
        u64                                code_page_size;
        u64                                aux_size;
} ____cacheline_aligned;

/*
 * default value for data source: the NA encoding of each of the OP, LVL,
 * SNOOP, LOCK, TLB and LVLNUM fields, OR-ed together.
 */
#define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
                    PERF_MEM_S(LVL, NA)   |\
                    PERF_MEM_S(SNOOP, NA) |\
                    PERF_MEM_S(LOCK, NA)  |\
                    PERF_MEM_S(TLB, NA)   |\
                    PERF_MEM_S(LVLNUM, NA))

/*
 * Start a new sample: store @period, reset dyn_size to 0 and set
 * sample_flags to PERF_SAMPLE_PERIOD.  A non-zero @addr is stored too and
 * flagged with PERF_SAMPLE_ADDR; a zero @addr leaves data->addr untouched.
 */
static inline void perf_sample_data_init(struct perf_sample_data *data,
                                         u64 addr, u64 period)
{
        u64 flags = PERF_SAMPLE_PERIOD;

        if (addr) {
                data->addr = addr;
                flags |= PERF_SAMPLE_ADDR;
        }

        /* remaining struct members initialized in perf_prepare_sample() */
        data->sample_flags = flags;
        data->period = period;
        data->dyn_size = 0;
}

static inline void perf_sample_save_callchain(struct perf_sample_data *data,
                                              struct perf_event *event,
                                              struct pt_regs *regs)
{
        int size = 1;

        data->callchain = perf_callchain(event, regs);
        size += data->callchain->nr;

        data->dyn_size += size * sizeof(u64);
        data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
}

/*
 * Attach the raw record @raw to @data.  The sizes of all fragments are
 * summed, the total plus sizeof(u32) is rounded up to a multiple of
 * sizeof(u64), and the resulting padding is recorded on the last fragment.
 * dyn_size grows by the rounded size and PERF_SAMPLE_RAW is set.
 */
static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
                                             struct perf_raw_record *raw)
{
        struct perf_raw_frag *last = &raw->frag;
        u32 payload = 0;
        int total;

        /* Walk the fragment chain; on exit @last is the final fragment. */
        for (;;) {
                payload += last->size;
                if (perf_raw_frag_last(last))
                        break;
                last = last->next;
        }

        total = round_up(payload + sizeof(u32), sizeof(u64));
        raw->size = total - sizeof(u32);
        last->pad = raw->size - payload;

        data->raw = raw;
        data->dyn_size += total;
        data->sample_flags |= PERF_SAMPLE_RAW;
}

/*
 * Attach the branch stack @brs (and the optional counters array @brs_cntr)
 * to @data, add the number of bytes they occupy to dyn_size and set
 * PERF_SAMPLE_BRANCH_STACK in sample_flags.
 */
static inline void perf_sample_save_brstack(struct perf_sample_data *data,
                                            struct perf_event *event,
                                            struct perf_branch_stack *brs,
                                            u64 *brs_cntr)
{
        int bytes = sizeof(u64); /* nr */

        data->br_stack = brs;
        data->br_stack_cntr = brs_cntr;

        /* One more u64 when the event asks for the hardware index. */
        if (branch_sample_hw_index(event))
                bytes += sizeof(u64);
        bytes += brs->nr * sizeof(struct perf_branch_entry);

        /*
         * The extension space for counters is appended after the
         * struct perf_branch_stack. It is used to store the occurrences
         * of events of each branch.
         */
        if (brs_cntr)
                bytes += brs->nr * sizeof(u64);

        data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
        data->dyn_size += bytes;
}

/*
 * Size of a sample record as a u32: sizeof(struct perf_event_header) plus
 * the event's header_size and id_header_size plus the sample's dyn_size.
 */
static inline u32 perf_sample_data_size(struct perf_sample_data *data,
                                        struct perf_event *event)
{
        u32 total = event->header_size + event->id_header_size;

        total += sizeof(struct perf_event_header);
        total += data->dyn_size;

        return total;
}

/*
 * Reset the bitfields of a perf_branch_entry: spec becomes PERF_BR_SPEC_NA,
 * the other fields written here become 0.  The from and to fields are left
 * alone because the caller systematically overwrites them.
 */
static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br)
{
        br->reserved = 0;
        br->spec = PERF_BR_SPEC_NA;
        br->type = 0;
        br->cycles = 0;
        br->abort = 0;
        br->in_tx = 0;
        br->predicted = 0;
        br->mispred = 0;
}

extern void perf_output_sample(struct perf_output_handle *handle,
                               struct perf_event_header *header,
                               struct perf_sample_data *data,
                               struct perf_event *event);
extern void perf_prepare_sample(struct perf_sample_data *data,
                                struct perf_event *event,
                                struct pt_regs *regs);
extern void perf_prepare_header(struct perf_event_header *header,
                                struct perf_sample_data *data,
                                struct perf_event *event,
                                struct pt_regs *regs);

extern int perf_event_overflow(struct perf_event *event,
                                 struct perf_sample_data *data,
                                 struct pt_regs *regs);

extern void perf_event_output_forward(struct perf_event *event,
                                     struct perf_sample_data *data,
                                     struct pt_regs *regs);
extern void perf_event_output_backward(struct perf_event *event,
                                       struct perf_sample_data *data,
                                       struct pt_regs *regs);
extern int perf_event_output(struct perf_event *event,
                             struct perf_sample_data *data,
                             struct pt_regs *regs);

/*
 * True when @event's overflow handler is perf_event_output_forward() or
 * perf_event_output_backward(); the forward handler is the expected case.
 */
static inline bool
is_default_overflow_handler(struct perf_event *event)
{
        perf_overflow_handler_t handler = event->overflow_handler;

        return likely(handler == perf_event_output_forward) ||
               unlikely(handler == perf_event_output_backward);
}

extern void
perf_event_header__init_id(struct perf_event_header *header,
                           struct perf_sample_data *data,
                           struct perf_event *event);
extern void
perf_event__output_id_sample(struct perf_event *event,
                             struct perf_output_handle *handle,
                             struct perf_sample_data *sample);

extern void
perf_log_lost_samples(struct perf_event *event, u64 lost);

static inline bool event_has_any_exclude_flag(struct perf_event *event)
{
        struct perf_event_attr *attr = &event->attr;

        return attr->exclude_idle || attr->exclude_user ||
               attr->exclude_kernel || attr->exclude_hv ||
               attr->exclude_guest || attr->exclude_host;
}

/* True when @event's attr.sample_period is non-zero. */
static inline bool is_sampling_event(struct perf_event *event)
{
        return !!event->attr.sample_period;
}

/*
 * Non-zero (the PERF_EV_CAP_SOFTWARE bit of event_caps) for a software
 * event, 0 for a hardware event.
 */
static inline int is_software_event(struct perf_event *event)
{
        return PERF_EV_CAP_SOFTWARE & event->event_caps;
}

/*
 * Return 1 for event in sw context, 0 for event in hw context
 */
static inline int in_software_context(struct perf_event *event)
{
        return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context;
}

/* Non-zero when @pmu has PERF_PMU_CAP_EXCLUSIVE set in its capabilities. */
static inline int is_exclusive_pmu(struct pmu *pmu)
{
        return PERF_PMU_CAP_EXCLUSIVE & pmu->capabilities;
}

extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);

/*
 * Fallback used only when perf_arch_fetch_caller_regs is not already
 * defined as a macro: an empty function that leaves @regs untouched.
 */
#ifndef perf_arch_fetch_caller_regs
static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
#endif

/*
 * When generating a perf sample in-line, instead of from an interrupt /
 * exception, we lack a pt_regs. This is typically used from software events
 * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints.
 *
 * We typically don't need a full set, but (for x86) do require:
 * - ip for PERF_SAMPLE_IP
 * - cs for user_mode() tests
 * - sp for PERF_SAMPLE_CALLCHAIN
 * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs())
 *
 * NOTE: assumes @regs is otherwise already 0 filled; this is important for
 * things like PERF_SAMPLE_REGS_INTR.
 *
 * This helper only calls perf_arch_fetch_caller_regs() with CALLER_ADDR0 as
 * the ip argument.
 */
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
        perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
}

/*
 * Forward a software event to __perf_sw_event(), but only when
 * static_key_false() reports true for the @event_id slot of
 * perf_swevent_enabled[]; otherwise do nothing.
 */
static __always_inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        if (!static_key_false(&perf_swevent_enabled[event_id]))
                return;

        __perf_sw_event(event_id, nr, regs, addr);
}

DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);

/*
 * 'Special' version for the scheduler, it hard assumes no recursion,
 * which is guaranteed by us not actually scheduling inside other swevents
 * because those disable preemption.
 *
 * Uses element 0 of this CPU's __perf_regs[] as the pt_regs, fills it with
 * perf_fetch_caller_regs() and hands it to ___perf_sw_event().
 */
static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
{
        struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);

        /* Must run before the event is emitted so @regs describes the caller. */
        perf_fetch_caller_regs(regs);
        ___perf_sw_event(event_id, nr, regs, addr);
}

extern struct static_key_false perf_sched_events;

/*
 * Result of static_key_false() on the perf_swevent_enabled[] slot for
 * software event @swevt.  @swevt is used as an array index unchecked.
 */
static __always_inline bool __perf_sw_enabled(int swevt)
{
        return static_key_false(&perf_swevent_enabled[swevt]);
}

/*
 * Set task->sched_migrated when PERF_COUNT_SW_CPU_MIGRATIONS is enabled;
 * perf_event_task_sched_in() consumes and clears the flag.
 */
static inline void perf_event_task_migrate(struct task_struct *task)
{
        if (!__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS))
                return;

        task->sched_migrated = 1;
}

/*
 * Hook for @task being scheduled in after @prev.  Calls
 * __perf_event_task_sched_in() when perf_sched_events is on, then emits a
 * PERF_COUNT_SW_CPU_MIGRATIONS event and clears task->sched_migrated if that
 * event is enabled and the flag was set.
 */
static inline void perf_event_task_sched_in(struct task_struct *prev,
                                            struct task_struct *task)
{
        if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_in(prev, task);

        if (!__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS))
                return;
        if (!task->sched_migrated)
                return;

        __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
        task->sched_migrated = 0;
}

/*
 * Hook for switching from @prev to @next.  Emits, when the respective event
 * is enabled, PERF_COUNT_SW_CONTEXT_SWITCHES and (only with
 * CONFIG_CGROUP_PERF, and only if the two tasks' perf cgroups differ)
 * PERF_COUNT_SW_CGROUP_SWITCHES, then calls __perf_event_task_sched_out()
 * when perf_sched_events is on.
 */
static inline void perf_event_task_sched_out(struct task_struct *prev,
                                             struct task_struct *next)
{
        if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
                __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);

#ifdef CONFIG_CGROUP_PERF
        /* Count a cgroup switch only when prev and next are in different perf cgroups. */
        if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) &&
            perf_cgroup_from_task(prev, NULL) !=
            perf_cgroup_from_task(next, NULL))
                __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
#endif

        /* The software events above are emitted before the sched-out work. */
        if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_out(prev, next);
}

extern void perf_event_mmap(struct vm_area_struct *vma);

extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
                               bool unregister, const char *sym);
extern void perf_event_bpf_event(struct bpf_prog *prog,
                                 enum perf_bpf_event_type type,
                                 u16 flags);

/*
 * Guest callbacks.  With CONFIG_GUEST_PERF_EVENTS each helper below invokes
 * the static call of the same name prefixed with "__"; without it each
 * helper is a stub returning 0.
 */
#ifdef CONFIG_GUEST_PERF_EVENTS
extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;

DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state);
DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);

/* Result of the __perf_guest_state static call. */
static inline unsigned int perf_guest_state(void)
{
        return static_call(__perf_guest_state)();
}
/* Result of the __perf_guest_get_ip static call. */
static inline unsigned long perf_guest_get_ip(void)
{
        return static_call(__perf_guest_get_ip)();
}
/* Result of the __perf_guest_handle_intel_pt_intr static call. */
static inline unsigned int perf_guest_handle_intel_pt_intr(void)
{
        return static_call(__perf_guest_handle_intel_pt_intr)();
}
extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
#else
/* Stubs: all three report 0 when guest perf events are not configured. */
static inline unsigned int perf_guest_state(void)                 { return 0; }
static inline unsigned long perf_guest_get_ip(void)                 { return 0; }
static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; }
#endif /* CONFIG_GUEST_PERF_EVENTS */

extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
extern void perf_event_text_poke(const void *addr,
                                 const void *old_bytes, size_t old_len,
                                 const void *new_bytes, size_t new_len);

/* Callchains */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);

extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
                   u32 max_stack, bool crosstask, bool add_mark);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
extern void put_callchain_entry(int rctx);

extern int sysctl_perf_event_max_stack;
extern int sysctl_perf_event_max_contexts_per_stack;

/*
 * Append @ip to ctx->entry and count it in ctx->contexts.  Returns 0 on
 * success.  Once ctx->contexts has reached
 * sysctl_perf_event_max_contexts_per_stack nothing is stored,
 * ctx->contexts_maxed is set and -1 is returned.
 */
static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
        struct perf_callchain_entry *chain;

        if (ctx->contexts >= sysctl_perf_event_max_contexts_per_stack) {
                ctx->contexts_maxed = true;
                return -1; /* no more room, stop walking the stack */
        }

        chain = ctx->entry;
        chain->ip[chain->nr++] = ip;
        ++ctx->contexts;
        return 0;
}

/*
 * Append @ip to ctx->entry and count it in ctx->nr.  Returns 0 on success,
 * or -1 without storing anything when ctx->nr has reached ctx->max_stack or
 * ctx->contexts_maxed is set.
 */
static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
        struct perf_callchain_entry *chain;

        if (ctx->contexts_maxed || ctx->nr >= ctx->max_stack)
                return -1; /* no more room, stop walking the stack */

        chain = ctx->entry;
        chain->ip[chain->nr] = ip;
        chain->nr++;
        ++ctx->nr;
        return 0;
}

extern int sysctl_perf_event_paranoid;
extern int sysctl_perf_event_mlock;
extern int sysctl_perf_event_sample_rate;
extern int sysctl_perf_cpu_time_max_percent;

extern void perf_sample_event_took(u64 sample_len_ns);

int perf_event_max_sample_rate_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);
int perf_event_max_stack_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);

/* Access to perf_event_open(2) syscall. */
#define PERF_SECURITY_OPEN                0

/*
 * Finer grained perf_event_open(2) access control.  These are the values
 * passed as the second argument of security_perf_event_open() by
 * perf_allow_cpu(), perf_allow_kernel() and perf_allow_tracepoint().
 */
#define PERF_SECURITY_CPU                1
#define PERF_SECURITY_KERNEL                2
#define PERF_SECURITY_TRACEPOINT        3

/* Return 1 when sysctl_perf_event_paranoid is 0 or higher, 0 when it is negative. */
static inline int perf_is_paranoid(void)
{
        return sysctl_perf_event_paranoid >= 0;
}

/*
 * Returns -EACCES when sysctl_perf_event_paranoid is above 1 and the caller
 * is not perfmon_capable(); otherwise returns the result of
 * security_perf_event_open() for PERF_SECURITY_KERNEL.
 */
static inline int perf_allow_kernel(struct perf_event_attr *attr)
{
        /* perfmon_capable() is only consulted once the sysctl level demands it. */
        if (sysctl_perf_event_paranoid <= 1 || perfmon_capable())
                return security_perf_event_open(attr, PERF_SECURITY_KERNEL);

        return -EACCES;
}

/*
 * Returns -EACCES when sysctl_perf_event_paranoid is above 0 and the caller
 * is not perfmon_capable(); otherwise returns the result of
 * security_perf_event_open() for PERF_SECURITY_CPU.
 */
static inline int perf_allow_cpu(struct perf_event_attr *attr)
{
        /* perfmon_capable() is only consulted once the sysctl level demands it. */
        if (sysctl_perf_event_paranoid <= 0 || perfmon_capable())
                return security_perf_event_open(attr, PERF_SECURITY_CPU);

        return -EACCES;
}

/*
 * Returns -EPERM when sysctl_perf_event_paranoid is above -1 and the caller
 * is not perfmon_capable(); otherwise returns the result of
 * security_perf_event_open() for PERF_SECURITY_TRACEPOINT.
 */
static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
{
        /* perfmon_capable() is only consulted once the sysctl level demands it. */
        if (sysctl_perf_event_paranoid <= -1 || perfmon_capable())
                return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);

        return -EPERM;
}

extern void perf_event_init(void);
extern void perf_tp_event(u16 event_type, u64 count, void *record,
                          int entry_size, struct pt_regs *regs,
                          struct hlist_head *head, int rctx,
                          struct task_struct *task);
extern void perf_bp_event(struct perf_event *event, void *data);

/*
 * Fallbacks used when perf_misc_flags is not already defined:
 * perf_misc_flags() yields PERF_RECORD_MISC_USER if user_mode(regs) is true
 * and PERF_RECORD_MISC_KERNEL otherwise; perf_instruction_pointer() is
 * instruction_pointer().
 */
#ifndef perf_misc_flags
# define perf_misc_flags(regs) \
                (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
# define perf_instruction_pointer(regs)        instruction_pointer(regs)
#endif
/* Fallback: pass @regs through unchanged when no override is defined. */
#ifndef perf_arch_bpf_user_pt_regs
# define perf_arch_bpf_user_pt_regs(regs) regs
#endif

/* True when PERF_SAMPLE_BRANCH_STACK is set in @event's attr.sample_type. */
static inline bool has_branch_stack(struct perf_event *event)
{
        return (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) != 0;
}

/* True when @event's attr.branch_sample_type has any bit set. */
static inline bool needs_branch_stack(struct perf_event *event)
{
        return !!event->attr.branch_sample_type;
}

/* True when @event's PMU has a non-null setup_aux member. */
static inline bool has_aux(struct perf_event *event)
{
        return !!event->pmu->setup_aux;
}

/* True when @event's attr.write_backward is set. */
static inline bool is_write_backward(struct perf_event *event)
{
        return event->attr.write_backward != 0;
}

/* True when @event's PMU reports a non-zero nr_addr_filters. */
static inline bool has_addr_filter(struct perf_event *event)
{
        return event->pmu->nr_addr_filters != 0;
}

/*
 * Address filters that apply to @event: an inherited event (one with a
 * parent) uses its parent's filters, any other event uses its own.
 */
static inline struct perf_addr_filters_head *
perf_event_addr_filters(struct perf_event *event)
{
        struct perf_event *owner = event->parent ? event->parent : event;

        return &owner->addr_filters;
}

/*
 * Address of the fasync pointer to use for @event.  Only the parent has
 * fasync state, so a child event resolves to its parent's field.
 */
static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
{
        return event->parent ? &event->parent->fasync : &event->fasync;
}

extern void perf_event_addr_filters_sync(struct perf_event *event);
extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id);

extern int perf_output_begin(struct perf_output_handle *handle,
                             struct perf_sample_data *data,
                             struct perf_event *event, unsigned int size);
extern int perf_output_begin_forward(struct perf_output_handle *handle,
                                     struct perf_sample_data *data,
                                     struct perf_event *event,
                                     unsigned int size);
extern int perf_output_begin_backward(struct perf_output_handle *handle,
                                      struct perf_sample_data *data,
                                      struct perf_event *event,
                                      unsigned int size);

extern void perf_output_end(struct perf_output_handle *handle);
extern unsigned int perf_output_copy(struct perf_output_handle *handle,
                             const void *buf, unsigned int len);
extern unsigned int perf_output_skip(struct perf_output_handle *handle,
                                     unsigned int len);
extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
                                 struct perf_output_handle *handle,
                                 unsigned long from, unsigned long to);
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern u64 perf_swevent_set_period(struct perf_event *event);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
extern int perf_event_account_interrupt(struct perf_event *event);
extern int perf_event_period(struct perf_event *event, u64 value);
extern u64 perf_event_pause(struct perf_event *event, bool reset);
#else /* !CONFIG_PERF_EVENTS: */
/*
 * Stubs for the "#else" branch marked !CONFIG_PERF_EVENTS just above: each
 * function is either empty or returns a fixed value (NULL, 0, -EINVAL or
 * ERR_PTR(-EINVAL)).
 */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
                      struct perf_event *event)                                { return NULL; }
static inline void
perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
                                                                        { }
static inline int
perf_aux_output_skip(struct perf_output_handle *handle,
                     unsigned long size)                                { return -EINVAL; }
static inline void *
perf_get_aux(struct perf_output_handle *handle)                                { return NULL; }
/* Scheduler and task-lifetime hooks: nothing to do. */
static inline void
perf_event_task_migrate(struct task_struct *task)                        { }
static inline void
perf_event_task_sched_in(struct task_struct *prev,
                         struct task_struct *task)                        { }
static inline void
perf_event_task_sched_out(struct task_struct *prev,
                          struct task_struct *next)                        { }
static inline int perf_event_init_task(struct task_struct *child,
                                       u64 clone_flags)                        { return 0; }
static inline void perf_event_exit_task(struct task_struct *child)        { }
static inline void perf_event_free_task(struct task_struct *task)        { }
static inline void perf_event_delayed_put(struct task_struct *task)        { }
/* Lookups and reads fail with -EINVAL (as an ERR_PTR for pointer returns). */
static inline struct file *perf_event_get(unsigned int fd)        { return ERR_PTR(-EINVAL); }
static inline const struct perf_event *perf_get_event(struct file *file)
{
        return ERR_PTR(-EINVAL);
}
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
        return ERR_PTR(-EINVAL);
}
static inline int perf_event_read_local(struct perf_event *event, u64 *value,
                                        u64 *enabled, u64 *running)
{
        return -EINVAL;
}
static inline void perf_event_print_debug(void)                                { }
static inline int perf_event_task_disable(void)                                { return -EINVAL; }
static inline int perf_event_task_enable(void)                                { return -EINVAL; }
static inline int perf_event_refresh(struct perf_event *event, int refresh)
{
        return -EINVAL;
}

/*
 * Further stubs in the !CONFIG_PERF_EVENTS branch of this header (see the
 * "#else" earlier): each function is either empty or returns a fixed value
 * (0, -1 or -EINVAL).
 */
static inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)        { }
static inline void
perf_bp_event(struct perf_event *event, void *data)                        { }

static inline void perf_event_mmap(struct vm_area_struct *vma)                { }

/* Function type taking a name buffer, its length and an opaque data pointer. */
typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
                                      bool unregister, const char *sym)        { }
static inline void perf_event_bpf_event(struct bpf_prog *prog,
                                        enum perf_bpf_event_type type,
                                        u16 flags)                        { }
static inline void perf_event_exec(void)                                { }
static inline void perf_event_comm(struct task_struct *tsk, bool exec)        { }
static inline void perf_event_namespaces(struct task_struct *tsk)        { }
static inline void perf_event_fork(struct task_struct *tsk)                { }
static inline void perf_event_text_poke(const void *addr,
                                        const void *old_bytes,
                                        size_t old_len,
                                        const void *new_bytes,
                                        size_t new_len)                        { }
static inline void perf_event_init(void)                                { }
/* The recursion-context getter always reports -1 here. */
static inline int  perf_swevent_get_recursion_context(void)                { return -1; }
static inline void perf_swevent_put_recursion_context(int rctx)                { }
static inline u64 perf_swevent_set_period(struct perf_event *event)        { return 0; }
static inline void perf_event_enable(struct perf_event *event)                { }
static inline void perf_event_disable(struct perf_event *event)                { }
static inline int __perf_event_disable(void *info)                        { return -1; }
static inline void perf_event_task_tick(void)                                { }
static inline int perf_event_release_kernel(struct perf_event *event)        { return 0; }
static inline int perf_event_period(struct perf_event *event, u64 value)
{
        return -EINVAL;
}
static inline u64 perf_event_pause(struct perf_event *event, bool reset)
{
        return 0;
}
#endif

/*
 * perf_restore_debug_store() is a real (external) function only when both
 * CONFIG_PERF_EVENTS and CONFIG_CPU_SUP_INTEL are set; otherwise it is an
 * empty inline stub.
 */
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
extern void perf_restore_debug_store(void);
#else
static inline void perf_restore_debug_store(void)                        { }
#endif

/*
 * Copy the object @x into @handle with perf_output_copy(), using sizeof(x)
 * as the length.  @x must be an lvalue because its address is taken.
 */
#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))

/*
 * A device_attribute bundled with an event id and an event string.
 * PMU_EVENT_ATTR() initialises .attr and .id; PMU_EVENT_ATTR_STRING() sets
 * .id to 0 and initialises .event_str.
 */
struct perf_pmu_events_attr {
        struct device_attribute attr;
        u64 id;
        const char *event_str;
};

/*
 * A device_attribute bundled with an event id and two alternative event
 * strings.
 * NOTE(review): presumably event_str_ht / event_str_noht are chosen by
 * whether hyper-threading is active -- confirm against the users.
 */
struct perf_pmu_events_ht_attr {
        struct device_attribute                        attr;
        u64                                        id;
        const char                                *event_str_ht;
        const char                                *event_str_noht;
};

/*
 * A device_attribute bundled with an event id, an event string and a
 * pmu_type value.
 * NOTE(review): pmu_type presumably selects the hybrid PMU(s) the attribute
 * applies to -- confirm against the users.
 */
struct perf_pmu_events_hybrid_attr {
        struct device_attribute                        attr;
        u64                                        id;
        const char                                *event_str;
        u64                                        pmu_type;
};

/*
 * A device_attribute bundled with a pmu_type value.
 * NOTE(review): pmu_type presumably selects the hybrid PMU(s) the format
 * attribute applies to -- confirm against the users.
 */
struct perf_pmu_format_hybrid_attr {
        struct device_attribute                        attr;
        u64                                        pmu_type;
};

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
                              char *page);

/*
 * Define a static struct perf_pmu_events_attr named _var: a mode 0444
 * attribute called _name, shown by _show, with no store callback, carrying
 * the id _id.  The expansion already ends in a semicolon.
 */
#define PMU_EVENT_ATTR(_name, _var, _id, _show)                                \
static struct perf_pmu_events_attr _var = {                                \
        .attr = __ATTR(_name, 0444, _show, NULL),                        \
        .id   =  _id,                                                        \
};

/*
 * Define a static struct perf_pmu_events_attr named _var: a mode 0444
 * attribute called _name, shown by perf_event_sysfs_show(), with id 0 and
 * event_str set to _str.  The expansion already ends in a semicolon.
 */
#define PMU_EVENT_ATTR_STRING(_name, _var, _str)                            \
static struct perf_pmu_events_attr _var = {                                    \
        .attr                = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
        .id                = 0,                                                    \
        .event_str        = _str,                                                    \
};

/*
 * Expression form: builds an anonymous one-element perf_pmu_events_attr
 * array (compound literal) initialised from _name, _show and _id, and
 * evaluates to the address of its embedded .attr.attr member.
 */
#define PMU_EVENT_ATTR_ID(_name, _show, _id)                                \
        (&((struct perf_pmu_events_attr[]) {                                \
                { .attr = __ATTR(_name, 0444, _show, NULL),                \
                  .id = _id, }                                                \
        })[0].attr.attr)

/*
 * Define static <_name>_show(), which sprintf()s _format followed by a
 * newline into @page and returns sprintf()'s result.  _format must be a
 * string literal (it is concatenated with "\n") and is used as the format
 * string itself.  The build fails if sizeof(_format) >= PAGE_SIZE.
 */
#define PMU_FORMAT_ATTR_SHOW(_name, _format)                                \
static ssize_t                                                                \
_name##_show(struct device *dev,                                        \
                               struct device_attribute *attr,                \
                               char *page)                                \
{                                                                        \
        BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                        \
        return sprintf(page, _format "\n");                                \
}                                                                        \

/*
 * Emit the show function via PMU_FORMAT_ATTR_SHOW() plus a static
 * device_attribute named format_attr_<_name>, built with __ATTR_RO(_name).
 * The caller supplies the trailing semicolon.
 */
#define PMU_FORMAT_ATTR(_name, _format)                                        \
        PMU_FORMAT_ATTR_SHOW(_name, _format)                                \
                                                                        \
static struct device_attribute format_attr_##_name = __ATTR_RO(_name)

/* Performance counter hotplug functions */
#ifdef CONFIG_PERF_EVENTS
int perf_event_init_cpu(unsigned int cpu);
int perf_event_exit_cpu(unsigned int cpu);
#else
#define perf_event_init_cpu        NULL
#define perf_event_exit_cpu        NULL
#endif

extern void arch_perf_update_userpage(struct perf_event *event,
                                      struct perf_event_mmap_page *userpg,
                                      u64 now);

/*
 * Snapshot branch stack on software events.
 *
 * Branch stack can be very useful in understanding software events. For
 * example, when a long function, e.g. sys_perf_event_open, returns an
 * errno, it is not obvious why the function failed. Branch stack could
 * provide very helpful information in this type of scenarios.
 *
 * On software event, it is necessary to stop the hardware branch recorder
 * fast. Otherwise, the hardware register/buffer will be flushed with
 * entries of the triggering event. Therefore, static call is used to
 * stop the hardware recorder.
 */

/*
 * @cnt is the number of elements allocated in the @entries array.
 * Returns the number of entries copied to @entries.
 */
typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries,
                                           unsigned int cnt);
DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);

/*
 * Empty fallback compiled only when PERF_NEEDS_LOPWR_CB is not defined;
 * @mode is ignored.
 */
#ifndef PERF_NEEDS_LOPWR_CB
static inline void perf_lopwr_cb(bool mode)
{
}
#endif

#endif /* _LINUX_PERF_EVENT_H */














































































    4 










    4 


    4 








    4 









    4 










    4 





















































    5 
    5 





































































































































































































































    4 












    4 
    4 

    4 



    4 








    4 











    4 




















































































































































































































































































    4 














    4 


    4 

    4 















    4 
    4 











    4 

































































































































    4 






























































    4 












    4 











    4 



    4 








    4 





























    4 





























    4 





    4 




    3 





    4 





    4 






    4 



















    4 






    3 























    4 





















    4 
    4 


    4 





















    2 






















    4 









    4 



    2 
    2 
    2 



    2 





    2 




















































































    4 








    4 
    4 










    4 




    4 

























































































































































































































































































































    2 


































































    2 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 





    4 













    4 









    4 

















































    4 































    4 
    4 






    4 






    4 










    4 


































































































































    4 


    4 

    4 




    4 



    4 









































































































    1 























    5 






    4 



    1 

















































    3 


    3 






    1 

























































































































































































































    4 



    4 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/signal.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-11-02  Modified for POSIX.1b signals by Richard Henderson
 *
 *  2003-06-02  Jim Houston - Concurrent Computer Corp.
 *                Changes to use preallocated sigqueue structures
 *                to allow signals to be sent reliably.
 */

#include <linux/slab.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/user.h>
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/coredump.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/signalfd.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
#include <linux/capability.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/uprobes.h>
#include <linux/compat.h>
#include <linux/cn_proc.h>
#include <linux/compiler.h>
#include <linux/posix-timers.h>
#include <linux/cgroup.h>
#include <linux/audit.h>
#include <linux/sysctl.h>
#include <uapi/linux/pidfd.h>

#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>

#include <asm/param.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/siginfo.h>
#include <asm/cacheflush.h>
#include <asm/syscall.h>        /* for syscall_get_* */

/*
 * SLAB caches for signal bits.
 */

static struct kmem_cache *sigqueue_cachep;

/*
 * Diagnostic switch: print_dropped_signal() only logs when this is non-zero.
 * NOTE(review): where this is set, and any other users, are not visible
 * from this part of the file - confirm before relying on it elsewhere.
 */
int print_fatal_signals __read_mostly;

/*
 * Look up the handler recorded for @sig in @t->sighand->action[].
 * Signal numbers start at 1 while the array starts at 0, hence the
 * "sig - 1" index.
 */
static void __user *sig_handler(struct task_struct *t, int sig)
{
        const int idx = sig - 1;

        return t->sighand->action[idx].sa.sa_handler;
}

/*
 * Tell whether @handler means "ignore" for signal @sig: either it is
 * SIG_IGN, or it is SIG_DFL and sig_kernel_ignore() is true for @sig.
 */
static inline bool sig_handler_ignored(void __user *handler, int sig)
{
        /* Explicitly ignored. */
        if (handler == SIG_IGN)
                return true;

        /* Implicitly ignored: default handler and sig_kernel_ignore(). */
        if (handler == SIG_DFL)
                return sig_kernel_ignore(sig);

        return false;
}

/*
 * Decide whether @sig would be ignored by @t, looking only at the task
 * itself and its installed handler (blocked masks and ptrace are not
 * consulted here).  @force is honoured by the SIGNAL_UNKILLABLE and
 * kthread checks below.
 */
static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
{
        void __user *handler = sig_handler(t, sig);

        /* SIGKILL and SIGSTOP may not be sent to the global init */
        if (unlikely(is_global_init(t))) {
                if (sig_kernel_only(sig))
                        return true;
        }

        /*
         * A SIGNAL_UNKILLABLE task with the default handler ignores the
         * signal, except for a forced kernel-only signal.
         */
        if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
            handler == SIG_DFL) {
                bool forced_kernel_only = force && sig_kernel_only(sig);

                if (!forced_kernel_only)
                        return true;
        }

        /* Only allow kernel generated signals to this kthread */
        if (unlikely(t->flags & PF_KTHREAD)) {
                if (handler == SIG_KTHREAD_KERNEL && !force)
                        return true;
        }

        return sig_handler_ignored(handler, sig);
}

/*
 * Decide whether @sig sent to @t can be treated as ignored.  Returns false
 * when the signal is blocked or when a tracer is attached (SIGKILL
 * excepted); otherwise defers to sig_task_ignored().
 */
static bool sig_ignored(struct task_struct *t, int sig, bool force)
{
        /*
         * Blocked signals are never ignored, since the
         * signal handler may change by the time it is
         * unblocked.
         */
        if (sigismember(&t->blocked, sig))
                return false;
        if (sigismember(&t->real_blocked, sig))
                return false;

        /*
         * Tracers may want to know about even ignored signal unless it
         * is SIGKILL which can't be reported anyway but can be ignored
         * by SIGNAL_UNKILLABLE task.
         */
        if (t->ptrace) {
                if (sig != SIGKILL)
                        return false;
        }

        return sig_task_ignored(t, sig, force);
}

/*
 * Helpers to re-calculate pending state from the set of locally pending
 * signals, globally pending signals, and blocked signals.
 *
 * has_pending_signals() returns true if at least one signal is set in
 * @signal without also being set in @blocked.  The switch on the constant
 * _NSIG_WORDS spells out the common word counts explicitly and falls back
 * to a loop for any other count.
 */
static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
{
        unsigned long ready = 0;
        long i;

        switch (_NSIG_WORDS) {
        case 1:
                ready = signal->sig[0] & ~blocked->sig[0];
                break;

        case 2:
                ready  = signal->sig[0] & ~blocked->sig[0];
                ready |= signal->sig[1] & ~blocked->sig[1];
                break;

        case 4:
                ready  = signal->sig[0] & ~blocked->sig[0];
                ready |= signal->sig[1] & ~blocked->sig[1];
                ready |= signal->sig[2] & ~blocked->sig[2];
                ready |= signal->sig[3] & ~blocked->sig[3];
                break;

        default:
                for (i = 0; i < _NSIG_WORDS; i++)
                        ready |= signal->sig[i] & ~blocked->sig[i];
                break;
        }

        return ready != 0;
}

/*
 * PENDING(p, b): true if the sigpending pointed to by @p holds at least one
 * signal that is not set in the mask pointed to by @b.
 */
#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))

/*
 * Set TIF_SIGPENDING on @t if anything requires it: pending jobctl work or
 * a freeze trap, an unblocked private or shared pending signal, or the
 * task being cgroup-frozen.
 *
 * Returns true if the flag was set, false if nothing was found.  The flag
 * is never cleared here.
 */
static bool recalc_sigpending_tsk(struct task_struct *t)
{
        bool need_flag;

        need_flag = (t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) ||
                    PENDING(&t->pending, &t->blocked) ||
                    PENDING(&t->signal->shared_pending, &t->blocked) ||
                    cgroup_task_frozen(t);

        if (!need_flag) {
                /*
                 * We must never clear the flag in another thread, or in
                 * current when it's possible the current syscall is
                 * returning -ERESTART*.  So we don't clear it here, and
                 * only callers who know they should do.
                 */
                return false;
        }

        set_tsk_thread_flag(t, TIF_SIGPENDING);
        return true;
}

/*
 * Re-evaluate TIF_SIGPENDING for the current task.  The flag is cleared
 * only when recalc_sigpending_tsk() found nothing to set it for and the
 * task is not freezing.
 */
void recalc_sigpending(void)
{
        if (recalc_sigpending_tsk(current))
                return;

        if (freezing(current))
                return;

        clear_thread_flag(TIF_SIGPENDING);
}
EXPORT_SYMBOL(recalc_sigpending);

void calculate_sigpending(void)
{
        /* Have any signals or users of TIF_SIGPENDING been delayed
         * until after fork?
         */
        spin_lock_irq(&current->sighand->siglock);
        set_tsk_thread_flag(current, TIF_SIGPENDING);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
}

/*
 * Signals that next_signal() hands out ahead of everything else in the
 * first sigset word: the synchronous signals that need to be dequeued
 * first.
 */
#define SYNCHRONOUS_MASK \
        (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
         sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))

/*
 * Given the mask, find the first available signal that should be serviced.
 *
 * Returns the number of a signal that is set in @pending and not set in
 * @mask, or 0 if there is none.  In the first word a signal from
 * SYNCHRONOUS_MASK wins over any other; apart from that the lowest
 * numbered candidate is returned.
 */
int next_signal(struct sigpending *pending, sigset_t *mask)
{
        unsigned long *s = pending->signal.sig;
        unsigned long *m = mask->sig;
        unsigned long i, x;

        /*
         * Handle the first word specially: it contains the
         * synchronous signals that need to be dequeued first.
         */
        x = s[0] & ~m[0];
        if (x) {
                if (x & SYNCHRONOUS_MASK)
                        x &= SYNCHRONOUS_MASK;
                return ffz(~x) + 1;
        }

        switch (_NSIG_WORDS) {
        default:
                for (i = 1; i < _NSIG_WORDS; ++i) {
                        x = s[i] & ~m[i];
                        if (x)
                                return ffz(~x) + i*_NSIG_BPW + 1;
                }
                break;

        case 2:
                x = s[1] & ~m[1];
                if (x)
                        return ffz(~x) + _NSIG_BPW + 1;
                break;

        case 1:
                /* Nothing to do */
                break;
        }

        return 0;
}

/*
 * Log that @sig was dropped because RLIMIT_SIGPENDING was reached.
 *
 * Nothing is printed unless print_fatal_signals is set, and the message is
 * rate limited through a function-local ratelimit state.
 */
static inline void print_dropped_signal(int sig)
{
        static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);

        if (print_fatal_signals && __ratelimit(&ratelimit_state))
                pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
                        current->comm, current->pid, sig);
}

/**
 * task_set_jobctl_pending - set jobctl pending bits
 * @task: target task
 * @mask: pending bits to set
 *
 * Set @mask in @task->jobctl.  @mask must be a subset of
 * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
 * %JOBCTL_TRAPPING, and %JOBCTL_TRAPPING may only be passed together with
 * at least one %JOBCTL_PENDING_MASK bit.  If a stop signo is being set, the
 * existing signo is cleared first.  If @task is already being killed or
 * exiting, this function becomes a noop.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 *
 * RETURNS:
 * %true if @mask is set, %false if made noop because @task was dying.
 */
bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
{
        BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
                        JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
        BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));

        /* A dying task does not take new job control requests. */
        if (unlikely(fatal_signal_pending(task)))
                return false;
        if (unlikely(task->flags & PF_EXITING))
                return false;

        /* A new stop signo replaces the old one instead of being OR-ed in. */
        if (mask & JOBCTL_STOP_SIGMASK)
                task->jobctl &= ~JOBCTL_STOP_SIGMASK;
        task->jobctl |= mask;

        return true;
}

/**
 * task_clear_jobctl_trapping - clear jobctl trapping bit
 * @task: target task
 *
 * A set JOBCTL_TRAPPING means a ptracer is waiting for us to enter TRACED.
 * If the bit is set, clear it and wake up whoever waits on it; otherwise do
 * nothing.  No further locking is needed: @task->siglock guarantees that
 * @task->parent points to the ptracer.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 */
void task_clear_jobctl_trapping(struct task_struct *task)
{
        if (!unlikely(task->jobctl & JOBCTL_TRAPPING))
                return;

        task->jobctl &= ~JOBCTL_TRAPPING;
        /* Barrier between clearing the bit and the wakeup, as wake_up_bit() advises. */
        smp_mb();
        wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
}

/**
 * task_clear_jobctl_pending - clear jobctl pending bits
 * @task: target task
 * @mask: pending bits to clear
 *
 * Clear @mask from @task->jobctl.  @mask must be a subset of
 * %JOBCTL_PENDING_MASK.  Clearing %JOBCTL_STOP_PENDING also clears
 * %JOBCTL_STOP_CONSUME and %JOBCTL_STOP_DEQUEUED.
 *
 * If no %JOBCTL_PENDING_MASK bit remains set afterwards, this function
 * calls task_clear_jobctl_trapping().
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 */
void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
{
        unsigned long clear = mask;

        BUG_ON(mask & ~JOBCTL_PENDING_MASK);

        /* The other STOP bits go away together with STOP_PENDING. */
        if (clear & JOBCTL_STOP_PENDING)
                clear |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;

        task->jobctl &= ~clear;

        if (task->jobctl & JOBCTL_PENDING_MASK)
                return;

        task_clear_jobctl_trapping(task);
}

/**
 * task_participate_group_stop - participate in a group stop
 * @task: task participating in a group stop
 *
 * @task is expected to have %JOBCTL_STOP_PENDING set (a one-time warning is
 * emitted otherwise).  Its group stop state is cleared and, if
 * %JOBCTL_STOP_CONSUME was set, one unit of the group stop count is
 * consumed.  If that consumption completes the group stop,
 * %SIGNAL_STOP_STOPPED is set on the signal struct.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 *
 * RETURNS:
 * %true if group stop completion should be notified to the parent, %false
 * otherwise.
 */
static bool task_participate_group_stop(struct task_struct *task)
{
        struct signal_struct *signal = task->signal;
        /* Sample before the bit is wiped by task_clear_jobctl_pending(). */
        bool must_consume = task->jobctl & JOBCTL_STOP_CONSUME;

        WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));

        task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);

        if (!must_consume)
                return false;

        if (!WARN_ON_ONCE(signal->group_stop_count == 0))
                signal->group_stop_count--;

        /*
         * Only a freshly completed group stop is reported: the count must
         * have reached zero and the group must not already be marked
         * stopped.  Read comment in do_signal_stop() for details.
         */
        if (signal->group_stop_count || (signal->flags & SIGNAL_STOP_STOPPED))
                return false;

        signal_set_stop_flags(signal, SIGNAL_STOP_STOPPED);
        return true;
}

/*
 * Have @task join the group stop of current's thread group, if there is one.
 *
 * Nothing happens unless a group stop is in progress (non-zero
 * group_stop_count) or already complete (SIGNAL_STOP_STOPPED).  While the
 * stop is still in progress the count is bumped and @task is asked to
 * consume it.  The stop signo is inherited from current->jobctl.
 */
void task_join_group_stop(struct task_struct *task)
{
        struct signal_struct *signal = current->signal;
        unsigned long bits = current->jobctl & JOBCTL_STOP_SIGMASK;

        if (!signal->group_stop_count && !(signal->flags & SIGNAL_STOP_STOPPED))
                return;

        if (signal->group_stop_count) {
                signal->group_stop_count++;
                bits |= JOBCTL_STOP_CONSUME;
        }

        /* Have the new thread join an on-going signal group stop */
        task_set_jobctl_pending(task, bits | JOBCTL_STOP_PENDING);
}

/*
 * allocate a new signal queue record
 * - this may be called without locks if and only if t == current, otherwise an
 *   appropriate lock must be held to stop the target task from exiting
 *
 * @sig is only used for the "dropped signal" message.  @override_rlimit
 * skips the RLIMIT_SIGPENDING comparison.  @sigqueue_flags is stored in the
 * new entry's flags.
 *
 * Returns the initialised entry, or NULL when the ucount charge failed, the
 * rlimit was exceeded or the slab allocation failed.  On the latter two
 * paths the ucount charge is undone before returning.
 */
static struct sigqueue *
__sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
                 int override_rlimit, const unsigned int sigqueue_flags)
{
        struct sigqueue *q;
        struct ucounts *ucounts;
        long sigpending;

        /*
         * Protect access to @t credentials. This can go away when all
         * callers hold rcu read lock.
         *
         * NOTE! A pending signal will hold on to the user refcount,
         * and we get/put the refcount only when the sigpending count
         * changes from/to zero.
         */
        rcu_read_lock();
        ucounts = task_ucounts(t);
        sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
        rcu_read_unlock();
        if (!sigpending)
                return NULL;

        if (!override_rlimit &&
            unlikely(sigpending > task_rlimit(t, RLIMIT_SIGPENDING))) {
                print_dropped_signal(sig);
                goto uncharge;
        }

        q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
        if (unlikely(q == NULL))
                goto uncharge;

        INIT_LIST_HEAD(&q->list);
        q->flags = sigqueue_flags;
        q->ucounts = ucounts;
        return q;

uncharge:
        dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
        return NULL;
}

/*
 * Release a signal queue entry.
 *
 * Entries flagged SIGQUEUE_PREALLOC are left alone.  For all others the
 * ucount charge (if any) is dropped and the entry goes back to
 * sigqueue_cachep.
 */
static void __sigqueue_free(struct sigqueue *q)
{
        if (!(q->flags & SIGQUEUE_PREALLOC)) {
                if (q->ucounts) {
                        dec_rlimit_put_ucounts(q->ucounts,
                                               UCOUNT_RLIMIT_SIGPENDING);
                        q->ucounts = NULL;
                }
                kmem_cache_free(sigqueue_cachep, q);
        }
}

/*
 * Empty @queue: clear its pending bitmap, then unlink and release every
 * queued entry, always taking the one at the head of the list.
 */
void flush_sigqueue(struct sigpending *queue)
{
        struct sigqueue *entry;

        sigemptyset(&queue->signal);

        for (;;) {
                if (list_empty(&queue->list))
                        return;

                entry = list_entry(queue->list.next, struct sigqueue, list);
                list_del_init(&entry->list);
                __sigqueue_free(entry);
        }
}

/*
 * Flush all pending signals for this kthread.
 *
 * With @t's siglock held (interrupt state saved and restored), clear
 * TIF_SIGPENDING on @t and empty both its private pending queue and the
 * shared pending queue of its signal struct.
 */
void flush_signals(struct task_struct *t)
{
        unsigned long flags;

        spin_lock_irqsave(&t->sighand->siglock, flags);
        clear_tsk_thread_flag(t, TIF_SIGPENDING);
        flush_sigqueue(&t->pending);
        flush_sigqueue(&t->signal->shared_pending);
        spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
EXPORT_SYMBOL(flush_signals);

#ifdef CONFIG_POSIX_TIMERS
/*
 * Drop every queued entry of @pending whose si_code is SI_TIMER.
 *
 * The pending bitmap is rebuilt as: the old bitmap minus the signals of the
 * dropped entries, plus the signals of all entries that stay queued.
 */
static void __flush_itimer_signals(struct sigpending *pending)
{
        sigset_t remaining, keep;
        struct sigqueue *entry, *tmp;

        remaining = pending->signal;
        sigemptyset(&keep);

        list_for_each_entry_safe(entry, tmp, &pending->list, list) {
                int sig = entry->info.si_signo;

                if (likely(entry->info.si_code != SI_TIMER)) {
                        sigaddset(&keep, sig);
                        continue;
                }

                sigdelset(&remaining, sig);
                list_del_init(&entry->list);
                __sigqueue_free(entry);
        }

        sigorsets(&pending->signal, &remaining, &keep);
}

/*
 * Drop SI_TIMER entries from both the private and the shared pending queue
 * of the current task, under its siglock.
 */
void flush_itimer_signals(void)
{
        unsigned long flags;

        spin_lock_irqsave(&current->sighand->siglock, flags);
        __flush_itimer_signals(&current->pending);
        __flush_itimer_signals(&current->signal->shared_pending);
        spin_unlock_irqrestore(&current->sighand->siglock, flags);
}
#endif

/*
 * Set the handler of every signal action of @t to SIG_IGN, then flush
 * whatever is already pending for it.
 */
void ignore_signals(struct task_struct *t)
{
        int i = 0;

        while (i < _NSIG) {
                t->sighand->action[i].sa.sa_handler = SIG_IGN;
                i++;
        }

        flush_signals(t);
}

/*
 * Flush all handlers for a task.
 *
 * For each of the _NSIG actions of @t the flags, the mask and (where the
 * architecture has one) the restorer are reset.  The handler becomes
 * SIG_DFL, except that a handler of SIG_IGN is kept when @force_default
 * is zero.
 */
void
flush_signal_handlers(struct task_struct *t, int force_default)
{
        struct k_sigaction *ka = &t->sighand->action[0];
        int i;

        for (i = 0; i < _NSIG; i++) {
                if (force_default || ka[i].sa.sa_handler != SIG_IGN)
                        ka[i].sa.sa_handler = SIG_DFL;
                ka[i].sa.sa_flags = 0;
#ifdef __ARCH_HAS_SA_RESTORER
                ka[i].sa.sa_restorer = NULL;
#endif
                sigemptyset(&ka[i].sa.sa_mask);
        }
}

bool unhandled_signal(struct task_struct *tsk, int sig)
{
        void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
        if (is_global_init(tsk))
                return true;

        if (handler != SIG_IGN && handler != SIG_DFL)
                return false;

        /* If dying, we handle all new signals by ignoring them */
        if (fatal_signal_pending(tsk))
                return false;

        /* if ptraced, let the tracer determine */
        return !tsk->ptrace;
}

/*
 * Take one instance of @sig off @list and describe it in @info.
 *
 * The first queued entry for @sig, if any, is unlinked, copied to @info and
 * released; *@resched_timer is then set to whether it was a preallocated
 * SI_TIMER entry with a non-zero si_sys_private.  Without a queued entry
 * @info is synthesised as an SI_USER signal with zero pid/uid and
 * *@resched_timer is left untouched.
 *
 * The bit for @sig stays set in the pending bitmap only when a second
 * queued entry for the same signal exists.
 */
static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info,
                           bool *resched_timer)
{
        struct sigqueue *q, *first = NULL;
        bool more_queued = false;

        /*
         * Collect the siginfo appropriate to this signal.  Check if
         * there is another siginfo for the same signal.
         */
        list_for_each_entry(q, &list->list, list) {
                if (q->info.si_signo != sig)
                        continue;
                if (first) {
                        more_queued = true;
                        break;
                }
                first = q;
        }

        if (!more_queued)
                sigdelset(&list->signal, sig);

        if (!first) {
                /*
                 * Ok, it wasn't in the queue.  This must be
                 * a fast-pathed signal or we must have been
                 * out of queue space.  So zero out the info.
                 */
                clear_siginfo(info);
                info->si_signo = sig;
                info->si_errno = 0;
                info->si_code = SI_USER;
                info->si_pid = 0;
                info->si_uid = 0;
                return;
        }

        list_del_init(&first->list);
        copy_siginfo(info, &first->info);

        *resched_timer =
                (first->flags & SIGQUEUE_PREALLOC) &&
                (info->si_code == SI_TIMER) &&
                (info->si_sys_private);

        __sigqueue_free(first);
}

/*
 * Pick the next signal of @pending that is not in @mask and collect it into
 * @info.  Returns the signal number, or 0 (leaving @info and
 * @resched_timer untouched) when there is nothing to dequeue.
 */
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
                        kernel_siginfo_t *info, bool *resched_timer)
{
        int sig = next_signal(pending, mask);

        if (!sig)
                return 0;

        collect_signal(sig, pending, info, resched_timer);
        return sig;
}

/*
 * Dequeue the next signal of @tsk that is not in @mask and copy its
 * siginfo into @info.  The queue entry itself is released internally (see
 * collect_signal()); the caller only owns @info.
 *
 * The private queue is tried first, then the shared one.  *@type is set to
 * PIDTYPE_PID when the private queue yielded a signal and to PIDTYPE_TGID
 * whenever the shared queue had to be consulted, including when nothing
 * was found.
 *
 * Returns the signal number, or 0 if no signal could be dequeued.
 *
 * All callers have to hold the siglock.  With CONFIG_POSIX_TIMERS the
 * siglock may be dropped and re-taken (interrupts stay disabled) while a
 * posix timer is re-armed.
 */
int dequeue_signal(struct task_struct *tsk, sigset_t *mask,
                   kernel_siginfo_t *info, enum pid_type *type)
{
        bool resched_timer = false;
        int signr;

        /* We only dequeue private signals from ourselves, we don't let
         * signalfd steal them
         */
        *type = PIDTYPE_PID;
        signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
        if (!signr) {
                *type = PIDTYPE_TGID;
                signr = __dequeue_signal(&tsk->signal->shared_pending,
                                         mask, info, &resched_timer);
#ifdef CONFIG_POSIX_TIMERS
                /*
                 * itimer signal ?
                 *
                 * itimers are process shared and we restart periodic
                 * itimers in the signal delivery path to prevent DoS
                 * attacks in the high resolution timer case. This is
                 * compliant with the old way of self-restarting
                 * itimers, as the SIGALRM is a legacy signal and only
                 * queued once. Changing the restart behaviour to
                 * restart the timer in the signal dequeue path is
                 * reducing the timer noise on heavy loaded !highres
                 * systems too.
                 */
                if (unlikely(signr == SIGALRM)) {
                        struct hrtimer *tmr = &tsk->signal->real_timer;

                        /* Only restart a periodic timer that is not queued. */
                        if (!hrtimer_is_queued(tmr) &&
                            tsk->signal->it_real_incr != 0) {
                                hrtimer_forward(tmr, tmr->base->get_time(),
                                                tsk->signal->it_real_incr);
                                hrtimer_restart(tmr);
                        }
                }
#endif
        }

        /* The pending sets may have changed; refresh current's TIF_SIGPENDING. */
        recalc_sigpending();
        if (!signr)
                return 0;

        if (unlikely(sig_kernel_stop(signr))) {
                /*
                 * Set a marker that we have dequeued a stop signal.  Our
                 * caller might release the siglock and then the pending
                 * stop signal it is about to process is no longer in the
                 * pending bitmasks, but must still be cleared by a SIGCONT
                 * (and overruled by a SIGKILL).  So those cases clear this
                 * shared flag after we've set it.  Note that this flag may
                 * remain set after the signal we return is ignored or
                 * handled.  That doesn't matter because its only purpose
                 * is to alert stop-signal processing code when another
                 * processor has come along and cleared the flag.
                 */
                current->jobctl |= JOBCTL_STOP_DEQUEUED;
        }
#ifdef CONFIG_POSIX_TIMERS
        if (resched_timer) {
                /*
                 * Release the siglock to ensure proper locking order
                 * of timer locks outside of siglocks.  Note, we leave
                 * irqs disabled here, since the posix-timers code is
                 * about to disable them again anyway.
                 */
                spin_unlock(&tsk->sighand->siglock);
                posixtimer_rearm(info);
                spin_lock(&tsk->sighand->siglock);

                /* Don't expose the si_sys_private value to userspace */
                info->si_sys_private = 0;
        }
#endif
        return signr;
}
EXPORT_SYMBOL_GPL(dequeue_signal);

/*
 * Dequeue the first queued synchronous signal from current's private
 * pending queue, copying its siginfo into @info.
 *
 * Returns the signal number, or 0 when no unblocked synchronous signal is
 * pending or no matching queue entry exists.  The pending bit is cleared
 * (followed by recalc_sigpending()) only if no later entry for the same
 * signal remains queued.
 */
static int dequeue_synchronous_signal(kernel_siginfo_t *info)
{
        struct task_struct *tsk = current;
        struct sigpending *pending = &tsk->pending;
        struct sigqueue *q, *sync = NULL, *again = NULL;

        /*
         * Might a synchronous signal be in the queue?
         */
        if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK))
                return 0;

        /*
         * Find the first synchronous signal in the queue.
         */
        list_for_each_entry(q, &pending->list, list) {
                /* Synchronous signals have a positive si_code */
                if ((q->info.si_code > SI_USER) &&
                    (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) {
                        sync = q;
                        break;
                }
        }
        if (!sync)
                return 0;

        /*
         * Continue behind it to see whether another siginfo for the same
         * signal is queued.
         */
        list_for_each_entry_continue(q, &pending->list, list) {
                if (q->info.si_signo == sync->info.si_signo) {
                        again = q;
                        break;
                }
        }

        if (!again) {
                sigdelset(&pending->signal, sync->info.si_signo);
                recalc_sigpending();
        }

        list_del_init(&sync->list);
        copy_siginfo(info, &sync->info);
        __sigqueue_free(sync);
        return info->si_signo;
}

/*
 * Tell a process that it has a new active signal.
 *
 * NOTE! We rely on the previous spin_lock to lock interrupts for us!
 * We can only be called with "siglock" held, and the local interrupt
 * must have been disabled when that got acquired!
 *
 * No need to set need_resched since signal event passing
 * goes through ->blocked
 */
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
        lockdep_assert_held(&t->sighand->siglock);

        set_tsk_thread_flag(t, TIF_SIGPENDING);

        /*
         * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
         * case. We don't check t->state here because there is a race with it
         * executing on another processor and just now entering stopped state.
         * By using wake_up_state, we ensure the process will wake up and
         * handle its death signal.
         */
        if (wake_up_state(t, state | TASK_INTERRUPTIBLE))
                return;

        /* Nothing was woken; kick the task instead. */
        kick_process(t);
}

/*
 * Remove signals in mask from the pending set and queue.
 * Does nothing when none of the signals in @mask is pending on @s.
 *
 * All callers must be holding the siglock.
 */
static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
{
        struct sigqueue *entry, *tmp;
        sigset_t hit;

        sigandsets(&hit, mask, &s->signal);
        if (sigisemptyset(&hit))
                return;

        sigandnsets(&s->signal, &s->signal, mask);

        list_for_each_entry_safe(entry, tmp, &s->list, list) {
                if (!sigismember(mask, entry->info.si_signo))
                        continue;

                list_del_init(&entry->list);
                __sigqueue_free(entry);
        }
}

/*
 * Return non-zero when @info is one of the special SEND_SIG_* sentinel
 * pointers rather than a real siginfo, decided by comparing the pointer
 * against SEND_SIG_PRIV.
 * NOTE(review): relies on every sentinel comparing <= SEND_SIG_PRIV; the
 * sentinel definitions are outside this chunk - confirm.
 */
static inline int is_si_special(const struct kernel_siginfo *info)
{
        return info <= SEND_SIG_PRIV;
}

static inline bool si_fromuser(const struct kernel_siginfo *info)
{
        return info == SEND_SIG_NOINFO ||
                (!is_si_special(info) && SI_FROMUSER(info));
}

/*
 * called with RCU read lock from check_kill_permission()
 */
static bool kill_ok_by_cred(struct task_struct *t)
{
        const struct cred *cred = current_cred();
        const struct cred *tcred = __task_cred(t);

        return uid_eq(cred->euid, tcred->suid) ||
               uid_eq(cred->euid, tcred->uid) ||
               uid_eq(cred->uid, tcred->suid) ||
               uid_eq(cred->uid, tcred->uid) ||
               ns_capable(tcred->user_ns, CAP_KILL);
}

/*
 * Bad permissions for sending the signal
 * - the caller must hold the RCU read lock
 */
static int check_kill_permission(int sig, struct kernel_siginfo *info,
                                 struct task_struct *t)
{
        struct pid *sid;
        int error;

        if (!valid_signal(sig))
                return -EINVAL;

        if (!si_fromuser(info))
                return 0;

        error = audit_signal_info(sig, t); /* Let audit system see the signal */
        if (error)
                return error;

        if (!same_thread_group(current, t) &&
            !kill_ok_by_cred(t)) {
                switch (sig) {
                case SIGCONT:
                        sid = task_session(t);
                        /*
                         * We don't return the error if sid == NULL. The
                         * task was unhashed, the caller must notice this.
                         */
                        if (!sid || sid == task_session(current))
                                break;
                        fallthrough;
                default:
                        return -EPERM;
                }
        }

        return security_task_kill(t, info, sig, NULL);
}

/**
 * ptrace_trap_notify - schedule trap to notify ptracer
 * @t: tracee wanting to notify tracer
 *
 * This function schedules sticky ptrace trap which is cleared on the next
 * TRAP_STOP to notify ptracer of an event.  @t must have been seized by
 * ptracer (a one-time warning is emitted otherwise).
 *
 * If @t is running, STOP trap will be taken.  If trapped for STOP and
 * ptracer is listening for events, tracee is woken up so that it can
 * re-trap for the new event.  If trapped otherwise, STOP trap will be
 * eventually taken without returning to userland after the existing traps
 * are finished by PTRACE_CONT.
 *
 * CONTEXT:
 * Must be called with @t->sighand->siglock held.
 */
static void ptrace_trap_notify(struct task_struct *t)
{
        WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
        lockdep_assert_held(&t->sighand->siglock);

        task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
        /* The second argument is non-zero only while the tracee is LISTENING. */
        ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
}

/*
 * Handle magic process-wide effects of stop/continue signals. Unlike
 * the signal actions, these happen immediately at signal-generation
 * time regardless of blocking, ignoring, or handling.  This does the
 * actual continuing for SIGCONT, but not the actual stopping for stop
 * signals. The process stop is done as a signal action for SIG_DFL.
 *
 * @sig:   signal being generated
 * @p:     task whose thread group the signal is generated for
 * @force: passed through to sig_ignored()
 *
 * Returns true if the signal should be actually delivered, otherwise
 * it should be dropped.
 */
static bool prepare_signal(int sig, struct task_struct *p, bool force)
{
        struct signal_struct *signal = p->signal;
        struct task_struct *t;
        sigset_t flush;

        if (signal->flags & SIGNAL_GROUP_EXIT) {
                /* While a core_state is set, only SIGKILL still gets through. */
                if (signal->core_state)
                        return sig == SIGKILL;
                /*
                 * The process is in the middle of dying, drop the signal.
                 */
                return false;
        } else if (sig_kernel_stop(sig)) {
                /*
                 * This is a stop signal.  Remove SIGCONT from all queues.
                 */
                siginitset(&flush, sigmask(SIGCONT));
                flush_sigqueue_mask(&flush, &signal->shared_pending);
                for_each_thread(p, t)
                        flush_sigqueue_mask(&flush, &t->pending);
        } else if (sig == SIGCONT) {
                unsigned int why;
                /*
                 * Remove all stop signals from all queues, wake all threads.
                 */
                siginitset(&flush, SIG_KERNEL_STOP_MASK);
                flush_sigqueue_mask(&flush, &signal->shared_pending);
                for_each_thread(p, t) {
                        flush_sigqueue_mask(&flush, &t->pending);
                        task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
                        /* Seized tracees get a trap notification instead of a plain wakeup. */
                        if (likely(!(t->ptrace & PT_SEIZED))) {
                                t->jobctl &= ~JOBCTL_STOPPED;
                                wake_up_state(t, __TASK_STOPPED);
                        } else
                                ptrace_trap_notify(t);
                }

                /*
                 * Notify the parent with CLD_CONTINUED if we were stopped.
                 *
                 * If we were in the middle of a group stop, we pretend it
                 * was already finished, and then continued. Since SIGCHLD
                 * doesn't queue we report only CLD_STOPPED, as if the next
                 * CLD_CONTINUED was dropped.
                 */
                why = 0;
                if (signal->flags & SIGNAL_STOP_STOPPED)
                        why |= SIGNAL_CLD_CONTINUED;
                else if (signal->group_stop_count)
                        why |= SIGNAL_CLD_STOPPED;

                if (why) {
                        /*
                         * The first thread which returns from do_signal_stop()
                         * will take ->siglock, notice SIGNAL_CLD_MASK, and
                         * notify its parent. See get_signal().
                         */
                        signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED);
                        signal->group_stop_count = 0;
                        signal->group_exit_code = 0;
                }
        }

        return !sig_ignored(p, sig, force);
}

/*
 * Test if P wants to take SIG.  After we've checked all threads with this,
 * it's equivalent to finding no threads not blocking SIG.  Any threads not
 * blocking SIG were ruled out because they are not running and already
 * have pending signals.  Such threads will dequeue from the shared queue
 * as soon as they're available, so putting the signal on the shared queue
 * will be equivalent to sending it to one such thread.
 */
static inline bool wants_signal(int sig, struct task_struct *p)
{
        /* A thread that blocks the signal or is exiting never takes it. */
        if (sigismember(&p->blocked, sig) || (p->flags & PF_EXITING))
                return false;

        /* SIGKILL is taken even by stopped or traced threads. */
        if (sig == SIGKILL)
                return true;

        if (task_is_stopped_or_traced(p))
                return false;

        return task_curr(p) || !task_sigpending(p);
}

/*
 * Finish generating @sig: choose a thread to wake up for it and, when the
 * signal is fatal to the whole group and needs no core dump, start the
 * group exit immediately.
 *
 * @sig:  signal that has just been marked pending
 * @p:    task the signal was sent to; it is tried first
 * @type: with PIDTYPE_PID only @p is considered; otherwise the other
 *        threads are scanned starting at signal->curr_target
 *
 * Returns without waking anybody when no thread wants the signal.
 */
static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
{
        struct signal_struct *signal = p->signal;
        struct task_struct *t;

        /*
         * Now find a thread we can wake up to take the signal off the queue.
         *
         * Try the suggested task first (may or may not be the main thread).
         */
        if (wants_signal(sig, p))
                t = p;
        else if ((type == PIDTYPE_PID) || thread_group_empty(p))
                /*
                 * There is just one thread and it does not need to be woken.
                 * It will dequeue unblocked signals before it runs again.
                 */
                return;
        else {
                /*
                 * Otherwise try to find a suitable thread.
                 */
                t = signal->curr_target;
                while (!wants_signal(sig, t)) {
                        t = next_thread(t);
                        if (t == signal->curr_target)
                                /*
                                 * No thread needs to be woken.
                                 * Any eligible threads will see
                                 * the signal in the queue soon.
                                 */
                                return;
                }
                /* Remember the chosen thread as the start of the next scan. */
                signal->curr_target = t;
        }

        /*
         * Found a killable thread.  If the signal will be fatal,
         * then start taking the whole group down immediately.
         */
        if (sig_fatal(p, sig) &&
            (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) &&
            !sigismember(&t->real_blocked, sig) &&
            (sig == SIGKILL || !p->ptrace)) {
                /*
                 * This signal will be fatal to the whole group.
                 */
                if (!sig_kernel_coredump(sig)) {
                        /*
                         * Start a group exit and wake everybody up.
                         * This way we don't have other threads
                         * running and doing things after a slower
                         * thread has the fatal signal pending.
                         */
                        signal->flags = SIGNAL_GROUP_EXIT;
                        signal->group_exit_code = sig;
                        signal->group_stop_count = 0;
                        /* Every thread gets SIGKILL on its private queue. */
                        __for_each_thread(signal, t) {
                                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                                sigaddset(&t->pending.signal, SIGKILL);
                                signal_wake_up(t, 1);
                        }
                        return;
                }
        }

        /*
         * The signal is already in the shared-pending queue.
         * Tell the chosen thread to wake up and dequeue it.
         */
        signal_wake_up(t, sig == SIGKILL);
        return;
}

static inline bool legacy_queue(struct sigpending *signals, int sig)
{
        return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}

/*
 * Generate @sig for @t: queue its siginfo where possible, mark it pending
 * and hand over to complete_signal() to wake a thread.
 *
 * @sig:   signal number
 * @info:  siginfo to queue, or one of SEND_SIG_NOINFO / SEND_SIG_PRIV, for
 *         which a siginfo is synthesised here
 * @t:     target task
 * @type:  PIDTYPE_PID queues on @t's private pending list, anything else
 *         on the shared pending list of its signal struct
 * @force: passed through to prepare_signal()
 *
 * Must be called with @t->sighand->siglock held.
 *
 * Returns 0, including when the signal is dropped by prepare_signal() or
 * is a non-realtime signal that is already pending, or -EAGAIN when no
 * queue entry could be allocated for a realtime signal carrying a real
 * siginfo whose si_code is not SI_USER.
 */
static int __send_signal_locked(int sig, struct kernel_siginfo *info,
                                struct task_struct *t, enum pid_type type, bool force)
{
        struct sigpending *pending;
        struct sigqueue *q;
        int override_rlimit;
        int ret = 0, result;

        lockdep_assert_held(&t->sighand->siglock);

        /* 'result' is only reported to the tracepoint at the end. */
        result = TRACE_SIGNAL_IGNORED;
        if (!prepare_signal(sig, t, force))
                goto ret;

        pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
        /*
         * Short-circuit ignored signals and support queuing
         * exactly one non-rt signal, so that we can get more
         * detailed information about the cause of the signal.
         */
        result = TRACE_SIGNAL_ALREADY_PENDING;
        if (legacy_queue(pending, sig))
                goto ret;

        result = TRACE_SIGNAL_DELIVERED;
        /*
         * Skip useless siginfo allocation for SIGKILL and kernel threads.
         */
        if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
                goto out_set;

        /*
         * Real-time signals must be queued if sent by sigqueue, or
         * some other real-time mechanism.  It is implementation
         * defined whether kill() does so.  We attempt to do so, on
         * the principle of least surprise, but since kill is not
         * allowed to fail with EAGAIN when low on memory we just
         * make sure at least one signal gets delivered and don't
         * pass on the info struct.
         */
        if (sig < SIGRTMIN)
                override_rlimit = (is_si_special(info) || info->si_code >= 0);
        else
                override_rlimit = 0;

        q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit, 0);

        if (q) {
                list_add_tail(&q->list, &pending->list);
                /* Fill in the queued siginfo according to what the caller passed. */
                switch ((unsigned long) info) {
                case (unsigned long) SEND_SIG_NOINFO:
                        clear_siginfo(&q->info);
                        q->info.si_signo = sig;
                        q->info.si_errno = 0;
                        q->info.si_code = SI_USER;
                        q->info.si_pid = task_tgid_nr_ns(current,
                                                        task_active_pid_ns(t));
                        rcu_read_lock();
                        q->info.si_uid =
                                from_kuid_munged(task_cred_xxx(t, user_ns),
                                                 current_uid());
                        rcu_read_unlock();
                        break;
                case (unsigned long) SEND_SIG_PRIV:
                        clear_siginfo(&q->info);
                        q->info.si_signo = sig;
                        q->info.si_errno = 0;
                        q->info.si_code = SI_KERNEL;
                        q->info.si_pid = 0;
                        q->info.si_uid = 0;
                        break;
                default:
                        copy_siginfo(&q->info, info);
                        break;
                }
        } else if (!is_si_special(info) &&
                   sig >= SIGRTMIN && info->si_code != SI_USER) {
                /*
                 * Queue overflow, abort.  We may abort if the
                 * signal was rt and sent by user using something
                 * other than kill().
                 */
                result = TRACE_SIGNAL_OVERFLOW_FAIL;
                ret = -EAGAIN;
                goto ret;
        } else {
                /*
                 * This is a silent loss of information.  We still
                 * send the signal, but the *info bits are lost.
                 */
                result = TRACE_SIGNAL_LOSE_INFO;
        }

out_set:
        signalfd_notify(t, sig);
        sigaddset(&pending->signal, sig);

        /* Let multiprocess signals appear after on-going forks */
        if (type > PIDTYPE_TGID) {
                struct multiprocess_signals *delayed;
                hlist_for_each_entry(delayed, &t->signal->multiprocess, node) {
                        sigset_t *signal = &delayed->signal;
                        /* Can't queue both a stop and a continue signal */
                        if (sig == SIGCONT)
                                sigdelsetmask(signal, SIG_KERNEL_STOP_MASK);
                        else if (sig_kernel_stop(sig))
                                sigdelset(signal, SIGCONT);
                        sigaddset(signal, sig);
                }
        }

        complete_signal(sig, t, type);
ret:
        trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result);
        return ret;
}

/*
 * Tell whether the siginfo layout chosen by (si_signo, si_code) is one of
 * SIL_KILL, SIL_CHLD or SIL_RT; every other layout yields false.
 *
 * All layouts are spelled out explicitly, without a default label.
 */
static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
{
        switch (siginfo_layout(info->si_signo, info->si_code)) {
        case SIL_KILL:
        case SIL_CHLD:
        case SIL_RT:
                return true;
        case SIL_TIMER:
        case SIL_POLL:
        case SIL_FAULT:
        case SIL_FAULT_TRAPNO:
        case SIL_FAULT_MCEERR:
        case SIL_FAULT_BNDERR:
        case SIL_FAULT_PKUERR:
        case SIL_FAULT_PERF_EVENT:
        case SIL_SYS:
                return false;
        }

        /* None of the listed layouts matched. */
        return false;
}

int send_signal_locked(int sig, struct kernel_siginfo *info,
                       struct task_struct *t, enum pid_type type)
{
        /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
        bool force = false;

        if (info == SEND_SIG_NOINFO) {
                /* Force if sent from an ancestor pid namespace */
                force = !task_pid_nr_ns(current, task_active_pid_ns(t));
        } else if (info == SEND_SIG_PRIV) {
                /* Don't ignore kernel generated signals */
                force = true;
        } else if (has_si_pid_and_uid(info)) {
                /* SIGKILL and SIGSTOP is special or has ids */
                struct user_namespace *t_user_ns;

                rcu_read_lock();
                t_user_ns = task_cred_xxx(t, user_ns);
                if (current_user_ns() != t_user_ns) {
                        kuid_t uid = make_kuid(current_user_ns(), info->si_uid);
                        info->si_uid = from_kuid_munged(t_user_ns, uid);
                }
                rcu_read_unlock();

                /* A kernel generated signal? */
                force = (info->si_code == SI_KERNEL);

                /* From an ancestor pid namespace? */
                if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
                        info->si_pid = 0;
                        force = true;
                }
        }
        return __send_signal_locked(sig, info, t, type, force);
}

/*
 * Log that the current task received fatal signal @signr, prefixed with
 * its executable file when one can be obtained, then dump the user
 * registers.  On i386 (non-UML) builds up to 16 bytes of code at the
 * saved instruction pointer are printed as well.
 */
static void print_fatal_signal(int signr)
{
        struct pt_regs *regs = task_pt_regs(current);
        struct file *exe_file = get_task_exe_file(current);

        if (!exe_file) {
                pr_info("%s: potentially unexpected fatal signal %d.\n",
                        current->comm, signr);
        } else {
                pr_info("%pD: %s: potentially unexpected fatal signal %d.\n",
                        exe_file, current->comm, signr);
                /* Drop the reference taken by get_task_exe_file() */
                fput(exe_file);
        }

#if defined(__i386__) && !defined(__arch_um__)
        pr_info("code at %08lx: ", regs->ip);
        {
                int off;

                for (off = 0; off < 16; off++) {
                        unsigned char insn;

                        /* Stop at the first byte that cannot be read */
                        if (get_user(insn, (unsigned char *)(regs->ip + off)))
                                break;
                        pr_cont("%02x ", insn);
                }
        }
        pr_cont("\n");
#endif
        preempt_disable();
        show_regs(regs);
        preempt_enable();
}

/*
 * Handler for the "print-fatal-signals=" boot parameter: passes @str to
 * get_option() with print_fatal_signals as the destination.  Always
 * returns 1.
 */
static int __init setup_print_fatal_signals(char *str)
{
        get_option(&str, &print_fatal_signals);
        return 1;
}

__setup("print-fatal-signals=", setup_print_fatal_signals);

/*
 * Take @p's sighand lock and send @sig via send_signal_locked().
 *
 * Returns -ESRCH when the sighand lock cannot be taken, otherwise the
 * result of send_signal_locked().
 */
int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p,
                        enum pid_type type)
{
        unsigned long flags;
        int ret;

        if (!lock_task_sighand(p, &flags))
                return -ESRCH;

        ret = send_signal_locked(sig, info, p, type);
        unlock_task_sighand(p, &flags);

        return ret;
}

/* Handler policy passed to force_sig_info_to_task(). */
enum sig_handler {
        /* If reachable use the current handler */
        HANDLER_CURRENT,
        /* Always use SIG_DFL handler semantics */
        HANDLER_SIG_DFL,
        /* Only visible as the process exit code */
        HANDLER_EXIT,
};

/*
 * Force a signal that the process can't ignore: if necessary
 * we unblock the signal and change any SIG_IGN to SIG_DFL.
 *
 * @info:    siginfo to deliver; the signal number is info->si_signo.
 * @t:       task the signal is sent to (as PIDTYPE_PID).
 * @handler: HANDLER_CURRENT keeps the installed handler unless the signal
 *           is blocked or ignored; any other value always resets the
 *           action to SIG_DFL, and HANDLER_EXIT additionally sets
 *           SA_IMMUTABLE on the action.
 *
 * Note: If we unblock the signal, we always reset it to SIG_DFL,
 * since we do not want to have a signal handler that was blocked
 * be invoked when user space had explicitly blocked it.
 *
 * We don't want to have recursive SIGSEGV's etc, for example,
 * that is why we also clear SIGNAL_UNKILLABLE.
 *
 * Takes and releases t->sighand->siglock with interrupts saved/restored.
 * Returns the result of send_signal_locked().
 */
static int
force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
        enum sig_handler handler)
{
        unsigned long int flags;
        int ret, blocked, ignored;
        struct k_sigaction *action;
        int sig = info->si_signo;

        spin_lock_irqsave(&t->sighand->siglock, flags);
        /* The action table is indexed by signal number minus one */
        action = &t->sighand->action[sig-1];
        ignored = action->sa.sa_handler == SIG_IGN;
        blocked = sigismember(&t->blocked, sig);
        if (blocked || ignored || (handler != HANDLER_CURRENT)) {
                action->sa.sa_handler = SIG_DFL;
                if (handler == HANDLER_EXIT)
                        action->sa.sa_flags |= SA_IMMUTABLE;
                if (blocked)
                        sigdelset(&t->blocked, sig);
        }
        /*
         * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
         * debugging to leave init killable. But HANDLER_EXIT is always fatal.
         */
        if (action->sa.sa_handler == SIG_DFL &&
            (!t->ptrace || (handler == HANDLER_EXIT)))
                t->signal->flags &= ~SIGNAL_UNKILLABLE;
        ret = send_signal_locked(sig, info, t, PIDTYPE_PID);
        /* This can happen if the signal was already pending and blocked */
        if (!task_sigpending(t))
                signal_wake_up(t, 0);
        spin_unlock_irqrestore(&t->sighand->siglock, flags);

        return ret;
}

/*
 * Force @info onto the current task, keeping the installed handler where
 * force_sig_info_to_task() allows it (HANDLER_CURRENT).
 */
int force_sig_info(struct kernel_siginfo *info)
{
        struct task_struct *self = current;

        return force_sig_info_to_task(info, self, HANDLER_CURRENT);
}

/*
 * Nuke all other threads in the group.
 *
 * Resets the group stop count, clears pending job control state on every
 * other thread and marks SIGKILL pending on those that have not exited
 * yet.  Returns the number of other threads visited, dead ones included.
 */
int zap_other_threads(struct task_struct *p)
{
        struct task_struct *thread;
        int nr_others = 0;

        p->signal->group_stop_count = 0;

        for_other_threads(p, thread) {
                nr_others++;
                task_clear_jobctl_pending(thread, JOBCTL_PENDING_MASK);

                /* Don't bother with already dead threads */
                if (!thread->exit_state) {
                        sigaddset(&thread->pending.signal, SIGKILL);
                        signal_wake_up(thread, 1);
                }
        }

        return nr_others;
}

/*
 * Look up @tsk's sighand and take its ->siglock with interrupts disabled.
 *
 * Returns the locked sighand, with the saved interrupt state stored in
 * *@flags, or NULL (no lock held) if tsk->sighand was observed to be NULL.
 * The lookup and the re-check run under rcu_read_lock(), which is dropped
 * before returning.
 */
struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
                                           unsigned long *flags)
{
        struct sighand_struct *sighand;

        rcu_read_lock();
        for (;;) {
                sighand = rcu_dereference(tsk->sighand);
                if (unlikely(sighand == NULL))
                        break;

                /*
                 * This sighand can be already freed and even reused, but
                 * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
                 * initializes ->siglock: this slab can't go away, it has
                 * the same object type, ->siglock can't be reinitialized.
                 *
                 * We need to ensure that tsk->sighand is still the same
                 * after we take the lock, we can race with de_thread() or
                 * __exit_signal(). In the latter case the next iteration
                 * must see ->sighand == NULL.
                 */
                spin_lock_irqsave(&sighand->siglock, *flags);
                if (likely(sighand == rcu_access_pointer(tsk->sighand)))
                        break;
                /* Pointer changed while we were locking: drop and retry */
                spin_unlock_irqrestore(&sighand->siglock, *flags);
        }
        rcu_read_unlock();

        return sighand;
}

#ifdef CONFIG_LOCKDEP
/*
 * Lockdep assertion that @task's sighand ->siglock is held.  Warns once
 * if the task has no sighand at all.
 */
void lockdep_assert_task_sighand_held(struct task_struct *task)
{
        struct sighand_struct *sh;

        rcu_read_lock();
        sh = rcu_dereference(task->sighand);
        if (!sh)
                WARN_ON_ONCE(1);
        else
                lockdep_assert_held(&sh->siglock);
        rcu_read_unlock();
}
#endif

/*
 * send signal info to all the members of a thread group or to the
 * individual thread if type == PIDTYPE_PID.
 *
 * The permission check runs under rcu_read_lock().  A failed check, or a
 * zero @sig, returns the check's result without sending anything.
 */
int group_send_sig_info(int sig, struct kernel_siginfo *info,
                        struct task_struct *p, enum pid_type type)
{
        int ret;

        rcu_read_lock();
        ret = check_kill_permission(sig, info, p);
        rcu_read_unlock();

        if (ret || !sig)
                return ret;

        return do_send_sig_info(sig, info, p, type);
}

/*
 * __kill_pgrp_info() sends a signal to a process group: this is what the tty
 * control characters do (^C, ^Z etc)
 * - the caller must hold at least a readlock on tasklist_lock
 *
 * Returns 0 once any member was signalled successfully, otherwise the
 * last error seen, or -ESRCH for an empty group.
 */
int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
        struct task_struct *member = NULL;
        int status = -ESRCH;

        do_each_pid_task(pgrp, PIDTYPE_PGID, member) {
                int rc = group_send_sig_info(sig, info, member, PIDTYPE_PGID);

                /*
                 * The first success pins status at 0; until then keep
                 * tracking the most recent result.
                 */
                if (status != 0)
                        status = rc;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, member);

        return status;
}

static int kill_pid_info_type(int sig, struct kernel_siginfo *info,
                                struct pid *pid, enum pid_type type)
{
        int error = -ESRCH;
        struct task_struct *p;

        for (;;) {
                rcu_read_lock();
                p = pid_task(pid, PIDTYPE_PID);
                if (p)
                        error = group_send_sig_info(sig, info, p, type);
                rcu_read_unlock();
                if (likely(!p || error != -ESRCH))
                        return error;
                /*
                 * The task was unhashed in between, try again.  If it
                 * is dead, pid_task() will return NULL, if we race with
                 * de_thread() it will find the new leader.
                 */
        }
}

int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
{
        return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID);
}

/*
 * Resolve the numeric @pid with find_vpid() under rcu_read_lock() and
 * send @sig to it through kill_pid_info().
 */
static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
        int ret;

        rcu_read_lock();
        ret = kill_pid_info(sig, info, find_vpid(pid));
        rcu_read_unlock();

        return ret;
}

static inline bool kill_as_cred_perm(const struct cred *cred,
                                     struct task_struct *target)
{
        const struct cred *pcred = __task_cred(target);

        return uid_eq(cred->euid, pcred->suid) ||
               uid_eq(cred->euid, pcred->uid) ||
               uid_eq(cred->uid, pcred->suid) ||
               uid_eq(cred->uid, pcred->uid);
}

/*
 * The usb asyncio usage of siginfo is wrong.  The glibc support
 * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT.
 * AKA after the generic fields:
 *        kernel_pid_t        si_pid;
 *        kernel_uid32_t        si_uid;
 *        sigval_t        si_value;
 *
 * Unfortunately when usb generates SI_ASYNCIO it assumes the layout
 * after the generic fields is:
 *        void __user         *si_addr;
 *
 * This is a practical problem when there is a 64bit big endian kernel
 * and a 32bit userspace.  As the 32bit address will be encoded in the low
 * 32bits of the pointer.  Those low 32bits will be stored at a higher
 * address than they appear in a 32 bit pointer.  So userspace will not
 * see the address it was expecting for its completions.
 *
 * There is nothing in the encoding that can allow
 * copy_siginfo_to_user32 to detect this confusion of formats, so
 * handle this by requiring the caller of kill_pid_usb_asyncio to
 * notice when this situation takes place and to store the 32bit
 * pointer in sival_int, instead of sival_addr of the sigval_t addr
 * parameter.
 *
 * Returns -EINVAL for an invalid @sig, -ESRCH if @pid has no task or its
 * sighand cannot be locked, -EPERM if @cred fails kill_as_cred_perm(),
 * a non-zero result from security_task_kill(), and otherwise the result
 * of __send_signal_locked() (or 0 when @sig is 0 and nothing is sent).
 */
int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
                         struct pid *pid, const struct cred *cred)
{
        struct kernel_siginfo info;
        struct task_struct *p;
        unsigned long flags;
        int ret = -EINVAL;

        if (!valid_signal(sig))
                return ret;

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_errno = errno;
        info.si_code = SI_ASYNCIO;
        /* Store @addr starting at the si_pid slot, as described above */
        *((sigval_t *)&info.si_pid) = addr;

        rcu_read_lock();
        p = pid_task(pid, PIDTYPE_PID);
        if (!p) {
                ret = -ESRCH;
                goto out_unlock;
        }
        if (!kill_as_cred_perm(cred, p)) {
                ret = -EPERM;
                goto out_unlock;
        }
        ret = security_task_kill(p, &info, sig, cred);
        if (ret)
                goto out_unlock;

        /* sig == 0 only performs the checks above */
        if (sig) {
                if (lock_task_sighand(p, &flags)) {
                        ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false);
                        unlock_task_sighand(p, &flags);
                } else
                        ret = -ESRCH;
        }
out_unlock:
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio);

/*
 * kill_something_info() interprets pid in interesting ways just like kill(2).
 *
 *   pid > 0   signal that single process
 *   pid == 0  signal the caller's process group
 *   pid == -1 signal every process with a virtual pid above 1 that is not
 *             in the caller's thread group
 *   pid < -1  signal the process group -pid
 *
 * POSIX specifies that kill(-1,sig) is unspecified, but what we have
 * is probably wrong.  Should make it like BSD or SYSV.
 */

static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
        int ret;

        if (pid > 0)
                return kill_proc_info(sig, info, pid);

        /* -INT_MIN is undefined.  Exclude this case to avoid a UBSAN warning */
        if (pid == INT_MIN)
                return -ESRCH;

        read_lock(&tasklist_lock);
        if (pid == -1) {
                struct task_struct *p;
                int last_err = 0, targets = 0;

                for_each_process(p) {
                        int err;

                        if (task_pid_vnr(p) <= 1 ||
                            same_thread_group(p, current))
                                continue;

                        err = group_send_sig_info(sig, info, p, PIDTYPE_MAX);
                        targets++;
                        /* -EPERM never overrides an earlier result */
                        if (err != -EPERM)
                                last_err = err;
                }
                ret = targets ? last_err : -ESRCH;
        } else {
                struct pid *pgrp = pid ? find_vpid(-pid) : task_pgrp(current);

                ret = __kill_pgrp_info(sig, info, pgrp);
        }
        read_unlock(&tasklist_lock);

        return ret;
}

/*
 * These are for backward compatibility with the rest of the kernel source.
 */

/*
 * Send @sig with @info to the single task @p (PIDTYPE_PID).
 * Returns -EINVAL for an invalid signal number, otherwise the result of
 * do_send_sig_info().
 */
int send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p)
{
        /*
         * Make sure legacy kernel users don't send in bad values
         * (normal paths check this in check_kill_permission).
         */
        if (valid_signal(sig))
                return do_send_sig_info(sig, info, p, PIDTYPE_PID);

        return -EINVAL;
}
EXPORT_SYMBOL(send_sig_info);

/* SEND_SIG_PRIV for a non-zero @priv, SEND_SIG_NOINFO otherwise. */
#define __si_special(priv)        ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)

/*
 * Send @sig to @p without a real siginfo: a non-zero @priv selects
 * SEND_SIG_PRIV, zero selects SEND_SIG_NOINFO.
 */
int send_sig(int sig, struct task_struct *p, int priv)
{
        struct kernel_siginfo *info = __si_special(priv);

        return send_sig_info(sig, info, p);
}
EXPORT_SYMBOL(send_sig);

/*
 * Force @sig on the current task with an SI_KERNEL siginfo whose pid and
 * uid are zero.
 */
void force_sig(int sig)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_code = SI_KERNEL;
        info.si_signo = sig;
        info.si_errno = 0;
        info.si_uid = 0;
        info.si_pid = 0;

        force_sig_info(&info);
}
EXPORT_SYMBOL(force_sig);

/*
 * Force @sig on the current task with an SI_KERNEL siginfo, asking
 * force_sig_info_to_task() for SIG_DFL semantics (HANDLER_SIG_DFL).
 */
void force_fatal_sig(int sig)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_code = SI_KERNEL;
        info.si_signo = sig;
        info.si_errno = 0;
        info.si_uid = 0;
        info.si_pid = 0;

        force_sig_info_to_task(&info, current, HANDLER_SIG_DFL);
}

/*
 * Force @sig on the current task with an SI_KERNEL siginfo, using the
 * HANDLER_EXIT policy of force_sig_info_to_task().
 */
void force_exit_sig(int sig)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_code = SI_KERNEL;
        info.si_signo = sig;
        info.si_errno = 0;
        info.si_uid = 0;
        info.si_pid = 0;

        force_sig_info_to_task(&info, current, HANDLER_EXIT);
}

/*
 * When things go south during signal handling, we
 * will force a SIGSEGV. And if the signal that caused
 * the problem was already a SIGSEGV, we'll want to
 * make sure we don't even try to deliver the signal..
 */
void force_sigsegv(int sig)
{
        if (sig != SIGSEGV) {
                force_sig(SIGSEGV);
                return;
        }

        /* The failing signal was SIGSEGV itself: use the fatal variant */
        force_fatal_sig(SIGSEGV);
}

/*
 * Force a fault-style signal (@sig, @code, fault address @addr) on task
 * @t with the HANDLER_CURRENT policy.  Returns the result of
 * force_sig_info_to_task().
 */
int force_sig_fault_to_task(int sig, int code, void __user *addr,
                            struct task_struct *t)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_addr  = addr;
        info.si_code  = code;
        info.si_errno = 0;
        info.si_signo = sig;

        return force_sig_info_to_task(&info, t, HANDLER_CURRENT);
}

/* force_sig_fault_to_task() applied to the current task. */
int force_sig_fault(int sig, int code, void __user *addr)
{
        struct task_struct *self = current;

        return force_sig_fault_to_task(sig, code, addr, self);
}

/*
 * Send (rather than force) a fault-style signal (@sig, @code, fault
 * address @addr) to task @t.  Returns the result of send_sig_info().
 */
int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_addr  = addr;
        info.si_code  = code;
        info.si_errno = 0;
        info.si_signo = sig;

        return send_sig_info(sig, &info, t);
}

/*
 * Force a SIGBUS carrying @code, fault address @addr and address LSB
 * @lsb on the current task.  Warns when @code is neither BUS_MCEERR_AO
 * nor BUS_MCEERR_AR.
 */
int force_sig_mceerr(int code, void __user *addr, short lsb)
{
        struct kernel_siginfo info;

        WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));

        clear_siginfo(&info);
        info.si_addr_lsb = lsb;
        info.si_addr = addr;
        info.si_code = code;
        info.si_errno = 0;
        info.si_signo = SIGBUS;

        return force_sig_info(&info);
}

/*
 * Send a SIGBUS carrying @code, fault address @addr and address LSB @lsb
 * to task @t.  Warns when @code is neither BUS_MCEERR_AO nor
 * BUS_MCEERR_AR.
 */
int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
{
        struct kernel_siginfo info;

        WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));

        clear_siginfo(&info);
        info.si_addr_lsb = lsb;
        info.si_addr = addr;
        info.si_code = code;
        info.si_errno = 0;
        info.si_signo = SIGBUS;

        return send_sig_info(SIGBUS, &info, t);
}
EXPORT_SYMBOL(send_sig_mceerr);

/*
 * Force a SIGSEGV with code SEGV_BNDERR on the current task, reporting
 * the fault address @addr and the bounds @lower/@upper.
 */
int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_upper = upper;
        info.si_lower = lower;
        info.si_addr  = addr;
        info.si_code  = SEGV_BNDERR;
        info.si_errno = 0;
        info.si_signo = SIGSEGV;

        return force_sig_info(&info);
}

#ifdef SEGV_PKUERR
/*
 * Force a SIGSEGV with code SEGV_PKUERR on the current task, reporting
 * the fault address @addr and the key @pkey.
 */
int force_sig_pkuerr(void __user *addr, u32 pkey)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_pkey  = pkey;
        info.si_addr  = addr;
        info.si_code  = SEGV_PKUERR;
        info.si_errno = 0;
        info.si_signo = SIGSEGV;

        return force_sig_info(&info);
}
#endif

/*
 * Send a SIGTRAP with code TRAP_PERF to the current task, carrying @addr,
 * the perf @type and @sig_data.  Returns the result of send_sig_info().
 */
int send_sig_perf(void __user *addr, u32 type, u64 sig_data)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo     = SIGTRAP;
        info.si_errno     = 0;
        info.si_code      = TRAP_PERF;
        info.si_addr      = addr;
        info.si_perf_type = type;
        info.si_perf_data = sig_data;

        /*
         * Signals generated by perf events should not terminate the whole
         * process if SIGTRAP is blocked, however, delivering the signal
         * asynchronously is better than not delivering at all. But tell user
         * space if the signal was asynchronous, so it can clearly be
         * distinguished from normal synchronous ones.
         */
        if (sigismember(&current->blocked, info.si_signo))
                info.si_perf_flags = TRAP_PERF_FLAG_ASYNC;
        else
                info.si_perf_flags = 0;

        return send_sig_info(info.si_signo, &info, current);
}

/**
 * force_sig_seccomp - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 * @force_coredump: true to trigger a coredump
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 * The handler policy is HANDLER_EXIT when @force_coredump is set and
 * HANDLER_CURRENT otherwise.
 */
int force_sig_seccomp(int syscall, int reason, bool force_coredump)
{
        struct kernel_siginfo info;
        enum sig_handler policy;

        clear_siginfo(&info);
        info.si_signo = SIGSYS;
        info.si_code = SYS_SECCOMP;
        info.si_call_addr = (void __user *)KSTK_EIP(current);
        info.si_errno = reason;
        info.si_arch = syscall_get_arch(current);
        info.si_syscall = syscall;

        if (force_coredump)
                policy = HANDLER_EXIT;
        else
                policy = HANDLER_CURRENT;

        return force_sig_info_to_task(&info, current, policy);
}

/*
 * For the crazy architectures that include trap information in
 * the errno field, instead of an actual errno value.
 *
 * Forces a SIGTRAP with code TRAP_HWBKPT on the current task, with
 * @errno stored in si_errno and @addr in si_addr.
 */
int force_sig_ptrace_errno_trap(int errno, void __user *addr)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_addr  = addr;
        info.si_code  = TRAP_HWBKPT;
        info.si_errno = errno;
        info.si_signo = SIGTRAP;

        return force_sig_info(&info);
}

/*
 * For the rare architectures that include trap information using
 * si_trapno.
 *
 * Forces (@sig, @code) on the current task with @addr in si_addr and
 * @trapno in si_trapno.
 */
int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_trapno = trapno;
        info.si_addr  = addr;
        info.si_code  = code;
        info.si_errno = 0;
        info.si_signo = sig;

        return force_sig_info(&info);
}

/*
 * For the rare architectures that include trap information using
 * si_trapno.
 *
 * Sends (@sig, @code) to task @t with @addr in si_addr and @trapno in
 * si_trapno.
 */
int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
                          struct task_struct *t)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_trapno = trapno;
        info.si_addr  = addr;
        info.si_code  = code;
        info.si_errno = 0;
        info.si_signo = sig;

        return send_sig_info(sig, &info, t);
}

/* __kill_pgrp_info() wrapped in a read lock on tasklist_lock. */
static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
        int status;

        read_lock(&tasklist_lock);
        status = __kill_pgrp_info(sig, info, pgrp);
        read_unlock(&tasklist_lock);

        return status;
}

/*
 * Signal the process group @pid with @sig; a non-zero @priv selects
 * SEND_SIG_PRIV, zero selects SEND_SIG_NOINFO.
 */
int kill_pgrp(struct pid *pid, int sig, int priv)
{
        struct kernel_siginfo *info = __si_special(priv);

        return kill_pgrp_info(sig, info, pid);
}
EXPORT_SYMBOL(kill_pgrp);

/*
 * Signal the task behind @pid with @sig; a non-zero @priv selects
 * SEND_SIG_PRIV, zero selects SEND_SIG_NOINFO.
 */
int kill_pid(struct pid *pid, int sig, int priv)
{
        struct kernel_siginfo *info = __si_special(priv);

        return kill_pid_info(sig, info, pid);
}
EXPORT_SYMBOL(kill_pid);

/*
 * These functions support sending signals using preallocated sigqueue
 * structures.  This is needed "because realtime applications cannot
 * afford to lose notifications of asynchronous events, like timer
 * expirations or I/O completions".  In the case of POSIX Timers
 * we allocate the sigqueue structure from the timer_create.  If this
 * allocation fails we are able to report the failure to the application
 * with an EAGAIN error.
 */
struct sigqueue *sigqueue_alloc(void)
{
        struct sigqueue *q;

        /* Allocate for the current task, flagged as preallocated */
        q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC);

        return q;
}

void sigqueue_free(struct sigqueue *q)
{
        unsigned long flags;
        spinlock_t *lock = &current->sighand->siglock;

        BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
        /*
         * We must hold ->siglock while testing q->list
         * to serialize with collect_signal() or with
         * __exit_signal()->flush_sigqueue().
         */
        spin_lock_irqsave(lock, flags);
        q->flags &= ~SIGQUEUE_PREALLOC;
        /*
         * If it is queued it will be freed when dequeued,
         * like the "regular" sigqueue.
         */
        if (!list_empty(&q->list))
                q = NULL;
        spin_unlock_irqrestore(lock, flags);

        if (q)
                __sigqueue_free(q);
}

/*
 * Queue the preallocated sigqueue @q to the task found via @pid/@type.
 *
 * Returns -1 if no task is found or its sighand cannot be locked, 1 if
 * prepare_signal() reports the signal as ignored, and 0 otherwise (the
 * entry was queued, or it was already queued and only its overrun count
 * was incremented).
 */
int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
{
        int sig = q->info.si_signo;
        struct sigpending *pending;
        struct task_struct *t;
        unsigned long flags;
        int ret, result;

        BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));

        ret = -1;
        rcu_read_lock();

        /*
         * This function is used by POSIX timers to deliver a timer signal.
         * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID
         * set), the signal must be delivered to the specific thread (queues
         * into t->pending).
         *
         * Where type is not PIDTYPE_PID, signals must be delivered to the
         * process. In this case, prefer to deliver to current if it is in
         * the same thread group as the target process, which avoids
         * unnecessarily waking up a potentially idle task.
         */
        t = pid_task(pid, type);
        if (!t)
                goto ret;
        if (type != PIDTYPE_PID && same_thread_group(t, current))
                t = current;
        if (!likely(lock_task_sighand(t, &flags)))
                goto ret;

        ret = 1; /* the signal is ignored */
        result = TRACE_SIGNAL_IGNORED;
        if (!prepare_signal(sig, t, false))
                goto out;

        ret = 0;
        if (unlikely(!list_empty(&q->list))) {
                /*
                 * If an SI_TIMER entry is already queued just increment
                 * the overrun count.
                 */
                BUG_ON(q->info.si_code != SI_TIMER);
                q->info.si_overrun++;
                result = TRACE_SIGNAL_ALREADY_PENDING;
                goto out;
        }
        q->info.si_overrun = 0;

        signalfd_notify(t, sig);
        /* Process-wide signals go on the shared list, others on t's own */
        pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
        list_add_tail(&q->list, &pending->list);
        sigaddset(&pending->signal, sig);
        complete_signal(sig, t, type);
        result = TRACE_SIGNAL_DELIVERED;
out:
        trace_signal_generate(sig, &q->info, t, type != PIDTYPE_PID, result);
        unlock_task_sighand(t, &flags);
ret:
        rcu_read_unlock();
        return ret;
}

/*
 * Wake the waiters on @task's pid ->wait_pidfd queue with an
 * EPOLLIN | EPOLLRDNORM key.  Warns if @task has no exit_state set.
 */
void do_notify_pidfd(struct task_struct *task)
{
        struct pid *task_pid_ref = task_pid(task);

        WARN_ON(task->exit_state == 0);

        __wake_up(&task_pid_ref->wait_pidfd, TASK_NORMAL, 0,
                  poll_to_key(EPOLLIN | EPOLLRDNORM));
}

/*
 * Let a parent know about the death of a child.
 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
 *
 * @tsk: the exiting task; the notification goes to tsk->parent.
 * @sig: signal to send.  It is replaced by SIGCHLD when it is not SIGCHLD
 *       and tsk->parent_exec_id no longer matches the parent's
 *       self_exec_id, and it is dropped (set to 0) when @tsk is not
 *       ptraced and the parent's SIGCHLD handler is SIG_IGN.
 *
 * Returns true if our parent ignored us and so we've switched to
 * self-reaping.
 */
bool do_notify_parent(struct task_struct *tsk, int sig)
{
        struct kernel_siginfo info;
        unsigned long flags;
        struct sighand_struct *psig;
        bool autoreap = false;
        u64 utime, stime;

        WARN_ON_ONCE(sig == -1);

        /* do_notify_parent_cldstop should have been called instead.  */
        WARN_ON_ONCE(task_is_stopped_or_traced(tsk));

        WARN_ON_ONCE(!tsk->ptrace &&
               (tsk->group_leader != tsk || !thread_group_empty(tsk)));
        /*
         * tsk is a group leader and has no threads, wake up the
         * non-PIDFD_THREAD waiters.
         */
        if (thread_group_empty(tsk))
                do_notify_pidfd(tsk);

        if (sig != SIGCHLD) {
                /*
                 * This is only possible if parent == real_parent.
                 * Check if it has changed security domain.
                 */
                if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id))
                        sig = SIGCHLD;
        }

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_errno = 0;
        /*
         * We are under tasklist_lock here so our parent is tied to
         * us and cannot change.
         *
         * task_active_pid_ns will always return the same pid namespace
         * until a task passes through release_task.
         *
         * write_lock() currently calls preempt_disable() which is the
         * same as rcu_read_lock(), but according to Oleg, this is not
         * correct to rely on this
         */
        rcu_read_lock();
        info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
        info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
                                       task_uid(tsk));
        rcu_read_unlock();

        /* Report this task's own times plus those accumulated in ->signal */
        task_cputime(tsk, &utime, &stime);
        info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime);
        info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime);

        /*
         * Decode exit_code: bit 0x80 set means CLD_DUMPED, otherwise
         * non-zero low seven bits mean CLD_KILLED (status is those bits),
         * otherwise CLD_EXITED with the status taken from bits 8 and up.
         */
        info.si_status = tsk->exit_code & 0x7f;
        if (tsk->exit_code & 0x80)
                info.si_code = CLD_DUMPED;
        else if (tsk->exit_code & 0x7f)
                info.si_code = CLD_KILLED;
        else {
                info.si_code = CLD_EXITED;
                info.si_status = tsk->exit_code >> 8;
        }

        psig = tsk->parent->sighand;
        spin_lock_irqsave(&psig->siglock, flags);
        if (!tsk->ptrace && sig == SIGCHLD &&
            (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
             (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
                /*
                 * We are exiting and our parent doesn't care.  POSIX.1
                 * defines special semantics for setting SIGCHLD to SIG_IGN
                 * or setting the SA_NOCLDWAIT flag: we should be reaped
                 * automatically and not left for our parent's wait4 call.
                 * Rather than having the parent do it as a magic kind of
                 * signal handler, we just set this to tell do_exit that we
                 * can be cleaned up without becoming a zombie.  Note that
                 * we still call __wake_up_parent in this case, because a
                 * blocked sys_wait4 might now return -ECHILD.
                 *
                 * Whether we send SIGCHLD or not for SA_NOCLDWAIT
                 * is implementation-defined: we do (if you don't want
                 * it, just use SIG_IGN instead).
                 */
                autoreap = true;
                if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
                        sig = 0;
        }
        /*
         * Send with __send_signal_locked() as si_pid and si_uid are
         * already in the parent's namespaces.
         */
        if (valid_signal(sig) && sig)
                __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false);
        __wake_up_parent(tsk, tsk->parent);
        spin_unlock_irqrestore(&psig->siglock, flags);

        return autoreap;
}

/**
 * do_notify_parent_cldstop - notify parent of stopped/continued state change
 * @tsk: task reporting the state change
 * @for_ptracer: the notification is for ptracer
 * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
 *
 * Notify @tsk's parent that the stopped/continued state has changed.  If
 * @for_ptracer is %false, @tsk's group leader notifies to its real parent.
 * If %true, @tsk reports to @tsk->parent which should be the ptracer.
 *
 * CONTEXT:
 * Must be called with tasklist_lock at least read locked.
 */
static void do_notify_parent_cldstop(struct task_struct *tsk,
                                     bool for_ptracer, int why)
{
        struct kernel_siginfo info;
        struct sighand_struct *psighand;
        struct k_sigaction *chld_action;
        struct task_struct *parent;
        unsigned long irqflags;
        u64 utime, stime;

        /*
         * Pick reporter and recipient: the ptracer hears from @tsk
         * itself via ->parent, the real parent hears from the group
         * leader via ->real_parent.
         */
        if (!for_ptracer)
                tsk = tsk->group_leader;
        parent = for_ptracer ? tsk->parent : tsk->real_parent;

        clear_siginfo(&info);
        info.si_signo = SIGCHLD;
        info.si_errno = 0;
        info.si_code = why;

        /* si_pid and si_uid are translated into the recipient's namespaces */
        rcu_read_lock();
        info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
        info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
        rcu_read_unlock();

        task_cputime(tsk, &utime, &stime);
        info.si_utime = nsec_to_clock_t(utime);
        info.si_stime = nsec_to_clock_t(stime);

        /* si_status depends on the kind of event; anything else is a bug */
        if (why == CLD_CONTINUED)
                info.si_status = SIGCONT;
        else if (why == CLD_STOPPED)
                info.si_status = tsk->signal->group_exit_code & 0x7f;
        else if (why == CLD_TRAPPED)
                info.si_status = tsk->exit_code & 0x7f;
        else
                BUG();

        psighand = parent->sighand;
        spin_lock_irqsave(&psighand->siglock, irqflags);

        /* SIGCHLD is skipped if the recipient ignores it or set SA_NOCLDSTOP */
        chld_action = &psighand->action[SIGCHLD-1];
        if (chld_action->sa.sa_handler != SIG_IGN &&
            !(chld_action->sa.sa_flags & SA_NOCLDSTOP))
                send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID);

        /* The wait4 wakeup happens whether or not SIGCHLD was generated */
        __wake_up_parent(tsk, parent);
        spin_unlock_irqrestore(&psighand->siglock, irqflags);
}

/*
 * This must be called with current->sighand->siglock held.
 *
 * This should be the path for all ptrace stops.
 * We always set current->last_siginfo while stopped here.
 * That makes it a way to test a stopped process for
 * being ptrace-stopped vs being job-control-stopped.
 *
 * Returns the signal the ptracer requested the code resume
 * with.  If the code did not stop because the tracer is gone,
 * the stop signal is returned unchanged.
 */
static int ptrace_stop(int exit_code, int why, unsigned long message,
                       kernel_siginfo_t *info)
        __releases(&current->sighand->siglock)
        __acquires(&current->sighand->siglock)
{
        /* set when this stop completed a group stop (see below) */
        bool gstop_done = false;

        if (arch_ptrace_stop_needed()) {
                /*
                 * The arch code has something special to do before a
                 * ptrace stop.  This is allowed to block, e.g. for faults
                 * on user stack pages.  We can't keep the siglock while
                 * calling arch_ptrace_stop, so we must release it now.
                 * To preserve proper semantics, we must do this before
                 * any signal bookkeeping like checking group_stop_count.
                 */
                spin_unlock_irq(&current->sighand->siglock);
                arch_ptrace_stop();
                spin_lock_irq(&current->sighand->siglock);
        }

        /*
         * After this point ptrace_signal_wake_up or signal_wake_up
         * will clear TASK_TRACED if ptrace_unlink happens or a fatal
         * signal comes in.  Handle previous ptrace_unlinks and fatal
         * signals here to prevent ptrace_stop sleeping in schedule.
         *
         * In that case @exit_code is handed back untouched and no stop
         * takes place.
         */
        if (!current->ptrace || __fatal_signal_pending(current))
                return exit_code;

        set_special_state(TASK_TRACED);
        current->jobctl |= JOBCTL_TRACED;

        /*
         * We're committing to trapping.  TRACED should be visible before
         * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
         * Also, transition to TRACED and updates to ->jobctl should be
         * atomic with respect to siglock and should be done after the arch
         * hook as siglock is released and regrabbed across it.
         *
         *     TRACER                                    TRACEE
         *
         *     ptrace_attach()
         * [L]   wait_on_bit(JOBCTL_TRAPPING)        [S] set_special_state(TRACED)
         *     do_wait()
         *       set_current_state()                smp_wmb();
         *       ptrace_do_wait()
         *         wait_task_stopped()
         *           task_stopped_code()
         * [L]         task_is_traced()                [S] task_clear_jobctl_trapping();
         */
        smp_wmb();

        /* Publish the stop details on the task while it is stopped. */
        current->ptrace_message = message;
        current->last_siginfo = info;
        current->exit_code = exit_code;

        /*
         * If @why is CLD_STOPPED, we're trapping to participate in a group
         * stop.  Do the bookkeeping.  Note that if SIGCONT was delivered
         * across siglock relocks since INTERRUPT was scheduled, PENDING
         * could be clear now.  We act as if SIGCONT is received after
         * TASK_TRACED is entered - ignore it.
         */
        if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
                gstop_done = task_participate_group_stop(current);

        /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
        task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
        if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
                task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);

        /* entering a trap, clear TRAPPING */
        task_clear_jobctl_trapping(current);

        /*
         * siglock is dropped before tasklist_lock is taken for reading;
         * do_notify_parent_cldstop() requires tasklist_lock to be held.
         */
        spin_unlock_irq(&current->sighand->siglock);
        read_lock(&tasklist_lock);
        /*
         * Notify parents of the stop.
         *
         * While ptraced, there are two parents - the ptracer and
         * the real_parent of the group_leader.  The ptracer should
         * know about every stop while the real parent is only
         * interested in the completion of group stop.  The states
         * for the two don't interact with each other.  Notify
         * separately unless they're gonna be duplicates.
         */
        if (current->ptrace)
                do_notify_parent_cldstop(current, true, why);
        if (gstop_done && (!current->ptrace || ptrace_reparented(current)))
                do_notify_parent_cldstop(current, false, why);

        /*
         * The previous do_notify_parent_cldstop() invocation woke ptracer.
         * On a PREEMPTION kernel this can result in preemption requirement
         * which will be fulfilled after read_unlock() and the ptracer will be
         * put on the CPU.
         * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
         * this task to wait in schedule(). If this task gets preempted then
         * it remains enqueued on the runqueue. The ptracer will observe this
         * and then sleep for a delay of one HZ tick. In the meantime this
         * task gets scheduled, enters schedule() and will wait for the
         * ptracer.
         *
         * This preemption point is not bad from a correctness point of
         * view but extends the runtime by one HZ tick time due to the
         * ptracer's sleep.  The preempt-disable section ensures that there
         * will be no preemption between unlock and schedule() and so
         * improving the performance since the ptracer will observe that
         * the tracee is scheduled out once it gets on the CPU.
         *
         * On PREEMPT_RT locking tasklist_lock does not disable preemption.
         * Therefore the task can be preempted after do_notify_parent_cldstop()
         * before unlocking tasklist_lock so there is no benefit in doing this.
         *
         * In fact disabling preemption is harmful on PREEMPT_RT because
         * the spinlock_t in cgroup_enter_frozen() must not be acquired
         * with preemption disabled due to the 'sleeping' spinlock
         * substitution of RT.
         */
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        read_unlock(&tasklist_lock);
        cgroup_enter_frozen();
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable_no_resched();
        schedule();
        cgroup_leave_frozen(true);

        /*
         * We are back.  Now reacquire the siglock before touching
         * last_siginfo, so that we are sure to have synchronized with
         * any signal-sending on another CPU that wants to examine it.
         *
         * The return value is whatever ->exit_code holds at this point;
         * the per-stop fields published above are then reset.
         */
        spin_lock_irq(&current->sighand->siglock);
        exit_code = current->exit_code;
        current->last_siginfo = NULL;
        current->ptrace_message = 0;
        current->exit_code = 0;

        /* LISTENING can be set only during STOP traps, clear it */
        current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN);

        /*
         * Queued signals ignored us while we were stopped for tracing.
         * So check for any that we should take before resuming user mode.
         * This sets TIF_SIGPENDING, but never clears it.
         */
        recalc_sigpending_tsk(current);
        return exit_code;
}

/*
 * Build a siginfo for @signr that names current as the sender and hand it
 * to ptrace_stop().  The result of ptrace_stop() is passed straight back.
 * ptrace_stop() requires current->sighand->siglock to be held, so the
 * same holds for callers of this helper.
 */
static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
{
        kernel_siginfo_t trap_info;

        clear_siginfo(&trap_info);
        trap_info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
        trap_info.si_pid = task_pid_vnr(current);
        trap_info.si_code = exit_code;
        trap_info.si_signo = signr;

        /* Stop here and let the debugger run. */
        return ptrace_stop(exit_code, why, message, &trap_info);
}

/*
 * Report a SIGTRAP-based stop with CLD_TRAPPED to the tracer.
 *
 * @exit_code must have SIGTRAP in its low seven bits and nothing set
 * above bit 15; bits 7-15 are not checked here.  Takes and releases
 * current->sighand->siglock around the stop and returns the value
 * produced by ptrace_do_notify().
 */
int ptrace_notify(int exit_code, unsigned long message)
{
        int resume_sig;

        BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);

        /* Run pending task work before stopping. */
        if (unlikely(task_work_pending(current)))
                task_work_run();

        spin_lock_irq(&current->sighand->siglock);
        resume_sig = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
        spin_unlock_irq(&current->sighand->siglock);

        return resume_sig;
}

/**
 * do_signal_stop - handle group stop for SIGSTOP and other stop signals
 * @signr: signr causing group stop if initiating
 *
 * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
 * and participate in it.  If already set, participate in the existing
 * group stop.  If participated in a group stop (and thus slept), %true is
 * returned with siglock released.
 *
 * If ptraced, this function doesn't handle stop itself.  Instead,
 * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
 * untouched.  The caller must ensure that INTERRUPT trap handling takes
 * place afterwards.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held, which is released
 * on %true return.
 *
 * RETURNS:
 * %false if group stop is already cancelled or ptrace trap is scheduled.
 * %true if participated in group stop.
 */
static bool do_signal_stop(int signr)
        __releases(&current->sighand->siglock)
{
        struct signal_struct *sig = current->signal;

        /* No stop pending on current yet: try to initiate a group stop. */
        if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
                /* jobctl bits set, together with @signr, on every participant */
                unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
                struct task_struct *t;

                /* signr will be recorded in task->jobctl for retries */
                WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);

                /*
                 * Do not start a stop if JOBCTL_STOP_DEQUEUED is no longer
                 * set, if SIGNAL_GROUP_EXIT is set, or if ->group_exec_task
                 * is set.  siglock stays held on this return.
                 */
                if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
                    unlikely(sig->flags & SIGNAL_GROUP_EXIT) ||
                    unlikely(sig->group_exec_task))
                        return false;
                /*
                 * There is no group stop already in progress.  We must
                 * initiate one now.
                 *
                 * While ptraced, a task may be resumed while group stop is
                 * still in effect and then receive a stop signal and
                 * initiate another group stop.  This deviates from the
                 * usual behavior as two consecutive stop signals can't
                 * cause two group stops when !ptraced.  That is why we
                 * also check !task_is_stopped(t) below.
                 *
                 * The condition can be distinguished by testing whether
                 * SIGNAL_STOP_STOPPED is already set.  Don't generate
                 * group_exit_code in such case.
                 *
                 * This is not necessary for SIGNAL_STOP_CONTINUED because
                 * an intervening stop signal is required to cause two
                 * continued events regardless of ptrace.
                 */
                if (!(sig->flags & SIGNAL_STOP_STOPPED))
                        sig->group_exit_code = signr;

                /*
                 * Recount participants from zero: current first, then every
                 * other thread for which the pending bits could be set.
                 */
                sig->group_stop_count = 0;
                if (task_set_jobctl_pending(current, signr | gstop))
                        sig->group_stop_count++;

                for_other_threads(current, t) {
                        /*
                         * Setting state to TASK_STOPPED for a group
                         * stop is always done with the siglock held,
                         * so this check has no races.
                         */
                        if (!task_is_stopped(t) &&
                            task_set_jobctl_pending(t, signr | gstop)) {
                                sig->group_stop_count++;
                                /* PT_SEIZED threads get a trap notification, others a plain wakeup */
                                if (likely(!(t->ptrace & PT_SEIZED)))
                                        signal_wake_up(t, 0);
                                else
                                        ptrace_trap_notify(t);
                        }
                }
        }

        if (likely(!current->ptrace)) {
                /* CLD_ code to report to the parent, 0 for no report */
                int notify = 0;

                /*
                 * If there are no other threads in the group, or if there
                 * is a group stop in progress and we are the last to stop,
                 * report to the parent.
                 */
                if (task_participate_group_stop(current))
                        notify = CLD_STOPPED;

                current->jobctl |= JOBCTL_STOPPED;
                set_special_state(TASK_STOPPED);
                spin_unlock_irq(&current->sighand->siglock);

                /*
                 * Notify the parent of the group stop completion.  Because
                 * we're not holding either the siglock or tasklist_lock
                 * here, ptracer may attach inbetween; however, this is for
                 * group stop and should always be delivered to the real
                 * parent of the group leader.  The new ptracer will get
                 * its notification when this task transitions into
                 * TASK_TRACED.
                 */
                if (notify) {
                        read_lock(&tasklist_lock);
                        do_notify_parent_cldstop(current, false, notify);
                        read_unlock(&tasklist_lock);
                }

                /* Now we don't run again until woken by SIGCONT or SIGKILL */
                cgroup_enter_frozen();
                schedule();
                /* siglock was released above and is not retaken here */
                return true;
        } else {
                /*
                 * While ptraced, group stop is handled by STOP trap.
                 * Schedule it and let the caller deal with it.
                 */
                task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
                return false;
        }
}

/**
 * do_jobctl_trap - take care of ptrace jobctl traps
 *
 * When PT_SEIZED, it's used for both group stop and explicit
 * SEIZE/INTERRUPT traps.  Both generate PTRACE_EVENT_STOP trap with
 * accompanying siginfo.  If stopped, lower eight bits of exit_code contain
 * the stop signal; otherwise, %SIGTRAP.
 *
 * When !PT_SEIZED, it's used only for group stop trap with stop signal
 * number as exit_code and no siginfo.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held, which may be
 * released and re-acquired before returning with intervening sleep.
 */
static void do_jobctl_trap(void)
{
        struct signal_struct *sig = current->signal;
        int stop_sig = current->jobctl & JOBCTL_STOP_SIGMASK;

        /* Not seized: plain group-stop trap, stop signal only, no siginfo. */
        if (!(current->ptrace & PT_SEIZED)) {
                WARN_ON_ONCE(!stop_sig);
                ptrace_stop(stop_sig, CLD_STOPPED, 0, NULL);
                return;
        }

        /*
         * Seized: report PTRACE_EVENT_STOP.  With no group stop counted
         * and SIGNAL_STOP_STOPPED clear, SIGTRAP is reported instead of
         * the recorded stop signal.
         */
        if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED))
                stop_sig = SIGTRAP;
        WARN_ON_ONCE(!stop_sig);
        ptrace_do_notify(stop_sig, stop_sig | (PTRACE_EVENT_STOP << 8),
                         CLD_STOPPED, 0);
}

/**
 * do_freezer_trap - handle the freezer jobctl trap
 *
 * Puts the task into frozen state, if only the task is not about to quit.
 * In this case it drops JOBCTL_TRAP_FREEZE.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held,
 * which is always released before returning.
 */
static void do_freezer_trap(void)
        __releases(&current->sighand->siglock)
{
        unsigned long pending = current->jobctl &
                                (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE);

        /*
         * Something besides JOBCTL_TRAP_FREEZE is pending (or the freeze
         * trap itself is gone): drop the lock and let the caller loop
         * again so the other work gets handled first.
         */
        if (pending != JOBCTL_TRAP_FREEZE) {
                spin_unlock_irq(&current->sighand->siglock);
                return;
        }

        /*
         * Only the freezer trap is pending.  TIF_SIGPENDING is cleared so
         * that a pending non-fatal signal does not make schedule() return
         * immediately, then the task goes to sleep.
         */
        __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
        clear_thread_flag(TIF_SIGPENDING);
        spin_unlock_irq(&current->sighand->siglock);
        cgroup_enter_frozen();
        schedule();
}

/*
 * Stop for the tracer on a dequeued signal and work out what to deliver
 * afterwards.  Returns the signal number to deliver now, or 0 when the
 * signal was cancelled or has been requeued.
 */
static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
{
        /*
         * The marker is set for every signal, not just stop signals,
         * since the debugger may substitute a different signr.  It only
         * matters if we end up stopping after ptrace_stop() returns:
         * do_signal_stop() tests it, and the stop must not happen if
         * SIGCONT cleared it while we slept.  See also the comment in
         * dequeue_signal().
         */
        current->jobctl |= JOBCTL_STOP_DEQUEUED;
        signr = ptrace_stop(signr, CLD_TRAPPED, 0, info);

        /* Zero means the debugger cancelled the signal. */
        if (!signr)
                return 0;

        /*
         * The signal was replaced: rebuild *info as a SI_USER signal
         * sent by ->parent.  A debugger that wanted specific siginfo
         * contents should have set them with PTRACE_SETSIGINFO.
         */
        if (info->si_signo != signr) {
                clear_siginfo(info);
                info->si_signo = signr;
                info->si_errno = 0;
                info->si_code = SI_USER;
                rcu_read_lock();
                info->si_pid = task_pid_vnr(current->parent);
                info->si_uid = from_kuid_munged(current_user_ns(),
                                                task_uid(current->parent));
                rcu_read_unlock();
        }

        /* Deliver right away unless it is blocked or a fatal signal is pending. */
        if (!sigismember(&current->blocked, signr) &&
            !fatal_signal_pending(current))
                return signr;

        /* Otherwise put the (possibly new) signal back on the queue. */
        send_signal_locked(signr, info, current, type);
        return 0;
}

/*
 * For the fault-style siginfo layouts, replace si_addr with the value
 * returned by arch_untagged_si_addr().  All other layouts are left alone.
 * Every layout is listed explicitly; there is deliberately no default.
 */
static void hide_si_addr_tag_bits(struct ksignal *ksig)
{
        switch (siginfo_layout(ksig->sig, ksig->info.si_code)) {
        case SIL_KILL:
        case SIL_TIMER:
        case SIL_POLL:
        case SIL_CHLD:
        case SIL_RT:
        case SIL_SYS:
                /* nothing to rewrite */
                return;
        case SIL_FAULT:
        case SIL_FAULT_TRAPNO:
        case SIL_FAULT_MCEERR:
        case SIL_FAULT_BNDERR:
        case SIL_FAULT_PKUERR:
        case SIL_FAULT_PERF_EVENT:
                ksig->info.si_addr = arch_untagged_si_addr(ksig->info.si_addr,
                                                           ksig->sig,
                                                           ksig->info.si_code);
                return;
        }
}

/*
 * get_signal - find the next signal that current has to act on
 * @ksig: filled in with the action, siginfo and number of the signal to
 *        deliver to a user handler
 *
 * Runs pending task work, then loops under sighand->siglock handling
 * everything that does not need a user handler: parent notification after
 * a stop/continue, group stops, jobctl and freezer traps, ptrace
 * interception, ignored signals and default actions (stop or fatal).
 *
 * Returns true when a signal number greater than zero is left to handle,
 * false when nothing is left.  On the PF_USER_WORKER fatal path the return
 * is true as well, but @ksig is not initialized.  For other tasks the
 * fatal path ends in do_group_exit(), marked NOTREACHED below.
 */
bool get_signal(struct ksignal *ksig)
{
        struct sighand_struct *sighand = current->sighand;
        struct signal_struct *signal = current->signal;
        int signr;

        clear_notify_signal();
        if (unlikely(task_work_pending(current)))
                task_work_run();

        if (!task_sigpending(current))
                return false;

        if (unlikely(uprobe_deny_signal()))
                return false;

        /*
         * Do this once, we can't return to user-mode if freezing() == T.
         * do_signal_stop() and ptrace_stop() do freezable_schedule() and
         * thus do not need another check after return.
         *
         * NOTE(review): the do_signal_stop() and ptrace_stop() in this file
         * call cgroup_enter_frozen() + schedule(); the freezable_schedule()
         * wording above may be historical - confirm.
         */
        try_to_freeze();

relock:
        spin_lock_irq(&sighand->siglock);

        /*
         * Every stopped thread goes here after wakeup. Check to see if
         * we should notify the parent, prepare_signal(SIGCONT) encodes
         * the CLD_ si_code into SIGNAL_CLD_MASK bits.
         */
        if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
                int why;

                if (signal->flags & SIGNAL_CLD_CONTINUED)
                        why = CLD_CONTINUED;
                else
                        why = CLD_STOPPED;

                /* consume the request before dropping siglock */
                signal->flags &= ~SIGNAL_CLD_MASK;

                spin_unlock_irq(&sighand->siglock);

                /*
                 * Notify the parent that we're continuing.  This event is
                 * always per-process and doesn't make whole lot of sense
                 * for ptracers, who shouldn't consume the state via
                 * wait(2) either, but, for backward compatibility, notify
                 * the ptracer of the group leader too unless it's gonna be
                 * a duplicate.
                 */
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(current, false, why);

                if (ptrace_reparented(current->group_leader))
                        do_notify_parent_cldstop(current->group_leader,
                                                true, why);
                read_unlock(&tasklist_lock);

                goto relock;
        }

        /*
         * Main loop, entered with siglock held.  "continue" keeps the lock
         * and looks at the next signal; "goto relock" is used by paths
         * that have dropped the lock.
         */
        for (;;) {
                struct k_sigaction *ka;
                enum pid_type type;

                /* Has this task already been marked for death? */
                if ((signal->flags & SIGNAL_GROUP_EXIT) ||
                     signal->group_exec_task) {
                        signr = SIGKILL;
                        sigdelset(&current->pending.signal, SIGKILL);
                        trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
                                             &sighand->action[SIGKILL-1]);
                        recalc_sigpending();
                        /*
                         * implies do_group_exit() or return to PF_USER_WORKER,
                         * no need to initialize ksig->info/etc.
                         */
                        goto fatal;
                }

                /* do_signal_stop() returns true with siglock released */
                if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
                    do_signal_stop(0))
                        goto relock;

                if (unlikely(current->jobctl &
                             (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) {
                        /*
                         * do_jobctl_trap() returns with siglock held, so it
                         * is dropped here; do_freezer_trap() always releases
                         * it itself.
                         */
                        if (current->jobctl & JOBCTL_TRAP_MASK) {
                                do_jobctl_trap();
                                spin_unlock_irq(&sighand->siglock);
                        } else if (current->jobctl & JOBCTL_TRAP_FREEZE)
                                do_freezer_trap();

                        goto relock;
                }

                /*
                 * If the task is leaving the frozen state, let's update
                 * cgroup counters and reset the frozen bit.
                 */
                if (unlikely(cgroup_task_frozen(current))) {
                        spin_unlock_irq(&sighand->siglock);
                        cgroup_leave_frozen(false);
                        goto relock;
                }

                /*
                 * Signals generated by the execution of an instruction
                 * need to be delivered before any other pending signals
                 * so that the instruction pointer in the signal stack
                 * frame points to the faulting instruction.
                 */
                type = PIDTYPE_PID;
                signr = dequeue_synchronous_signal(&ksig->info);
                if (!signr)
                        signr = dequeue_signal(current, &current->blocked,
                                               &ksig->info, &type);

                if (!signr)
                        break; /* will return 0 */

                /*
                 * A traced task reports the signal to its tracer first,
                 * except for SIGKILL and SA_IMMUTABLE actions.  A zero
                 * result means there is nothing to deliver for this one.
                 */
                if (unlikely(current->ptrace) && (signr != SIGKILL) &&
                    !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) {
                        signr = ptrace_signal(signr, &ksig->info, type);
                        if (!signr)
                                continue;
                }

                ka = &sighand->action[signr-1];

                /* Trace actually delivered signals. */
                trace_signal_deliver(signr, &ksig->info, ka);

                if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
                        continue;
                if (ka->sa.sa_handler != SIG_DFL) {
                        /* Run the handler.  */
                        ksig->ka = *ka;

                        /* SA_ONESHOT: the copy above keeps the handler, the table reverts to default */
                        if (ka->sa.sa_flags & SA_ONESHOT)
                                ka->sa.sa_handler = SIG_DFL;

                        break; /* will return non-zero "signr" value */
                }

                /*
                 * Now we are doing the default action for this signal.
                 */
                if (sig_kernel_ignore(signr)) /* Default is nothing. */
                        continue;

                /*
                 * Global init gets no signals it doesn't want.
                 * Container-init gets no signals it doesn't want from same
                 * container.
                 *
                 * Note that if global/container-init sees a sig_kernel_only()
                 * signal here, the signal must have been generated internally
                 * or must have come from an ancestor namespace. In either
                 * case, the signal cannot be dropped.
                 */
                if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
                                !sig_kernel_only(signr))
                        continue;

                if (sig_kernel_stop(signr)) {
                        /*
                         * The default action is to stop all threads in
                         * the thread group.  The job control signals
                         * do nothing in an orphaned pgrp, but SIGSTOP
                         * always works.  Note that siglock needs to be
                         * dropped during the call to is_orphaned_pgrp()
                         * because of lock ordering with tasklist_lock.
                         * This allows an intervening SIGCONT to be posted.
                         * We need to check for that and bail out if necessary.
                         */
                        if (signr != SIGSTOP) {
                                spin_unlock_irq(&sighand->siglock);

                                /* signals can be posted during this window */

                                if (is_current_pgrp_orphaned())
                                        goto relock;

                                spin_lock_irq(&sighand->siglock);
                        }

                        if (likely(do_signal_stop(signr))) {
                                /* It released the siglock.  */
                                goto relock;
                        }

                        /*
                         * We didn't actually stop, due to a race
                         * with SIGCONT or something like that.
                         */
                        continue;
                }

        fatal:
                /* Reached by falling through or from the marked-for-death check. */
                spin_unlock_irq(&sighand->siglock);
                if (unlikely(cgroup_task_frozen(current)))
                        cgroup_leave_frozen(true);

                /*
                 * Anything else is fatal, maybe with a core dump.
                 */
                current->flags |= PF_SIGNALED;

                if (sig_kernel_coredump(signr)) {
                        if (print_fatal_signals)
                                print_fatal_signal(signr);
                        proc_coredump_connector(current);
                        /*
                         * If it was able to dump core, this kills all
                         * other threads in the group and synchronizes with
                         * their demise.  If we lost the race with another
                         * thread getting here, it set group_exit_code
                         * first and our do_group_exit call below will use
                         * that value and ignore the one we pass it.
                         */
                        do_coredump(&ksig->info);
                }

                /*
                 * PF_USER_WORKER threads will catch and exit on fatal signals
                 * themselves. They have cleanup that must be performed, so we
                 * cannot call do_exit() on their behalf. Note that ksig won't
                 * be properly initialized, PF_USER_WORKER's shouldn't use it.
                 */
                if (current->flags & PF_USER_WORKER)
                        goto out;

                /*
                 * Death signals, no core dump.
                 */
                do_group_exit(signr);
                /* NOTREACHED */
        }
        /* Both "break" exits above leave the loop with siglock still held. */
        spin_unlock_irq(&sighand->siglock);

        ksig->sig = signr;

        /* Fault addresses are rewritten unless the handler set SA_EXPOSE_TAGBITS */
        if (signr && !(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
                hide_si_addr_tag_bits(ksig);
out:
        return signr > 0;
}

/**
 * signal_delivered - called after signal delivery to update blocked signals
 * @ksig:                kernel signal struct
 * @stepping:                nonzero if debugger single-step or block-step in use
 *
 * This function should be called when a signal has successfully been
 * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
 * is always blocked), and the signal itself is blocked unless %SA_NODEFER
 * is set in @ksig->ka.sa.sa_flags.  Tracing is notified.
 */
static void signal_delivered(struct ksignal *ksig, int stepping)
{
        sigset_t new_mask;

        /*
         * Delivery succeeded and the saved sigmask is already on the
         * signal frame, where sigreturn will restore it from, so the
         * restore-sigmask flag can simply be dropped.
         */
        clear_restore_sigmask();

        /*
         * New blocked set: the current one plus the handler's sa_mask,
         * plus the delivered signal itself unless SA_NODEFER is set.
         */
        sigorsets(&new_mask, &current->blocked, &ksig->ka.sa.sa_mask);
        if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
                sigaddset(&new_mask, ksig->sig);
        set_current_blocked(&new_mask);

        /* SS_AUTODISARM: reset the alternate stack state via sas_ss_reset() */
        if (current->sas_ss_flags & SS_AUTODISARM)
                sas_ss_reset(current);

        /* Tell the tracer when single-step or block-step is in use. */
        if (stepping)
                ptrace_notify(SIGTRAP, 0);
}

/*
 * Finish signal frame setup for @ksig.  When @failed is zero the delivery
 * is completed through signal_delivered(); otherwise force_sigsegv() is
 * called for the signal.
 */
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
{
        if (!failed) {
                signal_delivered(ksig, stepping);
                return;
        }

        force_sigsegv(ksig->sig);
}

/*
 * It could be that complete_signal() picked us to notify about the
 * group-wide signal. Other threads should be notified now to take
 * the shared signals in @which since we will not.
 */
static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
{
        struct task_struct *other;
        sigset_t remaining;

        /* Only shared-pending signals that are also in @which are of interest. */
        sigandsets(&remaining, &tsk->signal->shared_pending.signal, which);
        if (sigisemptyset(&remaining))
                return;

        for_other_threads(tsk, other) {
                /*
                 * Skip exiting threads, and threads for which
                 * has_pending_signals() reports nothing against their
                 * blocked mask.
                 */
                if ((other->flags & PF_EXITING) ||
                    !has_pending_signals(&remaining, &other->blocked))
                        continue;

                /* Drop the signals this thread can handle from the set. */
                sigandsets(&remaining, &remaining, &other->blocked);

                if (!task_sigpending(other))
                        signal_wake_up(other, 0);

                /* Nothing left to hand over: done. */
                if (sigisemptyset(&remaining))
                        return;
        }
}

/*
 * Set PF_EXITING on @tsk and pass on the signal work it will no longer do.
 *
 * If @tsk is alone in its thread group, or SIGNAL_GROUP_EXIT is already set,
 * only the flag is set (without taking siglock).  Otherwise, under siglock,
 * the flag is set and, if @tsk has signals pending, the shared pending
 * signals it had unblocked are retargeted to the other threads.  If @tsk
 * had JOBCTL_STOP_PENDING and task_participate_group_stop() returns true,
 * the parent is notified with CLD_STOPPED after siglock is dropped.
 */
void exit_signals(struct task_struct *tsk)
{
        int group_stop = 0;
        sigset_t unblocked;

        /*
         * @tsk is about to have PF_EXITING set - lock out users which
         * expect stable threadgroup.
         */
        cgroup_threadgroup_change_begin(tsk);

        /* Fast path: no other thread could take over our signals. */
        if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
                sched_mm_cid_exit_signals(tsk);
                tsk->flags |= PF_EXITING;
                cgroup_threadgroup_change_end(tsk);
                return;
        }

        spin_lock_irq(&tsk->sighand->siglock);
        /*
         * From now this task is not visible for group-wide signals,
         * see wants_signal(), do_signal_stop().
         */
        sched_mm_cid_exit_signals(tsk);
        tsk->flags |= PF_EXITING;

        cgroup_threadgroup_change_end(tsk);

        if (!task_sigpending(tsk))
                goto out;

        /* Invert ->blocked to get the signals this task had unblocked. */
        unblocked = tsk->blocked;
        signotset(&unblocked);
        retarget_shared_pending(tsk, &unblocked);

        if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
            task_participate_group_stop(tsk))
                group_stop = CLD_STOPPED;
out:
        spin_unlock_irq(&tsk->sighand->siglock);

        /*
         * If group stop has completed, deliver the notification.  This
         * should always go to the real parent of the group leader.
         */
        if (unlikely(group_stop)) {
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(tsk, false, group_stop);
                read_unlock(&tasklist_lock);
        }
}

/*
 * System call entry points.
 */

/**
 *  sys_restart_syscall - restart a system call
 */
SYSCALL_DEFINE0(restart_syscall)
{
        /* Call the handler stored in the current task's restart block. */
        return current->restart_block.fn(&current->restart_block);
}

/*
 * Restart handler that never restarts anything: @param is ignored and
 * -EINTR is always returned.
 */
long do_no_restart_syscall(struct restart_block *param)
{
        return -EINTR;
}

static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
{
        if (task_sigpending(tsk) && !thread_group_empty(tsk)) {
                sigset_t newblocked;
                /* A set of now blocked but previously unblocked signals. */
                sigandnsets(&newblocked, newset, &current->blocked);
                retarget_shared_pending(tsk, &newblocked);
        }
        tsk->blocked = *newset;
        recalc_sigpending();
}

/**
 * set_current_blocked - change current->blocked mask
 * @newset: new mask
 *
 * It is wrong to change ->blocked directly, this helper should be used
 * to ensure the process can't miss a shared signal we are going to block.
 */
void set_current_blocked(sigset_t *newset)
{
        /* SIGKILL and SIGSTOP are stripped from @newset before it is applied. */
        const unsigned long never_blocked = sigmask(SIGKILL) | sigmask(SIGSTOP);

        sigdelsetmask(newset, never_blocked);
        __set_current_blocked(newset);
}

/*
 * Apply @newset to current as-is (no SIGKILL/SIGSTOP filtering), taking
 * siglock only when the mask actually changes.
 */
void __set_current_blocked(const sigset_t *newset)
{
        struct task_struct *me = current;

        /*
         * current->blocked shouldn't be modified by any other task, so an
         * unlocked comparison is enough to detect the "nothing to do" case.
         */
        if (!sigequalsets(&me->blocked, newset)) {
                spin_lock_irq(&me->sighand->siglock);
                __set_task_blocked(me, newset);
                spin_unlock_irq(&me->sighand->siglock);
        }
}

/*
 * This is also useful for kernel threads that want to temporarily
 * (or permanently) block certain signals.
 *
 * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel
 * interface happily blocks "unblockable" signals like SIGKILL
 * and friends.
 */
int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
{
        struct task_struct *me = current;
        sigset_t mask;

        /*
         * No lock needed: only current changes its own ->blocked, and never
         * from irq context.  Note the old mask is reported even if @how
         * turns out to be invalid.
         */
        if (oldset)
                *oldset = me->blocked;

        if (how == SIG_BLOCK)
                sigorsets(&mask, &me->blocked, set);
        else if (how == SIG_UNBLOCK)
                sigandnsets(&mask, &me->blocked, set);
        else if (how == SIG_SETMASK)
                mask = *set;
        else
                return -EINVAL;

        __set_current_blocked(&mask);
        return 0;
}
EXPORT_SYMBOL(sigprocmask);

/*
 * The api helps set app-provided sigmasks.
 *
 * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
 * epoll_pwait where a new sigmask is passed from userland for the syscalls.
 *
 * Note that it does set_restore_sigmask() in advance, so it must be always
 * paired with restore_saved_sigmask_unless() before return from syscall.
 */
int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
        sigset_t requested;

        /* No mask supplied: nothing to install, nothing to restore later. */
        if (umask == NULL)
                return 0;

        if (sigsetsize != sizeof(requested))
                return -EINVAL;

        if (copy_from_user(&requested, umask, sizeof(requested)) != 0)
                return -EFAULT;

        /* Remember the current mask before switching to the requested one. */
        set_restore_sigmask();
        current->saved_sigmask = current->blocked;
        set_current_blocked(&requested);

        return 0;
}

#ifdef CONFIG_COMPAT
/* Compat counterpart of set_user_sigmask(): @umask is a compat_sigset_t. */
int set_compat_user_sigmask(const compat_sigset_t __user *umask,
                            size_t sigsetsize)
{
        sigset_t requested;

        /* No mask supplied: nothing to install, nothing to restore later. */
        if (umask == NULL)
                return 0;

        if (sigsetsize != sizeof(compat_sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&requested, umask) != 0)
                return -EFAULT;

        /* Remember the current mask before switching to the requested one. */
        set_restore_sigmask();
        current->saved_sigmask = current->blocked;
        set_current_blocked(&requested);

        return 0;
}
#endif

/**
 *  sys_rt_sigprocmask - change the list of currently blocked signals
 *  @how: whether to add, remove, or set signals
 *  @nset: new signal set to apply according to @how, or NULL to leave the mask unchanged
 *  @oset: previous value of signal mask if non-null
 *  @sigsetsize: size of sigset_t type
 */
SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
                sigset_t __user *, oset, size_t, sigsetsize)
{
        sigset_t prev, requested;
        int err;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        /* Snapshot the mask before any change so @oset gets the old value. */
        prev = current->blocked;

        if (nset) {
                if (copy_from_user(&requested, nset, sizeof(requested)))
                        return -EFAULT;

                /* SIGKILL and SIGSTOP are removed from the user's set. */
                sigdelsetmask(&requested, sigmask(SIGKILL) | sigmask(SIGSTOP));

                err = sigprocmask(how, &requested, NULL);
                if (err)
                        return err;
        }

        if (oset && copy_to_user(oset, &prev, sizeof(prev)))
                return -EFAULT;

        return 0;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
                compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
        /* Snapshot the mask before any change so @oset gets the old value. */
        sigset_t old_set = current->blocked;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (nset) {
                sigset_t requested;
                int err;

                if (get_compat_sigset(&requested, nset))
                        return -EFAULT;

                /* SIGKILL and SIGSTOP are removed from the user's set. */
                sigdelsetmask(&requested, sigmask(SIGKILL) | sigmask(SIGSTOP));

                err = sigprocmask(how, &requested, NULL);
                if (err)
                        return err;
        }

        if (!oset)
                return 0;

        return put_compat_sigset(oset, &old_set, sizeof(*oset));
}
#endif

/*
 * Fill @set with the signals that are pending for current (private or
 * shared) and are also in current's blocked mask.
 */
static void do_sigpending(sigset_t *set)
{
        struct task_struct *me = current;

        /* The two pending sets are read together under siglock. */
        spin_lock_irq(&me->sighand->siglock);
        sigorsets(set, &me->pending.signal, &me->signal->shared_pending.signal);
        spin_unlock_irq(&me->sighand->siglock);

        /* ->blocked is only touched by this thread, so no lock is needed. */
        sigandsets(set, &me->blocked, set);
}

/**
 *  sys_rt_sigpending - examine a pending signal that has been raised
 *                        while blocked
 *  @uset: stores pending signals
 *  @sigsetsize: size of sigset_t type or larger
 */
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
        sigset_t pending;

        /* A smaller @sigsetsize is accepted; a larger one is rejected. */
        if (sigsetsize > sizeof(*uset))
                return -EINVAL;

        do_sigpending(&pending);

        /* Only the first @sigsetsize bytes are written back. */
        return copy_to_user(uset, &pending, sigsetsize) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
                compat_size_t, sigsetsize)
{
        sigset_t pending;

        /* A smaller @sigsetsize is accepted; a larger one is rejected. */
        if (sigsetsize > sizeof(*uset))
                return -EINVAL;

        do_sigpending(&pending);

        /* put_compat_sigset()'s result is the syscall's return value. */
        return put_compat_sigset(uset, &pending, sigsetsize);
}
#endif

/*
 * Per-signal si_code information, indexed by signal number.
 *
 * @limit:  largest si_code accepted for this signal by
 *          known_siginfo_layout() and siginfo_layout().
 * @layout: the siginfo layout siginfo_layout() reports for such codes,
 *          before its per-code exceptions are applied.
 *
 * Signals without an entry are zero-initialised.
 */
static const struct {
        unsigned char limit, layout;
} sig_sicodes[] = {
        [SIGILL]  = { NSIGILL,  SIL_FAULT },
        [SIGFPE]  = { NSIGFPE,  SIL_FAULT },
        [SIGSEGV] = { NSIGSEGV, SIL_FAULT },
        [SIGBUS]  = { NSIGBUS,  SIL_FAULT },
        [SIGTRAP] = { NSIGTRAP, SIL_FAULT },
#if defined(SIGEMT)
        [SIGEMT]  = { NSIGEMT,  SIL_FAULT },
#endif
        [SIGCHLD] = { NSIGCHLD, SIL_CHLD },
        [SIGPOLL] = { NSIGPOLL, SIL_POLL },
        [SIGSYS]  = { NSIGSYS,  SIL_SYS },
};

/*
 * Report whether (@sig, @si_code) is a combination this file knows about.
 *
 * SI_KERNEL is always known.  Codes above SI_USER are checked against the
 * per-signal limit in sig_sicodes[] when sig_specific_sicodes() says the
 * signal has one, and against NSIGPOLL otherwise.  The remaining codes are
 * known when they are at least SI_DETHREAD or equal to SI_ASYNCNL.
 */
static bool known_siginfo_layout(unsigned sig, int si_code)
{
        if (si_code == SI_KERNEL)
                return true;

        if (si_code > SI_USER) {
                if (sig_specific_sicodes(sig))
                        return si_code <= sig_sicodes[sig].limit;

                return si_code <= NSIGPOLL;
        }

        return si_code >= SI_DETHREAD || si_code == SI_ASYNCNL;
}

/*
 * Map (@sig, @si_code) to the siginfo layout in use for that combination.
 * SIL_KILL is the answer whenever no more specific rule applies.
 */
enum siginfo_layout siginfo_layout(unsigned sig, int si_code)
{
        /* Codes outside the open interval (SI_USER, SI_KERNEL). */
        if (si_code <= SI_USER || si_code >= SI_KERNEL) {
                if (si_code == SI_TIMER)
                        return SIL_TIMER;
                if (si_code == SI_SIGIO)
                        return SIL_POLL;
                if (si_code < 0)
                        return SIL_RT;
                return SIL_KILL;
        }

        /* No table entry for the signal, or the code exceeds its limit. */
        if (sig >= ARRAY_SIZE(sig_sicodes) ||
            si_code > sig_sicodes[sig].limit)
                return (si_code <= NSIGPOLL) ? SIL_POLL : SIL_KILL;

        /* Codes whose layout differs from the signal's table entry. */
        if (sig == SIGBUS &&
            si_code >= BUS_MCEERR_AR && si_code <= BUS_MCEERR_AO)
                return SIL_FAULT_MCEERR;
        if (sig == SIGSEGV && si_code == SEGV_BNDERR)
                return SIL_FAULT_BNDERR;
#ifdef SEGV_PKUERR
        if (sig == SIGSEGV && si_code == SEGV_PKUERR)
                return SIL_FAULT_PKUERR;
#endif
        if (sig == SIGTRAP && si_code == TRAP_PERF)
                return SIL_FAULT_PERF_EVENT;
        if (IS_ENABLED(CONFIG_SPARC) &&
            sig == SIGILL && si_code == ILL_ILLTRP)
                return SIL_FAULT_TRAPNO;
        if (IS_ENABLED(CONFIG_ALPHA) &&
            (sig == SIGFPE || (sig == SIGTRAP && si_code == TRAP_UNK)))
                return SIL_FAULT_TRAPNO;

        /* Otherwise the table entry decides. */
        return sig_sicodes[sig].layout;
}

/*
 * Address of the bytes of a user siginfo_t that follow the first
 * sizeof(struct kernel_siginfo) bytes.
 */
static inline char __user *si_expansion(const siginfo_t __user *info)
{
        char __user *base = (char __user *)info;

        return base + sizeof(struct kernel_siginfo);
}

/*
 * Copy @from to the user siginfo @to and zero the SI_EXPANSION_SIZE bytes
 * that follow it.  Returns 0 on success or -EFAULT if either step faults.
 */
int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from)
{
        /* The expansion area is only cleared once the copy has succeeded. */
        if (copy_to_user(to, from, sizeof(struct kernel_siginfo)) ||
            clear_user(si_expansion(to), SI_EXPANSION_SIZE))
                return -EFAULT;

        return 0;
}

/*
 * Validation run after a user siginfo has been copied into @info.
 * Returns 0 if acceptable, -EFAULT if the expansion area cannot be read,
 * or -E2BIG if an unknown layout carries non-zero expansion bytes.
 */
static int post_copy_siginfo_from_user(kernel_siginfo_t *info,
                                       const siginfo_t __user *from)
{
        char tail[SI_EXPANSION_SIZE];
        int pos;

        if (unlikely(!known_siginfo_layout(info->si_signo, info->si_code))) {
                /*
                 * An unknown si_code might need more than
                 * sizeof(struct kernel_siginfo) bytes.  Insist that every
                 * extra byte is 0, which guarantees copy_siginfo_to_user
                 * will hand this data back to userspace exactly.
                 */
                if (copy_from_user(tail, si_expansion(from), SI_EXPANSION_SIZE))
                        return -EFAULT;

                for (pos = 0; pos < SI_EXPANSION_SIZE; pos++)
                        if (tail[pos])
                                return -E2BIG;
        }

        return 0;
}

/*
 * Like copy_siginfo_from_user(), but the si_signo read from userspace is
 * overwritten with @signo before validation.
 */
static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to,
                                    const siginfo_t __user *from)
{
        if (copy_from_user(to, from, sizeof(*to)) != 0)
                return -EFAULT;

        to->si_signo = signo;

        return post_copy_siginfo_from_user(to, from);
}

/*
 * Copy a siginfo from userspace into @to, then validate it with
 * post_copy_siginfo_from_user().  Returns 0 or a negative errno.
 */
int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from)
{
        if (copy_from_user(to, from, sizeof(*to)) != 0)
                return -EFAULT;

        return post_copy_siginfo_from_user(to, from);
}

#ifdef CONFIG_COMPAT
/**
 * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo
 * @to: compat siginfo destination
 * @from: kernel siginfo source
 *
 * @to is zeroed first, then si_signo, si_errno and si_code are copied, and
 * finally only the fields belonging to the layout reported by
 * siginfo_layout() are filled in.
 *
 * Note: This function does not work properly for the SIGCHLD on x32, but
 * fortunately it doesn't have to.  The only valid callers for this function are
 * copy_siginfo_to_user32, which is overridden for x32 and the coredump code.
 * The latter does not care because SIGCHLD will never cause a coredump.
 */
void copy_siginfo_to_external32(struct compat_siginfo *to,
                const struct kernel_siginfo *from)
{
        /* Everything not explicitly set below stays zero. */
        memset(to, 0, sizeof(*to));

        to->si_signo = from->si_signo;
        to->si_errno = from->si_errno;
        to->si_code  = from->si_code;
        /* Pointer-valued fields are converted with ptr_to_compat(). */
        switch(siginfo_layout(from->si_signo, from->si_code)) {
        case SIL_KILL:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                break;
        case SIL_TIMER:
                to->si_tid     = from->si_tid;
                to->si_overrun = from->si_overrun;
                to->si_int     = from->si_int;
                break;
        case SIL_POLL:
                to->si_band = from->si_band;
                to->si_fd   = from->si_fd;
                break;
        case SIL_FAULT:
                to->si_addr = ptr_to_compat(from->si_addr);
                break;
        case SIL_FAULT_TRAPNO:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_trapno = from->si_trapno;
                break;
        case SIL_FAULT_MCEERR:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_addr_lsb = from->si_addr_lsb;
                break;
        case SIL_FAULT_BNDERR:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_lower = ptr_to_compat(from->si_lower);
                to->si_upper = ptr_to_compat(from->si_upper);
                break;
        case SIL_FAULT_PKUERR:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_pkey = from->si_pkey;
                break;
        case SIL_FAULT_PERF_EVENT:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_perf_data = from->si_perf_data;
                to->si_perf_type = from->si_perf_type;
                to->si_perf_flags = from->si_perf_flags;
                break;
        case SIL_CHLD:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                to->si_status = from->si_status;
                to->si_utime = from->si_utime;
                to->si_stime = from->si_stime;
                break;
        case SIL_RT:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                to->si_int = from->si_int;
                break;
        case SIL_SYS:
                to->si_call_addr = ptr_to_compat(from->si_call_addr);
                to->si_syscall   = from->si_syscall;
                to->si_arch      = from->si_arch;
                break;
        }
}

/*
 * Convert @from to the compat layout in a local buffer and copy that to
 * the user pointer @to.  Returns 0 on success or -EFAULT.
 */
int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
                           const struct kernel_siginfo *from)
{
        struct compat_siginfo tmp;

        copy_siginfo_to_external32(&tmp, from);

        return copy_to_user(to, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}

/*
 * Convert the compat siginfo @from (already in kernel memory) into @to.
 *
 * @to is cleared first, then si_signo, si_errno and si_code are copied, and
 * finally only the fields belonging to the layout reported by
 * siginfo_layout() are filled in.  Always returns 0.
 */
static int post_copy_siginfo_from_user32(kernel_siginfo_t *to,
                                         const struct compat_siginfo *from)
{
        clear_siginfo(to);
        to->si_signo = from->si_signo;
        to->si_errno = from->si_errno;
        to->si_code  = from->si_code;
        /* Pointer-valued fields are converted with compat_ptr(). */
        switch(siginfo_layout(from->si_signo, from->si_code)) {
        case SIL_KILL:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                break;
        case SIL_TIMER:
                to->si_tid     = from->si_tid;
                to->si_overrun = from->si_overrun;
                to->si_int     = from->si_int;
                break;
        case SIL_POLL:
                to->si_band = from->si_band;
                to->si_fd   = from->si_fd;
                break;
        case SIL_FAULT:
                to->si_addr = compat_ptr(from->si_addr);
                break;
        case SIL_FAULT_TRAPNO:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_trapno = from->si_trapno;
                break;
        case SIL_FAULT_MCEERR:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_addr_lsb = from->si_addr_lsb;
                break;
        case SIL_FAULT_BNDERR:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_lower = compat_ptr(from->si_lower);
                to->si_upper = compat_ptr(from->si_upper);
                break;
        case SIL_FAULT_PKUERR:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_pkey = from->si_pkey;
                break;
        case SIL_FAULT_PERF_EVENT:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_perf_data = from->si_perf_data;
                to->si_perf_type = from->si_perf_type;
                to->si_perf_flags = from->si_perf_flags;
                break;
        case SIL_CHLD:
                to->si_pid    = from->si_pid;
                to->si_uid    = from->si_uid;
                to->si_status = from->si_status;
#ifdef CONFIG_X86_X32_ABI
                /* In an x32 syscall the times are read from _sigchld_x32. */
                if (in_x32_syscall()) {
                        to->si_utime = from->_sifields._sigchld_x32._utime;
                        to->si_stime = from->_sifields._sigchld_x32._stime;
                } else
#endif
                {
                        to->si_utime = from->si_utime;
                        to->si_stime = from->si_stime;
                }
                break;
        case SIL_RT:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                to->si_int = from->si_int;
                break;
        case SIL_SYS:
                to->si_call_addr = compat_ptr(from->si_call_addr);
                to->si_syscall   = from->si_syscall;
                to->si_arch      = from->si_arch;
                break;
        }
        return 0;
}

/*
 * Like copy_siginfo_from_user32(), but the si_signo read from userspace is
 * overwritten with @signo before conversion.
 */
static int __copy_siginfo_from_user32(int signo, struct kernel_siginfo *to,
                                      const struct compat_siginfo __user *ufrom)
{
        struct compat_siginfo tmp;

        if (copy_from_user(&tmp, ufrom, sizeof(tmp)) != 0)
                return -EFAULT;

        tmp.si_signo = signo;

        return post_copy_siginfo_from_user32(to, &tmp);
}

/*
 * Read a compat siginfo from userspace and convert it into @to.
 * Returns 0 on success or -EFAULT if the user copy faults.
 */
int copy_siginfo_from_user32(struct kernel_siginfo *to,
                             const struct compat_siginfo __user *ufrom)
{
        struct compat_siginfo tmp;

        if (copy_from_user(&tmp, ufrom, sizeof(tmp)) != 0)
                return -EFAULT;

        return post_copy_siginfo_from_user32(to, &tmp);
}
#endif /* CONFIG_COMPAT */

/**
 *  do_sigtimedwait - wait for queued signals specified in @which
 *  @which: queued signals to wait for
 *  @info: if non-null, the signal's siginfo is returned here
 *  @ts: upper bound on process time suspension
 */
static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
                    const struct timespec64 *ts)
{
        /* With no @ts, @to stays NULL and is passed as such to the sleep. */
        ktime_t *to = NULL, timeout = KTIME_MAX;
        struct task_struct *tsk = current;
        sigset_t mask = *which;
        enum pid_type type;
        int sig, ret = 0;

        if (ts) {
                if (!timespec64_valid(ts))
                        return -EINVAL;
                timeout = timespec64_to_ktime(*ts);
                to = &timeout;
        }

        /*
         * Invert the set of allowed signals to get those we want to block.
         */
        sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
        signotset(&mask);

        spin_lock_irq(&tsk->sighand->siglock);
        sig = dequeue_signal(tsk, &mask, info, &type);
        /* A zero timeout skips the sleep: one dequeue attempt only. */
        if (!sig && timeout) {
                /*
                 * None ready, temporarily unblock those we're interested
                 * while we are sleeping in so that we'll be awakened when
                 * they arrive. Unblocking is always fine, we can avoid
                 * set_current_blocked().
                 */
                tsk->real_blocked = tsk->blocked;
                sigandsets(&tsk->blocked, &tsk->blocked, &mask);
                recalc_sigpending();
                spin_unlock_irq(&tsk->sighand->siglock);

                __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
                ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns,
                                               HRTIMER_MODE_REL);
                spin_lock_irq(&tsk->sighand->siglock);
                /* Put the saved mask back, then try the dequeue once more. */
                __set_task_blocked(tsk, &tsk->real_blocked);
                sigemptyset(&tsk->real_blocked);
                sig = dequeue_signal(tsk, &mask, info, &type);
        }
        spin_unlock_irq(&tsk->sighand->siglock);

        if (sig)
                return sig;
        /*
         * No signal: -EINTR if the sleep returned non-zero, otherwise
         * -EAGAIN (this includes the case where no sleep happened).
         */
        return ret ? -EINTR : -EAGAIN;
}

/**
 *  sys_rt_sigtimedwait - synchronously wait for queued signals specified
 *                        in @uthese
 *  @uthese: queued signals to wait for
 *  @uinfo: if non-null, the signal's siginfo is returned here
 *  @uts: upper bound on process time suspension
 *  @sigsetsize: size of sigset_t type
 */
SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
                siginfo_t __user *, uinfo,
                const struct __kernel_timespec __user *, uts,
                size_t, sigsetsize)
{
        kernel_siginfo_t info;
        struct timespec64 ts;
        sigset_t these;
        int ret;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (copy_from_user(&these, uthese, sizeof(these)))
                return -EFAULT;

        /* The timeout is optional; a NULL @uts is passed on as NULL. */
        if (uts && get_timespec64(&ts, uts))
                return -EFAULT;

        ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);

        /* siginfo is only copied out when a signal was actually dequeued. */
        if (ret <= 0 || !uinfo)
                return ret;

        return copy_siginfo_to_user(uinfo, &info) ? -EFAULT : ret;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese,
                siginfo_t __user *, uinfo,
                const struct old_timespec32 __user *, uts,
                size_t, sigsetsize)
{
        kernel_siginfo_t info;
        struct timespec64 ts;
        sigset_t these;
        int ret;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (copy_from_user(&these, uthese, sizeof(these)))
                return -EFAULT;

        /* Same as rt_sigtimedwait, but @uts is a struct old_timespec32. */
        if (uts && get_old_timespec32(&ts, uts))
                return -EFAULT;

        ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);

        /* siginfo is only copied out when a signal was actually dequeued. */
        if (ret <= 0 || !uinfo)
                return ret;

        return copy_siginfo_to_user(uinfo, &info) ? -EFAULT : ret;
}
#endif

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,
                struct compat_siginfo __user *, uinfo,
                struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize)
{
        kernel_siginfo_t info;
        struct timespec64 ts;
        sigset_t these;
        long ret;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&these, uthese))
                return -EFAULT;

        /* The timeout is optional; a NULL @uts is passed on as NULL. */
        if (uts && get_timespec64(&ts, uts))
                return -EFAULT;

        ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);

        /* siginfo is only copied out when a signal was actually dequeued. */
        if (ret <= 0 || !uinfo)
                return ret;

        return copy_siginfo_to_user32(uinfo, &info) ? -EFAULT : ret;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
                struct compat_siginfo __user *, uinfo,
                struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)
{
        kernel_siginfo_t info;
        struct timespec64 ts;
        sigset_t these;
        long ret;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&these, uthese))
                return -EFAULT;

        /* The timeout is optional; a NULL @uts is passed on as NULL. */
        if (uts && get_old_timespec32(&ts, uts))
                return -EFAULT;

        ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);

        /* siginfo is only copied out when a signal was actually dequeued. */
        if (ret <= 0 || !uinfo)
                return ret;

        return copy_siginfo_to_user32(uinfo, &info) ? -EFAULT : ret;
}
#endif
#endif

/*
 * Fill @info for a signal @sig sent by current: si_code is SI_TKILL for
 * PIDTYPE_PID and SI_USER for every other @type; si_pid and si_uid
 * identify the sender.
 */
static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info,
                                 enum pid_type type)
{
        clear_siginfo(info);

        info->si_signo = sig;
        info->si_errno = 0;
        if (type == PIDTYPE_PID)
                info->si_code = SI_TKILL;
        else
                info->si_code = SI_USER;

        info->si_pid = task_tgid_vnr(current);
        info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
}

/**
 *  sys_kill - send a signal to a process
 *  @pid: the PID of the process
 *  @sig: signal to be sent
 */
SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
        struct kernel_siginfo kinfo;

        /* PIDTYPE_TGID makes prepare_kill_siginfo() use si_code SI_USER. */
        prepare_kill_siginfo(sig, &kinfo, PIDTYPE_TGID);
        return kill_something_info(sig, &kinfo, pid);
}

/*
 * Verify that the signaler and signalee either are in the same pid namespace
 * or that the signaler's pid namespace is an ancestor of the signalee's pid
 * namespace.
 */
static bool access_pidfd_pidns(struct pid *pid)
{
        struct pid_namespace *mine = task_active_pid_ns(current);
        struct pid_namespace *ns;

        /* Walk from @pid's namespace up through its parents. */
        for (ns = ns_of_pid(pid); ns; ns = ns->parent) {
                if (ns == mine)
                        return true;
        }

        /* Ran out of parents without meeting our own namespace. */
        return false;
}

/*
 * Copy a siginfo from userspace, using the compat conversion when called
 * from a compat syscall and the native one otherwise.
 */
static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo,
                siginfo_t __user *info)
{
#ifdef CONFIG_COMPAT
        /*
         * Avoid hooking up compat syscalls and instead handle necessary
         * conversions here. Note, this is a stop-gap measure and should not be
         * considered a generic solution.
         */
        if (in_compat_syscall()) {
                struct compat_siginfo __user *info32 =
                        (struct compat_siginfo __user *)info;

                return copy_siginfo_from_user32(kinfo, info32);
        }
#endif
        return copy_siginfo_from_user(kinfo, info);
}

/*
 * Resolve @file to a struct pid: pidfd_pid() is tried first and, if it
 * returns an error pointer, tgid_pidfd_to_pid() decides the result.
 */
static struct pid *pidfd_to_pid(const struct file *file)
{
        struct pid *pid = pidfd_pid(file);

        if (IS_ERR(pid))
                return tgid_pidfd_to_pid(file);

        return pid;
}

/* All flag bits accepted by pidfd_send_signal(); each selects a signal scope. */
#define PIDFD_SEND_SIGNAL_FLAGS                            \
        (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \
         PIDFD_SIGNAL_PROCESS_GROUP)

/**
 * sys_pidfd_send_signal - Signal a process through a pidfd
 * @pidfd:  file descriptor of the process
 * @sig:    signal to send
 * @info:   signal info, or NULL to have one generated for @sig
 * @flags:  0, or exactly one of PIDFD_SIGNAL_THREAD,
 *          PIDFD_SIGNAL_THREAD_GROUP or PIDFD_SIGNAL_PROCESS_GROUP
 *
 * With @flags == 0 the signal is sent to the thread group or to the
 * individual thread depending on whether PIDFD_THREAD is set in the pidfd's
 * file flags.  A non-zero @flags overrides that default scope.
 *
 * Return: 0 on success, negative errno on failure
 */
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
                siginfo_t __user *, info, unsigned int, flags)
{
        int ret;
        struct fd f;
        struct pid *pid;
        kernel_siginfo_t kinfo;
        enum pid_type type;

        /* Reject any flag bit outside the supported set. */
        if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
                return -EINVAL;

        /* Ensure that only a single signal scope determining flag is set. */
        if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
                return -EINVAL;

        f = fdget(pidfd);
        if (!f.file)
                return -EBADF;

        /* Is this a pidfd? */
        pid = pidfd_to_pid(f.file);
        if (IS_ERR(pid)) {
                ret = PTR_ERR(pid);
                goto err;
        }

        ret = -EINVAL;
        if (!access_pidfd_pidns(pid))
                goto err;

        /*
         * The two checks above leave only 0 or one supported flag, so
         * (assuming each flag is a single bit) one of these cases always
         * matches and @type is always set.
         */
        switch (flags) {
        case 0:
                /* Infer scope from the type of pidfd. */
                if (f.file->f_flags & PIDFD_THREAD)
                        type = PIDTYPE_PID;
                else
                        type = PIDTYPE_TGID;
                break;
        case PIDFD_SIGNAL_THREAD:
                type = PIDTYPE_PID;
                break;
        case PIDFD_SIGNAL_THREAD_GROUP:
                type = PIDTYPE_TGID;
                break;
        case PIDFD_SIGNAL_PROCESS_GROUP:
                type = PIDTYPE_PGID;
                break;
        }

        if (info) {
                ret = copy_siginfo_from_user_any(&kinfo, info);
                if (unlikely(ret))
                        goto err;

                /* The siginfo must describe the same signal as @sig. */
                ret = -EINVAL;
                if (unlikely(sig != kinfo.si_signo))
                        goto err;

                /* Only allow sending arbitrary signals to yourself. */
                ret = -EPERM;
                if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
                    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
                        goto err;
        } else {
                prepare_kill_siginfo(sig, &kinfo, type);
        }

        if (type == PIDTYPE_PGID)
                ret = kill_pgrp_info(sig, &kinfo, pid);
        else
                ret = kill_pid_info_type(sig, &kinfo, pid, type);
err:
        /* Reached on success too: the fd reference is always dropped here. */
        fdput(f);
        return ret;
}

/*
 * Send @sig with @info to the single task whose vpid is @pid.
 *
 * When @tgid is positive the task must also belong to that thread group.
 * Returns -ESRCH if no matching task is found, the error from
 * check_kill_permission() if that fails, and otherwise the result of
 * do_send_sig_info() (with -ESRCH mapped to 0, see below).
 */
static int
do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
{
        struct task_struct *task;
        int rc = -ESRCH;

        rcu_read_lock();

        task = find_task_by_vpid(pid);
        if (!task)
                goto unlock;
        if (tgid > 0 && task_tgid_vnr(task) != tgid)
                goto unlock;

        rc = check_kill_permission(sig, info, task);
        /*
         * Signal 0 only probes for existence and permission; nothing is
         * delivered in that case.
         */
        if (rc || !sig)
                goto unlock;

        rc = do_send_sig_info(sig, info, task, PIDTYPE_PID);
        /*
         * If lock_task_sighand() failed, act as if the task died right
         * after receiving the signal.  The window is tiny and the signal
         * is private anyway.
         */
        if (unlikely(rc == -ESRCH))
                rc = 0;

unlock:
        rcu_read_unlock();
        return rc;
}

/*
 * Common helper for tkill()/tgkill(): fill in a PIDTYPE_PID kill siginfo
 * for @sig and hand it to do_send_specific().
 */
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
        struct kernel_siginfo kinfo;

        prepare_kill_siginfo(sig, &kinfo, PIDTYPE_PID);
        return do_send_specific(tgid, pid, sig, &kinfo);
}

/**
 *  sys_tgkill - send signal to one specific thread
 *  @tgid: the thread group ID of the thread
 *  @pid: the PID of the thread
 *  @sig: signal to be sent
 *
 *  Both IDs must be positive, otherwise -EINVAL is returned.  Because
 *  @tgid is checked as well, -ESRCH is returned when the PID exists but no
 *  longer belongs to the target process.  This guards against threads
 *  exiting and their PIDs getting reused.
 */
SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
{
        /* Only a single, explicitly named task may be targeted. */
        if (tgid <= 0 || pid <= 0)
                return -EINVAL;

        return do_tkill(tgid, pid, sig);
}

/**
 *  sys_tkill - send signal to one specific task
 *  @pid: the PID of the task
 *  @sig: signal to be sent
 *
 *  The signal goes to exactly one task, even if it is a CLONE_THREAD task.
 *  A non-positive @pid yields -EINVAL.  No thread group check is done
 *  (a tgid of 0 is passed down).
 */
SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
{
        /* Only a single, explicitly named task may be targeted. */
        if (pid <= 0)
                return -EINVAL;

        return do_tkill(0, pid, sig);
}

/*
 * Shared backend of the native and compat rt_sigqueueinfo() syscalls.
 * Returns -EPERM for a forged si_code (see below), otherwise the result of
 * kill_proc_info().
 */
static int do_rt_sigqueueinfo(pid_t pid, int sig, kernel_siginfo_t *info)
{
        /*
         * Not even root can pretend to send signals from the kernel, nor
         * impersonate a kill()/tgkill(), which adds source info.  Such
         * si_code values are only accepted when the caller targets itself.
         */
        if (info->si_code >= 0 || info->si_code == SI_TKILL) {
                if (task_pid_vnr(current) != pid)
                        return -EPERM;
        }

        /* POSIX.1b doesn't mention process groups.  */
        return kill_proc_info(sig, info, pid);
}

/**
 *  sys_rt_sigqueueinfo - queue a signal together with caller-supplied info
 *  @pid: the PID of the target
 *  @sig: signal to be sent
 *  @uinfo: signal info to be sent
 *
 *  Copies @uinfo from user space and forwards it to do_rt_sigqueueinfo().
 *  A failure of the copy is returned unchanged.
 */
SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
                siginfo_t __user *, uinfo)
{
        kernel_siginfo_t info;
        int ret;

        ret = __copy_siginfo_from_user(sig, &info, uinfo);
        if (unlikely(ret))
                return ret;

        return do_rt_sigqueueinfo(pid, sig, &info);
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for rt_sigqueueinfo(): same as the native syscall but
 * the siginfo is read with __copy_siginfo_from_user32().
 */
COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
                        compat_pid_t, pid,
                        int, sig,
                        struct compat_siginfo __user *, uinfo)
{
        kernel_siginfo_t info;
        int ret;

        ret = __copy_siginfo_from_user32(sig, &info, uinfo);
        if (unlikely(ret))
                return ret;

        return do_rt_sigqueueinfo(pid, sig, &info);
}
#endif

/*
 * Shared backend of the native and compat rt_tgsigqueueinfo() syscalls.
 * Returns -EINVAL for non-positive IDs, -EPERM for a forged si_code, and
 * otherwise the result of do_send_specific().
 */
static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, kernel_siginfo_t *info)
{
        /* Only a single, explicitly named task may be targeted. */
        if (tgid <= 0 || pid <= 0)
                return -EINVAL;

        /*
         * Not even root can pretend to send signals from the kernel, nor
         * impersonate a kill()/tgkill(), which adds source info.  Such
         * si_code values are only accepted when the caller targets itself.
         */
        if (info->si_code >= 0 || info->si_code == SI_TKILL) {
                if (task_pid_vnr(current) != pid)
                        return -EPERM;
        }

        return do_send_specific(tgid, pid, sig, info);
}

/*
 * rt_tgsigqueueinfo(): copy the siginfo from @uinfo and queue @sig to the
 * thread @pid of thread group @tgid via do_rt_tgsigqueueinfo().
 */
SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
                siginfo_t __user *, uinfo)
{
        kernel_siginfo_t info;
        int ret;

        ret = __copy_siginfo_from_user(sig, &info, uinfo);
        if (unlikely(ret))
                return ret;

        return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for rt_tgsigqueueinfo(): same as the native syscall
 * but the siginfo is read with __copy_siginfo_from_user32().
 */
COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
                        compat_pid_t, tgid,
                        compat_pid_t, pid,
                        int, sig,
                        struct compat_siginfo __user *, uinfo)
{
        kernel_siginfo_t info;
        int ret;

        ret = __copy_siginfo_from_user32(sig, &info, uinfo);
        if (unlikely(ret))
                return ret;

        return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
#endif

/*
 * Install @action as the handler of @sig for the current task.  When the
 * new action is SIG_IGN, already queued instances of @sig are flushed from
 * both the shared and the private pending queues.
 *
 * For kthreads only, must not be used if cloned with CLONE_SIGHAND
 */
void kernel_sigaction(int sig, __sighandler_t action)
{
        sigset_t mask;

        spin_lock_irq(&current->sighand->siglock);

        current->sighand->action[sig - 1].sa.sa_handler = action;
        if (action != SIG_IGN)
                goto unlock;

        /* Build a one-signal mask and drop anything pending for it. */
        sigemptyset(&mask);
        sigaddset(&mask, sig);

        flush_sigqueue_mask(&mask, &current->signal->shared_pending);
        flush_sigqueue_mask(&mask, &current->pending);
        recalc_sigpending();

unlock:
        spin_unlock_irq(&current->sighand->siglock);
}
EXPORT_SYMBOL(kernel_sigaction);

/*
 * Weak default that does nothing.  do_sigaction() calls this hook with the
 * new and old actions after masking their flags; being __weak, another
 * definition can replace it at link time.
 */
void __weak sigaction_compat_abi(struct k_sigaction *act,
                struct k_sigaction *oact)
{
}

/*
 * Read and/or replace the action of @sig for the current task.
 *
 * @sig:  signal number; must pass valid_signal() and be >= 1
 * @act:  new action to install, or NULL to leave the action unchanged.
 *        Note that act->sa.sa_flags and act->sa.sa_mask are modified here.
 * @oact: if non-NULL, receives the previous action (flags masked to
 *        UAPI_SA_FLAGS)
 *
 * Returns 0 on success, or -EINVAL when @sig is invalid, when @act is given
 * for a kernel-only signal, or when the current action is SA_IMMUTABLE.
 */
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
        struct task_struct *p = current, *t;
        struct k_sigaction *k;
        sigset_t mask;

        if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
                return -EINVAL;

        /* The action table is indexed from 0, signal numbers start at 1. */
        k = &p->sighand->action[sig-1];

        spin_lock_irq(&p->sighand->siglock);
        /* An immutable action can neither be read nor replaced here. */
        if (k->sa.sa_flags & SA_IMMUTABLE) {
                spin_unlock_irq(&p->sighand->siglock);
                return -EINVAL;
        }
        if (oact)
                *oact = *k;

        /*
         * Make sure that we never accidentally claim to support SA_UNSUPPORTED,
         * e.g. by having an architecture use the bit in their uapi.
         */
        BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED);

        /*
         * Clear unknown flag bits in order to allow userspace to detect missing
         * support for flag bits and to allow the kernel to use non-uapi bits
         * internally.
         */
        if (act)
                act->sa.sa_flags &= UAPI_SA_FLAGS;
        if (oact)
                oact->sa.sa_flags &= UAPI_SA_FLAGS;

        sigaction_compat_abi(act, oact);

        if (act) {
                /* SIGKILL and SIGSTOP are always removed from the handler mask. */
                sigdelsetmask(&act->sa.sa_mask,
                              sigmask(SIGKILL) | sigmask(SIGSTOP));
                *k = *act;
                /*
                 * POSIX 3.3.1.3:
                 *  "Setting a signal action to SIG_IGN for a signal that is
                 *   pending shall cause the pending signal to be discarded,
                 *   whether or not it is blocked."
                 *
                 *  "Setting a signal action to SIG_DFL for a signal that is
                 *   pending and whose default action is to ignore the signal
                 *   (for example, SIGCHLD), shall cause the pending signal to
                 *   be discarded, whether or not it is blocked"
                 */
                if (sig_handler_ignored(sig_handler(p, sig), sig)) {
                        sigemptyset(&mask);
                        sigaddset(&mask, sig);
                        /* Flush the shared queue, then every thread's own queue. */
                        flush_sigqueue_mask(&mask, &p->signal->shared_pending);
                        for_each_thread(p, t)
                                flush_sigqueue_mask(&mask, &t->pending);
                }
        }

        spin_unlock_irq(&p->sighand->siglock);
        return 0;
}

#ifdef CONFIG_DYNAMIC_SIGFRAME
/* Take the current task's siglock with interrupts disabled. */
static inline void sigaltstack_lock(void)
        __acquires(&current->sighand->siglock)
{
        spin_lock_irq(&current->sighand->siglock);
}

/* Release the lock taken by sigaltstack_lock(). */
static inline void sigaltstack_unlock(void)
        __releases(&current->sighand->siglock)
{
        spin_unlock_irq(&current->sighand->siglock);
}
#else
/* Without CONFIG_DYNAMIC_SIGFRAME both helpers are empty. */
static inline void sigaltstack_lock(void) { }
static inline void sigaltstack_unlock(void) { }
#endif

/*
 * Query and/or change the alternate signal stack of the current task.
 *
 * @ss:          new settings (kernel copy), or NULL to change nothing
 * @oss:         if non-NULL, filled with the settings in effect on entry
 * @sp:          user stack pointer, used for the on-stack checks
 * @min_ss_size: smallest stack size accepted unless the stack is disabled
 *
 * Returns 0 on success, -EPERM if @sp is currently on the alternate stack,
 * -EINVAL for an unknown mode in ss->ss_flags, or -ENOMEM if the requested
 * size is rejected.
 */
static int
do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp,
                size_t min_ss_size)
{
        struct task_struct *t = current;
        int ret = 0;

        /* Report the old state first; it is filled in even if @ss fails below. */
        if (oss) {
                memset(oss, 0, sizeof(stack_t));
                oss->ss_sp = (void __user *) t->sas_ss_sp;
                oss->ss_size = t->sas_ss_size;
                oss->ss_flags = sas_ss_flags(sp) |
                        (current->sas_ss_flags & SS_FLAG_BITS);
        }

        if (ss) {
                void __user *ss_sp = ss->ss_sp;
                size_t ss_size = ss->ss_size;
                unsigned ss_flags = ss->ss_flags;
                int ss_mode;

                if (unlikely(on_sig_stack(sp)))
                        return -EPERM;

                /* The mode is what remains after stripping the flag bits. */
                ss_mode = ss_flags & ~SS_FLAG_BITS;
                if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
                                ss_mode != 0))
                        return -EINVAL;

                /*
                 * Return before taking any locks if no actual
                 * sigaltstack changes were requested.
                 */
                if (t->sas_ss_sp == (unsigned long)ss_sp &&
                    t->sas_ss_size == ss_size &&
                    t->sas_ss_flags == ss_flags)
                        return 0;

                sigaltstack_lock();
                if (ss_mode == SS_DISABLE) {
                        /* A disabled stack is stored as NULL with size 0. */
                        ss_size = 0;
                        ss_sp = NULL;
                } else {
                        if (unlikely(ss_size < min_ss_size))
                                ret = -ENOMEM;
                        if (!sigaltstack_size_valid(ss_size))
                                ret = -ENOMEM;
                }
                /* Commit only when both size checks passed. */
                if (!ret) {
                        t->sas_ss_sp = (unsigned long) ss_sp;
                        t->sas_ss_size = ss_size;
                        t->sas_ss_flags = ss_flags;
                }
                sigaltstack_unlock();
        }
        return ret;
}

/*
 * sigaltstack(): optionally install the stack described by @uss and
 * optionally report the previous one through @uoss.  Returns -EFAULT if
 * either user copy fails, otherwise the result of do_sigaltstack().
 */
SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
{
        stack_t new, old;
        int err;

        if (uss && copy_from_user(&new, uss, sizeof(stack_t)))
                return -EFAULT;

        err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL,
                             current_user_stack_pointer(), MINSIGSTKSZ);
        if (err || !uoss)
                return err;

        /* Success and the caller asked for the old settings. */
        if (copy_to_user(uoss, &old, sizeof(stack_t)))
                return -EFAULT;

        return 0;
}

/*
 * Re-install the alternate stack described by the user-space stack_t at
 * @uss.  Returns -EFAULT if @uss cannot be read and 0 otherwise.
 */
int restore_altstack(const stack_t __user *uss)
{
        stack_t new;

        if (copy_from_user(&new, uss, sizeof(new)))
                return -EFAULT;

        /* squash all but EFAULT for now: the result is deliberately dropped */
        (void)do_sigaltstack(&new, NULL, current_user_stack_pointer(),
                             MINSIGSTKSZ);
        return 0;
}

/*
 * Store the current task's alternate stack settings into the user-space
 * stack_t at @uss.  Returns the OR of the three __put_user() results, so
 * it is non-zero if any store failed.  @sp is not used here.
 */
int __save_altstack(stack_t __user *uss, unsigned long sp)
{
        struct task_struct *t = current;
        int err;

        err  = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp);
        err |= __put_user(t->sas_ss_flags, &uss->ss_flags);
        err |= __put_user(t->sas_ss_size, &uss->ss_size);

        return err;
}

#ifdef CONFIG_COMPAT
/*
 * Compat backend for sigaltstack(): convert the compat_stack_t at @uss_ptr
 * (if any) to a native stack_t, call do_sigaltstack(), and convert the old
 * settings back into @uoss_ptr (if any).  Returns -EFAULT if a user copy
 * fails, otherwise the result of do_sigaltstack().
 */
static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr,
                                 compat_stack_t __user *uoss_ptr)
{
        stack_t uss, uoss;
        compat_stack_t old;
        int ret;

        if (uss_ptr) {
                compat_stack_t uss32;

                if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
                        return -EFAULT;

                uss.ss_sp = compat_ptr(uss32.ss_sp);
                uss.ss_flags = uss32.ss_flags;
                uss.ss_size = uss32.ss_size;
        }

        ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss,
                             compat_user_stack_pointer(),
                             COMPAT_MINSIGSTKSZ);
        if (ret < 0 || !uoss_ptr)
                return ret;

        /* Zero first so no uninitialised bytes are copied out. */
        memset(&old, 0, sizeof(old));
        old.ss_sp = ptr_to_compat(uoss.ss_sp);
        old.ss_flags = uoss.ss_flags;
        old.ss_size = uoss.ss_size;

        if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t)))
                return -EFAULT;

        return ret;
}

/* Compat sigaltstack() entry point; all work is done by do_compat_sigaltstack(). */
COMPAT_SYSCALL_DEFINE2(sigaltstack,
                        const compat_stack_t __user *, uss_ptr,
                        compat_stack_t __user *, uoss_ptr)
{
        return do_compat_sigaltstack(uss_ptr, uoss_ptr);
}

/*
 * Compat counterpart of restore_altstack(): re-install the alternate stack
 * described by @uss.  Only -EFAULT is reported; every other result of
 * do_compat_sigaltstack() is turned into 0.
 */
int compat_restore_altstack(const compat_stack_t __user *uss)
{
        int err = do_compat_sigaltstack(uss, NULL);

        /* squash all but -EFAULT for now */
        if (err == -EFAULT)
                return err;

        return 0;
}

/*
 * Compat counterpart of __save_altstack(): store the current task's
 * alternate stack settings into the compat_stack_t at @uss.  Returns the
 * OR of the three __put_user() results.  @sp is not used here.
 */
int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
{
        struct task_struct *t = current;
        int err;

        err  = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp),
                          &uss->ss_sp);
        err |= __put_user(t->sas_ss_flags, &uss->ss_flags);
        err |= __put_user(t->sas_ss_size, &uss->ss_size);

        return err;
}
#endif

#ifdef __ARCH_WANT_SYS_SIGPENDING

/**
 *  sys_sigpending - examine pending signals
 *  @uset: where mask of pending signal is returned
 *
 *  Only sizeof(old_sigset_t) bytes of the pending set are copied out.
 *  Returns 0 on success and -EFAULT if @uset cannot be written.
 */
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
{
        sigset_t pending;

        if (sizeof(old_sigset_t) > sizeof(*uset))
                return -EINVAL;

        do_sigpending(&pending);

        return copy_to_user(uset, &pending, sizeof(old_sigset_t)) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat sigpending(): fetch the pending set with do_sigpending() and store
 * only its first word (set.sig[0]) to @set32.  Returns the put_user() result.
 */
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
        sigset_t set;

        do_sigpending(&set);

        return put_user(set.sig[0], set32);
}
#endif

#endif

#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/**
 *  sys_sigprocmask - examine and change blocked signals
 *  @how: whether to add, remove, or set signals
 *  @nset: signals to add or remove (if non-null)
 *  @oset: previous value of signal mask if non-null
 *
 * Some platforms have their own version with special arguments;
 * others support only sys_rt_sigprocmask.
 *
 * Only the first word of the blocked set is handled by this old-style
 * interface.  Returns 0 on success, -EFAULT if a user copy fails and
 * -EINVAL for an unknown @how.
 */

SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
                old_sigset_t __user *, oset)
{
        old_sigset_t old_set, new_set;
        sigset_t new_blocked;

        /* Snapshot the old mask before any change is applied. */
        old_set = current->blocked.sig[0];

        if (nset) {
                if (copy_from_user(&new_set, nset, sizeof(*nset)))
                        return -EFAULT;

                new_blocked = current->blocked;

                if (how == SIG_BLOCK)
                        sigaddsetmask(&new_blocked, new_set);
                else if (how == SIG_UNBLOCK)
                        sigdelsetmask(&new_blocked, new_set);
                else if (how == SIG_SETMASK)
                        new_blocked.sig[0] = new_set;
                else
                        return -EINVAL;

                set_current_blocked(&new_blocked);
        }

        if (oset && copy_to_user(oset, &old_set, sizeof(*oset)))
                return -EFAULT;

        return 0;
}
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */

#ifndef CONFIG_ODD_RT_SIGACTION
/**
 *  sys_rt_sigaction - alter an action taken by a process
 *  @sig: signal to be sent
 *  @act: new sigaction
 *  @oact: used to save the previous sigaction
 *  @sigsetsize: size of sigset_t type
 *
 *  Returns 0 on success, -EINVAL if @sigsetsize does not match
 *  sizeof(sigset_t), -EFAULT if a user copy fails, or the error returned by
 *  do_sigaction().
 */
SYSCALL_DEFINE4(rt_sigaction, int, sig,
                const struct sigaction __user *, act,
                struct sigaction __user *, oact,
                size_t, sigsetsize)
{
        struct k_sigaction new_sa, old_sa;
        int ret;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (act) {
                if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
                        return -EFAULT;
        }

        ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
        if (ret)
                return ret;

        if (!oact)
                return 0;

        return copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)) ? -EFAULT : 0;
}
#ifdef CONFIG_COMPAT
/*
 * Compat rt_sigaction(): read the new action field by field from the
 * compat layout, call do_sigaction(), and write the old action back field
 * by field.  Returns -EINVAL for a wrong @sigsetsize, -EFAULT if reading
 * @act fails, the error from do_sigaction(), or the OR of the user store
 * results when writing @oact.
 */
COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
                const struct compat_sigaction __user *, act,
                struct compat_sigaction __user *, oact,
                compat_size_t, sigsetsize)
{
        struct k_sigaction new_ka, old_ka;
#ifdef __ARCH_HAS_SA_RESTORER
        compat_uptr_t restorer;
#endif
        int ret;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(compat_sigset_t))
                return -EINVAL;

        if (act) {
                compat_uptr_t handler;

                /* Accumulate all fetch results and check them once. */
                ret = get_user(handler, &act->sa_handler);
                new_ka.sa.sa_handler = compat_ptr(handler);
#ifdef __ARCH_HAS_SA_RESTORER
                ret |= get_user(restorer, &act->sa_restorer);
                new_ka.sa.sa_restorer = compat_ptr(restorer);
#endif
                ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
                ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
                if (ret)
                        return -EFAULT;
        }

        ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
        if (ret || !oact)
                return ret;

        ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
                       &oact->sa_handler);
        ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
                                 sizeof(oact->sa_mask));
        ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
#ifdef __ARCH_HAS_SA_RESTORER
        ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
                        &oact->sa_restorer);
#endif
        return ret;
}
#endif
#endif /* !CONFIG_ODD_RT_SIGACTION */

#ifdef CONFIG_OLD_SIGACTION
/*
 * Old-style sigaction(): the user structure carries a single-word mask.
 * Returns -EFAULT if @act cannot be read or @oact cannot be written,
 * otherwise the result of do_sigaction().
 */
SYSCALL_DEFINE3(sigaction, int, sig,
                const struct old_sigaction __user *, act,
                struct old_sigaction __user *, oact)
{
        struct k_sigaction new_ka, old_ka;
        int ret;

        if (act) {
                old_sigset_t mask;

                if (!access_ok(act, sizeof(*act)) ||
                    __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
                    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
                    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
                    __get_user(mask, &act->sa_mask))
                        return -EFAULT;
#ifdef __ARCH_HAS_KA_RESTORER
                new_ka.ka_restorer = NULL;
#endif
                /* Expand the one-word user mask into a full sigset_t. */
                siginitset(&new_ka.sa.sa_mask, mask);
        }

        ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
        if (ret || !oact)
                return ret;

        if (!access_ok(oact, sizeof(*oact)) ||
            __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
            __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
            __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
            __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
                return -EFAULT;

        return 0;
}
#endif
#ifdef CONFIG_COMPAT_OLD_SIGACTION
/*
 * Compat old-style sigaction(): like the native variant, but handler and
 * restorer are compat pointers that are converted in both directions.
 * Returns -EFAULT if @act cannot be read or @oact cannot be written,
 * otherwise the result of do_sigaction().
 */
COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
                const struct compat_old_sigaction __user *, act,
                struct compat_old_sigaction __user *, oact)
{
        struct k_sigaction new_ka, old_ka;
        compat_uptr_t handler, restorer;
        compat_old_sigset_t mask;
        int ret;

        if (act) {
                if (!access_ok(act, sizeof(*act)) ||
                    __get_user(handler, &act->sa_handler) ||
                    __get_user(restorer, &act->sa_restorer) ||
                    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
                    __get_user(mask, &act->sa_mask))
                        return -EFAULT;

#ifdef __ARCH_HAS_KA_RESTORER
                new_ka.ka_restorer = NULL;
#endif
                new_ka.sa.sa_handler = compat_ptr(handler);
                new_ka.sa.sa_restorer = compat_ptr(restorer);
                /* Expand the one-word user mask into a full sigset_t. */
                siginitset(&new_ka.sa.sa_mask, mask);
        }

        ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
        if (ret || !oact)
                return ret;

        if (!access_ok(oact, sizeof(*oact)) ||
            __put_user(ptr_to_compat(old_ka.sa.sa_handler),
                       &oact->sa_handler) ||
            __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
                       &oact->sa_restorer) ||
            __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
            __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
                return -EFAULT;

        return 0;
}
#endif

#ifdef CONFIG_SGETMASK_SYSCALL

/*
 * For backwards compatibility.  Functionality superseded by sigprocmask.
 *
 * sgetmask() returns the first word of the current task's blocked set.
 */
SYSCALL_DEFINE0(sgetmask)
{
        /* SMP safe */
        return current->blocked.sig[0];
}

/*
 * ssetmask(): replace the blocked set with one built from @newmask by
 * siginitset() and return the first word of the previous blocked set.
 */
SYSCALL_DEFINE1(ssetmask, int, newmask)
{
        sigset_t newset;
        int old;

        old = current->blocked.sig[0];

        siginitset(&newset, newmask);
        set_current_blocked(&newset);

        return old;
}
#endif /* CONFIG_SGETMASK_SYSCALL */

#ifdef __ARCH_WANT_SYS_SIGNAL
/*
 * For backwards compatibility.  Functionality superseded by sigaction.
 *
 * signal() installs @handler with SA_ONESHOT | SA_NOMASK and an empty
 * mask.  It returns the previous handler on success or the error from
 * do_sigaction().
 */
SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
{
        struct k_sigaction new_sa, old_sa;
        int ret;

        sigemptyset(&new_sa.sa.sa_mask);
        new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
        new_sa.sa.sa_handler = handler;

        ret = do_sigaction(sig, &new_sa, &old_sa);
        if (ret)
                return ret;

        return (unsigned long)old_sa.sa.sa_handler;
}
#endif /* __ARCH_WANT_SYS_SIGNAL */

#ifdef __ARCH_WANT_SYS_PAUSE

/*
 * pause(): sleep in TASK_INTERRUPTIBLE until signal_pending(current) is
 * true, then return -ERESTARTNOHAND.
 */
SYSCALL_DEFINE0(pause)
{
        /* Re-check after every wakeup; only a pending signal ends the loop. */
        while (!signal_pending(current)) {
                __set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }
        return -ERESTARTNOHAND;
}

#endif

/*
 * Common backend of the sigsuspend family: save the current blocked set in
 * current->saved_sigmask, install @set as the blocked set and sleep until a
 * signal is pending.  Always returns -ERESTARTNOHAND.
 */
static int sigsuspend(sigset_t *set)
{
        current->saved_sigmask = current->blocked;
        set_current_blocked(set);

        /* Re-check after every wakeup; only a pending signal ends the loop. */
        while (!signal_pending(current)) {
                __set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }
        /* NOTE(review): presumably arranges for saved_sigmask to be restored later - confirm. */
        set_restore_sigmask();
        return -ERESTARTNOHAND;
}

/**
 *  sys_rt_sigsuspend - replace the signal mask for a value with the
 *        @unewset value until a signal is received
 *  @unewset: new signal mask value
 *  @sigsetsize: size of sigset_t type
 *
 *  Returns -EINVAL if @sigsetsize does not match sizeof(sigset_t), -EFAULT
 *  if @unewset cannot be read, otherwise the result of sigsuspend().
 */
SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
{
        sigset_t mask;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (copy_from_user(&mask, unewset, sizeof(mask)))
                return -EFAULT;

        return sigsuspend(&mask);
}
 
#ifdef CONFIG_COMPAT
/*
 * Compat rt_sigsuspend(): same as the native syscall, but the mask is read
 * with get_compat_sigset().
 */
COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
{
        sigset_t mask;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&mask, unewset))
                return -EFAULT;

        return sigsuspend(&mask);
}
#endif

#ifdef CONFIG_OLD_SIGSUSPEND
/*
 * Old one-argument sigsuspend(): build a sigset_t from the single-word
 * @mask and suspend with it.
 */
SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
{
        sigset_t newset;

        siginitset(&newset, mask);

        return sigsuspend(&newset);
}
#endif
#ifdef CONFIG_OLD_SIGSUSPEND3
/*
 * Old three-argument sigsuspend(): the first two arguments are ignored;
 * a sigset_t is built from the single-word @mask and used to suspend.
 */
SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
{
        sigset_t newset;

        siginitset(&newset, mask);

        return sigsuspend(&newset);
}
#endif

/*
 * Weak default: no name for @vma, always returns NULL.  Being __weak,
 * another definition can replace it at link time.
 */
__weak const char *arch_vma_name(struct vm_area_struct *vma)
{
        return NULL;
}

/*
 * Compile-time layout checks for the siginfo structures.  The body consists
 * only of BUILD_BUG_ON() assertions (one pair selected by a constant
 * sizeof comparison).
 */
static inline void siginfo_buildtime_checks(void)
{
        BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE);

        /* Verify the offsets in the two siginfos match */
#define CHECK_OFFSET(field) \
        BUILD_BUG_ON(offsetof(siginfo_t, field) != offsetof(kernel_siginfo_t, field))

        /* kill */
        CHECK_OFFSET(si_pid);
        CHECK_OFFSET(si_uid);

        /* timer */
        CHECK_OFFSET(si_tid);
        CHECK_OFFSET(si_overrun);
        CHECK_OFFSET(si_value);

        /* rt */
        CHECK_OFFSET(si_pid);
        CHECK_OFFSET(si_uid);
        CHECK_OFFSET(si_value);

        /* sigchld */
        CHECK_OFFSET(si_pid);
        CHECK_OFFSET(si_uid);
        CHECK_OFFSET(si_status);
        CHECK_OFFSET(si_utime);
        CHECK_OFFSET(si_stime);

        /* sigfault */
        CHECK_OFFSET(si_addr);
        CHECK_OFFSET(si_trapno);
        CHECK_OFFSET(si_addr_lsb);
        CHECK_OFFSET(si_lower);
        CHECK_OFFSET(si_upper);
        CHECK_OFFSET(si_pkey);
        CHECK_OFFSET(si_perf_data);
        CHECK_OFFSET(si_perf_type);
        CHECK_OFFSET(si_perf_flags);

        /* sigpoll */
        CHECK_OFFSET(si_band);
        CHECK_OFFSET(si_fd);

        /* sigsys */
        CHECK_OFFSET(si_call_addr);
        CHECK_OFFSET(si_syscall);
        CHECK_OFFSET(si_arch);
#undef CHECK_OFFSET

        /* usb asyncio */
        /* si_pid must sit at the same offset as si_addr. */
        BUILD_BUG_ON(offsetof(struct siginfo, si_pid) !=
                     offsetof(struct siginfo, si_addr));
        if (sizeof(int) == sizeof(void __user *)) {
                BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) !=
                             sizeof(void __user *));
        } else {
                /* Otherwise si_pid + si_uid together must span one pointer. */
                BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) +
                              sizeof_field(struct siginfo, si_uid)) !=
                             sizeof(void __user *));
                BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) !=
                             offsetof(struct siginfo, si_uid));
        }
#ifdef CONFIG_COMPAT
        BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) !=
                     offsetof(struct compat_siginfo, si_addr));
        BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
                     sizeof(compat_uptr_t));
        BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
                     sizeof_field(struct siginfo, si_pid));
#endif
}

#if defined(CONFIG_SYSCTL)
/*
 * Sysctl entries registered under "debug" by init_signal_sysctls().  The
 * only entry, "exception-trace", exposes show_unhandled_signals as an int
 * with mode 0644 and exists only with CONFIG_SYSCTL_EXCEPTION_TRACE.
 */
static struct ctl_table signal_debug_table[] = {
#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
        {
                .procname        = "exception-trace",
                .data                = &show_unhandled_signals,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec
        },
#endif
};

/* Register signal_debug_table under the "debug" sysctl directory; always returns 0. */
static int __init init_signal_sysctls(void)
{
        register_sysctl_init("debug", signal_debug_table);
        return 0;
}
early_initcall(init_signal_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * Boot-time setup for the signal code: reference the build-time siginfo
 * layout checks and create the cache stored in sigqueue_cachep.
 */
void __init signals_init(void)
{
        siginfo_buildtime_checks();

        /* Cache for struct sigqueue, created with SLAB_PANIC | SLAB_ACCOUNT. */
        sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT);
}

#ifdef CONFIG_KGDB_KDB
#include <linux/kdb.h>
/*
 * kdb_send_sig - Allows kdb to send signals without exposing
 * signal internals.  This function checks if the required locks are
 * available before calling the main signal code, to avoid kdb
 * deadlocks.
 *
 * @t:   task to signal
 * @sig: signal number to send
 *
 * The outcome is only reported through kdb_printf(); nothing is returned.
 */
void kdb_send_sig(struct task_struct *t, int sig)
{
        /* Target of the previous call, used to detect a repeated request. */
        static struct task_struct *kdb_prev_t;
        int new_t, ret;
        /* Never wait for the lock: give up if it is held elsewhere. */
        if (!spin_trylock(&t->sighand->siglock)) {
                kdb_printf("Can't do kill command now.\n"
                           "The sigmask lock is held somewhere else in "
                           "kernel, try again later\n");
                return;
        }
        new_t = kdb_prev_t != t;
        kdb_prev_t = t;
        /*
         * Refuse the first attempt on a task that is not running; issuing
         * the command again for the same task sends the signal anyway.
         */
        if (!task_is_running(t) && new_t) {
                spin_unlock(&t->sighand->siglock);
                kdb_printf("Process is not RUNNING, sending a signal from "
                           "kdb risks deadlock\n"
                           "on the run queue locks. "
                           "The signal has _not_ been sent.\n"
                           "Reissue the kill command if you want to risk "
                           "the deadlock.\n");
                return;
        }
        ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
        spin_unlock(&t->sighand->siglock);
        if (ret)
                kdb_printf("Fail to deliver Signal %d to process %d.\n",
                           sig, t->pid);
        else
                kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
}
#endif        /* CONFIG_KGDB_KDB */























































































































    1 











    1 



































    1 




























































































































































































































































































































    3 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/err.h>
#include <linux/uuid.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "transaction.h"
#include "disk-io.h"
#include "qgroup.h"
#include "space-info.h"
#include "accessors.h"
#include "root-tree.h"
#include "orphan.h"

/*
 * Read a root item from the tree. In case we detect a root item smaller than
 * sizeof(root_item), we know it's an old version of the root structure and
 * initialize all new fields to zero. The same happens if we detect mismatching
 * generation numbers as then we know the root was once mounted with an older
 * kernel that was not aware of the root item structure change.
 */
static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
                                struct btrfs_root_item *item)
{
        const u32 item_size = btrfs_item_size(eb, slot);

        /* Never copy more than the in-memory structure can hold. */
        read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
                           min_t(u32, item_size, sizeof(*item)));

        if (item_size >= sizeof(*item)) {
                /*
                 * Full-sized item: the new fields are only kept when both
                 * generation values agree.
                 */
                if (btrfs_root_generation(item) ==
                    btrfs_root_generation_v2(item))
                        return;

                /* A zero generation_v2 is reset silently, without a warning. */
                if (btrfs_root_generation_v2(item) != 0)
                        btrfs_warn(eb->fs_info,
                                        "mismatching generation and generation_v2 found in root item. This root was probably mounted with an older kernel. Resetting all new fields.");
        }

        /*
         * Either the item is shorter than the current structure or the
         * generations disagree: zero every member from generation_v2 onwards
         * and give the item a fresh random uuid.
         */
        memset_startat(item, 0, generation_v2);
        generate_random_guid(item->uuid);
}

/*
 * Lookup the root by the key.
 *
 * root: the root of the root tree
 * search_key: the key to search
 * path: the path we search
 * root_item: the root item of the tree we look for
 * root_key: the root key of the tree we look for
 *
 * If ->offset of 'search_key' is -1ULL, the caller does not know the offset
 * of the root item, so look up the root with the highest offset for the
 * given objectid.
 *
 * If we find something return 0, otherwise > 0, < 0 on error.
 */
int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
                    struct btrfs_path *path, struct btrfs_root_item *root_item,
                    struct btrfs_key *root_key)
{
        struct btrfs_key found_key;
        struct extent_buffer *leaf;
        int slot;
        int ret;

        ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
        if (ret < 0)
                return ret;

        if (search_key->offset == -1ULL) {
                /*
                 * Key with offset -1 found, there would have to exist a root
                 * with such id, but this is out of the valid range.
                 */
                if (ret == 0) {
                        ret = -EUCLEAN;
                        goto out;
                }
                /* No earlier slot to step back to; ret is still > 0 here. */
                if (path->slots[0] == 0)
                        goto out;
                /* Step back to the item just before the search position. */
                path->slots[0]--;
                ret = 0;
        } else if (ret > 0) {
                /* The search key is exact and it was not found. */
                goto out;
        }

        leaf = path->nodes[0];
        slot = path->slots[0];
        btrfs_item_key_to_cpu(leaf, &found_key, slot);

        if (found_key.objectid == search_key->objectid &&
            found_key.type == BTRFS_ROOT_ITEM_KEY) {
                /* Both output parameters are optional. */
                if (root_item)
                        btrfs_read_root_item(leaf, slot, root_item);
                if (root_key)
                        memcpy(root_key, &found_key, sizeof(found_key));
        } else {
                /* The item at this slot is not a root item for this objectid. */
                ret = 1;
        }
out:
        btrfs_release_path(path);
        return ret;
}

void btrfs_set_root_node(struct btrfs_root_item *item,
                         struct extent_buffer *node)
{
        /* Generation and level are taken from the node's header. */
        btrfs_set_root_generation(item, btrfs_header_generation(node));
        btrfs_set_root_level(item, btrfs_header_level(node));
        /* The item's bytenr is the node's start offset. */
        btrfs_set_root_bytenr(item, node->start);
}

/*
 * Copy the data in 'item' into the btree.
 *
 * trans: transaction handle used for the tree modifications
 * root:  tree that is searched for 'key'
 * key:   key of the root item to overwrite
 * item:  in-memory root item; its generation_v2 is set to its generation
 *        before it is written out
 *
 * If the existing on-disk item is smaller than sizeof(*item), it is deleted
 * and re-inserted at full size before the data is written.
 *
 * Returns -ENOMEM if the path cannot be allocated, -EUCLEAN (after aborting
 * the transaction) if 'key' is not found, or the negative error of a failed
 * search/delete/insert. Otherwise the result of the last tree operation is
 * returned, which is 0 when the item was found and needed no resizing.
 */
int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_key *key, struct btrfs_root_item
                      *item)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        struct extent_buffer *l;
        int ret;
        int slot;
        unsigned long ptr;
        u32 old_len;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
        if (ret < 0)
                goto out;

        /* A positive result means there is no item with exactly this key. */
        if (ret > 0) {
                btrfs_crit(fs_info,
                        "unable to find root key (%llu %u %llu) in tree %llu",
                        key->objectid, key->type, key->offset, btrfs_root_id(root));
                ret = -EUCLEAN;
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        l = path->nodes[0];
        slot = path->slots[0];
        ptr = btrfs_item_ptr_offset(l, slot);
        old_len = btrfs_item_size(l, slot);

        /*
         * If this is the first time we update the root item which originated
         * from an older kernel, we need to enlarge the item size to make room
         * for the added fields.
         */
        if (old_len < sizeof(*item)) {
                /*
                 * Replace the short item: search again, delete it, then
                 * insert an empty item of the full size under the same key.
                 * Any failure in this sequence aborts the transaction.
                 */
                btrfs_release_path(path);
                ret = btrfs_search_slot(trans, root, key, path,
                                -1, 1);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }

                ret = btrfs_del_item(trans, root, path);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
                btrfs_release_path(path);
                ret = btrfs_insert_empty_item(trans, root, path,
                                key, sizeof(*item));
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
                /* Refresh leaf, slot and offset from the path after the insert. */
                l = path->nodes[0];
                slot = path->slots[0];
                ptr = btrfs_item_ptr_offset(l, slot);
        }

        /*
         * Update generation_v2 so at the next mount we know the new root
         * fields are valid.
         */
        btrfs_set_root_generation_v2(item, btrfs_root_generation(item));

        /* Write the whole structure into the leaf and mark the leaf dirty. */
        write_extent_buffer(l, item, ptr, sizeof(*item));
        btrfs_mark_buffer_dirty(trans, path->nodes[0]);
out:
        btrfs_free_path(path);
        return ret;
}

int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      const struct btrfs_key *key, struct btrfs_root_item *item)
{
        int ret;

        /*
         * Keep generation_v2 equal to generation before the item is
         * inserted, the same way btrfs_update_root() does before writing.
         */
        btrfs_set_root_generation_v2(item, btrfs_root_generation(item));

        ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
        return ret;
}

/*
 * Walk the orphan items stored in the tree root and process the root each
 * one refers to.
 *
 * Orphan items are keyed (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY,
 * offset); the key offset is used as the objectid of the root to look up.
 * For every orphan item:
 *  - if the root cannot be found (-ENOENT), the orphan item is deleted in a
 *    joined transaction;
 *  - if the root has zero refs, it is flagged as a dead tree and passed to
 *    btrfs_add_dead_root(); a non-zero drop_progress additionally sets the
 *    unfinished-drop bits on the fs and the root.
 *
 * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the
 * negative error that stopped the walk.
 */
int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_root *root;
        int err = 0;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Start at the lowest possible orphan item key. */
        key.objectid = BTRFS_ORPHAN_OBJECTID;
        key.type = BTRFS_ORPHAN_ITEM_KEY;
        key.offset = 0;

        while (1) {
                u64 root_objectid;

                ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
                if (ret < 0) {
                        err = ret;
                        break;
                }

                leaf = path->nodes[0];
                /* The slot is past the end of this leaf: move to the next one. */
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(tree_root, path);
                        if (ret < 0)
                                err = ret;
                        /* Non-zero: either an error or no further leaf. */
                        if (ret != 0)
                                break;
                        leaf = path->nodes[0];
                }

                /* Copy the key out, then drop the path before further work. */
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                btrfs_release_path(path);

                /* Stop at the first item that is not an orphan item. */
                if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
                    key.type != BTRFS_ORPHAN_ITEM_KEY)
                        break;

                /* Remember this root id and advance the key for the next search. */
                root_objectid = key.offset;
                key.offset++;

                root = btrfs_get_fs_root(fs_info, root_objectid, false);
                err = PTR_ERR_OR_ZERO(root);
                if (err && err != -ENOENT) {
                        break;
                } else if (err == -ENOENT) {
                        /* No such root: remove the stale orphan item. */
                        struct btrfs_trans_handle *trans;

                        /* The path was already released above after reading the key. */
                        btrfs_release_path(path);

                        trans = btrfs_join_transaction(tree_root);
                        if (IS_ERR(trans)) {
                                err = PTR_ERR(trans);
                                btrfs_handle_fs_error(fs_info, err,
                                            "Failed to start trans to delete orphan item");
                                break;
                        }
                        err = btrfs_del_orphan_item(trans, tree_root,
                                                    root_objectid);
                        btrfs_end_transaction(trans);
                        if (err) {
                                btrfs_handle_fs_error(fs_info, err,
                                            "Failed to delete root orphan item");
                                break;
                        }
                        continue;
                }

                WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
                if (btrfs_root_refs(&root->root_item) == 0) {
                        struct btrfs_key drop_key;

                        btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
                        /*
                         * If we have a non-zero drop_progress then we know we
                         * made it partly through deleting this snapshot, and
                         * thus we need to make sure we block any balance from
                         * happening until this snapshot is completely dropped.
                         */
                        if (drop_key.objectid != 0 || drop_key.type != 0 ||
                            drop_key.offset != 0) {
                                set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
                                set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
                        }

                        set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
                        btrfs_add_dead_root(root);
                }
                /* Drop the reference obtained from btrfs_get_fs_root(). */
                btrfs_put_root(root);
        }

        btrfs_free_path(path);
        return err;
}

/* drop the root item for 'key' from the tree root */
int btrfs_del_root(struct btrfs_trans_handle *trans,
                   const struct btrfs_key *key)
{
        struct btrfs_root *tree_root = trans->fs_info->tree_root;
        struct btrfs_path *path = btrfs_alloc_path();
        int ret;

        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, tree_root, key, path, -1, 1);
        if (ret == 0) {
                /* Exact match: remove the item the path points at. */
                ret = btrfs_del_item(trans, tree_root, path);
        } else if (ret > 0) {
                /* The root must exist but we did not find it by the key. */
                ret = -EUCLEAN;
        }
        /* A negative search result is returned to the caller unchanged. */

        btrfs_free_path(path);
        return ret;
}

/*
 * Delete the pair of root ref items that link 'root_id' and 'ref_id'.
 *
 * Two passes are made over the tree root: first the item keyed
 * (root_id, BTRFS_ROOT_BACKREF_KEY, ref_id), then the item keyed
 * (ref_id, BTRFS_ROOT_REF_KEY, root_id). In each pass the item is deleted
 * only if its dirid and name match 'dirid' and 'name'; the sequence number
 * stored in the item is written to *sequence before the deletion.
 *
 * Returns 0 when both items were deleted, -ENOMEM if the path cannot be
 * allocated, -ENOENT if an item is missing or does not match, or the error
 * from the search/delete call.
 */
int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
                       u64 ref_id, u64 dirid, u64 *sequence,
                       const struct fscrypt_str *name)
{
        struct btrfs_root *tree_root = trans->fs_info->tree_root;
        struct btrfs_path *path;
        struct btrfs_root_ref *ref;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* First pass: the back reference item. */
        key.objectid = root_id;
        key.type = BTRFS_ROOT_BACKREF_KEY;
        key.offset = ref_id;
again:
        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
        if (ret < 0) {
                goto out;
        } else if (ret == 0) {
                leaf = path->nodes[0];
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_root_ref);
                /* The name bytes are stored right after the ref structure. */
                ptr = (unsigned long)(ref + 1);
                if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
                    (btrfs_root_ref_name_len(leaf, ref) != name->len) ||
                    memcmp_extent_buffer(leaf, name->name, ptr, name->len)) {
                        ret = -ENOENT;
                        goto out;
                }
                *sequence = btrfs_root_ref_sequence(leaf, ref);

                ret = btrfs_del_item(trans, tree_root, path);
                if (ret)
                        goto out;
        } else {
                /* No item with exactly this key. */
                ret = -ENOENT;
                goto out;
        }

        /* Second pass: switch to the forward reference key and repeat once. */
        if (key.type == BTRFS_ROOT_BACKREF_KEY) {
                btrfs_release_path(path);
                key.objectid = ref_id;
                key.type = BTRFS_ROOT_REF_KEY;
                key.offset = root_id;
                goto again;
        }

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * add a btrfs_root_ref item.  type is either BTRFS_ROOT_REF_KEY
 * or BTRFS_ROOT_BACKREF_KEY.
 *
 * The dirid, sequence, name and name_len refer to the directory entry
 * that is referencing the root.
 *
 * For a forward ref, the root_id is the id of the tree referencing
 * the root and ref_id is the id of the subvol  or snapshot.
 *
 * For a back ref the root_id is the id of the subvol or snapshot and
 * ref_id is the id of the tree referencing it.
 *
 * Will return 0, -ENOMEM, or anything from the CoW path
 */
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
                       u64 ref_id, u64 dirid, u64 sequence,
                       const struct fscrypt_str *name)
{
        struct btrfs_root *tree_root = trans->fs_info->tree_root;
        struct btrfs_key key;
        int ret;
        struct btrfs_path *path;
        struct btrfs_root_ref *ref;
        struct extent_buffer *leaf;
        unsigned long ptr;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* First pass inserts the back reference item. */
        key.objectid = root_id;
        key.type = BTRFS_ROOT_BACKREF_KEY;
        key.offset = ref_id;
again:
        /* Reserve room for the ref structure followed by the name bytes. */
        ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
                                      sizeof(*ref) + name->len);
        if (ret) {
                /* Any insertion failure aborts the transaction. */
                btrfs_abort_transaction(trans, ret);
                btrfs_free_path(path);
                return ret;
        }

        leaf = path->nodes[0];
        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
        btrfs_set_root_ref_dirid(leaf, ref, dirid);
        btrfs_set_root_ref_sequence(leaf, ref, sequence);
        btrfs_set_root_ref_name_len(leaf, ref, name->len);
        /* The name is written immediately after the ref structure. */
        ptr = (unsigned long)(ref + 1);
        write_extent_buffer(leaf, name->name, ptr, name->len);
        btrfs_mark_buffer_dirty(trans, leaf);

        /*
         * Second pass: insert the forward reference with objectid and offset
         * swapped. Both items carry the same dirid, sequence and name.
         */
        if (key.type == BTRFS_ROOT_BACKREF_KEY) {
                btrfs_release_path(path);
                key.objectid = ref_id;
                key.type = BTRFS_ROOT_REF_KEY;
                key.offset = root_id;
                goto again;
        }

        btrfs_free_path(path);
        return 0;
}

/*
 * Old btrfs forgets to init root_item->flags and root_item->byte_limit
 * for subvolumes. To work around this problem, we steal a bit from
 * root_item->inode_item->flags, and use it to indicate if those fields
 * have been properly initialized.
 */
void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
{
        u64 flags = btrfs_stack_inode_flags(&root_item->inode);

        /* The marker bit is already set: nothing to initialize. */
        if (flags & BTRFS_INODE_ROOT_ITEM_INIT)
                return;

        /* Zero the root flags and limit, then record that this was done. */
        btrfs_set_root_flags(root_item, 0);
        btrfs_set_root_limit(root_item, 0);
        btrfs_set_stack_inode_flags(&root_item->inode,
                                    flags | BTRFS_INODE_ROOT_ITEM_INIT);
}

void btrfs_update_root_times(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
{
        struct btrfs_root_item *item = &root->root_item;
        struct timespec64 ct;

        ktime_get_real_ts64(&ct);
        spin_lock(&root->root_item_lock);
        btrfs_set_root_ctransid(item, trans->transid);
        btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
        btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
        spin_unlock(&root->root_item_lock);
}

/*
 * Reserve space for subvolume operation.
 *
 * root: the root of the parent directory
 * rsv: block reservation
 * items: the number of items that we need to reserve space for
 * use_global_rsv: allow fallback to the global block reservation
 *
 * This function is used to reserve the space for snapshot/subvolume
 * creation and deletion. Those operations are different from the
 * common file/directory operations: they change two fs/file trees
 * and the root tree, and the number of items that the qgroup reserves
 * is different from the free space reservation. So we can not use
 * the space reservation mechanism in start_transaction().
 */
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                                     struct btrfs_block_rsv *rsv, int items,
                                     bool use_global_rsv)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 qgroup_bytes = 0;
        u64 meta_bytes;
        int ret;

        if (btrfs_qgroup_enabled(fs_info)) {
                /* One for parent inode, two for dir entries */
                qgroup_bytes = 3 * fs_info->nodesize;
                ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_bytes,
                                                         true, false);
                if (ret)
                        return ret;
        }

        meta_bytes = btrfs_calc_insert_metadata_size(fs_info, items);
        rsv->space_info = btrfs_find_space_info(fs_info,
                                            BTRFS_BLOCK_GROUP_METADATA);

        ret = btrfs_block_rsv_add(fs_info, rsv, meta_bytes,
                                  BTRFS_RESERVE_FLUSH_ALL);
        /* On -ENOSPC, optionally take the space from the global reserve. */
        if (ret == -ENOSPC && use_global_rsv)
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, meta_bytes, true);

        if (ret) {
                /* Give back the qgroup reservation made above, if any. */
                if (qgroup_bytes)
                        btrfs_qgroup_free_meta_prealloc(root, qgroup_bytes);
                return ret;
        }

        /* Success: account the qgroup bytes in the block reservation. */
        spin_lock(&rsv->lock);
        rsv->qgroup_rsv_reserved += qgroup_bytes;
        spin_unlock(&rsv->lock);

        return ret;
}
































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 



















    1 



    1 




























































    4 



    4 




















    3 



    5 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/userfaultfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *  Copyright (C) 2008-2009 Red Hat, Inc.
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  Some part derived from fs/eventfd.c (anon inode setup) and
 *  mm/ksm.c (mm hashing).
 */

#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swapops.h>
#include <linux/miscdevice.h>
#include <linux/uio.h>

/*
 * Integer backing the "unprivileged_userfaultfd" sysctl entry defined below.
 * Its consumers are outside this section of the file.
 */
static int sysctl_unprivileged_userfaultfd __read_mostly;

#ifdef CONFIG_SYSCTL
/*
 * Sysctl table exposing sysctl_unprivileged_userfaultfd as an integer with
 * mode 0644.  The proc_dointvec_minmax handler is given SYSCTL_ZERO and
 * SYSCTL_ONE as extra1/extra2, i.e. the accepted range is 0..1.
 */
static struct ctl_table vm_userfaultfd_table[] = {
        {
                .procname        = "unprivileged_userfaultfd",
                .data                = &sysctl_unprivileged_userfaultfd,
                .maxlen                = sizeof(sysctl_unprivileged_userfaultfd),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
};
#endif

/*
 * Slab cache for struct userfaultfd_ctx objects: dup_userfaultfd() allocates
 * from it and userfaultfd_ctx_put() frees back into it.
 */
static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;

/*
 * Bookkeeping for one userfaultfd context being duplicated across fork.
 * Entries are created by dup_userfaultfd() and consumed by
 * dup_userfaultfd_complete().
 */
struct userfaultfd_fork_ctx {
        /* context found on the vma being duplicated */
        struct userfaultfd_ctx *orig;
        /* freshly allocated context that replaces @orig on that vma */
        struct userfaultfd_ctx *new;
        /* link in the caller-provided list of fork contexts */
        struct list_head list;
};

/*
 * One pending UFFD_EVENT_UNMAP notification, queued by
 * userfaultfd_unmap_prep() and delivered by userfaultfd_unmap_complete().
 */
struct userfaultfd_unmap_ctx {
        /* context to notify; a reference is held while the entry is queued */
        struct userfaultfd_ctx *ctx;
        /* range reported in the event message */
        unsigned long start;
        unsigned long end;
        /* link in the caller-provided list of pending unmaps */
        struct list_head list;
};

/*
 * A waiter queued on one of a context's wait queues, carrying the message
 * that describes the fault or event it is waiting on.
 */
struct userfaultfd_wait_queue {
        /* message describing the page fault or event */
        struct uffd_msg msg;
        /* entry linked on the context's wait queue */
        wait_queue_entry_t wq;
        /* context this waiter belongs to */
        struct userfaultfd_ctx *ctx;
        /* set to true by userfaultfd_wake_function() before waking the task */
        bool waken;
};

/*
 * Address range handed to userfaultfd_wake_function() as the wake key.
 * A @len of zero means "wake every waiter" regardless of address.
 */
struct userfaultfd_wake_range {
        unsigned long start;
        unsigned long len;
};

/*
 * Internal indication that the UFFD_API ioctl was successfully executed.
 * Kept in ctx->features and tested by userfaultfd_is_initialized().
 */
#define UFFD_FEATURE_INITIALIZED                (1u << 31)

/*
 * Report whether the UFFD_FEATURE_INITIALIZED bit is set in the context's
 * feature mask.
 */
static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
{
        return (ctx->features & UFFD_FEATURE_INITIALIZED) != 0;
}

/*
 * Report whether @ctx is non-NULL and has UFFD_FEATURE_WP_ASYNC set in its
 * feature mask.  A NULL @ctx yields false.
 */
static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
{
        if (!ctx)
                return false;

        return (ctx->features & UFFD_FEATURE_WP_ASYNC) != 0;
}

/*
 * Tell whether the uffd context attached to @vma has WP_UNPOPULATED enabled.
 * The answer is only meaningful for anonymous vmas on which
 * userfaultfd_wp() is true.  A vma without a uffd context yields false.
 */
bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
{
        struct userfaultfd_ctx *uffd = vma->vm_userfaultfd_ctx.ctx;

        return uffd && (uffd->features & UFFD_FEATURE_WP_UNPOPULATED);
}

/*
 * Replace the vma's flags with @flags and, for shared mappings, refresh
 * vma->vm_page_prot whenever the VM_UFFD_WP bit changed as a result.
 */
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
                                     vm_flags_t flags)
{
        /* Must be sampled before the flags are overwritten below. */
        const bool wp_toggled = ((vma->vm_flags ^ flags) & VM_UFFD_WP) != 0;

        vm_flags_reset(vma, flags);

        if (!wp_toggled)
                return;

        /*
         * Shared mappings want writenotify enabled while userfaultfd-wp is
         * active (see vma_wants_writenotify()), so simply recompute the
         * page protection each time userfaultfd-wp flips.
         */
        if (vma->vm_flags & VM_SHARED)
                vma_set_page_prot(vma);
}

/*
 * Wake callback used for userfault waiters.
 *
 * @key points to a struct userfaultfd_wake_range.  The waiter is woken when
 * the range length is zero (wake all) or when the fault address stored in
 * its message lies within [start, start + len).
 *
 * Returns the result of wake_up_state(), or 0 if the waiter's address is
 * outside the requested range.
 */
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
                                     int wake_flags, void *key)
{
        struct userfaultfd_wake_range *range = key;
        struct userfaultfd_wait_queue *uwq =
                container_of(wq, struct userfaultfd_wait_queue, wq);
        unsigned long start = range->start;
        unsigned long len = range->len;
        int woken;

        /* len == 0 means wake all */
        if (len) {
                if (start > uwq->msg.arg.pagefault.address)
                        return 0;
                if (start + len <= uwq->msg.arg.pagefault.address)
                        return 0;
        }

        WRITE_ONCE(uwq->waken, true);
        /*
         * The scheduler's Program-Order guarantees make uwq->waken visible
         * before the task is woken.
         */
        woken = wake_up_state(wq->private, mode);
        if (!woken)
                return woken;

        /*
         * Wake only once, autoremove behavior.
         *
         * Once the effect of list_del_init is visible to other CPUs, the
         * waitqueue entry may vanish from under us; see the
         * !list_empty_careful() check in handle_userfault().
         *
         * try_to_wake_up() implies an smp_mb(), and wq->private is read
         * before calling the extern function "wake_up_state" (which in
         * turn calls try_to_wake_up).
         */
        list_del_init(&wq->entry);

        return woken;
}

/**
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 *
 * Increments @ctx->refcount.  Each call must be balanced by a call to
 * userfaultfd_ctx_put().
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
        refcount_inc(&ctx->refcount);
}

/**
 * userfaultfd_ctx_put - Drops a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The reference being dropped must have been taken earlier with
 * userfaultfd_ctx_get() or userfaultfd_ctx_fdget().  When the last reference
 * goes away the mm reference held by the context is dropped and the context
 * is returned to its slab cache.
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
        if (!refcount_dec_and_test(&ctx->refcount))
                return;

        /* Last reference: no queue may be locked or still have waiters. */
        VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
        VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
        VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
        VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
        VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
        VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
        VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
        VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));

        mmdrop(ctx->mm);
        kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}

/*
 * Zero a uffd_msg before it is filled in.
 */
static inline void msg_init(struct uffd_msg *msg)
{
        BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);

        /*
         * A full memset is required: it also clears the padding, which
         * would otherwise leak kernel data to userland.
         */
        memset(msg, 0, sizeof(*msg));
}

/*
 * Build the UFFD_EVENT_PAGEFAULT message for a fault.
 *
 * @address:      fault address reported by default
 * @real_address: fault address reported when UFFD_FEATURE_EXACT_ADDRESS is
 *                set in @features
 * @flags:        fault flags (FAULT_FLAG_*)
 * @reason:       VM_UFFD_* reason for the userfault
 * @features:     feature mask of the userfaultfd context
 *
 * Returns the fully initialized message by value.
 */
static inline struct uffd_msg userfault_msg(unsigned long address,
                                            unsigned long real_address,
                                            unsigned int flags,
                                            unsigned long reason,
                                            unsigned int features)
{
        struct uffd_msg msg;
        unsigned long long pf_flags = 0;

        msg_init(&msg);
        msg.event = UFFD_EVENT_PAGEFAULT;

        if (features & UFFD_FEATURE_EXACT_ADDRESS)
                msg.arg.pagefault.address = real_address;
        else
                msg.arg.pagefault.address = address;

        /*
         * Why the userfault happened:
         * - UFFD_PAGEFAULT_FLAG_WP: write protect fault.
         * - UFFD_PAGEFAULT_FLAG_MINOR: minor fault.
         * - neither of the two: MISSING fault.
         *
         * Independently, UFFD_PAGEFAULT_FLAG_WRITE marks a write fault;
         * without it the fault was a read.
         */
        if (flags & FAULT_FLAG_WRITE)
                pf_flags |= UFFD_PAGEFAULT_FLAG_WRITE;
        if (reason & VM_UFFD_WP)
                pf_flags |= UFFD_PAGEFAULT_FLAG_WP;
        if (reason & VM_UFFD_MINOR)
                pf_flags |= UFFD_PAGEFAULT_FLAG_MINOR;
        msg.arg.pagefault.flags = pf_flags;

        if (features & UFFD_FEATURE_THREAD_ID)
                msg.arg.pagefault.feat.ptid = task_pid_vnr(current);

        return msg;
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * hugetlb counterpart of userfaultfd_must_wait(): decide whether the
 * faulting task still has to wait, looking at the huge pte that maps
 * vmf->address.
 *
 * Returns true when there is no page table entry, when the entry is none
 * (or a PTE marker), or when it is not writable and @reason includes
 * VM_UFFD_WP; false otherwise.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
                                              struct vm_fault *vmf,
                                              unsigned long reason)
{
        struct vm_area_struct *vma = vmf->vma;
        pte_t *ptep;
        pte_t entry;

        assert_fault_locked(vmf);

        ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
        if (!ptep)
                return true;

        /*
         * Lockless access: we're in a wait_event so it's fine if the entry
         * changes under us.  PTE markers are treated like none ptes here.
         */
        entry = huge_ptep_get(ptep);

        return huge_pte_none_mostly(entry) ||
               (!huge_pte_write(entry) && (reason & VM_UFFD_WP));
}
#else
/* Stub for !CONFIG_HUGETLB_PAGE builds; always reports "no need to wait". */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
                                              struct vm_fault *vmf,
                                              unsigned long reason)
{
        /* should never get here */
        return false;
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
 * userfault that has already been resolved, if userfaultfd_read_iter and
 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
 * threads.
 *
 * Walks the page tables of ctx->mm for vmf->address without taking page
 * table locks.  Returns true when the faulting task still has to wait
 * (nothing mapped at some level, a none/marker pte, or a non-writable
 * mapping while @reason includes VM_UFFD_WP) and false otherwise.
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
                                         struct vm_fault *vmf,
                                         unsigned long reason)
{
        struct mm_struct *mm = ctx->mm;
        unsigned long address = vmf->address;
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd, _pmd;
        pte_t *pte;
        pte_t ptent;
        bool ret = true;

        assert_fault_locked(vmf);

        /* A missing upper level means nothing is mapped: keep ret == true. */
        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                goto out;
        p4d = p4d_offset(pgd, address);
        if (!p4d_present(*p4d))
                goto out;
        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                goto out;
        pmd = pmd_offset(pud, address);
again:
        /* Work on a local snapshot of the pmd value. */
        _pmd = pmdp_get_lockless(pmd);
        if (pmd_none(_pmd))
                goto out;

        /* From here on the default answer is "no need to wait". */
        ret = false;
        if (!pmd_present(_pmd) || pmd_devmap(_pmd))
                goto out;

        if (pmd_trans_huge(_pmd)) {
                if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
                        ret = true;
                goto out;
        }

        pte = pte_offset_map(pmd, address);
        if (!pte) {
                /* Could not map the pte: re-read the pmd and start over. */
                ret = true;
                goto again;
        }
        /*
         * Lockless access: we're in a wait_event so it's ok if it
         * changes under us.  PTE markers should be handled the same as none
         * ptes here.
         */
        ptent = ptep_get(pte);
        if (pte_none_mostly(ptent))
                ret = true;
        if (!pte_write(ptent) && (reason & VM_UFFD_WP))
                ret = true;
        pte_unmap(pte);

out:
        return ret;
}

/*
 * Map fault flags to the task state used while waiting for the userfault:
 * FAULT_FLAG_INTERRUPTIBLE takes precedence over FAULT_FLAG_KILLABLE, and
 * without either the wait is uninterruptible.
 */
static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
{
        unsigned int state = TASK_UNINTERRUPTIBLE;

        if (flags & FAULT_FLAG_INTERRUPTIBLE)
                state = TASK_INTERRUPTIBLE;
        else if (flags & FAULT_FLAG_KILLABLE)
                state = TASK_KILLABLE;

        return state;
}

/*
 * Block the faulting task on the userfaultfd context attached to vmf->vma
 * until the fault is resolved, and tell the caller how to proceed.
 *
 * @vmf:    the fault being handled
 * @reason: exactly one VM_UFFD_* flag describing why the userfault fired
 *
 * Returns VM_FAULT_SIGBUS when no userfault handling is done (task exiting
 * or dumping core, no context on the vma, UFFD_FEATURE_SIGBUS, kernel-mode
 * fault on a UFFD_USER_MODE_ONLY context, or FAULT_FLAG_ALLOW_RETRY not
 * set), VM_FAULT_NOPAGE when the context was already released, and
 * VM_FAULT_RETRY in all other cases.
 *
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_lock must be released before
 * returning it.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        struct userfaultfd_ctx *ctx;
        struct userfaultfd_wait_queue uwq;
        vm_fault_t ret = VM_FAULT_SIGBUS;
        bool must_wait;
        unsigned int blocking_state;

        /*
         * We don't do userfault handling for the final child pid update.
         *
         * We also don't do userfault handling during
         * coredumping. hugetlbfs has the special
         * hugetlb_follow_page_mask() to skip missing pages in the
         * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
         * the no_page_table() helper in follow_page_mask(), but the
         * shmem_vm_ops->fault method is invoked even during
         * coredumping and it ends up here.
         */
        if (current->flags & (PF_EXITING|PF_DUMPCORE))
                goto out;

        assert_fault_locked(vmf);

        ctx = vma->vm_userfaultfd_ctx.ctx;
        if (!ctx)
                goto out;

        BUG_ON(ctx->mm != mm);

        /* Any unrecognized flag is a bug. */
        VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
        /* 0 or > 1 flags set is a bug; we expect exactly 1. */
        VM_BUG_ON(!reason || (reason & (reason - 1)));

        /* With UFFD_FEATURE_SIGBUS the fault is never queued: SIGBUS. */
        if (ctx->features & UFFD_FEATURE_SIGBUS)
                goto out;
        /* Kernel-mode faults are not handled by user-mode-only contexts. */
        if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
                goto out;

        /*
         * If it's already released don't get it. This avoids to loop
         * in __get_user_pages if userfaultfd_release waits on the
         * caller of handle_userfault to release the mmap_lock.
         */
        if (unlikely(READ_ONCE(ctx->released))) {
                /*
                 * Don't return VM_FAULT_SIGBUS in this case, so a non
                 * cooperative manager can close the uffd after the
                 * last UFFDIO_COPY, without risking to trigger an
                 * involuntary SIGBUS if the process was starting the
                 * userfaultfd while the userfaultfd was still armed
                 * (but after the last UFFDIO_COPY). If the uffd
                 * wasn't already closed when the userfault reached
                 * this point, that would normally be solved by
                 * userfaultfd_must_wait returning 'false'.
                 *
                 * If we were to return VM_FAULT_SIGBUS here, the non
                 * cooperative manager would be instead forced to
                 * always call UFFDIO_UNREGISTER before it can safely
                 * close the uffd.
                 */
                ret = VM_FAULT_NOPAGE;
                goto out;
        }

        /*
         * Check that we can return VM_FAULT_RETRY.
         *
         * NOTE: it should become possible to return VM_FAULT_RETRY
         * even if FAULT_FLAG_TRIED is set without leading to gup()
         * -EBUSY failures, if the userfaultfd is to be extended for
         * VM_UFFD_WP tracking and we intend to arm the userfault
         * without first stopping userland access to the memory. For
         * VM_UFFD_MISSING userfaults this is enough for now.
         */
        if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
                /*
                 * Validate the invariant that nowait must allow retry
                 * to be sure not to return SIGBUS erroneously on
                 * nowait invocations.
                 */
                BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
                if (printk_ratelimit()) {
                        printk(KERN_WARNING
                               "FAULT_FLAG_ALLOW_RETRY missing %x\n",
                               vmf->flags);
                        dump_stack();
                }
#endif
                goto out;
        }

        /*
         * Handle nowait, not much to do other than tell it to retry
         * and wait.
         */
        ret = VM_FAULT_RETRY;
        if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
                goto out;

        /* take the reference before dropping the mmap_lock */
        userfaultfd_ctx_get(ctx);

        /* Set up the on-stack waiter describing this fault. */
        init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
        uwq.wq.private = current;
        uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
                                reason, ctx->features);
        uwq.ctx = ctx;
        uwq.waken = false;

        blocking_state = userfaultfd_get_blocking_state(vmf->flags);

        /*
         * Take the vma lock now, in order to safely call
         * userfaultfd_huge_must_wait() later. Since acquiring the
         * (sleepable) vma lock can modify the current task state, that
         * must be before explicitly calling set_current_state().
         */
        if (is_vm_hugetlb_page(vma))
                hugetlb_vma_lock_read(vma);

        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        /*
         * After the __add_wait_queue the uwq is visible to userland
         * through poll/read().
         */
        __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
        /*
         * The smp_mb() after __set_current_state prevents the reads
         * following the spin_unlock to happen before the list_add in
         * __add_wait_queue.
         */
        set_current_state(blocking_state);
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);

        /* Re-check the page tables now that the waiter is queued. */
        if (!is_vm_hugetlb_page(vma))
                must_wait = userfaultfd_must_wait(ctx, vmf, reason);
        else
                must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
        if (is_vm_hugetlb_page(vma))
                hugetlb_vma_unlock_read(vma);
        release_fault_lock(vmf);

        /* Notify pollers of the fd and sleep until woken. */
        if (likely(must_wait && !READ_ONCE(ctx->released))) {
                wake_up_poll(&ctx->fd_wqh, EPOLLIN);
                schedule();
        }

        __set_current_state(TASK_RUNNING);

        /*
         * Here we race with the list_del; list_add in
         * userfaultfd_ctx_read(), however because we don't ever run
         * list_del_init() to refile across the two lists, the prev
         * and next pointers will never point to self. list_add also
         * would never let any of the two pointers to point to
         * self. So list_empty_careful won't risk to see both pointers
         * pointing to self at any time during the list refile. The
         * only case where list_del_init() is called is the full
         * removal in the wake function and there we don't re-list_add
         * and it's fine not to block on the spinlock. The uwq on this
         * kernel stack can be released after the list_del_init.
         */
        if (!list_empty_careful(&uwq.wq.entry)) {
                spin_lock_irq(&ctx->fault_pending_wqh.lock);
                /*
                 * No need of list_del_init(), the uwq on the stack
                 * will be freed shortly anyway.
                 */
                list_del(&uwq.wq.entry);
                spin_unlock_irq(&ctx->fault_pending_wqh.lock);
        }

        /*
         * ctx may go away after this if the userfault pseudo fd is
         * already released.
         */
        userfaultfd_ctx_put(ctx);

out:
        return ret;
}

/*
 * Queue the event described by @ewq on @ctx->event_wqh and sleep (killable)
 * until it has been consumed, i.e. until ewq->msg.event has been reset to 0
 * (see userfaultfd_event_complete()).
 *
 * The wait is abandoned when the context has been released or a fatal
 * signal is pending; in that case the waiter dequeues itself and, for a
 * UFFD_EVENT_FORK event, detaches the new context carried in the message
 * from every vma that points to it and drops that context.
 *
 * On every path the function decrements @ctx->mmap_changing and drops one
 * reference on @ctx, so callers must have raised both beforehand.
 */
static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
                                              struct userfaultfd_wait_queue *ewq)
{
        struct userfaultfd_ctx *release_new_ctx;

        if (WARN_ON_ONCE(current->flags & PF_EXITING))
                goto out;

        ewq->ctx = ctx;
        init_waitqueue_entry(&ewq->wq, current);
        release_new_ctx = NULL;

        spin_lock_irq(&ctx->event_wqh.lock);
        /*
         * After the __add_wait_queue the uwq is visible to userland
         * through poll/read().
         */
        __add_wait_queue(&ctx->event_wqh, &ewq->wq);
        for (;;) {
                set_current_state(TASK_KILLABLE);
                /* The event was consumed: done. */
                if (ewq->msg.event == 0)
                        break;
                if (READ_ONCE(ctx->released) ||
                    fatal_signal_pending(current)) {
                        /*
                         * &ewq->wq may be queued in fork_event, but
                         * __remove_wait_queue ignores the head
                         * parameter. It would be a problem if it
                         * didn't.
                         */
                        __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
                        if (ewq->msg.event == UFFD_EVENT_FORK) {
                                struct userfaultfd_ctx *new;

                                /* dup_fctx() stored the new ctx pointer here */
                                new = (struct userfaultfd_ctx *)
                                        (unsigned long)
                                        ewq->msg.arg.reserved.reserved1;
                                release_new_ctx = new;
                        }
                        break;
                }

                /* Drop the lock while sleeping; retake it before re-checking. */
                spin_unlock_irq(&ctx->event_wqh.lock);

                wake_up_poll(&ctx->fd_wqh, EPOLLIN);
                schedule();

                spin_lock_irq(&ctx->event_wqh.lock);
        }
        __set_current_state(TASK_RUNNING);
        spin_unlock_irq(&ctx->event_wqh.lock);

        if (release_new_ctx) {
                struct vm_area_struct *vma;
                struct mm_struct *mm = release_new_ctx->mm;
                VMA_ITERATOR(vmi, mm, 0);

                /* the various vma->vm_userfaultfd_ctx still points to it */
                mmap_write_lock(mm);
                for_each_vma(vmi, vma) {
                        if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
                                vma_start_write(vma);
                                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                                userfaultfd_set_vm_flags(vma,
                                                         vma->vm_flags & ~__VM_UFFD_FLAGS);
                        }
                }
                mmap_write_unlock(mm);

                userfaultfd_ctx_put(release_new_ctx);
        }

        /*
         * ctx may go away after this if the userfault pseudo fd is
         * already released.
         */
out:
        atomic_dec(&ctx->mmap_changing);
        VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
        userfaultfd_ctx_put(ctx);
}

/*
 * Mark the event carried by @ewq as consumed: clear msg.event (the condition
 * userfaultfd_event_wait_completion() sleeps on), wake the waiters on
 * @ctx->event_wqh and unlink @ewq from that queue.
 *
 * Uses the _locked/__ wait-queue variants, so presumably the caller holds
 * ctx->event_wqh.lock -- callers are outside this section, confirm there.
 */
static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
                                       struct userfaultfd_wait_queue *ewq)
{
        ewq->msg.event = 0;
        wake_up_locked(&ctx->event_wqh);
        __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
}

int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
{
        struct userfaultfd_ctx *ctx = NULL, *octx;
        struct userfaultfd_fork_ctx *fctx;

        octx = vma->vm_userfaultfd_ctx.ctx;
        if (!octx)
                return 0;

        if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
                vma_start_write(vma);
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
                return 0;
        }

        list_for_each_entry(fctx, fcs, list)
                if (fctx->orig == octx) {
                        ctx = fctx->new;
                        break;
                }

        if (!ctx) {
                fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
                if (!fctx)
                        return -ENOMEM;

                ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
                if (!ctx) {
                        kfree(fctx);
                        return -ENOMEM;
                }

                refcount_set(&ctx->refcount, 1);
                ctx->flags = octx->flags;
                ctx->features = octx->features;
                ctx->released = false;
                init_rwsem(&ctx->map_changing_lock);
                atomic_set(&ctx->mmap_changing, 0);
                ctx->mm = vma->vm_mm;
                mmgrab(ctx->mm);

                userfaultfd_ctx_get(octx);
                down_write(&octx->map_changing_lock);
                atomic_inc(&octx->mmap_changing);
                up_write(&octx->map_changing_lock);
                fctx->orig = octx;
                fctx->new = ctx;
                list_add_tail(&fctx->list, fcs);
        }

        vma->vm_userfaultfd_ctx.ctx = ctx;
        return 0;
}

static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
{
        struct userfaultfd_ctx *ctx = fctx->orig;
        struct userfaultfd_wait_queue ewq;

        msg_init(&ewq.msg);

        ewq.msg.event = UFFD_EVENT_FORK;
        ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;

        userfaultfd_event_wait_completion(ctx, &ewq);
}

/*
 * Deliver the fork event for every entry collected in @fcs by
 * dup_userfaultfd(), then unlink and free each entry.
 */
void dup_userfaultfd_complete(struct list_head *fcs)
{
        struct userfaultfd_fork_ctx *cur;
        struct userfaultfd_fork_ctx *next;

        list_for_each_entry_safe(cur, next, fcs, list) {
                dup_fctx(cur);

                list_del(&cur->list);
                kfree(cur);
        }
}

/*
 * Prepare userfaultfd state for a vma that is being remapped.
 *
 * If the vma's context requested UFFD_FEATURE_EVENT_REMAP, the context is
 * handed back through @vm_ctx with a reference held and its mmap_changing
 * counter raised.  Otherwise the vma is detached from userfaultfd.  A vma
 * without a context is left untouched.
 */
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
                             struct vm_userfaultfd_ctx *vm_ctx)
{
        struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

        if (!ctx)
                return;

        if (!(ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
                /* Drop uffd context if remap feature not enabled */
                vma_start_write(vma);
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
                return;
        }

        vm_ctx->ctx = ctx;
        userfaultfd_ctx_get(ctx);
        down_write(&ctx->map_changing_lock);
        atomic_inc(&ctx->mmap_changing);
        up_write(&ctx->map_changing_lock);
}

void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
                                 unsigned long from, unsigned long to,
                                 unsigned long len)
{
        struct userfaultfd_ctx *ctx = vm_ctx->ctx;
        struct userfaultfd_wait_queue ewq;

        if (!ctx)
                return;

        if (to & ~PAGE_MASK) {
                userfaultfd_ctx_put(ctx);
                return;
        }

        msg_init(&ewq.msg);

        ewq.msg.event = UFFD_EVENT_REMAP;
        ewq.msg.arg.remap.from = from;
        ewq.msg.arg.remap.to = to;
        ewq.msg.arg.remap.len = len;

        userfaultfd_event_wait_completion(ctx, &ewq);
}

bool userfaultfd_remove(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        struct userfaultfd_ctx *ctx;
        struct userfaultfd_wait_queue ewq;

        ctx = vma->vm_userfaultfd_ctx.ctx;
        if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
                return true;

        userfaultfd_ctx_get(ctx);
        down_write(&ctx->map_changing_lock);
        atomic_inc(&ctx->mmap_changing);
        up_write(&ctx->map_changing_lock);
        mmap_read_unlock(mm);

        msg_init(&ewq.msg);

        ewq.msg.event = UFFD_EVENT_REMOVE;
        ewq.msg.arg.remove.start = start;
        ewq.msg.arg.remove.end = end;

        userfaultfd_event_wait_completion(ctx, &ewq);

        return false;
}

static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
                          unsigned long start, unsigned long end)
{
        struct userfaultfd_unmap_ctx *unmap_ctx;

        list_for_each_entry(unmap_ctx, unmaps, list)
                if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
                    unmap_ctx->end == end)
                        return true;

        return false;
}

/*
 * Record a pending unmap notification for the userfaultfd context of @vma.
 *
 * Nothing is recorded when the vma has no context, the context did not
 * request UFFD_FEATURE_EVENT_UNMAP, or an identical entry is already on
 * @unmaps.  Otherwise a new entry is appended to @unmaps, with a reference
 * taken on the context and its mmap_changing counter raised.
 *
 * Returns 0 on success or -ENOMEM if the entry cannot be allocated.
 */
int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
                           unsigned long end, struct list_head *unmaps)
{
        struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
        struct userfaultfd_unmap_ctx *entry;

        if (!ctx)
                return 0;
        if (!(ctx->features & UFFD_FEATURE_EVENT_UNMAP))
                return 0;
        if (has_unmap_ctx(ctx, unmaps, start, end))
                return 0;

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        userfaultfd_ctx_get(ctx);
        down_write(&ctx->map_changing_lock);
        atomic_inc(&ctx->mmap_changing);
        up_write(&ctx->map_changing_lock);

        entry->ctx = ctx;
        entry->start = start;
        entry->end = end;
        list_add_tail(&entry->list, unmaps);

        return 0;
}

/*
 * Deliver a UFFD_EVENT_UNMAP message for every entry queued on @uf by
 * userfaultfd_unmap_prep(), waiting for each one to complete, then unlink
 * and free the entry.
 */
void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
{
        struct userfaultfd_unmap_ctx *unmap;
        struct userfaultfd_unmap_ctx *next;
        struct userfaultfd_wait_queue ewq;

        list_for_each_entry_safe(unmap, next, uf, list) {
                msg_init(&ewq.msg);

                /* The unmap range travels in the "remove" member of the message. */
                ewq.msg.event = UFFD_EVENT_UNMAP;
                ewq.msg.arg.remove.start = unmap->start;
                ewq.msg.arg.remove.end = unmap->end;

                userfaultfd_event_wait_completion(unmap->ctx, &ewq);

                list_del(&unmap->list);
                kfree(unmap);
        }
}

/*
 * Release handler for the userfaultfd file.
 *
 * Marks the context as released, detaches it from every vma of its mm that
 * still points to it (undoing write protection where it was enabled), wakes
 * all pending fault and event waiters, signals EPOLLHUP to pollers and drops
 * the file's reference on the context.  Always returns 0.
 */
static int userfaultfd_release(struct inode *inode, struct file *file)
{
        struct userfaultfd_ctx *ctx = file->private_data;
        struct mm_struct *mm = ctx->mm;
        struct vm_area_struct *vma, *prev;
        /* len == 0 means wake all */
        struct userfaultfd_wake_range range = { .len = 0, };
        unsigned long new_flags;
        VMA_ITERATOR(vmi, mm, 0);

        WRITE_ONCE(ctx->released, true);

        /* Skip the vma walk if no mm user reference can be taken. */
        if (!mmget_not_zero(mm))
                goto wakeup;

        /*
         * Flush page faults out of all CPUs. NOTE: all page faults
         * must be retried without returning VM_FAULT_SIGBUS if
         * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
         * changes while handle_userfault released the mmap_lock. So
         * it's critical that released is set to true (above), before
         * taking the mmap_lock for writing.
         */
        mmap_write_lock(mm);
        prev = NULL;
        for_each_vma(vmi, vma) {
                cond_resched();
                /* A vma has a uffd ctx if and only if it has uffd flags. */
                BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
                       !!(vma->vm_flags & __VM_UFFD_FLAGS));
                if (vma->vm_userfaultfd_ctx.ctx != ctx) {
                        prev = vma;
                        continue;
                }
                /* Reset ptes for the whole vma range if wr-protected */
                if (userfaultfd_wp(vma))
                        uffd_wp_range(vma, vma->vm_start,
                                      vma->vm_end - vma->vm_start, false);
                new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
                /*
                 * NOTE(review): the returned vma is used below without an
                 * error check -- confirm vma_modify_flags_uffd() cannot
                 * fail when asked to modify a whole vma.
                 */
                vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
                                            vma->vm_end, new_flags,
                                            NULL_VM_UFFD_CTX);

                vma_start_write(vma);
                userfaultfd_set_vm_flags(vma, new_flags);
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

                prev = vma;
        }
        mmap_write_unlock(mm);
        mmput(mm);
wakeup:
        /*
         * After no new page faults can wait on this fault_*wqh, flush
         * the last page faults that may have been already waiting on
         * the fault_*wqh.
         */
        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
        __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);

        /* Flush pending events that may still wait on event_wqh */
        wake_up_all(&ctx->event_wqh);

        wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
        userfaultfd_ctx_put(ctx);
        return 0;
}

/*
 * Return the oldest waiter queued on @wqh, or NULL if the queue is empty.
 *
 * The lock of @wqh must be held by the caller.
 */
static inline struct userfaultfd_wait_queue *find_userfault_in(
                wait_queue_head_t *wqh)
{
        wait_queue_entry_t *last;

        lockdep_assert_held(&wqh->lock);

        if (!waitqueue_active(wqh))
                return NULL;

        /* walk in reverse to provide FIFO behavior to read userfaults */
        last = list_last_entry(&wqh->head, typeof(*last), entry);

        return container_of(last, struct userfaultfd_wait_queue, wq);
}

static inline struct userfaultfd_wait_queue *find_userfault(
                struct userfaultfd_ctx *ctx)
{
        return find_userfault_in(&ctx->fault_pending_wqh);
}

static inline struct userfaultfd_wait_queue *find_userfault_evt(
                struct userfaultfd_ctx *ctx)
{
        return find_userfault_in(&ctx->event_wqh);
}

/*
 * poll() handler for the userfaultfd file.
 *
 * Reports EPOLLERR if the context is not initialized yet or if the file
 * was not opened O_NONBLOCK, EPOLLIN if a fault or an event is queued,
 * and 0 otherwise.
 */
static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
{
        struct userfaultfd_ctx *ctx = file->private_data;

        poll_wait(file, &ctx->fd_wqh, wait);

        if (!userfaultfd_is_initialized(ctx))
                return EPOLLERR;

        /*
         * poll() can never promise that a following read() won't
         * block, because userfaults may be woken before they are
         * read().  Refuse to poll a blocking file descriptor.
         */
        if (unlikely(!(file->f_flags & O_NONBLOCK)))
                return EPOLLERR;

        /*
         * The waitqueues are inspected without taking their locks.
         * The last thing __pollwait does is add_wait_queue(), but its
         * spin_unlock would still let the waitqueue_active() reads
         * below move ahead of the list_add done inside the
         * add_wait_queue() critical section.  A full memory barrier
         * orders that list_add write against the reads below.
         */
        smp_mb();
        if (waitqueue_active(&ctx->fault_pending_wqh) ||
            waitqueue_active(&ctx->event_wqh))
                return EPOLLIN;

        return 0;
}

static const struct file_operations userfaultfd_fops;

/*
 * Create a file descriptor for the userfaultfd context @new and store it
 * in the fork message @msg.
 *
 * Returns 0 on success, or the negative value returned by
 * anon_inode_create_getfd() on failure, in which case @msg is left
 * untouched.
 */
static int resolve_userfault_fork(struct userfaultfd_ctx *new,
                                  struct inode *inode,
                                  struct uffd_msg *msg)
{
        int ufd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops,
                        new,
                        O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS),
                        inode);

        if (ufd >= 0) {
                /* clear the reserved slot first, then publish the new fd */
                msg->arg.reserved.reserved1 = 0;
                msg->arg.fork.ufd = ufd;
                ufd = 0;
        }

        return ufd;
}

/*
 * Fetch a single message for a reader of the userfaultfd @ctx.
 *
 * Pending page faults (fault_pending_wqh) are looked up first, then
 * queued events (event_wqh).  If neither queue has an entry the caller
 * sleeps in TASK_INTERRUPTIBLE on fd_wqh and retries, unless a signal is
 * pending or @no_wait is set.
 *
 * Returns 0 with *@msg filled in, -ERESTARTSYS if a signal is pending,
 * -EAGAIN if nothing is queued and @no_wait is set, or the error
 * returned by resolve_userfault_fork() when the message read is a
 * UFFD_EVENT_FORK.
 */
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
                                    struct uffd_msg *msg, struct inode *inode)
{
        ssize_t ret;
        DECLARE_WAITQUEUE(wait, current);
        struct userfaultfd_wait_queue *uwq;
        /*
         * Handling fork event requires sleeping operations, so
         * we drop the event_wqh lock, then do these ops, then
         * lock it back and wake up the waiter. While the lock is
         * dropped the ewq may go away so we keep track of it
         * carefully.
         */
        LIST_HEAD(fork_event);
        struct userfaultfd_ctx *fork_nctx = NULL;

        /* always take the fd_wqh lock before the fault_pending_wqh lock */
        spin_lock_irq(&ctx->fd_wqh.lock);
        __add_wait_queue(&ctx->fd_wqh, &wait);
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                /* First choice: a page fault that has not been read yet. */
                spin_lock(&ctx->fault_pending_wqh.lock);
                uwq = find_userfault(ctx);
                if (uwq) {
                        /*
                         * Use a seqcount to repeat the lockless check
                         * in wake_userfault() to avoid missing
                         * wakeups because during the refile both
                         * waitqueue could become empty if this is the
                         * only userfault.
                         */
                        write_seqcount_begin(&ctx->refile_seq);

                        /*
                         * The fault_pending_wqh.lock prevents the uwq
                         * from disappearing from under us.
                         *
                         * Refile this userfault from
                         * fault_pending_wqh to fault_wqh, it's not
                         * pending anymore after we read it.
                         *
                         * Use list_del() by hand (as
                         * userfaultfd_wake_function also uses
                         * list_del_init() by hand) to be sure nobody
                         * changes __remove_wait_queue() to use
                         * list_del_init() in turn breaking the
                         * !list_empty_careful() check in
                         * handle_userfault(). The uwq->wq.head list
                         * must never be empty at any time during the
                         * refile, or the waitqueue could disappear
                         * from under us. The "wait_queue_head_t"
                         * parameter of __remove_wait_queue() is unused
                         * anyway.
                         */
                        list_del(&uwq->wq.entry);
                        add_wait_queue(&ctx->fault_wqh, &uwq->wq);

                        write_seqcount_end(&ctx->refile_seq);

                        /* careful to always initialize msg if ret == 0 */
                        *msg = uwq->msg;
                        spin_unlock(&ctx->fault_pending_wqh.lock);
                        ret = 0;
                        break;
                }
                spin_unlock(&ctx->fault_pending_wqh.lock);

                /* No pending fault: look for a queued event instead. */
                spin_lock(&ctx->event_wqh.lock);
                uwq = find_userfault_evt(ctx);
                if (uwq) {
                        *msg = uwq->msg;

                        if (uwq->msg.event == UFFD_EVENT_FORK) {
                                /*
                                 * For a fork event reserved1 carries the
                                 * pointer to the new context.  Park the
                                 * event on the local fork_event list;
                                 * it is completed after the loop, once
                                 * the spinlocks have been dropped.
                                 */
                                fork_nctx = (struct userfaultfd_ctx *)
                                        (unsigned long)
                                        uwq->msg.arg.reserved.reserved1;
                                list_move(&uwq->wq.entry, &fork_event);
                                /*
                                 * fork_nctx can be freed as soon as
                                 * we drop the lock, unless we take a
                                 * reference on it.
                                 */
                                userfaultfd_ctx_get(fork_nctx);
                                spin_unlock(&ctx->event_wqh.lock);
                                ret = 0;
                                break;
                        }

                        userfaultfd_event_complete(ctx, uwq);
                        spin_unlock(&ctx->event_wqh.lock);
                        ret = 0;
                        break;
                }
                spin_unlock(&ctx->event_wqh.lock);

                /*
                 * Nothing to read.  A pending signal is reported in
                 * preference to -EAGAIN.
                 */
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }
                if (no_wait) {
                        ret = -EAGAIN;
                        break;
                }
                /* Sleep with fd_wqh.lock dropped, then retry from the top. */
                spin_unlock_irq(&ctx->fd_wqh.lock);
                schedule();
                spin_lock_irq(&ctx->fd_wqh.lock);
        }
        __remove_wait_queue(&ctx->fd_wqh, &wait);
        __set_current_state(TASK_RUNNING);
        spin_unlock_irq(&ctx->fd_wqh.lock);

        if (!ret && msg->event == UFFD_EVENT_FORK) {
                /*
                 * Create the fd for the child context; this is done
                 * here, after the spinlocks taken above were released.
                 */
                ret = resolve_userfault_fork(fork_nctx, inode, msg);
                spin_lock_irq(&ctx->event_wqh.lock);
                if (!list_empty(&fork_event)) {
                        /*
                         * The fork thread didn't abort, so we can
                         * drop the temporary refcount.
                         */
                        userfaultfd_ctx_put(fork_nctx);

                        uwq = list_first_entry(&fork_event,
                                               typeof(*uwq),
                                               wq.entry);
                        /*
                         * If fork_event list wasn't empty and in turn
                         * the event wasn't already released by fork
                         * (the event is allocated on fork kernel
                         * stack), put the event back to its place in
                         * the event_wq. fork_event head will be freed
                         * as soon as we return so the event cannot
                         * stay queued there no matter the current
                         * "ret" value.
                         */
                        list_del(&uwq->wq.entry);
                        __add_wait_queue(&ctx->event_wqh, &uwq->wq);

                        /*
                         * Leave the event in the waitqueue and report
                         * error to userland if we failed to resolve
                         * the userfault fork.
                         */
                        if (likely(!ret))
                                userfaultfd_event_complete(ctx, uwq);
                } else {
                        /*
                         * Here the fork thread aborted and the
                         * refcount from the fork thread on fork_nctx
                         * has already been released. We still hold
                         * the reference we took before releasing the
                         * lock above. If resolve_userfault_fork
                         * failed we've to drop it because the
                         * fork_nctx has to be freed in such case. If
                         * it succeeded we'll hold it because the new
                         * uffd references it.
                         */
                        if (ret)
                                userfaultfd_ctx_put(fork_nctx);
                }
                spin_unlock_irq(&ctx->event_wqh.lock);
        }

        return ret;
}

/*
 * read_iter handler: copy as many struct uffd_msg records as fit into
 * @to.
 *
 * Only the first record may block (and only when neither O_NONBLOCK nor
 * IOCB_NOWAIT is set); later records are fetched without waiting.
 *
 * Returns the number of bytes copied if at least one record was
 * delivered.  Otherwise returns -EINVAL (context not initialized or
 * buffer smaller than one record), -EFAULT (copy to @to failed) or the
 * error from userfaultfd_ctx_read().
 */
static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct userfaultfd_ctx *ctx = file->private_data;
        struct inode *inode = file_inode(file);
        struct uffd_msg msg;
        ssize_t copied = 0;
        /* reported when the buffer cannot hold even the first record */
        ssize_t err = -EINVAL;
        bool no_wait;

        if (!userfaultfd_is_initialized(ctx))
                return -EINVAL;

        no_wait = (file->f_flags & O_NONBLOCK) ||
                  (iocb->ki_flags & IOCB_NOWAIT);

        while (iov_iter_count(to) >= sizeof(msg)) {
                err = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
                if (err < 0)
                        break;

                if (!copy_to_iter_full(&msg, sizeof(msg), to)) {
                        err = -EFAULT;
                        break;
                }
                copied += sizeof(msg);

                /*
                 * More than one fault may be read at a time, but only
                 * the very first one is allowed to block.
                 */
                no_wait = true;
        }

        /* a partial success hides whatever stopped the loop */
        return copied ? copied : err;
}

static void __wake_userfault(struct userfaultfd_ctx *ctx,
                             struct userfaultfd_wake_range *range)
{
        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        /* wake all in the range and autoremove */
        if (waitqueue_active(&ctx->fault_pending_wqh))
                __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
                                     range);
        if (waitqueue_active(&ctx->fault_wqh))
                __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);
}

/*
 * Wake the userfaults of @ctx using @range as the wake key, but only
 * take the waitqueue lock (via __wake_userfault()) if a lockless check
 * finds waiters on fault_pending_wqh or fault_wqh.
 */
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
                                           struct userfaultfd_wake_range *range)
{
        unsigned seq;
        bool need_wakeup;

        /*
         * To be sure waitqueue_active() is not reordered by the CPU
         * before the pagetable update, use an explicit SMP memory
         * barrier here. PT lock release or mmap_read_unlock(mm) still
         * have release semantics that can allow the
         * waitqueue_active() to be reordered before the pte update.
         */
        smp_mb();

        /*
         * Use waitqueue_active because it's very frequent to
         * change the address space atomically even if there are no
         * userfaults yet. So we take the spinlock only when we're
         * sure we've userfaults to wake.
         *
         * The check is repeated whenever refile_seq changed while we
         * were looking: userfaultfd_ctx_read() bumps it around moving
         * an entry from fault_pending_wqh to fault_wqh, a window in
         * which both queues could look empty.
         */
        do {
                seq = read_seqcount_begin(&ctx->refile_seq);
                need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
                        waitqueue_active(&ctx->fault_wqh);
                cond_resched();
        } while (read_seqcount_retry(&ctx->refile_seq, seq));
        if (need_wakeup)
                __wake_userfault(ctx, range);
}

/*
 * Check that [@start, @start + @len) is a usable userland range for @mm.
 * @len must be a non-zero multiple of the page size; @start itself is
 * not required to be page aligned.
 *
 * Returns 0 if the range is acceptable, -EINVAL otherwise.
 */
static __always_inline int validate_unaligned_range(
        struct mm_struct *mm, __u64 start, __u64 len)
{
        const __u64 limit = mm->task_size;

        /* the length must be a whole, non-zero number of pages */
        if (!len || (len & ~PAGE_MASK))
                return -EINVAL;

        /* the first byte must lie in [mmap_min_addr, task_size) */
        if (start < mmap_min_addr || start >= limit)
                return -EINVAL;

        /* the range must not run past task_size */
        if (len > limit - start)
                return -EINVAL;

        /* the end address must not wrap around */
        if (start + len <= start)
                return -EINVAL;

        return 0;
}

/*
 * Same as validate_unaligned_range(), but additionally require @start
 * to be page aligned.  Returns 0 or -EINVAL.
 */
static __always_inline int validate_range(struct mm_struct *mm,
                                          __u64 start, __u64 len)
{
        const bool misaligned = start & ~PAGE_MASK;

        return misaligned ? -EINVAL : validate_unaligned_range(mm, start, len);
}

/*
 * UFFDIO_REGISTER handler: register the address range described by the
 * struct uffdio_register at user address @arg with @ctx.
 *
 * The requested mode bits are translated into VM_UFFD_* flags. With the
 * mmap_lock held for writing, a first pass checks every vma in the range
 * for compatibility and only then a second pass updates the vmas, so a
 * validation failure leaves all vmas untouched. On success the bitmask
 * of ioctls available on the range is written to the user's ->ioctls
 * field.
 *
 * Returns 0 on success or a negative errno: -EFAULT (user copy failed),
 * -EINVAL (bad mode, bad range or incompatible vma), -ENOMEM
 * (mmget_not_zero() failed), -EPERM (vma lacks VM_MAYWRITE), -EBUSY (vma
 * owned by another userfaultfd) or the error returned by
 * vma_modify_flags_uffd().
 */
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                                unsigned long arg)
{
        struct mm_struct *mm = ctx->mm;
        struct vm_area_struct *vma, *prev, *cur;
        int ret;
        struct uffdio_register uffdio_register;
        struct uffdio_register __user *user_uffdio_register;
        unsigned long vm_flags, new_flags;
        bool found;
        bool basic_ioctls;
        unsigned long start, end, vma_end;
        struct vma_iterator vmi;
        bool wp_async = userfaultfd_wp_async_ctx(ctx);

        user_uffdio_register = (struct uffdio_register __user *) arg;

        ret = -EFAULT;
        /*
         * The trailing __u64 of the struct is not read from userland
         * (presumably the output-only "ioctls" field written at the
         * end of this function -- confirm against the uapi header).
         */
        if (copy_from_user(&uffdio_register, user_uffdio_register,
                           sizeof(uffdio_register)-sizeof(__u64)))
                goto out;

        ret = -EINVAL;
        if (!uffdio_register.mode)
                goto out;
        if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
                goto out;
        /* Translate the requested register modes into vma flags. */
        vm_flags = 0;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
                vm_flags |= VM_UFFD_MISSING;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
                /* without arch support the mode is rejected with -EINVAL */
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
                goto out;
#endif
                vm_flags |= VM_UFFD_WP;
        }
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
                /* without arch support the mode is rejected with -EINVAL */
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
                goto out;
#endif
                vm_flags |= VM_UFFD_MINOR;
        }

        ret = validate_range(mm, uffdio_register.range.start,
                             uffdio_register.range.len);
        if (ret)
                goto out;

        start = uffdio_register.range.start;
        end = start + uffdio_register.range.len;

        ret = -ENOMEM;
        if (!mmget_not_zero(mm))
                goto out;

        ret = -EINVAL;
        mmap_write_lock(mm);
        vma_iter_init(&vmi, mm, start);
        vma = vma_find(&vmi, end);
        /* no vma at all in the requested range: -EINVAL */
        if (!vma)
                goto out_unlock;

        /*
         * If the first vma contains huge pages, make sure start address
         * is aligned to huge page size.
         */
        if (is_vm_hugetlb_page(vma)) {
                unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

                if (start & (vma_hpagesize - 1))
                        goto out_unlock;
        }

        /*
         * Search for not compatible vmas.
         *
         * This first pass is read-only: any failure below bails out
         * before a single vma has been modified.
         */
        found = false;
        basic_ioctls = false;
        cur = vma;
        do {
                cond_resched();

                /* a vma has a uffd ctx if and only if it has uffd flags */
                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
                       !!(cur->vm_flags & __VM_UFFD_FLAGS));

                /* check not compatible vmas */
                ret = -EINVAL;
                if (!vma_can_userfault(cur, vm_flags, wp_async))
                        goto out_unlock;

                /*
                 * UFFDIO_COPY will fill file holes even without
                 * PROT_WRITE. This check enforces that if this is a
                 * MAP_SHARED, the process has write permission to the backing
                 * file. If VM_MAYWRITE is set it also enforces that on a
                 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
                 * F_WRITE_SEAL can be taken until the vma is destroyed.
                 */
                ret = -EPERM;
                if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
                        goto out_unlock;

                /*
                 * If this vma contains ending address, and huge pages
                 * check alignment.
                 */
                if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
                    end > cur->vm_start) {
                        unsigned long vma_hpagesize = vma_kernel_pagesize(cur);

                        ret = -EINVAL;

                        if (end & (vma_hpagesize - 1))
                                goto out_unlock;
                }
                /*
                 * NOTE(review): VM_MAYWRITE was already enforced
                 * unconditionally a few lines above, so this branch
                 * looks unreachable -- confirm before removing.
                 */
                if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
                        goto out_unlock;

                /*
                 * Check that this vma isn't already owned by a
                 * different userfaultfd. We can't allow more than one
                 * userfaultfd to own a single vma simultaneously or we
                 * wouldn't know which one to deliver the userfaults to.
                 */
                ret = -EBUSY;
                if (cur->vm_userfaultfd_ctx.ctx &&
                    cur->vm_userfaultfd_ctx.ctx != ctx)
                        goto out_unlock;

                /*
                 * Note vmas containing huge pages
                 */
                if (is_vm_hugetlb_page(cur))
                        basic_ioctls = true;

                found = true;
        } for_each_vma_range(vmi, cur, end);
        BUG_ON(!found);

        /*
         * Rewind the iterator for the second pass. prev is the vma
         * before start, or the first vma itself when start falls
         * inside it.
         */
        vma_iter_set(&vmi, start);
        prev = vma_prev(&vmi);
        if (vma->vm_start < start)
                prev = vma;

        /* Second pass: apply the new flags and ctx to every vma. */
        ret = 0;
        for_each_vma_range(vmi, vma, end) {
                cond_resched();

                /* all of this was already verified by the first pass */
                BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
                BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
                       vma->vm_userfaultfd_ctx.ctx != ctx);
                WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

                /*
                 * Nothing to do: this vma is already registered into this
                 * userfaultfd and with the right tracking mode too.
                 */
                if (vma->vm_userfaultfd_ctx.ctx == ctx &&
                    (vma->vm_flags & vm_flags) == vm_flags)
                        goto skip;

                /* clamp [start, vma_end) to the part of this vma in range */
                if (vma->vm_start > start)
                        start = vma->vm_start;
                vma_end = min(end, vma->vm_end);

                new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
                vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
                                            new_flags,
                                            (struct vm_userfaultfd_ctx){ctx});
                if (IS_ERR(vma)) {
                        ret = PTR_ERR(vma);
                        break;
                }

                /*
                 * In the vma_merge() successful mprotect-like case 8:
                 * the next vma was merged into the current one and
                 * the current one has not been updated yet.
                 */
                vma_start_write(vma);
                userfaultfd_set_vm_flags(vma, new_flags);
                vma->vm_userfaultfd_ctx.ctx = ctx;

                if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
                        hugetlb_unshare_all_pmds(vma);

        skip:
                prev = vma;
                start = vma->vm_end;
        }

out_unlock:
        mmap_write_unlock(mm);
        mmput(mm);
        if (!ret) {
                __u64 ioctls_out;

                /* ranges with hugetlb vmas only get the basic ioctl set */
                ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
                    UFFD_API_RANGE_IOCTLS;

                /*
                 * Declare the WP ioctl only if the WP mode is
                 * specified and all checks passed with the range
                 */
                if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
                        ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);

                /* CONTINUE ioctl is only supported for MINOR ranges. */
                if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
                        ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);

                /*
                 * Now that we scanned all vmas we can already tell
                 * userland which ioctls methods are guaranteed to
                 * succeed on this range.
                 */
                if (put_user(ioctls_out, &user_uffdio_register->ioctls))
                        ret = -EFAULT;
        }
out:
        return ret;
}

/*
 * UFFDIO_UNREGISTER handler: drop the userfaultfd registration from the
 * address range described by the struct uffdio_range at user address
 * @arg.
 *
 * With the mmap_lock held for writing, a first pass checks every vma in
 * the range and a second pass clears the uffd flags and ctx of each
 * registered vma, waking pending userfaults and resetting uffd-wp ptes
 * on the way.
 *
 * Returns 0 on success or a negative errno: -EFAULT (user copy failed),
 * -EINVAL (bad range or incompatible vma), -ENOMEM (mmget_not_zero()
 * failed) or the error returned by vma_modify_flags_uffd().
 */
static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                                  unsigned long arg)
{
        struct mm_struct *mm = ctx->mm;
        struct vm_area_struct *vma, *prev, *cur;
        int ret;
        struct uffdio_range uffdio_unregister;
        unsigned long new_flags;
        bool found;
        unsigned long start, end, vma_end;
        const void __user *buf = (void __user *)arg;
        struct vma_iterator vmi;
        bool wp_async = userfaultfd_wp_async_ctx(ctx);

        ret = -EFAULT;
        if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
                goto out;

        ret = validate_range(mm, uffdio_unregister.start,
                             uffdio_unregister.len);
        if (ret)
                goto out;

        start = uffdio_unregister.start;
        end = start + uffdio_unregister.len;

        ret = -ENOMEM;
        if (!mmget_not_zero(mm))
                goto out;

        mmap_write_lock(mm);
        ret = -EINVAL;
        vma_iter_init(&vmi, mm, start);
        vma = vma_find(&vmi, end);
        /* no vma at all in the requested range: -EINVAL */
        if (!vma)
                goto out_unlock;

        /*
         * If the first vma contains huge pages, make sure start address
         * is aligned to huge page size.
         */
        if (is_vm_hugetlb_page(vma)) {
                unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

                if (start & (vma_hpagesize - 1))
                        goto out_unlock;
        }

        /*
         * Search for not compatible vmas.
         *
         * NOTE(review): unlike userfaultfd_register(), neither pass
         * compares the vma's vm_userfaultfd_ctx.ctx against @ctx, so
         * vmas registered with another userfaultfd appear to be
         * unregistered as well -- confirm this is intended.
         */
        found = false;
        cur = vma;
        do {
                cond_resched();

                /* a vma has a uffd ctx if and only if it has uffd flags */
                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
                       !!(cur->vm_flags & __VM_UFFD_FLAGS));

                /*
                 * Check not compatible vmas, not strictly required
                 * here as not compatible vmas cannot have an
                 * userfaultfd_ctx registered on them, but this
                 * provides for more strict behavior to notice
                 * unregistration errors.
                 */
                if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
                        goto out_unlock;

                found = true;
        } for_each_vma_range(vmi, cur, end);
        BUG_ON(!found);

        /*
         * Rewind the iterator for the second pass. prev is the vma
         * before start, or the first vma itself when start falls
         * inside it.
         */
        vma_iter_set(&vmi, start);
        prev = vma_prev(&vmi);
        if (vma->vm_start < start)
                prev = vma;

        ret = 0;
        for_each_vma_range(vmi, vma, end) {
                cond_resched();

                BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));

                /*
                 * Nothing to do: this vma is not registered into any
                 * userfaultfd.
                 */
                if (!vma->vm_userfaultfd_ctx.ctx)
                        goto skip;

                WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

                /* clamp [start, vma_end) to the part of this vma in range */
                if (vma->vm_start > start)
                        start = vma->vm_start;
                vma_end = min(end, vma->vm_end);

                if (userfaultfd_missing(vma)) {
                        /*
                         * Wake any concurrent pending userfault while
                         * we unregister, so they will not hang
                         * permanently and it avoids userland to call
                         * UFFDIO_WAKE explicitly.
                         */
                        struct userfaultfd_wake_range range;
                        range.start = start;
                        range.len = vma_end - start;
                        wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
                }

                /* Reset ptes for the unregistered part of the vma if wr-protected */
                if (userfaultfd_wp(vma))
                        uffd_wp_range(vma, start, vma_end - start, false);

                new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
                vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
                                            new_flags, NULL_VM_UFFD_CTX);
                if (IS_ERR(vma)) {
                        ret = PTR_ERR(vma);
                        break;
                }

                /*
                 * In the vma_merge() successful mprotect-like case 8:
                 * the next vma was merged into the current one and
                 * the current one has not been updated yet.
                 */
                vma_start_write(vma);
                userfaultfd_set_vm_flags(vma, new_flags);
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

        skip:
                prev = vma;
                start = vma->vm_end;
        }

out_unlock:
        mmap_write_unlock(mm);
        mmput(mm);
out:
        return ret;
}

/*
 * UFFDIO_WAKE handler: wake the userfaults in the range described by
 * the struct uffdio_range at user address @arg.
 *
 * userfaultfd_wake may be used in combination with the
 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
 *
 * Returns 0 on success, -EFAULT if the argument cannot be read, or the
 * error from validate_range().
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
                            unsigned long arg)
{
        const void __user *buf = (void __user *)arg;
        struct uffdio_range uffdio_wake;
        struct userfaultfd_wake_range range;
        int err;

        if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
                return -EFAULT;

        err = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
        if (err)
                return err;

        range.start = uffdio_wake.start;
        range.len = uffdio_wake.len;

        /*
         * A zero length would mean "wake everything", which is not what
         * this ioctl is for; validate_range() rejects it, so this only
         * double-checks.
         */
        VM_BUG_ON(!range.len);

        wake_userfault(ctx, &range);
        return 0;
}

/*
 * UFFDIO_COPY handler: fill the destination range from the source
 * buffer described by the struct uffdio_copy at user address @arg.
 *
 * The value returned by mfill_atomic_copy() is stored in the user's
 * ->copy field. Unless UFFDIO_COPY_MODE_DONTWAKE is set, userfaults in
 * the filled part of the destination are woken.
 *
 * Returns 0 if the whole length was filled, -EAGAIN if only part of it
 * was (or if ctx->mmap_changing is set), -EFAULT on a failed user
 * access, -EINVAL for a bad range or mode, -ESRCH if mmget_not_zero()
 * fails, or the negative value returned by mfill_atomic_copy().
 */
static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
                            unsigned long arg)
{
        struct uffdio_copy __user *user_uffdio_copy =
                (struct uffdio_copy __user *) arg;
        struct uffdio_copy uffdio_copy;
        struct userfaultfd_wake_range range;
        uffd_flags_t flags = 0;
        __s64 copied;
        int err;

        if (atomic_read(&ctx->mmap_changing))
                return -EAGAIN;

        /* the trailing output-only "copy" field is not read from userland */
        if (copy_from_user(&uffdio_copy, user_uffdio_copy,
                           sizeof(uffdio_copy)-sizeof(__s64)))
                return -EFAULT;

        /* the source may be unaligned, the destination may not */
        err = validate_unaligned_range(ctx->mm, uffdio_copy.src,
                                       uffdio_copy.len);
        if (err)
                return err;
        err = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
        if (err)
                return err;

        if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
                return -EINVAL;
        if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
                flags |= MFILL_ATOMIC_WP;

        if (!mmget_not_zero(ctx->mm))
                return -ESRCH;
        copied = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
                                   uffdio_copy.len, flags);
        mmput(ctx->mm);

        /* report the result to userland, even when it is an error */
        if (unlikely(put_user(copied, &user_uffdio_copy->copy)))
                return -EFAULT;
        if (copied < 0)
                return copied;
        BUG_ON(!copied);

        /* len == 0 would wake all */
        range.len = copied;
        if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
                range.start = uffdio_copy.dst;
                wake_userfault(ctx, &range);
        }

        return range.len == uffdio_copy.len ? 0 : -EAGAIN;
}

/*
 * UFFDIO_ZEROPAGE handler: run mfill_atomic_zeropage() on the range
 * described by the struct uffdio_zeropage at user address @arg.
 *
 * The value returned by mfill_atomic_zeropage() is stored in the user's
 * ->zeropage field. Unless UFFDIO_ZEROPAGE_MODE_DONTWAKE is set,
 * userfaults in the filled part of the range are woken.
 *
 * Returns 0 if the whole length was filled, -EAGAIN if only part of it
 * was (or if ctx->mmap_changing is set), -EFAULT on a failed user
 * access, -EINVAL for a bad range or mode, -ESRCH if mmget_not_zero()
 * fails, or the negative value returned by mfill_atomic_zeropage().
 */
static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
                                unsigned long arg)
{
        struct uffdio_zeropage __user *user_uffdio_zeropage =
                (struct uffdio_zeropage __user *) arg;
        struct uffdio_zeropage uffdio_zeropage;
        struct userfaultfd_wake_range range;
        __s64 filled;
        int err;

        if (atomic_read(&ctx->mmap_changing))
                return -EAGAIN;

        /* the trailing output-only "zeropage" field is not read from userland */
        if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
                           sizeof(uffdio_zeropage)-sizeof(__s64)))
                return -EFAULT;

        err = validate_range(ctx->mm, uffdio_zeropage.range.start,
                             uffdio_zeropage.range.len);
        if (err)
                return err;

        if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
                return -EINVAL;

        if (!mmget_not_zero(ctx->mm))
                return -ESRCH;
        filled = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
                                       uffdio_zeropage.range.len);
        mmput(ctx->mm);

        /* report the result to userland, even when it is an error */
        if (unlikely(put_user(filled, &user_uffdio_zeropage->zeropage)))
                return -EFAULT;
        if (filled < 0)
                return filled;
        /* len == 0 would wake all */
        BUG_ON(!filled);

        range.len = filled;
        if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
                range.start = uffdio_zeropage.range.start;
                wake_userfault(ctx, &range);
        }

        return range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
}

static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
                                    unsigned long arg)
{
        int ret;
        struct uffdio_writeprotect uffdio_wp;
        struct uffdio_writeprotect __user *user_uffdio_wp;
        struct userfaultfd_wake_range range;
        bool mode_wp, mode_dontwake;

        if (atomic_read(&ctx->mmap_changing))
                return -EAGAIN;

        user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;

        if (copy_from_user(&uffdio_wp, user_uffdio_wp,
                           sizeof(struct uffdio_writeprotect)))
                return -EFAULT;

        ret = validate_range(ctx->mm, uffdio_wp.range.start,
                             uffdio_wp.range.len);
        if (ret)
                return ret;

        if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
                               UFFDIO_WRITEPROTECT_MODE_WP))
                return -EINVAL;

        mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
        mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;

        if (mode_wp && mode_dontwake)
                return -EINVAL;

        if (mmget_not_zero(ctx->mm)) {
                ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
                                          uffdio_wp.range.len, mode_wp);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }

        if (ret)
                return ret;

        if (!mode_wp && !mode_dontwake) {
                range.start = uffdio_wp.range.start;
                range.len = uffdio_wp.range.len;
                wake_userfault(ctx, &range);
        }
        return ret;
}

/*
 * UFFDIO_CONTINUE handler.
 *
 * @ctx: userfaultfd context the ioctl was issued on
 * @arg: user pointer to a struct uffdio_continue
 *
 * The request is copied in without its trailing output field, the range
 * and the mode bits are validated, and mfill_atomic_continue() is run
 * while a reference on ctx->mm is held.  UFFDIO_CONTINUE_MODE_WP is
 * translated into MFILL_ATOMIC_WP.  Whatever mfill_atomic_continue()
 * returned is stored in the user's "mapped" field.  Unless
 * UFFDIO_CONTINUE_MODE_DONTWAKE was requested, waiters on the processed
 * part of the range are woken afterwards.
 *
 * Return: 0 when the whole range was processed, -EAGAIN when
 * ctx->mmap_changing is set or only part of the range was processed,
 * -EFAULT when user memory could not be accessed, -EINVAL for unknown mode
 * bits, -ESRCH when the mm could not be pinned, or the negative value
 * reported by validate_range() / mfill_atomic_continue().
 */
static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
        __s64 ret;
        struct uffdio_continue uffdio_continue;
        struct uffdio_continue __user *user_uffdio_continue;
        struct userfaultfd_wake_range range;
        uffd_flags_t flags = 0;

        user_uffdio_continue = (struct uffdio_continue __user *)arg;

        ret = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_continue, user_uffdio_continue,
                           /* don't copy the output fields */
                           sizeof(uffdio_continue) - (sizeof(__s64))))
                goto out;

        ret = validate_range(ctx->mm, uffdio_continue.range.start,
                             uffdio_continue.range.len);
        if (ret)
                goto out;

        ret = -EINVAL;
        if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
                                     UFFDIO_CONTINUE_MODE_WP))
                goto out;
        if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
                flags |= MFILL_ATOMIC_WP;

        if (mmget_not_zero(ctx->mm)) {
                ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
                                            uffdio_continue.range.len, flags);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }

        /* Report the result to userland before interpreting it. */
        if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
                return -EFAULT;
        if (ret < 0)
                goto out;

        /*
         * len == 0 would wake all.  A zero result is not expected here, so
         * warn about it (as userfaultfd_move() does) instead of taking the
         * whole machine down with BUG_ON().
         */
        VM_WARN_ON(!ret);
        range.len = ret;
        if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
                range.start = uffdio_continue.range.start;
                wake_userfault(ctx, &range);
        }
        /* A short result is reported as -EAGAIN so the caller can retry. */
        ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;

out:
        return ret;
}

/*
 * UFFDIO_POISON handler.
 *
 * @ctx: userfaultfd context the ioctl was issued on
 * @arg: user pointer to a struct uffdio_poison
 *
 * The request is copied in without its trailing output field, the range
 * and the mode bits are validated, and mfill_atomic_poison() is run while
 * a reference on ctx->mm is held.  Whatever mfill_atomic_poison() returned
 * is stored in the user's "updated" field.  Unless
 * UFFDIO_POISON_MODE_DONTWAKE was requested, waiters on the processed part
 * of the range are woken afterwards.
 *
 * Return: 0 when the whole range was processed, -EAGAIN when
 * ctx->mmap_changing is set or only part of the range was processed,
 * -EFAULT when user memory could not be accessed, -EINVAL for unknown mode
 * bits, -ESRCH when the mm could not be pinned, or the negative value
 * reported by validate_range() / mfill_atomic_poison().
 */
static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
{
        __s64 ret;
        struct uffdio_poison uffdio_poison;
        struct uffdio_poison __user *user_uffdio_poison;
        struct userfaultfd_wake_range range;

        user_uffdio_poison = (struct uffdio_poison __user *)arg;

        ret = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_poison, user_uffdio_poison,
                           /* don't copy the output fields */
                           sizeof(uffdio_poison) - (sizeof(__s64))))
                goto out;

        ret = validate_range(ctx->mm, uffdio_poison.range.start,
                             uffdio_poison.range.len);
        if (ret)
                goto out;

        ret = -EINVAL;
        if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
                goto out;

        if (mmget_not_zero(ctx->mm)) {
                ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
                                          uffdio_poison.range.len, 0);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }

        /* Report the result to userland before interpreting it. */
        if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
                return -EFAULT;
        if (ret < 0)
                goto out;

        /*
         * len == 0 would wake all.  A zero result is not expected here, so
         * warn about it (as userfaultfd_move() does) instead of taking the
         * whole machine down with BUG_ON().
         */
        VM_WARN_ON(!ret);
        range.len = ret;
        if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
                range.start = uffdio_poison.range.start;
                wake_userfault(ctx, &range);
        }
        /* A short result is reported as -EAGAIN so the caller can retry. */
        ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;

out:
        return ret;
}

/*
 * Evaluate userfaultfd_wp_async_ctx() for the userfaultfd context attached
 * to @vma and return its result.
 */
bool userfaultfd_wp_async(struct vm_area_struct *vma)
{
        struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

        return userfaultfd_wp_async_ctx(ctx);
}

/*
 * Convert the feature bits requested by userland into the value stored in
 * the context.  The user-visible bits are taken over unchanged (truncated
 * to unsigned int) and UFFD_FEATURE_INITIALIZED is added to mark the
 * features as enabled.
 */
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
        unsigned int enabled = (unsigned int)user_features;

        enabled |= UFFD_FEATURE_INITIALIZED;
        return enabled;
}

static int userfaultfd_move(struct userfaultfd_ctx *ctx,
                            unsigned long arg)
{
        __s64 ret;
        struct uffdio_move uffdio_move;
        struct uffdio_move __user *user_uffdio_move;
        struct userfaultfd_wake_range range;
        struct mm_struct *mm = ctx->mm;

        user_uffdio_move = (struct uffdio_move __user *) arg;

        if (atomic_read(&ctx->mmap_changing))
                return -EAGAIN;

        if (copy_from_user(&uffdio_move, user_uffdio_move,
                           /* don't copy "move" last field */
                           sizeof(uffdio_move)-sizeof(__s64)))
                return -EFAULT;

        /* Do not allow cross-mm moves. */
        if (mm != current->mm)
                return -EINVAL;

        ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
        if (ret)
                return ret;

        ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
        if (ret)
                return ret;

        if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
                                  UFFDIO_MOVE_MODE_DONTWAKE))
                return -EINVAL;

        if (mmget_not_zero(mm)) {
                ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
                                 uffdio_move.len, uffdio_move.mode);
                mmput(mm);
        } else {
                return -ESRCH;
        }

        if (unlikely(put_user(ret, &user_uffdio_move->move)))
                return -EFAULT;
        if (ret < 0)
                goto out;

        /* len == 0 would wake all */
        VM_WARN_ON(!ret);
        range.len = ret;
        if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
                range.start = uffdio_move.dst;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_move.len ? 0 : -EAGAIN;

out:
        return ret;
}

/*
 * userland asks for a certain API version and we return which bits
 * and ioctl commands are implemented in this kernel for such API
 * version or -EINVAL if unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
                           unsigned long arg)
{
        struct uffdio_api uffdio_api;
        void __user *buf = (void __user *)arg;
        unsigned int ctx_features;
        int ret;
        __u64 features;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
                goto out;
        features = uffdio_api.features;
        ret = -EINVAL;
        if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
                goto err_out;
        ret = -EPERM;
        if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
                goto err_out;

        /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
        if (features & UFFD_FEATURE_WP_ASYNC)
                features |= UFFD_FEATURE_WP_UNPOPULATED;

        /* report all available features and ioctls to userland */
        uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
        uffdio_api.features &=
                ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
        uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
        uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
        uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
        uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
        uffdio_api.ioctls = UFFD_API_IOCTLS;
        ret = -EFAULT;
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                goto out;

        /* only enable the requested features for this uffd context */
        ctx_features = uffd_ctx_features(features);
        ret = -EINVAL;
        if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
                goto err_out;

        ret = 0;
out:
        return ret;
err_out:
        memset(&uffdio_api, 0, sizeof(uffdio_api));
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                ret = -EFAULT;
        goto out;
}

/*
 * ioctl entry point of a userfaultfd file: dispatch @cmd to its handler.
 *
 * Every command other than UFFDIO_API is refused with -EINVAL until
 * userfaultfd_is_initialized() reports the context as initialized.
 * Unknown commands also yield -EINVAL.
 */
static long userfaultfd_ioctl(struct file *file, unsigned cmd,
                              unsigned long arg)
{
        struct userfaultfd_ctx *ctx = file->private_data;

        if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
                return -EINVAL;

        switch (cmd) {
        case UFFDIO_API:
                return userfaultfd_api(ctx, arg);
        case UFFDIO_REGISTER:
                return userfaultfd_register(ctx, arg);
        case UFFDIO_UNREGISTER:
                return userfaultfd_unregister(ctx, arg);
        case UFFDIO_WAKE:
                return userfaultfd_wake(ctx, arg);
        case UFFDIO_COPY:
                return userfaultfd_copy(ctx, arg);
        case UFFDIO_ZEROPAGE:
                return userfaultfd_zeropage(ctx, arg);
        case UFFDIO_MOVE:
                return userfaultfd_move(ctx, arg);
        case UFFDIO_WRITEPROTECT:
                return userfaultfd_writeprotect(ctx, arg);
        case UFFDIO_CONTINUE:
                return userfaultfd_continue(ctx, arg);
        case UFFDIO_POISON:
                return userfaultfd_poison(ctx, arg);
        default:
                return -EINVAL;
        }
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct userfaultfd_ctx *ctx = f->private_data;
        wait_queue_entry_t *wq;
        unsigned long pending = 0, total = 0;

        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
                pending++;
                total++;
        }
        list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
                total++;
        }
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);

        /*
         * If more protocols will be added, there will be all shown
         * separated by a space. Like this:
         *        protocols: aa:... bb:...
         */
        seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
                   pending, total, UFFD_API, ctx->features,
                   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

/* File operations installed on the file created by new_userfaultfd(). */
static const struct file_operations userfaultfd_fops = {
        .unlocked_ioctl = userfaultfd_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
        .read_iter      = userfaultfd_read_iter,
        .poll           = userfaultfd_poll,
        .llseek         = noop_llseek,
        .release        = userfaultfd_release,
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = userfaultfd_show_fdinfo,
#endif
};

/*
 * Constructor passed to kmem_cache_create() for the userfaultfd_ctx cache:
 * sets up the four waitqueue heads of the context and the refile_seq
 * seqcount, which is associated with fault_pending_wqh.lock.
 */
static void init_once_userfaultfd_ctx(void *mem)
{
        struct userfaultfd_ctx *uctx = mem;

        init_waitqueue_head(&uctx->fault_pending_wqh);
        init_waitqueue_head(&uctx->fault_wqh);
        init_waitqueue_head(&uctx->event_wqh);
        init_waitqueue_head(&uctx->fd_wqh);
        seqcount_spinlock_init(&uctx->refile_seq,
                               &uctx->fault_pending_wqh.lock);
}

/*
 * Create a new userfaultfd context bound to current->mm and install a file
 * descriptor for it.
 *
 * @flags: UFFD_SHARED_FCNTL_FLAGS bits and/or UFFD_USER_MODE_ONLY; any
 *         other bit is rejected with -EINVAL.
 *
 * Return: the new file descriptor, -EINVAL for bad flags, -ENOMEM when the
 * context cannot be allocated, or the error from get_unused_fd_flags() /
 * anon_inode_create_getfile().
 */
static int new_userfaultfd(int flags)
{
        const int fcntl_flags = flags & UFFD_SHARED_FCNTL_FLAGS;
        struct userfaultfd_ctx *ctx;
        struct file *file;
        int fd;

        BUG_ON(!current->mm);

        /* Check the UFFD_* constants for consistency.  */
        BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
        BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

        if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
                return -EINVAL;

        ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        refcount_set(&ctx->refcount, 1);
        ctx->flags = flags;
        ctx->features = 0;
        ctx->released = false;
        init_rwsem(&ctx->map_changing_lock);
        atomic_set(&ctx->mmap_changing, 0);
        ctx->mm = current->mm;

        fd = get_unused_fd_flags(fcntl_flags);
        if (fd < 0) {
                kmem_cache_free(userfaultfd_ctx_cachep, ctx);
                return fd;
        }

        /* Create a new inode so that the LSM can block the creation.  */
        file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops,
                                         ctx, O_RDONLY | fcntl_flags, NULL);
        if (IS_ERR(file)) {
                put_unused_fd(fd);
                kmem_cache_free(userfaultfd_ctx_cachep, ctx);
                return PTR_ERR(file);
        }

        /* prevent the mm struct from being freed */
        mmgrab(ctx->mm);
        file->f_mode |= FMODE_NOWAIT;
        fd_install(fd, file);
        return fd;
}

/*
 * Decide whether the caller may create a userfaultfd with @flags.
 *
 * The checks run in this order and the first match wins:
 *  - UFFD_USER_MODE_ONLY requests (userspace-only page faults) are always
 *    allowed;
 *  - otherwise the caller wants to handle kernel faults too, which
 *    privileged users (CAP_SYS_PTRACE) may always do;
 *  - otherwise the sysctl_unprivileged_userfaultfd setting decides.
 */
static inline bool userfaultfd_syscall_allowed(int flags)
{
        return (flags & UFFD_USER_MODE_ONLY) ||
               capable(CAP_SYS_PTRACE) ||
               sysctl_unprivileged_userfaultfd;
}

/*
 * userfaultfd() system call: create a new userfaultfd when
 * userfaultfd_syscall_allowed() permits it for @flags, -EPERM otherwise.
 */
SYSCALL_DEFINE1(userfaultfd, int, flags)
{
        if (userfaultfd_syscall_allowed(flags))
                return new_userfaultfd(flags);

        return -EPERM;
}

/*
 * ioctl handler of the userfaultfd misc device.  USERFAULTFD_IOC_NEW
 * creates a new userfaultfd, with the ioctl argument used as its flags;
 * every other command returns -EINVAL.
 */
static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
{
        if (cmd == USERFAULTFD_IOC_NEW)
                return new_userfaultfd(flags);

        return -EINVAL;
}

/*
 * File operations of the userfaultfd misc device; native and compat ioctls
 * share one handler.
 */
static const struct file_operations userfaultfd_dev_fops = {
        .owner          = THIS_MODULE,
        .llseek         = noop_llseek,
        .unlocked_ioctl = userfaultfd_dev_ioctl,
        .compat_ioctl   = userfaultfd_dev_ioctl,
};

/* Misc device "userfaultfd", registered with a dynamically assigned minor. */
static struct miscdevice userfaultfd_misc = {
        .name  = "userfaultfd",
        .minor = MISC_DYNAMIC_MINOR,
        .fops  = &userfaultfd_dev_fops,
};

/*
 * Boot-time setup: register the misc device, create the slab cache for
 * struct userfaultfd_ctx (SLAB_PANIC, with init_once_userfaultfd_ctx() as
 * constructor) and, with CONFIG_SYSCTL, register the "vm" sysctl table.
 *
 * Return: 0 on success or the error from misc_register().
 */
static int __init userfaultfd_init(void)
{
        int err = misc_register(&userfaultfd_misc);

        if (err)
                return err;

        userfaultfd_ctx_cachep =
                kmem_cache_create("userfaultfd_ctx_cache",
                                  sizeof(struct userfaultfd_ctx),
                                  0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                                  init_once_userfaultfd_ctx);
#ifdef CONFIG_SYSCTL
        register_sysctl_init("vm", vm_userfaultfd_table);
#endif
        return 0;
}
__initcall(userfaultfd_init);









































































    2 


























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_PGALLOC_H
#define __ASM_GENERIC_PGALLOC_H

#ifdef CONFIG_MMU

/* GFP flags for kernel page tables: GFP_KERNEL plus __GFP_ZERO. */
#define GFP_PGTABLE_KERNEL        (GFP_KERNEL | __GFP_ZERO)
/* GFP flags for user page tables: the kernel flags plus __GFP_ACCOUNT. */
#define GFP_PGTABLE_USER        (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)

/**
 * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
 * @mm: the mm_struct of the current context
 *
 * This function is intended for architectures that need
 * anything beyond simple page allocation.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
        struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL &
                        ~__GFP_HIGHMEM, 0);

        if (!ptdesc)
                return NULL;
        return ptdesc_address(ptdesc);
}
#define __pte_alloc_one_kernel(...)        alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
/**
 * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
 * @mm: the mm_struct of the current context
 *
 * Generic version; simply forwards to __pte_alloc_one_kernel_noprof().
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
        pte_t *pte = __pte_alloc_one_kernel_noprof(mm);

        return pte;
}
#define pte_alloc_one_kernel(...)        alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
#endif

/**
 * pte_free_kernel - free PTE-level kernel page table memory
 * @mm: the mm_struct of the current context
 * @pte: pointer to the memory containing the page table
 *
 * Looks up the ptdesc for @pte and hands it to pagetable_free().
 */
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
        struct ptdesc *pt = virt_to_ptdesc(pte);

        pagetable_free(pt);
}

/**
 * __pte_alloc_one - allocate memory for a PTE-level user page table
 * @mm: the mm_struct of the current context
 * @gfp: GFP flags to use for the allocation
 *
 * Allocates an order-0 page table with its ptdesc and runs
 * pagetable_pte_ctor() on it.  If the constructor fails, the allocation is
 * released again.
 *
 * Building block for architectures that need anything beyond simple page
 * allocation or must have custom GFP flags.
 *
 * Return: `struct page` referencing the ptdesc or %NULL on error
 */
static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp)
{
        struct ptdesc *pt = pagetable_alloc_noprof(gfp, 0);

        if (pt) {
                if (pagetable_pte_ctor(pt))
                        return ptdesc_page(pt);
                /* Constructor failed: give the memory back. */
                pagetable_free(pt);
        }
        return NULL;
}
#define __pte_alloc_one(...)        alloc_hooks(__pte_alloc_one_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE
/**
 * pte_alloc_one - allocate a page for PTE-level user page table
 * @mm: the mm_struct of the current context
 *
 * Generic version; calls __pte_alloc_one_noprof() with %GFP_PGTABLE_USER,
 * which allocates the page table with its ptdesc and runs
 * pagetable_pte_ctor().
 *
 * Return: `struct page` referencing the ptdesc or %NULL on error
 */
static inline pgtable_t pte_alloc_one_noprof(struct mm_struct *mm)
{
        pgtable_t table = __pte_alloc_one_noprof(mm, GFP_PGTABLE_USER);

        return table;
}
#define pte_alloc_one(...)        alloc_hooks(pte_alloc_one_noprof(__VA_ARGS__))
#endif

/*
 * Should really implement gc for free page table pages. This could be
 * done with a reference count in struct page.
 */

/**
 * pte_free - free PTE-level user page table memory
 * @mm: the mm_struct of the current context
 * @pte_page: the `struct page` referencing the ptdesc
 *
 * Runs pagetable_pte_dtor() on the ptdesc and then frees it.
 */
static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
{
        struct ptdesc *pt;

        pt = page_ptdesc(pte_page);
        pagetable_pte_dtor(pt);
        pagetable_free(pt);
}


#if CONFIG_PGTABLE_LEVELS > 2

#ifndef __HAVE_ARCH_PMD_ALLOC_ONE
/**
 * pmd_alloc_one - allocate memory for a PMD-level page table
 * @mm: the mm_struct of the current context
 * @addr: not used by this generic implementation
 *
 * Allocates an order-0 page table with its ptdesc and runs
 * pagetable_pmd_ctor() on it.  If the constructor fails, the allocation is
 * released again.
 *
 * %GFP_PGTABLE_KERNEL is used when @mm is &init_mm, %GFP_PGTABLE_USER
 * otherwise.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;
        struct ptdesc *pt = pagetable_alloc_noprof(gfp, 0);

        if (pt) {
                if (pagetable_pmd_ctor(pt))
                        return ptdesc_address(pt);
                /* Constructor failed: give the memory back. */
                pagetable_free(pt);
        }
        return NULL;
}
#define pmd_alloc_one(...)        alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__))
#endif

#ifndef __HAVE_ARCH_PMD_FREE
/*
 * Free a PMD-level page table: run pagetable_pmd_dtor() on its ptdesc and
 * free it.  @pmd must be page aligned; anything else hits BUG_ON().
 */
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
        struct ptdesc *pt = virt_to_ptdesc(pmd);
        unsigned long misalign = (unsigned long)pmd & (PAGE_SIZE-1);

        BUG_ON(misalign);
        pagetable_pmd_dtor(pt);
        pagetable_free(pt);
}
#endif

#endif /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3

/*
 * Allocate an order-0 PUD-level page table and run pagetable_pud_ctor()
 * on its ptdesc.  %GFP_PGTABLE_KERNEL is used when @mm is &init_mm,
 * %GFP_PGTABLE_USER otherwise; %__GFP_HIGHMEM is masked out in both cases.
 * @addr is not used.
 *
 * Returns the address of the new table or NULL if the allocation failed.
 */
static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        gfp_t gfp = ((mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER) &
                    ~__GFP_HIGHMEM;
        struct ptdesc *pt = pagetable_alloc_noprof(gfp, 0);

        if (pt) {
                pagetable_pud_ctor(pt);
                return ptdesc_address(pt);
        }
        return NULL;
}
#define __pud_alloc_one(...)        alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PUD_ALLOC_ONE
/**
 * pud_alloc_one - allocate memory for a PUD-level page table
 * @mm: the mm_struct of the current context
 * @addr: passed through to __pud_alloc_one_noprof()
 *
 * Generic version; forwards to __pud_alloc_one_noprof(), which uses
 * %GFP_PGTABLE_USER for user context and %GFP_PGTABLE_KERNEL for kernel
 * context.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        pud_t *pud = __pud_alloc_one_noprof(mm, addr);

        return pud;
}
#define pud_alloc_one(...)        alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__))
#endif

/*
 * Free a PUD-level page table: run pagetable_pud_dtor() on its ptdesc and
 * free it.  @pud must be page aligned; anything else hits BUG_ON().
 */
static inline void __pud_free(struct mm_struct *mm, pud_t *pud)
{
        struct ptdesc *pt = virt_to_ptdesc(pud);
        unsigned long misalign = (unsigned long)pud & (PAGE_SIZE-1);

        BUG_ON(misalign);
        pagetable_pud_dtor(pt);
        pagetable_free(pt);
}

#ifndef __HAVE_ARCH_PUD_FREE
/* Generic pud_free(): nothing beyond what __pud_free() does. */
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
        return __pud_free(mm, pud);
}
#endif

#endif /* CONFIG_PGTABLE_LEVELS > 3 */

#ifndef __HAVE_ARCH_PGD_FREE
/* Generic pgd_free(): look up the ptdesc for @pgd and free it. */
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        struct ptdesc *pt = virt_to_ptdesc(pgd);

        pagetable_free(pt);
}
#endif

#endif /* CONFIG_MMU */

#endif /* __ASM_GENERIC_PGALLOC_H */



























































    9 
















    9 




















    9 


















    9 
    9 
    8 
    9 

















   30 






   32 







   31 















   33 





























































































































































































































































    9 


    9 
    9 



    9 

    7 






    9 

    9 





    9 
    9 


    9 























































































































































































































































    4 






    4 



































    4 
    4 












    4 




    4 





















   29 



   32 




























   62 


   28 


   54 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1994 Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *  General FPU state handling cleanups
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 */
#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/sched.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/types.h>
#include <asm/traps.h>
#include <asm/irq_regs.h>

#include <uapi/asm/kvm.h>

#include <linux/hardirq.h>
#include <linux/pkeys.h>
#include <linux/vmalloc.h>

#include "context.h"
#include "internal.h"
#include "legacy.h"
#include "xstate.h"

#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>

#ifdef CONFIG_X86_64
DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic);
DEFINE_PER_CPU(u64, xfd_state);
#endif

/* The FPU state configuration data for kernel and user space */
struct fpu_state_config        fpu_kernel_cfg __ro_after_init;
struct fpu_state_config fpu_user_cfg __ro_after_init;

/*
 * Represents the initial FPU state. It's mostly (but not completely) zeroes,
 * depending on the FPU hardware format:
 */
struct fpstate init_fpstate __ro_after_init;

/* Track in-kernel FPU usage */
static DEFINE_PER_CPU(bool, in_kernel_fpu);

/*
 * Track which context is using the FPU on the CPU:
 */
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/*
 * Can the FPU be used in kernel mode right now, i.e. is the whole
 * "kernel_fpu_begin/end()" sequence allowed in the current context?
 */
bool irq_fpu_usable(void)
{
        /* Never from NMI; being asked there at all triggers a warning. */
        if (WARN_ON_ONCE(in_nmi()))
                return false;

        /* In kernel FPU usage already active? */
        if (this_cpu_read(in_kernel_fpu))
                return false;

        /*
         * Outside of NMI and hard interrupt context the FPU can be used in:
         *
         * - Task context except from within fpregs_lock()'ed critical
         *   regions.
         *
         * - Soft interrupt processing context which cannot happen
         *   while in a fpregs_lock()'ed critical region.
         *
         * In hard interrupt context it is safe only when soft interrupts
         * are enabled, which means the interrupt did not hit in a
         * fpregs_lock()'ed critical region.
         */
        return !in_hardirq() || !softirq_count();
}
EXPORT_SYMBOL(irq_fpu_usable);

/*
 * Track AVX512 state use because it is known to slow the max clock
 * speed of the core.
 */
static void update_avx_timestamp(struct fpu *fpu)
{

#define AVX512_TRACKING_MASK        (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM)

        if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK)
                fpu->avx512_timestamp = jiffies;
}

/*
 * Save the FPU register state in fpu->fpstate->regs. The register state is
 * preserved.
 *
 * Must be called with fpregs_lock() held.
 *
 * Three save methods are tried in order of preference: XSAVE (which also
 * updates the AVX512 usage timestamp), FXSAVE, and finally legacy FNSAVE.
 *
 * FXSAVE and all XSAVE variants preserve the FPU register state.  The
 * legacy FNSAVE instruction clears all FPU state unconditionally, so the
 * register state has to be reloaded afterwards.  That might be a pointless
 * exercise when the FPU is going to be used by another task right after
 * that, but it only affects 20+ years old 32bit systems and avoids
 * conditionals all over the place.
 */
void save_fpregs_to_fpstate(struct fpu *fpu)
{
        if (likely(use_xsave())) {
                os_xsave(fpu->fpstate);
                update_avx_timestamp(fpu);
        } else if (likely(use_fxsr())) {
                fxsave(&fpu->fpstate->regs.fxsave);
        } else {
                /*
                 * Legacy FPU register saving, FNSAVE always clears FPU
                 * registers, so we have to reload them from the memory
                 * state.
                 */
                asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave));
                frstor(&fpu->fpstate->regs.fsave);
        }
}

/*
 * Load the FPU registers from @fpstate.
 *
 * With XSAVE, the components restored are those in @mask that are also in
 * fpu_kernel_cfg.max_features. Without XSAVE, @mask is not used and the
 * FXSR or legacy image is restored as a whole.
 */
void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
{
        /*
         * AMD K7/K8 and later CPUs up to Zen don't save/restore
         * FDP/FIP/FOP unless an exception is pending. Clear the x87 state
         * here by setting it to fixed values.  "m" is a random variable
         * that should be in L1.
         */
        if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
                asm volatile(
                        "fnclex\n\t"
                        "emms\n\t"
                        "fildl %[addr]"        /* set F?P to defined value */
                        : : [addr] "m" (*fpstate));
        }

        /* Non-XSAVE systems: restore the FXSR or the legacy FSAVE image. */
        if (!use_xsave()) {
                if (use_fxsr())
                        fxrstor(&fpstate->regs.fxsave);
                else
                        frstor(&fpstate->regs.fsave);
                return;
        }

        /*
         * Dynamically enabled features are enabled in XCR0, but using them
         * also requires the corresponding XFD bits to be clear. While a
         * bit is set, a related instruction raises #NM. That allows the
         * larger FPU buffer to be allocated lazily from #NM, or the task
         * to be killed if it has no permission, which would otherwise
         * happen via #UD if the feature were disabled in XCR0.
         *
         * XFD state follows the same life time rules as XSTATE. To restore
         * state correctly XFD has to be updated before XRSTORS, otherwise
         * the component would stay in or go into init state even if the
         * bits are set in fpstate::regs::xsave::xfeatures.
         */
        xfd_update_state(fpstate);

        /*
         * A restore must always modify every feature in @mask, even if the
         * current task cannot use extended features. fpstate->xfeatures
         * therefore cannot be used here: a feature the task has no
         * permission for, but which the previous task used, would not go
         * into init state.
         */
        mask &= fpu_kernel_cfg.max_features;

        os_xrstor(fpstate, mask);
}

/*
 * Reload the FPU registers from init_fpstate for all components in
 * XFEATURE_MASK_FPSTATE.
 *
 * NOTE(review): the name suggests this is called from an exception fixup
 * handler; the caller is not visible in this file.
 */
void fpu_reset_from_exception_fixup(void)
{
        restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE);
}

#if IS_ENABLED(CONFIG_KVM)
static void __fpstate_reset(struct fpstate *fpstate, u64 xfd);

/*
 * Copy the group leader's guest state permissions into @gfpu and lock them
 * down for the thread group. Does nothing unless CONFIG_X86_64 is enabled.
 */
static void fpu_init_guest_permissions(struct fpu_guest *gfpu)
{
        struct fpu_state_perm *leader_perm;
        u64 state_perm;

        if (!IS_ENABLED(CONFIG_X86_64))
                return;

        spin_lock_irq(&current->sighand->siglock);

        leader_perm = &current->group_leader->thread.fpu.guest_perm;
        state_perm = leader_perm->__state_perm;

        /* First fpstate allocation locks down permissions. */
        WRITE_ONCE(leader_perm->__state_perm, state_perm | FPU_GUEST_PERM_LOCKED);

        spin_unlock_irq(&current->sighand->siglock);

        /* The guest container records the permissions without the lock bit. */
        gfpu->perm = state_perm & ~FPU_GUEST_PERM_LOCKED;
}

/*
 * Allocate a zeroed guest fpstate with vzalloc(), initialize it to the
 * default user configuration and attach it to @gfpu.
 *
 * The buffer is sized as the default user state size plus the 64-byte
 * aligned offset of the register area inside struct fpstate.
 *
 * Returns true on success, false if the allocation failed (in which case
 * @gfpu is not modified).
 */
bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
{
        struct fpstate *fpstate;
        unsigned int size;

        size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64);
        fpstate = vzalloc(size);
        if (!fpstate)
                return false;

        /* Leave xfd to 0 (the reset value defined by spec) */
        __fpstate_reset(fpstate, 0);
        fpstate_init_user(fpstate);
        /* These two flags are what fpu_free_guest_fpstate() checks before vfree(). */
        fpstate->is_valloc        = true;
        fpstate->is_guest        = true;

        gfpu->fpstate                = fpstate;
        gfpu->xfeatures                = fpu_user_cfg.default_features;
        gfpu->perm                = fpu_user_cfg.default_features;

        /*
         * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state
         * to userspace, even when XSAVE is unsupported, so that restoring FPU
         * state on a different CPU that does support XSAVE can cleanly load
         * the incoming state using its natural XSAVE.  In other words, KVM's
         * uABI size may be larger than this host's default size.  Conversely,
         * the default size should never be larger than KVM's base uABI size;
         * all features that can expand the uABI size must be opt-in.
         */
        gfpu->uabi_size                = sizeof(struct kvm_xsave);
        if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size))
                gfpu->uabi_size = fpu_user_cfg.default_size;

        /* Overwrites gfpu->perm with the group leader's guest permissions on 64-bit. */
        fpu_init_guest_permissions(gfpu);

        return true;
}
EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate);

void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
{
        struct fpstate *fps = gfpu->fpstate;

        if (!fps)
                return;

        if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use))
                return;

        gfpu->fpstate = NULL;
        vfree(fps);
}
EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate);

/**
 * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable
 * @guest_fpu:         Pointer to the guest FPU container
 * @xfeatures:         Features requested by guest CPUID
 *
 * Enable all dynamic xfeatures according to guest perm and requested CPUID.
 * Must be called with preemption enabled.
 *
 * Return: 0 on success, error code otherwise
 */
int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures)
{
        u64 missing;

        lockdep_assert_preemption_enabled();

        /* Only the requested features that are not enabled yet need work. */
        missing = xfeatures & ~guest_fpu->xfeatures;

        return missing ? __xfd_enable_feature(missing, guest_fpu) : 0;
}
EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features);

#ifdef CONFIG_X86_64
/*
 * Store @xfd as the software XFD value of the guest fpstate in @guest_fpu.
 * If that fpstate is currently in use, xfd_update_state() is invoked for it
 * as well. Both steps run under fpregs_lock().
 */
void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
{
        struct fpstate *guest_fps;

        fpregs_lock();

        guest_fps = guest_fpu->fpstate;
        guest_fps->xfd = xfd;
        if (guest_fps->in_use)
                xfd_update_state(guest_fps);

        fpregs_unlock();
}
EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);

/**
 * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state
 *
 * Must be invoked from KVM after a VMEXIT before enabling interrupts when
 * XFD write emulation is disabled. The guest can freely modify XFD, so the
 * state at VMEXIT is not guaranteed to match the state on VMENTER. The
 * software state therefore has to be refreshed before any operation which
 * depends on it can take place.
 *
 * Note: It can be invoked unconditionally even when write emulation is
 * enabled for the price of a then pointless MSR read.
 */
void fpu_sync_guest_vmexit_xfd_state(void)
{
        struct fpstate *fps = current->thread.fpu.fpstate;

        lockdep_assert_irqs_disabled();

        if (!fpu_state_size_dynamic())
                return;

        /* Read the MSR into current's fpstate, then mirror it per CPU. */
        rdmsrl(MSR_IA32_XFD, fps->xfd);
        __this_cpu_write(xfd_state, fps->xfd);
}
EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state);
#endif /* CONFIG_X86_64 */

/*
 * Switch current's active fpstate between the task's own fpstate and the
 * guest fpstate held in @guest_fpu.
 *
 * @guest_fpu:   guest FPU container providing the guest fpstate
 * @enter_guest: true:  stash the current fpstate in fpu->__task_fpstate,
 *                      make the guest fpstate active and mark it in use.
 *               false: mark the guest fpstate unused and reinstate the
 *                      fpstate stashed in fpu->__task_fpstate.
 *
 * The whole sequence runs under fpregs_lock(). Always returns 0.
 */
int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
{
        struct fpstate *guest_fps = guest_fpu->fpstate;
        struct fpu *fpu = &current->thread.fpu;
        struct fpstate *cur_fps = fpu->fpstate;

        fpregs_lock();
        /*
         * Save the registers into the outgoing fpstate, unless it is
         * confidential or TIF_NEED_FPU_LOAD is set.
         */
        if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD))
                save_fpregs_to_fpstate(fpu);

        /* Swap fpstate */
        if (enter_guest) {
                fpu->__task_fpstate = cur_fps;
                fpu->fpstate = guest_fps;
                guest_fps->in_use = true;
        } else {
                guest_fps->in_use = false;
                fpu->fpstate = fpu->__task_fpstate;
                fpu->__task_fpstate = NULL;
        }

        /* From here on cur_fps refers to the incoming (now active) fpstate. */
        cur_fps = fpu->fpstate;

        if (!cur_fps->is_confidential) {
                /* Includes XFD update */
                restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE);
        } else {
                /*
                 * XSTATE is restored by firmware from encrypted
                 * memory. Make sure XFD state is correct while
                 * running with guest fpstate
                 */
                xfd_update_state(cur_fps);
        }

        /* Marks current's FPU active on this CPU and clears TIF_NEED_FPU_LOAD. */
        fpregs_mark_activate();
        fpregs_unlock();
        return 0;
}
EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate);

/*
 * Copy the guest fpstate of @gfpu into the uABI buffer @buf of @size bytes.
 *
 * With XSAVE enabled the copy is delegated to __copy_xstate_to_uabi_buf(),
 * which receives @xfeatures and @pkru. Without XSAVE only the FXSAVE area
 * is copied and the XSAVE header is marked as carrying FP and SSE state.
 */
void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
                                    unsigned int size, u64 xfeatures, u32 pkru)
{
        struct fpstate *guest_fps = gfpu->fpstate;
        union fpregs_state *uabi = buf;
        struct membuf mb = { .p = buf, .left = size };

        if (cpu_feature_enabled(X86_FEATURE_XSAVE)) {
                __copy_xstate_to_uabi_buf(mb, guest_fps, xfeatures, pkru,
                                          XSTATE_COPY_XSAVE);
                return;
        }

        memcpy(&uabi->fxsave, &guest_fps->regs.fxsave, sizeof(uabi->fxsave));

        /* Make it restorable on a XSAVE enabled host */
        uabi->xsave.header.xfeatures = XFEATURE_MASK_FPSSE;
}
EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi);

/*
 * Copy FPU state from the uABI buffer @buf into the guest fpstate of @gfpu.
 *
 * @xcr0:  features the XSAVE header in @buf is allowed to carry
 * @vpkru: where to store the PKRU value, if the header carries PKRU
 *
 * Returns 0 on success, -EINVAL if the header has features outside the
 * permitted set or (non-XSAVE hosts) MXCSR has bits outside
 * mxcsr_feature_mask. On XSAVE hosts the result of
 * copy_uabi_from_kernel_to_xstate() is returned.
 */
int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
                                   u64 xcr0, u32 *vpkru)
{
        struct fpstate *guest_fps = gfpu->fpstate;
        const union fpregs_state *uabi = buf;

        if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
                /* Only FP/SSE state with a valid MXCSR can be accepted. */
                if ((uabi->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) ||
                    (uabi->fxsave.mxcsr & ~mxcsr_feature_mask))
                        return -EINVAL;

                memcpy(&guest_fps->regs.fxsave, &uabi->fxsave, sizeof(uabi->fxsave));
                return 0;
        }

        if (uabi->xsave.header.xfeatures & ~xcr0)
                return -EINVAL;

        /*
         * Pass NULL instead of @vpkru when PKRU's bit isn't set in the
         * header, so the current PKRU value is preserved.  KVM's odd ABI is
         * to leave PKRU untouched in this case (all other components are
         * eventually re-initialized).
         */
        return copy_uabi_from_kernel_to_xstate(guest_fps, uabi,
                        (uabi->xsave.header.xfeatures & XFEATURE_MASK_PKRU) ?
                        vpkru : NULL);
}
EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
#endif /* CONFIG_KVM */

/*
 * Begin a section of in-kernel FPU usage on this CPU.
 *
 * @kfpu_mask: KFPU_MXCSR and/or KFPU_387, selecting which control
 *             registers are set to initial values.
 *
 * Disables preemption and sets the per-CPU in_kernel_fpu flag. Both are
 * undone by kernel_fpu_end(), so preemption is still disabled on return.
 */
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
        preempt_disable();

        WARN_ON_FPU(!irq_fpu_usable());
        WARN_ON_FPU(this_cpu_read(in_kernel_fpu));

        this_cpu_write(in_kernel_fpu, true);

        /*
         * For tasks that are neither kernel threads nor user workers, save
         * the registers to current's fpstate unless TIF_NEED_FPU_LOAD is
         * already set. The flag is set before saving.
         */
        if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
            !test_thread_flag(TIF_NEED_FPU_LOAD)) {
                set_thread_flag(TIF_NEED_FPU_LOAD);
                save_fpregs_to_fpstate(&current->thread.fpu);
        }
        __cpu_invalidate_fpregs_state();

        /* Put sane initial values into the control registers. */
        if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
                ldmxcsr(MXCSR_DEFAULT);

        if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
                asm volatile ("fninit");
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);

/*
 * End a section of in-kernel FPU usage started by kernel_fpu_begin_mask():
 * clear the per-CPU in_kernel_fpu flag and re-enable preemption. Warns
 * (WARN_ON_FPU) if no such section was active.
 */
void kernel_fpu_end(void)
{
        WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));

        this_cpu_write(in_kernel_fpu, false);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);

/*
 * Sync the FPU register state to current's memory register state when the
 * current task owns the FPU. The hardware register state is preserved.
 *
 * @fpu must be current's FPU (&current->thread.fpu); anything else
 * triggers WARN_ON_FPU().
 */
void fpu_sync_fpstate(struct fpu *fpu)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        fpregs_lock();
        trace_x86_fpu_before_save(fpu);

        /* Skip the save when TIF_NEED_FPU_LOAD is set. */
        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                save_fpregs_to_fpstate(fpu);

        trace_x86_fpu_after_save(fpu);
        fpregs_unlock();
}

/*
 * Number of bytes of init_fpstate.regs that have to be copied to put a
 * task's fpstate register image into its init state.
 */
static inline unsigned int init_fpstate_copy_size(void)
{
        /* XSAVE(S) just needs the legacy and the xstate header part */
        if (use_xsave())
                return sizeof(init_fpstate.regs.xsave);

        return fpu_kernel_cfg.default_size;
}

/*
 * FXSR fpstate init: set the control word and MXCSR in the FXSAVE image of
 * @fpstate to their initial values.
 */
static inline void fpstate_init_fxstate(struct fpstate *fpstate)
{
        /* NOTE(review): 0x37f looks like the x87 control word FNINIT sets up - confirm against the SDM */
        fpstate->regs.fxsave.cwd = 0x37f;
        fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT;
}

/*
 * Legacy x87 fpstate state init:
 *
 * Set the control, status and tag words plus the fos field in the FSAVE
 * image of @fpstate to fixed initial values.
 * NOTE(review): the upper 16 bits of cwd/swd/fos are written as all ones;
 * presumably those are the reserved halves of the 32-bit FSAVE slots -
 * confirm against the FSAVE memory layout.
 */
static inline void fpstate_init_fstate(struct fpstate *fpstate)
{
        fpstate->regs.fsave.cwd = 0xffff037fu;
        fpstate->regs.fsave.swd = 0xffff0000u;
        fpstate->regs.fsave.twd = 0xffffffffu;
        fpstate->regs.fsave.fos = 0xffff0000u;
}

/*
 * Put the register image of @fpstate into its initial state, using the
 * format that matches the CPU: soft-FPU, FXSR or legacy x87.
 *
 * Used in two places:
 * 1) Early boot to setup init_fpstate for non XSAVE systems
 * 2) fpu_alloc_guest_fpstate(), for fpstates allocated on behalf of KVM
 */
void fpstate_init_user(struct fpstate *fpstate)
{
        if (cpu_feature_enabled(X86_FEATURE_FPU)) {
                xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures);

                if (cpu_feature_enabled(X86_FEATURE_FXSR))
                        fpstate_init_fxstate(fpstate);
                else
                        fpstate_init_fstate(fpstate);
        } else {
                /* No hardware FPU: only the soft-FPU image is initialized. */
                fpstate_init_soft(&fpstate->regs.soft);
        }
}

/*
 * Reset the bookkeeping fields of @fpstate to the default kernel and user
 * configuration and set its software XFD value to @xfd. The register image
 * itself is not touched.
 */
static void __fpstate_reset(struct fpstate *fpstate, u64 xfd)
{
        /* Initialize sizes and feature masks */
        fpstate->size                = fpu_kernel_cfg.default_size;
        fpstate->user_size        = fpu_user_cfg.default_size;
        fpstate->xfeatures        = fpu_kernel_cfg.default_features;
        fpstate->user_xfeatures        = fpu_user_cfg.default_features;
        fpstate->xfd                = xfd;
}

/*
 * Point @fpu back at its embedded default fpstate, reset that fpstate's
 * sizes, feature masks and XFD value (taken from init_fpstate), and reset
 * the task and guest permissions to the defaults.
 */
void fpstate_reset(struct fpu *fpu)
{
        /* Set the fpstate pointer to the default fpstate */
        fpu->fpstate = &fpu->__fpstate;
        __fpstate_reset(fpu->fpstate, init_fpstate.xfd);

        /* Initialize the permission related info in fpu */
        fpu->perm.__state_perm                = fpu_kernel_cfg.default_features;
        fpu->perm.__state_size                = fpu_kernel_cfg.default_size;
        fpu->perm.__user_state_size        = fpu_user_cfg.default_size;
        /* Same defaults for guests */
        fpu->guest_perm = fpu->perm;
}

/*
 * Copy the task and guest FPU permissions of current's group leader into
 * @dst_fpu. Does nothing unless dynamic FPU state sizing is enabled.
 */
static inline void fpu_inherit_perms(struct fpu *dst_fpu)
{
        struct fpu *leader_fpu;

        if (!fpu_state_size_dynamic())
                return;

        leader_fpu = &current->group_leader->thread.fpu;

        /* Fork also inherits the permissions of the parent */
        spin_lock_irq(&current->sighand->siglock);
        dst_fpu->perm = leader_fpu->perm;
        dst_fpu->guest_perm = leader_fpu->guest_perm;
        spin_unlock_irq(&current->sighand->siglock);
}

/*
 * Write @ssp into the CET user state of @dst's fpstate.
 *
 * A passed ssp of zero will not cause any update. Without
 * CONFIG_X86_USER_SHADOW_STACK this is a no-op.
 *
 * Returns 0 on success, 1 if the CET user state cannot be located.
 */
static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp)
{
#ifdef CONFIG_X86_USER_SHADOW_STACK
        struct cet_user_state *cet;

        if (ssp) {
                cet = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave,
                                     XFEATURE_CET_USER);

                /*
                 * A non-zero ssp means 'dst' must be configured with a
                 * shadow stack, and its fpu state should be up to date
                 * since it was just copied from the parent in fpu_clone().
                 * So there must be a valid non-init CET state location in
                 * the buffer.
                 */
                if (WARN_ON_ONCE(!cet))
                        return 1;

                cet->user_ssp = (u64)ssp;
        }
#endif
        return 0;
}

/*
 * Clone current's FPU state on fork
 *
 * @dst:         the new task
 * @clone_flags: clone flags; without CLONE_THREAD the FPU permissions are
 *               inherited via fpu_inherit_perms()
 * @minimal:     true for kernel threads and IO worker threads, which get a
 *               copy of init_fpstate instead of current's state
 * @ssp:         shadow stack pointer for the new task, 0 for no update
 *
 * Returns 0 on success, 1 if the shadow stack pointer could not be updated.
 */
int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
              unsigned long ssp)
{
        struct fpu *src_fpu = &current->thread.fpu;
        struct fpu *dst_fpu = &dst->thread.fpu;

        /* The new task's FPU state cannot be valid in the hardware. */
        dst_fpu->last_cpu = -1;

        fpstate_reset(dst_fpu);

        if (!cpu_feature_enabled(X86_FEATURE_FPU))
                return 0;

        /*
         * Enforce reload for user space tasks and prevent kernel threads
         * from trying to save the FPU registers on context switch.
         */
        set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);

        /*
         * No FPU state inheritance for kernel threads and IO
         * worker threads.
         */
        if (minimal) {
                /* Clear out the minimal state */
                memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs,
                       init_fpstate_copy_size());
                return 0;
        }

        /*
         * If a new feature is added, ensure all dynamic features are
         * caller-saved from here!
         */
        BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);

        /*
         * Save the default portion of the current FPU state into the
         * clone. Assume all dynamic features to be defined as caller-
         * saved, which enables skipping both the expansion of fpstate
         * and the copying of any dynamic state.
         *
         * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because
         * copying is not valid when current uses non-default states.
         *
         * Instead, current's registers are loaded first if needed and are
         * then saved directly into the clone's fpstate.
         */
        fpregs_lock();
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
                fpregs_restore_userregs();
        save_fpregs_to_fpstate(dst_fpu);
        fpregs_unlock();
        if (!(clone_flags & CLONE_THREAD))
                fpu_inherit_perms(dst_fpu);

        /*
         * Children never inherit PASID state.
         * Force it to have its init value:
         */
        if (use_xsave())
                dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID;

        /*
         * Update shadow stack pointer, in case it changed during clone.
         */
        if (update_fpu_shstk(dst, ssp))
                return 1;

        trace_x86_fpu_copy_src(src_fpu);
        trace_x86_fpu_copy_dst(dst_fpu);

        return 0;
}

/*
 * Whitelist the FPU register state embedded into task_struct for hardened
 * usercopy.
 *
 * @offset: receives the offset of fpu.__fpstate.regs within
 *          struct thread_struct
 * @size:   receives the default kernel fpstate size
 */
void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size)
{
        *offset = offsetof(struct thread_struct, fpu.__fpstate.regs);
        *size = fpu_kernel_cfg.default_size;
}

/*
 * Drops current FPU state: deactivates the fpregs and
 * the fpstate. NOTE: it still leaves previous contents
 * in the fpregs in the eager-FPU case.
 *
 * This function can be used in cases where we know that
 * a state-restore is coming: either an explicit one,
 * or a reschedule.
 *
 * The registers are only deactivated when @fpu is current's FPU; for any
 * other @fpu only the tracepoint fires. Runs with preemption disabled.
 */
void fpu__drop(struct fpu *fpu)
{
        preempt_disable();

        if (fpu == &current->thread.fpu) {
                /*
                 * Ignore delayed exceptions from user space: the exception
                 * table entry resumes at label 2 if FWAIT at label 1 faults.
                 */
                asm volatile("1: fwait\n"
                             "2:\n"
                             _ASM_EXTABLE(1b, 2b));
                fpregs_deactivate(fpu);
        }

        trace_x86_fpu_dropped(fpu);

        preempt_enable();
}

/*
 * Clear the FPU registers by loading them from init_fpstate and write the
 * default PKRU value. With XSAVE only the components in @features_mask are
 * restored.
 *
 * Caller must do fpregs_[un]lock() around it.
 */
static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
{
        if (!use_xsave()) {
                if (use_fxsr())
                        fxrstor(&init_fpstate.regs.fxsave);
                else
                        frstor(&init_fpstate.regs.fsave);
        } else {
                os_xrstor(&init_fpstate, features_mask);
        }

        pkru_write_default();
}

/*
 * Reset current->fpu memory state to the init values.
 *
 * Takes and releases fpregs_lock() itself.
 */
static void fpu_reset_fpregs(void)
{
        struct fpu *fpu = &current->thread.fpu;

        fpregs_lock();
        __fpu_invalidate_fpregs_state(fpu);
        /*
         * This does not change the actual hardware registers. It just
         * resets the memory image and sets TIF_NEED_FPU_LOAD so a
         * subsequent return to usermode will reload the registers from the
         * task's memory image.
         *
         * Do not use fpstate_init() here. Just copy init_fpstate which has
         * the correct content already except for PKRU.
         *
         * PKRU handling does not rely on the xstate when restoring for
         * user space as PKRU is eagerly written in switch_to() and
         * flush_thread().
         */
        memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size());
        set_thread_flag(TIF_NEED_FPU_LOAD);
        fpregs_unlock();
}

/*
 * Reset current's user FPU states to the init states.  current's
 * supervisor states, if any, are not modified by this function.  The
 * caller guarantees that the XSTATE header in memory is intact.
 *
 * @fpu must be current's FPU (&current->thread.fpu); anything else
 * triggers WARN_ON_FPU().
 */
void fpu__clear_user_states(struct fpu *fpu)
{
        WARN_ON_FPU(fpu != &current->thread.fpu);

        fpregs_lock();
        if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
                /*
                 * No hardware FPU: only the memory image is reset. Note
                 * that fpu_reset_fpregs() takes and releases fpregs_lock()
                 * itself, so the lock is nested on this path.
                 */
                fpu_reset_fpregs();
                fpregs_unlock();
                return;
        }

        /*
         * Ensure that current's supervisor states are loaded into their
         * corresponding registers.
         */
        if (xfeatures_mask_supervisor() &&
            !fpregs_state_valid(fpu, smp_processor_id()))
                os_xrstor_supervisor(fpu->fpstate);

        /* Reset user states in registers. */
        restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE);

        /*
         * Now all FPU registers have their desired values.  Inform the FPU
         * state machine that current's FPU registers are in the hardware
         * registers. The memory image does not need to be updated because
         * any operation relying on it has to save the registers first when
         * current's FPU is marked active.
         */
        fpregs_mark_activate();
        fpregs_unlock();
}

/*
 * Bring current's FPU back to defaults: reset the fpstate pointer, sizes,
 * feature masks and permissions, then reset the memory register image to
 * the init values (which also sets TIF_NEED_FPU_LOAD).
 */
void fpu_flush_thread(void)
{
        fpstate_reset(&current->thread.fpu);
        fpu_reset_fpregs();
}
/*
 * Load FPU context before returning to userspace.
 *
 * Nothing is done on CPUs without X86_FEATURE_FPU.
 */
void switch_fpu_return(void)
{
        if (static_cpu_has(X86_FEATURE_FPU))
                fpregs_restore_userregs();
}
EXPORT_SYMBOL_GPL(switch_fpu_return);

/*
 * Take fpregs_lock() and make sure current's user FPU state is loaded into
 * the registers.
 *
 * Returns with fpregs_lock() still held; this function does not unlock.
 * Warns once when called where irq_fpu_usable() is false or from a kernel
 * thread (PF_KTHREAD).
 */
void fpregs_lock_and_load(void)
{
        /*
         * fpregs_lock() only disables preemption (mostly). So modifying state
         * in an interrupt could screw up some in progress fpregs operation.
         * Warn about it.
         */
        WARN_ON_ONCE(!irq_fpu_usable());
        WARN_ON_ONCE(current->flags & PF_KTHREAD);

        fpregs_lock();

        fpregs_assert_state_consistent();

        /* Load the registers only when TIF_NEED_FPU_LOAD is set. */
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
                fpregs_restore_userregs();
}

#ifdef CONFIG_X86_DEBUG_FPU
/*
 * Debug check: whenever current's FPU state is not tracked as loaded on
 * this CPU, TIF_NEED_FPU_LOAD must be set so that the context gets loaded
 * on return to userland. Warn if the flag is clear while the tracking says
 * the registers are not valid.
 */
void fpregs_assert_state_consistent(void)
{
        struct fpu *fpu = &current->thread.fpu;

        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id()));
}
EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent);
#endif

/*
 * Mark current's FPU as active on this CPU: activate it via
 * fpregs_activate(), record this CPU in fpu->last_cpu and clear
 * TIF_NEED_FPU_LOAD.
 *
 * NOTE(review): the callers in this file all hold fpregs_lock(); presumably
 * that is required - confirm.
 */
void fpregs_mark_activate(void)
{
        struct fpu *fpu = &current->thread.fpu;

        fpregs_activate(fpu);
        fpu->last_cpu = smp_processor_id();
        clear_thread_flag(TIF_NEED_FPU_LOAD);
}

/*
 * x87 math exception handling:
 */

/*
 * Map the exception state saved in @fpu's fpstate to a FPE_* code.
 *
 * @fpu:     FPU whose saved fpstate is inspected
 * @trap_nr: X86_TRAP_MF selects the x87 control/status words; any other
 *           value selects the SIMD path based on MXCSR
 *
 * Returns the FPE_* code of the first matching unmasked exception, or 0
 * when none is flagged.
 */
int fpu__exception_code(struct fpu *fpu, int trap_nr)
{
        int err;

        if (trap_nr != X86_TRAP_MF) {
                /*
                 * The SIMD FPU exceptions are handled a little differently,
                 * as there is only a single status/control register.  Thus,
                 * to determine which unmasked exception was caught we must
                 * mask the exception mask bits at 0x1f80, and then use these
                 * to mask the exception bits at 0x3f.
                 */
                unsigned short mxcsr = MXCSR_DEFAULT;

                if (boot_cpu_has(X86_FEATURE_XMM))
                        mxcsr = fpu->fpstate->regs.fxsave.mxcsr;

                err = ~(mxcsr >> 7) & mxcsr;
        } else {
                unsigned short cwd, swd;

                /*
                 * (~cwd & swd) will mask out exceptions that are not set to
                 * unmasked status.  0x3f is the exception bits in these
                 * regs, 0x200 is the C1 reg you need in case of a stack
                 * fault, 0x040 is the stack fault bit.  We should only be
                 * taking one exception at a time, so if this combination
                 * doesn't produce any single exception, then we have a bad
                 * program that isn't synchronizing its FPU usage and it will
                 * suffer the consequences since we won't be able to fully
                 * reproduce the context of the exception.
                 */
                if (boot_cpu_has(X86_FEATURE_FXSR)) {
                        cwd = fpu->fpstate->regs.fxsave.cwd;
                        swd = fpu->fpstate->regs.fxsave.swd;
                } else {
                        cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd;
                        swd = (unsigned short)fpu->fpstate->regs.fsave.swd;
                }

                err = swd & ~cwd;
        }

        /*
         * Invalid op
         *
         * swd & 0x240 == 0x040: Stack Underflow
         * swd & 0x240 == 0x240: Stack Overflow
         * User must clear the SF bit (0x40) if set
         */
        if (err & 0x001)
                return FPE_FLTINV;

        /* Divide by Zero */
        if (err & 0x004)
                return FPE_FLTDIV;

        /* Overflow */
        if (err & 0x008)
                return FPE_FLTOVF;

        /* Denormal, Underflow */
        if (err & 0x012)
                return FPE_FLTUND;

        /* Precision */
        if (err & 0x020)
                return FPE_FLTRES;

        /*
         * If we're using IRQ 13, or supposedly even some trap
         * X86_TRAP_MF implementations, it's possible
         * we get a spurious trap, which is not an error.
         */
        return 0;
}

/*
 * Initialize register state that may prevent from entering low-power idle.
 * This function will be invoked from the cpuidle driver only when needed.
 *
 * If AMX tile state is in use, it is released and the per-CPU FPU owner
 * pointer is cleared.
 */
noinstr void fpu_idle_fpregs(void)
{
        /* Note: AMX_TILE being enabled implies XGETBV1 support */
        if (!cpu_feature_enabled(X86_FEATURE_AMX_TILE))
                return;

        if (!(xfeatures_in_use() & XFEATURE_MASK_XTILE))
                return;

        tile_release();
        __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 















































    3 























    3 


















































































































































































































































































    2 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-long.sh
// DO NOT MODIFY THIS FILE DIRECTLY

#ifndef _LINUX_ATOMIC_LONG_H
#define _LINUX_ATOMIC_LONG_H

#include <linux/compiler.h>
#include <asm/types.h>

/*
 * Select the backing type for atomic_long_t: atomic64_t when CONFIG_64BIT is
 * defined, atomic_t otherwise. ATOMIC_LONG_INIT() and the two
 * atomic_long_cond_read_{acquire,relaxed} names are aliased to the
 * correspondingly named atomic64 or atomic macros in the same branch.
 */
#ifdef CONFIG_64BIT
typedef atomic64_t atomic_long_t;
#define ATOMIC_LONG_INIT(i)                ATOMIC64_INIT(i)
#define atomic_long_cond_read_acquire        atomic64_cond_read_acquire
#define atomic_long_cond_read_relaxed        atomic64_cond_read_relaxed
#else
typedef atomic_t atomic_long_t;
#define ATOMIC_LONG_INIT(i)                ATOMIC_INIT(i)
#define atomic_long_cond_read_acquire        atomic_cond_read_acquire
#define atomic_long_cond_read_relaxed        atomic_cond_read_relaxed
#endif

/**
 * raw_atomic_long_read() - load an atomic_long_t (relaxed ordering)
 * @v: the atomic_long_t to read
 *
 * Performs an atomic load of @v with relaxed ordering by calling
 * raw_atomic64_read() when CONFIG_64BIT is defined and raw_atomic_read()
 * otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_read() is preferred.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
raw_atomic_long_read(const atomic_long_t *v)
{
        long val;

#if defined(CONFIG_64BIT)
        val = raw_atomic64_read(v);
#else
        val = raw_atomic_read(v);
#endif
        return val;
}

/**
 * raw_atomic_long_read_acquire() - load an atomic_long_t (acquire ordering)
 * @v: the atomic_long_t to read
 *
 * Performs an atomic load of @v with acquire ordering by calling
 * raw_atomic64_read_acquire() when CONFIG_64BIT is defined and
 * raw_atomic_read_acquire() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_read_acquire() is preferred.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
raw_atomic_long_read_acquire(const atomic_long_t *v)
{
        long val;

#if defined(CONFIG_64BIT)
        val = raw_atomic64_read_acquire(v);
#else
        val = raw_atomic_read_acquire(v);
#endif
        return val;
}

/**
 * raw_atomic_long_set() - store to an atomic_long_t (relaxed ordering)
 * @v: the atomic_long_t to write
 * @i: long value to store
 *
 * Performs an atomic store of @i into @v with relaxed ordering by calling
 * raw_atomic64_set() when CONFIG_64BIT is defined and raw_atomic_set()
 * otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_set() is preferred.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_set(atomic_long_t *v, long i)
{
#if defined(CONFIG_64BIT)
        raw_atomic64_set(v, i);
#else
        raw_atomic_set(v, i);
#endif
}

/**
 * raw_atomic_long_set_release() - store to an atomic_long_t (release ordering)
 * @v: the atomic_long_t to write
 * @i: long value to store
 *
 * Performs an atomic store of @i into @v with release ordering by calling
 * raw_atomic64_set_release() when CONFIG_64BIT is defined and
 * raw_atomic_set_release() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_set_release() is preferred.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_set_release(atomic_long_t *v, long i)
{
#if defined(CONFIG_64BIT)
        raw_atomic64_set_release(v, i);
#else
        raw_atomic_set_release(v, i);
#endif
}

/**
 * raw_atomic_long_add() - add to an atomic_long_t (relaxed ordering)
 * @i: long amount to add
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + @i) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_add() when CONFIG_64BIT is defined and
 * raw_atomic_add() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_add() is preferred.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_add(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        raw_atomic64_add(i, v);
#else
        raw_atomic_add(i, v);
#endif
}

/**
 * raw_atomic_long_add_return() - add and return the result (full ordering)
 * @i: long amount to add
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + @i) as one atomic operation with full ordering, by
 * calling raw_atomic64_add_return() when CONFIG_64BIT is defined and
 * raw_atomic_add_return() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_add_return() is preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_add_return(long i, atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_add_return(i, v);
#else
        updated = raw_atomic_add_return(i, v);
#endif
        return updated;
}

/**
 * raw_atomic_long_add_return_acquire() - add and return the result (acquire ordering)
 * @i: long amount to add
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + @i) as one atomic operation with acquire ordering, by
 * calling raw_atomic64_add_return_acquire() when CONFIG_64BIT is defined and
 * raw_atomic_add_return_acquire() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_add_return_acquire() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_add_return_acquire(long i, atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_add_return_acquire(i, v);
#else
        updated = raw_atomic_add_return_acquire(i, v);
#endif
        return updated;
}

/**
 * raw_atomic_long_add_return_release() - add and return the result (release ordering)
 * @i: long amount to add
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + @i) as one atomic operation with release ordering, by
 * calling raw_atomic64_add_return_release() when CONFIG_64BIT is defined and
 * raw_atomic_add_return_release() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_add_return_release() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_add_return_release(long i, atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_add_return_release(i, v);
#else
        updated = raw_atomic_add_return_release(i, v);
#endif
        return updated;
}

/**
 * raw_atomic_long_add_return_relaxed() - add and return the result (relaxed ordering)
 * @i: long amount to add
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + @i) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_add_return_relaxed() when CONFIG_64BIT is defined and
 * raw_atomic_add_return_relaxed() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_add_return_relaxed() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_add_return_relaxed(long i, atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_add_return_relaxed(i, v);
#else
        updated = raw_atomic_add_return_relaxed(i, v);
#endif
        return updated;
}

/**
 * raw_atomic_long_fetch_add() - add and return the prior value (full ordering)
 * @i: long amount to add
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + @i) as one atomic operation with full ordering, by
 * calling raw_atomic64_fetch_add() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_add() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_add() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_add(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_add(i, v);
#else
        old = raw_atomic_fetch_add(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_add_acquire() - add and return the prior value (acquire ordering)
 * @i: long amount to add
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + @i) as one atomic operation with acquire ordering, by
 * calling raw_atomic64_fetch_add_acquire() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_add_acquire() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_add_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_add_acquire(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_add_acquire(i, v);
#else
        old = raw_atomic_fetch_add_acquire(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_add_release() - add and return the prior value (release ordering)
 * @i: long amount to add
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + @i) as one atomic operation with release ordering, by
 * calling raw_atomic64_fetch_add_release() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_add_release() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_add_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_add_release(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_add_release(i, v);
#else
        old = raw_atomic_fetch_add_release(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_add_relaxed() - add and return the prior value (relaxed ordering)
 * @i: long amount to add
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + @i) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_fetch_add_relaxed() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_add_relaxed() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_add_relaxed() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_add_relaxed(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_add_relaxed(i, v);
#else
        old = raw_atomic_fetch_add_relaxed(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_sub() - subtract from an atomic_long_t (relaxed ordering)
 * @i: long amount to subtract
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - @i) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_sub() when CONFIG_64BIT is defined and
 * raw_atomic_sub() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_sub() is preferred.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_sub(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        raw_atomic64_sub(i, v);
#else
        raw_atomic_sub(i, v);
#endif
}

/**
 * raw_atomic_long_sub_return() - subtract and return the result (full ordering)
 * @i: long amount to subtract
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - @i) as one atomic operation with full ordering, by
 * calling raw_atomic64_sub_return() when CONFIG_64BIT is defined and
 * raw_atomic_sub_return() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_sub_return() is preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_sub_return(long i, atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_sub_return(i, v);
#else
        updated = raw_atomic_sub_return(i, v);
#endif
        return updated;
}

/**
 * raw_atomic_long_sub_return_acquire() - subtract and return the result (acquire ordering)
 * @i: long amount to subtract
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - @i) as one atomic operation with acquire ordering, by
 * calling raw_atomic64_sub_return_acquire() when CONFIG_64BIT is defined and
 * raw_atomic_sub_return_acquire() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_sub_return_acquire() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_sub_return_acquire(long i, atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_sub_return_acquire(i, v);
#else
        updated = raw_atomic_sub_return_acquire(i, v);
#endif
        return updated;
}

/**
 * raw_atomic_long_sub_return_release() - subtract and return the result (release ordering)
 * @i: long amount to subtract
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - @i) as one atomic operation with release ordering, by
 * calling raw_atomic64_sub_return_release() when CONFIG_64BIT is defined and
 * raw_atomic_sub_return_release() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_sub_return_release() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_sub_return_release(long i, atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_sub_return_release(i, v);
#else
        updated = raw_atomic_sub_return_release(i, v);
#endif
        return updated;
}

/**
 * raw_atomic_long_sub_return_relaxed() - subtract and return the result (relaxed ordering)
 * @i: long amount to subtract
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - @i) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_sub_return_relaxed() when CONFIG_64BIT is defined and
 * raw_atomic_sub_return_relaxed() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_sub_return_relaxed() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_sub_return_relaxed(long i, atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_sub_return_relaxed(i, v);
#else
        updated = raw_atomic_sub_return_relaxed(i, v);
#endif
        return updated;
}

/**
 * raw_atomic_long_fetch_sub() - subtract and return the prior value (full ordering)
 * @i: long amount to subtract
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - @i) as one atomic operation with full ordering, by
 * calling raw_atomic64_fetch_sub() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_sub() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_sub() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_sub(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_sub(i, v);
#else
        old = raw_atomic_fetch_sub(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_sub_acquire() - subtract and return the prior value (acquire ordering)
 * @i: long amount to subtract
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - @i) as one atomic operation with acquire ordering, by
 * calling raw_atomic64_fetch_sub_acquire() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_sub_acquire() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_sub_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_sub_acquire(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_sub_acquire(i, v);
#else
        old = raw_atomic_fetch_sub_acquire(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_sub_release() - subtract and return the prior value (release ordering)
 * @i: long amount to subtract
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - @i) as one atomic operation with release ordering, by
 * calling raw_atomic64_fetch_sub_release() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_sub_release() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_sub_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_sub_release(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_sub_release(i, v);
#else
        old = raw_atomic_fetch_sub_release(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_sub_relaxed() - subtract and return the prior value (relaxed ordering)
 * @i: long amount to subtract
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - @i) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_fetch_sub_relaxed() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_sub_relaxed() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_sub_relaxed() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_sub_relaxed(i, v);
#else
        old = raw_atomic_fetch_sub_relaxed(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_inc() - increment an atomic_long_t (relaxed ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + 1) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_inc() when CONFIG_64BIT is defined and
 * raw_atomic_inc() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_inc() is preferred.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_inc(atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        raw_atomic64_inc(v);
#else
        raw_atomic_inc(v);
#endif
}

/**
 * raw_atomic_long_inc_return() - increment and return the result (full ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + 1) as one atomic operation with full ordering, by
 * calling raw_atomic64_inc_return() when CONFIG_64BIT is defined and
 * raw_atomic_inc_return() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_inc_return() is preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_inc_return(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_inc_return(v);
#else
        updated = raw_atomic_inc_return(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_inc_return_acquire() - increment and return the result (acquire ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + 1) as one atomic operation with acquire ordering, by
 * calling raw_atomic64_inc_return_acquire() when CONFIG_64BIT is defined and
 * raw_atomic_inc_return_acquire() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_inc_return_acquire() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_inc_return_acquire(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_inc_return_acquire(v);
#else
        updated = raw_atomic_inc_return_acquire(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_inc_return_release() - increment and return the result (release ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + 1) as one atomic operation with release ordering, by
 * calling raw_atomic64_inc_return_release() when CONFIG_64BIT is defined and
 * raw_atomic_inc_return_release() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_inc_return_release() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_inc_return_release(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_inc_return_release(v);
#else
        updated = raw_atomic_inc_return_release(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_inc_return_relaxed() - increment and return the result (relaxed ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + 1) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_inc_return_relaxed() when CONFIG_64BIT is defined and
 * raw_atomic_inc_return_relaxed() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_inc_return_relaxed() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_inc_return_relaxed(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_inc_return_relaxed(v);
#else
        updated = raw_atomic_inc_return_relaxed(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_fetch_inc() - increment and return the prior value (full ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + 1) as one atomic operation with full ordering, by
 * calling raw_atomic64_fetch_inc() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_inc() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_inc() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_inc(atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_inc(v);
#else
        old = raw_atomic_fetch_inc(v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_inc_acquire() - increment and return the prior value (acquire ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + 1) as one atomic operation with acquire ordering, by
 * calling raw_atomic64_fetch_inc_acquire() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_inc_acquire() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_inc_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_inc_acquire(atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_inc_acquire(v);
#else
        old = raw_atomic_fetch_inc_acquire(v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_inc_release() - increment and return the prior value (release ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + 1) as one atomic operation with release ordering, by
 * calling raw_atomic64_fetch_inc_release() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_inc_release() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_inc_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_inc_release(atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_inc_release(v);
#else
        old = raw_atomic_fetch_inc_release(v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_inc_relaxed() - increment and return the prior value (relaxed ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v + 1) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_fetch_inc_relaxed() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_inc_relaxed() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_inc_relaxed() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_inc_relaxed(atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_inc_relaxed(v);
#else
        old = raw_atomic_fetch_inc_relaxed(v);
#endif
        return old;
}

/**
 * raw_atomic_long_dec() - decrement an atomic_long_t (relaxed ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - 1) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_dec() when CONFIG_64BIT is defined and
 * raw_atomic_dec() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_dec() is preferred.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_dec(atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        raw_atomic64_dec(v);
#else
        raw_atomic_dec(v);
#endif
}

/**
 * raw_atomic_long_dec_return() - decrement and return the result (full ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - 1) as one atomic operation with full ordering, by
 * calling raw_atomic64_dec_return() when CONFIG_64BIT is defined and
 * raw_atomic_dec_return() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_dec_return() is preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_dec_return(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_dec_return(v);
#else
        updated = raw_atomic_dec_return(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_dec_return_acquire() - decrement and return the result (acquire ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - 1) as one atomic operation with acquire ordering, by
 * calling raw_atomic64_dec_return_acquire() when CONFIG_64BIT is defined and
 * raw_atomic_dec_return_acquire() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_dec_return_acquire() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_dec_return_acquire(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_dec_return_acquire(v);
#else
        updated = raw_atomic_dec_return_acquire(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_dec_return_release() - decrement and return the result (release ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - 1) as one atomic operation with release ordering, by
 * calling raw_atomic64_dec_return_release() when CONFIG_64BIT is defined and
 * raw_atomic_dec_return_release() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_dec_return_release() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_dec_return_release(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_dec_return_release(v);
#else
        updated = raw_atomic_dec_return_release(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_dec_return_relaxed() - decrement and return the result (relaxed ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - 1) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_dec_return_relaxed() when CONFIG_64BIT is defined and
 * raw_atomic_dec_return_relaxed() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_dec_return_relaxed() is
 * preferred.
 *
 * Return: The value of @v after the update.
 */
static __always_inline long
raw_atomic_long_dec_return_relaxed(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_dec_return_relaxed(v);
#else
        updated = raw_atomic_dec_return_relaxed(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_fetch_dec() - decrement and return the prior value (full ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - 1) as one atomic operation with full ordering, by
 * calling raw_atomic64_fetch_dec() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_dec() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_dec() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_dec(atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_dec(v);
#else
        old = raw_atomic_fetch_dec(v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_dec_acquire() - decrement and return the prior value (acquire ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - 1) as one atomic operation with acquire ordering, by
 * calling raw_atomic64_fetch_dec_acquire() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_dec_acquire() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_dec_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_dec_acquire(atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_dec_acquire(v);
#else
        old = raw_atomic_fetch_dec_acquire(v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_dec_release() - decrement and return the prior value (release ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - 1) as one atomic operation with release ordering, by
 * calling raw_atomic64_fetch_dec_release() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_dec_release() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_dec_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_dec_release(atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_dec_release(v);
#else
        old = raw_atomic_fetch_dec_release(v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_dec_relaxed() - decrement and return the prior value (relaxed ordering)
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v - 1) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_fetch_dec_relaxed() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_dec_relaxed() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_dec_relaxed() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_dec_relaxed(atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_dec_relaxed(v);
#else
        old = raw_atomic_fetch_dec_relaxed(v);
#endif
        return old;
}

/**
 * raw_atomic_long_and() - bitwise AND into an atomic_long_t (relaxed ordering)
 * @i: long operand
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v & @i) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_and() when CONFIG_64BIT is defined and
 * raw_atomic_and() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_and() is preferred.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_and(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        raw_atomic64_and(i, v);
#else
        raw_atomic_and(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_and() - bitwise AND and return the prior value (full ordering)
 * @i: long operand
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v & @i) as one atomic operation with full ordering, by
 * calling raw_atomic64_fetch_and() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_and() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_and() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_and(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_and(i, v);
#else
        old = raw_atomic_fetch_and(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_and_acquire() - bitwise AND and return the prior value (acquire ordering)
 * @i: long operand
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v & @i) as one atomic operation with acquire ordering, by
 * calling raw_atomic64_fetch_and_acquire() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_and_acquire() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_and_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_and_acquire(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_and_acquire(i, v);
#else
        old = raw_atomic_fetch_and_acquire(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_and_release() - bitwise AND and return the prior value (release ordering)
 * @i: long operand
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v & @i) as one atomic operation with release ordering, by
 * calling raw_atomic64_fetch_and_release() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_and_release() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_and_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_and_release(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_and_release(i, v);
#else
        old = raw_atomic_fetch_and_release(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_and_relaxed() - bitwise AND and return the prior value (relaxed ordering)
 * @i: long operand
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v & @i) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_fetch_and_relaxed() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_and_relaxed() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_and_relaxed() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_and_relaxed(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_and_relaxed(i, v);
#else
        old = raw_atomic_fetch_and_relaxed(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_andnot() - bitwise AND NOT into an atomic_long_t (relaxed ordering)
 * @i: long operand
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v & ~@i) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_andnot() when CONFIG_64BIT is defined and
 * raw_atomic_andnot() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_andnot() is preferred.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_andnot(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        raw_atomic64_andnot(i, v);
#else
        raw_atomic_andnot(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_andnot() - bitwise AND NOT and return the prior value (full ordering)
 * @i: long operand
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v & ~@i) as one atomic operation with full ordering, by
 * calling raw_atomic64_fetch_andnot() when CONFIG_64BIT is defined and
 * raw_atomic_fetch_andnot() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_andnot() is preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_andnot(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_andnot(i, v);
#else
        old = raw_atomic_fetch_andnot(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_andnot_acquire() - bitwise AND NOT and return the prior value (acquire ordering)
 * @i: long operand
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v & ~@i) as one atomic operation with acquire ordering, by
 * calling raw_atomic64_fetch_andnot_acquire() when CONFIG_64BIT is defined
 * and raw_atomic_fetch_andnot_acquire() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_andnot_acquire() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_andnot_acquire(i, v);
#else
        old = raw_atomic_fetch_andnot_acquire(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_andnot_release() - bitwise AND NOT and return the prior value (release ordering)
 * @i: long operand
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v & ~@i) as one atomic operation with release ordering, by
 * calling raw_atomic64_fetch_andnot_release() when CONFIG_64BIT is defined
 * and raw_atomic_fetch_andnot_release() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_andnot_release() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_andnot_release(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_andnot_release(i, v);
#else
        old = raw_atomic_fetch_andnot_release(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_fetch_andnot_relaxed() - bitwise AND NOT and return the prior value (relaxed ordering)
 * @i: long operand
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v & ~@i) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_fetch_andnot_relaxed() when CONFIG_64BIT is defined
 * and raw_atomic_fetch_andnot_relaxed() otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_fetch_andnot_relaxed() is
 * preferred.
 *
 * Return: The value of @v before the update.
 */
static __always_inline long
raw_atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v)
{
        long old;

#if defined(CONFIG_64BIT)
        old = raw_atomic64_fetch_andnot_relaxed(i, v);
#else
        old = raw_atomic_fetch_andnot_relaxed(i, v);
#endif
        return old;
}

/**
 * raw_atomic_long_or() - bitwise OR into an atomic_long_t (relaxed ordering)
 * @i: long operand
 * @v: the atomic_long_t to update
 *
 * Performs @v = (@v | @i) as one atomic operation with relaxed ordering, by
 * calling raw_atomic64_or() when CONFIG_64BIT is defined and raw_atomic_or()
 * otherwise.
 *
 * Usable from noinstr code; elsewhere atomic_long_or() is preferred.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_or(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        raw_atomic64_or(i, v);
#else
        raw_atomic_or(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_or() - atomic bitwise OR with full ordering
 * @i: long value to OR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v | @i) with full ordering.  Forwards to
 * raw_atomic64_fetch_or() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_or() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_or() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_or(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_fetch_or(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_fetch_or(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering
 * @i: long value to OR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v | @i) with acquire ordering.  Forwards to
 * raw_atomic64_fetch_or_acquire() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_or_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_or_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_or_acquire(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_fetch_or_acquire(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_fetch_or_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_or_release() - atomic bitwise OR with release ordering
 * @i: long value to OR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v | @i) with release ordering.  Forwards to
 * raw_atomic64_fetch_or_release() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_or_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_or_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_or_release(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_fetch_or_release(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_fetch_or_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering
 * @i: long value to OR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v | @i) with relaxed ordering.  Forwards to
 * raw_atomic64_fetch_or_relaxed() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_or_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_or_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_or_relaxed(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_fetch_or_relaxed(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_fetch_or_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xor() - atomic bitwise XOR with relaxed ordering
 * @i: long value to XOR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v ^ @i) with relaxed ordering.  Forwards to
 * raw_atomic64_xor() when CONFIG_64BIT is defined and to raw_atomic_xor()
 * otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_xor() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_xor(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        raw_atomic64_xor(i, v);
#else /* !CONFIG_64BIT */
        raw_atomic_xor(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor() - atomic bitwise XOR with full ordering
 * @i: long value to XOR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v ^ @i) with full ordering.  Forwards to
 * raw_atomic64_fetch_xor() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_xor() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_xor() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_xor(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_fetch_xor(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_fetch_xor(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering
 * @i: long value to XOR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v ^ @i) with acquire ordering.  Forwards to
 * raw_atomic64_fetch_xor_acquire() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_xor_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_xor_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_xor_acquire(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_fetch_xor_acquire(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_fetch_xor_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering
 * @i: long value to XOR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v ^ @i) with release ordering.  Forwards to
 * raw_atomic64_fetch_xor_release() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_xor_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_xor_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_xor_release(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_fetch_xor_release(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_fetch_xor_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering
 * @i: long value to XOR into @v
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v ^ @i) with relaxed ordering.  Forwards to
 * raw_atomic64_fetch_xor_relaxed() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_xor_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_xor_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_fetch_xor_relaxed(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_fetch_xor_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic_long_t
 * @new: long value to store in @v
 *
 * Atomically replaces the value of @v with @new, with full ordering.
 * Forwards to raw_atomic64_xchg() when CONFIG_64BIT is defined and to
 * raw_atomic_xchg() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_xchg() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_xchg(atomic_long_t *v, long new)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_xchg(v, new);
#else /* !CONFIG_64BIT */
        return raw_atomic_xchg(v, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @new: long value to store in @v
 *
 * Atomically replaces the value of @v with @new, with acquire ordering.
 * Forwards to raw_atomic64_xchg_acquire() when CONFIG_64BIT is defined and to
 * raw_atomic_xchg_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_xchg_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_xchg_acquire(atomic_long_t *v, long new)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_xchg_acquire(v, new);
#else /* !CONFIG_64BIT */
        return raw_atomic_xchg_acquire(v, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic_long_t
 * @new: long value to store in @v
 *
 * Atomically replaces the value of @v with @new, with release ordering.
 * Forwards to raw_atomic64_xchg_release() when CONFIG_64BIT is defined and to
 * raw_atomic_xchg_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_xchg_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_xchg_release(atomic_long_t *v, long new)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_xchg_release(v, new);
#else /* !CONFIG_64BIT */
        return raw_atomic_xchg_release(v, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @new: long value to store in @v
 *
 * Atomically replaces the value of @v with @new, with relaxed ordering.
 * Forwards to raw_atomic64_xchg_relaxed() when CONFIG_64BIT is defined and to
 * raw_atomic_xchg_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_xchg_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_xchg_relaxed(atomic_long_t *v, long new)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_xchg_relaxed(v, new);
#else /* !CONFIG_64BIT */
        return raw_atomic_xchg_relaxed(v, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_long_t
 * @old: long value @v is expected to hold
 * @new: long value to store on a match
 *
 * If (@v == @old), atomically sets @v to @new with full ordering.
 * Otherwise @v is left unchanged and only relaxed ordering is provided.
 * Forwards to raw_atomic64_cmpxchg() when CONFIG_64BIT is defined and to
 * raw_atomic_cmpxchg() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_cmpxchg() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_cmpxchg(atomic_long_t *v, long old, long new)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_cmpxchg(v, old, new);
#else /* !CONFIG_64BIT */
        return raw_atomic_cmpxchg(v, old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @old: long value @v is expected to hold
 * @new: long value to store on a match
 *
 * If (@v == @old), atomically sets @v to @new with acquire ordering.
 * Otherwise @v is left unchanged and only relaxed ordering is provided.
 * Forwards to raw_atomic64_cmpxchg_acquire() when CONFIG_64BIT is defined and
 * to raw_atomic_cmpxchg_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_cmpxchg_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_cmpxchg_acquire(v, old, new);
#else /* !CONFIG_64BIT */
        return raw_atomic_cmpxchg_acquire(v, old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: long value @v is expected to hold
 * @new: long value to store on a match
 *
 * If (@v == @old), atomically sets @v to @new with release ordering.
 * Otherwise @v is left unchanged and only relaxed ordering is provided.
 * Forwards to raw_atomic64_cmpxchg_release() when CONFIG_64BIT is defined and
 * to raw_atomic_cmpxchg_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_cmpxchg_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_cmpxchg_release(v, old, new);
#else /* !CONFIG_64BIT */
        return raw_atomic_cmpxchg_release(v, old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: long value @v is expected to hold
 * @new: long value to store on a match
 *
 * If (@v == @old), atomically sets @v to @new with relaxed ordering.
 * Otherwise @v is left unchanged and only relaxed ordering is provided.
 * Forwards to raw_atomic64_cmpxchg_relaxed() when CONFIG_64BIT is defined and
 * to raw_atomic_cmpxchg_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_cmpxchg_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_cmpxchg_relaxed(v, old, new);
#else /* !CONFIG_64BIT */
        return raw_atomic_cmpxchg_relaxed(v, old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to the long value @v is expected to hold
 * @new: long value to store on a match
 *
 * If (@v == @old), atomically sets @v to @new with full ordering.
 * Otherwise @v is left unchanged, @old is updated to the current value of @v,
 * and only relaxed ordering is provided.
 *
 * @old is cast to (s64 *) for raw_atomic64_try_cmpxchg() when CONFIG_64BIT is
 * defined and to (int *) for raw_atomic_try_cmpxchg() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_try_cmpxchg(v, (s64 *)old, new);
#else /* !CONFIG_64BIT */
        return raw_atomic_try_cmpxchg(v, (int *)old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to the long value @v is expected to hold
 * @new: long value to store on a match
 *
 * If (@v == @old), atomically sets @v to @new with acquire ordering.
 * Otherwise @v is left unchanged, @old is updated to the current value of @v,
 * and only relaxed ordering is provided.
 *
 * @old is cast to (s64 *) for raw_atomic64_try_cmpxchg_acquire() when
 * CONFIG_64BIT is defined and to (int *) for raw_atomic_try_cmpxchg_acquire()
 * otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_acquire() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_try_cmpxchg_acquire(v, (s64 *)old, new);
#else /* !CONFIG_64BIT */
        return raw_atomic_try_cmpxchg_acquire(v, (int *)old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to the long value @v is expected to hold
 * @new: long value to store on a match
 *
 * If (@v == @old), atomically sets @v to @new with release ordering.
 * Otherwise @v is left unchanged, @old is updated to the current value of @v,
 * and only relaxed ordering is provided.
 *
 * @old is cast to (s64 *) for raw_atomic64_try_cmpxchg_release() when
 * CONFIG_64BIT is defined and to (int *) for raw_atomic_try_cmpxchg_release()
 * otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_release() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_try_cmpxchg_release(v, (s64 *)old, new);
#else /* !CONFIG_64BIT */
        return raw_atomic_try_cmpxchg_release(v, (int *)old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to the long value @v is expected to hold
 * @new: long value to store on a match
 *
 * If (@v == @old), atomically sets @v to @new with relaxed ordering.
 * Otherwise @v is left unchanged, @old is updated to the current value of @v,
 * and only relaxed ordering is provided.
 *
 * @old is cast to (s64 *) for raw_atomic64_try_cmpxchg_relaxed() when
 * CONFIG_64BIT is defined and to (int *) for raw_atomic_try_cmpxchg_relaxed()
 * otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_relaxed() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_try_cmpxchg_relaxed(v, (s64 *)old, new);
#else /* !CONFIG_64BIT */
        return raw_atomic_try_cmpxchg_relaxed(v, (int *)old, new);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v - @i) with full ordering.  Forwards to
 * raw_atomic64_sub_and_test() when CONFIG_64BIT is defined and to
 * raw_atomic_sub_and_test() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_sub_and_test(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_sub_and_test(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_sub_and_test(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v - 1) with full ordering.  Forwards to
 * raw_atomic64_dec_and_test() when CONFIG_64BIT is defined and to
 * raw_atomic_dec_and_test() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_dec_and_test(atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_dec_and_test(v);
#else /* !CONFIG_64BIT */
        return raw_atomic_dec_and_test(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v + 1) with full ordering.  Forwards to
 * raw_atomic64_inc_and_test() when CONFIG_64BIT is defined and to
 * raw_atomic_inc_and_test() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_inc_and_test(atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_inc_and_test(v);
#else /* !CONFIG_64BIT */
        return raw_atomic_inc_and_test(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_negative() - atomic add and test if negative with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v + @i) with full ordering.  Forwards to
 * raw_atomic64_add_negative() when CONFIG_64BIT is defined and to
 * raw_atomic_add_negative() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_negative() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_add_negative(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_add_negative(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v + @i) with acquire ordering.  Forwards to
 * raw_atomic64_add_negative_acquire() when CONFIG_64BIT is defined and to
 * raw_atomic_add_negative_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_negative_acquire() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative_acquire(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_add_negative_acquire(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_add_negative_acquire(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_negative_release() - atomic add and test if negative with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v + @i) with release ordering.  Forwards to
 * raw_atomic64_add_negative_release() when CONFIG_64BIT is defined and to
 * raw_atomic_add_negative_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_negative_release() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative_release(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_add_negative_release(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_add_negative_release(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically sets @v to (@v + @i) with relaxed ordering.  Forwards to
 * raw_atomic64_add_negative_relaxed() when CONFIG_64BIT is defined and to
 * raw_atomic_add_negative_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_negative_relaxed() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_add_negative_relaxed(i, v);
#else /* !CONFIG_64BIT */
        return raw_atomic_add_negative_relaxed(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_long_t
 * @a: long value to add
 * @u: long value that blocks the addition when @v equals it
 *
 * If (@v != @u), atomically sets @v to (@v + @a) with full ordering.
 * Otherwise @v is left unchanged and only relaxed ordering is provided.
 * Forwards to raw_atomic64_fetch_add_unless() when CONFIG_64BIT is defined and
 * to raw_atomic_fetch_add_unless() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_add_unless() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_fetch_add_unless(v, a, u);
#else /* !CONFIG_64BIT */
        return raw_atomic_fetch_add_unless(v, a, u);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_long_t
 * @a: long value to add
 * @u: long value that blocks the addition when @v equals it
 *
 * If (@v != @u), atomically sets @v to (@v + @a) with full ordering.
 * Otherwise @v is left unchanged and only relaxed ordering is provided.
 * Forwards to raw_atomic64_add_unless() when CONFIG_64BIT is defined and to
 * raw_atomic_add_unless() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_unless() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_unless(atomic_long_t *v, long a, long u)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_add_unless(v, a, u);
#else /* !CONFIG_64BIT */
        return raw_atomic_add_unless(v, a, u);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v != 0), atomically sets @v to (@v + 1) with full ordering.
 * Otherwise @v is left unchanged and only relaxed ordering is provided.
 * Forwards to raw_atomic64_inc_not_zero() when CONFIG_64BIT is defined and to
 * raw_atomic_inc_not_zero() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_not_zero() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_inc_not_zero(atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_inc_not_zero(v);
#else /* !CONFIG_64BIT */
        return raw_atomic_inc_not_zero(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v >= 0), atomically sets @v to (@v + 1) with full ordering.
 * Otherwise @v is left unchanged and only relaxed ordering is provided.
 * Forwards to raw_atomic64_inc_unless_negative() when CONFIG_64BIT is defined
 * and to raw_atomic_inc_unless_negative() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_unless_negative() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_inc_unless_negative(atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_inc_unless_negative(v);
#else /* !CONFIG_64BIT */
        return raw_atomic_inc_unless_negative(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v <= 0), atomically sets @v to (@v - 1) with full ordering.
 * Otherwise @v is left unchanged and only relaxed ordering is provided.
 * Forwards to raw_atomic64_dec_unless_positive() when CONFIG_64BIT is defined
 * and to raw_atomic_dec_unless_positive() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_unless_positive() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_dec_unless_positive(atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_dec_unless_positive(v);
#else /* !CONFIG_64BIT */
        return raw_atomic_dec_unless_positive(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v > 0), atomically sets @v to (@v - 1) with full ordering.
 * Otherwise @v is left unchanged and only relaxed ordering is provided.
 * Forwards to raw_atomic64_dec_if_positive() when CONFIG_64BIT is defined and
 * to raw_atomic_dec_if_positive() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_if_positive() elsewhere.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline long
raw_atomic_long_dec_if_positive(atomic_long_t *v)
{
#if defined(CONFIG_64BIT)
        return raw_atomic64_dec_if_positive(v);
#else /* !CONFIG_64BIT */
        return raw_atomic_dec_if_positive(v);
#endif /* CONFIG_64BIT */
}

#endif /* _LINUX_ATOMIC_LONG_H */
// eadf183c3600b8b92b91839dd3be6bcc560c752d














































































































































































































































































































































    1 


    1 






































































































    7 






    8 




















































































    8 





    7 

    4 
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
// SPDX-License-Identifier: GPL-2.0
/*
 * This contains functions for filename crypto management
 *
 * Copyright (C) 2015, Google, Inc.
 * Copyright (C) 2015, Motorola Mobility
 *
 * Written by Uday Savagaonkar, 2014.
 * Modified by Jaegeuk Kim, 2015.
 *
 * This has not yet undergone a rigorous security audit.
 */

#include <linux/namei.h>
#include <linux/scatterlist.h>
#include <crypto/hash.h>
#include <crypto/sha2.h>
#include <crypto/skcipher.h>
#include "fscrypt_private.h"

/*
 * The minimum message length (input and output length), in bytes, for all
 * filename encryption modes.  Filenames shorter than this will be zero-padded
 * before being encrypted.
 */
#define FSCRYPT_FNAME_MIN_MSG_LEN 16

/*
 * struct fscrypt_nokey_name - identifier for directory entry when key is absent
 *
 * When userspace lists an encrypted directory without access to the key, the
 * filesystem must present a unique "no-key name" for each filename that allows
 * it to find the directory entry again if requested.  Naively, that would just
 * mean using the ciphertext filenames.  However, since the ciphertext filenames
 * can contain illegal characters ('\0' and '/'), they must be encoded in some
 * way.  We use base64url.  But that can cause names to exceed NAME_MAX (255
 * bytes), so we also need to use a strong hash to abbreviate long names.
 *
 * The filesystem may also need another kind of hash, the "dirhash", to quickly
 * find the directory entry.  Since filesystems normally compute the dirhash
 * over the on-disk filename (i.e. the ciphertext), it's not computable from
 * no-key names that abbreviate the ciphertext using the strong hash to fit in
 * NAME_MAX.  It's also not computable if it's a keyed hash taken over the
 * plaintext (but it may still be available in the on-disk directory entry);
 * casefolded directories use this type of dirhash.  At least in these cases,
 * each no-key name must include the name's dirhash too.
 *
 * To meet all these requirements, we base64url-encode the following
 * variable-length structure.  It contains the dirhash, or 0's if the filesystem
 * didn't provide one; up to 149 bytes of the ciphertext name; and for
 * ciphertexts longer than 149 bytes, also the SHA-256 of the remaining bytes.
 *
 * This ensures that each no-key name contains everything needed to find the
 * directory entry again, contains only legal characters, doesn't exceed
 * NAME_MAX, is unambiguous unless there's a SHA-256 collision, and that we only
 * take the performance hit of SHA-256 on very long filenames (which are rare).
 */
struct fscrypt_nokey_name {
        /* The dirhash, or 0's if the filesystem didn't provide one */
        u32 dirhash[2];
        /* Up to 149 bytes of the ciphertext name */
        u8 bytes[149];
        /* For ciphertexts longer than 149 bytes: SHA-256 of the remaining bytes */
        u8 sha256[SHA256_DIGEST_SIZE];
}; /* 189 bytes => 252 bytes base64url-encoded, which is <= NAME_MAX (255) */

/*
 * Decoded size of max-size no-key name, i.e. a name that was abbreviated using
 * the strong hash and thus includes the 'sha256' field.  This isn't simply
 * sizeof(struct fscrypt_nokey_name), as the padding at the end isn't included;
 * offsetofend() stops right after the last byte of 'sha256'.
 */
#define FSCRYPT_NOKEY_NAME_MAX        offsetofend(struct fscrypt_nokey_name, sha256)

/*
 * Encoded size of max-size no-key name.  FSCRYPT_BASE64URL_CHARS() is defined
 * further down in this file; that is fine because a macro body is only
 * expanded where the macro is used.
 */
#define FSCRYPT_NOKEY_NAME_MAX_ENCODED \
                FSCRYPT_BASE64URL_CHARS(FSCRYPT_NOKEY_NAME_MAX)

/*
 * Thin wrapper for qstr arguments: passes @str's name pointer and length to
 * is_dot_dotdot() and returns its result unchanged.
 */
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
{
        return is_dot_dotdot(str->name, str->len);
}

/**
 * fscrypt_fname_encrypt() - encrypt a filename
 * @inode: inode of the parent directory (for regular filenames)
 *           or of the symlink (for symlink targets). Key must already be
 *           set up.
 * @iname: the plaintext filename to encrypt
 * @out: (output) buffer that receives the encrypted filename
 * @olen: size of the encrypted filename in bytes.  It must be at least
 *          @iname->len; the extra bytes are NUL padding that is encrypted
 *          along with the name.
 *
 * The name is copied into @out, NUL-padded up to @olen, and then encrypted
 * in place using the skcipher transform found in @inode's crypt info.
 *
 * Return: 0 on success, -errno on failure
 */
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
                          u8 *out, unsigned int olen)
{
        const struct fscrypt_inode_info *ci = inode->i_crypt_info;
        struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
        DECLARE_CRYPTO_WAIT(wait);
        struct skcipher_request *req;
        struct scatterlist sg;
        union fscrypt_iv iv;
        int err;

        /* The output buffer has to be able to hold the whole plaintext. */
        if (WARN_ON_ONCE(olen < iname->len))
                return -ENOBUFS;

        /* Stage the plaintext in @out and zero-fill the remainder. */
        memcpy(out, iname->name, iname->len);
        memset(out + iname->len, 0, olen - iname->len);

        fscrypt_generate_iv(&iv, 0, ci);

        req = skcipher_request_alloc(tfm, GFP_NOFS);
        if (!req)
                return -ENOMEM;
        skcipher_request_set_callback(req,
                        CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
                        crypto_req_done, &wait);

        /* Source and destination are the same scatterlist: in-place. */
        sg_init_one(&sg, out, olen);
        skcipher_request_set_crypt(req, &sg, &sg, olen, &iv);

        err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
        skcipher_request_free(req);
        if (err >= 0)
                return 0;

        fscrypt_err(inode, "Filename encryption failed: %d", err);
        return err;
}
EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt);

/**
 * fname_decrypt() - decrypt a filename
 * @inode: inode of the parent directory (for regular filenames)
 *           or of the symlink (for symlink targets)
 * @iname: the encrypted filename to decrypt
 * @oname: (output) the decrypted filename.  The caller must have allocated
 *           enough space for this, e.g. using fscrypt_fname_alloc_buffer().
 *
 * Return: 0 on success, -errno on failure
 */
static int fname_decrypt(const struct inode *inode,
                         const struct fscrypt_str *iname,
                         struct fscrypt_str *oname)
{
        struct skcipher_request *req = NULL;
        DECLARE_CRYPTO_WAIT(wait);
        struct scatterlist src_sg, dst_sg;
        const struct fscrypt_inode_info *ci = inode->i_crypt_info;
        struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
        union fscrypt_iv iv;
        int res;

        /* Allocate request */
        req = skcipher_request_alloc(tfm, GFP_NOFS);
        if (!req)
                return -ENOMEM;
        skcipher_request_set_callback(req,
                CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
                crypto_req_done, &wait);

        /* Initialize IV */
        fscrypt_generate_iv(&iv, 0, ci);

        /* Create decryption request */
        sg_init_one(&src_sg, iname->name, iname->len);
        sg_init_one(&dst_sg, oname->name, oname->len);
        skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv);
        res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
        skcipher_request_free(req);
        if (res < 0) {
                fscrypt_err(inode, "Filename decryption failed: %d", res);
                return res;
        }

        oname->len = strnlen(oname->name, iname->len);
        return 0;
}

/*
 * The 64-character base64url alphabet, indexed by 6-bit value.  The array has
 * 65 elements so that the string literal's terminating NUL fits, which lets
 * fscrypt_base64url_decode() search the table with strchr().
 */
static const char base64url_table[65] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";

/* Number of base64url characters needed to encode @nbytes bytes (no padding) */
#define FSCRYPT_BASE64URL_CHARS(nbytes)        DIV_ROUND_UP((nbytes) * 4, 3)

/**
 * fscrypt_base64url_encode() - base64url-encode some binary data
 * @src: the binary data to encode
 * @srclen: the length of @src in bytes
 * @dst: (output) the base64url-encoded string.  Not NUL-terminated.
 *
 * Produces the "Base 64 Encoding with URL and Filename Safe Alphabet" of
 * RFC 4648 without '='-padding (the RFC doesn't require it and it isn't
 * needed here).  base64url rather than plain base64 is used because the
 * latter's '/' character isn't allowed in filenames.
 *
 * Return: the length of the resulting base64url-encoded string in bytes.
 *           This will be equal to FSCRYPT_BASE64URL_CHARS(srclen).
 */
static int fscrypt_base64url_encode(const u8 *src, int srclen, char *dst)
{
        char *out = dst;
        u32 acc = 0;
        int nbits = 0;
        int pos;

        for (pos = 0; pos < srclen; pos++) {
                /* Shift the next input byte into the accumulator... */
                acc = (acc << 8) | src[pos];
                nbits += 8;
                /* ...and emit every complete 6-bit group it now holds. */
                while (nbits >= 6) {
                        nbits -= 6;
                        *out++ = base64url_table[(acc >> nbits) & 0x3f];
                }
        }

        /* Left-align any leftover bits into one final character. */
        if (nbits != 0)
                *out++ = base64url_table[(acc << (6 - nbits)) & 0x3f];

        return out - dst;
}

/**
 * fscrypt_base64url_decode() - decode base64url text into binary data
 * @src: the characters to decode.  Need not be NUL-terminated.
 * @srclen: number of characters in @src
 * @dst: (output) receives the decoded bytes
 *
 * Accepts the "Base 64 Encoding with URL and Filename Safe Alphabet" of
 * RFC 4648.  '='-padding is rejected, as is any character outside the
 * alphabet (whitespace included).
 *
 * No attempt has been made to make this fast.
 *
 * Return: the number of bytes written to @dst, or -1 if @src is not a valid
 *         base64url string.
 */
static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst)
{
        u8 *out = dst;
        u32 accum = 0;
        int nbits = 0;
        int pos;

        for (pos = 0; pos < srclen; pos++) {
                const char *hit;

                /*
                 * strchr() also matches the table's terminating NUL, so an
                 * embedded NUL has to be rejected explicitly.
                 */
                if (src[pos] == 0)
                        return -1;
                hit = strchr(base64url_table, src[pos]);
                if (hit == NULL)
                        return -1;

                accum = (accum << 6) | (hit - base64url_table);
                nbits += 6;
                if (nbits < 8)
                        continue;
                /* A whole byte is available; emit it. */
                nbits -= 8;
                *out++ = (u8)(accum >> nbits);
        }
        /* Any bits that did not make up a whole byte must be zero. */
        if (accum & ((1 << nbits) - 1))
                return -1;
        return out - dst;
}

/*
 * Compute the encrypted length of a filename under the given policy.
 *
 * The original length is raised to at least FSCRYPT_FNAME_MIN_MSG_LEN, rounded
 * up to the padding size selected by the policy flags (4 << pad bits), and
 * finally capped at @max_len.
 *
 * Return: false if @orig_len exceeds @max_len (nothing is stored); otherwise
 *         true, with the result stored in *@encrypted_len_ret.
 */
bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
                                    u32 orig_len, u32 max_len,
                                    u32 *encrypted_len_ret)
{
        const int pad_bits = fscrypt_policy_flags(policy) &
                             FSCRYPT_POLICY_FLAGS_PAD_MASK;
        const int padding = 4 << pad_bits;
        u32 len;

        if (orig_len > max_len)
                return false;

        len = max_t(u32, orig_len, FSCRYPT_FNAME_MIN_MSG_LEN);
        len = round_up(len, padding);
        *encrypted_len_ret = min(len, max_len);
        return true;
}

/**
 * fscrypt_fname_encrypted_size() - calculate length of encrypted filename
 * @inode: parent inode of the dentry name being encrypted.  Its key must
 *         already be set up.
 * @orig_len: length of the original filename
 * @max_len: maximum length to return
 * @encrypted_len_ret: where the calculated length is stored (on success)
 *
 * Encryption pads filenames, so a name shorter than the maximum length may
 * come out slightly longer than it went in.
 *
 * Return: false if @orig_len is greater than @max_len.  Otherwise true, with
 *         *@encrypted_len_ret set to the length (at most @max_len).
 */
bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
                                  u32 max_len, u32 *encrypted_len_ret)
{
        const union fscrypt_policy *policy = &inode->i_crypt_info->ci_policy;

        return __fscrypt_fname_encrypted_size(policy, orig_len, max_len,
                                              encrypted_len_ret);
}
EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size);

/**
 * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames
 * @max_encrypted_len: maximum length of the encrypted filenames that will be
 *                     presented through this buffer
 * @crypto_str: (output) the buffer descriptor to fill in
 *
 * The buffer is sized to hold any decrypted or encoded filename for the given
 * maximum encrypted length, plus one byte for a NUL terminator.
 * @crypto_str->len is set to the size excluding that extra byte.
 *
 * Return: 0 on success, -ENOMEM if the allocation fails
 */
int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
                               struct fscrypt_str *crypto_str)
{
        /* A no-key name may be longer than the ciphertext it encodes. */
        const u32 buf_len = max_t(u32, max_encrypted_len,
                                  FSCRYPT_NOKEY_NAME_MAX_ENCODED);

        crypto_str->name = kmalloc(buf_len + 1, GFP_NOFS);
        if (crypto_str->name == NULL)
                return -ENOMEM;

        crypto_str->len = buf_len;
        return 0;
}
EXPORT_SYMBOL(fscrypt_fname_alloc_buffer);

/**
 * fscrypt_fname_free_buffer() - free a buffer for presented filenames
 * @crypto_str: the buffer to free; may be NULL, in which case nothing is done
 *
 * Releases a buffer obtained from fscrypt_fname_alloc_buffer() and clears the
 * descriptor's name pointer.
 */
void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
{
        if (crypto_str) {
                kfree(crypto_str->name);
                crypto_str->name = NULL;
        }
}
EXPORT_SYMBOL(fscrypt_fname_free_buffer);

/**
 * fscrypt_fname_disk_to_usr() - convert an encrypted filename to
 *                               user-presentable form
 * @inode: inode of the parent directory (for regular filenames)
 *         or of the symlink (for symlink targets)
 * @hash: first part of the name's dirhash, if applicable.  Only needed when
 *        the filename lives in an indexed directory whose encryption key may
 *        be unavailable.  Not needed for symlink targets.
 * @minor_hash: second part of the name's dirhash, if applicable
 * @iname: the on-disk filename to convert.  May also be "." or "..", which
 *         aren't actually encrypted.
 * @oname: output buffer for the user-presentable filename.  The caller must
 *         have made it large enough, e.g. with fscrypt_fname_alloc_buffer().
 *
 * With the key available the disk name is decrypted.  Without it, the name is
 * encoded for presentation in fscrypt_nokey_name format; see
 * struct fscrypt_nokey_name for details.
 *
 * Return: 0 on success, -errno on failure (-EUCLEAN if @iname is shorter than
 *         FSCRYPT_FNAME_MIN_MSG_LEN)
 */
int fscrypt_fname_disk_to_usr(const struct inode *inode,
                              u32 hash, u32 minor_hash,
                              const struct fscrypt_str *iname,
                              struct fscrypt_str *oname)
{
        const struct qstr qname = FSTR_TO_QSTR(iname);
        struct fscrypt_nokey_name nokey_name;
        const size_t max_bytes = sizeof(nokey_name.bytes);
        u32 size; /* how many bytes of nokey_name get encoded */

        /* "." and ".." are reproduced as-is. */
        if (fscrypt_is_dot_dotdot(&qname)) {
                oname->name[0] = '.';
                oname->name[iname->len - 1] = '.';
                oname->len = iname->len;
                return 0;
        }

        if (iname->len < FSCRYPT_FNAME_MIN_MSG_LEN)
                return -EUCLEAN;

        if (fscrypt_has_encryption_key(inode))
                return fname_decrypt(inode, iname, oname);

        /*
         * Compile-time checks: struct fscrypt_nokey_name must have no padding
         * between its fields, and its encoded form must fit in NAME_MAX.
         */
        BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, dirhash) !=
                     offsetof(struct fscrypt_nokey_name, bytes));
        BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, bytes) !=
                     offsetof(struct fscrypt_nokey_name, sha256));
        BUILD_BUG_ON(FSCRYPT_NOKEY_NAME_MAX_ENCODED > NAME_MAX);

        nokey_name.dirhash[0] = hash;
        nokey_name.dirhash[1] = minor_hash;

        if (iname->len > max_bytes) {
                /*
                 * Too long to carry whole: keep the leading bytes and stand
                 * in a strong hash for the remainder.
                 */
                memcpy(nokey_name.bytes, iname->name, max_bytes);
                sha256(iname->name + max_bytes, iname->len - max_bytes,
                       nokey_name.sha256);
                size = FSCRYPT_NOKEY_NAME_MAX;
        } else {
                /* The whole ciphertext fits; encode only the used part. */
                memcpy(nokey_name.bytes, iname->name, iname->len);
                size = offsetof(struct fscrypt_nokey_name, bytes[iname->len]);
        }

        oname->len = fscrypt_base64url_encode((const u8 *)&nokey_name, size,
                                              oname->name);
        return 0;
}
EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);

/**
 * fscrypt_setup_filename() - prepare to search a possibly encrypted directory
 * @dir: the directory that will be searched
 * @iname: the user-provided filename being searched for
 * @lookup: 1 if we're allowed to proceed without the key because it's
 *        ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot
 *        proceed without the key because we're going to create the dir_entry.
 * @fname: the filename information to be filled in
 *
 * Given a user-provided filename @iname, this function sets @fname->disk_name
 * to the name that would be stored in the on-disk directory entry, if possible.
 * If the directory is unencrypted this is simply @iname.  Else, if we have the
 * directory's encryption key, then @iname is the plaintext, so we encrypt it to
 * get the disk_name.
 *
 * Else, for keyless @lookup operations, @iname should be a no-key name, so we
 * decode it to get the struct fscrypt_nokey_name.  Non-@lookup operations will
 * be impossible in this case, so we fail them with ENOKEY.
 *
 * If successful, fscrypt_free_filename() must be called later to clean up.
 *
 * Return: 0 on success, -errno on failure
 */
int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
                              int lookup, struct fscrypt_name *fname)
{
        struct fscrypt_nokey_name *nokey_name;
        int ret;

        memset(fname, 0, sizeof(struct fscrypt_name));
        fname->usr_fname = iname;

        if (!IS_ENCRYPTED(dir) || fscrypt_is_dot_dotdot(iname)) {
                fname->disk_name.name = (unsigned char *)iname->name;
                fname->disk_name.len = iname->len;
                return 0;
        }
        ret = fscrypt_get_encryption_info(dir, lookup);
        if (ret)
                return ret;

        if (fscrypt_has_encryption_key(dir)) {
                if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX,
                                                  &fname->crypto_buf.len))
                        return -ENAMETOOLONG;
                fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
                                                 GFP_NOFS);
                if (!fname->crypto_buf.name)
                        return -ENOMEM;

                ret = fscrypt_fname_encrypt(dir, iname, fname->crypto_buf.name,
                                            fname->crypto_buf.len);
                if (ret)
                        goto errout;
                fname->disk_name.name = fname->crypto_buf.name;
                fname->disk_name.len = fname->crypto_buf.len;
                return 0;
        }
        if (!lookup)
                return -ENOKEY;
        fname->is_nokey_name = true;

        /*
         * We don't have the key and we are doing a lookup; decode the
         * user-supplied name
         */

        if (iname->len > FSCRYPT_NOKEY_NAME_MAX_ENCODED)
                return -ENOENT;

        fname->crypto_buf.name = kmalloc(FSCRYPT_NOKEY_NAME_MAX, GFP_KERNEL);
        if (fname->crypto_buf.name == NULL)
                return -ENOMEM;

        ret = fscrypt_base64url_decode(iname->name, iname->len,
                                       fname->crypto_buf.name);
        if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) ||
            (ret > offsetof(struct fscrypt_nokey_name, sha256) &&
             ret != FSCRYPT_NOKEY_NAME_MAX)) {
                ret = -ENOENT;
                goto errout;
        }
        fname->crypto_buf.len = ret;

        nokey_name = (void *)fname->crypto_buf.name;
        fname->hash = nokey_name->dirhash[0];
        fname->minor_hash = nokey_name->dirhash[1];
        if (ret != FSCRYPT_NOKEY_NAME_MAX) {
                /* The full ciphertext filename is available. */
                fname->disk_name.name = nokey_name->bytes;
                fname->disk_name.len =
                        ret - offsetof(struct fscrypt_nokey_name, bytes);
        }
        return 0;

errout:
        kfree(fname->crypto_buf.name);
        return ret;
}
EXPORT_SYMBOL(fscrypt_setup_filename);

/**
 * fscrypt_match_name() - test whether the given name matches a directory entry
 * @fname: the name being searched for
 * @de_name: the name from the directory entry
 * @de_name_len: the length of @de_name in bytes
 *
 * In the usual case @fname->disk_name is set and is compared byte-for-byte
 * with the directory entry's name.  The exception is a very long name in an
 * encrypted directory whose key is unavailable: then the full disk_name isn't
 * known, and the entry is matched against the fscrypt_nokey_name instead,
 * i.e. its leading bytes plus a strong hash of the rest.
 *
 * Return: %true if the name matches, otherwise %false.
 */
bool fscrypt_match_name(const struct fscrypt_name *fname,
                        const u8 *de_name, u32 de_name_len)
{
        const struct fscrypt_nokey_name *nokey =
                (const void *)fname->crypto_buf.name;
        const size_t prefix_len = sizeof(nokey->bytes);
        u8 digest[SHA256_DIGEST_SIZE];

        if (likely(fname->disk_name.name))
                return de_name_len == fname->disk_name.len &&
                       memcmp(de_name, fname->disk_name.name,
                              de_name_len) == 0;

        /* Only entries longer than the stored prefix can match a hashed name. */
        if (de_name_len <= prefix_len ||
            memcmp(de_name, nokey->bytes, prefix_len) != 0)
                return false;

        sha256(de_name + prefix_len, de_name_len - prefix_len, digest);
        return memcmp(digest, nokey->sha256, sizeof(digest)) == 0;
}
EXPORT_SYMBOL_GPL(fscrypt_match_name);

/**
 * fscrypt_fname_siphash() - calculate the SipHash of a filename
 * @dir: the parent directory
 * @name: the filename to calculate the SipHash of
 *
 * @dir must use SipHash as its dirhash method and must already have its
 * fscrypt key set up.  The plaintext filename @name is hashed with the
 * directory's secret dirhash key.  A one-time warning is raised if that key
 * has not been initialized.
 *
 * Return: the SipHash of @name using the hash key of @dir
 */
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
{
        const struct fscrypt_inode_info *info = dir->i_crypt_info;

        WARN_ON_ONCE(!info->ci_dirhash_key_initialized);

        return siphash(name->name, name->len, &info->ci_dirhash_key);
}
EXPORT_SYMBOL_GPL(fscrypt_fname_siphash);

/*
 * Revalidate a dentry in an encrypted directory, so that a stale no-key
 * dentry isn't kept in the cache after the key has been added.
 *
 * Returns 1 for a plaintext-name dentry, -ECHILD when called with LOOKUP_RCU
 * on a no-key dentry, a negative errno if fetching the parent's encryption
 * info fails, and otherwise 1 if the parent directory still has no key or 0
 * if it now has one.
 */
int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
{
        struct dentry *parent;
        int rc;
        int still_keyless;

        /*
         * A plaintext name never becomes invalid: fscrypt can't go back to
         * no-key names without evicting the directory's inode, and that
         * evicts the directory's dentries too.
         */
        if (!(dentry->d_flags & DCACHE_NOKEY_NAME))
                return 1;

        /*
         * This is a no-key name, valid only while the directory's key stays
         * unavailable.
         *
         * fscrypt forbids rename() on no-key names, yet ->d_parent still
         * can't be used directly; dget_parent() is required.  A corrupted fs
         * image may contain directory hard links, and the VFS copes with
         * those by moving the directory's dentry tree in the dcache whenever
         * ->lookup() finds the directory while it already has a dentry
         * elsewhere.  So ->d_parent may be changing under us, and a reference
         * must be taken on some ->d_parent to keep it from being freed.
         */

        if (flags & LOOKUP_RCU)
                return -ECHILD;

        parent = dget_parent(dentry);
        /*
         * allow_unsupported=true is passed so that files with an unsupported
         * encryption policy can still be deleted.
         */
        rc = fscrypt_get_encryption_info(d_inode(parent), true);
        still_keyless = !fscrypt_has_encryption_key(d_inode(parent));
        dput(parent);

        return rc < 0 ? rc : still_keyless;
}
EXPORT_SYMBOL_GPL(fscrypt_d_revalidate);






















































    7 









    7 
















    4 
    2 


    2 




































































































    1 







    1 
















    4 






    3 










































    2 
















    3 




























































































































































































































































































    3 







    3 



    3 

    1 
    1 


































    4 










    4 















    4 




















































































































































   14 






   17 



































































































































































    2 












    2 































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include "internal.h"
#include "swap.h"

/**
 * kfree_const - conditionally free memory
 * @x: pointer to the memory
 *
 * Calls kfree() on @x unless @x points into the .rodata section, in which
 * case nothing is done.
 */
void kfree_const(const void *x)
{
        if (is_kernel_rodata((unsigned long)x))
                return;

        kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * kstrdup - allocate space for and copy an existing string
 * @s: the string to duplicate; may be %NULL
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s, or %NULL if @s is %NULL or the
 * allocation fails
 */
noinline
char *kstrdup(const char *s, gfp_t gfp)
{
        size_t size;
        char *copy;

        if (s == NULL)
                return NULL;

        /* The copy includes the terminating NUL. */
        size = strlen(s) + 1;
        copy = kmalloc_track_caller(size, gfp);
        if (copy == NULL)
                return NULL;

        memcpy(copy, s, size);
        return copy;
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - conditionally duplicate an existing const string
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: a string obtained from kstrdup_const() should be released with
 * kfree_const() and must not be passed to krealloc().
 *
 * Return: @s itself if it lives in the .rodata section, otherwise the result
 * of kstrdup().
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
        return is_kernel_rodata((unsigned long)s) ? s : kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);

/**
 * kstrndup - allocate space for and copy an existing string
 * @s: the string to duplicate; may be %NULL
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 *
 * Return: newly allocated, NUL-terminated copy of @s, or %NULL if @s is %NULL
 * or the allocation fails
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
        size_t nchars;
        char *copy;

        if (s == NULL)
                return NULL;

        nchars = strnlen(s, max);
        copy = kmalloc_track_caller(nchars + 1, gfp);
        if (copy == NULL)
                return NULL;

        memcpy(copy, s, nchars);
        copy[nchars] = '\0';
        return copy;
}
EXPORT_SYMBOL(kstrndup);

/**
 * kmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
{
        void *copy = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE,
                                                      _RET_IP_);

        if (copy == NULL)
                return NULL;

        memcpy(copy, src, len);
        return copy;
}
EXPORT_SYMBOL(kmemdup_noprof);

/**
 * kmemdup_array - duplicate a given array.
 *
 * @src: array to duplicate.
 * @count: number of elements to duplicate from array.
 * @element_size: size of each element of array.
 * @gfp: GFP mask to use.
 *
 * The byte count is computed with size_mul() and handed to kmemdup().
 *
 * Return: duplicated array of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp)
{
        const size_t total = size_mul(element_size, count);

        return kmemdup(src, total, gfp);
}
EXPORT_SYMBOL(kmemdup_array);

/**
 * kvmemdup - duplicate region of memory
 *
 * @src: memory region to duplicate
 * @len: memory region length
 * @gfp: GFP mask to use
 *
 * Return: newly allocated copy of @src or %NULL in case of error,
 * result may be not physically contiguous. Use kvfree() to free.
 */
void *kvmemdup(const void *src, size_t len, gfp_t gfp)
{
        void *copy = kvmalloc(len, gfp);

        if (copy == NULL)
                return NULL;

        memcpy(copy, src, len);
        return copy;
}
EXPORT_SYMBOL(kvmemdup);

/**
 * kmemdup_nul - Create a NUL-terminated string from unterminated data
 * @s: The data to stringify; may be %NULL
 * @len: The size of the data
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s with NUL-termination, or %NULL if @s is
 * %NULL or the allocation fails
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
        char *copy;

        if (s == NULL)
                return NULL;

        /* One extra byte for the terminator. */
        copy = kmalloc_track_caller(len + 1, gfp);
        if (copy == NULL)
                return NULL;

        memcpy(copy, s, len);
        copy[len] = '\0';
        return copy;
}
EXPORT_SYMBOL(kmemdup_nul);

/**
 * memdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure (-ENOMEM if the allocation fails, -EFAULT
 * if the copy from user space fails).  Result is physically contiguous, to be
 * freed by kfree().
 */
void *memdup_user(const void __user *src, size_t len)
{
        void *copy = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);

        if (copy == NULL)
                return ERR_PTR(-ENOMEM);

        if (copy_from_user(copy, src, len) == 0)
                return copy;

        /* The copy did not complete; don't hand back a partial buffer. */
        kfree(copy);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(memdup_user);

/**
 * vmemdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure (-ENOMEM if the allocation fails, -EFAULT
 * if the copy from user space fails).  Result may be not physically
 * contiguous.  Use kvfree() to free.
 */
void *vmemdup_user(const void __user *src, size_t len)
{
        void *copy = kvmalloc(len, GFP_USER);

        if (copy == NULL)
                return ERR_PTR(-ENOMEM);

        if (copy_from_user(copy, src, len) == 0)
                return copy;

        /* The copy did not complete; don't hand back a partial buffer. */
        kvfree(copy);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(vmemdup_user);

/**
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Return: newly allocated copy of @s or an ERR_PTR() in case of error:
 * -EFAULT if strnlen_user() reports 0, -EINVAL if the reported length exceeds
 * @n, or whatever error memdup_user() returns.
 */
char *strndup_user(const char __user *s, long n)
{
        long len = strnlen_user(s, n);
        char *copy;

        if (len == 0)
                return ERR_PTR(-EFAULT);
        if (len > n)
                return ERR_PTR(-EINVAL);

        copy = memdup_user(s, len);
        /* Force termination of the last copied byte on success. */
        if (!IS_ERR(copy))
                copy[len - 1] = '\0';

        return copy;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: an ERR_PTR() on failure (-ENOMEM if the allocation fails, -EFAULT
 * if the copy from user space fails).
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
        char *copy;

        /*
         * GFP_KERNEL is used unconditionally: copy_from_user() can sleep and
         * take a pagefault anyway, so GFP_NOFS or GFP_ATOMIC would gain
         * nothing.
         */
        copy = kmalloc_track_caller(len + 1, GFP_KERNEL);
        if (copy == NULL)
                return ERR_PTR(-ENOMEM);

        if (copy_from_user(copy, src, len) != 0) {
                kfree(copy);
                return ERR_PTR(-EFAULT);
        }

        copy[len] = '\0';
        return copy;
}
EXPORT_SYMBOL(memdup_user_nul);

/*
 * Check if the vma is being used as a stack by this task, i.e. whether
 * KSTK_ESP() of the current task lies within [vm_start, vm_end].
 */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
        struct task_struct * __maybe_unused t = current;
        unsigned long sp = KSTK_ESP(t);

        return vma->vm_start <= sp && sp <= vma->vm_end;
}

/*
 * Change backing file, only valid to use during initial VMA setup.
 *
 * get_file() is called on the new @file, it is installed in @vma->vm_file,
 * and fput() is called on the file that was installed before.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
        struct file *previous;

        /* Changing an anonymous vma with this is illegal */
        get_file(file);
        previous = vma->vm_file;
        vma->vm_file = file;
        fput(previous);
}
EXPORT_SYMBOL(vma_set_file);

#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
#endif

/*
 * Return the page-aligned @stack_top, displaced by a random number of pages
 * (masked with STACK_RND_MASK) when the current task has PF_RANDOMIZE set.
 * The displacement is added when CONFIG_STACK_GROWSUP is set and subtracted
 * otherwise.
 */
unsigned long randomize_stack_top(unsigned long stack_top)
{
        const unsigned long aligned_top = PAGE_ALIGN(stack_top);
        unsigned long offset = 0;

        if (current->flags & PF_RANDOMIZE)
                offset = (get_random_long() & STACK_RND_MASK) << PAGE_SHIFT;

#ifdef CONFIG_STACK_GROWSUP
        return aligned_top + offset;
#else
        return aligned_top - offset;
#endif
}

/**
 * randomize_page - Generate a random, page aligned address
 * @start:        The smallest acceptable address the caller will take.
 * @range:        The size of the area, starting at @start, within which the
 *                random address must fall.
 *
 * If @start + @range would overflow, @range is capped.
 *
 * NOTE: Historical use of randomize_range, which this replaces, presumed that
 * @start was already page aligned.  We now align it regardless.  The bytes
 * skipped by aligning @start upwards are taken out of @range; if they use up
 * the whole of @range, no randomization is applied.
 *
 * Return: A page aligned address within [start, start + range).  If less than
 * one whole page of range remains after alignment, the page-aligned @start is
 * returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
        if (!PAGE_ALIGNED(start)) {
                unsigned long skipped = PAGE_ALIGN(start) - start;

                /*
                 * Clamp rather than subtract blindly: with range < skipped
                 * the unsigned subtraction would wrap around to a huge value
                 * and the result could land far outside the requested area.
                 */
                range = range > skipped ? range - skipped : 0;
                start = PAGE_ALIGN(start);
        }

        if (start > ULONG_MAX - range)
                range = ULONG_MAX - start;

        /* From here on, range counts whole pages. */
        range >>= PAGE_SHIFT;

        if (range == 0)
                return start;

        return start + (get_random_long() % range << PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
/*
 * Default brk randomization: pick a random page aligned address starting at
 * mm->brk.  32-bit tasks (a 32-bit kernel, or a compat task) get a 32MB
 * window; everything else gets 1GB.
 */
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long window = SZ_1G;

        if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
                window = SZ_32M;

        return randomize_page(mm->brk, window);
}

/*
 * Return a random, page-granular offset for the mmap base.  The number of
 * random bits is mmap_rnd_bits, or mmap_rnd_compat_bits for compat tasks
 * when the architecture provides it.
 */
unsigned long arch_mmap_rnd(void)
{
        unsigned long mask;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
        if (is_compat_task())
                mask = (1UL << mmap_rnd_compat_bits) - 1;
        else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
                mask = (1UL << mmap_rnd_bits) - 1;

        return (get_random_long() & mask) << PAGE_SHIFT;
}

/*
 * Decide whether the legacy (bottom-up) mmap layout must be used.
 *
 * It is forced by the ADDR_COMPAT_LAYOUT personality and by an unlimited
 * stack rlimit.  The latter does not apply with CONFIG_STACK_GROWSUP: on
 * parisc the stack always grows up, so an unlimited stack is no indicator
 * for the legacy layout there.  Otherwise the sysctl decides.
 */
static int mmap_is_legacy(struct rlimit *rlim_stack)
{
        int forced;

        forced = (current->personality & ADDR_COMPAT_LAYOUT) ||
                 (!IS_ENABLED(CONFIG_STACK_GROWSUP) &&
                  rlim_stack->rlim_cur == RLIM_INFINITY);

        return forced ? 1 : sysctl_legacy_va_layout;
}

/*
 * Bounds for the gap kept between the mmap area and the stack, so that the
 * stack ulimit can be honoured even with randomisation in play.
 */
#define MIN_GAP                (SZ_128M)
#define MAX_GAP                (STACK_TOP / 6 * 5)

/*
 * Compute the top-down mmap base for a task, given the random offset @rnd
 * and the stack rlimit.
 */
static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
        /*
         * With an upwards growing stack, room for the maximum stack size is
         * reserved at the top of the task; the mmap base sits directly
         * below it and grows downwards.
         */
        return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
#else
        unsigned long reserve = rlim_stack->rlim_cur;
        unsigned long slack = stack_guard_gap;

        /* Leave room for the largest possible stack randomization. */
        if (current->flags & PF_RANDOMIZE)
                slack += (STACK_RND_MASK << PAGE_SHIFT);

        /* Only add the slack if the sum does not wrap (rlimit near infinity). */
        if (reserve + slack > reserve)
                reserve += slack;

        /* Keep the gap within [MIN_GAP, MAX_GAP]. */
        if (reserve < MIN_GAP)
                reserve = MIN_GAP;
        else if (reserve > MAX_GAP)
                reserve = MAX_GAP;

        return PAGE_ALIGN(STACK_TOP - reserve - rnd);
#endif
}

/*
 * Set mm->mmap_base and the MMF_TOPDOWN flag for a new address space.
 * Legacy layout: bottom-up from TASK_UNMAPPED_BASE.  Otherwise: top-down
 * from mmap_base().  Either base is displaced by arch_mmap_rnd() when the
 * current task has PF_RANDOMIZE set.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
        unsigned long rnd = 0UL;

        if (current->flags & PF_RANDOMIZE)
                rnd = arch_mmap_rnd();

        if (!mmap_is_legacy(rlim_stack)) {
                mm->mmap_base = mmap_base(rnd, rlim_stack);
                set_bit(MMF_TOPDOWN, &mm->flags);
                return;
        }

        mm->mmap_base = TASK_UNMAPPED_BASE + rnd;
        clear_bit(MMF_TOPDOWN, &mm->flags);
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/*
 * Fallback layout selection: always bottom-up from TASK_UNMAPPED_BASE, with
 * no randomization applied here.  @rlim_stack is not consulted.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
        mm->mmap_base = TASK_UNMAPPED_BASE;
        clear_bit(MMF_TOPDOWN, &mm->flags);
}
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against
 * @pages:       number of pages to account
 * @inc:         %true to add @pages to the count, %false to subtract them
 * @task:        task whose RLIMIT_MEMLOCK is checked
 * @bypass_rlim: %true to skip the RLIMIT_MEMLOCK check
 *
 * The caller must hold a reference on both @task and @mm and must hold
 * mmap_lock for writing.
 *
 * Return:
 * * 0       on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        struct task_struct *task, bool bypass_rlim)
{
        unsigned long before, limit;
        int ret = 0;

        mmap_assert_write_locked(mm);

        before = mm->locked_vm;
        if (!inc) {
                /* Dropping more than was accounted points at a caller bug. */
                WARN_ON_ONCE(pages > before);
                mm->locked_vm = before - pages;
        } else {
                if (!bypass_rlim) {
                        limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
                        if (before + pages > limit)
                                ret = -ENOMEM;
                }
                /* Only charge the pages when the limit check passed. */
                if (!ret)
                        mm->locked_vm = before + pages;
        }

        pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
                 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
                 before << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
                 ret ? " - exceeded" : "");

        return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against, may be NULL
 * @pages:       number of pages to account
 * @inc:         %true to add @pages to the count, %false to subtract them
 *
 * Locked wrapper around __account_locked_vm(): takes mmap_lock for writing,
 * accounts against the current task, and skips the RLIMIT_MEMLOCK check when
 * the caller has CAP_IPC_LOCK.  A non-NULL @mm must be valid (the caller
 * holds at least one reference on it).
 *
 * Return:
 * * 0       on success, or if there is nothing to do (@mm NULL or @pages 0)
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
        int err;

        if (!mm || !pages)
                return 0;

        mmap_write_lock(mm);
        err = __account_locked_vm(mm, pages, inc, current,
                                  capable(CAP_IPC_LOCK));
        mmap_write_unlock(mm);

        return err;
}
EXPORT_SYMBOL_GPL(account_locked_vm);

/*
 * Map @file into the current process at page offset @pgoff.
 *
 * Runs the LSM hook, then do_mmap() under the mmap write lock, completes any
 * pending userfaultfd unmap notifications, and finally populates the mapping
 * if do_mmap() asked for it.
 *
 * Returns whatever do_mmap() returned, the non-zero result of
 * security_mmap_file(), or -EINTR if the lock wait was interrupted by a
 * fatal signal.
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
        unsigned long flag, unsigned long pgoff)
{
        struct mm_struct *mm = current->mm;
        unsigned long populate;
        unsigned long ret;
        LIST_HEAD(uf);

        ret = security_mmap_file(file, prot, flag);
        if (ret)
                return ret;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
                      &uf);
        mmap_write_unlock(mm);

        /* Both follow-up steps run after the lock has been dropped. */
        userfaultfd_unmap_complete(mm, &uf);
        if (populate)
                mm_populate(ret, populate);

        return ret;
}

/*
 * Byte-offset front end to vm_mmap_pgoff().  Rejects with -EINVAL an @offset
 * that is not page aligned, or for which @offset + PAGE_ALIGN(@len) wraps.
 */
unsigned long vm_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
        unsigned long flag, unsigned long offset)
{
        unsigned long pgoff;

        if (unlikely(offset + PAGE_ALIGN(len) < offset) ||
            unlikely(offset_in_page(offset)))
                return -EINVAL;

        pgoff = offset >> PAGE_SHIFT;

        return vm_mmap_pgoff(file, addr, len, prot, flag, pgoff);
}
EXPORT_SYMBOL(vm_mmap);

/**
 * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
 * failure, fall back to non-contiguous (vmalloc) allocation.
 * @size: size of the request.
 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
 * @node: numa node to allocate from
 *
 * The memory is requested from kmalloc first; if that fails and the request
 * is larger than a page, the vmalloc allocator is tried instead.  Free the
 * result with kvfree().
 *
 * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
 * preferable to the vmalloc fallback, due to visible performance drawbacks.
 *
 * Return: pointer to the allocated memory or %NULL in case of failure
 */
void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node)
{
        gfp_t slab_flags = flags;
        void *mem;

        /*
         * A large physically contiguous block is tried first because it
         * contributes less to long term fragmentation than the vmalloc
         * fallback.  Since a fallback exists, keep that attempt cheap and
         * quiet: no failure warning, no hard retrying (unless the caller
         * asked for __GFP_RETRY_MAYFAIL), and no nofail semantic, which the
         * vmalloc fallback implements instead.
         */
        if (size > PAGE_SIZE) {
                slab_flags = (slab_flags | __GFP_NOWARN) & ~__GFP_NOFAIL;

                if (!(flags & __GFP_RETRY_MAYFAIL))
                        slab_flags |= __GFP_NORETRY;
        }

        mem = kmalloc_node_noprof(size, slab_flags, node);
        if (mem)
                return mem;

        /* Falling back to vmalloc makes no sense for sub page requests. */
        if (size <= PAGE_SIZE)
                return NULL;

        /* vmalloc does not support non-sleeping allocations. */
        if (!gfpflags_allow_blocking(flags))
                return NULL;

        /* Refuse absurd sizes, warning unless the caller opted out. */
        if (unlikely(size > INT_MAX)) {
                WARN_ON_ONCE(!(flags & __GFP_NOWARN));
                return NULL;
        }

        /*
         * VM_ALLOW_HUGE_VMAP is always acceptable here: kvmalloc() callers
         * cannot assume anything about the resulting pointer and cannot
         * play protection games.
         */
        return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
                        flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
                        node, __builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node_noprof);

/**
 * kvfree() - Free memory.
 * @addr: Pointer to allocated memory.
 *
 * Frees memory obtained from vmalloc(), kmalloc() or kvmalloc(), choosing
 * vfree() or kfree() according to is_vmalloc_addr().  When the caller knows
 * which allocator was used, calling kfree() or vfree() directly is slightly
 * more efficient.
 *
 * Context: Either preemptible task context or not-NMI interrupt.
 */
void kvfree(const void *addr)
{
        if (!is_vmalloc_addr(addr)) {
                kfree(addr);
                return;
        }

        vfree(addr);
}
EXPORT_SYMBOL(kvfree);

/**
 * kvfree_sensitive - Free a data object containing sensitive information.
 * @addr: address of the data object to be freed.
 * @len: length of the data object.
 *
 * The object is wiped with memzero_explicit(), which the compiler will not
 * optimize away, and then released with kvfree().  Nothing is done when
 * @addr is NULL or the zero-size pointer.
 */
void kvfree_sensitive(const void *addr, size_t len)
{
        void *wipe = (void *)addr;

        if (likely(!ZERO_OR_NULL_PTR(addr))) {
                memzero_explicit(wipe, len);
                kvfree(addr);
        }
}
EXPORT_SYMBOL(kvfree_sensitive);

/*
 * Grow a kvmalloc'ed buffer from @oldsize to @newsize bytes.
 *
 * If @newsize does not exceed @oldsize, @p is returned untouched.  Otherwise
 * a new buffer is allocated, the first @oldsize bytes are copied over and
 * the old buffer is freed.  On allocation failure NULL is returned and @p is
 * left allocated.
 */
void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
{
        void *grown;

        if (newsize <= oldsize)
                return (void *)p;

        grown = kvmalloc_noprof(newsize, flags);
        if (grown) {
                memcpy(grown, p, oldsize);
                kvfree(p);
        }

        return grown;
}
EXPORT_SYMBOL(kvrealloc_noprof);

/**
 * __vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Return: the allocation, or NULL if @n * @size overflows or the allocation
 * fails.
 */
void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
        size_t total;

        return unlikely(check_mul_overflow(n, size, &total)) ?
                NULL : __vmalloc_noprof(total, flags);
}
EXPORT_SYMBOL(__vmalloc_array_noprof);

/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Shorthand for __vmalloc_array() with GFP_KERNEL.
 *
 * Return: whatever __vmalloc_array() returns for the same arguments.
 */
void *vmalloc_array_noprof(size_t n, size_t size)
{
        return __vmalloc_array_noprof(n, size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_array_noprof);

/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Same as __vmalloc_array() with __GFP_ZERO added to @flags.
 *
 * Return: whatever __vmalloc_array() returns for the same arguments.
 */
void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
{
        return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(__vcalloc_noprof);

/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Same as __vmalloc_array() with GFP_KERNEL | __GFP_ZERO.
 *
 * Return: whatever __vmalloc_array() returns for the same arguments.
 */
void *vcalloc_noprof(size_t n, size_t size)
{
        return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vcalloc_noprof);

/*
 * Decode the anon_vma stored in folio->mapping.
 *
 * Returns NULL unless the flag bits of the mapping word are exactly
 * PAGE_MAPPING_ANON; otherwise returns the pointer with that tag removed.
 */
struct anon_vma *folio_anon_vma(struct folio *folio)
{
        unsigned long word = (unsigned long)folio->mapping;

        if ((word & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON)
                return (void *)(word - PAGE_MAPPING_ANON);

        return NULL;
}

/**
 * folio_mapping - Find the mapping where this folio is stored.
 * @folio: The folio.
 *
 * For a folio in the page cache, this is the mapping the folio belongs to.
 * For a folio in the swap cache, this is the swap mapping it is stored in
 * (which is different from the mapping for the swap file or swap device
 * where the data is stored).
 *
 * It is fine to call this for folios which are in neither cache; NULL is
 * returned for them.
 */
struct address_space *folio_mapping(struct folio *folio)
{
        struct address_space *mapping;

        /* Seen when someone calls flush_dcache_page on a slab page. */
        if (unlikely(folio_test_slab(folio)))
                return NULL;

        if (unlikely(folio_test_swapcache(folio)))
                return swap_address_space(folio->swap);

        /* A mapping word with any flag bit set is not a plain address_space. */
        mapping = folio->mapping;
        return ((unsigned long)mapping & PAGE_MAPPING_FLAGS) ? NULL : mapping;
}
EXPORT_SYMBOL(folio_mapping);

/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * Copies every page of @src into the page of @dst at the same index.  The
 * caller must have checked that @dst is at least as large as @src.
 * Can be called in atomic context for order-0 folios, but if the folio is
 * larger, it may sleep.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
        long nr = folio_nr_pages(src);
        long i;

        /* The first page is copied without a reschedule point before it. */
        copy_highpage(folio_page(dst, 0), folio_page(src, 0));

        for (i = 1; i != nr; i++) {
                cond_resched();
                copy_highpage(folio_page(dst, i), folio_page(src, i));
        }
}
EXPORT_SYMBOL(folio_copy);

/* Overcommit policy; compared against OVERCOMMIT_* in __vm_enough_memory(). */
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
/* Percentage applied in vm_commit_limit() when overcommit_kbytes is zero. */
int sysctl_overcommit_ratio __read_mostly = 50;
/* When non-zero, vm_commit_limit() uses this instead of the ratio. */
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
/* Reserves subtracted from the commit limit in __vm_enough_memory(). */
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

/*
 * sysctl handler for the overcommit ratio.  A successful write also clears
 * sysctl_overcommit_kbytes.
 */
int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        int err = proc_dointvec(table, write, buffer, lenp, ppos);

        if (write && err == 0)
                sysctl_overcommit_kbytes = 0;

        return err;
}

/*
 * Work function that calls percpu_counter_sync() on vm_committed_as.
 * @dummy is unused.
 */
static void sync_overcommit_as(struct work_struct *dummy)
{
        percpu_counter_sync(&vm_committed_as);
}

/*
 * sysctl handler for the overcommit policy.
 *
 * Reads go straight to proc_dointvec_minmax().  Writes are parsed into a
 * local variable first, so that the new policy only becomes visible after
 * the percpu counter has been prepared for it.
 */
int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        struct ctl_table t;
        int new_policy = -1;
        int ret;

        if (!write)
                return proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        /* Parse into new_policy through a copy of the table entry. */
        t = *table;
        t.data = &new_policy;
        ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
        if (ret || new_policy == -1)
                return ret;

        /*
         * The deviation of vm_committed_as can be large under the loose
         * OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS policies.  Before switching to
         * the strict OVERCOMMIT_NEVER it has to be reduced, and to avoid a
         * possible race (even though users rarely switch to
         * OVERCOMMIT_NEVER), the switch is done in this order:
         *        1. change the batch
         *        2. sync the percpu count on each CPU
         *        3. switch the policy
         */
        mm_compute_batch(new_policy);
        if (new_policy == OVERCOMMIT_NEVER)
                schedule_on_each_cpu(sync_overcommit_as);
        sysctl_overcommit_memory = new_policy;

        return ret;
}

/*
 * sysctl handler for overcommit_kbytes.  A successful write also clears
 * sysctl_overcommit_ratio.
 */
int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        int err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        if (write && err == 0)
                sysctl_overcommit_ratio = 0;

        return err;
}

/*
 * Committed memory limit, in pages, enforced when the OVERCOMMIT_NEVER
 * policy is used.
 *
 * The RAM share is sysctl_overcommit_kbytes when that is set, otherwise
 * sysctl_overcommit_ratio percent of the RAM not used by hugetlb pages.
 * Swap is always added on top.
 */
unsigned long vm_commit_limit(void)
{
        unsigned long ram_share;

        if (sysctl_overcommit_kbytes)
                ram_share = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
        else
                ram_share = ((totalram_pages() - hugetlb_total_pages())
                             * sysctl_overcommit_ratio / 100);

        return ram_share + total_swap_pages;
}

/*
 * Keep vm_committed_as in a cacheline of its own, not shared with other
 * variables: it can be updated frequently by several CPUs.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * Return the summed value of the vm_committed_as percpu counter, as given
 * by percpu_counter_sum_positive().
 *
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low for small platforms, and for a big
 * platform like a 2S/36C/72T Skylake server, in the worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 */
unsigned long vm_memory_committed(void)
{
        return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping.  Returns 0 when there is enough memory for the allocation to
 * succeed and -ENOMEM when there is not.
 *
 * @pages is charged to the commit counter up front and uncharged again on
 * failure.
 *
 * Three overcommit policies are supported, selected through the
 * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
        unsigned long bytes_failed;
        long allowed;

        vm_acct_memory(pages);

        /* OVERCOMMIT_ALWAYS: every request is granted. */
        if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
                return 0;

        /* OVERCOMMIT_GUESS: refuse only what exceeds RAM plus swap. */
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
                if (pages <= totalram_pages() + total_swap_pages)
                        return 0;
                goto error;
        }

        /* Strict accounting: start from the commit limit. */
        allowed = vm_commit_limit();

        /* Hold some memory back for root. */
        if (!cap_sys_admin)
                allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

        /* Keep a single process from growing so big a user can't recover. */
        if (mm) {
                long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

                allowed -= min_t(long, mm->total_vm / 32, reserve);
        }

        if (percpu_counter_read_positive(&vm_committed_as) >= allowed)
                goto error;

        return 0;

error:
        bytes_failed = pages << PAGE_SHIFT;
        pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
                            __func__, current->pid, current->comm, bytes_failed);
        vm_unacct_memory(pages);

        return -ENOMEM;
}

/**
 * get_cmdline() - copy the cmdline value to a buffer.
 * @task:     the task whose cmdline value to copy.
 * @buffer:   the buffer to copy to.
 * @buflen:   the length of the buffer. Larger cmdline values are truncated
 *            to this length.
 *
 * Return: the size of the cmdline field copied, or 0 if @task has no mm or
 * its mm->arg_end is not set yet. Note that the copy does not guarantee an
 * ending NUL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
        int res = 0;
        unsigned int len;
        struct mm_struct *mm = get_task_mm(task);
        unsigned long arg_start, arg_end, env_start, env_end;
        if (!mm)
                goto out;
        if (!mm->arg_end)
                goto out_mm;        /* Shh! No looking before we're done */

        /* Take a consistent snapshot of the arg/env boundaries. */
        spin_lock(&mm->arg_lock);
        arg_start = mm->arg_start;
        arg_end = mm->arg_end;
        env_start = mm->env_start;
        env_end = mm->env_end;
        spin_unlock(&mm->arg_lock);

        len = arg_end - arg_start;

        /* Never read more than the caller's buffer can hold. */
        if (len > buflen)
                len = buflen;

        res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);

        /*
         * If the nul at the end of args has been overwritten, then
         * assume application is using setproctitle(3).
         */
        if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
                len = strnlen(buffer, res);
                if (len < res) {
                        /* A NUL was found inside the args: stop there. */
                        res = len;
                } else {
                        /*
                         * No NUL in the args at all: append as much of the
                         * environment area as still fits, then cut the
                         * result at the first NUL, if any.
                         */
                        len = env_end - env_start;
                        if (len > buflen - res)
                                len = buflen - res;
                        res += access_process_vm(task, env_start,
                                                 buffer+res, len,
                                                 FOLL_FORCE);
                        res = strnlen(buffer, res);
                }
        }
out_mm:
        /* Drop the reference taken by get_task_mm(). */
        mmput(mm);
out:
        return res;
}

/*
 * Compare the full contents of two pages.  Returns the memcmp() result over
 * PAGE_SIZE bytes.
 */
int __weak memcmp_pages(struct page *page1, struct page *page2)
{
        char *first = kmap_local_page(page1);
        char *second = kmap_local_page(page2);
        int diff = memcmp(first, second, PAGE_SIZE);

        /* Mappings are released in the reverse order they were taken. */
        kunmap_local(second);
        kunmap_local(first);

        return diff;
}

#ifdef CONFIG_PRINTK
/**
 * mem_dump_obj - Print available provenance information
 * @object: object for which to find provenance information.
 *
 * Output is emitted with pr_cont(), so the caller is expected to have
 * printed whatever preamble is appropriate.  What gets printed depends on
 * the type of object and on how much debugging is enabled.  For example,
 * for a slab-cache object, the slab name is printed, and, if available, the
 * return address and stack trace from the allocation and last free path of
 * that object.
 */
void mem_dump_obj(void *object)
{
        const char *type;

        /* The slab and vmalloc dumpers do all the printing when they match. */
        if (kmem_dump_obj(object) || vmalloc_dump_obj(object))
                return;

        /* Otherwise print a coarse classification of the address. */
        type = "non-paged memory";
        if (is_vmalloc_addr(object))
                type = "vmalloc memory";
        else if (virt_addr_valid(object))
                type = "non-slab/vmalloc memory";
        else if (object == NULL)
                type = "NULL pointer";
        else if (object == ZERO_SIZE_PTR)
                type = "zero-size pointer";

        pr_cont(" %s\n", type);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif

/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, achieving that a page cannot be set
 * PageOffline() while frozen.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline().
 */
static DECLARE_RWSEM(page_offline_rwsem);

/* Take page_offline_rwsem for reading; paired with page_offline_thaw(). */
void page_offline_freeze(void)
{
        down_read(&page_offline_rwsem);
}

/* Release the read side of page_offline_rwsem. */
void page_offline_thaw(void)
{
        up_read(&page_offline_rwsem);
}

/* Take page_offline_rwsem for writing; paired with page_offline_end(). */
void page_offline_begin(void)
{
        down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);

/* Release the write side of page_offline_rwsem. */
void page_offline_end(void)
{
        up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);

#ifndef flush_dcache_folio
/*
 * Generic flush_dcache_folio(): call flush_dcache_page() on every page of
 * @folio in index order.
 */
void flush_dcache_folio(struct folio *folio)
{
        long nr = folio_nr_pages(folio);
        long i = 0;

        while (i < nr) {
                flush_dcache_page(folio_page(folio, i));
                i++;
        }
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif

































































    2 
    2 












































































































































    2 









    1 


    1 












    2 







































    1 





    1 





    1 


    1 




    1 

















    1 


    1 








































































































    1 

    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 

    1 

































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011 Novell Inc.
 * Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/file.h>
#include <linux/fileattr.h>
#include <linux/uuid.h>
#include <linux/namei.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"

/*
 * Take mnt write access on the overlay's upper mount.
 * May fail if the upper sb was remounted read-only.
 */
int ovl_get_write_access(struct dentry *dentry)
{
        return mnt_get_write_access(ovl_upper_mnt(OVL_FS(dentry->d_sb)));
}

/*
 * Take sb write access (sb_start_write()) on the upper sb.
 * May block if the upper sb is frozen.
 */
void ovl_start_write(struct dentry *dentry)
{
        sb_start_write(ovl_upper_mnt(OVL_FS(dentry->d_sb))->mnt_sb);
}

/* Call mnt_want_write() on the overlay's upper mount and return its result */
int ovl_want_write(struct dentry *dentry)
{
        return mnt_want_write(ovl_upper_mnt(OVL_FS(dentry->d_sb)));
}

/* Counterpart of ovl_get_write_access(): drop mnt write access on upper mnt */
void ovl_put_write_access(struct dentry *dentry)
{
        mnt_put_write_access(ovl_upper_mnt(OVL_FS(dentry->d_sb)));
}

/* Counterpart of ovl_start_write(): sb_end_write() on the upper sb */
void ovl_end_write(struct dentry *dentry)
{
        sb_end_write(ovl_upper_mnt(OVL_FS(dentry->d_sb))->mnt_sb);
}

/* Counterpart of ovl_want_write(): mnt_drop_write() on the upper mount */
void ovl_drop_write(struct dentry *dentry)
{
        mnt_drop_write(ovl_upper_mnt(OVL_FS(dentry->d_sb)));
}

/* Return the workdir dentry of the overlay fs that @dentry belongs to */
struct dentry *ovl_workdir(struct dentry *dentry)
{
        return OVL_FS(dentry->d_sb)->workdir;
}

const struct cred *ovl_override_creds(struct super_block *sb)
{
        struct ovl_fs *ofs = OVL_FS(sb);

        return override_creds(ofs->creator_cred);
}

/*
 * Check if underlying fs supports file handles and try to determine the
 * encoding type, in order to deduce the maximum inode number used by fs.
 *
 * Return 0 if file handles are not supported (or the caller lacks
 * CAP_DAC_READ_SEARCH).
 * Return 1 (FILEID_INO32_GEN) if fs uses the default 32bit inode encoding.
 * Return -1 if fs uses a non default encoding with unknown inode size.
 */
int ovl_can_decode_fh(struct super_block *sb)
{
        if (!capable(CAP_DAC_READ_SEARCH) ||
            !exportfs_can_decode_fh(sb->s_export_op))
                return 0;

        /* A custom ->encode_fh() means the inode encoding is unknown */
        if (sb->s_export_op->encode_fh)
                return -1;

        return FILEID_INO32_GEN;
}

/*
 * Return the dentry used as index directory (the workdir), or NULL when
 * the index feature is not enabled in the overlay config.
 */
struct dentry *ovl_indexdir(struct super_block *sb)
{
        struct ovl_fs *ofs = OVL_FS(sb);

        if (!ofs->config.index)
                return NULL;

        return ofs->workdir;
}

/* Index all files on copy up. For now only enabled for NFS export */
bool ovl_index_all(struct super_block *sb)
{
        struct ovl_fs *ofs = OVL_FS(sb);

        return ofs->config.nfs_export && ofs->config.index;
}

/* Verify lower origin on lookup. For now only enabled for NFS export */
bool ovl_verify_lower(struct super_block *sb)
{
        struct ovl_fs *ofs = OVL_FS(sb);

        return ofs->config.nfs_export && ofs->config.index;
}

/*
 * Allocate a zeroed array of @n struct ovl_path with GFP_KERNEL.
 * Returns the kcalloc() result, i.e. NULL on allocation failure.
 */
struct ovl_path *ovl_stack_alloc(unsigned int n)
{
        struct ovl_path *stack = kcalloc(n, sizeof(*stack), GFP_KERNEL);

        return stack;
}

/*
 * Copy @n entries from @src to @dst and take an extra reference (dget())
 * on every copied dentry, so both stacks hold their own references.
 */
void ovl_stack_cpy(struct ovl_path *dst, struct ovl_path *src, unsigned int n)
{
        unsigned int idx = 0;

        memcpy(dst, src, n * sizeof(*src));
        while (idx < n) {
                dget(src[idx].dentry);
                idx++;
        }
}

/*
 * Drop the dentry reference (dput()) of each of the first @n entries of
 * @stack. A NULL @stack is accepted and is a no-op.
 */
void ovl_stack_put(struct ovl_path *stack, unsigned int n)
{
        unsigned int idx;

        if (!stack)
                return;

        for (idx = 0; idx < n; idx++)
                dput(stack[idx].dentry);
}

/*
 * Release a stack: drop the dentry references of its first @n entries
 * via ovl_stack_put(), then kfree() the array itself.
 */
void ovl_stack_free(struct ovl_path *stack, unsigned int n)
{
        ovl_stack_put(stack, n);
        kfree(stack);
}

struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
{
        size_t size = offsetof(struct ovl_entry, __lowerstack[numlower]);
        struct ovl_entry *oe = kzalloc(size, GFP_KERNEL);

        if (oe)
                oe->__numlower = numlower;

        return oe;
}

/*
 * Free an ovl_entry: drop the dentry references held in its lower stack,
 * then kfree() the entry.
 */
void ovl_free_entry(struct ovl_entry *oe)
{
        ovl_stack_put(ovl_lowerstack(oe), ovl_numlower(oe));
        kfree(oe);
}

/* Mask of the dcache revalidate-op flags tested and propagated below */
#define OVL_D_REVALIDATE (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE)

/* Return true if @dentry has any of the OVL_D_REVALIDATE flags set */
bool ovl_dentry_remote(struct dentry *dentry)
{
        return (dentry->d_flags & OVL_D_REVALIDATE) != 0;
}

/*
 * If @realdentry has revalidate flags set, OR them into @dentry's d_flags
 * under @dentry's d_lock. Flags already set on @dentry are never cleared.
 */
void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *realdentry)
{
        if (ovl_dentry_remote(realdentry)) {
                spin_lock(&dentry->d_lock);
                dentry->d_flags |= realdentry->d_flags & OVL_D_REVALIDATE;
                spin_unlock(&dentry->d_lock);
        }
}

/*
 * Initialize the revalidate flags of @dentry from its upper dentry and
 * lower stack: ovl_dentry_init_flags() restricted to OVL_D_REVALIDATE.
 */
void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry,
                           struct ovl_entry *oe)
{
        ovl_dentry_init_flags(dentry, upperdentry, oe, OVL_D_REVALIDATE);
}

/*
 * Set the @mask bits of @dentry->d_flags to the union of the corresponding
 * bits of @upperdentry (if any) and of the lower dentries in @oe.
 * Bits of @dentry->d_flags outside @mask are left untouched.
 */
void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry,
                           struct ovl_entry *oe, unsigned int mask)
{
        struct ovl_path *lower = ovl_lowerstack(oe);
        unsigned int flags = upperdentry ? upperdentry->d_flags : 0;
        unsigned int idx = 0;

        /* Stop at the end of the stack or at the first unset lower dentry */
        while (idx < ovl_numlower(oe) && lower[idx].dentry) {
                flags |= lower[idx].dentry->d_flags;
                idx++;
        }

        spin_lock(&dentry->d_lock);
        dentry->d_flags &= ~mask;
        dentry->d_flags |= flags & mask;
        spin_unlock(&dentry->d_lock);
}

/*
 * Return true if @dentry has any of the automount, managed-transit,
 * custom-hash or custom-compare dcache flags set.
 */
bool ovl_dentry_weird(struct dentry *dentry)
{
        const unsigned int weird = DCACHE_NEED_AUTOMOUNT |
                                   DCACHE_MANAGE_TRANSIT |
                                   DCACHE_OP_HASH |
                                   DCACHE_OP_COMPARE;

        return (dentry->d_flags & weird) != 0;
}

/*
 * Compute the __OVL_PATH_* type bits of an overlay dentry:
 *  - no upper dentry: MERGE iff there is more than one lower entry;
 *  - upper dentry: UPPER, plus (only when there is a lower entry)
 *    ORIGIN if the inode has OVL_CONST_INO, and MERGE if it is a
 *    directory or has no upper data.
 */
enum ovl_path_type ovl_path_type(struct dentry *dentry)
{
        struct ovl_entry *oe = OVL_E(dentry);
        enum ovl_path_type type;

        if (!ovl_dentry_upper(dentry))
                return ovl_numlower(oe) > 1 ? __OVL_PATH_MERGE : 0;

        type = __OVL_PATH_UPPER;

        /*
         * Non-dir dentry can hold lower dentry of its copy up origin.
         */
        if (!ovl_numlower(oe))
                return type;

        if (ovl_test_flag(OVL_CONST_INO, d_inode(dentry)))
                type |= __OVL_PATH_ORIGIN;
        if (d_is_dir(dentry) || !ovl_has_upperdata(d_inode(dentry)))
                type |= __OVL_PATH_MERGE;

        return type;
}

/*
 * Fill @path with the upper mount and the upper dentry of @dentry.
 * No references are taken here; path->dentry is whatever
 * ovl_dentry_upper() returns.
 */
void ovl_path_upper(struct dentry *dentry, struct path *path)
{
        path->mnt = ovl_upper_mnt(OVL_FS(dentry->d_sb));
        path->dentry = ovl_dentry_upper(dentry);
}

/*
 * Fill @path from the first entry of @dentry's lower stack, or zero it
 * when the dentry has no lower entries.
 */
void ovl_path_lower(struct dentry *dentry, struct path *path)
{
        struct ovl_entry *oe = OVL_E(dentry);
        struct ovl_path *lowerpath = ovl_lowerstack(oe);

        if (!ovl_numlower(oe)) {
                *path = (struct path) { };
                return;
        }

        path->mnt = lowerpath->layer->mnt;
        path->dentry = lowerpath->dentry;
}

/*
 * Fill @path with the lower data dentry of @dentry and the mount of its
 * layer, or zero @path when no lower data dentry is set.
 */
void ovl_path_lowerdata(struct dentry *dentry, struct path *path)
{
        struct ovl_entry *oe = OVL_E(dentry);
        struct ovl_path *lowerdata = ovl_lowerdata(oe);
        struct dentry *lowerdata_dentry = ovl_lowerdata_dentry(oe);

        if (lowerdata_dentry) {
                path->dentry = lowerdata_dentry;
                /*
                 * Pairs with smp_wmb() in ovl_dentry_set_lowerdata().
                 * Make sure that if lowerdata->dentry is visible, then
                 * lowerdata->layer is visible as well.
                 */
                smp_rmb();
                path->mnt = READ_ONCE(lowerdata->layer)->mnt;
        } else {
                *path = (struct path) { };
        }
}

/*
 * Fill @path with the upper path when @dentry has an upper type, else
 * with the lower path. Returns the dentry's path type.
 */
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
{
        enum ovl_path_type type = ovl_path_type(dentry);

        if (OVL_TYPE_UPPER(type))
                ovl_path_upper(dentry, path);
        else
                ovl_path_lower(dentry, path);

        return type;
}

/*
 * Fill @path with the path that holds the file data: the upper path only
 * when the type is UPPER and not MERGE, the lower data path otherwise.
 * Warns once if called on a directory. Returns the dentry's path type.
 */
enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path)
{
        enum ovl_path_type type = ovl_path_type(dentry);

        WARN_ON_ONCE(d_is_dir(dentry));

        if (OVL_TYPE_UPPER(type) && !OVL_TYPE_MERGE(type))
                ovl_path_upper(dentry, path);
        else
                ovl_path_lowerdata(dentry, path);

        return type;
}

/* Return the upper dentry recorded in the overlay inode of @dentry */
struct dentry *ovl_dentry_upper(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        return ovl_upperdentry_dereference(OVL_I(inode));
}

/* Return the first lower dentry of @dentry, or NULL if it has none */
struct dentry *ovl_dentry_lower(struct dentry *dentry)
{
        struct ovl_entry *oe = OVL_E(dentry);

        if (!ovl_numlower(oe))
                return NULL;

        return ovl_lowerstack(oe)->dentry;
}

/* Return the layer of the first lower entry of @dentry, or NULL if none */
const struct ovl_layer *ovl_layer_lower(struct dentry *dentry)
{
        struct ovl_entry *oe = OVL_E(dentry);

        if (!ovl_numlower(oe))
                return NULL;

        return ovl_lowerstack(oe)->layer;
}

/*
 * ovl_dentry_lower() may return either a data dentry or a metacopy dentry,
 * depending on what is stored in lowerstack[0]. When the lower dentry that
 * actually has data (and not a metacopy dentry) is needed, use this helper,
 * which returns the lower data dentry.
 */
struct dentry *ovl_dentry_lowerdata(struct dentry *dentry)
{
        struct ovl_entry *oe = OVL_E(dentry);

        return ovl_lowerdata_dentry(oe);
}

/*
 * Publish @datapath as the lower data entry of @dentry.
 *
 * The layer is stored before the dentry (with a write barrier in between)
 * and a new reference is taken on the data dentry with dget(). The
 * revalidate flags of @dentry are then updated from the data dentry.
 *
 * Returns 0 on success, or -EIO (with a one-time warning) if the entry
 * has at most one lower entry.
 */
int ovl_dentry_set_lowerdata(struct dentry *dentry, struct ovl_path *datapath)
{
        struct ovl_entry *oe = OVL_E(dentry);
        struct ovl_path *lowerdata = ovl_lowerdata(oe);
        struct dentry *datadentry = datapath->dentry;

        if (WARN_ON_ONCE(ovl_numlower(oe) <= 1))
                return -EIO;

        WRITE_ONCE(lowerdata->layer, datapath->layer);
        /*
         * Pairs with smp_rmb() in ovl_path_lowerdata().
         * Make sure that if lowerdata->dentry is visible, then
         * lowerdata->layer is visible as well.
         */
        smp_wmb();
        WRITE_ONCE(lowerdata->dentry, dget(datadentry));

        ovl_dentry_update_reval(dentry, datadentry);

        return 0;
}

/* Return the upper dentry if there is one, otherwise the lower dentry */
struct dentry *ovl_dentry_real(struct dentry *dentry)
{
        struct dentry *upper = ovl_dentry_upper(dentry);

        if (upper)
                return upper;

        return ovl_dentry_lower(dentry);
}

/* Return the upper dentry recorded in the overlay inode @inode */
struct dentry *ovl_i_dentry_upper(struct inode *inode)
{
        struct ovl_inode *oi = OVL_I(inode);

        return ovl_upperdentry_dereference(oi);
}

/*
 * Fill @path with the real path of overlay inode @inode: the upper dentry
 * and upper mount if an upper dentry exists, otherwise the lower path.
 * Returns the inode of the resulting dentry (read with d_inode_rcu()),
 * or NULL if the resulting dentry is NULL.
 */
struct inode *ovl_i_path_real(struct inode *inode, struct path *path)
{
        struct ovl_path *lowerpath = ovl_lowerpath(OVL_I_E(inode));
        struct dentry *upper = ovl_i_dentry_upper(inode);

        if (upper) {
                path->dentry = upper;
                path->mnt = ovl_upper_mnt(OVL_FS(inode->i_sb));
        } else {
                path->dentry = lowerpath->dentry;
                path->mnt = lowerpath->layer->mnt;
        }

        if (!path->dentry)
                return NULL;

        return d_inode_rcu(path->dentry);
}

/* Return the inode of the upper dentry, or NULL if there is no upper dentry */
struct inode *ovl_inode_upper(struct inode *inode)
{
        struct dentry *upperdentry = ovl_i_dentry_upper(inode);

        if (!upperdentry)
                return NULL;

        return d_inode(upperdentry);
}

/* Return the inode of the lower path's dentry, or NULL if no lower path */
struct inode *ovl_inode_lower(struct inode *inode)
{
        struct ovl_path *lowerpath = ovl_lowerpath(OVL_I_E(inode));

        if (!lowerpath)
                return NULL;

        return d_inode(lowerpath->dentry);
}

/* Return the upper inode if there is one, otherwise the lower inode */
struct inode *ovl_inode_real(struct inode *inode)
{
        struct inode *upperinode = ovl_inode_upper(inode);

        if (upperinode)
                return upperinode;

        return ovl_inode_lower(inode);
}

/*
 * Return the inode which contains lower data. Do not return metacopy.
 * Warns and returns NULL for non-regular files; also returns NULL when
 * no lower data dentry is set.
 */
struct inode *ovl_inode_lowerdata(struct inode *inode)
{
        struct dentry *datadentry = ovl_lowerdata_dentry(OVL_I_E(inode));

        if (WARN_ON(!S_ISREG(inode->i_mode)) || !datadentry)
                return NULL;

        return d_inode(datadentry);
}

/*
 * Return the real inode which contains data. Does not return a metacopy
 * inode: the upper inode is used only if it exists and has upper data,
 * otherwise the lower data inode is returned.
 */
struct inode *ovl_inode_realdata(struct inode *inode)
{
        struct inode *upperinode = ovl_inode_upper(inode);

        if (!upperinode || !ovl_has_upperdata(inode))
                return ovl_inode_lowerdata(inode);

        return upperinode;
}

/*
 * Return the lowerdata_redirect string of a regular-file overlay inode,
 * or NULL if @inode is NULL or not a regular file.
 */
const char *ovl_lowerdata_redirect(struct inode *inode)
{
        if (!inode || !S_ISREG(inode->i_mode))
                return NULL;

        return OVL_I(inode)->lowerdata_redirect;
}

/*
 * Return the dir cache of a directory overlay inode, or NULL if @inode
 * is NULL or not a directory.
 */
struct ovl_dir_cache *ovl_dir_cache(struct inode *inode)
{
        if (!inode || !S_ISDIR(inode->i_mode))
                return NULL;

        return OVL_I(inode)->cache;
}

void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache)
{
        OVL_I(inode)->cache = cache;
}

/* Set bit @flag in the flag word returned by OVL_E_FLAGS(@dentry) */
void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry)
{
        set_bit(flag, OVL_E_FLAGS(dentry));
}

/* Clear bit @flag in the flag word returned by OVL_E_FLAGS(@dentry) */
void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry)
{
        clear_bit(flag, OVL_E_FLAGS(dentry));
}

/* Test bit @flag in the flag word returned by OVL_E_FLAGS(@dentry) */
bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry)
{
        return test_bit(flag, OVL_E_FLAGS(dentry));
}

/* Return true if the OVL_E_OPAQUE flag is set on @dentry */
bool ovl_dentry_is_opaque(struct dentry *dentry)
{
        return ovl_dentry_test_flag(OVL_E_OPAQUE, dentry);
}

bool ovl_dentry_is_whiteout(struct dentry *dentry)
{
        return !dentry->d_inode && ovl_dentry_is_opaque(dentry);
}

/* Set the OVL_E_OPAQUE flag on @dentry */
void ovl_dentry_set_opaque(struct dentry *dentry)
{
        ovl_dentry_set_flag(OVL_E_OPAQUE, dentry);
}

/* Return true if the OVL_E_XWHITEOUTS flag is set on @dentry */
bool ovl_dentry_has_xwhiteouts(struct dentry *dentry)
{
        return ovl_dentry_test_flag(OVL_E_XWHITEOUTS, dentry);
}

/* Set the OVL_E_XWHITEOUTS flag on @dentry */
void ovl_dentry_set_xwhiteouts(struct dentry *dentry)
{
        ovl_dentry_set_flag(OVL_E_XWHITEOUTS, dentry);
}

/*
 * ovl_layer_set_xwhiteouts() is called before adding the overlay dir
 * dentry to dcache, while readdir of that same directory happens after
 * the overlay dir dentry is in dcache, so if some cpu observes that
 * ovl_dentry_has_xwhiteouts(), it will also observe layer->has_xwhiteouts
 * for the layers where xwhiteouts marker was found in that merge dir.
 */
void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs,
                              const struct ovl_layer *layer)
{
        /*
         * Write once to read-mostly layer properties: skip the store when
         * the flag is already set. @layer is const, so the write goes
         * through the ofs->layers[] slot selected by layer->idx.
         */
        if (!layer->has_xwhiteouts)
                ofs->layers[layer->idx].has_xwhiteouts = true;
}

/*
 * For hard links and decoded file handles, it's possible for ovl_dentry_upper()
 * to return positive, while there's no actual upper alias for the inode.
 * Copy up code needs to know about the existence of the upper alias, so it
 * can't use ovl_dentry_upper().
 *
 * Returns true if the OVL_E_UPPER_ALIAS flag is set on @dentry.
 */
bool ovl_dentry_has_upper_alias(struct dentry *dentry)
{
        return ovl_dentry_test_flag(OVL_E_UPPER_ALIAS, dentry);
}

/* Set the OVL_E_UPPER_ALIAS flag on @dentry */
void ovl_dentry_set_upper_alias(struct dentry *dentry)
{
        ovl_dentry_set_flag(OVL_E_UPPER_ALIAS, dentry);
}

static bool ovl_should_check_upperdata(struct inode *inode)
{
        if (!S_ISREG(inode->i_mode))
                return false;

        if (!ovl_inode_lower(inode))
                return false;

        return true;
}

/*
 * Return true if @inode does not need the upperdata check at all (see
 * ovl_should_check_upperdata()) or if its OVL_UPPERDATA flag is set;
 * return false if the check applies and the flag is not set.
 */
bool ovl_has_upperdata(struct inode *inode)
{
        if (!ovl_should_check_upperdata(inode))
                return true;

        if (!ovl_test_flag(OVL_UPPERDATA, inode))
                return false;
        /*
         * Pairs with smp_wmb() in ovl_set_upperdata(). Main user of
         * ovl_has_upperdata() is ovl_copy_up_meta_inode_data(). Make sure
         * if setting of OVL_UPPERDATA is visible, then effects of writes
         * before that are visible too.
         */
        smp_rmb();
        return true;
}

/* Set the OVL_UPPERDATA flag on @inode, after a write barrier */
void ovl_set_upperdata(struct inode *inode)
{
        /*
         * Pairs with smp_rmb() in ovl_has_upperdata(). Make sure
         * if OVL_UPPERDATA flag is visible, then effects of write operations
         * before it are visible as well.
         */
        smp_wmb();
        ovl_set_flag(OVL_UPPERDATA, inode);
}

/*
 * Return true if opening with @flags needs a copy up and the inode does
 * not have OVL_UPPERDATA set. Tests the flag directly, without the
 * checks and barrier of ovl_has_upperdata().
 *
 * Caller should hold ovl_inode->lock
 */
bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags)
{
        return ovl_open_flags_need_copy_up(flags) &&
               !ovl_test_flag(OVL_UPPERDATA, d_inode(dentry));
}

/*
 * Return true if opening with @flags needs a copy up and the inode has
 * no upper data according to ovl_has_upperdata().
 */
bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags)
{
        return ovl_open_flags_need_copy_up(flags) &&
               !ovl_has_upperdata(d_inode(dentry));
}

const char *ovl_dentry_get_redirect(struct dentry *dentry)
{
        return OVL_I(d_inode(dentry))->redirect;
}

/*
 * Replace the redirect string of @dentry's overlay inode. The previous
 * string is kfree()d and the inode stores the @redirect pointer itself
 * (no copy is made).
 */
void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect)
{
        struct inode *inode = d_inode(dentry);
        struct ovl_inode *oi = OVL_I(inode);

        kfree(oi->redirect);
        oi->redirect = redirect;
}

/*
 * Record @upperdentry as the upper dentry of overlay inode @inode.
 *
 * Warns if an upper dentry is already set. If the overlay inode is not
 * hashed yet, the upper inode is stored in i_private and the overlay inode
 * is inserted into the inode hash using the upper inode's address as key.
 */
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
{
        struct inode *upperinode = d_inode(upperdentry);

        WARN_ON(OVL_I(inode)->__upperdentry);

        /*
         * Make sure upperdentry is consistent before making it visible
         */
        smp_wmb();
        OVL_I(inode)->__upperdentry = upperdentry;
        if (inode_unhashed(inode)) {
                inode->i_private = upperinode;
                __insert_inode_hash(inode, (unsigned long) upperinode);
        }
}

/*
 * Bump the version of a directory's overlay inode.
 * Warns if the inode is not locked or if @dentry is not a directory.
 */
static void ovl_dir_version_inc(struct dentry *dentry, bool impurity)
{
        struct inode *dir = d_inode(dentry);

        WARN_ON(!inode_is_locked(dir));
        WARN_ON(!d_is_dir(dentry));
        /*
         * Version is used by readdir code to keep cache consistent.
         * For merge dirs (or dirs with origin) all changes need to be noted.
         * For non-merge dirs, cache contains only impure entries (i.e. ones
         * which have been copied up and have origins), so only need to note
         * changes to impure entries.
         */
        if (ovl_dir_is_real(dir) && !impurity)
                return;

        OVL_I(dir)->version++;
}

/*
 * Note a modification of directory @dentry: refresh the overlay inode
 * attributes with ovl_copyattr() and bump the dir version as decided by
 * ovl_dir_version_inc() for the given @impurity.
 */
void ovl_dir_modified(struct dentry *dentry, bool impurity)
{
        /* Copy mtime/ctime */
        ovl_copyattr(d_inode(dentry));

        ovl_dir_version_inc(dentry, impurity);
}

/*
 * Return the version counter of the overlay inode.
 * Warns if @inode is not locked.
 */
u64 ovl_inode_version_get(struct inode *inode)
{
        WARN_ON(!inode_is_locked(inode));
        return OVL_I(inode)->version;
}

bool ovl_is_whiteout(struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;

        return inode && IS_WHITEOUT(inode);
}

/*
 * Use this over ovl_is_whiteout for upper and lower files, as it also
 * handles overlay.whiteout xattr whiteout files.
 */
bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path)
{
        return ovl_is_whiteout(path->dentry) ||
                ovl_path_check_xwhiteout_xattr(ofs, path);
}

/*
 * Open the real file at @path with the current credentials.
 *
 * @flags may only contain O_ACCMODE and O_LARGEFILE bits, and the access
 * mode must be O_RDONLY or O_WRONLY; anything else hits BUG().
 *
 * Returns the result of dentry_open(), or an ERR_PTR() if the permission
 * check on the real inode fails.
 */
struct file *ovl_path_open(const struct path *path, int flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct mnt_idmap *real_idmap = mnt_idmap(path->mnt);
        int acc_mode;
        int err;

        if (flags & ~(O_ACCMODE | O_LARGEFILE))
                BUG();

        if ((flags & O_ACCMODE) == O_RDONLY)
                acc_mode = MAY_READ;
        else if ((flags & O_ACCMODE) == O_WRONLY)
                acc_mode = MAY_WRITE;
        else
                BUG();

        err = inode_permission(real_idmap, inode, acc_mode | MAY_OPEN);
        if (err)
                return ERR_PTR(err);

        /* O_NOATIME is an optimization, don't fail if not permitted */
        if (inode_owner_or_capable(real_idmap, inode))
                flags |= O_NOATIME;

        return dentry_open(path, flags, current_cred());
}

/*
 * Return true if @dentry has an upper dentry, has an upper alias or is
 * disconnected, and does not need a data copy up for @flags.
 *
 * Caller should hold ovl_inode->lock
 */
static bool ovl_already_copied_up_locked(struct dentry *dentry, int flags)
{
        bool disconnected = dentry->d_flags & DCACHE_DISCONNECTED;

        if (!ovl_dentry_upper(dentry))
                return false;

        if (!ovl_dentry_has_upper_alias(dentry) && !disconnected)
                return false;

        return !ovl_dentry_needs_data_copy_up_locked(dentry, flags);
}

/*
 * Lockless variant of ovl_already_copied_up_locked(): return true if
 * @dentry has an upper dentry, has an upper alias or is disconnected,
 * and does not need a data copy up for @flags.
 */
bool ovl_already_copied_up(struct dentry *dentry, int flags)
{
        bool disconnected = dentry->d_flags & DCACHE_DISCONNECTED;

        /*
         * Check that copy-up has happened and that the upper alias (in
         * case of hard links) is there.
         *
         * Both checks are lockless:
         *  - false negatives: will recheck under oi->lock
         *  - false positives:
         *    + ovl_dentry_upper() uses memory barriers to ensure the
         *      upper dentry is up-to-date
         *    + ovl_dentry_has_upper_alias() relies on locking of
         *      upper parent i_rwsem to prevent reordering copy-up
         *      with rename.
         */
        if (ovl_dentry_upper(dentry) &&
            (ovl_dentry_has_upper_alias(dentry) || disconnected) &&
            !ovl_dentry_needs_data_copy_up(dentry, flags))
                return true;

        return false;
}

/*
 * The copy up "transaction" keeps an elevated mnt write count on upper mnt,
 * but leaves taking freeze protection on upper sb to lower level helpers.
 *
 * Returns 0 with the ovl_inode lock held and write access taken; the
 * caller must finish with ovl_copy_up_end(). Returns 1 if the dentry is
 * already copied up, or a negative error; in both of those cases the
 * lock is not held on return.
 */
int ovl_copy_up_start(struct dentry *dentry, int flags)
{
        struct inode *inode = d_inode(dentry);
        int err;

        err = ovl_inode_lock_interruptible(inode);
        if (err)
                return err;

        /* 1 means "already copied up"; otherwise try to get write access */
        err = ovl_already_copied_up_locked(dentry, flags) ?
              1 : ovl_get_write_access(dentry);
        if (err)
                ovl_inode_unlock(inode);

        return err;
}

/*
 * End a copy up "transaction" started by a successful (0 return)
 * ovl_copy_up_start(): drop the write access and unlock the ovl_inode.
 */
void ovl_copy_up_end(struct dentry *dentry)
{
        ovl_put_write_access(dentry);
        ovl_inode_unlock(d_inode(dentry));
}

/*
 * Return true if the origin xattr exists on @path. Only the size is
 * queried (NULL buffer); any error from the lookup yields false.
 */
bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path)
{
        int res = ovl_path_getxattr(ofs, path, OVL_XATTR_ORIGIN, NULL, 0);

        /* Zero size value means "copied up but origin unknown" */
        return res >= 0;
}

/*
 * Return true if @path is a zero size regular file that carries the
 * xwhiteout xattr. Only the xattr size is queried (NULL buffer).
 */
bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path)
{
        struct dentry *dentry = path->dentry;

        /* xattr.whiteout must be a zero size regular file */
        if (!d_is_reg(dentry))
                return false;
        if (i_size_read(d_inode(dentry)) != 0)
                return false;

        return ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUT, NULL, 0) >= 0;
}

/*
 * Load persistent uuid from xattr into s_uuid if found, or store a new
 * random generated value in s_uuid and in xattr.
 *
 * Returns true if s_uuid was set (from the existing xattr or from a newly
 * generated and stored uuid), or if uuid=auto was downgraded to uuid=null
 * because the upper root is already marked impure (s_uuid is not touched
 * in that case). Returns false, after a warning and a fallback to
 * uuid=null, if reading or storing an xattr failed.
 */
bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
                         const struct path *upperpath)
{
        /* Only selects "set" vs. "get" in the failure message */
        bool set = false;
        uuid_t uuid;
        int res;

        /* Try to load existing persistent uuid */
        res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, uuid.b,
                                UUID_SIZE);
        if (res == UUID_SIZE)
                goto set_uuid;

        /* Anything other than "no such xattr" (incl. a wrong size) fails */
        if (res != -ENODATA)
                goto fail;

        /*
         * With uuid=auto, if the uuid xattr is found, it will be used.
         * If the uuid xattr is not found, generate a persistent uuid only on
         * mount of new overlays where upper root dir is not yet marked as
         * impure.
         * An upper dir is marked as impure on copy up or lookup of its subdirs.
         */
        if (ofs->config.uuid == OVL_UUID_AUTO) {
                res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_IMPURE, NULL,
                                        0);
                if (res > 0) {
                        /* Any mount of old overlay - downgrade to uuid=null */
                        ofs->config.uuid = OVL_UUID_NULL;
                        return true;
                } else if (res == -ENODATA) {
                        /* First mount of new overlay - upgrade to uuid=on */
                        ofs->config.uuid = OVL_UUID_ON;
                } else if (res < 0) {
                        goto fail;
                }

        }

        /* Generate overlay instance uuid */
        uuid_gen(&uuid);

        /* Try to store persistent uuid */
        set = true;
        res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, uuid.b,
                           UUID_SIZE);
        if (res)
                goto fail;

set_uuid:
        super_set_uuid(sb, uuid.b, sizeof(uuid));
        return true;

fail:
        ofs->config.uuid = OVL_UUID_NULL;
        pr_warn("failed to %s uuid (%pd2, err=%i); falling back to uuid=null.\n",
                set ? "set" : "get", upperpath->dentry, res);
        return false;
}

/*
 * Read the single-byte value of xattr @ox from directory @path.
 * Returns that byte, or 0 if @path is not a directory or if reading the
 * xattr did not return exactly one byte.
 */
char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path,
                           enum ovl_xattr ox)
{
        char val;

        if (!d_is_dir(path->dentry))
                return 0;

        if (ovl_path_getxattr(ofs, path, ox, &val, 1) != 1)
                return 0;

        return val;
}

/*
 * Name suffix of each overlay xattr. The macro names are the enum ovl_xattr
 * constant names with _POSTFIX appended, so that OVL_XATTR_TAB_ENTRY() can
 * build them by token pasting.
 */
#define OVL_XATTR_OPAQUE_POSTFIX        "opaque"
#define OVL_XATTR_REDIRECT_POSTFIX        "redirect"
#define OVL_XATTR_ORIGIN_POSTFIX        "origin"
#define OVL_XATTR_IMPURE_POSTFIX        "impure"
#define OVL_XATTR_NLINK_POSTFIX                "nlink"
#define OVL_XATTR_UPPER_POSTFIX                "upper"
#define OVL_XATTR_UUID_POSTFIX                "uuid"
#define OVL_XATTR_METACOPY_POSTFIX        "metacopy"
#define OVL_XATTR_PROTATTR_POSTFIX        "protattr"
#define OVL_XATTR_XWHITEOUT_POSTFIX        "whiteout"

/*
 * Build one table row for xattr @x: slot [false] holds the name with
 * OVL_XATTR_TRUSTED_PREFIX, slot [true] the name with OVL_XATTR_USER_PREFIX.
 */
#define OVL_XATTR_TAB_ENTRY(x) \
        [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \
                [true] = OVL_XATTR_USER_PREFIX x ## _POSTFIX }

/*
 * Full xattr names, indexed by enum ovl_xattr value and then by a boolean
 * selecting the user (true) or trusted (false) prefix.
 */
const char *const ovl_xattr_table[][2] = {
        OVL_XATTR_TAB_ENTRY(OVL_XATTR_OPAQUE),
        OVL_XATTR_TAB_ENTRY(OVL_XATTR_REDIRECT),
        OVL_XATTR_TAB_ENTRY(OVL_XATTR_ORIGIN),
        OVL_XATTR_TAB_ENTRY(OVL_XATTR_IMPURE),
        OVL_XATTR_TAB_ENTRY(OVL_XATTR_NLINK),
        OVL_XATTR_TAB_ENTRY(OVL_XATTR_UPPER),
        OVL_XATTR_TAB_ENTRY(OVL_XATTR_UUID),
        OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
        OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR),
        OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT),
};

/*
 * Set xattr @ox on @upperdentry, tolerating an upper fs without xattr
 * support.
 *
 * If xattrs are already known to be unsupported (ofs->noxattr), return
 * @xerr without trying. If the set fails with -EOPNOTSUPP, warn, remember
 * that in ofs->noxattr and return @xerr. Otherwise return the result of
 * ovl_setxattr().
 */
int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
                       enum ovl_xattr ox, const void *value, size_t size,
                       int xerr)
{
        int err;

        if (ofs->noxattr)
                return xerr;

        err = ovl_setxattr(ofs, upperdentry, ox, value, size);
        if (err != -EOPNOTSUPP)
                return err;

        pr_warn("cannot set %s xattr on upper\n", ovl_xattr(ofs, ox));
        ofs->noxattr = true;
        return xerr;
}

/*
 * Mark a directory as impure: set the impure xattr ("y") on @upperdentry
 * and, on success, the OVL_IMPURE flag on the overlay inode of @dentry.
 * Does nothing if the flag is already set. Returns 0 or a negative error.
 */
int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry)
{
        struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
        int err;

        if (ovl_test_flag(OVL_IMPURE, d_inode(dentry)))
                return 0;

        /*
         * Do not fail when upper doesn't support xattrs.
         * Upper inodes won't have origin nor redirect xattr anyway.
         */
        err = ovl_check_setxattr(ofs, upperdentry, OVL_XATTR_IMPURE, "y", 1, 0);
        if (err)
                return err;

        ovl_set_flag(OVL_IMPURE, d_inode(dentry));
        return 0;
}


/* Max length of the protattr xattr value buffers below */
#define OVL_PROTATTR_MAX 32 /* Reserved for future flags */

/*
 * Apply the protection flags stored in the protattr xattr of @upper to the
 * overlay @inode. The xattr value is a sequence of 'a' (S_APPEND) and 'i'
 * (S_IMMUTABLE) characters. An empty value or any other character is
 * reported (ratelimited) and the inode flags are left unchanged. If the
 * xattr cannot be read, nothing is done.
 */
void ovl_check_protattr(struct inode *inode, struct dentry *upper)
{
        struct ovl_fs *ofs = OVL_FS(inode->i_sb);
        u32 iflags = inode->i_flags & OVL_PROT_I_FLAGS_MASK;
        char buf[OVL_PROTATTR_MAX+1];
        int res, n;

        res = ovl_getxattr_upper(ofs, upper, OVL_XATTR_PROTATTR, buf,
                                 OVL_PROTATTR_MAX);
        if (res < 0)
                return;

        /*
         * Initialize inode flags from overlay.protattr xattr and upper inode
         * flags.  If upper inode has those fileattr flags set (i.e. from old
         * kernel), we do not clear them on ovl_get_inode(), but we will clear
         * them on next fileattr_set().
         */
        for (n = 0; n < res; n++) {
                char flag = buf[n];

                /* Stop at the first character we do not understand */
                if (flag != 'a' && flag != 'i')
                        break;

                iflags |= (flag == 'a') ? S_APPEND : S_IMMUTABLE;
        }

        if (res && n >= res) {
                inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK);
        } else {
                pr_warn_ratelimited("incompatible overlay.protattr format (%pd2, len=%d)\n",
                                    upper, res);
        }
}

/*
 * Store the append/immutable protection flags requested in @fa in the
 * protattr xattr of @upper and in the overlay @inode flags, then mask
 * those flags out of @fa so they are not set on the upper inode.
 *
 * Returns 0 on success or a negative error, in which case neither the
 * inode flags nor @fa are modified.
 */
int ovl_set_protattr(struct inode *inode, struct dentry *upper,
                      struct fileattr *fa)
{
        struct ovl_fs *ofs = OVL_FS(inode->i_sb);
        char buf[OVL_PROTATTR_MAX];
        u32 iflags = 0;
        int len = 0;
        int err;

        BUILD_BUG_ON(HWEIGHT32(OVL_PROT_FS_FLAGS_MASK) > OVL_PROTATTR_MAX);

        if (fa->flags & FS_APPEND_FL) {
                buf[len++] = 'a';
                iflags |= S_APPEND;
        }
        if (fa->flags & FS_IMMUTABLE_FL) {
                buf[len++] = 'i';
                iflags |= S_IMMUTABLE;
        }

        /*
         * Do not allow setting protection flags when upper doesn't support
         * xattrs, because we do not set those fileattr flags on upper inode.
         * Remove the xattr if it exists and all protection flags are cleared.
         */
        if (len) {
                err = ovl_check_setxattr(ofs, upper, OVL_XATTR_PROTATTR,
                                         buf, len, -EPERM);
                if (err)
                        return err;
        } else if (inode->i_flags & OVL_PROT_I_FLAGS_MASK) {
                err = ovl_removexattr(ofs, upper, OVL_XATTR_PROTATTR);
                /* A missing xattr or missing xattr support is not an error */
                if (err && err != -EOPNOTSUPP && err != -ENODATA)
                        return err;
        }

        inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK);

        /* Mask out the fileattr flags that should not be set in upper inode */
        fa->flags &= ~OVL_PROT_FS_FLAGS_MASK;
        fa->fsx_xflags &= ~OVL_PROT_FSX_FLAGS_MASK;

        return 0;
}

/*
 * Try to mark the inode of @dentry as in use (I_OVL_INUSE), under i_lock.
 * Returns true if the flag was not set and has now been set by this call,
 * false if it was already set.
 *
 * Caller must hold a reference to inode to prevent it from being freed while
 * it is marked inuse.
 */
bool ovl_inuse_trylock(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        bool locked;

        spin_lock(&inode->i_lock);
        locked = !(inode->i_state & I_OVL_INUSE);
        if (locked)
                inode->i_state |= I_OVL_INUSE;
        spin_unlock(&inode->i_lock);

        return locked;
}

/*
 * Clear the I_OVL_INUSE mark on the inode of @dentry, under i_lock.
 * Warns if the mark was not set. A NULL @dentry is a no-op.
 */
void ovl_inuse_unlock(struct dentry *dentry)
{
        struct inode *inode;

        if (!dentry)
                return;

        inode = d_inode(dentry);

        spin_lock(&inode->i_lock);
        WARN_ON(!(inode->i_state & I_OVL_INUSE));
        inode->i_state &= ~I_OVL_INUSE;
        spin_unlock(&inode->i_lock);
}

/* Return true if the inode of @dentry is marked I_OVL_INUSE (under i_lock) */
bool ovl_is_inuse(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        bool inuse;

        spin_lock(&inode->i_lock);
        inuse = !!(inode->i_state & I_OVL_INUSE);
        spin_unlock(&inode->i_lock);

        return inuse;
}

/*
 * Does this overlay dentry need to be indexed on copy up?
 *
 * Never without a lower dentry or without an index dir. Otherwise always
 * when ovl_index_all(), and else only for lower non-directories with
 * more than one link.
 */
bool ovl_need_index(struct dentry *dentry)
{
        struct dentry *lower = ovl_dentry_lower(dentry);

        if (!lower || !ovl_indexdir(dentry->d_sb))
                return false;

        /* Index all files for NFS export and consistency verification */
        if (ovl_index_all(dentry->d_sb))
                return true;

        /* Index only lower hardlinks on copy up */
        return !d_is_dir(lower) && d_inode(lower)->i_nlink > 1;
}

/*
 * Remove (or whiteout) the index entry that belongs to an overlay dentry.
 *
 * The index entry is looked up in the index directory under the name built
 * from the lower dentry. With ovl_index_all() it is replaced by a whiteout,
 * otherwise it is simply removed. If the upper inode is a non-directory whose
 * nlink is not 1, nothing is removed; the overlay nlink is reset instead (see
 * the comment in the body).
 *
 * Failures are only logged with pr_err(); nothing is returned to the caller.
 *
 * Caller must hold OVL_I(inode)->lock
 */
static void ovl_cleanup_index(struct dentry *dentry)
{
        struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
        struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
        struct inode *dir = indexdir->d_inode;
        struct dentry *lowerdentry = ovl_dentry_lower(dentry);
        struct dentry *upperdentry = ovl_dentry_upper(dentry);
        struct dentry *index = NULL;
        struct inode *inode;
        struct qstr name = { };
        bool got_write = false;
        int err;

        /* Build the index entry name for the lower dentry; freed at out: */
        err = ovl_get_index_name(ofs, lowerdentry, &name);
        if (err)
                goto fail;

        err = ovl_want_write(dentry);
        if (err)
                goto fail;

        /* From here on, write access must be dropped on the way out */
        got_write = true;
        inode = d_inode(upperdentry);
        if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) {
                pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n",
                                    upperdentry, inode->i_ino, inode->i_nlink);
                /*
                 * We either have a bug with persistent union nlink or a lower
                 * hardlink was added while overlay is mounted. Adding a lower
                 * hardlink and then unlinking all overlay hardlinks would drop
                 * overlay nlink to zero before all upper inodes are unlinked.
                 * As a safety measure, when that situation is detected, set
                 * the overlay nlink to the index inode nlink minus one for the
                 * index entry itself.
                 */
                set_nlink(d_inode(dentry), inode->i_nlink - 1);
                /* NOTE: the return value of ovl_set_nlink_upper() is ignored */
                ovl_set_nlink_upper(dentry);
                goto out;
        }

        inode_lock_nested(dir, I_MUTEX_PARENT);
        index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
        /* Only meaningful if IS_ERR(index); overwritten in the other branches */
        err = PTR_ERR(index);
        if (IS_ERR(index)) {
                /* Don't hand an error pointer to dput() at out: */
                index = NULL;
        } else if (ovl_index_all(dentry->d_sb)) {
                /* Whiteout orphan index to block future open by handle */
                err = ovl_cleanup_and_whiteout(OVL_FS(dentry->d_sb),
                                               dir, index);
        } else {
                /* Cleanup orphan index entries */
                err = ovl_cleanup(ofs, dir, index);
        }

        inode_unlock(dir);
        if (err)
                goto fail;

        /* Common exit; also reached from fail: after the error was logged */
out:
        if (got_write)
                ovl_drop_write(dentry);
        kfree(name.name);
        dput(index);
        return;

fail:
        pr_err("cleanup index of '%pd2' failed (%i)\n", dentry, err);
        goto out;
}

/*
 * Operations that change overlay inode and upper inode nlink need to be
 * synchronized with copy up for persistent nlink accounting.
 *
 * On success (return 0) the overlay inode lock is held and write access has
 * been taken with ovl_want_write(); ovl_nlink_end() releases both. On failure
 * an error is returned and neither the lock nor write access is held.
 */
int ovl_nlink_start(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        const struct cred *old_cred;
        int err;

        if (WARN_ON(!inode))
                return -ENOENT;

        /*
         * When inodes index is enabled, we store the union overlay nlink
         * in an xattr on the index inode. When whiting out an indexed lower,
         * we need to decrement the overlay persistent nlink, but before the
         * first copy up, we have no upper index inode to store the xattr.
         *
         * As a workaround, before whiteout/rename over an indexed lower,
         * copy up to create the upper index. Creating the upper index will
         * initialize the overlay nlink, so it could be dropped if unlink
         * or rename succeeds.
         *
         * TODO: implement metadata only index copy up when called with
         *       ovl_copy_up_flags(dentry, O_PATH).
         */
        if (ovl_need_index(dentry) && !ovl_dentry_has_upper_alias(dentry)) {
                err = ovl_copy_up(dentry);
                if (err)
                        return err;
        }

        err = ovl_inode_lock_interruptible(inode);
        if (err)
                return err;

        err = ovl_want_write(dentry);
        if (err)
                goto out_unlock;

        /*
         * Directories and non-indexed inodes need no nlink xattr update;
         * return with the inode lock and write access still held.
         */
        if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode))
                return 0;

        old_cred = ovl_override_creds(dentry->d_sb);
        /*
         * The overlay inode nlink should be incremented/decremented IFF the
         * upper operation succeeds, along with nlink change of upper inode.
         * Therefore, before link/unlink/rename, we store the union nlink
         * value relative to the upper inode nlink in an upper inode xattr.
         */
        err = ovl_set_nlink_upper(dentry);
        revert_creds(old_cred);
        if (err)
                goto out_drop_write;

        return 0;

        /* Error unwinding: release in reverse order of acquisition */
out_drop_write:
        ovl_drop_write(dentry);
out_unlock:
        ovl_inode_unlock(inode);

        return err;
}

void ovl_nlink_end(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        ovl_drop_write(dentry);

        if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) {
                const struct cred *old_cred;

                old_cred = ovl_override_creds(dentry->d_sb);
                ovl_cleanup_index(dentry);
                revert_creds(old_cred);
        }

        ovl_inode_unlock(inode);
}

/*
 * Take the rename locks on @workdir and @upperdir.
 *
 * Returns 0 with both directories locked via lock_rename(). Returns -EIO,
 * with nothing locked, if the two are the same dentry, if lock_rename()
 * fails, or if it reports a trap (one directory is an ancestor of the other).
 */
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
{
        struct dentry *trap;

        /* Workdir should not be the same as upperdir */
        if (workdir != upperdir) {
                trap = lock_rename(workdir, upperdir);
                if (!IS_ERR(trap)) {
                        if (!trap)
                                return 0;

                        /* Workdir should not be subdir of upperdir and vice versa */
                        unlock_rename(workdir, upperdir);
                }
        }

        pr_err("failed to lock workdir+upperdir\n");
        return -EIO;
}

/*
 * Read and sanity-check the metacopy xattr of @path.
 *
 * Returns a negative error on failure, 0 if there is no metacopy xattr (or
 * the inode is not a regular file), and the metacopy data size if the xattr
 * was found. An empty xattr is reported as OVL_METACOPY_MIN_SIZE so that it
 * can be told apart from "no xattr"; in that case @data, if given, is filled
 * with zeroed minimal-size metacopy data.
 *
 * @data may be NULL, in which case only the size is queried.
 */
int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path,
                             struct ovl_metacopy *data)
{
        int res;

        /* Only regular files can have metacopy xattr */
        if (!S_ISREG(d_inode(path->dentry)->i_mode))
                return 0;

        res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY,
                                data, data ? OVL_METACOPY_MAX_SIZE : 0);
        if (res < 0) {
                switch (res) {
                case -ENODATA:
                case -EOPNOTSUPP:
                        return 0;
                case -EACCES:
                        /*
                         * getxattr on user.* may fail with EACCES in case
                         * there's no read permission on the inode.  Not much
                         * we can do, other than tell the caller that this is
                         * not a metacopy inode.
                         */
                        if (ofs->config.userxattr)
                                return 0;
                        break;
                default:
                        break;
                }

                pr_warn_ratelimited("failed to get metacopy (%i)\n", res);
                return res;
        }

        if (res == 0) {
                /* Emulate empty data for zero size metacopy xattr */
                res = OVL_METACOPY_MIN_SIZE;
                if (data) {
                        memset(data, 0, res);
                        data->len = res;
                }
                return res;
        }

        if (res < OVL_METACOPY_MIN_SIZE) {
                pr_warn_ratelimited("metacopy file '%pd' has too small xattr\n",
                                    path->dentry);
                return -EIO;
        }

        /* Without a buffer there is nothing further to validate */
        if (!data)
                return res;

        if (data->version != 0) {
                pr_warn_ratelimited("metacopy file '%pd' has unsupported version\n",
                                    path->dentry);
                return -EIO;
        }

        if (res != data->len) {
                pr_warn_ratelimited("metacopy file '%pd' has invalid xattr size\n",
                                    path->dentry);
                return -EIO;
        }

        return res;
}

int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, struct ovl_metacopy *metacopy)
{
        size_t len = metacopy->len;

        /* If no flags or digest fall back to empty metacopy file */
        if (metacopy->version == 0 && metacopy->flags == 0 && metacopy->digest_algo == 0)
                len = 0;

        return ovl_check_setxattr(ofs, d, OVL_XATTR_METACOPY,
                                  metacopy, len, -EOPNOTSUPP);
}

/*
 * Is @dentry a metacopy dentry?
 *
 * Only regular files qualify. With an upper dentry, the answer is whether the
 * overlay inode lacks upper data; without one, whether the entry has more
 * than one lower.
 */
bool ovl_is_metacopy_dentry(struct dentry *dentry)
{
        struct ovl_entry *oe = OVL_E(dentry);

        if (!d_is_reg(dentry))
                return false;

        if (ovl_dentry_upper(dentry))
                return !ovl_has_upperdata(d_inode(dentry));

        return ovl_numlower(oe) > 1;
}

/*
 * Read and validate the redirect xattr of @path.
 *
 * @padding: number of extra bytes to allocate beyond the xattr value and its
 *           terminating NUL.
 *
 * Returns NULL if there is no redirect xattr (-ENODATA / -EOPNOTSUPP), an
 * ERR_PTR() on failure (-ENOMEM, -EINVAL for a malformed value, or the
 * getxattr error), and otherwise a kzalloc()ed, NUL-terminated copy of the
 * value. NOTE(review): presumably the caller is expected to kfree() the
 * returned buffer - confirm against the callers.
 *
 * A value is accepted if it is either an absolute path without empty
 * components (no "//", no trailing '/') or a single name containing no '/'.
 */
char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding)
{
        int res;
        char *s, *next, *buf = NULL;

        /* First call with a NULL buffer only queries the value size */
        res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, NULL, 0);
        if (res == -ENODATA || res == -EOPNOTSUPP)
                return NULL;
        if (res < 0)
                goto fail;
        if (res == 0)
                goto invalid;

        /* Zeroed allocation with one spare byte keeps the value NUL-terminated */
        buf = kzalloc(res + padding + 1, GFP_KERNEL);
        if (!buf)
                return ERR_PTR(-ENOMEM);

        res = ovl_path_getxattr(ofs, path, OVL_XATTR_REDIRECT, buf, res);
        if (res < 0)
                goto fail;
        if (res == 0)
                goto invalid;

        if (buf[0] == '/') {
                /*
                 * Walk the path one component at a time: skip the '/' at s,
                 * find the next '/' (or the end of string), and reject the
                 * value if the component in between is empty.
                 */
                for (s = buf; *s++ == '/'; s = next) {
                        next = strchrnul(s, '/');
                        if (s == next)
                                goto invalid;
                }
        } else {
                /* A relative redirect must be a single name */
                if (strchr(buf, '/') != NULL)
                        goto invalid;
        }

        return buf;
invalid:
        /* buf is still NULL here if the size query itself returned 0 */
        pr_warn_ratelimited("invalid redirect (%s)\n", buf);
        res = -EINVAL;
        goto err_free;
fail:
        pr_warn_ratelimited("failed to get redirect (%i)\n", res);
err_free:
        kfree(buf);
        return ERR_PTR(res);
}

/*
 * Make sure the fs-verity info of the inode behind @datapath is loaded.
 *
 * Returns 0 on success (including when nothing had to be done) or the error
 * from opening the file.
 *
 * Call with mounter creds as it may open the file.
 */
int ovl_ensure_verity_loaded(struct path *datapath)
{
        struct inode *inode = d_inode(datapath->dentry);
        struct file *filp;

        /* Nothing to do if verity info is already there or not applicable */
        if (fsverity_active(inode) || !IS_VERITY(inode))
                return 0;

        /*
         * If this inode was not yet opened, the verity info hasn't been
         * loaded yet, so we need to do that here to force it into memory.
         */
        filp = kernel_file_open(datapath, O_RDONLY, current_cred());
        if (IS_ERR(filp))
                return PTR_ERR(filp);

        fput(filp);
        return 0;
}

/*
 * Check the fs-verity digest recorded in the metacopy xattr of @metapath
 * against the actual fs-verity digest of @datapath.
 *
 * Returns 0 if verity is disabled, @metapath is not a regular file, the
 * digests match, or no digest is recorded and verity is not required.
 * Returns -EIO on a missing-but-required or mismatching digest, or the error
 * from reading the metacopy xattr.
 */
int ovl_validate_verity(struct ovl_fs *ofs,
                        struct path *metapath,
                        struct path *datapath)
{
        u8 actual_digest[FS_VERITY_MAX_DIGEST_SIZE];
        struct ovl_metacopy metacopy_data;
        int xattr_size, xattr_digest_size, digest_size;
        u8 verity_algo;

        if (!ofs->config.verity_mode)
                return 0;

        /* Verity only works on regular files */
        if (!S_ISREG(d_inode(metapath->dentry)->i_mode))
                return 0;

        xattr_size = ovl_check_metacopy_xattr(ofs, metapath, &metacopy_data);
        if (xattr_size < 0)
                return xattr_size;

        /* No xattr at all, or an xattr without a digest */
        if (!xattr_size || !metacopy_data.digest_algo) {
                if (ofs->config.verity_mode != OVL_VERITY_REQUIRE)
                        return 0;

                pr_warn_ratelimited("metacopy file '%pd' has no digest specified\n",
                                    metapath->dentry);
                return -EIO;
        }

        xattr_digest_size = ovl_metadata_digest_size(&metacopy_data);

        if (ovl_ensure_verity_loaded(datapath) < 0) {
                pr_warn_ratelimited("lower file '%pd' failed to load fs-verity info\n",
                                    datapath->dentry);
                return -EIO;
        }

        digest_size = fsverity_get_digest(d_inode(datapath->dentry), actual_digest,
                                          &verity_algo, NULL);
        if (digest_size == 0) {
                pr_warn_ratelimited("lower file '%pd' has no fs-verity digest\n", datapath->dentry);
                return -EIO;
        }

        /* Size, algorithm and digest bytes all have to agree */
        if (xattr_digest_size == digest_size &&
            metacopy_data.digest_algo == verity_algo &&
            memcmp(metacopy_data.digest, actual_digest, xattr_digest_size) == 0)
                return 0;

        pr_warn_ratelimited("lower file '%pd' has the wrong fs-verity digest\n",
                            datapath->dentry);
        return -EIO;
}

/*
 * Fill @metacopy with the fs-verity digest of @src.
 *
 * On success the digest and digest_algo fields are set and metacopy->len is
 * increased by the digest size. Returns 0 if verity is disabled, @src is not
 * a regular file, the digest was stored, or no digest is available and verity
 * is not required. Returns -EIO otherwise.
 */
int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src,
                          struct ovl_metacopy *metacopy)
{
        int digest_size;

        if (!ofs->config.verity_mode)
                return 0;
        if (!S_ISREG(d_inode(src->dentry)->i_mode))
                return 0;

        if (ovl_ensure_verity_loaded(src) < 0) {
                pr_warn_ratelimited("lower file '%pd' failed to load fs-verity info\n",
                                    src->dentry);
                return -EIO;
        }

        digest_size = fsverity_get_digest(d_inode(src->dentry),
                                          metacopy->digest, &metacopy->digest_algo, NULL);
        if (digest_size != 0 &&
            !WARN_ON_ONCE(digest_size > FS_VERITY_MAX_DIGEST_SIZE)) {
                metacopy->len += digest_size;
                return 0;
        }

        /* No usable digest: only an error when verity is required */
        if (ofs->config.verity_mode != OVL_VERITY_REQUIRE)
                return 0;

        pr_warn_ratelimited("lower file '%pd' has no fs-verity digest\n",
                            src->dentry);
        return -EIO;
}

/*
 * ovl_sync_status() - Check fs sync status for volatile mounts
 *
 * Returns 1 if this is not a volatile mount and a real sync is required.
 *
 * Returns 0 if syncing can be skipped because the mount is volatile and no
 * errors have occurred on the upperdir since the mount (or there is no upper
 * mount at all).
 *
 * Returns -errno if it is a volatile mount and an error has occurred since
 * the last mount. If the error code changes, the latest error code is
 * returned.
 */
int ovl_sync_status(struct ovl_fs *ofs)
{
        struct vfsmount *upper_mnt;

        if (ovl_should_sync(ofs))
                return 1;

        upper_mnt = ovl_upper_mnt(ofs);

        return upper_mnt ?
                errseq_check(&upper_mnt->mnt_sb->s_wb_err, ofs->errseq) : 0;
}

/*
 * ovl_copyattr() - copy inode attributes from layer to ovl inode
 *
 * Copies uid, gid, mode, atime, mtime, ctime and size from the real inode to
 * the overlay inode, under the overlay inode's i_lock.
 *
 * The idmapping of the real (upper or lower) layer is applied to uid and gid
 * while copying, so that the ovl inode ownership correctly reflects the
 * ownership of an idmapped layer. For example, an idmapped upper or lower
 * layer mapping id 1001 to id 1000 will take care to map any lower or upper
 * inode owned by id 1001 to id 1000. These mapping helpers are nops when the
 * relevant layer isn't idmapped.
 */
void ovl_copyattr(struct inode *inode)
{
        struct path realpath;
        struct inode *realinode = ovl_i_path_real(inode, &realpath);
        struct mnt_idmap *real_idmap = mnt_idmap(realpath.mnt);

        spin_lock(&inode->i_lock);

        /* Ownership goes through the real layer's idmapping */
        inode->i_uid = vfsuid_into_kuid(i_uid_into_vfsuid(real_idmap, realinode));
        inode->i_gid = vfsgid_into_kgid(i_gid_into_vfsgid(real_idmap, realinode));

        /* The remaining attributes are copied verbatim */
        inode->i_mode = realinode->i_mode;
        inode_set_atime_to_ts(inode, inode_get_atime(realinode));
        inode_set_mtime_to_ts(inode, inode_get_mtime(realinode));
        inode_set_ctime_to_ts(inode, inode_get_ctime(realinode));
        i_size_write(inode, i_size_read(realinode));

        spin_unlock(&inode->i_lock);
}


































































    1 











































    2 
    2 


    2 












    2 
    1 



    2 
























































































































































































    2 








    3 






























    3 
































    3 














    2 
    2 
    1 
    1 
    1 
    1 




























































































    1 
    1 


























































































































































































    2 

    1 

    2 





































    2 


    3 




    2 
    1 







    3 





















    2 
    3 








































































































































    2 



    2 






    1 

    2 




    2 
    2 



    2 












































    1 



    1 
    1 











































































































































































































































    2 





















    2 
    2 

































    2 










    2 
    2 




























    1 



















































































































































































































    1 

















    1 

















    1 
    1 



















    1 

































    1 



    1 

















































































































































    3 























    3 






















    3 










































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/super.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  super.c contains code to handle: - mount structures
 *                                   - super-block tables
 *                                   - filesystem drivers list
 *                                   - mount system call
 *                                   - umount system call
 *                                   - ustat system call
 *
 * GK 2/5/95  -  Changed to support mounting the root fs via NFS
 *
 *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
 *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
 *  Added options to /proc/mounts:
 *    Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
 *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
 *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
 */

#include <linux/export.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/writeback.h>                /* for the emergency remount stuff */
#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/rculist_bl.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
#include <linux/fs_context.h>
#include <uapi/linux/mount.h>
#include "internal.h"

/* Forward declaration; the definition is not in this part of the file. */
static int thaw_super_locked(struct super_block *sb, enum freeze_holder who);

/*
 * File-local list head and spinlock.
 * NOTE(review): presumably super_blocks links every super_block and sb_lock
 * protects that list - confirm against the users further down in the file.
 */
static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);

/*
 * One name per freeze level (the array has SB_FREEZE_LEVELS entries).
 * NOTE(review): presumably used to name the per-level sb_writers locks for
 * lockdep - confirm where the array is consumed.
 */
static char *sb_writers_name[SB_FREEZE_LEVELS] = {
        "sb_writers",
        "sb_pagefaults",
        "sb_internal",
};

/*
 * Acquire @sb->s_umount: for writing if @excl is true, for reading otherwise.
 * Unlike super_lock(), this does not wait for the superblock to become ready.
 */
static inline void __super_lock(struct super_block *sb, bool excl)
{
        if (!excl) {
                down_read(&sb->s_umount);
                return;
        }

        down_write(&sb->s_umount);
}

/*
 * Release @sb->s_umount. @excl must say how the lock was taken: true for the
 * write side, false for the read side.
 */
static inline void super_unlock(struct super_block *sb, bool excl)
{
        if (!excl) {
                up_read(&sb->s_umount);
                return;
        }

        up_write(&sb->s_umount);
}

/* Acquire the write side of @sb->s_umount via __super_lock(). */
static inline void __super_lock_excl(struct super_block *sb)
{
        __super_lock(sb, true);
}

/* Release the write side of @sb->s_umount via super_unlock(). */
static inline void super_unlock_excl(struct super_block *sb)
{
        super_unlock(sb, true);
}

/* Release the read side of @sb->s_umount via super_unlock(). */
static inline void super_unlock_shared(struct super_block *sb)
{
        super_unlock(sb, false);
}

/*
 * Test whether any bit of @flags is set in @sb->s_flags, reading s_flags
 * with acquire semantics.
 */
static bool super_flags(const struct super_block *sb, unsigned int flags)
{
        /*
         * The acquire load pairs with smp_store_release() in super_wake()
         * and ensures that we see @flags after we're woken.
         */
        return (smp_load_acquire(&sb->s_flags) & flags) != 0;
}

/**
 * super_lock - wait for superblock to become ready and lock it
 * @sb: superblock to wait for
 * @excl: whether exclusive access is required
 *
 * If the superblock has passed through neither vfs_get_tree() nor
 * generic_shutdown_super() yet, wait for that to happen. Either superblock
 * creation will succeed and SB_BORN is set by vfs_get_tree() or we're
 * woken and we'll see SB_DYING.
 *
 * The caller must have acquired a temporary reference on @sb->s_count and
 * must not already hold @sb->s_umount.
 *
 * Return: The function returns true if SB_BORN was set and with
 *         s_umount held. The function returns false if SB_DYING was
 *         set and without s_umount held.
 */
static __must_check bool super_lock(struct super_block *sb, bool excl)
{
        lockdep_assert_not_held(&sb->s_umount);

        /* wait until the superblock is ready or dying */
        wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING));

        /* Don't pointlessly acquire s_umount. */
        if (super_flags(sb, SB_DYING))
                return false;

        __super_lock(sb, excl);

        /*
         * Has gone through generic_shutdown_super() in the meantime.
         * @sb->s_root is NULL and @sb->s_active is 0. No one needs to
         * grab a reference to this. Tell them so.
         */
        if (sb->s_flags & SB_DYING) {
                super_unlock(sb, excl);
                return false;
        }

        /* Not dying, so the wait above can only have ended on SB_BORN */
        WARN_ON_ONCE(!(sb->s_flags & SB_BORN));
        return true;
}

/*
 * Wait for @sb to be born or dying and try to acquire the read side of
 * @sb->s_umount. Returns true with the lock held, false (lock not held)
 * if the superblock is dying; see super_lock().
 */
static inline bool super_lock_shared(struct super_block *sb)
{
        return super_lock(sb, false);
}

/*
 * Wait for @sb to be born or dying and try to acquire the write side of
 * @sb->s_umount. Returns true with the lock held, false (lock not held)
 * if the superblock is dying; see super_lock().
 */
static inline bool super_lock_excl(struct super_block *sb)
{
        return super_lock(sb, true);
}

/*
 * super_wake - publish a lifecycle flag and wake waiters on @sb->s_flags
 * @sb: superblock whose state changed
 * @flag: flag to set; expected to be at most one bit out of
 *        SUPER_WAKE_FLAGS. Anything else triggers a one-time warning,
 *        but the value is still OR-ed into @sb->s_flags.
 */
#define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD)
static void super_wake(struct super_block *sb, unsigned int flag)
{
        WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS));
        WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1);

        /*
         * Pairs with smp_load_acquire() in super_flags() (as used by
         * super_lock()) to make sure all initializations in the
         * superblock are seen by the user seeing SB_BORN set.
         */
        smp_store_release(&sb->s_flags, sb->s_flags | flag);
        /*
         * Pairs with the barrier in prepare_to_wait_event() to make sure
         * ___wait_var_event() either sees SB_BORN set or
         * waitqueue_active() check in wake_up_var() sees the waiter.
         */
        smp_mb();
        wake_up_var(&sb->s_flags);
}

/*
 * super_cache_scan - shrinker ->scan_objects callback for a superblock
 * @shrink: shrinker whose ->private_data is the superblock
 * @sc: shrink control; ->nr_to_scan is split between the caches and is
 *      overwritten before each prune call
 *
 * One thing we have to be careful of with a per-sb shrinker is that we don't
 * drop the last active reference to the superblock from within the shrinker.
 * If that happens we could trigger unregistering the shrinker from within the
 * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we
 * take a passive reference to the superblock to avoid this from occurring.
 *
 * Returns SHRINK_STOP if the allocation context lacks __GFP_FS or if
 * s_umount cannot be taken via super_trylock_shared(); otherwise the sum of
 * the values returned by the individual prune calls.
 */
static unsigned long super_cache_scan(struct shrinker *shrink,
                                      struct shrink_control *sc)
{
        struct super_block *sb;
        long        fs_objects = 0;
        long        total_objects;
        long        freed = 0;
        long        dentries;
        long        inodes;

        sb = shrink->private_data;

        /*
         * Deadlock avoidance.  We may hold various FS locks, and we don't want
         * to recurse into the FS that called us in clear_inode() and friends..
         */
        if (!(sc->gfp_mask & __GFP_FS))
                return SHRINK_STOP;

        /* Holds s_umount shared until the end of the scan. */
        if (!super_trylock_shared(sb))
                return SHRINK_STOP;

        if (sb->s_op->nr_cached_objects)
                fs_objects = sb->s_op->nr_cached_objects(sb, sc);

        inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
        dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
        /* Denominator for the mult_frac() calls below; keep it non-zero. */
        total_objects = dentries + inodes + fs_objects + 1;
        if (!total_objects)
                total_objects = 1;

        /* proportion the scan between the caches */
        dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
        inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
        fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);

        /*
         * prune the dcache first as the icache is pinned by it, then
         * prune the icache, followed by the filesystem specific caches
         *
         * Ensure that we always scan at least one object - memcg kmem
         * accounting uses this to fully empty the caches.
         */
        sc->nr_to_scan = dentries + 1;
        freed = prune_dcache_sb(sb, sc);
        sc->nr_to_scan = inodes + 1;
        freed += prune_icache_sb(sb, sc);

        /* Only reached with a non-zero share for the fs-private cache. */
        if (fs_objects) {
                sc->nr_to_scan = fs_objects + 1;
                freed += sb->s_op->free_cached_objects(sb, sc);
        }

        super_unlock_shared(sb);
        return freed;
}

/*
 * super_cache_count - shrinker ->count_objects callback for a superblock
 * @shrink: shrinker whose ->private_data is the superblock
 * @sc: shrink control passed through to the counting helpers
 *
 * Returns 0 while the superblock is not yet SB_BORN, SHRINK_EMPTY when
 * nothing is cached, and otherwise the object count scaled by
 * vfs_pressure_ratio().
 */
static unsigned long super_cache_count(struct shrinker *shrink,
                                       struct shrink_control *sc)
{
        struct super_block *sb = shrink->private_data;
        long nr_cached = 0;

        /*
         * No super_trylock_shared() here: it is a scalability bottleneck,
         * and the counts may change between super_cache_count() and
         * super_cache_scan() anyway, so locking buys nothing. The shrinker
         * rwsem does not protect the filesystem operations backing
         * list_lru_shrink_count() or s_op->nr_cached_objects() either.
         *
         * What we must avoid is touching a superblock that is still being
         * mounted and hence only partially constructed.
         * super_trylock_shared() relies on an SB_BORN check for that, so
         * do the same. As no lock is held, the read barrier below is
         * matched with the one in mount_fs().
         */
        if (!(sb->s_flags & SB_BORN))
                return 0;
        smp_rmb();

        if (sb->s_op && sb->s_op->nr_cached_objects)
                nr_cached = sb->s_op->nr_cached_objects(sb, sc);

        nr_cached += list_lru_shrink_count(&sb->s_dentry_lru, sc);
        nr_cached += list_lru_shrink_count(&sb->s_inode_lru, sc);

        if (nr_cached == 0)
                return SHRINK_EMPTY;

        return vfs_pressure_ratio(nr_cached);
}

/*
 * Final teardown of a superblock, run from its ->destroy_work item (or
 * called directly by destroy_unused_super()). Releases the fsnotify and
 * security state, the user namespace reference, the subtype string and
 * the per-level s_writers rwsems, then frees the superblock itself.
 */
static void destroy_super_work(struct work_struct *work)
{
        struct super_block *sb;
        int level;

        sb = container_of(work, struct super_block, destroy_work);

        fsnotify_sb_free(sb);
        security_sb_free(sb);
        put_user_ns(sb->s_user_ns);
        kfree(sb->s_subtype);

        for (level = 0; level < SB_FREEZE_LEVELS; level++)
                percpu_free_rwsem(&sb->s_writers.rw_sem[level]);

        kfree(sb);
}

/*
 * RCU callback queued by __put_super(): hands the actual freeing off to
 * destroy_super_work() via the superblock's ->destroy_work item.
 */
static void destroy_super_rcu(struct rcu_head *head)
{
        struct super_block *sb;

        sb = container_of(head, struct super_block, rcu);

        INIT_WORK(&sb->destroy_work, destroy_super_work);
        schedule_work(&sb->destroy_work);
}

/*
 * Free a superblock that has never been seen by anyone.
 *
 * @s may be NULL, in which case nothing happens. Otherwise @s is
 * expected to still hold s_umount for writing (as taken in
 * alloc_super()); the lock is dropped here before teardown.
 */
static void destroy_unused_super(struct super_block *s)
{
        if (s) {
                super_unlock_excl(s);

                list_lru_destroy(&s->s_dentry_lru);
                list_lru_destroy(&s->s_inode_lru);
                shrinker_free(s->s_shrink);

                /* Nobody else can reference it: free synchronously. */
                destroy_super_work(&s->destroy_work);
        }
}

/**
 * alloc_super - create new superblock
 * @type: filesystem type superblock should belong to
 * @flags: the mount flags, stored verbatim in ->s_flags
 * @user_ns: User namespace for the super_block; a reference is taken
 *
 * Allocates and initializes a new &struct super_block. The superblock
 * is returned with ->s_umount held for writing, ->s_count set to 1 and
 * ->s_active set to 1. Its shrinker is allocated but not registered
 * here.
 *
 * Return: pointer to the new superblock, or %NULL if an allocation or
 * initialization step failed (everything set up so far is torn down
 * via destroy_unused_super()).
 */
static struct super_block *alloc_super(struct file_system_type *type, int flags,
                                       struct user_namespace *user_ns)
{
        struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL);
        /* All-NULL operations table used until the filesystem sets ->s_op. */
        static const struct super_operations default_op;
        int i;

        if (!s)
                return NULL;

        INIT_LIST_HEAD(&s->s_mounts);
        s->s_user_ns = get_user_ns(user_ns);
        init_rwsem(&s->s_umount);
        lockdep_set_class(&s->s_umount, &type->s_umount_key);
        /*
         * sget() can have s_umount recursion.
         *
         * When it cannot find a suitable sb, it allocates a new
         * one (this one), and tries again to find a suitable old
         * one.
         *
         * In case that succeeds, it will acquire the s_umount
         * lock of the old one. Since these are clearly distinct
         * locks, and this object isn't exposed yet, there's no
         * risk of deadlocks.
         *
         * Annotate this by putting this lock in a different
         * subclass.
         */
        down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);

        /* From here on failures go through destroy_unused_super(). */
        if (security_sb_alloc(s))
                goto fail;

        for (i = 0; i < SB_FREEZE_LEVELS; i++) {
                if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
                                        sb_writers_name[i],
                                        &type->s_writers_key[i]))
                        goto fail;
        }
        s->s_bdi = &noop_backing_dev_info;
        s->s_flags = flags;
        /* Superblocks outside the initial user namespace get SB_I_NODEV. */
        if (s->s_user_ns != &init_user_ns)
                s->s_iflags |= SB_I_NODEV;
        INIT_HLIST_NODE(&s->s_instances);
        INIT_HLIST_BL_HEAD(&s->s_roots);
        mutex_init(&s->s_sync_lock);
        INIT_LIST_HEAD(&s->s_inodes);
        spin_lock_init(&s->s_inode_list_lock);
        INIT_LIST_HEAD(&s->s_inodes_wb);
        spin_lock_init(&s->s_inode_wblist_lock);

        /* One temporary (s_count) and one active (s_active) reference. */
        s->s_count = 1;
        atomic_set(&s->s_active, 1);
        mutex_init(&s->s_vfs_rename_mutex);
        lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
        init_rwsem(&s->s_dquot.dqio_sem);
        s->s_maxbytes = MAX_NON_LFS;
        s->s_op = &default_op;
        s->s_time_gran = 1000000000;
        s->s_time_min = TIME64_MIN;
        s->s_time_max = TIME64_MAX;

        s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
                                     "sb-%s", type->name);
        if (!s->s_shrink)
                goto fail;

        s->s_shrink->scan_objects = super_cache_scan;
        s->s_shrink->count_objects = super_cache_count;
        s->s_shrink->batch = 1024;
        s->s_shrink->private_data = s;

        if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink))
                goto fail;
        if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
                goto fail;
        return s;

fail:
        destroy_unused_super(s);
        return NULL;
}

/* Superblock refcounting  */

/*
 * Drop one temporary (->s_count) reference. The caller must hold sb_lock.
 *
 * When the count reaches zero the superblock is unlinked from the
 * super_blocks list and its freeing is deferred through RCU. By then
 * the LRU lists must already be destroyed and no mounts may remain;
 * violations are reported with WARN_ON().
 */
static void __put_super(struct super_block *s)
{
        if (--s->s_count)
                return;

        list_del_init(&s->s_list);
        WARN_ON(s->s_dentry_lru.node);
        WARN_ON(s->s_inode_lru.node);
        WARN_ON(!list_empty(&s->s_mounts));
        call_rcu(&s->rcu, destroy_super_rcu);
}

/**
 * put_super - drop a temporary reference to superblock
 * @sb: superblock in question
 *
 * Drops a temporary (->s_count) reference, frees superblock if there's
 * no references left. Takes and releases sb_lock itself, so the caller
 * must not hold it.
 */
void put_super(struct super_block *sb)
{
        spin_lock(&sb_lock);
        __put_super(sb);
        spin_unlock(&sb_lock);
}

/*
 * kill_super_notify - unhash a superblock and announce that it is dead
 * @sb: superblock that has been shut down
 *
 * Removes @sb from its s_instances list and sets SB_DEAD, waking anyone
 * waiting on @sb->s_flags. Must be called without @sb->s_umount held.
 * Calling it again after SB_DEAD has been set is a no-op.
 */
static void kill_super_notify(struct super_block *sb)
{
        lockdep_assert_not_held(&sb->s_umount);

        /* already notified earlier */
        if (sb->s_flags & SB_DEAD)
                return;

        /*
         * Remove it from @fs_supers so it isn't found by new
         * sget{_fc}() walkers anymore. Any concurrent mounter still
         * managing to grab a temporary reference is guaranteed to
         * already see SB_DYING and will wait until we notify them about
         * SB_DEAD.
         */
        spin_lock(&sb_lock);
        hlist_del_init(&sb->s_instances);
        spin_unlock(&sb_lock);

        /*
         * Let concurrent mounts know that this thing is really dead.
         * We don't need @sb->s_umount here as every concurrent caller
         * will see SB_DYING and either discard the superblock or wait
         * for SB_DEAD.
         */
        super_wake(sb, SB_DEAD);
}

/**
 * deactivate_locked_super - drop an active reference to superblock
 * @s: superblock to deactivate
 *
 * Drops an active (->s_active) reference. If other active references
 * remain, only the exclusive s_umount hold is released. If this was the
 * last one, the shrinker is freed, the filesystem's ->kill_sb() is
 * invoked to shut the superblock down, waiters are told it is dead, the
 * LRU lists are destroyed and the filesystem type and temporary
 * superblock references are dropped.
 *
 * Caller holds exclusive lock on superblock; that lock is released.
 * On the final-reference path this function does not unlock s_umount
 * itself: kill_super_notify() asserts it is no longer held once
 * ->kill_sb() has returned.
 */
void deactivate_locked_super(struct super_block *s)
{
        struct file_system_type *fs = s->s_type;

        if (!atomic_dec_and_test(&s->s_active)) {
                super_unlock_excl(s);
                return;
        }

        shrinker_free(s->s_shrink);
        fs->kill_sb(s);

        kill_super_notify(s);

        /*
         * list_lru_destroy() may sleep, so it cannot run from put_super()
         * where sb_lock is held. Destroy the LRU lists here instead.
         */
        list_lru_destroy(&s->s_dentry_lru);
        list_lru_destroy(&s->s_inode_lru);

        put_filesystem(fs);
        put_super(s);
}

EXPORT_SYMBOL(deactivate_locked_super);

/**
 * deactivate_super - drop an active reference to superblock
 * @s: superblock to deactivate
 *
 * Variant of deactivate_locked_super() for callers that do *not* hold
 * the superblock lock. As long as this is not the final active
 * reference it is simply decremented. Otherwise s_umount is taken
 * exclusively and the drop is delegated to deactivate_locked_super().
 */
void deactivate_super(struct super_block *s)
{
        /* Fast path: decrement unless we would drop the last reference. */
        if (atomic_add_unless(&s->s_active, -1, 1))
                return;

        __super_lock_excl(s);
        deactivate_locked_super(s);
}

EXPORT_SYMBOL(deactivate_super);

/**
 * grab_super - acquire an active reference to a superblock
 * @sb: superblock to acquire
 *
 * Takes a temporary reference on @sb and tries to trade it for an
 * active one. Used by sget{_fc}() to wait until the superblock either
 * becomes SB_BORN or has passed through ->kill_sb() and been marked
 * SB_DEAD.
 *
 * Must be called with sb_lock held; sb_lock is released here and is
 * not held on return.
 *
 * Return: true if an active reference was acquired, in which case
 *         @sb->s_umount is held exclusively. false otherwise, after
 *         SB_DEAD has been observed on @sb.
 */
static bool grab_super(struct super_block *sb)
{
        sb->s_count++;
        spin_unlock(&sb_lock);

        if (super_lock_excl(sb)) {
                if (atomic_inc_not_zero(&sb->s_active)) {
                        /* The active reference replaces the temporary one. */
                        put_super(sb);
                        return true;
                }
                super_unlock_excl(sb);
        }

        /* No active reference to be had: wait until it is fully dead. */
        wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD));
        put_super(sb);
        return false;
}

/*
 * super_trylock_shared - try to grab ->s_umount shared
 * @sb: reference we are trying to grab
 *
 * Try to prevent fs shutdown. For callers that cannot take an active
 * reference but need the filesystem to stay up while they work on it.
 *
 * Returns true with s_umount held for reading if the lock could be
 * taken without blocking and the superblock is born, not dying and
 * still has a root. The caller must then drop s_umount when done.
 * Returns false, without the lock held, in every other case.
 *
 * Note that unlike get_super() et.al. this one does *not* bump
 * ->s_count. That is only safe because a trylock is used instead of
 * down_read(). A couple of places are OK with that, but it's very much
 * not a general-purpose interface.
 */
bool super_trylock_shared(struct super_block *sb)
{
        if (!down_read_trylock(&sb->s_umount))
                return false;

        if ((sb->s_flags & SB_DYING) || !sb->s_root ||
            !(sb->s_flags & SB_BORN)) {
                super_unlock_shared(sb);
                return false;
        }

        return true;
}

/**
 * retire_super - prevents superblock from being reused
 * @sb: superblock to retire
 *
 * The function marks superblock to be ignored in superblock test, which
 * prevents it from being reused for any new mounts.  If the superblock has
 * a private bdi, it also unregisters it, but doesn't reduce the refcount
 * of the superblock to prevent potential races.  The refcount is reduced
 * by generic_shutdown_super().  The function can not be called
 * concurrently with generic_shutdown_super().  It is safe to call the
 * function multiple times, subsequent calls have no effect.
 *
 * The marker will affect the re-use only for block-device-based
 * superblocks.  Other superblocks will still get marked if this function
 * is used, but that will not affect their reusability.
 *
 * Takes and releases @sb->s_umount exclusively; warns if @sb has no
 * ->s_bdev.
 */
void retire_super(struct super_block *sb)
{
        WARN_ON(!sb->s_bdev);
        __super_lock_excl(sb);
        /*
         * Clearing SB_I_PERSB_BDI after unregistering keeps a repeated
         * call, and the matching check in generic_shutdown_super(), from
         * unregistering the bdi a second time.
         */
        if (sb->s_iflags & SB_I_PERSB_BDI) {
                bdi_unregister(sb->s_bdi);
                sb->s_iflags &= ~SB_I_PERSB_BDI;
        }
        sb->s_iflags |= SB_I_RETIRED;
        super_unlock_excl(sb);
}
EXPORT_SYMBOL(retire_super);

/**
 * generic_shutdown_super - common helper for ->kill_sb()
 * @sb: superblock to kill
 *
 * generic_shutdown_super() does all fs-independent work on superblock
 * shutdown.  Typical ->kill_sb() should pick all fs-specific objects
 * that need destruction out of superblock, call generic_shutdown_super()
 * and release aforementioned objects.  Note: dentries and inodes _are_
 * taken care of and do not need specific handling.
 *
 * Upon calling this function, the filesystem may no longer alter or
 * rearrange the set of dentries belonging to this super_block, nor may it
 * change the attachments of dentries to inodes.
 *
 * The exclusive hold on @sb->s_umount is released by this function
 * after SB_DYING has been set.
 */
void generic_shutdown_super(struct super_block *sb)
{
        const struct super_operations *sop = sb->s_op;

        /* Cache/inode teardown only applies if a root was ever set up. */
        if (sb->s_root) {
                shrink_dcache_for_umount(sb);
                sync_filesystem(sb);
                sb->s_flags &= ~SB_ACTIVE;

                cgroup_writeback_umount();

                /* Evict all inodes with zero refcount. */
                evict_inodes(sb);

                /*
                 * Clean up and evict any inodes that still have references due
                 * to fsnotify or the security policy.
                 */
                fsnotify_sb_delete(sb);
                security_sb_delete(sb);

                if (sb->s_dio_done_wq) {
                        destroy_workqueue(sb->s_dio_done_wq);
                        sb->s_dio_done_wq = NULL;
                }

                if (sop->put_super)
                        sop->put_super(sb);

                /*
                 * Now that all potentially-encrypted inodes have been evicted,
                 * the fscrypt keyring can be destroyed.
                 */
                fscrypt_destroy_keyring(sb);

                if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes),
                                "VFS: Busy inodes after unmount of %s (%s)",
                                sb->s_id, sb->s_type->name)) {
                        /*
                         * Adding a proper bailout path here would be hard, but
                         * we can at least make it more likely that a later
                         * iput_final() or such crashes cleanly.
                         */
                        struct inode *inode;

                        spin_lock(&sb->s_inode_list_lock);
                        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                                inode->i_op = VFS_PTR_POISON;
                                inode->i_sb = VFS_PTR_POISON;
                                inode->i_mapping = VFS_PTR_POISON;
                        }
                        spin_unlock(&sb->s_inode_list_lock);
                }
        }
        /*
         * Broadcast to everyone that grabbed a temporary reference to this
         * superblock before we removed it from @fs_supers that the superblock
         * is dying. Every walker of @fs_supers outside of sget{_fc}() will now
         * discard this superblock and treat it as dead.
         *
         * We leave the superblock on @fs_supers so it can be found by
         * sget{_fc}() until we passed sb->kill_sb().
         */
        super_wake(sb, SB_DYING);
        super_unlock_excl(sb);
        /*
         * Release a non-default bdi: unregister it if it is still marked
         * as per-superblock, put it, and fall back to the noop bdi.
         */
        if (sb->s_bdi != &noop_backing_dev_info) {
                if (sb->s_iflags & SB_I_PERSB_BDI)
                        bdi_unregister(sb->s_bdi);
                bdi_put(sb->s_bdi);
                sb->s_bdi = &noop_backing_dev_info;
        }
}

EXPORT_SYMBOL(generic_shutdown_super);

/*
 * Check whether the current task may mount with context @fc.
 *
 * Filesystem types flagged FS_USERNS_MOUNT only need CAP_SYS_ADMIN in
 * the context's user namespace; every other type needs it via
 * capable().
 */
bool mount_capable(struct fs_context *fc)
{
        if (fc->fs_type->fs_flags & FS_USERNS_MOUNT)
                return ns_capable(fc->user_ns, CAP_SYS_ADMIN);

        return capable(CAP_SYS_ADMIN);
}

/**
 * sget_fc - Find or create a superblock
 * @fc:        Filesystem context.
 * @test: Comparison callback (may be NULL, in which case no existing
 *        superblock is ever matched)
 * @set: Setup callback, invoked with sb_lock held
 *
 * Create a new superblock or find an existing one.
 *
 * The @test callback is used to find a matching existing superblock.
 * Whether or not the requested parameters in @fc are taken into account
 * is specific to the @test callback that is used. They may even be
 * completely ignored.
 *
 * If an extant superblock is matched, it will be returned unless:
 *
 * (1) the namespace the filesystem context @fc and the extant
 *     superblock's namespace differ
 *
 * (2) the filesystem context @fc has requested that reusing an extant
 *     superblock is not allowed
 *
 * In both cases EBUSY will be returned.
 *
 * If no match is made, a new superblock will be allocated and basic
 * initialisation will be performed (s_type, s_fs_info and s_id will be
 * set and the @set callback will be invoked), the superblock will be
 * published and it will be returned in a partially constructed state
 * with SB_BORN and SB_ACTIVE as yet unset.
 *
 * Return: On success, an extant or newly created superblock is
 *         returned. On failure an error pointer is returned.
 */
struct super_block *sget_fc(struct fs_context *fc,
                            int (*test)(struct super_block *, struct fs_context *),
                            int (*set)(struct super_block *, struct fs_context *))
{
        struct super_block *s = NULL;
        struct super_block *old;
        struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns;
        int err;

retry:
        spin_lock(&sb_lock);
        if (test) {
                hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
                        if (test(old, fc))
                                goto share_extant_sb;
                }
        }
        /*
         * Nothing matched. Allocate outside sb_lock, then rescan, since
         * a matching superblock may have appeared while the lock was
         * dropped.
         */
        if (!s) {
                spin_unlock(&sb_lock);
                s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
                if (!s)
                        return ERR_PTR(-ENOMEM);
                goto retry;
        }

        s->s_fs_info = fc->s_fs_info;
        err = set(s, fc);
        if (err) {
                /* @fc keeps ownership of s_fs_info on failure. */
                s->s_fs_info = NULL;
                spin_unlock(&sb_lock);
                destroy_unused_super(s);
                return ERR_PTR(err);
        }
        /* s_fs_info now belongs to the superblock. */
        fc->s_fs_info = NULL;
        s->s_type = fc->fs_type;
        s->s_iflags |= fc->s_iflags;
        strscpy(s->s_id, s->s_type->name, sizeof(s->s_id));
        /*
         * Make the superblock visible on @super_blocks and @fs_supers.
         * It's in a nascent state and users should wait on SB_BORN or
         * SB_DYING to be set.
         */
        list_add_tail(&s->s_list, &super_blocks);
        hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
        spin_unlock(&sb_lock);
        get_filesystem(s->s_type);
        shrinker_register(s->s_shrink);
        return s;

share_extant_sb:
        if (user_ns != old->s_user_ns || fc->exclusive) {
                spin_unlock(&sb_lock);
                destroy_unused_super(s);
                if (fc->exclusive)
                        warnfc(fc, "reusing existing filesystem not allowed");
                else
                        warnfc(fc, "reusing existing filesystem in another namespace not allowed");
                return ERR_PTR(-EBUSY);
        }
        /* grab_super() drops sb_lock; on failure @old is dead, so rescan. */
        if (!grab_super(old))
                goto retry;
        /* Discard a speculative allocation from an earlier pass, if any. */
        destroy_unused_super(s);
        return old;
}
EXPORT_SYMBOL(sget_fc);

/**
 * sget - find or create a superblock
 * @type: filesystem type superblock should belong to
 * @test: comparison callback (may be NULL, in which case no existing
 *        superblock is ever matched)
 * @set: setup callback, invoked with sb_lock held
 * @flags: mount flags; SB_SUBMOUNT is stripped before it is stored in
 *         a newly allocated superblock
 * @data: argument passed to @test and @set
 *
 * Return: an existing superblock accepted by @test (with an active
 * reference taken by grab_super()), or a newly allocated and published
 * one. ERR_PTR(-EBUSY) if a match lives in a different user namespace,
 * ERR_PTR(-ENOMEM) if allocation fails, or ERR_PTR() of the error
 * returned by @set.
 */
struct super_block *sget(struct file_system_type *type,
                        int (*test)(struct super_block *,void *),
                        int (*set)(struct super_block *,void *),
                        int flags,
                        void *data)
{
        struct user_namespace *user_ns = current_user_ns();
        struct super_block *s = NULL;
        struct super_block *old;
        int err;

        /* We don't yet pass the user namespace of the parent
         * mount through to here so always use &init_user_ns
         * until that changes.
         */
        if (flags & SB_SUBMOUNT)
                user_ns = &init_user_ns;

retry:
        spin_lock(&sb_lock);
        if (test) {
                hlist_for_each_entry(old, &type->fs_supers, s_instances) {
                        if (!test(old, data))
                                continue;
                        if (user_ns != old->s_user_ns) {
                                spin_unlock(&sb_lock);
                                destroy_unused_super(s);
                                return ERR_PTR(-EBUSY);
                        }
                        /* grab_super() drops sb_lock; on failure rescan. */
                        if (!grab_super(old))
                                goto retry;
                        /* Discard a speculative allocation, if any. */
                        destroy_unused_super(s);
                        return old;
                }
        }
        /*
         * Nothing matched. Allocate outside sb_lock, then rescan, since
         * a matching superblock may have appeared while the lock was
         * dropped.
         */
        if (!s) {
                spin_unlock(&sb_lock);
                s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns);
                if (!s)
                        return ERR_PTR(-ENOMEM);
                goto retry;
        }

        err = set(s, data);
        if (err) {
                spin_unlock(&sb_lock);
                destroy_unused_super(s);
                return ERR_PTR(err);
        }
        s->s_type = type;
        strscpy(s->s_id, type->name, sizeof(s->s_id));
        /* Publish the new superblock on both lists under sb_lock. */
        list_add_tail(&s->s_list, &super_blocks);
        hlist_add_head(&s->s_instances, &type->fs_supers);
        spin_unlock(&sb_lock);
        get_filesystem(type);
        shrinker_register(s->s_shrink);
        return s;
}
EXPORT_SYMBOL(sget);

/**
 * drop_super - release a superblock held with s_umount shared
 * @sb: superblock to release
 *
 * Drops the read hold on @sb->s_umount and then the temporary
 * (->s_count) reference, e.g. for a superblock obtained from
 * user_get_super() with @excl false.
 */
void drop_super(struct super_block *sb)
{
        super_unlock_shared(sb);
        put_super(sb);
}

EXPORT_SYMBOL(drop_super);

/**
 * drop_super_exclusive - release a superblock held with s_umount exclusive
 * @sb: superblock to release
 *
 * Drops the write hold on @sb->s_umount and then the temporary
 * (->s_count) reference, e.g. for a superblock obtained from
 * user_get_super() with @excl true.
 */
void drop_super_exclusive(struct super_block *sb)
{
        super_unlock_excl(sb);
        put_super(sb);
}
EXPORT_SYMBOL(drop_super_exclusive);

/*
 * __iterate_supers - call @f on every superblock that is not dying
 * @f: function to call
 *
 * Unlike iterate_supers(), @f is invoked without s_umount held and
 * without waiting for SB_BORN; only superblocks already flagged
 * SB_DYING are skipped. A temporary reference is held across the call.
 */
static void __iterate_supers(void (*f)(struct super_block *))
{
        struct super_block *sb, *p = NULL;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (super_flags(sb, SB_DYING))
                        continue;
                sb->s_count++;
                spin_unlock(&sb_lock);

                f(sb);

                spin_lock(&sb_lock);
                /*
                 * Drop the previous entry only now. The reference on the
                 * current one keeps it on the list (see __put_super()),
                 * so the iterator can advance from it.
                 */
                if (p)
                        __put_super(p);
                p = sb;
        }
        if (p)
                __put_super(p);
        spin_unlock(&sb_lock);
}
/**
 * iterate_supers - call function for all active superblocks
 * @f: function to call
 * @arg: argument to pass to it
 *
 * Scans the superblock list and calls given function, passing it
 * locked superblock and given argument. @f runs with @sb->s_umount
 * held shared and is only called for superblocks that could be locked
 * via super_lock_shared() and still have ->s_root set.
 */
void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
{
        struct super_block *sb, *p = NULL;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                bool locked;

                /* Temporary reference so sb_lock can be dropped. */
                sb->s_count++;
                spin_unlock(&sb_lock);

                locked = super_lock_shared(sb);
                if (locked) {
                        if (sb->s_root)
                                f(sb, arg);
                        super_unlock_shared(sb);
                }

                spin_lock(&sb_lock);
                /*
                 * Drop the previous entry only now. The reference on the
                 * current one keeps it on the list (see __put_super()),
                 * so the iterator can advance from it.
                 */
                if (p)
                        __put_super(p);
                p = sb;
        }
        if (p)
                __put_super(p);
        spin_unlock(&sb_lock);
}

/**
 * iterate_supers_type - call function for superblocks of given type
 * @type: fs type
 * @f: function to call
 * @arg: argument to pass to it
 *
 * Scans the @type->fs_supers list and calls given function, passing it
 * locked superblock and given argument. @f runs with @sb->s_umount
 * held shared and is only called for superblocks that could be locked
 * via super_lock_shared() and still have ->s_root set.
 */
void iterate_supers_type(struct file_system_type *type,
        void (*f)(struct super_block *, void *), void *arg)
{
        struct super_block *sb, *p = NULL;

        spin_lock(&sb_lock);
        hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
                bool locked;

                /* Temporary reference so sb_lock can be dropped. */
                sb->s_count++;
                spin_unlock(&sb_lock);

                locked = super_lock_shared(sb);
                if (locked) {
                        if (sb->s_root)
                                f(sb, arg);
                        super_unlock_shared(sb);
                }

                spin_lock(&sb_lock);
                /* The previous entry's reference is dropped one step late. */
                if (p)
                        __put_super(p);
                p = sb;
        }
        if (p)
                __put_super(p);
        spin_unlock(&sb_lock);
}

EXPORT_SYMBOL(iterate_supers_type);

/*
 * user_get_super - look up a live superblock by device number
 * @dev: device number to match against ->s_dev
 * @excl: take s_umount exclusively (true) or shared (false)
 *
 * Only the first superblock on the list whose ->s_dev equals @dev is
 * considered. If it can be locked and still has a root, it is returned
 * with s_umount held in the requested mode and a temporary reference
 * taken. Otherwise the reference is dropped and NULL is returned.
 */
struct super_block *user_get_super(dev_t dev, bool excl)
{
        struct super_block *sb;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (sb->s_dev != dev)
                        continue;

                sb->s_count++;
                spin_unlock(&sb_lock);

                /* Still alive? */
                if (super_lock(sb, excl)) {
                        if (sb->s_root)
                                return sb;
                        super_unlock(sb, excl);
                }

                /* Nope, it got unmounted: give the reference back. */
                spin_lock(&sb_lock);
                __put_super(sb);
                break;
        }
        spin_unlock(&sb_lock);
        return NULL;
}

/**
 * reconfigure_super - asks filesystem to change superblock parameters
 * @fc: The superblock and configuration
 *
 * Alters the configuration parameters of a live superblock.
 *
 * The function drops and retakes the exclusive superblock lock around
 * group_pin_kill(), so it appears to expect the caller to hold that lock
 * exclusively on entry (NOTE(review): confirm against callers).
 *
 * Return: 0 on success, or if the superblock lost its root while the lock
 * was dropped; -EINVAL if @fc->sb_flags_mask has bits outside MS_RMT_MASK;
 * -EBUSY if the superblock is not in the SB_UNFROZEN state; -EACCES when a
 * read-write remount is requested on a read-only block device; otherwise the
 * error returned by security_sb_remount(), sb_prepare_remount_readonly() or
 * the filesystem's ->reconfigure() (the latter is ignored when SB_FORCE is
 * set).
 */
int reconfigure_super(struct fs_context *fc)
{
        struct super_block *sb = fc->root->d_sb;
        int retval;
        bool remount_ro = false;
        bool remount_rw = false;
        bool force = fc->sb_flags & SB_FORCE;

        /* Only flags covered by MS_RMT_MASK may be changed here. */
        if (fc->sb_flags_mask & ~MS_RMT_MASK)
                return -EINVAL;
        if (sb->s_writers.frozen != SB_UNFROZEN)
                return -EBUSY;

        retval = security_sb_remount(sb, fc->security);
        if (retval)
                return retval;

        /* Work out whether the RO/RW state is actually going to flip. */
        if (fc->sb_flags_mask & SB_RDONLY) {
#ifdef CONFIG_BLOCK
                /* Refuse to go read-write on top of a read-only device. */
                if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev &&
                    bdev_read_only(sb->s_bdev))
                        return -EACCES;
#endif
                remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb);
                remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb);
        }

        if (remount_ro) {
                if (!hlist_empty(&sb->s_pins)) {
                        /*
                         * group_pin_kill() is called without the superblock
                         * lock, so everything checked so far has to be
                         * re-validated once the lock is taken again.
                         */
                        super_unlock_excl(sb);
                        group_pin_kill(&sb->s_pins);
                        __super_lock_excl(sb);
                        if (!sb->s_root)
                                return 0;
                        if (sb->s_writers.frozen != SB_UNFROZEN)
                                return -EBUSY;
                        remount_ro = !sb_rdonly(sb);
                }
        }
        shrink_dcache_sb(sb);

        /* If we are reconfiguring to RDONLY and current sb is read/write,
         * make sure there are no files open for writing.
         */
        if (remount_ro) {
                if (force) {
                        /* Forced: skip the check that can fail. */
                        sb_start_ro_state_change(sb);
                } else {
                        retval = sb_prepare_remount_readonly(sb);
                        if (retval)
                                return retval;
                }
        } else if (remount_rw) {
                /*
                 * Protect filesystem's reconfigure code from writes from
                 * userspace until reconfigure finishes.
                 */
                sb_start_ro_state_change(sb);
        }

        if (fc->ops->reconfigure) {
                retval = fc->ops->reconfigure(fc);
                if (retval) {
                        if (!force)
                                goto cancel_readonly;
                        /* If forced remount, go ahead despite any errors */
                        WARN(1, "forced remount of a %s fs returned %i\n",
                             sb->s_type->name, retval);
                }
        }

        /* Replace only the masked flag bits, leaving all others untouched. */
        WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) |
                                 (fc->sb_flags & fc->sb_flags_mask)));
        sb_end_ro_state_change(sb);

        /*
         * Some filesystems modify their metadata via some other path than the
         * bdev buffer cache (eg. use a private mapping, or directories in
         * pagecache, etc). Also file data modifications go via their own
         * mappings. So If we try to mount readonly then copy the filesystem
         * from bdev, we could get stale data, so invalidate it to give a best
         * effort at coherency.
         */
        if (remount_ro && sb->s_bdev)
                invalidate_bdev(sb->s_bdev);
        return 0;

cancel_readonly:
        /* ->reconfigure() failed: end the state change with s_flags unchanged. */
        sb_end_ro_state_change(sb);
        return retval;
}

/*
 * Per-superblock step of the emergency remount: try to force a live,
 * block-device-backed, writable filesystem read-only.  Any failure along
 * the way is ignored.
 */
static void do_emergency_remount_callback(struct super_block *sb)
{
        struct fs_context *fc;

        /* Without the exclusive superblock lock there is nothing to do. */
        if (!super_lock_excl(sb))
                return;

        /* Skip superblocks with no root, no bdev, or already read-only. */
        if (!sb->s_root || !sb->s_bdev || sb_rdonly(sb))
                goto unlock;

        fc = fs_context_for_reconfigure(sb->s_root,
                                        SB_RDONLY | SB_FORCE, SB_RDONLY);
        if (IS_ERR(fc))
                goto unlock;

        /* Best effort: the result of the reconfigure is deliberately dropped. */
        if (!parse_monolithic_mount_data(fc, NULL))
                (void)reconfigure_super(fc);
        put_fs_context(fc);

unlock:
        super_unlock_excl(sb);
}

static void do_emergency_remount(struct work_struct *work)
{
        __iterate_supers(do_emergency_remount_callback);
        kfree(work);
        printk("Emergency Remount complete\n");
}

/*
 * Queue an emergency remount via schedule_work().  The work item is
 * allocated with GFP_ATOMIC and freed by do_emergency_remount(); if the
 * allocation fails the request is silently dropped.
 */
void emergency_remount(void)
{
        struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

        if (!work)
                return;

        INIT_WORK(work, do_emergency_remount);
        schedule_work(work);
}

/*
 * Per-superblock step of the emergency thaw: for a superblock that could be
 * locked and still has a root, thaw its block device (when CONFIG_BLOCK is
 * enabled and it has one) and then thaw the filesystem itself on behalf of
 * FREEZE_HOLDER_USERSPACE.
 */
static void do_thaw_all_callback(struct super_block *sb)
{
        bool locked = super_lock_excl(sb);

        if (locked && sb->s_root) {
                if (IS_ENABLED(CONFIG_BLOCK))
                        /* Keep going for as long as bdev_thaw() returns 0. */
                        while (sb->s_bdev && !bdev_thaw(sb->s_bdev))
                                pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
                thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
                /*
                 * NOTE(review): no unlock on this path - presumably
                 * thaw_super_locked() releases the lock itself; confirm.
                 */
                return;
        }
        if (locked)
                super_unlock_excl(sb);
}

/*
 * Work handler queued by emergency_thaw_all(): runs do_thaw_all_callback()
 * via __iterate_supers(), frees the work item that was allocated for this
 * run and logs completion.
 */
static void do_thaw_all(struct work_struct *work)
{
        __iterate_supers(do_thaw_all_callback);
        kfree(work);
        printk(KERN_WARNING "Emergency Thaw complete\n");
}

/**
 * emergency_thaw_all - forcibly thaw every frozen filesystem
 *
 * Used for emergency unfreeze of all filesystems via SysRq.  The actual work
 * is deferred to do_thaw_all() through schedule_work(); the work item is
 * allocated with GFP_ATOMIC and the request is silently dropped if that
 * allocation fails.
 */
void emergency_thaw_all(void)
{
        struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

        if (!work)
                return;

        INIT_WORK(work, do_thaw_all);
        schedule_work(work);
}

static DEFINE_IDA(unnamed_dev_ida);

/**
 * get_anon_bdev - Allocate a block device for filesystems which don't have one.
 * @p: Pointer to a dev_t.
 *
 * Filesystems which don't use real block devices can call this function
 * to allocate a virtual block device.
 *
 * Context: Any context.  Frequently called while holding sb_lock.
 * Return: 0 on success, -EMFILE if there are no anonymous bdevs left
 * or -ENOMEM if memory allocation failed.
 */
int get_anon_bdev(dev_t *p)
{
        /*
         * Start the range at 1: many userspace utilities treat an FSID of 0
         * as invalid, so minor 0 is never handed out.
         */
        int id = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1,
                                 GFP_ATOMIC);

        /* Running out of IDs is reported as -EMFILE instead of -ENOSPC. */
        if (id < 0)
                return id == -ENOSPC ? -EMFILE : id;

        *p = MKDEV(0, id);
        return 0;
}
EXPORT_SYMBOL(get_anon_bdev);

/*
 * free_anon_bdev - release an anonymous device number
 * @dev: device number whose minor is returned to unnamed_dev_ida
 *
 * Counterpart of get_anon_bdev(), which allocates the minor from the same IDA.
 */
void free_anon_bdev(dev_t dev)
{
        ida_free(&unnamed_dev_ida, MINOR(dev));
}
EXPORT_SYMBOL(free_anon_bdev);

/*
 * set_anon_super - give a superblock an anonymous device number
 * @s: superblock whose s_dev is filled in
 * @data: unused
 *
 * Returns the result of get_anon_bdev(): 0 on success or a negative errno.
 */
int set_anon_super(struct super_block *s, void *data)
{
        return get_anon_bdev(&s->s_dev);
}
EXPORT_SYMBOL(set_anon_super);

/*
 * kill_anon_super - shut down a superblock that uses an anonymous device
 * @sb: superblock to shut down
 *
 * Runs generic_shutdown_super() and kill_super_notify() on @sb and then
 * hands its device number back through free_anon_bdev().
 */
void kill_anon_super(struct super_block *sb)
{
        /* s_dev is read up front and only released after the shutdown calls. */
        dev_t dev = sb->s_dev;
        generic_shutdown_super(sb);
        kill_super_notify(sb);
        free_anon_bdev(dev);
}
EXPORT_SYMBOL(kill_anon_super);

/*
 * kill_litter_super - kill_anon_super() preceded by d_genocide() on the root
 * @sb: superblock to shut down
 *
 * d_genocide() is only called when the superblock still has a root dentry.
 */
void kill_litter_super(struct super_block *sb)
{
        if (sb->s_root)
                d_genocide(sb->s_root);
        kill_anon_super(sb);
}
EXPORT_SYMBOL(kill_litter_super);

/*
 * set_anon_super_fc - fs_context flavoured wrapper around set_anon_super()
 * @sb: superblock whose s_dev is filled in
 * @fc: unused
 *
 * Has the (sb, fc) signature that sget_fc() is given as its "set" callback.
 */
int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
{
        return set_anon_super(sb, NULL);
}
EXPORT_SYMBOL(set_anon_super_fc);

/*
 * Superblock test callback for get_tree_keyed(): a superblock matches when
 * its s_fs_info is the same pointer as the key stored in fc->s_fs_info.
 */
static int test_keyed_super(struct super_block *sb, struct fs_context *fc)
{
        return sb->s_fs_info == fc->s_fs_info;
}

/*
 * Superblock test callback for get_tree_single(): reports a match for every
 * superblock it is asked about; both arguments are unused.
 */
static int test_single_super(struct super_block *s, struct fs_context *fc)
{
        return 1;
}

/*
 * Common helper behind get_tree_nodev(), get_tree_single() and
 * get_tree_keyed().
 *
 * Obtains a superblock from sget_fc() using @test (may be NULL) and
 * set_anon_super_fc().  A superblock that has no root yet is filled in with
 * @fill_super and marked SB_ACTIVE.  On success a reference to the root
 * dentry is stored in @fc->root.
 *
 * Returns 0 on success, the error from sget_fc(), or the error from
 * @fill_super (in which case the superblock is released with
 * deactivate_locked_super()).
 */
static int vfs_get_super(struct fs_context *fc,
                int (*test)(struct super_block *, struct fs_context *),
                int (*fill_super)(struct super_block *sb,
                                  struct fs_context *fc))
{
        struct super_block *sb = sget_fc(fc, test, set_anon_super_fc);

        if (IS_ERR(sb))
                return PTR_ERR(sb);

        /* A superblock without a root still has to be filled in. */
        if (!sb->s_root) {
                int err = fill_super(sb, fc);

                if (err) {
                        deactivate_locked_super(sb);
                        return err;
                }
                sb->s_flags |= SB_ACTIVE;
        }

        fc->root = dget(sb->s_root);
        return 0;
}

/*
 * get_tree_nodev - vfs_get_super() with no superblock test callback
 * @fc: filesystem context; receives the root in fc->root on success
 * @fill_super: helper used to initialise a superblock that has no root yet
 *
 * Returns whatever vfs_get_super() returns.
 */
int get_tree_nodev(struct fs_context *fc,
                  int (*fill_super)(struct super_block *sb,
                                    struct fs_context *fc))
{
        return vfs_get_super(fc, NULL, fill_super);
}
EXPORT_SYMBOL(get_tree_nodev);

/*
 * get_tree_single - vfs_get_super() with a test callback that always matches
 * @fc: filesystem context; receives the root in fc->root on success
 * @fill_super: helper used to initialise a superblock that has no root yet
 *
 * Returns whatever vfs_get_super() returns.
 */
int get_tree_single(struct fs_context *fc,
                  int (*fill_super)(struct super_block *sb,
                                    struct fs_context *fc))
{
        return vfs_get_super(fc, test_single_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single);

/*
 * get_tree_keyed - vfs_get_super() matching superblocks by a caller key
 * @fc: filesystem context; receives the root in fc->root on success
 * @fill_super: helper used to initialise a superblock that has no root yet
 * @key: pointer compared against each candidate superblock's s_fs_info
 *
 * Returns whatever vfs_get_super() returns.
 */
int get_tree_keyed(struct fs_context *fc,
                  int (*fill_super)(struct super_block *sb,
                                    struct fs_context *fc),
                void *key)
{
        /* test_keyed_super() reads the key back from fc->s_fs_info. */
        fc->s_fs_info = key;
        return vfs_get_super(fc, test_keyed_super, fill_super);
}
EXPORT_SYMBOL(get_tree_keyed);

/*
 * "set" callback that stores a device number in a superblock.
 * @data must point to a dev_t, which is copied into s->s_dev.
 * Always returns 0.
 */
static int set_bdev_super(struct super_block *s, void *data)
{
        s->s_dev = *(dev_t *)data;
        return 0;
}

/*
 * fs_context "set" callback used by sget_dev(): copies the dev_t that
 * fc->sget_key points at into s->s_dev via set_bdev_super().
 */
static int super_s_dev_set(struct super_block *s, struct fs_context *fc)
{
        return set_bdev_super(s, fc->sget_key);
}

/*
 * fs_context "test" callback used by sget_dev(): a superblock matches when
 * it is not flagged SB_I_RETIRED and its s_dev equals the dev_t that
 * fc->sget_key points at.
 */
static int super_s_dev_test(struct super_block *s, struct fs_context *fc)
{
        /* Retired superblocks never match, whatever their device number. */
        if (s->s_iflags & SB_I_RETIRED)
                return 0;

        return s->s_dev == *(dev_t *)fc->sget_key;
}

/**
 * sget_dev - Find or create a superblock by device number
 * @fc: Filesystem context.
 * @dev: device number
 *
 * Find or create a superblock using the provided device number that
 * will be stored in fc->sget_key.
 *
 * If an extant superblock is matched, then that will be returned with
 * an elevated reference count that the caller must transfer or discard.
 *
 * If no match is made, a new superblock will be allocated and basic
 * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will
 * be set). The superblock will be published and it will be returned in
 * a partially constructed state with SB_BORN and SB_ACTIVE as yet
 * unset.
 *
 * Return: an existing or newly created superblock on success, an error
 *         pointer on failure.
 */
struct super_block *sget_dev(struct fs_context *fc, dev_t dev)
{
        /*
         * The key points at this function's own @dev argument, so it is only
         * valid while sget_fc() below runs; super_s_dev_test() and
         * super_s_dev_set() are the callbacks that dereference it.
         */
        fc->sget_key = &dev;
        return sget_fc(fc, super_s_dev_test, super_s_dev_set);
}
EXPORT_SYMBOL(sget_dev);

#ifdef CONFIG_BLOCK
/*
 * Lock the superblock that is holder of the bdev. Returns the superblock
 * pointer if we successfully locked the superblock and it is alive. Otherwise
 * we return NULL and just unlock bdev->bd_holder_lock.
 *
 * The function must be called with bdev->bd_holder_lock and releases it.
 */
static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
        __releases(&bdev->bd_holder_lock)
{
        /* The holder of the block device is taken to be its superblock. */
        struct super_block *sb = bdev->bd_holder;
        bool locked;

        lockdep_assert_held(&bdev->bd_holder_lock);
        lockdep_assert_not_held(&sb->s_umount);
        lockdep_assert_not_held(&bdev->bd_disk->open_mutex);

        /* Make sure sb doesn't go away from under us */
        spin_lock(&sb_lock);
        sb->s_count++;
        spin_unlock(&sb_lock);

        /* bd_holder_lock is dropped on every path, before sb is locked. */
        mutex_unlock(&bdev->bd_holder_lock);

        locked = super_lock(sb, excl);

        /*
         * If the superblock wasn't already SB_DYING then we hold
         * s_umount and can safely drop our temporary reference.
         */
        put_super(sb);

        if (!locked)
                return NULL;

        /* Locked, but only a superblock with a root and SB_ACTIVE is returned. */
        if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
                super_unlock(sb, excl);
                return NULL;
        }

        return sb;
}

/*
 * ->mark_dead handler of fs_holder_ops.
 *
 * Locks the superblock holding @bdev in shared mode and, if that succeeds,
 * syncs the filesystem (skipped when @surprise is set), shrinks its dcache,
 * invalidates its inodes and calls the filesystem's ->shutdown() if it has
 * one.
 */
static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
        struct super_block *sb = bdev_super_lock(bdev, false);

        if (sb == NULL)
                return;

        if (!surprise)
                sync_filesystem(sb);

        shrink_dcache_sb(sb);
        invalidate_inodes(sb);

        /* ->shutdown() is optional. */
        if (sb->s_op->shutdown)
                sb->s_op->shutdown(sb);

        super_unlock_shared(sb);
}

static void fs_bdev_sync(struct block_device *bdev)
{
        struct super_block *sb;

        sb = bdev_super_lock(bdev, false);
        if (!sb)
                return;

        sync_filesystem(sb);
        super_unlock_shared(sb);
}

/*
 * Find the superblock holding @bdev and take an active reference on it.
 *
 * The superblock is locked exclusively through bdev_super_lock() only for
 * as long as it takes to bump s_active; it is returned unlocked.
 *
 * Returns the superblock with s_active incremented, or NULL if it could not
 * be locked or its s_active count was already zero.
 */
static struct super_block *get_bdev_super(struct block_device *bdev)
{
        struct super_block *sb = bdev_super_lock(bdev, true);
        bool pinned;

        if (!sb)
                return NULL;

        pinned = atomic_inc_not_zero(&sb->s_active);
        super_unlock_excl(sb);

        return pinned ? sb : NULL;
}

/**
 * fs_bdev_freeze - freeze owning filesystem of block device
 * @bdev: block device
 *
 * Freeze the filesystem that owns this block device if it is still
 * active.
 *
 * A filesystem that owns multiple block devices may be frozen from each
 * block device and won't be unfrozen until all block devices are
 * unfrozen. Each block device can only freeze the filesystem once as we
 * nest freezes for block devices in the block layer.
 *
 * Return: If the freeze was successful zero is returned. If the freeze
 *         failed a negative error code is returned.
 */
static int fs_bdev_freeze(struct block_device *bdev)
{
        /* Freezes coming from the block layer are nestable userspace freezes. */
        const int who = FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE;
        struct super_block *sb;
        int ret;

        lockdep_assert_held(&bdev->bd_fsfreeze_mutex);

        sb = get_bdev_super(bdev);
        if (!sb)
                return -EINVAL;

        /* Prefer the filesystem's own ->freeze_super() when it has one. */
        if (!sb->s_op->freeze_super)
                ret = freeze_super(sb, who);
        else
                ret = sb->s_op->freeze_super(sb, who);

        /* Only a successful freeze is followed by a block device sync. */
        if (ret == 0)
                ret = sync_blockdev(bdev);

        /* Drop the active reference taken by get_bdev_super(). */
        deactivate_super(sb);
        return ret;
}

/**
 * fs_bdev_thaw - thaw owning filesystem of block device
 * @bdev: block device
 *
 * Thaw the filesystem that owns this block device.
 *
 * A filesystem that owns multiple block devices may be frozen from each
 * block device and won't be unfrozen until all block devices are
 * unfrozen. Each block device can only freeze the filesystem once as we
 * nest freezes for block devices in the block layer.
 *
 * Return: If the thaw was successful zero is returned. If the thaw
 *         failed a negative error code is returned. If this function
 *         returns zero it doesn't mean that the filesystem is unfrozen
 *         as it may have been frozen multiple times (kernel may hold a
 *         freeze or might be frozen from other block devices).
 */
static int fs_bdev_thaw(struct block_device *bdev)
{
        /* Same holder flags as the matching fs_bdev_freeze(). */
        const int who = FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE;
        struct super_block *sb;
        int ret;

        lockdep_assert_held(&bdev->bd_fsfreeze_mutex);

        /* A missing superblock is unexpected here, hence the warning. */
        sb = get_bdev_super(bdev);
        if (WARN_ON_ONCE(!sb))
                return -EINVAL;

        /* Prefer the filesystem's own ->thaw_super() when it has one. */
        if (!sb->s_op->thaw_super)
                ret = thaw_super(sb, who);
        else
                ret = sb->s_op->thaw_super(sb, who);

        /* Drop the active reference taken by get_bdev_super(). */
        deactivate_super(sb);
        return ret;
}

/*
 * Block device holder operations for superblocks.  setup_bdev_super()
 * passes this table to bdev_file_open_by_dev() together with the superblock
 * as holder; each callback looks the superblock up again from the block
 * device it is given.
 */
const struct blk_holder_ops fs_holder_ops = {
        .mark_dead                = fs_bdev_mark_dead,
        .sync                        = fs_bdev_sync,
        .freeze                        = fs_bdev_freeze,
        .thaw                        = fs_bdev_thaw,
};
EXPORT_SYMBOL_GPL(fs_holder_ops);

/**
 * setup_bdev_super - open the block device of a superblock and attach it
 * @sb: superblock whose s_dev names the device to open
 * @sb_flags: superblock flags from which the open mode is derived
 * @fc: filesystem context used for error/warning messages, may be NULL
 *
 * Opens the device identified by @sb->s_dev with @sb as holder and
 * &fs_holder_ops as holder operations, records it in @sb (s_bdev_file,
 * s_bdev, s_bdi), names the superblock after the device and sets the
 * superblock block size from the device.
 *
 * Return: 0 on success; the error from bdev_file_open_by_dev() if the open
 * fails; -EACCES if write access was requested on a read-only device;
 * -EBUSY if the device is currently frozen.
 */
int setup_bdev_super(struct super_block *sb, int sb_flags,
                struct fs_context *fc)
{
        blk_mode_t mode = sb_open_mode(sb_flags);
        struct file *bdev_file;
        struct block_device *bdev;

        bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
        if (IS_ERR(bdev_file)) {
                if (fc)
                        errorf(fc, "%s: Can't open blockdev", fc->source);
                return PTR_ERR(bdev_file);
        }
        bdev = file_bdev(bdev_file);

        /*
         * This really should be in blkdev_get_by_dev, but right now can't due
         * to legacy issues that require us to allow opening a block device node
         * writable from userspace even for a read-only block device.
         */
        if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
                bdev_fput(bdev_file);
                return -EACCES;
        }

        /*
         * It is enough to check bdev was not frozen before we set
         * s_bdev as freezing will wait until SB_BORN is set.
         */
        if (atomic_read(&bdev->bd_fsfreeze_count) > 0) {
                if (fc)
                        warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
                bdev_fput(bdev_file);
                return -EBUSY;
        }
        /* The device fields of the superblock are filled in under sb_lock. */
        spin_lock(&sb_lock);
        sb->s_bdev_file = bdev_file;
        sb->s_bdev = bdev;
        sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
        if (bdev_stable_writes(bdev))
                sb->s_iflags |= SB_I_STABLE_WRITES;
        spin_unlock(&sb_lock);

        /* s_id becomes the device name; the shrinker debugfs entry follows it. */
        snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
        shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name,
                                sb->s_id);
        sb_set_blocksize(sb, block_size(bdev));
        return 0;
}
EXPORT_SYMBOL_GPL(setup_bdev_super);

/**
 * get_tree_bdev - Get a superblock based on a single block device
 * @fc: The filesystem context holding the parameters
 * @fill_super: Helper to initialise a new superblock
 */
int get_tree_bdev(struct fs_context *fc,
                int (*fill_super)(struct super_block *,
                                  struct fs_context *))
{
        struct super_block *s;
        dev_t dev;
        int error;

        if (!fc->source)
                return invalf(fc, "No source specified");

        /* Resolve the source path to a device number. */
        error = lookup_bdev(fc->source, &dev);
        if (error) {
                errorf(fc, "%s: Can't lookup blockdev", fc->source);
                return error;
        }

        fc->sb_flags |= SB_NOSEC;
        s = sget_dev(fc, dev);
        if (IS_ERR(s))
                return PTR_ERR(s);

        if (!s->s_root) {
                /* No root yet: attach the device, then fill the superblock. */
                error = setup_bdev_super(s, fc->sb_flags, fc);
                if (error == 0)
                        error = fill_super(s, fc);
                if (error != 0) {
                        deactivate_locked_super(s);
                        return error;
                }
                s->s_flags |= SB_ACTIVE;
        } else if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
                /* Don't summarily change the RO/RW state. */
                warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev);
                deactivate_locked_super(s);
                return -EBUSY;
        }

        BUG_ON(fc->root);
        fc->root = dget(s->s_root);
        return 0;
}
EXPORT_SYMBOL(get_tree_bdev);

/*
 * sget() "test" callback used by mount_bdev(): a superblock matches when it
 * is not flagged SB_I_RETIRED and its s_dev equals the dev_t that @data
 * points at.
 */
static int test_bdev_super(struct super_block *s, void *data)
{
        /* Retired superblocks never match, whatever their device number. */
        if (s->s_iflags & SB_I_RETIRED)
                return 0;

        return s->s_dev == *(dev_t *)data;
}

struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int))
{
        struct super_block *s;
        int error;
        dev_t dev;

        error = lookup_bdev(dev_name, &dev);
        if (error)
                return ERR_PTR(error);

        flags |= SB_NOSEC;
        s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev);
        if (IS_ERR(s))
                return ERR_CAST(s);

        if (s->s_root) {
                if ((flags ^ s->s_flags) & SB_RDONLY) {
                        deactivate_locked_super(s);
                        return ERR_PTR(-EBUSY);
                }
        } else {
                error = setup_bdev_super(s, flags, NULL);
                if (!error)
                        error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
                if (error) {
                        deactivate_locked_super(s);
                        return ERR_PTR(error);
                }

                s->s_flags |= SB_ACTIVE;
        }

        return dget(s->s_root);
}
EXPORT_SYMBOL(mount_bdev);

/*
 * kill_block_super - shut down a superblock backed by a block device
 * @sb: superblock to shut down
 *
 * Runs generic_shutdown_super() and then, if the superblock had a block
 * device attached beforehand, syncs that device and drops the file through
 * which it was opened.
 */
void kill_block_super(struct super_block *sb)
{
        /* Sampled before the shutdown call. */
        struct block_device *bdev = sb->s_bdev;

        generic_shutdown_super(sb);
        if (!bdev)
                return;

        sync_blockdev(bdev);
        bdev_fput(sb->s_bdev_file);
}

EXPORT_SYMBOL(kill_block_super);
#endif

struct dentry *mount_nodev(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int))
{
        int error;
        struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);

        if (IS_ERR(s))
                return ERR_CAST(s);

        error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
        if (error) {
                deactivate_locked_super(s);
                return ERR_PTR(error);
        }
        s->s_flags |= SB_ACTIVE;
        return dget(s->s_root);
}
EXPORT_SYMBOL(mount_nodev);

/*
 * reconfigure_single - reconfigure an existing superblock from mount data
 * @s: superblock to reconfigure
 * @flags: superblock flags for the reconfiguration context
 * @data: monolithic mount data to parse into the context
 *
 * Builds a temporary reconfiguration context for @s covering MS_RMT_MASK,
 * parses @data into it and applies it with reconfigure_super().
 *
 * Returns 0 on success, the error from fs_context_for_reconfigure(), a
 * negative value from parse_monolithic_mount_data(), or the result of
 * reconfigure_super().
 */
int reconfigure_single(struct super_block *s,
                       int flags, void *data)
{
        struct fs_context *fc;
        int ret;

        /*
         * The caller really needs to be passing fc down into mount_single(),
         * then a chunk of this can be removed.  [Bollocks -- AV]
         * Better yet, reconfiguration shouldn't happen at all; the second
         * mount should be rejected if the parameters are not compatible.
         */
        fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        /* Only reconfigure when the mount data parsed cleanly. */
        ret = parse_monolithic_mount_data(fc, data);
        if (ret >= 0)
                ret = reconfigure_super(fc);

        put_fs_context(fc);
        return ret;
}

/*
 * sget() "test" callback used by mount_single(): reports a match for every
 * superblock it is asked about; both arguments are unused.
 */
static int compare_single(struct super_block *s, void *p)
{
        return 1;
}

struct dentry *mount_single(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int))
{
        struct super_block *s;
        int error;

        s = sget(fs_type, compare_single, set_anon_super, flags, NULL);
        if (IS_ERR(s))
                return ERR_CAST(s);
        if (!s->s_root) {
                error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
                if (!error)
                        s->s_flags |= SB_ACTIVE;
        } else {
                error = reconfigure_single(s, flags, data);
        }
        if (unlikely(error)) {
                deactivate_locked_super(s);
                return ERR_PTR(error);
        }
        return dget(s->s_root);
}
EXPORT_SYMBOL(mount_single);

/**
 * vfs_get_tree - Get the mountable root
 * @fc: The superblock configuration context.
 *
 * The filesystem is invoked to get or create a superblock which can then later
 * be used for mounting.  The filesystem places a pointer to the root to be
 * used for mounting in @fc->root.
 */
int vfs_get_tree(struct fs_context *fc)
{
        struct super_block *sb;
        int error;

        /* A context that already has a root cannot be used again. */
        if (fc->root)
                return -EBUSY;

        /* Get the mountable root in fc->root, with a ref on the root and a ref
         * on the superblock.
         */
        error = fc->ops->get_tree(fc);
        if (error < 0)
                return error;

        /* ->get_tree() reported success, so it must have set fc->root. */
        if (!fc->root) {
                pr_err("Filesystem %s get_tree() didn't set fc->root\n",
                       fc->fs_type->name);
                /* We don't know what the locking state of the superblock is -
                 * if there is a superblock.
                 */
                BUG();
        }

        sb = fc->root->d_sb;
        WARN_ON(!sb->s_bdi);

        /*
         * super_wake() contains a memory barrier which also takes care of
         * ordering for super_cache_count(). We place it before setting
         * SB_BORN as the data dependency between the two functions is
         * the superblock structure contents that we just set up, not
         * the SB_BORN flag.
         */
        super_wake(sb, SB_BORN);

        error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
        if (unlikely(error)) {
                /* Give the root and superblock back before reporting failure. */
                fc_drop_locked(fc);
                return error;
        }

        /*
         * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
         * but s_maxbytes was an unsigned long long for many releases. Throw
         * this warning for a little while to try and catch filesystems that
         * violate this rule.
         */
        WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
                "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes);

        return 0;
}
EXPORT_SYMBOL(vfs_get_tree);

/*
 * Setup private BDI for given superblock. It gets automatically cleaned up
 * in generic_shutdown_super().
 */
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
{
        struct backing_dev_info *bdi = bdi_alloc(NUMA_NO_NODE);
        va_list ap;
        int ret;

        if (!bdi)
                return -ENOMEM;

        /* Register the BDI under the printf-style name given by the caller. */
        va_start(ap, fmt);
        ret = bdi_register_va(bdi, fmt, ap);
        va_end(ap);
        if (ret) {
                bdi_put(bdi);
                return ret;
        }

        /* The superblock is expected to still point at the no-op BDI. */
        WARN_ON(sb->s_bdi != &noop_backing_dev_info);
        sb->s_bdi = bdi;
        sb->s_iflags |= SB_I_PERSB_BDI;

        return 0;
}
EXPORT_SYMBOL(super_setup_bdi_name);

/*
 * Setup private BDI for given superblock. It gets automatically cleaned up
 * in generic_shutdown_super().
 *
 * The BDI name is built from the filesystem type name (at most 28
 * characters of it) and a sequence number that is incremented on every call.
 * Returns the result of super_setup_bdi_name().
 */
int super_setup_bdi(struct super_block *sb)
{
        /* Shared by all callers; only ever advanced atomically. */
        static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

        return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name,
                                    atomic_long_inc_return(&bdi_seq));
}
EXPORT_SYMBOL(super_setup_bdi);

/**
 * sb_wait_write - wait until all writers to given file system finish
 * @sb: the super for which we wait
 * @level: type of writers we wait for (normal vs page fault)
 *
 * This function waits until there are no writers of given type to given file
 * system.
 */
static void sb_wait_write(struct super_block *sb, int level)
{
        /* The semaphore for freeze level N is s_writers.rw_sem[N - 1]. */
        percpu_down_write(sb->s_writers.rw_sem + level-1);
}

/*
 * We are going to return to userspace and forget about these locks, the
 * ownership goes to the caller of thaw_super() which does unlock().
 */
static void lockdep_sb_freeze_release(struct super_block *sb)
{
        int level = SB_FREEZE_LEVELS;

        /* Highest level first, i.e. the reverse of the acquire order. */
        while (level-- > 0)
                percpu_rwsem_release(&sb->s_writers.rw_sem[level], 0, _THIS_IP_);
}

/*
 * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb).
 */
static void lockdep_sb_freeze_acquire(struct super_block *sb)
{
        int level = 0;

        /* Lowest level first. */
        while (level < SB_FREEZE_LEVELS) {
                percpu_rwsem_acquire(&sb->s_writers.rw_sem[level], 0, _THIS_IP_);
                level++;
        }
}

/*
 * Release the write side of the freeze semaphores rw_sem[level - 1] down to
 * rw_sem[0], in that order.  Does nothing when @level is 0 or negative.
 */
static void sb_freeze_unlock(struct super_block *sb, int level)
{
        while (--level >= 0)
                percpu_up_write(&sb->s_writers.rw_sem[level]);
}

/*
 * Wait for sb->s_writers.frozen to move away from its current value, and
 * repeat until the state is either SB_UNFROZEN or SB_FREEZE_COMPLETE or the
 * wait fails.
 *
 * s_umount is released with up_write() before each wait and taken again with
 * down_write() afterwards, so the function expects to be entered with
 * s_umount held for writing and returns with it held for writing.
 *
 * Returns 0 once one of the two final states is reached, otherwise the
 * non-zero value returned by wait_var_event_killable().
 */
static int wait_for_partially_frozen(struct super_block *sb)
{
        int ret = 0;

        do {
                /* Snapshot the state while s_umount is still held. */
                unsigned short old = sb->s_writers.frozen;

                up_write(&sb->s_umount);
                ret = wait_var_event_killable(&sb->s_writers.frozen,
                                               sb->s_writers.frozen != old);
                down_write(&sb->s_umount);
        } while (ret == 0 &&
                 sb->s_writers.frozen != SB_UNFROZEN &&
                 sb->s_writers.frozen != SB_FREEZE_COMPLETE);

        return ret;
}

/* Bits of a freeze_holder value that say who holds the freeze. */
#define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE)
/* Every bit accepted in a freeze_holder value: the holders plus the nesting flag. */
#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST)

/*
 * Account one more freeze for the holder named in @who.
 *
 * Warns (once) if @who has bits outside FREEZE_FLAGS or names more than one
 * holder.  Returns the combined kernel + userspace freeze count after the
 * increment.
 */
static inline int freeze_inc(struct super_block *sb, enum freeze_holder who)
{
        WARN_ON_ONCE((who & ~FREEZE_FLAGS));
        WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);

        if (who & FREEZE_HOLDER_KERNEL)
                sb->s_writers.freeze_kcount++;
        if (who & FREEZE_HOLDER_USERSPACE)
                sb->s_writers.freeze_ucount++;

        return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
}

/*
 * Account one freeze less for the holder named in @who; a counter that is
 * already zero is left alone.
 *
 * Warns (once) if @who has bits outside FREEZE_FLAGS or names more than one
 * holder.  Returns the combined kernel + userspace freeze count after the
 * decrement.
 */
static inline int freeze_dec(struct super_block *sb, enum freeze_holder who)
{
        WARN_ON_ONCE((who & ~FREEZE_FLAGS));
        WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);

        if (who & FREEZE_HOLDER_KERNEL) {
                if (sb->s_writers.freeze_kcount)
                        sb->s_writers.freeze_kcount--;
        }
        if (who & FREEZE_HOLDER_USERSPACE) {
                if (sb->s_writers.freeze_ucount)
                        sb->s_writers.freeze_ucount--;
        }

        return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
}

/*
 * Decide whether the holder named in @who may take (another) freeze.
 *
 * A holder may freeze when FREEZE_MAY_NEST is set in @who or when it holds
 * no freeze yet.  The kernel holder bit is checked before the userspace one,
 * and a @who that names neither holder is refused.  Warns (once) if @who has
 * bits outside FREEZE_FLAGS or names more than one holder.
 */
static inline bool may_freeze(struct super_block *sb, enum freeze_holder who)
{
        const bool may_nest = who & FREEZE_MAY_NEST;

        WARN_ON_ONCE((who & ~FREEZE_FLAGS));
        WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);

        if (who & FREEZE_HOLDER_KERNEL)
                return may_nest || !sb->s_writers.freeze_kcount;
        if (who & FREEZE_HOLDER_USERSPACE)
                return may_nest || !sb->s_writers.freeze_ucount;

        return false;
}

/**
 * freeze_super - lock the filesystem and force it into a consistent state
 * @sb: the super to lock
 * @who: context that wants to freeze
 *
 * Syncs the super to make sure the filesystem is consistent and calls the fs's
 * freeze_fs.  Subsequent calls to this without first thawing the fs may return
 * -EBUSY.
 *
 * @who should be:
 * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs;
 * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs.
 * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed.
 *
 * The @who argument distinguishes between the kernel and userspace trying to
 * freeze the filesystem.  Although there cannot be multiple kernel freezes or
 * multiple userspace freezes in effect at any given time, the kernel and
 * userspace can both hold a filesystem frozen.  The filesystem remains frozen
 * until there are no kernel or userspace freezes in effect.
 *
 * A filesystem may hold multiple devices and thus a filesystem may be
 * frozen through the block layer via multiple block devices. In this
 * case the request is marked as being allowed to nest by passing
 * FREEZE_MAY_NEST. The filesystem remains frozen until all block
 * devices are unfrozen. If multiple freezes are attempted without
 * FREEZE_MAY_NEST -EBUSY will be returned.
 *
 * During this function, sb->s_writers.frozen goes through these values:
 *
 * SB_UNFROZEN: File system is normal, all writes progress as usual.
 *
 * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New
 * writes should be blocked, though page faults are still allowed. We wait for
 * all writes to complete and then proceed to the next stage.
 *
 * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
 * but internal fs threads can still modify the filesystem (although they
 * should not dirty new pages or inodes), writeback can run etc. After waiting
 * for all running page faults we sync the filesystem which will clean all
 * dirty pages and inodes (no new dirty pages or inodes can be created when
 * sync is running).
 *
 * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
 * modification are blocked (e.g. XFS preallocation truncation on inode
 * reclaim). This is usually implemented by blocking new transactions for
 * filesystems that have them and need this additional guard. After all
 * internal writers are finished we call ->freeze_fs() to finish filesystem
 * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
 * mostly auxiliary for filesystems to verify they do not modify frozen fs.
 *
 * sb->s_writers.frozen is protected by sb->s_umount.
 *
 * Return: If the freeze was successful zero is returned. If the freeze
 *         failed a negative error code is returned.
 */
int freeze_super(struct super_block *sb, enum freeze_holder who)
{
        int ret;

        /* A failing super_lock_excl() is reported as a dying superblock. */
        if (!super_lock_excl(sb)) {
                WARN_ON_ONCE("Dying superblock while freezing!");
                return -EINVAL;
        }
        /*
         * Take an active reference up front.  Every path below that does not
         * newly set SB_FREEZE_COMPLETE ends in deactivate_locked_super();
         * the two paths that do set it return via super_unlock_excl() only.
         */
        atomic_inc(&sb->s_active);

retry:
        /* Already fully frozen: only account the additional freezer. */
        if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
                if (may_freeze(sb, who))
                        /* ret is 1 (after a warning) if freeze_inc() returned 1, else 0 */
                        ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1);
                else
                        ret = -EBUSY;
                /* All freezers share a single active reference. */
                deactivate_locked_super(sb);
                return ret;
        }

        /* Neither unfrozen nor complete: wait, then re-evaluate from the top. */
        if (sb->s_writers.frozen != SB_UNFROZEN) {
                ret = wait_for_partially_frozen(sb);
                if (ret) {
                        deactivate_locked_super(sb);
                        return ret;
                }

                goto retry;
        }

        if (sb_rdonly(sb)) {
                /* Nothing to do really... */
                WARN_ON_ONCE(freeze_inc(sb, who) > 1);
                sb->s_writers.frozen = SB_FREEZE_COMPLETE;
                wake_up_var(&sb->s_writers.frozen);
                super_unlock_excl(sb);
                return 0;
        }

        sb->s_writers.frozen = SB_FREEZE_WRITE;
        /* Release s_umount to preserve sb_start_write -> s_umount ordering */
        super_unlock_excl(sb);
        sb_wait_write(sb, SB_FREEZE_WRITE);
        __super_lock_excl(sb);

        /* Now we go and block page faults... */
        sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
        sb_wait_write(sb, SB_FREEZE_PAGEFAULT);

        /* All writers are done so after syncing there won't be dirty data */
        ret = sync_filesystem(sb);
        if (ret) {
                /* Sync failed: roll back to unfrozen and wake anyone waiting on the state. */
                sb->s_writers.frozen = SB_UNFROZEN;
                sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
                wake_up_var(&sb->s_writers.frozen);
                deactivate_locked_super(sb);
                return ret;
        }

        /* Now wait for internal filesystem counter */
        sb->s_writers.frozen = SB_FREEZE_FS;
        sb_wait_write(sb, SB_FREEZE_FS);

        /* ->freeze_fs() is optional; a non-zero return aborts the freeze. */
        if (sb->s_op->freeze_fs) {
                ret = sb->s_op->freeze_fs(sb);
                if (ret) {
                        printk(KERN_ERR
                                "VFS:Filesystem freeze failed\n");
                        /* Same rollback as above, starting from the SB_FREEZE_FS level. */
                        sb->s_writers.frozen = SB_UNFROZEN;
                        sb_freeze_unlock(sb, SB_FREEZE_FS);
                        wake_up_var(&sb->s_writers.frozen);
                        deactivate_locked_super(sb);
                        return ret;
                }
        }
        /*
         * For debugging purposes so that fs can warn if it sees write activity
         * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
         */
        WARN_ON_ONCE(freeze_inc(sb, who) > 1);
        sb->s_writers.frozen = SB_FREEZE_COMPLETE;
        wake_up_var(&sb->s_writers.frozen);
        lockdep_sb_freeze_release(sb);
        super_unlock_excl(sb);
        return 0;
}
EXPORT_SYMBOL(freeze_super);

/*
 * Undoes the effect of a freeze_super_locked call.  If the filesystem is
 * frozen both by userspace and the kernel, a thaw call from either source
 * removes that state without releasing the other state or unlocking the
 * filesystem.
 */
static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
{
        int error = -EINVAL;

        /* Only a superblock in SB_FREEZE_COMPLETE state can be thawed. */
        if (sb->s_writers.frozen != SB_FREEZE_COMPLETE)
                goto out_unlock;

        /*
         * All freezers share a single active reference.
         * So just unlock in case there are any left.
         *
         * NOTE(review): error still holds its initial -EINVAL on this path;
         * confirm that is what callers expect when freeze_dec() is non-zero.
         */
        if (freeze_dec(sb, who))
                goto out_unlock;

        /* Read-only: skips ->unfreeze_fs() and sb_freeze_unlock() below. */
        if (sb_rdonly(sb)) {
                sb->s_writers.frozen = SB_UNFROZEN;
                wake_up_var(&sb->s_writers.frozen);
                goto out_deactivate;
        }

        lockdep_sb_freeze_acquire(sb);

        /* ->unfreeze_fs() is optional; a non-zero return aborts the thaw. */
        if (sb->s_op->unfreeze_fs) {
                error = sb->s_op->unfreeze_fs(sb);
                if (error) {
                        pr_err("VFS: Filesystem thaw failed\n");
                        /*
                         * Presumably undoes the freeze_dec() above; the state
                         * is left at SB_FREEZE_COMPLETE.
                         */
                        freeze_inc(sb, who);
                        lockdep_sb_freeze_release(sb);
                        goto out_unlock;
                }
        }

        sb->s_writers.frozen = SB_UNFROZEN;
        wake_up_var(&sb->s_writers.frozen);
        sb_freeze_unlock(sb, SB_FREEZE_FS);
out_deactivate:
        /*
         * Success exit: uses deactivate_locked_super() rather than
         * super_unlock_excl() (presumably dropping the shared active
         * reference together with the lock - confirm against its definition).
         */
        deactivate_locked_super(sb);
        return 0;

out_unlock:
        /* Exit without deactivating: just release the lock and report error. */
        super_unlock_excl(sb);
        return error;
}

/**
 * thaw_super -- unlock filesystem
 * @sb: the super to thaw
 * @who: context that wants to thaw
 *
 * Unlocks the filesystem and marks it writeable again after freeze_super()
 * if there are no remaining freezes on the filesystem.
 *
 * @who should be:
 * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs;
 * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs.
 * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed
 *
 * A filesystem may hold multiple devices and thus a filesystem may
 * have been frozen through the block layer via multiple block devices.
 * The filesystem remains frozen until all block devices are unfrozen.
 */
int thaw_super(struct super_block *sb, enum freeze_holder who)
{
        /* Lock taken: the locked helper does the work and releases the lock. */
        if (super_lock_excl(sb))
                return thaw_super_locked(sb, who);

        /* Locking failed, which is reported as a dying superblock. */
        WARN_ON_ONCE("Dying superblock while thawing!");
        return -EINVAL;
}
EXPORT_SYMBOL(thaw_super);

/*
 * Create workqueue for deferred direct IO completions. We allocate the
 * workqueue when it's first needed. This avoids creating workqueue for
 * filesystems that don't need it and also allows us to create the workqueue
 * late enough so that we can include s_id in the name of the workqueue.
 */
int sb_init_dio_done_wq(struct super_block *sb)
{
        struct workqueue_struct *new_wq;
        struct workqueue_struct *prev;

        /* Allocate first; the result is only published if nobody beat us. */
        new_wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id);
        if (new_wq == NULL)
                return -ENOMEM;

        /*
         * Several DIOs may race to create the workqueue, so install ours
         * atomically and only if the slot is still empty.
         */
        prev = cmpxchg(&sb->s_dio_done_wq, NULL, new_wq);

        /* Lost the race: the slot was already populated, discard our copy. */
        if (prev != NULL)
                destroy_workqueue(new_wq);

        return 0;
}
EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);































































    1 































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_NS_H
#define _LINUX_PID_NS_H

#include <linux/sched.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/threads.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/idr.h>

/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32

struct fs_pin;

#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
/* modes for vm.memfd_noexec sysctl */
#define MEMFD_NOEXEC_SCOPE_EXEC                        0 /* MFD_EXEC implied if unset */
#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL                1 /* MFD_NOEXEC_SEAL implied if unset */
#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED        2 /* same as 1, except MFD_EXEC rejected */
#endif

/*
 * Per-PID-namespace state.  Namespaces are chained through ->parent; the
 * embedded ns_common carries the reference count used by get_pid_ns().
 */
struct pid_namespace {
        struct idr idr;
        struct rcu_head rcu;
        unsigned int pid_allocated;
        struct task_struct *child_reaper;
        struct kmem_cache *pid_cachep;
        /* presumably the nesting depth, bounded by MAX_PID_NS_LEVEL - TODO confirm */
        unsigned int level;
        /* next namespace up the chain; pidns_memfd_noexec_scope() walks it until NULL */
        struct pid_namespace *parent;
#ifdef CONFIG_BSD_PROCESS_ACCT
        struct fs_pin *bacct;
#endif
        struct user_namespace *user_ns;
        struct ucounts *ucounts;
        int reboot;        /* group exit code if this pidns was rebooted */
        /* ns.count is the refcount incremented by get_pid_ns() */
        struct ns_common ns;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
        /* this namespace's memfd noexec mode, compared against MEMFD_NOEXEC_SCOPE_* */
        int memfd_noexec_scope;
#endif
} __randomize_layout;

extern struct pid_namespace init_pid_ns;

#define PIDNS_ADDING (1U << 31)

#ifdef CONFIG_PID_NS
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
        /* init_pid_ns is handed back without touching its refcount. */
        if (ns == &init_pid_ns)
                return ns;

        /* Any other namespace gains one reference for the caller. */
        refcount_inc(&ns->ns.count);
        return ns;
}

#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
        struct pid_namespace *cur = ns;
        int strictest = MEMFD_NOEXEC_SCOPE_EXEC;

        /*
         * Walk from @ns up through every ancestor and keep the numerically
         * largest memfd_noexec_scope seen along the chain.
         */
        while (cur) {
                int cur_scope = READ_ONCE(cur->memfd_noexec_scope);

                if (cur_scope > strictest)
                        strictest = cur_scope;
                cur = cur->parent;
        }

        return strictest;
}
#else
/*
 * Variant used when CONFIG_SYSCTL and CONFIG_MEMFD_CREATE are not both set:
 * always returns 0, the value of MEMFD_NOEXEC_SCOPE_EXEC.
 */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
        return 0;
}
#endif

extern struct pid_namespace *copy_pid_ns(unsigned long flags,
        struct user_namespace *user_ns, struct pid_namespace *ns);
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
extern void put_pid_ns(struct pid_namespace *ns);

#else /* !CONFIG_PID_NS */
#include <linux/err.h>

/* !CONFIG_PID_NS: hands @ns back unchanged, no reference is taken. */
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
        return ns;
}

/* !CONFIG_PID_NS: always returns 0; @ns is not examined. */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
        return 0;
}

/*
 * !CONFIG_PID_NS: a new PID namespace cannot be created, so a request
 * carrying CLONE_NEWPID yields ERR_PTR(-EINVAL); otherwise @ns is returned
 * unchanged.  @user_ns is unused.
 */
static inline struct pid_namespace *copy_pid_ns(unsigned long flags,
        struct user_namespace *user_ns, struct pid_namespace *ns)
{
        if (!(flags & CLONE_NEWPID))
                return ns;

        return ERR_PTR(-EINVAL);
}

/* !CONFIG_PID_NS: no-op, matching get_pid_ns() which takes no reference here. */
static inline void put_pid_ns(struct pid_namespace *ns)
{
}

/* !CONFIG_PID_NS: reaching this stub is treated as a kernel bug. */
static inline void zap_pid_ns_processes(struct pid_namespace *ns)
{
        BUG();
}

/* !CONFIG_PID_NS: always returns 0; @pid_ns and @cmd are ignored. */
static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
        return 0;
}
#endif /* CONFIG_PID_NS */

extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pid_idr_init(void);

/* True when @tsk's active PID namespace is init_pid_ns itself. */
static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
{
        struct pid_namespace *active = task_active_pid_ns(tsk);

        return active == &init_pid_ns;
}

#endif /* _LINUX_PID_NS_H */

























































































































































    3 








    3 



























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Directory notifications for Linux.
 *
 * Copyright (C) 2000,2001,2002 Stephen Rothwell
 *
 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
 * dnotify was largely rewritten to use the new fsnotify infrastructure
 */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/dnotify.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>

static int dir_notify_enable __read_mostly = 1;
#ifdef CONFIG_SYSCTL
/*
 * sysctl table exposing dir_notify_enable as "dir-notify-enable" (an int,
 * mode 0644, handled by proc_dointvec).  dnotify_sysctl_init() registers it
 * under "fs".
 */
static struct ctl_table dnotify_sysctls[] = {
        {
                .procname        = "dir-notify-enable",
                .data                = &dir_notify_enable,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
};
/* Register the dnotify sysctl table under "fs" (CONFIG_SYSCTL builds only). */
static void __init dnotify_sysctl_init(void)
{
        register_sysctl_init("fs", dnotify_sysctls);
}
#else
#define dnotify_sysctl_init() do { } while (0)
#endif

static struct kmem_cache *dnotify_struct_cache __ro_after_init;
static struct kmem_cache *dnotify_mark_cache __ro_after_init;
static struct fsnotify_group *dnotify_group __ro_after_init;

/*
 * dnotify will attach one of these to each inode (i_fsnotify_marks) which
 * is being watched by dnotify.  If multiple userspace applications are watching
 * the same directory with dnotify their information is chained in dn
 */
struct dnotify_mark {
        /* embedded fsnotify mark; container_of() recovers the dnotify_mark from it */
        struct fsnotify_mark fsn_mark;
        /* head of the singly linked (dn_next) list of watchers on this inode */
        struct dnotify_struct *dn;
};

/*
 * When a process starts or stops watching an inode the set of events which
 * dnotify cares about for that inode may change.  This function runs the
 * list of everything receiving dnotify events about this directory and calculates
 * the set of all those events.  After it updates what dnotify is interested in
 * it calls the fsnotify function so it can update the set of all events relevant
 * to this inode.
 */
static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
{
        struct dnotify_mark *owner;
        struct dnotify_struct *watch;
        __u32 wanted = 0;

        owner = container_of(fsn_mark, struct dnotify_mark, fsn_mark);

        /* Caller must hold the mark's spinlock. */
        assert_spin_locked(&fsn_mark->lock);

        /* Union of every watcher's events, minus the multishot flag. */
        watch = owner->dn;
        while (watch) {
                wanted |= watch->dn_mask & ~FS_DN_MULTISHOT;
                watch = watch->dn_next;
        }

        /* Only propagate to fsnotify when the mask actually changed. */
        if (fsn_mark->mask != wanted) {
                fsn_mark->mask = wanted;
                fsnotify_recalc_mask(fsn_mark->connector);
        }
}

/*
 * Main fsnotify call where events are delivered to dnotify.
 * Find the dnotify mark on the relevant inode, run the list of dnotify structs
 * on that mark and determine which of them has expressed interest in receiving
 * events of this type.  When found send the correct process and signal and
 * destroy the dnotify struct if it was not registered to receive multiple
 * events.
 */
static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
                                struct inode *inode, struct inode *dir,
                                const struct qstr *name, u32 cookie)
{
        struct dnotify_mark *dn_mark;
        struct dnotify_struct **link;
        struct dnotify_struct *dn;
        __u32 test_mask;

        /* dnotify only cares about directories */
        if (!(dir || (mask & FS_ISDIR)))
                return 0;

        test_mask = mask & ~FS_EVENT_ON_CHILD;
        dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);

        spin_lock(&inode_mark->lock);

        /*
         * @link always points at the pointer that refers to the current
         * entry, so a one-shot watcher can be unlinked in place.
         */
        for (link = &dn_mark->dn; (dn = *link) != NULL; ) {
                bool interested = (dn->dn_mask & test_mask) != 0;

                if (interested)
                        send_sigio(&dn->dn_filp->f_owner, dn->dn_fd, POLL_MSG);

                /* Uninterested or multishot watchers stay on the list. */
                if (!interested || (dn->dn_mask & FS_DN_MULTISHOT)) {
                        link = &dn->dn_next;
                        continue;
                }

                /* One-shot watcher has fired: unlink, free, refresh the mask. */
                *link = dn->dn_next;
                kmem_cache_free(dnotify_struct_cache, dn);
                dnotify_recalc_inode_mask(inode_mark);
        }

        spin_unlock(&inode_mark->lock);

        return 0;
}

/* fsnotify ->free_mark callback: return the enclosing dnotify_mark to its cache. */
static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
{
        struct dnotify_mark *mark;

        mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);

        /* A mark must not be freed while watchers are still chained on it. */
        BUG_ON(mark->dn);

        kmem_cache_free(dnotify_mark_cache, mark);
}

/* Callbacks handed to fsnotify_alloc_group() when dnotify_init() creates dnotify_group. */
static const struct fsnotify_ops dnotify_fsnotify_ops = {
        .handle_inode_event = dnotify_handle_event,
        .free_mark = dnotify_free_mark,
};

/*
 * Called every time a file is closed.  Looks first for a dnotify mark on the
 * inode.  If one is found run all of the ->dn structures attached to that
 * mark for one relevant to this process closing the file and remove that
 * dnotify_struct.  If that was the last dnotify_struct also remove the
 * fsnotify_mark.
 */
void dnotify_flush(struct file *filp, fl_owner_t id)
{
        struct inode *inode = file_inode(filp);
        struct fsnotify_mark *fsn_mark;
        struct dnotify_mark *dn_mark;
        struct dnotify_struct **link;
        struct dnotify_struct *dn;
        bool last_gone;

        /* dnotify watches only exist on directories */
        if (!S_ISDIR(inode->i_mode))
                return;

        fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group);
        if (!fsn_mark)
                return;
        dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);

        fsnotify_group_lock(dnotify_group);

        spin_lock(&fsn_mark->lock);
        /* Drop the first watcher registered by this (owner, file) pair. */
        for (link = &dn_mark->dn; (dn = *link) != NULL; link = &dn->dn_next) {
                if (dn->dn_owner != id || dn->dn_filp != filp)
                        continue;

                *link = dn->dn_next;
                kmem_cache_free(dnotify_struct_cache, dn);
                dnotify_recalc_inode_mask(fsn_mark);
                break;
        }
        spin_unlock(&fsn_mark->lock);

        /*
         * nothing else could have found us thanks to the dnotify_groups
         * mark_mutex
         */
        last_gone = (dn_mark->dn == NULL);
        if (last_gone)
                fsnotify_detach_mark(fsn_mark);

        fsnotify_group_unlock(dnotify_group);

        /* The detached mark is freed only after the group lock is released. */
        if (last_gone)
                fsnotify_free_mark(fsn_mark);
        /* Drop the reference obtained from fsnotify_find_inode_mark(). */
        fsnotify_put_mark(fsn_mark);
}

/* this conversion is done only at watch creation */
static __u32 convert_arg(unsigned int arg)
{
        /* Every dnotify watch reports events on the directory's children. */
        __u32 fs_mask = FS_EVENT_ON_CHILD;

        /* Translate each userspace DN_* bit into its FS_* counterpart(s). */
        fs_mask |= (arg & DN_MULTISHOT) ? FS_DN_MULTISHOT : 0;
        fs_mask |= (arg & DN_DELETE) ? (FS_DELETE | FS_MOVED_FROM) : 0;
        fs_mask |= (arg & DN_MODIFY) ? FS_MODIFY : 0;
        fs_mask |= (arg & DN_ACCESS) ? FS_ACCESS : 0;
        fs_mask |= (arg & DN_ATTRIB) ? FS_ATTRIB : 0;
        fs_mask |= (arg & DN_RENAME) ? FS_RENAME : 0;
        fs_mask |= (arg & DN_CREATE) ? (FS_CREATE | FS_MOVED_TO) : 0;

        return fs_mask;
}

/*
 * If multiple processes watch the same inode with dnotify there is only one
 * dnotify mark in inode->i_fsnotify_marks but we chain a dnotify_struct
 * onto that mark.  This function either attaches the new dnotify_struct onto
 * that list, or it |= the mask onto an existing dnotify_struct.
 */
static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
                     fl_owner_t id, int fd, struct file *filp, __u32 mask)
{
        struct dnotify_struct *cur;

        /* adding more events to an existing dnotify_struct? */
        for (cur = dn_mark->dn; cur != NULL; cur = cur->dn_next) {
                if (cur->dn_owner != id || cur->dn_filp != filp)
                        continue;

                /* Same owner and file: merge into the existing entry. */
                cur->dn_fd = fd;
                cur->dn_mask |= mask;
                return -EEXIST;
        }

        /* No match: fill in @dn and push it onto the head of the list. */
        dn->dn_owner = id;
        dn->dn_filp = filp;
        dn->dn_fd = fd;
        dn->dn_mask = mask;
        dn->dn_next = dn_mark->dn;
        dn_mark->dn = dn;

        return 0;
}

/*
 * When a process calls fcntl to attach a dnotify watch to a directory it ends
 * up here.  Allocate both a mark for fsnotify to add and a dnotify_struct to be
 * attached to the fsnotify_mark.
 */
int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
{
        struct dnotify_mark *new_dn_mark, *dn_mark;
        struct fsnotify_mark *new_fsn_mark, *fsn_mark;
        struct dnotify_struct *dn;
        struct inode *inode;
        /* watchers are keyed by the caller's files_struct (compared in attach_dn()) */
        fl_owner_t id = current->files;
        struct file *f = NULL;
        int destroy = 0, error = 0;
        __u32 mask;

        /* we use these to tell if we need to kfree */
        new_fsn_mark = NULL;
        dn = NULL;

        /* dnotify switched off through dir_notify_enable */
        if (!dir_notify_enable) {
                error = -EINVAL;
                goto out_err;
        }

        /* a 0 mask means we are explicitly removing the watch */
        if ((arg & ~DN_MULTISHOT) == 0) {
                dnotify_flush(filp, id);
                error = 0;
                goto out_err;
        }

        /* dnotify only works on directories */
        inode = file_inode(filp);
        if (!S_ISDIR(inode->i_mode)) {
                error = -ENOTDIR;
                goto out_err;
        }

        /*
         * convert the userspace DN_* "arg" to the internal FS_*
         * defined in fsnotify
         */
        mask = convert_arg(arg);

        error = security_path_notify(&filp->f_path, mask,
                        FSNOTIFY_OBJ_TYPE_INODE);
        if (error)
                goto out_err;

        /* expect most fcntl to add new rather than augment old */
        dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
        if (!dn) {
                error = -ENOMEM;
                goto out_err;
        }

        /* new fsnotify mark, we expect most fcntl calls to add a new mark */
        new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
        if (!new_dn_mark) {
                error = -ENOMEM;
                goto out_err;
        }

        /* set up the new_fsn_mark and new_dn_mark */
        new_fsn_mark = &new_dn_mark->fsn_mark;
        fsnotify_init_mark(new_fsn_mark, dnotify_group);
        new_fsn_mark->mask = mask;
        new_dn_mark->dn = NULL;

        /* this is needed to prevent the fcntl/close race described below */
        fsnotify_group_lock(dnotify_group);

        /* add the new_fsn_mark or find an old one. */
        fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group);
        if (fsn_mark) {
                /* existing mark: new_fsn_mark stays set and is dropped at out_err */
                dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
                spin_lock(&fsn_mark->lock);
        } else {
                error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0);
                if (error) {
                        fsnotify_group_unlock(dnotify_group);
                        goto out_err;
                }
                spin_lock(&new_fsn_mark->lock);
                fsn_mark = new_fsn_mark;
                dn_mark = new_dn_mark;
                /* we used new_fsn_mark, so don't free it */
                new_fsn_mark = NULL;
        }

        /* re-resolve fd; a non-NULL f is released with fput() at out_err */
        rcu_read_lock();
        f = lookup_fdget_rcu(fd);
        rcu_read_unlock();

        /* if (f != filp) means that we lost a race and another task/thread
         * actually closed the fd we are still playing with before we grabbed
         * the dnotify_groups mark_mutex and fsn_mark->lock.  Since closing the
         * fd is the only time we clean up the marks we need to get our mark
         * off the list. */
        if (f != filp) {
                /* if we added ourselves, shoot ourselves, it's possible that
                 * the flush actually did shoot this fsn_mark.  That's fine too
                 * since multiple calls to destroy_mark is perfectly safe, if
                 * we found a dn_mark already attached to the inode, just sod
                 * off silently as the flush at close time dealt with it.
                 */
                if (dn_mark == new_dn_mark)
                        destroy = 1;
                error = 0;
                goto out;
        }

        /*
         * presumably records the caller as filp's owner, which is what
         * dnotify_handle_event() passes to send_sigio() - TODO confirm
         */
        __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0);

        error = attach_dn(dn, dn_mark, id, fd, filp, mask);
        /* !error means that we attached the dn to the dn_mark, so don't free it */
        if (!error)
                dn = NULL;
        /* -EEXIST means that we didn't add this new dn and used an old one.
         * that isn't an error (and the unused dn should be freed) */
        else if (error == -EEXIST)
                error = 0;

        dnotify_recalc_inode_mask(fsn_mark);
out:
        /* reached with fsn_mark->lock and the group lock both held */
        spin_unlock(&fsn_mark->lock);

        if (destroy)
                fsnotify_detach_mark(fsn_mark);
        fsnotify_group_unlock(dnotify_group);
        if (destroy)
                fsnotify_free_mark(fsn_mark);
        fsnotify_put_mark(fsn_mark);
out_err:
        /* common cleanup: release whatever was set up but not handed over */
        if (new_fsn_mark)
                fsnotify_put_mark(new_fsn_mark);
        if (dn)
                kmem_cache_free(dnotify_struct_cache, dn);
        if (f)
                fput(f);
        return error;
}

/*
 * Boot-time setup: create the two slab caches and the dnotify fsnotify
 * group, then register the sysctl.  Always returns 0; a group allocation
 * failure panics instead.
 */
static int __init dnotify_init(void)
{
        /* SLAB_PANIC is passed, so these results are not checked here */
        dnotify_struct_cache = KMEM_CACHE(dnotify_struct,
                                          SLAB_PANIC|SLAB_ACCOUNT);
        dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);

        dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops,
                                             FSNOTIFY_GROUP_NOFS);
        if (IS_ERR(dnotify_group))
                panic("unable to allocate fsnotify group for dnotify\n");
        /* expands to an empty statement when CONFIG_SYSCTL is not set */
        dnotify_sysctl_init();
        return 0;
}

module_init(dnotify_init)


























































    1 

























    1 







































    1 


    1 









    1 







    1 




    1 
































    1 

























    1 

























































































    1 






























































































































    1 









    1 









































    1 










    1 


    1 




















    1 






    1 








    1 









    1 
    1 




    1 

























































































































































































































































































































































































































































































































    1 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 */

#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/nls.h>
#include <linux/uio.h>
#include <linux/writeback.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"

/*
 * ntfs_read_mft - Read the MFT record of a new inode and parse its attributes.
 * @inode: Inode being filled in; inode->i_ino selects the MFT record.
 * @name:  Optional name. When non-NULL, at least one non-DOS file name
 *         attribute of the record must match it, otherwise -ENOENT.
 * @ref:   MFT reference; its sequence number is compared with the record's
 *         unless NTFS_FLAGS_LOG_REPLAYING is set in sbi->flags.
 *
 * Return: @inode (after unlock_new_inode()) on success. On failure
 * iget_failed() is called on @inode and ERR_PTR(err) is returned.
 */
static struct inode *ntfs_read_mft(struct inode *inode,
                                   const struct cpu_str *name,
                                   const struct MFT_REF *ref)
{
        int err = 0;
        struct ntfs_inode *ni = ntfs_i(inode);
        struct super_block *sb = inode->i_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        mode_t mode = 0;
        struct ATTR_STD_INFO5 *std5 = NULL;
        struct ATTR_LIST_ENTRY *le;
        struct ATTRIB *attr;
        bool is_match = false;
        bool is_root = false;
        bool is_dir;
        unsigned long ino = inode->i_ino;
        u32 rp_fa = 0, asize, t32;
        u16 roff, rsize, names = 0, links = 0;
        const struct ATTR_FILE_NAME *fname = NULL;
        const struct INDEX_ROOT *root;
        struct REPARSE_DATA_BUFFER rp; // 0x18 bytes
        u64 t64;
        struct MFT_REC *rec;
        struct runs_tree *run;
        struct timespec64 ts;

        inode->i_op = NULL;
        /* Setup 'uid' and 'gid' from the mount options. */
        inode->i_uid = sbi->options->fs_uid;
        inode->i_gid = sbi->options->fs_gid;

        err = mi_init(&ni->mi, sbi, ino);
        if (err)
                goto out;

        /*
         * Reading $MFT itself before the root dentry exists: publish this
         * inode as sbi->mft.ni and seed its run with an entry built from
         * sbi->mft.lbo and the size of the first MFT_REC_VOL records.
         * sbi->mft.ni is reset to NULL again on both exit paths below.
         */
        if (!sbi->mft.ni && ino == MFT_REC_MFT && !sb->s_root) {
                t64 = sbi->mft.lbo >> sbi->cluster_bits;
                t32 = bytes_to_cluster(sbi, MFT_REC_VOL * sbi->record_size);
                sbi->mft.ni = ni;
                init_rwsem(&ni->file.run_lock);

                if (!run_add_entry(&ni->file.run, 0, t64, t32, true)) {
                        err = -ENOMEM;
                        goto out;
                }
        }

        err = mi_read(&ni->mi, ino == MFT_REC_MFT);

        if (err)
                goto out;

        rec = ni->mi.mrec;

        /* Sequence and in-use checks are skipped while the log is replayed. */
        if (sbi->flags & NTFS_FLAGS_LOG_REPLAYING) {
                ;
        } else if (ref->seq != rec->seq) {
                err = -EINVAL;
                ntfs_err(sb, "MFT: r=%lx, expect seq=%x instead of %x!", ino,
                         le16_to_cpu(ref->seq), le16_to_cpu(rec->seq));
                goto out;
        } else if (!is_rec_inuse(rec)) {
                err = -ESTALE;
                ntfs_err(sb, "Inode r=%x is not in use!", (u32)ino);
                goto out;
        }

        if (le32_to_cpu(rec->total) != sbi->record_size) {
                /* Bad inode? */
                err = -EINVAL;
                goto out;
        }

        /* Only a base record may back an inode. */
        if (!is_rec_base(rec)) {
                err = -EINVAL;
                goto out;
        }

        /* Record should contain $I30 root. */
        is_dir = rec->flags & RECORD_FLAG_DIR;

        /* MFT_REC_MFT is not a dir */
        if (is_dir && ino == MFT_REC_MFT) {
                err = -EINVAL;
                goto out;
        }

        inode->i_generation = le16_to_cpu(rec->seq);

        /* Enumerate all attributes of the MFT record. */
        le = NULL;
        attr = NULL;

        /*
         * To reduce tab pressure use goto instead of
         * while( (attr = ni_enum_attr_ex(ni, attr, &le, NULL) ))
         *
         * 'err' is reset to -EINVAL on every iteration, so a bare
         * 'goto out' below reports invalid on-disk data.
         */
next_attr:
        run = NULL;
        err = -EINVAL;
        attr = ni_enum_attr_ex(ni, attr, &le, NULL);
        if (!attr)
                goto end_enum;

        if (le && le->vcn) {
                /* This is non primary attribute segment. Ignore if not MFT. */
                if (ino != MFT_REC_MFT || attr->type != ATTR_DATA)
                        goto next_attr;

                run = &ni->file.run;
                asize = le32_to_cpu(attr->size);
                goto attr_unpack_run;
        }

        /* Resident data offset/size; both are 0 for non-resident attributes. */
        roff = attr->non_res ? 0 : le16_to_cpu(attr->res.data_off);
        rsize = attr->non_res ? 0 : le32_to_cpu(attr->res.data_size);
        asize = le32_to_cpu(attr->size);

        /*
         * This check was already done in 'ni_enum_attr_ex' -> ... 'mi_enum_attr'.
         * It is not critical to check this case again.
         */
        if (attr->name_len &&
            sizeof(short) * attr->name_len + le16_to_cpu(attr->name_off) >
                    asize)
                goto out;

        /* Data and valid sizes may not exceed the allocated size. */
        if (attr->non_res) {
                t64 = le64_to_cpu(attr->nres.alloc_size);
                if (le64_to_cpu(attr->nres.data_size) > t64 ||
                    le64_to_cpu(attr->nres.valid_size) > t64)
                        goto out;
        }

        /*
         * Cases that 'break' out of this switch have set 'run' and fall
         * through to attr_unpack_run; all others jump to next_attr or out.
         */
        switch (attr->type) {
        case ATTR_STD:
                if (attr->non_res ||
                    asize < sizeof(struct ATTR_STD_INFO) + roff ||
                    rsize < sizeof(struct ATTR_STD_INFO))
                        goto out;

                /* Only the first standard information attribute is used. */
                if (std5)
                        goto next_attr;

                std5 = Add2Ptr(attr, roff);

#ifdef STATX_BTIME
                nt2kernel(std5->cr_time, &ni->i_crtime);
#endif
                nt2kernel(std5->a_time, &ts);
                inode_set_atime_to_ts(inode, ts);
                nt2kernel(std5->c_time, &ts);
                inode_set_ctime_to_ts(inode, ts);
                nt2kernel(std5->m_time, &ts);
                inode_set_mtime_to_ts(inode, ts);

                ni->std_fa = std5->fa;

                /* security_id is read only if the attribute is large enough. */
                if (asize >= sizeof(struct ATTR_STD_INFO5) + roff &&
                    rsize >= sizeof(struct ATTR_STD_INFO5))
                        ni->std_security_id = std5->security_id;
                goto next_attr;

        case ATTR_LIST:
                if (attr->name_len || le || ino == MFT_REC_LOG)
                        goto out;

                err = ntfs_load_attr_list(ni, attr);
                if (err)
                        goto out;

                /* Restart the enumeration now that the list is loaded. */
                le = NULL;
                attr = NULL;
                goto next_attr;

        case ATTR_NAME:
                if (attr->non_res || asize < SIZEOF_ATTRIBUTE_FILENAME + roff ||
                    rsize < SIZEOF_ATTRIBUTE_FILENAME)
                        goto out;

                /* 'names' counts every file name, 'links' only non-DOS ones. */
                names += 1;
                fname = Add2Ptr(attr, roff);
                if (fname->type == FILE_NAME_DOS)
                        goto next_attr;

                links += 1;
                if (name && name->len == fname->name_len &&
                    !ntfs_cmp_names_cpu(name, (struct le_str *)&fname->name_len,
                                        NULL, false))
                        is_match = true;

                goto next_attr;

        case ATTR_DATA:
                if (is_dir) {
                        /* Ignore data attribute in dir record. */
                        goto next_attr;
                }

                if (ino == MFT_REC_BADCLUST && !attr->non_res)
                        goto next_attr;

                /*
                 * Named data attributes are skipped, except non-resident
                 * BAD_NAME in MFT_REC_BADCLUST and non-resident SDS_NAME in
                 * MFT_REC_SECURE.
                 */
                if (attr->name_len &&
                    ((ino != MFT_REC_BADCLUST || !attr->non_res ||
                      attr->name_len != ARRAY_SIZE(BAD_NAME) ||
                      memcmp(attr_name(attr), BAD_NAME, sizeof(BAD_NAME))) &&
                     (ino != MFT_REC_SECURE || !attr->non_res ||
                      attr->name_len != ARRAY_SIZE(SDS_NAME) ||
                      memcmp(attr_name(attr), SDS_NAME, sizeof(SDS_NAME))))) {
                        /* File contains stream attribute. Ignore it. */
                        goto next_attr;
                }

                /* Mirror the attribute's flags into the cached file attributes. */
                if (is_attr_sparsed(attr))
                        ni->std_fa |= FILE_ATTRIBUTE_SPARSE_FILE;
                else
                        ni->std_fa &= ~FILE_ATTRIBUTE_SPARSE_FILE;

                if (is_attr_compressed(attr))
                        ni->std_fa |= FILE_ATTRIBUTE_COMPRESSED;
                else
                        ni->std_fa &= ~FILE_ATTRIBUTE_COMPRESSED;

                if (is_attr_encrypted(attr))
                        ni->std_fa |= FILE_ATTRIBUTE_ENCRYPTED;
                else
                        ni->std_fa &= ~FILE_ATTRIBUTE_ENCRYPTED;

                if (!attr->non_res) {
                        ni->i_valid = inode->i_size = rsize;
                        inode_set_bytes(inode, rsize);
                }

                mode = S_IFREG | (0777 & sbi->options->fs_fmask_inv);

                if (!attr->non_res) {
                        ni->ni_flags |= NI_FLAG_RESIDENT;
                        goto next_attr;
                }

                inode_set_bytes(inode, attr_ondisk_size(attr));

                ni->i_valid = le64_to_cpu(attr->nres.valid_size);
                inode->i_size = le64_to_cpu(attr->nres.data_size);
                /* Nothing allocated, so there is no run to unpack. */
                if (!attr->nres.alloc_size)
                        goto next_attr;

                run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run :
                                              &ni->file.run;
                break;

        case ATTR_ROOT:
                if (attr->non_res)
                        goto out;

                root = Add2Ptr(attr, roff);

                /* Only the root named I30_NAME is of interest here. */
                if (attr->name_len != ARRAY_SIZE(I30_NAME) ||
                    memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME)))
                        goto next_attr;

                if (root->type != ATTR_NAME ||
                    root->rule != NTFS_COLLATION_TYPE_FILENAME)
                        goto out;

                if (!is_dir)
                        goto next_attr;

                is_root = true;
                ni->ni_flags |= NI_FLAG_DIR;

                err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30);
                if (err)
                        goto out;

                /* Without sb->s_root the directory mask option is not applied. */
                mode = sb->s_root ?
                               (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) :
                               (S_IFDIR | 0777);
                goto next_attr;

        case ATTR_ALLOC:
                /* Used only after an I30_NAME root was accepted (is_root). */
                if (!is_root || attr->name_len != ARRAY_SIZE(I30_NAME) ||
                    memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME)))
                        goto next_attr;

                inode->i_size = le64_to_cpu(attr->nres.data_size);
                ni->i_valid = le64_to_cpu(attr->nres.valid_size);
                inode_set_bytes(inode, le64_to_cpu(attr->nres.alloc_size));

                run = &ni->dir.alloc_run;
                break;

        case ATTR_BITMAP:
                if (ino == MFT_REC_MFT) {
                        if (!attr->non_res)
                                goto out;
#ifndef CONFIG_NTFS3_64BIT_CLUSTER
                        /* 0x20000000 = 2^32 / 8 */
                        if (le64_to_cpu(attr->nres.alloc_size) >= 0x20000000)
                                goto out;
#endif
                        run = &sbi->mft.bitmap.run;
                        break;
                } else if (is_dir && attr->name_len == ARRAY_SIZE(I30_NAME) &&
                           !memcmp(attr_name(attr), I30_NAME,
                                   sizeof(I30_NAME)) &&
                           attr->non_res) {
                        run = &ni->dir.bitmap_run;
                        break;
                }
                goto next_attr;

        case ATTR_REPARSE:
                if (attr->name_len)
                        goto next_attr;

                rp_fa = ni_parse_reparse(ni, attr, &rp);
                switch (rp_fa) {
                case REPARSE_LINK:
                        /*
                         * Normal symlink.
                         * Assume one unicode symbol == one utf8.
                         */
                        inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer
                                                            .PrintNameLength) /
                                        sizeof(u16);
                        ni->i_valid = inode->i_size;
                        /* Clear directory bit. */
                        if (ni->ni_flags & NI_FLAG_DIR) {
                                indx_clear(&ni->dir);
                                memset(&ni->dir, 0, sizeof(ni->dir));
                                ni->ni_flags &= ~NI_FLAG_DIR;
                        } else {
                                run_close(&ni->file.run);
                        }
                        mode = S_IFLNK | 0777;
                        is_dir = false;
                        if (attr->non_res) {
                                run = &ni->file.run;
                                goto attr_unpack_run; // Double break.
                        }
                        break;

                case REPARSE_COMPRESSED:
                        break;

                case REPARSE_DEDUPLICATED:
                        break;
                }
                goto next_attr;

        case ATTR_EA_INFO:
                if (!attr->name_len &&
                    resident_data_ex(attr, sizeof(struct EA_INFO))) {
                        ni->ni_flags |= NI_FLAG_EA;
                        /*
                         * ntfs_get_wsl_perm updates inode->i_uid, inode->i_gid, inode->i_mode
                         * The mode computed so far is passed in through
                         * inode->i_mode and read back afterwards.
                         */
                        inode->i_mode = mode;
                        ntfs_get_wsl_perm(inode);
                        mode = inode->i_mode;
                }
                goto next_attr;

        default:
                goto next_attr;
        }

attr_unpack_run:
        /* From here on 'roff' is the offset of the packed runs. */
        roff = le16_to_cpu(attr->nres.run_off);

        if (roff > asize) {
                err = -EINVAL;
                goto out;
        }

        t64 = le64_to_cpu(attr->nres.svcn);

        err = run_unpack_ex(run, sbi, ino, t64, le64_to_cpu(attr->nres.evcn),
                            t64, Add2Ptr(attr, roff), asize - roff);
        if (err < 0)
                goto out;
        err = 0;
        goto next_attr;

end_enum:

        /* Standard information is mandatory; 'err' is still -EINVAL here. */
        if (!std5)
                goto out;

        if (!is_match && name) {
                err = -ENOENT;
                goto out;
        }

        if (std5->fa & FILE_ATTRIBUTE_READONLY)
                mode &= ~0222;

        /* A record without any file name attribute is rejected. */
        if (!names) {
                err = -EINVAL;
                goto out;
        }

        if (names != le16_to_cpu(rec->hard_links)) {
                /*
                 * Correct minor error on the fly. Do not mark inode as dirty;
                 * only the in-memory record (ni->mi) is flagged dirty.
                 */
                ntfs_inode_warn(inode, "Correct links count -> %u.", names);
                rec->hard_links = cpu_to_le16(names);
                ni->mi.dirty = true;
        }

        set_nlink(inode, links);

        /* Select inode/file operations according to the computed mode. */
        if (S_ISDIR(mode)) {
                ni->std_fa |= FILE_ATTRIBUTE_DIRECTORY;

                /*
                 * Dot and dot-dot should be included in count but was not
                 * included in enumeration.
                 * Usually a hard links to directories are disabled.
                 */
                inode->i_op = &ntfs_dir_inode_operations;
                if (is_legacy_ntfs(inode->i_sb))
                        inode->i_fop = &ntfs_legacy_dir_operations;
                else
                        inode->i_fop = &ntfs_dir_operations;
                ni->i_valid = 0;
        } else if (S_ISLNK(mode)) {
                ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
                inode->i_op = &ntfs_link_inode_operations;
                inode->i_fop = NULL;
                inode_nohighmem(inode);
        } else if (S_ISREG(mode)) {
                ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
                inode->i_op = &ntfs_file_inode_operations;
                if (is_legacy_ntfs(inode->i_sb))
                        inode->i_fop = &ntfs_legacy_file_operations;
                else
                        inode->i_fop = &ntfs_file_operations;
                inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr :
                                                              &ntfs_aops;
                /* For MFT_REC_MFT the run lock was initialised earlier. */
                if (ino != MFT_REC_MFT)
                        init_rwsem(&ni->file.run_lock);
        } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) ||
                   S_ISSOCK(mode)) {
                inode->i_op = &ntfs_special_inode_operations;
                init_special_inode(inode, mode, inode->i_rdev);
        } else if (fname && fname->home.low == cpu_to_le32(MFT_REC_EXTEND) &&
                   fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) {
                /* Records in $Extend are not a files or general directories. */
                inode->i_op = &ntfs_file_inode_operations;
        } else {
                err = -EINVAL;
                goto out;
        }

        /*
         * With the sys_immutable option, system files become immutable,
         * except FIFOs, sockets and symlinks.
         */
        if ((sbi->options->sys_immutable &&
             (std5->fa & FILE_ATTRIBUTE_SYSTEM)) &&
            !S_ISFIFO(mode) && !S_ISSOCK(mode) && !S_ISLNK(mode)) {
                inode->i_flags |= S_IMMUTABLE;
        } else {
                inode->i_flags &= ~S_IMMUTABLE;
        }

        inode->i_mode = mode;
        if (!(ni->ni_flags & NI_FLAG_EA)) {
                /* If no xattr then no security (stored in xattr). */
                inode->i_flags |= S_NOSEC;
        }

        /* Undo the temporary sbi->mft.ni publication made above. */
        if (ino == MFT_REC_MFT && !sb->s_root)
                sbi->mft.ni = NULL;

        unlock_new_inode(inode);

        return inode;

out:
        /* Failure path: same sbi->mft.ni cleanup, then discard the inode. */
        if (ino == MFT_REC_MFT && !sb->s_root)
                sbi->mft.ni = NULL;

        iget_failed(inode);
        return ERR_PTR(err);
}

/*
 * ntfs_test_inode - Comparison callback passed to iget5_locked().
 *
 * @data points to the MFT_REF being looked up.
 *
 * Return: 1 if @inode carries the record number of that reference, else 0.
 */
static int ntfs_test_inode(struct inode *inode, void *data)
{
        const struct MFT_REF *wanted = data;

        if (ino_get(wanted) == inode->i_ino)
                return 1;

        return 0;
}

/*
 * ntfs_set_inode - Initialisation callback passed to iget5_locked().
 *
 * Stores the record number of the MFT_REF at @data in inode->i_ino.
 *
 * Return: always 0.
 */
static int ntfs_set_inode(struct inode *inode, void *data)
{
        const struct MFT_REF *mref = data;

        inode->i_ino = ino_get(mref);

        return 0;
}

/*
 * ntfs_iget5 - Look up (or read in) the inode described by @ref.
 * @sb:   Super block to search.
 * @ref:  MFT reference; supplies the record number and sequence number.
 * @name: Optional name forwarded to ntfs_read_mft(). When it is non-NULL,
 *        a failure also sets NTFS_DIRTY_ERROR on the volume state.
 *
 * Return: the inode, ERR_PTR(-ENOMEM) if iget5_locked() returns NULL, or
 * the error pointer produced by ntfs_read_mft().
 */
struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
                         const struct cpu_str *name)
{
        struct inode *found = iget5_locked(sb, ino_get(ref), ntfs_test_inode,
                                           ntfs_set_inode, (void *)ref);

        if (unlikely(!found))
                return ERR_PTR(-ENOMEM);

        if (found->i_state & I_NEW) {
                /* Freshly allocated inode: fill it from the MFT now. */
                found = ntfs_read_mft(found, name, ref);
        } else if (ntfs_i(found)->mi.mrec->seq != ref->seq) {
                /*
                 * Cached inode has a different sequence number (inode
                 * overlaps?).
                 * NOTE(review): the inode is marked bad but still returned
                 * to the caller - confirm this is intended.
                 */
                _ntfs_bad_inode(found);
        }

        if (name && IS_ERR(found))
                ntfs_set_state(sb->s_fs_info, NTFS_DIRTY_ERROR);

        return found;
}

/*
 * Tells ntfs_get_block_vbo() on whose behalf a block is being mapped.
 * Enumerators are numbered consecutively starting from 0.
 */
enum get_block_ctx {
        GET_BLOCK_GENERAL,      /* 0: used by ntfs_get_block() */
        GET_BLOCK_WRITE_BEGIN,  /* 1 */
        GET_BLOCK_DIRECT_IO_R,  /* 2: direct I/O read */
        GET_BLOCK_DIRECT_IO_W,  /* 3: direct I/O write */
        GET_BLOCK_BMAP,         /* 4: used by ntfs_get_block_bmap() */
};

/*
 * ntfs_get_block_vbo - Map the block at byte offset @vbo of @inode into @bh.
 * @inode:  File being mapped.
 * @vbo:    Byte offset inside the file.
 * @bh:     Buffer head to fill in; bh->b_size may be reduced.
 * @create: Non-zero when the mapping is requested for writing.
 * @ctx:    Which caller is asking, see enum get_block_ctx.
 *
 * Return: 0 on success (the buffer may be left unmapped), otherwise the
 * error from attr_data_read_resident() / attr_data_get_block(), or -EIO
 * when bh_read() fails.
 */
static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
                                       struct buffer_head *bh, int create,
                                       enum get_block_ctx ctx)
{
        struct ntfs_inode *ni = ntfs_i(inode);
        struct super_block *sb = inode->i_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        struct folio *folio = bh->b_folio;
        const u8 cluster_bits = sbi->cluster_bits;
        const u32 block_size = sb->s_blocksize;
        u64 span, lbo, valid;
        u32 clst_off;
        CLST vcn, lcn, len;
        bool fresh = false;
        int err;

        /* Drop state left over from a previous use of this buffer. */
        clear_buffer_new(bh);
        clear_buffer_uptodate(bh);

        if (is_resident(ni)) {
                bh->b_blocknr = RESIDENT_LCN;
                bh->b_size = block_size;
                if (!folio)
                        return 0;

                ni_lock(ni);
                err = attr_data_read_resident(ni, &folio->page);
                ni_unlock(ni);

                if (!err)
                        set_buffer_uptodate(bh);
                return err;
        }

        /* Split the byte offset into a cluster number and an in-cluster offset. */
        vcn = vbo >> cluster_bits;
        clst_off = vbo & sbi->cluster_mask;

        err = attr_data_get_block(ni, vcn, 1, &lcn, &len,
                                  create ? &fresh : NULL,
                                  create && sbi->cluster_size > PAGE_SIZE);
        if (err)
                return err;

        if (!len)
                return 0;

        /* Bytes available from @vbo to the end of the returned run. */
        span = ((u64)len << cluster_bits) - clst_off;

        if (lcn == SPARSE_LCN) {
                if (!create) {
                        /* Sparse region on read: leave the buffer unmapped. */
                        if (bh->b_size > span)
                                bh->b_size = span;
                        return 0;
                }
                WARN_ON(1);
        }

        if (fresh)
                set_buffer_new(bh);

        lbo = ((u64)lcn << cluster_bits) + clst_off;

        set_buffer_mapped(bh);
        bh->b_bdev = sb->s_bdev;
        bh->b_blocknr = lbo >> sb->s_blocksize_bits;

        valid = ni->i_valid;

        if (ctx == GET_BLOCK_DIRECT_IO_W) {
                /* ni->i_valid is updated later by ntfs_direct_IO. */
                if (vbo >= valid)
                        set_buffer_new(bh);
        } else if (create) {
                /* Ordinary write: extend the valid size if needed. */
                if (span > bh->b_size)
                        span = bh->b_size;

                if (vbo >= valid)
                        set_buffer_new(bh);

                if (vbo + span > valid) {
                        ni->i_valid = vbo + span;
                        mark_inode_dirty(inode);
                }
        } else if (vbo >= valid) {
                /* Reading beyond the valid data. */
                clear_buffer_mapped(bh);
        } else if (vbo + span <= valid) {
                /* Ordinary read, fully inside the valid data. */
        } else if (vbo + block_size <= valid) {
                /* Short read: one block still fits inside the valid data. */
                span = block_size;
        } else {
                /*
                 * The block straddles the valid size:
                 * vbo < valid && valid < vbo + block_size
                 */
                span = block_size;

                if (folio) {
                        u32 tail = valid - vbo;
                        u32 pg_off = vbo & (PAGE_SIZE - 1);

                        bh->b_size = block_size;
                        folio_set_bh(bh, folio, pg_off);

                        if (bh_read(bh, 0) < 0)
                                return -EIO;

                        /* Zero everything past the valid size in this block. */
                        folio_zero_segment(folio, pg_off + tail,
                                           pg_off + block_size);
                }
        }

        if (bh->b_size > span)
                bh->b_size = span;

#ifndef __LP64__
        if (ctx == GET_BLOCK_DIRECT_IO_W || ctx == GET_BLOCK_DIRECT_IO_R) {
                static_assert(sizeof(size_t) < sizeof(loff_t));
                if (span > 0x40000000u)
                        bh->b_size = 0x40000000u;
        }
#endif

        return 0;
}

/*
 * ntfs_get_block - get_block callback for general use.
 *
 * Converts block number @vbn to a byte offset using inode->i_blkbits and
 * forwards to ntfs_get_block_vbo() with GET_BLOCK_GENERAL.
 */
int ntfs_get_block(struct inode *inode, sector_t vbn,
                   struct buffer_head *bh_result, int create)
{
        u64 vbo = (u64)vbn << inode->i_blkbits;

        return ntfs_get_block_vbo(inode, vbo, bh_result, create,
                                  GET_BLOCK_GENERAL);
}

/*
 * ntfs_get_block_bmap - get_block callback used by ntfs_bmap().
 *
 * Converts @vsn to a byte offset using the super block's s_blocksize_bits
 * and forwards to ntfs_get_block_vbo() with GET_BLOCK_BMAP.
 */
static int ntfs_get_block_bmap(struct inode *inode, sector_t vsn,
                               struct buffer_head *bh_result, int create)
{
        u64 vbo = (u64)vsn << inode->i_sb->s_blocksize_bits;

        return ntfs_get_block_vbo(inode, vbo, bh_result, create,
                                  GET_BLOCK_BMAP);
}

/*
 * ntfs_bmap - Resolve @block of @mapping through generic_block_bmap(),
 * using ntfs_get_block_bmap() as the mapping callback.
 */
static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
{
        sector_t result = generic_block_bmap(mapping, block,
                                             ntfs_get_block_bmap);

        return result;
}

/*
 * ntfs_read_folio - Fill @folio with file data.
 *
 * Resident data is copied by attr_data_read_resident(); if that reports
 * E_NTFS_NONRESIDENT the function continues with the other paths.
 * Compressed files go through ni_readpage_cmpr(); everything else is
 * read with mpage_read_folio() and ntfs_get_block().
 */
static int ntfs_read_folio(struct file *file, struct folio *folio)
{
        struct page *page = &folio->page;
        struct ntfs_inode *ni = ntfs_i(page->mapping->host);
        int err;

        if (is_resident(ni)) {
                ni_lock(ni);
                err = attr_data_read_resident(ni, page);
                ni_unlock(ni);

                if (err != E_NTFS_NONRESIDENT) {
                        unlock_page(page);
                        return err;
                }
        }

        /* Normal + sparse files. */
        if (!is_compressed(ni))
                return mpage_read_folio(folio, ntfs_get_block);

        ni_lock(ni);
        err = ni_readpage_cmpr(ni, page);
        ni_unlock(ni);

        return err;
}

/*
 * ntfs_readahead - Readahead callback.
 *
 * Does nothing for resident or compressed files, and nothing when the
 * requested range crosses ni->i_valid while i_valid is below the file
 * size. Otherwise the request is handed to mpage_readahead().
 */
static void ntfs_readahead(struct readahead_control *rac)
{
        struct inode *inode = rac->mapping->host;
        struct ntfs_inode *ni = ntfs_i(inode);
        u64 valid;
        loff_t start;

        /* No readahead for resident or compressed data. */
        if (is_resident(ni) || is_compressed(ni))
                return;

        valid = ni->i_valid;
        start = readahead_pos(rac);

        if (valid < i_size_read(inode) && start <= valid &&
            valid < start + readahead_length(rac)) {
                /* The range crosses 'valid'; it is read page by page instead. */
                return;
        }

        mpage_readahead(rac, ntfs_get_block);
}

/*
 * ntfs_get_block_direct_IO_R - get_block callback for direct I/O reads.
 *
 * Forwards to ntfs_get_block_vbo() with GET_BLOCK_DIRECT_IO_R.
 */
static int ntfs_get_block_direct_IO_R(struct inode *inode, sector_t iblock,
                                      struct buffer_head *bh_result, int create)
{
        u64 vbo = (u64)iblock << inode->i_blkbits;

        return ntfs_get_block_vbo(inode, vbo, bh_result, create,
                                  GET_BLOCK_DIRECT_IO_R);
}

/*
 * get_block callback used for direct I/O writes.
 *
 * Turns the block index into a byte offset (shift by inode->i_blkbits)
 * and forwards to ntfs_get_block_vbo() in GET_BLOCK_DIRECT_IO_W mode.
 */
static int ntfs_get_block_direct_IO_W(struct inode *inode, sector_t iblock,
                                      struct buffer_head *bh_result, int create)
{
        u64 vbo = (u64)iblock << inode->i_blkbits;

        return ntfs_get_block_vbo(inode, vbo, bh_result, create,
                                  GET_BLOCK_DIRECT_IO_W);
}

/*
 * ntfs_direct_IO - Direct I/O entry point.
 *
 * Resident inodes are not handled here: 0 is returned so that the caller
 * switches to the buffered path. Otherwise the request goes through
 * blockdev_direct_IO() with the read or write get_block callback.
 *
 * After a transfer (or a queued asynchronous write) ni->i_valid is taken
 * into account:
 * - write: i_valid is moved forward to the end of the written range
 *   (not for block-device inodes) and the inode is marked dirty;
 * - read: if the range crosses i_valid, the part past i_valid is
 *   overwritten with zeroes in the caller's iterator.
 *
 * Return: value returned by blockdev_direct_IO(), or 0 for resident inodes.
 */
static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        struct ntfs_inode *ni = ntfs_i(inode);
        const loff_t start = iocb->ki_pos;
        const int is_write = iov_iter_rw(iter) & WRITE;
        const size_t requested = iov_iter_count(iter);
        loff_t end, valid;
        ssize_t ret;

        if (is_resident(ni)) {
                /* Switch to buffered write. */
                return 0;
        }

        ret = blockdev_direct_IO(iocb, inode, iter,
                                 is_write ? ntfs_get_block_direct_IO_W :
                                            ntfs_get_block_direct_IO_R);

        if (ret > 0) {
                end = start + ret;
        } else if (is_write && ret == -EIOCBQUEUED) {
                /* Queued write: account for the whole requested length. */
                end = start + requested;
        } else {
                return ret;
        }

        valid = ni->i_valid;

        if (!is_write) {
                if (start < valid && valid < end) {
                        /* Fix page: zero everything read beyond 'valid'. */
                        iov_iter_revert(iter, end - valid);
                        iov_iter_zero(end - valid, iter);
                }
                return ret;
        }

        if (end > valid && !S_ISBLK(inode->i_mode)) {
                ni->i_valid = end;
                mark_inode_dirty(inode);
        }

        return ret;
}

/*
 * ntfs_set_size - Change the size of the unnamed data attribute.
 *
 * @inode:    inode to resize.
 * @new_size: requested size in bytes.
 *
 * The size is first checked against sbi->maxbytes_sparse (sparse or
 * compressed inodes) or sbi->maxbytes (all others). The resize itself is
 * done by attr_set_size() with both ni_lock and file.run_lock (write) held.
 * The inode is marked dirty afterwards regardless of the result.
 *
 * Return: -EFBIG if @new_size exceeds the limit, otherwise the result of
 * attr_set_size().
 */
int ntfs_set_size(struct inode *inode, u64 new_size)
{
        struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
        struct ntfs_inode *ni = ntfs_i(inode);
        int err;

        /* Check for maximum file size. */
        if (is_sparsed(ni) || is_compressed(ni)) {
                if (new_size > sbi->maxbytes_sparse)
                        return -EFBIG;
        } else if (new_size > sbi->maxbytes) {
                return -EFBIG;
        }

        ni_lock(ni);
        down_write(&ni->file.run_lock);

        err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size,
                            &ni->i_valid, true, NULL);

        up_write(&ni->file.run_lock);
        ni_unlock(ni);

        mark_inode_dirty(inode);

        return err;
}

/*
 * ntfs_resident_writepage - write_cache_pages() callback for resident data.
 *
 * @folio: folio to write back.
 * @wbc:   writeback control (unused here).
 * @data:  the address_space being written, as passed by ntfs_writepages().
 *
 * Writes the folio's page with attr_data_write_resident() under ni_lock.
 * The folio is unlocked here unless the helper reported
 * E_NTFS_NONRESIDENT. The result is also recorded on the mapping via
 * mapping_set_error().
 *
 * Return: -EIO after a forced shutdown, otherwise the helper's result.
 */
static int ntfs_resident_writepage(struct folio *folio,
                                   struct writeback_control *wbc, void *data)
{
        struct address_space *mapping = data;
        struct inode *inode = mapping->host;
        struct ntfs_inode *ni;
        int rc;

        if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
                return -EIO;

        ni = ntfs_i(inode);

        ni_lock(ni);
        rc = attr_data_write_resident(ni, &folio->page);
        ni_unlock(ni);

        if (rc != E_NTFS_NONRESIDENT)
                folio_unlock(folio);

        mapping_set_error(mapping, rc);

        return rc;
}

/*
 * ntfs_writepages - Write back dirty pages of a mapping.
 *
 * Resident inodes go through write_cache_pages() with
 * ntfs_resident_writepage() as the per-folio callback (the mapping itself
 * is passed as callback data); all other inodes use mpage_writepages().
 *
 * Return: -EIO after a forced shutdown, otherwise the helper's result.
 */
static int ntfs_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
{
        struct inode *inode = mapping->host;

        if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
                return -EIO;

        if (!is_resident(ntfs_i(inode)))
                return mpage_writepages(mapping, wbc, ntfs_get_block);

        return write_cache_pages(mapping, wbc, ntfs_resident_writepage,
                                 mapping);
}

/*
 * get_block callback used by the write_begin paths.
 *
 * Turns the block index into a byte offset (shift by inode->i_blkbits)
 * and forwards to ntfs_get_block_vbo() in GET_BLOCK_WRITE_BEGIN mode.
 */
static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn,
                                      struct buffer_head *bh_result, int create)
{
        u64 vbo = (u64)vbn << inode->i_blkbits;

        return ntfs_get_block_vbo(inode, vbo, bh_result, create,
                                  GET_BLOCK_WRITE_BEGIN);
}

/*
 * ntfs_write_begin - Prepare a page for a buffered write.
 *
 * For a resident inode the page is grabbed from the page cache and filled
 * with attr_data_read_resident() under ni_lock. On success that page is
 * returned through @pagep. If the helper reports E_NTFS_NONRESIDENT the
 * page is released and the generic block_write_begin() path is used, as
 * it is for all non-resident inodes. Any other error releases the page
 * and is returned.
 *
 * *@pagep is set to NULL before any of the paths above is taken.
 *
 * Return: 0 on success, -EIO after a forced shutdown, -ENOMEM if no page
 * could be grabbed, or the error of the helper that failed.
 */
int ntfs_write_begin(struct file *file, struct address_space *mapping,
                     loff_t pos, u32 len, struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;
        struct ntfs_inode *ni = ntfs_i(inode);
        int err;

        if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
                return -EIO;

        *pagep = NULL;

        if (is_resident(ni)) {
                struct page *pg;

                pg = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT);
                if (!pg)
                        return -ENOMEM;

                ni_lock(ni);
                err = attr_data_read_resident(ni, pg);
                ni_unlock(ni);

                if (!err) {
                        /* Page stays locked and referenced for the caller. */
                        *pagep = pg;
                        return 0;
                }

                unlock_page(pg);
                put_page(pg);

                if (err != E_NTFS_NONRESIDENT)
                        return err;
        }

        return block_write_begin(mapping, pos, len, pagep,
                                 ntfs_get_block_write_begin);
}

/*
 * ntfs_write_end - Address_space_operations::write_end.
 *
 * Resident inodes: the page is written with attr_data_write_resident()
 * under ni_lock; on success any buffers attached to the page are made
 * clean, unmapped and uptodate, the page is marked uptodate and @copied
 * becomes the result. The page is unlocked and released in every case.
 * Non-resident inodes are handled by generic_write_end().
 *
 * On a non-negative result the inode is marked dirty if any of these
 * happened: resident data was written, FILE_ATTRIBUTE_ARCHIVE had to be
 * set (mtime/ctime are updated then), ni->i_valid changed since entry,
 * or the write extended the inode size.
 *
 * NOTE(review): a non-zero result of attr_data_write_resident() is passed
 * on unchanged, so a non-negative status code from it would be treated as
 * a byte count below - confirm this cannot happen.
 *
 * Return: number of bytes accepted, or an error code.
 */
int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
                   u32 len, u32 copied, struct page *page, void *fsdata)
{
        struct inode *inode = mapping->host;
        struct ntfs_inode *ni = ntfs_i(inode);
        const u64 valid_on_entry = ni->i_valid;
        bool need_dirty = false;
        int err;

        if (!is_resident(ni)) {
                err = generic_write_end(file, mapping, pos, len, copied, page,
                                        fsdata);
        } else {
                ni_lock(ni);
                err = attr_data_write_resident(ni, page);
                ni_unlock(ni);

                if (!err) {
                        need_dirty = true;

                        /* Clear any buffers in page. */
                        if (page_has_buffers(page)) {
                                struct buffer_head *first = page_buffers(page);
                                struct buffer_head *bh = first;

                                do {
                                        clear_buffer_dirty(bh);
                                        clear_buffer_mapped(bh);
                                        set_buffer_uptodate(bh);
                                        bh = bh->b_this_page;
                                } while (bh != first);
                        }

                        SetPageUptodate(page);
                        err = copied;
                }

                unlock_page(page);
                put_page(page);
        }

        if (err < 0)
                return err;

        if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) {
                inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
                ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
                need_dirty = true;
        }

        /* ni->i_valid is changed in ntfs_get_block_vbo. */
        if (valid_on_entry != ni->i_valid)
                need_dirty = true;

        if (pos + err > inode->i_size) {
                i_size_write(inode, pos + err);
                need_dirty = true;
        }

        if (need_dirty)
                mark_inode_dirty(inode);

        return err;
}

/*
 * reset_log_file - Overwrite the whole file with 0xFF bytes.
 *
 * Walks the file page by page (the last chunk may be shorter), using
 * block_write_begin()/block_write_end() around a memset of each page.
 * Dirty-page throttling is invoked between chunks. The inode is marked
 * dirty (sync) on every exit path.
 *
 * NOTE(review): on the success path the value returned is the result of
 * the last block_write_end() call rather than an explicit 0 - confirm
 * callers expect that.
 *
 * Return: error from block_write_begin()/block_write_end(), or the last
 * block_write_end() result.
 */
int reset_log_file(struct inode *inode)
{
        struct address_space *mapping = inode->i_mapping;
        u32 log_size = inode->i_size;
        loff_t pos = 0;
        int err;

        while (1) {
                struct page *page;
                void *kaddr;
                u32 len;

                /* Full page, or whatever is left up to log_size. */
                if (pos + PAGE_SIZE > log_size)
                        len = log_size - pos;
                else
                        len = PAGE_SIZE;

                err = block_write_begin(mapping, pos, len, &page,
                                        ntfs_get_block_write_begin);
                if (err)
                        break;

                kaddr = kmap_atomic(page);
                memset(kaddr, -1, len);
                kunmap_atomic(kaddr);
                flush_dcache_page(page);

                err = block_write_end(NULL, mapping, pos, len, len, page, NULL);
                if (err < 0)
                        break;

                pos += len;
                if (pos >= log_size)
                        break;

                balance_dirty_pages_ratelimited(mapping);
        }

        mark_inode_dirty_sync(inode);

        return err;
}

/*
 * ntfs3_write_inode - Write the inode via _ni_write_inode().
 *
 * The 'sync' argument of _ni_write_inode() is non-zero only when the
 * writeback request uses WB_SYNC_ALL.
 */
int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc)
{
        const int sync = wbc->sync_mode == WB_SYNC_ALL;

        return _ni_write_inode(inode, sync);
}

/*
 * ntfs_sync_inode - Write the inode via _ni_write_inode() with sync set.
 */
int ntfs_sync_inode(struct inode *inode)
{
        const int sync = 1;

        return _ni_write_inode(inode, sync);
}

/*
 * writeback_inode - Helper function for ntfs_flush_inodes().
 *
 * Writes the inode metadata first (sync_inode_metadata() with wait == 0)
 * and, only if that succeeded, starts writeback of the file data with
 * filemap_fdatawrite().
 *
 * This writes both the inode and the file data blocks, waiting
 * for in flight data blocks before the start of the call.  It
 * does not wait for any io started during the call.
 *
 * Return: first error encountered, or 0.
 */
static int writeback_inode(struct inode *inode)
{
        int ret;

        ret = sync_inode_metadata(inode, 0);
        if (ret)
                return ret;

        return filemap_fdatawrite(inode->i_mapping);
}

/*
 * ntfs_flush_inodes
 *
 * Write data and metadata corresponding to i1 and i2 (either may be
 * NULL), then start writeback of the block device with
 * sync_blockdev_nowait().  The io is started but we do not wait for any
 * of it to finish.
 *
 * Processing stops at the first step that reports an error.
 *
 * Return: first error encountered, or 0.
 */
int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
                      struct inode *i2)
{
        int ret;

        if (i1) {
                ret = writeback_inode(i1);
                if (ret)
                        return ret;
        }

        if (i2) {
                ret = writeback_inode(i2);
                if (ret)
                        return ret;
        }

        return sync_blockdev_nowait(sb->s_bdev);
}

/*
 * inode_write_data - Copy a buffer into the inode's page cache pages.
 *
 * @inode: target inode.
 * @data:  source buffer.
 * @bytes: number of bytes to copy.
 *
 * Starting at page index 0, each page is obtained with ntfs_map_page(),
 * locked, filled with up to PAGE_SIZE bytes from @data and released with
 * ntfs_unmap_page(). A page that is not uptodate on entry triggers a
 * WARN_ON.
 *
 * Return: 0 on success, or the error carried by ntfs_map_page().
 */
int inode_write_data(struct inode *inode, const void *data, size_t bytes)
{
        pgoff_t idx = 0;

        /* Write non resident data. */
        while (bytes) {
                struct page *page = ntfs_map_page(inode->i_mapping, idx);
                size_t chunk = bytes > PAGE_SIZE ? PAGE_SIZE : bytes;

                if (IS_ERR(page))
                        return PTR_ERR(page);

                lock_page(page);
                WARN_ON(!PageUptodate(page));
                ClearPageUptodate(page);

                memcpy(page_address(page), data, chunk);

                flush_dcache_page(page);
                SetPageUptodate(page);
                unlock_page(page);

                ntfs_unmap_page(page);

                /* Move on to the next page worth of source data. */
                bytes -= chunk;
                data = Add2Ptr(data, PAGE_SIZE);
                idx++;
        }

        return 0;
}

/*
 * ntfs_reparse_bytes
 *
 * Number of bytes for REPARSE_DATA_BUFFER(IO_REPARSE_TAG_SYMLINK)
 * for unicode string of @uni_len length.
 *
 * The total is the fixed part of the buffer up to PathBuffer plus room
 * for 2 * @uni_len + 4 16-bit characters: the string is stored twice and
 * the second (decorated) copy carries 4 extra characters.
 */
static inline u32 ntfs_reparse_bytes(u32 uni_len)
{
        /* Unicode string + decorated unicode string, in characters. */
        const u32 nchars = 2 * uni_len + 4;
        /* Everything in the buffer that precedes PathBuffer. */
        const size_t hdr = offsetof(struct REPARSE_DATA_BUFFER,
                                    SymbolicLinkReparseBuffer.PathBuffer);

        return sizeof(short) * nchars + hdr;
}

/*
 * ntfs_create_reparse_buffer
 *
 * Build an IO_REPARSE_TAG_SYMLINK reparse buffer for the link target
 * @symname.
 *
 * @sbi:     filesystem info; supplies the nls conversion context and
 *           sbi->reparse.max_size.
 * @symname: link target as passed in by the caller.
 * @size:    length of @symname as given to ntfs_nls_to_utf16().
 * @nsize:   out: total size in bytes of the resulting reparse buffer.
 *
 * PathBuffer ends up holding two copies of the converted name: the plain
 * one at offset 0 (print name) and, right after it, the same name prefixed
 * with the four characters "\??\" (substitute name).
 *
 * Return: the kzalloc'ed buffer, or ERR_PTR(-ENOMEM), ERR_PTR(-EFBIG) when
 * the buffer would exceed sbi->reparse.max_size, or ERR_PTR() of the
 * negative value returned by ntfs_nls_to_utf16(). The buffer is freed here
 * on every error path.
 */
static struct REPARSE_DATA_BUFFER *
ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname,
                           u32 size, u16 *nsize)
{
        int i, err;
        struct REPARSE_DATA_BUFFER *rp;
        __le16 *rp_name;
        typeof(rp->SymbolicLinkReparseBuffer) *rs;

        /*
         * Worst-case allocation: sized for 2 * size + 2 unicode characters.
         * The buffer is zeroed, so fields not assigned below stay 0.
         */
        rp = kzalloc(ntfs_reparse_bytes(2 * size + 2), GFP_NOFS);
        if (!rp)
                return ERR_PTR(-ENOMEM);

        rs = &rp->SymbolicLinkReparseBuffer;
        rp_name = rs->PathBuffer;

        /*
         * Convert link name to UTF-16, directly into PathBuffer.
         * The destination is passed as a cpu_str starting one 16-bit unit
         * before PathBuffer - presumably so that the cpu_str header overlays
         * that unit and the characters land at PathBuffer[0]; TODO confirm
         * against the struct cpu_str definition.
         */
        err = ntfs_nls_to_utf16(sbi, symname, size,
                                (struct cpu_str *)(rp_name - 1), 2 * size,
                                UTF16_LITTLE_ENDIAN);
        if (err < 0)
                goto out;

        /* From here on err = the length of unicode name of symlink. */
        *nsize = ntfs_reparse_bytes(err);

        if (*nsize > sbi->reparse.max_size) {
                err = -EFBIG;
                goto out;
        }

        /* Translate Linux '/' into Windows '\'. */
        for (i = 0; i < err; i++) {
                if (rp_name[i] == cpu_to_le16('/'))
                        rp_name[i] = cpu_to_le16('\\');
        }

        rp->ReparseTag = IO_REPARSE_TAG_SYMLINK;
        /* Data length excludes everything before SymbolicLinkReparseBuffer. */
        rp->ReparseDataLength =
                cpu_to_le16(*nsize - offsetof(struct REPARSE_DATA_BUFFER,
                                              SymbolicLinkReparseBuffer));

        /*
         * PrintName + SubstituteName.
         * The substitute name starts right after the print name and is
         * 8 bytes (4 characters of prefix) longer. PrintNameOffset is not
         * assigned and keeps the 0 from kzalloc.
         */
        rs->SubstituteNameOffset = cpu_to_le16(sizeof(short) * err);
        rs->SubstituteNameLength = cpu_to_le16(sizeof(short) * err + 8);
        rs->PrintNameLength = rs->SubstituteNameOffset;

        /*
         * TODO: Use relative path if possible to allow Windows to
         * parse this path.
         * 0-absolute path 1- relative path (SYMLINK_FLAG_RELATIVE).
         */
        rs->Flags = 0;

        /*
         * Duplicate the name behind itself, leaving a gap of 4 characters
         * for the prefix written below.
         */
        memmove(rp_name + err + 4, rp_name, sizeof(short) * err);

        /* Decorate SubstituteName: fill the gap with "\??\". */
        rp_name += err;
        rp_name[0] = cpu_to_le16('\\');
        rp_name[1] = cpu_to_le16('?');
        rp_name[2] = cpu_to_le16('?');
        rp_name[3] = cpu_to_le16('\\');

        return rp;
out:
        kfree(rp);
        return ERR_PTR(err);
}

/*
 * ntfs_create_inode
 *
 * Helper function for:
 * - ntfs_create
 * - ntfs_mknod
 * - ntfs_symlink
 * - ntfs_mkdir
 * - ntfs_atomic_open
 *
 * NOTE: if fnd != NULL (ntfs_atomic_open) then @dir is locked
 */
int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
                      struct dentry *dentry, const struct cpu_str *uni,
                      umode_t mode, dev_t dev, const char *symname, u32 size,
                      struct ntfs_fnd *fnd)
{
        int err;
        struct super_block *sb = dir->i_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        const struct qstr *name = &dentry->d_name;
        CLST ino = 0;
        struct ntfs_inode *dir_ni = ntfs_i(dir);
        struct ntfs_inode *ni = NULL;
        struct inode *inode = NULL;
        struct ATTRIB *attr;
        struct ATTR_STD_INFO5 *std5;
        struct ATTR_FILE_NAME *fname;
        struct MFT_REC *rec;
        u32 asize, dsize, sd_size;
        enum FILE_ATTRIBUTE fa;
        __le32 security_id = SECURITY_ID_INVALID;
        CLST vcn;
        const void *sd;
        u16 t16, nsize = 0, aid = 0;
        struct INDEX_ROOT *root, *dir_root;
        struct NTFS_DE *e, *new_de = NULL;
        struct REPARSE_DATA_BUFFER *rp = NULL;
        bool rp_inserted = false;

        /* New file will be resident or non resident. */
        const bool new_file_resident = 1;

        if (!fnd)
                ni_lock_dir(dir_ni);

        dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL);
        if (!dir_root) {
                err = -EINVAL;
                goto out1;
        }

        if (S_ISDIR(mode)) {
                /* Use parent's directory attributes. */
                fa = dir_ni->std_fa | FILE_ATTRIBUTE_DIRECTORY |
                     FILE_ATTRIBUTE_ARCHIVE;
                /*
                 * By default child directory inherits parent attributes.
                 * Root directory is hidden + system.
                 * Make an exception for children in root.
                 */
                if (dir->i_ino == MFT_REC_ROOT)
                        fa &= ~(FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_SYSTEM);
        } else if (S_ISLNK(mode)) {
                /* It is good idea that link should be the same type (file/dir) as target */
                fa = FILE_ATTRIBUTE_REPARSE_POINT;

                /*
                 * Linux: there are dir/file/symlink and so on.
                 * NTFS: symlinks are "dir + reparse" or "file + reparse"
                 * It is good idea to create:
                 * dir + reparse if 'symname' points to directory
                 * or
                 * file + reparse if 'symname' points to file
                 * Unfortunately kern_path hangs if symname contains 'dir'.
                 */

                /*
                 *        struct path path;
                 *
                 *        if (!kern_path(symname, LOOKUP_FOLLOW, &path)){
                 *                struct inode *target = d_inode(path.dentry);
                 *
                 *                if (S_ISDIR(target->i_mode))
                 *                        fa |= FILE_ATTRIBUTE_DIRECTORY;
                 *                // if ( target->i_sb == sb ){
                 *                //        use relative path?
                 *                // }
                 *                path_put(&path);
                 *        }
                 */
        } else if (S_ISREG(mode)) {
                if (sbi->options->sparse) {
                        /* Sparsed regular file, cause option 'sparse'. */
                        fa = FILE_ATTRIBUTE_SPARSE_FILE |
                             FILE_ATTRIBUTE_ARCHIVE;
                } else if (dir_ni->std_fa & FILE_ATTRIBUTE_COMPRESSED) {
                        /* Compressed regular file, if parent is compressed. */
                        fa = FILE_ATTRIBUTE_COMPRESSED | FILE_ATTRIBUTE_ARCHIVE;
                } else {
                        /* Regular file, default attributes. */
                        fa = FILE_ATTRIBUTE_ARCHIVE;
                }
        } else {
                fa = FILE_ATTRIBUTE_ARCHIVE;
        }

        /* If option "hide_dot_files" then set hidden attribute for dot files. */
        if (sbi->options->hide_dot_files && name->name[0] == '.')
                fa |= FILE_ATTRIBUTE_HIDDEN;

        if (!(mode & 0222))
                fa |= FILE_ATTRIBUTE_READONLY;

        /* Allocate PATH_MAX bytes. */
        new_de = __getname();
        if (!new_de) {
                err = -ENOMEM;
                goto out1;
        }

        if (unlikely(ntfs3_forced_shutdown(sb))) {
                err = -EIO;
                goto out2;
        }

        /* Mark rw ntfs as dirty. it will be cleared at umount. */
        ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);

        /* Step 1: allocate and fill new mft record. */
        err = ntfs_look_free_mft(sbi, &ino, false, NULL, NULL);
        if (err)
                goto out2;

        ni = ntfs_new_inode(sbi, ino, S_ISDIR(mode) ? RECORD_FLAG_DIR : 0);
        if (IS_ERR(ni)) {
                err = PTR_ERR(ni);
                ni = NULL;
                goto out3;
        }
        inode = &ni->vfs_inode;
        inode_init_owner(idmap, inode, dir, mode);
        mode = inode->i_mode;

        ni->i_crtime = current_time(inode);

        rec = ni->mi.mrec;
        rec->hard_links = cpu_to_le16(1);
        attr = Add2Ptr(rec, le16_to_cpu(rec->attr_off));

        /* Get default security id. */
        sd = s_default_security;
        sd_size = sizeof(s_default_security);

        if (is_ntfs3(sbi)) {
                security_id = dir_ni->std_security_id;
                if (le32_to_cpu(security_id) < SECURITY_ID_FIRST) {
                        security_id = sbi->security.def_security_id;

                        if (security_id == SECURITY_ID_INVALID &&
                            !ntfs_insert_security(sbi, sd, sd_size,
                                                  &security_id, NULL))
                                sbi->security.def_security_id = security_id;
                }
        }

        /* Insert standard info. */
        std5 = Add2Ptr(attr, SIZEOF_RESIDENT);

        if (security_id == SECURITY_ID_INVALID) {
                dsize = sizeof(struct ATTR_STD_INFO);
        } else {
                dsize = sizeof(struct ATTR_STD_INFO5);
                std5->security_id = security_id;
                ni->std_security_id = security_id;
        }
        asize = SIZEOF_RESIDENT + dsize;

        attr->type = ATTR_STD;
        attr->size = cpu_to_le32(asize);
        attr->id = cpu_to_le16(aid++);
        attr->res.data_off = SIZEOF_RESIDENT_LE;
        attr->res.data_size = cpu_to_le32(dsize);

        std5->cr_time = std5->m_time = std5->c_time = std5->a_time =
                kernel2nt(&ni->i_crtime);

        std5->fa = ni->std_fa = fa;

        attr = Add2Ptr(attr, asize);

        /* Insert file name. */
        err = fill_name_de(sbi, new_de, name, uni);
        if (err)
                goto out4;

        mi_get_ref(&ni->mi, &new_de->ref);

        fname = (struct ATTR_FILE_NAME *)(new_de + 1);

        if (sbi->options->windows_names &&
            !valid_windows_name(sbi, (struct le_str *)&fname->name_len)) {
                err = -EINVAL;
                goto out4;
        }

        mi_get_ref(&dir_ni->mi, &fname->home);
        fname->dup.cr_time = fname->dup.m_time = fname->dup.c_time =
                fname->dup.a_time = std5->cr_time;
        fname->dup.alloc_size = fname->dup.data_size = 0;
        fname->dup.fa = std5->fa;
        fname->dup.ea_size = fname->dup.reparse = 0;

        dsize = le16_to_cpu(new_de->key_size);
        asize = ALIGN(SIZEOF_RESIDENT + dsize, 8);

        attr->type = ATTR_NAME;
        attr->size = cpu_to_le32(asize);
        attr->res.data_off = SIZEOF_RESIDENT_LE;
        attr->res.flags = RESIDENT_FLAG_INDEXED;
        attr->id = cpu_to_le16(aid++);
        attr->res.data_size = cpu_to_le32(dsize);
        memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), fname, dsize);

        attr = Add2Ptr(attr, asize);

        if (security_id == SECURITY_ID_INVALID) {
                /* Insert security attribute. */
                asize = SIZEOF_RESIDENT + ALIGN(sd_size, 8);

                attr->type = ATTR_SECURE;
                attr->size = cpu_to_le32(asize);
                attr->id = cpu_to_le16(aid++);
                attr->res.data_off = SIZEOF_RESIDENT_LE;
                attr->res.data_size = cpu_to_le32(sd_size);
                memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), sd, sd_size);

                attr = Add2Ptr(attr, asize);
        }

        attr->id = cpu_to_le16(aid++);
        if (fa & FILE_ATTRIBUTE_DIRECTORY) {
                /*
                 * Regular directory or symlink to directory.
                 * Create root attribute.
                 */
                dsize = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE);
                asize = sizeof(I30_NAME) + SIZEOF_RESIDENT + dsize;

                attr->type = ATTR_ROOT;
                attr->size = cpu_to_le32(asize);

                attr->name_len = ARRAY_SIZE(I30_NAME);
                attr->name_off = SIZEOF_RESIDENT_LE;
                attr->res.data_off =
                        cpu_to_le16(sizeof(I30_NAME) + SIZEOF_RESIDENT);
                attr->res.data_size = cpu_to_le32(dsize);
                memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), I30_NAME,
                       sizeof(I30_NAME));

                root = Add2Ptr(attr, sizeof(I30_NAME) + SIZEOF_RESIDENT);
                memcpy(root, dir_root, offsetof(struct INDEX_ROOT, ihdr));
                root->ihdr.de_off = cpu_to_le32(sizeof(struct INDEX_HDR));
                root->ihdr.used = cpu_to_le32(sizeof(struct INDEX_HDR) +
                                              sizeof(struct NTFS_DE));
                root->ihdr.total = root->ihdr.used;

                e = Add2Ptr(root, sizeof(struct INDEX_ROOT));
                e->size = cpu_to_le16(sizeof(struct NTFS_DE));
                e->flags = NTFS_IE_LAST;
        } else if (S_ISLNK(mode)) {
                /*
                 * Symlink to file.
                 * Create empty resident data attribute.
                 */
                asize = SIZEOF_RESIDENT;

                /* Insert empty ATTR_DATA */
                attr->type = ATTR_DATA;
                attr->size = cpu_to_le32(SIZEOF_RESIDENT);
                attr->name_off = SIZEOF_RESIDENT_LE;
                attr->res.data_off = SIZEOF_RESIDENT_LE;
        } else if (!new_file_resident && S_ISREG(mode)) {
                /*
                 * Regular file. Create empty non resident data attribute.
                 */
                attr->type = ATTR_DATA;
                attr->non_res = 1;
                attr->nres.evcn = cpu_to_le64(-1ll);
                if (fa & FILE_ATTRIBUTE_SPARSE_FILE) {
                        attr->size = cpu_to_le32(SIZEOF_NONRESIDENT_EX + 8);
                        attr->name_off = SIZEOF_NONRESIDENT_EX_LE;
                        attr->flags = ATTR_FLAG_SPARSED;
                        asize = SIZEOF_NONRESIDENT_EX + 8;
                } else if (fa & FILE_ATTRIBUTE_COMPRESSED) {
                        attr->size = cpu_to_le32(SIZEOF_NONRESIDENT_EX + 8);
                        attr->name_off = SIZEOF_NONRESIDENT_EX_LE;
                        attr->flags = ATTR_FLAG_COMPRESSED;
                        attr->nres.c_unit = COMPRESSION_UNIT;
                        asize = SIZEOF_NONRESIDENT_EX + 8;
                } else {
                        attr->size = cpu_to_le32(SIZEOF_NONRESIDENT + 8);
                        attr->name_off = SIZEOF_NONRESIDENT_LE;
                        asize = SIZEOF_NONRESIDENT + 8;
                }
                attr->nres.run_off = attr->name_off;
        } else {
                /*
                 * Node. Create empty resident data attribute.
                 */
                attr->type = ATTR_DATA;
                attr->size = cpu_to_le32(SIZEOF_RESIDENT);
                attr->name_off = SIZEOF_RESIDENT_LE;
                if (fa & FILE_ATTRIBUTE_SPARSE_FILE)
                        attr->flags = ATTR_FLAG_SPARSED;
                else if (fa & FILE_ATTRIBUTE_COMPRESSED)
                        attr->flags = ATTR_FLAG_COMPRESSED;
                attr->res.data_off = SIZEOF_RESIDENT_LE;
                asize = SIZEOF_RESIDENT;
                ni->ni_flags |= NI_FLAG_RESIDENT;
        }

        if (S_ISDIR(mode)) {
                ni->ni_flags |= NI_FLAG_DIR;
                err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30);
                if (err)
                        goto out4;
        } else if (S_ISLNK(mode)) {
                rp = ntfs_create_reparse_buffer(sbi, symname, size, &nsize);

                if (IS_ERR(rp)) {
                        err = PTR_ERR(rp);
                        rp = NULL;
                        goto out4;
                }

                /*
                 * Insert ATTR_REPARSE.
                 */
                attr = Add2Ptr(attr, asize);
                attr->type = ATTR_REPARSE;
                attr->id = cpu_to_le16(aid++);

                /* Resident or non resident? */
                asize = ALIGN(SIZEOF_RESIDENT + nsize, 8);
                t16 = PtrOffset(rec, attr);

                /*
                 * Below function 'ntfs_save_wsl_perm' requires 0x78 bytes.
                 * It is good idea to keep extened attributes resident.
                 */
                if (asize + t16 + 0x78 + 8 > sbi->record_size) {
                        CLST alen;
                        CLST clst = bytes_to_cluster(sbi, nsize);

                        /* Bytes per runs. */
                        t16 = sbi->record_size - t16 - SIZEOF_NONRESIDENT;

                        attr->non_res = 1;
                        attr->nres.evcn = cpu_to_le64(clst - 1);
                        attr->name_off = SIZEOF_NONRESIDENT_LE;
                        attr->nres.run_off = attr->name_off;
                        attr->nres.data_size = cpu_to_le64(nsize);
                        attr->nres.valid_size = attr->nres.data_size;
                        attr->nres.alloc_size =
                                cpu_to_le64(ntfs_up_cluster(sbi, nsize));

                        err = attr_allocate_clusters(sbi, &ni->file.run, 0, 0,
                                                     clst, NULL, ALLOCATE_DEF,
                                                     &alen, 0, NULL, NULL);
                        if (err)
                                goto out5;

                        err = run_pack(&ni->file.run, 0, clst,
                                       Add2Ptr(attr, SIZEOF_NONRESIDENT), t16,
                                       &vcn);
                        if (err < 0)
                                goto out5;

                        if (vcn != clst) {
                                err = -EINVAL;
                                goto out5;
                        }

                        asize = SIZEOF_NONRESIDENT + ALIGN(err, 8);
                        /* Write non resident data. */
                        err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp,
                                                nsize, 0);
                        if (err)
                                goto out5;
                } else {
                        attr->res.data_off = SIZEOF_RESIDENT_LE;
                        attr->res.data_size = cpu_to_le32(nsize);
                        memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize);
                }
                /* Size of symlink equals the length of input string. */
                inode->i_size = size;

                attr->size = cpu_to_le32(asize);

                err = ntfs_insert_reparse(sbi, IO_REPARSE_TAG_SYMLINK,
                                          &new_de->ref);
                if (err)
                        goto out5;

                rp_inserted = true;
        }

        attr = Add2Ptr(attr, asize);
        attr->type = ATTR_END;

        rec->used = cpu_to_le32(PtrOffset(rec, attr) + 8);
        rec->next_attr_id = cpu_to_le16(aid);

        inode->i_generation = le16_to_cpu(rec->seq);

        if (S_ISDIR(mode)) {
                inode->i_op = &ntfs_dir_inode_operations;
                if (is_legacy_ntfs(inode->i_sb))
                        inode->i_fop = &ntfs_legacy_dir_operations;
                else
                        inode->i_fop = &ntfs_dir_operations;
        } else if (S_ISLNK(mode)) {
                inode->i_op = &ntfs_link_inode_operations;
                inode->i_fop = NULL;
                inode->i_mapping->a_ops = &ntfs_aops;
                inode->i_size = size;
                inode_nohighmem(inode);
        } else if (S_ISREG(mode)) {
                inode->i_op = &ntfs_file_inode_operations;
                if (is_legacy_ntfs(inode->i_sb))
                        inode->i_fop = &ntfs_legacy_file_operations;
                else
                        inode->i_fop = &ntfs_file_operations;
                inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr :
                                                              &ntfs_aops;
                init_rwsem(&ni->file.run_lock);
        } else {
                inode->i_op = &ntfs_special_inode_operations;
                init_special_inode(inode, mode, dev);
        }

#ifdef CONFIG_NTFS3_FS_POSIX_ACL
        if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) {
                err = ntfs_init_acl(idmap, inode, dir);
                if (err)
                        goto out5;
        } else
#endif
        {
                inode->i_flags |= S_NOSEC;
        }

        /*
         * ntfs_init_acl and ntfs_save_wsl_perm update extended attribute.
         * The packed size of extended attribute is stored in direntry too.
         * 'fname' here points to inside new_de.
         */
        ntfs_save_wsl_perm(inode, &fname->dup.ea_size);

        /*
         * update ea_size in file_name attribute too.
         * Use ni_find_attr cause layout of MFT record may be changed
         * in ntfs_init_acl and ntfs_save_wsl_perm.
         */
        attr = ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL);
        if (attr) {
                struct ATTR_FILE_NAME *fn;

                fn = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
                if (fn)
                        fn->dup.ea_size = fname->dup.ea_size;
        }

        /* We do not need to update parent directory later */
        ni->ni_flags &= ~NI_FLAG_UPDATE_PARENT;

        /* Step 2: Add new name in index. */
        err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0);
        if (err)
                goto out6;

        /*
         * Call 'd_instantiate' after inode->i_op is set
         * but before finish_open.
         */
        d_instantiate(dentry, inode);

        /* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */
        inode_set_atime_to_ts(inode, ni->i_crtime);
        inode_set_ctime_to_ts(inode, ni->i_crtime);
        inode_set_mtime_to_ts(inode, ni->i_crtime);
        inode_set_mtime_to_ts(dir, ni->i_crtime);
        inode_set_ctime_to_ts(dir, ni->i_crtime);

        mark_inode_dirty(dir);
        mark_inode_dirty(inode);

        /* Normal exit. */
        goto out2;

out6:
        if (rp_inserted)
                ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref);

out5:
        if (!S_ISDIR(mode))
                run_deallocate(sbi, &ni->file.run, false);

out4:
        clear_rec_inuse(rec);
        clear_nlink(inode);
        ni->mi.dirty = false;
        discard_new_inode(inode);
out3:
        ntfs_mark_rec_free(sbi, ino, false);

out2:
        __putname(new_de);
        kfree(rp);

out1:
        if (!fnd)
                ni_unlock(dir_ni);

        if (!err)
                unlock_new_inode(inode);

        return err;
}

/*
 * ntfs_link_inode
 *
 * Add the name carried by @dentry as an additional name of @inode inside
 * the directory that is the parent of @dentry.
 *
 * Return: 0 on success, -ENOMEM if the scratch entry cannot be allocated,
 * otherwise the error reported by fill_name_de() or ni_add_name().
 */
int ntfs_link_inode(struct inode *inode, struct dentry *dentry)
{
        struct ntfs_inode *ni = ntfs_i(inode);
        struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
        /* Scratch directory entry; __getname() hands out PATH_MAX bytes. */
        struct NTFS_DE *name_de = __getname();
        int err;

        if (!name_de)
                return -ENOMEM;

        /* Mark rw ntfs as dirty. It will be cleared at umount. */
        ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);

        /* Build the directory entry from the dentry name, then link it. */
        err = fill_name_de(sbi, name_de, &dentry->d_name, NULL);
        if (!err) {
                struct ntfs_inode *parent_ni =
                        ntfs_i(d_inode(dentry->d_parent));

                err = ni_add_name(parent_ni, ni, name_de);
        }

        __putname(name_de);
        return err;
}

/*
 * ntfs_unlink_inode
 *
 * inode_operations::unlink
 * inode_operations::rmdir
 *
 * Remove the name described by @dentry from directory @dir. Meta files are
 * refused and a directory must be empty before it can be removed. If
 * ni_remove_name() fails, ni_remove_name_undo() is called; when that
 * returns zero the inode is marked bad.
 *
 * Return: 0 on success, -EINVAL for a meta file, -ENOMEM if the scratch
 * entry cannot be allocated, -ENOTEMPTY for a non-empty directory,
 * otherwise the error from fill_name_de() or ni_remove_name().
 */
int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry)
{
        struct ntfs_sb_info *sbi = dir->i_sb->s_fs_info;
        struct inode *inode = d_inode(dentry);
        struct ntfs_inode *ni = ntfs_i(inode);
        struct ntfs_inode *dir_ni = ntfs_i(dir);
        struct NTFS_DE *de, *de2 = NULL;
        int undo_remove = 0;
        int err;

        if (ntfs_is_meta_file(sbi, ni->mi.rno))
                return -EINVAL;

        /* Scratch directory entry; __getname() hands out PATH_MAX bytes. */
        de = __getname();
        if (!de)
                return -ENOMEM;

        ni_lock(ni);

        if (S_ISDIR(inode->i_mode) && !dir_is_empty(inode)) {
                err = -ENOTEMPTY;
                goto out;
        }

        err = fill_name_de(sbi, de, &dentry->d_name, NULL);
        if (err < 0)
                goto out;

        err = ni_remove_name(dir_ni, ni, de, &de2, &undo_remove);
        if (err) {
                /* Removal failed: try to roll back what was already done. */
                if (!ni_remove_name_undo(dir_ni, ni, de, de2, undo_remove)) {
                        _ntfs_bad_inode(inode);
                } else {
                        if (ni_is_dirty(dir))
                                mark_inode_dirty(dir);
                        if (ni_is_dirty(inode))
                                mark_inode_dirty(inode);
                }
                goto out;
        }

        /* Name is gone: fix up link count and timestamps. */
        drop_nlink(inode);
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        mark_inode_dirty(dir);
        inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
        if (inode->i_nlink)
                mark_inode_dirty(inode);

out:
        ni_unlock(ni);
        __putname(de);
        return err;
}

/*
 * ntfs_evict_inode
 *
 * Tear down @inode: truncate the pages of its data mapping, invalidate its
 * buffers, clear the VFS inode and finally hand the ntfs part to ni_clear().
 */
void ntfs_evict_inode(struct inode *inode)
{
        struct address_space *mapping = &inode->i_data;

        truncate_inode_pages_final(mapping);
        invalidate_inode_buffers(inode);
        clear_inode(inode);

        /* The ntfs-private state is released after the generic teardown. */
        ni_clear(ntfs_i(inode));
}

/*
 * ntfs_translate_junction
 *
 * Rewrite a Windows junction target, in place, into a Linux path that is
 * relative to the location of the link.
 *
 * Junction targets are always absolute (they carry a drive letter). There
 * is no way to tell whether the target lives on the currently mounted
 * device, so it is simply assumed that it does: the drive prefix is
 * dropped and replaced by "./" followed by one "../" for every '/' found
 * in the link path after its leading one.
 *
 * @target holds @target_len characters on entry and has room for
 * @target_max bytes.
 *
 * Return: length of the translated string stored in @target, -ENOMEM on
 * allocation failure, or -EINVAL if the link path cannot be obtained, the
 * target has no ':' separator, or a buffer would overflow.
 */
static int ntfs_translate_junction(const struct super_block *sb,
                                   const struct dentry *link_de, char *target,
                                   int target_len, int target_max)
{
        char *path_buf, *xlat = NULL;
        char *link_path, *tail;
        int xlat_len, tail_len;
        int err = -EINVAL;

        path_buf = kmalloc(PATH_MAX, GFP_NOFS);
        if (!path_buf) {
                err = -ENOMEM;
                goto out;
        }

        /* Get link path, relative to mount point */
        link_path = dentry_path_raw(link_de, path_buf, PATH_MAX);
        if (IS_ERR(link_path)) {
                ntfs_err(sb, "Error getting link path");
                goto out;
        }

        xlat = kmalloc(PATH_MAX, GFP_NOFS);
        if (!xlat) {
                err = -ENOMEM;
                goto out;
        }

        /* Start from the link's own directory ... */
        strcpy(xlat, "./");
        xlat_len = sizeof("./") - 1;

        /* ... and climb one level per separator, skipping the leading '/'. */
        for (++link_path; *link_path; ++link_path) {
                if (*link_path != '/')
                        continue;

                if (PATH_MAX - xlat_len < sizeof("../")) {
                        ntfs_err(sb,
                                 "Link path %s has too many components",
                                 link_path);
                        goto out;
                }
                strcpy(xlat + xlat_len, "../");
                xlat_len += sizeof("../") - 1;
        }

        /* Skip drive letter */
        for (tail = target; *tail && *tail != ':'; ++tail)
                ;

        if (!*tail) {
                ntfs_err(sb, "Link target (%s) missing drive separator",
                         target);
                goto out;
        }

        /* Skip drive separator and leading /, if exists */
        if (tail[1] == '/')
                tail += 2;
        else
                tail += 1;
        tail_len = target_len - (tail - target);

        if (PATH_MAX - xlat_len <= tail_len) {
                ntfs_err(sb, "Link target %s too large for buffer (%d <= %d)",
                         tail, PATH_MAX - xlat_len, tail_len);
                goto out;
        }

        /* xlat ends with '/', the remaining target does not start with one. */
        strcpy(xlat + xlat_len, tail);
        xlat_len += tail_len;

        if (target_max <= xlat_len) {
                ntfs_err(sb, "Target path %s too large for buffer (%d <= %d)",
                         xlat, target_max, xlat_len);
                goto out;
        }

        strcpy(target, xlat);
        err = xlat_len;

out:
        kfree(path_buf);
        kfree(xlat);
        return err;
}

/*
 * ntfs_readlink_hlp
 *
 * Decode the reparse attribute of @inode into a link target string.
 *
 * The reparse data is taken from the resident attribute or, for a
 * non-resident one, read into a temporary buffer. Depending on the tag
 * the name is taken from the mount point / symlink print name, set to the
 * fixed string "OneDrive" for cloud tags, or taken from the payload of a
 * non-Microsoft name-surrogate tag. The UTF-16 name is converted with
 * ntfs_utf16_to_nls(), '\' is replaced by '/', and junction targets are
 * further rewritten by ntfs_translate_junction().
 *
 * @link_de: dentry of the link, used for junction translation.
 * @inode:   inode carrying the reparse attribute.
 * @buffer:  output buffer; always starts out as an empty string.
 * @buflen:  size of @buffer in bytes.
 *
 * Return: length of the string stored in @buffer, -ENOMEM on allocation
 * failure, -EINVAL when the reparse data is missing, malformed or of an
 * unsupported tag, or the error from ntfs_read_run_nb(),
 * ntfs_utf16_to_nls() or ntfs_translate_junction().
 */
static noinline int ntfs_readlink_hlp(const struct dentry *link_de,
                                      struct inode *inode, char *buffer,
                                      int buflen)
{
        int i, err = -EINVAL;
        struct ntfs_inode *ni = ntfs_i(inode);
        struct super_block *sb = inode->i_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        u64 size;
        u16 ulen = 0;
        void *to_free = NULL;
        struct REPARSE_DATA_BUFFER *rp;
        const __le16 *uname;
        struct ATTRIB *attr;

        /* Reparse data present. Try to parse it. */
        static_assert(!offsetof(struct REPARSE_DATA_BUFFER, ReparseTag));
        static_assert(sizeof(u32) == sizeof(rp->ReparseTag));

        *buffer = 0;

        attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, NULL);
        if (!attr)
                goto out;

        if (!attr->non_res) {
                rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER));
                if (!rp)
                        goto out;
                size = le32_to_cpu(attr->res.data_size);
        } else {
                size = le64_to_cpu(attr->nres.data_size);
                rp = NULL;
        }

        /* The data must at least hold more than the 32-bit tag itself. */
        if (size > sbi->reparse.max_size || size <= sizeof(u32))
                goto out;

        if (!rp) {
                rp = kmalloc(size, GFP_NOFS);
                if (!rp) {
                        err = -ENOMEM;
                        goto out;
                }
                to_free = rp;
                /* Read into temporary buffer. */
                err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, size, NULL);
                if (err)
                        goto out;
        }

        /* Microsoft Tag. */
        switch (rp->ReparseTag) {
        case IO_REPARSE_TAG_MOUNT_POINT:
                /* Mount points and junctions. */
                /* Can we use 'Rp->MountPointReparseBuffer.PrintNameLength'? */
                if (size <= offsetof(struct REPARSE_DATA_BUFFER,
                                     MountPointReparseBuffer.PathBuffer))
                        goto out;
                uname = Add2Ptr(rp,
                                offsetof(struct REPARSE_DATA_BUFFER,
                                         MountPointReparseBuffer.PathBuffer) +
                                        le16_to_cpu(rp->MountPointReparseBuffer
                                                            .PrintNameOffset));
                ulen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength);
                break;

        case IO_REPARSE_TAG_SYMLINK:
                /* FolderSymbolicLink */
                /* Can we use 'Rp->SymbolicLinkReparseBuffer.PrintNameLength'? */
                if (size <= offsetof(struct REPARSE_DATA_BUFFER,
                                     SymbolicLinkReparseBuffer.PathBuffer))
                        goto out;
                uname = Add2Ptr(
                        rp, offsetof(struct REPARSE_DATA_BUFFER,
                                     SymbolicLinkReparseBuffer.PathBuffer) +
                                    le16_to_cpu(rp->SymbolicLinkReparseBuffer
                                                        .PrintNameOffset));
                ulen = le16_to_cpu(
                        rp->SymbolicLinkReparseBuffer.PrintNameLength);
                break;

        case IO_REPARSE_TAG_CLOUD:
        case IO_REPARSE_TAG_CLOUD_1:
        case IO_REPARSE_TAG_CLOUD_2:
        case IO_REPARSE_TAG_CLOUD_3:
        case IO_REPARSE_TAG_CLOUD_4:
        case IO_REPARSE_TAG_CLOUD_5:
        case IO_REPARSE_TAG_CLOUD_6:
        case IO_REPARSE_TAG_CLOUD_7:
        case IO_REPARSE_TAG_CLOUD_8:
        case IO_REPARSE_TAG_CLOUD_9:
        case IO_REPARSE_TAG_CLOUD_A:
        case IO_REPARSE_TAG_CLOUD_B:
        case IO_REPARSE_TAG_CLOUD_C:
        case IO_REPARSE_TAG_CLOUD_D:
        case IO_REPARSE_TAG_CLOUD_E:
        case IO_REPARSE_TAG_CLOUD_F:
                err = sizeof("OneDrive") - 1;
                if (err > buflen)
                        err = buflen;
                memcpy(buffer, "OneDrive", err);
                /*
                 * Only buffer[0] was cleared above and the caller's buffer
                 * is not zero-filled, so terminate the string explicitly
                 * whenever there is room for the terminator.
                 */
                if (err < buflen)
                        buffer[err] = 0;
                goto out;

        default:
                if (IsReparseTagMicrosoft(rp->ReparseTag)) {
                        /* Unknown Microsoft Tag. */
                        goto out;
                }
                if (!IsReparseTagNameSurrogate(rp->ReparseTag) ||
                    size <= sizeof(struct REPARSE_POINT)) {
                        goto out;
                }

                /* Users tag. */
                uname = Add2Ptr(rp, sizeof(struct REPARSE_POINT));
                ulen = le16_to_cpu(rp->ReparseDataLength) -
                       sizeof(struct REPARSE_POINT);
        }

        /* Convert ulen from bytes to UTF-16 code units. */
        ulen >>= 1;

        /* Check that the whole name lies inside the reparse data. */
        if (!ulen || uname + ulen > (__le16 *)Add2Ptr(rp, size))
                goto out;

        /* If name is already zero terminated then truncate it now. */
        if (!uname[ulen - 1])
                ulen -= 1;

        err = ntfs_utf16_to_nls(sbi, uname, ulen, buffer, buflen);

        if (err < 0)
                goto out;

        /* Translate Windows '\' into Linux '/'. */
        for (i = 0; i < err; i++) {
                if (buffer[i] == '\\')
                        buffer[i] = '/';
        }

        /* Always set last zero. */
        buffer[err] = 0;

        /* If this is a junction, translate the link target. */
        if (rp->ReparseTag == IO_REPARSE_TAG_MOUNT_POINT)
                err = ntfs_translate_junction(sb, link_de, buffer, err, buflen);

out:
        kfree(to_free);
        return err;
}

/*
 * ntfs_get_link
 *
 * inode_operations::get_link
 *
 * Allocate a PAGE_SIZE buffer, let ntfs_readlink_hlp() fill it with the
 * link target and register kfree_link() on @done so the buffer is released
 * by the caller's delayed call.
 *
 * Return: the link target string, ERR_PTR(-ECHILD) when @de is NULL,
 * ERR_PTR(-ENOMEM) on allocation failure, or ERR_PTR() of the error
 * returned by ntfs_readlink_hlp().
 */
static const char *ntfs_get_link(struct dentry *de, struct inode *inode,
                                 struct delayed_call *done)
{
        char *link;
        int len;

        if (!de)
                return ERR_PTR(-ECHILD);

        link = kmalloc(PAGE_SIZE, GFP_NOFS);
        if (!link)
                return ERR_PTR(-ENOMEM);

        len = ntfs_readlink_hlp(de, inode, link, PAGE_SIZE);
        if (len >= 0) {
                /* Ownership of the buffer moves to the delayed call. */
                set_delayed_call(done, kfree_link, link);
                return link;
        }

        kfree(link);
        return ERR_PTR(len);
}

// clang-format off
/*
 * Inode operations for symbolic links; the inode creation code in this
 * file installs this table as i_op when S_ISLNK(mode) is true.
 */
const struct inode_operations ntfs_link_inode_operations = {
        .get_link        = ntfs_get_link,
        .setattr        = ntfs3_setattr,
        .listxattr        = ntfs_listxattr,
};

/*
 * Address space operations with read and write support. The inode
 * creation code in this file uses this table for symlinks and for regular
 * files that are not compressed.
 */
const struct address_space_operations ntfs_aops = {
        .read_folio        = ntfs_read_folio,
        .readahead        = ntfs_readahead,
        .writepages        = ntfs_writepages,
        .write_begin        = ntfs_write_begin,
        .write_end        = ntfs_write_end,
        .direct_IO        = ntfs_direct_IO,
        .bmap                = ntfs_bmap,
        .dirty_folio        = block_dirty_folio,
        .migrate_folio        = buffer_migrate_folio,
        .invalidate_folio = block_invalidate_folio,
};

/*
 * Address space operations selected for compressed regular files
 * (is_compressed() true at inode creation). Only the read-side callbacks
 * are provided in this table.
 */
const struct address_space_operations ntfs_aops_cmpr = {
        .read_folio        = ntfs_read_folio,
        .readahead        = ntfs_readahead,
};
// clang-format on









































































































































































   11 





















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMEKEEPING_H
#define _LINUX_TIMEKEEPING_H

#include <linux/errno.h>
#include <linux/clocksource_ids.h>
#include <linux/ktime.h>

/* Included from linux/ktime.h */

void timekeeping_init(void);
extern int timekeeping_suspended;

/* Architecture timer tick functions: */
extern void legacy_timer_tick(unsigned long ticks);

/*
 * Get and set timeofday
 */
extern int do_settimeofday64(const struct timespec64 *ts);
extern int do_sys_settimeofday64(const struct timespec64 *tv,
                                 const struct timezone *tz);

/*
 * ktime_get() family - read the current time in a multitude of ways.
 *
 * The default time reference is CLOCK_MONOTONIC, starting at
 * boot time but not counting the time spent in suspend.
 * For other references, use the functions with "real", "clocktai",
 * "boottime" and "raw" suffixes.
 *
 * To get the time in a different format, use the ones with
 * "ns", "ts64" and "seconds" suffix.
 *
 * See Documentation/core-api/timekeeping.rst for more details.
 */


/*
 * timespec64 based interfaces
 */
extern void ktime_get_raw_ts64(struct timespec64 *ts);
extern void ktime_get_ts64(struct timespec64 *ts);
extern void ktime_get_real_ts64(struct timespec64 *tv);
extern void ktime_get_coarse_ts64(struct timespec64 *ts);
extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);

void getboottime64(struct timespec64 *ts);

/*
 * time64_t base interfaces
 */
extern time64_t ktime_get_seconds(void);
extern time64_t __ktime_get_real_seconds(void);
extern time64_t ktime_get_real_seconds(void);

/*
 * ktime_t based interfaces
 */

enum tk_offsets {
        TK_OFFS_REAL,
        TK_OFFS_BOOT,
        TK_OFFS_TAI,
        TK_OFFS_MAX,
};

extern ktime_t ktime_get(void);
extern ktime_t ktime_get_with_offset(enum tk_offsets offs);
extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs);
extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs);
extern ktime_t ktime_get_raw(void);
extern u32 ktime_get_resolution_ns(void);

/**
 * ktime_get_real - get the real (wall-) time in ktime_t format
 *
 * Returns: real (wall) time in ktime_t format
 */
static inline ktime_t ktime_get_real(void)
{
        return ktime_get_with_offset(TK_OFFS_REAL);
}

static inline ktime_t ktime_get_coarse_real(void)
{
        return ktime_get_coarse_with_offset(TK_OFFS_REAL);
}

/**
 * ktime_get_boottime - Get monotonic time since boot in ktime_t format
 *
 * This is similar to CLOCK_MONTONIC/ktime_get, but also includes the
 * time spent in suspend.
 *
 * Returns: monotonic time since boot in ktime_t format
 */
static inline ktime_t ktime_get_boottime(void)
{
        return ktime_get_with_offset(TK_OFFS_BOOT);
}

static inline ktime_t ktime_get_coarse_boottime(void)
{
        return ktime_get_coarse_with_offset(TK_OFFS_BOOT);
}

/**
 * ktime_get_clocktai - Get the TAI time of day in ktime_t format
 *
 * Returns: the TAI time of day in ktime_t format
 */
static inline ktime_t ktime_get_clocktai(void)
{
        return ktime_get_with_offset(TK_OFFS_TAI);
}

static inline ktime_t ktime_get_coarse_clocktai(void)
{
        return ktime_get_coarse_with_offset(TK_OFFS_TAI);
}

static inline ktime_t ktime_get_coarse(void)
{
        struct timespec64 ts;

        ktime_get_coarse_ts64(&ts);
        return timespec64_to_ktime(ts);
}

static inline u64 ktime_get_coarse_ns(void)
{
        return ktime_to_ns(ktime_get_coarse());
}

static inline u64 ktime_get_coarse_real_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_real());
}

static inline u64 ktime_get_coarse_boottime_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_boottime());
}

static inline u64 ktime_get_coarse_clocktai_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_clocktai());
}

/**
 * ktime_mono_to_real - convert a monotonic timestamp to clock realtime
 * @mono: monotonic time to convert
 *
 * Thin wrapper around ktime_mono_to_any() using TK_OFFS_REAL.
 *
 * Returns: time converted to realtime clock
 */
static inline ktime_t ktime_mono_to_real(ktime_t mono)
{
        const enum tk_offsets offs = TK_OFFS_REAL;

        return ktime_mono_to_any(mono, offs);
}

/**
 * ktime_get_ns - Get the current time in nanoseconds
 *
 * Returns: current time converted to nanoseconds
 */
static inline u64 ktime_get_ns(void)
{
        return ktime_to_ns(ktime_get());
}

/**
 * ktime_get_real_ns - Get the current real/wall time in nanoseconds
 *
 * Returns: current real time converted to nanoseconds
 */
static inline u64 ktime_get_real_ns(void)
{
        return ktime_to_ns(ktime_get_real());
}

/**
 * ktime_get_boottime_ns - Get the monotonic time since boot in nanoseconds
 *
 * Returns: current boottime converted to nanoseconds
 */
static inline u64 ktime_get_boottime_ns(void)
{
        return ktime_to_ns(ktime_get_boottime());
}

/**
 * ktime_get_clocktai_ns - Get the current TAI time of day in nanoseconds
 *
 * Returns: current TAI time converted to nanoseconds
 */
static inline u64 ktime_get_clocktai_ns(void)
{
        return ktime_to_ns(ktime_get_clocktai());
}

/**
 * ktime_get_raw_ns - Get the raw monotonic time in nanoseconds
 *
 * Returns: current raw monotonic time converted to nanoseconds
 */
static inline u64 ktime_get_raw_ns(void)
{
        return ktime_to_ns(ktime_get_raw());
}

extern u64 ktime_get_mono_fast_ns(void);
extern u64 ktime_get_raw_fast_ns(void);
extern u64 ktime_get_boot_fast_ns(void);
extern u64 ktime_get_tai_fast_ns(void);
extern u64 ktime_get_real_fast_ns(void);

/*
 * timespec64/time64_t interfaces utilizing the ktime based ones
 * for API completeness, these could be implemented more efficiently
 * if needed.
 */
static inline void ktime_get_boottime_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_boottime());
}

static inline void ktime_get_coarse_boottime_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_coarse_boottime());
}

static inline time64_t ktime_get_boottime_seconds(void)
{
        return ktime_divns(ktime_get_coarse_boottime(), NSEC_PER_SEC);
}

static inline void ktime_get_clocktai_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_clocktai());
}

static inline void ktime_get_coarse_clocktai_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_coarse_clocktai());
}

static inline time64_t ktime_get_clocktai_seconds(void)
{
        return ktime_divns(ktime_get_coarse_clocktai(), NSEC_PER_SEC);
}

/*
 * RTC specific
 */
extern bool timekeeping_rtc_skipsuspend(void);
extern bool timekeeping_rtc_skipresume(void);

extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta);

/**
 * struct ktime_timestamps - Simultaneous mono/boot/real timestamps
 * @mono:        Monotonic timestamp
 * @boot:        Boottime timestamp
 * @real:        Realtime timestamp
 *
 * Output type of ktime_get_fast_timestamps(), which this header describes
 * as NMI safe.
 */
struct ktime_timestamps {
        u64                mono;
        u64                boot;
        u64                real;
};

/**
 * struct system_time_snapshot - simultaneous raw/real time capture with
 *                                 counter value
 * @cycles:        Clocksource counter value to produce the system times
 * @real:        Realtime system time
 * @raw:        Monotonic raw system time
 * @cs_id:        Clocksource ID
 * @clock_was_set_seq:        The sequence number of clock-was-set events
 * @cs_was_changed_seq:        The sequence number of clocksource change events
 *
 * Filled in by ktime_get_snapshot() and accepted as the @history argument
 * of get_device_system_crosststamp().
 */
struct system_time_snapshot {
        u64                        cycles;
        ktime_t                        real;
        ktime_t                        raw;
        enum clocksource_ids        cs_id;
        unsigned int                clock_was_set_seq;
        u8                        cs_was_changed_seq;
};

/**
 * struct system_device_crosststamp - system/device cross-timestamp
 *                                      (synchronized capture)
 * @device:                Device time
 * @sys_realtime:        Realtime simultaneous with device time
 * @sys_monoraw:        Monotonic raw simultaneous with device time
 *
 * Passed as the @xtstamp argument of get_device_system_crosststamp().
 */
struct system_device_crosststamp {
        ktime_t device;
        ktime_t sys_realtime;
        ktime_t sys_monoraw;
};

/**
 * struct system_counterval_t - system counter value with the ID of the
 *                                corresponding clocksource
 * @cycles:        System counter value
 * @cs_id:        Clocksource ID corresponding to system counter value. Used by
 *                timekeeping code to verify comparability of two cycle values.
 *                The default ID, CSID_GENERIC, does not identify a specific
 *                clocksource.
 *
 * Handed to the get_time_fn callback of get_device_system_crosststamp()
 * as its @system_counterval argument.
 */
struct system_counterval_t {
        u64                        cycles;
        enum clocksource_ids        cs_id;
};

/*
 * Get cross timestamp between system clock and device clock
 */
extern int get_device_system_crosststamp(
                        int (*get_time_fn)(ktime_t *device_time,
                                struct system_counterval_t *system_counterval,
                                void *ctx),
                        void *ctx,
                        struct system_time_snapshot *history,
                        struct system_device_crosststamp *xtstamp);

/*
 * Simultaneously snapshot realtime and monotonic raw clocks
 */
extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot);

/* NMI safe mono/boot/realtime timestamps */
extern void ktime_get_fast_timestamps(struct ktime_timestamps *snap);

/*
 * Persistent clock related interfaces
 */
extern int persistent_clock_is_local;

extern void read_persistent_clock64(struct timespec64 *ts);
void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock,
                                          struct timespec64 *boot_offset);
#ifdef CONFIG_GENERIC_CMOS_UPDATE
extern int update_persistent_clock64(struct timespec64 now);
#endif

#endif

























































    1 















































































































































    1 

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Integer base 2 logarithm calculation
 *
 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_LOG2_H
#define _LINUX_LOG2_H

#include <linux/types.h>
#include <linux/bitops.h>

/*
 * non-constant log of base 2 calculators
 * - the arch may override these in asm/bitops.h if they can be implemented
 *   more efficiently than using fls() and fls64()
 * - the arch is not required to handle n==0 if implementing the fallback
 */
#ifndef CONFIG_ARCH_HAS_ILOG2_U32
/*
 * Fallback 32-bit log2: one less than the value returned by fls() for @n.
 * As noted above, n == 0 is not required to be handled.
 */
static __always_inline __attribute__((const))
int __ilog2_u32(u32 n)
{
        int top_bit = fls(n);

        return top_bit - 1;
}
#endif

#ifndef CONFIG_ARCH_HAS_ILOG2_U64
/*
 * Fallback 64-bit log2: one less than the value returned by fls64() for @n.
 * As noted above, n == 0 is not required to be handled.
 */
static __always_inline __attribute__((const))
int __ilog2_u64(u64 n)
{
        int top_bit = fls64(n);

        return top_bit - 1;
}
#endif

/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check
 *
 * Zero is *not* considered a power of two. For any other value the test
 * relies on n & (n - 1) clearing the lowest set bit: the result is zero
 * only when @n has exactly one bit set.
 * Return: true if @n is a power of 2, otherwise false.
 */
static inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
        if (n == 0)
                return false;

        return (n & (n - 1)) == 0;
}

/**
 * __roundup_pow_of_two() - round up to nearest power of two
 * @n: value to round up
 *
 * Computed as 1UL shifted left by fls_long(@n - 1).
 */
static inline __attribute__((const))
unsigned long __roundup_pow_of_two(unsigned long n)
{
        unsigned long rounded = 1UL;

        rounded <<= fls_long(n - 1);

        return rounded;
}

/**
 * __rounddown_pow_of_two() - round down to nearest power of two
 * @n: value to round down
 *
 * Computed as 1UL shifted left by fls_long(@n) - 1.
 */
static inline __attribute__((const))
unsigned long __rounddown_pow_of_two(unsigned long n)
{
        unsigned long rounded = 1UL;

        rounded <<= fls_long(n) - 1;

        return rounded;
}

/**
 * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value
 * @n: parameter
 *
 * Use this where sparse expects a true constant expression, e.g. for array
 * indices.
 *
 * When @n is a compile-time constant the macro evaluates to 0 for n < 2 and
 * otherwise to the index of the highest set bit, found by testing bits 63
 * down to 2 in turn and falling back to 1. When @n is not a compile-time
 * constant the macro evaluates to -1.
 */
#define const_ilog2(n)                                \
(                                                \
        __builtin_constant_p(n) ? (                \
                (n) < 2 ? 0 :                        \
                (n) & (1ULL << 63) ? 63 :        \
                (n) & (1ULL << 62) ? 62 :        \
                (n) & (1ULL << 61) ? 61 :        \
                (n) & (1ULL << 60) ? 60 :        \
                (n) & (1ULL << 59) ? 59 :        \
                (n) & (1ULL << 58) ? 58 :        \
                (n) & (1ULL << 57) ? 57 :        \
                (n) & (1ULL << 56) ? 56 :        \
                (n) & (1ULL << 55) ? 55 :        \
                (n) & (1ULL << 54) ? 54 :        \
                (n) & (1ULL << 53) ? 53 :        \
                (n) & (1ULL << 52) ? 52 :        \
                (n) & (1ULL << 51) ? 51 :        \
                (n) & (1ULL << 50) ? 50 :        \
                (n) & (1ULL << 49) ? 49 :        \
                (n) & (1ULL << 48) ? 48 :        \
                (n) & (1ULL << 47) ? 47 :        \
                (n) & (1ULL << 46) ? 46 :        \
                (n) & (1ULL << 45) ? 45 :        \
                (n) & (1ULL << 44) ? 44 :        \
                (n) & (1ULL << 43) ? 43 :        \
                (n) & (1ULL << 42) ? 42 :        \
                (n) & (1ULL << 41) ? 41 :        \
                (n) & (1ULL << 40) ? 40 :        \
                (n) & (1ULL << 39) ? 39 :        \
                (n) & (1ULL << 38) ? 38 :        \
                (n) & (1ULL << 37) ? 37 :        \
                (n) & (1ULL << 36) ? 36 :        \
                (n) & (1ULL << 35) ? 35 :        \
                (n) & (1ULL << 34) ? 34 :        \
                (n) & (1ULL << 33) ? 33 :        \
                (n) & (1ULL << 32) ? 32 :        \
                (n) & (1ULL << 31) ? 31 :        \
                (n) & (1ULL << 30) ? 30 :        \
                (n) & (1ULL << 29) ? 29 :        \
                (n) & (1ULL << 28) ? 28 :        \
                (n) & (1ULL << 27) ? 27 :        \
                (n) & (1ULL << 26) ? 26 :        \
                (n) & (1ULL << 25) ? 25 :        \
                (n) & (1ULL << 24) ? 24 :        \
                (n) & (1ULL << 23) ? 23 :        \
                (n) & (1ULL << 22) ? 22 :        \
                (n) & (1ULL << 21) ? 21 :        \
                (n) & (1ULL << 20) ? 20 :        \
                (n) & (1ULL << 19) ? 19 :        \
                (n) & (1ULL << 18) ? 18 :        \
                (n) & (1ULL << 17) ? 17 :        \
                (n) & (1ULL << 16) ? 16 :        \
                (n) & (1ULL << 15) ? 15 :        \
                (n) & (1ULL << 14) ? 14 :        \
                (n) & (1ULL << 13) ? 13 :        \
                (n) & (1ULL << 12) ? 12 :        \
                (n) & (1ULL << 11) ? 11 :        \
                (n) & (1ULL << 10) ? 10 :        \
                (n) & (1ULL <<  9) ?  9 :        \
                (n) & (1ULL <<  8) ?  8 :        \
                (n) & (1ULL <<  7) ?  7 :        \
                (n) & (1ULL <<  6) ?  6 :        \
                (n) & (1ULL <<  5) ?  5 :        \
                (n) & (1ULL <<  4) ?  4 :        \
                (n) & (1ULL <<  3) ?  3 :        \
                (n) & (1ULL <<  2) ?  2 :        \
                1) :                                \
        -1)

/**
 * ilog2 - log base 2 of 32-bit or a 64-bit unsigned value
 * @n: parameter
 *
 * constant-capable log of base 2 calculation
 * - this can be used to initialise global variables from constant data, hence
 * the massive ternary operator construction
 *
 * selects the appropriately-sized optimised version depending on sizeof(n)
 *
 * For a compile-time constant @n the result is 0 when n < 2 and
 * 63 - __builtin_clzll(n) otherwise. For a non-constant @n the call is
 * forwarded to __ilog2_u32() when sizeof(n) <= 4 and to __ilog2_u64()
 * otherwise.
 */
#define ilog2(n) \
( \
        __builtin_constant_p(n) ?        \
        ((n) < 2 ? 0 :                        \
         63 - __builtin_clzll(n)) :        \
        (sizeof(n) <= 4) ?                \
        __ilog2_u32(n) :                \
        __ilog2_u64(n)                        \
 )

/**
 * roundup_pow_of_two - round a value up to the nearest power of two
 * @n: value to round
 *
 * - the result is undefined when n == 0
 * - constant-capable, so it can initialise global variables from constant
 *   data; a constant 1 maps to 1, any other constant to
 *   1UL << (ilog2(n - 1) + 1)
 * - non-constant arguments are handed to __roundup_pow_of_two()
 */
#define roundup_pow_of_two(n)                                   \
(                                                               \
        __builtin_constant_p(n) ?                               \
        (((n) != 1) ? (1UL << (ilog2((n) - 1) + 1)) : 1) :      \
        __roundup_pow_of_two(n)                                 \
)

/**
 * rounddown_pow_of_two - round the given value down to nearest power of two
 * @n: parameter
 *
 * round the given value down to the nearest power of two
 * - the result is undefined when n == 0
 * - this can be used to initialise global variables from constant data
 *
 * A compile-time constant @n is evaluated in place as 1UL << ilog2(n);
 * any other argument is passed to __rounddown_pow_of_two().
 */
#define rounddown_pow_of_two(n)                        \
(                                                \
        __builtin_constant_p(n) ? (                \
                (1UL << ilog2(n))) :                \
        __rounddown_pow_of_two(n)                \
 )

/*
 * Out-of-line part of order_base_2(): 0 for n <= 1, otherwise
 * ilog2(n - 1) + 1.
 */
static inline __attribute_const__
int __order_base_2(unsigned long n)
{
        if (n <= 1)
                return 0;
        return ilog2(n - 1) + 1;
}

/**
 * order_base_2 - base-2 order of the argument, rounded up
 * @n: value whose order is wanted
 *
 * Constant arguments are evaluated in place (0 and 1 give 0, anything else
 * gives ilog2(n - 1) + 1); non-constant arguments go through
 * __order_base_2().
 *
 * First few results:
 *  ob2(0) = 0
 *  ob2(1) = 0
 *  ob2(2) = 1
 *  ob2(3) = 2
 *  ob2(4) = 2
 *  ob2(5) = 3
 *  ... and so on.
 */
#define order_base_2(n)                                         \
(                                                               \
        __builtin_constant_p(n) ?                               \
        (((n) != 0 && (n) != 1) ? ilog2((n) - 1) + 1 : 0) :     \
        __order_base_2(n)                                       \
)

/*
 * Out-of-line part of bits_per(): 1 for n < 2, otherwise order_base_2(n),
 * plus one more bit when n is an exact power of two.
 */
static inline __attribute__((const))
int __bits_per(unsigned long n)
{
        int order;

        if (n < 2)
                return 1;
        order = order_base_2(n);
        return is_power_of_2(n) ? order + 1 : order;
}

/**
 * bits_per - number of bits needed to represent the argument
 * @n: value to be represented
 *
 * Constant-capable, so it may be used for compile-time initialisations such
 * as bitfield widths.  Constant 0 and 1 give 1, other constants give
 * ilog2(n) + 1; non-constant arguments go through __bits_per().
 *
 * First few results:
 * bf(0) = 1
 * bf(1) = 1
 * bf(2) = 2
 * bf(3) = 2
 * bf(4) = 3
 * ... and so on.
 */
#define bits_per(n)                                     \
(                                                       \
        __builtin_constant_p(n) ?                       \
        (((n) != 0 && (n) != 1) ? ilog2(n) + 1 : 1) :   \
        __bits_per(n)                                   \
)
#endif /* _LINUX_LOG2_H */































































































































































































































































































































































































































































































































































































































































































































































































    2 














    2 




















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/condition.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/slab.h>

/* List of "struct tomoyo_condition". */
LIST_HEAD(tomoyo_condition_list);

/**
 * tomoyo_argv - Check argv[] in "struct linux_binprm".
 *
 * @index:   Position of @arg_ptr within the program's argv[].
 * @arg_ptr: Contents of argv[@index].
 * @argc:    Number of elements in @argv.
 * @argv:    Array of "struct tomoyo_argv" conditions.
 * @checked: Per-condition flags; the flag of every condition whose index
 *           equals @index is set to 1.
 *
 * Every condition that refers to @index is compared against @arg_ptr; a
 * condition fails when the pattern match disagrees with its is_not flag.
 *
 * Returns true if no applicable condition failed, false otherwise.
 */
static bool tomoyo_argv(const unsigned int index, const char *arg_ptr,
                        const int argc, const struct tomoyo_argv *argv,
                        u8 *checked)
{
        struct tomoyo_path_info arg;
        int pos;

        arg.name = arg_ptr;
        for (pos = 0; pos < argc; pos++) {
                const struct tomoyo_argv *cond = &argv[pos];
                bool matched;

                if (cond->index != index)
                        continue;
                checked[pos] = 1;
                tomoyo_fill_path_info(&arg);
                matched = tomoyo_path_matches_pattern(&arg, cond->value);
                /* "!=" conditions fail on a match, "=" ones on a mismatch. */
                if (cond->is_not ? matched : !matched)
                        return false;
        }
        return true;
}

/**
 * tomoyo_envp - Check envp[] in "struct linux_binprm".
 *
 * @env_name:  Name of the environment variable.
 * @env_value: Value of the environment variable.
 * @envc:      Number of elements in @envp.
 * @envp:      Array of "struct tomoyo_envp" conditions.
 * @checked:   Per-condition flags; the flag of every condition whose name
 *             pattern matches @env_name is set to 1.
 *
 * A condition without a value is satisfied by a defined variable only when
 * its is_not flag is set.  A condition with a value is satisfied when the
 * value pattern match agrees with the is_not flag.
 *
 * Returns true if no applicable condition failed, false otherwise.
 */
static bool tomoyo_envp(const char *env_name, const char *env_value,
                        const int envc, const struct tomoyo_envp *envp,
                        u8 *checked)
{
        struct tomoyo_path_info name;
        struct tomoyo_path_info value;
        int pos;

        name.name = env_name;
        tomoyo_fill_path_info(&name);
        value.name = env_value;
        tomoyo_fill_path_info(&value);
        for (pos = 0; pos < envc; pos++) {
                const struct tomoyo_envp *cond = &envp[pos];
                bool ok;

                if (!tomoyo_path_matches_pattern(&name, cond->name))
                        continue;
                checked[pos] = 1;
                if (!cond->value) {
                        /* No value given: the variable is defined here. */
                        ok = cond->is_not;
                } else {
                        ok = tomoyo_path_matches_pattern(&value,
                                                         cond->value);
                        if (cond->is_not)
                                ok = !ok;
                }
                if (!ok)
                        return false;
        }
        return true;
}

/**
 * tomoyo_scan_bprm - Scan "struct linux_binprm".
 *
 * @ee:   Pointer to "struct tomoyo_execve".
 * @argc: Length of @argv.
 * @argv: Pointer to "struct tomoyo_argv".
 * @envc: Length of @envp.
 * @envp: Pointer to "struct tomoyo_envp".
 *
 * Walks the NUL-terminated strings of @ee->bprm page by page starting at
 * bprm->p, escapes each one into @ee->tmp and checks it against the @argv
 * conditions while arguments remain, and against the @envp conditions
 * afterwards.  Conditions that never saw a matching string are evaluated
 * once the scan is over.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_scan_bprm(struct tomoyo_execve *ee,
                             const u16 argc, const struct tomoyo_argv *argv,
                             const u16 envc, const struct tomoyo_envp *envp)
{
        struct linux_binprm *bprm = ee->bprm;
        struct tomoyo_page_dump *dump = &ee->dump;
        char *arg_ptr = ee->tmp;
        int arg_len = 0;
        unsigned long pos = bprm->p;
        int offset = pos % PAGE_SIZE;
        int argv_count = bprm->argc;
        int envp_count = bprm->envc;
        bool result = true;
        u8 local_checked[32];
        u8 *checked;

        /*
         * One "checked" flag per condition: argv flags first, envp flags
         * after them.  Use the on-stack array when it is large enough,
         * otherwise a zeroed heap buffer.
         */
        if (argc + envc <= sizeof(local_checked)) {
                checked = local_checked;
                memset(local_checked, 0, sizeof(local_checked));
        } else {
                checked = kzalloc(argc + envc, GFP_NOFS);
                if (!checked)
                        return false;
        }
        while (argv_count || envp_count) {
                if (!tomoyo_dump_page(bprm, pos, dump)) {
                        result = false;
                        goto out;
                }
                /* Advance to the start of the next page. */
                pos += PAGE_SIZE - offset;
                while (offset < PAGE_SIZE) {
                        /* Read. */
                        const char *kaddr = dump->data;
                        const unsigned char c = kaddr[offset++];

                        /*
                         * Append the escaped byte: '\\' is doubled, bytes
                         * above ' ' and below 127 are copied, everything
                         * else becomes a backslash plus three octal digits.
                         * Once arg_len reaches the limit, further bytes of
                         * this string are dropped.
                         */
                        if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) {
                                if (c == '\\') {
                                        arg_ptr[arg_len++] = '\\';
                                        arg_ptr[arg_len++] = '\\';
                                } else if (c > ' ' && c < 127) {
                                        arg_ptr[arg_len++] = c;
                                } else {
                                        arg_ptr[arg_len++] = '\\';
                                        arg_ptr[arg_len++] = (c >> 6) + '0';
                                        arg_ptr[arg_len++] =
                                                ((c >> 3) & 7) + '0';
                                        arg_ptr[arg_len++] = (c & 7) + '0';
                                }
                        } else {
                                arg_ptr[arg_len] = '\0';
                        }
                        /* Keep reading until the terminating NUL is seen. */
                        if (c)
                                continue;
                        /* Check. */
                        if (argv_count) {
                                /* bprm->argc - argv_count is the argv[] index. */
                                if (!tomoyo_argv(bprm->argc - argv_count,
                                                 arg_ptr, argc, argv,
                                                 checked)) {
                                        result = false;
                                        break;
                                }
                                argv_count--;
                        } else if (envp_count) {
                                /* Split "name=value" at the first '='. */
                                char *cp = strchr(arg_ptr, '=');

                                if (cp) {
                                        *cp = '\0';
                                        if (!tomoyo_envp(arg_ptr, cp + 1,
                                                         envc, envp,
                                                         checked + argc)) {
                                                result = false;
                                                break;
                                        }
                                }
                                envp_count--;
                        } else {
                                break;
                        }
                        /* Start collecting the next string. */
                        arg_len = 0;
                }
                /* Later pages are read from their beginning. */
                offset = 0;
                if (!result)
                        break;
        }
out:
        if (result) {
                int i;

                /* Check not-yet-checked entries. */
                for (i = 0; i < argc; i++) {
                        if (checked[i])
                                continue;
                        /*
                         * Return true only if all unchecked indexes in
                         * bprm->argv[] are not matched.
                         */
                        if (argv[i].is_not)
                                continue;
                        result = false;
                        break;
                }
                for (i = 0; i < envc; envp++, i++) {
                        if (checked[argc + i])
                                continue;
                        /*
                         * Return true only if all unchecked environ variables
                         * in bprm->envp[] are either undefined or not matched.
                         */
                        if ((!envp->value && !envp->is_not) ||
                            (envp->value && envp->is_not))
                                continue;
                        result = false;
                        break;
                }
        }
        /* Only the heap buffer needs to be released. */
        if (checked != local_checked)
                kfree(checked);
        return result;
}

/**
 * tomoyo_scan_exec_realpath - Check "exec.realpath" parameter of "struct tomoyo_condition".
 *
 * @file:  Pointer to "struct file". May be NULL.
 * @ptr:   Pointer to "struct tomoyo_name_union" to compare against.
 * @match: True for "exec.realpath=", false for "exec.realpath!=".
 *
 * Resolves the pathname of @file, compares it with @ptr and frees the
 * pathname again.
 *
 * Returns true if the comparison result equals @match; false otherwise,
 * including when @file is NULL or its pathname cannot be obtained.
 */
static bool tomoyo_scan_exec_realpath(struct file *file,
                                      const struct tomoyo_name_union *ptr,
                                      const bool match)
{
        struct tomoyo_path_info exe;
        bool matched;

        if (file == NULL)
                return false;
        exe.name = tomoyo_realpath_from_path(&file->f_path);
        if (exe.name == NULL)
                return false;
        tomoyo_fill_path_info(&exe);
        matched = tomoyo_compare_name_union(&exe, ptr);
        kfree(exe.name);
        return match ? matched : !matched;
}

/**
 * tomoyo_get_dqword - tomoyo_get_name() for a quoted string.
 *
 * @start: String to save. Must begin and end with '"'; the closing quote is
 *         overwritten with '\0' in place.
 *
 * The text between the quotes may be empty; otherwise it has to pass
 * tomoyo_correct_word().
 *
 * Returns pointer to "struct tomoyo_path_info" on success, NULL otherwise.
 */
static const struct tomoyo_path_info *tomoyo_get_dqword(char *start)
{
        const size_t len = strlen(start);

        /*
         * Both quotes are required, so anything shorter than two characters
         * is rejected up front.  Checking the length first also avoids
         * computing an address in front of @start for an empty string.
         */
        if (len < 2 || start[0] != '"' || start[len - 1] != '"')
                return NULL;
        start[len - 1] = '\0';
        start++;
        if (*start && !tomoyo_correct_word(start))
                return NULL;
        return tomoyo_get_name(start);
}

/**
 * tomoyo_parse_name_union_quoted - Parse a quoted word.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @ptr:   Pointer to "struct tomoyo_name_union" to fill in.
 *
 * A word starting with '@' is handed to tomoyo_parse_name_union(); any
 * other word must be a double-quoted string and is stored in @ptr->filename.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_parse_name_union_quoted(struct tomoyo_acl_param *param,
                                           struct tomoyo_name_union *ptr)
{
        char *word = param->data;

        if (word[0] != '@') {
                ptr->filename = tomoyo_get_dqword(word);
                return ptr->filename ? true : false;
        }
        return tomoyo_parse_name_union(param, ptr);
}

/**
 * tomoyo_parse_argv - Parse an argv[] condition part.
 *
 * @left:  Lefthand value, positioned just after "exec.argv[".
 * @right: Righthand value (a double-quoted string).
 * @argv:  Pointer to "struct tomoyo_argv" to fill in.
 *
 * @left must be a decimal index followed by exactly "]".
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_parse_argv(char *left, char *right,
                              struct tomoyo_argv *argv)
{
        if (tomoyo_parse_ulong(&argv->index, &left) !=
            TOMOYO_VALUE_TYPE_DECIMAL)
                return false;
        /* Nothing but the closing bracket may follow the index. */
        if (left[0] != ']' || left[1] != '\0')
                return false;
        argv->value = tomoyo_get_dqword(right);
        if (!argv->value)
                return false;
        return true;
}

/**
 * tomoyo_parse_envp - Parse an envp[] condition part.
 *
 * @left:  Lefthand value, positioned just after the opening quote of
 *         exec.envp["; it must end with the two characters '"' and ']'.
 *         The closing quote is overwritten with '\0' in place.
 * @right: Righthand value: either the literal NULL or a double-quoted string.
 * @envp:  Pointer to "struct tomoyo_envp" to fill in.
 *
 * On success @envp->name holds the variable name and @envp->value holds the
 * value, or NULL when @right is the literal NULL.  If the value cannot be
 * obtained, the reference on the name is dropped again.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_parse_envp(char *left, char *right,
                              struct tomoyo_envp *envp)
{
        const struct tomoyo_path_info *name;
        const struct tomoyo_path_info *value = NULL;
        const size_t len = strlen(left);

        /*
         * The closing '"' and ']' must both lie inside @left.  Without the
         * length check a @left of "]" (or an empty one) would make the code
         * inspect and overwrite the byte in front of @left, which is the
         * caller's opening quote, and accept a name that was never closed.
         */
        if (len < 2 || left[len - 1] != ']' || left[len - 2] != '"')
                return false;
        left[len - 2] = '\0';
        if (!tomoyo_correct_word(left))
                return false;
        name = tomoyo_get_name(left);
        if (!name)
                return false;
        if (strcmp(right, "NULL")) {
                value = tomoyo_get_dqword(right);
                if (!value) {
                        tomoyo_put_name(name);
                        return false;
                }
        }
        envp->name = name;
        envp->value = value;
        return true;
}

/**
 * tomoyo_same_condition - Check for duplicated "struct tomoyo_condition" entry.
 *
 * @a: Pointer to "struct tomoyo_condition".
 * @b: Pointer to "struct tomoyo_condition".
 *
 * Two entries are the same when all header counters, grant_log and transit
 * agree and the data following the headers is bytewise identical.
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_condition(const struct tomoyo_condition *a,
                                         const struct tomoyo_condition *b)
{
        if (a->size != b->size || a->condc != b->condc)
                return false;
        if (a->numbers_count != b->numbers_count ||
            a->names_count != b->names_count)
                return false;
        if (a->argc != b->argc || a->envc != b->envc)
                return false;
        if (a->grant_log != b->grant_log || a->transit != b->transit)
                return false;
        /* Headers agree; compare the trailing variable-length part. */
        return memcmp(a + 1, b + 1, a->size - sizeof(*a)) == 0;
}

/**
 * tomoyo_condition_type - Get condition type.
 *
 * @word: Keyword string.
 *
 * Looks @word up in tomoyo_condition_keyword[] by exact string comparison.
 *
 * Returns the index of the matching keyword (one of the values in
 * "enum tomoyo_conditions_index") on success,
 * TOMOYO_MAX_CONDITION_KEYWORD otherwise.
 */
static u8 tomoyo_condition_type(const char *word)
{
        u8 type = 0;

        while (type < TOMOYO_MAX_CONDITION_KEYWORD &&
               strcmp(word, tomoyo_condition_keyword[type]) != 0)
                type++;
        return type;
}

/*
 * Debug tracing for the condition parser.  Define DEBUG_CONDITION to turn
 * dprintk() into printk(); otherwise dprintk() expands to an empty
 * statement and its arguments are not evaluated.
 */
/* #define DEBUG_CONDITION */

#ifndef DEBUG_CONDITION
#define dprintk(...) do { } while (0)
#else
#define dprintk printk
#endif

/**
 * tomoyo_commit_condition - Commit "struct tomoyo_condition".
 *
 * @entry: Pointer to "struct tomoyo_condition".
 *
 * Duplicated entries are merged: if an identical entry that is not being
 * garbage collected is already on tomoyo_condition_list, its user count is
 * raised and it is returned in place of @entry.  Otherwise @entry is added
 * to the list with a user count of 1.
 *
 * Whenever @entry itself is not returned it is torn down with
 * tomoyo_del_condition() and freed.
 *
 * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise.
 * NULL is returned if taking tomoyo_policy_lock was interrupted, or if
 * @entry is not duplicated but memory quota for policy has exceeded.
 */
static struct tomoyo_condition *tomoyo_commit_condition
(struct tomoyo_condition *entry)
{
        struct tomoyo_condition *shared = NULL;
        struct tomoyo_condition *ptr;
        bool discard = true;

        if (mutex_lock_interruptible(&tomoyo_policy_lock)) {
                dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__);
                goto out;
        }
        list_for_each_entry(ptr, &tomoyo_condition_list, head.list) {
                if (tomoyo_same_condition(ptr, entry) &&
                    atomic_read(&ptr->head.users) != TOMOYO_GC_IN_PROGRESS) {
                        /* Same entry found. Share this entry. */
                        atomic_inc(&ptr->head.users);
                        shared = ptr;
                        break;
                }
        }
        if (!shared && tomoyo_memory_ok(entry)) {
                /* First of its kind: publish @entry itself. */
                atomic_set(&entry->head.users, 1);
                list_add(&entry->head.list, &tomoyo_condition_list);
                discard = false;
        }
        mutex_unlock(&tomoyo_policy_lock);
out:
        if (!discard)
                return entry;
        tomoyo_del_condition(&entry->head.list);
        kfree(entry);
        return shared;
}

/**
 * tomoyo_get_transit_preference - Parse domain transition preference for execve().
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @e:     Pointer to "struct tomoyo_condition"; @e->transit is set when a
 *         preference is present.
 *
 * The leading word is taken as a preference when it starts with '<' (a
 * domainname), when it passes tomoyo_correct_path(), or when it is one of
 * "keep", "initialize", "reset", "child" or "parent".  Otherwise the string
 * is left as it is and returned unchanged.
 *
 * Returns the condition string part.  If a preference was recognised but
 * could not be stored, a string that tomoyo_get_condition() rejects is
 * returned instead.
 */
static char *tomoyo_get_transit_preference(struct tomoyo_acl_param *param,
                                           struct tomoyo_condition *e)
{
        char * const head = param->data;

        if (*head == '<') {
                e->transit = tomoyo_get_domainname(param);
        } else {
                char *space = strchr(head, ' ');
                bool is_preference;

                /* Look at the first word only, then put the ' ' back. */
                if (space)
                        *space = '\0';
                is_preference = tomoyo_correct_path(head) ||
                        strcmp(head, "keep") == 0 ||
                        strcmp(head, "initialize") == 0 ||
                        strcmp(head, "reset") == 0 ||
                        strcmp(head, "child") == 0 ||
                        strcmp(head, "parent") == 0;
                if (space)
                        *space = ' ';
                if (!is_preference)
                        return head;
                e->transit = tomoyo_get_name(tomoyo_read_token(param));
        }
        /*
         * On failure, return a bad read-only condition string that will
         * let tomoyo_get_condition() return NULL.
         */
        if (!e->transit)
                return "/";
        return param->data;
}

/**
 * tomoyo_get_condition - Parse condition part.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * The condition string is parsed twice.  The first pass (a dry run, while
 * @entry and the element cursors are still NULL) only counts how many
 * elements of each kind are needed.  The entry is then allocated with that
 * size, the separators overwritten during the dry run are restored, and the
 * second pass fills the entry while counting the same counters back down to
 * zero.  The filled entry is finally handed to tomoyo_commit_condition().
 *
 * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise.
 */
struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param)
{
        struct tomoyo_condition *entry = NULL;
        struct tomoyo_condition_element *condp = NULL;
        struct tomoyo_number_union *numbers_p = NULL;
        struct tomoyo_name_union *names_p = NULL;
        struct tomoyo_argv *argv = NULL;
        struct tomoyo_envp *envp = NULL;
        /* Header being built; its counters drive both passes. */
        struct tomoyo_condition e = { };
        char * const start_of_string =
                tomoyo_get_transit_preference(param, &e);
        char * const end_of_string = start_of_string + strlen(start_of_string);
        char *pos;

rerun:
        pos = start_of_string;
        while (1) {
                /* Both sides start out as (u8) -1 until a type is assigned. */
                u8 left = -1;
                u8 right = -1;
                char *left_word = pos;
                char *cp;
                char *right_word;
                bool is_not;

                if (!*left_word)
                        break;
                /*
                 * Since left-hand condition does not allow use of "path_group"
                 * or "number_group" and environment variable's names do not
                 * accept '=', it is guaranteed that the original line consists
                 * of one or more repetition of $left$operator$right blocks
                 * where "$left is free from '=' and ' '" and "$operator is
                 * either '=' or '!='" and "$right is free from ' '".
                 * Therefore, we can reconstruct the original line at the end
                 * of dry run even if we overwrite $operator with '\0'.
                 */
                cp = strchr(pos, ' ');
                if (cp) {
                        *cp = '\0'; /* Will restore later. */
                        pos = cp + 1;
                } else {
                        pos = "";
                }
                right_word = strchr(left_word, '=');
                if (!right_word || right_word == left_word)
                        goto out;
                is_not = *(right_word - 1) == '!';
                /*
                 * Terminate the left word at the operator.  For "!=" the
                 * '!' is overwritten and the '=' is skipped; a right word
                 * that itself starts with '=' is rejected.
                 */
                if (is_not)
                        *(right_word++ - 1) = '\0'; /* Will restore later. */
                else if (*(right_word + 1) != '=')
                        *right_word++ = '\0'; /* Will restore later. */
                else
                        goto out;
                dprintk(KERN_WARNING "%u: <%s>%s=<%s>\n", __LINE__, left_word,
                        is_not ? "!" : "", right_word);
                /*
                 * "grant_log" is stored in the entry header rather than as
                 * a condition element, so it is ignored in the dry run and
                 * applied (at most once) in the second pass.
                 */
                if (!strcmp(left_word, "grant_log")) {
                        if (entry) {
                                if (is_not ||
                                    entry->grant_log != TOMOYO_GRANTLOG_AUTO)
                                        goto out;
                                else if (!strcmp(right_word, "yes"))
                                        entry->grant_log = TOMOYO_GRANTLOG_YES;
                                else if (!strcmp(right_word, "no"))
                                        entry->grant_log = TOMOYO_GRANTLOG_NO;
                                else
                                        goto out;
                        }
                        continue;
                }
                /* Dry run counts up; the second pass counts down and fills. */
                if (!strncmp(left_word, "exec.argv[", 10)) {
                        if (!argv) {
                                e.argc++;
                                e.condc++;
                        } else {
                                e.argc--;
                                e.condc--;
                                left = TOMOYO_ARGV_ENTRY;
                                argv->is_not = is_not;
                                if (!tomoyo_parse_argv(left_word + 10,
                                                       right_word, argv++))
                                        goto out;
                        }
                        goto store_value;
                }
                if (!strncmp(left_word, "exec.envp[\"", 11)) {
                        if (!envp) {
                                e.envc++;
                                e.condc++;
                        } else {
                                e.envc--;
                                e.condc--;
                                left = TOMOYO_ENVP_ENTRY;
                                envp->is_not = is_not;
                                if (!tomoyo_parse_envp(left_word + 11,
                                                       right_word, envp++))
                                        goto out;
                        }
                        goto store_value;
                }
                left = tomoyo_condition_type(left_word);
                dprintk(KERN_WARNING "%u: <%s> left=%u\n", __LINE__, left_word,
                        left);
                /* Not a keyword: the left side must be a number union. */
                if (left == TOMOYO_MAX_CONDITION_KEYWORD) {
                        if (!numbers_p) {
                                e.numbers_count++;
                        } else {
                                e.numbers_count--;
                                left = TOMOYO_NUMBER_UNION;
                                param->data = left_word;
                                /* A '@' group is not accepted on the left. */
                                if (*left_word == '@' ||
                                    !tomoyo_parse_number_union(param,
                                                               numbers_p++))
                                        goto out;
                        }
                }
                if (!condp)
                        e.condc++;
                else
                        e.condc--;
                /* These two keywords take a name union on the right side. */
                if (left == TOMOYO_EXEC_REALPATH ||
                    left == TOMOYO_SYMLINK_TARGET) {
                        if (!names_p) {
                                e.names_count++;
                        } else {
                                e.names_count--;
                                right = TOMOYO_NAME_UNION;
                                param->data = right_word;
                                if (!tomoyo_parse_name_union_quoted(param,
                                                                    names_p++))
                                        goto out;
                        }
                        goto store_value;
                }
                right = tomoyo_condition_type(right_word);
                /* Not a keyword: the right side must be a number union. */
                if (right == TOMOYO_MAX_CONDITION_KEYWORD) {
                        if (!numbers_p) {
                                e.numbers_count++;
                        } else {
                                e.numbers_count--;
                                right = TOMOYO_NUMBER_UNION;
                                param->data = right_word;
                                if (!tomoyo_parse_number_union(param,
                                                               numbers_p++))
                                        goto out;
                        }
                }
store_value:
                /* Nothing to store during the dry run. */
                if (!condp) {
                        dprintk(KERN_WARNING "%u: dry_run left=%u right=%u match=%u\n",
                                __LINE__, left, right, !is_not);
                        continue;
                }
                condp->left = left;
                condp->right = right;
                condp->equals = !is_not;
                dprintk(KERN_WARNING "%u: left=%u right=%u match=%u\n",
                        __LINE__, condp->left, condp->right,
                        condp->equals);
                condp++;
        }
        dprintk(KERN_INFO "%u: cond=%u numbers=%u names=%u ac=%u ec=%u\n",
                __LINE__, e.condc, e.numbers_count, e.names_count, e.argc,
                e.envc);
        if (entry) {
                /* Second pass done: every counter must be back at zero. */
                BUG_ON(e.names_count | e.numbers_count | e.argc | e.envc |
                       e.condc);
                return tomoyo_commit_condition(entry);
        }
        /* End of dry run: size the entry from the counters and allocate. */
        e.size = sizeof(*entry)
                + e.condc * sizeof(struct tomoyo_condition_element)
                + e.numbers_count * sizeof(struct tomoyo_number_union)
                + e.names_count * sizeof(struct tomoyo_name_union)
                + e.argc * sizeof(struct tomoyo_argv)
                + e.envc * sizeof(struct tomoyo_envp);
        entry = kzalloc(e.size, GFP_NOFS);
        if (!entry)
                goto out2;
        *entry = e;
        /* The transit name now belongs to @entry; do not put it at out2. */
        e.transit = NULL;
        /* The element arrays follow the header back to back, in this order. */
        condp = (struct tomoyo_condition_element *) (entry + 1);
        numbers_p = (struct tomoyo_number_union *) (condp + e.condc);
        names_p = (struct tomoyo_name_union *) (numbers_p + e.numbers_count);
        argv = (struct tomoyo_argv *) (names_p + e.names_count);
        envp = (struct tomoyo_envp *) (argv + e.argc);
        {
                /*
                 * Undo the '\0' bytes written by the dry run.  They
                 * alternate: first the operator of a block, then the space
                 * that follows the block.
                 */
                bool flag = false;

                for (pos = start_of_string; pos < end_of_string; pos++) {
                        if (*pos)
                                continue;
                        if (flag) /* Restore " ". */
                                *pos = ' ';
                        else if (*(pos + 1) == '=') /* Restore "!=". */
                                *pos = '!';
                        else /* Restore "=". */
                                *pos = '=';
                        flag = !flag;
                }
        }
        goto rerun;
out:
        dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__);
        if (entry) {
                tomoyo_del_condition(&entry->head.list);
                kfree(entry);
        }
out2:
        tomoyo_put_name(e.transit);
        return NULL;
}

/**
 * tomoyo_get_attributes - Revalidate "struct inode".
 *
 * @obj: Pointer to "struct tomoyo_obj_info".
 *
 * Fills @obj->stat[] and marks @obj->stat_valid[] for each slot below
 * TOMOYO_MAX_PATH_STAT whose dentry has a backing inode.  TOMOYO_PATH1 and
 * TOMOYO_PATH2 use the dentries stored in @obj; every other slot uses the
 * parent of the dentry selected by the preceding slot and is skipped when
 * that dentry is absent.
 *
 * Returns nothing.
 */
void tomoyo_get_attributes(struct tomoyo_obj_info *obj)
{
        struct dentry *dentry = NULL;
        u8 i;

        for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) {
                struct tomoyo_mini_stat *stat;
                struct inode *inode;

                if (i == TOMOYO_PATH1)
                        dentry = obj->path1.dentry;
                else if (i == TOMOYO_PATH2)
                        dentry = obj->path2.dentry;
                if (!dentry)
                        continue;
                /* Parent slots take their own reference on the parent. */
                if (i != TOMOYO_PATH1 && i != TOMOYO_PATH2)
                        dentry = dget_parent(dentry);
                inode = d_backing_inode(dentry);
                if (inode) {
                        stat = &obj->stat[i];
                        stat->uid  = inode->i_uid;
                        stat->gid  = inode->i_gid;
                        stat->ino  = inode->i_ino;
                        stat->mode = inode->i_mode;
                        stat->dev  = inode->i_sb->s_dev;
                        stat->rdev = inode->i_rdev;
                        obj->stat_valid[i] = true;
                }
                /* TOMOYO_PATH1_PARENT or TOMOYO_PATH2_PARENT: drop the ref. */
                if (i & 1)
                        dput(dentry);
        }
}

/**
 * tomoyo_condition - Check condition part.
 *
 * @r:    Pointer to "struct tomoyo_request_info".
 * @cond: Pointer to "struct tomoyo_condition". Maybe NULL.
 *
 * Evaluates the @cond->condc condition elements stored directly behind
 * @cond, one after another. The first element that does not hold makes the
 * whole check fail. Elements whose left side is TOMOYO_ARGV_ENTRY or
 * TOMOYO_ENVP_ENTRY are skipped inside the loop and are handed to
 * tomoyo_scan_bprm() once all other elements have passed.
 *
 * Returns true on success, false otherwise. A NULL @cond always succeeds.
 *
 * Caller holds tomoyo_read_lock().
 */
bool tomoyo_condition(struct tomoyo_request_info *r,
                      const struct tomoyo_condition *cond)
{
        u32 i;
        /* [0] describes the left operand, [1] the right operand. */
        unsigned long min_v[2] = { 0, 0 };
        unsigned long max_v[2] = { 0, 0 };
        const struct tomoyo_condition_element *condp;
        const struct tomoyo_number_union *numbers_p;
        const struct tomoyo_name_union *names_p;
        const struct tomoyo_argv *argv;
        const struct tomoyo_envp *envp;
        struct tomoyo_obj_info *obj;
        u16 condc;
        u16 argc;
        u16 envc;
        struct linux_binprm *bprm = NULL;

        if (!cond)
                return true;
        condc = cond->condc;
        argc = cond->argc;
        envc = cond->envc;
        obj = r->obj;
        if (r->ee)
                bprm = r->ee->bprm;
        /* argv[]/envp[] entries need a bprm to look at; without one, fail. */
        if (!bprm && (argc || envc))
                return false;
        /*
         * The variable-length arrays follow the header back to back in this
         * order: condition elements, number unions, name unions, argv
         * entries, envp entries.
         */
        condp = (struct tomoyo_condition_element *) (cond + 1);
        numbers_p = (const struct tomoyo_number_union *) (condp + condc);
        names_p = (const struct tomoyo_name_union *)
                (numbers_p + cond->numbers_count);
        argv = (const struct tomoyo_argv *) (names_p + cond->names_count);
        envp = (const struct tomoyo_envp *) (argv + argc);
        for (i = 0; i < condc; i++) {
                const bool match = condp->equals;
                const u8 left = condp->left;
                const u8 right = condp->right;
                bool is_bitop[2] = { false, false };
                u8 j;

                /* Advance now so that every "continue" lands on the next element. */
                condp++;
                /* Check argv[] and envp[] later. */
                if (left == TOMOYO_ARGV_ENTRY || left == TOMOYO_ENVP_ENTRY)
                        continue;
                /* Check string expressions. */
                if (right == TOMOYO_NAME_UNION) {
                        /* One name union is consumed whatever "left" is. */
                        const struct tomoyo_name_union *ptr = names_p++;
                        struct tomoyo_path_info *symlink;
                        struct tomoyo_execve *ee;
                        struct file *file;

                        /*
                         * There is no default case: any other "left" falls
                         * out of the switch and counts as satisfied.
                         */
                        switch (left) {
                        case TOMOYO_SYMLINK_TARGET:
                                symlink = obj ? obj->symlink_target : NULL;
                                /*
                                 * "!" binds tighter than "==": this fails
                                 * when there is no symlink target or when
                                 * the comparison result differs from match.
                                 */
                                if (!symlink ||
                                    !tomoyo_compare_name_union(symlink, ptr)
                                    == match)
                                        goto out;
                                break;
                        case TOMOYO_EXEC_REALPATH:
                                ee = r->ee;
                                file = ee ? ee->bprm->file : NULL;
                                if (!tomoyo_scan_exec_realpath(file, ptr,
                                                               match))
                                        goto out;
                                break;
                        }
                        continue;
                }
                /* Check numeric or bit-op expressions. */
                /* j == 0 resolves the left operand, j == 1 the right one. */
                for (j = 0; j < 2; j++) {
                        const u8 index = j ? right : left;
                        unsigned long value = 0;

                        switch (index) {
                        case TOMOYO_TASK_UID:
                                value = from_kuid(&init_user_ns, current_uid());
                                break;
                        case TOMOYO_TASK_EUID:
                                value = from_kuid(&init_user_ns, current_euid());
                                break;
                        case TOMOYO_TASK_SUID:
                                value = from_kuid(&init_user_ns, current_suid());
                                break;
                        case TOMOYO_TASK_FSUID:
                                value = from_kuid(&init_user_ns, current_fsuid());
                                break;
                        case TOMOYO_TASK_GID:
                                value = from_kgid(&init_user_ns, current_gid());
                                break;
                        case TOMOYO_TASK_EGID:
                                value = from_kgid(&init_user_ns, current_egid());
                                break;
                        case TOMOYO_TASK_SGID:
                                value = from_kgid(&init_user_ns, current_sgid());
                                break;
                        case TOMOYO_TASK_FSGID:
                                value = from_kgid(&init_user_ns, current_fsgid());
                                break;
                        case TOMOYO_TASK_PID:
                                value = tomoyo_sys_getpid();
                                break;
                        case TOMOYO_TASK_PPID:
                                value = tomoyo_sys_getppid();
                                break;
                        /* File type keywords map to their S_IF* constant. */
                        case TOMOYO_TYPE_IS_SOCKET:
                                value = S_IFSOCK;
                                break;
                        case TOMOYO_TYPE_IS_SYMLINK:
                                value = S_IFLNK;
                                break;
                        case TOMOYO_TYPE_IS_FILE:
                                value = S_IFREG;
                                break;
                        case TOMOYO_TYPE_IS_BLOCK_DEV:
                                value = S_IFBLK;
                                break;
                        case TOMOYO_TYPE_IS_DIRECTORY:
                                value = S_IFDIR;
                                break;
                        case TOMOYO_TYPE_IS_CHAR_DEV:
                                value = S_IFCHR;
                                break;
                        case TOMOYO_TYPE_IS_FIFO:
                                value = S_IFIFO;
                                break;
                        /* Mode keywords map to a single mode bit. */
                        case TOMOYO_MODE_SETUID:
                                value = S_ISUID;
                                break;
                        case TOMOYO_MODE_SETGID:
                                value = S_ISGID;
                                break;
                        case TOMOYO_MODE_STICKY:
                                value = S_ISVTX;
                                break;
                        case TOMOYO_MODE_OWNER_READ:
                                value = 0400;
                                break;
                        case TOMOYO_MODE_OWNER_WRITE:
                                value = 0200;
                                break;
                        case TOMOYO_MODE_OWNER_EXECUTE:
                                value = 0100;
                                break;
                        case TOMOYO_MODE_GROUP_READ:
                                value = 0040;
                                break;
                        case TOMOYO_MODE_GROUP_WRITE:
                                value = 0020;
                                break;
                        case TOMOYO_MODE_GROUP_EXECUTE:
                                value = 0010;
                                break;
                        case TOMOYO_MODE_OTHERS_READ:
                                value = 0004;
                                break;
                        case TOMOYO_MODE_OTHERS_WRITE:
                                value = 0002;
                                break;
                        case TOMOYO_MODE_OTHERS_EXECUTE:
                                value = 0001;
                                break;
                        case TOMOYO_EXEC_ARGC:
                                if (!bprm)
                                        goto out;
                                value = bprm->argc;
                                break;
                        case TOMOYO_EXEC_ENVC:
                                if (!bprm)
                                        goto out;
                                value = bprm->envc;
                                break;
                        case TOMOYO_NUMBER_UNION:
                                /* Fetch values later. */
                                break;
                        default:
                                /*
                                 * Everything else is looked up in obj->stat[],
                                 * which is filled at most once per obj.
                                 */
                                if (!obj)
                                        goto out;
                                if (!obj->validate_done) {
                                        tomoyo_get_attributes(obj);
                                        obj->validate_done = true;
                                }
                                {
                                        u8 stat_index;
                                        struct tomoyo_mini_stat *stat;

                                        /* First pick the stat[] slot ... */
                                        switch (index) {
                                        case TOMOYO_PATH1_UID:
                                        case TOMOYO_PATH1_GID:
                                        case TOMOYO_PATH1_INO:
                                        case TOMOYO_PATH1_MAJOR:
                                        case TOMOYO_PATH1_MINOR:
                                        case TOMOYO_PATH1_TYPE:
                                        case TOMOYO_PATH1_DEV_MAJOR:
                                        case TOMOYO_PATH1_DEV_MINOR:
                                        case TOMOYO_PATH1_PERM:
                                                stat_index = TOMOYO_PATH1;
                                                break;
                                        case TOMOYO_PATH2_UID:
                                        case TOMOYO_PATH2_GID:
                                        case TOMOYO_PATH2_INO:
                                        case TOMOYO_PATH2_MAJOR:
                                        case TOMOYO_PATH2_MINOR:
                                        case TOMOYO_PATH2_TYPE:
                                        case TOMOYO_PATH2_DEV_MAJOR:
                                        case TOMOYO_PATH2_DEV_MINOR:
                                        case TOMOYO_PATH2_PERM:
                                                stat_index = TOMOYO_PATH2;
                                                break;
                                        case TOMOYO_PATH1_PARENT_UID:
                                        case TOMOYO_PATH1_PARENT_GID:
                                        case TOMOYO_PATH1_PARENT_INO:
                                        case TOMOYO_PATH1_PARENT_PERM:
                                                stat_index =
                                                        TOMOYO_PATH1_PARENT;
                                                break;
                                        case TOMOYO_PATH2_PARENT_UID:
                                        case TOMOYO_PATH2_PARENT_GID:
                                        case TOMOYO_PATH2_PARENT_INO:
                                        case TOMOYO_PATH2_PARENT_PERM:
                                                stat_index =
                                                        TOMOYO_PATH2_PARENT;
                                                break;
                                        default:
                                                /* Unknown keyword: fail. */
                                                goto out;
                                        }
                                        if (!obj->stat_valid[stat_index])
                                                goto out;
                                        stat = &obj->stat[stat_index];
                                        /* ... then pick the field in it. */
                                        switch (index) {
                                        case TOMOYO_PATH1_UID:
                                        case TOMOYO_PATH2_UID:
                                        case TOMOYO_PATH1_PARENT_UID:
                                        case TOMOYO_PATH2_PARENT_UID:
                                                value = from_kuid(&init_user_ns, stat->uid);
                                                break;
                                        case TOMOYO_PATH1_GID:
                                        case TOMOYO_PATH2_GID:
                                        case TOMOYO_PATH1_PARENT_GID:
                                        case TOMOYO_PATH2_PARENT_GID:
                                                value = from_kgid(&init_user_ns, stat->gid);
                                                break;
                                        case TOMOYO_PATH1_INO:
                                        case TOMOYO_PATH2_INO:
                                        case TOMOYO_PATH1_PARENT_INO:
                                        case TOMOYO_PATH2_PARENT_INO:
                                                value = stat->ino;
                                                break;
                                        case TOMOYO_PATH1_MAJOR:
                                        case TOMOYO_PATH2_MAJOR:
                                                value = MAJOR(stat->dev);
                                                break;
                                        case TOMOYO_PATH1_MINOR:
                                        case TOMOYO_PATH2_MINOR:
                                                value = MINOR(stat->dev);
                                                break;
                                        case TOMOYO_PATH1_TYPE:
                                        case TOMOYO_PATH2_TYPE:
                                                value = stat->mode & S_IFMT;
                                                break;
                                        case TOMOYO_PATH1_DEV_MAJOR:
                                        case TOMOYO_PATH2_DEV_MAJOR:
                                                value = MAJOR(stat->rdev);
                                                break;
                                        case TOMOYO_PATH1_DEV_MINOR:
                                        case TOMOYO_PATH2_DEV_MINOR:
                                                value = MINOR(stat->rdev);
                                                break;
                                        case TOMOYO_PATH1_PERM:
                                        case TOMOYO_PATH2_PERM:
                                        case TOMOYO_PATH1_PARENT_PERM:
                                        case TOMOYO_PATH2_PARENT_PERM:
                                                value = stat->mode & S_IALLUGO;
                                                break;
                                        }
                                }
                                break;
                        }
                        /* A plain value is the one-element range [value, value]. */
                        max_v[j] = value;
                        min_v[j] = value;
                        /* Remember whether this operand is a mode-bit keyword. */
                        switch (index) {
                        case TOMOYO_MODE_SETUID:
                        case TOMOYO_MODE_SETGID:
                        case TOMOYO_MODE_STICKY:
                        case TOMOYO_MODE_OWNER_READ:
                        case TOMOYO_MODE_OWNER_WRITE:
                        case TOMOYO_MODE_OWNER_EXECUTE:
                        case TOMOYO_MODE_GROUP_READ:
                        case TOMOYO_MODE_GROUP_WRITE:
                        case TOMOYO_MODE_GROUP_EXECUTE:
                        case TOMOYO_MODE_OTHERS_READ:
                        case TOMOYO_MODE_OTHERS_WRITE:
                        case TOMOYO_MODE_OTHERS_EXECUTE:
                                is_bitop[j] = true;
                        }
                }
                if (left == TOMOYO_NUMBER_UNION) {
                        /* Fetch values now. */
                        const struct tomoyo_number_union *ptr = numbers_p++;

                        /* The left range becomes values[0]..values[1]. */
                        min_v[0] = ptr->values[0];
                        max_v[0] = ptr->values[1];
                }
                if (right == TOMOYO_NUMBER_UNION) {
                        /* Fetch values now. */
                        const struct tomoyo_number_union *ptr = numbers_p++;

                        if (ptr->group) {
                                /* Let the group helper judge the left range. */
                                if (tomoyo_number_matches_group(min_v[0],
                                                                max_v[0],
                                                                ptr->group)
                                    == match)
                                        continue;
                        } else {
                                /* Satisfied when "ranges overlap" equals match. */
                                if ((min_v[0] <= ptr->values[1] &&
                                     max_v[0] >= ptr->values[0]) == match)
                                        continue;
                        }
                        goto out;
                }
                /*
                 * Bit operation is valid only when counterpart value
                 * represents permission.
                 */
                if (is_bitop[0] && is_bitop[1]) {
                        goto out;
                } else if (is_bitop[0]) {
                        /*
                         * Satisfied when "the operands share a set bit"
                         * equals match; a non-permission counterpart or a
                         * mismatch leaves the switch and fails.
                         */
                        switch (right) {
                        case TOMOYO_PATH1_PERM:
                        case TOMOYO_PATH1_PARENT_PERM:
                        case TOMOYO_PATH2_PERM:
                        case TOMOYO_PATH2_PARENT_PERM:
                                if (!(max_v[0] & max_v[1]) == !match)
                                        continue;
                        }
                        goto out;
                } else if (is_bitop[1]) {
                        /* Same test with the operand roles swapped. */
                        switch (left) {
                        case TOMOYO_PATH1_PERM:
                        case TOMOYO_PATH1_PARENT_PERM:
                        case TOMOYO_PATH2_PERM:
                        case TOMOYO_PATH2_PARENT_PERM:
                                if (!(max_v[0] & max_v[1]) == !match)
                                        continue;
                        }
                        goto out;
                }
                /* Normal value range comparison. */
                if ((min_v[0] <= max_v[1] && max_v[0] >= min_v[1]) == match)
                        continue;
                /*
                 * Reached by goto from the checks above or by falling
                 * through when the range comparison did not hold.
                 */
out:
                return false;
        }
        /* Check argv[] and envp[] now. */
        if (r->ee && (argc || envc))
                return tomoyo_scan_bprm(r->ee, argc, argv, envc, envp);
        return true;
}



































































































    1 










    1 















































    1 














    1 















    1 

















































    1 























    1 







    1 







    1 



































































































































































































































































































    1 

    1 






    1 

























































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright (C) 2011 Novell Inc.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/xattr.h>
#include <linux/ratelimit.h>
#include <linux/fiemap.h>
#include <linux/fileattr.h>
#include <linux/security.h>
#include <linux/namei.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include "overlayfs.h"


/*
 * Apply an attribute change to an overlay dentry.
 *
 * The dentry is copied up first (including data when the size is being
 * changed) and the change is then applied to the upper dentry with the
 * credentials installed by ovl_override_creds(). @idmap is not used here;
 * setattr_prepare() is called with nop_mnt_idmap.
 *
 * Returns 0 on success, otherwise the first error reported by a helper.
 */
int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                struct iattr *attr)
{
        struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
        struct inode *winode = NULL;
        struct dentry *upperdentry;
        const struct cred *old_cred;
        bool size_change;
        int err;

        err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
        if (err)
                return err;

        /* Truncate should trigger data copy up as well */
        size_change = attr->ia_valid & ATTR_SIZE;
        err = size_change ? ovl_copy_up_with_data(dentry) : ovl_copy_up(dentry);
        if (err)
                return err;

        upperdentry = ovl_dentry_upper(dentry);

        /* A size change needs write access to the upper inode. */
        if (size_change) {
                winode = d_inode(upperdentry);
                err = get_write_access(winode);
                if (err)
                        return err;
        }

        if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
                attr->ia_valid &= ~ATTR_MODE;

        /*
         * ATTR_FILE: we might have to translate ovl file into real file
         * object once use cases emerge.  For now, simply don't let underlying
         * filesystem rely on attr->ia_file.
         *
         * ATTR_OPEN: if open(O_TRUNC) is done, VFS calls ->setattr with
         * ATTR_OPEN set.  Overlayfs does not pass O_TRUNC flag to underlying
         * filesystem during open -> do not pass ATTR_OPEN.  This disables
         * optimization in fuse which assumes open(O_TRUNC) already set file
         * size to 0.  But we never passed O_TRUNC to fuse.  So by clearing
         * ATTR_OPEN, fuse will be forced to send setattr request to server.
         */
        attr->ia_valid &= ~(ATTR_FILE | ATTR_OPEN);

        err = ovl_want_write(dentry);
        if (!err) {
                inode_lock(upperdentry->d_inode);
                old_cred = ovl_override_creds(dentry->d_sb);
                err = ovl_do_notify_change(ofs, upperdentry, attr);
                revert_creds(old_cred);
                if (!err)
                        ovl_copyattr(dentry->d_inode);
                inode_unlock(upperdentry->d_inode);
                ovl_drop_write(dentry);
        }

        /* Pairs with get_write_access() above. */
        if (winode)
                put_write_access(winode);
        return err;
}

/*
 * Rewrite stat->dev (and, where needed, stat->ino) of a real inode so that
 * the values reported for the overlay dentry follow the rules spelled out
 * in the comments below. @fsid selects the entry of ofs->fs[] that the real
 * inode came from.
 */
static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
{
        struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
        const bool samefs = ovl_same_fs(ofs);
        const unsigned int xinobits = ovl_xino_bits(ofs);
        const unsigned int xinoshift = 64 - xinobits;

        /*
         * When all layers are on the same fs, all real inode
         * number are unique, so we use the overlay st_dev,
         * which is friendly to du -x.
         */
        if (samefs) {
                stat->dev = dentry->d_sb->s_dev;
                return;
        }

        if (xinobits) {
                /*
                 * All inode numbers of underlying fs should not be using the
                 * high xinobits, so we use high xinobits to partition the
                 * overlay st_ino address space. The high bits holds the fsid
                 * (upper fsid is 0). The lowest xinobit is reserved for mapping
                 * the non-persistent inode numbers range in case of overflow.
                 * This way all overlay inode numbers are unique and use the
                 * overlay st_dev.
                 */
                if (likely(!(stat->ino >> xinoshift))) {
                        stat->ino |= ((u64)fsid) << (xinoshift + 1);
                        stat->dev = dentry->d_sb->s_dev;
                        return;
                }
                if (ovl_xino_warn(ofs))
                        pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
                                            dentry, stat->ino, xinobits);
        }

        /* The inode could not be mapped to a unified st_ino address space */
        if (!S_ISDIR(dentry->d_inode->i_mode)) {
                /*
                 * For non-samefs setup, if we cannot map all layers st_ino
                 * to a unified address space, we need to make sure that st_dev
                 * is unique per underlying fs, so we use the unique anonymous
                 * bdev assigned to the underlying fs.
                 */
                stat->dev = ofs->fs[fsid].pseudo_dev;
                return;
        }

        /*
         * Always use the overlay st_dev for directories, so 'find
         * -xdev' will scan the entire overlay mount and won't cross the
         * overlay mount boundaries.
         *
         * If not all layers are on the same fs the pair {real st_ino;
         * overlay st_dev} is not unique, so use the non persistent
         * overlay st_ino for directories.
         */
        stat->dev = dentry->d_sb->s_dev;
        stat->ino = dentry->d_inode->i_ino;
}

/*
 * Fill @stat for an overlay dentry.
 *
 * The attributes are first read from the real path (as returned by
 * ovl_path_real()) with the creds installed by ovl_override_creds().  They
 * are then adjusted: st_ino/st_dev are remapped by ovl_map_dev_ino(),
 * st_blocks of a metacopy dentry is taken from lower/lowerdata, and st_nlink
 * is overridden for merge directories and for indexed non-directories.
 *
 * @idmap is not referenced in this function body.
 *
 * Returns 0 on success or the error returned by ovl_do_getattr().  The
 * overridden creds are reverted on every exit path.
 */
int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
                struct kstat *stat, u32 request_mask, unsigned int flags)
{
        struct dentry *dentry = path->dentry;
        enum ovl_path_type type;
        struct path realpath;
        const struct cred *old_cred;
        struct inode *inode = d_inode(dentry);
        bool is_dir = S_ISDIR(inode->i_mode);
        /* Stays 0 unless the fsid of a lower layer is picked below */
        int fsid = 0;
        int err;
        /* True while st_blocks still has to be taken from lower/lowerdata */
        bool metacopy_blocks = false;

        metacopy_blocks = ovl_is_metacopy_dentry(dentry);

        type = ovl_path_real(dentry, &realpath);
        old_cred = ovl_override_creds(dentry->d_sb);
        err = ovl_do_getattr(&realpath, stat, request_mask, flags);
        if (err)
                goto out;

        /* Report the effective immutable/append-only STATX flags */
        generic_fill_statx_attr(inode, stat);

        /*
         * For non-dir or same fs, we use st_ino of the copy up origin.
         * This guarantees constant st_dev/st_ino across copy up.
         * With xino feature and non-samefs, we use st_ino of the copy up
         * origin masked with high bits that represent the layer id.
         *
         * If lower filesystem supports NFS file handles, this also guarantees
         * persistent st_ino across mount cycle.
         */
        if (!is_dir || ovl_same_dev(OVL_FS(dentry->d_sb))) {
                if (!OVL_TYPE_UPPER(type)) {
                        /* Pure lower: stat already holds the lower st_ino */
                        fsid = ovl_layer_lower(dentry)->fsid;
                } else if (OVL_TYPE_ORIGIN(type)) {
                        struct kstat lowerstat;
                        /* nlink is only needed for the hardlink check below */
                        u32 lowermask = STATX_INO | STATX_BLOCKS |
                                        (!is_dir ? STATX_NLINK : 0);

                        /* realpath is reused; it now refers to the lower path */
                        ovl_path_lower(dentry, &realpath);
                        err = ovl_do_getattr(&realpath, &lowerstat, lowermask,
                                             flags);
                        if (err)
                                goto out;

                        /*
                         * Lower hardlinks may be broken on copy up to different
                         * upper files, so we cannot use the lower origin st_ino
                         * for those different files, even for the same fs case.
                         *
                         * Similarly, several redirected dirs can point to the
                         * same dir on a lower layer. With the "verify_lower"
                         * feature, we do not use the lower origin st_ino, if
                         * we haven't verified that this redirect is unique.
                         *
                         * With inodes index enabled, it is safe to use st_ino
                         * of an indexed origin. The index validates that the
                         * upper hardlink is not broken and that a redirected
                         * dir is the only redirect to that origin.
                         */
                        if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
                            (!ovl_verify_lower(dentry->d_sb) &&
                             (is_dir || lowerstat.nlink == 1))) {
                                fsid = ovl_layer_lower(dentry)->fsid;
                                stat->ino = lowerstat.ino;
                        }

                        /*
                         * If we are querying a metacopy dentry and lower
                         * dentry is data dentry, then use the blocks we
                         * queried just now. We don't have to do additional
                         * vfs_getattr(). If lower itself is metacopy, then
                         * additional vfs_getattr() is unavoidable.
                         */
                        if (metacopy_blocks &&
                            realpath.dentry == ovl_dentry_lowerdata(dentry)) {
                                stat->blocks = lowerstat.blocks;
                                metacopy_blocks = false;
                        }
                }

                if (metacopy_blocks) {
                        /*
                         * If lower is not same as lowerdata or if there was
                         * no origin on upper, we can end up here.
                         * With lazy lowerdata lookup, guess lowerdata blocks
                         * from size to avoid lowerdata lookup on stat(2).
                         */
                        struct kstat lowerdatastat;
                        u32 lowermask = STATX_BLOCKS;

                        ovl_path_lowerdata(dentry, &realpath);
                        if (realpath.dentry) {
                                err = ovl_do_getattr(&realpath, &lowerdatastat,
                                                     lowermask, flags);
                                if (err)
                                        goto out;
                        } else {
                                /* No lowerdata dentry: estimate 512-byte blocks */
                                lowerdatastat.blocks =
                                        round_up(stat->size, stat->blksize) >> 9;
                        }
                        stat->blocks = lowerdatastat.blocks;
                }
        }

        /* Map the chosen st_ino and fsid to the reported st_ino/st_dev pair */
        ovl_map_dev_ino(dentry, stat, fsid);

        /*
         * It's probably not worth it to count subdirs to get the
         * correct link count.  nlink=1 seems to pacify 'find' and
         * other utilities.
         */
        if (is_dir && OVL_TYPE_MERGE(type))
                stat->nlink = 1;

        /*
         * Return the overlay inode nlinks for indexed upper inodes.
         * Overlay inode nlink counts the union of the upper hardlinks
         * and non-covered lower hardlinks. It does not include the upper
         * index hardlink.
         */
        if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry)))
                stat->nlink = dentry->d_inode->i_nlink;

out:
        /* Common exit: always drop the creds taken by ovl_override_creds() */
        revert_creds(old_cred);

        return err;
}

/*
 * Permission check for an overlay inode.
 *
 * The overlay inode is checked with the creds of the calling task, then the
 * real inode is checked with the creds installed by ovl_override_creds().
 * @idmap is not referenced here; the overlay check uses nop_mnt_idmap and
 * the real check uses the idmap of the real mount.
 *
 * Returns 0 if access is granted, -ECHILD if no real inode is available,
 * or the error from generic_permission()/inode_permission().
 */
int ovl_permission(struct mnt_idmap *idmap,
                   struct inode *inode, int mask)
{
        /* Sampled before the real path lookup, as in RCU walk things may move */
        struct inode *upper = ovl_inode_upper(inode);
        struct path realpath;
        struct inode *real = ovl_i_path_real(inode, &realpath);
        const struct cred *saved_cred;
        int real_mask = mask;
        int ret;

        /* A missing real inode is only expected for MAY_NOT_BLOCK callers */
        if (!real) {
                WARN_ON(!(mask & MAY_NOT_BLOCK));
                return -ECHILD;
        }

        /* Step 1: overlay inode, creds of the task */
        ret = generic_permission(&nop_mnt_idmap, inode, mask);
        if (ret)
                return ret;

        /*
         * Step 2: underlying inode, creds of the mounter.  A write to a
         * non-special file without an upper inode is checked as a read, so
         * that the mounter can read the file for copy up later.
         */
        if ((mask & MAY_WRITE) && !upper && !special_file(real->i_mode))
                real_mask = (mask & ~(MAY_WRITE | MAY_APPEND)) | MAY_READ;

        saved_cred = ovl_override_creds(inode->i_sb);
        ret = inode_permission(mnt_idmap(realpath.mnt), real, real_mask);
        revert_creds(saved_cred);

        return ret;
}

/*
 * Read the symlink target from the real dentry, using the creds installed
 * by ovl_override_creds().  @inode is not referenced.
 *
 * Returns whatever vfs_get_link() returns, or ERR_PTR(-ECHILD) when called
 * without a dentry (presumably the RCU-walk case; confirm against the VFS
 * ->get_link() contract).
 */
static const char *ovl_get_link(struct dentry *dentry,
                                struct inode *inode,
                                struct delayed_call *done)
{
        const struct cred *saved_cred;
        struct dentry *real;
        const char *target;

        if (dentry) {
                saved_cred = ovl_override_creds(dentry->d_sb);
                real = ovl_dentry_real(dentry);
                target = vfs_get_link(real, done);
                revert_creds(saved_cred);
                return target;
        }

        return ERR_PTR(-ECHILD);
}

#ifdef CONFIG_FS_POSIX_ACL
/*
 * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone
 * of the POSIX ACLs retrieved from the lower layer to this function to not
 * alter the POSIX ACLs for the underlying filesystem.
 */
static void ovl_idmap_posix_acl(const struct inode *realinode,
                                struct mnt_idmap *idmap,
                                struct posix_acl *acl)
{
        struct user_namespace *fs_userns = i_user_ns(realinode);

        for (unsigned int i = 0; i < acl->a_count; i++) {
                vfsuid_t vfsuid;
                vfsgid_t vfsgid;

                struct posix_acl_entry *e = &acl->a_entries[i];
                switch (e->e_tag) {
                case ACL_USER:
                        vfsuid = make_vfsuid(idmap, fs_userns, e->e_uid);
                        e->e_uid = vfsuid_into_kuid(vfsuid);
                        break;
                case ACL_GROUP:
                        vfsgid = make_vfsgid(idmap, fs_userns, e->e_gid);
                        e->e_gid = vfsgid_into_kgid(vfsgid);
                        break;
                }
        }
}

/*
 * The @noperm argument is used to skip permission checking and is a temporary
 * measure. Quoting Miklos from an earlier discussion:
 *
 * > So there are two paths to getting an acl:
 * > 1) permission checking and 2) retrieving the value via getxattr(2).
 * > This is a similar situation as reading a symlink vs. following it.
 * > When following a symlink overlayfs always reads the link on the
 * > underlying fs just as if it was a readlink(2) call, calling
 * > security_inode_readlink() instead of security_inode_follow_link().
 * > This is logical: we are reading the link from the underlying storage,
 * > and following it on overlayfs.
 * >
 * > Applying the same logic to acl: we do need to call the
 * > security_inode_getxattr() on the underlying fs, even if just want to
 * > check permissions on overlay. This is currently not done, which is an
 * > inconsistency.
 * >
 * > Maybe adding the check to ovl_get_acl() is the right way to go, but
 * > I'm a little afraid of a performance regression.  Will look into that.
 *
 * Until we have made a decision, allow this helper to take the @noperm
 * argument. We should hopefully be able to remove it soon.
 */
struct posix_acl *ovl_get_acl_path(const struct path *path,
                                   const char *acl_name, bool noperm)
{
        struct inode *realinode = d_inode(path->dentry);
        struct mnt_idmap *idmap = mnt_idmap(path->mnt);
        struct posix_acl *acl, *copy;

        /* @noperm selects the lookup that bypasses vfs_get_acl() */
        acl = noperm ? get_inode_acl(realinode, posix_acl_type(acl_name))
                     : vfs_get_acl(idmap, path->dentry, acl_name);

        /* Errors, "no ACL" and non-idmapped layers are passed through as is */
        if (IS_ERR_OR_NULL(acl) || !is_idmapped_mnt(path->mnt))
                return acl;

        /*
         * The ACL returned from the layer must not be modified, as that
         * would alter the cached values filesystem wide for the lower
         * filesystem.  Work on a private clone and apply the idmapping of
         * the layer to that instead.
         */
        copy = posix_acl_clone(acl, GFP_KERNEL);
        /* The original reference is dropped whether or not the clone worked */
        posix_acl_release(acl);
        if (!copy)
                return ERR_PTR(-ENOMEM);

        ovl_idmap_posix_acl(realinode, idmap, copy);
        return copy;
}

/*
 * When the relevant layer is an idmapped mount we need to take the idmapping
 * of the layer into account and translate any ACL_{GROUP,USER} values
 * according to the idmapped mount.
 *
 * We cannot alter the ACLs returned from the relevant layer as that would
 * alter the cached values filesystem wide for the lower filesystem. Instead we
 * can clone the ACLs and then apply the relevant idmapping of the layer.
 *
 * This is obviously only relevant when idmapped layers are used.
 */
struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap,
                                 struct inode *inode, int type,
                                 bool rcu, bool noperm)
{
        struct path realpath;
        /* Careful in RCU walk mode: the real inode may not be available */
        struct inode *real = ovl_i_path_real(inode, &realpath);
        const struct cred *saved_cred;
        struct posix_acl *acl;

        if (!real) {
                WARN_ON(!rcu);
                return ERR_PTR(-ECHILD);
        }

        if (!IS_POSIXACL(real))
                return NULL;

        if (!rcu) {
                /* Blocking path: read the ACL from the real path */
                saved_cred = ovl_override_creds(inode->i_sb);
                acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type),
                                       noperm);
                revert_creds(saved_cred);
                return acl;
        }

        /*
         * RCU path: an idmapped layer needs the ACL to be cloned, so drop
         * out of RCU path walk in that case.
         */
        if (is_idmapped_mnt(realpath.mnt))
                return ERR_PTR(-ECHILD);

        return get_cached_acl_rcu(real, type);
}

/*
 * Set @acl of @type on the upper real dentry of @dentry, or remove the ACL
 * of that type when @acl is NULL.  The file is copied up first if it has no
 * upper dentry yet.
 *
 * Accesses to the real dentries are done with the creds installed by
 * ovl_override_creds().  Once the set/remove has been attempted,
 * ovl_copyattr(@inode) is called regardless of its result.
 *
 * Returns 0 on success or a negative error from vfs_get_acl(),
 * ovl_copy_up(), ovl_want_write() or the set/remove helper.
 */
static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
                                 struct posix_acl *acl, int type)
{
        int err;
        struct path realpath;
        const char *acl_name;
        const struct cred *old_cred;
        struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
        struct dentry *upperdentry = ovl_dentry_upper(dentry);
        /* Upper dentry if there is one, the lower dentry otherwise */
        struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);

        /*
         * If ACL is to be removed from a lower file, check if it exists in
         * the first place before copying it up.
         */
        acl_name = posix_acl_xattr_name(type);
        if (!acl && !upperdentry) {
                struct posix_acl *real_acl;

                ovl_path_lower(dentry, &realpath);
                old_cred = ovl_override_creds(dentry->d_sb);
                real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry,
                                       acl_name);
                revert_creds(old_cred);
                /* Lookup failed: return the error without copying up */
                if (IS_ERR(real_acl)) {
                        err = PTR_ERR(real_acl);
                        goto out;
                }
                /* Only existence mattered; drop the reference again */
                posix_acl_release(real_acl);
        }

        if (!upperdentry) {
                err = ovl_copy_up(dentry);
                if (err)
                        goto out;

                /* Copy up succeeded: from now on operate on the upper dentry */
                realdentry = ovl_dentry_upper(dentry);
        }

        err = ovl_want_write(dentry);
        if (err)
                goto out;

        old_cred = ovl_override_creds(dentry->d_sb);
        if (acl)
                err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
        else
                err = ovl_do_remove_acl(ofs, realdentry, acl_name);
        revert_creds(old_cred);
        ovl_drop_write(dentry);

        /* copy c/mtime */
        ovl_copyattr(inode);
out:
        return err;
}

int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                struct posix_acl *acl, int type)
{
        int err;
        struct inode *inode = d_inode(dentry);
        struct dentry *workdir = ovl_workdir(dentry);
        struct inode *realinode = ovl_inode_real(inode);

        if (!IS_POSIXACL(d_inode(workdir)))
                return -EOPNOTSUPP;
        if (!realinode->i_op->set_acl)
                return -EOPNOTSUPP;
        if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
                return acl ? -EACCES : 0;
        if (!inode_owner_or_capable(&nop_mnt_idmap, inode))
                return -EPERM;

        /*
         * Check if sgid bit needs to be cleared (actual setacl operation will
         * be done with mounter's capabilities and so that won't do it for us).
         */
        if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS &&
            !in_group_p(inode->i_gid) &&
            !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) {
                struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };

                err = ovl_setattr(&nop_mnt_idmap, dentry, &iattr);
                if (err)
                        return err;
        }

        return ovl_set_or_remove_acl(dentry, inode, acl, type);
}
#endif

/*
 * Time update hook for overlay inodes.  Only S_ATIME is acted upon: if the
 * inode has an upper dentry, atime is touched on the upper path and then
 * copied back into the overlay inode.  Always returns 0.
 */
int ovl_update_time(struct inode *inode, int flags)
{
        struct path upperpath;

        if (!(flags & S_ATIME))
                return 0;

        upperpath.mnt = ovl_upper_mnt(OVL_FS(inode->i_sb));
        upperpath.dentry = ovl_upperdentry_dereference(OVL_I(inode));

        /* Nothing to do for an inode without an upper dentry */
        if (!upperpath.dentry)
                return 0;

        touch_atime(&upperpath);
        inode_set_atime_to_ts(inode,
                              inode_get_atime(d_inode(upperpath.dentry)));
        return 0;
}

/*
 * Forward fiemap to the inode returned by ovl_inode_realdata(), using the
 * creds installed by ovl_override_creds().
 *
 * Returns -EIO if there is no real data inode, -EOPNOTSUPP if it has no
 * ->fiemap, otherwise the result of the real ->fiemap().
 */
static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                      u64 start, u64 len)
{
        struct inode *real = ovl_inode_realdata(inode);
        const struct cred *saved_cred;
        int ret;

        if (!real) {
                ret = -EIO;
        } else if (!real->i_op->fiemap) {
                ret = -EOPNOTSUPP;
        } else {
                saved_cred = ovl_override_creds(inode->i_sb);
                ret = real->i_op->fiemap(real, fieinfo, start, len);
                revert_creds(saved_cred);
        }

        return ret;
}

/*
 * Work around the fact that security_file_ioctl() takes a file argument.
 * Introducing security_inode_fileattr_get/set() hooks would solve this issue
 * properly.
 */
static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa,
                                 bool set)
{
        /* A struct file is opened only to be passed to security_file_ioctl() */
        struct file *file = dentry_open(realpath, O_RDONLY, current_cred());
        unsigned int cmd;
        int ret;

        if (IS_ERR(file))
                return PTR_ERR(file);

        /* Pick the ioctl command matching the fileattr flavour and direction */
        if (fa->fsx_valid)
                cmd = set ? FS_IOC_FSSETXATTR : FS_IOC_FSGETXATTR;
        else
                cmd = set ? FS_IOC_SETFLAGS : FS_IOC_GETFLAGS;

        ret = security_file_ioctl(file, cmd, 0);
        fput(file);

        return ret;
}

/*
 * Set file attributes on a real path: run the security check from
 * ovl_security_fileattr() first and only call vfs_fileattr_set() if it
 * passes.  Returns 0 on success or the first error encountered.
 */
int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
{
        int ret = ovl_security_fileattr(realpath, fa, true);

        if (!ret)
                ret = vfs_fileattr_set(mnt_idmap(realpath->mnt),
                                       realpath->dentry, fa);

        return ret;
}

/*
 * Set file attributes on an overlay dentry.  The file is copied up, the
 * protection flags are stored via ovl_set_protattr(), and the remaining
 * attributes are set on the upper real path.  @idmap is not referenced.
 *
 * Returns 0 on success or an error from ovl_copy_up(), ovl_want_write(),
 * ovl_set_protattr() or ovl_real_fileattr_set().  Once write access has
 * been obtained, the overlay inode flags and attributes are refreshed even
 * if setting the attributes failed.
 */
int ovl_fileattr_set(struct mnt_idmap *idmap,
                     struct dentry *dentry, struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);
        const struct cred *saved_cred;
        struct path upperpath;
        unsigned int merged;
        int ret;

        ret = ovl_copy_up(dentry);
        if (ret)
                return ret;

        ovl_path_real(dentry, &upperpath);

        ret = ovl_want_write(dentry);
        if (ret)
                return ret;

        saved_cred = ovl_override_creds(inode->i_sb);
        /*
         * Store immutable/append-only flags in xattr and clear them
         * in upper fileattr (in case they were set by older kernel),
         * so that children of "ovl-immutable" directories and lower
         * aliases of "ovl-immutable" hardlinks could be copied up.
         * Clear xattr when flags are cleared.
         */
        ret = ovl_set_protattr(inode, upperpath.dentry, fa);
        if (!ret)
                ret = ovl_real_fileattr_set(&upperpath, fa);
        revert_creds(saved_cred);
        ovl_drop_write(dentry);

        /*
         * Merge real inode flags with inode flags read from
         * overlay.protattr xattr
         */
        BUILD_BUG_ON(OVL_PROT_I_FLAGS_MASK & ~OVL_COPY_I_FLAGS_MASK);
        merged = ovl_inode_real(inode)->i_flags & OVL_COPY_I_FLAGS_MASK;
        merged |= inode->i_flags & OVL_PROT_I_FLAGS_MASK;
        inode_set_flags(inode, merged, OVL_COPY_I_FLAGS_MASK);

        /* Update ctime */
        ovl_copyattr(inode);

        return ret;
}

/*
 * Reflect the protection flags of the overlay inode in @fa: S_IMMUTABLE and
 * S_APPEND are each OR-ed into both the FS_*_FL and the FS_XFLAG_*
 * representation.  Flags already set in @fa are left untouched.
 */
static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
{
        const unsigned int iflags = inode->i_flags;

        BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL);
        BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON);

        if (iflags & S_IMMUTABLE) {
                fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
                fa->flags |= FS_IMMUTABLE_FL;
        }
        if (iflags & S_APPEND) {
                fa->fsx_xflags |= FS_XFLAG_APPEND;
                fa->flags |= FS_APPEND_FL;
        }
}

/*
 * Get file attributes of a real path, after the security check from
 * ovl_security_fileattr().  -ENOIOCTLCMD from vfs_fileattr_get() is
 * reported as -ENOTTY; any other result is returned unchanged.
 */
int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
{
        int ret = ovl_security_fileattr(realpath, fa, false);

        if (ret)
                return ret;

        ret = vfs_fileattr_get(realpath->dentry, fa);

        return ret == -ENOIOCTLCMD ? -ENOTTY : ret;
}

/*
 * Get file attributes of an overlay dentry: read them from the real path
 * with the creds installed by ovl_override_creds(), then add the protection
 * flags of the overlay inode.  The protection flags are added even when the
 * real lookup failed.  Returns the result of ovl_real_fileattr_get().
 */
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);
        const struct cred *saved_cred;
        struct path realpath;
        int ret;

        ovl_path_real(dentry, &realpath);

        saved_cred = ovl_override_creds(inode->i_sb);
        ret = ovl_real_fileattr_get(&realpath, fa);
        revert_creds(saved_cred);

        /* Only reads inode->i_flags and updates @fa */
        ovl_fileattr_prot_flags(inode, fa);

        return ret;
}

/* Inode operations installed by ovl_fill_inode() for S_IFREG inodes */
static const struct inode_operations ovl_file_inode_operations = {
        /* attributes, timestamps and permission checks */
        .getattr        = ovl_getattr,
        .setattr        = ovl_setattr,
        .update_time    = ovl_update_time,
        .permission     = ovl_permission,

        /* extended attributes and POSIX ACLs */
        .listxattr      = ovl_listxattr,
        .get_inode_acl  = ovl_get_inode_acl,
        .get_acl        = ovl_get_acl,
        .set_acl        = ovl_set_acl,

        /* file attribute flags and extent mapping */
        .fileattr_get   = ovl_fileattr_get,
        .fileattr_set   = ovl_fileattr_set,
        .fiemap         = ovl_fiemap,
};

/* Inode operations installed by ovl_fill_inode() for S_IFLNK inodes */
static const struct inode_operations ovl_symlink_inode_operations = {
        /* link target */
        .get_link       = ovl_get_link,

        /* attributes and timestamps */
        .getattr        = ovl_getattr,
        .setattr        = ovl_setattr,
        .update_time    = ovl_update_time,

        /* extended attributes */
        .listxattr      = ovl_listxattr,
};

/*
 * Inode operations installed by ovl_fill_inode() for every file type other
 * than regular file, directory and symlink.
 */
static const struct inode_operations ovl_special_inode_operations = {
        /* attributes, timestamps and permission checks */
        .getattr        = ovl_getattr,
        .setattr        = ovl_setattr,
        .update_time    = ovl_update_time,
        .permission     = ovl_permission,

        /* extended attributes and POSIX ACLs */
        .listxattr      = ovl_listxattr,
        .get_inode_acl  = ovl_get_inode_acl,
        .get_acl        = ovl_get_acl,
        .set_acl        = ovl_set_acl,
};

/* Address space operations installed by ovl_fill_inode() for regular files */
static const struct address_space_operations ovl_aops = {
        /*
         * dentry_open() checks f_mapping->a_ops->direct_IO for O_DIRECT,
         * so provide a no-op implementation.
         */
        .direct_IO = noop_direct_IO,
};

/*
 * It is possible to stack overlayfs instance on top of another
 * overlayfs instance as lower layer. We need to annotate the
 * stackable i_mutex locks according to stack level of the super
 * block instance. An overlayfs instance can never be in stack
 * depth 0 (there is always a real fs below it).  An overlayfs
 * inode lock will use the lockdep annotation ovl_i_mutex_key[depth].
 *
 * For example, here is a snip from /proc/lockdep_chains after
 * dir_iterate of nested overlayfs:
 *
 * [...] &ovl_i_mutex_dir_key[depth]   (stack_depth=2)
 * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
 * [...] &type->i_mutex_dir_key        (stack_depth=0)
 *
 * Locking order w.r.t ovl_want_write() is important for nested overlayfs.
 *
 * This chain is valid:
 * - inode->i_rwsem                        (inode_lock[2])
 * - upper_mnt->mnt_sb->s_writers        (ovl_want_write[0])
 * - OVL_I(inode)->lock                        (ovl_inode_lock[2])
 * - OVL_I(lowerinode)->lock                (ovl_inode_lock[1])
 *
 * And this chain is valid:
 * - inode->i_rwsem                        (inode_lock[2])
 * - OVL_I(inode)->lock                        (ovl_inode_lock[2])
 * - lowerinode->i_rwsem                (inode_lock[1])
 * - OVL_I(lowerinode)->lock                (ovl_inode_lock[1])
 *
 * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is
 * held, because it is in reverse order of the non-nested case using the same
 * upper fs:
 * - inode->i_rwsem                        (inode_lock[1])
 * - upper_mnt->mnt_sb->s_writers        (ovl_want_write[0])
 * - OVL_I(inode)->lock                        (ovl_inode_lock[1])
 */
/* Number of lock class keys kept per lock type, one per stack depth */
#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH

/*
 * Assign per-stack-depth lockdep classes to inode->i_rwsem and to
 * OVL_I(inode)->lock.  Compiles to an empty function without CONFIG_LOCKDEP.
 *
 * NOTE(review): the /proc/lockdep_chains snippet quoted in the comment
 * preceding this block shows the key expressions verbatim (e.g.
 * "&ovl_i_mutex_dir_key[depth]"), so renaming the arrays or the index
 * variable presumably changes the reported class names. Confirm before
 * refactoring.
 */
static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
        /* Separate key arrays for non-dir i_rwsem, dir i_rwsem and oi->lock */
        static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
        static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
        static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING];

        /* s_stack_depth 1 maps to index 0 */
        int depth = inode->i_sb->s_stack_depth - 1;

        /* Out-of-range depth: warn once and fall back to index 0 */
        if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING))
                depth = 0;

        if (S_ISDIR(inode->i_mode))
                lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
        else
                lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);

        lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]);
#endif
}

/*
 * Assign the next value of the per-overlay counter ofs->last_ino to
 * inode->i_ino.  If the incremented counter comes back as 0, it is
 * incremented once more and that value is used instead.
 */
static void ovl_next_ino(struct inode *inode)
{
        struct ovl_fs *ofs = OVL_FS(inode->i_sb);
        unsigned long next = atomic_long_inc_return(&ofs->last_ino);

        if (unlikely(!next))
                next = atomic_long_inc_return(&ofs->last_ino);

        inode->i_ino = next;
}

/*
 * Choose inode->i_ino for an overlay inode, given the real inode number
 * @ino and the @fsid of the layer it comes from.
 */
static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid)
{
        struct ovl_fs *ofs = OVL_FS(inode->i_sb);
        int xinobits = ovl_xino_bits(ofs);
        unsigned int xinoshift = 64 - xinobits;

        /*
         * When d_ino is consistent with st_ino (samefs or i_ino has enough
         * bits to encode layer), set the same value used for st_ino to i_ino,
         * so inode number exposed via /proc/locks and alike will be
         * consistent with d_ino and st_ino values. An i_ino value inconsistent
         * with d_ino also causes nfsd readdirplus to fail.
         */
        inode->i_ino = ino;
        if (ovl_same_fs(ofs))
                return;

        /* xino enabled and the real ino leaves the high bits free */
        if (xinobits && likely(!(ino >> xinoshift))) {
                inode->i_ino = ino | ((unsigned long)fsid << (xinoshift + 1));
                return;
        }

        /* Non-directories keep the real ino from here on */
        if (!S_ISDIR(inode->i_mode))
                return;

        /*
         * For directory inodes on non-samefs with xino disabled or xino
         * overflow, we allocate a non-persistent inode number, to be used for
         * resolving st_ino collisions in ovl_map_dev_ino().
         *
         * To avoid ino collision with legitimate xino values from upper
         * layer (fsid 0), use the lowest xinobit to map the non
         * persistent inode numbers to the unified st_ino address space.
         */
        ovl_next_ino(inode);
        if (xinobits)
                inode->i_ino = (inode->i_ino & (~0UL >> xinobits)) |
                               (1UL << xinoshift);
}

/*
 * Initialize the overlay specific part of @inode from @oip, then copy
 * attributes and flags from the real inode and choose i_ino from @ino and
 * @fsid via ovl_map_ino().
 */
void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
                    unsigned long ino, int fsid)
{
        struct ovl_inode *oi = OVL_I(inode);

        /* All fields are taken over from the lookup parameters */
        oi->lowerdata_redirect = oip->lowerdata_redirect;
        oi->redirect = oip->redirect;
        oi->oe = oip->oe;
        oi->__upperdentry = oip->upperdentry;

        ovl_copyattr(inode);
        ovl_copyflags(ovl_inode_real(inode), inode);
        ovl_map_ino(inode, ino, fsid);
}

/*
 * Basic setup of a new overlay inode: record @mode, set S_NOCMTIME, mark
 * the ACL pointers as ACL_DONT_CACHE, annotate the locks for lockdep and
 * install the operation tables matching the file type in @mode.  @rdev is
 * only used for the file types handled by init_special_inode().
 */
static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
        const umode_t fmt = mode & S_IFMT;

        inode->i_mode = mode;
        inode->i_flags |= S_NOCMTIME;
#ifdef CONFIG_FS_POSIX_ACL
        inode->i_default_acl = ACL_DONT_CACHE;
        inode->i_acl = ACL_DONT_CACHE;
#endif

        /* Reads i_mode, so this must come after the assignment above */
        ovl_lockdep_annotate_inode_mutex_key(inode);

        if (fmt == S_IFREG) {
                inode->i_op = &ovl_file_inode_operations;
                inode->i_fop = &ovl_file_operations;
                inode->i_mapping->a_ops = &ovl_aops;
        } else if (fmt == S_IFDIR) {
                inode->i_op = &ovl_dir_inode_operations;
                inode->i_fop = &ovl_dir_operations;
        } else if (fmt == S_IFLNK) {
                inode->i_op = &ovl_symlink_inode_operations;
        } else {
                /* Every remaining file type */
                inode->i_op = &ovl_special_inode_operations;
                init_special_inode(inode, mode, rdev);
        }
}

/*
 * With inodes index enabled, an overlay inode nlink counts the union of upper
 * hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure
 * upper inode, the following nlink modifying operations can happen:
 *
 * 1. Lower hardlink copy up
 * 2. Upper hardlink created, unlinked or renamed over
 * 3. Lower hardlink whiteout or renamed over
 *
 * For the first, copy up case, the union nlink does not change, whether the
 * operation succeeds or fails, but the upper inode nlink may change.
 * Therefore, before copy up, we store the union nlink value relative to the
 * lower inode nlink in the index inode xattr .overlay.nlink.
 *
 * For the second, upper hardlink case, the union nlink should be incremented
 * or decremented IFF the operation succeeds, aligned with nlink change of the
 * upper inode. Therefore, before link/unlink/rename, we store the union nlink
 * value relative to the upper inode nlink in the index inode.
 *
 * For the last, lower cover up case, we simplify things by preceding the
 * whiteout or cover up with copy up. This makes sure that there is an index
 * upper inode where the nlink xattr can be stored before the copied up upper
 * entry is unlinked.
 */
/*
 * Flag value with bit 0 set.
 * NOTE(review): no user of this macro is visible in this part of the file;
 * confirm whether it is still referenced elsewhere.
 */
#define OVL_NLINK_ADD_UPPER        (1 << 0)

/*
 * On-disk format for indexed nlink:
 *
 * nlink relative to the upper inode - "U[+-]NUM"
 * nlink relative to the lower inode - "L[+-]NUM"
 */

static int ovl_set_nlink_common(struct dentry *dentry,
                                struct dentry *realdentry, const char *format)
{
        struct inode *inode = d_inode(dentry);
        struct inode *realinode = d_inode(realdentry);
        char buf[13];
        int len;

        len = snprintf(buf, sizeof(buf), format,
                       (int) (inode->i_nlink - realinode->i_nlink));

        if (WARN_ON(len >= sizeof(buf)))
                return -EIO;

        return ovl_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry),
                            OVL_XATTR_NLINK, buf, len);
}

/* Store the overlay nlink relative to the upper inode nlink ("U[+-]NUM") */
int ovl_set_nlink_upper(struct dentry *dentry)
{
        struct dentry *upper = ovl_dentry_upper(dentry);

        return ovl_set_nlink_common(dentry, upper, "U%+i");
}

/* Store the overlay nlink relative to the lower inode nlink ("L[+-]NUM") */
int ovl_set_nlink_lower(struct dentry *dentry)
{
        struct dentry *lower = ovl_dentry_lower(dentry);

        return ovl_set_nlink_common(dentry, lower, "L%+i");
}

/*
 * Compute the overlay nlink from the OVL_XATTR_NLINK xattr stored on
 * @upperdentry.
 *
 * The xattr holds "U[+-]NUM" or "L[+-]NUM": a signed difference relative to
 * the nlink of the upper or of the lower inode respectively.
 *
 * Returns @fallback without looking at the xattr if either dentry is NULL
 * or the lower inode has nlink == 1.  Also returns @fallback, after a rate
 * limited warning, if the xattr cannot be read, is malformed, or yields a
 * non-positive nlink.  Otherwise returns the computed nlink.
 */
unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
                           struct dentry *upperdentry,
                           unsigned int fallback)
{
        int nlink_diff;
        int nlink;
        char buf[13];
        int err;

        if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
                return fallback;

        /* Leave room for the terminating NUL added below */
        err = ovl_getxattr_upper(ofs, upperdentry, OVL_XATTR_NLINK,
                                 buf, sizeof(buf) - 1);
        if (err < 0)
                goto fail;

        buf[err] = '\0';
        if ((buf[0] != 'L' && buf[0] != 'U') ||
            (buf[1] != '+' && buf[1] != '-')) {
                /*
                 * err still holds the (positive) xattr length here; report
                 * a real error code in the warning instead.
                 */
                err = -EINVAL;
                goto fail;
        }

        /* Parse the signed number that follows the 'L'/'U' prefix */
        err = kstrtoint(buf + 1, 10, &nlink_diff);
        if (err < 0)
                goto fail;

        nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink;
        nlink += nlink_diff;

        if (nlink <= 0) {
                /* err is 0 at this point; do not report "err=0" on failure */
                err = -EINVAL;
                goto fail;
        }

        return nlink;

fail:
        pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n",
                            upperdentry, err);
        return fallback;
}

/*
 * Allocate a new inode on @sb and set it up with ovl_fill_inode().
 * Returns the inode, or NULL if new_inode() returned NULL.
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
{
        struct inode *fresh = new_inode(sb);

        /* Nothing to fill in; hand the NULL back to the caller */
        if (!fresh)
                return fresh;

        ovl_fill_inode(fresh, mode, rdev);
        return fresh;
}

/* Inode-cache match callback: non-zero when @inode's i_private equals @data. */
static int ovl_inode_test(struct inode *inode, void *data)
{
        void *stored_key = inode->i_private;

        return stored_key == data;
}

/* Inode-cache init callback: remember @data in @inode's i_private; returns 0. */
static int ovl_inode_set(struct inode *inode, void *data)
{
        void *key = data;

        inode->i_private = key;
        return 0;
}

/*
 * Check that the real inodes recorded in overlay @inode agree with the given
 * lower/upper dentries.  Returns true when they are consistent.
 */
static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
                             struct dentry *upperdentry, bool strict)
{
        /*
         * @strict verification (lookup path) of a directory is a consistency
         * check: a NULL lower/upper in the dentry must be matched by a NULL
         * lower/upper in the inode.  The non-@strict caller (NFS handle
         * decode) passes NULL to mean "unknown".
         */
        if (strict && S_ISDIR(inode->i_mode)) {
                /*
                 * First term: real lower dir moved to upper layer under us?
                 * Second term: lookup of an uncovered redirect origin?
                 */
                if ((!lowerdentry && ovl_inode_lower(inode)) ||
                    (!upperdentry && ovl_inode_upper(inode)))
                        return false;
        }

        /*
         * A NULL lowerdentry is tolerated even if the inode has a lower
         * inode: this is the case when a copied up overlay inode is found
         * for a renamed or hardlinked dentry whose lower cannot be followed
         * by origin (lower fs without file handle support).
         */
        if (lowerdentry && d_inode(lowerdentry) != ovl_inode_lower(inode))
                return false;

        /*
         * A NULL upperdentry is tolerated even if the inode has an upper
         * dentry: this is the case when finding a lower alias for a copied
         * up hard link.
         */
        if (upperdentry && d_inode(upperdentry) != ovl_inode_upper(inode))
                return false;

        return true;
}

/*
 * Look up a cached overlay inode keyed by the real inode of @real.
 *
 * Returns NULL if nothing is cached, ERR_PTR(-ESTALE) if the cached inode
 * fails non-strict verification against @real, or the inode otherwise.
 */
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
                               bool is_upper)
{
        struct inode *key = d_inode(real);
        struct dentry *lower = is_upper ? NULL : real;
        struct dentry *upper = is_upper ? real : NULL;
        struct inode *found;

        found = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
        if (!found)
                return NULL;

        if (ovl_verify_inode(found, lower, upper, false))
                return found;

        /* Mismatch: drop the reference obtained by the lookup */
        iput(found);
        return ERR_PTR(-ESTALE);
}

bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir)
{
        struct inode *key = d_inode(dir);
        struct inode *trap;
        bool res;

        trap = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
        if (!trap)
                return false;

        res = IS_DEADDIR(trap) && !ovl_inode_upper(trap) &&
                                  !ovl_inode_lower(trap);

        iput(trap);
        return res;
}

/*
 * Insert an inode cache entry for a layer root dir.  The entry is built so
 * that ovl_verify_inode() rejects it on purpose, which makes any lookup that
 * reaches a layer root fail.
 *
 * Returns the new trap inode, or ERR_PTR(-ENOTDIR) if @dir is not a
 * directory, ERR_PTR(-ENOMEM) on allocation failure, ERR_PTR(-ELOOP) if an
 * entry for the same key already exists.
 */
struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir)
{
        struct inode *key = d_inode(dir);
        struct inode *trap;

        if (!d_is_dir(dir))
                return ERR_PTR(-ENOTDIR);

        trap = iget5_locked(sb, (unsigned long) key, ovl_inode_test,
                            ovl_inode_set, key);
        if (!trap)
                return ERR_PTR(-ENOMEM);

        if (trap->i_state & I_NEW) {
                /* Freshly created: mark it as a dead directory */
                trap->i_mode = S_IFDIR;
                trap->i_flags = S_DEAD;
                unlock_new_inode(trap);
                return trap;
        }

        /* Conflicting layer roots? */
        iput(trap);
        return ERR_PTR(-ELOOP);
}

/*
 * Decide whether the overlay inode should be hashed by its lower inode.
 */
static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
                             struct dentry *lower, bool index)
{
        struct ovl_fs *ofs = OVL_FS(sb);

        /* Pure upper: there is no lower inode to hash by */
        if (!lower)
                return false;

        /* Already indexed, or never going to be copied up: hash by lower */
        if (index || !ovl_upper_mnt(ofs))
                return true;

        /* Lower hardlink that is, or will be, broken on copy up */
        if ((upper || !ovl_indexdir(sb)) &&
            !d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
                return false;

        /*
         * A non-indexed upper with NFS export is not hashed by lower;
         * everything else is, for the benefit of fsnotify.
         */
        return !(ofs->config.nfs_export && upper);
}

/*
 * Get the inode hashed under @key: insert the preallocated @newinode if one
 * was supplied, otherwise look up or allocate one on @sb.
 */
static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode,
                               struct inode *key)
{
        unsigned long hashval = (unsigned long) key;

        if (newinode)
                return inode_insert5(newinode, hashval,
                                     ovl_inode_test, ovl_inode_set, key);

        return iget5_locked(sb, hashval, ovl_inode_test, ovl_inode_set, key);
}

/*
 * Find or create the overlay inode described by @oip.
 *
 * When an upper dentry exists or the inode is hashed by lower, the inode is
 * obtained through the inode cache (ovl_iget5()); otherwise a private,
 * unhashed inode is allocated.  If an already initialized inode is found in
 * the cache it is verified against the dentries in @oip, and on success the
 * upper dentry reference, oip->oe and the redirect strings are released here.
 *
 * Returns the inode, or an ERR_PTR() after logging a rate-limited warning.
 */
struct inode *ovl_get_inode(struct super_block *sb,
                            struct ovl_inode_params *oip)
{
        struct ovl_fs *ofs = OVL_FS(sb);
        struct dentry *upperdentry = oip->upperdentry;
        struct ovl_path *lowerpath = ovl_lowerpath(oip->oe);
        struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
        struct inode *inode;
        struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
        /*
         * NOTE(review): lowerpath is dereferenced whenever upperdentry is
         * NULL, so callers presumably guarantee at least one of upper/lower
         * is present -- confirm against callers.
         */
        struct path realpath = {
                .dentry = upperdentry ?: lowerdentry,
                .mnt = upperdentry ? ovl_upper_mnt(ofs) : lowerpath->layer->mnt,
        };
        bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
                                        oip->index);
        int fsid = bylower ? lowerpath->layer->fsid : 0;
        bool is_dir;
        unsigned long ino = 0;
        /* Error reported if ovl_iget5() returns NULL */
        int err = oip->newinode ? -EEXIST : -ENOMEM;

        /* No upper: the real inode is the lower one */
        if (!realinode)
                realinode = d_inode(lowerdentry);

        /*
         * Copy up origin (lower) may exist for non-indexed upper, but we must
         * not use lower as hash key if this is a broken hardlink.
         */
        is_dir = S_ISDIR(realinode->i_mode);
        if (upperdentry || bylower) {
                struct inode *key = d_inode(bylower ? lowerdentry :
                                                      upperdentry);
                unsigned int nlink = is_dir ? 1 : realinode->i_nlink;

                inode = ovl_iget5(sb, oip->newinode, key);
                if (!inode)
                        goto out_err;
                if (!(inode->i_state & I_NEW)) {
                        /*
                         * Verify that the underlying files stored in the inode
                         * match those in the dentry.
                         */
                        if (!ovl_verify_inode(inode, lowerdentry, upperdentry,
                                              true)) {
                                iput(inode);
                                err = -ESTALE;
                                goto out_err;
                        }

                        /*
                         * Existing inode is reused: release what was passed
                         * in via @oip for initializing a new one.
                         */
                        dput(upperdentry);
                        ovl_free_entry(oip->oe);
                        kfree(oip->redirect);
                        kfree(oip->lowerdata_redirect);
                        goto out;
                }

                /* Recalculate nlink for non-dir due to indexing */
                if (!is_dir)
                        nlink = ovl_get_nlink(ofs, lowerdentry, upperdentry,
                                              nlink);
                set_nlink(inode, nlink);
                ino = key->i_ino;
        } else {
                /* Lower hardlink that will be broken on copy up */
                inode = new_inode(sb);
                if (!inode) {
                        err = -ENOMEM;
                        goto out_err;
                }
                ino = realinode->i_ino;
                fsid = lowerpath->layer->fsid;
        }
        /* From here on the inode is new and gets initialized from @oip */
        ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
        ovl_inode_init(inode, oip, ino, fsid);

        if (upperdentry && ovl_is_impuredir(sb, upperdentry))
                ovl_set_flag(OVL_IMPURE, inode);

        if (oip->index)
                ovl_set_flag(OVL_INDEX, inode);

        if (bylower)
                ovl_set_flag(OVL_CONST_INO, inode);

        /* Check for non-merge dir that may have whiteouts */
        if (is_dir) {
                if (((upperdentry && lowerdentry) || ovl_numlower(oip->oe) > 1) ||
                    ovl_path_check_origin_xattr(ofs, &realpath)) {
                        ovl_set_flag(OVL_WHITEOUTS, inode);
                }
        }

        /* Check for immutable/append-only inode flags in xattr */
        if (upperdentry)
                ovl_check_protattr(inode, upperdentry);

        /* Only inodes that came from ovl_iget5() carry I_NEW */
        if (inode->i_state & I_NEW)
                unlock_new_inode(inode);
out:
        return inode;

out_err:
        pr_warn_ratelimited("failed to get inode (%i)\n", err);
        inode = ERR_PTR(err);
        goto out;
}



































































    5 














    7 






    2 













    2 







































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NUMA memory policies for Linux.
 * Copyright 2003,2004 Andi Kleen SuSE Labs
 */
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1

#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>

struct mm_struct;

#define NO_INTERLEAVE_INDEX (-1UL)        /* use task il_prev for interleaving */

#ifdef CONFIG_NUMA

/*
 * Describe a memory policy.
 *
 * A mempolicy can be either associated with a process or with a VMA.
 * For VMA related allocations the VMA policy is preferred, otherwise
 * the process policy is used. Interrupts ignore the memory policy
 * of the current process.
 *
 * Locking policy for interleave:
 * In process context there is no locking because only the process accesses
 * its own state. All vma manipulation is somewhat protected by a down_read on
 * mmap_lock.
 *
 * Freeing policy:
 * Mempolicy objects are reference counted.  A mempolicy will be freed when
 * mpol_put() decrements the reference count to zero.
 *
 * Duplicating policy objects:
 * mpol_dup() allocates a new mempolicy and copies the specified mempolicy
 * to the new storage.  The reference count of the new object is initialized
 * to 1, representing the caller of mpol_dup().
 */
struct mempolicy {
        atomic_t refcnt;        /* reference count, see mpol_get()/mpol_put() */
        unsigned short mode;         /* See MPOL_* above */
        unsigned short flags;        /* See set_mempolicy() MPOL_F_* above */
        nodemask_t nodes;        /* interleave/bind/prefer */
        int home_node;                /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */

        union {
                nodemask_t cpuset_mems_allowed;        /* relative to these nodes */
                nodemask_t user_nodemask;        /* nodemask passed by user */
        } w;
};

/*
 * Support for managing mempolicy data objects (clone, copy, destroy)
 * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
 */

extern void __mpol_put(struct mempolicy *pol);

/* Drop a reference on @pol; a NULL policy is a no-op handled inline. */
static inline void mpol_put(struct mempolicy *pol)
{
        if (!pol)
                return;

        __mpol_put(pol);
}

/*
 * Tell whether @pol must be explicitly unreferenced after use.
 * This is the case only for a non-NULL policy with MPOL_F_SHARED set.
 * Returns 1 if so, 0 otherwise.
 */
static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
        if (!pol)
                return 0;

        return (pol->flags & MPOL_F_SHARED) != 0;
}

/* Drop a reference on @pol only if mpol_needs_cond_ref() says one is held. */
static inline void mpol_cond_put(struct mempolicy *pol)
{
        if (!mpol_needs_cond_ref(pol))
                return;

        __mpol_put(pol);
}

extern struct mempolicy *__mpol_dup(struct mempolicy *pol);

/* Duplicate @pol via __mpol_dup(); a NULL policy is returned unchanged. */
static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
{
        return pol ? __mpol_dup(pol) : pol;
}

/* Take a reference on @pol; does nothing for a NULL policy. */
static inline void mpol_get(struct mempolicy *pol)
{
        if (!pol)
                return;

        atomic_inc(&pol->refcnt);
}

extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);

/*
 * Compare two policies.  Identical pointers (including two NULLs) are equal
 * without calling out; anything else is decided by __mpol_equal().
 */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        return a == b || __mpol_equal(a, b);
}

/*
 * Tree of shared policies for a shared memory region.
 */
struct shared_policy {
        struct rb_root root;        /* root of the rb-tree of policy nodes */
        rwlock_t lock;                /* presumably protects @root -- confirm in mempolicy.c */
};
/*
 * One rb-tree node carrying a policy for a page-offset range.
 * NOTE(review): presumably these are the nodes of shared_policy.root, and
 * whether @end is inclusive is not visible here -- confirm in mempolicy.c.
 */
struct sp_node {
        struct rb_node nd;                /* rb-tree linkage */
        pgoff_t start, end;                /* page-offset range of this node */
        struct mempolicy *policy;        /* policy attached to the range */
};

int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
int mpol_set_shared_policy(struct shared_policy *sp,
                           struct vm_area_struct *vma, struct mempolicy *mpol);
void mpol_free_shared_policy(struct shared_policy *sp);
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
                                            pgoff_t idx);

struct mempolicy *get_task_policy(struct task_struct *p);
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr, pgoff_t *ilx);
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr, int order, pgoff_t *ilx);
bool vma_policy_mof(struct vm_area_struct *vma);

extern void numa_default_policy(void);
extern void numa_policy_init(void);
extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);

extern int huge_node(struct vm_area_struct *vma,
                                unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask);
extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
                                const nodemask_t *mask);
extern unsigned int mempolicy_slab_node(void);

extern enum zone_type policy_zone;

/*
 * Raise policy_zone to @k when @k is a higher zone, ignoring ZONE_MOVABLE.
 */
static inline void check_highest_zone(enum zone_type k)
{
        if (k == ZONE_MOVABLE || k <= policy_zone)
                return;

        policy_zone = k;
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags);


#ifdef CONFIG_TMPFS
extern int mpol_parse_str(char *str, struct mempolicy **mpol);
#endif

extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);

/* Check if a vma is migratable */
extern bool vma_migratable(struct vm_area_struct *vma);

int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
                                        unsigned long addr);
extern void mpol_put_task_policy(struct task_struct *);

/* True when @pol uses the MPOL_PREFERRED_MANY mode; @pol must not be NULL. */
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
        bool preferred_many = (pol->mode == MPOL_PREFERRED_MANY);

        return preferred_many;
}

extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);

#else

/*
 * !CONFIG_NUMA: memory policies are compiled out.  The definitions below are
 * empty types and no-op inline stubs so that callers build unchanged.
 */
struct mempolicy {};

/* Stub: there is never a task policy. */
static inline struct mempolicy *get_task_policy(struct task_struct *p)
{
        return NULL;
}

/* Stub: all policies compare equal. */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        return true;
}

/* Stub: no reference counting. */
static inline void mpol_put(struct mempolicy *pol)
{
}

/* Stub: no reference counting. */
static inline void mpol_cond_put(struct mempolicy *pol)
{
}

/* Stub: no reference counting. */
static inline void mpol_get(struct mempolicy *pol)
{
}

/* Empty placeholder type. */
struct shared_policy {};

/* Stub: nothing to initialize. */
static inline void mpol_shared_policy_init(struct shared_policy *sp,
                                                struct mempolicy *mpol)
{
}

/* Stub: nothing to free. */
static inline void mpol_free_shared_policy(struct shared_policy *sp)
{
}

/* Stub: lookup never finds a policy. */
static inline struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
{
        return NULL;
}

/* Stub: sets *@ilx to 0 and reports no policy. */
static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                unsigned long addr, int order, pgoff_t *ilx)
{
        *ilx = 0;
        return NULL;
}

/* Stub: nothing to duplicate; always succeeds with 0. */
static inline int
vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
        return 0;
}

/* Stub: no-op. */
static inline void numa_policy_init(void)
{
}

/* Stub: no-op. */
static inline void numa_default_policy(void)
{
}

/* Stub: no-op. */
static inline void mpol_rebind_task(struct task_struct *tsk,
                                const nodemask_t *new)
{
}

/* Stub: no-op. */
static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
}

/* Stub: clears both output pointers and returns node 0. */
static inline int huge_node(struct vm_area_struct *vma,
                                unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask)
{
        *mpol = NULL;
        *nodemask = NULL;
        return 0;
}

/* Stub: leaves @m untouched and returns false. */
static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
{
        return false;
}

/* Stub: migrates nothing and returns 0. */
static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                                   const nodemask_t *to, int flags)
{
        return 0;
}

/* Stub: no-op. */
static inline void check_highest_zone(int k)
{
}

#ifdef CONFIG_TMPFS
/* Stub: parsing a policy string always fails; *@mpol is not written. */
static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        return 1;        /* error */
}
#endif

/* Stub: always returns -1. */
static inline int mpol_misplaced(struct folio *folio,
                                 struct vm_fault *vmf,
                                 unsigned long address)
{
        return -1; /* no node preference */
}

/* Stub: no-op. */
static inline void mpol_put_task_policy(struct task_struct *task)
{
}

/* Stub: always false. */
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
        return  false;
}

#endif /* CONFIG_NUMA */
#endif























































    2 







    2 














    2 






    2 


    2 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/*
 *  linux/fs/hfs/string.c
 *
 * Copyright (C) 1995-1997  Paul H. Hargrove
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 * This file may be distributed under the terms of the GNU General Public License.
 *
 * This file contains the string comparison function for the
 * Macintosh character set.
 *
 * The code in this file is derived from code which is copyright
 * 1986, 1989, 1990 by Abacus Research and Development, Inc. (ARDI)
 * It is used here by the permission of ARDI's president Cliff Matthews.
 */

#include "hfs_fs.h"
#include <linux/dcache.h>

/*================ File-local variables ================*/

/*
 * const unsigned char caseorder[]
 *
 * Defines the lexical ordering of characters on the Macintosh.  Indexed by
 * the raw character byte; characters that differ only in case map to the
 * same value, which is what makes the comparisons below case-insensitive.
 *
 * Composition of the 'casefold' and 'order' tables from ARDI's code
 * with the entry for 0x20 changed to match that for 0xCA to remove
 * special case for those two characters.
 *
 * The table is only ever read, so it is declared const.
 */
static const unsigned char caseorder[256] = {
        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
        0x20,0x22,0x23,0x28,0x29,0x2A,0x2B,0x2C,0x2F,0x30,0x31,0x32,0x33,0x34,0x35,0x36,
        0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,0x40,0x41,0x42,0x43,0x44,0x45,0x46,
        0x47,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E,
        0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xA9,0xAA,0xAB,0xAC,0xAD,
        0x4E,0x48,0x57,0x59,0x5D,0x5F,0x66,0x68,0x6A,0x6C,0x72,0x74,0x76,0x78,0x7A,0x7E,
        0x8C,0x8E,0x90,0x92,0x95,0x97,0x9E,0xA0,0xA2,0xA4,0xA7,0xAF,0xB0,0xB1,0xB2,0xB3,
        0x4A,0x4C,0x5A,0x60,0x7B,0x7F,0x98,0x4F,0x49,0x51,0x4A,0x4B,0x4C,0x5A,0x60,0x63,
        0x64,0x65,0x6E,0x6F,0x70,0x71,0x7B,0x84,0x85,0x86,0x7F,0x80,0x9A,0x9B,0x9C,0x98,
        0xB4,0xB5,0xB6,0xB7,0xB8,0xB9,0xBA,0x94,0xBB,0xBC,0xBD,0xBE,0xBF,0xC0,0x4D,0x81,
        0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xCA,0xCB,0x55,0x8A,0xCC,0x4D,0x81,
        0xCD,0xCE,0xCF,0xD0,0xD1,0xD2,0xD3,0x26,0x27,0xD4,0x20,0x49,0x4B,0x80,0x82,0x82,
        0xD5,0xD6,0x24,0x25,0x2D,0x2E,0xD7,0xD8,0xA6,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
        0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
        0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
};

/*================ Global functions ================*/

/*
 * Case-independent hash of a name.
 *
 * At most HFS_NAMELEN bytes of @this are hashed, each one mapped through
 * caseorder[] first.  The result is stored in this->hash; always returns 0.
 */
int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this)
{
        const unsigned char *cur = this->name;
        const unsigned char *end;
        unsigned int count = this->len;
        unsigned int hash;

        if (count > HFS_NAMELEN)
                count = HFS_NAMELEN;
        end = cur + count;

        hash = init_name_hash(dentry);
        while (cur != end)
                hash = partial_name_hash(caseorder[*cur++], hash);
        this->hash = end_name_hash(hash);
        return 0;
}

/*
 * Compare two strings in the HFS filename character ordering.
 * The result is positive, negative, or zero; its magnitude is not
 * limited to 1.
 *
 * The common prefix is compared byte by byte through caseorder[]; if it
 * is equal, the difference of the lengths decides.
 *
 * Equivalent to ARDI's call:
 *        ROMlib_RelString(s1+1, s2+1, true, false, (s1[0]<<16) | s2[0])
 */
int hfs_strcmp(const unsigned char *s1, unsigned int len1,
               const unsigned char *s2, unsigned int len2)
{
        int remaining, diff;

        remaining = (len1 < len2) ? len1 : len2;

        for (; remaining; remaining--) {
                diff = (int)caseorder[*s1] - (int)caseorder[*s2];
                if (diff)
                        return diff;
                s1++;
                s2++;
        }
        return len1 - len2;
}

/*
 * Test two names for equality in the HFS filename character ordering.
 * Returns 0 when they match and 1 when they differ.
 *
 * Names of HFS_NAMELEN bytes or more are compared on their first
 * HFS_NAMELEN bytes only; shorter names must have identical lengths.
 */
int hfs_compare_dentry(const struct dentry *dentry,
                unsigned int len, const char *str, const struct qstr *name)
{
        const unsigned char *lhs, *rhs;
        unsigned int i;

        if (len < HFS_NAMELEN) {
                if (len != name->len)
                        return 1;
        } else {
                if (name->len < HFS_NAMELEN)
                        return 1;
                len = HFS_NAMELEN;
        }

        lhs = str;
        rhs = name->name;
        for (i = 0; i < len; i++) {
                if (caseorder[lhs[i]] != caseorder[rhs[i]])
                        return 1;
        }
        return 0;
}










































































































































































































































































































































































    1 










































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/char_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/kdev_t.h>
#include <linux/slab.h>
#include <linux/string.h>

#include <linux/major.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/seq_file.h>

#include <linux/kobject.h>
#include <linux/kobj_map.h>
#include <linux/cdev.h>
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/tty.h>

#include "internal.h"

static struct kobj_map *cdev_map __ro_after_init;

static DEFINE_MUTEX(chrdevs_lock);

#define CHRDEV_MAJOR_HASH_SIZE 255

/*
 * One registered range of minors under a single major.
 *
 * chrdevs[] is a hash table of singly linked chains of these entries,
 * indexed by major_to_index(major).  All accesses visible in this file are
 * made under chrdevs_lock.
 */
static struct char_device_struct {
        struct char_device_struct *next;        /* next entry in the same bucket */
        unsigned int major;                        /* major number of the range */
        unsigned int baseminor;                        /* first minor of the range */
        int minorct;                                /* number of minors in the range */
        char name[64];                                /* device/driver name, as given at registration */
        struct cdev *cdev;                /* will die */
} *chrdevs[CHRDEV_MAJOR_HASH_SIZE];

/* Map a major number to its bucket index in chrdevs[]. */
static inline int major_to_index(unsigned major)
{
        unsigned bucket = major % CHRDEV_MAJOR_HASH_SIZE;

        return bucket;
}

#ifdef CONFIG_PROC_FS

/*
 * Print one "<major> <name>" line to @f for every registered range whose
 * major equals @offset.
 */
void chrdev_show(struct seq_file *f, off_t offset)
{
        struct char_device_struct *cd;

        mutex_lock(&chrdevs_lock);
        cd = chrdevs[major_to_index(offset)];
        while (cd) {
                /* The bucket may also hold other majors that hash alike */
                if (cd->major == offset)
                        seq_printf(f, "%3d %s\n", cd->major, cd->name);
                cd = cd->next;
        }
        mutex_unlock(&chrdevs_lock);
}

#endif /* CONFIG_PROC_FS */

/*
 * Pick an unused major number for dynamic allocation.
 *
 * First scans the buckets from the top of chrdevs[] down to
 * CHRDEV_MAJOR_DYN_END for an empty one, then walks the extended range
 * CHRDEV_MAJOR_DYN_EXT_START..CHRDEV_MAJOR_DYN_EXT_END (downwards) for a
 * major with no entry in its bucket.
 *
 * Returns the major, or -EBUSY if both ranges are exhausted.
 */
static int find_dynamic_major(void)
{
        struct char_device_struct *entry;
        int major;

        major = ARRAY_SIZE(chrdevs) - 1;
        while (major >= CHRDEV_MAJOR_DYN_END) {
                if (!chrdevs[major])
                        return major;
                major--;
        }

        for (major = CHRDEV_MAJOR_DYN_EXT_START;
             major >= CHRDEV_MAJOR_DYN_EXT_END; major--) {
                /* Walk the bucket until an entry for this major shows up */
                entry = chrdevs[major_to_index(major)];
                while (entry && entry->major != major)
                        entry = entry->next;

                if (!entry)
                        return major;
        }

        return -EBUSY;
}

/*
 * Register a single major with a specified minor range.
 *
 * If major == 0 this function will dynamically allocate an unused major.
 * If major > 0 this function will attempt to reserve the range of minors
 * with given major.
 *
 * Returns the new entry on success, or an ERR_PTR():
 *   -EINVAL if @major or the minor range is out of bounds,
 *   -ENOMEM if the entry cannot be allocated,
 *   -EBUSY  if no dynamic major is free or the range overlaps an
 *           existing registration.
 */
static struct char_device_struct *
__register_chrdev_region(unsigned int major, unsigned int baseminor,
                           int minorct, const char *name)
{
        struct char_device_struct *cd, *curr, *prev = NULL;
        int ret;
        int i;

        if (major >= CHRDEV_MAJOR_MAX) {
                pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n",
                       name, major, CHRDEV_MAJOR_MAX-1);
                return ERR_PTR(-EINVAL);
        }

        /* The whole range must fit below MINORMASK + 1 */
        if (minorct > MINORMASK + 1 - baseminor) {
                pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n",
                        name, baseminor, baseminor + minorct - 1, 0, MINORMASK);
                return ERR_PTR(-EINVAL);
        }

        /* Allocate before taking the lock */
        cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL);
        if (cd == NULL)
                return ERR_PTR(-ENOMEM);

        mutex_lock(&chrdevs_lock);

        if (major == 0) {
                ret = find_dynamic_major();
                if (ret < 0) {
                        pr_err("CHRDEV \"%s\" dynamic allocation region is full\n",
                               name);
                        goto out;
                }
                major = ret;
        }

        /*
         * Find the insertion point in the bucket.  Entries with a smaller
         * major, or with the same major and a minor range entirely below
         * the new one, are skipped; the walk stops at the first entry that
         * sorts after the new range.  Any other entry overlaps: -EBUSY.
         */
        ret = -EBUSY;
        i = major_to_index(major);
        for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) {
                if (curr->major < major)
                        continue;

                if (curr->major > major)
                        break;

                if (curr->baseminor + curr->minorct <= baseminor)
                        continue;

                if (curr->baseminor >= baseminor + minorct)
                        break;

                goto out;
        }

        cd->major = major;
        cd->baseminor = baseminor;
        cd->minorct = minorct;
        strscpy(cd->name, name, sizeof(cd->name));

        /* Link the new entry in front of curr (at the head if no prev) */
        if (!prev) {
                cd->next = curr;
                chrdevs[i] = cd;
        } else {
                cd->next = prev->next;
                prev->next = cd;
        }

        mutex_unlock(&chrdevs_lock);
        return cd;
out:
        mutex_unlock(&chrdevs_lock);
        kfree(cd);
        return ERR_PTR(ret);
}

/*
 * Unlink the region that exactly matches (@major, @baseminor, @minorct)
 * from its chrdevs[] chain.
 *
 * Returns the unlinked entry, which the caller is expected to free, or
 * NULL when no entry matches.
 */
static struct char_device_struct *
__unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct)
{
        struct char_device_struct **link;
        struct char_device_struct *found = NULL;
        int bucket = major_to_index(major);

        mutex_lock(&chrdevs_lock);
        link = &chrdevs[bucket];
        while (*link) {
                struct char_device_struct *entry = *link;

                if (entry->major == major &&
                    entry->baseminor == baseminor &&
                    entry->minorct == minorct) {
                        /* Splice the entry out of the chain. */
                        *link = entry->next;
                        found = entry;
                        break;
                }
                link = &entry->next;
        }
        mutex_unlock(&chrdevs_lock);

        return found;
}

/**
 * register_chrdev_region() - register a range of device numbers
 * @from: the first in the desired range of device numbers; must include
 *        the major number.
 * @count: the number of consecutive device numbers required
 * @name: the name of the device or driver.
 *
 * The range is registered one major at a time: every piece stops either
 * at the next major boundary or at the end of the requested range.  If
 * any piece cannot be registered, the pieces registered so far are
 * released again.
 *
 * Return value is zero on success, a negative error code on failure.
 */
int register_chrdev_region(dev_t from, unsigned count, const char *name)
{
        struct char_device_struct *cd;
        dev_t end = from + count;
        dev_t cur = from;
        dev_t n, next;

        while (cur < end) {
                dev_t stop = MKDEV(MAJOR(cur)+1, 0);

                /* Do not run past the end of the requested range. */
                if (stop > end)
                        stop = end;

                cd = __register_chrdev_region(MAJOR(cur), MINOR(cur),
                                              stop - cur, name);
                if (IS_ERR(cd))
                        goto unwind;

                cur = stop;
        }

        return 0;

unwind:
        /*
         * Everything in [from, cur) was registered in pieces that each
         * ended on a major boundary, so no clamping is needed here.
         */
        for (n = from; n < cur; n = next) {
                next = MKDEV(MAJOR(n)+1, 0);
                kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n));
        }

        return PTR_ERR(cd);
}

/**
 * alloc_chrdev_region() - register a range of char device numbers
 * @dev: output parameter for first assigned number
 * @baseminor: first of the requested range of minor numbers
 * @count: the number of minor numbers required
 * @name: the name of the associated device or driver
 *
 * Asks __register_chrdev_region() for a dynamically chosen major
 * (major argument 0) covering @count minors starting at @baseminor.
 * On success the resulting major and first minor are stored in @dev;
 * on failure @dev is left untouched.
 *
 * Returns zero or a negative error code.
 */
int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
                        const char *name)
{
        struct char_device_struct *region =
                __register_chrdev_region(0, baseminor, count, name);

        if (IS_ERR(region))
                return PTR_ERR(region);

        *dev = MKDEV(region->major, region->baseminor);

        return 0;
}

/**
 * __register_chrdev() - create and register a cdev occupying a range of minors
 * @major: major device number or 0 for dynamic allocation
 * @baseminor: first of the requested range of minor numbers
 * @count: the number of minor numbers required
 * @name: name of this range of devices
 * @fops: file operations associated with these devices; must not be NULL,
 *        it is dereferenced to obtain the owning module
 *
 * If @major == 0 this function will dynamically allocate a major and return
 * its number.
 *
 * If @major > 0 this function will attempt to reserve a device with the given
 * major number and will return zero on success.
 *
 * Returns a -ve errno on failure.  On failure the device number region that
 * was reserved for this call is released again.
 *
 * The name of this device has nothing to do with the name of the device in
 * /dev. It only helps to keep track of the different owners of devices. If
 * your module name has only one type of devices it's ok to use e.g. the name
 * of the module here.
 */
int __register_chrdev(unsigned int major, unsigned int baseminor,
                      unsigned int count, const char *name,
                      const struct file_operations *fops)
{
        struct char_device_struct *cd;
        struct cdev *cdev;
        int err = -ENOMEM;

        cd = __register_chrdev_region(major, baseminor, count, name);
        if (IS_ERR(cd))
                return PTR_ERR(cd);

        cdev = cdev_alloc();
        if (!cdev)
                goto out2;

        cdev->owner = fops->owner;
        cdev->ops = fops;

        /*
         * Naming the kobject can fail; report that instead of carrying
         * on with a nameless kobject.
         */
        err = kobject_set_name(&cdev->kobj, "%s", name);
        if (err)
                goto out;

        /* Use cd->major: it holds the chosen major when @major was 0. */
        err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
        if (err)
                goto out;

        cd->cdev = cdev;

        return major ? 0 : cd->major;
out:
        /* Drops the reference taken by cdev_alloc(). */
        kobject_put(&cdev->kobj);
out2:
        kfree(__unregister_chrdev_region(cd->major, baseminor, count));
        return err;
}

/**
 * unregister_chrdev_region() - unregister a range of device numbers
 * @from: the first in the range of numbers to unregister
 * @count: the number of device numbers to unregister
 *
 * Releases @count device numbers starting with @from.  The range is
 * walked one major at a time, using the same split that
 * register_chrdev_region() applies, and each piece is unlinked and
 * freed.  The caller should normally be the one who allocated those
 * numbers in the first place...
 */
void unregister_chrdev_region(dev_t from, unsigned count)
{
        dev_t end = from + count;
        dev_t cur = from;

        while (cur < end) {
                dev_t stop = MKDEV(MAJOR(cur)+1, 0);

                /* The last piece may end before the next major boundary. */
                if (stop > end)
                        stop = end;

                kfree(__unregister_chrdev_region(MAJOR(cur), MINOR(cur),
                                                 stop - cur));
                cur = stop;
        }
}

/**
 * __unregister_chrdev - unregister and destroy a cdev
 * @major: major device number
 * @baseminor: first of the range of minor numbers
 * @count: the number of minor numbers this cdev is occupying
 * @name: name of this range of devices (not used by this function)
 *
 * Counterpart of __register_chrdev(): the region described by @major,
 * @baseminor and @count is unlinked, the cdev attached to it (if any)
 * is deleted, and the region bookkeeping is freed.  Nothing happens
 * when no matching region is registered.
 */
void __unregister_chrdev(unsigned int major, unsigned int baseminor,
                         unsigned int count, const char *name)
{
        struct char_device_struct *region =
                __unregister_chrdev_region(major, baseminor, count);

        if (!region)
                return;

        if (region->cdev)
                cdev_del(region->cdev);

        kfree(region);
}

/*
 * Taken in this file around accesses to inode->i_cdev and to the
 * cdev->list / inode->i_devices linkage.
 */
static DEFINE_SPINLOCK(cdev_lock);

/*
 * Pin a cdev: take a reference on the owning module first, then on the
 * cdev's kobject.  Returns the kobject on success.  Returns NULL, with
 * no references held, when either step fails.
 */
static struct kobject *cdev_get(struct cdev *p)
{
        struct module *mod = p->owner;
        struct kobject *ref;

        if (!try_module_get(mod))
                return NULL;

        ref = kobject_get_unless_zero(&p->kobj);
        if (ref)
                return ref;

        /* The kobject could not be pinned; undo the module reference. */
        module_put(mod);
        return NULL;
}

/*
 * Drop the references taken by cdev_get(): first the kobject, then the
 * owning module.  A NULL @p is accepted and ignored.
 */
void cdev_put(struct cdev *p)
{
        struct module *mod;

        if (!p)
                return;

        /* Read the owner before the put; @p is not touched afterwards. */
        mod = p->owner;
        kobject_put(&p->kobj);
        module_put(mod);
}

/*
 * Called every time a character special file is opened
 *
 * Finds the cdev for the inode (from the inode's cached i_cdev, or by
 * looking inode->i_rdev up in cdev_map), installs the driver's
 * file_operations on @filp and calls the driver's ->open(), if any.
 *
 * Returns 0 on success, -ENXIO when no usable cdev or fops can be
 * obtained, or the error returned by the driver's ->open().
 */
static int chrdev_open(struct inode *inode, struct file *filp)
{
        const struct file_operations *fops;
        struct cdev *p;
        struct cdev *new = NULL;
        int ret = 0;

        spin_lock(&cdev_lock);
        p = inode->i_cdev;
        if (!p) {
                struct kobject *kobj;
                int idx;
                /* No cached cdev: do the map lookup without the spinlock. */
                spin_unlock(&cdev_lock);
                kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
                if (!kobj)
                        return -ENXIO;
                /*
                 * NOTE(review): the lookup presumably returns with the
                 * references taken by exact_lock()/cdev_get() - confirm
                 * against kobj_lookup().
                 */
                new = container_of(kobj, struct cdev, kobj);
                spin_lock(&cdev_lock);
                /* Check i_cdev again in case somebody beat us to it while
                   we dropped the lock. */
                p = inode->i_cdev;
                if (!p) {
                        /* We won: cache the cdev and link the inode to it. */
                        inode->i_cdev = p = new;
                        list_add(&inode->i_devices, &p->list);
                        /* Keep the lookup's references; skip the put below. */
                        new = NULL;
                } else if (!cdev_get(p))
                        ret = -ENXIO;
        } else if (!cdev_get(p))
                ret = -ENXIO;
        spin_unlock(&cdev_lock);
        /* Drops the looked-up cdev if it was not installed; no-op for NULL. */
        cdev_put(new);
        if (ret)
                return ret;

        ret = -ENXIO;
        fops = fops_get(p->ops);
        if (!fops)
                goto out_cdev_put;

        /* From here on @filp uses the driver's file_operations. */
        replace_fops(filp, fops);
        if (filp->f_op->open) {
                ret = filp->f_op->open(inode, filp);
                if (ret)
                        goto out_cdev_put;
        }

        return 0;

 out_cdev_put:
        cdev_put(p);
        return ret;
}

/*
 * Detach @inode from its character device: unlink it from the cdev's
 * inode list, clear the cached i_cdev pointer and point i_mapping back
 * at the inode's own i_data.  All of this is done under cdev_lock.
 */
void cd_forget(struct inode *inode)
{
        spin_lock(&cdev_lock);
        list_del_init(&inode->i_devices);
        inode->i_cdev = NULL;
        inode->i_mapping = &inode->i_data;
        spin_unlock(&cdev_lock);
}

static void cdev_purge(struct cdev *cdev)
{
        spin_lock(&cdev_lock);
        while (!list_empty(&cdev->list)) {
                struct inode *inode;
                inode = container_of(cdev->list.next, struct inode, i_devices);
                list_del_init(&inode->i_devices);
                inode->i_cdev = NULL;
        }
        spin_unlock(&cdev_lock);
}

/*
 * Dummy default file-operations: the only thing this does
 * is contain the open that then fills in the correct operations
 * depending on the special file...
 *
 * chrdev_open() replaces these operations on the struct file with the
 * ones of the cdev it resolves, so ->llseek here is just a no-op
 * placeholder.
 */
const struct file_operations def_chr_fops = {
        .open = chrdev_open,
        .llseek = noop_llseek,
};

/*
 * Probe callback handed to kobj_map() by cdev_add(): @data is the cdev
 * that was registered, so the match is simply its embedded kobject.
 * @dev and @part are not used.
 */
static struct kobject *exact_match(dev_t dev, int *part, void *data)
{
        return &((struct cdev *)data)->kobj;
}

/*
 * Lock callback handed to kobj_map() by cdev_add(): try to pin the cdev
 * passed in @data.  Returns 0 when cdev_get() succeeds and -1 when it
 * does not.  @dev is not used.
 */
static int exact_lock(dev_t dev, void *data)
{
        struct cdev *cdev = data;

        if (!cdev_get(cdev))
                return -1;

        return 0;
}

/**
 * cdev_add() - add a char device to the system
 * @p: the cdev structure for the device
 * @dev: the first device number for which this device is responsible
 * @count: the number of consecutive minor numbers corresponding to this
 *         device
 *
 * Records @dev and @count in @p and maps the range in cdev_map, which
 * makes the device live immediately.  On success a reference is taken
 * on the parent kobject of @p.  Adding WHITEOUT_DEV is refused with a
 * warning and -EBUSY.
 *
 * On failure the kobject name of @p is freed and cleared, and a
 * negative error code is returned.
 */
int cdev_add(struct cdev *p, dev_t dev, unsigned count)
{
        int error;

        p->dev = dev;
        p->count = count;

        if (WARN_ON(dev == WHITEOUT_DEV))
                error = -EBUSY;
        else
                error = kobj_map(cdev_map, dev, count, NULL,
                                 exact_match, exact_lock, p);

        if (error) {
                /* Give back the name so it is not left dangling on @p. */
                kfree_const(p->kobj.name);
                p->kobj.name = NULL;
                return error;
        }

        kobject_get(p->kobj.parent);

        return 0;
}

/**
 * cdev_set_parent() - set the parent kobject for a char device
 * @p: the cdev structure
 * @kobj: the kobject to take a reference to
 *
 * cdev_set_parent() sets a parent kobject which will be referenced
 * appropriately so the parent is not freed before the cdev. This
 * should be called before cdev_add.
 *
 * This function only stores the pointer; the reference on the parent
 * is taken by cdev_add() and dropped by the cdev release callbacks.
 * A warning is emitted if @kobj has not been initialized.
 */
void cdev_set_parent(struct cdev *p, struct kobject *kobj)
{
        WARN_ON(!kobj->state_initialized);
        p->kobj.parent = kobj;
}

/**
 * cdev_device_add() - add a char device and its corresponding
 *        struct device, linking them
 * @dev: the device structure
 * @cdev: the cdev structure
 *
 * cdev_device_add() adds the char device represented by @cdev to the system,
 * just as cdev_add() does, and then adds @dev to the system using
 * device_add().  The dev_t for the char device is taken from the struct
 * device, which needs to be initialized first.  @dev's kobject is set as
 * the parent of @cdev, so the parent device will not get released until
 * all references to the cdev are released.
 *
 * If dev->devt is not set, the cdev is not added and this function is
 * equivalent to device_add().
 *
 * This function should be used whenever the struct cdev and the
 * struct device are members of the same structure whose lifetime is
 * managed by the struct device.
 *
 * NOTE: Callers must assume that userspace was able to open the cdev and
 * can call cdev fops callbacks at any time, even if this function fails.
 *
 * Returns zero on success or the error from cdev_add() or device_add().
 */
int cdev_device_add(struct cdev *cdev, struct device *dev)
{
        int rc;

        if (dev->devt) {
                cdev_set_parent(cdev, &dev->kobj);

                rc = cdev_add(cdev, dev->devt, 1);
                if (rc)
                        return rc;
        }

        rc = device_add(dev);
        if (!rc)
                return 0;

        /* device_add() failed: take the cdev back out if we added one. */
        if (dev->devt)
                cdev_del(cdev);

        return rc;
}

/**
 * cdev_device_del() - inverse of cdev_device_add
 * @dev: the device structure
 * @cdev: the cdev structure
 *
 * cdev_device_del() is a helper function to call cdev_del and device_del.
 * It should be used whenever cdev_device_add is used.
 *
 * If dev->devt is not set it will not remove the cdev and will be equivalent
 * to device_del.
 *
 * NOTE: This guarantees that associated sysfs callbacks are not running
 * or runnable, however any cdevs already open will remain and their fops
 * will still be callable even after this function returns.
 */
void cdev_device_del(struct cdev *cdev, struct device *dev)
{
        /* Reverse order of cdev_device_add(): device first, then cdev. */
        device_del(dev);
        if (dev->devt)
                cdev_del(cdev);
}

/*
 * Remove the @count device numbers starting at @dev from cdev_map;
 * counterpart of the kobj_map() call made by cdev_add().
 */
static void cdev_unmap(dev_t dev, unsigned count)
{
        kobj_unmap(cdev_map, dev, count);
}

/**
 * cdev_del() - remove a cdev from the system
 * @p: the cdev structure to be removed
 *
 * cdev_del() removes @p from the system, possibly freeing the structure
 * itself.
 *
 * NOTE: This guarantees that cdev device will no longer be able to be
 * opened, however any cdevs already open will remain and their fops will
 * still be callable even after cdev_del returns.
 */
void cdev_del(struct cdev *p)
{
        /* Unmap the range recorded by cdev_add() in p->dev / p->count. */
        cdev_unmap(p->dev, p->count);
        /* Drop a kobject reference; @p must not be used after this. */
        kobject_put(&p->kobj);
}


/*
 * Release callback of ktype_cdev_default, the type installed by
 * cdev_init().  The cdev memory is not freed here: the inodes are
 * disconnected and the reference on the parent kobject is dropped.
 */
static void cdev_default_release(struct kobject *kobj)
{
        /* Read the parent up front, before any teardown is done. */
        struct kobject *parent = kobj->parent;

        cdev_purge(container_of(kobj, struct cdev, kobj));
        kobject_put(parent);
}

/*
 * Release callback of ktype_cdev_dynamic, the type installed by
 * cdev_alloc().  Disconnects the inodes, frees the cdev itself and
 * drops the reference on the parent kobject.
 */
static void cdev_dynamic_release(struct kobject *kobj)
{
        struct kobject *parent = kobj->parent;
        struct cdev *cdev = container_of(kobj, struct cdev, kobj);

        cdev_purge(cdev);
        kfree(cdev);

        /* @kobj was embedded in the cdev just freed; use the saved parent. */
        kobject_put(parent);
}

/*
 * kobject type for cdevs set up with cdev_init(): the release callback
 * does not free the cdev.
 */
static struct kobj_type ktype_cdev_default = {
        .release        = cdev_default_release,
};

/*
 * kobject type for cdevs obtained from cdev_alloc(): the release
 * callback also kfree()s the cdev.
 */
static struct kobj_type ktype_cdev_dynamic = {
        .release        = cdev_dynamic_release,
};

/**
 * cdev_alloc() - allocate a cdev structure
 *
 * Allocates and returns a cdev structure, or NULL on failure.
 */
struct cdev *cdev_alloc(void)
{
        struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
        if (p) {
                INIT_LIST_HEAD(&p->list);
                kobject_init(&p->kobj, &ktype_cdev_dynamic);
        }
        return p;
}

/**
 * cdev_init() - initialize a cdev structure
 * @cdev: the structure to initialize
 * @fops: the file_operations for this device
 *
 * Zeroes @cdev, initializes its inode list and its kobject (with the
 * default type, whose release callback does not free the structure)
 * and records @fops.  Afterwards @cdev is ready to be added to the
 * system with cdev_add().
 */
void cdev_init(struct cdev *cdev, const struct file_operations *fops)
{
        /* Start from a clean slate; any previous contents are discarded. */
        memset(cdev, 0, sizeof(*cdev));

        kobject_init(&cdev->kobj, &ktype_cdev_default);
        INIT_LIST_HEAD(&cdev->list);
        cdev->ops = fops;
}

/*
 * Default probe installed by chrdev_init(): ask for a module that can
 * handle @dev to be loaded.  It never supplies a kobject itself and
 * always returns NULL.  @part and @data are not used.
 */
static struct kobject *base_probe(dev_t dev, int *part, void *data)
{
        int rc = request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev));

        /* Make old-style 2.4 aliases work */
        if (rc > 0)
                request_module("char-major-%d", MAJOR(dev));

        return NULL;
}

/*
 * Boot-time setup: create cdev_map with base_probe() as its default
 * probe and chrdevs_lock as the lock handed to the map.
 */
void __init chrdev_init(void)
{
        cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
}


/* Let modules do char dev stuff */
EXPORT_SYMBOL(register_chrdev_region);
EXPORT_SYMBOL(unregister_chrdev_region);
EXPORT_SYMBOL(alloc_chrdev_region);
EXPORT_SYMBOL(cdev_init);
EXPORT_SYMBOL(cdev_alloc);
EXPORT_SYMBOL(cdev_del);
EXPORT_SYMBOL(cdev_add);
EXPORT_SYMBOL(cdev_set_parent);
EXPORT_SYMBOL(cdev_device_add);
EXPORT_SYMBOL(cdev_device_del);
EXPORT_SYMBOL(__register_chrdev);
EXPORT_SYMBOL(__unregister_chrdev);



































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_H
#define _ASM_X86_UACCESS_H
/*
 * User space memory access functions
 */
#include <linux/compiler.h>
#include <linux/instrumented.h>
#include <linux/kasan-checks.h>
#include <linux/mm_types.h>
#include <linux/string.h>
#include <linux/mmap_lock.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
#include <asm/extable.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_X86_32
# include <asm/uaccess_32.h>
#else
# include <asm/uaccess_64.h>
#endif

#include <asm-generic/access_ok.h>

/*
 * Out-of-line get_user helpers.  They are reached through the inline-asm
 * "call" in do_get_user_call(), which passes and receives values in
 * registers, so the (void) prototypes do not describe their real
 * calling convention.  The numeric suffix is the access size in bytes.
 *
 * __get_user_bad() is referenced from the default (unsupported size)
 * case of __get_user_size().
 */
extern int __get_user_1(void);
extern int __get_user_2(void);
extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_nocheck_1(void);
extern int __get_user_nocheck_2(void);
extern int __get_user_nocheck_4(void);
extern int __get_user_nocheck_8(void);
extern int __get_user_bad(void);

/*
 * Bracket a region of direct user-memory access with stac()/clac().
 * NOTE(review): stac()/clac() presumably toggle SMAP protection, see
 * <asm/smap.h> - confirm.
 *
 * The _nospec variant additionally issues barrier_nospec() after stac().
 */
#define __uaccess_begin() stac()
#define __uaccess_end()   clac()
#define __uaccess_begin_nospec()        \
({                                        \
        stac();                                \
        barrier_nospec();                \
})

/*
 * This is the smallest unsigned integer type that can fit a value
 * (up to 'long long')
 *
 * The candidates are tried in the order char, short, int, long; if
 * none is large enough the type of 0ULL is used.
 */
#define __inttype(x) __typeof__(                \
        __typefits(x,char,                        \
          __typefits(x,short,                        \
            __typefits(x,int,                        \
              __typefits(x,long,0ULL)))))

/*
 * Evaluates to (unsigned type)0 when sizeof(x) <= sizeof(type) and to
 * @not otherwise.  Only the type of the result is used by __inttype().
 */
#define __typefits(x,type,not) \
        __builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not)

/*
 * This is used for both get_user() and __get_user() to expand to
 * the proper special function call that has odd calling conventions
 * due to returning both a value and an error, and that depends on
 * the size of the pointer passed in.
 *
 * @fn is the helper family (get_user or get_user_nocheck); the call
 * target is __<fn>_<sizeof(*ptr)>.  @ptr is passed in the register that
 * also returns the error code ("=a" tied to "0"), the fetched value
 * comes back in the _ASM_DX register variable and is assigned to @x.
 * The macro evaluates to the error code, hinted as likely zero.
 *
 * Careful: we have to cast the result to the type of the pointer
 * for sign reasons.
 *
 * The use of _ASM_DX as the register specifier is a bit of a
 * simplification, as gcc only cares about it as the starting point
 * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
 * (%ecx being the next register in gcc's x86 register sequence), and
 * %rdx on 64 bits.
 *
 * Clang/LLVM cares about the size of the register, but still wants
 * the base register for something that ends up being a pair.
 */
#define do_get_user_call(fn,x,ptr)                                        \
({                                                                        \
        int __ret_gu;                                                        \
        register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX);                \
        __chk_user_ptr(ptr);                                                \
        asm volatile("call __" #fn "_%c[size]"                                \
                     : "=a" (__ret_gu), "=r" (__val_gu),                \
                        ASM_CALL_CONSTRAINT                                \
                     : "0" (ptr), [size] "i" (sizeof(*(ptr))));                \
        instrument_get_user(__val_gu);                                        \
        (x) = (__force __typeof__(*(ptr))) __val_gu;                        \
        __builtin_expect(__ret_gu, 0);                                        \
})

/**
 * get_user - Get a simple variable from user space.
 * @x:   Variable to store result.
 * @ptr: Source address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple variable from user space to kernel
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and the result of
 * dereferencing @ptr must be assignable to @x without a cast.
 *
 * Implemented as might_fault() followed by a call to the
 * __get_user_<size> helper via do_get_user_call().
 *
 * Return: zero on success, or -EFAULT on error.
 * On error, the variable @x is set to zero.
 */
#define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); })

/**
 * __get_user - Get a simple variable from user space, with less checking.
 * @x:   Variable to store result.
 * @ptr: Source address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple variable from user space to kernel
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and the result of
 * dereferencing @ptr must be assignable to @x without a cast.
 *
 * Caller must check the pointer with access_ok() before calling this
 * function.
 *
 * Implemented as a call to the __get_user_nocheck_<size> helper via
 * do_get_user_call(); unlike get_user() it does not call might_fault().
 *
 * Return: zero on success, or -EFAULT on error.
 * On error, the variable @x is set to zero.
 */
#define __get_user(x,ptr) do_get_user_call(get_user_nocheck,x,ptr)


/*
 * Store a 64-bit value to user memory, jumping to @label on a fault.
 *
 * On 32-bit the value is held in %edx:%eax (the "A" constraint) and
 * written with two 32-bit moves, each with its own exception table
 * entry pointing at @label.  On 64-bit a single quadword store is done
 * through __put_user_goto().
 */
#ifdef CONFIG_X86_32
#define __put_user_goto_u64(x, addr, label)                        \
        asm goto("\n"                                        \
                     "1:        movl %%eax,0(%1)\n"                \
                     "2:        movl %%edx,4(%1)\n"                \
                     _ASM_EXTABLE_UA(1b, %l2)                        \
                     _ASM_EXTABLE_UA(2b, %l2)                        \
                     : : "A" (x), "r" (addr)                        \
                     : : label)

#else
#define __put_user_goto_u64(x, ptr, label) \
        __put_user_goto(x, ptr, "q", "er", label)
#endif

/* Referenced from the default (unsupported size) case of __put_user_size(). */
extern void __put_user_bad(void);

/*
 * Strange magic calling convention: pointer in %ecx,
 * value in %eax(:%edx), return value in %ecx. clobbers %rbx
 *
 * These helpers are reached through the inline-asm "call" in
 * do_put_user_call(), so the (void) prototypes do not describe the real
 * calling convention.  The numeric suffix is the access size in bytes.
 */
extern void __put_user_1(void);
extern void __put_user_2(void);
extern void __put_user_4(void);
extern void __put_user_8(void);
extern void __put_user_nocheck_1(void);
extern void __put_user_nocheck_2(void);
extern void __put_user_nocheck_4(void);
extern void __put_user_nocheck_8(void);

/*
 * Common body of put_user() and __put_user(): calls the out-of-line
 * helper __<fn>_<sizeof(*ptr)> and evaluates to its return code, hinted
 * as likely zero.  @x and @ptr are each evaluated exactly once.
 *
 * The pointer goes in, and the return code comes back, in the "c"
 * register; the value is placed in the _ASM_AX register variable.
 *
 * ptr must be evaluated and assigned to the temporary __ptr_pu before
 * the assignment of x to __val_pu, to avoid any function calls
 * involved in the ptr expression (possibly implicitly generated due
 * to KASAN) from clobbering %ax.
 */
#define do_put_user_call(fn,x,ptr)                                        \
({                                                                        \
        int __ret_pu;                                                        \
        void __user *__ptr_pu;                                                \
        register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX);                \
        __typeof__(*(ptr)) __x = (x); /* eval x once */                        \
        __typeof__(ptr) __ptr = (ptr); /* eval ptr once */                \
        __chk_user_ptr(__ptr);                                                \
        __ptr_pu = __ptr;                                                \
        __val_pu = __x;                                                        \
        asm volatile("call __" #fn "_%c[size]"                                \
                     : "=c" (__ret_pu),                                        \
                        ASM_CALL_CONSTRAINT                                \
                     : "0" (__ptr_pu),                                        \
                       "r" (__val_pu),                                        \
                       [size] "i" (sizeof(*(ptr)))                        \
                     :"ebx");                                                \
        instrument_put_user(__x, __ptr, sizeof(*(ptr)));                \
        __builtin_expect(__ret_pu, 0);                                        \
})

/**
 * put_user - Write a simple value into user space.
 * @x:   Value to copy to user space.
 * @ptr: Destination address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple value from kernel space to user
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
 * to the result of dereferencing @ptr.
 *
 * Implemented as might_fault() followed by a call to the
 * __put_user_<size> helper via do_put_user_call().
 *
 * Return: zero on success, or -EFAULT on error.
 */
#define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); })

/**
 * __put_user - Write a simple value into user space, with less checking.
 * @x:   Value to copy to user space.
 * @ptr: Destination address, in user space.
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * This macro copies a single simple value from kernel space to user
 * space.  It supports simple types like char and int, but not larger
 * data types like structures or arrays.
 *
 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
 * to the result of dereferencing @ptr.
 *
 * Caller must check the pointer with access_ok() before calling this
 * function.
 *
 * Implemented as a call to the __put_user_nocheck_<size> helper via
 * do_put_user_call(); unlike put_user() it does not call might_fault().
 *
 * Return: zero on success, or -EFAULT on error.
 */
#define __put_user(x, ptr) do_put_user_call(put_user_nocheck,x,ptr)

/*
 * Store @x to the user address @ptr with an inline store chosen by
 * @size (1, 2, 4 or 8 bytes); any other size ends up in
 * __put_user_bad().  @x and @ptr are evaluated once each, and the
 * access is reported to instrument_put_user() after the store.
 *
 * @label is handed to the store macros as the fault target (the 32-bit
 * __put_user_goto_u64() wires it into its exception table entries).
 * NOTE(review): __put_user_goto() is defined outside this view and
 * presumably does the same - confirm.
 */
#define __put_user_size(x, ptr, size, label)                                \
do {                                                                        \
        __typeof__(*(ptr)) __x = (x); /* eval x once */                        \
        __typeof__(ptr) __ptr = (ptr); /* eval ptr once */                \
        __chk_user_ptr(__ptr);                                                \
        switch (size) {                                                        \
        case 1:                                                                \
                __put_user_goto(__x, __ptr, "b", "iq", label);                \
                break;                                                        \
        case 2:                                                                \
                __put_user_goto(__x, __ptr, "w", "ir", label);                \
                break;                                                        \
        case 4:                                                                \
                __put_user_goto(__x, __ptr, "l", "ir", label);                \
                break;                                                        \
        case 8:                                                                \
                __put_user_goto_u64(__x, __ptr, label);                        \
                break;                                                        \
        default:                                                        \
                __put_user_bad();                                        \
        }                                                                \
        instrument_put_user(__x, __ptr, size);                                \
} while (0)

#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * Fetch a 64-bit value from user memory into @x, jumping to @label on
 * a fault (asm-goto-with-outputs variant).
 *
 * On 32-bit the value is read as two 32-bit halves, the low one at
 * @ptr and the high one at the following 32-bit word, and then
 * combined.  On 64-bit a single quadword load is used.
 */
#ifdef CONFIG_X86_32
#define __get_user_asm_u64(x, ptr, label) do {                                \
        unsigned int __gu_low, __gu_high;                                \
        const unsigned int __user *__gu_ptr;                                \
        __gu_ptr = (const void __user *)(ptr);                                \
        __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label);                \
        __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label);        \
        (x) = ((unsigned long long)__gu_high << 32) | __gu_low;                \
} while (0)
#else
#define __get_user_asm_u64(x, ptr, label)                                \
        __get_user_asm(x, ptr, "q", "=r", label)
#endif

/*
 * Load a value of @size bytes (1, 2, 4 or 8) from the user address @ptr
 * into @x, with @label as the fault target passed to the load macros.
 * Any other size assigns the result of __get_user_bad() to @x.  The
 * loaded value is reported to instrument_get_user() afterwards.
 *
 * The 1-byte case goes through an unsigned char temporary with the "=q"
 * constraint instead of loading into @x directly.
 */
#define __get_user_size(x, ptr, size, label)                                \
do {                                                                        \
        __chk_user_ptr(ptr);                                                \
        switch (size) {                                                        \
        case 1:        {                                                        \
                unsigned char x_u8__;                                        \
                __get_user_asm(x_u8__, ptr, "b", "=q", label);                \
                (x) = x_u8__;                                                \
                break;                                                        \
        }                                                                \
        case 2:                                                                \
                __get_user_asm(x, ptr, "w", "=r", label);                \
                break;                                                        \
        case 4:                                                                \
                __get_user_asm(x, ptr, "l", "=r", label);                \
                break;                                                        \
        case 8:                                                                \
                __get_user_asm_u64(x, ptr, label);                        \
                break;                                                        \
        default:                                                        \
                (x) = __get_user_bad();                                        \
        }                                                                \
        instrument_get_user(x);                                                \
} while (0)

/*
 * __get_user_asm - emit one "mov<itype>" load from @addr into @x.
 * @x:     lvalue receiving the loaded value
 * @addr:  source address; wrapped in __m() to form the "m" operand
 * @itype: instruction suffix string pasted after "mov"
 * @ltype: output constraint string applied to @x
 * @label: C label listed as the asm goto target
 *
 * _ASM_EXTABLE_UA() ties the instruction at local label 1 to %l2, i.e.
 * operand 2, which is @label ([output] is operand 0, [umem] operand 1).
 */
#define __get_user_asm(x, addr, itype, ltype, label)                        \
        asm_goto_output("\n"                                                \
                     "1:        mov"itype" %[umem],%[output]\n"                \
                     _ASM_EXTABLE_UA(1b, %l2)                                \
                     : [output] ltype(x)                                \
                     : [umem] "m" (__m(addr))                                \
                     : : label)

#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT

#ifdef CONFIG_X86_32
/*
 * 32-bit variant that reports faults through @retval rather than a label.
 *
 * @ptr is evaluated once into __ptr.  The 64-bit value is read as two
 * 32-bit loads: the low half into %eax and the high half (one u32 past
 * __ptr) into %edx, delivered to @x through the "=&A" output.  Each load
 * has an exception-table entry of type EX_TYPE_EFAULT_REG with
 * EX_FLAG_CLEAR_AX_DX that resumes at local label 3 and names @retval as
 * the error register.  @retval is also tied in as input "0", so the asm
 * itself leaves it unchanged when neither load faults.
 *
 * NOTE(review): the fixup handler is not visible here -- presumably it
 * stores -EFAULT in @retval and zeroes EAX/EDX; confirm against the
 * extable implementation.
 */
#define __get_user_asm_u64(x, ptr, retval)                                \
({                                                                        \
        __typeof__(ptr) __ptr = (ptr);                                        \
        asm volatile("\n"                                                \
                     "1:        movl %[lowbits],%%eax\n"                \
                     "2:        movl %[highbits],%%edx\n"                \
                     "3:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG |        \
                                           EX_FLAG_CLEAR_AX_DX,                \
                                           %[errout])                        \
                     _ASM_EXTABLE_TYPE_REG(2b, 3b, EX_TYPE_EFAULT_REG |        \
                                           EX_FLAG_CLEAR_AX_DX,                \
                                           %[errout])                        \
                     : [errout] "=r" (retval),                                \
                       [output] "=&A"(x)                                \
                     : [lowbits] "m" (__m(__ptr)),                        \
                       [highbits] "m" __m(((u32 __user *)(__ptr)) + 1),        \
                       "0" (retval));                                        \
})

#else
/*
 * Non-32-bit branch: a 64-bit fetch is one __get_user_asm() load with the
 * "q" suffix; faults are reported through @retval.
 */
#define __get_user_asm_u64(x, ptr, retval) \
         __get_user_asm(x, ptr, retval, "q")
#endif

/*
 * __get_user_size - fetch a @size-byte value from @ptr into @x, reporting
 * faults through @retval instead of a label.
 * @x:      lvalue receiving the value
 * @ptr:    source pointer; handed to __chk_user_ptr() before the access
 * @size:   1, 2, 4 or 8; selects which load is emitted
 * @retval: lvalue set to 0 up front and passed to the asm helper as the
 *          error register
 *
 * The 1-byte case goes through the local x_u8__ and is then assigned to
 * @x.  Any other @size assigns the result of __get_user_bad() to @x and
 * leaves @retval at 0.  Unlike the asm-goto flavour of this macro, this
 * one does not call instrument_get_user().
 */
#define __get_user_size(x, ptr, size, retval)                                \
do {                                                                        \
        unsigned char x_u8__;                                                \
                                                                        \
        retval = 0;                                                        \
        __chk_user_ptr(ptr);                                                \
        switch (size) {                                                        \
        case 1:                                                                \
                __get_user_asm(x_u8__, ptr, retval, "b");                \
                (x) = x_u8__;                                                \
                break;                                                        \
        case 2:                                                                \
                __get_user_asm(x, ptr, retval, "w");                        \
                break;                                                        \
        case 4:                                                                \
                __get_user_asm(x, ptr, retval, "l");                        \
                break;                                                        \
        case 8:                                                                \
                __get_user_asm_u64(x, ptr, retval);                        \
                break;                                                        \
        default:                                                        \
                (x) = __get_user_bad();                                        \
        }                                                                \
} while (0)

/*
 * __get_user_asm - emit one "mov<itype>" load from @addr, error via @err.
 * @x:     lvalue receiving the value; produced in the "a" register
 * @addr:  source address; wrapped in __m() to form the "m" operand
 * @err:   lvalue used as the error register of the fixup entry
 * @itype: instruction suffix string pasted after "mov"
 *
 * A fault on the load is covered by an EX_TYPE_EFAULT_REG |
 * EX_FLAG_CLEAR_AX exception-table entry that resumes at local label 2.
 * @err is tied in as input "0", so the asm itself leaves it unchanged
 * when the load does not fault.
 *
 * NOTE(review): presumably the handler writes -EFAULT to @err and clears
 * EAX -- confirm against the extable implementation.
 */
#define __get_user_asm(x, addr, err, itype)                                \
        asm volatile("\n"                                                \
                     "1:        mov"itype" %[umem],%[output]\n"                \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG | \
                                           EX_FLAG_CLEAR_AX,                \
                                           %[errout])                        \
                     : [errout] "=r" (err),                                \
                       [output] "=a" (x)                                \
                     : [umem] "m" (__m(addr)),                                \
                       "0" (err))

#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
/*
 * __try_cmpxchg_user_asm - cmpxchg (with LOCK_PREFIX) on *@_ptr, jumping
 * to @label if the instruction faults.
 * @itype: instruction suffix string pasted after "cmpxchg"
 * @ltype: input constraint string for the new value
 * @_ptr:  pointer to the target location
 * @_pold: pointer to the expected value; cast to the type of @_ptr
 * @_new:  replacement value
 * @label: C label used as the asm goto / exception-table target
 *
 * The expected value is loaded from *@_pold into the "a" register
 * ("+a").  The Z flag is captured into 'success' through CC_OUT(z).  When
 * the exchange did not succeed, the value left in the "a" register is
 * written back to *@_pold.  The statement expression evaluates to
 * likely(success).
 */
#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)        ({ \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm_goto_output("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
                     _ASM_EXTABLE_UA(1b, %l[label])                        \
                     : CC_OUT(z) (success),                                \
                       [ptr] "+m" (*_ptr),                                \
                       [old] "+a" (__old)                                \
                     : [new] ltype (__new)                                \
                     : "memory"                                                \
                     : label);                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })

#ifdef CONFIG_X86_32
/*
 * __try_cmpxchg64_user_asm - 32-bit build: 64-bit compare-and-exchange on
 * *@_ptr using cmpxchg8b (with LOCK_PREFIX), jumping to @label on a fault.
 * @_ptr:  pointer to the target location
 * @_pold: pointer to the expected value; cast to the type of @_ptr
 * @_new:  replacement value
 * @label: C label used as the asm goto / exception-table target
 *
 * The expected value travels in the "A" register pair ("+A"); the new
 * value is split into its low 32 bits ("b") and high 32 bits ("c").  The
 * Z flag is captured into 'success' through CC_OUT(z).  When the exchange
 * did not succeed, the value left in the "A" pair is written back to
 * *@_pold.  The statement expression evaluates to likely(success).
 */
#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)        ({        \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm_goto_output("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"                \
                     _ASM_EXTABLE_UA(1b, %l[label])                        \
                     : CC_OUT(z) (success),                                \
                       "+A" (__old),                                        \
                       [ptr] "+m" (*_ptr)                                \
                     : "b" ((u32)__new),                                \
                       "c" ((u32)((u64)__new >> 32))                        \
                     : "memory"                                                \
                     : label);                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })
#endif // CONFIG_X86_32
#else  // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
/*
 * __try_cmpxchg_user_asm - flavour for compilers without tied asm-goto
 * outputs: cmpxchg (with LOCK_PREFIX) on *@_ptr, with faults reported
 * through a local error register instead of a direct jump.
 * @itype: instruction suffix string pasted after "cmpxchg"
 * @ltype: input constraint string for the new value
 * @_ptr:  pointer to the target location
 * @_pold: pointer to the expected value; cast to the type of @_ptr
 * @_new:  replacement value
 * @label: C label jumped to when the error register is non-zero
 *
 * __err starts at 0 and is named as the register of an
 * EX_TYPE_EFAULT_REG exception-table entry that resumes at local label 2;
 * a non-zero __err after the asm triggers "goto label".  Otherwise the Z
 * flag (CC_SET(z)/CC_OUT(z)) decides 'success'; on a failed compare the
 * value left in the "a" register is written back to *@_pold.  The
 * statement expression evaluates to likely(success).
 */
#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)        ({ \
        int __err = 0;                                                        \
        bool success;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm volatile("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
                     CC_SET(z)                                                \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG,        \
                                           %[errout])                        \
                     : CC_OUT(z) (success),                                \
                       [errout] "+r" (__err),                                \
                       [ptr] "+m" (*_ptr),                                \
                       [old] "+a" (__old)                                \
                     : [new] ltype (__new)                                \
                     : "memory");                                        \
        if (unlikely(__err))                                                \
                goto label;                                                \
        if (unlikely(!success))                                                \
                *_old = __old;                                                \
        likely(success);                                        })

#ifdef CONFIG_X86_32
/*
 * Unlike the normal CMPXCHG, use output GPR for both success/fail and error.
 * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
 * hardcoded by CMPXCHG8B, leaving only ESI and EDI.  If the compiler uses
 * both ESI and EDI for the memory operand, compilation will fail if the error
 * is an input+output as there will be no register available for input.
 */
/*
 * Encoding of __result after the asm:
 *   - the "mov $0" / "setz" pair following cmpxchg8b leaves 1 when the Z
 *     flag was set (exchange succeeded) and 0 otherwise; mov does not
 *     modify flags, so setz still observes the cmpxchg8b outcome;
 *   - on a fault, execution resumes at local label 2 via the
 *     EX_TYPE_EFAULT_REG entry with __result as its register; the code
 *     below treats a negative __result as the fault indication and jumps
 *     to @label.
 * When __result is 0 the value left in the "A" register pair is written
 * back to *@_pold.  The statement expression evaluates to
 * likely(__result).
 */
#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)        ({        \
        int __result;                                                        \
        __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);                \
        __typeof__(*(_ptr)) __old = *_old;                                \
        __typeof__(*(_ptr)) __new = (_new);                                \
        asm volatile("\n"                                                \
                     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"                \
                     "mov $0, %[result]\n\t"                                \
                     "setz %b[result]\n"                                \
                     "2:\n"                                                \
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG,        \
                                           %[result])                        \
                     : [result] "=q" (__result),                        \
                       "+A" (__old),                                        \
                       [ptr] "+m" (*_ptr)                                \
                     : "b" ((u32)__new),                                \
                       "c" ((u32)((u64)__new >> 32))                        \
                     : "memory", "cc");                                        \
        if (unlikely(__result < 0))                                        \
                goto label;                                                \
        if (unlikely(!__result))                                        \
                *_old = __old;                                                \
        likely(__result);                                        })
#endif // CONFIG_X86_32
#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT

/* FIXME: this hack is definitely wrong -AK */
/*
 * __m(x) reinterprets the address @x as a pointer to a struct of 100
 * unsigned longs and dereferences it, yielding an lvalue that the
 * accessors in this file pass as an "m" asm operand.
 */
struct __large_struct { unsigned long buf[100]; };
#define __m(x) (*(struct __large_struct __user *)(x))

/*
 * Tell gcc we read from memory instead of writing: this is because
 * we do not write to any memory gcc knows about, so there are no
 * aliasing issues.
 *
 * __put_user_goto - emit one "mov<itype>" store of @x to @addr.
 * @x:     value to store (operand 0, constraint @ltype)
 * @addr:  destination address; wrapped in __m() and passed as an *input*
 *         "m" operand (operand 1), per the note above
 * @itype: instruction suffix string pasted after "mov"
 * @ltype: constraint string applied to @x
 * @label: C label; _ASM_EXTABLE_UA() ties the store at local label 1 to
 *         %l2, which is this goto target
 */
#define __put_user_goto(x, addr, itype, ltype, label)                        \
        asm goto("\n"                                                        \
                "1:        mov"itype" %0,%1\n"                                \
                _ASM_EXTABLE_UA(1b, %l2)                                \
                : : ltype(x), "m" (__m(addr))                                \
                : : label)

/*
 * Out-of-line helpers; only the prototypes are given here.
 * NOTE(review): the return-value conventions (bytes left uncopied, string
 * length, negative errno) are not visible from these declarations --
 * confirm against the implementations before relying on them.
 */
extern unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
extern __must_check long
strncpy_from_user(char *dst, const char __user *src, long count);

extern __must_check long strnlen_user(const char __user *str, long n);

/*
 * Prototypes for the copy_mc_*() helpers, only declared when
 * CONFIG_ARCH_HAS_COPY_MC is set.  Callers must consume the result
 * (__must_check).
 * NOTE(review): the self-referential #define of copy_mc_to_kernel
 * presumably lets other code detect this arch override with #ifdef --
 * confirm against the generic header.
 */
#ifdef CONFIG_ARCH_HAS_COPY_MC
unsigned long __must_check
copy_mc_to_kernel(void *to, const void *from, unsigned len);
#define copy_mc_to_kernel copy_mc_to_kernel

unsigned long __must_check
copy_mc_to_user(void __user *to, const void *from, unsigned len);
#endif

/*
 * movsl can be slow when source and dest are not both 8-byte aligned
 *
 * With CONFIG_X86_INTEL_USERCOPY, a cacheline-aligned global holding a
 * single integer mask is declared here; it is defined elsewhere.
 * NOTE(review): how the mask is applied to addresses is not visible in
 * this header -- see the users of movsl_mask.
 */
#ifdef CONFIG_X86_INTEL_USERCOPY
extern struct movsl_mask {
        int mask;
} ____cacheline_aligned_in_smp movsl_mask;
#endif

/*
 * Feature-test macro.
 * NOTE(review): presumably advertises arch-provided nocache user-copy
 * helpers to generic code -- the consumers are outside this header.
 */
#define ARCH_HAS_NOCACHE_UACCESS 1

/*
 * Despite the name, the "unsafe" user accessors are not inherently
 * dangerous; the name is a loud reminder of the caller's obligations:
 * the range must have passed access_ok(), and every access must sit
 * between a user_access_begin()/user_access_end() pair.
 *
 * user_access_begin() returns false, without calling
 * __uaccess_begin_nospec(), when access_ok(@ptr, @len) fails; otherwise
 * it calls __uaccess_begin_nospec() and returns true.
 */
static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
{
        const bool range_ok = access_ok(ptr, len);

        /* Reject the range before any access window is opened. */
        if (unlikely(!range_ok))
                return false;

        __uaccess_begin_nospec();
        return true;
}
/*
 * Self-referential define of user_access_begin.
 * NOTE(review): presumably present so generic code can detect the arch
 * implementation with #ifdef -- confirm against the generic header.
 */
#define user_access_begin(a,b)        user_access_begin(a,b)
/* Closes the window opened by user_access_begin(): expands to __uaccess_end(). */
#define user_access_end()        __uaccess_end()

/* Thin aliases for smap_save() / smap_restore(). */
#define user_access_save()        smap_save()
#define user_access_restore(x)        smap_restore(x)

/*
 * unsafe_put_user - store @x through @ptr, branching to @label on a fault.
 * @x is first cast to the pointee type of @ptr; the access size handed to
 * __put_user_size() is sizeof(*(ptr)).
 */
#define unsafe_put_user(x, ptr, label)        \
        __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label)

#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/*
 * unsafe_get_user - load *@ptr into @x, branching to @err_label on a fault
 * (asm-goto-output flavour).
 *
 * The value is fetched into a temporary of the integer type
 * __inttype(*(ptr)) and then force-cast to the pointee type of @ptr
 * before being assigned to @x; @err_label is handed straight to
 * __get_user_size().
 */
#define unsafe_get_user(x, ptr, err_label)                                        \
do {                                                                                \
        __inttype(*(ptr)) __gu_val;                                                \
        __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label);                \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                                \
} while (0)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/*
 * unsafe_get_user - load *@ptr into @x, branching to @err_label on a fault
 * (flavour without asm-goto outputs).
 *
 * The fault status comes back in the local __gu_err.  Note the ordering:
 * @x is assigned from the temporary *before* __gu_err is tested, so @x is
 * written even when the function then jumps to @err_label.
 */
#define unsafe_get_user(x, ptr, err_label)                                        \
do {                                                                                \
        int __gu_err;                                                                \
        __inttype(*(ptr)) __gu_val;                                                \
        __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err);                \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                                \
        if (unlikely(__gu_err)) goto err_label;                                        \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * Called only from the default (unsupported size) branch of
 * unsafe_try_cmpxchg_user().
 * NOTE(review): presumably left without a definition on purpose so that
 * an unsupported operand size fails at link time -- confirm.
 */
extern void __try_cmpxchg_user_wrong_size(void);

#ifndef CONFIG_X86_32
/*
 * When CONFIG_X86_32 is not set, the 64-bit compare-and-exchange is the
 * generic helper with the "q" suffix and an "r" constraint for the new
 * value.
 */
#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label)                \
        __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label)
#endif

/*
 * Force the pointer to u<size> to match the size expected by the asm helper.
 * clang/LLVM compiles all cases and only discards the unused paths after
 * processing errors, which breaks i386 if the pointer is an 8-byte value.
 *
 * unsafe_try_cmpxchg_user - size-dispatched compare-and-exchange on @_ptr.
 * @_ptr:   target pointer; checked with __chk_user_ptr()
 * @_oldp:  pointer to the expected value (updated by the asm helper when
 *          the comparison fails)
 * @_nval:  replacement value
 * @_label: fault target forwarded to the asm helper
 *
 * Dispatches on sizeof(*(_ptr)): 1, 2 and 4 bytes use
 * __try_cmpxchg_user_asm() with the "b"/"w"/"l" suffix (the byte case
 * with the "q" constraint), 8 bytes uses __try_cmpxchg64_user_asm().
 * Any other size calls __try_cmpxchg_user_wrong_size() and leaves __ret
 * unassigned.  Evaluates to the bool produced by the selected helper.
 */
#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({                        \
        bool __ret;                                                                \
        __chk_user_ptr(_ptr);                                                        \
        switch (sizeof(*(_ptr))) {                                                \
        case 1:        __ret = __try_cmpxchg_user_asm("b", "q",                        \
                                               (__force u8 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 2:        __ret = __try_cmpxchg_user_asm("w", "r",                        \
                                               (__force u16 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 4:        __ret = __try_cmpxchg_user_asm("l", "r",                        \
                                               (__force u32 *)(_ptr), (_oldp),        \
                                               (_nval), _label);                \
                break;                                                                \
        case 8:        __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\
                                                 (_nval), _label);                \
                break;                                                                \
        default: __try_cmpxchg_user_wrong_size();                                \
        }                                                                        \
        __ret;                                                })

/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */
#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label)        ({                \
        int __ret = -EFAULT;                                                \
        __uaccess_begin_nospec();                                        \
        __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label);        \
_label:                                                                        \
        __uaccess_end();                                                \
        __ret;                                                                \
                                                        })

/*
 * We want the unsafe accessors to always be inlined and use
 * the error labels - thus the macro games.
 *
 * unsafe_copy_loop - copy as many whole @type-sized units as fit in @len.
 * @dst, @src and @len are modified in place (advanced / decremented by
 * sizeof(type) per iteration), so they must be lvalues.  Each unit is
 * written with unsafe_put_user(), which branches to @label on a fault.
 */
#define unsafe_copy_loop(dst, src, len, type, label)                                \
        while (len >= sizeof(type)) {                                                \
                unsafe_put_user(*(type *)(src),(type __user *)(dst),label);        \
                dst += sizeof(type);                                                \
                src += sizeof(type);                                                \
                len -= sizeof(type);                                                \
        }

/*
 * unsafe_copy_to_user - copy @_len bytes from @_src to @_dst, branching to
 * @label on a fault.
 *
 * The arguments are captured once into locals, then the data is moved in
 * progressively smaller units (u64, u32, u16, u8); each
 * unsafe_copy_loop() pass consumes what the previous one left over.
 */
#define unsafe_copy_to_user(_dst,_src,_len,label)                        \
do {                                                                        \
        char __user *__ucu_dst = (_dst);                                \
        const char *__ucu_src = (_src);                                        \
        size_t __ucu_len = (_len);                                        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label);        \
} while (0)

#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/*
 * __get_kernel_nofault - read one @type from @src into *@dst, branching to
 * @err_label on a fault (asm-goto-output flavour).
 * Reuses __get_user_size(); @src is force-cast to a __user pointer of
 * @type for that purpose.
 */
#define __get_kernel_nofault(dst, src, type, err_label)                        \
        __get_user_size(*((type *)(dst)), (__force type __user *)(src),        \
                        sizeof(type), err_label)
#else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/*
 * __get_kernel_nofault - read one @type from @src into *@dst, branching to
 * @err_label on a fault (flavour without asm-goto outputs).
 * The fault status is collected in the local __kr_err by
 * __get_user_size() and converted into a jump afterwards.
 */
#define __get_kernel_nofault(dst, src, type, err_label)                        \
do {                                                                        \
        int __kr_err;                                                        \
                                                                        \
        __get_user_size(*((type *)(dst)), (__force type __user *)(src),        \
                        sizeof(type), __kr_err);                        \
        if (unlikely(__kr_err))                                                \
                goto err_label;                                                \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT

/*
 * __put_kernel_nofault - write the @type value at *@src to @dst, branching
 * to @err_label on a fault.
 * Reuses __put_user_size(); @dst is force-cast to a __user pointer of
 * @type for that purpose.
 */
#define __put_kernel_nofault(dst, src, type, err_label)                        \
        __put_user_size(*((type *)(src)), (__force type __user *)(dst),        \
                        sizeof(type), err_label)

#endif /* _ASM_X86_UACCESS_H */









































































































    1 






















    1 
















    1 








    1 

















    1 












    1 

































    1 
    1 




































    1 
    1 





















    4 












    3 
    4 






    2 






    1 
    1 











    4 









    3 








    1 
























































































































































    2 















    1 



    2 





    1 
    1 


    2 
    1 
    2 
    2 
















































































































































































    1 



    1 




    2 























    1 

    1 














    1 


    1 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
// SPDX-License-Identifier: GPL-2.0-or-later
/* Provide a way to create a superblock configuration context within the kernel
 * that allows a superblock to be set up prior to mounting.
 *
 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/security.h>
#include <linux/mnt_namespace.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <net/net_namespace.h>
#include <asm/sections.h>
#include "mount.h"
#include "internal.h"

/*
 * Tag stored in legacy_fs_context::param_type.
 * NOTE(review): judging by the names, this records whether no parameters,
 * one monolithic blob, or individual parameters have been supplied --
 * confirm against the legacy_* handlers that use it.
 */
enum legacy_fs_param {
        LEGACY_FS_UNSET_PARAMS          = 0,
        LEGACY_FS_MONOLITHIC_PARAMS     = 1,
        LEGACY_FS_INDIVIDUAL_PARAMS     = 2,
};

/*
 * Per-context state for legacy filesystems: a data buffer, a size and a
 * tag saying which parameter style is in use.
 * NOTE(review): presumably data_size is the number of bytes currently
 * used in legacy_data -- confirm against the code that fills it.
 */
struct legacy_fs_context {
        char                        *legacy_data;        /* Data page for legacy filesystems */
        size_t                        data_size;
        enum legacy_fs_param        param_type;
};

/* Forward declaration; being static, the definition lives later in this file. */
static int legacy_init_fs_context(struct fs_context *fc);

/*
 * Option names that vfs_parse_sb_flag() translates into an SB_* bit to be
 * *set* in fc->sb_flags.  The empty entry terminates the table.
 */
static const struct constant_table common_set_sb_flag[] = {
        { "dirsync",        SB_DIRSYNC },
        { "lazytime",        SB_LAZYTIME },
        { "mand",        SB_MANDLOCK },
        { "ro",                SB_RDONLY },
        { "sync",        SB_SYNCHRONOUS },
        { },
};

/*
 * Option names that vfs_parse_sb_flag() translates into an SB_* bit to be
 * *cleared* in fc->sb_flags.  The empty entry terminates the table.
 */
static const struct constant_table common_clear_sb_flag[] = {
        { "async",        SB_SYNCHRONOUS },
        { "nolazytime",        SB_LAZYTIME },
        { "nomand",        SB_MANDLOCK },
        { "rw",                SB_RDONLY },
        { },
};

/*
 * Handle the generic mount options that only toggle bits in s_flags.
 *
 * @key is looked up in the "set" table first and, failing that, in the
 * "clear" table.  A hit updates fc->sb_flags accordingly, records the bit
 * in fc->sb_flags_mask and returns 0.  -ENOPARAM means @key is not one of
 * these options.
 */
static int vfs_parse_sb_flag(struct fs_context *fc, const char *key)
{
        unsigned int flag = lookup_constant(common_set_sb_flag, key, 0);

        if (flag != 0) {
                fc->sb_flags |= flag;
                fc->sb_flags_mask |= flag;
                return 0;
        }

        flag = lookup_constant(common_clear_sb_flag, key, 0);
        if (flag == 0)
                return -ENOPARAM;

        fc->sb_flags &= ~flag;
        fc->sb_flags_mask |= flag;
        return 0;
}

/**
 * vfs_parse_fs_param_source - Handle setting "source" via parameter
 * @fc: The filesystem context to modify
 * @param: The parameter
 *
 * This is a simple helper for filesystems to verify that the "source" they
 * accept is sane.
 *
 * Returns 0 on success, -ENOPARAM if this is not  "source" parameter, and
 * -EINVAL otherwise. In the event of failure, supplementary error information
 *  is logged.
 */
int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param)
{
        if (strcmp(param->key, "source") != 0)
                return -ENOPARAM;

        if (param->type != fs_value_is_string)
                return invalf(fc, "Non-string source");

        if (fc->source)
                return invalf(fc, "Multiple sources");

        fc->source = param->string;
        param->string = NULL;
        return 0;
}
EXPORT_SYMBOL(vfs_parse_fs_param_source);

/**
 * vfs_parse_fs_param - Add a single parameter to a superblock config
 * @fc: The filesystem context to modify
 * @param: The parameter
 *
 * A single mount option in string form is applied to the filesystem context
 * being set up.  Certain standard options (for example "ro") are translated
 * into flag bits without going to the filesystem.  The active security module
 * is allowed to observe and poach options.  Any other options are passed over
 * to the filesystem to parse.
 *
 * This may be called multiple times for a context.
 *
 * Returns 0 on success and a negative error code on failure.  In the event of
 * failure, supplementary error information may have been set.
 */
int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
{
        int ret;

        if (!param->key)
                return invalf(fc, "Unnamed parameter\n");

        ret = vfs_parse_sb_flag(fc, param->key);
        if (ret != -ENOPARAM)
                return ret;

        ret = security_fs_context_parse_param(fc, param);
        if (ret != -ENOPARAM)
                /* Param belongs to the LSM or is disallowed by the LSM; so
                 * don't pass to the FS.
                 */
                return ret;

        if (fc->ops->parse_param) {
                ret = fc->ops->parse_param(fc, param);
                if (ret != -ENOPARAM)
                        return ret;
        }

        /* If the filesystem doesn't take any arguments, give it the
         * default handling of source.
         */
        ret = vfs_parse_fs_param_source(fc, param);
        if (ret != -ENOPARAM)
                return ret;

        return invalf(fc, "%s: Unknown parameter '%s'",
                      fc->fs_type->name, param->key);
}
EXPORT_SYMBOL(vfs_parse_fs_param);

/**
 * vfs_parse_fs_string - Convenience function to just parse a string.
 * @fc: Filesystem context.
 * @key: Parameter name.
 * @value: Default value.
 * @v_size: Maximum number of bytes in the value.
 *
 * Wraps @key/@value in a temporary fs_parameter and feeds it to
 * vfs_parse_fs_param().  With a NULL @value the parameter is passed as a
 * flag; otherwise a NUL-terminated copy of @value is made and passed as a
 * string.  Whatever is left in the parameter's string slot afterwards is
 * freed here.
 *
 * Returns -ENOMEM if the copy cannot be allocated, else the result of
 * vfs_parse_fs_param().
 */
int vfs_parse_fs_string(struct fs_context *fc, const char *key,
                        const char *value, size_t v_size)
{
        struct fs_parameter param = {
                .key        = key,
                .type        = value ? fs_value_is_string : fs_value_is_flag,
                .size        = v_size,
        };
        int ret;

        if (value) {
                param.string = kmemdup_nul(value, v_size, GFP_KERNEL);
                if (param.string == NULL)
                        return -ENOMEM;
        }

        ret = vfs_parse_fs_param(fc, &param);

        /* A handler that kept the string has cleared the pointer already. */
        kfree(param.string);
        return ret;
}
EXPORT_SYMBOL(vfs_parse_fs_string);

/**
 * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data
 * @fc: The superblock configuration to fill in.
 * @data: The data to parse
 * @sep: callback for separating next option
 *
 * Parse a blob of data that's in key[=val][,key[=val]]* form, using @sep
 * to carve off one option at a time.  The blob is first offered to
 * security_sb_eat_lsm_opts().  Empty options, and options that start with
 * '=', are skipped; every other option is split at its first '=' (the
 * blob is modified in place) and handed to vfs_parse_fs_string().
 *
 * Returns 0 if @data is NULL or nothing failed, a non-zero result from
 * security_sb_eat_lsm_opts(), or the first negative result from
 * vfs_parse_fs_string(), at which point parsing stops.
 */
int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
                             char *(*sep)(char **))
{
        char *cursor = data;
        char *key;
        int ret;

        if (cursor == NULL)
                return 0;

        ret = security_sb_eat_lsm_opts(cursor, &fc->security);
        if (ret != 0)
                return ret;

        for (key = sep(&cursor); key != NULL; key = sep(&cursor)) {
                size_t v_len = 0;
                char *value;

                /* Nothing between two separators. */
                if (*key == '\0')
                        continue;

                value = strchr(key, '=');
                if (value != NULL) {
                        /* "=val" has no key at all: ignore it. */
                        if (value == key)
                                continue;
                        *value++ = 0;
                        v_len = strlen(value);
                }

                ret = vfs_parse_fs_string(fc, key, value, v_len);
                if (ret < 0)
                        break;
        }

        return ret;
}
EXPORT_SYMBOL(vfs_parse_monolithic_sep);

/*
 * Default option separator: return the next comma-delimited token from *s
 * and advance *s past it (strsep() semantics, so the string is modified).
 */
static char *vfs_parse_comma_sep(char **s)
{
        static const char delim[] = ",";
        char *token = strsep(s, delim);

        return token;
}

/**
 * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
 * @fc: The superblock configuration to fill in.
 * @data: The data to parse
 *
 * Parse a blob of data that's in key[=val][,key[=val]]* form.  This can be
 * called from the ->parse_monolithic() fs_context operation.
 *
 * Returns 0 on success or the error returned by the ->parse_param() fs_context
 * operation on failure.
 */
int generic_parse_monolithic(struct fs_context *fc, void *data)
{
        /* Plain comma-separated options: use the default splitter. */
        char *(*const split_on_comma)(char **) = vfs_parse_comma_sep;

        return vfs_parse_monolithic_sep(fc, data, split_on_comma);
}
EXPORT_SYMBOL(generic_parse_monolithic);

/**
 * alloc_fs_context - Create a filesystem context.
 * @fs_type: The filesystem type.
 * @reference: The dentry from which this one derives (or NULL)
 * @sb_flags: Filesystem/superblock flags (SB_*)
 * @sb_flags_mask: Applicable members of @sb_flags
 * @purpose: The purpose that this configuration shall be used for.
 *
 * Open a filesystem and create a mount context.  The mount context is
 * initialised with the supplied flags and, if a submount/automount from
 * another superblock (referred to by @reference) is supplied, may have
 * parameters such as namespaces copied across from that superblock.
 */
static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
                                      struct dentry *reference,
                                      unsigned int sb_flags,
                                      unsigned int sb_flags_mask,
                                      enum fs_context_purpose purpose)
{
        struct fs_context *ctx;
        int err;

        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
        if (!ctx)
                return ERR_PTR(-ENOMEM);

        /* Record the request and take references on what the context uses. */
        ctx->purpose = purpose;
        ctx->sb_flags = sb_flags;
        ctx->sb_flags_mask = sb_flags_mask;
        ctx->fs_type = get_filesystem(fs_type);
        ctx->cred = get_current_cred();
        ctx->net_ns = get_net(current->nsproxy->net_ns);
        ctx->log.prefix = fs_type->name;

        mutex_init(&ctx->uapi_mutex);

        /*
         * The user namespace comes from the caller's creds for a new mount,
         * and from the reference superblock otherwise.  Reconfiguration
         * additionally pins the superblock (s_active) and the root dentry.
         */
        if (purpose == FS_CONTEXT_FOR_MOUNT) {
                ctx->user_ns = get_user_ns(ctx->cred->user_ns);
        } else if (purpose == FS_CONTEXT_FOR_SUBMOUNT) {
                ctx->user_ns = get_user_ns(reference->d_sb->s_user_ns);
        } else if (purpose == FS_CONTEXT_FOR_RECONFIGURE) {
                atomic_inc(&reference->d_sb->s_active);
                ctx->user_ns = get_user_ns(reference->d_sb->s_user_ns);
                ctx->root = dget(reference);
        }

        /* TODO: Make all filesystems support this unconditionally */
        if (ctx->fs_type->init_fs_context)
                err = ctx->fs_type->init_fs_context(ctx);
        else
                err = legacy_init_fs_context(ctx);
        if (err < 0) {
                /* need_free is still false, so ->free() is not invoked here. */
                put_fs_context(ctx);
                return ERR_PTR(err);
        }

        ctx->need_free = true;
        return ctx;
}

/**
 * fs_context_for_mount - allocate a new fs_context for a fresh mount
 * @fs_type: file_system_type of the new context
 * @sb_flags: Filesystem/superblock flags (SB_*)
 *
 * Returns the new context or an ERR_PTR() on failure.
 */
struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
                                        unsigned int sb_flags)
{
        /* A new mount has no reference dentry and no flag mask. */
        struct dentry *no_reference = NULL;
        const unsigned int no_mask = 0;

        return alloc_fs_context(fs_type, no_reference, sb_flags, no_mask,
                                FS_CONTEXT_FOR_MOUNT);
}
EXPORT_SYMBOL(fs_context_for_mount);

/**
 * fs_context_for_reconfigure - allocate a new fs_context for reconfiguration
 * @dentry: dentry on the superblock to be reconfigured
 * @sb_flags: Filesystem/superblock flags (SB_*)
 * @sb_flags_mask: Applicable members of @sb_flags
 *
 * The filesystem type is taken from @dentry's superblock.  Returns the new
 * context or an ERR_PTR() on failure.
 */
struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
                                        unsigned int sb_flags,
                                        unsigned int sb_flags_mask)
{
        struct file_system_type *type = dentry->d_sb->s_type;

        return alloc_fs_context(type, dentry, sb_flags, sb_flags_mask,
                                FS_CONTEXT_FOR_RECONFIGURE);
}
EXPORT_SYMBOL(fs_context_for_reconfigure);

/**
 * fs_context_for_submount: allocate a new fs_context for a submount
 * @type: file_system_type of the new context
 * @reference: reference dentry from which to copy relevant info
 *
 * Allocate a new fs_context suitable for a submount. This also ensures that
 * the fc->security object is inherited from @reference (if needed).
 */
struct fs_context *fs_context_for_submount(struct file_system_type *type,
                                           struct dentry *reference)
{
        struct fs_context *fc =
                alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT);
        int err;

        if (IS_ERR(fc))
                return fc;

        /* Let the LSM hook set up fc for a submount of @reference's sb. */
        err = security_fs_context_submount(fc, reference->d_sb);
        if (!err)
                return fc;

        put_fs_context(fc);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(fs_context_for_submount);

void fc_drop_locked(struct fs_context *fc)
{
        struct super_block *sb = fc->root->d_sb;
        dput(fc->root);
        fc->root = NULL;
        deactivate_locked_super(sb);
}

static void legacy_fs_context_free(struct fs_context *fc);

/**
 * vfs_dup_fs_context - Duplicate a filesystem context.
 * @src_fc: The context to copy.
 *
 * Returns the new context on success, ERR_PTR(-EOPNOTSUPP) if the source's
 * operations do not provide ->dup(), ERR_PTR(-ENOMEM) if the copy cannot be
 * allocated, or the error from ->dup() / security_fs_context_dup().
 */
struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
{
        struct fs_context *fc;
        int ret;

        /* Duplication is opt-in: the source's ops must provide ->dup(). */
        if (!src_fc->ops->dup)
                return ERR_PTR(-EOPNOTSUPP);

        /* Start from a shallow, byte-for-byte copy of the source context. */
        fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL);
        if (!fc)
                return ERR_PTR(-ENOMEM);

        /* The mutex bytes were copied from the source; reinitialise them. */
        mutex_init(&fc->uapi_mutex);

        /*
         * Reset the per-context pointers inherited from the shallow copy.
         * put_fs_context() releases several of these (e.g. ->source and
         * ->security) and must not act on the source's objects.
         */
        fc->fs_private        = NULL;
        fc->s_fs_info        = NULL;
        fc->source        = NULL;
        fc->security        = NULL;
        /* The copy shares these objects with the source; take its own refs. */
        get_filesystem(fc->fs_type);
        get_net(fc->net_ns);
        get_user_ns(fc->user_ns);
        get_cred(fc->cred);
        if (fc->log.log)
                refcount_inc(&fc->log.log->usage);

        /* Can't call put until we've called ->dup */
        ret = fc->ops->dup(fc, src_fc);
        if (ret < 0)
                goto err_fc;

        ret = security_fs_context_dup(fc, src_fc);
        if (ret < 0)
                goto err_fc;
        return fc;

err_fc:
        /* Drops the references taken above and frees the copy. */
        put_fs_context(fc);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL(vfs_dup_fs_context);

/**
 * logfc - Log a message to a filesystem context
 * @log: The filesystem context to log to, or NULL to use printk.
 * @prefix: A string to prefix the output with, or NULL.
 * @level: 'w' for a warning, 'e' for an error.  Anything else is a notice.
 * @fmt: The format of the buffer.
 */
void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...)
{
        va_list va;
        /* %pV below formats @fmt with the caller's varargs via this wrapper. */
        struct va_format vaf = {.fmt = fmt, .va = &va};

        va_start(va, fmt);
        if (!log) {
                /* No log buffer: emit through printk at the matching level. */
                switch (level) {
                case 'w':
                        printk(KERN_WARNING "%s%s%pV\n", prefix ? prefix : "",
                                                prefix ? ": " : "", &vaf);
                        break;
                case 'e':
                        printk(KERN_ERR "%s%s%pV\n", prefix ? prefix : "",
                                                prefix ? ": " : "", &vaf);
                        break;
                default:
                        printk(KERN_NOTICE "%s%s%pV\n", prefix ? prefix : "",
                                                prefix ? ": " : "", &vaf);
                        break;
                }
        } else {
                /* Number of message slots in the ring buffer. */
                unsigned int logsize = ARRAY_SIZE(log->buffer);
                u8 index;
                /* Stored form is "<level> [<prefix>: ]<message>\n"; NULL on OOM. */
                char *q = kasprintf(GFP_KERNEL, "%c %s%s%pV\n", level,
                                                prefix ? prefix : "",
                                                prefix ? ": " : "", &vaf);

                /*
                 * Slot for the new message.  NOTE(review): masking acts as a
                 * modulo only if the slot count is a power of two - confirm
                 * against the fc_log definition.
                 */
                index = log->head & (logsize - 1);
                /* head/tail must be one byte wide for the (u8) wrap below. */
                BUILD_BUG_ON(sizeof(log->head) != sizeof(u8) ||
                             sizeof(log->tail) != sizeof(u8));
                if ((u8)(log->head - log->tail) == logsize) {
                        /* The buffer is full, discard the oldest message */
                        if (log->need_free & (1 << index))
                                kfree(log->buffer[index]);
                        log->tail++;
                }

                /* On allocation failure store a static string instead. */
                log->buffer[index] = q ? q : "OOM: Can't store error string";
                /* need_free has one bit per slot: set iff the slot is kfree()able. */
                if (q)
                        log->need_free |= 1 << index;
                else
                        log->need_free &= ~(1 << index);
                log->head++;
        }
        va_end(va);
}
EXPORT_SYMBOL(logfc);

/*
 * Free a logging structure.
 */
static void put_fc_log(struct fs_context *fc)
{
        struct fc_log *log = fc->log.log;
        int slot;

        /* Nothing attached, or other users still hold a reference. */
        if (!log)
                return;
        if (!refcount_dec_and_test(&log->usage))
                return;

        /* Last reference: release every owned message, then the log itself. */
        fc->log.log = NULL;
        for (slot = 0; slot < 8; slot++) {
                if (log->need_free & (1 << slot))
                        kfree(log->buffer[slot]);
        }
        kfree(log);
}

/**
 * put_fs_context - Dispose of a superblock configuration context.
 * @fc: The context to dispose of.
 */
void put_fs_context(struct fs_context *fc)
{
        struct super_block *sb;

        if (fc->root) {
                sb = fc->root->d_sb;
                dput(fc->root);
                fc->root = NULL;
                deactivate_super(sb);
        }

        if (fc->need_free && fc->ops && fc->ops->free)
                fc->ops->free(fc);

        security_free_mnt_opts(&fc->security);
        put_net(fc->net_ns);
        put_user_ns(fc->user_ns);
        put_cred(fc->cred);
        put_fc_log(fc);
        put_filesystem(fc->fs_type);
        kfree(fc->source);
        kfree(fc);
}
EXPORT_SYMBOL(put_fs_context);

/*
 * Free the config for a filesystem that doesn't support fs_context.
 */
static void legacy_fs_context_free(struct fs_context *fc)
{
        struct legacy_fs_context *legacy = fc->fs_private;

        if (!legacy)
                return;

        /*
         * Only a buffer built up from individual parameters is freed here;
         * a monolithic blob is just the caller's pointer (it is stored
         * as-is by legacy_parse_monolithic()).
         */
        if (legacy->param_type == LEGACY_FS_INDIVIDUAL_PARAMS)
                kfree(legacy->legacy_data);
        kfree(legacy);
}

/*
 * Duplicate a legacy config.
 */
static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
{
        struct legacy_fs_context *ctx;
        struct legacy_fs_context *src_ctx = src_fc->fs_private;

        ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) {
                ctx->legacy_data = kmemdup(src_ctx->legacy_data,
                                           src_ctx->data_size, GFP_KERNEL);
                if (!ctx->legacy_data) {
                        kfree(ctx);
                        return -ENOMEM;
                }
        }

        fc->fs_private = ctx;
        return 0;
}

/*
 * Add a parameter to a legacy config.  We build up a comma-separated list of
 * options.
 */
static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct legacy_fs_context *ctx = fc->fs_private;
        /* Bytes already used in legacy_data, not counting the trailing NUL. */
        unsigned int size = ctx->data_size;
        size_t len = 0;
        int ret;

        /* Anything other than -ENOPARAM from the source handler is final. */
        ret = vfs_parse_fs_param_source(fc, param);
        if (ret != -ENOPARAM)
                return ret;

        if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
                return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");

        /* Work out how many bytes "key" or "key=value" will need. */
        switch (param->type) {
        case fs_value_is_string:
                /* '=' plus the value bytes */
                len = 1 + param->size;
                fallthrough;
        case fs_value_is_flag:
                len += strlen(param->key);
                break;
        default:
                return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported",
                              param->key);
        }

        /* The extra 2 covers a possible ',' separator and the NUL terminator. */
        if (size + len + 2 > PAGE_SIZE)
                return invalf(fc, "VFS: Legacy: Cumulative options too large");
        /* ',' separates options in the buffer, so neither part may contain one. */
        if (strchr(param->key, ',') ||
            (param->type == fs_value_is_string &&
             memchr(param->string, ',', param->size)))
                return invalf(fc, "VFS: Legacy: Option '%s' contained comma",
                              param->key);
        /* The one-page buffer is allocated lazily on the first option. */
        if (!ctx->legacy_data) {
                ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
                if (!ctx->legacy_data)
                        return -ENOMEM;
        }

        /* Append ",key" or ",key=value" (no leading comma for the first). */
        if (size)
                ctx->legacy_data[size++] = ',';
        len = strlen(param->key);
        memcpy(ctx->legacy_data + size, param->key, len);
        size += len;
        if (param->type == fs_value_is_string) {
                ctx->legacy_data[size++] = '=';
                memcpy(ctx->legacy_data + size, param->string, param->size);
                size += param->size;
        }
        /* Keep the buffer NUL-terminated; data_size excludes the terminator. */
        ctx->legacy_data[size] = '\0';
        ctx->data_size = size;
        ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS;
        return 0;
}

/*
 * Add monolithic mount data.
 */
static int legacy_parse_monolithic(struct fs_context *fc, void *data)
{
        struct legacy_fs_context *legacy = fc->fs_private;

        /* Only allowed while no options of either kind have been supplied. */
        if (legacy->param_type != LEGACY_FS_UNSET_PARAMS) {
                pr_warn("VFS: Can't mix monolithic and individual options\n");
                return -EINVAL;
        }

        /* The blob is stored by pointer, not copied. */
        legacy->legacy_data = data;
        legacy->param_type = LEGACY_FS_MONOLITHIC_PARAMS;

        /* No data, or binary mount data: the LSM hook is not consulted. */
        if (!legacy->legacy_data ||
            (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA))
                return 0;

        return security_sb_eat_lsm_opts(legacy->legacy_data, &fc->security);
}

/*
 * Get a mountable root with the legacy mount command.
 */
static int legacy_get_tree(struct fs_context *fc)
{
        struct legacy_fs_context *ctx = fc->fs_private;
        struct super_block *sb;
        struct dentry *root;

        root = fc->fs_type->mount(fc->fs_type, fc->sb_flags,
                                      fc->source, ctx->legacy_data);
        if (IS_ERR(root))
                return PTR_ERR(root);

        sb = root->d_sb;
        BUG_ON(!sb);

        fc->root = root;
        return 0;
}

/*
 * Handle remount.
 */
static int legacy_reconfigure(struct fs_context *fc)
{
        struct legacy_fs_context *legacy = fc->fs_private;
        struct super_block *sb = fc->root->d_sb;
        void *data = NULL;

        /* A filesystem without ->remount_fs() has nothing to do. */
        if (!sb->s_op->remount_fs)
                return 0;

        if (legacy)
                data = legacy->legacy_data;

        return sb->s_op->remount_fs(sb, &fc->sb_flags, data);
}

/*
 * Context operations installed by legacy_init_fs_context() for filesystems
 * that do not provide their own ->init_fs_context().
 */
const struct fs_context_operations legacy_fs_context_ops = {
        .free                        = legacy_fs_context_free,
        .dup                        = legacy_fs_context_dup,
        .parse_param                = legacy_parse_param,
        .parse_monolithic        = legacy_parse_monolithic,
        .get_tree                = legacy_get_tree,
        .reconfigure                = legacy_reconfigure,
};

/*
 * Initialise a legacy context for a filesystem that doesn't support
 * fs_context.
 */
static int legacy_init_fs_context(struct fs_context *fc)
{
        struct legacy_fs_context *legacy;

        /* Zeroed private state; stored in fc even if the allocation failed. */
        legacy = kzalloc(sizeof(*legacy), GFP_KERNEL_ACCOUNT);
        fc->fs_private = legacy;
        if (!legacy)
                return -ENOMEM;

        fc->ops = &legacy_fs_context_ops;
        return 0;
}

/*
 * Parse a monolithic mount data blob using the context's ->parse_monolithic()
 * operation, falling back to generic_parse_monolithic() when the filesystem
 * does not supply one.
 */
int parse_monolithic_mount_data(struct fs_context *fc, void *data)
{
        if (fc->ops->parse_monolithic)
                return fc->ops->parse_monolithic(fc, data);

        return generic_parse_monolithic(fc, data);
}

/*
 * Clean up a context after performing an action on it and put it into a state
 * from where it can be used to reconfigure a superblock.
 *
 * Note that here we do only the parts that can't fail; the rest is in
 * finish_clean_context() below and in between those fs_context is marked
 * FS_CONTEXT_AWAITING_RECONF.  The reason for splitup is that after
 * successful mount or remount we need to report success to userland.
 * Trying to do full reinit (for the sake of possible subsequent remount)
 * and failing to allocate memory would've put us into a nasty situation.
 * So here we only discard the old state and reinitialization is left
 * until we actually try to reconfigure.
 */
void vfs_clean_context(struct fs_context *fc)
{
        if (fc->need_free && fc->ops && fc->ops->free)
                fc->ops->free(fc);
        fc->need_free = false;
        fc->fs_private = NULL;
        fc->s_fs_info = NULL;
        fc->sb_flags = 0;
        security_free_mnt_opts(&fc->security);
        kfree(fc->source);
        fc->source = NULL;
        fc->exclusive = false;

        fc->purpose = FS_CONTEXT_FOR_RECONFIGURE;
        fc->phase = FS_CONTEXT_AWAITING_RECONF;
}

int finish_clean_context(struct fs_context *fc)
{
        int error;

        if (fc->phase != FS_CONTEXT_AWAITING_RECONF)
                return 0;

        if (fc->fs_type->init_fs_context)
                error = fc->fs_type->init_fs_context(fc);
        else
                error = legacy_init_fs_context(fc);
        if (unlikely(error)) {
                fc->phase = FS_CONTEXT_FAILED;
                return error;
        }
        fc->need_free = true;
        fc->phase = FS_CONTEXT_RECONF_PARAMS;
        return 0;
}



































































































































































































    3 


















































































































































































    3 
    4 







































































































    3 


    3 
















   13 
















   12 


















   11 
   13 















   11 


















    3 



    3 









































   11 



   12 


   13 


    9 







   10 









































































    9 





   13 

   11 

    1 
   13 














    1 

    9 














   10 










   11 




   12 
    1 
   11 

    8 
    3 







   13 
    2 


    3 








    1 
   11 

   12 
















































   13 
















   11 

















   11 

    7 


    9 


   11 



































































































   11 



   13 

    1 



    3 


   11 





















































































    3 











    7 





   16 




































   10 
   16 














































   15 
   14 








   15 

















    9 
    6 
   13 






    1 

    1 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *        -  July2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-pm.h>
#include <linux/blk-integrity.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/part_stat.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-cgroup.h"
#include "blk-throttle.h"
#include "blk-ioprio.h"

/* NOTE(review): presumably the block layer's top-level debugfs directory - confirm. */
struct dentry *blk_debugfs_root;

/* Export these block tracepoints for use by GPL modules. */
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);

/* ID allocator for request queues; q->id is returned to it in blk_free_queue(). */
static DEFINE_IDA(blk_queue_ida);

/*
 * For queue allocation
 *
 * request_queue objects are freed back to this cache in blk_free_queue_rcu().
 */
static struct kmem_cache *blk_requestq_cachep;

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

/**
 * blk_queue_flag_set - atomically set a queue flag
 * @flag: flag to be set; used as a bit number within @q->queue_flags
 * @q: request queue
 */
void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
{
        set_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_flag_set);

/**
 * blk_queue_flag_clear - atomically clear a queue flag
 * @flag: flag to be cleared; used as a bit number within @q->queue_flags
 * @q: request queue
 */
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
{
        clear_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_flag_clear);

/**
 * blk_queue_flag_test_and_set - atomically test and set a queue flag
 * @flag: flag to be set
 * @q: request queue
 *
 * Returns the previous value of @flag - 0 if the flag was not set and 1 if
 * the flag was already set.
 */
bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
{
        /* Previous state of the bit, captured by the same atomic operation. */
        bool was_set = test_and_set_bit(flag, &q->queue_flags);

        return was_set;
}
EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);

/*
 * Table of printable operation names, indexed by REQ_OP_* value.
 * REQ_OP_NAME(X) expands to the designated initializer [REQ_OP_X] = "X".
 * Indices that are not listed here are left NULL; blk_op_str() checks for
 * that before using an entry.
 */
#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
static const char *const blk_op_name[] = {
        REQ_OP_NAME(READ),
        REQ_OP_NAME(WRITE),
        REQ_OP_NAME(FLUSH),
        REQ_OP_NAME(DISCARD),
        REQ_OP_NAME(SECURE_ERASE),
        REQ_OP_NAME(ZONE_RESET),
        REQ_OP_NAME(ZONE_RESET_ALL),
        REQ_OP_NAME(ZONE_OPEN),
        REQ_OP_NAME(ZONE_CLOSE),
        REQ_OP_NAME(ZONE_FINISH),
        REQ_OP_NAME(ZONE_APPEND),
        REQ_OP_NAME(WRITE_ZEROES),
        REQ_OP_NAME(DRV_IN),
        REQ_OP_NAME(DRV_OUT),
};
#undef REQ_OP_NAME

/**
 * blk_op_str - Return string XXX in the REQ_OP_XXX.
 * @op: REQ_OP_XXX.
 *
 * Description: Centralize block layer function to convert REQ_OP_XXX into
 * string format. Useful in the debugging and tracing bio or request. For
 * invalid REQ_OP_XXX it returns string "UNKNOWN".
 */
inline const char *blk_op_str(enum req_op op)
{
        /* Out-of-range values and unnamed table slots have no name. */
        if (op >= ARRAY_SIZE(blk_op_name) || !blk_op_name[op])
                return "UNKNOWN";

        return blk_op_name[op];
}
EXPORT_SYMBOL_GPL(blk_op_str);

/*
 * Translation table indexed by blk_status_t value.  Each entry pairs the
 * status with a negative errno (see blk_status_to_errno() and
 * errno_to_blk_status()) and a short description (see blk_status_to_str()).
 */
static const struct {
        int                errno;
        const char        *name;
} blk_errors[] = {
        [BLK_STS_OK]                = { 0,                "" },
        [BLK_STS_NOTSUPP]        = { -EOPNOTSUPP, "operation not supported" },
        [BLK_STS_TIMEOUT]        = { -ETIMEDOUT,        "timeout" },
        [BLK_STS_NOSPC]                = { -ENOSPC,        "critical space allocation" },
        [BLK_STS_TRANSPORT]        = { -ENOLINK,        "recoverable transport" },
        [BLK_STS_TARGET]        = { -EREMOTEIO,        "critical target" },
        [BLK_STS_RESV_CONFLICT]        = { -EBADE,        "reservation conflict" },
        [BLK_STS_MEDIUM]        = { -ENODATA,        "critical medium" },
        [BLK_STS_PROTECTION]        = { -EILSEQ,        "protection" },
        [BLK_STS_RESOURCE]        = { -ENOMEM,        "kernel resource" },
        [BLK_STS_DEV_RESOURCE]        = { -EBUSY,        "device resource" },
        [BLK_STS_AGAIN]                = { -EAGAIN,        "nonblocking retry" },
        [BLK_STS_OFFLINE]        = { -ENODEV,        "device offline" },

        /* device mapper special case, should not leak out: */
        [BLK_STS_DM_REQUEUE]        = { -EREMCHG, "dm internal retry" },

        /* zone device specific errors */
        [BLK_STS_ZONE_OPEN_RESOURCE]        = { -ETOOMANYREFS, "open zones exceeded" },
        [BLK_STS_ZONE_ACTIVE_RESOURCE]        = { -EOVERFLOW, "active zones exceeded" },

        /* Command duration limit device-side timeout */
        [BLK_STS_DURATION_LIMIT]        = { -ETIME, "duration limit exceeded" },

        /* everything else not covered above: */
        [BLK_STS_IOERR]                = { -EIO,        "I/O" },
};

blk_status_t errno_to_blk_status(int errno)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
                if (blk_errors[i].errno == errno)
                        return (__force blk_status_t)i;
        }

        return BLK_STS_IOERR;
}
EXPORT_SYMBOL_GPL(errno_to_blk_status);

/*
 * Look up the errno paired with @status in blk_errors[].  A status beyond
 * the end of the table triggers a one-time warning and yields -EIO.
 */
int blk_status_to_errno(blk_status_t status)
{
        const int slot = (__force int)status;

        if (!WARN_ON_ONCE(slot >= ARRAY_SIZE(blk_errors)))
                return blk_errors[slot].errno;

        return -EIO;
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);

/*
 * Look up the description paired with @status in blk_errors[].  A status
 * beyond the end of the table triggers a one-time warning and yields
 * "<null>".
 */
const char *blk_status_to_str(blk_status_t status)
{
        const int slot = (__force int)status;

        if (!WARN_ON_ONCE(slot >= ARRAY_SIZE(blk_errors)))
                return blk_errors[slot].name;

        return "<null>";
}
EXPORT_SYMBOL_GPL(blk_status_to_str);

/**
 * blk_sync_queue - cancel any pending callbacks on a queue
 * @q: the queue
 *
 * Description:
 *     The block layer may perform asynchronous callback activity
 *     on a queue, such as calling the unplug function after a timeout.
 *     A block device may call blk_sync_queue to ensure that any
 *     such activity is cancelled, thus allowing it to release resources
 *     that the callbacks might use. The caller must already have made sure
 *     that its ->submit_bio will not re-add plugging prior to calling
 *     this function.
 *
 *     This function does not cancel any asynchronous activity arising
 *     out of elevator or throttling code. That would require elevator_exit()
 *     and blkcg_exit_queue() to be called with queue lock initialized.
 *
 */
void blk_sync_queue(struct request_queue *q)
{
        /*
         * Stop the queue's timeout timer, then cancel its timeout work.
         * NOTE(review): presumably the timer goes first so that it cannot
         * queue the work again after the cancel - confirm.
         */
        del_timer_sync(&q->timeout);
        cancel_work_sync(&q->timeout_work);
}
EXPORT_SYMBOL(blk_sync_queue);

/**
 * blk_set_pm_only - increment pm_only counter
 * @q: request queue pointer
 *
 * The counter is decremented again by blk_clear_pm_only().
 */
void blk_set_pm_only(struct request_queue *q)
{
        atomic_inc(&q->pm_only);
}
EXPORT_SYMBOL_GPL(blk_set_pm_only);

/**
 * blk_clear_pm_only - decrement pm_only counter
 * @q: request queue pointer
 *
 * Warns once if the counter drops below zero.  When the counter reaches zero,
 * everything sleeping on @q->mq_freeze_wq is woken up.
 */
void blk_clear_pm_only(struct request_queue *q)
{
        const int remaining = atomic_dec_return(&q->pm_only);

        WARN_ON_ONCE(remaining < 0);
        if (!remaining)
                wake_up_all(&q->mq_freeze_wq);
}
EXPORT_SYMBOL_GPL(blk_clear_pm_only);

/*
 * RCU callback queued by blk_free_queue(): tear down the usage counter and
 * return the request_queue to its slab cache.
 */
static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
        struct request_queue *queue;

        queue = container_of(rcu_head, struct request_queue, rcu_head);

        percpu_ref_exit(&queue->q_usage_counter);
        kmem_cache_free(blk_requestq_cachep, queue);
}

/*
 * Release everything owned by @q: its stats, its blk-mq state (blk-mq queues
 * only) and its id.  The structure itself is freed by blk_free_queue_rcu(),
 * which is deferred through call_rcu().
 */
static void blk_free_queue(struct request_queue *q)
{
        blk_free_queue_stats(q->stats);
        if (queue_is_mq(q))
                blk_mq_release(q);

        ida_free(&blk_queue_ida, q->id);
        /* Defer the final free to the RCU callback. */
        call_rcu(&q->rcu_head, blk_free_queue_rcu);
}

/**
 * blk_put_queue - decrement the request_queue refcount
 * @q: the request_queue structure to decrement the refcount for
 *
 * Drops one reference on @q->refs.  The caller that drops the last reference
 * frees the queue through blk_free_queue().
 */
void blk_put_queue(struct request_queue *q)
{
        /* Not the last reference: nothing more to do. */
        if (!refcount_dec_and_test(&q->refs))
                return;

        blk_free_queue(q);
}
EXPORT_SYMBOL(blk_put_queue);

/*
 * Begin draining @q: start a queue freeze, wake blk-mq waiters (blk-mq queues
 * only) and wake everything sleeping on q->mq_freeze_wq.
 */
void blk_queue_start_drain(struct request_queue *q)
{
        /*
         * When queue DYING flag is set, we need to block new req
         * entering queue, so we call blk_freeze_queue_start() to
         * prevent I/O from crossing blk_queue_enter().
         */
        blk_freeze_queue_start(q);
        if (queue_is_mq(q))
                blk_mq_wake_waiters(q);
        /* Make blk_queue_enter() reexamine the DYING flag. */
        wake_up_all(&q->mq_freeze_wq);
}

/**
 * blk_queue_enter() - try to increase q->q_usage_counter
 * @q: request queue pointer
 * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
 *
 * Unless BLK_MQ_REQ_NOWAIT is set, sleeps on @q->mq_freeze_wq and retries
 * until the queue can be entered or is found to be dying.
 *
 * Return: 0 once the queue has been entered, -EAGAIN if BLK_MQ_REQ_NOWAIT is
 * set and the queue could not be entered immediately, -ENODEV if the queue is
 * dying after the wait.
 */
int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
{
        const bool pm = flags & BLK_MQ_REQ_PM;

        while (!blk_try_enter_queue(q, pm)) {
                if (flags & BLK_MQ_REQ_NOWAIT)
                        return -EAGAIN;

                /*
                 * read pair of barrier in blk_freeze_queue_start(), we need to
                 * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and
                 * reading .mq_freeze_depth or queue dying flag, otherwise the
                 * following wait may never return if the two reads are
                 * reordered.
                 */
                smp_rmb();
                wait_event(q->mq_freeze_wq,
                           (!q->mq_freeze_depth &&
                            blk_pm_resume_queue(pm, q)) ||
                           blk_queue_dying(q));
                /* Woken up: give up on a dying queue, otherwise retry. */
                if (blk_queue_dying(q))
                        return -ENODEV;
        }

        return 0;
}

/*
 * Enter @q on behalf of @bio, sleeping on q->mq_freeze_wq until that is
 * possible unless the bio carries REQ_NOWAIT.
 *
 * Returns 0 once the queue has been entered.  On failure the bio has already
 * been completed here: with bio_wouldblock_error() and -EAGAIN for a
 * REQ_NOWAIT bio that could not enter, or with bio_io_error() and -ENODEV
 * when the disk has GD_DEAD set.
 */
int __bio_queue_enter(struct request_queue *q, struct bio *bio)
{
        while (!blk_try_enter_queue(q, false)) {
                struct gendisk *disk = bio->bi_bdev->bd_disk;

                if (bio->bi_opf & REQ_NOWAIT) {
                        /* A dead disk takes precedence over "would block". */
                        if (test_bit(GD_DEAD, &disk->state))
                                goto dead;
                        bio_wouldblock_error(bio);
                        return -EAGAIN;
                }

                /*
                 * read pair of barrier in blk_freeze_queue_start(), we need to
                 * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and
                 * reading .mq_freeze_depth or queue dying flag, otherwise the
                 * following wait may never return if the two reads are
                 * reordered.
                 */
                smp_rmb();
                wait_event(q->mq_freeze_wq,
                           (!q->mq_freeze_depth &&
                            blk_pm_resume_queue(false, q)) ||
                           test_bit(GD_DEAD, &disk->state));
                if (test_bit(GD_DEAD, &disk->state))
                        goto dead;
        }

        return 0;
dead:
        bio_io_error(bio);
        return -ENODEV;
}

/*
 * Drop one reference on q->q_usage_counter; the counterpart of a successful
 * blk_queue_enter().
 */
void blk_queue_exit(struct request_queue *q)
{
        percpu_ref_put(&q->q_usage_counter);
}

/*
 * Release callback registered for q->q_usage_counter in blk_alloc_queue():
 * wakes everything sleeping on the owning queue's mq_freeze_wq.
 */
static void blk_queue_usage_counter_release(struct percpu_ref *ref)
{
        struct request_queue *queue;

        queue = container_of(ref, struct request_queue, q_usage_counter);
        wake_up_all(&queue->mq_freeze_wq);
}

/*
 * Callback of the q->timeout timer (set up in blk_alloc_queue()): hands the
 * actual work to q->timeout_work on the kblockd workqueue.
 */
static void blk_rq_timed_out_timer(struct timer_list *t)
{
        struct request_queue *q = from_timer(q, t, timeout);

        kblockd_schedule_work(&q->timeout_work);
}

/*
 * Empty handler that blk_alloc_queue() installs as the initial
 * q->timeout_work function.
 * NOTE(review): presumably replaced with a real handler elsewhere for queues
 * that need timeout processing - confirm.
 */
static void blk_timeout_work(struct work_struct *work)
{
}

/*
 * Allocate and initialise a request_queue on NUMA node @node_id.
 *
 * @lim is passed through blk_set_default_limits() and then copied into
 * q->limits.  The returned queue holds a single reference (q->refs == 1) and
 * its q_usage_counter starts out in atomic mode.
 *
 * Returns the new queue, or an ERR_PTR() carrying the error of the step that
 * failed.
 */
struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
{
        struct request_queue *q;
        int error;

        /* Zeroed allocation: fields not set below start out as 0/NULL. */
        q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
                                  node_id);
        if (!q)
                return ERR_PTR(-ENOMEM);

        q->last_merge = NULL;

        q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
        if (q->id < 0) {
                error = q->id;
                goto fail_q;
        }

        q->stats = blk_alloc_queue_stats();
        if (!q->stats) {
                error = -ENOMEM;
                goto fail_id;
        }

        error = blk_set_default_limits(lim);
        if (error)
                goto fail_stats;
        q->limits = *lim;

        q->node = node_id;

        atomic_set(&q->nr_active_requests_shared_tags, 0);

        timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
        INIT_WORK(&q->timeout_work, blk_timeout_work);
        INIT_LIST_HEAD(&q->icq_list);

        refcount_set(&q->refs, 1);
        mutex_init(&q->debugfs_mutex);
        mutex_init(&q->sysfs_lock);
        mutex_init(&q->sysfs_dir_lock);
        mutex_init(&q->limits_lock);
        mutex_init(&q->rq_qos_mutex);
        spin_lock_init(&q->queue_lock);

        init_waitqueue_head(&q->mq_freeze_wq);
        mutex_init(&q->mq_freeze_lock);

        blkg_init_queue(q);

        /*
         * Init percpu_ref in atomic mode so that it's faster to shutdown.
         * See blk_register_queue() for details.
         */
        error = percpu_ref_init(&q->q_usage_counter,
                                blk_queue_usage_counter_release,
                                PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
        if (error)
                goto fail_stats;

        q->nr_requests = BLKDEV_DEFAULT_RQ;

        return q;

        /* Unwind in reverse order of acquisition. */
fail_stats:
        blk_free_queue_stats(q->stats);
fail_id:
        ida_free(&blk_queue_ida, q->id);
fail_q:
        kmem_cache_free(blk_requestq_cachep, q);
        return ERR_PTR(error);
}

/**
 * blk_get_queue - increment the request_queue refcount
 * @q: the request_queue structure to increment the refcount for
 *
 * Takes an additional reference on @q->refs unless the queue is dying.
 *
 * Context: Any context.
 *
 * Return: true if a reference was taken, false if the queue is dying.
 */
bool blk_get_queue(struct request_queue *q)
{
        if (likely(!blk_queue_dying(q))) {
                refcount_inc(&q->refs);
                return true;
        }

        return false;
}
EXPORT_SYMBOL(blk_get_queue);

#ifdef CONFIG_FAIL_MAKE_REQUEST

static DECLARE_FAULT_ATTR(fail_make_request);

/*
 * Handler for the "fail_make_request=" boot parameter: forwards the option
 * string to setup_fault_attr() for the fail_make_request fault attribute.
 */
static int __init setup_fail_make_request(char *str)
{
        return setup_fault_attr(&fail_make_request, str);
}
__setup("fail_make_request=", setup_fail_make_request);

bool should_fail_request(struct block_device *part, unsigned int bytes)
{
        return bdev_test_flag(part, BD_MAKE_IT_FAIL) &&
               should_fail(&fail_make_request, bytes);
}

/*
 * Late initcall: create the "fail_make_request" debugfs fault-injection
 * attribute.  Returns 0 on success or the error encoded in the returned
 * dentry pointer.
 */
static int __init fail_make_request_debugfs(void)
{
        struct dentry *attr_dir;

        attr_dir = fault_create_debugfs_attr("fail_make_request", NULL,
                                             &fail_make_request);

        return PTR_ERR_OR_ZERO(attr_dir);
}

late_initcall(fail_make_request_debugfs);
#endif /* CONFIG_FAIL_MAKE_REQUEST */

/*
 * Warn about a write to a read-only block device.  The bio is not failed
 * here; this function only logs.
 */
static inline void bio_check_ro(struct bio *bio)
{
        struct block_device *bdev = bio->bi_bdev;

        if (!op_is_write(bio_op(bio)) || !bdev_read_only(bdev))
                return;

        /* A flush that carries no sectors does not trigger the warning. */
        if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
                return;

        /* BD_RO_WARNED suppresses repeated warnings for the same device. */
        if (bdev_test_flag(bdev, BD_RO_WARNED))
                return;
        bdev_set_flag(bdev, BD_RO_WARNED);

        /*
         * Use ioctl to set underlying disk of raid/dm to read-only
         * will trigger this.
         */
        pr_warn("Trying to write to read-only block-device %pg\n", bdev);
}

/*
 * Fault-injection hook for bio submission: returns -EIO when
 * should_fail_request() asks for the bio to fail on its whole device,
 * 0 otherwise.
 */
static noinline int should_fail_bio(struct bio *bio)
{
        struct block_device *whole = bdev_whole(bio->bi_bdev);

        return should_fail_request(whole, bio->bi_iter.bi_size) ? -EIO : 0;
}
ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);

/*
 * Check whether this bio extends beyond the end of the device or partition.
 * This may well happen - the kernel calls bread() without checking the size of
 * the device, e.g., when mounting a file system.
 *
 * Returns 0 if the bio fits (or carries no sectors), -EIO otherwise.
 */
static inline int bio_check_eod(struct bio *bio)
{
        const sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
        const unsigned int nr_sectors = bio_sectors(bio);

        /* Bios without data cannot run past the end. */
        if (!nr_sectors)
                return 0;

        /* Compare against maxsector - nr_sectors to avoid an overflowing sum. */
        if (nr_sectors <= maxsector &&
            bio->bi_iter.bi_sector <= maxsector - nr_sectors)
                return 0;

        pr_info_ratelimited("%s: attempt to access beyond end of device\n"
                            "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n",
                            current->comm, bio->bi_bdev, bio->bi_opf,
                            bio->bi_iter.bi_sector, nr_sectors, maxsector);
        return -EIO;
}

/*
 * Remap block n of partition p to block n+start(p) of the disk.
 */
static int blk_partition_remap(struct bio *bio)
{
        struct block_device *p = bio->bi_bdev;

        if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
                return -EIO;
        if (bio_sectors(bio)) {
                bio->bi_iter.bi_sector += p->bd_start_sect;
                trace_block_bio_remap(bio, p->bd_dev,
                                      bio->bi_iter.bi_sector -
                                      p->bd_start_sect);
        }
        bio_set_flag(bio, BIO_REMAPPED);
        return 0;
}

/*
 * Check write append to a zoned block device.
 *
 * Returns BLK_STS_NOTSUPP for a non-zoned device, BLK_STS_IOERR for a bio
 * that does not start at a zone start or is too large, and BLK_STS_OK
 * otherwise.  An accepted bio gets REQ_NOMERGE set.
 */
static inline blk_status_t blk_check_zone_append(struct request_queue *q,
                                                 struct bio *bio)
{
        struct block_device *bdev = bio->bi_bdev;
        const int nr_sectors = bio_sectors(bio);

        /* Only applicable to zoned block devices */
        if (!bdev_is_zoned(bdev))
                return BLK_STS_NOTSUPP;

        /* The bio sector must point to the start of a sequential zone */
        if (!bdev_is_zone_start(bdev, bio->bi_iter.bi_sector))
                return BLK_STS_IOERR;

        /*
         * Two size limits apply:
         *  - the bio must not cross zone boundaries, otherwise it would be
         *    split and could result in non-contiguous sectors being written
         *    in different zones;
         *  - the bio must be small enough that it will not get split.
         */
        if (nr_sectors > q->limits.chunk_sectors ||
            nr_sectors > queue_max_zone_append_sectors(q))
                return BLK_STS_IOERR;

        bio->bi_opf |= REQ_NOMERGE;

        return BLK_STS_OK;
}

/*
 * Hand one bio to its driver: blk_mq_submit_bio() when the device does not
 * have BD_HAS_SUBMIT_BIO set, otherwise the disk's ->submit_bio() method,
 * bracketed by bio_queue_enter()/blk_queue_exit().
 */
static void __submit_bio(struct bio *bio)
{
        /* If plug is not used, add new plug here to cache nsecs time. */
        struct blk_plug plug;

        /*
         * NOTE(review): blk_crypto_bio_prep() takes &bio, so it can presumably
         * substitute a different bio, and presumably completes the bio itself
         * when it returns false - confirm against blk-crypto.
         */
        if (unlikely(!blk_crypto_bio_prep(&bio)))
                return;

        blk_start_plug(&plug);

        if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) {
                blk_mq_submit_bio(bio);
        } else if (likely(bio_queue_enter(bio) == 0)) {
                struct gendisk *disk = bio->bi_bdev->bd_disk;

                disk->fops->submit_bio(bio);
                blk_queue_exit(disk->queue);
        }

        blk_finish_plug(&plug);
}

/*
 * The loop in this function may be a bit non-obvious, and so deserves some
 * explanation:
 *
 *  - Before entering the loop, bio->bi_next is NULL (as all callers ensure
 *    that), so we have a list with a single bio.
 *  - We pretend that we have just taken it off a longer list, so we assign
 *    bio_list to a pointer to the bio_list_on_stack, thus initialising the
 *    bio_list of new bios to be added.  ->submit_bio() may indeed add some more
 *    bios through a recursive call to submit_bio_noacct.  If it did, we find a
 *    non-NULL value in bio_list and re-enter the loop from the top.
 *  - In this case we really did just take the bio off the top of the list (no
 *    pretending) and so remove it from bio_list, and call into ->submit_bio()
 *    again.
 *
 * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
 * bio_list_on_stack[1] contains bios that were submitted before the current
 *        ->submit_bio, but that haven't been processed yet.
 */
static void __submit_bio_noacct(struct bio *bio)
{
        struct bio_list bio_list_on_stack[2];

        BUG_ON(bio->bi_next);

        bio_list_init(&bio_list_on_stack[0]);
        /* From here on, nested submissions are queued instead of recursing. */
        current->bio_list = bio_list_on_stack;

        do {
                struct request_queue *q = bdev_get_queue(bio->bi_bdev);
                struct bio_list lower, same;

                /*
                 * Create a fresh bio_list for all subordinate requests.
                 */
                bio_list_on_stack[1] = bio_list_on_stack[0];
                bio_list_init(&bio_list_on_stack[0]);

                __submit_bio(bio);

                /*
                 * Sort new bios into those for a lower level and those for the
                 * same level.
                 */
                bio_list_init(&lower);
                bio_list_init(&same);
                while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
                        if (q == bdev_get_queue(bio->bi_bdev))
                                bio_list_add(&same, bio);
                        else
                                bio_list_add(&lower, bio);

                /*
                 * Now assemble so we handle the lowest level first.
                 */
                bio_list_merge(&bio_list_on_stack[0], &lower);
                bio_list_merge(&bio_list_on_stack[0], &same);
                bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
        } while ((bio = bio_list_pop(&bio_list_on_stack[0])));

        current->bio_list = NULL;
}

/*
 * Submission loop used by submit_bio_noacct_nocheck() for devices without
 * BD_HAS_SUBMIT_BIO: bios queued on current->bio_list[0] while __submit_bio()
 * runs are submitted in turn, with no sorting by level.
 */
static void __submit_bio_noacct_mq(struct bio *bio)
{
        struct bio_list bio_list[2] = { };

        current->bio_list = bio_list;

        do {
                __submit_bio(bio);
        } while ((bio = bio_list_pop(&bio_list[0])));

        current->bio_list = NULL;
}

/*
 * Submit @bio without the checks done by submit_bio_noacct(): do the cgroup
 * bookkeeping and queue tracing, then either queue the bio on
 * current->bio_list (when a submission is already active in this task) or
 * run the appropriate submission loop.
 */
void submit_bio_noacct_nocheck(struct bio *bio)
{
        blk_cgroup_bio_start(bio);
        blkcg_bio_issue_init(bio);

        if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
                trace_block_bio_queue(bio);
                /*
                 * Now that enqueuing has been traced, we need to trace
                 * completion as well.
                 */
                bio_set_flag(bio, BIO_TRACE_COMPLETION);
        }

        /*
         * We only want one ->submit_bio to be active at a time, else stack
         * usage with stacked devices could be a problem.  Use current->bio_list
         * to collect a list of requests submitted by a ->submit_bio method
         * while it is active, and then process them after it returned.
         */
        if (current->bio_list)
                bio_list_add(&current->bio_list[0], bio);
        else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
                __submit_bio_noacct_mq(bio);
        else
                __submit_bio_noacct(bio);
}

/**
 * submit_bio_noacct - re-submit a bio to the block device layer for I/O
 * @bio:  The bio describing the location in memory and on the device.
 *
 * This is a version of submit_bio() that shall only be used for I/O that is
 * resubmitted to lower level drivers by stacking block drivers.  All file
 * systems and other upper level users of the block layer should use
 * submit_bio() instead.
 *
 * A bio that fails one of the checks below is not submitted; it is completed
 * immediately through bio_endio() with ->bi_status set to the failure status.
 */
void submit_bio_noacct(struct bio *bio)
{
        struct block_device *bdev = bio->bi_bdev;
        struct request_queue *q = bdev_get_queue(bdev);
        /* Default status for every "goto end_io" that does not override it. */
        blk_status_t status = BLK_STS_IOERR;

        might_sleep();

        /*
         * For a REQ_NOWAIT based request, complete the bio with
         * BLK_STS_NOTSUPP if the device does not support NOWAIT.
         */
        if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev))
                goto not_supported;

        if (should_fail_bio(bio))
                goto end_io;
        bio_check_ro(bio);
        /* End-of-device check and partition remapping are done only once. */
        if (!bio_flagged(bio, BIO_REMAPPED)) {
                if (unlikely(bio_check_eod(bio)))
                        goto end_io;
                if (bdev_is_partition(bdev) &&
                    unlikely(blk_partition_remap(bio)))
                        goto end_io;
        }

        /*
         * Filter flush bio's early so that bio based drivers without flush
         * support don't have to worry about them.
         */
        if (op_is_flush(bio->bi_opf)) {
                if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE &&
                                 bio_op(bio) != REQ_OP_ZONE_APPEND))
                        goto end_io;
                if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
                        bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
                        /* Nothing left to do for a flush without data. */
                        if (!bio_sectors(bio)) {
                                status = BLK_STS_OK;
                                goto end_io;
                        }
                }
        }

        if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
                bio_clear_polled(bio);

        switch (bio_op(bio)) {
        case REQ_OP_READ:
        case REQ_OP_WRITE:
                break;
        case REQ_OP_FLUSH:
                /*
                 * REQ_OP_FLUSH can't be submitted through bios, it is only
                 * synthesized in struct request by the flush state machine.
                 */
                goto not_supported;
        case REQ_OP_DISCARD:
                if (!bdev_max_discard_sectors(bdev))
                        goto not_supported;
                break;
        case REQ_OP_SECURE_ERASE:
                if (!bdev_max_secure_erase_sectors(bdev))
                        goto not_supported;
                break;
        case REQ_OP_ZONE_APPEND:
                status = blk_check_zone_append(q, bio);
                if (status != BLK_STS_OK)
                        goto end_io;
                break;
        case REQ_OP_WRITE_ZEROES:
                if (!q->limits.max_write_zeroes_sectors)
                        goto not_supported;
                break;
        case REQ_OP_ZONE_RESET:
        case REQ_OP_ZONE_OPEN:
        case REQ_OP_ZONE_CLOSE:
        case REQ_OP_ZONE_FINISH:
                if (!bdev_is_zoned(bio->bi_bdev))
                        goto not_supported;
                break;
        case REQ_OP_ZONE_RESET_ALL:
                if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q))
                        goto not_supported;
                break;
        case REQ_OP_DRV_IN:
        case REQ_OP_DRV_OUT:
                /*
                 * Driver private operations are only used with passthrough
                 * requests.
                 */
                fallthrough;
        default:
                goto not_supported;
        }

        /* A true return from blk_throtl_bio() means the bio is not submitted here. */
        if (blk_throtl_bio(bio))
                return;
        submit_bio_noacct_nocheck(bio);
        return;

not_supported:
        status = BLK_STS_NOTSUPP;
end_io:
        bio->bi_status = status;
        bio_endio(bio);
}
EXPORT_SYMBOL(submit_bio_noacct);

/*
 * Give @bio an I/O priority if it has none yet, then let blkcg_set_ioprio()
 * process the bio.
 */
static void bio_set_ioprio(struct bio *bio)
{
        /* Nobody set ioprio so far? Initialize it based on task's nice value */
        if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
                bio->bi_ioprio = get_current_ioprio();
        blkcg_set_ioprio(bio);
}

/**
 * submit_bio - submit a bio to the block device layer for I/O
 * @bio: The &struct bio which describes the I/O
 *
 * submit_bio() is used to submit I/O requests to block devices.  It is passed a
 * fully set up &struct bio that describes the I/O that needs to be done.  The
 * bio will be sent to the device described by the bi_bdev field.
 *
 * The success/failure status of the request, along with notification of
 * completion, is delivered asynchronously through the ->bi_end_io() callback
 * in @bio.  The bio must NOT be touched by the caller until ->bi_end_io() has
 * been called.
 */
void submit_bio(struct bio *bio)
{
        /* Page-in/page-out accounting applies to plain reads and writes only. */
        switch (bio_op(bio)) {
        case REQ_OP_READ:
                task_io_account_read(bio->bi_iter.bi_size);
                count_vm_events(PGPGIN, bio_sectors(bio));
                break;
        case REQ_OP_WRITE:
                count_vm_events(PGPGOUT, bio_sectors(bio));
                break;
        default:
                break;
        }

        bio_set_ioprio(bio);
        submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);

/**
 * bio_poll - poll for BIO completions
 * @bio: bio to poll for
 * @iob: batches of IO
 * @flags: BLK_POLL_* flags that control the behavior
 *
 * Poll for completions on queue associated with the bio. Returns number of
 * completed entries found.
 *
 * Note: the caller must either be the context that submitted @bio, or
 * be in a RCU critical section to prevent freeing of @bio.
 */
int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
{
        blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
        struct block_device *bdev;
        struct request_queue *q;
        int ret = 0;

        /* bi_bdev is read once and may be NULL; there is nothing to poll then. */
        bdev = READ_ONCE(bio->bi_bdev);
        if (!bdev)
                return 0;

        q = bdev_get_queue(bdev);
        if (cookie == BLK_QC_T_NONE ||
            !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
                return 0;

        blk_flush_plug(current->plug, false);

        /*
         * We need to be able to enter a frozen queue, similar to how
         * timeouts also need to do that. If that is blocked, then we can
         * have pending IO when a queue freeze is started, and then the
         * wait for the freeze to finish will wait for polled requests to
         * timeout as the poller is prevented from entering the queue and
         * completing them. As long as we prevent new IO from being queued,
         * that should be all that matters.
         */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return 0;
        if (queue_is_mq(q)) {
                ret = blk_mq_poll(q, cookie, iob, flags);
        } else {
                struct gendisk *disk = q->disk;

                /* bio-based drivers poll through their optional ->poll_bio(). */
                if (disk && disk->fops->poll_bio)
                        ret = disk->fops->poll_bio(bio, iob, flags);
        }
        /* Drop the usage reference taken by percpu_ref_tryget() above. */
        blk_queue_exit(q);
        return ret;
}
EXPORT_SYMBOL_GPL(bio_poll);

/*
 * Helper to implement file_operations.iopoll.  Requires the bio to be stored
 * in iocb->private, and cleared before freeing the bio.
 *
 * Returns the result of bio_poll(), or 0 when no bio is stored.
 */
int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
                    unsigned int flags)
{
        struct bio *bio;
        int ret = 0;

        /*
         * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
         * point to a freshly allocated bio at this point.  If that happens
         * we have a few cases to consider:
         *
         *  1) the bio is being initialized and bi_bdev is NULL.  We can
         *     simply do nothing in this case
         *  2) the bio points to a not poll enabled device.  bio_poll will catch
         *     this and return 0
         *  3) the bio points to a poll capable device, including but not
         *     limited to the one that the original bio pointed to.  In this
         *     case we will call into the actual poll method and poll for I/O,
         *     even if we don't need to, but it won't cause harm either.
         *
         * For cases 2) and 3) above the RCU grace period ensures that bi_bdev
         * is still allocated. Because partitions hold a reference to the whole
         * device bdev and thus disk, the disk is also still valid.  Grabbing
         * a reference to the queue in bio_poll() ensures the hctxs and requests
         * are still valid as well.
         */
        rcu_read_lock();
        bio = READ_ONCE(kiocb->private);
        if (bio)
                ret = bio_poll(bio, iob, flags);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(iocb_bio_iopoll);

/*
 * Advance @part's bd_stamp to @now and account the elapsed time as io_ticks.
 *
 * The ticks (now - stamp) are added only when @now is after the stored stamp,
 * this caller's try_cmpxchg() on bd_stamp succeeded, and either @end is set
 * or part_in_flight() reports I/O in flight.  For a partition the same steps
 * are then repeated for the whole device.
 */
void update_io_ticks(struct block_device *part, unsigned long now, bool end)
{
        unsigned long stamp;
again:
        stamp = READ_ONCE(part->bd_stamp);
        if (unlikely(time_after(now, stamp)) &&
            likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
            (end || part_in_flight(part)))
                __part_stat_add(part, io_ticks, now - stamp);

        if (bdev_is_partition(part)) {
                part = bdev_whole(part);
                goto again;
        }
}

/*
 * Start I/O accounting on @bdev for an operation of type @op that began at
 * @start_time: update io_ticks and bump the in-flight counter for the
 * read or write direction.
 *
 * Returns @start_time unchanged, for the caller to pass to bdev_end_io_acct().
 */
unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
                                 unsigned long start_time)
{
        part_stat_lock();
        update_io_ticks(bdev, start_time, false);
        part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
        part_stat_unlock();

        return start_time;
}
EXPORT_SYMBOL(bdev_start_io_acct);

/**
 * bio_start_io_acct - start I/O accounting for bio based drivers
 * @bio:        bio to start account for
 *
 * Accounts against @bio->bi_bdev, using the current jiffies value as the
 * start time.
 *
 * Returns the start time that should be passed back to bio_end_io_acct().
 */
unsigned long bio_start_io_acct(struct bio *bio)
{
        return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies);
}
EXPORT_SYMBOL_GPL(bio_start_io_acct);

/*
 * Finish I/O accounting on @bdev for an operation of type @op that
 * transferred @sectors sectors and was started at @start_time (in jiffies):
 * update io_ticks, the per-group I/O count, sector count and duration, and
 * drop the in-flight counter taken by bdev_start_io_acct().
 */
void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
                      unsigned int sectors, unsigned long start_time)
{
        const int sgrp = op_stat_group(op);
        /* Sample jiffies once so the duration and io_ticks use the same "now". */
        unsigned long now = READ_ONCE(jiffies);
        unsigned long duration = now - start_time;

        part_stat_lock();
        update_io_ticks(bdev, now, true);
        part_stat_inc(bdev, ios[sgrp]);
        part_stat_add(bdev, sectors[sgrp], sectors);
        part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
        part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
        part_stat_unlock();
}
EXPORT_SYMBOL(bdev_end_io_acct);

/*
 * Finish I/O accounting for @bio against @orig_bdev rather than against
 * bio->bi_bdev; the operation type and sector count are taken from @bio.
 */
void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
                              struct block_device *orig_bdev)
{
        bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time);
}
EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);

/**
 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
 * @q : the queue of the device being checked
 *
 * Description:
 *    Check if underlying low-level drivers of a device are busy.
 *    A driver exports its busy state through the ->busy callback of its
 *    mq_ops; queues that are not blk-mq, or whose driver provides no
 *    ->busy callback, are always reported as not busy.
 *
 *    Basically, this function is used only by request stacking drivers
 *    to stop dispatching requests to underlying devices when underlying
 *    devices are busy.  This behavior helps more I/O merging on the queue
 *    of the request stacking driver and prevents I/O throughput regression
 *    on burst I/O load.
 *
 * Return:
 *    0 - Not busy (The request stacking driver should dispatch request)
 *    1 - Busy (The request stacking driver should stop dispatching request)
 */
int blk_lld_busy(struct request_queue *q)
{
        if (!queue_is_mq(q) || !q->mq_ops->busy)
                return 0;

        return q->mq_ops->busy(q);
}
EXPORT_SYMBOL_GPL(blk_lld_busy);

/*
 * Queue @work on the kblockd workqueue.  Returns whatever queue_work()
 * returns.
 */
int kblockd_schedule_work(struct work_struct *work)
{
        return queue_work(kblockd_workqueue, work);
}
EXPORT_SYMBOL(kblockd_schedule_work);

/*
 * Thin wrapper around mod_delayed_work_on() that targets the kblockd
 * workqueue; @cpu, @dwork and @delay are passed through unchanged and its
 * return value is returned.
 */
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
                                unsigned long delay)
{
        return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);

/*
 * Initialise @plug and install it as current->plug, unless the task already
 * has a plug, in which case @plug is left untouched.  @nr_ios is clamped to
 * BLK_MAX_REQUEST_COUNT.
 */
void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
{
        struct task_struct *tsk = current;

        /*
         * If this is a nested plug, don't actually assign it.
         */
        if (tsk->plug)
                return;

        plug->cur_ktime = 0;
        plug->mq_list = NULL;
        plug->cached_rq = NULL;
        plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
        plug->rq_count = 0;
        plug->multiple_queues = false;
        plug->has_elevator = false;
        INIT_LIST_HEAD(&plug->cb_list);

        /*
         * Store ordering should not be needed here, since a potential
         * preempt will imply a full memory barrier
         */
        tsk->plug = plug;
}

/**
 * blk_start_plug - initialize blk_plug and track it inside the task_struct
 * @plug:        The &struct blk_plug that needs to be initialized
 *
 * Description:
 *   blk_start_plug() indicates to the block layer an intent by the caller
 *   to submit multiple I/O requests in a batch.  The block layer may use
 *   this hint to defer submitting I/Os from the caller until blk_finish_plug()
 *   is called.  However, the block layer may choose to submit requests
 *   before a call to blk_finish_plug() if the number of queued I/Os
 *   exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
 *   %BLK_PLUG_FLUSH_SIZE.  The queued I/Os may also be submitted early if
 *   the task schedules (see below).
 *
 *   Tracking blk_plug inside the task_struct will help with auto-flushing the
 *   pending I/O should the task end up blocking between blk_start_plug() and
 *   blk_finish_plug(). This is important from a performance perspective, but
 *   also ensures that we don't deadlock. For instance, if the task is blocking
 *   for a memory allocation, memory reclaim could end up wanting to free a
 *   page belonging to that request that is currently residing in our private
 *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
 *   this kind of deadlock.
 */
void blk_start_plug(struct blk_plug *plug)
{
        /* Same as blk_start_plug_nr_ios() with nr_ios fixed at 1. */
        blk_start_plug_nr_ios(plug, 1);
}
EXPORT_SYMBOL(blk_start_plug);

/*
 * Run and unlink every callback registered on @plug->cb_list, passing
 * @from_schedule through to each one.
 *
 * The list is spliced onto a private head before the callbacks run, and the
 * outer loop re-checks plug->cb_list afterwards, so entries added to the plug
 * while a batch is being processed are picked up by the next pass.
 */
static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
{
        LIST_HEAD(callbacks);

        while (!list_empty(&plug->cb_list)) {
                list_splice_init(&plug->cb_list, &callbacks);

                while (!list_empty(&callbacks)) {
                        struct blk_plug_cb *cb = list_first_entry(&callbacks,
                                                          struct blk_plug_cb,
                                                          list);
                        /* Unlink before calling the callback. */
                        list_del(&cb->list);
                        cb->callback(cb, from_schedule);
                }
        }
}

/*
 * Find or create the plug callback identified by (@unplug, @data) on the
 * current task's plug.
 *
 * @size is the allocation size for a new entry and must be at least
 * sizeof(struct blk_plug_cb).  New entries are zero-allocated with
 * GFP_ATOMIC.
 *
 * Returns the existing or newly added callback, or NULL when the task has no
 * plug or the allocation fails.
 */
struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
                                      int size)
{
        struct blk_plug *cur_plug = current->plug;
        struct blk_plug_cb *entry;

        if (!cur_plug)
                return NULL;

        /* Reuse a matching callback that is already registered. */
        list_for_each_entry(entry, &cur_plug->cb_list, list) {
                if (entry->callback == unplug && entry->data == data)
                        return entry;
        }

        /* Not currently on the callback list */
        BUG_ON(size < sizeof(*entry));
        entry = kzalloc(size, GFP_ATOMIC);
        if (!entry)
                return NULL;

        entry->data = data;
        entry->callback = unplug;
        list_add(&entry->list, &cur_plug->cb_list);

        return entry;
}
EXPORT_SYMBOL(blk_check_plugged);

/*
 * Flush everything held by @plug.
 *
 * In order: run the registered plug callbacks (if any are queued), hand the
 * plug to blk_mq_flush_plug_list(), free any cached requests, then reset
 * the plug's cached time and clear PF_BLOCK_TS on the current task.
 * @from_schedule is passed through to the callbacks and to
 * blk_mq_flush_plug_list().
 */
void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
{
        /* Skip the call entirely when no callbacks are queued */
        if (!list_empty(&plug->cb_list))
                flush_plug_callbacks(plug, from_schedule);
        blk_mq_flush_plug_list(plug, from_schedule);
        /*
         * Unconditionally flush out cached requests, even if the unplug
         * event came from schedule. Since we now hold references to the
         * queue for cached requests, we don't want a blocked task holding
         * up a queue freeze/quiesce event.
         */
        if (unlikely(!rq_list_empty(plug->cached_rq)))
                blk_mq_free_plug_rqs(plug);

        /* Drop the cached timestamp and the matching task flag */
        plug->cur_ktime = 0;
        current->flags &= ~PF_BLOCK_TS;
}

/**
 * blk_finish_plug - mark the end of a batch of submitted I/O
 * @plug:        The &struct blk_plug passed to blk_start_plug()
 *
 * Description:
 * Signals that the caller has finished submitting its batch of I/O.  Every
 * call must be paired with an earlier blk_start_plug().  If @plug is the
 * plug currently installed on the task it is flushed and detached from the
 * task; otherwise the call does nothing.  See the documentation for
 * blk_start_plug() for more information.
 */
void blk_finish_plug(struct blk_plug *plug)
{
        /* Not the task's active plug: nothing to flush or detach */
        if (current->plug != plug)
                return;

        __blk_flush_plug(plug, false);
        current->plug = NULL;
}
EXPORT_SYMBOL(blk_finish_plug);

/*
 * Sleep waiting for I/O.
 *
 * The sleep is bounded to half of the hung-task timeout so that the
 * hang_check timer does not fire at us during very long I/O.  When that
 * computed bound is zero, an unbounded io_schedule() is used instead.
 */
void blk_io_schedule(void)
{
        unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;

        if (!timeout) {
                io_schedule();
                return;
        }

        io_schedule_timeout(timeout);
}
EXPORT_SYMBOL_GPL(blk_io_schedule);

/*
 * Block layer initialisation: checks the request op/flag bit budget at
 * build time, creates the kblockd workqueue, the request_queue slab cache
 * and the "block" debugfs directory.  Always returns 0; failure to create
 * the workqueue panics.
 */
int __init blk_dev_init(void)
{
        /* The last opcode must be representable in REQ_OP_BITS bits ... */
        BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS));
        /* ... and op + flag bits must fit the fields that carry them. */
        BUILD_BUG_ON(8 * sizeof_field(struct request, cmd_flags) <
                        REQ_OP_BITS + REQ_FLAG_BITS);
        BUILD_BUG_ON(8 * sizeof_field(struct bio, bi_opf) <
                        REQ_OP_BITS + REQ_FLAG_BITS);

        /* used for unplugging and affects IO latency/throughput - HIGHPRI */
        kblockd_workqueue = alloc_workqueue("kblockd",
                                            WQ_HIGHPRI | WQ_MEM_RECLAIM, 0);
        if (kblockd_workqueue == NULL)
                panic("Failed to create kblockd\n");

        blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC);

        blk_debugfs_root = debugfs_create_dir("block", NULL);

        return 0;
}










   30 

























   29 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/uaccess.h>
#include <linux/kernel.h>

#include <asm/vsyscall.h>

#ifdef CONFIG_X86_64
/*
 * Decide whether a no-fault kernel read from @unsafe_src may be attempted.
 * @size is not consulted by this implementation.
 *
 * Returns false for addresses below TASK_SIZE_MAX + PAGE_SIZE and for the
 * vsyscall page, true for any other address while 'x86_virt_bits' is still
 * zero, and otherwise the result of the canonical-address check.
 */
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
        const unsigned long addr = (unsigned long)unsafe_src;

        /*
         * Refuse userspace addresses (normal userspace plus the userspace
         * guard page).  Also refuse the vsyscall page: reading from it may
         * cause an unhandled fault in certain cases, and although it sits
         * above TASK_SIZE_MAX it is usually treated as a user space address.
         */
        if (addr < TASK_SIZE_MAX + PAGE_SIZE || is_vsyscall_vaddr(addr))
                return false;

        /*
         * Before 'x86_virt_bits' is initialized during early boot there is
         * nothing to check against, so allow everything.  Needed for
         * instruction decoding in early exception handlers.
         */
        if (boot_cpu_data.x86_virt_bits == 0)
                return true;

        return __is_canonical_address(addr, boot_cpu_data.x86_virt_bits);
}
#else
/*
 * Fallback variant used when CONFIG_X86_64 is not set: a no-fault kernel
 * read is allowed only for addresses at or above TASK_SIZE_MAX.  @size is
 * not consulted.
 */
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
        unsigned long vaddr = (unsigned long)unsafe_src;

        return vaddr >= TASK_SIZE_MAX;
}
#endif











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  linux/drivers/char/serial_core.h
 *
 *  Copyright (C) 2000 Deep Blue Solutions Ltd.
 */
#ifndef LINUX_SERIAL_CORE_H
#define LINUX_SERIAL_CORE_H

#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/console.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/tty.h>
#include <linux/mutex.h>
#include <linux/sysrq.h>
#include <uapi/linux/serial_core.h>

/*
 * uart_console(port) - test whether @port is the port's console device.
 *
 * With CONFIG_SERIAL_CORE_CONSOLE: true when @port has a console attached
 * and that console's index equals the port's line number.  Note that @port
 * is evaluated more than once, so it must not have side effects.
 *
 * Without CONFIG_SERIAL_CORE_CONSOLE: always 0.  The argument is still
 * evaluated (and discarded) so callers do not get unused-variable warnings;
 * it is parenthesised so that compound expressions expand correctly.
 */
#ifdef CONFIG_SERIAL_CORE_CONSOLE
#define uart_console(port) \
        ((port)->cons && (port)->cons->index == (port)->line)
#else
#define uart_console(port)      ({ (void)(port); 0; })
#endif

struct uart_port;
struct serial_struct;
struct serial_port_device;
struct device;
struct gpio_desc;

/**
 * struct uart_ops -- interface between serial_core and the driver
 *
 * This structure describes all the operations that can be done on the
 * physical hardware.
 *
 * @tx_empty: ``unsigned int ()(struct uart_port *port)``
 *
 *        This function tests whether the transmitter fifo and shifter for the
 *        @port is empty. If it is empty, this function should return
 *        %TIOCSER_TEMT, otherwise return 0. If the port does not support this
 *        operation, then it should return %TIOCSER_TEMT.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *        This call must not sleep
 *
 * @set_mctrl: ``void ()(struct uart_port *port, unsigned int mctrl)``
 *
 *        This function sets the modem control lines for @port to the state
 *        described by @mctrl. The relevant bits of @mctrl are:
 *
 *                - %TIOCM_RTS        RTS signal.
 *                - %TIOCM_DTR        DTR signal.
 *                - %TIOCM_OUT1        OUT1 signal.
 *                - %TIOCM_OUT2        OUT2 signal.
 *                - %TIOCM_LOOP        Set the port into loopback mode.
 *
 *        If the appropriate bit is set, the signal should be driven
 *        active.  If the bit is clear, the signal should be driven
 *        inactive.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @get_mctrl: ``unsigned int ()(struct uart_port *port)``
 *
 *        Returns the current state of modem control inputs of @port. The state
 *        of the outputs should not be returned, since the core keeps track of
 *        their state. The state information should include:
 *
 *                - %TIOCM_CAR        state of DCD signal
 *                - %TIOCM_CTS        state of CTS signal
 *                - %TIOCM_DSR        state of DSR signal
 *                - %TIOCM_RI        state of RI signal
 *
 *        The bit is set if the signal is currently driven active.  If
 *        the port does not support CTS, DCD or DSR, the driver should
 *        indicate that the signal is permanently active. If RI is
 *        not available, the signal should not be indicated as active.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @stop_tx: ``void ()(struct uart_port *port)``
 *
 *        Stop transmitting characters. This might be due to the CTS line
 *        becoming inactive or the tty layer indicating we want to stop
 *        transmission due to an %XOFF character.
 *
 *        The driver should stop transmitting characters as soon as possible.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @start_tx: ``void ()(struct uart_port *port)``
 *
 *        Start transmitting characters.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @throttle: ``void ()(struct uart_port *port)``
 *
 *        Notify the serial driver that input buffers for the line discipline are
 *        close to full, and it should somehow signal that no more characters
 *        should be sent to the serial port.
 *        This will be called only if hardware assisted flow control is enabled.
 *
 *        Locking: serialized with @unthrottle() and termios modification by the
 *        tty layer.
 *
 * @unthrottle: ``void ()(struct uart_port *port)``
 *
 *        Notify the serial driver that characters can now be sent to the serial
 *        port without fear of overrunning the input buffers of the line
 *        disciplines.
 *
 *        This will be called only if hardware assisted flow control is enabled.
 *
 *        Locking: serialized with @throttle() and termios modification by the
 *        tty layer.
 *
 * @send_xchar: ``void ()(struct uart_port *port, char ch)``
 *
 *        Transmit a high priority character, even if the port is stopped. This
 *        is used to implement XON/XOFF flow control and tcflow(). If the serial
 *        driver does not implement this function, the tty core will append the
 *        character to the circular buffer and then call start_tx() / stop_tx()
 *        to flush the data out.
 *
 *        Do not transmit if @ch == '\0' (%__DISABLED_CHAR).
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @start_rx: ``void ()(struct uart_port *port)``
 *
 *        Start receiving characters.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @stop_rx: ``void ()(struct uart_port *port)``
 *
 *        Stop receiving characters; the @port is in the process of being closed.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @enable_ms: ``void ()(struct uart_port *port)``
 *
 *        Enable the modem status interrupts.
 *
 *        This method may be called multiple times. Modem status interrupts
 *        should be disabled when the @shutdown() method is called.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @break_ctl: ``void ()(struct uart_port *port, int ctl)``
 *
 *        Control the transmission of a break signal. If @ctl is nonzero, the
 *        break signal should be transmitted. The signal should be terminated
 *        when another call is made with a zero @ctl.
 *
 *        Locking: caller holds tty_port->mutex
 *
 * @startup: ``int ()(struct uart_port *port)``
 *
 *        Grab any interrupt resources and initialise any low level driver state.
 *        Enable the port for reception. It should not activate RTS nor DTR;
 *        this will be done via a separate call to @set_mctrl().
 *
 *        This method will only be called when the port is initially opened.
 *
 *        Locking: port_sem taken.
 *        Interrupts: globally disabled.
 *
 * @shutdown: ``void ()(struct uart_port *port)``
 *
 *        Disable the @port, disable any break condition that may be in effect,
 *        and free any interrupt resources. It should not disable RTS nor DTR;
 *        this will have already been done via a separate call to @set_mctrl().
 *
 *        Drivers must not access @port->state once this call has completed.
 *
 *        This method will only be called when there are no more users of this
 *        @port.
 *
 *        Locking: port_sem taken.
 *        Interrupts: caller dependent.
 *
 * @flush_buffer: ``void ()(struct uart_port *port)``
 *
 *        Flush any write buffers, reset any DMA state and stop any ongoing DMA
 *        transfers.
 *
 *        This will be called whenever the @port->state->xmit circular buffer is
 *        cleared.
 *
 *        Locking: @port->lock taken.
 *        Interrupts: locally disabled.
 *        This call must not sleep
 *
 * @set_termios: ``void ()(struct uart_port *port, struct ktermios *new,
 *                        const struct ktermios *old)``
 *
 *        Change the @port parameters, including word length, parity, stop bits.
 *        Update @port->read_status_mask and @port->ignore_status_mask to
 *        indicate the types of events we are interested in receiving. Relevant
 *        ktermios::c_cflag bits are:
 *
 *        - %CSIZE - word size
 *        - %CSTOPB - 2 stop bits
 *        - %PARENB - parity enable
 *        - %PARODD - odd parity (when %PARENB is in force)
 *        - %ADDRB - address bit (changed through uart_port::rs485_config()).
 *        - %CREAD - enable reception of characters (if not set, still receive
 *          characters from the port, but throw them away).
 *        - %CRTSCTS - if set, enable CTS status change reporting.
 *        - %CLOCAL - if not set, enable modem status change reporting.
 *
 *        Relevant ktermios::c_iflag bits are:
 *
 *        - %INPCK - enable frame and parity error events to be passed to the TTY
 *          layer.
 *        - %BRKINT / %PARMRK - both of these enable break events to be passed to
 *          the TTY layer.
 *        - %IGNPAR - ignore parity and framing errors.
 *        - %IGNBRK - ignore break errors. If %IGNPAR is also set, ignore overrun
 *          errors as well.
 *
 *        The interaction of the ktermios::c_iflag bits is as follows (parity
 *        error given as an example):
 *
 *        ============ ======= ======= =========================================
 *        Parity error INPCK   IGNPAR
 *        ============ ======= ======= =========================================
 *        n/a          0       n/a     character received, marked as %TTY_NORMAL
 *        None         1       n/a     character received, marked as %TTY_NORMAL
 *        Yes          1       0       character received, marked as %TTY_PARITY
 *        Yes          1       1       character discarded
 *        ============ ======= ======= =========================================
 *
 *        Other flags may be used (eg, xon/xoff characters) if your hardware
 *        supports hardware "soft" flow control.
 *
 *        Locking: caller holds tty_port->mutex
 *        Interrupts: caller dependent.
 *        This call must not sleep
 *
 * @set_ldisc: ``void ()(struct uart_port *port, struct ktermios *termios)``
 *
 *        Notifier for discipline change. See
 *        Documentation/driver-api/tty/tty_ldisc.rst.
 *
 *        Locking: caller holds tty_port->mutex
 *
 * @pm: ``void ()(struct uart_port *port, unsigned int state,
 *                 unsigned int oldstate)``
 *
 *        Perform any power management related activities on the specified @port.
 *        @state indicates the new state (defined by enum uart_pm_state),
 *        @oldstate indicates the previous state.
 *
 *        This function should not be used to grab any resources.
 *
 *        This will be called when the @port is initially opened and finally
 *        closed, except when the @port is also the system console. This will
 *        occur even if %CONFIG_PM is not set.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @type: ``const char *()(struct uart_port *port)``
 *
 *        Return a pointer to a string constant describing the specified @port,
 *        or return %NULL, in which case the string 'unknown' is substituted.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @release_port: ``void ()(struct uart_port *port)``
 *
 *        Release any memory and IO region resources currently in use by the
 *        @port.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @request_port: ``int ()(struct uart_port *port)``
 *
 *        Request any memory and IO region resources required by the port. If any
 *        fail, no resources should be registered when this function returns, and
 *        it should return -%EBUSY on failure.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @config_port: ``void ()(struct uart_port *port, int type)``
 *
 *        Perform any autoconfiguration steps required for the @port. @type
 *        contains a bit mask of the required configuration. %UART_CONFIG_TYPE
 *        indicates that the port requires detection and identification.
 *        @port->type should be set to the type found, or %PORT_UNKNOWN if no
 *        port was detected.
 *
 *        %UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
 *        which should be probed using standard kernel autoprobing techniques.
 *        This is not necessary on platforms where ports have interrupts
 *        internally hard wired (eg, system on a chip implementations).
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @verify_port: ``int ()(struct uart_port *port,
 *                        struct serial_struct *serinfo)``
 *
 *        Verify the new serial port information contained within @serinfo is
 *        suitable for this port type.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @ioctl: ``int ()(struct uart_port *port, unsigned int cmd,
 *                unsigned long arg)``
 *
 *        Perform any port specific IOCTLs. IOCTL commands must be defined using
 *        the standard numbering system found in <asm/ioctl.h>.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *
 * @poll_init: ``int ()(struct uart_port *port)``
 *
 *        Called by kgdb to perform the minimal hardware initialization needed to
 *        support @poll_put_char() and @poll_get_char(). Unlike @startup(), this
 *        should not request interrupts.
 *
 *        Locking: %tty_mutex and tty_port->mutex taken.
 *        Interrupts: n/a.
 *
 * @poll_put_char: ``void ()(struct uart_port *port, unsigned char ch)``
 *
 *        Called by kgdb to write a single character @ch directly to the serial
 *        @port. It can and should block until there is space in the TX FIFO.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *        This call must not sleep
 *
 * @poll_get_char: ``int ()(struct uart_port *port)``
 *
 *        Called by kgdb to read a single character directly from the serial
 *        port. If data is available, it should be returned; otherwise the
 *        function should return %NO_POLL_CHAR immediately.
 *
 *        Locking: none.
 *        Interrupts: caller dependent.
 *        This call must not sleep
 */
struct uart_ops {
        /* transmitter state and modem control lines */
        unsigned int (*tx_empty)(struct uart_port *port);
        void (*set_mctrl)(struct uart_port *port, unsigned int mctrl);
        unsigned int (*get_mctrl)(struct uart_port *port);

        /* transmit / receive flow */
        void (*stop_tx)(struct uart_port *port);
        void (*start_tx)(struct uart_port *port);
        void (*throttle)(struct uart_port *port);
        void (*unthrottle)(struct uart_port *port);
        void (*send_xchar)(struct uart_port *port, char ch);
        void (*stop_rx)(struct uart_port *port);
        void (*start_rx)(struct uart_port *port);
        void (*enable_ms)(struct uart_port *port);
        void (*break_ctl)(struct uart_port *port, int ctl);

        /* port life cycle */
        int (*startup)(struct uart_port *port);
        void (*shutdown)(struct uart_port *port);
        void (*flush_buffer)(struct uart_port *port);

        /* line settings and power management */
        void (*set_termios)(struct uart_port *port, struct ktermios *new,
                            const struct ktermios *old);
        void (*set_ldisc)(struct uart_port *port, struct ktermios *termios);
        void (*pm)(struct uart_port *port, unsigned int state,
                   unsigned int oldstate);

        /* identification, resources and configuration */
        const char *(*type)(struct uart_port *port);
        void (*release_port)(struct uart_port *port);
        int (*request_port)(struct uart_port *port);
        void (*config_port)(struct uart_port *port, int type);
        int (*verify_port)(struct uart_port *port,
                           struct serial_struct *serinfo);
        int (*ioctl)(struct uart_port *port, unsigned int cmd,
                     unsigned long arg);
#ifdef CONFIG_CONSOLE_POLL
        /* polled character I/O, only present with CONFIG_CONSOLE_POLL */
        int (*poll_init)(struct uart_port *port);
        void (*poll_put_char)(struct uart_port *port, unsigned char ch);
        int (*poll_get_char)(struct uart_port *port);
#endif
};

/*
 * NO_POLL_CHAR is the value the uart_ops kernel-doc tells @poll_get_char()
 * to return when no data is available.
 */
#define NO_POLL_CHAR                0x00ff0000
/*
 * Bits of the @type mask passed to uart_ops::config_port(), per the uart_ops
 * kernel-doc: UART_CONFIG_TYPE means the port requires detection and
 * identification; UART_CONFIG_IRQ requests autoconfiguration of the
 * interrupt signal.
 */
#define UART_CONFIG_TYPE        (1 << 0)
#define UART_CONFIG_IRQ                (1 << 1)

/*
 * Set of 32-bit counters kept per port; struct uart_port embeds one as its
 * 'icount' member, where it is described as statistics.
 *
 * NOTE(review): the field names suggest modem-line change counts (cts, dsr,
 * rng, dcd), transferred character counts (rx, tx) and receive error counts
 * (frame, overrun, parity, brk, buf_overrun) -- confirm against the code
 * that updates them.
 */
struct uart_icount {
        __u32        cts;
        __u32        dsr;
        __u32        rng;
        __u32        dcd;
        __u32        rx;
        __u32        tx;
        __u32        frame;
        __u32        overrun;
        __u32        parity;
        __u32        brk;
        __u32        buf_overrun;
};

/*
 * upf_t: 64-bit __bitwise type; it is the type of uart_port::flags and of
 * the UPF_* constants, which are defined with __force casts to it.
 */
typedef u64 __bitwise upf_t;
/*
 * upstat_t: __bitwise unsigned int.  Its users are not visible in this part
 * of the file -- presumably port status bits, TODO confirm.
 */
typedef unsigned int __bitwise upstat_t;

struct uart_port {
        spinlock_t                lock;                        /* port lock */
        unsigned long                iobase;                        /* in/out[bwl] */
        unsigned char __iomem        *membase;                /* read/write[bwl] */
        unsigned int                (*serial_in)(struct uart_port *, int);
        void                        (*serial_out)(struct uart_port *, int, int);
        void                        (*set_termios)(struct uart_port *,
                                               struct ktermios *new,
                                               const struct ktermios *old);
        void                        (*set_ldisc)(struct uart_port *,
                                             struct ktermios *);
        unsigned int                (*get_mctrl)(struct uart_port *);
        void                        (*set_mctrl)(struct uart_port *, unsigned int);
        unsigned int                (*get_divisor)(struct uart_port *,
                                               unsigned int baud,
                                               unsigned int *frac);
        void                        (*set_divisor)(struct uart_port *,
                                               unsigned int baud,
                                               unsigned int quot,
                                               unsigned int quot_frac);
        int                        (*startup)(struct uart_port *port);
        void                        (*shutdown)(struct uart_port *port);
        void                        (*throttle)(struct uart_port *port);
        void                        (*unthrottle)(struct uart_port *port);
        int                        (*handle_irq)(struct uart_port *);
        void                        (*pm)(struct uart_port *, unsigned int state,
                                      unsigned int old);
        void                        (*handle_break)(struct uart_port *);
        int                        (*rs485_config)(struct uart_port *,
                                                struct ktermios *termios,
                                                struct serial_rs485 *rs485);
        int                        (*iso7816_config)(struct uart_port *,
                                                  struct serial_iso7816 *iso7816);
        unsigned int                ctrl_id;                /* optional serial core controller id */
        unsigned int                port_id;                /* optional serial core port id */
        unsigned int                irq;                        /* irq number */
        unsigned long                irqflags;                /* irq flags  */
        unsigned int                uartclk;                /* base uart clock */
        unsigned int                fifosize;                /* tx fifo size */
        unsigned char                x_char;                        /* xon/xoff char */
        unsigned char                regshift;                /* reg offset shift */

        unsigned char                iotype;                        /* io access style */

#define UPIO_UNKNOWN                ((unsigned char)~0U)        /* UCHAR_MAX */
#define UPIO_PORT                (SERIAL_IO_PORT)        /* 8b I/O port access */
#define UPIO_HUB6                (SERIAL_IO_HUB6)        /* Hub6 ISA card */
#define UPIO_MEM                (SERIAL_IO_MEM)                /* driver-specific */
#define UPIO_MEM32                (SERIAL_IO_MEM32)        /* 32b little endian */
#define UPIO_AU                        (SERIAL_IO_AU)                /* Au1x00 and RT288x type IO */
#define UPIO_TSI                (SERIAL_IO_TSI)                /* Tsi108/109 type IO */
#define UPIO_MEM32BE                (SERIAL_IO_MEM32BE)        /* 32b big endian */
#define UPIO_MEM16                (SERIAL_IO_MEM16)        /* 16b little endian */

        unsigned char                quirks;                        /* internal quirks */

        /* internal quirks must be updated while holding port mutex */
#define UPQ_NO_TXEN_TEST        BIT(0)

        unsigned int                read_status_mask;        /* driver specific */
        unsigned int                ignore_status_mask;        /* driver specific */
        struct uart_state        *state;                        /* pointer to parent state */
        struct uart_icount        icount;                        /* statistics */

        struct console                *cons;                        /* struct console, if any */
        /* flags must be updated while holding port mutex */
        upf_t                        flags;

        /*
         * These flags must be equivalent to the flags defined in
         * include/uapi/linux/tty_flags.h which are the userspace definitions
         * assigned from the serial_struct flags in uart_set_info()
         * [for bit definitions in the UPF_CHANGE_MASK]
         *
         * Bits [0..ASYNCB_LAST_USER] are userspace defined/visible/changeable
         * The remaining bits are serial-core specific and not modifiable by
         * userspace.
         */
#define UPF_FOURPORT                ((__force upf_t) ASYNC_FOURPORT       /* 1  */ )
#define UPF_SAK                        ((__force upf_t) ASYNC_SAK            /* 2  */ )
#define UPF_SPD_HI                ((__force upf_t) ASYNC_SPD_HI         /* 4  */ )
#define UPF_SPD_VHI                ((__force upf_t) ASYNC_SPD_VHI        /* 5  */ )
#define UPF_SPD_CUST                ((__force upf_t) ASYNC_SPD_CUST   /* 0x0030 */ )
#define UPF_SPD_WARP                ((__force upf_t) ASYNC_SPD_WARP   /* 0x1010 */ )
#define UPF_SPD_MASK                ((__force upf_t) ASYNC_SPD_MASK   /* 0x1030 */ )
#define UPF_SKIP_TEST                ((__force upf_t) ASYNC_SKIP_TEST      /* 6  */ )
#define UPF_AUTO_IRQ                ((__force upf_t) ASYNC_AUTO_IRQ       /* 7  */ )
#define UPF_HARDPPS_CD                ((__force upf_t) ASYNC_HARDPPS_CD     /* 11 */ )
#define UPF_SPD_SHI                ((__force upf_t) ASYNC_SPD_SHI        /* 12 */ )
#define UPF_LOW_LATENCY                ((__force upf_t) ASYNC_LOW_LATENCY    /* 13 */ )
#define UPF_BUGGY_UART                ((__force upf_t) ASYNC_BUGGY_UART     /* 14 */ )
#define UPF_MAGIC_MULTIPLIER        ((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ )

#define UPF_NO_THRE_TEST        ((__force upf_t) BIT_ULL(19))
/* Port has hardware-assisted h/w flow control */
#define UPF_AUTO_CTS                ((__force upf_t) BIT_ULL(20))
#define UPF_AUTO_RTS                ((__force upf_t) BIT_ULL(21))
#define UPF_HARD_FLOW                ((__force upf_t) (UPF_AUTO_CTS | UPF_AUTO_RTS))
/* Port has hardware-assisted s/w flow control */
#define UPF_SOFT_FLOW                ((__force upf_t) BIT_ULL(22))
#define UPF_CONS_FLOW                ((__force upf_t) BIT_ULL(23))
#define UPF_SHARE_IRQ                ((__force upf_t) BIT_ULL(24))
#define UPF_EXAR_EFR                ((__force upf_t) BIT_ULL(25))
#define UPF_BUG_THRE                ((__force upf_t) BIT_ULL(26))
/* The exact UART type is known and should not be probed.  */
#define UPF_FIXED_TYPE                ((__force upf_t) BIT_ULL(27))
#define UPF_BOOT_AUTOCONF        ((__force upf_t) BIT_ULL(28))
#define UPF_FIXED_PORT                ((__force upf_t) BIT_ULL(29))
#define UPF_DEAD                ((__force upf_t) BIT_ULL(30))
#define UPF_IOREMAP                ((__force upf_t) BIT_ULL(31))
#define UPF_FULL_PROBE                ((__force upf_t) BIT_ULL(32))

#define __UPF_CHANGE_MASK        0x17fff
#define UPF_CHANGE_MASK                ((__force upf_t) __UPF_CHANGE_MASK)
#define UPF_USR_MASK                ((__force upf_t) (UPF_SPD_MASK|UPF_LOW_LATENCY))

#if __UPF_CHANGE_MASK > ASYNC_FLAGS
#error Change mask not equivalent to userspace-visible bit defines
#endif

        /*
         * Must hold termios_rwsem, port mutex and port lock to change;
         * can hold any one lock to read.
         */
        upstat_t                status;

#define UPSTAT_CTS_ENABLE        ((__force upstat_t) (1 << 0))
#define UPSTAT_DCD_ENABLE        ((__force upstat_t) (1 << 1))
#define UPSTAT_AUTORTS                ((__force upstat_t) (1 << 2))
#define UPSTAT_AUTOCTS                ((__force upstat_t) (1 << 3))
#define UPSTAT_AUTOXOFF                ((__force upstat_t) (1 << 4))
#define UPSTAT_SYNC_FIFO        ((__force upstat_t) (1 << 5))

        bool                        hw_stopped;                /* sw-assisted CTS flow state */
        unsigned int                mctrl;                        /* current modem ctrl settings */
        unsigned int                frame_time;                /* frame timing in ns */
        unsigned int                type;                        /* port type */
        const struct uart_ops        *ops;
        unsigned int                custom_divisor;
        unsigned int                line;                        /* port index */
        unsigned int                minor;
        resource_size_t                mapbase;                /* for ioremap */
        resource_size_t                mapsize;
        struct device                *dev;                        /* serial port physical parent device */
        struct serial_port_device *port_dev;                /* serial core port device */

        unsigned long                sysrq;                        /* sysrq timeout */
        u8                        sysrq_ch;                /* char for sysrq */
        unsigned char                has_sysrq;
        unsigned char                sysrq_seq;                /* index in sysrq_toggle_seq */

        unsigned char                hub6;                        /* this should be in the 8250 driver */
        unsigned char                suspended;
        unsigned char                console_reinit;
        const char                *name;                        /* port name */
        struct attribute_group        *attr_group;                /* port specific attributes */
        const struct attribute_group **tty_groups;        /* all attributes (serial core use only) */
        struct serial_rs485     rs485;
        struct serial_rs485        rs485_supported;        /* Supported mask for serial_rs485 */
        struct gpio_desc        *rs485_term_gpio;        /* enable RS485 bus termination */
        struct gpio_desc        *rs485_rx_during_tx_gpio; /* Output GPIO that sets the state of RS485 RX during TX */
        struct serial_iso7816   iso7816;
        void                        *private_data;                /* generic platform data pointer */
};

/**
 * uart_port_lock - Lock the UART port
 * @up:                Pointer to UART port structure
 *
 * Acquires @up->lock with spin_lock(). Release it with uart_port_unlock().
 */
static inline void uart_port_lock(struct uart_port *up)
{
        spin_lock(&up->lock);
}

/**
 * uart_port_lock_irq - Lock the UART port and disable interrupts
 * @up:                Pointer to UART port structure
 *
 * Acquires @up->lock with spin_lock_irq(). Release it with
 * uart_port_unlock_irq().
 */
static inline void uart_port_lock_irq(struct uart_port *up)
{
        spin_lock_irq(&up->lock);
}

/**
 * uart_port_lock_irqsave - Lock the UART port, save and disable interrupts
 * @up:                Pointer to UART port structure
 * @flags:        Pointer to interrupt flags storage
 *
 * Acquires @up->lock with spin_lock_irqsave(); the saved interrupt state is
 * written to *@flags. Hand that value to uart_port_unlock_irqrestore() to
 * release the lock.
 */
static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags)
{
        spin_lock_irqsave(&up->lock, *flags);
}

/**
 * uart_port_trylock - Try to lock the UART port
 * @up:                Pointer to UART port structure
 *
 * Single attempt on @up->lock via spin_trylock(); the result of that call is
 * returned as a bool.
 *
 * Returns: True if lock was acquired, false otherwise
 */
static inline bool uart_port_trylock(struct uart_port *up)
{
        return spin_trylock(&up->lock);
}

/**
 * uart_port_trylock_irqsave - Try to lock the UART port, save and disable interrupts
 * @up:                Pointer to UART port structure
 * @flags:        Pointer to interrupt flags storage
 *
 * Single attempt on @up->lock via spin_trylock_irqsave(), with *@flags used as
 * the interrupt flags storage.
 *
 * Returns: True if lock was acquired, false otherwise
 */
static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags)
{
        return spin_trylock_irqsave(&up->lock, *flags);
}

/**
 * uart_port_unlock - Unlock the UART port
 * @up:                Pointer to UART port structure
 *
 * Releases @up->lock with spin_unlock().
 */
static inline void uart_port_unlock(struct uart_port *up)
{
        spin_unlock(&up->lock);
}

/**
 * uart_port_unlock_irq - Unlock the UART port and re-enable interrupts
 * @up:                Pointer to UART port structure
 *
 * Releases @up->lock with spin_unlock_irq().
 */
static inline void uart_port_unlock_irq(struct uart_port *up)
{
        spin_unlock_irq(&up->lock);
}

/**
 * uart_port_unlock_irqrestore - Unlock the UART port, restore interrupts
 * @up:                Pointer to UART port structure
 * @flags:        The saved interrupt flags for restore
 *
 * Releases @up->lock with spin_unlock_irqrestore(). Unlike the locking
 * helpers, @flags is passed by value here.
 */
static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags)
{
        spin_unlock_irqrestore(&up->lock, flags);
}

/*
 * Read from the port through its serial_in() callback.
 * @offset is passed through unchanged; the callback's result is returned
 * as an int.
 */
static inline int serial_port_in(struct uart_port *up, int offset)
{
        return up->serial_in(up, offset);
}

/*
 * Write @value to the port through its serial_out() callback.
 * @offset and @value are passed through unchanged.
 */
static inline void serial_port_out(struct uart_port *up, int offset, int value)
{
        up->serial_out(up, offset, value);
}

/**
 * enum uart_pm_state - power states for UARTs
 * @UART_PM_STATE_ON: UART is powered, up and operational
 * @UART_PM_STATE_OFF: UART is powered off
 * @UART_PM_STATE_UNDEFINED: sentinel (implicitly 4, the value following
 *                           @UART_PM_STATE_OFF)
 */
enum uart_pm_state {
        UART_PM_STATE_ON = 0,
        UART_PM_STATE_OFF = 3, /* number taken from ACPI */
        UART_PM_STATE_UNDEFINED,
};

/*
 * This is the state information which is persistent across opens.
 *
 * struct uart_port points back at its uart_state through its ->state member,
 * and the transmit helpers in this header reach the tty_port (and its
 * xmit_fifo) through ->state->port.
 */
struct uart_state {
        /* embedded tty port; holds the tty pointer and the xmit_fifo */
        struct tty_port                port;

        /* current power state, one of enum uart_pm_state */
        enum uart_pm_state        pm_state;

        /* NOTE(review): users of refcount/remove_wait are not visible in this
         * header; presumably they guard uart_port against removal - confirm.
         */
        atomic_t                refcount;
        wait_queue_head_t        remove_wait;
        /* the uart_port associated with this state */
        struct uart_port        *uart_port;
};

/* Transmit buffer size: one page. */
#define UART_XMIT_SIZE        PAGE_SIZE


/*
 * number of characters left in xmit buffer before we ask for more
 * (__uart_port_tx() calls uart_write_wakeup() once fewer than this many
 * characters remain queued)
 */
#define WAKEUP_CHARS                256

/**
 * uart_xmit_advance - Advance xmit buffer and account Tx'ed chars
 * @up: uart_port structure describing the port
 * @chars: number of characters sent
 *
 * Drops @chars characters from the port's xmit kfifo and adds the same
 * amount to the transmit counter (@up's icount.tx).
 */
static inline void uart_xmit_advance(struct uart_port *up, unsigned int chars)
{
        kfifo_skip_count(&up->state->port.xmit_fifo, chars);
        up->icount.tx += chars;
}

/*
 * Copy up to @chars characters from the port's xmit kfifo into @buf.
 * The number actually copied is added to @up's icount.tx and returned.
 */
static inline unsigned int uart_fifo_out(struct uart_port *up,
                unsigned char *buf, unsigned int chars)
{
        unsigned int copied;

        copied = kfifo_out(&up->state->port.xmit_fifo, buf, chars);
        up->icount.tx += copied;

        return copied;
}

/*
 * Fetch one character from the port's xmit kfifo into *@ch.
 * The kfifo_get() result is added to @up's icount.tx and returned to the
 * caller.
 */
static inline unsigned int uart_fifo_get(struct uart_port *up,
                unsigned char *ch)
{
        unsigned int fetched = kfifo_get(&up->state->port.xmit_fifo, ch);

        up->icount.tx += fetched;

        return fetched;
}

struct module;
struct tty_driver;

/*
 * Driver description handed to uart_register_driver() /
 * uart_unregister_driver() and to uart_add_one_port() and friends.
 */
struct uart_driver {
        /* module that owns this driver */
        struct module                *owner;
        const char                *driver_name;
        const char                *dev_name;
        /* NOTE(review): presumably the tty major/first minor and the number
         * of ports (nr) - the consumers are not visible in this header.
         */
        int                         major;
        int                         minor;
        int                         nr;
        /* console associated with this driver, if any */
        struct console                *cons;

        /*
         * these are private; the low level driver should not
         * touch these; they should be initialised to NULL
         */
        struct uart_state        *state;
        struct tty_driver        *tty_driver;
};

void uart_write_wakeup(struct uart_port *port);

/**
 * enum UART_TX_FLAGS -- flags for uart_port_tx_flags()
 *
 * @UART_TX_NOSTOP: don't call port->ops->stop_tx() on empty buffer
 *
 * The values are bit flags; __uart_port_tx() tests them with a bitwise AND.
 */
enum UART_TX_FLAGS {
        UART_TX_NOSTOP = BIT(0),
};

/*
 * __uart_port_tx - common body of the uart_port_tx*() helpers
 *
 * Loops while both @for_test and @tx_ready evaluate to true. Each iteration
 * either sends a pending x_char (which is then cleared) or, unless
 * uart_tx_stopped() reports the port stopped, fetches one character from the
 * xmit kfifo into @ch and evaluates @put_char. After every completed
 * iteration @for_post is evaluated and icount.tx is incremented. The loop
 * ends early when transmission is stopped or the kfifo is empty.
 *
 * @tx_done is evaluated once after the loop. If fewer than WAKEUP_CHARS
 * characters remain queued, uart_write_wakeup() is called; if nothing
 * remains, UART_TX_NOSTOP is not set in @flags and ops->tx_empty() is true,
 * ops->stop_tx() is called as well.
 *
 * Evaluates to the number of characters left in the xmit kfifo.
 *
 * All macro-local variables carry a "__" prefix so that they cannot shadow
 * identifiers used inside the caller-supplied expressions.
 */
#define __uart_port_tx(uport, ch, flags, tx_ready, put_char, tx_done,         \
                       for_test, for_post)                                    \
({                                                                            \
        struct uart_port *__port = (uport);                                   \
        struct tty_port *__tport = &__port->state->port;                      \
        unsigned int __pending;                                               \
                                                                              \
        for (; (for_test) && (tx_ready); (for_post), __port->icount.tx++) {   \
                if (__port->x_char) {                                         \
                        (ch) = __port->x_char;                                \
                        (put_char);                                           \
                        __port->x_char = 0;                                   \
                        continue;                                             \
                }                                                             \
                                                                              \
                if (uart_tx_stopped(__port))                                  \
                        break;                                                \
                                                                              \
                if (!kfifo_get(&__tport->xmit_fifo, &(ch)))                   \
                        break;                                                \
                                                                              \
                (put_char);                                                   \
        }                                                                     \
                                                                              \
        (tx_done);                                                            \
                                                                              \
        __pending = kfifo_len(&__tport->xmit_fifo);                           \
        if (__pending < WAKEUP_CHARS) {                                       \
                uart_write_wakeup(__port);                                    \
                                                                              \
                if (!((flags) & UART_TX_NOSTOP) && __pending == 0 &&          \
                    __port->ops->tx_empty(__port))                            \
                        __port->ops->stop_tx(__port);                         \
        }                                                                     \
                                                                              \
        __pending;                                                            \
})

/**
 * uart_port_tx_limited -- transmit helper for uart_port with count limiting
 * @port: uart port
 * @ch: variable to store a character to be written to the HW
 * @count: a limit of characters to send
 * @tx_ready: can HW accept more data function
 * @put_char: function to write a character
 * @tx_done: function to call after the loop is done
 *
 * This helper transmits characters from the xmit buffer to the hardware using
 * @put_char(). It does so until @count characters are sent and while @tx_ready
 * evaluates to true.
 *
 * @count is evaluated exactly once and copied into a local counter. The
 * counter is decremented after every loop iteration, including one that sent
 * a pending x_char, so an x_char consumes part of the budget too.
 *
 * Returns: the number of characters in the xmit buffer when done.
 *
 * The expression in macro parameters shall be designed as follows:
 *  * **tx_ready:** should evaluate to true if the HW can accept more data to
 *    be sent. This parameter can be %true, which means the HW is always ready.
 *  * **put_char:** shall write @ch to the device of @port.
 *  * **tx_done:** when the write loop is done, this can perform arbitrary
 *    action before potential invocation of ops->stop_tx() happens. If the
 *    driver does not need to do anything, use e.g. ({}).
 *
 * For all of them, @port->lock is held, interrupts are locally disabled and
 * the expressions must not sleep.
 */
#define uart_port_tx_limited(port, ch, count, tx_ready, put_char, tx_done) ({ \
        unsigned int __count = (count);                                              \
        __uart_port_tx(port, ch, 0, tx_ready, put_char, tx_done, __count,     \
                        __count--);                                              \
})

/**
 * uart_port_tx -- transmit helper for uart_port
 * @port: uart port
 * @ch: variable to store a character to be written to the HW
 * @tx_ready: can HW accept more data function
 * @put_char: function to write a character
 *
 * Expands to __uart_port_tx() with no flags, no character limit and an empty
 * tx_done action.
 *
 * Returns: the number of characters in the xmit buffer when done.
 *
 * See uart_port_tx_limited() for more details.
 */
#define uart_port_tx(port, ch, tx_ready, put_char)                        \
        __uart_port_tx(port, ch, 0, tx_ready, put_char, ({}), true, ({}))


/**
 * uart_port_tx_flags -- transmit helper for uart_port with flags
 * @port: uart port
 * @ch: variable to store a character to be written to the HW
 * @flags: %UART_TX_NOSTOP or similar
 * @tx_ready: can HW accept more data function
 * @put_char: function to write a character
 *
 * Expands to __uart_port_tx() with @flags passed through, no character limit
 * and an empty tx_done action.
 *
 * Returns: the number of characters in the xmit buffer when done.
 *
 * See uart_port_tx_limited() for more details.
 */
#define uart_port_tx_flags(port, ch, flags, tx_ready, put_char)                \
        __uart_port_tx(port, ch, flags, tx_ready, put_char, ({}), true, ({}))
/*
 * Baud rate helpers.
 */
void uart_update_timeout(struct uart_port *port, unsigned int cflag,
                         unsigned int baud);
unsigned int uart_get_baud_rate(struct uart_port *port, struct ktermios *termios,
                                const struct ktermios *old, unsigned int min,
                                unsigned int max);
unsigned int uart_get_divisor(struct uart_port *port, unsigned int baud);

/*
 * Calculates FIFO drain time, in jiffies.
 *
 * The estimate is one frame time (ns) per FIFO entry plus 20 ms of slop,
 * converted with nsecs_to_jiffies(). The result is never less than 1.
 */
static inline unsigned long uart_fifo_timeout(struct uart_port *port)
{
        u64 drain_ns = (u64)READ_ONCE(port->frame_time) * port->fifosize +
                       20 * NSEC_PER_MSEC;        /* .02 seconds of slop */
        unsigned long ticks = nsecs_to_jiffies(drain_ns);

        if (!ticks)
                return 1UL;

        return ticks;
}

/*
 * Base timer interval for polling, in jiffies.
 *
 * Derived from uart_fifo_timeout(): half of it minus two, or 1 when the
 * FIFO timeout is 6 or less.
 */
static inline unsigned long uart_poll_timeout(struct uart_port *port)
{
        unsigned long fifo_ticks = uart_fifo_timeout(port);

        if (fifo_ticks <= 6)
                return 1;

        return fifo_ticks / 2 - 2;
}

/*
 * Console helpers.
 */
/*
 * Early console instance; this is the object passed to the ->setup()
 * callback of struct earlycon_id.
 */
struct earlycon_device {
        struct console *con;
        /* embedded (not pointed-to) port description */
        struct uart_port port;
        char options[32];                /* e.g., 115200n8 */
        /* NOTE(review): presumably the baud rate parsed from options - confirm */
        unsigned int baud;
};

struct earlycon_id {
        char        name[15];
        char        name_term;        /* In case compiler didn't '\0' term name */
        char        compatible[128];
        int        (*setup)(struct earlycon_device *, const char *options);
};

extern const struct earlycon_id __earlycon_table[];
extern const struct earlycon_id __earlycon_table_end[];

/*
 * Attribute applied to earlycon table entries: __used when earlycon support
 * is enabled and the code is not built as a module, __maybe_unused otherwise.
 */
#if defined(CONFIG_SERIAL_EARLYCON) && !defined(MODULE)
#define EARLYCON_USED_OR_UNUSED        __used
#else
#define EARLYCON_USED_OR_UNUSED        __maybe_unused
#endif

/*
 * OF_EARLYCON_DECLARE - define a struct earlycon_id in "__earlycon_table"
 * @_name:  bare identifier; stringified into .name and used to build the
 *          unique symbol name
 * @compat: string stored in .compatible
 * @fn:     setup callback stored in .setup
 *
 * The entry is placed in the "__earlycon_table" section, aligned like
 * struct earlycon_id.
 */
#define OF_EARLYCON_DECLARE(_name, compat, fn)                                \
        static const struct earlycon_id __UNIQUE_ID(__earlycon_##_name) \
                EARLYCON_USED_OR_UNUSED  __section("__earlycon_table")  \
                __aligned(__alignof__(struct earlycon_id))                \
                = { .name = __stringify(_name),                                \
                    .compatible = compat,                                \
                    .setup = fn }

/* Same as OF_EARLYCON_DECLARE() with an empty compatible string. */
#define EARLYCON_DECLARE(_name, fn)        OF_EARLYCON_DECLARE(_name, "", fn)

int of_setup_earlycon(const struct earlycon_id *match, unsigned long node,
                      const char *options);

/*
 * Without CONFIG_SERIAL_EARLYCON, setup_earlycon() is a stub that returns 0
 * and earlycon_acpi_spcr_enable is a zero-initialized (false) constant.
 */
#ifdef CONFIG_SERIAL_EARLYCON
extern bool earlycon_acpi_spcr_enable __initdata;
int setup_earlycon(char *buf);
#else
static const bool earlycon_acpi_spcr_enable EARLYCON_USED_OR_UNUSED;
static inline int setup_earlycon(char *buf) { return 0; }
#endif

/*
 * Variant of uart_console_registered() when the console_list_lock is held.
 * True only if the port is a console port and its console is registered.
 */
static inline bool uart_console_registered_locked(struct uart_port *port)
{
        if (!uart_console(port))
                return false;

        return console_is_registered_locked(port->cons);
}

/*
 * True only if the port is a console port (uart_console()) and its console
 * is currently registered (console_is_registered()).
 */
static inline bool uart_console_registered(struct uart_port *port)
{
        if (!uart_console(port))
                return false;

        return console_is_registered(port->cons);
}

struct uart_port *uart_get_console(struct uart_port *ports, int nr,
                                   struct console *c);
int uart_parse_earlycon(char *p, unsigned char *iotype, resource_size_t *addr,
                        char **options);
void uart_parse_options(const char *options, int *baud, int *parity, int *bits,
                        int *flow);
int uart_set_options(struct uart_port *port, struct console *co, int baud,
                     int parity, int bits, int flow);
struct tty_driver *uart_console_device(struct console *co, int *index);
void uart_console_write(struct uart_port *port, const char *s,
                        unsigned int count,
                        void (*putchar)(struct uart_port *, unsigned char));

/*
 * Port/driver registration/removal
 */
int uart_register_driver(struct uart_driver *uart);
void uart_unregister_driver(struct uart_driver *uart);
int uart_add_one_port(struct uart_driver *reg, struct uart_port *port);
void uart_remove_one_port(struct uart_driver *reg, struct uart_port *port);
int uart_read_port_properties(struct uart_port *port);
int uart_read_and_validate_port_properties(struct uart_port *port);
bool uart_match_port(const struct uart_port *port1,
                const struct uart_port *port2);

/*
 * Power Management
 */
int uart_suspend_port(struct uart_driver *reg, struct uart_port *port);
int uart_resume_port(struct uart_driver *reg, struct uart_port *port);

/*
 * Returns 1 if transmission is stopped, either because the attached tty (if
 * any) has flow.stopped set or because the port's hw_stopped flag is set;
 * returns 0 otherwise.
 */
static inline int uart_tx_stopped(struct uart_port *port)
{
        struct tty_struct *tty = port->state->port.tty;

        return ((tty && tty->flow.stopped) || port->hw_stopped) ? 1 : 0;
}

/* True when UPSTAT_CTS_ENABLE is set in the port status. */
static inline bool uart_cts_enabled(struct uart_port *uport)
{
        return (uport->status & UPSTAT_CTS_ENABLE) != 0;
}

/*
 * True when CTS flow control is enabled (UPSTAT_CTS_ENABLE set) while
 * UPSTAT_AUTOCTS is clear.
 */
static inline bool uart_softcts_mode(struct uart_port *uport)
{
        return (uport->status & (UPSTAT_CTS_ENABLE | UPSTAT_AUTOCTS)) ==
               UPSTAT_CTS_ENABLE;
}

/*
 * The following are helper functions for the low level drivers.
 */

void uart_handle_dcd_change(struct uart_port *uport, bool active);
void uart_handle_cts_change(struct uart_port *uport, bool active);

void uart_insert_char(struct uart_port *port, unsigned int status,
                      unsigned int overrun, u8 ch, u8 flag);

void uart_xchar_out(struct uart_port *uport, int offset);

#ifdef CONFIG_MAGIC_SYSRQ_SERIAL
#define SYSRQ_TIMEOUT        (HZ * 5)

bool uart_try_toggle_sysrq(struct uart_port *port, u8 ch);

/*
 * Handle a received character while a SysRq sequence may be armed.
 *
 * Returns 0 immediately if port->sysrq is not armed. If @ch is non-zero and
 * the port->sysrq deadline has not passed yet, the character is either
 * handed to handle_sysrq() (when sysrq_mask() is non-zero) or offered to
 * uart_try_toggle_sysrq(). Returns 1 if the character was consumed that way.
 * In every case except a successful toggle attempt, port->sysrq is cleared.
 */
static inline int uart_handle_sysrq_char(struct uart_port *port, u8 ch)
{
        int handled = 0;

        if (!port->sysrq)
                return 0;

        if (!ch || !time_before(jiffies, port->sysrq))
                goto disarm;

        if (sysrq_mask()) {
                handle_sysrq(ch);
                handled = 1;
                goto disarm;
        }

        /* The toggle path returns without touching port->sysrq here. */
        if (uart_try_toggle_sysrq(port, ch))
                return 1;

disarm:
        port->sysrq = 0;

        return handled;
}

/*
 * Like uart_handle_sysrq_char(), but instead of calling handle_sysrq()
 * directly the character is stored in port->sysrq_ch, from where
 * uart_unlock_and_check_sysrq*() picks it up after dropping the port lock.
 *
 * Returns 1 if @ch was captured (or consumed by uart_try_toggle_sysrq()),
 * 0 otherwise.
 */
static inline int uart_prepare_sysrq_char(struct uart_port *port, u8 ch)
{
        int captured = 0;

        if (!port->sysrq)
                return 0;

        if (!ch || !time_before(jiffies, port->sysrq))
                goto disarm;

        if (sysrq_mask()) {
                port->sysrq_ch = ch;
                captured = 1;
                goto disarm;
        }

        /* The toggle path returns without touching port->sysrq here. */
        if (uart_try_toggle_sysrq(port, ch))
                return 1;

disarm:
        port->sysrq = 0;

        return captured;
}

/*
 * Drop the port lock and then process a SysRq character that was stashed in
 * port->sysrq_ch. The stashed character is fetched and cleared before the
 * unlock; handle_sysrq() runs after it. Ports without has_sysrq are simply
 * unlocked.
 */
static inline void uart_unlock_and_check_sysrq(struct uart_port *port)
{
        u8 stashed = 0;

        if (port->has_sysrq) {
                stashed = port->sysrq_ch;
                port->sysrq_ch = 0;
        }

        uart_port_unlock(port);

        if (stashed)
                handle_sysrq(stashed);
}

/*
 * Same as uart_unlock_and_check_sysrq(), but the lock is released with
 * uart_port_unlock_irqrestore() using the caller's saved @flags.
 */
static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port,
                unsigned long flags)
{
        u8 stashed = 0;

        if (port->has_sysrq) {
                stashed = port->sysrq_ch;
                port->sysrq_ch = 0;
        }

        uart_port_unlock_irqrestore(port, flags);

        if (stashed)
                handle_sysrq(stashed);
}
#else        /* CONFIG_MAGIC_SYSRQ_SERIAL */
/* Stub used without CONFIG_MAGIC_SYSRQ_SERIAL: never consumes @ch, returns 0. */
static inline int uart_handle_sysrq_char(struct uart_port *port, u8 ch)
{
        return 0;
}
/* Stub used without CONFIG_MAGIC_SYSRQ_SERIAL: never captures @ch, returns 0. */
static inline int uart_prepare_sysrq_char(struct uart_port *port, u8 ch)
{
        return 0;
}
/* Stub used without CONFIG_MAGIC_SYSRQ_SERIAL: only drops the port lock. */
static inline void uart_unlock_and_check_sysrq(struct uart_port *port)
{
        uart_port_unlock(port);
}
/*
 * Stub used without CONFIG_MAGIC_SYSRQ_SERIAL: only drops the port lock and
 * restores the interrupt state from @flags.
 */
static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port,
                unsigned long flags)
{
        uart_port_unlock_irqrestore(port, flags);
}
#endif        /* CONFIG_MAGIC_SYSRQ_SERIAL */

/*
 * We do the SysRQ and SAK checking like this...
 *
 * The port's optional handle_break() callback runs first. With
 * CONFIG_MAGIC_SYSRQ_SERIAL, a break on a SysRq-capable console port arms
 * port->sysrq with a deadline of jiffies + SYSRQ_TIMEOUT and returns 1; if it
 * was already armed it is disarmed instead. Otherwise, when UPF_SAK is set,
 * do_SAK() is called on the port's tty. Returns 0 in all cases but the
 * arming one.
 */
static inline int uart_handle_break(struct uart_port *port)
{
        struct uart_state *ustate = port->state;

        if (port->handle_break)
                port->handle_break(port);

#ifdef CONFIG_MAGIC_SYSRQ_SERIAL
        if (port->has_sysrq && uart_console(port)) {
                if (port->sysrq) {
                        /* Already armed: disarm and fall through. */
                        port->sysrq = 0;
                } else {
                        port->sysrq = jiffies + SYSRQ_TIMEOUT;
                        return 1;
                }
        }
#endif
        if (port->flags & UPF_SAK)
                do_SAK(ustate->port.tty);

        return 0;
}

/*
 *        UART_ENABLE_MS - determine if port should enable modem status irqs
 *
 *        Evaluates to non-zero when the port has UPF_HARDPPS_CD set, or when
 *        @cflag has CRTSCTS set, or when @cflag does not have CLOCAL set.
 */
#define UART_ENABLE_MS(port,cflag)        \
        ((((port)->flags) & UPF_HARDPPS_CD) || \
         ((cflag) & CRTSCTS) || \
         !((cflag) & CLOCAL))

int uart_get_rs485_mode(struct uart_port *port);
#endif /* LINUX_SERIAL_CORE_H */

















































































































































































































































































































































































    1 
    1 



    1 









































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#ifndef BTRFS_INODE_H
#define BTRFS_INODE_H

#include <linux/hash.h>
#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/compiler.h>
#include <linux/fscrypt.h>
#include <linux/lockdep.h>
#include <uapi/linux/btrfs_tree.h>
#include <trace/events/btrfs.h>
#include "block-rsv.h"
#include "btrfs_inode.h"
#include "extent_map.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "ordered-data.h"
#include "delayed-inode.h"

struct extent_state;
struct posix_acl;
struct iov_iter;
struct writeback_control;
struct btrfs_root;
struct btrfs_fs_info;
struct btrfs_trans_handle;

/*
 * Since we search a directory based on f_pos (struct dir_context::pos) we have
 * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
 * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()).
 */
#define BTRFS_DIR_START_INDEX 2

/*
 * ordered_data_close is set by truncate when a file that used
 * to have good data has been truncated to zero.  When it is set
 * the btrfs file release call will add this inode to the
 * ordered operations list so that we make sure to flush out any
 * new data the application may have written before commit.
 */
enum {
        BTRFS_INODE_FLUSH_ON_CLOSE,
        BTRFS_INODE_DUMMY,
        BTRFS_INODE_IN_DEFRAG,
        BTRFS_INODE_HAS_ASYNC_EXTENT,
        /*
         * Always set under the VFS' inode lock, otherwise it can cause races
         * during fsync (we start as a fast fsync and then end up in a full
         * fsync racing with ordered extent completion).
         */
        BTRFS_INODE_NEEDS_FULL_SYNC,
        BTRFS_INODE_COPY_EVERYTHING,
        BTRFS_INODE_HAS_PROPS,
        BTRFS_INODE_SNAPSHOT_FLUSH,
        /*
         * Set and used when logging an inode and it serves to signal that an
         * inode does not have xattrs, so subsequent fsyncs can avoid searching
         * for xattrs to log. This bit must be cleared whenever a xattr is added
         * to an inode.
         */
        BTRFS_INODE_NO_XATTRS,
        /*
         * Set when we are in a context where we need to start a transaction and
         * have dirty pages with the respective file range locked. This is to
         * ensure that when reserving space for the transaction, if we are low
         * on available space and need to flush delalloc, we will not flush
         * delalloc for this inode, because that could result in a deadlock (on
         * the file range, inode's io_tree).
         */
        BTRFS_INODE_NO_DELALLOC_FLUSH,
        /*
         * Set when we are working on enabling verity for a file. Computing and
         * writing the whole Merkle tree can take a while so we want to prevent
         * races where two separate tasks attempt to simultaneously start verity
         * on the same file.
         */
        BTRFS_INODE_VERITY_IN_PROGRESS,
        /* Set when this inode is a free space inode. */
        BTRFS_INODE_FREE_SPACE_INODE,
        /* Set when there are no capabilities in xattrs for the inode. */
        BTRFS_INODE_NO_CAP_XATTR,
        /*
         * Set if an error happened when doing a COW write before submitting a
         * bio or during writeback. Used for both buffered writes and direct IO
         * writes. This is to signal a fast fsync that it has to wait for
         * ordered extents to complete and therefore not log extent maps that
         * point to unwritten extents (when an ordered extent completes and it
         * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its
         * range).
         */
        BTRFS_INODE_COW_WRITE_ERROR,
};

/*
 * In-memory btrfs inode.
 *
 * The VFS inode is embedded as the 'vfs_inode' member; BTRFS_I() converts a
 * struct inode pointer back to the containing struct btrfs_inode.
 */
struct btrfs_inode {
        /* which subvolume this inode belongs to */
        struct btrfs_root *root;

        /* key used to find this inode on disk.  This is used by the code
         * to read in roots of subvolumes
         */
        struct btrfs_key location;

        /* Cached value of inode property 'compression'. */
        u8 prop_compress;

        /*
         * Force compression on the file using the defrag ioctl, could be
         * different from prop_compress and takes precedence if set.
         */
        u8 defrag_compress;

        /*
         * Lock for counters and all fields used to determine if the inode is in
         * the log or not (last_trans, last_sub_trans, last_log_commit,
         * logged_trans), to access/update delalloc_bytes, new_delalloc_bytes,
         * defrag_bytes, disk_i_size, outstanding_extents, csum_bytes and to
         * update the VFS' inode number of bytes used.
         */
        spinlock_t lock;

        /* the extent_tree has caches of all the extent mappings to disk */
        struct extent_map_tree extent_tree;

        /* the io_tree does range state (DIRTY, LOCKED etc) */
        struct extent_io_tree io_tree;

        /*
         * Keep track of where the inode has extent items mapped in order to
         * make sure the i_size adjustments are accurate. Not required when the
         * filesystem is NO_HOLES, the status can't be set while mounted as
         * it's a mkfs-time feature.
         */
        struct extent_io_tree *file_extent_tree;

        /* held while logging the inode in tree-log.c */
        struct mutex log_mutex;

        /*
         * Counters to keep track of the number of extent item's we may use due
         * to delalloc and such.  outstanding_extents is the number of extent
         * items we think we'll end up using, and reserved_extents is the number
         * of extent items we've reserved metadata for. Protected by 'lock'.
         *
         * NOTE(review): no 'reserved_extents' member is declared in this
         * struct; that part of the comment looks stale - confirm.
         */
        unsigned outstanding_extents;

        /* used to order data wrt metadata */
        spinlock_t ordered_tree_lock;
        struct rb_root ordered_tree;
        struct rb_node *ordered_tree_last;

        /* list of all the delalloc inodes in the FS.  There are times we need
         * to write all the delalloc pages to disk, and this list is used
         * to walk them all.
         */
        struct list_head delalloc_inodes;

        /* node for the red-black tree that links inodes in subvolume root */
        struct rb_node rb_node;

        /*
         * Bit field accessed with set_bit()/test_bit() using the BTRFS_INODE_*
         * bit numbers (for example BTRFS_INODE_FREE_SPACE_INODE and
         * BTRFS_INODE_NEEDS_FULL_SYNC).
         */
        unsigned long runtime_flags;

        /* full 64 bit generation number, struct vfs_inode doesn't have a big
         * enough field for this.
         */
        u64 generation;

        /*
         * ID of the transaction handle that last modified this inode.
         * Protected by 'lock'.
         */
        u64 last_trans;

        /*
         * ID of the transaction that last logged this inode.
         * Protected by 'lock'.
         */
        u64 logged_trans;

        /*
         * Log transaction ID when this inode was last modified.
         * Protected by 'lock'.
         */
        int last_sub_trans;

        /* A local copy of root's last_log_commit. Protected by 'lock'. */
        int last_log_commit;

        /* The two members below share storage: one is for files, one for dirs. */
        union {
                /*
                 * Total number of bytes pending delalloc, used by stat to
                 * calculate the real block usage of the file. This is used
                 * only for files. Protected by 'lock'.
                 */
                u64 delalloc_bytes;
                /*
                 * The lowest possible index of the next dir index key which
                 * points to an inode that needs to be logged.
                 * This is used only for directories.
                 * Use the helpers btrfs_get_first_dir_index_to_log() and
                 * btrfs_set_first_dir_index_to_log() to access this field.
                 */
                u64 first_dir_index_to_log;
        };

        /* The two members below share storage: one is for files, one for dirs. */
        union {
                /*
                 * Total number of bytes pending delalloc that fall within a file
                 * range that is either a hole or beyond EOF (and no prealloc extent
                 * exists in the range). This is always <= delalloc_bytes and this
                 * is used only for files. Protected by 'lock'.
                 */
                u64 new_delalloc_bytes;
                /*
                 * The offset of the last dir index key that was logged.
                 * This is used only for directories.
                 */
                u64 last_dir_index_offset;
        };

        /*
         * Total number of bytes pending defrag, used by stat to check whether
         * it needs COW. Protected by 'lock'.
         */
        u64 defrag_bytes;

        /*
         * The size of the file stored in the metadata on disk.  data=ordered
         * means the in-memory i_size might be larger than the size on disk
         * because not all the blocks are written yet. Protected by 'lock'.
         */
        u64 disk_i_size;

        /*
         * If this is a directory then index_cnt is the counter for the index
         * number for new files that are created. For an empty directory, this
         * must be initialized to BTRFS_DIR_START_INDEX.
         */
        u64 index_cnt;

        /* Cache the directory index number to speed the dir/file remove */
        u64 dir_index;

        /* the fsync log has some corner cases that mean we have to check
         * directories to see if any unlinks have been done before
         * the directory was logged.  See tree-log.c for all the
         * details
         */
        u64 last_unlink_trans;

        /*
         * The id/generation of the last transaction where this inode was
         * either the source or the destination of a clone/dedupe operation.
         * Used when logging an inode to know if there are shared extents that
         * need special care when logging checksum items, to avoid duplicate
         * checksum items in a log (which can lead to a corruption where we end
         * up with missing checksum ranges after log replay).
         * Protected by the vfs inode lock.
         */
        u64 last_reflink_trans;

        /*
         * Number of bytes outstanding that are going to need csums.  This is
         * used in ENOSPC accounting. Protected by 'lock'.
         */
        u64 csum_bytes;

        /* Backwards incompatible flags, lower half of inode_item::flags  */
        u32 flags;
        /* Read-only compatibility flags, upper half of inode_item::flags */
        u32 ro_flags;

        struct btrfs_block_rsv block_rsv;

        struct btrfs_delayed_node *delayed_node;

        /* File creation time. */
        u64 i_otime_sec;
        u32 i_otime_nsec;

        /* Hook into fs_info->delayed_iputs */
        struct list_head delayed_iput;

        /*
         * The inode's mmap lock.
         * NOTE(review): presumably the lock selected by BTRFS_ILOCK_MMAP in
         * btrfs_inode_lock()/btrfs_inode_unlock() - confirm.
         */
        struct rw_semaphore i_mmap_lock;
        /* Embedded VFS inode; BTRFS_I() applies container_of() to this member. */
        struct inode vfs_inode;
};

/* Read the first_dir_index_to_log field of @inode using READ_ONCE(). */
static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode)
{
        const u64 index = READ_ONCE(inode->first_dir_index_to_log);

        return index;
}

/* Store @index into the first_dir_index_to_log field of @inode using WRITE_ONCE(). */
static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode,
                                                    u64 index)
{
        u64 *slot = &inode->first_dir_index_to_log;

        WRITE_ONCE(*slot, index);
}

/*
 * Convert a VFS inode pointer into the struct btrfs_inode that embeds it as
 * its 'vfs_inode' member.
 */
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
        struct btrfs_inode *binode;

        binode = container_of(inode, struct btrfs_inode, vfs_inode);
        return binode;
}

/*
 * Compute a hash value from an inode objectid and the objectid of its root.
 *
 * The root objectid is multiplied by GOLDEN_RATIO_PRIME and XOR-ed with
 * @objectid. When unsigned long is 32 bits wide, the two 32-bit halves of the
 * 64-bit result are XOR-ed together before the value is narrowed.
 */
static inline unsigned long btrfs_inode_hash(u64 objectid,
                                             const struct btrfs_root *root)
{
        u64 hash;

        hash = root->root_key.objectid * GOLDEN_RATIO_PRIME;
        hash ^= objectid;

#if BITS_PER_LONG == 32
        /* Fold the upper half into the lower half so no bits are dropped. */
        hash = (hash >> 32) ^ (hash & 0xffffffff);
#endif

        return (unsigned long)hash;
}

#if BITS_PER_LONG == 32

/*
 * 32-bit variant: struct inode's i_ino is an unsigned long and therefore only
 * 32 bits wide here, so the u64 objectid from the inode's location key is
 * returned instead to avoid truncation. The exception is a location key of
 * type BTRFS_ROOT_ITEM_KEY (subvolume directory), where i_ino is returned.
 */
static inline u64 btrfs_ino(const struct btrfs_inode *inode)
{
        /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */
        if (inode->location.type == BTRFS_ROOT_ITEM_KEY)
                return inode->vfs_inode.i_ino;

        return inode->location.objectid;
}

#else

static inline u64 btrfs_ino(const struct btrfs_inode *inode)
{
        return inode->vfs_inode.i_ino;
}

#endif

/*
 * Set both the VFS i_size (through i_size_write()) and the btrfs disk_i_size
 * of @inode to @size.
 */
static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
{
        struct inode *vfs = &inode->vfs_inode;

        i_size_write(vfs, size);
        inode->disk_i_size = size;
}

/* Return true if BTRFS_INODE_FREE_SPACE_INODE is set in @inode's runtime flags. */
static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
{
        const unsigned long *flags = &inode->runtime_flags;

        return test_bit(BTRFS_INODE_FREE_SPACE_INODE, flags);
}

static inline bool is_data_inode(struct inode *inode)
{
        return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
}

/*
 * Add @mod (which may be negative) to @inode's outstanding_extents counter.
 *
 * The caller must hold inode->lock. The change is reported through the
 * btrfs_inode_mod_outstanding_extents tracepoint, except for free space
 * inodes.
 */
static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
                                                 int mod)
{
        lockdep_assert_held(&inode->lock);
        inode->outstanding_extents += mod;

        if (!btrfs_is_free_space_inode(inode))
                trace_btrfs_inode_mod_outstanding_extents(inode->root,
                                                          btrfs_ino(inode), mod,
                                                          inode->outstanding_extents);
}

/*
 * Called every time after doing a buffered, direct IO or memory mapped write.
 *
 * This is to ensure that if we write to a file that was previously fsynced in
 * the current transaction, then try to fsync it again in the same transaction,
 * we will know that there were changes in the file and that it needs to be
 * logged.
 */
static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
{
        spin_lock(&inode->lock);
        inode->last_sub_trans = inode->root->log_transid;
        spin_unlock(&inode->lock);
}

/*
 * Flag the inode as needing a full sync and make sure last_reflink_trans is
 * not older than last_trans.
 *
 * Callers must hold the inode's VFS lock in exclusive mode, or the inode's
 * mmap lock (struct btrfs_inode::i_mmap_lock) in shared or exclusive mode, or
 * run in a context where nobody else can access the inode concurrently
 * (during inode creation or when loading an inode from disk).
 */
static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
{
        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);

        /*
         * The inode may have taken part in a reflink during the last
         * transaction that modified it, after which a fsync reset
         * last_reflink_trans so that later fsyncs in the same transaction
         * skip unnecessary work. Be pessimistic, assume a reflink happened,
         * and bring last_reflink_trans up to last_trans.
         *
         * ->last_trans is protected by the inode's spinlock because a
         * concurrent ordered extent completion may update it. The value is
         * only ever raised, never lowered: we can be called when
         * last_reflink_trans already holds the current transaction generation
         * while ->last_trans has not yet been updated in the current
         * transaction and therefore is still smaller.
         */
        spin_lock(&inode->lock);
        if (inode->last_trans > inode->last_reflink_trans)
                inode->last_reflink_trans = inode->last_trans;
        spin_unlock(&inode->lock);
}

/*
 * Check, under inode->lock, whether all of the following hold:
 *  - logged_trans equals @generation,
 *  - last_sub_trans does not exceed the inode's last_log_commit,
 *  - last_sub_trans does not exceed the root's last log commit.
 *
 * Returns true only when all three conditions are met.
 */
static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
        bool in_log;

        spin_lock(&inode->lock);
        in_log = inode->logged_trans == generation &&
                 inode->last_sub_trans <= inode->last_log_commit &&
                 inode->last_sub_trans <= btrfs_get_root_last_log_commit(inode->root);
        spin_unlock(&inode->lock);

        return in_log;
}

/*
 * Check if the inode has flags compatible with compression.
 *
 * Returns false when either BTRFS_INODE_NODATACOW or BTRFS_INODE_NODATASUM is
 * set in inode->flags, true otherwise.
 */
static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
{
        return !(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM));
}

/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT                                "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes)                size, bytes

int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
                            u32 pgoff, u8 *csum, const u8 * const csum_expected);
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
                        u32 bio_offset, struct bio_vec *bv);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
                              u64 *orig_start, u64 *orig_block_len,
                              u64 *ram_bytes, bool nowait, bool strict);

void btrfs_del_delalloc_inode(struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
                       struct btrfs_inode *dir, struct btrfs_inode *inode,
                       const struct fscrypt_str *name);
int btrfs_add_link(struct btrfs_trans_handle *trans,
                   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
                   const struct fscrypt_str *name, int add_backref, u64 index);
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry);
int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
                         int front);

int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
                               bool in_reclaim_context);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                              unsigned int extra_bits,
                              struct extent_state **cached_state);

/*
 * Argument bundle shared by btrfs_new_inode_prepare(),
 * btrfs_create_new_inode() and btrfs_new_inode_args_destroy().
 */
struct btrfs_new_inode_args {
        /* Input */
        struct inode *dir;
        struct dentry *dentry;
        struct inode *inode;
        bool orphan;
        bool subvol;

        /* Output from btrfs_new_inode_prepare(), input to btrfs_create_new_inode(). */
        struct posix_acl *default_acl;
        struct posix_acl *acl;
        struct fscrypt_name fname;
};

int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
                            unsigned int *trans_num_items);
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
                           struct btrfs_new_inode_args *args);
void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
                                     struct inode *dir);
 void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
                                u32 bits);
void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
                                 struct extent_state *state, u32 bits);
void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
                                 struct extent_state *other);
void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
                                 struct extent_state *orig, u64 split);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
void btrfs_evict_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
void btrfs_free_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
                              struct btrfs_root *root, struct btrfs_path *path);
struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
                                    struct page *page, u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
                       struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
                                struct btrfs_inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
void btrfs_add_delayed_iput(struct btrfs_inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
                              u64 start, u64 num_bytes, u64 min_size,
                              loff_t actual_len, u64 *alloc_hint);
int btrfs_prealloc_file_range_trans(struct inode *inode,
                                    struct btrfs_trans_handle *trans, int mode,
                                    u64 start, u64 num_bytes, u64 min_size,
                                    loff_t actual_len, u64 *alloc_hint);
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
                             u64 start, u64 end, struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page);
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
                                             int compress_type);
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
                                          u64 file_offset, u64 disk_bytenr,
                                          u64 disk_io_size,
                                          struct page **pages);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
                           struct btrfs_ioctl_encoded_io_args *encoded);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
                               const struct btrfs_ioctl_encoded_io_args *encoded);

ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
                       size_t done_before);
struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
                                  size_t done_before);
struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino);

extern const struct dentry_operations btrfs_dentry_operations;

/*
 * Inode locking type flags, by default the exclusive lock is taken.
 *
 * NOTE(review): presumably these are OR-ed together and passed as the
 * ilock_flags argument of btrfs_inode_lock()/btrfs_inode_unlock() - confirm
 * against the implementation.
 */
enum btrfs_ilock_type {
        ENUM_BIT(BTRFS_ILOCK_SHARED),
        ENUM_BIT(BTRFS_ILOCK_TRY),
        ENUM_BIT(BTRFS_ILOCK_MMAP),
};

int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags);
void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags);
void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes,
                              const u64 del_bytes);
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);

#endif





















































    1 






















    1 





    1 


    1 



















    1 
    1 



    1 















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/list_bl.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/mbcache.h>

/*
 * Mbcache is a simple key-value store. Keys need not be unique, however
 * key-value pairs are expected to be unique (we use this fact in
 * mb_cache_entry_delete_or_get()).
 *
 * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
 * Ext4 also uses it for deduplication of xattr values stored in inodes.
 * They use hash of data as a key and provide a value that may represent a
 * block or inode number. That's why keys need not be unique (hash of different
 * data may be the same). However user provided value always uniquely
 * identifies a cache entry.
 *
 * We provide functions for creation and removal of entries, search by key,
 * and a special "delete entry with given key-value pair" operation. Fixed
 * size hash table is used for fast key lookups.
 */

/* Per-cache state: the hash table, the list of all entries and shrinker hooks. */
struct mb_cache {
        /* Hash table of entries */
        struct hlist_bl_head        *c_hash;
        /* log2 of hash table size */
        int                        c_bucket_bits;
        /* Maximum entries in cache to avoid degrading hash too much */
        unsigned long                c_max_entries;
        /* Protects c_list, c_entry_count */
        spinlock_t                c_list_lock;
        /*
         * List of cache entries. New entries are appended at the tail and
         * mb_cache_shrink() examines entries starting from the head.
         */
        struct list_head        c_list;
        /* Number of entries in cache */
        unsigned long                c_entry_count;
        /* Shrinker; its callbacks find this cache through shrink->private_data */
        struct shrinker                *c_shrink;
        /* Work for shrinking when the cache has too many entries */
        struct work_struct        c_shrink_work;
};

static struct kmem_cache *mb_entry_cache;

static unsigned long mb_cache_shrink(struct mb_cache *cache,
                                     unsigned long nr_to_scan);

/* Return the hash chain head in @cache that @key hashes to. */
static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
                                                        u32 key)
{
        const u32 bucket = hash_32(key, cache->c_bucket_bits);

        return &cache->c_hash[bucket];
}

/*
 * Number of entries to reclaim synchronously when there are too many entries
 * in cache
 */
#define SYNC_SHRINK_BATCH 64

/*
 * mb_cache_entry_create - create entry in cache
 * @cache - cache where the entry should be created
 * @mask - gfp mask with which the entry should be allocated
 * @key - key of the entry
 * @value - value of the entry
 * @reusable - is the entry reusable by others?
 *
 * Inserts a new entry with key @key and value @value into @cache.
 *
 * Returns 0 on success, -ENOMEM if the entry cannot be allocated and -EBUSY
 * if an entry with the same key and value is already present in the cache.
 */
int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
                          u64 value, bool reusable)
{
        struct mb_cache_entry *new_entry, *cur;
        struct hlist_bl_node *pos;
        struct hlist_bl_head *bucket;

        /* Too many entries: kick off background reclaim. */
        if (cache->c_entry_count >= cache->c_max_entries)
                schedule_work(&cache->c_shrink_work);
        /* Background reclaim is not keeping up: reclaim a batch right here. */
        if (cache->c_entry_count >= 2*cache->c_max_entries)
                mb_cache_shrink(cache, SYNC_SHRINK_BATCH);

        new_entry = kmem_cache_alloc(mb_entry_cache, mask);
        if (!new_entry)
                return -ENOMEM;

        INIT_LIST_HEAD(&new_entry->e_list);
        /*
         * The entry starts out with two references: one belongs to the hash
         * table, the other one protects the entry against
         * mb_cache_entry_delete_or_get() until setup is complete. That way
         * cache->c_list_lock never has to nest inside the hash table bit
         * locks, which is problematic for RT.
         */
        atomic_set(&new_entry->e_refcnt, 2);
        new_entry->e_key = key;
        new_entry->e_value = value;
        new_entry->e_flags = 0;
        if (reusable)
                set_bit(MBE_REUSABLE_B, &new_entry->e_flags);

        bucket = mb_cache_entry_head(cache, key);
        hlist_bl_lock(bucket);
        hlist_bl_for_each_entry(cur, pos, bucket, e_hash_list) {
                if (cur->e_key != key || cur->e_value != value)
                        continue;
                /* Same key-value pair already cached: discard the new entry. */
                hlist_bl_unlock(bucket);
                kmem_cache_free(mb_entry_cache, new_entry);
                return -EBUSY;
        }
        hlist_bl_add_head(&new_entry->e_hash_list, bucket);
        hlist_bl_unlock(bucket);

        spin_lock(&cache->c_list_lock);
        list_add_tail(&new_entry->e_list, &cache->c_list);
        cache->c_entry_count++;
        spin_unlock(&cache->c_list_lock);

        /* Setup is finished, drop the extra reference taken above. */
        mb_cache_entry_put(cache, new_entry);

        return 0;
}
EXPORT_SYMBOL(mb_cache_entry_create);

/*
 * Unhash @entry from its hash chain (under the chain's lock) and return it to
 * the entry slab cache.
 */
void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry)
{
        struct hlist_bl_head *bucket = mb_cache_entry_head(cache, entry->e_key);

        hlist_bl_lock(bucket);
        hlist_bl_del(&entry->e_hash_list);
        hlist_bl_unlock(bucket);

        kmem_cache_free(mb_entry_cache, entry);
}
EXPORT_SYMBOL(__mb_cache_entry_free);

/*
 * mb_cache_entry_wait_unused - wait to be the last user of the entry
 *
 * @entry - entry to work on
 *
 * Wait to be the last user of the entry.
 */
void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
{
        wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2);
}
EXPORT_SYMBOL(mb_cache_entry_wait_unused);

/*
 * Find a reusable entry with key @key in @cache and grab a reference to it.
 *
 * When @entry is non-NULL and still hashed, the walk resumes right after it;
 * otherwise it starts at the head of the hash chain. The reference on @entry
 * (if any) is dropped before returning.
 *
 * Returns the entry found, or NULL when the chain holds no further match.
 */
static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
                                           struct mb_cache_entry *entry,
                                           u32 key)
{
        struct mb_cache_entry *found = NULL;
        struct hlist_bl_head *bucket = mb_cache_entry_head(cache, key);
        struct hlist_bl_node *pos;

        hlist_bl_lock(bucket);
        if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
                pos = entry->e_hash_list.next;
        else
                pos = hlist_bl_first(bucket);

        for (; pos; pos = pos->next) {
                struct mb_cache_entry *cur;

                cur = hlist_bl_entry(pos, struct mb_cache_entry, e_hash_list);
                if (cur->e_key != key)
                        continue;
                if (!test_bit(MBE_REUSABLE_B, &cur->e_flags))
                        continue;
                /* Skip entries whose reference count already hit zero. */
                if (atomic_inc_not_zero(&cur->e_refcnt)) {
                        found = cur;
                        break;
                }
        }
        hlist_bl_unlock(bucket);

        if (entry)
                mb_cache_entry_put(cache, entry);

        return found;
}

/*
 * mb_cache_entry_find_first - find the first reusable entry with the given key
 * @cache: cache where we should search
 * @key: key to look for
 *
 * Looks up @key in @cache starting at the head of its hash chain. A reference
 * is taken on the first reusable entry found and that entry is returned; NULL
 * is returned when there is none.
 */
struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
                                                 u32 key)
{
        struct mb_cache_entry *first = __entry_find(cache, NULL, key);

        return first;
}
EXPORT_SYMBOL(mb_cache_entry_find_first);

/*
 * mb_cache_entry_find_next - find next reusable entry with the same key
 * @cache: cache where we should search
 * @entry: entry to start search from
 *
 * Finds next reusable entry in the hash chain which has the same key as @entry.
 * If @entry is unhashed (which can happen when deletion of entry races with the
 * search), finds the first reusable entry in the hash chain. The function drops
 * reference to @entry and returns with a reference to the found entry.
 */
struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
                                                struct mb_cache_entry *entry)
{
        return __entry_find(cache, entry, entry->e_key);
}
EXPORT_SYMBOL(mb_cache_entry_find_next);

/*
 * mb_cache_entry_get - get a cache entry by value (and key)
 * @cache - cache we work with
 * @key - key
 * @value - value
 *
 * Returns the entry matching both @key and @value with an extra reference
 * held, or NULL when no such entry with a non-zero reference count exists.
 */
struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
                                          u64 value)
{
        struct hlist_bl_head *bucket = mb_cache_entry_head(cache, key);
        struct mb_cache_entry *found = NULL;
        struct mb_cache_entry *cur;
        struct hlist_bl_node *pos;

        hlist_bl_lock(bucket);
        hlist_bl_for_each_entry(cur, pos, bucket, e_hash_list) {
                if (cur->e_key == key && cur->e_value == value &&
                    atomic_inc_not_zero(&cur->e_refcnt)) {
                        found = cur;
                        break;
                }
        }
        hlist_bl_unlock(bucket);

        return found;
}
EXPORT_SYMBOL(mb_cache_entry_get);

/* mb_cache_entry_delete_or_get - remove a cache entry if it has no users
 * @cache - cache we work with
 * @key - key
 * @value - value
 *
 * Tries to remove the entry with key @key and value @value from @cache. The
 * entry is removed only when nobody else is using it. NULL is returned when
 * the entry was removed or when the cache holds no such entry. If the entry
 * still has users it is left in place and returned with a reference held.
 */
struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
                                                    u32 key, u64 value)
{
        struct mb_cache_entry *entry = mb_cache_entry_get(cache, key, value);

        if (!entry)
                return NULL;

        /*
         * If ours and the initial hash reference are the only two left, drop
         * both at once; otherwise somebody else uses the entry, so hand it
         * back to the caller.
         */
        if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2)
                return entry;

        spin_lock(&cache->c_list_lock);
        if (!list_empty(&entry->e_list))
                list_del_init(&entry->e_list);
        cache->c_entry_count--;
        spin_unlock(&cache->c_list_lock);

        __mb_cache_entry_free(cache, entry);

        return NULL;
}
EXPORT_SYMBOL(mb_cache_entry_delete_or_get);

/* mb_cache_entry_touch - cache entry got used
 * @cache - cache the entry belongs to (not used by this function)
 * @entry - entry that got used
 *
 * Marks entry as used to give it higher chances of surviving in cache: the
 * MBE_REFERENCED_B flag set here makes mb_cache_shrink() skip the entry once.
 */
void mb_cache_entry_touch(struct mb_cache *cache,
                          struct mb_cache_entry *entry)
{
        set_bit(MBE_REFERENCED_B, &entry->e_flags);
}
EXPORT_SYMBOL(mb_cache_entry_touch);

/*
 * Shrinker count callback: report the number of entries in the cache stored
 * in shrink->private_data. @sc is not used.
 */
static unsigned long mb_cache_count(struct shrinker *shrink,
                                    struct shrink_control *sc)
{
        return ((struct mb_cache *)shrink->private_data)->c_entry_count;
}

/*
 * Shrink number of entries in cache.
 *
 * Examines up to @nr_to_scan entries from the head of cache->c_list. An entry
 * that has MBE_REFERENCED_B set, or whose reference count is not exactly 1,
 * gets the flag cleared and is moved to the tail of the list. Any other entry
 * is removed and freed.
 *
 * Returns the number of entries freed.
 */
static unsigned long mb_cache_shrink(struct mb_cache *cache,
                                     unsigned long nr_to_scan)
{
        struct list_head *entries = &cache->c_list;
        unsigned long freed = 0;

        spin_lock(&cache->c_list_lock);
        for (; nr_to_scan && !list_empty(entries); nr_to_scan--) {
                struct mb_cache_entry *victim;

                victim = list_first_entry(entries, struct mb_cache_entry,
                                          e_list);
                /* Drop initial hash reference if there is no user */
                if (test_bit(MBE_REFERENCED_B, &victim->e_flags) ||
                    atomic_cmpxchg(&victim->e_refcnt, 1, 0) != 1) {
                        clear_bit(MBE_REFERENCED_B, &victim->e_flags);
                        list_move_tail(&victim->e_list, entries);
                        continue;
                }

                list_del_init(&victim->e_list);
                cache->c_entry_count--;
                /* Free the entry and reschedule without holding the list lock. */
                spin_unlock(&cache->c_list_lock);
                __mb_cache_entry_free(cache, victim);
                freed++;
                cond_resched();
                spin_lock(&cache->c_list_lock);
        }
        spin_unlock(&cache->c_list_lock);

        return freed;
}

/* Shrinker scan callback: try to free up to sc->nr_to_scan entries */
static unsigned long mb_cache_scan(struct shrinker *shrink,
                                   struct shrink_control *sc)
{
        return mb_cache_shrink(shrink->private_data, sc->nr_to_scan);
}

/* We shrink 1/X of the cache when we have too many entries in it */
#define SHRINK_DIVISOR 16

/* Work item: scan c_max_entries / SHRINK_DIVISOR entries for reclaim */
static void mb_cache_shrink_worker(struct work_struct *work)
{
        struct mb_cache *cache;
        unsigned long quota;

        cache = container_of(work, struct mb_cache, c_shrink_work);
        quota = cache->c_max_entries / SHRINK_DIVISOR;
        mb_cache_shrink(cache, quota);
}

/*
 * mb_cache_create - create cache
 * @bucket_bits: log2 of the hash table size
 *
 * Allocate a cache whose hash table has 2^bucket_bits buckets, set its
 * entry limit to 16 entries per bucket and register a shrinker for it.
 * Returns the new cache, or NULL if any allocation fails.
 */
struct mb_cache *mb_cache_create(int bucket_bits)
{
        unsigned long nr_buckets = 1UL << bucket_bits;
        struct mb_cache *cache;
        unsigned long b;

        cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
        if (!cache)
                return NULL;

        cache->c_bucket_bits = bucket_bits;
        cache->c_max_entries = nr_buckets << 4;
        INIT_LIST_HEAD(&cache->c_list);
        spin_lock_init(&cache->c_list_lock);

        cache->c_hash = kmalloc_array(nr_buckets,
                                      sizeof(struct hlist_bl_head),
                                      GFP_KERNEL);
        if (!cache->c_hash)
                goto free_cache;

        for (b = 0; b < nr_buckets; b++)
                INIT_HLIST_BL_HEAD(&cache->c_hash[b]);

        cache->c_shrink = shrinker_alloc(0, "mbcache-shrinker");
        if (!cache->c_shrink)
                goto free_hash;

        cache->c_shrink->count_objects = mb_cache_count;
        cache->c_shrink->scan_objects = mb_cache_scan;
        cache->c_shrink->private_data = cache;

        shrinker_register(cache->c_shrink);

        INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);

        return cache;

        /* Unwind in reverse order of allocation */
free_hash:
        kfree(cache->c_hash);
free_cache:
        kfree(cache);
        return NULL;
}
EXPORT_SYMBOL(mb_cache_create);

/*
 * mb_cache_destroy - destroy cache
 * @cache: the cache to destroy
 *
 * Free all entries in cache and cache itself. Caller must make sure nobody
 * (except shrinker) can reach @cache when calling this.
 *
 * NOTE(review): c_shrink_work is neither cancelled nor flushed in this
 * function - confirm that callers guarantee no shrink work is pending.
 */
void mb_cache_destroy(struct mb_cache *cache)
{
        struct mb_cache_entry *entry, *next;

        /* Release the shrinker first, before the entries are torn down. */
        shrinker_free(cache->c_shrink);

        /*
         * We don't bother with any locking. Cache must not be used at this
         * point.
         */
        list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
                list_del(&entry->e_list);
                /* Every remaining entry is expected to hold a single ref. */
                WARN_ON(atomic_read(&entry->e_refcnt) != 1);
                mb_cache_entry_put(cache, entry);
        }
        kfree(cache->c_hash);
        kfree(cache);
}
EXPORT_SYMBOL(mb_cache_destroy);

/* Module init: create the slab cache that backs struct mb_cache_entry */
static int __init mbcache_init(void)
{
        mb_entry_cache = KMEM_CACHE(mb_cache_entry, SLAB_RECLAIM_ACCOUNT);

        return mb_entry_cache ? 0 : -ENOMEM;
}

/* Module exit: destroy the slab cache created by mbcache_init() */
static void __exit mbcache_exit(void)
{
        kmem_cache_destroy(mb_entry_cache);
}

/* Hook the init/exit routines into module load and unload */
module_init(mbcache_init)
module_exit(mbcache_exit)

/* Module metadata */
MODULE_AUTHOR("Jan Kara <jack@suse.cz>");
MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
MODULE_LICENSE("GPL");




































































































    4 






    1 


    3 




    2 









    3 


    3 
























    3 









    3 







    3 
























































    4 







    4 


    3 





    1 




    3 






    3 




















    3 

















    1 





    3 


    3 



























































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
/*
 * Ext4 orphan inode handling
 */
#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>

#include "ext4.h"
#include "ext4_jbd2.h"

/*
 * ext4_orphan_file_add - record an inode in the orphan file
 * @handle: journal handle used to modify the orphan file block
 * @inode: inode to record
 *
 * Reserves a free entry in one of the orphan file blocks, stores the inode
 * number in an empty slot of that block and remembers the slot position in
 * the inode's i_orphan_idx.
 *
 * Returns 0 on success, -ENOSPC when no entry could be claimed (no block
 * reports a free entry, or no empty slot was found after several passes
 * over the chosen block), or the error returned by the journal helpers.
 */
static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
{
        int i, j, start;
        struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
        int ret;
        bool found = false;
        __le32 *bdata;
        int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb);
        int looped = 0;

        /*
         * Find block with free orphan entry. Use CPU number for a naive hash
         * for a search start in the orphan file
         */
        start = raw_smp_processor_id()*13 % oi->of_blocks;
        i = start;
        do {
                /* Reserve one entry in the block's free counter */
                if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries)
                    >= 0) {
                        found = true;
                        break;
                }
                if (++i >= oi->of_blocks)
                        i = 0;
        } while (i != start);

        if (!found) {
                /*
                 * For now we don't grow or shrink orphan file. We just use
                 * whatever was allocated at mke2fs time. The additional
                 * credits we would have to reserve for each orphan inode
                 * operation just don't seem worth it.
                 */
                return -ENOSPC;
        }

        ret = ext4_journal_get_write_access(handle, inode->i_sb,
                                oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE);
        if (ret) {
                /* Give the reserved entry back */
                atomic_inc(&oi->of_binfo[i].ob_free_entries);
                return ret;
        }

        bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
        /* Find empty slot in a block */
        j = 0;
        for (;;) {
                /*
                 * Did we walk through the block several times without
                 * finding free entry? It is theoretically possible
                 * if entries get constantly allocated and freed or
                 * if the block is corrupted. Avoid indefinite looping
                 * and bail. We'll use orphan list instead.
                 *
                 * The limit is checked on every step of the scan, so a
                 * block that contains no empty slot at all cannot keep us
                 * spinning here.
                 */
                if (looped > 3) {
                        atomic_inc(&oi->of_binfo[i].ob_free_entries);
                        return -ENOSPC;
                }
                /* Claim the slot atomically; somebody may race with us */
                if (!bdata[j] &&
                    cmpxchg(&bdata[j], (__le32)0,
                            cpu_to_le32(inode->i_ino)) == (__le32)0)
                        break;
                if (++j >= inodes_per_ob) {
                        j = 0;
                        looped++;
                        cond_resched();
                }
        }

        EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
        ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);

        return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh);
}

/*
 * ext4_orphan_add() links an unlinked or truncated inode into a list of
 * such inodes, starting at the superblock, in case we crash before the
 * file is closed/deleted, or in case the inode truncate spans multiple
 * transactions and the last transaction is not recovered after a crash.
 *
 * At filesystem recovery time, we walk this list deleting unlinked
 * inodes and truncating linked inodes in ext4_orphan_cleanup().
 *
 * Orphan list manipulation functions must be called under i_rwsem unless
 * we are just creating the inode or deleting it.
 *
 * Returns 0 on success (including when there is no journal, the inode is
 * bad, or the inode is already recorded as an orphan) and a negative error
 * otherwise.
 */
int ext4_orphan_add(handle_t *handle, struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_iloc iloc;
        int err = 0, rc;
        bool dirty = false;

        if (!sbi->s_journal || is_bad_inode(inode))
                return 0;

        WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
                     !inode_is_locked(inode));
        /*
         * Inode orphaned in orphan file or in orphan list?
         */
        if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) ||
            !list_empty(&EXT4_I(inode)->i_orphan))
                return 0;

        /*
         * Orphan handling is only valid for files with data blocks
         * being truncated, or files being unlinked. Note that we either
         * hold i_rwsem, or the inode can not be referenced from outside,
         * so i_nlink should not be bumped due to race
         */
        ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);

        if (sbi->s_orphan_info.of_blocks) {
                err = ext4_orphan_file_add(handle, inode);
                /*
                 * Fall back to the normal orphan list if the orphan file
                 * is out of space
                 */
                if (err != -ENOSPC)
                        return err;
        }

        BUFFER_TRACE(sbi->s_sbh, "get_write_access");
        err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
                                            EXT4_JTR_NONE);
        if (err)
                goto out;

        err = ext4_reserve_inode_write(handle, inode, &iloc);
        if (err)
                goto out;

        mutex_lock(&sbi->s_orphan_lock);
        /*
         * Due to previous errors inode may be already a part of on-disk
         * orphan list. If so skip on-disk list modification.
         */
        if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
            (le32_to_cpu(sbi->s_es->s_inodes_count))) {
                /* Insert this inode at the head of the on-disk orphan list */
                NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
                lock_buffer(sbi->s_sbh);
                sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
                ext4_superblock_csum_set(sb);
                unlock_buffer(sbi->s_sbh);
                dirty = true;
        }
        /* The in-memory list is updated in either case */
        list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
        mutex_unlock(&sbi->s_orphan_lock);

        if (dirty) {
                /* Both buffers are marked dirty; the first error wins */
                err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
                rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
                if (!err)
                        err = rc;
                if (err) {
                        /*
                         * We have to remove inode from in-memory list if
                         * addition to on disk orphan list failed. Stray orphan
                         * list entries can cause panics at unmount time.
                         */
                        mutex_lock(&sbi->s_orphan_lock);
                        list_del_init(&EXT4_I(inode)->i_orphan);
                        mutex_unlock(&sbi->s_orphan_lock);
                }
        } else
                /* Nothing was modified on disk; just drop the inode buffer */
                brelse(iloc.bh);

        ext4_debug("superblock will point to %lu\n", inode->i_ino);
        ext4_debug("orphan inode %lu will point to %d\n",
                        inode->i_ino, NEXT_ORPHAN(inode));
out:
        ext4_std_error(sb, err);
        return err;
}

/*
 * Remove @inode's entry from the orphan file. With a NULL @handle (or an
 * out-of-range slot index) the on-disk entry is left alone and only the
 * in-memory orphan state of the inode is reset. Returns 0 or the error
 * reported by the journal helpers.
 */
static int ext4_orphan_file_del(handle_t *handle, struct inode *inode)
{
        struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info;
        int per_block = ext4_inodes_per_orphan_block(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct buffer_head *bh;
        __le32 *slots;
        int blk, off;
        int ret = 0;

        if (!handle)
                goto forget;

        /* Split the saved index into block number and slot within it */
        blk = ei->i_orphan_idx / per_block;
        off = ei->i_orphan_idx % per_block;
        if (WARN_ON_ONCE(blk >= oi->of_blocks))
                goto forget;

        bh = oi->of_binfo[blk].ob_bh;
        ret = ext4_journal_get_write_access(handle, inode->i_sb, bh,
                                            EXT4_JTR_ORPHAN_FILE);
        if (ret)
                goto forget;

        /* Clear the slot first, then advertise it as free again */
        slots = (__le32 *)(bh->b_data);
        slots[off] = 0;
        atomic_inc(&oi->of_binfo[blk].ob_free_entries);
        ret = ext4_handle_dirty_metadata(handle, NULL, bh);

forget:
        ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
        INIT_LIST_HEAD(&ei->i_orphan);

        return ret;
}

/*
 * ext4_orphan_del() removes an unlinked or truncated inode from the list
 * of such inodes stored on disk, because it is finally being cleaned up.
 *
 * @handle may be NULL; in that case only the in-memory state is updated
 * and the on-disk orphan list is left untouched. Returns 0 on success or
 * a negative error.
 */
int ext4_orphan_del(handle_t *handle, struct inode *inode)
{
        struct list_head *prev;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        __u32 ino_next;
        struct ext4_iloc iloc;
        int err = 0;

        if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
                return 0;

        WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
                     !inode_is_locked(inode));
        /* Inodes tracked in the orphan file are handled separately */
        if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE))
                return ext4_orphan_file_del(handle, inode);

        /* Do this quick check before taking global s_orphan_lock. */
        if (list_empty(&ei->i_orphan))
                return 0;

        if (handle) {
                /* Grab inode buffer early before taking global s_orphan_lock */
                err = ext4_reserve_inode_write(handle, inode, &iloc);
        }

        mutex_lock(&sbi->s_orphan_lock);
        ext4_debug("remove inode %lu from orphan list\n", inode->i_ino);

        /* Remember the predecessor before unlinking from the list */
        prev = ei->i_orphan.prev;
        list_del_init(&ei->i_orphan);

        /* If we're on an error path, we may not have a valid
         * transaction handle with which to update the orphan list on
         * disk, but we still need to remove the inode from the linked
         * list in memory. */
        if (!handle || err) {
                mutex_unlock(&sbi->s_orphan_lock);
                goto out_err;
        }

        ino_next = NEXT_ORPHAN(inode);
        if (prev == &sbi->s_orphan) {
                /* Inode was first in the list: update the superblock */
                ext4_debug("superblock will point to %u\n", ino_next);
                BUFFER_TRACE(sbi->s_sbh, "get_write_access");
                err = ext4_journal_get_write_access(handle, inode->i_sb,
                                                    sbi->s_sbh, EXT4_JTR_NONE);
                if (err) {
                        mutex_unlock(&sbi->s_orphan_lock);
                        goto out_brelse;
                }
                lock_buffer(sbi->s_sbh);
                sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
                ext4_superblock_csum_set(inode->i_sb);
                unlock_buffer(sbi->s_sbh);
                mutex_unlock(&sbi->s_orphan_lock);
                err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
        } else {
                /* Otherwise make the previous inode point past this one */
                struct ext4_iloc iloc2;
                struct inode *i_prev =
                        &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;

                ext4_debug("orphan inode %lu will point to %u\n",
                          i_prev->i_ino, ino_next);
                err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
                if (err) {
                        mutex_unlock(&sbi->s_orphan_lock);
                        goto out_brelse;
                }
                NEXT_ORPHAN(i_prev) = ino_next;
                err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
                mutex_unlock(&sbi->s_orphan_lock);
        }
        if (err)
                goto out_brelse;
        NEXT_ORPHAN(inode) = 0;
        err = ext4_mark_iloc_dirty(handle, inode, &iloc);
out_err:
        ext4_std_error(inode->i_sb, err);
        return err;

out_brelse:
        /* Release the inode buffer obtained by ext4_reserve_inode_write() */
        brelse(iloc.bh);
        goto out_err;
}

#ifdef CONFIG_QUOTA
/*
 * Turn on quota of the given @type using the quota file name and format
 * stored in the ext4 superblock info. The name is read under s_umount.
 */
static int ext4_quota_on_mount(struct super_block *sb, int type)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        char *qf_name;

        qf_name = rcu_dereference_protected(sbi->s_qf_names[type],
                                            lockdep_is_held(&sb->s_umount));

        return dquot_quota_on_mount(sb, qf_name, sbi->s_jquota_fmt, type);
}
#endif

/*
 * Handle one orphan inode during cleanup. An inode that still has links is
 * truncated to its recorded size and counted in @nr_truncates; an inode
 * without links is counted in @nr_orphans. In both cases the reference on
 * @inode is dropped at the end.
 */
static void ext4_process_orphan(struct inode *inode,
                                int *nr_truncates, int *nr_orphans)
{
        struct super_block *sb = inode->i_sb;

        dquot_initialize(inode);
        if (!inode->i_nlink) {
                if (test_opt(sb, DEBUG))
                        ext4_msg(sb, KERN_DEBUG,
                                "%s: deleting unreferenced inode %lu",
                                __func__, inode->i_ino);
                ext4_debug("deleting unreferenced inode %lu\n",
                           inode->i_ino);
                (*nr_orphans)++;
        } else {
                int ret;

                if (test_opt(sb, DEBUG))
                        ext4_msg(sb, KERN_DEBUG,
                                "%s: truncating inode %lu to %lld bytes",
                                __func__, inode->i_ino, inode->i_size);
                ext4_debug("truncating inode %lu to %lld bytes\n",
                           inode->i_ino, inode->i_size);
                inode_lock(inode);
                truncate_inode_pages(inode->i_mapping, inode->i_size);
                ret = ext4_truncate(inode);
                if (ret) {
                        /*
                         * We need to clean up the in-core orphan list
                         * manually if ext4_truncate() failed to get a
                         * transaction handle.
                         */
                        ext4_orphan_del(NULL, inode);
                        ext4_std_error(inode->i_sb, ret);
                }
                inode_unlock(inode);
                (*nr_truncates)++;
        }
        iput(inode);  /* The delete magic happens here! */
}

/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
 * the superblock) which were deleted from all directories, but held open by
 * a process at the time of a crash.  We walk the list and try to delete these
 * inodes at recovery time (only with a read-write filesystem).
 *
 * In order to keep the orphan inode chain consistent during traversal (in
 * case of crash during recovery), we link each inode into the superblock
 * orphan list_head and handle it the same way as an inode deletion during
 * normal operation (which journals the operations for us).
 *
 * We only do an iget() and an iput() on each inode, which is very safe if we
 * accidentally point at an in-use or already deleted inode.  The worst that
 * can happen in this case is that we get a "bit already cleared" message from
 * ext4_free_inode().  The only reason we would point at a wrong inode is if
 * e2fsck was run on this filesystem, and it must have already done the orphan
 * inode cleanup for us, so we can safely abort without any further action.
 *
 * After the on-disk list, every non-zero entry found in the orphan file
 * blocks is processed the same way.
 */
void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
{
        unsigned int s_flags = sb->s_flags;
        int nr_orphans = 0, nr_truncates = 0;
        struct inode *inode;
        int i, j;
#ifdef CONFIG_QUOTA
        int quota_update = 0;
#endif
        __le32 *bdata;
        struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
        int inodes_per_ob = ext4_inodes_per_orphan_block(sb);

        /* Neither an orphan list nor an orphan file: nothing to do */
        if (!es->s_last_orphan && !oi->of_blocks) {
                ext4_debug("no orphan inodes to clean up\n");
                return;
        }

        if (bdev_read_only(sb->s_bdev)) {
                ext4_msg(sb, KERN_ERR, "write access "
                        "unavailable, skipping orphan cleanup");
                return;
        }

        /* Check if feature set would not allow a r/w mount */
        if (!ext4_feature_set_ok(sb, 0)) {
                ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
                         "unknown ROCOMPAT features");
                return;
        }

        if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
                /* don't clear list on RO mount w/ errors */
                if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
                        ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
                                  "clearing orphan list.");
                        es->s_last_orphan = 0;
                }
                ext4_debug("Skipping orphan recovery on fs with errors.\n");
                return;
        }

        /*
         * Temporarily clear SB_RDONLY for the cleanup; the saved s_flags
         * value is written back at the end of this function.
         */
        if (s_flags & SB_RDONLY) {
                ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
                sb->s_flags &= ~SB_RDONLY;
        }
#ifdef CONFIG_QUOTA
        /*
         * Turn on quotas which were not enabled for read-only mounts if
         * filesystem has quota feature, so that they are updated correctly.
         */
        if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
                int ret = ext4_enable_quotas(sb);

                if (!ret)
                        quota_update = 1;
                else
                        ext4_msg(sb, KERN_ERR,
                                "Cannot turn on quotas: error %d", ret);
        }

        /* Turn on journaled quotas used for old style */
        for (i = 0; i < EXT4_MAXQUOTAS; i++) {
                if (EXT4_SB(sb)->s_qf_names[i]) {
                        int ret = ext4_quota_on_mount(sb, i);

                        if (!ret)
                                quota_update = 1;
                        else
                                ext4_msg(sb, KERN_ERR,
                                        "Cannot turn on journaled "
                                        "quota: type %d: error %d", i, ret);
                }
        }
#endif

        /* First pass: the orphan list that starts at the superblock */
        while (es->s_last_orphan) {
                /*
                 * We may have encountered an error during cleanup; if
                 * so, skip the rest.
                 */
                if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
                        ext4_debug("Skipping orphan recovery on fs with errors.\n");
                        es->s_last_orphan = 0;
                        break;
                }

                inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
                if (IS_ERR(inode)) {
                        es->s_last_orphan = 0;
                        break;
                }

                list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
                ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
        }

        /* Second pass: every used slot of every orphan file block */
        for (i = 0; i < oi->of_blocks; i++) {
                bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
                for (j = 0; j < inodes_per_ob; j++) {
                        if (!bdata[j])
                                continue;
                        inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j]));
                        if (IS_ERR(inode))
                                continue;
                        /* Record where the entry lives in the orphan file */
                        ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE);
                        EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j;
                        ext4_process_orphan(inode, &nr_truncates, &nr_orphans);
                }
        }

/* Expands to two arguments: the count and its plural suffix */
#define PLURAL(x) (x), ((x) == 1) ? "" : "s"

        if (nr_orphans)
                ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
                       PLURAL(nr_orphans));
        if (nr_truncates)
                ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
                       PLURAL(nr_truncates));
#ifdef CONFIG_QUOTA
        /* Turn off quotas if they were enabled for orphan cleanup */
        if (quota_update) {
                for (i = 0; i < EXT4_MAXQUOTAS; i++) {
                        if (sb_dqopt(sb)->files[i])
                                dquot_quota_off(sb, i);
                }
        }
#endif
        sb->s_flags = s_flags; /* Restore SB_RDONLY status */
}

/*
 * Drop the buffer heads of all orphan file blocks and free the array that
 * describes them. Does nothing when no orphan file blocks are recorded.
 */
void ext4_release_orphan_info(struct super_block *sb)
{
        struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
        int blk = 0;

        if (oi->of_blocks == 0)
                return;

        while (blk < oi->of_blocks) {
                brelse(oi->of_binfo[blk].ob_bh);
                blk++;
        }
        kfree(oi->of_binfo);
}

/*
 * Return the tail structure of an orphan file block, i.e. the last
 * sizeof(struct ext4_orphan_block_tail) bytes of the block held in @bh.
 */
static struct ext4_orphan_block_tail *ext4_orphan_block_tail(
                                                struct super_block *sb,
                                                struct buffer_head *bh)
{
        char *block_end = bh->b_data + sb->s_blocksize;

        return (struct ext4_orphan_block_tail *)
                (block_end - sizeof(struct ext4_orphan_block_tail));
}

static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
                                              struct buffer_head *bh)
{
        __u32 calculated;
        int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
        struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
        struct ext4_orphan_block_tail *ot;
        __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);

        if (!ext4_has_metadata_csum(sb))
                return 1;

        ot = ext4_orphan_block_tail(sb, bh);
        calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
                                 (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
        calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data,
                                 inodes_per_ob * sizeof(__u32));
        return le32_to_cpu(ot->ob_checksum) == calculated;
}

/*
 * This gets called only when checksumming is enabled.
 *
 * Recompute the orphan block checksum over the block number and the inode
 * slots found in @data, and store it in the tail of @bh.
 */
void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers,
                                    struct buffer_head *bh,
                                    void *data, size_t size)
{
        struct super_block *sb = EXT4_TRIGGER(triggers)->sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int per_block = ext4_inodes_per_orphan_block(sb);
        __le64 blocknr_le = cpu_to_le64(bh->b_blocknr);
        struct ext4_orphan_block_tail *tail;
        __u32 crc;

        crc = ext4_chksum(sbi, sbi->s_orphan_info.of_csum_seed,
                          (__u8 *)&blocknr_le, sizeof(blocknr_le));
        crc = ext4_chksum(sbi, crc, (__u8 *)data,
                          per_block * sizeof(__u32));

        tail = ext4_orphan_block_tail(sb, bh);
        tail->ob_checksum = cpu_to_le32(crc);
}

/*
 * ext4_init_orphan_info - load the orphan file into memory
 * @sb: superblock of the filesystem
 *
 * Reads every block of the orphan file, checks its magic and checksum and
 * counts the free entries in it. On success the buffer heads stay pinned
 * in s_orphan_info.
 *
 * Returns 0 on success (or when the filesystem has no orphan file feature)
 * and a negative error otherwise. On failure everything acquired here is
 * released and s_orphan_info is reset to describe no blocks, so that no
 * stale pointer or block count is left behind.
 */
int ext4_init_orphan_info(struct super_block *sb)
{
        struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info;
        struct inode *inode;
        int i, j;
        int ret;
        int free;
        __le32 *bdata;
        int inodes_per_ob = ext4_inodes_per_orphan_block(sb);
        struct ext4_orphan_block_tail *ot;
        ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum);

        if (!ext4_has_feature_orphan_file(sb))
                return 0;

        inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL);
        if (IS_ERR(inode)) {
                ext4_msg(sb, KERN_ERR, "get orphan inode failed");
                return PTR_ERR(inode);
        }
        oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
        oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
        /* kmalloc_array() fails cleanly if count * size would overflow */
        oi->of_binfo = kmalloc_array(oi->of_blocks,
                                     sizeof(struct ext4_orphan_block),
                                     GFP_KERNEL);
        if (!oi->of_binfo) {
                ret = -ENOMEM;
                goto out_put;
        }
        for (i = 0; i < oi->of_blocks; i++) {
                oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0);
                if (IS_ERR(oi->of_binfo[i].ob_bh)) {
                        ret = PTR_ERR(oi->of_binfo[i].ob_bh);
                        goto out_free;
                }
                if (!oi->of_binfo[i].ob_bh) {
                        ret = -EIO;
                        goto out_free;
                }
                ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh);
                if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) {
                        ext4_error(sb, "orphan file block %d: bad magic", i);
                        ret = -EIO;
                        goto out_free;
                }
                if (!ext4_orphan_file_block_csum_verify(sb,
                                                oi->of_binfo[i].ob_bh)) {
                        ext4_error(sb, "orphan file block %d: bad checksum", i);
                        ret = -EIO;
                        goto out_free;
                }
                /* Count the empty slots to seed the block's free counter */
                bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data);
                free = 0;
                for (j = 0; j < inodes_per_ob; j++)
                        if (bdata[j] == 0)
                                free++;
                atomic_set(&oi->of_binfo[i].ob_free_entries, free);
        }
        iput(inode);
        return 0;
out_free:
        /* Release only the blocks before the one that failed */
        for (i--; i >= 0; i--)
                brelse(oi->of_binfo[i].ob_bh);
        kfree(oi->of_binfo);
        oi->of_binfo = NULL;
out_put:
        /* Leave the orphan info describing "no orphan file blocks" */
        oi->of_blocks = 0;
        iput(inode);
        return ret;
}

/*
 * Report whether the orphan file currently tracks no inodes.
 *
 * Returns 1 when the orphan_file feature is not enabled or when every orphan
 * block has all of its entries free; returns 0 as soon as a block with at
 * least one used entry is found.
 */
int ext4_orphan_file_empty(struct super_block *sb)
{
        struct ext4_orphan_info *info = &EXT4_SB(sb)->s_orphan_info;
        int blk = 0;
        int slots_per_block = ext4_inodes_per_orphan_block(sb);

        if (!ext4_has_feature_orphan_file(sb))
                return 1;

        while (blk < info->of_blocks) {
                /* a block is empty only if its free count equals its capacity */
                if (atomic_read(&info->of_binfo[blk].ob_free_entries) !=
                    slots_per_block)
                        return 0;
                blk++;
        }
        return 1;
}








































































































































































































    1 


    1 


    1 


















































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "alloc_cache.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"

/*
 * Per-request command data for a file-table update request. It is filled in
 * by io_files_update_prep() from the SQE and consumed by io_files_update().
 */
struct io_rsrc_update {
        /* NOTE(review): not referenced in this part of the file - confirm use */
        struct file                        *file;
        /* user address copied from sqe->addr; read as an array of fds */
        u64                                arg;
        /* number of entries, copied from sqe->len; prep rejects 0 */
        u32                                nr_args;
        /* first slot, copied from sqe->off; may be IORING_FILE_INDEX_ALLOC */
        u32                                offset;
};

/*
 * Forward declarations: both helpers are defined further down in this file
 * but are called from code that appears before their definitions.
 */
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
                                  struct io_mapped_ubuf **pimu,
                                  struct page **last_hpage);

/*
 * Upper bounds on the number of entries in a registered table.
 * IORING_MAX_FIXED_FILES is enforced by io_sqe_files_register().
 * NOTE(review): IORING_MAX_REG_BUFFERS is presumably enforced by the buffer
 * registration path, which is outside this part of the file - confirm.
 */
#define IORING_MAX_FIXED_FILES        (1U << 20)
#define IORING_MAX_REG_BUFFERS        (1U << 14)

/*
 * Placeholder buffer descriptor. Its address is compared against buffer-table
 * slots: io_buffer_unmap() never unpins or frees it, and
 * __io_sqe_buffers_update() does not queue a removal for a slot holding it.
 */
static const struct io_mapped_ubuf dummy_ubuf = {
        /* set invalid range, so io_import_fixed() fails meeting it */
        .ubuf = -1UL,
        .ubuf_end = 0,
};

/*
 * Charge @nr_pages against @user's locked_vm counter.
 *
 * The new total must not exceed RLIMIT_MEMLOCK (expressed in pages).
 * Returns 0 on success (including nr_pages == 0) or -ENOMEM when the limit
 * would be exceeded; in that case the counter is left unchanged.
 */
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
        unsigned long limit, seen, wanted;

        if (nr_pages == 0)
                return 0;

        /* never account more pages than may be locked */
        limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        seen = atomic_long_read(&user->locked_vm);

        for (;;) {
                wanted = seen + nr_pages;
                if (wanted > limit)
                        return -ENOMEM;
                /* on failure 'seen' is refreshed and the check is redone */
                if (atomic_long_try_cmpxchg(&user->locked_vm, &seen, wanted))
                        return 0;
        }
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
        if (ctx->user)
                __io_unaccount_mem(ctx->user, nr_pages);

        if (ctx->mm_account)
                atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
        int ret;

        if (ctx->user) {
                ret = __io_account_mem(ctx->user, nr_pages);
                if (ret)
                        return ret;
        }

        if (ctx->mm_account)
                atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

        return 0;
}

/*
 * Copy entry @index of the user iovec array at @arg into @dst.
 *
 * With CONFIG_COMPAT and a compat ring the array is read as compat_iovec
 * entries and converted field by field. Returns 0 or -EFAULT.
 */
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
                       void __user *arg, unsigned index)
{
        struct iovec __user *native;

#ifdef CONFIG_COMPAT
        if (ctx->compat) {
                struct compat_iovec __user *uciov;
                struct compat_iovec tmp;

                uciov = (struct compat_iovec __user *)arg + index;
                if (copy_from_user(&tmp, uciov, sizeof(tmp)) != 0)
                        return -EFAULT;

                dst->iov_base = u64_to_user_ptr((u64)tmp.iov_base);
                dst->iov_len = tmp.iov_len;
                return 0;
        }
#endif
        native = (struct iovec __user *)arg + index;
        if (copy_from_user(dst, native, sizeof(*dst)) != 0)
                return -EFAULT;
        return 0;
}

static int io_buffer_validate(struct iovec *iov)
{
        unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

        /*
         * Don't impose further limits on the size and buffer
         * constraints here, we'll -EINVAL later when IO is
         * submitted if they are wrong.
         */
        if (!iov->iov_base)
                return iov->iov_len ? -EFAULT : 0;
        if (!iov->iov_len)
                return -EFAULT;

        /* arbitrary limit, but we need something */
        if (iov->iov_len > SZ_1G)
                return -EFAULT;

        if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
                return -EOVERFLOW;

        return 0;
}

/*
 * Release the buffer stored in @slot and clear the slot.
 *
 * For a real mapping every page in its bvec array is unpinned, any accounted
 * pages are given back, and the descriptor is freed. The dummy_ubuf
 * placeholder is only cleared from the slot.
 */
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
        struct io_mapped_ubuf *buf = *slot;
        unsigned int n;

        if (buf == &dummy_ubuf)
                goto clear;

        for (n = 0; n < buf->nr_bvecs; n++)
                unpin_user_page(buf->bvec[n].bv_page);
        if (buf->acct_pages != 0)
                io_unaccount_mem(ctx, buf->acct_pages);
        kvfree(buf);
clear:
        *slot = NULL;
}

/*
 * Release the resource carried by a retired rsrc node.
 *
 * If the item has a non-zero tag, an auxiliary CQE carrying that tag is
 * posted first. The resource is then dropped according to the node type;
 * an unknown type triggers a one-time warning.
 */
static void io_rsrc_put_work(struct io_rsrc_node *node)
{
        struct io_rsrc_put *item = &node->item;

        if (item->tag != 0)
                io_post_aux_cqe(node->ctx, item->tag, 0, 0);

        if (node->type == IORING_RSRC_FILE)
                fput(item->file);
        else if (node->type == IORING_RSRC_BUFFER)
                io_rsrc_buf_put(node->ctx, item);
        else
                WARN_ON_ONCE(1);
}

/*
 * Dispose of @node: offer it to the ring's rsrc node cache and free it only
 * if the cache does not take it.
 */
void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
        if (io_alloc_cache_put(&ctx->rsrc_node_cache, node))
                return;
        kfree(node);
}

/*
 * Retire nodes from the front of the ring's rsrc_ref_list.
 *
 * Nodes are processed strictly in list order and processing stops at the
 * first node that still has references. Each retired node has its resource
 * released (unless it is marked empty) and is then destroyed. If the list
 * ends up empty while a quiesce is in progress, the quiesce waiters are
 * woken.
 */
void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
        __must_hold(&node->ctx->uring_lock)
{
        struct io_ring_ctx *ctx = node->ctx;
        struct list_head *pending = &ctx->rsrc_ref_list;

        for (;;) {
                if (list_empty(pending))
                        break;

                node = list_first_entry(pending, struct io_rsrc_node, node);
                /* keep ordering: never skip over a node that is still in use */
                if (node->refs)
                        break;

                list_del(&node->node);
                if (likely(!node->empty))
                        io_rsrc_put_work(node);
                io_rsrc_node_destroy(ctx, node);
        }

        if (list_empty(pending) && unlikely(ctx->rsrc_quiesce))
                wake_up_all(&ctx->rsrc_quiesce_wq);
}

/*
 * Obtain a rsrc node for @ctx, preferring the ring's node cache and falling
 * back to a zeroed allocation.
 *
 * The returned node is bound to @ctx, not marked empty, and holds a single
 * reference. Returns NULL if no node could be obtained.
 */
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
        struct io_rsrc_node *fresh = io_alloc_cache_get(&ctx->rsrc_node_cache);

        if (fresh == NULL)
                fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
        if (fresh == NULL)
                return NULL;

        fresh->ctx = ctx;
        fresh->empty = 0;
        fresh->refs = 1;
        return fresh;
}

/*
 * Wait until every node on ctx->rsrc_ref_list has been retired.
 *
 * The current ctx->rsrc_node is swapped for a fresh one and queued as an
 * "empty" node. The function then sleeps interruptibly, dropping
 * ctx->uring_lock around each wait and retaking it before returning, so it
 * expects to be entered with that lock held.
 *
 * Returns 0 once the list is empty, -ENXIO if @data is already being
 * quiesced, -ENOMEM if the replacement node cannot be allocated, or the
 * negative value from io_run_task_work_sig() when that fails while nodes are
 * still pending.
 */
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
                                      struct io_ring_ctx *ctx)
{
        struct io_rsrc_node *backup;
        DEFINE_WAIT(we);
        int ret;

        /*
         * Since we may drop ->uring_lock below, another task may already
         * have started a quiesce on this data.
         */
        if (data->quiesce)
                return -ENXIO;

        backup = io_rsrc_node_alloc(ctx);
        if (!backup)
                return -ENOMEM;
        /*
         * Retire the current node as an empty marker:
         * io_rsrc_node_ref_zero() skips io_rsrc_put_work() for empty nodes.
         */
        ctx->rsrc_node->empty = true;
        ctx->rsrc_node->type = -1;
        list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
        /* NOTE(review): presumably drops the initial reference - confirm */
        io_put_rsrc_node(ctx, ctx->rsrc_node);
        ctx->rsrc_node = backup;

        /* nothing is pending, so there is nothing to wait for */
        if (list_empty(&ctx->rsrc_ref_list))
                return 0;

        /*
         * NOTE(review): for DEFER_TASKRUN rings this appears to arrange for
         * deferred task work to wake this task while it sleeps - confirm.
         */
        if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
                atomic_set(&ctx->cq_wait_nr, 1);
                smp_mb();
        }

        ctx->rsrc_quiesce++;
        data->quiesce = true;
        do {
                /* queue on the waitqueue before unlocking and sleeping */
                prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
                mutex_unlock(&ctx->uring_lock);

                ret = io_run_task_work_sig(ctx);
                if (ret < 0) {
                        __set_current_state(TASK_RUNNING);
                        mutex_lock(&ctx->uring_lock);
                        /* the list drained anyway: report success */
                        if (list_empty(&ctx->rsrc_ref_list))
                                ret = 0;
                        break;
                }

                schedule();
                __set_current_state(TASK_RUNNING);
                mutex_lock(&ctx->uring_lock);
                /* discard any non-negative value from io_run_task_work_sig() */
                ret = 0;
        } while (!list_empty(&ctx->rsrc_ref_list));

        finish_wait(&ctx->rsrc_quiesce_wq, &we);
        data->quiesce = false;
        ctx->rsrc_quiesce--;

        if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
                atomic_set(&ctx->cq_wait_nr, 0);
                smp_mb();
        }
        return ret;
}

/*
 * Free a two-level table created for @size bytes of payload: first each of
 * the DIV_ROUND_UP(size, PAGE_SIZE) second-level chunks, then the top-level
 * pointer array itself.
 */
static void io_free_page_table(void **table, size_t size)
{
        unsigned remaining = DIV_ROUND_UP(size, PAGE_SIZE);
        void **chunk = table;

        while (remaining--)
                kfree(*chunk++);
        kfree(table);
}

/*
 * Free a rsrc data descriptor together with its tag table, if one was
 * allocated. The table size is derived from data->nr tag entries.
 */
static void io_rsrc_data_free(struct io_rsrc_data *data)
{
        if (data->tags) {
                size_t bytes = data->nr * sizeof(data->tags[0][0]);

                io_free_page_table((void **)data->tags, bytes);
        }
        kfree(data);
}

/*
 * Allocate a zeroed two-level table able to hold @size bytes of payload.
 *
 * The top level is an array of DIV_ROUND_UP(size, PAGE_SIZE) pointers; each
 * points to a chunk of at most PAGE_SIZE bytes, the last one holding the
 * remainder. Returns NULL on allocation failure with nothing left allocated.
 */
static __cold void **io_alloc_page_table(size_t size)
{
        unsigned idx, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
        size_t remaining = size;
        void **table;

        table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
        if (!table)
                return NULL;

        for (idx = 0; idx < nr_tables; idx++) {
                unsigned int chunk = min_t(size_t, remaining, PAGE_SIZE);

                table[idx] = kzalloc(chunk, GFP_KERNEL_ACCOUNT);
                if (!table[idx])
                        goto unwind;
                remaining -= chunk;
        }
        return table;

unwind:
        /* slots not yet filled are still NULL from kcalloc() */
        io_free_page_table(table, size);
        return NULL;
}

/*
 * Allocate a rsrc data descriptor with room for @nr tags.
 *
 * When @utags is non-NULL the @nr tags are copied in from user space;
 * otherwise the tag table stays zeroed. On success the descriptor is stored
 * in *@pdata and 0 is returned. On failure nothing is stored and -ENOMEM or
 * -EFAULT is returned.
 */
__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
                                     u64 __user *utags,
                                     unsigned nr, struct io_rsrc_data **pdata)
{
        struct io_rsrc_data *data;
        unsigned idx;

        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (data == NULL)
                return -ENOMEM;

        data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
        if (data->tags == NULL) {
                kfree(data);
                return -ENOMEM;
        }

        data->nr = nr;
        data->ctx = ctx;
        data->rsrc_type = type;

        for (idx = 0; utags && idx < nr; idx++) {
                u64 *dst = io_get_tag_slot(data, idx);

                if (copy_from_user(dst, utags + idx, sizeof(*dst))) {
                        io_rsrc_data_free(data);
                        return -EFAULT;
                }
        }

        *pdata = data;
        return 0;
}

/*
 * Update registered file slots [up->offset, up->offset + nr_args).
 *
 * up->data is a user array of fds and up->tags an optional user array of
 * tags, both indexed from 0. For each entry:
 *   - IORING_REGISTER_FILES_SKIP leaves the slot untouched;
 *   - -1 only removes whatever file the slot holds;
 *   - any other fd replaces the slot's file and stores the entry's tag.
 * A non-zero tag is rejected for SKIP and -1 entries.
 *
 * Processing stops at the first error. Returns the number of entries handled
 * before stopping if that is non-zero, otherwise the error (0 or a negative
 * errno). The caller, __io_register_rsrc_update(), has already checked that
 * up->offset + nr_args does not overflow.
 */
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                                 struct io_uring_rsrc_update2 *up,
                                 unsigned nr_args)
{
        u64 __user *tags = u64_to_user_ptr(up->tags);
        __s32 __user *fds = u64_to_user_ptr(up->data);
        struct io_rsrc_data *data = ctx->file_data;
        struct io_fixed_file *file_slot;
        int fd, i, err = 0;
        unsigned int done;

        /* no file table registered */
        if (!ctx->file_data)
                return -ENXIO;
        if (up->offset + nr_args > ctx->nr_user_files)
                return -EINVAL;

        for (done = 0; done < nr_args; done++) {
                u64 tag = 0;

                if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
                    copy_from_user(&fd, &fds[done], sizeof(fd))) {
                        err = -EFAULT;
                        break;
                }
                /* a tag is only meaningful when a file is being installed */
                if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
                        err = -EINVAL;
                        break;
                }
                if (fd == IORING_REGISTER_FILES_SKIP)
                        continue;

                /* index was range-checked above; clamp it for speculation too */
                i = array_index_nospec(up->offset + done, ctx->nr_user_files);
                file_slot = io_fixed_file_slot(&ctx->file_table, i);

                /* slot is occupied: queue the old file for removal first */
                if (file_slot->file_ptr) {
                        err = io_queue_rsrc_removal(data, i,
                                                    io_slot_file(file_slot));
                        if (err)
                                break;
                        file_slot->file_ptr = 0;
                        io_file_bitmap_clear(&ctx->file_table, i);
                }
                if (fd != -1) {
                        struct file *file = fget(fd);

                        if (!file) {
                                err = -EBADF;
                                break;
                        }
                        /*
                         * Don't allow io_uring instances to be registered.
                         */
                        if (io_is_uring_fops(file)) {
                                fput(file);
                                err = -EBADF;
                                break;
                        }
                        *io_get_tag_slot(data, i) = tag;
                        io_fixed_file_set(file_slot, file);
                        io_file_bitmap_set(&ctx->file_table, i);
                }
        }
        /* partial progress takes precedence over the error */
        return done ? done : err;
}

/*
 * Update registered buffer slots [up->offset, up->offset + nr_args).
 *
 * up->data is a user array of iovecs (read through io_copy_iov(), so the
 * compat layout is honoured) and up->tags an optional user array of tags.
 * Each entry is validated, registered as a new mapping and then installed in
 * its slot; a slot that held a real buffer has that buffer queued for
 * removal first. A non-zero tag is rejected for an entry with a NULL base.
 *
 * Processing stops at the first error. Returns the number of entries handled
 * before stopping if that is non-zero, otherwise the error (0 or a negative
 * errno). The caller, __io_register_rsrc_update(), has already checked that
 * up->offset + nr_args does not overflow.
 */
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
                                   struct io_uring_rsrc_update2 *up,
                                   unsigned int nr_args)
{
        u64 __user *tags = u64_to_user_ptr(up->tags);
        struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
        struct page *last_hpage = NULL;
        __u32 done;
        /*
         * Start from 0, as __io_sqe_files_update() does, so that a call with
         * nr_args == 0 returns a defined value instead of an indeterminate one.
         */
        int i, err = 0;

        /* no buffer table registered */
        if (!ctx->buf_data)
                return -ENXIO;
        if (up->offset + nr_args > ctx->nr_user_bufs)
                return -EINVAL;

        for (done = 0; done < nr_args; done++) {
                struct io_mapped_ubuf *imu;
                u64 tag = 0;

                err = io_copy_iov(ctx, &iov, iovs, done);
                if (err)
                        break;
                if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
                        err = -EFAULT;
                        break;
                }
                err = io_buffer_validate(&iov);
                if (err)
                        break;
                /* a tag is only meaningful when a buffer is being installed */
                if (!iov.iov_base && tag) {
                        err = -EINVAL;
                        break;
                }
                err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
                if (err)
                        break;

                /* index was range-checked above; clamp it for speculation too */
                i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
                if (ctx->user_bufs[i] != &dummy_ubuf) {
                        err = io_queue_rsrc_removal(ctx->buf_data, i,
                                                    ctx->user_bufs[i]);
                        if (unlikely(err)) {
                                /* the new mapping was never installed: drop it */
                                io_buffer_unmap(ctx, &imu);
                                break;
                        }
                        /* the old buffer is now owned by the queued node */
                        ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
                }

                ctx->user_bufs[i] = imu;
                *io_get_tag_slot(ctx->buf_data, i) = tag;
        }
        /* partial progress takes precedence over the error */
        return done ? done : err;
}

/*
 * Dispatch a resource update to the file or buffer implementation.
 *
 * Must be called with ctx->uring_lock held. Returns -EOVERFLOW when
 * up->offset + nr_args wraps, -EINVAL for an unknown @type, and otherwise
 * whatever the type-specific update returns.
 */
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
                                     struct io_uring_rsrc_update2 *up,
                                     unsigned nr_args)
{
        __u32 end;

        lockdep_assert_held(&ctx->uring_lock);

        /* only the overflow result is needed; the sum itself is unused */
        if (check_add_overflow(up->offset, nr_args, &end))
                return -EOVERFLOW;

        if (type == IORING_RSRC_FILE)
                return __io_sqe_files_update(ctx, up, nr_args);
        if (type == IORING_RSRC_BUFFER)
                return __io_sqe_buffers_update(ctx, up, nr_args);
        return -EINVAL;
}

/*
 * File update entry point taking the smaller struct io_uring_rsrc_update
 * from user space.
 *
 * Only sizeof(struct io_uring_rsrc_update) bytes are copied into a zeroed
 * io_uring_rsrc_update2, so the remaining fields stay 0. Returns -EINVAL for
 * nr_args == 0 or non-zero reserved fields, -EFAULT on a failed copy, and
 * otherwise the result of the file update.
 */
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
                             unsigned nr_args)
{
        struct io_uring_rsrc_update2 up;

        if (nr_args == 0)
                return -EINVAL;

        memset(&up, 0, sizeof(up));
        if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)) != 0)
                return -EFAULT;

        if (up.resv != 0 || up.resv2 != 0)
                return -EINVAL;

        return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

/*
 * Resource update entry point taking a full struct io_uring_rsrc_update2.
 *
 * @size must equal the structure size exactly. Returns -EINVAL for a size
 * mismatch, a zero entry count or non-zero reserved fields, -EFAULT on a
 * failed copy, and otherwise the result of the update for @type.
 */
int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned size, unsigned type)
{
        struct io_uring_rsrc_update2 up;

        if (size != sizeof(up))
                return -EINVAL;

        if (copy_from_user(&up, arg, sizeof(up)) != 0)
                return -EFAULT;

        if (up.nr == 0 || up.resv != 0 || up.resv2 != 0)
                return -EINVAL;

        return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

/*
 * Register a file or buffer table described by a user-supplied
 * struct io_uring_rsrc_register.
 *
 * @size must equal the structure size exactly, which keeps the interface
 * extendible. Returns -EINVAL for a size mismatch, a zero entry count, a
 * non-zero resv2, unknown flags, an unknown @type, or a sparse registration
 * that also supplies data; -EFAULT on a failed copy; otherwise the result of
 * the type-specific registration.
 */
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned int size, unsigned int type)
{
        struct io_uring_rsrc_register rr;
        bool sparse_with_data;

        if (size != sizeof(rr))
                return -EINVAL;

        memset(&rr, 0, sizeof(rr));
        if (copy_from_user(&rr, arg, size) != 0)
                return -EFAULT;

        if (rr.nr == 0 || rr.resv2 != 0)
                return -EINVAL;
        if ((rr.flags & ~IORING_RSRC_REGISTER_SPARSE) != 0)
                return -EINVAL;

        if (type != IORING_RSRC_FILE && type != IORING_RSRC_BUFFER)
                return -EINVAL;

        /* a sparse table must not come with a data array */
        sparse_with_data = (rr.flags & IORING_RSRC_REGISTER_SPARSE) && rr.data;
        if (sparse_with_data)
                return -EINVAL;

        if (type == IORING_RSRC_FILE)
                return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
                                             rr.nr, u64_to_user_ptr(rr.tags));

        return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
                                       rr.nr, u64_to_user_ptr(rr.tags));
}

/*
 * Prepare a files-update request from its SQE.
 *
 * Rejects requests flagged as fixed-file or buffer-select and SQEs with
 * rw_flags or splice_fd_in set. The offset, entry count and user address are
 * taken from sqe->off, sqe->len and sqe->addr; a zero entry count is
 * rejected. Returns 0 or -EINVAL.
 */
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
                return -EINVAL;
        if (sqe->rw_flags != 0 || sqe->splice_fd_in != 0)
                return -EINVAL;

        up->offset = READ_ONCE(sqe->off);
        up->nr_args = READ_ONCE(sqe->len);
        if (up->nr_args == 0)
                return -EINVAL;

        up->arg = READ_ONCE(sqe->addr);
        return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
                                            unsigned int issue_flags)
{
        struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
        __s32 __user *fds = u64_to_user_ptr(up->arg);
        unsigned int done;
        struct file *file;
        int ret, fd;

        if (!req->ctx->file_data)
                return -ENXIO;

        for (done = 0; done < up->nr_args; done++) {
                if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
                        ret = -EFAULT;
                        break;
                }

                file = fget(fd);
                if (!file) {
                        ret = -EBADF;
                        break;
                }
                ret = io_fixed_fd_install(req, issue_flags, file,
                                          IORING_FILE_INDEX_ALLOC);
                if (ret < 0)
                        break;
                if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
                        __io_close_fixed(req->ctx, issue_flags, ret);
                        ret = -EFAULT;
                        break;
                }
        }

        if (done)
                return done;
        return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
        struct io_ring_ctx *ctx = req->ctx;
        struct io_uring_rsrc_update2 up2;
        int ret;

        up2.offset = up->offset;
        up2.data = up->arg;
        up2.nr = 0;
        up2.tags = 0;
        up2.resv = 0;
        up2.resv2 = 0;

        if (up->offset == IORING_FILE_INDEX_ALLOC) {
                ret = io_files_update_with_index_alloc(req, issue_flags);
        } else {
                io_ring_submit_lock(ctx, issue_flags);
                ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
                                                &up2, up->nr_args);
                io_ring_submit_unlock(ctx, issue_flags);
        }

        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        return IOU_OK;
}

/*
 * Hand resource @rsrc, currently occupying slot @idx of @data, over to the
 * ring's current rsrc node and queue that node for retirement.
 *
 * A fresh node becomes the ring's current node. The old node records the
 * resource, its type and the slot's tag (the tag slot is reset to 0), is
 * appended to rsrc_ref_list and then put. Returns 0, or -ENOMEM when no
 * replacement node can be allocated, in which case nothing is changed.
 */
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
{
        struct io_ring_ctx *ctx = data->ctx;
        struct io_rsrc_node *retiring = ctx->rsrc_node;
        u64 *tag_slot = io_get_tag_slot(data, idx);
        struct io_rsrc_node *replacement;

        replacement = io_rsrc_node_alloc(ctx);
        if (unlikely(!replacement))
                return -ENOMEM;
        ctx->rsrc_node = replacement;

        retiring->item.rsrc = rsrc;
        retiring->type = data->rsrc_type;
        retiring->item.tag = *tag_slot;
        *tag_slot = 0;

        list_add_tail(&retiring->node, &ctx->rsrc_ref_list);
        io_put_rsrc_node(ctx, retiring);
        return 0;
}

/*
 * Tear down the registered file table: drop the reference on every file
 * still installed, free the table storage, reset the allocation range and
 * release the rsrc data, leaving the ring with no registered files.
 */
void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
        int slot;

        for (slot = 0; slot < ctx->nr_user_files; slot++) {
                struct file *file = io_file_from_index(&ctx->file_table, slot);

                if (file) {
                        io_file_bitmap_clear(&ctx->file_table, slot);
                        fput(file);
                }
        }

        io_free_file_tables(&ctx->file_table);
        io_file_table_set_alloc_range(ctx, 0, 0);
        io_rsrc_data_free(ctx->file_data);
        ctx->file_data = NULL;
        ctx->nr_user_files = 0;
}

/*
 * Unregister the fixed file table after quiescing outstanding rsrc nodes.
 *
 * Returns -ENXIO if no table is registered, the quiesce error if waiting
 * fails (the table is then left registered), or 0 once it has been torn
 * down.
 */
int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
        unsigned saved_nr = ctx->nr_user_files;
        int ret;

        if (!ctx->file_data)
                return -ENXIO;

        /*
         * The quiesce may drop ->uring_lock; hide the table size meanwhile so
         * new requests cannot use the table, then restore it.
         */
        ctx->nr_user_files = 0;
        ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
        ctx->nr_user_files = saved_nr;

        if (ret)
                return ret;

        __io_sqe_files_unregister(ctx);
        return 0;
}

/*
 * Register a fixed file table with @nr_args slots.
 *
 * @arg is a user array of fds, or NULL to create an entirely sparse table.
 * An fd of -1 leaves its slot empty. @tags is an optional user array of
 * per-slot tags; an empty slot must not carry a non-zero tag.
 *
 * Returns 0 on success, or:
 *   -EBUSY   a file table is already registered
 *   -EINVAL  nr_args is 0, or an empty slot has a non-zero tag
 *   -EMFILE  nr_args exceeds IORING_MAX_FIXED_FILES or RLIMIT_NOFILE
 *   -ENOMEM  the file table could not be allocated
 *   -EFAULT  reading an fd from user space failed
 *   -EBADF   an fd is invalid or refers to an io_uring instance
 * or the error from io_rsrc_data_alloc(). On failure after the tables were
 * allocated, everything registered so far is torn down again.
 */
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
                          unsigned nr_args, u64 __user *tags)
{
        __s32 __user *fds = (__s32 __user *) arg;
        struct file *file;
        int fd, ret;
        unsigned i;

        if (ctx->file_data)
                return -EBUSY;
        if (!nr_args)
                return -EINVAL;
        if (nr_args > IORING_MAX_FIXED_FILES)
                return -EMFILE;
        if (nr_args > rlimit(RLIMIT_NOFILE))
                return -EMFILE;
        ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
                                 &ctx->file_data);
        if (ret)
                return ret;

        if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
                io_rsrc_data_free(ctx->file_data);
                ctx->file_data = NULL;
                return -ENOMEM;
        }

        /*
         * ctx->nr_user_files advances in step with i, so that the fail path's
         * __io_sqe_files_unregister() only walks slots already processed.
         */
        for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
                struct io_fixed_file *file_slot;

                if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
                        ret = -EFAULT;
                        goto fail;
                }
                /* allow sparse sets */
                if (!fds || fd == -1) {
                        ret = -EINVAL;
                        /* an empty slot must not have a tag */
                        if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
                                goto fail;
                        continue;
                }

                file = fget(fd);
                /* -EBADF covers both a bad fd and an io_uring file below */
                ret = -EBADF;
                if (unlikely(!file))
                        goto fail;

                /*
                 * Don't allow io_uring instances to be registered.
                 */
                if (io_is_uring_fops(file)) {
                        fput(file);
                        goto fail;
                }
                file_slot = io_fixed_file_slot(&ctx->file_table, i);
                io_fixed_file_set(file_slot, file);
                io_file_bitmap_set(&ctx->file_table, i);
        }

        /* default it to the whole table */
        io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
        return 0;
fail:
        __io_sqe_files_unregister(ctx);
        return ret;
}

/*
 * Release the buffer carried by a retired rsrc item and leave its buffer
 * pointer cleared.
 */
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
        struct io_mapped_ubuf **slot = &prsrc->buf;

        io_buffer_unmap(ctx, slot);
        *slot = NULL;
}

/*
 * Tear down the registered buffer table: unmap every slot, free the slot
 * array and the rsrc data, and leave the ring with no registered buffers.
 */
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
        unsigned int idx = 0;

        while (idx < ctx->nr_user_bufs) {
                io_buffer_unmap(ctx, ctx->user_bufs + idx);
                idx++;
        }

        kfree(ctx->user_bufs);
        io_rsrc_data_free(ctx->buf_data);

        ctx->user_bufs = NULL;
        ctx->buf_data = NULL;
        ctx->nr_user_bufs = 0;
}

/*
 * Unregister the fixed buffer table after quiescing outstanding rsrc nodes.
 *
 * Returns -ENXIO if no table is registered, the quiesce error if waiting
 * fails (the table is then left registered), or 0 once it has been torn
 * down.
 */
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
        unsigned saved_nr = ctx->nr_user_bufs;
        int ret;

        if (!ctx->buf_data)
                return -ENXIO;

        /*
         * The quiesce may drop ->uring_lock; hide the table size meanwhile so
         * new requests cannot use the table, then restore it.
         */
        ctx->nr_user_bufs = 0;
        ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
        ctx->nr_user_bufs = saved_nr;

        if (ret)
                return ret;

        __io_sqe_buffers_unregister(ctx);
        return 0;
}

/*
 * Report whether the compound head @hpage has been seen already, either in
 * the first @nr_pages entries of @pages (the buffer currently being
 * registered) or in any bvec of the buffers already present in
 * ctx->user_bufs.
 *
 * The caller uses this to avoid accounting the same huge page twice, which
 * in turn lets it charge the full size of the compound page rather than just
 * its constituent pages. This is a linear search, which is acceptable since
 * it only runs at registration time, and the caller caches the last compound
 * head so the full search is only needed when that cache misses.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
                                  int nr_pages, struct page *hpage)
{
        int i, j;

        /* pages of the buffer that is being registered right now */
        for (i = 0; i < nr_pages; i++) {
                struct page *page = pages[i];

                if (PageCompound(page) && compound_head(page) == hpage)
                        return true;
        }

        /* pages that belong to buffers registered earlier */
        for (i = 0; i < ctx->nr_user_bufs; i++) {
                struct io_mapped_ubuf *mapped = ctx->user_bufs[i];

                for (j = 0; j < mapped->nr_bvecs; j++) {
                        struct page *page = mapped->bvec[j].bv_page;

                        if (PageCompound(page) && compound_head(page) == hpage)
                                return true;
                }
        }

        return false;
}

/*
 * Work out how many pages must be charged for the pinned @pages of a buffer
 * and charge them through io_account_mem().
 *
 * Non-compound pages count as one page each. A compound page is charged once
 * for its full size, and only if its head is neither the most recently seen
 * head (*@last_hpage, updated here) nor already accounted according to
 * headpage_already_acct().
 *
 * The result is stored in imu->acct_pages. Returns 0 on success (including
 * when nothing needs charging) or the error from io_account_mem(), in which
 * case imu->acct_pages is reset to 0.
 */
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
                                 int nr_pages, struct io_mapped_ubuf *imu,
                                 struct page **last_hpage)
{
        int idx, err;

        imu->acct_pages = 0;
        for (idx = 0; idx < nr_pages; idx++) {
                struct page *head;

                if (!PageCompound(pages[idx])) {
                        imu->acct_pages++;
                        continue;
                }

                head = compound_head(pages[idx]);
                /* cheap check first: same huge page as the previous one */
                if (head == *last_hpage)
                        continue;
                *last_hpage = head;

                /* only look at the pages that precede this one */
                if (!headpage_already_acct(ctx, pages, idx, head))
                        imu->acct_pages += page_size(head) >> PAGE_SHIFT;
        }

        if (!imu->acct_pages)
                return 0;

        err = io_account_mem(ctx, imu->acct_pages);
        if (err)
                imu->acct_pages = 0;
        return err;
}

static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
                                  struct io_mapped_ubuf **pimu,
                                  struct page **last_hpage)
{
        struct io_mapped_ubuf *imu = NULL;
        struct page **pages = NULL;
        unsigned long off;
        size_t size;
        int ret, nr_pages, i;
        struct folio *folio = NULL;

        *pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
        if (!iov->iov_base)
                return 0;

        ret = -ENOMEM;
        pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
                                &nr_pages);
        if (IS_ERR(pages)) {
                ret = PTR_ERR(pages);
                pages = NULL;
                goto done;
        }

        /* If it's a huge page, try to coalesce them into a single bvec entry */
        if (nr_pages > 1) {
                folio = page_folio(pages[0]);
                for (i = 1; i < nr_pages; i++) {
                        /*
                         * Pages must be consecutive and on the same folio for
                         * this to work
                         */
                        if (page_folio(pages[i]) != folio ||
                            pages[i] != pages[i - 1] + 1) {
                                folio = NULL;
                                break;
                        }
                }
                if (folio) {
                        /*
                         * The pages are bound to the folio, it doesn't
                         * actually unpin them but drops all but one reference,
                         * which is usually put down by io_buffer_unmap().
                         * Note, needs a better helper.
                         */
                        unpin_user_pages(&pages[1], nr_pages - 1);
                        nr_pages = 1;
                }
        }

        imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
        if (!imu)
                goto done;

        ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
        if (ret) {
                unpin_user_pages(pages, nr_pages);
                goto done;
        }

        off = (unsigned long) iov->iov_base & ~PAGE_MASK;
        size = iov->iov_len;
        /* store original address for later verification */
        imu->ubuf = (unsigned long) iov->iov_base;
        imu->ubuf_end = imu->ubuf + iov->iov_len;
        imu->nr_bvecs = nr_pages;
        *pimu = imu;
        ret = 0;

        if (folio) {
                bvec_set_page(&imu->bvec[0], pages[0], size, off);
                goto done;
        }
        for (i = 0; i < nr_pages; i++) {
                size_t vec_len;

                vec_len = min_t(size_t, size, PAGE_SIZE - off);
                bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
                off = 0;
                size -= vec_len;
        }
done:
        if (ret)
                kvfree(imu);
        kvfree(pages);
        return ret;
}

/*
 * Allocate the zero-initialised slot array ctx->user_bufs with room for
 * @nr_args buffer pointers. Returns 0 on success, -ENOMEM otherwise.
 */
static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
        ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
        if (!ctx->user_bufs)
                return -ENOMEM;

        return 0;
}

/*
 * Register @nr_args buffers with @ctx.
 *
 * @arg:  user array the iovecs are fetched from with io_copy_iov(); when
 *        NULL every slot is registered as an empty placeholder
 * @tags: user tag array handed to io_rsrc_data_alloc()
 *
 * Returns 0 on success. Fails with -EBUSY if a buffer table already exists,
 * -EINVAL for a zero or too large @nr_args or for a tagged empty slot, or
 * with the error of whichever helper failed. On failure inside the loop the
 * partially filled table is torn down again.
 */
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned int nr_args, u64 __user *tags)
{
        struct page *prev_head = NULL;
        struct io_rsrc_data *rsrc_data;
        struct iovec uvec;
        int i, ret;

        BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

        if (ctx->user_bufs)
                return -EBUSY;
        if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
                return -EINVAL;

        ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args,
                                 &rsrc_data);
        if (ret)
                return ret;

        ret = io_buffers_map_alloc(ctx, nr_args);
        if (ret) {
                io_rsrc_data_free(rsrc_data);
                return ret;
        }

        /*
         * nr_user_bufs only advances once a slot is fully set up, so on a
         * break it counts exactly the slots that need unmapping.
         */
        for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
                if (!arg) {
                        memset(&uvec, 0, sizeof(uvec));
                } else {
                        ret = io_copy_iov(ctx, &uvec, arg, i);
                        if (!ret)
                                ret = io_buffer_validate(&uvec);
                        if (ret)
                                break;
                }

                /* an empty slot must not carry a tag */
                if (!uvec.iov_base && *io_get_tag_slot(rsrc_data, i)) {
                        ret = -EINVAL;
                        break;
                }

                ret = io_sqe_buffer_register(ctx, &uvec, &ctx->user_bufs[i],
                                             &prev_head);
                if (ret)
                        break;
        }

        WARN_ON_ONCE(ctx->buf_data);

        /* publish the data first so the teardown below can free it too */
        ctx->buf_data = rsrc_data;
        if (ret)
                __io_sqe_buffers_unregister(ctx);
        return ret;
}

/*
 * Set up @iter as a bvec iterator in direction @ddir over the part of the
 * registered buffer @imu that covers [@buf_addr, @buf_addr + @len).
 *
 * Returns 0 on success, or -EFAULT if @imu is NULL, the range overflows, or
 * the range is not fully contained in [imu->ubuf, imu->ubuf_end).
 */
int io_import_fixed(int ddir, struct iov_iter *iter,
                           struct io_mapped_ubuf *imu,
                           u64 buf_addr, size_t len)
{
        const struct bio_vec *first;
        unsigned long nr_skip;
        size_t skip;
        u64 end;

        if (WARN_ON_ONCE(!imu))
                return -EFAULT;
        if (unlikely(check_add_overflow(buf_addr, (u64)len, &end)))
                return -EFAULT;
        /* the request must lie entirely inside the mapped region */
        if (unlikely(buf_addr < imu->ubuf || end > imu->ubuf_end))
                return -EFAULT;

        /*
         * The request might not begin at the start of the buffer: size the
         * iterator to reach the end of the request, then move its start
         * forward by hand below.
         */
        skip = buf_addr - imu->ubuf;
        iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, skip + len);
        if (!skip)
                return 0;

        /*
         * iov_iter_advance() is deliberately avoided: it walks every segment
         * and is really slow when only the tail of a big fixed buffer is
         * used. The iterator can be adjusted directly instead because:
         *
         * 1) it is a BVEC iter that was set up right above
         * 2) all bvecs are PAGE_SIZE in size, except potentially the
         *    first and the last one
         */
        first = imu->bvec;
        if (skip < first->bv_len) {
                /*
                 * The start lies inside the first bvec. Huge page buffers
                 * consist of one large bvec entry and should always end up
                 * here; the index arithmetic below does not cope with
                 * chunks that are not PAGE_SIZE'd.
                 */
                iter->bvec = first;
                iter->count -= skip;
                iter->iov_offset = skip;
                return 0;
        }

        /* step over the (possibly unaligned) first bvec, then whole pages */
        skip -= first->bv_len;
        nr_skip = 1 + (skip >> PAGE_SHIFT);

        iter->bvec = first + nr_skip;
        iter->nr_segs -= nr_skip;
        iter->count -= first->bv_len + skip;
        iter->iov_offset = skip & ~PAGE_MASK;

        return 0;
}



































    3 












    1 
    1 






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 









    1 




    1 








































































    1 









    1 








    1 


    1 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 

































































    1 













































    2 



    2 






































































































































































































































































































































    1 












    1 
































































































    1 








    1 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
#include <linux/sched/mm.h>

#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "tree-checker.h"

enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info)
{
        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
                return BTRFS_QGROUP_MODE_DISABLED;
        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
                return BTRFS_QGROUP_MODE_SIMPLE;
        return BTRFS_QGROUP_MODE_FULL;
}

bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info)
{
        return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED;
}

bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info)
{
        return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL;
}

/*
 * Helpers to access qgroup reservation
 *
 * Callers should ensure the lock context and type are valid
 */

/* Sum the reserved bytes of @qgroup over every reservation type. */
static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
{
        u64 total = 0;
        int type = 0;

        while (type < BTRFS_QGROUP_RSV_LAST) {
                total += qgroup->rsv.values[type];
                type++;
        }

        return total;
}

#ifdef CONFIG_BTRFS_DEBUG
/*
 * Map a reservation type to a short name for debug messages.
 *
 * Returns NULL for any type that is not one of the three known ones.
 */
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
        switch (type) {
        case BTRFS_QGROUP_RSV_DATA:
                return "data";
        case BTRFS_QGROUP_RSV_META_PERTRANS:
                return "meta_pertrans";
        case BTRFS_QGROUP_RSV_META_PREALLOC:
                return "meta_prealloc";
        default:
                return NULL;
        }
}
#endif

/*
 * Account @num_bytes of additional reservation of the given @type to @qgroup.
 *
 * The tracepoint fires before the counter is updated.
 */
static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
                           struct btrfs_qgroup *qgroup, u64 num_bytes,
                           enum btrfs_qgroup_rsv_type type)
{
        trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);

        qgroup->rsv.values[type] = qgroup->rsv.values[type] + num_bytes;
}

/*
 * Drop @num_bytes of reservation of the given @type from @qgroup.
 *
 * If more is released than is currently reserved, the counter is clamped to
 * zero rather than wrapping around; debug builds additionally emit a
 * rate-limited warning.
 */
static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
                               struct btrfs_qgroup *qgroup, u64 num_bytes,
                               enum btrfs_qgroup_rsv_type type)
{
        trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);

        if (qgroup->rsv.values[type] < num_bytes) {
#ifdef CONFIG_BTRFS_DEBUG
                WARN_RATELIMIT(1,
                        "qgroup %llu %s reserved space underflow, have %llu to free %llu",
                        qgroup->qgroupid, qgroup_rsv_type_str(type),
                        qgroup->rsv.values[type], num_bytes);
#endif
                /* Underflow: clamp instead of going negative. */
                qgroup->rsv.values[type] = 0;
                return;
        }

        qgroup->rsv.values[type] -= num_bytes;
}

/* Add every per-type reservation currently held by @src to @dest. */
static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
                                     struct btrfs_qgroup *dest,
                                     struct btrfs_qgroup *src)
{
        int type = 0;

        while (type < BTRFS_QGROUP_RSV_LAST) {
                qgroup_rsv_add(fs_info, dest, src->rsv.values[type], type);
                type++;
        }
}

/* Release from @dest every per-type reservation currently held by @src. */
static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
                                         struct btrfs_qgroup *dest,
                                         struct btrfs_qgroup *src)
{
        int type = 0;

        while (type < BTRFS_QGROUP_RSV_LAST) {
                qgroup_rsv_release(fs_info, dest, src->rsv.values[type], type);
                type++;
        }
}

/*
 * Apply @mod to the old reference count of @qg, relative to @seq.
 *
 * A counter that is below @seq is first rebased onto @seq, so the stored value
 * always reads as "@seq + count".
 */
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
                                           int mod)
{
        u64 base = qg->old_refcnt;

        if (base < seq)
                base = seq;
        qg->old_refcnt = base + mod;
}

/*
 * Apply @mod to the new reference count of @qg, relative to @seq.
 *
 * A counter that is below @seq is first rebased onto @seq, so the stored value
 * always reads as "@seq + count".
 */
static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
                                           int mod)
{
        u64 base = qg->new_refcnt;

        if (base < seq)
                base = seq;
        qg->new_refcnt = base + mod;
}

/* Old reference count of @qg relative to @seq; 0 if the counter is below @seq. */
static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
        return (qg->old_refcnt < seq) ? 0 : qg->old_refcnt - seq;
}

/* New reference count of @qg relative to @seq; 0 if the counter is below @seq. */
static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
        return (qg->new_refcnt < seq) ? 0 : qg->new_refcnt - seq;
}

/*
 * glue structure to represent the relations between qgroups.
 *
 * One instance describes a single member -> parent relation and is linked
 * into two lists at the same time, one owned by each side of the relation.
 */
struct btrfs_qgroup_list {
        /* Entry in the member qgroup's ->groups list. */
        struct list_head next_group;
        /* Entry in the parent qgroup's ->members list. */
        struct list_head next_member;
        /* The parent side of the relation. */
        struct btrfs_qgroup *group;
        /* The member (child) side of the relation. */
        struct btrfs_qgroup *member;
};

static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
                   int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);

/*
 * Look up a qgroup by id in the in-memory rb-tree.
 *
 * Must be called with qgroup_ioctl_lock held.
 *
 * The tree keeps larger ids on the left, so the descent goes left when the
 * current node's id is smaller than the one searched for.
 *
 * Return: the matching qgroup, or NULL if there is none.
 */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
                                           u64 qgroupid)
{
        struct rb_node *cur = fs_info->qgroup_tree.rb_node;

        while (cur) {
                struct btrfs_qgroup *entry;

                entry = rb_entry(cur, struct btrfs_qgroup, node);
                if (entry->qgroupid == qgroupid)
                        return entry;

                if (entry->qgroupid < qgroupid)
                        cur = cur->rb_left;
                else
                        cur = cur->rb_right;
        }

        return NULL;
}

/*
 * Insert a qgroup with id @qgroupid into the filesystem's qgroup tree.
 *
 * Must be called with qgroup_lock held and @prealloc preallocated.
 *
 * Ownership of @prealloc passes to this function: it either becomes the new
 * tree node or, when a qgroup with the same id already exists, is freed.  The
 * caller must not touch @prealloc afterwards.
 *
 * Return: the qgroup that is in the tree for @qgroupid (new or pre-existing).
 */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
                                          struct btrfs_qgroup *prealloc,
                                          u64 qgroupid)
{
        struct rb_node **link = &fs_info->qgroup_tree.rb_node;
        struct rb_node *parent = NULL;

        /* Caller must have pre-allocated @prealloc. */
        ASSERT(prealloc);

        while (*link) {
                struct btrfs_qgroup *cur;

                parent = *link;
                cur = rb_entry(parent, struct btrfs_qgroup, node);

                if (cur->qgroupid == qgroupid) {
                        /* Already present, the preallocation is not needed. */
                        kfree(prealloc);
                        return cur;
                }

                if (cur->qgroupid < qgroupid)
                        link = &parent->rb_left;
                else
                        link = &parent->rb_right;
        }

        prealloc->qgroupid = qgroupid;
        INIT_LIST_HEAD(&prealloc->groups);
        INIT_LIST_HEAD(&prealloc->members);
        INIT_LIST_HEAD(&prealloc->dirty);
        INIT_LIST_HEAD(&prealloc->iterator);
        INIT_LIST_HEAD(&prealloc->nested_iterator);

        rb_link_node(&prealloc->node, parent, link);
        rb_insert_color(&prealloc->node, &fs_info->qgroup_tree);

        return prealloc;
}

static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
                            struct btrfs_qgroup *qgroup)
{
        struct btrfs_qgroup_list *list;

        list_del(&qgroup->dirty);
        while (!list_empty(&qgroup->groups)) {
                list = list_first_entry(&qgroup->groups,
                                        struct btrfs_qgroup_list, next_group);
                list_del(&list->next_group);
                list_del(&list->next_member);
                kfree(list);
        }

        while (!list_empty(&qgroup->members)) {
                list = list_first_entry(&qgroup->members,
                                        struct btrfs_qgroup_list, next_member);
                list_del(&list->next_group);
                list_del(&list->next_member);
                kfree(list);
        }
}

/*
 * Unlink the qgroup with id @qgroupid from the rb-tree and drop its relations.
 *
 * Must be called with qgroup_lock held.  The qgroup structure itself is not
 * freed here.
 *
 * Return: 0 on success, -ENOENT if no such qgroup exists.
 */
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
        struct btrfs_qgroup *victim;

        victim = find_qgroup_rb(fs_info, qgroupid);
        if (victim) {
                rb_erase(&victim->node, &fs_info->qgroup_tree);
                __del_qgroup_rb(fs_info, victim);
                return 0;
        }

        return -ENOENT;
}

/*
 * Link @member under @parent using the preallocated glue structure.
 *
 * Must be called with qgroup_lock held.  Ownership of @prealloc is taken over
 * in all cases: it is either linked into both qgroups or freed, so the caller
 * must not touch it anymore.
 *
 * Return: 0        on success
 *         -ENOENT  if one of the qgroups is NULL
 */
static int __add_relation_rb(struct btrfs_qgroup_list *prealloc,
                             struct btrfs_qgroup *member,
                             struct btrfs_qgroup *parent)
{
        if (member && parent) {
                prealloc->group = parent;
                prealloc->member = member;
                list_add_tail(&prealloc->next_group, &member->groups);
                list_add_tail(&prealloc->next_member, &parent->members);
                return 0;
        }

        kfree(prealloc);
        return -ENOENT;
}

/*
 * Resolve two qgroup ids and add the member -> parent relation between them.
 *
 * Must be called with qgroup_lock held.  @prealloc is handed on to
 * __add_relation_rb(), which consumes it on every path.
 *
 * Return: 0        on success
 *         -ENOENT  if one of the ids does not exist
 *         <0       other errors
 */
static int add_relation_rb(struct btrfs_fs_info *fs_info,
                           struct btrfs_qgroup_list *prealloc,
                           u64 memberid, u64 parentid)
{
        struct btrfs_qgroup *child = find_qgroup_rb(fs_info, memberid);
        struct btrfs_qgroup *group = find_qgroup_rb(fs_info, parentid);

        return __add_relation_rb(prealloc, child, group);
}

/*
 * Remove the in-memory relation between the qgroups @memberid and @parentid.
 *
 * Must be called with qgroup_lock held.
 *
 * Return: 0 on success, -ENOENT if either qgroup or the relation between them
 * does not exist.
 */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
                           u64 memberid, u64 parentid)
{
        struct btrfs_qgroup *member = find_qgroup_rb(fs_info, memberid);
        struct btrfs_qgroup *parent = find_qgroup_rb(fs_info, parentid);
        struct btrfs_qgroup_list *found = NULL;
        struct btrfs_qgroup_list *iter;

        if (!member || !parent)
                return -ENOENT;

        /* Only the first relation pointing at @parent is considered. */
        list_for_each_entry(iter, &member->groups, next_group) {
                if (iter->group == parent) {
                        found = iter;
                        break;
                }
        }
        if (!found)
                return -ENOENT;

        list_del(&found->next_group);
        list_del(&found->next_member);
        kfree(found);
        return 0;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity-test helper: check that qgroup @qgroupid exists and that its
 * referenced and exclusive byte counts equal @rfer and @excl.
 *
 * Return: 0 on a match, -EINVAL if the qgroup is missing or a count differs.
 */
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
                               u64 rfer, u64 excl)
{
        const struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

        if (qgroup && qgroup->rfer == rfer && qgroup->excl == excl)
                return 0;

        return -EINVAL;
}
#endif

/*
 * Flag the qgroup accounting as inconsistent.
 *
 * Besides the INCONSISTENT status flag this also sets the runtime flags that
 * request cancelling a rescan and skipping accounting.  Nothing is done when
 * quotas run in simple mode.
 */
static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
{
        if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) {
                fs_info->qgroup_flags |=
                        (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
                         BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
                         BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
        }
}

static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
                                   struct extent_buffer *leaf, int slot,
                                   struct btrfs_qgroup_status_item *ptr)
{
        ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
        ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr));
        fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr);
}

/*
 * Load the on-disk quota configuration into the in-memory qgroup tree.
 *
 * The full config is read in one go, only called from open_ctree()
 * It doesn't use any locking, as at this point we're still single-threaded
 *
 * Pass 1 walks the quota root and consumes the status item together with all
 * qgroup info and limit items; pass 2 walks it again and builds the
 * member -> parent relations between the qgroups found in pass 1.
 *
 * Return: 0 on success (also when there is no quota root at all), negative
 * errno on failure.  Positive intermediate results are folded into 0.
 */
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_root *quota_root = fs_info->quota_root;
        struct btrfs_path *path = NULL;
        struct extent_buffer *l;
        int slot;
        int ret = 0;
        /* NOTE(review): never modified below, so it stays 0 for the whole call. */
        u64 flags = 0;
        u64 rescan_progress = 0;

        if (!fs_info->quota_root)
                return 0;

        fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
        if (!fs_info->qgroup_ulist) {
                ret = -ENOMEM;
                goto out;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        ret = btrfs_sysfs_add_qgroups(fs_info);
        if (ret < 0)
                goto out;
        /* default this to quota off, in case no status key is found */
        fs_info->qgroup_flags = 0;

        /*
         * pass 1: read status, all qgroup infos and limits
         */
        key.objectid = 0;
        key.type = 0;
        key.offset = 0;
        ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
        /* Any non-zero result ends the read; a positive one is not an error. */
        if (ret)
                goto out;

        while (1) {
                struct btrfs_qgroup *qgroup;

                slot = path->slots[0];
                l = path->nodes[0];
                btrfs_item_key_to_cpu(l, &found_key, slot);

                if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
                        struct btrfs_qgroup_status_item *ptr;

                        ptr = btrfs_item_ptr(l, slot,
                                             struct btrfs_qgroup_status_item);

                        if (btrfs_qgroup_status_version(l, ptr) !=
                            BTRFS_QGROUP_STATUS_VERSION) {
                                btrfs_err(fs_info,
                                 "old qgroup version, quota disabled");
                                /*
                                 * ret is still 0 here, so reading stops but
                                 * the function does not report an error.
                                 */
                                goto out;
                        }
                        fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr);
                        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) {
                                qgroup_read_enable_gen(fs_info, l, slot, ptr);
                        } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) {
                                qgroup_mark_inconsistent(fs_info);
                                btrfs_err(fs_info,
                                        "qgroup generation mismatch, marked as inconsistent");
                        }
                        rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
                        goto next1;
                }

                /* Everything except info and limit items is skipped in pass 1. */
                if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
                    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
                        goto next1;

                qgroup = find_qgroup_rb(fs_info, found_key.offset);
                /*
                 * An info item for an already known qgroup, or a limit item
                 * for an unknown one, is reported and marks the accounting as
                 * inconsistent; processing of the item continues regardless.
                 */
                if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
                    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
                        btrfs_err(fs_info, "inconsistent qgroup config");
                        qgroup_mark_inconsistent(fs_info);
                }
                if (!qgroup) {
                        struct btrfs_qgroup *prealloc;
                        struct btrfs_root *tree_root = fs_info->tree_root;

                        prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
                        if (!prealloc) {
                                ret = -ENOMEM;
                                goto out;
                        }
                        /* add_qgroup_rb() takes over ownership of prealloc. */
                        qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
                        /*
                         * If a qgroup exists for a subvolume ID, it is possible
                         * that subvolume has been deleted, in which case
                         * re-using that ID would lead to incorrect accounting.
                         *
                         * Ensure that we skip any such subvol ids.
                         *
                         * We don't need to lock because this is only called
                         * during mount before we start doing things like creating
                         * subvolumes.
                         */
                        if (is_fstree(qgroup->qgroupid) &&
                            qgroup->qgroupid > tree_root->free_objectid)
                                /*
                                 * Don't need to check against BTRFS_LAST_FREE_OBJECTID,
                                 * as it will get checked on the next call to
                                 * btrfs_get_free_objectid.
                                 */
                                tree_root->free_objectid = qgroup->qgroupid + 1;
                }
                ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
                if (ret < 0)
                        goto out;

                /* Copy the on-disk counters or limits into the in-memory qgroup. */
                switch (found_key.type) {
                case BTRFS_QGROUP_INFO_KEY: {
                        struct btrfs_qgroup_info_item *ptr;

                        ptr = btrfs_item_ptr(l, slot,
                                             struct btrfs_qgroup_info_item);
                        qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
                        qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
                        qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
                        qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
                        /* generation currently unused */
                        break;
                }
                case BTRFS_QGROUP_LIMIT_KEY: {
                        struct btrfs_qgroup_limit_item *ptr;

                        ptr = btrfs_item_ptr(l, slot,
                                             struct btrfs_qgroup_limit_item);
                        qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
                        qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
                        qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
                        qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
                        qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
                        break;
                }
                }
next1:
                /* < 0 is an error, > 0 ends the walk, 0 means another item. */
                ret = btrfs_next_item(quota_root, path);
                if (ret < 0)
                        goto out;
                if (ret)
                        break;
        }
        btrfs_release_path(path);

        /*
         * pass 2: read all qgroup relations
         */
        key.objectid = 0;
        key.type = BTRFS_QGROUP_RELATION_KEY;
        key.offset = 0;
        ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
        if (ret)
                goto out;
        while (1) {
                struct btrfs_qgroup_list *list = NULL;

                slot = path->slots[0];
                l = path->nodes[0];
                btrfs_item_key_to_cpu(l, &found_key, slot);

                if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
                        goto next2;

                if (found_key.objectid > found_key.offset) {
                        /* parent <- member, not needed to build config */
                        /* FIXME should we omit the key completely? */
                        goto next2;
                }

                list = kzalloc(sizeof(*list), GFP_KERNEL);
                if (!list) {
                        ret = -ENOMEM;
                        goto out;
                }
                ret = add_relation_rb(fs_info, list, found_key.objectid,
                                      found_key.offset);
                /* The allocation was consumed by add_relation_rb() either way. */
                list = NULL;
                if (ret == -ENOENT) {
                        btrfs_warn(fs_info,
                                "orphan qgroup relation 0x%llx->0x%llx",
                                found_key.objectid, found_key.offset);
                        ret = 0;        /* ignore the error */
                }
                if (ret)
                        goto out;
next2:
                ret = btrfs_next_item(quota_root, path);
                if (ret < 0)
                        goto out;
                if (ret)
                        break;
        }
out:
        btrfs_free_path(path);
        /* NOTE(review): flags is always 0 at this point, so this OR is a no-op. */
        fs_info->qgroup_flags |= flags;
        if (ret >= 0) {
                if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)
                        set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
                if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
                        ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
        } else {
                /* Failure: undo the ulist and sysfs setup done above. */
                ulist_free(fs_info->qgroup_ulist);
                fs_info->qgroup_ulist = NULL;
                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
                btrfs_sysfs_del_qgroups(fs_info);
        }

        return ret < 0 ? ret : 0;
}

/*
 * Called in close_ctree() when quota is still enabled.  Walks every qgroup and
 * warns about each reservation type that still holds a non-zero amount.
 *
 * Return false if no reserved space is left (or quotas are disabled).
 * Return true if some reserved space is leaked.
 */
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
{
        struct rb_node *cur;
        bool leaked = false;

        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
                return false;

        /*
         * We are unmounting, so nothing can race with us and the qgroup lock
         * is not needed.  The tree is walked in rb_first()/rb_next() order
         * rather than post-order to give a sorted, more readable report.
         */
        cur = rb_first(&fs_info->qgroup_tree);
        while (cur) {
                struct btrfs_qgroup *qg;
                int type;

                qg = rb_entry(cur, struct btrfs_qgroup, node);
                for (type = 0; type < BTRFS_QGROUP_RSV_LAST; type++) {
                        if (!qg->rsv.values[type])
                                continue;

                        leaked = true;
                        btrfs_warn(fs_info,
                "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
                                   btrfs_qgroup_level(qg->qgroupid),
                                   btrfs_qgroup_subvolid(qg->qgroupid),
                                   type, qg->rsv.values[type]);
                }
                cur = rb_next(cur);
        }

        return leaked;
}

/*
 * Free every in-memory qgroup together with its relations and sysfs entry.
 *
 * This is called from close_ctree(), open_ctree() or btrfs_quota_disable().
 * The first two are single-threaded paths.  For the third one, quota_root has
 * already been set to NULL with qgroup_lock held, so it is safe to clean up
 * the in-memory structures without holding qgroup_lock.
 */
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
        struct rb_node *cur;

        /* Keep taking the first node until the tree is empty. */
        for (cur = rb_first(&fs_info->qgroup_tree); cur;
             cur = rb_first(&fs_info->qgroup_tree)) {
                struct btrfs_qgroup *qg;

                qg = rb_entry(cur, struct btrfs_qgroup, node);
                rb_erase(cur, &fs_info->qgroup_tree);
                __del_qgroup_rb(fs_info, qg);
                btrfs_sysfs_del_one_qgroup(fs_info, qg);
                kfree(qg);
        }

        /*
         * This function runs both on unmount and when quotas get disabled, so
         * reset qgroup_ulist to NULL after freeing it to avoid a double free.
         */
        ulist_free(fs_info->qgroup_ulist);
        fs_info->qgroup_ulist = NULL;
        btrfs_sysfs_del_qgroups(fs_info);
}

/*
 * Insert an empty (zero-sized) qgroup relation item keyed
 * (@src, BTRFS_QGROUP_RELATION_KEY, @dst) into the quota root.
 *
 * Return: 0 on success, -ENOMEM if no path could be allocated, otherwise the
 * error returned by btrfs_insert_empty_item().
 */
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
                                    u64 dst)
{
        int ret;
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = src;
        key.type = BTRFS_QGROUP_RELATION_KEY;
        key.offset = dst;

        ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
        /*
         * Only touch the leaf when the insertion succeeded.  After a failed
         * insertion the path may not reference a leaf at all, in which case
         * path->nodes[0] must not be handed to btrfs_mark_buffer_dirty().
         */
        if (!ret)
                btrfs_mark_buffer_dirty(trans, path->nodes[0]);

        btrfs_free_path(path);
        return ret;
}

/*
 * Delete the qgroup relation item keyed
 * (@src, BTRFS_QGROUP_RELATION_KEY, @dst) from the quota root.
 *
 * Return: 0 on success, -ENOMEM if no path could be allocated, -ENOENT if the
 * item does not exist, otherwise the error from the search or the deletion.
 */
static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
                                    u64 dst)
{
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_key key = {
                .objectid = src,
                .type = BTRFS_QGROUP_RELATION_KEY,
                .offset = dst,
        };
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;          /* no exact match for the key */
        else if (ret == 0)
                ret = btrfs_del_item(trans, quota_root, path);

        btrfs_free_path(path);
        return ret;
}

/*
 * Create the on-disk info and limit items for qgroup @qgroupid, with every
 * counter and limit set to zero.
 *
 * An item that already exists is not treated as an error; it is simply
 * re-initialized in place.  Nothing is written for a testing fs_info.
 *
 * Return: 0 on success, -ENOMEM if no path could be allocated, otherwise the
 * error returned by btrfs_insert_empty_item().
 */
static int add_qgroup_item(struct btrfs_trans_handle *trans,
                           struct btrfs_root *quota_root, u64 qgroupid)
{
        struct btrfs_key key = {
                .objectid = 0,
                .type = BTRFS_QGROUP_INFO_KEY,
                .offset = qgroupid,
        };
        struct btrfs_qgroup_limit_item *limit;
        struct btrfs_qgroup_info_item *info;
        struct extent_buffer *eb;
        struct btrfs_path *path;
        int ret;

        if (btrfs_is_testing(quota_root->fs_info))
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * Catch -EEXIST here instead of letting it abort the transaction: in
         * that case the existing on-disk structure gets re-initialized.
         */
        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
                                      sizeof(*info));
        if (ret != 0 && ret != -EEXIST)
                goto free_path;

        eb = path->nodes[0];
        info = btrfs_item_ptr(eb, path->slots[0],
                              struct btrfs_qgroup_info_item);
        btrfs_set_qgroup_info_generation(eb, info, trans->transid);
        btrfs_set_qgroup_info_rfer(eb, info, 0);
        btrfs_set_qgroup_info_rfer_cmpr(eb, info, 0);
        btrfs_set_qgroup_info_excl(eb, info, 0);
        btrfs_set_qgroup_info_excl_cmpr(eb, info, 0);
        btrfs_mark_buffer_dirty(trans, eb);

        btrfs_release_path(path);

        /* Same key, but now for the limit item. */
        key.type = BTRFS_QGROUP_LIMIT_KEY;
        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
                                      sizeof(*limit));
        if (ret != 0 && ret != -EEXIST)
                goto free_path;

        eb = path->nodes[0];
        limit = btrfs_item_ptr(eb, path->slots[0],
                               struct btrfs_qgroup_limit_item);
        btrfs_set_qgroup_limit_flags(eb, limit, 0);
        btrfs_set_qgroup_limit_max_rfer(eb, limit, 0);
        btrfs_set_qgroup_limit_max_excl(eb, limit, 0);
        btrfs_set_qgroup_limit_rsv_rfer(eb, limit, 0);
        btrfs_set_qgroup_limit_rsv_excl(eb, limit, 0);
        btrfs_mark_buffer_dirty(trans, eb);

        /* A pre-existing limit item (-EEXIST) still counts as success. */
        ret = 0;
free_path:
        btrfs_free_path(path);
        return ret;
}

/*
 * Remove the two on-disk items describing qgroup @qgroupid from the quota
 * tree: first the BTRFS_QGROUP_INFO_KEY item, then the BTRFS_QGROUP_LIMIT_KEY
 * item.
 *
 * Returns 0 when both items were deleted, -ENOMEM if no path could be
 * allocated, -ENOENT if a search did not land on an exact match, or the
 * negative error returned by the search/delete helpers.
 */
static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_key key = {
                .objectid = 0,
                .type = BTRFS_QGROUP_INFO_KEY,
                .offset = qgroupid,
        };
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Info item first. */
        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
        if (ret < 0)
                goto out;

        ret = btrfs_del_item(trans, quota_root, path);
        if (ret)
                goto out;

        /* Reuse the same path for the limit item lookup. */
        btrfs_release_path(path);

        key.type = BTRFS_QGROUP_LIMIT_KEY;
        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
        if (ret == 0)
                ret = btrfs_del_item(trans, quota_root, path);

out:
        btrfs_free_path(path);
        return ret;
}

static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
                                    struct btrfs_qgroup *qgroup)
{
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *l;
        struct btrfs_qgroup_limit_item *qgroup_limit;
        int ret;
        int slot;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_LIMIT_KEY;
        key.offset = qgroup->qgroupid;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (ret)
                goto out;

        l = path->nodes[0];
        slot = path->slots[0];
        qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
        btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
        btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
        btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
        btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
        btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);

        btrfs_mark_buffer_dirty(trans, l);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Copy the in-memory accounting numbers of @qgroup (rfer, rfer_cmpr, excl,
 * excl_cmpr) into its BTRFS_QGROUP_INFO_KEY item in the quota tree, stamp the
 * item with the current transaction id and mark the leaf dirty.
 *
 * Does nothing and returns 0 when btrfs_is_testing() reports true.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if the
 * search did not land on an exact match, or the negative error returned by
 * btrfs_search_slot().
 */
static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
                                   struct btrfs_qgroup *qgroup)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root = fs_info->quota_root;
        struct btrfs_qgroup_info_item *item;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        if (btrfs_is_testing(fs_info))
                return 0;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_INFO_KEY;
        key.offset = qgroup->qgroupid;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (!ret) {
                leaf = path->nodes[0];
                item = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_qgroup_info_item);
                btrfs_set_qgroup_info_generation(leaf, item, trans->transid);
                btrfs_set_qgroup_info_rfer(leaf, item, qgroup->rfer);
                btrfs_set_qgroup_info_rfer_cmpr(leaf, item, qgroup->rfer_cmpr);
                btrfs_set_qgroup_info_excl(leaf, item, qgroup->excl);
                btrfs_set_qgroup_info_excl_cmpr(leaf, item, qgroup->excl_cmpr);

                btrfs_mark_buffer_dirty(trans, leaf);
        }

        btrfs_free_path(path);
        return ret;
}

static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root = fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *l;
        struct btrfs_qgroup_status_item *ptr;
        int ret;
        int slot;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_STATUS_KEY;
        key.offset = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (ret)
                goto out;

        l = path->nodes[0];
        slot = path->slots[0];
        ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
        btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags &
                                      BTRFS_QGROUP_STATUS_FLAGS_MASK);
        btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
        btrfs_set_qgroup_status_rescan(l, ptr,
                                fs_info->qgroup_rescan_progress.objectid);

        btrfs_mark_buffer_dirty(trans, l);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * called with qgroup_lock held
 */
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *leaf = NULL;
        int ret;
        int nr = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = 0;
        key.offset = 0;
        key.type = 0;

        while (1) {
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0)
                        goto out;
                leaf = path->nodes[0];
                nr = btrfs_header_nritems(leaf);
                if (!nr)
                        break;
                /*
                 * delete the leaf one by one
                 * since the whole tree is going
                 * to be deleted.
                 */
                path->slots[0] = 0;
                ret = btrfs_del_items(trans, root, path, 0, nr);
                if (ret)
                        goto out;

                btrfs_release_path(path);
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Enable quotas (qgroups) on a filesystem.
 *
 * Creates the quota tree, inserts its BTRFS_QGROUP_STATUS_KEY item, creates
 * one qgroup (on-disk items, in-memory rb entry and sysfs entry) for every
 * BTRFS_ROOT_REF_KEY item found in the tree root plus one for
 * BTRFS_FS_TREE_OBJECTID, commits the transaction and only then publishes
 * fs_info->quota_root and sets BTRFS_FS_QUOTA_ENABLED. Unless simple quota
 * mode is in effect, a rescan is queued afterwards.
 *
 * @fs_info:        filesystem to enable quotas on
 * @quota_ctl_args: ioctl arguments; simple quota mode is selected when ->cmd
 *                  equals BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA
 *
 * The caller must hold fs_info->subvol_sem for writing (asserted below).
 *
 * Returns 0 on success or when a quota root already exists, -EINVAL on an
 * extent tree v2 filesystem, or a negative errno from one of the steps.
 */
int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
                       struct btrfs_ioctl_quota_ctl_args *quota_ctl_args)
{
        struct btrfs_root *quota_root;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_path *path = NULL;
        struct btrfs_qgroup_status_item *ptr;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_qgroup *qgroup = NULL;
        struct btrfs_qgroup *prealloc = NULL;
        struct btrfs_trans_handle *trans = NULL;
        struct ulist *ulist = NULL;
        const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
        int ret = 0;
        int slot;

        /*
         * We need to have subvol_sem write locked, to prevent races between
         * concurrent tasks trying to enable quotas, because we will unlock
         * and relock qgroup_ioctl_lock before setting fs_info->quota_root
         * and before setting BTRFS_FS_QUOTA_ENABLED.
         */
        lockdep_assert_held_write(&fs_info->subvol_sem);

        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                btrfs_err(fs_info,
                          "qgroups are currently unsupported in extent tree v2");
                return -EINVAL;
        }

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        /* A quota root already exists: nothing to do, ret is still 0. */
        if (fs_info->quota_root)
                goto out;

        ulist = ulist_alloc(GFP_KERNEL);
        if (!ulist) {
                ret = -ENOMEM;
                goto out;
        }

        ret = btrfs_sysfs_add_qgroups(fs_info);
        if (ret < 0)
                goto out;

        /*
         * Unlock qgroup_ioctl_lock before starting the transaction. This is to
         * avoid lock acquisition inversion problems (reported by lockdep) between
         * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
         * start a transaction.
         * After we started the transaction lock qgroup_ioctl_lock again and
         * check if someone else created the quota root in the meanwhile. If so,
         * just return success and release the transaction handle.
         *
         * Also we don't need to worry about someone else calling
         * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
         * that function returns 0 (success) when the sysfs entries already exist.
         */
        mutex_unlock(&fs_info->qgroup_ioctl_lock);

        /*
         * 1 for quota root item
         * 1 for BTRFS_QGROUP_STATUS item
         *
         * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items
         * per subvolume. However those are not currently reserved since it
         * would be a lot of overkill.
         */
        trans = btrfs_start_transaction(tree_root, 2);

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                /* Keep the common exit path from ending an invalid handle. */
                trans = NULL;
                goto out;
        }

        /* Re-check after retaking the mutex, see the comment above. */
        if (fs_info->quota_root)
                goto out;

        /* Ownership of the ulist moves to fs_info from here on. */
        fs_info->qgroup_ulist = ulist;
        ulist = NULL;

        /*
         * initially create the quota tree
         */
        quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
        if (IS_ERR(quota_root)) {
                ret =  PTR_ERR(quota_root);
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                btrfs_abort_transaction(trans, ret);
                goto out_free_root;
        }

        key.objectid = 0;
        key.type = BTRFS_QGROUP_STATUS_KEY;
        key.offset = 0;

        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
                                      sizeof(*ptr));
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        /* Fill in the freshly inserted status item. */
        leaf = path->nodes[0];
        ptr = btrfs_item_ptr(leaf, path->slots[0],
                                 struct btrfs_qgroup_status_item);
        btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
        btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
        fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
        if (simple) {
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
                btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
        } else {
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        }
        btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
                                      BTRFS_QGROUP_STATUS_FLAGS_MASK);
        btrfs_set_qgroup_status_rescan(leaf, ptr, 0);

        btrfs_mark_buffer_dirty(trans, leaf);

        /*
         * Scan the tree root for BTRFS_ROOT_REF_KEY items, starting from the
         * smallest key of that type, and create a qgroup for each one using
         * the key offset as qgroup id.
         */
        key.objectid = 0;
        key.type = BTRFS_ROOT_REF_KEY;
        key.offset = 0;

        btrfs_release_path(path);
        ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
        /* No item found: only the BTRFS_FS_TREE_OBJECTID qgroup is created. */
        if (ret > 0)
                goto out_add_root;
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        while (1) {
                slot = path->slots[0];
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, slot);

                if (found_key.type == BTRFS_ROOT_REF_KEY) {

                        /* Release locks on tree_root before we access quota_root */
                        btrfs_release_path(path);

                        /* We should not have a stray @prealloc pointer. */
                        ASSERT(prealloc == NULL);
                        prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
                        if (!prealloc) {
                                ret = -ENOMEM;
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }

                        ret = add_qgroup_item(trans, quota_root,
                                              found_key.offset);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }

                        /*
                         * @prealloc is handed over to add_qgroup_rb(); clear
                         * our pointer so the kfree() at the end of the
                         * function does not touch it again.
                         */
                        qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
                        prealloc = NULL;
                        if (IS_ERR(qgroup)) {
                                ret = PTR_ERR(qgroup);
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }
                        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
                        if (ret < 0) {
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }
                        /* The path was released above; look the key up again. */
                        ret = btrfs_search_slot_for_read(tree_root, &found_key,
                                                         path, 1, 0);
                        if (ret < 0) {
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }
                        if (ret > 0) {
                                /*
                                 * Shouldn't happen, but in case it does we
                                 * don't need to do the btrfs_next_item, just
                                 * continue.
                                 */
                                continue;
                        }
                }
                ret = btrfs_next_item(tree_root, path);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_free_path;
                }
                /* A positive return ends the scan. */
                if (ret)
                        break;
        }

out_add_root:
        btrfs_release_path(path);
        ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        ASSERT(prealloc == NULL);
        prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
        /*
         * NOTE(review): unlike the allocation failure inside the loop above,
         * this path does not call btrfs_abort_transaction() - confirm that
         * this is intentional.
         */
        if (!prealloc) {
                ret = -ENOMEM;
                goto out_free_path;
        }
        /*
         * NOTE(review): the result of add_qgroup_rb() is not checked with
         * IS_ERR() here, while it is checked in the loop above - confirm.
         */
        qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID);
        prealloc = NULL;
        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        fs_info->qgroup_enable_gen = trans->transid;

        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        /*
         * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
         * a deadlock with tasks concurrently doing other qgroup operations, such
         * adding/removing qgroups or adding/deleting qgroup relations for example,
         * because all qgroup operations first start or join a transaction and then
         * lock the qgroup_ioctl_lock mutex.
         * We are safe from a concurrent task trying to enable quotas, by calling
         * this function, since we are serialized by fs_info->subvol_sem.
         */
        ret = btrfs_commit_transaction(trans);
        /* The handle must not be ended again on the exit path. */
        trans = NULL;
        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (ret)
                goto out_free_path;

        /*
         * Set quota enabled flag after committing the transaction, to avoid
         * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
         * creation.
         */
        spin_lock(&fs_info->qgroup_lock);
        fs_info->quota_root = quota_root;
        set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        if (simple)
                btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
        spin_unlock(&fs_info->qgroup_lock);

        /* Skip rescan for simple qgroups. */
        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
                goto out_free_path;

        ret = qgroup_rescan_init(fs_info, 0, 1);
        if (!ret) {
                qgroup_rescan_zero_tracking(fs_info);
                fs_info->qgroup_rescan_running = true;
                btrfs_queue_work(fs_info->qgroup_rescan_workers,
                                 &fs_info->qgroup_rescan_work);
        } else {
                /*
                 * We have set both BTRFS_FS_QUOTA_ENABLED and
                 * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
                 * -EINPROGRESS. That can happen because someone started the
                 * rescan worker by calling quota rescan ioctl before we
                 * attempted to initialize the rescan worker. Failure due to
                 * quotas disabled in the meanwhile is not possible, because
                 * we are holding a write lock on fs_info->subvol_sem, which
                 * is also acquired when disabling quotas.
                 * Ignore such error, and any other error would need to undo
                 * everything we did in the transaction we just committed.
                 */
                ASSERT(ret == -EINPROGRESS);
                ret = 0;
        }

        /*
         * Common exit: the labels below are also reached on success, the
         * error-only cleanups are guarded by checks of @ret.
         */
out_free_path:
        btrfs_free_path(path);
out_free_root:
        if (ret)
                btrfs_put_root(quota_root);
out:
        if (ret) {
                ulist_free(fs_info->qgroup_ulist);
                fs_info->qgroup_ulist = NULL;
                btrfs_sysfs_del_qgroups(fs_info);
        }
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        /* On error keep @ret; otherwise report the result of ending @trans. */
        if (ret && trans)
                btrfs_end_transaction(trans);
        else if (trans)
                ret = btrfs_end_transaction(trans);
        /* Both are NULL once ownership was handed over above. */
        ulist_free(ulist);
        kfree(prealloc);
        return ret;
}

/*
 * Ordered extents that reserved bytes before quotas were disabled may still
 * be outstanding. Flush delalloc, wait for the ordered extents and commit a
 * transaction so that such reservations are not leaked and do not resurface
 * after quotas are enabled again.
 *
 * The problematic sequence is:
 *
 * - enable simple quotas
 * - reserve space
 * - release it, store rsv_bytes in OE
 * - disable quotas
 * - enable simple quotas (qgroup rsv are all 0)
 * - OE finishes
 * - run delayed refs
 * - free rsv_bytes, resulting in miscounting or even underflow
 *
 * Returns 0 on success or the error of the first step that failed.
 */
static int flush_reservations(struct btrfs_fs_info *fs_info)
{
        struct btrfs_trans_handle *trans;
        int ret;

        ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
        if (ret)
                return ret;

        btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);

        trans = btrfs_join_transaction(fs_info->tree_root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        return btrfs_commit_transaction(trans);
}

/*
 * Disable quotas (qgroups) on a filesystem.
 *
 * Clears BTRFS_FS_QUOTA_ENABLED, waits for the rescan worker, flushes
 * outstanding reservations, then detaches fs_info->quota_root, frees the
 * in-memory qgroup configuration, empties and deletes the quota tree and
 * commits the transaction.
 *
 * The caller must hold fs_info->subvol_sem for writing and
 * fs_info->cleaner_mutex (both asserted below).
 *
 * Returns 0 on success or when no quota root exists, otherwise a negative
 * errno from the step that failed.
 */
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
        /*
         * Stays NULL until the root has been detached from fs_info, so the
         * common exit path can drop the reference unconditionally.
         */
        struct btrfs_root *quota_root = NULL;
        struct btrfs_trans_handle *trans = NULL;
        int ret = 0;

        /*
         * We need to have subvol_sem write locked to prevent races with
         * snapshot creation.
         */
        lockdep_assert_held_write(&fs_info->subvol_sem);

        /*
         * Relocation will mess with backrefs, so make sure we have the
         * cleaner_mutex held to protect us from relocate.
         */
        lockdep_assert_held(&fs_info->cleaner_mutex);

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root)
                goto out;

        /*
         * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
         * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
         * to lock that mutex while holding a transaction handle and the rescan
         * worker needs to commit a transaction.
         */
        mutex_unlock(&fs_info->qgroup_ioctl_lock);

        /*
         * Request qgroup rescan worker to complete and wait for it. This wait
         * must be done before transaction start for quota disable since it may
         * deadlock with transaction by the qgroup rescan worker.
         */
        clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        btrfs_qgroup_wait_for_completion(fs_info, false);

        /*
         * We have nothing held here and no trans handle, just return the error
         * if there is one.
         */
        ret = flush_reservations(fs_info);
        if (ret)
                return ret;

        /*
         * 1 For the root item
         *
         * We should also reserve enough items for the quota tree deletion in
         * btrfs_clean_quota_tree but this is not done.
         *
         * Also, we must always start a transaction without holding the mutex
         * qgroup_ioctl_lock, see btrfs_quota_enable().
         */
        trans = btrfs_start_transaction(fs_info->tree_root, 1);

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
                /* Nothing was torn down yet, so restore the enabled bit. */
                set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
                goto out;
        }

        /* Re-check after retaking the mutex. */
        if (!fs_info->quota_root)
                goto out;

        spin_lock(&fs_info->qgroup_lock);
        quota_root = fs_info->quota_root;
        fs_info->quota_root = NULL;
        fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
        fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
        fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
        spin_unlock(&fs_info->qgroup_lock);

        btrfs_free_qgroup_config(fs_info);

        ret = btrfs_clean_quota_tree(trans, quota_root);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        ret = btrfs_del_root(trans, &quota_root->root_key);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        spin_lock(&fs_info->trans_lock);
        list_del(&quota_root->dirty_list);
        spin_unlock(&fs_info->trans_lock);

        btrfs_tree_lock(quota_root->node);
        btrfs_clear_buffer_dirty(trans, quota_root->node);
        btrfs_tree_unlock(quota_root->node);
        btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
                              quota_root->node, 0, 1);

out:
        /*
         * Drop our reference on every path, including the failure paths after
         * the root was detached from fs_info above; otherwise the root would
         * be leaked since nothing else points to it anymore.
         */
        btrfs_put_root(quota_root);
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        /* On error keep @ret and just end the handle, otherwise commit. */
        if (ret && trans)
                btrfs_end_transaction(trans);
        else if (trans)
                ret = btrfs_commit_transaction(trans);
        return ret;
}

/*
 * Queue @qgroup on fs_info->dirty_qgroups unless its ->dirty list node is
 * already linked somewhere.
 */
static void qgroup_dirty(struct btrfs_fs_info *fs_info,
                         struct btrfs_qgroup *qgroup)
{
        if (!list_empty(&qgroup->dirty))
                return;

        list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}

/*
 * Append @qgroup to the iteration list @head through its ->iterator node.
 * A qgroup whose ->iterator node is already linked is left untouched, so
 * each qgroup is queued at most once.
 */
static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup)
{
        if (list_empty(&qgroup->iterator))
                list_add_tail(&qgroup->iterator, head);
}

static void qgroup_iterator_clean(struct list_head *head)
{
        while (!list_empty(head)) {
                struct btrfs_qgroup *qgroup;

                qgroup = list_first_entry(head, struct btrfs_qgroup, iterator);
                list_del_init(&qgroup->iterator);
        }
}

/*
 * The easy accounting, we're updating qgroup relationship whose child qgroup
 * only has exclusive extents.
 *
 * In this case, all exclusive extents will also be exclusive for parent, so
 * excl/rfer just get added/removed.
 *
 * So is qgroup reservation space, which should also be added/removed to
 * parent.
 * Or when child tries to release reservation space, parent will underflow its
 * reservation (for relationship adding case).
 *
 * @fs_info:  filesystem the qgroups belong to
 * @ref_root: id of the qgroup where the update starts; the update is then
 *            applied to every qgroup reachable through the ->groups lists
 * @src:      qgroup whose ->excl bytes and reservations are added or removed
 * @sign:     positive to add, otherwise remove
 *
 * Caller should hold fs_info->qgroup_lock.
 *
 * Always returns 0; a missing @ref_root qgroup is silently ignored.
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
                                    struct btrfs_qgroup *src, int sign)
{
        struct btrfs_qgroup *qgroup;
        struct btrfs_qgroup *cur;
        LIST_HEAD(qgroup_list);
        u64 num_bytes = src->excl;
        int ret = 0;

        qgroup = find_qgroup_rb(fs_info, ref_root);
        if (!qgroup)
                goto out;

        qgroup_iterator_add(&qgroup_list, qgroup);
        /*
         * The list grows while we walk it: every visited qgroup appends its
         * parents, so the walk covers @qgroup and all of its ancestors. All
         * updates must therefore be applied to @cur, the entry currently
         * visited, and not to the starting @qgroup.
         */
        list_for_each_entry(cur, &qgroup_list, iterator) {
                struct btrfs_qgroup_list *glist;

                cur->rfer += sign * num_bytes;
                cur->rfer_cmpr += sign * num_bytes;

                WARN_ON(sign < 0 && cur->excl < num_bytes);
                cur->excl += sign * num_bytes;
                cur->excl_cmpr += sign * num_bytes;

                if (sign > 0)
                        qgroup_rsv_add_by_qgroup(fs_info, cur, src);
                else
                        qgroup_rsv_release_by_qgroup(fs_info, cur, src);
                qgroup_dirty(fs_info, cur);

                /* Append parent qgroups to @qgroup_list. */
                list_for_each_entry(glist, &cur->groups, next_group)
                        qgroup_iterator_add(&qgroup_list, glist->group);
        }
        ret = 0;
out:
        qgroup_iterator_clean(&qgroup_list);
        return ret;
}


/*
 * Fast path for updating accounting after a relation between @src and @dst
 * was added (@sign > 0) or removed.
 *
 * When everything referenced by @src is also exclusive to it (excl == rfer),
 * propagating its numbers to @dst and the qgroups above it is sufficient.
 * Otherwise, or when @src cannot be found, a full rescan is required.
 *
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 when the quick update was done.
 * Return > 0 when a full rescan is needed.
 * Return < 0 for other errors.
 * For any non-zero result BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT is set in
 * fs_info->qgroup_flags.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
                                   u64 src, u64 dst, int sign)
{
        struct btrfs_qgroup *member;
        int ret = 1;

        member = find_qgroup_rb(fs_info, src);
        if (member && member->excl == member->rfer) {
                ret = __qgroup_excl_accounting(fs_info, dst, member, sign);
                /* Anything but an error counts as a completed quick update. */
                if (ret > 0)
                        ret = 0;
        }

        if (ret)
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        return ret;
}

/*
 * Make qgroup @src a member of qgroup @dst.
 *
 * Inserts the two on-disk relation items (src->dst and dst->src), links the
 * qgroups in memory and then tries the quick accounting update.
 *
 * Returns -EINVAL if the level of @src is not below the level of @dst or if
 * either qgroup does not exist, -ENOTCONN if there is no quota root, -EEXIST
 * if the relation is already present, -ENOMEM on allocation failure, an error
 * from inserting the items or linking the qgroups, or otherwise the result of
 * quick_update_accounting().
 */
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup_list *prealloc = NULL;
        struct btrfs_qgroup_list *entry;
        struct btrfs_qgroup *member;
        struct btrfs_qgroup *parent;
        int ret;

        /* A member must sit on a strictly lower level than its parent. */
        if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
                return -EINVAL;

        mutex_lock(&fs_info->qgroup_ioctl_lock);

        ret = -ENOTCONN;
        if (!fs_info->quota_root)
                goto out;

        ret = -EINVAL;
        member = find_qgroup_rb(fs_info, src);
        parent = find_qgroup_rb(fs_info, dst);
        if (!member || !parent)
                goto out;

        /* Refuse to add a relation that is already there. */
        ret = -EEXIST;
        list_for_each_entry(entry, &member->groups, next_group) {
                if (entry->group == parent)
                        goto out;
        }

        ret = -ENOMEM;
        prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
        if (!prealloc)
                goto out;

        ret = add_qgroup_relation_item(trans, src, dst);
        if (ret)
                goto out;

        ret = add_qgroup_relation_item(trans, dst, src);
        if (ret) {
                /* Best effort removal of the first item inserted above. */
                del_qgroup_relation_item(trans, src, dst);
                goto out;
        }

        spin_lock(&fs_info->qgroup_lock);
        ret = __add_relation_rb(prealloc, member, parent);
        /* @prealloc was handed over, do not free it below. */
        prealloc = NULL;
        if (ret >= 0)
                ret = quick_update_accounting(fs_info, src, dst, 1);
        spin_unlock(&fs_info->qgroup_lock);
out:
        kfree(prealloc);
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
}

/*
 * Remove the relation that makes qgroup @src a member of qgroup @dst.
 *
 * Deletes both on-disk relation items (src->dst and dst->src). If the
 * relation is also present in memory, it is unlinked there and the quick
 * accounting update is run. When one of the qgroups does not exist in memory,
 * only the (dead) on-disk items are removed.
 *
 * The callers visible in this file invoke this with
 * fs_info->qgroup_ioctl_lock held.
 *
 * Returns -ENOTCONN if there is no quota root, the error of an item deletion
 * that failed with something other than -ENOENT, -ENOENT if neither item
 * existed and no in-memory relation was found, the result of
 * quick_update_accounting() if an in-memory relation was removed, and 0
 * otherwise.
 */
static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
                                 u64 dst)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup *member;
        struct btrfs_qgroup_list *list;
        bool found = false;
        int ret = 0;
        int ret2;

        if (!fs_info->quota_root) {
                ret = -ENOTCONN;
                goto out;
        }

        member = find_qgroup_rb(fs_info, src);
        parent = find_qgroup_rb(fs_info, dst);
        /*
         * The parent/member pair doesn't exist, then try to delete the dead
         * relation items only.
         */
        if (!member || !parent)
                goto delete_item;

        /* check if such qgroup relation exist firstly */
        list_for_each_entry(list, &member->groups, next_group) {
                if (list->group == parent) {
                        found = true;
                        break;
                }
        }

delete_item:
        ret = del_qgroup_relation_item(trans, src, dst);
        if (ret < 0 && ret != -ENOENT)
                goto out;
        ret2 = del_qgroup_relation_item(trans, dst, src);
        if (ret2 < 0 && ret2 != -ENOENT) {
                /*
                 * Report the failure of the second deletion. Returning @ret
                 * here would hide it behind the 0 or -ENOENT result of the
                 * first deletion, and a caller that loops until the relation
                 * is gone would never see the error.
                 */
                ret = ret2;
                goto out;
        }

        /* At least one deletion succeeded, return 0 */
        if (!ret || !ret2)
                ret = 0;

        if (found) {
                spin_lock(&fs_info->qgroup_lock);
                del_relation_rb(fs_info, src, dst);
                ret = quick_update_accounting(fs_info, src, dst, -1);
                spin_unlock(&fs_info->qgroup_lock);
        }
out:
        return ret;
}

/*
 * Locked wrapper around __del_qgroup_relation(): removes the relation that
 * makes qgroup @src a member of qgroup @dst while holding
 * fs_info->qgroup_ioctl_lock, and returns whatever the helper returned.
 */
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
                              u64 dst)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        ret = __del_qgroup_relation(trans, src, dst);
        mutex_unlock(&fs_info->qgroup_ioctl_lock);

        return ret;
}

/*
 * Create the qgroup @qgroupid: insert its on-disk items, add it to the
 * in-memory rb tree and register it in sysfs.
 *
 * Returns 0 without doing anything when the qgroup mode is
 * BTRFS_QGROUP_MODE_DISABLED. Otherwise returns -ENOTCONN if there is no
 * quota root, -EEXIST if the qgroup already exists in memory, -ENOMEM on
 * allocation failure, the error from add_qgroup_item(), or the result of
 * btrfs_sysfs_add_one_qgroup().
 */
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup *prealloc = NULL;
        struct btrfs_qgroup *qgroup;
        struct btrfs_root *quota_root;
        int ret;

        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
                return 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);

        quota_root = fs_info->quota_root;
        if (!quota_root) {
                ret = -ENOTCONN;
                goto out;
        }

        if (find_qgroup_rb(fs_info, qgroupid)) {
                ret = -EEXIST;
                goto out;
        }

        prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
        if (!prealloc) {
                ret = -ENOMEM;
                goto out;
        }

        ret = add_qgroup_item(trans, quota_root, qgroupid);
        if (ret)
                goto out;

        spin_lock(&fs_info->qgroup_lock);
        qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid);
        spin_unlock(&fs_info->qgroup_lock);
        /* @prealloc was handed over, do not free it below. */
        prealloc = NULL;

        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        kfree(prealloc);
        return ret;
}

static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
{
        return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
                qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
                qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
                qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
                qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
}

/*
 * Remove the qgroup identified by @qgroupid.
 *
 * Return -ENOTCONN if there is no quota root, -ENOENT if the qgroup is not
 * known, -EBUSY if it may not be removed yet (see below), or the error of
 * the item/relation removal.
 */
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup *qgroup;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);

        if (!fs_info->quota_root) {
                ret = -ENOTCONN;
                goto unlock;
        }

        qgroup = find_qgroup_rb(fs_info, qgroupid);
        if (!qgroup) {
                ret = -ENOENT;
                goto unlock;
        }

        /*
         * Refuse removal while an is_fstree() qgroup still has usage, or
         * while the qgroup still has children (member qgroups).
         */
        if ((is_fstree(qgroupid) && qgroup_has_usage(qgroup)) ||
            !list_empty(&qgroup->members)) {
                ret = -EBUSY;
                goto unlock;
        }

        /* A missing item (-ENOENT) does not stop the removal. */
        ret = del_qgroup_item(trans, qgroupid);
        if (ret && ret != -ENOENT)
                goto unlock;

        /* Drop every relation to a parent group, one at a time. */
        while (!list_empty(&qgroup->groups)) {
                struct btrfs_qgroup_list *rel;

                rel = list_first_entry(&qgroup->groups,
                                       struct btrfs_qgroup_list, next_group);
                ret = __del_qgroup_relation(trans, qgroupid,
                                            rel->group->qgroupid);
                if (ret)
                        goto unlock;
        }

        spin_lock(&fs_info->qgroup_lock);
        del_qgroup_rb(fs_info, qgroupid);
        spin_unlock(&fs_info->qgroup_lock);

        /*
         * The sysfs removal is done outside of the qgroup_lock spinlock
         * because sysfs_remove_group() needs to take the kernfs_mutex mutex
         * through kernfs_remove_by_name_ns().
         */
        btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
        kfree(qgroup);
unlock:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
}

/*
 * Apply the limits described by @limit to the qgroup @qgroupid.
 *
 * Only the limits whose bit is set in @limit->flags are touched.  A limit
 * value of (u64)-1 asks for that limit to be cleared: the stored value is
 * reset to 0 and the corresponding bit is dropped from both the qgroup's
 * lim_flags and @limit->flags (so @limit is modified).
 *
 * Return -ENOTCONN if there is no quota root, -ENOENT if the qgroup is not
 * known, or the result of update_qgroup_limit_item().
 */
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
                       struct btrfs_qgroup_limit *limit)
{
        /* Special limit value meaning "clear this limit". */
        const u64 clear_value = -1;
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup *qgroup;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);

        if (!fs_info->quota_root) {
                ret = -ENOTCONN;
                goto unlock;
        }

        qgroup = find_qgroup_rb(fs_info, qgroupid);
        if (!qgroup) {
                ret = -ENOENT;
                goto unlock;
        }

        spin_lock(&fs_info->qgroup_lock);

        if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
                if (limit->max_rfer != clear_value) {
                        qgroup->max_rfer = limit->max_rfer;
                } else {
                        qgroup->max_rfer = 0;
                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
                        limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
                }
        }

        if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
                if (limit->max_excl != clear_value) {
                        qgroup->max_excl = limit->max_excl;
                } else {
                        qgroup->max_excl = 0;
                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
                        limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
                }
        }

        if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
                if (limit->rsv_rfer != clear_value) {
                        qgroup->rsv_rfer = limit->rsv_rfer;
                } else {
                        qgroup->rsv_rfer = 0;
                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
                        limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
                }
        }

        if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
                if (limit->rsv_excl != clear_value) {
                        qgroup->rsv_excl = limit->rsv_excl;
                } else {
                        qgroup->rsv_excl = 0;
                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
                        limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
                }
        }

        /* Whatever bits survived the clearing above are now in effect. */
        qgroup->lim_flags |= limit->flags;

        spin_unlock(&fs_info->qgroup_lock);

        /* Persist the new limits outside of the spinlock. */
        ret = update_qgroup_limit_item(trans, qgroup);
        if (ret) {
                qgroup_mark_inconsistent(fs_info);
                btrfs_info(fs_info, "unable to update quota limit for %llu",
                           qgroupid);
        }

unlock:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
}

/*
 * Record one dirty extent, described by @record, in the dirty extent tree of
 * @delayed_refs so that qgroup can account it at transaction commit time.
 *
 * This is the lock-less variant: the caller must already hold
 * @delayed_refs->lock and must have allocated @record.  After leaving the
 * locked section the caller is expected to call
 * btrfs_qgroup_trace_extent_post().
 *
 * Return 0 if @record was inserted.
 * Return >0 if @record was not inserted (full accounting is off, or a record
 * for the same bytenr already exists); the caller can free @record safely.
 * This function does not return errors.
 */
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
                                struct btrfs_delayed_ref_root *delayed_refs,
                                struct btrfs_qgroup_extent_record *record)
{
        struct rb_node **link = &delayed_refs->dirty_extent_root.rb_node;
        struct rb_node *parent = NULL;
        const u64 bytenr = record->bytenr;

        if (!btrfs_qgroup_full_accounting(fs_info))
                return 1;

        lockdep_assert_held(&delayed_refs->lock);
        trace_btrfs_qgroup_trace_extent(fs_info, record);

        /* Records are keyed by bytenr; find the slot for the new one. */
        while (*link) {
                struct btrfs_qgroup_extent_record *cur;

                parent = *link;
                cur = rb_entry(parent, struct btrfs_qgroup_extent_record,
                               node);

                if (bytenr < cur->bytenr) {
                        link = &parent->rb_left;
                        continue;
                }
                if (bytenr > cur->bytenr) {
                        link = &parent->rb_right;
                        continue;
                }

                /*
                 * Same bytenr is already recorded.  Carry over the data
                 * reservation info if the existing record has none.
                 */
                if (record->data_rsv && !cur->data_rsv) {
                        cur->data_rsv = record->data_rsv;
                        cur->data_rsv_refroot = record->data_rsv_refroot;
                }
                return 1;
        }

        rb_link_node(&record->node, parent, link);
        rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
        return 0;
}

/*
 * Post handler after qgroup_trace_extent_nolock().
 *
 * NOTE: Current qgroup does the expensive backref walk at transaction
 * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
 * new transaction.
 * This is designed to allow btrfs_find_all_roots() to get correct new_roots
 * result.
 *
 * However for old_roots there is no need to do backref walk at that time,
 * since we search commit roots to walk backref and result will always be
 * correct.
 *
 * Due to the nature of no lock version, we can't do backref there.
 * So we must call btrfs_qgroup_trace_extent_post() after exiting
 * spinlock context.
 *
 * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
 * using current root, then we can move all expensive backref walk out of
 * transaction committing, but not now as qgroup accounting will be wrong again.
 */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
                                   struct btrfs_qgroup_extent_record *qrecord)
{
        struct btrfs_backref_walk_ctx ctx = { 0 };
        int ret;

        if (!btrfs_qgroup_full_accounting(trans->fs_info))
                return 0;
        /*
         * We are always called in a context where we are already holding a
         * transaction handle. Often we are called when adding a data delayed
         * reference from btrfs_truncate_inode_items() (truncating or unlinking),
         * in which case we will be holding a write lock on extent buffer from a
         * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
         * acquire fs_info->commit_root_sem, because that is a higher level lock
         * that must be acquired before locking any extent buffers.
         *
         * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
         * but we can't pass it a non-NULL transaction handle, because otherwise
         * it would not use commit roots and would lock extent buffers, causing
         * a deadlock if it ends up trying to read lock the same extent buffer
         * that was previously write locked at btrfs_truncate_inode_items().
         *
         * So pass a NULL transaction handle to btrfs_find_all_roots() and
         * explicitly tell it to not acquire the commit_root_sem - if we are
         * holding a transaction handle we don't need its protection.
         */
        ASSERT(trans != NULL);

        if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
                return 0;

        ctx.bytenr = qrecord->bytenr;
        ctx.fs_info = trans->fs_info;

        ret = btrfs_find_all_roots(&ctx, true);
        if (ret < 0) {
                qgroup_mark_inconsistent(trans->fs_info);
                btrfs_warn(trans->fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
                        ret);
                return 0;
        }

        /*
         * Here we don't need to get the lock of
         * trans->transaction->delayed_refs, since inserted qrecord won't
         * be deleted, only qrecord->node may be modified (new qrecord insert)
         *
         * So modifying qrecord->old_roots is safe here
         */
        qrecord->old_roots = ctx.roots;
        return 0;
}

/*
 * Inform qgroup to trace one dirty extent, specified by @bytenr and
 * @num_bytes, so that it can be accounted at transaction commit time.
 *
 * Convenience wrapper that allocates the record, inserts it under
 * delayed_refs->lock and then runs btrfs_qgroup_trace_extent_post().
 * It allocates with GFP_NOFS and can sleep.
 *
 * Return 0 if the operation is done, or if there is nothing to do (full
 * accounting is off, @bytenr or @num_bytes is 0, or the extent is already
 * recorded).
 * Return -ENOMEM if the record cannot be allocated.
 */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
                              u64 num_bytes)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_qgroup_extent_record *record;
        int ret;

        if (!btrfs_qgroup_full_accounting(fs_info))
                return 0;
        if (bytenr == 0 || num_bytes == 0)
                return 0;

        record = kzalloc(sizeof(*record), GFP_NOFS);
        if (!record)
                return -ENOMEM;

        record->bytenr = bytenr;
        record->num_bytes = num_bytes;
        record->old_roots = NULL;

        delayed_refs = &trans->transaction->delayed_refs;

        spin_lock(&delayed_refs->lock);
        ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
        spin_unlock(&delayed_refs->lock);

        /* Not inserted: the record is still ours to free. */
        if (ret > 0) {
                kfree(record);
                return 0;
        }

        return btrfs_qgroup_trace_extent_post(trans, record);
}

/*
 * Inform qgroup to trace the data extents referenced by the items of leaf @eb.
 *
 * Every BTRFS_EXTENT_DATA_KEY item that is not inline and has a non-zero
 * disk bytenr is handed to btrfs_qgroup_trace_extent().
 *
 * Return 0 for success (or when full accounting is off).
 * Return the first non-zero result of btrfs_qgroup_trace_extent() otherwise.
 */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
                                  struct extent_buffer *eb)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        const int nritems = btrfs_header_nritems(eb);
        int slot;

        /* We can be called directly from walk_up_proc() */
        if (!btrfs_qgroup_full_accounting(fs_info))
                return 0;

        for (slot = 0; slot < nritems; slot++) {
                struct btrfs_file_extent_item *fi;
                struct btrfs_key key;
                u64 disk_bytenr;
                u64 disk_num_bytes;
                int ret;

                btrfs_item_key_to_cpu(eb, &key, slot);
                if (key.type != BTRFS_EXTENT_DATA_KEY)
                        continue;

                /* Filter out extents that are not accountable by qgroup. */
                fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
                if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE)
                        continue;

                disk_bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
                if (disk_bytenr == 0)
                        continue;

                disk_num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);

                ret = btrfs_qgroup_trace_extent(trans, disk_bytenr,
                                                disk_num_bytes);
                if (ret)
                        return ret;
        }

        cond_resched();
        return 0;
}

/*
 * Walk up the tree from the bottom, releasing the leaf and any interior
 * nodes which have had all their slots visited. Whenever a node (leaf or
 * interior) is released, the node above it has its slot incremented. The
 * root node is never released.
 *
 * When this function returns, every slot in the path has been advanced to
 * the next position for a search. A level that must be read again has a
 * NULL node, and the node above it has the correct slot selected for that
 * later read.
 *
 * Return 1 if @root_level is 0, or if the root node's slot has moved past
 * its number of items, which signals that the search is complete.
 * Return 0 otherwise.
 */
static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
{
        struct extent_buffer *eb;
        int level;

        if (root_level == 0)
                return 1;

        for (level = 0; level <= root_level; level++) {
                int nritems;
                int slot;

                eb = path->nodes[level];
                nritems = btrfs_header_nritems(eb);
                slot = ++path->slots[level];

                /*
                 * An interior node that still has a valid slot is where
                 * the caller walks back down from, so stop here.
                 */
                if (level != 0 && slot < nritems)
                        break;

                /*
                 * The root is kept.  Its exhausted slot is detected after
                 * the loop and reported to the caller.
                 */
                if (level == root_level)
                        continue;

                btrfs_tree_unlock_rw(eb, path->locks[level]);
                path->locks[level] = 0;

                free_extent_buffer(eb);
                path->nodes[level] = NULL;
                path->slots[level] = 0;
        }

        eb = path->nodes[root_level];
        if (path->slots[root_level] >= btrfs_header_nritems(eb))
                return 1;

        return 0;
}

/*
 * Helper function to trace a subtree tree block swap.
 *
 * The swap will happen in highest tree block, but there may be a lot of
 * tree blocks involved.
 *
 * For example:
 *  OO = Old tree blocks
 *  NN = New tree blocks allocated during balance
 *
 *           File tree (257)                  Reloc tree for 257
 * L2              OO                                NN
 *               /    \                            /    \
 * L1          OO      OO (a)                    OO      NN (a)
 *            / \     / \                       / \     / \
 * L0       OO   OO OO   OO                   OO   OO NN   NN
 *                  (b)  (c)                          (b)  (c)
 *
 * When calling qgroup_trace_extent_swap(), we will pass:
 * @src_eb = OO(a)
 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
 * @dst_level = 0
 * @root_level = 1
 *
 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
 *
 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
 *
 * 1) Tree search from @src_eb
 *    It should act as a simplified btrfs_search_slot().
 *    The key for search can be extracted from @dst_path->nodes[dst_level]
 *    (first key).
 *
 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
 *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
 *    They should be marked during previous (@dst_level = 1) iteration.
 *
 * 3) Mark file extents in leaves dirty
 *    We don't have a good way to pick out only the new file extents.
 *    So we still follow the old method of scanning all file extents in
 *    the leaf.
 *
 * This function can free us from keeping two paths, thus later we only need
 * to care about how to iterate all new tree blocks in reloc tree.
 */
/*
 * @trans:      transaction handle
 * @src_eb:     root of the source subtree, its level must be @root_level
 * @dst_path:   path in the destination tree, populated from @root_level
 *              down to @dst_level
 * @dst_level:  level of the tree block to trace
 * @root_level: level at which the swap happens
 * @trace_leaf: also trace the file extents of the leaves when @dst_level is 0
 *
 * Return 0 on success, -EINVAL on level mismatch, -ENOMEM if no path can be
 * allocated, -ENOENT if source and destination keys differ along the way, or
 * the error of reading a tree block / tracing an extent.
 */
static int qgroup_trace_extent_swap(struct btrfs_trans_handle *trans,
                                    struct extent_buffer *src_eb,
                                    struct btrfs_path *dst_path,
                                    int dst_level, int root_level,
                                    bool trace_leaf)
{
        struct btrfs_path *src_path;
        struct btrfs_fs_info *fs_info = trans->fs_info;
        u32 nodesize = fs_info->nodesize;
        int cur_level = root_level;
        int ret;

        BUG_ON(dst_level > root_level);
        /* Level mismatch */
        if (btrfs_header_level(src_eb) != root_level)
                return -EINVAL;

        src_path = btrfs_alloc_path();
        if (!src_path) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * For src_path: take an extra reference on src_eb, dropped again when
         * the path is freed.  The root block is not locked by us, hence
         * locks[root_level] = 0.
         */
        atomic_inc(&src_eb->refs);
        src_path->nodes[root_level] = src_eb;
        src_path->slots[root_level] = dst_path->slots[root_level];
        src_path->locks[root_level] = 0;

        /*
         * A simplified version of btrfs_search_slot(): instead of searching
         * by key, follow the same slots as @dst_path level by level and
         * check that the keys found there match.
         */
        while (cur_level >= dst_level) {
                struct btrfs_key src_key;
                struct btrfs_key dst_key;

                if (src_path->nodes[cur_level] == NULL) {
                        struct extent_buffer *eb;
                        int parent_slot;

                        eb = src_path->nodes[cur_level + 1];
                        parent_slot = src_path->slots[cur_level + 1];

                        eb = btrfs_read_node_slot(eb, parent_slot);
                        if (IS_ERR(eb)) {
                                ret = PTR_ERR(eb);
                                goto out;
                        }

                        src_path->nodes[cur_level] = eb;

                        btrfs_tree_read_lock(eb);
                        src_path->locks[cur_level] = BTRFS_READ_LOCK;
                }

                src_path->slots[cur_level] = dst_path->slots[cur_level];
                if (cur_level) {
                        btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
                                        &dst_key, dst_path->slots[cur_level]);
                        btrfs_node_key_to_cpu(src_path->nodes[cur_level],
                                        &src_key, src_path->slots[cur_level]);
                } else {
                        btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
                                        &dst_key, dst_path->slots[cur_level]);
                        btrfs_item_key_to_cpu(src_path->nodes[cur_level],
                                        &src_key, src_path->slots[cur_level]);
                }
                /* Content mismatch, something went wrong */
                if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
                        ret = -ENOENT;
                        goto out;
                }
                cur_level--;
        }

        /*
         * Now both @dst_path and @src_path have been populated, record the tree
         * blocks for qgroup accounting.
         */
        ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
                                        nodesize);
        if (ret < 0)
                goto out;
        ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
                                        nodesize);
        if (ret < 0)
                goto out;

        /* Record leaf file extents */
        if (dst_level == 0 && trace_leaf) {
                ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
                if (ret < 0)
                        goto out;
                ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
        }
out:
        /* Also reached with a NULL src_path when the allocation failed. */
        btrfs_free_path(src_path);
        return ret;
}

/*
 * Helper function to do recursive generation-aware depth-first search, to
 * locate all new tree blocks in a subtree of reloc tree.
 *
 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
 *         reloc tree
 * L2         NN (a)
 *          /    \
 * L1    OO        NN (b)
 *      /  \      /  \
 * L0  OO  OO    OO  NN
 *               (c) (d)
 * If we pass:
 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
 * @cur_level = 1
 * @root_level = 1
 *
 * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
 * the above tree blocks along with their counterparts in the file tree.
 * During the search, old tree blocks such as OO(c) will be skipped, as the
 * tree block swap won't affect OO(c).
 */
static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
                                           struct extent_buffer *src_eb,
                                           struct btrfs_path *dst_path,
                                           int cur_level, int root_level,
                                           u64 last_snapshot, bool trace_leaf)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct extent_buffer *eb;
        /*
         * Set only when this invocation reads and locks
         * dst_path->nodes[cur_level] itself; the block is then released
         * again at the cleanup label.
         */
        bool need_cleanup = false;
        int ret = 0;
        int i;

        /* Level sanity check */
        if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
            root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
            root_level < cur_level) {
                btrfs_err_rl(fs_info,
                        "%s: bad levels, cur_level=%d root_level=%d",
                        __func__, cur_level, root_level);
                return -EUCLEAN;
        }

        /* Read the tree block if needed */
        if (dst_path->nodes[cur_level] == NULL) {
                int parent_slot;
                u64 child_gen;

                /*
                 * dst_path->nodes[root_level] must be initialized before
                 * calling this function.
                 */
                if (cur_level == root_level) {
                        btrfs_err_rl(fs_info,
        "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
                                __func__, root_level, root_level, cur_level);
                        return -EUCLEAN;
                }

                /*
                 * We need to get child blockptr/gen from parent before we can
                 * read it.
                 */
                eb = dst_path->nodes[cur_level + 1];
                parent_slot = dst_path->slots[cur_level + 1];
                child_gen = btrfs_node_ptr_generation(eb, parent_slot);

                /*
                 * This node is old, no need to trace.  Nothing has been read
                 * or locked yet, so skip the cleanup and return ret (still 0).
                 */
                if (child_gen < last_snapshot)
                        goto out;

                eb = btrfs_read_node_slot(eb, parent_slot);
                if (IS_ERR(eb)) {
                        ret = PTR_ERR(eb);
                        goto out;
                }

                dst_path->nodes[cur_level] = eb;
                dst_path->slots[cur_level] = 0;

                btrfs_tree_read_lock(eb);
                dst_path->locks[cur_level] = BTRFS_READ_LOCK;
                need_cleanup = true;
        }

        /* Now record this tree block and its counter part for qgroups */
        ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
                                       root_level, trace_leaf);
        if (ret < 0)
                goto cleanup;

        eb = dst_path->nodes[cur_level];

        if (cur_level > 0) {
                /* Iterate all child tree blocks */
                for (i = 0; i < btrfs_header_nritems(eb); i++) {
                        /* Skip old tree blocks as they won't be swapped */
                        if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
                                continue;
                        /* Tell the recursive call which child to read. */
                        dst_path->slots[cur_level] = i;

                        /* Recursive call (at most 7 times) */
                        ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
                                        dst_path, cur_level - 1, root_level,
                                        last_snapshot, trace_leaf);
                        if (ret < 0)
                                goto cleanup;
                }
        }

cleanup:
        if (need_cleanup) {
                /*
                 * Clean up: unlock and release the block this invocation
                 * read, and reset its level in dst_path.
                 */
                btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
                                     dst_path->locks[cur_level]);
                free_extent_buffer(dst_path->nodes[cur_level]);
                dst_path->nodes[cur_level] = NULL;
                dst_path->slots[cur_level] = 0;
                dst_path->locks[cur_level] = 0;
        }
out:
        return ret;
}

/*
 * Trace a subtree swap between @src_eb and @dst_eb for qgroup accounting.
 *
 * @dst_eb is expected to be the newer block: a source generation above the
 * destination generation is rejected with -EUCLEAN.
 *
 * Return 0 on success or when full accounting is off.
 * Return <0 on error; except for the -EUCLEAN case above, qgroups are then
 * also marked inconsistent.
 */
static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
                                struct extent_buffer *src_eb,
                                struct extent_buffer *dst_eb,
                                u64 last_snapshot, bool trace_leaf)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_path *dst_path = NULL;
        int root_level;
        int ret;

        if (!btrfs_qgroup_full_accounting(fs_info))
                return 0;

        /* Wrong parameter order */
        if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
                btrfs_err_rl(fs_info,
                "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
                             btrfs_header_generation(src_eb),
                             btrfs_header_generation(dst_eb));
                return -EUCLEAN;
        }

        if (!(extent_buffer_uptodate(src_eb) &&
              extent_buffer_uptodate(dst_eb))) {
                ret = -EIO;
                goto out;
        }

        root_level = btrfs_header_level(dst_eb);

        dst_path = btrfs_alloc_path();
        if (!dst_path) {
                ret = -ENOMEM;
                goto out;
        }

        /* The path holds its own reference on dst_eb and does not lock it. */
        atomic_inc(&dst_eb->refs);
        dst_path->nodes[root_level] = dst_eb;
        dst_path->slots[root_level] = 0;
        dst_path->locks[root_level] = 0;

        /* Do the generation aware depth-first search */
        ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path,
                                              root_level, root_level,
                                              last_snapshot, trace_leaf);
        /* Any non-negative result counts as success. */
        if (ret > 0)
                ret = 0;

out:
        btrfs_free_path(dst_path);
        if (ret < 0)
                qgroup_mark_inconsistent(fs_info);
        return ret;
}

/*
 * Inform qgroup to trace a whole subtree, including all its child tree
 * blocks and data.
 * The root tree block of the subtree is @root_eb, at level @root_level and
 * with generation @root_gen.
 *
 * Normally used by relocation (tree block swap) and subvolume deletion.
 *
 * Return 0 for success, when full accounting is off, or when the subtree is
 * too high and accounting is skipped (qgroups are marked inconsistent then).
 * Return <0 for error (ENOMEM or tree search error)
 */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
                               struct extent_buffer *root_eb,
                               u64 root_gen, int root_level)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_path *path = NULL;
        u8 drop_thres;
        int level;
        int ret = 0;

        ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL);
        ASSERT(root_eb != NULL);

        if (!btrfs_qgroup_full_accounting(fs_info))
                return 0;

        spin_lock(&fs_info->qgroup_lock);
        drop_thres = fs_info->qgroup_drop_subtree_thres;
        spin_unlock(&fs_info->qgroup_lock);

        /*
         * This function only gets called for snapshot drop.  Hitting a high
         * node here means ownership is about to change for quite a lot of
         * extents, which would greatly slow down btrfs_commit_transaction().
         *
         * So for a tree at or above the threshold, skip the accounting and
         * mark qgroup inconsistent instead.
         */
        if (root_level >= drop_thres) {
                qgroup_mark_inconsistent(fs_info);
                return 0;
        }

        if (!extent_buffer_uptodate(root_eb)) {
                struct btrfs_tree_parent_check check = {
                        .has_first_key = false,
                        .transid = root_gen,
                        .level = root_level
                };

                ret = btrfs_read_extent_buffer(root_eb, &check);
                if (ret)
                        goto out;
        }

        /* A subtree consisting of a single leaf needs no tree walk. */
        if (root_level == 0) {
                ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
                goto out;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * The path gets its own reference on the root block.  The root is
         * not locked by us, so locks[] stays 0 and releasing the path will
         * not try to unlock it.
         */
        atomic_inc(&root_eb->refs);
        path->nodes[root_level] = root_eb;
        path->slots[root_level] = 0;
        path->locks[root_level] = 0;

        /*
         * Each pass walks down the tree to a leaf.  Missing extent blocks are
         * read in along the way, and metadata is accounted every time a new
         * block is read.
         *
         * At the leaf the file extent items are accounted.  Then we walk back
         * up the tree (adjusting slot pointers as we go) and start over, until
         * the slots of the root block are exhausted.
         */
        for (;;) {
                for (level = root_level; level >= 0; level--) {
                        struct extent_buffer *parent;
                        struct extent_buffer *child;
                        u64 child_bytenr;
                        int parent_slot;

                        if (path->nodes[level])
                                continue;

                        /*
                         * We need to get child blockptr from parent before
                         * we can read it.
                         */
                        parent = path->nodes[level + 1];
                        parent_slot = path->slots[level + 1];
                        child_bytenr = btrfs_node_blockptr(parent, parent_slot);

                        child = btrfs_read_node_slot(parent, parent_slot);
                        if (IS_ERR(child)) {
                                ret = PTR_ERR(child);
                                goto out;
                        }

                        path->nodes[level] = child;
                        path->slots[level] = 0;

                        btrfs_tree_read_lock(child);
                        path->locks[level] = BTRFS_READ_LOCK;

                        ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
                                                        fs_info->nodesize);
                        if (ret)
                                goto out;
                }

                /* The walk down ended at a leaf in path->nodes[0]. */
                ret = btrfs_qgroup_trace_leaf_items(trans, path->nodes[0]);
                if (ret)
                        goto out;

                /* Nonzero return here means we completed our search */
                if (adjust_slots_upwards(path, root_level))
                        break;
        }

        ret = 0;
out:
        btrfs_free_path(path);

        return ret;
}

/*
 * Append @qgroup to the tail of @head through its nested_iterator link.
 *
 * Nothing happens when the nested_iterator link is already on a list, so a
 * qgroup that is reached several times is only queued once.
 */
static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup)
{
        if (list_empty(&qgroup->nested_iterator))
                list_add_tail(&qgroup->nested_iterator, head);
}

static void qgroup_iterator_nested_clean(struct list_head *head)
{
        while (!list_empty(head)) {
                struct btrfs_qgroup *qgroup;

                qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator);
                list_del_init(&qgroup->nested_iterator);
        }
}

/* Values for the @update_old argument of qgroup_update_refcnt(). */
#define UPDATE_NEW        0
#define UPDATE_OLD        1
/*
 * Bump the refcnt of every qgroup that is reachable from one of @roots.
 *
 * For each root id in @roots the matching qgroup is looked up and, starting
 * from it, all qgroups found by following the ->groups links are visited.
 * Each visited qgroup gets its old refcnt (@update_old != 0) or its new
 * refcnt (@update_old == 0) for @seq increased by one, and is queued on
 * @qgroups through qgroup_iterator_nested_add().
 *
 * A NULL @roots is a no-op, root ids without a qgroup are skipped.
 */
static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
                                 struct ulist *roots, struct list_head *qgroups,
                                 u64 seq, int update_old)
{
        struct ulist_iterator root_iter;
        struct ulist_node *root_node;

        if (!roots)
                return;

        ULIST_ITER_INIT(&root_iter);
        for (root_node = ulist_next(roots, &root_iter); root_node;
             root_node = ulist_next(roots, &root_iter)) {
                struct btrfs_qgroup *cur;
                LIST_HEAD(pending);

                cur = find_qgroup_rb(fs_info, root_node->val);
                if (!cur)
                        continue;

                qgroup_iterator_nested_add(qgroups, cur);
                qgroup_iterator_add(&pending, cur);

                /*
                 * @pending keeps growing while it is walked: the parents of
                 * the current qgroup are appended so that they get visited
                 * in a later iteration.
                 */
                list_for_each_entry(cur, &pending, iterator) {
                        struct btrfs_qgroup_list *parent;

                        if (update_old)
                                btrfs_qgroup_update_old_refcnt(cur, seq, 1);
                        else
                                btrfs_qgroup_update_new_refcnt(cur, seq, 1);

                        list_for_each_entry(parent, &cur->groups, next_group) {
                                qgroup_iterator_nested_add(qgroups, parent->group);
                                qgroup_iterator_add(&pending, parent->group);
                        }
                }
                qgroup_iterator_clean(&pending);
        }
}

/*
 * Update the rfer/excl counters of every qgroup queued on @qgroups.
 *
 * For each qgroup, cur_old is its old refcnt and cur_new its new refcnt for
 * @seq, i.e. how many of the old/new roots reach that qgroup.
 *
 * Rfer is simple: the extent starts being referenced when cur_old == 0 and
 * cur_new > 0, and stops being referenced when cur_old > 0 and cur_new == 0.
 *
 * Excl is decided from two conditions:
 *
 *   A:  cur_old <  nr_old_roots   (not exclusive before)
 *   !A: cur_old == nr_old_roots   (possibly exclusive before)
 *   B:  cur_new <  nr_new_roots   (not exclusive now)
 *   !B: cur_new == nr_new_roots   (possibly exclusive now)
 *
 *          |   A   |  !A   |
 *   -------+-------+-------+
 *     B    |   *   |   -   |
 *   -------+-------+-------+
 *    !B    |   +   |   **  |
 *   -------+-------+-------+
 *
 *   *:  definitely unchanged
 *   -:  possibly exclusive -> shared
 *   +:  possibly shared -> exclusive
 *   **: possibly unchanged
 *
 * "Possibly" is there because !A and !B also hold when the respective count
 * is 0, which means "none" rather than "exclusive".  For "-" and "+" only one
 * of the two counts can be 0 and it is checked explicitly.  For "**" both
 * can be 0, so that case is split once more into none/exclusive before and
 * none/exclusive now.
 *
 * Every qgroup whose counters changed is marked dirty.
 */
static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
                                   struct list_head *qgroups, u64 nr_old_roots,
                                   u64 nr_new_roots, u64 num_bytes, u64 seq)
{
        struct btrfs_qgroup *qg;

        list_for_each_entry(qg, qgroups, nested_iterator) {
                const u64 old_cnt = btrfs_qgroup_get_old_refcnt(qg, seq);
                const u64 new_cnt = btrfs_qgroup_get_new_refcnt(qg, seq);
                const bool all_old = (old_cnt == nr_old_roots);
                const bool all_new = (new_cnt == nr_new_roots);
                bool dirty = false;

                trace_qgroup_update_counters(fs_info, qg, old_cnt, new_cnt);

                /* Referenced bytes: none -> some, or some -> none. */
                if (old_cnt == 0 && new_cnt > 0) {
                        qg->rfer += num_bytes;
                        qg->rfer_cmpr += num_bytes;
                        dirty = true;
                } else if (old_cnt > 0 && new_cnt == 0) {
                        qg->rfer -= num_bytes;
                        qg->rfer_cmpr -= num_bytes;
                        dirty = true;
                }

                /* Exclusive bytes, the three cases below exclude each other. */
                if (all_old && all_new) {
                        /* "**": exclusive/none -> exclusive/none */
                        if (old_cnt == 0 && new_cnt != 0) {
                                /* None -> exclusive */
                                qg->excl += num_bytes;
                                qg->excl_cmpr += num_bytes;
                                dirty = true;
                        } else if (old_cnt != 0 && new_cnt == 0) {
                                /* Exclusive -> none */
                                qg->excl -= num_bytes;
                                qg->excl_cmpr -= num_bytes;
                                dirty = true;
                        }
                        /* None -> none and exclusive -> exclusive: no change */
                } else if (all_old && new_cnt < nr_new_roots) {
                        /* "-": exclusive/none -> shared, only exclusive counts */
                        if (old_cnt != 0) {
                                qg->excl -= num_bytes;
                                qg->excl_cmpr -= num_bytes;
                                dirty = true;
                        }
                } else if (old_cnt < nr_old_roots && all_new) {
                        /* "+": shared -> exclusive/none, only exclusive counts */
                        if (new_cnt != 0) {
                                qg->excl += num_bytes;
                                qg->excl_cmpr += num_bytes;
                                dirty = true;
                        }
                }

                if (dirty)
                        qgroup_dirty(fs_info, qg);
        }
}

/*
 * Check if the @roots potentially is a list of fs tree roots
 *
 * Return 0 for definitely not a fs/subvol tree roots ulist
 * Return 1 for possible fs/subvol tree roots in the list (considering an empty
 *          one as well)
 */
static int maybe_fs_roots(struct ulist *roots)
{
        struct ulist_node *unode;
        struct ulist_iterator uiter;

        /* Empty one, still possible for fs roots */
        if (!roots || roots->nnodes == 0)
                return 1;

        ULIST_ITER_INIT(&uiter);
        unode = ulist_next(roots, &uiter);
        if (!unode)
                return 1;

        /*
         * If it contains fs tree roots, then it must belong to fs/subvol
         * trees.
         * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
         */
        return is_fstree(unode->val);
}

/*
 * Account the change of one extent to all qgroups it affects.
 *
 * @trans:     transaction handle, provides fs_info and the transid for tracing
 * @bytenr:    start of the extent
 * @num_bytes: number of bytes added to / removed from the affected counters
 * @old_roots: roots used to update the old refcnts, may be NULL
 * @new_roots: roots used to update the new refcnts, may be NULL
 *
 * Both ulists are released with ulist_free() on every path, whether or not
 * any accounting took place.
 *
 * Always returns 0.
 */
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
                                u64 num_bytes, struct ulist *old_roots,
                                struct ulist *new_roots)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        LIST_HEAD(qgroups);
        u64 nr_old_roots = 0;
        u64 nr_new_roots = 0;
        u64 seq;
        bool skip;

        /*
         * Quotas may have been disabled in the meantime.  The ulists still
         * have to be freed, so leave through out_free instead of returning.
         */
        if (!btrfs_qgroup_full_accounting(fs_info))
                goto out_free;
        if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
                goto out_free;

        if (new_roots) {
                nr_new_roots = new_roots->nnodes;
                if (!maybe_fs_roots(new_roots))
                        goto out_free;
        }
        if (old_roots) {
                nr_old_roots = old_roots->nnodes;
                if (!maybe_fs_roots(old_roots))
                        goto out_free;
        }

        /* Neither list has a root: no qgroup can be affected. */
        if (nr_old_roots == 0 && nr_new_roots == 0)
                goto out_free;

        trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
                                        num_bytes, nr_old_roots, nr_new_roots);

        /*
         * Skip the extent while a rescan is flagged and its progress objectid
         * is not beyond @bytenr.  The decision is taken under the rescan lock
         * and acted upon after dropping it.
         */
        mutex_lock(&fs_info->qgroup_rescan_lock);
        skip = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) &&
               fs_info->qgroup_rescan_progress.objectid <= bytenr;
        mutex_unlock(&fs_info->qgroup_rescan_lock);
        if (skip)
                goto out_free;

        spin_lock(&fs_info->qgroup_lock);
        seq = fs_info->qgroup_seq;

        /* Old refcnts come from @old_roots, new refcnts from @new_roots. */
        qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD);
        qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW);

        qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots,
                               num_bytes, seq);

        /*
         * The iterator is no longer needed.  Its qgroups are released while
         * fs_info->qgroup_lock is still held so that we don't race with
         * btrfs_remove_qgroup() and trigger use-after-free accesses to
         * qgroups.
         */
        qgroup_iterator_nested_clean(&qgroups);

        /* Bump qgroup_seq to avoid seq overlap. */
        fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
        spin_unlock(&fs_info->qgroup_lock);

out_free:
        ulist_free(old_roots);
        ulist_free(new_roots);
        return 0;
}

/*
 * Process every qgroup extent record queued in dirty_extent_root of the
 * delayed refs of @trans' transaction, then drop the record.
 *
 * Nothing is done in simple quota mode.  Otherwise each record is erased from
 * the tree and freed, even after an error.  Root lookups and accounting are
 * only done while @ret is still 0 and NO_ACCOUNTING is not set; later records
 * then only have their reserved data space released and are cleaned up.
 *
 * Return 0, or the negative value of the first failing
 * btrfs_find_all_roots() call.
 */
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup_extent_record *record;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct ulist *new_roots = NULL;
        struct rb_node *node;
        u64 num_dirty_extents = 0;
        u64 qgroup_to_skip;
        int ret = 0;

        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
                return 0;

        delayed_refs = &trans->transaction->delayed_refs;
        qgroup_to_skip = delayed_refs->qgroup_to_skip;
        /* Each iteration erases the first node, so the loop drains the tree. */
        while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
                record = rb_entry(node, struct btrfs_qgroup_extent_record,
                                  node);

                num_dirty_extents++;
                trace_btrfs_qgroup_account_extents(fs_info, record);

                if (!ret && !(fs_info->qgroup_flags &
                              BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
                        struct btrfs_backref_walk_ctx ctx = { 0 };

                        ctx.bytenr = record->bytenr;
                        ctx.fs_info = fs_info;

                        /*
                         * Old roots should be searched when inserting qgroup
                         * extent record.
                         *
                         * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case,
                         * we may have some record inserted during
                         * NO_ACCOUNTING (thus no old_roots populated), but
                         * later we start rescan, which clears NO_ACCOUNTING,
                         * leaving some inserted records without old_roots
                         * populated.
                         *
                         * Those cases are rare and should not cause too much
                         * time spent during commit_transaction().
                         */
                        if (!record->old_roots) {
                                /* Search commit root to find old_roots */
                                ret = btrfs_find_all_roots(&ctx, false);
                                if (ret < 0)
                                        goto cleanup;
                                record->old_roots = ctx.roots;
                                /* The result now belongs to the record. */
                                ctx.roots = NULL;
                        }

                        /*
                         * Use BTRFS_SEQ_LAST as time_seq to do special search,
                         * which doesn't lock tree or delayed_refs and search
                         * current root. It's safe inside commit_transaction().
                         */
                        ctx.trans = trans;
                        ctx.time_seq = BTRFS_SEQ_LAST;
                        ret = btrfs_find_all_roots(&ctx, false);
                        if (ret < 0)
                                goto cleanup;
                        new_roots = ctx.roots;
                        /* Drop the qgroup to skip from both root lists. */
                        if (qgroup_to_skip) {
                                ulist_del(new_roots, qgroup_to_skip, 0);
                                ulist_del(record->old_roots, qgroup_to_skip,
                                          0);
                        }
                        ret = btrfs_qgroup_account_extent(trans, record->bytenr,
                                                          record->num_bytes,
                                                          record->old_roots,
                                                          new_roots);
                        /*
                         * btrfs_qgroup_account_extent() frees both ulists,
                         * forget them so that cleanup does not free them
                         * a second time.
                         */
                        record->old_roots = NULL;
                        new_roots = NULL;
                }
                /* Free the reserved data space */
                btrfs_qgroup_free_refroot(fs_info,
                                record->data_rsv_refroot,
                                record->data_rsv,
                                BTRFS_QGROUP_RSV_DATA);
                /*
                 * Reached directly when a root lookup failed, in which case
                 * the reserved data space release above is skipped for this
                 * record and the ulists are still owned here.
                 */
cleanup:
                ulist_free(record->old_roots);
                ulist_free(new_roots);
                new_roots = NULL;
                rb_erase(node, &delayed_refs->dirty_extent_root);
                kfree(record);

        }
        trace_qgroup_num_dirty_extents(fs_info, trans->transid,
                                       num_dirty_extents);
        return ret;
}

/*
 * Write all changed qgroups to disk.
 * Called by the transaction commit path and the qgroup assign ioctl.
 *
 * Every qgroup on the dirty list has its info item and its limit item
 * updated.  A failure of either update marks the qgroups inconsistent but
 * does not stop the loop and is not returned.  Afterwards the ON status flag
 * is synced with the enabled state and the status item is updated.
 *
 * Return 0 if there is no quota root, otherwise the result of updating the
 * status item.
 */
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret;

        /*
         * In case we are called from the qgroup assign ioctl, assert that we
         * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
         * disable operation (ioctl) and access a freed quota root.
         */
        if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
                lockdep_assert_held(&fs_info->qgroup_ioctl_lock);

        if (!fs_info->quota_root)
                return 0;

        spin_lock(&fs_info->qgroup_lock);
        while (!list_empty(&fs_info->dirty_qgroups)) {
                struct btrfs_qgroup *qgroup =
                        list_first_entry(&fs_info->dirty_qgroups,
                                         struct btrfs_qgroup, dirty);

                list_del_init(&qgroup->dirty);

                /* The item updates run with the spinlock dropped. */
                spin_unlock(&fs_info->qgroup_lock);
                if (update_qgroup_info_item(trans, qgroup))
                        qgroup_mark_inconsistent(fs_info);
                if (update_qgroup_limit_item(trans, qgroup))
                        qgroup_mark_inconsistent(fs_info);
                spin_lock(&fs_info->qgroup_lock);
        }

        if (btrfs_qgroup_enabled(fs_info))
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
        else
                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
        spin_unlock(&fs_info->qgroup_lock);

        ret = update_qgroup_status_item(trans);
        if (ret)
                qgroup_mark_inconsistent(fs_info);

        return ret;
}

/*
 * Validate a btrfs_qgroup_inherit structure before it is used.
 *
 * @fs_info: filesystem the structure is going to be applied to
 * @inherit: structure to validate
 * @size:    number of bytes available at @inherit
 *
 * @size is validated before any member of @inherit is read, so a buffer that
 * is shorter than the structure header is rejected without being accessed.
 *
 * Return 0 if qgroups are not enabled or the structure is acceptable.
 * Return -EINVAL if @size is out of range or does not match num_qgroups, if
 * rfer/excl copies are requested, or if a listed qgroup is a level 0 qgroup.
 * Return -EOPNOTSUPP if unsupported flags are set.
 * Return -ENOENT if a listed qgroup does not exist.
 */
int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info,
                               struct btrfs_qgroup_inherit *inherit,
                               size_t size)
{
        if (!btrfs_qgroup_enabled(fs_info))
                return 0;

        /* Must come first: the checks below dereference @inherit. */
        if (size < sizeof(*inherit) || size > PAGE_SIZE)
                return -EINVAL;
        if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP)
                return -EOPNOTSUPP;

        /*
         * In the past we allowed btrfs_qgroup_inherit to specify to copy
         * rfer/excl numbers directly from other qgroups.  This behavior has
         * been disabled in userspace for a very long time, but here we should
         * also disable it in kernel, as this behavior is known to mark qgroup
         * inconsistent, and a rescan would wipe out the changes anyway.
         *
         * Reject any btrfs_qgroup_inherit with num_ref_copies or num_excl_copies.
         */
        if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0)
                return -EINVAL;

        /* The buffer must hold exactly the header plus num_qgroups ids. */
        if (size != struct_size(inherit, qgroups, inherit->num_qgroups))
                return -EINVAL;

        /*
         * Now check all the remaining qgroups, they should all:
         *
         * - Exist
         * - Be higher level qgroups.
         */
        for (u64 i = 0; i < inherit->num_qgroups; i++) {
                struct btrfs_qgroup *qgroup;
                u64 qgroupid = inherit->qgroups[i];

                if (btrfs_qgroup_level(qgroupid) == 0)
                        return -EINVAL;

                /* Only the existence matters, the pointer is not used later. */
                spin_lock(&fs_info->qgroup_lock);
                qgroup = find_qgroup_rb(fs_info, qgroupid);
                spin_unlock(&fs_info->qgroup_lock);
                if (!qgroup)
                        return -ENOENT;
        }
        return 0;
}

/*
 * Build an inherit structure that lists every qgroup the qgroup of
 * @inode_rootid is a member of.
 *
 * @fs_info:      filesystem to look the qgroup up in
 * @inode_rootid: id of the qgroup whose ->groups list is copied
 * @inherit:      result pointer, must point to NULL on entry
 *
 * On success with at least one parent, *@inherit is set to a structure
 * allocated with kzalloc() which the caller has to free.  *@inherit is left
 * untouched if the qgroup has no parent.
 *
 * Return 0 on success, -EEXIST if *@inherit is already set, -ENOENT if
 * @inode_rootid has no qgroup, -ERANGE if the allocation size overflows and
 * -ENOMEM if the allocation fails.
 */
static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info,
                               u64 inode_rootid,
                               struct btrfs_qgroup_inherit **inherit)
{
        struct btrfs_qgroup_inherit *built;
        struct btrfs_qgroup_list *rel;
        struct btrfs_qgroup *owner;
        size_t alloc_size;
        u64 nr_parents;
        u64 *next_id;

        if (*inherit)
                return -EEXIST;

        owner = find_qgroup_rb(fs_info, inode_rootid);
        if (!owner)
                return -ENOENT;

        nr_parents = list_count_nodes(&owner->groups);
        if (nr_parents == 0)
                return 0;

        alloc_size = struct_size(built, qgroups, nr_parents);
        if (alloc_size == SIZE_MAX)
                return -ERANGE;

        built = kzalloc(alloc_size, GFP_NOFS);
        if (!built)
                return -ENOMEM;

        built->num_qgroups = nr_parents;

        /* Fill the trailing id array in list order. */
        next_id = built->qgroups;
        list_for_each_entry(rel, &owner->groups, next_group)
                *next_id++ = rel->group->qgroupid;

        *inherit = built;
        return 0;
}

/*
 * Try to avoid a rescan when a snapshot inherits a qgroup.
 *
 * If the qgroup @srcid has @parentid as its single parent, and that parent
 * owns all of its bytes exclusively (excl == rfer), the full rescan can be
 * skipped by just adding nodesize to the parent's excl and rfer.
 *
 * Return <0 for fatal errors (@srcid or @parentid has no qgroup).
 * Return 0 if the quick inherit was done.
 * Return >0 if a quick inherit is not possible and a full rescan is needed.
 */
static int qgroup_snapshot_quick_inherit(struct btrfs_fs_info *fs_info,
                                         u64 srcid, u64 parentid)
{
        struct btrfs_qgroup *src_qg;
        struct btrfs_qgroup *parent_qg;
        struct btrfs_qgroup_list *rel;
        int seen = 0;

        src_qg = find_qgroup_rb(fs_info, srcid);
        if (!src_qg)
                return -ENOENT;
        parent_qg = find_qgroup_rb(fs_info, parentid);
        if (!parent_qg)
                return -ENOENT;

        /*
         * The source has no parent qgroup while the new qgroup would get one,
         * the qgroup numbers would become inconsistent.
         */
        if (list_empty(&src_qg->groups))
                return 1;

        /*
         * Give up on the first parent that is not @parentid, and as soon as
         * a second parent shows up: with more than one parent we can't be
         * sure about accounting consistency.
         */
        list_for_each_entry(rel, &src_qg->groups, next_group) {
                if (rel->group->qgroupid != parentid || ++seen > 1)
                        return 1;
        }

        /*
         * Only a parent that exclusively owns all its bytes qualifies,
         * otherwise the source may have bytes not fully owned by the parent.
         */
        if (parent_qg->excl == parent_qg->rfer) {
                parent_qg->excl += fs_info->nodesize;
                parent_qg->rfer += fs_info->nodesize;
                return 0;
        }
        return 1;
}

/*
 * Create the qgroup of a new subvolume or snapshot and copy the accounting
 * information between qgroups.  Throwing an error will cause a transaction
 * abort so we take extra care here to only error when a readonly fs is a
 * reasonable outcome.
 *
 * @trans:        transaction handle
 * @srcid:        qgroup id of the snapshot source; when 0 neither the source
 *                numbers are copied nor the quick inherit check is run
 * @objectid:     id of the qgroup that gets created
 * @inode_rootid: id passed to qgroup_auto_inherit() in simple quota mode when
 *                @inherit is NULL
 * @inherit:      optional inherit request, the qgroup ids are read from the
 *                memory directly behind the structure
 *
 * Return 0 on success or if qgroups are not enabled, <0 on error.
 */
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
                         u64 objectid, u64 inode_rootid,
                         struct btrfs_qgroup_inherit *inherit)
{
        int ret = 0;
        int i;
        u64 *i_qgroups;
        bool committing = false;
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *srcgroup;
        struct btrfs_qgroup *dstgroup;
        struct btrfs_qgroup *prealloc;
        struct btrfs_qgroup_list **qlist_prealloc = NULL;
        bool free_inherit = false;
        bool need_rescan = false;
        u32 level_size = 0;
        u64 nums;

        /* Allocated up front, consumed by add_qgroup_rb() under the spinlock. */
        prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
        if (!prealloc)
                return -ENOMEM;

        /*
         * There are only two callers of this function.
         *
         * One in create_subvol() in the ioctl context, which needs to hold
         * the qgroup_ioctl_lock.
         *
         * The other one in create_pending_snapshot() where no other qgroup
         * code can modify the fs as they all need to either start a new trans
         * or hold a trans handler, thus we don't need to hold
         * qgroup_ioctl_lock.
         * This would avoid long and complex lock chain and make lockdep happy.
         */
        spin_lock(&fs_info->trans_lock);
        if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
                committing = true;
        spin_unlock(&fs_info->trans_lock);

        if (!committing)
                mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!btrfs_qgroup_enabled(fs_info))
                goto out;

        quota_root = fs_info->quota_root;
        if (!quota_root) {
                ret = -EINVAL;
                goto out;
        }

        /*
         * Simple quota mode without an explicit request: build one from the
         * parents of @inode_rootid.  It is owned here and freed at the end.
         */
        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) {
                ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit);
                if (ret)
                        goto out;
                free_inherit = true;
        }

        if (inherit) {
                /* The id array starts right behind the structure. */
                i_qgroups = (u64 *)(inherit + 1);
                nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
                       2 * inherit->num_excl_copies;
                for (i = 0; i < nums; ++i) {
                        srcgroup = find_qgroup_rb(fs_info, *i_qgroups);

                        /*
                         * Zero out invalid groups so we can ignore
                         * them later.
                         *
                         * Invalid means: no such qgroup, or its level (the
                         * bits above bit 47 of the id) is not higher than
                         * the level of @objectid.
                         */
                        if (!srcgroup ||
                            ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
                                *i_qgroups = 0ULL;

                        ++i_qgroups;
                }
        }

        /*
         * create a tracking group for the subvol itself
         */
        ret = add_qgroup_item(trans, quota_root, objectid);
        if (ret)
                goto out;

        /*
         * add qgroup to all inherited groups
         */
        if (inherit) {
                i_qgroups = (u64 *)(inherit + 1);
                for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
                        if (*i_qgroups == 0)
                                continue;
                        /* Relation items are added in both directions. */
                        ret = add_qgroup_relation_item(trans, objectid,
                                                       *i_qgroups);
                        if (ret && ret != -EEXIST)
                                goto out;
                        ret = add_qgroup_relation_item(trans, *i_qgroups,
                                                       objectid);
                        if (ret && ret != -EEXIST)
                                goto out;
                }
                /* A trailing -EEXIST is not an error. */
                ret = 0;

                /*
                 * One relation structure per listed qgroup is allocated now,
                 * before the spinlock is taken; add_relation_rb() below gets
                 * them handed in.
                 */
                qlist_prealloc = kcalloc(inherit->num_qgroups,
                                         sizeof(struct btrfs_qgroup_list *),
                                         GFP_NOFS);
                if (!qlist_prealloc) {
                        ret = -ENOMEM;
                        goto out;
                }
                for (int i = 0; i < inherit->num_qgroups; i++) {
                        qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list),
                                                    GFP_NOFS);
                        if (!qlist_prealloc[i]) {
                                ret = -ENOMEM;
                                goto out;
                        }
                }
        }

        spin_lock(&fs_info->qgroup_lock);

        dstgroup = add_qgroup_rb(fs_info, prealloc, objectid);
        /* Handed over, must not be freed at the end of the function. */
        prealloc = NULL;

        if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
                dstgroup->lim_flags = inherit->lim.flags;
                dstgroup->max_rfer = inherit->lim.max_rfer;
                dstgroup->max_excl = inherit->lim.max_excl;
                dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
                dstgroup->rsv_excl = inherit->lim.rsv_excl;

                qgroup_dirty(fs_info, dstgroup);
        }

        if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) {
                srcgroup = find_qgroup_rb(fs_info, srcid);
                if (!srcgroup)
                        goto unlock;

                /*
                 * We call inherit after we clone the root in order to make sure
                 * our counts don't go crazy, so at this point the only
                 * difference between the two roots should be the root node.
                 */
                level_size = fs_info->nodesize;
                dstgroup->rfer = srcgroup->rfer;
                dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
                dstgroup->excl = level_size;
                dstgroup->excl_cmpr = level_size;
                srcgroup->excl = level_size;
                srcgroup->excl_cmpr = level_size;

                /*
                 * inherit the limit info
                 *
                 * This overwrites the limits taken from @inherit above when
                 * both blocks run.
                 */
                dstgroup->lim_flags = srcgroup->lim_flags;
                dstgroup->max_rfer = srcgroup->max_rfer;
                dstgroup->max_excl = srcgroup->max_excl;
                dstgroup->rsv_rfer = srcgroup->rsv_rfer;
                dstgroup->rsv_excl = srcgroup->rsv_excl;

                qgroup_dirty(fs_info, dstgroup);
                qgroup_dirty(fs_info, srcgroup);

                /*
                 * If the source qgroup has parent but the new one doesn't,
                 * we need a full rescan.
                 */
                if (!inherit && !list_empty(&srcgroup->groups))
                        need_rescan = true;
        }

        if (!inherit)
                goto unlock;

        i_qgroups = (u64 *)(inherit + 1);
        for (i = 0; i < inherit->num_qgroups; ++i) {
                if (*i_qgroups) {
                        ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid,
                                              *i_qgroups);
                        /* Cleared so the slot is not freed at the end. */
                        qlist_prealloc[i] = NULL;
                        if (ret)
                                goto unlock;
                }
                /*
                 * NOTE(review): this also runs for an entry that was zeroed
                 * above as invalid.  qgroup_snapshot_quick_inherit() then
                 * looks up qgroup id 0 and returns -ENOENT if there is none,
                 * which fails the whole call.  Confirm whether zeroed entries
                 * should be skipped here.
                 */
                if (srcid) {
                        /* Check if we can do a quick inherit. */
                        ret = qgroup_snapshot_quick_inherit(fs_info, srcid, *i_qgroups);
                        if (ret < 0)
                                goto unlock;
                        if (ret > 0)
                                need_rescan = true;
                        ret = 0;
                }
                ++i_qgroups;
        }

        /*
         * The remaining ids are (src, dst) pairs: first the rfer copies, then
         * the excl copies.  btrfs_qgroup_check_inherit() rejects requests
         * with a non-zero num_ref_copies or num_excl_copies, so presumably
         * these loops do not run for requests validated by it.
         */
        for (i = 0; i <  inherit->num_ref_copies; ++i, i_qgroups += 2) {
                struct btrfs_qgroup *src;
                struct btrfs_qgroup *dst;

                if (!i_qgroups[0] || !i_qgroups[1])
                        continue;

                src = find_qgroup_rb(fs_info, i_qgroups[0]);
                dst = find_qgroup_rb(fs_info, i_qgroups[1]);

                if (!src || !dst) {
                        ret = -EINVAL;
                        goto unlock;
                }

                dst->rfer = src->rfer - level_size;
                dst->rfer_cmpr = src->rfer_cmpr - level_size;

                /* Manually tweaking numbers certainly needs a rescan */
                need_rescan = true;
        }
        for (i = 0; i <  inherit->num_excl_copies; ++i, i_qgroups += 2) {
                struct btrfs_qgroup *src;
                struct btrfs_qgroup *dst;

                if (!i_qgroups[0] || !i_qgroups[1])
                        continue;

                src = find_qgroup_rb(fs_info, i_qgroups[0]);
                dst = find_qgroup_rb(fs_info, i_qgroups[1]);

                if (!src || !dst) {
                        ret = -EINVAL;
                        goto unlock;
                }

                dst->excl = src->excl + level_size;
                dst->excl_cmpr = src->excl_cmpr + level_size;
                need_rescan = true;
        }

unlock:
        spin_unlock(&fs_info->qgroup_lock);
        if (!ret)
                ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
out:
        if (!committing)
                mutex_unlock(&fs_info->qgroup_ioctl_lock);
        if (need_rescan)
                qgroup_mark_inconsistent(fs_info);
        /* Free the preallocated relations that were not consumed. */
        if (qlist_prealloc) {
                for (int i = 0; i < inherit->num_qgroups; i++)
                        kfree(qlist_prealloc[i]);
                kfree(qlist_prealloc);
        }
        if (free_inherit)
                kfree(inherit);
        kfree(prealloc);
        return ret;
}

/*
 * Check whether @num_bytes more bytes fit into the limits of @qg.
 *
 * For each limit that is enabled in lim_flags, the total reservation plus
 * the current counter plus @num_bytes must not exceed the maximum.
 *
 * Return true if no enabled limit would be exceeded, false otherwise.
 */
static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
{
        if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
                if (qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
                        return false;
        }

        if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
                if (qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
                        return false;
        }

        return true;
}

/*
 * Record a reservation of @num_bytes of type @type in the qgroup of @root and
 * in every qgroup reachable from it through the ->groups lists.
 *
 * When @enforce is set, every visited qgroup is checked with
 * qgroup_check_limits() first and nothing is recorded if one check fails.
 * Enforcement is switched off when BTRFS_FS_QUOTA_OVERRIDE is set and the
 * caller has CAP_SYS_RESOURCE.
 *
 * Return 0 on success, including the cases where there is nothing to do
 * (not a fs tree, zero bytes, no quota root, no qgroup for the root), or
 * -EDQUOT if a limit would be exceeded.
 */
static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
                          enum btrfs_qgroup_rsv_type type)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        const u64 ref_root = btrfs_root_id(root);
        struct btrfs_qgroup *cur;
        LIST_HEAD(qgroup_list);
        int ret = 0;

        if (!is_fstree(ref_root) || num_bytes == 0)
                return 0;

        if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
            capable(CAP_SYS_RESOURCE))
                enforce = false;

        spin_lock(&fs_info->qgroup_lock);
        if (!fs_info->quota_root)
                goto out;

        cur = find_qgroup_rb(fs_info, ref_root);
        if (!cur)
                goto out;

        /*
         * First pass: collect the qgroup and everything linked from it on the
         * iterator list, bailing out as soon as a limit check fails.
         */
        qgroup_iterator_add(&qgroup_list, cur);
        list_for_each_entry(cur, &qgroup_list, iterator) {
                struct btrfs_qgroup_list *link;

                if (enforce && !qgroup_check_limits(cur, num_bytes)) {
                        ret = -EDQUOT;
                        goto out;
                }

                list_for_each_entry(link, &cur->groups, next_group)
                        qgroup_iterator_add(&qgroup_list, link->group);
        }

        /* Second pass: all checks passed, charge every collected qgroup. */
        list_for_each_entry(cur, &qgroup_list, iterator)
                qgroup_rsv_add(fs_info, cur, num_bytes, type);

out:
        qgroup_iterator_clean(&qgroup_list);
        spin_unlock(&fs_info->qgroup_lock);
        return ret;
}

/*
 * Free @num_bytes of reserved space with @type for qgroup.  (Normally level 0
 * qgroup).
 *
 * Will handle all higher level qgroup too.
 *
 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
 * This special case is only used for META_PERTRANS type.
 */
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
                               u64 ref_root, u64 num_bytes,
                               enum btrfs_qgroup_rsv_type type)
{
        /* (u64)-1 is the "free everything" marker. */
        const bool free_all = (num_bytes == (u64)-1);
        struct btrfs_qgroup *cur;
        LIST_HEAD(qgroup_list);

        if (!is_fstree(ref_root) || num_bytes == 0)
                return;

        /* "Free everything" is only accepted for META_PERTRANS. */
        if (free_all && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
                WARN(1, "%s: Invalid type to free", __func__);
                return;
        }

        spin_lock(&fs_info->qgroup_lock);
        if (!fs_info->quota_root)
                goto out;

        cur = find_qgroup_rb(fs_info, ref_root);
        if (!cur)
                goto out;

        /*
         * Turn the marker into a real byte count: whatever the qgroup looked
         * up for @ref_root currently holds for this reservation type.
         */
        if (free_all)
                num_bytes = cur->rsv.values[type];

        /* Release from this qgroup and from every qgroup linked via ->groups. */
        qgroup_iterator_add(&qgroup_list, cur);
        list_for_each_entry(cur, &qgroup_list, iterator) {
                struct btrfs_qgroup_list *link;

                qgroup_rsv_release(fs_info, cur, num_bytes, type);
                list_for_each_entry(link, &cur->groups, next_group)
                        qgroup_iterator_add(&qgroup_list, link->group);
        }
out:
        qgroup_iterator_clean(&qgroup_list);
        spin_unlock(&fs_info->qgroup_lock);
}

/*
 * Check if the leaf is the last leaf. Which means all node pointers
 * are at their last position.
 */
static bool is_last_leaf(struct btrfs_path *path)
{
        int level = 1;

        /*
         * Visit every populated level above the leaf; a single node pointer
         * that is not in the last slot means there are more leaves to come.
         */
        while (level < BTRFS_MAX_LEVEL && path->nodes[level]) {
                if (path->slots[level] != btrfs_header_nritems(path->nodes[level]) - 1)
                        return false;
                level++;
        }

        return true;
}

/*
 * returns < 0 on error, 0 when more leafs are to be scanned.
 * returns 1 when done.
 */
static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
                              struct btrfs_path *path)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *extent_root;
        struct btrfs_key found;
        struct extent_buffer *scratch_leaf = NULL;
        u64 num_bytes;
        bool done;
        int slot;
        int ret;

        /* Without full accounting there is nothing to scan: report "done". */
        if (!btrfs_qgroup_full_accounting(fs_info))
                return 1;

        /*
         * The progress key is read and updated under qgroup_rescan_lock; the
         * lock is dropped again before the per-extent accounting below.
         */
        mutex_lock(&fs_info->qgroup_rescan_lock);
        extent_root = btrfs_extent_root(fs_info,
                                fs_info->qgroup_rescan_progress.objectid);
        ret = btrfs_search_slot_for_read(extent_root,
                                         &fs_info->qgroup_rescan_progress,
                                         path, 1, 0);

        btrfs_debug(fs_info,
                "current progress key (%llu %u %llu), search_slot ret %d",
                fs_info->qgroup_rescan_progress.objectid,
                fs_info->qgroup_rescan_progress.type,
                fs_info->qgroup_rescan_progress.offset, ret);

        if (ret) {
                /*
                 * The rescan is about to end, we will not be scanning any
                 * further blocks. We cannot unset the RESCAN flag here, because
                 * we want to commit the transaction if everything went well.
                 * To make the live accounting work in this phase, we set our
                 * scan progress pointer such that every real extent objectid
                 * will be smaller.
                 */
                fs_info->qgroup_rescan_progress.objectid = (u64)-1;
                btrfs_release_path(path);
                mutex_unlock(&fs_info->qgroup_rescan_lock);
                return ret;
        }
        /* Remember whether this is the final leaf before the path is released. */
        done = is_last_leaf(path);

        /*
         * Advance the progress key to one past the objectid of the last item
         * in this leaf, so the next call continues after it.
         */
        btrfs_item_key_to_cpu(path->nodes[0], &found,
                              btrfs_header_nritems(path->nodes[0]) - 1);
        fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;

        /*
         * Work on a private copy of the leaf so the path and the rescan lock
         * can be dropped before walking its items.
         */
        scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
        if (!scratch_leaf) {
                ret = -ENOMEM;
                mutex_unlock(&fs_info->qgroup_rescan_lock);
                goto out;
        }
        slot = path->slots[0];
        btrfs_release_path(path);
        mutex_unlock(&fs_info->qgroup_rescan_lock);

        /* Account every extent / metadata item from the found slot onwards. */
        for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
                struct btrfs_backref_walk_ctx ctx = { 0 };

                btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
                if (found.type != BTRFS_EXTENT_ITEM_KEY &&
                    found.type != BTRFS_METADATA_ITEM_KEY)
                        continue;
                /*
                 * Metadata items are accounted as one nodesize; for extent
                 * items the key offset is used as the byte count.
                 */
                if (found.type == BTRFS_METADATA_ITEM_KEY)
                        num_bytes = fs_info->nodesize;
                else
                        num_bytes = found.offset;

                ctx.bytenr = found.objectid;
                ctx.fs_info = fs_info;

                ret = btrfs_find_all_roots(&ctx, false);
                if (ret < 0)
                        goto out;
                /* For rescan, just pass old_roots as NULL */
                ret = btrfs_qgroup_account_extent(trans, found.objectid,
                                                  num_bytes, NULL, ctx.roots);
                if (ret < 0)
                        goto out;
        }
out:
        if (scratch_leaf)
                free_extent_buffer(scratch_leaf);

        /*
         * Last leaf processed without error: report completion (1) and park
         * the progress objectid at (u64)-1, as in the search-failure path.
         */
        if (done && !ret) {
                ret = 1;
                fs_info->qgroup_rescan_progress.objectid = (u64)-1;
        }
        return ret;
}

/*
 * Decide whether the rescan worker has to stop iterating.
 *
 * Return true if the filesystem is closing, a remount is in progress, qgroups
 * are no longer enabled, or a rescan cancel has been requested.
 */
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
        return btrfs_fs_closing(fs_info) ||
               test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
               !btrfs_qgroup_enabled(fs_info) ||
               (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN);
}

/*
 * Work function of the qgroup rescan.
 *
 * Calls qgroup_rescan_leaf() in a loop, one transaction per leaf, until it
 * reports completion (> 0), fails (< 0) or rescan_should_stop() asks to stop.
 * Afterwards the INCONSISTENT/RESCAN flags are updated, the status item is
 * written if any leaf was processed, and waiters on qgroup_rescan_completion
 * are woken.
 *
 * Does nothing in simple quota mode.
 */
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
{
        struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
                                                     qgroup_rescan_work);
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans = NULL;
        int ret = 0;
        bool stopped = false;
        bool did_leaf_rescans = false;

        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
                return;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }
        /*
         * Rescan should only search for commit root, and any later difference
         * should be recorded by qgroup
         */
        path->search_commit_root = 1;
        path->skip_locking = 1;

        /* One transaction per leaf; ret != 0 (done or error) ends the loop. */
        while (!ret && !(stopped = rescan_should_stop(fs_info))) {
                trans = btrfs_start_transaction(fs_info->fs_root, 0);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }

                ret = qgroup_rescan_leaf(trans, path);
                did_leaf_rescans = true;

                /*
                 * The final leaf (ret > 0) commits the transaction, every
                 * other outcome just ends it.
                 * NOTE(review): the return value of btrfs_commit_transaction()
                 * is not checked here.
                 */
                if (ret > 0)
                        btrfs_commit_transaction(trans);
                else
                        btrfs_end_transaction(trans);
        }

out:
        btrfs_free_path(path);

        /*
         * A completed scan clears INCONSISTENT; an error or an early stop
         * sets it.
         */
        mutex_lock(&fs_info->qgroup_rescan_lock);
        if (ret > 0 &&
            fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        } else if (ret < 0 || stopped) {
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        }
        mutex_unlock(&fs_info->qgroup_rescan_lock);

        /*
         * Only update status, since the previous part has already updated the
         * qgroup info, and only if we did any actual work. This also prevents
         * race with a concurrent quota disable, which has already set
         * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
         * btrfs_quota_disable().
         */
        if (did_leaf_rescans) {
                trans = btrfs_start_transaction(fs_info->quota_root, 1);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        trans = NULL;
                        btrfs_err(fs_info,
                                  "fail to start transaction for status update: %d",
                                  ret);
                }
        } else {
                trans = NULL;
        }

        mutex_lock(&fs_info->qgroup_rescan_lock);
        /*
         * RESCAN stays set only when the worker was stopped without a cancel
         * request; in every other case it is cleared here.
         */
        if (!stopped ||
            fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
        if (trans) {
                int ret2 = update_qgroup_status_item(trans);

                if (ret2 < 0) {
                        ret = ret2;
                        btrfs_err(fs_info, "fail to update qgroup status: %d", ret);
                }
        }
        /* Mark the worker finished and wake everybody waiting on it. */
        fs_info->qgroup_rescan_running = false;
        fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
        complete_all(&fs_info->qgroup_rescan_completion);
        mutex_unlock(&fs_info->qgroup_rescan_lock);

        /* No status transaction means no final log message either. */
        if (!trans)
                return;

        btrfs_end_transaction(trans);

        /*
         * NOTE(review): CANCEL_RESCAN was cleared a few lines above, and a
         * cancel request also makes rescan_should_stop() return true (so
         * "stopped" is set), hence the "cancelled" branch below looks
         * unreachable unless the flag is set again concurrently -- confirm.
         */
        if (stopped) {
                btrfs_info(fs_info, "qgroup scan paused");
        } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
                btrfs_info(fs_info, "qgroup scan cancelled");
        } else if (ret >= 0) {
                btrfs_info(fs_info, "qgroup scan completed%s",
                        ret > 0 ? " (inconsistency flag cleared)" : "");
        } else {
                btrfs_err(fs_info, "qgroup scan failed with %d", ret);
        }
}

/*
 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
 * memory required for the rescan context.
 */
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
                   int init_flags)
{
        int err = 0;

        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
                btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode");
                return -EINVAL;
        }

        /*
         * init_flags == 0 means we're resuming a rescan at mount time: it
         * must already be flagged as queued and quota must be on.
         */
        if (!init_flags) {
                if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
                        btrfs_debug(fs_info,
                        "qgroup rescan init failed, qgroup rescan is not queued");
                        return -EINVAL;
                }
                if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
                        btrfs_debug(fs_info,
                        "qgroup rescan init failed, qgroup is not enabled");
                        return -ENOTCONN;
                }
        }

        mutex_lock(&fs_info->qgroup_rescan_lock);

        /*
         * Fresh rescan: refuse if one is already flagged, quota is off or a
         * quota disable is under way; otherwise claim the RESCAN flag.
         */
        if (init_flags) {
                if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
                        err = -EINPROGRESS;
                } else if (!(fs_info->qgroup_flags &
                             BTRFS_QGROUP_STATUS_FLAG_ON)) {
                        btrfs_debug(fs_info,
                        "qgroup rescan init failed, qgroup is not enabled");
                        err = -ENOTCONN;
                } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
                        /* Quota disable is in progress */
                        err = -EBUSY;
                }

                if (err)
                        goto unlock;

                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
        }

        /* Reset the progress key and the runtime flags, re-arm the completion. */
        memset(&fs_info->qgroup_rescan_progress, 0,
                sizeof(fs_info->qgroup_rescan_progress));
        fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
                                   BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
        fs_info->qgroup_rescan_progress.objectid = progress_objectid;
        init_completion(&fs_info->qgroup_rescan_completion);

unlock:
        mutex_unlock(&fs_info->qgroup_rescan_lock);
        if (err)
                return err;

        btrfs_init_work(&fs_info->qgroup_rescan_work,
                        btrfs_qgroup_rescan_worker, NULL);
        return 0;
}

/*
 * Reset the referenced/exclusive counters (plain and compressed) of every
 * qgroup in fs_info->qgroup_tree to zero and mark each qgroup dirty.
 */
static void
qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
{
        struct rb_node *pos;

        spin_lock(&fs_info->qgroup_lock);
        pos = rb_first(&fs_info->qgroup_tree);
        while (pos) {
                struct btrfs_qgroup *qg;

                qg = rb_entry(pos, struct btrfs_qgroup, node);
                qg->rfer = 0;
                qg->rfer_cmpr = 0;
                qg->excl = 0;
                qg->excl_cmpr = 0;
                qgroup_dirty(fs_info, qg);

                pos = rb_next(pos);
        }
        spin_unlock(&fs_info->qgroup_lock);
}

/*
 * Start a fresh qgroup rescan.
 *
 * Initializes the rescan state, commits the running transaction (if there is
 * one), zeroes all qgroup tracking counters and queues the rescan worker.
 *
 * Return 0 if the worker was queued, or a negative errno from
 * qgroup_rescan_init(), from attaching to the transaction or from committing
 * it.  In the latter two cases the RESCAN status flag is cleared again.
 */
int
btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
{
        struct btrfs_trans_handle *trans;
        int ret;

        ret = qgroup_rescan_init(fs_info, 0, 1);
        if (ret)
                return ret;

        /*
         * The rescan progress is now 0, so no more delayed refs will be
         * accounted by btrfs_qgroup_account_ref.  One call may however be
         * right past btrfs_find_all_roots and would still do its accounting.
         * Committing the transaction runs all delayed refs, and only after
         * that do we clear the tracking information for a clean start.
         *
         * -ENOENT from the attach means there is no transaction to commit
         * and is not an error; ret stays 0 in that case.
         */
        trans = btrfs_attach_transaction_barrier(fs_info->fs_root);
        if (!IS_ERR(trans))
                ret = btrfs_commit_transaction(trans);
        else if (PTR_ERR(trans) != -ENOENT)
                ret = PTR_ERR(trans);

        if (ret) {
                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
                return ret;
        }

        qgroup_rescan_zero_tracking(fs_info);

        mutex_lock(&fs_info->qgroup_rescan_lock);
        fs_info->qgroup_rescan_running = true;
        btrfs_queue_work(fs_info->qgroup_rescan_workers,
                         &fs_info->qgroup_rescan_work);
        mutex_unlock(&fs_info->qgroup_rescan_lock);

        return 0;
}

/*
 * Wait until a running qgroup rescan has signalled its completion.
 *
 * Returns 0 right away if no rescan is marked as running.  With
 * @interruptible set, the result of wait_for_completion_interruptible() is
 * returned; otherwise the wait is uninterruptible and 0 is returned.
 */
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
                                     bool interruptible)
{
        bool rescan_running;

        /* Sample the running state under the rescan lock. */
        mutex_lock(&fs_info->qgroup_rescan_lock);
        rescan_running = fs_info->qgroup_rescan_running;
        mutex_unlock(&fs_info->qgroup_rescan_lock);

        if (!rescan_running)
                return 0;

        if (!interruptible) {
                wait_for_completion(&fs_info->qgroup_rescan_completion);
                return 0;
        }

        return wait_for_completion_interruptible(
                                &fs_info->qgroup_rescan_completion);
}

/*
 * this is only called from open_ctree where we're still single threaded, thus
 * locking is omitted here.
 */
void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
        /* Nothing to resume unless a rescan is flagged in the status flags. */
        if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN))
                return;

        /* Mark the rescan as running and hand it to the rescan workqueue. */
        mutex_lock(&fs_info->qgroup_rescan_lock);
        fs_info->qgroup_rescan_running = true;
        btrfs_queue_work(fs_info->qgroup_rescan_workers,
                         &fs_info->qgroup_rescan_work);
        mutex_unlock(&fs_info->qgroup_rescan_lock);
}

/*
 * Iterate over rb-tree nodes in rb_next() order, beginning at @start.
 *
 * @next is loaded with rb_next(@node) before each run of the loop body, and
 * the loop advances through @next, so the body may remove @node from the tree.
 * Both @node and @next must be assignable struct rb_node pointers.
 */
#define rbtree_iterate_from_safe(node, next, start)                                \
       for (node = start; node && ({ next = rb_next(node); 1;}); node = next)

/*
 * Undo the EXTENT_QGROUP_RESERVED marking for the entries of @reserved that
 * overlap [@start, @start + @len).
 *
 * Each overlapping entry is cleared from @inode's io_tree over its whole
 * recorded range (even if it only partially overlaps), removed from
 * @reserved->range_changed, and its length is subtracted from
 * @reserved->bytes_changed.
 *
 * Return 0 on success or the first negative value returned by
 * clear_extent_bits(); later entries are still processed after a failure.
 */
static int qgroup_unreserve_range(struct btrfs_inode *inode,
                                  struct extent_changeset *reserved, u64 start,
                                  u64 len)
{
        struct rb_node *node;
        struct rb_node *next;
        struct ulist_node *entry;
        int ret = 0;

        node = reserved->range_changed.root.rb_node;
        if (!node)
                return 0;
        /*
         * Descend from the root towards @start.  The tree is not empty here,
         * so the loop runs at least once and @entry ends up at the last node
         * visited on the way down.
         */
        while (node) {
                entry = rb_entry(node, struct ulist_node, rb_node);
                if (entry->val < start)
                        node = node->rb_right;
                else
                        node = node->rb_left;
        }

        /*
         * If the descent ended on an entry starting after @start, step back
         * one entry: the previous one may still reach into the range.
         */
        if (entry->val > start && rb_prev(&entry->rb_node))
                entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
                                 rb_node);

        /* Entries are deleted while iterating, hence the "safe" iterator. */
        rbtree_iterate_from_safe(node, next, &entry->rb_node) {
                u64 entry_start;
                u64 entry_end;
                u64 entry_len;
                int clear_ret;

                entry = rb_entry(node, struct ulist_node, rb_node);
                /* ->val is the start and ->aux the inclusive end of the entry. */
                entry_start = entry->val;
                entry_end = entry->aux;
                entry_len = entry_end - entry_start + 1;

                /* Past the end of the range: nothing further can overlap. */
                if (entry_start >= start + len)
                        break;
                /* Entirely before the range: skip it. */
                if (entry_start + entry_len <= start)
                        continue;
                /*
                 * Now the entry is in [start, start + len), revert the
                 * EXTENT_QGROUP_RESERVED bit.
                 */
                clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
                                              entry_end, EXTENT_QGROUP_RESERVED);
                /* Keep only the first error, but carry on with the rest. */
                if (!ret && clear_ret < 0)
                        ret = clear_ret;

                ulist_del(&reserved->range_changed, entry->val, entry->aux);
                /* Guard the byte counter against underflow. */
                if (likely(reserved->bytes_changed >= entry_len)) {
                        reserved->bytes_changed -= entry_len;
                } else {
                        WARN_ON(1);
                        reserved->bytes_changed = 0;
                }
        }

        return ret;
}

/*
 * Try to free some space for qgroup.
 *
 * For qgroup, there are only 3 ways to free qgroup space:
 * - Flush nodatacow write
 *   Any nodatacow write will free its reserved data space at run_delalloc_range().
 *   In theory, we should only flush nodatacow inodes, but it's not yet
 *   possible, so we need to flush the whole root.
 *
 * - Wait for ordered extents
 *   When ordered extents are finished, their reserved metadata is finally
 *   converted to per_trans status, which can be freed by later commit
 *   transaction.
 *
 * - Commit transaction
 *   This would free the meta_per_trans space.
 *   In theory this shouldn't free much space, but at this point any extra
 *   qgroup space helps.
 */
static int try_flush_qgroup(struct btrfs_root *root)
{
        struct btrfs_trans_handle *trans;
        int ret;

        /* Can't hold an open transaction or we run the risk of deadlocking. */
        ASSERT(current->journal_info == NULL);
        if (WARN_ON(current->journal_info))
                return 0;

        /*
         * Only one flush per root at a time: if somebody else is already
         * flushing, wait for them to finish and return without flushing again.
         */
        if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
                wait_event(root->qgroup_flush_wait,
                        !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
                return 0;
        }

        ret = btrfs_start_delalloc_snapshot(root, true);
        if (ret >= 0) {
                btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);

                /* -ENOENT just means there is no transaction to commit. */
                trans = btrfs_attach_transaction_barrier(root);
                if (!IS_ERR(trans))
                        ret = btrfs_commit_transaction(trans);
                else if (PTR_ERR(trans) == -ENOENT)
                        ret = 0;
                else
                        ret = PTR_ERR(trans);
        }

        /* Let other would-be flushers of this root proceed. */
        clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
        wake_up(&root->qgroup_flush_wait);
        return ret;
}

/*
 * Single attempt at reserving qgroup data space for [@start, @start + @len).
 *
 * Sets EXTENT_QGROUP_RESERVED on the range in @inode's io_tree, recording the
 * newly set bytes in *@reserved_ret (allocated here if NULL), and reserves
 * exactly those bytes with qgroup_reserve().
 *
 * On failure the recorded range is reverted and a changeset allocated by this
 * call is freed, with *@reserved_ret reset to NULL.
 *
 * Return the non-negative result of qgroup_reserve() on success (0 if qgroups
 * are disabled, @root is not a fs tree or @len is 0), -EINVAL if
 * @reserved_ret is NULL, -ENOMEM if the changeset cannot be allocated, or
 * another negative errno.
 */
static int qgroup_reserve_data(struct btrfs_inode *inode,
                        struct extent_changeset **reserved_ret, u64 start,
                        u64 len)
{
        struct btrfs_root *root = inode->root;
        struct extent_changeset *cs;
        bool allocated_here = false;
        u64 bytes_before;
        u64 newly_reserved;
        int ret;

        if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED)
                return 0;
        if (!is_fstree(btrfs_root_id(root)) || len == 0)
                return 0;

        /* @reserved parameter is mandatory for qgroup */
        if (WARN_ON(!reserved_ret))
                return -EINVAL;

        if (!*reserved_ret) {
                *reserved_ret = extent_changeset_alloc();
                if (!*reserved_ret)
                        return -ENOMEM;
                allocated_here = true;
        }
        cs = *reserved_ret;

        /* Only the bytes this call adds to the changeset get reserved. */
        bytes_before = cs->bytes_changed;
        ret = set_record_extent_bits(&inode->io_tree, start,
                                     start + len - 1, EXTENT_QGROUP_RESERVED,
                                     cs);
        newly_reserved = cs->bytes_changed - bytes_before;

        trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
                                        newly_reserved, QGROUP_RESERVE);

        if (ret >= 0) {
                ret = qgroup_reserve(root, newly_reserved, true,
                                     BTRFS_QGROUP_RSV_DATA);
                if (ret >= 0)
                        return ret;

                /* Reservation refused: take the range marking back. */
                qgroup_unreserve_range(inode, cs, start, len);
        }

        if (allocated_here) {
                extent_changeset_free(cs);
                *reserved_ret = NULL;
        }
        return ret;
}

/*
 * Reserve qgroup space for range [start, start + len).
 *
 * This function will either reserve space from related qgroups or do nothing
 * if the range is already reserved.
 *
 * Return 0 for successful reservation
 * Return <0 for error (including -EDQUOT)
 *
 * NOTE: This function may sleep for memory allocation, dirty page flushing and
 *         commit transaction. So caller should not hold any dirty page locked.
 */
int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
                        struct extent_changeset **reserved_ret, u64 start,
                        u64 len)
{
        int ret;

        ret = qgroup_reserve_data(inode, reserved_ret, start, len);

        /* Only -EDQUOT (or a positive result) goes on to flush and retry. */
        if (ret != -EDQUOT && ret <= 0)
                return ret;

        /* Try to free up qgroup space, then give it exactly one more go. */
        ret = try_flush_qgroup(inode->root);
        if (ret < 0)
                return ret;

        return qgroup_reserve_data(inode, reserved_ret, start, len);
}

/* Free ranges specified by @reserved, normally in error path */
static int qgroup_free_reserved_data(struct btrfs_inode *inode,
                                     struct extent_changeset *reserved,
                                     u64 start, u64 len, u64 *freed_ret)
{
        struct btrfs_root *root = inode->root;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
        struct extent_changeset changeset;
        u64 freed = 0;
        u64 end;
        int ret;

        extent_changeset_init(&changeset);

        /*
         * Expand the request to sector boundaries.  The exclusive end is kept
         * in its own variable: storing the rounded-up end offset in @len and
         * then testing against "start + len" would stretch the window by
         * @start bytes beyond what the caller asked for.
         */
        end = round_up(start + len, root->fs_info->sectorsize);
        start = round_down(start, root->fs_info->sectorsize);

        ULIST_ITER_INIT(&uiter);
        while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
                u64 range_start = unode->val;
                /* unode->aux is the inclusive end */
                u64 range_len = unode->aux - range_start + 1;
                u64 free_start;
                u64 free_len;

                /* Start every iteration with an empty scratch changeset. */
                extent_changeset_release(&changeset);

                /* Only free range in range [start, end) */
                if (range_start >= end ||
                    range_start + range_len <= start)
                        continue;
                /* Clip the recorded range to the requested window. */
                free_start = max(range_start, start);
                free_len = min(end, range_start + range_len) - free_start;
                /*
                 * TODO: To also modify reserved->ranges_reserved to reflect
                 * the modification.
                 *
                 * However as long as we free qgroup reserved according to
                 * EXTENT_QGROUP_RESERVED, we won't double free.
                 * So not need to rush.
                 */
                ret = clear_record_extent_bits(&inode->io_tree, free_start,
                                free_start + free_len - 1,
                                EXTENT_QGROUP_RESERVED, &changeset);
                if (ret < 0)
                        goto out;
                /* Count only the bytes whose bit was actually cleared. */
                freed += changeset.bytes_changed;
        }
        btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed,
                                  BTRFS_QGROUP_RSV_DATA);
        if (freed_ret)
                *freed_ret = freed;
        ret = 0;
out:
        extent_changeset_release(&changeset);
        return ret;
}

/*
 * Common implementation of btrfs_qgroup_free_data() and
 * btrfs_qgroup_release_data().
 *
 * Clears EXTENT_QGROUP_RESERVED on [@start, @start + @len) in @inode's
 * io_tree.  With @free set, the cleared bytes are also returned to the
 * qgroup reservations of the inode's root; if @reserved is given as well,
 * only the ranges recorded in @reserved are freed.
 *
 * @released, if not NULL, receives the number of bytes cleared.  It is not
 * written when qgroups are disabled or when the clear fails.
 *
 * Return 0 on success or a negative errno.
 */
static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
                        struct extent_changeset *reserved, u64 start, u64 len,
                        u64 *released, int free)
{
        struct extent_changeset changeset;
        int trace_op = QGROUP_RELEASE;
        int ret;

        if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
                extent_changeset_init(&changeset);
                ret = clear_record_extent_bits(&inode->io_tree, start,
                                               start + len - 1,
                                               EXTENT_QGROUP_RESERVED, &changeset);
                /*
                 * The recorded ranges are of no use here, but they still
                 * have to be released or they would be leaked.
                 */
                extent_changeset_release(&changeset);
                return ret;
        }

        /* In release case, we shouldn't have @reserved */
        WARN_ON(!free && reserved);
        if (free && reserved)
                return qgroup_free_reserved_data(inode, reserved, start, len, released);
        extent_changeset_init(&changeset);
        ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
                                       EXTENT_QGROUP_RESERVED, &changeset);
        if (ret < 0)
                goto out;

        if (free)
                trace_op = QGROUP_FREE;
        trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
                                        changeset.bytes_changed, trace_op);
        /* Only the "free" flavour returns the bytes to the qgroups. */
        if (free)
                btrfs_qgroup_free_refroot(inode->root->fs_info,
                                btrfs_root_id(inode->root),
                                changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
        if (released)
                *released = changeset.bytes_changed;
out:
        extent_changeset_release(&changeset);
        return ret;
}

/*
 * Free a reserved space range from io_tree and related qgroups
 *
 * Should be called when a range of pages get invalidated before reaching disk.
 * Or for error cleanup case.
 * if @reserved is given, only reserved range in [@start, @start + @len) will
 * be freed.
 *
 * For data written to disk, use btrfs_qgroup_release_data().
 *
 * NOTE: This function may sleep for memory allocation.
 */
int btrfs_qgroup_free_data(struct btrfs_inode *inode,
                           struct extent_changeset *reserved,
                           u64 start, u64 len, u64 *freed)
{
        /* Non-zero: also hand the bytes back to the qgroup reservations. */
        const int free_rsv = 1;

        return __btrfs_qgroup_release_data(inode, reserved, start, len, freed,
                                           free_rsv);
}

/*
 * Release a reserved space range from io_tree only.
 *
 * Should be called when a range of pages get written to disk and corresponding
 * FILE_EXTENT is inserted into corresponding root.
 *
 * Since new qgroup accounting framework will only update qgroup numbers at
 * commit_transaction() time, its reserved space shouldn't be freed from
 * related qgroups.
 *
 * But we should release the range from io_tree, to allow further write to be
 * COWed.
 *
 * NOTE: This function may sleep for memory allocation.
 */
int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released)
{
        /* Zero: clear the io_tree range only, keep the qgroup reservation. */
        const int free_rsv = 0;

        return __btrfs_qgroup_release_data(inode, NULL, start, len, released,
                                           free_rsv);
}

static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
                              enum btrfs_qgroup_rsv_type type)
{
        if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
            type != BTRFS_QGROUP_RSV_META_PERTRANS)
                return;
        if (num_bytes == 0)
                return;

        spin_lock(&root->qgroup_meta_rsv_lock);
        if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
                root->qgroup_meta_rsv_prealloc += num_bytes;
        else
                root->qgroup_meta_rsv_pertrans += num_bytes;
        spin_unlock(&root->qgroup_meta_rsv_lock);
}

static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
                             enum btrfs_qgroup_rsv_type type)
{
        if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
            type != BTRFS_QGROUP_RSV_META_PERTRANS)
                return 0;
        if (num_bytes == 0)
                return 0;

        spin_lock(&root->qgroup_meta_rsv_lock);
        if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
                num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
                                  num_bytes);
                root->qgroup_meta_rsv_prealloc -= num_bytes;
        } else {
                num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
                                  num_bytes);
                root->qgroup_meta_rsv_pertrans -= num_bytes;
        }
        spin_unlock(&root->qgroup_meta_rsv_lock);
        return num_bytes;
}

/*
 * Reserve @num_bytes of metadata space of @type for @root.
 *
 * Nothing is done (and 0 is returned) when qgroups are disabled, @root is
 * not a fs tree, or @num_bytes is 0.  @num_bytes must be a multiple of the
 * filesystem nodesize.
 *
 * Return: the result of qgroup_reserve(); negative on failure.
 */
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
                              enum btrfs_qgroup_rsv_type type, bool enforce)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;

        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
                return 0;
        if (!is_fstree(btrfs_root_id(root)))
                return 0;
        if (num_bytes == 0)
                return 0;

        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
        trace_qgroup_meta_reserve(root, (s64)num_bytes, type);

        ret = qgroup_reserve(root, num_bytes, enforce, type);
        if (ret >= 0) {
                /*
                 * Remember on the root what was reserved.
                 *
                 * This guards against the quota disabled->enabled
                 * underflow: a later release may try to free space that
                 * was never reserved (quota was off at the time), so the
                 * per-root record is what bounds any later release.
                 */
                add_root_meta_rsv(root, num_bytes, type);
        }

        return ret;
}

/*
 * Reserve metadata space, flushing and retrying once when allowed.
 *
 * The first attempt's result is returned as-is when @noflush is set, or
 * when it is 0 or a negative error other than -EDQUOT.  Otherwise
 * try_flush_qgroup() is called and, if that succeeds, the reservation is
 * attempted a second time.
 */
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
                                enum btrfs_qgroup_rsv_type type, bool enforce,
                                bool noflush)
{
        int ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
        const bool retry = !noflush && (ret > 0 || ret == -EDQUOT);

        if (!retry)
                return ret;

        ret = try_flush_qgroup(root);
        if (ret < 0)
                return ret;

        return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}

/*
 * Free every per-transaction metadata reservation of @root.
 *
 * Per-transaction meta reservation should be all freed at transaction commit
 * time.  Nothing is done when qgroups are disabled or @root is not a fs tree.
 */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;

        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
                return;
        if (!is_fstree(btrfs_root_id(root)))
                return;

        /* TODO: Update trace point to handle such free */
        trace_qgroup_meta_free_all_pertrans(root);

        /* The special size (u64)-1 asks for all reserved space to be freed. */
        btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1,
                                  BTRFS_QGROUP_RSV_META_PERTRANS);
}

void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
                              enum btrfs_qgroup_rsv_type type)
{
        struct btrfs_fs_info *fs_info = root->fs_info;

        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
            !is_fstree(btrfs_root_id(root)))
                return;

        /*
         * reservation for META_PREALLOC can happen before quota is enabled,
         * which can lead to underflow.
         * Here ensure we will only free what we really have reserved.
         */
        num_bytes = sub_root_meta_rsv(root, num_bytes, type);
        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
        trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
        btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type);
}

/*
 * Move @num_bytes from META_PREALLOC to META_PERTRANS for the qgroup of
 * @ref_root and for every qgroup reachable through its ->groups lists.
 *
 * The PERTRANS side is only added while the superblock is not read-only.
 * Runs under fs_info->qgroup_lock.
 */
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
                                int num_bytes)
{
        struct btrfs_qgroup *start;
        struct btrfs_qgroup *cur;
        LIST_HEAD(qgroup_list);

        if (num_bytes == 0 || !fs_info->quota_root)
                return;

        spin_lock(&fs_info->qgroup_lock);
        start = find_qgroup_rb(fs_info, ref_root);
        if (start) {
                qgroup_iterator_add(&qgroup_list, start);

                /* The list grows while it is walked: parents are appended. */
                list_for_each_entry(cur, &qgroup_list, iterator) {
                        struct btrfs_qgroup_list *glist;

                        qgroup_rsv_release(fs_info, cur, num_bytes,
                                           BTRFS_QGROUP_RSV_META_PREALLOC);
                        if (!sb_rdonly(fs_info->sb))
                                qgroup_rsv_add(fs_info, cur, num_bytes,
                                               BTRFS_QGROUP_RSV_META_PERTRANS);

                        list_for_each_entry(glist, &cur->groups, next_group)
                                qgroup_iterator_add(&qgroup_list, glist->group);
                }
        }
        qgroup_iterator_clean(&qgroup_list);
        spin_unlock(&fs_info->qgroup_lock);
}

/*
 * Convert @num_bytes of META_PREALLOC reservation of @root to META_PERTRANS.
 *
 * This is called when preallocated meta reservation needs to be used.
 * Normally after btrfs_join_transaction() call.
 *
 * Nothing is done when qgroups are disabled or @root is not a fs tree.
 */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int converted;

        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
                return;
        if (!is_fstree(btrfs_root_id(root)))
                return;

        /* Clamp to what the root has recorded as preallocated. */
        converted = sub_root_meta_rsv(root, num_bytes,
                                      BTRFS_QGROUP_RSV_META_PREALLOC);
        trace_qgroup_meta_convert(root, converted);
        qgroup_convert_meta(fs_info, btrfs_root_id(root), converted);

        /* On a read-only superblock nothing is recorded as PERTRANS. */
        if (sb_rdonly(fs_info->sb))
                return;

        add_root_meta_rsv(root, converted, BTRFS_QGROUP_RSV_META_PERTRANS);
}

/*
 * Detect and clean up leaked qgroup data reservations of @inode, normally
 * at destroy inode time.
 *
 * Every EXTENT_QGROUP_RESERVED range still set in the inode's io_tree is
 * cleared; if any bytes were cleared, each range is reported with a warning
 * and the total is freed from the root's data reservation.
 */
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
        struct extent_changeset changeset;
        struct ulist_iterator iter;
        struct ulist_node *unode;
        int ret;

        extent_changeset_init(&changeset);
        ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
                                       EXTENT_QGROUP_RESERVED, &changeset);
        WARN_ON(ret < 0);

        if (WARN_ON(changeset.bytes_changed)) {
                struct btrfs_fs_info *fs_info = inode->root->fs_info;

                ULIST_ITER_INIT(&iter);
                for (unode = ulist_next(&changeset.range_changed, &iter);
                     unode;
                     unode = ulist_next(&changeset.range_changed, &iter)) {
                        btrfs_warn(fs_info,
                "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
                                   btrfs_ino(inode), unode->val, unode->aux);
                }

                btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
                                          changeset.bytes_changed,
                                          BTRFS_QGROUP_RSV_DATA);
        }

        extent_changeset_release(&changeset);
}

/*
 * Initialize @swapped_blocks: its lock, one empty rbtree per tree level,
 * and the "swapped" flag cleared.
 */
void btrfs_qgroup_init_swapped_blocks(
        struct btrfs_qgroup_swapped_blocks *swapped_blocks)
{
        int level = 0;

        spin_lock_init(&swapped_blocks->lock);
        while (level < BTRFS_MAX_LEVEL) {
                swapped_blocks->blocks[level] = RB_ROOT;
                level++;
        }
        swapped_blocks->swapped = false;
}

/*
 * Delete all swapped blocks record of @root.
 * Every record here means we skipped a full subtree scan for qgroup.
 *
 * Gets called when committing one transaction.
 */
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
{
        struct btrfs_qgroup_swapped_blocks *swapped_blocks;
        int i;

        swapped_blocks = &root->swapped_blocks;

        spin_lock(&swapped_blocks->lock);
        if (!swapped_blocks->swapped)
                goto out;
        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
                struct rb_root *cur_root = &swapped_blocks->blocks[i];
                struct btrfs_qgroup_swapped_block *entry;
                struct btrfs_qgroup_swapped_block *next;

                rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
                                                     node)
                        kfree(entry);
                swapped_blocks->blocks[i] = RB_ROOT;
        }
        swapped_blocks->swapped = false;
out:
        spin_unlock(&swapped_blocks->lock);
}

/*
 * Add subtree roots record into @subvol_root.
 *
 * @trans:                not referenced in this function body
 * @subvol_root:        tree root of the subvolume tree get swapped
 * @bg:                        block group under balance
 * @subvol_parent/slot:        pointer to the subtree root in subvolume tree
 * @reloc_parent/slot:        pointer to the subtree root in reloc tree
 *                        BOTH POINTERS ARE BEFORE TREE SWAP
 * @last_snapshot:        last snapshot generation of the subvolume tree
 *
 * The record is stored in the per-level rbtree of @subvol_root's
 * swapped_blocks, keyed by the post-swap subvolume bytenr.
 *
 * Return: 0 when the record was inserted, when an identical record already
 * exists, or when full qgroup accounting is not enabled; -EUCLEAN when the
 * subvolume pointer generation is newer than the reloc one; -ENOMEM on
 * allocation failure; -EEXIST when a record with the same bytenr but
 * different contents exists.  For -ENOMEM and -EEXIST the qgroup is also
 * marked inconsistent.
 */
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
                struct btrfs_root *subvol_root,
                struct btrfs_block_group *bg,
                struct extent_buffer *subvol_parent, int subvol_slot,
                struct extent_buffer *reloc_parent, int reloc_slot,
                u64 last_snapshot)
{
        struct btrfs_fs_info *fs_info = subvol_root->fs_info;
        struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
        struct btrfs_qgroup_swapped_block *block;
        struct rb_node **cur;
        struct rb_node *parent = NULL;
        /* The recorded block is the child of @subvol_parent, one level down. */
        int level = btrfs_header_level(subvol_parent) - 1;
        int ret = 0;

        if (!btrfs_qgroup_full_accounting(fs_info))
                return 0;

        /*
         * Sanity check on argument order: the subvolume side pointer must
         * not carry a newer generation than the reloc side pointer.  This
         * path returns directly and does not go through the "out" label.
         */
        if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
            btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
                btrfs_err_rl(fs_info,
                "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
                        __func__,
                        btrfs_node_ptr_generation(subvol_parent, subvol_slot),
                        btrfs_node_ptr_generation(reloc_parent, reloc_slot));
                return -EUCLEAN;
        }

        block = kmalloc(sizeof(*block), GFP_NOFS);
        if (!block) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * @reloc_parent/slot is still before swap, while @block is going to
         * record the bytenr after swap, so we do the swap here.
         */
        block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
        block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
                                                             reloc_slot);
        block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
        block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
                                                            subvol_slot);
        block->last_snapshot = last_snapshot;
        block->level = level;

        /*
         * If we have bg == NULL, we're called from btrfs_recover_relocation(),
         * no one else can modify tree blocks, thus qgroup numbers will not
         * change no matter the value of trace_leaf.
         */
        if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
                block->trace_leaf = true;
        else
                block->trace_leaf = false;
        btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);

        /* Insert @block into @blocks */
        spin_lock(&blocks->lock);
        cur = &blocks->blocks[level].rb_node;
        while (*cur) {
                struct btrfs_qgroup_swapped_block *entry;

                parent = *cur;
                entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
                                 node);

                /*
                 * Entries with a smaller bytenr than the new block send the
                 * search left, larger ones send it right.
                 */
                if (entry->subvol_bytenr < block->subvol_bytenr) {
                        cur = &(*cur)->rb_left;
                } else if (entry->subvol_bytenr > block->subvol_bytenr) {
                        cur = &(*cur)->rb_right;
                } else {
                        if (entry->subvol_generation !=
                                        block->subvol_generation ||
                            entry->reloc_bytenr != block->reloc_bytenr ||
                            entry->reloc_generation !=
                                        block->reloc_generation) {
                                /*
                                 * Duplicated but mismatch entry found.
                                 * Shouldn't happen.
                                 *
                                 * Marking qgroup inconsistent should be enough
                                 * for end users.
                                 */
                                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
                                ret = -EEXIST;
                        }
                        /* Same bytenr already recorded: drop the new copy. */
                        kfree(block);
                        goto out_unlock;
                }
        }
        /* No entry with this bytenr: link the new record at the leaf slot. */
        rb_link_node(&block->node, parent, cur);
        rb_insert_color(&block->node, &blocks->blocks[level]);
        blocks->swapped = true;
out_unlock:
        spin_unlock(&blocks->lock);
out:
        if (ret < 0)
                qgroup_mark_inconsistent(fs_info);
        return ret;
}

/*
 * Check if the tree block is a subtree root, and if so do the needed
 * delayed subtree trace for qgroup.
 *
 * This is called during btrfs_cow_block().
 *
 * @trans:        transaction handle passed on to qgroup_trace_subtree_swap()
 * @root:        root whose swapped blocks records are searched
 * @subvol_eb:        tree block being checked; its start bytenr is the lookup key
 *
 * A matching record is removed from the per-level rbtree, the reloc side
 * subtree root it names is read, and the subtree swap is traced.  The
 * record is freed in all cases once it has been removed.
 *
 * Return: 0 when there is nothing to do or the trace succeeded; a negative
 * errno when reading the reloc block or tracing fails, in which case the
 * qgroup is also marked inconsistent.
 */
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root,
                                         struct extent_buffer *subvol_eb)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_tree_parent_check check = { 0 };
        struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
        struct btrfs_qgroup_swapped_block *block;
        struct extent_buffer *reloc_eb = NULL;
        struct rb_node *node;
        bool found = false;
        bool swapped = false;
        int level = btrfs_header_level(subvol_eb);
        int ret = 0;
        int i;

        if (!btrfs_qgroup_full_accounting(fs_info))
                return 0;
        if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root)
                return 0;

        spin_lock(&blocks->lock);
        if (!blocks->swapped) {
                spin_unlock(&blocks->lock);
                return 0;
        }
        node = blocks->blocks[level].rb_node;

        /* Same ordering as the insertion: smaller entries send us left. */
        while (node) {
                block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
                if (block->subvol_bytenr < subvol_eb->start) {
                        node = node->rb_left;
                } else if (block->subvol_bytenr > subvol_eb->start) {
                        node = node->rb_right;
                } else {
                        found = true;
                        break;
                }
        }
        if (!found) {
                spin_unlock(&blocks->lock);
                goto out;
        }
        /* Found one, remove it from @blocks first and update blocks->swapped */
        rb_erase(&block->node, &blocks->blocks[level]);
        /*
         * Records remain only if at least one level still has a non-empty
         * tree; only then must ->swapped stay set.
         */
        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
                if (!RB_EMPTY_ROOT(&blocks->blocks[i])) {
                        swapped = true;
                        break;
                }
        }
        blocks->swapped = swapped;
        spin_unlock(&blocks->lock);

        /* @block is off the tree now; this function owns and frees it. */
        check.level = block->level;
        check.transid = block->reloc_generation;
        check.has_first_key = true;
        memcpy(&check.first_key, &block->first_key, sizeof(check.first_key));

        /* Read out reloc subtree root */
        reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check);
        if (IS_ERR(reloc_eb)) {
                ret = PTR_ERR(reloc_eb);
                /* Do not hand an error pointer to free_extent_buffer(). */
                reloc_eb = NULL;
                goto free_out;
        }
        if (!extent_buffer_uptodate(reloc_eb)) {
                ret = -EIO;
                goto free_out;
        }

        ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
                        block->last_snapshot, block->trace_leaf);
free_out:
        kfree(block);
        free_extent_buffer(reloc_eb);
out:
        if (ret < 0) {
                btrfs_err_rl(fs_info,
                             "failed to account subtree at bytenr %llu: %d",
                             subvol_eb->start, ret);
                qgroup_mark_inconsistent(fs_info);
        }
        return ret;
}

/*
 * Free every qgroup extent record in @trans's dirty_extent_root tree,
 * including each record's old_roots ulist, and reset the tree to empty.
 */
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
{
        struct rb_root *records = &trans->delayed_refs.dirty_extent_root;
        struct btrfs_qgroup_extent_record *rec;
        struct btrfs_qgroup_extent_record *tmp;

        /* Post-order walk so each node can be freed as it is visited. */
        rbtree_postorder_for_each_entry_safe(rec, tmp, records, node) {
                ulist_free(rec->old_roots);
                kfree(rec);
        }
        *records = RB_ROOT;
}

/*
 * Free @rsv_bytes of data reservation for root id @root.
 *
 * Only acts in simple quota mode and only when @root is a fs tree.
 */
void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
{
        if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE &&
            is_fstree(root))
                btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes,
                                          BTRFS_QGROUP_RSV_DATA);
}

/*
 * Apply a simple quota delta to the qgroup of @delta->root and to every
 * qgroup reachable through its ->groups lists.
 *
 * Both ->excl and ->rfer are adjusted by @delta->num_bytes, added when
 * @delta->is_inc is set and subtracted otherwise, and each touched qgroup
 * is marked dirty.
 *
 * Return: 0 on success or when there is nothing to do (not in simple quota
 * mode, not a fs tree, or the extent generation predates quota enablement);
 * -ENOENT when no qgroup exists for the root.
 */
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
                              struct btrfs_squota_delta *delta)
{
        struct btrfs_qgroup *start;
        struct btrfs_qgroup *qg;
        LIST_HEAD(qgroup_list);
        u64 root = delta->root;
        u64 num_bytes = delta->num_bytes;
        const int sign = (delta->is_inc ? 1 : -1);
        int ret = -ENOENT;

        if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
            !is_fstree(root))
                return 0;

        /* If the extent predates enabling quotas, don't count it. */
        if (delta->generation < fs_info->qgroup_enable_gen)
                return 0;

        spin_lock(&fs_info->qgroup_lock);
        start = find_qgroup_rb(fs_info, root);
        if (start) {
                ret = 0;
                qgroup_iterator_add(&qgroup_list, start);

                /* The list grows while it is walked: parents are appended. */
                list_for_each_entry(qg, &qgroup_list, iterator) {
                        struct btrfs_qgroup_list *glist;

                        qg->excl += num_bytes * sign;
                        qg->rfer += num_bytes * sign;
                        qgroup_dirty(fs_info, qg);

                        list_for_each_entry(glist, &qg->groups, next_group)
                                qgroup_iterator_add(&qgroup_list, glist->group);
                }
                qgroup_iterator_clean(&qgroup_list);
        }
        spin_unlock(&fs_info->qgroup_lock);

        return ret;
}











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   16 
   16 














    1 
    1 



























































    2 











    2 

    2 
















































































































































    1 




    1 





















    3 
    3 



    3 






































    3 
    3 
















    1 
    1 














































    3 
    3 


























    2 
    2 





















    2 
    2 



























    1 
    1 

















    3 



    2 










































    1 
    1 






























   13 
   15 



   16 




















    4 
    4 










    4 


















































































    6 














    6 



    6 



    6 













    6 

    6 





















    1 
    1 


















    4 


    4 











    2 


    2 














    2 


    2 













    2 


    2 












    1 


    1 















    2 


    2 














    2 


    2 

















    4 
    4 




    4 















    1 


    1 














    3 


    3 













    3 


    4 











    1 
    1 















    4 


    4 






























    2 


    2 












    1 


    1 














    2 


    2 














    2 


    2 













    2 


    2 







































    3 
    3 




    3 






    3 












    1 


    2 















    2 


    2 

















   32 


   34 

















    8 


    9 













   10 


   10 











    6 


    6 









































































































































































































































    5 
    5 






























































































































































































   14 
   15 












   14 
   12 



   14 












    5 
    5 









    8 


    8 



    7 


















    1 
    1 



























    1 

    2 





































    2 
    3 












    4 
    4 





























    2 
    2 

















    1 
    1 












    1 
    1 














































    8 


    8 



    8 














    7 
    7 














    1 
    1 






































































































































































































































































































































































    8 

    9 




































































































































































































    2 
    2 



































































































































































































































































































































































   19 
   20 

   17 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Security plug functions
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2016 Mellanox Technologies
 * Copyright (C) 2023 Microsoft Corporation <paul@paul-moore.com>
 */

#define pr_fmt(fmt) "LSM: " fmt

#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/dcache.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/lsm_hooks.h>
#include <linux/fsnotify.h>
#include <linux/mman.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/backing-dev.h>
#include <linux/string.h>
#include <linux/xattr.h>
#include <linux/msg.h>
#include <linux/overflow.h>
#include <net/flow.h>

/*
 * How many LSMs were built into the kernel? Computed as the number of
 * entries between the __start_lsm_info and __end_lsm_info markers.
 */
#define LSM_COUNT (__end_lsm_info - __start_lsm_info)

/*
 * How many LSMs are built into the kernel as determined at
 * build time. Used to determine fixed array sizes (see lsm_idlist).
 * The capability module is accounted for by CONFIG_SECURITY.
 *
 * Each term adds 1 when the corresponding config option is enabled
 * and 0 otherwise, so the whole expression is a compile-time constant.
 */
#define LSM_CONFIG_COUNT ( \
        (IS_ENABLED(CONFIG_SECURITY) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_SELINUX) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_SMACK) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_TOMOYO) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_APPARMOR) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_YAMA) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_LOADPIN) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_SAFESETID) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_LOCKDOWN_LSM) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_BPF_LSM) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_SECURITY_LANDLOCK) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_IMA) ? 1 : 0) + \
        (IS_ENABLED(CONFIG_EVM) ? 1 : 0))

/*
 * These are descriptions of the reasons that can be passed to the
 * security_locked_down() LSM hook. Placing this array here allows
 * all security modules to use the same descriptions for auditing
 * purposes.
 *
 * The table is indexed by lockdown reason and has
 * LOCKDOWN_CONFIDENTIALITY_MAX + 1 slots. The LOCKDOWN_INTEGRITY_MAX and
 * LOCKDOWN_CONFIDENTIALITY_MAX slots carry the strings "integrity" and
 * "confidentiality" rather than a description of a single operation.
 */
const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = {
        [LOCKDOWN_NONE] = "none",
        [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading",
        [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port",
        [LOCKDOWN_EFI_TEST] = "/dev/efi_test access",
        [LOCKDOWN_KEXEC] = "kexec of unsigned images",
        [LOCKDOWN_HIBERNATION] = "hibernation",
        [LOCKDOWN_PCI_ACCESS] = "direct PCI access",
        [LOCKDOWN_IOPORT] = "raw io port access",
        [LOCKDOWN_MSR] = "raw MSR access",
        [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables",
        [LOCKDOWN_DEVICE_TREE] = "modifying device tree contents",
        [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage",
        [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO",
        [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters",
        [LOCKDOWN_MMIOTRACE] = "unsafe mmio",
        [LOCKDOWN_DEBUGFS] = "debugfs access",
        [LOCKDOWN_XMON_WR] = "xmon write access",
        [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM",
        [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM",
        [LOCKDOWN_RTAS_ERROR_INJECTION] = "RTAS error injection",
        [LOCKDOWN_INTEGRITY_MAX] = "integrity",
        [LOCKDOWN_KCORE] = "/proc/kcore access",
        [LOCKDOWN_KPROBES] = "use of kprobes",
        [LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM",
        [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM",
        [LOCKDOWN_PERF] = "unsafe use of perf",
        [LOCKDOWN_TRACEFS] = "use of tracefs",
        [LOCKDOWN_XMON_RW] = "xmon read and write access",
        [LOCKDOWN_XFRM_SECRET] = "xfrm SA secret",
        [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
};

/* Global hook-head table; __ro_after_init, so only written during init. */
struct security_hook_heads security_hook_heads __ro_after_init;
/* Head of a blocking notifier chain; its users are not visible here. */
static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain);

/*
 * Slab caches; presumably backing the file and inode security blobs --
 * NOTE(review): creation and use are outside this part of the file.
 */
static struct kmem_cache *lsm_file_cache;
static struct kmem_cache *lsm_inode_cache;

/* NOTE(review): presumably the LSM name list built via lsm_append() -- confirm. */
char *lsm_names;
/* Running blob-size totals, accumulated by lsm_set_blob_sizes(). */
static struct lsm_blob_sizes blob_sizes __ro_after_init;

/*
 * Boot-time LSM user choice: chosen_lsm_order is reported as "lsm=" and
 * chosen_major_lsm as "security=" in the messages below.
 */
static __initdata const char *chosen_lsm_order;
static __initdata const char *chosen_major_lsm;

/* Build-time default order (CONFIG_LSM), used when no "lsm=" was chosen. */
static __initconst const char *const builtin_lsm_order = CONFIG_LSM;

/*
 * Ordered list of LSMs to initialize. The list is walked until a NULL
 * entry is found, so it must always be NULL-terminated.
 */
static __initdata struct lsm_info **ordered_lsms;
/* The exclusive LSM selected by prepare_lsm(), or NULL if none yet. */
static __initdata struct lsm_info *exclusive;

/* When set, init_debug() messages are emitted through pr_info(). */
static __initdata bool debug;
#define init_debug(...)                                                \
        do {                                                        \
                if (debug)                                        \
                        pr_info(__VA_ARGS__);                        \
        } while (0)

/*
 * Report whether an LSM is currently marked enabled. An LSM that has no
 * enabled flag attached at all is treated as disabled.
 */
static bool __init is_enabled(struct lsm_info *lsm)
{
        return lsm->enabled && *lsm->enabled;
}

/*
 * Shared flag storage for LSMs that did not supply an enabled variable of
 * their own: lsm->enabled is re-pointed at one of these two instead of
 * being written through.
 */
static int lsm_enabled_true __initdata = 1;
static int lsm_enabled_false __initdata = 0;

/* Mark an LSM's enabled flag. */
static void __init set_enabled(struct lsm_info *lsm, bool enabled)
{
        int *shared_flag = enabled ? &lsm_enabled_true : &lsm_enabled_false;

        /*
         * No flag yet, or the flag is one of the shared hard-coded
         * locations: select the shared location matching the new state.
         * The shared values themselves are never modified.
         */
        if (!lsm->enabled ||
            lsm->enabled == &lsm_enabled_true ||
            lsm->enabled == &lsm_enabled_false) {
                lsm->enabled = shared_flag;
                return;
        }

        /* The LSM configured its own enable variable: store into it. */
        *lsm->enabled = enabled;
}

/*
 * Is an LSM already listed in the ordered LSMs list?
 * The list is scanned up to its terminating NULL entry.
 */
static bool __init exists_ordered_lsm(struct lsm_info *lsm)
{
        int idx;

        for (idx = 0; ordered_lsms[idx]; idx++) {
                if (ordered_lsms[idx] == lsm)
                        return true;
        }

        return false;
}

/* Index of the next free slot in ordered_lsms. */
static int last_lsm __initdata;

/*
 * Append an LSM to the list of ordered LSMs to initialize.
 * @from is a label naming the source of the request; it only appears in
 * the warning and debug messages.
 */
static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from)
{
        /* Selecting the same LSM twice has no further effect. */
        if (exists_ordered_lsm(lsm))
                return;

        if (WARN(last_lsm == LSM_COUNT, "%s: out of LSM slots!?\n", from))
                return;

        /* An LSM with no flag of its own starts out enabled. */
        if (!lsm->enabled)
                lsm->enabled = &lsm_enabled_true;

        ordered_lsms[last_lsm] = lsm;
        last_lsm++;

        init_debug("%s ordered: %s (%s)\n", from, lsm->name,
                   is_enabled(lsm) ? "enabled" : "disabled");
}

/*
 * Is an LSM allowed to be initialized?
 * It must be enabled, and if it is flagged exclusive then no other
 * exclusive LSM may have been chosen already.
 */
static bool __init lsm_allowed(struct lsm_info *lsm)
{
        if (is_enabled(lsm)) {
                if (!exclusive || !(lsm->flags & LSM_FLAG_EXCLUSIVE))
                        return true;

                /* Another exclusive LSM got there first. */
                init_debug("exclusive disabled: %s\n", lsm->name);
        }

        return false;
}

/*
 * Reserve room for one LSM's blob inside a shared blob.
 *
 * @need: in: number of bytes the LSM wants; out: the offset assigned to it
 * @lbs:  running total size of the shared blob, grown by this call
 *
 * The assigned offset is the current total rounded up to pointer size.
 * A request of zero or less leaves both values untouched.
 */
static void __init lsm_set_blob_size(int *need, int *lbs)
{
        int start;

        if (*need > 0) {
                start = ALIGN(*lbs, sizeof(void *));
                *lbs = start + *need;
                *need = start;
        }
}

/*
 * Fold one LSM's blob requirements into the global blob_sizes totals.
 * Each requested size in @needed is replaced by the offset assigned to it
 * (see lsm_set_blob_size()). A NULL @needed is ignored.
 */
static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
{
        struct lsm_blob_sizes *total = &blob_sizes;

        if (!needed)
                return;

        /*
         * The inode blob gets an rcu_head in addition to
         * what the modules might need; reserve it ahead of the
         * first module that asks for inode space.
         */
        if (needed->lbs_inode && !total->lbs_inode)
                total->lbs_inode = sizeof(struct rcu_head);
        lsm_set_blob_size(&needed->lbs_inode, &total->lbs_inode);

        lsm_set_blob_size(&needed->lbs_cred, &total->lbs_cred);
        lsm_set_blob_size(&needed->lbs_file, &total->lbs_file);
        lsm_set_blob_size(&needed->lbs_ipc, &total->lbs_ipc);
        lsm_set_blob_size(&needed->lbs_msg_msg, &total->lbs_msg_msg);
        lsm_set_blob_size(&needed->lbs_superblock, &total->lbs_superblock);
        lsm_set_blob_size(&needed->lbs_task, &total->lbs_task);
        lsm_set_blob_size(&needed->lbs_xattr_count, &total->lbs_xattr_count);
}

/*
 * Prepare LSM for initialization: settle whether it may run, claim the
 * exclusive slot if it wants one, and account for its blob sizes.
 */
static void __init prepare_lsm(struct lsm_info *lsm)
{
        int allowed = lsm_allowed(lsm);

        /* Record enablement (to handle any following exclusive LSMs). */
        set_enabled(lsm, allowed);

        /* Nothing more to prepare for an LSM that will not run. */
        if (!allowed)
                return;

        if (!exclusive && (lsm->flags & LSM_FLAG_EXCLUSIVE)) {
                exclusive = lsm;
                init_debug("exclusive chosen:   %s\n", lsm->name);
        }

        lsm_set_blob_sizes(lsm->blobs);
}

/*
 * Initialize a given LSM, if it is enabled.
 * A non-zero return from the LSM's init callback triggers a WARN; the
 * value is not propagated to the caller.
 */
static void __init initialize_lsm(struct lsm_info *lsm)
{
        int ret;

        if (!is_enabled(lsm))
                return;

        init_debug("initializing %s\n", lsm->name);
        ret = lsm->init();
        WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret);
}

/*
 * Current index to use while initializing the lsm id list.
 * lsm_idlist is sized by LSM_CONFIG_COUNT, the build-time LSM count.
 */
u32 lsm_active_cnt __ro_after_init;
const struct lsm_id *lsm_idlist[LSM_CONFIG_COUNT];

/*
 * Populate ordered LSMs list from comma-separated LSM name list.
 *
 * @order:  comma-separated LSM names, in the requested order
 * @origin: label naming where @order came from; used in log messages only
 *
 * The list is built in this sequence:
 *   1. every LSM with LSM_ORDER_FIRST;
 *   2. each LSM named in @order that has LSM_ORDER_MUTABLE, in list order;
 *   3. the LSM named by "security=" (chosen_major_lsm), if not yet listed;
 *   4. every LSM with LSM_ORDER_LAST.
 * Any LSM that did not end up in the list is marked disabled.
 */
static void __init ordered_lsm_parse(const char *order, const char *origin)
{
        struct lsm_info *lsm;
        char *sep, *name, *next;

        /* LSM_ORDER_FIRST is always first. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (lsm->order == LSM_ORDER_FIRST)
                        append_ordered_lsm(lsm, "  first");
        }

        /* Process "security=", if given. */
        if (chosen_major_lsm) {
                struct lsm_info *major;

                /*
                 * To match the original "security=" behavior, this
                 * explicitly does NOT fallback to another Legacy Major
                 * if the selected one was separately disabled: disable
                 * all non-matching Legacy Major LSMs.
                 */
                for (major = __start_lsm_info; major < __end_lsm_info;
                     major++) {
                        if ((major->flags & LSM_FLAG_LEGACY_MAJOR) &&
                            strcmp(major->name, chosen_major_lsm) != 0) {
                                set_enabled(major, false);
                                init_debug("security=%s disabled: %s (only one legacy major LSM)\n",
                                           chosen_major_lsm, major->name);
                        }
                }
        }

        /*
         * strsep() modifies the string it walks, so parse a private copy.
         * If the copy cannot be allocated, say so: strsep() returns NULL
         * straight away for a NULL string, which would otherwise silently
         * drop every LSM named in the list.
         */
        sep = kstrdup(order, GFP_KERNEL);
        if (!sep)
                pr_err("%s: failed to copy LSM order, listed LSMs are ignored\n",
                       origin);
        next = sep;
        /* Walk the list, looking for matching LSMs. */
        while ((name = strsep(&next, ",")) != NULL) {
                bool found = false;

                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (strcmp(lsm->name, name) == 0) {
                                /*
                                 * Only mutable LSMs take their position
                                 * from the list; the name still counts
                                 * as found for the others.
                                 */
                                if (lsm->order == LSM_ORDER_MUTABLE)
                                        append_ordered_lsm(lsm, origin);
                                found = true;
                        }
                }

                if (!found)
                        init_debug("%s ignored: %s (not built into kernel)\n",
                                   origin, name);
        }

        /* Process "security=", if given. */
        if (chosen_major_lsm) {
                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (exists_ordered_lsm(lsm))
                                continue;
                        if (strcmp(lsm->name, chosen_major_lsm) == 0)
                                append_ordered_lsm(lsm, "security=");
                }
        }

        /* LSM_ORDER_LAST is always last. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (lsm->order == LSM_ORDER_LAST)
                        append_ordered_lsm(lsm, "   last");
        }

        /* Disable all LSMs not in the ordered list. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (exists_ordered_lsm(lsm))
                        continue;
                set_enabled(lsm, false);
                init_debug("%s skipped: %s (not in requested order)\n",
                           origin, lsm->name);
        }

        /* kfree(NULL) is a no-op, so the failed-copy case needs no guard. */
        kfree(sep);
}

/*
 * Forward declarations for helpers that are defined further down in this
 * file but are already needed by the initialization code below:
 * lsm_early_cred() and lsm_early_task() are called from ordered_lsm_init(),
 * lsm_append() is called from security_init().
 */
static void __init lsm_early_cred(struct cred *cred);
static void __init lsm_early_task(struct task_struct *task);

static int lsm_append(const char *new, char **result);

/*
 * report_lsm_order - log the names of all enabled LSMs on one line
 *
 * Prints "initializing lsm=" followed by a comma separated list of the
 * enabled early LSMs and then the enabled entries of ordered_lsms, in that
 * order, and finishes the line with a newline.
 */
static void __init report_lsm_order(void)
{
        struct lsm_info *early_lsm;
        struct lsm_info **slot;
        const char *sep = "";

        pr_info("initializing lsm=");

        /* Early LSMs come first; the separator is empty for the first name. */
        for (early_lsm = __start_early_lsm_info;
             early_lsm < __end_early_lsm_info; early_lsm++) {
                if (!is_enabled(early_lsm))
                        continue;
                pr_cont("%s%s", sep, early_lsm->name);
                sep = ",";
        }

        /* Then the ordered list, which is NULL terminated. */
        for (slot = ordered_lsms; *slot; slot++) {
                if (!is_enabled(*slot))
                        continue;
                pr_cont("%s%s", sep, (*slot)->name);
                sep = ",";
        }

        pr_cont("\n");
}

/*
 * ordered_lsm_init - build the ordered LSM list and bring the LSMs up
 *
 * Allocates the NULL terminated ordered_lsms array, fills it from either the
 * "lsm=" boot argument or the builtin order, prepares every listed LSM,
 * reports the resulting order and blob sizes, creates the file and inode
 * blob caches when those blobs are in use, allocates the blobs for the
 * initial cred and task, and finally initializes every listed LSM.  The
 * ordered list is freed before returning.
 *
 * Panics if the ordered list cannot be allocated.
 */
static void __init ordered_lsm_init(void)
{
        struct lsm_info **lsm;

        ordered_lsms = kcalloc(LSM_COUNT + 1, sizeof(*ordered_lsms),
                               GFP_KERNEL);
        /*
         * The array is dereferenced unconditionally below (and by the order
         * parser), so a failed allocation must not be allowed to continue.
         */
        if (!ordered_lsms)
                panic("%s: Early ordered LSM list alloc failed.\n", __func__);

        if (chosen_lsm_order) {
                /* An explicit "lsm=" order overrides the legacy "security=". */
                if (chosen_major_lsm) {
                        pr_warn("security=%s is ignored because it is superseded by lsm=%s\n",
                                chosen_major_lsm, chosen_lsm_order);
                        chosen_major_lsm = NULL;
                }
                ordered_lsm_parse(chosen_lsm_order, "cmdline");
        } else
                ordered_lsm_parse(builtin_lsm_order, "builtin");

        for (lsm = ordered_lsms; *lsm; lsm++)
                prepare_lsm(*lsm);

        report_lsm_order();

        init_debug("cred blob size       = %d\n", blob_sizes.lbs_cred);
        init_debug("file blob size       = %d\n", blob_sizes.lbs_file);
        init_debug("inode blob size      = %d\n", blob_sizes.lbs_inode);
        init_debug("ipc blob size        = %d\n", blob_sizes.lbs_ipc);
        init_debug("msg_msg blob size    = %d\n", blob_sizes.lbs_msg_msg);
        init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock);
        init_debug("task blob size       = %d\n", blob_sizes.lbs_task);
        init_debug("xattr slots          = %d\n", blob_sizes.lbs_xattr_count);

        /*
         * Create any kmem_caches needed for blobs
         */
        if (blob_sizes.lbs_file)
                lsm_file_cache = kmem_cache_create("lsm_file_cache",
                                                   blob_sizes.lbs_file, 0,
                                                   SLAB_PANIC, NULL);
        if (blob_sizes.lbs_inode)
                lsm_inode_cache = kmem_cache_create("lsm_inode_cache",
                                                    blob_sizes.lbs_inode, 0,
                                                    SLAB_PANIC, NULL);

        /* Blobs for the initial cred/task must exist before LSM init runs. */
        lsm_early_cred((struct cred *) current->cred);
        lsm_early_task(current);
        for (lsm = ordered_lsms; *lsm; lsm++)
                initialize_lsm(*lsm);

        kfree(ordered_lsms);
}

/*
 * early_security_init - initialize the hook lists and the early LSMs
 *
 * Initializes every list head in security_hook_heads (one per hook listed
 * in lsm_hook_defs.h), then prepares and initializes each early LSM.  An
 * early LSM that did not provide an "enabled" pointer is pointed at
 * lsm_enabled_true first.
 *
 * Always returns 0.
 */
int __init early_security_init(void)
{
        struct lsm_info *early;

        /* Expand the hook definitions into one INIT_HLIST_HEAD() per hook. */
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
        INIT_HLIST_HEAD(&security_hook_heads.NAME);
#include "linux/lsm_hook_defs.h"
#undef LSM_HOOK

        early = __start_early_lsm_info;
        while (early < __end_early_lsm_info) {
                if (early->enabled == NULL)
                        early->enabled = &lsm_enabled_true;
                prepare_lsm(early);
                initialize_lsm(early);
                early++;
        }

        return 0;
}

/**
 * security_init - initializes the security framework
 *
 * Logs the LSM related boot parameters (when LSM debugging is on), records
 * the names of the enabled early LSMs in lsm_names and then loads the
 * remaining LSMs in the requested order via ordered_lsm_init().
 *
 * This should be called early in the kernel initialization sequence.
 *
 * Return: Always returns 0.
 */
int __init security_init(void)
{
        struct lsm_info *lsm;

        init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*");
        init_debug("  CONFIG_LSM=%s\n", builtin_lsm_order);
        init_debug("boot arg lsm=%s\n", chosen_lsm_order ? : " *unspecified*");

        /*
         * Append the names of the early LSM modules now that kmalloc() is
         * available
         */
        for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
                init_debug("  early started: %s (%s)\n", lsm->name,
                           is_enabled(lsm) ? "enabled" : "disabled");
                /*
                 * Test the enabled *state*, not the pointer:
                 * early_security_init() guarantees lsm->enabled is non-NULL,
                 * so a bare pointer test would list disabled LSMs as well.
                 */
                if (is_enabled(lsm))
                        lsm_append(lsm->name, &lsm_names);
        }

        /* Load LSMs in specified order. */
        ordered_lsm_init();

        return 0;
}

/*
 * "security=" boot parameter handler: remember the requested legacy major
 * LSM name in chosen_major_lsm.  The string is stored by reference, not
 * copied.  Returns 1.
 */
static int __init choose_major_lsm(char *str)
{
        chosen_major_lsm = str;

        return 1;
}
__setup("security=", choose_major_lsm);

/*
 * "lsm=" boot parameter handler: remember the explicitly requested LSM
 * initialization order in chosen_lsm_order.  The string is stored by
 * reference, not copied.  Returns 1.
 */
static int __init choose_lsm_order(char *str)
{
        chosen_lsm_order = str;

        return 1;
}
__setup("lsm=", choose_lsm_order);

/*
 * "lsm.debug" boot parameter handler: switch on LSM ordering debug output
 * by setting the debug flag.  The parameter value @str is not examined.
 * Returns 1.
 */
static int __init enable_debug(char *str)
{
        debug = true;

        return 1;
}
__setup("lsm.debug", enable_debug);

/*
 * match_last_lsm - test whether @lsm is the last entry of a comma list
 * @list: comma separated list of names
 * @lsm: name to compare against the final entry
 *
 * Returns true when the text following the last comma in @list (or the
 * whole of @list when it contains no comma) equals @lsm.  Warns and returns
 * false if either argument is NULL.
 */
static bool match_last_lsm(const char *list, const char *lsm)
{
        const char *tail;

        if (WARN_ON(!list || !lsm))
                return false;

        /* Step over the last comma if there is one; else compare it all. */
        tail = strrchr(list, ',');
        tail = tail ? tail + 1 : list;

        return strcmp(tail, lsm) == 0;
}

/*
 * lsm_append - add a name to a comma separated, heap allocated list
 * @new: name to add
 * @result: address of the list pointer; *@result may be NULL for "empty"
 *
 * If the list is empty it becomes a copy of @new.  Otherwise @new is
 * appended after a comma, unless it already is the last entry, in which
 * case nothing changes.  On growth the old string is freed and *@result is
 * replaced.
 *
 * Returns 0 on success, or -ENOMEM if memory can't be allocated (the
 * existing list is left untouched in the append case).
 */
static int lsm_append(const char *new, char **result)
{
        char *joined;

        if (!*result) {
                *result = kstrdup(new, GFP_KERNEL);
                return *result ? 0 : -ENOMEM;
        }

        /* Nothing to do when @new is already the last registered name. */
        if (match_last_lsm(*result, new))
                return 0;

        joined = kasprintf(GFP_KERNEL, "%s,%s", *result, new);
        if (!joined)
                return -ENOMEM;

        kfree(*result);
        *result = joined;
        return 0;
}

/**
 * security_add_hooks - Add a modules hooks to the hook lists.
 * @hooks: the hooks to add
 * @count: the number of hooks to add
 * @lsmid: the identification information for the security module
 *
 * Each LSM has to register its hooks with the infrastructure.  The LSM's id
 * is recorded in lsm_idlist (once per run of consecutive calls with the same
 * @lsmid), every hook is tagged with @lsmid and added to the tail of its
 * hook list, and, once the slab allocator is available, the LSM name is
 * appended to lsm_names.
 *
 * Panics if more than LSM_CONFIG_COUNT LSMs register or if the name cannot
 * be appended.
 */
void __init security_add_hooks(struct security_hook_list *hooks, int count,
                               const struct lsm_id *lsmid)
{
        bool same_as_previous;
        int idx;

        /*
         * A security module may call security_add_hooks() more
         * than once during initialization, and LSM initialization
         * is serialized. Landlock is one such case.
         * Look at the previous entry, if there is one, for duplication.
         */
        same_as_previous = lsm_active_cnt != 0 &&
                           lsm_idlist[lsm_active_cnt - 1] == lsmid;
        if (!same_as_previous) {
                if (lsm_active_cnt >= LSM_CONFIG_COUNT)
                        panic("%s Too many LSMs registered.\n", __func__);
                lsm_idlist[lsm_active_cnt++] = lsmid;
        }

        for (idx = 0; idx < count; idx++) {
                struct security_hook_list *entry = &hooks[idx];

                entry->lsmid = lsmid;
                hlist_add_tail_rcu(&entry->list, entry->head);
        }

        /*
         * Don't try to append during early_security_init(), we'll come back
         * and fix this up afterwards.
         */
        if (slab_is_available() && lsm_append(lsmid->name, &lsm_names) < 0)
                panic("%s - Cannot get early memory.\n", __func__);
}

/*
 * call_blocking_lsm_notifier - run the blocking LSM notifier chain
 * @event: the LSM event being signalled
 * @data: opaque pointer passed through to the notifier callbacks
 *
 * Returns whatever blocking_notifier_call_chain() returns.
 */
int call_blocking_lsm_notifier(enum lsm_event event, void *data)
{
        int rc;

        rc = blocking_notifier_call_chain(&blocking_lsm_notifier_chain,
                                          event, data);
        return rc;
}
EXPORT_SYMBOL(call_blocking_lsm_notifier);

/*
 * register_blocking_lsm_notifier - add a callback to the LSM notifier chain
 * @nb: notifier block to register
 *
 * Returns whatever blocking_notifier_chain_register() returns.
 */
int register_blocking_lsm_notifier(struct notifier_block *nb)
{
        int rc;

        rc = blocking_notifier_chain_register(&blocking_lsm_notifier_chain,
                                              nb);
        return rc;
}
EXPORT_SYMBOL(register_blocking_lsm_notifier);

/*
 * unregister_blocking_lsm_notifier - remove a callback from the LSM
 * notifier chain
 * @nb: notifier block to unregister
 *
 * Returns whatever blocking_notifier_chain_unregister() returns.
 */
int unregister_blocking_lsm_notifier(struct notifier_block *nb)
{
        int rc;

        rc = blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain,
                                                nb);
        return rc;
}
EXPORT_SYMBOL(unregister_blocking_lsm_notifier);

/**
 * lsm_cred_alloc - allocate a composite cred blob
 * @cred: the cred that needs a blob
 * @gfp: allocation type
 *
 * Allocate a zeroed cred blob of blob_sizes.lbs_cred bytes shared by all
 * the modules and attach it to @cred->security.  When no module needs cred
 * space the pointer is simply set to NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
{
        if (!blob_sizes.lbs_cred) {
                /* No LSM requested cred space. */
                cred->security = NULL;
                return 0;
        }

        cred->security = kzalloc(blob_sizes.lbs_cred, gfp);
        return cred->security ? 0 : -ENOMEM;
}

/**
 * lsm_early_cred - during initialization allocate a composite cred blob
 * @cred: the cred that needs a blob
 *
 * Allocate the cred blob for all the modules with GFP_KERNEL.  Panics if
 * the allocation fails.
 */
static void __init lsm_early_cred(struct cred *cred)
{
        if (lsm_cred_alloc(cred, GFP_KERNEL))
                panic("%s: Early cred alloc failed.\n", __func__);
}

/**
 * lsm_file_alloc - allocate a composite file blob
 * @file: the file that needs a blob
 *
 * Allocate a zeroed file blob for all the modules from lsm_file_cache and
 * attach it to @file->f_security.  When the cache does not exist the
 * pointer is simply set to NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_file_alloc(struct file *file)
{
        if (lsm_file_cache == NULL) {
                /* No cache was created, so there is nothing to allocate. */
                file->f_security = NULL;
                return 0;
        }

        file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL);
        return file->f_security ? 0 : -ENOMEM;
}

/**
 * lsm_inode_alloc - allocate a composite inode blob
 * @inode: the inode that needs a blob
 *
 * Allocate a zeroed inode blob for all the modules from lsm_inode_cache
 * (using GFP_NOFS) and attach it to @inode->i_security.  When the cache
 * does not exist the pointer is simply set to NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
int lsm_inode_alloc(struct inode *inode)
{
        if (lsm_inode_cache == NULL) {
                /* No cache was created, so there is nothing to allocate. */
                inode->i_security = NULL;
                return 0;
        }

        inode->i_security = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS);
        return inode->i_security ? 0 : -ENOMEM;
}

/**
 * lsm_task_alloc - allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate a zeroed task blob of blob_sizes.lbs_task bytes for all the
 * modules and attach it to @task->security.  When no module needs task
 * space the pointer is simply set to NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_task_alloc(struct task_struct *task)
{
        if (!blob_sizes.lbs_task) {
                /* No LSM requested task space. */
                task->security = NULL;
                return 0;
        }

        task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL);
        return task->security ? 0 : -ENOMEM;
}

/**
 * lsm_ipc_alloc - allocate a composite ipc blob
 * @kip: the ipc that needs a blob
 *
 * Allocate a zeroed ipc blob of blob_sizes.lbs_ipc bytes for all the
 * modules and attach it to @kip->security.  When no module needs ipc space
 * the pointer is simply set to NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_ipc_alloc(struct kern_ipc_perm *kip)
{
        if (!blob_sizes.lbs_ipc) {
                /* No LSM requested ipc space. */
                kip->security = NULL;
                return 0;
        }

        kip->security = kzalloc(blob_sizes.lbs_ipc, GFP_KERNEL);
        return kip->security ? 0 : -ENOMEM;
}

/**
 * lsm_msg_msg_alloc - allocate a composite msg_msg blob
 * @mp: the msg_msg that needs a blob
 *
 * Allocate a zeroed msg_msg blob of blob_sizes.lbs_msg_msg bytes for all
 * the modules and attach it to @mp->security.  When no module needs msg_msg
 * space the pointer is simply set to NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_msg_msg_alloc(struct msg_msg *mp)
{
        if (!blob_sizes.lbs_msg_msg) {
                /* No LSM requested msg_msg space. */
                mp->security = NULL;
                return 0;
        }

        mp->security = kzalloc(blob_sizes.lbs_msg_msg, GFP_KERNEL);
        return mp->security ? 0 : -ENOMEM;
}

/**
 * lsm_early_task - during initialization allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate the task blob for all the modules.  Panics if the allocation
 * fails.
 */
static void __init lsm_early_task(struct task_struct *task)
{
        if (lsm_task_alloc(task))
                panic("%s: Early task alloc failed.\n", __func__);
}

/**
 * lsm_superblock_alloc - allocate a composite superblock blob
 * @sb: the superblock that needs a blob
 *
 * Allocate a zeroed superblock blob of blob_sizes.lbs_superblock bytes for
 * all the modules and attach it to @sb->s_security.  When no module needs
 * superblock space the pointer is simply set to NULL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_superblock_alloc(struct super_block *sb)
{
        if (!blob_sizes.lbs_superblock) {
                /* No LSM requested superblock space. */
                sb->s_security = NULL;
                return 0;
        }

        sb->s_security = kzalloc(blob_sizes.lbs_superblock, GFP_KERNEL);
        return sb->s_security ? 0 : -ENOMEM;
}

/**
 * lsm_fill_user_ctx - Fill a user space lsm_ctx structure
 * @uctx: a userspace LSM context to be filled
 * @uctx_len: available uctx size (input), used uctx size (output)
 * @val: the new LSM context value
 * @val_len: the size of the new LSM context value
 * @id: LSM id
 * @flags: LSM defined flags
 *
 * Fill all of the fields in a userspace lsm_ctx structure.  If @uctx is NULL
 * simply calculate the required size to output via @uctx_len and return
 * success.  The required size is the lsm_ctx header plus @val_len bytes,
 * rounded up to a multiple of sizeof(void *); it is written to @uctx_len on
 * every return path, including errors.
 *
 * Returns 0 on success, -E2BIG if userspace buffer is not large enough,
 * -EFAULT on a copyout error, -ENOMEM if memory can't be allocated.
 */
int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len,
                      void *val, size_t val_len,
                      u64 id, u64 flags)
{
        struct lsm_ctx *nctx = NULL;
        size_t nctx_len = ALIGN(struct_size(nctx, ctx, val_len),
                                sizeof(void *));
        int rc = 0;

        if (nctx_len > *uctx_len) {
                rc = -E2BIG;
        } else if (uctx) {
                /* Build the record in kernel memory, then copy it out. */
                nctx = kzalloc(nctx_len, GFP_KERNEL);
                if (!nctx) {
                        rc = -ENOMEM;
                } else {
                        nctx->id = id;
                        nctx->flags = flags;
                        nctx->len = nctx_len;
                        nctx->ctx_len = val_len;
                        memcpy(nctx->ctx, val, val_len);

                        if (copy_to_user(uctx, nctx, nctx_len))
                                rc = -EFAULT;
                }
        }
        /* else: no buffer - return success/0 and only report the size */

        kfree(nctx);
        *uctx_len = nctx_len;
        return rc;
}

/*
 * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
 * can be accessed with:
 *
 *        LSM_RET_DEFAULT(<hook_name>)
 *
 * The macros below define static constants for the default value of each
 * LSM hook: LSM_HOOK() is temporarily defined so that including
 * lsm_hook_defs.h emits, for every hook whose return type is int, a
 * "static const int <hook_name>_default" holding its default.  Hooks
 * returning void expand to nothing.
 */
#define LSM_RET_DEFAULT(NAME) (NAME##_default)
#define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME)
#define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \
        static const int __maybe_unused LSM_RET_DEFAULT(NAME) = (DEFAULT);
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
        DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME)

#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK

/*
 * Hook list operation macros.
 *
 * call_void_hook:
 *        This is a hook that does not return a value.  Every callback
 *        registered on the FUNC hook list is invoked in list order.
 *
 * call_int_hook:
 *        This is a hook that returns a value.  Callbacks registered on the
 *        FUNC hook list are invoked in list order until one returns a value
 *        other than LSM_RET_DEFAULT(FUNC); that value is the result.  If
 *        the list is empty, or every callback returns the default, the
 *        result is LSM_RET_DEFAULT(FUNC).
 */

#define call_void_hook(FUNC, ...)                                \
        do {                                                        \
                struct security_hook_list *P;                        \
                                                                \
                hlist_for_each_entry(P, &security_hook_heads.FUNC, list) \
                        P->hook.FUNC(__VA_ARGS__);                \
        } while (0)

#define call_int_hook(FUNC, ...) ({                                \
        int RC = LSM_RET_DEFAULT(FUNC);                                \
        do {                                                        \
                struct security_hook_list *P;                        \
                                                                \
                hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \
                        RC = P->hook.FUNC(__VA_ARGS__);                \
                        if (RC != LSM_RET_DEFAULT(FUNC))        \
                                break;                                \
                }                                                \
        } while (0);                                                \
        RC;                                                        \
})

/* Security operations */

/**
 * security_binder_set_context_mgr() - Check if becoming binder ctx mgr is ok
 * @mgr: task credentials of current binder process
 *
 * Ask the binder_set_context_mgr LSM hooks whether @mgr is allowed to be
 * the binder context manager.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_set_context_mgr(const struct cred *mgr)
{
        int rc;

        rc = call_int_hook(binder_set_context_mgr, mgr);
        return rc;
}

/**
 * security_binder_transaction() - Check if a binder transaction is allowed
 * @from: sending process
 * @to: receiving process
 *
 * Ask the binder_transaction LSM hooks whether @from is allowed to invoke a
 * binder transaction call to @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transaction(const struct cred *from,
                                const struct cred *to)
{
        int rc;

        rc = call_int_hook(binder_transaction, from, to);
        return rc;
}

/**
 * security_binder_transfer_binder() - Check if a binder transfer is allowed
 * @from: sending process
 * @to: receiving process
 *
 * Ask the binder_transfer_binder LSM hooks whether @from is allowed to
 * transfer a binder reference to @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transfer_binder(const struct cred *from,
                                    const struct cred *to)
{
        int rc;

        rc = call_int_hook(binder_transfer_binder, from, to);
        return rc;
}

/**
 * security_binder_transfer_file() - Check if a binder file xfer is allowed
 * @from: sending process
 * @to: receiving process
 * @file: file being transferred
 *
 * Ask the binder_transfer_file LSM hooks whether @from is allowed to
 * transfer @file to @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transfer_file(const struct cred *from,
                                  const struct cred *to, const struct file *file)
{
        int rc;

        rc = call_int_hook(binder_transfer_file, from, to, file);
        return rc;
}

/**
 * security_ptrace_access_check() - Check if tracing is allowed
 * @child: target process
 * @mode: PTRACE_MODE flags
 *
 * Check permission before allowing the current process to trace the @child
 * process.  Security modules may also want to perform a process tracing
 * check during an execve if the process is being traced and its security
 * attributes would be changed by the execve.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        int rc;

        rc = call_int_hook(ptrace_access_check, child, mode);
        return rc;
}

/**
 * security_ptrace_traceme() - Check if tracing is allowed
 * @parent: tracing process
 *
 * Check that the @parent process has sufficient permission to trace the
 * current process before allowing the current process to present itself to
 * the @parent process for tracing.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ptrace_traceme(struct task_struct *parent)
{
        int rc;

        rc = call_int_hook(ptrace_traceme, parent);
        return rc;
}

/**
 * security_capget() - Get the capability sets for a process
 * @target: target process
 * @effective: effective capability set
 * @inheritable: inheritable capability set
 * @permitted: permitted capability set
 *
 * Get the @effective, @inheritable, and @permitted capability sets for the
 * @target process.  The hook may also perform permission checking to
 * determine if the current process is allowed to see the capability sets of
 * the @target process.
 *
 * Return: Returns 0 if the capability sets were successfully obtained.
 */
int security_capget(const struct task_struct *target,
                    kernel_cap_t *effective,
                    kernel_cap_t *inheritable,
                    kernel_cap_t *permitted)
{
        int rc;

        rc = call_int_hook(capget, target, effective, inheritable, permitted);
        return rc;
}

/**
 * security_capset() - Set the capability sets for a process
 * @new: new credentials for the target process
 * @old: current credentials of the target process
 * @effective: effective capability set
 * @inheritable: inheritable capability set
 * @permitted: permitted capability set
 *
 * Set the @effective, @inheritable, and @permitted capability sets for the
 * current process.
 *
 * Return: Returns 0 and update @new if permission is granted.
 */
int security_capset(struct cred *new, const struct cred *old,
                    const kernel_cap_t *effective,
                    const kernel_cap_t *inheritable,
                    const kernel_cap_t *permitted)
{
        int rc;

        rc = call_int_hook(capset, new, old, effective, inheritable,
                           permitted);
        return rc;
}

/**
 * security_capable() - Check if a process has the necessary capability
 * @cred: credentials to examine
 * @ns: user namespace
 * @cap: capability requested
 * @opts: capability check options
 *
 * Check whether the @cap capability is held in the indicated credentials
 * @cred.  @cap contains the capability <include/linux/capability.h>.
 * @opts contains options for the capable check <include/linux/security.h>.
 *
 * Return: Returns 0 if the capability is granted.
 */
int security_capable(const struct cred *cred,
                     struct user_namespace *ns,
                     int cap,
                     unsigned int opts)
{
        int rc;

        rc = call_int_hook(capable, cred, ns, cap, opts);
        return rc;
}

/**
 * security_quotactl() - Check if a quotactl() syscall is allowed for this fs
 * @cmds: commands
 * @type: type
 * @id: id
 * @sb: filesystem
 *
 * Ask the quotactl LSM hooks whether the quotactl syscall is allowed for
 * this @sb.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_quotactl(int cmds, int type, int id, const struct super_block *sb)
{
        int rc;

        rc = call_int_hook(quotactl, cmds, type, id, sb);
        return rc;
}

/**
 * security_quota_on() - Check if QUOTAON is allowed for a dentry
 * @dentry: dentry
 *
 * Ask the quota_on LSM hooks whether QUOTAON is allowed for @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_quota_on(struct dentry *dentry)
{
        int rc;

        rc = call_int_hook(quota_on, dentry);
        return rc;
}

/**
 * security_syslog() - Check if accessing the kernel message ring is allowed
 * @type: SYSLOG_ACTION_* type
 *
 * Check permission before accessing the kernel message ring or changing
 * logging to the console.  See the syslog(2) manual page for an explanation
 * of the @type values.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_syslog(int type)
{
        int rc;

        rc = call_int_hook(syslog, type);
        return rc;
}

/**
 * security_settime64() - Check if changing the system time is allowed
 * @ts: new time
 * @tz: timezone
 *
 * Check permission to change the system time via the settime LSM hooks.
 * struct timespec64 is defined in <include/linux/time64.h> and timezone is
 * defined in <include/linux/time.h>.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_settime64(const struct timespec64 *ts, const struct timezone *tz)
{
        int rc;

        rc = call_int_hook(settime, ts, tz);
        return rc;
}

/**
 * security_vm_enough_memory_mm() - Check if allocating a new mem map is allowed
 * @mm: mm struct
 * @pages: number of pages
 *
 * Check permissions for allocating a new virtual mapping.  If all LSMs return
 * a positive value, __vm_enough_memory() will be called with cap_sys_admin
 * set. If at least one LSM returns 0 or negative, __vm_enough_memory() will be
 * called with cap_sys_admin cleared.
 *
 * Return: The value returned by __vm_enough_memory(); 0 if permission is
 *         granted by the LSM infrastructure to the caller.
 */
int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
{
        struct security_hook_list *entry;

        /*
         * Each module answers with a positive value when it agrees that
         * __vm_enough_memory() should be called with cap_sys_admin set.
         * The first module that disagrees settles the matter: the
         * remaining modules are not consulted and cap_sys_admin is
         * cleared.
         */
        hlist_for_each_entry(entry, &security_hook_heads.vm_enough_memory,
                             list) {
                if (entry->hook.vm_enough_memory(mm, pages) <= 0)
                        return __vm_enough_memory(mm, pages, 0);
        }

        /* No module objected (or none is registered). */
        return __vm_enough_memory(mm, pages, 1);
}

/**
 * security_bprm_creds_for_exec() - Prepare the credentials for exec()
 * @bprm: binary program information
 *
 * If the setup in prepare_exec_creds did not setup @bprm->cred->security
 * properly for executing @bprm->file, update the LSM's portion of
 * @bprm->cred->security to be what commit_creds needs to install for the
 * new program.  This hook may also optionally check permissions (e.g. for
 * transitions between security domains).  The hook must set
 * @bprm->secureexec to 1 if AT_SECURE should be set to request libc enable
 * secure mode.  @bprm contains the linux_binprm structure.
 *
 * Return: Returns 0 if the hook is successful and permission is granted.
 */
int security_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        int rc;

        rc = call_int_hook(bprm_creds_for_exec, bprm);
        return rc;
}

/**
 * security_bprm_creds_from_file() - Update linux_binprm creds based on file
 * @bprm: binary program information
 * @file: associated file
 *
 * If @file is setpcap, suid, sgid or otherwise marked to change privilege
 * upon exec, update @bprm->cred to reflect that change.  This is called
 * after finding the binary that will be executed without an interpreter.
 * This ensures that the credentials will not be derived from a script that
 * the binary will need to reopen, which when reopened may end up being a
 * completely different file.  This hook may also optionally check
 * permissions (e.g. for transitions between security domains).  The hook
 * must set @bprm->secureexec to 1 if AT_SECURE should be set to request
 * libc enable secure mode.  The hook must add to @bprm->per_clear any
 * personality flags that should be cleared from current->personality.
 * @bprm contains the linux_binprm structure.
 *
 * Return: Returns 0 if the hook is successful and permission is granted.
 */
int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
{
        int rc;

        rc = call_int_hook(bprm_creds_from_file, bprm, file);
        return rc;
}

/**
 * security_bprm_check() - Mediate binary handler search
 * @bprm: binary program information
 *
 * This hook mediates the point when a search for a binary handler will
 * begin.  It allows a check against the @bprm->cred->security value which
 * was set in the preceding creds_for_exec call.  The argv list and envp
 * list are reliably available in @bprm.  This hook may be called multiple
 * times during a single execve.  @bprm contains the linux_binprm structure.
 *
 * Return: Returns 0 if the hook is successful and permission is granted.
 */
int security_bprm_check(struct linux_binprm *bprm)
{
        int rc;

        rc = call_int_hook(bprm_check_security, bprm);
        return rc;
}

/**
 * security_bprm_committing_creds() - Install creds for a process during exec()
 * @bprm: binary program information
 *
 * Prepare to install the new security attributes of a process being
 * transformed by an execve operation, based on the old credentials pointed
 * to by @current->cred and the information set in @bprm->cred by the
 * bprm_creds_for_exec hook.  @bprm points to the linux_binprm structure.
 * This hook is a good place to perform state changes on the process such as
 * closing open file descriptors to which access will no longer be granted
 * when the attributes are changed.  This is called immediately before
 * commit_creds().
 */
void security_bprm_committing_creds(const struct linux_binprm *bprm)
{
        /* Notify every registered LSM; there is no result to collect. */
        call_void_hook(bprm_committing_creds, bprm);
}

/**
 * security_bprm_committed_creds() - Tidy up after cred install during exec()
 * @bprm: binary program information
 *
 * Tidy up after the installation of the new security attributes of a
 * process being transformed by an execve operation.  The new credentials
 * have, by this point, been set to @current->cred.  @bprm points to the
 * linux_binprm structure.  This hook is a good place to perform state
 * changes on the process such as clearing out non-inheritable signal state.
 * This is called immediately after commit_creds().
 */
void security_bprm_committed_creds(const struct linux_binprm *bprm)
{
        /* Notify every registered LSM; there is no result to collect. */
        call_void_hook(bprm_committed_creds, bprm);
}

/**
 * security_fs_context_submount() - Initialise fc->security
 * @fc: new filesystem context
 * @reference: reference superblock for the submount/remount
 *
 * Fill out the ->security field for a new fs_context.
 *
 * Return: Returns 0 on success or negative error code on failure.
 */
int security_fs_context_submount(struct fs_context *fc, struct super_block *reference)
{
        int rc;

        rc = call_int_hook(fs_context_submount, fc, reference);
        return rc;
}

/**
 * security_fs_context_dup() - Duplicate a fs_context LSM blob
 * @fc: destination filesystem context
 * @src_fc: source filesystem context
 *
 * Allocate and attach a security structure to @fc->security.  This pointer
 * is initialised to NULL by the caller.  @fc indicates the new filesystem
 * context.  @src_fc indicates the original filesystem context.
 *
 * Return: Returns 0 on success or a negative error code on failure.
 */
int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
{
        int rc;

        rc = call_int_hook(fs_context_dup, fc, src_fc);
        return rc;
}

/**
 * security_fs_context_parse_param() - Configure a filesystem context
 * @fc: filesystem context
 * @param: filesystem parameter
 *
 * Userspace provided a parameter to configure a superblock.  The LSM can
 * consume the parameter or return it to the caller for use elsewhere.
 * Unlike call_int_hook(), every registered LSM is offered the parameter
 * until one of them reports an error other than -ENOPARAM.
 *
 * Return: If the parameter is used by the LSM it should return 0, if it is
 *         returned to the caller -ENOPARAM is returned, otherwise a negative
 *         error code is returned.
 */
int security_fs_context_parse_param(struct fs_context *fc,
                                    struct fs_parameter *param)
{
        struct security_hook_list *entry;
        int rc = -ENOPARAM;

        hlist_for_each_entry(entry, &security_hook_heads.fs_context_parse_param,
                             list) {
                int hook_rc = entry->hook.fs_context_parse_param(fc, param);

                /* This LSM does not care about the parameter. */
                if (hook_rc == -ENOPARAM)
                        continue;
                /* A real error stops the walk immediately. */
                if (hook_rc != 0)
                        return hook_rc;
                /* Consumed by at least one LSM. */
                rc = 0;
        }

        return rc;
}

/**
 * security_sb_alloc() - Allocate a super_block LSM blob
 * @sb: filesystem superblock
 *
 * Allocate and attach a security structure to the sb->s_security field.  The
 * s_security field is initialized to NULL when the structure is allocated.
 * @sb contains the super_block structure to be modified.  If an LSM's
 * sb_alloc_security hook fails, the blob is released again through
 * security_sb_free().
 *
 * Return: Returns 0 if operation was successful.
 */
int security_sb_alloc(struct super_block *sb)
{
        int rc;

        rc = lsm_superblock_alloc(sb);
        if (unlikely(rc != 0))
                return rc;

        rc = call_int_hook(sb_alloc_security, sb);
        if (unlikely(rc != 0))
                security_sb_free(sb);   /* undo the blob allocation */

        return rc;
}

/**
 * security_sb_delete() - Release super_block LSM associated objects
 * @sb: filesystem superblock
 *
 * Release objects tied to a superblock (e.g. inodes).  @sb contains the
 * super_block structure being released.
 */
void security_sb_delete(struct super_block *sb)
{
        /* Notify every registered LSM; there is no result to collect. */
        call_void_hook(sb_delete, sb);
}

/**
 * security_sb_free() - Free a super_block LSM blob
 * @sb: filesystem superblock
 *
 * Deallocate and clear the sb->s_security field.  @sb contains the
 * super_block structure to be modified.
 */
void security_sb_free(struct super_block *sb)
{
        /* Let the LSMs clean up first, while the blob still exists. */
        call_void_hook(sb_free_security, sb);

        kfree(sb->s_security);
        sb->s_security = NULL;
}

/**
 * security_free_mnt_opts() - Free memory associated with mount options
 * @mnt_opts: LSM processed mount options
 *
 * Free memory associated with @mnt_ops.
 */
void security_free_mnt_opts(void **mnt_opts)
{
        /* Only a non-NULL option blob is passed on; the slot is then cleared. */
        if (*mnt_opts) {
                call_void_hook(sb_free_mnt_opts, *mnt_opts);
                *mnt_opts = NULL;
        }
}
EXPORT_SYMBOL(security_free_mnt_opts);

/**
 * security_sb_eat_lsm_opts() - Consume LSM mount options
 * @options: mount options
 * @mnt_opts: LSM processed mount options
 *
 * Eat (scan @options) and save them in @mnt_opts.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
        int rc;

        /* Pass @options and the @mnt_opts slot to the sb_eat_lsm_opts hook. */
        rc = call_int_hook(sb_eat_lsm_opts, options, mnt_opts);
        return rc;
}
EXPORT_SYMBOL(security_sb_eat_lsm_opts);

/**
 * security_sb_mnt_opts_compat() - Check if new mount options are allowed
 * @sb: filesystem superblock
 * @mnt_opts: new mount options
 *
 * Determine if the new mount options in @mnt_opts are allowed given the
 * existing mounted filesystem at @sb.  @sb is the superblock being compared.
 *
 * Return: Returns 0 if options are compatible.
 */
int security_sb_mnt_opts_compat(struct super_block *sb,
                                void *mnt_opts)
{
        int rc;

        /* The result of the sb_mnt_opts_compat hook is returned unchanged. */
        rc = call_int_hook(sb_mnt_opts_compat, sb, mnt_opts);
        return rc;
}
EXPORT_SYMBOL(security_sb_mnt_opts_compat);

/**
 * security_sb_remount() - Verify no incompatible mount changes during remount
 * @sb: filesystem superblock
 * @mnt_opts: (re)mount options
 *
 * Extracts security system specific mount options and verifies no changes are
 * being made to those options.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_remount(struct super_block *sb,
                        void *mnt_opts)
{
        int rc;

        /* The result of the sb_remount hook is returned unchanged. */
        rc = call_int_hook(sb_remount, sb, mnt_opts);
        return rc;
}
EXPORT_SYMBOL(security_sb_remount);

/**
 * security_sb_kern_mount() - Check if a kernel mount is allowed
 * @sb: filesystem superblock
 *
 * Mount this @sb if allowed by permissions.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_kern_mount(const struct super_block *sb)
{
        int rc;

        /* The result of the sb_kern_mount hook is returned unchanged. */
        rc = call_int_hook(sb_kern_mount, sb);
        return rc;
}

/**
 * security_sb_show_options() - Output the mount options for a superblock
 * @m: output file
 * @sb: filesystem superblock
 *
 * Show (print on @m) mount options for this @sb.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_sb_show_options(struct seq_file *m, struct super_block *sb)
{
        int rc;

        /* Pass the seq_file and the superblock to the sb_show_options hook. */
        rc = call_int_hook(sb_show_options, m, sb);
        return rc;
}

/**
 * security_sb_statfs() - Check if accessing fs stats is allowed
 * @dentry: superblock handle
 *
 * Check permission before obtaining filesystem statistics for the mounted
 * filesystem.  @dentry is a handle on the superblock for the filesystem.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_statfs(struct dentry *dentry)
{
        int rc;

        /* The result of the sb_statfs hook is returned unchanged. */
        rc = call_int_hook(sb_statfs, dentry);
        return rc;
}

/**
 * security_sb_mount() - Check permission for mounting a filesystem
 * @dev_name: filesystem backing device
 * @path: mount point
 * @type: filesystem type
 * @flags: mount flags
 * @data: filesystem specific data
 *
 * Check permission before an object specified by @dev_name is mounted on the
 * mount point named by @path.  For an ordinary mount, @dev_name identifies a
 * device if the file system type requires a device.  For a remount
 * (@flags & MS_REMOUNT), @dev_name is irrelevant.  For a loopback/bind mount
 * (@flags & MS_BIND), @dev_name identifies the pathname of the object being
 * mounted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_mount(const char *dev_name, const struct path *path,
                      const char *type, unsigned long flags, void *data)
{
        int rc;

        /* All five mount arguments are forwarded to the sb_mount hook as-is. */
        rc = call_int_hook(sb_mount, dev_name, path, type, flags, data);
        return rc;
}

/**
 * security_sb_umount() - Check permission for unmounting a filesystem
 * @mnt: mounted filesystem
 * @flags: unmount flags
 *
 * Check permission before the @mnt file system is unmounted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_umount(struct vfsmount *mnt, int flags)
{
        int rc;

        /* The result of the sb_umount hook is returned unchanged. */
        rc = call_int_hook(sb_umount, mnt, flags);
        return rc;
}

/**
 * security_sb_pivotroot() - Check permissions for pivoting the rootfs
 * @old_path: new location for current rootfs
 * @new_path: location of the new rootfs
 *
 * Check permission before pivoting the root filesystem.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_pivotroot(const struct path *old_path,
                          const struct path *new_path)
{
        int rc;

        /* The result of the sb_pivotroot hook is returned unchanged. */
        rc = call_int_hook(sb_pivotroot, old_path, new_path);
        return rc;
}

/**
 * security_sb_set_mnt_opts() - Set the mount options for a filesystem
 * @sb: filesystem superblock
 * @mnt_opts: binary mount options
 * @kern_flags: kernel flags (in)
 * @set_kern_flags: kernel flags (out)
 *
 * Set the security relevant mount options used for a superblock.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sb_set_mnt_opts(struct super_block *sb,
                             void *mnt_opts,
                             unsigned long kern_flags,
                             unsigned long *set_kern_flags)
{
        struct security_hook_list *cur;
        int rc;

        /*
         * Result when no hook is registered: options that nobody can
         * consume yield -EOPNOTSUPP, otherwise the hook's default value.
         */
        if (mnt_opts)
                rc = -EOPNOTSUPP;
        else
                rc = LSM_RET_DEFAULT(sb_set_mnt_opts);

        /* The first hook returning something other than the default wins. */
        hlist_for_each_entry(cur, &security_hook_heads.sb_set_mnt_opts,
                             list) {
                rc = cur->hook.sb_set_mnt_opts(sb, mnt_opts, kern_flags,
                                               set_kern_flags);
                if (rc != LSM_RET_DEFAULT(sb_set_mnt_opts))
                        return rc;
        }
        return rc;
}
EXPORT_SYMBOL(security_sb_set_mnt_opts);

/**
 * security_sb_clone_mnt_opts() - Duplicate superblock mount options
 * @oldsb: source superblock
 * @newsb: destination superblock
 * @kern_flags: kernel flags (in)
 * @set_kern_flags: kernel flags (out)
 *
 * Copy all security options from a given superblock to another.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sb_clone_mnt_opts(const struct super_block *oldsb,
                               struct super_block *newsb,
                               unsigned long kern_flags,
                               unsigned long *set_kern_flags)
{
        int rc;

        /* Forward both superblocks and the flag pair to the hook. */
        rc = call_int_hook(sb_clone_mnt_opts, oldsb, newsb,
                           kern_flags, set_kern_flags);
        return rc;
}
EXPORT_SYMBOL(security_sb_clone_mnt_opts);

/**
 * security_move_mount() - Check permissions for moving a mount
 * @from_path: source mount point
 * @to_path: destination mount point
 *
 * Check permission before a mount is moved.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_move_mount(const struct path *from_path,
                        const struct path *to_path)
{
        int rc;

        /* The result of the move_mount hook is returned unchanged. */
        rc = call_int_hook(move_mount, from_path, to_path);
        return rc;
}

/**
 * security_path_notify() - Check if setting a watch is allowed
 * @path: file path
 * @mask: event mask
 * @obj_type: file path type
 *
 * Check permissions before setting a watch on events as defined by @mask, on
 * an object at @path, whose type is defined by @obj_type.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_notify(const struct path *path, u64 mask,
                         unsigned int obj_type)
{
        int rc;

        /* The result of the path_notify hook is returned unchanged. */
        rc = call_int_hook(path_notify, path, mask, obj_type);
        return rc;
}

/**
 * security_inode_alloc() - Allocate an inode LSM blob
 * @inode: the inode
 *
 * Allocate and attach a security structure to @inode->i_security.  The
 * i_security field is initialized to NULL when the inode structure is
 * allocated.
 *
 * Return: Returns 0 if operation was successful.
 */
int security_inode_alloc(struct inode *inode)
{
        int err;

        /* Nothing to undo if the blob allocation itself fails. */
        err = lsm_inode_alloc(inode);
        if (unlikely(err))
                return err;

        err = call_int_hook(inode_alloc_security, inode);
        if (likely(!err))
                return 0;

        /* A hook refused: release what was set up above. */
        security_inode_free(inode);
        return err;
}

static void inode_free_by_rcu(struct rcu_head *head)
{
        /*
         * The rcu_head sits at the very start of the inode blob, so its
         * address is also the address of the object returned to the cache.
         */
        void *blob = head;

        kmem_cache_free(lsm_inode_cache, blob);
}

/**
 * security_inode_free() - Free an inode's LSM blob
 * @inode: the inode
 *
 * Deallocate the inode security structure.  The blob is freed after an RCU
 * grace period and the @inode->i_security pointer is left intact.
 */
void security_inode_free(struct inode *inode)
{
        struct rcu_head *blob_head;

        call_void_hook(inode_free_security, inode);

        /*
         * A path walk may still reference the inode, so a call to
         * security_inode_permission() can arrive after the
         * inode_free_security() hooks have run.  Ideally the VFS would
         * not do this, but fixing that is a much harder job.  For now
         * the blob is freed through RCU and inode->i_security is
         * deliberately left pointing at it.  The inode itself is freed
         * after the RCU grace period too.
         */
        blob_head = (struct rcu_head *)inode->i_security;
        if (!blob_head)
                return;
        call_rcu(blob_head, inode_free_by_rcu);
}

/**
 * security_dentry_init_security() - Perform dentry initialization
 * @dentry: the dentry to initialize
 * @mode: mode used to determine resource type
 * @name: name of the last path component
 * @xattr_name: name of the security/LSM xattr
 * @ctx: pointer to the resulting LSM context
 * @ctxlen: length of @ctx
 *
 * Compute a context for a dentry as the inode is not yet available since NFSv4
 * has no label backed by an EA anyway.  It is important to note that
 * @xattr_name does not need to be free'd by the caller, it is a static string.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_dentry_init_security(struct dentry *dentry, int mode,
                                  const struct qstr *name,
                                  const char **xattr_name, void **ctx,
                                  u32 *ctxlen)
{
        int rc;

        /* The out-parameters are handed to the hook untouched. */
        rc = call_int_hook(dentry_init_security, dentry, mode, name,
                           xattr_name, ctx, ctxlen);
        return rc;
}
EXPORT_SYMBOL(security_dentry_init_security);

/**
 * security_dentry_create_files_as() - Perform dentry initialization
 * @dentry: the dentry to initialize
 * @mode: mode used to determine resource type
 * @name: name of the last path component
 * @old: creds to use for LSM context calculations
 * @new: creds to modify
 *
 * Compute a context for a dentry as the inode is not yet available and set
 * that context in passed in creds so that new files are created using that
 * context. Context is calculated using the passed in creds and not the creds
 * of the caller.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_dentry_create_files_as(struct dentry *dentry, int mode,
                                    struct qstr *name,
                                    const struct cred *old, struct cred *new)
{
        int rc;

        /* Both credential sets are forwarded to the hook as given. */
        rc = call_int_hook(dentry_create_files_as, dentry, mode,
                           name, old, new);
        return rc;
}
EXPORT_SYMBOL(security_dentry_create_files_as);

/**
 * security_inode_init_security() - Initialize an inode's LSM context
 * @inode: the inode
 * @dir: parent directory
 * @qstr: last component of the pathname
 * @initxattrs: callback function to write xattrs
 * @fs_data: filesystem specific data
 *
 * Obtain the security attribute name suffix and value to set on a newly
 * created inode and set up the incore security field for the new inode.  This
 * hook is called by the fs code as part of the inode creation transaction and
 * provides for atomic labeling of the inode, unlike the post_create/mkdir/...
 * hooks called by the VFS.
 *
 * The hook function is expected to populate the xattrs array, by calling
 * lsm_get_xattr_slot() to retrieve the slots reserved by the security module
 * with the lbs_xattr_count field of the lsm_blob_sizes structure.  For each
 * slot, the hook function should set ->name to the attribute name suffix
 * (e.g. selinux), to allocate ->value (will be freed by the caller) and set it
 * to the attribute value, to set ->value_len to the length of the value.  If
 * the security module does not use security attributes or does not wish to put
 * a security attribute on this particular inode, then it should return
 * -EOPNOTSUPP to skip this processing.
 *
 * Return: Returns 0 if the LSM successfully initialized all of the inode
 *         security attributes that are required, negative values otherwise.
 */
int security_inode_init_security(struct inode *inode, struct inode *dir,
                                 const struct qstr *qstr,
                                 const initxattrs initxattrs, void *fs_data)
{
        struct security_hook_list *cur;
        struct xattr *slots = NULL;
        int nr_filled = 0;
        int rc = -EOPNOTSUPP;

        /* Private inodes, or no LSM reserving xattr slots: nothing to do. */
        if (unlikely(IS_PRIVATE(inode)) || !blob_sizes.lbs_xattr_count)
                return 0;

        if (initxattrs) {
                /* One extra zeroed entry serves as the terminator. */
                slots = kcalloc(blob_sizes.lbs_xattr_count + 1,
                                sizeof(*slots), GFP_NOFS);
                if (!slots)
                        return -ENOMEM;
        }

        hlist_for_each_entry(cur, &security_hook_heads.inode_init_security,
                             list) {
                rc = cur->hook.inode_init_security(inode, dir, qstr, slots,
                                                   &nr_filled);
                /*
                 * As documented in lsm_hooks.h, -EOPNOTSUPP here means the
                 * LSM does not want to provide an xattr, not that it hit
                 * an error, so the remaining LSMs are still invoked.
                 */
                if (rc != 0 && rc != -EOPNOTSUPP)
                        goto cleanup;
        }

        /* If initxattrs() is NULL, nr_filled is zero and the call is skipped. */
        if (nr_filled)
                rc = initxattrs(inode, slots, fs_data);

cleanup:
        /* Release the values from the last filled slot down to the first. */
        while (nr_filled > 0) {
                nr_filled--;
                kfree(slots[nr_filled].value);
        }
        kfree(slots);

        if (rc == -EOPNOTSUPP)
                return 0;
        return rc;
}
EXPORT_SYMBOL(security_inode_init_security);

/**
 * security_inode_init_security_anon() - Initialize an anonymous inode
 * @inode: the inode
 * @name: the anonymous inode class
 * @context_inode: an optional related inode
 *
 * Set up the incore security field for the new anonymous inode and return
 * whether the inode creation is permitted by the security module or not.
 *
 * Return: Returns 0 on success, -EACCES if the security module denies the
 * creation of this inode, or another -errno upon other errors.
 */
int security_inode_init_security_anon(struct inode *inode,
                                      const struct qstr *name,
                                      const struct inode *context_inode)
{
        int rc;

        /* The result of the inode_init_security_anon hook is returned as-is. */
        rc = call_int_hook(inode_init_security_anon, inode, name,
                           context_inode);
        return rc;
}

#ifdef CONFIG_SECURITY_PATH
/**
 * security_path_mknod() - Check if creating a special file is allowed
 * @dir: parent directory
 * @dentry: new file
 * @mode: new file mode
 * @dev: device number
 *
 * Check permissions when creating a file. Note that this hook is called even
 * if mknod operation is being done for a regular file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_mknod(const struct path *dir, struct dentry *dentry,
                        umode_t mode, unsigned int dev)
{
        struct inode *dir_inode = d_backing_inode(dir->dentry);

        /* A private parent directory bypasses the hook entirely. */
        if (unlikely(IS_PRIVATE(dir_inode)))
                return 0;
        return call_int_hook(path_mknod, dir, dentry, mode, dev);
}
EXPORT_SYMBOL(security_path_mknod);

/**
 * security_path_post_mknod() - Update inode security after reg file creation
 * @idmap: idmap of the mount
 * @dentry: new file
 *
 * Update inode security field after a regular file has been created.
 */
void security_path_post_mknod(struct mnt_idmap *idmap, struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Private inodes are skipped; the hook is not called for them. */
        if (unlikely(IS_PRIVATE(inode)))
                return;
        call_void_hook(path_post_mknod, idmap, dentry);
}

/**
 * security_path_mkdir() - Check if creating a new directory is allowed
 * @dir: parent directory
 * @dentry: new directory
 * @mode: new directory mode
 *
 * Check permissions to create a new directory in the existing directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_mkdir(const struct path *dir, struct dentry *dentry,
                        umode_t mode)
{
        struct inode *dir_inode = d_backing_inode(dir->dentry);

        /* A private parent directory bypasses the hook entirely. */
        if (unlikely(IS_PRIVATE(dir_inode)))
                return 0;
        return call_int_hook(path_mkdir, dir, dentry, mode);
}
EXPORT_SYMBOL(security_path_mkdir);

/**
 * security_path_rmdir() - Check if removing a directory is allowed
 * @dir: parent directory
 * @dentry: directory to remove
 *
 * Check the permission to remove a directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_rmdir(const struct path *dir, struct dentry *dentry)
{
        struct inode *dir_inode = d_backing_inode(dir->dentry);

        /* A private parent directory bypasses the hook entirely. */
        if (unlikely(IS_PRIVATE(dir_inode)))
                return 0;
        return call_int_hook(path_rmdir, dir, dentry);
}

/**
 * security_path_unlink() - Check if removing a hard link is allowed
 * @dir: parent directory
 * @dentry: file
 *
 * Check the permission to remove a hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_unlink(const struct path *dir, struct dentry *dentry)
{
        struct inode *dir_inode = d_backing_inode(dir->dentry);

        /* A private parent directory bypasses the hook entirely. */
        if (unlikely(IS_PRIVATE(dir_inode)))
                return 0;
        return call_int_hook(path_unlink, dir, dentry);
}
EXPORT_SYMBOL(security_path_unlink);

/**
 * security_path_symlink() - Check if creating a symbolic link is allowed
 * @dir: parent directory
 * @dentry: symbolic link
 * @old_name: file pathname
 *
 * Check the permission to create a symbolic link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_symlink(const struct path *dir, struct dentry *dentry,
                          const char *old_name)
{
        struct inode *dir_inode = d_backing_inode(dir->dentry);

        /* A private parent directory bypasses the hook entirely. */
        if (unlikely(IS_PRIVATE(dir_inode)))
                return 0;
        return call_int_hook(path_symlink, dir, dentry, old_name);
}

/**
 * security_path_link() - Check if creating a hard link is allowed
 * @old_dentry: existing file
 * @new_dir: new parent directory
 * @new_dentry: new link
 *
 * Check permission before creating a new hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
                       struct dentry *new_dentry)
{
        struct inode *target = d_backing_inode(old_dentry);

        /* The check keys off the existing file, not the new directory. */
        if (unlikely(IS_PRIVATE(target)))
                return 0;
        return call_int_hook(path_link, old_dentry, new_dir, new_dentry);
}

/**
 * security_path_rename() - Check if renaming a file is allowed
 * @old_dir: parent directory of the old file
 * @old_dentry: the old file
 * @new_dir: parent directory of the new file
 * @new_dentry: the new file
 * @flags: flags
 *
 * Check for permission to rename a file or directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
                         const struct path *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
{
        /* A private source inode bypasses the hook. */
        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;

        /* So does a private target, when the target exists at all. */
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        return call_int_hook(path_rename, old_dir, old_dentry, new_dir,
                             new_dentry, flags);
}
EXPORT_SYMBOL(security_path_rename);

/**
 * security_path_truncate() - Check if truncating a file is allowed
 * @path: file
 *
 * Check permission before truncating the file indicated by path.  Note that
 * truncation permissions may also be checked based on already opened files,
 * using the security_file_truncate() hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_truncate(const struct path *path)
{
        struct inode *inode = d_backing_inode(path->dentry);

        /* Private inodes bypass the hook entirely. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;
        return call_int_hook(path_truncate, path);
}

/**
 * security_path_chmod() - Check if changing the file's mode is allowed
 * @path: file
 * @mode: new mode
 *
 * Check for permission to change a mode of the file @path. The new mode is
 * specified in @mode which is a bitmask of constants from
 * <include/uapi/linux/stat.h>.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_chmod(const struct path *path, umode_t mode)
{
        struct inode *inode = d_backing_inode(path->dentry);

        /* Private inodes bypass the hook entirely. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;
        return call_int_hook(path_chmod, path, mode);
}

/**
 * security_path_chown() - Check if changing the file's owner/group is allowed
 * @path: file
 * @uid: file owner
 * @gid: file group
 *
 * Check for permission to change owner/group of a file or directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
{
        struct inode *inode = d_backing_inode(path->dentry);

        /* Private inodes bypass the hook entirely. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;
        return call_int_hook(path_chown, path, uid, gid);
}

/**
 * security_path_chroot() - Check if changing the root directory is allowed
 * @path: directory
 *
 * Check for permission to change root directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_chroot(const struct path *path)
{
        int rc;

        /* No private-inode shortcut here: the hook is always consulted. */
        rc = call_int_hook(path_chroot, path);
        return rc;
}
#endif /* CONFIG_SECURITY_PATH */

/**
 * security_inode_create() - Check if creating a file is allowed
 * @dir: the parent directory
 * @dentry: the file being created
 * @mode: requested file mode
 *
 * Check permission to create a regular file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_create(struct inode *dir, struct dentry *dentry,
                          umode_t mode)
{
        /* The hook is consulted only for non-private parent directories. */
        if (likely(!IS_PRIVATE(dir)))
                return call_int_hook(inode_create, dir, dentry, mode);
        return 0;
}
EXPORT_SYMBOL_GPL(security_inode_create);

/**
 * security_inode_post_create_tmpfile() - Update inode security of new tmpfile
 * @idmap: idmap of the mount
 * @inode: inode of the new tmpfile
 *
 * Update inode security data after a tmpfile has been created.
 */
void security_inode_post_create_tmpfile(struct mnt_idmap *idmap,
                                        struct inode *inode)
{
        /* The hook is invoked only for non-private inodes. */
        if (likely(!IS_PRIVATE(inode)))
                call_void_hook(inode_post_create_tmpfile, idmap, inode);
}

/**
 * security_inode_link() - Check if creating a hard link is allowed
 * @old_dentry: existing file
 * @dir: new parent directory
 * @new_dentry: new link
 *
 * Check permission before creating a new hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_link(struct dentry *old_dentry, struct inode *dir,
                        struct dentry *new_dentry)
{
        struct inode *target = d_backing_inode(old_dentry);

        /* The check keys off the existing file, not the new directory. */
        if (unlikely(IS_PRIVATE(target)))
                return 0;
        return call_int_hook(inode_link, old_dentry, dir, new_dentry);
}

/**
 * security_inode_unlink() - Check if removing a hard link is allowed
 * @dir: parent directory
 * @dentry: file
 *
 * Check the permission to remove a hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_backing_inode(dentry);

        /* The check keys off the file being unlinked, not its parent. */
        if (unlikely(IS_PRIVATE(victim)))
                return 0;
        return call_int_hook(inode_unlink, dir, dentry);
}

/**
 * security_inode_symlink() - Check if creating a symbolic link is allowed
 * @dir: parent directory
 * @dentry: symbolic link
 * @old_name: existing filename
 *
 * Check the permission to create a symbolic link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_symlink(struct inode *dir, struct dentry *dentry,
                           const char *old_name)
{
        /* The hook is consulted only for non-private parent directories. */
        if (likely(!IS_PRIVATE(dir)))
                return call_int_hook(inode_symlink, dir, dentry, old_name);
        return 0;
}

/**
 * security_inode_mkdir() - Check if creating a new directory is allowed
 * @dir: parent directory
 * @dentry: new directory
 * @mode: new directory mode
 *
 * Check permissions to create a new directory in the existing directory
 * associated with inode structure @dir.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        /* The hook is consulted only for non-private parent directories. */
        if (likely(!IS_PRIVATE(dir)))
                return call_int_hook(inode_mkdir, dir, dentry, mode);
        return 0;
}
EXPORT_SYMBOL_GPL(security_inode_mkdir);

/**
 * security_inode_rmdir() - Check if removing a directory is allowed
 * @dir: parent directory
 * @dentry: directory to be removed
 *
 * Check the permission to remove a directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_backing_inode(dentry);

        /* The check keys off the directory being removed, not its parent. */
        if (unlikely(IS_PRIVATE(victim)))
                return 0;
        return call_int_hook(inode_rmdir, dir, dentry);
}

/**
 * security_inode_mknod() - Check if creating a special file is allowed
 * @dir: parent directory
 * @dentry: new file
 * @mode: new file mode
 * @dev: device number
 *
 * Check permissions when creating a special file (or a socket or a fifo file
 * created via the mknod system call).  Note that if mknod operation is being
 * done for a regular file, then the create hook will be called and not this
 * hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_mknod(struct inode *dir, struct dentry *dentry,
                         umode_t mode, dev_t dev)
{
        /* The hook is consulted only for non-private parent directories. */
        if (likely(!IS_PRIVATE(dir)))
                return call_int_hook(inode_mknod, dir, dentry, mode, dev);
        return 0;
}

/**
 * security_inode_rename() - Check if renaming a file is allowed
 * @old_dir: parent directory of the old file
 * @old_dentry: the old file
 * @new_dir: parent directory of the new file
 * @new_dentry: the new file
 * @flags: flags
 *
 * Check for permission to rename a file or directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
                          struct inode *new_dir, struct dentry *new_dentry,
                          unsigned int flags)
{
        int rc = 0;

        /* A private source inode bypasses the hook. */
        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;

        /* So does a private target, when the target exists at all. */
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        /*
         * For an exchange the hook is also called with the roles
         * swapped, and that reverse call comes first.
         */
        if (flags & RENAME_EXCHANGE)
                rc = call_int_hook(inode_rename, new_dir, new_dentry,
                                   old_dir, old_dentry);

        /* The forward direction is checked only if nothing failed so far. */
        if (!rc)
                rc = call_int_hook(inode_rename, old_dir, old_dentry,
                                   new_dir, new_dentry);
        return rc;
}

/**
 * security_inode_readlink() - Check if reading a symbolic link is allowed
 * @dentry: link
 *
 * Check the permission to read the symbolic link.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_readlink(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Private inodes bypass the hook entirely. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;
        return call_int_hook(inode_readlink, dentry);
}

/**
 * security_inode_follow_link() - Check if following a symbolic link is allowed
 * @dentry: link dentry
 * @inode: link inode
 * @rcu: true if in RCU-walk mode
 *
 * Check permission to follow a symbolic link when looking up a pathname.  If
 * @rcu is true, @inode is not stable.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
                               bool rcu)
{
        /* The hook is consulted only for non-private inodes. */
        if (likely(!IS_PRIVATE(inode)))
                return call_int_hook(inode_follow_link, dentry, inode, rcu);
        return 0;
}

/**
 * security_inode_permission() - Check if accessing an inode is allowed
 * @inode: inode
 * @mask: access mask
 *
 * Check permission before accessing an inode.  This hook is called by the
 * existing Linux permission function, so a security module can use it to
 * provide additional checking for existing Linux permission checks.  Notice
 * that this hook is called when a file is opened (as well as many other
 * operations), whereas the file_security_ops permission hook is called when
 * the actual read/write operations are performed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_permission(struct inode *inode, int mask)
{
        /* The hook is consulted only for non-private inodes. */
        if (likely(!IS_PRIVATE(inode)))
                return call_int_hook(inode_permission, inode, mask);
        return 0;
}

/**
 * security_inode_setattr() - Check if setting file attributes is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @attr: new attributes
 *
 * Check permission before setting file attributes.  Note that the kernel call
 * to notify_change is performed from several locations, whenever file
 * attributes change (such as when a file is truncated, chown/chmod operations,
 * transferring disk quotas, etc).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_setattr(struct mnt_idmap *idmap,
                           struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Private inodes bypass the hook entirely. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;
        return call_int_hook(inode_setattr, idmap, dentry, attr);
}
EXPORT_SYMBOL_GPL(security_inode_setattr);

/**
 * security_inode_post_setattr() - Update the inode after a setattr operation
 * @idmap: idmap of the mount
 * @dentry: file
 * @ia_valid: file attributes set
 *
 * Update inode security field after successful setting file attributes.
 */
void security_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                                 int ia_valid)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Private inodes are skipped; the hook is not called for them. */
        if (unlikely(IS_PRIVATE(inode)))
                return;
        call_void_hook(inode_post_setattr, idmap, dentry, ia_valid);
}

/**
 * security_inode_getattr() - Check if getting file attributes is allowed
 * @path: file
 *
 * Check permission before obtaining file attributes.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_getattr(const struct path *path)
{
        struct inode *inode = d_backing_inode(path->dentry);

        /* Private inodes bypass the hook entirely. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;
        return call_int_hook(inode_getattr, path);
}

/**
 * security_inode_setxattr() - Check if setting file xattrs is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @name: xattr name
 * @value: xattr value
 * @size: size of xattr value
 * @flags: flags
 *
 * Check permission before setting the extended attributes.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_setxattr(struct mnt_idmap *idmap,
                            struct dentry *dentry, const char *name,
                            const void *value, size_t size, int flags)
{
        struct inode *inode = d_backing_inode(dentry);
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        /*
         * SELinux and Smack integrate the cap call, so it is assumed
         * that every LSM supplying this hook does the same.
         */
        rc = call_int_hook(inode_setxattr, idmap, dentry, name, value, size,
                           flags);
        if (rc != 1)
                return rc;

        /*
         * A result of 1 falls back to the capability check.
         * NOTE(review): presumably 1 is this hook's default, i.e. no LSM
         * handled the request -- confirm against lsm_hook_defs.h.
         */
        return cap_inode_setxattr(dentry, name, value, size, flags);
}

/**
 * security_inode_set_acl() - Check if setting posix acls is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 * @kacl: acl struct
 *
 * Check permission before setting posix acls, the posix acls in @kacl are
 * identified by @acl_name.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_set_acl(struct mnt_idmap *idmap,
                           struct dentry *dentry, const char *acl_name,
                           struct posix_acl *kacl)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Private inodes are granted access without consulting the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_set_acl, idmap, dentry, acl_name, kacl);
}

/**
 * security_inode_post_set_acl() - Update inode security from posix acls set
 * @dentry: file
 * @acl_name: acl name
 * @kacl: acl struct
 *
 * Update inode security data after successfully setting posix acls on @dentry.
 * The posix acls in @kacl are identified by @acl_name.
 */
void security_inode_post_set_acl(struct dentry *dentry, const char *acl_name,
                                 struct posix_acl *kacl)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged private never reach the inode_post_set_acl hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return;

        call_void_hook(inode_post_set_acl, dentry, acl_name, kacl);
}

/**
 * security_inode_get_acl() - Check if reading posix acls is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Check permission before getting posix acls, the posix acls are identified by
 * @acl_name.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_get_acl(struct mnt_idmap *idmap,
                           struct dentry *dentry, const char *acl_name)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Private inodes are granted access without consulting the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_get_acl, idmap, dentry, acl_name);
}

/**
 * security_inode_remove_acl() - Check if removing a posix acl is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Check permission before removing posix acls, the posix acls are identified
 * by @acl_name.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_remove_acl(struct mnt_idmap *idmap,
                              struct dentry *dentry, const char *acl_name)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Private inodes are granted access without consulting the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_remove_acl, idmap, dentry, acl_name);
}

/**
 * security_inode_post_remove_acl() - Update inode security after rm posix acls
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Update inode security data after successfully removing posix acls on
 * @dentry in @idmap. The posix acls are identified by @acl_name.
 */
void security_inode_post_remove_acl(struct mnt_idmap *idmap,
                                    struct dentry *dentry, const char *acl_name)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged private never reach the inode_post_remove_acl hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return;

        call_void_hook(inode_post_remove_acl, idmap, dentry, acl_name);
}

/**
 * security_inode_post_setxattr() - Update the inode after a setxattr operation
 * @dentry: file
 * @name: xattr name
 * @value: xattr value
 * @size: xattr value size
 * @flags: flags
 *
 * Update inode security field after successful setxattr operation.
 */
void security_inode_post_setxattr(struct dentry *dentry, const char *name,
                                  const void *value, size_t size, int flags)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged private never reach the inode_post_setxattr hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return;

        call_void_hook(inode_post_setxattr, dentry, name, value, size, flags);
}

/**
 * security_inode_getxattr() - Check if xattr access is allowed
 * @dentry: file
 * @name: xattr name
 *
 * Check permission before obtaining the extended attributes identified by
 * @name for @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_getxattr(struct dentry *dentry, const char *name)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Private inodes are granted access without consulting the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_getxattr, dentry, name);
}

/**
 * security_inode_listxattr() - Check if listing xattrs is allowed
 * @dentry: file
 *
 * Check permission before obtaining the list of extended attribute names for
 * @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_listxattr(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Private inodes are granted access without consulting the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        return call_int_hook(inode_listxattr, dentry);
}

/**
 * security_inode_removexattr() - Check if removing an xattr is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @name: xattr name
 *
 * Check permission before removing the extended attribute identified by @name
 * for @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_removexattr(struct mnt_idmap *idmap,
                               struct dentry *dentry, const char *name)
{
        struct inode *inode = d_backing_inode(dentry);
        int rc;

        /* Private inodes are granted access without consulting the hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        rc = call_int_hook(inode_removexattr, idmap, dentry, name);

        /*
         * SELinux and Smack integrate the cap call, so every LSM supplying
         * this hook is assumed to do the same.  Only a hook result of 1
         * makes this function run cap_inode_removexattr() itself; any other
         * value is returned as the final answer.
         */
        if (rc != 1)
                return rc;

        return cap_inode_removexattr(idmap, dentry, name);
}

/**
 * security_inode_post_removexattr() - Update the inode after a removexattr op
 * @dentry: file
 * @name: xattr name
 *
 * Update the inode after a successful removexattr operation.
 */
void security_inode_post_removexattr(struct dentry *dentry, const char *name)
{
        struct inode *inode = d_backing_inode(dentry);

        /* Inodes flagged private never reach the inode_post_removexattr hook. */
        if (unlikely(IS_PRIVATE(inode)))
                return;

        call_void_hook(inode_post_removexattr, dentry, name);
}

/**
 * security_inode_need_killpriv() - Check if security_inode_killpriv() required
 * @dentry: associated dentry
 *
 * Called when an inode has been changed to determine if
 * security_inode_killpriv() should be called.
 *
 * Return: Return <0 on error to abort the inode change operation, return 0 if
 *         security_inode_killpriv() does not need to be called, return >0 if
 *         security_inode_killpriv() does need to be called.
 */
int security_inode_need_killpriv(struct dentry *dentry)
{
        /*
         * Hand @dentry to the inode_need_killpriv hook; the hook result is
         * returned to the caller unchanged.
         */
        return call_int_hook(inode_need_killpriv, dentry);
}

/**
 * security_inode_killpriv() - The setuid bit is removed, update LSM state
 * @idmap: idmap of the mount
 * @dentry: associated dentry
 *
 * The @dentry's setuid bit is being removed.  Remove similar security labels.
 * Called with the dentry->d_inode->i_mutex held.
 *
 * Return: Return 0 on success.  If error is returned, then the operation
 *         causing setuid bit removal is failed.
 */
int security_inode_killpriv(struct mnt_idmap *idmap,
                            struct dentry *dentry)
{
        /*
         * Hand @idmap and @dentry to the inode_killpriv hook; the hook result
         * is returned to the caller unchanged.
         */
        return call_int_hook(inode_killpriv, idmap, dentry);
}

/**
 * security_inode_getsecurity() - Get the xattr security label of an inode
 * @idmap: idmap of the mount
 * @inode: inode
 * @name: xattr name
 * @buffer: security label buffer
 * @alloc: allocation flag
 *
 * Retrieve a copy of the extended attribute representation of the security
 * label associated with @name for @inode via @buffer.  Note that @name is the
 * remainder of the attribute name after the security prefix has been removed.
 * @alloc is used to specify if the call should return a value via the buffer
 * or just the value length.
 *
 * Return: Returns size of buffer on success.
 */
int security_inode_getsecurity(struct mnt_idmap *idmap,
                               struct inode *inode, const char *name,
                               void **buffer, bool alloc)
{
        /* Private inodes yield the hook's default value without a hook call. */
        int rc = LSM_RET_DEFAULT(inode_getsecurity);

        if (likely(!IS_PRIVATE(inode)))
                rc = call_int_hook(inode_getsecurity, idmap, inode, name,
                                   buffer, alloc);
        return rc;
}

/**
 * security_inode_setsecurity() - Set the xattr security label of an inode
 * @inode: inode
 * @name: xattr name
 * @value: security label
 * @size: length of security label
 * @flags: flags
 *
 * Set the security label associated with @name for @inode from the extended
 * attribute value @value.  @size indicates the size of the @value in bytes.
 * @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. Note that @name is the
 * remainder of the attribute name after the security. prefix has been removed.
 *
 * Return: Returns 0 on success.
 */
int security_inode_setsecurity(struct inode *inode, const char *name,
                               const void *value, size_t size, int flags)
{
        /* Private inodes yield the hook's default value without a hook call. */
        int rc = LSM_RET_DEFAULT(inode_setsecurity);

        if (likely(!IS_PRIVATE(inode)))
                rc = call_int_hook(inode_setsecurity, inode, name, value, size,
                                   flags);
        return rc;
}

/**
 * security_inode_listsecurity() - List the xattr security label names
 * @inode: inode
 * @buffer: buffer
 * @buffer_size: size of buffer
 *
 * Copy the extended attribute names for the security labels associated with
 * @inode into @buffer.  The maximum size of @buffer is specified by
 * @buffer_size.  @buffer may be NULL to request the size of the buffer
 * required.
 *
 * Return: Returns number of bytes used/required on success.
 */
int security_inode_listsecurity(struct inode *inode,
                                char *buffer, size_t buffer_size)
{
        /* Private inodes report 0 without the hook being called. */
        int rc = 0;

        if (likely(!IS_PRIVATE(inode)))
                rc = call_int_hook(inode_listsecurity, inode, buffer,
                                   buffer_size);
        return rc;
}
EXPORT_SYMBOL(security_inode_listsecurity);

/**
 * security_inode_getsecid() - Get an inode's secid
 * @inode: inode
 * @secid: secid to return
 *
 * Get the secid associated with the inode.  In case of failure, @secid will be
 * set to zero.
 */
void security_inode_getsecid(struct inode *inode, u32 *secid)
{
        /*
         * Zero @secid before dispatching so the documented "zero on failure"
         * contract holds even if the inode_getsecid hook never writes to it.
         * This mirrors security_cred_getsecid() and the task secid helpers.
         */
        *secid = 0;
        call_void_hook(inode_getsecid, inode, secid);
}

/**
 * security_inode_copy_up() - Create new creds for an overlayfs copy-up op
 * @src: union dentry of copy-up file
 * @new: newly created creds
 *
 * A file is about to be copied up from lower layer to upper layer of overlay
 * filesystem. Security module can prepare a set of new creds and modify as
 * need be and return new creds. Caller will switch to new creds temporarily to
 * create new file and release newly allocated creds.
 *
 * Return: Returns 0 on success or a negative error code on error.
 */
int security_inode_copy_up(struct dentry *src, struct cred **new)
{
        /*
         * Hand @src and @new to the inode_copy_up hook; the hook result is
         * returned to the caller unchanged.
         */
        return call_int_hook(inode_copy_up, src, new);
}
EXPORT_SYMBOL(security_inode_copy_up);

/**
 * security_inode_copy_up_xattr() - Filter xattrs in an overlayfs copy-up op
 * @src: union dentry of copy-up file
 * @name: xattr name
 *
 * Filter the xattrs being copied up when a unioned file is copied up from a
 * lower layer to the union/overlay layer.   The caller is responsible for
 * reading and writing the xattrs, this hook is merely a filter.
 *
 * Return: Returns 0 to accept the xattr, 1 to discard the xattr, -EOPNOTSUPP
 *         if the security module does not know about attribute, or a negative
 *         error code to abort the copy up.
 */
int security_inode_copy_up_xattr(struct dentry *src, const char *name)
{
        /*
         * The implementation can return 0 (accept the xattr), 1 (discard the
         * xattr), -EOPNOTSUPP if it does not know anything about the xattr or
         * any other error code in case of an error.
         *
         * Whether or not the result equals the hook's default value, that
         * same value is what the caller receives, so it is returned directly.
         */
        return call_int_hook(inode_copy_up_xattr, src, name);
}
EXPORT_SYMBOL(security_inode_copy_up_xattr);

/**
 * security_kernfs_init_security() - Init LSM context for a kernfs node
 * @kn_dir: parent kernfs node
 * @kn: the kernfs node to initialize
 *
 * Initialize the security context of a newly created kernfs node based on its
 * own and its parent's attributes.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernfs_init_security(struct kernfs_node *kn_dir,
                                  struct kernfs_node *kn)
{
        /*
         * Hand both kernfs nodes to the kernfs_init_security hook; the hook
         * result is returned to the caller unchanged.
         */
        return call_int_hook(kernfs_init_security, kn_dir, kn);
}

/**
 * security_file_permission() - Check file permissions
 * @file: file
 * @mask: requested permissions
 *
 * Check file permissions before accessing an open file.  This hook is called
 * by various operations that read or write files.  A security module can use
 * this hook to perform additional checking on these operations, e.g. to
 * revalidate permissions on use to support privilege bracketing or policy
 * changes.  Notice that this hook is used when the actual read/write
 * operations are performed, whereas the inode_security_ops hook is called when
 * a file is opened (as well as many other operations).  Although this hook can
 * be used to revalidate permissions for various system call operations that
 * read or write files, it does not address the revalidation of permissions for
 * memory-mapped files.  Security modules must handle this separately if they
 * need such revalidation.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_permission(struct file *file, int mask)
{
        /*
         * Hand @file and @mask to the file_permission hook; the hook result
         * is returned to the caller unchanged.
         */
        return call_int_hook(file_permission, file, mask);
}

/**
 * security_file_alloc() - Allocate and init a file's LSM blob
 * @file: the file
 *
 * Allocate and attach a security structure to the file->f_security field.  The
 * security field is initialized to NULL when the structure is first created.
 *
 * Return: Return 0 if the hook is successful and permission is granted.
 */
int security_file_alloc(struct file *file)
{
        int rc;

        /* lsm_file_alloc() has to succeed before any hook is consulted. */
        rc = lsm_file_alloc(file);
        if (rc)
                return rc;

        rc = call_int_hook(file_alloc_security, file);
        if (unlikely(rc)) {
                /* The hook reported an error: undo via security_file_free(). */
                security_file_free(file);
                return rc;
        }

        return 0;
}

/**
 * security_file_release() - Perform actions before releasing the file ref
 * @file: the file
 *
 * Perform actions before releasing the last reference to a file.
 */
void security_file_release(struct file *file)
{
        /* Invoke the file_release hook for @file; nothing is returned. */
        call_void_hook(file_release, file);
}

/**
 * security_file_free() - Free a file's LSM blob
 * @file: the file
 *
 * Deallocate and free any security structures stored in file->f_security.
 */
void security_file_free(struct file *file)
{
        void *blob;

        /* Run the file_free_security hook before the blob is released. */
        call_void_hook(file_free_security, file);

        blob = file->f_security;
        if (!blob)
                return;

        /* Detach the blob from @file, then return it to lsm_file_cache. */
        file->f_security = NULL;
        kmem_cache_free(lsm_file_cache, blob);
}

/**
 * security_file_ioctl() - Check if an ioctl is allowed
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Check permission for an ioctl operation on @file.  Note that @arg sometimes
 * represents a user space pointer; in other cases, it may be a simple integer
 * value.  When @arg represents a user space pointer, it should never be used
 * by the security module.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        /*
         * Hand @file, @cmd and @arg to the file_ioctl hook; the hook result
         * is returned to the caller unchanged.
         */
        return call_int_hook(file_ioctl, file, cmd, arg);
}
EXPORT_SYMBOL_GPL(security_file_ioctl);

/**
 * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Compat version of security_file_ioctl() that correctly handles 32-bit
 * processes running on 64-bit kernels.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl_compat(struct file *file, unsigned int cmd,
                               unsigned long arg)
{
        /*
         * Hand @file, @cmd and @arg to the file_ioctl_compat hook; the hook
         * result is returned to the caller unchanged.
         */
        return call_int_hook(file_ioctl_compat, file, cmd, arg);
}
EXPORT_SYMBOL_GPL(security_file_ioctl_compat);

/*
 * Work out the protection bits for a mapping request, adding PROT_EXEC when
 * a read-only request is made by a task whose personality has
 * READ_IMPLIES_EXEC set and the backing object permits execution.
 */
static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
{
        const unsigned long rx_bits = prot & (PROT_READ | PROT_EXEC);

        /*
         * Only a request with PROT_READ and without PROT_EXEC, made by a task
         * that expects read to imply exec, is a candidate for an upgrade.
         */
        if (rx_bits != PROT_READ ||
            !(current->personality & READ_IMPLIES_EXEC))
                return prot;

        /* An anonymous mapping always gets the implied PROT_EXEC. */
        if (!file)
                return prot | PROT_EXEC;

        /* Anything on a noexec mount won't get PROT_EXEC. */
        if (path_noexec(&file->f_path))
                return prot;

#ifndef CONFIG_MMU
        /*
         * On !MMU the file must also advertise NOMMU_MAP_EXEC
         * (== VM_MAYEXEC) when it provides mmap_capabilities().
         */
        if (file->f_op->mmap_capabilities) {
                unsigned caps = file->f_op->mmap_capabilities(file);

                if (!(caps & NOMMU_MAP_EXEC))
                        return prot;
        }
#endif
        return prot | PROT_EXEC;
}

/**
 * security_mmap_file() - Check if mmap'ing a file is allowed
 * @file: file
 * @prot: protection applied by the kernel
 * @flags: flags
 *
 * Check permissions for a mmap operation.  The @file may be NULL, e.g. if
 * mapping anonymous memory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_mmap_file(struct file *file, unsigned long prot,
                       unsigned long flags)
{
        /*
         * The mmap_file hook receives both the caller-supplied @prot and the
         * value computed by mmap_prot(), which may have PROT_EXEC added.
         */
        return call_int_hook(mmap_file, file, prot, mmap_prot(file, prot),
                             flags);
}

/**
 * security_mmap_addr() - Check if mmap'ing an address is allowed
 * @addr: address
 *
 * Check permissions for a mmap operation at @addr.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_mmap_addr(unsigned long addr)
{
        /*
         * Hand @addr to the mmap_addr hook; the hook result is returned to
         * the caller unchanged.
         */
        return call_int_hook(mmap_addr, addr);
}

/**
 * security_file_mprotect() - Check if changing memory protections is allowed
 * @vma: memory region
 * @reqprot: application requested protection
 * @prot: protection applied by the kernel
 *
 * Check permissions before changing memory access permissions.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                           unsigned long prot)
{
        /*
         * Hand @vma, @reqprot and @prot to the file_mprotect hook; the hook
         * result is returned to the caller unchanged.
         */
        return call_int_hook(file_mprotect, vma, reqprot, prot);
}

/**
 * security_file_lock() - Check if a file lock is allowed
 * @file: file
 * @cmd: lock operation (e.g. F_RDLCK, F_WRLCK)
 *
 * Check permission before performing file locking operations.  Note the hook
 * mediates both flock and fcntl style locks.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_lock(struct file *file, unsigned int cmd)
{
        /*
         * Hand @file and @cmd to the file_lock hook; the hook result is
         * returned to the caller unchanged.
         */
        return call_int_hook(file_lock, file, cmd);
}

/**
 * security_file_fcntl() - Check if fcntl() op is allowed
 * @file: file
 * @cmd: fcntl command
 * @arg: command argument
 *
 * Check permission before allowing the file operation specified by @cmd from
 * being performed on the file @file.  Note that @arg sometimes represents a
 * user space pointer; in other cases, it may be a simple integer value.  When
 * @arg represents a user space pointer, it should never be used by the
 * security module.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        /*
         * Hand @file, @cmd and @arg to the file_fcntl hook; the hook result
         * is returned to the caller unchanged.
         */
        return call_int_hook(file_fcntl, file, cmd, arg);
}

/**
 * security_file_set_fowner() - Set the file owner info in the LSM blob
 * @file: the file
 *
 * Save owner security information (typically from current->security) in
 * file->f_security for later use by the send_sigiotask hook.
 *
 * This function does not return a value.
 */
void security_file_set_fowner(struct file *file)
{
        /* Invoke the file_set_fowner hook for @file. */
        call_void_hook(file_set_fowner, file);
}

/**
 * security_file_send_sigiotask() - Check if sending SIGIO/SIGURG is allowed
 * @tsk: target task
 * @fown: signal sender
 * @sig: signal to be sent, SIGIO is sent if 0
 *
 * Check permission for the file owner @fown to send SIGIO or SIGURG to the
 * process @tsk.  Note that this hook is sometimes called from interrupt.  Note
 * that the fown_struct, @fown, is never outside the context of a struct file,
 * so the file structure (and associated security information) can always be
 * obtained: container_of(fown, struct file, f_owner).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_send_sigiotask(struct task_struct *tsk,
                                 struct fown_struct *fown, int sig)
{
        /*
         * Hand @tsk, @fown and @sig to the file_send_sigiotask hook; the hook
         * result is returned to the caller unchanged.
         */
        return call_int_hook(file_send_sigiotask, tsk, fown, sig);
}

/**
 * security_file_receive() - Check if receiving a file via IPC is allowed
 * @file: file being received
 *
 * This hook allows security modules to control the ability of a process to
 * receive an open file descriptor via socket IPC.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_receive(struct file *file)
{
        /*
         * Hand @file to the file_receive hook; the hook result is returned to
         * the caller unchanged.
         */
        return call_int_hook(file_receive, file);
}

/**
 * security_file_open() - Save open() time state for later use by the LSM
 * @file: the file
 *
 * Save open-time permission checking state for later use upon file_permission,
 * and recheck access if anything has changed since inode_permission.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_open(struct file *file)
{
        int rc = call_int_hook(file_open, file);

        /*
         * A non-zero hook result ends the check; otherwise the answer comes
         * from fsnotify_open_perm().
         */
        return rc ? rc : fsnotify_open_perm(file);
}

/**
 * security_file_post_open() - Evaluate a file after it has been opened
 * @file: the file
 * @mask: access mask
 *
 * Evaluate an opened file and the access mask requested with open(). The hook
 * is useful for LSMs that require the file content to be available in order to
 * make decisions.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_post_open(struct file *file, int mask)
{
        /*
         * Hand @file and @mask to the file_post_open hook; the hook result is
         * returned to the caller unchanged.
         */
        return call_int_hook(file_post_open, file, mask);
}
EXPORT_SYMBOL_GPL(security_file_post_open);

/**
 * security_file_truncate() - Check if truncating a file is allowed
 * @file: file
 *
 * Check permission before truncating a file, i.e. using ftruncate.  Note that
 * truncation permission may also be checked based on the path, using the
 * @path_truncate hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_truncate(struct file *file)
{
        /*
         * Hand @file to the file_truncate hook; the hook result is returned
         * to the caller unchanged.
         */
        return call_int_hook(file_truncate, file);
}

/**
 * security_task_alloc() - Allocate a task's LSM blob
 * @task: the task
 * @clone_flags: flags indicating what is being shared
 *
 * Handle allocation of task-related resources.
 *
 * Return: Returns a zero on success, negative values on failure.
 */
int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
{
        int rc;

        /* lsm_task_alloc() has to succeed before any hook is consulted. */
        rc = lsm_task_alloc(task);
        if (rc)
                return rc;

        rc = call_int_hook(task_alloc, task, clone_flags);
        if (unlikely(rc)) {
                /* The hook reported an error: undo via security_task_free(). */
                security_task_free(task);
                return rc;
        }

        return 0;
}

/**
 * security_task_free() - Free a task's LSM blob and related resources
 * @task: task
 *
 * Handle release of task-related resources.  Note that this can be called from
 * interrupt context.
 */
void security_task_free(struct task_struct *task)
{
        /* Run the task_free hook while task->security is still attached. */
        call_void_hook(task_free, task);

        /* Release the blob and clear the pointer so it is not left dangling. */
        kfree(task->security);
        task->security = NULL;
}

/**
 * security_cred_alloc_blank() - Allocate the min memory to allow cred_transfer
 * @cred: credentials
 * @gfp: gfp flags
 *
 * Only allocate sufficient memory and attach to @cred such that
 * cred_transfer() will not get ENOMEM.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
{
        int rc;

        /* lsm_cred_alloc() has to succeed before any hook is consulted. */
        rc = lsm_cred_alloc(cred, gfp);
        if (rc)
                return rc;

        rc = call_int_hook(cred_alloc_blank, cred, gfp);
        if (unlikely(rc)) {
                /* The hook reported an error: undo via security_cred_free(). */
                security_cred_free(cred);
                return rc;
        }

        return 0;
}

/**
 * security_cred_free() - Free the cred's LSM blob and associated resources
 * @cred: credentials
 *
 * Deallocate and clear the cred->security field in a set of credentials.
 */
void security_cred_free(struct cred *cred)
{
        /*
         * A failure path in prepare_creds() can land here before ->security
         * was ever set; in that case there is nothing to do.
         */
        if (unlikely(!cred->security))
                return;

        /* Run the cred_free hook while the blob is still attached. */
        call_void_hook(cred_free, cred);

        kfree(cred->security);
        cred->security = NULL;
}

/**
 * security_prepare_creds() - Prepare a new set of credentials
 * @new: new credentials
 * @old: original credentials
 * @gfp: gfp flags
 *
 * Prepare a new set of credentials by copying the data from the old set.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
{
        int rc;

        /* lsm_cred_alloc() has to succeed before any hook is consulted. */
        rc = lsm_cred_alloc(new, gfp);
        if (rc)
                return rc;

        rc = call_int_hook(cred_prepare, new, old, gfp);
        if (unlikely(rc)) {
                /* The hook reported an error: undo via security_cred_free(). */
                security_cred_free(new);
                return rc;
        }

        return 0;
}

/**
 * security_transfer_creds() - Transfer creds
 * @new: target credentials
 * @old: original credentials
 *
 * Transfer data from original creds to new creds.
 */
void security_transfer_creds(struct cred *new, const struct cred *old)
{
        /* Invoke the cred_transfer hook with @new and @old; nothing is returned. */
        call_void_hook(cred_transfer, new, old);
}

/**
 * security_cred_getsecid() - Get the secid from a set of credentials
 * @c: credentials
 * @secid: secid value
 *
 * Retrieve the security identifier of the cred structure @c.  In case of
 * failure, @secid will be set to zero.
 */
void security_cred_getsecid(const struct cred *c, u32 *secid)
{
        /* Zero @secid first; the cred_getsecid hook then gets the same pointer. */
        *secid = 0;
        call_void_hook(cred_getsecid, c, secid);
}
EXPORT_SYMBOL(security_cred_getsecid);

/**
 * security_kernel_act_as() - Set the kernel credentials to act as secid
 * @new: credentials
 * @secid: secid
 *
 * Set the credentials for a kernel service to act as (subjective context).
 * The current task must be the one that nominated @secid.
 *
 * Return: Returns 0 if successful.
 */
int security_kernel_act_as(struct cred *new, u32 secid)
{
        /*
         * Hand @new and @secid to the kernel_act_as hook; the hook result is
         * returned to the caller unchanged.
         */
        return call_int_hook(kernel_act_as, new, secid);
}

/**
 * security_kernel_create_files_as() - Set file creation context using an inode
 * @new: target credentials
 * @inode: reference inode
 *
 * Set the file creation context in a set of credentials to be the same as the
 * objective context of the specified inode.  The current task must be the one
 * that nominated @inode.
 *
 * Return: Returns 0 if successful.
 */
int security_kernel_create_files_as(struct cred *new, struct inode *inode)
{
        /*
         * Hand @new and @inode to the kernel_create_files_as hook; the hook
         * result is returned to the caller unchanged.
         */
        return call_int_hook(kernel_create_files_as, new, inode);
}

/**
 * security_kernel_module_request() - Check if loading a module is allowed
 * @kmod_name: module name
 *
 * Ability to trigger the kernel to automatically upcall to userspace for
 * userspace to load a kernel module with the given name.
 *
 * Return: Returns 0 if successful.
 */
int security_kernel_module_request(char *kmod_name)
{
        /*
         * Hand @kmod_name to the kernel_module_request hook; the hook result
         * is returned to the caller unchanged.
         */
        return call_int_hook(kernel_module_request, kmod_name);
}

/**
 * security_kernel_read_file() - Read a file specified by userspace
 * @file: file
 * @id: file identifier
 * @contents: true if security_kernel_post_read_file() will be called
 *
 * Read a file specified by userspace.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_read_file(struct file *file, enum kernel_read_file_id id,
                              bool contents)
{
        /*
         * Hand @file, @id and @contents to the kernel_read_file hook; the
         * hook result is returned to the caller unchanged.
         */
        return call_int_hook(kernel_read_file, file, id, contents);
}
EXPORT_SYMBOL_GPL(security_kernel_read_file);

/**
 * security_kernel_post_read_file() - Read a file specified by userspace
 * @file: file
 * @buf: file contents
 * @size: size of file contents
 * @id: file identifier
 *
 * Read a file specified by userspace.  This must be paired with a prior call
 * to security_kernel_read_file() call that indicated this hook would also be
 * called, see security_kernel_read_file() for more information.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_post_read_file(struct file *file, char *buf, loff_t size,
                                   enum kernel_read_file_id id)
{
        /*
         * Hand @file, @buf, @size and @id to the kernel_post_read_file hook;
         * the hook result is returned to the caller unchanged.
         */
        return call_int_hook(kernel_post_read_file, file, buf, size, id);
}
EXPORT_SYMBOL_GPL(security_kernel_post_read_file);

/**
 * security_kernel_load_data() - Load data provided by userspace
 * @id: data identifier
 * @contents: true if security_kernel_post_load_data() will be called
 *
 * Load data provided by userspace.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_load_data(enum kernel_load_data_id id, bool contents)
{
        /*
         * Hand @id and @contents to the kernel_load_data hook; the hook
         * result is returned to the caller unchanged.
         */
        return call_int_hook(kernel_load_data, id, contents);
}
EXPORT_SYMBOL_GPL(security_kernel_load_data);

/**
 * security_kernel_post_load_data() - Load userspace data from a non-file source
 * @buf: data
 * @size: size of data
 * @id: data identifier
 * @description: text description of data, specific to the id value
 *
 * Load data provided by a non-file source (usually userspace buffer).  This
 * must be paired with a prior security_kernel_load_data() call that indicated
 * this hook would also be called, see security_kernel_load_data() for more
 * information.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_post_load_data(char *buf, loff_t size,
                                   enum kernel_load_data_id id,
                                   char *description)
{
        /*
         * Hand @buf, @size, @id and @description to the kernel_post_load_data
         * hook; the hook result is returned to the caller unchanged.
         */
        return call_int_hook(kernel_post_load_data, buf, size, id, description);
}
EXPORT_SYMBOL_GPL(security_kernel_post_load_data);

/**
 * security_task_fix_setuid() - Update LSM with new user id attributes
 * @new: updated credentials
 * @old: credentials being replaced
 * @flags: LSM_SETID_* flag values
 *
 * Update the module's state after setting one or more of the user identity
 * attributes of the current process.  The @flags parameter indicates which of
 * the set*uid system calls invoked this hook.  @new is the set of
 * credentials that will be installed.  Modifications should be made to this
 * rather than to @current->cred.
 *
 * Return: Returns 0 on success.
 */
int security_task_fix_setuid(struct cred *new, const struct cred *old,
                             int flags)
{
        /*
         * Hand @new, @old and @flags to the task_fix_setuid hook; the hook
         * result is returned to the caller unchanged.
         */
        return call_int_hook(task_fix_setuid, new, old, flags);
}

/**
 * security_task_fix_setgid() - Update LSM with new group id attributes
 * @new: updated credentials
 * @old: credentials being replaced
 * @flags: LSM_SETID_* flag value
 *
 * Update the module's state after setting one or more of the group identity
 * attributes of the current process.  The @flags parameter indicates which of
 * the set*gid system calls invoked this hook.  @new is the set of credentials
 * that will be installed.  Modifications should be made to this rather than to
 * @current->cred.
 *
 * Return: Returns 0 on success.
 */
int security_task_fix_setgid(struct cred *new, const struct cred *old,
                             int flags)
{
        /*
         * Hand @new, @old and @flags to the task_fix_setgid hook; the hook
         * result is returned to the caller unchanged.
         */
        return call_int_hook(task_fix_setgid, new, old, flags);
}

/**
 * security_task_fix_setgroups() - Update LSM with new supplementary groups
 * @new: updated credentials
 * @old: credentials being replaced
 *
 * Update the module's state after setting the supplementary group identity
 * attributes of the current process.  @new is the set of credentials that will
 * be installed.  Modifications should be made to this rather than to
 * @current->cred.
 *
 * Return: Returns 0 on success.
 */
int security_task_fix_setgroups(struct cred *new, const struct cred *old)
{
        /*
         * Hand @new and @old to the task_fix_setgroups hook; the hook result
         * is returned to the caller unchanged.
         */
        return call_int_hook(task_fix_setgroups, new, old);
}

/**
 * security_task_setpgid() - Check if setting the pgid is allowed
 * @p: task being modified
 * @pgid: new pgid
 *
 * Check permission before setting the process group identifier of the process
 * @p to @pgid.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setpgid(struct task_struct *p, pid_t pgid)
{
        /*
         * Hand @p and @pgid to the task_setpgid hook; the hook result is
         * returned to the caller unchanged.
         */
        return call_int_hook(task_setpgid, p, pgid);
}

/**
 * security_task_getpgid() - Check if getting the pgid is allowed
 * @p: task
 *
 * Check permission before getting the process group identifier of the process
 * @p.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getpgid(struct task_struct *p)
{
        /*
         * Hand @p to the task_getpgid hook; the hook result is returned to
         * the caller unchanged.
         */
        return call_int_hook(task_getpgid, p);
}

/**
 * security_task_getsid() - Check if getting the session id is allowed
 * @p: task
 *
 * Check permission before getting the session identifier of the process @p.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getsid(struct task_struct *p)
{
        /* Hand back the result of the task_getsid hook call unchanged. */
        int rc = call_int_hook(task_getsid, p);

        return rc;
}

/**
 * security_current_getsecid_subj() - Get the current task's subjective secid
 * @secid: secid value
 *
 * Retrieve the subjective security identifier of the current task and return
 * it in @secid.  In case of failure, @secid will be set to zero.
 */
void security_current_getsecid_subj(u32 *secid)
{
        /* Pre-clear the output so it reads as zero if no hook writes to it. */
        *secid = 0;
        /* The hook receives the same pointer and may overwrite the zero. */
        call_void_hook(current_getsecid_subj, secid);
}
EXPORT_SYMBOL(security_current_getsecid_subj);

/**
 * security_task_getsecid_obj() - Get a task's objective secid
 * @p: target task
 * @secid: secid value
 *
 * Retrieve the objective security identifier of the task_struct in @p and
 * return it in @secid. In case of failure, @secid will be set to zero.
 */
void security_task_getsecid_obj(struct task_struct *p, u32 *secid)
{
        /* Pre-clear the output so it reads as zero if no hook writes to it. */
        *secid = 0;
        /* The hook receives @p and the same pointer and may overwrite the zero. */
        call_void_hook(task_getsecid_obj, p, secid);
}
EXPORT_SYMBOL(security_task_getsecid_obj);

/**
 * security_task_setnice() - Check if setting a task's nice value is allowed
 * @p: target task
 * @nice: nice value
 *
 * Check permission before setting the nice value of @p to @nice.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setnice(struct task_struct *p, int nice)
{
        /* Hand back the result of the task_setnice hook call unchanged. */
        int rc = call_int_hook(task_setnice, p, nice);

        return rc;
}

/**
 * security_task_setioprio() - Check if setting a task's ioprio is allowed
 * @p: target task
 * @ioprio: ioprio value
 *
 * Check permission before setting the ioprio value of @p to @ioprio.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setioprio(struct task_struct *p, int ioprio)
{
        /* Hand back the result of the task_setioprio hook call unchanged. */
        int rc = call_int_hook(task_setioprio, p, ioprio);

        return rc;
}

/**
 * security_task_getioprio() - Check if getting a task's ioprio is allowed
 * @p: task
 *
 * Check permission before getting the ioprio value of @p.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getioprio(struct task_struct *p)
{
        /* Hand back the result of the task_getioprio hook call unchanged. */
        int rc = call_int_hook(task_getioprio, p);

        return rc;
}

/**
 * security_task_prlimit() - Check if get/setting resources limits is allowed
 * @cred: current task credentials
 * @tcred: target task credentials
 * @flags: LSM_PRLIMIT_* flag bits indicating a get/set/both
 *
 * Check permission before getting and/or setting the resource limits of
 * another task.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_prlimit(const struct cred *cred, const struct cred *tcred,
                          unsigned int flags)
{
        /* Both credential sets and @flags are passed to the hook untouched. */
        int rc = call_int_hook(task_prlimit, cred, tcred, flags);

        return rc;
}

/**
 * security_task_setrlimit() - Check if setting a new rlimit value is allowed
 * @p: target task's group leader
 * @resource: resource whose limit is being set
 * @new_rlim: new resource limit
 *
 * Check permission before setting the resource limits of process @p for
 * @resource to @new_rlim.  The old resource limit values can be examined by
 * dereferencing (p->signal->rlim + resource).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setrlimit(struct task_struct *p, unsigned int resource,
                            struct rlimit *new_rlim)
{
        /* Hand back the result of the task_setrlimit hook call unchanged. */
        int rc = call_int_hook(task_setrlimit, p, resource, new_rlim);

        return rc;
}

/**
 * security_task_setscheduler() - Check if setting sched policy/param is allowed
 * @p: target task
 *
 * Check permission before setting scheduling policy and/or parameters of
 * process @p.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setscheduler(struct task_struct *p)
{
        /* Hand back the result of the task_setscheduler hook call unchanged. */
        int rc = call_int_hook(task_setscheduler, p);

        return rc;
}

/**
 * security_task_getscheduler() - Check if getting scheduling info is allowed
 * @p: target task
 *
 * Check permission before obtaining scheduling information for process @p.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getscheduler(struct task_struct *p)
{
        /* Hand back the result of the task_getscheduler hook call unchanged. */
        int rc = call_int_hook(task_getscheduler, p);

        return rc;
}

/**
 * security_task_movememory() - Check if moving memory is allowed
 * @p: task
 *
 * Check permission before moving memory owned by process @p.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_movememory(struct task_struct *p)
{
        /* Hand back the result of the task_movememory hook call unchanged. */
        int rc = call_int_hook(task_movememory, p);

        return rc;
}

/**
 * security_task_kill() - Check if sending a signal is allowed
 * @p: target process
 * @info: signal information
 * @sig: signal value
 * @cred: credentials of the signal sender, NULL if @current
 *
 * Check permission before sending signal @sig to @p.  @info can be NULL, the
 * constant 1, or a pointer to a kernel_siginfo structure.  If @info is 1 or
 * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming from
 * the kernel and should typically be permitted.  SIGIO signals are handled
 * separately by the send_sigiotask hook in file_security_ops.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_kill(struct task_struct *p, struct kernel_siginfo *info,
                       int sig, const struct cred *cred)
{
        /* All four arguments reach the task_kill hook exactly as received. */
        int rc = call_int_hook(task_kill, p, info, sig, cred);

        return rc;
}

/**
 * security_task_prctl() - Check if a prctl op is allowed
 * @option: operation
 * @arg2: argument
 * @arg3: argument
 * @arg4: argument
 * @arg5: argument
 *
 * Check permission before performing a process control operation on the
 * current process.
 *
 * Return: Return -ENOSYS if no-one wanted to handle this op, any other value
 *         to cause prctl() to return immediately with that value.
 */
int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                        unsigned long arg4, unsigned long arg5)
{
        struct security_hook_list *hp;
        int rc = LSM_RET_DEFAULT(task_prctl);

        hlist_for_each_entry(hp, &security_hook_heads.task_prctl, list) {
                int thisrc = hp->hook.task_prctl(option, arg2, arg3, arg4,
                                                 arg5);

                /* A hook answering with the default did not take a position. */
                if (thisrc == LSM_RET_DEFAULT(task_prctl))
                        continue;

                /*
                 * Remember the latest non-default answer.  A zero answer lets
                 * later hooks still be consulted; any other value ends the
                 * walk immediately.
                 */
                rc = thisrc;
                if (rc != 0)
                        break;
        }

        return rc;
}

/**
 * security_task_to_inode() - Set the security attributes of a task's inode
 * @p: task
 * @inode: inode
 *
 * Set the security attributes for an inode based on an associated task's
 * security attributes, e.g. for /proc/pid inodes.
 */
void security_task_to_inode(struct task_struct *p, struct inode *inode)
{
        /* Dispatch to the task_to_inode hook; nothing is reported back. */
        call_void_hook(task_to_inode, p, inode);
}

/**
 * security_create_user_ns() - Check if creating a new userns is allowed
 * @cred: prepared creds
 *
 * Check permission prior to creating a new user namespace.
 *
 * Return: Returns 0 if successful, otherwise < 0 error code.
 */
int security_create_user_ns(const struct cred *cred)
{
        /* Note the hook is named userns_create, unlike this wrapper. */
        int rc = call_int_hook(userns_create, cred);

        return rc;
}

/**
 * security_ipc_permission() - Check if sysv ipc access is allowed
 * @ipcp: ipc permission structure
 * @flag: requested permissions
 *
 * Check permissions for access to IPC.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
        /* Hand back the result of the ipc_permission hook call unchanged. */
        int rc = call_int_hook(ipc_permission, ipcp, flag);

        return rc;
}

/**
 * security_ipc_getsecid() - Get the sysv ipc object's secid
 * @ipcp: ipc permission structure
 * @secid: secid pointer
 *
 * Get the secid associated with the ipc object.  In case of failure, @secid
 * will be set to zero.
 */
void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
{
        /* Pre-clear the output so it reads as zero if no hook writes to it. */
        *secid = 0;
        /* The hook receives @ipcp and the same pointer and may overwrite the zero. */
        call_void_hook(ipc_getsecid, ipcp, secid);
}

/**
 * security_msg_msg_alloc() - Allocate a sysv ipc message LSM blob
 * @msg: message structure
 *
 * Allocate and attach a security structure to the msg->security field.  The
 * security field is initialized to NULL when the structure is first created.
 *
 * Return: Return 0 if operation was successful and permission is granted.
 */
int security_msg_msg_alloc(struct msg_msg *msg)
{
        int err;

        /* If lsm_msg_msg_alloc() fails, report it without any cleanup. */
        err = lsm_msg_msg_alloc(msg);
        if (unlikely(err != 0))
                return err;

        /* A failing hook is undone through security_msg_msg_free(). */
        err = call_int_hook(msg_msg_alloc_security, msg);
        if (unlikely(err != 0))
                security_msg_msg_free(msg);

        return err;
}

/**
 * security_msg_msg_free() - Free a sysv ipc message LSM blob
 * @msg: message structure
 *
 * Deallocate the security structure for this message.
 */
void security_msg_msg_free(struct msg_msg *msg)
{
        /* Hooks run first, while msg->security has not yet been freed. */
        call_void_hook(msg_msg_free_security, msg);
        kfree(msg->security);
        /* Clear the pointer so it no longer refers to the freed memory. */
        msg->security = NULL;
}

/**
 * security_msg_queue_alloc() - Allocate a sysv ipc msg queue LSM blob
 * @msq: sysv ipc permission structure
 *
 * Allocate and attach a security structure to @msq. The security field is
 * initialized to NULL when the structure is first created.
 *
 * Return: Returns 0 if operation was successful and permission is granted.
 */
int security_msg_queue_alloc(struct kern_ipc_perm *msq)
{
        int err;

        /* If lsm_ipc_alloc() fails, report it without any cleanup. */
        err = lsm_ipc_alloc(msq);
        if (unlikely(err != 0))
                return err;

        /* A failing hook is undone through security_msg_queue_free(). */
        err = call_int_hook(msg_queue_alloc_security, msq);
        if (unlikely(err != 0))
                security_msg_queue_free(msq);

        return err;
}

/**
 * security_msg_queue_free() - Free a sysv ipc msg queue LSM blob
 * @msq: sysv ipc permission structure
 *
 * Deallocate security field @msq->security for the message queue.
 */
void security_msg_queue_free(struct kern_ipc_perm *msq)
{
        /* Hooks run first, while msq->security has not yet been freed. */
        call_void_hook(msg_queue_free_security, msq);
        kfree(msq->security);
        /* Clear the pointer so it no longer refers to the freed memory. */
        msq->security = NULL;
}

/**
 * security_msg_queue_associate() - Check if a msg queue operation is allowed
 * @msq: sysv ipc permission structure
 * @msqflg: operation flags
 *
 * Check permission when a message queue is requested through the msgget system
 * call. This hook is only called when returning the message queue identifier
 * for an existing message queue, not when a new message queue is created.
 *
 * Return: Return 0 if permission is granted.
 */
int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
{
        /* Hand back the result of the msg_queue_associate hook call unchanged. */
        int rc = call_int_hook(msg_queue_associate, msq, msqflg);

        return rc;
}

/**
 * security_msg_queue_msgctl() - Check if a msg queue operation is allowed
 * @msq: sysv ipc permission structure
 * @cmd: operation
 *
 * Check permission when a message control operation specified by @cmd is to be
 * performed on the message queue with permissions.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd)
{
        /* Hand back the result of the msg_queue_msgctl hook call unchanged. */
        int rc = call_int_hook(msg_queue_msgctl, msq, cmd);

        return rc;
}

/**
 * security_msg_queue_msgsnd() - Check if sending a sysv ipc message is allowed
 * @msq: sysv ipc permission structure
 * @msg: message
 * @msqflg: operation flags
 *
 * Check permission before a message, @msg, is enqueued on the message queue
 * with permissions specified in @msq.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_msg_queue_msgsnd(struct kern_ipc_perm *msq,
                              struct msg_msg *msg, int msqflg)
{
        /* Hand back the result of the msg_queue_msgsnd hook call unchanged. */
        int rc = call_int_hook(msg_queue_msgsnd, msq, msg, msqflg);

        return rc;
}

/**
 * security_msg_queue_msgrcv() - Check if receiving a sysv ipc msg is allowed
 * @msq: sysv ipc permission structure
 * @msg: message
 * @target: target task
 * @type: type of message requested
 * @mode: operation flags
 *
 * Check permission before a message, @msg, is removed from the message queue.
 * The @target task structure contains a pointer to the process that will be
 * receiving the message (not equal to the current process when inline receives
 * are being performed).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
                              struct task_struct *target, long type, int mode)
{
        /* All five arguments reach the hook exactly as received. */
        int rc = call_int_hook(msg_queue_msgrcv, msq, msg, target, type, mode);

        return rc;
}

/**
 * security_shm_alloc() - Allocate a sysv shm LSM blob
 * @shp: sysv ipc permission structure
 *
 * Allocate and attach a security structure to the @shp security field.  The
 * security field is initialized to NULL when the structure is first created.
 *
 * Return: Returns 0 if operation was successful and permission is granted.
 */
int security_shm_alloc(struct kern_ipc_perm *shp)
{
        int err;

        /* If lsm_ipc_alloc() fails, report it without any cleanup. */
        err = lsm_ipc_alloc(shp);
        if (unlikely(err != 0))
                return err;

        /* A failing hook is undone through security_shm_free(). */
        err = call_int_hook(shm_alloc_security, shp);
        if (unlikely(err != 0))
                security_shm_free(shp);

        return err;
}

/**
 * security_shm_free() - Free a sysv shm LSM blob
 * @shp: sysv ipc permission structure
 *
 * Deallocate the security structure @shp->security for the memory segment.
 */
void security_shm_free(struct kern_ipc_perm *shp)
{
        /* Hooks run first, while shp->security has not yet been freed. */
        call_void_hook(shm_free_security, shp);
        kfree(shp->security);
        /* Clear the pointer so it no longer refers to the freed memory. */
        shp->security = NULL;
}

/**
 * security_shm_associate() - Check if a sysv shm operation is allowed
 * @shp: sysv ipc permission structure
 * @shmflg: operation flags
 *
 * Check permission when a shared memory region is requested through the shmget
 * system call. This hook is only called when returning the shared memory
 * region identifier for an existing region, not when a new shared memory
 * region is created.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_shm_associate(struct kern_ipc_perm *shp, int shmflg)
{
        /* Hand back the result of the shm_associate hook call unchanged. */
        int rc = call_int_hook(shm_associate, shp, shmflg);

        return rc;
}

/**
 * security_shm_shmctl() - Check if a sysv shm operation is allowed
 * @shp: sysv ipc permission structure
 * @cmd: operation
 *
 * Check permission when a shared memory control operation specified by @cmd is
 * to be performed on the shared memory region with permissions in @shp.
 *
 * Return: Return 0 if permission is granted.
 */
int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd)
{
        /* Hand back the result of the shm_shmctl hook call unchanged. */
        int rc = call_int_hook(shm_shmctl, shp, cmd);

        return rc;
}

/**
 * security_shm_shmat() - Check if a sysv shm attach operation is allowed
 * @shp: sysv ipc permission structure
 * @shmaddr: address of memory region to attach
 * @shmflg: operation flags
 *
 * Check permissions prior to allowing the shmat system call to attach the
 * shared memory segment with permissions @shp to the data segment of the
 * calling process. The attaching address is specified by @shmaddr.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_shm_shmat(struct kern_ipc_perm *shp,
                       char __user *shmaddr, int shmflg)
{
        /* @shmaddr is only forwarded to the hook; it is not dereferenced here. */
        int rc = call_int_hook(shm_shmat, shp, shmaddr, shmflg);

        return rc;
}

/**
 * security_sem_alloc() - Allocate a sysv semaphore LSM blob
 * @sma: sysv ipc permission structure
 *
 * Allocate and attach a security structure to the @sma security field. The
 * security field is initialized to NULL when the structure is first created.
 *
 * Return: Returns 0 if operation was successful and permission is granted.
 */
int security_sem_alloc(struct kern_ipc_perm *sma)
{
        int err;

        /* If lsm_ipc_alloc() fails, report it without any cleanup. */
        err = lsm_ipc_alloc(sma);
        if (unlikely(err != 0))
                return err;

        /* A failing hook is undone through security_sem_free(). */
        err = call_int_hook(sem_alloc_security, sma);
        if (unlikely(err != 0))
                security_sem_free(sma);

        return err;
}

/**
 * security_sem_free() - Free a sysv semaphore LSM blob
 * @sma: sysv ipc permission structure
 *
 * Deallocate security structure @sma->security for the semaphore.
 */
void security_sem_free(struct kern_ipc_perm *sma)
{
        /* Hooks run first, while sma->security has not yet been freed. */
        call_void_hook(sem_free_security, sma);
        kfree(sma->security);
        /* Clear the pointer so it no longer refers to the freed memory. */
        sma->security = NULL;
}

/**
 * security_sem_associate() - Check if a sysv semaphore operation is allowed
 * @sma: sysv ipc permission structure
 * @semflg: operation flags
 *
 * Check permission when a semaphore is requested through the semget system
 * call. This hook is only called when returning the semaphore identifier for
 * an existing semaphore, not when a new one must be created.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sem_associate(struct kern_ipc_perm *sma, int semflg)
{
        /* Hand back the result of the sem_associate hook call unchanged. */
        int rc = call_int_hook(sem_associate, sma, semflg);

        return rc;
}

/**
 * security_sem_semctl() - Check if a sysv semaphore operation is allowed
 * @sma: sysv ipc permission structure
 * @cmd: operation
 *
 * Check permission when a semaphore operation specified by @cmd is to be
 * performed on the semaphore.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sem_semctl(struct kern_ipc_perm *sma, int cmd)
{
        /* Hand back the result of the sem_semctl hook call unchanged. */
        int rc = call_int_hook(sem_semctl, sma, cmd);

        return rc;
}

/**
 * security_sem_semop() - Check if a sysv semaphore operation is allowed
 * @sma: sysv ipc permission structure
 * @sops: operations to perform
 * @nsops: number of operations
 * @alter: flag indicating changes will be made
 *
 * Check permissions before performing operations on members of the semaphore
 * set. If the @alter flag is nonzero, the semaphore set may be modified.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
                       unsigned nsops, int alter)
{
        /* Hand back the result of the sem_semop hook call unchanged. */
        int rc = call_int_hook(sem_semop, sma, sops, nsops, alter);

        return rc;
}

/**
 * security_d_instantiate() - Populate an inode's LSM state based on a dentry
 * @dentry: dentry
 * @inode: inode
 *
 * Fill in @inode security information for a @dentry if allowed.
 */
void security_d_instantiate(struct dentry *dentry, struct inode *inode)
{
        if (unlikely(inode && IS_PRIVATE(inode)))
                return;
        call_void_hook(d_instantiate, dentry, inode);
}
EXPORT_SYMBOL(security_d_instantiate);

/*
 * Please keep this in sync with its counterpart in security/lsm_syscalls.c
 */

/**
 * security_getselfattr - Read an LSM attribute of the current process.
 * @attr: which attribute to return
 * @uctx: the user-space destination for the information, or NULL
 * @size: pointer to the size of space available to receive the data
 * @flags: special handling options. LSM_FLAG_SINGLE indicates that only
 * attributes associated with the LSM identified in the passed @uctx be
 * reported.
 *
 * A NULL value for @uctx can be used to get both the number of attributes
 * and the size of the data.
 *
 * Returns the number of attributes found on success, negative value
 * on error. @size is reset to the total size of the data.
 * If @size is insufficient to contain the data -E2BIG is returned.
 */
int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx,
                         u32 __user *size, u32 flags)
{
        struct security_hook_list *hp;
        /* Only filled from user space in the LSM_FLAG_SINGLE case below. */
        struct lsm_ctx lctx = { .id = LSM_ID_UNDEF, };
        /* Byte-granular view of the user buffer; NULL when @uctx is NULL. */
        u8 __user *base = (u8 __user *)uctx;
        /* In: space offered to one hook.  Out: the size that hook reported. */
        u32 entrysize;
        /* Running sum of the entry sizes reported by the hooks. */
        u32 total = 0;
        /* Space still available, seeded from the value read through @size. */
        u32 left;
        /* Set once any hook returned -E2BIG. */
        bool toobig = false;
        /* True when LSM_FLAG_SINGLE was requested. */
        bool single = false;
        /* Sum of the non-negative hook return values. */
        int count = 0;
        int rc;

        if (attr == LSM_ATTR_UNDEF)
                return -EINVAL;
        if (size == NULL)
                return -EINVAL;
        if (get_user(left, size))
                return -EFAULT;

        if (flags) {
                /*
                 * Only flag supported is LSM_FLAG_SINGLE, and it needs a
                 * user buffer to read the requested LSM ID from.
                 */
                if (flags != LSM_FLAG_SINGLE || !uctx)
                        return -EINVAL;
                if (copy_from_user(&lctx, uctx, sizeof(lctx)))
                        return -EFAULT;
                /*
                 * If the LSM ID isn't specified it is an error.
                 */
                if (lctx.id == LSM_ID_UNDEF)
                        return -EINVAL;
                single = true;
        }

        /*
         * In the usual case gather all the data from the LSMs.
         * In the single case only get the data from the LSM specified.
         */
        hlist_for_each_entry(hp, &security_hook_heads.getselfattr, list) {
                if (single && lctx.id != hp->lsmid->id)
                        continue;
                entrysize = left;
                /* Each entry goes right after the bytes accounted for so far. */
                if (base)
                        uctx = (struct lsm_ctx __user *)(base + total);
                rc = hp->hook.getselfattr(attr, uctx, &entrysize, flags);
                /* A hook that does not support @attr is skipped, not an error. */
                if (rc == -EOPNOTSUPP) {
                        rc = 0;
                        continue;
                }
                if (rc == -E2BIG) {
                        /*
                         * Out of room: remember it and offer no space to later
                         * hooks, but keep going so total still accumulates.
                         * NOTE(review): presumably the hook leaves the size it
                         * needs in entrysize here; confirm against the hooks.
                         */
                        rc = 0;
                        left = 0;
                        toobig = true;
                } else if (rc < 0)
                        /* Any other error aborts without updating *size. */
                        return rc;
                else
                        left -= entrysize;

                total += entrysize;
                count += rc;
                if (single)
                        break;
        }
        /* Report the accumulated size even when the result is -E2BIG. */
        if (put_user(total, size))
                return -EFAULT;
        if (toobig)
                return -E2BIG;
        /* No hook contributed anything: fall back to the hook's default. */
        if (count == 0)
                return LSM_RET_DEFAULT(getselfattr);
        return count;
}

/*
 * Please keep this in sync with its counterpart in security/lsm_syscalls.c
 */

/**
 * security_setselfattr - Set an LSM attribute on the current process.
 * @attr: which attribute to set
 * @uctx: the user-space source for the information
 * @size: the size of the data
 * @flags: reserved for future use, must be 0
 *
 * Set an LSM attribute for the current process. The LSM, attribute
 * and new value are included in @uctx.
 *
 * Returns 0 on success, -EINVAL if the input is inconsistent, -EFAULT
 * if the user buffer is inaccessible, -E2BIG if size is too big, or an
 * LSM specific failure.
 */
int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx,
                         u32 size, u32 flags)
{
        struct security_hook_list *hp;
        struct lsm_ctx *lctx;
        u64 required_len;
        int rc = LSM_RET_DEFAULT(setselfattr);

        /* No flags are accepted, and the buffer must hold at least a header. */
        if (flags || size < sizeof(*lctx))
                return -EINVAL;
        if (size > PAGE_SIZE)
                return -E2BIG;

        /* Work on a kernel copy of the user buffer from here on. */
        lctx = memdup_user(uctx, size);
        if (IS_ERR(lctx))
                return PTR_ERR(lctx);

        /*
         * Cross-check the lengths inside the copy: len must fit in @size and
         * must cover the header plus ctx_len, with the sum not overflowing.
         */
        if (size < lctx->len)
                goto bad_ctx;
        if (check_add_overflow(sizeof(*lctx), lctx->ctx_len, &required_len))
                goto bad_ctx;
        if (lctx->len < required_len)
                goto bad_ctx;

        /* Only the first hook whose LSM ID matches lctx->id is called. */
        hlist_for_each_entry(hp, &security_hook_heads.setselfattr, list) {
                if (hp->lsmid->id != lctx->id)
                        continue;
                rc = hp->hook.setselfattr(attr, lctx, size, flags);
                break;
        }
        goto free_out;

bad_ctx:
        rc = -EINVAL;
free_out:
        kfree(lctx);
        return rc;
}

/**
 * security_getprocattr() - Read an attribute for a task
 * @p: the task
 * @lsmid: LSM identification
 * @name: attribute name
 * @value: attribute value
 *
 * Read attribute @name for task @p and store it into @value if allowed.
 *
 * Return: Returns the length of @value on success, a negative value otherwise.
 */
int security_getprocattr(struct task_struct *p, int lsmid, const char *name,
                         char **value)
{
        struct security_hook_list *hp;

        hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) {
                /*
                 * An @lsmid of zero accepts any LSM; otherwise the ID must
                 * match.  The first acceptable hook decides the result.
                 */
                if (lsmid == 0 || lsmid == hp->lsmid->id)
                        return hp->hook.getprocattr(p, name, value);
        }

        /* No hook was acceptable. */
        return LSM_RET_DEFAULT(getprocattr);
}

/**
 * security_setprocattr() - Set an attribute for a task
 * @lsmid: LSM identification
 * @name: attribute name
 * @value: attribute value
 * @size: attribute value size
 *
 * Write (set) the current task's attribute @name to @value, size @size if
 * allowed.
 *
 * Return: Returns bytes written on success, a negative value otherwise.
 */
int security_setprocattr(int lsmid, const char *name, void *value, size_t size)
{
        struct security_hook_list *hp;

        hlist_for_each_entry(hp, &security_hook_heads.setprocattr, list) {
                /*
                 * An @lsmid of zero accepts any LSM; otherwise the ID must
                 * match.  The first acceptable hook decides the result.
                 */
                if (lsmid == 0 || lsmid == hp->lsmid->id)
                        return hp->hook.setprocattr(name, value, size);
        }

        /* No hook was acceptable. */
        return LSM_RET_DEFAULT(setprocattr);
}

/**
 * security_netlink_send() - Save info and check if netlink sending is allowed
 * @sk: sending socket
 * @skb: netlink message
 *
 * Save security information for a netlink message so that permission checking
 * can be performed when the message is processed.  The security information
 * can be saved using the eff_cap field of the netlink_skb_parms structure.
 * Also may be used to provide fine grained control over message transmission.
 *
 * Return: Returns 0 if the information was successfully saved and message is
 *         allowed to be transmitted.
 */
int security_netlink_send(struct sock *sk, struct sk_buff *skb)
{
        /* Hand back the result of the netlink_send hook call unchanged. */
        int rc = call_int_hook(netlink_send, sk, skb);

        return rc;
}

/**
 * security_ismaclabel() - Check if the named attribute is a MAC label
 * @name: full extended attribute name
 *
 * Check if the extended attribute specified by @name represents a MAC label.
 *
 * Return: Returns 1 if name is a MAC attribute otherwise returns 0.
 */
int security_ismaclabel(const char *name)
{
        /* Hand back the result of the ismaclabel hook call unchanged. */
        int rc = call_int_hook(ismaclabel, name);

        return rc;
}
EXPORT_SYMBOL(security_ismaclabel);

/**
 * security_secid_to_secctx() - Convert a secid to a secctx
 * @secid: secid
 * @secdata: secctx
 * @seclen: secctx length
 *
 * Convert secid to security context.  If @secdata is NULL the length of the
 * result will be returned in @seclen, but no @secdata will be returned.  This
 * does mean that the length could change between calls to check the length and
 * the next call which actually allocates and returns the @secdata.
 *
 * Return: Return 0 on success, error on failure.
 */
int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
{
        /* Hand back the result of the secid_to_secctx hook call unchanged. */
        int rc = call_int_hook(secid_to_secctx, secid, secdata, seclen);

        return rc;
}
EXPORT_SYMBOL(security_secid_to_secctx);

/**
 * security_secctx_to_secid() - Convert a secctx to a secid
 * @secdata: secctx
 * @seclen: length of secctx
 * @secid: secid
 *
 * Convert security context to secid.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
        int rc;

        /* Zero the output up front, before the hook gets to fill it in. */
        *secid = 0;
        rc = call_int_hook(secctx_to_secid, secdata, seclen, secid);

        return rc;
}
EXPORT_SYMBOL(security_secctx_to_secid);

/**
 * security_release_secctx() - Free a secctx buffer
 * @secdata: secctx
 * @seclen: length of secctx
 *
 * Release the security context.
 */
void security_release_secctx(char *secdata, u32 seclen)
{
        /* Dispatch to the release_secctx hook; nothing is reported back. */
        call_void_hook(release_secctx, secdata, seclen);
}
EXPORT_SYMBOL(security_release_secctx);

/**
 * security_inode_invalidate_secctx() - Invalidate an inode's security label
 * @inode: inode
 *
 * Notify the security module that it must revalidate the security context of
 * an inode.
 */
void security_inode_invalidate_secctx(struct inode *inode)
{
        /* Dispatch to the inode_invalidate_secctx hook; nothing is reported back. */
        call_void_hook(inode_invalidate_secctx, inode);
}
EXPORT_SYMBOL(security_inode_invalidate_secctx);

/**
 * security_inode_notifysecctx() - Notify the LSM of an inode's security label
 * @inode: inode
 * @ctx: secctx
 * @ctxlen: length of secctx
 *
 * Notify the security module of what the security context of an inode should
 * be.  Initializes the incore security context managed by the security module
 * for this inode.  Example usage: NFS client invokes this hook to initialize
 * the security context in its incore inode to the value provided by the server
 * for the file when the server returned the file's attributes to the client.
 * Must be called with inode->i_mutex locked.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
        /* Hand back the result of the inode_notifysecctx hook call unchanged. */
        int rc = call_int_hook(inode_notifysecctx, inode, ctx, ctxlen);

        return rc;
}
EXPORT_SYMBOL(security_inode_notifysecctx);

/**
 * security_inode_setsecctx() - Change the security label of an inode
 * @dentry: dentry of the inode
 * @ctx: secctx
 * @ctxlen: length of secctx
 *
 * Change the security context of an inode.  Updates the incore security
 * context managed by the security module and invokes the fs code as needed
 * (via __vfs_setxattr_noperm) to update any backing xattrs that represent the
 * context.  Example usage: NFS server invokes this hook to change the security
 * context in its incore inode and on the backing filesystem to a value
 * provided by the client on a SETATTR operation.  Must be called with
 * inode->i_mutex locked.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
        /* Hand back the result of the inode_setsecctx hook call unchanged. */
        int rc = call_int_hook(inode_setsecctx, dentry, ctx, ctxlen);

        return rc;
}
EXPORT_SYMBOL(security_inode_setsecctx);

/**
 * security_inode_getsecctx() - Get the security label of an inode
 * @inode: inode
 * @ctx: secctx
 * @ctxlen: length of secctx
 *
 * On success, returns 0 and fills out @ctx and @ctxlen with the security
 * context for the given @inode.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
{
        /* Hand back the result of the inode_getsecctx hook call unchanged. */
        int rc = call_int_hook(inode_getsecctx, inode, ctx, ctxlen);

        return rc;
}
EXPORT_SYMBOL(security_inode_getsecctx);

#ifdef CONFIG_WATCH_QUEUE
/**
 * security_post_notification() - Check if a watch notification can be posted
 * @w_cred: credentials of the task that set the watch
 * @cred: credentials of the task which triggered the watch
 * @n: the notification
 *
 * Check to see if a watch notification can be posted to a particular queue.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_post_notification(const struct cred *w_cred,
                               const struct cred *cred,
                               struct watch_notification *n)
{
        /* Hand back the result of the post_notification hook call unchanged. */
        int rc = call_int_hook(post_notification, w_cred, cred, n);

        return rc;
}
#endif /* CONFIG_WATCH_QUEUE */

#ifdef CONFIG_KEY_NOTIFICATIONS
/**
 * security_watch_key() - Check if a task is allowed to watch for key events
 * @key: the key to watch
 *
 * Check to see if a process is allowed to watch for event notifications from
 * a key or keyring.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_watch_key(struct key *key)
{
        /* Hand back the result of the watch_key hook call unchanged. */
        int rc = call_int_hook(watch_key, key);

        return rc;
}
#endif /* CONFIG_KEY_NOTIFICATIONS */

#ifdef CONFIG_SECURITY_NETWORK
/**
 * security_unix_stream_connect() - Check if an AF_UNIX stream is allowed
 * @sock: originating sock
 * @other: peer sock
 * @newsk: new sock
 *
 * Check permissions before establishing a Unix domain stream connection
 * between @sock and @other.
 *
 * The @unix_stream_connect and @unix_may_send hooks were necessary because
 * Linux provides an alternative to the conventional file name space for Unix
 * domain sockets.  Whereas binding and connecting to sockets in the file name
 * space is mediated by the typical file permissions (and caught by the mknod
 * and permission hooks in inode_security_ops), binding and connecting to
 * sockets in the abstract name space is completely unmediated.  Sufficient
 * control of Unix domain sockets in the abstract name space isn't possible
 * using only the socket layer hooks, since we need to know the actual target
 * socket, which is not looked up until we are inside the af_unix code.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_unix_stream_connect(struct sock *sock, struct sock *other,
                                 struct sock *newsk)
{
        /* Hand back the result of the unix_stream_connect hook call unchanged. */
        int rc = call_int_hook(unix_stream_connect, sock, other, newsk);

        return rc;
}
EXPORT_SYMBOL(security_unix_stream_connect);

/**
 * security_unix_may_send() - Check if AF_UNIX socket can send datagrams
 * @sock: originating sock
 * @other: peer sock
 *
 * Permission check performed before @sock connects to, or sends datagrams
 * to, @other.
 *
 * Together with @unix_stream_connect, this hook exists because Linux offers
 * an abstract name space for Unix domain sockets in addition to the usual
 * file name space.  Bind and connect operations on file name space sockets
 * are already mediated by ordinary file permissions (and seen by the mknod
 * and permission hooks in inode_security_ops), whereas the same operations
 * in the abstract name space are not mediated at all.  The socket layer
 * hooks alone cannot provide sufficient control there, because the actual
 * target socket is only looked up once execution is inside the af_unix code.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_unix_may_send(struct socket *sock, struct socket *other)
{
        int rc = call_int_hook(unix_may_send, sock, other);

        return rc;
}
EXPORT_SYMBOL(security_unix_may_send);

/**
 * security_socket_create() - Check if creating a new socket is allowed
 * @family: protocol family
 * @type: communications type
 * @protocol: requested protocol
 * @kern: set to 1 if a kernel socket is requested
 *
 * Permission check that runs before a new socket is created.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_create(int family, int type, int protocol, int kern)
{
        int rc = call_int_hook(socket_create, family, type, protocol, kern);

        return rc;
}

/**
 * security_socket_post_create() - Initialize a newly created socket
 * @sock: socket
 * @family: protocol family
 * @type: communications type
 * @protocol: requested protocol
 * @kern: set to 1 if a kernel socket is requested
 *
 * Gives a module the chance to allocate or update a per-socket security
 * structure.  The socket structure has no security field of its own; the
 * socket security information lives in the associated inode instead.
 * Typically the inode alloc_security hook allocates the security information
 * and attaches it to SOCK_INODE(sock)->i_security, and this hook may then
 * update that field with information that was not yet available when the
 * inode was allocated.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_post_create(struct socket *sock, int family,
                                int type, int protocol, int kern)
{
        int rc;

        rc = call_int_hook(socket_post_create, sock, family, type, protocol,
                           kern);
        return rc;
}

/**
 * security_socket_socketpair() - Check if creating a socketpair is allowed
 * @socka: first socket
 * @sockb: second socket
 *
 * Permission check that runs before a fresh pair of sockets is created.
 *
 * Return: Returns 0 if permission is granted and the connection was
 *         established.
 */
int security_socket_socketpair(struct socket *socka, struct socket *sockb)
{
        int rc = call_int_hook(socket_socketpair, socka, sockb);

        return rc;
}
EXPORT_SYMBOL(security_socket_socketpair);

/**
 * security_socket_bind() - Check if a socket bind operation is allowed
 * @sock: socket
 * @address: requested bind address
 * @addrlen: length of address
 *
 * Permission check that runs before the socket protocol layer performs the
 * bind operation, i.e. before @sock is bound to the address given by
 * @address.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_bind(struct socket *sock,
                         struct sockaddr *address, int addrlen)
{
        int rc = call_int_hook(socket_bind, sock, address, addrlen);

        return rc;
}

/**
 * security_socket_connect() - Check if a socket connect operation is allowed
 * @sock: socket
 * @address: address of remote connection point
 * @addrlen: length of address
 *
 * Permission check that runs before the socket protocol layer connect
 * operation tries to connect @sock to the remote address @address.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_connect(struct socket *sock,
                            struct sockaddr *address, int addrlen)
{
        int rc = call_int_hook(socket_connect, sock, address, addrlen);

        return rc;
}

/**
 * security_socket_listen() - Check if a socket is allowed to listen
 * @sock: socket
 * @backlog: connection queue size
 *
 * Permission check that runs before the socket protocol layer listen
 * operation.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_listen(struct socket *sock, int backlog)
{
        int rc = call_int_hook(socket_listen, sock, backlog);

        return rc;
}

/**
 * security_socket_accept() - Check if a socket is allowed to accept connections
 * @sock: listening socket
 * @newsock: newly created connection socket
 *
 * Permission check that runs before a new connection is accepted.  At this
 * point @newsock already exists and has had some information copied into it,
 * but the accept operation itself has not been carried out yet.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_accept(struct socket *sock, struct socket *newsock)
{
        int rc = call_int_hook(socket_accept, sock, newsock);

        return rc;
}

/**
 * security_socket_sendmsg() - Check if sending a message is allowed
 * @sock: sending socket
 * @msg: message to send
 * @size: size of message
 *
 * Permission check that runs before a message is transmitted to another
 * socket.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
        int rc = call_int_hook(socket_sendmsg, sock, msg, size);

        return rc;
}

/**
 * security_socket_recvmsg() - Check if receiving a message is allowed
 * @sock: receiving socket
 * @msg: message to receive
 * @size: size of message
 * @flags: operational flags
 *
 * Permission check that runs before a message is received from a socket.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_recvmsg(struct socket *sock, struct msghdr *msg,
                            int size, int flags)
{
        int rc = call_int_hook(socket_recvmsg, sock, msg, size, flags);

        return rc;
}

/**
 * security_socket_getsockname() - Check if reading the socket addr is allowed
 * @sock: socket
 *
 * Permission check that runs before the local address (name) of the socket
 * object is read.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getsockname(struct socket *sock)
{
        int rc = call_int_hook(socket_getsockname, sock);

        return rc;
}

/**
 * security_socket_getpeername() - Check if reading the peer's addr is allowed
 * @sock: socket
 *
 * Permission check that runs before the remote address (name) of a socket
 * object is read.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getpeername(struct socket *sock)
{
        int rc = call_int_hook(socket_getpeername, sock);

        return rc;
}

/**
 * security_socket_getsockopt() - Check if reading a socket option is allowed
 * @sock: socket
 * @level: option's protocol level
 * @optname: option name
 *
 * Permission check that runs before the options associated with socket
 * @sock are retrieved.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getsockopt(struct socket *sock, int level, int optname)
{
        int rc = call_int_hook(socket_getsockopt, sock, level, optname);

        return rc;
}

/**
 * security_socket_setsockopt() - Check if setting a socket option is allowed
 * @sock: socket
 * @level: option's protocol level
 * @optname: option name
 *
 * Permission check that runs before the options associated with socket
 * @sock are set.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_setsockopt(struct socket *sock, int level, int optname)
{
        int rc = call_int_hook(socket_setsockopt, sock, level, optname);

        return rc;
}

/**
 * security_socket_shutdown() - Checks if shutting down the socket is allowed
 * @sock: socket
 * @how: flag indicating how sends and receives are handled
 *
 * Permission check that runs before all or part of a connection on the
 * socket @sock is shut down.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_shutdown(struct socket *sock, int how)
{
        int rc = call_int_hook(socket_shutdown, sock, how);

        return rc;
}

/**
 * security_sock_rcv_skb() - Check if an incoming network packet is allowed
 * @sk: destination sock
 * @skb: incoming packet
 *
 * Permission check on incoming network packets.  Unlike Netfilter's IP input
 * hooks, this is the first point at which the incoming sk_buff @skb has been
 * associated with a particular socket, @sk.  Some callers hold spinlocks, so
 * the hook must not sleep.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        /* Note: the hook is named socket_sock_rcv_skb, not sock_rcv_skb. */
        int rc = call_int_hook(socket_sock_rcv_skb, sk, skb);

        return rc;
}
EXPORT_SYMBOL(security_sock_rcv_skb);

/**
 * security_socket_getpeersec_stream() - Get the remote peer label
 * @sock: socket
 * @optval: destination buffer
 * @optlen: size of peer label copied into the buffer
 * @len: maximum size of the destination buffer
 *
 * Lets the security module hand peer socket security state for unix or
 * connected tcp sockets to userspace through getsockopt SO_GETPEERSEC.  For
 * tcp sockets the result can be meaningful if the socket is associated with
 * an ipsec SA.
 *
 * Return: Returns 0 if all is well, otherwise, typical getsockopt return
 *         values.
 */
int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
                                      sockptr_t optlen, unsigned int len)
{
        int rc;

        rc = call_int_hook(socket_getpeersec_stream, sock, optval, optlen, len);
        return rc;
}

/**
 * security_socket_getpeersec_dgram() - Get the remote peer label
 * @sock: socket
 * @skb: datagram packet
 * @secid: remote peer label secid
 *
 * Lets the security module hand peer socket security state for udp sockets
 * to userspace on a per-packet basis through getsockopt SO_GETPEERSEC.  The
 * application must first have indicated the IP_PASSSEC option via
 * getsockopt; the security state returned by this hook for a packet can then
 * be retrieved via the SCM_SECURITY ancillary message type.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_socket_getpeersec_dgram(struct socket *sock,
                                     struct sk_buff *skb, u32 *secid)
{
        int rc = call_int_hook(socket_getpeersec_dgram, sock, skb, secid);

        return rc;
}
EXPORT_SYMBOL(security_socket_getpeersec_dgram);

/**
 * security_sk_alloc() - Allocate and initialize a sock's LSM blob
 * @sk: sock
 * @family: protocol family
 * @priority: gfp flags
 *
 * Allocates a security structure and attaches it to the sk->sk_security
 * field, which is used to copy security attributes between local stream
 * sockets.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
{
        int rc = call_int_hook(sk_alloc_security, sk, family, priority);

        return rc;
}

/**
 * security_sk_free() - Free the sock's LSM blob
 * @sk: sock
 *
 * Deallocate security structure.  The work is delegated to the
 * sk_free_security hook; nothing is returned to the caller.
 */
void security_sk_free(struct sock *sk)
{
        call_void_hook(sk_free_security, sk);
}

/**
 * security_sk_clone() - Clone a sock's LSM state
 * @sk: original sock
 * @newsk: target sock
 *
 * Clone/copy security structure.  Both socks are handed to the
 * sk_clone_security hook; @sk is const-qualified and is the source of the
 * copy.
 */
void security_sk_clone(const struct sock *sk, struct sock *newsk)
{
        call_void_hook(sk_clone_security, sk, newsk);
}
EXPORT_SYMBOL(security_sk_clone);

/**
 * security_sk_classify_flow() - Set a flow's secid based on socket
 * @sk: original socket
 * @flic: target flow
 *
 * Set the target flow's secid to socket's secid.
 */
void security_sk_classify_flow(const struct sock *sk, struct flowi_common *flic)
{
        /* The sk_getsecid hook writes its result through &flic->flowic_secid. */
        call_void_hook(sk_getsecid, sk, &flic->flowic_secid);
}
EXPORT_SYMBOL(security_sk_classify_flow);

/**
 * security_req_classify_flow() - Set a flow's secid based on request_sock
 * @req: request_sock
 * @flic: target flow
 *
 * Sets @flic's secid to @req's secid.  Both arguments are forwarded unchanged
 * to the req_classify_flow hook.
 */
void security_req_classify_flow(const struct request_sock *req,
                                struct flowi_common *flic)
{
        call_void_hook(req_classify_flow, req, flic);
}
EXPORT_SYMBOL(security_req_classify_flow);

/**
 * security_sock_graft() - Reconcile LSM state when grafting a sock on a socket
 * @sk: sock being grafted
 * @parent: target parent socket
 *
 * Sets @parent's inode secid to @sk's secid and updates @sk with any necessary
 * LSM state from @parent.
 */
void security_sock_graft(struct sock *sk, struct socket *parent)
{
        call_void_hook(sock_graft, sk, parent);
}
EXPORT_SYMBOL(security_sock_graft);

/**
 * security_inet_conn_request() - Set request_sock state using incoming connect
 * @sk: parent listening sock
 * @skb: incoming connection
 * @req: new request_sock
 *
 * Initializes the LSM state of @req from @sk and from the incoming connect
 * carried in @skb.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inet_conn_request(const struct sock *sk,
                               struct sk_buff *skb, struct request_sock *req)
{
        int rc = call_int_hook(inet_conn_request, sk, skb, req);

        return rc;
}
EXPORT_SYMBOL(security_inet_conn_request);

/**
 * security_inet_csk_clone() - Set new sock LSM state based on request_sock
 * @newsk: new sock
 * @req: connection request_sock
 *
 * Set the LSM state of @newsk using the LSM state from @req.
 */
void security_inet_csk_clone(struct sock *newsk,
                             const struct request_sock *req)
{
        call_void_hook(inet_csk_clone, newsk, req);
}

/**
 * security_inet_conn_established() - Update sock's LSM state with connection
 * @sk: sock
 * @skb: connection packet
 *
 * Update @sk's LSM state to represent a new connection from @skb.
 */
void security_inet_conn_established(struct sock *sk,
                                    struct sk_buff *skb)
{
        call_void_hook(inet_conn_established, sk, skb);
}
EXPORT_SYMBOL(security_inet_conn_established);

/**
 * security_secmark_relabel_packet() - Check if setting a secmark is allowed
 * @secid: new secmark value
 *
 * Checks whether the process should be permitted to relabel packets to
 * @secid.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_secmark_relabel_packet(u32 secid)
{
        int rc = call_int_hook(secmark_relabel_packet, secid);

        return rc;
}
EXPORT_SYMBOL(security_secmark_relabel_packet);

/**
 * security_secmark_refcount_inc() - Increment the secmark labeling rule count
 *
 * Tells the LSM to increment the number of secmark labeling rules loaded.
 * Takes no arguments and returns nothing; the secmark_refcount_inc hook does
 * the actual accounting.
 */
void security_secmark_refcount_inc(void)
{
        call_void_hook(secmark_refcount_inc);
}
EXPORT_SYMBOL(security_secmark_refcount_inc);

/**
 * security_secmark_refcount_dec() - Decrement the secmark labeling rule count
 *
 * Tells the LSM to decrement the number of secmark labeling rules loaded.
 * Takes no arguments and returns nothing; the secmark_refcount_dec hook does
 * the actual accounting.
 */
void security_secmark_refcount_dec(void)
{
        call_void_hook(secmark_refcount_dec);
}
EXPORT_SYMBOL(security_secmark_refcount_dec);

/**
 * security_tun_dev_alloc_security() - Allocate a LSM blob for a TUN device
 * @security: pointer to the LSM blob
 *
 * Lets a module allocate a security structure for a TUN device; the pointer
 * to it is returned through @security.
 *
 * Return: Returns a zero on success, negative values on failure.
 */
int security_tun_dev_alloc_security(void **security)
{
        int rc = call_int_hook(tun_dev_alloc_security, security);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_alloc_security);

/**
 * security_tun_dev_free_security() - Free a TUN device LSM blob
 * @security: LSM blob
 *
 * This hook allows a module to free the security structure for a TUN device.
 * @security is passed as-is to the tun_dev_free_security hook.
 */
void security_tun_dev_free_security(void *security)
{
        call_void_hook(tun_dev_free_security, security);
}
EXPORT_SYMBOL(security_tun_dev_free_security);

/**
 * security_tun_dev_create() - Check if creating a TUN device is allowed
 *
 * Permission check that runs before a new TUN device is created.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_create(void)
{
        int rc = call_int_hook(tun_dev_create);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_create);

/**
 * security_tun_dev_attach_queue() - Check if attaching a TUN queue is allowed
 * @security: TUN device LSM blob
 *
 * Permission check that runs before attaching to a TUN device queue.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_attach_queue(void *security)
{
        int rc = call_int_hook(tun_dev_attach_queue, security);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_attach_queue);

/**
 * security_tun_dev_attach() - Update TUN device LSM state on attach
 * @sk: associated sock
 * @security: TUN device LSM blob
 *
 * Lets the module update any security state associated with the TUN
 * device's sock structure.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_attach(struct sock *sk, void *security)
{
        int rc = call_int_hook(tun_dev_attach, sk, security);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_attach);

/**
 * security_tun_dev_open() - Update TUN device LSM state on open
 * @security: TUN device LSM blob
 *
 * Lets the module update any security state associated with the TUN
 * device's security structure.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_open(void *security)
{
        int rc = call_int_hook(tun_dev_open, security);

        return rc;
}
EXPORT_SYMBOL(security_tun_dev_open);

/**
 * security_sctp_assoc_request() - Update the LSM on a SCTP association req
 * @asoc: SCTP association
 * @skb: packet requesting the association
 *
 * Hands @asoc and the @chunk->skb of the association INIT packet to the LSM.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sctp_assoc_request(struct sctp_association *asoc,
                                struct sk_buff *skb)
{
        int rc = call_int_hook(sctp_assoc_request, asoc, skb);

        return rc;
}
EXPORT_SYMBOL(security_sctp_assoc_request);

/**
 * security_sctp_bind_connect() - Validate a list of addrs for a SCTP option
 * @sk: socket
 * @optname: SCTP option to validate
 * @address: list of IP addresses to validate
 * @addrlen: length of the address list
 *
 * Validates the permissions required for each address associated with sock
 * @sk.  Depending on @optname, the addresses are treated as either a connect
 * or a bind service.  The @addrlen is calculated on each IPv4 and IPv6
 * address using sizeof(struct sockaddr_in) or sizeof(struct sockaddr_in6).
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sctp_bind_connect(struct sock *sk, int optname,
                               struct sockaddr *address, int addrlen)
{
        int rc;

        rc = call_int_hook(sctp_bind_connect, sk, optname, address, addrlen);
        return rc;
}
EXPORT_SYMBOL(security_sctp_bind_connect);

/**
 * security_sctp_sk_clone() - Clone a SCTP sock's LSM state
 * @asoc: SCTP association
 * @sk: original sock
 * @newsk: target sock
 *
 * Called whenever a new socket is created by accept(2) (i.e. a TCP style
 * socket) or when a socket is 'peeled off', e.g. when userspace calls
 * sctp_peeloff(3).  All three arguments are forwarded to the sctp_sk_clone
 * hook.
 */
void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
                            struct sock *newsk)
{
        call_void_hook(sctp_sk_clone, asoc, sk, newsk);
}
EXPORT_SYMBOL(security_sctp_sk_clone);

/**
 * security_sctp_assoc_established() - Update LSM state when assoc established
 * @asoc: SCTP association
 * @skb: packet establishing the association
 *
 * Hands @asoc and the @chunk->skb of the association COOKIE_ACK packet to
 * the security module.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sctp_assoc_established(struct sctp_association *asoc,
                                    struct sk_buff *skb)
{
        int rc = call_int_hook(sctp_assoc_established, asoc, skb);

        return rc;
}
EXPORT_SYMBOL(security_sctp_assoc_established);

/**
 * security_mptcp_add_subflow() - Inherit the LSM label from the MPTCP socket
 * @sk: the owning MPTCP socket
 * @ssk: the new subflow
 *
 * Updates the labeling of the given MPTCP subflow so that it matches the
 * labeling of the owning MPTCP socket.  This hook has to be called after the
 * socket has been created and initialized via the security_socket_create()
 * and security_socket_post_create() LSM hooks.
 *
 * Return: Returns 0 on success or a negative error code on failure.
 */
int security_mptcp_add_subflow(struct sock *sk, struct sock *ssk)
{
        int rc = call_int_hook(mptcp_add_subflow, sk, ssk);

        return rc;
}

#endif        /* CONFIG_SECURITY_NETWORK */

#ifdef CONFIG_SECURITY_INFINIBAND
/**
 * security_ib_pkey_access() - Check if access to an IB pkey is allowed
 * @sec: LSM blob
 * @subnet_prefix: subnet prefix of the port
 * @pkey: IB pkey
 *
 * Permission check for accessing a pkey when a QP is being modified.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey)
{
        int rc = call_int_hook(ib_pkey_access, sec, subnet_prefix, pkey);

        return rc;
}
EXPORT_SYMBOL(security_ib_pkey_access);

/**
 * security_ib_endport_manage_subnet() - Check if SMPs traffic is allowed
 * @sec: LSM blob
 * @dev_name: IB device name
 * @port_num: port number
 *
 * Permission check for sending and receiving SMPs on an end port.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ib_endport_manage_subnet(void *sec,
                                      const char *dev_name, u8 port_num)
{
        int rc;

        rc = call_int_hook(ib_endport_manage_subnet, sec, dev_name, port_num);
        return rc;
}
EXPORT_SYMBOL(security_ib_endport_manage_subnet);

/**
 * security_ib_alloc_security() - Allocate an Infiniband LSM blob
 * @sec: LSM blob
 *
 * Allocates a security structure for Infiniband objects.
 *
 * Return: Returns 0 on success, non-zero on failure.
 */
int security_ib_alloc_security(void **sec)
{
        int rc = call_int_hook(ib_alloc_security, sec);

        return rc;
}
EXPORT_SYMBOL(security_ib_alloc_security);

/**
 * security_ib_free_security() - Free an Infiniband LSM blob
 * @sec: LSM blob
 *
 * Deallocate an Infiniband security structure.  @sec is passed as-is to the
 * ib_free_security hook; nothing is returned.
 */
void security_ib_free_security(void *sec)
{
        call_void_hook(ib_free_security, sec);
}
EXPORT_SYMBOL(security_ib_free_security);
#endif        /* CONFIG_SECURITY_INFINIBAND */

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/**
 * security_xfrm_policy_alloc() - Allocate a xfrm policy LSM blob
 * @ctxp: xfrm security context being added to the SPD
 * @sec_ctx: security label provided by userspace
 * @gfp: gfp flags
 *
 * Allocates a security structure for the xp->security field; that field is
 * initialized to NULL when the xfrm_policy is allocated.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
                               struct xfrm_user_sec_ctx *sec_ctx,
                               gfp_t gfp)
{
        int rc = call_int_hook(xfrm_policy_alloc_security, ctxp, sec_ctx, gfp);

        return rc;
}
EXPORT_SYMBOL(security_xfrm_policy_alloc);

/**
 * security_xfrm_policy_clone() - Clone xfrm policy LSM state
 * @old_ctx: xfrm security context
 * @new_ctxp: target xfrm security context
 *
 * Allocates a security structure in @new_ctxp holding the information from
 * the @old_ctx structure.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
                               struct xfrm_sec_ctx **new_ctxp)
{
        int rc = call_int_hook(xfrm_policy_clone_security, old_ctx, new_ctxp);

        return rc;
}

/**
 * security_xfrm_policy_free() - Free a xfrm security context
 * @ctx: xfrm security context
 *
 * Free LSM resources associated with @ctx.  The release itself is performed
 * by the xfrm_policy_free_security hook.
 */
void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
{
        call_void_hook(xfrm_policy_free_security, ctx);
}
EXPORT_SYMBOL(security_xfrm_policy_free);

/**
 * security_xfrm_policy_delete() - Check if deleting a xfrm policy is allowed
 * @ctx: xfrm security context
 *
 * Authorizes the deletion of a SPD entry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
{
        int rc = call_int_hook(xfrm_policy_delete_security, ctx);

        return rc;
}

/**
 * security_xfrm_state_alloc() - Allocate a xfrm state LSM blob
 * @x: xfrm state being added to the SAD
 * @sec_ctx: security label provided by userspace
 *
 * Allocates a security structure for the @x->security field; that field is
 * initialized to NULL when the xfrm_state is allocated.  The context is set
 * to correspond to @sec_ctx.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_state_alloc(struct xfrm_state *x,
                              struct xfrm_user_sec_ctx *sec_ctx)
{
        int rc = call_int_hook(xfrm_state_alloc, x, sec_ctx);

        return rc;
}
EXPORT_SYMBOL(security_xfrm_state_alloc);

/**
 * security_xfrm_state_alloc_acquire() - Allocate a xfrm state LSM blob
 * @x: xfrm state being added to the SAD
 * @polsec: associated policy's security context
 * @secid: secid from the flow
 *
 * Allocates a security structure for the @x->security field; that field is
 * initialized to NULL when the xfrm_state is allocated.  The context is set
 * to correspond to @secid.
 *
 * Return: Returns 0 if operation was successful.
 */
int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
                                      struct xfrm_sec_ctx *polsec, u32 secid)
{
        int rc = call_int_hook(xfrm_state_alloc_acquire, x, polsec, secid);

        return rc;
}

/**
 * security_xfrm_state_delete() - Check if deleting a xfrm state is allowed
 * @x: xfrm state
 *
 * Authorizes the deletion of @x->security.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_xfrm_state_delete(struct xfrm_state *x)
{
        int rc = call_int_hook(xfrm_state_delete_security, x);

        return rc;
}
EXPORT_SYMBOL(security_xfrm_state_delete);

/**
 * security_xfrm_state_free() - Free a xfrm state
 * @x: xfrm state
 *
 * Deallocate @x->security.  The release itself is performed by the
 * xfrm_state_free_security hook.
 */
void security_xfrm_state_free(struct xfrm_state *x)
{
        call_void_hook(xfrm_state_free_security, x);
}

/**
 * security_xfrm_policy_lookup() - Check if using a xfrm policy is allowed
 * @ctx: target xfrm security context
 * @fl_secid: flow secid used to authorize access
 *
 * Permission check applied when a flow selects a xfrm_policy for processing
 * XFRMs on a packet.  The hook is invoked both when a per-socket policy and
 * when a generic xfrm policy is selected.
 *
 * Return: Return 0 if permission is granted, -ESRCH otherwise, or -errno on
 *         other errors.
 */
int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid)
{
        int rc = call_int_hook(xfrm_policy_lookup, ctx, fl_secid);

        return rc;
}

/**
 * security_xfrm_state_pol_flow_match() - Check for a xfrm match
 * @x: xfrm state to match
 * @xp: xfrm policy to check for a match
 * @flic: flow to check for a match.
 *
 * Check @xp and @flic for a match with @x.
 *
 * Return: Returns 1 if there is a match.
 */
int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
                                       struct xfrm_policy *xp,
                                       const struct flowi_common *flic)
{
        struct security_hook_list *hp;
        int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match);

        /*
         * Since this function is expected to return 0 or 1, the judgment
         * becomes difficult if multiple LSMs supply this call. Fortunately,
         * we can use the first LSM's judgment because currently only SELinux
         * supplies this call.
         *
         * For speed optimization, we explicitly break the loop rather than
         * using the macro
         */
        hlist_for_each_entry(hp, &security_hook_heads.xfrm_state_pol_flow_match,
                             list) {
                rc = hp->hook.xfrm_state_pol_flow_match(x, xp, flic);
                break;
        }
        return rc;
}

/**
 * security_xfrm_decode_session() - Determine the xfrm secid for a packet
 * @skb: xfrm packet
 * @secid: secid
 *
 * Decodes the packet in @skb and returns the security label in @secid.
 *
 * Return: Return 0 if all xfrms used have the same secid.
 */
int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid)
{
        /* The trailing 1 is the hook's third argument. */
        int rc = call_int_hook(xfrm_decode_session, skb, secid, 1);

        return rc;
}

/**
 * security_skb_classify_flow() - Set a flow's secid based on a packet
 * @skb: packet
 * @flic: target flow
 *
 * Runs the xfrm_decode_session hook on @skb with @flic's flowic_secid as the
 * output location and 0 as the hook's third argument.  A non-zero hook
 * result triggers BUG_ON().
 */
void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic)
{
        int err;

        err = call_int_hook(xfrm_decode_session, skb, &flic->flowic_secid, 0);
        BUG_ON(err);
}
EXPORT_SYMBOL(security_skb_classify_flow);
#endif        /* CONFIG_SECURITY_NETWORK_XFRM */

#ifdef CONFIG_KEYS
/**
 * security_key_alloc() - Allocate and initialize a kernel key LSM blob
 * @key: key
 * @cred: credentials
 * @flags: allocation flags
 *
 * Permits allocation of a key and assigns security data.  The key has not
 * been given a serial number at this point.
 *
 * Return: Return 0 if permission is granted, -ve error otherwise.
 */
int security_key_alloc(struct key *key, const struct cred *cred,
                       unsigned long flags)
{
        int rc = call_int_hook(key_alloc, key, cred, flags);

        return rc;
}

/**
 * security_key_free() - Free a kernel key LSM blob
 * @key: key
 *
 * Notification of destruction; free security data.  @key is handed to the
 * key_free hook and nothing is returned.
 */
void security_key_free(struct key *key)
{
        call_void_hook(key_free, key);
}

/**
 * security_key_permission() - Check if a kernel key operation is allowed
 * @key_ref: key reference
 * @cred: credentials of actor requesting access
 * @need_perm: requested permissions
 *
 * Determines whether a specific operational right on a key is granted to a
 * process.
 *
 * Return: Return 0 if permission is granted, -ve error otherwise.
 */
int security_key_permission(key_ref_t key_ref, const struct cred *cred,
                            enum key_need_perm need_perm)
{
        int rc = call_int_hook(key_permission, key_ref, cred, need_perm);

        return rc;
}

/**
 * security_key_getsecurity() - Get the key's security label
 * @key: key
 * @buffer: security label buffer
 *
 * Produces a textual representation of the security context attached to a
 * key, for the purposes of honouring KEYCTL_GETSECURITY.  Storage for the
 * NUL-terminated string is allocated by this function and must be freed by
 * the caller.
 *
 * Return: Returns the length of @buffer (including terminating NUL) or -ve if
 *         an error occurs.  May also return 0 (and a NULL buffer pointer) if
 *         there is no security label assigned to the key.
 */
int security_key_getsecurity(struct key *key, char **buffer)
{
        int len;

        /* Clear the output pointer before the hook gets a chance to set it. */
        *buffer = NULL;
        len = call_int_hook(key_getsecurity, key, buffer);
        return len;
}

/**
 * security_key_post_create_or_update() - Notification of key create or update
 * @keyring: keyring to which the key is linked to
 * @key: created or updated key
 * @payload: data used to instantiate or update the key
 * @payload_len: length of payload
 * @flags: key flags
 * @create: flag indicating whether the key was created or updated
 *
 * Notify the caller of a key creation or update.  All six arguments are
 * forwarded unchanged to the key_post_create_or_update hook; nothing is
 * returned.
 */
void security_key_post_create_or_update(struct key *keyring, struct key *key,
                                        const void *payload, size_t payload_len,
                                        unsigned long flags, bool create)
{
        call_void_hook(key_post_create_or_update, keyring, key, payload,
                       payload_len, flags, create);
}
#endif        /* CONFIG_KEYS */

#ifdef CONFIG_AUDIT
/**
 * security_audit_rule_init() - Allocate and init an LSM audit rule struct
 * @field: audit action
 * @op: rule operator
 * @rulestr: rule context
 * @lsmrule: receive buffer for audit rule struct
 * @gfp: GFP flag used for kmalloc
 *
 * Allocates an LSM audit rule structure and initializes it.
 *
 * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of
 *         an invalid rule.
 */
int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule,
                             gfp_t gfp)
{
        int rc;

        rc = call_int_hook(audit_rule_init, field, op, rulestr, lsmrule, gfp);
        return rc;
}

/**
 * security_audit_rule_known() - Check if an audit rule contains LSM fields
 * @krule: audit rule
 *
 * Tells whether the given @krule contains any fields related to the current
 * LSM.
 *
 * Return: Returns 1 in case of relation found, 0 otherwise.
 */
int security_audit_rule_known(struct audit_krule *krule)
{
        int rc = call_int_hook(audit_rule_known, krule);

        return rc;
}

/**
 * security_audit_rule_free() - Free an LSM audit rule struct
 * @lsmrule: audit rule struct
 *
 * Deallocate the LSM audit rule structure previously allocated by
 * audit_rule_init().  @lsmrule is passed as-is to the audit_rule_free hook.
 */
void security_audit_rule_free(void *lsmrule)
{
        call_void_hook(audit_rule_free, lsmrule);
}

/**
 * security_audit_rule_match() - Check if a label matches an audit rule
 * @secid: security label
 * @field: LSM audit field
 * @op: matching operator
 * @lsmrule: audit rule
 *
 * Determines whether the given @secid matches a rule previously approved by
 * security_audit_rule_known().
 *
 * Return: Returns 1 if secid matches the rule, 0 if it does not, -ERRNO on
 *         failure.
 */
int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule)
{
        int rc = call_int_hook(audit_rule_match, secid, field, op, lsmrule);

        return rc;
}
#endif /* CONFIG_AUDIT */

#ifdef CONFIG_BPF_SYSCALL
/**
 * security_bpf() - Check if the bpf syscall operation is allowed
 * @cmd: command
 * @attr: bpf attribute
 * @size: size
 *
 * Do an initial check for all bpf syscalls after the attribute is copied into
 * the kernel. Each security module can implement its own rules to check the
 * specific @cmd values it cares about.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
        return call_int_hook(bpf, cmd, attr, size);
}

/**
 * security_bpf_map() - Check if access to a bpf map is allowed
 * @map: bpf map
 * @fmode: mode
 *
 * Do a check when the kernel generates and returns a file descriptor for eBPF
 * maps.  @map and @fmode are passed to the bpf_map hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf_map(struct bpf_map *map, fmode_t fmode)
{
        return call_int_hook(bpf_map, map, fmode);
}

/**
 * security_bpf_prog() - Check if access to a bpf program is allowed
 * @prog: bpf program
 *
 * Do a check when the kernel generates and returns a file descriptor for eBPF
 * programs.  @prog is passed to the bpf_prog hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf_prog(struct bpf_prog *prog)
{
        return call_int_hook(bpf_prog, prog);
}

/**
 * security_bpf_map_create() - Check if BPF map creation is allowed
 * @map: BPF map object
 * @attr: BPF syscall attributes used to create BPF map
 * @token: BPF token used to grant user access
 *
 * Do a check when the kernel creates a new BPF map. This is also the
 * point where the LSM blob is allocated for LSMs that need one.  The check
 * itself is performed by the bpf_map_create hook.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
                            struct bpf_token *token)
{
        return call_int_hook(bpf_map_create, map, attr, token);
}

/**
 * security_bpf_prog_load() - Check if loading of BPF program is allowed
 * @prog: BPF program object
 * @attr: BPF syscall attributes used to create BPF program
 * @token: BPF token used to grant user access to BPF subsystem
 *
 * Perform an access control check when the kernel loads a BPF program and
 * allocates the associated BPF program object. The bpf_prog_load hook invoked
 * here is also responsible for allocating any required LSM state for the BPF
 * program.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
                           struct bpf_token *token)
{
        return call_int_hook(bpf_prog_load, prog, attr, token);
}

/**
 * security_bpf_token_create() - Check if creation of a BPF token is allowed
 * @token: BPF token object
 * @attr: BPF syscall attributes used to create BPF token
 * @path: path pointing to BPF FS mount point from which BPF token is created
 *
 * Do a check when the kernel instantiates a new BPF token object from a BPF FS
 * instance. This is also the point where an LSM blob can be allocated for
 * LSMs.  The check is performed by the bpf_token_create hook.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
                              struct path *path)
{
        return call_int_hook(bpf_token_create, token, attr, path);
}

/**
 * security_bpf_token_cmd() - Check if BPF token is allowed to delegate
 * requested BPF syscall command
 * @token: BPF token object
 * @cmd: BPF syscall command requested to be delegated by BPF token
 *
 * Do a check when the kernel decides whether the provided BPF token should
 * allow delegation of the requested BPF syscall command.  The decision is
 * delegated to the bpf_token_cmd hook.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
{
        return call_int_hook(bpf_token_cmd, token, cmd);
}

/**
 * security_bpf_token_capable() - Check if BPF token is allowed to delegate
 * requested BPF-related capability
 * @token: BPF token object
 * @cap: capabilities requested to be delegated by BPF token
 *
 * Do a check when the kernel decides whether the provided BPF token should
 * allow delegation of the requested BPF-related capabilities.  The decision is
 * delegated to the bpf_token_capable hook.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_token_capable(const struct bpf_token *token, int cap)
{
        return call_int_hook(bpf_token_capable, token, cap);
}

/**
 * security_bpf_map_free() - Free a bpf map's LSM blob
 * @map: bpf map
 *
 * Clean up the security information stored inside the bpf map by invoking the
 * bpf_map_free hook.  Nothing is returned to the caller.
 */
void security_bpf_map_free(struct bpf_map *map)
{
        call_void_hook(bpf_map_free, map);
}

/**
 * security_bpf_prog_free() - Free a BPF program's LSM blob
 * @prog: BPF program struct
 *
 * Clean up the security information stored inside the BPF program by invoking
 * the bpf_prog_free hook.  Nothing is returned to the caller.
 */
void security_bpf_prog_free(struct bpf_prog *prog)
{
        call_void_hook(bpf_prog_free, prog);
}

/**
 * security_bpf_token_free() - Free a BPF token's LSM blob
 * @token: BPF token struct
 *
 * Clean up the security information stored inside the BPF token by invoking
 * the bpf_token_free hook.  Nothing is returned to the caller.
 */
void security_bpf_token_free(struct bpf_token *token)
{
        call_void_hook(bpf_token_free, token);
}
#endif /* CONFIG_BPF_SYSCALL */

/**
 * security_locked_down() - Check if a kernel feature is allowed
 * @what: requested kernel feature
 *
 * Determine whether a kernel feature that potentially enables arbitrary code
 * execution in kernel space should be permitted.  The decision is delegated
 * to the locked_down hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_locked_down(enum lockdown_reason what)
{
        return call_int_hook(locked_down, what);
}
EXPORT_SYMBOL(security_locked_down);

#ifdef CONFIG_PERF_EVENTS
/**
 * security_perf_event_open() - Check if a perf event open is allowed
 * @attr: perf event attribute
 * @type: type of event
 *
 * Check whether the @type of perf_event_open syscall is allowed.  @attr and
 * @type are passed to the perf_event_open hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_open(struct perf_event_attr *attr, int type)
{
        return call_int_hook(perf_event_open, attr, type);
}

/**
 * security_perf_event_alloc() - Allocate a perf event LSM blob
 * @event: perf event
 *
 * Allocate and save perf_event security info.  The work is done by the
 * perf_event_alloc hook, whose result is returned.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_perf_event_alloc(struct perf_event *event)
{
        return call_int_hook(perf_event_alloc, event);
}

/**
 * security_perf_event_free() - Free a perf event LSM blob
 * @event: perf event
 *
 * Release (free) perf_event security info by invoking the perf_event_free
 * hook.  Nothing is returned to the caller.
 */
void security_perf_event_free(struct perf_event *event)
{
        call_void_hook(perf_event_free, event);
}

/**
 * security_perf_event_read() - Check if reading a perf event label is allowed
 * @event: perf event
 *
 * Read perf_event security info if allowed.  The check is delegated to the
 * perf_event_read hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_read(struct perf_event *event)
{
        return call_int_hook(perf_event_read, event);
}

/**
 * security_perf_event_write() - Check if writing a perf event label is allowed
 * @event: perf event
 *
 * Write perf_event security info if allowed.  The check is delegated to the
 * perf_event_write hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_write(struct perf_event *event)
{
        return call_int_hook(perf_event_write, event);
}
#endif /* CONFIG_PERF_EVENTS */

#ifdef CONFIG_IO_URING
/**
 * security_uring_override_creds() - Check if overriding creds is allowed
 * @new: new credentials
 *
 * Check if the current task, executing an io_uring operation, is allowed to
 * override its credentials with @new.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_override_creds(const struct cred *new)
{
        return call_int_hook(uring_override_creds, new);
}

/**
 * security_uring_sqpoll() - Check if IORING_SETUP_SQPOLL is allowed
 *
 * Check whether the current task is allowed to spawn an io_uring polling
 * thread (IORING_SETUP_SQPOLL).  The uring_sqpoll hook takes no arguments.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_sqpoll(void)
{
        return call_int_hook(uring_sqpoll);
}

/**
 * security_uring_cmd() - Check if an io_uring passthrough command is allowed
 * @ioucmd: command
 *
 * Check whether the file_operations uring_cmd is allowed to run.  @ioucmd is
 * passed to the uring_cmd hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_cmd(struct io_uring_cmd *ioucmd)
{
        return call_int_hook(uring_cmd, ioucmd);
}
#endif /* CONFIG_IO_URING */




















































































































































































































    1 









    1 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
/*
 * linux/fs/nls/nls_iso8859-1.c
 *
 * Charset iso8859-1 translation tables.
 * Generated automatically from the Unicode and charset
 * tables from the Unicode Organization (www.unicode.org).
 * The Unicode to charset table has only exact mappings.
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/nls.h>
#include <linux/errno.h>

/*
 * Charset-to-Unicode table, indexed by the iso8859-1 byte value.  Every
 * entry equals its own index: byte 0xNN maps to code point U+00NN.
 */
static const wchar_t charset2uni[256] = {
        /* 0x00 */
        0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
        0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
        /* 0x10 */
        0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
        0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
        /* 0x20 */
        0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
        0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
        /* 0x30 */
        0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
        0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
        /* 0x40 */
        0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
        0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
        /* 0x50 */
        0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
        0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
        /* 0x60 */
        0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
        0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
        /* 0x70 */
        0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
        0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
        /* 0x80 */
        0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
        0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
        /* 0x90 */
        0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
        0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
        /* 0xa0 */
        0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
        0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
        /* 0xb0 */
        0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
        0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
        /* 0xc0 */
        0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
        0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
        /* 0xd0 */
        0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
        0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
        /* 0xe0 */
        0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
        0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
        /* 0xf0 */
        0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
        0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
};

/*
 * Unicode-to-charset table for code points whose high byte is 0x00, indexed
 * by the low byte.  It is installed as slot 0 of page_uni2charset[] and every
 * entry equals its own index.  Entry 0 is 0x00, which uni2char() treats as
 * "no mapping".
 */
static const unsigned char page00[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
        0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
};

/*
 * Per-page lookup used by uni2char(): slot N serves the code points whose
 * high byte is N.  Only page 0x00 is populated; every slot without an
 * explicit initializer is a null pointer.
 */
static const unsigned char *const page_uni2charset[256] = {
        [0x00] = page00,
};

/*
 * Lower-case mapping, indexed by the iso8859-1 byte value.  Bytes 0x41-0x5a
 * map to 0x61-0x7a and bytes 0xc0-0xde map to 0xe0-0xfe, except 0xd7, which
 * maps to itself.  Every other byte maps to itself.
 */
static const unsigned char charset2lower[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
        0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
        0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
};

/*
 * Upper-case mapping, indexed by the iso8859-1 byte value.  Bytes 0x61-0x7a
 * map to 0x41-0x5a and bytes 0xe0-0xfe map to 0xc0-0xde, except 0xf7, which
 * maps to itself.  Entries 0xb5 and 0xff are 0x00; every other byte maps to
 * itself.
 *
 * NOTE(review): the 0x00 entries presumably mark bytes whose upper-case form
 * does not exist in this charset -- confirm against the code that consumes
 * this table.
 */
static const unsigned char charset2upper[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
        0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
        0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0x00, /* 0xf8-0xff */
};

static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
{
        const unsigned char *uni2charset;
        unsigned char cl = uni & 0x00ff;
        unsigned char ch = (uni & 0xff00) >> 8;

        if (boundlen <= 0)
                return -ENAMETOOLONG;

        uni2charset = page_uni2charset[ch];
        if (uni2charset && uni2charset[cl])
                out[0] = uni2charset[cl];
        else
                return -EINVAL;
        return 1;
}

/*
 * Convert the first byte of @rawstring to a Unicode code point.
 *
 * @rawstring: input bytes; only rawstring[0] is read
 * @boundlen:  not consulted by this implementation
 * @uni:       receives the charset2uni[] entry for the byte, even when that
 *             entry is zero
 *
 * Return: 1 when the table entry is non-zero, -EINVAL when it is 0x0000.
 */
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
{
        wchar_t decoded = charset2uni[rawstring[0]];

        *uni = decoded;
        return decoded ? 1 : -EINVAL;
}

/*
 * NLS descriptor for the "iso8859-1" charset: it names the charset and points
 * at the conversion callbacks and case-mapping tables defined in this file.
 * This is the object passed to register_nls() and unregister_nls() below.
 */
static struct nls_table table = {
        .charset        = "iso8859-1",
        .uni2char        = uni2char,
        .char2uni        = char2uni,
        .charset2lower        = charset2lower,
        .charset2upper        = charset2upper,
};

/* Module init: register the iso8859-1 table and return register_nls()'s result. */
static int __init init_nls_iso8859_1(void)
{
        return register_nls(&table);
}

/* Module exit: unregister the iso8859-1 table registered at init time. */
static void __exit exit_nls_iso8859_1(void)
{
        unregister_nls(&table);
}

/* Hook the init/exit functions above up via module_init()/module_exit(). */
module_init(init_nls_iso8859_1)
module_exit(exit_nls_iso8859_1)

/* License string declared for this module. */
MODULE_LICENSE("Dual BSD/GPL");





















































































































































































    3 













































































   13 





























































































































   12 



















































































































   12 


   12 































    1 


    4 

    4 

    5 









































































































































































































































































   13 
   13 












    6 














    6 













    6 
    6 









































































































































































   12 

































    8 

   10 
    1 














    1 



    1 
    1 



























































    1 























    4 
    2 










































































































































































    1 









    2 
    2 





























    2 
    2 


    2 
















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGEMAP_H
#define _LINUX_PAGEMAP_H

/*
 * Copyright 1995 Linus Torvalds
 */
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <linux/bitops.h>
#include <linux/hardirq.h> /* for in_interrupt() */
#include <linux/hugetlb_inline.h>

struct folio_batch;

unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                        pgoff_t start, pgoff_t end);

/*
 * Call invalidate_mapping_pages() on @inode's mapping with start 0 and end -1,
 * but only when @inode is a regular file, a directory or a symbolic link.
 * Any other inode type is left untouched.
 */
static inline void invalidate_remote_inode(struct inode *inode)
{
        /* Nothing to do unless the inode is a file, directory or symlink. */
        if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
            !S_ISLNK(inode->i_mode))
                return;

        invalidate_mapping_pages(inode->i_mapping, 0, -1);
}
int invalidate_inode_pages2(struct address_space *mapping);
int invalidate_inode_pages2_range(struct address_space *mapping,
                pgoff_t start, pgoff_t end);
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count);

int write_inode_now(struct inode *, int sync);
int filemap_fdatawrite(struct address_space *);
int filemap_flush(struct address_space *);
int filemap_fdatawait_keep_errors(struct address_space *mapping);
int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend);
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
                loff_t start_byte, loff_t end_byte);
int filemap_invalidate_inode(struct inode *inode, bool flush,
                             loff_t start, loff_t end);

/*
 * Whole-file form of filemap_fdatawait_range(): passes the byte range
 * 0..LLONG_MAX for @mapping and returns that function's result unchanged.
 */
static inline int filemap_fdatawait(struct address_space *mapping)
{
        return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
}

bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend);
int filemap_write_and_wait_range(struct address_space *mapping,
                loff_t lstart, loff_t lend);
int __filemap_fdatawrite_range(struct address_space *mapping,
                loff_t start, loff_t end, int sync_mode);
int filemap_fdatawrite_range(struct address_space *mapping,
                loff_t start, loff_t end);
int filemap_check_errors(struct address_space *mapping);
void __filemap_set_wb_err(struct address_space *mapping, int err);
int filemap_fdatawrite_wbc(struct address_space *mapping,
                           struct writeback_control *wbc);
int kiocb_write_and_wait(struct kiocb *iocb, size_t count);

static inline int filemap_write_and_wait(struct address_space *mapping)
{
        return filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
}

/**
 * filemap_set_wb_err - set a writeback error on an address_space
 * @mapping: mapping in which to set writeback error
 * @err: error to be set in mapping
 *
 * When writeback fails in some way, we must record that error so that
 * userspace can be informed when fsync and the like are called.  We endeavor
 * to report errors on any file that was open at the time of the error.  Some
 * internal callers also need to know when writeback errors have occurred.
 *
 * When a writeback error occurs, most filesystems will want to call
 * filemap_set_wb_err to record the error in the mapping so that it will be
 * automatically reported whenever fsync is called on the file.
 */
static inline void filemap_set_wb_err(struct address_space *mapping, int err)
{
        /* The common case is no error: bail out before the real work. */
        if (likely(!err))
                return;

        __filemap_set_wb_err(mapping, err);
}

/**
 * filemap_check_wb_err - has an error occurred since the mark was sampled?
 * @mapping: mapping to check for writeback errors
 * @since: previously-sampled errseq_t
 *
 * Grab the errseq_t value from the mapping, and see if it has changed "since"
 * the given value was sampled.
 *
 * If it has then report the latest error set, otherwise return 0.
 */
static inline int filemap_check_wb_err(struct address_space *mapping,
                                       errseq_t since)
{
        errseq_t *current_seq = &mapping->wb_err;

        /* errseq_check() compares the mapping's value against @since. */
        return errseq_check(current_seq, since);
}

/**
 * filemap_sample_wb_err - sample the current errseq_t to test for later errors
 * @mapping: mapping to be sampled
 *
 * Writeback errors are always reported relative to a particular sample point
 * in the past. This function provides those sample points.
 */
static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
{
        errseq_t *current_seq = &mapping->wb_err;

        return errseq_sample(current_seq);
}

/**
 * file_sample_sb_err - sample the current errseq_t to test for later errors
 * @file: file pointer to be sampled
 *
 * Grab the most current superblock-level errseq_t value for the given
 * struct file.
 */
static inline errseq_t file_sample_sb_err(struct file *file)
{
        return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
}

/*
 * Flush file data before changing attributes.  Caller must hold any locks
 * required to prevent further writes to this file until we're done setting
 * flags.
 */
static inline int inode_drain_writes(struct inode *inode)
{
        struct address_space *mapping;

        /* Direct I/O first, then the page cache. */
        inode_dio_wait(inode);

        mapping = inode->i_mapping;
        return filemap_write_and_wait(mapping);
}

/* Report whether the mapping's i_pages xarray is empty. */
static inline bool mapping_empty(struct address_space *mapping)
{
        struct xarray *pages = &mapping->i_pages;

        return xa_empty(pages);
}

/*
 * mapping_shrinkable - test if page cache state allows inode reclaim
 * @mapping: the page cache mapping
 *
 * This checks the mapping's cache state for the purpose of inode
 * reclaim and LRU management.
 *
 * The caller is expected to hold the i_lock, but is not required to
 * hold the i_pages lock, which usually protects cache state. That's
 * because the i_lock and the list_lru lock that protect the inode and
 * its LRU state don't nest inside the irq-safe i_pages lock.
 *
 * Cache deletions are performed under the i_lock, which ensures that
 * when an inode goes empty, it will reliably get queued on the LRU.
 *
 * Cache additions do not acquire the i_lock and may race with this
 * check, in which case we'll report the inode as shrinkable when it
 * has cache pages. This is okay: the shrinker also checks the
 * refcount and the referenced bit, which will be elevated or set in
 * the process of adding new cache pages to an inode.
 */
static inline bool mapping_shrinkable(struct address_space *mapping)
{
        void *root;

        /*
         * With highmem, inodes can cause lowmem pressure before the page
         * cache causes highmem pressure, so cache state never holds an
         * inode back.
         */
        if (IS_ENABLED(CONFIG_HIGHMEM))
                return true;

        root = rcu_access_pointer(mapping->i_pages.xa_head);

        /* Nothing cached at all: free to shrink. */
        if (root == NULL)
                return true;

        /*
         * A single offset-0 entry is stored directly in the head pointer
         * rather than in an xarray node, so a non-resident entry there is
         * not on the shadow shrinker's node list.  Report it as shrinkable
         * so the inode shrinker can pick it up under memory pressure.
         */
        return !xa_is_node(root) && xa_is_value(root);
}

/*
 * Bits in mapping->flags.
 */
enum mapping_flags {
        /* IO error on async write */
        AS_EIO = 0,
        /* ENOSPC on async write */
        AS_ENOSPC = 1,
        /* under mm_take_all_locks() */
        AS_MM_ALL_LOCKS = 2,
        /* e.g., ramdisk, SHM_LOCK */
        AS_UNEVICTABLE = 3,
        /* final truncate in progress */
        AS_EXITING = 4,
        /* writeback related tags are not used */
        AS_NO_WRITEBACK_TAGS = 5,
        AS_LARGE_FOLIO_SUPPORT = 6,
        /* Call ->release_folio(), even if no private data */
        AS_RELEASE_ALWAYS = 7,
        /* must wait for writeback before modifying folio contents */
        AS_STABLE_WRITES = 8,
        /* The mapping cannot be moved, ever */
        AS_UNMOVABLE = 9,
};

/**
 * mapping_set_error - record a writeback error in the address_space
 * @mapping: the mapping in which an error should be set
 * @error: the error to set in the mapping
 *
 * When writeback fails in some way, we must record that error so that
 * userspace can be informed when fsync and the like are called.  We endeavor
 * to report errors on any file that was open at the time of the error.  Some
 * internal callers also need to know when writeback errors have occurred.
 *
 * When a writeback error occurs, most filesystems will want to call
 * mapping_set_error to record the error in the mapping so that it can be
 * reported when the application calls fsync(2).
 */
static inline void mapping_set_error(struct address_space *mapping, int error)
{
        if (likely(!error))
                return;

        /* Record in wb_err for checkers using errseq_t based tracking */
        __filemap_set_wb_err(mapping, error);

        /* Record it in superblock */
        if (mapping->host)
                errseq_set(&mapping->host->i_sb->s_wb_err, error);

        /* Record it in flags for now, for legacy callers */
        if (error == -ENOSPC)
                set_bit(AS_ENOSPC, &mapping->flags);
        else
                set_bit(AS_EIO, &mapping->flags);
}

/* Set the AS_UNEVICTABLE bit in mapping->flags. */
static inline void mapping_set_unevictable(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        set_bit(AS_UNEVICTABLE, flags);
}

/* Clear the AS_UNEVICTABLE bit in mapping->flags. */
static inline void mapping_clear_unevictable(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        clear_bit(AS_UNEVICTABLE, flags);
}

/* True only for a non-NULL mapping that has AS_UNEVICTABLE set. */
static inline bool mapping_unevictable(struct address_space *mapping)
{
        if (!mapping)
                return false;

        return test_bit(AS_UNEVICTABLE, &mapping->flags);
}

/* Set the AS_EXITING bit in mapping->flags. */
static inline void mapping_set_exiting(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        set_bit(AS_EXITING, flags);
}

/* Test the AS_EXITING bit in mapping->flags. */
static inline int mapping_exiting(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        return test_bit(AS_EXITING, flags);
}

/* Set the AS_NO_WRITEBACK_TAGS bit in mapping->flags. */
static inline void mapping_set_no_writeback_tags(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        set_bit(AS_NO_WRITEBACK_TAGS, flags);
}

/* Returns 1 unless AS_NO_WRITEBACK_TAGS is set on the mapping, else 0. */
static inline int mapping_use_writeback_tags(struct address_space *mapping)
{
        if (test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags))
                return 0;

        return 1;
}

/* Test the AS_RELEASE_ALWAYS bit in mapping->flags. */
static inline bool mapping_release_always(const struct address_space *mapping)
{
        const unsigned long *flags = &mapping->flags;

        return test_bit(AS_RELEASE_ALWAYS, flags);
}

/* Set the AS_RELEASE_ALWAYS bit in mapping->flags. */
static inline void mapping_set_release_always(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        set_bit(AS_RELEASE_ALWAYS, flags);
}

/* Clear the AS_RELEASE_ALWAYS bit in mapping->flags. */
static inline void mapping_clear_release_always(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        clear_bit(AS_RELEASE_ALWAYS, flags);
}

/* Test the AS_STABLE_WRITES bit in mapping->flags. */
static inline bool mapping_stable_writes(const struct address_space *mapping)
{
        const unsigned long *flags = &mapping->flags;

        return test_bit(AS_STABLE_WRITES, flags);
}

/* Set the AS_STABLE_WRITES bit in mapping->flags. */
static inline void mapping_set_stable_writes(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        set_bit(AS_STABLE_WRITES, flags);
}

/* Clear the AS_STABLE_WRITES bit in mapping->flags. */
static inline void mapping_clear_stable_writes(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        clear_bit(AS_STABLE_WRITES, flags);
}

/* Mark the mapping unmovable; this also marks it unevictable. */
static inline void mapping_set_unmovable(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        /*
         * Unmovable mappings are expected to be unevictable as well: the
         * compaction migrate scanner (isolate_migratepages_block()) depends
         * on that to reduce page locking.
         */
        set_bit(AS_UNEVICTABLE, flags);
        set_bit(AS_UNMOVABLE, flags);
}

/* Test the AS_UNMOVABLE bit in mapping->flags. */
static inline bool mapping_unmovable(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        return test_bit(AS_UNMOVABLE, flags);
}

/* Read the gfp mask stored in the mapping. */
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
        gfp_t mask = mapping->gfp_mask;

        return mask;
}

/* Restricts the given gfp_mask to what the mapping allows. */
static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
                gfp_t gfp_mask)
{
        gfp_t allowed = mapping_gfp_mask(mapping);

        /* Keep only the bits present in both masks. */
        return allowed & gfp_mask;
}

/*
 * This is non-atomic.  Only to be used before the mapping is activated.
 * Probably needs a barrier...
 */
static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
{
        gfp_t *slot = &m->gfp_mask;

        /* Plain store, no atomics or barriers. */
        *slot = mask;
}

/*
 * There are some parts of the kernel which assume that PMD entries
 * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
 * limit the maximum allocation order to PMD size.  I'm not aware of any
 * assumptions about maximum order if THP are disabled, but 8 seems like
 * a good order (that's 1MB if you're using 4kB pages)
 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define MAX_PAGECACHE_ORDER        HPAGE_PMD_ORDER
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
#define MAX_PAGECACHE_ORDER        8
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * mapping_set_large_folios() - Indicate the file supports large folios.
 * @mapping: The file.
 *
 * The filesystem should call this function in its inode constructor to
 * indicate that the VFS can use large folios to cache the contents of
 * the file.
 *
 * Context: This should not be called while the inode is active as it
 * is non-atomic.
 */
static inline void mapping_set_large_folios(struct address_space *mapping)
{
        unsigned long *flags = &mapping->flags;

        /* __set_bit() is the non-atomic variant; see Context above. */
        __set_bit(AS_LARGE_FOLIO_SUPPORT, flags);
}

/*
 * Large folio support currently depends on THP.  These dependencies are
 * being worked on but are not yet fixed.
 */
static inline bool mapping_large_folio_support(struct address_space *mapping)
{
        /* The flag only makes sense for pagecache mappings, not anon ones. */
        VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON,
                        "Anonymous mapping always supports large folio");

        /* Without THP the answer is always no, whatever the flag says. */
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return false;

        return test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
}

/* Return the maximum folio size for this pagecache mapping, in bytes. */
static inline size_t mapping_max_folio_size(struct address_space *mapping)
{
        size_t limit = PAGE_SIZE;

        /* Large-folio mappings may go up to MAX_PAGECACHE_ORDER pages. */
        if (mapping_large_folio_support(mapping))
                limit = PAGE_SIZE << MAX_PAGECACHE_ORDER;

        return limit;
}

/* Read mapping->nr_thps; always 0 without CONFIG_READ_ONLY_THP_FOR_FS. */
static inline int filemap_nr_thps(struct address_space *mapping)
{
#ifndef CONFIG_READ_ONLY_THP_FOR_FS
        return 0;
#else
        return atomic_read(&mapping->nr_thps);
#endif
}

/*
 * Increment mapping->nr_thps, but only for mappings without large folio
 * support.  Without CONFIG_READ_ONLY_THP_FOR_FS there is no counter, and
 * a call on a mapping lacking large folio support triggers a one-time
 * warning.
 */
static inline void filemap_nr_thps_inc(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        if (mapping_large_folio_support(mapping))
                return;

        atomic_inc(&mapping->nr_thps);
#else
        WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0);
#endif
}

/*
 * Decrement mapping->nr_thps, but only for mappings without large folio
 * support.  Without CONFIG_READ_ONLY_THP_FOR_FS there is no counter, and
 * a call on a mapping lacking large folio support triggers a one-time
 * warning.
 */
static inline void filemap_nr_thps_dec(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        if (mapping_large_folio_support(mapping))
                return;

        atomic_dec(&mapping->nr_thps);
#else
        WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0);
#endif
}

struct address_space *page_mapping(struct page *);
struct address_space *folio_mapping(struct folio *);
struct address_space *swapcache_mapping(struct folio *);

/**
 * folio_file_mapping - Find the mapping this folio belongs to.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to.  Folios in the swap cache return the mapping of the
 * swap file or swap device where the data is stored.  This is different
 * from the mapping returned by folio_mapping().  The only reason to
 * use it is if, like NFS, you return 0 from ->activate_swapfile.
 *
 * Do not call this for folios which aren't in the page cache or swap cache.
 */
static inline struct address_space *folio_file_mapping(struct folio *folio)
{
        /* Common case: not in the swap cache, so use folio->mapping as is. */
        if (likely(!folio_test_swapcache(folio)))
                return folio->mapping;

        return swapcache_mapping(folio);
}

/**
 * folio_flush_mapping - Find the file mapping this folio belongs to.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to.  Anonymous folios return NULL, even if they're in
 * the swap cache.  Other kinds of folio also return NULL.
 *
 * This is ONLY used by architecture cache flushing code.  If you aren't
 * writing cache flushing code, you want either folio_mapping() or
 * folio_file_mapping().
 */
static inline struct address_space *folio_flush_mapping(struct folio *folio)
{
        /* Common case: not in the swap cache, defer to folio_mapping(). */
        if (likely(!folio_test_swapcache(folio)))
                return folio_mapping(folio);

        /* Swap cache folios never yield a mapping here. */
        return NULL;
}

/* Page-based wrapper: folio_file_mapping() of the folio containing @page. */
static inline struct address_space *page_file_mapping(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_file_mapping(folio);
}

/**
 * folio_inode - Get the host inode for this folio.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the inode that this folio
 * belongs to.
 *
 * Do not call this for folios which aren't in the page cache.
 */
static inline struct inode *folio_inode(struct folio *folio)
{
        return folio->mapping->host;
}

/**
 * folio_attach_private - Attach private data to a folio.
 * @folio: Folio to attach data to.
 * @data: Data to attach to folio.
 *
 * Attaching private data to a folio increments the page's reference count.
 * The data must be detached before the folio will be freed.
 */
static inline void folio_attach_private(struct folio *folio, void *data)
{
        /* Take the reference that folio_detach_private() later drops. */
        folio_get(folio);
        /* Store the pointer before setting the private flag. */
        folio->private = data;
        folio_set_private(folio);
}

/**
 * folio_change_private - Change private data on a folio.
 * @folio: Folio to change the data on.
 * @data: Data to set on the folio.
 *
 * Change the private data attached to a folio and return the old
 * data.  The page must previously have had data attached and the data
 * must be detached before the folio will be freed.
 *
 * Return: Data that was previously attached to the folio.
 */
static inline void *folio_change_private(struct folio *folio, void *data)
{
        /* Remember what was attached so it can be handed back. */
        void *previous = folio_get_private(folio);

        /* Swap in the new pointer; refcount and flags are not touched. */
        folio->private = data;

        return previous;
}

/**
 * folio_detach_private - Detach private data from a folio.
 * @folio: Folio to detach data from.
 *
 * Removes the data that was previously attached to the folio and decrements
 * the refcount on the page.
 *
 * Return: Data that was attached to the folio.
 */
static inline void *folio_detach_private(struct folio *folio)
{
        void *attached = folio_get_private(folio);

        /* Nothing attached according to the private flag: nothing to undo. */
        if (!folio_test_private(folio))
                return NULL;

        /* Clear the flag and the pointer, then drop one folio reference. */
        folio_clear_private(folio);
        folio->private = NULL;
        folio_put(folio);

        return attached;
}

/* Page-based wrapper around folio_attach_private(). */
static inline void attach_page_private(struct page *page, void *data)
{
        struct folio *folio = page_folio(page);

        folio_attach_private(folio, data);
}

/* Page-based wrapper around folio_detach_private(). */
static inline void *detach_page_private(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_detach_private(folio);
}

/*
 * Without CONFIG_NUMA this is a direct call to folio_alloc_noprof();
 * with CONFIG_NUMA the function is only declared here.
 */
#ifndef CONFIG_NUMA
static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
        struct folio *folio = folio_alloc_noprof(gfp, order);

        return folio;
}
#else
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
#endif

/*
 * Wraps filemap_alloc_folio_noprof() in alloc_hooks(); all arguments are
 * forwarded unchanged.
 */
#define filemap_alloc_folio(...)                                \
        alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__))

/* Allocate an order-0 page cache folio and return its struct page. */
static inline struct page *__page_cache_alloc(gfp_t gfp)
{
        struct folio *folio = filemap_alloc_folio(gfp, 0);

        return &folio->page;
}

/* The mapping's gfp mask with __GFP_NORETRY and __GFP_NOWARN added. */
static inline gfp_t readahead_gfp_mask(struct address_space *x)
{
        gfp_t gfp = mapping_gfp_mask(x);

        gfp |= __GFP_NORETRY | __GFP_NOWARN;
        return gfp;
}

/*
 * Callback type taken as the @filler argument of read_cache_folio() and
 * read_cache_page().
 */
typedef int filler_t(struct file *, struct folio *);

pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan);
pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan);

/**
 * typedef fgf_t - Flags for getting folios from the page cache.
 *
 * Most users of the page cache will not need to use these flags;
 * there are convenience functions such as filemap_get_folio() and
 * filemap_lock_folio().  For users which need more control over exactly
 * what is done with the folios, these flags to __filemap_get_folio()
 * are available.
 *
 * * %FGP_ACCESSED - The folio will be marked accessed.
 * * %FGP_LOCK - The folio is returned locked.
 * * %FGP_CREAT - If no folio is present then a new folio is allocated,
 *   added to the page cache and the VM's LRU list.  The folio is
 *   returned locked.
 * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
 *   folio is already in cache.  If the folio was allocated, unlock it
 *   before returning so the caller can do the same dance.
 * * %FGP_WRITE - The folio will be written to by the caller.
 * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
 * * %FGP_NOWAIT - Don't block on the folio lock.
 * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
 * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin()
 *   implementation.
 */
/* Flag word passed to __filemap_get_folio(); a distinct __bitwise type. */
typedef unsigned int __bitwise fgf_t;

/* Individual FGP_* flag bits, documented in the fgf_t kernel-doc above. */
#define FGP_ACCESSED                ((__force fgf_t)0x00000001)
#define FGP_LOCK                ((__force fgf_t)0x00000002)
#define FGP_CREAT                ((__force fgf_t)0x00000004)
#define FGP_WRITE                ((__force fgf_t)0x00000008)
#define FGP_NOFS                ((__force fgf_t)0x00000010)
#define FGP_NOWAIT                ((__force fgf_t)0x00000020)
#define FGP_FOR_MMAP                ((__force fgf_t)0x00000040)
#define FGP_STABLE                ((__force fgf_t)0x00000080)
/*
 * Extract the order encoded by fgf_set_order() from the top 6 bits.  The
 * argument is parenthesized so that a compound expression such as
 * FGF_GET_ORDER(a | b) is cast and shifted as a whole.
 */
#define FGF_GET_ORDER(fgf)        (((__force unsigned)(fgf)) >> 26)        /* top 6 bits */

/* Flag combination for a filesystem write_begin() implementation. */
#define FGP_WRITEBEGIN                (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)

/**
 * fgf_set_order - Encode a length in the fgf_t flags.
 * @size: The suggested size of the folio to create.
 *
 * The caller of __filemap_get_folio() can use this to suggest a preferred
 * size for the folio that is created.  If there is already a folio at
 * the index, it will be returned, no matter what its size.  If a folio
 * is freshly created, it may be of a different size than requested
 * due to alignment constraints, memory pressure, or the presence of
 * other folios at nearby indices.
 *
 * Return: The order hint, stored in the top 6 bits of an fgf_t, or 0 when
 * @size is zero or ilog2(@size) does not exceed PAGE_SHIFT.
 */
static inline fgf_t fgf_set_order(size_t size)
{
        unsigned int shift;

        /*
         * ilog2(0) is undefined.  Without this guard a zero size could wrap
         * "shift" to a huge unsigned value and encode a bogus large order.
         */
        if (!size)
                return 0;

        shift = ilog2(size);
        if (shift <= PAGE_SHIFT)
                return 0;
        return (__force fgf_t)((shift - PAGE_SHIFT) << 26);
}

void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp);
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp);

/**
 * filemap_get_folio - Find and get a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 *
 * Looks up the page cache entry at @mapping & @index.  If a folio is
 * present, it is returned with an increased refcount.
 *
 * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
 * this index.  Will not return a shadow, swap or DAX entry.
 */
static inline struct folio *filemap_get_folio(struct address_space *mapping,
                                        pgoff_t index)
{
        /* Plain lookup: no FGP flags and no gfp mask. */
        const fgf_t no_flags = 0;
        const gfp_t no_gfp = 0;

        return __filemap_get_folio(mapping, index, no_flags, no_gfp);
}

/**
 * filemap_lock_folio - Find and lock a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 *
 * Looks up the page cache entry at @mapping & @index.  If a folio is
 * present, it is returned locked with an increased refcount.
 *
 * Context: May sleep.
 * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
 * this index.  Will not return a shadow, swap or DAX entry.
 */
static inline struct folio *filemap_lock_folio(struct address_space *mapping,
                                        pgoff_t index)
{
        /* Lookup with FGP_LOCK only; no gfp mask is passed. */
        const fgf_t fgp_flags = FGP_LOCK;
        const gfp_t no_gfp = 0;

        return __filemap_get_folio(mapping, index, fgp_flags, no_gfp);
}

/**
 * filemap_grab_folio - grab a folio from the page cache
 * @mapping: The address space to search
 * @index: The page index
 *
 * Looks up the page cache entry at @mapping & @index. If no folio is found,
 * a new folio is created. The folio is locked, marked as accessed, and
 * returned.
 *
 * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found
 * and failed to create a folio.
 */
static inline struct folio *filemap_grab_folio(struct address_space *mapping,
                                        pgoff_t index)
{
        const fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;

        /* A folio created here is allocated with the mapping's own gfp mask. */
        return __filemap_get_folio(mapping, index, fgp_flags,
                                   mapping_gfp_mask(mapping));
}

/**
 * find_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Looks up the page cache slot at @mapping & @offset.  If there is a
 * page cache page, it is returned with an increased refcount.
 *
 * Otherwise, %NULL is returned.
 */
static inline struct page *find_get_page(struct address_space *mapping,
                                        pgoff_t offset)
{
        /* Plain lookup: no FGP flags and no gfp mask. */
        const fgf_t no_flags = 0;
        const gfp_t no_gfp = 0;

        return pagecache_get_page(mapping, offset, no_flags, no_gfp);
}

/* Like find_get_page(), but with caller-supplied FGP flags. */
static inline struct page *find_get_page_flags(struct address_space *mapping,
                                        pgoff_t offset, fgf_t fgp_flags)
{
        const gfp_t no_gfp = 0;

        return pagecache_get_page(mapping, offset, fgp_flags, no_gfp);
}

/**
 * find_lock_page - locate, pin and lock a pagecache page
 * @mapping: the address_space to search
 * @index: the page index
 *
 * Looks up the page cache entry at @mapping & @index.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * Context: May sleep.
 * Return: A struct page or %NULL if there is no page in the cache for this
 * index.
 */
static inline struct page *find_lock_page(struct address_space *mapping,
                                        pgoff_t index)
{
        /* Lookup with FGP_LOCK only; no gfp mask is passed. */
        const fgf_t fgp_flags = FGP_LOCK;
        const gfp_t no_gfp = 0;

        return pagecache_get_page(mapping, index, fgp_flags, no_gfp);
}

/**
 * find_or_create_page - locate or add a pagecache page
 * @mapping: the page's address_space
 * @index: the page's index into the mapping
 * @gfp_mask: page allocation mode
 *
 * Looks up the page cache slot at @mapping & @index.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * If the page is not present, a new page is allocated using @gfp_mask
 * and added to the page cache and the VM's LRU list.  The page is
 * returned locked and with an increased refcount.
 *
 * On memory exhaustion, %NULL is returned.
 *
 * find_or_create_page() may sleep, even if @gfp_mask specifies an
 * atomic allocation!
 */
static inline struct page *find_or_create_page(struct address_space *mapping,
                                        pgoff_t index, gfp_t gfp_mask)
{
        const fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;

        return pagecache_get_page(mapping, index, fgp_flags, gfp_mask);
}

/**
 * grab_cache_page_nowait - returns locked page at given index in given cache
 * @mapping: target address_space
 * @index: the page index
 *
 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 *
 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 * and deadlock against the caller's locked page.
 */
static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
                                pgoff_t index)
{
        /* FGP_NOFS and FGP_NOWAIT give the non-blocking, no-fs behaviour. */
        const fgf_t fgp_flags = FGP_LOCK | FGP_CREAT | FGP_NOFS | FGP_NOWAIT;

        return pagecache_get_page(mapping, index, fgp_flags,
                                  mapping_gfp_mask(mapping));
}

/* Swap cache index of @folio: __page_file_index() of the folio's ->page. */
#define swapcache_index(folio)        __page_file_index(&(folio)->page)

/**
 * folio_index - File index of a folio.
 * @folio: The folio.
 *
 * For a folio which is either in the page cache or the swap cache,
 * return its index within the address_space it belongs to.  If you know
 * the page is definitely in the page cache, you can look at the folio's
 * index directly.
 *
 * Return: The index (offset in units of pages) of a folio in its file.
 */
static inline pgoff_t folio_index(struct folio *folio)
{
        /* Common case: a page cache folio, whose ->index is used directly. */
        if (likely(!folio_test_swapcache(folio)))
                return folio->index;

        return swapcache_index(folio);
}

/**
 * folio_next_index - Get the index of the next folio.
 * @folio: The current folio.
 *
 * Return: The index of the folio which follows this folio in the file.
 */
static inline pgoff_t folio_next_index(struct folio *folio)
{
        pgoff_t first = folio->index;

        /* One past the last index covered by this folio. */
        return first + folio_nr_pages(folio);
}

/**
 * folio_file_page - The page for a particular index.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Sometimes after looking up a folio in the page cache, we need to
 * obtain the specific page for an index (eg a page fault).
 *
 * Return: The page containing the file data for this index.
 */
static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
{
        /* Masking with (nr_pages - 1) gives the page offset inside the folio. */
        pgoff_t within = index & (folio_nr_pages(folio) - 1);

        return folio_page(folio, within);
}

/**
 * folio_contains - Does this folio contain this index?
 * @folio: The folio.
 * @index: The page index within the file.
 *
 * Context: The caller should have the page locked in order to prevent
 * (eg) shmem from moving the page between the page cache and swap cache
 * and changing its index in the middle of the operation.
 * Return: true or false.
 */
static inline bool folio_contains(struct folio *folio, pgoff_t index)
{
        /*
         * Unsigned distance from the folio's first index.  An @index below
         * the folio wraps to a huge value and fails the comparison.
         */
        pgoff_t distance = index - folio_index(folio);

        return distance < folio_nr_pages(folio);
}

/*
 * Given the page we found in the page cache, return the page corresponding
 * to this index in the file
 */
static inline struct page *find_subpage(struct page *head, pgoff_t index)
{
        pgoff_t within;

        /* hugetlbfs callers always get the head page back. */
        if (PageHuge(head))
                return head;

        /* Offset of @index inside the compound page. */
        within = index & (thp_nr_pages(head) - 1);
        return head + within;
}

unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_contig(struct address_space *mapping,
                pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch);

struct page *grab_cache_page_write_begin(struct address_space *mapping,
                        pgoff_t index);

/*
 * Returns locked page at given index in given cache, creating it if needed.
 */
static inline struct page *grab_cache_page(struct address_space *mapping,
                                           pgoff_t index)
{
        /* Any allocation uses the mapping's own gfp mask. */
        gfp_t gfp = mapping_gfp_mask(mapping);

        return find_or_create_page(mapping, index, gfp);
}

struct folio *read_cache_folio(struct address_space *, pgoff_t index,
                filler_t *filler, struct file *file);
struct folio *mapping_read_folio_gfp(struct address_space *, pgoff_t index,
                gfp_t flags);
struct page *read_cache_page(struct address_space *, pgoff_t index,
                filler_t *filler, struct file *file);
extern struct page * read_cache_page_gfp(struct address_space *mapping,
                                pgoff_t index, gfp_t gfp_mask);

/* read_cache_page() with a NULL filler. */
static inline struct page *read_mapping_page(struct address_space *mapping,
                                pgoff_t index, struct file *file)
{
        filler_t *no_filler = NULL;

        return read_cache_page(mapping, index, no_filler, file);
}

/* read_cache_folio() with a NULL filler. */
static inline struct folio *read_mapping_folio(struct address_space *mapping,
                                pgoff_t index, struct file *file)
{
        filler_t *no_filler = NULL;

        return read_cache_folio(mapping, index, no_filler, file);
}

/*
 * Get the offset in PAGE_SIZE (even for hugetlb pages).
 */
static inline pgoff_t page_to_pgoff(struct page *page)
{
        struct page *head;

        if (likely(!PageTransTail(page)))
                return page->index;

        head = compound_head(page);
        /*
         *  We don't initialize ->index for tail pages: calculate based on
         *  head page
         */
        return head->index + page - head;
}

/*
 * Return byte-offset into filesystem object for page.
 */
static inline loff_t page_offset(struct page *page)
{
        return ((loff_t)page->index) << PAGE_SHIFT;
}

static inline loff_t page_file_offset(struct page *page)
{
        return ((loff_t)page_index(page)) << PAGE_SHIFT;
}

/**
 * folio_pos - Returns the byte position of this folio in its file.
 * @folio: The folio.
 */
static inline loff_t folio_pos(struct folio *folio)
{
        struct page *first = &folio->page;

        return page_offset(first);
}

/**
 * folio_file_pos - Returns the byte position of this folio in its file.
 * @folio: The folio.
 *
 * This differs from folio_pos() for folios which belong to a swap file.
 * NFS is the only filesystem today which needs to use folio_file_pos().
 */
static inline loff_t folio_file_pos(struct folio *folio)
{
        struct page *first = &folio->page;

        return page_file_offset(first);
}

/*
 * Get the offset in PAGE_SIZE (even for hugetlb folios).
 */
static inline pgoff_t folio_pgoff(struct folio *folio)
{
        pgoff_t pgoff = folio->index;

        return pgoff;
}

/*
 * Page offset corresponding to @address in @vma: the number of pages
 * between vma->vm_start and @address, plus vma->vm_pgoff.
 */
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
                                        unsigned long address)
{
        pgoff_t pages_into_vma = (address - vma->vm_start) >> PAGE_SHIFT;

        return pages_into_vma + vma->vm_pgoff;
}

/* Lookup key compared against each waiter by wake_page_match(). */
struct wait_page_key {
        /* Compared against the waiter's folio. */
        struct folio *folio;
        /* Compared against the waiter's bit_nr. */
        int bit_nr;
        /* Set to 1 by wake_page_match() once a waiter on the same folio is seen. */
        int page_match;
};

/* One waiter; matched against a struct wait_page_key by wake_page_match(). */
struct wait_page_queue {
        /* Folio this waiter is associated with. */
        struct folio *folio;
        /* Bit number this waiter is associated with. */
        int bit_nr;
        /* NOTE(review): presumably the entry linked on the wait queue -- confirm against users. */
        wait_queue_entry_t wait;
};

/*
 * Check whether @wait_page matches @key.  key->page_match is set as soon
 * as the folio matches, even if the bit number then differs.
 *
 * Returns true only when both folio and bit number match.
 */
static inline bool wake_page_match(struct wait_page_queue *wait_page,
                                  struct wait_page_key *key)
{
        if (wait_page->folio != key->folio)
                return false;

        /* Same folio: record that for the caller before checking the bit. */
        key->page_match = 1;

        return wait_page->bit_nr == key->bit_nr;
}

void __folio_lock(struct folio *folio);
int __folio_lock_killable(struct folio *folio);
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf);
void unlock_page(struct page *page);
void folio_unlock(struct folio *folio);

/**
 * folio_trylock() - Attempt to lock a folio.
 * @folio: The folio to attempt to lock.
 *
 * Sometimes it is undesirable to wait for a folio to be unlocked (eg
 * when the locks are being taken in the wrong order, or if making
 * progress through a batch of folios is more important than processing
 * them in order).  Usually folio_lock() is the correct function to call.
 *
 * Context: Any context.
 * Return: Whether the lock was successfully acquired.
 */
static inline bool folio_trylock(struct folio *folio)
{
        return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0)));
}

/*
 * Try to lock the folio containing @page.
 * Return true if the page was successfully locked.
 */
static inline bool trylock_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_trylock(folio);
}

/**
 * folio_lock() - Lock this folio.
 * @folio: The folio to lock.
 *
 * The folio lock guards a great many things, probably more than it should.
 * Its main job is to cover the period in which a folio is being brought
 * uptodate, whether from its backing file or from swap.  It is also held
 * while a folio is truncated from its address_space, which is why holding
 * it is enough to keep folio->mapping stable.
 *
 * write() takes the folio lock as well while it modifies the page, giving
 * POSIX atomicity guarantees as long as the write does not cross a page
 * boundary.  Other modifications of the folio's data (eg DMA and stores to
 * mapped pages) are not done under the folio lock and can race with writes.
 *
 * Context: May sleep.  When locking two or more folios of the same
 * address_space, take them in order of ascending index.  When the folios
 * belong to different address_spaces, first lock the folio whose
 * address_space has the lowest address in memory.
 */
static inline void folio_lock(struct folio *folio)
{
        might_sleep();

        /* Uncontended case: the trylock succeeds and we are done. */
        if (folio_trylock(folio))
                return;

        __folio_lock(folio);
}

/**
 * lock_page() - Lock the folio containing this page.
 * @page: The page to lock.
 *
 * What the lock protects is described at folio_lock().  This is a legacy
 * interface; new code should probably call folio_lock() instead.
 *
 * Context: May sleep.  All pages of a folio share one lock, so never try
 * to lock two pages that belong to the same folio.
 */
static inline void lock_page(struct page *page)
{
        struct folio *folio;

        might_sleep();
        folio = page_folio(page);

        if (folio_trylock(folio))
                return;

        __folio_lock(folio);
}

/**
 * folio_lock_killable() - Lock this folio, interruptible by a fatal signal.
 * @folio: The folio to lock.
 *
 * Behaves like folio_lock(), except that a fatal signal can interrupt the
 * sleep taken while waiting for the lock.
 *
 * Context: May sleep; see folio_lock().
 * Return: 0 if the lock was acquired; -EINTR if a fatal signal was received.
 */
static inline int folio_lock_killable(struct folio *folio)
{
        might_sleep();

        if (folio_trylock(folio))
                return 0;

        return __folio_lock_killable(folio);
}

/*
 * folio_lock_or_retry - Lock the folio, unless doing so would block and the
 * caller has indicated that it can cope with a retry.
 *
 * Returns 0 when the lock was taken without contention.  Otherwise the
 * return value and the mmap_lock implications depend on flags; see
 * __folio_lock_or_retry().
 */
static inline vm_fault_t folio_lock_or_retry(struct folio *folio,
                                             struct vm_fault *vmf)
{
        might_sleep();

        if (folio_trylock(folio))
                return 0;

        return __folio_lock_or_retry(folio, vmf);
}

/*
 * These are exported only for folio_wait_locked(), folio_wait_writeback()
 * and similar helpers, and should not be used directly.  In this header
 * they are called with PG_locked as @bit_nr by folio_wait_locked() and
 * folio_wait_locked_killable().
 */
void folio_wait_bit(struct folio *folio, int bit_nr);
int folio_wait_bit_killable(struct folio *folio, int bit_nr);

/*
 * Wait for a folio to be unlocked.
 *
 * The caller must be "holding" the folio, ie it must have raised the
 * folio reference count so that the folio cannot go away during the wait.
 */
static inline void folio_wait_locked(struct folio *folio)
{
        /* Nothing to wait for if the folio is not locked right now. */
        if (!folio_test_locked(folio))
                return;

        folio_wait_bit(folio, PG_locked);
}

/*
 * Killable variant of folio_wait_locked(): returns 0 straight away when the
 * folio is not locked, otherwise whatever folio_wait_bit_killable() returns.
 */
static inline int folio_wait_locked_killable(struct folio *folio)
{
        return folio_test_locked(folio) ?
                folio_wait_bit_killable(folio, PG_locked) : 0;
}

/* Page-based wrapper: wait for the folio containing @page to be unlocked. */
static inline void wait_on_page_locked(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_wait_locked(folio);
}

/*
 * Read-completion, writeback, stable-page and dirty-tracking helpers.
 * Only the prototypes appear here.  __folio_cancel_dirty() is the call made
 * by the folio_cancel_dirty() inline wrapper that follows, once that
 * wrapper has seen the folio is dirty.
 */
void folio_end_read(struct folio *folio, bool success);
void wait_on_page_writeback(struct page *page);
void folio_wait_writeback(struct folio *folio);
int folio_wait_writeback_killable(struct folio *folio);
void end_page_writeback(struct page *page);
void folio_end_writeback(struct folio *folio);
void wait_for_stable_page(struct page *page);
void folio_wait_stable(struct folio *folio);
void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn);
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb);
void __folio_cancel_dirty(struct folio *folio);
/*
 * Cancel the dirty state of @folio.  The out-of-line worker is only called
 * for folios that currently test dirty.
 */
static inline void folio_cancel_dirty(struct folio *folio)
{
        /* Avoid atomic ops, locking, etc. when not actually needed. */
        if (!folio_test_dirty(folio))
                return;

        __folio_cancel_dirty(folio);
}
/* Dirty-state and invalidation helpers; prototypes only. */
bool folio_clear_dirty_for_io(struct folio *folio);
bool clear_page_dirty_for_io(struct page *page);
void folio_invalidate(struct folio *folio, size_t offset, size_t length);
bool noop_dirty_folio(struct address_space *mapping, struct folio *folio);

/*
 * Without CONFIG_MIGRATION the name filemap_migrate_folio expands to NULL
 * instead of naming a function -- presumably so it can still be used where
 * a function pointer is expected; confirm against the users.
 */
#ifdef CONFIG_MIGRATION
int filemap_migrate_folio(struct address_space *mapping, struct folio *dst,
                struct folio *src, enum migrate_mode mode);
#else
#define filemap_migrate_folio NULL
#endif
void folio_end_private_2(struct folio *folio);
void folio_wait_private_2(struct folio *folio);
int folio_wait_private_2_killable(struct folio *folio);

/*
 * Add an arbitrary waiter to a folio's wait queue.
 */
void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter);

/*
 * Fault in a userspace address range.
 */
size_t fault_in_writeable(char __user *uaddr, size_t size);
size_t fault_in_subpage_writeable(char __user *uaddr, size_t size);
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
size_t fault_in_readable(const char __user *uaddr, size_t size);

/* Page cache insertion, removal and replacement; prototypes only. */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                pgoff_t index, gfp_t gfp);
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
                pgoff_t index, gfp_t gfp);
void filemap_remove_folio(struct folio *folio);
void __filemap_remove_folio(struct folio *folio, void *shadow);
void replace_page_cache_folio(struct folio *old, struct folio *new);
void delete_from_page_cache_batch(struct address_space *mapping,
                                  struct folio_batch *fbatch);
bool filemap_release_folio(struct folio *folio, gfp_t gfp);
loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
                int whence);

/* Must be non-static for BPF error injection */
int __filemap_add_folio(struct address_space *mapping, struct folio *folio,
                pgoff_t index, gfp_t gfp, void **shadowp);

/*
 * Out-of-line range check; filemap_range_needs_writeback() below calls it
 * after its own cheaper checks on the mapping.
 */
bool filemap_range_has_writeback(struct address_space *mapping,
                                 loff_t start_byte, loff_t end_byte);

/**
 * filemap_range_needs_writeback - check if range potentially needs writeback
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Look for at least one page in the supplied range; typically used to find
 * out whether writing directly into this range will trigger a writeback.
 * O_DIRECT read/write with IOCB_NOWAIT uses it to learn whether the caller
 * has to do filemap_write_and_wait_range() before proceeding.
 *
 * Return: %true if the caller should do filemap_write_and_wait_range() before
 * doing O_DIRECT to a page in this range, %false otherwise.
 */
static inline bool filemap_range_needs_writeback(struct address_space *mapping,
                                                 loff_t start_byte,
                                                 loff_t end_byte)
{
        /* An empty mapping has nothing to write back. */
        if (mapping->nrpages == 0)
                return false;

        /* Only walk the range when the mapping has dirty or writeback tags. */
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
            mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
                return filemap_range_has_writeback(mapping, start_byte,
                                                   end_byte);

        return false;
}

/**
 * struct readahead_control - Describes a readahead request.
 *
 * A readahead request is for consecutive pages.  Filesystems which
 * implement the ->readahead method should call readahead_page() or
 * readahead_page_batch() in a loop and attempt to start I/O against
 * each page in the request.
 *
 * Most of the fields in this struct are private and should be accessed
 * by the functions below.
 *
 * @file: The file, used primarily by network filesystems for authentication.
 *          May be NULL if invoked internally by the filesystem.
 * @mapping: Readahead this filesystem object.
 * @ra: File readahead state.  May be NULL.
 */
struct readahead_control {
        struct file *file;
        struct address_space *mapping;
        struct file_ra_state *ra;
/* private: use the readahead_* accessors instead */
        /* Index of the first page still in the request; see readahead_index(). */
        pgoff_t _index;
        /* Pages still in the request; see readahead_count(). */
        unsigned int _nr_pages;
        /*
         * Pages handed out by the latest __readahead_folio() or
         * __readahead_batch() call; the next call subtracts it from
         * _nr_pages and adds it to _index.
         */
        unsigned int _batch_count;
        /* NOTE(review): _workingset and _pflags are not used in this header section. */
        bool _workingset;
        unsigned long _pflags;
};

/*
 * Declare a struct readahead_control named @ractl with .file = @f,
 * .ra = @r, .mapping = @m and ._index = @i.  Note the argument order
 * (file, ra, mapping, index).  Fields not named in the initializer are
 * zero-initialized.
 */
#define DEFINE_READAHEAD(ractl, f, r, m, i)                                \
        struct readahead_control ractl = {                                \
                .file = f,                                                \
                .mapping = m,                                                \
                .ra = r,                                                \
                ._index = i,                                                \
        }

/* 128KiB expressed as a number of PAGE_SIZE pages. */
#define VM_READAHEAD_PAGES        (SZ_128K / PAGE_SIZE)

/*
 * Out-of-line readahead entry points.  page_cache_sync_ra() and
 * page_cache_async_ra() are what the page_cache_sync_readahead() and
 * page_cache_async_readahead() inline wrappers below call.
 */
void page_cache_ra_unbounded(struct readahead_control *,
                unsigned long nr_to_read, unsigned long lookahead_count);
void page_cache_sync_ra(struct readahead_control *, unsigned long req_count);
void page_cache_async_ra(struct readahead_control *, struct folio *,
                unsigned long req_count);
void readahead_expand(struct readahead_control *ractl,
                      loff_t new_start, size_t new_len);

/**
 * page_cache_sync_readahead - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @file: Used by the filesystem for authentication.
 * @index: Index of first page to be read.
 * @req_count: Total number of pages being read by the caller.
 *
 * Call page_cache_sync_readahead() after a cache miss: it submits the
 * read.  If the access pattern suggests it will help performance, the
 * readahead logic may piggyback additional pages onto the request.
 */
static inline
void page_cache_sync_readahead(struct address_space *mapping,
                struct file_ra_state *ra, struct file *file, pgoff_t index,
                unsigned long req_count)
{
        /* Bundle the arguments into an on-stack request descriptor. */
        DEFINE_READAHEAD(rac, file, ra, mapping, index);

        page_cache_sync_ra(&rac, req_count);
}

/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @file: Used by the filesystem for authentication.
 * @folio: The folio at @index which triggered the readahead call.
 * @index: Index of first page to be read.
 * @req_count: Total number of pages being read by the caller.
 *
 * Call page_cache_async_readahead() when a page marked PageReadahead is
 * used.  The mark is a hint that the application has consumed enough of
 * the readahead window that more pages should start being pulled in.
 */
static inline
void page_cache_async_readahead(struct address_space *mapping,
                struct file_ra_state *ra, struct file *file,
                struct folio *folio, pgoff_t index, unsigned long req_count)
{
        /* Bundle the arguments into an on-stack request descriptor. */
        DEFINE_READAHEAD(rac, file, ra, mapping, index);

        page_cache_async_ra(&rac, folio, req_count);
}

/*
 * Advance @ractl past the batch handed out by the previous call and return
 * the folio found at the new index, or NULL when no pages remain.  The
 * returned folio's page count becomes the new batch size.
 */
static inline struct folio *__readahead_folio(struct readahead_control *ractl)
{
        unsigned int consumed;
        struct folio *folio;

        BUG_ON(ractl->_batch_count > ractl->_nr_pages);

        /* Retire the pages returned last time round. */
        consumed = ractl->_batch_count;
        ractl->_nr_pages -= consumed;
        ractl->_index += consumed;

        if (ractl->_nr_pages == 0) {
                ractl->_batch_count = 0;
                return NULL;
        }

        folio = xa_load(&ractl->mapping->i_pages, ractl->_index);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        ractl->_batch_count = folio_nr_pages(folio);

        return folio;
}

/**
 * readahead_page - Get the next page to read.
 * @ractl: The current readahead request.
 *
 * Context: The page is locked and has an elevated refcount.  The caller
 * should decreases the refcount once the page has been submitted for I/O
 * and unlock the page once all I/O to that page has completed.
 * Return: A pointer to the next page, or %NULL if we are done.
 */
static inline struct page *readahead_page(struct readahead_control *ractl)
{
        struct folio *folio = __readahead_folio(ractl);

        return &folio->page;
}

/**
 * readahead_folio - Get the next folio to read.
 * @ractl: The current readahead request.
 *
 * Context: The folio is locked.  The caller should unlock the folio once
 * all I/O to that folio has completed.
 * Return: A pointer to the next folio, or %NULL if we are done.
 */
static inline struct folio *readahead_folio(struct readahead_control *ractl)
{
        struct folio *folio = __readahead_folio(ractl);

        if (folio)
                folio_put(folio);
        return folio;
}

/*
 * Fill @array with up to @array_sz pages from the readahead request @rac
 * and return how many were stored.  Use readahead_page_batch() rather than
 * calling this directly.
 *
 * The pages accounted in the previous batch are first retired from the
 * request, then the page cache is walked from the new rac->_index to the
 * last index of the request.  rac->_batch_count accumulates
 * thp_nr_pages() of every page stored, so that the next call can retire
 * them in turn.
 */
static inline unsigned int __readahead_batch(struct readahead_control *rac,
                struct page **array, unsigned int array_sz)
{
        unsigned int i = 0;
        XA_STATE(xas, &rac->mapping->i_pages, 0);
        struct page *page;

        /* Retire the previous batch before starting a new one. */
        BUG_ON(rac->_batch_count > rac->_nr_pages);
        rac->_nr_pages -= rac->_batch_count;
        rac->_index += rac->_batch_count;
        rac->_batch_count = 0;

        xas_set(&xas, rac->_index);
        rcu_read_lock();
        xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) {
                /* Skip entries that xas_retry() says must be looked up again. */
                if (xas_retry(&xas, page))
                        continue;
                /* Every page in the request is expected to be locked and not a tail page. */
                VM_BUG_ON_PAGE(!PageLocked(page), page);
                VM_BUG_ON_PAGE(PageTail(page), page);
                array[i++] = page;
                rac->_batch_count += thp_nr_pages(page);
                /* Stop once the caller's array is full. */
                if (i == array_sz)
                        break;
        }
        rcu_read_unlock();

        return i;
}

/**
 * readahead_page_batch - Get a batch of pages to read.
 * @rac: The current readahead request.
 * @array: An array of pointers to struct page.  Its capacity is taken with
 *         ARRAY_SIZE(), so it must be a real array, not a pointer.
 *
 * Context: The pages are locked and have an elevated refcount.  The caller
 * should decrease the refcount once the page has been submitted for I/O
 * and unlock the page once all I/O to that page has completed.
 * Return: The number of pages placed in the array.  0 indicates the request
 * is complete.
 */
#define readahead_page_batch(rac, array)                                \
        __readahead_batch(rac, array, ARRAY_SIZE(array))

/**
 * readahead_pos - The byte offset into the file of this readahead request.
 * @rac: The readahead request.
 */
static inline loff_t readahead_pos(struct readahead_control *rac)
{
        return (loff_t)rac->_index * PAGE_SIZE;
}

/**
 * readahead_length - The number of bytes in this readahead request.
 * @rac: The readahead request.
 */
static inline size_t readahead_length(struct readahead_control *rac)
{
        unsigned int nr_pages = rac->_nr_pages;

        return nr_pages * PAGE_SIZE;
}

/**
 * readahead_index - The index of the first page in this readahead request.
 * @rac: The readahead request.
 */
static inline pgoff_t readahead_index(struct readahead_control *rac)
{
        pgoff_t first_index = rac->_index;

        return first_index;
}

/**
 * readahead_count - The number of pages in this readahead request.
 * @rac: The readahead request.
 */
static inline unsigned int readahead_count(struct readahead_control *rac)
{
        unsigned int nr_pages = rac->_nr_pages;

        return nr_pages;
}

/**
 * readahead_batch_length - The number of bytes in the current batch.
 * @rac: The readahead request.
 */
static inline size_t readahead_batch_length(struct readahead_control *rac)
{
        unsigned int batch_pages = rac->_batch_count;

        return batch_pages * PAGE_SIZE;
}

/*
 * Number of PAGE_SIZE pages needed to hold inode->i_size bytes, rounded up.
 */
static inline unsigned long dir_pages(struct inode *inode)
{
        unsigned long rounded_up = (unsigned long)(inode->i_size + PAGE_SIZE - 1);

        return rounded_up >> PAGE_SHIFT;
}

/**
 * folio_mkwrite_check_truncate - check if folio was truncated
 * @folio: the folio to check
 * @inode: the inode to check the folio against
 *
 * Return: the number of bytes in the folio up to EOF,
 * or -EFAULT if the folio was truncated.
 */
static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio,
                                              struct inode *inode)
{
        loff_t size = i_size_read(inode);
        pgoff_t index = size >> PAGE_SHIFT;
        size_t offset = offset_in_folio(folio, size);

        if (!folio->mapping)
                return -EFAULT;

        /* folio is wholly inside EOF */
        if (folio_next_index(folio) - 1 < index)
                return folio_size(folio);
        /* folio is wholly past EOF */
        if (folio->index > index || !offset)
                return -EFAULT;
        /* folio is partially inside EOF */
        return offset;
}

/**
 * page_mkwrite_check_truncate - check if page was truncated
 * @page: the page to check
 * @inode: the inode to check the page against
 *
 * Returns the number of bytes in the page up to EOF,
 * or -EFAULT if the page was truncated.
 */
static inline int page_mkwrite_check_truncate(struct page *page,
                                              struct inode *inode)
{
        loff_t eof = i_size_read(inode);
        pgoff_t eof_index = eof >> PAGE_SHIFT;
        int eof_offset = offset_in_page(eof);

        /* The page must still belong to this inode's mapping. */
        if (page->mapping != inode->i_mapping)
                return -EFAULT;

        /* page is wholly inside EOF */
        if (page->index < eof_index)
                return PAGE_SIZE;

        /* page is wholly past EOF */
        if (page->index > eof_index || eof_offset == 0)
                return -EFAULT;

        /* page is partially inside EOF */
        return eof_offset;
}

/**
 * i_blocks_per_folio - How many blocks fit in this folio.
 * @inode: The inode which contains the blocks.
 * @folio: The folio.
 *
 * Returns zero when the block size is larger than the size of this folio.
 *
 * Context: The caller should hold a refcount on the folio to prevent it
 * from being split.
 * Return: The number of filesystem blocks covered by this folio.
 */
static inline
unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio)
{
        unsigned int blkbits = inode->i_blkbits;

        /* Folio size in bytes divided by the block size (1 << blkbits). */
        return folio_size(folio) >> blkbits;
}

/* Page-based wrapper around i_blocks_per_folio() for the folio containing @page. */
static inline
unsigned int i_blocks_per_page(struct inode *inode, struct page *page)
{
        struct folio *folio = page_folio(page);

        return i_blocks_per_folio(inode, folio);
}
#endif /* _LINUX_PAGEMAP_H */












































    4 



    4 





    4 





    1 





    1 
















    6 




    6 





    5 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/bitmap.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 */

#include <linux/buffer_head.h>
#include "ext4.h"

/*
 * Total number of bits in the first @numchars bytes of @bitmap, minus
 * memweight() of that same buffer.
 */
unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
{
        unsigned int total_bits = numchars * BITS_PER_BYTE;

        return total_bits - memweight(bitmap, numchars);
}

/*
 * Check the inode bitmap checksum stored in @gdp against one computed over
 * the first @sz bytes of @bh.
 *
 * Returns 1 when metadata checksums are not enabled or the checksums agree,
 * 0 on mismatch.  Only the low 16 bits are compared when the descriptor is
 * too small to hold the high half.
 */
int ext4_inode_bitmap_csum_verify(struct super_block *sb,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh, int sz)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        __u32 stored, computed;

        if (!ext4_has_metadata_csum(sb))
                return 1;

        stored = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
        computed = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);

        if (sbi->s_desc_size < EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
                /* No high half on disk: compare the low 16 bits only. */
                computed &= 0xFFFF;
        } else {
                __u32 stored_hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);

                stored |= (stored_hi << 16);
        }

        return stored == computed;
}

/*
 * Compute the checksum over the first @sz bytes of the inode bitmap in @bh
 * and store it in @gdp.  Does nothing when metadata checksums are not
 * enabled.  The high 16 bits are stored only if the descriptor is large
 * enough to hold them.
 */
void ext4_inode_bitmap_csum_set(struct super_block *sb,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh, int sz)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        __u32 csum;

        if (!ext4_has_metadata_csum(sb))
                return;

        csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
        gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);

        if (sbi->s_desc_size < EXT4_BG_INODE_BITMAP_CSUM_HI_END)
                return;

        gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
}

/*
 * Check the block bitmap checksum stored in @gdp against one computed over
 * the first EXT4_CLUSTERS_PER_GROUP(sb) / 8 bytes of @bh.
 *
 * Returns 1 when metadata checksums are not enabled or the checksums agree,
 * 0 on mismatch.  Only the low 16 bits are compared when the descriptor is
 * too small to hold the high half.
 */
int ext4_block_bitmap_csum_verify(struct super_block *sb,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
        __u32 stored, computed;

        if (!ext4_has_metadata_csum(sb))
                return 1;

        stored = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
        computed = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);

        if (sbi->s_desc_size < EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
                /* No high half on disk: compare the low 16 bits only. */
                computed &= 0xFFFF;
        } else {
                __u32 stored_hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);

                stored |= (stored_hi << 16);
        }

        return stored == computed;
}

/*
 * Compute the checksum over the first EXT4_CLUSTERS_PER_GROUP(sb) / 8 bytes
 * of the block bitmap in @bh and store it in @gdp.  Does nothing when
 * metadata checksums are not enabled.  The high 16 bits are stored only if
 * the descriptor is large enough to hold them.
 */
void ext4_block_bitmap_csum_set(struct super_block *sb,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh)
{
        int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        __u32 csum;

        if (!ext4_has_metadata_csum(sb))
                return;

        csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
        gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);

        if (sbi->s_desc_size < EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
                return;

        gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
}






















    2 














    2 































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
/*
 *  linux/fs/hfs/dir.c
 *
 * Copyright (C) 1995-1997  Paul H. Hargrove
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 * This file may be distributed under the terms of the GNU General Public License.
 *
 * This file contains directory-related functions independent of which
 * scheme is being used to represent forks.
 *
 * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds
 */

#include "hfs_fs.h"
#include "btree.h"

/*
 * hfs_lookup()
 *
 * Look @dentry's name up in the catalog tree under directory @dir and
 * splice the resulting inode into the dcache.  A missing record (-ENOENT)
 * leaves the inode NULL; any other read error, or a failed hfs_iget()
 * (-EACCES), is passed to d_splice_alias() as an ERR_PTR.
 */
static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
                                 unsigned int flags)
{
        struct hfs_find_data fd;
        struct inode *inode = NULL;
        hfs_cat_rec rec;
        int res;

        res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
        if (res)
                return ERR_PTR(res);

        hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
        res = hfs_brec_read(&fd, &rec, sizeof(rec));
        if (!res) {
                inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec);
                if (!inode)
                        inode = ERR_PTR(-EACCES);
        } else if (res != -ENOENT) {
                inode = ERR_PTR(res);
        }

        hfs_find_exit(&fd);
        return d_splice_alias(inode, dentry);
}

/*
 * hfs_readdir
 *
 * Emit the entries of the directory behind @file into @ctx, starting at
 * ctx->pos.  Position 0 is reported as "." and position 1 as ".." (using
 * the ParID of the record found for this directory, which must be a
 * folder thread record).  For later positions hfs_brec_goto() is used to
 * step through the catalog records, emitting one entry per record.
 *
 * Returns 0 on success (including when there is nothing left to emit or
 * the dir_context stops accepting entries), or a negative errno: the error
 * from hfs_find_init(), hfs_brec_find() or hfs_brec_goto(), -EIO for a
 * malformed record, or -ENOMEM if the per-open-file readdir state cannot
 * be allocated.
 */
static int hfs_readdir(struct file *file, struct dir_context *ctx)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        int len, err;
        char strbuf[HFS_MAX_NAMELEN];
        union hfs_cat_rec entry;
        struct hfs_find_data fd;
        struct hfs_readdir_data *rd;
        u16 type;

        /* i_size is compared with the position, so it acts as an entry count here. */
        if (ctx->pos >= inode->i_size)
                return 0;

        err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
        if (err)
                return err;
        /* Build a key from this directory's own id with no name and look it up. */
        hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
        err = hfs_brec_find(&fd);
        if (err)
                goto out;

        if (ctx->pos == 0) {
                /* This is completely artificial... */
                if (!dir_emit_dot(file, ctx))
                        goto out;
                ctx->pos = 1;
        }
        if (ctx->pos == 1) {
                /* Reject record lengths that do not fit the on-stack buffer. */
                if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
                        err = -EIO;
                        goto out;
                }

                hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
                if (entry.type != HFS_CDR_THD) {
                        pr_err("bad catalog folder thread\n");
                        err = -EIO;
                        goto out;
                }
                /*
                 * NOTE(review): the minimum-length check below is disabled,
                 * so entry.thread.ParID is read without verifying that
                 * fd.entrylength covers it.
                 */
                //if (fd.entrylength < HFS_MIN_THREAD_SZ) {
                //        pr_err("truncated catalog thread\n");
                //        err = -EIO;
                //        goto out;
                //}
                if (!dir_emit(ctx, "..", 2,
                            be32_to_cpu(entry.thread.ParID), DT_DIR))
                        goto out;
                ctx->pos = 2;
        }
        if (ctx->pos >= inode->i_size)
                goto out;
        /* Step from the record found above to the one for ctx->pos. */
        err = hfs_brec_goto(&fd, ctx->pos - 1);
        if (err)
                goto out;

        for (;;) {
                /* A record with a different parent id means we left this directory. */
                if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) {
                        pr_err("walked past end of dir\n");
                        err = -EIO;
                        goto out;
                }

                if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
                        err = -EIO;
                        goto out;
                }

                hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
                type = entry.type;
                /* Convert the on-disk name from the record key into strbuf. */
                len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName);
                if (type == HFS_CDR_DIR) {
                        if (fd.entrylength < sizeof(struct hfs_cat_dir)) {
                                pr_err("small dir entry\n");
                                err = -EIO;
                                goto out;
                        }
                        /* A false return from dir_emit() ends the walk; position is saved below. */
                        if (!dir_emit(ctx, strbuf, len,
                                    be32_to_cpu(entry.dir.DirID), DT_DIR))
                                break;
                } else if (type == HFS_CDR_FIL) {
                        if (fd.entrylength < sizeof(struct hfs_cat_file)) {
                                pr_err("small file entry\n");
                                err = -EIO;
                                goto out;
                        }
                        if (!dir_emit(ctx, strbuf, len,
                                    be32_to_cpu(entry.file.FlNum), DT_REG))
                                break;
                } else {
                        pr_err("bad catalog entry type %d\n", type);
                        err = -EIO;
                        goto out;
                }
                ctx->pos++;
                if (ctx->pos >= inode->i_size)
                        goto out;
                err = hfs_brec_goto(&fd, 1);
                if (err)
                        goto out;
        }
        /*
         * Reached only via the breaks above.  Allocate the per-open-file
         * state on first use and link it into the inode's open_dir_list.
         */
        rd = file->private_data;
        if (!rd) {
                rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL);
                if (!rd) {
                        err = -ENOMEM;
                        goto out;
                }
                file->private_data = rd;
                rd->file = file;
                spin_lock(&HFS_I(inode)->open_dir_lock);
                list_add(&rd->list, &HFS_I(inode)->open_dir_list);
                spin_unlock(&HFS_I(inode)->open_dir_lock);
        }
        /*
         * Can be done after the list insertion; exclusion with
         * hfs_delete_cat() is provided by directory lock.
         */
        memcpy(&rd->key, &fd.key->cat, sizeof(struct hfs_cat_key));
out:
        /* Common exit: release the find data on every path past hfs_find_init(). */
        hfs_find_exit(&fd);
        return err;
}

/*
 * Release the readdir state attached to @file, if any: unlink it from the
 * inode's open_dir_list under open_dir_lock and free it.  Always returns 0.
 */
static int hfs_dir_release(struct inode *inode, struct file *file)
{
        struct hfs_readdir_data *rd = file->private_data;

        if (!rd)
                return 0;

        spin_lock(&HFS_I(inode)->open_dir_lock);
        list_del(&rd->list);
        spin_unlock(&HFS_I(inode)->open_dir_lock);
        kfree(rd);

        return 0;
}

/*
 * hfs_create()
 *
 * The create() entry in the inode_operations structure for regular HFS
 * directories.  Given the directory inode and the name (and its length)
 * of the new file, create that file in the directory and attach the
 * resulting inode to @dentry.
 *
 * Returns 0 on success, -ENOMEM if no inode could be obtained, or the
 * error from hfs_cat_create(), in which case the new inode is torn down
 * again.
 */
static int hfs_create(struct mnt_idmap *idmap, struct inode *dir,
                      struct dentry *dentry, umode_t mode, bool excl)
{
        struct inode *inode;
        int res;

        inode = hfs_new_inode(dir, &dentry->d_name, mode);
        if (!inode)
                return -ENOMEM;

        res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
        if (!res) {
                d_instantiate(dentry, inode);
                mark_inode_dirty(inode);
                return 0;
        }

        /* Catalog insertion failed: undo the inode creation. */
        clear_nlink(inode);
        hfs_delete_inode(inode);
        iput(inode);
        return res;
}

/*
 * hfs_mkdir()
 *
 * The mkdir() entry in the inode_operations structure for regular HFS
 * directories.  Given the inode of the parent directory and the name (and
 * its length) of the new directory, create that directory in the parent.
 *
 * Returns 0 on success, -ENOMEM if no inode could be obtained, or the
 * error from hfs_cat_create(), in which case the new inode is torn down
 * again.
 */
static int hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                     struct dentry *dentry, umode_t mode)
{
        struct inode *inode;
        int res;

        inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode);
        if (!inode)
                return -ENOMEM;

        res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
        if (!res) {
                d_instantiate(dentry, inode);
                mark_inode_dirty(inode);
                return 0;
        }

        /* Catalog insertion failed: undo the inode creation. */
        clear_nlink(inode);
        hfs_delete_inode(inode);
        iput(inode);
        return res;
}

/*
 * hfs_remove()
 *
 * This serves as both unlink() and rmdir() in the inode_operations
 * structure for regular HFS directories.  It deletes an existing
 * child, given the inode for the parent directory and the dentry
 * (name and length) of the existing file or directory.
 *
 * HFS does not have hardlinks, so both rmdir and unlink set the
 * link count to 0.  The only difference is the emptiness check:
 * a directory whose i_size is anything other than 2 is refused with
 * -ENOTEMPTY.
 *
 * Returns 0 on success or the error reported by hfs_cat_delete().
 */
static int hfs_remove(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_inode(dentry);
        int err;

        if (S_ISDIR(victim->i_mode) && victim->i_size != 2)
                return -ENOTEMPTY;

        err = hfs_cat_delete(victim->i_ino, dir, &dentry->d_name);
        if (!err) {
                /* Catalog entry is gone; retire the in-core inode too. */
                clear_nlink(victim);
                inode_set_ctime_current(victim);
                hfs_delete_inode(victim);
                mark_inode_dirty(victim);
        }
        return err;
}

/*
 * hfs_rename()
 *
 * This is the rename() entry in the inode_operations structure for
 * regular HFS directories.  It moves the file or directory behind
 * 'old_dentry' in 'old_dir' to the name of 'new_dentry' in 'new_dir'.
 *
 * Any flag other than RENAME_NOREPLACE is rejected with -EINVAL.  If
 * the destination dentry is positive, it is removed first through
 * hfs_remove().  Once the catalog move has succeeded, the cat_key
 * kept in the moved inode's HFS info is rebuilt for the new parent
 * and name.
 *
 * Returns 0 on success or the first error encountered.
 * XXX: how do you handle must_be dir?
 */
static int hfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
                      struct dentry *old_dentry, struct inode *new_dir,
                      struct dentry *new_dentry, unsigned int flags)
{
        struct inode *moved = d_inode(old_dentry);
        int res;

        if (flags & ~RENAME_NOREPLACE)
                return -EINVAL;

        /* Unlink destination if it already exists */
        if (d_really_is_positive(new_dentry)) {
                res = hfs_remove(new_dir, new_dentry);
                if (res)
                        return res;
        }

        res = hfs_cat_move(moved->i_ino,
                           old_dir, &old_dentry->d_name,
                           new_dir, &new_dentry->d_name);
        if (res)
                return res;

        /* Keep the cached catalog key in step with the new location. */
        hfs_cat_build_key(old_dir->i_sb,
                          (btree_key *)&HFS_I(moved)->cat_key,
                          new_dir->i_ino, &new_dentry->d_name);
        return 0;
}

/* File operations installed for HFS directories. */
const struct file_operations hfs_dir_operations = {
        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .iterate_shared = hfs_readdir,
        .release        = hfs_dir_release,
};

/*
 * Inode operations installed for HFS directories.  unlink() and rmdir()
 * are both served by hfs_remove().
 */
const struct inode_operations hfs_dir_inode_operations = {
        .lookup         = hfs_lookup,
        .create         = hfs_create,
        .mkdir          = hfs_mkdir,
        .unlink         = hfs_remove,
        .rmdir          = hfs_remove,
        .rename         = hfs_rename,
        .setattr        = hfs_inode_setattr,
};


















































































































    1 




    1 






    1 










    1 



















    1 




























    4 






























    2 








    2 


















































    1 










    1 


    1 


    1 








    1 


    1 




































































































































































    1 














    1 






    1 




    1 











































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "block-rsv.h"
#include "space-info.h"
#include "transaction.h"
#include "block-group.h"
#include "fs.h"
#include "accessors.h"

/*
 * HOW DO BLOCK RESERVES WORK
 *
 *   Think of block_rsv's as buckets for logically grouped metadata
 *   reservations.  Each block_rsv has a ->size and a ->reserved.  ->size is
 *   how large we want our block rsv to be, ->reserved is how much space is
 *   currently reserved for this block reserve.
 *
 *   ->failfast exists for the truncate case, and is described below.
 *
 * NORMAL OPERATION
 *
 *   -> Reserve
 *     Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill
 *
 *     We call into btrfs_reserve_metadata_bytes() with our bytes, which is
 *     accounted for in space_info->bytes_may_use, and then add the bytes to
 *     ->reserved, and ->size in the case of btrfs_block_rsv_add.
 *
 *     ->size is an over-estimation of how much we may use for a particular
 *     operation.
 *
 *   -> Use
 *     Entrance: btrfs_use_block_rsv
 *
 *     When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv()
 *     to determine the appropriate block_rsv to use, and then verify that
 *     ->reserved has enough space for our tree block allocation.  Once
 *     successful we subtract fs_info->nodesize from ->reserved.
 *
 *   -> Finish
 *     Entrance: btrfs_block_rsv_release
 *
 *     We are finished with our operation, subtract our individual reservation
 *     from ->size, and then subtract ->size from ->reserved and free up the
 *     excess if there is any.
 *
 *     There is some logic here to refill the delayed refs rsv or the global rsv
 *     as needed, otherwise the excess is subtracted from
 *     space_info->bytes_may_use.
 *
 * TYPES OF BLOCK RESERVES
 *
 * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK
 *   These behave normally, as described above, just within the confines of the
 *   lifetime of their particular operation (transaction for the whole trans
 *   handle lifetime, for example).
 *
 * BLOCK_RSV_GLOBAL
 *   It is impossible to properly account for all the space that may be required
 *   to make our extent tree updates.  This block reserve acts as an overflow
 *   buffer in case our delayed refs reserve does not reserve enough space to
 *   update the extent tree.
 *
 *   We can steal from this in some cases as well, notably on evict() or
 *   truncate() in order to help users recover from ENOSPC conditions.
 *
 * BLOCK_RSV_DELALLOC
 *   The individual item sizes are determined by the per-inode size
 *   calculations, which are described with the delalloc code.  This is pretty
 *   straightforward; it's just that the calculation of ->size encodes a lot
 *   of different items, and thus it gets used when updating inodes, inserting
 *   file extents, and inserting checksums.
 *
 * BLOCK_RSV_DELREFS
 *   We keep a running tally of how many delayed refs we have on the system.
 *   We assume each one of these delayed refs is going to use a full
 *   reservation.  We use the transaction items and pre-reserve space for every
 *   operation, and use this reservation to refill any gap between ->size and
 *   ->reserved that may exist.
 *
 *   From there it's straightforward, removing a delayed ref means we remove its
 *   count from ->size and free up reservations as necessary.  Since this is
 *   the most dynamic block reserve in the system, we will try to refill this
 *   block reserve first with any excess returned by any other block reserve.
 *
 * BLOCK_RSV_EMPTY
 *   This is the fallback block reserve to make us try to reserve space if we
 *   don't have a specific bucket for this allocation.  It is mostly used for
 *   updating the device tree and such, since that is a separate pool we're
 *   content to just reserve space from the space_info on demand.
 *
 * BLOCK_RSV_TEMP
 *   This is used by things like truncate and iput.  We will temporarily
 *   allocate a block reserve, set it to some size, and then truncate bytes
 *   until we have no space left.  With ->failfast set we'll simply return
 *   ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try
 *   to make a new reservation.  This is because these operations are
 *   unbounded, so we want to do as much work as we can, and then back off and
 *   re-reserve.
 */

/*
 * Shrink the target size of @block_rsv by @num_bytes and give back whatever
 * is then reserved beyond that target.
 *
 * @fs_info:               passed through to
 *                         btrfs_space_info_free_bytes_may_use()
 * @block_rsv:             the reserve being released from
 * @dest:                  optional reserve that gets first claim on the
 *                         excess; may be NULL
 * @num_bytes:             amount to subtract from ->size; (u64)-1 means all
 *                         of ->size
 * @qgroup_to_release_ret: optional out parameter; may be NULL.  When given,
 *                         it receives the amount by which
 *                         ->qgroup_rsv_reserved exceeded ->qgroup_rsv_size
 *                         (0 if it did not).
 *
 * Returns the number of bytes removed from @block_rsv->reserved, i.e. the
 * excess before it is split between @dest and the space_info.
 */
static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_block_rsv *block_rsv,
                                    struct btrfs_block_rsv *dest, u64 num_bytes,
                                    u64 *qgroup_to_release_ret)
{
        struct btrfs_space_info *space_info = block_rsv->space_info;
        u64 qgroup_to_release = 0;
        u64 ret;

        spin_lock(&block_rsv->lock);
        if (num_bytes == (u64)-1) {
                num_bytes = block_rsv->size;
                /*
                 * NOTE(review): this value is never read; both branches of
                 * the qgroup check below overwrite qgroup_to_release.
                 */
                qgroup_to_release = block_rsv->qgroup_rsv_size;
        }
        block_rsv->size -= num_bytes;
        /* From here on num_bytes holds the excess, not the requested amount. */
        if (block_rsv->reserved >= block_rsv->size) {
                num_bytes = block_rsv->reserved - block_rsv->size;
                block_rsv->reserved = block_rsv->size;
                block_rsv->full = true;
        } else {
                num_bytes = 0;
        }
        /*
         * The qgroup reservation is only clamped when the caller asked for
         * the released amount; ->qgroup_rsv_size itself is not changed here.
         */
        if (qgroup_to_release_ret &&
            block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
                qgroup_to_release = block_rsv->qgroup_rsv_reserved -
                                    block_rsv->qgroup_rsv_size;
                block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
        } else {
                qgroup_to_release = 0;
        }
        spin_unlock(&block_rsv->lock);

        /* Capture the return value before @dest takes its share. */
        ret = num_bytes;
        if (num_bytes > 0) {
                if (dest) {
                        /* The two reserve locks are never held together. */
                        spin_lock(&dest->lock);
                        if (!dest->full) {
                                u64 bytes_to_add;

                                /* Top @dest up to its size, no further. */
                                bytes_to_add = dest->size - dest->reserved;
                                bytes_to_add = min(num_bytes, bytes_to_add);
                                dest->reserved += bytes_to_add;
                                if (dest->reserved >= dest->size)
                                        dest->full = true;
                                num_bytes -= bytes_to_add;
                        }
                        spin_unlock(&dest->lock);
                }
                /* Whatever @dest did not absorb goes back to the space_info. */
                if (num_bytes)
                        btrfs_space_info_free_bytes_may_use(fs_info,
                                                            space_info,
                                                            num_bytes);
        }
        if (qgroup_to_release_ret)
                *qgroup_to_release_ret = qgroup_to_release;
        return ret;
}

/*
 * Move @num_bytes of reservation from @src to @dst.
 *
 * The bytes are first taken out of @src with btrfs_block_rsv_use_bytes();
 * if that fails nothing is added to @dst and the error is returned.
 * @update_size is forwarded to btrfs_block_rsv_add_bytes() for @dst.
 *
 * Returns 0 on success or the error from btrfs_block_rsv_use_bytes().
 */
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
                            struct btrfs_block_rsv *dst, u64 num_bytes,
                            bool update_size)
{
        int err = btrfs_block_rsv_use_bytes(src, num_bytes);

        if (!err)
                btrfs_block_rsv_add_bytes(dst, num_bytes, update_size);

        return err;
}

/*
 * Put @rsv into its initial state: every field zeroed, then the type
 * recorded and the lock initialized.  ->space_info is left NULL.
 */
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type)
{
        /* Wipe first; the two assignments below must survive the memset. */
        memset(rsv, 0, sizeof(*rsv));

        rsv->type = type;
        spin_lock_init(&rsv->lock);
}

/*
 * Initialize @rsv with btrfs_init_block_rsv() and attach it to the
 * space_info looked up for BTRFS_BLOCK_GROUP_METADATA.
 */
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
                                   struct btrfs_block_rsv *rsv,
                                   enum btrfs_rsv_type type)
{
        struct btrfs_space_info *meta_sinfo;

        btrfs_init_block_rsv(rsv, type);

        meta_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
        rsv->space_info = meta_sinfo;
}

/*
 * Allocate a block reserve with GFP_NOFS and initialize it as a metadata
 * reserve of the given @type.
 *
 * Returns the new reserve, or NULL if the allocation failed.
 */
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
                                              enum btrfs_rsv_type type)
{
        struct btrfs_block_rsv *rsv = kmalloc(sizeof(*rsv), GFP_NOFS);

        if (rsv)
                btrfs_init_metadata_block_rsv(fs_info, rsv, type);

        return rsv;
}

/*
 * Release everything held by @rsv (the (u64)-1 "all bytes" request) and
 * free the structure.  A NULL @rsv is accepted and ignored.
 */
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
                          struct btrfs_block_rsv *rsv)
{
        if (rsv) {
                btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
                kfree(rsv);
        }
}

/*
 * Reserve @num_bytes of metadata space from @block_rsv's space_info and,
 * on success, add them to both ->reserved and ->size of @block_rsv.
 *
 * A request for 0 bytes succeeds without doing anything.
 *
 * Returns 0 on success or the error from btrfs_reserve_metadata_bytes().
 */
int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
                        struct btrfs_block_rsv *block_rsv, u64 num_bytes,
                        enum btrfs_reserve_flush_enum flush)
{
        int err;

        if (!num_bytes)
                return 0;

        err = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
                                           num_bytes, flush);
        if (err)
                return err;

        btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
        return 0;
}

/*
 * Check whether @block_rsv currently has at least @min_percent of its
 * ->size reserved.  The threshold is computed with mult_perc() under the
 * reserve's lock.
 *
 * Returns 0 if ->reserved meets the threshold, -ENOSPC otherwise.
 */
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
{
        u64 threshold;
        int ret;

        spin_lock(&block_rsv->lock);
        threshold = mult_perc(block_rsv->size, min_percent);
        ret = (block_rsv->reserved >= threshold) ? 0 : -ENOSPC;
        spin_unlock(&block_rsv->lock);

        return ret;
}

/*
 * Make sure @block_rsv has at least @num_bytes reserved.
 *
 * If ->reserved already covers the request nothing is done.  Otherwise
 * only the shortfall is reserved from the space_info and added to
 * ->reserved; ->size is left alone.  A NULL @block_rsv is treated as
 * success.
 *
 * Returns 0 on success or the error from btrfs_reserve_metadata_bytes().
 */
int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
                           struct btrfs_block_rsv *block_rsv, u64 num_bytes,
                           enum btrfs_reserve_flush_enum flush)
{
        bool satisfied;
        int err;

        if (!block_rsv)
                return 0;

        spin_lock(&block_rsv->lock);
        satisfied = block_rsv->reserved >= num_bytes;
        if (!satisfied)
                num_bytes -= block_rsv->reserved;       /* shortfall only */
        spin_unlock(&block_rsv->lock);

        if (satisfied)
                return 0;

        err = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
                                           num_bytes, flush);
        if (err)
                return err;

        btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
        return 0;
}

/*
 * Release @num_bytes from @block_rsv, choosing which other reserve (if
 * any) should be offered the excess before it is returned to the
 * space_info.
 *
 * Returns the value returned by block_rsv_release_bytes().
 */
u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
                            struct btrfs_block_rsv *block_rsv, u64 num_bytes,
                            u64 *qgroup_to_release)
{
        struct btrfs_block_rsv *global = &fs_info->global_block_rsv;
        struct btrfs_block_rsv *delrefs = &fs_info->delayed_refs_rsv;
        struct btrfs_block_rsv *dest = NULL;

        /*
         * A BTRFS_BLOCK_RSV_DELOPS reserve hands its excess to the global
         * reserve.  Any other reserve, except the global one itself, hands
         * it to the delayed refs reserve as long as that one is not full.
         */
        if (block_rsv->type == BTRFS_BLOCK_RSV_DELOPS) {
                dest = global;
        } else if (block_rsv != global && !btrfs_block_rsv_full(delrefs)) {
                dest = delrefs;
        }

        /* Only pass bytes between reserves that share a space_info. */
        if (dest && dest->space_info != block_rsv->space_info)
                dest = NULL;

        return block_rsv_release_bytes(fs_info, block_rsv, dest, num_bytes,
                                       qgroup_to_release);
}

/*
 * Consume @num_bytes from @block_rsv->reserved.
 *
 * If the reserve holds fewer than @num_bytes it is left untouched.  When
 * the consumption drops ->reserved below ->size the reserve is marked as
 * no longer full.
 *
 * Returns 0 on success, -ENOSPC if ->reserved was too small.
 */
int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
{
        int ret = 0;

        spin_lock(&block_rsv->lock);
        if (block_rsv->reserved < num_bytes) {
                ret = -ENOSPC;
        } else {
                block_rsv->reserved -= num_bytes;
                if (block_rsv->reserved < block_rsv->size)
                        block_rsv->full = false;
        }
        spin_unlock(&block_rsv->lock);

        return ret;
}

/*
 * Add @num_bytes to @block_rsv->reserved.
 *
 * With @update_size set, ->size grows by the same amount.  Without it,
 * the reserve is marked full once ->reserved has reached ->size.
 */
void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
                               u64 num_bytes, bool update_size)
{
        spin_lock(&block_rsv->lock);

        block_rsv->reserved += num_bytes;
        if (!update_size) {
                if (block_rsv->reserved >= block_rsv->size)
                        block_rsv->full = true;
        } else {
                block_rsv->size += num_bytes;
        }

        spin_unlock(&block_rsv->lock);
}

/*
 * Recompute the size of the global block reserve and bring its ->reserved
 * in line with the new size.
 *
 * The size is the sum of the bytes used by the tree root and selected
 * global roots, with a floor derived from the number of roots counted plus
 * BTRFS_UNLINK_METADATA_UNITS, and a ceiling of SZ_512M.  The difference
 * between the old ->reserved and the new ->size is accounted against the
 * space_info's bytes_may_use.
 */
void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
        struct btrfs_space_info *sinfo = block_rsv->space_info;
        struct btrfs_root *root, *tmp;
        u64 num_bytes = btrfs_root_used(&fs_info->tree_root->root_item);
        /* Starts at 1 to account for the tree root itself. */
        unsigned int min_items = 1;

        /*
         * The global block rsv is based on the size of the extent tree, the
         * checksum tree and the root tree.  If the fs is empty we want to set
         * it to a minimal amount for safety.
         *
         * We also are going to need to modify the minimum of the tree root and
         * any global roots we could touch.
         */
        read_lock(&fs_info->global_root_lock);
        rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree,
                                             rb_node) {
                /* Only extent, csum and free space roots contribute. */
                if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID ||
                    btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID ||
                    btrfs_root_id(root) == BTRFS_FREE_SPACE_TREE_OBJECTID) {
                        num_bytes += btrfs_root_used(&root->root_item);
                        min_items++;
                }
        }
        read_unlock(&fs_info->global_root_lock);

        /* Optional trees are counted only when their feature bit is set. */
        if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
                num_bytes += btrfs_root_used(&fs_info->block_group_root->root_item);
                min_items++;
        }

        if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
                num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item);
                min_items++;
        }

        /*
         * But we also want to reserve enough space so we can do the fallback
         * global reserve for an unlink, which is an additional
         * BTRFS_UNLINK_METADATA_UNITS items.
         *
         * But we also need space for the delayed ref updates from the unlink,
         * so add BTRFS_UNLINK_METADATA_UNITS units for delayed refs, one for
         * each unlink metadata item.
         */
        min_items += BTRFS_UNLINK_METADATA_UNITS;

        /* Apply the floor: never size the reserve below the minimum items. */
        num_bytes = max_t(u64, num_bytes,
                          btrfs_calc_insert_metadata_size(fs_info, min_items) +
                          btrfs_calc_delayed_ref_bytes(fs_info,
                                               BTRFS_UNLINK_METADATA_UNITS));

        /* Lock order here: space_info lock first, then the reserve lock. */
        spin_lock(&sinfo->lock);
        spin_lock(&block_rsv->lock);

        /* Apply the ceiling. */
        block_rsv->size = min_t(u64, num_bytes, SZ_512M);

        if (block_rsv->reserved < block_rsv->size) {
                /* Grow: charge the shortfall to bytes_may_use. */
                num_bytes = block_rsv->size - block_rsv->reserved;
                btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
                                                      num_bytes);
                block_rsv->reserved = block_rsv->size;
        } else if (block_rsv->reserved > block_rsv->size) {
                /*
                 * Shrink: give the surplus back and let waiters have a go.
                 * NOTE(review): num_bytes is a u64, so -num_bytes relies on
                 * the callee taking a signed delta -- confirm against
                 * btrfs_space_info_update_bytes_may_use().
                 */
                num_bytes = block_rsv->reserved - block_rsv->size;
                btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
                                                      -num_bytes);
                block_rsv->reserved = block_rsv->size;
                btrfs_try_granting_tickets(fs_info, sinfo);
        }

        /* Both branches above leave reserved == size. */
        block_rsv->full = (block_rsv->reserved == block_rsv->size);

        /* The reserve alone would consume the whole space_info. */
        if (block_rsv->size >= sinfo->total_bytes)
                sinfo->force_alloc = CHUNK_ALLOC_FORCE;
        spin_unlock(&block_rsv->lock);
        spin_unlock(&sinfo->lock);
}

/*
 * Set @root->block_rsv according to the root's id:
 *
 *   extent, csum, free space, block group, raid stripe -> delayed_refs_rsv
 *   root tree, dev tree, quota tree                     -> global_block_rsv
 *   chunk tree                                          -> chunk_block_rsv
 *   anything else                                       -> NULL
 */
void btrfs_init_root_block_rsv(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *rsv;

        switch (btrfs_root_id(root)) {
        case BTRFS_EXTENT_TREE_OBJECTID:
        case BTRFS_CSUM_TREE_OBJECTID:
        case BTRFS_FREE_SPACE_TREE_OBJECTID:
        case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
        case BTRFS_RAID_STRIPE_TREE_OBJECTID:
                rsv = &fs_info->delayed_refs_rsv;
                break;
        case BTRFS_ROOT_TREE_OBJECTID:
        case BTRFS_DEV_TREE_OBJECTID:
        case BTRFS_QUOTA_TREE_OBJECTID:
                rsv = &fs_info->global_block_rsv;
                break;
        case BTRFS_CHUNK_TREE_OBJECTID:
                rsv = &fs_info->chunk_block_rsv;
                break;
        default:
                rsv = NULL;
                break;
        }

        root->block_rsv = rsv;
}

/*
 * Attach the filesystem-wide block reserves to their space_infos and size
 * the global reserve.
 *
 * The chunk reserve uses the BTRFS_BLOCK_GROUP_SYSTEM space_info; the
 * global, trans, empty, delayed and delayed refs reserves all use the
 * BTRFS_BLOCK_GROUP_METADATA one.
 */
void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
{
        struct btrfs_space_info *sys_sinfo;
        struct btrfs_space_info *meta_sinfo;

        sys_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
        fs_info->chunk_block_rsv.space_info = sys_sinfo;

        meta_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
        fs_info->global_block_rsv.space_info = meta_sinfo;
        fs_info->trans_block_rsv.space_info = meta_sinfo;
        fs_info->empty_block_rsv.space_info = meta_sinfo;
        fs_info->delayed_block_rsv.space_info = meta_sinfo;
        fs_info->delayed_refs_rsv.space_info = meta_sinfo;

        /* The global reserve can only be sized once it has a space_info. */
        btrfs_update_global_block_rsv(fs_info);
}

/*
 * Release everything held by the global block reserve, then warn about
 * any of the trans, chunk, delayed or delayed refs reserves that still
 * has a non-zero ->size or ->reserved.
 */
void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
        /* (u64)-1 asks for the whole reserve to be released. */
        btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1,
                                NULL);
        /* The checks below only warn; they do not change any reserve. */
        WARN_ON(fs_info->trans_block_rsv.size > 0);
        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
        WARN_ON(fs_info->chunk_block_rsv.size > 0);
        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
        WARN_ON(fs_info->delayed_block_rsv.size > 0);
        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
        WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
        WARN_ON(fs_info->delayed_refs_rsv.size > 0);
}

/*
 * Pick the block reserve to charge for an allocation in @root made under
 * @trans.  Candidates are tried in this order, and the first non-NULL one
 * wins:
 *
 *   1. trans->block_rsv, but only for shareable roots, the uuid root, or
 *      the csum root while the transaction is adding csums;
 *   2. root->block_rsv;
 *   3. fs_info->empty_block_rsv.
 *
 * Never returns NULL.
 */
static struct btrfs_block_rsv *get_block_rsv(
                                        const struct btrfs_trans_handle *trans,
                                        const struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;

        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
            (root == fs_info->uuid_root) ||
            (trans->adding_csums && btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID)) {
                if (trans->block_rsv)
                        return trans->block_rsv;
        }

        if (root->block_rsv)
                return root->block_rsv;

        return &fs_info->empty_block_rsv;
}

/*
 * Account @blocksize bytes for an allocation in @root under @trans.
 *
 * The reserve chosen by get_block_rsv() is tried first.  If it cannot
 * cover the request, the function falls back, in order, to a no-flush
 * reservation from the space_info, to the global reserve, and finally to
 * an emergency-flush reservation.
 *
 * Returns the reserve the bytes were accounted against (which may be the
 * global reserve rather than the one originally chosen), or an ERR_PTR()
 * carrying the last error.
 */
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            u32 blocksize)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        int ret;
        /* Ensures the global reserve is refreshed and retried only once. */
        bool global_updated = false;

        block_rsv = get_block_rsv(trans, root);

        /* A zero-sized reserve has nothing to take from; skip ahead. */
        if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
                goto try_reserve;
again:
        ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
        if (!ret)
                return block_rsv;

        /* failfast reserves report the shortage instead of falling back. */
        if (block_rsv->failfast)
                return ERR_PTR(ret);

        /* The global reserve gets one resize before we give up on it. */
        if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
                global_updated = true;
                btrfs_update_global_block_rsv(fs_info);
                goto again;
        }

        /*
         * The global reserve still exists to save us from ourselves, so don't
         * warn_on if we are short on our delayed refs reserve.
         */
        if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
            btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
                /* Function-static, so the rate limit spans all callers. */
                static DEFINE_RATELIMIT_STATE(_rs,
                                DEFAULT_RATELIMIT_INTERVAL * 10,
                                /*DEFAULT_RATELIMIT_BURST*/ 1);
                if (__ratelimit(&_rs))
                        WARN(1, KERN_DEBUG
                                "BTRFS: block rsv %d returned %d\n",
                                block_rsv->type, ret);
        }
try_reserve:
        /*
         * Reserve straight from the space_info without flushing.  On this
         * path the function does not modify block_rsv->reserved.
         */
        ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info,
                                           blocksize, BTRFS_RESERVE_NO_FLUSH);
        if (!ret)
                return block_rsv;
        /*
         * If we couldn't reserve metadata bytes try and use some from
         * the global reserve if its space type is the same as the global
         * reservation.
         */
        if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
            block_rsv->space_info == global_rsv->space_info) {
                ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
                /* Callers see global_rsv here, not the reserve picked above. */
                if (!ret)
                        return global_rsv;
        }

        /*
         * All hope is lost, but of course our reservations are overly
         * pessimistic, so instead of possibly having an ENOSPC abort here, try
         * one last time to force a reservation if there's enough actual space
         * on disk to make the reservation.
         */
        ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize,
                                           BTRFS_RESERVE_FLUSH_EMERGENCY);
        if (!ret)
                return block_rsv;

        return ERR_PTR(ret);
}

/*
 * Check that @rsv has enough reserved for the two units computed below:
 * one insert-sized unit of slack and one metadata-sized unit for updating
 * the inode.
 *
 * Returns 0 if @rsv->reserved covers that amount, -ENOSPC otherwise.
 */
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
                                       struct btrfs_block_rsv *rsv)
{
        /* 1 for slack space, 1 for updating the inode */
        const u64 required = btrfs_calc_insert_metadata_size(fs_info, 1) +
                             btrfs_calc_metadata_size(fs_info, 1);
        int ret;

        spin_lock(&rsv->lock);
        ret = (rsv->reserved < required) ? -ENOSPC : 0;
        spin_unlock(&rsv->lock);

        return ret;
}











































































































































































































    3 


    3 





















    2 





    2 






    2 
    2 



    3 
    3 


















    3 












    2 



    3 



    3 





    2 










































    3 



    3 




    2 













    2 









    2 





    2 



    2 
    1 












    2 












    3 







    3 

















    3 


    3 









    2 

    1 
















    3 




    3 






    3 

    3 












    2 
























    2 







    2 












    2 









    2 
    2 















































































































































































































































































































































































































































































































    2 






































    1 




    2 


















    3 




















    1 






    1 



    2 
    2 


    2 







    2 













































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
// SPDX-License-Identifier: GPL-2.0
/*
 * KFENCE guarded object allocator and fault handling.
 *
 * Copyright (C) 2020, Google LLC.
 */

#define pr_fmt(fmt) "kfence: " fmt

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
#include <linux/hash.h>
#include <linux/irq_work.h>
#include <linux/jhash.h>
#include <linux/kcsan-checks.h>
#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/log2.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/panic_notifier.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>

#include <asm/kfence.h>

#include "kfence.h"

/*
 * WARN_ON() wrapper for KFENCE-internal invariants.
 *
 * Disables KFENCE on the first warning assuming an irrecoverable error: if
 * @cond is true, kfence_enabled is cleared and disabled_by_warn is set, which
 * makes param_set_sample_interval() refuse (-EINVAL) to re-enable KFENCE.
 *
 * Evaluates to the truth value of @cond, so it can be used as a condition.
 */
#define KFENCE_WARN_ON(cond)                                                   \
        ({                                                                     \
                const bool __cond = WARN_ON(cond);                             \
                if (unlikely(__cond)) {                                        \
                        WRITE_ONCE(kfence_enabled, false);                     \
                        disabled_by_warn = true;                               \
                }                                                              \
                __cond;                                                        \
        })

/* === Data ================================================================= */

/*
 * Global on/off switch. Cleared by KFENCE_WARN_ON() and when the
 * sample_interval parameter is written with 0; accessed with
 * READ_ONCE()/WRITE_ONCE() throughout this file.
 */
static bool kfence_enabled __read_mostly;
/*
 * Set by KFENCE_WARN_ON(). Once set, param_set_sample_interval() returns
 * -EINVAL instead of re-enabling KFENCE.
 */
static bool disabled_by_warn __read_mostly;

/*
 * Backing variable of the "sample_interval" module parameter. Passed to
 * msecs_to_jiffies() when re-arming the allocation gate timer; 0 means
 * KFENCE is disabled.
 */
unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */

/* Force the "kfence." prefix for all module parameters declared below. */
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "kfence."

/* Forward declaration; needed by param_set_sample_interval() below. */
static int kfence_enable_late(void);
/*
 * Setter for the "sample_interval" module parameter.
 *
 * Parses @val as an unsigned long and stores it through kp->arg. Writing 0
 * disables KFENCE if it is currently enabled. Writing a non-zero value while
 * KFENCE is disabled and the system has finished booting attempts a late
 * enable, unless KFENCE was disabled by KFENCE_WARN_ON(), in which case
 * -EINVAL is returned.
 *
 * Returns 0 on success, a negative errno from kstrtoul() on parse failure, or
 * the result of kfence_enable_late().
 */
static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
{
        unsigned long parsed;
        int err;

        err = kstrtoul(val, 0, &parsed);
        if (err < 0)
                return err;

        if (!parsed) {
                /* Using 0 to indicate KFENCE is disabled. */
                if (READ_ONCE(kfence_enabled)) {
                        pr_info("disabled\n");
                        WRITE_ONCE(kfence_enabled, false);
                }

                *((unsigned long *)kp->arg) = parsed;
                return 0;
        }

        *((unsigned long *)kp->arg) = parsed;

        /* Nothing more to do if already running, or if still booting. */
        if (READ_ONCE(kfence_enabled) || system_state == SYSTEM_BOOTING)
                return 0;

        /* A previous internal warning permanently disabled KFENCE. */
        if (disabled_by_warn)
                return -EINVAL;

        return kfence_enable_late();
}

/*
 * Getter for the "sample_interval" module parameter. Reports the stored
 * interval while KFENCE is enabled, and "0" whenever it is disabled,
 * regardless of the stored value.
 */
static int param_get_sample_interval(char *buffer, const struct kernel_param *kp)
{
        if (READ_ONCE(kfence_enabled))
                return param_get_ulong(buffer, kp);

        return sprintf(buffer, "0\n");
}

/*
 * Custom accessors for "kfence.sample_interval": the setter can disable or
 * late-enable KFENCE, the getter reports 0 while KFENCE is disabled. The
 * parameter is registered with mode 0600 and backed by kfence_sample_interval.
 */
static const struct kernel_param_ops sample_interval_param_ops = {
        .set = param_set_sample_interval,
        .get = param_get_sample_interval,
};
module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);

/*
 * Pool usage% threshold when currently covered allocations are skipped.
 * Interpreted by should_skip_covered() as a percentage of
 * CONFIG_KFENCE_NUM_OBJECTS.
 */
static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);

/* If true, use a deferrable timer. Defaults to CONFIG_KFENCE_DEFERRABLE. */
static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE);
module_param_named(deferrable, kfence_deferrable, bool, 0444);

/* If true, check all canary bytes on panic. Defaults to false. */
static bool kfence_check_on_panic __read_mostly;
module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444);

/*
 * The pool of pages used for guard pages and objects. NULL while no pool is
 * allocated (and reset to NULL if early pool initialization fails).
 */
char *__kfence_pool __read_mostly;
EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */

/*
 * Per-object metadata, with one-to-one mapping of object metadata to
 * backing pages (in __kfence_pool). Published by kfence_init_pool() with
 * smp_store_release() only after all entries have been initialized.
 */
static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
struct kfence_metadata *kfence_metadata __read_mostly;

/*
 * If kfence_metadata is not NULL, it may be accessed by kfence_shutdown_cache().
 * So introduce kfence_metadata_init to initialize metadata, and then make
 * kfence_metadata visible after initialization is successful. This prevents
 * potential UAF or access to uninitialized metadata.
 */
static struct kfence_metadata *kfence_metadata_init __read_mostly;

/*
 * Freelist with available objects. Objects are taken from the head and
 * returned to the tail.
 */
static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */

/*
 * The static key to set up a KFENCE allocation; or if static keys are not used
 * to gate allocations, to avoid a load and compare if KFENCE is disabled.
 */
DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);

/*
 * Gates the allocation, ensuring only one succeeds in a given period.
 * Reset to 0 by toggle_allocation_gate() at the start of each period.
 */
atomic_t kfence_allocation_gate = ATOMIC_INIT(1);

/*
 * A Counting Bloom filter of allocation coverage: limits currently covered
 * allocations of the same source filling up the pool.
 *
 * Assuming a range of 15%-85% unique allocations in the pool at any point in
 * time, the below parameters provide a probability of 0.02-0.33 for false
 * positive hits respectively:
 *
 *        P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM
 */
/* Number of hash functions, i.e. counters touched per allocation stack hash. */
#define ALLOC_COVERED_HNUM        2
/* log2 of the number of counters in the filter. */
#define ALLOC_COVERED_ORDER        (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
/* Number of counters in the filter (a power of two). */
#define ALLOC_COVERED_SIZE        (1 << ALLOC_COVERED_ORDER)
/* Derives the next hash in the chain from the previous one. */
#define ALLOC_COVERED_HNEXT(h)        hash_32(h, ALLOC_COVERED_ORDER)
/* Mask that reduces a hash to a valid index into alloc_covered[]. */
#define ALLOC_COVERED_MASK        (ALLOC_COVERED_SIZE - 1)
static atomic_t alloc_covered[ALLOC_COVERED_SIZE];

/*
 * Stack depth used to determine uniqueness of an allocation: at most this
 * many stack entries are hashed by get_alloc_stack_hash().
 */
#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)

/*
 * Randomness for stack hashes, making the same collisions across reboots and
 * different machines less likely. Used as the jhash() seed.
 */
static u32 stack_hash_seed __ro_after_init;

/*
 * Statistics counters for debugfs. Each enumerator indexes both counters[]
 * and counter_names[]; stats_show() prints them in enumeration order.
 */
enum kfence_counter_id {
        KFENCE_COUNTER_ALLOCATED,       /* +1 on guarded alloc, -1 on non-zombie free. */
        KFENCE_COUNTER_ALLOCS,          /* +1 on every successful guarded alloc. */
        KFENCE_COUNTER_FREES,           /* +1 on every non-zombie guarded free. */
        KFENCE_COUNTER_ZOMBIES,         /* +1 on every zombie guarded free. */
        KFENCE_COUNTER_BUGS,            /* +1 per corrupted canary byte report or invalid free. */
        KFENCE_COUNTER_SKIP_INCOMPAT,
        KFENCE_COUNTER_SKIP_CAPACITY,   /* +1 when the freelist is empty on allocation. */
        KFENCE_COUNTER_SKIP_COVERED,
        KFENCE_COUNTER_COUNT,           /* Number of counters; must stay last. */
};
static atomic_long_t counters[KFENCE_COUNTER_COUNT];
/* Human-readable labels printed in the debugfs "stats" file. */
static const char *const counter_names[] = {
        [KFENCE_COUNTER_ALLOCATED]        = "currently allocated",
        [KFENCE_COUNTER_ALLOCS]                = "total allocations",
        [KFENCE_COUNTER_FREES]                = "total frees",
        [KFENCE_COUNTER_ZOMBIES]        = "zombie allocations",
        [KFENCE_COUNTER_BUGS]                = "total bugs",
        [KFENCE_COUNTER_SKIP_INCOMPAT]        = "skipped allocations (incompatible)",
        [KFENCE_COUNTER_SKIP_CAPACITY]        = "skipped allocations (capacity)",
        [KFENCE_COUNTER_SKIP_COVERED]        = "skipped allocations (covered)",
};
/* Every counter must have a name. */
static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);

/* === Internals ============================================================ */

static inline bool should_skip_covered(void)
{
        unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;

        return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
}

/*
 * Hashes an allocation stack trace. Only the first UNIQUE_ALLOC_STACK_DEPTH
 * entries are considered, after passing them through filter_irq_stacks();
 * the result is seeded with stack_hash_seed.
 */
static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
{
        size_t hashed_bytes;

        num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
        num_entries = filter_irq_stacks(stack_entries, num_entries);
        hashed_bytes = num_entries * sizeof(stack_entries[0]);

        return jhash(stack_entries, hashed_bytes, stack_hash_seed);
}

/*
 * Adjusts the Counting Bloom filter by @val (positive to add, negative to
 * subtract) for the allocation stack trace hash @alloc_stack_hash. One
 * counter is updated per hash function, ALLOC_COVERED_HNUM in total.
 */
static void alloc_covered_add(u32 alloc_stack_hash, int val)
{
        int remaining = ALLOC_COVERED_HNUM;

        while (remaining-- > 0) {
                atomic_t *slot = &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK];

                atomic_add(val, slot);
                /* Chain to the next hash for the following counter. */
                alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
        }
}

/*
 * Tests whether the allocation stack trace hash @alloc_stack_hash is
 * currently present in the Counting Bloom filter, i.e. whether all of its
 * ALLOC_COVERED_HNUM counters are non-zero.
 */
static bool alloc_covered_contains(u32 alloc_stack_hash)
{
        int remaining;

        for (remaining = ALLOC_COVERED_HNUM; remaining > 0; remaining--) {
                const u32 slot = alloc_stack_hash & ALLOC_COVERED_MASK;

                /* A single zero counter proves the hash is not covered. */
                if (atomic_read(&alloc_covered[slot]) == 0)
                        return false;

                alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
        }

        return true;
}

static bool kfence_protect(unsigned long addr)
{
        return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
}

static bool kfence_unprotect(unsigned long addr)
{
        return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false));
}

/*
 * Maps a metadata entry to the address of its object page in __kfence_pool.
 *
 * The metadata entry at index n maps to the page at offset
 * (n + 1) * 2 * PAGE_SIZE into the pool. Returns 0 (after a KFENCE warning)
 * if @meta does not point into kfence_metadata[], or if the address stored in
 * @meta does not lie within the computed page.
 *
 * The checks do not affect performance; only called from slow-paths.
 */
static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta)
{
        unsigned long pageaddr;
        unsigned long offset;

        /* Only call with a pointer into kfence_metadata. */
        if (KFENCE_WARN_ON(meta < kfence_metadata ||
                           meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS))
                return 0;

        offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2;
        pageaddr = (unsigned long)&__kfence_pool[offset];

        /*
         * This metadata object only ever maps to 1 page; verify that the stored
         * address is in the expected range.
         */
        if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr))
                return 0;

        return pageaddr;
}

/*
 * Update the object's metadata state, including updating the alloc/free stacks
 * depending on the state transition.
 *
 * @meta:              metadata entry to update; meta->lock must be held.
 * @next:              new object state. KFENCE_OBJECT_FREED records into the
 *                     free track, any other state into the alloc track.
 * @stack_entries:     stack trace to record, or NULL to capture the current
 *                     stack here.
 * @num_stack_entries: number of entries in @stack_entries (ignored if NULL).
 */
static noinline void
metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
                      unsigned long *stack_entries, size_t num_stack_entries)
{
        struct kfence_track *track =
                next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;

        lockdep_assert_held(&meta->lock);

        if (stack_entries) {
                /* Caller already captured the stack; just copy it. */
                memcpy(track->stack_entries, stack_entries,
                       num_stack_entries * sizeof(stack_entries[0]));
        } else {
                /*
                 * Skip over 1 function (this one); noinline ensures we do not
                 * accidentally skip over the caller by never inlining.
                 */
                num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
        }
        track->num_stack_entries = num_stack_entries;
        track->pid = task_pid_nr(current);
        track->cpu = raw_smp_processor_id();
        track->ts_nsec = local_clock(); /* Same source as printk timestamps. */

        /*
         * Pairs with READ_ONCE() in
         *        kfence_shutdown_cache(),
         *        kfence_handle_page_fault().
         */
        WRITE_ONCE(meta->state, next);
}

/*
 * Check canary byte at @addr. Returns true if the byte still holds the
 * expected pattern. Otherwise the bug counter is incremented, a
 * KFENCE_ERROR_CORRUPTION report is generated under the owning object's
 * meta->lock, and false is returned.
 */
static inline bool check_canary_byte(u8 *addr)
{
        struct kfence_metadata *owner;
        unsigned long irq_flags;

        if (unlikely(*addr != KFENCE_CANARY_PATTERN_U8(addr))) {
                atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);

                owner = addr_to_metadata((unsigned long)addr);
                raw_spin_lock_irqsave(&owner->lock, irq_flags);
                kfence_report_error((unsigned long)addr, false, NULL, owner,
                                    KFENCE_ERROR_CORRUPTION);
                raw_spin_unlock_irqrestore(&owner->lock, irq_flags);

                return false;
        }

        return true;
}

/*
 * Fills the object page of @meta with the canary pattern on both sides of the
 * object, writing one u64 at a time: from the start of the page up to the
 * object, and from the (u64-aligned-down) end of the object to the end of the
 * page.
 */
static inline void set_canary(const struct kfence_metadata *meta)
{
        const unsigned long page_start = ALIGN_DOWN(meta->addr, PAGE_SIZE);
        unsigned long cursor;

        /*
         * The canary may be written to part of the object memory, but it does
         * not affect it. The user should initialize the object before using it.
         */

        /* Left of the object. */
        cursor = page_start;
        while (cursor < meta->addr) {
                *((u64 *)cursor) = KFENCE_CANARY_PATTERN_U64;
                cursor += sizeof(u64);
        }

        /* Right of the object, up to the end of the page. */
        cursor = ALIGN_DOWN(meta->addr + meta->size, sizeof(u64));
        while (cursor - page_start < PAGE_SIZE) {
                *((u64 *)cursor) = KFENCE_CANARY_PATTERN_U64;
                cursor += sizeof(u64);
        }
}

/*
 * Verifies the canary bytes surrounding the object described by @meta, within
 * the object's page. Corruption is reported by check_canary_byte(); at most
 * one corrupted byte is reported per side.
 */
static inline void check_canary(const struct kfence_metadata *meta)
{
        const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
        unsigned long addr = pageaddr;

        /*
         * We'll iterate over each canary byte per-side until a corrupted byte
         * is found. However, we'll still iterate over the canary bytes to the
         * right of the object even if there was an error in the canary bytes to
         * the left of the object. Specifically, if check_canary_byte()
         * generates an error, showing both sides might give more clues as to
         * what the error is about when displaying which bytes were corrupted.
         */

        /* Apply to left of object: fast path, compare one 64-bit word at a time. */
        for (; meta->addr - addr >= sizeof(u64); addr += sizeof(u64)) {
                if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64))
                        break;
        }

        /*
         * If the canary is corrupted in a certain 64-bit word, or the canary
         * memory cannot be completely covered by consecutive 64-bit words,
         * the remaining bytes need to be checked one by one.
         */
        for (; addr < meta->addr; addr++) {
                if (unlikely(!check_canary_byte((u8 *)addr)))
                        break;
        }

        /*
         * Apply to right of object: check byte-wise until addr is aligned to
         * a 64-bit word, then continue word-wise to the end of the page.
         */
        for (addr = meta->addr + meta->size; addr % sizeof(u64) != 0; addr++) {
                if (unlikely(!check_canary_byte((u8 *)addr)))
                        return;
        }
        for (; addr - pageaddr < PAGE_SIZE; addr += sizeof(u64)) {
                if (unlikely(*((u64 *)addr) != KFENCE_CANARY_PATTERN_U64)) {

                        /* Mismatching word: locate and report the first bad byte. */
                        for (; addr - pageaddr < PAGE_SIZE; addr++) {
                                if (!check_canary_byte((u8 *)addr))
                                        return;
                        }
                }
        }
}

/*
 * Allocates one guarded object of @size bytes for @cache from the KFENCE pool.
 *
 * @cache:             slab cache the allocation is made for.
 * @size:              requested object size in bytes.
 * @gfp:               allocation flags; only consulted for init-on-alloc.
 * @stack_entries:     allocation stack trace to record in the metadata.
 * @num_stack_entries: number of entries in @stack_entries.
 * @alloc_stack_hash:  hash of the allocation stack, added to the coverage
 *                     Bloom filter and stored in the metadata.
 *
 * Returns the object address, or NULL if no free object is available (the
 * capacity-skip counter is incremented) or if the chosen object's lock could
 * not be taken (the object is returned to the tail of the freelist).
 */
static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
                                  unsigned long *stack_entries, size_t num_stack_entries,
                                  u32 alloc_stack_hash)
{
        struct kfence_metadata *meta = NULL;
        unsigned long flags;
        struct slab *slab;
        void *addr;
        /* Coin flip: place the object at the right (true) or left edge of its page. */
        const bool random_right_allocate = get_random_u32_below(2);
        /* Stress testing only: protect the fresh object with probability 1/N. */
        const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS &&
                                  !get_random_u32_below(CONFIG_KFENCE_STRESS_TEST_FAULTS);

        /* Try to obtain a free object. */
        raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
        if (!list_empty(&kfence_freelist)) {
                meta = list_entry(kfence_freelist.next, struct kfence_metadata, list);
                list_del_init(&meta->list);
        }
        raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
        if (!meta) {
                atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
                return NULL;
        }

        if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
                /*
                 * This is extremely unlikely -- we are reporting on a
                 * use-after-free, which locked meta->lock, and the reporting
                 * code via printk calls kmalloc() which ends up in
                 * kfence_alloc() and tries to grab the same object that we're
                 * reporting on. While it has never been observed, lockdep does
                 * report that there is a possibility of deadlock. Fix it by
                 * using trylock and bailing out gracefully.
                 */
                raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
                /* Put the object back on the freelist. */
                list_add_tail(&meta->list, &kfence_freelist);
                raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);

                return NULL;
        }

        /* Start from the page base; this is also the "left" placement. */
        meta->addr = metadata_to_pageaddr(meta);
        /* Unprotect if we're reusing this page. */
        if (meta->state == KFENCE_OBJECT_FREED)
                kfence_unprotect(meta->addr);

        /*
         * Note: for allocations made before RNG initialization, will always
         * return zero. We still benefit from enabling KFENCE as early as
         * possible, even when the RNG is not yet available, as this will allow
         * KFENCE to detect bugs due to earlier allocations. The only downside
         * is that the out-of-bounds accesses detected are deterministic for
         * such allocations.
         */
        if (random_right_allocate) {
                /* Allocate on the "right" side, re-calculate address. */
                meta->addr += PAGE_SIZE - size;
                meta->addr = ALIGN_DOWN(meta->addr, cache->align);
        }

        addr = (void *)meta->addr;

        /* Update remaining metadata. */
        metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
        /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
        WRITE_ONCE(meta->cache, cache);
        meta->size = size;
        meta->alloc_stack_hash = alloc_stack_hash;
        raw_spin_unlock_irqrestore(&meta->lock, flags);

        /* Record this allocation stack as covered. */
        alloc_covered_add(alloc_stack_hash, 1);

        /* Set required slab fields. */
        slab = virt_to_slab((void *)meta->addr);
        slab->slab_cache = cache;
        slab->objects = 1;

        /* Memory initialization. */
        set_canary(meta);

        /*
         * We check slab_want_init_on_alloc() ourselves, rather than letting
         * SL*B do the initialization, as otherwise we might overwrite KFENCE's
         * redzone.
         */
        if (unlikely(slab_want_init_on_alloc(gfp, cache)))
                memzero_explicit(addr, size);
        if (cache->ctor)
                cache->ctor(addr);

        if (random_fault)
                kfence_protect(meta->addr); /* Random "faults" by protecting the object. */

        atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
        atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);

        return addr;
}

/*
 * Frees the guarded object at @addr described by @meta.
 *
 * @addr:   object address; must equal meta->addr, otherwise an invalid-free
 *          error is reported and nothing is freed.
 * @meta:   metadata entry of the object; must be in the ALLOCATED state.
 * @zombie: if true, the object is marked freed and its page protected, but it
 *          is neither zeroed (init-on-free) nor returned to the freelist; the
 *          zombie counter is incremented instead of the free counters.
 */
static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
{
        struct kcsan_scoped_access assert_page_exclusive;
        unsigned long flags;
        bool init;

        raw_spin_lock_irqsave(&meta->lock, flags);

        if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
                /* Invalid or double-free, bail out. */
                atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
                kfence_report_error((unsigned long)addr, false, NULL, meta,
                                    KFENCE_ERROR_INVALID_FREE);
                raw_spin_unlock_irqrestore(&meta->lock, flags);
                return;
        }

        /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */
        kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE,
                                  KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT,
                                  &assert_page_exclusive);

        if (CONFIG_KFENCE_STRESS_TEST_FAULTS)
                kfence_unprotect((unsigned long)addr); /* To check canary bytes. */

        /* Restore page protection if there was an OOB access. */
        if (meta->unprotected_page) {
                memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
                kfence_protect(meta->unprotected_page);
                meta->unprotected_page = 0;
        }

        /* Mark the object as freed. */
        metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
        /* Sample init-on-free while still holding the lock; applied below. */
        init = slab_want_init_on_free(meta->cache);
        raw_spin_unlock_irqrestore(&meta->lock, flags);

        /* This allocation stack no longer occupies a pool object. */
        alloc_covered_add(meta->alloc_stack_hash, -1);

        /* Check canary bytes for memory corruption. */
        check_canary(meta);

        /*
         * Clear memory if init-on-free is set. While we protect the page, the
         * data is still there, and after a use-after-free is detected, we
         * unprotect the page, so the data is still accessible.
         */
        if (!zombie && unlikely(init))
                memzero_explicit(addr, meta->size);

        /* Protect to detect use-after-frees. */
        kfence_protect((unsigned long)addr);

        kcsan_end_scoped_access(&assert_page_exclusive);
        if (!zombie) {
                /* Add it to the tail of the freelist for reuse. */
                raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
                KFENCE_WARN_ON(!list_empty(&meta->list));
                list_add_tail(&meta->list, &kfence_freelist);
                raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);

                atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]);
                atomic_long_inc(&counters[KFENCE_COUNTER_FREES]);
        } else {
                /* See kfence_shutdown_cache(). */
                atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]);
        }
}

/*
 * RCU callback: recovers the metadata entry embedding @h and performs a
 * regular (non-zombie) guarded free of the object it describes.
 */
static void rcu_guarded_free(struct rcu_head *h)
{
        struct kfence_metadata *meta;
        void *object;

        meta = container_of(h, struct kfence_metadata, rcu_head);
        object = (void *)meta->addr;

        kfence_guarded_free(object, meta, false);
}

/*
 * Initialization of the KFENCE pool after its allocation.
 * Returns 0 on success; otherwise returns the address up to
 * which partial initialization succeeded.
 *
 * On success, kfence_metadata is published (pointing at kfence_metadata_init)
 * and every metadata entry has been added to kfence_freelist.
 */
static unsigned long kfence_init_pool(void)
{
        unsigned long addr;
        struct page *pages;
        int i;

        /* Arch hook failed: nothing was initialized, report the pool start. */
        if (!arch_kfence_init_pool())
                return (unsigned long)__kfence_pool;

        addr = (unsigned long)__kfence_pool;
        pages = virt_to_page(__kfence_pool);

        /*
         * Set up object pages: they must have PG_slab set, to avoid freeing
         * these as real pages.
         *
         * We also want to avoid inserting kfence_free() in the kfree()
         * fast-path in SLUB, and therefore need to ensure kfree() correctly
         * enters __slab_free() slow-path.
         */
        for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
                struct slab *slab = page_slab(nth_page(pages, i));

                /* Page 0 and all odd pages are guard pages, not object pages. */
                if (!i || (i % 2))
                        continue;

                __folio_set_slab(slab_folio(slab));
#ifdef CONFIG_MEMCG_KMEM
                /* Object page i belongs to metadata entry i / 2 - 1. */
                slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts |
                                 MEMCG_DATA_OBJEXTS;
#endif
        }

        /*
         * Protect the first 2 pages. The first page is mostly unnecessary, and
         * merely serves as an extended guard page. However, adding one
         * additional page in the beginning gives us an even number of pages,
         * which simplifies the mapping of address to metadata index.
         */
        for (i = 0; i < 2; i++) {
                if (unlikely(!kfence_protect(addr)))
                        return addr;

                addr += PAGE_SIZE;
        }

        /* From here on, addr is the object page of metadata entry i. */
        for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
                struct kfence_metadata *meta = &kfence_metadata_init[i];

                /* Initialize metadata. */
                INIT_LIST_HEAD(&meta->list);
                raw_spin_lock_init(&meta->lock);
                meta->state = KFENCE_OBJECT_UNUSED;
                meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */
                list_add_tail(&meta->list, &kfence_freelist);

                /* Protect the right redzone. */
                if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
                        goto reset_slab;

                addr += 2 * PAGE_SIZE;
        }

        /*
         * Make kfence_metadata visible only when initialization is successful.
         * Otherwise, if the initialization fails and kfence_metadata is freed,
         * it may cause UAF in kfence_shutdown_cache().
         */
        smp_store_release(&kfence_metadata, kfence_metadata_init);
        return 0;

reset_slab:
        /* Undo the slab setup done by the first loop on every object page. */
        for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
                struct slab *slab = page_slab(nth_page(pages, i));

                if (!i || (i % 2))
                        continue;
#ifdef CONFIG_MEMCG_KMEM
                slab->obj_exts = 0;
#endif
                __folio_clear_slab(slab_folio(slab));
        }

        return addr;
}

/*
 * Boot-time pool initialization. Returns true if the pool was initialized;
 * returns false if there is no pool, or if initialization failed, in which
 * case the not-yet-protected remainder of the pool and the metadata array are
 * handed back to memblock and both pointers are cleared.
 */
static bool __init kfence_init_pool_early(void)
{
        unsigned long addr;

        if (!__kfence_pool)
                return false;

        addr = kfence_init_pool();
        if (addr) {
                /*
                 * Only release unprotected pages, and do not try to go back and
                 * change page attributes due to risk of failing to do so as
                 * well. If changing page attributes for some pages fails, it is
                 * very likely that it also fails for the first page, and
                 * therefore expect addr==__kfence_pool in most failure cases.
                 */
                memblock_free_late(__pa(addr),
                                   KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
                __kfence_pool = NULL;

                memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE);
                kfence_metadata_init = NULL;

                return false;
        }

        /*
         * The pool is live and will never be deallocated from this point on.
         * Ignore the pool object from the kmemleak phys object tree, as it would
         * otherwise overlap with allocations returned by kfence_alloc(), which
         * are registered with kmemleak through the slab post-alloc hook.
         */
        kmemleak_ignore_phys(__pa(__kfence_pool));
        return true;
}

/* === DebugFS Interface ==================================================== */

/*
 * seq_file show callback for the debugfs "stats" file: prints whether KFENCE
 * is enabled, followed by one "<name>: <value>" line per statistics counter.
 */
static int stats_show(struct seq_file *seq, void *v)
{
        int id = 0;

        seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled));

        while (id < KFENCE_COUNTER_COUNT) {
                seq_printf(seq, "%s: %ld\n", counter_names[id], atomic_long_read(&counters[id]));
                id++;
        }

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(stats);

/*
 * debugfs seq_file operations for /sys/kernel/debug/kfence/objects.
 *
 * The iterator cookie is the object index + 1 rather than the index itself,
 * because a NULL cookie is what stops the iteration. start_object() yields
 * the cookie for position *pos, or NULL once *pos is past the last object.
 */
static void *start_object(struct seq_file *seq, loff_t *pos)
{
        if (*pos >= CONFIG_KFENCE_NUM_OBJECTS)
                return NULL;

        return (void *)((long)*pos + 1);
}

/*
 * seq_file stop callback. Intentionally empty: start_object() and
 * next_object() only hand out index cookies, so there is nothing to release.
 */
static void stop_object(struct seq_file *seq, void *v)
{
}

/*
 * seq_file next callback: advances *pos by one and returns the cookie
 * (object index + 1) for the new position, or NULL when the new position is
 * past the last object.
 */
static void *next_object(struct seq_file *seq, void *v, loff_t *pos)
{
        *pos += 1;

        if (*pos >= CONFIG_KFENCE_NUM_OBJECTS)
                return NULL;

        return (void *)((long)*pos + 1);
}

/*
 * seq_file show callback: prints the metadata of one object, selected by the
 * cookie @v (object index + 1), while holding that object's lock, followed
 * by a separator line.
 */
static int show_object(struct seq_file *seq, void *v)
{
        const long index = (long)v - 1;
        struct kfence_metadata *meta = &kfence_metadata[index];
        unsigned long irq_flags;

        raw_spin_lock_irqsave(&meta->lock, irq_flags);
        kfence_print_object(seq, meta);
        raw_spin_unlock_irqrestore(&meta->lock, irq_flags);

        seq_puts(seq, "---------------------------------\n");

        return 0;
}

/*
 * Iterator callbacks for the debugfs "objects" file; one record is shown per
 * KFENCE object.
 */
static const struct seq_operations objects_sops = {
        .start = start_object,
        .next = next_object,
        .stop = stop_object,
        .show = show_object,
};
DEFINE_SEQ_ATTRIBUTE(objects);

/*
 * Late initcall that creates the "kfence" debugfs directory with its "stats"
 * and "objects" files. Does nothing if KFENCE is not enabled at that point.
 * Always returns 0.
 */
static int kfence_debugfs_init(void)
{
        struct dentry *dir;

        if (READ_ONCE(kfence_enabled)) {
                dir = debugfs_create_dir("kfence", NULL);
                debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
                debugfs_create_file("objects", 0400, dir, NULL, &objects_fops);
        }

        return 0;
}

late_initcall(kfence_debugfs_init);

/* === Panic Notifier ====================================================== */

/*
 * Checks the canary bytes of every object that is currently allocated.
 * Called from the panic notifier; corruption is reported by check_canary().
 */
static void kfence_check_all_canary(void)
{
        int i;

        for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
                struct kfence_metadata *meta = &kfence_metadata[i];

                /*
                 * meta->lock is not held here, while the state is updated
                 * with WRITE_ONCE() in metadata_update_state() on other CPUs.
                 * Use a marked read to avoid a data race on the state.
                 */
                if (READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED)
                        check_canary(meta);
        }
}

/*
 * Notifier callback: checks the canaries of all allocated objects. The
 * arguments are unused. Always returns NOTIFY_OK.
 */
static int kfence_check_canary_callback(struct notifier_block *nb,
                                        unsigned long reason, void *arg)
{
        kfence_check_all_canary();
        return NOTIFY_OK;
}

/* Notifier block wrapping kfence_check_canary_callback(). */
static struct notifier_block kfence_check_canary_notifier = {
        .notifier_call = kfence_check_canary_callback,
};

/* === Allocation Gate Timer ================================================ */

/* Delayed work that periodically runs toggle_allocation_gate(). */
static struct delayed_work kfence_timer;

#ifdef CONFIG_KFENCE_STATIC_KEYS
/* Wait queue to wake up allocation-gate timer task. */
static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);

/* irq_work handler: wakes the waiter sleeping on allocation_wait. */
static void wake_up_kfence_timer(struct irq_work *work)
{
        wake_up(&allocation_wait);
}
static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
#endif

/*
 * Set up delayed work, which will enable and disable the static key. We need to
 * use a work queue (rather than a simple timer), since enabling and disabling a
 * static key cannot be done from an interrupt.
 *
 * Note: Toggling a static branch currently causes IPIs, and here we'll end up
 * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with
 * more aggressive sampling intervals), we could get away with a variant that
 * avoids IPIs, at the cost of not immediately capturing allocations if the
 * instructions remain cached.
 *
 * Each run resets the allocation gate to 0 and re-queues itself after
 * kfence_sample_interval milliseconds.
 */
static void toggle_allocation_gate(struct work_struct *work)
{
        /* Once KFENCE is disabled, return without re-arming the work. */
        if (!READ_ONCE(kfence_enabled))
                return;

        /* Reset the gate; __kfence_alloc() proceeds only for the first increment. */
        atomic_set(&kfence_allocation_gate, 0);
#ifdef CONFIG_KFENCE_STATIC_KEYS
        /* Enable static key, and await allocation to happen. */
        static_branch_enable(&kfence_allocation_key);

        /* Sleep until the gate is non-zero; woken via wake_up_kfence_timer(). */
        wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));

        /* Disable static key and reset timer. */
        static_branch_disable(&kfence_allocation_key);
#endif
        queue_delayed_work(system_unbound_wq, &kfence_timer,
                           msecs_to_jiffies(kfence_sample_interval));
}

/* === Public interface ===================================================== */

/*
 * Reserve the KFENCE pool and the metadata area from memblock during boot.
 * Does nothing when kfence_sample_interval is 0. If the metadata allocation
 * fails, the pool is released again and __kfence_pool is reset to NULL.
 */
void __init kfence_alloc_pool_and_metadata(void)
{
        if (!kfence_sample_interval)
                return;

        /*
         * A pool that has already been set up by arch code is reused; only
         * allocate one here when none exists yet.
         */
        if (!__kfence_pool) {
                __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
                if (!__kfence_pool) {
                        pr_err("failed to allocate pool\n");
                        return;
                }
        }

        /* memblock hands back zeroed memory, so no explicit clearing is needed. */
        kfence_metadata_init = memblock_alloc(KFENCE_METADATA_SIZE, PAGE_SIZE);
        if (kfence_metadata_init)
                return;

        pr_err("failed to allocate metadata\n");
        memblock_free(__kfence_pool, KFENCE_POOL_SIZE);
        __kfence_pool = NULL;
}

/*
 * Switch KFENCE on: set up the allocation-gate work, optionally register the
 * panic notifier, mark KFENCE enabled and start the gate work immediately.
 */
static void kfence_init_enable(void)
{
        /* Without CONFIG_KFENCE_STATIC_KEYS the key is enabled once, here. */
        if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS))
                static_branch_enable(&kfence_allocation_key);

        if (kfence_deferrable)
                INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate);
        else
                INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate);

        if (kfence_check_on_panic)
                atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier);

        /* Set the flag before queueing: toggle_allocation_gate() bails out if it is unset. */
        WRITE_ONCE(kfence_enabled, true);
        queue_delayed_work(system_unbound_wq, &kfence_timer, 0);

        pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
                CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
                (void *)(__kfence_pool + KFENCE_POOL_SIZE));
}

/*
 * Boot-time entry point: seed the stack hash, then initialize the pool and
 * enable KFENCE unless it was disabled via a zero sample interval.
 */
void __init kfence_init(void)
{
        stack_hash_seed = get_random_u32();

        /* A sample interval of 0 at boot means KFENCE stays off. */
        if (!kfence_sample_interval)
                return;

        if (kfence_init_pool_early()) {
                kfence_init_enable();
                return;
        }

        pr_err("%s failed\n", __func__);
}

/*
 * Allocate and initialize the KFENCE pool and metadata at runtime, then enable
 * KFENCE and create its debugfs files.
 *
 * Called from kfence_enable_late() when no pool exists yet, i.e. __kfence_pool
 * is NULL on entry.
 *
 * Return: 0 on success; -ENOMEM if the pool or the metadata cannot be
 * allocated; -EINVAL if, without CONFIG_CONTIG_ALLOC, either area exceeds
 * MAX_ORDER_NR_PAGES; -EBUSY if kfence_init_pool() fails.
 */
static int kfence_init_late(void)
{
        const unsigned long nr_pages_pool = KFENCE_POOL_SIZE / PAGE_SIZE;
        const unsigned long nr_pages_meta = KFENCE_METADATA_SIZE / PAGE_SIZE;
        unsigned long addr;
        unsigned long free_size = KFENCE_POOL_SIZE;
        int err = -ENOMEM;

#ifdef CONFIG_CONTIG_ALLOC
        struct page *pages;

        pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node,
                                   NULL);
        if (!pages)
                return -ENOMEM;

        __kfence_pool = page_to_virt(pages);
        pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node,
                                   NULL);
        if (pages)
                kfence_metadata_init = page_to_virt(pages);
#else
        if (nr_pages_pool > MAX_ORDER_NR_PAGES ||
            nr_pages_meta > MAX_ORDER_NR_PAGES) {
                pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n");
                return -EINVAL;
        }

        __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL);
        if (!__kfence_pool)
                return -ENOMEM;

        kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL);
#endif

        /*
         * Take the pool address only now that the pool has been allocated.
         * Reading __kfence_pool before the allocation would yield NULL, and
         * the free_pool path below would then free the wrong range and leak
         * the real pool when the metadata allocation fails.
         */
        addr = (unsigned long)__kfence_pool;

        if (!kfence_metadata_init)
                goto free_pool;

        memzero_explicit(kfence_metadata_init, KFENCE_METADATA_SIZE);
        addr = kfence_init_pool();
        if (!addr) {
                kfence_init_enable();
                kfence_debugfs_init();
                return 0;
        }

        /*
         * kfence_init_pool() returned a non-zero address: release only the
         * part of the pool from that address to the end of the pool.
         */
        pr_err("%s failed\n", __func__);
        free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
        err = -EBUSY;

#ifdef CONFIG_CONTIG_ALLOC
        free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata_init)),
                          nr_pages_meta);
free_pool:
        free_contig_range(page_to_pfn(virt_to_page((void *)addr)),
                          free_size / PAGE_SIZE);
#else
        free_pages_exact((void *)kfence_metadata_init, KFENCE_METADATA_SIZE);
free_pool:
        free_pages_exact((void *)addr, free_size);
#endif

        kfence_metadata_init = NULL;
        __kfence_pool = NULL;
        return err;
}

/*
 * Enable KFENCE after boot. If a pool already exists it is simply re-enabled
 * and the allocation-gate work is kicked; otherwise the pool is set up via
 * kfence_init_late(), whose result is returned.
 */
static int kfence_enable_late(void)
{
        if (__kfence_pool) {
                WRITE_ONCE(kfence_enabled, true);
                queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
                pr_info("re-enabled\n");
                return 0;
        }

        return kfence_init_late();
}

/*
 * Detach all KFENCE objects from cache @s, which is being shut down.
 *
 * First pass: objects of @s that are still allocated are freed as "zombies".
 * Second pass: freed objects that still reference @s get meta->cache cleared.
 * Both passes do a lockless pre-check and then re-check under meta->lock.
 */
void kfence_shutdown_cache(struct kmem_cache *s)
{
        unsigned long flags;
        struct kfence_metadata *meta;
        int i;

        /* Pairs with release in kfence_init_pool(). */
        if (!smp_load_acquire(&kfence_metadata))
                return;

        for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
                bool in_use;

                meta = &kfence_metadata[i];

                /*
                 * If we observe some inconsistent cache and state pair where we
                 * should have returned false here, cache destruction is racing
                 * with either kmem_cache_alloc() or kmem_cache_free(). Taking
                 * the lock will not help, as different critical section
                 * serialization will have the same outcome.
                 */
                if (READ_ONCE(meta->cache) != s ||
                    READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED)
                        continue;

                /* Re-check under the lock; the free below runs after it is dropped. */
                raw_spin_lock_irqsave(&meta->lock, flags);
                in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED;
                raw_spin_unlock_irqrestore(&meta->lock, flags);

                if (in_use) {
                        /*
                         * This cache still has allocations, and we should not
                         * release them back into the freelist so they can still
                         * safely be used and retain the kernel's default
                         * behaviour of keeping the allocations alive (leak the
                         * cache); however, they effectively become "zombie
                         * allocations" as the KFENCE objects are the only ones
                         * still in use and the owning cache is being destroyed.
                         *
                         * We mark them freed, so that any subsequent use shows
                         * more useful error messages that will include stack
                         * traces of the user of the object, the original
                         * allocation, and caller to shutdown_cache().
                         */
                        kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true);
                }
        }

        /* Second pass: drop the reference to @s from freed objects. */
        for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
                meta = &kfence_metadata[i];

                /* See above. */
                if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED)
                        continue;

                raw_spin_lock_irqsave(&meta->lock, flags);
                if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED)
                        meta->cache = NULL;
                raw_spin_unlock_irqrestore(&meta->lock, flags);
        }
}

/*
 * Try to serve an allocation of @size bytes for cache @s from the KFENCE pool.
 *
 * Returns NULL when the request is skipped: size above PAGE_SIZE, zone/DMA
 * constraints, SLAB_SKIP_KFENCE, allocation gate already taken, KFENCE
 * disabled, or the allocation stack already being covered. Otherwise returns
 * the result of kfence_guarded_alloc().
 */
void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
{
        unsigned long stack_entries[KFENCE_STACK_DEPTH];
        size_t num_stack_entries;
        u32 alloc_stack_hash;

        /*
         * Perform size check before switching kfence_allocation_gate, so that
         * we don't disable KFENCE without making an allocation.
         */
        if (size > PAGE_SIZE) {
                atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
                return NULL;
        }

        /*
         * Skip allocations from non-default zones, including DMA. We cannot
         * guarantee that pages in the KFENCE pool will have the requested
         * properties (e.g. reside in DMAable memory).
         */
        if ((flags & GFP_ZONEMASK) ||
            (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
                atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
                return NULL;
        }

        /*
         * Skip allocations for this slab, if KFENCE has been disabled for
         * this slab.
         */
        if (s->flags & SLAB_SKIP_KFENCE)
                return NULL;

        /* Only the first caller after the gate was reset to 0 may proceed. */
        if (atomic_inc_return(&kfence_allocation_gate) > 1)
                return NULL;
#ifdef CONFIG_KFENCE_STATIC_KEYS
        /*
         * waitqueue_active() is fully ordered after the update of
         * kfence_allocation_gate per atomic_inc_return().
         */
        if (waitqueue_active(&allocation_wait)) {
                /*
                 * Calling wake_up() here may deadlock when allocations happen
                 * from within timer code. Use an irq_work to defer it.
                 */
                irq_work_queue(&wake_up_kfence_timer_work);
        }
#endif

        if (!READ_ONCE(kfence_enabled))
                return NULL;

        num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);

        /*
         * Do expensive check for coverage of allocation in slow-path after
         * allocation_gate has already become non-zero, even though it might
         * mean not making any allocation within a given sample interval.
         *
         * This ensures reasonable allocation coverage when the pool is almost
         * full, including avoiding long-lived allocations of the same source
         * filling up the pool (e.g. pagecache allocations).
         */
        alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
        if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
                atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
                return NULL;
        }

        return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
                                    alloc_stack_hash);
}

/*
 * Return the size recorded in the metadata for @addr, or 0 if @addr has no
 * metadata.
 */
size_t kfence_ksize(const void *addr)
{
        const struct kfence_metadata *obj = addr_to_metadata((unsigned long)addr);

        if (!obj)
                return 0;

        /*
         * Deliberately lockless: a race with __kfence_alloc() here means the
         * caller is doing a use-after-free or an invalid access anyway.
         */
        return obj->size;
}

/*
 * Return the object address recorded in the metadata for @addr, or NULL if
 * @addr has no metadata.
 */
void *kfence_object_start(const void *addr)
{
        const struct kfence_metadata *obj = addr_to_metadata((unsigned long)addr);

        if (!obj)
                return NULL;

        /*
         * Deliberately lockless: a race with __kfence_alloc() here means the
         * caller is doing a use-after-free or an invalid access anyway.
         */
        return (void *)obj->addr;
}

/*
 * Free the KFENCE object at @addr, either immediately or, for
 * SLAB_TYPESAFE_BY_RCU caches, via an RCU callback.
 */
void __kfence_free(void *addr)
{
        struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);

#ifdef CONFIG_MEMCG_KMEM
        KFENCE_WARN_ON(meta->obj_exts.objcg);
#endif
        /*
         * Common case: the cache is gone (meta->cache is NULL after the cache
         * was destroyed) or it is not SLAB_TYPESAFE_BY_RCU, so the object can
         * be freed right away.
         */
        if (likely(!meta->cache || !(meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) {
                kfence_guarded_free(addr, meta, false);
                return;
        }

        /*
         * SLAB_TYPESAFE_BY_RCU: the object page may be recycled for
         * other-typed objects once freed, so defer the free via RCU.
         */
        call_rcu(&meta->rcu_head, rcu_guarded_free);
}

/*
 * Handle a page fault at @addr if it lies within the KFENCE pool.
 *
 * Returns false if @addr is not a KFENCE address. Otherwise reports the
 * access (out-of-bounds, use-after-free or invalid) when KFENCE is enabled,
 * and returns the result of kfence_unprotect() so the access can proceed.
 */
bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
{
        /* Only used after @addr has been validated by is_kfence_address() below. */
        const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
        struct kfence_metadata *to_report = NULL;
        enum kfence_error_type error_type;
        unsigned long flags;

        if (!is_kfence_address((void *)addr))
                return false;

        if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */
                return kfence_unprotect(addr); /* ... unprotect and proceed. */

        atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);

        if (page_index % 2) {
                /* This is a redzone, report a buffer overflow. */
                struct kfence_metadata *meta;
                int distance = 0;

                /* Candidate 1: allocated object on the page before the redzone. */
                meta = addr_to_metadata(addr - PAGE_SIZE);
                if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
                        to_report = meta;
                        /* Data race ok; distance calculation approximate. */
                        distance = addr - data_race(meta->addr + meta->size);
                }

                /* Candidate 2: allocated object on the page after; prefer it if closer. */
                meta = addr_to_metadata(addr + PAGE_SIZE);
                if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
                        /* Data race ok; distance calculation approximate. */
                        if (!to_report || distance > data_race(meta->addr) - addr)
                                to_report = meta;
                }

                if (!to_report)
                        goto out;

                /* Lock is held until after the report at the "out" label. */
                raw_spin_lock_irqsave(&to_report->lock, flags);
                to_report->unprotected_page = addr;
                error_type = KFENCE_ERROR_OOB;

                /*
                 * If the object was freed before we took the lock we can still
                 * report this as an OOB -- the report will simply show the
                 * stacktrace of the free as well.
                 */
        } else {
                to_report = addr_to_metadata(addr);
                if (!to_report)
                        goto out;

                /* Lock is held until after the report at the "out" label. */
                raw_spin_lock_irqsave(&to_report->lock, flags);
                error_type = KFENCE_ERROR_UAF;
                /*
                 * We may race with __kfence_alloc(), and it is possible that a
                 * freed object may be reallocated. We simply report this as a
                 * use-after-free, with the stack trace showing the place where
                 * the object was re-allocated.
                 */
        }

out:
        if (to_report) {
                kfence_report_error(addr, is_write, regs, to_report, error_type);
                raw_spin_unlock_irqrestore(&to_report->lock, flags);
        } else {
                /* This may be a UAF or OOB access, but we can't be sure. */
                kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID);
        }

        return kfence_unprotect(addr); /* Unprotect and let access proceed. */
}




























































































    5 
































    5 












    5 













    5 

























































    5 

























    1 

































    1 

























    1 















    1 














    1 







    1 




    1 



    1 






    1 









    1 
    1 















    2 



    2 











































    2 












    2 




    2 











    2 















































    2 




















    2 









    2 







































    3 


















    2 

























    1 
























































































    1 









    5 










    5 









































































































































    4 







    4 



























    4 
























    5 



























    5 















    1 



    4 

    5 







    5 







    1 





    4 





    5 







    2 

    3 




    2 


























    5 



    5 



    5 










    5 












    5 












    5 

    4 










































    5 


















































    4 







    4 








    4 

    3 

    4 





    3 


    2 

    2 



    1 

    4 












    3 
    2 

    1 
    4 
    4 















    5 







    4 
    1 















    1 














    3 

    1 










    5 











    5 



    1 

    4 





    1 
    4 











    5 















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/ialloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  BSD ufs-inspired inode and directory allocation by
 *  Stephen Tweedie (sct@redhat.com), 1993
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/cred.h>

#include <asm/byteorder.h>

#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

#include <trace/events/ext4.h>

/*
 * ialloc.c contains the inodes allocation and deallocation routines
 */

/*
 * The free inodes are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the block.
 */

/*
 * Mark the tail of a bitmap, from start_bit towards end_bit, as in use.
 *
 * The atomic setbit is only used for the bits up to the next byte boundary
 * (so that endianness is handled correctly); the remaining whole bytes are
 * filled with memset(), which is fine because the bitmap has no other users.
 */
void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
{
        unsigned long byte_boundary;
        int bit = start_bit;

        if (start_bit >= end_bit)
                return;

        ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);

        /* Bit-by-bit up to the first byte-aligned position. */
        byte_boundary = (start_bit + 7) & ~7UL;
        while (bit < byte_boundary) {
                ext4_set_bit(bit, bitmap);
                bit++;
        }

        if (bit >= end_bit)
                return;

        /* Whole bytes from here on. */
        memset(bitmap + (bit >> 3), 0xff, (end_bit - bit) >> 3);
}

/*
 * Completion handler for a bitmap read: on success flag both the buffer and
 * the bitmap as up to date; in all cases unlock the buffer and drop a
 * reference to it.
 */
void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
{
        if (!uptodate)
                goto release;

        set_buffer_uptodate(bh);
        set_bitmap_uptodate(bh);

release:
        unlock_buffer(bh);
        put_bh(bh);
}

/*
 * Verify the checksum of the inode bitmap in @bh for @block_group.
 *
 * Returns 0 if the buffer is (or already was) verified, or if
 * EXT4_FC_REPLAY is set in s_mount_state; -EFSCORRUPTED if there is no group
 * info or the group's inode bitmap is already marked corrupt; -EFSBADCRC if
 * checksum verification fails (or a failure is simulated), in which case the
 * group's inode bitmap is marked corrupted.
 */
static int ext4_validate_inode_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *desc,
                                      ext4_group_t block_group,
                                      struct buffer_head *bh)
{
        ext4_fsblk_t        blk;
        struct ext4_group_info *grp;

        if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
                return 0;

        grp = ext4_get_group_info(sb, block_group);

        if (buffer_verified(bh))
                return 0;
        if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
                return -EFSCORRUPTED;

        ext4_lock_group(sb, block_group);
        /* Re-check under the group lock; it may have been verified meanwhile. */
        if (buffer_verified(bh))
                goto verified;
        blk = ext4_inode_bitmap(sb, desc);
        if (!ext4_inode_bitmap_csum_verify(sb, desc, bh,
                                           EXT4_INODES_PER_GROUP(sb) / 8) ||
            ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
                /* The group lock is dropped before the error is reported. */
                ext4_unlock_group(sb, block_group);
                ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
                           "inode_bitmap = %llu", block_group, blk);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                return -EFSBADCRC;
        }
        set_buffer_verified(bh);
verified:
        ext4_unlock_group(sb, block_group);
        return 0;
}

/*
 * Read the inode allocation bitmap for a given block_group, reading
 * into the specified slot in the superblock's bitmap cache.
 *
 * Return buffer_head of bitmap on success, or an ERR_PTR on error.
 * Errors returned here: -EFSCORRUPTED (no group descriptor, bitmap block
 * out of range, or group 0 flagged EXT4_BG_INODE_UNINIT), -ENOMEM
 * (sb_getblk() failed), -EIO (buffer not up to date after the read), or
 * the error from ext4_validate_inode_bitmap().
 */
static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{
        struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh = NULL;
        ext4_fsblk_t bitmap_blk;
        int err;

        desc = ext4_get_group_desc(sb, block_group, NULL);
        if (!desc)
                return ERR_PTR(-EFSCORRUPTED);

        /* Reject a bitmap block number outside the valid block range. */
        bitmap_blk = ext4_inode_bitmap(sb, desc);
        if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
            (bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
                ext4_error(sb, "Invalid inode bitmap blk %llu in "
                           "block_group %u", bitmap_blk, block_group);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                return ERR_PTR(-EFSCORRUPTED);
        }
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
                ext4_warning(sb, "Cannot read inode bitmap - "
                             "block_group = %u, inode_bitmap = %llu",
                             block_group, bitmap_blk);
                return ERR_PTR(-ENOMEM);
        }
        if (bitmap_uptodate(bh))
                goto verify;

        /* Re-check under the buffer lock before doing any work. */
        lock_buffer(bh);
        if (bitmap_uptodate(bh)) {
                unlock_buffer(bh);
                goto verify;
        }

        ext4_lock_group(sb, block_group);
        if (ext4_has_group_desc_csum(sb) &&
            (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
                if (block_group == 0) {
                        ext4_unlock_group(sb, block_group);
                        unlock_buffer(bh);
                        ext4_error(sb, "Inode bitmap for bg 0 marked "
                                   "uninitialized");
                        err = -EFSCORRUPTED;
                        goto out;
                }
                /*
                 * Group is flagged EXT4_BG_INODE_UNINIT: build the bitmap in
                 * memory (all inodes free, bits past the end marked used)
                 * instead of reading it, and return it as already verified.
                 */
                memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
                ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
                                     sb->s_blocksize * 8, bh->b_data);
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
                set_buffer_verified(bh);
                ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
                return bh;
        }
        ext4_unlock_group(sb, block_group);

        if (buffer_uptodate(bh)) {
                /*
                 * The group is not uninitialized here, so if bh is up to
                 * date the bitmap is up to date as well.
                 */
                set_bitmap_uptodate(bh);
                unlock_buffer(bh);
                goto verify;
        }
        /*
         * submit the buffer_head for reading
         */
        trace_ext4_load_inode_bitmap(sb, block_group);
        ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read);
        ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
        if (!buffer_uptodate(bh)) {
                put_bh(bh);
                ext4_error_err(sb, EIO, "Cannot read inode bitmap - "
                               "block_group = %u, inode_bitmap = %llu",
                               block_group, bitmap_blk);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                return ERR_PTR(-EIO);
        }

verify:
        err = ext4_validate_inode_bitmap(sb, desc, block_group, bh);
        if (err)
                goto out;
        return bh;
out:
        /* Drop the buffer reference before returning the error. */
        put_bh(bh);
        return ERR_PTR(err);
}

/*
 * NOTE! When we get the inode, we're the only people
 * that have access to it, and as such there are no
 * race conditions we have to worry about. The inode
 * is not on the hash-lists, and it cannot be reached
 * through the filesystem because the directory entry
 * has been deleted earlier.
 *
 * HOWEVER: we must make sure that we get no aliases,
 * which means that we have to call "clear_inode()"
 * _before_ we mark the inode not in use in the inode
 * bitmaps. Otherwise a newly created file might use
 * the same inode number (not actually the same pointer
 * though), and then we'd have two inodes sharing the
 * same inode number and space on the harddisk.
 *
 * ext4_free_inode() releases the quota charge for @inode, clears the
 * in-core inode, and then clears the inode's bit in its group's inode
 * bitmap under @handle.  On success it also bumps the group's free-inode
 * count (and drops its directory count for directories), refreshes the
 * bitmap and descriptor checksums, and updates the per-superblock and
 * per-flex-group counters.
 *
 * Nothing is returned; any error collected in 'fatal' is reported through
 * ext4_std_error() at the end.
 */
void ext4_free_inode(handle_t *handle, struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        int is_directory;
        unsigned long ino;
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *bh2;
        ext4_group_t block_group;
        unsigned long bit;
        struct ext4_group_desc *gdp;
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi;
        int fatal = 0, err, count, cleared;
        struct ext4_group_info *grp;

        /*
         * Sanity checks: refuse (and only log) if the inode has no
         * superblock, is still referenced by someone else, or still has
         * links.
         */
        if (!sb) {
                printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
                       "nonexistent device\n", __func__, __LINE__);
                return;
        }
        if (atomic_read(&inode->i_count) > 1) {
                ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
                         __func__, __LINE__, inode->i_ino,
                         atomic_read(&inode->i_count));
                return;
        }
        if (inode->i_nlink) {
                ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
                         __func__, __LINE__, inode->i_ino, inode->i_nlink);
                return;
        }
        sbi = EXT4_SB(sb);

        ino = inode->i_ino;
        ext4_debug("freeing inode %lu\n", ino);
        trace_ext4_free_inode(inode);

        /* Give the inode's quota charge back before it disappears. */
        dquot_initialize(inode);
        dquot_free_inode(inode);

        /* Remember the type now; it is needed after the inode is cleared. */
        is_directory = S_ISDIR(inode->i_mode);

        /* Do this BEFORE marking the inode not in use or returning an error */
        ext4_clear_inode(inode);

        es = sbi->s_es;
        /* Only inode numbers in [first usable ino, s_inodes_count] may be freed. */
        if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
                ext4_error(sb, "reserved or nonexistent inode %lu", ino);
                goto error_return;
        }
        /* Inode numbers are 1-based: map to (group, bit within group bitmap). */
        block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
        bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
        /* Don't bother if the inode bitmap is corrupt. */
        if (IS_ERR(bitmap_bh)) {
                fatal = PTR_ERR(bitmap_bh);
                /* Reset so the brelse() at error_return sees NULL, not an ERR_PTR. */
                bitmap_bh = NULL;
                goto error_return;
        }
        /*
         * Unless EXT4_FC_REPLAY is set in the mount state, give up when the
         * group info is missing or already flags the inode bitmap as corrupt.
         */
        if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
                grp = ext4_get_group_info(sb, block_group);
                if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
                        fatal = -EFSCORRUPTED;
                        goto error_return;
                }
        }

        BUFFER_TRACE(bitmap_bh, "get_write_access");
        fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh,
                                              EXT4_JTR_NONE);
        if (fatal)
                goto error_return;

        /* Preset the error used when the group descriptor lookup fails. */
        fatal = -ESRCH;
        gdp = ext4_get_group_desc(sb, block_group, &bh2);
        if (gdp) {
                BUFFER_TRACE(bh2, "get_write_access");
                fatal = ext4_journal_get_write_access(handle, sb, bh2,
                                                      EXT4_JTR_NONE);
        }
        /*
         * The bit is cleared under the group lock even when 'fatal' is
         * already set; the "out" path below then still dirties the bitmap
         * if the bit was really cleared.
         */
        ext4_lock_group(sb, block_group);
        cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
        if (fatal || !cleared) {
                ext4_unlock_group(sb, block_group);
                goto out;
        }

        /* Still under the group lock: update descriptor counts and checksums. */
        count = ext4_free_inodes_count(sb, gdp) + 1;
        ext4_free_inodes_set(sb, gdp, count);
        if (is_directory) {
                count = ext4_used_dirs_count(sb, gdp) - 1;
                ext4_used_dirs_set(sb, gdp, count);
                if (percpu_counter_initialized(&sbi->s_dirs_counter))
                        percpu_counter_dec(&sbi->s_dirs_counter);
        }
        ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh,
                                   EXT4_INODES_PER_GROUP(sb) / 8);
        ext4_group_desc_csum_set(sb, block_group, gdp);
        ext4_unlock_group(sb, block_group);

        /* Filesystem-wide and per-flex-group counters are updated outside the lock. */
        if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
                percpu_counter_inc(&sbi->s_freeinodes_counter);
        if (sbi->s_log_groups_per_flex) {
                struct flex_groups *fg;

                fg = sbi_array_rcu_deref(sbi, s_flex_groups,
                                         ext4_flex_group(sbi, block_group));
                atomic_inc(&fg->free_inodes);
                if (is_directory)
                        atomic_dec(&fg->used_dirs);
        }
        BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
        fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
out:
        if (cleared) {
                BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                /* Keep the first error; do not let a later one overwrite it. */
                if (!fatal)
                        fatal = err;
        } else {
                /* The bit was already clear: report it and mark the bitmap corrupt. */
                ext4_error(sb, "bit already cleared for inode %lu", ino);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
        }

error_return:
        brelse(bitmap_bh);
        ext4_std_error(sb, fatal);
}

/*
 * Snapshot of the per-group (or per-flex-group) counters that
 * get_orlov_stats() fills in for the Orlov allocator.
 */
struct orlov_stats {
        __u64 free_clusters;    /* free clusters in the group / flex group */
        __u32 free_inodes;      /* free inodes in the group / flex group */
        __u32 used_dirs;        /* directories in the group / flex group */
};

/*
 * Collect the counters the Orlov allocator needs for one allocation unit.
 *
 * @sb:        superblock of the filesystem
 * @g:         block group number when @flex_size is 1 (or less), otherwise
 *             a flex_bg number
 * @flex_size: number of block groups per flex group
 * @stats:     filled in with free inodes, free clusters and used directories
 *
 * For a plain block group the values come from its group descriptor; if the
 * descriptor cannot be obtained, all three counters are reported as zero.
 * For a flex group the values come from the in-memory flex_groups counters.
 */
static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
                            int flex_size, struct orlov_stats *stats)
{
        struct ext4_group_desc *gd;
        struct flex_groups *flex;

        if (flex_size <= 1) {
                /* @g names a single block group. */
                gd = ext4_get_group_desc(sb, g, NULL);
                if (!gd) {
                        stats->free_inodes = 0;
                        stats->free_clusters = 0;
                        stats->used_dirs = 0;
                        return;
                }
                stats->free_inodes = ext4_free_inodes_count(sb, gd);
                stats->free_clusters = ext4_free_group_clusters(sb, gd);
                stats->used_dirs = ext4_used_dirs_count(sb, gd);
                return;
        }

        /* @g names a flex group: use its aggregated atomic counters. */
        flex = sbi_array_rcu_deref(EXT4_SB(sb), s_flex_groups, g);
        stats->free_inodes = atomic_read(&flex->free_inodes);
        stats->free_clusters = atomic64_read(&flex->free_clusters);
        stats->used_dirs = atomic_read(&flex->used_dirs);
}

/*
 * Orlov's allocator for directories.
 *
 * We always try to spread first-level directories.
 *
 * If there are blockgroups with both free inodes and free clusters counts
 * not worse than average we return one with smallest directory count.
 * Otherwise we use the fallback search at the end of this function.
 *
 * For the rest rules look so:
 *
 * It's OK to put directory into a group unless
 * it has too many directories already (max_dirs) or
 * it has too few free inodes left (min_inodes) or
 * it has too few free clusters left (min_clusters).
 * Parent's group is preferred, if it doesn't satisfy these
 * conditions we search cyclically through the rest. If none
 * of the groups look good we just look for a group with more
 * free inodes than average (starting at parent's group).
 *
 * @sb:     superblock of the filesystem
 * @parent: directory the new inode is created in
 * @group:  on success, receives the chosen block group number
 * @mode:   mode of the new inode; the spreading policy above is applied
 *          only when it describes a directory
 * @qstr:   name of the new entry; when given, its hash selects the group
 *          at which the top-level search starts, otherwise a random group
 *          is used
 *
 * Returns 0 with *@group set, or -1 if no group with a free inode was found.
 */

static int find_group_orlov(struct super_block *sb, struct inode *parent,
                            ext4_group_t *group, umode_t mode,
                            const struct qstr *qstr)
{
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t real_ngroups = ext4_get_groups_count(sb);
        int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
        unsigned int freei, avefreei, grp_free;
        ext4_fsblk_t freec, avefreec;
        unsigned int ndirs;
        int max_dirs, min_inodes;
        ext4_grpblk_t min_clusters;
        ext4_group_t i, grp, g, ngroups;
        struct ext4_group_desc *desc;
        struct orlov_stats stats;
        int flex_size = ext4_flex_bg_size(sbi);
        struct dx_hash_info hinfo;

        /*
         * With flex_bg the search unit is a flex group, so convert both the
         * number of units and the parent's position to flex group numbers.
         */
        ngroups = real_ngroups;
        if (flex_size > 1) {
                ngroups = (real_ngroups + flex_size - 1) >>
                        sbi->s_log_groups_per_flex;
                parent_group >>= sbi->s_log_groups_per_flex;
        }

        /* Filesystem-wide totals and the per-unit averages derived from them. */
        freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
        avefreei = freei / ngroups;
        freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
        avefreec = freec;
        do_div(avefreec, ngroups);
        ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);

        /* Directories directly under the root or under a TOPDIR are spread out. */
        if (S_ISDIR(mode) &&
            ((parent == d_inode(sb->s_root)) ||
             (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
                int best_ndir = inodes_per_group;
                int ret = -1;

                if (qstr) {
                        hinfo.hash_version = DX_HASH_HALF_MD4;
                        hinfo.seed = sbi->s_hash_seed;
                        ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
                        parent_group = hinfo.hash % ngroups;
                } else
                        parent_group = get_random_u32_below(ngroups);
                /*
                 * Scan every unit once and keep the one with the fewest
                 * directories among those that are at least average in both
                 * free inodes and free clusters.
                 */
                for (i = 0; i < ngroups; i++) {
                        g = (parent_group + i) % ngroups;
                        get_orlov_stats(sb, g, flex_size, &stats);
                        if (!stats.free_inodes)
                                continue;
                        if (stats.used_dirs >= best_ndir)
                                continue;
                        if (stats.free_inodes < avefreei)
                                continue;
                        if (stats.free_clusters < avefreec)
                                continue;
                        grp = g;
                        ret = 0;
                        best_ndir = stats.used_dirs;
                }
                if (ret)
                        goto fallback;
        found_flex_bg:
                /* Without flex_bg the unit found is already a block group. */
                if (flex_size == 1) {
                        *group = grp;
                        return 0;
                }

                /*
                 * We pack inodes at the beginning of the flexgroup's
                 * inode tables.  Block allocation decisions will do
                 * something similar, although regular files will
                 * start at 2nd block group of the flexgroup.  See
                 * ext4_ext_find_goal() and ext4_find_near().
                 */
                grp *= flex_size;
                for (i = 0; i < flex_size; i++) {
                        if (grp+i >= real_ngroups)
                                break;
                        desc = ext4_get_group_desc(sb, grp+i, NULL);
                        if (desc && ext4_free_inodes_count(sb, desc)) {
                                *group = grp+i;
                                return 0;
                        }
                }
                goto fallback;
        }

        max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16;
        min_inodes = avefreei - inodes_per_group*flex_size / 4;
        if (min_inodes < 1)
                min_inodes = 1;
        min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
        /*
         * min_clusters is signed and goes negative when the average is below
         * a quarter of a (flex) group.  Compared against the unsigned 64-bit
         * stats.free_clusters it would be converted to a huge value and make
         * every group look too full, so clamp it like min_inodes above.
         */
        if (min_clusters < 0)
                min_clusters = 0;

        /*
         * Start looking in the flex group where we last allocated an
         * inode for this parent directory
         */
        if (EXT4_I(parent)->i_last_alloc_group != ~0) {
                parent_group = EXT4_I(parent)->i_last_alloc_group;
                if (flex_size > 1)
                        parent_group >>= sbi->s_log_groups_per_flex;
        }

        /* Take the first unit, starting at parent_group, that passes all limits. */
        for (i = 0; i < ngroups; i++) {
                grp = (parent_group + i) % ngroups;
                get_orlov_stats(sb, grp, flex_size, &stats);
                if (stats.used_dirs >= max_dirs)
                        continue;
                if (stats.free_inodes < min_inodes)
                        continue;
                if (stats.free_clusters < min_clusters)
                        continue;
                goto found_flex_bg;
        }

fallback:
        /* The fallback always works on real block groups, not flex groups. */
        ngroups = real_ngroups;
        avefreei = freei / ngroups;
fallback_retry:
        parent_group = EXT4_I(parent)->i_block_group;
        for (i = 0; i < ngroups; i++) {
                grp = (parent_group + i) % ngroups;
                desc = ext4_get_group_desc(sb, grp, NULL);
                if (desc) {
                        grp_free = ext4_free_inodes_count(sb, desc);
                        if (grp_free && grp_free >= avefreei) {
                                *group = grp;
                                return 0;
                        }
                }
        }

        if (avefreei) {
                /*
                 * The free-inodes counter is approximate, and for really small
                 * filesystems the above test can fail to find any blockgroups
                 */
                avefreei = 0;
                goto fallback_retry;
        }

        return -1;
}

/*
 * Pick a block group for a new inode.
 *
 * @sb:     superblock of the filesystem
 * @parent: directory the new inode is created in
 * @group:  receives the candidate block group number
 * @mode:   mode of the new inode, forwarded to find_group_orlov()
 *
 * Returns 0 with *@group set to a group that has a free inode, or -1 if
 * none was found (with flex_bg, whatever find_group_orlov() returns).
 */
static int find_group_other(struct super_block *sb, struct inode *parent,
                            ext4_group_t *group, umode_t mode)
{
        ext4_group_t start = EXT4_I(parent)->i_block_group;
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        ext4_group_t g, end, step, n;
        struct ext4_group_desc *gd;
        int flex_size = ext4_flex_bg_size(EXT4_SB(sb));

        /*
         * With flex_bg, first look for a free inode in the parent's flex
         * group, then in the flex group this directory last allocated
         * from.  If neither has room, hand over to the Orlov search to
         * pick a different flex group.
         */
        if (flex_size > 1) {
                int tried_last_alloc = 0;

                for (;;) {
                        /* Round down to the first group of the flex group. */
                        start &= ~(flex_size - 1);
                        end = start + flex_size;
                        if (end > ngroups)
                                end = ngroups;
                        for (g = start; g < end; g++) {
                                gd = ext4_get_group_desc(sb, g, NULL);
                                if (gd && ext4_free_inodes_count(sb, gd)) {
                                        *group = g;
                                        return 0;
                                }
                        }
                        if (tried_last_alloc ||
                            EXT4_I(parent)->i_last_alloc_group == ~0)
                                break;
                        tried_last_alloc = 1;
                        start = EXT4_I(parent)->i_last_alloc_group;
                }

                /*
                 * Neither flex group had a free inode.  The mode is passed
                 * on so that the top-level directory spreading is skipped
                 * for anything that is not a directory.
                 */
                *group = start + flex_size;
                if (*group > ngroups)
                        *group = 0;
                return find_group_orlov(sb, parent, group, mode, NULL);
        }

        /* No flex_bg: the parent's own group is the first choice. */
        g = start;
        *group = g;
        gd = ext4_get_group_desc(sb, g, NULL);
        if (gd && ext4_free_inodes_count(sb, gd) &&
            ext4_free_group_clusters(sb, gd))
                return 0;

        /*
         * The inode has to go somewhere else.  Files of one directory
         * should still end up together, while files of a different
         * directory that shares the parent's group should end up somewhere
         * else, so the parent's inode number is mixed into the starting
         * point of the probe sequence.
         */
        g = (g + parent->i_ino) % ngroups;
        *group = g;

        /*
         * Probe with doubling step sizes for a group that has both a free
         * inode and some free clusters.
         */
        for (step = 1; step < ngroups; step <<= 1) {
                g += step;
                if (g >= ngroups)
                        g -= ngroups;
                *group = g;
                gd = ext4_get_group_desc(sb, g, NULL);
                if (gd && ext4_free_inodes_count(sb, gd) &&
                    ext4_free_group_clusters(sb, gd))
                        return 0;
        }

        /*
         * Last resort: walk all groups linearly, beginning right after the
         * parent's group, and settle for any free inode even if the group
         * has no free clusters.
         */
        g = start;
        *group = g;
        for (n = 0; n < ngroups; n++) {
                if (++g >= ngroups)
                        g = 0;
                *group = g;
                gd = ext4_get_group_desc(sb, g, NULL);
                if (gd && ext4_free_inodes_count(sb, gd))
                        return 0;
        }

        return -1;
}

/*
 * In no journal mode, if an inode has recently been deleted, we want
 * to avoid reusing it until we're reasonably sure the inode table
 * block has been written back to disk.  (Yes, these values are
 * somewhat arbitrary...)
 */
#define RECENTCY_MIN        60
#define RECENTCY_DIRTY        300

/*
 * Tell whether inode slot @ino (an index relative to @group) carries a
 * deletion time that lies less than RECENTCY_MIN seconds in the past, or
 * less than RECENTCY_MIN + RECENTCY_DIRTY seconds if the inode table
 * buffer is still dirty.
 *
 * Returns 1 if so.  Returns 0 otherwise, including when the group
 * descriptor cannot be obtained or the inode table block is not present
 * and up to date in the buffer cache.
 */
static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
{
        struct ext4_group_desc *gd;
        struct ext4_inode *disk_inode;
        struct buffer_head *itable_bh;
        int per_block = EXT4_SB(sb)->s_inodes_per_block;
        int byte_off;
        int window = RECENTCY_MIN;
        int recent = 0;
        u32 deleted_at, cur;

        gd = ext4_get_group_desc(sb, group, NULL);
        if (unlikely(!gd))
                return 0;

        itable_bh = sb_find_get_block(sb, ext4_inode_table(sb, gd) +
                       (ino / per_block));

        /*
         * A block that is missing from the buffer cache (or not up to
         * date) is taken to have been written out already, so only a
         * cached, up-to-date block is inspected.
         */
        if (itable_bh && buffer_uptodate(itable_bh)) {
                byte_off = (ino % per_block) * EXT4_INODE_SIZE(sb);
                disk_inode = (struct ext4_inode *) (itable_bh->b_data + byte_off);

                /*
                 * i_dtime has only 32 bits on disk.  Only relative times
                 * of a few minutes matter here, so comparing against the
                 * low 32 bits of the clock is sufficient; see
                 * time_before32().
                 */
                deleted_at = le32_to_cpu(disk_inode->i_dtime);
                cur = ktime_get_real_seconds();
                if (buffer_dirty(itable_bh))
                        window += RECENTCY_DIRTY;

                if (deleted_at && time_before32(deleted_at, cur) &&
                    time_before32(cur, deleted_at + window))
                        recent = 1;
        }

        brelse(itable_bh);
        return recent;
}

/*
 * Find a free inode bit in @bitmap for @group, searching from *@ino.
 *
 * When the filesystem has no journal, slots whose inode was deleted
 * recently (see recently_deleted()) are skipped as long as a later free
 * slot exists; the last such slot seen is remembered as a fallback.
 *
 * Returns 1 with *@ino set to the chosen bit, or 0 if the bitmap has no
 * free bit at or after the starting position.
 */
static int find_inode_bit(struct super_block *sb, ext4_group_t group,
                          struct buffer_head *bitmap, unsigned long *ino)
{
        const unsigned long limit = EXT4_INODES_PER_GROUP(sb);
        bool skip_recent = EXT4_SB(sb)->s_journal == NULL;
        unsigned long fallback = limit;        /* 'limit' means "none seen" */

        for (;;) {
                *ino = ext4_find_next_zero_bit((unsigned long *)
                                               bitmap->b_data,
                                               limit, *ino);
                if (*ino >= limit)
                        break;

                if (!skip_recent || !recently_deleted(sb, group, *ino))
                        return 1;

                /* Remember this slot and continue with the next bit. */
                fallback = *ino;
                *ino = *ino + 1;
                if (*ino >= limit)
                        break;
        }

        if (fallback >= limit)
                return 0;

        /*
         * Avoiding recently deleted inodes is only a preference; it must
         * not cause ENOSPC or skew allocation patterns.  Since nothing
         * better was found in the given range, hand out the recently
         * deleted inode after all.
         */
        *ino = fallback;
        return 1;
}

/*
 * Mark inode number @ino as in use in the inode bitmap and update the
 * owning group descriptor to match.
 *
 * All buffers are dirtied with a NULL handle and written out synchronously
 * with sync_dirty_buffer().  If the group's block bitmap is still flagged
 * EXT4_BG_BLOCK_UNINIT it is read (which initializes it) and written out
 * first.  The descriptor's unused-inode-table count, free-inode count and
 * checksums are then updated under the group lock.
 *
 * Returns 0 on success or if the bit was already set, -EFSCORRUPTED if @ino
 * is outside [EXT4_FIRST_INO, s_inodes_count], -EINVAL if the group
 * descriptor cannot be obtained, or the error of the failing read/write.
 */
int ext4_mark_inode_used(struct super_block *sb, int ino)
{
        unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
        struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
        struct ext4_group_desc *gdp;
        ext4_group_t group;
        int bit;
        int err = -EFSCORRUPTED;

        if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
                goto out;

        /* Inode numbers are 1-based: map to (group, bit within group bitmap). */
        group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
        inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
        /* Return directly: an ERR_PTR must not reach the brelse() at "out". */
        if (IS_ERR(inode_bitmap_bh))
                return PTR_ERR(inode_bitmap_bh);

        /* Nothing to do if the inode is already marked in use. */
        if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) {
                err = 0;
                goto out;
        }

        gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
        if (!gdp || !group_desc_bh) {
                err = -EINVAL;
                goto out;
        }

        ext4_set_bit(bit, inode_bitmap_bh->b_data);

        BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
        if (err) {
                ext4_std_error(sb, err);
                goto out;
        }
        err = sync_dirty_buffer(inode_bitmap_bh);
        if (err) {
                ext4_std_error(sb, err);
                goto out;
        }

        /* We may have to initialize the block bitmap if it isn't already */
        if (ext4_has_group_desc_csum(sb) &&
            gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                struct buffer_head *block_bitmap_bh;

                block_bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (IS_ERR(block_bitmap_bh)) {
                        err = PTR_ERR(block_bitmap_bh);
                        goto out;
                }

                BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
                err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
                sync_dirty_buffer(block_bitmap_bh);

                /* recheck and clear flag under lock if we still need to */
                ext4_lock_group(sb, group);
                if (ext4_has_group_desc_csum(sb) &&
                    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                        ext4_free_group_clusters_set(sb, gdp,
                                ext4_free_clusters_after_init(sb, group, gdp));
                        ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
                        ext4_group_desc_csum_set(sb, group, gdp);
                }
                ext4_unlock_group(sb, group);
                brelse(block_bitmap_bh);

                /* 'err' still holds the result of dirtying the block bitmap. */
                if (err) {
                        ext4_std_error(sb, err);
                        goto out;
                }
        }

        /* Update the relevant bg descriptor fields */
        if (ext4_has_group_desc_csum(sb)) {
                int free;

                ext4_lock_group(sb, group); /* while we modify the bg desc */
                free = EXT4_INODES_PER_GROUP(sb) -
                        ext4_itable_unused_count(sb, gdp);
                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
                        free = 0;
                }

                /*
                 * Check the relative inode number against the last used
                 * relative inode number in this group. if it is greater
                 * we need to update the bg_itable_unused count
                 */
                if (bit >= free)
                        ext4_itable_unused_set(sb, gdp,
                                        (EXT4_INODES_PER_GROUP(sb) - bit - 1));
        } else {
                /* Both branches leave the group locked for the update below. */
                ext4_lock_group(sb, group);
        }

        ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
        if (ext4_has_group_desc_csum(sb)) {
                ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh,
                                           EXT4_INODES_PER_GROUP(sb) / 8);
                ext4_group_desc_csum_set(sb, group, gdp);
        }

        ext4_unlock_group(sb, group);
        err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
        sync_dirty_buffer(group_desc_bh);
out:
        /*
         * Drop the reference obtained from ext4_read_inode_bitmap(); without
         * this every call leaked the bitmap buffer head.  inode_bitmap_bh is
         * still NULL on the early range-check exit, which brelse() accepts.
         */
        brelse(inode_bitmap_bh);
        return err;
}

/*
 * Estimate the journal credits needed for the extended attributes that may
 * be set while creating a new inode in @dir.
 *
 * @dir:     directory the inode is created in
 * @mode:    mode of the new inode
 * @encrypt: true if an encryption context xattr will be set as well
 *
 * The estimate adds up, as configured: credits for @dir's default ACL
 * (counted twice when the new inode is a directory), credits for one or
 * two security xattrs, and credits for the encryption context.
 *
 * Returns the number of credits, or a negative errno if reading the default
 * ACL of @dir fails.
 */
static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
                                            bool encrypt)
{
        struct super_block *sb = dir->i_sb;
        int credits = 0;
#ifdef CONFIG_EXT4_FS_POSIX_ACL
        struct posix_acl *default_acl = get_inode_acl(dir, ACL_TYPE_DEFAULT);

        if (IS_ERR(default_acl))
                return PTR_ERR(default_acl);
        if (default_acl) {
                int acl_size = default_acl->a_count * sizeof(ext4_acl_entry);
                int copies = S_ISDIR(mode) ? 2 : 1;

                credits += copies *
                        __ext4_xattr_set_credits(sb, NULL /* inode */,
                                                 NULL /* block_bh */, acl_size,
                                                 true /* is_create */);
                posix_acl_release(default_acl);
        }
#endif

#ifdef CONFIG_SECURITY
        {
                int security_xattrs = 1;

#ifdef CONFIG_INTEGRITY
                /* One more security xattr is budgeted with CONFIG_INTEGRITY. */
                security_xattrs++;
#endif
                /*
                 * Security xattrs are assumed to stay within 1k each; in
                 * practice they are below 128 bytes.
                 */
                credits += security_xattrs *
                        __ext4_xattr_set_credits(sb, NULL /* inode */,
                                                 NULL /* block_bh */, 1024,
                                                 true /* is_create */);
        }
#endif
        if (encrypt) {
                credits += __ext4_xattr_set_credits(sb,
                                                    NULL /* inode */,
                                                    NULL /* block_bh */,
                                                    FSCRYPT_SET_CONTEXT_MAX_SIZE,
                                                    true /* is_create */);
        }
        return credits;
}

/*
 * There are two policies for allocating an inode.  If the new inode is
 * a directory, then a forward search is made for a block group with both
 * free space and a low directory-to-inode ratio; if that fails, then of
 * the groups with above-average free space, that group with the fewest
 * directories already is chosen.
 *
 * For other inodes, search forward from the parent directory's block
 * group to find a free inode.
 *
 * Parameters:
 *   idmap       - idmap handed to inode_fsuid_set()/inode_init_owner() when
 *                 the owner is derived rather than given explicitly.
 *   handle      - running journal handle, or NULL.  When NULL (and we are
 *                 not in fast-commit replay) a handle is started here with
 *                 @nblocks credits once a candidate inode bit was found.
 *   dir         - parent directory; must be non-NULL with a non-zero link
 *                 count, otherwise -EPERM is returned.
 *   mode        - mode of the new inode; also selects the group search
 *                 policy (S_ISDIR() -> find_group_orlov()).
 *   qstr        - name passed through to find_group_orlov() and
 *                 ext4_init_security().
 *   goal        - preferred inode number.  0 means "use sbi->s_inode_goal";
 *                 a non-zero goal not above s_inodes_count bypasses the
 *                 find_group_*() policies and starts the search at the
 *                 goal's group and bit.
 *   owner       - optional array; owner[0] is written as uid and owner[1]
 *                 as gid of the new inode.
 *   i_flags     - extra flags ORed into ei->i_flags.  With EXT4_EA_INODE_FL
 *                 set, fscrypt preparation, the xattr credit estimate and
 *                 ACL/security initialization are skipped.
 *   handle_type - passed to __ext4_journal_start_sb() if a handle is
 *   line_no       started here.
 *   nblocks     - credits for a handle started here; increased by
 *                 ext4_xattr_credits_for_new_inode() when @handle is NULL,
 *                 a journal exists and EXT4_EA_INODE_FL is not set.
 *
 * Returns the new inode (already inserted with insert_inode_locked()) or an
 * ERR_PTR(): -EPERM, -EIO, -ENOMEM, -ENOSPC, -EFSCORRUPTED or whatever error
 * one of the helpers reported.
 *
 * NOTE(review): a handle started inside this function is not stopped here on
 * any path - presumably the caller picks it up and stops it; confirm.
 */
struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
                               handle_t *handle, struct inode *dir,
                               umode_t mode, const struct qstr *qstr,
                               __u32 goal, uid_t *owner, __u32 i_flags,
                               int handle_type, unsigned int line_no,
                               int nblocks)
{
        struct super_block *sb;
        struct buffer_head *inode_bitmap_bh = NULL;
        struct buffer_head *group_desc_bh;
        ext4_group_t ngroups, group = 0;
        unsigned long ino = 0;
        struct inode *inode;
        struct ext4_group_desc *gdp = NULL;
        struct ext4_inode_info *ei;
        struct ext4_sb_info *sbi;
        int ret2, err;
        struct inode *ret;
        ext4_group_t i;
        ext4_group_t flex_group;
        struct ext4_group_info *grp = NULL;
        bool encrypt = false;

        /* Cannot create files in a deleted directory */
        if (!dir || !dir->i_nlink)
                return ERR_PTR(-EPERM);

        sb = dir->i_sb;
        sbi = EXT4_SB(sb);

        if (unlikely(ext4_forced_shutdown(sb)))
                return ERR_PTR(-EIO);

        ngroups = ext4_get_groups_count(sb);
        trace_ext4_request_inode(dir, mode);
        inode = new_inode(sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        ei = EXT4_I(inode);

        /*
         * Initialize owners and quota early so that we don't have to account
         * for quota initialization worst case in standard inode creating
         * transaction
         */
        if (owner) {
                inode->i_mode = mode;
                i_uid_write(inode, owner[0]);
                i_gid_write(inode, owner[1]);
        } else if (test_opt(sb, GRPID)) {
                inode->i_mode = mode;
                inode_fsuid_set(inode, idmap);
                inode->i_gid = dir->i_gid;
        } else
                inode_init_owner(idmap, inode, dir, mode);

        /* Inherit the parent's project id only if it asks for inheritance */
        if (ext4_has_feature_project(sb) &&
            ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
                ei->i_projid = EXT4_I(dir)->i_projid;
        else
                ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);

        if (!(i_flags & EXT4_EA_INODE_FL)) {
                err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
                if (err)
                        goto out;
        }

        err = dquot_initialize(inode);
        if (err)
                goto out;

        /*
         * Only when the handle will be started below: grow the credit
         * estimate by what the initial xattrs of the new inode may need.
         */
        if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
                ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt);
                if (ret2 < 0) {
                        err = ret2;
                        goto out;
                }
                nblocks += ret2;
        }

        if (!goal)
                goal = sbi->s_inode_goal;

        /* A usable goal pins both the starting group and the starting bit */
        if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
                group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
                ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
                ret2 = 0;
                goto got_group;
        }

        if (S_ISDIR(mode))
                ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
        else
                ret2 = find_group_other(sb, dir, &group, mode);

got_group:
        /* Recorded on the parent even if the group search failed (ret2 == -1) */
        EXT4_I(dir)->i_last_alloc_group = group;
        err = -ENOSPC;
        if (ret2 == -1)
                goto out;

        /*
         * Normally we will only go through one pass of this loop,
         * unless we get unlucky and it turns out the group we selected
         * had its last inode grabbed by someone else.
         */
        for (i = 0; i < ngroups; i++, ino = 0) {
                err = -EIO;

                gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
                if (!gdp)
                        goto out;

                /*
                 * Check free inodes count before loading bitmap.
                 */
                if (ext4_free_inodes_count(sb, gdp) == 0)
                        goto next_group;

                if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
                        grp = ext4_get_group_info(sb, group);
                        /*
                         * Skip groups with already-known suspicious inode
                         * tables
                         */
                        if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
                                goto next_group;
                }

                /* Drop the bitmap of the previously tried group, if any */
                brelse(inode_bitmap_bh);
                inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
                /* Skip groups with suspicious inode tables */
                if (((!(sbi->s_mount_state & EXT4_FC_REPLAY))
                     && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ||
                    IS_ERR(inode_bitmap_bh)) {
                        /* never leave an ERR_PTR behind for the brelse() calls */
                        inode_bitmap_bh = NULL;
                        goto next_group;
                }

repeat_in_this_group:
                ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
                if (!ret2)
                        goto next_group;

                if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
                        ext4_error(sb, "reserved inode found cleared - "
                                   "inode=%lu", ino + 1);
                        ext4_mark_group_bitmap_corrupted(sb, group,
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                        goto next_group;
                }

                /*
                 * No handle was passed in: start one now that a candidate
                 * bit exists (not done during fast-commit replay).
                 */
                if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
                        BUG_ON(nblocks <= 0);
                        handle = __ext4_journal_start_sb(NULL, dir->i_sb,
                                 line_no, handle_type, nblocks, 0,
                                 ext4_trans_default_revoke_credits(sb));
                        if (IS_ERR(handle)) {
                                err = PTR_ERR(handle);
                                ext4_std_error(sb, err);
                                goto out;
                        }
                }
                BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh,
                                                    EXT4_JTR_NONE);
                if (err) {
                        ext4_std_error(sb, err);
                        goto out;
                }
                ext4_lock_group(sb, group);
                ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
                if (ret2) {
                        /* Someone already took the bit. Repeat the search
                         * with lock held.
                         */
                        ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
                        if (ret2) {
                                ext4_set_bit(ino, inode_bitmap_bh->b_data);
                                ret2 = 0;
                        } else {
                                ret2 = 1; /* we didn't grab the inode */
                        }
                }
                ext4_unlock_group(sb, group);
                ino++;                /* the inode bitmap is zero-based */
                if (!ret2)
                        goto got; /* we grabbed the inode! */

                /* Lost the race: retry from the next bit while bits remain */
                if (ino < EXT4_INODES_PER_GROUP(sb))
                        goto repeat_in_this_group;
next_group:
                /* wrap around so every group is visited once */
                if (++group == ngroups)
                        group = 0;
        }
        err = -ENOSPC;
        goto out;

got:
        /* From here on, ino is the 1-based inode index within the group */
        BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
        if (err) {
                ext4_std_error(sb, err);
                goto out;
        }

        BUFFER_TRACE(group_desc_bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, sb, group_desc_bh,
                                            EXT4_JTR_NONE);
        if (err) {
                ext4_std_error(sb, err);
                goto out;
        }

        /* We may have to initialize the block bitmap if it isn't already */
        if (ext4_has_group_desc_csum(sb) &&
            gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                struct buffer_head *block_bitmap_bh;

                block_bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (IS_ERR(block_bitmap_bh)) {
                        err = PTR_ERR(block_bitmap_bh);
                        goto out;
                }
                BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
                err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh,
                                                    EXT4_JTR_NONE);
                if (err) {
                        brelse(block_bitmap_bh);
                        ext4_std_error(sb, err);
                        goto out;
                }

                BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
                /* err is only examined after the flag update and brelse() below */
                err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);

                /* recheck and clear flag under lock if we still need to */
                ext4_lock_group(sb, group);
                if (ext4_has_group_desc_csum(sb) &&
                    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                        ext4_free_group_clusters_set(sb, gdp,
                                ext4_free_clusters_after_init(sb, group, gdp));
                        ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
                        ext4_group_desc_csum_set(sb, group, gdp);
                }
                ext4_unlock_group(sb, group);
                brelse(block_bitmap_bh);

                if (err) {
                        ext4_std_error(sb, err);
                        goto out;
                }
        }

        /* Update the relevant bg descriptor fields */
        if (ext4_has_group_desc_csum(sb)) {
                int free;
                struct ext4_group_info *grp = NULL;

                if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
                        grp = ext4_get_group_info(sb, group);
                        if (!grp) {
                                err = -EFSCORRUPTED;
                                goto out;
                        }
                        down_read(&grp->alloc_sem); /*
                                                     * protect vs itable
                                                     * lazyinit
                                                     */
                }
                ext4_lock_group(sb, group); /* while we modify the bg desc */
                free = EXT4_INODES_PER_GROUP(sb) -
                        ext4_itable_unused_count(sb, gdp);
                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
                        free = 0;
                }
                /*
                 * Check the relative inode number against the last used
                 * relative inode number in this group. if it is greater
                 * we need to update the bg_itable_unused count
                 */
                if (ino > free)
                        ext4_itable_unused_set(sb, gdp,
                                        (EXT4_INODES_PER_GROUP(sb) - ino));
                /* alloc_sem is dropped here; the group lock stays held */
                if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
                        up_read(&grp->alloc_sem);
        } else {
                ext4_lock_group(sb, group);
        }

        /* Group lock is held on both branches above; released below */
        ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
        if (S_ISDIR(mode)) {
                ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
                if (sbi->s_log_groups_per_flex) {
                        ext4_group_t f = ext4_flex_group(sbi, group);

                        atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
                                                        f)->used_dirs);
                }
        }
        if (ext4_has_group_desc_csum(sb)) {
                ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh,
                                           EXT4_INODES_PER_GROUP(sb) / 8);
                ext4_group_desc_csum_set(sb, group, gdp);
        }
        ext4_unlock_group(sb, group);

        BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
        if (err) {
                ext4_std_error(sb, err);
                goto out;
        }

        /* Filesystem-wide and per-flex-group accounting */
        percpu_counter_dec(&sbi->s_freeinodes_counter);
        if (S_ISDIR(mode))
                percpu_counter_inc(&sbi->s_dirs_counter);

        if (sbi->s_log_groups_per_flex) {
                flex_group = ext4_flex_group(sbi, group);
                atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups,
                                                flex_group)->free_inodes);
        }

        /* ino is 1-based within the group, so this yields the global number */
        inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
        /* This is the optimal IO size (for stat), not the fs block size */
        /*
         * NOTE(review): the comment above looks stale - the statement that
         * follows only zeroes i_blocks; confirm and drop it if so.
         */
        inode->i_blocks = 0;
        simple_inode_init_ts(inode);
        ei->i_crtime = inode_get_mtime(inode);

        memset(ei->i_data, 0, sizeof(ei->i_data));
        ei->i_dir_start_lookup = 0;
        ei->i_disksize = 0;

        /* Don't inherit extent flag from directory, amongst others. */
        ei->i_flags =
                ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
        ei->i_flags |= i_flags;
        ei->i_file_acl = 0;
        ei->i_dtime = 0;
        ei->i_block_group = group;
        ei->i_last_alloc_group = ~0;

        ext4_set_inode_flags(inode, true);
        if (IS_DIRSYNC(inode))
                ext4_handle_sync(handle);
        if (insert_inode_locked(inode) < 0) {
                /*
                 * Likely a bitmap corruption causing inode to be allocated
                 * twice.
                 */
                err = -EIO;
                ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
                           inode->i_ino);
                ext4_mark_group_bitmap_corrupted(sb, group,
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                goto out;
        }
        inode->i_generation = get_random_u32();

        /* Precompute checksum seed for inode metadata */
        if (ext4_has_metadata_csum(sb)) {
                __u32 csum;
                __le32 inum = cpu_to_le32(inode->i_ino);
                __le32 gen = cpu_to_le32(inode->i_generation);
                csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
                                   sizeof(inum));
                ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
                                              sizeof(gen));
        }

        ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
        ext4_set_inode_state(inode, EXT4_STATE_NEW);

        ei->i_extra_isize = sbi->s_want_extra_isize;
        ei->i_inline_off = 0;
        if (ext4_has_feature_inline_data(sb) &&
            (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
                ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
        ret = inode;
        err = dquot_alloc_inode(inode);
        if (err)
                goto fail_drop;

        /*
         * Since the encryption xattr will always be unique, create it first so
         * that it's less likely to end up in an external xattr block and
         * prevent its deduplication.
         */
        if (encrypt) {
                err = fscrypt_set_context(inode, handle);
                if (err)
                        goto fail_free_drop;
        }

        if (!(ei->i_flags & EXT4_EA_INODE_FL)) {
                err = ext4_init_acl(handle, inode, dir);
                if (err)
                        goto fail_free_drop;

                err = ext4_init_security(handle, inode, dir, qstr);
                if (err)
                        goto fail_free_drop;
        }

        if (ext4_has_feature_extents(sb)) {
                /* set extent flag only for directory, file and normal symlink*/
                if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
                        ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
                        ext4_ext_tree_init(handle, inode);
                }
        }

        if (ext4_handle_valid(handle)) {
                ei->i_sync_tid = handle->h_transaction->t_tid;
                ei->i_datasync_tid = handle->h_transaction->t_tid;
        }

        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
                ext4_std_error(sb, err);
                goto fail_free_drop;
        }

        ext4_debug("allocating inode %lu\n", inode->i_ino);
        trace_ext4_allocate_inode(inode, dir, mode);
        brelse(inode_bitmap_bh);
        return ret;

        /*
         * Error unwinding, from the latest stage to the earliest:
         *   fail_free_drop - give back the quota charged by dquot_alloc_inode()
         *   fail_drop      - the inode was inserted locked: clear its link
         *                    count and unlock it
         *   out            - drop quota references, mark the inode S_NOQUOTA,
         *                    put it and release the inode bitmap buffer
         */
fail_free_drop:
        dquot_free_inode(inode);
fail_drop:
        clear_nlink(inode);
        unlock_new_inode(inode);
out:
        dquot_drop(inode);
        inode->i_flags |= S_NOQUOTA;
        iput(inode);
        brelse(inode_bitmap_bh);
        return ERR_PTR(err);
}

/*
 * Verify that we are loading a valid orphan from disk.
 *
 * @ino must lie in [EXT4_FIRST_INO(sb), s_inodes_count], have its bit set in
 * the inode bitmap, be readable via ext4_iget(), be truncatable when it still
 * has links, not be a bad inode, and must not chain to an orphan beyond
 * s_inodes_count.
 *
 * Returns the inode on success.  A bitmap read or ext4_iget() failure is
 * passed back as that error pointer; every other rejection is reported and
 * returned as ERR_PTR(-EFSCORRUPTED).
 */
struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
{
        unsigned long ino_limit =
                le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
        struct buffer_head *ibitmap_bh = NULL;
        struct inode *orphan = NULL;
        ext4_group_t grp_no;
        int bitpos;

        if (ino < EXT4_FIRST_INO(sb) || ino > ino_limit)
                goto bad_orphan;

        grp_no = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        bitpos = (ino - 1) % EXT4_INODES_PER_GROUP(sb);

        ibitmap_bh = ext4_read_inode_bitmap(sb, grp_no);
        if (IS_ERR(ibitmap_bh))
                return ERR_CAST(ibitmap_bh);

        /*
         * A set inode bit should be a 100% indicator of a valid orphan
         * (no e2fsck run on fs).  Inodes that were being truncated are
         * orphans too, which is why i_nlink==0 cannot be required here.
         */
        if (!ext4_test_bit(bitpos, ibitmap_bh->b_data))
                goto bad_orphan;

        orphan = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
        if (IS_ERR(orphan)) {
                int iget_err = PTR_ERR(orphan);

                ext4_error_err(sb, -iget_err,
                               "couldn't read orphan inode %lu (err %d)",
                               ino, iget_err);
                brelse(ibitmap_bh);
                return orphan;
        }

        /*
         * An orphan that still has links must be truncatable, or it would
         * never leave the orphan list during processing and we would loop
         * forever.  It must not be a bad inode either, and the next entry
         * of the orphan chain has to be a plausible inode number.
         */
        if ((orphan->i_nlink && !ext4_can_truncate(orphan)) ||
            is_bad_inode(orphan) ||
            NEXT_ORPHAN(orphan) > ino_limit)
                goto bad_orphan;

        brelse(ibitmap_bh);
        return orphan;

bad_orphan:
        ext4_error(sb, "bad orphan inode %lu", ino);
        if (ibitmap_bh)
                printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n",
                       bitpos, (unsigned long long)ibitmap_bh->b_blocknr,
                       ext4_test_bit(bitpos, ibitmap_bh->b_data));
        if (orphan) {
                printk(KERN_ERR "is_bad_inode(inode)=%d\n",
                       is_bad_inode(orphan));
                printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n",
                       NEXT_ORPHAN(orphan));
                printk(KERN_ERR "max_ino=%lu\n", ino_limit);
                printk(KERN_ERR "i_nlink=%u\n", orphan->i_nlink);
                /* A bad deleted inode must not get its blocks freed */
                if (orphan->i_nlink == 0)
                        orphan->i_blocks = 0;
                iput(orphan);
        }
        brelse(ibitmap_bh);
        return ERR_PTR(-EFSCORRUPTED);
}

/*
 * Sum the free-inode counts stored in all block group descriptors.
 *
 * Groups whose descriptor cannot be obtained are skipped.  With EXT4FS_DEBUG
 * defined, the inode bitmaps are additionally read and counted so that the
 * stored and computed numbers can be printed side by side; the returned value
 * is the descriptor-based sum in both configurations.
 */
unsigned long ext4_count_free_inodes(struct super_block *sb)
{
        ext4_group_t grp_no, nr_groups = ext4_get_groups_count(sb);
        struct ext4_group_desc *desc;
        unsigned long stored_total = 0;
#ifdef EXT4FS_DEBUG
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        struct buffer_head *ibitmap_bh = NULL;
        unsigned long counted_total = 0;
        unsigned long counted;
#endif

        for (grp_no = 0; grp_no < nr_groups; grp_no++) {
                desc = ext4_get_group_desc(sb, grp_no, NULL);
                if (!desc)
                        continue;
                stored_total += ext4_free_inodes_count(sb, desc);
#ifdef EXT4FS_DEBUG
                /* only one bitmap buffer is held at any time */
                brelse(ibitmap_bh);
                ibitmap_bh = ext4_read_inode_bitmap(sb, grp_no);
                if (IS_ERR(ibitmap_bh)) {
                        ibitmap_bh = NULL;
                        continue;
                }

                counted = ext4_count_free(ibitmap_bh->b_data,
                                          EXT4_INODES_PER_GROUP(sb) / 8);
                printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
                        (unsigned long) grp_no,
                        ext4_free_inodes_count(sb, desc), counted);
                counted_total += counted;
#else
                cond_resched();
#endif
        }
#ifdef EXT4FS_DEBUG
        brelse(ibitmap_bh);
        printk(KERN_DEBUG "ext4_count_free_inodes: "
               "stored = %u, computed = %lu, %lu\n",
               le32_to_cpu(es->s_free_inodes_count), stored_total,
               counted_total);
#endif
        return stored_total;
}

/*
 * Sum the used-directory counts of all block group descriptors; groups
 * whose descriptor cannot be obtained contribute nothing.
 *
 * Called at mount-time, super-block is locked.
 */
unsigned long ext4_count_dirs(struct super_block *sb)
{
        const ext4_group_t nr_groups = ext4_get_groups_count(sb);
        struct ext4_group_desc *desc;
        unsigned long total = 0;
        ext4_group_t grp_no;

        for (grp_no = 0; grp_no < nr_groups; grp_no++) {
                desc = ext4_get_group_desc(sb, grp_no, NULL);
                if (desc)
                        total += ext4_used_dirs_count(sb, desc);
        }

        return total;
}

/*
 * Zeroes not yet zeroed inode table - just write zeroes through the whole
 * inode table. Must be called without any spinlock held. The only place
 * where it is called from on active part of filesystem is ext4lazyinit
 * thread, so we do not need any special locks, however we have to prevent
 * inode allocation from the current group, so we take alloc_sem lock, to
 * block ext4_new_inode() until we are finished.
 *
 * @sb:      super block of the filesystem
 * @group:   block group whose inode table should be zeroed
 * @barrier: when non-zero, blkdev_issue_flush() is called after the zeroout
 *
 * Returns 0 when there is nothing to do (descriptor or group info missing,
 * or EXT4_BG_INODE_ZEROED already set) and on success, 1 when the
 * descriptor's unused-inode count is inconsistent, otherwise the error
 * reported by the journal or zeroout helpers.
 */
int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
                                 int barrier)
{
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_desc *gdp = NULL;
        struct buffer_head *group_desc_bh;
        handle_t *handle;
        ext4_fsblk_t blk;
        int num, ret = 0, used_blks = 0;
        unsigned long used_inos = 0;

        gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
        if (!gdp || !grp)
                goto out;

        /*
         * We do not need to lock this, because we are the only one
         * handling this flag.
         */
        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
                goto out;

        handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out;
        }

        /* Held until err_out; keeps inode allocation out of this group */
        down_write(&grp->alloc_sem);
        /*
         * If inode bitmap was already initialized there may be some
         * used inodes so we need to skip blocks with used inodes in
         * inode table.
         */
        if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
                used_inos = EXT4_INODES_PER_GROUP(sb) -
                            ext4_itable_unused_count(sb, gdp);
                used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block);

                /* Bogus inode unused count? */
                if (used_blks < 0 || used_blks > sbi->s_itb_per_group) {
                        ext4_error(sb, "Something is wrong with group %u: "
                                   "used itable blocks: %d; "
                                   "itable unused count: %u",
                                   group, used_blks,
                                   ext4_itable_unused_count(sb, gdp));
                        ret = 1;
                        goto err_out;
                }

                /* turn the per-group count into a filesystem-wide inode number */
                used_inos += group * EXT4_INODES_PER_GROUP(sb);
                /*
                 * Are there some uninitialized inodes in the inode table
                 * before the first normal inode?
                 */
                if ((used_blks != sbi->s_itb_per_group) &&
                     (used_inos < EXT4_FIRST_INO(sb))) {
                        ext4_error(sb, "Something is wrong with group %u: "
                                   "itable unused count: %u; "
                                   "itables initialized count: %ld",
                                   group, ext4_itable_unused_count(sb, gdp),
                                   used_inos);
                        ret = 1;
                        goto err_out;
                }
        }

        /* First block past the used part of the table, and how many remain */
        blk = ext4_inode_table(sb, gdp) + used_blks;
        num = sbi->s_itb_per_group - used_blks;

        BUFFER_TRACE(group_desc_bh, "get_write_access");
        ret = ext4_journal_get_write_access(handle, sb, group_desc_bh,
                                            EXT4_JTR_NONE);
        if (ret)
                goto err_out;

        /*
         * Skip zeroout if the inode table is full. But we set the ZEROED
         * flag anyway, because obviously, when it is full it does not need
         * further zeroing.
         */
        if (unlikely(num == 0))
                goto skip_zeroout;

        ext4_debug("going to zero out inode table in group %d\n",
                   group);
        ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
        if (ret < 0)
                goto err_out;
        /* NOTE(review): the result of the flush is not checked */
        if (barrier)
                blkdev_issue_flush(sb->s_bdev);

skip_zeroout:
        ext4_lock_group(sb, group);
        gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
        ext4_group_desc_csum_set(sb, group, gdp);
        ext4_unlock_group(sb, group);

        BUFFER_TRACE(group_desc_bh,
                     "call ext4_handle_dirty_metadata");
        ret = ext4_handle_dirty_metadata(handle, NULL,
                                         group_desc_bh);

        /* Common exit once alloc_sem is held and the handle is running */
err_out:
        up_write(&grp->alloc_sem);
        ext4_journal_stop(handle);
out:
        return ret;
}










    2 















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM mmap

#if !defined(_TRACE_MMAP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MMAP_H

#include <linux/tracepoint.h>

/*
 * vm_unmapped_area - records the outcome of an unmapped-area search.
 *
 * @addr: result of the search; either an address or an error value
 *        (distinguished with IS_ERR_VALUE() when printing).
 * @info: search request; its flags, length, low/high limits, align_mask and
 *        align_offset are copied into the event.
 *
 * The current task's mm->total_vm is captured as well.  In the output,
 * "addr" is printed as 0 when @addr is an error value and "err" is printed
 * as 0 when it is not.
 */
TRACE_EVENT(vm_unmapped_area,

        TP_PROTO(unsigned long addr, struct vm_unmapped_area_info *info),

        TP_ARGS(addr, info),

        TP_STRUCT__entry(
                __field(unsigned long,        addr)
                __field(unsigned long,        total_vm)
                __field(unsigned long,        flags)
                __field(unsigned long,        length)
                __field(unsigned long,        low_limit)
                __field(unsigned long,        high_limit)
                __field(unsigned long,        align_mask)
                __field(unsigned long,        align_offset)
        ),

        TP_fast_assign(
                __entry->addr = addr;
                __entry->total_vm = current->mm->total_vm;
                __entry->flags = info->flags;
                __entry->length = info->length;
                __entry->low_limit = info->low_limit;
                __entry->high_limit = info->high_limit;
                __entry->align_mask = info->align_mask;
                __entry->align_offset = info->align_offset;
        ),

        /* addr and err are mutually exclusive views of the same stored value */
        TP_printk("addr=0x%lx err=%ld total_vm=0x%lx flags=0x%lx len=0x%lx lo=0x%lx hi=0x%lx mask=0x%lx ofs=0x%lx",
                IS_ERR_VALUE(__entry->addr) ? 0 : __entry->addr,
                IS_ERR_VALUE(__entry->addr) ? __entry->addr : 0,
                __entry->total_vm, __entry->flags, __entry->length,
                __entry->low_limit, __entry->high_limit, __entry->align_mask,
                __entry->align_offset)
);

/*
 * vma_mas_szero - records a maple tree pointer together with a start/end
 * pair.
 *
 * @mt:    maple tree the event refers to
 * @start: first value of the recorded range
 * @end:   second value of the recorded range
 *
 * Printed in the same "mt_mod" line format as vma_store, with a literal
 * "(NULL)" in the entry slot and "SNULL" as the operation name.
 */
TRACE_EVENT(vma_mas_szero,
        TP_PROTO(struct maple_tree *mt, unsigned long start,
                 unsigned long end),

        TP_ARGS(mt, start, end),

        TP_STRUCT__entry(
                        __field(struct maple_tree *, mt)
                        __field(unsigned long, start)
                        __field(unsigned long, end)
        ),

        TP_fast_assign(
                        __entry->mt                = mt;
                        __entry->start                = start;
                        __entry->end                = end;
        ),

        TP_printk("mt_mod %p, (NULL), SNULL, %lu, %lu,",
                  __entry->mt,
                  (unsigned long) __entry->start,
                  (unsigned long) __entry->end
        )
);

/*
 * vma_store - records a VMA and the maple tree it is associated with in
 * this event.
 *
 * @mt:  maple tree the event refers to
 * @vma: the VMA; its vm_start and vm_end are sampled when the event fires
 *
 * Note that the recorded end is vma->vm_end - 1, i.e. the last address
 * inside the VMA rather than vm_end itself.
 */
TRACE_EVENT(vma_store,
        TP_PROTO(struct maple_tree *mt, struct vm_area_struct *vma),

        TP_ARGS(mt, vma),

        TP_STRUCT__entry(
                        __field(struct maple_tree *, mt)
                        __field(struct vm_area_struct *, vma)
                        __field(unsigned long, vm_start)
                        __field(unsigned long, vm_end)
        ),

        TP_fast_assign(
                        __entry->mt                = mt;
                        __entry->vma                = vma;
                        __entry->vm_start        = vma->vm_start;
                        __entry->vm_end                = vma->vm_end - 1;
        ),

        TP_printk("mt_mod %p, (%p), STORE, %lu, %lu,",
                  __entry->mt, __entry->vma,
                  (unsigned long) __entry->vm_start,
                  (unsigned long) __entry->vm_end
        )
);


/*
 * exit_mmap - records an mm and the address of its embedded maple tree.
 *
 * @mm: the mm_struct the event refers to
 *
 * Both the mm pointer and &mm->mm_mt are stored in the event, but only the
 * maple tree pointer appears in the printed "mt_mod ..., DESTROY" line.
 */
TRACE_EVENT(exit_mmap,
        TP_PROTO(struct mm_struct *mm),

        TP_ARGS(mm),

        TP_STRUCT__entry(
                        __field(struct mm_struct *, mm)
                        __field(struct maple_tree *, mt)
        ),

        TP_fast_assign(
                       __entry->mm                = mm;
                       __entry->mt                = &mm->mm_mt;
        ),

        TP_printk("mt_mod %p, DESTROY",
                  __entry->mt
        )
);

#endif

/* This part must be outside protection */
#include <trace/define_trace.h>














































    2 
















    3 


    3 















































    3 






    3 














    3 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/mempool.h>
#include <linux/slab.h>

/* Number of entries in the sg_pools[] table below. */
#define SG_MEMPOOL_NR                ARRAY_SIZE(sg_pools)
/*
 * First argument given to mempool_create_slab_pool() for every pool
 * (presumably the number of objects each mempool keeps in reserve).
 */
#define SG_MEMPOOL_SIZE                2

/*
 * One scatterlist pool: a slab cache and the mempool created on top of it.
 * Both pointers are filled in by sg_pool_init().
 */
struct sg_pool {
        /* Scatterlist entries per object; multiplied by sizeof(struct scatterlist) at init. */
        size_t                size;
        /* Name handed to kmem_cache_create() and used in init error messages. */
        char                *name;
        /* Slab cache backing this pool. */
        struct kmem_cache        *slab;
        /* Mempool that sg_pool_alloc()/sg_pool_free() operate on. */
        mempool_t        *pool;
};

/*
 * SP(x) builds one sg_pool initializer: .size is x and the following
 * positional initializer (the member after .size, i.e. the name) is the
 * string "sgpool-" followed by the stringified x.
 */
#define SP(x) { .size = x, "sgpool-" __stringify(x) }
#if (SG_CHUNK_SIZE < 32)
#error SG_CHUNK_SIZE is too small (must be 32 or greater)
#endif
/*
 * Pool table: 8 and 16 entries always, then 32/64/128 as long as they are
 * below SG_CHUNK_SIZE, and finally SG_CHUNK_SIZE itself.  sg_pool_index()
 * selects an entry from this table by entry count.
 */
static struct sg_pool sg_pools[] = {
        SP(8),
        SP(16),
#if (SG_CHUNK_SIZE > 32)
        SP(32),
#if (SG_CHUNK_SIZE > 64)
        SP(64),
#if (SG_CHUNK_SIZE > 128)
        SP(128),
#if (SG_CHUNK_SIZE > 256)
#error SG_CHUNK_SIZE is too large (256 MAX)
#endif
#endif
#endif
#endif
        SP(SG_CHUNK_SIZE)
};
#undef SP

/*
 * Translate an entry count into an index into sg_pools[].
 *
 * Counts above SG_CHUNK_SIZE trip BUG_ON().  Counts up to 8 use the first
 * pool; larger counts use get_count_order(nents) - 3.
 */
static inline unsigned int sg_pool_index(unsigned short nents)
{
        BUG_ON(nents > SG_CHUNK_SIZE);

        /* The smallest pool covers every request of 8 entries or fewer. */
        if (nents <= 8)
                return 0;

        return get_count_order(nents) - 3;
}

/*
 * Hand @sgl back to the mempool selected by @nents.  Passed as the free
 * callback to __sg_free_table().
 */
static void sg_pool_free(struct scatterlist *sgl, unsigned int nents)
{
        struct sg_pool *owner = &sg_pools[sg_pool_index(nents)];

        mempool_free(sgl, owner->pool);
}

/*
 * Allocate a scatterlist array from the mempool selected by @nents, using
 * @gfp_mask.  Passed as the allocation callback to __sg_alloc_table().
 */
static struct scatterlist *sg_pool_alloc(unsigned int nents, gfp_t gfp_mask)
{
        struct sg_pool *source = &sg_pools[sg_pool_index(nents)];

        return mempool_alloc(source->pool, gfp_mask);
}

/**
 * sg_free_table_chained - Free a previously mapped sg table
 * @table:        The sg table header to use
 * @nents_first_chunk: size of the first_chunk SGL passed to
 *                sg_alloc_table_chained
 *
 *  Description:
 *    Release an sg table that was set up by sg_alloc_table_chained().
 *    Nothing is freed when the table fits entirely within the first chunk.
 *
 *    @nents_first_chunk must be the same value that was passed to
 *    sg_alloc_table_chained().
 *
 **/
void sg_free_table_chained(struct sg_table *table,
                unsigned nents_first_chunk)
{
        unsigned int first_chunk_nents;

        /* Table never grew past the caller-provided chunk: nothing to free. */
        if (table->orig_nents <= nents_first_chunk)
                return;

        /* A first chunk of one entry is treated as no first chunk at all. */
        first_chunk_nents = (nents_first_chunk == 1) ? 0 : nents_first_chunk;

        __sg_free_table(table, SG_CHUNK_SIZE, first_chunk_nents, sg_pool_free,
                        table->orig_nents);
}
EXPORT_SYMBOL_GPL(sg_free_table_chained);

/**
 * sg_alloc_table_chained - Allocate and chain SGLs in an sg table
 * @table:        The sg table header to use
 * @nents:        Number of entries in sg list
 * @first_chunk: first SGL
 * @nents_first_chunk: number of the SGL of @first_chunk
 *
 *  Description:
 *    Allocate and chain SGLs in an sg table. If @nents is larger than
 *    @nents_first_chunk a chained sg table will be setup. @first_chunk is
 *    ignored if nents_first_chunk <= 1 because the user expects the SGL to
 *    point to a non-chain SGL.
 *
 *    Returns 0 on success, or the error from __sg_alloc_table() after the
 *    partially built table has been freed.
 *
 **/
int sg_alloc_table_chained(struct sg_table *table, int nents,
                struct scatterlist *first_chunk, unsigned nents_first_chunk)
{
        int err;

        BUG_ON(!nents);

        /*
         * Everything fits in the caller's first chunk: initialise the
         * table's existing sgl in place and allocate nothing.
         */
        if (first_chunk && nents_first_chunk && nents <= nents_first_chunk) {
                table->nents = table->orig_nents = nents;
                sg_init_table(table->sgl, nents);
                return 0;
        }

        /* User supposes that the 1st SGL includes real entry */
        if (nents_first_chunk <= 1) {
                nents_first_chunk = 0;
                first_chunk = NULL;
        }

        err = __sg_alloc_table(table, nents, SG_CHUNK_SIZE,
                               first_chunk, nents_first_chunk,
                               GFP_ATOMIC, sg_pool_alloc);
        if (unlikely(err))
                sg_free_table_chained(table, nents_first_chunk);

        return err;
}
EXPORT_SYMBOL_GPL(sg_alloc_table_chained);

/*
 * Create the slab cache and mempool for every entry of sg_pools[].
 *
 * Returns 0 on success.  On any failure all entries are torn down and
 * -ENOMEM is returned; the teardown walks the whole table, so it relies on
 * mempool_destroy() and kmem_cache_destroy() accepting pointers that were
 * never set.
 */
static __init int sg_pool_init(void)
{
        struct sg_pool *sgp;
        int i;

        for (i = 0; i < SG_MEMPOOL_NR; i++) {
                int size;

                sgp = &sg_pools[i];
                size = sgp->size * sizeof(struct scatterlist);

                sgp->slab = kmem_cache_create(sgp->name, size, 0,
                                SLAB_HWCACHE_ALIGN, NULL);
                if (!sgp->slab) {
                        printk(KERN_ERR "SG_POOL: can't init sg slab %s\n",
                                        sgp->name);
                        goto err_destroy;
                }

                sgp->pool = mempool_create_slab_pool(SG_MEMPOOL_SIZE,
                                                     sgp->slab);
                if (!sgp->pool) {
                        printk(KERN_ERR "SG_POOL: can't init sg mempool %s\n",
                                        sgp->name);
                        goto err_destroy;
                }
        }

        return 0;

err_destroy:
        for (i = 0; i < SG_MEMPOOL_NR; i++) {
                sgp = &sg_pools[i];

                mempool_destroy(sgp->pool);
                kmem_cache_destroy(sgp->slab);
        }

        return -ENOMEM;
}

subsys_initcall(sg_pool_init);


























































































































































































































































































































































































































































    2 











    2 









    3 


































    3 





    3 





    1 
    3 






















































    3 













    3 







    3 













    3 
























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Virtio SCSI HBA driver
 *
 * Copyright IBM Corp. 2010
 * Copyright Red Hat, Inc. 2011
 *
 * Authors:
 *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
 *  Paolo Bonzini   <pbonzini@redhat.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mempool.h>
#include <linux/interrupt.h>
#include <linux/virtio.h>
#include <linux/virtio_ids.h>
#include <linux/virtio_config.h>
#include <linux/virtio_scsi.h>
#include <linux/cpu.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_tcq.h>
#include <scsi/scsi_devinfo.h>
#include <linux/seqlock.h>
#include <linux/blk-mq-virtio.h>

#include "sd.h"

/* NOTE(review): presumably the reserve size of the command mempool; its user is not in this part of the file. */
#define VIRTIO_SCSI_MEMPOOL_SZ 64
/* Number of event buffers kept in virtio_scsi.event_list[]. */
#define VIRTIO_SCSI_EVENT_LEN 8
/* Offset subtracted from a virtqueue index to get its slot in req_vqs[]. */
#define VIRTIO_SCSI_VQ_BASE 2

/* Module parameter (mode 0644); see the description string below. */
static unsigned int virtscsi_poll_queues;
module_param(virtscsi_poll_queues, uint, 0644);
MODULE_PARM_DESC(virtscsi_poll_queues,
                 "The number of dedicated virtqueues for polling I/O");

/*
 * Command queue element.
 *
 * Holds one request header and one response header.  Which union member is
 * in use depends on the kind of request (SCSI command, SCSI command with
 * protection information, task management function, ...).
 */
struct virtio_scsi_cmd {
        /* SCSI command this element belongs to; set by virtscsi_queuecommand(). */
        struct scsi_cmnd *sc;
        /* Optional completion signalled by virtscsi_complete_free(). */
        struct completion *comp;
        /* Request header handed to the device. */
        union {
                struct virtio_scsi_cmd_req       cmd;
                struct virtio_scsi_cmd_req_pi    cmd_pi;
                struct virtio_scsi_ctrl_tmf_req  tmf;
                struct virtio_scsi_ctrl_an_req   an;
        } req;
        /* Response header filled in by the device. */
        union {
                struct virtio_scsi_cmd_resp      cmd;
                struct virtio_scsi_ctrl_tmf_resp tmf;
                struct virtio_scsi_ctrl_an_resp  an;
                struct virtio_scsi_event         evt;
        } resp;
} ____cacheline_aligned_in_smp;

/*
 * One event buffer posted to the event virtqueue, together with the work
 * item that processes it (virtscsi_handle_event()).
 */
struct virtio_scsi_event_node {
        /* Owning driver instance; set in virtscsi_kick_event_all(). */
        struct virtio_scsi *vscsi;
        /* Buffer the device writes the event into. */
        struct virtio_scsi_event event;
        /* Queued by virtscsi_complete_event() when the buffer is returned. */
        struct work_struct work;
};

/* A virtqueue paired with the spinlock that serialises access to it. */
struct virtio_scsi_vq {
        /* Protects vq */
        spinlock_t vq_lock;

        struct virtqueue *vq;
};

/* Driver instance state */
struct virtio_scsi {
        /* Underlying virtio device; also used for endian conversion helpers. */
        struct virtio_device *vdev;

        /* Get some buffers ready for event vq */
        struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];

        /* Number of entries in req_vqs[] walked by virtscsi_poll_requests(). */
        u32 num_queues;
        int io_queues[HCTX_MAX_TYPES];

        struct hlist_node node;

        /* Protected by event_vq lock */
        bool stop_events;

        struct virtio_scsi_vq ctrl_vq;
        struct virtio_scsi_vq event_vq;
        /* Request queues, indexed by (virtqueue index - VIRTIO_SCSI_VQ_BASE). */
        struct virtio_scsi_vq req_vqs[];
};

/* NOTE(review): presumably the slab cache backing virtscsi_cmd_pool; setup is not in this part of the file. */
static struct kmem_cache *virtscsi_cmd_cache;
/* Mempool that task-management commands are allocated from and freed to. */
static mempool_t *virtscsi_cmd_pool;

/* Return the SCSI host kept in the virtio device's private pointer. */
static inline struct Scsi_Host *virtio_scsi_host(struct virtio_device *vdev)
{
        struct Scsi_Host *shost = vdev->priv;

        return shost;
}

/*
 * Record a non-zero residual on @sc, clamped to the command's buffer
 * length.  A zero @resid leaves the command untouched.
 */
static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
{
        if (!resid)
                return;

        scsi_set_resid(sc, min(resid, scsi_bufflen(sc)));
}

/*
 * virtscsi_complete_cmd - finish a scsi_cmd and invoke scsi_done
 *
 * Called with vq_lock held.
 */
static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
{
        struct virtio_scsi_cmd *cmd = buf;
        struct scsi_cmnd *sc = cmd->sc;
        struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;

        dev_dbg(&sc->device->sdev_gendev,
                "cmd %p response %u status %#02x sense_len %u\n",
                sc, resp->response, resp->status, resp->sense_len);

        sc->result = resp->status;
        virtscsi_compute_resid(sc, virtio32_to_cpu(vscsi->vdev, resp->resid));
        switch (resp->response) {
        case VIRTIO_SCSI_S_OK:
                set_host_byte(sc, DID_OK);
                break;
        case VIRTIO_SCSI_S_OVERRUN:
                set_host_byte(sc, DID_ERROR);
                break;
        case VIRTIO_SCSI_S_ABORTED:
                set_host_byte(sc, DID_ABORT);
                break;
        case VIRTIO_SCSI_S_BAD_TARGET:
                set_host_byte(sc, DID_BAD_TARGET);
                break;
        case VIRTIO_SCSI_S_RESET:
                set_host_byte(sc, DID_RESET);
                break;
        case VIRTIO_SCSI_S_BUSY:
                set_host_byte(sc, DID_BUS_BUSY);
                break;
        case VIRTIO_SCSI_S_TRANSPORT_FAILURE:
                set_host_byte(sc, DID_TRANSPORT_DISRUPTED);
                break;
        case VIRTIO_SCSI_S_TARGET_FAILURE:
                set_host_byte(sc, DID_BAD_TARGET);
                break;
        case VIRTIO_SCSI_S_NEXUS_FAILURE:
                set_status_byte(sc, SAM_STAT_RESERVATION_CONFLICT);
                break;
        default:
                scmd_printk(KERN_WARNING, sc, "Unknown response %d",
                            resp->response);
                fallthrough;
        case VIRTIO_SCSI_S_FAILURE:
                set_host_byte(sc, DID_ERROR);
                break;
        }

        WARN_ON(virtio32_to_cpu(vscsi->vdev, resp->sense_len) >
                VIRTIO_SCSI_SENSE_SIZE);
        if (resp->sense_len) {
                memcpy(sc->sense_buffer, resp->sense,
                       min_t(u32,
                             virtio32_to_cpu(vscsi->vdev, resp->sense_len),
                             VIRTIO_SCSI_SENSE_SIZE));
        }

        scsi_done(sc);
}

/*
 * Drain every completed buffer from @virtscsi_vq and pass each one to @fn.
 *
 * Runs under the queue's vq_lock with interrupts saved.  Callbacks are
 * disabled while draining; the drain is repeated until
 * virtqueue_enable_cb() returns true.
 */
static void virtscsi_vq_done(struct virtio_scsi *vscsi,
                             struct virtio_scsi_vq *virtscsi_vq,
                             void (*fn)(struct virtio_scsi *vscsi, void *buf))
{
        struct virtqueue *vq = virtscsi_vq->vq;
        unsigned long flags;
        unsigned int len;
        void *buf;

        spin_lock_irqsave(&virtscsi_vq->vq_lock, flags);
        for (;;) {
                virtqueue_disable_cb(vq);

                for (buf = virtqueue_get_buf(vq, &len); buf != NULL;
                     buf = virtqueue_get_buf(vq, &len))
                        fn(vscsi, buf);

                if (virtqueue_enable_cb(vq))
                        break;
        }
        spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags);
}

/*
 * Request virtqueue callback: find the request queue matching @vq and
 * complete every command it has finished.
 */
static void virtscsi_req_done(struct virtqueue *vq)
{
        struct virtio_scsi *vscsi = shost_priv(virtio_scsi_host(vq->vdev));
        int req_idx = vq->index - VIRTIO_SCSI_VQ_BASE;

        virtscsi_vq_done(vscsi, &vscsi->req_vqs[req_idx],
                         virtscsi_complete_cmd);
}

/* Drain completed commands from every request queue of @vscsi. */
static void virtscsi_poll_requests(struct virtio_scsi *vscsi)
{
        const int nr_vqs = vscsi->num_queues;
        int q;

        for (q = 0; q < nr_vqs; q++) {
                struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[q];

                virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd);
        }
}

/*
 * Completion handler for control-queue commands: wake the waiter, if the
 * command carries a completion.
 */
static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
{
        struct virtio_scsi_cmd *cmd = buf;
        struct completion *waiter = cmd->comp;

        if (!waiter)
                return;

        complete(waiter);
}

/* Control virtqueue callback: drain finished control commands. */
static void virtscsi_ctrl_done(struct virtqueue *vq)
{
        struct virtio_scsi *vscsi = shost_priv(virtio_scsi_host(vq->vdev));

        virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free);
}

static void virtscsi_handle_event(struct work_struct *work);

static int virtscsi_kick_event(struct virtio_scsi *vscsi,
                               struct virtio_scsi_event_node *event_node)
{
        int err;
        struct scatterlist sg;
        unsigned long flags;

        INIT_WORK(&event_node->work, virtscsi_handle_event);
        sg_init_one(&sg, &event_node->event, sizeof(struct virtio_scsi_event));

        spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);

        err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node,
                                  GFP_ATOMIC);
        if (!err)
                virtqueue_kick(vscsi->event_vq.vq);

        spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);

        return err;
}

static int virtscsi_kick_event_all(struct virtio_scsi *vscsi)
{
        int i;

        for (i = 0; i < VIRTIO_SCSI_EVENT_LEN; i++) {
                vscsi->event_list[i].vscsi = vscsi;
                virtscsi_kick_event(vscsi, &vscsi->event_list[i]);
        }

        return 0;
}

static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi)
{
        int i;

        /* Stop scheduling work before calling cancel_work_sync.  */
        spin_lock_irq(&vscsi->event_vq.vq_lock);
        vscsi->stop_events = true;
        spin_unlock_irq(&vscsi->event_vq.vq_lock);

        for (i = 0; i < VIRTIO_SCSI_EVENT_LEN; i++)
                cancel_work_sync(&vscsi->event_list[i].work);
}

static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi,
                                            struct virtio_scsi_event *event)
{
        struct scsi_device *sdev;
        struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
        unsigned int target = event->lun[1];
        unsigned int lun = (event->lun[2] << 8) | event->lun[3];

        switch (virtio32_to_cpu(vscsi->vdev, event->reason)) {
        case VIRTIO_SCSI_EVT_RESET_RESCAN:
                if (lun == 0) {
                        scsi_scan_target(&shost->shost_gendev, 0, target,
                                         SCAN_WILD_CARD, SCSI_SCAN_INITIAL);
                } else {
                        scsi_add_device(shost, 0, target, lun);
                }
                break;
        case VIRTIO_SCSI_EVT_RESET_REMOVED:
                sdev = scsi_device_lookup(shost, 0, target, lun);
                if (sdev) {
                        scsi_remove_device(sdev);
                        scsi_device_put(sdev);
                } else {
                        pr_err("SCSI device %d 0 %d %d not found\n",
                                shost->host_no, target, lun);
                }
                break;
        default:
                pr_info("Unsupported virtio scsi event reason %x\n", event->reason);
        }
}

static void virtscsi_handle_param_change(struct virtio_scsi *vscsi,
                                         struct virtio_scsi_event *event)
{
        struct scsi_device *sdev;
        struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
        unsigned int target = event->lun[1];
        unsigned int lun = (event->lun[2] << 8) | event->lun[3];
        u8 asc = virtio32_to_cpu(vscsi->vdev, event->reason) & 255;
        u8 ascq = virtio32_to_cpu(vscsi->vdev, event->reason) >> 8;

        sdev = scsi_device_lookup(shost, 0, target, lun);
        if (!sdev) {
                pr_err("SCSI device %d 0 %d %d not found\n",
                        shost->host_no, target, lun);
                return;
        }

        /* Handle "Parameters changed", "Mode parameters changed", and
           "Capacity data has changed".  */
        if (asc == 0x2a && (ascq == 0x00 || ascq == 0x01 || ascq == 0x09))
                scsi_rescan_device(sdev);

        scsi_device_put(sdev);
}

/*
 * Send an INQUIRY to every device on the host and remove those that turn
 * out to be gone.
 *
 * Returns 0, or -ENOMEM when the INQUIRY buffer cannot be allocated.
 */
static int virtscsi_rescan_hotunplug(struct virtio_scsi *vscsi)
{
        struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
        unsigned char scsi_cmd[MAX_COMMAND_SIZE];
        int inq_result_len = 256;
        struct scsi_device *sdev;
        char *inq_result;

        inq_result = kmalloc(inq_result_len, GFP_KERNEL);
        if (!inq_result)
                return -ENOMEM;

        shost_for_each_device(sdev, shost) {
                int inquiry_len = sdev->inquiry_len ? sdev->inquiry_len : 36;
                bool unplugged;
                int result;

                memset(scsi_cmd, 0, sizeof(scsi_cmd));
                scsi_cmd[0] = INQUIRY;
                scsi_cmd[4] = (unsigned char) inquiry_len;

                memset(inq_result, 0, inq_result_len);

                result = scsi_execute_cmd(sdev, scsi_cmd, REQ_OP_DRV_IN,
                                          inq_result, inquiry_len,
                                          SD_TIMEOUT, SD_MAX_RETRIES, NULL);

                /*
                 * Two cases mean the device is gone:
                 *  - the INQUIRY succeeded but the peripheral qualifier
                 *    says the LUN is not attached;
                 *  - once all LUNs of a virtio-scsi device are unplugged,
                 *    it responds with BAD TARGET to any INQUIRY command.
                 */
                unplugged = (result == 0 && (inq_result[0] >> 5)) ||
                            (result > 0 &&
                             host_byte(result) == DID_BAD_TARGET);
                if (unplugged)
                        scsi_remove_device(sdev);
        }

        kfree(inq_result);
        return 0;
}

static void virtscsi_handle_event(struct work_struct *work)
{
        struct virtio_scsi_event_node *event_node =
                container_of(work, struct virtio_scsi_event_node, work);
        struct virtio_scsi *vscsi = event_node->vscsi;
        struct virtio_scsi_event *event = &event_node->event;

        if (event->event &
            cpu_to_virtio32(vscsi->vdev, VIRTIO_SCSI_T_EVENTS_MISSED)) {
                int ret;

                event->event &= ~cpu_to_virtio32(vscsi->vdev,
                                                   VIRTIO_SCSI_T_EVENTS_MISSED);
                ret = virtscsi_rescan_hotunplug(vscsi);
                if (ret)
                        return;
                scsi_scan_host(virtio_scsi_host(vscsi->vdev));
        }

        switch (virtio32_to_cpu(vscsi->vdev, event->event)) {
        case VIRTIO_SCSI_T_NO_EVENT:
                break;
        case VIRTIO_SCSI_T_TRANSPORT_RESET:
                virtscsi_handle_transport_reset(vscsi, event);
                break;
        case VIRTIO_SCSI_T_PARAM_CHANGE:
                virtscsi_handle_param_change(vscsi, event);
                break;
        default:
                pr_err("Unsupported virtio scsi event %x\n", event->event);
        }
        virtscsi_kick_event(vscsi, event_node);
}

/*
 * Event-queue completion handler: queue the buffer's work item on the
 * freezable system workqueue unless event processing has been stopped.
 */
static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf)
{
        struct virtio_scsi_event_node *event_node = buf;

        if (vscsi->stop_events)
                return;

        queue_work(system_freezable_wq, &event_node->work);
}

/* Event virtqueue callback: drain the event buffers that came back. */
static void virtscsi_event_done(struct virtqueue *vq)
{
        struct virtio_scsi *vscsi = shost_priv(virtio_scsi_host(vq->vdev));

        virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event);
}

/*
 * Build the scatterlist array for @cmd and add it to @vq.
 *
 * Device-readable part: request header, then (for commands sending data)
 * the optional protection SGL and the data SGL.  Device-writable part:
 * response header, then (for commands receiving data) the optional
 * protection SGL and the data SGL.
 *
 * Returns the result of virtqueue_add_sgs().
 */
static int __virtscsi_add_cmd(struct virtqueue *vq,
                            struct virtio_scsi_cmd *cmd,
                            size_t req_size, size_t resp_size)
{
        struct scsi_cmnd *sc = cmd->sc;
        struct scatterlist *sgs[6], req, resp;
        struct sg_table *out = NULL, *in = NULL;
        unsigned n_out = 0, n_in = 0;

        if (sc && sc->sc_data_direction != DMA_NONE) {
                if (sc->sc_data_direction != DMA_FROM_DEVICE)
                        out = &sc->sdb.table;
                if (sc->sc_data_direction != DMA_TO_DEVICE)
                        in = &sc->sdb.table;
        }

        /* Request header.  */
        sg_init_one(&req, &cmd->req, req_size);
        sgs[n_out] = &req;
        n_out++;

        /* Data-out buffer.  */
        if (out) {
                /* Place WRITE protection SGLs before Data OUT payload */
                if (scsi_prot_sg_count(sc)) {
                        sgs[n_out] = scsi_prot_sglist(sc);
                        n_out++;
                }
                sgs[n_out] = out->sgl;
                n_out++;
        }

        /* Response header.  */
        sg_init_one(&resp, &cmd->resp, resp_size);
        sgs[n_out + n_in] = &resp;
        n_in++;

        /* Data-in buffer */
        if (in) {
                /* Place READ protection SGLs before Data IN payload */
                if (scsi_prot_sg_count(sc)) {
                        sgs[n_out + n_in] = scsi_prot_sglist(sc);
                        n_in++;
                }
                sgs[n_out + n_in] = in->sgl;
                n_in++;
        }

        return virtqueue_add_sgs(vq, sgs, n_out, n_in, cmd, GFP_ATOMIC);
}

/*
 * Kick @vq: prepare the kick under the queue lock and send the
 * notification after the lock has been released.
 */
static void virtscsi_kick_vq(struct virtio_scsi_vq *vq)
{
        unsigned long flags;
        bool notify;

        spin_lock_irqsave(&vq->vq_lock, flags);
        notify = virtqueue_kick_prepare(vq->vq);
        spin_unlock_irqrestore(&vq->vq_lock, flags);

        if (!notify)
                return;

        virtqueue_notify(vq->vq);
}

/**
 * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue, optionally kick it
 * @vq                : the locked virtqueue wrapper to add the command to
 * @cmd                : command structure
 * @req_size        : size of the request buffer
 * @resp_size        : size of the response buffer
 * @kick        : whether to kick the virtqueue immediately
 *
 * The command is added and the kick prepared under the queue lock; the
 * notification is sent after the lock is dropped.  Returns the result of
 * adding the command.
 */
static int virtscsi_add_cmd(struct virtio_scsi_vq *vq,
                             struct virtio_scsi_cmd *cmd,
                             size_t req_size, size_t resp_size,
                             bool kick)
{
        bool notify = false;
        unsigned long flags;
        int err;

        spin_lock_irqsave(&vq->vq_lock, flags);
        err = __virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size);
        if (kick && err == 0)
                notify = virtqueue_kick_prepare(vq->vq);
        spin_unlock_irqrestore(&vq->vq_lock, flags);

        if (notify)
                virtqueue_notify(vq->vq);

        return err;
}

static void virtio_scsi_init_hdr(struct virtio_device *vdev,
                                 struct virtio_scsi_cmd_req *cmd,
                                 struct scsi_cmnd *sc)
{
        cmd->lun[0] = 1;
        cmd->lun[1] = sc->device->id;
        cmd->lun[2] = (sc->device->lun >> 8) | 0x40;
        cmd->lun[3] = sc->device->lun & 0xff;
        cmd->tag = cpu_to_virtio64(vdev, (unsigned long)sc);
        cmd->task_attr = VIRTIO_SCSI_S_SIMPLE;
        cmd->prio = 0;
        cmd->crn = 0;
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
/*
 * Fill a protection-information request header for @sc.
 *
 * The common header fields are set first.  If the command carries
 * protection scatterlists, the integrity byte count for the request's
 * sectors is stored in pi_bytesout (writes) or pi_bytesin (reads).
 */
static void virtio_scsi_init_hdr_pi(struct virtio_device *vdev,
                                    struct virtio_scsi_cmd_req_pi *cmd_pi,
                                    struct scsi_cmnd *sc)
{
        struct request *rq = scsi_cmd_to_rq(sc);
        struct blk_integrity *bi;

        virtio_scsi_init_hdr(vdev, (struct virtio_scsi_cmd_req *)cmd_pi, sc);

        if (!rq || !scsi_prot_sg_count(sc))
                return;

        bi = blk_get_integrity(rq->q->disk);

        switch (sc->sc_data_direction) {
        case DMA_TO_DEVICE:
                cmd_pi->pi_bytesout = cpu_to_virtio32(vdev,
                                                      bio_integrity_bytes(bi,
                                                        blk_rq_sectors(rq)));
                break;
        case DMA_FROM_DEVICE:
                cmd_pi->pi_bytesin = cpu_to_virtio32(vdev,
                                                     bio_integrity_bytes(bi,
                                                        blk_rq_sectors(rq)));
                break;
        default:
                break;
        }
}
#endif

/*
 * Pick the request queue for @sc: the hardware queue number encoded in
 * the request's unique blk-mq tag indexes req_vqs[].
 */
static struct virtio_scsi_vq *virtscsi_pick_vq_mq(struct virtio_scsi *vscsi,
                                                  struct scsi_cmnd *sc)
{
        u32 unique_tag = blk_mq_unique_tag(scsi_cmd_to_rq(sc));
        u16 hw_queue = blk_mq_unique_tag_to_hwq(unique_tag);

        return vscsi->req_vqs + hw_queue;
}

static int virtscsi_queuecommand(struct Scsi_Host *shost,
                                 struct scsi_cmnd *sc)
{
        struct virtio_scsi *vscsi = shost_priv(shost);
        struct virtio_scsi_vq *req_vq = virtscsi_pick_vq_mq(vscsi, sc);
        struct virtio_scsi_cmd *cmd = scsi_cmd_priv(sc);
        bool kick;
        unsigned long flags;
        int req_size;
        int ret;

        BUG_ON(scsi_sg_count(sc) > shost->sg_tablesize);

        /* TODO: check feature bit and fail if unsupported?  */
        BUG_ON(sc->sc_data_direction == DMA_BIDIRECTIONAL);

        dev_dbg(&sc->device->sdev_gendev,
                "cmd %p CDB: %#02x\n", sc, sc->cmnd[0]);

        cmd->sc = sc;

        BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);

#ifdef CONFIG_BLK_DEV_INTEGRITY
        if (virtio_has_feature(vscsi->vdev, VIRTIO_SCSI_F_T10_PI)) {
                virtio_scsi_init_hdr_pi(vscsi->vdev, &cmd->req.cmd_pi, sc);
                memcpy(cmd->req.cmd_pi.cdb, sc->cmnd, sc->cmd_len);
                req_size = sizeof(cmd->req.cmd_pi);
        } else
#endif
        {
                virtio_scsi_init_hdr(vscsi->vdev, &cmd->req.cmd, sc);
                memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
                req_size = sizeof(cmd->req.cmd);
        }

        kick = (sc->flags & SCMD_LAST) != 0;
        ret = virtscsi_add_cmd(req_vq, cmd, req_size, sizeof(cmd->resp.cmd), kick);
        if (ret == -EIO) {
                cmd->resp.cmd.response = VIRTIO_SCSI_S_BAD_TARGET;
                spin_lock_irqsave(&req_vq->vq_lock, flags);
                virtscsi_complete_cmd(vscsi, cmd);
                spin_unlock_irqrestore(&req_vq->vq_lock, flags);
        } else if (ret != 0) {
                return SCSI_MLQUEUE_HOST_BUSY;
        }
        return 0;
}

/*
 * Submit the task management request in @cmd on the control queue and
 * wait for it to complete.
 *
 * Returns SUCCESS when the device answers OK or FUNCTION_SUCCEEDED, and
 * FAILED otherwise (including when the request cannot be queued).  @cmd is
 * always returned to virtscsi_cmd_pool before returning.
 */
static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
{
        DECLARE_COMPLETION_ONSTACK(comp);
        int ret = FAILED;

        cmd->comp = &comp;
        if (virtscsi_add_cmd(&vscsi->ctrl_vq, cmd,
                              sizeof cmd->req.tmf, sizeof cmd->resp.tmf, true) >= 0) {
                wait_for_completion(&comp);

                switch (cmd->resp.tmf.response) {
                case VIRTIO_SCSI_S_OK:
                case VIRTIO_SCSI_S_FUNCTION_SUCCEEDED:
                        ret = SUCCESS;
                        break;
                default:
                        break;
                }

                /*
                 * The spec guarantees that all requests related to the TMF
                 * have been completed, but the callback might not have run
                 * yet if we're using independent interrupts (e.g. MSI).
                 * Poll the virtqueues once.
                 *
                 * In the abort case, scsi_done() will do nothing, because
                 * the command timed out and hence SCMD_STATE_COMPLETE has
                 * been set.
                 */
                virtscsi_poll_requests(vscsi);
        }

        mempool_free(cmd, virtscsi_cmd_pool);
        return ret;
}

static int virtscsi_device_reset(struct scsi_cmnd *sc)
{
        struct virtio_scsi *vscsi = shost_priv(sc->device->host);
        struct virtio_scsi_cmd *cmd;

        sdev_printk(KERN_INFO, sc->device, "device reset\n");
        cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO);
        if (!cmd)
                return FAILED;

        memset(cmd, 0, sizeof(*cmd));
        cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){
                .type = VIRTIO_SCSI_T_TMF,
                .subtype = cpu_to_virtio32(vscsi->vdev,
                                             VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET),
                .lun[0] = 1,
                .lun[1] = sc->device->id,
                .lun[2] = (sc->device->lun >> 8) | 0x40,
                .lun[3] = sc->device->lun & 0xff,
        };
        return virtscsi_tmf(vscsi, cmd);
}

/*
 * Per-device allocation hook (wired up as .slave_alloc): force VPD page
 * probing for every device behind this host.  Always returns 0.
 */
static int virtscsi_device_alloc(struct scsi_device *sdevice)
{
        /*
         * Passed through SCSI targets (e.g. with qemu's 'scsi-block')
         * may have transfer limits which come from the host SCSI
         * controller or something on the host side other than the
         * target itself.
         *
         * To make this work properly, the hypervisor can adjust the
         * target's VPD information to advertise these limits.  But
         * for that to work, the guest has to look at the VPD pages,
         * which we won't do by default if it is an SPC-2 device, even
         * if it does actually support it.
         *
         * So, set the blist to always try to read the VPD pages.
         */
        sdevice->sdev_bflags = BLIST_TRY_VPD_PAGES;

        return 0;
}


/**
 * virtscsi_change_queue_depth() - Change a virtscsi target's queue depth
 * @sdev:   Virtscsi target whose queue depth to change
 * @qdepth: Requested queue depth; capped at the host's cmd_per_lun
 *
 * Return: whatever scsi_change_queue_depth() returns for the capped depth.
 */
static int virtscsi_change_queue_depth(struct scsi_device *sdev, int qdepth)
{
        int limit = sdev->host->cmd_per_lun;

        if (limit < qdepth)
                qdepth = limit;

        return scsi_change_queue_depth(sdev, qdepth);
}

/*
 * Error-handler hook (wired up as .eh_abort_handler): send an ABORT TASK
 * task management request for @sc.  The command to abort is identified
 * to the device by the tag field, which carries the scsi_cmnd pointer
 * value.
 *
 * Returns the result of virtscsi_tmf(), or FAILED when no command could
 * be allocated from virtscsi_cmd_pool.
 */
static int virtscsi_abort(struct scsi_cmnd *sc)
{
        struct virtio_scsi *vscsi = shost_priv(sc->device->host);
        struct virtio_scsi_cmd *cmd;

        scmd_printk(KERN_INFO, sc, "abort\n");
        cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO);
        if (!cmd)
                return FAILED;

        memset(cmd, 0, sizeof(*cmd));
        cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){
                .type = VIRTIO_SCSI_T_TMF,
                /*
                 * Convert explicitly, as virtscsi_device_reset() does, so
                 * the field is always stored in device endianness rather
                 * than relying on the constant happening to be zero.
                 */
                .subtype = cpu_to_virtio32(vscsi->vdev,
                                             VIRTIO_SCSI_T_TMF_ABORT_TASK),
                .lun[0] = 1,
                .lun[1] = sc->device->id,
                .lun[2] = (sc->device->lun >> 8) | 0x40,
                .lun[3] = sc->device->lun & 0xff,
                .tag = cpu_to_virtio64(vscsi->vdev, (unsigned long)sc),
        };
        return virtscsi_tmf(vscsi, cmd);
}

/*
 * Fill in the blk-mq queue maps of @shost from vscsi->io_queues[].
 *
 * The maps are laid out back to back: each map's queue_offset is the sum
 * of the queue counts of the maps before it.  Maps with no queues get
 * their count and offset recorded but no CPU mapping.
 */
static void virtscsi_map_queues(struct Scsi_Host *shost)
{
        struct virtio_scsi *vscsi = shost_priv(shost);
        struct blk_mq_queue_map *map;
        int next_offset = 0;
        int type;

        for (type = 0; type < shost->nr_maps; type++) {
                map = &shost->tag_set.map[type];

                map->nr_queues = vscsi->io_queues[type];
                map->queue_offset = next_offset;
                next_offset += map->nr_queues;

                if (!map->nr_queues)
                        continue;

                /*
                 * Polling queues have no interrupts, so the block layer
                 * picks their CPU affinity.  Every other queue type is
                 * interrupt driven and inherits the affinity set up by
                 * the core virtio code.
                 */
                if (type != HCTX_TYPE_POLL)
                        blk_mq_virtio_map_queues(map, vscsi->vdev, 2);
                else
                        blk_mq_map_queues(map);
        }
}

/*
 * Poll hook (wired up as .mq_poll): drain request virtqueue @queue_num
 * under its vq_lock, completing every buffer the device has returned.
 *
 * Returns the number of commands completed by this call.
 */
static int virtscsi_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
{
        struct virtio_scsi *vscsi = shost_priv(shost);
        struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[queue_num];
        unsigned long irq_flags;
        unsigned int used_len;
        int completed = 0;
        void *cmd;

        spin_lock_irqsave(&req_vq->vq_lock, irq_flags);

        for (;;) {
                cmd = virtqueue_get_buf(req_vq->vq, &used_len);
                if (!cmd)
                        break;

                virtscsi_complete_cmd(vscsi, cmd);
                completed++;
        }

        spin_unlock_irqrestore(&req_vq->vq_lock, irq_flags);

        return completed;
}

/*
 * Commit hook (wired up as .commit_rqs): kick the request virtqueue that
 * backs hardware queue @hwq.
 */
static void virtscsi_commit_rqs(struct Scsi_Host *shost, u16 hwq)
{
        struct virtio_scsi *vscsi = shost_priv(shost);
        struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[hwq];

        virtscsi_kick_vq(req_vq);
}

/*
 * Timeout hook (wired up as .eh_timed_out).
 *
 * The host guarantees to respond to each command, although I/O
 * latencies might be higher than on bare metal.  Reset the timer
 * unconditionally to give the host a chance to perform EH.
 *
 * @scmnd is not inspected; the return value is always
 * SCSI_EH_RESET_TIMER.
 */
static enum scsi_timeout_action virtscsi_eh_timed_out(struct scsi_cmnd *scmnd)
{
        return SCSI_EH_RESET_TIMER;
}

/* SCSI host template shared by every virtio-scsi host created in probe. */
static const struct scsi_host_template virtscsi_host_template = {
        /* Identification */
        .module = THIS_MODULE,
        .name = "Virtio SCSI HBA",
        .proc_name = "virtio_scsi",

        /* Static host properties */
        .this_id = -1,
        .cmd_size = sizeof(struct virtio_scsi_cmd),
        .dma_boundary = UINT_MAX,
        .track_queue_depth = 1,

        /* I/O path */
        .queuecommand = virtscsi_queuecommand,
        .commit_rqs = virtscsi_commit_rqs,
        .mq_poll = virtscsi_mq_poll,
        .map_queues = virtscsi_map_queues,

        /* Per-device setup and tuning */
        .slave_alloc = virtscsi_device_alloc,
        .change_queue_depth = virtscsi_change_queue_depth,

        /* Error handling */
        .eh_timed_out = virtscsi_eh_timed_out,
        .eh_abort_handler = virtscsi_abort,
        .eh_device_reset_handler = virtscsi_device_reset,
};

/*
 * Read field @fld of struct virtio_scsi_config from @vdev via
 * virtio_cread() and evaluate to the value read.  Implemented as a GNU
 * statement expression so it can be used inside expressions.
 */
#define virtscsi_config_get(vdev, fld) \
        ({ \
                __virtio_native_type(struct virtio_scsi_config, fld) __val; \
                virtio_cread(vdev, struct virtio_scsi_config, fld, &__val); \
                __val; \
        })

/*
 * Write @val to field @fld of struct virtio_scsi_config on @vdev via
 * virtio_cwrite().  @val is first stored in a temporary of the field's
 * native type.
 */
#define virtscsi_config_set(vdev, fld, val) \
        do { \
                __virtio_native_type(struct virtio_scsi_config, fld) __val = (val); \
                virtio_cwrite(vdev, struct virtio_scsi_config, fld, &__val); \
        } while(0)

/*
 * Bind the driver-side queue wrapper @virtscsi_vq to virtqueue @vq and
 * initialise the lock that guards it.
 */
static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
                             struct virtqueue *vq)
{
        virtscsi_vq->vq = vq;
        spin_lock_init(&virtscsi_vq->vq_lock);
}

/*
 * Reset @vdev and then delete all of its virtqueues.  The reset comes
 * first so the queues are stopped before they are torn down.
 */
static void virtscsi_remove_vqs(struct virtio_device *vdev)
{
        /* Stop all the virtqueues. */
        virtio_reset_device(vdev);
        vdev->config->del_vqs(vdev);
}

static int virtscsi_init(struct virtio_device *vdev,
                         struct virtio_scsi *vscsi)
{
        int err;
        u32 i;
        u32 num_vqs, num_poll_vqs, num_req_vqs;
        vq_callback_t **callbacks;
        const char **names;
        struct virtqueue **vqs;
        struct irq_affinity desc = { .pre_vectors = 2 };

        num_req_vqs = vscsi->num_queues;
        num_vqs = num_req_vqs + VIRTIO_SCSI_VQ_BASE;
        vqs = kmalloc_array(num_vqs, sizeof(struct virtqueue *), GFP_KERNEL);
        callbacks = kmalloc_array(num_vqs, sizeof(vq_callback_t *),
                                  GFP_KERNEL);
        names = kmalloc_array(num_vqs, sizeof(char *), GFP_KERNEL);

        if (!callbacks || !vqs || !names) {
                err = -ENOMEM;
                goto out;
        }

        num_poll_vqs = min_t(unsigned int, virtscsi_poll_queues,
                             num_req_vqs - 1);
        vscsi->io_queues[HCTX_TYPE_DEFAULT] = num_req_vqs - num_poll_vqs;
        vscsi->io_queues[HCTX_TYPE_READ] = 0;
        vscsi->io_queues[HCTX_TYPE_POLL] = num_poll_vqs;

        dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n",
                 vscsi->io_queues[HCTX_TYPE_DEFAULT],
                 vscsi->io_queues[HCTX_TYPE_READ],
                 vscsi->io_queues[HCTX_TYPE_POLL]);

        callbacks[0] = virtscsi_ctrl_done;
        callbacks[1] = virtscsi_event_done;
        names[0] = "control";
        names[1] = "event";
        for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs - num_poll_vqs; i++) {
                callbacks[i] = virtscsi_req_done;
                names[i] = "request";
        }

        for (; i < num_vqs; i++) {
                callbacks[i] = NULL;
                names[i] = "request_poll";
        }

        /* Discover virtqueues and write information to configuration.  */
        err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
        if (err)
                goto out;

        virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]);
        virtscsi_init_vq(&vscsi->event_vq, vqs[1]);
        for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++)
                virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE],
                                 vqs[i]);

        virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
        virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);

        err = 0;

out:
        kfree(names);
        kfree(callbacks);
        kfree(vqs);
        if (err)
                virtscsi_remove_vqs(vdev);
        return err;
}

/*
 * Probe a virtio-scsi device: read its configuration, allocate a
 * Scsi_Host sized for the request queues, create the virtqueues, fill in
 * the host limits from the device configuration, register the host, mark
 * the device ready and scan it.
 *
 * Returns 0 on success; -EINVAL when config space access is unavailable,
 * -ENOMEM when the host cannot be allocated, or the error returned by
 * virtscsi_init() / scsi_add_host().
 */
static int virtscsi_probe(struct virtio_device *vdev)
{
        struct Scsi_Host *shost;
        struct virtio_scsi *vscsi;
        int err;
        u32 sg_elems, num_targets;
        u32 cmd_per_lun;
        u32 num_queues;

        if (!vdev->config->get) {
                dev_err(&vdev->dev, "%s failure: config access disabled\n",
                        __func__);
                return -EINVAL;
        }

        /* We need to know how many queues before we allocate. */
        /* A reported count of 0 is treated as 1; never exceed nr_cpu_ids. */
        num_queues = virtscsi_config_get(vdev, num_queues) ? : 1;
        num_queues = min_t(unsigned int, nr_cpu_ids, num_queues);

        num_targets = virtscsi_config_get(vdev, max_target) + 1;

        /* Host private data is struct virtio_scsi plus one req_vqs[] slot per queue. */
        shost = scsi_host_alloc(&virtscsi_host_template,
                                struct_size(vscsi, req_vqs, num_queues));
        if (!shost)
                return -ENOMEM;

        sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1;
        shost->sg_tablesize = sg_elems;
        shost->nr_maps = 1;
        vscsi = shost_priv(shost);
        vscsi->vdev = vdev;
        vscsi->num_queues = num_queues;
        vdev->priv = shost;

        err = virtscsi_init(vdev, vscsi);
        if (err)
                goto virtscsi_init_failed;

        /* Expose the poll queue map only when poll queues were set up. */
        if (vscsi->io_queues[HCTX_TYPE_POLL])
                shost->nr_maps = HCTX_TYPE_POLL + 1;

        shost->can_queue = virtqueue_get_vring_size(vscsi->req_vqs[0].vq);

        /* Zero values in the device config fall back to the defaults below. */
        cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
        shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
        shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;

        /* LUNs > 256 are reported with format 1, so they go in the range
         * 16640-32767.
         */
        shost->max_lun = virtscsi_config_get(vdev, max_lun) + 1 + 0x4000;
        shost->max_id = num_targets;
        shost->max_channel = 0;
        shost->max_cmd_len = VIRTIO_SCSI_CDB_SIZE;
        shost->nr_hw_queues = num_queues;

#ifdef CONFIG_BLK_DEV_INTEGRITY
        /* Advertise all DIF/DIX types with CRC guard when T10 PI was negotiated. */
        if (virtio_has_feature(vdev, VIRTIO_SCSI_F_T10_PI)) {
                int host_prot;

                host_prot = SHOST_DIF_TYPE1_PROTECTION | SHOST_DIF_TYPE2_PROTECTION |
                            SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE1_PROTECTION |
                            SHOST_DIX_TYPE2_PROTECTION | SHOST_DIX_TYPE3_PROTECTION;

                scsi_host_set_prot(shost, host_prot);
                scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC);
        }
#endif

        err = scsi_add_host(shost, &vdev->dev);
        if (err)
                goto scsi_add_host_failed;

        virtio_device_ready(vdev);

        if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
                virtscsi_kick_event_all(vscsi);

        scsi_scan_host(shost);
        return 0;

scsi_add_host_failed:
        /* The virtqueues exist at this point, so delete them here. */
        vdev->config->del_vqs(vdev);
virtscsi_init_failed:
        /* virtscsi_init() removes the virtqueues itself when it fails. */
        scsi_host_put(shost);
        return err;
}

/*
 * Tear down a virtio-scsi device: cancel pending event work (only when
 * hotplug was negotiated), remove the SCSI host, reset the device and
 * delete its virtqueues, then drop the host reference.
 */
static void virtscsi_remove(struct virtio_device *vdev)
{
        struct Scsi_Host *shost = virtio_scsi_host(vdev);
        struct virtio_scsi *vscsi = shost_priv(shost);

        if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
                virtscsi_cancel_event_work(vscsi);

        scsi_remove_host(shost);
        virtscsi_remove_vqs(vdev);
        scsi_host_put(shost);
}

#ifdef CONFIG_PM_SLEEP
/*
 * Freeze hook (wired up as .freeze): reset the device and delete its
 * virtqueues; virtscsi_restore() recreates them.  Always returns 0.
 */
static int virtscsi_freeze(struct virtio_device *vdev)
{
        virtscsi_remove_vqs(vdev);
        return 0;
}

/*
 * Restore hook (wired up as .restore): recreate the virtqueues, mark the
 * device ready and, when hotplug was negotiated, re-post the event
 * buffers.
 *
 * Returns 0 on success or the error from virtscsi_init().
 */
static int virtscsi_restore(struct virtio_device *vdev)
{
        struct Scsi_Host *shost = virtio_scsi_host(vdev);
        struct virtio_scsi *vscsi = shost_priv(shost);
        int err = virtscsi_init(vdev, vscsi);

        if (err)
                return err;

        virtio_device_ready(vdev);

        if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
                virtscsi_kick_event_all(vscsi);

        return 0;
}
#endif

/*
 * Device IDs this driver binds to: VIRTIO_ID_SCSI with VIRTIO_DEV_ANY_ID
 * (presumably "any vendor" - confirm against struct virtio_device_id).
 * The zero entry terminates the table.
 */
static struct virtio_device_id id_table[] = {
        { VIRTIO_ID_SCSI, VIRTIO_DEV_ANY_ID },
        { 0 },
};

/*
 * Feature bits listed in the driver's feature table.  T10 PI is only
 * listed when the kernel is built with block integrity support.
 */
static unsigned int features[] = {
        VIRTIO_SCSI_F_HOTPLUG,
        VIRTIO_SCSI_F_CHANGE,
#ifdef CONFIG_BLK_DEV_INTEGRITY
        VIRTIO_SCSI_F_T10_PI,
#endif
};

/* virtio driver registration record for the SCSI HBA. */
static struct virtio_driver virtio_scsi_driver = {
        /* Identity and matching */
        .driver.name = KBUILD_MODNAME,
        .id_table = id_table,

        /* Feature negotiation */
        .feature_table = features,
        .feature_table_size = ARRAY_SIZE(features),

        /* Lifecycle */
        .probe = virtscsi_probe,
        .remove = virtscsi_remove,
#ifdef CONFIG_PM_SLEEP
        .freeze = virtscsi_freeze,
        .restore = virtscsi_restore,
#endif
};

/*
 * Module init: create the command slab cache and the mempool built on
 * it, then register the virtio driver.
 *
 * Returns 0 on success; -ENOMEM if the cache or pool cannot be created,
 * or the negative error from register_virtio_driver().  On failure both
 * the pool and the cache are destroyed and their pointers cleared.
 */
static int __init virtio_scsi_init(void)
{
        int ret;

        virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);
        if (!virtscsi_cmd_cache) {
                pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n");
                ret = -ENOMEM;
                goto error;
        }

        virtscsi_cmd_pool = mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ,
                                                     virtscsi_cmd_cache);
        if (!virtscsi_cmd_pool) {
                pr_err("mempool_create() for virtscsi_cmd_pool failed\n");
                ret = -ENOMEM;
                goto error;
        }

        ret = register_virtio_driver(&virtio_scsi_driver);
        if (ret >= 0)
                return 0;

error:
        /* Both destroy helpers are called unconditionally, even for NULL. */
        mempool_destroy(virtscsi_cmd_pool);
        virtscsi_cmd_pool = NULL;
        kmem_cache_destroy(virtscsi_cmd_cache);
        virtscsi_cmd_cache = NULL;
        return ret;
}

/*
 * Module exit: unregister the virtio driver first, then destroy the
 * command mempool and the slab cache it was created from.
 */
static void __exit virtio_scsi_fini(void)
{
        unregister_virtio_driver(&virtio_scsi_driver);
        mempool_destroy(virtscsi_cmd_pool);
        kmem_cache_destroy(virtscsi_cmd_cache);
}
/* Module entry/exit points and module metadata. */
module_init(virtio_scsi_init);
module_exit(virtio_scsi_fini);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio SCSI HBA driver");
MODULE_LICENSE("GPL");



























































































    3 




    3 














    1 











    4 


    3 


    1 










    4 

    3 
    1 
    4 









































































































































    4 






    5 












    4 












    5 

    5 











    4 


























































    2 







    3 



    2 


    2 





    1 






    1 

    1 



























    1 
    1 











    2 








    1 
    1 


    1 











    1 




    2 






    1 


    1 
















    2 
    1 
    1 






    2 









    2 



    2 






    1 



    1 



    2 

























    1 
    1 






















































































































































    3 



    1 



    1 












    1 






















    4 
    4 
    1 
    1 



















    2 
    2 
    1 


































































    2 





















    1 



    1 

    1 










    1 








    1 






    1 
    1 
    1 

    1 











































































































































    1 
    1 

    1 



























    1 

















    1 







    1 









    1 





















    2 







    1 


    4 























    1 








    1 
    1 



    1 













    2 














    3 







    1 
    2 
























    2 



















    3 




    2 
    1 


    3 






















    1 







    1 







    1 









    1 





























    2 










    3 











    1 

    2 






    3 




    1 





    1 
















































































































    2 



    1 

































    1 
































































































    2 







    2 



















    2 



    2 



    2 












































































































































    3 

    3 







    3 









    4 





    2 











    1 












    4 




    4 







    1 

    1 
















    3 


    3 













   12 


   12 





































































































































































































































































































































































































    4 
    4 




    4 
    4 













    4 





    4 


    4 
    3 











    2 




    2 
    2 

    2 










    2 

    2 
















    1 
    1 

    2 


    2 
    2 

    2 





    2 


    2 























































































    1 






































    4 






    4 











    2 






    2 



    1 










    2 

    2 


    4 




    4 
    3 
    1 














    4 
















    3 

    3 




    3 




    4 









    2 
    2 




    4 










    4 

    2 
    2 






































    4 



    5 




















    2 




    4 












    1 








    3 













    3 























    3 



    1 












    3 








    3 












    3 





    2 




















    1 






    1 






















    1 






















    2 


    3 














    3 












    2 
    1 





    3 
    2 
    1 




    1 


    2 
    1 








    3 
































    2 





    2 



    2 












    2 
























































































































































































































































































































































































    1 











    1 





    1 



































    1 
    1 

    1 


























    1 



    4 




















    1 






    3 
    3 

























































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/mmap.c
 *
 * Written by obz.
 *
 * Address space accounting code        <alan@lxorguk.ukuu.org.uk>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/profile.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/mmdebug.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/ksm.h>

#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>

#define CREATE_TRACE_POINTS
#include <trace/events/mmap.h>

#include "internal.h"

#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)        (0)
#endif

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif

static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);

static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
                struct vm_area_struct *vma, struct vm_area_struct *prev,
                struct vm_area_struct *next, unsigned long start,
                unsigned long end, unsigned long tree_end, bool mm_wr_locked);

/*
 * Return pgprot_modify() of @oldprot with the protection that
 * vm_get_page_prot() yields for @vm_flags.
 */
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
        pgprot_t flags_prot = vm_get_page_prot(vm_flags);

        return pgprot_modify(oldprot, flags_prot);
}

/*
 * Update vma->vm_page_prot to reflect vma->vm_flags.
 *
 * When vma_wants_writenotify() says so, the protection is recomputed with
 * VM_SHARED masked out of the (local copy of the) flags.
 */
void vma_set_page_prot(struct vm_area_struct *vma)
{
        unsigned long flags = vma->vm_flags;
        pgprot_t prot = vm_pgprot_modify(vma->vm_page_prot, flags);

        if (vma_wants_writenotify(vma, prot)) {
                flags &= ~VM_SHARED;
                prot = vm_pgprot_modify(prot, flags);
        }

        /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
        WRITE_ONCE(vma->vm_page_prot, prot);
}

/*
 * Remove @vma from @mapping's i_mmap interval tree, calling
 * mapping_unmap_writable() first if @vma is a shared, may-write mapping.
 *
 * Requires inode->i_mapping->i_mmap_rwsem
 */
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
                                      struct address_space *mapping)
{
        const bool shared_maywrite = vma_is_shared_maywrite(vma);

        if (shared_maywrite)
                mapping_unmap_writable(mapping);

        /* The tree removal is bracketed by the dcache mmap lock. */
        flush_dcache_mmap_lock(mapping);
        vma_interval_tree_remove(vma, &mapping->i_mmap);
        flush_dcache_mmap_unlock(mapping);
}

/*
 * Unlink a file-based vm structure from its interval tree, to hide
 * vma from rmap and vmtruncate before freeing its page tables.
 *
 * A vma without a backing file is left untouched.
 */
void unlink_file_vma(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping;

        if (!file)
                return;

        mapping = file->f_mapping;

        /* The interval tree is modified under the i_mmap write lock. */
        i_mmap_lock_write(mapping);
        __remove_shared_vm_struct(vma, mapping);
        i_mmap_unlock_write(mapping);
}

/*
 * Close a vm structure and free it.
 *
 * Invokes the ->close() operation if one is set, drops the file reference
 * and the memory policy reference, then frees @vma with __vm_area_free()
 * when @unreachable is true and with vm_area_free() otherwise.
 */
static void remove_vma(struct vm_area_struct *vma, bool unreachable)
{
        might_sleep();

        if (vma->vm_ops && vma->vm_ops->close)
                vma->vm_ops->close(vma);

        if (vma->vm_file)
                fput(vma->vm_file);

        mpol_put(vma_policy(vma));

        if (!unreachable) {
                vm_area_free(vma);
                return;
        }

        __vm_area_free(vma);
}

/*
 * Thin wrapper: return whatever mas_prev() yields for the iterator's maple
 * state with @min as its limit argument.
 */
static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
                                                    unsigned long min)
{
        struct ma_state *state = &vmi->mas;

        return mas_prev(state, min);
}

/*
 * check_brk_limits() - Use platform specific check of range & verify mlock
 * limits.
 * @addr: The address to check
 * @len: The size of increase.
 *
 * Return: 0 on success, the error value from get_unmapped_area() if the
 * MAP_FIXED lookup fails, or -EAGAIN if mlock_future_ok() rejects the
 * increase.
 */
static int check_brk_limits(unsigned long addr, unsigned long len)
{
        unsigned long mapped_addr;

        mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
        if (IS_ERR_VALUE(mapped_addr))
                return mapped_addr;

        if (!mlock_future_ok(current->mm, current->mm->def_flags, len))
                return -EAGAIN;

        return 0;
}
static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
                unsigned long addr, unsigned long request, unsigned long flags);
/*
 * brk() - move the program break (mm->brk) of the current process to @brk.
 *
 * Runs with mmap_lock held for writing.  Returns @brk when the break was
 * moved (or already lay in the same page); on any failure mm->brk is put
 * back to the value it had on entry and that value is returned instead.
 * Returns -EINTR if the lock could not be taken.
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
        unsigned long newbrk, oldbrk, origbrk;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *brkvma, *next = NULL;
        unsigned long min_brk;
        bool populate = false;
        LIST_HEAD(uf);
        struct vma_iterator vmi;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* Remembered so the failure path can restore and report it. */
        origbrk = mm->brk;

#ifdef CONFIG_COMPAT_BRK
        /*
         * CONFIG_COMPAT_BRK can still be overridden by setting
         * randomize_va_space to 2, which will still cause mm->start_brk
         * to be arbitrarily shifted
         */
        if (current->brk_randomized)
                min_brk = mm->start_brk;
        else
                min_brk = mm->end_data;
#else
        min_brk = mm->start_brk;
#endif
        if (brk < min_brk)
                goto out;

        /*
         * Check against rlimit here. If this check is done later after the test
         * of oldbrk with newbrk then it can escape the test and let the data
         * segment grow beyond its set limit in the case where the limit is
         * not page aligned -Ram Gupta
         */
        if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
                              mm->end_data, mm->start_data))
                goto out;

        /* Old and new break in the same page: only mm->brk itself changes. */
        newbrk = PAGE_ALIGN(brk);
        oldbrk = PAGE_ALIGN(mm->brk);
        if (oldbrk == newbrk) {
                mm->brk = brk;
                goto success;
        }

        /* Always allow shrinking brk. */
        if (brk <= mm->brk) {
                /* Search one past newbrk */
                vma_iter_init(&vmi, mm, newbrk);
                brkvma = vma_find(&vmi, oldbrk);
                if (!brkvma || brkvma->vm_start >= oldbrk)
                        goto out; /* mapping intersects with an existing non-brk vma. */
                /*
                 * mm->brk must be protected by write mmap_lock.
                 * do_vma_munmap() will drop the lock on success,  so update it
                 * before calling do_vma_munmap().
                 */
                mm->brk = brk;
                if (do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true))
                        goto out;

                /* The lock is already released here, so skip the unlock. */
                goto success_unlocked;
        }

        /* Growing: validate the range [oldbrk, newbrk) first. */
        if (check_brk_limits(oldbrk, newbrk - oldbrk))
                goto out;

        /*
         * Only check if the next VMA is within the stack_guard_gap of the
         * expansion area
         */
        vma_iter_init(&vmi, mm, oldbrk);
        next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap);
        if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
                goto out;

        brkvma = vma_prev_limit(&vmi, mm->start_brk);
        /* Ok, looks good - let it rip. */
        if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
                goto out;

        mm->brk = brk;
        /* With VM_LOCKED in the default flags, populate the new range below. */
        if (mm->def_flags & VM_LOCKED)
                populate = true;

success:
        mmap_write_unlock(mm);
success_unlocked:
        userfaultfd_unmap_complete(mm, &uf);
        if (populate)
                mm_populate(oldbrk, newbrk - oldbrk);
        return brk;

out:
        /* Undo any tentative update of mm->brk and report the old break. */
        mm->brk = origbrk;
        mmap_write_unlock(mm);
        return origbrk;
}

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
/*
 * validate_mm() - debug consistency check of @mm's VMA tree.
 *
 * Validates the maple tree, checks that each VMA's vm_start/vm_end agree
 * with the range the iterator reports for it, verifies the anon_vma interval
 * tree entries when CONFIG_DEBUG_VM_RB is set, and finally compares the
 * number of VMAs walked against mm->map_count (VM_BUG_ON_MM on mismatch).
 */
static void validate_mm(struct mm_struct *mm)
{
        int bug = 0;
        int i = 0;      /* number of VMAs visited */
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, 0);

        mt_validate(&mm->mm_mt);
        for_each_vma(vmi, vma) {
#ifdef CONFIG_DEBUG_VM_RB
                struct anon_vma *anon_vma = vma->anon_vma;
                struct anon_vma_chain *avc;
#endif
                unsigned long vmi_start, vmi_end;
                bool warn = 0;

                /* Range of the tree entry the iterator currently points at. */
                vmi_start = vma_iter_addr(&vmi);
                vmi_end = vma_iter_end(&vmi);
                if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
                        warn = 1;

                if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
                        warn = 1;

                /* On a mismatch, dump the task name, stack, vma and tree. */
                if (warn) {
                        pr_emerg("issue in %s\n", current->comm);
                        dump_stack();
                        dump_vma(vma);
                        pr_emerg("tree range: %px start %lx end %lx\n", vma,
                                 vmi_start, vmi_end - 1);
                        vma_iter_dump_tree(&vmi);
                }

#ifdef CONFIG_DEBUG_VM_RB
                /* Verify every chain entry of this vma under the read lock. */
                if (anon_vma) {
                        anon_vma_lock_read(anon_vma);
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                anon_vma_interval_tree_verify(avc);
                        anon_vma_unlock_read(anon_vma);
                }
#endif
                i++;
        }
        if (i != mm->map_count) {
                pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
                bug = 1;
        }
        VM_BUG_ON_MM(bug, mm);
}

#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
#define validate_mm(mm) do { } while (0)
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */

/*
 * vma has some anon_vma assigned, and is already inserted on that
 * anon_vma's interval trees.
 *
 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
 * vma must be removed from the anon_vma's interval trees using
 * anon_vma_interval_tree_pre_update_vma().
 *
 * After the update, the vma will be reinserted using
 * anon_vma_interval_tree_post_update_vma().
 *
 * The entire update must be protected by exclusive mmap_lock and by
 * the root anon_vma's mutex.
 */
static inline void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc;

        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}

static inline void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc;

        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}

static unsigned long count_vma_pages_range(struct mm_struct *mm,
                unsigned long addr, unsigned long end)
{
        VMA_ITERATOR(vmi, mm, addr);
        struct vm_area_struct *vma;
        unsigned long nr_pages = 0;

        for_each_vma_range(vmi, vma, end) {
                unsigned long vm_start = max(addr, vma->vm_start);
                unsigned long vm_end = min(end, vma->vm_end);

                nr_pages += PHYS_PFN(vm_end - vm_start);
        }

        return nr_pages;
}

/*
 * Insert @vma into @mapping's i_mmap interval tree, calling
 * mapping_allow_writable() first if @vma is a shared, may-write mapping.
 * Takes no i_mmap lock itself.
 */
static void __vma_link_file(struct vm_area_struct *vma,
                            struct address_space *mapping)
{
        const bool shared_maywrite = vma_is_shared_maywrite(vma);

        if (shared_maywrite)
                mapping_allow_writable(mapping);

        /* The tree insertion is bracketed by the dcache mmap lock. */
        flush_dcache_mmap_lock(mapping);
        vma_interval_tree_insert(vma, &mapping->i_mmap);
        flush_dcache_mmap_unlock(mapping);
}

/*
 * Link a file-backed @vma into its file mapping's interval tree under the
 * i_mmap write lock.  Does nothing for a vma without a file.
 */
static void vma_link_file(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping;

        if (!file)
                return;

        mapping = file->f_mapping;
        i_mmap_lock_write(mapping);
        __vma_link_file(vma, mapping);
        i_mmap_unlock_write(mapping);
}

/*
 * Store @vma in @mm's VMA tree, link it to its file mapping (if any) and
 * bump mm->map_count.
 *
 * Returns 0 on success, -ENOMEM if the tree preallocation fails (in which
 * case nothing has been modified by this function beyond the local
 * iterator).
 */
static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
{
        VMA_ITERATOR(vmi, mm, 0);
        int err;

        /* Aim the iterator at the range the vma covers. */
        vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
        err = vma_iter_prealloc(&vmi, vma);
        if (err)
                return -ENOMEM;

        vma_start_write(vma);
        vma_iter_store(&vmi, vma);
        vma_link_file(vma);
        mm->map_count++;

        validate_mm(mm);

        return 0;
}

/*
 * init_multi_vma_prep() - Initializer for struct vma_prepare
 * @vp: The vma_prepare struct
 * @vma: The vma that will be altered once locked
 * @next: The next vma if it is to be adjusted
 * @remove: The first vma to be removed
 * @remove2: The second vma to be removed
 *
 * @vp is zeroed first, so any field not set here (e.g. ->insert) is NULL.
 * The anon_vma recorded is @vma's, falling back to @next's when @vma has
 * none; file and mapping are taken from @vma only.
 */
static inline void init_multi_vma_prep(struct vma_prepare *vp,
                struct vm_area_struct *vma, struct vm_area_struct *next,
                struct vm_area_struct *remove, struct vm_area_struct *remove2)
{
        memset(vp, 0, sizeof(*vp));

        vp->vma = vma;
        vp->adj_next = next;
        vp->remove = remove;
        vp->remove2 = remove2;

        if (vma->anon_vma)
                vp->anon_vma = vma->anon_vma;
        else if (next)
                vp->anon_vma = next->anon_vma;

        vp->file = vma->vm_file;
        if (vp->file)
                vp->mapping = vp->file->f_mapping;
}

/*
 * init_vma_prep() - Initializer wrapper for vma_prepare struct
 * @vp: The vma_prepare struct
 * @vma: The vma that will be altered once locked
 *
 * Single-vma case: forwards to init_multi_vma_prep() with no next vma to
 * adjust and no vmas to remove.
 */
static inline void init_vma_prep(struct vma_prepare *vp,
                                 struct vm_area_struct *vma)
{
        init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
}


/*
 * vma_prepare() - Helper function for handling locking VMAs prior to altering
 * @vp: The initialized vma_prepare struct
 *
 * For a file-backed vma this takes the i_mmap write lock and the dcache mmap
 * lock and removes vp->vma (and vp->adj_next) from the file's interval tree;
 * for a vma with an anon_vma it takes the anon_vma write lock and removes the
 * vma(s) from the anon_vma interval trees.  All of these are left held /
 * removed on return; vma_complete() performs the matching re-insert and
 * unlock steps.
 */
static inline void vma_prepare(struct vma_prepare *vp)
{
        if (vp->file) {
                /* uprobe_munmap() is called before any lock below is taken. */
                uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);

                if (vp->adj_next)
                        uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
                                      vp->adj_next->vm_end);

                i_mmap_lock_write(vp->mapping);
                if (vp->insert && vp->insert->vm_file) {
                        /*
                         * Put into interval tree now, so instantiated pages
                         * are visible to arm/parisc __flush_dcache_page
                         * throughout; but we cannot insert into address
                         * space until vma start or end is updated.
                         */
                        __vma_link_file(vp->insert,
                                        vp->insert->vm_file->f_mapping);
                }
        }

        if (vp->anon_vma) {
                anon_vma_lock_write(vp->anon_vma);
                anon_vma_interval_tree_pre_update_vma(vp->vma);
                if (vp->adj_next)
                        anon_vma_interval_tree_pre_update_vma(vp->adj_next);
        }

        if (vp->file) {
                /* Taken after the anon_vma lock; still held on return. */
                flush_dcache_mmap_lock(vp->mapping);
                vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
                if (vp->adj_next)
                        vma_interval_tree_remove(vp->adj_next,
                                                 &vp->mapping->i_mmap);
        }

}

/*
 * vma_complete- Helper function for handling the unlocking after altering VMAs,
 * or for inserting a VMA.
 *
 * @vp: The vma_prepare struct
 * @vmi: The vma iterator
 * @mm: The mm_struct
 *
 * Re-inserts vp->vma (and vp->adj_next) into the file and anon_vma interval
 * trees, drops the dcache mmap, anon_vma and i_mmap locks, stores vp->insert
 * in the VMA tree if set, and finally detaches and frees vp->remove and
 * vp->remove2, adjusting mm->map_count accordingly.
 */
static inline void vma_complete(struct vma_prepare *vp,
                                struct vma_iterator *vmi, struct mm_struct *mm)
{
        if (vp->file) {
                if (vp->adj_next)
                        vma_interval_tree_insert(vp->adj_next,
                                                 &vp->mapping->i_mmap);
                vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
                flush_dcache_mmap_unlock(vp->mapping);
        }

        if (vp->remove && vp->file) {
                /* Still under the i_mmap write lock at this point. */
                __remove_shared_vm_struct(vp->remove, vp->mapping);
                if (vp->remove2)
                        __remove_shared_vm_struct(vp->remove2, vp->mapping);
        } else if (vp->insert) {
                /*
                 * split_vma has split insert from vma, and needs
                 * us to insert it before dropping the locks
                 * (it may either follow vma or precede it).
                 */
                vma_iter_store(vmi, vp->insert);
                mm->map_count++;
        }

        if (vp->anon_vma) {
                anon_vma_interval_tree_post_update_vma(vp->vma);
                if (vp->adj_next)
                        anon_vma_interval_tree_post_update_vma(vp->adj_next);
                anon_vma_unlock_write(vp->anon_vma);
        }

        if (vp->file) {
                i_mmap_unlock_write(vp->mapping);
                uprobe_mmap(vp->vma);

                if (vp->adj_next)
                        uprobe_mmap(vp->adj_next);
        }

        if (vp->remove) {
                /* Executed once for vp->remove and once more for vp->remove2. */
again:
                vma_mark_detached(vp->remove, true);
                if (vp->file) {
                        uprobe_munmap(vp->remove, vp->remove->vm_start,
                                      vp->remove->vm_end);
                        /* One file reference is dropped per removed vma. */
                        fput(vp->file);
                }
                if (vp->remove->anon_vma)
                        anon_vma_merge(vp->vma, vp->remove);
                mm->map_count--;
                mpol_put(vma_policy(vp->remove));
                if (!vp->remove2)
                        WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
                vm_area_free(vp->remove);

                /*
                 * In mprotect's case 6 (see comments on vma_merge),
                 * we are removing both mid and next vmas
                 */
                if (vp->remove2) {
                        vp->remove = vp->remove2;
                        vp->remove2 = NULL;
                        goto again;
                }
        }
        if (vp->insert && vp->file)
                uprobe_mmap(vp->insert);
        validate_mm(mm);
}

/*
 * dup_anon_vma() - Helper function to duplicate anon_vma
 * @dst: The destination VMA
 * @src: The source VMA
 * @dup: Pointer to the destination VMA when successful.
 *
 * Nothing is done (and @dup is left untouched) unless @src has an anon_vma
 * and @dst does not.
 *
 * Returns: 0 on success, otherwise the error from anon_vma_clone().
 */
static inline int dup_anon_vma(struct vm_area_struct *dst,
                struct vm_area_struct *src, struct vm_area_struct **dup)
{
        int err;

        if (!src->anon_vma || dst->anon_vma)
                return 0;

        /*
         * Easily overlooked: when mprotect shifts the boundary, make sure the
         * expanding vma has anon_vma set if the shrinking vma had, to cover any
         * anon pages imported.
         */
        vma_assert_write_locked(dst);
        dst->anon_vma = src->anon_vma;

        err = anon_vma_clone(dst, src);
        if (err)
                return err;

        *dup = dst;
        return 0;
}

/*
 * vma_expand - Expand an existing VMA
 *
 * @vmi: The vma iterator
 * @vma: The vma to expand
 * @start: The start of the vma
 * @end: The exclusive end of the vma
 * @pgoff: The page offset of vma
 * @next: The current or next vma.
 *
 * Expand @vma to @start and @end.  Can expand off the start and end.  Will
 * expand over @next if it's different from @vma and @end == @next->vm_end.
 * Checking if the @vma can expand and merge with @next needs to be handled by
 * the caller.
 *
 * Returns: 0 on success, the error from dup_anon_vma() if duplicating the
 * anon_vma fails, or -ENOMEM if the tree preallocation fails.
 */
int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
               unsigned long start, unsigned long end, pgoff_t pgoff,
               struct vm_area_struct *next)
{
        struct vm_area_struct *anon_dup = NULL;
        bool remove_next = false;
        struct vma_prepare vp;

        vma_start_write(vma);
        /* @next is swallowed entirely only when the new end matches its end. */
        if (next && (vma != next) && (end == next->vm_end)) {
                int ret;

                remove_next = true;
                vma_start_write(next);
                ret = dup_anon_vma(vma, next, &anon_dup);
                if (ret)
                        return ret;
        }

        init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
        /* Not merging but overwriting any part of next is not handled. */
        VM_WARN_ON(next && !vp.remove &&
                  next != vma && end > next->vm_start);
        /* Only handles expanding */
        VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);

        /* Note: vma iterator must be pointing to 'start' */
        vma_iter_config(vmi, start, end);
        if (vma_iter_prealloc(vmi, vma))
                goto nomem;

        vma_prepare(&vp);
        vma_adjust_trans_huge(vma, start, end, 0);
        vma_set_range(vma, start, end, pgoff);
        vma_iter_store(vmi, vma);

        vma_complete(&vp, vmi, vma->vm_mm);
        return 0;

nomem:
        /* Undo the anon_vma duplication done above, if there was one. */
        if (anon_dup)
                unlink_anon_vmas(anon_dup);
        return -ENOMEM;
}

/*
 * vma_shrink() - Reduce an existing VMAs memory area
 * @vmi: The vma iterator
 * @vma: The VMA to modify
 * @start: The new start
 * @end: The new end
 * @pgoff: The page offset to set on @vma together with the new range
 *
 * Only one side is expected to move: a warning is raised if both @start and
 * @end differ from the current vm_start / vm_end.
 *
 * Returns: 0 on success, -ENOMEM otherwise
 */
int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
               unsigned long start, unsigned long end, pgoff_t pgoff)
{
        struct vma_prepare vp;

        WARN_ON((vma->vm_start != start) && (vma->vm_end != end));

        /* Point the iterator at the part of the vma that is being cut off. */
        if (vma->vm_start < start)
                vma_iter_config(vmi, vma->vm_start, start);
        else
                vma_iter_config(vmi, end, vma->vm_end);

        if (vma_iter_prealloc(vmi, NULL))
                return -ENOMEM;

        vma_start_write(vma);

        init_vma_prep(&vp, vma);
        vma_prepare(&vp);
        vma_adjust_trans_huge(vma, start, end, 0);

        /* Clear the cut-off range from the tree, then update the vma itself. */
        vma_iter_clear(vmi);
        vma_set_range(vma, start, end, pgoff);
        vma_complete(&vp, vmi, vma->vm_mm);
        return 0;
}

/*
 * Check whether @vma is compatible with the given file, flags, userfaultfd
 * context and anon name, i.e. whether those properties allow a merge.
 *
 * If the vma has a ->close operation then the driver probably needs to release
 * per-vma resources, so we don't attempt to merge those if the caller indicates
 * the current vma may be removed as part of the merge (@may_remove_vma).
 */
static inline bool is_mergeable_vma(struct vm_area_struct *vma,
                struct file *file, unsigned long vm_flags,
                struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
                struct anon_vma_name *anon_name, bool may_remove_vma)
{
        /*
         * VM_SOFTDIRTY should not prevent from VMA merging, if we
         * match the flags but dirty bit -- the caller should mark
         * merged VMA as dirty. If dirty bit won't be excluded from
         * comparison, we increase pressure on the memory system forcing
         * the kernel to generate new VMAs when old one could be
         * extended instead.
         */
        const unsigned long flag_diff =
                (vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY;

        return !flag_diff &&
               vma->vm_file == file &&
               !(may_remove_vma && vma->vm_ops && vma->vm_ops->close) &&
               is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx) &&
               anon_vma_name_eq(anon_vma_name(vma), anon_name);
}

/*
 * Two anon_vmas are mergeable when they are the same, or when at least one
 * of them is NULL and @vma is either NULL or has a single-entry
 * anon_vma_chain.
 */
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
                 struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
        const bool either_unset = !anon_vma1 || !anon_vma2;

        /*
         * The list_is_singular() test is to avoid merging VMA cloned from
         * parents. This can improve scalability caused by anon_vma lock.
         */
        if (either_unset) {
                if (!vma)
                        return true;
                if (list_is_singular(&vma->anon_vma_chain))
                        return true;
        }

        return anon_vma1 == anon_vma2;
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * in front of (at a lower virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 *
 * We don't check here for the merged mmap wrapping around the end of pagecache
 * indices (16TB on ia32) because do_mmap() does not permit mmap's which
 * wrap, nor mmaps which cover the final page at index -1UL.
 *
 * We assume the vma may be removed as part of the merge.
 */
static bool
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
                struct anon_vma *anon_vma, struct file *file,
                pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
                struct anon_vma_name *anon_name)
{
        /* vma may be removed by this merge, hence may_remove_vma == true. */
        if (!is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true))
                return false;

        if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma))
                return false;

        /* The new range must end exactly at the offset where vma begins. */
        return vma->vm_pgoff == vm_pgoff;
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * beyond (at a higher virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 *
 * We assume that vma is not removed as part of the merge.
 */
static bool
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
                struct anon_vma *anon_vma, struct file *file,
                pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
                struct anon_vma_name *anon_name)
{
        /* vma is kept by this merge, hence may_remove_vma == false. */
        if (!is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false))
                return false;

        if (!is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma))
                return false;

        /* The new range must start at the page offset right after vma's last page. */
        return vma->vm_pgoff + vma_pages(vma) == vm_pgoff;
}

/*
 * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
 * figure out whether that can be merged with its predecessor or its
 * successor.  Or both (it neatly fills a hole).
 *
 * In most cases - when called for mmap, brk or mremap - [addr,end) is
 * certain not to be mapped by the time vma_merge is called; but when
 * called for mprotect, it is certain to be already mapped (either at
 * an offset within prev, or at the start of next), and the flags of
 * this area are about to be changed to vm_flags - and the no-change
 * case has already been eliminated.
 *
 * The following mprotect cases have to be considered, where **** is
 * the area passed down from mprotect_fixup, never extending beyond one
 * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
 * at the same address as **** and is of the same or larger span, and
 * NNNN the next vma after ****:
 *
 *     ****             ****                   ****
 *    PPPPPPNNNNNN    PPPPPPNNNNNN       PPPPPPCCCCCC
 *    cannot merge    might become       might become
 *                    PPNNNNNNNNNN       PPPPPPPPPPCC
 *    mmap, brk or    case 4 below       case 5 below
 *    mremap move:
 *                        ****               ****
 *                    PPPP    NNNN       PPPPCCCCNNNN
 *                    might become       might become
 *                    PPPPPPPPPPPP 1 or  PPPPPPPPPPPP 6 or
 *                    PPPPPPPPNNNN 2 or  PPPPPPPPNNNN 7 or
 *                    PPPPNNNNNNNN 3     PPPPNNNNNNNN 8
 *
 * It is important for case 8 that the vma CCCC overlapping the
 * region **** is never going to be extended over NNNN. Instead NNNN must
 * be extended in region **** and CCCC must be removed. This way in
 * all cases where vma_merge succeeds, the moment vma_merge drops the
 * rmap_locks, the properties of the merged vma will be already
 * correct for the whole merged range. Some of those properties like
 * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
 * be correct for the whole merged range immediately after the
 * rmap_locks are released. Otherwise if NNNN would be removed and
 * CCCC would be extended over the NNNN range, remove_migration_ptes
 * or other rmap walkers (if working on addresses beyond the "end"
 * parameter) may establish ptes with the wrong permissions of CCCC
 * instead of the right permissions of NNNN.
 *
 * In the code below:
 * PPPP is represented by *prev
 * CCCC is represented by *curr or not represented at all (NULL)
 * NNNN is represented by *next or not represented at all (NULL)
 * **** is not represented - it will be merged and the vma containing the
 *      area is returned, or the function will return NULL
 */
static struct vm_area_struct
*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
           struct vm_area_struct *src, unsigned long addr, unsigned long end,
           unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
           struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
           struct anon_vma_name *anon_name)
{
        /* The mm, anon_vma and backing file to merge with are taken from src. */
        struct mm_struct *mm = src->vm_mm;
        struct anon_vma *anon_vma = src->anon_vma;
        struct file *file = src->vm_file;
        struct vm_area_struct *curr, *next, *res;
        /*
         * vma:     the VMA whose range is rewritten to [vma_start, vma_end)
         * adjust:  a neighbour whose vm_start moves by adj_start bytes
         * remove,
         * remove2: VMAs swallowed by the merge
         */
        struct vm_area_struct *vma, *adjust, *remove, *remove2;
        /* Filled in by dup_anon_vma(); unlinked again on prealloc failure. */
        struct vm_area_struct *anon_dup = NULL;
        struct vma_prepare vp;
        pgoff_t vma_pgoff;
        int err = 0;
        bool merge_prev = false;
        bool merge_next = false;
        bool vma_expanded = false;
        unsigned long vma_start = addr;
        unsigned long vma_end = end;
        /* Size of [addr, end) in pages. */
        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
        /* Signed: positive moves adjust->vm_start up (case 5), negative down (case 4). */
        long adj_start = 0;

        /*
         * We later require that vma->vm_flags == vm_flags,
         * so this tests vma->vm_flags & VM_SPECIAL, too.
         */
        if (vm_flags & VM_SPECIAL)
                return NULL;

        /* Does the input range span an existing VMA? (cases 5 - 8) */
        curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);

        if (!curr ||                        /* cases 1 - 4 */
            end == curr->vm_end)        /* cases 6 - 8, adjacent VMA */
                next = vma_lookup(mm, end);
        else
                next = NULL;                /* case 5 */

        if (prev) {
                /*
                 * Tentatively assume the merged VMA keeps prev's start and
                 * page offset; the case 3 / case 8 path below overrides both.
                 */
                vma_start = prev->vm_start;
                vma_pgoff = prev->vm_pgoff;

                /* Can we merge the predecessor? */
                if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
                    && can_vma_merge_after(prev, vm_flags, anon_vma, file,
                                           pgoff, vm_userfaultfd_ctx, anon_name)) {
                        merge_prev = true;
                        /*
                         * NOTE(review): steps the iterator back one entry,
                         * presumably so that it points at prev - confirm
                         * against the callers' iterator position.
                         */
                        vma_prev(vmi);
                }
        }

        /* Can we merge the successor? */
        if (next && mpol_equal(policy, vma_policy(next)) &&
            can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
                                 vm_userfaultfd_ctx, anon_name)) {
                merge_next = true;
        }

        /* Verify some invariant that must be enforced by the caller. */
        VM_WARN_ON(prev && addr <= prev->vm_start);
        VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
        VM_WARN_ON(addr >= end);

        if (!merge_prev && !merge_next)
                return NULL; /* Not mergeable. */

        if (merge_prev)
                vma_start_write(prev);

        /*
         * Default: prev is both the VMA that gets resized and the one that
         * is returned.  The merge_next-only branch below overrides res, and
         * for cases 3 and 8 also vma.
         */
        res = vma = prev;
        remove = remove2 = adjust = NULL;

        /* Can we merge both the predecessor and the successor? */
        if (merge_prev && merge_next &&
            is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
                vma_start_write(next);
                remove = next;                                /* case 1 */
                vma_end = next->vm_end;
                err = dup_anon_vma(prev, next, &anon_dup);
                if (curr) {                                /* case 6 */
                        vma_start_write(curr);
                        remove = curr;
                        remove2 = next;
                        /*
                         * Note that the dup_anon_vma below cannot overwrite err
                         * since the first caller would do nothing unless next
                         * has an anon_vma.
                         */
                        if (!next->anon_vma)
                                err = dup_anon_vma(prev, curr, &anon_dup);
                }
        } else if (merge_prev) {                        /* case 2 */
                if (curr) {
                        vma_start_write(curr);
                        if (end == curr->vm_end) {        /* case 7 */
                                /*
                                 * can_vma_merge_after() assumed we would not be
                                 * removing prev vma, so it skipped the check
                                 * for vm_ops->close, but we are removing curr
                                 */
                                if (curr->vm_ops && curr->vm_ops->close)
                                        err = -EINVAL;
                                remove = curr;
                        } else {                        /* case 5 */
                                /* curr survives, but starts at end afterwards. */
                                adjust = curr;
                                adj_start = (end - curr->vm_start);
                        }
                        if (!err)
                                err = dup_anon_vma(prev, curr, &anon_dup);
                }
        } else { /* merge_next */
                vma_start_write(next);
                res = next;
                if (prev && addr < prev->vm_end) {        /* case 4 */
                        /*
                         * prev is shrunk to end at addr and next is pulled
                         * down to start there; adj_start is negative.
                         */
                        vma_start_write(prev);
                        vma_end = addr;
                        adjust = next;
                        adj_start = -(prev->vm_end - addr);
                        err = dup_anon_vma(next, prev, &anon_dup);
                } else {
                        /*
                         * Note that cases 3 and 8 are the ONLY ones where prev
                         * is permitted to be (but is not necessarily) NULL.
                         */
                        vma = next;                        /* case 3 */
                        vma_start = addr;
                        vma_end = next->vm_end;
                        vma_pgoff = next->vm_pgoff - pglen;
                        if (curr) {                        /* case 8 */
                                vma_pgoff = curr->vm_pgoff;
                                vma_start_write(curr);
                                remove = curr;
                                err = dup_anon_vma(next, curr, &anon_dup);
                        }
                }
        }

        /* Error in anon_vma clone. */
        if (err)
                goto anon_vma_fail;

        /* vma grows if the new range reaches past either of its current ends. */
        if (vma_start < vma->vm_start || vma_end > vma->vm_end)
                vma_expanded = true;

        if (vma_expanded) {
                vma_iter_config(vmi, vma_start, vma_end);
        } else {
                /*
                 * Of the cases above only case 4 leaves vma (prev) unexpanded,
                 * and there adjust == next, so adjust is non-NULL here.  The
                 * iterator is configured for adjust's new range instead.
                 */
                vma_iter_config(vmi, adjust->vm_start + adj_start,
                                adjust->vm_end);
        }

        if (vma_iter_prealloc(vmi, vma))
                goto prealloc_fail;

        init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
        VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
                   vp.anon_vma != adjust->anon_vma);

        /* From here on the merge is applied and can no longer fail. */
        vma_prepare(&vp);
        vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
        vma_set_range(vma, vma_start, vma_end, vma_pgoff);

        if (vma_expanded)
                vma_iter_store(vmi, vma);

        if (adj_start) {
                /* Move the neighbour's start and keep its page offset in step. */
                adjust->vm_start += adj_start;
                adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
                if (adj_start < 0) {
                        /*
                         * Case 4: next was extended downwards, so it is next
                         * (not vma) that has to be written to the tree.
                         */
                        WARN_ON(vma_expanded);
                        vma_iter_store(vmi, next);
                }
        }

        vma_complete(&vp, vmi, mm);
        khugepaged_enter_vma(res, vm_flags);
        return res;

prealloc_fail:
        /* Undo the anon_vma duplication recorded by dup_anon_vma(), if any. */
        if (anon_dup)
                unlink_anon_vmas(anon_dup);

anon_vma_fail:
        /* Re-position the iterator at addr before reporting "no merge". */
        vma_iter_set(vmi, addr);
        vma_iter_load(vmi);
        return NULL;
}

/*
 * Rough compatibility check to quickly see if it's even worth looking
 * at sharing an anon_vma.
 *
 * They need to have the same vm_file, and the flags can only differ
 * in things that mprotect may change.
 *
 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
 * we can merge the two vma's. For example, we refuse to merge a vma if
 * there is a vm_ops->close() function, because that indicates that the
 * driver is doing some kind of reference counting. But that doesn't
 * really matter for the anon_vma sharing case.
 */
static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
{
        /* 'a' must end exactly where 'b' begins. */
        if (a->vm_end != b->vm_start)
                return 0;

        /* Same memory policy on both sides. */
        if (!mpol_equal(vma_policy(a), vma_policy(b)))
                return 0;

        /* Same backing file (or both anonymous). */
        if (a->vm_file != b->vm_file)
                return 0;

        /* Flags may differ only in the access bits and VM_SOFTDIRTY. */
        if ((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY))
                return 0;

        /* Page offsets must be contiguous across the boundary. */
        return b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}

/*
 * Do some basic sanity checking to see if we can re-use the anon_vma
 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
 * the same as 'old', the other will be the new one that is trying
 * to share the anon_vma.
 *
 * NOTE! This runs with mmap_lock held for reading, so it is possible that
 * the anon_vma of 'old' is concurrently in the process of being set up
 * by another page fault trying to merge _that_. But that's ok: if it
 * is being set up, that automatically means that it will be a singleton
 * acceptable for merging, so we can do all of this optimistically. But
 * we do that READ_ONCE() to make sure that we never re-load the pointer.
 *
 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
 * is to return an anon_vma that is "complex" due to having gone through
 * a fork).
 *
 * We also make sure that the two vma's are compatible (adjacent,
 * and with the same memory policies). That's all stable, even with just
 * a read lock on the mmap_lock.
 */
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
{
        struct anon_vma *candidate;

        if (!anon_vma_compatible(a, b))
                return NULL;

        /* Read the pointer exactly once; it may be set up concurrently. */
        candidate = READ_ONCE(old->anon_vma);
        if (!candidate)
                return NULL;

        /* Only hand out an anon_vma whose chain on 'old' is a singleton. */
        if (!list_is_singular(&old->anon_vma_chain))
                return NULL;

        return candidate;
}

/*
 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
 * neighbouring vmas for a suitable anon_vma, before it goes off
 * to allocate a new anon_vma.  It checks because a repetitive
 * sequence of mprotects and faults may otherwise lead to distinct
 * anon_vmas being allocated, preventing vma merge in subsequent
 * mprotect.
 */
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
        struct vm_area_struct *neighbour;
        struct anon_vma *found;
        VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);

        /* First candidate: whatever is mapped at vma->vm_end (the successor). */
        neighbour = vma_iter_load(&vmi);
        if (neighbour) {
                found = reusable_anon_vma(neighbour, vma, neighbour);
                if (found)
                        return found;
        }

        /* Walk back: the first step must land on vma itself ... */
        neighbour = vma_prev(&vmi);
        VM_BUG_ON_VMA(neighbour != vma, vma);

        /* ... and the second step yields the predecessor, if there is one. */
        neighbour = vma_prev(&vmi);
        if (!neighbour)
                return NULL;

        /*
         * The result may well be NULL when neither neighbour offers a
         * reusable anon_vma.  Only the touching neighbours are examined:
         * searching further afield for "compatible" anon_vmas would probably
         * just waste time, or hang too many vmas off the same anon_vma.
         * The aim is to allow mprotect remerging later on, not to minimize
         * the memory used for anon_vmas.
         */
        return reusable_anon_vma(neighbour, neighbour, vma);
}

/*
 * If a hint addr is less than mmap_min_addr change hint to be as
 * low as possible but still greater than mmap_min_addr
 */
static inline unsigned long round_hint_to_min(unsigned long hint)
{
        unsigned long aligned = hint & PAGE_MASK;

        /* A zero (NULL) hint is handed back unchanged. */
        if (!aligned)
                return aligned;

        /* Hints at or above mmap_min_addr only get the page rounding. */
        if (aligned >= mmap_min_addr)
                return aligned;

        /* Too low: lift the hint to the first page boundary >= mmap_min_addr. */
        return PAGE_ALIGN(mmap_min_addr);
}

bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
                        unsigned long bytes)
{
        unsigned long wanted_pages, allowed_pages;

        /* Nothing to check unless the mapping is going to be locked. */
        if (!(flags & VM_LOCKED))
                return true;

        /* CAP_IPC_LOCK bypasses the limit check below. */
        if (capable(CAP_IPC_LOCK))
                return true;

        /* Pages already locked in this mm plus the pages being requested. */
        wanted_pages = (bytes >> PAGE_SHIFT) + mm->locked_vm;

        /* RLIMIT_MEMLOCK is in bytes; convert to pages. */
        allowed_pages = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        return wanted_pages <= allowed_pages;
}

static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
{
        /* Regular files, block devices and sockets share the LFS limit. */
        if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode) ||
            S_ISSOCK(inode->i_mode))
                return MAX_LFS_FILESIZE;

        /* Special "we do even unsigned file positions" case */
        if (file->f_mode & FMODE_UNSIGNED_OFFSET)
                return 0;

        /* Yes, random drivers might want more. But I'm tired of buggy drivers */
        return ULONG_MAX;
}

static inline bool file_mmap_ok(struct file *file, struct inode *inode,
                                unsigned long pgoff, unsigned long len)
{
        u64 room = file_mmap_size_max(file, inode);

        /* A zero maximum skips this length check. */
        if (room && len > room)
                return false;

        /* What is left after the mapping bounds the highest usable offset. */
        room -= len;

        return pgoff <= room >> PAGE_SHIFT;
}

/*
 * do_mmap() - validate an mmap request and hand it to mmap_region().
 *
 * @file:     file to map, or NULL for a mapping without a backing file
 * @addr:     requested address; treated as a hint and rounded by
 *            round_hint_to_min() unless MAP_FIXED is set
 * @len:      length in bytes; rounded up to a page multiple
 * @prot:     PROT_* protection bits
 * @flags:    MAP_* flags
 * @vm_flags: initial VM_* flags; the bits derived from @prot, @flags and
 *            mm->def_flags are OR'ed in
 * @pgoff:    offset into the mapping, in pages
 * @populate: output; cleared on entry and set to @len when the mapping
 *            succeeded and is VM_LOCKED, or MAP_POPULATE was given without
 *            MAP_NONBLOCK
 * @uf:       passed through to mmap_region()
 *
 * Return: the address of the mapping, or a negative errno value encoded in
 * the unsigned long (test with IS_ERR_VALUE()).
 *
 * The caller must write-lock current->mm->mmap_lock.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
                        unsigned long flags, vm_flags_t vm_flags,
                        unsigned long pgoff, unsigned long *populate,
                        struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        int pkey = 0;

        *populate = 0;

        if (!len)
                return -EINVAL;

        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
         *
         * (the exception is when the underlying filesystem is noexec
         *  mounted, in which case we don't add PROT_EXEC.)
         */
        if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
                if (!(file && path_noexec(&file->f_path)))
                        prot |= PROT_EXEC;

        /* force arch specific MAP_FIXED handling in get_unmapped_area */
        if (flags & MAP_FIXED_NOREPLACE)
                flags |= MAP_FIXED;

        if (!(flags & MAP_FIXED))
                addr = round_hint_to_min(addr);

        /* Careful about overflows.. */
        len = PAGE_ALIGN(len);
        /* A zero result here means the page rounding wrapped around. */
        if (!len)
                return -ENOMEM;

        /* offset overflow? */
        if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
                return -EOVERFLOW;

        /* Too many mappings? */
        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        /*
         * addr is returned from get_unmapped_area,
         * There are two cases:
         * 1> MAP_FIXED == false
         *        unallocated memory, no need to check sealing.
         * 2> MAP_FIXED == true
         *        sealing is checked inside mmap_region when
         *        do_vmi_munmap is called.
         */

        /*
         * Execute-only request: ask for a protection key; a negative
         * result falls back to pkey 0.
         */
        if (prot == PROT_EXEC) {
                pkey = execute_only_pkey(mm);
                if (pkey < 0)
                        pkey = 0;
        }

        /* Do simple checking here so the lower-level routines won't have
         * to. we assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
        vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

        /* Obtain the address to map to. we verify (or select) it and ensure
         * that it represents a valid section of the address space.
         */
        addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags);
        if (IS_ERR_VALUE(addr))
                return addr;

        /* MAP_FIXED_NOREPLACE: refuse if anything is already mapped there. */
        if (flags & MAP_FIXED_NOREPLACE) {
                if (find_vma_intersection(mm, addr, addr + len))
                        return -EEXIST;
        }

        if (flags & MAP_LOCKED)
                if (!can_do_mlock())
                        return -EPERM;

        if (!mlock_future_ok(mm, vm_flags, len))
                return -EAGAIN;

        if (file) {
                struct inode *inode = file_inode(file);
                /* MAP_* flags accepted for MAP_SHARED_VALIDATE on this file. */
                unsigned long flags_mask;

                if (!file_mmap_ok(file, inode, pgoff, len))
                        return -EOVERFLOW;

                flags_mask = LEGACY_MAP_MASK;
                if (file->f_op->fop_flags & FOP_MMAP_SYNC)
                        flags_mask |= MAP_SYNC;

                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        /*
                         * Force use of MAP_SHARED_VALIDATE with non-legacy
                         * flags. E.g. MAP_SYNC is dangerous to use with
                         * MAP_SHARED as you don't know which consistency model
                         * you will get. We silently ignore unsupported flags
                         * with MAP_SHARED to preserve backward compatibility.
                         */
                        flags &= LEGACY_MAP_MASK;
                        fallthrough;
                case MAP_SHARED_VALIDATE:
                        if (flags & ~flags_mask)
                                return -EOPNOTSUPP;
                        if (prot & PROT_WRITE) {
                                if (!(file->f_mode & FMODE_WRITE))
                                        return -EACCES;
                                if (IS_SWAPFILE(file->f_mapping->host))
                                        return -ETXTBSY;
                        }

                        /*
                         * Make sure we don't allow writing to an append-only
                         * file..
                         */
                        if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
                                return -EACCES;

                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        /*
                         * File not opened for writing: drop VM_MAYWRITE and
                         * VM_SHARED again (VM_MAYSHARE stays set).
                         */
                        if (!(file->f_mode & FMODE_WRITE))
                                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
                        fallthrough;
                case MAP_PRIVATE:
                        /* These checks apply to shared and private file mappings alike. */
                        if (!(file->f_mode & FMODE_READ))
                                return -EACCES;
                        if (path_noexec(&file->f_path)) {
                                if (vm_flags & VM_EXEC)
                                        return -EPERM;
                                vm_flags &= ~VM_MAYEXEC;
                        }

                        if (!file->f_op->mmap)
                                return -ENODEV;
                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                                return -EINVAL;
                        break;

                default:
                        return -EINVAL;
                }
        } else {
                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                                return -EINVAL;
                        /*
                         * Ignore pgoff.
                         */
                        pgoff = 0;
                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        break;
                case MAP_PRIVATE:
                        /*
                         * Set pgoff according to addr for anon_vma.
                         */
                        pgoff = addr >> PAGE_SHIFT;
                        break;
                default:
                        return -EINVAL;
                }
        }

        /*
         * Set 'VM_NORESERVE' if we should not account for the
         * memory use of this mapping.
         */
        if (flags & MAP_NORESERVE) {
                /* We honor MAP_NORESERVE if allowed to overcommit */
                if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
                        vm_flags |= VM_NORESERVE;

                /* hugetlb applies strict overcommit unless MAP_NORESERVE */
                if (file && is_file_hugepages(file))
                        vm_flags |= VM_NORESERVE;
        }

        addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
        /*
         * Report the length to populate when the mapping succeeded and is
         * either locked or was requested with MAP_POPULATE but without
         * MAP_NONBLOCK.
         */
        if (!IS_ERR_VALUE(addr) &&
            ((vm_flags & VM_LOCKED) ||
             (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
                *populate = len;
        return addr;
}

unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
                              unsigned long prot, unsigned long flags,
                              unsigned long fd, unsigned long pgoff)
{
        struct file *file = NULL;
        unsigned long retval;

        if (flags & MAP_ANONYMOUS) {
                if (flags & MAP_HUGETLB) {
                        /* Anonymous hugetlb: back the mapping with an internal file. */
                        struct hstate *hs;

                        hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                        if (!hs)
                                return -EINVAL;

                        len = ALIGN(len, huge_page_size(hs));
                        /*
                         * VM_NORESERVE is used because the reservations will be
                         * taken when vm_ops->mmap() is called
                         */
                        file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                        VM_NORESERVE,
                                        HUGETLB_ANONHUGE_INODE,
                                        (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                        if (IS_ERR(file))
                                return PTR_ERR(file);
                }
        } else {
                /* File-backed request: resolve the descriptor first. */
                audit_mmap_fd(fd, flags);
                file = fget(fd);
                if (!file)
                        return -EBADF;

                if (is_file_hugepages(file)) {
                        len = ALIGN(len, huge_page_size(hstate_file(file)));
                } else if (unlikely(flags & MAP_HUGETLB)) {
                        /* MAP_HUGETLB on a file that is not a hugetlb file. */
                        fput(file);
                        return -EINVAL;
                }
        }

        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);

        /* Drop the reference taken by fget() or hugetlb_file_setup(). */
        if (file)
                fput(file);
        return retval;
}

SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
                unsigned long, fd, unsigned long, pgoff)
{
        /* Syscall entry point: all work is done by ksys_mmap_pgoff(). */
        unsigned long mapped = ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);

        return mapped;
}

#ifdef __ARCH_WANT_SYS_OLD_MMAP
/* Argument block that old_mmap() reads from user space in one copy. */
struct mmap_arg_struct {
        unsigned long addr;
        unsigned long len;
        unsigned long prot;
        unsigned long flags;
        unsigned long fd;
        unsigned long offset;        /* byte offset; must be page aligned */
};

SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
{
        struct mmap_arg_struct args;
        unsigned long pgoff;

        if (copy_from_user(&args, arg, sizeof(args)) != 0)
                return -EFAULT;

        /* Reject offsets that are not a whole number of pages. */
        if (offset_in_page(args.offset) != 0)
                return -EINVAL;

        /* ksys_mmap_pgoff() takes the offset in pages, not bytes. */
        pgoff = args.offset >> PAGE_SHIFT;

        return ksys_mmap_pgoff(args.addr, args.len, args.prot, args.flags,
                               args.fd, pgoff);
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */

static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
{
        /* No operations table, no write-notification hooks. */
        if (!vm_ops)
                return false;

        /* Either mkwrite hook being present is enough. */
        return vm_ops->page_mkwrite || vm_ops->pfn_mkwrite;
}

static bool vma_is_shared_writable(struct vm_area_struct *vma)
{
        /* Both VM_WRITE and VM_SHARED have to be set. */
        const unsigned long required = VM_WRITE | VM_SHARED;

        return (vma->vm_flags & required) == required;
}

static bool vma_fs_can_writeback(struct vm_area_struct *vma)
{
        /* No managed pages to writeback. */
        if (vma->vm_flags & VM_PFNMAP)
                return false;

        /* Writeback needs a backing file with an address_space. */
        if (!vma->vm_file || !vma->vm_file->f_mapping)
                return false;

        return mapping_can_writeback(vma->vm_file->f_mapping);
}

/*
 * Does this VMA require the underlying folios to have their dirty state
 * tracked?
 */
bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
{
        /* Only shared, writable VMAs require dirty tracking. */
        if (!vma_is_shared_writable(vma))
                return false;

        /*
         * Tracking is required when the filesystem asks to be notified of
         * writes, or - even without such a request - when it can writeback.
         */
        return vm_ops_needs_writenotify(vma->vm_ops) ||
               vma_fs_can_writeback(vma);
}

/*
 * Some shared mappings will want the pages marked read-only
 * to track write events. If so, we'll downgrade vm_page_prot
 * to the private version (using protection_map[] without the
 * VM_SHARED bit).
 */
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
        pgprot_t recomputed;

        /* If it was private or non-writable, the write bit is already clear */
        if (!vma_is_shared_writable(vma))
                return false;

        /* The backer wishes to know when pages are first written to? */
        if (vm_ops_needs_writenotify(vma->vm_ops))
                return true;

        /*
         * The open routine did something to the protections that
         * pgprot_modify won't preserve?
         */
        recomputed = vm_pgprot_modify(vm_page_prot, vma->vm_flags);
        if (pgprot_val(vm_page_prot) != pgprot_val(recomputed))
                return false;

        /*
         * Write notification is wanted for any of, checked in this order:
         *  - softdirty tracking (hugetlb does not support it yet),
         *  - write faults for uffd-wp tracking,
         *  - a mapping that can track the dirty pages.
         */
        return (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) ||
               userfaultfd_wp(vma) ||
               vma_fs_can_writeback(vma);
}

/*
 * We account for memory if it's a private writeable mapping,
 * not hugepages and VM_NORESERVE wasn't set.
 */
static inline bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
{
        /* The flags that decide accountability; only VM_WRITE may be set. */
        const vm_flags_t relevant = VM_NORESERVE | VM_SHARED | VM_WRITE;
        /*
         * hugetlb has its own accounting separate from the core VM
         * VM_HUGETLB may not be set yet so we cannot check for that flag.
         */
        bool hugetlb_backed = file && is_file_hugepages(file);

        if (hugetlb_backed)
                return false;

        return (vm_flags & relevant) == VM_WRITE;
}

/**
 * unmapped_area() - Find an area between the low_limit and the high_limit with
 * the correct alignment and offset, all from @info. Note: current->mm is used
 * for the search.
 *
 * @info: The unmapped area information including the range [low_limit -
 * high_limit), the alignment offset and mask.
 *
 * Return: A memory address or -ENOMEM.
 */
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
        unsigned long length, gap;
        unsigned long low_limit, high_limit;
        struct vm_area_struct *tmp;
        VMA_ITERATOR(vmi, current->mm, 0);

        /*
         * Adjust search length to account for worst case alignment overhead.
         * The arithmetic is unsigned, so a sum smaller than info->length
         * means it wrapped around.
         */
        length = info->length + info->align_mask + info->start_gap;
        if (length < info->length)
                return -ENOMEM;

        /* Clamp the lower bound of the search window to mmap_min_addr. */
        low_limit = info->low_limit;
        if (low_limit < mmap_min_addr)
                low_limit = mmap_min_addr;
        high_limit = info->high_limit;
retry:
        /* A non-zero result is treated as "no area found" in the window. */
        if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
                return -ENOMEM;

        /*
         * Adjust for the gap first so it doesn't interfere with the
         * later alignment. The first step is the minimum needed to
         * fulfill the start gap, the next step is the minimum to align
         * that. It is the minimum needed to fulfill both.
         */
        gap = vma_iter_addr(&vmi) + info->start_gap;
        gap += (info->align_offset - gap) & info->align_mask;
        tmp = vma_next(&vmi);
        if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
                /*
                 * The following VMA has one of VM_STARTGAP_FLAGS set.  If
                 * vm_start_gap() of it lies below the end of the candidate,
                 * restart the search above that VMA.
                 */
                if (vm_start_gap(tmp) < gap + length - 1) {
                        low_limit = tmp->vm_end;
                        vma_iter_reset(&vmi);
                        goto retry;
                }
        } else {
                /*
                 * Otherwise check the preceding VMA: if vm_end_gap() of it
                 * reaches past the candidate address, restart from there.
                 */
                tmp = vma_prev(&vmi);
                if (tmp && vm_end_gap(tmp) > gap) {
                        low_limit = vm_end_gap(tmp);
                        vma_iter_reset(&vmi);
                        goto retry;
                }
        }

        return gap;
}

/**
 * unmapped_area_topdown() - Find an area between the low_limit and the
 * high_limit with the correct alignment and offset at the highest available
 * address, all from @info. Note: current->mm is used for the search.
 *
 * @info: The unmapped area information including the range [low_limit -
 * high_limit), the alignment offset and mask.
 *
 * Return: A memory address or -ENOMEM.
 */
static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
        unsigned long length, gap, gap_end;
        unsigned long low_limit, high_limit;
        struct vm_area_struct *tmp;
        VMA_ITERATOR(vmi, current->mm, 0);

        /*
         * Adjust search length to account for worst case alignment overhead.
         * The arithmetic is unsigned, so a sum smaller than info->length
         * means it wrapped around.
         */
        length = info->length + info->align_mask + info->start_gap;
        if (length < info->length)
                return -ENOMEM;

        /* Clamp the lower bound of the search window to mmap_min_addr. */
        low_limit = info->low_limit;
        if (low_limit < mmap_min_addr)
                low_limit = mmap_min_addr;
        high_limit = info->high_limit;
retry:
        /* A non-zero result is treated as "no area found" in the window. */
        if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
                return -ENOMEM;

        /*
         * Place the mapping at the top of the range the iterator found,
         * then move it down to satisfy the requested alignment.
         */
        gap = vma_iter_end(&vmi) - info->length;
        gap -= (gap - info->align_offset) & info->align_mask;
        gap_end = vma_iter_end(&vmi);
        tmp = vma_next(&vmi);
        if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
                /*
                 * The following VMA has one of VM_STARTGAP_FLAGS set.  If
                 * vm_start_gap() of it lies below the end of the found
                 * range, retry with the upper limit lowered to that point.
                 */
                if (vm_start_gap(tmp) < gap_end) {
                        high_limit = vm_start_gap(tmp);
                        vma_iter_reset(&vmi);
                        goto retry;
                }
        } else {
                /*
                 * Otherwise check the preceding VMA: if vm_end_gap() of it
                 * reaches past the candidate address, retry below that VMA.
                 */
                tmp = vma_prev(&vmi);
                if (tmp && vm_end_gap(tmp) > gap) {
                        high_limit = tmp->vm_start;
                        vma_iter_reset(&vmi);
                        goto retry;
                }
        }

        return gap;
}

/*
 * Search for an unmapped address range.
 *
 * We are looking for a range that:
 * - does not intersect with any VMA;
 * - is contained within the [low_limit, high_limit) interval;
 * - is at least the desired size.
 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
 */
unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
{
        unsigned long addr;

        if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
                addr = unmapped_area_topdown(info);
        else
                addr = unmapped_area(info);

        trace_vm_unmapped_area(addr, info);
        return addr;
}

/* Get an address range which is currently unmapped.
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert:
 * Return value with the low bits set means error value,
 * ie
 *        if (ret & ~PAGE_MASK)
 *                error = ret;
 *
 * This function "knows" that -ENOMEM has the bits set.
 */
unsigned long
generic_get_unmapped_area(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct vm_unmapped_area_info info = {};
        const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);

        if (len > mmap_end - mmap_min_addr)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma_prev(mm, addr, &prev);
                if (mmap_end - len >= addr && addr >= mmap_min_addr &&
                    (!vma || addr + len <= vm_start_gap(vma)) &&
                    (!prev || addr >= vm_end_gap(prev)))
                        return addr;
        }

        info.length = len;
        info.low_limit = mm->mmap_base;
        info.high_limit = mmap_end;
        return vm_unmapped_area(&info);
}

#ifndef HAVE_ARCH_UNMAPPED_AREA
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                       unsigned long len, unsigned long pgoff,
                       unsigned long flags)
{
        unsigned long area;

        /* No HAVE_ARCH_UNMAPPED_AREA: defer to the generic implementation. */
        area = generic_get_unmapped_area(filp, addr, len, pgoff, flags);
        return area;
}
#endif

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 */
unsigned long
generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                                  unsigned long len, unsigned long pgoff,
                                  unsigned long flags)
{
        struct vm_area_struct *vma, *prev;
        struct mm_struct *mm = current->mm;
        struct vm_unmapped_area_info info = {};
        const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);

        /* requested length too big for entire address space */
        if (len > mmap_end - mmap_min_addr)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        /* requesting a specific address */
        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma_prev(mm, addr, &prev);
                if (mmap_end - len >= addr && addr >= mmap_min_addr &&
                                (!vma || addr + len <= vm_start_gap(vma)) &&
                                (!prev || addr >= vm_end_gap(prev)))
                        return addr;
        }

        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.length = len;
        info.low_limit = PAGE_SIZE;
        info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
        addr = vm_unmapped_area(&info);

        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
        if (offset_in_page(addr)) {
                VM_BUG_ON(addr != -ENOMEM);
                info.flags = 0;
                info.low_limit = TASK_UNMAPPED_BASE;
                info.high_limit = mmap_end;
                addr = vm_unmapped_area(&info);
        }

        return addr;
}

#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                               unsigned long len, unsigned long pgoff,
                               unsigned long flags)
{
        unsigned long area;

        /* No HAVE_ARCH_UNMAPPED_AREA_TOPDOWN: use the generic version. */
        area = generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
        return area;
}
#endif

#ifndef HAVE_ARCH_UNMAPPED_AREA_VMFLAGS
unsigned long
arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len,
                               unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
        unsigned long area;

        /* The default implementation does not look at vm_flags. */
        area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
        return area;
}

unsigned long
arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr,
                                       unsigned long len, unsigned long pgoff,
                                       unsigned long flags, vm_flags_t vm_flags)
{
        unsigned long area;

        /* The default implementation does not look at vm_flags. */
        area = arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
        return area;
}
#endif

unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp,
                                           unsigned long addr, unsigned long len,
                                           unsigned long pgoff, unsigned long flags,
                                           vm_flags_t vm_flags)
{
        const bool topdown = test_bit(MMF_TOPDOWN, &mm->flags);

        /* The mm's MMF_TOPDOWN bit selects the search direction. */
        if (!topdown)
                return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff,
                                                      flags, vm_flags);

        return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff,
                                                      flags, vm_flags);
}

unsigned long
__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
        unsigned long (*get_area)(struct file *, unsigned long,
                                  unsigned long, unsigned long, unsigned long)
                                  = NULL;
        unsigned long error;

        /* Let the architecture veto the request first. */
        error = arch_mmap_check(addr, len, flags);
        if (error)
                return error;

        /* Careful about overflows.. */
        if (len > TASK_SIZE)
                return -ENOMEM;

        if (!file) {
                /* Always treat pgoff as zero for anonymous memory. */
                pgoff = 0;

                /*
                 * mmap_region() will call shmem_zero_setup() to create a file,
                 * so use shmem's get_unmapped_area in case it can be huge.
                 */
                if (flags & MAP_SHARED)
                        get_area = shmem_get_unmapped_area;
        } else if (file->f_op->get_unmapped_area) {
                /* The file supplies its own placement routine. */
                get_area = file->f_op->get_unmapped_area;
        }

        if (get_area)
                addr = get_area(file, addr, len, pgoff, flags);
        else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                /* Ensures that larger anonymous mappings are THP aligned. */
                addr = thp_get_unmapped_area_vmflags(file, addr, len,
                                                     pgoff, flags, vm_flags);
        else
                addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
                                                    pgoff, flags, vm_flags);

        /* Pass error values straight through, then validate the address. */
        if (IS_ERR_VALUE(addr))
                return addr;
        if (addr > TASK_SIZE - len)
                return -ENOMEM;
        if (offset_in_page(addr))
                return -EINVAL;

        error = security_mmap_addr(addr);
        if (error)
                return error;

        return addr;
}

unsigned long
mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
                     unsigned long addr, unsigned long len,
                     unsigned long pgoff, unsigned long flags)
{
        if (test_bit(MMF_TOPDOWN, &mm->flags))
                return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags);
        return arch_get_unmapped_area(file, addr, len, pgoff, flags);
}
EXPORT_SYMBOL(mm_get_unmapped_area);

/**
 * find_vma_intersection() - Look up the first VMA which intersects the interval
 * @mm: The process address space.
 * @start_addr: The inclusive start user address.
 * @end_addr: The exclusive end user address.
 *
 * Returns: The first VMA within the provided range, %NULL otherwise.  Assumes
 * start_addr < end_addr.
 */
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
                                             unsigned long start_addr,
                                             unsigned long end_addr)
{
        struct vm_area_struct *hit;
        unsigned long cursor = start_addr;

        mmap_assert_locked(mm);

        /* end_addr is exclusive; the tree lookup takes an inclusive maximum. */
        hit = mt_find(&mm->mm_mt, &cursor, end_addr - 1);
        return hit;
}
EXPORT_SYMBOL(find_vma_intersection);

/**
 * find_vma() - Find the VMA for a given address, or the next VMA.
 * @mm: The mm_struct to check
 * @addr: The address
 *
 * Returns: The VMA associated with addr, or the next VMA.
 * May return %NULL in the case of no VMA at addr or above.
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *hit;
        unsigned long cursor = addr;

        mmap_assert_locked(mm);

        /* Search from addr up to the top of the index space. */
        hit = mt_find(&mm->mm_mt, &cursor, ULONG_MAX);
        return hit;
}
EXPORT_SYMBOL(find_vma);

/**
 * find_vma_prev() - Find the VMA for a given address, or the next vma and
 * set %pprev to the previous VMA, if any.
 * @mm: The mm_struct to check
 * @addr: The address
 * @pprev: The pointer to set to the previous VMA
 *
 * Note that RCU lock is missing here since the external mmap_lock() is used
 * instead.
 *
 * Returns: The VMA associated with @addr, or the next vma.
 * May return %NULL in the case of no vma at addr or above.
 */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
                        struct vm_area_struct **pprev)
{
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, addr);

        vma = vma_iter_load(&vmi);
        *pprev = vma_prev(&vmi);
        if (!vma)
                vma = vma_next(&vmi);
        return vma;
}

/*
 * Verify that the stack growth is acceptable and
 * update accounting. This is shared with both the
 * grow-up and grow-down cases.
 */
static int acct_stack_growth(struct vm_area_struct *vma,
                             unsigned long size, unsigned long grow)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long base;

        /*
         * Limit tests, in order: address space limit, stack rlimit,
         * mlock limit.
         */
        if (!may_expand_vm(mm, vma->vm_flags, grow) ||
            size > rlimit(RLIMIT_STACK) ||
            !mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
                return -ENOMEM;

        /* Check to ensure the stack will not grow into a hugetlb-only region */
        if (vma->vm_flags & VM_GROWSUP)
                base = vma->vm_start;
        else
                base = vma->vm_end - size;
        if (is_hugepage_only_range(vma->vm_mm, base, size))
                return -EFAULT;

        /*
         * Overcommit..  This must be the final test, as it will
         * update security statistics.
         */
        return security_vm_enough_memory_mm(mm, grow) ? -ENOMEM : 0;
}

#if defined(CONFIG_STACK_GROWSUP)
/*
 * PA-RISC uses this for its stack.
 * vma is the last one with address > vma->vm_end.  Have to extend vma.
 */
static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *next;
        unsigned long gap_addr;
        int error = 0;
        VMA_ITERATOR(vmi, mm, vma->vm_start);

        /* Only VMAs marked as growing upwards may be extended here. */
        if (!(vma->vm_flags & VM_GROWSUP))
                return -EFAULT;

        /*
         * Guard against exceeding limits of the address space.  After this,
         * address is the page-aligned end of the page containing the
         * original address, i.e. the new candidate vm_end.
         */
        address &= PAGE_MASK;
        if (address >= (TASK_SIZE & PAGE_MASK))
                return -ENOMEM;
        address += PAGE_SIZE;

        /* Enforce stack_guard_gap */
        gap_addr = address + stack_guard_gap;

        /* Guard against overflow */
        if (gap_addr < address || gap_addr > TASK_SIZE)
                gap_addr = TASK_SIZE;

        /*
         * Anything accessible between the current end and the end of the
         * guard gap blocks the expansion unless it also grows upwards.
         */
        next = find_vma_intersection(mm, vma->vm_end, gap_addr);
        if (next && vma_is_accessible(next)) {
                if (!(next->vm_flags & VM_GROWSUP))
                        return -ENOMEM;
                /* Check that both stack segments have the same anon_vma? */
        }

        if (next)
                vma_iter_prev_range_limit(&vmi, address);

        /* Preallocate for storing vma over [vm_start, address). */
        vma_iter_config(&vmi, vma->vm_start, address);
        if (vma_iter_prealloc(&vmi, vma))
                return -ENOMEM;

        /* We must make sure the anon_vma is allocated. */
        if (unlikely(anon_vma_prepare(vma))) {
                vma_iter_free(&vmi);
                return -ENOMEM;
        }

        /* Lock the VMA before expanding to prevent concurrent page faults */
        vma_start_write(vma);
        /*
         * vma->vm_start/vm_end cannot change under us because the caller
         * is required to hold the mmap_lock in read mode.  We need the
         * anon_vma lock to serialize against concurrent expand_stacks.
         *
         * NOTE(review): expand_stack() in this file reaches this function
         * with mmap_lock taken for writing; the "read mode" wording may be
         * out of date -- confirm against the remaining callers.
         */
        anon_vma_lock_write(vma->anon_vma);

        /* Somebody else might have raced and expanded it already */
        if (address > vma->vm_end) {
                unsigned long size, grow;

                /* size: new total size in bytes; grow: pages being added. */
                size = address - vma->vm_start;
                grow = (address - vma->vm_end) >> PAGE_SHIFT;

                error = -ENOMEM;
                /* Refuse growth whose end page offset would wrap around. */
                if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
                                /*
                                 * We only hold a shared mmap_lock lock here, so
                                 * we need to protect against concurrent vma
                                 * expansions.  anon_vma_lock_write() doesn't
                                 * help here, as we don't guarantee that all
                                 * growable vmas in a mm share the same root
                                 * anon vma.  So, we reuse mm->page_table_lock
                                 * to guard against concurrent vma expansions.
                                 */
                                spin_lock(&mm->page_table_lock);
                                if (vma->vm_flags & VM_LOCKED)
                                        mm->locked_vm += grow;
                                vm_stat_account(mm, vma->vm_flags, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_end = address;
                                /* Overwrite old entry in mtree. */
                                vma_iter_store(&vmi, vma);
                                anon_vma_interval_tree_post_update_vma(vma);
                                spin_unlock(&mm->page_table_lock);

                                perf_event_mmap(vma);
                        }
                }
        }
        anon_vma_unlock_write(vma->anon_vma);
        /* Called on every path that got past the preallocation above. */
        vma_iter_free(&vmi);
        validate_mm(mm);
        return error;
}
#endif /* CONFIG_STACK_GROWSUP */

/*
 * vma is the first one with address < vma->vm_start.  Have to extend vma.
 * mmap_lock held for writing.
 */
int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *prev;
        int error = 0;
        VMA_ITERATOR(vmi, mm, vma->vm_start);

        /* Only VMAs marked as growing downwards may be extended here. */
        if (!(vma->vm_flags & VM_GROWSDOWN))
                return -EFAULT;

        /* The page-aligned address becomes the new candidate vm_start. */
        address &= PAGE_MASK;
        if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
                return -EPERM;

        /* Enforce stack_guard_gap */
        prev = vma_prev(&vmi);
        /* Check that both stack segments have the same anon_vma? */
        if (prev) {
                /*
                 * An accessible neighbour that does not itself grow down
                 * must stay at least stack_guard_gap below the new start.
                 */
                if (!(prev->vm_flags & VM_GROWSDOWN) &&
                    vma_is_accessible(prev) &&
                    (address - prev->vm_end < stack_guard_gap))
                        return -ENOMEM;
        }

        if (prev)
                vma_iter_next_range_limit(&vmi, vma->vm_start);

        /* Preallocate for storing vma over [address, vm_end). */
        vma_iter_config(&vmi, address, vma->vm_end);
        if (vma_iter_prealloc(&vmi, vma))
                return -ENOMEM;

        /* We must make sure the anon_vma is allocated. */
        if (unlikely(anon_vma_prepare(vma))) {
                vma_iter_free(&vmi);
                return -ENOMEM;
        }

        /* Lock the VMA before expanding to prevent concurrent page faults */
        vma_start_write(vma);
        /*
         * vma->vm_start/vm_end cannot change under us because the caller
         * is required to hold the mmap_lock in read mode.  We need the
         * anon_vma lock to serialize against concurrent expand_stacks.
         *
         * NOTE(review): expand_stack() in this file reaches this function
         * with mmap_lock taken for writing; the "read mode" wording may be
         * out of date -- confirm against the remaining callers.
         */
        anon_vma_lock_write(vma->anon_vma);

        /* Somebody else might have raced and expanded it already */
        if (address < vma->vm_start) {
                unsigned long size, grow;

                /* size: new total size in bytes; grow: pages being added. */
                size = vma->vm_end - address;
                grow = (vma->vm_start - address) >> PAGE_SHIFT;

                error = -ENOMEM;
                /* vm_pgoff is lowered by grow below; it must not underflow. */
                if (grow <= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
                                /*
                                 * We only hold a shared mmap_lock lock here, so
                                 * we need to protect against concurrent vma
                                 * expansions.  anon_vma_lock_write() doesn't
                                 * help here, as we don't guarantee that all
                                 * growable vmas in a mm share the same root
                                 * anon vma.  So, we reuse mm->page_table_lock
                                 * to guard against concurrent vma expansions.
                                 */
                                spin_lock(&mm->page_table_lock);
                                if (vma->vm_flags & VM_LOCKED)
                                        mm->locked_vm += grow;
                                vm_stat_account(mm, vma->vm_flags, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_start = address;
                                vma->vm_pgoff -= grow;
                                /* Overwrite old entry in mtree. */
                                vma_iter_store(&vmi, vma);
                                anon_vma_interval_tree_post_update_vma(vma);
                                spin_unlock(&mm->page_table_lock);

                                perf_event_mmap(vma);
                        }
                }
        }
        anon_vma_unlock_write(vma->anon_vma);
        /* Called on every path that got past the preallocation above. */
        vma_iter_free(&vmi);
        validate_mm(mm);
        return error;
}

/* enforced gap between the expanding stack and other mappings. */
unsigned long stack_guard_gap = 256UL << PAGE_SHIFT;

/*
 * Boot parameter "stack_guard_gap=<pages>": the decimal value is a number
 * of pages and is stored in bytes.  Input with trailing characters after
 * the number is ignored and the current value kept.
 */
static int __init cmdline_parse_stack_guard_gap(char *p)
{
        char *rest;
        unsigned long pages = simple_strtoul(p, &rest, 10);

        /* Accept the value only if the whole string was consumed. */
        if (*rest == '\0')
                stack_guard_gap = pages << PAGE_SHIFT;

        return 1;
}
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);

#ifdef CONFIG_STACK_GROWSUP
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
        /* CONFIG_STACK_GROWSUP: the stack is extended towards higher addresses. */
        int ret = expand_upwards(vma, address);

        return ret;
}

struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *hit, *below;

        addr &= PAGE_MASK;
        hit = find_vma_prev(mm, addr, &below);

        /* addr already lies inside an existing VMA. */
        if (hit && (hit->vm_start <= addr))
                return hit;

        /* Otherwise try to grow the VMA below addr up to cover it. */
        if (!below || expand_stack_locked(below, addr))
                return NULL;

        if (below->vm_flags & VM_LOCKED)
                populate_vma_page_range(below, addr, below->vm_end, NULL);

        return below;
}
#else
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
        /* No CONFIG_STACK_GROWSUP: the stack is extended towards lower addresses. */
        int ret = expand_downwards(vma, address);

        return ret;
}

struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma;
        unsigned long old_start;

        addr &= PAGE_MASK;
        vma = find_vma(mm, addr);

        /* Either nothing at or above addr (NULL), or addr is already covered. */
        if (!vma || vma->vm_start <= addr)
                return vma;

        /* Remember the old start: only the newly added part is populated. */
        old_start = vma->vm_start;
        if (expand_stack_locked(vma, addr))
                return NULL;

        if (vma->vm_flags & VM_LOCKED)
                populate_vma_page_range(vma, addr, old_start, NULL);

        return vma;
}
#endif

/*
 * Direction-specific expansion helpers used by expand_stack().  Only the
 * direction matching the kernel configuration performs an expansion; the
 * other one evaluates to -EFAULT without touching its arguments.
 */
#if defined(CONFIG_STACK_GROWSUP)

#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
#define vma_expand_down(vma, addr) (-EFAULT)

#else

#define vma_expand_up(vma,addr) (-EFAULT)
#define vma_expand_down(vma, addr) expand_downwards(vma, addr)

#endif

/*
 * expand_stack(): legacy interface for page faulting. Don't use unless
 * you have to.
 *
 * This is called with the mm locked for reading, drops the lock, takes
 * the lock for writing, tries to look up a vma again, expands it if
 * necessary, and downgrades the lock to reading again.
 *
 * If no vma is found or it can't be expanded, it returns NULL and has
 * dropped the lock.
 */
struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma, *prev;
        struct vm_area_struct *result = NULL;

        /* Trade the read lock for the write lock; give up if killed. */
        mmap_read_unlock(mm);
        if (mmap_write_lock_killable(mm))
                return NULL;

        /* The lock was dropped, so the lookup has to be done again. */
        vma = find_vma_prev(mm, addr, &prev);
        if (vma && vma->vm_start <= addr)
                result = vma;           /* addr is already covered */
        else if (prev && !vma_expand_up(prev, addr))
                result = prev;          /* grew the lower neighbour upwards */
        else if (vma && !vma_expand_down(vma, addr))
                result = vma;           /* grew the upper neighbour downwards */

        if (!result) {
                /* Failure: return with the lock dropped entirely. */
                mmap_write_unlock(mm);
                return NULL;
        }

        /* Success: hand back the VMA with the lock held for reading again. */
        mmap_write_downgrade(mm);
        return result;
}

/*
 * Ok - we have the memory areas we should free on a maple tree so release them,
 * and do the vma updates.
 *
 * Called with the mm semaphore held.
 */
static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
{
        struct vm_area_struct *vma;
        unsigned long accounted_pages = 0;

        /* Update high watermark before we lower total_vm */
        update_hiwater_vm(mm);

        mas_for_each(mas, vma, ULONG_MAX) {
                const long pages = vma_pages(vma);

                /* Take the pages out of the mm statistics ... */
                vm_stat_account(mm, vma->vm_flags, -pages);
                /* ... and remember those that were VM_ACCOUNT. */
                if (vma->vm_flags & VM_ACCOUNT)
                        accounted_pages += pages;
                remove_vma(vma, false);
        }

        /* Return all accounted pages in a single call. */
        vm_unacct_memory(accounted_pages);
}

/*
 * Get rid of page table information in the indicated region.
 *
 * Called with the mm semaphore held.
 */
static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
                struct vm_area_struct *vma, struct vm_area_struct *prev,
                struct vm_area_struct *next, unsigned long start,
                unsigned long end, unsigned long tree_end, bool mm_wr_locked)
{
        struct mmu_gather tlb;
        unsigned long floor, ceiling;
        const unsigned long saved_index = mas->index;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm);
        update_hiwater_rss(mm);
        unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked);

        /* Put the maple state back where it started before the second pass. */
        mas_set(mas, saved_index);

        /* Page tables may be freed between the neighbouring VMAs, if any. */
        floor = prev ? prev->vm_end : FIRST_USER_ADDRESS;
        ceiling = next ? next->vm_start : USER_PGTABLES_CEILING;
        free_pgtables(&tlb, mas, vma, floor, ceiling, mm_wr_locked);

        tlb_finish_mmu(&tlb);
}

/*
 * __split_vma() bypasses sysctl_max_map_count checking.  We use this where it
 * has already been checked or doesn't make sense to fail.
 * VMA Iterator will point to the end VMA.
 *
 * @vmi: The vma iterator
 * @vma: The VMA to split; @addr must lie strictly inside it
 * @addr: The address at which to split
 * @new_below: If non-zero, the newly allocated VMA covers the part below
 * @addr and @vma keeps the part above; otherwise the new VMA covers the part
 * from @addr upwards and @vma keeps the part below.
 *
 * Return: 0 on success, the error returned by ->may_split(),
 * vma_dup_policy() or anon_vma_clone(), or -ENOMEM if an allocation fails.
 */
static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
                       unsigned long addr, int new_below)
{
        struct vma_prepare vp;
        struct vm_area_struct *new;
        int err;

        /* addr has to fall strictly between vm_start and vm_end. */
        WARN_ON(vma->vm_start >= addr);
        WARN_ON(vma->vm_end <= addr);

        /* Give the VMA's owner a chance to refuse the split. */
        if (vma->vm_ops && vma->vm_ops->may_split) {
                err = vma->vm_ops->may_split(vma, addr);
                if (err)
                        return err;
        }

        new = vm_area_dup(vma);
        if (!new)
                return -ENOMEM;

        /* Trim the copy to the half it is going to represent. */
        if (new_below) {
                new->vm_end = addr;
        } else {
                new->vm_start = addr;
                new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
        }

        err = -ENOMEM;
        vma_iter_config(vmi, new->vm_start, new->vm_end);
        if (vma_iter_prealloc(vmi, new))
                goto out_free_vma;

        err = vma_dup_policy(vma, new);
        if (err)
                goto out_free_vmi;

        err = anon_vma_clone(new, vma);
        if (err)
                goto out_free_mpol;

        /* No failure paths past this point. */
        if (new->vm_file)
                get_file(new->vm_file);

        if (new->vm_ops && new->vm_ops->open)
                new->vm_ops->open(new);

        vma_start_write(vma);
        vma_start_write(new);

        init_vma_prep(&vp, vma);
        vp.insert = new;
        vma_prepare(&vp);
        vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);

        /* Shrink the original VMA to the half it keeps. */
        if (new_below) {
                vma->vm_start = addr;
                vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
        } else {
                vma->vm_end = addr;
        }

        /* vma_complete stores the new vma */
        vma_complete(&vp, vmi, vma->vm_mm);

        /* Success. */
        if (new_below)
                vma_next(vmi);
        return 0;

        /* Error unwinding, in reverse order of acquisition. */
out_free_mpol:
        mpol_put(vma_policy(new));
out_free_vmi:
        vma_iter_free(vmi);
out_free_vma:
        vm_area_free(new);
        return err;
}

/*
 * Split a vma into two pieces at address 'addr', a new vma is allocated
 * either for the first part or the tail.
 */
static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
                     unsigned long addr, int new_below)
{
        if (vma->vm_mm->map_count >= sysctl_max_map_count)
                return -ENOMEM;

        return __split_vma(vmi, vma, addr, new_below);
}

/*
 * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
 * context and anonymous VMA name within the range [start, end).
 *
 * As a result, we might be able to merge the newly modified VMA range with an
 * adjacent VMA with identical properties.
 *
 * If no merge is possible and the range does not span the entirety of the VMA,
 * we then need to split the VMA to accommodate the change.
 *
 * The function returns either the merged VMA, the original VMA if a split was
 * required instead, or an error if the split failed.
 */
struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
                                  struct vm_area_struct *prev,
                                  struct vm_area_struct *vma,
                                  unsigned long start, unsigned long end,
                                  unsigned long vm_flags,
                                  struct mempolicy *policy,
                                  struct vm_userfaultfd_ctx uffd_ctx,
                                  struct anon_vma_name *anon_name)
{
        const pgoff_t pgoff = vma->vm_pgoff +
                              ((start - vma->vm_start) >> PAGE_SHIFT);
        struct vm_area_struct *merged;
        int err = 0;

        /* First choice: fold the modified range into a neighbour. */
        merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
                           pgoff, policy, uffd_ctx, anon_name);
        if (merged)
                return merged;

        /* No merge: cut off whatever part of the VMA lies outside the range. */
        if (vma->vm_start < start)
                err = split_vma(vmi, vma, start, 1);
        if (!err && vma->vm_end > end)
                err = split_vma(vmi, vma, end, 0);
        if (err)
                return ERR_PTR(err);

        return vma;
}

/*
 * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
 * must ensure that [start, end) does not overlap any existing VMA.
 */
static struct vm_area_struct
*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
                   struct vm_area_struct *vma, unsigned long start,
                   unsigned long end, pgoff_t pgoff)
{
        /* Every attribute handed to vma_merge() is read from @vma itself. */
        struct mempolicy *policy = vma_policy(vma);
        struct anon_vma_name *name = anon_vma_name(vma);
        struct vm_area_struct *merged;

        merged = vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
                           policy, vma->vm_userfaultfd_ctx, name);

        return merged;
}

/*
 * Expand vma by delta bytes, potentially merging with an immediately adjacent
 * VMA with identical properties.
 */
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
                                        struct vm_area_struct *vma,
                                        unsigned long delta)
{
        /* The extension covers [vma->vm_end, vma->vm_end + delta). */
        unsigned long ext_start = vma->vm_end;
        unsigned long ext_end = vma->vm_end + delta;
        /* Page offset of the first page following the current VMA. */
        pgoff_t ext_pgoff = vma->vm_pgoff + vma_pages(vma);

        /* vma is specified as prev, so case 1 or 2 will apply. */
        return vma_merge(vmi, vma, vma, ext_start, ext_end,
                         vma->vm_flags, ext_pgoff, vma_policy(vma),
                         vma->vm_userfaultfd_ctx, anon_vma_name(vma));
}

/*
 * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
 * @vmi: The vma iterator
 * @vma: The starting vm_area_struct
 * @mm: The mm_struct
 * @start: The aligned start address to munmap.
 * @end: The aligned end address to munmap.
 * @uf: The userfaultfd list_head
 * @unlock: Set to true to drop the mmap_lock.  unlocking only happens on
 * success.
 *
 * Return: 0 on success and drops the lock if so directed, error and leaves the
 * lock held otherwise.
 */
static int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
                    struct mm_struct *mm, unsigned long start,
                    unsigned long end, struct list_head *uf, bool unlock)
{
        struct vm_area_struct *prev, *next = NULL;
        /* Side tree, indexed 0..count-1, collecting every VMA being removed. */
        struct maple_tree mt_detach;
        int count = 0;
        /* Default error; returned unchanged by the map_count_exceeded path. */
        int error = -ENOMEM;
        /* Pages of VM_LOCKED VMAs that are being removed. */
        unsigned long locked_vm = 0;
        MA_STATE(mas_detach, &mt_detach, 0, 0);
        mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
        mt_on_stack(mt_detach);

        /*
         * If we need to split any vma, do it now to save pain later.
         *
         * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
         * unmapped vm_area_struct will remain in use: so lower split_vma
         * places tmp vma above, and higher split_vma places tmp vma below.
         */

        /* Does it split the first one? */
        if (start > vma->vm_start) {

                /*
                 * Make sure that map_count on return from munmap() will
                 * not exceed its limit; but let map_count go just above
                 * its limit temporarily, to help free resources as expected.
                 */
                if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
                        goto map_count_exceeded;

                error = __split_vma(vmi, vma, start, 1);
                if (error)
                        goto start_split_failed;
        }

        /*
         * Detach a range of VMAs from the mm. Using next as a temp variable as
         * it is always overwritten.
         */
        next = vma;
        do {
                /* Does it split the end? */
                if (next->vm_end > end) {
                        error = __split_vma(vmi, next, end, 0);
                        if (error)
                                goto end_split_failed;
                }
                vma_start_write(next);
                /* Record this VMA in the side tree at index 'count'. */
                mas_set(&mas_detach, count);
                error = mas_store_gfp(&mas_detach, next, GFP_KERNEL);
                if (error)
                        goto munmap_gather_failed;
                vma_mark_detached(next, true);
                if (next->vm_flags & VM_LOCKED)
                        locked_vm += vma_pages(next);

                count++;
                if (unlikely(uf)) {
                        /*
                         * If userfaultfd_unmap_prep returns an error the vmas
                         * will remain split, but userland will get a
                         * highly unexpected error anyway. This is no
                         * different than the case where the first of the two
                         * __split_vma fails, but we don't undo the first
                         * split, even though we could. This failure is
                         * unlikely enough that it's not worth optimizing for.
                         */
                        error = userfaultfd_unmap_prep(next, start, end, uf);

                        if (error)
                                goto userfaultfd_error;
                }
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
                BUG_ON(next->vm_start < start);
                BUG_ON(next->vm_start > end);
#endif
        } for_each_vma_range(*vmi, next, end);

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
        /* Make sure no VMAs are about to be lost. */
        {
                MA_STATE(test, &mt_detach, 0, 0);
                struct vm_area_struct *vma_mas, *vma_test;
                int test_count = 0;

                /*
                 * Walk the mm tree and the side tree in lockstep: they must
                 * yield the same VMAs in the same order, 'count' in total.
                 */
                vma_iter_set(vmi, start);
                rcu_read_lock();
                vma_test = mas_find(&test, count - 1);
                for_each_vma_range(*vmi, vma_mas, end) {
                        BUG_ON(vma_mas != vma_test);
                        test_count++;
                        vma_test = mas_next(&test, count - 1);
                }
                rcu_read_unlock();
                BUG_ON(count != test_count);
        }
#endif

        /* Step the iterator back until it is no longer positioned past @start. */
        while (vma_iter_addr(vmi) > start)
                vma_iter_prev_range(vmi);

        /* Erase [start, end) from the mm's VMA tree; this may still fail. */
        error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
        if (error)
                goto clear_tree_failed;

        /* Point of no return */
        mm->locked_vm -= locked_vm;
        mm->map_count -= count;
        if (unlock)
                mmap_write_downgrade(mm);

        /* Look up the neighbours on each side of the range just cleared. */
        prev = vma_iter_prev_range(vmi);
        next = vma_next(vmi);
        if (next)
                vma_iter_prev_range(vmi);

        /*
         * We can free page tables without write-locking mmap_lock because VMAs
         * were isolated before we downgraded mmap_lock.
         */
        mas_set(&mas_detach, 1);
        unmap_region(mm, &mas_detach, vma, prev, next, start, end, count,
                     !unlock);
        /* Statistics and freeing VMAs */
        mas_set(&mas_detach, 0);
        remove_mt(mm, &mas_detach);
        validate_mm(mm);
        if (unlock)
                mmap_read_unlock(mm);

        __mt_destroy(&mt_detach);
        return 0;

        /*
         * Failure after at least one VMA may have been gathered: clear the
         * detached mark on everything stored in the side tree, then free the
         * side tree itself. Splits already performed are not undone.
         */
clear_tree_failed:
userfaultfd_error:
munmap_gather_failed:
end_split_failed:
        mas_set(&mas_detach, 0);
        mas_for_each(&mas_detach, next, end)
                vma_mark_detached(next, false);

        __mt_destroy(&mt_detach);
        /* Failure before anything was gathered: nothing to roll back. */
start_split_failed:
map_count_exceeded:
        validate_mm(mm);
        return error;
}

/*
 * do_vmi_munmap() - munmap a given range.
 * @vmi: The vma iterator
 * @mm: The mm_struct
 * @start: The start address to munmap
 * @len: The length of the range to munmap
 * @uf: The userfaultfd list_head
 * @unlock: set to true if the user wants to drop the mmap_lock on success
 *
 * This function takes a @vmi that is either pointing to the previous VMA or set
 * to MA_START and sets it up to remove the mapping(s).  The @len will be
 * aligned and any arch_unmap work will be performed.
 *
 * Return: 0 on success and drops the lock if so directed, error and leaves the
 * lock held otherwise.
 */
int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
                  unsigned long start, size_t len, struct list_head *uf,
                  bool unlock)
{
        struct vm_area_struct *first;
        unsigned long end;

        /* The start address must be page aligned ... */
        if (offset_in_page(start))
                return -EINVAL;

        /* ... and the whole range must fit below TASK_SIZE. */
        if (start > TASK_SIZE || len > TASK_SIZE - start)
                return -EINVAL;

        /* Round the length up to whole pages; reject an empty range. */
        end = start + PAGE_ALIGN(len);
        if (end == start)
                return -EINVAL;

        /*
         * Sealed memory must be detected before arch_unmap() runs, so that
         * a sealed VMA is never unmapped.
         * can_modify_mm assumes we have acquired the lock on MM.
         */
        if (unlikely(!can_modify_mm(mm, start, end)))
                return -EPERM;

        /* arch_unmap() might do unmaps itself.  */
        arch_unmap(mm, start, end);

        /* Find the first overlapping VMA */
        first = vma_find(vmi, end);
        if (first)
                return do_vmi_align_munmap(vmi, first, mm, start, end, uf,
                                           unlock);

        /* Nothing is mapped in the range: succeed, honouring @unlock. */
        if (unlock)
                mmap_write_unlock(mm);

        return 0;
}

/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
 * @mm: The mm_struct
 * @start: The start address to munmap
 * @len: The length to be munmapped.
 * @uf: The userfaultfd list_head
 *
 * Return: 0 on success, error otherwise.
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
              struct list_head *uf)
{
        /* This wrapper never asks the callee to drop the mmap lock. */
        const bool unlock = false;
        VMA_ITERATOR(vmi, mm, start);

        return do_vmi_munmap(&vmi, mm, start, len, uf, unlock);
}

/*
 * mmap_region() - Establish a mapping of [addr, addr + len) in current->mm.
 * @file: The file to map; NULL selects the shared-anonymous or anonymous path
 * @addr: The start address of the mapping
 * @len: The length of the mapping in bytes
 * @vm_flags: The VMA flags requested for the mapping
 * @pgoff: The page offset used for the new VMA and for merge checks
 * @uf: The userfaultfd list_head, handed to do_vmi_munmap()
 *
 * Any existing mapping in the range is unmapped first.  The function then
 * tries to expand an adjacent VMA to cover the range; if that is not possible
 * a new VMA is allocated, set up (calling the file's mmap handler for file
 * mappings) and stored in the VMA tree.
 *
 * NOTE(review): callers presumably hold the mmap write lock - confirm.
 *
 * Return: @addr on success, a negative errno value on failure.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
                unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
                struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct vm_area_struct *next, *prev, *merge;
        pgoff_t pglen = len >> PAGE_SHIFT;
        /* Pages charged via security_vm_enough_memory_mm(); undone on error. */
        unsigned long charged = 0;
        unsigned long end = addr + len;
        /* Range passed to vma_expand(); widened when prev/next can absorb us. */
        unsigned long merge_start = addr, merge_end = end;
        /* True once mapping_map_writable() succeeded and must be balanced. */
        bool writable_file_mapping = false;
        /* Only assigned (and only read) when an expansion candidate is found. */
        pgoff_t vm_pgoff;
        int error;
        VMA_ITERATOR(vmi, mm, addr);

        /* Check against address space limit. */
        if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
                unsigned long nr_pages;

                /*
                 * MAP_FIXED may remove pages of mappings that intersects with
                 * requested mapping. Account for the pages it would unmap.
                 */
                nr_pages = count_vma_pages_range(mm, addr, end);

                if (!may_expand_vm(mm, vm_flags,
                                        (len >> PAGE_SHIFT) - nr_pages))
                        return -ENOMEM;
        }

        /* Unmap any existing mapping in the area */
        error = do_vmi_munmap(&vmi, mm, addr, len, uf, false);
        /* -EPERM is passed through; every other failure becomes -ENOMEM. */
        if (error == -EPERM)
                return error;
        else if (error)
                return -ENOMEM;

        /*
         * Private writable mapping: check memory availability
         */
        if (accountable_mapping(file, vm_flags)) {
                charged = len >> PAGE_SHIFT;
                if (security_vm_enough_memory_mm(mm, charged))
                        return -ENOMEM;
                vm_flags |= VM_ACCOUNT;
        }

        next = vma_next(&vmi);
        prev = vma_prev(&vmi);
        /* VM_SPECIAL mappings skip the expansion attempt entirely. */
        if (vm_flags & VM_SPECIAL) {
                if (prev)
                        vma_iter_next_range(&vmi);
                goto cannot_expand;
        }

        /* Attempt to expand an old mapping */
        /* Check next */
        if (next && next->vm_start == end && !vma_policy(next) &&
            can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
                                 NULL_VM_UFFD_CTX, NULL)) {
                merge_end = next->vm_end;
                vma = next;
                vm_pgoff = next->vm_pgoff - pglen;
        }

        /* Check prev */
        if (prev && prev->vm_end == addr && !vma_policy(prev) &&
            (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
                                       pgoff, vma->vm_userfaultfd_ctx, NULL) :
                   can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
                                       NULL_VM_UFFD_CTX, NULL))) {
                merge_start = prev->vm_start;
                vma = prev;
                vm_pgoff = prev->vm_pgoff;
        } else if (prev) {
                vma_iter_next_range(&vmi);
        }

        /* Actually expand, if possible */
        if (vma &&
            !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
                khugepaged_enter_vma(vma, vm_flags);
                goto expanded;
        }

        /* Expansion was not done; reposition the iterator if prev was chosen. */
        if (vma == prev)
                vma_iter_set(&vmi, addr);
cannot_expand:

        /*
         * Determine the object being mapped and call the appropriate
         * specific mapper. the address has already been validated, but
         * not unmapped, but the maps are removed from the list.
         */
        vma = vm_area_alloc(mm);
        if (!vma) {
                error = -ENOMEM;
                goto unacct_error;
        }

        vma_iter_config(&vmi, addr, end);
        vma_set_range(vma, addr, end, pgoff);
        vm_flags_init(vma, vm_flags);
        vma->vm_page_prot = vm_get_page_prot(vm_flags);

        if (file) {
                /* The VMA holds its own file reference from here on. */
                vma->vm_file = get_file(file);
                error = call_mmap(file, vma);
                if (error)
                        goto unmap_and_free_vma;

                if (vma_is_shared_maywrite(vma)) {
                        error = mapping_map_writable(file->f_mapping);
                        if (error)
                                goto close_and_free_vma;

                        writable_file_mapping = true;
                }

                /*
                 * Expansion is handled above, merging is handled below.
                 * Drivers should not alter the address of the VMA.
                 */
                error = -EINVAL;
                if (WARN_ON((addr != vma->vm_start)))
                        goto close_and_free_vma;

                vma_iter_config(&vmi, addr, end);
                /*
                 * If vm_flags changed after call_mmap(), we should try merge
                 * vma again as we may succeed this time.
                 */
                if (unlikely(vm_flags != vma->vm_flags && prev)) {
                        merge = vma_merge_new_vma(&vmi, prev, vma,
                                                  vma->vm_start, vma->vm_end,
                                                  vma->vm_pgoff);
                        if (merge) {
                                /*
                                 * ->mmap() can change vma->vm_file and fput
                                 * the original file. So fput the vma->vm_file
                                 * here or we would add an extra fput for file
                                 * and cause general protection fault
                                 * ultimately.
                                 */
                                fput(vma->vm_file);
                                vm_area_free(vma);
                                vma = merge;
                                /* Update vm_flags to pick up the change. */
                                vm_flags = vma->vm_flags;
                                goto unmap_writable;
                        }
                }

                vm_flags = vma->vm_flags;
        } else if (vm_flags & VM_SHARED) {
                error = shmem_zero_setup(vma);
                if (error)
                        goto free_vma;
        } else {
                vma_set_anonymous(vma);
        }

        if (map_deny_write_exec(vma, vma->vm_flags)) {
                error = -EACCES;
                goto close_and_free_vma;
        }

        /* Allow architectures to sanity-check the vm_flags */
        error = -EINVAL;
        if (!arch_validate_flags(vma->vm_flags))
                goto close_and_free_vma;

        error = -ENOMEM;
        if (vma_iter_prealloc(&vmi, vma))
                goto close_and_free_vma;

        /* Lock the VMA since it is modified after insertion into VMA tree */
        vma_start_write(vma);
        vma_iter_store(&vmi, vma);
        mm->map_count++;
        vma_link_file(vma);

        /*
         * vma_merge() calls khugepaged_enter_vma() as well, the below
         * call covers the non-merge case.
         */
        khugepaged_enter_vma(vma, vma->vm_flags);

        /* Once vma denies write, undo our temporary denial count */
unmap_writable:
        if (writable_file_mapping)
                mapping_unmap_writable(file->f_mapping);
        /* From here on 'file' refers to whatever the VMA ended up with. */
        file = vma->vm_file;
        ksm_add_vma(vma);
expanded:
        perf_event_mmap(vma);

        vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
                /* These kinds of VMA lose the lock bits instead of being counted. */
                if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
                                        is_vm_hugetlb_page(vma) ||
                                        vma == get_gate_vma(current->mm))
                        vm_flags_clear(vma, VM_LOCKED_MASK);
                else
                        mm->locked_vm += (len >> PAGE_SHIFT);
        }

        if (file)
                uprobe_mmap(vma);

        /*
         * New (or expanded) vma always get soft dirty status.
         * Otherwise user-space soft-dirty page tracker won't
         * be able to distinguish situation when vma area unmapped,
         * then new mapped in-place (which must be aimed as
         * a completely new data area).
         */
        vm_flags_set(vma, VM_SOFTDIRTY);

        vma_set_page_prot(vma);

        validate_mm(mm);
        return addr;

        /*
         * Error unwinding.  The labels are ordered so that each failure
         * point jumps to the first cleanup step it still needs and then
         * falls through the remaining ones.
         */
close_and_free_vma:
        if (file && vma->vm_ops && vma->vm_ops->close)
                vma->vm_ops->close(vma);

        if (file || vma->vm_file) {
                /*
                 * Entered directly (bypassing the close call above) when
                 * call_mmap() itself failed.
                 */
unmap_and_free_vma:
                fput(vma->vm_file);
                vma->vm_file = NULL;

                vma_iter_set(&vmi, vma->vm_end);
                /* Undo any partial mapping done by a device driver. */
                unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
                             vma->vm_end, vma->vm_end, true);
        }
        if (writable_file_mapping)
                mapping_unmap_writable(file->f_mapping);
free_vma:
        vm_area_free(vma);
unacct_error:
        if (charged)
                vm_unacct_memory(charged);
        validate_mm(mm);
        return error;
}

static int __vm_munmap(unsigned long start, size_t len, bool unlock)
{
        struct mm_struct *mm = current->mm;
        VMA_ITERATOR(vmi, mm, start);
        LIST_HEAD(uf);
        int ret;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);

        /*
         * do_vmi_munmap() only drops the lock itself when it was asked to
         * and it succeeded; in every other case the lock is still held
         * and has to be released here.
         */
        if (!unlock || ret)
                mmap_write_unlock(mm);

        userfaultfd_unmap_complete(mm, &uf);

        return ret;
}

int vm_munmap(unsigned long start, size_t len)
{
        /*
         * unlock == false: __vm_munmap() releases the mmap write lock
         * itself instead of asking the unmap path to drop it.
         */
        const bool unlock = false;

        return __vm_munmap(start, len, unlock);
}
EXPORT_SYMBOL(vm_munmap);

SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
        /* Strip any address tag before the range is interpreted. */
        unsigned long untagged = untagged_addr(addr);

        /* The syscall path lets the unmap code drop the mmap lock on success. */
        return __vm_munmap(untagged, len, true);
}


/*
 * Emulation of deprecated remap_file_pages() syscall.
 */
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        struct file *file;
        unsigned long populate = 0;
        unsigned long ret = -EINVAL;
        unsigned long end;

        pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
                     current->comm, current->pid);

        /* A non-zero prot argument is rejected. */
        if (prot)
                return ret;

        /* Truncate both values down to a page boundary. */
        start &= PAGE_MASK;
        size &= PAGE_MASK;
        end = start + size;

        /* Reject an empty range as well as one that wraps around. */
        if (end <= start)
                return ret;

        /* Does pgoff wrap? */
        if (pgoff + (size >> PAGE_SHIFT) < pgoff)
                return ret;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* The range must begin inside a shared mapping. */
        vma = vma_lookup(mm, start);
        if (!vma)
                goto out;
        if (!(vma->vm_flags & VM_SHARED))
                goto out;

        if (end > vma->vm_end) {
                /*
                 * The range extends past the first VMA: every following VMA
                 * up to the end of the range must be contiguous with its
                 * predecessor and share the first VMA's file and flags.
                 */
                VMA_ITERATOR(vmi, mm, vma->vm_end);
                struct vm_area_struct *cur, *last = vma;

                for_each_vma_range(vmi, cur, end) {
                        bool compatible = cur->vm_start == last->vm_end &&
                                          cur->vm_file == vma->vm_file &&
                                          cur->vm_flags == vma->vm_flags;

                        if (!compatible)
                                goto out;

                        if (end <= cur->vm_end)
                                break;

                        last = cur;
                }

                /* Ran out of VMAs before the end of the range was reached. */
                if (!cur)
                        goto out;
        }

        /* Rebuild the protection bits from the existing mapping. */
        if (vma->vm_flags & VM_READ)
                prot |= PROT_READ;
        if (vma->vm_flags & VM_WRITE)
                prot |= PROT_WRITE;
        if (vma->vm_flags & VM_EXEC)
                prot |= PROT_EXEC;

        /* Only MAP_NONBLOCK is kept from the caller's flags. */
        flags = (flags & MAP_NONBLOCK) | MAP_SHARED | MAP_FIXED | MAP_POPULATE;
        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;

        /* Hold a file reference across the do_mmap() call. */
        file = get_file(vma->vm_file);
        ret = do_mmap(vma->vm_file, start, size,
                        prot, flags, 0, pgoff, &populate, NULL);
        fput(file);
out:
        mmap_write_unlock(mm);
        if (populate)
                mm_populate(ret, populate);
        /* Success is reported as 0; error values are returned unchanged. */
        if (!IS_ERR_VALUE(ret))
                ret = 0;
        return ret;
}

/*
 * do_vma_munmap() - Unmap a full or partial vma.
 * @vmi: The vma iterator pointing at the vma
 * @vma: The first vma to be munmapped
 * @start: the start of the address to unmap
 * @end: The end of the address to unmap
 * @uf: The userfaultfd list_head
 * @unlock: Drop the lock on success
 *
 * unmaps a VMA mapping when the vma iterator is already in position.
 * Does not handle alignment.
 *
 * Return: 0 on success and drops the lock if so directed, error on failure and
 * will still hold the lock.
 */
int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
                unsigned long start, unsigned long end, struct list_head *uf,
                bool unlock)
{
        struct mm_struct *mm = vma->vm_mm;
        int ret;

        /*
         * The sealing check has to come before arch_unmap() so that a
         * sealed VMA is never unmapped.
         * can_modify_mm assumes we have acquired the lock on MM.
         */
        if (unlikely(!can_modify_mm(mm, start, end)))
                return -EPERM;

        arch_unmap(mm, start, end);

        ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);

        return ret;
}

/*
 * do_brk_flags() - Increase the brk vma if the flags match.
 * @vmi: The vma iterator
 * @addr: The start address
 * @len: The length of the increase
 * @vma: The vma
 * @flags: The VMA Flags
 *
 * Extend the brk VMA from addr to addr + len.  If the VMA is NULL or the flags
 * do not match then create a new anonymous VMA.  Eventually we may be able to
 * do some brk-specific accounting here.
 */
static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
                unsigned long addr, unsigned long len, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vma_prepare vp;

        /*
         * Check against address space limits by the changed size
         * Note: This happens *after* clearing old mappings in some code paths.
         */
        flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
        if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
                return -ENOMEM;

        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        /* Every failure after this point goes through unacct_fail. */
        if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
                return -ENOMEM;

        /*
         * Expand the existing vma if possible; Note that singular lists do not
         * occur after forking, so the expand will only happen on new VMAs.
         */
        if (vma && vma->vm_end == addr && !vma_policy(vma) &&
            can_vma_merge_after(vma, flags, NULL, NULL,
                                addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
                /* Preallocate for the grown range [vm_start, addr + len). */
                vma_iter_config(vmi, vma->vm_start, addr + len);
                if (vma_iter_prealloc(vmi, vma))
                        goto unacct_fail;

                vma_start_write(vma);

                init_vma_prep(&vp, vma);
                vma_prepare(&vp);
                vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
                vma->vm_end = addr + len;
                vm_flags_set(vma, VM_SOFTDIRTY);
                vma_iter_store(vmi, vma);

                vma_complete(&vp, vmi, mm);
                khugepaged_enter_vma(vma, flags);
                goto out;
        }

        /* No expansion possible: step the iterator forward when @vma is set. */
        if (vma)
                vma_iter_next_range(vmi);
        /* create a vma struct for an anonymous mapping */
        vma = vm_area_alloc(mm);
        if (!vma)
                goto unacct_fail;

        vma_set_anonymous(vma);
        /* The page offset is derived from the virtual address. */
        vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
        vm_flags_init(vma, flags);
        vma->vm_page_prot = vm_get_page_prot(flags);
        vma_start_write(vma);
        if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
                goto mas_store_fail;

        mm->map_count++;
        validate_mm(mm);
        ksm_add_vma(vma);
out:
        /* Accounting shared by the expand path and the new-VMA path. */
        perf_event_mmap(vma);
        mm->total_vm += len >> PAGE_SHIFT;
        mm->data_vm += len >> PAGE_SHIFT;
        if (flags & VM_LOCKED)
                mm->locked_vm += (len >> PAGE_SHIFT);
        vm_flags_set(vma, VM_SOFTDIRTY);
        return 0;

mas_store_fail:
        vm_area_free(vma);
unacct_fail:
        /*
         * Same page count as passed to security_vm_enough_memory_mm()
         * above; presumably this gives that charge back.
         */
        vm_unacct_memory(len >> PAGE_SHIFT);
        return -ENOMEM;
}

int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        unsigned long len;
        int ret;
        bool populate;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, addr);

        len = PAGE_ALIGN(request);
        if (len < request)
                return -ENOMEM;
        if (!len)
                return 0;

        /* Until we need other flags, refuse anything except VM_EXEC. */
        if ((flags & (~VM_EXEC)) != 0)
                return -EINVAL;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = check_brk_limits(addr, len);
        if (ret)
                goto limits_failed;

        ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0);
        if (ret)
                goto munmap_failed;

        vma = vma_prev(&vmi);
        ret = do_brk_flags(&vmi, vma, addr, len, flags);
        populate = ((mm->def_flags & VM_LOCKED) != 0);
        mmap_write_unlock(mm);
        userfaultfd_unmap_complete(mm, &uf);
        if (populate && !ret)
                mm_populate(addr, len);
        return ret;

munmap_failed:
limits_failed:
        mmap_write_unlock(mm);
        return ret;
}
EXPORT_SYMBOL(vm_brk_flags);

/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
        struct mmu_gather tlb;
        struct vm_area_struct *vma;
        /* Pages of VM_ACCOUNT VMAs, handed to vm_unacct_memory() at the end. */
        unsigned long nr_accounted = 0;
        VMA_ITERATOR(vmi, mm, 0);
        /* Number of VMAs removed; checked against mm->map_count below. */
        int count = 0;

        /* mm's last user has gone, and it's about to be pulled down */
        mmu_notifier_release(mm);

        mmap_read_lock(mm);
        arch_exit_mmap(mm);

        vma = vma_next(&vmi);
        if (!vma || unlikely(xa_is_zero(vma))) {
                /* Can happen if dup_mmap() received an OOM */
                mmap_read_unlock(mm);
                /* The destroy path expects the write lock to be held. */
                mmap_write_lock(mm);
                goto destroy;
        }

        lru_add_drain();
        flush_cache_mm(mm);
        tlb_gather_mmu_fullmm(&tlb, mm);
        /* update_hiwater_rss(mm) here? but nobody should be looking */
        /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
        unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
        mmap_read_unlock(mm);

        /*
         * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
         * because the memory has been already freed.
         */
        set_bit(MMF_OOM_SKIP, &mm->flags);
        /* The remaining teardown runs with the mmap lock held for writing. */
        mmap_write_lock(mm);
        mt_clear_in_rcu(&mm->mm_mt);
        /* Reposition the iterator just past the first VMA before the walk. */
        vma_iter_set(&vmi, vma->vm_end);
        free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
                      USER_PGTABLES_CEILING, true);
        tlb_finish_mmu(&tlb);

        /*
         * Walk the list again, actually closing and freeing it, with preemption
         * enabled, without holding any MM locks besides the unreachable
         * mmap_write_lock.
         */
        vma_iter_set(&vmi, vma->vm_end);
        do {
                if (vma->vm_flags & VM_ACCOUNT)
                        nr_accounted += vma_pages(vma);
                remove_vma(vma, true);
                count++;
                cond_resched();
                vma = vma_next(&vmi);
        } while (vma && likely(!xa_is_zero(vma)));

        /* The number of VMAs removed must match the mm's own count. */
        BUG_ON(count != mm->map_count);

        trace_exit_mmap(mm);
destroy:
        /* On the early-exit path nr_accounted is still 0 here. */
        __mt_destroy(&mm->mm_mt);
        mmap_write_unlock(mm);
        vm_unacct_memory(nr_accounted);
}

/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree.  If vm_file is non-NULL
 * then i_mmap_rwsem is taken here.
 */
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
        unsigned long nr_pages = vma_pages(vma);

        /* Refuse to insert over anything that is already mapped. */
        if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
                return -ENOMEM;

        /* Accounted mappings have to pass the memory availability check. */
        if (vma->vm_flags & VM_ACCOUNT) {
                if (security_vm_enough_memory_mm(mm, nr_pages))
                        return -ENOMEM;
        }

        /*
         * The vm_pgoff of a purely anonymous vma should be irrelevant
         * until its first write fault, when page's anon_vma and index
         * are set.  But now set the vm_pgoff it will almost certainly
         * end up with (unless mremap moves it elsewhere before that
         * first wfault), so /proc/pid/maps tells a consistent story.
         *
         * By setting it to reflect the virtual start address of the
         * vma, merges and splits can happen in a seamless way, just
         * using the existing file pgoff checks and manipulations.
         * Similarly in do_mmap and in do_brk_flags.
         */
        if (vma_is_anonymous(vma)) {
                BUG_ON(vma->anon_vma);
                vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
        }

        if (!vma_link(mm, vma))
                return 0;

        /* Linking failed: give back the pages charged above, if any. */
        if (vma->vm_flags & VM_ACCOUNT)
                vm_unacct_memory(nr_pages);

        return -ENOMEM;
}

/*
 * Copy the vma structure to a new location in the same mm,
 * prior to moving page table entries, to effect an mremap move.
 *
 * @vmap:            in/out pointer to the source vma; rewritten to point at
 *                   the merged vma when the source itself was absorbed by
 *                   the merge.
 * @addr:            start address of the destination range.
 * @len:             length of the destination range in bytes.
 * @pgoff:           page offset for the new vma; overridden with
 *                   addr >> PAGE_SHIFT for a not-yet-faulted anonymous vma.
 * @need_rmap_locks: out parameter, written only when a vma is returned.
 *
 * Returns the vma covering the destination range (either an existing vma
 * extended by a merge or a freshly linked duplicate), or NULL on failure.
 */
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks)
{
        struct vm_area_struct *vma = *vmap;
        unsigned long vma_start = vma->vm_start;
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma, *prev;
        bool faulted_in_anon_vma = true;
        VMA_ITERATOR(vmi, mm, addr);

        /*
         * If anonymous vma has not yet been faulted, update new pgoff
         * to match new location, to increase its chance of merging.
         */
        if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
                pgoff = addr >> PAGE_SHIFT;
                faulted_in_anon_vma = false;
        }

        /*
         * Bail out if the vma found by the lookup starts before the end
         * of the destination range.
         */
        new_vma = find_vma_prev(mm, addr, &prev);
        if (new_vma && new_vma->vm_start < addr + len)
                return NULL;        /* should never get here */

        /* Prefer merging with a neighbour; duplicate only if that fails. */
        new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff);
        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
                 */
                if (unlikely(vma_start >= new_vma->vm_start &&
                             vma_start < new_vma->vm_end)) {
                        /*
                         * The only way we can get a vma_merge with
                         * self during an mremap is if the vma hasn't
                         * been faulted in yet and we were allowed to
                         * reset the dst vma->vm_pgoff to the
                         * destination address of the mremap to allow
                         * the merge to happen. mremap must change the
                         * vm_pgoff linearity between src and dst vmas
                         * (in turn preventing a vma_merge) to be
                         * safe. It is only safe to keep the vm_pgoff
                         * linear if there are no pages mapped yet.
                         */
                        VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
                        *vmap = vma = new_vma;
                }
                *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
        } else {
                /*
                 * No merge possible: build a duplicate of the source vma
                 * at the destination.  Each step that can fail jumps to
                 * the label that undoes the steps completed before it.
                 */
                new_vma = vm_area_dup(vma);
                if (!new_vma)
                        goto out;
                vma_set_range(new_vma, addr, addr + len, pgoff);
                if (vma_dup_policy(vma, new_vma))
                        goto out_free_vma;
                if (anon_vma_clone(new_vma, vma))
                        goto out_free_mempol;
                /* The duplicate holds its own reference on the file. */
                if (new_vma->vm_file)
                        get_file(new_vma->vm_file);
                if (new_vma->vm_ops && new_vma->vm_ops->open)
                        new_vma->vm_ops->open(new_vma);
                if (vma_link(mm, new_vma))
                        goto out_vma_link;
                *need_rmap_locks = false;
        }
        return new_vma;

        /* Error unwinding: labels fall through, undoing in reverse order. */
out_vma_link:
        if (new_vma->vm_ops && new_vma->vm_ops->close)
                new_vma->vm_ops->close(new_vma);

        if (new_vma->vm_file)
                fput(new_vma->vm_file);

        unlink_anon_vmas(new_vma);
out_free_mempol:
        mpol_put(vma_policy(new_vma));
out_free_vma:
        vm_area_free(new_vma);
out:
        return NULL;
}

/*
 * Return true if the calling process may expand its vm space by the passed
 * number of pages
 */
bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
{
        /* The address-space limit applies to every kind of mapping. */
        if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
                return false;

        /* Only data mappings are checked against RLIMIT_DATA. */
        if (!is_data_mapping(flags))
                return true;

        if (mm->data_vm + npages <= rlimit(RLIMIT_DATA) >> PAGE_SHIFT)
                return true;

        /* Workaround for Valgrind */
        if (rlimit(RLIMIT_DATA) == 0 &&
            mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
                return true;

        pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
                     current->comm, current->pid,
                     (mm->data_vm + npages) << PAGE_SHIFT,
                     rlimit(RLIMIT_DATA),
                     ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");

        /* Over the data limit: allowed only when the limit is ignored. */
        if (!ignore_rlimit_data)
                return false;

        return true;
}

/*
 * Adjust the per-mm page counters by @npages (which may be negative).
 * total_vm is always updated; at most one of exec_vm, stack_vm or data_vm
 * is updated, chosen by the first matching classification of @flags.
 */
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
        WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);

        if (is_exec_mapping(flags)) {
                mm->exec_vm += npages;
                return;
        }

        if (is_stack_mapping(flags)) {
                mm->stack_vm += npages;
                return;
        }

        if (is_data_mapping(flags))
                mm->data_vm += npages;
}

static vm_fault_t special_mapping_fault(struct vm_fault *vmf);

/*
 * Having a close hook prevents vma merging regardless of flags.
 */
static void special_mapping_close(struct vm_area_struct *vma)
{
        /* Intentionally empty: the hook only needs to exist. */
}

static const char *special_mapping_name(struct vm_area_struct *vma)
{
        return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}

/*
 * mremap hook for special mappings.  Warns once and fails with -EFAULT
 * when @new_vma belongs to a different mm than the current task;
 * otherwise defers to the mapping's own mremap callback, if it has one.
 */
static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
        struct vm_special_mapping *spec = new_vma->vm_private_data;

        if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
                return -EFAULT;

        return spec->mremap ? spec->mremap(spec, new_vma) : 0;
}

/* may_split hook for special mappings: always refuses with -EINVAL. */
static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr)
{
        /*
         * Forbid splitting special mappings - kernel has expectations over
         * the number of pages in mapping. Together with VM_DONTEXPAND
         * the size of vma should stay the same over the special mapping's
         * lifetime.
         */
        return -EINVAL;
}

/*
 * Operations for mappings created by _install_special_mapping(), whose
 * vm_private_data points at a struct vm_special_mapping.
 */
static const struct vm_operations_struct special_mapping_vmops = {
        .close = special_mapping_close,
        .fault = special_mapping_fault,
        .mremap = special_mapping_mremap,
        .name = special_mapping_name,
        /* vDSO code relies that VVAR can't be accessed remotely */
        .access = NULL,
        .may_split = special_mapping_split,
};

/*
 * Operations for mappings created by install_special_mapping(), whose
 * vm_private_data is the struct page * array itself.
 */
static const struct vm_operations_struct legacy_special_mapping_vmops = {
        .close = special_mapping_close,
        .fault = special_mapping_fault,
};

/*
 * Fault handler shared by both special-mapping flavours.
 *
 * Legacy mappings keep the page array directly in vm_private_data.  The
 * newer flavour keeps a struct vm_special_mapping there, which may supply
 * its own fault callback; if it does, that callback decides the result.
 *
 * Otherwise the page array is walked up to vmf->pgoff entries, stopping
 * early at a NULL entry.  Landing on a NULL entry yields VM_FAULT_SIGBUS;
 * landing on a page takes a reference on it and hands it back in vmf->page.
 */
static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page **slot;
        struct page *page;
        pgoff_t remaining;

        if (vma->vm_ops == &legacy_special_mapping_vmops) {
                slot = vma->vm_private_data;
        } else {
                struct vm_special_mapping *spec = vma->vm_private_data;

                if (spec->fault)
                        return spec->fault(spec, vmf->vma, vmf);

                slot = spec->pages;
        }

        /* Step forward one array entry per page of offset. */
        remaining = vmf->pgoff;
        while (remaining && *slot) {
                remaining--;
                ++slot;
        }

        page = *slot;
        if (!page)
                return VM_FAULT_SIGBUS;

        get_page(page);
        vmf->page = page;
        return 0;
}

/*
 * Allocate a vma covering [addr, addr + len) in @mm, attach @ops and @priv
 * to it and insert it into the mm.
 *
 * The vma's flags are the caller's @vm_flags combined with the mm's
 * default flags plus VM_DONTEXPAND and VM_SOFTDIRTY, with every
 * VM_LOCKED_MASK bit cleared.
 *
 * Returns the new vma, or an ERR_PTR() carrying -ENOMEM (allocation
 * failure) or the error reported by insert_vm_struct().
 */
static struct vm_area_struct *__install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, void *priv,
        const struct vm_operations_struct *ops)
{
        struct vm_area_struct *vma = vm_area_alloc(mm);
        int err;

        if (unlikely(!vma))
                return ERR_PTR(-ENOMEM);

        vma_set_range(vma, addr, addr + len, 0);
        vm_flags_init(vma, (vm_flags | mm->def_flags |
                      VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        vma->vm_ops = ops;
        vma->vm_private_data = priv;

        err = insert_vm_struct(mm, vma);
        if (err) {
                /* Never became visible in the mm, so just free it. */
                vm_area_free(vma);
                return ERR_PTR(err);
        }

        vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
        perf_event_mmap(vma);

        return vma;
}

bool vma_is_special_mapping(const struct vm_area_struct *vma,
        const struct vm_special_mapping *sm)
{
        return vma->vm_private_data == sm &&
                (vma->vm_ops == &special_mapping_vmops ||
                 vma->vm_ops == &legacy_special_mapping_vmops);
}

/*
 * Called with mm->mmap_lock held for writing.
 * Insert a new vma covering the given region, with the given flags.
 * Its pages are supplied by the given array of struct page *.
 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
 * The region past the last page supplied will always produce SIGBUS.
 * The array pointer and the pages it points to are assumed to stay alive
 * for as long as this mapping might exist.
 */
struct vm_area_struct *_install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, const struct vm_special_mapping *spec)
{
        /* The vma keeps @spec as private data, so drop the const here. */
        void *priv = (void *)spec;

        return __install_special_mapping(mm, addr, len, vm_flags, priv,
                                         &special_mapping_vmops);
}

/*
 * Legacy variant of _install_special_mapping(): takes the page array
 * directly and reports only success (0) or a negative errno.
 */
int install_special_mapping(struct mm_struct *mm,
                            unsigned long addr, unsigned long len,
                            unsigned long vm_flags, struct page **pages)
{
        struct vm_area_struct *vma;

        vma = __install_special_mapping(mm, addr, len, vm_flags,
                                        (void *)pages,
                                        &legacy_special_mapping_vmops);

        return PTR_ERR_OR_ZERO(vma);
}

static DEFINE_MUTEX(mm_all_locks_mutex);

/*
 * Write-lock the root anon_vma's rwsem unless this mm_take_all_locks()
 * pass already did so.  Bit 0 of the root's rb_root.rb_node pointer is
 * used as the "already locked" marker.
 */
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
        if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
                /*
                 * The LSB of rb_root.rb_node can't change from under us
                 * because we hold the mm_all_locks_mutex.
                 */
                down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
                /*
                 * We can safely modify rb_root.rb_node after taking the
                 * anon_vma->root->rwsem. If some other vma in this mm shares
                 * the same anon_vma we won't take it again.
                 *
                 * No need of atomic instructions here, rb_root.rb_node
                 * can't change from under us thanks to the
                 * anon_vma->root->rwsem.
                 */
                if (__test_and_set_bit(0, (unsigned long *)
                                       &anon_vma->root->rb_root.rb_root.rb_node))
                        BUG();
        }
}

/*
 * Write-lock @mapping's i_mmap_rwsem unless this mm_take_all_locks() pass
 * already did so.  AS_MM_ALL_LOCKS in mapping->flags is the "already
 * locked" marker and is set before the rwsem is taken.
 */
static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
        if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                /*
                 * AS_MM_ALL_LOCKS can't change from under us because
                 * we hold the mm_all_locks_mutex.
                 *
                 * Operations on ->flags have to be atomic because
                 * even if AS_MM_ALL_LOCKS is stable thanks to the
                 * mm_all_locks_mutex, there may be other cpus
                 * changing other bitflags in parallel to us.
                 */
                if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                        BUG();
                down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
        }
}

/*
 * This operation locks against the VM for all pte/vma/mm related
 * operations that could ever happen on a certain mm. This includes
 * vmtruncate, try_to_unmap, and all page faults.
 *
 * The caller must take the mmap_lock in write mode before calling
 * mm_take_all_locks(). The caller isn't allowed to release the
 * mmap_lock until mm_drop_all_locks() returns.
 *
 * mmap_lock in write mode is required in order to block all operations
 * that could modify pagetables and free pages without need of
 * altering the vma layout. It's also needed in write mode to avoid new
 * anon_vmas to be associated with existing vmas.
 *
 * A single task can't take more than one mm_take_all_locks() in a row
 * or it would deadlock.
 *
 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
 * mapping->flags avoid to take the same lock twice, if more than one
 * vma in this mm is backed by the same anon_vma or address_space.
 *
 * We take locks in following order, accordingly to comment at beginning
 * of mm/rmap.c:
 *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
 *     hugetlb mapping);
 *   - all vmas marked locked
 *   - all i_mmap_rwsem locks;
 *   - all anon_vma->rwsem locks.
 *
 * We can take all locks within these types randomly because the VM code
 * doesn't nest them and we protected from parallel mm_take_all_locks() by
 * mm_all_locks_mutex.
 *
 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
 * that may have to take thousand of locks.
 *
 * mm_take_all_locks() can fail if it's interrupted by signals.
 */
/*
 * Returns 0 with mm_all_locks_mutex still held (it is released by
 * mm_drop_all_locks()), or -EINTR after dropping everything taken so far
 * when a signal is pending.
 */
int mm_take_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_assert_write_locked(mm);

        mutex_lock(&mm_all_locks_mutex);

        /*
         * vma_start_write() does not have a complement in mm_drop_all_locks()
         * because vma_start_write() is always asymmetrical; it marks a VMA as
         * being written to until mmap_write_unlock() or mmap_write_downgrade()
         * is reached.
         */
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                vma_start_write(vma);
        }

        /* Pass 2: i_mmap_rwsem of hugetlb file mappings only. */
        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        /* Pass 3: i_mmap_rwsem of all remaining (non-hugetlb) file mappings. */
        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                !is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        /* Pass 4: every anon_vma reachable through each vma's chain. */
        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->anon_vma)
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_lock_anon_vma(mm, avc->anon_vma);
        }

        return 0;

out_unlock:
        /* Releases whatever was locked so far, plus mm_all_locks_mutex. */
        mm_drop_all_locks(mm);
        return -EINTR;
}

/*
 * Undo vm_lock_anon_vma(): if the marker bit (bit 0 of the root's
 * rb_root.rb_node) is set, clear it and release the write lock.  Does
 * nothing when the marker is not set.
 */
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
        if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
                /*
                 * The LSB of rb_root.rb_node can't change to 0 from under
                 * us because we hold the mm_all_locks_mutex.
                 *
                 * We must however clear the bitflag before unlocking
                 * the vma so the users using the anon_vma->rb_root will
                 * never see our bitflag.
                 *
                 * No need of atomic instructions here, rb_root.rb_node
                 * can't change from under us until we release the
                 * anon_vma->root->rwsem.
                 */
                if (!__test_and_clear_bit(0, (unsigned long *)
                                          &anon_vma->root->rb_root.rb_root.rb_node))
                        BUG();
                anon_vma_unlock_write(anon_vma);
        }
}

/*
 * Undo vm_lock_mapping(): if AS_MM_ALL_LOCKS is set on @mapping, release
 * the i_mmap write lock and then clear the flag.  Does nothing when the
 * flag is not set.
 */
static void vm_unlock_mapping(struct address_space *mapping)
{
        if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                /*
                 * AS_MM_ALL_LOCKS can't change to 0 from under us
                 * because we hold the mm_all_locks_mutex.
                 */
                i_mmap_unlock_write(mapping);
                if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
                                        &mapping->flags))
                        BUG();
        }
}

/*
 * The mmap_lock cannot be released by the caller until
 * mm_drop_all_locks() returns.
 */
void mm_drop_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_assert_write_locked(mm);
        /* Must pair with the mutex_lock() in mm_take_all_locks(). */
        BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));

        /*
         * One pass releases both lock types.  The unlock helpers skip
         * objects whose marker bit is not set, so this also works when
         * mm_take_all_locks() bailed out after taking only some locks.
         */
        for_each_vma(vmi, vma) {
                if (vma->anon_vma)
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_unlock_anon_vma(avc->anon_vma);
                if (vma->vm_file && vma->vm_file->f_mapping)
                        vm_unlock_mapping(vma->vm_file->f_mapping);
        }

        mutex_unlock(&mm_all_locks_mutex);
}

/*
 * initialise the percpu counter for VM
 */
void __init mmap_init(void)
{
        /* Start the vm_committed_as counter at zero. */
        int err = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);

        VM_BUG_ON(err);
}

/*
 * Initialise sysctl_user_reserve_kbytes.
 *
 * This is intended to prevent a user from starting a single memory hogging
 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
 * mode.
 *
 * The default value is min(3% of free memory, 128MB)
 * 128MB is enough to recover with sshd/login, bash, and top/kill.
 */
static int init_user_reserve(void)
{
        unsigned long free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
        unsigned long share = free_kbytes / 32;

        /* 1/32 of free memory, capped at SZ_128K kbytes (128MB). */
        sysctl_user_reserve_kbytes = min(share, SZ_128K);

        return 0;
}
subsys_initcall(init_user_reserve);

/*
 * Initialise sysctl_admin_reserve_kbytes.
 *
 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
 * to log in and kill a memory hogging process.
 *
 * Systems with more than 256MB will reserve 8MB, enough to recover
 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
 * only reserve 3% of free pages by default.
 */
static int init_admin_reserve(void)
{
        unsigned long free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
        unsigned long share = free_kbytes / 32;

        /* 1/32 of free memory, capped at SZ_8K kbytes (8MB). */
        sysctl_admin_reserve_kbytes = min(share, SZ_8K);

        return 0;
}
subsys_initcall(init_admin_reserve);

/*
 * Reinitialise user and admin reserves if memory is added or removed.
 *
 * The default user reserve max is 128MB, and the default max for the
 * admin reserve is 8MB. These are usually, but not always, enough to
 * enable recovery from a memory hogging process using login/sshd, a shell,
 * and tools like top. It may make sense to increase or even disable the
 * reserve depending on the existence of swap or variations in the recovery
 * tools. So, the admin may have changed them.
 *
 * If memory is added and the reserves have been eliminated or increased above
 * the default max, then we'll trust the admin.
 *
 * If memory is removed and there isn't enough free memory, then we
 * need to reset the reserves.
 *
 * Otherwise keep the reserve set by the admin.
 */
static int reserve_mem_notifier(struct notifier_block *nb,
                             unsigned long action, void *data)
{
        unsigned long current_kb, free_kbytes;

        if (action == MEM_ONLINE) {
                /*
                 * Memory added: recompute a reserve only while it is
                 * non-zero and below its default maximum (128MB user,
                 * 8MB admin).  Anything else was set by the operator.
                 */
                current_kb = sysctl_user_reserve_kbytes;
                if (current_kb > 0 && current_kb < SZ_128K)
                        init_user_reserve();

                current_kb = sysctl_admin_reserve_kbytes;
                if (current_kb > 0 && current_kb < SZ_8K)
                        init_admin_reserve();
        } else if (action == MEM_OFFLINE) {
                /*
                 * Memory removed: a reserve larger than what is now free
                 * is reset and the new value is logged.
                 */
                free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

                if (sysctl_user_reserve_kbytes > free_kbytes) {
                        init_user_reserve();
                        pr_info("vm.user_reserve_kbytes reset to %lu\n",
                                sysctl_user_reserve_kbytes);
                }

                if (sysctl_admin_reserve_kbytes > free_kbytes) {
                        init_admin_reserve();
                        pr_info("vm.admin_reserve_kbytes reset to %lu\n",
                                sysctl_admin_reserve_kbytes);
                }
        }

        /* Every other action is ignored. */
        return NOTIFY_OK;
}

/*
 * Register reserve_mem_notifier() for memory hotplug events.  A failed
 * registration is logged but not treated as fatal; 0 is always returned.
 */
static int __meminit init_reserve_notifier(void)
{
        int err = hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI);

        if (err)
                pr_err("Failed registering memory add/remove notifier for admin reserve\n");

        return 0;
}
subsys_initcall(init_reserve_notifier);



























    3 
    3 

























    3 





    3 
    3 





    3 










    3 

































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
// SPDX-License-Identifier: GPL-2.0
/*
 * Convert integer string representation to an integer.
 * If an integer doesn't fit into specified type, -E is returned.
 *
 * Integer starts with optional sign.
 * kstrtou*() functions do not accept sign "-".
 *
 * Radix 0 means autodetection: leading "0x" implies radix 16,
 * leading "0" implies radix 8, otherwise radix is 10.
 * Autodetection hints work after optional sign, but not before.
 *
 * If -E is returned, result is not touched.
 */
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/kstrtox.h>
#include <linux/math64.h>
#include <linux/types.h>
#include <linux/uaccess.h>

#include "kstrtox.h"

/*
 * Resolve the radix for @s and skip a hexadecimal prefix.
 *
 * When *@base is 0 it is replaced by the detected radix: 16 for "0x"/"0X"
 * followed by a hex digit, 8 for any other leading "0", 10 otherwise.
 * When the resulting radix is 16 and @s starts with "0x"/"0X", the
 * returned pointer is advanced past those two characters; in every other
 * case @s is returned unchanged.
 */
noinline
const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
{
        unsigned int radix = *base;

        if (radix == 0) {
                if (s[0] != '0')
                        radix = 10;
                else if (_tolower(s[1]) == 'x' && isxdigit(s[2]))
                        radix = 16;
                else
                        radix = 8;
                *base = radix;
        }

        if (radix != 16)
                return s;

        if (s[0] == '0' && _tolower(s[1]) == 'x')
                return s + 2;

        return s;
}

/*
 * Convert non-negative integer string representation in explicitly given radix
 * to an integer. A maximum of max_chars characters will be converted.
 *
 * Return number of characters consumed maybe or-ed with overflow bit.
 * If overflow occurs, result integer (incorrect) is still returned.
 *
 * Don't you dare use this function.
 */
noinline
unsigned int _parse_integer_limit(const char *s, unsigned int base, unsigned long long *p,
                                  size_t max_chars)
{
        unsigned long long acc = 0;
        unsigned int consumed = 0;

        for (; max_chars != 0; max_chars--) {
                unsigned int c = *s;
                unsigned int lc = _tolower(c);
                unsigned int digit;

                /* Map the character to its digit value, or stop parsing. */
                if (c >= '0' && c <= '9')
                        digit = c - '0';
                else if (lc >= 'a' && lc <= 'f')
                        digit = lc - 'a' + 10;
                else
                        break;

                /* A digit that is not valid in this radix ends the number. */
                if (digit >= base)
                        break;

                /*
                 * The exact overflow test needs a division, so run it
                 * only once one of the top four bits is set.  Below that
                 * the next step cannot overflow even in base 16, the
                 * largest base supported.
                 */
                if (unlikely(acc & (~0ull << 60)) &&
                    acc > div_u64(ULLONG_MAX - digit, base))
                        consumed |= KSTRTOX_OVERFLOW;

                acc = acc * base + digit;
                consumed++;
                s++;
        }

        *p = acc;
        return consumed;
}

/*
 * Same as _parse_integer_limit() with the character limit set to INT_MAX.
 */
noinline
unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *p)
{
        return _parse_integer_limit(s, base, p, INT_MAX);
}

/*
 * Parse an unsigned number with no sign handling.  The whole string must
 * be consumed, apart from one optional trailing newline.
 *
 * Returns 0 and stores the value in *@res on success, -ERANGE on overflow,
 * and -EINVAL when no digit was parsed or trailing characters remain.
 * *@res is written only on success.
 */
static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res)
{
        unsigned long long value;
        unsigned int nr;
        const char *end;

        s = _parse_integer_fixup_radix(s, &base);
        nr = _parse_integer(s, base, &value);
        if (nr & KSTRTOX_OVERFLOW)
                return -ERANGE;
        if (!nr)
                return -EINVAL;

        end = s + nr;
        if (*end == '\n')
                end++;
        if (*end != '\0')
                return -EINVAL;

        *res = value;
        return 0;
}

/**
 * kstrtoull - convert a string to an unsigned long long
 * @s: The start of the string. The string must be null-terminated, and may also
 *  include a single newline before its terminating null. The first character
 *  may also be a plus sign, but not a minus sign.
 * @base: The number base to use. The maximum supported base is 16. If base is
 *  given as 0, then the base of the string is automatically detected with the
 *  conventional semantics - If it begins with 0x the number will be parsed as a
 *  hexadecimal (case insensitive), if it otherwise begins with 0, it will be
 *  parsed as an octal number. Otherwise it will be parsed as a decimal.
 * @res: Where to write the result of the conversion on success.
 *
 * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
 * Preferred over simple_strtoull(). Return code must be checked.
 */
noinline
int kstrtoull(const char *s, unsigned int base, unsigned long long *res)
{
        /* A single leading '+' is accepted and skipped. */
        const char *digits = (s[0] == '+') ? s + 1 : s;

        return _kstrtoull(digits, base, res);
}
EXPORT_SYMBOL(kstrtoull);

/**
 * kstrtoll - convert a string to a long long
 * @s: The start of the string. The string must be null-terminated, and may also
 *  include a single newline before its terminating null. The first character
 *  may also be a plus sign or a minus sign.
 * @base: The number base to use. The maximum supported base is 16. If base is
 *  given as 0, then the base of the string is automatically detected with the
 *  conventional semantics - If it begins with 0x the number will be parsed as a
 *  hexadecimal (case insensitive), if it otherwise begins with 0, it will be
 *  parsed as an octal number. Otherwise it will be parsed as a decimal.
 * @res: Where to write the result of the conversion on success.
 *
 * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
 * Preferred over simple_strtoll(). Return code must be checked.
 */
noinline
int kstrtoll(const char *s, unsigned int base, long long *res)
{
        unsigned long long magnitude;
        int err;

        if (s[0] != '-') {
                /* Non-negative input: must fit in the positive range. */
                err = kstrtoull(s, base, &magnitude);
                if (err < 0)
                        return err;
                if ((long long)magnitude < 0)
                        return -ERANGE;
                *res = magnitude;
                return 0;
        }

        /*
         * Negative input: parse what follows the '-' and negate it.  A
         * magnitude too large for long long shows up as a positive value
         * after negation.
         */
        err = _kstrtoull(s + 1, base, &magnitude);
        if (err < 0)
                return err;
        if ((long long)-magnitude > 0)
                return -ERANGE;
        *res = -magnitude;
        return 0;
}
EXPORT_SYMBOL(kstrtoll);

/* Internal, do not use. */
int _kstrtoul(const char *s, unsigned int base, unsigned long *res)
{
        unsigned long long wide;
        int err = kstrtoull(s, base, &wide);

        if (err < 0)
                return err;

        /* Reject values that do not survive narrowing to unsigned long. */
        if ((unsigned long)wide != wide)
                return -ERANGE;

        *res = wide;
        return 0;
}
EXPORT_SYMBOL(_kstrtoul);

/* Internal, do not use. */
int _kstrtol(const char *s, unsigned int base, long *res)
{
        long long wide;
        int err = kstrtoll(s, base, &wide);

        if (err < 0)
                return err;

        /* Reject values that do not survive narrowing to long. */
        if ((long)wide != wide)
                return -ERANGE;

        *res = wide;
        return 0;
}
EXPORT_SYMBOL(_kstrtol);

/**
 * kstrtouint - convert a string to an unsigned int
 * @s: The start of the string. The string must be null-terminated, and may also
 *  include a single newline before its terminating null. The first character
 *  may also be a plus sign, but not a minus sign.
 * @base: The number base to use. The maximum supported base is 16. If base is
 *  given as 0, then the base of the string is automatically detected with the
 *  conventional semantics - If it begins with 0x the number will be parsed as a
 *  hexadecimal (case insensitive), if it otherwise begins with 0, it will be
 *  parsed as an octal number. Otherwise it will be parsed as a decimal.
 * @res: Where to write the result of the conversion on success.
 *
 * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
 * Preferred over simple_strtoul(). Return code must be checked.
 */
noinline
int kstrtouint(const char *s, unsigned int base, unsigned int *res)
{
        unsigned long long wide;
        int err = kstrtoull(s, base, &wide);

        if (err < 0)
                return err;

        /* Reject values that do not survive narrowing to unsigned int. */
        if ((unsigned int)wide != wide)
                return -ERANGE;

        *res = wide;
        return 0;
}
EXPORT_SYMBOL(kstrtouint);

/**
 * kstrtoint - convert a string to an int
 * @s: The start of the string. The string must be null-terminated, and may also
 *  include a single newline before its terminating null. The first character
 *  may also be a plus sign or a minus sign.
 * @base: The number base to use. The maximum supported base is 16. If base is
 *  given as 0, then the base of the string is automatically detected with the
 *  conventional semantics - If it begins with 0x the number will be parsed as a
 *  hexadecimal (case insensitive), if it otherwise begins with 0, it will be
 *  parsed as an octal number. Otherwise it will be parsed as a decimal.
 * @res: Where to write the result of the conversion on success.
 *
 * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
 * Preferred over simple_strtol(). Return code must be checked.
 */
noinline
int kstrtoint(const char *s, unsigned int base, int *res)
{
        long long wide;
        int err = kstrtoll(s, base, &wide);

        if (err < 0)
                return err;

        /* Reject values that do not survive narrowing to int. */
        if ((int)wide != wide)
                return -ERANGE;

        *res = wide;
        return 0;
}
EXPORT_SYMBOL(kstrtoint);

/*
 * Convert a string to a u16 via kstrtoull().  Returns 0 on success,
 * -ERANGE when the value does not fit in a u16, or the error from
 * kstrtoull().  *@res is written only on success.
 */
noinline
int kstrtou16(const char *s, unsigned int base, u16 *res)
{
        unsigned long long wide;
        int err = kstrtoull(s, base, &wide);

        if (err < 0)
                return err;

        if ((u16)wide != wide)
                return -ERANGE;

        *res = wide;
        return 0;
}
EXPORT_SYMBOL(kstrtou16);

/*
 * Parse @s with kstrtoll() and store the result in @res as an s16.
 * Fails with -ERANGE when the parsed value does not survive narrowing
 * to s16; a negative return from kstrtoll() is passed through as is.
 */
noinline
int kstrtos16(const char *s, unsigned int base, s16 *res)
{
        long long wide;
        int err = kstrtoll(s, base, &wide);

        if (err < 0)
                return err;
        if ((s16)wide != wide)
                return -ERANGE;

        *res = wide;
        return 0;
}
EXPORT_SYMBOL(kstrtos16);

/*
 * Parse @s with kstrtoull() and store the result in @res as a u8.
 * Fails with -ERANGE when the parsed value does not survive narrowing
 * to u8; a negative return from kstrtoull() is passed through as is.
 */
noinline
int kstrtou8(const char *s, unsigned int base, u8 *res)
{
        unsigned long long wide;
        int err = kstrtoull(s, base, &wide);

        if (err < 0)
                return err;
        if ((u8)wide != wide)
                return -ERANGE;

        *res = wide;
        return 0;
}
EXPORT_SYMBOL(kstrtou8);

/*
 * Parse @s with kstrtoll() and store the result in @res as an s8.
 * Fails with -ERANGE when the parsed value does not survive narrowing
 * to s8; a negative return from kstrtoll() is passed through as is.
 */
noinline
int kstrtos8(const char *s, unsigned int base, s8 *res)
{
        long long wide;
        int err = kstrtoll(s, base, &wide);

        if (err < 0)
                return err;
        if ((s8)wide != wide)
                return -ERANGE;

        *res = wide;
        return 0;
}
EXPORT_SYMBOL(kstrtos8);

/**
 * kstrtobool - convert common user inputs into boolean values
 * @s: input string
 * @res: result
 *
 * This routine returns 0 iff the first character is one of 'YyTt1NnFf0', or
 * [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL.  Value
 * pointed to by res is updated upon finding a match.
 */
noinline
int kstrtobool(const char *s, bool *res)
{
        char first;
        char second;

        if (!s)
                return -EINVAL;

        first = s[0];

        /* Single-character "true" spellings: y/Y, t/T, 1 */
        if (first == 'y' || first == 'Y' ||
            first == 't' || first == 'T' || first == '1') {
                *res = true;
                return 0;
        }

        /* Single-character "false" spellings: n/N, f/F, 0 */
        if (first == 'n' || first == 'N' ||
            first == 'f' || first == 'F' || first == '0') {
                *res = false;
                return 0;
        }

        /* Anything else must start with 'o'/'O' to be "on" or "off" */
        if (first != 'o' && first != 'O')
                return -EINVAL;

        /* The second character tells "on" from "off" */
        second = s[1];
        if (second == 'n' || second == 'N') {
                *res = true;
                return 0;
        }
        if (second == 'f' || second == 'F') {
                *res = false;
                return 0;
        }

        return -EINVAL;
}
EXPORT_SYMBOL(kstrtobool);

/*
 * kstrtobool() has no "base" argument, so the generic _from_user helper
 * macro below does not fit it; the user-copy wrapper is written out by
 * hand here instead.
 */
int kstrtobool_from_user(const char __user *s, size_t count, bool *res)
{
        /* Longest string needed to differentiate, newline, terminator */
        char buf[4];
        size_t len = min(count, sizeof(buf) - 1);

        if (copy_from_user(buf, s, len))
                return -EFAULT;

        /* Terminate whatever was copied before handing it to the parser. */
        buf[len] = '\0';
        return kstrtobool(buf, res);
}
EXPORT_SYMBOL(kstrtobool_from_user);

/*
 * kstrto_from_user - generate and export a user-space variant of a parser
 * @f:    name of the function to define
 * @g:    kernel-string parser that @f calls as g(buf, base, res)
 * @type: type of the result object pointed to by @res
 *
 * The generated function copies at most sizeof(buf) - 1 bytes from the user
 * pointer into an on-stack buffer (longer input is cut off at that length),
 * NUL-terminates the copy and returns whatever @g returns for it. It returns
 * -EFAULT when copy_from_user() reports a failure.
 */
#define kstrto_from_user(f, g, type)                                        \
int f(const char __user *s, size_t count, unsigned int base, type *res)        \
{                                                                        \
        /* sign, base 2 representation, newline, terminator */                \
        char buf[1 + sizeof(type) * 8 + 1 + 1];                                \
                                                                        \
        count = min(count, sizeof(buf) - 1);                                \
        if (copy_from_user(buf, s, count))                                \
                return -EFAULT;                                                \
        buf[count] = '\0';                                                \
        return g(buf, base, res);                                        \
}                                                                        \
EXPORT_SYMBOL(f)

/* One exported *_from_user() wrapper per integer parser, sized by its type. */
kstrto_from_user(kstrtoull_from_user,        kstrtoull,        unsigned long long);
kstrto_from_user(kstrtoll_from_user,        kstrtoll,        long long);
kstrto_from_user(kstrtoul_from_user,        kstrtoul,        unsigned long);
kstrto_from_user(kstrtol_from_user,        kstrtol,        long);
kstrto_from_user(kstrtouint_from_user,        kstrtouint,        unsigned int);
kstrto_from_user(kstrtoint_from_user,        kstrtoint,        int);
kstrto_from_user(kstrtou16_from_user,        kstrtou16,        u16);
kstrto_from_user(kstrtos16_from_user,        kstrtos16,        s16);
kstrto_from_user(kstrtou8_from_user,        kstrtou8,        u8);
kstrto_from_user(kstrtos8_from_user,        kstrtos8,        s8);


























































    2 











































































































































    3 
    1 










































    3 

    1 
































































    1 





    1 





    3 







    3 
    3 







    1 
    3 
    3 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  (C) 2002  David Woodhouse <dwmw2@infradead.org>
  (C) 2012  Michel Lespinasse <walken@google.com>


  linux/include/linux/rbtree_augmented.h
*/

#ifndef _LINUX_RBTREE_AUGMENTED_H
#define _LINUX_RBTREE_AUGMENTED_H

#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/rcupdate.h>

/*
 * Please note - only struct rb_augment_callbacks and the prototypes for
 * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
 * The rest are implementation details you are not expected to depend on.
 *
 * See Documentation/core-api/rbtree.rst for documentation and samples.
 */

/*
 * Set of user callbacks that keep the augmented data up to date.
 *
 * propagate: invoked by the insert/erase helpers in this header as
 *            propagate(node, stop) after the tree shape changed under node.
 * copy:      invoked on erase as copy(old, new) when a successor takes the
 *            place of the erased node.
 * rotate:    handed to __rb_insert_augmented() and __rb_erase_color() as
 *            their augment_rotate argument.
 */
struct rb_augment_callbacks {
        void (*propagate)(struct rb_node *node, struct rb_node *stop);
        void (*copy)(struct rb_node *old, struct rb_node *new);
        void (*rotate)(struct rb_node *old, struct rb_node *new);
};

/* Out-of-line insertion fixup; use rb_insert_augmented() rather than this. */
extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
        void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * Fix up the rbtree and update the augmented information when rebalancing.
 *
 * On insertion, the user must update the augmented information on the path
 * leading to the inserted node, then call rb_link_node() as usual and
 * rb_insert_augmented() instead of the usual rb_insert_color() call.
 * If rb_insert_augmented() rebalances the rbtree, it will call back into
 * a user-provided function to update the augmented information on the
 * affected subtrees.
 *
 * Only the ->rotate callback of @augment is used here.
 */
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
                    const struct rb_augment_callbacks *augment)
{
        __rb_insert_augmented(node, root, augment->rotate);
}

/*
 * Same as rb_insert_augmented() for a tree that caches its leftmost node.
 * When @newleft is true, @node is recorded as the new leftmost node before
 * the tree is fixed up.
 */
static inline void
rb_insert_augmented_cached(struct rb_node *node,
                           struct rb_root_cached *root, bool newleft,
                           const struct rb_augment_callbacks *augment)
{
        struct rb_root *plain_root = &root->rb_root;

        if (newleft)
                root->rb_leftmost = node;

        rb_insert_augmented(node, plain_root, augment);
}

/*
 * Find the insertion slot for @node using @less, link it in, propagate the
 * augmented data from its parent and rebalance the cached tree.
 *
 * Returns @node when it became the leftmost node of @tree, NULL otherwise.
 */
static __always_inline struct rb_node *
rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
                        bool (*less)(struct rb_node *, const struct rb_node *),
                        const struct rb_augment_callbacks *augment)
{
        struct rb_node *parent = NULL;
        struct rb_node **slot;
        bool is_leftmost = true;

        for (slot = &tree->rb_root.rb_node; *slot; ) {
                parent = *slot;
                if (!less(node, parent)) {
                        /* Going right even once rules out being leftmost. */
                        is_leftmost = false;
                        slot = &parent->rb_right;
                        continue;
                }
                slot = &parent->rb_left;
        }

        rb_link_node(node, parent, slot);
        augment->propagate(parent, NULL); /* suboptimal */
        rb_insert_augmented_cached(node, tree, is_leftmost, augment);

        if (!is_leftmost)
                return NULL;
        return node;
}

/*
 * Template for declaring augmented rbtree callbacks (generic case)
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
 *
 * RBCOMPUTE is invoked as RBCOMPUTE(node, exit) with an RBSTRUCT pointer and
 * a flag. The generated _propagate() passes true and stops walking towards
 * the root as soon as RBCOMPUTE returns non-zero; the generated _rotate()
 * passes false and ignores the return value.
 *
 * The macro emits RBNAME_propagate(), RBNAME_copy() and RBNAME_rotate() and
 * an rb_augment_callbacks instance named RBNAME that points at them.
 */

#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,                                \
                             RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE)        \
static inline void                                                        \
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop)                \
{                                                                        \
        while (rb != stop) {                                                \
                RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);        \
                if (RBCOMPUTE(node, true))                                \
                        break;                                                \
                rb = rb_parent(&node->RBFIELD);                                \
        }                                                                \
}                                                                        \
static inline void                                                        \
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)                \
{                                                                        \
        RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);                \
        RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);                \
        new->RBAUGMENTED = old->RBAUGMENTED;                                \
}                                                                        \
static void                                                                \
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)        \
{                                                                        \
        RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);                \
        RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);                \
        new->RBAUGMENTED = old->RBAUGMENTED;                                \
        RBCOMPUTE(old, false);                                                \
}                                                                        \
RBSTATIC const struct rb_augment_callbacks RBNAME = {                        \
        .propagate = RBNAME ## _propagate,                                \
        .copy = RBNAME ## _copy,                                        \
        .rotate = RBNAME ## _rotate                                        \
};

/*
 * Template for declaring augmented rbtree callbacks,
 * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBTYPE:      type of the RBAUGMENTED field
 * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that returns the per-node RBTYPE scalar
 *
 * The generated RBNAME_compute_max(node, exit) takes the largest of
 * RBCOMPUTE(node) and the RBAUGMENTED values of the children that exist.
 * If @exit is true and the node already stores that value it returns true
 * without writing; otherwise it stores the value and returns false. It is
 * then used as the RBCOMPUTE argument of RB_DECLARE_CALLBACKS().
 */

#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD,              \
                                 RBTYPE, RBAUGMENTED, RBCOMPUTE)              \
static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit)              \
{                                                                              \
        RBSTRUCT *child;                                                      \
        RBTYPE max = RBCOMPUTE(node);                                              \
        if (node->RBFIELD.rb_left) {                                              \
                child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD);   \
                if (child->RBAUGMENTED > max)                                      \
                        max = child->RBAUGMENTED;                              \
        }                                                                      \
        if (node->RBFIELD.rb_right) {                                              \
                child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD);  \
                if (child->RBAUGMENTED > max)                                      \
                        max = child->RBAUGMENTED;                              \
        }                                                                      \
        if (exit && node->RBAUGMENTED == max)                                      \
                return true;                                                      \
        node->RBAUGMENTED = max;                                              \
        return false;                                                              \
}                                                                              \
RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,                                              \
                     RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)


/* Node colours as stored in bit 0 of __rb_parent_color. */
#define        RB_RED                0
#define        RB_BLACK        1

/*
 * Helpers for a packed parent/colour word @pc: the parent pointer is the
 * word with its two low bits cleared, the colour is bit 0.
 *
 * Every macro argument is parenthesized so that an expression argument
 * (for example a conditional expression) is masked as a whole.
 */
#define __rb_parent(pc)    ((struct rb_node *)((pc) & ~3))

#define __rb_color(pc)     ((pc) & 1)
#define __rb_is_black(pc)  __rb_color(pc)
#define __rb_is_red(pc)    (!__rb_color(pc))

/* Same queries applied to a node pointer @rb. */
#define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)

/* Point @rb at parent @p while keeping the colour @rb currently has. */
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
        unsigned long color = rb_color(rb);

        rb->__rb_parent_color = (unsigned long)p + color;
}

/* Store parent @p and @color together in @rb's packed parent/colour word. */
static inline void rb_set_parent_color(struct rb_node *rb,
                                       struct rb_node *p, int color)
{
        unsigned long packed = (unsigned long)p;

        packed += color;
        rb->__rb_parent_color = packed;
}

/*
 * Make @new occupy the slot that currently holds @old: the matching child
 * pointer of @parent, or the root pointer of @root when @parent is NULL.
 * The store is done with WRITE_ONCE().
 */
static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
                  struct rb_node *parent, struct rb_root *root)
{
        if (!parent) {
                WRITE_ONCE(root->rb_node, new);
                return;
        }

        if (parent->rb_left == old)
                WRITE_ONCE(parent->rb_left, new);
        else
                WRITE_ONCE(parent->rb_right, new);
}

/*
 * Same as __rb_change_child(), except that the new pointer is published
 * with rcu_assign_pointer() instead of WRITE_ONCE().
 */
static inline void
__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
                      struct rb_node *parent, struct rb_root *root)
{
        if (!parent) {
                rcu_assign_pointer(root->rb_node, new);
                return;
        }

        if (parent->rb_left == old)
                rcu_assign_pointer(parent->rb_left, new);
        else
                rcu_assign_pointer(parent->rb_right, new);
}

/* Out-of-line rebalancing step; called by rb_erase_augmented() when needed. */
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
        void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * Unlink @node from the tree rooted at @root and keep the augmented data
 * consistent through the @augment callbacks.
 *
 * Three shapes are handled: @node has at most one child (case 1), @node's
 * successor is its right child (case 2), or the successor is the leftmost
 * node below the right child (case 3). In cases 2 and 3 the successor takes
 * over @node's position and its parent/colour word.
 *
 * Returns the node to hand to __rb_erase_color() when rebalancing is still
 * required, or NULL when the colours could be fixed up locally.
 */
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
                     const struct rb_augment_callbacks *augment)
{
        struct rb_node *child = node->rb_right;
        struct rb_node *tmp = node->rb_left;
        struct rb_node *parent, *rebalance;
        unsigned long pc;

        if (!tmp) {
                /*
                 * Case 1: node to erase has no more than 1 child (easy!)
                 *
                 * Note that if there is one child it must be red due to 5)
                 * and node must be black due to 4). We adjust colors locally
                 * so as to bypass __rb_erase_color() later on.
                 */
                pc = node->__rb_parent_color;
                parent = __rb_parent(pc);
                __rb_change_child(node, child, parent, root);
                if (child) {
                        child->__rb_parent_color = pc;
                        rebalance = NULL;
                } else
                        rebalance = __rb_is_black(pc) ? parent : NULL;
                /* propagation below starts at the erased node's parent */
                tmp = parent;
        } else if (!child) {
                /* Still case 1, but this time the child is node->rb_left */
                tmp->__rb_parent_color = pc = node->__rb_parent_color;
                parent = __rb_parent(pc);
                __rb_change_child(node, tmp, parent, root);
                rebalance = NULL;
                tmp = parent;
        } else {
                struct rb_node *successor = child, *child2;

                tmp = child->rb_left;
                if (!tmp) {
                        /*
                         * Case 2: node's successor is its right child
                         *
                         *    (n)          (s)
                         *    / \          / \
                         *  (x) (s)  ->  (x) (c)
                         *        \
                         *        (c)
                         */
                        parent = successor;
                        child2 = successor->rb_right;

                        augment->copy(node, successor);
                } else {
                        /*
                         * Case 3: node's successor is leftmost under
                         * node's right child subtree
                         *
                         *    (n)          (s)
                         *    / \          / \
                         *  (x) (y)  ->  (x) (y)
                         *      /            /
                         *    (p)          (p)
                         *    /            /
                         *  (s)          (c)
                         *    \
                         *    (c)
                         */
                        do {
                                parent = successor;
                                successor = tmp;
                                tmp = tmp->rb_left;
                        } while (tmp);
                        child2 = successor->rb_right;
                        WRITE_ONCE(parent->rb_left, child2);
                        WRITE_ONCE(successor->rb_right, child);
                        rb_set_parent(child, successor);

                        augment->copy(node, successor);
                        augment->propagate(parent, successor);
                }

                /* hand node's left subtree over to the successor */
                tmp = node->rb_left;
                WRITE_ONCE(successor->rb_left, tmp);
                rb_set_parent(tmp, successor);

                /* hook the successor into node's former slot */
                pc = node->__rb_parent_color;
                tmp = __rb_parent(pc);
                __rb_change_child(node, successor, tmp, root);

                if (child2) {
                        rb_set_parent_color(child2, parent, RB_BLACK);
                        rebalance = NULL;
                } else {
                        rebalance = rb_is_black(successor) ? parent : NULL;
                }
                successor->__rb_parent_color = pc;
                /* propagation below starts at the successor */
                tmp = successor;
        }

        augment->propagate(tmp, NULL);
        return rebalance;
}

/*
 * Erase @node from @root, updating augmented data via @augment, and run the
 * colour fixup only when the unlink step reports that one is needed.
 */
static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
                   const struct rb_augment_callbacks *augment)
{
        struct rb_node *fixup_from;

        fixup_from = __rb_erase_augmented(node, root, augment);
        if (!fixup_from)
                return;

        __rb_erase_color(fixup_from, root, augment->rotate);
}

/*
 * rb_erase_augmented() for a tree that caches its leftmost node: when the
 * erased node is the cached one, the cache is moved to rb_next(@node)
 * before the node is unlinked.
 */
static __always_inline void
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
                          const struct rb_augment_callbacks *augment)
{
        struct rb_root *plain_root = &root->rb_root;

        if (node == root->rb_leftmost)
                root->rb_leftmost = rb_next(node);

        rb_erase_augmented(node, plain_root, augment);
}

#endif        /* _LINUX_RBTREE_AUGMENTED_H */




























































































    1 

    1 

    1 













    1 













































































































































































































































    1 

























































































    1 






































    1 







    1 






    1 


    1 









































































































































































































































































































































































































































    1 





















    1 




    1 






























































































    3 



    3 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/fcntl.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/syscalls.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/sched/task.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/pipe_fs_i.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/memfd.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/rw_hint.h>

#include <linux/poll.h>
#include <asm/siginfo.h>
#include <linux/uaccess.h>

/* The only f_flags bits setfl() replaces; all others are kept as they are. */
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)

/*
 * Apply the SETFL_MASK bits of @arg to @filp->f_flags after validating them.
 * Returns 0 on success or a negative error code from one of the checks,
 * ->check_flags() or ->fasync().
 */
static int setfl(int fd, struct file * filp, unsigned int arg)
{
        struct inode *inode = file_inode(filp);
        int error;

        /* O_APPEND may not be toggled on an append-only inode. */
        if (IS_APPEND(inode) && ((arg ^ filp->f_flags) & O_APPEND))
                return -EPERM;

        /* O_NOATIME can only be set by the owner or superuser */
        if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME) &&
            !inode_owner_or_capable(file_mnt_idmap(filp), inode))
                return -EPERM;

        /* required for strict SunOS emulation */
        if (O_NONBLOCK != O_NDELAY && (arg & O_NDELAY))
                arg |= O_NONBLOCK;

        /* Pipe packetized mode is controlled by O_DIRECT flag */
        if ((arg & O_DIRECT) && !S_ISFIFO(inode->i_mode) &&
            !(filp->f_mode & FMODE_CAN_ODIRECT))
                return -EINVAL;

        if (filp->f_op->check_flags) {
                error = filp->f_op->check_flags(arg);
                if (error)
                        return error;
        }

        /*
         * ->fasync() is responsible for setting the FASYNC bit.
         * Only a negative return is an error; positive values count as success.
         */
        if (filp->f_op->fasync && ((arg ^ filp->f_flags) & FASYNC)) {
                error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
                if (error < 0)
                        return error;
        }

        spin_lock(&filp->f_lock);
        filp->f_flags = (filp->f_flags & ~SETFL_MASK) | (arg & SETFL_MASK);
        filp->f_iocb_flags = iocb_flags(filp);
        spin_unlock(&filp->f_lock);

        return 0;
}

/*
 * Replace the owner of @filp with @pid/@type under f_owner.lock.
 * Nothing is changed when an owner pid is already set and @force is zero.
 * The current credentials' uid/euid are recorded only for a non-NULL @pid.
 */
static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
                     int force)
{
        write_lock_irq(&filp->f_owner.lock);

        if (!force && filp->f_owner.pid)
                goto unlock;

        put_pid(filp->f_owner.pid);
        filp->f_owner.pid = get_pid(pid);
        filp->f_owner.pid_type = type;

        if (pid) {
                const struct cred *cred = current_cred();

                filp->f_owner.uid = cred->uid;
                filp->f_owner.euid = cred->euid;
        }

unlock:
        write_unlock_irq(&filp->f_owner.lock);
}

/*
 * Set the owner of @filp to @pid of kind @type.
 *
 * Calls the security_file_set_fowner() hook and then f_modown(), which
 * leaves an already set owner alone unless @force is non-zero.
 */
void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
                int force)
{
        security_file_set_fowner(filp);
        f_modown(filp, pid, type, force);
}
EXPORT_SYMBOL(__f_setown);

int f_setown(struct file *filp, int who, int force)
{
        enum pid_type type;
        struct pid *pid = NULL;
        int ret = 0;

        type = PIDTYPE_TGID;
        if (who < 0) {
                /* avoid overflow below */
                if (who == INT_MIN)
                        return -EINVAL;

                type = PIDTYPE_PGID;
                who = -who;
        }

        rcu_read_lock();
        if (who) {
                pid = find_vpid(who);
                if (!pid)
                        ret = -ESRCH;
        }

        if (!ret)
                __f_setown(filp, pid, type, force);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL(f_setown);

/*
 * Drop the owner of @filp: forces the owner pid to NULL with type
 * PIDTYPE_TGID via f_modown().
 */
void f_delown(struct file *filp)
{
        f_modown(filp, NULL, PIDTYPE_TGID, 1);
}

/*
 * Report the owner of @filp as a number: pid_vnr() of the owner pid, negated
 * when the owner is of type PIDTYPE_PGID. Returns 0 when pid_task() finds no
 * task for the recorded owner.
 */
pid_t f_getown(struct file *filp)
{
        pid_t nr = 0;

        read_lock_irq(&filp->f_owner.lock);
        rcu_read_lock();

        if (!pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
                goto out;

        nr = pid_vnr(filp->f_owner.pid);
        if (filp->f_owner.pid_type == PIDTYPE_PGID)
                nr = -nr;
out:
        rcu_read_unlock();
        read_unlock_irq(&filp->f_owner.lock);
        return nr;
}

/*
 * Set the owner of @filp from the struct f_owner_ex that user pointer @arg
 * refers to. The owner is always replaced (force is 1).
 *
 * Returns 0 on success, -EFAULT when the structure cannot be copied in,
 * -EINVAL for an unknown owner type and -ESRCH when a non-zero pid is not
 * found by find_vpid().
 */
static int f_setown_ex(struct file *filp, unsigned long arg)
{
        struct f_owner_ex __user *owner_p = (void __user *)arg;
        struct f_owner_ex owner;
        struct pid *pid;
        int type;
        int ret = 0;

        if (copy_from_user(&owner, owner_p, sizeof(owner)))
                return -EFAULT;

        /* Translate the user-visible owner type into a pid type. */
        if (owner.type == F_OWNER_TID)
                type = PIDTYPE_PID;
        else if (owner.type == F_OWNER_PID)
                type = PIDTYPE_TGID;
        else if (owner.type == F_OWNER_PGRP)
                type = PIDTYPE_PGID;
        else
                return -EINVAL;

        rcu_read_lock();
        pid = find_vpid(owner.pid);
        if (!pid && owner.pid)
                ret = -ESRCH;
        else
                __f_setown(filp, pid, type, 1);
        rcu_read_unlock();

        return ret;
}

static int f_getown_ex(struct file *filp, unsigned long arg)
{
        struct f_owner_ex __user *owner_p = (void __user *)arg;
        struct f_owner_ex owner = {};
        int ret = 0;

        read_lock_irq(&filp->f_owner.lock);
        rcu_read_lock();
        if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
                owner.pid = pid_vnr(filp->f_owner.pid);
        rcu_read_unlock();
        switch (filp->f_owner.pid_type) {
        case PIDTYPE_PID:
                owner.type = F_OWNER_TID;
                break;

        case PIDTYPE_TGID:
                owner.type = F_OWNER_PID;
                break;

        case PIDTYPE_PGID:
                owner.type = F_OWNER_PGRP;
                break;

        default:
                WARN_ON(1);
                ret = -EINVAL;
                break;
        }
        read_unlock_irq(&filp->f_owner.lock);

        if (!ret) {
                ret = copy_to_user(owner_p, &owner, sizeof(owner));
                if (ret)
                        ret = -EFAULT;
        }
        return ret;
}

#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * F_GETOWNER_UIDS helper: store the owner's uid and euid, converted with
 * from_kuid() for the caller's user namespace, into the two-element uid_t
 * array at user address @arg.
 *
 * Both values are sampled under f_owner.lock; the user-space stores happen
 * after it is dropped.  Returns the OR of the two put_user() results.
 */
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
        struct user_namespace *ns = current_user_ns();
        uid_t __user *uids = (void __user *)arg;
        uid_t uid, euid;
        int err;

        read_lock_irq(&filp->f_owner.lock);
        uid = from_kuid(ns, filp->f_owner.uid);
        euid = from_kuid(ns, filp->f_owner.euid);
        read_unlock_irq(&filp->f_owner.lock);

        /* Attempt both stores even if the first one fails. */
        err = put_user(uid, &uids[0]);
        err |= put_user(euid, &uids[1]);

        return err;
}
#else
/* Without CONFIG_CHECKPOINT_RESTORE the command is rejected with -EINVAL. */
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
        return -EINVAL;
}
#endif

/*
 * Return true when @hint is one of the RWH_WRITE_LIFE_* values.
 *
 * The build-time checks pin each WRITE_LIFE_* constant to its
 * RWH_WRITE_LIFE_* counterpart.
 */
static bool rw_hint_valid(u64 hint)
{
        BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET);
        BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE);
        BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT);
        BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM);
        BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG);
        BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME);

        return hint == RWH_WRITE_LIFE_NOT_SET ||
               hint == RWH_WRITE_LIFE_NONE ||
               hint == RWH_WRITE_LIFE_SHORT ||
               hint == RWH_WRITE_LIFE_MEDIUM ||
               hint == RWH_WRITE_LIFE_LONG ||
               hint == RWH_WRITE_LIFE_EXTREME;
}

static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        struct inode *inode = file_inode(file);
        u64 __user *argp = (u64 __user *)arg;
        u64 hint = READ_ONCE(inode->i_write_hint);

        if (copy_to_user(argp, &hint, sizeof(*argp)))
                return -EFAULT;
        return 0;
}

static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        struct inode *inode = file_inode(file);
        u64 __user *argp = (u64 __user *)arg;
        u64 hint;

        if (copy_from_user(&hint, argp, sizeof(hint)))
                return -EFAULT;
        if (!rw_hint_valid(hint))
                return -EINVAL;

        WRITE_ONCE(inode->i_write_hint, hint);

        /*
         * file->f_mapping->host may differ from inode. As an example,
         * blkdev_open() modifies file->f_mapping.
         */
        if (file->f_mapping->host != inode)
                WRITE_ONCE(file->f_mapping->host->i_write_hint, hint);

        return 0;
}

/*
 * Is the file descriptor a dup of the file?
 *
 * Looks up @fd through the fd_raw class and returns 1 when it refers to
 * the same struct file as @filp, 0 otherwise (including when the lookup
 * yields no file, provided @filp is non-NULL).
 */
static long f_dupfd_query(int fd, struct file *filp)
{
        /*
         * NOTE(review): the fdput() discussed below is presumably issued by
         * the CLASS() scope cleanup rather than written out here - confirm.
         */
        CLASS(fd_raw, f)(fd);

        /*
         * We can do the 'fdput()' immediately, as the only thing that
         * matters is the pointer value which isn't changed by the fdput.
         *
         * Technically we didn't need a ref at all, and 'fdget()' was
         * overkill, but given our lockless file pointer lookup, the
         * alternatives are complicated.
         */
        return f.file == filp;
}

/*
 * Dispatch one fcntl command for an already looked-up file.
 *
 * @fd:   descriptor number the caller passed in
 * @cmd:  fcntl command
 * @arg:  raw third syscall argument; depending on @cmd it is used as an
 *        int (argi), as a user pointer (argp) or passed on unchanged
 * @filp: file that @fd refers to
 *
 * Returns the command-specific result, or a negative errno.  Commands that
 * match no case return -EINVAL, the initial value of err.
 */
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
                struct file *filp)
{
        void __user *argp = (void __user *)arg;
        int argi = (int)arg;
        struct flock flock;
        long err = -EINVAL;

        switch (cmd) {
        case F_DUPFD:
                err = f_dupfd(argi, filp, 0);
                break;
        case F_DUPFD_CLOEXEC:
                err = f_dupfd(argi, filp, O_CLOEXEC);
                break;
        case F_DUPFD_QUERY:
                err = f_dupfd_query(argi, filp);
                break;
        case F_GETFD:
                err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
                break;
        case F_SETFD:
                err = 0;
                set_close_on_exec(fd, argi & FD_CLOEXEC);
                break;
        case F_GETFL:
                err = filp->f_flags;
                break;
        case F_SETFL:
                err = setfl(fd, filp, argi);
                break;
#if BITS_PER_LONG != 32
        /* 32-bit arches must use fcntl64() */
        case F_OFD_GETLK:
#endif
        case F_GETLK:
                /* The flock is read from and, on success, written back to user memory. */
                if (copy_from_user(&flock, argp, sizeof(flock)))
                        return -EFAULT;
                err = fcntl_getlk(filp, cmd, &flock);
                if (!err && copy_to_user(argp, &flock, sizeof(flock)))
                        return -EFAULT;
                break;
#if BITS_PER_LONG != 32
        /* 32-bit arches must use fcntl64() */
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                fallthrough;
#endif
        case F_SETLK:
        case F_SETLKW:
                if (copy_from_user(&flock, argp, sizeof(flock)))
                        return -EFAULT;
                err = fcntl_setlk(fd, filp, cmd, &flock);
                break;
        case F_GETOWN:
                /*
                 * XXX If f_owner is a process group, the
                 * negative return value will get converted
                 * into an error.  Oops.  If we keep the
                 * current syscall conventions, the only way
                 * to fix this will be in libc.
                 */
                err = f_getown(filp);
                force_successful_syscall_return();
                break;
        case F_SETOWN:
                err = f_setown(filp, argi, 1);
                break;
        case F_GETOWN_EX:
                err = f_getown_ex(filp, arg);
                break;
        case F_SETOWN_EX:
                err = f_setown_ex(filp, arg);
                break;
        case F_GETOWNER_UIDS:
                err = f_getowner_uids(filp, arg);
                break;
        case F_GETSIG:
                err = filp->f_owner.signum;
                break;
        case F_SETSIG:
                /* arg == 0 restores default behaviour. */
                /* An invalid signal number leaves err at -EINVAL. */
                if (!valid_signal(argi)) {
                        break;
                }
                err = 0;
                filp->f_owner.signum = argi;
                break;
        case F_GETLEASE:
                err = fcntl_getlease(filp);
                break;
        case F_SETLEASE:
                err = fcntl_setlease(fd, filp, argi);
                break;
        case F_NOTIFY:
                err = fcntl_dirnotify(fd, filp, argi);
                break;
        case F_SETPIPE_SZ:
        case F_GETPIPE_SZ:
                err = pipe_fcntl(filp, cmd, argi);
                break;
        case F_ADD_SEALS:
        case F_GET_SEALS:
                err = memfd_fcntl(filp, cmd, argi);
                break;
        case F_GET_RW_HINT:
                err = fcntl_get_rw_hint(filp, cmd, arg);
                break;
        case F_SET_RW_HINT:
                err = fcntl_set_rw_hint(filp, cmd, arg);
                break;
        default:
                break;
        }
        return err;
}

/*
 * Return 1 when @cmd is one of the commands the fcntl entry points still
 * allow on a file opened with FMODE_PATH, 0 otherwise.
 */
static int check_fcntl_cmd(unsigned cmd)
{
        switch (cmd) {
        case F_DUPFD:
        case F_DUPFD_CLOEXEC:
        case F_DUPFD_QUERY:
        case F_GETFD:
        case F_SETFD:
        case F_GETFL:
                return 1;
        default:
                return 0;
        }
}

/*
 * fcntl() system call entry point.
 *
 * Looks up @fd with fdget_raw(), rejects commands not allowed on
 * FMODE_PATH files, asks the security hook, and then hands the command to
 * do_fcntl().  Returns -EBADF when @fd has no file or the command is not
 * permitted on a path-only file; otherwise the hook's error or the
 * do_fcntl() result.
 */
SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
        struct fd f = fdget_raw(fd);
        long err;

        if (!f.file)
                return -EBADF;

        if (unlikely(f.file->f_mode & FMODE_PATH) && !check_fcntl_cmd(cmd)) {
                err = -EBADF;
                goto out_put;
        }

        err = security_file_fcntl(f.file, cmd, arg);
        if (err)
                goto out_put;

        err = do_fcntl(fd, cmd, arg, f.file);

out_put:
        fdput(f);
        return err;
}

#if BITS_PER_LONG == 32
/*
 * fcntl64() system call entry point (built only when BITS_PER_LONG == 32).
 *
 * Handles the struct flock64 based locking commands itself and forwards
 * every other command to do_fcntl().  Returns -EBADF when @fd has no file
 * or the command is not permitted on an FMODE_PATH file, -EFAULT when the
 * flock64 cannot be copied in or out, otherwise the security hook's error
 * or the result of the locking helper or do_fcntl().
 */
SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
                unsigned long, arg)
{
        void __user *uptr = (void __user *)arg;
        struct fd f = fdget_raw(fd);
        struct flock64 lk;
        long err;

        if (!f.file)
                return -EBADF;

        if (unlikely(f.file->f_mode & FMODE_PATH) && !check_fcntl_cmd(cmd)) {
                err = -EBADF;
                goto out_put;
        }

        err = security_file_fcntl(f.file, cmd, arg);
        if (err)
                goto out_put;

        switch (cmd) {
        case F_GETLK64:
        case F_OFD_GETLK:
                if (copy_from_user(&lk, uptr, sizeof(lk))) {
                        err = -EFAULT;
                        break;
                }
                err = fcntl_getlk64(f.file, cmd, &lk);
                /* The result is written back only when the query succeeded. */
                if (!err && copy_to_user(uptr, &lk, sizeof(lk)))
                        err = -EFAULT;
                break;
        case F_SETLK64:
        case F_SETLKW64:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                if (copy_from_user(&lk, uptr, sizeof(lk))) {
                        err = -EFAULT;
                        break;
                }
                err = fcntl_setlk64(fd, f.file, cmd, &lk);
                break;
        default:
                err = do_fcntl(fd, cmd, arg, f.file);
                break;
        }

out_put:
        fdput(f);
        return err;
}
#endif

#ifdef CONFIG_COMPAT
/*
 * careful - don't use anywhere else
 *
 * Copy the five flock fields (l_type, l_whence, l_start, l_len, l_pid)
 * from *src to *dst.  Both arguments are pointers to structures that have
 * those members; the structure types may differ.  Each argument is
 * evaluated once per field, so pass only side-effect-free expressions.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and remains correct under an unbraced if/else.
 */
#define copy_flock_fields(dst, src)                \
        do {                                        \
                (dst)->l_type = (src)->l_type;        \
                (dst)->l_whence = (src)->l_whence;        \
                (dst)->l_start = (src)->l_start;        \
                (dst)->l_len = (src)->l_len;        \
                (dst)->l_pid = (src)->l_pid;        \
        } while (0)

/*
 * Read a struct compat_flock from user memory at @ufl and copy its fields
 * into the native struct flock @kfl.
 *
 * Returns 0 on success or -EFAULT when the user copy fails, in which case
 * @kfl is left untouched.
 */
static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl)
{
        struct compat_flock tmp;

        if (copy_from_user(&tmp, ufl, sizeof(tmp)))
                return -EFAULT;

        copy_flock_fields(kfl, &tmp);
        return 0;
}

/*
 * Read a struct compat_flock64 from user memory at @ufl and copy its
 * fields into the native struct flock @kfl.
 *
 * Returns 0 on success or -EFAULT when the user copy fails, in which case
 * @kfl is left untouched.
 */
static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl)
{
        struct compat_flock64 tmp;

        if (copy_from_user(&tmp, ufl, sizeof(tmp)))
                return -EFAULT;

        copy_flock_fields(kfl, &tmp);
        return 0;
}

/*
 * Convert the native struct flock @kfl into a struct compat_flock and
 * copy it to user memory at @ufl.
 *
 * Returns 0 on success or -EFAULT when the user copy fails.
 */
static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl)
{
        struct compat_flock tmp;

        /* Zero the whole structure so no uninitialised bytes are copied out. */
        memset(&tmp, 0, sizeof(tmp));
        copy_flock_fields(&tmp, kfl);

        return copy_to_user(ufl, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}

/*
 * Convert the native struct flock @kfl into a struct compat_flock64 and
 * copy it to user memory at @ufl.
 *
 * The build-time checks reject a configuration where the native l_start
 * or l_len is wider than its compat counterpart.
 *
 * Returns 0 on success or -EFAULT when the user copy fails.
 */
static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl)
{
        struct compat_flock64 tmp;

        BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start));
        BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len));

        /* Zero the whole structure so no uninitialised bytes are copied out. */
        memset(&tmp, 0, sizeof(tmp));
        copy_flock_fields(&tmp, kfl);

        return copy_to_user(ufl, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
#undef copy_flock_fields

/*
 * Map the 64-bit locking commands (F_GETLK64, F_SETLK64, F_SETLKW64) to
 * F_GETLK, F_SETLK and F_SETLKW respectively.  Any other command is
 * returned unchanged.
 */
static unsigned int
convert_fcntl_cmd(unsigned int cmd)
{
        switch (cmd) {
        case F_GETLK64:
                return F_GETLK;
        case F_SETLK64:
                return F_SETLK;
        case F_SETLKW64:
                return F_SETLKW;
        default:
                return cmd;
        }
}

/*
 * GETLK succeeded and the result has to be handed back, but it must fit
 * into the compat structure.
 *
 * l_start should not be too big unless the original start + end exceeded
 * COMPAT_OFF_T_MAX; the application was asking for trouble in that case,
 * so -EOVERFLOW is returned.  l_len may legitimately be too big: it is
 * truncated to COMPAT_OFF_T_MAX so the application sees only the part of
 * the conflicting lock that can make sense to it.
 *
 * Returns 0 when @flock is usable (possibly with a clamped l_len).
 */
static int fixup_compat_flock(struct flock *flock)
{
        if (flock->l_start <= COMPAT_OFF_T_MAX) {
                if (flock->l_len > COMPAT_OFF_T_MAX)
                        flock->l_len = COMPAT_OFF_T_MAX;
                return 0;
        }

        return -EOVERFLOW;
}

/*
 * Common implementation behind the compat fcntl and fcntl64 entry points.
 *
 * Locking commands are handled here by converting between the compat
 * flock layouts and a native struct flock; every other command goes to
 * do_fcntl().  Returns -EBADF when @fd has no file or the command is not
 * permitted on an FMODE_PATH file, otherwise the security hook's error or
 * the result of the selected helper.
 */
static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
                             compat_ulong_t arg)
{
        struct fd f = fdget_raw(fd);
        struct flock flock;
        long err = -EBADF;

        if (!f.file)
                return err;

        if (unlikely(f.file->f_mode & FMODE_PATH)) {
                if (!check_fcntl_cmd(cmd))
                        goto out_put;
        }

        err = security_file_fcntl(f.file, cmd, arg);
        if (err)
                goto out_put;

        switch (cmd) {
        case F_GETLK:
                err = get_compat_flock(&flock, compat_ptr(arg));
                if (err)
                        break;
                err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
                if (err)
                        break;
                /* Only this compat_flock path range-checks the result before copy-out. */
                err = fixup_compat_flock(&flock);
                if (!err)
                        err = put_compat_flock(&flock, compat_ptr(arg));
                break;
        case F_GETLK64:
        case F_OFD_GETLK:
                err = get_compat_flock64(&flock, compat_ptr(arg));
                if (err)
                        break;
                err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
                if (!err)
                        err = put_compat_flock64(&flock, compat_ptr(arg));
                break;
        case F_SETLK:
        case F_SETLKW:
                err = get_compat_flock(&flock, compat_ptr(arg));
                if (err)
                        break;
                err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
                break;
        case F_SETLK64:
        case F_SETLKW64:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                err = get_compat_flock64(&flock, compat_ptr(arg));
                if (err)
                        break;
                err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
                break;
        default:
                err = do_fcntl(fd, cmd, arg, f.file);
                break;
        }
out_put:
        fdput(f);
        return err;
}

/* Compat fcntl64() entry point: forwards unchanged to do_compat_fcntl64(). */
COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
                       compat_ulong_t, arg)
{
        return do_compat_fcntl64(fd, cmd, arg);
}

/*
 * Compat fcntl() entry point.
 *
 * The 64-bit and open-file-description locking commands are refused with
 * -EINVAL here; everything else is forwarded to do_compat_fcntl64().
 */
COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
                       compat_ulong_t, arg)
{
        switch (cmd) {
        case F_GETLK64:
        case F_SETLK64:
        case F_SETLKW64:
        case F_OFD_GETLK:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                return -EINVAL;
        default:
                break;
        }

        return do_compat_fcntl64(fd, cmd, arg);
}
#endif

/*
 * Table to convert sigio signal codes into poll band bitmaps.
 *
 * send_sigio_to_task() indexes this table with (reason - POLL_IN), so each
 * slot holds the band bits for the POLL_* code named in its comment.
 */

static const __poll_t band_table[NSIGPOLL] = {
        EPOLLIN | EPOLLRDNORM,                        /* POLL_IN */
        EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND,        /* POLL_OUT */
        EPOLLIN | EPOLLRDNORM | EPOLLMSG,                /* POLL_MSG */
        EPOLLERR,                                /* POLL_ERR */
        EPOLLPRI | EPOLLRDBAND,                        /* POLL_PRI */
        EPOLLHUP | EPOLLERR                        /* POLL_HUP */
};

/*
 * Decide whether the owner described by @fown may send signal @sig to
 * task @p.
 *
 * Permission requires that the owner's euid is GLOBAL_ROOT_UID, or that
 * its euid or uid equals the target's suid or uid, and additionally that
 * security_file_send_sigiotask() returns 0.  The security hook is only
 * consulted once the uid test has passed.  Returns 1 when allowed, else 0.
 */
static inline int sigio_perm(struct task_struct *p,
                             struct fown_struct *fown, int sig)
{
        const struct cred *tcred;
        int uid_match;
        int allowed;

        rcu_read_lock();
        tcred = __task_cred(p);

        uid_match = uid_eq(fown->euid, GLOBAL_ROOT_UID) ||
                    uid_eq(fown->euid, tcred->suid) ||
                    uid_eq(fown->euid, tcred->uid) ||
                    uid_eq(fown->uid, tcred->suid) ||
                    uid_eq(fown->uid, tcred->uid);
        allowed = uid_match && !security_file_send_sigiotask(p, fown, sig);

        rcu_read_unlock();
        return allowed;
}

/*
 * Deliver the I/O notification signal described by @fown to task @p.
 *
 * @fd and @reason are placed in the siginfo (si_fd, and si_code / si_band
 * respectively); @type is handed through to do_send_sig_info().
 *
 * Nothing is sent when sigio_perm() refuses.  With a signum of 0 a plain
 * SIGIO is sent.  For any other signum a siginfo-carrying signal is tried
 * first, and plain SIGIO is used as the fallback when
 * do_send_sig_info() returns non-zero.
 */
static void send_sigio_to_task(struct task_struct *p,
                               struct fown_struct *fown,
                               int fd, int reason, enum pid_type type)
{
        /*
         * F_SETSIG can change ->signum lockless in parallel, make
         * sure we read it once and use the same value throughout.
         */
        int signum = READ_ONCE(fown->signum);

        if (!sigio_perm(p, fown, signum))
                return;

        switch (signum) {
                default: {
                        kernel_siginfo_t si;

                        /* Queue a rt signal with the appropriate fd as its
                           value.  We use SI_SIGIO as the source, not
                           SI_KERNEL, since kernel signals always get
                           delivered even if we can't queue.  Failure to
                           queue in this case _should_ be reported; we fall
                           back to SIGIO in that case. --sct */
                        clear_siginfo(&si);
                        si.si_signo = signum;
                        si.si_errno = 0;
                        si.si_code  = reason;
                        /*
                         * Posix defines POLL_IN and friends to be signal
                         * specific si_codes for SIG_POLL.  Linux extended
                         * these si_codes to other signals in a way that is
                         * ambiguous if other signals also have signal
                         * specific si_codes.  In that case use SI_SIGIO instead
                         * to remove the ambiguity.
                         */
                        if ((signum != SIGPOLL) && sig_specific_sicodes(signum))
                                si.si_code = SI_SIGIO;

                        /* Make sure we are called with one of the POLL_*
                           reasons, otherwise we could leak kernel stack into
                           userspace.  */
                        BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL));
                        if (reason - POLL_IN >= NSIGPOLL)
                                si.si_band  = ~0L;
                        else
                                si.si_band = mangle_poll(band_table[reason - POLL_IN]);
                        si.si_fd    = fd;
                        /* A zero return means the signal was sent; skip the SIGIO fallback. */
                        if (!do_send_sig_info(signum, &si, p, type))
                                break;
                }
                        fallthrough;        /* fall back on the old plain SIGIO signal */
                case 0:
                        do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
        }
}

/*
 * Send the I/O notification for descriptor @fd and poll band @band to the
 * owner recorded in @fown.
 *
 * Does nothing when no owner pid is set.  For owner types up to and
 * including PIDTYPE_TGID the single task found via pid_task() is
 * signalled under the RCU read lock; for other types every task visited
 * by do_each_pid_task() is signalled under tasklist_lock.  fown->lock is
 * read-held with interrupts saved for the whole operation.
 */
void send_sigio(struct fown_struct *fown, int fd, int band)
{
        struct task_struct *p;
        enum pid_type type;
        unsigned long flags;
        struct pid *pid;
        
        read_lock_irqsave(&fown->lock, flags);

        type = fown->pid_type;
        pid = fown->pid;
        if (!pid)
                goto out_unlock_fown;

        if (type <= PIDTYPE_TGID) {
                rcu_read_lock();
                /* The lookup uses PIDTYPE_PID; @type is only passed on for delivery. */
                p = pid_task(pid, PIDTYPE_PID);
                if (p)
                        send_sigio_to_task(p, fown, fd, band, type);
                rcu_read_unlock();
        } else {
                read_lock(&tasklist_lock);
                do_each_pid_task(pid, type, p) {
                        send_sigio_to_task(p, fown, fd, band, type);
                } while_each_pid_task(pid, type, p);
                read_unlock(&tasklist_lock);
        }
 out_unlock_fown:
        read_unlock_irqrestore(&fown->lock, flags);
}

/*
 * Send SIGURG to task @p on behalf of the owner @fown, provided
 * sigio_perm() allows it.  @type is handed through to do_send_sig_info().
 */
static void send_sigurg_to_task(struct task_struct *p,
                                struct fown_struct *fown, enum pid_type type)
{
        if (!sigio_perm(p, fown, SIGURG))
                return;

        do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type);
}

/*
 * Send SIGURG to the owner recorded in @fown.
 *
 * Returns 0 when no owner pid is set and 1 otherwise; the return value
 * does not tell whether any task actually received the signal.  Target
 * selection and locking mirror send_sigio(): a single task under RCU for
 * owner types up to PIDTYPE_TGID, else every task visited by
 * do_each_pid_task() under tasklist_lock.
 */
int send_sigurg(struct fown_struct *fown)
{
        struct task_struct *p;
        enum pid_type type;
        struct pid *pid;
        unsigned long flags;
        int ret = 0;
        
        read_lock_irqsave(&fown->lock, flags);

        type = fown->pid_type;
        pid = fown->pid;
        if (!pid)
                goto out_unlock_fown;

        /* An owner exists; report that regardless of what delivery does. */
        ret = 1;

        if (type <= PIDTYPE_TGID) {
                rcu_read_lock();
                p = pid_task(pid, PIDTYPE_PID);
                if (p)
                        send_sigurg_to_task(p, fown, type);
                rcu_read_unlock();
        } else {
                read_lock(&tasklist_lock);
                do_each_pid_task(pid, type, p) {
                        send_sigurg_to_task(p, fown, type);
                } while_each_pid_task(pid, type, p);
                read_unlock(&tasklist_lock);
        }
 out_unlock_fown:
        read_unlock_irqrestore(&fown->lock, flags);
        return ret;
}

/* Taken by fasync_remove_entry() and fasync_insert_entry() around their list walks. */
static DEFINE_SPINLOCK(fasync_lock);
/* Slab cache for struct fasync_struct; created in fcntl_init(). */
static struct kmem_cache *fasync_cache __ro_after_init;

/*
 * Remove a fasync entry. If successfully removed, return
 * positive and clear the FASYNC flag. If no entry exists,
 * do nothing and return 0.
 *
 * NOTE! It is very important that the FASYNC flag always
 * match the state "is the filp on a fasync list".
 *
 * @filp is the file whose entry is removed; @fapp points at the head of
 * the list to search.  Only the first matching entry is removed.
 */
int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
{
        struct fasync_struct *fa, **fp;
        int result = 0;

        /* Lock order: filp->f_lock first, then the global fasync_lock. */
        spin_lock(&filp->f_lock);
        spin_lock(&fasync_lock);
        for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
                if (fa->fa_file != filp)
                        continue;

                /* Clear fa_file under fa_lock; kill_fasync_rcu() tests it under the same lock. */
                write_lock_irq(&fa->fa_lock);
                fa->fa_file = NULL;
                write_unlock_irq(&fa->fa_lock);

                /* Unlink now, but free through kfree_rcu() rather than immediately. */
                *fp = fa->fa_next;
                kfree_rcu(fa, fa_rcu);
                filp->f_flags &= ~FASYNC;
                result = 1;
                break;
        }
        spin_unlock(&fasync_lock);
        spin_unlock(&filp->f_lock);
        return result;
}

/*
 * Allocate a struct fasync_struct from fasync_cache with GFP_KERNEL.
 * Returns whatever kmem_cache_alloc() returns; fasync_add_entry() treats
 * NULL as an allocation failure.
 */
struct fasync_struct *fasync_alloc(void)
{
        return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
}

/*
 * Return @new to fasync_cache immediately.
 *
 * NOTE! This can be used only for unused fasync entries:
 * entries that actually got inserted on the fasync list
 * need to be released by rcu - see fasync_remove_entry.
 */
void fasync_free(struct fasync_struct *new)
{
        kmem_cache_free(fasync_cache, new);
}

/*
 * Insert a new entry into the fasync list.  Return the pointer to the
 * old one if we didn't use the new one.
 *
 * NOTE! It is very important that the FASYNC flag always
 * match the state "is the filp on a fasync list".
 *
 * If the list at @fapp already holds an entry for @filp, only its fa_fd is
 * updated to @fd and that entry is returned; @new is left untouched.
 * Otherwise @new is initialised, linked in at the head of the list, the
 * FASYNC flag is set on @filp, and NULL is returned.
 */
struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
{
        struct fasync_struct *fa, **fp;

        /* Lock order: filp->f_lock first, then the global fasync_lock. */
        spin_lock(&filp->f_lock);
        spin_lock(&fasync_lock);
        for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
                if (fa->fa_file != filp)
                        continue;

                write_lock_irq(&fa->fa_lock);
                fa->fa_fd = fd;
                write_unlock_irq(&fa->fa_lock);
                goto out;
        }

        /* No existing entry: fa is NULL here, which becomes the return value. */
        rwlock_init(&new->fa_lock);
        new->magic = FASYNC_MAGIC;
        new->fa_file = filp;
        new->fa_fd = fd;
        new->fa_next = *fapp;
        /* Publish the fully initialised entry as the new list head. */
        rcu_assign_pointer(*fapp, new);
        filp->f_flags |= FASYNC;

out:
        spin_unlock(&fasync_lock);
        spin_unlock(&filp->f_lock);
        return fa;
}

/*
 * Add a fasync entry for @filp to the list at @fapp.
 *
 * Returns -ENOMEM when no entry can be allocated, 1 when a new entry was
 * added, and 0 when an existing entry was merely updated.
 */
static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
{
        struct fasync_struct *fresh = fasync_alloc();

        if (!fresh)
                return -ENOMEM;

        /* A NULL return means the freshly allocated entry is now on the list. */
        if (!fasync_insert_entry(fd, filp, fapp, fresh))
                return 1;

        /*
         * An entry for this file already existed and was updated in
         * place, so the new allocation went unused: release it and tell
         * the caller that nothing was added.
         */
        fasync_free(fresh);
        return 0;
}

/*
 * fasync_helper() is used by almost all character device drivers
 * to set up the fasync queue, and for regular files by the file
 * lease code.
 *
 * A non-zero @on adds (or updates) the entry for @filp on the list at
 * @fapp; a zero @on removes it.  Returns negative on error, 0 if nothing
 * was added or deleted, and positive if an entry was added or deleted.
 */
int fasync_helper(int fd, struct file *filp, int on, struct fasync_struct **fapp)
{
        if (on)
                return fasync_add_entry(fd, filp, fapp);

        return fasync_remove_entry(filp, fapp);
}

EXPORT_SYMBOL(fasync_helper);

/*
 * rcu_read_lock() is held
 *
 * Walk the fasync list starting at @fa and call send_sigio() with @band
 * for every entry that still has a file attached.  The walk stops early,
 * after logging an error, at the first entry whose magic is not
 * FASYNC_MAGIC.  @sig is only used to suppress notification for SIGURG
 * when the owner has signum 0.
 */
static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
        while (fa) {
                struct fown_struct *fown;
                unsigned long flags;

                if (fa->magic != FASYNC_MAGIC) {
                        printk(KERN_ERR "kill_fasync: bad magic number in "
                               "fasync_struct!\n");
                        return;
                }
                read_lock_irqsave(&fa->fa_lock, flags);
                /* fa_file is set to NULL by fasync_remove_entry() under fa_lock. */
                if (fa->fa_file) {
                        fown = &fa->fa_file->f_owner;
                        /* Don't send SIGURG to processes which have not set a
                           queued signum: SIGURG has its own default signalling
                           mechanism. */
                        if (!(sig == SIGURG && fown->signum == 0))
                                send_sigio(fown, fa->fa_fd, band);
                }
                read_unlock_irqrestore(&fa->fa_lock, flags);
                fa = rcu_dereference(fa->fa_next);
        }
}

/*
 * Notify every entry on the fasync list whose head pointer is at @fp.
 *
 * @sig and @band are passed through to kill_fasync_rcu(), which runs under
 * the RCU read lock taken here.
 */
void kill_fasync(struct fasync_struct **fp, int sig, int band)
{
        /* Quick unlocked test first: the list is usually empty. */
        if (!*fp)
                return;

        rcu_read_lock();
        kill_fasync_rcu(rcu_dereference(*fp), sig, band);
        rcu_read_unlock();
}
EXPORT_SYMBOL(kill_fasync);

/*
 * Init hook: check at build time that the open-flag bits are all distinct
 * and create the slab cache used for fasync entries.  Always returns 0;
 * the cache is created with SLAB_PANIC.
 */
static int __init fcntl_init(void)
{
        /*
         * Please add new bits here to ensure allocation uniqueness.
         * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
         * is defined as O_NONBLOCK on some platforms and not on others.
         */
        /* The expected bit count is hard-coded; adjust it when a flag is added. */
        BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
                HWEIGHT32(
                        (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
                        __FMODE_EXEC | __FMODE_NONOTIFY));

        fasync_cache = kmem_cache_create("fasync_cache",
                                         sizeof(struct fasync_struct), 0,
                                         SLAB_PANIC | SLAB_ACCOUNT, NULL);
        return 0;
}

module_init(fcntl_init)


























































































    1 











































































































































































































    2 













    2 
















































































































































































































































    2 

















































































































































































    1 




    1 









    1 











    1 

    1 















    1 




























    2 






































    3 






















































    2 















































































   14 



















    6 





    6 







    6 


    6 









    6 


































    6 
















    2 











    2 


















    1 




    1 




























    2 



    2 




































    3 











    3 


















   31 









    3 


   30 





   30 

   33 
















   10 








   10 














    6 








































































































































































































































































































































































































   13 


















    1 















    1 















    1 







    2 




















    1 







    1 





































    3 












    2 






    1 




















































































    1 



















































































































    8 

















































































































































































































    8 










































































































































    2 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   15 




















    8 


































    2 












    3 












    4 



























































    3 




    1 




















    2 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Simplified MAC Kernel (smack) security module
 *
 *  This file contains the smack hook function implementations.
 *
 *  Authors:
 *        Casey Schaufler <casey@schaufler-ca.com>
 *        Jarkko Sakkinen <jarkko.sakkinen@intel.com>
 *
 *  Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com>
 *  Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
 *                Paul Moore <paul@paul-moore.com>
 *  Copyright (C) 2010 Nokia Corporation
 *  Copyright (C) 2011 Intel Corporation.
 */

#include <linux/xattr.h>
#include <linux/pagemap.h>
#include <linux/mount.h>
#include <linux/stat.h>
#include <linux/kd.h>
#include <asm/ioctls.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/dccp.h>
#include <linux/icmpv6.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <net/cipso_ipv4.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <linux/audit.h>
#include <linux/magic.h>
#include <linux/dcache.h>
#include <linux/personality.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <uapi/linux/shm.h>
#include <linux/binfmts.h>
#include <linux/parser.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/watch_queue.h>
#include <linux/io_uring/cmd.h>
#include <uapi/linux/lsm.h>
#include "smack.h"

/* The literal string "TRUE" and its length without the terminating NUL. */
#define TRANS_TRUE        "TRUE"
#define TRANS_TRUE_SIZE        4

/*
 * Small integer selectors. NOTE(review): the names suggest they identify
 * the direction of a socket operation; their users are outside this part
 * of the file -- confirm against the callers.
 */
#define SMK_CONNECTING        0
#define SMK_RECEIVING        1
#define SMK_SENDING        2

/*
 * Smack uses multiple xattrs.
 * SMACK64 - for access control,
 * SMACK64TRANSMUTE - label initialization,
 * Not saved on files - SMACK64IPIN and SMACK64IPOUT,
 * Must be set explicitly - SMACK64EXEC and SMACK64MMAP
 */
/*
 * NOTE(review): presumably counts the xattrs written at inode
 * initialization (SMACK64 and SMACK64TRANSMUTE per the list above) --
 * confirm against the code that uses it.
 */
#define SMACK_INODE_INIT_XATTRS 2

/* Lock and list head that exist only when SMACK_IPV6_PORT_LABELING is defined. */
#ifdef SMACK_IPV6_PORT_LABELING
static DEFINE_MUTEX(smack_ipv6_lock);
static LIST_HEAD(smk_ipv6_port_list);
#endif
/* Globally visible slab cache pointer; it is not assigned in this part of the file. */
struct kmem_cache *smack_rule_cache;
/* Globally visible flag placed in init data (__initdata). */
int smack_enabled __initdata;

/*
 * Mount option names recognised by match_opt_prefix().  Every entry holds
 * the option text, its length without the terminating NUL and the Opt_*
 * code reported for it.  "smackfsdef" is spelled out by hand as a second
 * name for Opt_fsdefault; the remaining names are generated by A() as
 * "smack" followed by the Opt_ suffix.
 */
#define A(s) { .name = "smack"#s, .len = sizeof("smack"#s) - 1, .opt = Opt_##s }
static struct {
        const char *name;
        int len;
        int opt;
} smk_mount_opts[] = {
        { .name = "smackfsdef", .len = sizeof("smackfsdef") - 1, .opt = Opt_fsdefault },
        A(fsdefault),
        A(fsfloor),
        A(fshat),
        A(fsroot),
        A(fstransmute),
};
#undef A

/*
 * match_opt_prefix - recognise a "<name>=<value>" Smack mount option
 * @s: start of the option text
 * @l: number of bytes of option text at @s
 * @arg: set to the first byte after the '=' when a name matches
 *
 * Walks smk_mount_opts in order and accepts the first entry whose name is
 * a prefix of @s, is strictly shorter than @l and is immediately followed
 * by '='.  An option consisting of the bare name with no '=' never matches.
 *
 * Returns the Opt_* code of the matching entry, or Opt_error (leaving
 * @arg untouched) when nothing matches.
 */
static int match_opt_prefix(char *s, int l, char **arg)
{
        int idx;

        for (idx = 0; idx < ARRAY_SIZE(smk_mount_opts); idx++) {
                const char *name = smk_mount_opts[idx].name;
                size_t nlen = smk_mount_opts[idx].len;

                /* The name must leave room for '=' and be followed by it. */
                if (nlen < l && !memcmp(s, name, nlen) && s[nlen] == '=') {
                        *arg = s + nlen + 1;
                        return smk_mount_opts[idx].opt;
                }
        }
        return Opt_error;
}

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
/*
 * Prefixes for bringup-mode log lines, indexed by the positive result code
 * of an access check.  The table is only ever read, so both the pointers
 * and the strings are const.  Slot 0 is what the smk_bu_*() reporters fall
 * back to when the result code is larger than SMACK_UNCONFINED_OBJECT.
 */
static const char * const smk_bu_mess[] = {
        "Bringup Error",        /* out-of-range result code */
        "Bringup",                /* SMACK_BRINGUP_ALLOW */
        "Unconfined Subject",        /* SMACK_UNCONFINED_SUBJECT */
        "Unconfined Object",        /* SMACK_UNCONFINED_OBJECT */
};

/*
 * smk_bu_mode - render an access mode as a short string of letters
 * @mode: bit mask of MAY_* access bits
 * @s: output buffer; needs room for up to six letters plus the NUL
 *
 * Emits one letter per set bit, always in the order r (MAY_READ),
 * w (MAY_WRITE), x (MAY_EXEC), a (MAY_APPEND), t (MAY_TRANSMUTE),
 * l (MAY_LOCK).  When none of these bits is set the result is "-".
 * The string is always NUL terminated.
 */
static void smk_bu_mode(int mode, char *s)
{
        static const char letters[] = "rwxatl";
        const int bits[] = {
                MAY_READ, MAY_WRITE, MAY_EXEC,
                MAY_APPEND, MAY_TRANSMUTE, MAY_LOCK,
        };
        char *out = s;
        int k;

        /* letters[] and bits[] are parallel; stop at the letters' NUL. */
        for (k = 0; letters[k] != '\0'; k++) {
                if (mode & bits[k])
                        *out++ = letters[k];
        }

        /* Nothing emitted: use a placeholder so the string is not empty. */
        if (out == s)
                *out++ = '-';
        *out = '\0';
}
#endif

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
/*
 * smk_bu_note - bringup-mode report for an explicit subject/object pair
 * @note: free-form text appended to the log line
 * @sskp: subject label
 * @oskp: object label
 * @mode: requested access (MAY_* bits)
 * @rc: result of the access check
 *
 * A zero or negative @rc is handed back unchanged and nothing is logged.
 * A positive @rc selects the message prefix from smk_bu_mess (values above
 * SMACK_UNCONFINED_OBJECT use slot 0), one line is logged, and 0 is
 * returned.
 *
 * Without CONFIG_SECURITY_SMACK_BRINGUP this is a macro that evaluates
 * to its @rc argument.
 */
static int smk_bu_note(char *note, struct smack_known *sskp,
                       struct smack_known *oskp, int mode, int rc)
{
        char modestr[SMK_NUM_ACCESS_TYPE + 1];
        int idx;

        if (rc <= 0)
                return rc;

        /* Out-of-range codes share the first message slot. */
        idx = rc > SMACK_UNCONFINED_OBJECT ? 0 : rc;

        smk_bu_mode(mode, modestr);
        pr_info("Smack %s: (%s %s %s) %s\n", smk_bu_mess[idx],
                sskp->smk_known, oskp->smk_known, modestr, note);
        return 0;
}
#else
#define smk_bu_note(note, sskp, oskp, mode, RC) (RC)
#endif

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
/*
 * smk_bu_current - bringup-mode report with the current task as subject
 * @note: free-form text appended to the log line
 * @oskp: object label
 * @mode: requested access (MAY_* bits)
 * @rc: result of the access check
 *
 * The subject label is taken from the current task's credentials.  A zero
 * or negative @rc is handed back unchanged and nothing is logged.  A
 * positive @rc selects the message prefix from smk_bu_mess (values above
 * SMACK_UNCONFINED_OBJECT use slot 0), one line including current->comm
 * is logged, and 0 is returned.
 *
 * Without CONFIG_SECURITY_SMACK_BRINGUP this is a macro that evaluates
 * to its @rc argument.
 */
static int smk_bu_current(char *note, struct smack_known *oskp,
                          int mode, int rc)
{
        struct task_smack *cur = smack_cred(current_cred());
        char modestr[SMK_NUM_ACCESS_TYPE + 1];
        int idx;

        if (rc <= 0)
                return rc;

        /* Out-of-range codes share the first message slot. */
        idx = rc > SMACK_UNCONFINED_OBJECT ? 0 : rc;

        smk_bu_mode(mode, modestr);
        pr_info("Smack %s: (%s %s %s) %s %s\n", smk_bu_mess[idx],
                cur->smk_task->smk_known, oskp->smk_known,
                modestr, current->comm, note);
        return 0;
}
#else
#define smk_bu_current(note, oskp, mode, RC) (RC)
#endif

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
/*
 * smk_bu_task - bringup-mode report for an access check against a task
 * @otp: the task being accessed
 * @mode: requested access (MAY_* bits)
 * @rc: result of the access check
 *
 * The subject label comes from the current task's credentials and the
 * object label from smk_of_task_struct_obj(@otp); both are looked up
 * before @rc is examined.  A zero or negative @rc is handed back unchanged
 * and nothing is logged.  A positive @rc selects the message prefix from
 * smk_bu_mess (values above SMACK_UNCONFINED_OBJECT use slot 0), one line
 * naming both tasks' comm is logged, and 0 is returned.
 *
 * Without CONFIG_SECURITY_SMACK_BRINGUP this is a macro that evaluates
 * to its @rc argument.
 */
static int smk_bu_task(struct task_struct *otp, int mode, int rc)
{
        struct task_smack *cur = smack_cred(current_cred());
        struct smack_known *obj_label = smk_of_task_struct_obj(otp);
        char modestr[SMK_NUM_ACCESS_TYPE + 1];
        int idx;

        if (rc <= 0)
                return rc;

        /* Out-of-range codes share the first message slot. */
        idx = rc > SMACK_UNCONFINED_OBJECT ? 0 : rc;

        smk_bu_mode(mode, modestr);
        pr_info("Smack %s: (%s %s %s) %s to %s\n", smk_bu_mess[idx],
                cur->smk_task->smk_known, obj_label->smk_known, modestr,
                current->comm, otp->comm);
        return 0;
}
#else
#define smk_bu_task(otp, mode, RC) (RC)
#endif

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
/*
 * smk_bu_inode - bringup-mode report for an access check on an inode
 * @inode: the inode being accessed
 * @mode: requested access (MAY_* bits)
 * @rc: result of the access check
 *
 * If the inode is already flagged SMK_INODE_IMPURE an "Unconfined
 * Corruption" line is logged first, whatever @rc is.  A zero or negative
 * @rc is then handed back unchanged.  A positive @rc selects the message
 * prefix from smk_bu_mess (values above SMACK_UNCONFINED_OBJECT use slot
 * 0), one line is logged, and 0 is returned.  When the result is
 * SMACK_UNCONFINED_SUBJECT and write or append access was requested, the
 * inode is flagged SMK_INODE_IMPURE.
 *
 * The inode number is printed with %lu: i_ino is an unsigned long, and
 * %ld would show large inode numbers as negative values.
 *
 * Without CONFIG_SECURITY_SMACK_BRINGUP this is a macro that evaluates
 * to its @rc argument.
 */
static int smk_bu_inode(struct inode *inode, int mode, int rc)
{
        struct task_smack *tsp = smack_cred(current_cred());
        struct inode_smack *isp = smack_inode(inode);
        char acc[SMK_NUM_ACCESS_TYPE + 1];

        if (isp->smk_flags & SMK_INODE_IMPURE)
                pr_info("Smack Unconfined Corruption: inode=(%s %lu) %s\n",
                        inode->i_sb->s_id, inode->i_ino, current->comm);

        if (rc <= 0)
                return rc;
        /* Out-of-range codes share the first message slot. */
        if (rc > SMACK_UNCONFINED_OBJECT)
                rc = 0;
        /* Remember that an unconfined subject asked to modify this inode. */
        if (rc == SMACK_UNCONFINED_SUBJECT &&
            (mode & (MAY_WRITE | MAY_APPEND)))
                isp->smk_flags |= SMK_INODE_IMPURE;

        smk_bu_mode(mode, acc);

        pr_info("Smack %s: (%s %s %s) inode=(%s %lu) %s\n", smk_bu_mess[rc],
                tsp->smk_task->smk_known, isp->smk_inode->smk_known, acc,
                inode->i_sb->s_id, inode->i_ino, current->comm);
        return 0;
}
#else
#define smk_bu_inode(inode, mode, RC) (RC)
#endif

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
/*
 * Bring-up logging for a check made by the current task against @file.
 *
 * An inode already flagged SMK_INODE_IMPURE is always reported first.
 * A non-positive @rc is then handed back untouched.  A positive @rc is
 * reported with pr_info() and turned into 0; values above
 * SMACK_UNCONFINED_OBJECT use entry 0 of smk_bu_mess[].
 */
static int smk_bu_file(struct file *file, int mode, int rc)
{
        struct smack_known *subject = smack_cred(current_cred())->smk_task;
        struct inode *inode = file_inode(file);
        struct inode_smack *isp = smack_inode(inode);
        char acc[SMK_NUM_ACCESS_TYPE + 1];
        int idx;

        if (isp->smk_flags & SMK_INODE_IMPURE)
                pr_info("Smack Unconfined Corruption: inode=(%s %ld) %s\n",
                        inode->i_sb->s_id, inode->i_ino, current->comm);

        if (rc <= 0)
                return rc;

        idx = (rc > SMACK_UNCONFINED_OBJECT) ? 0 : rc;

        smk_bu_mode(mode, acc);
        pr_info("Smack %s: (%s %s %s) file=(%s %ld %pD) %s\n", smk_bu_mess[idx],
                subject->smk_known, smk_of_inode(inode)->smk_known, acc,
                inode->i_sb->s_id, inode->i_ino, file,
                current->comm);
        return 0;
}
#else
#define smk_bu_file(file, mode, RC) (RC)
#endif

#ifdef CONFIG_SECURITY_SMACK_BRINGUP
/*
 * Bring-up logging for a check made on behalf of @cred against @file.
 *
 * Same reporting as for a file check by the current task, except that the
 * subject label is taken from @cred; the comm printed is still that of
 * the current task.  A non-positive @rc is handed back untouched, a
 * positive one is reported and turned into 0.
 */
static int smk_bu_credfile(const struct cred *cred, struct file *file,
                                int mode, int rc)
{
        struct smack_known *subject = smack_cred(cred)->smk_task;
        struct inode *inode = file_inode(file);
        struct inode_smack *isp = smack_inode(inode);
        char acc[SMK_NUM_ACCESS_TYPE + 1];
        int idx;

        if (isp->smk_flags & SMK_INODE_IMPURE)
                pr_info("Smack Unconfined Corruption: inode=(%s %ld) %s\n",
                        inode->i_sb->s_id, inode->i_ino, current->comm);

        if (rc <= 0)
                return rc;

        idx = (rc > SMACK_UNCONFINED_OBJECT) ? 0 : rc;

        smk_bu_mode(mode, acc);
        pr_info("Smack %s: (%s %s %s) file=(%s %ld %pD) %s\n", smk_bu_mess[idx],
                subject->smk_known, smk_of_inode(inode)->smk_known, acc,
                inode->i_sb->s_id, inode->i_ino, file,
                current->comm);
        return 0;
}
#else
#define smk_bu_credfile(cred, file, mode, RC) (RC)
#endif

/**
 * smk_fetch - read a Smack label xattr and look up its label entry
 * @name: name of the extended attribute to read
 * @ip: the inode the attribute belongs to
 * @dp: the dentry used for the xattr lookup
 *
 * Returns the label entry produced by smk_import_entry() for the attribute
 * value, NULL when the attribute is empty, or an ERR_PTR(): -EOPNOTSUPP if
 * the inode lacks IOP_XATTR, -ENOMEM if the scratch buffer cannot be
 * allocated, or whatever error the xattr read reported.
 */
static struct smack_known *smk_fetch(const char *name, struct inode *ip,
                                        struct dentry *dp)
{
        struct smack_known *found;
        char *value;
        int len;

        if ((ip->i_opflags & IOP_XATTR) == 0)
                return ERR_PTR(-EOPNOTSUPP);

        value = kzalloc(SMK_LONGLABEL, GFP_NOFS);
        if (!value)
                return ERR_PTR(-ENOMEM);

        len = __vfs_getxattr(dp, ip, name, value, SMK_LONGLABEL);
        if (len > 0)
                found = smk_import_entry(value, len);
        else if (len == 0)
                found = NULL;
        else
                found = ERR_PTR(len);

        /* The scratch buffer is released on every path */
        kfree(value);

        return found;
}

/**
 * init_inode_smack - reset an inode security blob
 * @inode: inode whose Smack blob is being set up
 * @skp: label entry to record as the inode's label
 *
 * Clears all blob flags and stores @skp as the inode label.
 */
static void init_inode_smack(struct inode *inode, struct smack_known *skp)
{
        struct inode_smack *blob = smack_inode(inode);

        blob->smk_flags = 0;
        blob->smk_inode = skp;
}

/**
 * init_task_smack - set up a task security blob
 * @tsp: the blob to fill in
 * @task: label entry stored as the blob's task label
 * @forked: label entry stored as the blob's forked label
 *
 * Stores both labels and initialises the rule list, the relabel list
 * and the rules mutex so that they start out empty and unlocked.
 */
static void init_task_smack(struct task_smack *tsp, struct smack_known *task,
                                        struct smack_known *forked)
{
        mutex_init(&tsp->smk_rules_lock);
        INIT_LIST_HEAD(&tsp->smk_relabel);
        INIT_LIST_HEAD(&tsp->smk_rules);
        tsp->smk_forked = forked;
        tsp->smk_task = task;
}

/**
 * smk_copy_rules - duplicate every rule on a list
 * @nhead: list head that receives the copies
 * @ohead: list head of the rules to copy
 * @gfp: allocation flags for the new rule objects
 *
 * Each rule on @ohead is copied into a freshly allocated object from
 * smack_rule_cache and linked onto @nhead.  If an allocation fails the
 * walk stops; copies already linked onto @nhead are left in place.
 *
 * Returns 0 on success, -ENOMEM on error
 */
static int smk_copy_rules(struct list_head *nhead, struct list_head *ohead,
                                gfp_t gfp)
{
        struct smack_rule *src;
        struct smack_rule *copy;

        list_for_each_entry_rcu(src, ohead, list) {
                copy = kmem_cache_zalloc(smack_rule_cache, gfp);
                if (!copy)
                        return -ENOMEM;

                *copy = *src;
                list_add_rcu(&copy->list, nhead);
        }

        return 0;
}

/**
 * smk_copy_relabel - duplicate a relabel list
 * @nhead: list head that receives the copies
 * @ohead: list head of the entries to copy
 * @gfp: allocation flags for the new list elements
 *
 * Every element of @ohead gets a new element on @nhead that refers to
 * the same label.  If an allocation fails, whatever has been put on
 * @nhead so far is torn down with smk_destroy_label_list().
 *
 * Returns 0 on success, -ENOMEM on error
 */
static int smk_copy_relabel(struct list_head *nhead, struct list_head *ohead,
                                gfp_t gfp)
{
        struct smack_known_list_elem *src;
        struct smack_known_list_elem *dup;

        list_for_each_entry(src, ohead, list) {
                dup = kzalloc(sizeof(*dup), gfp);
                if (!dup) {
                        smk_destroy_label_list(nhead);
                        return -ENOMEM;
                }

                dup->smk_label = src->smk_label;
                list_add(&dup->list, nhead);
        }

        return 0;
}

/**
 * smk_ptrace_mode - translate a PTRACE_MODE_* request into MAY_* bits
 * @mode: ptrace mode flags (PTRACE_MODE_*)
 *
 * PTRACE_MODE_ATTACH takes precedence and maps to MAY_READWRITE;
 * otherwise PTRACE_MODE_READ maps to MAY_READ.  Anything else yields 0.
 *
 * Returns the MAY_* mode to use in a Smack rule check
 */
static inline unsigned int smk_ptrace_mode(unsigned int mode)
{
        unsigned int may = 0;

        if (mode & PTRACE_MODE_ATTACH)
                may = MAY_READWRITE;
        else if (mode & PTRACE_MODE_READ)
                may = MAY_READ;

        return may;
}

/**
 * smk_ptrace_rule_check - decide whether a tracer may ptrace a label
 * @tracer: the task that wants to trace
 * @tracee_known: label entry of the task to be traced
 * @mode: ptrace mode flags (PTRACE_MODE_*)
 * @func: name of the calling function, recorded in the audit data
 *
 * Audit data is only prepared when PTRACE_MODE_NOAUDIT is absent from
 * @mode.  The tracer's credentials are read under rcu_read_lock().
 *
 * For an attach request while smack_ptrace_rule is SMACK_PTRACE_EXACT or
 * SMACK_PTRACE_DRACONIAN, access is granted when both labels have the
 * same name; under the exact rule a tracer with CAP_SYS_PTRACE is also
 * let through.  All other requests go through the regular rule lookup
 * in smk_tskacc().
 *
 * Returns 0 on access granted, -error on error
 */
static int smk_ptrace_rule_check(struct task_struct *tracer,
                                 struct smack_known *tracee_known,
                                 unsigned int mode, const char *func)
{
        struct smk_audit_info ad, *saip = NULL;
        const struct cred *tracercred;
        struct smack_known *tracer_known;
        struct task_smack *tsp;
        int rc;

        if (!(mode & PTRACE_MODE_NOAUDIT)) {
                smk_ad_init(&ad, func, LSM_AUDIT_DATA_TASK);
                smk_ad_setfield_u_tsk(&ad, tracer);
                saip = &ad;
        }

        rcu_read_lock();
        tracercred = __task_cred(tracer);
        tsp = smack_cred(tracercred);
        tracer_known = smk_of_task(tsp);

        if (!(mode & PTRACE_MODE_ATTACH) ||
            (smack_ptrace_rule != SMACK_PTRACE_EXACT &&
             smack_ptrace_rule != SMACK_PTRACE_DRACONIAN)) {
                /* rule==SMACK_PTRACE_DEFAULT or mode==PTRACE_MODE_READ */
                rc = smk_tskacc(tsp, tracee_known, smk_ptrace_mode(mode),
                                saip);
                goto out;
        }

        if (tracer_known->smk_known == tracee_known->smk_known)
                rc = 0;
        else if (smack_ptrace_rule == SMACK_PTRACE_DRACONIAN)
                rc = -EACCES;
        else
                rc = smack_privileged_cred(CAP_SYS_PTRACE, tracercred) ?
                        0 : -EACCES;

        if (saip)
                smack_log(tracer_known->smk_known, tracee_known->smk_known,
                          0, rc, saip);

out:
        rcu_read_unlock();
        return rc;
}

/*
 * LSM hooks.
 * We he, that is fun!
 */

/**
 * smack_ptrace_access_check - Smack approval on PTRACE_ATTACH
 * @ctp: the task that is about to be traced
 * @mode: ptrace mode flags (PTRACE_MODE_*)
 *
 * The current task acts as the tracer; the object label of @ctp is the
 * tracee label handed to smk_ptrace_rule_check().
 *
 * Returns 0 if access is OK, an error code otherwise
 */
static int smack_ptrace_access_check(struct task_struct *ctp, unsigned int mode)
{
        return smk_ptrace_rule_check(current, smk_of_task_struct_obj(ctp),
                                     mode, __func__);
}

/**
 * smack_ptrace_traceme - Smack approval on PTRACE_TRACEME
 * @ptp: the parent task, which becomes the tracer
 *
 * The current task's label is the tracee label and the check is always
 * made with PTRACE_MODE_ATTACH.
 *
 * Returns 0 if access is OK, an error code otherwise
 */
static int smack_ptrace_traceme(struct task_struct *ptp)
{
        struct smack_known *self = smk_of_task(smack_cred(current_cred()));

        return smk_ptrace_rule_check(ptp, self, PTRACE_MODE_ATTACH, __func__);
}

/**
 * smack_syslog - Smack approval on syslog
 * @typefrom_file: unused
 *
 * A task holding CAP_MAC_OVERRIDE is always allowed.  Otherwise access is
 * refused only when smack_syslog_label is set and differs from the
 * current task's label.
 *
 * Returns 0 on success, error code otherwise.
 */
static int smack_syslog(int typefrom_file)
{
        struct smack_known *skp = smk_of_current();

        if (smack_privileged(CAP_MAC_OVERRIDE))
                return 0;

        if (smack_syslog_label == NULL || smack_syslog_label == skp)
                return 0;

        return -EACCES;
}

/*
 * Superblock Hooks.
 */

/**
 * smack_sb_alloc_security - set default labels in a superblock blob
 * @sb: the superblock whose blob is being set up
 *
 * The hat label defaults to smack_known_hat; root, default and floor all
 * default to smack_known_floor.
 *
 * Returns 0; nothing in this function can fail.
 */
static int smack_sb_alloc_security(struct super_block *sb)
{
        struct superblock_smack *sbsp = smack_superblock(sb);

        sbsp->smk_hat = &smack_known_hat;
        sbsp->smk_floor = &smack_known_floor;
        sbsp->smk_default = &smack_known_floor;
        sbsp->smk_root = &smack_known_floor;

        /*
         * smk_flags is deliberately not written here: SMK_SB_INITIALIZED
         * is expected to be clear already (the blob presumably arrives
         * zero-filled from its allocator).
         */
        return 0;
}

/*
 * Smack labels requested through mount options, one string per option.
 * A NULL field means the corresponding option was not given.
 */
struct smack_mnt_opts {
        const char *fsdefault;          /* label for Opt_fsdefault */
        const char *fsfloor;            /* label for Opt_fsfloor */
        const char *fshat;              /* label for Opt_fshat */
        const char *fsroot;             /* label for Opt_fsroot */
        const char *fstransmute;        /* label for Opt_fstransmute */
};

/*
 * Release a mount options container.  Only the container itself is
 * freed; the label strings its fields point to are left alone.
 */
static void smack_free_mnt_opts(void *mnt_opts)
{
        kfree(mnt_opts);
}

static int smack_add_opt(int token, const char *s, void **mnt_opts)
{
        struct smack_mnt_opts *opts = *mnt_opts;
        struct smack_known *skp;

        if (!opts) {
                opts = kzalloc(sizeof(struct smack_mnt_opts), GFP_KERNEL);
                if (!opts)
                        return -ENOMEM;
                *mnt_opts = opts;
        }
        if (!s)
                return -ENOMEM;

        skp = smk_import_entry(s, 0);
        if (IS_ERR(skp))
                return PTR_ERR(skp);

        switch (token) {
        case Opt_fsdefault:
                if (opts->fsdefault)
                        goto out_opt_err;
                opts->fsdefault = skp->smk_known;
                break;
        case Opt_fsfloor:
                if (opts->fsfloor)
                        goto out_opt_err;
                opts->fsfloor = skp->smk_known;
                break;
        case Opt_fshat:
                if (opts->fshat)
                        goto out_opt_err;
                opts->fshat = skp->smk_known;
                break;
        case Opt_fsroot:
                if (opts->fsroot)
                        goto out_opt_err;
                opts->fsroot = skp->smk_known;
                break;
        case Opt_fstransmute:
                if (opts->fstransmute)
                        goto out_opt_err;
                opts->fstransmute = skp->smk_known;
                break;
        }
        return 0;

out_opt_err:
        pr_warn("Smack: duplicate mount options\n");
        return -EINVAL;
}

/**
 * smack_fs_context_submount - Initialise security data for a filesystem context
 * @fc: The filesystem context.
 * @reference: reference superblock
 *
 * Seeds the new context's mount options from the labels of @reference:
 * the default, floor and hat labels are carried over, and when the root
 * inode of @reference is marked SMK_INODE_TRANSMUTE the root label is
 * carried over as the transmute option.
 *
 * Returns 0 on success or -ENOMEM on error.
 */
static int smack_fs_context_submount(struct fs_context *fc,
                                 struct super_block *reference)
{
        struct superblock_smack *sbsp;
        struct smack_mnt_opts *ctx;
        struct inode_smack *isp;

        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;
        fc->security = ctx;

        sbsp = smack_superblock(reference);
        isp = smack_inode(reference->s_root->d_inode);

        /*
         * The option fields borrow the name strings held by the label
         * entries, exactly as smack_add_opt() does.  They must not be
         * private copies: smack_free_mnt_opts() releases only the
         * container, so duplicated strings would never be freed.
         */
        if (sbsp->smk_default)
                ctx->fsdefault = sbsp->smk_default->smk_known;

        if (sbsp->smk_floor)
                ctx->fsfloor = sbsp->smk_floor->smk_known;

        if (sbsp->smk_hat)
                ctx->fshat = sbsp->smk_hat->smk_known;

        if ((isp->smk_flags & SMK_INODE_TRANSMUTE) && sbsp->smk_root)
                ctx->fstransmute = sbsp->smk_root->smk_known;

        return 0;
}

/**
 * smack_fs_context_dup - Duplicate the security data on fs_context duplication
 * @fc: The new filesystem context.
 * @src_fc: The source filesystem context being duplicated.
 *
 * Returns 0 on success or -ENOMEM on error.
 */
static int smack_fs_context_dup(struct fs_context *fc,
                                struct fs_context *src_fc)
{
        struct smack_mnt_opts *dst, *src = src_fc->security;

        if (!src)
                return 0;

        fc->security = kzalloc(sizeof(struct smack_mnt_opts), GFP_KERNEL);
        if (!fc->security)
                return -ENOMEM;

        dst = fc->security;
        dst->fsdefault = src->fsdefault;
        dst->fsfloor = src->fsfloor;
        dst->fshat = src->fshat;
        dst->fsroot = src->fsroot;
        dst->fstransmute = src->fstransmute;

        return 0;
}

/*
 * Mount parameter names understood by Smack and the Opt_* token each one
 * maps to.  "smackfsdef" and "smackfsdefault" both select Opt_fsdefault.
 * The empty entry terminates the table.
 */
static const struct fs_parameter_spec smack_fs_parameters[] = {
        fsparam_string("smackfsdef",                Opt_fsdefault),
        fsparam_string("smackfsdefault",        Opt_fsdefault),
        fsparam_string("smackfsfloor",                Opt_fsfloor),
        fsparam_string("smackfshat",                Opt_fshat),
        fsparam_string("smackfsroot",                Opt_fsroot),
        fsparam_string("smackfstransmute",        Opt_fstransmute),
        {}
};

/**
 * smack_fs_context_parse_param - Parse a single mount parameter
 * @fc: The new filesystem context being constructed.
 * @param: The parameter.
 *
 * The parameter is matched against smack_fs_parameters; a match is
 * recorded in the context's Smack options by smack_add_opt().
 *
 * Returns 0 on success, -ENOPARAM to pass the parameter on or anything else on
 * error.
 */
static int smack_fs_context_parse_param(struct fs_context *fc,
                                        struct fs_parameter *param)
{
        struct fs_parse_result result;
        int opt;

        opt = fs_parse(fc, smack_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        /*
         * smack_add_opt() imports the label and keeps a pointer to the
         * imported entry's name, never to param->string.  The string is
         * therefore not taken over here: param->string is left in place
         * so that whoever allocated it can still free it.  Clearing it
         * would orphan the allocation.
         */
        return smack_add_opt(opt, param->string, &fc->security);
}

/*
 * Pull the Smack options out of a comma-separated option string.
 *
 * @options is walked one comma-delimited segment at a time.  Segments that
 * match_opt_prefix() recognises are recorded through smack_add_opt() and
 * dropped from the string; all other segments are kept, compacted towards
 * the front of @options in their original order.  On return @options is
 * NUL-terminated after the last kept segment.
 *
 * Returns 0 on success.  If recording an option fails, the options
 * container (if any) is freed, *@mnt_opts is set to NULL and the error
 * from smack_add_opt() is returned.
 */
static int smack_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
        /* from: start of the segment being read; to: where kept text goes */
        char *from = options, *to = options;
        bool first = true;

        while (1) {
                char *next = strchr(from, ',');
                int token, len, rc;
                char *arg = NULL;

                /* len covers the segment without its trailing comma */
                if (next)
                        len = next - from;
                else
                        len = strlen(from);

                token = match_opt_prefix(from, len, &arg);
                if (token != Opt_error) {
                        /*
                         * Hand smack_add_opt() a NUL-terminated copy of the
                         * value.  A failed copy yields NULL, which
                         * smack_add_opt() rejects with -ENOMEM.
                         */
                        arg = kmemdup_nul(arg, from + len - arg, GFP_KERNEL);
                        rc = smack_add_opt(token, arg, mnt_opts);
                        kfree(arg);
                        if (unlikely(rc)) {
                                if (*mnt_opts)
                                        smack_free_mnt_opts(*mnt_opts);
                                *mnt_opts = NULL;
                                return rc;
                        }
                } else {
                        if (!first) {        // copy with preceding comma
                                from--;
                                len++;
                        }
                        /* memmove: source and destination may overlap */
                        if (to != from)
                                memmove(to, from, len);
                        to += len;
                        first = false;
                }
                /* A NUL right after the segment means it was the last one */
                if (!from[len])
                        break;
                from += len + 1;
        }
        *to = '\0';
        return 0;
}

/**
 * smack_set_mnt_opts - set Smack specific mount options
 * @sb: the file system superblock
 * @mnt_opts: Smack mount options
 * @kern_flags: mount option from kernel space or user space
 * @set_kern_flags: where to store converted mount opts
 *
 * Returns 0 on success, an error code on failure
 *
 * Allow filesystems with binary mount data to explicitly set Smack mount
 * labels.
 */
static int smack_set_mnt_opts(struct super_block *sb,
                void *mnt_opts,
                unsigned long kern_flags,
                unsigned long *set_kern_flags)
{
        struct dentry *root = sb->s_root;
        struct inode *inode = d_backing_inode(root);
        struct superblock_smack *sp = smack_superblock(sb);
        struct inode_smack *isp;
        struct smack_known *skp;
        struct smack_mnt_opts *opts = mnt_opts;
        bool transmute = false;

        if (sp->smk_flags & SMK_SB_INITIALIZED)
                return 0;

        if (!smack_privileged(CAP_MAC_ADMIN)) {
                /*
                 * Unprivileged mounts don't get to specify Smack values.
                 */
                if (opts)
                        return -EPERM;
                /*
                 * Unprivileged mounts get root and default from the caller.
                 */
                skp = smk_of_current();
                sp->smk_root = skp;
                sp->smk_default = skp;
                /*
                 * For a handful of fs types with no user-controlled
                 * backing store it's okay to trust security labels
                 * in the filesystem. The rest are untrusted.
                 */
                if (sb->s_user_ns != &init_user_ns &&
                    sb->s_magic != SYSFS_MAGIC && sb->s_magic != TMPFS_MAGIC &&
                    sb->s_magic != RAMFS_MAGIC) {
                        transmute = true;
                        sp->smk_flags |= SMK_SB_UNTRUSTED;
                }
        }

        sp->smk_flags |= SMK_SB_INITIALIZED;

        if (opts) {
                if (opts->fsdefault) {
                        skp = smk_import_entry(opts->fsdefault, 0);
                        if (IS_ERR(skp))
                                return PTR_ERR(skp);
                        sp->smk_default = skp;
                }
                if (opts->fsfloor) {
                        skp = smk_import_entry(opts->fsfloor, 0);
                        if (IS_ERR(skp))
                                return PTR_ERR(skp);
                        sp->smk_floor = skp;
                }
                if (opts->fshat) {
                        skp = smk_import_entry(opts->fshat, 0);
                        if (IS_ERR(skp))
                                return PTR_ERR(skp);
                        sp->smk_hat = skp;
                }
                if (opts->fsroot) {
                        skp = smk_import_entry(opts->fsroot, 0);
                        if (IS_ERR(skp))
                                return PTR_ERR(skp);
                        sp->smk_root = skp;
                }
                if (opts->fstransmute) {
                        skp = smk_import_entry(opts->fstransmute, 0);
                        if (IS_ERR(skp))
                                return PTR_ERR(skp);
                        sp->smk_root = skp;
                        transmute = true;
                }
        }

        /*
         * Initialize the root inode.
         */
        init_inode_smack(inode, sp->smk_root);

        if (transmute) {
                isp = smack_inode(inode);
                isp->smk_flags |= SMK_INODE_TRANSMUTE;
        }

        return 0;
}

/**
 * smack_sb_statfs - Smack check on statfs
 * @dentry: identifies the file system in question
 *
 * The current task needs read access to the floor label of the
 * filesystem's superblock.
 *
 * Returns 0 if current can read the floor of the filesystem,
 * and error code otherwise
 */
static int smack_sb_statfs(struct dentry *dentry)
{
        struct superblock_smack *sbp = smack_superblock(dentry->d_sb);
        struct smk_audit_info ad;
        int rc;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
        smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

        rc = smk_curacc(sbp->smk_floor, MAY_READ, &ad);

        return smk_bu_current("statfs", sbp->smk_floor, MAY_READ, rc);
}

/*
 * BPRM hooks
 */

/**
 * smack_bprm_creds_for_exec - Update bprm->cred if needed for exec
 * @bprm: the exec information
 *
 * If the executable's inode carries a task label that differs from the
 * one in @bprm's credentials, the new credentials take that label.  The
 * switch is skipped when the file sits on an untrusted superblock and
 * the task label is not the superblock's root label.  A traced process
 * must pass the ptrace rule check for the new label, and any other
 * "unsafe" condition forbids the exec.
 *
 * Returns 0 on success, -EPERM if exec forbidden, or the error from the
 * ptrace rule check
 */
static int smack_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        struct inode *inode = file_inode(bprm->file);
        struct task_smack *bsp = smack_cred(bprm->cred);
        struct inode_smack *isp = smack_inode(inode);
        struct superblock_smack *sbsp;

        /* Nothing to change: no task label on the file, or same label */
        if (!isp->smk_task || isp->smk_task == bsp->smk_task)
                return 0;

        sbsp = smack_superblock(inode->i_sb);
        if ((sbsp->smk_flags & SMK_SB_UNTRUSTED) != 0 &&
            isp->smk_task != sbsp->smk_root)
                return 0;

        if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
                struct task_struct *tracer;
                int rc = 0;

                rcu_read_lock();
                tracer = ptrace_parent(current);
                if (likely(tracer != NULL))
                        rc = smk_ptrace_rule_check(tracer,
                                                   isp->smk_task,
                                                   PTRACE_MODE_ATTACH,
                                                   __func__);
                rcu_read_unlock();

                if (rc)
                        return rc;
        }

        if (bprm->unsafe & ~LSM_UNSAFE_PTRACE)
                return -EPERM;

        bsp->smk_task = isp->smk_task;
        bprm->per_clear |= PER_CLEAR_ON_SETID;

        /* A label other than the forked one makes this a secure exec. */
        if (bsp->smk_task != bsp->smk_forked)
                bprm->secureexec = 1;

        return 0;
}

/*
 * Inode hooks
 */

/**
 * smack_inode_alloc_security - set up a new inode blob
 * @inode: the inode in need of a blob
 *
 * The inode starts out with the current task's label and no flags.
 *
 * Returns 0
 */
static int smack_inode_alloc_security(struct inode *inode)
{
        init_inode_smack(inode, smk_of_current());

        return 0;
}

/**
 * smack_inode_init_security - copy out the smack from an inode
 * @inode: the newly created inode
 * @dir: containing directory object
 * @qstr: unused
 * @xattrs: where to put the attributes
 * @xattr_count: current number of LSM-provided xattrs (updated)
 *
 * Returns 0 if it all works out, -ENOMEM if there's no memory
 */
static int smack_inode_init_security(struct inode *inode, struct inode *dir,
                                     const struct qstr *qstr,
                                     struct xattr *xattrs, int *xattr_count)
{
        struct task_smack *tsp = smack_cred(current_cred());
        struct inode_smack *issp = smack_inode(inode);
        struct smack_known *skp = smk_of_task(tsp);
        struct smack_known *isp = smk_of_inode(inode);
        struct smack_known *dsp = smk_of_inode(dir);
        struct xattr *xattr = lsm_get_xattr_slot(xattrs, xattr_count);
        int may;

        /*
         * If equal, transmuting already occurred in
         * smack_dentry_create_files_as(). No need to check again.
         */
        if (tsp->smk_task != tsp->smk_transmuted) {
                rcu_read_lock();
                may = smk_access_entry(skp->smk_known, dsp->smk_known,
                                       &skp->smk_rules);
                rcu_read_unlock();
        }

        /*
         * In addition to having smk_task equal to smk_transmuted,
         * if the access rule allows transmutation and the directory
         * requests transmutation then by all means transmute.
         * Mark the inode as changed.
         */
        if ((tsp->smk_task == tsp->smk_transmuted) ||
            (may > 0 && ((may & MAY_TRANSMUTE) != 0) &&
             smk_inode_transmutable(dir))) {
                struct xattr *xattr_transmute;

                /*
                 * The caller of smack_dentry_create_files_as()
                 * should have overridden the current cred, so the
                 * inode label was already set correctly in
                 * smack_inode_alloc_security().
                 */
                if (tsp->smk_task != tsp->smk_transmuted)
                        isp = issp->smk_inode = dsp;

                issp->smk_flags |= SMK_INODE_TRANSMUTE;
                xattr_transmute = lsm_get_xattr_slot(xattrs,
                                                     xattr_count);
                if (xattr_transmute) {
                        xattr_transmute->value = kmemdup(TRANS_TRUE,
                                                         TRANS_TRUE_SIZE,
                                                         GFP_NOFS);
                        if (!xattr_transmute->value)
                                return -ENOMEM;

                        xattr_transmute->value_len = TRANS_TRUE_SIZE;
                        xattr_transmute->name = XATTR_SMACK_TRANSMUTE;
                }
        }

        issp->smk_flags |= SMK_INODE_INSTANT;

        if (xattr) {
                xattr->value = kstrdup(isp->smk_known, GFP_NOFS);
                if (!xattr->value)
                        return -ENOMEM;

                xattr->value_len = strlen(isp->smk_known);
                xattr->name = XATTR_SMACK_SUFFIX;
        }

        return 0;
}

/**
 * smack_inode_link - Smack check on link
 * @old_dentry: the existing object
 * @dir: unused
 * @new_dentry: the new object
 *
 * Write access to the existing object is required.  If the new dentry
 * already refers to an inode, write access to that inode is required
 * as well.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_link(struct dentry *old_dentry, struct inode *dir,
                            struct dentry *new_dentry)
{
        struct inode *old = d_backing_inode(old_dentry);
        struct inode *target;
        struct smk_audit_info ad;
        int rc;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
        smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry);

        rc = smk_curacc(smk_of_inode(old), MAY_WRITE, &ad);
        rc = smk_bu_inode(old, MAY_WRITE, rc);
        if (rc != 0 || !d_is_positive(new_dentry))
                return rc;

        target = d_backing_inode(new_dentry);
        smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry);
        rc = smk_curacc(smk_of_inode(target), MAY_WRITE, &ad);

        return smk_bu_inode(target, MAY_WRITE, rc);
}

/**
 * smack_inode_unlink - Smack check on inode deletion
 * @dir: containing directory object
 * @dentry: file to unlink
 *
 * The object is checked first; the directory is only checked once
 * access to the object has been granted.
 *
 * Returns 0 if current can write the containing directory
 * and the object, error code otherwise
 */
static int smack_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_backing_inode(dentry);
        struct smk_audit_info ad;
        int rc;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
        smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

        /* Write access to the thing being unlinked ... */
        rc = smk_curacc(smk_of_inode(victim), MAY_WRITE, &ad);
        rc = smk_bu_inode(victim, MAY_WRITE, rc);
        if (rc != 0)
                return rc;

        /* ... and write access to the directory that holds it */
        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_INODE);
        smk_ad_setfield_u_fs_inode(&ad, dir);
        rc = smk_curacc(smk_of_inode(dir), MAY_WRITE, &ad);

        return smk_bu_inode(dir, MAY_WRITE, rc);
}

/**
 * smack_inode_rmdir - Smack check on directory deletion
 * @dir: containing directory object
 * @dentry: directory to unlink
 *
 * The directory being removed is checked first; its parent is only
 * checked once access to the directory itself has been granted.
 *
 * Returns 0 if current can write the containing directory
 * and the directory, error code otherwise
 */
static int smack_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_backing_inode(dentry);
        struct smk_audit_info ad;
        int rc;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
        smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

        /* Write access to the thing being removed ... */
        rc = smk_curacc(smk_of_inode(victim), MAY_WRITE, &ad);
        rc = smk_bu_inode(victim, MAY_WRITE, rc);
        if (rc != 0)
                return rc;

        /* ... and write access to the directory that holds it */
        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_INODE);
        smk_ad_setfield_u_fs_inode(&ad, dir);
        rc = smk_curacc(smk_of_inode(dir), MAY_WRITE, &ad);

        return smk_bu_inode(dir, MAY_WRITE, rc);
}

/**
 * smack_inode_rename - Smack check on rename
 * @old_inode: unused
 * @old_dentry: the old object
 * @new_inode: unused
 * @new_dentry: the new object
 *
 * Read and write access is required on the old object and, if the new
 * dentry already refers to an inode, on that inode too.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_rename(struct inode *old_inode,
                              struct dentry *old_dentry,
                              struct inode *new_inode,
                              struct dentry *new_dentry)
{
        struct inode *source = d_backing_inode(old_dentry);
        struct inode *target;
        struct smk_audit_info ad;
        int rc;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
        smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry);

        rc = smk_curacc(smk_of_inode(source), MAY_READWRITE, &ad);
        rc = smk_bu_inode(source, MAY_READWRITE, rc);
        if (rc != 0 || !d_is_positive(new_dentry))
                return rc;

        target = d_backing_inode(new_dentry);
        smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry);
        rc = smk_curacc(smk_of_inode(target), MAY_READWRITE, &ad);

        return smk_bu_inode(target, MAY_READWRITE, rc);
}

/**
 * smack_inode_permission - Smack version of permission()
 * @inode: the inode in question
 * @mask: the access requested
 *
 * This is the important Smack hook.
 *
 * Only the read, write, exec and append bits of @mask are checked.  On
 * an untrusted superblock, any inode not carrying the superblock's root
 * label is refused outright.  A MAY_NOT_BLOCK request gets -ECHILD
 * rather than a rule check.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_permission(struct inode *inode, int mask)
{
        struct superblock_smack *sbsp = smack_superblock(inode->i_sb);
        const int access = mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND);
        struct smk_audit_info ad;
        int rc;

        /* No permission to check. Existence test. Yup, it's there. */
        if (access == 0)
                return 0;

        if ((sbsp->smk_flags & SMK_SB_UNTRUSTED) &&
            smk_of_inode(inode) != sbsp->smk_root)
                return -EACCES;

        /* May be droppable after audit */
        if (mask & MAY_NOT_BLOCK)
                return -ECHILD;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_INODE);
        smk_ad_setfield_u_fs_inode(&ad, inode);
        rc = smk_curacc(smk_of_inode(inode), access, &ad);

        return smk_bu_inode(inode, access, rc);
}

/**
 * smack_inode_setattr - Smack check for setting attributes
 * @idmap: idmap of the mount (not used by this check)
 * @dentry: the object
 * @iattr: the attribute change; only ia_valid is examined, for ATTR_FORCE
 *
 * A request carrying ATTR_FORCE is allowed without a label check.
 * Anything else requires write access to the object behind @dentry.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                               struct iattr *iattr)
{
        struct smk_audit_info ad;
        struct inode *inode;
        int rc;

        /* Forced updates, e.g. clearing the setuid bit, must go through. */
        if (iattr->ia_valid & ATTR_FORCE)
                return 0;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
        smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

        inode = d_backing_inode(dentry);
        rc = smk_curacc(smk_of_inode(inode), MAY_WRITE, &ad);
        return smk_bu_inode(inode, MAY_WRITE, rc);
}

/**
 * smack_inode_getattr - Smack check for getting attributes
 * @path: path whose dentry identifies the object
 *
 * Requires read access to the object behind @path.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_getattr(const struct path *path)
{
        struct inode *target = d_backing_inode(path->dentry);
        struct smk_audit_info ad;
        int rc;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
        smk_ad_setfield_u_fs_path(&ad, *path);

        rc = smk_curacc(smk_of_inode(target), MAY_READ, &ad);
        return smk_bu_inode(target, MAY_READ, rc);
}

/**
 * smack_inode_setxattr - Smack check for setting xattrs
 * @idmap: idmap of the mount (not used by this check)
 * @dentry: the object
 * @name: name of the attribute
 * @value: value of the attribute
 * @size: size of the value
 * @flags: passed through to cap_inode_setxattr() for non-Smack attributes
 *
 * This protects the Smack attribute explicitly.
 *
 * Every Smack attribute requires CAP_MAC_ADMIN (-EPERM otherwise).
 * Label-valued attributes must carry a non-empty value that
 * smk_import_entry() accepts; for SMACK64EXEC and SMACK64MMAP the star
 * and web labels are additionally refused. SMACK64TRANSMUTE is only
 * accepted on a directory and only with the TRANS_TRUE value. All other
 * attributes are left to cap_inode_setxattr(). When those checks pass,
 * write access to the object is required.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_setxattr(struct mnt_idmap *idmap,
                                struct dentry *dentry, const char *name,
                                const void *value, size_t size, int flags)
{
        struct smk_audit_info ad;
        struct smack_known *label;
        struct inode *inode;
        int need_priv = 1;
        int need_import = 0;
        int refuse_star_web = 0;
        int rc = 0;

        /*
         * Check label validity here so import won't fail in post_setxattr
         */
        if (!strcmp(name, XATTR_NAME_SMACK) ||
            !strcmp(name, XATTR_NAME_SMACKIPIN) ||
            !strcmp(name, XATTR_NAME_SMACKIPOUT)) {
                need_import = 1;
        } else if (!strcmp(name, XATTR_NAME_SMACKEXEC) ||
                   !strcmp(name, XATTR_NAME_SMACKMMAP)) {
                need_import = 1;
                refuse_star_web = 1;
        } else if (!strcmp(name, XATTR_NAME_SMACKTRANSMUTE)) {
                if (!S_ISDIR(d_backing_inode(dentry)->i_mode) ||
                    size != TRANS_TRUE_SIZE ||
                    strncmp(value, TRANS_TRUE, TRANS_TRUE_SIZE) != 0)
                        rc = -EINVAL;
        } else {
                /* Not a Smack attribute: defer to the capability check. */
                need_priv = 0;
                rc = cap_inode_setxattr(dentry, name, value, size, flags);
        }

        /* Lack of privilege overrides any earlier validation result. */
        if (need_priv && !smack_privileged(CAP_MAC_ADMIN))
                rc = -EPERM;

        if (rc == 0 && need_import) {
                label = size ? smk_import_entry(value, size) : NULL;
                if (IS_ERR(label))
                        rc = PTR_ERR(label);
                else if (label == NULL)
                        rc = -EINVAL;
                else if (refuse_star_web &&
                         (label == &smack_known_star ||
                          label == &smack_known_web))
                        rc = -EINVAL;
        }

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
        smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

        if (rc != 0)
                return rc;

        inode = d_backing_inode(dentry);
        rc = smk_curacc(smk_of_inode(inode), MAY_WRITE, &ad);
        return smk_bu_inode(inode, MAY_WRITE, rc);
}

/**
 * smack_inode_post_setxattr - Apply the Smack update approved above
 * @dentry: object
 * @name: attribute name
 * @value: attribute value
 * @size: attribute size
 * @flags: unused
 *
 * For SMACK64TRANSMUTE the transmute flag is set in the inode blob.
 * For SMACK64, SMACK64EXEC and SMACK64MMAP the matching pointer in the
 * inode blob is set to the entry found in the master label list; the
 * pointer is left alone if the import reports an error. Any other
 * attribute name leaves the blob untouched.
 */
static void smack_inode_post_setxattr(struct dentry *dentry, const char *name,
                                      const void *value, size_t size, int flags)
{
        struct inode_smack *isp = smack_inode(d_backing_inode(dentry));
        struct smack_known **slot;
        struct smack_known *skp;

        if (!strcmp(name, XATTR_NAME_SMACKTRANSMUTE)) {
                isp->smk_flags |= SMK_INODE_TRANSMUTE;
                return;
        }

        /* Pick the blob field this attribute is cached in, if any. */
        if (!strcmp(name, XATTR_NAME_SMACK))
                slot = &isp->smk_inode;
        else if (!strcmp(name, XATTR_NAME_SMACKEXEC))
                slot = &isp->smk_task;
        else if (!strcmp(name, XATTR_NAME_SMACKMMAP))
                slot = &isp->smk_mmap;
        else
                return;

        skp = smk_import_entry(value, size);
        if (!IS_ERR(skp))
                *slot = skp;
}

/**
 * smack_inode_getxattr - Smack check on getxattr
 * @dentry: the object
 * @name: unused
 *
 * Requires read access to the object behind @dentry, whichever
 * attribute is being read.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_getxattr(struct dentry *dentry, const char *name)
{
        struct inode *target = d_backing_inode(dentry);
        struct smk_audit_info ad;
        int rc;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
        smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

        rc = smk_curacc(smk_of_inode(target), MAY_READ, &ad);
        return smk_bu_inode(target, MAY_READ, rc);
}

/**
 * smack_inode_removexattr - Smack check on removexattr
 * @idmap: idmap of the mount
 * @dentry: the object
 * @name: name of the attribute
 *
 * Removing a Smack attribute requires CAP_MAC_ADMIN; any other
 * attribute is vetted by cap_inode_removexattr(). Write access to the
 * object is then required. When everything passes, the value cached in
 * the inode blob for the removed Smack attribute is reset.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_removexattr(struct mnt_idmap *idmap,
                                   struct dentry *dentry, const char *name)
{
        struct smk_audit_info ad;
        struct inode_smack *isp;
        struct inode *inode;
        int rc;

        if (!strcmp(name, XATTR_NAME_SMACK) ||
            !strcmp(name, XATTR_NAME_SMACKIPIN) ||
            !strcmp(name, XATTR_NAME_SMACKIPOUT) ||
            !strcmp(name, XATTR_NAME_SMACKEXEC) ||
            !strcmp(name, XATTR_NAME_SMACKTRANSMUTE) ||
            !strcmp(name, XATTR_NAME_SMACKMMAP)) {
                if (!smack_privileged(CAP_MAC_ADMIN))
                        return -EPERM;
        } else {
                rc = cap_inode_removexattr(idmap, dentry, name);
                if (rc != 0)
                        return rc;
        }

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
        smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

        inode = d_backing_inode(dentry);
        rc = smk_curacc(smk_of_inode(inode), MAY_WRITE, &ad);
        rc = smk_bu_inode(inode, MAY_WRITE, rc);
        if (rc != 0)
                return rc;

        /*
         * Reset the cached copy of the attribute being removed.
         * Nothing is cached in the inode blob for
         *        XATTR_NAME_SMACKIPIN
         *        XATTR_NAME_SMACKIPOUT
         * so those need no special handling.
         */
        isp = smack_inode(inode);
        if (!strcmp(name, XATTR_NAME_SMACK)) {
                /* Fall back to the superblock's default label. */
                isp->smk_inode = smack_superblock(dentry->d_sb)->smk_default;
        } else if (!strcmp(name, XATTR_NAME_SMACKEXEC)) {
                isp->smk_task = NULL;
        } else if (!strcmp(name, XATTR_NAME_SMACKMMAP)) {
                isp->smk_mmap = NULL;
        } else if (!strcmp(name, XATTR_NAME_SMACKTRANSMUTE)) {
                isp->smk_flags &= ~SMK_INODE_TRANSMUTE;
        }

        return 0;
}

/**
 * smack_inode_set_acl - Smack check for setting posix acls
 * @idmap: idmap of the mnt this request came from
 * @dentry: the object
 * @acl_name: name of the posix acl
 * @kacl: the posix acls
 *
 * Requires write access to the object behind @dentry. Only @dentry is
 * consulted by this check.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_set_acl(struct mnt_idmap *idmap,
                               struct dentry *dentry, const char *acl_name,
                               struct posix_acl *kacl)
{
        struct inode *target = d_backing_inode(dentry);
        struct smk_audit_info ad;
        int rc;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
        smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

        rc = smk_curacc(smk_of_inode(target), MAY_WRITE, &ad);
        return smk_bu_inode(target, MAY_WRITE, rc);
}

/**
 * smack_inode_get_acl - Smack check for getting posix acls
 * @idmap: idmap of the mnt this request came from
 * @dentry: the object
 * @acl_name: name of the posix acl
 *
 * Requires read access to the object behind @dentry. Only @dentry is
 * consulted by this check.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_get_acl(struct mnt_idmap *idmap,
                               struct dentry *dentry, const char *acl_name)
{
        struct inode *target = d_backing_inode(dentry);
        struct smk_audit_info ad;
        int rc;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
        smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

        rc = smk_curacc(smk_of_inode(target), MAY_READ, &ad);
        return smk_bu_inode(target, MAY_READ, rc);
}

/**
 * smack_inode_remove_acl - Smack check for removing posix acls
 * @idmap: idmap of the mnt this request came from
 * @dentry: the object
 * @acl_name: name of the posix acl
 *
 * Requires write access to the object behind @dentry. Only @dentry is
 * consulted by this check.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smack_inode_remove_acl(struct mnt_idmap *idmap,
                                  struct dentry *dentry, const char *acl_name)
{
        struct inode *target = d_backing_inode(dentry);
        struct smk_audit_info ad;
        int rc;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_DENTRY);
        smk_ad_setfield_u_fs_path_dentry(&ad, dentry);

        rc = smk_curacc(smk_of_inode(target), MAY_WRITE, &ad);
        return smk_bu_inode(target, MAY_WRITE, rc);
}

/**
 * smack_inode_getsecurity - get smack xattrs
 * @idmap: idmap of the mount (not used here)
 * @inode: the object
 * @name: attribute name, without the security prefix
 * @buffer: where to put the result
 * @alloc: when true, *@buffer receives a kstrdup() copy of the value
 *
 * The access label and the transmute flag can be read from any inode.
 * The IPIN and IPOUT labels are only available for sockfs inodes that
 * still have a sock attached; every other request is answered with
 * -EOPNOTSUPP.
 *
 * Returns the length of the attribute value (not counting the
 * terminating NUL), -EOPNOTSUPP for an attribute that does not apply,
 * or -ENOMEM if the copy could not be allocated
 */
static int smack_inode_getsecurity(struct mnt_idmap *idmap,
                                   struct inode *inode, const char *name,
                                   void **buffer, bool alloc)
{
        const char *value;
        size_t value_len;

        if (!strcmp(name, XATTR_SMACK_SUFFIX)) {
                value = smk_of_inode(inode)->smk_known;
        } else if (!strcmp(name, XATTR_SMACK_TRANSMUTE)) {
                struct inode_smack *blob = smack_inode(inode);

                if (blob->smk_flags & SMK_INODE_TRANSMUTE)
                        value = TRANS_TRUE;
                else
                        value = "";
        } else {
                struct socket_smack *ssp;
                struct socket *sock;

                /*
                 * The rest of the Smack xattrs are only on sockets.
                 */
                if (inode->i_sb->s_magic != SOCKFS_MAGIC)
                        return -EOPNOTSUPP;

                sock = SOCKET_I(inode);
                if (sock == NULL || sock->sk == NULL)
                        return -EOPNOTSUPP;

                ssp = sock->sk->sk_security;

                if (!strcmp(name, XATTR_SMACK_IPIN))
                        value = ssp->smk_in->smk_known;
                else if (!strcmp(name, XATTR_SMACK_IPOUT))
                        value = ssp->smk_out->smk_known;
                else
                        return -EOPNOTSUPP;
        }

        value_len = strlen(value);

        /* Without @alloc only the length is reported. */
        if (alloc) {
                *buffer = kstrdup(value, GFP_KERNEL);
                if (*buffer == NULL)
                        return -ENOMEM;
        }

        return value_len;
}


/**
 * smack_inode_listsecurity - list the Smack attributes
 * @inode: the object (not consulted)
 * @buffer: where they go, may be NULL
 * @buffer_size: size of buffer
 *
 * The name is copied, terminating NUL included, only when @buffer is
 * non-NULL and large enough to hold it.
 *
 * Returns the number of bytes needed for the XATTR_NAME_SMACK name,
 * including its terminating NUL, whether or not anything was copied
 */
static int smack_inode_listsecurity(struct inode *inode, char *buffer,
                                    size_t buffer_size)
{
        int needed = sizeof(XATTR_NAME_SMACK);

        if (buffer && needed <= buffer_size)
                memcpy(buffer, XATTR_NAME_SMACK, needed);

        return needed;
}

/**
 * smack_inode_getsecid - Extract inode's security id
 * @inode: inode to extract the info from
 * @secid: where result will be saved
 *
 * Stores the secid of the label attached to @inode.
 */
static void smack_inode_getsecid(struct inode *inode, u32 *secid)
{
        *secid = smk_of_inode(inode)->smk_secid;
}

/*
 * File Hooks
 */

/*
 * There is no smack_file_permission hook
 *
 * Should access checks be done on each read or write?
 * UNICOS and SELinux say yes.
 * Trusted Solaris, Trusted Irix, and just about everyone else says no.
 *
 * I'll say no for now. Smack does not do the frequent
 * label changing that SELinux does.
 */

/**
 * smack_file_alloc_security - assign a file security blob
 * @file: the object
 *
 * The security blob for a file is a pointer to the master
 * label list, so no allocation is done. It is initialised to
 * the label of the current task.
 *
 * f_security is the owner security information. It
 * isn't used on file access checks, it's for send_sigio.
 *
 * Returns 0
 */
static int smack_file_alloc_security(struct file *file)
{
        struct smack_known **slot = smack_file(file);
        struct smack_known *creator = smk_of_current();

        *slot = creator;
        return 0;
}

/**
 * smack_file_ioctl - Smack check on ioctls
 * @file: the object
 * @cmd: what to do
 * @arg: unused
 *
 * Relies heavily on the correct use of the ioctl command conventions:
 * the direction bits encoded in @cmd decide which access is checked.
 * _IOC_WRITE requires write access, _IOC_READ requires read access, and
 * a command with neither bit set is allowed without a check. Files on
 * private inodes are always allowed.
 *
 * Returns 0 if allowed, error code otherwise
 */
static int smack_file_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg)
{
        struct inode *inode = file_inode(file);
        struct smk_audit_info ad;
        int rc = 0;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
        smk_ad_setfield_u_fs_path(&ad, file->f_path);

        if (_IOC_DIR(cmd) & _IOC_WRITE) {
                rc = smk_curacc(smk_of_inode(inode), MAY_WRITE, &ad);
                rc = smk_bu_file(file, MAY_WRITE, rc);
                /* A refused write check ends the evaluation. */
                if (rc != 0)
                        return rc;
        }

        if (_IOC_DIR(cmd) & _IOC_READ) {
                rc = smk_curacc(smk_of_inode(inode), MAY_READ, &ad);
                rc = smk_bu_file(file, MAY_READ, rc);
        }

        return rc;
}

/**
 * smack_file_lock - Smack check on file locking
 * @file: the object
 * @cmd: unused
 *
 * Files on private inodes are always allowed.
 *
 * Returns 0 if current has lock access, error code otherwise
 */
static int smack_file_lock(struct file *file, unsigned int cmd)
{
        struct inode *inode = file_inode(file);
        struct smk_audit_info ad;
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
        smk_ad_setfield_u_fs_path(&ad, file->f_path);

        rc = smk_curacc(smk_of_inode(inode), MAY_LOCK, &ad);
        return smk_bu_file(file, MAY_LOCK, rc);
}

/**
 * smack_file_fcntl - Smack check on fcntl
 * @file: the object
 * @cmd: what action to check
 * @arg: unused
 *
 * Generally these operations are harmless.
 * File locking operations present an obvious mechanism
 * for passing information, so F_SETLK and F_SETLKW require lock
 * access. F_SETOWN and F_SETSIG require write access. Every other
 * command, F_GETLK included, is allowed without a check, as is any
 * command on a file backed by a private inode.
 *
 * Returns 0 if current has access, error code otherwise
 */
static int smack_file_fcntl(struct file *file, unsigned int cmd,
                            unsigned long arg)
{
        struct inode *inode = file_inode(file);
        struct smk_audit_info ad;
        int access;
        int rc;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        /* Map the command onto the access it needs, if any. */
        switch (cmd) {
        case F_SETLK:
        case F_SETLKW:
                access = MAY_LOCK;
                break;
        case F_SETOWN:
        case F_SETSIG:
                access = MAY_WRITE;
                break;
        default:
                return 0;
        }

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
        smk_ad_setfield_u_fs_path(&ad, file->f_path);
        rc = smk_curacc(smk_of_inode(inode), access, &ad);
        return smk_bu_file(file, access, rc);
}

/**
 * smack_mmap_file - Check permissions for a mmap operation.
 * @file: contains the file structure for file to map (may be NULL).
 * @reqprot: contains the protection requested by the application.
 * @prot: contains the protection that will be applied by the kernel.
 * @flags: contains the operational flags.
 *
 * The @file may be NULL, e.g. if mapping anonymous memory.
 *
 * @reqprot, @prot and @flags are not examined by this hook. The check
 * only applies when the file's inode carries a SMACK64MMAP label
 * (smk_mmap). In that case every rule of the current task's label is
 * walked and the mapping is refused if the task has some access to a
 * rule's object that the SMACK64MMAP label does not have.
 *
 * Return 0 if permission is granted, -EACCES otherwise.
 */
static int smack_mmap_file(struct file *file,
                           unsigned long reqprot, unsigned long prot,
                           unsigned long flags)
{
        struct smack_known *skp;
        struct smack_known *mkp;
        struct smack_rule *srp;
        struct task_smack *tsp;
        struct smack_known *okp;
        struct inode_smack *isp;
        struct superblock_smack *sbsp;
        int may;
        int mmay;
        int tmay;
        int rc;

        /* Anonymous mappings have no file label to compare against. */
        if (file == NULL)
                return 0;

        if (unlikely(IS_PRIVATE(file_inode(file))))
                return 0;

        /* Without a SMACK64MMAP label there is nothing to enforce. */
        isp = smack_inode(file_inode(file));
        if (isp->smk_mmap == NULL)
                return 0;
        /*
         * On a superblock flagged untrusted only the superblock's
         * root label is accepted as a SMACK64MMAP label.
         */
        sbsp = smack_superblock(file_inode(file)->i_sb);
        if (sbsp->smk_flags & SMK_SB_UNTRUSTED &&
            isp->smk_mmap != sbsp->smk_root)
                return -EACCES;
        mkp = isp->smk_mmap;

        tsp = smack_cred(current_cred());
        skp = smk_of_current();
        rc = 0;

        /* The rule list is walked under RCU protection. */
        rcu_read_lock();
        /*
         * For each Smack rule associated with the subject
         * label verify that the SMACK64MMAP also has access
         * to that rule's object label.
         */
        list_for_each_entry_rcu(srp, &skp->smk_rules, list) {
                okp = srp->smk_object;
                /*
                 * Matching labels always allows access.
                 */
                if (mkp->smk_known == okp->smk_known)
                        continue;
                /*
                 * If there is a matching local rule take
                 * that into account as well.
                 */
                may = smk_access_entry(srp->smk_subject->smk_known,
                                       okp->smk_known,
                                       &tsp->smk_rules);
                if (may == -ENOENT)
                        may = srp->smk_access;
                else
                        may &= srp->smk_access;
                /*
                 * If may is zero the SMACK64MMAP subject can't
                 * possibly have less access.
                 */
                if (may == 0)
                        continue;

                /*
                 * Fetch the global list entry.
                 * If there isn't one a SMACK64MMAP subject
                 * can't have as much access as current.
                 */
                mmay = smk_access_entry(mkp->smk_known, okp->smk_known,
                                        &mkp->smk_rules);
                if (mmay == -ENOENT) {
                        rc = -EACCES;
                        break;
                }
                /*
                 * If there is a local entry it modifies the
                 * potential access, too.
                 */
                tmay = smk_access_entry(mkp->smk_known, okp->smk_known,
                                        &tsp->smk_rules);
                if (tmay != -ENOENT)
                        mmay &= tmay;

                /*
                 * If there is any access available to current that is
                 * not available to a SMACK64MMAP subject
                 * deny access.
                 *
                 * (may | mmay) differs from mmay exactly when may has
                 * a bit set that mmay lacks.
                 */
                if ((may | mmay) != mmay) {
                        rc = -EACCES;
                        break;
                }
        }

        rcu_read_unlock();

        return rc;
}

/**
 * smack_file_set_fowner - set the file security blob value
 * @file: object in question
 *
 * Records the label of the current task in the file's security blob.
 */
static void smack_file_set_fowner(struct file *file)
{
        struct smack_known **slot = smack_file(file);
        struct smack_known *owner = smk_of_current();

        *slot = owner;
}

/**
 * smack_file_send_sigiotask - Smack on sigio
 * @tsk: The target task
 * @fown: the object the signal come from
 * @signum: unused
 *
 * Allow a privileged task to get signals even if it shouldn't
 *
 * Returns 0 if a subject with the object's smack could
 * write to the task, an error code otherwise.
 */
static int smack_file_send_sigiotask(struct task_struct *tsk,
                                     struct fown_struct *fown, int signum)
{
        struct smack_known **blob;
        struct smack_known *skp;
        struct smack_known *tkp = smk_of_task(smack_cred(tsk->cred));
        const struct cred *tcred;
        struct file *file;
        int rc;
        struct smk_audit_info ad;

        /*
         * struct fown_struct is never outside the context of a struct file
         */
        file = container_of(fown, struct file, f_owner);

        /* we don't log here as rc can be overriden */
        blob = smack_file(file);
        skp = *blob;
        rc = smk_access(skp, tkp, MAY_DELIVER, NULL);
        rc = smk_bu_note("sigiotask", skp, tkp, MAY_DELIVER, rc);

        rcu_read_lock();
        tcred = __task_cred(tsk);
        if (rc != 0 && smack_privileged_cred(CAP_MAC_OVERRIDE, tcred))
                rc = 0;
        rcu_read_unlock();

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
        smk_ad_setfield_u_tsk(&ad, tsk);
        smack_log(skp->smk_known, tkp->smk_known, MAY_DELIVER, rc, &ad);
        return rc;
}

/**
 * smack_file_receive - Smack file receive check
 * @file: the object
 *
 * Files on private inodes are always accepted. For a passed socket
 * (an inode on sockfs) write access is required in both directions
 * between the receiving task and the socket's labels. For any other
 * file the access required follows the mode the file was opened with:
 * read for FMODE_READ, write for FMODE_WRITE.
 *
 * Returns 0 if current has access, error code otherwise
 */
static int smack_file_receive(struct file *file)
{
        int rc;
        int may = 0;
        struct smk_audit_info ad;
        struct inode *inode = file_inode(file);
        struct socket *sock;
        struct task_smack *tsp;
        struct socket_smack *ssp;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
        smk_ad_setfield_u_fs_path(&ad, file->f_path);

        if (inode->i_sb->s_magic == SOCKFS_MAGIC) {
                sock = SOCKET_I(inode);
                ssp = sock->sk->sk_security;
                tsp = smack_cred(current_cred());
                /*
                 * If the receiving process can't write to the
                 * passed socket or if the passed socket can't
                 * write to the receiving process don't accept
                 * the passed socket.
                 *
                 * Both checks ask for MAY_WRITE, so that is the mode
                 * handed to smk_bu_file() rather than the still-zero
                 * file-mode mask.
                 */
                rc = smk_access(tsp->smk_task, ssp->smk_out, MAY_WRITE, &ad);
                rc = smk_bu_file(file, MAY_WRITE, rc);
                if (rc < 0)
                        return rc;
                rc = smk_access(ssp->smk_in, tsp->smk_task, MAY_WRITE, &ad);
                rc = smk_bu_file(file, MAY_WRITE, rc);
                return rc;
        }
        /*
         * This code relies on bitmasks.
         */
        if (file->f_mode & FMODE_READ)
                may = MAY_READ;
        if (file->f_mode & FMODE_WRITE)
                may |= MAY_WRITE;

        rc = smk_curacc(smk_of_inode(inode), may, &ad);
        rc = smk_bu_file(file, may, rc);
        return rc;
}

/**
 * smack_file_open - Smack dentry open processing
 * @file: the object
 *
 * Allow the open only if the opener has read access. The check uses
 * the credentials recorded in the file (f_cred), not those of the
 * current task. There are many read operations (e.g. fstat) that you
 * can do with an fd even if you have the file open write-only.
 *
 * Returns 0 if the opener has access, error code otherwise
 */
static int smack_file_open(struct file *file)
{
        struct task_smack *opener = smack_cred(file->f_cred);
        struct inode *inode = file_inode(file);
        struct smk_audit_info ad;
        int rc;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
        smk_ad_setfield_u_fs_path(&ad, file->f_path);

        rc = smk_tskacc(opener, smk_of_inode(inode), MAY_READ, &ad);
        return smk_bu_credfile(file->f_cred, file, MAY_READ, rc);
}

/*
 * Task hooks
 */

/**
 * smack_cred_alloc_blank - "allocate" blank task-level security credentials
 * @cred: the new credentials
 * @gfp: the atomicity of any memory allocations (unused, nothing is allocated)
 *
 * Prepare a blank set of credentials for modification.  This must allocate all
 * the memory the LSM module might require such that cred_transfer() can
 * complete without error. Here the Smack blob of @cred is simply
 * initialised with no task label and no forked label.
 *
 * Returns 0
 */
static int smack_cred_alloc_blank(struct cred *cred, gfp_t gfp)
{
        struct task_smack *tsp = smack_cred(cred);

        init_task_smack(tsp, NULL, NULL);
        return 0;
}


/**
 * smack_cred_free - "free" task-level security credentials
 * @cred: the credentials in question
 *
 */
static void smack_cred_free(struct cred *cred)
{
        struct task_smack *tsp = smack_cred(cred);
        struct smack_rule *rp;
        struct list_head *l;
        struct list_head *n;

        smk_destroy_label_list(&tsp->smk_relabel);

        list_for_each_safe(l, n, &tsp->smk_rules) {
                rp = list_entry(l, struct smack_rule, list);
                list_del(&rp->list);
                kmem_cache_free(smack_rule_cache, rp);
        }
}

/**
 * smack_cred_prepare - prepare new set of credentials for modification
 * @new: the new credentials
 * @old: the original credentials
 * @gfp: the atomicity of any memory allocations
 *
 * Prepare a new set of credentials for modification. The new blob is
 * initialised from the task label of @old, then the private rule list
 * and the relabel list are copied over. The relabel list is not copied
 * if copying the rules fails.
 *
 * Returns 0 on success, otherwise the error from the failing copy
 */
static int smack_cred_prepare(struct cred *new, const struct cred *old,
                              gfp_t gfp)
{
        struct task_smack *src = smack_cred(old);
        struct task_smack *dst = smack_cred(new);
        int rc;

        init_task_smack(dst, src->smk_task, src->smk_task);

        rc = smk_copy_rules(&dst->smk_rules, &src->smk_rules, gfp);
        if (rc == 0)
                rc = smk_copy_relabel(&dst->smk_relabel, &src->smk_relabel,
                                      gfp);

        return rc;
}

/**
 * smack_cred_transfer - Transfer the old credentials to the new credentials
 * @new: the new credentials
 * @old: the original credentials
 *
 * Fill in a set of blank credentials from another set of credentials.
 * Only the task label of @old is carried over; it becomes both label
 * arguments given to init_task_smack().
 */
static void smack_cred_transfer(struct cred *new, const struct cred *old)
{
        struct task_smack *src = smack_cred(old);
        struct task_smack *dst = smack_cred(new);

        init_task_smack(dst, src->smk_task, src->smk_task);
}

/**
 * smack_cred_getsecid - get the secid corresponding to a creds structure
 * @cred: the object creds
 * @secid: where to put the result
 *
 * Sets the secid to contain a u32 version of the smack label.
 * The label is looked up and read inside an RCU read-side section.
 */
static void smack_cred_getsecid(const struct cred *cred, u32 *secid)
{
        rcu_read_lock();
        *secid = smk_of_task(smack_cred(cred))->smk_secid;
        rcu_read_unlock();
}

/**
 * smack_kernel_act_as - Set the subjective context in a set of credentials
 * @new: points to the set of credentials to be modified.
 * @secid: specifies the security ID to be set
 *
 * Set the security data for a kernel service: the task label of @new
 * becomes whatever smack_from_secid() returns for @secid.
 *
 * Returns 0
 */
static int smack_kernel_act_as(struct cred *new, u32 secid)
{
        struct task_smack *tsp = smack_cred(new);
        struct smack_known *label = smack_from_secid(secid);

        tsp->smk_task = label;
        return 0;
}

/**
 * smack_kernel_create_files_as - Set the file creation label in a set of creds
 * @new: points to the set of credentials to be modified
 * @inode: points to the inode to use as a reference
 *
 * Set the file creation context in a set of credentials to the same
 * as the objective context of the specified inode
 */
static int smack_kernel_create_files_as(struct cred *new,
                                        struct inode *inode)
{
        struct inode_smack *isp = smack_inode(inode);
        struct task_smack *tsp = smack_cred(new);

        tsp->smk_forked = isp->smk_inode;
        tsp->smk_task = tsp->smk_forked;
        return 0;
}

/**
 * smk_curacc_on_task - helper to log task related access
 * @p: the task object
 * @access: the access requested
 * @caller: name of the calling function for audit
 *
 * Checks the current task's @access to the object label of @p, with
 * the audit record attributed to @caller.
 *
 * Return 0 if access is permitted
 */
static int smk_curacc_on_task(struct task_struct *p, int access,
                                const char *caller)
{
        struct smack_known *object = smk_of_task_struct_obj(p);
        struct smk_audit_info ad;
        int rc;

        smk_ad_init(&ad, caller, LSM_AUDIT_DATA_TASK);
        smk_ad_setfield_u_tsk(&ad, p);

        rc = smk_curacc(object, access, &ad);
        return smk_bu_task(p, access, rc);
}

/**
 * smack_task_setpgid - Smack check on setting pgid
 * @p: the task object
 * @pgid: unused
 *
 * Return 0 if write access is permitted
 */
static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
{
        int rc = smk_curacc_on_task(p, MAY_WRITE, __func__);

        return rc;
}

/**
 * smack_task_getpgid - Smack access check for getpgid
 * @p: the object task
 *
 * Returns 0 if current can read the object task, error code otherwise
 */
static int smack_task_getpgid(struct task_struct *p)
{
        int rc = smk_curacc_on_task(p, MAY_READ, __func__);

        return rc;
}

/**
 * smack_task_getsid - Smack access check for getsid
 * @p: the object task
 *
 * Returns 0 if current can read the object task, error code otherwise
 */
static int smack_task_getsid(struct task_struct *p)
{
        int rc = smk_curacc_on_task(p, MAY_READ, __func__);

        return rc;
}

/**
 * smack_current_getsecid_subj - get the subjective secid of the current task
 * @secid: where to put the result
 *
 * Sets the secid to contain a u32 version of the task's subjective smack label.
 */
static void smack_current_getsecid_subj(u32 *secid)
{
        *secid = smk_of_current()->smk_secid;
}

/**
 * smack_task_getsecid_obj - get the objective secid of the task
 * @p: the task
 * @secid: where to put the result
 *
 * Sets the secid to contain a u32 version of the task's objective smack label.
 */
static void smack_task_getsecid_obj(struct task_struct *p, u32 *secid)
{
        *secid = smk_of_task_struct_obj(p)->smk_secid;
}

/**
 * smack_task_setnice - Smack check on setting nice
 * @p: the task object
 * @nice: unused
 *
 * Return 0 if write access is permitted
 */
static int smack_task_setnice(struct task_struct *p, int nice)
{
        int rc = smk_curacc_on_task(p, MAY_WRITE, __func__);

        return rc;
}

/**
 * smack_task_setioprio - Smack check on setting ioprio
 * @p: the task object
 * @ioprio: unused
 *
 * Changing the I/O priority of @p is checked as a write to @p.
 *
 * Returns 0 if write access is permitted, otherwise the value
 * reported by smk_curacc_on_task()
 */
static int smack_task_setioprio(struct task_struct *p, int ioprio)
{
        int rc;

        rc = smk_curacc_on_task(p, MAY_WRITE, __func__);
        return rc;
}

/**
 * smack_task_getioprio - Smack check on reading ioprio
 * @p: the task object
 *
 * Reading the I/O priority of @p is checked as a read of @p.
 *
 * Returns 0 if read access is permitted, otherwise the value
 * reported by smk_curacc_on_task()
 */
static int smack_task_getioprio(struct task_struct *p)
{
        int rc;

        rc = smk_curacc_on_task(p, MAY_READ, __func__);
        return rc;
}

/**
 * smack_task_setscheduler - Smack check on setting scheduler
 * @p: the task object
 *
 * Changing the scheduling parameters of @p is checked as a write to @p.
 *
 * Returns 0 if write access is permitted, otherwise the value
 * reported by smk_curacc_on_task()
 */
static int smack_task_setscheduler(struct task_struct *p)
{
        int rc;

        rc = smk_curacc_on_task(p, MAY_WRITE, __func__);
        return rc;
}

/**
 * smack_task_getscheduler - Smack check on reading scheduler
 * @p: the task object
 *
 * Reading the scheduling parameters of @p is checked as a read of @p.
 *
 * Returns 0 if read access is permitted, otherwise the value
 * reported by smk_curacc_on_task()
 */
static int smack_task_getscheduler(struct task_struct *p)
{
        int rc;

        rc = smk_curacc_on_task(p, MAY_READ, __func__);
        return rc;
}

/**
 * smack_task_movememory - Smack check on moving memory
 * @p: the task object
 *
 * Moving the memory of @p is checked as a write to @p.
 *
 * Returns 0 if write access is permitted, otherwise the value
 * reported by smk_curacc_on_task()
 */
static int smack_task_movememory(struct task_struct *p)
{
        int rc;

        rc = smk_curacc_on_task(p, MAY_WRITE, __func__);
        return rc;
}

/**
 * smack_task_kill - Smack check on signal delivery
 * @p: the task object
 * @info: unused
 * @sig: the signal; 0 (the null signal) is always allowed
 * @cred: identifies the cred to use in lieu of current's
 *
 * The sender needs MAY_DELIVER access to the label of @p. The sender is
 * current when @cred is NULL, otherwise the task label held in @cred.
 *
 * Returns 0 if delivery is permitted, an error code otherwise
 */
static int smack_task_kill(struct task_struct *p, struct kernel_siginfo *info,
                           int sig, const struct cred *cred)
{
        struct smack_known *receiver = smk_of_task_struct_obj(p);
        struct smack_known *sender;
        struct smk_audit_info ad;
        int rc;

        /* The null signal is only an existence test. */
        if (sig == 0)
                return 0;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
        smk_ad_setfield_u_tsk(&ad, p);

        if (cred != NULL) {
                /*
                 * If the cred isn't NULL we're dealing with some USB IO
                 * specific behavior. This is not clean. For one thing
                 * we can't take privilege into account.
                 */
                sender = smk_of_task(smack_cred(cred));
                rc = smk_access(sender, receiver, MAY_DELIVER, &ad);
                rc = smk_bu_note("USB signal", sender, receiver,
                                 MAY_DELIVER, rc);
                return rc;
        }

        /*
         * The usual case: current is the sender and must be
         * allowed to deliver to the receiver.
         */
        rc = smk_curacc(receiver, MAY_DELIVER, &ad);
        rc = smk_bu_task(p, MAY_DELIVER, rc);
        return rc;
}

/**
 * smack_task_to_inode - copy task smack into the inode blob
 * @p: task to copy from
 * @inode: inode to copy to
 *
 * Points the inode security blob at the objective label of @p and
 * flags the inode as instantiated (SMK_INODE_INSTANT).
 */
static void smack_task_to_inode(struct task_struct *p, struct inode *inode)
{
        struct inode_smack *blob = smack_inode(inode);

        blob->smk_inode = smk_of_task_struct_obj(p);
        blob->smk_flags |= SMK_INODE_INSTANT;
}

/*
 * Socket hooks.
 */

/**
 * smack_sk_alloc_security - Allocate a socket blob
 * @sk: the socket
 * @family: unused
 * @gfp_flags: memory allocation flags
 *
 * Allocates a zeroed socket_smack and sets both its inbound and outbound
 * labels to the label of current, or to the web label when current is a
 * kernel thread.
 *
 * Returns 0 on success, -ENOMEM if there's no memory
 */
static int smack_sk_alloc_security(struct sock *sk, int family, gfp_t gfp_flags)
{
        struct smack_known *label = smk_of_current();
        struct socket_smack *blob;

        blob = kzalloc(sizeof(struct socket_smack), gfp_flags);
        if (!blob)
                return -ENOMEM;

        /*
         * Sockets created by kernel threads receive web label.
         */
        if (unlikely(current->flags & PF_KTHREAD))
                label = &smack_known_web;

        blob->smk_in = label;
        blob->smk_out = label;
        blob->smk_packet = NULL;

        sk->sk_security = blob;

        return 0;
}

/**
 * smack_sk_free_security - Free a socket blob
 * @sk: the socket
 *
 * Frees the blob attached to @sk. When SMACK_IPV6_PORT_LABELING is
 * defined and @sk is an IPv6 socket, the first port list entry that
 * refers to @sk is flagged as reusable before the blob is freed.
 */
static void smack_sk_free_security(struct sock *sk)
{
#ifdef SMACK_IPV6_PORT_LABELING
        struct smk_port_label *entry;

        if (sk->sk_family == PF_INET6) {
                rcu_read_lock();
                list_for_each_entry_rcu(entry, &smk_ipv6_port_list, list) {
                        if (entry->smk_sock == sk) {
                                entry->smk_can_reuse = 1;
                                break;
                        }
                }
                rcu_read_unlock();
        }
#endif
        kfree(sk->sk_security);
}

/**
 * smack_sk_clone_security - Copy security context
 * @sk: the old socket
 * @newsk: the new socket
 *
 * Copies the whole socket_smack blob of @sk over the blob of @newsk.
 */
static void smack_sk_clone_security(const struct sock *sk, struct sock *newsk)
{
        struct socket_smack *src = sk->sk_security;
        struct socket_smack *dst = newsk->sk_security;

        *dst = *src;
}

/**
* smack_ipv4host_label - check host based restrictions
* @sip: the object end
*
* looks for host based access restrictions
*
* This version will only be appropriate for really small sets of single label
* hosts.  The caller is responsible for ensuring that the RCU read lock is
* taken before calling this function.
*
* Returns the label of the far end or NULL if it's not special.
*/
static struct smack_known *smack_ipv4host_label(struct sockaddr_in *sip)
{
        struct smk_net4addr *snp;
        struct in_addr *siap = &sip->sin_addr;

        if (siap->s_addr == 0)
                return NULL;

        list_for_each_entry_rcu(snp, &smk_net4addr_list, list)
                /*
                 * we break after finding the first match because
                 * the list is sorted from longest to shortest mask
                 * so we have found the most specific match
                 */
                if (snp->smk_host.s_addr ==
                    (siap->s_addr & snp->smk_mask.s_addr))
                        return snp->smk_label;

        return NULL;
}

/*
 * smk_ipv6_localhost - Check for local ipv6 host address
 * @sip: the address
 *
 * The address is local when everything but the last 16 bits is zero
 * and the last 16 bits, in host order, equal 1 (that is, ::1).
 *
 * Returns boolean true if this is the localhost address
 */
static bool smk_ipv6_localhost(struct sockaddr_in6 *sip)
{
        __be32 *words = (__be32 *)&sip->sin6_addr;
        __be16 *halves = (__be16 *)&sip->sin6_addr;

        /* The leading 96 bits must all be clear. */
        if (words[0] != 0 || words[1] != 0 || words[2] != 0)
                return false;
        /* So must the upper half of the last 32-bit word. */
        if (halves[6] != 0)
                return false;

        return ntohs(halves[7]) == 1;
}

/**
 * smack_ipv6host_label - check host based restrictions
 * @sip: the object end
 *
 * Looks for host based access restrictions by walking smk_net6addr_list
 * and comparing the masked address, 16 bits at a time, against each entry.
 *
 * This version will only be appropriate for really small sets of single label
 * hosts.  The caller is responsible for ensuring that the RCU read lock is
 * taken before calling this function.
 *
 * Returns the label of the far end or NULL if it's not special.
 */
static struct smack_known *smack_ipv6host_label(struct sockaddr_in6 *sip)
{
        struct smk_net6addr *entry;
        struct in6_addr *addr = &sip->sin6_addr;
        int w;

        /*
         * It's local. Don't look for a host label.
         */
        if (smk_ipv6_localhost(sip))
                return NULL;

        list_for_each_entry_rcu(entry, &smk_net6addr_list, list) {
                /*
                 * If the label is NULL the entry has
                 * been renounced. Ignore it.
                 */
                if (entry->smk_label == NULL)
                        continue;
                /*
                 * Stop at the first 16-bit word that does not match;
                 * reaching the end of the address means a full match.
                 */
                for (w = 0; w < 8; w++) {
                        if ((addr->s6_addr16[w] & entry->smk_mask.s6_addr16[w]) !=
                            entry->smk_host.s6_addr16[w])
                                break;
                }
                /*
                 * The list is sorted from longest to shortest mask,
                 * so the first match is the most specific one.
                 */
                if (w == 8)
                        return entry->smk_label;
        }

        return NULL;
}

/**
 * smack_netlbl_add - Set the secattr on a socket
 * @sk: the socket
 *
 * Attach the outbound smack value (smk_out) to the socket and record
 * the resulting netlabel state in the socket blob. The socket is locked
 * with bottom halves disabled while the attribute is set.
 *
 * Returns 0 on success or an error code
 */
static int smack_netlbl_add(struct sock *sk)
{
        struct socket_smack *ssp = sk->sk_security;
        struct smack_known *out = ssp->smk_out;
        int rc;

        local_bh_disable();
        bh_lock_sock_nested(sk);

        rc = netlbl_sock_setattr(sk, sk->sk_family, &out->smk_netlabel,
                                 netlbl_sk_lock_check(sk));
        if (rc == 0) {
                ssp->smk_state = SMK_NETLBL_LABELED;
        } else if (rc == -EDESTADDRREQ) {
                /*
                 * Not reported as a failure: the socket is marked
                 * SMK_NETLBL_REQSKB and the caller sees success.
                 */
                ssp->smk_state = SMK_NETLBL_REQSKB;
                rc = 0;
        }

        bh_unlock_sock(sk);
        local_bh_enable();

        return rc;
}

/**
 * smack_netlbl_delete - Remove the secattr from a socket
 * @sk: the socket
 *
 * Remove the outbound smack value from a socket. Nothing is done unless
 * the socket state is SMK_NETLBL_LABELED; afterwards the state is
 * SMK_NETLBL_UNLABELED.
 */
static void smack_netlbl_delete(struct sock *sk)
{
        struct socket_smack *ssp = sk->sk_security;

        /*
         * Take the label off the socket if one is set.
         */
        if (ssp->smk_state == SMK_NETLBL_LABELED) {
                local_bh_disable();
                bh_lock_sock_nested(sk);
                netlbl_sock_delattr(sk);
                bh_unlock_sock(sk);
                local_bh_enable();
                ssp->smk_state = SMK_NETLBL_UNLABELED;
        }
}

/**
 * smk_ipv4_check - Perform IPv4 host access checks
 * @sk: the socket
 * @sap: the destination address
 *
 * If the destination has a host label, check that the socket's outbound
 * label may write to it and, when it may, clear the netlabel from the
 * socket. Destinations without a host label are not checked.
 *
 * Returns 0 on success or an error code.
 *
 */
static int smk_ipv4_check(struct sock *sk, struct sockaddr_in *sap)
{
#ifdef CONFIG_AUDIT
        struct lsm_network_audit net;
#endif
        struct socket_smack *ssp = sk->sk_security;
        struct smack_known *subject;
        struct smack_known *host;
        struct smk_audit_info ad;
        int rc = 0;

        rcu_read_lock();
        host = smack_ipv4host_label(sap);
        if (host == NULL)
                goto unlock;

#ifdef CONFIG_AUDIT
        smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
        ad.a.u.net->family = sap->sin_family;
        ad.a.u.net->dport = sap->sin_port;
        ad.a.u.net->v4info.daddr = sap->sin_addr.s_addr;
#endif
        subject = ssp->smk_out;
        rc = smk_access(subject, host, MAY_WRITE, &ad);
        rc = smk_bu_note("IPv4 host check", subject, host, MAY_WRITE, rc);
        /*
         * Clear the socket netlabel if it's set.
         */
        if (rc == 0)
                smack_netlbl_delete(sk);
unlock:
        rcu_read_unlock();

        return rc;
}

/**
 * smk_ipv6_check - check Smack access
 * @subject: subject Smack label
 * @object: object Smack label
 * @address: address
 * @act: the action being taken
 *
 * Check that @subject may write to @object. With CONFIG_AUDIT the audit
 * record carries @address as the source when @act is SMK_RECEIVING and
 * as the destination otherwise.
 *
 * Returns 0 if access is permitted, an error code otherwise
 */
static int smk_ipv6_check(struct smack_known *subject,
                                struct smack_known *object,
                                struct sockaddr_in6 *address, int act)
{
#ifdef CONFIG_AUDIT
        struct lsm_network_audit net;
#endif
        struct smk_audit_info ad;
        int rc;

#ifdef CONFIG_AUDIT
        smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
        ad.a.u.net->family = PF_INET6;
        ad.a.u.net->dport = address->sin6_port;
        if (act != SMK_RECEIVING)
                ad.a.u.net->v6info.daddr = address->sin6_addr;
        else
                ad.a.u.net->v6info.saddr = address->sin6_addr;
#endif
        rc = smk_access(subject, object, MAY_WRITE, &ad);
        return smk_bu_note("IPv6 check", subject, object, MAY_WRITE, rc);
}

#ifdef SMACK_IPV6_PORT_LABELING
/**
 * smk_ipv6_port_label - Smack port access table management
 * @sock: socket
 * @address: address being bound, or NULL to refresh the labels of the
 *           entry that already refers to this socket
 *
 * Create or update the port list entry.
 *
 * With a NULL @address the entry for @sock's sk (if any) gets the
 * socket's current smk_in/smk_out. Otherwise the entry for the port and
 * socket type is taken over when it is marked reusable, left alone when
 * it is not, and a new entry is allocated and added to
 * smk_ipv6_port_list when none exists. Port 0 is ignored. Nothing is
 * reported to the caller; an allocation failure simply leaves the list
 * unchanged.
 */
static void smk_ipv6_port_label(struct socket *sock, struct sockaddr *address)
{
        struct sock *sk = sock->sk;
        struct sockaddr_in6 *addr6;
        struct socket_smack *ssp = sock->sk->sk_security;
        struct smk_port_label *spp;
        unsigned short port = 0;

        if (address == NULL) {
                /*
                 * This operation is changing the Smack information
                 * on the bound socket. Take the changes to the port
                 * as well.
                 */
                /*
                 * NOTE(review): the entry is modified while only the RCU
                 * read lock is held; confirm that concurrent updaters
                 * are excluded by the callers.
                 */
                rcu_read_lock();
                list_for_each_entry_rcu(spp, &smk_ipv6_port_list, list) {
                        if (sk != spp->smk_sock)
                                continue;
                        spp->smk_in = ssp->smk_in;
                        spp->smk_out = ssp->smk_out;
                        rcu_read_unlock();
                        return;
                }
                /*
                 * A NULL address is only used for updating existing
                 * bound entries. If there isn't one, it's OK.
                 */
                rcu_read_unlock();
                return;
        }

        addr6 = (struct sockaddr_in6 *)address;
        port = ntohs(addr6->sin6_port);
        /*
         * This is a special case that is safely ignored.
         */
        if (port == 0)
                return;

        /*
         * Look for an existing port list entry.
         * This is an indication that a port is getting reused.
         */
        rcu_read_lock();
        list_for_each_entry_rcu(spp, &smk_ipv6_port_list, list) {
                if (spp->smk_port != port || spp->smk_sock_type != sock->type)
                        continue;
                /* The entry still belongs to another socket: leave it. */
                if (spp->smk_can_reuse != 1) {
                        rcu_read_unlock();
                        return;
                }
                /* Reusable entry: hand it over to this socket. */
                spp->smk_port = port;
                spp->smk_sock = sk;
                spp->smk_in = ssp->smk_in;
                spp->smk_out = ssp->smk_out;
                spp->smk_can_reuse = 0;
                rcu_read_unlock();
                return;
        }
        rcu_read_unlock();
        /*
         * A new port entry is required.
         */
        spp = kzalloc(sizeof(*spp), GFP_KERNEL);
        if (spp == NULL)
                return;

        spp->smk_port = port;
        spp->smk_sock = sk;
        spp->smk_in = ssp->smk_in;
        spp->smk_out = ssp->smk_out;
        spp->smk_sock_type = sock->type;
        spp->smk_can_reuse = 0;

        /* List insertion is serialized by smack_ipv6_lock. */
        mutex_lock(&smack_ipv6_lock);
        list_add_rcu(&spp->list, &smk_ipv6_port_list);
        mutex_unlock(&smack_ipv6_lock);
        return;
}

/**
 * smk_ipv6_port_check - check Smack port access
 * @sk: socket
 * @address: address
 * @act: the action being taken
 *
 * Pick the subject and object labels for an IPv6 operation and check
 * them with smk_ipv6_check(). When receiving, the subject is the host
 * label of @address and the object is the socket's smk_in; otherwise the
 * subject is the socket's smk_out and the object is the host label. A
 * missing label is replaced by smack_net_ambient. For a local address
 * that is not being received from, the object is taken from the port
 * list entry matching the port and socket type, if there is one.
 *
 * Returns 0 if access is permitted (always for a local receive), an
 * error code from smk_ipv6_check() otherwise
 */
static int smk_ipv6_port_check(struct sock *sk, struct sockaddr_in6 *address,
                                int act)
{
        struct smk_port_label *spp;
        struct socket_smack *ssp = sk->sk_security;
        struct smack_known *skp = NULL;
        unsigned short port;
        struct smack_known *object;

        /*
         * NOTE(review): smack_ipv6host_label() documents that its caller
         * must hold the RCU read lock, but no rcu_read_lock() is taken
         * around these calls here; confirm the callers provide it.
         */
        if (act == SMK_RECEIVING) {
                skp = smack_ipv6host_label(address);
                object = ssp->smk_in;
        } else {
                skp = ssp->smk_out;
                object = smack_ipv6host_label(address);
        }

        /*
         * The other end is a single label host.
         */
        if (skp != NULL && object != NULL)
                return smk_ipv6_check(skp, object, address, act);
        /* Whichever side has no label falls back to the ambient label. */
        if (skp == NULL)
                skp = smack_net_ambient;
        if (object == NULL)
                object = smack_net_ambient;

        /*
         * It's remote, so port lookup does no good.
         */
        if (!smk_ipv6_localhost(address))
                return smk_ipv6_check(skp, object, address, act);

        /*
         * It's local so the send check has to have passed.
         */
        if (act == SMK_RECEIVING)
                return 0;

        port = ntohs(address->sin6_port);
        rcu_read_lock();
        list_for_each_entry_rcu(spp, &smk_ipv6_port_list, list) {
                if (spp->smk_port != port || spp->smk_sock_type != sk->sk_type)
                        continue;
                /* The local port's inbound label becomes the object. */
                object = spp->smk_in;
                /* On connect, also record the port's outbound label. */
                if (act == SMK_CONNECTING)
                        ssp->smk_packet = spp->smk_out;
                break;
        }
        rcu_read_unlock();

        return smk_ipv6_check(skp, object, address, act);
}
#endif

/**
 * smack_inode_setsecurity - set smack xattrs
 * @inode: the object
 * @name: attribute name
 * @value: attribute value
 * @size: size of the attribute
 * @flags: unused
 *
 * Sets the named attribute in the appropriate blob: the transmute flag
 * or the label in the inode blob, or, for socket inodes, the inbound or
 * outbound label in the socket blob.
 *
 * Returns 0 on success, or an error code: -EINVAL for a missing, empty
 * or oversized value or a bad transmute request, the error from
 * smk_import_entry() if the label cannot be imported, and -EOPNOTSUPP
 * for any other attribute that is not a socket label on a socket inode.
 */
static int smack_inode_setsecurity(struct inode *inode, const char *name,
                                   const void *value, size_t size, int flags)
{
        struct smack_known *skp;
        struct inode_smack *nsp = smack_inode(inode);
        struct socket_smack *ssp;
        struct socket *sock;
        int rc = 0;

        if (value == NULL || size > SMK_LONGLABEL || size == 0)
                return -EINVAL;

        /*
         * The transmute attribute is only accepted on directories and
         * only with the value TRANS_TRUE; it sets a flag, not a label.
         */
        if (strcmp(name, XATTR_SMACK_TRANSMUTE) == 0) {
                if (!S_ISDIR(inode->i_mode) || size != TRANS_TRUE_SIZE ||
                    strncmp(value, TRANS_TRUE, TRANS_TRUE_SIZE) != 0)
                        return -EINVAL;

                nsp->smk_flags |= SMK_INODE_TRANSMUTE;
                return 0;
        }

        /* Every other attribute carries a label; import it first. */
        skp = smk_import_entry(value, size);
        if (IS_ERR(skp))
                return PTR_ERR(skp);

        if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) {
                nsp->smk_inode = skp;
                nsp->smk_flags |= SMK_INODE_INSTANT;
                return 0;
        }
        /*
         * The rest of the Smack xattrs are only on sockets.
         */
        if (inode->i_sb->s_magic != SOCKFS_MAGIC)
                return -EOPNOTSUPP;

        sock = SOCKET_I(inode);
        if (sock == NULL || sock->sk == NULL)
                return -EOPNOTSUPP;

        ssp = sock->sk->sk_security;

        if (strcmp(name, XATTR_SMACK_IPIN) == 0)
                ssp->smk_in = skp;
        else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) {
                ssp->smk_out = skp;
                if (sock->sk->sk_family == PF_INET) {
                        /*
                         * A netlabel failure is only logged; this
                         * function still returns 0 below.
                         */
                        rc = smack_netlbl_add(sock->sk);
                        if (rc != 0)
                                printk(KERN_WARNING
                                        "Smack: \"%s\" netlbl error %d.\n",
                                        __func__, -rc);
                }
        } else
                return -EOPNOTSUPP;

#ifdef SMACK_IPV6_PORT_LABELING
        /* Propagate the new socket labels to the port list entry. */
        if (sock->sk->sk_family == PF_INET6)
                smk_ipv6_port_label(sock, NULL);
#endif

        return 0;
}

/**
 * smack_socket_post_create - finish socket setup
 * @sock: the socket
 * @family: protocol family
 * @type: unused
 * @protocol: unused
 * @kern: unused
 *
 * Sets the netlabel information on the socket
 *
 * Returns 0 on success, and error code otherwise
 */
static int smack_socket_post_create(struct socket *sock, int family,
                                    int type, int protocol, int kern)
{
        struct socket_smack *ssp;

        if (sock->sk == NULL)
                return 0;

        /*
         * Sockets created by kernel threads receive web label.
         */
        if (unlikely(current->flags & PF_KTHREAD)) {
                ssp = sock->sk->sk_security;
                ssp->smk_in = &smack_known_web;
                ssp->smk_out = &smack_known_web;
        }

        if (family != PF_INET)
                return 0;
        /*
         * Set the outbound netlbl.
         */
        return smack_netlbl_add(sock->sk);
}

/**
 * smack_socket_socketpair - create socket pair
 * @socka: one socket
 * @sockb: another socket
 *
 * Cross reference the peer labels for SO_PEERSEC: each socket's
 * smk_packet is set to the other socket's smk_out.
 *
 * Returns 0
 */
static int smack_socket_socketpair(struct socket *socka,
                                   struct socket *sockb)
{
        struct socket_smack *first = socka->sk->sk_security;
        struct socket_smack *second = sockb->sk->sk_security;

        first->smk_packet = second->smk_out;
        second->smk_packet = first->smk_out;

        return 0;
}

#ifdef SMACK_IPV6_PORT_LABELING
/**
 * smack_socket_bind - record port binding information.
 * @sock: the socket
 * @address: the port address
 * @addrlen: size of the address
 *
 * Records the label bound to a port.
 *
 * Returns 0 on success, and error code otherwise
 */
static int smack_socket_bind(struct socket *sock, struct sockaddr *address,
                                int addrlen)
{
        if (sock->sk != NULL && sock->sk->sk_family == PF_INET6) {
                if (addrlen < SIN6_LEN_RFC2133 ||
                    address->sa_family != AF_INET6)
                        return -EINVAL;
                smk_ipv6_port_label(sock, address);
        }
        return 0;
}
#endif /* SMACK_IPV6_PORT_LABELING */

/**
 * smack_socket_connect - connect access check
 * @sock: the socket
 * @sap: the other end
 * @addrlen: size of sap
 *
 * Verifies that a connection may be possible. Only PF_INET sockets and,
 * when CONFIG_IPV6 is enabled, PF_INET6 sockets are checked; addresses
 * that are too short for their family, or of another family, are let
 * through without a check.
 *
 * Returns 0 on success, and error code otherwise
 */
static int smack_socket_connect(struct socket *sock, struct sockaddr *sap,
                                int addrlen)
{
        int rc = 0;

        if (sock->sk == NULL)
                return 0;
        if (sock->sk->sk_family != PF_INET &&
            (!IS_ENABLED(CONFIG_IPV6) || sock->sk->sk_family != PF_INET6))
                return 0;
        /* sa_family must be readable before it is looked at below. */
        if (addrlen < offsetofend(struct sockaddr, sa_family))
                return 0;
        if (IS_ENABLED(CONFIG_IPV6) && sap->sa_family == AF_INET6) {
                struct sockaddr_in6 *sip = (struct sockaddr_in6 *)sap;
                struct smack_known *rsp = NULL;

                if (addrlen < SIN6_LEN_RFC2133)
                        return 0;
                /* Host labels are only consulted with secmark labeling. */
                if (__is_defined(SMACK_IPV6_SECMARK_LABELING))
                        rsp = smack_ipv6host_label(sip);
                if (rsp != NULL) {
                        struct socket_smack *ssp = sock->sk->sk_security;

                        rc = smk_ipv6_check(ssp->smk_out, rsp, sip,
                                            SMK_CONNECTING);
                }
                /*
                 * NOTE(review): with port labeling the result assigned
                 * here replaces whatever rc held before; presumably the
                 * two labeling modes are never both defined - confirm.
                 */
#ifdef SMACK_IPV6_PORT_LABELING
                rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING);
#endif

                return rc;
        }
        if (sap->sa_family != AF_INET || addrlen < sizeof(struct sockaddr_in))
                return 0;
        rc = smk_ipv4_check(sock->sk, (struct sockaddr_in *)sap);
        return rc;
}

/**
 * smack_flags_to_may - convert S_ to MAY_ values
 * @flags: the S_ value
 *
 * A read, write or execute bit set for any of user, group or other
 * selects the corresponding MAY_ bit.
 *
 * Returns the equivalent MAY_ value
 */
static int smack_flags_to_may(int flags)
{
        int may = 0;

        may |= (flags & S_IRUGO) ? MAY_READ : 0;
        may |= (flags & S_IWUGO) ? MAY_WRITE : 0;
        may |= (flags & S_IXUGO) ? MAY_EXEC : 0;

        return may;
}

/**
 * smack_msg_msg_alloc_security - Set the security blob for msg_msg
 * @msg: the object
 *
 * Stores the label of current in the message's blob.
 *
 * Returns 0
 */
static int smack_msg_msg_alloc_security(struct msg_msg *msg)
{
        struct smack_known **slot = smack_msg_msg(msg);
        struct smack_known *label = smk_of_current();

        *slot = label;
        return 0;
}

/**
 * smack_of_ipc - the smack pointer for the ipc
 * @isp: the object
 *
 * Returns a pointer to the smack value
 */
static struct smack_known *smack_of_ipc(struct kern_ipc_perm *isp)
{
        struct smack_known **blob = smack_ipc(isp);

        return *blob;
}

/**
 * smack_ipc_alloc_security - Set the security blob for ipc
 * @isp: the object
 *
 * Stores the label of current in the ipc object's blob.
 *
 * Returns 0
 */
static int smack_ipc_alloc_security(struct kern_ipc_perm *isp)
{
        struct smack_known **slot = smack_ipc(isp);
        struct smack_known *label = smk_of_current();

        *slot = label;
        return 0;
}

/**
 * smk_curacc_shm : check if current has access on shm
 * @isp : the object
 * @access : access requested
 *
 * Checks current against the label stored in the ipc blob; with
 * CONFIG_AUDIT the ipc id is recorded in the audit data.
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smk_curacc_shm(struct kern_ipc_perm *isp, int access)
{
        struct smack_known *label = smack_of_ipc(isp);
        struct smk_audit_info ad;
        int rc;

#ifdef CONFIG_AUDIT
        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
        ad.a.u.ipc_id = isp->id;
#endif
        rc = smk_curacc(label, access, &ad);
        return smk_bu_current("shm", label, access, rc);
}

/**
 * smack_shm_associate - Smack access check for shm
 * @isp: the object
 * @shmflg: access requested
 *
 * The permission bits in @shmflg are converted to MAY_ values first.
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_shm_associate(struct kern_ipc_perm *isp, int shmflg)
{
        return smk_curacc_shm(isp, smack_flags_to_may(shmflg));
}

/**
 * smack_shm_shmctl - Smack access check for shm
 * @isp: the object
 * @cmd: what it wants to do
 *
 * Status queries need read access, commands that change or remove the
 * segment need read and write access, and system level information
 * needs no access at all.
 *
 * Returns 0 if current has the requested access, -EINVAL for an
 * unrecognized @cmd, error code otherwise
 */
static int smack_shm_shmctl(struct kern_ipc_perm *isp, int cmd)
{
        switch (cmd) {
        case IPC_INFO:
        case SHM_INFO:
                /*
                 * System level information.
                 */
                return 0;
        case IPC_STAT:
        case SHM_STAT:
        case SHM_STAT_ANY:
                return smk_curacc_shm(isp, MAY_READ);
        case IPC_SET:
        case SHM_LOCK:
        case SHM_UNLOCK:
        case IPC_RMID:
                return smk_curacc_shm(isp, MAY_READWRITE);
        default:
                return -EINVAL;
        }
}

/**
 * smack_shm_shmat - Smack access for shmat
 * @isp: the object
 * @shmaddr: unused
 * @shmflg: access requested
 *
 * The permission bits in @shmflg are converted to MAY_ values first.
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_shm_shmat(struct kern_ipc_perm *isp, char __user *shmaddr,
                           int shmflg)
{
        return smk_curacc_shm(isp, smack_flags_to_may(shmflg));
}

/**
 * smk_curacc_sem : check if current has access on sem
 * @isp : the object
 * @access : access requested
 *
 * Checks current against the label stored in the ipc blob; with
 * CONFIG_AUDIT the ipc id is recorded in the audit data.
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smk_curacc_sem(struct kern_ipc_perm *isp, int access)
{
        struct smack_known *label = smack_of_ipc(isp);
        struct smk_audit_info ad;
        int rc;

#ifdef CONFIG_AUDIT
        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
        ad.a.u.ipc_id = isp->id;
#endif
        rc = smk_curacc(label, access, &ad);
        return smk_bu_current("sem", label, access, rc);
}

/**
 * smack_sem_associate - Smack access check for sem
 * @isp: the object
 * @semflg: access requested
 *
 * The permission bits in @semflg are converted to MAY_ values first.
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_sem_associate(struct kern_ipc_perm *isp, int semflg)
{
        return smk_curacc_sem(isp, smack_flags_to_may(semflg));
}

/**
 * smack_sem_semctl - Smack access check for sem
 * @isp: the object
 * @cmd: what it wants to do
 *
 * Queries need read access, commands that set values or change or
 * remove the set need read and write access, and system level
 * information needs no access at all.
 *
 * Returns 0 if current has the requested access, -EINVAL for an
 * unrecognized @cmd, error code otherwise
 */
static int smack_sem_semctl(struct kern_ipc_perm *isp, int cmd)
{
        switch (cmd) {
        case IPC_INFO:
        case SEM_INFO:
                /*
                 * System level information
                 */
                return 0;
        case GETPID:
        case GETNCNT:
        case GETZCNT:
        case GETVAL:
        case GETALL:
        case IPC_STAT:
        case SEM_STAT:
        case SEM_STAT_ANY:
                return smk_curacc_sem(isp, MAY_READ);
        case SETVAL:
        case SETALL:
        case IPC_RMID:
        case IPC_SET:
                return smk_curacc_sem(isp, MAY_READWRITE);
        default:
                return -EINVAL;
        }
}

/**
 * smack_sem_semop - Smack checks of semaphore operations
 * @isp: the object
 * @sops: unused
 * @nsops: unused
 * @alter: unused
 *
 * Treated as read and write in all cases.
 *
 * Returns 0 if access is allowed, error code otherwise
 */
static int smack_sem_semop(struct kern_ipc_perm *isp, struct sembuf *sops,
                           unsigned nsops, int alter)
{
        int rc;

        rc = smk_curacc_sem(isp, MAY_READWRITE);
        return rc;
}

/**
 * smk_curacc_msq : helper to check if current has access on msq
 * @isp : the msq
 * @access : access requested
 *
 * Checks current against the label stored in the ipc blob; with
 * CONFIG_AUDIT the ipc id is recorded in the audit data.
 *
 * return 0 if current has access, error otherwise
 */
static int smk_curacc_msq(struct kern_ipc_perm *isp, int access)
{
        struct smack_known *label = smack_of_ipc(isp);
        struct smk_audit_info ad;
        int rc;

#ifdef CONFIG_AUDIT
        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
        ad.a.u.ipc_id = isp->id;
#endif
        rc = smk_curacc(label, access, &ad);
        return smk_bu_current("msq", label, access, rc);
}

/**
 * smack_msg_queue_associate - Smack access check for msg_queue
 * @isp: the object
 * @msqflg: access requested
 *
 * The permission bits in @msqflg are converted to MAY_ values first.
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_msg_queue_associate(struct kern_ipc_perm *isp, int msqflg)
{
        return smk_curacc_msq(isp, smack_flags_to_may(msqflg));
}

/**
 * smack_msg_queue_msgctl - Smack access check for msg_queue
 * @isp: the object
 * @cmd: what it wants to do
 *
 * Status queries need read access, commands that change or remove the
 * queue need read and write access, and system level information needs
 * no access at all.
 *
 * Returns 0 if current has the requested access, -EINVAL for an
 * unrecognized @cmd, error code otherwise
 */
static int smack_msg_queue_msgctl(struct kern_ipc_perm *isp, int cmd)
{
        switch (cmd) {
        case IPC_INFO:
        case MSG_INFO:
                /*
                 * System level information
                 */
                return 0;
        case IPC_STAT:
        case MSG_STAT:
        case MSG_STAT_ANY:
                return smk_curacc_msq(isp, MAY_READ);
        case IPC_SET:
        case IPC_RMID:
                return smk_curacc_msq(isp, MAY_READWRITE);
        default:
                return -EINVAL;
        }
}

/**
 * smack_msg_queue_msgsnd - Smack access check for msg_queue
 * @isp: the object
 * @msg: unused
 * @msqflg: access requested
 *
 * The permission bits in @msqflg are converted to MAY_ values first.
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_msg_queue_msgsnd(struct kern_ipc_perm *isp, struct msg_msg *msg,
                                  int msqflg)
{
        return smk_curacc_msq(isp, smack_flags_to_may(msqflg));
}

/**
 * smack_msg_queue_msgrcv - Smack access check for msg_queue
 * @isp: the object
 * @msg: unused
 * @target: unused
 * @type: unused
 * @mode: unused
 *
 * Receiving is treated as read and write in all cases.
 *
 * Returns 0 if current has read and write access, error code otherwise
 */
static int smack_msg_queue_msgrcv(struct kern_ipc_perm *isp,
                                  struct msg_msg *msg,
                                  struct task_struct *target, long type,
                                  int mode)
{
        int rc;

        rc = smk_curacc_msq(isp, MAY_READWRITE);
        return rc;
}

/**
 * smack_ipc_permission - Smack access for ipc_permission()
 * @ipp: the object permissions
 * @flag: access requested
 *
 * The permission bits in @flag are converted to MAY_ values and current
 * is checked against the label stored in the ipc blob.
 *
 * Returns 0 if current has the requested access, error code otherwise
 */
static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
{
        struct smack_known *label = smack_of_ipc(ipp);
        int may = smack_flags_to_may(flag);
        struct smk_audit_info ad;
        int rc;

#ifdef CONFIG_AUDIT
        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
        ad.a.u.ipc_id = ipp->id;
#endif
        rc = smk_curacc(label, may, &ad);
        return smk_bu_current("svipc", label, may, rc);
}

/**
 * smack_ipc_getsecid - Extract smack security id
 * @ipp: the object permissions
 * @secid: where result will be saved
 *
 * Stores the secid of the label held in the IPC object's security blob.
 */
static void smack_ipc_getsecid(struct kern_ipc_perm *ipp, u32 *secid)
{
        struct smack_known **blob = smack_ipc(ipp);

        *secid = (*blob)->smk_secid;
}

/**
 * smack_d_instantiate - Make sure the blob is correct on an inode
 * @opt_dentry: dentry where inode will be attached; it is dereferenced
 *              without a NULL check
 * @inode: the object; a NULL inode is ignored
 *
 * Set the inode's security blob if it hasn't been done already.
 *
 * The label is chosen in this order:
 *  - a root dentry (one that is its own parent) gets a label picked
 *    purely from the filesystem magic number,
 *  - a handful of special filesystems get a fixed label,
 *  - everything else gets the label stored in its Smack xattr, falling
 *    back to the superblock default when there is none.
 *
 * For the xattr case the transmute flag and the exec and mmap labels
 * are loaded from their own xattrs as well. Once a label has been
 * chosen the inode is marked SMK_INODE_INSTANT so later calls return
 * immediately.
 */
static void smack_d_instantiate(struct dentry *opt_dentry, struct inode *inode)
{
        struct super_block *sbp;
        struct superblock_smack *sbsp;
        struct inode_smack *isp;
        struct smack_known *skp;
        struct smack_known *ckp = smk_of_current();
        struct smack_known *final;
        char trattr[TRANS_TRUE_SIZE];
        int transflag = 0;
        int rc;
        struct dentry *dp;

        if (inode == NULL)
                return;

        isp = smack_inode(inode);

        /*
         * If the inode is already instantiated
         * take the quick way out
         */
        if (isp->smk_flags & SMK_INODE_INSTANT)
                return;

        sbp = inode->i_sb;
        sbsp = smack_superblock(sbp);
        /*
         * We're going to use the superblock default label
         * if there's no label on the file.
         */
        final = sbsp->smk_default;

        /*
         * If this is the root inode the superblock
         * may be in the process of initialization.
         * If that is the case use the root value out
         * of the superblock.
         */
        if (opt_dentry->d_parent == opt_dentry) {
                switch (sbp->s_magic) {
                case CGROUP_SUPER_MAGIC:
                case CGROUP2_SUPER_MAGIC:
                        /*
                         * The cgroup filesystem is never mounted,
                         * so there's no opportunity to set the mount
                         * options.
                         */
                        sbsp->smk_root = &smack_known_star;
                        sbsp->smk_default = &smack_known_star;
                        isp->smk_inode = sbsp->smk_root;
                        break;
                case TMPFS_MAGIC:
                        /*
                         * What about shmem/tmpfs anonymous files with dentry
                         * obtained from d_alloc_pseudo()?
                         */
                        isp->smk_inode = smk_of_current();
                        break;
                case PIPEFS_MAGIC:
                        /* Pipes take the label of the task creating them. */
                        isp->smk_inode = smk_of_current();
                        break;
                case SOCKFS_MAGIC:
                        /*
                         * Socket access is controlled by the socket
                         * structures associated with the task involved.
                         */
                        isp->smk_inode = &smack_known_star;
                        break;
                default:
                        isp->smk_inode = sbsp->smk_root;
                        break;
                }
                /* Root inodes never consult xattrs; we are done. */
                isp->smk_flags |= SMK_INODE_INSTANT;
                return;
        }

        /*
         * This is pretty hackish.
         * Casey says that we shouldn't have to do
         * file system specific code, but it does help
         * with keeping it simple.
         */
        switch (sbp->s_magic) {
        case SMACK_MAGIC:
        case CGROUP_SUPER_MAGIC:
        case CGROUP2_SUPER_MAGIC:
                /*
                 * Casey says that it's a little embarrassing
                 * that the smack file system doesn't do
                 * extended attributes.
                 *
                 * Cgroupfs is special
                 */
                final = &smack_known_star;
                break;
        case DEVPTS_SUPER_MAGIC:
                /*
                 * devpts seems content with the label of the task.
                 * Programs that change smack have to treat the
                 * pty with respect.
                 */
                final = ckp;
                break;
        case PROC_SUPER_MAGIC:
                /*
                 * Casey says procfs appears not to care.
                 * The superblock default suffices.
                 */
                break;
        case TMPFS_MAGIC:
                /*
                 * Device labels should come from the filesystem,
                 * but watch out, because they're volatile,
                 * getting recreated on every reboot.
                 */
                final = &smack_known_star;
                /*
                 * If a smack value has been set we want to use it,
                 * but since tmpfs isn't giving us the opportunity
                 * to set mount options simulate setting the
                 * superblock default.
                 */
                fallthrough;
        default:
                /*
                 * This isn't an understood special case.
                 * Get the value from the xattr.
                 */

                /*
                 * UNIX domain sockets use lower level socket data.
                 */
                if (S_ISSOCK(inode->i_mode)) {
                        final = &smack_known_star;
                        break;
                }
                /*
                 * No xattr support means, alas, no SMACK label.
                 * Use the aforeapplied default.
                 * It would be curious if the label of the task
                 * does not match that assigned.
                 */
                if (!(inode->i_opflags & IOP_XATTR))
                        break;
                /*
                 * Get the dentry for xattr.
                 * The reference taken here is dropped by the dput()
                 * at the end of this case.
                 */
                dp = dget(opt_dentry);
                skp = smk_fetch(XATTR_NAME_SMACK, inode, dp);
                /* Keep the default chosen above if there is no usable label. */
                if (!IS_ERR_OR_NULL(skp))
                        final = skp;

                /*
                 * Transmuting directory
                 */
                if (S_ISDIR(inode->i_mode)) {
                        /*
                         * If there is a transmute attribute on the
                         * directory, and its value is TRANS_TRUE,
                         * mark the inode. Any other value, or a failure
                         * to read the attribute, leaves the inode
                         * unmarked.
                         */
                        rc = __vfs_getxattr(dp, inode,
                                            XATTR_NAME_SMACKTRANSMUTE, trattr,
                                            TRANS_TRUE_SIZE);
                        if (rc >= 0 && strncmp(trattr, TRANS_TRUE,
                                               TRANS_TRUE_SIZE) != 0)
                                rc = -EINVAL;
                        if (rc >= 0)
                                transflag = SMK_INODE_TRANSMUTE;
                }
                /*
                 * Don't let the exec or mmap label be "*" or "@".
                 * A fetch error is treated the same as no label.
                 */
                skp = smk_fetch(XATTR_NAME_SMACKEXEC, inode, dp);
                if (IS_ERR(skp) || skp == &smack_known_star ||
                    skp == &smack_known_web)
                        skp = NULL;
                isp->smk_task = skp;

                skp = smk_fetch(XATTR_NAME_SMACKMMAP, inode, dp);
                if (IS_ERR(skp) || skp == &smack_known_star ||
                    skp == &smack_known_web)
                        skp = NULL;
                isp->smk_mmap = skp;

                dput(dp);
                break;
        }

        /*
         * With neither an xattr label nor a superblock default,
         * fall back to the label of the current task.
         */
        if (final == NULL)
                isp->smk_inode = ckp;
        else
                isp->smk_inode = final;

        isp->smk_flags |= (SMK_INODE_INSTANT | transflag);

        return;
}

/**
 * smack_getselfattr - Smack current process attribute
 * @attr: which attribute to fetch
 * @ctx: buffer to receive the result
 * @size: available size in, actual size out
 * @flags: unused
 *
 * Copy the Smack label of the current task, including its terminating
 * NUL, into the user space @ctx. Only LSM_ATTR_CURRENT is supported.
 *
 * Returns the number of attributes on success, an error code otherwise.
 * There will only ever be one attribute.
 */
static int smack_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
                             u32 *size, u32 flags)
{
        struct smack_known *label;
        int err;

        if (attr != LSM_ATTR_CURRENT)
                return -EOPNOTSUPP;

        label = smk_of_current();
        err = lsm_fill_user_ctx(ctx, size, label->smk_known,
                                strlen(label->smk_known) + 1,
                                LSM_ID_SMACK, 0);
        if (err)
                return err;

        /* Exactly one attribute was filled in. */
        return 1;
}

/**
 * smack_getprocattr - Smack process attribute access
 * @p: the object task
 * @name: the name of the attribute in /proc/.../attr
 * @value: where to put the result
 *
 * Places a freshly allocated copy of the task's Smack label into
 * @value. Only the "current" attribute is supported.
 *
 * Returns the length of the smack label, -EINVAL for any other
 * attribute name, or -ENOMEM if the copy cannot be allocated
 */
static int smack_getprocattr(struct task_struct *p, const char *name, char **value)
{
        struct smack_known *label = smk_of_task_struct_obj(p);
        char *copy;

        if (strcmp(name, "current"))
                return -EINVAL;

        copy = kstrdup(label->smk_known, GFP_KERNEL);
        if (!copy)
                return -ENOMEM;

        *value = copy;
        return (int)strlen(copy);
}

/**
 * do_setattr - Smack process attribute setting
 * @attr: the ID of the attribute
 * @value: the value to set
 * @size: the size of the value
 *
 * Sets the Smack label of the current task. Only setting self
 * is permitted and only with privilege: either CAP_MAC_ADMIN, or a
 * non-empty relabel list that contains the requested label.
 *
 * A task with CAP_MAC_ADMIN may name a label that does not exist yet;
 * it is created on import. A task without it can only ever succeed
 * with a label from its relabel list, so for such a task the label is
 * only looked up, never created. That way a request that ends up
 * refused leaves no new entry behind in the global label list.
 *
 * On success the new credentials carry an empty relabel list, so an
 * unprivileged task can change its label only once.
 *
 * Returns @size on success, -EPERM if the caller may not switch to the
 * label, -EINVAL for a bad value or the "*" and "@" labels,
 * -EOPNOTSUPP for an attribute other than LSM_ATTR_CURRENT, -ENOMEM if
 * credentials cannot be allocated, or the error from parsing or
 * importing the label
 */
static int do_setattr(u64 attr, void *value, size_t size)
{
        struct task_smack *tsp = smack_cred(current_cred());
        struct cred *new;
        struct smack_known *skp;
        struct smack_known_list_elem *sklep;
        bool privileged = smack_privileged(CAP_MAC_ADMIN);
        char *label;
        int rc;

        if (!privileged && list_empty(&tsp->smk_relabel))
                return -EPERM;

        if (value == NULL || size == 0 || size >= SMK_LONGLABEL)
                return -EINVAL;

        if (attr != LSM_ATTR_CURRENT)
                return -EOPNOTSUPP;

        if (privileged) {
                skp = smk_import_entry(value, size);
                if (IS_ERR(skp))
                        return PTR_ERR(skp);
        } else {
                /*
                 * Every label on the relabel list already exists, so a
                 * label that cannot be found cannot be on the list.
                 * Look it up rather than import it: importing would add
                 * a caller-chosen label to the global list even though
                 * the request is about to be refused.
                 */
                label = smk_parse_smack(value, size);
                if (IS_ERR(label))
                        return PTR_ERR(label);
                skp = smk_find_entry(label);
                kfree(label);
                if (skp == NULL)
                        return -EPERM;
        }

        /*
         * No process is ever allowed the web ("@") label
         * and the star ("*") label.
         */
        if (skp == &smack_known_web || skp == &smack_known_star)
                return -EINVAL;

        if (!privileged) {
                rc = -EPERM;
                list_for_each_entry(sklep, &tsp->smk_relabel, list)
                        if (sklep->smk_label == skp) {
                                rc = 0;
                                break;
                        }
                if (rc)
                        return rc;
        }

        new = prepare_creds();
        if (new == NULL)
                return -ENOMEM;

        tsp = smack_cred(new);
        tsp->smk_task = skp;
        /*
         * process can change its label only once
         */
        smk_destroy_label_list(&tsp->smk_relabel);

        commit_creds(new);
        return size;
}

/**
 * smack_setselfattr - Set a Smack process attribute
 * @attr: which attribute to set
 * @ctx: buffer containing the data
 * @size: size of @ctx, unused
 * @flags: unused
 *
 * Set the attribute of the current task from the value carried in
 * @ctx. The length of the value is taken from @ctx itself.
 *
 * Returns 0 on success, an error code otherwise.
 */
static int smack_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
                             u32 size, u32 flags)
{
        int ret = do_setattr(attr, ctx->ctx, ctx->ctx_len);

        /* A positive result is a length, which this hook reports as 0. */
        return ret > 0 ? 0 : ret;
}

/**
 * smack_setprocattr - Smack process attribute setting
 * @name: the name of the attribute in /proc/.../attr
 * @value: the value to set
 * @size: the size of the value
 *
 * Maps @name to an attribute ID and sets that attribute on the
 * current task. Only setting self is permitted and only with privilege
 *
 * Returns the length of the smack label or an error code; a name that
 * maps to no attribute yields -EINVAL
 */
static int smack_setprocattr(const char *name, void *value, size_t size)
{
        int attr = lsm_name_to_attr(name);

        if (attr == LSM_ATTR_UNDEF)
                return -EINVAL;

        return do_setattr(attr, value, size);
}

/**
 * smack_unix_stream_connect - Smack access on UDS
 * @sock: the connecting sock
 * @other: the listening sock being connected to
 * @newsk: the new sock that the listening side will get from accept()
 *
 * Unless the caller has CAP_MAC_OVERRIDE, both directions are checked:
 * the connecting side must be able to write to the listening side and
 * the listening side must be able to write back.
 *
 * On success the peer labels are recorded for SO_PEERSEC and @newsk is
 * given the labels of the listening sock.
 *
 * Return 0 if a subject with the smack of sock could access
 * an object with the smack of other, otherwise an error code
 */
static int smack_unix_stream_connect(struct sock *sock,
                                     struct sock *other, struct sock *newsk)
{
        struct smack_known *skp;
        struct smack_known *okp;
        struct socket_smack *ssp = sock->sk_security;
        struct socket_smack *osp = other->sk_security;
        struct socket_smack *nsp = newsk->sk_security;
        struct smk_audit_info ad;
        int rc = 0;
#ifdef CONFIG_AUDIT
        struct lsm_network_audit net;
#endif

        if (!smack_privileged(CAP_MAC_OVERRIDE)) {
                skp = ssp->smk_out;
                okp = osp->smk_in;
#ifdef CONFIG_AUDIT
                smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
                smk_ad_setfield_u_net_sk(&ad, other);
#endif
                rc = smk_access(skp, okp, MAY_WRITE, &ad);
                rc = smk_bu_note("UDS connect", skp, okp, MAY_WRITE, rc);
                if (rc == 0) {
                        /* Now the reverse direction. */
                        okp = osp->smk_out;
                        skp = ssp->smk_in;
                        rc = smk_access(okp, skp, MAY_WRITE, &ad);
                        rc = smk_bu_note("UDS connect", okp, skp,
                                                MAY_WRITE, rc);
                }
        }

        if (rc == 0) {
                /*
                 * Cross reference the peer labels for SO_PEERSEC.
                 */
                nsp->smk_packet = ssp->smk_out;
                ssp->smk_packet = osp->smk_out;

                /*
                 * The new sock is handed to the listening side, so it
                 * must carry the listening sock's labels rather than
                 * whatever labels it was created with.
                 * NOTE(review): newsk is presumably allocated while the
                 * connecting task is current and so starts out with
                 * that task's labels - confirm against the caller.
                 */
                nsp->smk_out = osp->smk_out;
                nsp->smk_in = osp->smk_in;
        }

        return rc;
}

/**
 * smack_unix_may_send - Smack access on UDS
 * @sock: the sending socket
 * @other: the receiving socket
 *
 * A caller with CAP_MAC_OVERRIDE is always allowed. Otherwise the
 * outbound label of @sock needs write access to the inbound label
 * of @other.
 *
 * Return 0 if a subject with the smack of sock could access
 * an object with the smack of other, otherwise an error code
 */
static int smack_unix_may_send(struct socket *sock, struct socket *other)
{
        struct socket_smack *sender = sock->sk->sk_security;
        struct socket_smack *peer = other->sk->sk_security;
        struct smack_known *subject;
        struct smack_known *object;
        struct smk_audit_info ad;
        int rc;

#ifdef CONFIG_AUDIT
        struct lsm_network_audit net;

        smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
        smk_ad_setfield_u_net_sk(&ad, other->sk);
#endif

        if (smack_privileged(CAP_MAC_OVERRIDE))
                return 0;

        subject = sender->smk_out;
        object = peer->smk_in;

        rc = smk_access(subject, object, MAY_WRITE, &ad);
        return smk_bu_note("UDS send", subject, object, MAY_WRITE, rc);
}

/**
 * smack_socket_sendmsg - Smack check based on destination host
 * @sock: the socket
 * @msg: the message
 * @size: the size of the message, unused
 *
 * Only messages that carry an explicit destination address are checked;
 * a message without one is allowed. Socket families other than AF_INET
 * and AF_INET6 are allowed as well.
 *
 * Return 0 if the current subject can write to the destination host.
 * For IPv4 this is only a question if the destination is a single label host.
 * For IPv6 the check is against the label of the destination host or
 * against the label of the port, depending on which IPv6 labeling
 * scheme the kernel was built with.
 *
 * Returns -EINVAL if the address is too short for the socket family or
 * names a different family.
 */
static int smack_socket_sendmsg(struct socket *sock, struct msghdr *msg,
                                int size)
{
        /* sip and sap are two views of the same destination address. */
        struct sockaddr_in *sip = (struct sockaddr_in *) msg->msg_name;
#if IS_ENABLED(CONFIG_IPV6)
        struct sockaddr_in6 *sap = (struct sockaddr_in6 *) msg->msg_name;
#endif
#ifdef SMACK_IPV6_SECMARK_LABELING
        struct socket_smack *ssp = sock->sk->sk_security;
        struct smack_known *rsp;
#endif
        int rc = 0;

        /*
         * Perfectly reasonable for this to be NULL
         */
        if (sip == NULL)
                return 0;

        switch (sock->sk->sk_family) {
        case AF_INET:
                /* Validate the address before looking inside it. */
                if (msg->msg_namelen < sizeof(struct sockaddr_in) ||
                    sip->sin_family != AF_INET)
                        return -EINVAL;
                rc = smk_ipv4_check(sock->sk, sip);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                if (msg->msg_namelen < SIN6_LEN_RFC2133 ||
                    sap->sin6_family != AF_INET6)
                        return -EINVAL;
#ifdef SMACK_IPV6_SECMARK_LABELING
                /* Only hosts that have a label assigned are checked. */
                rsp = smack_ipv6host_label(sap);
                if (rsp != NULL)
                        rc = smk_ipv6_check(ssp->smk_out, rsp, sap,
                                                SMK_CONNECTING);
#endif
#ifdef SMACK_IPV6_PORT_LABELING
                rc = smk_ipv6_port_check(sock->sk, sap, SMK_SENDING);
#endif
#endif /* IS_ENABLED(CONFIG_IPV6) */
                /*
                 * This break sits outside the IPv6 conditional. With IPv6
                 * enabled it ends the AF_INET6 case; without IPv6 it
                 * follows the AF_INET break and is never reached.
                 */
                break;
        }
        return rc;
}

/**
 * smack_from_secattr - Convert a netlabel attr.mls.lvl/attr.mls.cat pair to smack
 * @sap: netlabel secattr
 * @ssp: socket security information, may be NULL
 *
 * The label is taken, in order of preference, from the netlabel cache,
 * from a secid carried in the attributes, or from a search of the
 * known label list for an entry with the same MLS level and categories.
 * A level that matches no known label maps to the web label when @ssp
 * is given and its inbound label is the star label, and to the star
 * label otherwise. Attributes with none of these maps to the network
 * ambient label.
 *
 * Returns a pointer to a Smack label entry found on the label list.
 */
static struct smack_known *smack_from_secattr(struct netlbl_lsm_secattr *sap,
                                                struct socket_smack *ssp)
{
        struct smack_known *skp;
        int found = 0;
        int acat;
        int kcat;

        /*
         * Netlabel found it in the cache.
         */
        if ((sap->flags & NETLBL_SECATTR_CACHE) != 0)
                return (struct smack_known *)sap->cache->data;

        if ((sap->flags & NETLBL_SECATTR_SECID) != 0)
                /*
                 * Looks like a fallback, which gives us a secid.
                 */
                return smack_from_secid(sap->attr.secid);

        if ((sap->flags & NETLBL_SECATTR_MLS_LVL) != 0) {
                /*
                 * Looks like a CIPSO packet.
                 * If there are flags but no level netlabel isn't
                 * behaving the way we expect it to.
                 *
                 * Look it up in the label table.
                 */
                rcu_read_lock();
                list_for_each_entry_rcu(skp, &smack_known_list, list) {
                        if (sap->attr.mls.lvl != skp->smk_netlabel.attr.mls.lvl)
                                continue;
                        /*
                         * Compare the catsets. Use the netlbl APIs.
                         *
                         * A packet without categories matches only a
                         * label without categories. Either way the
                         * search stops at the first label that has the
                         * packet's level.
                         */
                        if ((sap->flags & NETLBL_SECATTR_MLS_CAT) == 0) {
                                if ((skp->smk_netlabel.flags &
                                     NETLBL_SECATTR_MLS_CAT) == 0)
                                        found = 1;
                                break;
                        }
                        /*
                         * Walk both category maps in step. The walk ends
                         * when the two positions differ or when either
                         * map reports a negative position.
                         * NOTE(review): presumably netlbl_catmap_walk()
                         * returns the next set category at or after the
                         * given offset and a negative value when there
                         * are no more - confirm against the netlabel API.
                         */
                        for (acat = -1, kcat = -1; acat == kcat; ) {
                                acat = netlbl_catmap_walk(sap->attr.mls.cat,
                                                          acat + 1);
                                kcat = netlbl_catmap_walk(
                                        skp->smk_netlabel.attr.mls.cat,
                                        kcat + 1);
                                if (acat < 0 || kcat < 0)
                                        break;
                        }
                        /* Equal positions at the end: the sets matched. */
                        if (acat == kcat) {
                                found = 1;
                                break;
                        }
                }
                rcu_read_unlock();

                /* skp is only meaningful when the search succeeded. */
                if (found)
                        return skp;

                if (ssp != NULL && ssp->smk_in == &smack_known_star)
                        return &smack_known_web;
                return &smack_known_star;
        }
        /*
         * Without guidance regarding the smack value
         * for the packet fall back on the network
         * ambient value.
         */
        return smack_net_ambient;
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * smk_skb_to_addr_ipv6 - pull the source address and port out of a packet
 *
 * Fills the address of @sip from the IPv6 header of @skb and, for TCP,
 * UDP, UDP-Lite and DCCP, the port from the transport header. The port
 * is left at zero for other protocols or when the transport header
 * cannot be read.
 *
 * Returns the transport protocol number, or -EINVAL if the IPv6 header
 * or its extension headers cannot be parsed.
 */
static int smk_skb_to_addr_ipv6(struct sk_buff *skb, struct sockaddr_in6 *sip)
{
        struct ipv6hdr ip6_buf;
        struct ipv6hdr *ip6;
        struct tcphdr tcp_buf, *tcp;
        struct udphdr udp_buf, *udp;
        struct dccp_hdr dccp_buf, *dccp;
        __be16 frag_off;
        u8 nexthdr;
        int off;

        sip->sin6_port = 0;

        off = skb_network_offset(skb);
        ip6 = skb_header_pointer(skb, off, sizeof(ip6_buf), &ip6_buf);
        if (!ip6)
                return -EINVAL;
        sip->sin6_addr = ip6->saddr;

        /* Step over the fixed header and any extension headers. */
        nexthdr = ip6->nexthdr;
        off = ipv6_skip_exthdr(skb, off + sizeof(ip6_buf), &nexthdr, &frag_off);
        if (off < 0)
                return -EINVAL;

        if (nexthdr == IPPROTO_TCP) {
                tcp = skb_header_pointer(skb, off, sizeof(tcp_buf), &tcp_buf);
                if (tcp)
                        sip->sin6_port = tcp->source;
        } else if (nexthdr == IPPROTO_UDP || nexthdr == IPPROTO_UDPLITE) {
                udp = skb_header_pointer(skb, off, sizeof(udp_buf), &udp_buf);
                if (udp)
                        sip->sin6_port = udp->source;
        } else if (nexthdr == IPPROTO_DCCP) {
                dccp = skb_header_pointer(skb, off, sizeof(dccp_buf),
                                          &dccp_buf);
                if (dccp)
                        sip->sin6_port = dccp->dccph_sport;
        }

        return nexthdr;
}
#endif /* CONFIG_IPV6 */

/**
 * smack_from_skb - Smack data from the secmark in an skb
 * @skb: packet, may be NULL
 *
 * Returns smack_known of the secmark or NULL if that won't work:
 * there is no packet, the packet has no secmark, or the kernel was
 * built without secmark support.
 */
#ifdef CONFIG_NETWORK_SECMARK
static struct smack_known *smack_from_skb(struct sk_buff *skb)
{
        if (skb != NULL && skb->secmark != 0)
                return smack_from_secid(skb->secmark);

        return NULL;
}
#else
static inline struct smack_known *smack_from_skb(struct sk_buff *skb)
{
        /* No secmark support, so there is never a label to report. */
        return NULL;
}
#endif

/**
 * smack_from_netlbl - Smack data from the IP options in an skb
 * @sk: socket data came in on, may be NULL
 * @family: address family
 * @skb: packet
 *
 * Find the Smack label in the IP options. If netlabel marks the
 * attributes as cacheable, add the label to the netlabel cache here.
 *
 * Returns smack_known of the IP options or NULL if that won't work.
 */
static struct smack_known *smack_from_netlbl(const struct sock *sk, u16 family,
                                             struct sk_buff *skb)
{
        struct netlbl_lsm_secattr secattr;
        struct socket_smack *ssp = NULL;
        struct smack_known *label = NULL;

        netlbl_secattr_init(&secattr);

        if (sk != NULL)
                ssp = sk->sk_security;

        /* No attributes on the packet means no label to report. */
        if (netlbl_skbuff_getattr(skb, family, &secattr) != 0)
                goto out;

        label = smack_from_secattr(&secattr, ssp);
        if (secattr.flags & NETLBL_SECATTR_CACHEABLE)
                netlbl_cache_add(skb, family, &label->smk_netlabel);
out:
        netlbl_secattr_destroy(&secattr);

        return label;
}

/**
 * smack_socket_sock_rcv_skb - Smack packet delivery access check
 * @sk: socket
 * @skb: packet
 *
 * The label of the packet must have write access to the inbound label
 * of the socket. An IPv4 packet arriving on an IPv6 socket is handled
 * as IPv4. Socket families other than PF_INET and PF_INET6 are not
 * checked.
 *
 * When delivery is refused the sender is notified: through netlabel
 * for IPv4, with an ICMPv6 "administratively prohibited" message for
 * IPv6.
 *
 * Returns 0 if the packet should be delivered, an error code otherwise
 */
static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        struct socket_smack *ssp = sk->sk_security;
        struct smack_known *skp = NULL;
        int rc = 0;
        struct smk_audit_info ad;
        u16 family = sk->sk_family;
#ifdef CONFIG_AUDIT
        struct lsm_network_audit net;
#endif
#if IS_ENABLED(CONFIG_IPV6)
        struct sockaddr_in6 sadd;
        int proto;

        /* An IPv4 packet on an IPv6 socket takes the IPv4 path. */
        if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
                family = PF_INET;
#endif /* CONFIG_IPV6 */

        switch (family) {
        case PF_INET:
                /*
                 * If there is a secmark use it rather than the CIPSO label.
                 * If there is no secmark fall back to CIPSO.
                 * The secmark is assumed to reflect policy better.
                 * With neither, the packet gets the ambient label.
                 */
                skp = smack_from_skb(skb);
                if (skp == NULL) {
                        skp = smack_from_netlbl(sk, family, skb);
                        if (skp == NULL)
                                skp = smack_net_ambient;
                }

#ifdef CONFIG_AUDIT
                smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
                ad.a.u.net->family = family;
                ad.a.u.net->netif = skb->skb_iif;
                ipv4_skb_to_auditdata(skb, &ad.a, NULL);
#endif
                /*
                 * Receiving a packet requires that the other end
                 * be able to write here. Read access is not required.
                 * This is the simplest possible security model
                 * for networking.
                 */
                rc = smk_access(skp, ssp->smk_in, MAY_WRITE, &ad);
                rc = smk_bu_note("IPv4 delivery", skp, ssp->smk_in,
                                        MAY_WRITE, rc);
                if (rc != 0)
                        netlbl_skbuff_err(skb, family, rc, 0);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case PF_INET6:
                /*
                 * Only TCP, UDP, UDP-Lite and DCCP are checked. Anything
                 * else, including a packet that could not be parsed,
                 * is delivered.
                 */
                proto = smk_skb_to_addr_ipv6(skb, &sadd);
                if (proto != IPPROTO_UDP && proto != IPPROTO_UDPLITE &&
                    proto != IPPROTO_TCP && proto != IPPROTO_DCCP)
                        break;
#ifdef SMACK_IPV6_SECMARK_LABELING
                /*
                 * Prefer the secmark. Without one, packets from the
                 * local host are delivered unchecked and other packets
                 * take the label of the sending host, or the ambient
                 * label if the host has none.
                 */
                skp = smack_from_skb(skb);
                if (skp == NULL) {
                        if (smk_ipv6_localhost(&sadd))
                                break;
                        skp = smack_ipv6host_label(&sadd);
                        if (skp == NULL)
                                skp = smack_net_ambient;
                }
#ifdef CONFIG_AUDIT
                smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
                ad.a.u.net->family = family;
                ad.a.u.net->netif = skb->skb_iif;
                ipv6_skb_to_auditdata(skb, &ad.a, NULL);
#endif /* CONFIG_AUDIT */
                rc = smk_access(skp, ssp->smk_in, MAY_WRITE, &ad);
                rc = smk_bu_note("IPv6 delivery", skp, ssp->smk_in,
                                        MAY_WRITE, rc);
#endif /* SMACK_IPV6_SECMARK_LABELING */
#ifdef SMACK_IPV6_PORT_LABELING
                rc = smk_ipv6_port_check(sk, &sadd, SMK_RECEIVING);
#endif /* SMACK_IPV6_PORT_LABELING */
                /* Tell the sender that delivery was refused. */
                if (rc != 0)
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                        ICMPV6_ADM_PROHIBITED, 0);
                break;
#endif /* CONFIG_IPV6 */
        }

        return rc;
}

/**
 * smack_socket_getpeersec_stream - pull in packet label
 * @sock: the socket
 * @optval: user's destination
 * @optlen: size thereof
 * @len: max thereof
 *
 * Copies the peer label recorded on the socket, or an empty string if
 * there is none, to @optval. The length needed, including the
 * terminating NUL, is written to @optlen even when @len is too small.
 *
 * returns zero on success, -ERANGE if the label does not fit in @len,
 * -EFAULT if either copy fails
 */
static int smack_socket_getpeersec_stream(struct socket *sock,
                                          sockptr_t optval, sockptr_t optlen,
                                          unsigned int len)
{
        struct socket_smack *ssp = sock->sk->sk_security;
        char *label = "";
        u32 needed = 1;
        int rc = 0;

        if (ssp->smk_packet != NULL) {
                label = ssp->smk_packet->smk_known;
                needed = strlen(label) + 1;
        }

        if (needed > len)
                rc = -ERANGE;
        else if (copy_to_sockptr(optval, label, needed))
                rc = -EFAULT;

        /* Always report the length, so the caller can size a retry. */
        if (copy_to_sockptr(optlen, &needed, sizeof(needed)))
                rc = -EFAULT;

        return rc;
}


/**
 * smack_socket_getpeersec_dgram - pull in packet label
 * @sock: the peer socket, may be NULL
 * @skb: packet data, may be NULL
 * @secid: pointer to where to put the secid of the packet
 *
 * The address family is taken from the packet when it is an IPv4 or
 * IPv6 packet, and from the socket otherwise. The secid then comes
 * from the outbound label of a UNIX socket, or from the secmark or
 * the netlabel attributes of an IP packet.
 *
 * Returns 0 with the secid in @secid, or -EINVAL with @secid set to 0
 * if no secid could be determined.
 */
static int smack_socket_getpeersec_dgram(struct socket *sock,
                                         struct sk_buff *skb, u32 *secid)

{
        struct socket_smack *ssp = NULL;
        struct smack_known *skp;
        struct sock *sk = NULL;
        int family = PF_UNSPEC;
        u32 found = 0;        /* 0 is the invalid secid */

        if (skb != NULL) {
                if (skb->protocol == htons(ETH_P_IP))
                        family = PF_INET;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        family = PF_INET6;
#endif /* CONFIG_IPV6 */
        }
        if (family == PF_UNSPEC && sock != NULL)
                family = sock->sk->sk_family;

        if (family == PF_UNIX) {
                ssp = sock->sk->sk_security;
                found = ssp->smk_out->smk_secid;
        } else if (family == PF_INET) {
                skp = smack_from_skb(skb);
                if (skp == NULL) {
                        /*
                         * Translate what netlabel gave us.
                         */
                        if (sock != NULL)
                                sk = sock->sk;
                        skp = smack_from_netlbl(sk, family, skb);
                }
                if (skp != NULL)
                        found = skp->smk_secid;
        } else if (family == PF_INET6) {
#ifdef SMACK_IPV6_SECMARK_LABELING
                skp = smack_from_skb(skb);
                if (skp)
                        found = skp->smk_secid;
#endif
        }

        *secid = found;
        return found == 0 ? -EINVAL : 0;
}

/**
 * smack_sock_graft - Initialize a newly created socket with an existing sock
 * @sk: child sock, may be NULL
 * @parent: parent socket, unused
 *
 * Set the smk_{in,out} state of an existing sock based on the process that
 * is creating the new socket. Only PF_INET and PF_INET6 socks are touched.
 */
static void smack_sock_graft(struct sock *sk, struct socket *parent)
{
        struct smack_known *label = smk_of_current();
        struct socket_smack *ssp;

        if (sk == NULL)
                return;

        switch (sk->sk_family) {
        case PF_INET:
        case PF_INET6:
                break;
        default:
                return;
        }

        ssp = sk->sk_security;
        ssp->smk_in = label;
        ssp->smk_out = label;
        /* cssp->smk_packet is already set in smack_inet_csk_clone() */
}

/**
 * smack_inet_conn_request - Smack access check on connect
 * @sk: socket involved
 * @skb: packet
 * @req: the connection's request_sock; on success it receives the peer's
 *       secid and has its netlabel attribute either set or removed
 *
 * Only IPv4 traffic (including IPv4 packets arriving on an IPv6 socket)
 * is checked; any other packet on a PF_INET6 socket is accepted without
 * a check when CONFIG_IPV6 is enabled.
 *
 * Returns 0 if a task with the packet label could write to
 * the socket, otherwise an error code
 */
static int smack_inet_conn_request(const struct sock *sk, struct sk_buff *skb,
                                   struct request_sock *req)
{
        u16 family = sk->sk_family;
        struct smack_known *skp;
        struct socket_smack *ssp = sk->sk_security;
        struct sockaddr_in addr;
        struct iphdr *hdr;
        struct smack_known *hskp;
        int rc;
        struct smk_audit_info ad;
#ifdef CONFIG_AUDIT
        struct lsm_network_audit net;
#endif

#if IS_ENABLED(CONFIG_IPV6)
        if (family == PF_INET6) {
                /*
                 * Handle mapped IPv4 packets arriving
                 * via IPv6 sockets. Don't set up netlabel
                 * processing on IPv6.
                 */
                if (skb->protocol == htons(ETH_P_IP))
                        family = PF_INET;
                else
                        return 0;
        }
#endif /* CONFIG_IPV6 */

        /*
         * If there is a secmark use it rather than the CIPSO label.
         * If there is no secmark fall back to CIPSO.
         * The secmark is assumed to reflect policy better.
         * If neither source yields a label, smack_known_huh is used.
         */
        skp = smack_from_skb(skb);
        if (skp == NULL) {
                skp = smack_from_netlbl(sk, family, skb);
                if (skp == NULL)
                        skp = &smack_known_huh;
        }

#ifdef CONFIG_AUDIT
        /* Record family, incoming interface and IPv4 header data for audit. */
        smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
        ad.a.u.net->family = family;
        ad.a.u.net->netif = skb->skb_iif;
        ipv4_skb_to_auditdata(skb, &ad.a, NULL);
#endif
        /*
         * Receiving a packet requires that the other end be able to write
         * here. Read access is not required.
         */
        rc = smk_access(skp, ssp->smk_in, MAY_WRITE, &ad);
        rc = smk_bu_note("IPv4 connect", skp, ssp->smk_in, MAY_WRITE, rc);
        if (rc != 0)
                return rc;

        /*
         * Save the peer's label in the request_sock so we can later setup
         * smk_packet in the child socket so that SO_PEERCRED can report it.
         */
        req->peer_secid = skp->smk_secid;

        /*
         * We need to decide if we want to label the incoming connection here
         * if we do we only need to label the request_sock and the stack will
         * propagate the wire-label to the sock when it is created.
         */
        hdr = ip_hdr(skb);
        addr.sin_addr.s_addr = hdr->saddr;
        /* Only the address field of addr is filled in before the lookup. */
        rcu_read_lock();
        hskp = smack_ipv4host_label(&addr);
        rcu_read_unlock();

        /*
         * No host label for the source address: attach the peer's netlabel
         * to the request. Otherwise remove any netlabel attribute; rc is
         * still 0 from the access check above in that case.
         */
        if (hskp == NULL)
                rc = netlbl_req_setattr(req, &skp->smk_netlabel);
        else
                netlbl_req_delattr(req);

        return rc;
}

/**
 * smack_inet_csk_clone - Carry the peer label over to a new socket
 * @sk: the new socket
 * @req: the connection's request_sock
 *
 * If @req holds a non-zero peer secid, the matching Smack label becomes
 * the new socket's smk_packet; otherwise smk_packet is cleared.
 */
static void smack_inet_csk_clone(struct sock *sk,
                                 const struct request_sock *req)
{
        struct socket_smack *blob = sk->sk_security;

        if (req->peer_secid == 0) {
                blob->smk_packet = NULL;
                return;
        }

        blob->smk_packet = smack_from_secid(req->peer_secid);
}

/*
 * Key management security hooks
 *
 * Casey has not tested key support very heavily.
 * The permission check is most likely too restrictive.
 * If you care about keys please have a look.
 */
#ifdef CONFIG_KEYS

/**
 * smack_key_alloc - Attach a Smack label to a new key
 * @key: the key being set up
 * @cred: credentials whose task label is recorded on the key
 * @flags: unused
 *
 * Nothing is allocated; the key's security pointer is simply aimed at
 * the task label taken from @cred.
 *
 * Returns 0
 */
static int smack_key_alloc(struct key *key, const struct cred *cred,
                           unsigned long flags)
{
        key->security = smk_of_task(smack_cred(cred));

        return 0;
}

/**
 * smack_key_free - Drop the key's reference to its Smack label
 * @key: the key being released
 *
 * Only the pointer is reset; nothing is freed here.
 */
static void smack_key_free(struct key *key)
{
        /* forget the label pointer stored by smack_key_alloc() */
        key->security = NULL;
}

/**
 * smack_key_permission - Smack access check on a key
 * @key_ref: reference from which the key is obtained
 * @cred: credentials supplying the subject label
 * @need_perm: requested key permission
 *
 * View, read and search requests are checked as MAY_READ; write, link
 * and setattr requests as MAY_WRITE. Unspecified, unlink, override and
 * deferred requests are always allowed.
 *
 * Return 0 if access is allowed, -EINVAL for an unknown permission or a
 * key reference that yields no key, -EACCES if @cred yields no label,
 * or the error from the Smack access check.
 */
static int smack_key_permission(key_ref_t key_ref,
                                const struct cred *cred,
                                enum key_need_perm need_perm)
{
        struct smack_known *subject = smk_of_task(smack_cred(cred));
        struct smk_audit_info ad;
        struct key *keyp;
        int request;
        int rc;

        /*
         * Map the key permission onto a Smack access mode, or
         * decide right away when no Smack check applies.
         */
        switch (need_perm) {
        case KEY_NEED_VIEW:
        case KEY_NEED_READ:
        case KEY_NEED_SEARCH:
                request = MAY_READ;
                break;
        case KEY_NEED_WRITE:
        case KEY_NEED_LINK:
        case KEY_NEED_SETATTR:
                request = MAY_WRITE;
                break;
        case KEY_NEED_UNSPECIFIED:
        case KEY_NEED_UNLINK:
        case KEY_SYSADMIN_OVERRIDE:
        case KEY_AUTHTOKEN_OVERRIDE:
        case KEY_DEFER_PERM_CHECK:
                return 0;
        default:
                return -EINVAL;
        }

        keyp = key_ref_to_ptr(key_ref);
        if (!keyp)
                return -EINVAL;

        /*
         * A key without a label has not been initialized yet;
         * allow access so that initialization can proceed.
         */
        if (!keyp->security)
                return 0;

        /* A subject without a label is not expected to happen. */
        if (!subject)
                return -EACCES;

        /*
         * NOTE(review): this uses smack_privileged(), which presumably
         * tests the calling task, while the subject label comes from
         * @cred; smack_watch_key() uses smack_privileged_cred().
         * Confirm which is intended.
         */
        if (smack_privileged(CAP_MAC_OVERRIDE))
                return 0;

#ifdef CONFIG_AUDIT
        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY);
        ad.a.u.key_struct.key = keyp->serial;
        ad.a.u.key_struct.key_desc = keyp->description;
#endif
        rc = smk_access(subject, keyp->security, request, &ad);
        return smk_bu_note("key access", subject, keyp->security, request, rc);
}

/**
 * smack_key_getsecurity - Smack label tagging the key
 * @key: the key to be queried
 * @_buffer: where to store a pointer to the resulting string
 *
 * On success *@_buffer points to a kstrdup()'d copy of the label text.
 * If the key has no label, *@_buffer is set to NULL and 0 is returned.
 *
 * Return the length of the string (including terminating NUL),
 * 0 if there is no label, or -ENOMEM if the copy cannot be allocated.
 */
static int smack_key_getsecurity(struct key *key, char **_buffer)
{
        struct smack_known *skp = key->security;
        char *label;

        if (!skp) {
                *_buffer = NULL;
                return 0;
        }

        label = kstrdup(skp->smk_known, GFP_KERNEL);
        if (!label)
                return -ENOMEM;

        *_buffer = label;
        return strlen(label) + 1;
}


#ifdef CONFIG_KEY_NOTIFICATIONS
/**
 * smack_watch_key - Smack access to watch a key for notifications.
 * @key: The key to be watched
 *
 * Return 0 if the current task has permission to read from the key
 * object, -EINVAL if @key is NULL, -EACCES if the current task has no
 * label, and the error from the Smack access check otherwise.
 */
static int smack_watch_key(struct key *key)
{
        struct smack_known *watcher = smk_of_current();
        struct smk_audit_info ad;
        int rc;

        if (!key)
                return -EINVAL;

        /* An unlabelled key is not initialized yet; let the watch through. */
        if (!key->security)
                return 0;

        /* Not expected to happen. */
        if (!watcher)
                return -EACCES;

        if (smack_privileged_cred(CAP_MAC_OVERRIDE, current_cred()))
                return 0;

#ifdef CONFIG_AUDIT
        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY);
        ad.a.u.key_struct.key = key->serial;
        ad.a.u.key_struct.key_desc = key->description;
#endif
        rc = smk_access(watcher, key->security, MAY_READ, &ad);
        return smk_bu_note("key watch", watcher, key->security, MAY_READ, rc);
}
#endif /* CONFIG_KEY_NOTIFICATIONS */
#endif /* CONFIG_KEYS */

#ifdef CONFIG_WATCH_QUEUE
/**
 * smack_post_notification - Smack access to post a notification to a queue
 * @w_cred: The credentials of the watcher.
 * @cred: The credentials of the event source (may be NULL).
 * @n: The notification message to be posted.
 *
 * Meta notifications and events without source credentials are always
 * allowed. Otherwise the source's label must have write access to the
 * watcher's label.
 *
 * Returns 0 if the notification may be posted, an error code otherwise.
 */
static int smack_post_notification(const struct cred *w_cred,
                                   const struct cred *cred,
                                   struct watch_notification *n)
{
        struct smack_known *source, *watcher;
        struct smk_audit_info ad;
        int rc;

        /* Maintenance notifications and sourceless events always pass. */
        if (n->type == WATCH_TYPE_META || !cred)
                return 0;

        source = smk_of_task(smack_cred(cred));
        watcher = smk_of_task(smack_cred(w_cred));

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NOTIFICATION);
        rc = smk_access(source, watcher, MAY_WRITE, &ad);
        return smk_bu_note("notification", source, watcher, MAY_WRITE, rc);
}
#endif /* CONFIG_WATCH_QUEUE */

/*
 * Smack Audit hooks
 *
 * Audit requires a unique representation of each Smack specific
 * rule. This unique representation is used to distinguish the
 * object to be audited from remaining kernel objects and also
 * works as a glue between the audit hooks.
 *
 * Since repository entries are added but never deleted, we'll use
 * the smack_known label address related to the given audit rule as
 * the needed unique representation. This also better fits the smack
 * model where nearly everything is a label.
 */
#ifdef CONFIG_AUDIT

/**
 * smack_audit_rule_init - Initialize a smack audit rule
 * @field: audit rule fields given from user-space (audit.h)
 * @op: required testing operator (=, !=, >, <, ...)
 * @rulestr: smack label to be audited
 * @vrule: pointer to save our own audit rule representation
 * @gfp: type of the memory for the allocation (not used here)
 *
 * Prepare to audit cases where (@field @op @rulestr) is true.
 * The label to be audited is created if necessary.
 *
 * Only the AUDIT_SUBJ_USER and AUDIT_OBJ_USER fields and the equal and
 * not-equal operators are accepted. *@vrule is reset to NULL first and
 * is set to the label's name string only on success.
 *
 * Returns 0 on success, -EINVAL for an unsupported field or operator,
 * or the error from importing the label.
 */
static int smack_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule,
                                 gfp_t gfp)
{
        char **rule = (char **)vrule;
        struct smack_known *skp;

        *rule = NULL;

        switch (field) {
        case AUDIT_SUBJ_USER:
        case AUDIT_OBJ_USER:
                break;
        default:
                return -EINVAL;
        }

        if (!(op == Audit_equal || op == Audit_not_equal))
                return -EINVAL;

        skp = smk_import_entry(rulestr, 0);
        if (IS_ERR(skp))
                return PTR_ERR(skp);

        *rule = skp->smk_known;
        return 0;
}

/**
 * smack_audit_rule_known - Distinguish Smack audit rules
 * @krule: rule of interest, in Audit kernel representation format
 *
 * Scans the fields of @krule for one of type AUDIT_SUBJ_USER or
 * AUDIT_OBJ_USER, which marks the rule as belonging to Smack. For such
 * rules the audit_rule_match hook makes the final decision.
 *
 * Returns 1 if such a field is present, 0 otherwise.
 */
static int smack_audit_rule_known(struct audit_krule *krule)
{
        int idx;

        for (idx = 0; idx < krule->field_count; idx++) {
                switch (krule->fields[idx].type) {
                case AUDIT_SUBJ_USER:
                case AUDIT_OBJ_USER:
                        return 1;
                default:
                        break;
                }
        }

        return 0;
}

/**
 * smack_audit_rule_match - Audit given object ?
 * @secid: security id for identifying the object to test
 * @field: audit rule flags given from user-space
 * @op: required testing operator
 * @vrule: smack internal rule presentation
 *
 * The core Audit hook. It decides whether or not a given object is to
 * be audited.
 *
 * Returns -ENOENT (with a one-time warning) if @vrule is NULL, 0 for
 * fields or operators Smack does not handle, and otherwise the result
 * of the equal / not-equal comparison.
 */
static int smack_audit_rule_match(u32 secid, u32 field, u32 op, void *vrule)
{
        char *rule = vrule;
        struct smack_known *skp;

        if (unlikely(!rule)) {
                WARN_ONCE(1, "Smack: missing rule\n");
                return -ENOENT;
        }

        if (field != AUDIT_SUBJ_USER && field != AUDIT_OBJ_USER)
                return 0;

        skp = smack_from_secid(secid);

        /*
         * Pointer comparison is sufficient: a matching rule points at
         * the very same smack_known label string.
         */
        switch (op) {
        case Audit_equal:
                return rule == skp->smk_known;
        case Audit_not_equal:
                return rule != skp->smk_known;
        default:
                return 0;
        }
}

/*
 * There is no need for a smack_audit_rule_free hook.
 * No memory was allocated.
 */

#endif /* CONFIG_AUDIT */

/**
 * smack_ismaclabel - check if xattr @name references a smack MAC label
 * @name: Full xattr name to check.
 *
 * Returns 1 if @name is exactly XATTR_SMACK_SUFFIX, 0 otherwise.
 */
static int smack_ismaclabel(const char *name)
{
        return !strcmp(name, XATTR_SMACK_SUFFIX);
}


/**
 * smack_secid_to_secctx - return the smack label for a secid
 * @secid: incoming integer
 * @secdata: destination for the label string; may be NULL
 * @seclen: receives the length of the label string
 *
 * Exists for networking code. The string stored in *@secdata is the
 * label's own name, not a copy.
 *
 * Returns 0
 */
static int smack_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
{
        char *label = smack_from_secid(secid)->smk_known;

        if (secdata)
                *secdata = label;
        *seclen = strlen(label);

        return 0;
}

/**
 * smack_secctx_to_secid - return the secid for a smack label
 * @secdata: smack label
 * @seclen: length of @secdata (not used here)
 * @secid: outgoing integer
 *
 * Exists for audit and networking code. *@secid is set to 0 when the
 * label is not found.
 *
 * Returns 0
 */
static int smack_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
        struct smack_known *found = smk_find_entry(secdata);

        *secid = found ? found->smk_secid : 0;

        return 0;
}

/*
 * There used to be a smack_release_secctx hook
 * that did nothing back when hooks were in a vector.
 * Now that there's a list such a hook adds cost.
 */

/**
 * smack_inode_notifysecctx - apply a security context to an inode
 * @inode: the inode to label
 * @ctx: the security context
 * @ctxlen: length of @ctx
 *
 * Passes @ctx to smack_inode_setsecurity() under the
 * XATTR_SMACK_SUFFIX name with flags of 0.
 *
 * Returns whatever smack_inode_setsecurity() returns.
 */
static int smack_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
        int rc;

        rc = smack_inode_setsecurity(inode, XATTR_SMACK_SUFFIX, ctx, ctxlen, 0);

        return rc;
}

/**
 * smack_inode_setsecctx - store a security context as the Smack xattr
 * @dentry: dentry of the object to label
 * @ctx: the security context
 * @ctxlen: length of @ctx
 *
 * Writes @ctx as the XATTR_NAME_SMACK attribute through
 * __vfs_setxattr_noperm(), using nop_mnt_idmap and flags of 0.
 *
 * Returns whatever __vfs_setxattr_noperm() returns.
 */
static int smack_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
        int rc;

        rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_SMACK,
                                   ctx, ctxlen, 0);

        return rc;
}

/**
 * smack_inode_getsecctx - report an inode's Smack label as a context
 * @inode: the inode of interest
 * @ctx: receives a pointer to the label string
 * @ctxlen: receives the length of the label string
 *
 * The pointer stored in *@ctx refers to the label's own name string,
 * not to a copy.
 *
 * Returns 0
 */
static int smack_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
{
        char *label = smk_of_inode(inode)->smk_known;

        *ctx = label;
        *ctxlen = strlen(label);

        return 0;
}

static int smack_inode_copy_up(struct dentry *dentry, struct cred **new)
{

        struct task_smack *tsp;
        struct smack_known *skp;
        struct inode_smack *isp;
        struct cred *new_creds = *new;

        if (new_creds == NULL) {
                new_creds = prepare_creds();
                if (new_creds == NULL)
                        return -ENOMEM;
        }

        tsp = smack_cred(new_creds);

        /*
         * Get label from overlay inode and set it in create_sid
         */
        isp = smack_inode(d_inode(dentry));
        skp = isp->smk_inode;
        tsp->smk_task = skp;
        *new = new_creds;
        return 0;
}

/**
 * smack_inode_copy_up_xattr - classify an xattr during copy-up
 * @src: dentry of the source object (not used here)
 * @name: full name of the xattr
 *
 * Returns 1 if @name is the Smack access attribute (XATTR_NAME_SMACK),
 * -EOPNOTSUPP for any other attribute.
 */
static int smack_inode_copy_up_xattr(struct dentry *src, const char *name)
{
        return strcmp(name, XATTR_NAME_SMACK) ? -EOPNOTSUPP : 1;
}

/**
 * smack_dentry_create_files_as - choose the label for files to be created
 * @dentry: dentry of the object being created
 * @mode: unused
 * @name: unused
 * @old: credentials supplying the process label
 * @new: credentials whose task label is set
 *
 * The new credentials normally get the task label of @old. If the
 * parent directory is marked transmuting and the rule granting @old
 * access to it carries MAY_TRANSMUTE, the directory's label is used
 * instead and is also recorded in smk_transmuted.
 *
 * Returns 0
 */
static int smack_dentry_create_files_as(struct dentry *dentry, int mode,
                                        struct qstr *name,
                                        const struct cred *old,
                                        struct cred *new)
{
        struct task_smack *otsp = smack_cred(old);
        struct task_smack *ntsp = smack_cred(new);
        struct inode_smack *dir;
        int may;

        /* Default: inherit the process label. */
        ntsp->smk_task = otsp->smk_task;

        /* Look at the containing directory. */
        dir = smack_inode(d_inode(dentry->d_parent));
        if (!(dir->smk_flags & SMK_INODE_TRANSMUTE))
                return 0;

        rcu_read_lock();
        may = smk_access_entry(otsp->smk_task->smk_known,
                               dir->smk_inode->smk_known,
                               &otsp->smk_task->smk_rules);
        rcu_read_unlock();

        /* Without a transmuting rule the process label stays in place. */
        if (may <= 0 || !(may & MAY_TRANSMUTE))
                return 0;

        /* Directory and rule both transmute: adopt the directory label. */
        ntsp->smk_task = dir->smk_inode;
        ntsp->smk_transmuted = ntsp->smk_task;

        return 0;
}

#ifdef CONFIG_IO_URING
/**
 * smack_uring_override_creds - Is io_uring cred override allowed?
 * @new: the target creds
 *
 * Check to see if the current task is allowed to override its
 * credentials to service an io_uring operation. This is allowed when
 * the Smack label does not change or when the current credentials
 * hold CAP_MAC_OVERRIDE.
 *
 * Returns 0 if allowed, -EPERM otherwise.
 */
static int smack_uring_override_creds(const struct cred *new)
{
        struct task_smack *cur = smack_cred(current_cred());
        struct task_smack *target = smack_cred(new);

        /* Same label (the degenerate case) or privileged: allow. */
        if (cur->smk_task == target->smk_task ||
            smack_privileged_cred(CAP_MAC_OVERRIDE, current_cred()))
                return 0;

        return -EPERM;
}

/**
 * smack_uring_sqpoll - check if a io_uring polling thread can be created
 *
 * Creating a new io_uring kernel polling thread requires the current
 * credentials to hold CAP_MAC_ADMIN.
 *
 * Returns 0 if allowed, -EPERM otherwise.
 */
static int smack_uring_sqpoll(void)
{
        return smack_privileged_cred(CAP_MAC_ADMIN, current_cred()) ? 0 : -EPERM;
}

/**
 * smack_uring_cmd - check on file operations for io_uring
 * @ioucmd: the command in question
 *
 * Make a best guess about whether a io_uring "command" should
 * be allowed. In the absence of better criteria the check is the one
 * used for read access: the credentials the file carries must have
 * MAY_READ access to the file's inode.
 *
 * Returns 0 if allowed, -EINVAL if the command has no file, or the
 * error from the access check.
 */
static int smack_uring_cmd(struct io_uring_cmd *ioucmd)
{
        struct file *file = ioucmd->file;
        struct smk_audit_info ad;
        int rc;

        if (!file)
                return -EINVAL;

        smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_PATH);
        smk_ad_setfield_u_fs_path(&ad, file->f_path);

        rc = smk_tskacc(smack_cred(file->f_cred),
                        smk_of_inode(file_inode(file)), MAY_READ, &ad);

        return smk_bu_credfile(file->f_cred, file, MAY_READ, rc);
}

#endif /* CONFIG_IO_URING */

/*
 * Sizes of the per-object security blobs used by Smack, handed to the
 * LSM infrastructure through the .blobs member of DEFINE_LSM(smack).
 * File, IPC and message blobs each hold a single smack_known pointer.
 */
struct lsm_blob_sizes smack_blob_sizes __ro_after_init = {
        .lbs_cred = sizeof(struct task_smack),
        .lbs_file = sizeof(struct smack_known *),
        .lbs_inode = sizeof(struct inode_smack),
        .lbs_ipc = sizeof(struct smack_known *),
        .lbs_msg_msg = sizeof(struct smack_known *),
        .lbs_superblock = sizeof(struct superblock_smack),
        .lbs_xattr_count = SMACK_INODE_INIT_XATTRS,
};

/* Identity of this LSM, passed to security_add_hooks() in smack_init(). */
static const struct lsm_id smack_lsmid = {
        .name = "smack",
        .id = LSM_ID_SMACK,
};

/*
 * Table binding LSM hook names to Smack's implementations. It is
 * registered with security_add_hooks() from smack_init(). Entries that
 * depend on optional features are guarded by the matching #ifdef.
 */
static struct security_hook_list smack_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
        LSM_HOOK_INIT(syslog, smack_syslog),

        LSM_HOOK_INIT(fs_context_submount, smack_fs_context_submount),
        LSM_HOOK_INIT(fs_context_dup, smack_fs_context_dup),
        LSM_HOOK_INIT(fs_context_parse_param, smack_fs_context_parse_param),

        LSM_HOOK_INIT(sb_alloc_security, smack_sb_alloc_security),
        LSM_HOOK_INIT(sb_free_mnt_opts, smack_free_mnt_opts),
        LSM_HOOK_INIT(sb_eat_lsm_opts, smack_sb_eat_lsm_opts),
        LSM_HOOK_INIT(sb_statfs, smack_sb_statfs),
        LSM_HOOK_INIT(sb_set_mnt_opts, smack_set_mnt_opts),

        LSM_HOOK_INIT(bprm_creds_for_exec, smack_bprm_creds_for_exec),

        LSM_HOOK_INIT(inode_alloc_security, smack_inode_alloc_security),
        LSM_HOOK_INIT(inode_init_security, smack_inode_init_security),
        LSM_HOOK_INIT(inode_link, smack_inode_link),
        LSM_HOOK_INIT(inode_unlink, smack_inode_unlink),
        LSM_HOOK_INIT(inode_rmdir, smack_inode_rmdir),
        LSM_HOOK_INIT(inode_rename, smack_inode_rename),
        LSM_HOOK_INIT(inode_permission, smack_inode_permission),
        LSM_HOOK_INIT(inode_setattr, smack_inode_setattr),
        LSM_HOOK_INIT(inode_getattr, smack_inode_getattr),
        LSM_HOOK_INIT(inode_setxattr, smack_inode_setxattr),
        LSM_HOOK_INIT(inode_post_setxattr, smack_inode_post_setxattr),
        LSM_HOOK_INIT(inode_getxattr, smack_inode_getxattr),
        LSM_HOOK_INIT(inode_removexattr, smack_inode_removexattr),
        LSM_HOOK_INIT(inode_set_acl, smack_inode_set_acl),
        LSM_HOOK_INIT(inode_get_acl, smack_inode_get_acl),
        LSM_HOOK_INIT(inode_remove_acl, smack_inode_remove_acl),
        LSM_HOOK_INIT(inode_getsecurity, smack_inode_getsecurity),
        LSM_HOOK_INIT(inode_setsecurity, smack_inode_setsecurity),
        LSM_HOOK_INIT(inode_listsecurity, smack_inode_listsecurity),
        LSM_HOOK_INIT(inode_getsecid, smack_inode_getsecid),

        LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security),
        /* native and compat ioctl share one handler */
        LSM_HOOK_INIT(file_ioctl, smack_file_ioctl),
        LSM_HOOK_INIT(file_ioctl_compat, smack_file_ioctl),
        LSM_HOOK_INIT(file_lock, smack_file_lock),
        LSM_HOOK_INIT(file_fcntl, smack_file_fcntl),
        LSM_HOOK_INIT(mmap_file, smack_mmap_file),
        LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
        LSM_HOOK_INIT(file_set_fowner, smack_file_set_fowner),
        LSM_HOOK_INIT(file_send_sigiotask, smack_file_send_sigiotask),
        LSM_HOOK_INIT(file_receive, smack_file_receive),

        LSM_HOOK_INIT(file_open, smack_file_open),

        LSM_HOOK_INIT(cred_alloc_blank, smack_cred_alloc_blank),
        LSM_HOOK_INIT(cred_free, smack_cred_free),
        LSM_HOOK_INIT(cred_prepare, smack_cred_prepare),
        LSM_HOOK_INIT(cred_transfer, smack_cred_transfer),
        LSM_HOOK_INIT(cred_getsecid, smack_cred_getsecid),
        LSM_HOOK_INIT(kernel_act_as, smack_kernel_act_as),
        LSM_HOOK_INIT(kernel_create_files_as, smack_kernel_create_files_as),
        LSM_HOOK_INIT(task_setpgid, smack_task_setpgid),
        LSM_HOOK_INIT(task_getpgid, smack_task_getpgid),
        LSM_HOOK_INIT(task_getsid, smack_task_getsid),
        LSM_HOOK_INIT(current_getsecid_subj, smack_current_getsecid_subj),
        LSM_HOOK_INIT(task_getsecid_obj, smack_task_getsecid_obj),
        LSM_HOOK_INIT(task_setnice, smack_task_setnice),
        LSM_HOOK_INIT(task_setioprio, smack_task_setioprio),
        LSM_HOOK_INIT(task_getioprio, smack_task_getioprio),
        LSM_HOOK_INIT(task_setscheduler, smack_task_setscheduler),
        LSM_HOOK_INIT(task_getscheduler, smack_task_getscheduler),
        LSM_HOOK_INIT(task_movememory, smack_task_movememory),
        LSM_HOOK_INIT(task_kill, smack_task_kill),
        LSM_HOOK_INIT(task_to_inode, smack_task_to_inode),

        LSM_HOOK_INIT(ipc_permission, smack_ipc_permission),
        LSM_HOOK_INIT(ipc_getsecid, smack_ipc_getsecid),

        LSM_HOOK_INIT(msg_msg_alloc_security, smack_msg_msg_alloc_security),

        /* message queues, shm and semaphores share one blob allocator */
        LSM_HOOK_INIT(msg_queue_alloc_security, smack_ipc_alloc_security),
        LSM_HOOK_INIT(msg_queue_associate, smack_msg_queue_associate),
        LSM_HOOK_INIT(msg_queue_msgctl, smack_msg_queue_msgctl),
        LSM_HOOK_INIT(msg_queue_msgsnd, smack_msg_queue_msgsnd),
        LSM_HOOK_INIT(msg_queue_msgrcv, smack_msg_queue_msgrcv),

        LSM_HOOK_INIT(shm_alloc_security, smack_ipc_alloc_security),
        LSM_HOOK_INIT(shm_associate, smack_shm_associate),
        LSM_HOOK_INIT(shm_shmctl, smack_shm_shmctl),
        LSM_HOOK_INIT(shm_shmat, smack_shm_shmat),

        LSM_HOOK_INIT(sem_alloc_security, smack_ipc_alloc_security),
        LSM_HOOK_INIT(sem_associate, smack_sem_associate),
        LSM_HOOK_INIT(sem_semctl, smack_sem_semctl),
        LSM_HOOK_INIT(sem_semop, smack_sem_semop),

        LSM_HOOK_INIT(d_instantiate, smack_d_instantiate),

        LSM_HOOK_INIT(getselfattr, smack_getselfattr),
        LSM_HOOK_INIT(setselfattr, smack_setselfattr),
        LSM_HOOK_INIT(getprocattr, smack_getprocattr),
        LSM_HOOK_INIT(setprocattr, smack_setprocattr),

        LSM_HOOK_INIT(unix_stream_connect, smack_unix_stream_connect),
        LSM_HOOK_INIT(unix_may_send, smack_unix_may_send),

        LSM_HOOK_INIT(socket_post_create, smack_socket_post_create),
        LSM_HOOK_INIT(socket_socketpair, smack_socket_socketpair),
#ifdef SMACK_IPV6_PORT_LABELING
        LSM_HOOK_INIT(socket_bind, smack_socket_bind),
#endif
        LSM_HOOK_INIT(socket_connect, smack_socket_connect),
        LSM_HOOK_INIT(socket_sendmsg, smack_socket_sendmsg),
        LSM_HOOK_INIT(socket_sock_rcv_skb, smack_socket_sock_rcv_skb),
        LSM_HOOK_INIT(socket_getpeersec_stream, smack_socket_getpeersec_stream),
        LSM_HOOK_INIT(socket_getpeersec_dgram, smack_socket_getpeersec_dgram),
        LSM_HOOK_INIT(sk_alloc_security, smack_sk_alloc_security),
        LSM_HOOK_INIT(sk_free_security, smack_sk_free_security),
        LSM_HOOK_INIT(sk_clone_security, smack_sk_clone_security),
        LSM_HOOK_INIT(sock_graft, smack_sock_graft),
        LSM_HOOK_INIT(inet_conn_request, smack_inet_conn_request),
        LSM_HOOK_INIT(inet_csk_clone, smack_inet_csk_clone),

 /* key management security hooks */
#ifdef CONFIG_KEYS
        LSM_HOOK_INIT(key_alloc, smack_key_alloc),
        LSM_HOOK_INIT(key_free, smack_key_free),
        LSM_HOOK_INIT(key_permission, smack_key_permission),
        LSM_HOOK_INIT(key_getsecurity, smack_key_getsecurity),
#ifdef CONFIG_KEY_NOTIFICATIONS
        LSM_HOOK_INIT(watch_key, smack_watch_key),
#endif
#endif /* CONFIG_KEYS */

#ifdef CONFIG_WATCH_QUEUE
        LSM_HOOK_INIT(post_notification, smack_post_notification),
#endif

 /* Audit hooks */
#ifdef CONFIG_AUDIT
        LSM_HOOK_INIT(audit_rule_init, smack_audit_rule_init),
        LSM_HOOK_INIT(audit_rule_known, smack_audit_rule_known),
        LSM_HOOK_INIT(audit_rule_match, smack_audit_rule_match),
#endif /* CONFIG_AUDIT */

        LSM_HOOK_INIT(ismaclabel, smack_ismaclabel),
        LSM_HOOK_INIT(secid_to_secctx, smack_secid_to_secctx),
        LSM_HOOK_INIT(secctx_to_secid, smack_secctx_to_secid),
        LSM_HOOK_INIT(inode_notifysecctx, smack_inode_notifysecctx),
        LSM_HOOK_INIT(inode_setsecctx, smack_inode_setsecctx),
        LSM_HOOK_INIT(inode_getsecctx, smack_inode_getsecctx),
        LSM_HOOK_INIT(inode_copy_up, smack_inode_copy_up),
        LSM_HOOK_INIT(inode_copy_up_xattr, smack_inode_copy_up_xattr),
        LSM_HOOK_INIT(dentry_create_files_as, smack_dentry_create_files_as),
#ifdef CONFIG_IO_URING
        LSM_HOOK_INIT(uring_override_creds, smack_uring_override_creds),
        LSM_HOOK_INIT(uring_sqpoll, smack_uring_sqpoll),
        LSM_HOOK_INIT(uring_cmd, smack_uring_cmd),
#endif
};


/*
 * init_smack_known_list - set up the built-in Smack labels
 *
 * For each of the five built-in labels (huh, hat, floor, star, web)
 * this initializes the rule-list mutex and the rule list head, then
 * inserts the label with smk_insert_entry() in the order huh, hat,
 * star, floor, web. Called once from smack_init().
 */
static __init void init_smack_known_list(void)
{
        /*
         * Initialize rule list locks
         */
        mutex_init(&smack_known_huh.smk_rules_lock);
        mutex_init(&smack_known_hat.smk_rules_lock);
        mutex_init(&smack_known_floor.smk_rules_lock);
        mutex_init(&smack_known_star.smk_rules_lock);
        mutex_init(&smack_known_web.smk_rules_lock);
        /*
         * Initialize rule lists
         */
        INIT_LIST_HEAD(&smack_known_huh.smk_rules);
        INIT_LIST_HEAD(&smack_known_hat.smk_rules);
        INIT_LIST_HEAD(&smack_known_star.smk_rules);
        INIT_LIST_HEAD(&smack_known_floor.smk_rules);
        INIT_LIST_HEAD(&smack_known_web.smk_rules);
        /*
         * Create the known labels list
         */
        smk_insert_entry(&smack_known_huh);
        smk_insert_entry(&smack_known_hat);
        smk_insert_entry(&smack_known_star);
        smk_insert_entry(&smack_known_floor);
        smk_insert_entry(&smack_known_web);
}

/**
 * smack_init - initialize the smack system
 *
 * Creates the rule cache, labels the initial task with the floor label,
 * registers the Smack hooks with the LSM framework, marks Smack as
 * enabled and finally populates the list of built-in labels.
 *
 * Returns 0 on success, -ENOMEM if the rule cache cannot be created
 */
static __init int smack_init(void)
{
        /* The cast drops const so that the initial task's blob can be set. */
        struct cred *cred = (struct cred *) current->cred;
        struct task_smack *tsp;

        smack_rule_cache = KMEM_CACHE(smack_rule, 0);
        if (!smack_rule_cache)
                return -ENOMEM;

        /*
         * Set the security state for the initial task.
         */
        tsp = smack_cred(cred);
        init_task_smack(tsp, &smack_known_floor, &smack_known_floor);

        /*
         * Register with LSM
         */
        security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), &smack_lsmid);
        smack_enabled = 1;

        /* Report which optional features were compiled in. */
        pr_info("Smack:  Initializing.\n");
#ifdef CONFIG_SECURITY_SMACK_NETFILTER
        pr_info("Smack:  Netfilter enabled.\n");
#endif
#ifdef SMACK_IPV6_PORT_LABELING
        pr_info("Smack:  IPv6 port labeling enabled.\n");
#endif
#ifdef SMACK_IPV6_SECMARK_LABELING
        pr_info("Smack:  IPv6 Netfilter enabled.\n");
#endif

        /* initialize the smack_known_list */
        init_smack_known_list();

        return 0;
}

/*
 * Smack requires early initialization in order to label
 * all processes and objects when they are created.
 *
 * This descriptor names the module, requests the blob sizes in
 * smack_blob_sizes and registers smack_init() as the init routine.
 */
DEFINE_LSM(smack) = {
        .name = "smack",
        .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
        .blobs = &smack_blob_sizes,
        .init = smack_init,
};










































































































































































































































































































































































































































































































































































































































































    2 


    2 













































































    3 

    3 



    3 










































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
// SPDX-License-Identifier: GPL-2.0
/*
 * kobject.c - library routines for handling generic kernel objects
 *
 * Copyright (c) 2002-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (c) 2006-2007 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (c) 2006-2007 Novell Inc.
 *
 * Please see the file Documentation/core-api/kobject.rst for critical information
 * about using the kobject interface.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/random.h>

/**
 * kobject_namespace() - Look up the namespace tag of @kobj.
 * @kobj: kobject in question
 *
 * Return: the tag produced by the ktype's namespace() callback when
 * kobj_ns_ops() reports namespace operations for @kobj whose type is not
 * %KOBJ_NS_TYPE_NONE; %NULL otherwise.
 */
const void *kobject_namespace(const struct kobject *kobj)
{
        const struct kobj_ns_type_operations *ops = kobj_ns_ops(kobj);

        if (ops && ops->type != KOBJ_NS_TYPE_NONE)
                return kobj->ktype->namespace(kobj);

        return NULL;
}

/**
 * kobject_get_ownership() - Report the sysfs owner to use for @kobj.
 * @kobj: kobject in question
 * @uid: filled with the kernel user ID for the sysfs objects
 * @gid: filled with the kernel group ID for the sysfs objects
 *
 * Both outputs are first set to the global root IDs.  If the ktype of
 * @kobj supplies a get_ownership() callback, it is then invoked and may
 * replace either value (normally used to adjust ownership of objects in
 * a container).
 */
void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
        const struct kobj_type *ktype;

        /* Start from global root; the ktype callback may overwrite both. */
        *uid = GLOBAL_ROOT_UID;
        *gid = GLOBAL_ROOT_GID;

        ktype = kobj->ktype;
        if (!ktype->get_ownership)
                return;

        ktype->get_ownership(kobj, uid, gid);
}

/*
 * A namespace type is valid when it lies strictly between
 * KOBJ_NS_TYPE_NONE and KOBJ_NS_TYPES.
 */
static bool kobj_ns_type_is_valid(enum kobj_ns_type type)
{
        return type > KOBJ_NS_TYPE_NONE && type < KOBJ_NS_TYPES;
}

/*
 * Create the sysfs directory for @kobj together with the default attribute
 * groups of its ktype (if it has one), and enable namespace filtering on
 * the directory when @kobj has child namespace operations.
 *
 * Returns 0 on success or the error reported by sysfs; when group creation
 * fails the freshly created directory is removed again.
 */
static int create_dir(struct kobject *kobj)
{
        const struct kobj_type *ktype = get_ktype(kobj);
        const struct kobj_ns_type_operations *child_ops;
        int ret;

        ret = sysfs_create_dir_ns(kobj, kobject_namespace(kobj));
        if (ret)
                return ret;

        ret = ktype ? sysfs_create_groups(kobj, ktype->default_groups) : 0;
        if (ret) {
                sysfs_remove_dir(kobj);
                return ret;
        }

        /*
         * An ancestor going away may delete @kobj->sd.  Pin it with an
         * extra reference so it stays around until @kobj itself is gone.
         */
        sysfs_get(kobj->sd);

        /*
         * When @kobj has ns_ops its children must be filtered by their
         * namespace tags, so namespace support is switched on for
         * @kobj->sd.
         */
        child_ops = kobj_child_ns_ops(kobj);
        if (!child_ops)
                return 0;

        BUG_ON(!kobj_ns_type_is_valid(child_ops->type));
        BUG_ON(!kobj_ns_type_registered(child_ops->type));

        sysfs_enable_ns(kobj->sd);

        return 0;
}

/*
 * Compute the buffer size needed for the path of @kobj: one byte, plus
 * for every level from @kobj up to the topmost ancestor the length of
 * its name and one more byte for the leading '/'.
 *
 * Returns 0 if any level on the way up has no name.
 */
static int get_kobj_path_length(const struct kobject *kobj)
{
        const struct kobject *cur = kobj;
        int total = 1;

        do {
                const char *name = kobject_name(cur);

                if (!name)
                        return 0;
                /* name plus its leading '/' */
                total += strlen(name) + 1;
                cur = cur->parent;
        } while (cur);

        return total;
}

/*
 * Write the path of @kobj into @path, which is @length bytes long.  The
 * buffer is filled from the back: starting at @kobj and walking up the
 * parents, each name is copied in front of what is already there and
 * preceded by a '/'.
 *
 * Returns 0 on success, -EINVAL when the names no longer fit in @length.
 */
static int fill_kobj_path(const struct kobject *kobj, char *path, int length)
{
        const struct kobject *node = kobj;
        int pos = length - 1;

        while (node) {
                const char *name = kobject_name(node);
                int name_len = strlen(name);

                /* step back far enough for this name ... */
                pos -= name_len;
                if (pos <= 0)
                        return -EINVAL;
                memcpy(&path[pos], name, name_len);
                /* ... and for the '/' in front of it */
                pos--;
                path[pos] = '/';

                node = node->parent;
        }

        pr_debug("'%s' (%p): %s: path = '%s'\n", kobject_name(kobj),
                 kobj, __func__, path);

        return 0;
}

/**
 * kobject_get_path() - Build the path of @kobj in freshly allocated memory.
 * @kobj:        kobject in question, with which to build the path
 * @gfp_mask:        the allocation type used to allocate the path
 *
 * The required length is computed, a zeroed buffer of that size is
 * allocated and then filled.  If filling reports that the buffer does
 * not fit, the buffer is freed and the whole sequence is repeated.
 *
 * Return: The newly allocated path, which the caller must free with
 * kfree(), or %NULL when the length is 0 or the allocation fails.
 */
char *kobject_get_path(const struct kobject *kobj, gfp_t gfp_mask)
{
        for (;;) {
                int len = get_kobj_path_length(kobj);
                char *path;

                if (len == 0)
                        return NULL;

                path = kzalloc(len, gfp_mask);
                if (!path)
                        return NULL;

                if (!fill_kobj_path(kobj, path, len))
                        return path;

                /* buffer did not fit: start over with a fresh length */
                kfree(path);
        }
}
EXPORT_SYMBOL_GPL(kobject_get_path);

/* add the kobject to its kset's list */
static void kobj_kset_join(struct kobject *kobj)
{
        if (!kobj->kset)
                return;

        kset_get(kobj->kset);
        spin_lock(&kobj->kset->list_lock);
        list_add_tail(&kobj->entry, &kobj->kset->list);
        spin_unlock(&kobj->kset->list_lock);
}

/* remove the kobject from its kset's list */
static void kobj_kset_leave(struct kobject *kobj)
{
        if (!kobj->kset)
                return;

        spin_lock(&kobj->kset->list_lock);
        list_del_init(&kobj->entry);
        spin_unlock(&kobj->kset->list_lock);
        kset_put(kobj->kset);
}

/*
 * Put @kobj into its initial state: fresh kref, empty list entry, all
 * sysfs/uevent state flags cleared and state_initialized set.
 * A NULL @kobj is ignored.
 */
static void kobject_init_internal(struct kobject *kobj)
{
        if (kobj) {
                kref_init(&kobj->kref);
                INIT_LIST_HEAD(&kobj->entry);

                kobj->state_initialized = 1;
                kobj->state_in_sysfs = 0;
                kobj->state_add_uevent_sent = 0;
                kobj->state_remove_uevent_sent = 0;
        }
}


/*
 * Hook an already named @kobj into the hierarchy and create its sysfs
 * directory.
 *
 * A reference is taken on the parent.  If @kobj belongs to a kset it joins
 * the kset's list, and the kset's own kobject is used as parent when none
 * was set.
 *
 * Returns 0 on success (state_in_sysfs is then set), -ENOENT for a NULL
 * @kobj, -EINVAL for a missing or empty name, or the error returned by
 * create_dir().  On a create_dir() failure the kset membership and the
 * parent reference are undone and @kobj->parent is reset to NULL.
 */
static int kobject_add_internal(struct kobject *kobj)
{
        int error = 0;
        struct kobject *parent;

        if (!kobj)
                return -ENOENT;

        if (!kobj->name || !kobj->name[0]) {
                WARN(1,
                     "kobject: (%p): attempted to be registered with empty name!\n",
                     kobj);
                return -EINVAL;
        }

        parent = kobject_get(kobj->parent);

        /* join kset if set, use it as parent if we do not already have one */
        if (kobj->kset) {
                if (!parent)
                        parent = kobject_get(&kobj->kset->kobj);
                kobj_kset_join(kobj);
                kobj->parent = parent;
        }

        pr_debug("'%s' (%p): %s: parent: '%s', set: '%s'\n",
                 kobject_name(kobj), kobj, __func__,
                 parent ? kobject_name(parent) : "<NULL>",
                 kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>");

        error = create_dir(kobj);
        if (!error) {
                kobj->state_in_sysfs = 1;
                return 0;
        }

        /*
         * Be noisy on error issues.  Report before unwinding: the message
         * reads the parent's name, so it must be printed while the
         * reference taken above is still held.
         */
        if (error == -EEXIST)
                pr_err("%s failed for %s with -EEXIST, don't try to register things with the same name in the same directory.\n",
                       __func__, kobject_name(kobj));
        else
                pr_err("%s failed for %s (error: %d parent: %s)\n",
                       __func__, kobject_name(kobj), error,
                       parent ? kobject_name(parent) : "'none'");

        kobj_kset_leave(kobj);
        kobject_put(parent);
        kobj->parent = NULL;

        return error;
}

/**
 * kobject_set_name_vargs() - Set the name of a kobject.
 * @kobj: struct kobject to set the name of
 * @fmt: format string used to build the name
 * @vargs: vargs to format the string.
 *
 * Nothing is changed when @kobj already has a name and @fmt is NULL.
 * Otherwise the formatted string replaces the previous name, which is
 * released with kfree_const(); every '/' in the new name is turned
 * into '!'.
 *
 * Return: 0 on success, -ENOMEM if an allocation fails.
 */
int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
                                  va_list vargs)
{
        const char *name;

        if (kobj->name && !fmt)
                return 0;

        name = kvasprintf_const(GFP_KERNEL, fmt, vargs);
        if (name == NULL)
                return -ENOMEM;

        /*
         * Some callers hand in names containing '/'.  Those characters
         * have to be rewritten, which needs a really allocated, writable
         * copy because kvasprintf_const may have returned something from
         * .rodata.
         */
        if (strchr(name, '/') != NULL) {
                char *copy = kstrdup(name, GFP_KERNEL);

                kfree_const(name);
                if (copy == NULL)
                        return -ENOMEM;
                name = strreplace(copy, '/', '!');
        }

        kfree_const(kobj->name);
        kobj->name = name;

        return 0;
}

/**
 * kobject_set_name() - Set the name of a kobject.
 * @kobj: struct kobject to set the name of
 * @fmt: format string used to build the name
 *
 * Variadic front end to kobject_set_name_vargs().  Once the kobject has
 * been added to the system its name must be changed with
 * kobject_rename() instead of this function.
 *
 * Return: whatever kobject_set_name_vargs() returns.
 */
int kobject_set_name(struct kobject *kobj, const char *fmt, ...)
{
        va_list ap;
        int ret;

        va_start(ap, fmt);
        ret = kobject_set_name_vargs(kobj, fmt, ap);
        va_end(ap);

        return ret;
}
EXPORT_SYMBOL(kobject_set_name);

/**
 * kobject_init() - Initialize a kobject structure.
 * @kobj: pointer to the kobject to initialize
 * @ktype: pointer to the ktype for this kobject.
 *
 * This function will properly initialize a kobject such that it can then
 * be passed to the kobject_add() call.
 *
 * A NULL @kobj or @ktype is reported with an error message and a stack
 * dump, and nothing is initialized.  Initializing an already initialized
 * kobject is reported the same way but the initialization is still
 * carried out.
 *
 * After this function is called, the kobject MUST be cleaned up by a call
 * to kobject_put(), not by a call to kfree directly to ensure that all of
 * the memory is cleaned up properly.
 */
void kobject_init(struct kobject *kobj, const struct kobj_type *ktype)
{
        const char *err_str;

        if (!kobj) {
                err_str = "invalid kobject pointer!";
                goto error;
        }
        if (!ktype) {
                /* no trailing newline here: the pr_err() below adds it */
                err_str = "must have a ktype to be initialized properly!";
                goto error;
        }
        if (kobj->state_initialized) {
                /* do not error out as sometimes we can recover */
                pr_err("kobject (%p): tried to init an initialized object, something is seriously wrong.\n",
                       kobj);
                dump_stack_lvl(KERN_ERR);
        }

        kobject_init_internal(kobj);
        kobj->ktype = ktype;
        return;

error:
        pr_err("kobject (%p): %s\n", kobj, err_str);
        dump_stack_lvl(KERN_ERR);
}
EXPORT_SYMBOL(kobject_init);

/*
 * Common tail of kobject_add() and kobject_init_and_add(): name @kobj from
 * @fmt/@vargs, record @parent and add the kobject to the hierarchy.
 * Returns the naming error if the name cannot be set, otherwise the result
 * of kobject_add_internal().
 */
static __printf(3, 0) int kobject_add_varg(struct kobject *kobj,
                                           struct kobject *parent,
                                           const char *fmt, va_list vargs)
{
        int ret = kobject_set_name_vargs(kobj, fmt, vargs);

        if (!ret) {
                kobj->parent = parent;
                return kobject_add_internal(kobj);
        }

        pr_err("can not set name properly!\n");
        return ret;
}

/**
 * kobject_add() - The main kobject add function.
 * @kobj: the kobject to add
 * @parent: pointer to the parent of the kobject.
 * @fmt: format to name the kobject with.
 *
 * Names the kobject and adds it to the kobject hierarchy.
 *
 * A non-NULL @parent becomes the parent of @kobj.  With a NULL @parent
 * the kobject of the kset assigned to @kobj is used as parent; if no kset
 * is assigned either, the kobject is located in the root of the sysfs
 * tree.
 *
 * Note, no "add" uevent will be created with this call, the caller should
 * set up all of the necessary sysfs files for the object and then call
 * kobject_uevent() with the UEVENT_ADD parameter to ensure that
 * userspace is properly notified of this kobject's creation.
 *
 * Return: 0 on success or a negative error code; -EINVAL in particular
 *         for a NULL or uninitialized @kobj.
 *
 *         Whether this function fails or succeeds, kobject_put() MUST be
 *         called once the use of the object is finished in order to
 *         properly clean up the memory associated with it.  Under no
 *         instance should the kobject that is passed to this function be
 *         directly freed with a call to kfree(), that can leak memory.
 */
int kobject_add(struct kobject *kobj, struct kobject *parent,
                const char *fmt, ...)
{
        va_list ap;
        int ret;

        if (!kobj)
                return -EINVAL;

        if (kobj->state_initialized) {
                va_start(ap, fmt);
                ret = kobject_add_varg(kobj, parent, fmt, ap);
                va_end(ap);

                return ret;
        }

        pr_err("kobject '%s' (%p): tried to add an uninitialized object, something is seriously wrong.\n",
               kobject_name(kobj), kobj);
        dump_stack_lvl(KERN_ERR);

        return -EINVAL;
}
EXPORT_SYMBOL(kobject_add);

/**
 * kobject_init_and_add() - Initialize a kobject structure and add it to
 *                          the kobject hierarchy.
 * @kobj: pointer to the kobject to initialize
 * @ktype: pointer to the ktype for this kobject.
 * @parent: pointer to the parent of this kobject.
 * @fmt: the name of the kobject.
 *
 * Calls kobject_init() and then names and adds the kobject the same way
 * kobject_add() does.
 *
 * Return: 0 on success or a negative error code.  On error kobject_put()
 * must be called to properly clean up the memory associated with the
 * object; error handling and kobject lifetime rules are the same as after
 * a call to kobject_add().
 */
int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype,
                         struct kobject *parent, const char *fmt, ...)
{
        va_list ap;
        int ret;

        kobject_init(kobj, ktype);

        va_start(ap, fmt);
        ret = kobject_add_varg(kobj, parent, fmt, ap);
        va_end(ap);

        return ret;
}
EXPORT_SYMBOL_GPL(kobject_init_and_add);

/**
 * kobject_rename() - Change the name of an object.
 * @kobj: object in question.
 * @new_name: object's new name
 *
 * It is the responsibility of the caller to provide mutual
 * exclusion between two different calls of kobject_rename
 * on the same kobject and to ensure that new_name is valid and
 * won't conflict with other kobjects.
 *
 * On success the sysfs directory is renamed, the new name is installed in
 * @kobj and a KOBJ_MOVE uevent carrying DEVPATH_OLD=<old path> is sent.
 *
 * Return: 0 on success, -EINVAL if @kobj is NULL or has no parent,
 * -ENOMEM if the old path or a string copy cannot be allocated, or the
 * error returned by sysfs_rename_dir_ns().
 */
int kobject_rename(struct kobject *kobj, const char *new_name)
{
        int error = 0;
        const char *devpath = NULL;
        const char *dup_name = NULL, *name;
        char *devpath_string = NULL;
        char *envp[2];

        kobj = kobject_get(kobj);
        if (!kobj)
                return -EINVAL;
        if (!kobj->parent) {
                kobject_put(kobj);
                return -EINVAL;
        }

        devpath = kobject_get_path(kobj, GFP_KERNEL);
        if (!devpath) {
                error = -ENOMEM;
                goto out;
        }
        /* kasprintf() sizes the buffer itself; no hand-computed length */
        devpath_string = kasprintf(GFP_KERNEL, "DEVPATH_OLD=%s", devpath);
        if (!devpath_string) {
                error = -ENOMEM;
                goto out;
        }
        envp[0] = devpath_string;
        envp[1] = NULL;

        name = dup_name = kstrdup_const(new_name, GFP_KERNEL);
        if (!name) {
                error = -ENOMEM;
                goto out;
        }

        error = sysfs_rename_dir_ns(kobj, new_name, kobject_namespace(kobj));
        if (error)
                goto out;

        /*
         * Install the new kobject name.  dup_name now carries the old
         * name, so the common exit path frees the old name on success and
         * the unused copy of the new name on failure.
         */
        dup_name = kobj->name;
        kobj->name = name;

        /* This function is mostly/only used for network interface.
         * Some hotplug package track interfaces by their name and
         * therefore want to know when the name is changed by the user. */
        kobject_uevent_env(kobj, KOBJ_MOVE, envp);

out:
        kfree_const(dup_name);
        kfree(devpath_string);
        kfree(devpath);
        kobject_put(kobj);

        return error;
}
EXPORT_SYMBOL_GPL(kobject_rename);

/**
 * kobject_move() - Move object to another parent.
 * @kobj: object in question.
 * @new_parent: object's new parent (can be NULL)
 *
 * With a NULL @new_parent the kobject of @kobj's kset, if it has one, is
 * used as the new parent.  On success the sysfs directory is moved, the
 * reference on the old parent is dropped and a KOBJ_MOVE uevent carrying
 * DEVPATH_OLD=<old path> is sent.
 *
 * Return: 0 on success, -EINVAL if @kobj is NULL, -ENOMEM if the old path
 * or the uevent string cannot be allocated, or the error returned by
 * sysfs_move_dir_ns().
 */
int kobject_move(struct kobject *kobj, struct kobject *new_parent)
{
        int error;
        struct kobject *old_parent;
        const char *devpath = NULL;
        char *devpath_string = NULL;
        char *envp[2];

        kobj = kobject_get(kobj);
        if (!kobj)
                return -EINVAL;
        new_parent = kobject_get(new_parent);
        if (!new_parent) {
                if (kobj->kset)
                        new_parent = kobject_get(&kobj->kset->kobj);
        }

        /* old object path */
        devpath = kobject_get_path(kobj, GFP_KERNEL);
        if (!devpath) {
                error = -ENOMEM;
                goto out;
        }
        /* kasprintf() sizes the buffer itself; no hand-computed length */
        devpath_string = kasprintf(GFP_KERNEL, "DEVPATH_OLD=%s", devpath);
        if (!devpath_string) {
                error = -ENOMEM;
                goto out;
        }
        envp[0] = devpath_string;
        envp[1] = NULL;
        error = sysfs_move_dir_ns(kobj, new_parent, kobject_namespace(kobj));
        if (error)
                goto out;
        old_parent = kobj->parent;
        kobj->parent = new_parent;
        /* the reference now belongs to kobj->parent; don't drop it below */
        new_parent = NULL;
        kobject_put(old_parent);
        kobject_uevent_env(kobj, KOBJ_MOVE, envp);
out:
        kobject_put(new_parent);
        kobject_put(kobj);
        kfree(devpath_string);
        kfree(devpath);
        return error;
}
EXPORT_SYMBOL_GPL(kobject_move);

/*
 * Take @kobj out of sysfs and out of its kset: remove the default groups
 * of its ktype, send a "remove" uevent if an "add" was sent without one,
 * remove the sysfs directory, drop the reference on the directory node,
 * leave the kset and clear the parent pointer.  The parent reference
 * itself is not dropped here.
 */
static void __kobject_del(struct kobject *kobj)
{
        struct kernfs_node *sd = kobj->sd;
        const struct kobj_type *ktype = get_ktype(kobj);

        if (ktype != NULL)
                sysfs_remove_groups(kobj, ktype->default_groups);

        /* send "remove" if the caller did not do it but sent "add" */
        if (!kobj->state_add_uevent_sent || kobj->state_remove_uevent_sent) {
                /* nothing to announce */
        } else {
                pr_debug("'%s' (%p): auto cleanup 'remove' event\n",
                         kobject_name(kobj), kobj);
                kobject_uevent(kobj, KOBJ_REMOVE);
        }

        sysfs_remove_dir(kobj);
        sysfs_put(sd);

        kobj->state_in_sysfs = 0;
        kobj_kset_leave(kobj);
        kobj->parent = NULL;
}

/**
 * kobject_del() - Unlink kobject from hierarchy.
 * @kobj: object.
 *
 * This is the function that should be called to delete an object
 * successfully added via kobject_add().  The reference on the parent is
 * dropped after the object has been unlinked.  A NULL @kobj is ignored.
 */
void kobject_del(struct kobject *kobj)
{
        if (kobj) {
                /* __kobject_del() clears kobj->parent, so save it first */
                struct kobject *parent = kobj->parent;

                __kobject_del(kobj);
                kobject_put(parent);
        }
}
EXPORT_SYMBOL(kobject_del);

/**
 * kobject_get() - Increment refcount for object.
 * @kobj: object.
 */
struct kobject *kobject_get(struct kobject *kobj)
{
        if (kobj) {
                if (!kobj->state_initialized)
                        WARN(1, KERN_WARNING
                                "kobject: '%s' (%p): is not initialized, yet kobject_get() is being called.\n",
                             kobject_name(kobj), kobj);
                kref_get(&kobj->kref);
        }
        return kobj;
}
EXPORT_SYMBOL(kobject_get);

/*
 * Take a reference on @kobj through kref_get_unless_zero().
 * Returns @kobj when the reference was taken, NULL when @kobj is NULL or
 * kref_get_unless_zero() refused.
 */
struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
{
        if (kobj && kref_get_unless_zero(&kobj->kref))
                return kobj;

        return NULL;
}
EXPORT_SYMBOL(kobject_get_unless_zero);

/*
 * kobject_cleanup - free kobject resources.
 * @kobj: object to cleanup
 *
 * Removes @kobj from sysfs if that is still pending, calls the release()
 * method of its ktype, frees the name and finally drops the parent
 * reference (only when the sysfs removal happened here).
 */
static void kobject_cleanup(struct kobject *kobj)
{
        const struct kobj_type *ktype = get_ktype(kobj);
        const char *saved_name = kobj->name;
        /* stays NULL unless we do the sysfs removal ourselves below */
        struct kobject *parent_to_put = NULL;

        pr_debug("'%s' (%p): %s, parent %p\n",
                 kobject_name(kobj), kobj, __func__, kobj->parent);

        if (ktype && !ktype->release)
                pr_debug("'%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n",
                         kobject_name(kobj), kobj);

        /* remove from sysfs if the caller did not do it */
        if (kobj->state_in_sysfs) {
                pr_debug("'%s' (%p): auto cleanup kobject_del\n",
                         kobject_name(kobj), kobj);
                /* __kobject_del() clears kobj->parent; remember it */
                parent_to_put = kobj->parent;
                __kobject_del(kobj);
        }

        if (ktype && ktype->release) {
                pr_debug("'%s' (%p): calling ktype release\n",
                         kobject_name(kobj), kobj);
                ktype->release(kobj);
        }

        /* free name if we allocated it */
        if (saved_name) {
                pr_debug("'%s': free name\n", saved_name);
                kfree_const(saved_name);
        }

        kobject_put(parent_to_put);
}

#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
/*
 * Delayed-work handler: map @work back to the kobject embedding it in
 * its 'release' member and clean that kobject up.
 */
static void kobject_delayed_cleanup(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct kobject *kobj = container_of(dwork, struct kobject, release);

        kobject_cleanup(kobj);
}
#endif

/*
 * kref release callback handed to kref_put() by kobject_put().
 *
 * Normally cleans the kobject up right away.  With
 * CONFIG_DEBUG_KOBJECT_RELEASE the cleanup is instead scheduled as delayed
 * work after HZ plus HZ times a random value below 4 jiffies.
 */
static void kobject_release(struct kref *kref)
{
        struct kobject *kobj = container_of(kref, struct kobject, kref);
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
        unsigned long delay = HZ + HZ * get_random_u32_below(4);
        /* delay is unsigned long, hence %lu */
        pr_info("'%s' (%p): %s, parent %p (delayed %lu)\n",
                kobject_name(kobj), kobj, __func__, kobj->parent, delay);
        INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup);

        schedule_delayed_work(&kobj->release, delay);
#else
        kobject_cleanup(kobj);
#endif
}

/**
 * kobject_put() - Decrement refcount for object.
 * @kobj: object.
 *
 * Drops one reference; kobject_release() runs when the count reaches 0.
 * Warns when @kobj has not been initialized, but drops the reference
 * anyway.  A NULL @kobj is ignored.
 */
void kobject_put(struct kobject *kobj)
{
        if (!kobj)
                return;

        if (!kobj->state_initialized)
                WARN(1, KERN_WARNING
                        "kobject: '%s' (%p): is not initialized, yet kobject_put() is being called.\n",
                     kobject_name(kobj), kobj);
        kref_put(&kobj->kref, kobject_release);
}
EXPORT_SYMBOL(kobject_put);

/* release() method of dynamic_kobj_ktype: log the call, then kfree() @kobj. */
static void dynamic_kobj_release(struct kobject *kobj)
{
        pr_debug("(%p): %s\n", kobj, __func__);

        kfree(kobj);
}

/* ktype given to kobjects allocated by kobject_create(). */
static const struct kobj_type dynamic_kobj_ktype = {
        .sysfs_ops = &kobj_sysfs_ops,
        .release = dynamic_kobj_release,
};

/**
 * kobject_create() - Create a struct kobject dynamically.
 *
 * Allocates a zeroed kobject and initializes it with dynamic_kobj_ktype,
 * whose release function frees the structure.
 *
 * Return: the new kobject, or NULL if the allocation failed.  Because
 * kobject_init() has already been called on the returned structure, it
 * must be cleaned up with kobject_put() and not with kfree().
 */
static struct kobject *kobject_create(void)
{
        struct kobject *kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);

        if (kobj)
                kobject_init(kobj, &dynamic_kobj_ktype);

        return kobj;
}

/**
 * kobject_create_and_add() - Create a struct kobject dynamically and
 *                            register it with sysfs.
 * @name: the name for the kobject
 * @parent: the parent kobject of this kobject, if any.
 *
 * This function creates a kobject structure dynamically and registers it
 * with sysfs.  When you are finished with this structure, call
 * kobject_put() and the structure will be dynamically freed when
 * it is no longer being used.
 *
 * If the kobject was not able to be created, NULL will be returned.
 */
struct kobject *kobject_create_and_add(const char *name, struct kobject *parent)
{
        struct kobject *kobj;
        int retval;

        kobj = kobject_create();
        if (!kobj)
                return NULL;

        retval = kobject_add(kobj, parent, "%s", name);
        if (retval) {
                pr_warn("%s: kobject_add error: %d\n", __func__, retval);
                kobject_put(kobj);
                kobj = NULL;
        }
        return kobj;
}
EXPORT_SYMBOL_GPL(kobject_create_and_add);

/**
 * kset_init() - Initialize a kset for use.
 * @k: kset
 *
 * Sets up the kset's member list and its lock and puts the embedded
 * kobject into its initial state.
 */
void kset_init(struct kset *k)
{
        spin_lock_init(&k->list_lock);
        INIT_LIST_HEAD(&k->list);
        kobject_init_internal(&k->kobj);
}

/* default kobject attribute operations */

/*
 * show() of kobj_sysfs_ops: forward to the show() method of the
 * kobj_attribute containing @attr; -EIO when it has none.
 */
static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr,
                              char *buf)
{
        struct kobj_attribute *kattr =
                container_of(attr, struct kobj_attribute, attr);

        if (!kattr->show)
                return -EIO;

        return kattr->show(kobj, kattr, buf);
}

/*
 * store() of kobj_sysfs_ops: forward to the store() method of the
 * kobj_attribute containing @attr; -EIO when it has none.
 */
static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr,
                               const char *buf, size_t count)
{
        struct kobj_attribute *kattr =
                container_of(attr, struct kobj_attribute, attr);

        if (!kattr->store)
                return -EIO;

        return kattr->store(kobj, kattr, buf, count);
}

/* sysfs operations that dispatch to the methods of a struct kobj_attribute */
const struct sysfs_ops kobj_sysfs_ops = {
        .store = kobj_attr_store,
        .show = kobj_attr_show,
};
EXPORT_SYMBOL_GPL(kobj_sysfs_ops);

/**
 * kset_register() - Initialize and add a kset.
 * @k: kset.
 *
 * Initializes @k, adds its embedded kobject to the hierarchy and sends a
 * KOBJ_ADD uevent for it.
 *
 * NOTE: On an add failure, the kset.kobj.name is freed and set to NULL;
 * it can not be used any more.
 *
 * Return: 0 on success, -EINVAL for a NULL @k or a kset whose kobject has
 * no ktype, or the error returned by kobject_add_internal().
 */
int kset_register(struct kset *k)
{
        int err;

        if (!k)
                return -EINVAL;

        if (!k->kobj.ktype) {
                pr_err("must have a ktype to be initialized properly!\n");
                return -EINVAL;
        }

        kset_init(k);

        err = kobject_add_internal(&k->kobj);
        if (!err) {
                kobject_uevent(&k->kobj, KOBJ_ADD);
                return 0;
        }

        kfree_const(k->kobj.name);
        /* Set it to NULL to avoid accessing bad pointer in callers. */
        k->kobj.name = NULL;

        return err;
}
EXPORT_SYMBOL(kset_register);

/**
 * kset_unregister() - Remove a kset.
 * @k: kset.
 *
 * Unlinks the kset's kobject from the hierarchy and drops a reference on
 * it.  A NULL @k is ignored.
 */
void kset_unregister(struct kset *k)
{
        if (k) {
                kobject_del(&k->kobj);
                kobject_put(&k->kobj);
        }
}
EXPORT_SYMBOL(kset_unregister);

/**
 * kset_find_obj() - Search for object in kset.
 * @kset: kset we're looking in.
 * @name: object's name.
 *
 * Walks @kset->list with @kset->list_lock held and stops at the first
 * kobject whose name equals @name.
 *
 * Return: the matching kobject with a reference taken through
 * kobject_get_unless_zero(), or NULL when no name matches or that
 * reference could not be taken.
 */
struct kobject *kset_find_obj(struct kset *kset, const char *name)
{
        struct kobject *found = NULL;
        struct kobject *pos;

        spin_lock(&kset->list_lock);

        list_for_each_entry(pos, &kset->list, entry) {
                const char *pos_name = kobject_name(pos);

                if (!pos_name || strcmp(pos_name, name) != 0)
                        continue;

                found = kobject_get_unless_zero(pos);
                break;
        }

        spin_unlock(&kset->list_lock);

        return found;
}
EXPORT_SYMBOL_GPL(kset_find_obj);

/* release() method of kset_ktype: kfree() the kset embedding @kobj. */
static void kset_release(struct kobject *kobj)
{
        struct kset *owner = container_of(kobj, struct kset, kobj);

        pr_debug("'%s' (%p): %s\n",
                 kobject_name(kobj), kobj, __func__);

        kfree(owner);
}

/*
 * get_ownership() method of kset_ktype: take the ownership from the parent
 * kobject; @uid and @gid are left untouched when there is no parent.
 */
static void kset_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
        const struct kobject *parent = kobj->parent;

        if (!parent)
                return;

        kobject_get_ownership(parent, uid, gid);
}

/* ktype assigned to the embedded kobject of ksets made by kset_create(). */
static const struct kobj_type kset_ktype = {
        .release = kset_release,
        .get_ownership = kset_get_ownership,
        .sysfs_ops = &kobj_sysfs_ops,
};

/**
 * kset_create() - Create a struct kset dynamically.
 *
 * @name: the name for the kset
 * @uevent_ops: a struct kset_uevent_ops for the kset
 * @parent_kobj: the parent kobject of this kset, if any.
 *
 * Allocates a zeroed kset, names its kobject and records @uevent_ops and
 * @parent_kobj.  The structure can then be registered with the system,
 * and show up in sysfs, with a call to kset_register().  When you are
 * finished with it, if kset_register() has been called, call
 * kset_unregister() and the structure will be dynamically freed when it is
 * no longer being used.
 *
 * Return: the new kset, or NULL if the allocation or the naming failed.
 */
static struct kset *kset_create(const char *name,
                                const struct kset_uevent_ops *uevent_ops,
                                struct kobject *parent_kobj)
{
        struct kset *kset = kzalloc(sizeof(*kset), GFP_KERNEL);

        if (!kset)
                return NULL;

        if (kobject_set_name(&kset->kobj, "%s", name)) {
                kfree(kset);
                return NULL;
        }

        kset->uevent_ops = uevent_ops;
        kset->kobj.parent = parent_kobj;

        /*
         * The embedded kobject gets kset_ktype as its type and is a member
         * of no kset itself, so that it can be freed properly once it is
         * no longer used.
         */
        kset->kobj.ktype = &kset_ktype;
        kset->kobj.kset = NULL;

        return kset;
}

/**
 * kset_create_and_add() - Create a struct kset dynamically and add it to sysfs.
 *
 * @name: the name for the kset
 * @uevent_ops: a struct kset_uevent_ops for the kset
 * @parent_kobj: the parent kobject of this kset, if any.
 *
 * Builds a kset with kset_create() and registers it with kset_register().
 * When you are finished with the returned structure, call
 * kset_unregister(); it is freed dynamically when it is no longer being
 * used.
 *
 * Return: the registered kset, or NULL if creation or registration failed.
 */
struct kset *kset_create_and_add(const char *name,
                                 const struct kset_uevent_ops *uevent_ops,
                                 struct kobject *parent_kobj)
{
        struct kset *kset = kset_create(name, uevent_ops, parent_kobj);

        /* A kset that fails to register is freed here and never returned. */
        if (kset && kset_register(kset)) {
                kfree(kset);
                kset = NULL;
        }

        return kset;
}
EXPORT_SYMBOL_GPL(kset_create_and_add);


/* Taken around every access to kobj_ns_ops_tbl[] in this file. */
static DEFINE_SPINLOCK(kobj_ns_type_lock);
/* Registered namespace operations, indexed by enum kobj_ns_type; NULL = none. */
static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES];

/*
 * Install @ops in the table slot selected by @ops->type.
 *
 * Returns 0 on success, -EINVAL if the type is not valid, or -EBUSY if
 * operations are already registered for that type.
 */
int kobj_ns_type_register(const struct kobj_ns_type_operations *ops)
{
        enum kobj_ns_type type = ops->type;
        int error = 0;

        spin_lock(&kobj_ns_type_lock);
        if (!kobj_ns_type_is_valid(type))
                error = -EINVAL;
        else if (kobj_ns_ops_tbl[type])
                error = -EBUSY;
        else
                kobj_ns_ops_tbl[type] = ops;
        spin_unlock(&kobj_ns_type_lock);

        return error;
}

/* Returns 1 if @type is valid and has operations registered, else 0. */
int kobj_ns_type_registered(enum kobj_ns_type type)
{
        int found;

        spin_lock(&kobj_ns_type_lock);
        found = kobj_ns_type_is_valid(type) && kobj_ns_ops_tbl[type] != NULL;
        spin_unlock(&kobj_ns_type_lock);

        return found;
}

/*
 * Ask @parent's ktype which namespace operations apply to its children.
 * Returns NULL when @parent is NULL, has no ktype, or the ktype provides
 * no ->child_ns_type callback.
 */
const struct kobj_ns_type_operations *kobj_child_ns_ops(const struct kobject *parent)
{
        if (!parent || !parent->ktype || !parent->ktype->child_ns_type)
                return NULL;

        return parent->ktype->child_ns_type(parent);
}

/* Namespace operations for @kobj, as determined by its parent. */
const struct kobj_ns_type_operations *kobj_ns_ops(const struct kobject *kobj)
{
        const struct kobject *parent = kobj->parent;

        return kobj_child_ns_ops(parent);
}

bool kobj_ns_current_may_mount(enum kobj_ns_type type)
{
        bool may_mount = true;

        spin_lock(&kobj_ns_type_lock);
        if (kobj_ns_type_is_valid(type) && kobj_ns_ops_tbl[type])
                may_mount = kobj_ns_ops_tbl[type]->current_may_mount();
        spin_unlock(&kobj_ns_type_lock);

        return may_mount;
}

void *kobj_ns_grab_current(enum kobj_ns_type type)
{
        void *ns = NULL;

        spin_lock(&kobj_ns_type_lock);
        if (kobj_ns_type_is_valid(type) && kobj_ns_ops_tbl[type])
                ns = kobj_ns_ops_tbl[type]->grab_current_ns();
        spin_unlock(&kobj_ns_type_lock);

        return ns;
}
EXPORT_SYMBOL_GPL(kobj_ns_grab_current);

const void *kobj_ns_netlink(enum kobj_ns_type type, struct sock *sk)
{
        const void *ns = NULL;

        spin_lock(&kobj_ns_type_lock);
        if (kobj_ns_type_is_valid(type) && kobj_ns_ops_tbl[type])
                ns = kobj_ns_ops_tbl[type]->netlink_ns(sk);
        spin_unlock(&kobj_ns_type_lock);

        return ns;
}

const void *kobj_ns_initial(enum kobj_ns_type type)
{
        const void *ns = NULL;

        spin_lock(&kobj_ns_type_lock);
        if (kobj_ns_type_is_valid(type) && kobj_ns_ops_tbl[type])
                ns = kobj_ns_ops_tbl[type]->initial_ns();
        spin_unlock(&kobj_ns_type_lock);

        return ns;
}

void kobj_ns_drop(enum kobj_ns_type type, void *ns)
{
        spin_lock(&kobj_ns_type_lock);
        if (kobj_ns_type_is_valid(type) &&
            kobj_ns_ops_tbl[type] && kobj_ns_ops_tbl[type]->drop_ns)
                kobj_ns_ops_tbl[type]->drop_ns(ns);
        spin_unlock(&kobj_ns_type_lock);
}
EXPORT_SYMBOL_GPL(kobj_ns_drop);






































   32 









   35 

   37 











   32 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_KERNEL_FPU_CONTEXT_H
#define __X86_KERNEL_FPU_CONTEXT_H

#include <asm/fpu/xstate.h>
#include <asm/trace/fpu.h>

/* Functions related to FPU context tracking */

/*
 * The in-register FPU state for an FPU context on a CPU is assumed to be
 * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
 * matches the FPU.
 *
 * If the FPU register state is valid, the kernel can skip restoring the
 * FPU state from memory.
 *
 * Any code that clobbers the FPU registers or updates the in-memory
 * FPU state for a task MUST let the rest of the kernel know that the
 * FPU registers are no longer valid for this task.
 *
 * Invalidate a resource you control: CPU if using the CPU for something else
 * (with preemption disabled), FPU for the current task, or a task that
 * is prevented from running by the current task.
 */
/*
 * Clear this CPU's fpu_fpregs_owner_ctx, so that fpregs_state_valid()
 * evaluated on this CPU no longer matches any non-NULL FPU context.
 */
static inline void __cpu_invalidate_fpregs_state(void)
{
        __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}

/*
 * Set @fpu->last_cpu to -1 so the last_cpu comparison in
 * fpregs_state_valid() fails for @fpu (assumes -1 never equals a real CPU
 * number -- TODO confirm).
 */
static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
{
        fpu->last_cpu = -1;
}

/*
 * Returns non-zero when @fpu is this CPU's fpu_fpregs_owner_ctx and
 * @fpu->last_cpu equals @cpu; zero otherwise.
 */
static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{
        if (this_cpu_read(fpu_fpregs_owner_ctx) != fpu)
                return 0;

        return fpu->last_cpu == cpu;
}

/*
 * Clear this CPU's fpu_fpregs_owner_ctx and emit the
 * x86_fpu_regs_deactivated trace event for @fpu.
 */
static inline void fpregs_deactivate(struct fpu *fpu)
{
        __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
        trace_x86_fpu_regs_deactivated(fpu);
}

/*
 * Record @fpu as this CPU's fpu_fpregs_owner_ctx and emit the
 * x86_fpu_regs_activated trace event for @fpu.
 */
static inline void fpregs_activate(struct fpu *fpu)
{
        __this_cpu_write(fpu_fpregs_owner_ctx, fpu);
        trace_x86_fpu_regs_activated(fpu);
}

/* Internal helper for switch_fpu_return() and signal frame setup */
static inline void fpregs_restore_userregs(void)
{
        struct fpu *fpu = &current->thread.fpu;
        int cpu = smp_processor_id();

        if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER)))
                return;

        if (!fpregs_state_valid(fpu, cpu)) {
                /*
                 * This restores _all_ xstate which has not been
                 * established yet.
                 *
                 * If PKRU is enabled, then the PKRU value is already
                 * correct because it was either set in switch_to() or in
                 * flush_thread(). So it is excluded because it might be
                 * not up to date in current->thread.fpu.xsave state.
                 *
                 * XFD state is handled in restore_fpregs_from_fpstate().
                 */
                restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE);

                fpregs_activate(fpu);
                fpu->last_cpu = cpu;
        }
        clear_thread_flag(TIF_NEED_FPU_LOAD);
}

#endif



























































































































































































































































































    1 




































































   14 
   15 
   13 
   14 





















    4 








































































































































































































































   15 


   19 























































































































































































































































    3 
    1 















    2 




























    2 






















    2 
























    3 
    2 

































































































































































































































































































    1 











    1 





    1 















































































































    4 


    4 





    2 







    3 





























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* internal.h: mm/ internal definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/tracepoint-defs.h>

struct folio_batch;

/*
 * The set of flags that only affect watermark checking and reclaim
 * behaviour. This is used by the MM to obey the caller constraints
 * about IO, FS and watermark checking while ignoring placement
 * hints such as HIGHMEM usage.
 */
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
                        __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
                        __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
                        __GFP_NOLOCKDEP)

/*
 * The GFP flags allowed during early boot: every bit in __GFP_BITS_MASK
 * except the reclaim, IO and FS bits.
 */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))

/* Control allocation cpuset and node placement constraints */
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)

/*
 * Do not use these with a slab allocator: __GFP_DMA32, __GFP_HIGHMEM and
 * any bit outside __GFP_BITS_MASK.
 */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)

/*
 * Different from WARN_ON_ONCE(), no warning will be issued
 * when we specify __GFP_NOWARN.
 *
 * @cond: condition to test; evaluated exactly once.
 * @gfp:  GFP flags of the allocation.  It is parenthesized in the
 *        expansion so that an expression such as "a | b" is masked as a
 *        whole rather than being split by operator precedence.
 *
 * Evaluates to the truth value of @cond.  WARN_ON(1) fires at most once
 * per expansion site, and only when @gfp lacks __GFP_NOWARN.
 */
#define WARN_ON_ONCE_GFP(cond, gfp)        ({                                \
        static bool __section(".data.once") __warned;                        \
        int __ret_warn_once = !!(cond);                                        \
                                                                        \
        if (unlikely(!((gfp) & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
                __warned = true;                                        \
                WARN_ON(1);                                                \
        }                                                                \
        unlikely(__ret_warn_once);                                        \
})

void page_writeback_init(void);

/*
 * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
 * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
 * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently
 * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
 *
 * FOLIO_PAGES_MAPPED is the mask of all bits below ENTIRELY_MAPPED.
 */
#define ENTIRELY_MAPPED                0x800000
#define FOLIO_PAGES_MAPPED        (ENTIRELY_MAPPED - 1)

/*
 * Flags passed to __show_mem() and show_free_areas() to suppress output in
 * various contexts.
 */
#define SHOW_MEM_FILTER_NODES                (0x0001u)        /* disallowed nodes */

/*
 * Number of individual pages of @folio with an elevated _mapcount: the
 * FOLIO_PAGES_MAPPED bits of folio->_nr_pages_mapped.  The folio's
 * entire_mapcount is not included.
 *
 * Don't use this function outside of debugging code.
 */
static inline int folio_nr_pages_mapped(const struct folio *folio)
{
        int raw = atomic_read(&folio->_nr_pages_mapped);

        return raw & FOLIO_PAGES_MAPPED;
}

/*
 * Given any swap @entry that belongs to @folio, return the folio's first
 * entry by aligning the entry value down to folio_nr_pages(@folio).
 * folio->swap cannot be relied upon here, as there is no guarantee that
 * it has been initialized. Used for calling arch_swap_restore()
 */
static inline swp_entry_t folio_swap(swp_entry_t entry,
                const struct folio *folio)
{
        long nr_pages = folio_nr_pages(folio);
        swp_entry_t first = { .val = ALIGN_DOWN(entry.val, nr_pages) };

        return first;
}

/* Return folio->mapping with the PAGE_MAPPING_FLAGS bits masked off. */
static inline void *folio_raw_mapping(const struct folio *folio)
{
        return (void *)((unsigned long)folio->mapping & ~PAGE_MAPPING_FLAGS);
}

#ifdef CONFIG_MMU

/*
 * Flags for folio_pte_batch(). They select which PTE bits
 * __pte_batch_clear_ignored() clears before PTEs are compared.
 */
typedef int __bitwise fpb_t;

/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
#define FPB_IGNORE_DIRTY                ((__force fpb_t)BIT(0))

/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
#define FPB_IGNORE_SOFT_DIRTY                ((__force fpb_t)BIT(1))

/*
 * Normalise @pte for batch comparison: optionally clear the dirty and
 * soft-dirty bits as selected by @flags, then always mark the PTE old and
 * write-protected.
 */
static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
{
        pte_t cleaned = pte;

        if (flags & FPB_IGNORE_DIRTY)
                cleaned = pte_mkclean(cleaned);
        if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
                cleaned = pte_clear_soft_dirty(cleaned);

        cleaned = pte_mkold(cleaned);
        return pte_wrprotect(cleaned);
}

/**
 * folio_pte_batch - detect a PTE batch for a large folio
 * @folio: The large folio to detect a PTE batch for.
 * @addr: The user virtual address the first page is mapped at.
 * @start_ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @max_nr: The maximum number of table entries to consider.
 * @flags: Flags to modify the PTE batch semantics.
 * @any_writable: Optional pointer to indicate whether any entry except the
 *                  first one is writable.
 * @any_young: Optional pointer to indicate whether any entry except the
 *                  first one is young.
 * @any_dirty: Optional pointer to indicate whether any entry except the
 *                  first one is dirty.
 *
 * Detect a PTE batch: consecutive (present) PTEs that map consecutive
 * pages of the same large folio.
 *
 * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
 * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
 * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
 *
 * start_ptep must map any page of the folio. max_nr must be at least one and
 * must be limited by the caller so scanning cannot exceed a single page table.
 *
 * Each non-NULL @any_writable, @any_young and @any_dirty is reset to false
 * on entry and is only updated for entries accepted into the batch.
 *
 * Return: the number of table entries in the batch.
 */
static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
                pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
                bool *any_writable, bool *any_young, bool *any_dirty)
{
        unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
        const pte_t *end_ptep = start_ptep + max_nr;
        pte_t expected_pte, *ptep;
        bool writable, young, dirty;
        int nr;

        if (any_writable)
                *any_writable = false;
        if (any_young)
                *any_young = false;
        if (any_dirty)
                *any_dirty = false;

        VM_WARN_ON_FOLIO(!pte_present(pte), folio);
        VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
        VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);

        /*
         * Skip the entries covered by the hint for the first PTE. The next
         * entry is expected to equal the first PTE advanced by that many
         * PFNs, with the ignored bits cleared.
         */
        nr = pte_batch_hint(start_ptep, pte);
        expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
        ptep = start_ptep + nr;

        while (ptep < end_ptep) {
                pte = ptep_get(ptep);
                /* Sample the requested bits before they are cleared below. */
                if (any_writable)
                        writable = !!pte_write(pte);
                if (any_young)
                        young = !!pte_young(pte);
                if (any_dirty)
                        dirty = !!pte_dirty(pte);
                pte = __pte_batch_clear_ignored(pte, flags);

                if (!pte_same(pte, expected_pte))
                        break;

                /*
                 * Stop immediately once we reached the end of the folio. In
                 * corner cases the next PFN might fall into a different
                 * folio.
                 */
                if (pte_pfn(pte) >= folio_end_pfn)
                        break;

                /* The entry is part of the batch: fold in its sampled bits. */
                if (any_writable)
                        *any_writable |= writable;
                if (any_young)
                        *any_young |= young;
                if (any_dirty)
                        *any_dirty |= dirty;

                nr = pte_batch_hint(ptep, pte);
                expected_pte = pte_advance_pfn(expected_pte, nr);
                ptep += nr;
        }

        /*
         * ptep advances by pte_batch_hint() entries at a time and may step
         * past end_ptep, so clamp the count to max_nr.
         */
        return min(ptep - start_ptep, max_nr);
}

/**
 * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
 * @pte: The initial pte state; is_swap_pte(pte) must be true and
 *         non_swap_entry() must be false.
 *
 * Builds a swap pte with the same swap type and an offset one higher than
 * @pte, then carries over the soft-dirty, exclusive and uffd-wp swp pte
 * bits if they are set in @pte.
 *
 * Return: the resulting pte.
 */
static inline pte_t pte_next_swp_offset(pte_t pte)
{
        const swp_entry_t cur = pte_to_swp_entry(pte);
        const pgoff_t next_off = swp_offset(cur) + 1;
        pte_t next = __swp_entry_to_pte(__swp_entry(swp_type(cur), next_off));

        if (pte_swp_soft_dirty(pte))
                next = pte_swp_mksoft_dirty(next);
        if (pte_swp_exclusive(pte))
                next = pte_swp_mkexclusive(next);
        if (pte_swp_uffd_wp(pte))
                next = pte_swp_mkuffd_wp(next);

        return next;
}

/**
 * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
 * @start_ptep: Page table pointer for the first entry.
 * @max_nr: The maximum number of table entries to consider.
 * @pte: Page table entry for the first entry.
 *
 * A batch is a run of consecutive (non-present) PTEs holding swap entries
 * of the same swap type with consecutive offsets and matching swp pte
 * bits.
 *
 * max_nr must be at least one and must be limited by the caller so scanning
 * cannot exceed a single page table.
 *
 * Return: the number of table entries in the batch.
 */
static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
{
        pte_t expected_pte = pte_next_swp_offset(pte);
        const pte_t *end_ptep = start_ptep + max_nr;
        pte_t *ptep;

        VM_WARN_ON(max_nr < 1);
        VM_WARN_ON(!is_swap_pte(pte));
        VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));

        /* The first entry is @pte itself; start comparing at the second. */
        for (ptep = start_ptep + 1; ptep < end_ptep; ptep++) {
                if (!pte_same(ptep_get(ptep), expected_pte))
                        break;

                expected_pte = pte_next_swp_offset(expected_pte);
        }

        return ptep - start_ptep;
}
#endif /* CONFIG_MMU */

void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
                                                int nr_throttled);
/*
 * Call __acct_reclaim_writeback() for @folio, but only when the folio's
 * node has a non-zero nr_writeback_throttled count.
 */
static inline void acct_reclaim_writeback(struct folio *folio)
{
        pg_data_t *node = folio_pgdat(folio);
        int throttled = atomic_read(&node->nr_writeback_throttled);

        if (!throttled)
                return;

        __acct_reclaim_writeback(node, folio, throttled);
}

/*
 * Wake the waiters on @pgdat's VMSCAN_THROTTLE_ISOLATED reclaim wait
 * queue, if the queue is active.
 */
static inline void wake_throttle_isolated(pg_data_t *pgdat)
{
        wait_queue_head_t *wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];

        if (!waitqueue_active(wqh))
                return;

        wake_up(wqh);
}

vm_fault_t vmf_anon_prepare(struct vm_fault *vmf);
vm_fault_t do_swap_page(struct vm_fault *vmf);
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
void folio_activate(struct folio *folio);

void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                   struct vm_area_struct *start_vma, unsigned long floor,
                   unsigned long ceiling, bool mm_wr_locked);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);

struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details);

void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
                unsigned int order);
void force_page_cache_ra(struct readahead_control *, unsigned long nr);
/*
 * Convenience wrapper around force_page_cache_ra(): declare a local
 * readahead control "ractl" from @file, @file->f_ra, @mapping and @index
 * via DEFINE_READAHEAD(), then pass it on together with @nr_to_read.
 */
static inline void force_page_cache_readahead(struct address_space *mapping,
                struct file *file, pgoff_t index, unsigned long nr_to_read)
{
        DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
        force_page_cache_ra(&ractl, nr_to_read);
}

unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
void filemap_free_folio(struct address_space *mapping, struct folio *folio);
int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
                loff_t end);
long mapping_evict_folio(struct address_space *mapping, struct folio *folio);
unsigned long mapping_try_invalidate(struct address_space *mapping,
                pgoff_t start, pgoff_t end, unsigned long *nr_failed);

/**
 * folio_evictable - Test whether a folio is evictable.
 * @folio: The folio to test.
 *
 * Test whether @folio is evictable -- i.e., should be placed on
 * active/inactive lists vs unevictable list.
 *
 * Reasons folio might not be evictable:
 * 1. folio's mapping marked unevictable
 * 2. One of the pages in the folio is part of an mlocked VMA
 */
static inline bool folio_evictable(struct folio *folio)
{
        bool ret;

        /* Prevent address_space of inode and swap cache from being freed */
        rcu_read_lock();
        ret = !mapping_unevictable(folio_mapping(folio)) &&
                        !folio_test_mlocked(folio);
        rcu_read_unlock();
        return ret;
}

/*
 * Turn a non-refcounted page (->_refcount == 0) into refcounted with
 * a count of one.
 *
 * Asserts via VM_BUG_ON_PAGE() that @page is not a tail page and that its
 * current reference count is zero.
 */
static inline void set_page_refcounted(struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(page_ref_count(page), page);
        set_page_count(page, 1);
}

/*
 * Return true if a folio needs ->release_folio() calling upon it.
 */
static inline bool folio_needs_release(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);

        return folio_has_private(folio) ||
                (mapping && mapping_release_always(mapping));
}

extern unsigned long highest_memmap_pfn;

/*
 * Maximum number of reclaim retries without progress before the OOM
 * killer is considered the only way forward.
 */
#define MAX_RECLAIM_RETRIES 16

/*
 * in mm/vmscan.c:
 */
bool isolate_lru_page(struct page *page);
bool folio_isolate_lru(struct folio *folio);
void putback_lru_page(struct page *page);
void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);

/*
 * in mm/rmap.c:
 */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);

/*
 * in mm/page_alloc.c
 */
#define K(x) ((x) << (PAGE_SHIFT-10))

extern char * const zone_names[MAX_NR_ZONES];

/* perform sanity checks on struct pages being allocated or freed */
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);

extern int min_free_kbytes;

void setup_per_zone_wmarks(void);
void calculate_min_free_kbytes(void);
int __meminit init_per_zone_wmark_min(void);
void page_alloc_sysctl_init(void);

/*
 * Structure for holding the mostly immutable allocation parameters passed
 * between functions involved in allocations, including the alloc_pages*
 * family of functions.
 *
 * nodemask, migratetype and highest_zoneidx are initialized only once in
 * __alloc_pages() and then never change.
 *
 * zonelist, preferred_zoneref and highest_zoneidx are set first in
 * __alloc_pages() for the fast path, and might be later changed
 * in __alloc_pages_slowpath(). All other functions pass the whole structure
 * by a const pointer.
 *
 * NOTE(review): highest_zoneidx is listed in both groups above; confirm
 * against __alloc_pages() which description applies.
 */
struct alloc_context {
        struct zonelist *zonelist;
        nodemask_t *nodemask;
        struct zoneref *preferred_zoneref;
        int migratetype;

        /*
         * highest_zoneidx represents highest usable zone index of
         * the allocation request. Due to the nature of the zone,
         * memory on lower zone than the highest_zoneidx will be
         * protected by lowmem_reserve[highest_zoneidx].
         *
         * highest_zoneidx is also used by reclaim/compaction to limit
         * the target zone since higher zone than this index cannot be
         * usable for this allocation request.
         */
        enum zone_type highest_zoneidx;
        /* NOTE(review): semantics not visible in this header -- confirm at the users. */
        bool spread_dirty_pages;
};

/*
 * This function returns the order of a free page in the buddy system. In
 * general, page_zone(page)->lock must be held by the caller to prevent the
 * page from being allocated in parallel and returning garbage as the order.
 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
 * page cannot be allocated or merged in parallel. Alternatively, it must
 * handle invalid values gracefully, and use buddy_order_unsafe() below.
 */
static inline unsigned int buddy_order(struct page *page)
{
        /* Checking PageBuddy() beforehand is the caller's responsibility. */
        unsigned int order = page_private(page);

        return order;
}

/*
 * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
 * PageBuddy() should be checked first by the caller to minimize race window,
 * and invalid values must be handled gracefully.
 *
 * READ_ONCE is used so that if the caller assigns the result into a local
 * variable and e.g. tests it for valid range before using, the compiler cannot
 * decide to remove the variable and inline the page_private(page) multiple
 * times, potentially observing different values in the tests and the actual
 * use of the result.
 */
#define buddy_order_unsafe(page)        READ_ONCE(page_private(page))

/*
 * Decide whether @buddy can be coalesced with @page at @order.
 *
 * All of the following must hold:
 * (a) the buddy is not in a hole (the caller checks this beforehand),
 * (b) the buddy is in the buddy system (or is a guard page),
 * (c) the buddy has the same order as the page, and
 * (d) the buddy lives in the same zone as the page.
 *
 * PageBuddy records membership in the buddy system; setting, clearing and
 * testing it is serialized by zone->lock.  The order of a free page is
 * kept in page_private(page).
 */
static inline bool page_is_buddy(struct page *page, struct page *buddy,
                                 unsigned int order)
{
        /* Neither a guard page nor a free buddy page: cannot merge. */
        if (!(page_is_guard(buddy) || PageBuddy(buddy)))
                return false;

        if (order != buddy_order(buddy))
                return false;

        /*
         * The zone comparison comes last so that zone/node ids are not
         * computed for candidates already ruled out above.
         */
        if (page_zone_id(buddy) != page_zone_id(page))
                return false;

        VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

        return true;
}

/*
 * Compute the pfn of the buddy of the order-@order block at @page_pfn.
 *
 * 1) A block B1 of order O has a twin B2 of the same order which
 * satisfies the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, the order 1 buddy of pfn #8 is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) The order O+1 parent P formed by merging the pair satisfies:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER
 *
 * The bit is built from an unsigned long constant so that the shift is
 * carried out in the width of the pfn rather than in (signed) int, which
 * would overflow and sign-extend for large orders.
 *
 * Return: the pfn of the buddy block.
 */
static inline unsigned long
__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
        return page_pfn ^ (1UL << order);
}

/*
 * Look up the buddy of @page and check that it is usable.
 * @page: The input page
 * @pfn: The pfn of @page; passing it in saves a page_to_pfn() call when
 *       this is used from the performance-critical __free_one_page().
 * @order: The order of the page
 * @buddy_pfn: Optional output for the buddy pfn (may be NULL); also saves
 *             a later page_to_pfn() call.
 *
 * The candidate located by pfn arithmetic may turn out not to be PageBuddy,
 * to sit outside @page's zone, or to have a different order than @page, so
 * it is validated before being handed back.
 *
 * Return: the buddy page, or NULL if the candidate failed validation.
 */
static inline struct page *find_buddy_page_pfn(struct page *page,
                        unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
{
        const unsigned long twin_pfn = __find_buddy_pfn(pfn, order);
        struct page *candidate = page + (twin_pfn - pfn);

        /* The pfn is reported even if validation below fails. */
        if (buddy_pfn)
                *buddy_pfn = twin_pfn;

        return page_is_buddy(page, candidate, order) ? candidate : NULL;
}

extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone);

/*
 * Map @start_pfn to its struct page. A zone marked contiguous takes the
 * direct pfn_to_page() shortcut; otherwise the lookup is delegated to
 * __pageblock_pfn_to_page() with the full [@start_pfn, @end_pfn) range.
 */
static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone)
{
        if (!zone->contiguous)
                return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);

        return pfn_to_page(start_pfn);
}

void set_zone_contiguous(struct zone *zone);

/* Mark @zone as not contiguous by clearing its ->contiguous flag. */
static inline void clear_zone_contiguous(struct zone *zone)
{
        zone->contiguous = false;
}

extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __putback_isolated_page(struct page *page, unsigned int order,
                                    int mt);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
                                        unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order);

/*
 * Record @order in a large folio.
 *
 * If @order is zero or @folio is not a large folio, nothing is changed;
 * the only effect is a one-time warning.
 */
static inline void folio_set_order(struct folio *folio, unsigned int order)
{
        unsigned long flags;

        if (WARN_ON_ONCE(!(order && folio_test_large(folio))))
                return;

        /* The order is kept in the low byte of _flags_1. */
        flags = folio->_flags_1;
        flags &= ~0xffUL;
        flags |= order;
        folio->_flags_1 = flags;
#ifdef CONFIG_64BIT
        folio->_folio_nr_pages = 1U << order;
#endif
}

void folio_undo_large_rmappable(struct folio *folio);

/*
 * View @page as a folio and, when it is a large folio, flag it as
 * large-rmappable. A NULL @page is passed straight through.
 */
static inline struct folio *page_rmappable_folio(struct page *page)
{
        struct folio *folio = (struct folio *)page;

        if (!folio || !folio_test_large(folio))
                return folio;

        folio_set_large_rmappable(folio);
        return folio;
}

/*
 * Initialise the head page of a compound page of the given @order, viewed
 * as a folio: record the order, reset the mapcount and pincount fields, and
 * set up the deferred list for orders above 1.
 */
static inline void prep_compound_head(struct page *page, unsigned int order)
{
        struct folio *folio = (struct folio *)page;

        folio_set_order(folio, order);
        /* Both mapcounts start at -1; mapped-page and pin counts start at 0. */
        atomic_set(&folio->_large_mapcount, -1);
        atomic_set(&folio->_entire_mapcount, -1);
        atomic_set(&folio->_nr_pages_mapped, 0);
        atomic_set(&folio->_pincount, 0);
        /* The deferred list is only initialised for order > 1. */
        if (order > 1)
                INIT_LIST_HEAD(&folio->_deferred_list);
}

/*
 * Initialise tail page number @tail_idx of the compound page starting at
 * @head: tag its mapping as TAIL_MAPPING, link it to @head and zero its
 * private field.
 */
static inline void prep_compound_tail(struct page *head, int tail_idx)
{
        struct page *tail = &head[tail_idx];

        tail->mapping = TAIL_MAPPING;
        set_compound_head(tail, head);
        set_page_private(tail, 0);
}

extern void prep_compound_page(struct page *page, unsigned int order);

extern void post_alloc_hook(struct page *page, unsigned int order,
                                        gfp_t gfp_flags);
extern bool free_pages_prepare(struct page *page, unsigned int order);

extern int user_min_free_kbytes;

void free_unref_page(struct page *page, unsigned int order);
void free_unref_folios(struct folio_batch *fbatch);

extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
extern void zone_pcp_init(struct zone *zone);

extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
                          phys_addr_t min_addr,
                          int nid, bool exact_nid);

void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
                unsigned long, enum meminit_context, struct vmem_altmap *, int);

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

/*
 * in mm/compaction.c
 */
/*
 * compact_control is used to track pages being migrated and the free pages
 * they are being migrated to during memory compaction. The free_pfn starts
 * at the end of a zone and migrate_pfn begins at the start. Movable pages
 * are moved to the end of a zone during a compaction run and the run
 * completes when free_pfn <= migrate_pfn
 */
struct compact_control {
        struct list_head freepages[NR_PAGE_ORDERS];        /* List of free pages to migrate to */
        struct list_head migratepages;        /* List of pages being migrated */
        unsigned int nr_freepages;        /* Number of isolated free pages */
        unsigned int nr_migratepages;        /* Number of pages to migrate */
        unsigned long free_pfn;                /* isolate_freepages search base */
        /*
         * Acts as an in/out parameter to page isolation for migration.
         * isolate_migratepages uses it as a search base.
         * isolate_migratepages_block will update the value to the next pfn
         * after the last isolated one.
         */
        unsigned long migrate_pfn;
        unsigned long fast_start_pfn;        /* a pfn to start linear scan from */
        struct zone *zone;
        unsigned long total_migrate_scanned;
        unsigned long total_free_scanned;
        unsigned short fast_search_fail;/* failures to use free list searches */
        short search_order;                /* order to start a fast search at */
        const gfp_t gfp_mask;                /* gfp mask of a direct compactor */
        int order;                        /* order a direct compactor needs */
        int migratetype;                /* migratetype of direct compactor */
        const unsigned int alloc_flags;        /* alloc flags of a direct compactor */
        const int highest_zoneidx;        /* zone index of a direct compactor */
        enum migrate_mode mode;                /* Async or sync migration mode */
        bool ignore_skip_hint;                /* Scan blocks even if marked skip */
        bool no_set_skip_hint;                /* Don't mark blocks for skipping */
        bool ignore_block_suitable;        /* Scan blocks considered unsuitable */
        bool direct_compaction;                /* False from kcompactd or /proc/... */
        bool proactive_compaction;        /* kcompactd proactive compaction */
        bool whole_zone;                /* Whole zone should/has been scanned */
        bool contended;                        /* Signal lock contention */
        bool finish_pageblock;                /* Scan the remainder of a pageblock. Used
                                         * when there are potentially transient
                                         * isolation or migration failures to
                                         * ensure forward progress.
                                         */
        bool alloc_contig;                /* alloc_contig_range allocation */
};

/*
 * Used in direct compaction when a page should be taken from the freelists
 * immediately when one is created during the free path.
 */
struct capture_control {
        struct compact_control *cc;
        struct page *page;
};

unsigned long
isolate_freepages_range(struct compact_control *cc,
                        unsigned long start_pfn, unsigned long end_pfn);
int
isolate_migratepages_range(struct compact_control *cc,
                           unsigned long low_pfn, unsigned long end_pfn);

int __alloc_contig_migrate_range(struct compact_control *cc,
                                        unsigned long start, unsigned long end,
                                        int migratetype);

/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void init_cma_reserved_pageblock(struct page *page);

#endif /* CONFIG_COMPACTION || CONFIG_CMA */

int find_suitable_fallback(struct free_area *area, unsigned int order,
                        int migratetype, bool only_stealable, bool *can_steal);

/* Return true if @area's free list for @migratetype holds no entries. */
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
        return list_empty(&area->free_list[migratetype]);
}

/*
 * These three helpers classify VMAs for virtual memory accounting.
 */

/*
 * Executable code area: VM_EXEC is set while neither VM_WRITE nor the
 * VM_STACK bits are.
 */
static inline bool is_exec_mapping(vm_flags_t flags)
{
        const vm_flags_t relevant = flags & (VM_EXEC | VM_WRITE | VM_STACK);

        return relevant == VM_EXEC;
}

/*
 * Stack area, shadow stacks included.
 *
 * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous, because
 * do_mmap() forbids all other combinations.
 */
static inline bool is_stack_mapping(vm_flags_t flags)
{
        /* Every bit of VM_STACK has to be present for a regular stack. */
        if ((flags & VM_STACK) == VM_STACK)
                return true;

        return (flags & VM_SHADOW_STACK) != 0;
}

/*
 * Data area: VM_WRITE is set while neither VM_SHARED nor the VM_STACK
 * bits are, i.e. private, writable and not a stack.
 */
static inline bool is_data_mapping(vm_flags_t flags)
{
        const vm_flags_t relevant = flags & (VM_WRITE | VM_SHARED | VM_STACK);

        return relevant == VM_WRITE;
}

/* mm/util.c */
struct anon_vma *folio_anon_vma(struct folio *folio);

#ifdef CONFIG_MMU
void unmap_mapping_folio(struct folio *folio);
extern long populate_vma_page_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, int *locked);
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
                unsigned long end, bool write, int *locked);
extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
                               unsigned long bytes);

/*
 * Check whether the address range a folio would occupy in @vma lies
 * entirely inside [start, end), after clamping that range to the VMA.
 *
 * NOTE: this says nothing about whether the folio is "fully mapped", i.e.
 * whether every page of the folio is actually present in the page table
 * for the range. Only the address arithmetic is checked here; a caller
 * that cares about the page table association must walk the page table
 * itself.
 *
 * Typical usage (e.g. mlock or madvise):
 * the caller knows at least one page of the folio is mapped by the VMA and
 * that [start, end) intersects the VMA. It first asks this function whether
 * the folio falls within the range, then inspects the page table to learn
 * whether the folio is fully mapped there.
 */
static inline bool
folio_within_range(struct folio *folio, struct vm_area_struct *vma,
                unsigned long start, unsigned long end)
{
        pgoff_t first_pgoff, folio_addr;
        unsigned long nr_vma_pages = vma_pages(vma);

        VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);
        if (start > end)
                return false;

        /* Clamp [start, end) to the extent of the VMA. */
        start = (start < vma->vm_start) ? vma->vm_start : start;
        end = (end > vma->vm_end) ? vma->vm_end : end;

        first_pgoff = folio_pgoff(folio);

        /* The folio must begin at an offset that the VMA covers. */
        if (!in_range(first_pgoff, vma->vm_pgoff, nr_vma_pages))
                return false;

        folio_addr = vma->vm_start +
                     ((first_pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        return folio_addr >= start && end - folio_addr >= folio_size(folio);
}

/*
 * folio_within_range() applied to the whole of @vma, i.e. with the range
 * [vma->vm_start, vma->vm_end).
 */
static inline bool
folio_within_vma(struct folio *folio, struct vm_area_struct *vma)
{
        return folio_within_range(folio, vma, vma->vm_start, vma->vm_end);
}

/*
 * mlock_vma_folio() and munlock_vma_folio():
 * callers hold the vma's mmap_lock (for read or write) as well as the page
 * table lock of the pte/pmd that is being added or removed.
 *
 * mlock is normally done at the end of folio_add_*_rmap_*() and munlock at
 * the end of folio_remove_rmap_*(); new anon folios are the exception, they
 * are handled by folio_add_lru_vma() calling mlock_new_folio().
 */
void mlock_folio(struct folio *folio);
static inline void mlock_vma_folio(struct folio *folio,
                                struct vm_area_struct *vma)
{
        const unsigned long lock_bits = vma->vm_flags & (VM_LOCKED | VM_SPECIAL);

        /*
         * Requiring VM_LOCKED without any VM_SPECIAL bit does two things:
         * 1) the VM_IO check keeps migration from double-counting during
         *    mlock;
         * 2) mmap_region() and mlock_fixup() make sure VM_LOCKED is never
         *    left set on a VM_SPECIAL vma, but while file->f_op->mmap() is
         *    using vm_insert_page(s) there is a window in which VM_LOCKED
         *    may still be set as VM_SPECIAL bits get added: ignore it then.
         */
        if (unlikely(lock_bits == VM_LOCKED))
                mlock_folio(folio);
}

void munlock_folio(struct folio *folio);
static inline void munlock_vma_folio(struct folio *folio,
                                        struct vm_area_struct *vma)
{
        const bool vma_locked = vma->vm_flags & VM_LOCKED;

        /*
         * Being called at all is taken as the signal to munlock. Ideally
         * munlock would happen only when a page of the folio got unmapped
         * from the VMA, leaving the folio no longer fully mapped there.
         *
         * That situation is not easy to confirm, though, so the folio is
         * always munlocked and page reclaim corrects it if that was wrong.
         */
        if (unlikely(vma_locked))
                munlock_folio(folio);
}

void mlock_new_folio(struct folio *folio);
bool need_mlock_drain(int cpu);
void mlock_drain_local(void);
void mlock_drain_remote(int cpu);

extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);

/**
 * vma_address - Find the virtual address a page range is mapped at
 * @vma: The vma which maps this object.
 * @pgoff: The page offset within its object.
 * @nr_pages: The number of pages to consider.
 *
 * Return: the first address at which any page of the range appears in
 * this VMA, or -EFAULT if no page of the range is mapped by it.
 */
static inline unsigned long vma_address(struct vm_area_struct *vma,
                pgoff_t pgoff, unsigned long nr_pages)
{
        unsigned long address;

        if (pgoff < vma->vm_pgoff) {
                /*
                 * The range begins before the vma; it overlaps only if its
                 * last page reaches vm_pgoff. Having tested pgoff against
                 * vm_pgoff first avoids the possibility of wrap to 0 on
                 * 32-bit.
                 */
                if (pgoff + nr_pages - 1 >= vma->vm_pgoff)
                        return vma->vm_start;
                return -EFAULT;
        }

        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
        /* Beyond the vma, or wrapped through 0? */
        if (address < vma->vm_start || address >= vma->vm_end)
                return -EFAULT;

        return address;
}

/*
 * Companion to vma_address(): the first user virtual address in the vma at
 * which none of the range is found any more.
 * Assumes that vma_address() already returned a good starting address.
 */
static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
{
        struct vm_area_struct *vma = pvmw->vma;
        pgoff_t end_pgoff;
        unsigned long end_addr;

        /* Common case; also, ->pgoff is invalid for KSM */
        if (pvmw->nr_pages == 1)
                return pvmw->address + PAGE_SIZE;

        end_pgoff = pvmw->pgoff + pvmw->nr_pages;
        end_addr = vma->vm_start + ((end_pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        /* Beyond the vma, or wrapped through 0? Then stop at the vma end. */
        if (end_addr < vma->vm_start || end_addr > vma->vm_end)
                return vma->vm_end;

        return end_addr;
}

/*
 * Possibly pin the faulting vma's file and release the fault lock.
 *
 * If @fpin is already set it is returned unchanged. Otherwise the file is
 * pinned and the fault lock released only when this is the first attempt
 * with FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT is not set.
 *
 * Return: the pinned file, or the (NULL) @fpin when nothing was done.
 */
static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
                                                    struct file *fpin)
{
        int flags = vmf->flags;

        if (fpin)
                return fpin;

        /* Dropping the lock is only an option on a first, retryable attempt. */
        if (!fault_flag_allow_retry_first(flags))
                return fpin;

        /*
         * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks
         * or anything, so leave the lock alone in that case.
         */
        if (flags & FAULT_FLAG_RETRY_NOWAIT)
                return fpin;

        fpin = get_file(vmf->vma->vm_file);
        release_fault_lock(vmf);
        return fpin;
}
#else /* !CONFIG_MMU */
static inline void unmap_mapping_folio(struct folio *folio) { }
static inline void mlock_new_folio(struct folio *folio) { }
static inline bool need_mlock_drain(int cpu) { return false; }
static inline void mlock_drain_local(void) { }
static inline void mlock_drain_remote(int cpu) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
#endif /* !CONFIG_MMU */

/* Memory initialisation debug and verification */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
DECLARE_STATIC_KEY_TRUE(deferred_pages);

bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

enum mminit_level {
        MMINIT_WARNING,
        MMINIT_VERIFY,
        MMINIT_TRACE
};

#ifdef CONFIG_DEBUG_MEMORY_INIT

extern int mminit_loglevel;

/*
 * Emit an "mminit::<prefix> " message if @level is below mminit_loglevel.
 * Levels up to MMINIT_WARNING go through pr_warn(), everything else is
 * printed at KERN_DEBUG.
 *
 * @prefix and @fmt are pasted into the format string and therefore must be
 * string literals. @level is parenthesized so that any expression can be
 * passed safely; note that it is evaluated more than once.
 */
#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
        if ((level) < mminit_loglevel) { \
                if ((level) <= MMINIT_WARNING) \
                        pr_warn("mminit::" prefix " " fmt, ##arg); \
                else \
                        printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
        } \
} while (0)

extern void mminit_verify_pageflags_layout(void);
extern void mminit_verify_zonelist(void);
#else

static inline void mminit_dprintk(enum mminit_level level,
                                const char *prefix, const char *fmt, ...)
{
}

static inline void mminit_verify_pageflags_layout(void)
{
}

static inline void mminit_verify_zonelist(void)
{
}
#endif /* CONFIG_DEBUG_MEMORY_INIT */

#define NODE_RECLAIM_NOSCAN        -2
#define NODE_RECLAIM_FULL        -1
#define NODE_RECLAIM_SOME        0
#define NODE_RECLAIM_SUCCESS        1

#ifdef CONFIG_NUMA
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
                                unsigned int order)
{
        return NODE_RECLAIM_NOSCAN;
}
static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
{
        return NUMA_NO_NODE;
}
#endif

/*
 * mm/memory-failure.c
 */
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);

extern u32 hwpoison_filter_dev_major;
extern u32 hwpoison_filter_dev_minor;
extern u64 hwpoison_filter_flags_mask;
extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;

extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

extern void set_pageblock_order(void);
unsigned long reclaim_pages(struct list_head *folio_list);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
                                            struct list_head *folio_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN                WMARK_MIN
#define ALLOC_WMARK_LOW                WMARK_LOW
#define ALLOC_WMARK_HIGH        WMARK_HIGH
#define ALLOC_NO_WATERMARKS        0x04 /* don't check watermarks at all */

/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK        (ALLOC_NO_WATERMARKS-1)

/*
 * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
 * cannot assume a reduced access to memory reserves is sufficient for
 * !MMU
 */
#ifdef CONFIG_MMU
#define ALLOC_OOM                0x08
#else
#define ALLOC_OOM                ALLOC_NO_WATERMARKS
#endif

#define ALLOC_NON_BLOCK                 0x10 /* Caller cannot block. Allow access
                                       * to 25% of the min watermark or
                                       * 62.5% if __GFP_HIGH is set.
                                       */
#define ALLOC_MIN_RESERVE         0x20 /* __GFP_HIGH set. Allow access to 50%
                                       * of the min watermark.
                                       */
#define ALLOC_CPUSET                 0x40 /* check for correct cpuset */
#define ALLOC_CMA                 0x80 /* allow allocations from CMA areas */
#ifdef CONFIG_ZONE_DMA32
#define ALLOC_NOFRAGMENT        0x100 /* avoid mixing pageblock types */
#else
#define ALLOC_NOFRAGMENT          0x0
#endif
#define ALLOC_HIGHATOMIC        0x200 /* Allows access to MIGRATE_HIGHATOMIC */
#define ALLOC_KSWAPD                0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */

/* Flags that allow allocations below the min watermark. */
#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)

enum ttu_flags;
struct tlbflush_unmap_batch;


/*
 * only for MM internal work items which do not depend on
 * any allocations or locks which might depend on allocations
 */
extern struct workqueue_struct *mm_percpu_wq;

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
void flush_tlb_batched_pending(struct mm_struct *mm);
#else
static inline void try_to_unmap_flush(void)
{
}
static inline void try_to_unmap_flush_dirty(void)
{
}
static inline void flush_tlb_batched_pending(struct mm_struct *mm)
{
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags pagetype_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];

/* Return true if @migratetype is MIGRATE_HIGHATOMIC. */
static inline bool is_migrate_highatomic(enum migratetype migratetype)
{
        return migratetype == MIGRATE_HIGHATOMIC;
}

void setup_zone_pageset(struct zone *zone);

struct migration_target_control {
        int nid;                /* preferred node id */
        nodemask_t *nmask;
        gfp_t gfp_mask;
        enum migrate_reason reason;
};

/*
 * mm/filemap.c
 */
size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
                              struct folio *folio, loff_t fpos, size_t size);

/*
 * mm/vmalloc.c
 */
#ifdef CONFIG_MMU
void __init vmalloc_init(void);
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift);
#else
static inline void vmalloc_init(void)
{
}

static inline
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        return -EINVAL;
}
#endif

int __must_check __vmap_pages_range_noflush(unsigned long addr,
                               unsigned long end, pgprot_t prot,
                               struct page **pages, unsigned int page_shift);

void vunmap_range_noflush(unsigned long start, unsigned long end);

void __vunmap_range_noflush(unsigned long start, unsigned long end);

int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
                      unsigned long addr, int page_nid, int *flags);

void free_zone_device_folio(struct folio *folio);
int migrate_device_coherent_page(struct page *page);

/*
 * mm/gup.c
 */
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
int __must_check try_grab_page(struct page *page, unsigned int flags);

/*
 * mm/huge_memory.c
 */
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
               pud_t *pud, bool write);
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
               pmd_t *pmd, bool write);

/*
 * mm/mmap.c
 */
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
                                        struct vm_area_struct *vma,
                                        unsigned long delta);

enum {
        /* mark page accessed */
        FOLL_TOUCH = 1 << 16,
        /* a retry, previous pass started an IO */
        FOLL_TRIED = 1 << 17,
        /* we are working on non-current tsk/mm */
        FOLL_REMOTE = 1 << 18,
        /* pages must be released via unpin_user_page */
        FOLL_PIN = 1 << 19,
        /* gup_fast: prevent fall-back to slow gup */
        FOLL_FAST_ONLY = 1 << 20,
        /* allow unlocking the mmap lock */
        FOLL_UNLOCKABLE = 1 << 21,
        /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
        FOLL_MADV_POPULATE = 1 << 22,
};

#define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
                            FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \
                            FOLL_MADV_POPULATE)

/*
 * For a page that is write-protected in the page table, tell whether GUP
 * has to trigger unsharing via FAULT_FLAG_UNSHARE so that the GUP pin stays
 * consistent with the pages mapped into the page tables of the MM.
 *
 * Temporary unmapping of PageAnonExclusive() pages or clearing of
 * PageAnonExclusive() has to protect against concurrent GUP:
 * * Ordinary GUP: Using the PT lock
 * * GUP-fast and fork(): mm->write_protect_seq
 * * GUP-fast and KSM or temporary unmapping (swap, migration): see
 *    folio_try_share_anon_rmap_*()
 *
 * @page must be the (sub)page actually referenced by the page table entry,
 * which is not necessarily the head page for a PTE-mapped THP.
 *
 * A NULL @vma means the call comes from the GUP-fast path, which might have
 * to fall back to the slow path just to look up the vma.
 */
static inline bool gup_must_unshare(struct vm_area_struct *vma,
                                    unsigned int flags, struct page *page)
{
        /*
         * Only FOLL_PIN without FOLL_WRITE is of interest. FOLL_WRITE is
         * implicitly handled correctly as the page table entry has to be
         * writable -- and if it references (part of) an anonymous folio,
         * that part is required to be marked exclusive.
         */
        if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN)
                return false;

        /*
         * Note: PageAnon(page) is stable until the page is actually getting
         * freed.
         */
        if (PageAnon(page)) {
                /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */
                if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                        smp_rmb();

                /*
                 * Note that PageKsm() pages cannot be exclusive, and
                 * consequently, cannot get pinned.
                 */
                return !PageAnonExclusive(page);
        }

        /*
         * Not anonymous: only R/O long-term pinning matters, since R/O
         * short-term pinning does not have the semantics to observe
         * successive changes through the process page tables.
         */
        if (!(flags & FOLL_LONGTERM))
                return false;

        /* The vma is really needed from here on ... */
        if (!vma)
                return true;

        /*
         * ... because only writable private ("COW") mappings, where COW has
         * to be broken early, are of interest.
         */
        return is_cow_mapping(vma->vm_flags);
}

extern bool mirrored_kernelcore;
extern bool memblock_has_mirror(void);

/*
 * Set the address range [@start, @end) and the page offset @pgoff of @vma.
 * Only these three fields are written here.
 */
static __always_inline void vma_set_range(struct vm_area_struct *vma,
                                          unsigned long start, unsigned long end,
                                          pgoff_t pgoff)
{
        vma->vm_start = start;
        vma->vm_end = end;
        vma->vm_pgoff = pgoff;
}

/* Report whether soft-dirty tracking is enabled for @vma. */
static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
{
        /*
         * The config test has to come first: without soft-dirty compiled
         * in, VM_SOFTDIRTY is defined as 0x0, so the flag test on its own
         * would be constantly true.
         *
         * Soft-dirty is kind of special: tracking is enabled while the vma
         * flag is NOT set.
         */
        return IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) &&
               !(vma->vm_flags & VM_SOFTDIRTY);
}

/*
 * vma_iter_config() - point the iterator's maple state at a range.
 * @vmi: VMA iterator whose embedded maple state is updated
 * @index: start of the range, passed through unchanged
 * @last: end of the range; note that last - 1 is what gets passed down
 *
 * Thin wrapper around __mas_set_range().
 */
static inline void vma_iter_config(struct vma_iterator *vmi,
                unsigned long index, unsigned long last)
{
        __mas_set_range(&vmi->mas, index, last - 1);
}

/*
 * vma_iter_reset() - reset the maple state embedded in @vmi.
 * @vmi: VMA iterator
 *
 * Thin wrapper around mas_reset().
 */
static inline void vma_iter_reset(struct vma_iterator *vmi)
{
        mas_reset(&vmi->mas);
}

/*
 * vma_iter_prev_range_limit() - step the iterator to the previous range.
 * @vmi: VMA iterator
 * @min: lower limit handed to mas_prev_range()
 *
 * Return: the mas_prev_range() result.
 * NOTE(review): presumably the VMA stored in that range, or NULL when the
 * range is empty or the limit is reached - confirm against the maple tree API.
 */
static inline
struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min)
{
        return mas_prev_range(&vmi->mas, min);
}

/*
 * vma_iter_next_range_limit() - step the iterator to the next range.
 * @vmi: VMA iterator
 * @max: upper limit handed to mas_next_range()
 *
 * Return: the mas_next_range() result.
 * NOTE(review): presumably the VMA stored in that range, or NULL when the
 * range is empty or the limit is reached - confirm against the maple tree API.
 */
static inline
struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max)
{
        return mas_next_range(&vmi->mas, max);
}

/*
 * vma_iter_area_lowest() - search for an empty area using mas_empty_area().
 * @vmi: VMA iterator
 * @min: lower bound of the search window
 * @max: upper bound of the search window; max - 1 is what gets passed down
 * @size: size of the area wanted
 *
 * Return: the mas_empty_area() result, unchanged.
 * NOTE(review): the name suggests the lowest suitable gap is chosen; that is
 * decided inside mas_empty_area() - confirm there.
 */
static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
                                       unsigned long max, unsigned long size)
{
        return mas_empty_area(&vmi->mas, min, max - 1, size);
}

/*
 * vma_iter_area_highest() - search for an empty area using
 * mas_empty_area_rev().
 * @vmi: VMA iterator
 * @min: lower bound of the search window
 * @max: upper bound of the search window; max - 1 is what gets passed down
 * @size: size of the area wanted
 *
 * Return: the mas_empty_area_rev() result, unchanged.
 * NOTE(review): the name suggests the highest suitable gap is chosen; that
 * is decided inside mas_empty_area_rev() - confirm there.
 */
static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
                                        unsigned long max, unsigned long size)
{
        return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
}

/*
 * VMA Iterator functions shared between nommu and mmap
 */
/*
 * vma_iter_prealloc() - call mas_preallocate() for @vma with GFP_KERNEL.
 * @vmi: VMA iterator whose maple state is used
 * @vma: entry the preallocation is made for
 *
 * Return: the mas_preallocate() result, unchanged.
 */
static inline int vma_iter_prealloc(struct vma_iterator *vmi,
                struct vm_area_struct *vma)
{
        return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
}

/*
 * vma_iter_clear() - store NULL at the position the maple state is set to.
 * @vmi: VMA iterator
 *
 * Uses mas_store_prealloc().
 * NOTE(review): presumably requires an earlier preallocation - confirm.
 */
static inline void vma_iter_clear(struct vma_iterator *vmi)
{
        mas_store_prealloc(&vmi->mas, NULL);
}

/*
 * vma_iter_load() - walk the maple state and return what mas_walk() finds.
 * @vmi: VMA iterator
 *
 * Return: the mas_walk() result, typed as a VMA pointer.
 */
static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
{
        return mas_walk(&vmi->mas);
}

/*
 * vma_iter_store() - store a VMA with preallocated memory.
 * @vmi: VMA iterator whose maple state is used for the store
 * @vma: VMA to store over [vma->vm_start, vma->vm_end - 1]
 *
 * The store itself goes through mas_store_prealloc() and nothing is returned.
 * With CONFIG_DEBUG_VM_MAPLE_TREE set, a warning is printed when an already
 * positioned iterator does not cover vma->vm_start.
 */
static inline void vma_iter_store(struct vma_iterator *vmi,
                                  struct vm_area_struct *vma)
{

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
        /* Debug only: iterator range starts after the VMA start. */
        if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
                        vmi->mas.index > vma->vm_start)) {
                pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
                        vmi->mas.index, vma->vm_start, vma->vm_start,
                        vma->vm_end, vmi->mas.index, vmi->mas.last);
        }
        /* Debug only: iterator range ends before the VMA start. */
        if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
                        vmi->mas.last <  vma->vm_start)) {
                pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
                       vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
                       vmi->mas.index, vmi->mas.last);
        }
#endif

        /*
         * A positioned iterator whose [index, last] range does not contain
         * vma->vm_start is invalidated before the new range is set.
         */
        if (vmi->mas.status != ma_start &&
            ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
                vma_iter_invalidate(vmi);

        /* vm_end - 1 is passed as the last index of the stored range. */
        __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
        mas_store_prealloc(&vmi->mas, vma);
}

/*
 * vma_iter_store_gfp() - store a VMA, passing @gfp on to mas_store_gfp().
 * @vmi: VMA iterator whose maple state is used for the store
 * @vma: VMA to store over [vma->vm_start, vma->vm_end - 1]
 * @gfp: allocation flags forwarded to mas_store_gfp()
 *
 * Return: 0 on success, -ENOMEM when the maple state reports an error after
 * the store.
 */
static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
                        struct vm_area_struct *vma, gfp_t gfp)
{
        /*
         * A positioned iterator whose [index, last] range does not contain
         * vma->vm_start is invalidated before the new range is set.
         */
        if (vmi->mas.status != ma_start) {
                if (vmi->mas.index > vma->vm_start ||
                    vmi->mas.last < vma->vm_start)
                        vma_iter_invalidate(vmi);
        }

        /* vm_end - 1 is passed as the last index of the stored range. */
        __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
        mas_store_gfp(&vmi->mas, vma, gfp);

        return unlikely(mas_is_err(&vmi->mas)) ? -ENOMEM : 0;
}

/*
 * VMA lock generalization
 *
 * Bundle of pointers describing one VMA modification.  The struct is only
 * declared here; the code that fills and consumes it is not part of this
 * header.
 * NOTE(review): field meanings below are inferred from the names only -
 * confirm against the users of struct vma_prepare.
 */
struct vma_prepare {
        /* presumably the primary VMA being modified */
        struct vm_area_struct *vma;
        /* presumably a neighbouring VMA whose boundary is adjusted */
        struct vm_area_struct *adj_next;
        /* presumably the backing file and its address_space, if any */
        struct file *file;
        struct address_space *mapping;
        /* presumably the anon_vma involved in the operation, if any */
        struct anon_vma *anon_vma;
        /* presumably a VMA to insert as part of the operation */
        struct vm_area_struct *insert;
        /* presumably up to two VMAs to remove as part of the operation */
        struct vm_area_struct *remove;
        struct vm_area_struct *remove2;
};

void __meminit __init_single_page(struct page *page, unsigned long pfn,
                                unsigned long zone, int nid);

/* shrinker related functions */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
                          int priority);

#ifdef CONFIG_64BIT
/*
 * can_do_mseal() - validate the flags argument of an mseal request.
 * @flags: caller-supplied flags
 *
 * Return: 0 when @flags is zero, -EINVAL for any non-zero value.
 */
static inline int can_do_mseal(unsigned long flags)
{
        return flags ? -EINVAL : 0;
}

/*
 * Out-of-line checks, declared only in the CONFIG_64BIT branch; when
 * CONFIG_64BIT is not set, inline stubs that always return true are used
 * instead.
 * NOTE(review): presumably these report whether [start, end) of @mm may be
 * modified (e.g. is not sealed) - confirm against the implementation.
 */
bool can_modify_mm(struct mm_struct *mm, unsigned long start,
                unsigned long end);
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
                unsigned long end, int behavior);
#else
/*
 * can_do_mseal() - stub for builds without CONFIG_64BIT.
 * @flags: ignored
 *
 * Return: always -EPERM.
 */
static inline int can_do_mseal(unsigned long flags)
{
        return -EPERM;
}

/*
 * can_modify_mm() - stub for builds without CONFIG_64BIT.
 * @mm: ignored
 * @start: ignored
 * @end: ignored
 *
 * Return: always true, i.e. this check never blocks a modification.
 */
static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start,
                unsigned long end)
{
        return true;
}

/*
 * can_modify_mm_madv() - stub for builds without CONFIG_64BIT.
 * @mm: ignored
 * @start: ignored
 * @end: ignored
 * @behavior: ignored
 *
 * Return: always true, i.e. this check never blocks a modification.
 */
static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
                unsigned long end, int behavior)
{
        return true;
}
#endif

#ifdef CONFIG_SHRINKER_DEBUG
/*
 * shrinker_debugfs_name_alloc() - format and attach a name to a shrinker.
 * @shrinker: shrinker whose ->name is set
 * @fmt: printf-style format string
 * @ap: arguments for @fmt
 *
 * The name is produced by kvasprintf_const() with GFP_KERNEL and stored in
 * shrinker->name even when the result is NULL.
 *
 * Return: 0 on success, -ENOMEM when no name could be produced.
 */
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
                        struct shrinker *shrinker, const char *fmt, va_list ap)
{
        const char *name = kvasprintf_const(GFP_KERNEL, fmt, ap);

        shrinker->name = name;
        if (!name)
                return -ENOMEM;

        return 0;
}

/*
 * shrinker_debugfs_name_free() - release the name attached to a shrinker.
 * @shrinker: shrinker whose ->name is released
 *
 * Frees the name with kfree_const() and clears the pointer so that it is
 * not left dangling.
 */
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
        kfree_const(shrinker->name);
        shrinker->name = NULL;
}

/*
 * Shrinker debugfs hooks, declared only when CONFIG_SHRINKER_DEBUG is set;
 * otherwise the inline no-op stubs in the #else branch are used.  The
 * implementations are not part of this header.
 */
extern int shrinker_debugfs_add(struct shrinker *shrinker);
extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
                                              int *debugfs_id);
extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
                                    int debugfs_id);
#else /* CONFIG_SHRINKER_DEBUG */
/* Stub without CONFIG_SHRINKER_DEBUG: does nothing and reports success (0). */
static inline int shrinker_debugfs_add(struct shrinker *shrinker)
{
        return 0;
}
/*
 * Stub without CONFIG_SHRINKER_DEBUG: no name is formatted or stored and
 * success (0) is reported.
 */
static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
                                              const char *fmt, va_list ap)
{
        return 0;
}
/* Stub without CONFIG_SHRINKER_DEBUG: nothing was allocated, nothing to free. */
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
}
/*
 * Stub without CONFIG_SHRINKER_DEBUG: reports "no debugfs entry" by writing
 * -1 to *@debugfs_id and returning NULL.  @debugfs_id must be a valid
 * pointer, it is written unconditionally.
 */
static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
                                                     int *debugfs_id)
{
        *debugfs_id = -1;
        return NULL;
}
/* Stub without CONFIG_SHRINKER_DEBUG: there is no debugfs entry to remove. */
static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
                                           int debugfs_id)
{
}
#endif /* CONFIG_SHRINKER_DEBUG */

/* Only track the nodes of mappings with shadow entries */
void workingset_update_node(struct xa_node *node);
extern struct list_lru shadow_nodes;

#endif        /* __MM_INTERNAL_H */










    1 
















































    1 




    1 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM csd

#if !defined(_TRACE_CSD_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CSD_H

#include <linux/tracepoint.h>

/*
 * csd_queue_cpu - trace event carrying a CPU number, a call site, a callback
 * and a csd pointer.
 *
 * The call site arrives as an unsigned long and is recorded as a pointer so
 * that it can be printed with %pS; the callback is printed with %ps and the
 * csd with %p.
 * NOTE(review): the name suggests this fires when a csd is queued for @cpu;
 * the emitting code is not part of this header - confirm there.
 */
TRACE_EVENT(csd_queue_cpu,

        TP_PROTO(const unsigned int cpu,
                unsigned long callsite,
                smp_call_func_t func,
                call_single_data_t *csd),

        TP_ARGS(cpu, callsite, func, csd),

        TP_STRUCT__entry(
                __field(unsigned int, cpu)
                __field(void *, callsite)
                __field(void *, func)
                __field(void *, csd)
                ),

            TP_fast_assign(
                __entry->cpu = cpu;
                __entry->callsite = (void *)callsite;
                __entry->func = func;
                __entry->csd  = csd;
                ),

        TP_printk("cpu=%u callsite=%pS func=%ps csd=%p",
                __entry->cpu, __entry->callsite, __entry->func, __entry->csd)
        );

/*
 * Tracepoints for a function which is called as an effect of
 * smp_call_function.*
 *
 * Event class shared by the csd_function_* events: each event records the
 * callback pointer (printed with %ps) and the csd pointer (printed with %p).
 */
DECLARE_EVENT_CLASS(csd_function,

        TP_PROTO(smp_call_func_t func, call_single_data_t *csd),

        TP_ARGS(func, csd),

        TP_STRUCT__entry(
                __field(void *,        func)
                __field(void *,        csd)
        ),

        TP_fast_assign(
                __entry->func        = func;
                __entry->csd        = csd;
        ),

        TP_printk("func=%ps, csd=%p", __entry->func, __entry->csd)
);

/*
 * csd_function_entry - instance of the csd_function event class.
 * NOTE(review): by its name, emitted just before the callback runs - confirm
 * at the call site.
 */
DEFINE_EVENT(csd_function, csd_function_entry,
        TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
        TP_ARGS(func, csd)
);

/*
 * csd_function_exit - instance of the csd_function event class.
 * NOTE(review): by its name, emitted right after the callback returns -
 * confirm at the call site.
 */
DEFINE_EVENT(csd_function, csd_function_exit,
        TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
        TP_ARGS(func, csd)
);

#endif /* _TRACE_CSD_H */

/* This part must be outside protection */
#include <trace/define_trace.h>













































































    1 




    1 



























    1 




































    1 








    1 
    1 





























































    1 






    1 

































    1 



































    1 









































    1 

    1 













    1 









































































    1 









    1 














    1 


























    1 




















































    1 











    1 








































    1 


























    1 

    1 

    1 


    1 








    1 





    1 
    1 






















    1 






    1 





    1 





    1 



































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/indirect.c
 *
 *  from
 *
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *        (sct@redhat.com), 1993, 1998
 */

#include "ext4_jbd2.h"
#include "truncate.h"
#include <linux/dax.h>
#include <linux/uio.h>

#include <trace/events/ext4.h>

/*
 * One element ("triple") of a chain of block pointers leading from the inode
 * to a block, as filled in by add_chain().
 */
typedef struct {
        /* address the block number was read from (inode i_data or bh->b_data) */
        __le32        *p;
        /* the block number as stored, i.e. little-endian 32-bit */
        __le32        key;
        /* buffer hosting *p; NULL when *p lives in the inode itself */
        struct buffer_head *bh;
} Indirect;

/*
 * add_chain - fill one chain element from the pointer slot at @v.
 * @p: chain element to fill
 * @bh: buffer hosting @v, or NULL
 * @v: address of the stored block number
 *
 * Records the slot address, a snapshot of the value currently stored in the
 * slot, and the hosting buffer.
 */
static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
        p->p = v;
        p->key = *v;
        p->bh = bh;
}

/**
 *        ext4_block_to_path - parse the block number into array of offsets
 *        @inode: inode in question (we are only interested in its superblock)
 *        @i_block: block number to be parsed
 *        @offsets: array to store the offsets in
 *        @boundary: optional (may be NULL); receives
 *               final - 1 - (i_block & (ptrs - 1)), i.e. how many slots
 *               follow this one in the last-level pointer array.  Zero means
 *               the referred-to block is the last one mapped by that array
 *               and so is likely to be followed (on disk) by an indirect
 *               block.  The value is meaningless when 0 is returned.
 *
 *        To store the locations of file's data ext4 uses a data structure common
 *        for UNIX filesystems - tree of pointers anchored in the inode, with
 *        data blocks at leaves and indirect blocks in intermediate nodes.
 *        This function translates the block number into path in that tree -
 *        return value is the path length and @offsets[n] is the offset of
 *        pointer to (n+1)th node in the nth one. If @i_block is out of range
 *        (too large) a warning is printed and zero is returned.
 *
 *        Note: function doesn't find node addresses, so no IO is needed. All
 *        we need to know is the capacity of indirect blocks (taken from the
 *        inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext4_block_to_path(struct inode *inode,
                              ext4_lblk_t i_block,
                              ext4_lblk_t offsets[4], int *boundary)
{
        int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
        const long direct_blocks = EXT4_NDIR_BLOCKS,
                indirect_blocks = ptrs,
                double_blocks = (1 << (ptrs_bits * 2));
        int n = 0;
        int final = 0;

        /*
         * Each test below also rebases i_block (the "-=" inside the
         * condition) so that it is relative to the start of the level being
         * tested; the order of the tests therefore matters.
         */
        if (i_block < direct_blocks) {
                offsets[n++] = i_block;
                final = direct_blocks;
        } else if ((i_block -= direct_blocks) < indirect_blocks) {
                offsets[n++] = EXT4_IND_BLOCK;
                offsets[n++] = i_block;
                final = ptrs;
        } else if ((i_block -= indirect_blocks) < double_blocks) {
                offsets[n++] = EXT4_DIND_BLOCK;
                offsets[n++] = i_block >> ptrs_bits;
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
                offsets[n++] = EXT4_TIND_BLOCK;
                offsets[n++] = i_block >> (ptrs_bits * 2);
                offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
                offsets[n++] = i_block & (ptrs - 1);
                final = ptrs;
        } else {
                /* Undo the rebasing so the original block number is printed. */
                ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
                             i_block + direct_blocks +
                             indirect_blocks + double_blocks, inode->i_ino);
        }
        if (boundary)
                *boundary = final - 1 - (i_block & (ptrs - 1));
        return n;
}

/**
 *        ext4_get_branch - read the chain of indirect blocks leading to data
 *        @inode: inode in question
 *        @depth: depth of the chain (1 - direct pointer, etc.)
 *        @offsets: offsets of pointers in inode/indirect blocks
 *        @chain: place to store the result
 *        @err: here we store the error value
 *
 *        Function fills the array of triples <key, p, bh> and returns %NULL
 *        if everything went OK or the pointer to the last filled triple
 *        (incomplete one) otherwise. Upon the return chain[i].key contains
 *        the number of (i+1)-th block in the chain (as it is stored in memory,
 *        i.e. little-endian 32-bit), chain[i].p contains the address of that
 *        number (it points into struct inode for i==0 and into the bh->b_data
 *        for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 *        block for i>0 and NULL for i==0. In other words, it holds the block
 *        numbers of the chain, addresses they were taken from (and where we can
 *        verify that chain did not change) and buffer_heads hosting these
 *        numbers.
 *
 *        Function stops when it stumbles upon zero pointer (absent block)
 *                (pointer to last triple returned, *@err == 0)
 *        or when a stored block number lies outside the filesystem
 *                (ditto, *@err == -EFSCORRUPTED)
 *        or when no buffer can be obtained for an indirect block
 *                (ditto, *@err == -ENOMEM)
 *        or when reading an indirect block fails, or a freshly read indirect
 *        block does not pass ext4_check_indirect_blockref()
 *                (ditto, *@err == -EIO)
 *        or when it reads all @depth-1 indirect blocks successfully and finds
 *        the whole chain, all way to the data (returns %NULL, *err == 0).
 *
 *      Need to be called with
 *      down_read(&EXT4_I(inode)->i_data_sem)
 */
static Indirect *ext4_get_branch(struct inode *inode, int depth,
                                 ext4_lblk_t  *offsets,
                                 Indirect chain[4], int *err)
{
        struct super_block *sb = inode->i_sb;
        Indirect *p = chain;
        struct buffer_head *bh;
        unsigned int key;
        int ret = -EIO;

        *err = 0;
        /* i_data is not going away, no lock needed */
        add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
        if (!p->key)
                goto no_block;
        while (--depth) {
                key = le32_to_cpu(p->key);
                /*
                 * Valid block numbers are 0 .. blocks_count - 1, so a key
                 * equal to the block count is already out of range and must
                 * not be read.
                 */
                if (key >= ext4_blocks_count(EXT4_SB(sb)->s_es)) {
                        ret = -EFSCORRUPTED;
                        goto failure;
                }
                bh = sb_getblk(sb, key);
                if (unlikely(!bh)) {
                        ret = -ENOMEM;
                        goto failure;
                }

                /* Only a buffer that had to be read from disk is validated. */
                if (!bh_uptodate_or_lock(bh)) {
                        if (ext4_read_bh(bh, 0, NULL) < 0) {
                                put_bh(bh);
                                goto failure;
                        }
                        /* validate block references */
                        if (ext4_check_indirect_blockref(inode, bh)) {
                                put_bh(bh);
                                goto failure;
                        }
                }

                add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
                /* Reader: end */
                if (!p->key)
                        goto no_block;
        }
        return NULL;

failure:
        *err = ret;
no_block:
        return p;
}

/**
 *        ext4_find_near - find a place for allocation with sufficient locality
 *        @inode: owner
 *        @ind: descriptor of indirect block.
 *
 *        Returns the preferred place for block allocation; used when the
 *        heuristic for sequential allocation fails.  In order of preference:
 *          + a block is already mapped to the left of our position in the
 *            same pointer array - allocate near it;
 *          + the pointer lives in an indirect block - allocate near that
 *            block;
 *          + the pointer lives in the inode - take the inode's goal block,
 *            i.e. stay in the same cylinder group.
 *
 * In the latter case we colour the starting block by the callers PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group.   The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 *        Caller must make sure that @ind is valid and will stay that way.
 */
static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
{
        __le32 *first;
        __le32 *slot = ind->p;

        /* The pointer array is either the indirect block or inode i_data. */
        if (ind->bh)
                first = (__le32 *) ind->bh->b_data;
        else
                first = EXT4_I(inode)->i_data;

        /* Scan leftwards for the nearest slot that is already mapped. */
        while (slot > first) {
                slot--;
                if (*slot)
                        return le32_to_cpu(*slot);
        }

        /* Nothing mapped to the left: fall back to the indirect block. */
        if (ind->bh)
                return ind->bh->b_blocknr;

        /* Referred to from the inode itself: use the inode's goal block. */
        return ext4_inode_to_goal_block(inode);
}

/**
 *        ext4_find_goal - find a preferred place for allocation.
 *        @inode: owner
 *        @block:  block we want (not used by the current implementation)
 *        @partial: pointer to the last triple within a chain
 *
 *        Returns the goal picked by ext4_find_near(), masked with
 *        EXT4_MAX_BLOCK_FILE_PHYS: this is only used for non-extent files,
 *        so the block number is limited to 32 bits.
 */
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
                                   Indirect *partial)
{
        /*
         * XXX need to get goal block from mballoc's data structures
         */
        return ext4_find_near(inode, partial) & EXT4_MAX_BLOCK_FILE_PHYS;
}

/**
 *        ext4_blks_to_allocate - Look up the block map and count the number
 *        of direct blocks need to be allocated for the given branch.
 *
 *        @branch: chain of indirect blocks
 *        @k: number of blocks need for indirect blocks
 *        @blks: number of data blocks to be mapped.
 *        @blocks_to_boundary:  the offset in the indirect block
 *
 *        Returns the number of direct blocks to allocate: at most @blks and
 *        never past the boundary (@blocks_to_boundary + 1 slots).  When no
 *        indirect block is missing (@k == 0) the count additionally stops at
 *        the first slot after branch[0].p that is already mapped; at least 1
 *        is returned in that case.
 */
static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
                                 int blocks_to_boundary)
{
        unsigned int n;

        /*
         * Simple case: [t,d]indirect block(s) are not allocated yet, so
         * nothing on that path can be mapped either.  Cross-boundary
         * allocation is not handled, hence the clamp.
         */
        if (k > 0) {
                unsigned int room = blocks_to_boundary + 1;

                return blks < room ? blks : room;
        }

        /* The first slot is always counted; extend over unmapped slots. */
        for (n = 1; n < blks && n <= blocks_to_boundary; n++) {
                if (le32_to_cpu(branch[0].p[n]) != 0)
                        break;
        }
        return n;
}

/**
 * ext4_alloc_branch() - allocate and set up a chain of blocks
 * @handle: handle for this transaction
 * @ar: structure describing the allocation request
 * @indirect_blks: number of indirect blocks to allocate
 * @offsets: offsets (in the blocks) to store the pointers to next.
 * @branch: place to store the chain in.
 *
 *        This function allocates blocks, zeroes out all but the last one,
 *        links them into chain and (if we are synchronous) writes them to disk.
 *        In other words, it prepares a branch that can be spliced onto the
 *        inode. It stores the information about that chain in the branch[], in
 *        the same format as ext4_get_branch() would do. We are calling it after
 *        we had read the existing part of chain and partial points to the last
 *        triple of that (one with zero ->key). Upon the exit we have the same
 *        picture as after the successful ext4_get_block(), except that in one
 *        place chain is disconnected - *branch->p is still zero (we did not
 *        set the last link), but branch->key contains the number that should
 *        be placed into *branch->p to fill that gap.
 *
 *        If allocation fails we free all blocks we've allocated (and forget
 *        their buffer_heads) and return the error value from the failed
 *        allocation call (normally -ENOSPC). Otherwise we set the chain
 *        as described above and return 0.
 */
static int ext4_alloc_branch(handle_t *handle,
                             struct ext4_allocation_request *ar,
                             int indirect_blks, ext4_lblk_t *offsets,
                             Indirect *branch)
{
        struct buffer_head *                bh;
        ext4_fsblk_t                        b, new_blocks[4];
        __le32                                *p;
        int                                i, j, err, len = 1;

        /*
         * Iterations 0 .. indirect_blks - 1 allocate one metadata (indirect)
         * block each; the final iteration allocates the data block(s).
         */
        for (i = 0; i <= indirect_blks; i++) {
                if (i == indirect_blks) {
                        new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
                } else {
                        ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
                                        ar->inode, ar->goal,
                                        ar->flags & EXT4_MB_DELALLOC_RESERVED,
                                        NULL, &err);
                        /* Simplify error cleanup... */
                        branch[i+1].bh = NULL;
                }
                if (err) {
                        /* Allocation i failed: only 0 .. i-1 need freeing. */
                        i--;
                        goto failed;
                }
                branch[i].key = cpu_to_le32(new_blocks[i]);
                /*
                 * branch[0] refers to the already existing parent slot, so
                 * there is no new buffer to set up for it.
                 */
                if (i == 0)
                        continue;

                /*
                 * The buffer of branch[i] is the block allocated by the
                 * previous iteration; it receives the pointer(s) to
                 * new_blocks[i].
                 */
                bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]);
                if (unlikely(!bh)) {
                        err = -ENOMEM;
                        goto failed;
                }
                lock_buffer(bh);
                BUFFER_TRACE(bh, "call get_create_access");
                err = ext4_journal_get_create_access(handle, ar->inode->i_sb,
                                                     bh, EXT4_JTR_NONE);
                if (err) {
                        unlock_buffer(bh);
                        goto failed;
                }

                memset(bh->b_data, 0, bh->b_size);
                p = branch[i].p = (__le32 *) bh->b_data + offsets[i];
                b = new_blocks[i];

                /*
                 * The last indirect block gets ar->len consecutive block
                 * numbers; every other one gets a single pointer (len == 1).
                 */
                if (i == indirect_blks)
                        len = ar->len;
                for (j = 0; j < len; j++)
                        *p++ = cpu_to_le32(b++);

                BUFFER_TRACE(bh, "marking uptodate");
                set_buffer_uptodate(bh);
                unlock_buffer(bh);

                BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
                err = ext4_handle_dirty_metadata(handle, ar->inode, bh);
                if (err)
                        goto failed;
        }
        return 0;
failed:
        /*
         * Here i is the index of the last block that was successfully
         * allocated (or -1 if none was).
         */
        if (i == indirect_blks) {
                /* Free data blocks */
                ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
                                 ar->len, 0);
                i--;
        }
        for (; i >= 0; i--) {
                /*
                 * We want to ext4_forget() only freshly allocated indirect
                 * blocks. Buffer for new_blocks[i] is at branch[i+1].bh
                 * (buffer at branch[0].bh is indirect block / inode already
                 * existing before ext4_alloc_branch() was called). Also
                 * because blocks are freshly allocated, we don't need to
                 * revoke them which is why we don't set
                 * EXT4_FREE_BLOCKS_METADATA.
                 */
                ext4_free_blocks(handle, ar->inode, branch[i+1].bh,
                                 new_blocks[i], 1,
                                 branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0);
        }
        return err;
}

/**
 * ext4_splice_branch() - attach a freshly allocated branch to the block tree.
 * @handle: journal handle of the running transaction
 * @ar: allocation request the branch was allocated for
 * @where: chain element describing the missing link (@where->p is the slot
 *         to fill, @where->key the value stored into it)
 * @num:   number of indirect blocks contained in the new branch
 *
 * Stores the missing link and then dirties whatever hosts that link: the
 * indirect block buffer when @where->bh is set, the inode otherwise.
 *
 * Return: 0 on success, with the full chain to the new block in place.
 * On failure the blocks of the new branch are handed back through
 * ext4_free_blocks() and the error code is returned.
 */
static int ext4_splice_branch(handle_t *handle,
                              struct ext4_allocation_request *ar,
                              Indirect *where, int num)
{
        ext4_fsblk_t next_blk;
        int err = 0;
        int n;

        /*
         * A link that lives in a [td]indirect block (rather than in the
         * inode) requires journal write access to that block before the
         * slot is modified.
         */
        if (where->bh) {
                BUFFER_TRACE(where->bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, ar->inode->i_sb,
                                                    where->bh, EXT4_JTR_NONE);
                if (err)
                        goto undo;
        }

        /* Fill in the missing link. */
        *where->p = where->key;

        /*
         * With no indirect blocks in the branch, the remaining just
         * allocated direct blocks are recorded in the slots that follow
         * the link in the host buffer_head or inode.
         */
        if (num == 0 && ar->len > 1) {
                next_blk = le32_to_cpu(where->key) + 1;
                for (n = 1; n < ar->len; n++)
                        where->p[n] = cpu_to_le32(next_blk++);
        }

        /* The atomic part is done; the rest is housekeeping. */
        if (where->bh) {
                /*
                 * Spliced onto an indirect block: the inode itself has not
                 * been altered here.  If the splice happened at the very
                 * end of a growing file the inode will still change to
                 * reflect the new i_size, but that is not done here - it
                 * is done in
                 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
                 */
                ext4_debug("splicing indirect only\n");
                BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
                err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh);
                if (err)
                        goto undo;
                return 0;
        }

        /* Spliced into a direct slot of the inode itself. */
        err = ext4_mark_inode_dirty(handle, ar->inode);
        if (unlikely(err))
                goto undo;
        ext4_debug("splicing direct\n");
        return 0;

undo:
        /*
         * where[1..num].bh are newly allocated, so their blocks need no
         * revoke record; that is why EXT4_FREE_BLOCKS_METADATA is not
         * passed.
         */
        for (n = 1; n <= num; n++)
                ext4_free_blocks(handle, ar->inode, where[n].bh, 0, 1,
                                 EXT4_FREE_BLOCKS_FORGET);

        /* Release the data blocks hanging off the end of the branch. */
        ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key),
                         ar->len, 0);

        return err;
}

/*
 * The ext4_ind_map_blocks() function handles non-extents inodes
 * (i.e., using the traditional indirect/double-indirect i_blocks
 * scheme) for ext4_map_blocks().
 *
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * `handle' can be NULL if create == 0.
 *
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
 *
 * The ext4_ind_map_blocks() function should be called with
 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
 * blocks.
 *
 * On a successful mapping @map->m_pblk, @map->m_len and @map->m_flags are
 * filled in; when a hole is found without EXT4_GET_BLOCKS_CREATE,
 * @map->m_pblk is set to 0 and @map->m_len is clamped to the hole size
 * computed below.
 */
int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map,
                        int flags)
{
        struct ext4_allocation_request ar;
        int err = -EIO;
        ext4_lblk_t offsets[4];
        Indirect chain[4];
        Indirect *partial;
        int indirect_blks;
        int blocks_to_boundary = 0;
        int depth;
        int count = 0;
        ext4_fsblk_t first_block = 0;

        trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
        ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
        ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
        depth = ext4_block_to_path(inode, map->m_lblk, offsets,
                                   &blocks_to_boundary);

        /* No path for this logical block: bail out with the initial -EIO */
        if (depth == 0)
                goto out;

        partial = ext4_get_branch(inode, depth, offsets, chain, &err);

        /* Simplest case - block found, no allocation needed */
        if (!partial) {
                first_block = le32_to_cpu(chain[depth - 1].key);
                count++;
                /*
                 * Map more blocks: extend the run while the following slots
                 * hold physically contiguous block numbers, without going
                 * past the requested length or the boundary.
                 */
                while (count < map->m_len && count <= blocks_to_boundary) {
                        ext4_fsblk_t blk;

                        blk = le32_to_cpu(*(chain[depth-1].p + count));

                        if (blk == first_block + count)
                                count++;
                        else
                                break;
                }
                goto got_it;
        }

        /* Next simple case - plain lookup failed */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                unsigned epb = inode->i_sb->s_blocksize / sizeof(u32);
                int i;

                /*
                 * Count number blocks in a subtree under 'partial'. At each
                 * level we count number of complete empty subtrees beyond
                 * current offset and then descend into the subtree only
                 * partially beyond current offset.
                 */
                count = 0;
                for (i = partial - chain + 1; i < depth; i++)
                        count = count * epb + (epb - offsets[i] - 1);
                count++;
                /* Fill in size of a hole we found */
                map->m_pblk = 0;
                map->m_len = min_t(unsigned int, map->m_len, count);
                goto cleanup;
        }

        /* Failed read of indirect block */
        if (err == -EIO)
                goto cleanup;

        /*
         * Okay, we need to do block allocation.
         */
        if (ext4_has_feature_bigalloc(inode->i_sb)) {
                EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
                                 "non-extent mapped inodes with bigalloc");
                err = -EFSCORRUPTED;
                goto out;
        }

        /* Set up for the direct block allocation */
        memset(&ar, 0, sizeof(ar));
        ar.inode = inode;
        ar.logical = map->m_lblk;
        if (S_ISREG(inode->i_mode))
                ar.flags = EXT4_MB_HINT_DATA;
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                ar.flags |= EXT4_MB_DELALLOC_RESERVED;
        if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
                ar.flags |= EXT4_MB_USE_RESERVED;

        ar.goal = ext4_find_goal(inode, map->m_lblk, partial);

        /* the number of blocks need to allocate for [d,t]indirect blocks */
        indirect_blks = (chain + depth) - partial - 1;

        /*
         * Next look up the indirect map to count the total number of
         * direct blocks to allocate for this branch.
         */
        ar.len = ext4_blks_to_allocate(partial, indirect_blks,
                                       map->m_len, blocks_to_boundary);

        /*
         * Block out ext4_truncate while we alter the tree
         */
        err = ext4_alloc_branch(handle, &ar, indirect_blks,
                                offsets + (partial - chain), partial);

        /*
         * The ext4_splice_branch call will free and forget any buffers
         * on the new chain if there is a failure, but that risks using
         * up transaction credits, especially for bitmaps where the
         * credits cannot be returned.  Can we handle this somehow?  We
         * may need to return -EAGAIN upwards in the worst case.  --sct
         */
        if (!err)
                err = ext4_splice_branch(handle, &ar, partial, indirect_blks);
        if (err)
                goto cleanup;

        map->m_flags |= EXT4_MAP_NEW;

        ext4_update_inode_fsync_trans(handle, inode, 1);
        count = ar.len;

        /*
         * Update reserved blocks/metadata blocks after successful block
         * allocation which had been deferred till now.
         */
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                ext4_da_update_reserve_space(inode, count, 1);

got_it:
        /* Reached both by the pure lookup path and after a fresh allocation */
        map->m_flags |= EXT4_MAP_MAPPED;
        map->m_pblk = le32_to_cpu(chain[depth-1].key);
        map->m_len = count;
        if (count > blocks_to_boundary)
                map->m_flags |= EXT4_MAP_BOUNDARY;
        /* Success: the return value is the number of blocks mapped */
        err = count;
        /* Clean up and exit */
        partial = chain + depth - 1;        /* the whole chain */
cleanup:
        /*
         * Release the buffers of the chain elements from 'partial' down to
         * chain[1]; chain[0] is never passed to brelse() here.
         */
        while (partial > chain) {
                BUFFER_TRACE(partial->bh, "call brelse");
                brelse(partial->bh);
                partial--;
        }
out:
        trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
        return err;
}

/*
 * Upper bound on the number of indirect blocks touched when mapping
 * @nrblocks logically contiguous blocks of @inode.
 */
int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
{
        struct super_block *sb = inode->i_sb;

        /*
         * N contiguous data blocks need at most
         * N/EXT4_ADDR_PER_BLOCK(sb) + 1 indirect blocks, plus 2 dindirect
         * blocks and 1 tindirect block; the constant 4 covers the "+ 1",
         * the two dindirect blocks and the tindirect block.
         */
        return 4 + DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(sb));
}

/*
 * Helper handed to ext4_journal_ensure_credits_fn() by
 * ext4_ind_truncate_ensure_credits().
 *
 * Dirties @bh (when given) and the inode, discards the inode's
 * preallocations and releases i_data_sem for writing.  *@dropped is set
 * to 1 once the semaphore has been released so that the caller knows it
 * has to take it again.
 *
 * Returns 0 on success or the error from the failing dirtying step, in
 * which case i_data_sem is still held and *@dropped is left untouched.
 */
static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
                                     struct buffer_head *bh, int *dropped)
{
        int rc;

        /* Push out the indirect block being worked on, if there is one. */
        if (bh) {
                BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
                rc = ext4_handle_dirty_metadata(handle, inode, bh);
                if (unlikely(rc))
                        return rc;
        }

        rc = ext4_mark_inode_dirty(handle, inode);
        if (unlikely(rc))
                return rc;

        /*
         * i_data_sem is dropped to avoid a deadlock with ext4_map_blocks.
         * At this point get_block can only be called for blocks inside
         * i_size: the page cache has already been dropped and writes are
         * blocked by i_rwsem, so releasing the semaphore here is safe.
         */
        BUG_ON(!EXT4_JOURNAL(inode));
        ext4_discard_preallocations(inode);
        up_write(&EXT4_I(inode)->i_data_sem);
        *dropped = 1;

        return 0;
}

/*
 * Truncate transactions can be complex and absolutely huge, so the
 * transaction must be restartable at a convenient checkpoint to avoid
 * overflowing the journal.
 *
 * Make sure the handle has enough credits to continue truncating: first
 * try to extend the transaction and, if that fails, restart it.  When the
 * restart helper had to release i_data_sem it is re-acquired here before
 * returning.
 *
 * A result <= 0 from ext4_journal_ensure_credits_fn() is returned as is.
 * For a positive result, write access to @bh (when given) is taken again
 * and the outcome of that is returned; without @bh the result is 0.
 */
static int ext4_ind_truncate_ensure_credits(handle_t *handle,
                                            struct inode *inode,
                                            struct buffer_head *bh,
                                            int revoke_creds)
{
        int sem_dropped = 0;
        int rc;

        rc = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS,
                        ext4_blocks_for_truncate(inode), revoke_creds,
                        ext4_ind_trunc_restart_fn(handle, inode, bh,
                                                  &sem_dropped));

        /* The restart helper let go of i_data_sem: take it back. */
        if (sem_dropped)
                down_write(&EXT4_I(inode)->i_data_sem);

        if (rc <= 0)
                return rc;

        /* No indirect block to re-journal. */
        if (!bh)
                return 0;

        BUFFER_TRACE(bh, "retaking write access");
        return ext4_journal_get_write_access(handle, inode->i_sb, bh,
                                             EXT4_JTR_NONE);
}

/*
 * Return 1 when every 32-bit word in the half-open range [p, q) is zero,
 * 0 as soon as a non-zero word is found.  An empty range yields 1.
 *
 * This probably belongs in a library (search for the first non-zero word,
 * or memcmp against a zero page, whichever suits the architecture better).
 */
static inline int all_zeroes(__le32 *p, __le32 *q)
{
        __le32 *cur;

        for (cur = p; cur < q; cur++) {
                if (*cur != 0)
                        return 0;
        }
        return 1;
}

/**
 *        ext4_find_shared - find the indirect blocks for partial truncation.
 *        @inode:          inode in question
 *        @depth:          depth of the affected branch
 *        @offsets: offsets of pointers in that branch (see ext4_block_to_path)
 *        @chain:          place to store the pointers to partial indirect blocks
 *        @top:          place to store the (detached) top of branch
 *
 *        This is a helper function used by ext4_truncate().
 *
 *        When we do truncate() we may have to clean the ends of several
 *        indirect blocks but leave the blocks themselves alive. Block is
 *        partially truncated if some data below the new i_size is referred
 *        from it (and it is on the path to the first completely truncated
 *        data block, indeed).  We have to free the top of that path along
 *        with everything to the right of the path. Since no allocation
 *        past the truncation point is possible until ext4_truncate()
 *        finishes, we may safely do the latter, but top of branch may
 *        require special attention - pageout below the truncation point
 *        might try to populate it.
 *
 *        We store the block number of the root of the top of branch in
 *        *@top (the tree itself is left intact, see the disabled store
 *        below), pointers to buffer_heads of partially truncated blocks -
 *        in @chain[].bh and pointers to their last elements that should
 *        not be removed - in @chain[].p. Return value is the pointer to
 *        last filled element of @chain.
 *
 *        The work left to caller to do the actual freeing of subtrees:
 *                a) free the subtree starting from *@top
 *                b) free the subtrees whose roots are stored in
 *                        (@chain[i].p+1 .. end of @chain[i].bh->b_data)
 *                c) free the subtrees growing from the inode past the @chain[0].
 *                        (no partially truncated stuff there).
 */

static Indirect *ext4_find_shared(struct inode *inode, int depth,
                                  ext4_lblk_t offsets[4], Indirect chain[4],
                                  __le32 *top)
{
        Indirect *partial, *p;
        int k, err;

        /* Default: no top of branch to report */
        *top = 0;
        /* Make k index the deepest non-null offset + 1 */
        for (k = depth; k > 1 && !offsets[k-1]; k--)
                ;
        partial = ext4_get_branch(inode, k, offsets, chain, &err);
        /* Writer: pointers */
        /*
         * A NULL result means the whole k-level branch was found; in that
         * case work from its last element.
         */
        if (!partial)
                partial = chain + k-1;
        /*
         * If the branch acquired continuation since we've looked at it -
         * fine, it should all survive and (new) top doesn't belong to us.
         */
        if (!partial->key && *partial->p)
                /* Writer: end */
                goto no_top;
        /*
         * Walk up from 'partial' for as long as the slots preceding p->p in
         * the containing block are all zero; p ends up at the deepest
         * element whose block still holds something before the path (or at
         * chain[0]).
         */
        for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
                ;
        /*
         * OK, we've found the last block that must survive. The rest of our
         * branch should be detached before unlocking. However, if that rest
         * of branch is all ours and does not grow immediately from the inode
         * it's easier to cheat and just decrement partial->p.
         */
        if (p == chain + k - 1 && p > chain) {
                p->p--;
        } else {
                /* Report the root of the branch to the caller via *top */
                *top = *p->p;
                /* Nope, don't do this in ext4.  Must leave the tree intact */
#if 0
                *p->p = 0;
#endif
        }
        /* Writer: end */

        /* Drop the buffers of the chain elements below p; the caller keeps the rest */
        while (partial > p) {
                brelse(partial->bh);
                partial--;
        }
no_top:
        return partial;
}

/*
 * Zero the block pointers in [@first, @last) - located either in the inode
 * or in the indirect block @bh - and free the @count blocks starting at
 * @block_to_free.
 *
 * (@last - @first) may be larger than @count because the pointer range can
 * contain holes.  Should the transaction get restarted while credits are
 * being secured, write access to the indirect block is taken again by
 * ext4_ind_truncate_ensure_credits() so that it can be modified further.
 *
 * Return 0 on success, 1 on invalid block range
 * and < 0 on fatal error.
 */
static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
                             struct buffer_head *bh,
                             ext4_fsblk_t block_to_free,
                             unsigned long count, __le32 *first,
                             __le32 *last)
{
        int free_flags = EXT4_FREE_BLOCKS_VALIDATED;
        __le32 *slot;
        int rc;

        /*
         * Directories, symlinks and EA inodes get their blocks freed with
         * both FORGET and METADATA; other inodes only get FORGET, and only
         * when their data is journalled.
         */
        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
            ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) {
                free_flags |= EXT4_FREE_BLOCKS_FORGET |
                              EXT4_FREE_BLOCKS_METADATA;
        } else if (ext4_should_journal_data(inode)) {
                free_flags |= EXT4_FREE_BLOCKS_FORGET;
        }

        /* Refuse to touch a range that is not valid for this inode. */
        if (!ext4_inode_block_valid(inode, block_to_free, count)) {
                EXT4_ERROR_INODE(inode, "attempt to clear invalid "
                                 "blocks %llu len %lu",
                                 (unsigned long long) block_to_free, count);
                return 1;
        }

        rc = ext4_ind_truncate_ensure_credits(handle, inode, bh,
                                ext4_free_data_revoke_credits(inode, count));
        if (rc < 0) {
                ext4_std_error(inode->i_sb, rc);
                return rc;
        }

        /* Wipe the pointers first, then give the blocks back. */
        slot = first;
        while (slot < last)
                *slot++ = 0;

        ext4_free_blocks(handle, inode, NULL, block_to_free, count,
                         free_flags);
        return 0;
}

/**
 * ext4_free_data - free a list of data blocks
 * @handle:        handle for this transaction
 * @inode:        inode we are dealing with
 * @this_bh:        indirect buffer_head which contains *@first and *@last
 * @first:        array of block numbers
 * @last:        points immediately past the end of array
 *
 * Frees every block referenced from the array (numbers are stored as
 * little-endian 32-bit) and updates @inode->i_blocks appropriately.
 *
 * Physically contiguous blocks are gathered into runs and released with a
 * single call per run.  Releasing a contiguous run only affects one or two
 * bitmap blocks (+ group descriptor(s) and superblock), so little journal
 * space is used.
 *
 * @this_bh will be %NULL if @first and @last point into the inode's direct
 * block pointers.
 */
static void ext4_free_data(handle_t *handle, struct inode *inode,
                           struct buffer_head *this_bh,
                           __le32 *first, __le32 *last)
{
        ext4_fsblk_t run_start = 0;     /* first block # of the pending run */
        unsigned long run_len = 0;      /* number of blocks in that run */
        __le32 *run_slot = NULL;        /* slot that holds run_start */
        __le32 *cur;                    /* slot currently examined */
        int rc = 0;

        if (this_bh) {                  /* pointers live in an indirect block */
                BUFFER_TRACE(this_bh, "get_write_access");
                rc = ext4_journal_get_write_access(handle, inode->i_sb,
                                                   this_bh, EXT4_JTR_NONE);
                /*
                 * Important: without the ability to update the indirect
                 * pointers, the blocks they refer to must not be freed.
                 */
                if (rc)
                        return;
        }

        for (cur = first; cur < last; cur++) {
                ext4_fsblk_t blk = le32_to_cpu(*cur);

                if (!blk)
                        continue;       /* a hole */

                /* Contiguous with the pending run: just grow it. */
                if (run_len && blk == run_start + run_len) {
                        run_len++;
                        continue;
                }

                /* A pending run ends here: release it before starting anew. */
                if (run_len) {
                        rc = ext4_clear_blocks(handle, inode, this_bh,
                                               run_start, run_len,
                                               run_slot, cur);
                        if (rc)
                                break;
                }

                run_start = blk;
                run_slot = cur;
                run_len = 1;
        }

        /* Release whatever run is still pending after the scan. */
        if (!rc && run_len > 0)
                rc = ext4_clear_blocks(handle, inode, this_bh, run_start,
                                       run_len, run_slot, cur);

        /* Fatal error: leave the indirect block alone. */
        if (rc < 0)
                return;

        if (!this_bh)
                return;

        BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");

        /*
         * The buffer head should have an attached journal head at this
         * point. However, if the data is corrupted and an indirect block
         * pointed to itself, it would have been detached when the block
         * was cleared. Check for this instead of OOPSing.
         */
        if (EXT4_JOURNAL(inode) == NULL || bh2jh(this_bh))
                ext4_handle_dirty_metadata(handle, inode, this_bh);
        else
                EXT4_ERROR_INODE(inode,
                                 "circular indirect block detected at "
                                 "block %llu",
                                 (unsigned long long) this_bh->b_blocknr);
}

/**
 *        ext4_free_branches - free an array of branches
 *        @handle: JBD handle for this transaction
 *        @inode:        inode we are dealing with
 *        @parent_bh: the buffer_head which contains *@first and *@last
 *        @first:        array of block numbers
 *        @last:        pointer immediately past the end of array
 *        @depth:        depth of the branches to free
 *
 *        We are freeing all blocks referred from these branches (numbers are
 *        stored as little-endian 32-bit) and updating @inode->i_blocks
 *        appropriately.
 *
 *        With a non-zero @depth the array holds pointers to indirect blocks:
 *        each one is read, its contents are freed recursively with
 *        @depth - 1, and then the indirect block itself is freed. With
 *        @depth == 0 the array holds data block pointers and the work is
 *        delegated to ext4_free_data().
 *
 *        The slot of a freed indirect block is zeroed here only when
 *        @parent_bh is non-NULL; with a NULL @parent_bh the slot is left
 *        as it is.
 */
static void ext4_free_branches(handle_t *handle, struct inode *inode,
                               struct buffer_head *parent_bh,
                               __le32 *first, __le32 *last, int depth)
{
        ext4_fsblk_t nr;
        __le32 *p;

        /* Nothing can be done on an aborted handle */
        if (ext4_handle_is_aborted(handle))
                return;

        /* From here on 'depth' is the depth of the children */
        if (depth--) {
                struct buffer_head *bh;
                int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
                /* Walk the array backwards, from the last slot to the first */
                p = last;
                while (--p >= first) {
                        nr = le32_to_cpu(*p);
                        if (!nr)
                                continue;                /* A hole */

                        /* An invalid block number stops the whole walk */
                        if (!ext4_inode_block_valid(inode, nr, 1)) {
                                EXT4_ERROR_INODE(inode,
                                                 "invalid indirect mapped "
                                                 "block %lu (level %d)",
                                                 (unsigned long) nr, depth);
                                break;
                        }

                        /* Go read the buffer for the next level down */
                        bh = ext4_sb_bread(inode->i_sb, nr, 0);

                        /*
                         * A read failure? Report the error and move on to
                         * the next slot (should be rare). The slot itself
                         * is not modified and the block is not freed.
                         */
                        if (IS_ERR(bh)) {
                                ext4_error_inode_block(inode, nr, -PTR_ERR(bh),
                                                       "Read failure");
                                continue;
                        }

                        /* This zaps the entire block.  Bottom up. */
                        BUFFER_TRACE(bh, "free child branches");
                        ext4_free_branches(handle, inode, bh,
                                        (__le32 *) bh->b_data,
                                        (__le32 *) bh->b_data + addr_per_block,
                                        depth);
                        brelse(bh);

                        /*
                         * Everything below this pointer has been
                         * released.  Now let this top-of-subtree go.
                         *
                         * We want the freeing of this indirect block to be
                         * atomic in the journal with the updating of the
                         * bitmap block which owns it.  So make some room in
                         * the journal.
                         *
                         * We zero the parent pointer *after* freeing its
                         * pointee in the bitmaps, so if extend_transaction()
                         * for some reason fails to put the bitmap changes and
                         * the release into the same transaction, recovery
                         * will merely complain about releasing a free block,
                         * rather than leaking blocks.
                         */
                        if (ext4_handle_is_aborted(handle))
                                return;
                        /*
                         * NULL is passed as the buffer: write access to
                         * parent_bh is obtained separately further down.
                         */
                        if (ext4_ind_truncate_ensure_credits(handle, inode,
                                        NULL,
                                        ext4_free_metadata_revoke_credits(
                                                        inode->i_sb, 1)) < 0)
                                return;

                        /*
                         * The forget flag here is critical because if
                         * we are journaling (and not doing data
                         * journaling), we have to make sure a revoke
                         * record is written to prevent the journal
                         * replay from overwriting the (former)
                         * indirect block if it gets reallocated as a
                         * data block.  This must happen in the same
                         * transaction where the data blocks are
                         * actually freed.
                         */
                        ext4_free_blocks(handle, inode, NULL, nr, 1,
                                         EXT4_FREE_BLOCKS_METADATA|
                                         EXT4_FREE_BLOCKS_FORGET);

                        if (parent_bh) {
                                /*
                                 * The block which we have just freed is
                                 * pointed to by an indirect block: journal it
                                 */
                                BUFFER_TRACE(parent_bh, "get_write_access");
                                /*
                                 * The slot is only zeroed when write access
                                 * was granted (the call returned 0).
                                 */
                                if (!ext4_journal_get_write_access(handle,
                                                inode->i_sb, parent_bh,
                                                EXT4_JTR_NONE)) {
                                        *p = 0;
                                        BUFFER_TRACE(parent_bh,
                                        "call ext4_handle_dirty_metadata");
                                        ext4_handle_dirty_metadata(handle,
                                                                   inode,
                                                                   parent_bh);
                                }
                        }
                }
        } else {
                /* We have reached the bottom of the tree. */
                BUFFER_TRACE(parent_bh, "free data blocks");
                ext4_free_data(handle, inode, parent_bh, first, last);
        }
}

/*
 * ext4_ind_truncate - free the blocks of an indirect-mapped inode that lie
 * past its current i_size.
 * @handle: handle for this transaction
 * @inode:  inode being truncated
 *
 * Outline:
 *  1. Compute last_block (i_size rounded up to whole blocks) and max_block
 *     (s_bitmap_maxbytes rounded up the same way).
 *  2. Unless last_block == max_block, translate last_block into a path;
 *     return right away if no path exists (n == 0).
 *  3. Call ext4_es_remove_extent() for the range starting at last_block and
 *     copy i_size into i_disksize.
 *  4. Free the direct blocks, or the partially shared branch found by
 *     ext4_find_shared(), that lie past last_block.
 *  5. Free the whole indirect/double-indirect/triple-indirect subtrees that
 *     follow the one containing last_block.
 */
void ext4_ind_truncate(handle_t *handle, struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        __le32 *i_data = ei->i_data;
        int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        ext4_lblk_t offsets[4];
        Indirect chain[4];
        Indirect *partial;
        __le32 nr = 0;
        int n = 0;
        ext4_lblk_t last_block, max_block;
        unsigned blocksize = inode->i_sb->s_blocksize;

        last_block = (inode->i_size + blocksize-1)
                                        >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
        max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
                                        >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);

        /*
         * offsets[] is only filled in when last_block != max_block; the
         * other case returns below before offsets[] is ever read.
         */
        if (last_block != max_block) {
                n = ext4_block_to_path(inode, last_block, offsets, NULL);
                if (n == 0)
                        return;
        }

        ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);

        /*
         * The orphan list entry will now protect us from any crash which
         * occurs before the truncate completes, so it is now safe to propagate
         * the new, shorter inode size (held for now in i_size) into the
         * on-disk inode. We do this via i_disksize, which is the value which
         * ext4 *really* writes onto the disk inode.
         */
        ei->i_disksize = inode->i_size;

        if (last_block == max_block) {
                /*
                 * It is unnecessary to free any data blocks if last_block is
                 * equal to the indirect block limit.
                 */
                return;
        } else if (n == 1) {                /* direct blocks */
                /* Free the direct slots from offsets[0] to the end of them */
                ext4_free_data(handle, inode, NULL, i_data+offsets[0],
                               i_data + EXT4_NDIR_BLOCKS);
                goto do_indirects;
        }

        partial = ext4_find_shared(inode, n, offsets, chain, &nr);
        /* Kill the top of shared branch (not detached) */
        if (nr) {
                if (partial == chain) {
                        /* Shared branch grows from the inode */
                        /*
                         * The branch is freed through the local copy 'nr',
                         * so the slot in the inode is zeroed explicitly
                         * afterwards.
                         */
                        ext4_free_branches(handle, inode, NULL,
                                           &nr, &nr+1, (chain+n-1) - partial);
                        *partial->p = 0;
                        /*
                         * We mark the inode dirty prior to restart,
                         * and prior to stop.  No need for it here.
                         */
                } else {
                        /* Shared branch grows from an indirect block */
                        BUFFER_TRACE(partial->bh, "get_write_access");
                        ext4_free_branches(handle, inode, partial->bh,
                                        partial->p,
                                        partial->p+1, (chain+n-1) - partial);
                }
        }
        /* Clear the ends of indirect blocks on the shared branch */
        while (partial > chain) {
                /* Everything after partial->p up to the end of the block */
                ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
                                   (__le32*)partial->bh->b_data+addr_per_block,
                                   (chain+n-1) - partial);
                BUFFER_TRACE(partial->bh, "call brelse");
                brelse(partial->bh);
                partial--;
        }
do_indirects:
        /* Kill the remaining (whole) subtrees */
        /*
         * offsets[0] tells which top-level slot the truncation point falls
         * under; the cases fall through so that every subtree after that
         * slot is freed: indirect, then double-indirect, then
         * triple-indirect.
         */
        switch (offsets[0]) {
        default:
                /* Truncation point is in the direct blocks: free all three */
                nr = i_data[EXT4_IND_BLOCK];
                if (nr) {
                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
                        i_data[EXT4_IND_BLOCK] = 0;
                }
                fallthrough;
        case EXT4_IND_BLOCK:
                nr = i_data[EXT4_DIND_BLOCK];
                if (nr) {
                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
                        i_data[EXT4_DIND_BLOCK] = 0;
                }
                fallthrough;
        case EXT4_DIND_BLOCK:
                nr = i_data[EXT4_TIND_BLOCK];
                if (nr) {
                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
                        i_data[EXT4_TIND_BLOCK] = 0;
                }
                fallthrough;
        case EXT4_TIND_BLOCK:
                /* Nothing follows the triple-indirect subtree */
                ;
        }
}

/**
 *        ext4_ind_remove_space - remove space from the range
 *        @handle: JBD handle for this transaction
 *        @inode:        inode we are dealing with
 *        @start:        First block to remove
 *        @end:        One block after the last block to remove (exclusive)
 *
 *        Free the blocks in the defined range (end is exclusive endpoint of
 *        range). This is used by ext4_punch_hole().
 *
 *        @end is clamped to the block count derived from s_bitmap_maxbytes;
 *        an empty range after clamping is a no-op.
 *
 *        Return: always 0.
 */
int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
                          ext4_lblk_t start, ext4_lblk_t end)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        __le32 *i_data = ei->i_data;
        int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
        ext4_lblk_t offsets[4], offsets2[4];
        Indirect chain[4], chain2[4];
        Indirect *partial, *partial2;
        /*
         * p and p2 remember the initial result of ext4_find_shared() for the
         * start and end chains. They stay NULL when the corresponding lookup
         * was never made, which the cleanup label relies on.
         */
        Indirect *p = NULL, *p2 = NULL;
        ext4_lblk_t max_block;
        __le32 nr = 0, nr2 = 0;
        int n = 0, n2 = 0;
        unsigned blocksize = inode->i_sb->s_blocksize;

        /* Round s_bitmap_maxbytes up to whole blocks and clamp the range. */
        max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
                                        >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
        if (end >= max_block)
                end = max_block;
        if ((start >= end) || (start > max_block))
                return 0;

        /*
         * Translate both endpoints into per-level offsets. n and n2 are the
         * values returned by ext4_block_to_path(); below they are compared
         * as the tree level of start and end (1 being the direct blocks).
         */
        n = ext4_block_to_path(inode, start, offsets, NULL);
        n2 = ext4_block_to_path(inode, end, offsets2, NULL);

        /*
         * NOTE(review): presumably start < end implies the start path can
         * never be deeper than the end path - confirm against
         * ext4_block_to_path().
         */
        BUG_ON(n > n2);

        if ((n == 1) && (n == n2)) {
                /* We're punching only within direct block range */
                ext4_free_data(handle, inode, NULL, i_data + offsets[0],
                               i_data + offsets2[0]);
                return 0;
        } else if (n2 > n) {
                /*
                 * Start and end are on a different levels so we're going to
                 * free partial block at start, and partial block at end of
                 * the range. If there are some levels in between then
                 * do_indirects label will take care of that.
                 */

                if (n == 1) {
                        /*
                         * Start is at the direct block level, free
                         * everything to the end of the level.
                         */
                        ext4_free_data(handle, inode, NULL, i_data + offsets[0],
                                       i_data + EXT4_NDIR_BLOCKS);
                        goto end_range;
                }


                /*
                 * partial is walked back towards chain below; p keeps the
                 * original position so cleanup can brelse() the buffers.
                 */
                partial = p = ext4_find_shared(inode, n, offsets, chain, &nr);
                if (nr) {
                        if (partial == chain) {
                                /* Shared branch grows from the inode */
                                ext4_free_branches(handle, inode, NULL,
                                           &nr, &nr+1, (chain+n-1) - partial);
                                *partial->p = 0;
                        } else {
                                /* Shared branch grows from an indirect block */
                                BUFFER_TRACE(partial->bh, "get_write_access");
                                ext4_free_branches(handle, inode, partial->bh,
                                        partial->p,
                                        partial->p+1, (chain+n-1) - partial);
                        }
                }

                /*
                 * Clear the ends of indirect blocks on the shared branch
                 * at the start of the range
                 */
                while (partial > chain) {
                        ext4_free_branches(handle, inode, partial->bh,
                                partial->p + 1,
                                (__le32 *)partial->bh->b_data+addr_per_block,
                                (chain+n-1) - partial);
                        partial--;
                }

end_range:
                /*
                 * When reached through the goto above, p is still NULL, so
                 * cleanup only releases the end chain.
                 */
                partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
                if (nr2) {
                        if (partial2 == chain2) {
                                /*
                                 * Remember, end is exclusive so here we're at
                                 * the start of the next level we're not going
                                 * to free. Everything was covered by the start
                                 * of the range.
                                 */
                                goto do_indirects;
                        }
                } else {
                        /*
                         * ext4_find_shared returns Indirect structure which
                         * points to the last element which should not be
                         * removed by truncate. But this is end of the range
                         * in punch_hole so we need to point to the next element
                         */
                        partial2->p++;
                }

                /*
                 * Clear the ends of indirect blocks on the shared branch
                 * at the end of the range
                 */
                while (partial2 > chain2) {
                        ext4_free_branches(handle, inode, partial2->bh,
                                           (__le32 *)partial2->bh->b_data,
                                           partial2->p,
                                           (chain2+n2-1) - partial2);
                        partial2--;
                }
                goto do_indirects;
        }

        /* Punch happened within the same level (n == n2) */
        partial = p = ext4_find_shared(inode, n, offsets, chain, &nr);
        partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);

        /* Free top, but only if partial2 isn't its subtree. */
        if (nr) {
                int level = min(partial - chain, partial2 - chain2);
                int i;
                int subtree = 1;

                /*
                 * The end path is treated as lying inside the start branch
                 * when both paths have identical offsets up to 'level'.
                 */
                for (i = 0; i <= level; i++) {
                        if (offsets[i] != offsets2[i]) {
                                subtree = 0;
                                break;
                        }
                }

                if (!subtree) {
                        if (partial == chain) {
                                /* Shared branch grows from the inode */
                                ext4_free_branches(handle, inode, NULL,
                                                   &nr, &nr+1,
                                                   (chain+n-1) - partial);
                                *partial->p = 0;
                        } else {
                                /* Shared branch grows from an indirect block */
                                BUFFER_TRACE(partial->bh, "get_write_access");
                                ext4_free_branches(handle, inode, partial->bh,
                                                   partial->p,
                                                   partial->p+1,
                                                   (chain+n-1) - partial);
                        }
                }
        }

        if (!nr2) {
                /*
                 * ext4_find_shared returns Indirect structure which
                 * points to the last element which should not be
                 * removed by truncate. But this is end of the range
                 * in punch_hole so we need to point to the next element
                 */
                partial2->p++;
        }

        while (partial > chain || partial2 > chain2) {
                /* Remaining distance of each cursor from the end of its chain. */
                int depth = (chain+n-1) - partial;
                int depth2 = (chain2+n2-1) - partial2;

                if (partial > chain && partial2 > chain2 &&
                    partial->bh->b_blocknr == partial2->bh->b_blocknr) {
                        /*
                         * We've converged on the same block. Clear the range,
                         * then we're done.
                         */
                        ext4_free_branches(handle, inode, partial->bh,
                                           partial->p + 1,
                                           partial2->p,
                                           (chain+n-1) - partial);
                        goto cleanup;
                }

                /*
                 * The start and end partial branches may not be at the same
                 * level even though the punch happened within one level. So, we
                 * give them a chance to arrive at the same level, then walk
                 * them in step with each other until we converge on the same
                 * block.
                 */
                if (partial > chain && depth <= depth2) {
                        ext4_free_branches(handle, inode, partial->bh,
                                           partial->p + 1,
                                           (__le32 *)partial->bh->b_data+addr_per_block,
                                           (chain+n-1) - partial);
                        partial--;
                }
                if (partial2 > chain2 && depth2 <= depth) {
                        ext4_free_branches(handle, inode, partial2->bh,
                                           (__le32 *)partial2->bh->b_data,
                                           partial2->p,
                                           (chain2+n2-1) - partial2);
                        partial2--;
                }
        }

cleanup:
        /*
         * Release the buffer heads referenced by both chains, walking from
         * the saved positions down to (but not including) the chain base.
         * A NULL p/p2 means that chain was never looked up.
         */
        while (p && p > chain) {
                BUFFER_TRACE(p->bh, "call brelse");
                brelse(p->bh);
                p--;
        }
        while (p2 && p2 > chain2) {
                BUFFER_TRACE(p2->bh, "call brelse");
                brelse(p2->bh);
                p2--;
        }
        return 0;

do_indirects:
        /* Kill the remaining (whole) subtrees */
        /*
         * Each case bumps n and stops as soon as it reaches n2, so only the
         * levels strictly between the start level and the end level are
         * freed as whole subtrees.
         */
        switch (offsets[0]) {
        default:
                if (++n >= n2)
                        break;
                nr = i_data[EXT4_IND_BLOCK];
                if (nr) {
                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
                        i_data[EXT4_IND_BLOCK] = 0;
                }
                fallthrough;
        case EXT4_IND_BLOCK:
                if (++n >= n2)
                        break;
                nr = i_data[EXT4_DIND_BLOCK];
                if (nr) {
                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
                        i_data[EXT4_DIND_BLOCK] = 0;
                }
                fallthrough;
        case EXT4_DIND_BLOCK:
                if (++n >= n2)
                        break;
                nr = i_data[EXT4_TIND_BLOCK];
                if (nr) {
                        ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
                        i_data[EXT4_TIND_BLOCK] = 0;
                }
                fallthrough;
        case EXT4_TIND_BLOCK:
                ;
        }
        goto cleanup;
}



















































































































































































































































































































































































   10 





















































































































































































































   10 


   10 







   11 
   11 

   10 

   11 

















   11 












   10 

   11 





   12 



































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
// SPDX-License-Identifier: GPL-2.0
/*
 * Workingset detection
 *
 * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
 */

#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "internal.h"

/*
 *                Double CLOCK lists
 *
 * Per node, two clock lists are maintained for file pages: the
 * inactive and the active list.  Freshly faulted pages start out at
 * the head of the inactive list and page reclaim scans pages from the
 * tail.  Pages that are accessed multiple times on the inactive list
 * are promoted to the active list, to protect them from reclaim,
 * whereas active pages are demoted to the inactive list when the
 * active list grows too big.
 *
 *   fault ------------------------+
 *                                 |
 *              +--------------+   |            +-------------+
 *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
 *              +--------------+                +-------------+    |
 *                     |                                           |
 *                     +-------------- promotion ------------------+
 *
 *
 *                Access frequency and refault distance
 *
 * A workload is thrashing when its pages are frequently used but they
 * are evicted from the inactive list every time before another access
 * would have promoted them to the active list.
 *
 * In cases where the average access distance between thrashing pages
 * is bigger than the size of memory there is nothing that can be
 * done - the thrashing set could never fit into memory under any
 * circumstance.
 *
 * However, the average access distance could be bigger than the
 * inactive list, yet smaller than the size of memory.  In this case,
 * the set could fit into memory if it weren't for the currently
 * active pages - which may be used more, hopefully less frequently:
 *
 *      +-memory available to cache-+
 *      |                           |
 *      +-inactive------+-active----+
 *  a b | c d e f g h i | J K L M N |
 *      +---------------+-----------+
 *
 * It is prohibitively expensive to accurately track access frequency
 * of pages.  But a reasonable approximation can be made to measure
 * thrashing on the inactive list, after which refaulting pages can be
 * activated optimistically to compete with the existing active pages.
 *
 * Approximating inactive page access frequency - Observations:
 *
 * 1. When a page is accessed for the first time, it is added to the
 *    head of the inactive list, slides every existing inactive page
 *    towards the tail by one slot, and pushes the current tail page
 *    out of memory.
 *
 * 2. When a page is accessed for the second time, it is promoted to
 *    the active list, shrinking the inactive list by one slot.  This
 *    also slides all inactive pages that were faulted into the cache
 *    more recently than the activated page towards the tail of the
 *    inactive list.
 *
 * Thus:
 *
 * 1. The sum of evictions and activations between any two points in
 *    time indicates the minimum number of inactive pages accessed in
 *    between.
 *
 * 2. Moving one inactive page N page slots towards the tail of the
 *    list requires at least N inactive page accesses.
 *
 * Combining these:
 *
 * 1. When a page is finally evicted from memory, the number of
 *    inactive pages accessed while the page was in cache is at least
 *    the number of page slots on the inactive list.
 *
 * 2. In addition, measuring the sum of evictions and activations (E)
 *    at the time of a page's eviction, and comparing it to another
 *    reading (R) at the time the page faults back into memory tells
 *    the minimum number of accesses while the page was not cached.
 *    This is called the refault distance.
 *
 * Because the first access of the page was the fault and the second
 * access the refault, we combine the in-cache distance with the
 * out-of-cache distance to get the complete minimum access distance
 * of this page:
 *
 *      NR_inactive + (R - E)
 *
 * And knowing the minimum access distance of a page, we can easily
 * tell if the page would be able to stay in cache assuming all page
 * slots in the cache were available:
 *
 *   NR_inactive + (R - E) <= NR_inactive + NR_active
 *
 * If we have swap we should also consider NR_inactive_anon and
 * NR_active_anon, so for page cache and anonymous respectively:
 *
 *   NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
 *   + NR_inactive_anon + NR_active_anon
 *
 *   NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
 *   + NR_inactive_file + NR_active_file
 *
 * Which can be further simplified to:
 *
 *   (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
 *
 *   (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
 *
 * Put into words, the refault distance (out-of-cache) can be seen as
 * a deficit in inactive list space (in-cache).  If the inactive list
 * had (R - E) more page slots, the page would not have been evicted
 * in between accesses, but activated instead.  And on a full system,
 * the only thing eating into inactive list space is active pages.
 *
 *
 *                Refaulting inactive pages
 *
 * All that is known about the active list is that the pages have been
 * accessed more than once in the past.  This means that at any given
 * time there is actually a good chance that pages on the active list
 * are no longer in active use.
 *
 * So when a refault distance of (R - E) is observed and there are at
 * least (R - E) pages in the userspace workingset, the refaulting page
 * is activated optimistically in the hope that (R - E) pages are actually
 * used less frequently than the refaulting page - or even not used at
 * all anymore.
 *
 * That means if inactive cache is refaulting with a suitable refault
 * distance, we assume the cache workingset is transitioning and put
 * pressure on the current workingset.
 *
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
 *
 * But if this is right, the stale pages will be pushed out of memory
 * and the used pages get to stay in cache.
 *
 *                Refaulting active pages
 *
 * If on the other hand the refaulting pages have recently been
 * deactivated, it means that the active list is no longer protecting
 * actively used cache from reclaim. The cache is NOT transitioning to
 * a different workingset; the existing workingset is thrashing in the
 * space allocated to the page cache.
 *
 *
 *                Implementation
 *
 * For each node's LRU lists, a counter for inactive evictions and
 * activations is maintained (node->nonresident_age).
 *
 * On eviction, a snapshot of this counter (along with some bits to
 * identify the node) is stored in the now empty page cache
 * slot of the evicted page.  This is called a shadow entry.
 *
 * On cache misses for which there are shadow entries, an eligible
 * refault distance will immediately activate the refaulting page.
 */

/*
 * Shadow entry bit layout, as built by pack_shadow() from the low bits up:
 * WORKINGSET_SHIFT bits of workingset flag, NODES_SHIFT bits of node id,
 * MEM_CGROUP_ID_SHIFT bits of memcg id, then the eviction value.
 *
 * EVICTION_SHIFT is the number of bits of an unsigned long that are not
 * available to the eviction value: the bits an xarray value cannot hold
 * plus the three fields above. EVICTION_MASK covers the bits that remain.
 */
#define WORKINGSET_SHIFT 1
#define EVICTION_SHIFT        ((BITS_PER_LONG - BITS_PER_XA_VALUE) +        \
                         WORKINGSET_SHIFT + NODES_SHIFT + \
                         MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK        (~0UL >> EVICTION_SHIFT)

/*
 * Eviction timestamps need to be able to cover the full range of
 * actionable refaults. However, bits are tight in the xarray
 * entry, and after storing the identifier for the lruvec there might
 * not be enough left to represent every single actionable refault. In
 * that case, we have to sacrifice granularity for distance, and group
 * evictions into coarser buckets by shaving off lower timestamp bits.
 */
static unsigned int bucket_order __read_mostly;

/*
 * Build a shadow entry: the eviction value is truncated to EVICTION_MASK and
 * then the memcg id, node id and workingset flag are appended below it, in
 * that order, before the result is wrapped as an xarray value.
 */
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
                         bool workingset)
{
        unsigned long entry = eviction & EVICTION_MASK;

        /* Each field is shifted in and ORed into the freed low bits. */
        entry <<= MEM_CGROUP_ID_SHIFT;
        entry |= memcgid;

        entry <<= NODES_SHIFT;
        entry |= pgdat->node_id;

        entry <<= WORKINGSET_SHIFT;
        entry |= workingset;

        return xa_mk_value(entry);
}

/*
 * Split a shadow entry back into its fields, peeling them off from the low
 * bits upwards: workingset flag, node id, memcg id. Whatever is left after
 * the last shift is handed back as the eviction value.
 */
static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
                          unsigned long *evictionp, bool *workingsetp)
{
        unsigned long bits = xa_to_value(shadow);
        bool ws;
        int node, id;

        ws = bits & ((1UL << WORKINGSET_SHIFT) - 1);
        bits >>= WORKINGSET_SHIFT;

        node = bits & ((1UL << NODES_SHIFT) - 1);
        bits >>= NODES_SHIFT;

        id = bits & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
        bits >>= MEM_CGROUP_ID_SHIFT;

        /* Publish the results only once everything has been decoded. */
        *memcgidp = id;
        *pgdat = NODE_DATA(node);
        *evictionp = bits;
        *workingsetp = ws;
}

#ifdef CONFIG_LRU_GEN

static void *lru_gen_eviction(struct folio *folio)
{
        int hist;
        unsigned long token;
        unsigned long min_seq;
        struct lruvec *lruvec;
        struct lru_gen_folio *lrugen;
        int type = folio_is_file_lru(folio);
        int delta = folio_nr_pages(folio);
        int refs = folio_lru_refs(folio);
        int tier = lru_tier_from_refs(refs);
        struct mem_cgroup *memcg = folio_memcg(folio);
        struct pglist_data *pgdat = folio_pgdat(folio);

        BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);

        lruvec = mem_cgroup_lruvec(memcg, pgdat);
        lrugen = &lruvec->lrugen;
        min_seq = READ_ONCE(lrugen->min_seq[type]);
        token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);

        hist = lru_hist_from_seq(min_seq);
        atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);

        return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
}

/*
 * Report whether @shadow belongs to a folio that was evicted recently, i.e.
 * whether the sequence number stored in the shadow still matches the current
 * min_seq of the lruvec it was evicted from (compared within the bits a
 * shadow entry can hold).
 *
 * @lruvec, @token and @workingset are filled in from the unpacked shadow.
 */
static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
                                unsigned long *token, bool *workingset)
{
        struct pglist_data *node;
        struct lruvec *evicted_from;
        unsigned long cur_seq;
        int id;

        unpack_shadow(shadow, &id, &node, token, workingset);

        evicted_from = mem_cgroup_lruvec(mem_cgroup_from_id(id), node);
        *lruvec = evicted_from;

        cur_seq = READ_ONCE(evicted_from->lrugen.min_seq[file]);
        cur_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;

        return (*token >> LRU_REFS_WIDTH) == cur_seq;
}

/*
 * Handle the refault of @folio given the @shadow entry found in its place.
 *
 * Nothing is accounted unless the shadow was left by the lruvec the folio
 * now belongs to. A matching refault is always counted; the refaulted[]
 * and activation statistics are only updated when the eviction was recent.
 */
static void lru_gen_refault(struct folio *folio, void *shadow)
{
        int type = folio_is_file_lru(folio);
        int nr_pages = folio_nr_pages(folio);
        struct lru_gen_folio *lrugen;
        struct lruvec *lruvec;
        unsigned long token;
        bool workingset;
        bool recent;
        int hist, tier, refs;

        rcu_read_lock();

        recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset);
        if (lruvec != folio_lruvec(folio)) {
                rcu_read_unlock();
                return;
        }

        mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, nr_pages);

        if (!recent) {
                rcu_read_unlock();
                return;
        }

        lrugen = &lruvec->lrugen;
        hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));

        /* see the comment in folio_lru_refs() */
        refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
        tier = lru_tier_from_refs(refs);

        atomic_long_add(nr_pages, &lrugen->refaulted[hist][type][tier]);
        mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, nr_pages);

        /*
         * Two situations are counted as stalls:
         * 1. Pages accessed through page tables: hotter pages pushed out
         *    hot pages, which then refaulted immediately.
         * 2. Pages accessed multiple times through file descriptors:
         *    these would have been protected by sort_folio().
         */
        if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) {
                set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset));
                mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, nr_pages);
        }

        rcu_read_unlock();
}

#else /* !CONFIG_LRU_GEN */

/* !CONFIG_LRU_GEN stub: no accounting is done and no shadow entry is built. */
static void *lru_gen_eviction(struct folio *folio)
{
        return NULL;
}

/*
 * !CONFIG_LRU_GEN stub: always reports "not recent" and leaves @lruvec,
 * @token and @workingset untouched.
 */
static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec,
                                unsigned long *token, bool *workingset)
{
        return false;
}

/* !CONFIG_LRU_GEN stub: refaults are not tracked, so this does nothing. */
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}

#endif /* CONFIG_LRU_GEN */

/**
 * workingset_age_nonresident - age non-resident entries as LRU ages
 * @lruvec: the lruvec that was aged
 * @nr_pages: the number of pages to count
 *
 * Non-resident pages have to age together with the in-memory ones,
 * otherwise refault distances measured later could not be compared
 * against in-memory sizes. Reclaim and LRU operations call this to
 * advance the non-resident age alongside their own aging.
 */
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
{
        struct lruvec *cur = lruvec;

        /*
         * A cgroup is reclaimed by going round-robin over its children,
         * so its LRU order is made up of the LRU orders of the child
         * cgroups. Every page therefore has an LRU position in the
         * cgroup that owns it and in each ancestor of that cgroup.
         *
         * Hence aging the physical inactive list of a leaf cgroup also
         * ages the virtual inactive lists of all of its parents, up to
         * and including the root cgroup.
         */
        for (;;) {
                atomic_long_add(nr_pages, &cur->nonresident_age);

                cur = parent_lruvec(cur);
                if (!cur)
                        break;
        }
}

/**
 * workingset_eviction - note the eviction of a folio from memory
 * @folio: the folio being evicted
 * @target_memcg: the cgroup that is causing the reclaim
 *
 * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
 * of the evicted @folio so that a later refault can be detected.
 */
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
{
        struct pglist_data *node = folio_pgdat(folio);
        struct lruvec *lruvec;
        unsigned long age;
        int id;

        /* Folio is fully exclusive and pins folio's memory cgroup pointer */
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
        VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        /* The multi-gen LRU builds its own kind of shadow entry. */
        if (lru_gen_enabled())
                return lru_gen_eviction(folio);

        lruvec = mem_cgroup_lruvec(target_memcg, node);

        /* XXX: target_memcg can be NULL, go through lruvec */
        id = mem_cgroup_id(lruvec_memcg(lruvec));

        /* Snapshot the age before this eviction advances it. */
        age = atomic_long_read(&lruvec->nonresident_age);
        age >>= bucket_order;

        workingset_age_nonresident(lruvec, folio_nr_pages(folio));

        return pack_shadow(id, node, age, folio_test_workingset(folio));
}

/**
 * workingset_test_recent - tests if the shadow entry is for a folio that was
 * recently evicted. Also fills in @workingset with the value unpacked from
 * shadow.
 * @shadow: the shadow entry to be tested.
 * @file: whether the corresponding folio is from the file lru.
 * @workingset: where the workingset value unpacked from shadow should
 * be stored.
 *
 * Return: true if the shadow is for a recently evicted folio; false otherwise.
 */
bool workingset_test_recent(void *shadow, bool file, bool *workingset)
{
        struct mem_cgroup *eviction_memcg;
        struct lruvec *eviction_lruvec;
        unsigned long refault_distance;
        unsigned long workingset_size;
        unsigned long refault;
        int memcgid;
        struct pglist_data *pgdat;
        unsigned long eviction;

        /*
         * The memcg lookup by ID below happens inside this RCU read
         * section. A reference is taken with mem_cgroup_tryget() before
         * the section is left and dropped by mem_cgroup_put() at the end.
         */
        rcu_read_lock();

        /* With lru_gen enabled, defer entirely to lru_gen_test_recent() */
        if (lru_gen_enabled()) {
                bool recent = lru_gen_test_recent(shadow, file,
                                &eviction_lruvec, &eviction, workingset);

                rcu_read_unlock();
                return recent;
        }


        unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
        /*
         * Undo the right shift by bucket_order applied in
         * workingset_eviction() so the value is on the same scale as
         * nonresident_age again.
         */
        eviction <<= bucket_order;

        /*
         * Look up the memcg associated with the stored ID. It might
         * have been deleted since the folio's eviction.
         *
         * Note that in rare events the ID could have been recycled
         * for a new cgroup that refaults a shared folio. This is
         * impossible to tell from the available data. However, this
         * should be a rare and limited disturbance, and activations
         * are always speculative anyway. Ultimately, it's the aging
         * algorithm's job to shake out the minimum access frequency
         * for the active cache.
         *
         * XXX: On !CONFIG_MEMCG, this will always return NULL; it
         * would be better if the root_mem_cgroup existed in all
         * configurations instead.
         */
        eviction_memcg = mem_cgroup_from_id(memcgid);
        if (!mem_cgroup_disabled() &&
            (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) {
                /* memcg is gone or going away: treat the entry as stale */
                rcu_read_unlock();
                return false;
        }

        rcu_read_unlock();

        /*
         * Flush stats (and potentially sleep) outside the RCU read section.
         * XXX: With per-memcg flushing and thresholding, is ratelimiting
         * still needed here?
         */
        mem_cgroup_flush_stats_ratelimited(eviction_memcg);

        eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
        refault = atomic_long_read(&eviction_lruvec->nonresident_age);

        /*
         * Calculate the refault distance
         *
         * The unsigned subtraction here gives an accurate distance
         * across nonresident_age overflows in most cases. There is a
         * special case: usually, shadow entries have a short lifetime
         * and are either refaulted or reclaimed along with the inode
         * before they get too old.  But it is not impossible for the
         * nonresident_age to lap a shadow entry in the field, which
         * can then result in a false small refault distance, leading
         * to a false activation should this old entry actually
         * refault again.  However, earlier kernels used to deactivate
         * unconditionally with *every* reclaim invocation for the
         * longest time, so the occasional inappropriate activation
         * leading to pressure on the active list is not a problem.
         */
        refault_distance = (refault - eviction) & EVICTION_MASK;

        /*
         * Compare the distance to the existing workingset size. We
         * don't activate pages that couldn't stay resident even if
         * all the memory was available to the workingset. Whether
         * workingset competition needs to consider anon or not depends
         * on having free swap space.
         */
        workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
        /* An anon refault also competes with the inactive file list */
        if (!file) {
                workingset_size += lruvec_page_state(eviction_lruvec,
                                                     NR_INACTIVE_FILE);
        }
        if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
                workingset_size += lruvec_page_state(eviction_lruvec,
                                                     NR_ACTIVE_ANON);
                /* A file refault also competes with the inactive anon list */
                if (file) {
                        workingset_size += lruvec_page_state(eviction_lruvec,
                                                     NR_INACTIVE_ANON);
                }
        }

        /* Pairs with mem_cgroup_tryget() above */
        mem_cgroup_put(eviction_memcg);
        return refault_distance <= workingset_size;
}

/**
 * workingset_refault - Evaluate the refault of a previously evicted folio.
 * @folio: The freshly allocated replacement folio.
 * @shadow: Shadow entry of the evicted folio.
 *
 * Accounts the refault against the lruvec of the folio's current memcg and
 * node, then lets workingset_test_recent() judge the refault distance in
 * the context of the node and the memcg whose memory pressure caused the
 * eviction. A recent refault activates the folio immediately.
 */
void workingset_refault(struct folio *folio, void *shadow)
{
        bool file = folio_is_file_lru(folio);
        struct lruvec *lruvec;
        bool was_workingset;
        long nr_pages;

        if (lru_gen_enabled()) {
                lru_gen_refault(folio, shadow);
                return;
        }

        /*
         * Whether to activate is decided at the level where the eviction
         * happened, because that is where the LRU order during reclaim
         * is determined.
         *
         * The statistics, however, are charged to the cgroup that owns
         * the folio now, i.e. the one experiencing the refault. The folio
         * must be locked so that folio_memcg() stays stable throughout.
         */
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        nr_pages = folio_nr_pages(folio);
        lruvec = mem_cgroup_lruvec(folio_memcg(folio), folio_pgdat(folio));

        mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr_pages);

        if (!workingset_test_recent(shadow, file, &was_workingset))
                return;

        folio_set_active(folio);
        workingset_age_nonresident(lruvec, nr_pages);
        mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr_pages);

        /* Nothing more to do unless the folio was active prior to eviction */
        if (!was_workingset)
                return;

        folio_set_workingset(folio);
        /*
         * XXX: Move to folio_add_lru() when it supports new vs
         * putback
         */
        lru_note_cost_refault(folio);
        mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr_pages);
}

/**
 * workingset_activation - note a page activation
 * @folio: Folio that is being activated.
 *
 * Ages the nonresident counters of the folio's lruvec by the size of the
 * folio. Folios without a memcg are ignored when memcg is enabled.
 */
void workingset_activation(struct folio *folio)
{
        struct mem_cgroup *memcg;

        rcu_read_lock();
        /*
         * Non-memcg pages are filtered out here, e.g. unmap can call
         * mark_page_accessed() on VDSO pages.
         *
         * XXX: See workingset_refault() - this should return
         * root_mem_cgroup even for !CONFIG_MEMCG.
         */
        memcg = folio_memcg_rcu(folio);
        if (mem_cgroup_disabled() || memcg)
                workingset_age_nonresident(folio_lruvec(folio),
                                           folio_nr_pages(folio));
        rcu_read_unlock();
}

/*
 * Shadow entries reflect the share of the working set that does not
 * fit into memory, so their number depends on the access pattern of
 * the workload.  In most cases, they will refault or get reclaimed
 * along with the inode, but a (malicious) workload that streams
 * through files with a total size several times that of available
 * memory, while preventing the inodes from being reclaimed, can
 * create excessive amounts of shadow nodes.  To keep a lid on this,
 * track shadow nodes and reclaim them when they grow way past the
 * point where they would still be useful.
 */

/*
 * list_lru of xa_nodes that hold nothing but shadow entries. Nodes are
 * added and removed by workingset_update_node() and reclaimed through
 * count_shadow_nodes()/scan_shadow_nodes().
 */
struct list_lru shadow_nodes;

void workingset_update_node(struct xa_node *node)
{
        struct address_space *mapping;
        struct page *page = virt_to_page(node);

        /*
         * Track non-empty nodes that contain only shadow entries;
         * unlink those that contain pages or are being freed.
         *
         * Avoid acquiring the list_lru lock when the nodes are
         * already where they should be. The list_empty() test is safe
         * as node->private_list is protected by the i_pages lock.
         */
        mapping = container_of(node->array, struct address_space, i_pages);
        lockdep_assert_held(&mapping->i_pages.xa_lock);

        if (node->count && node->count == node->nr_values) {
                if (list_empty(&node->private_list)) {
                        list_lru_add_obj(&shadow_nodes, &node->private_list);
                        __inc_node_page_state(page, WORKINGSET_NODES);
                }
        } else {
                if (!list_empty(&node->private_list)) {
                        list_lru_del_obj(&shadow_nodes, &node->private_list);
                        __dec_node_page_state(page, WORKINGSET_NODES);
                }
        }
}

static unsigned long count_shadow_nodes(struct shrinker *shrinker,
                                        struct shrink_control *sc)
{
        unsigned long max_nodes;
        unsigned long nodes;
        unsigned long pages;

        nodes = list_lru_shrink_count(&shadow_nodes, sc);
        if (!nodes)
                return SHRINK_EMPTY;

        /*
         * Approximate a reasonable limit for the nodes
         * containing shadow entries. We don't need to keep more
         * shadow entries than possible pages on the active list,
         * since refault distances bigger than that are dismissed.
         *
         * The size of the active list converges toward 100% of
         * overall page cache as memory grows, with only a tiny
         * inactive list. Assume the total cache size for that.
         *
         * Nodes might be sparsely populated, with only one shadow
         * entry in the extreme case. Obviously, we cannot keep one
         * node for every eligible shadow entry, so compromise on a
         * worst-case density of 1/8th. Below that, not all eligible
         * refaults can be detected anymore.
         *
         * On 64-bit with 7 xa_nodes per page and 64 slots
         * each, this will reclaim shadow entries when they consume
         * ~1.8% of available memory:
         *
         * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
         */
#ifdef CONFIG_MEMCG
        if (sc->memcg) {
                struct lruvec *lruvec;
                int i;

                mem_cgroup_flush_stats_ratelimited(sc->memcg);
                lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
                for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
                        pages += lruvec_page_state_local(lruvec,
                                                         NR_LRU_BASE + i);
                pages += lruvec_page_state_local(
                        lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
                pages += lruvec_page_state_local(
                        lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
        } else
#endif
                pages = node_present_pages(sc->nid);

        max_nodes = pages >> (XA_CHUNK_SHIFT - 3);

        if (nodes <= max_nodes)
                return 0;
        return nodes - max_nodes;
}

/*
 * shadow_lru_isolate - list_lru walk callback that reclaims one shadow node
 * @item: the node's private_list entry on the shadow_nodes list
 * @lru: the list_lru_one that @item sits on
 * @lru_lock: lock protecting @lru; held on entry and again on return
 * @arg: unused
 *
 * Return: LRU_RETRY if the i_pages lock or the host inode's i_lock could
 * not be taken with a trylock; LRU_REMOVED_RETRY once the node has been
 * taken off the list. On every path @lru_lock is dropped and re-acquired
 * before returning.
 */
static enum lru_status shadow_lru_isolate(struct list_head *item,
                                          struct list_lru_one *lru,
                                          spinlock_t *lru_lock,
                                          void *arg) __must_hold(lru_lock)
{
        struct xa_node *node = container_of(item, struct xa_node, private_list);
        struct address_space *mapping;
        int ret;

        /*
         * Page cache insertions and deletions synchronously maintain
         * the shadow node LRU under the i_pages lock and the
         * lru_lock.  Because the page cache tree is emptied before
         * the inode can be destroyed, holding the lru_lock pins any
         * address_space that has nodes on the LRU.
         *
         * We can then safely transition to the i_pages lock to
         * pin only the address_space of the particular node we want
         * to reclaim, take the node off-LRU, and drop the lru_lock.
         */

        mapping = container_of(node->array, struct address_space, i_pages);

        /* Coming from the list, invert the lock order */
        if (!xa_trylock(&mapping->i_pages)) {
                spin_unlock_irq(lru_lock);
                ret = LRU_RETRY;
                goto out;
        }

        /* For page cache we need to hold i_lock */
        if (mapping->host != NULL) {
                if (!spin_trylock(&mapping->host->i_lock)) {
                        /* Back out of both locks taken so far and retry */
                        xa_unlock(&mapping->i_pages);
                        spin_unlock_irq(lru_lock);
                        ret = LRU_RETRY;
                        goto out;
                }
        }

        list_lru_isolate(lru, item);
        __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);

        /*
         * Plain unlock here: the i_pages lock is still held and is
         * released with xa_unlock_irq() below.
         */
        spin_unlock(lru_lock);

        /*
         * The nodes should only contain one or more shadow entries,
         * no pages, so we expect to be able to remove them all and
         * delete and free the empty node afterwards.
         */
        if (WARN_ON_ONCE(!node->nr_values))
                goto out_invalid;
        if (WARN_ON_ONCE(node->count != node->nr_values))
                goto out_invalid;
        xa_delete_node(node, workingset_update_node);
        __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);

        /* Reached on the normal path as well as from the sanity checks */
out_invalid:
        xa_unlock_irq(&mapping->i_pages);
        if (mapping->host != NULL) {
                if (mapping_shrinkable(mapping))
                        inode_add_lru(mapping->host);
                spin_unlock(&mapping->host->i_lock);
        }
        /* The item was isolated above even if a sanity check failed */
        ret = LRU_REMOVED_RETRY;
out:
        /* None of the locks taken in this function are held at this point */
        cond_resched();
        spin_lock_irq(lru_lock);
        return ret;
}

/*
 * scan_shadow_nodes - shrinker ->scan_objects callback for shadow nodes
 * @shrinker: unused
 * @sc: shrink control handed on to the list_lru walk
 *
 * Return: whatever list_lru_shrink_walk_irq() reports for the walk.
 */
static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
                                       struct shrink_control *sc)
{
        unsigned long ret;

        /* list_lru lock nests inside the IRQ-safe i_pages lock */
        ret = list_lru_shrink_walk_irq(&shadow_nodes, sc,
                                       shadow_lru_isolate, NULL);
        return ret;
}

/*
 * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
 * i_pages lock.
 *
 * This lock class key is handed to __list_lru_init() for shadow_nodes
 * in workingset_init().
 */
static struct lock_class_key shadow_nodes_key;

/*
 * workingset_init - size the eviction buckets and set up the shadow shrinker
 *
 * Return: 0 on success, -ENOMEM if the shrinker cannot be allocated, or
 * the error returned by __list_lru_init().
 */
static int __init workingset_init(void)
{
        struct shrinker *shrinker;
        unsigned int timestamp_bits;
        unsigned int max_order;
        int err;

        BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
        /*
         * Size the eviction buckets so that the longest actionable
         * refault distance is covered. That distance is currently
         * half of memory (totalram_pages/2), but memory hotplug may
         * add pages at runtime, so totalram_pages is used as-is to
         * keep working with up to double the initial memory.
         */
        timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
        max_order = fls_long(totalram_pages() - 1);
        if (max_order > timestamp_bits)
                bucket_order = max_order - timestamp_bits;
        pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
               timestamp_bits, max_order, bucket_order);

        shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
                                  "mm-shadow");
        if (!shrinker)
                return -ENOMEM;

        err = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
                              shrinker);
        if (err) {
                shrinker_free(shrinker);
                return err;
        }

        shrinker->count_objects = count_shadow_nodes;
        shrinker->scan_objects = scan_shadow_nodes;
        /* ->count reports only fully expendable nodes */
        shrinker->seeks = 0;

        shrinker_register(shrinker);
        return 0;
}
module_init(workingset_init);







































































































































































































































































































































































    1 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
/*
 * linux/fs/nls/nls_cp437.c
 *
 * Charset cp437 translation tables.
 * Generated automatically from the Unicode and charset
 * tables from the Unicode Organization (www.unicode.org).
 * The Unicode to charset table has only exact mappings.
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/nls.h>
#include <linux/errno.h>

/*
 * Forward table: cp437 byte value -> Unicode code point, indexed by the
 * byte. Entries are laid out in groups of 16 under a comment giving the
 * first byte of the group. Bytes 0x00-0x7f map to the identical code
 * point; 0x80-0xff carry the code-page-specific characters.
 */
static const wchar_t charset2uni[256] = {
        /* 0x00*/
        0x0000, 0x0001, 0x0002, 0x0003,
        0x0004, 0x0005, 0x0006, 0x0007,
        0x0008, 0x0009, 0x000a, 0x000b,
        0x000c, 0x000d, 0x000e, 0x000f,
        /* 0x10*/
        0x0010, 0x0011, 0x0012, 0x0013,
        0x0014, 0x0015, 0x0016, 0x0017,
        0x0018, 0x0019, 0x001a, 0x001b,
        0x001c, 0x001d, 0x001e, 0x001f,
        /* 0x20*/
        0x0020, 0x0021, 0x0022, 0x0023,
        0x0024, 0x0025, 0x0026, 0x0027,
        0x0028, 0x0029, 0x002a, 0x002b,
        0x002c, 0x002d, 0x002e, 0x002f,
        /* 0x30*/
        0x0030, 0x0031, 0x0032, 0x0033,
        0x0034, 0x0035, 0x0036, 0x0037,
        0x0038, 0x0039, 0x003a, 0x003b,
        0x003c, 0x003d, 0x003e, 0x003f,
        /* 0x40*/
        0x0040, 0x0041, 0x0042, 0x0043,
        0x0044, 0x0045, 0x0046, 0x0047,
        0x0048, 0x0049, 0x004a, 0x004b,
        0x004c, 0x004d, 0x004e, 0x004f,
        /* 0x50*/
        0x0050, 0x0051, 0x0052, 0x0053,
        0x0054, 0x0055, 0x0056, 0x0057,
        0x0058, 0x0059, 0x005a, 0x005b,
        0x005c, 0x005d, 0x005e, 0x005f,
        /* 0x60*/
        0x0060, 0x0061, 0x0062, 0x0063,
        0x0064, 0x0065, 0x0066, 0x0067,
        0x0068, 0x0069, 0x006a, 0x006b,
        0x006c, 0x006d, 0x006e, 0x006f,
        /* 0x70*/
        0x0070, 0x0071, 0x0072, 0x0073,
        0x0074, 0x0075, 0x0076, 0x0077,
        0x0078, 0x0079, 0x007a, 0x007b,
        0x007c, 0x007d, 0x007e, 0x007f,
        /* 0x80*/
        0x00c7, 0x00fc, 0x00e9, 0x00e2,
        0x00e4, 0x00e0, 0x00e5, 0x00e7,
        0x00ea, 0x00eb, 0x00e8, 0x00ef,
        0x00ee, 0x00ec, 0x00c4, 0x00c5,
        /* 0x90*/
        0x00c9, 0x00e6, 0x00c6, 0x00f4,
        0x00f6, 0x00f2, 0x00fb, 0x00f9,
        0x00ff, 0x00d6, 0x00dc, 0x00a2,
        0x00a3, 0x00a5, 0x20a7, 0x0192,
        /* 0xa0*/
        0x00e1, 0x00ed, 0x00f3, 0x00fa,
        0x00f1, 0x00d1, 0x00aa, 0x00ba,
        0x00bf, 0x2310, 0x00ac, 0x00bd,
        0x00bc, 0x00a1, 0x00ab, 0x00bb,
        /* 0xb0*/
        0x2591, 0x2592, 0x2593, 0x2502,
        0x2524, 0x2561, 0x2562, 0x2556,
        0x2555, 0x2563, 0x2551, 0x2557,
        0x255d, 0x255c, 0x255b, 0x2510,
        /* 0xc0*/
        0x2514, 0x2534, 0x252c, 0x251c,
        0x2500, 0x253c, 0x255e, 0x255f,
        0x255a, 0x2554, 0x2569, 0x2566,
        0x2560, 0x2550, 0x256c, 0x2567,
        /* 0xd0*/
        0x2568, 0x2564, 0x2565, 0x2559,
        0x2558, 0x2552, 0x2553, 0x256b,
        0x256a, 0x2518, 0x250c, 0x2588,
        0x2584, 0x258c, 0x2590, 0x2580,
        /* 0xe0*/
        0x03b1, 0x00df, 0x0393, 0x03c0,
        0x03a3, 0x03c3, 0x00b5, 0x03c4,
        0x03a6, 0x0398, 0x03a9, 0x03b4,
        0x221e, 0x03c6, 0x03b5, 0x2229,
        /* 0xf0*/
        0x2261, 0x00b1, 0x2265, 0x2264,
        0x2320, 0x2321, 0x00f7, 0x2248,
        0x00b0, 0x2219, 0x00b7, 0x221a,
        0x207f, 0x00b2, 0x25a0, 0x00a0,
};

/*
 * Reverse table for code points U+0000-U+00FF: indexed by the low byte
 * of the code point, each entry is the cp437 byte (the inverse of
 * charset2uni). The lower half is the identity mapping. In the upper
 * half, 0x00 marks a code point with no exact cp437 equivalent
 * (NOTE(review): how the lookup code treats 0x00 is outside this view).
 */
static const unsigned char page00[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
        0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, /* 0xa0-0xa7 */
        0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
        0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */
        0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */
        0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */
        0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
        0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */
        0x00, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */
        0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */
        0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */
        0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */
        0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98, /* 0xf8-0xff */
};

/*
 * Reverse table for code points U+0100-U+01FF, indexed by the low byte
 * of the code point. Only U+0192 has a cp437 equivalent; every entry
 * not listed is implicitly 0x00.
 */
static const unsigned char page01[256] = {
        [0x92] = 0x9f,
};

/*
 * Reverse table for code points U+0300-U+03FF, indexed by the low byte
 * of the code point. Only the code points listed here have a cp437
 * equivalent; every entry not listed is implicitly 0x00.
 */
static const unsigned char page03[256] = {
        [0x93] = 0xe2,
        [0x98] = 0xe9,
        [0xa3] = 0xe4,
        [0xa6] = 0xe8,
        [0xa9] = 0xea,
        [0xb1] = 0xe0,
        [0xb4] = 0xeb,
        [0xb5] = 0xee,
        [0xc0] = 0xe3,
        [0xc3] = 0xe5,
        [0xc4] = 0xe7,
        [0xc6] = 0xed,
};

/*
 * Reverse table for code points U+2000-U+20FF, indexed by the low byte
 * of the code point. Only U+207F and U+20A7 have a cp437 equivalent;
 * every entry not listed is implicitly 0x00.
 */
static const unsigned char page20[256] = {
        [0x7f] = 0xfc,
        [0xa7] = 0x9e,
};

/*
 * Reverse table for code points U+2200-U+22FF, indexed by the low byte
 * of the code point. Only the code points listed here have a cp437
 * equivalent; every entry not listed is implicitly 0x00.
 */
static const unsigned char page22[256] = {
        [0x19] = 0xf9,
        [0x1a] = 0xfb,
        [0x1e] = 0xec,
        [0x29] = 0xef,
        [0x48] = 0xf7,
        [0x61] = 0xf0,
        [0x64] = 0xf3,
        [0x65] = 0xf2,
};

/*
 * Reverse table for code points U+2300-U+23FF, indexed by the low byte
 * of the code point. Only U+2310, U+2320 and U+2321 have a cp437
 * equivalent; every entry not listed is implicitly 0x00.
 */
static const unsigned char page23[256] = {
        [0x10] = 0xa9,
        [0x20] = 0xf4,
        [0x21] = 0xf5,
};

/*
 * Reverse table for code points U+2500-U+25FF: indexed by the low byte
 * of the code point, each entry is the cp437 byte (the inverse of
 * charset2uni, e.g. U+2500 -> 0xc4). 0x00 marks a code point with no
 * exact cp437 equivalent; entries past the last initializer are
 * implicitly 0x00 as well.
 */
static const unsigned char page25[256] = {
        0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
        0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
        0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
        0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
        0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
        0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
        0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
        0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
        0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */
        0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */
        0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */
        0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */

        0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
        0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
        0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
        0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
};

/*
 * Top-level index of the Unicode -> cp437 reverse tables. The slot
 * number matches the high byte of the code points the page covers
 * (e.g. slot 0x25 holds page25 for U+25xx). Slots not listed are NULL,
 * meaning no code point in that range maps to cp437.
 */
static const unsigned char *const page_uni2charset[256] = {
        [0x00] = page00,
        [0x01] = page01,
        [0x03] = page03,
        [0x20] = page20,
        [0x22] = page22,
        [0x23] = page23,
        [0x25] = page25,
};

/*
 * Lowercase folding table for cp437, published through table.charset2lower.
 * Entry i holds the lowercase counterpart of byte i; bytes without a case
 * distinction map to themselves (e.g. 0x41-0x5a map to 0x61-0x7a while
 * 0x30-0x39 are unchanged).
 * NOTE(review): a few entries (0xe2, 0xe9, 0xea) are 0x00 - presumably
 * "no lowercase form"; confirm against the consumer of this table.
 */
static const unsigned char charset2lower[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
        0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x84, 0x86, /* 0x88-0x8f */
        0x82, 0x91, 0x91, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
        0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa4, 0xa6, 0xa7, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
        0xe0, 0xe1, 0x00, 0xe3, 0xe5, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
        0xed, 0x00, 0x00, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
};

/*
 * Uppercase folding table for cp437, published through table.charset2upper.
 * Entry i holds the uppercase counterpart of byte i; bytes without a case
 * distinction map to themselves (e.g. 0x61-0x7a map to 0x41-0x5a while
 * 0x30-0x39 are unchanged).
 * NOTE(review): many entries in the upper half are 0x00 - presumably
 * "no uppercase form"; confirm against the consumer of this table.
 */
static const unsigned char charset2upper[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
        0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x9a, 0x90, 0x00, 0x8e, 0x00, 0x8f, 0x80, /* 0x80-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x92, 0x92, 0x00, 0x99, 0x00, 0x00, 0x00, /* 0x90-0x97 */
        0x00, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x00, /* 0x98-0x9f */
        0x00, 0x00, 0x00, 0x00, 0xa5, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
        0x00, 0xe1, 0xe2, 0x00, 0xe4, 0xe4, 0x00, 0x00, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0x00, 0xec, 0xe8, 0x00, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
};

static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
{
        const unsigned char *uni2charset;
        unsigned char cl = uni & 0x00ff;
        unsigned char ch = (uni & 0xff00) >> 8;

        if (boundlen <= 0)
                return -ENAMETOOLONG;

        uni2charset = page_uni2charset[ch];
        if (uni2charset && uni2charset[cl])
                out[0] = uni2charset[cl];
        else
                return -EINVAL;
        return 1;
}

/*
 * char2uni - translate one cp437 byte into a Unicode code point.
 * @rawstring: input; only the first byte is read
 * @boundlen:  not consulted by this implementation
 * @uni:       receives the charset2uni[] entry for that byte, even when
 *             the lookup fails
 *
 * Return: 1 (the number of input bytes consumed) on success, or -EINVAL
 * when the table entry is 0x0000.
 */
static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
{
        const wchar_t mapped = charset2uni[*rawstring];

        *uni = mapped;
        return mapped == 0x0000 ? -EINVAL : 1;
}

/*
 * NLS descriptor for the "cp437" charset: bundles the two conversion
 * callbacks and the two case-folding tables defined in this file. It is
 * handed to register_nls()/unregister_nls() by the module init/exit hooks.
 */
static struct nls_table table = {
        .charset        = "cp437",
        .uni2char        = uni2char,
        .char2uni        = char2uni,
        .charset2lower        = charset2lower,
        .charset2upper        = charset2upper,
};

/*
 * Module init hook: register the cp437 table with the NLS core.
 *
 * Return: whatever register_nls() returns.
 */
static int __init init_nls_cp437(void)
{
        return register_nls(&table);
}

/* Module exit hook: withdraw the cp437 table registered at init time. */
static void __exit exit_nls_cp437(void)
{
        unregister_nls(&table);
}

/* Wire the init/exit hooks defined above into the module machinery. */
module_init(init_nls_cp437)
module_exit(exit_nls_cp437)

/* License tag recorded in the module's metadata. */
MODULE_LICENSE("Dual BSD/GPL");























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Module internals
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
 */

#include <linux/elf.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/mm.h>

/* Default ARCH_SHF_SMALL to 0 when nothing included earlier defines it. */
#ifndef ARCH_SHF_SMALL
#define ARCH_SHF_SMALL 0
#endif

/*
 * Use highest 4 bits of sh_entsize to store the mod_mem_type of this
 * section. This leaves 28 bits for offset on 32-bit systems, which is
 * about 256 MiB (WARN_ON_ONCE if we exceed that).
 *
 * SH_ENTSIZE_TYPE_SHIFT is the bit position where the type field starts,
 * SH_ENTSIZE_TYPE_MASK selects the type once it has been shifted down to
 * bit 0, and SH_ENTSIZE_OFFSET_MASK selects the remaining low bits.
 */

#define SH_ENTSIZE_TYPE_BITS        4
#define SH_ENTSIZE_TYPE_SHIFT        (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)
#define SH_ENTSIZE_TYPE_MASK        ((1UL << SH_ENTSIZE_TYPE_BITS) - 1)
#define SH_ENTSIZE_OFFSET_MASK        ((1UL << (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)) - 1)

/*
 * Maximum number of characters written by module_flags()
 * NOTE(review): the "+ 4" presumably covers the non-taint characters that
 * module_flags() emits plus the terminator - confirm against its body.
 */
#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)

/*
 * Element type of the __ksymtab / __ksymtab_gpl arrays declared below.
 *
 * With CONFIG_HAVE_ARCH_PREL32_RELOCATIONS each field is an int offset
 * rather than a full-width value; kernel_symbol_value() resolves
 * value_offset through offset_to_ptr(). Without it the entry stores the
 * value and the two string pointers directly.
 */
struct kernel_symbol {
#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
        int value_offset;
        int name_offset;        /* presumably resolved like value_offset - confirm */
        int namespace_offset;   /* presumably resolved like value_offset - confirm */
#else
        unsigned long value;
        const char *name;
        const char *namespace;
#endif
};

/*
 * Global module bookkeeping. mod_find() in this header walks 'modules' as
 * an RCU list and names module_mutex as the lock that also permits the
 * walk.
 */
extern struct mutex module_mutex;
extern struct list_head modules;

/* Array of modinfo attribute pointers and its element count (defined elsewhere). */
extern struct module_attribute *modinfo_attrs[];
extern size_t modinfo_attrs_count;

/*
 * Provided by the linker
 *
 * __start_X / __stop_X bracket the corresponding symbol-table section; the
 * __kcrctab arrays only have a start marker declared here.
 */
extern const struct kernel_symbol __start___ksymtab[];
extern const struct kernel_symbol __stop___ksymtab[];
extern const struct kernel_symbol __start___ksymtab_gpl[];
extern const struct kernel_symbol __stop___ksymtab_gpl[];
extern const s32 __start___kcrctab[];
extern const s32 __start___kcrctab_gpl[];

/*
 * Bookkeeping shared by the module-loading helpers declared in this header
 * (most of them take a struct load_info *).
 *
 * NOTE(review): the per-field notes marked "presumably" are inferred from
 * names and types only; confirm against the loader code that fills them in.
 */
struct load_info {
        const char *name;
        /* pointer to module in temporary copy, freed at end of load_module() */
        struct module *mod;
        Elf_Ehdr *hdr;                /* presumably the ELF header of the image being loaded */
        unsigned long len;        /* presumably the size of that image in bytes */
        Elf_Shdr *sechdrs;        /* presumably the section header table */
        char *secstrings, *strtab;
        unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
        bool sig_ok;
#ifdef CONFIG_KALLSYMS
        unsigned long mod_kallsyms_init_off;
#endif
#ifdef CONFIG_MODULE_DECOMPRESS
        /* compressed_len exists only with both MODULE_DECOMPRESS and MODULE_STATS */
#ifdef CONFIG_MODULE_STATS
        unsigned long compressed_len;
#endif
        struct page **pages;
        unsigned int max_pages;
        unsigned int used_pages;
#endif
        /* presumably section-header indices of the sections of interest */
        struct {
                unsigned int sym, str, mod, vers, info, pcpu;
        } index;
};

/*
 * License class reported in find_symbol_arg.license.
 * NOTE(review): presumably distinguishes symbols from the plain and the
 * _gpl symbol tables - confirm against find_symbol().
 */
enum mod_license {
        NOT_GPL_ONLY,
        GPL_ONLY,
};

/*
 * Argument block for find_symbol(): the caller fills in the "Input"
 * fields and reads back the "Output" fields.
 */
struct find_symbol_arg {
        /* Input */
        const char *name;        /* symbol name to look up */
        bool gplok;
        bool warn;

        /* Output */
        struct module *owner;
        const s32 *crc;
        const struct kernel_symbol *sym;
        enum mod_license license;
};

/*
 * Helpers shared between the module loader's source files; the definitions
 * are not part of this header.
 * NOTE(review): behaviour is not visible here - consult each definition
 * for its return-value and locking contract.
 */
int mod_verify_sig(const void *mod, struct load_info *info);
int try_to_force_load(struct module *mod, const char *reason);
/* Takes its inputs from, and reports its results in, @fsa (see struct find_symbol_arg). */
bool find_symbol(struct find_symbol_arg *fsa);
struct module *find_module_all(const char *name, size_t len, bool even_unformed);
int cmp_name(const void *name, const void *sym);
long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
                                Elf_Shdr *sechdr, unsigned int section);
/* See MODULE_FLAGS_BUF_SIZE for the documented bound on what is written to @buf. */
char *module_flags(struct module *mod, char *buf, bool show_state);
size_t module_flags_taint(unsigned long taints, char *buf);

char *module_next_tag_pair(char *string, unsigned long *secsize);

/*
 * Iterate @entry over the results of get_modinfo(@info, @name) followed by
 * repeated get_next_modinfo(@info, @name, @entry), stopping at the first
 * result that evaluates false. @info and @name are evaluated on every
 * iteration, so pass side-effect-free expressions.
 */
#define for_each_modinfo_entry(entry, info, name) \
        for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry))

/*
 * Debug assertion: warn once unless the caller is inside an RCU-sched
 * read-side section or holds module_mutex.
 *
 * Compiles to nothing without CONFIG_LOCKDEP, and the check is skipped
 * whenever debug_locks is zero.
 */
static inline void module_assert_mutex_or_preempt(void)
{
#ifdef CONFIG_LOCKDEP
        if (likely(debug_locks)) {
                WARN_ON_ONCE(!(rcu_read_lock_sched_held() ||
                               lockdep_is_held(&module_mutex)));
        }
#endif
}

/*
 * kernel_symbol_value - fetch the value stored in a symbol table entry.
 * @sym: entry to read
 *
 * With CONFIG_HAVE_ARCH_PREL32_RELOCATIONS the entry holds an offset,
 * which is turned into an address through offset_to_ptr(); otherwise the
 * stored value is returned unchanged.
 *
 * Return: the symbol's value as an unsigned long.
 */
static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
{
        unsigned long addr;

#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
        addr = (unsigned long)offset_to_ptr(&sym->value_offset);
#else
        addr = sym->value;
#endif
        return addr;
}

/*
 * Livepatch ELF helpers. With CONFIG_LIVEPATCH the real implementations
 * live elsewhere; without it copy_module_elf() reports success (0) and
 * free_module_elf() does nothing, so callers need no #ifdef of their own.
 */
#ifdef CONFIG_LIVEPATCH
int copy_module_elf(struct module *mod, struct load_info *info);
void free_module_elf(struct module *mod);
#else /* !CONFIG_LIVEPATCH */
static inline int copy_module_elf(struct module *mod, struct load_info *info)
{
        return 0;
}

static inline void free_module_elf(struct module *mod) { }
#endif /* CONFIG_LIVEPATCH */

/*
 * set_livepatch_module - flag @mod as a livepatch module.
 * @mod: module to flag
 *
 * Return: true after setting mod->klp when CONFIG_LIVEPATCH is enabled;
 * false, leaving @mod untouched, when it is not.
 */
static inline bool set_livepatch_module(struct module *mod)
{
        bool marked = false;

#ifdef CONFIG_LIVEPATCH
        mod->klp = true;
        marked = true;
#endif
        return marked;
}

/**
 * enum fail_dup_mod_reason - state at which a duplicate module was detected
 *
 * @FAIL_DUP_MOD_BECOMING: the module is read properly, passes all checks but
 *        we've determined that another module with the same name is already
 *        loaded or being processed on our &modules list. This happens on
 *        early_mod_check() right before layout_and_allocate(). The kernel
 *        would have already vmalloc()'d space for the entire module through
 *        finit_module(). If decompression was used two vmap() spaces were
 *        used. These failures can happen when userspace has not seen the
 *        module present on the kernel and tries to load the module multiple
 *        times at the same time.
 * @FAIL_DUP_MOD_LOAD: the module has been read properly, passes all validation
 *        checks and the kernel determines that the module was unique and
 *        because of this allocated yet another private kernel copy of the
 *        module space in layout_and_allocate() but after this determined in
 *        add_unformed_module() that another module with the same name is
 *        already loaded or being processed. These failures should be
 *        mitigated as much as possible and are indicative of really fast
 *        races in loading modules. Without module decompression they waste
 *        twice as much vmap space. With module decompression three times the
 *        module's size vmap space is wasted.
 */
enum fail_dup_mod_reason {
        FAIL_DUP_MOD_BECOMING = 0,
        FAIL_DUP_MOD_LOAD,
};

/*
 * Only declared with CONFIG_MODULE_DEBUGFS.
 * NOTE(review): presumably the debugfs directory under which module debug
 * files are created - confirm at the definition.
 */
#ifdef CONFIG_MODULE_DEBUGFS
extern struct dentry *mod_debugfs_root;
#endif

/*
 * Module load statistics.
 *
 * With CONFIG_MODULE_STATS the mod_stat_*() macros forward to the atomic
 * helpers and the counters below are real objects. Without it the macros
 * expand to nothing and the functions become empty inline stubs
 * (try_add_failed_module() then always returns 0), so call sites need no
 * #ifdef of their own.
 */
#ifdef CONFIG_MODULE_STATS

#define mod_stat_add_long(count, var) atomic_long_add(count, var)
#define mod_stat_inc(name) atomic_inc(name)

extern atomic_long_t total_mod_size;
extern atomic_long_t total_text_size;
extern atomic_long_t invalid_kread_bytes;
extern atomic_long_t invalid_decompress_bytes;

extern atomic_t modcount;
extern atomic_t failed_kreads;
extern atomic_t failed_decompress;
/*
 * List node keyed by module name.
 * NOTE(review): presumably one node per module name that failed to load as
 * a duplicate, with dup_fail_mask holding a bit per enum
 * fail_dup_mod_reason - confirm against try_add_failed_module().
 */
struct mod_fail_load {
        struct list_head list;
        char name[MODULE_NAME_LEN];
        atomic_long_t count;
        unsigned long dup_fail_mask;
};

int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason);
void mod_stat_bump_invalid(struct load_info *info, int flags);
void mod_stat_bump_becoming(struct load_info *info, int flags);

#else

/* Statistics disabled: both macros discard their arguments unevaluated. */
#define mod_stat_add_long(name, var)
#define mod_stat_inc(name)

static inline int try_add_failed_module(const char *name,
                                        enum fail_dup_mod_reason reason)
{
        return 0;
}

static inline void mod_stat_bump_invalid(struct load_info *info, int flags)
{
}

static inline void mod_stat_bump_becoming(struct load_info *info, int flags)
{
}

#endif /* CONFIG_MODULE_STATS */

/*
 * Duplicate module-request tracking. Without
 * CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS the lookup always answers false (and
 * never writes *dup_ret) and the announce hook does nothing.
 */
#ifdef CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS
bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret);
void kmod_dup_request_announce(char *module_name, int ret);
#else
static inline bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
{
        return false;
}

static inline void kmod_dup_request_announce(char *module_name, int ret)
{
}
#endif

/*
 * Tracking of tainted modules. Without
 * CONFIG_MODULE_UNLOAD_TAINT_TRACKING try_add_tainted_module() reports
 * success (0) without recording anything and the print helper is empty.
 */
#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING
/*
 * List node keyed by module name.
 * NOTE(review): presumably 'taints' is the module's taint mask and 'count'
 * the number of times it was recorded - confirm against
 * try_add_tainted_module().
 */
struct mod_unload_taint {
        struct list_head list;
        char name[MODULE_NAME_LEN];
        unsigned long taints;
        u64 count;
};

int try_add_tainted_module(struct module *mod);
void print_unloaded_tainted_modules(void);
#else /* !CONFIG_MODULE_UNLOAD_TAINT_TRACKING */
static inline int try_add_tainted_module(struct module *mod)
{
        return 0;
}

static inline void print_unloaded_tainted_modules(void)
{
}
#endif /* CONFIG_MODULE_UNLOAD_TAINT_TRACKING */

/*
 * Module decompression. Without CONFIG_MODULE_DECOMPRESS any attempt to
 * decompress fails with -EOPNOTSUPP and the cleanup hook is a no-op.
 */
#ifdef CONFIG_MODULE_DECOMPRESS
int module_decompress(struct load_info *info, const void *buf, size_t size);
void module_decompress_cleanup(struct load_info *info);
#else
static inline int module_decompress(struct load_info *info,
                                    const void *buf, size_t size)
{
        return -EOPNOTSUPP;
}

static inline void module_decompress_cleanup(struct load_info *info)
{
}
#endif

/*
 * Root object passed to mod_find().
 *
 * The latch tree is only present with CONFIG_MODULES_TREE_LOOKUP, and the
 * separate data bounds only with CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC.
 * NOTE(review): addr_min/addr_max presumably bound the addresses occupied
 * by loaded modules - confirm where they are updated.
 */
struct mod_tree_root {
#ifdef CONFIG_MODULES_TREE_LOOKUP
        struct latch_tree_root root;
#endif
        unsigned long addr_min;
        unsigned long addr_max;
#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
        unsigned long data_addr_min;
        unsigned long data_addr_max;
#endif
};

/* The single global instance (defined elsewhere). */
extern struct mod_tree_root mod_tree;

/*
 * Address-to-module lookup. With CONFIG_MODULES_TREE_LOOKUP the tree
 * maintenance functions are implemented elsewhere; without it they are
 * empty stubs and mod_find() falls back to walking the module list.
 */
#ifdef CONFIG_MODULES_TREE_LOOKUP
void mod_tree_insert(struct module *mod);
void mod_tree_remove_init(struct module *mod);
void mod_tree_remove(struct module *mod);
struct module *mod_find(unsigned long addr, struct mod_tree_root *tree);
#else /* !CONFIG_MODULES_TREE_LOOKUP */

static inline void mod_tree_insert(struct module *mod) { }
static inline void mod_tree_remove_init(struct module *mod) { }
static inline void mod_tree_remove(struct module *mod) { }
/*
 * mod_find - list-walk fallback used when there is no lookup tree.
 * @addr: address to resolve
 * @tree: unused by this variant; kept so both variants share a signature
 *
 * Walks the global 'modules' list as an RCU list (module_mutex is named as
 * the lock that also permits the walk) and stops at the first module for
 * which within_module() accepts @addr.
 *
 * Return: the matching module, or NULL when no module contains @addr.
 */
static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
{
        struct module *candidate;

        list_for_each_entry_rcu(candidate, &modules, list,
                                lockdep_is_held(&module_mutex)) {
                if (within_module(addr, candidate))
                        return candidate;
        }

        /* Reached the end of the list without a hit. */
        return NULL;
}
#endif /* CONFIG_MODULES_TREE_LOOKUP */

/*
 * Memory-protection helpers for module regions; the definitions are not
 * part of this header.
 * NOTE(review): the int results are presumably 0 or a negative errno -
 * confirm at the definitions.
 */
int module_enable_rodata_ro(const struct module *mod, bool after_init);
int module_enable_data_nx(const struct module *mod);
int module_enable_text_rox(const struct module *mod);
int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
                                char *secstrings, struct module *mod);

/*
 * Signature checking. Without CONFIG_MODULE_SIG the check always reports
 * success (0).
 */
#ifdef CONFIG_MODULE_SIG
int module_sig_check(struct load_info *info, int flags);
#else /* !CONFIG_MODULE_SIG */
static inline int module_sig_check(struct load_info *info, int flags)
{
        return 0;
}
#endif /* !CONFIG_MODULE_SIG */

/* kmemleak hook for module loading; an empty stub without CONFIG_DEBUG_KMEMLEAK. */
#ifdef CONFIG_DEBUG_KMEMLEAK
void kmemleak_load_module(const struct module *mod, const struct load_info *info);
#else /* !CONFIG_DEBUG_KMEMLEAK */
static inline void kmemleak_load_module(const struct module *mod,
                                        const struct load_info *info) { }
#endif /* CONFIG_DEBUG_KMEMLEAK */

/*
 * kallsyms support for modules. These three are implemented elsewhere
 * when CONFIG_KALLSYMS is set and are empty stubs otherwise.
 */
#ifdef CONFIG_KALLSYMS
void init_build_id(struct module *mod, const struct load_info *info);
void layout_symtab(struct module *mod, struct load_info *info);
void add_kallsyms(struct module *mod, const struct load_info *info);

/*
 * sect_empty - does a section contribute nothing to the loaded image?
 * @sect: section header to inspect
 *
 * Return: true when the section lacks SHF_ALLOC or has a size of zero.
 */
static inline bool sect_empty(const Elf_Shdr *sect)
{
        if (!(sect->sh_flags & SHF_ALLOC))
                return true;

        return sect->sh_size == 0;
}
#else /* !CONFIG_KALLSYMS */
/* No kallsyms support: all three hooks compile away to nothing. */
static inline void init_build_id(struct module *mod, const struct load_info *info) { }
static inline void layout_symtab(struct module *mod, struct load_info *info) { }
static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
#endif /* CONFIG_KALLSYMS */

/*
 * sysfs integration. Without CONFIG_SYSFS setup reports success (0) and
 * the teardown and parameter-lock helpers do nothing.
 */
#ifdef CONFIG_SYSFS
int mod_sysfs_setup(struct module *mod, const struct load_info *info,
                    struct kernel_param *kparam, unsigned int num_params);
void mod_sysfs_teardown(struct module *mod);
void init_param_lock(struct module *mod);
#else /* !CONFIG_SYSFS */
static inline int mod_sysfs_setup(struct module *mod,
                                     const struct load_info *info,
                                     struct kernel_param *kparam,
                                     unsigned int num_params)
{
        return 0;
}

static inline void mod_sysfs_teardown(struct module *mod) { }
static inline void init_param_lock(struct module *mod) { }
#endif /* CONFIG_SYSFS */

/*
 * Symbol version (CRC) checking. With CONFIG_MODVERSIONS the checks are
 * implemented elsewhere; without it both check_*() helpers unconditionally
 * return 1.
 * NOTE(review): 1 presumably means "versions match" - confirm against the
 * CONFIG_MODVERSIONS implementations and their callers.
 */
#ifdef CONFIG_MODVERSIONS
int check_version(const struct load_info *info,
                  const char *symname, struct module *mod, const s32 *crc);
void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp,
                   struct kernel_symbol *ks, struct tracepoint * const *tp);
int check_modstruct_version(const struct load_info *info, struct module *mod);
int same_magic(const char *amagic, const char *bmagic, bool has_crcs);
#else /* !CONFIG_MODVERSIONS */
static inline int check_version(const struct load_info *info,
                                const char *symname,
                                struct module *mod,
                                const s32 *crc)
{
        return 1;
}

static inline int check_modstruct_version(const struct load_info *info,
                                          struct module *mod)
{
        return 1;
}

/*
 * same_magic - compare two magic strings (variant without MODVERSIONS).
 * @amagic:   first string
 * @bmagic:   second string
 * @has_crcs: ignored by this variant
 *
 * Return: 1 when the two strings are byte-for-byte identical, 0 otherwise.
 */
static inline int same_magic(const char *amagic, const char *bmagic, bool has_crcs)
{
        /* strcmp() yields 0 only for identical strings. */
        return !strcmp(amagic, bmagic);
}
#endif /* CONFIG_MODVERSIONS */















































































































































































































































































    4 



    6 

    1 
    5 

    4 
    1 


    6 

    5 





















































































































































































































































































































































    2 
    3 
    2 

























    1 



    1 
    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HIGHMEM_H
#define _LINUX_HIGHMEM_H

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/cacheflush.h>
#include <linux/kmsan.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>

#include "highmem-internal.h"

/**
 * kmap - Map a page for long term usage
 * @page:        Pointer to the page to be mapped
 *
 * Returns: The virtual address of the mapping
 *
 * Can only be invoked from preemptible task context because on 32-bit
 * systems with CONFIG_HIGHMEM enabled this function might sleep.
 *
 * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area
 * this returns the virtual address of the direct kernel mapping.
 *
 * The returned virtual address is globally visible and valid up to the
 * point where it is unmapped via kunmap(). The pointer can be handed to
 * other contexts.
 *
 * For highmem pages on 32-bit systems this can be slow as the mapping space
 * is limited and protected by a global lock. If no mapping slot is
 * available, the function blocks until a slot is released via kunmap().
 */
static inline void *kmap(struct page *page);

/**
 * kunmap - Unmap the virtual address mapped by kmap()
 * @page:        Pointer to the page which was mapped by kmap()
 *
 * Counterpart to kmap(). A no-op for CONFIG_HIGHMEM=n and for mappings of
 * pages in the low memory area.
 *
 * Note that this takes the page that was passed to kmap(), not the
 * address that kmap() returned.
 */
static inline void kunmap(struct page *page);

/**
 * kmap_to_page - Get the page for a kmap'ed address
 * @addr:        The address to look up
 *
 * This is the reverse lookup for an address obtained from kmap().
 *
 * Returns: The page which is mapped to @addr.
 */
static inline struct page *kmap_to_page(void *addr);

/**
 * kmap_flush_unused - Flush all unused kmap mappings in order to
 *                       remove stray mappings
 *
 * Takes no arguments and returns nothing.
 */
static inline void kmap_flush_unused(void);

/**
 * kmap_local_page - Map a page for temporary usage
 * @page: Pointer to the page to be mapped
 *
 * Returns: The virtual address of the mapping
 *
 * Can be invoked from any context, including interrupts.
 *
 * Requires careful handling when nesting multiple mappings because the map
 * management is stack based. The unmap has to be in the reverse order of
 * the map operation::
 *
 *   addr1 = kmap_local_page(page1);
 *   addr2 = kmap_local_page(page2);
 *   ...
 *   kunmap_local(addr2);
 *   kunmap_local(addr1);
 *
 * Unmapping addr1 before addr2 is invalid and causes malfunction.
 *
 * Contrary to kmap() mappings the mapping is only valid in the context of
 * the caller and cannot be handed to other contexts.
 *
 * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the
 * virtual address of the direct mapping. Only real highmem pages are
 * temporarily mapped.
 *
 * While kmap_local_page() is significantly faster than kmap() for the highmem
 * case it comes with restrictions about the pointer validity.
 *
 * On HIGHMEM enabled systems mapping a highmem page has the side effect of
 * disabling migration in order to keep the virtual address stable across
 * preemption. No caller of kmap_local_page() can rely on this side effect.
 */
static inline void *kmap_local_page(struct page *page);

/**
 * kmap_local_folio - Map a page in this folio for temporary usage
 * @folio: The folio containing the page.
 * @offset: The byte offset within the folio which identifies the page.
 *
 * Requires careful handling when nesting multiple mappings because the map
 * management is stack based. The unmap has to be in the reverse order of
 * the map operation::
 *
 *   addr1 = kmap_local_folio(folio1, offset1);
 *   addr2 = kmap_local_folio(folio2, offset2);
 *   ...
 *   kunmap_local(addr2);
 *   kunmap_local(addr1);
 *
 * Unmapping addr1 before addr2 is invalid and causes malfunction.
 *
 * Contrary to kmap() mappings the mapping is only valid in the context of
 * the caller and cannot be handed to other contexts.
 *
 * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the
 * virtual address of the direct mapping. Only real highmem pages are
 * temporarily mapped.
 *
 * While kmap_local_folio() is significantly faster than kmap() for the
 * highmem case it comes with restrictions about the pointer validity.
 *
 * On HIGHMEM enabled systems mapping a highmem page has the side effect of
 * disabling migration in order to keep the virtual address stable across
 * preemption. No caller of kmap_local_folio() can rely on this side effect.
 *
 * Context: Can be invoked from any context.
 * Return: The virtual address of @offset.
 */
static inline void *kmap_local_folio(struct folio *folio, size_t offset);

/**
 * kmap_atomic - Atomically map a page for temporary usage - Deprecated!
 * @page:        Pointer to the page to be mapped
 *
 * Returns: The virtual address of the mapping
 *
 * In fact a wrapper around kmap_local_page() which also disables pagefaults
 * and, depending on PREEMPT_RT configuration, also CPU migration and
 * preemption. Therefore users should not count on the latter two side effects.
 *
 * Mappings should always be released by kunmap_atomic().
 *
 * Do not use in new code. Use kmap_local_page() instead.
 *
 * It is used in atomic context when code wants to access the contents of a
 * page that might be allocated from high memory (see __GFP_HIGHMEM), for
 * example a page in the pagecache.  The API has two functions, and they
 * can be used in a manner similar to the following::
 *
 *   // Find the page of interest.
 *   struct page *page = find_get_page(mapping, offset);
 *
 *   // Gain access to the contents of that page.
 *   void *vaddr = kmap_atomic(page);
 *
 *   // Do something to the contents of that page.
 *   memset(vaddr, 0, PAGE_SIZE);
 *
 *   // Unmap that page.
 *   kunmap_atomic(vaddr);
 *
 * Note that the kunmap_atomic() call takes the result of the kmap_atomic()
 * call, not the argument.
 *
 * If you need to map two pages because you want to copy from one page to
 * another you need to keep the kmap_atomic calls strictly nested, like::
 *
 *   vaddr1 = kmap_atomic(page1);
 *   vaddr2 = kmap_atomic(page2);
 *
 *   memcpy(vaddr1, vaddr2, PAGE_SIZE);
 *
 *   kunmap_atomic(vaddr2);
 *   kunmap_atomic(vaddr1);
 */
static inline void *kmap_atomic(struct page *page);

/*
 * Highmem related interfaces for management code
 *
 * Forward declarations only; the definitions are not in this part of the
 * file.
 * NOTE(review): going by the names these presumably report the number of
 * free and of total highmem pages - confirm at the definitions.
 */
static inline unsigned int nr_free_highpages(void);
static inline unsigned long totalhigh_pages(void);

/*
 * Default flush_anon_page(): does nothing. Only compiled when
 * ARCH_HAS_FLUSH_ANON_PAGE is not defined.
 */
#ifndef ARCH_HAS_FLUSH_ANON_PAGE
static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr)
{
}
#endif

/*
 * Default vmap-range cache maintenance hooks: both do nothing. Only
 * compiled when ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE is not defined.
 */
#ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE
static inline void flush_kernel_vmap_range(void *vaddr, int size)
{
}
static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
{
}
#endif

/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */
#ifndef clear_user_highpage
/*
 * clear_user_highpage - zero a page destined for user space.
 * @page:  page to clear
 * @vaddr: user virtual address, forwarded to clear_user_page()
 *
 * Maps @page with kmap_local_page(), clears it through clear_user_page()
 * and drops the mapping again. Skipped when a clear_user_highpage macro
 * is already defined.
 */
static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
{
        void *kaddr = kmap_local_page(page);

        clear_user_page(kaddr, vaddr, page);
        kunmap_local(kaddr);
}
#endif

#ifndef vma_alloc_zeroed_movable_folio
/**
 * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA.
 * @vma: The VMA the page is to be allocated for.
 * @vaddr: The virtual address the page will be inserted into.
 *
 * Allocates an order-0 folio with GFP_HIGHUSER_MOVABLE for @vma at @vaddr
 * and clears its page with clear_user_highpage().  The page may therefore
 * come from highmem or the movable zone.  An architecture may provide its
 * own implementation.
 *
 * Return: A folio containing one allocated and zeroed page or NULL if
 * we are out of memory.
 */
static inline
struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
                                   unsigned long vaddr)
{
        struct folio *new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
                                                  vma, vaddr, false);

        /* Allocation failed: nothing to clear. */
        if (!new_folio)
                return NULL;

        clear_user_highpage(&new_folio->page, vaddr);
        return new_folio;
}
#endif

/*
 * clear_highpage - zero one page through a temporary local mapping.
 * @page: page to clear.
 *
 * The page is mapped with kmap_local_page(), cleared with clear_page() and
 * unmapped again before returning.
 */
static inline void clear_highpage(struct page *page)
{
        void *mapping;

        mapping = kmap_local_page(page);
        clear_page(mapping);
        kunmap_local(mapping);
}

/*
 * clear_highpage_kasan_tagged - zero one page through a tag-reset address.
 * @page: page to clear.
 *
 * Same sequence as clear_highpage(), except that clear_page() is given the
 * result of kasan_reset_tag() on the mapped address.  The original address
 * returned by kmap_local_page() is the one handed to kunmap_local().
 */
static inline void clear_highpage_kasan_tagged(struct page *page)
{
        void *mapped = kmap_local_page(page);
        void *untagged = kasan_reset_tag(mapped);

        clear_page(untagged);
        kunmap_local(mapped);
}

#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE

/*
 * Default tag_clear_highpage(): an empty stub that ignores @page.  It is
 * only compiled when __HAVE_ARCH_TAG_CLEAR_HIGHPAGE is not defined.
 */
static inline void tag_clear_highpage(struct page *page)
{
}

#endif

/*
 * zero_user_segments - zero up to two byte ranges of a page.
 *
 * If we pass in a base or tail page, we can zero up to PAGE_SIZE.
 * If we pass in a head page, we can zero up to the size of the compound page.
 *
 * Each range is [start, end); a range whose end is not above its start is
 * skipped.  With CONFIG_HIGHMEM only the prototype is visible here; otherwise
 * the inline version below is used, which BUG()s if either end exceeds
 * page_size(page) and flushes the dcache of every page counted by
 * compound_nr() once the zeroing is done.
 */
#ifdef CONFIG_HIGHMEM
void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
                unsigned start2, unsigned end2);
#else
static inline void zero_user_segments(struct page *page,
                unsigned start1, unsigned end1,
                unsigned start2, unsigned end2)
{
        char *base = kmap_local_page(page);
        unsigned int idx = 0;

        BUG_ON(end1 > page_size(page) || end2 > page_size(page));

        /* First range, skipped when empty. */
        if (start1 < end1)
                memset(base + start1, 0, end1 - start1);

        /* Second range, skipped when empty. */
        if (start2 < end2)
                memset(base + start2, 0, end2 - start2);

        kunmap_local(base);

        while (idx < compound_nr(page)) {
                flush_dcache_page(page + idx);
                idx++;
        }
}
#endif

/*
 * zero_user_segment - zero the single byte range [start, end) of a page.
 *
 * Thin wrapper around zero_user_segments() with an empty second range.
 */
static inline void zero_user_segment(struct page *page, unsigned start,
                                     unsigned end)
{
        zero_user_segments(page, start, end, 0, 0);
}

/*
 * zero_user - zero @size bytes of a page starting at byte @start.
 *
 * Converts the (start, size) pair into an end offset and forwards it to
 * zero_user_segments() with an empty second range.
 */
static inline void zero_user(struct page *page, unsigned start, unsigned size)
{
        unsigned end = start + size;

        zero_user_segments(page, start, end, 0, 0);
}

#ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE

/*
 * copy_user_highpage - copy one page into another via copy_user_page().
 * @to: destination page.
 * @from: source page.
 * @vaddr: address handed through unchanged to copy_user_page().
 * @vma: not used by this generic implementation.
 *
 * Both pages are mapped with kmap_local_page() (source first), the copy is
 * done, kmsan_unpoison_memory() is called on the destination, and the
 * mappings are released in the reverse order they were taken.
 */
static inline void copy_user_highpage(struct page *to, struct page *from,
        unsigned long vaddr, struct vm_area_struct *vma)
{
        char *src_kaddr = kmap_local_page(from);
        char *dst_kaddr = kmap_local_page(to);

        copy_user_page(dst_kaddr, src_kaddr, vaddr, to);
        kmsan_unpoison_memory(page_address(to), PAGE_SIZE);

        /* Unmap in reverse order of mapping. */
        kunmap_local(dst_kaddr);
        kunmap_local(src_kaddr);
}

#endif

#ifndef __HAVE_ARCH_COPY_HIGHPAGE

/*
 * copy_highpage - copy the contents of one page into another.
 * @to: destination page.
 * @from: source page.
 *
 * Both pages are mapped with kmap_local_page() (source first), copied with
 * copy_page(), kmsan_copy_page_meta() is called for the pair, and the
 * mappings are released in the reverse order they were taken.
 */
static inline void copy_highpage(struct page *to, struct page *from)
{
        char *src_kaddr = kmap_local_page(from);
        char *dst_kaddr = kmap_local_page(to);

        copy_page(dst_kaddr, src_kaddr);
        kmsan_copy_page_meta(to, from);

        /* Unmap in reverse order of mapping. */
        kunmap_local(dst_kaddr);
        kunmap_local(src_kaddr);
}

#endif

#ifdef copy_mc_to_kernel
/*
 * If architecture supports machine check exception handling, define the
 * #MC versions of copy_user_highpage and copy_highpage. They copy a memory
 * page with #MC in source page (@from) handled, and return the number
 * of bytes not copied if there was a #MC, otherwise 0 for success.
 *
 * This variant copies PAGE_SIZE bytes with copy_mc_to_kernel() and calls
 * kmsan_unpoison_memory() on the destination only when the whole page was
 * copied.  @vaddr and @vma are not used here.
 */
static inline int copy_mc_user_highpage(struct page *to, struct page *from,
                                        unsigned long vaddr, struct vm_area_struct *vma)
{
        char *src = kmap_local_page(from);
        char *dst = kmap_local_page(to);
        unsigned long uncopied = copy_mc_to_kernel(dst, src, PAGE_SIZE);

        if (uncopied == 0)
                kmsan_unpoison_memory(page_address(to), PAGE_SIZE);

        /* Unmap in reverse order of mapping. */
        kunmap_local(dst);
        kunmap_local(src);

        return uncopied;
}

/*
 * copy_mc_highpage - copy a page using copy_mc_to_kernel().
 * @to: destination page.
 * @from: source page.
 *
 * Copies PAGE_SIZE bytes between temporary local mappings of the two pages.
 * kmsan_copy_page_meta() is called only when the whole page was copied.
 *
 * Return: the value returned by copy_mc_to_kernel(), 0 when everything was
 * copied.
 */
static inline int copy_mc_highpage(struct page *to, struct page *from)
{
        char *src = kmap_local_page(from);
        char *dst = kmap_local_page(to);
        unsigned long uncopied = copy_mc_to_kernel(dst, src, PAGE_SIZE);

        if (uncopied == 0)
                kmsan_copy_page_meta(to, from);

        /* Unmap in reverse order of mapping. */
        kunmap_local(dst);
        kunmap_local(src);

        return uncopied;
}
#else
/*
 * Fallback copy_mc_user_highpage() for builds where copy_mc_to_kernel is not
 * defined: performs a plain copy_user_highpage() and always reports success.
 *
 * Return: 0.
 */
static inline int copy_mc_user_highpage(struct page *to, struct page *from,
                unsigned long vaddr, struct vm_area_struct *vma)
{
        copy_user_highpage(to, from, vaddr, vma);

        return 0;
}

/*
 * Fallback copy_mc_highpage() for builds where copy_mc_to_kernel is not
 * defined: performs a plain copy_highpage() and always reports success.
 *
 * Return: 0.
 */
static inline int copy_mc_highpage(struct page *to, struct page *from)
{
        copy_highpage(to, from);

        return 0;
}
#endif

/*
 * memcpy_page - copy bytes from one page to another.
 * @dst_page: destination page.
 * @dst_off: byte offset into @dst_page.
 * @src_page: source page.
 * @src_off: byte offset into @src_page.
 * @len: number of bytes to copy.
 *
 * Both pages are mapped with kmap_local_page() (destination first) and
 * unmapped in reverse order.  VM_BUG_ON() checks that neither range extends
 * past PAGE_SIZE.
 */
static inline void memcpy_page(struct page *dst_page, size_t dst_off,
                               struct page *src_page, size_t src_off,
                               size_t len)
{
        char *dst_kaddr = kmap_local_page(dst_page);
        char *src_kaddr = kmap_local_page(src_page);

        VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE);
        memcpy(&dst_kaddr[dst_off], &src_kaddr[src_off], len);

        /* Unmap in reverse order of mapping. */
        kunmap_local(src_kaddr);
        kunmap_local(dst_kaddr);
}

/*
 * memset_page - fill part of a page with a byte value.
 * @page: page to write to.
 * @offset: byte offset into @page at which to start.
 * @val: value passed to memset().
 * @len: number of bytes to set.
 *
 * The page is mapped with kmap_local_page() for the duration of the call.
 * VM_BUG_ON() checks that the range does not extend past PAGE_SIZE.
 */
static inline void memset_page(struct page *page, size_t offset, int val,
                               size_t len)
{
        char *kaddr = kmap_local_page(page);

        VM_BUG_ON(offset + len > PAGE_SIZE);
        memset(&kaddr[offset], val, len);
        kunmap_local(kaddr);
}

/*
 * memcpy_from_page - copy bytes out of a page into a buffer.
 * @to: destination buffer.
 * @page: page to read from.
 * @offset: byte offset into @page at which to start reading.
 * @len: number of bytes to copy.
 *
 * The page is mapped with kmap_local_page() for the duration of the call.
 * VM_BUG_ON() checks that the range does not extend past PAGE_SIZE.
 */
static inline void memcpy_from_page(char *to, struct page *page,
                                    size_t offset, size_t len)
{
        char *kaddr = kmap_local_page(page);

        VM_BUG_ON(offset + len > PAGE_SIZE);
        memcpy(to, &kaddr[offset], len);
        kunmap_local(kaddr);
}

/*
 * memcpy_to_page - copy bytes from a buffer into a page.
 * @page: page to write to.
 * @offset: byte offset into @page at which to start writing.
 * @from: source buffer.
 * @len: number of bytes to copy.
 *
 * The page is mapped with kmap_local_page() for the duration of the call and
 * flush_dcache_page() is called after the copy, before the mapping is
 * dropped.  VM_BUG_ON() checks that the range does not extend past PAGE_SIZE.
 */
static inline void memcpy_to_page(struct page *page, size_t offset,
                                  const char *from, size_t len)
{
        char *kaddr = kmap_local_page(page);

        VM_BUG_ON(offset + len > PAGE_SIZE);
        memcpy(&kaddr[offset], from, len);
        flush_dcache_page(page);
        kunmap_local(kaddr);
}

/*
 * memzero_page - zero part of a page.
 * @page: page to write to.
 * @offset: byte offset into @page at which to start.
 * @len: number of bytes to zero.
 *
 * The page is mapped with kmap_local_page() for the duration of the call and
 * flush_dcache_page() is called after the zeroing, before the mapping is
 * dropped.  VM_BUG_ON() checks that the range does not extend past PAGE_SIZE.
 */
static inline void memzero_page(struct page *page, size_t offset, size_t len)
{
        char *kaddr = kmap_local_page(page);

        VM_BUG_ON(offset + len > PAGE_SIZE);
        memset(&kaddr[offset], 0, len);
        flush_dcache_page(page);
        kunmap_local(kaddr);
}

/**
 * memcpy_from_folio - Copy a range of bytes from a folio.
 * @to: The memory to copy to.
 * @folio: The folio to read from.
 * @offset: The first byte in the folio to read.
 * @len: The number of bytes to copy.
 *
 * The folio is mapped piecewise with kmap_local_folio().  For a highmem
 * folio each piece is limited to the bytes remaining in the current page;
 * otherwise everything is copied in a single pass.  The body always runs at
 * least once, even when @len is 0.
 */
static inline void memcpy_from_folio(char *to, struct folio *folio,
                size_t offset, size_t len)
{
        VM_BUG_ON(offset + len > folio_size(folio));

        for (;;) {
                const char *src = kmap_local_folio(folio, offset);
                size_t nbytes = len;

                if (folio_test_highmem(folio)) {
                        size_t room = PAGE_SIZE - offset_in_page(offset);

                        /* Do not read past the page that is mapped. */
                        if (nbytes > room)
                                nbytes = room;
                }
                memcpy(to, src, nbytes);
                kunmap_local(src);

                to += nbytes;
                offset += nbytes;
                len -= nbytes;
                if (!len)
                        break;
        }
}

/**
 * memcpy_to_folio - Copy a range of bytes to a folio.
 * @folio: The folio to write to.
 * @offset: The first byte in the folio to store to.
 * @from: The memory to copy from.
 * @len: The number of bytes to copy.
 *
 * The folio is mapped piecewise with kmap_local_folio().  For a highmem
 * folio each piece is limited to the bytes remaining in the current page;
 * otherwise everything is copied in a single pass.  The body always runs at
 * least once, even when @len is 0.  flush_dcache_folio() is called once all
 * data has been written.
 */
static inline void memcpy_to_folio(struct folio *folio, size_t offset,
                const char *from, size_t len)
{
        VM_BUG_ON(offset + len > folio_size(folio));

        for (;;) {
                char *dst = kmap_local_folio(folio, offset);
                size_t nbytes = len;

                if (folio_test_highmem(folio)) {
                        size_t room = PAGE_SIZE - offset_in_page(offset);

                        /* Do not write past the page that is mapped. */
                        if (nbytes > room)
                                nbytes = room;
                }
                memcpy(dst, from, nbytes);
                kunmap_local(dst);

                from += nbytes;
                offset += nbytes;
                len -= nbytes;
                if (!len)
                        break;
        }

        flush_dcache_folio(folio);
}

/**
 * folio_zero_tail - Zero the tail of a folio.
 * @folio: The folio to zero.
 * @offset: The byte offset in the folio to start zeroing at.
 * @kaddr: The address the folio is currently mapped to.
 *
 * If you have already used kmap_local_folio() to map a folio, written
 * some data to it and now need to zero the end of the folio (and flush
 * the dcache), you can use this function.  If you do not have the
 * folio kmapped (eg the folio has been partially populated by DMA),
 * use folio_zero_range() or folio_zero_segment() instead.
 *
 * For a highmem folio the tail is zeroed one page at a time: the current
 * mapping is dropped and the next page mapped whenever more than the rest of
 * the mapped page remains.  The mapping that is live at the end is returned.
 *
 * Return: An address which can be passed to kunmap_local().
 */
static inline __must_check void *folio_zero_tail(struct folio *folio,
                size_t offset, void *kaddr)
{
        size_t remaining = folio_size(folio) - offset;

        if (folio_test_highmem(folio)) {
                /* Only the first step can start in the middle of a page. */
                size_t span = PAGE_SIZE - offset_in_page(offset);

                for (; remaining > span; span = PAGE_SIZE) {
                        memset(kaddr, 0, span);
                        kunmap_local(kaddr);
                        offset += span;
                        remaining -= span;
                        kaddr = kmap_local_folio(folio, offset);
                }
        }

        memset(kaddr, 0, remaining);
        flush_dcache_folio(folio);

        return kaddr;
}

/**
 * folio_fill_tail - Copy some data to a folio and pad with zeroes.
 * @folio: The destination folio.
 * @offset: The offset into @folio at which to start copying.
 * @from: The data to copy.
 * @len: How many bytes of data to copy.
 *
 * This function is most useful for filesystems which support inline data.
 * When they want to copy data from the inode into the page cache, this
 * function does everything for them.  It supports large folios even on
 * HIGHMEM configurations.
 *
 * The data is copied through kmap_local_folio() mappings (page by page for a
 * highmem folio); the rest of the folio is then zeroed by folio_zero_tail()
 * and the mapping it returns is released.
 */
static inline void folio_fill_tail(struct folio *folio, size_t offset,
                const char *from, size_t len)
{
        char *dst = kmap_local_folio(folio, offset);

        VM_BUG_ON(offset + len > folio_size(folio));

        if (folio_test_highmem(folio)) {
                /* Only the first step can start in the middle of a page. */
                size_t span = PAGE_SIZE - offset_in_page(offset);

                while (len > span) {
                        memcpy(dst, from, span);
                        kunmap_local(dst);
                        from += span;
                        offset += span;
                        len -= span;
                        span = PAGE_SIZE;
                        dst = kmap_local_folio(folio, offset);
                }
        }

        memcpy(dst, from, len);

        /* Zero what is left and pick up whichever mapping is now live. */
        dst = folio_zero_tail(folio, offset + len, dst + len);
        kunmap_local(dst);
}

/**
 * memcpy_from_file_folio - Copy some bytes from a file folio.
 * @to: The destination buffer.
 * @folio: The folio to copy from.
 * @pos: The position in the file.
 * @len: The maximum number of bytes to copy.
 *
 * Copy up to @len bytes from this folio.  This may be limited by PAGE_SIZE
 * if the folio comes from HIGHMEM, and by the size of the folio.
 *
 * Return: The number of bytes copied from the folio.
 */
static inline size_t memcpy_from_file_folio(char *to, struct folio *folio,
                loff_t pos, size_t len)
{
        size_t off = offset_in_folio(folio, pos);
        char *src = kmap_local_folio(folio, off);
        size_t limit;

        /* Highmem: stay within the mapped page; otherwise within the folio. */
        if (folio_test_highmem(folio))
                limit = PAGE_SIZE - offset_in_page(off);
        else
                limit = folio_size(folio) - off;

        if (len > limit)
                len = limit;

        memcpy(to, src, len);
        kunmap_local(src);

        return len;
}

/**
 * folio_zero_segments() - Zero two byte ranges in a folio.
 * @folio: The folio to write to.
 * @start1: The first byte to zero.
 * @xend1: One more than the last byte in the first range.
 * @start2: The first byte to zero in the second range.
 * @xend2: One more than the last byte in the second range.
 *
 * Forwards to zero_user_segments() on the folio's embedded page.
 */
static inline void folio_zero_segments(struct folio *folio,
                size_t start1, size_t xend1, size_t start2, size_t xend2)
{
        struct page *head = &folio->page;

        zero_user_segments(head, start1, xend1, start2, xend2);
}

/**
 * folio_zero_segment() - Zero a byte range in a folio.
 * @folio: The folio to write to.
 * @start: The first byte to zero.
 * @xend: One more than the last byte to zero.
 *
 * Forwards to zero_user_segments() on the folio's embedded page with an
 * empty second range.
 */
static inline void folio_zero_segment(struct folio *folio,
                size_t start, size_t xend)
{
        struct page *head = &folio->page;

        zero_user_segments(head, start, xend, 0, 0);
}

/**
 * folio_zero_range() - Zero a byte range in a folio.
 * @folio: The folio to write to.
 * @start: The first byte to zero.
 * @length: The number of bytes to zero.
 *
 * Converts (@start, @length) into an end offset and forwards to
 * zero_user_segments() on the folio's embedded page with an empty second
 * range.
 */
static inline void folio_zero_range(struct folio *folio,
                size_t start, size_t length)
{
        size_t xend = start + length;

        zero_user_segments(&folio->page, start, xend, 0, 0);
}

/**
 * folio_release_kmap - Unmap a folio and drop a refcount.
 * @folio: The folio to release.
 * @addr: The address previously returned by a call to kmap_local_folio().
 *
 * It is common, eg in directory handling to kmap a folio.  This function
 * unmaps the folio and drops the refcount that was being held to keep the
 * folio alive while we accessed it.
 *
 * The mapping is released first; folio_put() is called afterwards.
 */
static inline void folio_release_kmap(struct folio *folio, void *addr)
{
        /* Drop the mapping before giving up the reference. */
        kunmap_local(addr);

        folio_put(folio);
}

/*
 * unmap_and_put_page - page-based wrapper around folio_release_kmap().
 * @page: page whose folio is to be released.
 * @addr: mapped address handed on to folio_release_kmap().
 *
 * Looks up the folio with page_folio() and lets folio_release_kmap() unmap
 * @addr and drop the reference.
 */
static inline void unmap_and_put_page(struct page *page, void *addr)
{
        struct folio *folio = page_folio(page);

        folio_release_kmap(folio, addr);
}

#endif /* _LINUX_HIGHMEM_H */





































































































    6 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/* SPDX-License-Identifier: GPL-2.0 */
/*
  File: linux/xattr.h

  Extended attributes handling.

  Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
  Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
  Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
*/
#ifndef _LINUX_XATTR_H
#define _LINUX_XATTR_H


#include <linux/slab.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/user_namespace.h>
#include <uapi/linux/xattr.h>

struct inode;
struct dentry;

/*
 * is_posix_acl_xattr - test whether @name is one of the POSIX ACL xattrs.
 * @name: NUL-terminated attribute name to test.
 *
 * Return: true if @name equals XATTR_NAME_POSIX_ACL_ACCESS or
 * XATTR_NAME_POSIX_ACL_DEFAULT, false otherwise.
 */
static inline bool is_posix_acl_xattr(const char *name)
{
        if (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS))
                return true;

        return !strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT);
}

/*
 * struct xattr_handler: When @name is set, match attributes with exactly that
 * name.  When @prefix is set instead, match attributes with that prefix and
 * with a non-empty suffix.
 *
 * @list may be NULL: xattr_handler_can_list() treats a missing callback as
 * "can be listed" and otherwise uses the callback's return value.
 * NOTE(review): the exact contracts of @get and @set (return values, meaning
 * of @flags in @set) are not visible in this header - confirm at the callers.
 */
struct xattr_handler {
        const char *name;       /* exact attribute name to match, if set */
        const char *prefix;     /* attribute name prefix to match, if set */
        int flags;      /* fs private flags */
        /* Optional; consulted by xattr_handler_can_list(). */
        bool (*list)(struct dentry *dentry);
        /* Callback taking a destination @buffer of @size bytes. */
        int (*get)(const struct xattr_handler *, struct dentry *dentry,
                   struct inode *inode, const char *name, void *buffer,
                   size_t size);
        /* Callback taking a source @buffer of @size bytes plus @flags. */
        int (*set)(const struct xattr_handler *,
                   struct mnt_idmap *idmap, struct dentry *dentry,
                   struct inode *inode, const char *name, const void *buffer,
                   size_t size, int flags);
};

/**
 * xattr_handler_can_list - check whether xattr can be listed
 * @handler: handler for this type of xattr
 * @dentry: dentry whose inode xattr to list
 *
 * Determine whether the xattr associated with @dentry can be listed given
 * @handler.  A NULL @handler can never be listed; a handler without a
 * ->list callback can always be listed; otherwise the callback decides.
 *
 * Return: true if xattr can be listed, false if not.
 */
static inline bool xattr_handler_can_list(const struct xattr_handler *handler,
                                          struct dentry *dentry)
{
        if (!handler)
                return false;

        if (!handler->list)
                return true;

        return handler->list(dentry);
}

/*
 * Prototype only; the definition is not part of this header.
 * NOTE(review): presumably returns the complete attribute name for a
 * handler/name pair - confirm at the definition.
 */
const char *xattr_full_name(const struct xattr_handler *, const char *);

/*
 * struct xattr - a name / value / length triple describing one attribute.
 * NOTE(review): ownership and lifetime of @name and @value are not visible
 * in this header - confirm at the users.
 */
struct xattr {
        const char *name;       /* attribute name */
        void *value;            /* attribute value */
        size_t value_len;       /* presumably the size of @value in bytes */
};

/*
 * Get / list / set / remove entry points for extended attributes.
 * Prototypes only; the definitions are not part of this header.
 * NOTE(review): the differences between the plain, "__"-prefixed, "_noperm"
 * and "_locked" variants cannot be derived from the prototypes - confirm at
 * the definitions before relying on any of them.
 */
ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t);
ssize_t vfs_getxattr(struct mnt_idmap *, struct dentry *, const char *,
                     void *, size_t);
ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size);
int __vfs_setxattr(struct mnt_idmap *, struct dentry *, struct inode *,
                   const char *, const void *, size_t, int);
int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *,
                          const char *, const void *, size_t, int);
int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *,
                          const char *, const void *, size_t, int,
                          struct inode **);
int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *,
                 const void *, size_t, int);
int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *);
int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *,
                             const char *, struct inode **);
int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *);

/* Listing helper and allocating getter; prototypes only. */
ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
int vfs_getxattr_alloc(struct mnt_idmap *idmap,
                       struct dentry *dentry, const char *name,
                       char **xattr_value, size_t size, gfp_t flags);

/* Prototype only; NOTE(review): return convention not visible here. */
int xattr_supports_user_prefix(struct inode *inode);

/*
 * xattr_prefix - string a handler matches on.
 * @handler: handler to inspect; must not be NULL, it is dereferenced
 *           unconditionally.
 *
 * Return: @handler->prefix when it is non-NULL, otherwise @handler->name.
 */
static inline const char *xattr_prefix(const struct xattr_handler *handler)
{
        if (handler->prefix)
                return handler->prefix;

        return handler->name;
}

/*
 * struct simple_xattrs - container pairing an rbtree root with an rwlock.
 * NOTE(review): presumably the tree holds struct simple_xattr entries and
 * @lock protects it - confirm against the simple_xattr_*() implementations.
 */
struct simple_xattrs {
        struct rb_root rb_root; /* root of the rbtree */
        rwlock_t lock;          /* reader/writer lock stored alongside it */
};

/*
 * struct simple_xattr - one attribute with its value stored inline.
 * The value lives in the flexible array member at the end of the structure.
 * NOTE(review): @size is presumably the number of bytes in @value, and
 * @rb_node presumably links into struct simple_xattrs - confirm against the
 * simple_xattr_*() implementations.
 */
struct simple_xattr {
        struct rb_node rb_node; /* rbtree linkage */
        char *name;             /* attribute name */
        size_t size;            /* presumably the length of @value */
        char value[];           /* flexible array member holding the value */
};

/*
 * Helpers operating on struct simple_xattrs / struct simple_xattr.
 * Prototypes only; the definitions are not part of this header.
 * NOTE(review): return conventions and locking rules are not visible here -
 * confirm at the definitions.
 */
void simple_xattrs_init(struct simple_xattrs *xattrs);
void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space);
size_t simple_xattr_space(const char *name, size_t size);
struct simple_xattr *simple_xattr_alloc(const void *value, size_t size);
void simple_xattr_free(struct simple_xattr *xattr);
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
                     void *buffer, size_t size);
struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs,
                                      const char *name, const void *value,
                                      size_t size, int flags);
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
                          char *buffer, size_t size);
void simple_xattr_add(struct simple_xattrs *xattrs,
                      struct simple_xattr *new_xattr);
int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name);

#endif        /* _LINUX_XATTR_H */





































































    3 






    2 








































    1 







    3 








    2 





























    1 

















    2 



    2 
    2 
    2 







































































    3 









    3 




    3 



































































































































































































































































































































    2 








    2 





























































































































































































































































































































































































































































































































































    1 




















    1 





































    1 



































































































































































































    1 
    1 
















    3 


    3 








































































































































































































































































    2 
















    2 
    1 
    1 




    2 














    2 







    2 


    2 





    1 














    2 



    2 















    1 













    1 
    1 



















    1 




    1 







    3 







































    1 





    1 



















































    3 










    1 




    2 












































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 Fujitsu.  All rights reserved.
 * Written by Miao Xie <miaox@cn.fujitsu.com>
 */

#include <linux/slab.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
#include "transaction.h"
#include "qgroup.h"
#include "locking.h"
#include "inode-item.h"
#include "space-info.h"
#include "accessors.h"
#include "file-item.h"

/*
 * Tunables for the delayed-item machinery.
 * NOTE(review): their users are outside this part of the file; judging by
 * the names these are item-count thresholds for throttled writeback and
 * background flushing, and a batch size - confirm at the use sites.
 */
#define BTRFS_DELAYED_WRITEBACK                512
#define BTRFS_DELAYED_BACKGROUND        128
#define BTRFS_DELAYED_BATCH                16

/*
 * Cache pointer set up by btrfs_delayed_inode_init() and destroyed by
 * btrfs_delayed_inode_exit().
 */
static struct kmem_cache *delayed_node_cache;

/*
 * Create the slab cache used for struct btrfs_delayed_node objects.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
int __init btrfs_delayed_inode_init(void)
{
        delayed_node_cache = KMEM_CACHE(btrfs_delayed_node, 0);

        return delayed_node_cache ? 0 : -ENOMEM;
}

/* Destroy the delayed node slab cache created by btrfs_delayed_inode_init(). */
void __cold btrfs_delayed_inode_exit(void)
{
        kmem_cache_destroy(delayed_node_cache);
}

/*
 * Put a delayed root into its initial, empty state: no queued nodes, zeroed
 * item counters, and an initialized lock and wait queue.
 */
void btrfs_init_delayed_root(struct btrfs_delayed_root *delayed_root)
{
        /* No delayed nodes are queued yet. */
        INIT_LIST_HEAD(&delayed_root->node_list);
        INIT_LIST_HEAD(&delayed_root->prepare_list);
        delayed_root->nodes = 0;

        /* No delayed items are pending and none have been finished so far. */
        atomic_set(&delayed_root->items, 0);
        atomic_set(&delayed_root->items_seq, 0);

        spin_lock_init(&delayed_root->lock);
        init_waitqueue_head(&delayed_root->wait);
}

/*
 * Set up a delayed node for inode @inode_id of @root.
 *
 * The reference count is left at zero here; btrfs_get_or_create_delayed_node()
 * sets the real initial count right after calling this helper.
 */
static inline void btrfs_init_delayed_node(
                                struct btrfs_delayed_node *delayed_node,
                                struct btrfs_root *root, u64 inode_id)
{
        /* Identity of the node. */
        delayed_node->inode_id = inode_id;
        delayed_node->root = root;

        /* Empty trees of pending insertions and deletions. */
        delayed_node->ins_root = RB_ROOT_CACHED;
        delayed_node->del_root = RB_ROOT_CACHED;

        /* Not linked into any of the delayed root's lists. */
        INIT_LIST_HEAD(&delayed_node->n_list);
        INIT_LIST_HEAD(&delayed_node->p_list);

        mutex_init(&delayed_node->mutex);
        refcount_set(&delayed_node->refs, 0);
}

/*
 * Get a referenced pointer to the delayed node of @btrfs_inode, if one exists.
 *
 * Returns the node with its reference count raised for the caller, or NULL if
 * the inode has no delayed node (or the node found in the xarray is already
 * on its way to being freed).
 */
static struct btrfs_delayed_node *btrfs_get_delayed_node(
                struct btrfs_inode *btrfs_inode)
{
        struct btrfs_root *root = btrfs_inode->root;
        u64 ino = btrfs_ino(btrfs_inode);
        struct btrfs_delayed_node *node;

        /* Fast path: the node is already cached in the inode. */
        node = READ_ONCE(btrfs_inode->delayed_node);
        if (node) {
                refcount_inc(&node->refs);
                return node;
        }

        spin_lock(&root->inode_lock);
        node = xa_load(&root->delayed_nodes, ino);
        if (!node)
                goto out_unlock;

        if (btrfs_inode->delayed_node) {
                /* Somebody cached it in the inode meanwhile, it can be accessed. */
                refcount_inc(&node->refs);
                BUG_ON(btrfs_inode->delayed_node != node);
                goto out_unlock;
        }

        /*
         * We may be racing with the release function, which is in the middle
         * of removing this node from the xarray.  In that case the refcount
         * already dropped to zero and must never go back up, so behave as if
         * the node was never in the xarray and return NULL.
         *
         * A plain refcount_inc() is not usable for this check: some
         * implementations refuse to bump a refcount that has hit zero and
         * merely WARN_ONCE() instead.
         *
         * If the node is properly in the xarray, it needs two references:
         * one for caching it in the inode and one for this get operation.
         */
        if (!refcount_inc_not_zero(&node->refs)) {
                node = NULL;
                goto out_unlock;
        }
        refcount_inc(&node->refs);
        btrfs_inode->delayed_node = node;

out_unlock:
        spin_unlock(&root->inode_lock);
        return node;
}

/*
 * Return the delayed node of @btrfs_inode, allocating and publishing a new one
 * if the inode has none yet.
 *
 * Returns the referenced node, or ERR_PTR(-ENOMEM) if an allocation fails.
 */
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
                struct btrfs_inode *btrfs_inode)
{
        struct btrfs_root *root = btrfs_inode->root;
        u64 ino = btrfs_ino(btrfs_inode);

        for (;;) {
                struct btrfs_delayed_node *node;
                void *old;

                node = btrfs_get_delayed_node(btrfs_inode);
                if (node)
                        return node;

                node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
                if (!node)
                        return ERR_PTR(-ENOMEM);
                btrfs_init_delayed_node(node, root, ino);

                /* One reference for the inode's cache, one for our caller. */
                refcount_set(&node->refs, 2);

                /*
                 * Allocate and reserve the xarray slot up front.  From now on
                 * xa_load() can return NULL for this index.
                 */
                if (xa_reserve(&root->delayed_nodes, ino, GFP_NOFS) == -ENOMEM) {
                        kmem_cache_free(delayed_node_cache, node);
                        return ERR_PTR(-ENOMEM);
                }

                spin_lock(&root->inode_lock);
                if (xa_load(&root->delayed_nodes, ino)) {
                        /* Somebody else inserted a node, retry the lookup. */
                        spin_unlock(&root->inode_lock);
                        kmem_cache_free(delayed_node_cache, node);
                        continue;
                }

                old = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
                ASSERT(xa_err(old) != -EINVAL);
                ASSERT(xa_err(old) != -ENOMEM);
                ASSERT(old == NULL);
                btrfs_inode->delayed_node = node;
                spin_unlock(&root->inode_lock);

                return node;
        }
}

/*
 * Make sure @node is linked into the lists of the delayed root.
 *
 * Must be called with delayed_node->mutex held.
 *
 * A node that is not queued yet is added to both the node list and the
 * prepare list, and the lists take a reference on it.  For a node that is
 * already queued only its position on the prepare list is touched: it is moved
 * to the tail if it is on that list, or added to it if @mod is non-zero.
 */
static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
                                     struct btrfs_delayed_node *node,
                                     int mod)
{
        spin_lock(&root->lock);
        if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
                list_add_tail(&node->n_list, &root->node_list);
                list_add_tail(&node->p_list, &root->prepare_list);
                /* Reference held on behalf of the lists. */
                refcount_inc(&node->refs);
                root->nodes++;
                set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
        } else if (!list_empty(&node->p_list)) {
                list_move_tail(&node->p_list, &root->prepare_list);
        } else if (mod) {
                list_add_tail(&node->p_list, &root->prepare_list);
        }
        spin_unlock(&root->lock);
}

/*
 * Unlink @node from the lists of the delayed root, if it is queued, and drop
 * the reference the lists held on it.
 *
 * Must be called with delayed_node->mutex held.
 */
static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
                                       struct btrfs_delayed_node *node)
{
        spin_lock(&root->lock);
        if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags))
                goto unlock;

        root->nodes--;
        /* The lists no longer reference the node. */
        refcount_dec(&node->refs);
        list_del_init(&node->n_list);
        if (!list_empty(&node->p_list))
                list_del_init(&node->p_list);
        clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
unlock:
        spin_unlock(&root->lock);
}

/*
 * Return the first node on the delayed root's node list with an extra
 * reference taken for the caller, or NULL if the list is empty.
 */
static struct btrfs_delayed_node *btrfs_first_delayed_node(
                        struct btrfs_delayed_root *delayed_root)
{
        struct btrfs_delayed_node *first = NULL;

        spin_lock(&delayed_root->lock);
        if (!list_empty(&delayed_root->node_list)) {
                first = list_entry(delayed_root->node_list.next,
                                   struct btrfs_delayed_node, n_list);
                refcount_inc(&first->refs);
        }
        spin_unlock(&delayed_root->lock);

        return first;
}

/*
 * Return the node that follows @node on the delayed root's node list, with an
 * extra reference taken for the caller.
 *
 * If @node is no longer queued, iteration restarts from the head of the list.
 * Returns NULL when @node is the last entry or the list is empty.
 */
static struct btrfs_delayed_node *btrfs_next_delayed_node(
                                                struct btrfs_delayed_node *node)
{
        struct btrfs_delayed_root *delayed_root =
                node->root->fs_info->delayed_root;
        struct list_head *pos = NULL;
        struct btrfs_delayed_node *next = NULL;

        spin_lock(&delayed_root->lock);
        if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
                if (!list_is_last(&node->n_list, &delayed_root->node_list))
                        pos = node->n_list.next;
        } else if (!list_empty(&delayed_root->node_list)) {
                /* Not in the list, start over from the first entry. */
                pos = delayed_root->node_list.next;
        }

        if (pos) {
                next = list_entry(pos, struct btrfs_delayed_node, n_list);
                refcount_inc(&next->refs);
        }
        spin_unlock(&delayed_root->lock);

        return next;
}

/*
 * Drop one reference on @delayed_node (a NULL node is ignored).
 *
 * Before dropping the reference the node is queued on the delayed root if it
 * still has pending items, or dequeued if it has none; @mod is passed on to
 * btrfs_queue_delayed_node().  When the last reference goes away the node is
 * erased from its root's xarray and freed.
 */
static void __btrfs_release_delayed_node(
                                struct btrfs_delayed_node *delayed_node,
                                int mod)
{
        struct btrfs_delayed_root *delayed_root;
        struct btrfs_root *root;

        if (!delayed_node)
                return;

        delayed_root = delayed_node->root->fs_info->delayed_root;

        mutex_lock(&delayed_node->mutex);
        if (!delayed_node->count)
                btrfs_dequeue_delayed_node(delayed_root, delayed_node);
        else
                btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
        mutex_unlock(&delayed_node->mutex);

        if (!refcount_dec_and_test(&delayed_node->refs))
                return;

        root = delayed_node->root;

        spin_lock(&root->inode_lock);
        /*
         * A refcount that reached zero must never be raised again, so the
         * node can safely be removed now.
         */
        ASSERT(refcount_read(&delayed_node->refs) == 0);
        xa_erase(&root->delayed_nodes, delayed_node->inode_id);
        spin_unlock(&root->inode_lock);
        kmem_cache_free(delayed_node_cache, delayed_node);
}

/*
 * Drop a reference on @node via __btrfs_release_delayed_node() with mod == 0,
 * i.e. without asking for the node to be added to the prepare list.
 */
static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
{
        __btrfs_release_delayed_node(node, 0);
}

/*
 * Take the first node off the delayed root's prepare list and return it with
 * an extra reference for the caller, or return NULL if the list is empty.
 */
static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
                                        struct btrfs_delayed_root *delayed_root)
{
        struct btrfs_delayed_node *prepared = NULL;

        spin_lock(&delayed_root->lock);
        if (!list_empty(&delayed_root->prepare_list)) {
                struct list_head *head = delayed_root->prepare_list.next;

                list_del_init(head);
                prepared = list_entry(head, struct btrfs_delayed_node, p_list);
                refcount_inc(&prepared->refs);
        }
        spin_unlock(&delayed_root->lock);

        return prepared;
}

/*
 * Drop a reference on @node via __btrfs_release_delayed_node() with mod == 1,
 * so that a node which still has pending items is put on the prepare list.
 */
static inline void btrfs_release_prepared_delayed_node(
                                        struct btrfs_delayed_node *node)
{
        __btrfs_release_delayed_node(node, 1);
}

/*
 * Allocate a delayed item of the given @type with room for @data_len bytes of
 * trailing data, attached to @node.
 *
 * The new item holds one reference, has no bytes reserved, and is not linked
 * into any rbtree or log list.  Returns NULL if the allocation fails.
 */
static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
                                           struct btrfs_delayed_node *node,
                                           enum btrfs_delayed_item_type type)
{
        struct btrfs_delayed_item *item;

        item = kmalloc(struct_size(item, data, data_len), GFP_NOFS);
        if (!item)
                return NULL;

        refcount_set(&item->refs, 1);
        item->delayed_node = node;
        item->type = type;
        item->data_len = data_len;
        item->bytes_reserved = 0;
        item->logged = false;
        RB_CLEAR_NODE(&item->rb_node);
        INIT_LIST_HEAD(&item->log_list);

        return item;
}

/*
 * Look up a delayed item by its dir index.
 *
 * @root:  the rbtree of delayed items to search
 * @index: the dir index value to look up (offset of a dir index key)
 *
 * Returns the item whose index is exactly @index, or NULL if the tree holds
 * no such item.
 */
static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
                                struct rb_root *root,
                                u64 index)
{
        struct rb_node *cur = root->rb_node;

        while (cur) {
                struct btrfs_delayed_item *entry;

                entry = rb_entry(cur, struct btrfs_delayed_item, rb_node);
                if (entry->index == index)
                        return entry;

                /* Smaller indexes live to the left, larger ones to the right. */
                if (entry->index < index)
                        cur = cur->rb_right;
                else
                        cur = cur->rb_left;
        }

        return NULL;
}

/*
 * Link @ins into the insertion or deletion rbtree of @delayed_node, depending
 * on the item's type, keyed by the item's index.
 *
 * On success the node's item count and the delayed root's pending item counter
 * are incremented; for insertion items the node's index_cnt is also raised so
 * that it stays above the inserted index.
 *
 * Returns 0 on success or -EEXIST if an item with the same index is already in
 * the tree.
 */
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
                                    struct btrfs_delayed_item *ins)
{
        const bool is_insertion = (ins->type == BTRFS_DELAYED_INSERTION_ITEM);
        struct rb_root_cached *tree;
        struct rb_node **link;
        struct rb_node *parent = NULL;
        bool leftmost = true;

        tree = is_insertion ? &delayed_node->ins_root : &delayed_node->del_root;
        link = &tree->rb_root.rb_node;

        while (*link) {
                struct btrfs_delayed_item *entry;

                parent = *link;
                entry = rb_entry(parent, struct btrfs_delayed_item, rb_node);

                if (entry->index == ins->index)
                        return -EEXIST;

                if (entry->index < ins->index) {
                        link = &parent->rb_right;
                        /* We went right at least once, so not the leftmost. */
                        leftmost = false;
                } else {
                        link = &parent->rb_left;
                }
        }

        rb_link_node(&ins->rb_node, parent, link);
        rb_insert_color_cached(&ins->rb_node, tree, leftmost);

        if (is_insertion && ins->index >= delayed_node->index_cnt)
                delayed_node->index_cnt = ins->index + 1;

        delayed_node->count++;
        atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
        return 0;
}

/*
 * Account for one completed delayed item: bump the completion sequence, drop
 * the pending item counter, and wake up waiters when the number of pending
 * items fell below BTRFS_DELAYED_BACKGROUND or the sequence number is a
 * multiple of BTRFS_DELAYED_BATCH.
 */
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
        const int seq = atomic_inc_return(&delayed_root->items_seq);
        /* atomic_dec_return implies a barrier */
        const int pending = atomic_dec_return(&delayed_root->items);

        if (pending < BTRFS_DELAYED_BACKGROUND ||
            seq % BTRFS_DELAYED_BATCH == 0)
                cond_wake_up_nomb(&delayed_root->wait);
}

/*
 * Unlink @delayed_item from the rbtree of its delayed node, if it is linked,
 * and account for it as a finished item.
 *
 * The delayed node's mutex must be held whenever the item is in a tree.
 */
static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
{
        struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node;
        struct rb_root_cached *tree;

        /* An item that was never inserted has nothing to undo. */
        if (RB_EMPTY_NODE(&delayed_item->rb_node))
                return;

        /* Items sitting in an rbtree are protected by the node's mutex. */
        lockdep_assert_held(&delayed_node->mutex);

        tree = (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) ?
                &delayed_node->ins_root : &delayed_node->del_root;

        rb_erase_cached(&delayed_item->rb_node, tree);
        RB_CLEAR_NODE(&delayed_item->rb_node);
        delayed_node->count--;

        finish_one_item(delayed_node->root->fs_info->delayed_root);
}

/*
 * Remove @item from its delayed node's rbtree (if linked) and drop one
 * reference on it, freeing the item when that was the last reference.
 * A NULL @item is ignored.
 */
static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
{
        if (!item)
                return;

        __btrfs_remove_delayed_item(item);
        if (refcount_dec_and_test(&item->refs))
                kfree(item);
}

/*
 * Return the leftmost (lowest index) item of @delayed_node's insertion tree,
 * or NULL if the tree is empty.
 */
static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
                                        struct btrfs_delayed_node *delayed_node)
{
        struct rb_node *first = rb_first_cached(&delayed_node->ins_root);

        if (!first)
                return NULL;

        return rb_entry(first, struct btrfs_delayed_item, rb_node);
}

/*
 * Return the leftmost (lowest index) item of @delayed_node's deletion tree,
 * or NULL if the tree is empty.
 */
static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
                                        struct btrfs_delayed_node *delayed_node)
{
        struct rb_node *first = rb_first_cached(&delayed_node->del_root);

        if (!first)
                return NULL;

        return rb_entry(first, struct btrfs_delayed_item, rb_node);
}

/*
 * Return the in-order successor of @item in the rbtree it is linked into, or
 * NULL if @item is the last one.
 */
static struct btrfs_delayed_item *__btrfs_next_delayed_item(
                                                struct btrfs_delayed_item *item)
{
        struct rb_node *succ = rb_next(&item->rb_node);

        if (!succ)
                return NULL;

        return rb_entry(succ, struct btrfs_delayed_item, rb_node);
}

/*
 * Move the metadata space needed for one delayed item from the transaction's
 * block reserve to the delayed block reserve.
 *
 * Nothing is done (and 0 is returned) when the transaction has no bytes
 * reserved.  Otherwise returns the result of btrfs_block_rsv_migrate().
 */
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
                                               struct btrfs_delayed_item *item)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        u64 num_bytes;
        int ret;

        if (!trans->bytes_reserved)
                return 0;

        num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);

        /*
         * The space was already reserved when the transaction was started, so
         * it only has to be migrated from the transaction's reserve.  For the
         * same reason no qgroup space needs to be reserved here.
         */
        ret = btrfs_block_rsv_migrate(trans->block_rsv,
                                      &fs_info->delayed_block_rsv,
                                      num_bytes, true);
        if (ret)
                return ret;

        trace_btrfs_space_reservation(fs_info, "delayed_item",
                                      item->delayed_node->inode_id,
                                      num_bytes, 1);
        /*
         * Only deletion items record their reservation in the item itself.
         * For insertions the reserved metadata space is tracked through the
         * number of leaves that will be used, based on the delayed node's
         * curr_index_batch_size and index_item_leaves fields.
         */
        if (item->type == BTRFS_DELAYED_DELETION_ITEM)
                item->bytes_reserved = num_bytes;

        return 0;
}

/*
 * Give the bytes recorded in @item->bytes_reserved back to the delayed block
 * reserve of @root's filesystem.  Does nothing if the item has no reservation.
 */
static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
                                                struct btrfs_delayed_item *item)
{
        struct btrfs_fs_info *fs_info = root->fs_info;

        if (item->bytes_reserved) {
                /*
                 * See btrfs_delayed_item_reserve_metadata() for why no qgroup
                 * space has to be released/reserved here.
                 */
                trace_btrfs_space_reservation(fs_info, "delayed_item",
                                              item->delayed_node->inode_id,
                                              item->bytes_reserved, 0);
                btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
                                        item->bytes_reserved, NULL);
        }
}

/*
 * Release from the delayed block reserve the metadata space that corresponds
 * to @num_leaves leaves of delayed dir index items of @node.
 */
static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node,
                                              unsigned int num_leaves)
{
        struct btrfs_fs_info *fs_info = node->root->fs_info;
        const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves);

        /* Log replay makes no space reservations, so nothing to give back. */
        if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
                trace_btrfs_space_reservation(fs_info, "delayed_item",
                                              node->inode_id, bytes, 0);
                btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
                                        bytes, NULL);
        }
}

static int btrfs_delayed_inode_reserve_metadata(
                                        struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root,
                                        struct btrfs_delayed_node *node)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *src_rsv;
        struct btrfs_block_rsv *dst_rsv;
        u64 num_bytes;
        int ret;

        src_rsv = trans->block_rsv;
        dst_rsv = &fs_info->delayed_block_rsv;

        num_bytes = btrfs_calc_metadata_size(fs_info, 1);

        /*
         * btrfs_dirty_inode will update the inode under btrfs_join_transaction
         * which doesn't reserve space for speed.  This is a problem since we
         * still need to reserve space for this update, so try to reserve the
         * space.
         *
         * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
         * we always reserve enough to update the inode item.
         */
        if (!src_rsv || (!trans->bytes_reserved &&
                         src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
                ret = btrfs_qgroup_reserve_meta(root, num_bytes,
                                          BTRFS_QGROUP_RSV_META_PREALLOC, true);
                if (ret < 0)
                        return ret;
                ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,
                                          BTRFS_RESERVE_NO_FLUSH);
                /* NO_FLUSH could only fail with -ENOSPC */
                ASSERT(ret == 0 || ret == -ENOSPC);
                if (ret)
                        btrfs_qgroup_free_meta_prealloc(root, num_bytes);
        } else {
                ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
        }

        if (!ret) {
                trace_btrfs_space_reservation(fs_info, "delayed_inode",
                                              node->inode_id, num_bytes, 1);
                node->bytes_reserved = num_bytes;
        }

        return ret;
}

/*
 * Release the inode update reservation recorded in node->bytes_reserved from
 * the delayed block reserve and reset it to zero.
 *
 * The matching qgroup metadata reservation is freed when @qgroup_free is true
 * and converted otherwise.  Does nothing if the node has no reservation.
 */
static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
                                                struct btrfs_delayed_node *node,
                                                bool qgroup_free)
{
        if (!node->bytes_reserved)
                return;

        trace_btrfs_space_reservation(fs_info, "delayed_inode",
                                      node->inode_id, node->bytes_reserved, 0);
        btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
                                node->bytes_reserved, NULL);

        if (!qgroup_free)
                btrfs_qgroup_convert_reserved_meta(node->root,
                                node->bytes_reserved);
        else
                btrfs_qgroup_free_meta_prealloc(node->root,
                                node->bytes_reserved);

        node->bytes_reserved = 0;
}

/*
 * Insert a single delayed item or a batch of delayed items, as many as possible
 * that fit in a leaf. The delayed items (dir index keys) are sorted by their key
 * in the rbtree, and if there's a gap between two consecutive dir index items,
 * then it means at some point we had delayed dir indexes to add but they got
 * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them
 * into the subvolume tree. Dir index keys also have their offsets coming from a
 * monotonically increasing counter, so we can't get new keys with an offset that
 * fits within a gap between delayed dir index items.
 *
 * @trans:      transaction handle, passed on to btrfs_insert_empty_items()
 * @root:       tree root the items are inserted into
 * @path:       path used for the insertion, released again before returning
 *              on success
 * @first_item: first delayed item of the batch; the following items in the
 *              same rbtree are appended to the batch while they fit
 *
 * Must be called with the mutex of the item's delayed node held.
 *
 * Returns 0 on success, in which case every item of the batch has been
 * released.  Returns -ENOMEM if the key/size arrays can not be allocated, or
 * the error from btrfs_insert_empty_items(); in both failure cases no delayed
 * item is released.
 */
static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct btrfs_path *path,
                                     struct btrfs_delayed_item *first_item)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_delayed_node *node = first_item->delayed_node;
        LIST_HEAD(item_list);
        struct btrfs_delayed_item *curr;
        struct btrfs_delayed_item *next;
        const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
        struct btrfs_item_batch batch;
        struct btrfs_key first_key;
        const u32 first_data_size = first_item->data_len;
        int total_size;
        char *ins_data = NULL;
        int ret;
        bool continuous_keys_only = false;

        lockdep_assert_held(&node->mutex);

        /*
         * During normal operation the delayed index offset is continuously
         * increasing, so we can batch insert all items as there will not be any
         * overlapping keys in the tree.
         *
         * The exception to this is log replay, where we may have interleaved
         * offsets in the tree, so our batch needs to be continuous keys only in
         * order to ensure we do not end up with out of order items in our leaf.
         */
        if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
                continuous_keys_only = true;

        /*
         * For delayed items to insert, we track reserved metadata bytes based
         * on the number of leaves that we will use.
         * See btrfs_insert_delayed_dir_index() and
         * btrfs_delayed_item_reserve_metadata().
         */
        ASSERT(first_item->bytes_reserved == 0);

        /* Start the batch with the first item; sizes include the item header. */
        list_add_tail(&first_item->tree_list, &item_list);
        batch.total_data_size = first_data_size;
        batch.nr = 1;
        total_size = first_data_size + sizeof(struct btrfs_item);
        curr = first_item;

        /*
         * Extend the batch with the following items of the rbtree. When the
         * loop ends, 'next' is NULL if the rbtree has no further item, and
         * points to the first item left out of the batch otherwise.
         */
        while (true) {
                int next_size;

                next = __btrfs_next_delayed_item(curr);
                if (!next)
                        break;

                /*
                 * We cannot allow gaps in the key space if we're doing log
                 * replay.
                 */
                if (continuous_keys_only && (next->index != curr->index + 1))
                        break;

                ASSERT(next->bytes_reserved == 0);

                /* Stop once the batch would no longer fit in a single leaf. */
                next_size = next->data_len + sizeof(struct btrfs_item);
                if (total_size + next_size > max_size)
                        break;

                list_add_tail(&next->tree_list, &item_list);
                batch.nr++;
                total_size += next_size;
                batch.total_data_size += next->data_len;
                curr = next;
        }

        if (batch.nr == 1) {
                /* Single item: use the on-stack key and size, no allocation. */
                first_key.objectid = node->inode_id;
                first_key.type = BTRFS_DIR_INDEX_KEY;
                first_key.offset = first_item->index;
                batch.keys = &first_key;
                batch.data_sizes = &first_data_size;
        } else {
                struct btrfs_key *ins_keys;
                u32 *ins_sizes;
                int i = 0;

                /*
                 * One allocation holds both arrays: batch.nr data sizes
                 * followed by batch.nr keys.
                 */
                ins_data = kmalloc(batch.nr * sizeof(u32) +
                                   batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
                if (!ins_data) {
                        ret = -ENOMEM;
                        goto out;
                }
                ins_sizes = (u32 *)ins_data;
                ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
                batch.keys = ins_keys;
                batch.data_sizes = ins_sizes;
                list_for_each_entry(curr, &item_list, tree_list) {
                        ins_keys[i].objectid = node->inode_id;
                        ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
                        ins_keys[i].offset = curr->index;
                        ins_sizes[i] = curr->data_len;
                        i++;
                }
        }

        ret = btrfs_insert_empty_items(trans, root, path, &batch);
        if (ret)
                goto out;

        /* Copy each item's data into its slot, advancing one slot per item. */
        list_for_each_entry(curr, &item_list, tree_list) {
                char *data_ptr;

                data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
                write_extent_buffer(path->nodes[0], &curr->data,
                                    (unsigned long)data_ptr, curr->data_len);
                path->slots[0]++;
        }

        /*
         * Now release our path before releasing the delayed items and their
         * metadata reservations, so that we don't block other tasks for more
         * time than needed.
         */
        btrfs_release_path(path);

        ASSERT(node->index_item_leaves > 0);

        /*
         * For normal operations we will batch an entire leaf's worth of delayed
         * items, so if there are more items to process we can decrement
         * index_item_leaves by 1 as we inserted 1 leaf's worth of items.
         *
         * However for log replay we may not have inserted an entire leaf's
         * worth of items, we may have not had continuous items, so decrementing
         * here would mess up the index_item_leaves accounting.  For this case
         * only clean up the accounting when there are no items left.
         */
        if (next && !continuous_keys_only) {
                /*
                 * We inserted one batch of items into a leaf and there are more
                 * items to flush in a future batch, now release one unit of
                 * metadata space from the delayed block reserve, corresponding
                 * to the leaf we just flushed to.
                 */
                btrfs_delayed_item_release_leaves(node, 1);
                node->index_item_leaves--;
        } else if (!next) {
                /*
                 * There are no more items to insert. We can have a number of
                 * reserved leaves > 1 here - this happens when many dir index
                 * items are added and then removed before they are flushed (file
                 * names with a very short life, never span a transaction). So
                 * release all remaining leaves.
                 */
                btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
                node->index_item_leaves = 0;
        }

        /* The batch made it into the tree, drop the delayed items. */
        list_for_each_entry_safe(curr, next, &item_list, tree_list) {
                list_del(&curr->tree_list);
                btrfs_release_delayed_item(curr);
        }
out:
        /* ins_data is NULL for a single-item batch; kfree() accepts that. */
        kfree(ins_data);
        return ret;
}

/*
 * Flush all pending insertion items of @node into @root, one batch at a time.
 *
 * The node's mutex is taken and dropped around every batch.  Returns 0 once
 * the insertion tree is empty, or the first error returned by
 * btrfs_insert_delayed_item().
 */
static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
                                      struct btrfs_path *path,
                                      struct btrfs_root *root,
                                      struct btrfs_delayed_node *node)
{
        for (;;) {
                struct btrfs_delayed_item *first;
                int ret;

                mutex_lock(&node->mutex);
                first = __btrfs_first_delayed_insertion_item(node);
                if (!first) {
                        /* Nothing left to insert. */
                        mutex_unlock(&node->mutex);
                        return 0;
                }
                ret = btrfs_insert_delayed_item(trans, root, path, first);
                mutex_unlock(&node->mutex);

                if (ret)
                        return ret;
        }
}

/*
 * Delete from the leaf that @path points to the dir index item matching
 * @item, plus as many directly following leaf items as match the delayed
 * items that follow @item in its rbtree.
 *
 * On success the space reserved by the batched delayed items is returned to
 * the delayed block reserve and the delayed items are released.
 *
 * Returns 0 on success, -ENOENT if @path does not point to an existing item,
 * or the error returned by btrfs_del_items().
 */
static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    struct btrfs_path *path,
                                    struct btrfs_delayed_item *item)
{
        const u64 ino = item->delayed_node->inode_id;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_delayed_item *curr = item;
        struct btrfs_delayed_item *next;
        LIST_HEAD(batch_list);
        u64 total_reserved_size = item->bytes_reserved;
        int nitems = 1;
        int last_slot;
        int slot;
        int ret;

        ASSERT(leaf != NULL);

        slot = path->slots[0];
        last_slot = btrfs_header_nritems(leaf) - 1;
        /*
         * The caller always hands us a path that points to an existing item,
         * so this can not happen.
         */
        ASSERT(slot <= last_slot);
        if (WARN_ON(slot > last_slot))
                return -ENOENT;

        list_add_tail(&item->tree_list, &batch_list);

        /*
         * Walk the leaf and the delayed items in lockstep: as long as the next
         * delayed item matches the next item in the leaf, it joins the batch
         * of items to delete from the leaf.
         */
        for (slot++; slot <= last_slot; slot++) {
                struct btrfs_key key;

                next = __btrfs_next_delayed_item(curr);
                if (!next)
                        break;

                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.objectid != ino ||
                    key.type != BTRFS_DIR_INDEX_KEY ||
                    key.offset != next->index)
                        break;

                list_add_tail(&next->tree_list, &batch_list);
                total_reserved_size += next->bytes_reserved;
                nitems++;
                curr = next;
        }

        ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
        if (ret)
                return ret;

        /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */
        if (total_reserved_size > 0) {
                /*
                 * See btrfs_delayed_item_reserve_metadata() for why no qgroup
                 * space has to be released/reserved here.
                 */
                trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
                                              total_reserved_size, 0);
                btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
                                        total_reserved_size, NULL);
        }

        list_for_each_entry_safe(curr, next, &batch_list, tree_list) {
                list_del(&curr->tree_list);
                btrfs_release_delayed_item(curr);
        }

        return 0;
}

static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
                                      struct btrfs_path *path,
                                      struct btrfs_root *root,
                                      struct btrfs_delayed_node *node)
{
        struct btrfs_key key;
        int ret = 0;

        key.objectid = node->inode_id;
        key.type = BTRFS_DIR_INDEX_KEY;

        while (ret == 0) {
                struct btrfs_delayed_item *item;

                mutex_lock(&node->mutex);
                item = __btrfs_first_delayed_deletion_item(node);
                if (!item) {
                        mutex_unlock(&node->mutex);
                        break;
                }

                key.offset = item->index;
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0) {
                        /*
                         * There's no matching item in the leaf. This means we
                         * have already deleted this item in a past run of the
                         * delayed items. We ignore errors when running delayed
                         * items from an async context, through a work queue job
                         * running btrfs_async_run_delayed_root(), and don't
                         * release delayed items that failed to complete. This
                         * is because we will retry later, and at transaction
                         * commit time we always run delayed items and will
                         * then deal with errors if they fail to run again.
                         *
                         * So just release delayed items for which we can't find
                         * an item in the tree, and move to the next item.
                         */
                        btrfs_release_path(path);
                        btrfs_release_delayed_item(item);
                        ret = 0;
                } else if (ret == 0) {
                        ret = btrfs_batch_delete_items(trans, root, path, item);
                        btrfs_release_path(path);
                }

                /*
                 * We unlock and relock on each iteration, this is to prevent
                 * blocking other tasks for too long while we are being run from
                 * the async context (work queue job). Those tasks are typically
                 * running system calls like creat/mkdir/rename/unlink/etc which
                 * need to add delayed items to this delayed node.
                 */
                mutex_unlock(&node->mutex);
        }

        return ret;
}

/*
 * Drop the "inode dirty" state of a delayed node, if it is set.
 *
 * Clears BTRFS_DELAYED_NODE_INODE_DIRTY, decrements the node's item count and
 * reports one finished item to the delayed root. A NULL node or a node
 * without the flag set is left untouched.
 */
static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
{
        if (!delayed_node)
                return;
        if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
                return;

        ASSERT(delayed_node->root);
        clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
        delayed_node->count--;

        finish_one_item(delayed_node->root->fs_info->delayed_root);
}

/*
 * Drop the pending "delete inode ref" state of a delayed node, if it is set.
 *
 * When BTRFS_DELAYED_NODE_DEL_IREF was set it is cleared, the node's item
 * count is decremented and one finished item is reported to the delayed root.
 */
static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
{
        if (!test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
                return;

        ASSERT(delayed_node->root);
        delayed_node->count--;

        finish_one_item(delayed_node->root->fs_info->delayed_root);
}

/*
 * Write the inode item cached in a delayed node into the tree and, when
 * BTRFS_DELAYED_NODE_DEL_IREF is set on the node, also delete the inode
 * ref/extref item of that inode.
 *
 * @trans: transaction handle, aborted here on any error other than -ENOENT
 * @root:  tree holding the inode item
 * @path:  caller provided path, used for the lookups
 * @node:  delayed node providing inode_id, inode_item and flags
 *
 * The callers visible in this file invoke it with node->mutex held.
 *
 * On every exit path the inode metadata reservation of the node is released
 * and the node's "inode dirty" state is dropped, including on failure.
 *
 * Returns 0 on success, -ENOENT if the inode item is not in the tree, or
 * another negative errno from the lookup, search or deletion.
 */
static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root,
                                        struct btrfs_path *path,
                                        struct btrfs_delayed_node *node)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key key;
        struct btrfs_inode_item *inode_item;
        struct extent_buffer *leaf;
        int mod;
        int ret;

        key.objectid = node->inode_id;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;

        /* An item is going to be deleted after the update only for DEL_IREF. */
        if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
                mod = -1;
        else
                mod = 1;

        ret = btrfs_lookup_inode(trans, root, path, &key, mod);
        /* A positive return means the inode item was not found. */
        if (ret > 0)
                ret = -ENOENT;
        if (ret < 0)
                goto out;

        /* Overwrite the on-disk inode item with the copy kept in the node. */
        leaf = path->nodes[0];
        inode_item = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_inode_item);
        write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
                            sizeof(struct btrfs_inode_item));
        btrfs_mark_buffer_dirty(trans, leaf);

        if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
                goto out;

        /*
         * Now we're going to delete the INODE_REF/EXTREF, which should be the
         * only one ref left.  Check if the next item is an INODE_REF/EXTREF.
         *
         * But if we're the last item already, release and search for the last
         * INODE_REF/EXTREF.
         */
        if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) {
                key.objectid = node->inode_id;
                key.type = BTRFS_INODE_EXTREF_KEY;
                key.offset = (u64)-1;

                btrfs_release_path(path);
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                /*
                 * This failure path skips the iref release and the path
                 * release done under the "out" label.
                 */
                if (ret < 0)
                        goto err_out;
                /*
                 * An exact match for offset (u64)-1 is not expected, so the
                 * search lands one slot past the candidate item; step back.
                 */
                ASSERT(ret > 0);
                ASSERT(path->slots[0] > 0);
                ret = 0;
                path->slots[0]--;
                leaf = path->nodes[0];
        } else {
                path->slots[0]++;
        }
        /* Only delete the item if it really is a ref of this inode. */
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        if (key.objectid != node->inode_id)
                goto out;
        if (key.type != BTRFS_INODE_REF_KEY &&
            key.type != BTRFS_INODE_EXTREF_KEY)
                goto out;

        /*
         * Delayed iref deletion is for the inode who has only one link,
         * so there is only one iref. The case that several irefs are
         * in the same item doesn't exist.
         */
        ret = btrfs_del_item(trans, root, path);
out:
        btrfs_release_delayed_iref(node);
        btrfs_release_path(path);
err_out:
        /* The last argument tells the helper whether the update failed. */
        btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
        btrfs_release_delayed_inode(node);

        /*
         * If we fail to update the delayed inode we need to abort the
         * transaction, because we could leave the inode with the improper
         * counts behind.
         */
        if (ret && ret != -ENOENT)
                btrfs_abort_transaction(trans, ret);

        return ret;
}

/*
 * Locked wrapper around __btrfs_update_delayed_inode().
 *
 * Takes node->mutex and updates the inode item only when the node has
 * BTRFS_DELAYED_NODE_INODE_DIRTY set. Returns 0 when there is nothing to do,
 * otherwise the result of __btrfs_update_delayed_inode().
 */
static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
                                             struct btrfs_root *root,
                                             struct btrfs_path *path,
                                             struct btrfs_delayed_node *node)
{
        int ret = 0;

        mutex_lock(&node->mutex);
        if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags))
                ret = __btrfs_update_delayed_inode(trans, root, path, node);
        mutex_unlock(&node->mutex);

        return ret;
}

/*
 * Flush everything queued on one delayed node into its root.
 *
 * The steps run in a fixed order: insertion items, deletion items, recording
 * the root in the transaction and finally the delayed inode update. The
 * first step that fails stops the sequence and its error is returned.
 */
static inline int
__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
                                   struct btrfs_path *path,
                                   struct btrfs_delayed_node *node)
{
        int ret;

        ret = btrfs_insert_delayed_items(trans, path, node->root, node);
        if (!ret)
                ret = btrfs_delete_delayed_items(trans, path, node->root, node);
        if (!ret)
                ret = btrfs_record_root_in_trans(trans, node->root);
        if (!ret)
                ret = btrfs_update_delayed_inode(trans, node->root, path, node);

        return ret;
}

/*
 * Called when committing the transaction.
 *
 * @trans: transaction handle; its block_rsv is temporarily switched to the
 *         filesystem's delayed_block_rsv and restored before returning
 * @nr:    when > 0, at most @nr delayed nodes are processed; when <= 0 all
 *         delayed nodes are processed
 *
 * Returns 0 on success.
 * Returns -EIO if the transaction is already aborted and -ENOMEM if no path
 * could be allocated.
 * Returns < 0 on error and returns with an aborted transaction with any
 * outstanding delayed items cleaned up.
 */
static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_root *delayed_root;
        struct btrfs_delayed_node *curr_node, *prev_node;
        struct btrfs_path *path;
        struct btrfs_block_rsv *block_rsv;
        int ret = 0;
        /* Whether @nr is a real limit (positive) or means "no limit". */
        bool count = (nr > 0);

        if (TRANS_ABORTED(trans))
                return -EIO;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Charge the work below to the delayed items reservation. */
        block_rsv = trans->block_rsv;
        trans->block_rsv = &fs_info->delayed_block_rsv;

        delayed_root = fs_info->delayed_root;

        curr_node = btrfs_first_delayed_node(delayed_root);
        /*
         * With a limit in place, nr-- is non-zero for exactly @nr iterations;
         * without a limit nr is never evaluated.
         */
        while (curr_node && (!count || nr--)) {
                ret = __btrfs_commit_inode_delayed_items(trans, path,
                                                         curr_node);
                if (ret) {
                        /* curr_node is released after the loop in this case. */
                        btrfs_abort_transaction(trans, ret);
                        break;
                }

                prev_node = curr_node;
                curr_node = btrfs_next_delayed_node(curr_node);
                /*
                 * See the comment below about releasing path before releasing
                 * node. If the commit of delayed items was successful the path
                 * should always be released, but in case of an error, it may
                 * point to locked extent buffers (a leaf at the very least).
                 */
                ASSERT(path->nodes[0] == NULL);
                btrfs_release_delayed_node(prev_node);
        }

        /*
         * Release the path to avoid a potential deadlock and lockdep splat when
         * releasing the delayed node, as that requires taking the delayed node's
         * mutex. If another task starts running delayed items before we take
         * the mutex, it will first lock the mutex and then it may try to lock
         * the same btree path (leaf).
         */
        btrfs_free_path(path);

        /* Non-NULL after an error or when the @nr limit stopped the loop. */
        if (curr_node)
                btrfs_release_delayed_node(curr_node);
        trans->block_rsv = block_rsv;

        return ret;
}

/*
 * Run the delayed items of every delayed node of the filesystem.
 *
 * Returns whatever __btrfs_run_delayed_items() returns.
 */
int btrfs_run_delayed_items(struct btrfs_trans_handle *trans)
{
        /* A non-positive count disables the per-call node limit. */
        const int no_limit = -1;

        return __btrfs_run_delayed_items(trans, no_limit);
}

/*
 * Run the delayed items of a bounded number of delayed nodes.
 *
 * @nr is passed unchanged to __btrfs_run_delayed_items(): a positive value
 * caps the number of delayed nodes processed, a value <= 0 processes all of
 * them.
 *
 * Returns whatever __btrfs_run_delayed_items() returns.
 */
int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
{
        return __btrfs_run_delayed_items(trans, nr);
}

int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
                                     struct btrfs_inode *inode)
{
        struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
        struct btrfs_path *path;
        struct btrfs_block_rsv *block_rsv;
        int ret;

        if (!delayed_node)
                return 0;

        mutex_lock(&delayed_node->mutex);
        if (!delayed_node->count) {
                mutex_unlock(&delayed_node->mutex);
                btrfs_release_delayed_node(delayed_node);
                return 0;
        }
        mutex_unlock(&delayed_node->mutex);

        path = btrfs_alloc_path();
        if (!path) {
                btrfs_release_delayed_node(delayed_node);
                return -ENOMEM;
        }

        block_rsv = trans->block_rsv;
        trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;

        ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);

        btrfs_release_delayed_node(delayed_node);
        btrfs_free_path(path);
        trans->block_rsv = block_rsv;

        return ret;
}

int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_trans_handle *trans;
        struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
        struct btrfs_path *path;
        struct btrfs_block_rsv *block_rsv;
        int ret;

        if (!delayed_node)
                return 0;

        mutex_lock(&delayed_node->mutex);
        if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
                mutex_unlock(&delayed_node->mutex);
                btrfs_release_delayed_node(delayed_node);
                return 0;
        }
        mutex_unlock(&delayed_node->mutex);

        trans = btrfs_join_transaction(delayed_node->root);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto trans_out;
        }

        block_rsv = trans->block_rsv;
        trans->block_rsv = &fs_info->delayed_block_rsv;

        mutex_lock(&delayed_node->mutex);
        if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
                ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
                                                   path, delayed_node);
        else
                ret = 0;
        mutex_unlock(&delayed_node->mutex);

        btrfs_free_path(path);
        trans->block_rsv = block_rsv;
trans_out:
        btrfs_end_transaction(trans);
        btrfs_btree_balance_dirty(fs_info);
out:
        btrfs_release_delayed_node(delayed_node);

        return ret;
}

/*
 * Detach the delayed node cached in @inode, if any, and release it.
 */
void btrfs_remove_delayed_node(struct btrfs_inode *inode)
{
        struct btrfs_delayed_node *node = READ_ONCE(inode->delayed_node);

        if (node) {
                inode->delayed_node = NULL;
                btrfs_release_delayed_node(node);
        }
}

/*
 * Context of one asynchronous run of delayed nodes. It is allocated by
 * btrfs_wq_run_delayed_node() and freed by btrfs_async_run_delayed_root().
 */
struct btrfs_async_delayed_work {
        /* Delayed root whose prepared delayed nodes are processed. */
        struct btrfs_delayed_root *delayed_root;
        /*
         * Upper bound on the number of delayed nodes to process; 0 selects
         * BTRFS_DELAYED_WRITEBACK as the bound instead.
         */
        int nr;
        /* Embedded work item; the handler recovers this struct from it. */
        struct btrfs_work work;
};

/*
 * Work queue handler that commits the delayed items of prepared delayed
 * nodes, one node per joined transaction.
 *
 * The loop ends when the number of pending items of the delayed root drops
 * below BTRFS_DELAYED_BACKGROUND / 2, when no prepared node is left, or when
 * the per-run bound is reached (async_work->nr, or BTRFS_DELAYED_WRITEBACK
 * when nr is 0).
 *
 * On exit, tasks sleeping on delayed_root->wait are woken up and the
 * btrfs_async_delayed_work that embeds @work is freed.
 */
static void btrfs_async_run_delayed_root(struct btrfs_work *work)
{
        struct btrfs_async_delayed_work *async_work;
        struct btrfs_delayed_root *delayed_root;
        struct btrfs_trans_handle *trans;
        struct btrfs_path *path;
        struct btrfs_delayed_node *delayed_node = NULL;
        struct btrfs_root *root;
        struct btrfs_block_rsv *block_rsv;
        int total_done = 0;

        async_work = container_of(work, struct btrfs_async_delayed_work, work);
        delayed_root = async_work->delayed_root;

        path = btrfs_alloc_path();
        if (!path)
                goto out;

        do {
                /* Enough of the backlog is gone, stop early. */
                if (atomic_read(&delayed_root->items) <
                    BTRFS_DELAYED_BACKGROUND / 2)
                        break;

                delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
                if (!delayed_node)
                        break;

                root = delayed_node->root;

                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        /*
                         * Skip this node but still count it; "continue" in a
                         * do/while jumps to the loop condition below.
                         */
                        btrfs_release_path(path);
                        btrfs_release_prepared_delayed_node(delayed_node);
                        total_done++;
                        continue;
                }

                block_rsv = trans->block_rsv;
                trans->block_rsv = &root->fs_info->delayed_block_rsv;

                /*
                 * The return value is not checked here; see the comment in
                 * btrfs_delete_delayed_items() about errors in the async
                 * context being ignored and retried later.
                 */
                __btrfs_commit_inode_delayed_items(trans, path, delayed_node);

                trans->block_rsv = block_rsv;
                btrfs_end_transaction(trans);
                btrfs_btree_balance_dirty_nodelay(root->fs_info);

                /* Release the path before releasing the delayed node. */
                btrfs_release_path(path);
                btrfs_release_prepared_delayed_node(delayed_node);
                total_done++;

        } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
                 || total_done < async_work->nr);

        btrfs_free_path(path);
out:
        wake_up(&delayed_root->wait);
        kfree(async_work);
}


/*
 * Queue an asynchronous run of delayed nodes on fs_info->delayed_workers.
 *
 * @nr is stored in the work context and interpreted by
 * btrfs_async_run_delayed_root().
 *
 * Returns 0 once the work is queued, or -ENOMEM if the work context could
 * not be allocated.
 */
static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
                                     struct btrfs_fs_info *fs_info, int nr)
{
        struct btrfs_async_delayed_work *job = kmalloc(sizeof(*job), GFP_NOFS);

        if (!job)
                return -ENOMEM;

        job->nr = nr;
        job->delayed_root = delayed_root;
        btrfs_init_work(&job->work, btrfs_async_run_delayed_root, NULL);

        btrfs_queue_work(fs_info->delayed_workers, &job->work);

        return 0;
}

/*
 * Emit a warning if the delayed root of @fs_info still has a delayed node.
 */
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
{
        struct btrfs_delayed_node *first;

        first = btrfs_first_delayed_node(fs_info->delayed_root);
        WARN_ON(first);
}

/*
 * Wait condition used by btrfs_balance_delayed_items().
 *
 * Returns 1 when the waiter may stop waiting: the items sequence counter went
 * below @seq or advanced by at least BTRFS_DELAYED_BATCH past it, or the
 * number of pending items is below BTRFS_DELAYED_BACKGROUND. Returns 0
 * otherwise.
 */
static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
{
        const int cur_seq = atomic_read(&delayed_root->items_seq);

        if (cur_seq < seq)
                return 1;
        if (cur_seq >= seq + BTRFS_DELAYED_BATCH)
                return 1;

        return atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND;
}

/*
 * Kick off background processing of delayed items when too many are pending.
 *
 * Below BTRFS_DELAYED_BACKGROUND pending items, or when the delayed workers
 * queue is congested, nothing is done. Below BTRFS_DELAYED_WRITEBACK a batch
 * of BTRFS_DELAYED_BATCH nodes is queued without waiting. At or above
 * BTRFS_DELAYED_WRITEBACK a run is queued and the caller sleeps
 * (interruptibly) until could_end_wait() allows it to continue.
 */
void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
{
        struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;
        int seq;

        if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
                return;
        if (btrfs_workqueue_normal_congested(fs_info->delayed_workers))
                return;

        if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_WRITEBACK) {
                btrfs_wq_run_delayed_node(delayed_root, fs_info,
                                          BTRFS_DELAYED_BATCH);
                return;
        }

        /* Sample the sequence before queueing so progress can be measured. */
        seq = atomic_read(&delayed_root->items_seq);

        if (btrfs_wq_run_delayed_node(delayed_root, fs_info, 0))
                return;

        wait_event_interruptible(delayed_root->wait,
                                 could_end_wait(delayed_root, seq));
}

/*
 * Give back one unit of metadata space reserved in @trans for a dir index
 * item insertion. Does nothing while BTRFS_FS_LOG_RECOVERING is set.
 */
static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        const u64 unit = btrfs_calc_insert_metadata_size(fs_info, 1);

        if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
                /*
                 * Adding the new dir index item does not require touching
                 * another leaf, so 1 unit of the metadata reserved when the
                 * transaction was started can be released. This only applies
                 * to a started transaction, not to the transaction join case
                 * (when replaying log trees).
                 */
                trace_btrfs_space_reservation(fs_info, "transaction",
                                              trans->transid, unit, 0);
                btrfs_block_rsv_release(fs_info, trans->block_rsv, unit, NULL);
                ASSERT(trans->bytes_reserved >= unit);
                trans->bytes_reserved -= unit;
        }
}

/*
 * Queue the insertion of a dir index item for directory @dir as a delayed
 * item.
 *
 * @trans:    transaction handle; its transid is stored in the dir item
 * @name:     entry name, @name_len bytes are copied right after the dir item
 * @name_len: length of @name in bytes
 * @dir:      directory the entry is added to
 * @disk_key: key stored as the location of the dir item
 * @flags:    dir item flags
 * @index:    dir index number of the new entry
 *
 * Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected).
 * An error from getting/creating the delayed node or from the metadata
 * reservation is returned as is.
 */
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
                                   const char *name, int name_len,
                                   struct btrfs_inode *dir,
                                   struct btrfs_disk_key *disk_key, u8 flags,
                                   u64 index)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
        struct btrfs_delayed_node *delayed_node;
        struct btrfs_delayed_item *delayed_item;
        struct btrfs_dir_item *dir_item;
        bool reserve_leaf_space;
        u32 data_len;
        int ret;

        delayed_node = btrfs_get_or_create_delayed_node(dir);
        if (IS_ERR(delayed_node))
                return PTR_ERR(delayed_node);

        /* The item payload is a dir item header followed by the name. */
        delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
                                                delayed_node,
                                                BTRFS_DELAYED_INSERTION_ITEM);
        if (!delayed_item) {
                ret = -ENOMEM;
                goto release_node;
        }

        delayed_item->index = index;

        dir_item = (struct btrfs_dir_item *)delayed_item->data;
        dir_item->location = *disk_key;
        btrfs_set_stack_dir_transid(dir_item, trans->transid);
        btrfs_set_stack_dir_data_len(dir_item, 0);
        btrfs_set_stack_dir_name_len(dir_item, name_len);
        btrfs_set_stack_dir_flags(dir_item, flags);
        memcpy((char *)(dir_item + 1), name, name_len);

        /* Size accounted for this item: payload plus one item header. */
        data_len = delayed_item->data_len + sizeof(struct btrfs_item);

        mutex_lock(&delayed_node->mutex);

        /*
         * First attempt to insert the delayed item. This is to make the error
         * handling path simpler in case we fail (-EEXIST). There's no risk of
         * any other task coming in and running the delayed item before we do
         * the metadata space reservation below, because we are holding the
         * delayed node's mutex and that mutex must also be locked before the
         * node's delayed items can be run.
         */
        ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
        if (unlikely(ret)) {
                btrfs_err(trans->fs_info,
"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d",
                          name_len, name, index, btrfs_root_id(delayed_node->root),
                          delayed_node->inode_id, dir->index_cnt,
                          delayed_node->index_cnt, ret);
                btrfs_release_delayed_item(delayed_item);
                btrfs_release_dir_index_item_space(trans);
                mutex_unlock(&delayed_node->mutex);
                goto release_node;
        }

        /*
         * A new leaf reservation is needed when no leaf is reserved yet or
         * when this item no longer fits in the current batch; otherwise the
         * item is added to the current batch.
         */
        if (delayed_node->index_item_leaves == 0 ||
            delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
                delayed_node->curr_index_batch_size = data_len;
                reserve_leaf_space = true;
        } else {
                delayed_node->curr_index_batch_size += data_len;
                reserve_leaf_space = false;
        }

        if (reserve_leaf_space) {
                ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
                /*
                 * Space was reserved for a dir index item insertion when we
                 * started the transaction, so getting a failure here should be
                 * impossible.
                 */
                if (WARN_ON(ret)) {
                        btrfs_release_delayed_item(delayed_item);
                        mutex_unlock(&delayed_node->mutex);
                        goto release_node;
                }

                delayed_node->index_item_leaves++;
        } else {
                /* The current batch covers this item, give the unit back. */
                btrfs_release_dir_index_item_space(trans);
        }
        mutex_unlock(&delayed_node->mutex);

release_node:
        btrfs_release_delayed_node(delayed_node);
        return ret;
}

/*
 * Cancel a pending delayed insertion of dir index @index on @node.
 *
 * Returns 0 if a matching insertion item was found and released, 1 if the
 * node has no insertion item for @index.
 */
static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
                                               struct btrfs_delayed_node *node,
                                               u64 index)
{
        struct btrfs_delayed_item *ins_item;
        int ret = 1;

        mutex_lock(&node->mutex);

        ins_item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
        if (!ins_item)
                goto unlock;

        /*
         * Reserved metadata bytes for items to insert are tracked per leaf
         * that will be used, not per item.
         * See btrfs_insert_delayed_dir_index() and
         * btrfs_delayed_item_reserve_metadata()).
         */
        ASSERT(ins_item->bytes_reserved == 0);
        ASSERT(node->index_item_leaves > 0);

        /*
         * With a single reserved leaf the item can be subtracted from the
         * current batch. With more leaves it can not, because it is unknown
         * which leaf the item belongs to. With the current limit on delayed
         * items, enough dir index items to fill more than one leaf are rarely
         * accumulated (even when using a leaf size of 4K).
         */
        if (node->index_item_leaves == 1) {
                const u32 item_size = ins_item->data_len + sizeof(struct btrfs_item);

                ASSERT(node->curr_index_batch_size >= item_size);
                node->curr_index_batch_size -= item_size;
        }

        btrfs_release_delayed_item(ins_item);

        /* Once no dir index item is left, all leaves can be released. */
        if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) {
                btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
                node->index_item_leaves = 0;
        }

        ret = 0;
unlock:
        mutex_unlock(&node->mutex);
        return ret;
}

/*
 * Queue the deletion of dir index @index of directory @dir.
 *
 * If a delayed insertion for the same index is still pending it is simply
 * cancelled. Otherwise a delayed deletion item is allocated, metadata space
 * is reserved for it and it is added to the delayed node under the node's
 * mutex.
 *
 * @trans: transaction handle used for the metadata reservation
 * @dir:   directory the index belongs to
 * @index: dir index number to delete
 *
 * Returns 0 on success, -ENOMEM if the deletion item could not be allocated,
 * the error from getting/creating the delayed node, or the error from the
 * metadata reservation or from adding the item to the node.
 */
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
                                   struct btrfs_inode *dir, u64 index)
{
        struct btrfs_delayed_node *node;
        struct btrfs_delayed_item *item;
        int ret;

        node = btrfs_get_or_create_delayed_node(dir);
        if (IS_ERR(node))
                return PTR_ERR(node);

        /* A return of 0 means a pending insertion was cancelled; we're done. */
        ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
        if (!ret)
                goto end;

        item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
        if (!item) {
                ret = -ENOMEM;
                goto end;
        }

        item->index = index;

        ret = btrfs_delayed_item_reserve_metadata(trans, item);
        /*
         * we have reserved enough space when we start a new transaction,
         * so reserving metadata failure is impossible.
         */
        if (ret < 0) {
                btrfs_err(trans->fs_info,
"metadata reservation failed for delayed dir item deletion, should have been reserved");
                btrfs_release_delayed_item(item);
                goto end;
        }

        mutex_lock(&node->mutex);
        ret = __btrfs_add_delayed_item(node, item);
        if (unlikely(ret)) {
                btrfs_err(trans->fs_info,
                          "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
                          index, btrfs_root_id(node->root),
                          node->inode_id, ret);
                /* Undo the reservation made above before dropping the item. */
                btrfs_delayed_item_release_metadata(dir->root, item);
                btrfs_release_delayed_item(item);
        }
        mutex_unlock(&node->mutex);
end:
        btrfs_release_delayed_node(node);
        return ret;
}

/*
 * Copy the dir index counter kept in the delayed node into @inode.
 *
 * Returns 0 on success, -ENOENT if the inode has no delayed node, or -EINVAL
 * if the delayed node's index counter is zero.
 */
int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
{
        struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
        int ret = 0;

        if (!delayed_node)
                return -ENOENT;

        /*
         * The i_mutex of this directory is held, so no new directory index
         * can be added to the delayed node and index_cnt can not change now.
         * Hence the delayed node does not need to be locked.
         */
        if (delayed_node->index_cnt)
                inode->index_cnt = delayed_node->index_cnt;
        else
                ret = -EINVAL;

        btrfs_release_delayed_node(delayed_node);
        return ret;
}

/*
 * Collect the delayed insertion and deletion items of a directory with an
 * index up to @last_index, for use by readdir.
 *
 * Each collected item gets an extra reference and is linked through its
 * readdir_list member onto @ins_list or @del_list. The inode lock is
 * switched from shared to exclusive before collecting.
 *
 * Returns false if the inode has no delayed node (nothing is changed then),
 * true otherwise.
 */
bool btrfs_readdir_get_delayed_items(struct inode *inode,
                                     u64 last_index,
                                     struct list_head *ins_list,
                                     struct list_head *del_list)
{
        struct btrfs_delayed_node *delayed_node;
        struct btrfs_delayed_item *it;

        delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
        if (!delayed_node)
                return false;

        /*
         * Only one readdir with delayed items can run at a time because of
         * item->readdir_list.
         */
        btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
        btrfs_inode_lock(BTRFS_I(inode), 0);

        mutex_lock(&delayed_node->mutex);

        for (it = __btrfs_first_delayed_insertion_item(delayed_node);
             it && it->index <= last_index;
             it = __btrfs_next_delayed_item(it)) {
                refcount_inc(&it->refs);
                list_add_tail(&it->readdir_list, ins_list);
        }

        for (it = __btrfs_first_delayed_deletion_item(delayed_node);
             it && it->index <= last_index;
             it = __btrfs_next_delayed_item(it)) {
                refcount_inc(&it->refs);
                list_add_tail(&it->readdir_list, del_list);
        }

        mutex_unlock(&delayed_node->mutex);

        /*
         * The delayed node is still cached in the btrfs inode, so refs must
         * be > 1 now and there is no need to check whether it is going to be
         * freed.
         *
         * Also, this function serves readdir, during which delayed items are
         * neither inserted nor deleted, so the delayed node does not need to
         * be requeued or dequeued either.
         */
        refcount_dec(&delayed_node->refs);

        return true;
}

void btrfs_readdir_put_delayed_items(struct inode *inode,
                                     struct list_head *ins_list,
                                     struct list_head *del_list)
{
        struct btrfs_delayed_item *curr, *next;

        list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
                list_del(&curr->readdir_list);
                if (refcount_dec_and_test(&curr->refs))
                        kfree(curr);
        }

        list_for_each_entry_safe(curr, next, del_list, readdir_list) {
                list_del(&curr->readdir_list);
                if (refcount_dec_and_test(&curr->refs))
                        kfree(curr);
        }

        /*
         * The VFS is going to do up_read(), so we need to downgrade back to a
         * read lock.
         */
        downgrade_write(&inode->i_rwsem);
}

/*
 * Tell whether @index has a pending delayed deletion on @del_list.
 *
 * The walk stops at the first entry whose index is greater than @index.
 *
 * Returns 1 if an entry with exactly @index is found, 0 otherwise.
 */
int btrfs_should_delete_dir_index(struct list_head *del_list,
                                  u64 index)
{
        struct btrfs_delayed_item *entry;

        list_for_each_entry(entry, del_list, readdir_list) {
                if (entry->index == index)
                        return 1;
                if (entry->index > index)
                        return 0;
        }

        return 0;
}

/*
 * Emit the directory entries stored as delayed insertion items.
 *
 * Every item is unlinked from @ins_list and its readdir reference dropped.
 * Items with an index below ctx->pos are skipped; the others are passed to
 * dir_emit() with ctx->pos set to their index.
 *
 * Returns 1 as soon as dir_emit() refuses an entry, 0 once the list has been
 * walked completely.
 */
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
                                    struct list_head *ins_list)
{
        struct btrfs_delayed_item *item, *tmp;

        /*
         * The data of a delayed item never changes, so the items need no
         * locking. The i_mutex of the directory is held too, so nobody can
         * delete any directory indexes now.
         */
        list_for_each_entry_safe(item, tmp, ins_list, readdir_list) {
                struct btrfs_dir_item *di;
                struct btrfs_key location;
                unsigned char d_type;
                char *name;
                int name_len;
                bool emitted;

                list_del(&item->readdir_list);

                if (item->index < ctx->pos) {
                        if (refcount_dec_and_test(&item->refs))
                                kfree(item);
                        continue;
                }

                ctx->pos = item->index;

                /* The name is stored right behind the dir item header. */
                di = (struct btrfs_dir_item *)item->data;
                name = (char *)(di + 1);
                name_len = btrfs_stack_dir_name_len(di);

                d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type));
                btrfs_disk_key_to_cpu(&location, &di->location);

                emitted = dir_emit(ctx, name, name_len,
                                   location.objectid, d_type);

                if (refcount_dec_and_test(&item->refs))
                        kfree(item);

                if (!emitted)
                        return 1;
                ctx->pos++;
        }

        return 0;
}

static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
                                  struct btrfs_inode_item *inode_item,
                                  struct inode *inode)
{
        u64 flags;

        btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
        btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
        btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
        btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
        btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
        btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
        btrfs_set_stack_inode_generation(inode_item,
                                         BTRFS_I(inode)->generation);
        btrfs_set_stack_inode_sequence(inode_item,
                                       inode_peek_iversion(inode));
        btrfs_set_stack_inode_transid(inode_item, trans->transid);
        btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
        flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
                                          BTRFS_I(inode)->ro_flags);
        btrfs_set_stack_inode_flags(inode_item, flags);
        btrfs_set_stack_inode_block_group(inode_item, 0);

        btrfs_set_stack_timespec_sec(&inode_item->atime,
                                     inode_get_atime_sec(inode));
        btrfs_set_stack_timespec_nsec(&inode_item->atime,
                                      inode_get_atime_nsec(inode));

        btrfs_set_stack_timespec_sec(&inode_item->mtime,
                                     inode_get_mtime_sec(inode));
        btrfs_set_stack_timespec_nsec(&inode_item->mtime,
                                      inode_get_mtime_nsec(inode));

        btrfs_set_stack_timespec_sec(&inode_item->ctime,
                                     inode_get_ctime_sec(inode));
        btrfs_set_stack_timespec_nsec(&inode_item->ctime,
                                      inode_get_ctime_nsec(inode));

        btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec);
        btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec);
}

/*
 * Populate @inode from the inode item cached in its delayed node.
 *
 * The device number is not stored in the inode (i_rdev is set to 0); it is
 * handed back through @rdev instead.
 *
 * Returns 0 on success, or -ENOENT when the inode has no delayed node or the
 * delayed node does not carry a dirty inode item.
 */
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
{
        struct btrfs_inode *binode = BTRFS_I(inode);
        struct btrfs_fs_info *fs_info = binode->root->fs_info;
        struct btrfs_delayed_node *node;
        struct btrfs_inode_item *item;
        int ret;

        node = btrfs_get_delayed_node(binode);
        if (!node)
                return -ENOENT;

        mutex_lock(&node->mutex);
        if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
                ret = -ENOENT;
                goto out;
        }

        item = &node->inode_item;

        i_uid_write(inode, btrfs_stack_inode_uid(item));
        i_gid_write(inode, btrfs_stack_inode_gid(item));

        /* The extent range below depends on the size written here. */
        btrfs_i_size_write(binode, btrfs_stack_inode_size(item));
        btrfs_inode_set_file_extent_range(binode, 0,
                        round_up(i_size_read(inode), fs_info->sectorsize));

        inode->i_mode = btrfs_stack_inode_mode(item);
        set_nlink(inode, btrfs_stack_inode_nlink(item));
        inode_set_bytes(inode, btrfs_stack_inode_nbytes(item));
        binode->generation = btrfs_stack_inode_generation(item);
        binode->last_trans = btrfs_stack_inode_transid(item);

        inode_set_iversion_queried(inode, btrfs_stack_inode_sequence(item));

        /* The device number is returned to the caller, not stored here. */
        inode->i_rdev = 0;
        *rdev = btrfs_stack_inode_rdev(item);

        btrfs_inode_split_flags(btrfs_stack_inode_flags(item),
                                &binode->flags, &binode->ro_flags);

        inode_set_atime(inode, btrfs_stack_timespec_sec(&item->atime),
                        btrfs_stack_timespec_nsec(&item->atime));
        inode_set_mtime(inode, btrfs_stack_timespec_sec(&item->mtime),
                        btrfs_stack_timespec_nsec(&item->mtime));
        inode_set_ctime(inode, btrfs_stack_timespec_sec(&item->ctime),
                        btrfs_stack_timespec_nsec(&item->ctime));

        binode->i_otime_sec = btrfs_stack_timespec_sec(&item->otime);
        binode->i_otime_nsec = btrfs_stack_timespec_nsec(&item->otime);

        inode->i_generation = binode->generation;
        binode->index_cnt = (u64)-1;

        ret = 0;
out:
        mutex_unlock(&node->mutex);
        btrfs_release_delayed_node(node);
        return ret;
}

/*
 * Record the current state of @inode in its delayed node, creating the
 * delayed node if necessary.
 *
 * If the delayed node does not hold a dirty inode item yet, metadata is
 * reserved first and, once the item is filled, the node is marked dirty and
 * accounted in the per-node and per-filesystem item counters.
 *
 * Returns 0 on success or a negative errno from the delayed node lookup or
 * the metadata reservation.
 */
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
                               struct btrfs_inode *inode)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_delayed_node *node;
        int was_dirty;
        int ret = 0;

        node = btrfs_get_or_create_delayed_node(inode);
        if (IS_ERR(node))
                return PTR_ERR(node);

        mutex_lock(&node->mutex);

        was_dirty = test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags);
        if (!was_dirty) {
                /* First update of this node: space must be reserved. */
                ret = btrfs_delayed_inode_reserve_metadata(trans, root, node);
                if (ret)
                        goto unlock;
        }

        fill_stack_inode_item(trans, &node->inode_item, &inode->vfs_inode);

        if (!was_dirty) {
                set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags);
                node->count++;
                atomic_inc(&root->fs_info->delayed_root->items);
        }

unlock:
        mutex_unlock(&node->mutex);
        btrfs_release_delayed_node(node);
        return ret;
}

/*
 * Flag the delayed node of @inode so that its inode ref gets deleted later.
 *
 * Returns 0 on success (also when the flag was already set), -EAGAIN while
 * the filesystem is recovering its log, or a negative errno if the delayed
 * node can not be obtained.
 */
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_delayed_node *node;

        /*
         * Delayed inode updates are not done during log recovery because
         * they lead to ENOSPC problems, which means delayed inode ref
         * deletion can not be done either.
         */
        if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
                return -EAGAIN;

        node = btrfs_get_or_create_delayed_node(inode);
        if (IS_ERR(node))
                return PTR_ERR(node);

        /*
         * No space is reserved for the inode ref deletion, because:
         * - Async inode ref deletion is ONLY done for inodes with a single
         *   link (i_nlink == 1), so there is only one inode ref. In most
         *   cases the inode ref and the inode item live in the same leaf
         *   and are dealt with at the same time. Since space is certainly
         *   reserved for the inode item, reserving more for the inode ref
         *   deletion is unnecessary.
         * - If the inode ref and the inode item are not in the same leaf,
         *   ENOSPC is still not a concern, because much more space is
         *   reserved for the inode update than it needs.
         * - At worst, some space can be stolen from the global reservation.
         *   That is very rare.
         */
        mutex_lock(&node->mutex);
        if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) {
                set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags);
                node->count++;
                atomic_inc(&fs_info->delayed_root->items);
        }
        mutex_unlock(&node->mutex);

        btrfs_release_delayed_node(node);
        return 0;
}

/*
 * Discard everything queued on @delayed_node: the insertion items (and the
 * leaves accounted for index items), the deletion items together with their
 * metadata, the delayed inode ref, and the dirty inode item if there is one.
 * The whole operation runs under the node's mutex.
 */
static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
{
        struct btrfs_root *root = delayed_node->root;
        struct btrfs_delayed_item *item, *next;

        mutex_lock(&delayed_node->mutex);

        /* Fetch the successor before the current item is released. */
        for (item = __btrfs_first_delayed_insertion_item(delayed_node);
             item; item = next) {
                next = __btrfs_next_delayed_item(item);
                btrfs_release_delayed_item(item);
        }

        if (delayed_node->index_item_leaves > 0) {
                btrfs_delayed_item_release_leaves(delayed_node,
                                          delayed_node->index_item_leaves);
                delayed_node->index_item_leaves = 0;
        }

        for (item = __btrfs_first_delayed_deletion_item(delayed_node);
             item; item = next) {
                btrfs_delayed_item_release_metadata(root, item);
                next = __btrfs_next_delayed_item(item);
                btrfs_release_delayed_item(item);
        }

        btrfs_release_delayed_iref(delayed_node);

        if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
                btrfs_delayed_inode_release_metadata(root->fs_info,
                                                     delayed_node, false);
                btrfs_release_delayed_inode(delayed_node);
        }

        mutex_unlock(&delayed_node->mutex);
}

/*
 * Throw away all delayed items of @inode. Does nothing if the inode has no
 * delayed node.
 */
void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
{
        struct btrfs_delayed_node *node = btrfs_get_delayed_node(inode);

        if (node) {
                __btrfs_kill_delayed_node(node);
                /* Drop the reference taken by the lookup above. */
                btrfs_release_delayed_node(node);
        }
}

/*
 * Kill the delayed items of every delayed node found in @root's
 * delayed_nodes xarray.
 *
 * Nodes are collected in batches of up to ARRAY_SIZE(delayed_nodes) while
 * holding root->inode_lock, each with an extra reference, and are then
 * killed and released with the lock dropped.
 *
 * NOTE(review): the only exit is the xa_empty() check, so this relies on
 * nodes leaving the xarray at some point - presumably when their last
 * reference is released; confirm in btrfs_release_delayed_node().
 */
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
        unsigned long index = 0;
        struct btrfs_delayed_node *delayed_nodes[8];

        while (1) {
                struct btrfs_delayed_node *node;
                int count;

                spin_lock(&root->inode_lock);
                if (xa_empty(&root->delayed_nodes)) {
                        spin_unlock(&root->inode_lock);
                        return;
                }

                count = 0;
                /* Resume the walk at the index reached by the previous batch. */
                xa_for_each_start(&root->delayed_nodes, index, node, index) {
                        /*
                         * Don't increase refs in case the node is dead and
                         * about to be removed from the tree in the loop below
                         */
                        if (refcount_inc_not_zero(&node->refs)) {
                                delayed_nodes[count] = node;
                                count++;
                        }
                        /* Batch is full; the rest is picked up next round. */
                        if (count >= ARRAY_SIZE(delayed_nodes))
                                break;
                }
                spin_unlock(&root->inode_lock);
                /* Step past the last index visited so it is not revisited. */
                index++;

                /* Kill the batch and drop the references taken above. */
                for (int i = 0; i < count; i++) {
                        __btrfs_kill_delayed_node(delayed_nodes[i]);
                        btrfs_release_delayed_node(delayed_nodes[i]);
                }
        }
}

/*
 * Walk all delayed nodes reachable from fs_info->delayed_root, kill the
 * delayed items of each one and release it.
 */
void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
{
        struct btrfs_delayed_node *node;

        node = btrfs_first_delayed_node(fs_info->delayed_root);
        while (node) {
                struct btrfs_delayed_node *next;

                __btrfs_kill_delayed_node(node);

                /* Look up the successor before this node is released. */
                next = btrfs_next_delayed_node(node);
                btrfs_release_delayed_node(node);
                node = next;
        }
}

/*
 * Collect the delayed items of @inode that still have to be logged.
 *
 * Insertion items are appended to @ins_list and deletion items to @del_list
 * through their log_list member; each collected item gets an extra
 * reference. Items that are already marked as logged or that already sit
 * on some log list are left alone. Does nothing if the inode has no delayed
 * node.
 */
void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
                                 struct list_head *ins_list,
                                 struct list_head *del_list)
{
        struct btrfs_delayed_node *node = btrfs_get_delayed_node(inode);
        struct btrfs_delayed_item *item;

        if (!node)
                return;

        mutex_lock(&node->mutex);

        /*
         * An item may already be on a log list. This happens when two tasks
         * try to log the same directory, for example tasks A and B:
         *
         * Task A collected the delayed items into a log list while holding
         * the inode's log_mutex (at btrfs_log_inode()), but it only releases
         * the items after logging the inodes they point to (if they are new
         * inodes), which happens after unlocking the log mutex;
         *
         * Task B enters btrfs_log_inode() and acquires the log_mutex of the
         * same directory inode before task A releases the delayed items.
         * This can happen for example when logging some inode requires
         * logging its parent directory, so logging two files that share a
         * parent directory can lead to this.
         *
         * In that case simply ignore delayed items that are already on a
         * log list. All the tasks logging the directory are under a log
         * transaction and whichever finishes first can not sync the log
         * before the other completes and leaves the log transaction.
         */
        for (item = __btrfs_first_delayed_insertion_item(node); item;
             item = __btrfs_next_delayed_item(item)) {
                if (item->logged || !list_empty(&item->log_list))
                        continue;
                refcount_inc(&item->refs);
                list_add_tail(&item->log_list, ins_list);
        }

        /* The same rule about items already on a log list applies here. */
        for (item = __btrfs_first_delayed_deletion_item(node); item;
             item = __btrfs_next_delayed_item(item)) {
                if (item->logged || !list_empty(&item->log_list))
                        continue;
                refcount_inc(&item->refs);
                list_add_tail(&item->log_list, del_list);
        }

        mutex_unlock(&node->mutex);

        /*
         * This runs during inode logging, so the inode is in use and can not
         * be evicted before logging finishes; the reference dropped here is
         * therefore never the last one on the delayed node.
         * btrfs_release_delayed_node() is deliberately not used: it would
         * requeue the delayed node (change its order in the list of prepared
         * nodes), which is unwanted because no delayed items were created or
         * deleted.
         */
        ASSERT(refcount_read(&node->refs) > 1);
        refcount_dec(&node->refs);
}

/*
 * Mark every delayed item on @list as logged, unlink it from the list and
 * drop one reference on it, freeing the item when that was the last one.
 */
static void put_logged_delayed_items(struct list_head *list)
{
        struct btrfs_delayed_item *cur, *tmp;

        list_for_each_entry_safe(cur, tmp, list, log_list) {
                cur->logged = true;
                list_del_init(&cur->log_list);
                if (refcount_dec_and_test(&cur->refs))
                        kfree(cur);
        }
}

/*
 * Give back the delayed items of @inode held on @ins_list and @del_list:
 * each item is flagged as logged, removed from its list and has one
 * reference dropped. Does nothing if the inode has no delayed node.
 */
void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
                                 struct list_head *ins_list,
                                 struct list_head *del_list)
{
        struct btrfs_delayed_node *node = btrfs_get_delayed_node(inode);

        if (!node)
                return;

        mutex_lock(&node->mutex);
        put_logged_delayed_items(ins_list);
        put_logged_delayed_items(del_list);
        mutex_unlock(&node->mutex);

        /*
         * This runs during inode logging, so the inode is in use and can not
         * be evicted before logging finishes; the reference dropped here is
         * therefore never the last one on the delayed node.
         * btrfs_release_delayed_node() is deliberately not used: it would
         * requeue the delayed node (change its order in the list of prepared
         * nodes), which is unwanted because no delayed items were created or
         * deleted.
         */
        ASSERT(refcount_read(&node->refs) > 1);
        refcount_dec(&node->refs);
}



























































































































































    1 












































    1 



































    1 





































    1 







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2009-2021 Christoph Hellwig
 *
 * NOTE: none of these tracepoints shall be considered a stable kernel ABI
 * as they can change at any time.
 *
 * Current conventions for printing numbers measuring specific units:
 *
 * offset: byte offset into a subcomponent of a file operation
 * pos: file offset, in bytes
 * length: length of a file operation, in bytes
 * ino: inode number
 *
 * Numbers describing space allocations should be formatted in hexadecimal.
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM iomap

#if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _IOMAP_TRACE_H

#include <linux/tracepoint.h>

struct inode;

/*
 * Event class taking an inode and a page count. Records the device of the
 * inode's superblock, the inode number and @nr_pages.
 */
DECLARE_EVENT_CLASS(iomap_readpage_class,
        TP_PROTO(struct inode *inode, int nr_pages),
        TP_ARGS(inode, nr_pages),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(u64, ino)
                __field(int, nr_pages)
        ),
        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->nr_pages = nr_pages;
        ),
        TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->nr_pages)
)

/* Define event @name as an instance of iomap_readpage_class. */
#define DEFINE_READPAGE_EVENT(name)                \
DEFINE_EVENT(iomap_readpage_class, name,        \
        TP_PROTO(struct inode *inode, int nr_pages), \
        TP_ARGS(inode, nr_pages))
DEFINE_READPAGE_EVENT(iomap_readpage);
DEFINE_READPAGE_EVENT(iomap_readahead);

/*
 * Event class describing a range of an inode. Records the device, the inode
 * number, the inode size as returned by i_size_read(), and the @off/@len
 * pair, all printed in hexadecimal except the device numbers.
 */
DECLARE_EVENT_CLASS(iomap_range_class,
        TP_PROTO(struct inode *inode, loff_t off, u64 len),
        TP_ARGS(inode, off, len),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(u64, ino)
                __field(loff_t, size)
                __field(loff_t, offset)
                __field(u64, length)
        ),
        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->size = i_size_read(inode);
                __entry->offset = off;
                __entry->length = len;
        ),
        TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx length 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->size,
                  __entry->offset,
                  __entry->length)
)

/* Define event @name as an instance of iomap_range_class. */
#define DEFINE_RANGE_EVENT(name)                \
DEFINE_EVENT(iomap_range_class, name,        \
        TP_PROTO(struct inode *inode, loff_t off, u64 len),\
        TP_ARGS(inode, off, len))
DEFINE_RANGE_EVENT(iomap_writepage);
DEFINE_RANGE_EVENT(iomap_release_folio);
DEFINE_RANGE_EVENT(iomap_invalidate_folio);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
DEFINE_RANGE_EVENT(iomap_dio_rw_queued);

/* Value/name pairs for printing an iomap type with __print_symbolic(). */
#define IOMAP_TYPE_STRINGS \
        { IOMAP_HOLE,                "HOLE" }, \
        { IOMAP_DELALLOC,        "DELALLOC" }, \
        { IOMAP_MAPPED,                "MAPPED" }, \
        { IOMAP_UNWRITTEN,        "UNWRITTEN" }, \
        { IOMAP_INLINE,                "INLINE" }

/* Flag/name pairs for printing iomap_iter flags with __print_flags(). */
#define IOMAP_FLAGS_STRINGS \
        { IOMAP_WRITE,                "WRITE" }, \
        { IOMAP_ZERO,                "ZERO" }, \
        { IOMAP_REPORT,                "REPORT" }, \
        { IOMAP_FAULT,                "FAULT" }, \
        { IOMAP_DIRECT,                "DIRECT" }, \
        { IOMAP_NOWAIT,                "NOWAIT" }

/* Flag/name pairs for printing the flags of a struct iomap with __print_flags(). */
#define IOMAP_F_FLAGS_STRINGS \
        { IOMAP_F_NEW,                "NEW" }, \
        { IOMAP_F_DIRTY,        "DIRTY" }, \
        { IOMAP_F_SHARED,        "SHARED" }, \
        { IOMAP_F_MERGED,        "MERGED" }, \
        { IOMAP_F_BUFFER_HEAD,        "BH" }, \
        { IOMAP_F_SIZE_CHANGED,        "SIZE_CHANGED" }

/* Flag/name pairs for printing direct I/O flags with __print_flags(). */
#define IOMAP_DIO_STRINGS \
        {IOMAP_DIO_FORCE_WAIT,        "DIO_FORCE_WAIT" }, \
        {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \
        {IOMAP_DIO_PARTIAL,        "DIO_PARTIAL" }

/*
 * Event class describing one struct iomap of an inode. Records the
 * mapping's addr, offset, length, type and flags, plus the device number of
 * iomap->bdev (0 when the mapping has no block device). The type is printed
 * symbolically via IOMAP_TYPE_STRINGS and the flags via
 * IOMAP_F_FLAGS_STRINGS.
 */
DECLARE_EVENT_CLASS(iomap_class,
        TP_PROTO(struct inode *inode, struct iomap *iomap),
        TP_ARGS(inode, iomap),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(u64, ino)
                __field(u64, addr)
                __field(loff_t, offset)
                __field(u64, length)
                __field(u16, type)
                __field(u16, flags)
                __field(dev_t, bdev)
        ),
        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->addr = iomap->addr;
                __entry->offset = iomap->offset;
                __entry->length = iomap->length;
                __entry->type = iomap->type;
                __entry->flags = iomap->flags;
                __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
        ),
        TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr 0x%llx offset 0x%llx "
                  "length 0x%llx type %s flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  MAJOR(__entry->bdev), MINOR(__entry->bdev),
                  __entry->addr,
                  __entry->offset,
                  __entry->length,
                  __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
                  __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
)

/* Define event @name as an instance of iomap_class. */
#define DEFINE_IOMAP_EVENT(name)                \
DEFINE_EVENT(iomap_class, name,        \
        TP_PROTO(struct inode *inode, struct iomap *iomap), \
        TP_ARGS(inode, iomap))
DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
DEFINE_IOMAP_EVENT(iomap_iter_srcmap);

/*
 * Records the same struct iomap fields as iomap_class and additionally the
 * @pos and @dirty_len arguments. @dirty_len is an unsigned int that is
 * stored in a u64 field and printed as "dirty len".
 */
TRACE_EVENT(iomap_writepage_map,
        TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len,
                 struct iomap *iomap),
        TP_ARGS(inode, pos, dirty_len, iomap),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(u64, ino)
                __field(u64, pos)
                __field(u64, dirty_len)
                __field(u64, addr)
                __field(loff_t, offset)
                __field(u64, length)
                __field(u16, type)
                __field(u16, flags)
                __field(dev_t, bdev)
        ),
        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->pos = pos;
                __entry->dirty_len = dirty_len;
                __entry->addr = iomap->addr;
                __entry->offset = iomap->offset;
                __entry->length = iomap->length;
                __entry->type = iomap->type;
                __entry->flags = iomap->flags;
                __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
        ),
        TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx "
                  "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  MAJOR(__entry->bdev), MINOR(__entry->bdev),
                  __entry->pos,
                  __entry->dirty_len,
                  __entry->addr,
                  __entry->offset,
                  __entry->length,
                  __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
                  __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
);

/*
 * Records the state of a struct iomap_iter: device and inode number of
 * iter->inode, iter->pos, the value of iomap_length(iter), iter->processed
 * and iter->flags, together with the @ops pointer and the @caller address.
 * The flags are printed both decoded (IOMAP_FLAGS_STRINGS) and as a raw hex
 * value; @ops and @caller are printed with %ps and %pS.
 */
TRACE_EVENT(iomap_iter,
        TP_PROTO(struct iomap_iter *iter, const void *ops,
                 unsigned long caller),
        TP_ARGS(iter, ops, caller),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(u64, ino)
                __field(loff_t, pos)
                __field(u64, length)
                __field(s64, processed)
                __field(unsigned int, flags)
                __field(const void *, ops)
                __field(unsigned long, caller)
        ),
        TP_fast_assign(
                __entry->dev = iter->inode->i_sb->s_dev;
                __entry->ino = iter->inode->i_ino;
                __entry->pos = iter->pos;
                __entry->length = iomap_length(iter);
                __entry->processed = iter->processed;
                __entry->flags = iter->flags;
                __entry->ops = ops;
                __entry->caller = caller;
        ),
        TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                   __entry->ino,
                   __entry->pos,
                   __entry->length,
                   __entry->processed,
                   __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
                   __entry->flags,
                   __entry->ops,
                   (void *)__entry->caller)
);

/*
 * Records the parameters of a direct I/O request: device, inode number and
 * size of the file behind @iocb, the file position, the number of bytes in
 * @iter, @done_before, the kiocb flags, @dio_flags and whether the kiocb is
 * asynchronous (!is_sync_kiocb()).
 *
 * The inode size is sampled with i_size_read(), as iomap_range_class does,
 * rather than by reading ->i_size directly.
 */
TRACE_EVENT(iomap_dio_rw_begin,
        TP_PROTO(struct kiocb *iocb, struct iov_iter *iter,
                 unsigned int dio_flags, size_t done_before),
        TP_ARGS(iocb, iter, dio_flags, done_before),
        TP_STRUCT__entry(
                __field(dev_t,        dev)
                __field(ino_t,        ino)
                __field(loff_t, isize)
                __field(loff_t, pos)
                __field(size_t,        count)
                __field(size_t,        done_before)
                __field(int,        ki_flags)
                __field(unsigned int,        dio_flags)
                __field(bool,        aio)
        ),
        TP_fast_assign(
                __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
                __entry->ino = file_inode(iocb->ki_filp)->i_ino;
                __entry->isize = i_size_read(file_inode(iocb->ki_filp));
                __entry->pos = iocb->ki_pos;
                __entry->count = iov_iter_count(iter);
                __entry->done_before = done_before;
                __entry->ki_flags = iocb->ki_flags;
                __entry->dio_flags = dio_flags;
                __entry->aio = !is_sync_kiocb(iocb);
        ),
        TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx length 0x%zx done_before 0x%zx flags %s dio_flags %s aio %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->isize,
                  __entry->pos,
                  __entry->count,
                  __entry->done_before,
                  __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS),
                  __print_flags(__entry->dio_flags, "|", IOMAP_DIO_STRINGS),
                  __entry->aio)
);

/*
 * Records the completion of a direct I/O request: device, inode number and
 * size of the file behind @iocb, the file position, the kiocb flags,
 * whether the kiocb is asynchronous (!is_sync_kiocb()), and the @error and
 * @ret values passed by the caller.
 *
 * The inode size is sampled with i_size_read(), as iomap_range_class does,
 * rather than by reading ->i_size directly.
 */
TRACE_EVENT(iomap_dio_complete,
        TP_PROTO(struct kiocb *iocb, int error, ssize_t ret),
        TP_ARGS(iocb, error, ret),
        TP_STRUCT__entry(
                __field(dev_t,        dev)
                __field(ino_t,        ino)
                __field(loff_t, isize)
                __field(loff_t, pos)
                __field(int,        ki_flags)
                __field(bool,        aio)
                __field(int,        error)
                __field(ssize_t, ret)
        ),
        TP_fast_assign(
                __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev;
                __entry->ino = file_inode(iocb->ki_filp)->i_ino;
                __entry->isize = i_size_read(file_inode(iocb->ki_filp));
                __entry->pos = iocb->ki_pos;
                __entry->ki_flags = iocb->ki_flags;
                __entry->aio = !is_sync_kiocb(iocb);
                __entry->error = error;
                __entry->ret = ret;
        ),
        TP_printk("dev %d:%d ino 0x%lx size 0x%llx offset 0x%llx flags %s aio %d error %d ret %zd",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->isize,
                  __entry->pos,
                  __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS),
                  __entry->aio,
                  __entry->error,
                  __entry->ret)
);

#endif /* _IOMAP_TRACE_H */

/*
 * Settings consumed by <trace/define_trace.h>, which is included last:
 * include path "." and file name "trace".
 * NOTE(review): presumably this makes define_trace.h re-include this header
 * from its own directory - confirm against the kernel tracepoint docs.
 */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>
































































    2 












    2 












    2 

















































    2 


    3 
    1 
    2 



















    2 


























































    1 

    2 


    1 













































    1 







    1 

    1 















    1 


    1 

    1 






    1 
















































































    1 

    1 





    1 
    1 
    1 

























    1 


    1 







    1 







    1 





    1 
    1 























    1 








    1 
    1 
    1 







    1 















    1 











    1 

    1 








    1 

    1 

















    1 







    1 




    1 
    1 

    1 




    1 















































    1 













    1 
    1 

    1 










    1 

















    1 
    1 
    1 









    1 






    1 





    1 









































    1 






















    1 


















    1 












    1 





    1 


    1 




































































    2 



    1 


















    1 
    1 















    1 







    2 














    1 




    1 


































    1 

























    2 













    2 









    2 


    1 




    1 


































    1 





    1 







    1 


















    1 











    1 







    1 








































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
// SPDX-License-Identifier: GPL-2.0
/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 *  4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 *  24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

#include <linux/compat.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/freezer.h>
#include <net/busy_poll.h>
#include <linux/vmalloc.h>

#include <linux/uaccess.h>


/*
 * Estimate expected accuracy in ns from a timeval.
 *
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions..
 */

/* Upper bound on the estimated timer slack: 100 msec, expressed in nanoseconds. */
#define MAX_SLACK        (100 * NSEC_PER_MSEC)

/*
 * Compute the slack, in nanoseconds, for a relative timeout @tv.
 *
 * The slack is the timeout divided by 1000 (or by 200 when the current task
 * has a positive nice value), never more than MAX_SLACK.  A negative timeout
 * yields 0.
 */
static long __estimate_accuracy(struct timespec64 *tv)
{
        int divisor = 1000;
        long estimate;

        /* The deadline already lies in the past: no slack at all. */
        if (tv->tv_sec < 0)
                return 0;

        /* Positively niced tasks tolerate five times more slack. */
        if (task_nice(current) > 0)
                divisor /= 5;

        /*
         * Bail out early when the seconds part alone exceeds the cap, so
         * the multiplication below is never done on a huge timeout.
         */
        if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divisor))
                return MAX_SLACK;

        estimate = tv->tv_nsec / divisor;
        estimate += tv->tv_sec * (NSEC_PER_SEC/divisor);

        return estimate > MAX_SLACK ? MAX_SLACK : estimate;
}

u64 select_estimate_accuracy(struct timespec64 *tv)
{
        u64 ret;
        struct timespec64 now;

        /*
         * Realtime tasks get a slack of 0 for obvious reasons.
         */

        if (rt_task(current))
                return 0;

        ktime_get_ts64(&now);
        now = timespec64_sub(*tv, now);
        ret = __estimate_accuracy(&now);
        if (ret < current->timer_slack_ns)
                return current->timer_slack_ns;
        return ret;
}



/*
 * One overflow page of poll table entries, used by poll_get_entry() once
 * the inline entries of a poll_wqueues are exhausted.  Pages are obtained
 * with __get_free_page() and chained through @next.
 */
struct poll_table_page {
        /* previously allocated page, or NULL for the first one */
        struct poll_table_page * next;
        /* next unused slot in entries[]; advanced on every allocation */
        struct poll_table_entry * entry;
        /* entry storage occupying the remainder of the page */
        struct poll_table_entry entries[];
};

/*
 * True when @table cannot hold another entry: the address just past the
 * next free slot would lie more than PAGE_SIZE bytes from the page start.
 */
#define POLL_TABLE_FULL(table) \
        ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait(), do all the
 * work.  poll_wait() is an inline function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
/* Forward declaration: poll_initwait() below installs this as the queueing callback. */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                       poll_table *p);

/*
 * Reset @pwq for a fresh select/poll pass on behalf of the current task.
 * Must be paired with poll_freewait() once the pass is over.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
        /* No inline slots used and no overflow pages allocated yet. */
        pwq->inline_index = 0;
        pwq->table = NULL;

        /* Nothing has fired and nothing has failed so far. */
        pwq->error = 0;
        pwq->triggered = 0;

        /* __pollwake() wakes this task; __pollwait() queues the entries. */
        pwq->polling_task = current;
        init_poll_funcptr(&pwq->pt, __pollwait);
}
EXPORT_SYMBOL(poll_initwait);

/*
 * Undo what __pollwait() did for one entry: take it off its wait queue and
 * drop the file reference that was taken when it was queued.
 */
static void free_poll_entry(struct poll_table_entry *entry)
{
        struct file *held = entry->filp;

        /* Unhook first, so no wakeup can reach the entry afterwards. */
        remove_wait_queue(entry->wait_address, &entry->wait);
        fput(held);
}

/*
 * Tear down everything queued through @pwq: release the inline entries,
 * then every entry on each overflow page, freeing the pages as we go.
 */
void poll_freewait(struct poll_wqueues *pwq)
{
        struct poll_table_page *page = pwq->table;
        int idx;

        for (idx = 0; idx < pwq->inline_index; idx++)
                free_poll_entry(&pwq->inline_entries[idx]);

        while (page) {
                struct poll_table_page *next = page->next;
                struct poll_table_entry *cur = page->entry;

                /*
                 * Walk back from the first unused slot to the start of
                 * the page.  A page always holds at least one entry.
                 */
                do {
                        cur--;
                        free_poll_entry(cur);
                } while (cur > page->entries);

                free_page((unsigned long) page);
                page = next;
        }
}
EXPORT_SYMBOL(poll_freewait);

/*
 * Hand out the next free poll table entry of @p.
 *
 * Inline entries are used first; after that entries come from overflow
 * pages, with a new page allocated whenever the current one is missing or
 * full.  Returns NULL and sets p->error to -ENOMEM if no page is available.
 */
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
        struct poll_table_page *current_page = p->table;
        struct poll_table_page *fresh;

        if (p->inline_index < N_INLINE_POLL_ENTRIES)
                return &p->inline_entries[p->inline_index++];

        /* Fast path: the most recent page still has room. */
        if (current_page && !POLL_TABLE_FULL(current_page))
                return current_page->entry++;

        fresh = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
        if (!fresh) {
                p->error = -ENOMEM;
                return NULL;
        }

        /* Push the new page on the front of the list. */
        fresh->entry = fresh->entries;
        fresh->next = current_page;
        p->table = fresh;

        return fresh->entry++;
}

/*
 * Wake the task that owns the poll_wqueues attached to @wait.
 *
 * Marks the poll_wqueues as triggered, then wakes pwq->polling_task through
 * default_wake_function() using a temporary wait queue entry.  @mode, @sync
 * and @key are passed through unchanged; the return value is whatever
 * default_wake_function() returns.
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        struct poll_wqueues *pwq = wait->private;
        DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

        /*
         * Although this function is called under waitqueue lock, LOCK
         * doesn't imply write barrier and the users expect write
         * barrier semantics on wakeup functions.  The following
         * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
         * and is paired with smp_store_mb() in poll_schedule_timeout.
         */
        smp_wmb();
        pwq->triggered = 1;

        /*
         * Perform the default wake up operation using a dummy
         * waitqueue.
         *
         * TODO: This is hacky but there currently is no interface to
         * pass in @sync.  @sync is scheduled to be removed and once
         * that happens, wake_up_process() can be used directly.
         */
        return default_wake_function(&dummy_wait, mode, sync, key);
}

/*
 * Wait queue callback installed by __pollwait().
 *
 * A wakeup that carries a key is ignored (returns 0) unless the key shares
 * at least one event with the entry's key.  Keyless wakeups, and wakeups
 * with a matching key, are forwarded to __pollwake().
 */
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        struct poll_table_entry *pte =
                container_of(wait, struct poll_table_entry, wait);

        if (!key || (key_to_poll(key) & pte->key))
                return __pollwake(wait, mode, sync, key);

        return 0;
}

/*
 * Queueing callback: register the poll_wqueues that embeds @p on
 * @wait_address, holding a reference on @filp for as long as the entry is
 * queued.  If no entry can be allocated nothing is queued; poll_get_entry()
 * has then recorded the error in the poll_wqueues.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                                poll_table *p)
{
        struct poll_wqueues *queues = container_of(p, struct poll_wqueues, pt);
        struct poll_table_entry *slot = poll_get_entry(queues);

        if (slot) {
                slot->filp = get_file(filp);
                slot->wait_address = wait_address;
                slot->key = p->_key;

                /* Fully set up the wait entry before it becomes visible. */
                init_waitqueue_func_entry(&slot->wait, pollwake);
                slot->wait.private = queues;
                add_wait_queue(wait_address, &slot->wait);
        }
}

/*
 * Sleep in @state until a wakeup arrives or the absolute time @expires
 * (with @slack) passes, unless @pwq has already been triggered.
 *
 * Returns -EINTR without sleeping when pwq->triggered is already set,
 * otherwise the return value of schedule_hrtimeout_range().  In both cases
 * pwq->triggered is cleared before returning.
 */
static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
                          ktime_t *expires, unsigned long slack)
{
        int rc = -EINTR;

        /* Set the task state before testing ->triggered. */
        set_current_state(state);
        if (!pwq->triggered)
                rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
        __set_current_state(TASK_RUNNING);

        /*
         * Prepare for the next iteration.
         *
         * The following smp_store_mb() serves two purposes.  First, it's
         * the counterpart rmb of the wmb in pollwake() such that data
         * written before wake up is always visible after wake up.
         * Second, the full barrier guarantees that triggered clearing
         * doesn't pass event check of the next iteration.  Note that
         * this problem doesn't exist for the first iteration as
         * add_wait_queue() has full barrier semantics.
         */
        smp_store_mb(pwq->triggered, 0);

        return rc;
}

/**
 * poll_select_set_timeout - helper function to set up the timeout value
 * @to:                pointer to timespec64 variable for the final timeout
 * @sec:        seconds (from user space)
 * @nsec:        nanoseconds (from user space)
 *
 * Note: the user space value is passed as separate seconds and nanoseconds
 * rather than as a timespec, so that the function can be used for the
 * timeval and compat interfaces as well.
 *
 * A zero timeout is stored as zero; any other timeout is stored as the
 * current time plus the requested interval.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
        struct timespec64 interval = { .tv_sec = sec, .tv_nsec = nsec };

        if (!timespec64_valid(&interval))
                return -EINVAL;

        if (sec || nsec) {
                /* Convert the relative interval into an absolute deadline. */
                ktime_get_ts64(to);
                *to = timespec64_add_safe(*to, interval);
                return 0;
        }

        /* Optimize for the zero timeout value: no clock read needed. */
        to->tv_nsec = 0;
        to->tv_sec = 0;
        return 0;
}

/*
 * User-space layout of the timeout object handled by poll_select_finish()
 * and do_pselect().
 */
enum poll_time_type {
        /* struct __kernel_old_timeval */
        PT_TIMEVAL = 0,
        /* struct old_timeval32 */
        PT_OLD_TIMEVAL = 1,
        /* accessed with get_timespec64()/put_timespec64() */
        PT_TIMESPEC = 2,
        /* accessed with get_old_timespec32()/put_old_timespec32() */
        PT_OLD_TIMESPEC = 3,
};

/*
 * Common tail of the select/poll system calls.
 *
 * Restores the saved signal mask unless @ret is -ERESTARTNOHAND, then, when
 * the caller supplied a timeout object @p, writes the time remaining until
 * @end_time back to it in the layout selected by @pt_type.
 *
 * Returns @ret, except that -ERESTARTNOHAND becomes -EINTR whenever the
 * timeout was not written back (STICKY_TIMEOUTS personality, or the copy
 * to user space failed).
 */
static int poll_select_finish(struct timespec64 *end_time,
                              void __user *p,
                              enum poll_time_type pt_type, int ret)
{
        struct timespec64 remaining;
        int copy_failed = 1;

        restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);

        /* No timeout object was supplied: nothing to report back. */
        if (!p)
                return ret;

        if (current->personality & STICKY_TIMEOUTS)
                goto no_writeback;

        /* No update for zero timeout */
        if (!end_time->tv_sec && !end_time->tv_nsec)
                return ret;

        /* Time left until the deadline, clamped at zero. */
        ktime_get_ts64(&remaining);
        remaining = timespec64_sub(*end_time, remaining);
        if (remaining.tv_sec < 0) {
                remaining.tv_nsec = 0;
                remaining.tv_sec = 0;
        }

        switch (pt_type) {
        case PT_TIMEVAL: {
                struct __kernel_old_timeval out;

                /* Clear padding, if any, so no stack data is leaked. */
                if (sizeof(out) > sizeof(out.tv_sec) + sizeof(out.tv_usec))
                        memset(&out, 0, sizeof(out));
                out.tv_sec = remaining.tv_sec;
                out.tv_usec = remaining.tv_nsec / NSEC_PER_USEC;
                copy_failed = copy_to_user(p, &out, sizeof(out)) != 0;
                break;
        }
        case PT_OLD_TIMEVAL: {
                struct old_timeval32 out;

                out.tv_sec = remaining.tv_sec;
                out.tv_usec = remaining.tv_nsec / NSEC_PER_USEC;
                copy_failed = copy_to_user(p, &out, sizeof(out)) != 0;
                break;
        }
        case PT_TIMESPEC:
                copy_failed = put_timespec64(&remaining, p) != 0;
                break;
        case PT_OLD_TIMESPEC:
                copy_failed = put_old_timespec32(&remaining, p) != 0;
                break;
        default:
                BUG();
        }

        if (!copy_failed)
                return ret;

        /*
         * If an application puts its timeval in read-only memory, we
         * don't want the Linux-specific update to the timeval to
         * cause a fault after the select has completed
         * successfully. However, because we're not updating the
         * timeval, we can't restart the system call.
         */

no_writeback:
        return ret == -ERESTARTNOHAND ? -EINTR : ret;
}

/*
 * Scalable version of the fd_set.
 */

typedef struct {
        /* requested sets, as copied in from user space */
        unsigned long *in;
        unsigned long *out;
        unsigned long *ex;
        /* result sets, filled in by do_select() */
        unsigned long *res_in;
        unsigned long *res_out;
        unsigned long *res_ex;
} fd_set_bits;

/*
 * How many longwords for "nr" bits?
 */
/* Number of bits in one long word of an fd bitmap. */
#define FDS_BITPERLONG        (sizeof(long) * 8)
/* Long words needed to hold "nr" bits, rounded up. */
#define FDS_LONGS(nr)        (((nr) + (FDS_BITPERLONG - 1)) / FDS_BITPERLONG)
/* The same quantity expressed in bytes. */
#define FDS_BYTES(nr)        (sizeof(long) * FDS_LONGS(nr))

/*
 * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
 */
/*
 * Load the user fd_set @ufdset covering @nr descriptors into @fdset.
 * A NULL @ufdset is treated as an empty set.  Returns 0 on success or
 * -EFAULT if the user memory cannot be read.
 */
static inline
int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
        unsigned long bytes = FDS_BYTES(nr);

        if (!ufdset) {
                memset(fdset, 0, bytes);
                return 0;
        }

        if (copy_from_user(fdset, ufdset, bytes))
                return -EFAULT;
        return 0;
}

/*
 * Store the kernel bitmap @fdset covering @nr descriptors into the user
 * fd_set @ufdset.  Returns whatever __copy_to_user() returns (non-zero on
 * failure), or 0 when @ufdset is NULL and nothing has to be written.
 */
static inline unsigned long __must_check
set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
        if (!ufdset)
                return 0;

        return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
}

/* Clear the kernel bitmap @fdset covering @nr descriptors. */
static inline
void zero_fd_set(unsigned long nr, unsigned long *fdset)
{
        unsigned long bytes = FDS_BYTES(nr);

        memset(fdset, 0, bytes);
}

/*
 * Address of long word @n in the in/out/ex bitmap of @fds.  The parameters
 * are parenthesized so that any expression can be passed safely.
 */
#define FDS_IN(fds, n)                ((fds)->in + (n))
#define FDS_OUT(fds, n)                ((fds)->out + (n))
#define FDS_EX(fds, n)                ((fds)->ex + (n))

/* Union of the three requested bitmaps for long word @n of @fds. */
#define BITS(fds, n)        (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))

/*
 * Validate the requested sets against the open-fd bitmap and find the
 * highest requested descriptor.
 *
 * @n:   number of descriptors covered by the sets
 * @fds: the in/out/ex bitmaps to examine
 *
 * Returns -EBADF if any requested bit is not set in the current task's
 * open_fds bitmap.  Otherwise returns one more than the highest requested
 * descriptor, or 0 if no bit is set at all.  do_select() calls this under
 * rcu_read_lock().
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
        unsigned long *open_fds;
        unsigned long set;
        int max;
        struct fdtable *fdt;

        /* handle last incomplete long-word first */
        set = ~(~0UL << (n & (BITS_PER_LONG-1)));
        n /= BITS_PER_LONG;
        fdt = files_fdtable(current->files);
        open_fds = fdt->open_fds + n;
        max = 0;
        if (set) {
                /* keep only the requested bits that lie below n */
                set &= BITS(fds, n);
                if (set) {
                        if (!(set & ~*open_fds))
                                goto get_max;
                        return -EBADF;
                }
        }
        /* walk the complete long-words from the highest down to word 0 */
        while (n) {
                open_fds--;
                n--;
                set = BITS(fds, n);
                if (!set)
                        continue;
                if (set & ~*open_fds)
                        return -EBADF;
                /* max is already known; lower words are only validated */
                if (max)
                        continue;
get_max:
                /* position of the highest set bit in this word, plus one */
                do {
                        max++;
                        set >>= 1;
                } while (set);
                max += n * BITS_PER_LONG;
        }

        return max;
}

/*
 * Event masks that make a descriptor count as ready in the read, write and
 * exception sets respectively; do_select() tests the poll result against
 * them, and wait_key_set() uses them to build the wait key.
 */
#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\
                        EPOLLNVAL)
#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\
                         EPOLLNVAL)
#define POLLEX_SET (EPOLLPRI | EPOLLNVAL)

/*
 * Build the event key for the descriptor selected by @bit and store it in
 * @wait.  Exceptional events and @ll_flag are always included; read and
 * write events only when @bit is set in @in and @out respectively.
 */
static inline void wait_key_set(poll_table *wait, unsigned long in,
                                unsigned long out, unsigned long bit,
                                __poll_t ll_flag)
{
        __poll_t events = POLLEX_SET | ll_flag;

        if (in & bit)
                events |= POLLIN_SET;
        if (out & bit)
                events |= POLLOUT_SET;

        wait->_key = events;
}

/*
 * Core of select(): poll every requested descriptor until at least one is
 * ready, the deadline passes or a signal is pending.
 *
 * @n:        number of descriptors covered by the sets in @fds
 * @fds:      requested (in/out/ex) and result (res_in/res_out/res_ex)
 *            bitmaps; result words are only written when non-zero, so the
 *            caller must have cleared them beforehand
 * @end_time: absolute deadline, NULL to wait indefinitely; a zero deadline
 *            means "poll once and return"
 *
 * Returns the number of bits set in the result bitmaps, 0 when nothing was
 * ready (timeout or pending signal), or a negative error: -EBADF from
 * max_select_fd() or the error recorded in the poll table.
 */
static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
        ktime_t expire, *to = NULL;
        struct poll_wqueues table;
        poll_table *wait;
        int retval, i, timed_out = 0;
        u64 slack = 0;
        __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
        unsigned long busy_start = 0;

        /* Validate the sets and shrink n to the highest requested fd + 1. */
        rcu_read_lock();
        retval = max_select_fd(n, fds);
        rcu_read_unlock();

        if (retval < 0)
                return retval;
        n = retval;

        poll_initwait(&table);
        wait = &table.pt;
        /* Zero timeout: never queue on wait queues, scan exactly once. */
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                wait->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);

        retval = 0;
        for (;;) {
                unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
                bool can_busy_loop = false;

                inp = fds->in; outp = fds->out; exp = fds->ex;
                rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

                /* One iteration per long-word of the bitmaps. */
                for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
                        unsigned long in, out, ex, all_bits, bit = 1, j;
                        unsigned long res_in = 0, res_out = 0, res_ex = 0;
                        __poll_t mask;

                        in = *inp++; out = *outp++; ex = *exp++;
                        all_bits = in | out | ex;
                        /* Nothing requested in this word: skip it entirely. */
                        if (all_bits == 0) {
                                i += BITS_PER_LONG;
                                continue;
                        }

                        for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
                                struct fd f;
                                if (i >= n)
                                        break;
                                if (!(bit & all_bits))
                                        continue;
                                /* An fd without a file reports EPOLLNVAL. */
                                mask = EPOLLNVAL;
                                f = fdget(i);
                                if (f.file) {
                                        wait_key_set(wait, in, out, bit,
                                                     busy_flag);
                                        mask = vfs_poll(f.file, wait);

                                        fdput(f);
                                }
                                /*
                                 * Each hit is counted separately per set and
                                 * disables further wait queue registration.
                                 */
                                if ((mask & POLLIN_SET) && (in & bit)) {
                                        res_in |= bit;
                                        retval++;
                                        wait->_qproc = NULL;
                                }
                                if ((mask & POLLOUT_SET) && (out & bit)) {
                                        res_out |= bit;
                                        retval++;
                                        wait->_qproc = NULL;
                                }
                                if ((mask & POLLEX_SET) && (ex & bit)) {
                                        res_ex |= bit;
                                        retval++;
                                        wait->_qproc = NULL;
                                }
                                /* got something, stop busy polling */
                                if (retval) {
                                        can_busy_loop = false;
                                        busy_flag = 0;

                                /*
                                 * only remember a returned
                                 * POLL_BUSY_LOOP if we asked for it
                                 */
                                } else if (busy_flag & mask)
                                        can_busy_loop = true;

                        }
                        if (res_in)
                                *rinp = res_in;
                        if (res_out)
                                *routp = res_out;
                        if (res_ex)
                                *rexp = res_ex;
                        cond_resched();
                }
                /* Entries are queued during the first full scan only. */
                wait->_qproc = NULL;
                if (retval || timed_out || signal_pending(current))
                        break;
                if (table.error) {
                        retval = table.error;
                        break;
                }

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) {
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                /* Busy polling is over; do not request it on later scans. */
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                /* A zero return means the deadline passed: scan once more. */
                if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                                           to, slack))
                        timed_out = 1;
        }

        poll_freewait(&table);

        return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
/*
 * Copy the three user fd_sets into kernel bitmaps, run do_select() on them
 * and copy the result bitmaps back.
 *
 * Returns the do_select() result, or -EINVAL for a negative @n, -ENOMEM if
 * the bitmaps cannot be allocated, -EFAULT if user memory cannot be
 * accessed, and -ERESTARTNOHAND when nothing was ready and a signal is
 * pending.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                           fd_set __user *exp, struct timespec64 *end_time)
{
        /* Allocate small arguments on the stack to save memory and be faster */
        long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
        void *bits = stack_fds;
        fd_set_bits fds;
        size_t size;
        int max_fds;
        int ret;

        if (n < 0)
                return -EINVAL;

        /* max_fds can increase, so grab it once to avoid race */
        rcu_read_lock();
        max_fds = files_fdtable(current->files)->max_fds;
        rcu_read_unlock();
        if (n > max_fds)
                n = max_fds;

        /*
         * Six bitmaps are needed (in/out/ex, each for input and for
         * results), all sized in whole long-words.
         */
        size = FDS_BYTES(n);
        if (size > sizeof(stack_fds) / 6) {
                /* Too large for the on-stack array: allocate instead. */
                if (size > (SIZE_MAX / 6))
                        return -ENOMEM;

                bits = kvmalloc(6 * size, GFP_KERNEL);
                if (!bits)
                        return -ENOMEM;
        }

        /* Carve the buffer into six consecutive bitmaps. */
        fds.in      = bits;
        fds.out     = bits +   size;
        fds.ex      = bits + 2*size;
        fds.res_in  = bits + 3*size;
        fds.res_out = bits + 4*size;
        fds.res_ex  = bits + 5*size;

        ret = get_fd_set(n, inp, fds.in);
        if (!ret)
                ret = get_fd_set(n, outp, fds.out);
        if (!ret)
                ret = get_fd_set(n, exp, fds.ex);
        if (ret)
                goto out;

        /* do_select() only writes non-zero result words. */
        zero_fd_set(n, fds.res_in);
        zero_fd_set(n, fds.res_out);
        zero_fd_set(n, fds.res_ex);

        ret = do_select(n, &fds, end_time);
        if (ret < 0)
                goto out;

        /* Nothing ready but a signal is pending: ask for a restart. */
        if (ret == 0 && signal_pending(current)) {
                ret = -ERESTARTNOHAND;
                goto out;
        }

        if (set_fd_set(n, inp, fds.res_in) ||
            set_fd_set(n, outp, fds.res_out) ||
            set_fd_set(n, exp, fds.res_ex))
                ret = -EFAULT;

out:
        if (bits != stack_fds)
                kvfree(bits);
        return ret;
}

/*
 * select() implementation for a timeout given as struct
 * __kernel_old_timeval.  A NULL @tvp means "no timeout".
 *
 * Returns -EFAULT if @tvp cannot be read and -EINVAL if the timeout is
 * rejected by poll_select_set_timeout(); otherwise the result of
 * core_sys_select() as post-processed by poll_select_finish().
 */
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
                       fd_set __user *exp, struct __kernel_old_timeval __user *tvp)
{
        struct __kernel_old_timeval utv;
        struct timespec64 end_time;
        struct timespec64 *deadline = NULL;
        int ret;

        if (tvp) {
                if (copy_from_user(&utv, tvp, sizeof(utv)))
                        return -EFAULT;

                /* Whole seconds held in tv_usec are carried into tv_sec. */
                deadline = &end_time;
                if (poll_select_set_timeout(deadline,
                                utv.tv_sec + (utv.tv_usec / USEC_PER_SEC),
                                (utv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
                        return -EINVAL;
        }

        ret = core_sys_select(n, inp, outp, exp, deadline);

        return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret);
}

/* select() system call entry point: all the work is done by kern_select(). */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
                fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp)
{
        return kern_select(n, inp, outp, exp, tvp);
}

/*
 * Common implementation of the pselect6 variants.
 *
 * @tsp is an optional user timeout whose layout is given by @type
 * (PT_TIMESPEC or PT_OLD_TIMESPEC; anything else is a BUG).  @sigmask and
 * @sigsetsize are handed to set_user_sigmask() before selecting.
 *
 * Returns -EFAULT if @tsp cannot be read, -EINVAL if the timeout is
 * rejected, the set_user_sigmask() error if that fails, and otherwise the
 * result of core_sys_select() as post-processed by poll_select_finish().
 */
static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
                       fd_set __user *exp, void __user *tsp,
                       const sigset_t __user *sigmask, size_t sigsetsize,
                       enum poll_time_type type)
{
        struct timespec64 requested;
        struct timespec64 end_time;
        struct timespec64 *deadline = NULL;
        int ret;

        if (tsp) {
                if (type == PT_TIMESPEC) {
                        if (get_timespec64(&requested, tsp))
                                return -EFAULT;
                } else if (type == PT_OLD_TIMESPEC) {
                        if (get_old_timespec32(&requested, tsp))
                                return -EFAULT;
                } else {
                        BUG();
                }

                deadline = &end_time;
                if (poll_select_set_timeout(deadline, requested.tv_sec,
                                            requested.tv_nsec))
                        return -EINVAL;
        }

        ret = set_user_sigmask(sigmask, sigsetsize);
        if (ret)
                return ret;

        ret = core_sys_select(n, inp, outp, exp, deadline);

        return poll_select_finish(&end_time, tsp, type, ret);
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
struct sigset_argpack {
        /* user pointer to the signal mask; passed to do_pselect() as sigmask */
        sigset_t __user *p;
        /* size of that mask; passed to do_pselect() as sigsetsize */
        size_t size;
};

/*
 * Fetch a struct sigset_argpack from user space.
 *
 * @to:   kernel copy to fill in; left untouched when @from is NULL.
 * @from: user pointer to the pack, may be NULL.
 *
 * Returns 0 on success (including the NULL case) and -EFAULT if the user
 * memory cannot be accessed.
 */
static inline int get_sigset_argpack(struct sigset_argpack *to,
                                     struct sigset_argpack __user *from)
{
        // the path is hot enough for overhead of copy_from_user() to matter
        if (from) {
                if (!user_read_access_begin(from, sizeof(*from)))
                        return -EFAULT;
                unsafe_get_user(to->p, &from->p, Efault);
                unsafe_get_user(to->size, &from->size, Efault);
                user_read_access_end();
        }
        return 0;
Efault:
        /*
         * The window was opened with user_read_access_begin(), so it must be
         * closed with the matching read-only variant, exactly like the
         * success path above, rather than with user_access_end().
         */
        user_read_access_end();
        return -EFAULT;
}

/*
 * pselect6() system call: unpack the (sigset pointer, size) pair that @sig
 * points to and run do_pselect() with a PT_TIMESPEC timeout.
 * Returns -EFAULT if the pack cannot be read.
 */
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
                fd_set __user *, exp, struct __kernel_timespec __user *, tsp,
                void __user *, sig)
{
        /* Defaults used when @sig is NULL: no mask, zero size. */
        struct sigset_argpack pack = { .p = NULL, .size = 0 };
        int err = get_sigset_argpack(&pack, sig);

        if (err)
                return -EFAULT;

        return do_pselect(n, inp, outp, exp, tsp, pack.p, pack.size,
                          PT_TIMESPEC);
}

#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)

/*
 * pselect6_time32() system call: same as pselect6 but the timeout is a
 * struct old_timespec32, so do_pselect() is run with PT_OLD_TIMESPEC.
 * Returns -EFAULT if the sigset pack cannot be read.
 */
SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp,
                fd_set __user *, exp, struct old_timespec32 __user *, tsp,
                void __user *, sig)
{
        /* Defaults used when @sig is NULL: no mask, zero size. */
        struct sigset_argpack pack = { .p = NULL, .size = 0 };
        int err = get_sigset_argpack(&pack, sig);

        if (err)
                return -EFAULT;

        return do_pselect(n, inp, outp, exp, tsp, pack.p, pack.size,
                          PT_OLD_TIMESPEC);
}

#endif

#ifdef __ARCH_WANT_SYS_OLD_SELECT
/*
 * Argument block for the old_select() system call, which receives a single
 * user pointer to this structure instead of five separate arguments.
 */
struct sel_arg_struct {
        unsigned long n;                        /* passed as kern_select()'s n */
        fd_set __user *inp, *outp, *exp;        /* the three fd_set user pointers */
        struct __kernel_old_timeval __user *tvp; /* optional timeout */
};

/*
 * old_select() system call: copy the packed argument block in from user
 * space and hand its members to kern_select().
 * Returns -EFAULT if the block cannot be copied.
 */
SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
{
        struct sel_arg_struct args;
        unsigned long uncopied;

        uncopied = copy_from_user(&args, arg, sizeof(args));
        if (uncopied)
                return -EFAULT;

        return kern_select(args.n, args.inp, args.outp, args.exp, args.tvp);
}
#endif

/*
 * One chunk of pollfd entries. Chunks are chained through ->next so a large
 * poll() request can be held in several separately allocated pieces.
 */
struct poll_list {
        struct poll_list *next;         /* next chunk, or NULL for the last one */
        unsigned int len;               /* number of valid entries[] */
        struct pollfd entries[];        /* flexible array of pollfd records */
};

/* Number of pollfd entries that fit in one page next to the chunk header. */
#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 *
 * @pollfd:        entry to examine; ->revents is always rewritten.
 * @pwait:         poll table handed to vfs_poll(); its ->_key is overwritten.
 * @can_busy_poll: set to true when the vfs_poll() result carries @busy_flag;
 *                 never cleared here.
 * @busy_flag:     extra bit OR-ed into the poll key (0 or a busy-poll flag,
 *                 as chosen by the caller).
 *
 * A negative fd yields 0; an fd with no open file yields EPOLLNVAL.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                                     bool *can_busy_poll,
                                     __poll_t busy_flag)
{
        int fd = pollfd->fd;
        __poll_t mask = 0, filter;
        struct fd f;

        /* Negative descriptors are skipped: report no events. */
        if (fd < 0)
                goto out;
        /* Preset the result used if the descriptor has no file behind it. */
        mask = EPOLLNVAL;
        f = fdget(fd);
        if (!f.file)
                goto out;

        /* userland u16 ->events contains POLL... bitmap */
        /* EPOLLERR and EPOLLHUP are always reported, requested or not. */
        filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
        pwait->_key = filter | busy_flag;
        mask = vfs_poll(f.file, pwait);
        /* Test the busy flag before the filter below strips it. */
        if (mask & busy_flag)
                *can_busy_poll = true;
        mask &= filter;                /* Mask out unneeded events. */
        fdput(f);

out:
        /* ... and so does ->revents */
        pollfd->revents = mangle_poll(mask);
        return mask;
}

/*
 * Core poll loop: scan every pollfd in @list until at least one reports an
 * event, an error or pending signal is seen, or the timeout expires.
 *
 * @list:     chain of pollfd chunks; ->revents of each entry is updated on
 *            every scan by do_pollfd().
 * @wait:     poll wait queues; wait->pt is the poll table passed to handlers
 *            and wait->error is picked up when no events were found.
 * @end_time: timeout, or NULL to wait without one. A value with both fields
 *            zero means "do not wait at all".
 *
 * Returns the number of entries with events, wait->error, or
 * -ERESTARTNOHAND when a signal is pending and nothing was found; 0 when
 * the wait timed out with no events.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                   struct timespec64 *end_time)
{
        poll_table* pt = &wait->pt;
        ktime_t expire, *to = NULL;
        int timed_out = 0, count = 0;
        u64 slack = 0;
        __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
        unsigned long busy_start = 0;

        /* Optimise the no-wait case */
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                pt->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);

        for (;;) {
                struct poll_list *walk;
                bool can_busy_loop = false;

                for (walk = list; walk != NULL; walk = walk->next) {
                        struct pollfd * pfd, * pfd_end;

                        pfd = walk->entries;
                        pfd_end = pfd + walk->len;
                        for (; pfd != pfd_end; pfd++) {
                                /*
                                 * Fish for events. If we found one, record it
                                 * and kill poll_table->_qproc, so we don't
                                 * needlessly register any other waiters after
                                 * this. They'll get immediately deregistered
                                 * when we break out and return.
                                 */
                                if (do_pollfd(pfd, pt, &can_busy_loop,
                                              busy_flag)) {
                                        count++;
                                        pt->_qproc = NULL;
                                        /* found something, stop busy polling */
                                        busy_flag = 0;
                                        can_busy_loop = false;
                                }
                        }
                }
                /*
                 * All waiters have already been registered, so don't provide
                 * a poll_table->_qproc to them on the next loop iteration.
                 */
                pt->_qproc = NULL;
                /* Nothing ready: report a queueing error or a pending signal. */
                if (!count) {
                        count = wait->error;
                        if (signal_pending(current))
                                count = -ERESTARTNOHAND;
                }
                if (count || timed_out)
                        break;

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) {
                        /* First busy pass: record the start time and rescan. */
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                /* Busy polling is over; fall back to sleeping. */
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                /* A zero return is treated as expiry; scan once more, then stop. */
                if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
                        timed_out = 1;
        }
        return count;
}

/*
 * Number of pollfd entries that fit in the on-stack buffer after the
 * struct poll_list header. Only usable where a local array named stack_pps
 * is in scope, because the macro takes sizeof() of it.
 */
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
                        sizeof(struct pollfd))

/*
 * Shared back end of the poll()/ppoll() system calls.
 *
 * Copies @nfds pollfd records from @ufds into a chain of poll_list chunks
 * (the first chunk lives on the stack, further ones are kmalloc'ed), runs
 * do_poll() on them and writes every ->revents back to user space.
 *
 * @ufds:     user array of struct pollfd.
 * @nfds:     number of entries in @ufds.
 * @end_time: timeout forwarded to do_poll(), or NULL.
 *
 * Returns the do_poll() result on success, -EINVAL if @nfds exceeds
 * RLIMIT_NOFILE, -ENOMEM if a chunk cannot be allocated, and -EFAULT if
 * user memory cannot be read or written.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                struct timespec64 *end_time)
{
        struct poll_wqueues table;
        int err = -EFAULT, fdcount;
        /* Allocate small arguments on the stack to save memory and be
           faster - use long to make sure the buffer is aligned properly
           on 64 bit archs to avoid unaligned access */
        long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
        struct poll_list *const head = (struct poll_list *)stack_pps;
         struct poll_list *walk = head;
        unsigned int todo = nfds;       /* entries still to be copied in */
        unsigned int len;

        if (nfds > rlimit(RLIMIT_NOFILE))
                return -EINVAL;

        /* The first chunk is the stack buffer; later ones are allocated. */
        len = min_t(unsigned int, nfds, N_STACK_PPS);
        for (;;) {
                walk->next = NULL;
                walk->len = len;
                if (!len)
                        break;

                /* nfds-todo is the index of the first entry not yet copied. */
                if (copy_from_user(walk->entries, ufds + nfds-todo,
                                        sizeof(struct pollfd) * walk->len))
                        goto out_fds;

                if (walk->len >= todo)
                        break;
                todo -= walk->len;

                len = min(todo, POLLFD_PER_PAGE);
                walk = walk->next = kmalloc(struct_size(walk, entries, len),
                                            GFP_KERNEL);
                if (!walk) {
                        err = -ENOMEM;
                        goto out_fds;
                }
        }

        poll_initwait(&table);
        fdcount = do_poll(head, &table, end_time);
        poll_freewait(&table);

        /* err is still -EFAULT here if the write window cannot be opened. */
        if (!user_write_access_begin(ufds, nfds * sizeof(*ufds)))
                goto out_fds;

        /* Only ->revents is written back; ufds advances in step with fds. */
        for (walk = head; walk; walk = walk->next) {
                struct pollfd *fds = walk->entries;
                unsigned int j;

                for (j = walk->len; j; fds++, ufds++, j--)
                        unsafe_put_user(fds->revents, &ufds->revents, Efault);
          }
        user_write_access_end();

        err = fdcount;
out_fds:
        /* head itself is the stack buffer, so freeing starts at head->next. */
        walk = head->next;
        while (walk) {
                struct poll_list *pos = walk;
                walk = walk->next;
                kfree(pos);
        }

        return err;

Efault:
        /* Reached from unsafe_put_user(): close the write window first. */
        user_write_access_end();
        err = -EFAULT;
        goto out_fds;
}

static long do_restart_poll(struct restart_block *restart_block)
{
        struct pollfd __user *ufds = restart_block->poll.ufds;
        int nfds = restart_block->poll.nfds;
        struct timespec64 *to = NULL, end_time;
        int ret;

        if (restart_block->poll.has_timeout) {
                end_time.tv_sec = restart_block->poll.tv_sec;
                end_time.tv_nsec = restart_block->poll.tv_nsec;
                to = &end_time;
        }

        ret = do_sys_poll(ufds, nfds, to);

        if (ret == -ERESTARTNOHAND)
                ret = set_restart_fn(restart_block, do_restart_poll);

        return ret;
}

/*
 * poll() system call.
 *
 * A negative @timeout_msecs means "no timeout"; otherwise the millisecond
 * value is split into seconds and nanoseconds for poll_select_set_timeout().
 * When do_sys_poll() returns -ERESTARTNOHAND, the arguments (and the
 * timeout, if any) are stored in current->restart_block and
 * do_restart_poll() is installed as the restart function.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                int, timeout_msecs)
{
        struct timespec64 deadline;
        struct timespec64 *to = NULL;
        struct restart_block *rb;
        int ret;

        if (timeout_msecs >= 0) {
                to = &deadline;
                poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
                        NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
        }

        ret = do_sys_poll(ufds, nfds, to);
        if (ret != -ERESTARTNOHAND)
                return ret;

        /* Interrupted: remember everything needed to resume the call. */
        rb = &current->restart_block;
        rb->poll.ufds = ufds;
        rb->poll.nfds = nfds;

        if (timeout_msecs < 0) {
                rb->poll.has_timeout = 0;
        } else {
                rb->poll.tv_sec = deadline.tv_sec;
                rb->poll.tv_nsec = deadline.tv_nsec;
                rb->poll.has_timeout = 1;
        }

        ret = set_restart_fn(rb, do_restart_poll);
        return ret;
}

/*
 * ppoll() system call with a struct __kernel_timespec timeout.
 *
 * Reads the optional timeout, installs the user signal mask through
 * set_user_sigmask(), polls, and lets poll_select_finish() produce the
 * final return value. Returns -EFAULT if the timeout cannot be read and
 * -EINVAL if poll_select_set_timeout() rejects it.
 */
SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
                struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
                size_t, sigsetsize)
{
        struct timespec64 req, deadline;
        struct timespec64 *to = NULL;
        int ret;

        if (tsp) {
                if (get_timespec64(&req, tsp))
                        return -EFAULT;
                if (poll_select_set_timeout(&deadline, req.tv_sec, req.tv_nsec))
                        return -EINVAL;
                to = &deadline;
        }

        ret = set_user_sigmask(sigmask, sigsetsize);
        if (ret)
                return ret;

        ret = do_sys_poll(ufds, nfds, to);
        return poll_select_finish(&deadline, tsp, PT_TIMESPEC, ret);
}

#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)

/*
 * ppoll_time32() system call: ppoll with a struct old_timespec32 timeout.
 *
 * Reads the optional timeout, installs the user signal mask through
 * set_user_sigmask(), polls, and lets poll_select_finish() produce the
 * final return value. Returns -EFAULT if the timeout cannot be read and
 * -EINVAL if poll_select_set_timeout() rejects it.
 */
SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
                struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
                size_t, sigsetsize)
{
        struct timespec64 req, deadline;
        struct timespec64 *to = NULL;
        int ret;

        if (tsp) {
                if (get_old_timespec32(&req, tsp))
                        return -EFAULT;
                if (poll_select_set_timeout(&deadline, req.tv_sec, req.tv_nsec))
                        return -EINVAL;
                to = &deadline;
        }

        ret = set_user_sigmask(sigmask, sigsetsize);
        if (ret)
                return ret;

        ret = do_sys_poll(ufds, nfds, to);
        return poll_select_finish(&deadline, tsp, PT_OLD_TIMESPEC, ret);
}
#endif

#ifdef CONFIG_COMPAT
/* Number of bits in one compat_ulong_t word (8 bits per byte). */
#define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))

/*
 * Ooo, nasty.  The user fd set is an array of compat (32-bit) unsigned
 * longs and has to be frobbed into native unsigned longs.
 *
 * Load @nr bits from @ufdset into @fdset with compat_get_bitmap(). A NULL
 * @ufdset is treated as an empty set: @fdset is zeroed and 0 is returned.
 */
static
int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
                        unsigned long *fdset)
{
        if (!ufdset) {
                zero_fd_set(nr, fdset);
                return 0;
        }

        return compat_get_bitmap(fdset, ufdset, nr);
}

/*
 * Store @nr bits from @fdset into the compat user bitmap @ufdset with
 * compat_put_bitmap(). Nothing is written, and 0 is returned, when
 * @ufdset is NULL.
 */
static
int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
                      unsigned long *fdset)
{
        return ufdset ? compat_put_bitmap(ufdset, fdset, nr) : 0;
}


/*
 * This is a virtual copy of sys_select from fs/select.c and probably
 * should be compared to it from time to time
 */

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 *
 * compat_core_sys_select() loads the three compat fd sets, runs
 * do_select() and stores the three result sets back to user space.
 *
 * @n:              number of descriptors to examine; clamped to the current
 *                  fdtable's max_fds.
 * @inp/@outp/@exp: optional compat user bitmaps (NULL means empty set).
 * @end_time:       timeout forwarded to do_select(), or NULL.
 *
 * Returns the do_select() result, -EINVAL for negative @n, -ENOMEM if the
 * bitmap buffer cannot be allocated, -ERESTARTNOHAND when nothing is ready
 * and a signal is pending, -EFAULT if results cannot be written back, or
 * the error from reading the input sets.
 */
static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
        compat_ulong_t __user *outp, compat_ulong_t __user *exp,
        struct timespec64 *end_time)
{
        fd_set_bits fds;
        void *bits;
        int size, max_fds, ret = -EINVAL;
        struct fdtable *fdt;
        long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

        if (n < 0)
                goto out_nofds;

        /* max_fds can increase, so grab it once to avoid race */
        rcu_read_lock();
        fdt = files_fdtable(current->files);
        max_fds = fdt->max_fds;
        rcu_read_unlock();
        if (n > max_fds)
                n = max_fds;

        /*
         * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
         * since we used fdset we need to allocate memory in units of
         * long-words.
         */
        size = FDS_BYTES(n);
        bits = stack_fds;
        /* Use the heap only when six bitmaps do not fit in stack_fds. */
        if (size > sizeof(stack_fds) / 6) {
                bits = kmalloc_array(6, size, GFP_KERNEL);
                ret = -ENOMEM;
                if (!bits)
                        goto out_nofds;
        }
        /* Carve the buffer into six consecutive bitmaps of 'size' bytes. */
        fds.in      = (unsigned long *)  bits;
        fds.out     = (unsigned long *) (bits +   size);
        fds.ex      = (unsigned long *) (bits + 2*size);
        fds.res_in  = (unsigned long *) (bits + 3*size);
        fds.res_out = (unsigned long *) (bits + 4*size);
        fds.res_ex  = (unsigned long *) (bits + 5*size);

        /* Stop at the first set that fails to load; ret keeps its error. */
        if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
            (ret = compat_get_fd_set(n, outp, fds.out)) ||
            (ret = compat_get_fd_set(n, exp, fds.ex)))
                goto out;
        zero_fd_set(n, fds.res_in);
        zero_fd_set(n, fds.res_out);
        zero_fd_set(n, fds.res_ex);

        ret = do_select(n, &fds, end_time);

        if (ret < 0)
                goto out;
        /* Nothing ready: -ERESTARTNOHAND if a signal is pending, else 0. */
        if (!ret) {
                ret = -ERESTARTNOHAND;
                if (signal_pending(current))
                        goto out;
                ret = 0;
        }

        if (compat_set_fd_set(n, inp, fds.res_in) ||
            compat_set_fd_set(n, outp, fds.res_out) ||
            compat_set_fd_set(n, exp, fds.res_ex))
                ret = -EFAULT;
out:
        /* Free only the heap buffer, never the on-stack one. */
        if (bits != stack_fds)
                kfree(bits);
out_nofds:
        return ret;
}

/*
 * Compat select() back end taking a struct old_timeval32 timeout.
 *
 * Microseconds of one second or more are folded into the seconds value
 * before the timeout is handed to poll_select_set_timeout(). Returns
 * -EFAULT if the timeout cannot be copied in, -EINVAL if it is rejected,
 * and otherwise the value of poll_select_finish().
 */
static int do_compat_select(int n, compat_ulong_t __user *inp,
        compat_ulong_t __user *outp, compat_ulong_t __user *exp,
        struct old_timeval32 __user *tvp)
{
        struct timespec64 deadline;
        struct timespec64 *to = NULL;
        struct old_timeval32 tv;
        int ret;

        if (tvp) {
                if (copy_from_user(&tv, tvp, sizeof(tv)))
                        return -EFAULT;

                if (poll_select_set_timeout(&deadline,
                                tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
                                (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
                        return -EINVAL;
                to = &deadline;
        }

        ret = compat_core_sys_select(n, inp, outp, exp, to);
        return poll_select_finish(&deadline, tvp, PT_OLD_TIMEVAL, ret);
}

/*
 * Compat select() system call entry point: forwards all five arguments
 * unchanged to do_compat_select() and returns its result.
 */
COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
        compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
        struct old_timeval32 __user *, tvp)
{
        return do_compat_select(n, inp, outp, exp, tvp);
}

/*
 * Argument block for the compat old_select() system call. The pointer
 * members are compat_uptr_t values that the syscall converts with
 * compat_ptr().
 */
struct compat_sel_arg_struct {
        compat_ulong_t n;       /* passed as do_compat_select()'s n */
        compat_uptr_t inp;      /* input fd set */
        compat_uptr_t outp;     /* output fd set */
        compat_uptr_t exp;      /* exception fd set */
        compat_uptr_t tvp;      /* timeout */
};

/*
 * Compat old_select() system call: copy the packed argument block in from
 * user space, convert its compat pointers with compat_ptr() and call
 * do_compat_select(). Returns -EFAULT if the block cannot be copied.
 */
COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
{
        struct compat_sel_arg_struct args;
        unsigned long uncopied;

        uncopied = copy_from_user(&args, arg, sizeof(args));
        if (uncopied)
                return -EFAULT;

        return do_compat_select(args.n, compat_ptr(args.inp),
                                compat_ptr(args.outp), compat_ptr(args.exp),
                                compat_ptr(args.tvp));
}

/*
 * Common implementation behind the compat pselect6 family.
 *
 * @n, @inp, @outp, @exp: passed through to compat_core_sys_select().
 * @tsp:        optional user pointer to the timeout; its layout is selected
 *              by @type (PT_OLD_TIMESPEC or PT_TIMESPEC, anything else
 *              BUG()s).
 * @sigmask, @sigsetsize: handed to set_compat_user_sigmask().
 * @type:       timeout layout, also forwarded to poll_select_finish().
 *
 * Returns -EFAULT if the timeout cannot be read, -EINVAL if
 * poll_select_set_timeout() rejects it, a non-zero
 * set_compat_user_sigmask() result as is, and otherwise the value of
 * poll_select_finish().
 */
static long do_compat_pselect(int n, compat_ulong_t __user *inp,
        compat_ulong_t __user *outp, compat_ulong_t __user *exp,
        void __user *tsp, compat_sigset_t __user *sigmask,
        compat_size_t sigsetsize, enum poll_time_type type)
{
        struct timespec64 req, deadline;
        struct timespec64 *to = NULL;
        int ret;

        if (tsp) {
                int fault;

                /* Fetch the user timeout using the reader matching @type. */
                switch (type) {
                case PT_TIMESPEC:
                        fault = get_timespec64(&req, tsp);
                        break;
                case PT_OLD_TIMESPEC:
                        fault = get_old_timespec32(&req, tsp);
                        break;
                default:
                        BUG();
                }
                if (fault)
                        return -EFAULT;

                if (poll_select_set_timeout(&deadline, req.tv_sec, req.tv_nsec))
                        return -EINVAL;
                to = &deadline;
        }

        ret = set_compat_user_sigmask(sigmask, sigsetsize);
        if (ret)
                return ret;

        ret = compat_core_sys_select(n, inp, outp, exp, to);
        return poll_select_finish(&deadline, tsp, type, ret);
}

/*
 * Compat layout of the (sigset pointer, sigset size) pair that the sixth
 * argument of the compat pselect6 system calls points to.
 */
struct compat_sigset_argpack {
        compat_uptr_t p;        /* compat user pointer to the signal mask */
        compat_size_t size;     /* size of the signal mask */
};
/*
 * Fetch a struct compat_sigset_argpack from user space.
 *
 * @to:   kernel copy to fill in; left untouched when @from is NULL.
 * @from: user pointer to the pack, may be NULL.
 *
 * Returns 0 on success (including the NULL case) and -EFAULT if the user
 * memory cannot be accessed.
 */
static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to,
                                            struct compat_sigset_argpack __user *from)
{
        if (from) {
                if (!user_read_access_begin(from, sizeof(*from)))
                        return -EFAULT;
                unsafe_get_user(to->p, &from->p, Efault);
                unsafe_get_user(to->size, &from->size, Efault);
                user_read_access_end();
        }
        return 0;
Efault:
        /*
         * The window was opened with user_read_access_begin(), so it must be
         * closed with the matching read-only variant, exactly like the
         * success path above, rather than with user_access_end().
         */
        user_read_access_end();
        return -EFAULT;
}

/*
 * Compat pselect6_time64() system call: unpack the compat sigset pack that
 * @sig points to and run do_compat_pselect() with a PT_TIMESPEC timeout.
 * Returns -EFAULT if the pack cannot be read.
 */
COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp,
        compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
        struct __kernel_timespec __user *, tsp, void __user *, sig)
{
        /* Defaults used when @sig is NULL: null mask pointer, zero size. */
        struct compat_sigset_argpack pack = { .p = 0, .size = 0 };
        int err = get_compat_sigset_argpack(&pack, sig);

        if (err)
                return -EFAULT;

        return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(pack.p),
                                 pack.size, PT_TIMESPEC);
}

#if defined(CONFIG_COMPAT_32BIT_TIME)

/*
 * Compat pselect6_time32() system call: unpack the compat sigset pack that
 * @sig points to and run do_compat_pselect() with a PT_OLD_TIMESPEC
 * timeout. Returns -EFAULT if the pack cannot be read.
 */
COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp,
        compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
        struct old_timespec32 __user *, tsp, void __user *, sig)
{
        /* Defaults used when @sig is NULL: null mask pointer, zero size. */
        struct compat_sigset_argpack pack = { .p = 0, .size = 0 };
        int err = get_compat_sigset_argpack(&pack, sig);

        if (err)
                return -EFAULT;

        return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(pack.p),
                                 pack.size, PT_OLD_TIMESPEC);
}

#endif

#if defined(CONFIG_COMPAT_32BIT_TIME)
/*
 * Compat ppoll_time32() system call: ppoll with a struct old_timespec32
 * timeout and a compat signal mask.
 *
 * Returns -EFAULT if the timeout cannot be read, -EINVAL if
 * poll_select_set_timeout() rejects it, a non-zero
 * set_compat_user_sigmask() result as is, and otherwise the value of
 * poll_select_finish().
 */
COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
        unsigned int,  nfds, struct old_timespec32 __user *, tsp,
        const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
        struct timespec64 req, deadline;
        struct timespec64 *to = NULL;
        int ret;

        if (tsp) {
                if (get_old_timespec32(&req, tsp))
                        return -EFAULT;
                if (poll_select_set_timeout(&deadline, req.tv_sec, req.tv_nsec))
                        return -EINVAL;
                to = &deadline;
        }

        ret = set_compat_user_sigmask(sigmask, sigsetsize);
        if (ret)
                return ret;

        ret = do_sys_poll(ufds, nfds, to);
        return poll_select_finish(&deadline, tsp, PT_OLD_TIMESPEC, ret);
}
#endif

/*
 * New compat syscall for 64 bit time_t: ppoll with a
 * struct __kernel_timespec timeout and a compat signal mask.
 *
 * Returns -EFAULT if the timeout cannot be read, -EINVAL if
 * poll_select_set_timeout() rejects it, a non-zero
 * set_compat_user_sigmask() result as is, and otherwise the value of
 * poll_select_finish().
 */
COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
        unsigned int,  nfds, struct __kernel_timespec __user *, tsp,
        const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
        struct timespec64 req, deadline;
        struct timespec64 *to = NULL;
        int ret;

        if (tsp) {
                if (get_timespec64(&req, tsp))
                        return -EFAULT;
                if (poll_select_set_timeout(&deadline, req.tv_sec, req.tv_nsec))
                        return -EINVAL;
                to = &deadline;
        }

        ret = set_compat_user_sigmask(sigmask, sigsetsize);
        if (ret)
                return ret;

        ret = do_sys_poll(ufds, nfds, to);
        return poll_select_finish(&deadline, tsp, PT_TIMESPEC, ret);
}

#endif





































































































   19 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_ATOMIC64_64_H
#define _ASM_X86_ATOMIC64_64_H

#include <linux/types.h>
#include <asm/alternative.h>
#include <asm/cmpxchg.h>

/* The 64-bit atomic type */

/* Static initializer for an atomic64_t: a brace-enclosed initial value (i). */
#define ATOMIC64_INIT(i)        { (i) }

/*
 * arch_atomic64_read - read the current value of @v
 *
 * Returns v->counter, loaded through __READ_ONCE().
 */
static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
{
        return __READ_ONCE((v)->counter);
}

/*
 * arch_atomic64_set - set the value of @v to @i
 *
 * Stores @i into v->counter through __WRITE_ONCE().
 */
static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
{
        __WRITE_ONCE(v->counter, i);
}

/*
 * arch_atomic64_add - add @i to @v
 *
 * Emits a single LOCK_PREFIX'ed "addq" on v->counter. No value is returned.
 * v->counter is listed both as output and as input because the instruction
 * reads and writes it; the "memory" clobber tells the compiler that memory
 * may have changed.
 */
static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "addq %1,%0"
                     : "=m" (v->counter)
                     : "er" (i), "m" (v->counter) : "memory");
}

/*
 * arch_atomic64_sub - subtract @i from @v
 *
 * Emits a single LOCK_PREFIX'ed "subq" on v->counter. No value is returned.
 * v->counter is listed both as output and as input because the instruction
 * reads and writes it; the "memory" clobber tells the compiler that memory
 * may have changed.
 */
static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "subq %1,%0"
                     : "=m" (v->counter)
                     : "er" (i), "m" (v->counter) : "memory");
}

/*
 * arch_atomic64_sub_and_test - subtract @i from @v and test the result
 *
 * Built from GEN_BINARY_RMWcc() around a LOCK_PREFIX'ed "subq".
 * NOTE(review): the "e" argument presumably selects the equal/zero
 * condition, i.e. the function returns true when the result is zero;
 * confirm against the GEN_BINARY_RMWcc definition.
 */
static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
}
#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test

/*
 * arch_atomic64_inc - increment @v by one
 *
 * Emits a single LOCK_PREFIX'ed "incq" on v->counter. No value is returned.
 */
static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "incq %0"
                     : "=m" (v->counter)
                     : "m" (v->counter) : "memory");
}
#define arch_atomic64_inc arch_atomic64_inc

/*
 * arch_atomic64_dec - decrement @v by one
 *
 * Emits a single LOCK_PREFIX'ed "decq" on v->counter. No value is returned.
 */
static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "decq %0"
                     : "=m" (v->counter)
                     : "m" (v->counter) : "memory");
}
#define arch_atomic64_dec arch_atomic64_dec

/*
 * arch_atomic64_dec_and_test - decrement @v and test the result
 *
 * Built from GEN_UNARY_RMWcc() around a LOCK_PREFIX'ed "decq".
 * NOTE(review): the "e" argument presumably selects the equal/zero
 * condition, i.e. true is returned when the result is zero; confirm
 * against the GEN_UNARY_RMWcc definition.
 */
static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
}
#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test

/*
 * arch_atomic64_inc_and_test - increment @v and test the result
 *
 * Built from GEN_UNARY_RMWcc() around a LOCK_PREFIX'ed "incq".
 * NOTE(review): the "e" argument presumably selects the equal/zero
 * condition, i.e. true is returned when the result is zero; confirm
 * against the GEN_UNARY_RMWcc definition.
 */
static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v)
{
        return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
}
#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test

/*
 * arch_atomic64_add_negative - add @i to @v and test the sign of the result
 *
 * Built from GEN_BINARY_RMWcc() around a LOCK_PREFIX'ed "addq".
 * NOTE(review): the "s" argument presumably selects the sign condition,
 * i.e. true is returned when the result is negative; confirm against the
 * GEN_BINARY_RMWcc definition.
 */
static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
{
        return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i);
}
#define arch_atomic64_add_negative arch_atomic64_add_negative

/*
 * arch_atomic64_add_return - add @i to @v and return the new value
 *
 * Returns @i plus the result of xadd() on v->counter.
 * NOTE(review): xadd() presumably returns the value held before the
 * addition, which makes the sum the post-add value; confirm against its
 * definition.
 */
static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
{
        return i + xadd(&v->counter, i);
}
#define arch_atomic64_add_return arch_atomic64_add_return

/* Subtraction is expressed as adding the negated operand. */
#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v)

/*
 * arch_atomic64_fetch_add - add @i to @v and return what xadd() reports
 *
 * NOTE(review): xadd() presumably returns the value held before the
 * addition; confirm against its definition.
 */
static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
        return xadd(&v->counter, i);
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add

/* Subtraction is expressed as adding the negated operand. */
#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v)

/*
 * arch_atomic64_cmpxchg - compare-and-exchange on @v
 *
 * Thin wrapper: applies arch_cmpxchg() to v->counter with @old and @new and
 * returns its result.
 */
static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
        return arch_cmpxchg(&v->counter, old, new);
}
#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg

/*
 * arch_atomic64_try_cmpxchg - compare-and-exchange on @v, boolean result
 *
 * Thin wrapper: applies arch_try_cmpxchg() to v->counter, passing @old by
 * pointer, and returns its boolean result.
 */
static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        return arch_try_cmpxchg(&v->counter, old, new);
}
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg

/*
 * arch_atomic64_xchg - exchange the value of @v with @new
 *
 * Thin wrapper: applies arch_xchg() to v->counter and returns its result.
 */
static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
{
        return arch_xchg(&v->counter, new);
}
#define arch_atomic64_xchg arch_atomic64_xchg

/*
 * arch_atomic64_and - bitwise AND @i into @v
 *
 * Emits a single LOCK_PREFIX'ed "andq" on v->counter, which is a
 * read-write ("+m") operand. No value is returned.
 */
static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "andq %1,%0"
                        : "+m" (v->counter)
                        : "er" (i)
                        : "memory");
}

/*
 * arch_atomic64_fetch_and - bitwise AND @i into @v, returning the value
 * the successful exchange was based on.
 *
 * Implemented as a retry loop around arch_atomic64_try_cmpxchg(), which is
 * handed a pointer to the expected value on every attempt.
 */
static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
        s64 seen = arch_atomic64_read(v);

        /* Retry until the exchange from 'seen' to 'seen & i' succeeds. */
        while (!arch_atomic64_try_cmpxchg(v, &seen, seen & i))
                ;

        return seen;
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and

/*
 * arch_atomic64_or - bitwise OR @i into @v
 *
 * Emits a single LOCK_PREFIX'ed "orq" on v->counter, which is a
 * read-write ("+m") operand. No value is returned.
 */
static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "orq %1,%0"
                        : "+m" (v->counter)
                        : "er" (i)
                        : "memory");
}

/*
 * arch_atomic64_fetch_or - bitwise OR @i into @v, returning the value the
 * successful exchange was based on.
 *
 * Implemented as a retry loop around arch_atomic64_try_cmpxchg(), which is
 * handed a pointer to the expected value on every attempt.
 */
static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
        s64 seen = arch_atomic64_read(v);

        /* Retry until the exchange from 'seen' to 'seen | i' succeeds. */
        while (!arch_atomic64_try_cmpxchg(v, &seen, seen | i))
                ;

        return seen;
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or

/*
 * arch_atomic64_xor - bitwise XOR @i into @v
 *
 * Emits a single LOCK_PREFIX'ed "xorq" on v->counter, which is a
 * read-write ("+m") operand. No value is returned.
 */
static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
        asm volatile(LOCK_PREFIX "xorq %1,%0"
                        : "+m" (v->counter)
                        : "er" (i)
                        : "memory");
}

/*
 * arch_atomic64_fetch_xor - bitwise XOR @i into @v, returning the value
 * the successful exchange was based on.
 *
 * Implemented as a retry loop around arch_atomic64_try_cmpxchg(), which is
 * handed a pointer to the expected value on every attempt.
 */
static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
        s64 seen = arch_atomic64_read(v);

        /* Retry until the exchange from 'seen' to 'seen ^ i' succeeds. */
        while (!arch_atomic64_try_cmpxchg(v, &seen, seen ^ i))
                ;

        return seen;
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor

#endif /* _ASM_X86_ATOMIC64_64_H */


































































































































































    1 





















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
/*
 *  linux/fs/hfs/super.c
 *
 * Copyright (C) 1995-1997  Paul H. Hargrove
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 * This file may be distributed under the terms of the GNU General Public License.
 *
 * This file contains hfs_read_super(), some of the super_ops and
 * init_hfs_fs() and exit_hfs_fs().  The remaining super_ops are in
 * inode.c since they deal with inodes.
 *
 * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds
 */

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/nls.h>
#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/vfs.h>

#include "hfs_fs.h"
#include "btree.h"

/*
 * File-scope kmem_cache handle; presumably the slab cache for HFS inodes.
 * It is set up and used outside this excerpt - TODO confirm.
 */
static struct kmem_cache *hfs_inode_cachep;

MODULE_LICENSE("GPL");

/*
 * hfs_sync_fs()
 *
 * Commits the MDB of @sb through hfs_mdb_commit(). The @wait argument is
 * not used and the function always returns 0.
 */
static int hfs_sync_fs(struct super_block *sb, int wait)
{
        hfs_mdb_commit(sb);
        return 0;
}

/*
 * hfs_put_super()
 *
 * This is the put_super() entry in the super_operations structure for
 * HFS filesystems.  The purpose is to release the resources
 * associated with the superblock sb.
 */
static void hfs_put_super(struct super_block *sb)
{
        cancel_delayed_work_sync(&HFS_SB(sb)->mdb_work);
        hfs_mdb_close(sb);
        /* release the MDB's resources */
        hfs_mdb_put(sb);
}

/*
 * Delayed-work handler for sbi->mdb_work: clear the work_queued flag under
 * work_lock, then commit the MDB of the owning superblock.
 */
static void flush_mdb(struct work_struct *work)
{
        struct hfs_sb_info *sbi = container_of(work, struct hfs_sb_info,
                                               mdb_work.work);
        struct super_block *sb = sbi->sb;

        /* Clear the flag first so a later dirtying can queue new work. */
        spin_lock(&sbi->work_lock);
        sbi->work_queued = 0;
        spin_unlock(&sbi->work_lock);

        hfs_mdb_commit(sb);
}

/*
 * hfs_mark_mdb_dirty()
 *
 * Schedule a deferred MDB commit (flush_mdb) on system_long_wq unless one
 * is already queued.  Does nothing on a read-only superblock.
 */
void hfs_mark_mdb_dirty(struct super_block *sb)
{
        struct hfs_sb_info *info = HFS_SB(sb);

        if (sb_rdonly(sb))
                return;

        spin_lock(&info->work_lock);
        if (info->work_queued == 0) {
                unsigned long timeout =
                        msecs_to_jiffies(dirty_writeback_interval * 10);

                queue_delayed_work(system_long_wq, &info->mdb_work, timeout);
                info->work_queued = 1;
        }
        spin_unlock(&info->work_lock);
}

/*
 * hfs_statfs()
 *
 * This is the statfs() entry in the super_operations structure for
 * HFS filesystems.  The purpose is to return various data about the
 * filesystem.
 *
 * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks.
 */
static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct super_block *sb = dentry->d_sb;
        u64 id = huge_encode_dev(sb->s_bdev->bd_dev);

        buf->f_type = HFS_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
        buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div;
        buf->f_bfree = (u32)HFS_SB(sb)->free_ablocks * HFS_SB(sb)->fs_div;
        buf->f_bavail = buf->f_bfree;
        buf->f_files = HFS_SB(sb)->fs_ablocks;
        buf->f_ffree = HFS_SB(sb)->free_ablocks;
        buf->f_fsid = u64_to_fsid(id);
        buf->f_namelen = HFS_NAMELEN;

        return 0;
}

static int hfs_remount(struct super_block *sb, int *flags, char *data)
{
        sync_filesystem(sb);
        *flags |= SB_NODIRATIME;
        if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
                return 0;
        if (!(*flags & SB_RDONLY)) {
                if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
                        pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended.  leaving read-only.\n");
                        sb->s_flags |= SB_RDONLY;
                        *flags |= SB_RDONLY;
                } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) {
                        pr_warn("filesystem is marked locked, leaving read-only.\n");
                        sb->s_flags |= SB_RDONLY;
                        *flags |= SB_RDONLY;
                }
        }
        return 0;
}

/*
 * hfs_show_options()
 *
 * This is the show_options() entry in the super_operations structure.
 * It writes the mount options to @seq; options still at the defaults
 * set by parse_options() are omitted, uid and gid are always printed.
 * Always returns 0.
 */
static int hfs_show_options(struct seq_file *seq, struct dentry *root)
{
        struct hfs_sb_info *sbi = HFS_SB(root->d_sb);

        /* 0x3f3f3f3f ('????') is the default creator/type */
        if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
                seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4);
        if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
                seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4);
        seq_printf(seq, ",uid=%u,gid=%u",
                        from_kuid_munged(&init_user_ns, sbi->s_uid),
                        from_kgid_munged(&init_user_ns, sbi->s_gid));
        if (sbi->s_file_umask != 0133)
                seq_printf(seq, ",file_umask=%o", sbi->s_file_umask);
        if (sbi->s_dir_umask != 0022)
                seq_printf(seq, ",dir_umask=%o", sbi->s_dir_umask);
        /* part and session default to -1, meaning "not given" */
        if (sbi->part >= 0)
                seq_printf(seq, ",part=%u", sbi->part);
        if (sbi->session >= 0)
                seq_printf(seq, ",session=%u", sbi->session);
        if (sbi->nls_disk)
                seq_printf(seq, ",codepage=%s", sbi->nls_disk->charset);
        if (sbi->nls_io)
                seq_printf(seq, ",iocharset=%s", sbi->nls_io->charset);
        /* constant string: no format processing needed */
        if (sbi->s_quiet)
                seq_puts(seq, ",quiet");
        return 0;
}

/*
 * hfs_alloc_inode()
 *
 * This is the alloc_inode() entry in the super_operations structure.
 * It allocates a struct hfs_inode_info from hfs_inode_cachep and returns
 * the VFS inode embedded in it, or NULL if the allocation fails.
 */
static struct inode *hfs_alloc_inode(struct super_block *sb)
{
        struct hfs_inode_info *info;

        info = alloc_inode_sb(sb, hfs_inode_cachep, GFP_KERNEL);
        if (!info)
                return NULL;

        return &info->vfs_inode;
}

/*
 * hfs_free_inode()
 *
 * This is the free_inode() entry in the super_operations structure.
 * It returns the hfs_inode_info containing @inode to hfs_inode_cachep.
 */
static void hfs_free_inode(struct inode *inode)
{
        kmem_cache_free(hfs_inode_cachep, HFS_I(inode));
}

/*
 * Superblock operations for HFS, installed by hfs_fill_super().
 * hfs_write_inode() and hfs_evict_inode() are not defined in this file.
 */
static const struct super_operations hfs_super_operations = {
        /* inode lifetime */
        .alloc_inode = hfs_alloc_inode,
        .free_inode = hfs_free_inode,
        .write_inode = hfs_write_inode,
        .evict_inode = hfs_evict_inode,

        /* superblock-wide operations */
        .put_super = hfs_put_super,
        .sync_fs = hfs_sync_fs,
        .statfs = hfs_statfs,
        .remount_fs = hfs_remount,
        .show_options = hfs_show_options,
};

/* Token identifiers for the mount options accepted by parse_options(). */
enum {
        opt_uid,
        opt_gid,
        opt_umask,
        opt_file_umask,
        opt_dir_umask,
        opt_part,
        opt_session,
        opt_type,
        opt_creator,
        opt_quiet,
        opt_codepage,
        opt_iocharset,
        opt_err         /* paired with the NULL pattern that ends the token table */
};

/*
 * Mount option patterns handed to match_token() by parse_options().
 * The table ends with the NULL-pattern opt_err entry.
 */
static const match_table_t tokens = {
        /* ownership and permission masks */
        { opt_uid,              "uid=%u" },
        { opt_gid,              "gid=%u" },
        { opt_umask,            "umask=%o" },
        { opt_file_umask,       "file_umask=%o" },
        { opt_dir_umask,        "dir_umask=%o" },
        /* partition / session numbers */
        { opt_part,             "part=%u" },
        { opt_session,          "session=%u" },
        /* default four-character type and creator codes */
        { opt_type,             "type=%s" },
        { opt_creator,          "creator=%s" },
        { opt_quiet,            "quiet" },
        /* character set handling */
        { opt_codepage,         "codepage=%s" },
        { opt_iocharset,        "iocharset=%s" },
        { opt_err,              NULL }
};

/*
 * match_fourchar()
 *
 * Copy a four-character option argument into @result as raw bytes.
 * Returns 0 on success, or -EINVAL when the argument is not exactly four
 * bytes long (in which case @result is left untouched).
 */
static inline int match_fourchar(substring_t *arg, u32 *result)
{
        if (arg->to - arg->from == 4) {
                memcpy(result, arg->from, 4);
                return 0;
        }

        return -EINVAL;
}

/*
 * parse_options()
 *
 * adapted from linux/fs/msdos/inode.c written 1992,93 by Werner Almesberger
 * This function is called by hfs_fill_super() to parse the mount options.
 */
static int parse_options(char *options, struct hfs_sb_info *hsb)
{
        /* Returns 1 when all options were accepted, 0 on the first bad one. */
        char *p;
        substring_t args[MAX_OPT_ARGS];
        int tmp, token;

        /* initialize the sb with defaults */
        hsb->s_uid = current_uid();
        hsb->s_gid = current_gid();
        hsb->s_file_umask = 0133;
        hsb->s_dir_umask = 0022;
        hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f);        /* == '????' */
        hsb->s_quiet = 0;
        hsb->part = -1;
        hsb->session = -1;

        /* no option string: the defaults above are the final settings */
        if (!options)
                return 1;

        /* walk the comma-separated list; empty items are skipped */
        while ((p = strsep(&options, ",")) != NULL) {
                if (!*p)
                        continue;

                token = match_token(p, tokens, args);
                switch (token) {
                case opt_uid:
                        if (match_int(&args[0], &tmp)) {
                                pr_err("uid requires an argument\n");
                                return 0;
                        }
                        hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp);
                        if (!uid_valid(hsb->s_uid)) {
                                pr_err("invalid uid %d\n", tmp);
                                return 0;
                        }
                        break;
                case opt_gid:
                        if (match_int(&args[0], &tmp)) {
                                pr_err("gid requires an argument\n");
                                return 0;
                        }
                        hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp);
                        if (!gid_valid(hsb->s_gid)) {
                                pr_err("invalid gid %d\n", tmp);
                                return 0;
                        }
                        break;
                case opt_umask:
                        /* "umask" sets both the file and the directory mask */
                        if (match_octal(&args[0], &tmp)) {
                                pr_err("umask requires a value\n");
                                return 0;
                        }
                        hsb->s_file_umask = (umode_t)tmp;
                        hsb->s_dir_umask = (umode_t)tmp;
                        break;
                case opt_file_umask:
                        if (match_octal(&args[0], &tmp)) {
                                pr_err("file_umask requires a value\n");
                                return 0;
                        }
                        hsb->s_file_umask = (umode_t)tmp;
                        break;
                case opt_dir_umask:
                        if (match_octal(&args[0], &tmp)) {
                                pr_err("dir_umask requires a value\n");
                                return 0;
                        }
                        hsb->s_dir_umask = (umode_t)tmp;
                        break;
                case opt_part:
                        if (match_int(&args[0], &hsb->part)) {
                                pr_err("part requires an argument\n");
                                return 0;
                        }
                        break;
                case opt_session:
                        if (match_int(&args[0], &hsb->session)) {
                                pr_err("session requires an argument\n");
                                return 0;
                        }
                        break;
                case opt_type:
                        if (match_fourchar(&args[0], &hsb->s_type)) {
                                pr_err("type requires a 4 character value\n");
                                return 0;
                        }
                        break;
                case opt_creator:
                        if (match_fourchar(&args[0], &hsb->s_creator)) {
                                pr_err("creator requires a 4 character value\n");
                                return 0;
                        }
                        break;
                case opt_quiet:
                        hsb->s_quiet = 1;
                        break;
                case opt_codepage:
                        /* a codepage may be given only once */
                        if (hsb->nls_disk) {
                                pr_err("unable to change codepage\n");
                                return 0;
                        }
                        /*
                         * p is reused for the duplicated argument; the loop
                         * position is kept in 'options', not in p.
                         */
                        p = match_strdup(&args[0]);
                        if (p)
                                hsb->nls_disk = load_nls(p);
                        if (!hsb->nls_disk) {
                                pr_err("unable to load codepage \"%s\"\n", p);
                                kfree(p);
                                return 0;
                        }
                        kfree(p);
                        break;
                case opt_iocharset:
                        /* an iocharset may be given only once */
                        if (hsb->nls_io) {
                                pr_err("unable to change iocharset\n");
                                return 0;
                        }
                        p = match_strdup(&args[0]);
                        if (p)
                                hsb->nls_io = load_nls(p);
                        if (!hsb->nls_io) {
                                pr_err("unable to load iocharset \"%s\"\n", p);
                                kfree(p);
                                return 0;
                        }
                        kfree(p);
                        break;
                default:
                        /* no case above matched the token: reject the option */
                        return 0;
                }
        }

        /* a codepage without an explicit iocharset uses the default NLS table */
        if (hsb->nls_disk && !hsb->nls_io) {
                hsb->nls_io = load_nls_default();
                if (!hsb->nls_io) {
                        pr_err("unable to load default iocharset\n");
                        return 0;
                }
        }
        /* keep only permission bits; the file mask also drops bit 0200 */
        hsb->s_dir_umask &= 0777;
        hsb->s_file_umask &= 0577;

        return 1;
}

/*
 * hfs_fill_super()
 *
 * This is the function that is responsible for mounting an HFS
 * filesystem.  It performs all the tasks necessary to get enough data
 * from the disk to read the root inode.  This includes parsing the
 * mount options, dealing with Macintosh partitions, reading the
 * superblock and the allocation bitmap blocks, calling
 * hfs_btree_init() to get the necessary data about the extents and
 * catalog B-trees and, finally, reading the root inode into memory.
 */
static int hfs_fill_super(struct super_block *sb, void *data, int silent)
{
        struct hfs_sb_info *sbi;
        struct hfs_find_data fd;
        hfs_cat_rec rec;
        struct inode *root_inode;
        int res;

        sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;

        /* from here on every failure goes through "bail", i.e. hfs_mdb_put() */
        sbi->sb = sb;
        sb->s_fs_info = sbi;
        spin_lock_init(&sbi->work_lock);
        INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb);

        res = -EINVAL;
        if (!parse_options((char *)data, sbi)) {
                pr_err("unable to parse mount options\n");
                goto bail;
        }

        sb->s_op = &hfs_super_operations;
        sb->s_xattr = hfs_xattr_handlers;
        sb->s_flags |= SB_NODIRATIME;
        mutex_init(&sbi->bitmap_lock);

        res = hfs_mdb_get(sb);
        if (res) {
                if (!silent)
                        pr_warn("can't find a HFS filesystem on dev %s\n",
                                hfs_mdb_name(sb));
                res = -EINVAL;
                goto bail;
        }

        /* try to get the root inode */
        res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
        if (res)
                goto bail_no_root;
        res = hfs_cat_find_brec(sb, HFS_ROOT_CNID, &fd);
        if (!res) {
                if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
                        res =  -EIO;
                        goto bail_hfs_find;
                }
                /*
                 * The check above only bounds the length from above.  Zero
                 * the record first so that a short on-disk entry cannot
                 * leave uninitialized stack bytes in 'rec' for hfs_iget().
                 */
                memset(&rec, 0, sizeof(rec));
                hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength);
        }
        if (res)
                goto bail_hfs_find;
        res = -EINVAL;
        root_inode = hfs_iget(sb, &fd.search_key->cat, &rec);
        /* the search data is no longer needed once the inode was looked up */
        hfs_find_exit(&fd);
        if (!root_inode)
                goto bail_no_root;

        sb->s_d_op = &hfs_dentry_operations;
        res = -ENOMEM;
        sb->s_root = d_make_root(root_inode);
        if (!sb->s_root)
                goto bail_no_root;

        /* everything's okay */
        return 0;

bail_hfs_find:
        hfs_find_exit(&fd);
bail_no_root:
        pr_err("get root inode failed\n");
bail:
        hfs_mdb_put(sb);
        return res;
}

/*
 * hfs_mount()
 *
 * This is the mount() entry of hfs_fs_type.  It hands the request to
 * mount_bdev() with hfs_fill_super() as the superblock fill callback.
 */
static struct dentry *hfs_mount(struct file_system_type *fs_type,
                      int flags, const char *dev_name, void *data)
{
        return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
}

/*
 * Filesystem type "hfs", registered in init_hfs_fs() and unregistered in
 * exit_hfs_fs().
 */
static struct file_system_type hfs_fs_type = {
        .name = "hfs",
        .owner = THIS_MODULE,
        .fs_flags = FS_REQUIRES_DEV,
        .mount = hfs_mount,
        .kill_sb = kill_block_super,
};
MODULE_ALIAS_FS("hfs");

/*
 * hfs_init_once()
 *
 * Constructor passed to kmem_cache_create() for hfs_inode_cachep: runs
 * inode_init_once() on the VFS inode embedded in the object @p.
 */
static void hfs_init_once(void *p)
{
        struct hfs_inode_info *info = p;

        inode_init_once(&info->vfs_inode);
}

/*
 * init_hfs_fs()
 *
 * Module init: create the inode slab cache, then register the "hfs"
 * filesystem type.  The cache is destroyed again if registration fails.
 * Returns 0 on success, -ENOMEM if the cache cannot be created, or the
 * error from register_filesystem().
 */
static int __init init_hfs_fs(void)
{
        int ret;

        hfs_inode_cachep = kmem_cache_create("hfs_inode_cache",
                                             sizeof(struct hfs_inode_info), 0,
                                             SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
                                             hfs_init_once);
        if (hfs_inode_cachep == NULL)
                return -ENOMEM;

        ret = register_filesystem(&hfs_fs_type);
        if (ret == 0)
                return 0;

        /* registration failed: undo the cache creation */
        kmem_cache_destroy(hfs_inode_cachep);
        return ret;
}

/*
 * exit_hfs_fs()
 *
 * Module exit: unregister the "hfs" filesystem type and destroy the inode
 * slab cache created by init_hfs_fs().
 */
static void __exit exit_hfs_fs(void)
{
        unregister_filesystem(&hfs_fs_type);

        /*
         * Make sure all delayed rcu free inodes are flushed before we
         * destroy cache.
         */
        rcu_barrier();
        kmem_cache_destroy(hfs_inode_cachep);
}

/* module entry and exit points */
module_init(init_hfs_fs)
module_exit(exit_hfs_fs)











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BH_H
#define _LINUX_BH_H

#include <linux/instruction_pointer.h>
#include <linux/preempt.h>

/*
 * __local_bh_disable_ip() is only declared here (defined out of line) when
 * CONFIG_PREEMPT_RT or CONFIG_TRACE_IRQFLAGS is set.  Otherwise it is the
 * inline below, which adds @cnt to the preempt count and follows that with
 * a barrier(); @ip is unused in the inline version.
 */
#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS)
extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
#else
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
        preempt_count_add(cnt);
        barrier();
}
#endif

/*
 * Disable bottom halves: calls __local_bh_disable_ip() with the current
 * instruction pointer and SOFTIRQ_DISABLE_OFFSET as the count.
 */
static inline void local_bh_disable(void)
{
        __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

/*
 * Defined out of line.  local_bh_enable_ip() and local_bh_enable() below
 * are thin wrappers around __local_bh_enable_ip().
 */
extern void _local_bh_enable(void);
extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt);

/*
 * Re-enable bottom halves on behalf of the caller-supplied @ip: calls
 * __local_bh_enable_ip() with SOFTIRQ_DISABLE_OFFSET as the count.
 */
static inline void local_bh_enable_ip(unsigned long ip)
{
        __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
}

/*
 * Counterpart of local_bh_disable(): calls __local_bh_enable_ip() with the
 * current instruction pointer and SOFTIRQ_DISABLE_OFFSET as the count.
 */
static inline void local_bh_enable(void)
{
        __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

/*
 * local_bh_blocked() is defined out of line only with CONFIG_PREEMPT_RT;
 * without it the inline stub below always returns false.
 */
#ifdef CONFIG_PREEMPT_RT
extern bool local_bh_blocked(void);
#else
static inline bool local_bh_blocked(void) { return false; }
#endif

#endif /* _LINUX_BH_H */





























    1 
















































































































































    4 
    3 
    4 
    5 



    5 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <asm/unaligned.h>
#include "messages.h"
#include "extent_io.h"
#include "fs.h"
#include "accessors.h"

/*
 * Check that an access of @size bytes at offset (@ptr + @off) lies within
 * the extent buffer @eb.
 *
 * Returns true when the access fits.  Otherwise emits a warning and returns
 * false; the warning says "start" when the first byte is already beyond
 * eb->len and "end" when only the end of the access is.
 */
static bool check_setget_bounds(const struct extent_buffer *eb,
                                const void *ptr, unsigned off, int size)
{
        const unsigned long member_offset = (unsigned long)ptr + off;

        if (unlikely(member_offset + size > eb->len)) {
                const char *bad_edge;

                if (member_offset > eb->len)
                        bad_edge = "start";
                else
                        bad_edge = "end";

                btrfs_warn(eb->fs_info,
                "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
                        bad_edge, (unsigned long)ptr, eb->start,
                        member_offset, size);
                return false;
        }

        return true;
}

/*
 * Bind @token to extent buffer @eb, with the cached address set to the
 * first folio of the buffer and the cached offset set to 0.
 */
void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
{
        token->offset = 0;
        token->kaddr = folio_address(eb->folios[0]);
        token->eb = eb;
}

/*
 * Macro templates that define helpers to read/write extent buffer data of a
 * given size, that are also used via ctree.h for access to item members by
 * specialized helpers.
 *
 * Generic helpers:
 * - btrfs_set_8 (for 8/16/32/64)
 * - btrfs_get_8 (for 8/16/32/64)
 *
 * Generic helpers with a token (cached address of the most recently accessed
 * page):
 * - btrfs_set_token_8 (for 8/16/32/64)
 * - btrfs_get_token_8 (for 8/16/32/64)
 *
 * The set/get functions handle data spanning two pages transparently, in case
 * metadata block size is larger than page.  Every pointer to metadata items is
 * an offset into the extent buffer page array, cast to a specific type.  This
 * gives us all the type checking.
 *
 * The extent buffer pages stored in the array folios may not form a contiguous
 * physical range, but the API functions assume the linear offset to the range
 * from 0 to metadata node size.
 */

/*
 * DEFINE_BTRFS_SETGET_BITS(bits) expands to four functions for u<bits>:
 *
 *   btrfs_get_token_<bits>(), btrfs_get_<bits>()
 *   btrfs_set_token_<bits>(), btrfs_set_<bits>()
 *
 * All of them address the value at member_offset = ptr + off and split it
 * into a folio index (idx) and an offset inside that folio (oil).
 *
 * - If the value lies entirely inside one folio (or
 *   INLINE_EXTENT_BUFFER_PAGES == 1), it is read or written in place with
 *   get_unaligned_le<bits>() / put_unaligned_le<bits>().
 * - Otherwise the little-endian bytes go through the on-stack lebytes
 *   buffer: the first 'part' bytes belong to the end of folio idx, the
 *   remaining size - part bytes to the start of folio idx + 1.
 *
 * The token variants first test whether the access falls inside the range
 * [token->offset, token->offset + unit_size) and, if so, use token->kaddr
 * directly.  If not, they repoint the token at the folio being accessed
 * (and at folio idx + 1 when the value is split).
 *
 * NOTE(review): in the token variants the local initializers dereference
 * token->eb before ASSERT(token) is evaluated.
 */
#define DEFINE_BTRFS_SETGET_BITS(bits)                                        \
u##bits btrfs_get_token_##bits(struct btrfs_map_token *token,                \
                               const void *ptr, unsigned long off)        \
{                                                                        \
        const unsigned long member_offset = (unsigned long)ptr + off;        \
        const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
        const unsigned long oil = get_eb_offset_in_folio(token->eb,        \
                                                         member_offset);\
        const int unit_size = token->eb->folio_size;                        \
        const int unit_shift = token->eb->folio_shift;                        \
        const int size = sizeof(u##bits);                                \
        u8 lebytes[sizeof(u##bits)];                                        \
        const int part = unit_size - oil;                                \
                                                                        \
        ASSERT(token);                                                        \
        ASSERT(token->kaddr);                                                \
        ASSERT(check_setget_bounds(token->eb, ptr, off, size));                \
        if (token->offset <= member_offset &&                                \
            member_offset + size <= token->offset + unit_size) {        \
                return get_unaligned_le##bits(token->kaddr + oil);        \
        }                                                                \
        token->kaddr = folio_address(token->eb->folios[idx]);                \
        token->offset = idx << unit_shift;                                \
        if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
                return get_unaligned_le##bits(token->kaddr + oil);        \
                                                                        \
        memcpy(lebytes, token->kaddr + oil, part);                        \
        token->kaddr = folio_address(token->eb->folios[idx + 1]);        \
        token->offset = (idx + 1) << unit_shift;                        \
        memcpy(lebytes + part, token->kaddr, size - part);                \
        return get_unaligned_le##bits(lebytes);                                \
}                                                                        \
u##bits btrfs_get_##bits(const struct extent_buffer *eb,                \
                         const void *ptr, unsigned long off)                \
{                                                                        \
        const unsigned long member_offset = (unsigned long)ptr + off;        \
        const unsigned long idx = get_eb_folio_index(eb, member_offset);\
        const unsigned long oil = get_eb_offset_in_folio(eb,                \
                                                         member_offset);\
        const int unit_size = eb->folio_size;                                \
        char *kaddr = folio_address(eb->folios[idx]);                        \
        const int size = sizeof(u##bits);                                \
        const int part = unit_size - oil;                                \
        u8 lebytes[sizeof(u##bits)];                                        \
                                                                        \
        ASSERT(check_setget_bounds(eb, ptr, off, size));                \
        if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size)        \
                return get_unaligned_le##bits(kaddr + oil);                \
                                                                        \
        memcpy(lebytes, kaddr + oil, part);                                \
        kaddr = folio_address(eb->folios[idx + 1]);                        \
        memcpy(lebytes + part, kaddr, size - part);                        \
        return get_unaligned_le##bits(lebytes);                                \
}                                                                        \
void btrfs_set_token_##bits(struct btrfs_map_token *token,                \
                            const void *ptr, unsigned long off,                \
                            u##bits val)                                \
{                                                                        \
        const unsigned long member_offset = (unsigned long)ptr + off;        \
        const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
        const unsigned long oil = get_eb_offset_in_folio(token->eb,        \
                                                         member_offset);\
        const int unit_size = token->eb->folio_size;                        \
        const int unit_shift = token->eb->folio_shift;                        \
        const int size = sizeof(u##bits);                                \
        u8 lebytes[sizeof(u##bits)];                                        \
        const int part = unit_size - oil;                                \
                                                                        \
        ASSERT(token);                                                        \
        ASSERT(token->kaddr);                                                \
        ASSERT(check_setget_bounds(token->eb, ptr, off, size));                \
        if (token->offset <= member_offset &&                                \
            member_offset + size <= token->offset + unit_size) {        \
                put_unaligned_le##bits(val, token->kaddr + oil);        \
                return;                                                        \
        }                                                                \
        token->kaddr = folio_address(token->eb->folios[idx]);                \
        token->offset = idx << unit_shift;                                \
        if (INLINE_EXTENT_BUFFER_PAGES == 1 ||                                \
            oil + size <= unit_size) {                                        \
                put_unaligned_le##bits(val, token->kaddr + oil);        \
                return;                                                        \
        }                                                                \
        put_unaligned_le##bits(val, lebytes);                                \
        memcpy(token->kaddr + oil, lebytes, part);                        \
        token->kaddr = folio_address(token->eb->folios[idx + 1]);        \
        token->offset = (idx + 1) << unit_shift;                        \
        memcpy(token->kaddr, lebytes + part, size - part);                \
}                                                                        \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr,        \
                      unsigned long off, u##bits val)                        \
{                                                                        \
        const unsigned long member_offset = (unsigned long)ptr + off;        \
        const unsigned long idx = get_eb_folio_index(eb, member_offset);\
        const unsigned long oil = get_eb_offset_in_folio(eb,                \
                                                         member_offset);\
        const int unit_size = eb->folio_size;                                \
        char *kaddr = folio_address(eb->folios[idx]);                        \
        const int size = sizeof(u##bits);                                \
        const int part = unit_size - oil;                                \
        u8 lebytes[sizeof(u##bits)];                                        \
                                                                        \
        ASSERT(check_setget_bounds(eb, ptr, off, size));                \
        if (INLINE_EXTENT_BUFFER_PAGES == 1 ||                                \
            oil + size <= unit_size) {                                        \
                put_unaligned_le##bits(val, kaddr + oil);                \
                return;                                                        \
        }                                                                \
                                                                        \
        put_unaligned_le##bits(val, lebytes);                                \
        memcpy(kaddr + oil, lebytes, part);                                \
        kaddr = folio_address(eb->folios[idx + 1]);                        \
        memcpy(kaddr, lebytes + part, size - part);                        \
}

/* Instantiate the get/set helpers for u8, u16, u32 and u64. */
DEFINE_BTRFS_SETGET_BITS(8)
DEFINE_BTRFS_SETGET_BITS(16)
DEFINE_BTRFS_SETGET_BITS(32)
DEFINE_BTRFS_SETGET_BITS(64)

/*
 * Copy the 'key' member of key pointer number @nr of node @eb into
 * @disk_key.
 */
void btrfs_node_key(const struct extent_buffer *eb,
                    struct btrfs_disk_key *disk_key, int nr)
{
        const unsigned long key_ptr_off = btrfs_node_key_ptr_offset(eb, nr);

        read_eb_member(eb, (struct btrfs_key_ptr *)key_ptr_off,
                       struct btrfs_key_ptr, key, disk_key);
}












































































    2 





    2 









    1 


    1 



































































































































    1 


    1 

    1 




























































































































































































































































































































    1 





    1 





























    1 








    1 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>

#define CREATE_TRACE_POINTS
#include <trace/events/notifier.h>

/*
 *        Notifier list for kernel code which wants to be called
 *        at shutdown. This is used to stop any idling DMA operations
 *        and the like.
 */
BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);

/*
 *        Notifier chain core routines.  The exported routines below
 *        are layered on top of these, with appropriate locking added.
 */

/*
 * Link @n into the chain whose head pointer is @nl.
 *
 * The chain is walked until an entry with a lower priority than @n is
 * found; @n is inserted in front of it (or at the tail if there is none),
 * so entries of equal priority keep their registration order.
 *
 * Returns 0 on success, -EEXIST (with a warning) if @n is already on the
 * chain, or -EBUSY if @unique_priority is set and an entry with the same
 * priority is encountered.  Callers provide the locking.
 */
static int notifier_chain_register(struct notifier_block **nl,
                                   struct notifier_block *n,
                                   bool unique_priority)
{
        struct notifier_block *cur;

        for (; (cur = *nl) != NULL; nl = &cur->next) {
                if (unlikely(cur == n)) {
                        WARN(1, "notifier callback %ps already registered",
                             n->notifier_call);
                        return -EEXIST;
                }
                /* First lower-priority entry: this is the insertion point. */
                if (n->priority > cur->priority)
                        break;
                if (unique_priority && n->priority == cur->priority)
                        return -EBUSY;
        }

        /* Set up the new entry's link before publishing it to readers. */
        n->next = *nl;
        rcu_assign_pointer(*nl, n);
        trace_notifier_register((void *)n->notifier_call);
        return 0;
}

/*
 * Unlink @n from the chain whose head pointer is @nl.
 *
 * Returns 0 once @n has been unlinked, or -ENOENT if it is not on the
 * chain.  Callers provide the locking.
 */
static int notifier_chain_unregister(struct notifier_block **nl,
                struct notifier_block *n)
{
        struct notifier_block **link;

        for (link = nl; *link != NULL; link = &((*link)->next)) {
                if (*link != n)
                        continue;

                /* Bypass @n by pointing its predecessor at its successor. */
                rcu_assign_pointer(*link, n->next);
                trace_notifier_unregister((void *)n->notifier_call);
                return 0;
        }
        return -ENOENT;
}

/**
 * notifier_call_chain - Informs the registered notifiers about an event.
 * @nl:         Pointer to head of the notifier chain
 * @val:        Value passed unmodified to notifier function
 * @v:          Pointer passed unmodified to notifier function
 * @nr_to_call: Number of notifier functions to be called.  Pass -1 when
 *              the number does not matter.
 * @nr_calls:   If not NULL, incremented once for every notifier function
 *              that was called.
 *
 * Walks the chain in order and stops early when a notifier returns a
 * value with %NOTIFY_STOP_MASK set or when @nr_to_call is used up.
 *
 * Return:      the value returned by the last notifier function called,
 *              or %NOTIFY_DONE if none was called.
 */
static int notifier_call_chain(struct notifier_block **nl,
                               unsigned long val, void *v,
                               int nr_to_call, int *nr_calls)
{
        struct notifier_block *cur, *following;
        int rc = NOTIFY_DONE;

        for (cur = rcu_dereference_raw(*nl); cur && nr_to_call; cur = following) {
                /* Fetch the successor before the callback gets to run. */
                following = rcu_dereference_raw(cur->next);

#ifdef CONFIG_DEBUG_NOTIFIERS
                /* Skip such an entry; it does not consume @nr_to_call. */
                if (unlikely(!func_ptr_is_kernel_text(cur->notifier_call))) {
                        WARN(1, "Invalid notifier called!");
                        continue;
                }
#endif
                trace_notifier_run((void *)cur->notifier_call);
                rc = cur->notifier_call(cur, val, v);

                if (nr_calls)
                        (*nr_calls)++;

                if (rc & NOTIFY_STOP_MASK)
                        break;

                nr_to_call--;
        }
        return rc;
}
NOKPROBE_SYMBOL(notifier_call_chain);

/**
 * notifier_call_chain_robust - Inform the registered notifiers about an event
 *                              and rollback on error.
 * @nl:         Pointer to head of the notifier chain
 * @val_up:     Value passed unmodified to the notifier function
 * @val_down:   Value passed unmodified to the notifier function when
 *              recovering from an error on @val_up
 * @v:          Pointer passed unmodified to the notifier function
 *
 * NOTE:        It is important the @nl chain doesn't change between the two
 *              invocations of notifier_call_chain() such that we visit the
 *              exact same notifier callbacks; this rules out any RCU usage.
 *
 * Return:      the return value of the @val_up call.
 */
static int notifier_call_chain_robust(struct notifier_block **nl,
                                     unsigned long val_up, unsigned long val_down,
                                     void *v)
{
        int called = 0;
        int rc;

        rc = notifier_call_chain(nl, val_up, v, -1, &called);
        if (!(rc & NOTIFY_STOP_MASK))
                return rc;

        /*
         * @called includes the notifier that stopped the chain, so only
         * the ones in front of it are sent @val_down.
         */
        notifier_call_chain(nl, val_down, v, called - 1, NULL);

        return rc;
}

/*
 *        Atomic notifier chain routines.  Registration and unregistration
 *        use a spinlock, and call_chain is synchronized by RCU (no locks).
 */

/**
 * atomic_notifier_chain_register - Add notifier to an atomic notifier chain
 * @nh: Pointer to head of the atomic notifier chain
 * @n: New entry in notifier chain
 *
 * Adds a notifier to an atomic notifier chain.  The chain is modified
 * with @nh->lock held and interrupts disabled.
 *
 * Return: 0 on success, %-EEXIST on error.
 */
int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
                struct notifier_block *n)
{
        unsigned long irqflags;
        int err;

        spin_lock_irqsave(&nh->lock, irqflags);
        err = notifier_chain_register(&nh->head, n, false);
        spin_unlock_irqrestore(&nh->lock, irqflags);

        return err;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);

/**
 * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic
 *                                              notifier chain
 * @nh: Pointer to head of the atomic notifier chain
 * @n: New entry in notifier chain
 *
 * Adds a notifier to an atomic notifier chain if there is no other
 * notifier registered using the same priority.  The chain is modified
 * with @nh->lock held and interrupts disabled.
 *
 * Return: 0 on success, %-EEXIST or %-EBUSY on error.
 */
int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh,
                                               struct notifier_block *n)
{
        unsigned long irqflags;
        int err;

        spin_lock_irqsave(&nh->lock, irqflags);
        err = notifier_chain_register(&nh->head, n, true);
        spin_unlock_irqrestore(&nh->lock, irqflags);

        return err;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio);

/**
 * atomic_notifier_chain_unregister - Remove notifier from an atomic
 *                                    notifier chain
 * @nh: Pointer to head of the atomic notifier chain
 * @n: Entry to remove from notifier chain
 *
 * Removes a notifier from an atomic notifier chain.  After the lock has
 * been dropped, synchronize_rcu() is called whether or not @n was found.
 *
 * Return: zero on success or %-ENOENT on failure.
 */
int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
                struct notifier_block *n)
{
        unsigned long irqflags;
        int err;

        spin_lock_irqsave(&nh->lock, irqflags);
        err = notifier_chain_unregister(&nh->head, n);
        spin_unlock_irqrestore(&nh->lock, irqflags);

        synchronize_rcu();

        return err;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);

/**
 * atomic_notifier_call_chain - Call functions in an atomic notifier chain
 * @nh: Pointer to head of the atomic notifier chain
 * @val: Value passed unmodified to notifier function
 * @v: Pointer passed unmodified to notifier function
 *
 * Calls each function in a notifier chain in turn.  The functions
 * run in an atomic context, so they must not block.
 * This routine uses RCU to synchronize with changes to the chain.
 *
 * Return: if a notifier function returns a value that has
 * %NOTIFY_STOP_MASK set, the walk stops immediately and that value is
 * returned.  Otherwise the return value is that of the last notifier
 * function called.
 */
int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
                               unsigned long val, void *v)
{
        int verdict;

        rcu_read_lock();
        verdict = notifier_call_chain(&nh->head, val, v, -1, NULL);
        rcu_read_unlock();

        return verdict;
}
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
NOKPROBE_SYMBOL(atomic_notifier_call_chain);

/**
 *        atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty
 *        @nh: Pointer to head of the atomic notifier chain
 *
 *        Checks whether notifier chain is empty.
 *
 *        Returns true is notifier chain is empty, false otherwise.
 */
bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh)
{
        return !rcu_access_pointer(nh->head);
}

/*
 *        Blocking notifier chain routines.  All access to the chain is
 *        synchronized by an rwsem.
 */

/*
 * Common registration path for blocking notifier chains; @unique_priority
 * is handed through to notifier_chain_register().
 */
static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                                              struct notifier_block *n,
                                              bool unique_priority)
{
        int err;

        if (unlikely(system_state == SYSTEM_BOOTING)) {
                /*
                 * This code gets used during boot-up, when task switching
                 * is not yet working and interrupts must remain disabled.
                 * At such times we must not call down_write().
                 */
                err = notifier_chain_register(&nh->head, n, unique_priority);
        } else {
                down_write(&nh->rwsem);
                err = notifier_chain_register(&nh->head, n, unique_priority);
                up_write(&nh->rwsem);
        }

        return err;
}

/**
 * blocking_notifier_chain_register - Add notifier to a blocking notifier chain
 * @nh: Pointer to head of the blocking notifier chain
 * @n: New entry in notifier chain
 *
 * Adds a notifier to a blocking notifier chain.  Several notifiers may
 * share a priority (the common helper is called with unique_priority
 * set to false).
 * Must be called in process context.
 *
 * Return: 0 on success, %-EEXIST on error.
 */
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
        return __blocking_notifier_chain_register(nh, n, false);
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);

/**
 * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking
 *                                                notifier chain
 * @nh: Pointer to head of the blocking notifier chain
 * @n: New entry in notifier chain
 *
 * Adds a notifier to a blocking notifier chain if there is no other
 * notifier registered using the same priority (the common helper is
 * called with unique_priority set to true).
 *
 * Return: 0 on success, %-EEXIST or %-EBUSY on error.
 */
int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh,
                                                 struct notifier_block *n)
{
        return __blocking_notifier_chain_register(nh, n, true);
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio);

/**
 * blocking_notifier_chain_unregister - Remove notifier from a blocking
 *                                      notifier chain
 * @nh: Pointer to head of the blocking notifier chain
 * @n: Entry to remove from notifier chain
 *
 * Removes a notifier from a blocking notifier chain.
 * Must be called from process context.
 *
 * Return: zero on success or %-ENOENT on failure.
 */
int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
        int err;

        if (unlikely(system_state == SYSTEM_BOOTING)) {
                /*
                 * This code gets used during boot-up, when task switching
                 * is not yet working and interrupts must remain disabled.
                 * At such times we must not call down_write().
                 */
                err = notifier_chain_unregister(&nh->head, n);
        } else {
                down_write(&nh->rwsem);
                err = notifier_chain_unregister(&nh->head, n);
                up_write(&nh->rwsem);
        }

        return err;
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);

/**
 * blocking_notifier_call_chain_robust - Call functions in a blocking
 *                                       notifier chain, with rollback
 * @nh: Pointer to head of the blocking notifier chain
 * @val_up: Value passed unmodified to the notifier function
 * @val_down: Value passed unmodified to the notifier function when
 *            recovering from an error on @val_up
 * @v: Pointer passed unmodified to the notifier function
 *
 * Runs notifier_call_chain_robust() on the chain with @nh->rwsem held
 * for reading.
 *
 * Return: %NOTIFY_DONE if the chain looked empty, otherwise the return
 * value of the @val_up call.
 */
int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v)
{
        int verdict;

        /*
         * We check the head outside the lock, but if this access is
         * racy then it does not matter what the result of the test
         * is, we re-check the list after having taken the lock anyway:
         */
        if (!rcu_access_pointer(nh->head))
                return NOTIFY_DONE;

        down_read(&nh->rwsem);
        verdict = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
        up_read(&nh->rwsem);

        return verdict;
}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust);

/**
 * blocking_notifier_call_chain - Call functions in a blocking notifier chain
 * @nh: Pointer to head of the blocking notifier chain
 * @val: Value passed unmodified to notifier function
 * @v: Pointer passed unmodified to notifier function
 *
 * Calls each function in a notifier chain in turn.  The functions
 * run in a process context, so they are allowed to block.
 *
 * Return: %NOTIFY_DONE if the chain looked empty.  If a notifier
 * function returns a value that has %NOTIFY_STOP_MASK set, the walk
 * stops immediately and that value is returned.  Otherwise the return
 * value is that of the last notifier function called.
 */
int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
                unsigned long val, void *v)
{
        int verdict;

        /*
         * We check the head outside the lock, but if this access is
         * racy then it does not matter what the result of the test
         * is, we re-check the list after having taken the lock anyway:
         */
        if (!rcu_access_pointer(nh->head))
                return NOTIFY_DONE;

        down_read(&nh->rwsem);
        verdict = notifier_call_chain(&nh->head, val, v, -1, NULL);
        up_read(&nh->rwsem);

        return verdict;
}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);

/*
 *        Raw notifier chain routines.  There is no protection;
 *        the caller must provide it.  Use at your own risk!
 */

/**
 * raw_notifier_chain_register - Add notifier to a raw notifier chain
 * @nh: Pointer to head of the raw notifier chain
 * @n: New entry in notifier chain
 *
 * Adds a notifier to a raw notifier chain by calling
 * notifier_chain_register() directly, with unique_priority set to false.
 * All locking must be provided by the caller.
 *
 * Return: 0 on success, %-EEXIST on error.
 */
int raw_notifier_chain_register(struct raw_notifier_head *nh,
                struct notifier_block *n)
{
        return notifier_chain_register(&nh->head, n, false);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);

/**
 * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
 * @nh: Pointer to head of the raw notifier chain
 * @n: Entry to remove from notifier chain
 *
 * Removes a notifier from a raw notifier chain by calling
 * notifier_chain_unregister() directly.
 * All locking must be provided by the caller.
 *
 * Return: zero on success or %-ENOENT on failure.
 */
int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
                struct notifier_block *n)
{
        return notifier_chain_unregister(&nh->head, n);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);

/*
 * Raw variant of the robust call chain: hands @nh->head straight to
 * notifier_call_chain_robust(), which calls the notifiers with @val_up
 * and, if one of them stops the chain, calls the preceding ones with
 * @val_down.  No locking is done here.  Returns the result of the
 * @val_up call.
 */
int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v)
{
        return notifier_call_chain_robust(&nh->head, val_up, val_down, v);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust);

/**
 * raw_notifier_call_chain - Call functions in a raw notifier chain
 * @nh: Pointer to head of the raw notifier chain
 * @val: Value passed unmodified to notifier function
 * @v: Pointer passed unmodified to notifier function
 *
 * Calls each function in a notifier chain in turn.  The functions
 * run in an undefined context.
 * All locking must be provided by the caller.
 *
 * Return: if a notifier function returns a value that has
 * %NOTIFY_STOP_MASK set, the walk stops immediately and that value is
 * returned.  Otherwise the return value is that of the last notifier
 * function called.
 */
int raw_notifier_call_chain(struct raw_notifier_head *nh,
                unsigned long val, void *v)
{
        /* -1 / NULL: no call limit and no call counting. */
        return notifier_call_chain(&nh->head, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);

/*
 *        SRCU notifier chain routines.    Registration and unregistration
 *        use a mutex, and call_chain is synchronized by SRCU (no locks).
 */

/**
 * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
 * @nh: Pointer to head of the SRCU notifier chain
 * @n: New entry in notifier chain
 *
 * Adds a notifier to an SRCU notifier chain.
 * Must be called in process context.
 *
 * Return: 0 on success, %-EEXIST on error.
 */
int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
                struct notifier_block *n)
{
        int err;

        if (unlikely(system_state == SYSTEM_BOOTING)) {
                /*
                 * This code gets used during boot-up, when task switching
                 * is not yet working and interrupts must remain disabled.
                 * At such times we must not call mutex_lock().
                 */
                err = notifier_chain_register(&nh->head, n, false);
        } else {
                mutex_lock(&nh->mutex);
                err = notifier_chain_register(&nh->head, n, false);
                mutex_unlock(&nh->mutex);
        }

        return err;
}
EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);

/**
 * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
 * @nh: Pointer to head of the SRCU notifier chain
 * @n: Entry to remove from notifier chain
 *
 * Removes a notifier from an SRCU notifier chain.  Outside of boot-up,
 * synchronize_srcu() is called after the mutex has been released,
 * whether or not @n was found.
 * Must be called from process context.
 *
 * Return: zero on success or %-ENOENT on failure.
 */
int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
                struct notifier_block *n)
{
        int err;

        if (likely(system_state != SYSTEM_BOOTING)) {
                mutex_lock(&nh->mutex);
                err = notifier_chain_unregister(&nh->head, n);
                mutex_unlock(&nh->mutex);
                synchronize_srcu(&nh->srcu);
                return err;
        }

        /*
         * This code gets used during boot-up, when task switching is
         * not yet working and interrupts must remain disabled.  At
         * such times we must not call mutex_lock().
         */
        return notifier_chain_unregister(&nh->head, n);
}
EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);

/**
 * srcu_notifier_call_chain - Call functions in an SRCU notifier chain
 * @nh: Pointer to head of the SRCU notifier chain
 * @val: Value passed unmodified to notifier function
 * @v: Pointer passed unmodified to notifier function
 *
 * Calls each function in a notifier chain in turn.  The functions
 * run in a process context, so they are allowed to block.
 *
 * Return: if a notifier function returns a value that has
 * %NOTIFY_STOP_MASK set, the walk stops immediately and that value is
 * returned.  Otherwise the return value is that of the last notifier
 * function called.
 */
int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
                unsigned long val, void *v)
{
        int srcu_idx;
        int verdict;

        /* The walk happens inside an SRCU read-side critical section. */
        srcu_idx = srcu_read_lock(&nh->srcu);
        verdict = notifier_call_chain(&nh->head, val, v, -1, NULL);
        srcu_read_unlock(&nh->srcu, srcu_idx);

        return verdict;
}
EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);

/**
 * srcu_init_notifier_head - Initialize an SRCU notifier head
 * @nh: Pointer to head of the srcu notifier chain
 *
 * Unlike other sorts of notifier heads, SRCU notifier heads require
 * dynamic initialization.  Be sure to call this routine before
 * calling any of the other SRCU notifier routines for this head.
 *
 * If an SRCU notifier head is deallocated, it must first be cleaned
 * up by calling srcu_cleanup_notifier_head().  Otherwise the head's
 * per-cpu data (used by the SRCU mechanism) will leak.
 *
 * A failure of init_srcu_struct() is treated as fatal (BUG()).
 */
void srcu_init_notifier_head(struct srcu_notifier_head *nh)
{
        int err;

        mutex_init(&nh->mutex);

        err = init_srcu_struct(&nh->srcu);
        if (err < 0)
                BUG();

        /* Start out with an empty chain. */
        nh->head = NULL;
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);

/*
 * Atomic notifier chain called by notify_die(); entries are added and
 * removed with register_die_notifier() / unregister_die_notifier().
 */
static ATOMIC_NOTIFIER_HEAD(die_chain);

/*
 * Bundle the arguments into a struct die_args and run the die_chain
 * notifiers with @val as the event and the bundle as the data pointer.
 * Emits an RCU lockdep warning if rcu_is_watching() is false.
 *
 * Returns the result of atomic_notifier_call_chain().
 */
int notrace notify_die(enum die_val val, const char *str,
               struct pt_regs *regs, long err, int trap, int sig)
{
        struct die_args info = {
                .regs   = regs,
                .str    = str,
                .err    = err,
                .trapnr = trap,
                .signr  = sig,
        };

        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                           "notify_die called but RCU thinks we're quiescent");

        return atomic_notifier_call_chain(&die_chain, val, &info);
}
NOKPROBE_SYMBOL(notify_die);

/*
 * Add @nb to the die_chain atomic notifier chain.  Returns whatever
 * atomic_notifier_chain_register() returns.
 */
int register_die_notifier(struct notifier_block *nb)
{
        return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);

/*
 * Remove @nb from the die_chain atomic notifier chain.  Returns whatever
 * atomic_notifier_chain_unregister() returns.
 */
int unregister_die_notifier(struct notifier_block *nb)
{
        return atomic_notifier_chain_unregister(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_die_notifier);
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 





    3 


    3 



























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  linux/drivers/net/netconsole.c
 *
 *  Copyright (C) 2001  Ingo Molnar <mingo@redhat.com>
 *
 *  This file contains the implementation of an IRQ-safe, crash-safe
 *  kernel console implementation that outputs kernel messages to the
 *  network.
 *
 * Modification history:
 *
 * 2001-09-17    started by Ingo Molnar.
 * 2003-08-11    2.6 port by Matt Mackall
 *               simplified options
 *               generic card hooks
 *               works non-modular
 * 2003-09-07    rewritten with netpoll api
 */

/****************************************************************
 *
 ****************************************************************/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/console.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/netpoll.h>
#include <linux/inet.h>
#include <linux/configfs.h>
#include <linux/etherdevice.h>
#include <linux/utsname.h>

MODULE_AUTHOR("Maintainer: Matt Mackall <mpm@selenic.com>");
MODULE_DESCRIPTION("Console driver for network interfaces");
MODULE_LICENSE("GPL");

#define MAX_PARAM_LENGTH                256
#define MAX_USERDATA_ENTRY_LENGTH        256
#define MAX_USERDATA_VALUE_LENGTH        200
/* The number 3 comes from userdata entry format characters (' ', '=', '\n') */
#define MAX_USERDATA_NAME_LENGTH        (MAX_USERDATA_ENTRY_LENGTH - \
                                        MAX_USERDATA_VALUE_LENGTH - 3)
#define MAX_USERDATA_ITEMS                16
#define MAX_PRINT_CHUNK                        1000

/*
 * Raw netconsole configuration string.  Exposed as the "netconsole" module
 * parameter; when built in, it is filled from the "netconsole=" boot
 * option by option_setup().
 */
static char config[MAX_PARAM_LENGTH];
module_param_string(netconsole, config, MAX_PARAM_LENGTH, 0);
MODULE_PARM_DESC(netconsole, " netconsole=[src-port]@[src-ip]/[dev],[tgt-port]@<tgt-ip>/[tgt-macaddr]");

/* "oops_only" module parameter (mode 0600); see its description below. */
static bool oops_only;
module_param(oops_only, bool, 0600);
MODULE_PARM_DESC(oops_only, "Only log oops messages");

/* NOTE(review): presumably a name prefix for targets created from the
 * option string - users are outside this excerpt, confirm. */
#define NETCONSOLE_PARAM_TARGET_PREFIX "cmdline"

#ifndef MODULE
/*
 * Boot-time handler for "netconsole=": stash the option text in config
 * (truncated to the buffer size by strscpy()).  Always returns 1.
 */
static int __init option_setup(char *opt)
{
        strscpy(config, opt, sizeof(config));
        return 1;
}
__setup("netconsole=", option_setup);
#endif /* MODULE */

/* Linked list of all configured targets (struct netconsole_target.list) */
static LIST_HEAD(target_list);

/* This needs to be a spinlock because write_msg() cannot sleep */
static DEFINE_SPINLOCK(target_list_lock);

/*
 * Console driver for extended netconsoles.  Registered on the first use to
 * avoid unnecessarily enabling ext message formatting.  Only declared here;
 * the initializer is elsewhere in the file.
 */
static struct console netconsole_ext;

/**
 * struct netconsole_target - Represents a configured netconsole target.
 * @list:        Links this target into the target_list.
 * @group:        Links us into the configfs subsystem hierarchy.
 *                Only present with CONFIG_NETCONSOLE_DYNAMIC.
 * @userdata_group:        Links to the userdata configfs hierarchy.
 *                Only present with CONFIG_NETCONSOLE_DYNAMIC.
 * @userdata_complete:        Cached, formatted userdata string; the buffer
 *                has room for MAX_USERDATA_ITEMS entries of
 *                MAX_USERDATA_ENTRY_LENGTH bytes each.
 *                Only present with CONFIG_NETCONSOLE_DYNAMIC.
 * @userdata_length:        String length of userdata_complete.
 *                Only present with CONFIG_NETCONSOLE_DYNAMIC.
 * @enabled:        On / off knob to enable / disable target.
 *                Visible from userspace (read-write).
 *                We maintain a strict 1:1 correspondence between this and
 *                whether the corresponding netpoll is active or inactive.
 *                Also, other parameters of a target may be modified at
 *                runtime only when it is disabled (enabled == 0).
 * @extended:        Denotes whether console is extended or not.
 * @release:        Denotes whether kernel release version should be prepended
 *                to the message. Depends on extended console.
 * @np:                The netpoll structure for this target.
 *                Contains the other userspace visible parameters:
 *                dev_name        (read-write)
 *                local_port        (read-write)
 *                remote_port        (read-write)
 *                local_ip        (read-write)
 *                remote_ip        (read-write)
 *                local_mac        (read-only)
 *                remote_mac        (read-write)
 */
struct netconsole_target {
        struct list_head        list;
#ifdef        CONFIG_NETCONSOLE_DYNAMIC
        struct config_group        group;
        struct config_group        userdata_group;
        char userdata_complete[MAX_USERDATA_ENTRY_LENGTH * MAX_USERDATA_ITEMS];
        size_t                        userdata_length;
#endif
        bool                        enabled;
        bool                        extended;
        bool                        release;
        struct netpoll                np;
};

#ifdef        CONFIG_NETCONSOLE_DYNAMIC

/* Forward declaration; the subsystem itself is defined further down. */
static struct configfs_subsystem netconsole_subsys;
/* Taken by the configfs store handlers while they inspect or modify a target. */
static DEFINE_MUTEX(dynamic_netconsole_mutex);

/*
 * Initialize the root group and mutex of the netconsole configfs subsystem
 * and register it.  Returns the result of configfs_register_subsystem().
 */
static int __init dynamic_netconsole_init(void)
{
        config_group_init(&netconsole_subsys.su_group);
        mutex_init(&netconsole_subsys.su_mutex);
        return configfs_register_subsystem(&netconsole_subsys);
}

/* Unregister the netconsole configfs subsystem. */
static void __exit dynamic_netconsole_exit(void)
{
        configfs_unregister_subsystem(&netconsole_subsys);
}

/*
 * Targets that were created by parsing the boot/module option string
 * do not exist in the configfs hierarchy (and have NULL names) and will
 * never go away, so make these a no-op for them.
 */
static void netconsole_target_get(struct netconsole_target *nt)
{
        if (config_item_name(&nt->group.cg_item))
                config_group_get(&nt->group);
}

static void netconsole_target_put(struct netconsole_target *nt)
{
        if (config_item_name(&nt->group.cg_item))
                config_group_put(&nt->group);
}

#else        /* !CONFIG_NETCONSOLE_DYNAMIC */

/* Without CONFIG_NETCONSOLE_DYNAMIC there is nothing to register; report success. */
static int __init dynamic_netconsole_init(void)
{
        return 0;
}

/* Without CONFIG_NETCONSOLE_DYNAMIC there is nothing to unregister. */
static void __exit dynamic_netconsole_exit(void)
{
}

/*
 * No danger of targets going away from under us when dynamic
 * reconfigurability is off, so taking a reference is a no-op.
 */
static void netconsole_target_get(struct netconsole_target *nt)
{
}

/* Dropping a reference is a no-op when dynamic reconfigurability is off. */
static void netconsole_target_put(struct netconsole_target *nt)
{
}

/* No configfs item to set up when dynamic reconfigurability is off. */
static void populate_configfs_item(struct netconsole_target *nt,
                                   int cmdline_count)
{
}
#endif        /* CONFIG_NETCONSOLE_DYNAMIC */

/* Allocate and initialize with defaults.
 * Note that these targets get their config_item fields zeroed-out.
 */
static struct netconsole_target *alloc_and_init(void)
{
        struct netconsole_target *nt;

        nt = kzalloc(sizeof(*nt), GFP_KERNEL);
        if (!nt)
                return nt;

        if (IS_ENABLED(CONFIG_NETCONSOLE_EXTENDED_LOG))
                nt->extended = true;
        if (IS_ENABLED(CONFIG_NETCONSOLE_PREPEND_RELEASE))
                nt->release = true;

        nt->np.name = "netconsole";
        strscpy(nt->np.dev_name, "eth0", IFNAMSIZ);
        nt->np.local_port = 6665;
        nt->np.remote_port = 6666;
        eth_broadcast_addr(nt->np.remote_mac);

        return nt;
}

#ifdef        CONFIG_NETCONSOLE_DYNAMIC

/*
 * Our subsystem hierarchy is:
 *
 * /sys/kernel/config/netconsole/
 *                                |
 *                                <target>/
 *                                |        enabled
 *                                |        extended
 *                                |        release
 *                                |        dev_name
 *                                |        local_port
 *                                |        remote_port
 *                                |        local_ip
 *                                |        remote_ip
 *                                |        local_mac
 *                                |        remote_mac
 *                                |        userdata/
 *                                |                <key>/
 *                                |                        value
 *                                |                ...
 *                                |
 *                                <target>/...
 */

/*
 * Map a configfs item to the netconsole_target that embeds its
 * config_group.  Returns NULL when to_config_group() yields no group.
 */
static struct netconsole_target *to_target(struct config_item *item)
{
        struct config_group *grp = to_config_group(item);

        return grp ? container_of(grp, struct netconsole_target, group) : NULL;
}

/*
 * Strip a single trailing newline from @s, if there is one.
 *
 * @s:      string to modify in place
 * @maxlen: maximum number of bytes of @s to examine
 *
 * Only the last character within the first @maxlen bytes is considered.
 * An empty string (or @maxlen == 0) is left untouched; without that guard
 * the check would read, and possibly overwrite, the byte before @s.
 */
static void trim_newline(char *s, size_t maxlen)
{
        size_t len;

        len = strnlen(s, maxlen);
        if (len && s[len - 1] == '\n')
                s[len - 1] = '\0';
}

/*
 * Attribute operations for netconsole_target.
 */

static ssize_t enabled_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%d\n", to_target(item)->enabled);
}

static ssize_t extended_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%d\n", to_target(item)->extended);
}

static ssize_t release_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%d\n", to_target(item)->release);
}

static ssize_t dev_name_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%s\n", to_target(item)->np.dev_name);
}

static ssize_t local_port_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%d\n", to_target(item)->np.local_port);
}

static ssize_t remote_port_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%d\n", to_target(item)->np.remote_port);
}

/*
 * configfs show handler: print the local IP address, formatted as IPv6
 * when the target's netpoll is flagged ipv6 and as IPv4 otherwise.
 */
static ssize_t local_ip_show(struct config_item *item, char *buf)
{
        struct netconsole_target *nt = to_target(item);

        if (!nt->np.ipv6)
                return sysfs_emit(buf, "%pI4\n", &nt->np.local_ip);

        return sysfs_emit(buf, "%pI6c\n", &nt->np.local_ip.in6);
}

/*
 * configfs show handler: print the remote IP address, formatted as IPv6
 * when the target's netpoll is flagged ipv6 and as IPv4 otherwise.
 */
static ssize_t remote_ip_show(struct config_item *item, char *buf)
{
        struct netconsole_target *nt = to_target(item);

        if (!nt->np.ipv6)
                return sysfs_emit(buf, "%pI4\n", &nt->np.remote_ip);

        return sysfs_emit(buf, "%pI6c\n", &nt->np.remote_ip.in6);
}

/*
 * configfs show handler: print the MAC address of the netpoll device, or
 * the broadcast address when the target has no device attached.
 */
static ssize_t local_mac_show(struct config_item *item, char *buf)
{
        static const u8 bcast[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
        struct net_device *dev = to_target(item)->np.dev;
        const u8 *mac = bcast;

        if (dev)
                mac = dev->dev_addr;

        return sysfs_emit(buf, "%pM\n", mac);
}

static ssize_t remote_mac_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%pM\n", to_target(item)->np.remote_mac);
}

/*
 * configfs store handler for "enabled".
 *
 * This one is special -- targets created through the configfs interface
 * are not enabled (and the corresponding netpoll activated) by default.
 * The user is expected to set the desired parameters first (which
 * would enable him to dynamically add new netpoll targets for new
 * network interfaces as and when they come up).
 *
 * Enabling sets up the netpoll (registering the extended console first if
 * the target is extended and that console is not registered yet).
 * Disabling clears the flag under target_list_lock and then cleans up the
 * netpoll.
 *
 * Returns strnlen(buf, count) on success.  On failure returns the error
 * from kstrtobool() or netpoll_setup(), or -EINVAL when the target is
 * already in the requested state or when release is set without extended.
 */
static ssize_t enabled_store(struct config_item *item,
                const char *buf, size_t count)
{
        struct netconsole_target *nt = to_target(item);
        unsigned long flags;
        bool enabled;
        int err;

        mutex_lock(&dynamic_netconsole_mutex);
        err = kstrtobool(buf, &enabled);
        if (err)
                goto out_unlock;

        err = -EINVAL;
        if (enabled == nt->enabled) {
                pr_info("network logging has already %s\n",
                        nt->enabled ? "started" : "stopped");
                goto out_unlock;
        }

        if (enabled) {        /* true */
                if (nt->release && !nt->extended) {
                        pr_err("Not enabling netconsole. Release feature requires extended log message\n");
                        goto out_unlock;
                }

                if (nt->extended && !console_is_registered(&netconsole_ext))
                        register_console(&netconsole_ext);

                /*
                 * Skip netpoll_parse_options() -- all the attributes are
                 * already configured via configfs. Just print them out.
                 */
                netpoll_print_options(&nt->np);

                err = netpoll_setup(&nt->np);
                if (err)
                        goto out_unlock;

                pr_info("network logging started\n");
        } else {        /* false */
                /* We need to disable the netconsole before cleaning it up
                 * otherwise we might end up in write_msg() with
                 * nt->np.dev == NULL and nt->enabled == true
                 */
                spin_lock_irqsave(&target_list_lock, flags);
                nt->enabled = false;
                spin_unlock_irqrestore(&target_list_lock, flags);
                netpoll_cleanup(&nt->np);
        }

        nt->enabled = enabled;

        mutex_unlock(&dynamic_netconsole_mutex);
        return strnlen(buf, count);
out_unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return err;
}

/*
 * configfs store handler for "release": parse a boolean and store it in
 * the target.  Rejected with -EINVAL while the target is enabled.
 * Returns strnlen(buf, count) on success or a negative error code.
 */
static ssize_t release_store(struct config_item *item, const char *buf,
                             size_t count)
{
        struct netconsole_target *nt = to_target(item);
        ssize_t ret;
        bool val;

        mutex_lock(&dynamic_netconsole_mutex);

        if (nt->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&nt->group.cg_item));
                ret = -EINVAL;
                goto unlock;
        }

        ret = kstrtobool(buf, &val);
        if (ret)
                goto unlock;

        nt->release = val;
        ret = strnlen(buf, count);
unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/*
 * configfs store handler for "extended": parse a boolean and store it in
 * the target.  Rejected with -EINVAL while the target is enabled.
 * Returns strnlen(buf, count) on success or a negative error code.
 */
static ssize_t extended_store(struct config_item *item, const char *buf,
                size_t count)
{
        struct netconsole_target *nt = to_target(item);
        ssize_t ret;
        bool val;

        mutex_lock(&dynamic_netconsole_mutex);

        if (nt->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&nt->group.cg_item));
                ret = -EINVAL;
                goto unlock;
        }

        ret = kstrtobool(buf, &val);
        if (ret)
                goto unlock;

        nt->extended = val;
        ret = strnlen(buf, count);
unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/*
 * configfs store handler for "dev_name": copy at most IFNAMSIZ bytes of
 * the input into the netpoll device name and strip a trailing newline.
 * Rejected with -EINVAL while the target is enabled; otherwise returns
 * strnlen(buf, count).
 */
static ssize_t dev_name_store(struct config_item *item, const char *buf,
                size_t count)
{
        struct netconsole_target *nt = to_target(item);
        ssize_t ret = -EINVAL;

        mutex_lock(&dynamic_netconsole_mutex);
        if (!nt->enabled) {
                strscpy(nt->np.dev_name, buf, IFNAMSIZ);
                trim_newline(nt->np.dev_name, IFNAMSIZ);
                ret = strnlen(buf, count);
        } else {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&nt->group.cg_item));
        }
        mutex_unlock(&dynamic_netconsole_mutex);

        return ret;
}

/*
 * configfs store handler for "local_port": parse a base-10 u16 straight
 * into the netpoll local port.  Rejected with -EINVAL while the target is
 * enabled.  Returns strnlen(buf, count) on success or the negative error
 * from kstrtou16().
 */
static ssize_t local_port_store(struct config_item *item, const char *buf,
                size_t count)
{
        struct netconsole_target *nt = to_target(item);
        ssize_t ret = -EINVAL;

        mutex_lock(&dynamic_netconsole_mutex);

        if (nt->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&nt->group.cg_item));
                goto unlock;
        }

        ret = kstrtou16(buf, 10, &nt->np.local_port);
        if (ret >= 0)
                ret = strnlen(buf, count);
unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/*
 * configfs store handler for "remote_port": parse a base-10 u16 straight
 * into the netpoll remote port.  Rejected with -EINVAL while the target is
 * enabled.  Returns strnlen(buf, count) on success or the negative error
 * from kstrtou16().
 */
static ssize_t remote_port_store(struct config_item *item,
                const char *buf, size_t count)
{
        struct netconsole_target *nt = to_target(item);
        ssize_t ret = -EINVAL;

        mutex_lock(&dynamic_netconsole_mutex);

        if (nt->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&nt->group.cg_item));
                goto unlock;
        }

        ret = kstrtou16(buf, 10, &nt->np.remote_port);
        if (ret >= 0)
                ret = strnlen(buf, count);
unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/*
 * configfs store handler for "local_ip".
 *
 * Input containing a ':' is parsed as an IPv6 address; it must be followed
 * by end of string or a newline, and on success the target is flagged
 * ipv6.  Input without a ':' is parsed as IPv4, which is refused once the
 * target is flagged ipv6.  Any failure, including an enabled target,
 * returns -EINVAL; success returns strnlen(buf, count).
 */
static ssize_t local_ip_store(struct config_item *item, const char *buf,
                size_t count)
{
        struct netconsole_target *nt = to_target(item);
        ssize_t ret = -EINVAL;
        const char *end;

        mutex_lock(&dynamic_netconsole_mutex);

        if (nt->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&nt->group.cg_item));
                goto unlock;
        }

        if (!strnchr(buf, count, ':')) {
                /* IPv4 text is not accepted for a target already using IPv6 */
                if (nt->np.ipv6)
                        goto unlock;
                nt->np.local_ip.ip = in_aton(buf);
        } else {
                if (in6_pton(buf, count, nt->np.local_ip.in6.s6_addr, -1, &end) <= 0)
                        goto unlock;
                if (*end && *end != '\n') {
                        pr_err("invalid IPv6 address at: <%c>\n", *end);
                        goto unlock;
                }
                nt->np.ipv6 = true;
        }

        ret = strnlen(buf, count);
unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/*
 * configfs store handler for "remote_ip".
 *
 * Input containing a ':' is parsed as an IPv6 address; it must be followed
 * by end of string or a newline, and on success the target is flagged
 * ipv6.  Input without a ':' is parsed as IPv4, which is refused once the
 * target is flagged ipv6.  Any failure, including an enabled target,
 * returns -EINVAL; success returns strnlen(buf, count).
 */
static ssize_t remote_ip_store(struct config_item *item, const char *buf,
               size_t count)
{
        struct netconsole_target *nt = to_target(item);
        ssize_t ret = -EINVAL;
        const char *end;

        mutex_lock(&dynamic_netconsole_mutex);

        if (nt->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&nt->group.cg_item));
                goto unlock;
        }

        if (!strnchr(buf, count, ':')) {
                /* IPv4 text is not accepted for a target already using IPv6 */
                if (nt->np.ipv6)
                        goto unlock;
                nt->np.remote_ip.ip = in_aton(buf);
        } else {
                if (in6_pton(buf, count, nt->np.remote_ip.in6.s6_addr, -1, &end) <= 0)
                        goto unlock;
                if (*end && *end != '\n') {
                        pr_err("invalid IPv6 address at: <%c>\n", *end);
                        goto unlock;
                }
                nt->np.ipv6 = true;
        }

        ret = strnlen(buf, count);
unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/*
 * configfs store handler for "remote_mac": parse a MAC address with
 * mac_pton() into a scratch buffer and copy it into the target only if
 * the character right after the address text is end of string or a
 * newline.  Any failure, including an enabled target, returns -EINVAL;
 * success returns strnlen(buf, count).
 */
static ssize_t remote_mac_store(struct config_item *item, const char *buf,
                size_t count)
{
        struct netconsole_target *nt = to_target(item);
        ssize_t ret = -EINVAL;
        u8 mac[ETH_ALEN];
        char trailer;

        mutex_lock(&dynamic_netconsole_mutex);

        if (nt->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&nt->group.cg_item));
                goto unlock;
        }

        if (!mac_pton(buf, mac))
                goto unlock;

        /* "xx:xx:xx:xx:xx:xx" occupies 3 * ETH_ALEN - 1 characters */
        trailer = buf[3 * ETH_ALEN - 1];
        if (trailer != '\0' && trailer != '\n')
                goto unlock;

        memcpy(nt->np.remote_mac, mac, ETH_ALEN);
        ret = strnlen(buf, count);
unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/*
 * One userdata entry: a configfs item (whose name serves as the key) and
 * its value string of at most MAX_USERDATA_VALUE_LENGTH bytes.
 */
struct userdatum {
        struct config_item item;
        char value[MAX_USERDATA_VALUE_LENGTH];
};

/* Map a configfs item to the userdatum that embeds it. */
static struct userdatum *to_userdatum(struct config_item *item)
{
        return container_of(item, struct userdatum, item);
}

/* Wrapper type for the "userdata" configfs group of a target. */
struct userdata {
        struct config_group group;
};

/* Map a configfs item to the userdata wrapper around its config_group. */
static struct userdata *to_userdata(struct config_item *item)
{
        return container_of(to_config_group(item), struct userdata, group);
}

static struct netconsole_target *userdata_to_target(struct userdata *ud)
{
        struct config_group *netconsole_group;

        netconsole_group = to_config_group(ud->group.cg_item.ci_parent);
        return to_target(&netconsole_group->cg_item);
}

static ssize_t userdatum_value_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%s\n", &(to_userdatum(item)->value[0]));
}

/*
 * Rebuild the cached userdata string of a target from the children of its
 * userdata group.  Each child with a non-empty value contributes one
 * " <name>=<value>\n" entry; at most MAX_USERDATA_ITEMS children are
 * visited.  userdata_length is updated to the length of the result.
 */
static void update_userdata(struct netconsole_target *nt)
{
        struct list_head *pos;
        int written = 0;
        int seen = 0;

        /* Clear the current string in case the last userdatum was deleted */
        nt->userdata_length = 0;
        nt->userdata_complete[0] = 0;

        list_for_each(pos, &nt->userdata_group.cg_children) {
                struct config_item *child;
                struct userdatum *udm;

                if (seen >= MAX_USERDATA_ITEMS)
                        break;
                seen++;

                child = container_of(pos, struct config_item, ci_entry);
                udm = to_userdatum(child);

                /* Entries without a value contribute nothing */
                if (!strnlen(udm->value, MAX_USERDATA_VALUE_LENGTH))
                        continue;

                /* Each entry is bounded by MAX_USERDATA_ENTRY_LENGTH and the
                 * number of entries by MAX_USERDATA_ITEMS (checked above),
                 * so userdata_complete cannot overflow.
                 */
                written += scnprintf(nt->userdata_complete + written,
                                     MAX_USERDATA_ENTRY_LENGTH, " %s=%s\n",
                                     child->ci_name, udm->value);
        }

        nt->userdata_length = strnlen(nt->userdata_complete,
                                      sizeof(nt->userdata_complete));
}

/*
 * configfs store handler for a userdata entry's "value": copy the input
 * into the entry, strip a trailing newline and rebuild the owning
 * target's cached userdata string.
 *
 * Returns count on success, -EMSGSIZE when count exceeds
 * MAX_USERDATA_VALUE_LENGTH, or the negative error from strscpy().
 */
static ssize_t userdatum_value_store(struct config_item *item, const char *buf,
                                     size_t count)
{
        struct userdatum *udm = to_userdatum(item);
        ssize_t ret;

        if (count > MAX_USERDATA_VALUE_LENGTH)
                return -EMSGSIZE;

        mutex_lock(&dynamic_netconsole_mutex);

        ret = strscpy(udm->value, buf, sizeof(udm->value));
        if (ret >= 0) {
                trim_newline(udm->value, sizeof(udm->value));
                update_userdata(userdata_to_target(to_userdata(item->ci_parent)));
                ret = count;
        }

        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/* The "value" attribute of a userdata entry, backed by the handlers above. */
CONFIGFS_ATTR(userdatum_, value);

/* NULL-terminated attribute list for userdata entries. */
static struct configfs_attribute *userdatum_attrs[] = {
        &userdatum_attr_value,
        NULL,
};

/* configfs release callback: free the userdatum that embeds @item. */
static void userdatum_release(struct config_item *item)
{
        kfree(to_userdatum(item));
}

/* Item operations for userdata entries: only a release callback. */
static struct configfs_item_operations userdatum_ops = {
        .release = userdatum_release,
};

/* configfs type of a single userdata entry (<key>/ with its "value" file). */
static const struct config_item_type userdatum_type = {
        .ct_item_ops        = &userdatum_ops,
        .ct_attrs        = userdatum_attrs,
        .ct_owner        = THIS_MODULE,
};

/*
 * configfs make_item callback for the userdata group: allocate a new
 * userdata entry named @name.
 *
 * Returns the new item, or an ERR_PTR: -ENAMETOOLONG when @name is longer
 * than MAX_USERDATA_NAME_LENGTH, -ENOSPC when the group already holds
 * MAX_USERDATA_ITEMS children, -ENOMEM when the allocation fails.
 */
static struct config_item *userdatum_make_item(struct config_group *group,
                                               const char *name)
{
        struct netconsole_target *nt;
        struct userdatum *udm;

        if (strlen(name) > MAX_USERDATA_NAME_LENGTH)
                return ERR_PTR(-ENAMETOOLONG);

        nt = userdata_to_target(to_userdata(&group->cg_item));
        if (list_count_nodes(&nt->userdata_group.cg_children) >= MAX_USERDATA_ITEMS)
                return ERR_PTR(-ENOSPC);

        udm = kzalloc(sizeof(*udm), GFP_KERNEL);
        if (!udm)
                return ERR_PTR(-ENOMEM);

        config_item_init_type_name(&udm->item, name, &userdatum_type);

        return &udm->item;
}

/*
 * configfs drop_item callback for the userdata group: rebuild the owning
 * target's cached userdata string and drop the reference on @item, both
 * under dynamic_netconsole_mutex.
 */
static void userdatum_drop(struct config_group *group, struct config_item *item)
{
        struct netconsole_target *nt;

        nt = userdata_to_target(to_userdata(&group->cg_item));

        mutex_lock(&dynamic_netconsole_mutex);
        update_userdata(nt);
        config_item_put(item);
        mutex_unlock(&dynamic_netconsole_mutex);
}

/* The userdata group itself exposes no attributes, only child items. */
static struct configfs_attribute *userdata_attrs[] = {
        NULL,
};

/* Group operations: create and remove userdata entries. */
static struct configfs_group_operations userdata_ops = {
        .make_item                = userdatum_make_item,
        .drop_item                = userdatum_drop,
};

/* configfs type of the per-target "userdata" group. */
static struct config_item_type userdata_type = {
        .ct_item_ops        = &userdatum_ops,
        .ct_group_ops        = &userdata_ops,
        .ct_attrs        = userdata_attrs,
        .ct_owner        = THIS_MODULE,
};

/*
 * Attribute definitions for a target, backed by the <name>_show and
 * <name>_store handlers above.  local_mac is read-only.
 */
CONFIGFS_ATTR(, enabled);
CONFIGFS_ATTR(, extended);
CONFIGFS_ATTR(, dev_name);
CONFIGFS_ATTR(, local_port);
CONFIGFS_ATTR(, remote_port);
CONFIGFS_ATTR(, local_ip);
CONFIGFS_ATTR(, remote_ip);
CONFIGFS_ATTR_RO(, local_mac);
CONFIGFS_ATTR(, remote_mac);
CONFIGFS_ATTR(, release);

/* NULL-terminated attribute list exposed in each target directory. */
static struct configfs_attribute *netconsole_target_attrs[] = {
        &attr_enabled,
        &attr_extended,
        &attr_release,
        &attr_dev_name,
        &attr_local_port,
        &attr_remote_port,
        &attr_local_ip,
        &attr_remote_ip,
        &attr_local_mac,
        &attr_remote_mac,
        NULL,
};

/*
 * Item operations and type for netconsole_target.
 */

/* configfs release callback: free the netconsole_target behind @item. */
static void netconsole_target_release(struct config_item *item)
{
        kfree(to_target(item));
}

/* Item operations for a target: only a release callback. */
static struct configfs_item_operations netconsole_target_item_ops = {
        .release                = netconsole_target_release,
};

/* configfs type of a target directory and its attribute files. */
static const struct config_item_type netconsole_target_type = {
        .ct_attrs                = netconsole_target_attrs,
        .ct_item_ops                = &netconsole_target_item_ops,
        .ct_owner                = THIS_MODULE,
};

/*
 * Initialize the configfs groups of a target: the target group itself
 * under @name, and its "userdata" group, which is attached to the target
 * group as a default group.
 */
static void init_target_config_group(struct netconsole_target *nt,
                                     const char *name)
{
        config_group_init_type_name(&nt->group, name, &netconsole_target_type);
        config_group_init_type_name(&nt->userdata_group, "userdata",
                                    &userdata_type);
        configfs_add_default_group(&nt->userdata_group, &nt->group);
}

/*
 * Look up a target in target_list by the name of its configfs item.
 * The list is walked under target_list_lock.  Returns the first match,
 * or NULL when no target carries @name.
 */
static struct netconsole_target *find_cmdline_target(const char *name)
{
        struct netconsole_target *cur, *found = NULL;
        unsigned long flags;

        spin_lock_irqsave(&target_list_lock, flags);
        list_for_each_entry(cur, &target_list, list) {
                if (strcmp(cur->group.cg_item.ci_name, name) != 0)
                        continue;

                found = cur;
                break;
        }
        spin_unlock_irqrestore(&target_list_lock, flags);

        return found;
}

/*
 * Group operations and type for netconsole_subsys.
 */

static struct config_group *make_netconsole_target(struct config_group *group,
                                                   const char *name)
{
        struct netconsole_target *nt;
        unsigned long flags;

        /* Checking if a target by this name was created at boot time.  If so,
         * attach a configfs entry to that target.  This enables dynamic
         * control.
         */
        if (!strncmp(name, NETCONSOLE_PARAM_TARGET_PREFIX,
                     strlen(NETCONSOLE_PARAM_TARGET_PREFIX))) {
                nt = find_cmdline_target(name);
                if (nt) {
                        init_target_config_group(nt, name);
                        return &nt->group;
                }
        }

        nt = alloc_and_init();
        if (!nt)
                return ERR_PTR(-ENOMEM);

        /* Initialize the config_group member */
        init_target_config_group(nt, name);

        /* Adding, but it is disabled */
        spin_lock_irqsave(&target_list_lock, flags);
        list_add(&nt->list, &target_list);
        spin_unlock_irqrestore(&target_list_lock, flags);

        return &nt->group;
}

/*
 * configfs drop_item callback for the netconsole subsystem: unlink the
 * target from target_list, clean up its netpoll if it is still enabled
 * and drop the reference on its config item.
 */
static void drop_netconsole_target(struct config_group *group,
                                   struct config_item *item)
{
        struct netconsole_target *target = to_target(item);
        unsigned long irq_flags;

        spin_lock_irqsave(&target_list_lock, irq_flags);
        list_del(&target->list);
        spin_unlock_irqrestore(&target_list_lock, irq_flags);

        /*
         * A target that was never enabled, or that was disabled by hand
         * before removal, has no netpoll left to clean up.
         */
        if (target->enabled)
                netpoll_cleanup(&target->np);

        config_item_put(&target->group.cg_item);
}

/* Group operations of the subsystem root: create and remove targets. */
static struct configfs_group_operations netconsole_subsys_group_ops = {
        .make_group        = make_netconsole_target,
        .drop_item        = drop_netconsole_target,
};

/* configfs type of the subsystem root directory. */
static const struct config_item_type netconsole_subsys_type = {
        .ct_group_ops        = &netconsole_subsys_group_ops,
        .ct_owner        = THIS_MODULE,
};

/* The netconsole configfs subsystem */
static struct configfs_subsystem netconsole_subsys = {
        .su_group        = {
                .cg_item        = {
                        .ci_namebuf        = "netconsole",
                        .ci_type        = &netconsole_subsys_type,
                },
        },
};

/*
 * Give a target its configfs groups under the name
 * NETCONSOLE_PARAM_TARGET_PREFIX followed by @cmdline_count.
 */
static void populate_configfs_item(struct netconsole_target *nt,
                                   int cmdline_count)
{
        char name[16];

        snprintf(name, sizeof(name), "%s%d",
                 NETCONSOLE_PARAM_TARGET_PREFIX, cmdline_count);

        init_target_config_group(nt, name);
}

#endif        /* CONFIG_NETCONSOLE_DYNAMIC */

/*
 * Handle network interface device notifications.
 *
 * Only NETDEV_CHANGENAME, NETDEV_UNREGISTER, NETDEV_RELEASE and
 * NETDEV_JOIN are acted upon.  For every target bound to the notifying
 * device, a rename refreshes the cached device name, while the other
 * three events tear down the netpoll and disable the target.  A summary
 * is logged if any target was stopped.  Always returns NOTIFY_DONE.
 */
static int netconsole_netdev_event(struct notifier_block *this,
                                   unsigned long event, void *ptr)
{
        unsigned long flags;
        struct netconsole_target *nt;
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        bool stopped = false;

        if (!(event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER ||
              event == NETDEV_RELEASE || event == NETDEV_JOIN))
                goto done;

        spin_lock_irqsave(&target_list_lock, flags);
restart:
        list_for_each_entry(nt, &target_list, list) {
                /* Hold a reference on the target while it is inspected */
                netconsole_target_get(nt);
                if (nt->np.dev == dev) {
                        switch (event) {
                        case NETDEV_CHANGENAME:
                                strscpy(nt->np.dev_name, dev->name, IFNAMSIZ);
                                break;
                        case NETDEV_RELEASE:
                        case NETDEV_JOIN:
                        case NETDEV_UNREGISTER:
                                /* rtnl_lock already held
                                 * we might sleep in __netpoll_cleanup()
                                 */
                                spin_unlock_irqrestore(&target_list_lock, flags);

                                __netpoll_cleanup(&nt->np);

                                spin_lock_irqsave(&target_list_lock, flags);
                                netdev_put(nt->np.dev, &nt->np.dev_tracker);
                                nt->np.dev = NULL;
                                nt->enabled = false;
                                stopped = true;
                                netconsole_target_put(nt);
                                /* The lock was dropped above, so walk the
                                 * list again from the start; this target no
                                 * longer has a device and will not match.
                                 */
                                goto restart;
                        }
                }
                netconsole_target_put(nt);
        }
        spin_unlock_irqrestore(&target_list_lock, flags);
        if (stopped) {
                const char *msg = "had an event";

                switch (event) {
                case NETDEV_UNREGISTER:
                        msg = "unregistered";
                        break;
                case NETDEV_RELEASE:
                        msg = "released slaves";
                        break;
                case NETDEV_JOIN:
                        msg = "is joining a master device";
                        break;
                }
                pr_info("network logging stopped on interface %s as it %s\n",
                        dev->name, msg);
        }

done:
        return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to netconsole_netdev_event(). */
static struct notifier_block netconsole_netdev_notifier = {
        .notifier_call  = netconsole_netdev_event,
};

/**
 * send_ext_msg_udp - send extended log message to target
 * @nt: target to send message to
 * @msg: extended log message to send
 * @msg_len: length of message
 *
 * Transfer extended log @msg to @nt.  If @msg, together with the optional
 * release prefix and the target's userdata, is longer than
 * MAX_PRINT_CHUNK, it'll be split and transmitted in multiple chunks with
 * ncfrag header field added to identify them.
 *
 * When @nt->release is set, the kernel release string and a ',' are
 * prepended.  With CONFIG_NETCONSOLE_DYNAMIC, the target's cached userdata
 * string is appended after the message body.
 */
static void send_ext_msg_udp(struct netconsole_target *nt, const char *msg,
                             int msg_len)
{
        static char buf[MAX_PRINT_CHUNK]; /* protected by target_list_lock */
        const char *header, *body;
        int offset = 0;
        int header_len, body_len;
        const char *msg_ready = msg;
        const char *release;
        int release_len = 0;
        int userdata_len = 0;
        char *userdata = NULL;

#ifdef CONFIG_NETCONSOLE_DYNAMIC
        userdata = nt->userdata_complete;
        userdata_len = nt->userdata_length;
#endif

        if (nt->release) {
                release = init_utsname()->release;
                /* +1 accounts for the ',' separator written after it */
                release_len = strlen(release) + 1;
        }

        if (msg_len + release_len + userdata_len <= MAX_PRINT_CHUNK) {
                /* No fragmentation needed */
                if (nt->release) {
                        scnprintf(buf, MAX_PRINT_CHUNK, "%s,%s", release, msg);
                        msg_len += release_len;
                } else {
                        memcpy(buf, msg, msg_len);
                }

                /* Append the userdata right after the message */
                if (userdata)
                        msg_len += scnprintf(&buf[msg_len],
                                             MAX_PRINT_CHUNK - msg_len,
                                             "%s", userdata);

                msg_ready = buf;
                netpoll_send_udp(&nt->np, msg_ready, msg_len);
                return;
        }

        /* need to insert extra header fields, detect header and body */
        header = msg;
        body = memchr(msg, ';', msg_len);
        if (WARN_ON_ONCE(!body))
                return;

        /* The header ends at the first ';'; the body starts right after it */
        header_len = body - header;
        body_len = msg_len - header_len - 1;
        body++;

        /*
         * Transfer multiple chunks with the following extra header.
         * "ncfrag=<byte-offset>/<total-bytes>"
         */
        if (nt->release)
                scnprintf(buf, MAX_PRINT_CHUNK, "%s,", release);
        memcpy(buf + release_len, header, header_len);
        header_len += release_len;

        /* offset counts bytes of body followed by userdata already sent */
        while (offset < body_len + userdata_len) {
                int this_header = header_len;
                int this_offset = 0;
                int this_chunk = 0;

                /* Rewrite the ncfrag field after the fixed part of the header */
                this_header += scnprintf(buf + this_header,
                                         sizeof(buf) - this_header,
                                         ",ncfrag=%d/%d;", offset,
                                         body_len + userdata_len);

                /* Not all body data has been written yet */
                if (offset < body_len) {
                        this_chunk = min(body_len - offset,
                                         MAX_PRINT_CHUNK - this_header);
                        if (WARN_ON_ONCE(this_chunk <= 0))
                                return;
                        memcpy(buf + this_header, body + offset, this_chunk);
                        this_offset += this_chunk;
                }
                /* Body is fully written and there is pending userdata to write,
                 * append userdata in this chunk
                 */
                if (offset + this_offset >= body_len &&
                    offset + this_offset < userdata_len + body_len) {
                        int sent_userdata = (offset + this_offset) - body_len;
                        int preceding_bytes = this_chunk + this_header;

                        if (WARN_ON_ONCE(sent_userdata < 0))
                                return;

                        /* Fill whatever room is left in this chunk */
                        this_chunk = min(userdata_len - sent_userdata,
                                         MAX_PRINT_CHUNK - preceding_bytes);
                        if (WARN_ON_ONCE(this_chunk <= 0))
                                return;
                        memcpy(buf + this_header + this_offset,
                               userdata + sent_userdata,
                               this_chunk);
                        this_offset += this_chunk;
                }

                netpoll_send_udp(&nt->np, buf, this_header + this_offset);
                offset += this_offset;
        }
}

static void write_ext_msg(struct console *con, const char *msg,
                          unsigned int len)
{
        struct netconsole_target *nt;
        unsigned long flags;

        if ((oops_only && !oops_in_progress) || list_empty(&target_list))
                return;

        spin_lock_irqsave(&target_list_lock, flags);
        list_for_each_entry(nt, &target_list, list)
                if (nt->extended && nt->enabled && netif_running(nt->np.dev))
                        send_ext_msg_udp(nt, msg, len);
        spin_unlock_irqrestore(&target_list_lock, flags);
}

/*
 * ->write() callback of the plain netconsole console. For every target that
 * is not extended, is enabled and whose netpoll device is running, @msg is
 * sent with netpoll_send_udp() in pieces of at most MAX_PRINT_CHUNK bytes.
 */
static void write_msg(struct console *con, const char *msg, unsigned int len)
{
        struct netconsole_target *nt;
        unsigned long flags;
        const char *cursor;
        int remaining, chunk;

        if (oops_only && !oops_in_progress)
                return;
        /* Avoid taking lock and disabling interrupts unnecessarily */
        if (list_empty(&target_list))
                return;

        spin_lock_irqsave(&target_list_lock, flags);
        list_for_each_entry(nt, &target_list, list) {
                if (nt->extended || !nt->enabled || !netif_running(nt->np.dev))
                        continue;

                /*
                 * The whole message is pushed to one target before moving
                 * on to the next, so that at least one target gets as much
                 * logging as possible if we die in here, rather than
                 * advancing all targets in lock-step.
                 */
                cursor = msg;
                remaining = len;
                while (remaining) {
                        chunk = min(remaining, MAX_PRINT_CHUNK);
                        netpoll_send_udp(&nt->np, cursor, chunk);
                        cursor += chunk;
                        remaining -= chunk;
                }
        }
        spin_unlock_irqrestore(&target_list_lock, flags);
}

/* Allocate new target (from boot/module param) and setup netpoll for it */
static struct netconsole_target *alloc_param_target(char *target_config,
                                                    int cmdline_count)
{
        struct netconsole_target *nt;
        int err;

        nt = alloc_and_init();
        if (!nt) {
                err = -ENOMEM;
                goto fail;
        }

        if (*target_config == '+') {
                nt->extended = true;
                target_config++;
        }

        if (*target_config == 'r') {
                if (!nt->extended) {
                        pr_err("Netconsole configuration error. Release feature requires extended log message");
                        err = -EINVAL;
                        goto fail;
                }
                nt->release = true;
                target_config++;
        }

        /* Parse parameters and setup netpoll */
        err = netpoll_parse_options(&nt->np, target_config);
        if (err)
                goto fail;

        err = netpoll_setup(&nt->np);
        if (err)
                goto fail;

        populate_configfs_item(nt, cmdline_count);
        nt->enabled = true;

        return nt;

fail:
        kfree(nt);
        return ERR_PTR(err);
}

/*
 * Cleanup netpoll for given target (from boot/module param) and free it.
 * The caller is expected to have unlinked @nt from target_list already;
 * this function only runs netpoll_cleanup() on nt->np and kfree()s @nt.
 */
static void free_param_target(struct netconsole_target *nt)
{
        netpoll_cleanup(&nt->np);
        kfree(nt);
}

/*
 * Console carrying the CON_EXTENDED flag; its output goes through
 * write_ext_msg(). init_netconsole() registers it only when at least one
 * extended target was configured.
 */
static struct console netconsole_ext = {
        .name        = "netcon_ext",
        .flags        = CON_ENABLED | CON_EXTENDED,
        .write        = write_ext_msg,
};

/*
 * Plain (non-extended) console; its output goes through write_msg().
 * init_netconsole() registers it unconditionally.
 */
static struct console netconsole = {
        .name        = "netcon",
        .flags        = CON_ENABLED,
        .write        = write_msg,
};

/*
 * Module/boot initialisation.
 *
 * Splits the "config" parameter on ';', creates one target per entry, then
 * registers the netdevice notifier, the dynamic (configfs) support and the
 * consoles. On any failure every target created so far is destroyed and the
 * error is returned.
 */
static int __init init_netconsole(void)
{
        struct netconsole_target *nt, *next;
        unsigned int nr_targets = 0;
        bool have_extended = false;
        unsigned long irqflags;
        char *cursor = config;
        char *target_config;
        int err;

        if (strnlen(cursor, MAX_PARAM_LENGTH)) {
                for (target_config = strsep(&cursor, ";"); target_config;
                     target_config = strsep(&cursor, ";")) {
                        struct console *con;

                        nt = alloc_param_target(target_config, nr_targets);
                        if (IS_ERR(nt)) {
                                err = PTR_ERR(nt);
                                goto err_targets;
                        }

                        /* Dump existing printks when we register */
                        con = nt->extended ? &netconsole_ext : &netconsole;
                        con->flags |= CON_PRINTBUFFER;
                        if (nt->extended)
                                have_extended = true;

                        spin_lock_irqsave(&target_list_lock, irqflags);
                        list_add(&nt->list, &target_list);
                        spin_unlock_irqrestore(&target_list_lock, irqflags);
                        nr_targets++;
                }
        }

        err = register_netdevice_notifier(&netconsole_netdev_notifier);
        if (err)
                goto err_targets;

        err = dynamic_netconsole_init();
        if (err)
                goto err_notifier;

        /* The extended console is only registered when a target needs it */
        if (have_extended)
                register_console(&netconsole_ext);
        register_console(&netconsole);
        pr_info("network logging started\n");

        return 0;

err_notifier:
        unregister_netdevice_notifier(&netconsole_netdev_notifier);

err_targets:
        pr_err("cleaning up\n");

        /*
         * Only targets created from the boot/module option exist at this
         * point; unlink and destroy each of them. The list lock is not
         * taken here because netpoll_cleanup() will sleep.
         */
        list_for_each_entry_safe(nt, next, &target_list, list) {
                list_del(&nt->list);
                free_param_target(nt);
        }

        return err;
}

/*
 * Module exit: unregister the consoles, tear down the dynamic support and
 * the netdevice notifier, then destroy whatever targets are still listed.
 */
static void __exit cleanup_netconsole(void)
{
        struct netconsole_target *target, *next;

        /* netconsole_ext is only registered when an extended target exists */
        if (console_is_registered(&netconsole_ext))
                unregister_console(&netconsole_ext);
        unregister_console(&netconsole);

        dynamic_netconsole_exit();
        unregister_netdevice_notifier(&netconsole_netdev_notifier);

        /*
         * Targets created through configfs pin a reference on this module
         * and must be rmdir(2)'ed from userspace first, so by the time we
         * get here only targets from the boot/module option remain. Unlink
         * and destroy them; the list lock is skipped because
         * netpoll_cleanup() will sleep.
         */
        list_for_each_entry_safe(target, next, &target_list, list) {
                list_del(&target->list);
                free_param_target(target);
        }
}

/*
 * Use late_initcall to ensure netconsole is initialized after the
 * network device drivers when built in.
 *
 * late_initcall() and module_init() are identical if built as module.
 * cleanup_netconsole() is the matching module exit handler.
 */
late_initcall(init_netconsole);
module_exit(cleanup_netconsole);


































































































































































































































































































































   36 







































   32 





   32 





   29 




    2 












   10 























































































































    1 

















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* audit.h -- Auditing support
 *
 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
 * All Rights Reserved.
 *
 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
 */
#ifndef _LINUX_AUDIT_H_
#define _LINUX_AUDIT_H_

#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/audit_arch.h>
#include <uapi/linux/audit.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <uapi/linux/fanotify.h>

/*
 * All-ones sentinels in the unsigned long / dev_t domains; going by their
 * names they mark an inode number or device that has not been set.
 */
#define AUDIT_INO_UNSET ((unsigned long)-1)
#define AUDIT_DEV_UNSET ((dev_t)-1)

/*
 * A uid/pid pair followed by a variable-length byte string (@ctx is a
 * flexible array member, so the structure must be allocated with room for it).
 */
struct audit_sig_info {
        uid_t                uid;
        pid_t                pid;
        char                ctx[];
};

/*
 * Forward declarations: this header only uses pointers to these types, so
 * their full definitions are not needed here.
 */
struct audit_buffer;
struct audit_context;
struct inode;
struct netlink_skb_parms;
struct path;
struct linux_binprm;
struct mq_attr;
struct mqstat;
struct audit_watch;
struct audit_tree;
struct sk_buff;
struct kern_ipc_perm;

/*
 * Kernel-side description of one audit rule (going by the name).
 * @fields presumably points at @field_count struct audit_field entries, and
 * @arch_f / @inode_f are shortcuts to particular fields -- confirm against
 * the code that builds rules.
 */
struct audit_krule {
        u32                        pflags;
        u32                        flags;
        u32                        listnr;
        u32                        action;
        u32                        mask[AUDIT_BITMASK_SIZE];
        u32                        buflen; /* for data alloc on list rules */
        u32                        field_count;
        char                        *filterkey; /* ties events to rules */
        struct audit_field        *fields;
        struct audit_field        *arch_f; /* quick access to arch field */
        struct audit_field        *inode_f; /* quick access to an inode field */
        struct audit_watch        *watch;        /* associated watch */
        struct audit_tree        *tree;        /* associated watched tree */
        struct audit_fsnotify_mark        *exe;
        struct list_head        rlist;        /* entry in audit_{watch,tree}.rules list */
        struct list_head        list;        /* for AUDIT_LIST* purposes only */
        u64                        prio;
};

/*
 * Flag to indicate legacy AUDIT_LOGINUID unset usage.
 * NOTE(review): presumably stored in audit_krule.pflags -- confirm.
 */
#define AUDIT_LOGINUID_LEGACY                0x1

/*
 * One typed comparison term. @type and @op sit around an anonymous union
 * holding the operand: a plain u32, a kuid_t, a kgid_t, or an LSM string
 * together with an opaque LSM rule pointer. Which union member is valid is
 * presumably determined by @type -- confirm against the users.
 */
struct audit_field {
        u32                                type;
        union {
                u32                        val;
                kuid_t                        uid;
                kgid_t                        gid;
                struct {
                        char                *lsm_str;
                        void                *lsm_rule;
                };
        };
        u32                                op;
};

/*
 * Index into audit_ntp_data.vals[]; used as the @type argument of
 * audit_ntp_set_old() and audit_ntp_set_new(). AUDIT_NTP_NVALS must stay
 * last because it sizes that array.
 */
enum audit_ntp_type {
        AUDIT_NTP_OFFSET,
        AUDIT_NTP_FREQ,
        AUDIT_NTP_STATUS,
        AUDIT_NTP_TAI,
        AUDIT_NTP_TICK,
        AUDIT_NTP_ADJUST,

        AUDIT_NTP_NVALS /* count */
};

/*
 * With CONFIG_AUDITSYSCALL, audit_ntp_data holds one old/new value pair per
 * enum audit_ntp_type entry. Without it the structure is empty, so callers
 * can still declare one.
 */
#ifdef CONFIG_AUDITSYSCALL
struct audit_ntp_val {
        long long oldval, newval;
};

struct audit_ntp_data {
        struct audit_ntp_val vals[AUDIT_NTP_NVALS];
};
#else
struct audit_ntp_data {};
#endif

/*
 * Operation codes passed as @op to audit_log_nfcfg() / __audit_log_nfcfg().
 * Enumerators carry either an AUDIT_XT_OP_ or an AUDIT_NFT_OP_ prefix;
 * AUDIT_NFT_OP_INVALID is the last entry.
 */
enum audit_nfcfgop {
        AUDIT_XT_OP_REGISTER,
        AUDIT_XT_OP_REPLACE,
        AUDIT_XT_OP_UNREGISTER,
        AUDIT_NFT_OP_TABLE_REGISTER,
        AUDIT_NFT_OP_TABLE_UNREGISTER,
        AUDIT_NFT_OP_CHAIN_REGISTER,
        AUDIT_NFT_OP_CHAIN_UNREGISTER,
        AUDIT_NFT_OP_RULE_REGISTER,
        AUDIT_NFT_OP_RULE_UNREGISTER,
        AUDIT_NFT_OP_SET_REGISTER,
        AUDIT_NFT_OP_SET_UNREGISTER,
        AUDIT_NFT_OP_SETELEM_REGISTER,
        AUDIT_NFT_OP_SETELEM_UNREGISTER,
        AUDIT_NFT_OP_GEN_REGISTER,
        AUDIT_NFT_OP_OBJ_REGISTER,
        AUDIT_NFT_OP_OBJ_UNREGISTER,
        AUDIT_NFT_OP_OBJ_RESET,
        AUDIT_NFT_OP_FLOWTABLE_REGISTER,
        AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
        AUDIT_NFT_OP_SETELEM_RESET,
        AUDIT_NFT_OP_RULE_RESET,
        AUDIT_NFT_OP_INVALID,
};

/*
 * Syscall classification helpers, defined outside this header.
 * audit_register_class() is __init, i.e. only callable during boot.
 */
extern int __init audit_register_class(int class, unsigned *list);
extern int audit_classify_syscall(int abi, unsigned syscall);
extern int audit_classify_arch(int arch);
/* only for compat system calls */
extern unsigned compat_write_class[];
extern unsigned compat_read_class[];
extern unsigned compat_dir_class[];
extern unsigned compat_chattr_class[];
extern unsigned compat_signal_class[];

/* audit_names->type values */
#define        AUDIT_TYPE_UNKNOWN        0        /* we don't know yet */
#define        AUDIT_TYPE_NORMAL        1        /* a "normal" audit record */
#define        AUDIT_TYPE_PARENT        2        /* a parent audit record */
#define        AUDIT_TYPE_CHILD_DELETE 3        /* a child being deleted */
#define        AUDIT_TYPE_CHILD_CREATE 4        /* a child being created */

/*
 * Maximum number of arguments that audit_socketcall() can process; also the
 * size of the conversion array in audit_socketcall_compat().
 */
#define AUDITSC_ARGS                6

/* bit values for ->signal->audit_tty */
#define AUDIT_TTY_ENABLE        BIT(0)
#define AUDIT_TTY_LOG_PASSWD        BIT(1)

struct filename;

/*
 * Audit state values; AUDIT_OFF is what audit_enabled expands to when
 * CONFIG_AUDIT is not set.
 */
#define AUDIT_OFF        0
#define AUDIT_ON        1
#define AUDIT_LOCKED        2
#ifdef CONFIG_AUDIT
/* These are defined in audit.c */
                                /* Public API */
/* printf-style logging; the format string is argument 4 */
extern __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...);

/*
 * Buffer-based interface: audit_log_start() returns a struct audit_buffer
 * that the audit_log_*() helpers below take as @ab and that is handed to
 * audit_log_end() when done.
 */
extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type);
extern __printf(2, 3)
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...);
extern void                    audit_log_end(struct audit_buffer *ab);
extern bool                    audit_string_contains_control(const char *string,
                                                          size_t len);
extern void                    audit_log_n_hex(struct audit_buffer *ab,
                                          const unsigned char *buf,
                                          size_t len);
extern void                    audit_log_n_string(struct audit_buffer *ab,
                                               const char *buf,
                                               size_t n);
extern void                    audit_log_n_untrustedstring(struct audit_buffer *ab,
                                                        const char *string,
                                                        size_t n);
extern void                    audit_log_untrustedstring(struct audit_buffer *ab,
                                                      const char *string);
extern void                    audit_log_d_path(struct audit_buffer *ab,
                                             const char *prefix,
                                             const struct path *path);
extern void                    audit_log_key(struct audit_buffer *ab,
                                          char *key);
extern void                    audit_log_path_denied(int type,
                                                  const char *operation);
extern void                    audit_log_lost(const char *message);

extern int audit_log_task_context(struct audit_buffer *ab);
extern void audit_log_task_info(struct audit_buffer *ab);

extern int                    audit_update_lsm_rules(void);

                                /* Private API (for audit.c only) */
extern int audit_rule_change(int type, int seq, void *data, size_t datasz);
extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);

extern int audit_set_loginuid(kuid_t loginuid);

/* Return the loginuid stored in @tsk. */
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
        return tsk->loginuid;
}

/* Return the sessionid stored in @tsk. */
static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
        return tsk->sessionid;
}

/* A real variable only with CONFIG_AUDIT; otherwise a macro for AUDIT_OFF */
extern u32 audit_enabled;

extern int audit_signal_info(int sig, struct task_struct *t);

#else /* CONFIG_AUDIT */
/*
 * !CONFIG_AUDIT: every entry point of the logging API becomes an empty
 * inline stub so callers need no #ifdefs. Stubs with a return value yield
 * NULL, 0, INVALID_UID or AUDIT_SID_UNSET as shown below.
 */
static inline __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...)
{ }
static inline struct audit_buffer *audit_log_start(struct audit_context *ctx,
                                                   gfp_t gfp_mask, int type)
{
        return NULL;
}
static inline __printf(2, 3)
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
{ }
static inline void audit_log_end(struct audit_buffer *ab)
{ }
static inline void audit_log_n_hex(struct audit_buffer *ab,
                                   const unsigned char *buf, size_t len)
{ }
static inline void audit_log_n_string(struct audit_buffer *ab,
                                      const char *buf, size_t n)
{ }
static inline void  audit_log_n_untrustedstring(struct audit_buffer *ab,
                                                const char *string, size_t n)
{ }
static inline void audit_log_untrustedstring(struct audit_buffer *ab,
                                             const char *string)
{ }
static inline void audit_log_d_path(struct audit_buffer *ab,
                                    const char *prefix,
                                    const struct path *path)
{ }
static inline void audit_log_key(struct audit_buffer *ab, char *key)
{ }
static inline void audit_log_path_denied(int type, const char *operation)
{ }
static inline int audit_log_task_context(struct audit_buffer *ab)
{
        return 0;
}
static inline void audit_log_task_info(struct audit_buffer *ab)
{ }

static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
        return INVALID_UID;
}

static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
        return AUDIT_SID_UNSET;
}

/* Compile-time constant, so "if (audit_enabled)" tests fold away */
#define audit_enabled AUDIT_OFF

static inline int audit_signal_info(int sig, struct task_struct *t)
{
        return 0;
}

#endif /* CONFIG_AUDIT */

/*
 * audit_is_compat(arch): with CONFIG_AUDIT_COMPAT_GENERIC, true when @arch
 * does not have the __AUDIT_ARCH_64BIT bit set; otherwise always false.
 */
#ifdef CONFIG_AUDIT_COMPAT_GENERIC
#define audit_is_compat(arch)  (!((arch) & __AUDIT_ARCH_64BIT))
#else
#define audit_is_compat(arch)  false
#endif

/* Flag bits for the @flags argument of __audit_inode() / audit_inode() */
#define AUDIT_INODE_PARENT        1        /* dentry represents the parent */
#define AUDIT_INODE_HIDDEN        2        /* audit record should be hidden */
#define AUDIT_INODE_NOEVAL        4        /* audit record incomplete */

#ifdef CONFIG_AUDITSYSCALL
#include <asm/syscall.h> /* for syscall_get_arch() */

/* These are defined in auditsc.c */
                                /* Public API */
/*
 * The double-underscore functions are the out-of-line workers; code should
 * normally go through the inline audit_*() wrappers below, which test the
 * current audit context before calling them.
 */
extern int  audit_alloc(struct task_struct *task);
extern void __audit_free(struct task_struct *task);
extern void __audit_uring_entry(u8 op);
extern void __audit_uring_exit(int success, long code);
extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
                                  unsigned long a2, unsigned long a3);
extern void __audit_syscall_exit(int ret_success, long ret_value);
extern struct filename *__audit_reusename(const __user char *uptr);
extern void __audit_getname(struct filename *name);
extern void __audit_inode(struct filename *name, const struct dentry *dentry,
                                unsigned int flags);
extern void __audit_file(const struct file *);
extern void __audit_inode_child(struct inode *parent,
                                const struct dentry *dentry,
                                const unsigned char type);
extern void audit_seccomp(unsigned long syscall, long signr, int code);
extern void audit_seccomp_actions_logged(const char *names,
                                         const char *old_names, int res);
extern void __audit_ptrace(struct task_struct *t);

/* Install @ctx as the audit context of @task (plain pointer store). */
static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
{
        task->audit_context = ctx;
}

/* Return the audit context of the current task; may be NULL. */
static inline struct audit_context *audit_context(void)
{
        return current->audit_context;
}

static inline bool audit_dummy_context(void)
{
        void *p = audit_context();
        return !p || *(int *)p;
}
/* Call __audit_free() for @task, but only if it carries an audit context. */
static inline void audit_free(struct task_struct *task)
{
        if (likely(!task->audit_context))
                return;

        __audit_free(task);
}
/*
 * Forward @op to __audit_uring_entry() when the current task has an audit
 * context and auditing is enabled.
 */
static inline void audit_uring_entry(u8 op)
{
        /*
         * audit_context() is deliberately tested before audit_enabled: most
         * Linux systems (as of ~2021) rely on systemd, which forces audit to
         * be enabled regardless of the user's audit configuration.
         */
        if (likely(!audit_context() || !audit_enabled))
                return;

        __audit_uring_entry(op);
}
/* Forward to __audit_uring_exit() when the current task has an audit context. */
static inline void audit_uring_exit(int success, long code)
{
        if (likely(!audit_context()))
                return;

        __audit_uring_exit(success, code);
}
/*
 * Forward the syscall number and its first four arguments to
 * __audit_syscall_entry() when the current task has an audit context.
 */
static inline void audit_syscall_entry(int major, unsigned long a0,
                                       unsigned long a1, unsigned long a2,
                                       unsigned long a3)
{
        if (likely(!audit_context()))
                return;

        __audit_syscall_entry(major, a0, a1, a2, a3);
}
/*
 * When the current task has an audit context, derive the success flag and
 * return value from @pt_regs and pass them to __audit_syscall_exit().
 */
static inline void audit_syscall_exit(void *pt_regs)
{
        int ok;
        long rc;

        if (likely(!audit_context()))
                return;

        ok = is_syscall_success(pt_regs);
        rc = regs_return_value(pt_regs);
        __audit_syscall_exit(ok, rc);
}
/*
 * Return the result of __audit_reusename() for @name, or NULL when the
 * current audit context is a dummy.
 */
static inline struct filename *audit_reusename(const __user char *name)
{
        if (likely(audit_dummy_context()))
                return NULL;

        return __audit_reusename(name);
}
/* Pass @name to __audit_getname() unless the audit context is a dummy. */
static inline void audit_getname(struct filename *name)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_getname(name);
}
/*
 * Pass @name, @dentry and the AUDIT_INODE_* bits in @aflags to
 * __audit_inode() unless the audit context is a dummy.
 */
static inline void audit_inode(struct filename *name,
                               const struct dentry *dentry,
                               unsigned int aflags)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_inode(name, dentry, aflags);
}
/* Pass @file to __audit_file() unless the audit context is a dummy. */
static inline void audit_file(struct file *file)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_file(file);
}
/*
 * Same as audit_inode() with both AUDIT_INODE_PARENT and AUDIT_INODE_HIDDEN
 * set in the flags.
 */
static inline void audit_inode_parent_hidden(struct filename *name,
                                             const struct dentry *dentry)
{
        const unsigned int aflags = AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN;

        if (likely(audit_dummy_context()))
                return;

        __audit_inode(name, dentry, aflags);
}
/*
 * Pass @parent, @dentry and @type to __audit_inode_child() unless the audit
 * context is a dummy.
 */
static inline void audit_inode_child(struct inode *parent,
                                     const struct dentry *dentry,
                                     const unsigned char type)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_inode_child(parent, dentry, type);
}
/* Out-of-line; takes a signal number. Has no inline wrapper in this header. */
void audit_core_dumps(long signr);

/* Pass @t to __audit_ptrace() unless the audit context is a dummy. */
static inline void audit_ptrace(struct task_struct *t)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_ptrace(t);
}

                                /* Private API (for audit.c only) */
/*
 * Out-of-line workers behind the inline audit_*() wrappers that follow;
 * each wrapper applies its own gate (mostly audit_dummy_context()) first.
 */
extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
extern void __audit_bprm(struct linux_binprm *bprm);
extern int __audit_socketcall(int nargs, unsigned long *args);
extern int __audit_sockaddr(int len, void *addr);
extern void __audit_fd_pair(int fd1, int fd2);
extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr);
extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout);
extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                  const struct cred *new,
                                  const struct cred *old);
extern void __audit_log_capset(const struct cred *new, const struct cred *old);
extern void __audit_mmap_fd(int fd, int flags);
extern void __audit_openat2_how(struct open_how *how);
extern void __audit_log_kern_module(char *name);
extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar);
extern void __audit_tk_injoffset(struct timespec64 offset);
extern void __audit_ntp_log(const struct audit_ntp_data *ad);
extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
                              enum audit_nfcfgop op, gfp_t gfp);

/* Pass @ipcp to __audit_ipc_obj() unless the audit context is a dummy. */
static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_ipc_obj(ipcp);
}
/* Pass both descriptors to __audit_fd_pair() unless the context is a dummy. */
static inline void audit_fd_pair(int fd1, int fd2)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_fd_pair(fd1, fd2);
}
/*
 * Pass the permission values to __audit_ipc_set_perm() unless the audit
 * context is a dummy.
 */
static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_ipc_set_perm(qbytes, uid, gid, mode);
}
/* Pass @bprm to __audit_bprm() unless the audit context is a dummy. */
static inline void audit_bprm(struct linux_binprm *bprm)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_bprm(bprm);
}
/*
 * Return the result of __audit_socketcall() for @nargs/@args, or 0 when the
 * audit context is a dummy.
 */
static inline int audit_socketcall(int nargs, unsigned long *args)
{
        if (likely(audit_dummy_context()))
                return 0;

        return __audit_socketcall(nargs, args);
}

/*
 * Compat flavour of audit_socketcall(): widens the u32 values in @args to
 * unsigned long and forwards them to __audit_socketcall().
 *
 * Returns 0 without reading @args when the audit context is a dummy,
 * otherwise whatever __audit_socketcall() returns.
 */
static inline int audit_socketcall_compat(int nargs, u32 *args)
{
        unsigned long a[AUDITSC_ARGS];
        int i;

        if (audit_dummy_context())
                return 0;

        /*
         * a[] has room for AUDITSC_ARGS entries only, so never copy more
         * than that even if @nargs is larger; an unbounded loop would write
         * past the end of the on-stack array. @nargs itself is passed on
         * unchanged so the callee still sees the out-of-range count.
         * NOTE(review): this assumes __audit_socketcall() validates nargs
         * before reading the array -- confirm in auditsc.c.
         */
        for (i = 0; i < nargs && i < AUDITSC_ARGS; i++)
                a[i] = (unsigned long)args[i];
        return __audit_socketcall(nargs, a);
}

/*
 * Return the result of __audit_sockaddr() for @len/@addr, or 0 when the
 * audit context is a dummy.
 */
static inline int audit_sockaddr(int len, void *addr)
{
        if (likely(audit_dummy_context()))
                return 0;

        return __audit_sockaddr(len, addr);
}
/* Pass the arguments to __audit_mq_open() unless the context is a dummy. */
static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_mq_open(oflag, mode, attr);
}
/* Pass the arguments to __audit_mq_sendrecv() unless the context is a dummy. */
static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout);
}
/* Pass the arguments to __audit_mq_notify() unless the context is a dummy. */
static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_mq_notify(mqdes, notification);
}
/* Pass the arguments to __audit_mq_getsetattr() unless the context is a dummy. */
static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_mq_getsetattr(mqdes, mqstat);
}

/*
 * Return the result of __audit_log_bprm_fcaps() for @bprm and the two
 * credential sets, or 0 when the audit context is a dummy.
 */
static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                       const struct cred *new,
                                       const struct cred *old)
{
        if (likely(audit_dummy_context()))
                return 0;

        return __audit_log_bprm_fcaps(bprm, new, old);
}

/* Pass @new and @old to __audit_log_capset() unless the context is a dummy. */
static inline void audit_log_capset(const struct cred *new,
                                    const struct cred *old)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_log_capset(new, old);
}

/* Pass @fd and @flags to __audit_mmap_fd() unless the context is a dummy. */
static inline void audit_mmap_fd(int fd, int flags)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_mmap_fd(fd, flags);
}

/* Pass @how to __audit_openat2_how() unless the audit context is a dummy. */
static inline void audit_openat2_how(struct open_how *how)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_openat2_how(how);
}

/* Pass @name to __audit_log_kern_module() unless the context is a dummy. */
static inline void audit_log_kern_module(char *name)
{
        if (audit_dummy_context())
                return;

        __audit_log_kern_module(name);
}

/*
 * Pass @response and @friar to __audit_fanotify() unless the audit context
 * is a dummy.
 */
static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{
        if (audit_dummy_context())
                return;

        __audit_fanotify(response, friar);
}

static inline void audit_tk_injoffset(struct timespec64 offset)
{
        /* ignore no-op events */
        if (offset.tv_sec == 0 && offset.tv_nsec == 0)
                return;

        if (!audit_dummy_context())
                __audit_tk_injoffset(offset);
}

/* Zero every old/new value pair in @ad. */
static inline void audit_ntp_init(struct audit_ntp_data *ad)
{
        memset(ad, 0, sizeof(*ad));
}

static inline void audit_ntp_set_old(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{
        ad->vals[type].oldval = val;
}

static inline void audit_ntp_set_new(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{
        ad->vals[type].newval = val;
}

/* Pass @ad to __audit_ntp_log() unless the audit context is a dummy. */
static inline void audit_ntp_log(const struct audit_ntp_data *ad)
{
        if (audit_dummy_context())
                return;

        __audit_ntp_log(ad);
}

/*
 * Pass the arguments to __audit_log_nfcfg() when audit_enabled is non-zero.
 * Unlike most wrappers in this header, the gate is audit_enabled rather
 * than audit_dummy_context().
 */
static inline void audit_log_nfcfg(const char *name, u8 af,
                                   unsigned int nentries,
                                   enum audit_nfcfgop op, gfp_t gfp)
{
        if (!audit_enabled)
                return;

        __audit_log_nfcfg(name, af, nentries, op, gfp);
}

/* Counters defined outside this header; only declared with CONFIG_AUDITSYSCALL */
extern int audit_n_rules;
extern int audit_signals;
#else /* CONFIG_AUDITSYSCALL */
static inline int audit_alloc(struct task_struct *task)
{
        return 0;
}
static inline void audit_free(struct task_struct *task)
{ }
static inline void audit_uring_entry(u8 op)
{ }
static inline void audit_uring_exit(int success, long code)
{ }
static inline void audit_syscall_entry(int major, unsigned long a0,
                                       unsigned long a1, unsigned long a2,
                                       unsigned long a3)
{ }
static inline void audit_syscall_exit(void *pt_regs)
{ }
static inline bool audit_dummy_context(void)
{
        return true;
}
static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
{ }
static inline struct audit_context *audit_context(void)
{
        return NULL;
}
static inline struct filename *audit_reusename(const __user char *name)
{
        return NULL;
}
static inline void audit_getname(struct filename *name)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_inode(struct filename *name,
                                const struct dentry *dentry,
                                unsigned int aflags)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_file(struct file *file)
{
}
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_inode_parent_hidden(struct filename *name,
                                const struct dentry *dentry)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_inode_child(struct inode *parent,
                                     const struct dentry *dentry,
                                     const unsigned char type)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_core_dumps(long signr)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_seccomp(unsigned long syscall, long signr, int code)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_seccomp_actions_logged(const char *names,
                                                const char *old_names, int res)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
                                        gid_t gid, umode_t mode)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_bprm(struct linux_binprm *bprm)
{ }
/* CONFIG_AUDITSYSCALL=n stub: @args is not inspected; always returns 0. */
static inline int audit_socketcall(int nargs, unsigned long *args)
{
        return 0;
}

/* CONFIG_AUDITSYSCALL=n stub: @args is not inspected; always returns 0. */
static inline int audit_socketcall_compat(int nargs, u32 *args)
{
        return 0;
}

/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_fd_pair(int fd1, int fd2)
{ }
/* CONFIG_AUDITSYSCALL=n stub: @addr is not read; always returns 0. */
static inline int audit_sockaddr(int len, void *addr)
{
        return 0;
}
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len,
                                     unsigned int msg_prio,
                                     const struct timespec64 *abs_timeout)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_mq_notify(mqd_t mqdes,
                                   const struct sigevent *notification)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{ }
/* CONFIG_AUDITSYSCALL=n stub: nothing is logged; always returns 0. */
static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                       const struct cred *new,
                                       const struct cred *old)
{
        return 0;
}
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_log_capset(const struct cred *new,
                                    const struct cred *old)
{ }
/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_mmap_fd(int fd, int flags)
{ }

/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_openat2_how(struct open_how *how)
{ }

/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_log_kern_module(char *name)
{
}

/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{ }

/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_tk_injoffset(struct timespec64 offset)
{ }

/* CONFIG_AUDITSYSCALL=n stub: @ad is left untouched. */
static inline void audit_ntp_init(struct audit_ntp_data *ad)
{ }

/* CONFIG_AUDITSYSCALL=n stub: @val is discarded. */
static inline void audit_ntp_set_old(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{ }

/* CONFIG_AUDITSYSCALL=n stub: @val is discarded. */
static inline void audit_ntp_set_new(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{ }

/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_ntp_log(const struct audit_ntp_data *ad)
{ }

/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_ptrace(struct task_struct *t)
{ }

/* CONFIG_AUDITSYSCALL=n stub: no-op. */
static inline void audit_log_nfcfg(const char *name, u8 af,
                                   unsigned int nentries,
                                   enum audit_nfcfgop op, gfp_t gfp)
{ }

#define audit_n_rules 0
#define audit_signals 0
#endif /* CONFIG_AUDITSYSCALL */

/*
 * Return true when the login uid obtained from audit_get_loginuid(@tsk)
 * passes uid_valid().  Defined outside the CONFIG_AUDITSYSCALL conditional,
 * so it is available in both configurations.
 */
static inline bool audit_loginuid_set(struct task_struct *tsk)
{
        return uid_valid(audit_get_loginuid(tsk));
}

#endif



















    2 

























    2 










    2 












    1 
















   40 
   42 










   40 


    2 



    2 







    2 




    1 



    2 






    1 






















































































































































































































































































































































































   27 










   24 



   24 



   27 

   24 





































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

#include <trace/events/cgroup.h>

/*
 * cgroup_rstat_lock is held across flush operations in this file.  The
 * per-cpu cgroup_rstat_cpu_lock is held while the updated lists of that
 * CPU are modified or walked.
 */
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

/* Return @cgrp's cgroup_rstat_cpu instance that belongs to @cpu. */
static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
        return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/*
 * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
 *
 * This makes it easier to diagnose locking issues and contention in
 * production environments. The parameter @fast_path determines which
 * tracepoints are emitted, allowing us to diagnose "flush" related
 * operations without handling high-frequency fast-path "update" events.
 *
 * _cgroup_rstat_cpu_lock() returns the saved interrupt flags, which must be
 * handed back to _cgroup_rstat_cpu_unlock().
 */
static __always_inline
unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
                                     struct cgroup *cgrp, const bool fast_path)
{
        unsigned long flags;
        bool contended;

        /*
         * The _irqsave() is needed because cgroup_rstat_lock is
         * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
         * this lock with the _irq() suffix only disables interrupts on
         * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
         * interrupts on both configurations. The _irqsave() ensures
         * that interrupts are always disabled and later restored.
         */
        /* Try without spinning first so contention can be traced. */
        contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
        if (contended) {
                if (fast_path)
                        trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
                else
                        trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);

                raw_spin_lock_irqsave(cpu_lock, flags);
        }

        /* The lock is held here on both paths. */
        if (fast_path)
                trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
        else
                trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);

        return flags;
}

/*
 * Counterpart of _cgroup_rstat_cpu_lock(): emit the unlock tracepoint that
 * matches @fast_path, then release @cpu_lock and restore the interrupt
 * state saved in @flags.
 */
static __always_inline
void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
                              struct cgroup *cgrp, unsigned long flags,
                              const bool fast_path)
{
        /* The tracepoint fires while the lock is still held. */
        if (fast_path)
                trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
        else
                trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);

        raw_spin_unlock_irqrestore(cpu_lock, flags);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
 * rstat_cpu->updated_children list.  See the comment on top of
 * cgroup_rstat_cpu definition for details.
 *
 * Returns early, without taking the per-cpu lock, when @cgrp already appears
 * to be linked.  Otherwise @cgrp and every not-yet-linked ancestor are linked
 * while @cpu's cgroup_rstat_cpu_lock is held.
 */
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
        unsigned long flags;

        /*
         * Speculative already-on-list test. This may race leading to
         * temporary inaccuracies, which is fine.
         *
         * Because @parent's updated_children is terminated with @parent
         * instead of NULL, we can tell whether @cgrp is on the list by
         * testing the next pointer for NULL.
         */
        if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
                return;

        flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);

        /* put @cgrp and all ancestors on the corresponding updated lists */
        while (true) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
                struct cgroup *parent = cgroup_parent(cgrp);
                struct cgroup_rstat_cpu *prstatc;

                /*
                 * Both additions and removals are bottom-up.  If a cgroup
                 * is already in the tree, all ancestors are.
                 */
                if (rstatc->updated_next)
                        break;

                /* Root has no parent to link it to, but mark it busy */
                if (!parent) {
                        rstatc->updated_next = cgrp;
                        break;
                }

                /* push @cgrp at the head of the parent's updated_children */
                prstatc = cgroup_rstat_cpu(parent, cpu);
                rstatc->updated_next = prstatc->updated_children;
                prstatc->updated_children = cgrp;

                cgrp = parent;
        }

        /*
         * Note: @cgrp may have been advanced to an ancestor by the loop
         * above, so the unlock tracepoint reports the cgroup the walk
         * stopped at rather than the original argument.
         */
        _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
}

/**
 * cgroup_rstat_push_children - push children cgroups into the given list
 * @head: current head of the list (= subtree root)
 * @child: first child of the root
 * @cpu: target cpu
 * Return: A new singly linked list of cgroups to be flushed
 *
 * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
 * level and push all the parents first before their next level children
 * into a singly linked list built from the tail backward like "pushing"
 * cgroups into a stack. The root is pushed by the caller.
 *
 * Every cgroup that is pushed is also unlinked from the updated tree: its
 * updated_next is cleared and its updated_children is reset to itself.
 */
static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
                                                 struct cgroup *child, int cpu)
{
        struct cgroup *chead = child;        /* Head of child cgroup level */
        struct cgroup *ghead = NULL;        /* Head of grandchild cgroup level */
        struct cgroup *parent, *grandchild;
        struct cgroup_rstat_cpu *crstatc;

        /* The initial level list holds a single entry: @child */
        child->rstat_flush_next = NULL;

next_level:
        while (chead) {
                /* pop one sibling chain off the current level */
                child = chead;
                chead = child->rstat_flush_next;
                parent = cgroup_parent(child);

                /* updated_next is parent cgroup terminated */
                while (child != parent) {
                        /* push @child at the front of the output list */
                        child->rstat_flush_next = head;
                        head = child;
                        crstatc = cgroup_rstat_cpu(child, cpu);
                        grandchild = crstatc->updated_children;
                        if (grandchild != child) {
                                /* Push the grand child to the next level */
                                crstatc->updated_children = child;
                                grandchild->rstat_flush_next = ghead;
                                ghead = grandchild;
                        }
                        /* advance to the next sibling and unlink this one */
                        child = crstatc->updated_next;
                        crstatc->updated_next = NULL;
                }
        }

        /* descend: the collected grandchildren become the next level */
        if (ghead) {
                chead = ghead;
                ghead = NULL;
                goto next_level;
        }
        return head;
}

/**
 * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
 * @root: root of the cgroup subtree to traverse
 * @cpu: target cpu
 * Return: A singly linked list of cgroups to be flushed
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  During traversal,
 * each returned cgroup is unlinked from the updated tree.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, the child is before its parent in
 * the list.
 *
 * Note that updated_children is self terminated and points to a list of
 * child cgroups if not empty. Whereas updated_next is like a sibling link
 * within the children list and terminated by the parent cgroup. An exception
 * here is the cgroup root whose updated_next can be self terminated.
 *
 * The whole walk runs with @cpu's cgroup_rstat_cpu_lock held; the returned
 * list is chained through ->rstat_flush_next.
 */
static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
{
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
        struct cgroup *head = NULL, *parent, *child;
        unsigned long flags;

        flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);

        /* Return NULL if this subtree is not on-list */
        if (!rstatc->updated_next)
                goto unlock_ret;

        /*
         * Unlink @root from its parent. As the updated_children list is
         * singly linked, we have to walk it to find the removal point.
         */
        parent = cgroup_parent(root);
        if (parent) {
                struct cgroup_rstat_cpu *prstatc;
                struct cgroup **nextp;

                prstatc = cgroup_rstat_cpu(parent, cpu);
                nextp = &prstatc->updated_children;
                while (*nextp != root) {
                        struct cgroup_rstat_cpu *nrstatc;

                        nrstatc = cgroup_rstat_cpu(*nextp, cpu);
                        /* hitting the terminator means @root was not found */
                        WARN_ON_ONCE(*nextp == parent);
                        nextp = &nrstatc->updated_next;
                }
                *nextp = rstatc->updated_next;
        }

        rstatc->updated_next = NULL;

        /* Push @root to the list first before pushing the children */
        head = root;
        root->rstat_flush_next = NULL;
        child = rstatc->updated_children;
        /* reset @root's children list to empty (self terminated) */
        rstatc->updated_children = root;
        if (child != root)
                head = cgroup_rstat_push_children(head, child, cpu);
unlock_ret:
        _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
        return head;
}

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 */

__bpf_hook_start();

/*
 * Empty attach point called once per flushed cgroup and CPU from
 * cgroup_rstat_flush_locked(), with @parent being cgroup_parent(@cgrp).
 */
__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
                                     struct cgroup *parent, int cpu)
{
        /* intentionally empty: exists only so that programs can hook it */
}

__bpf_hook_end();

/*
 * Helper functions for locking cgroup_rstat_lock.
 *
 * This makes it easier to diagnose locking issues and contention in
 * production environments.  The parameter @cpu_in_loop indicates that the
 * lock was released and re-taken while collecting data from the CPUs. The
 * value -1 is used when obtaining the main lock; otherwise it is the CPU
 * number processed last.
 */
static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
        __acquires(&cgroup_rstat_lock)
{
        bool contended;

        /* try without spinning first so that contention can be traced */
        contended = !spin_trylock_irq(&cgroup_rstat_lock);
        if (contended) {
                trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
                spin_lock_irq(&cgroup_rstat_lock);
        }
        trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
}

/*
 * Counterpart of __cgroup_rstat_lock(): emit the unlock tracepoint, then
 * release cgroup_rstat_lock and re-enable interrupts.
 */
static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
        __releases(&cgroup_rstat_lock)
{
        trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
        spin_unlock_irq(&cgroup_rstat_lock);
}

/*
 * Flush @cgrp's subtree one possible CPU at a time; see cgroup_rstat_flush().
 *
 * Must be called with cgroup_rstat_lock held.  The lock may be dropped and
 * re-acquired between CPUs, and it is held again on return.
 */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
        __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
        int cpu;

        lockdep_assert_held(&cgroup_rstat_lock);

        for_each_possible_cpu(cpu) {
                struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);

                for (; pos; pos = pos->rstat_flush_next) {
                        struct cgroup_subsys_state *css;

                        /* base stats first, then the bpf hook ... */
                        cgroup_base_stat_flush(pos, cpu);
                        bpf_rstat_flush(pos, cgroup_parent(pos), cpu);

                        /* ... then every css linked on ->rstat_css_list */
                        rcu_read_lock();
                        list_for_each_entry_rcu(css, &pos->rstat_css_list,
                                                rstat_css_node)
                                css->ss->css_rstat_flush(css, cpu);
                        rcu_read_unlock();
                }

                /* play nice and yield if necessary */
                if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
                        __cgroup_rstat_unlock(cgrp, cpu);
                        if (!cond_resched())
                                cpu_relax();
                        __cgroup_rstat_lock(cgrp, cpu);
                }
        }
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
        might_sleep();

        /* -1: the lock is taken outside of the per-CPU loop */
        __cgroup_rstat_lock(cgrp, -1);
        cgroup_rstat_flush_locked(cgrp);
        __cgroup_rstat_unlock(cgrp, -1);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 * paired with cgroup_rstat_flush_release().
 *
 * Returns with cgroup_rstat_lock held.
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
        __acquires(&cgroup_rstat_lock)
{
        might_sleep();
        __cgroup_rstat_lock(cgrp, -1);
        cgroup_rstat_flush_locked(cgrp);
        /* deliberately no unlock: cgroup_rstat_flush_release() drops it */
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 * @cgrp: cgroup used by tracepoint
 *
 * Drops cgroup_rstat_lock, which cgroup_rstat_flush_hold() left held.
 */
void cgroup_rstat_flush_release(struct cgroup *cgrp)
        __releases(&cgroup_rstat_lock)
{
        __cgroup_rstat_unlock(cgrp, -1);
}

/**
 * cgroup_rstat_init - set up the per-cpu rstat state of a cgroup
 * @cgrp: cgroup being initialized
 *
 * Allocates @cgrp->rstat_cpu unless it is already set, then initializes
 * the updated_children pointer and the bsync object of every possible CPU.
 *
 * Return: 0 on success, -ENOMEM if the per-cpu allocation fails.
 */
int cgroup_rstat_init(struct cgroup *cgrp)
{
        int cpu;

        /* only allocate when nothing was preallocated (the root cgrp case) */
        if (!cgrp->rstat_cpu)
                cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
        if (!cgrp->rstat_cpu)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *pcpu = cgroup_rstat_cpu(cgrp, cpu);

                u64_stats_init(&pcpu->bsync);
                /* an empty ->updated_children list points back at its owner */
                pcpu->updated_children = cgrp;
        }

        return 0;
}

/**
 * cgroup_rstat_exit - tear down the per-cpu rstat state of a cgroup
 * @cgrp: cgroup being destroyed
 *
 * Flushes @cgrp, verifies that it is no longer linked on any CPU's updated
 * lists and then frees @cgrp->rstat_cpu.  If the verification fails, a
 * one-time warning is emitted and the per-cpu area is left allocated.
 */
void cgroup_rstat_exit(struct cgroup *cgrp)
{
        int cpu;

        cgroup_rstat_flush(cgrp);

        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *pcpu = cgroup_rstat_cpu(cgrp, cpu);

                /* the children list must be empty, i.e. self terminated */
                if (WARN_ON_ONCE(pcpu->updated_children != cgrp))
                        return;
                /* and @cgrp itself must be off its parent's list */
                if (WARN_ON_ONCE(pcpu->updated_next))
                        return;
        }

        free_percpu(cgrp->rstat_cpu);
        cgrp->rstat_cpu = NULL;
}

/* Boot-time setup: initialize cgroup_rstat_cpu_lock of every possible CPU. */
void __init cgroup_rstat_boot(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
/* Accumulate every counter of @src_bstat into the matching one in @dst_bstat. */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
#ifdef CONFIG_SCHED_CORE
        dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
        dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
        dst_bstat->cputime.stime += src_bstat->cputime.stime;
        dst_bstat->cputime.utime += src_bstat->cputime.utime;
}

/* Subtract every counter of @src_bstat from the matching one in @dst_bstat. */
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
#ifdef CONFIG_SCHED_CORE
        dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
        dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
        dst_bstat->cputime.stime -= src_bstat->cputime.stime;
        dst_bstat->cputime.utime -= src_bstat->cputime.utime;
}

/*
 * Fold what @cgrp accumulated on @cpu since the previous flush into
 * @cgrp->bstat and the per-cpu subtree_bstat, then pass the not yet
 * propagated part on to the parent.  Nothing is done for the root cgroup,
 * and nothing is propagated into the root cgroup.
 */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_rstat_cpu *prstatc;
        struct cgroup_base_stat delta;
        unsigned seq;

        /* Root-level stats are sourced from system-wide CPU stats */
        if (!parent)
                return;

        /* fetch the current per-cpu values */
        do {
                /* repeat the copy for as long as a retry is requested */
                seq = __u64_stats_fetch_begin(&rstatc->bsync);
                delta = rstatc->bstat;
        } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

        /* propagate per-cpu delta to cgroup and per-cpu global statistics */
        /* delta = current values - values seen by the previous flush */
        cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
        cgroup_base_stat_add(&cgrp->bstat, &delta);
        cgroup_base_stat_add(&rstatc->last_bstat, &delta);
        cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);

        /* propagate cgroup and per-cpu global delta to parent (unless that's root) */
        if (cgroup_parent(parent)) {
                /* delta = what @cgrp->bstat gained since it was last pushed up */
                delta = cgrp->bstat;
                cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
                cgroup_base_stat_add(&parent->bstat, &delta);
                cgroup_base_stat_add(&cgrp->last_bstat, &delta);

                /* same scheme for the per-cpu subtree counters */
                delta = rstatc->subtree_bstat;
                prstatc = cgroup_rstat_cpu(parent, cpu);
                cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
                cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
                cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
        }
}

/*
 * Start an update of @cgrp's per-cpu base stats on the current CPU.
 *
 * Takes the rstat_cpu pointer with get_cpu_ptr() and opens an update
 * section on its bsync; the value returned by
 * u64_stats_update_begin_irqsave() is stored in @flags.  Must be paired
 * with cgroup_base_stat_cputime_account_end().
 *
 * Return: the current CPU's cgroup_rstat_cpu of @cgrp.
 */
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
        struct cgroup_rstat_cpu *rstatc;

        rstatc = get_cpu_ptr(cgrp->rstat_cpu);
        *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
        return rstatc;
}

/*
 * Finish an update started by cgroup_base_stat_cputime_account_begin():
 * close the bsync update section with @flags, mark @cgrp as updated on the
 * current CPU and release @rstatc with put_cpu_ptr().
 */
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
                                                 struct cgroup_rstat_cpu *rstatc,
                                                 unsigned long flags)
{
        u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
        /* called before put_cpu_ptr(), i.e. still inside the get_cpu_ptr() section */
        cgroup_rstat_updated(cgrp, smp_processor_id());
        put_cpu_ptr(rstatc);
}

/*
 * Add @delta_exec to the sum_exec_runtime counter of @cgrp's per-cpu base
 * stats on the current CPU and mark @cgrp as updated.
 */
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;
        unsigned long flags;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
        rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
        cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * Charge @delta_exec to the per-cpu base stat counter of @cgrp selected by
 * @index on the current CPU:
 *
 *   CPUTIME_USER, CPUTIME_NICE                   -> utime
 *   CPUTIME_SYSTEM, CPUTIME_IRQ, CPUTIME_SOFTIRQ -> stime
 *   CPUTIME_FORCEIDLE (CONFIG_SCHED_CORE only)   -> forceidle_sum
 *
 * Any other @index changes no counter, but @cgrp is still marked as updated.
 */
void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec)
{
        struct cgroup_rstat_cpu *rstatc;
        unsigned long flags;

        rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

        if (index == CPUTIME_USER || index == CPUTIME_NICE)
                rstatc->bstat.cputime.utime += delta_exec;
        else if (index == CPUTIME_SYSTEM || index == CPUTIME_IRQ ||
                 index == CPUTIME_SOFTIRQ)
                rstatc->bstat.cputime.stime += delta_exec;
#ifdef CONFIG_SCHED_CORE
        else if (index == CPUTIME_FORCEIDLE)
                rstatc->bstat.forceidle_sum += delta_exec;
#endif

        cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * Build the root cgroup's cputime from the system-wide per-cpu kernel
 * statistics, sorting the individual fields into user and system time the
 * same way __cgroup_account_cputime_field() does for time attributed to a
 * cgroup.  @bstat is cleared first and then filled in.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
        struct task_cputime *cputime = &bstat->cputime;
        int cpu;

        memset(bstat, 0, sizeof(*bstat));
        for_each_possible_cpu(cpu) {
                struct kernel_cpustat kcpustat;
                u64 *cpustat = kcpustat.cpustat;
                u64 user, sys;

                kcpustat_cpu_fetch(&kcpustat, cpu);

                user = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
                sys = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
                      cpustat[CPUTIME_SOFTIRQ];

                cputime->utime += user;
                cputime->stime += sys;

                /* total runtime additionally includes the steal time */
                cputime->sum_exec_runtime += user + sys + cpustat[CPUTIME_STEAL];

#ifdef CONFIG_SCHED_CORE
                bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
        }
}

/*
 * Print the base cputime statistics of the cgroup behind @seq, converted
 * from nanoseconds to microseconds.
 *
 * The root cgroup is computed from system-wide statistics; any other cgroup
 * is flushed first and read while the flush lock is held.
 */
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        struct cgroup_base_stat bstat;
        u64 usage, utime, stime;
#ifdef CONFIG_SCHED_CORE
        u64 forceidle_time;
#endif

        if (!cgroup_parent(cgrp)) {
                /* root: no rstat counters, derive from kernel cpustat */
                root_cgroup_cputime(&bstat);
                usage = bstat.cputime.sum_exec_runtime;
                utime = bstat.cputime.utime;
                stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
                forceidle_time = bstat.forceidle_sum;
#endif
        } else {
                /* read the counters while further flushes are held off */
                cgroup_rstat_flush_hold(cgrp);
                usage = cgrp->bstat.cputime.sum_exec_runtime;
                cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
                               &utime, &stime);
#ifdef CONFIG_SCHED_CORE
                forceidle_time = cgrp->bstat.forceidle_sum;
#endif
                cgroup_rstat_flush_release(cgrp);
        }

        /* ns -> us */
        do_div(usage, NSEC_PER_USEC);
        do_div(utime, NSEC_PER_USEC);
        do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
        do_div(forceidle_time, NSEC_PER_USEC);
#endif

        seq_printf(seq, "usage_usec %llu\n"
                   "user_usec %llu\n"
                   "system_usec %llu\n",
                   usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
        seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

/*
 * Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush().
 * cgroup_rstat_flush() may block and is therefore flagged KF_SLEEPABLE.
 */
BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_KFUNCS_END(bpf_rstat_kfunc_ids)

/* kfunc id set wrapping bpf_rstat_kfunc_ids; registered at late init below. */
static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
        .owner          = THIS_MODULE,
        .set            = &bpf_rstat_kfunc_ids,
};

/*
 * Register the rstat kfuncs for BPF_PROG_TYPE_TRACING programs.
 *
 * Return: whatever register_btf_kfunc_id_set() returns.
 */
static int __init bpf_rstat_kfunc_init(void)
{
        return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
                                         &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);













































   11 
   13 










   11 













   13 










    2 




























    3 






















    4 


















    1 

















    3 






















    3 








    1 







































































































































    4 





























































































    1 









    1 






    1 












































































































































    1 

















    7 







   13 
   11 
















































   11 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H

#include <linux/blk-crypto.h>
#include <linux/memblock.h>        /* for max_pfn/max_low_pfn */
#include <linux/sched/sysctl.h>
#include <linux/timekeeping.h>
#include <xen/xen.h>
#include "blk-crypto-internal.h"

/* Forward declaration; this header only passes pointers to it around. */
struct elevator_type;

/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT                (5 * HZ)

/* Declared here, defined in another translation unit. */
extern struct dentry *blk_debugfs_root;

/*
 * Flush bookkeeping, allocated by blk_alloc_flush_queue() and released by
 * blk_free_flush_queue() (both declared below).
 *
 * NOTE(review): the flush state machine itself is not visible from this
 * header; the per-field notes are inferred from names and types only and
 * should be confirmed against the flush implementation.
 */
struct blk_flush_queue {
        spinlock_t                mq_flush_lock;
        /* 1-bit indices: wide enough to select one of the two flush_queue[] lists */
        unsigned int                flush_pending_idx:1;
        unsigned int                flush_running_idx:1;
        blk_status_t                 rq_status;
        unsigned long                flush_pending_since;
        struct list_head        flush_queue[2];
        unsigned long                flush_data_in_flight;
        struct request                *flush_rq;
};

/* Flush-request predicate (implemented outside this header). */
bool is_flush_rq(struct request *req);

/* Allocation and release of a struct blk_flush_queue. */
struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
                                              gfp_t flags);
void blk_free_flush_queue(struct blk_flush_queue *q);

/*
 * Queue freeze/drain/enter helpers and bio submission internals.
 * __bio_queue_enter() is the fallback that bio_queue_enter() uses when
 * blk_try_enter_queue() fails.
 */
void blk_freeze_queue(struct request_queue *q);
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
void blk_queue_start_drain(struct request_queue *q);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
void submit_bio_noacct_nocheck(struct bio *bio);
void bio_await_chain(struct bio *bio);

/*
 * Non-blocking attempt to take a reference on @q->q_usage_counter.
 *
 * Returns true if the reference was obtained and kept.  Returns false if the
 * tryget failed, or if the queue is pm_only and either @pm is false or the
 * queue's runtime-PM status is RPM_SUSPENDED; in that second case
 * blk_queue_exit() is called before returning.  The whole attempt runs
 * inside an RCU read-side section.
 */
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
        bool entered = false;

        rcu_read_lock();
        if (percpu_ref_tryget_live_rcu(&q->q_usage_counter)) {
                /*
                 * The code that increments the pm_only counter must ensure
                 * that the counter is globally visible before the queue is
                 * unfrozen.
                 */
                if (blk_queue_pm_only(q) &&
                    (!pm || queue_rpm_status(q) == RPM_SUSPENDED))
                        blk_queue_exit(q);
                else
                        entered = true;
        }
        rcu_read_unlock();

        return entered;
}

/*
 * Enter the queue that @bio's block device belongs to.
 *
 * Returns 0 when blk_try_enter_queue() succeeds; otherwise returns whatever
 * __bio_queue_enter() returns.
 */
static inline int bio_queue_enter(struct bio *bio)
{
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);

        /* Fast path first; only fall back when it fails. */
        return blk_try_enter_queue(q, false) ? 0 : __bio_queue_enter(q, bio);
}

/*
 * Wait for @done in I/O wait state.
 *
 * When sysctl_hung_task_timeout_secs yields a non-zero slice, the wait is
 * repeated in slices of half that timeout until the completion fires;
 * otherwise a single untimed wait is used.
 */
static inline void blk_wait_io(struct completion *done)
{
        /* Prevent hang_check timer from firing at us during very long I/O */
        unsigned long slice = sysctl_hung_task_timeout_secs * HZ / 2;

        if (!slice) {
                wait_for_completion_io(done);
                return;
        }

        for (;;) {
                if (wait_for_completion_io_timeout(done, slice))
                        break;
        }
}

/* NOTE(review): presumably the number of bio_vecs embedded inline in a bio; confirm at the users. */
#define BIO_INLINE_VECS 4
/* bio_vec array allocation/release against a mempool; @nr_vecs is passed by pointer to bvec_alloc(). */
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
                gfp_t gfp_mask);
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);

/* Merge attempt of (@page, @len, @offset) into @bv; @same_page is an output flag. */
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
                struct page *page, unsigned len, unsigned offset,
                bool *same_page);

/*
 * Decide whether @vec2 can be physically merged behind @vec1 on queue @q.
 *
 * Returns true only when all of the following hold: KMSAN is not enabled,
 * @vec2 starts at the physical address right after @vec1 ends, Xen (if
 * running as a Xen domain) allows the merge, and the first byte of @vec1 and
 * the last byte of @vec2 agree on every address bit outside the queue's
 * segment boundary mask.
 */
static inline bool biovec_phys_mergeable(struct request_queue *q,
                struct bio_vec *vec1, struct bio_vec *vec2)
{
        unsigned long boundary = queue_segment_boundary(q);
        phys_addr_t first = page_to_phys(vec1->bv_page) + vec1->bv_offset;
        phys_addr_t second = page_to_phys(vec2->bv_page) + vec2->bv_offset;

        /*
         * Merging adjacent physical pages may not work correctly under KMSAN
         * if their metadata pages aren't adjacent. Just disable merging.
         */
        if (IS_ENABLED(CONFIG_KMSAN))
                return false;

        /* The two vectors must be back to back in physical memory. */
        if (first + vec1->bv_len != second)
                return false;

        if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page))
                return false;

        /* The merged range must not straddle a segment boundary. */
        return (first | boundary) == ((second + vec2->bv_len - 1) | boundary);
}

static inline bool __bvec_gap_to_prev(const struct queue_limits *lim,
                struct bio_vec *bprv, unsigned int offset)
{
        return (offset & lim->virt_boundary_mask) ||
                ((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask);
}

/*
 * Would placing a bio_vec that starts at @offset right after @bprv leave a
 * gap in the SG list?  Most drivers don't care about this, but some do.
 * A zero virt_boundary_mask means there is nothing to check.
 */
static inline bool bvec_gap_to_prev(const struct queue_limits *lim,
                struct bio_vec *bprv, unsigned int offset)
{
        return lim->virt_boundary_mask &&
                __bvec_gap_to_prev(lim, bprv, offset);
}

/*
 * Report whether @rq is a candidate for merging.
 *
 * Passthrough requests, the FLUSH / WRITE_ZEROES / ZONE_APPEND operations,
 * and requests carrying any of REQ_NOMERGE_FLAGS or RQF_NOMERGE_FLAGS are
 * never mergeable.
 */
static inline bool rq_mergeable(struct request *rq)
{
        if (blk_rq_is_passthrough(rq))
                return false;

        switch (req_op(rq)) {
        case REQ_OP_FLUSH:
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_ZONE_APPEND:
                return false;
        default:
                break;
        }

        return !(rq->cmd_flags & REQ_NOMERGE_FLAGS) &&
                !(rq->rq_flags & RQF_NOMERGE_FLAGS);
}

/*
 * DISCARD merges come in two flavours:
 *  1) With max_discard_segments > 1 the driver treats every bio as a range
 *     and sends the bios to the controller together; the ranges need not be
 *     contiguous.
 *  2) Otherwise the request is handled like a normal read/write request and
 *     the ranges must be contiguous.
 *
 * Returns true for case 1.
 */
static inline bool blk_discard_mergable(struct request *req)
{
        return req_op(req) == REQ_OP_DISCARD &&
                queue_max_discard_segments(req->q) > 1;
}

/*
 * Segment limit that applies to @rq: the queue's discard segment limit for
 * REQ_OP_DISCARD, the regular segment limit for everything else.
 */
static inline unsigned int blk_rq_get_max_segments(struct request *rq)
{
        return req_op(rq) == REQ_OP_DISCARD ?
                queue_max_discard_segments(rq->q) :
                queue_max_segments(rq->q);
}

/*
 * Sector limit of @q for operation @op.
 *
 * WRITE_ZEROES uses max_write_zeroes_sectors; DISCARD and SECURE_ERASE use
 * max_discard_sectors capped at UINT_MAX >> SECTOR_SHIFT; every other
 * operation uses max_sectors.
 */
static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
                                                     enum req_op op)
{
        if (unlikely(op == REQ_OP_WRITE_ZEROES))
                return q->limits.max_write_zeroes_sectors;

        if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
                return min(q->limits.max_discard_sectors,
                           UINT_MAX >> SECTOR_SHIFT);

        return q->limits.max_sectors;
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
/* Integrity helpers implemented outside this header (CONFIG_BLK_DEV_INTEGRITY builds only). */
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
void bio_integrity_free(struct bio *bio);
/*
 * Returns true when @bio has no integrity payload; otherwise returns the
 * result of __bio_integrity_endio().
 */
static inline bool bio_integrity_endio(struct bio *bio)
{
        if (!bio_integrity(bio))
                return true;
        return __bio_integrity_endio(bio);
}

/*
 * Integrity-aware merge checks for request/request and request/bio merges.
 * The !CONFIG_BLK_DEV_INTEGRITY stubs of these always return true.
 */
bool blk_integrity_merge_rq(struct request_queue *, struct request *,
                struct request *);
bool blk_integrity_merge_bio(struct request_queue *, struct request *,
                struct bio *);

/*
 * Gap check for appending @next behind @req: compares the last integrity
 * vector of @req->bio with the offset of the first integrity vector of
 * @next, using the queue limits of @req.
 */
static inline bool integrity_req_gap_back_merge(struct request *req,
                struct bio *next)
{
        struct bio_integrity_payload *tail = bio_integrity(req->bio);
        struct bio_integrity_payload *head = bio_integrity(next);
        struct bio_vec *last = &tail->bip_vec[tail->bip_vcnt - 1];

        return bvec_gap_to_prev(&req->q->limits, last,
                                head->bip_vec[0].bv_offset);
}

/*
 * Gap check for placing @bio in front of @req: compares the last integrity
 * vector of @bio with the offset of the first integrity vector of
 * @req->bio, using the queue limits of @req.
 */
static inline bool integrity_req_gap_front_merge(struct request *req,
                struct bio *bio)
{
        struct bio_integrity_payload *tail = bio_integrity(bio);
        struct bio_integrity_payload *head = bio_integrity(req->bio);
        struct bio_vec *last = &tail->bip_vec[tail->bip_vcnt - 1];

        return bvec_gap_to_prev(&req->q->limits, last,
                                head->bip_vec[0].bv_offset);
}

/* Attribute group for integrity; only declared when CONFIG_BLK_DEV_INTEGRITY is set. */
extern const struct attribute_group blk_integrity_attr_group;
#else /* CONFIG_BLK_DEV_INTEGRITY */
/*
 * Stubs used when CONFIG_BLK_DEV_INTEGRITY is disabled: merges are always
 * allowed, no integrity gap is ever reported, endio always reports true and
 * the remaining hooks do nothing.
 */
static inline bool blk_integrity_merge_rq(struct request_queue *rq,
                struct request *r1, struct request *r2)
{
        return true;
}
static inline bool blk_integrity_merge_bio(struct request_queue *rq,
                struct request *r, struct bio *b)
{
        return true;
}
static inline bool integrity_req_gap_back_merge(struct request *req,
                struct bio *next)
{
        return false;
}
static inline bool integrity_req_gap_front_merge(struct request *req,
                struct bio *bio)
{
        return false;
}

static inline void blk_flush_integrity(void)
{
}
static inline bool bio_integrity_endio(struct bio *bio)
{
        return true;
}
static inline void bio_integrity_free(struct bio *bio)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */

/* Request timeout helpers (implemented outside this header). */
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);

/*
 * Result of a bio merge attempt, as returned by bio_attempt_back_merge().
 * NOTE(review): the exact meaning of NONE vs FAILED is defined by the merge
 * code, which is not visible from this header.
 */
enum bio_merge_status {
        BIO_MERGE_OK,
        BIO_MERGE_NONE,
        BIO_MERGE_FAILED,
};

/* Bio merge entry points; @nr_segs is the segment count of @bio. */
enum bio_merge_status bio_attempt_back_merge(struct request *req,
                struct bio *bio, unsigned int nr_segs);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
                unsigned int nr_segs);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
                        struct bio *bio, unsigned int nr_segs);

/*
 * Plug flush limits
 */
#define BLK_MAX_REQUEST_COUNT        32
#define BLK_PLUG_FLUSH_SIZE        (128 * 1024)

/*
 * Internal elevator interface
 */
/* Non-zero when RQF_HASHED is set in the request's rq_flags. */
#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)

bool blk_insert_flush(struct request *rq);

int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
void elevator_disable(struct request_queue *q);
void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);

/* Partition attribute callbacks; all use the device_attribute show/store signatures. */
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_fail_show(struct device *dev, struct device_attribute *attr,
                char *buf);
ssize_t part_fail_store(struct device *dev, struct device_attribute *attr,
                const char *buf, size_t count);
ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
                                const char *, size_t);

static inline bool bio_may_exceed_limits(struct bio *bio,
                                         const struct queue_limits *lim)
{
        switch (bio_op(bio)) {
        case REQ_OP_DISCARD:
        case REQ_OP_SECURE_ERASE:
        case REQ_OP_WRITE_ZEROES:
                return true; /* non-trivial splitting decisions */
        default:
                break;
        }

        /*
         * All drivers must accept single-segments bios that are <= PAGE_SIZE.
         * This is a quick and dirty check that relies on the fact that
         * bi_io_vec[0] is always valid if a bio has data.  The check might
         * lead to occasional false negatives when bios are cloned, but compared
         * to the performance impact of cloned bios themselves the loop below
         * doesn't matter anyway.
         */
        return lim->chunk_sectors || bio->bi_vcnt != 1 ||
                bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}

/* Split/merge internals; @nr_segs of __bio_split_to_limits() is an output parameter (passed by pointer). */
struct bio *__bio_split_to_limits(struct bio *bio,
                                  const struct queue_limits *lim,
                                  unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
                unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
                                struct request *next);
unsigned int blk_recalc_rq_segments(struct request *rq);
bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);

/* Limits defaults and block-layer initialisation; both return an int status. */
int blk_set_default_limits(struct queue_limits *lim);
int blk_dev_init(void);

/*
 * Contribute to IO statistics IFF:
 *
 *        a) it's attached to a gendisk, and
 *        b) the queue had IO stats enabled when this request was started
 */
static inline bool blk_do_io_stat(struct request *rq)
{
        return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq);
}

/* Per-partition accounting helpers (implemented outside this header). */
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
unsigned int part_in_flight(struct block_device *part);

/*
 * Flag @req with REQ_NOMERGE and, if it is the queue's cached last_merge
 * candidate, clear that cache entry.
 */
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
        req->cmd_flags |= REQ_NOMERGE;

        if (q->last_merge != req)
                return;
        q->last_merge = NULL;
}

/*
 * Internal io_context interface
 */
struct io_cq *ioc_find_get_icq(struct request_queue *q);
struct io_cq *ioc_lookup_icq(struct request_queue *q);
/* ioc_clear_queue() compiles to an empty inline unless CONFIG_BLK_ICQ is set. */
#ifdef CONFIG_BLK_ICQ
void ioc_clear_queue(struct request_queue *q);
#else
static inline void ioc_clear_queue(struct request_queue *q)
{
}
#endif /* CONFIG_BLK_ICQ */

/* Out-of-line bounce path; called by blk_queue_bounce() once its checks pass. */
struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);

static inline bool blk_queue_may_bounce(struct request_queue *q)
{
        return IS_ENABLED(CONFIG_BOUNCE) &&
                q->limits.bounce == BLK_BOUNCE_HIGH &&
                max_low_pfn >= max_pfn;
}

/*
 * Return @bio unchanged unless the queue may bounce and the bio has data,
 * in which case the result of __blk_queue_bounce() is returned instead.
 */
static inline struct bio *blk_queue_bounce(struct bio *bio,
                struct request_queue *q)
{
        if (likely(!(blk_queue_may_bounce(q) && bio_has_data(bio))))
                return bio;
        return __blk_queue_bounce(bio, q);
}

#ifdef CONFIG_BLK_DEV_ZONED
/* Per-disk zone resource setup/teardown (empty stubs without CONFIG_BLK_DEV_ZONED). */
void disk_init_zone_resources(struct gendisk *disk);
void disk_free_zone_resources(struct gendisk *disk);
/* True when @bio carries the BIO_ZONE_WRITE_PLUGGING flag. */
static inline bool bio_zone_write_plugging(struct bio *bio)
{
        return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
}
static inline bool bio_is_zone_append(struct bio *bio)
{
        return bio_op(bio) == REQ_OP_ZONE_APPEND ||
                bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
}
/* Zone write plugging hooks (empty stubs without CONFIG_BLK_DEV_ZONED). */
void blk_zone_write_plug_bio_merged(struct bio *bio);
void blk_zone_write_plug_init_request(struct request *rq);
/*
 * Copy @rq's sector into @bio's iterator when @rq is a zone append request
 * or @bio uses zone write plugging; otherwise leave @bio untouched.
 */
static inline void blk_zone_update_request_bio(struct request *rq,
                                               struct bio *bio)
{
        if (req_op(rq) != REQ_OP_ZONE_APPEND && !bio_zone_write_plugging(bio))
                return;

        /*
         * For zone append requests, the request sector indicates the location
         * at which the BIO data was written. Return this value to the BIO
         * issuer through the BIO iter sector.
         * For plugged zone writes, which include emulated zone append, we need
         * the original BIO sector so that blk_zone_write_plug_bio_endio() can
         * lookup the zone write plug.
         */
        bio->bi_iter.bi_sector = rq->__sector;
}
void blk_zone_write_plug_bio_endio(struct bio *bio);
/*
 * Completion hook for @bio: only bios that use zone write plugging are
 * forwarded to blk_zone_write_plug_bio_endio().
 */
static inline void blk_zone_bio_endio(struct bio *bio)
{
        if (!bio_zone_write_plugging(bio))
                return;

        /*
         * For write BIOs to zoned devices, signal the completion of the BIO so
         * that the next write BIO can be submitted by zone write plugging.
         */
        blk_zone_write_plug_bio_endio(bio);
}

void blk_zone_write_plug_finish_request(struct request *rq);
/*
 * Finish hook for @rq: only requests flagged RQF_ZONE_WRITE_PLUGGING are
 * forwarded to blk_zone_write_plug_finish_request().
 */
static inline void blk_zone_finish_request(struct request *rq)
{
        if (!(rq->rq_flags & RQF_ZONE_WRITE_PLUGGING))
                return;
        blk_zone_write_plug_finish_request(rq);
}
/* Zone ioctl handlers; the !CONFIG_BLK_DEV_ZONED stubs of both return -ENOTTY. */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
                unsigned long arg);
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
                unsigned int cmd, unsigned long arg);
#else /* CONFIG_BLK_DEV_ZONED */
/*
 * Stubs used when CONFIG_BLK_DEV_ZONED is disabled: the predicates report
 * false, the hooks do nothing and the zone ioctls return -ENOTTY.
 */
static inline void disk_init_zone_resources(struct gendisk *disk)
{
}
static inline void disk_free_zone_resources(struct gendisk *disk)
{
}
static inline bool bio_zone_write_plugging(struct bio *bio)
{
        return false;
}
static inline bool bio_is_zone_append(struct bio *bio)
{
        return false;
}
static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
{
}
static inline void blk_zone_write_plug_init_request(struct request *rq)
{
}
static inline void blk_zone_update_request_bio(struct request *rq,
                                               struct bio *bio)
{
}
static inline void blk_zone_bio_endio(struct bio *bio)
{
}
static inline void blk_zone_finish_request(struct request *rq)
{
}
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
                unsigned int cmd, unsigned long arg)
{
        return -ENOTTY;
}
static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
                blk_mode_t mode, unsigned int cmd, unsigned long arg)
{
        return -ENOTTY;
}
#endif /* CONFIG_BLK_DEV_ZONED */

/* block_device object management (implemented outside this header). */
struct block_device *bdev_alloc(struct gendisk *disk, u8 partno);
void bdev_add(struct block_device *bdev, dev_t dev);
void bdev_unhash(struct block_device *bdev);
void bdev_drop(struct block_device *bdev);

/* Extended minor number allocation/release. */
int blk_alloc_ext_minor(void);
void blk_free_ext_minor(unsigned int minor);
/* NOTE(review): flag values for adding partitions; their consumer is not visible in this header. */
#define ADDPART_FLAG_NONE        0
#define ADDPART_FLAG_RAID        1
#define ADDPART_FLAG_WHOLEDISK        2
/* Partition add/delete/resize by partition number; @start and @length are sector_t values. */
int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
                sector_t length);
int bdev_del_partition(struct gendisk *disk, int partno);
int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
                sector_t length);
void drop_partition(struct block_device *part);

void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);

struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
                struct lock_class_key *lkclass);

/* Adds (@page, @len, @offset) to @bio under queue @q; @same_page is an output flag. */
int bio_add_hw_page(struct request_queue *q, struct bio *bio,
                struct page *page, unsigned int len, unsigned int offset,
                unsigned int max_sectors, bool *same_page);

/*
 * Clean up a page appropriately, where the page may be pinned, may have a
 * ref taken on it or neither.  Only the pinned case needs any action here:
 * when @bio is flagged BIO_PAGE_PINNED the page is unpinned.
 */
static inline void bio_release_page(struct bio *bio, struct page *page)
{
        if (!bio_flagged(bio, BIO_PAGE_PINNED))
                return;
        unpin_user_page(page);
}

/* Request queue allocation from a set of limits on NUMA node @node_id. */
struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);

int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);

/* Disk event handling interface and its device attributes. */
int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);
void disk_del_events(struct gendisk *disk);
void disk_release_events(struct gendisk *disk);
void disk_block_events(struct gendisk *disk);
void disk_unblock_events(struct gendisk *disk);
void disk_flush_events(struct gendisk *disk, unsigned int mask);
extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs;

extern struct attribute_group blk_trace_attr_group;

/* Block device file helpers and ioctl entry points (native and compat). */
blk_mode_t file_to_blk_mode(struct file *file);
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
                loff_t lstart, loff_t lend);
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);

extern const struct address_space_operations def_blk_aops;

int disk_register_independent_access_ranges(struct gendisk *disk);
void disk_unregister_independent_access_ranges(struct gendisk *disk);

/*
 * Fault-injection hook.  With CONFIG_FAIL_MAKE_REQUEST the decision is made
 * out of line; without it the inline stub always reports false.
 */
#ifdef CONFIG_FAIL_MAKE_REQUEST
bool should_fail_request(struct block_device *part, unsigned int bytes);
#else /* CONFIG_FAIL_MAKE_REQUEST */
static inline bool should_fail_request(struct block_device *part,
                                        unsigned int bytes)
{
        return false;
}
#endif /* CONFIG_FAIL_MAKE_REQUEST */

/*
 * Optimized request reference counting. Ideally we'd make timeouts be more
 * clever, as that's the only reason we need references at all... But until
 * this happens, this is faster than using refcount_t. Also see:
 *
 * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count")
 *
 * req_ref_zero_or_close_to_overflow() is true when the counter, taken as a
 * signed int, lies in [-127, 0]: adding 127 to the unsigned value lands in
 * [0, 127] exactly for those values.  @req is parenthesized so that any
 * pointer expression (e.g. "p + 1") can be passed safely.
 */
#define req_ref_zero_or_close_to_overflow(req)        \
        ((unsigned int) atomic_read(&((req)->ref)) + 127u <= 127u)

/* Take a reference unless the count is already zero; returns the result of atomic_inc_not_zero(). */
static inline bool req_ref_inc_not_zero(struct request *req)
{
        return atomic_inc_not_zero(&req->ref);
}

/*
 * Drop a reference; returns true when the count reached zero.  Warns once
 * if the count was already zero or close to overflow before the decrement.
 */
static inline bool req_ref_put_and_test(struct request *req)
{
        WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
        return atomic_dec_and_test(&req->ref);
}

/* Overwrite the reference count with @value. */
static inline void req_ref_set(struct request *req, int value)
{
        atomic_set(&req->ref, value);
}

/* Read the current reference count. */
static inline int req_ref_read(struct request *req)
{
        return atomic_read(&req->ref);
}

/*
 * Current time in nanoseconds, cached in the task's plug when possible.
 *
 * Without a plug, or outside task context, this is a plain ktime_get_ns().
 * Otherwise the first call stores the timestamp in plug->cur_ktime and sets
 * PF_BLOCK_TS on the task; later calls return the stored value.
 */
static inline u64 blk_time_get_ns(void)
{
        struct blk_plug *plug = current->plug;

        if (plug && in_task()) {
                /*
                 * 0 could very well be a valid time, but rather than flag
                 * "this is a valid timestamp" separately, just accept that
                 * we'll do an extra ktime_get_ns() if we just happen to get
                 * 0 as the current time.
                 */
                if (!plug->cur_ktime) {
                        plug->cur_ktime = ktime_get_ns();
                        current->flags |= PF_BLOCK_TS;
                }
                return plug->cur_ktime;
        }

        return ktime_get_ns();
}

static inline ktime_t blk_time_get(void)
{
        return ns_to_ktime(blk_time_get_ns());
}

/*
 * From most significant bit:
 * 1 bit: reserved for other usage, see below
 * 12 bits: original size of bio
 * 51 bits: issue time of bio
 *
 * With the widths below this gives BIO_ISSUE_RES_SHIFT == 63 and
 * BIO_ISSUE_SIZE_SHIFT == 51, so the time occupies bits 0-50, the size
 * bits 51-62 and the reserved flag bit 63.
 */
#define BIO_ISSUE_RES_BITS      1
#define BIO_ISSUE_SIZE_BITS     12
#define BIO_ISSUE_RES_SHIFT     (64 - BIO_ISSUE_RES_BITS)
#define BIO_ISSUE_SIZE_SHIFT    (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
#define BIO_ISSUE_TIME_MASK     ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
#define BIO_ISSUE_SIZE_MASK     \
        (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
#define BIO_ISSUE_RES_MASK      (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))

/* Reserved bit for blk-throtl */
#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)

/* Keep only the bits of @time that fit the issue-time field. */
static inline u64 __bio_issue_time(u64 time)
{
        return time & BIO_ISSUE_TIME_MASK;
}

/* Extract the issue-time field from @issue->value. */
static inline u64 bio_issue_time(struct bio_issue *issue)
{
        return __bio_issue_time(issue->value);
}

/* Extract the size field from @issue->value, shifted down to bit 0. */
static inline sector_t bio_issue_size(struct bio_issue *issue)
{
        return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
}

/*
 * Stamp @issue with the current block time and @size.
 *
 * The reserved bit(s) already present in @issue->value are preserved; @size
 * is truncated to BIO_ISSUE_SIZE_BITS bits and the time to the width of
 * BIO_ISSUE_TIME_MASK.
 */
static inline void bio_issue_init(struct bio_issue *issue,
                                       sector_t size)
{
        u64 packed = issue->value & BIO_ISSUE_RES_MASK;

        size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;

        packed |= blk_time_get_ns() & BIO_ISSUE_TIME_MASK;
        packed |= (u64)size << BIO_ISSUE_SIZE_SHIFT;

        issue->value = packed;
}

/* Block device open/release/permission helpers (implemented outside this header). */
void bdev_release(struct file *bdev_file);
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
              const struct blk_holder_ops *hops, struct file *bdev_file);
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder);

#endif /* BLK_INTERNAL_H */

























   28 










   31 

   31 

   29 
   25 





























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Access kernel or user memory without faulting.
 */
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>

/*
 * Default (weak) policy hook: every source range is allowed.  Being __weak,
 * another definition can replace it at link time.  A false result makes
 * copy_from_kernel_nofault() and strncpy_from_kernel_nofault() return
 * -ERANGE.
 */
bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
                size_t size)
{
        return true;
}

/*
 * Copy @type-sized units while at least sizeof(type) bytes remain in @len.
 * @dst, @src and @len are modified in place, so callers must pass
 * modifiable lvalues.  @err_label is handed to __get_kernel_nofault()
 * (presumably its jump target on a fault - confirm against the arch
 * definition).  Expands to a bare while statement.
 */
#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label)        \
        while (len >= sizeof(type)) {                                        \
                __get_kernel_nofault(dst, src, type, err_label);                \
                dst += sizeof(type);                                        \
                src += sizeof(type);                                        \
                len -= sizeof(type);                                        \
        }

/*
 * Copy @size bytes from kernel address @src to @dst with page faults
 * disabled.
 *
 * Returns 0 on success, -ERANGE when copy_from_kernel_nofault_allowed()
 * rejects the source range, and -EFAULT when an access faults.  On
 * architectures without efficient unaligned access the widest unit used is
 * limited by the combined alignment of @dst and @src.
 */
long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
{
        unsigned long misalign;

        if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
                misalign = 0;
        else
                misalign = (unsigned long)dst | (unsigned long)src;

        if (!copy_from_kernel_nofault_allowed(src, size))
                return -ERANGE;

        pagefault_disable();
        /* Widest permitted unit first; each loop leaves less than one unit. */
        if ((misalign & 7) == 0)
                copy_from_kernel_nofault_loop(dst, src, size, u64, fault);
        if ((misalign & 3) == 0)
                copy_from_kernel_nofault_loop(dst, src, size, u32, fault);
        if ((misalign & 1) == 0)
                copy_from_kernel_nofault_loop(dst, src, size, u16, fault);
        copy_from_kernel_nofault_loop(dst, src, size, u8, fault);
        pagefault_enable();
        return 0;

fault:
        pagefault_enable();
        return -EFAULT;
}
EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);

/*
 * Write-side twin of copy_from_kernel_nofault_loop(): store @type-sized
 * units through __put_kernel_nofault() while at least sizeof(type) bytes
 * remain in @len.  @dst, @src and @len are modified in place; expands to a
 * bare while statement.
 */
#define copy_to_kernel_nofault_loop(dst, src, len, type, err_label)        \
        while (len >= sizeof(type)) {                                        \
                __put_kernel_nofault(dst, src, type, err_label);                \
                dst += sizeof(type);                                        \
                src += sizeof(type);                                        \
                len -= sizeof(type);                                        \
        }

/*
 * Copy @size bytes from @src to kernel address @dst with page faults
 * disabled.
 *
 * Returns 0 on success and -EFAULT when an access faults.  On architectures
 * without efficient unaligned access the widest unit used is limited by the
 * combined alignment of @dst and @src.
 */
long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
{
        unsigned long misalign;

        if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
                misalign = 0;
        else
                misalign = (unsigned long)dst | (unsigned long)src;

        pagefault_disable();
        /* Widest permitted unit first; each loop leaves less than one unit. */
        if ((misalign & 7) == 0)
                copy_to_kernel_nofault_loop(dst, src, size, u64, fault);
        if ((misalign & 3) == 0)
                copy_to_kernel_nofault_loop(dst, src, size, u32, fault);
        if ((misalign & 1) == 0)
                copy_to_kernel_nofault_loop(dst, src, size, u16, fault);
        copy_to_kernel_nofault_loop(dst, src, size, u8, fault);
        pagefault_enable();
        return 0;

fault:
        pagefault_enable();
        return -EFAULT;
}

/**
 * strncpy_from_kernel_nofault: - Copy a NUL terminated string from an unsafe
 *                                kernel address.
 * @dst:   Destination buffer; must hold at least @count bytes.
 * @unsafe_addr: Unsafe kernel address to read from.
 * @count: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Copies byte by byte with page faults disabled until a NUL has been copied
 * or @count bytes have been read.  The last byte written is always forced
 * to NUL, so a string of @count or more characters is truncated.
 *
 * Returns the number of bytes read from @unsafe_addr (including the NUL
 * when one was found), 0 if @count <= 0, -ERANGE if
 * copy_from_kernel_nofault_allowed() rejects the range, or -EFAULT if an
 * access faults (in which case a NUL is stored at the current write
 * position).
 */
long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
        const void *src = unsafe_addr;

        if (unlikely(count <= 0))
                return 0;
        if (!copy_from_kernel_nofault_allowed(unsafe_addr, count))
                return -ERANGE;

        pagefault_disable();
        do {
                __get_kernel_nofault(dst, src, u8, Efault);
                dst++;
                src++;
                /* dst[-1] is the byte just stored; stop on NUL or at the limit */
        } while (dst[-1] && src - unsafe_addr < count);
        pagefault_enable();

        /* Terminate: a no-op if a NUL was copied, a truncation otherwise. */
        dst[-1] = '\0';
        return src - unsafe_addr;
Efault:
        pagefault_enable();
        /* dst has not advanced past the faulting byte, so terminate there. */
        dst[0] = '\0';
        return -EFAULT;
}

/**
 * copy_from_user_nofault(): safely attempt to read from a user-space location
 * @dst: pointer to the buffer that shall take the data
 * @src: address to read from. This must be a user address.
 * @size: size of the data chunk
 *
 * Safely read from user address @src to the buffer at @dst.  Returns 0 on
 * success.  Returns -EFAULT if the range fails __access_ok(), if
 * nmi_uaccess_okay() says user access is not allowed right now, or if the
 * copy (done with page faults disabled) could not be completed.
 */
long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
{
        long left;

        if (!__access_ok(src, size) || !nmi_uaccess_okay())
                return -EFAULT;

        pagefault_disable();
        left = __copy_from_user_inatomic(dst, src, size);
        pagefault_enable();

        return left ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(copy_from_user_nofault);

/**
 * copy_to_user_nofault(): safely attempt to write to a user-space location
 * @dst: address to write to
 * @src: pointer to the data that shall be written
 * @size: size of the data chunk
 *
 * Safely write to address @dst from the buffer at @src.  Returns 0 on
 * success.  Returns -EFAULT if the range fails access_ok() or if the copy
 * (done with page faults disabled) could not be completed.
 */
long copy_to_user_nofault(void __user *dst, const void *src, size_t size)
{
        long left;

        if (!access_ok(dst, size))
                return -EFAULT;

        pagefault_disable();
        left = __copy_to_user_inatomic(dst, src, size);
        pagefault_enable();

        return left ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(copy_to_user_nofault);

/**
 * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user
 *                                address.
 * @dst:   Destination address, in kernel space.  This buffer must be at
 *         least @count bytes long.
 * @unsafe_addr: Unsafe user address.
 * @count: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Copies a NUL-terminated string from unsafe user address to kernel buffer,
 * with page faults disabled around the copy.
 *
 * A positive result from strncpy_from_user() that is below @count is
 * returned incremented by one, i.e. as a length INCLUDING the trailing NUL.
 *
 * If @count is smaller than the length of the string, copies @count-1 bytes,
 * sets the last byte of @dst buffer to NUL and returns @count.
 *
 * A zero or negative result from strncpy_from_user() (e.g. -EFAULT when the
 * access fails) is returned unchanged.  Returns 0 when @count <= 0.
 */
long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
                              long count)
{
        long copied;

        if (unlikely(count <= 0))
                return 0;

        pagefault_disable();
        copied = strncpy_from_user(dst, unsafe_addr, count);
        pagefault_enable();

        /* Error or nothing copied: hand the result back untouched. */
        if (copied <= 0)
                return copied;

        /* The string fit: account for the trailing NUL. */
        if (copied < count)
                return copied + 1;

        /* Buffer filled without seeing a NUL: truncate. */
        dst[count - 1] = '\0';
        return count;
}

/**
 * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL.
 * @unsafe_addr: The string to measure.
 * @count: Maximum count (including NUL)
 *
 * Get the size of a NUL-terminated string in user space without pagefault.
 *
 * Returns the size of the string INCLUDING the terminating NUL.
 *
 * If the string is too long, returns a number larger than @count. User
 * has to check the return value against "> count".
 * On exception (or invalid count), returns 0.
 *
 * Unlike strnlen_user, this can be used from IRQ handler etc. because
 * it disables pagefaults.
 */
long strnlen_user_nofault(const void __user *unsafe_addr, long count)
{
        /*
         * Keep the result in a long: both @count and this function's return
         * type are long, and an int would truncate results above INT_MAX.
         */
        long ret;

        pagefault_disable();
        ret = strnlen_user(unsafe_addr, count);
        pagefault_enable();

        return ret;
}

/*
 * Unconditionally emit a kernel warning reporting that a copy of @count
 * bytes was attempted on an object of @size bytes.
 */
void __copy_overflow(int size, unsigned long count)
{
        WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
}
EXPORT_SYMBOL(__copy_overflow);



























    1 























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
// SPDX-License-Identifier: GPL-2.0-only
/*
 * partition.c
 *
 * PURPOSE
 *      Partition handling routines for the OSTA-UDF(tm) filesystem.
 *
 * COPYRIGHT
 *  (C) 1998-2001 Ben Fennema
 *
 * HISTORY
 *
 * 12/06/98 blf  Created file.
 *
 */

#include "udfdecl.h"
#include "udf_sb.h"
#include "udf_i.h"

#include <linux/fs.h>
#include <linux/string.h>
#include <linux/mutex.h>

/*
 * Translate a partition-relative block into a block number on the device.
 *
 * @sb:        superblock of the UDF filesystem
 * @block:     block number within @partition
 * @partition: index into the superblock's partition map array
 * @offset:    additional block offset added to @block
 *
 * Partitions with a translation callback (s_partition_func) delegate to it;
 * all others are mapped linearly from s_partition_root.
 *
 * Returns the translated block, or 0xFFFFFFFF if @partition is out of range.
 */
uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
                        uint16_t partition, uint32_t offset)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_part_map *pmap;

        if (partition < sbi->s_partitions) {
                pmap = &sbi->s_partmaps[partition];
                if (!pmap->s_partition_func)
                        return pmap->s_partition_root + block + offset;
                return pmap->s_partition_func(sb, block, partition, offset);
        }

        udf_debug("block=%u, partition=%u, offset=%u: invalid partition\n",
                  block, partition, offset);
        return 0xFFFFFFFF;
}

/*
 * Translate a block of a virtual partition through the VAT.
 *
 * @sb:        superblock of the UDF filesystem
 * @block:     virtual block number, used as an index into the VAT entries
 * @partition: index of the virtual partition in the partition map array
 * @offset:    additional block offset passed on to the final translation
 *
 * The VAT entry for @block is read either straight from the VAT inode's
 * in-ICB data or from the VAT file block that holds it, and the resulting
 * location is then translated in the partition that stores the VAT inode.
 *
 * Returns the translated block, or 0xFFFFFFFF on any failure.
 */
uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
                               uint16_t partition, uint32_t offset)
{
        struct buffer_head *bh = NULL;
        uint32_t newblock;
        uint32_t index;
        uint32_t loc;
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_part_map *map;
        struct udf_virtual_data *vdata;
        struct udf_inode_info *iinfo = UDF_I(sbi->s_vat_inode);
        int err;

        map = &sbi->s_partmaps[partition];
        vdata = &map->s_type_specific.s_virtual;

        /*
         * s_num_entries is the number of VAT entries, so the valid indices
         * are 0 .. s_num_entries - 1.  Reject block == s_num_entries too,
         * otherwise one slot past the table would be read below.
         */
        if (block >= vdata->s_num_entries) {
                udf_debug("Trying to access block beyond end of VAT (%u max %u)\n",
                          block, vdata->s_num_entries);
                return 0xFFFFFFFF;
        }

        /* VAT stored inside the inode itself: index the in-ICB data. */
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                loc = le32_to_cpu(((__le32 *)(iinfo->i_data +
                        vdata->s_start_offset))[block]);
                goto translate;
        }

        /*
         * The first block of the VAT file loses s_start_offset bytes to the
         * header, so it holds fewer entries than the following blocks.
         */
        index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t);
        if (block >= index) {
                block -= index;
                newblock = 1 + (block / (sb->s_blocksize / sizeof(uint32_t)));
                index = block % (sb->s_blocksize / sizeof(uint32_t));
        } else {
                newblock = 0;
                index = vdata->s_start_offset / sizeof(uint32_t) + block;
        }

        bh = udf_bread(sbi->s_vat_inode, newblock, 0, &err);
        if (!bh) {
                udf_debug("get_pblock(UDF_VIRTUAL_MAP:%p,%u,%u)\n",
                          sb, block, partition);
                return 0xFFFFFFFF;
        }

        loc = le32_to_cpu(((__le32 *)bh->b_data)[index]);

        brelse(bh);

translate:
        /*
         * The VAT inode living in the virtual partition itself would make
         * the translation below recurse into this function forever.
         */
        if (iinfo->i_location.partitionReferenceNum == partition) {
                udf_debug("recursive call to udf_get_pblock!\n");
                return 0xFFFFFFFF;
        }

        return udf_get_pblock(sb, loc,
                              iinfo->i_location.partitionReferenceNum,
                              offset);
}

/*
 * Block translation entry point for the "virt20" virtual partition variant.
 * It forwards all arguments unchanged to udf_get_pblock_virt15() and returns
 * that function's result.
 */
inline uint32_t udf_get_pblock_virt20(struct super_block *sb, uint32_t block,
                                      uint16_t partition, uint32_t offset)
{
        return udf_get_pblock_virt15(sb, block, partition, offset);
}

/*
 * Translate a block of a sparable partition, honouring the sparing table.
 *
 * @sb:        superblock of the UDF filesystem
 * @block:     block number within @partition
 * @partition: index of the sparable partition in the partition map array
 * @offset:    additional block offset added to @block
 *
 * The packet containing the block is looked up in the first loaded copy of
 * the sparing table.  If the packet has been remapped, the block at the same
 * position inside the mapped packet is returned; otherwise the block is
 * mapped linearly from s_partition_root.
 */
uint32_t udf_get_pblock_spar15(struct super_block *sb, uint32_t block,
                               uint16_t partition, uint32_t offset)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_part_map *map = &sbi->s_partmaps[partition];
        struct udf_sparing_data *sdata = &map->s_type_specific.s_sparing;
        struct sparingTable *table = NULL;
        uint32_t packet = (block + offset) & ~(sdata->s_packet_len - 1);
        int idx;

        /* Use the first sparing table copy that is present. */
        for (idx = 0; idx < 4 && !table; idx++) {
                if (sdata->s_spar_map[idx] != NULL)
                        table = (struct sparingTable *)
                                        sdata->s_spar_map[idx]->b_data;
        }

        if (table) {
                for (idx = 0;
                     idx < le16_to_cpu(table->reallocationTableLen); idx++) {
                        struct sparingEntry *entry = &table->mapEntry[idx];
                        u32 orig = le32_to_cpu(entry->origLocation);

                        /*
                         * Stop at the first entry at or above 0xFFFFFFF0 or
                         * past the wanted packet: no match can follow.
                         */
                        if (orig >= 0xFFFFFFF0 || orig > packet)
                                break;

                        if (orig == packet)
                                return le32_to_cpu(entry->mappedLocation) +
                                        ((block + offset) &
                                                (sdata->s_packet_len - 1));
                }
        }

        /* Not remapped: plain linear mapping inside the partition. */
        return map->s_partition_root + block + offset;
}

/*
 * Remap a block to a spare packet through the sparing table.
 *
 * @sb:        superblock of the UDF filesystem
 * @old_block: block number to relocate; it is compared against each
 *             partition's s_partition_root / s_partition_len range
 * @new_block: out parameter, receives the relocated block on success
 *
 * The whole operation runs under sbi->s_alloc_mutex.
 *
 * Returns 0 when *new_block has been filled in, 1 when the block lies outside
 * every partition, no sparing table is loaded, or no spare entry is left.
 */
int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
{
        struct udf_sparing_data *sdata;
        struct sparingTable *st = NULL;
        struct sparingEntry mapEntry;
        uint32_t packet;
        int i, j, k, l;
        struct udf_sb_info *sbi = UDF_SB(sb);
        u16 reallocationTableLen;
        struct buffer_head *bh;
        int ret = 0;

        mutex_lock(&sbi->s_alloc_mutex);
        for (i = 0; i < sbi->s_partitions; i++) {
                struct udf_part_map *map = &sbi->s_partmaps[i];
                /*
                 * NOTE(review): the lower bound is exclusive, so a block equal
                 * to s_partition_root is never matched -- confirm intended.
                 */
                if (old_block > map->s_partition_root &&
                    old_block < map->s_partition_root + map->s_partition_len) {
                        sdata = &map->s_type_specific.s_sparing;
                        /* Start of the packet containing old_block. */
                        packet = (old_block - map->s_partition_root) &
                                                ~(sdata->s_packet_len - 1);

                        /* Pick the first loaded sparing table copy. */
                        for (j = 0; j < 4; j++)
                                if (sdata->s_spar_map[j] != NULL) {
                                        st = (struct sparingTable *)
                                                sdata->s_spar_map[j]->b_data;
                                        break;
                                }

                        if (!st) {
                                ret = 1;
                                goto out;
                        }

                        reallocationTableLen =
                                        le16_to_cpu(st->reallocationTableLen);
                        /*
                         * First pass: walk the entries in order, looking for
                         * an existing mapping of this packet or an entry whose
                         * origLocation is 0xFFFFFFFF (presumably "unused").
                         */
                        for (k = 0; k < reallocationTableLen; k++) {
                                struct sparingEntry *entry = &st->mapEntry[k];
                                u32 origLoc = le32_to_cpu(entry->origLocation);

                                if (origLoc == 0xFFFFFFFF) {
                                        /*
                                         * Claim this entry for the packet and
                                         * refresh the tag of every loaded copy
                                         * from index j onwards.
                                         * NOTE(review): "entry" still points
                                         * into the first copy found, so only
                                         * that copy's origLocation is written.
                                         */
                                        for (; j < 4; j++) {
                                                int len;
                                                bh = sdata->s_spar_map[j];
                                                if (!bh)
                                                        continue;

                                                st = (struct sparingTable *)
                                                                bh->b_data;
                                                entry->origLocation =
                                                        cpu_to_le32(packet);
                                                len =
                                                  sizeof(struct sparingTable) +
                                                  reallocationTableLen *
                                                  sizeof(struct sparingEntry);
                                                udf_update_tag((char *)st, len);
                                                mark_buffer_dirty(bh);
                                        }
                                        /* Same position inside the spare packet. */
                                        *new_block = le32_to_cpu(
                                                        entry->mappedLocation) +
                                                     ((old_block -
                                                        map->s_partition_root) &
                                                     (sdata->s_packet_len - 1));
                                        ret = 0;
                                        goto out;
                                } else if (origLoc == packet) {
                                        /* Packet already remapped: reuse it. */
                                        *new_block = le32_to_cpu(
                                                        entry->mappedLocation) +
                                                     ((old_block -
                                                        map->s_partition_root) &
                                                     (sdata->s_packet_len - 1));
                                        ret = 0;
                                        goto out;
                                } else if (origLoc > packet)
                                        /* k is where the packet must be inserted. */
                                        break;
                        }

                        /*
                         * Second pass: find an entry at or after k whose
                         * origLocation is 0xFFFFFFFF and move it to position
                         * k, shifting entries k..l-1 up by one.
                         */
                        for (l = k; l < reallocationTableLen; l++) {
                                struct sparingEntry *entry = &st->mapEntry[l];
                                u32 origLoc = le32_to_cpu(entry->origLocation);

                                if (origLoc != 0xFFFFFFFF)
                                        continue;

                                /* Apply the move to every loaded copy from j on. */
                                for (; j < 4; j++) {
                                        bh = sdata->s_spar_map[j];
                                        if (!bh)
                                                continue;

                                        st = (struct sparingTable *)bh->b_data;
                                        mapEntry = st->mapEntry[l];
                                        mapEntry.origLocation =
                                                        cpu_to_le32(packet);
                                        memmove(&st->mapEntry[k + 1],
                                                &st->mapEntry[k],
                                                (l - k) *
                                                sizeof(struct sparingEntry));
                                        st->mapEntry[k] = mapEntry;
                                        udf_update_tag((char *)st,
                                                sizeof(struct sparingTable) +
                                                reallocationTableLen *
                                                sizeof(struct sparingEntry));
                                        mark_buffer_dirty(bh);
                                }
                                /* st is now the last copy that was updated. */
                                *new_block =
                                        le32_to_cpu(
                                              st->mapEntry[k].mappedLocation) +
                                        ((old_block - map->s_partition_root) &
                                         (sdata->s_packet_len - 1));
                                ret = 0;
                                goto out;
                        }

                        /* No usable spare entry found. */
                        ret = 1;
                        goto out;
                } /* if old_block */
        }

        if (i == sbi->s_partitions) {
                /* outside of partitions */
                /* for now, fail =) */
                ret = 1;
        }

out:
        mutex_unlock(&sbi->s_alloc_mutex);
        return ret;
}

/*
 * Map a block of a metadata file to a block on the device.
 *
 * @inode:     metadata file (main or mirror) to look the block up in
 * @block:     block number inside the metadata file
 * @partition: index of the metadata partition in the partition map array
 * @offset:    additional block offset added to the extent offset
 *
 * Returns the translated block, or 0xFFFFFFFF unless the block is backed by
 * a recorded and allocated extent.
 */
static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
                                        uint16_t partition, uint32_t offset)
{
        struct super_block *sb = inode->i_sb;
        struct extent_position epos = {};
        struct kernel_lb_addr eloc;
        struct udf_part_map *map;
        sector_t ext_offset;
        uint32_t elen;
        uint32_t phyblock = 0xFFFFFFFF;

        if (inode_bmap(inode, block, &epos, &eloc, &elen, &ext_offset) ==
                                                (EXT_RECORDED_ALLOCATED >> 30)) {
                map = &UDF_SB(sb)->s_partmaps[partition];
                /* map to sparable/physical partition desc */
                phyblock = udf_get_pblock(sb, eloc.logicalBlockNum,
                        map->s_type_specific.s_metadata.s_phys_partition_ref,
                        ext_offset + offset);
        }

        /* Drop the buffer the extent lookup may have left in epos. */
        brelse(epos.bh);
        return phyblock;
}

/*
 * Translate a block of a metadata partition.
 *
 * @sb:        superblock of the UDF filesystem
 * @block:     block number within the metadata partition
 * @partition: index of the metadata partition in the partition map array
 * @offset:    additional block offset
 *
 * The main metadata file is tried first (or the mirror if no main file is
 * set).  If the main file cannot map the block, the mirror file is loaded on
 * first use and tried instead.
 *
 * Returns the translated block, or 0xFFFFFFFF on failure.
 */
uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block,
                                uint16_t partition, uint32_t offset)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_part_map *map;
        struct udf_meta_data *mdata;
        struct inode *inode;
        uint32_t retblk;

        udf_debug("READING from METADATA\n");

        map = &sbi->s_partmaps[partition];
        mdata = &map->s_type_specific.s_metadata;

        inode = mdata->s_metadata_fe;
        if (!inode)
                inode = mdata->s_mirror_fe;
        if (!inode)
                return 0xFFFFFFFF;

        retblk = udf_try_read_meta(inode, block, partition, offset);

        /* Done on success, or when the mirror was what we just tried. */
        if (retblk != 0xFFFFFFFF || !mdata->s_metadata_fe)
                return retblk;

        udf_warn(sb, "error reading from METADATA, trying to read from MIRROR\n");

        /* Load the mirror file once; remember the attempt even if it fails. */
        if (!(mdata->s_flags & MF_MIRROR_FE_LOADED)) {
                mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
                        mdata->s_mirror_file_loc,
                        mdata->s_phys_partition_ref);
                if (IS_ERR(mdata->s_mirror_fe))
                        mdata->s_mirror_fe = NULL;
                mdata->s_flags |= MF_MIRROR_FE_LOADED;
        }

        inode = mdata->s_mirror_fe;
        if (!inode)
                return 0xFFFFFFFF;

        return udf_try_read_meta(inode, block, partition, offset);
}


































































    1 












    1 







    1 









    1 


    1 




    1 















    1 


































    1 














    1 
    1 





    1 





    1 








































































































































































































































































































































































































































































































    1 




    1 






    1 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
// SPDX-License-Identifier: GPL-2.0-only
/*
 * balloc.c
 *
 * PURPOSE
 *        Block allocation handling routines for the OSTA-UDF(tm) filesystem.
 *
 * COPYRIGHT
 *  (C) 1999-2001 Ben Fennema
 *  (C) 1999 Stelias Computing Inc
 *
 * HISTORY
 *
 *  02/24/99 blf  Created.
 *
 */

#include "udfdecl.h"

#include <linux/bitops.h>

#include "udf_i.h"
#include "udf_sb.h"

/*
 * Bit helpers for the space bitmap, aliased to the "_le" bitops.
 * In this file a set bit marks a free block: freeing sets bits and
 * allocating clears them.  The set/clear aliases return the previous value
 * of the bit.
 */
#define udf_clear_bit        __test_and_clear_bit_le
#define udf_set_bit        __test_and_set_bit_le
#define udf_test_bit        test_bit_le
#define udf_find_next_one_bit        find_next_bit_le

/*
 * Read one block of the space bitmap and cache it.
 *
 * @sb:        superblock of the UDF filesystem
 * @bitmap:    bitmap descriptor whose s_block_bitmap[] cache is filled
 * @block:     block offset inside the bitmap extent to read
 * @bitmap_nr: slot in bitmap->s_block_bitmap[] that receives the buffer
 *
 * After reading, a run of bits at the start of the block is checked: they
 * must all be clear (presumably they describe blocks that can never be
 * free -- confirm against the on-disk format).
 *
 * Returns 0 on success, -EIO if the block cannot be read, or -EFSCORRUPTED
 * if the consistency check fails.  On either error the cache slot is left
 * NULL, so a later load retries instead of using a bad buffer.
 */
static int read_block_bitmap(struct super_block *sb,
                             struct udf_bitmap *bitmap, unsigned int block,
                             unsigned long bitmap_nr)
{
        struct buffer_head *bh = NULL;
        int i;
        int max_bits, off, count;
        struct kernel_lb_addr loc;

        loc.logicalBlockNum = bitmap->s_extPosition;
        loc.partitionReferenceNum = UDF_SB(sb)->s_partition;

        bh = sb_bread(sb, udf_get_lb_pblock(sb, &loc, block));
        bitmap->s_block_bitmap[bitmap_nr] = bh;
        if (!bh)
                return -EIO;

        /* Check consistency of Space Bitmap buffer. */
        max_bits = sb->s_blocksize * 8;
        if (!bitmap_nr) {
                /* The first block starts with the descriptor header. */
                off = sizeof(struct spaceBitmapDesc) << 3;
                count = min(max_bits - off, bitmap->s_nr_groups);
        } else {
                /*
                 * Rough check if bitmap number is too big to have any bitmap
                 * blocks reserved.
                 */
                if (bitmap_nr >
                    (bitmap->s_nr_groups >> (sb->s_blocksize_bits + 3)) + 2)
                        return 0;
                off = 0;
                count = bitmap->s_nr_groups - bitmap_nr * max_bits +
                                (sizeof(struct spaceBitmapDesc) << 3);
                count = min(count, max_bits);
        }

        for (i = 0; i < count; i++) {
                if (udf_test_bit(i + off, bh->b_data)) {
                        /*
                         * Do not leave the corrupted buffer in the cache:
                         * the loader treats a filled slot as already
                         * validated and would hand it out unchecked.
                         */
                        bitmap->s_block_bitmap[bitmap_nr] = NULL;
                        brelse(bh);
                        return -EFSCORRUPTED;
                }
        }
        return 0;
}

static int __load_block_bitmap(struct super_block *sb,
                               struct udf_bitmap *bitmap,
                               unsigned int block_group)
{
        int retval = 0;
        int nr_groups = bitmap->s_nr_groups;

        if (block_group >= nr_groups) {
                udf_debug("block_group (%u) > nr_groups (%d)\n",
                          block_group, nr_groups);
        }

        if (bitmap->s_block_bitmap[block_group])
                return block_group;

        retval = read_block_bitmap(sb, bitmap, block_group, block_group);
        if (retval < 0)
                return retval;

        return block_group;
}

/*
 * Load the bitmap block for @block_group and verify a buffer is present.
 *
 * Returns the cache slot index on success, the negative errno reported by
 * __load_block_bitmap(), or -EIO if the slot is still empty afterwards.
 */
static inline int load_block_bitmap(struct super_block *sb,
                                    struct udf_bitmap *bitmap,
                                    unsigned int block_group)
{
        int slot = __load_block_bitmap(sb, bitmap, block_group);

        if (slot >= 0 && !bitmap->s_block_bitmap[slot])
                return -EIO;

        return slot;
}

/*
 * Adjust the free-space counter of a partition in the logical volume
 * integrity descriptor.
 *
 * @sb:        superblock of the UDF filesystem
 * @partition: index into the descriptor's freeSpaceTable
 * @cnt:       amount added to the counter; callers pass a negated count to
 *             subtract
 *
 * Does nothing when no integrity descriptor buffer is attached.
 */
static void udf_add_free_space(struct super_block *sb, u16 partition, u32 cnt)
{
        struct udf_sb_info *sbi = UDF_SB(sb);

        if (sbi->s_lvid_bh) {
                struct logicalVolIntegrityDesc *lvid =
                        (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;

                le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
                udf_updated_lvid(sb);
        }
}

/*
 * Mark a run of blocks as free in the space bitmap.
 *
 * @sb:     superblock of the UDF filesystem
 * @bitmap: space bitmap to update
 * @bloc:   base location (partition reference and logical block)
 * @offset: block offset from @bloc at which the run starts
 * @count:  number of blocks to free
 *
 * The run [logicalBlockNum + offset, logicalBlockNum + offset + count) must
 * lie inside the partition; otherwise nothing is changed.  Runs crossing a
 * bitmap block boundary are handled one bitmap block at a time.  The whole
 * operation runs under sbi->s_alloc_mutex.
 */
static void udf_bitmap_free_blocks(struct super_block *sb,
                                   struct udf_bitmap *bitmap,
                                   struct kernel_lb_addr *bloc,
                                   uint32_t offset,
                                   uint32_t count)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct buffer_head *bh = NULL;
        struct udf_part_map *partmap;
        unsigned long block;
        unsigned long block_group;
        unsigned long bit;
        unsigned long i;
        int bitmap_nr;
        unsigned long overflow;
        uint64_t end;

        mutex_lock(&sbi->s_alloc_mutex);
        partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];

        /*
         * Validate the end of the run that is really touched, including
         * @offset.  The sum is formed in 64 bits so it cannot wrap around
         * and slip past the comparison.
         */
        end = (uint64_t)bloc->logicalBlockNum + offset + count;
        if (end > partmap->s_partition_len) {
                udf_debug("%u < %d || %u + %u > %u\n",
                          bloc->logicalBlockNum, 0,
                          bloc->logicalBlockNum, count,
                          partmap->s_partition_len);
                goto error_return;
        }

        /* Bit position in the bitmap, which starts after the descriptor. */
        block = bloc->logicalBlockNum + offset +
                (sizeof(struct spaceBitmapDesc) << 3);

        do {
                overflow = 0;
                block_group = block >> (sb->s_blocksize_bits + 3);
                bit = block % (sb->s_blocksize << 3);

                /*
                 * Check to see if we are freeing blocks across a group
                 * boundary; if so, handle only the part in this group now.
                 */
                if (bit + count > (sb->s_blocksize << 3)) {
                        overflow = bit + count - (sb->s_blocksize << 3);
                        count -= overflow;
                }
                bitmap_nr = load_block_bitmap(sb, bitmap, block_group);
                if (bitmap_nr < 0)
                        goto error_return;

                bh = bitmap->s_block_bitmap[bitmap_nr];
                for (i = 0; i < count; i++) {
                        /* A bit that was already set means a double free. */
                        if (udf_set_bit(bit + i, bh->b_data)) {
                                udf_debug("bit %lu already set\n", bit + i);
                                udf_debug("byte=%2x\n",
                                          ((__u8 *)bh->b_data)[(bit + i) >> 3]);
                        }
                }
                udf_add_free_space(sb, sbi->s_partition, count);
                mark_buffer_dirty(bh);
                if (overflow) {
                        /* Continue with the remainder in the next group. */
                        block += count;
                        count = overflow;
                }
        } while (overflow);

error_return:
        mutex_unlock(&sbi->s_alloc_mutex);
}

/*
 * Try to allocate a contiguous run of blocks starting at @first_block.
 *
 * @sb:          superblock of the UDF filesystem
 * @bitmap:      space bitmap to allocate from
 * @partition:   index of the partition in the partition map array
 * @first_block: first block of the wanted run
 * @block_count: number of blocks wanted; clamped to the partition end
 *
 * Blocks are taken one by one until the request is satisfied or the first
 * block that is not free is met.  The partition's free-space counter is
 * reduced by the number of blocks taken.  Runs under sbi->s_alloc_mutex.
 *
 * Returns the number of blocks actually allocated (possibly 0).
 */
static int udf_bitmap_prealloc_blocks(struct super_block *sb,
                                      struct udf_bitmap *bitmap,
                                      uint16_t partition, uint32_t first_block,
                                      uint32_t block_count)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        int alloc_count = 0;
        int bit, block, block_group;
        int bitmap_nr;
        struct buffer_head *bh;
        __u32 part_len;

        mutex_lock(&sbi->s_alloc_mutex);
        part_len = sbi->s_partmaps[partition].s_partition_len;
        if (first_block >= part_len)
                goto out;

        /*
         * Clamp without forming first_block + block_count, which could wrap
         * in 32 bits.  first_block < part_len holds here, so the subtraction
         * cannot underflow.
         */
        if (block_count > part_len - first_block)
                block_count = part_len - first_block;

        /*
         * Compute the starting bit once.  The inner loop advances "block",
         * so the next outer iteration continues in the following group
         * rather than restarting at first_block.
         */
        block = first_block + (sizeof(struct spaceBitmapDesc) << 3);

        do {
                block_group = block >> (sb->s_blocksize_bits + 3);

                bitmap_nr = load_block_bitmap(sb, bitmap, block_group);
                if (bitmap_nr < 0)
                        goto out;
                bh = bitmap->s_block_bitmap[bitmap_nr];

                bit = block % (sb->s_blocksize << 3);

                while (bit < (sb->s_blocksize << 3) && block_count > 0) {
                        /* Bit already clear: block in use, run ends here. */
                        if (!udf_clear_bit(bit, bh->b_data))
                                goto out;
                        block_count--;
                        alloc_count++;
                        bit++;
                        block++;
                }
                mark_buffer_dirty(bh);
        } while (block_count > 0);

out:
        /* Negated count: the u32 addition wraps, i.e. subtracts. */
        udf_add_free_space(sb, partition, -alloc_count);
        mutex_unlock(&sbi->s_alloc_mutex);
        return alloc_count;
}

/*
 * Allocate one block from the space bitmap, preferably at or near @goal.
 *
 * @sb:        superblock of the UDF filesystem
 * @bitmap:    space bitmap to allocate from
 * @partition: index of the partition in the partition map array
 * @goal:      preferred block; reset to 0 if beyond the partition length
 * @err:       out parameter: 0 on success, -ENOSPC if no free bit was found,
 *             -EIO if a bitmap block could not be loaded or the bitmap
 *             points past the end of the partition
 *
 * A set bit means "free".  Runs under sbi->s_alloc_mutex.
 *
 * Returns the allocated block number, or 0 on failure (check *err).
 */
static udf_pblk_t udf_bitmap_new_block(struct super_block *sb,
                                struct udf_bitmap *bitmap, uint16_t partition,
                                uint32_t goal, int *err)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        int newbit, bit = 0;
        udf_pblk_t block;
        int block_group, group_start;
        int end_goal, nr_groups, bitmap_nr, i;
        struct buffer_head *bh = NULL;
        char *ptr;
        udf_pblk_t newblock = 0;

        /* Default outcome until a block has actually been taken. */
        *err = -ENOSPC;
        mutex_lock(&sbi->s_alloc_mutex);

repeat:
        if (goal >= sbi->s_partmaps[partition].s_partition_len)
                goal = 0;

        nr_groups = bitmap->s_nr_groups;
        /* Bit index of the goal; the bitmap starts after the descriptor. */
        block = goal + (sizeof(struct spaceBitmapDesc) << 3);
        block_group = block >> (sb->s_blocksize_bits + 3);
        /* Only group 0 has the descriptor header to skip (in bytes). */
        group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc);

        bitmap_nr = load_block_bitmap(sb, bitmap, block_group);
        if (bitmap_nr < 0)
                goto error_return;
        bh = bitmap->s_block_bitmap[bitmap_nr];
        /* Look for a byte with all eight bits set (eight free blocks). */
        ptr = memscan((char *)bh->b_data + group_start, 0xFF,
                      sb->s_blocksize - group_start);

        if ((ptr - ((char *)bh->b_data)) < sb->s_blocksize) {
                /* The goal group has such a byte: try the goal bit first. */
                bit = block % (sb->s_blocksize << 3);
                if (udf_test_bit(bit, bh->b_data))
                        goto got_block;

                /* Then any free bit up to the next 64-bit boundary. */
                end_goal = (bit + 63) & ~63;
                bit = udf_find_next_one_bit(bh->b_data, end_goal, bit);
                if (bit < end_goal)
                        goto got_block;

                /* Then a fully free byte at or after that position. */
                ptr = memscan((char *)bh->b_data + (bit >> 3), 0xFF,
                              sb->s_blocksize - ((bit + 7) >> 3));
                newbit = (ptr - ((char *)bh->b_data)) << 3;
                if (newbit < sb->s_blocksize << 3) {
                        bit = newbit;
                        goto search_back;
                }

                /* Finally any free bit in the rest of this group. */
                newbit = udf_find_next_one_bit(bh->b_data,
                                               sb->s_blocksize << 3, bit);
                if (newbit < sb->s_blocksize << 3) {
                        bit = newbit;
                        goto got_block;
                }
        }

        /*
         * Walk the other groups, wrapping around: the first nr_groups
         * iterations look for a fully free byte, the next nr_groups
         * iterations settle for any single free bit.
         */
        for (i = 0; i < (nr_groups * 2); i++) {
                block_group++;
                if (block_group >= nr_groups)
                        block_group = 0;
                group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc);

                bitmap_nr = load_block_bitmap(sb, bitmap, block_group);
                if (bitmap_nr < 0)
                        goto error_return;
                bh = bitmap->s_block_bitmap[bitmap_nr];
                if (i < nr_groups) {
                        ptr = memscan((char *)bh->b_data + group_start, 0xFF,
                                      sb->s_blocksize - group_start);
                        if ((ptr - ((char *)bh->b_data)) < sb->s_blocksize) {
                                bit = (ptr - ((char *)bh->b_data)) << 3;
                                break;
                        }
                } else {
                        bit = udf_find_next_one_bit(bh->b_data,
                                                    sb->s_blocksize << 3,
                                                    group_start << 3);
                        if (bit < sb->s_blocksize << 3)
                                break;
                }
        }
        /* Nothing free anywhere: return 0 with *err still -ENOSPC. */
        if (i >= (nr_groups * 2)) {
                mutex_unlock(&sbi->s_alloc_mutex);
                return newblock;
        }
        if (bit < sb->s_blocksize << 3)
                goto search_back;
        else
                bit = udf_find_next_one_bit(bh->b_data, sb->s_blocksize << 3,
                                            group_start << 3);
        if (bit >= sb->s_blocksize << 3) {
                mutex_unlock(&sbi->s_alloc_mutex);
                return 0;
        }

search_back:
        /*
         * Step back over up to seven preceding free bits, without going
         * before the start of the group's usable area.
         */
        i = 0;
        while (i < 7 && bit > (group_start << 3) &&
               udf_test_bit(bit - 1, bh->b_data)) {
                ++i;
                --bit;
        }

got_block:
        /* Convert (group, bit) back into a partition-relative block. */
        newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
                (sizeof(struct spaceBitmapDesc) << 3);

        if (newblock >= sbi->s_partmaps[partition].s_partition_len) {
                /*
                 * Ran off the end of the bitmap, and bits following are
                 * non-compliant (not all zero)
                 */
                udf_err(sb, "bitmap for partition %d corrupted (block %u marked"
                        " as free, partition length is %u)\n", partition,
                        newblock, sbi->s_partmaps[partition].s_partition_len);
                goto error_return;
        }

        /* Take the block; if the bit was not set after all, start over. */
        if (!udf_clear_bit(bit, bh->b_data)) {
                udf_debug("bit already cleared for block %d\n", bit);
                goto repeat;
        }

        mark_buffer_dirty(bh);

        /* -1 as u32 wraps, i.e. the free-space counter is decremented. */
        udf_add_free_space(sb, partition, -1);
        mutex_unlock(&sbi->s_alloc_mutex);
        *err = 0;
        return newblock;

error_return:
        *err = -EIO;
        mutex_unlock(&sbi->s_alloc_mutex);
        return 0;
}

/*
 * udf_table_free_blocks - return a run of blocks to an unallocated-space table
 * @sb:     super block of the filesystem
 * @table:  inode whose allocation descriptors list the free extents
 * @bloc:   base location (partition reference and logical block number)
 * @offset: block offset from @bloc at which the freed run starts
 * @count:  number of blocks being freed
 *
 * The freed range is [bloc->logicalBlockNum + offset,
 * bloc->logicalBlockNum + offset + count - 1].  The function first tries to
 * merge that range into an existing extent of @table that ends right before
 * it or starts right after it; whatever could not be merged is appended as a
 * new extent.  Everything runs under sbi->s_alloc_mutex.
 *
 * Nothing is returned; on a failed range check the function only emits a
 * debug message and leaves the table untouched.
 */
static void udf_table_free_blocks(struct super_block *sb,
                                  struct inode *table,
                                  struct kernel_lb_addr *bloc,
                                  uint32_t offset,
                                  uint32_t count)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_part_map *partmap;
        uint32_t start, end;
        uint32_t elen;
        struct kernel_lb_addr eloc;
        struct extent_position oepos, epos;
        int8_t etype;
        struct udf_inode_info *iinfo;

        mutex_lock(&sbi->s_alloc_mutex);
        partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
        /*
         * Reject a range that wraps around 32 bits or runs past the end of
         * the partition.
         * NOTE(review): @offset is not part of this check although it is
         * added to the block number below - confirm callers bound it.
         */
        if (bloc->logicalBlockNum + count < count ||
            (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
                udf_debug("%u < %d || %u + %u > %u\n",
                          bloc->logicalBlockNum, 0,
                          bloc->logicalBlockNum, count,
                          partmap->s_partition_len);
                goto error_return;
        }

        iinfo = UDF_I(table);
        /*
         * NOTE(review): free space is credited to sbi->s_partition, not to
         * bloc->partitionReferenceNum - confirm this is intended.
         */
        udf_add_free_space(sb, sbi->s_partition, count);

        /* First and last block of the run being freed. */
        start = bloc->logicalBlockNum + offset;
        end = bloc->logicalBlockNum + offset + count - 1;

        /*
         * Both cursors start right after the unallocSpaceEntry header.
         * oepos is refreshed at the bottom of every loop iteration and is
         * the position handed to udf_write_aext() - presumably the slot of
         * the extent udf_next_aext() returns next; verify against that
         * helper.
         */
        epos.offset = oepos.offset = sizeof(struct unallocSpaceEntry);
        elen = 0;
        epos.block = oepos.block = iinfo->i_location;
        epos.bh = oepos.bh = NULL;

        while (count &&
               (etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1) {
                if (((eloc.logicalBlockNum +
                        (elen >> sb->s_blocksize_bits)) == start)) {
                        /*
                         * The extent ends exactly where the freed run
                         * begins: grow it forward.  0x3FFFFFFF is the
                         * largest value of the 30-bit length field (the
                         * type lives in the two bits above it).
                         */
                        if ((0x3FFFFFFF - elen) <
                                        (count << sb->s_blocksize_bits)) {
                                /* Only part of the run fits; keep the rest. */
                                uint32_t tmp = ((0x3FFFFFFF - elen) >>
                                                        sb->s_blocksize_bits);
                                count -= tmp;
                                start += tmp;
                                elen = (etype << 30) |
                                        (0x40000000 - sb->s_blocksize);
                        } else {
                                /* The whole run fits into this extent. */
                                elen = (etype << 30) |
                                        (elen +
                                        (count << sb->s_blocksize_bits));
                                start += count;
                                count = 0;
                        }
                        udf_write_aext(table, &oepos, &eloc, elen, 1);
                } else if (eloc.logicalBlockNum == (end + 1)) {
                        /*
                         * The extent starts right after the freed run:
                         * grow it backward by moving its start down.
                         */
                        if ((0x3FFFFFFF - elen) <
                                        (count << sb->s_blocksize_bits)) {
                                /* Only the tail of the run fits. */
                                uint32_t tmp = ((0x3FFFFFFF - elen) >>
                                                sb->s_blocksize_bits);
                                count -= tmp;
                                end -= tmp;
                                eloc.logicalBlockNum -= tmp;
                                elen = (etype << 30) |
                                        (0x40000000 - sb->s_blocksize);
                        } else {
                                /* The whole run fits in front of the extent. */
                                eloc.logicalBlockNum = start;
                                elen = (etype << 30) |
                                        (elen +
                                        (count << sb->s_blocksize_bits));
                                end -= count;
                                count = 0;
                        }
                        udf_write_aext(table, &oepos, &eloc, elen, 1);
                }

                /*
                 * Bring oepos up to date with epos.  When epos moved to a
                 * different buffer, swap the buffer reference held by oepos
                 * and restart its offset at 0.
                 */
                if (epos.bh != oepos.bh) {
                        oepos.block = epos.block;
                        brelse(oepos.bh);
                        get_bh(epos.bh);
                        oepos.bh = epos.bh;
                        oepos.offset = 0;
                } else {
                        oepos.offset = epos.offset;
                }
        }

        if (count) {
                /*
                 * NOTE: we CANNOT use udf_add_aext here, as it can try to
                 * allocate a new block, and since we hold the super block
                 * lock already very bad things would happen :)
                 *
                 * We copy the behavior of udf_add_aext, but instead of
                 * trying to allocate a new block close to the existing one,
                 * we just steal a block from the extent we are trying to add.
                 *
                 * It would be nice if the blocks were close together, but it
                 * isn't required.
                 */

                int adsize;

                /* Describe the blocks that could not be merged above. */
                eloc.logicalBlockNum = start;
                elen = EXT_RECORDED_ALLOCATED |
                        (count << sb->s_blocksize_bits);

                if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
                        adsize = sizeof(struct short_ad);
                else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
                        adsize = sizeof(struct long_ad);
                else {
                        /*
                         * Unsupported descriptor type: drop both buffer
                         * references and give up.  The free-space counter
                         * was already increased above.
                         */
                        brelse(oepos.bh);
                        brelse(epos.bh);
                        goto error_return;
                }

                /* Not enough room left for two more descriptors. */
                if (epos.offset + (2 * adsize) > sb->s_blocksize) {
                        /* Steal a block from the extent being free'd */
                        udf_setup_indirect_aext(table, eloc.logicalBlockNum,
                                                &epos);

                        eloc.logicalBlockNum++;
                        elen -= sb->s_blocksize;
                }

                /* It's possible that stealing the block emptied the extent */
                if (elen)
                        __udf_add_aext(table, &epos, &eloc, elen, 1);
        }

        brelse(epos.bh);
        brelse(oepos.bh);

error_return:
        mutex_unlock(&sbi->s_alloc_mutex);
        return;
}

/*
 * udf_table_prealloc_blocks - take blocks from the head of a free extent
 * @sb:          super block of the filesystem
 * @table:       inode whose allocation descriptors list the free extents
 * @partition:   partition the blocks belong to
 * @first_block: block number the wanted run must start at
 * @block_count: maximum number of blocks to take
 *
 * Looks for a free extent that starts exactly at @first_block.  If it is
 * longer than @block_count its start is moved forward and its length
 * reduced, otherwise the whole extent is deleted from the table.
 *
 * Returns the number of blocks taken; 0 when @first_block lies outside the
 * partition, the table uses an unsupported descriptor type, or no extent
 * starts at @first_block.
 */
static int udf_table_prealloc_blocks(struct super_block *sb,
                                     struct inode *table, uint16_t partition,
                                     uint32_t first_block, uint32_t block_count)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_inode_info *iinfo;
        struct extent_position epos;
        struct kernel_lb_addr eloc;
        uint32_t elen, adsize;
        int8_t etype = -1;
        int alloc_count = 0;

        if (first_block >= sbi->s_partmaps[partition].s_partition_len)
                return 0;

        iinfo = UDF_I(table);
        switch (iinfo->i_alloc_type) {
        case ICBTAG_FLAG_AD_SHORT:
                adsize = sizeof(struct short_ad);
                break;
        case ICBTAG_FLAG_AD_LONG:
                adsize = sizeof(struct long_ad);
                break;
        default:
                return 0;
        }

        mutex_lock(&sbi->s_alloc_mutex);

        /* Start scanning right behind the unallocSpaceEntry header. */
        epos.bh = NULL;
        epos.block = iinfo->i_location;
        epos.offset = sizeof(struct unallocSpaceEntry);
        eloc.logicalBlockNum = 0xFFFFFFFF;

        /* Walk the extents until one starts at first_block or none is left. */
        for (;;) {
                if (first_block == eloc.logicalBlockNum)
                        break;
                etype = udf_next_aext(table, &epos, &eloc, &elen, 1);
                if (etype == -1)
                        break;
                udf_debug("eloc=%u, elen=%u, first_block=%u\n",
                          eloc.logicalBlockNum, elen, first_block);
        }

        if (first_block == eloc.logicalBlockNum) {
                /* Step back onto the descriptor that was just read. */
                epos.offset -= adsize;

                alloc_count = (elen >> sb->s_blocksize_bits);
                if (alloc_count <= block_count) {
                        /* The whole extent is consumed. */
                        udf_delete_aext(table, epos);
                } else {
                        /* Take only the head and shrink the extent. */
                        alloc_count = block_count;
                        eloc.logicalBlockNum += alloc_count;
                        elen -= (alloc_count << sb->s_blocksize_bits);
                        udf_write_aext(table, &epos, &eloc,
                                        (etype << 30) | elen, 1);
                }
        }

        brelse(epos.bh);

        if (alloc_count)
                udf_add_free_space(sb, partition, -alloc_count);
        mutex_unlock(&sbi->s_alloc_mutex);
        return alloc_count;
}

/*
 * udf_table_new_block - allocate one block from an unallocated-space table
 * @sb:        super block of the filesystem
 * @table:     inode whose allocation descriptors list the free extents
 * @partition: partition to allocate from
 * @goal:      preferred block number; reset to 0 if outside the partition
 * @err:       set to 0 on success, left at -ENOSPC otherwise
 *
 * Scans the free extents for the one closest to @goal and hands out its
 * first block, shrinking the extent by one block or deleting it when it
 * becomes empty.
 *
 * Returns the allocated block number, or 0 on failure.
 */
static udf_pblk_t udf_table_new_block(struct super_block *sb,
                               struct inode *table, uint16_t partition,
                               uint32_t goal, int *err)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_inode_info *iinfo = UDF_I(table);
        struct extent_position epos, goal_epos;
        struct kernel_lb_addr eloc, goal_eloc;
        uint32_t elen, goal_elen = 0;
        uint32_t spread = 0xFFFFFFFF;
        uint32_t adsize;
        udf_pblk_t newblock;
        int8_t etype;

        *err = -ENOSPC;

        switch (iinfo->i_alloc_type) {
        case ICBTAG_FLAG_AD_SHORT:
                adsize = sizeof(struct short_ad);
                break;
        case ICBTAG_FLAG_AD_LONG:
                adsize = sizeof(struct long_ad);
                break;
        default:
                return 0;
        }

        mutex_lock(&sbi->s_alloc_mutex);
        if (goal >= sbi->s_partmaps[partition].s_partition_len)
                goal = 0;

        /*
         * Look for the extent closest to goal.  An exact hit (spread of 0)
         * ends the scan early; otherwise every extent is visited.  The
         * buffer, block and offset of the best candidate so far are kept in
         * goal_epos and used once the scan is over.
         */
        epos.bh = NULL;
        goal_epos.bh = NULL;
        epos.block = iinfo->i_location;
        epos.offset = sizeof(struct unallocSpaceEntry);

        while (spread != 0) {
                uint32_t ext_first, ext_blocks, nspread;

                etype = udf_next_aext(table, &epos, &eloc, &elen, 1);
                if (etype == -1)
                        break;

                ext_first = eloc.logicalBlockNum;
                ext_blocks = elen >> sb->s_blocksize_bits;

                /* Distance between goal and this extent, 0 if inside it. */
                if (goal < ext_first)
                        nspread = ext_first - goal;
                else if (goal < ext_first + ext_blocks)
                        nspread = 0;
                else
                        nspread = goal - ext_first - ext_blocks;

                if (nspread >= spread)
                        continue;

                /* New best candidate: remember where it lives. */
                spread = nspread;
                if (goal_epos.bh != epos.bh) {
                        brelse(goal_epos.bh);
                        goal_epos.bh = epos.bh;
                        get_bh(goal_epos.bh);
                }
                goal_epos.block = epos.block;
                goal_epos.offset = epos.offset - adsize;
                goal_eloc = eloc;
                goal_elen = (etype << 30) | elen;
        }

        brelse(epos.bh);

        /* spread never changed, so the table holds no extent at all. */
        if (spread == 0xFFFFFFFF) {
                brelse(goal_epos.bh);
                mutex_unlock(&sbi->s_alloc_mutex);
                return 0;
        }

        /*
         * Only allocate from the beginning of the extent.  That way extents
         * are only ever shrunk or deleted, never split.
         * This works, but very poorly....
         */
        newblock = goal_eloc.logicalBlockNum++;
        goal_elen -= sb->s_blocksize;

        if (!goal_elen)
                udf_delete_aext(table, goal_epos);
        else
                udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
        brelse(goal_epos.bh);

        udf_add_free_space(sb, partition, -1);

        mutex_unlock(&sbi->s_alloc_mutex);
        *err = 0;
        return newblock;
}

/*
 * udf_free_blocks - release blocks back to the partition's free-space map
 * @sb:     super block of the filesystem
 * @inode:  inode the blocks belonged to, or NULL
 * @bloc:   base location (partition reference and logical block number)
 * @offset: block offset from @bloc at which the freed run starts
 * @count:  number of blocks to free
 *
 * Dispatches to the bitmap or table implementation depending on the
 * partition flags; a partition with neither flag is left alone.  When
 * @inode is given, its byte count is reduced by @count blocks regardless
 * of which branch ran.
 */
void udf_free_blocks(struct super_block *sb, struct inode *inode,
                     struct kernel_lb_addr *bloc, uint32_t offset,
                     uint32_t count)
{
        uint16_t part = bloc->partitionReferenceNum;
        struct udf_part_map *pmap = &UDF_SB(sb)->s_partmaps[part];

        if (pmap->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
                udf_bitmap_free_blocks(sb, pmap->s_uspace.s_bitmap,
                                       bloc, offset, count);
        else if (pmap->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
                udf_table_free_blocks(sb, pmap->s_uspace.s_table,
                                      bloc, offset, count);

        if (!inode)
                return;

        /* Widen before shifting so the byte count cannot overflow 32 bits. */
        inode_sub_bytes(inode, ((sector_t)count) << sb->s_blocksize_bits);
}

/*
 * udf_prealloc_blocks - preallocate a run of blocks starting at a given block
 * @sb:          super block of the filesystem
 * @inode:       inode to charge the blocks to, or NULL
 * @partition:   partition to allocate from
 * @first_block: first block of the wanted run
 * @block_count: maximum number of blocks to allocate
 *
 * Dispatches to the bitmap or table implementation depending on the
 * partition flags.  When @inode is given and blocks were obtained, its byte
 * count is increased accordingly.
 *
 * Returns the value reported by the implementation (number of blocks
 * allocated), or 0 when the partition has neither kind of free-space map.
 */
inline int udf_prealloc_blocks(struct super_block *sb,
                               struct inode *inode,
                               uint16_t partition, uint32_t first_block,
                               uint32_t block_count)
{
        struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
        int allocated;

        if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
                allocated = udf_bitmap_prealloc_blocks(sb,
                                                       map->s_uspace.s_bitmap,
                                                       partition, first_block,
                                                       block_count);
        else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
                allocated = udf_table_prealloc_blocks(sb,
                                                      map->s_uspace.s_table,
                                                      partition, first_block,
                                                      block_count);
        else
                return 0;

        /*
         * Widen before shifting, as udf_free_blocks() does: shifting the
         * plain int could overflow for a large block count.  allocated is
         * known to be positive here, so the cast cannot change its value.
         */
        if (inode && allocated > 0)
                inode_add_bytes(inode,
                                ((sector_t)allocated) << sb->s_blocksize_bits);
        return allocated;
}

/*
 * udf_new_block - allocate a single block near a goal block
 * @sb:        super block of the filesystem
 * @inode:     inode to charge the block to, or NULL
 * @partition: partition to allocate from
 * @goal:      preferred block number
 * @err:       receives the error code; -EIO when the partition has neither
 *             a free-space bitmap nor a free-space table, otherwise whatever
 *             the chosen implementation stores
 *
 * Returns the allocated block number, or 0 if nothing was allocated.  On
 * success with a non-NULL @inode, one block size is added to the inode's
 * byte count.
 */
inline udf_pblk_t udf_new_block(struct super_block *sb,
                         struct inode *inode,
                         uint16_t partition, uint32_t goal, int *err)
{
        struct udf_part_map *pmap = &UDF_SB(sb)->s_partmaps[partition];
        udf_pblk_t blk;

        if (pmap->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
                blk = udf_bitmap_new_block(sb, pmap->s_uspace.s_bitmap,
                                           partition, goal, err);
        } else if (pmap->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
                blk = udf_table_new_block(sb, pmap->s_uspace.s_table,
                                          partition, goal, err);
        } else {
                /* No free-space map to allocate from. */
                *err = -EIO;
                return 0;
        }

        if (blk && inode)
                inode_add_bytes(inode, sb->s_blocksize);
        return blk;
}
















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BLK_INTEGRITY_H
#define _LINUX_BLK_INTEGRITY_H

#include <linux/blk-mq.h>

struct request;

/*
 * Integrity flag bits.  Each enumerator occupies its own bit (bits 0-3), so
 * the values can be OR-ed together into one flags word.
 * NOTE(review): the meaning of each flag is inferred from its name only -
 * confirm against the code that sets and tests them.
 */
enum blk_integrity_flags {
        BLK_INTEGRITY_VERIFY                = 1 << 0,
        BLK_INTEGRITY_GENERATE                = 1 << 1,
        BLK_INTEGRITY_DEVICE_CAPABLE        = 1 << 2,
        BLK_INTEGRITY_IP_CHECKSUM        = 1 << 3,
};

/*
 * Argument block handed to an integrity_processing_fn callback (see the
 * typedef below, which takes a pointer to this structure).
 * NOTE(review): field roles are not visible in this header; presumably
 * prot_buf/data_buf point at the protection and data buffers, seed is the
 * starting sector, and interval/tuple_size/pi_offset describe the layout -
 * confirm against the profile implementations.
 */
struct blk_integrity_iter {
        void                        *prot_buf;
        void                        *data_buf;
        sector_t                seed;
        unsigned int                data_size;
        unsigned short                interval;
        unsigned char                tuple_size;
        unsigned char                pi_offset;
        const char                *disk_name;
};

/*
 * Callback signatures used by struct blk_integrity_profile:
 *  - integrity_processing_fn: takes a blk_integrity_iter and returns a
 *    blk_status_t (used for both generate_fn and verify_fn);
 *  - integrity_prepare_fn: takes the request, returns nothing;
 *  - integrity_complete_fn: takes the request and an unsigned int, returns
 *    nothing (presumably a byte count - TODO confirm).
 */
typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *);
typedef void (integrity_prepare_fn) (struct request *);
typedef void (integrity_complete_fn) (struct request *, unsigned int);

/*
 * A named set of integrity callbacks.  The inline helpers in this header
 * treat a non-NULL profile pointer in struct blk_integrity as "integrity is
 * configured".
 */
struct blk_integrity_profile {
        integrity_processing_fn                *generate_fn;
        integrity_processing_fn                *verify_fn;
        integrity_prepare_fn                *prepare_fn;
        integrity_complete_fn                *complete_fn;
        const char                        *name;
};

#ifdef CONFIG_BLK_DEV_INTEGRITY
/*
 * Out-of-line integrity API, declared only when CONFIG_BLK_DEV_INTEGRITY is
 * enabled.  The #else branch of this header supplies inline stubs with the
 * same names that do nothing and return 0.
 */
void blk_integrity_register(struct gendisk *, struct blk_integrity *);
void blk_integrity_unregister(struct gendisk *);
int blk_integrity_compare(struct gendisk *, struct gendisk *);
int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
                                   struct scatterlist *);
int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);

/*
 * Return the integrity descriptor embedded in @disk's request queue, or
 * NULL when no profile has been set on it.
 */
static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
{
        struct blk_integrity *integrity = &disk->queue->integrity;

        return integrity->profile ? integrity : NULL;
}

/*
 * Same as blk_get_integrity(), but starting from a block device: looks up
 * the integrity descriptor of the disk @bdev belongs to.
 */
static inline struct blk_integrity *
bdev_get_integrity(struct block_device *bdev)
{
        struct gendisk *disk = bdev->bd_disk;

        return blk_get_integrity(disk);
}

/* True when an integrity profile is set on request queue @q. */
static inline bool
blk_integrity_queue_supports_integrity(struct request_queue *q)
{
        return q->integrity.profile != NULL;
}

/* Return the max_integrity_segments limit recorded for request queue @q. */
static inline unsigned short
queue_max_integrity_segments(const struct request_queue *q)
{
        unsigned short nr_segs = q->limits.max_integrity_segments;

        return nr_segs;
}

/**
 * bio_integrity_intervals - Return number of integrity intervals for a bio
 * @bi:                blk_integrity profile for device
 * @sectors:        Size of the bio in 512-byte sectors
 *
 * Description: The block layer counts in 512-byte sectors, whereas
 * integrity metadata is laid out per data integrity interval of the
 * storage device.  bi->interval_exp is the log2 of that interval size, so
 * shifting by (interval_exp - 9) converts a sector count into a count of
 * integrity intervals.
 */
static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
                                                   unsigned int sectors)
{
        /* 9 == log2(512): sectors per interval expressed as a shift */
        int shift = bi->interval_exp - 9;

        return sectors >> shift;
}

/*
 * Return the number of integrity metadata bytes covering @sectors 512-byte
 * sectors: one tuple of bi->tuple_size bytes per integrity interval.
 */
static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
                                               unsigned int sectors)
{
        unsigned int nr_intervals = bio_integrity_intervals(bi, sectors);

        return nr_intervals * bi->tuple_size;
}

/* True when request @rq has the REQ_INTEGRITY flag set in its cmd_flags. */
static inline bool blk_integrity_rq(struct request *rq)
{
        return (rq->cmd_flags & REQ_INTEGRITY) != 0;
}

/*
 * Return the first bvec of the request's integrity payload.  This helper is
 * meant for drivers restricted to a single integrity segment: if the queue
 * allows more than one, it warns once and returns NULL.
 */
static inline struct bio_vec *rq_integrity_vec(struct request *rq)
{
        if (!WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1))
                return rq->bio->bi_integrity->bip_vec;

        return NULL;
}
#else /* CONFIG_BLK_DEV_INTEGRITY */
/* Stub for !CONFIG_BLK_DEV_INTEGRITY: there are never integrity segments. */
static inline int blk_rq_count_integrity_sg(struct request_queue *queue,
                                            struct bio *bio)
{
        return 0;
}
/* Stub for !CONFIG_BLK_DEV_INTEGRITY: maps nothing and reports 0 entries. */
static inline int blk_rq_map_integrity_sg(struct request_queue *queue,
                                          struct bio *bio,
                                          struct scatterlist *sglist)
{
        return 0;
}
/* Stub for !CONFIG_BLK_DEV_INTEGRITY: no block device has integrity data. */
static inline struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
{
        return NULL;
}
/* Stub for !CONFIG_BLK_DEV_INTEGRITY: no disk has an integrity descriptor. */
static inline struct blk_integrity *blk_get_integrity(struct gendisk *gd)
{
        return NULL;
}
/* Stub for !CONFIG_BLK_DEV_INTEGRITY: no queue supports integrity. */
static inline bool
blk_integrity_queue_supports_integrity(struct request_queue *queue)
{
        return false;
}
/* Stub for !CONFIG_BLK_DEV_INTEGRITY: always returns 0 for any two disks. */
static inline int blk_integrity_compare(struct gendisk *gd1,
                                        struct gendisk *gd2)
{
        return 0;
}
/* Stub for !CONFIG_BLK_DEV_INTEGRITY: registration is a no-op. */
static inline void blk_integrity_register(struct gendisk *gd,
                                         struct blk_integrity *template)
{
        /* nothing to register */
}
/* Stub for !CONFIG_BLK_DEV_INTEGRITY: unregistration is a no-op. */
static inline void blk_integrity_unregister(struct gendisk *gd)
{
        /* nothing to unregister */
}
/* Stub for !CONFIG_BLK_DEV_INTEGRITY: a queue has no integrity segments. */
static inline unsigned short
queue_max_integrity_segments(const struct request_queue *queue)
{
        return 0;
}

/* Stub for !CONFIG_BLK_DEV_INTEGRITY: no integrity intervals exist. */
static inline unsigned int bio_integrity_intervals(struct blk_integrity *integrity,
                                                   unsigned int nr_sectors)
{
        return 0;
}

/* Stub for !CONFIG_BLK_DEV_INTEGRITY: integrity metadata occupies 0 bytes. */
static inline unsigned int bio_integrity_bytes(struct blk_integrity *integrity,
                                               unsigned int nr_sectors)
{
        return 0;
}
/*
 * Stub for !CONFIG_BLK_DEV_INTEGRITY: a request never carries integrity
 * data.  Returns bool so the signature matches the variant built when
 * CONFIG_BLK_DEV_INTEGRITY is enabled.
 */
static inline bool blk_integrity_rq(struct request *rq)
{
        return false;
}

/* Stub for !CONFIG_BLK_DEV_INTEGRITY: a request has no integrity bvec. */
static inline struct bio_vec *rq_integrity_vec(struct request *req)
{
        return NULL;
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
#endif /* _LINUX_BLK_INTEGRITY_H */










































































    1 











    1 


















































    1 







    1 

    1 













    1 

    1 






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    1 

    1 




























    1 


    1 






















    1 




























































    1 






    1 









































    1 

















    1 




































    1 






    1 

    1 











































    1 





















    1 












    1 










    1 



    1 















































    1 









































































    1 
    1 





    1 



















    1 





















    1 




    1 








    1 





    1 




    1 
























    1 


































    1 








    1 
    1 
    1 





















    1 













    1 

































    1 




    1 














    1 
















    1 































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
// SPDX-License-Identifier: GPL-2.0-only
/*
 * inode.c
 *
 * PURPOSE
 *  Inode handling routines for the OSTA-UDF(tm) filesystem.
 *
 * COPYRIGHT
 *  (C) 1998 Dave Boynton
 *  (C) 1998-2004 Ben Fennema
 *  (C) 1999-2000 Stelias Computing Inc
 *
 * HISTORY
 *
 *  10/04/98 dgb  Added rudimentary directory functions
 *  10/07/98      Fully working udf_block_map! It works!
 *  11/25/98      bmap altered to better support extents
 *  12/06/98 blf  partition support in udf_iget, udf_block_map
 *                and udf_read_inode
 *  12/12/98      rewrote udf_block_map to handle next extents and descs across
 *                block boundaries (which is not actually allowed)
 *  12/20/98      added support for strategy 4096
 *  03/07/99      rewrote udf_block_map (again)
 *                New funcs, inode_bmap, udf_next_aext
 *  04/19/99      Support for writing device EA's for major/minor #
 */

#include "udfdecl.h"
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/crc-itu-t.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/bio.h>

#include "udf_i.h"
#include "udf_sb.h"

/* Number of entries in the extent array (laarr) that inode_getblk() works on */
#define EXTENT_MERGE_SIZE 5

/*
 * Union of the read, write and execute permission bits for user, group and
 * other in a file entry.
 */
#define FE_MAPPED_PERMS        (FE_PERM_U_READ | FE_PERM_U_WRITE | FE_PERM_U_EXEC | \
                         FE_PERM_G_READ | FE_PERM_G_WRITE | FE_PERM_G_EXEC | \
                         FE_PERM_O_READ | FE_PERM_O_WRITE | FE_PERM_O_EXEC)

/* Union of the delete permission bits for user, group and other */
#define FE_DELETE_PERMS        (FE_PERM_U_DELETE | FE_PERM_G_DELETE | \
                         FE_PERM_O_DELETE)

/* Block mapping request; the full definition follows further down */
struct udf_map_rq;

/* Forward declarations of file-local helpers used before their definitions */
static umode_t udf_convert_permissions(struct fileEntry *);
static int udf_update_inode(struct inode *, int);
static int udf_sync_inode(struct inode *inode);
static int udf_alloc_i_data(struct inode *inode, size_t size);
static int inode_getblk(struct inode *inode, struct udf_map_rq *map);
static int udf_insert_aext(struct inode *, struct extent_position,
                           struct kernel_lb_addr, uint32_t);
static void udf_split_extents(struct inode *, int *, int, udf_pblk_t,
                              struct kernel_long_ad *, int *);
static void udf_prealloc_extents(struct inode *, int, int,
                                 struct kernel_long_ad *, int *);
static void udf_merge_extents(struct inode *, struct kernel_long_ad *, int *);
static int udf_update_extents(struct inode *, struct kernel_long_ad *, int,
                              int, struct extent_position *);
static int udf_get_block_wb(struct inode *inode, sector_t block,
                            struct buffer_head *bh_result, int create);

/*
 * Drop the cached extent, if there is one, and release the buffer head
 * reference stored with it. A cached lstart of -1 marks the cache as empty.
 * Both callers visible in this file take i_extent_cache_lock around the call.
 */
static void __udf_clear_extent_cache(struct inode *inode)
{
        struct udf_inode_info *info = UDF_I(inode);

        /* Nothing cached, nothing to release */
        if (info->cached_extent.lstart == -1)
                return;

        brelse(info->cached_extent.epos.bh);
        info->cached_extent.lstart = -1;
}

/* Invalidate extent cache */
static void udf_clear_extent_cache(struct inode *inode)
{
        struct udf_inode_info *iinfo = UDF_I(inode);

        spin_lock(&iinfo->i_extent_cache_lock);
        __udf_clear_extent_cache(inode);
        spin_unlock(&iinfo->i_extent_cache_lock);
}

/*
 * Look up the extent cache.
 *
 * The cache hits when it is populated (lstart != -1) and the cached extent
 * starts at or before @bcount. On a hit the cached start is stored in
 * @lbcount, the cached position is copied into @pos and an extra reference
 * is taken on the position's buffer head (if any); the caller owns that
 * reference.
 *
 * Returns 1 on a cache hit, 0 otherwise.
 */
static int udf_read_extent_cache(struct inode *inode, loff_t bcount,
                                 loff_t *lbcount, struct extent_position *pos)
{
        struct udf_inode_info *info = UDF_I(inode);
        int hit = 0;

        spin_lock(&info->i_extent_cache_lock);
        if (info->cached_extent.lstart != -1 &&
            info->cached_extent.lstart <= bcount) {
                *lbcount = info->cached_extent.lstart;
                memcpy(pos, &info->cached_extent.epos, sizeof(*pos));
                if (pos->bh)
                        get_bh(pos->bh);
                hit = 1;
        }
        spin_unlock(&info->i_extent_cache_lock);

        return hit;
}

/*
 * Store an extent in the extent cache.
 *
 * Any previously cached extent is dropped first. The cache takes its own
 * reference on @pos->bh (if set) and remembers @estart as the extent start.
 * For short and long allocation descriptors the cached offset is moved back
 * by one descriptor (presumably because @pos->offset already points past the
 * descriptor of this extent - confirm against callers).
 */
static void udf_update_extent_cache(struct inode *inode, loff_t estart,
                                    struct extent_position *pos)
{
        struct udf_inode_info *info = UDF_I(inode);

        spin_lock(&info->i_extent_cache_lock);

        /* Release whatever was cached before */
        __udf_clear_extent_cache(inode);

        if (pos->bh)
                get_bh(pos->bh);
        memcpy(&info->cached_extent.epos, pos, sizeof(*pos));
        info->cached_extent.lstart = estart;

        if (info->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
                info->cached_extent.epos.offset -= sizeof(struct short_ad);
        else if (info->i_alloc_type == ICBTAG_FLAG_AD_LONG)
                info->cached_extent.epos.offset -= sizeof(struct long_ad);

        spin_unlock(&info->i_extent_cache_lock);
}

void udf_evict_inode(struct inode *inode)
{
        struct udf_inode_info *iinfo = UDF_I(inode);
        int want_delete = 0;

        if (!is_bad_inode(inode)) {
                if (!inode->i_nlink) {
                        want_delete = 1;
                        udf_setsize(inode, 0);
                        udf_update_inode(inode, IS_SYNC(inode));
                }
                if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
                    inode->i_size != iinfo->i_lenExtents) {
                        udf_warn(inode->i_sb,
                                 "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n",
                                 inode->i_ino, inode->i_mode,
                                 (unsigned long long)inode->i_size,
                                 (unsigned long long)iinfo->i_lenExtents);
                }
        }
        truncate_inode_pages_final(&inode->i_data);
        invalidate_inode_buffers(inode);
        clear_inode(inode);
        kfree(iinfo->i_data);
        iinfo->i_data = NULL;
        udf_clear_extent_cache(inode);
        if (want_delete) {
                udf_free_inode(inode);
        }
}

/*
 * Clean up after a failed write that was supposed to reach offset @to.
 *
 * If the write would have gone past i_size, the page cache beyond i_size is
 * dropped and, for files not stored in the ICB, the extents are truncated
 * under i_data_sem.
 */
static void udf_write_failed(struct address_space *mapping, loff_t to)
{
        struct inode *inode = mapping->host;
        struct udf_inode_info *info = UDF_I(inode);
        loff_t size = inode->i_size;

        /* The write stayed within the file: nothing to undo */
        if (to <= size)
                return;

        truncate_pagecache(inode, size);

        /* In-ICB files have no extents to trim */
        if (info->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
                return;

        down_write(&info->i_data_sem);
        udf_clear_extent_cache(inode);
        udf_truncate_extents(inode);
        up_write(&info->i_data_sem);
}

static int udf_adinicb_writepage(struct folio *folio,
                                 struct writeback_control *wbc, void *data)
{
        struct inode *inode = folio->mapping->host;
        struct udf_inode_info *iinfo = UDF_I(inode);

        BUG_ON(!folio_test_locked(folio));
        BUG_ON(folio->index != 0);
        memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, 0,
                       i_size_read(inode));
        folio_unlock(folio);
        mark_inode_dirty(inode);

        return 0;
}

static int udf_writepages(struct address_space *mapping,
                          struct writeback_control *wbc)
{
        struct inode *inode = mapping->host;
        struct udf_inode_info *iinfo = UDF_I(inode);

        if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB)
                return mpage_writepages(mapping, wbc, udf_get_block_wb);
        return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL);
}

/*
 * Fill @folio from the in-ICB file data: i_size bytes taken from the inode
 * data area at offset i_lenEAttr are placed at the start of the folio via
 * folio_fill_tail(), and the folio is marked uptodate. The folio is not
 * unlocked here.
 */
static void udf_adinicb_read_folio(struct folio *folio)
{
        struct inode *inode = folio->mapping->host;
        struct udf_inode_info *info = UDF_I(inode);

        folio_fill_tail(folio, 0, info->i_data + info->i_lenEAttr,
                        i_size_read(inode));
        folio_mark_uptodate(folio);
}

static int udf_read_folio(struct file *file, struct folio *folio)
{
        struct udf_inode_info *iinfo = UDF_I(file_inode(file));

        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                udf_adinicb_read_folio(folio);
                folio_unlock(folio);
                return 0;
        }
        return mpage_read_folio(folio, udf_get_block);
}

static void udf_readahead(struct readahead_control *rac)
{
        struct udf_inode_info *iinfo = UDF_I(rac->mapping->host);

        /*
         * No readahead needed for in-ICB files and udf_get_block() would get
         * confused for such file anyway.
         */
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
                return;

        mpage_readahead(rac, udf_get_block);
}

/*
 * ->write_begin
 *
 * In-ICB files: the write must start within the first page (otherwise warn
 * once and return -EIO); folio 0 is obtained with FGP_WRITEBEGIN, filled from
 * the inode data if it is not uptodate, and returned through @pagep.
 *
 * Other files: defer to block_write_begin(); on failure undo any effects of
 * the attempted write beyond i_size.
 *
 * Returns 0 on success or a negative errno.
 */
static int udf_write_begin(struct file *file, struct address_space *mapping,
                           loff_t pos, unsigned len,
                           struct page **pagep, void **fsdata)
{
        struct udf_inode_info *info = UDF_I(file_inode(file));
        struct folio *icb_folio;
        int err;

        if (info->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                if (WARN_ON_ONCE(pos >= PAGE_SIZE))
                        return -EIO;

                icb_folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN,
                                                mapping_gfp_mask(mapping));
                if (IS_ERR(icb_folio))
                        return PTR_ERR(icb_folio);

                *pagep = &icb_folio->page;
                if (!folio_test_uptodate(icb_folio))
                        udf_adinicb_read_folio(icb_folio);
                return 0;
        }

        err = block_write_begin(mapping, pos, len, pagep, udf_get_block);
        if (unlikely(err))
                udf_write_failed(mapping, pos + len);

        return err;
}

/*
 * ->write_end
 *
 * Files with extents use generic_write_end(). For in-ICB files i_size is
 * grown if the copied data extends past it, then the folio is dirtied,
 * unlocked and released. Returns the number of bytes copied.
 */
static int udf_write_end(struct file *file, struct address_space *mapping,
                         loff_t pos, unsigned len, unsigned copied,
                         struct page *page, void *fsdata)
{
        struct inode *inode = file_inode(file);
        struct folio *icb_folio;
        loff_t end;

        if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB)
                return generic_write_end(file, mapping, pos, len, copied, page,
                                         fsdata);

        end = pos + copied;
        if (inode->i_size < end)
                i_size_write(inode, end);

        icb_folio = page_folio(page);
        folio_mark_dirty(icb_folio);
        folio_unlock(icb_folio);
        folio_put(icb_folio);

        return copied;
}

/*
 * ->direct_IO
 *
 * In-ICB files return 0 so that the caller falls back to buffered I/O.
 * Otherwise the request goes to blockdev_direct_IO(); if a write fails, any
 * effects of it beyond i_size are undone. The byte count is sampled before
 * the I/O is issued.
 */
static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        struct inode *inode = mapping->host;
        size_t nbytes = iov_iter_count(iter);
        ssize_t done;

        /* Fallback to buffered IO for in-ICB files */
        if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
                return 0;

        done = blockdev_direct_IO(iocb, inode, iter, udf_get_block);
        if (unlikely(done < 0 && iov_iter_rw(iter) == WRITE))
                udf_write_failed(mapping, iocb->ki_pos + nbytes);

        return done;
}

/*
 * ->bmap: map a file block to a device block via generic_block_bmap().
 * In-ICB files have no block mapping; for them -EINVAL is returned through
 * the sector_t return type.
 */
static sector_t udf_bmap(struct address_space *mapping, sector_t block)
{
        struct inode *inode = mapping->host;

        if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
                return -EINVAL;

        return generic_block_bmap(mapping, block, udf_get_block);
}

/* Address space operations for UDF files */
const struct address_space_operations udf_aops = {
        /* read side */
        .read_folio = udf_read_folio,
        .readahead = udf_readahead,

        /* write side */
        .write_begin = udf_write_begin,
        .write_end = udf_write_end,
        .writepages = udf_writepages,
        .direct_IO = udf_direct_IO,

        /* block mapping query */
        .bmap = udf_bmap,

        /* generic buffer-head based folio handling */
        .dirty_folio = block_dirty_folio,
        .invalidate_folio = block_invalidate_folio,
        .migrate_folio = buffer_migrate_folio,
};

/*
 * Expand file stored in ICB to a normal one-block-file
 *
 * Switches the inode's allocation type from in-ICB to short or long
 * allocation descriptors, depending on the UDF_FLAG_USE_SHORT_AD flag of the
 * superblock. If the inode holds in-ICB data, that data is first brought
 * into folio 0 of the page cache, the in-ICB copy is zeroed, and the folio is
 * written out. Should the writeout fail, the data is copied back from the
 * folio and the inode is returned to the in-ICB state.
 *
 * This function requires the inode lock to be held (checked below with
 * inode_is_locked()).
 *
 * Returns 0 on success, or a negative errno from __filemap_get_folio() or
 * filemap_fdatawrite().
 */
int udf_expand_file_adinicb(struct inode *inode)
{
        struct folio *folio;
        struct udf_inode_info *iinfo = UDF_I(inode);
        int err;

        WARN_ON_ONCE(!inode_is_locked(inode));
        /* No data stored in the ICB: only the allocation type has to change */
        if (!iinfo->i_lenAlloc) {
                down_write(&iinfo->i_data_sem);
                if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
                        iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
                else
                        iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
                up_write(&iinfo->i_data_sem);
                mark_inode_dirty(inode);
                return 0;
        }

        /* Get (or create) folio 0, locked, to hold the file data */
        folio = __filemap_get_folio(inode->i_mapping, 0,
                        FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_KERNEL);
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        /* Make sure the folio carries the in-ICB data before wiping it */
        if (!folio_test_uptodate(folio))
                udf_adinicb_read_folio(folio);
        down_write(&iinfo->i_data_sem);
        /* Clear the in-ICB data area; it is about to be reused for extents */
        memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00,
               iinfo->i_lenAlloc);
        iinfo->i_lenAlloc = 0;
        if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
                iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
        else
                iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
        folio_mark_dirty(folio);
        folio_unlock(folio);
        up_write(&iinfo->i_data_sem);
        /* Start writeout of the dirtied folio through the regular path */
        err = filemap_fdatawrite(inode->i_mapping);
        if (err) {
                /* Restore everything back so that we don't lose data... */
                folio_lock(folio);
                down_write(&iinfo->i_data_sem);
                memcpy_from_folio(iinfo->i_data + iinfo->i_lenEAttr,
                                folio, 0, inode->i_size);
                folio_unlock(folio);
                iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
                iinfo->i_lenAlloc = inode->i_size;
                up_write(&iinfo->i_data_sem);
        }
        /* Drop the reference taken by __filemap_get_folio() */
        folio_put(folio);
        mark_inode_dirty(inode);

        return err;
}

/* Input flags for struct udf_map_rq (iflags) */
#define UDF_MAP_CREATE                0x01        /* Mapping can allocate new blocks */
#define UDF_MAP_NOPREALLOC        0x02        /* Do not preallocate blocks */

/* Output flags for struct udf_map_rq (oflags) */
#define UDF_BLK_MAPPED        0x01        /* Block was successfully mapped */
#define UDF_BLK_NEW        0x02        /* Block was freshly allocated */

/*
 * Block mapping request handed to udf_map_block(): the caller fills in lblk
 * and iflags; pblk and oflags carry the result.
 */
struct udf_map_rq {
        /* Logical block within the file that should be mapped */
        sector_t lblk;
        /* Resulting physical block; meaningful when UDF_BLK_MAPPED is set */
        udf_pblk_t pblk;
        int iflags;                /* UDF_MAP_ flags determining behavior */
        int oflags;                /* UDF_BLK_ flags reporting results */
};

static int udf_map_block(struct inode *inode, struct udf_map_rq *map)
{
        int err;
        struct udf_inode_info *iinfo = UDF_I(inode);

        if (WARN_ON_ONCE(iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB))
                return -EFSCORRUPTED;

        map->oflags = 0;
        if (!(map->iflags & UDF_MAP_CREATE)) {
                struct kernel_lb_addr eloc;
                uint32_t elen;
                sector_t offset;
                struct extent_position epos = {};

                down_read(&iinfo->i_data_sem);
                if (inode_bmap(inode, map->lblk, &epos, &eloc, &elen, &offset)
                                == (EXT_RECORDED_ALLOCATED >> 30)) {
                        map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc,
                                                        offset);
                        map->oflags |= UDF_BLK_MAPPED;
                }
                up_read(&iinfo->i_data_sem);
                brelse(epos.bh);

                return 0;
        }

        down_write(&iinfo->i_data_sem);
        /*
         * Block beyond EOF and prealloc extents? Just discard preallocation
         * as it is not useful and complicates things.
         */
        if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents)
                udf_discard_prealloc(inode);
        udf_clear_extent_cache(inode);
        err = inode_getblk(inode, map);
        up_write(&iinfo->i_data_sem);
        return err;
}

/*
 * Common get_block implementation: map @block of @inode using the UDF_MAP_
 * @flags and, when a mapping exists, record it in @bh_result (flagging the
 * buffer as new for freshly allocated blocks). An unmapped block is not an
 * error: @bh_result is left untouched and 0 is returned.
 *
 * Returns 0 on success or a negative errno from udf_map_block().
 */
static int __udf_get_block(struct inode *inode, sector_t block,
                           struct buffer_head *bh_result, int flags)
{
        struct udf_map_rq rq = {
                .lblk = block,
                .iflags = flags,
        };
        int err = udf_map_block(inode, &rq);

        if (err < 0)
                return err;
        if (!(rq.oflags & UDF_BLK_MAPPED))
                return 0;

        map_bh(bh_result, inode->i_sb, rq.pblk);
        if (rq.oflags & UDF_BLK_NEW)
                set_buffer_new(bh_result);

        return 0;
}

/*
 * get_block callback for UDF. A non-zero @create allows block allocation.
 * Returns 0 on success or a negative errno.
 */
int udf_get_block(struct inode *inode, sector_t block,
                  struct buffer_head *bh_result, int create)
{
        int flags = 0;

        if (create)
                flags |= UDF_MAP_CREATE;

        /*
         * We preallocate blocks only for regular files. It also makes sense
         * for directories but there's a problem when to drop the
         * preallocation. We might use some delayed work for that but I feel
         * it's overengineering for a filesystem like UDF.
         */
        if (!S_ISREG(inode->i_mode))
                flags |= UDF_MAP_NOPREALLOC;

        return __udf_get_block(inode, block, bh_result, flags);
}

/*
 * get_block callback used for page writeback.
 *
 * We shouldn't be allocating blocks on page writeback since we allocate them
 * on page fault. We can spot dirty buffers without allocated blocks though
 * when truncate expands file. These however don't have valid data so we can
 * safely ignore them. So never allocate blocks from page writeback.
 */
static int udf_get_block_wb(struct inode *inode, sector_t block,
                            struct buffer_head *bh_result, int create)
{
        /* @create is deliberately ignored: look up only, no UDF_MAP_ flags */
        return __udf_get_block(inode, block, bh_result, 0);
}

/*
 * Extend the file with new blocks totaling 'new_block_bytes'.
 *
 * @inode:           inode being extended
 * @last_pos:        position of the last extent; on success it is left
 *                   pointing at the last extent written
 * @last_ext:        in-memory copy of the last extent (location and length
 *                   with the type bits); a zero length marks a "fake" extent
 *                   that does not exist on disk yet
 * @new_block_bytes: number of bytes to add as not recorded / not allocated
 *                   space
 *
 * The last extent is first rounded up to a block boundary, then grown if it
 * is itself a not recorded / not allocated extent, and finally as many new
 * not recorded / not allocated extents as needed are appended.
 *
 * Returns the number of extents added, or a negative errno. When adding an
 * extent fails, udf_truncate_extents() is called to remove what was created.
 */
static int udf_do_extend_file(struct inode *inode,
                              struct extent_position *last_pos,
                              struct kernel_long_ad *last_ext,
                              loff_t new_block_bytes)
{
        uint32_t add;
        /* fake: the last extent has zero length, i.e. is only a placeholder */
        int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
        struct super_block *sb = inode->i_sb;
        struct udf_inode_info *iinfo;
        int err;

        /* The previous extent is fake and we should not extend by anything
         * - there's nothing to do... */
        if (!new_block_bytes && fake)
                return 0;

        iinfo = UDF_I(inode);
        /* Round the last extent up to a multiple of block size */
        if (last_ext->extLength & (sb->s_blocksize - 1)) {
                last_ext->extLength =
                        (last_ext->extLength & UDF_EXTENT_FLAG_MASK) |
                        (((last_ext->extLength & UDF_EXTENT_LENGTH_MASK) +
                          sb->s_blocksize - 1) & ~(sb->s_blocksize - 1));
                iinfo->i_lenExtents =
                        (iinfo->i_lenExtents + sb->s_blocksize - 1) &
                        ~(sb->s_blocksize - 1);
        }

        add = 0;
        /* Can we merge with the previous extent? */
        if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
                                        EXT_NOT_RECORDED_NOT_ALLOCATED) {
                /*
                 * Room left in the last extent: a single extent is capped at
                 * (1 << 30) - blocksize bytes here.
                 */
                add = (1 << 30) - sb->s_blocksize -
                        (last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
                if (add > new_block_bytes)
                        add = new_block_bytes;
                new_block_bytes -= add;
                last_ext->extLength += add;
        }

        if (fake) {
                /* The placeholder is not on disk yet: add it as a new extent */
                err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
                                   last_ext->extLength, 1);
                if (err < 0)
                        goto out_err;
                count++;
        } else {
                struct kernel_lb_addr tmploc;
                uint32_t tmplen;

                /* Existing extent: rewrite it in place with the new length */
                udf_write_aext(inode, last_pos, &last_ext->extLocation,
                                last_ext->extLength, 1);

                /*
                 * We've rewritten the last extent. If we are going to add
                 * more extents, we may need to enter possible following
                 * empty indirect extent.
                 */
                if (new_block_bytes)
                        udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
        }
        iinfo->i_lenExtents += add;

        /* Managed to do everything necessary? */
        if (!new_block_bytes)
                goto out;

        /* All further extents will be NOT_RECORDED_NOT_ALLOCATED */
        last_ext->extLocation.logicalBlockNum = 0;
        last_ext->extLocation.partitionReferenceNum = 0;
        add = (1 << 30) - sb->s_blocksize;
        last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | add;

        /* Create enough extents to cover the whole hole */
        while (new_block_bytes > add) {
                new_block_bytes -= add;
                err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
                                   last_ext->extLength, 1);
                if (err)
                        goto out_err;
                iinfo->i_lenExtents += add;
                count++;
        }
        /* Remainder that does not fill a maximum-length extent */
        if (new_block_bytes) {
                last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
                        new_block_bytes;
                err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
                                   last_ext->extLength, 1);
                if (err)
                        goto out_err;
                iinfo->i_lenExtents += new_block_bytes;
                count++;
        }

out:
        /* last_pos should point to the last written extent... */
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
                last_pos->offset -= sizeof(struct short_ad);
        else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
                last_pos->offset -= sizeof(struct long_ad);
        else
                return -EIO;

        return count;
out_err:
        /* Remove extents we've created so far */
        udf_clear_extent_cache(inode);
        udf_truncate_extents(inode);
        return err;
}

/*
 * Grow the last extent of the file so that its length becomes @new_elen
 * bytes. Does nothing when the extent is already at least that long (it may
 * have been rounded up to the block size earlier). Otherwise both the extent
 * length and i_lenExtents grow by the difference and the extent is written
 * back at @last_pos.
 */
static void udf_do_extend_final_block(struct inode *inode,
                                      struct extent_position *last_pos,
                                      struct kernel_long_ad *last_ext,
                                      uint32_t new_elen)
{
        uint32_t cur_len = last_ext->extLength & UDF_EXTENT_LENGTH_MASK;
        uint32_t delta;

        if (cur_len >= new_elen)
                return;

        delta = new_elen - cur_len;
        last_ext->extLength += delta;
        UDF_I(inode)->i_lenExtents += delta;

        udf_write_aext(inode, last_pos, &last_ext->extLocation,
                       last_ext->extLength, 1);
}

/*
 * Extend the extent list of @inode so that it covers @newsize bytes.
 *
 * Only valid for inodes using short or long allocation descriptors; any
 * other allocation type hits BUG(). Takes i_data_sem for writing and
 * discards any preallocation first.
 *
 * Returns 0 on success or a negative errno from udf_do_extend_file().
 */
static int udf_extend_file(struct inode *inode, loff_t newsize)
{

        struct extent_position epos;
        struct kernel_lb_addr eloc;
        uint32_t elen;
        int8_t etype;
        struct super_block *sb = inode->i_sb;
        /* Block containing the new end of file */
        sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
        loff_t new_elen;
        int adsize;
        struct udf_inode_info *iinfo = UDF_I(inode);
        struct kernel_long_ad extent;
        int err = 0;
        bool within_last_ext;

        /* Size of one allocation descriptor for this inode */
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
                adsize = sizeof(struct short_ad);
        else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
                adsize = sizeof(struct long_ad);
        else
                BUG();

        down_write(&iinfo->i_data_sem);
        /*
         * When creating hole in file, just don't bother with preserving
         * preallocation. It likely won't be very useful anyway.
         */
        udf_discard_prealloc(inode);

        /* inode_bmap() fills in epos, eloc, elen and offset */
        etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
        /* -1 means no existing extent covers first_block */
        within_last_ext = (etype != -1);
        /* We don't expect extents past EOF... */
        WARN_ON_ONCE(within_last_ext &&
                     elen > ((loff_t)offset + 1) << inode->i_blkbits);

        if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
            (epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
                /* File has no extents at all or has empty last
                 * indirect extent! Create a fake extent... */
                extent.extLocation.logicalBlockNum = 0;
                extent.extLocation.partitionReferenceNum = 0;
                extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
        } else {
                /* Step back one descriptor and re-read the last extent */
                epos.offset -= adsize;
                etype = udf_next_aext(inode, &epos, &extent.extLocation,
                                      &extent.extLength, 0);
                /* Keep the extent type in the top bits of the length */
                extent.extLength |= etype << 30;
        }

        /*
         * Combine the block offset reported by inode_bmap() with the
         * in-block part of newsize (presumably the byte length measured from
         * the start of the located extent, or from the end of the last extent
         * when none was found - confirm against inode_bmap()).
         */
        new_elen = ((loff_t)offset << inode->i_blkbits) |
                                        (newsize & (sb->s_blocksize - 1));

        /* File has extent covering the new size (could happen when extending
         * inside a block)?
         */
        if (within_last_ext) {
                /* Extending file within the last file block */
                udf_do_extend_final_block(inode, &epos, &extent, new_elen);
        } else {
                err = udf_do_extend_file(inode, &epos, &extent, new_elen);
        }

        if (err < 0)
                goto out;
        /* A non-negative result is an extent count; report plain success */
        err = 0;
out:
        brelse(epos.bh);
        up_write(&iinfo->i_data_sem);
        return err;
}

/*
 * Map logical block map->lblk of @inode to a physical block, allocating a
 * block when the logical block is not backed by a recorded extent.
 *
 * The extent list is walked with udf_next_aext() until the extent covering
 * the byte offset of map->lblk is found or the list ends.
 *
 *  - If that extent is recorded and allocated, its block is returned in
 *    map->pblk and map->oflags is set to UDF_BLK_MAPPED.
 *  - Otherwise the block is taken from a not-recorded-but-allocated extent or
 *    requested from udf_new_block(); the affected extents are then split,
 *    optionally extended by preallocation, merged in the local laarr[] array
 *    and written back with udf_update_extents(). On success map->oflags is
 *    UDF_BLK_NEW | UDF_BLK_MAPPED.
 *
 * Returns 0 on success. On failure returns the negative value reported by
 * udf_do_extend_file() or udf_update_extents(), whatever udf_new_block()
 * stored in ret, or -EFSCORRUPTED when udf_get_pblock() yields block 0.
 * NOTE(review): locking requirements are not visible in this function -
 * confirm against the callers.
 */
static int inode_getblk(struct inode *inode, struct udf_map_rq *map)
{
        struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
        struct extent_position prev_epos, cur_epos, next_epos;
        int count = 0, startnum = 0, endnum = 0;
        uint32_t elen = 0, tmpelen;
        struct kernel_lb_addr eloc, tmpeloc;
        /* index into laarr[] of the extent read last; toggled before each
         * store, so the first extent read lands in laarr[0] */
        int c = 1;
        loff_t lbcount = 0, b_off = 0;
        udf_pblk_t newblocknum;
        sector_t offset = 0;
        int8_t etype;
        struct udf_inode_info *iinfo = UDF_I(inode);
        /* pgoal is the fallback allocation goal: it starts at the inode's own
         * block and is moved behind every allocated extent seen in the walk */
        udf_pblk_t goal = 0, pgoal = iinfo->i_location.logicalBlockNum;
        int lastblock = 0;
        bool isBeyondEOF;
        int ret = 0;

        /* all three positions start at the first allocation descriptor */
        prev_epos.offset = udf_file_entry_alloc_offset(inode);
        prev_epos.block = iinfo->i_location;
        prev_epos.bh = NULL;
        cur_epos = next_epos = prev_epos;
        /* byte offset of the requested block within the file */
        b_off = (loff_t)map->lblk << inode->i_sb->s_blocksize_bits;

        /* find the extent which contains the block we are looking for.
           alternate between laarr[0] and laarr[1] for locations of the
           current extent, and the previous extent */
        do {
                /* shift next -> cur -> prev; whenever two positions stop
                 * sharing a buffer head, drop the old reference and take
                 * one on the buffer that is being adopted */
                if (prev_epos.bh != cur_epos.bh) {
                        brelse(prev_epos.bh);
                        get_bh(cur_epos.bh);
                        prev_epos.bh = cur_epos.bh;
                }
                if (cur_epos.bh != next_epos.bh) {
                        brelse(cur_epos.bh);
                        get_bh(next_epos.bh);
                        cur_epos.bh = next_epos.bh;
                }

                /* elen still holds the length of the extent read in the
                 * previous pass (0 on the first pass), so lbcount is the
                 * number of bytes in front of the extent read below */
                lbcount += elen;

                prev_epos.block = cur_epos.block;
                cur_epos.block = next_epos.block;

                prev_epos.offset = cur_epos.offset;
                cur_epos.offset = next_epos.offset;

                etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 1);
                if (etype == -1)
                        break;

                c = !c;

                laarr[c].extLength = (etype << 30) | elen;
                laarr[c].extLocation = eloc;

                /* remember the block right behind the last extent that has
                 * a location, as a goal for a later allocation */
                if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))
                        pgoal = eloc.logicalBlockNum +
                                ((elen + inode->i_sb->s_blocksize - 1) >>
                                 inode->i_sb->s_blocksize_bits);

                count++;
        } while (lbcount + elen <= b_off);

        /* b_off becomes the byte distance from the start of the last extent
         * examined; offset is the same distance in blocks */
        b_off -= lbcount;
        offset = b_off >> inode->i_sb->s_blocksize_bits;
        /*
         * Move prev_epos and cur_epos into indirect extent if we are at
         * the pointer to it
         */
        udf_next_aext(inode, &prev_epos, &tmpeloc, &tmpelen, 0);
        udf_next_aext(inode, &cur_epos, &tmpeloc, &tmpelen, 0);

        /* if the extent is allocated and recorded, return the block
           if the extent is not a multiple of the blocksize, round up */

        if (etype == (EXT_RECORDED_ALLOCATED >> 30)) {
                if (elen & (inode->i_sb->s_blocksize - 1)) {
                        elen = EXT_RECORDED_ALLOCATED |
                                ((elen + inode->i_sb->s_blocksize - 1) &
                                 ~(inode->i_sb->s_blocksize - 1));
                        iinfo->i_lenExtents =
                                ALIGN(iinfo->i_lenExtents,
                                      inode->i_sb->s_blocksize);
                        udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
                }
                map->oflags = UDF_BLK_MAPPED;
                map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
                goto out_free;
        }

        /* Are we beyond EOF and preallocated extent? */
        if (etype == -1) {
                loff_t hole_len;

                isBeyondEOF = true;
                if (count) {
                        /* keep the last extent read in laarr[0] */
                        if (c)
                                laarr[0] = laarr[1];
                        startnum = 1;
                } else {
                        /* Create a fake extent when there's not one */
                        memset(&laarr[0].extLocation, 0x00,
                                sizeof(struct kernel_lb_addr));
                        laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
                        /* Will udf_do_extend_file() create real extent from
                           a fake one? */
                        startnum = (offset > 0);
                }
                /* Create extents for the hole between EOF and offset */
                hole_len = (loff_t)offset << inode->i_blkbits;
                ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len);
                if (ret < 0)
                        goto out_free;
                c = 0;
                offset = 0;
                /* NOTE(review): a non-negative return is added to the extent
                 * count here - presumably the number of extents created;
                 * confirm against udf_do_extend_file() */
                count += ret;
                /*
                 * Is there any real extent? - otherwise we overwrite the fake
                 * one...
                 */
                if (count)
                        c = !c;
                /* append a one-block hole that stands for the requested
                 * block; it is turned into a real extent further below */
                laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
                        inode->i_sb->s_blocksize;
                memset(&laarr[c].extLocation, 0x00,
                        sizeof(struct kernel_lb_addr));
                count++;
                endnum = c + 1;
                lastblock = 1;
        } else {
                isBeyondEOF = false;
                endnum = startnum = ((count > 2) ? 2 : count);

                /* if the current extent is in position 0,
                   swap it with the previous */
                if (!c && count != 1) {
                        laarr[2] = laarr[0];
                        laarr[0] = laarr[1];
                        laarr[1] = laarr[2];
                        c = 1;
                }

                /* if the current block is located in an extent,
                   read the next extent */
                etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 0);
                if (etype != -1) {
                        laarr[c + 1].extLength = (etype << 30) | elen;
                        laarr[c + 1].extLocation = eloc;
                        count++;
                        startnum++;
                        endnum++;
                } else
                        lastblock = 1;
        }

        /* if the current extent is not recorded but allocated, get the
         * block in the extent corresponding to the requested block */
        if ((laarr[c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30))
                newblocknum = laarr[c].extLocation.logicalBlockNum + offset;
        else { /* otherwise, allocate a new block */
                /* reuse the goal saved by the previous call when this
                 * request is for the logical block that follows it */
                if (iinfo->i_next_alloc_block == map->lblk)
                        goal = iinfo->i_next_alloc_goal;

                if (!goal) {
                        if (!(goal = pgoal)) /* XXX: what was intended here? */
                                goal = iinfo->i_location.logicalBlockNum + 1;
                }

                newblocknum = udf_new_block(inode->i_sb, inode,
                                iinfo->i_location.partitionReferenceNum,
                                goal, &ret);
                /* on failure, ret carries whatever udf_new_block() stored */
                if (!newblocknum)
                        goto out_free;
                if (isBeyondEOF)
                        iinfo->i_lenExtents += inode->i_sb->s_blocksize;
        }

        /* if the extent the requested block is located in contains multiple
         * blocks, split the extent into at most three extents. blocks prior
         * to requested block, requested block, and blocks after requested
         * block */
        udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum);

        if (!(map->iflags & UDF_MAP_NOPREALLOC))
                udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);

        /* merge any continuous blocks in laarr */
        udf_merge_extents(inode, laarr, &endnum);

        /* write back the new extents, inserting new extents if the new number
         * of extents is greater than the old number, and deleting extents if
         * the new number of extents is less than the old number */
        ret = udf_update_extents(inode, laarr, startnum, endnum, &prev_epos);
        if (ret < 0)
                goto out_free;

        map->pblk = udf_get_pblock(inode->i_sb, newblocknum,
                                iinfo->i_location.partitionReferenceNum, 0);
        if (!map->pblk) {
                ret = -EFSCORRUPTED;
                goto out_free;
        }
        map->oflags = UDF_BLK_NEW | UDF_BLK_MAPPED;
        /* save hints so that a request for the following logical block can
         * use the following physical block as its goal */
        iinfo->i_next_alloc_block = map->lblk + 1;
        iinfo->i_next_alloc_goal = newblocknum + 1;
        inode_set_ctime_current(inode);

        if (IS_SYNC(inode))
                udf_sync_inode(inode);
        else
                mark_inode_dirty(inode);
        ret = 0;
out_free:
        /* release the references held by the three extent positions */
        brelse(prev_epos.bh);
        brelse(cur_epos.bh);
        brelse(next_epos.bh);
        return ret;
}

/*
 * Carve the block at @offset out of the extent laarr[*c] and turn it into a
 * one-block recorded and allocated extent located at @newblocknum.
 *
 * Only extents that are not recorded (allocated or not) are touched; for any
 * other extent type the function does nothing. The extent is split into up
 * to three pieces: the blocks in front of the requested block, the requested
 * block itself and the blocks behind it. The entry following the extent is
 * shifted up to make room.
 *
 * @c is advanced so that on return it indexes the piece holding the
 * requested block; @endnum is increased by the number of entries added.
 */
static void udf_split_extents(struct inode *inode, int *c, int offset,
                               udf_pblk_t newblocknum,
                               struct kernel_long_ad *laarr, int *endnum)
{
        unsigned long blocksize = inode->i_sb->s_blocksize;
        unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;

        if ((laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30) ||
            (laarr[*c].extLength >> 30) ==
                                (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) {
                int curr = *c;
                /* extent length in blocks, rounded up */
                int blen = ((laarr[curr].extLength & UDF_EXTENT_LENGTH_MASK) +
                            blocksize - 1) >> blocksize_bits;
                int8_t etype = (laarr[curr].extLength >> 30);

                if (blen == 1)
                        ; /* single block: converted in place, no shifting */
                else if (!offset || blen == offset + 1) {
                        /* block is first or last in the extent: two pieces,
                         * so the following entry moves up by one slot */
                        laarr[curr + 2] = laarr[curr + 1];
                        laarr[curr + 1] = laarr[curr];
                } else {
                        /* block is in the middle: three pieces, so the
                         * following entry moves up by two slots */
                        laarr[curr + 3] = laarr[curr + 1];
                        laarr[curr + 2] = laarr[curr + 1] = laarr[curr];
                }

                if (offset) {
                        /* leading piece: the @offset blocks in front of the
                         * requested block */
                        if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
                                /* the leading blocks of an allocated extent
                                 * are freed and the piece becomes a hole */
                                udf_free_blocks(inode->i_sb, inode,
                                                &laarr[curr].extLocation,
                                                0, offset);
                                laarr[curr].extLength =
                                        EXT_NOT_RECORDED_NOT_ALLOCATED |
                                        (offset << blocksize_bits);
                                laarr[curr].extLocation.logicalBlockNum = 0;
                                laarr[curr].extLocation.
                                                partitionReferenceNum = 0;
                        } else
                                laarr[curr].extLength = (etype << 30) |
                                        (offset << blocksize_bits);
                        curr++;
                        (*c)++;
                        (*endnum)++;
                }

                /* the requested block: one recorded and allocated block */
                laarr[curr].extLocation.logicalBlockNum = newblocknum;
                if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))
                        laarr[curr].extLocation.partitionReferenceNum =
                                UDF_I(inode)->i_location.partitionReferenceNum;
                laarr[curr].extLength = EXT_RECORDED_ALLOCATED |
                        blocksize;
                curr++;

                if (blen != offset + 1) {
                        /* trailing piece: the blocks behind the requested
                         * block keep the original extent type */
                        if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30))
                                laarr[curr].extLocation.logicalBlockNum +=
                                                                offset + 1;
                        laarr[curr].extLength = (etype << 30) |
                                ((blen - (offset + 1)) << blocksize_bits);
                        curr++;
                        (*endnum)++;
                }
        }
}

static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
                                 struct kernel_long_ad *laarr,
                                 int *endnum)
{
        int start, length = 0, currlength = 0, i;

        if (*endnum >= (c + 1)) {
                if (!lastblock)
                        return;
                else
                        start = c;
        } else {
                if ((laarr[c + 1].extLength >> 30) ==
                                        (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
                        start = c + 1;
                        length = currlength =
                                (((laarr[c + 1].extLength &
                                        UDF_EXTENT_LENGTH_MASK) +
                                inode->i_sb->s_blocksize - 1) >>
                                inode->i_sb->s_blocksize_bits);
                } else
                        start = c;
        }

        for (i = start + 1; i <= *endnum; i++) {
                if (i == *endnum) {
                        if (lastblock)
                                length += UDF_DEFAULT_PREALLOC_BLOCKS;
                } else if ((laarr[i].extLength >> 30) ==
                                (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) {
                        length += (((laarr[i].extLength &
                                                UDF_EXTENT_LENGTH_MASK) +
                                    inode->i_sb->s_blocksize - 1) >>
                                    inode->i_sb->s_blocksize_bits);
                } else
                        break;
        }

        if (length) {
                int next = laarr[start].extLocation.logicalBlockNum +
                        (((laarr[start].extLength & UDF_EXTENT_LENGTH_MASK) +
                          inode->i_sb->s_blocksize - 1) >>
                          inode->i_sb->s_blocksize_bits);
                int numalloc = udf_prealloc_blocks(inode->i_sb, inode,
                                laarr[start].extLocation.partitionReferenceNum,
                                next, (UDF_DEFAULT_PREALLOC_BLOCKS > length ?
                                length : UDF_DEFAULT_PREALLOC_BLOCKS) -
                                currlength);
                if (numalloc)         {
                        if (start == (c + 1))
                                laarr[start].extLength +=
                                        (numalloc <<
                                         inode->i_sb->s_blocksize_bits);
                        else {
                                memmove(&laarr[c + 2], &laarr[c + 1],
                                        sizeof(struct long_ad) * (*endnum - (c + 1)));
                                (*endnum)++;
                                laarr[c + 1].extLocation.logicalBlockNum = next;
                                laarr[c + 1].extLocation.partitionReferenceNum =
                                        laarr[c].extLocation.
                                                        partitionReferenceNum;
                                laarr[c + 1].extLength =
                                        EXT_NOT_RECORDED_ALLOCATED |
                                        (numalloc <<
                                         inode->i_sb->s_blocksize_bits);
                                start = c + 1;
                        }

                        for (i = start + 1; numalloc && i < *endnum; i++) {
                                int elen = ((laarr[i].extLength &
                                                UDF_EXTENT_LENGTH_MASK) +
                                            inode->i_sb->s_blocksize - 1) >>
                                            inode->i_sb->s_blocksize_bits;

                                if (elen > numalloc) {
                                        laarr[i].extLength -=
                                                (numalloc <<
                                                 inode->i_sb->s_blocksize_bits);
                                        numalloc = 0;
                                } else {
                                        numalloc -= elen;
                                        if (*endnum > (i + 1))
                                                memmove(&laarr[i],
                                                        &laarr[i + 1],
                                                        sizeof(struct long_ad) *
                                                        (*endnum - (i + 1)));
                                        i--;
                                        (*endnum)--;
                                }
                        }
                        UDF_I(inode)->i_lenExtents +=
                                numalloc << inode->i_sb->s_blocksize_bits;
                }
        }
}

static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr,
                              int *endnum)
{
        int i;
        unsigned long blocksize = inode->i_sb->s_blocksize;
        unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;

        for (i = 0; i < (*endnum - 1); i++) {
                struct kernel_long_ad *li /*l[i]*/ = &laarr[i];
                struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1];

                if (((li->extLength >> 30) == (lip1->extLength >> 30)) &&
                        (((li->extLength >> 30) ==
                                (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) ||
                        ((lip1->extLocation.logicalBlockNum -
                          li->extLocation.logicalBlockNum) ==
                        (((li->extLength & UDF_EXTENT_LENGTH_MASK) +
                        blocksize - 1) >> blocksize_bits)))) {

                        if (((li->extLength & UDF_EXTENT_LENGTH_MASK) +
                             (lip1->extLength & UDF_EXTENT_LENGTH_MASK) +
                             blocksize - 1) <= UDF_EXTENT_LENGTH_MASK) {
                                li->extLength = lip1->extLength +
                                        (((li->extLength &
                                                UDF_EXTENT_LENGTH_MASK) +
                                         blocksize - 1) & ~(blocksize - 1));
                                if (*endnum > (i + 2))
                                        memmove(&laarr[i + 1], &laarr[i + 2],
                                                sizeof(struct long_ad) *
                                                (*endnum - (i + 2)));
                                i--;
                                (*endnum)--;
                        }
                } else if (((li->extLength >> 30) ==
                                (EXT_NOT_RECORDED_ALLOCATED >> 30)) &&
                           ((lip1->extLength >> 30) ==
                                (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) {
                        udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0,
                                        ((li->extLength &
                                          UDF_EXTENT_LENGTH_MASK) +
                                         blocksize - 1) >> blocksize_bits);
                        li->extLocation.logicalBlockNum = 0;
                        li->extLocation.partitionReferenceNum = 0;

                        if (((li->extLength & UDF_EXTENT_LENGTH_MASK) +
                             (lip1->extLength & UDF_EXTENT_LENGTH_MASK) +
                             blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) {
                                lip1->extLength = (lip1->extLength -
                                                   (li->extLength &
                                                   UDF_EXTENT_LENGTH_MASK) +
                                                   UDF_EXTENT_LENGTH_MASK) &
                                                   ~(blocksize - 1);
                                li->extLength = (li->extLength &
                                                 UDF_EXTENT_FLAG_MASK) +
                                                (UDF_EXTENT_LENGTH_MASK + 1) -
                                                blocksize;
                        } else {
                                li->extLength = lip1->extLength +
                                        (((li->extLength &
                                                UDF_EXTENT_LENGTH_MASK) +
                                          blocksize - 1) & ~(blocksize - 1));
                                if (*endnum > (i + 2))
                                        memmove(&laarr[i + 1], &laarr[i + 2],
                                                sizeof(struct long_ad) *
                                                (*endnum - (i + 2)));
                                i--;
                                (*endnum)--;
                        }
                } else if ((li->extLength >> 30) ==
                                        (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
                        udf_free_blocks(inode->i_sb, inode,
                                        &li->extLocation, 0,
                                        ((li->extLength &
                                                UDF_EXTENT_LENGTH_MASK) +
                                         blocksize - 1) >> blocksize_bits);
                        li->extLocation.logicalBlockNum = 0;
                        li->extLocation.partitionReferenceNum = 0;
                        li->extLength = (li->extLength &
                                                UDF_EXTENT_LENGTH_MASK) |
                                                EXT_NOT_RECORDED_NOT_ALLOCATED;
                }
        }
}

static int udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr,
                              int startnum, int endnum,
                              struct extent_position *epos)
{
        int start = 0, i;
        struct kernel_lb_addr tmploc;
        uint32_t tmplen;
        int err;

        if (startnum > endnum) {
                for (i = 0; i < (startnum - endnum); i++)
                        udf_delete_aext(inode, *epos);
        } else if (startnum < endnum) {
                for (i = 0; i < (endnum - startnum); i++) {
                        err = udf_insert_aext(inode, *epos,
                                              laarr[i].extLocation,
                                              laarr[i].extLength);
                        /*
                         * If we fail here, we are likely corrupting the extent
                         * list and leaking blocks. At least stop early to
                         * limit the damage.
                         */
                        if (err < 0)
                                return err;
                        udf_next_aext(inode, epos, &laarr[i].extLocation,
                                      &laarr[i].extLength, 1);
                        start++;
                }
        }

        for (i = start; i < endnum; i++) {
                udf_next_aext(inode, epos, &tmploc, &tmplen, 0);
                udf_write_aext(inode, epos, &laarr[i].extLocation,
                               laarr[i].extLength, 1);
        }
        return 0;
}

/*
 * Get the buffer for logical block @block of @inode.
 *
 * The block is mapped with preallocation disabled; when @create is non-zero
 * the mapping request also carries UDF_MAP_CREATE. A block reported as new
 * is zero-filled, marked uptodate and dirtied; any other block is read in.
 *
 * Returns the buffer head, or NULL. *@err receives the result of
 * udf_map_block(), -ENOMEM when no buffer could be obtained, or -EIO when
 * the read failed. NULL with *@err == 0 means the block is not mapped.
 */
struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block,
                              int create, int *err)
{
        struct udf_map_rq map = {
                .lblk = block,
                .iflags = UDF_MAP_NOPREALLOC | (create ? UDF_MAP_CREATE : 0),
        };
        struct buffer_head *bh = NULL;

        *err = udf_map_block(inode, &map);
        if (*err)
                return NULL;
        if (!(map.oflags & UDF_BLK_MAPPED))
                return NULL;

        bh = sb_getblk(inode->i_sb, map.pblk);
        if (!bh) {
                *err = -ENOMEM;
                return NULL;
        }

        if (!(map.oflags & UDF_BLK_NEW)) {
                /* existing block: its contents have to come from disk */
                if (bh_read(bh, 0) < 0) {
                        brelse(bh);
                        *err = -EIO;
                        return NULL;
                }
                return bh;
        }

        /* freshly allocated block: hand out zeroes, not stale data */
        lock_buffer(bh);
        memset(bh->b_data, 0x00, inode->i_sb->s_blocksize);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);
        mark_buffer_dirty_inode(bh, inode);
        return bh;
}

/*
 * Change the size of @inode to @newsize.
 *
 * Growing: data stored in the ICB stays there if the new size still fits
 * into one block together with the file entry; otherwise the file is
 * expanded out of the ICB first and then extended with udf_extend_file().
 * Shrinking (or keeping the size): in-ICB data has its tail zeroed; other
 * files get the partial last page handled by block_truncate_page() and
 * their extents truncated.
 *
 * On success mtime and ctime are updated and the inode is synced or marked
 * dirty. Returns 0, -EINVAL for inodes that are not a regular file,
 * directory or symlink, -EPERM for append-only or immutable inodes, or the
 * error of the helper that failed.
 */
int udf_setsize(struct inode *inode, loff_t newsize)
{
        struct udf_inode_info *iinfo;
        unsigned int bsize = i_blocksize(inode);
        int err = 0;

        if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
            !S_ISLNK(inode->i_mode))
                return -EINVAL;
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return -EPERM;

        filemap_invalidate_lock(inode->i_mapping);
        iinfo = UDF_I(inode);

        if (newsize > inode->i_size) {
                bool grown_in_icb = false;

                if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                        if (bsize >=
                            (udf_file_entry_alloc_offset(inode) + newsize)) {
                                /* still fits in the ICB: just record the
                                 * new length */
                                down_write(&iinfo->i_data_sem);
                                iinfo->i_lenAlloc = newsize;
                                up_write(&iinfo->i_data_sem);
                                grown_in_icb = true;
                        } else {
                                err = udf_expand_file_adinicb(inode);
                                if (err)
                                        goto out_unlock;
                        }
                }

                if (!grown_in_icb) {
                        err = udf_extend_file(inode, newsize);
                        if (err)
                                goto out_unlock;
                }

                truncate_setsize(inode, newsize);
        } else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                /* in-ICB data: clear everything behind the new end */
                down_write(&iinfo->i_data_sem);
                udf_clear_extent_cache(inode);
                memset(iinfo->i_data + iinfo->i_lenEAttr + newsize,
                       0x00, bsize - newsize -
                       udf_file_entry_alloc_offset(inode));
                iinfo->i_lenAlloc = newsize;
                truncate_setsize(inode, newsize);
                up_write(&iinfo->i_data_sem);
        } else {
                err = block_truncate_page(inode->i_mapping, newsize,
                                          udf_get_block);
                if (err)
                        goto out_unlock;

                truncate_setsize(inode, newsize);

                down_write(&iinfo->i_data_sem);
                udf_clear_extent_cache(inode);
                err = udf_truncate_extents(inode);
                up_write(&iinfo->i_data_sem);
                if (err)
                        goto out_unlock;
        }

        /* the size change succeeded: stamp mtime with the new ctime */
        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        if (IS_SYNC(inode))
                udf_sync_inode(inode);
        else
                mark_inode_dirty(inode);

out_unlock:
        filemap_invalidate_unlock(inode->i_mapping);
        return err;
}

/*
 * Maximum length of the linked list formed by the ICB hierarchy. The chosen
 * number is arbitrary: large enough that we hopefully don't limit any real
 * use of rewritten inodes on write-once media, yet small enough to avoid
 * looping for too long on corrupted media.
 */
#define UDF_MAX_ICB_NESTING 1024

/*
 * udf_read_inode - populate an in-core inode from its on-disk ICB
 * @inode:        inode to fill; UDF_I(inode)->i_location must already hold
 *                the address of the descriptor to read
 * @hidden_inode: true when the caller expects a hidden inode; such inodes
 *                are allowed to have an on-disk link count of zero
 *
 * Reads the tagged descriptor at the inode's location, follows strategy 4096
 * indirect entries (at most UDF_MAX_ICB_NESTING of them), copies the
 * descriptor's trailing area (extended attributes and allocation
 * descriptors) into iinfo->i_data and initialises ownership, mode, sizes,
 * timestamps and the inode/file operations according to the file type.
 *
 * An Unallocated Space Entry only gets its allocation descriptors copied;
 * none of the file attributes are set up for it.
 *
 * Returns 0 on success, -ESTALE when a non-hidden inode has a zero link
 * count, the error from udf_alloc_i_data() when the i_data buffer cannot be
 * allocated, and -EIO for every other failure (location out of range,
 * unreadable block, unexpected tag / strategy / allocation type / file type,
 * failed size sanity checks, device inode without a device specification
 * extended attribute).
 */
static int udf_read_inode(struct inode *inode, bool hidden_inode)
{
        struct buffer_head *bh = NULL;
        struct fileEntry *fe;
        struct extendedFileEntry *efe;
        uint16_t ident;
        struct udf_inode_info *iinfo = UDF_I(inode);
        struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
        struct kernel_lb_addr *iloc = &iinfo->i_location;
        unsigned int link_count;
        unsigned int indirections = 0;
        int bs = inode->i_sb->s_blocksize;
        int ret = -EIO;
        uint32_t uid, gid;
        struct timespec64 ts;

reread:
        /*
         * No buffer reference is held at this point (either nothing has been
         * read yet, or the previous one was released before jumping back
         * here), so the range checks may return directly.
         */
        if (iloc->partitionReferenceNum >= sbi->s_partitions) {
                udf_debug("partition reference: %u > logical volume partitions: %u\n",
                          iloc->partitionReferenceNum, sbi->s_partitions);
                return -EIO;
        }

        if (iloc->logicalBlockNum >=
            sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) {
                udf_debug("block=%u, partition=%u out of range\n",
                          iloc->logicalBlockNum, iloc->partitionReferenceNum);
                return -EIO;
        }

        /*
         * Set defaults, but the inode is still incomplete!
         * Note: get_new_inode() sets the following on a new inode:
         *      i_sb = sb
         *      i_no = ino
         *      i_flags = sb->s_flags
         *      i_state = 0
         * clean_inode(): zero fills and sets
         *      i_count = 1
         *      i_nlink = 1
         *      i_op = NULL;
         */
        bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident);
        if (!bh) {
                udf_err(inode->i_sb, "(ino %lu) failed !bh\n", inode->i_ino);
                return -EIO;
        }

        /* From here on every exit must go through "out" to release bh. */
        if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE &&
            ident != TAG_IDENT_USE) {
                udf_err(inode->i_sb, "(ino %lu) failed ident=%u\n",
                        inode->i_ino, ident);
                goto out;
        }

        /* Both views alias the same block; which one is valid depends on the tag. */
        fe = (struct fileEntry *)bh->b_data;
        efe = (struct extendedFileEntry *)bh->b_data;

        if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
                struct buffer_head *ibh;

                /* The block following the entry may hold an indirect entry. */
                ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident);
                if (ibh && ident == TAG_IDENT_IE) {
                        struct kernel_lb_addr loc;
                        struct indirectEntry *ie;

                        ie = (struct indirectEntry *)ibh->b_data;
                        loc = lelb_to_cpu(ie->indirectICB.extLocation);

                        if (ie->indirectICB.extLength) {
                                /*
                                 * A non-empty indirect ICB redirects us:
                                 * restart the read at the new location.
                                 */
                                brelse(ibh);
                                memcpy(&iinfo->i_location, &loc,
                                       sizeof(struct kernel_lb_addr));
                                if (++indirections > UDF_MAX_ICB_NESTING) {
                                        udf_err(inode->i_sb,
                                                "too many ICBs in ICB hierarchy"
                                                " (max %d supported)\n",
                                                UDF_MAX_ICB_NESTING);
                                        goto out;
                                }
                                brelse(bh);
                                goto reread;
                        }
                }
                brelse(ibh);
        } else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
                udf_err(inode->i_sb, "unsupported strategy type: %u\n",
                        le16_to_cpu(fe->icbTag.strategyType));
                goto out;
        }
        if (fe->icbTag.strategyType == cpu_to_le16(4))
                iinfo->i_strat4096 = 0;
        else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */
                iinfo->i_strat4096 = 1;

        iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) &
                                                        ICBTAG_FLAG_AD_MASK;
        if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_SHORT &&
            iinfo->i_alloc_type != ICBTAG_FLAG_AD_LONG &&
            iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
                ret = -EIO;
                goto out;
        }
        iinfo->i_hidden = hidden_inode;
        iinfo->i_unique = 0;
        iinfo->i_lenEAttr = 0;
        iinfo->i_lenExtents = 0;
        iinfo->i_lenAlloc = 0;
        iinfo->i_next_alloc_block = 0;
        iinfo->i_next_alloc_goal = 0;

        /*
         * Copy everything that follows the fixed-size descriptor header into
         * i_data; the header size depends on the descriptor type.
         */
        if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
                iinfo->i_efe = 1;
                iinfo->i_use = 0;
                ret = udf_alloc_i_data(inode, bs -
                                        sizeof(struct extendedFileEntry));
                if (ret)
                        goto out;
                memcpy(iinfo->i_data,
                       bh->b_data + sizeof(struct extendedFileEntry),
                       bs - sizeof(struct extendedFileEntry));
        } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
                iinfo->i_efe = 0;
                iinfo->i_use = 0;
                ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry));
                if (ret)
                        goto out;
                memcpy(iinfo->i_data,
                       bh->b_data + sizeof(struct fileEntry),
                       bs - sizeof(struct fileEntry));
        } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) {
                iinfo->i_efe = 0;
                iinfo->i_use = 1;
                iinfo->i_lenAlloc = le32_to_cpu(
                                ((struct unallocSpaceEntry *)bh->b_data)->
                                 lengthAllocDescs);
                ret = udf_alloc_i_data(inode, bs -
                                        sizeof(struct unallocSpaceEntry));
                if (ret)
                        goto out;
                memcpy(iinfo->i_data,
                       bh->b_data + sizeof(struct unallocSpaceEntry),
                       bs - sizeof(struct unallocSpaceEntry));
                /*
                 * Nothing more to set up for an unallocated space entry.
                 * Leave through "out" so that the reference on bh is
                 * dropped; returning directly here would leak it.
                 */
                ret = 0;
                goto out;
        }

        ret = -EIO;
        read_lock(&sbi->s_cred_lock);
        uid = le32_to_cpu(fe->uid);
        if (uid == UDF_INVALID_ID ||
            UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET))
                inode->i_uid = sbi->s_uid;
        else
                i_uid_write(inode, uid);

        gid = le32_to_cpu(fe->gid);
        if (gid == UDF_INVALID_ID ||
            UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET))
                inode->i_gid = sbi->s_gid;
        else
                i_gid_write(inode, gid);

        /* A mode configured in the superblock info overrides the on-disk one. */
        if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY &&
                        sbi->s_fmode != UDF_INVALID_MODE)
                inode->i_mode = sbi->s_fmode;
        else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY &&
                        sbi->s_dmode != UDF_INVALID_MODE)
                inode->i_mode = sbi->s_dmode;
        else
                inode->i_mode = udf_convert_permissions(fe);
        inode->i_mode &= ~sbi->s_umask;
        /* Remember the permission bits that have no i_mode equivalent. */
        iinfo->i_extraPerms = le32_to_cpu(fe->permissions) & ~FE_MAPPED_PERMS;

        read_unlock(&sbi->s_cred_lock);

        link_count = le16_to_cpu(fe->fileLinkCount);
        if (!link_count) {
                /* Only hidden inodes may legitimately have no links. */
                if (!hidden_inode) {
                        ret = -ESTALE;
                        goto out;
                }
                link_count = 1;
        }
        set_nlink(inode, link_count);

        inode->i_size = le64_to_cpu(fe->informationLength);
        iinfo->i_lenExtents = inode->i_size;

        if (iinfo->i_efe == 0) {
                inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
                        (inode->i_sb->s_blocksize_bits - 9);

                udf_disk_stamp_to_time(&ts, fe->accessTime);
                inode_set_atime_to_ts(inode, ts);
                udf_disk_stamp_to_time(&ts, fe->modificationTime);
                inode_set_mtime_to_ts(inode, ts);
                udf_disk_stamp_to_time(&ts, fe->attrTime);
                inode_set_ctime_to_ts(inode, ts);

                iinfo->i_unique = le64_to_cpu(fe->uniqueID);
                iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
                iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs);
                iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint);
                iinfo->i_streamdir = 0;
                iinfo->i_lenStreams = 0;
        } else {
                inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
                    (inode->i_sb->s_blocksize_bits - 9);

                udf_disk_stamp_to_time(&ts, efe->accessTime);
                inode_set_atime_to_ts(inode, ts);
                udf_disk_stamp_to_time(&ts, efe->modificationTime);
                inode_set_mtime_to_ts(inode, ts);
                udf_disk_stamp_to_time(&ts, efe->attrTime);
                inode_set_ctime_to_ts(inode, ts);
                udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime);

                iinfo->i_unique = le64_to_cpu(efe->uniqueID);
                iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
                iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs);
                iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);

                /* Named streams */
                iinfo->i_streamdir = (efe->streamDirectoryICB.extLength != 0);
                iinfo->i_locStreamdir =
                        lelb_to_cpu(efe->streamDirectoryICB.extLocation);
                /* i_lenStreams is objectSize minus the main data size, floored at 0. */
                iinfo->i_lenStreams = le64_to_cpu(efe->objectSize);
                if (iinfo->i_lenStreams >= inode->i_size)
                        iinfo->i_lenStreams -= inode->i_size;
                else
                        iinfo->i_lenStreams = 0;
        }
        inode->i_generation = iinfo->i_unique;

        /*
         * Sanity check length of allocation descriptors and extended attrs to
         * avoid integer overflows
         */
        if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs)
                goto out;
        /* Now do exact checks */
        if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs)
                goto out;
        /* Sanity checks for files in ICB so that we don't get confused later */
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                /*
                 * For file in ICB data is stored in allocation descriptor
                 * so sizes should match
                 */
                if (iinfo->i_lenAlloc != inode->i_size)
                        goto out;
                /* File in ICB has to fit in there... */
                if (inode->i_size > bs - udf_file_entry_alloc_offset(inode))
                        goto out;
        }

        switch (fe->icbTag.fileType) {
        case ICBTAG_FILE_TYPE_DIRECTORY:
                inode->i_op = &udf_dir_inode_operations;
                inode->i_fop = &udf_dir_operations;
                inode->i_mode |= S_IFDIR;
                /* In-core directories carry one link more than stored on disk. */
                inc_nlink(inode);
                break;
        case ICBTAG_FILE_TYPE_REALTIME:
        case ICBTAG_FILE_TYPE_REGULAR:
        case ICBTAG_FILE_TYPE_UNDEF:
        case ICBTAG_FILE_TYPE_VAT20:
                inode->i_data.a_ops = &udf_aops;
                inode->i_op = &udf_file_inode_operations;
                inode->i_fop = &udf_file_operations;
                inode->i_mode |= S_IFREG;
                break;
        case ICBTAG_FILE_TYPE_BLOCK:
                inode->i_mode |= S_IFBLK;
                break;
        case ICBTAG_FILE_TYPE_CHAR:
                inode->i_mode |= S_IFCHR;
                break;
        case ICBTAG_FILE_TYPE_FIFO:
                init_special_inode(inode, inode->i_mode | S_IFIFO, 0);
                break;
        case ICBTAG_FILE_TYPE_SOCKET:
                init_special_inode(inode, inode->i_mode | S_IFSOCK, 0);
                break;
        case ICBTAG_FILE_TYPE_SYMLINK:
                inode->i_data.a_ops = &udf_symlink_aops;
                inode->i_op = &udf_symlink_inode_operations;
                inode_nohighmem(inode);
                inode->i_mode = S_IFLNK | 0777;
                break;
        case ICBTAG_FILE_TYPE_MAIN:
                udf_debug("METADATA FILE-----\n");
                break;
        case ICBTAG_FILE_TYPE_MIRROR:
                udf_debug("METADATA MIRROR FILE-----\n");
                break;
        case ICBTAG_FILE_TYPE_BITMAP:
                udf_debug("METADATA BITMAP FILE-----\n");
                break;
        default:
                udf_err(inode->i_sb, "(ino %lu) failed unknown file type=%u\n",
                        inode->i_ino, fe->icbTag.fileType);
                goto out;
        }
        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
                /* Device numbers live in extended attribute type 12, subtype 1. */
                struct deviceSpec *dsea =
                        (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
                if (dsea) {
                        init_special_inode(inode, inode->i_mode,
                                MKDEV(le32_to_cpu(dsea->majorDeviceIdent),
                                      le32_to_cpu(dsea->minorDeviceIdent)));
                        /* Developer ID ??? */
                } else
                        goto out;
        }
        ret = 0;
out:
        brelse(bh);
        return ret;
}

/*
 * Allocate the buffer that backs iinfo->i_data.
 *
 * @inode: inode whose UDF private info receives the buffer
 * @size:  number of bytes to allocate
 *
 * Returns 0 on success or -ENOMEM when the allocation fails. The previous
 * value of i_data is overwritten without being freed.
 */
static int udf_alloc_i_data(struct inode *inode, size_t size)
{
        struct udf_inode_info *info = UDF_I(inode);

        info->i_data = kmalloc(size, GFP_KERNEL);

        return info->i_data ? 0 : -ENOMEM;
}

/*
 * Build the permission part of i_mode from an on-disk file entry.
 *
 * The low three bits of each five-bit on-disk permission group (other,
 * group, owner) are packed into the usual three-bit triplets, and the
 * setuid / setgid / sticky ICB tag flags become S_ISUID / S_ISGID / S_ISVTX.
 */
static umode_t udf_convert_permissions(struct fileEntry *fe)
{
        uint32_t perms = le32_to_cpu(fe->permissions);
        uint32_t icb_flags = le16_to_cpu(fe->icbTag.flags);
        umode_t mode;

        /* other, group and owner triplets, in that order */
        mode = perms & 0007;
        mode |= (perms >> 2) & 0070;
        mode |= (perms >> 4) & 0700;

        if (icb_flags & ICBTAG_FLAG_SETUID)
                mode |= S_ISUID;
        if (icb_flags & ICBTAG_FLAG_SETGID)
                mode |= S_ISGID;
        if (icb_flags & ICBTAG_FLAG_STICKY)
                mode |= S_ISVTX;

        return mode;
}

/*
 * Recompute the delete permission bits kept in i_extraPerms from @mode.
 *
 * UDF 2.01 sec. 3.3.3.3 Note 2: in Unix, delete permission tracks write, so
 * each of the owner/group/other delete bits mirrors the matching write bit.
 * All other bits of i_extraPerms are left untouched.
 */
void udf_update_extra_perms(struct inode *inode, umode_t mode)
{
        struct udf_inode_info *info = UDF_I(inode);
        uint32_t delete_bits = 0;

        if (mode & 0200)
                delete_bits |= FE_PERM_U_DELETE;
        if (mode & 0020)
                delete_bits |= FE_PERM_G_DELETE;
        if (mode & 0002)
                delete_bits |= FE_PERM_O_DELETE;

        info->i_extraPerms = (info->i_extraPerms & ~FE_DELETE_PERMS) |
                             delete_bits;
}

/*
 * Write the inode back to disk; the write is synchronous only when the
 * writeback request asks for WB_SYNC_ALL.
 */
int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
{
        int do_sync = (wbc->sync_mode == WB_SYNC_ALL);

        return udf_update_inode(inode, do_sync);
}

/* Write the inode back to disk and wait for the write to complete. */
static int udf_sync_inode(struct inode *inode)
{
        const int do_sync = 1;

        return udf_update_inode(inode, do_sync);
}

static void udf_adjust_time(struct udf_inode_info *iinfo, struct timespec64 time)
{
        if (iinfo->i_crtime.tv_sec > time.tv_sec ||
            (iinfo->i_crtime.tv_sec == time.tv_sec &&
             iinfo->i_crtime.tv_nsec > time.tv_nsec))
                iinfo->i_crtime = time;
}

/*
 * udf_update_inode - rebuild the on-disk descriptor of an inode and write it
 * @inode:   inode to write back
 * @do_sync: non-zero to write the buffer synchronously and report I/O errors
 *
 * The block at iinfo->i_location is zeroed and regenerated from the in-core
 * state as an Unallocated Space Entry, a File Entry or an Extended File
 * Entry (depending on i_use / i_efe), followed by the contents of i_data.
 * The descriptor tag (version, serial number, location, CRC, checksum) is
 * recomputed before the buffer is marked dirty.
 *
 * Returns 0 on success, -EIO when the buffer cannot be obtained or, for a
 * synchronous write, when the buffer reports a write I/O error.
 */
static int udf_update_inode(struct inode *inode, int do_sync)
{
        struct buffer_head *bh = NULL;
        struct fileEntry *fe;
        struct extendedFileEntry *efe;
        uint64_t lb_recorded;
        uint32_t udfperms;
        uint16_t icbflags;
        uint16_t crclen;
        int err = 0;
        struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
        unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
        struct udf_inode_info *iinfo = UDF_I(inode);

        bh = sb_getblk(inode->i_sb,
                        udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0));
        if (!bh) {
                udf_debug("getblk failure\n");
                return -EIO;
        }

        /* The descriptor is regenerated from scratch under the buffer lock. */
        lock_buffer(bh);
        memset(bh->b_data, 0, inode->i_sb->s_blocksize);
        /* Both views alias the same block; i_efe decides which one is used. */
        fe = (struct fileEntry *)bh->b_data;
        efe = (struct extendedFileEntry *)bh->b_data;

        if (iinfo->i_use) {
                /* Unallocated space entry: only the allocation descriptors. */
                struct unallocSpaceEntry *use =
                        (struct unallocSpaceEntry *)bh->b_data;

                use->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
                memcpy(bh->b_data + sizeof(struct unallocSpaceEntry),
                       iinfo->i_data, inode->i_sb->s_blocksize -
                                        sizeof(struct unallocSpaceEntry));
                use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
                crclen = sizeof(struct unallocSpaceEntry);

                goto finish;
        }

        /* With the *_FORGET flags set, the invalid ID is recorded instead. */
        if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
                fe->uid = cpu_to_le32(UDF_INVALID_ID);
        else
                fe->uid = cpu_to_le32(i_uid_read(inode));

        if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET))
                fe->gid = cpu_to_le32(UDF_INVALID_ID);
        else
                fe->gid = cpu_to_le32(i_gid_read(inode));

        /*
         * Spread the three rwx triplets of i_mode into the on-disk layout
         * (the reverse of udf_convert_permissions()) and add back the bits
         * that have no i_mode equivalent.
         */
        udfperms = ((inode->i_mode & 0007)) |
                   ((inode->i_mode & 0070) << 2) |
                   ((inode->i_mode & 0700) << 4);

        udfperms |= iinfo->i_extraPerms;
        fe->permissions = cpu_to_le32(udfperms);

        /*
         * Directories are stored with one link less than i_nlink (the read
         * side adds it back); hidden inodes are stored with zero links.
         */
        if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0)
                fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1);
        else {
                if (iinfo->i_hidden)
                        fe->fileLinkCount = cpu_to_le16(0);
                else
                        fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
        }

        fe->informationLength = cpu_to_le64(inode->i_size);

        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
                /*
                 * Device numbers are kept in extended attribute type 12,
                 * subtype 1; create the attribute if it does not exist yet.
                 */
                struct regid *eid;
                struct deviceSpec *dsea =
                        (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1);
                if (!dsea) {
                        /*
                         * NOTE(review): the result of udf_add_extendedattr()
                         * is dereferenced without a NULL check - confirm it
                         * cannot fail here.
                         */
                        dsea = (struct deviceSpec *)
                                udf_add_extendedattr(inode,
                                                     sizeof(struct deviceSpec) +
                                                     sizeof(struct regid), 12, 0x3);
                        dsea->attrType = cpu_to_le32(12);
                        dsea->attrSubtype = 1;
                        dsea->attrLength = cpu_to_le32(
                                                sizeof(struct deviceSpec) +
                                                sizeof(struct regid));
                        dsea->impUseLength = cpu_to_le32(sizeof(struct regid));
                }
                eid = (struct regid *)dsea->impUse;
                memset(eid, 0, sizeof(*eid));
                strcpy(eid->ident, UDF_ID_DEVELOPER);
                eid->identSuffix[0] = UDF_OS_CLASS_UNIX;
                eid->identSuffix[1] = UDF_OS_ID_LINUX;
                dsea->majorDeviceIdent = cpu_to_le32(imajor(inode));
                dsea->minorDeviceIdent = cpu_to_le32(iminor(inode));
        }

        /* Scale i_blocks down by (blocksize_bits - 9) bits, rounding up. */
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
                lb_recorded = 0; /* No extents => no blocks! */
        else
                lb_recorded =
                        (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
                        (blocksize_bits - 9);

        if (iinfo->i_efe == 0) {
                /* Plain file entry: i_data follows struct fileEntry. */
                memcpy(bh->b_data + sizeof(struct fileEntry),
                       iinfo->i_data,
                       inode->i_sb->s_blocksize - sizeof(struct fileEntry));
                fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);

                udf_time_to_disk_stamp(&fe->accessTime, inode_get_atime(inode));
                udf_time_to_disk_stamp(&fe->modificationTime, inode_get_mtime(inode));
                udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode));
                memset(&(fe->impIdent), 0, sizeof(struct regid));
                strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
                fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
                fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
                fe->uniqueID = cpu_to_le64(iinfo->i_unique);
                fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr);
                fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
                fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint);
                fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE);
                crclen = sizeof(struct fileEntry);
        } else {
                /* Extended file entry: i_data follows struct extendedFileEntry. */
                memcpy(bh->b_data + sizeof(struct extendedFileEntry),
                       iinfo->i_data,
                       inode->i_sb->s_blocksize -
                                        sizeof(struct extendedFileEntry));
                /* objectSize covers the main data plus the named streams. */
                efe->objectSize =
                        cpu_to_le64(inode->i_size + iinfo->i_lenStreams);
                efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);

                if (iinfo->i_streamdir) {
                        struct long_ad *icb_lad = &efe->streamDirectoryICB;

                        icb_lad->extLocation =
                                cpu_to_lelb(iinfo->i_locStreamdir);
                        icb_lad->extLength =
                                cpu_to_le32(inode->i_sb->s_blocksize);
                }

                /* Keep the creation time no later than atime, mtime and ctime. */
                udf_adjust_time(iinfo, inode_get_atime(inode));
                udf_adjust_time(iinfo, inode_get_mtime(inode));
                udf_adjust_time(iinfo, inode_get_ctime(inode));

                udf_time_to_disk_stamp(&efe->accessTime,
                                       inode_get_atime(inode));
                udf_time_to_disk_stamp(&efe->modificationTime,
                                       inode_get_mtime(inode));
                udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
                udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode));

                memset(&(efe->impIdent), 0, sizeof(efe->impIdent));
                strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
                efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
                efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
                efe->uniqueID = cpu_to_le64(iinfo->i_unique);
                efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr);
                efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc);
                efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint);
                efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE);
                crclen = sizeof(struct extendedFileEntry);
        }

finish:
        /* The ICB tag is written through "fe" for every descriptor type. */
        if (iinfo->i_strat4096) {
                fe->icbTag.strategyType = cpu_to_le16(4096);
                fe->icbTag.strategyParameter = cpu_to_le16(1);
                fe->icbTag.numEntries = cpu_to_le16(2);
        } else {
                fe->icbTag.strategyType = cpu_to_le16(4);
                fe->icbTag.numEntries = cpu_to_le16(1);
        }

        /* A mode matching none of these leaves fileType at its zeroed value. */
        if (iinfo->i_use)
                fe->icbTag.fileType = ICBTAG_FILE_TYPE_USE;
        else if (S_ISDIR(inode->i_mode))
                fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY;
        else if (S_ISREG(inode->i_mode))
                fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR;
        else if (S_ISLNK(inode->i_mode))
                fe->icbTag.fileType = ICBTAG_FILE_TYPE_SYMLINK;
        else if (S_ISBLK(inode->i_mode))
                fe->icbTag.fileType = ICBTAG_FILE_TYPE_BLOCK;
        else if (S_ISCHR(inode->i_mode))
                fe->icbTag.fileType = ICBTAG_FILE_TYPE_CHAR;
        else if (S_ISFIFO(inode->i_mode))
                fe->icbTag.fileType = ICBTAG_FILE_TYPE_FIFO;
        else if (S_ISSOCK(inode->i_mode))
                fe->icbTag.fileType = ICBTAG_FILE_TYPE_SOCKET;

        /*
         * Allocation type plus setuid/setgid/sticky from i_mode; any other
         * flag bits currently in the buffer are carried over.
         * NOTE(review): the buffer was zeroed above and nothing visible here
         * writes icbTag.flags before this point, so the carried-over part
         * looks like it is always 0 - confirm.
         */
        icbflags =        iinfo->i_alloc_type |
                        ((inode->i_mode & S_ISUID) ? ICBTAG_FLAG_SETUID : 0) |
                        ((inode->i_mode & S_ISGID) ? ICBTAG_FLAG_SETGID : 0) |
                        ((inode->i_mode & S_ISVTX) ? ICBTAG_FLAG_STICKY : 0) |
                        (le16_to_cpu(fe->icbTag.flags) &
                                ~(ICBTAG_FLAG_AD_MASK | ICBTAG_FLAG_SETUID |
                                ICBTAG_FLAG_SETGID | ICBTAG_FLAG_STICKY));

        fe->icbTag.flags = cpu_to_le16(icbflags);
        /* Descriptor version depends on the UDF revision of the volume. */
        if (sbi->s_udfrev >= 0x0200)
                fe->descTag.descVersion = cpu_to_le16(3);
        else
                fe->descTag.descVersion = cpu_to_le16(2);
        fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number);
        fe->descTag.tagLocation = cpu_to_le32(
                                        iinfo->i_location.logicalBlockNum);
        /*
         * The CRC covers the descriptor body after the tag: fixed header,
         * extended attributes and allocation descriptors.
         */
        crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag);
        fe->descTag.descCRCLength = cpu_to_le16(crclen);
        fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag),
                                                  crclen));
        fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);

        set_buffer_uptodate(bh);
        unlock_buffer(bh);

        /* write the data blocks */
        mark_buffer_dirty(bh);
        if (do_sync) {
                /* Synchronous write: wait for the I/O and report failures. */
                sync_dirty_buffer(bh);
                if (buffer_write_io_error(bh)) {
                        udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n",
                                 inode->i_ino);
                        err = -EIO;
                }
        }
        brelse(bh);

        return err;
}

/*
 * __udf_iget - look up or read the inode stored at a logical block address
 * @sb:           superblock of the filesystem
 * @ino:          location of the inode's descriptor
 * @hidden_inode: whether the caller expects a hidden inode
 *
 * The physical block number of @ino serves as the inode number. An inode
 * that is already cached is returned as is, unless its hidden state
 * disagrees with @hidden_inode, in which case the reference is dropped and
 * -EFSCORRUPTED is returned. A new inode is filled in by udf_read_inode().
 *
 * Returns the inode or an ERR_PTR() (-ENOMEM, -EFSCORRUPTED, or the error
 * from udf_read_inode()).
 */
struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
                         bool hidden_inode)
{
        unsigned long ino_nr = udf_get_lb_pblock(sb, ino, 0);
        struct udf_inode_info *iinfo;
        struct inode *inode;
        int err;

        inode = iget_locked(sb, ino_nr);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        iinfo = UDF_I(inode);

        if (inode->i_state & I_NEW) {
                /* Fresh inode: record where it lives and read it from disk. */
                memcpy(&iinfo->i_location, ino, sizeof(struct kernel_lb_addr));
                err = udf_read_inode(inode, hidden_inode);
                if (err < 0) {
                        iget_failed(inode);
                        return ERR_PTR(err);
                }
                unlock_new_inode(inode);
                return inode;
        }

        /* Cached inode: it must agree with the caller about being hidden. */
        if (iinfo->i_hidden == hidden_inode)
                return inode;

        iput(inode);
        return ERR_PTR(-EFSCORRUPTED);
}

/*
 * udf_setup_indirect_aext - chain a new allocation extent block to an inode
 * @inode: inode whose allocation descriptors are being extended
 * @block: logical block (in the partition of @epos) to use for the new
 *         allocation extent descriptor
 * @epos:  current end position; on success it is replaced by the position of
 *         the first free descriptor slot inside the new block
 *
 * The new block is zeroed, tagged as an allocation extent descriptor and
 * linked from the current position with an EXT_NEXT_EXTENT_ALLOCDESCS
 * extent. If the current area has no room left for that link, the last
 * descriptor is first moved into the new block and its slot reused.
 *
 * Returns 0 on success, -EIO when the inode's allocation type is neither
 * short nor long descriptors or when the buffer cannot be obtained.
 */
int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block,
                            struct extent_position *epos)
{
        struct super_block *sb = inode->i_sb;
        struct extent_position nepos;
        struct allocExtDesc *aed;
        struct buffer_head *bh;
        int ver, adsize;

        switch (UDF_I(inode)->i_alloc_type) {
        case ICBTAG_FLAG_AD_SHORT:
                adsize = sizeof(struct short_ad);
                break;
        case ICBTAG_FLAG_AD_LONG:
                adsize = sizeof(struct long_ad);
                break;
        default:
                return -EIO;
        }

        /* The new block lives in the same partition as the current position. */
        nepos.block.logicalBlockNum = block;
        nepos.block.partitionReferenceNum = epos->block.partitionReferenceNum;

        bh = sb_getblk(sb, udf_get_lb_pblock(sb, &nepos.block, 0));
        if (!bh)
                return -EIO;

        lock_buffer(bh);
        memset(bh->b_data, 0x00, sb->s_blocksize);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);
        mark_buffer_dirty_inode(bh, inode);

        aed = (struct allocExtDesc *)(bh->b_data);
        if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) {
                aed->previousAllocExtLocation =
                                cpu_to_le32(epos->block.logicalBlockNum);
        }
        aed->lengthAllocDescs = cpu_to_le32(0);

        ver = (UDF_SB(sb)->s_udfrev >= 0x0200) ? 3 : 2;
        udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block,
                    sizeof(struct tag));

        /* First free slot of the new block is right behind its header. */
        nepos.offset = sizeof(struct allocExtDesc);
        nepos.bh = bh;

        if (epos->offset + adsize <= sb->s_blocksize) {
                /* Room is left: simply append the link to the new block. */
                __udf_add_aext(inode, epos, &nepos.block,
                               sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0);
        } else {
                /*
                 * No room for the link: move the current last extent into
                 * the new block and overwrite its old slot with the link.
                 */
                struct kernel_lb_addr last_loc;
                uint32_t last_len;
                int last_type;

                epos->offset -= adsize;
                last_type = udf_current_aext(inode, epos, &last_loc,
                                             &last_len, 0);
                last_len |= ((uint32_t)last_type) << 30;

                __udf_add_aext(inode, &nepos, &last_loc, last_len, 1);
                udf_write_aext(inode, epos, &nepos.block,
                               sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0);
        }

        /* Hand the new position over to the caller. */
        brelse(epos->bh);
        *epos = nepos;

        return 0;
}

/*
 * Append an extent at @epos, which should be the first free descriptor slot
 * of the inode or of an indirect extent block. The caller is responsible for
 * making sure there is enough room; use udf_add_aext() when that has not
 * been checked.
 *
 * After writing the descriptor, the length of the area it belongs to is
 * grown by one descriptor: i_lenAlloc for descriptors kept in the inode,
 * lengthAllocDescs (plus a tag update) for an indirect extent block.
 *
 * Returns 0, or -EIO when the inode's allocation type is neither short nor
 * long descriptors.
 */
int __udf_add_aext(struct inode *inode, struct extent_position *epos,
                   struct kernel_lb_addr *eloc, uint32_t elen, int inc)
{
        struct udf_inode_info *iinfo = UDF_I(inode);
        struct allocExtDesc *aed;
        int adsize;

        switch (iinfo->i_alloc_type) {
        case ICBTAG_FLAG_AD_SHORT:
                adsize = sizeof(struct short_ad);
                break;
        case ICBTAG_FLAG_AD_LONG:
                adsize = sizeof(struct long_ad);
                break;
        default:
                return -EIO;
        }

        /* Complain if @epos is not at the end of its descriptor area. */
        if (epos->bh) {
                aed = (struct allocExtDesc *)epos->bh->b_data;
                WARN_ON(le32_to_cpu(aed->lengthAllocDescs) !=
                        epos->offset - sizeof(struct allocExtDesc));
                WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize);
        } else {
                WARN_ON(iinfo->i_lenAlloc !=
                        epos->offset - udf_file_entry_alloc_offset(inode));
        }

        udf_write_aext(inode, epos, eloc, elen, inc);

        if (epos->bh) {
                aed = (struct allocExtDesc *)epos->bh->b_data;
                le32_add_cpu(&aed->lengthAllocDescs, adsize);
                /*
                 * epos->offset has already been advanced when @inc is set,
                 * hence the conditional adjustment of the tagged length.
                 */
                if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) &&
                    UDF_SB(inode->i_sb)->s_udfrev < 0x0201)
                        udf_update_tag(epos->bh->b_data,
                                       sizeof(struct allocExtDesc));
                else
                        udf_update_tag(epos->bh->b_data,
                                       epos->offset + (inc ? 0 : adsize));
                mark_buffer_dirty_inode(epos->bh, inode);
        } else {
                iinfo->i_lenAlloc += adsize;
                mark_inode_dirty(inode);
        }

        return 0;
}

/*
 * Append an extent at @epos, which should be the first free descriptor slot
 * of the inode or of an indirect extent block. When fewer than two
 * descriptors fit into the current area, a new block is allocated and linked
 * in as an indirect extent block before the extent is added.
 *
 * Returns 0 on success, -EIO for an unsupported allocation type, -ENOSPC
 * when no block can be allocated, or the error from
 * udf_setup_indirect_aext().
 */
int udf_add_aext(struct inode *inode, struct extent_position *epos,
                 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
{
        struct super_block *sb = inode->i_sb;
        int adsize;

        switch (UDF_I(inode)->i_alloc_type) {
        case ICBTAG_FLAG_AD_SHORT:
                adsize = sizeof(struct short_ad);
                break;
        case ICBTAG_FLAG_AD_LONG:
                adsize = sizeof(struct long_ad);
                break;
        default:
                return -EIO;
        }

        /* Need room for this descriptor plus a possible link descriptor. */
        if (epos->offset + (2 * adsize) > sb->s_blocksize) {
                udf_pblk_t indirect_block;
                int err;

                indirect_block = udf_new_block(sb, NULL,
                                               epos->block.partitionReferenceNum,
                                               epos->block.logicalBlockNum,
                                               &err);
                if (!indirect_block)
                        return -ENOSPC;

                err = udf_setup_indirect_aext(inode, indirect_block, epos);
                if (err)
                        return err;
        }

        return __udf_add_aext(inode, epos, eloc, elen, inc);
}

/*
 * Store the extent (@eloc, @elen) into the descriptor slot addressed by
 * @epos, using the inode's allocation descriptor format.
 *
 * The slot lives in the in-inode data area when @epos has no buffer attached
 * and in epos->bh otherwise. After writing, the containing object is marked
 * dirty; for a buffer the tag is refreshed first unless strict mode is set
 * and the UDF revision is below 0x0201. If @inc is non-zero, epos->offset is
 * advanced past the descriptor just written. Nothing is written for an
 * allocation type other than short/long descriptors.
 */
void udf_write_aext(struct inode *inode, struct extent_position *epos,
                    struct kernel_lb_addr *eloc, uint32_t elen, int inc)
{
        struct udf_inode_info *iinfo = UDF_I(inode);
        uint8_t *slot;
        int adsize;

        if (epos->bh)
                slot = epos->bh->b_data + epos->offset;
        else
                slot = iinfo->i_data + epos->offset -
                        udf_file_entry_alloc_offset(inode) +
                        iinfo->i_lenEAttr;

        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) {
                struct short_ad *sad = (struct short_ad *)slot;

                sad->extLength = cpu_to_le32(elen);
                sad->extPosition = cpu_to_le32(eloc->logicalBlockNum);
                adsize = sizeof(struct short_ad);
        } else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) {
                struct long_ad *lad = (struct long_ad *)slot;

                lad->extLength = cpu_to_le32(elen);
                lad->extLocation = cpu_to_lelb(*eloc);
                memset(lad->impUse, 0x00, sizeof(lad->impUse));
                adsize = sizeof(struct long_ad);
        } else {
                return;
        }

        if (!epos->bh) {
                mark_inode_dirty(inode);
        } else {
                if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
                    UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) {
                        struct allocExtDesc *aed =
                                (struct allocExtDesc *)epos->bh->b_data;

                        udf_update_tag(epos->bh->b_data,
                                       le32_to_cpu(aed->lengthAllocDescs) +
                                       sizeof(struct allocExtDesc));
                }
                mark_buffer_dirty_inode(epos->bh, inode);
        }

        if (inc)
                epos->offset += adsize;
}

/*
 * A single indirect extent in a row is all that really makes sense, but up
 * to 16 consecutive ones are tolerated in case someone does something odd.
 */
#define UDF_MAX_INDIR_EXTS 16

/*
 * Return the next extent at @epos, following "next extent of allocation
 * descriptors" entries into the blocks they point to.
 *
 * Each time udf_current_aext() reports such an entry, @epos is re-pointed at
 * the start of the referenced block: the previous buffer is released and the
 * new block is read. Returns the extent type, or -1 if more than
 * UDF_MAX_INDIR_EXTS redirections were followed, a block read failed, or
 * udf_current_aext() itself returned -1.
 */
int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
                     struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
{
        unsigned int hops = 0;
        int8_t etype;

        for (;;) {
                udf_pblk_t block;

                etype = udf_current_aext(inode, epos, eloc, elen, inc);
                if (etype != (EXT_NEXT_EXTENT_ALLOCDESCS >> 30))
                        break;

                if (++hops > UDF_MAX_INDIR_EXTS) {
                        udf_err(inode->i_sb,
                                "too many indirect extents in inode %lu\n",
                                inode->i_ino);
                        return -1;
                }

                /* Continue in the block the redirect entry points at. */
                epos->block = *eloc;
                epos->offset = sizeof(struct allocExtDesc);
                brelse(epos->bh);
                block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0);
                epos->bh = sb_bread(inode->i_sb, block);
                if (!epos->bh) {
                        udf_debug("reading block %u failed!\n", block);
                        return -1;
                }
        }

        return etype;
}

/*
 * Decode the allocation descriptor at @epos without following redirections
 * to other blocks.
 *
 * With no buffer attached to @epos the descriptor is looked up in the
 * in-inode data area, otherwise in epos->bh. A zero epos->offset is first
 * replaced by the offset of the first descriptor of the respective area.
 * The lookup itself is done by udf_get_fileshortad() / udf_get_filelongad(),
 * which also receive @inc.
 *
 * On success *eloc and *elen are filled in (the length with the type bits
 * masked off) and the extent type, taken from the top two bits of the
 * recorded length, is returned. Returns -1 if no descriptor was obtained or
 * the allocation type is neither short nor long descriptors.
 */
int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
                        struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
{
        struct udf_inode_info *iinfo = UDF_I(inode);
        uint32_t raw_len;
        uint8_t *ptr;
        int alen;

        if (epos->bh) {
                struct allocExtDesc *aed;

                if (!epos->offset)
                        epos->offset = sizeof(struct allocExtDesc);
                aed = (struct allocExtDesc *)epos->bh->b_data;
                ptr = epos->bh->b_data + epos->offset;
                alen = sizeof(struct allocExtDesc) +
                        le32_to_cpu(aed->lengthAllocDescs);
        } else {
                if (!epos->offset)
                        epos->offset = udf_file_entry_alloc_offset(inode);
                ptr = iinfo->i_data + epos->offset -
                        udf_file_entry_alloc_offset(inode) +
                        iinfo->i_lenEAttr;
                alen = udf_file_entry_alloc_offset(inode) +
                        iinfo->i_lenAlloc;
        }

        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) {
                struct short_ad *sad;

                sad = udf_get_fileshortad(ptr, alen, &epos->offset, inc);
                if (!sad)
                        return -1;
                raw_len = le32_to_cpu(sad->extLength);
                eloc->logicalBlockNum = le32_to_cpu(sad->extPosition);
                /* Short descriptors carry no partition; use the inode's. */
                eloc->partitionReferenceNum =
                                iinfo->i_location.partitionReferenceNum;
        } else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) {
                struct long_ad *lad;

                lad = udf_get_filelongad(ptr, alen, &epos->offset, inc);
                if (!lad)
                        return -1;
                raw_len = le32_to_cpu(lad->extLength);
                *eloc = lelb_to_cpu(lad->extLocation);
        } else {
                udf_debug("alloc_type = %u unsupported\n", iinfo->i_alloc_type);
                return -1;
        }

        *elen = raw_len & UDF_EXTENT_LENGTH_MASK;
        return raw_len >> 30;
}

/*
 * Insert the extent (@neloc, @nelen) at @epos, pushing every following
 * extent one slot further back.
 *
 * Each existing extent is read, overwritten with the one being carried
 * forward, and then becomes the carried extent itself; the last carried
 * extent is appended with udf_add_aext(), whose result is returned. @epos is
 * a by-value copy, so an extra reference is taken on its buffer (if any) and
 * whatever buffer the copy ends up holding is released before returning.
 */
static int udf_insert_aext(struct inode *inode, struct extent_position epos,
                           struct kernel_lb_addr neloc, uint32_t nelen)
{
        struct kernel_lb_addr displaced_loc;
        uint32_t displaced_len;
        int8_t etype;
        int ret;

        if (epos.bh)
                get_bh(epos.bh);

        for (;;) {
                etype = udf_next_aext(inode, &epos, &displaced_loc,
                                      &displaced_len, 0);
                if (etype == -1)
                        break;

                udf_write_aext(inode, &epos, &neloc, nelen, 1);
                neloc = displaced_loc;
                nelen = (etype << 30) | displaced_len;
        }

        ret = udf_add_aext(inode, &epos, &neloc, nelen, 1);
        brelse(epos.bh);

        return ret;
}

int8_t udf_delete_aext(struct inode *inode, struct extent_position epos)
{
        struct extent_position oepos;
        int adsize;
        int8_t etype;
        struct allocExtDesc *aed;
        struct udf_inode_info *iinfo;
        struct kernel_lb_addr eloc;
        uint32_t elen;

        if (epos.bh) {
                get_bh(epos.bh);
                get_bh(epos.bh);
        }

        iinfo = UDF_I(inode);
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
                adsize = sizeof(struct short_ad);
        else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
                adsize = sizeof(struct long_ad);
        else
                adsize = 0;

        oepos = epos;
        if (udf_next_aext(inode, &epos, &eloc, &elen, 1) == -1)
                return -1;

        while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
                udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1);
                if (oepos.bh != epos.bh) {
                        oepos.block = epos.block;
                        brelse(oepos.bh);
                        get_bh(epos.bh);
                        oepos.bh = epos.bh;
                        oepos.offset = epos.offset - adsize;
                }
        }
        memset(&eloc, 0x00, sizeof(struct kernel_lb_addr));
        elen = 0;

        if (epos.bh != oepos.bh) {
                udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1);
                udf_write_aext(inode, &oepos, &eloc, elen, 1);
                udf_write_aext(inode, &oepos, &eloc, elen, 1);
                if (!oepos.bh) {
                        iinfo->i_lenAlloc -= (adsize * 2);
                        mark_inode_dirty(inode);
                } else {
                        aed = (struct allocExtDesc *)oepos.bh->b_data;
                        le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize));
                        if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
                            UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
                                udf_update_tag(oepos.bh->b_data,
                                                oepos.offset - (2 * adsize));
                        else
                                udf_update_tag(oepos.bh->b_data,
                                                sizeof(struct allocExtDesc));
                        mark_buffer_dirty_inode(oepos.bh, inode);
                }
        } else {
                udf_write_aext(inode, &oepos, &eloc, elen, 1);
                if (!oepos.bh) {
                        iinfo->i_lenAlloc -= adsize;
                        mark_inode_dirty(inode);
                } else {
                        aed = (struct allocExtDesc *)oepos.bh->b_data;
                        le32_add_cpu(&aed->lengthAllocDescs, -adsize);
                        if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
                            UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
                                udf_update_tag(oepos.bh->b_data,
                                                epos.offset - adsize);
                        else
                                udf_update_tag(oepos.bh->b_data,
                                                sizeof(struct allocExtDesc));
                        mark_buffer_dirty_inode(oepos.bh, inode);
                }
        }

        brelse(epos.bh);
        brelse(oepos.bh);

        return (elen >> 30);
}

/*
 * Find the extent of @inode that contains file block @block.
 *
 * The walk starts from the position supplied by udf_read_extent_cache() when
 * it reports a hit, otherwise from the first descriptor of the inode. Extents
 * are read with udf_next_aext() until the accumulated byte count exceeds the
 * byte offset of @block.
 *
 * On success the extent type is returned, *eloc / *elen describe the extent,
 * *offset is the block offset of @block within it, and the extent cache is
 * updated. If the extents run out first, -1 is returned, *offset holds the
 * number of blocks between the end of the extents and @block, and
 * i_lenExtents is set to the accumulated extent length.
 */
int8_t inode_bmap(struct inode *inode, sector_t block,
                  struct extent_position *pos, struct kernel_lb_addr *eloc,
                  uint32_t *elen, sector_t *offset)
{
        struct udf_inode_info *iinfo = UDF_I(inode);
        unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
        loff_t target = (loff_t) block << blocksize_bits;
        loff_t covered = 0;
        int8_t etype;

        if (!udf_read_extent_cache(inode, target, &covered, pos)) {
                pos->offset = 0;
                pos->block = iinfo->i_location;
                pos->bh = NULL;
        }
        *elen = 0;

        for (;;) {
                etype = udf_next_aext(inode, pos, eloc, elen, 1);
                if (etype == -1) {
                        *offset = (target - covered) >> blocksize_bits;
                        iinfo->i_lenExtents = covered;
                        return -1;
                }
                covered += *elen;
                if (covered > target)
                        break;
        }

        /* update extent cache */
        udf_update_extent_cache(inode, covered - *elen, pos);
        *offset = (target + *elen - covered) >> blocksize_bits;

        return etype;
}














































































































































































































































    1 


















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_IOMAP_H
#define LINUX_IOMAP_H 1

#include <linux/atomic.h>
#include <linux/bitmap.h>
#include <linux/blk_types.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/mm_types.h>
#include <linux/blkdev.h>

struct address_space;
struct fiemap_extent_info;
struct inode;
struct iomap_iter;
struct iomap_dio;
struct iomap_writepage_ctx;
struct iov_iter;
struct kiocb;
struct page;
struct vm_area_struct;
struct vm_fault;

/*
 * Types of block ranges for iomap mappings:
 */
#define IOMAP_HOLE        0        /* no blocks allocated, need allocation */
#define IOMAP_DELALLOC        1        /* delayed allocation blocks */
#define IOMAP_MAPPED        2        /* blocks allocated at @addr */
#define IOMAP_UNWRITTEN        3        /* blocks allocated at @addr in unwritten state */
#define IOMAP_INLINE        4        /* data inline in the inode */

/*
 * Flags reported by the file system from iomap_begin:
 *
 * IOMAP_F_NEW indicates that the blocks have been newly allocated and need
 * zeroing for areas that no data is copied to.
 *
 * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access
 * written data and requires fdatasync to commit them to persistent storage.
 * This needs to take into account metadata changes that *may* be made at IO
 * completion, such as file size updates from direct IO.
 *
 * IOMAP_F_SHARED indicates that the blocks are shared, and will need to be
 * unshared as part of a write.
 *
 * IOMAP_F_MERGED indicates that the iomap contains the merge of multiple block
 * mappings.
 *
 * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of
 * buffer heads for this mapping.
 *
 * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent
 * rather than a file data extent.
 */
#define IOMAP_F_NEW                (1U << 0)
#define IOMAP_F_DIRTY                (1U << 1)
#define IOMAP_F_SHARED                (1U << 2)
#define IOMAP_F_MERGED                (1U << 3)
#ifdef CONFIG_BUFFER_HEAD
#define IOMAP_F_BUFFER_HEAD        (1U << 4)
#else
#define IOMAP_F_BUFFER_HEAD        0
#endif /* CONFIG_BUFFER_HEAD */
#define IOMAP_F_XATTR                (1U << 5)

/*
 * Flags set by the core iomap code during operations:
 *
 * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size
 * has changed as the result of this write operation.
 *
 * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file
 * range it covers needs to be remapped by the high level before the operation
 * can proceed.
 */
#define IOMAP_F_SIZE_CHANGED        (1U << 8)
#define IOMAP_F_STALE                (1U << 9)

/*
 * Flags from 0x1000 up are for file system specific usage:
 */
#define IOMAP_F_PRIVATE                (1U << 12)


/*
 * Magic value for addr:
 */
#define IOMAP_NULL_ADDR -1ULL        /* addr is not valid */

struct iomap_folio_ops;

/*
 * Description of one contiguous mapping between a file range and its backing
 * storage.
 */
struct iomap {
        u64                        addr; /* disk offset of mapping, bytes */
        loff_t                        offset;        /* file offset of mapping, bytes */
        u64                        length;        /* length of mapping, bytes */
        u16                        type;        /* type of mapping */
        u16                        flags;        /* flags for mapping */
        struct block_device        *bdev;        /* block device for I/O */
        struct dax_device        *dax_dev; /* dax_dev for dax operations */
        /* base pointer that iomap_inline_data() offsets by (pos - offset) */
        void                        *inline_data;
        void                        *private; /* filesystem private */
        /* optional get_folio/put_folio/iomap_valid hooks for this mapping */
        const struct iomap_folio_ops *folio_ops;
        u64                        validity_cookie; /* used with .iomap_valid() */
};

/*
 * Translate file position @pos, which lies in @iomap, to a sector number:
 * the disk byte address of @pos shifted right by SECTOR_SHIFT.
 */
static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
{
        u64 disk_byte = iomap->addr + pos - iomap->offset;

        return disk_byte >> SECTOR_SHIFT;
}

/*
 * Return a pointer into the mapping's inline data that corresponds to the
 * logical file offset @pos.
 */
static inline void *iomap_inline_data(const struct iomap *iomap, loff_t pos)
{
        void *base = iomap->inline_data;

        return base + pos - iomap->offset;
}

/*
 * Tell whether the mapping's length fits into what is left of the page that
 * inline_data points into, counted from inline_data's offset within that
 * page. This guards against accessing data beyond that page.
 */
static inline bool iomap_inline_data_valid(const struct iomap *iomap)
{
        unsigned long room = PAGE_SIZE - offset_in_page(iomap->inline_data);

        return iomap->length <= room;
}

/*
 * When a filesystem sets folio_ops in an iomap mapping it returns, get_folio
 * and put_folio will be called for each folio written to.  This only applies
 * to buffered writes as unbuffered writes will not typically have folios
 * associated with them.
 *
 * When get_folio succeeds, put_folio will always be called to do any
 * cleanup work necessary.  put_folio is responsible for unlocking and putting
 * @folio.
 */
struct iomap_folio_ops {
        /*
         * Folio acquire/release pair: get_folio receives the iterator, the
         * file position and a length; put_folio receives the inode, the
         * position, the number of bytes copied and the folio.
         */
        struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos,
                        unsigned len);
        void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied,
                        struct folio *folio);

        /*
         * Check that the cached iomap still maps correctly to the filesystem's
         * internal extent map. FS internal extent maps can change while iomap
         * is iterating a cached iomap, so this hook allows iomap to detect that
         * the iomap needs to be refreshed during a long running write
         * operation.
         *
         * The filesystem can store internal state (e.g. a sequence number) in
         * iomap->validity_cookie when the iomap is first mapped to be able to
         * detect changes between mapping time and whenever .iomap_valid() is
         * called.
         *
         * This is called with the folio over the specified file position held
         * locked by the iomap code.
         */
        bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
};

/*
 * Flags for iomap_begin / iomap_end.  No flag implies a read.
 */
#define IOMAP_WRITE                (1 << 0) /* writing, must allocate blocks */
#define IOMAP_ZERO                (1 << 1) /* zeroing operation, may skip holes */
#define IOMAP_REPORT                (1 << 2) /* report extent status, e.g. FIEMAP */
#define IOMAP_FAULT                (1 << 3) /* mapping for page fault */
#define IOMAP_DIRECT                (1 << 4) /* direct I/O */
#define IOMAP_NOWAIT                (1 << 5) /* do not block */
#define IOMAP_OVERWRITE_ONLY        (1 << 6) /* only pure overwrites allowed */
#define IOMAP_UNSHARE                (1 << 7) /* unshare_file_range */
#ifdef CONFIG_FS_DAX
#define IOMAP_DAX                (1 << 8) /* DAX mapping */
#else
#define IOMAP_DAX                0
#endif /* CONFIG_FS_DAX */

/*
 * Mapping callbacks: iomap_begin produces a mapping for a file range and
 * iomap_end is called for the same range afterwards.
 */
struct iomap_ops {
        /*
         * Return the existing mapping at pos, or reserve space starting at
         * pos for up to length, as long as we can do it as a single mapping.
         * The actual length is returned in iomap->length.
         */
        int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
                        unsigned flags, struct iomap *iomap,
                        struct iomap *srcmap);

        /*
         * Commit and/or unreserve space previously allocated using
         * iomap_begin. Written indicates the length of the successful write
         * operation which needs to be committed, while the rest needs to be
         * unreserved. Written might be zero if no data was written.
         */
        int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
                        ssize_t written, unsigned flags, struct iomap *iomap);
};

/**
 * struct iomap_iter - Iterate through a range of a file
 * @inode: Set at the start of the iteration and should not change.
 * @pos: The current file position we are operating on.  It is updated by
 *        calls to iomap_iter().  Treat as read-only in the body.
 * @len: The remaining length of the file segment we're operating on.
 *        It is updated at the same time as @pos.
 * @processed: The number of bytes processed by the body in the most recent
 *        iteration, or a negative errno. 0 causes the iteration to stop.
 * @flags: Zero or more of the iomap_begin flags above.
 * @iomap: Map describing the I/O iteration
 * @srcmap: Source map for COW operations
 * @private: Opaque pointer carried along with the iteration; its users are
 *        not visible in this header (NOTE(review): confirm intended owner).
 */
struct iomap_iter {
        struct inode *inode;
        loff_t pos;
        u64 len;
        s64 processed;
        unsigned flags;
        struct iomap iomap;
        struct iomap srcmap;
        void *private;
};

int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);

/**
 * iomap_length - length of the current iomap iteration
 * @iter: iteration structure
 *
 * Returns the number of bytes the operation applies to in the current
 * iteration: the distance from @iter->pos to the end of the current mapping
 * (and of the source mapping, unless that is a hole), capped at the
 * remaining length @iter->len.
 */
static inline u64 iomap_length(const struct iomap_iter *iter)
{
        u64 map_end = iter->iomap.offset + iter->iomap.length;
        u64 remaining;

        if (iter->srcmap.type != IOMAP_HOLE) {
                u64 src_end = iter->srcmap.offset + iter->srcmap.length;

                if (src_end < map_end)
                        map_end = src_end;
        }

        remaining = map_end - iter->pos;
        return iter->len < remaining ? iter->len : remaining;
}

/**
 * iomap_iter_srcmap - return the source map for the current iomap iteration
 * @i: iteration structure
 *
 * Write operations on file systems with reflink support might require a
 * source and a destination map.  This function returns the source map
 * for a given operation, which may or may not be identical to the
 * destination map in &i->iomap: the separate source map is used unless its
 * type is IOMAP_HOLE.
 */
static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i)
{
        return i->srcmap.type != IOMAP_HOLE ? &i->srcmap : &i->iomap;
}

ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
                const struct iomap_ops *ops);
int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
                struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
                int (*punch)(struct inode *inode, loff_t pos, loff_t length));

int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
                const struct iomap_ops *ops);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
                bool *did_zero, const struct iomap_ops *ops);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
                const struct iomap_ops *ops);
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
                        const struct iomap_ops *ops);
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                u64 start, u64 len, const struct iomap_ops *ops);
loff_t iomap_seek_hole(struct inode *inode, loff_t offset,
                const struct iomap_ops *ops);
loff_t iomap_seek_data(struct inode *inode, loff_t offset,
                const struct iomap_ops *ops);
sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
                const struct iomap_ops *ops);

/*
 * Structure for writeback I/O completions.
 */
struct iomap_ioend {
        struct list_head        io_list;        /* next ioend in chain */
        /* NOTE(review): presumably one of the IOMAP_* mapping types - confirm */
        u16                        io_type;
        u16                        io_flags;        /* IOMAP_F_* */
        struct inode                *io_inode;        /* file being written to */
        size_t                        io_size;        /* size of the extent */
        loff_t                        io_offset;        /* offset in the file */
        sector_t                io_sector;        /* start sector of ioend */
        /* embedded bio; iomap_ioend_from_bio() maps it back to the ioend */
        struct bio                io_bio;                /* MUST BE LAST! */
};

/*
 * Return the iomap_ioend that embeds @bio as its io_bio member.
 */
static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio)
{
        struct iomap_ioend *ioend;

        ioend = container_of(bio, struct iomap_ioend, io_bio);
        return ioend;
}

/*
 * Filesystem callbacks for writeback: map_blocks is required, prepare_ioend
 * and discard_folio are optional.
 */
struct iomap_writeback_ops {
        /*
         * Required, maps the blocks so that writeback can be performed on
         * the range starting at offset.
         *
         * Can return arbitrarily large regions, but we need to call into it at
         * least once per folio to allow the file systems to synchronize with
         * the write path that could be invalidating mappings.
         *
         * An existing mapping from a previous call to this method can be reused
         * by the file system if it is still valid.
         */
        int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
                          loff_t offset, unsigned len);

        /*
         * Optional, allows the file systems to perform actions just before
         * submitting the bio and/or override the bio end_io handler for complex
         * operations like copy on write extent manipulation or unwritten extent
         * conversions.
         */
        int (*prepare_ioend)(struct iomap_ioend *ioend, int status);

        /*
         * Optional, allows the file system to discard state on a page where
         * we failed to submit any I/O.
         */
        void (*discard_folio)(struct folio *folio, loff_t pos);
};

/*
 * Context handed to iomap_writepages() and to the ->map_blocks callback.
 */
struct iomap_writepage_ctx {
        /* NOTE(review): presumably the mapping filled in by ->map_blocks - confirm */
        struct iomap                iomap;
        /* NOTE(review): presumably the ioend currently being built - confirm */
        struct iomap_ioend        *ioend;
        const struct iomap_writeback_ops *ops;        /* writeback callbacks */
        u32                        nr_folios;        /* folios added to the ioend */
};

void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
                struct list_head *more_ioends);
void iomap_sort_ioends(struct list_head *ioend_list);
int iomap_writepages(struct address_space *mapping,
                struct writeback_control *wbc, struct iomap_writepage_ctx *wpc,
                const struct iomap_writeback_ops *ops);

/*
 * Flags for direct I/O ->end_io:
 */
#define IOMAP_DIO_UNWRITTEN        (1 << 0)        /* covers unwritten extent(s) */
#define IOMAP_DIO_COW                (1 << 1)        /* covers COW extent(s) */

/*
 * Optional hooks and resources a filesystem passes to iomap_dio_rw() /
 * __iomap_dio_rw().
 */
struct iomap_dio_ops {
        /* @flags carries the IOMAP_DIO_UNWRITTEN / IOMAP_DIO_COW bits */
        int (*end_io)(struct kiocb *iocb, ssize_t size, int error,
                      unsigned flags);
        /* receives the iterator, the bio and the file offset of the I/O */
        void (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
                          loff_t file_offset);

        /*
         * Filesystems wishing to attach private information to a direct io bio
         * must provide a ->submit_io method that attaches the additional
         * information to the bio and changes the ->bi_end_io callback to a
         * custom function.  This function should, at a minimum, perform any
         * relevant post-processing of the bio and end with a call to
         * iomap_dio_bio_end_io.
         */
        struct bio_set *bio_set;
};

/*
 * Wait for the I/O to complete in iomap_dio_rw even if the kiocb is not
 * synchronous.
 */
#define IOMAP_DIO_FORCE_WAIT        (1 << 0)

/*
 * Do not allocate blocks or zero partial blocks, but instead fall back to
 * the caller by returning -EAGAIN.  Used to optimize direct I/O writes that
 * are not aligned to the file system block size.
  */
#define IOMAP_DIO_OVERWRITE_ONLY        (1 << 1)

/*
 * When a page fault occurs, return a partial synchronous result and allow
 * the caller to retry the rest of the operation after dealing with the page
 * fault.
 */
#define IOMAP_DIO_PARTIAL                (1 << 2)

ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
                unsigned int dio_flags, void *private, size_t done_before);
struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
                unsigned int dio_flags, void *private, size_t done_before);
ssize_t iomap_dio_complete(struct iomap_dio *dio);
void iomap_dio_bio_end_io(struct bio *bio);

#ifdef CONFIG_SWAP
struct file;
struct swap_info_struct;

int iomap_swapfile_activate(struct swap_info_struct *sis,
                struct file *swap_file, sector_t *pagespan,
                const struct iomap_ops *ops);
#else
# define iomap_swapfile_activate(sis, swapfile, pagespan, ops)        (-EIO)
#endif /* CONFIG_SWAP */

#endif /* LINUX_IOMAP_H */












































































































































































































































































































































































   13 

















    7 
   12 
















    1 
    1 

































    4 
    6 











































    4 














    3 



    4 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/kernel/capability.c
 *
 * Copyright (C) 1997  Andrew Main <zefram@fysh.org>
 *
 * Integrated into 2.1.97+,  Andrew G. Morgan <morgan@kernel.org>
 * 30 May 2002:        Cleanup, Robert M. Love <rml@tech9.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/uaccess.h>

/* Starts at 1; set to 0 by file_caps_disable() below. */
int file_caps_enabled = 1;

/*
 * Handler registered for the "no_file_caps" option via __setup().
 * Clears file_caps_enabled; @str is not examined. Always returns 1.
 */
static int __init file_caps_disable(char *str)
{
        file_caps_enabled = 0;
        return 1;
}
__setup("no_file_caps", file_caps_disable);

#ifdef CONFIG_MULTIUSER
/*
 * More recent versions of libcap are available from:
 *
 *   http://www.kernel.org/pub/linux/libs/security/linux-privs/
 */

static void warn_legacy_capability_use(void)
{
        /* Scratch buffer sized to hold the current task's command name. */
        char comm[sizeof(current->comm)];

        pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
                     get_task_comm(comm, current));
}

/*
 * Version 2 capabilities worked fine, but the linux/capability.h file
 * that accompanied their introduction encouraged their use without
 * the necessary user-space source code changes. As such, we have
 * created a version 3 with equivalent functionality to version 2, but
 * with a header change to protect legacy source code from using
 * version 2 when it wanted to use version 1. If your system has code
 * that trips the following warning, it is using version 2 specific
 * capabilities and may be doing so insecurely.
 *
 * The remedy is to either upgrade your version of libcap (to 2.10+,
 * if the application is linked against it), or recompile your
 * application with modern kernel headers and this warning will go
 * away.
 */

static void warn_deprecated_v2(void)
{
        /* Scratch buffer sized to hold the current task's command name. */
        char comm[sizeof(current->comm)];

        pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
                     get_task_comm(comm, current));
}

/*
 * Version check. Read the capability ABI version from the user header
 * and store the matching number of u32s per capability flag array in
 * *tocopy.
 *
 * Returns 0 on success, -EFAULT if the header cannot be accessed, or
 * -EINVAL for an unknown version (in which case the kernel's preferred
 * version is written back into the header first).
 */
static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
{
        __u32 magic;

        if (get_user(magic, &header->version))
                return -EFAULT;

        if (magic == _LINUX_CAPABILITY_VERSION_1) {
                warn_legacy_capability_use();
                *tocopy = _LINUX_CAPABILITY_U32S_1;
                return 0;
        }

        if (magic == _LINUX_CAPABILITY_VERSION_2 ||
            magic == _LINUX_CAPABILITY_VERSION_3) {
                /* v3 is otherwise equivalent to v2; only v2 gets the warning */
                if (magic == _LINUX_CAPABILITY_VERSION_2)
                        warn_deprecated_v2();
                *tocopy = _LINUX_CAPABILITY_U32S_3;
                return 0;
        }

        /* Unknown version: tell the caller which one we would accept. */
        if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
                return -EFAULT;
        return -EINVAL;
}

/*
 * Only the current process can change its own capabilities, so it
 * cannot be reading them here while also being in the middle of
 * setting them. Locking is therefore needed only when the caps of a
 * different process are being read.
 *
 * A @pid of 0, or the caller's own pid, selects the current task.
 * Returns the result of security_capget(), or -ESRCH when no task
 * with @pid is found.
 */
static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
                                     kernel_cap_t *pIp, kernel_cap_t *pPp)
{
        const struct task_struct *tsk;
        int err;

        if (!pid || pid == task_pid_vnr(current))
                return security_capget(current, pEp, pIp, pPp);

        rcu_read_lock();

        tsk = find_task_by_vpid(pid);
        if (tsk)
                err = security_capget(tsk, pEp, pIp, pPp);
        else
                err = -ESRCH;

        rcu_read_unlock();

        return err;
}

/**
 * sys_capget - get the capabilities of a given process.
 * @header: pointer to struct that contains capability version and
 *        target pid data
 * @dataptr: pointer to struct that receives the effective, permitted,
 *        and inheritable capabilities
 *
 * A NULL @dataptr turns the call into a version probe: an unsupported
 * version then yields 0 rather than -EINVAL (the header has been
 * updated with the kernel's version by cap_validate_magic()).
 *
 * Returns 0 on success and < 0 on error.
 */
SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
{
        struct __user_cap_data_struct kdata[2];
        kernel_cap_t pE, pI, pP;
        unsigned tocopy;
        pid_t pid;
        int ret;

        ret = cap_validate_magic(header, &tocopy);
        if (dataptr == NULL)
                return (ret == -EINVAL) ? 0 : ret;
        if (ret != 0)
                return ret;

        if (get_user(pid, &header->pid))
                return -EFAULT;

        if (pid < 0)
                return -EINVAL;

        ret = cap_get_target_pid(pid, &pE, &pI, &pP);
        if (ret)
                return ret;

        /*
         * The legacy user format exposes each 64-bit capability set as
         * two 32-bit words: low half in element 0, high half in
         * element 1.
         */
        kdata[0].effective   = pE.val;
        kdata[0].permitted   = pP.val;
        kdata[0].inheritable = pI.val;
        kdata[1].effective   = pE.val >> 32;
        kdata[1].permitted   = pP.val >> 32;
        kdata[1].inheritable = pI.val >> 32;

        /*
         * When tocopy < _KERNEL_CAPABILITY_U32S the upper capabilities
         * are silently dropped here. Older libcap implementations thus
         * implicitly drop the upper capability bits when they perform
         * a capget/modify/capset sequence.
         *
         * This is considered fail-safe behavior; upgrading the
         * application to a newer libcap gives access to the newer
         * capabilities.
         *
         * The alternative, returning an error (-ERANGE), would make
         * legacy applications fail unexpectedly: the
         * capget/modify/capset sequence would abort before any
         * modification is attempted.
         */
        if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0])))
                return -EFAULT;

        return 0;
}

/*
 * Combine the two 32-bit halves of a user-supplied capability set into
 * one kernel_cap_t, keeping only the bits in CAP_VALID_MASK.
 */
static kernel_cap_t mk_kernel_cap(u32 low, u32 high)
{
        u64 bits = ((u64)high << 32) | low;
        kernel_cap_t cap = { bits & CAP_VALID_MASK };

        return cap;
}

/**
 * sys_capset - set capabilities for the current process
 * @header: pointer to struct that contains capability version and
 *        target pid data
 * @data: pointer to struct that contains the effective, permitted,
 *        and inheritable capabilities
 *
 * Set capabilities for the current process only.  The ability to set
 * the capabilities of any other process(es) has been deprecated and
 * removed; a pid naming another task fails with -EPERM.
 *
 * The restrictions on setting capabilities are specified as:
 *
 * I: any raised capabilities must be a subset of the old permitted
 * P: any raised capabilities must be a subset of the old permitted
 * E: must be set to a subset of new permitted
 *
 * Returns 0 on success and < 0 on error.
 */
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
        /* Zeroed so words not supplied by a short (v1) copy read as 0. */
        struct __user_cap_data_struct kdata[2] = { { 0, }, };
        kernel_cap_t inheritable, permitted, effective;
        unsigned tocopy, copybytes;
        struct cred *new;
        pid_t pid;
        int ret;

        ret = cap_validate_magic(header, &tocopy);
        if (ret != 0)
                return ret;

        if (get_user(pid, &header->pid))
                return -EFAULT;

        /* may only affect current now */
        if (pid != 0 && pid != task_pid_vnr(current))
                return -EPERM;

        copybytes = tocopy * sizeof(struct __user_cap_data_struct);
        if (copybytes > sizeof(kdata))
                return -EFAULT;

        if (copy_from_user(&kdata, data, copybytes))
                return -EFAULT;

        /* Reassemble each set from its low (index 0) and high (index 1) words. */
        effective   = mk_kernel_cap(kdata[0].effective,   kdata[1].effective);
        permitted   = mk_kernel_cap(kdata[0].permitted,   kdata[1].permitted);
        inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable);

        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        ret = security_capset(new, current_cred(),
                              &effective, &inheritable, &permitted);
        if (ret < 0) {
                /* Rejected: discard the prepared credentials. */
                abort_creds(new);
                return ret;
        }

        audit_log_capset(new, current_cred());

        return commit_creds(new);
}

/**
 * has_ns_capability - Does a task have a capability in a specific user ns
 * @t: The task in question
 * @ns: target user namespace
 * @cap: The capability to be tested for
 *
 * Checks @t's credentials against @ns with security_capable() under
 * the RCU read lock.  Returns true when that check reports success
 * (0), false otherwise.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_ns_capability(struct task_struct *t,
                       struct user_namespace *ns, int cap)
{
        bool granted;

        rcu_read_lock();
        granted = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE) == 0;
        rcu_read_unlock();

        return granted;
}

/**
 * has_capability - Does a task have a capability in init_user_ns
 * @t: The task in question
 * @cap: The capability to be tested for
 *
 * Shorthand for has_ns_capability() against the initial user
 * namespace.  Returns true if the capability is in effect, false if
 * not.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_capability(struct task_struct *t, int cap)
{
        struct user_namespace *root_ns = &init_user_ns;

        return has_ns_capability(t, root_ns, cap);
}
EXPORT_SYMBOL(has_capability);

/**
 * has_ns_capability_noaudit - Does a task have a capability (unaudited)
 * in a specific user ns.
 * @t: The task in question
 * @ns: target user namespace
 * @cap: The capability to be tested for
 *
 * Same check as has_ns_capability(), but passes CAP_OPT_NOAUDIT so
 * that no audit message is written for it.  Returns true when
 * security_capable() reports success (0), false otherwise.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_ns_capability_noaudit(struct task_struct *t,
                               struct user_namespace *ns, int cap)
{
        bool granted;

        rcu_read_lock();
        granted = security_capable(__task_cred(t), ns, cap,
                                   CAP_OPT_NOAUDIT) == 0;
        rcu_read_unlock();

        return granted;
}

/**
 * has_capability_noaudit - Does a task have a capability (unaudited) in the
 * initial user ns
 * @t: The task in question
 * @cap: The capability to be tested for
 *
 * Shorthand for has_ns_capability_noaudit() against init_user_ns.
 * Returns true if the capability is in effect, false if not; no audit
 * message is written for the check.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_capability_noaudit(struct task_struct *t, int cap)
{
        struct user_namespace *root_ns = &init_user_ns;

        return has_ns_capability_noaudit(t, root_ns, cap);
}
EXPORT_SYMBOL(has_capability_noaudit);

/*
 * Shared implementation behind the ns_capable*() helpers: check @cap
 * for the current task's credentials in @ns with the given @opts.
 * An invalid @cap is treated as a kernel bug.  On success the task is
 * flagged PF_SUPERPRIV and true is returned.
 */
static bool ns_capable_common(struct user_namespace *ns,
                              int cap,
                              unsigned int opts)
{
        if (unlikely(!cap_valid(cap))) {
                pr_crit("capable() called with invalid cap=%u\n", cap);
                BUG();
        }

        if (security_capable(current_cred(), ns, cap, opts) != 0)
                return false;

        current->flags |= PF_SUPERPRIV;
        return true;
}

/**
 * ns_capable - Determine if the current task has a superior capability in effect
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Runs the common check with CAP_OPT_NONE.  Returns true if the
 * current task has the capability available for use, false if not.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool ns_capable(struct user_namespace *ns, int cap)
{
        const unsigned int opts = CAP_OPT_NONE;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable);

/**
 * ns_capable_noaudit - Determine if the current task has a superior capability
 * (unaudited) in effect
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Runs the common check with CAP_OPT_NOAUDIT.  Returns true if the
 * current task has the capability available for use, false if not.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
        const unsigned int opts = CAP_OPT_NOAUDIT;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable_noaudit);

/**
 * ns_capable_setid - Determine if the current task has a superior capability
 * in effect, while signalling that this check is being done from within a
 * setid or setgroups syscall.
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Runs the common check with CAP_OPT_INSETID.  Returns true if the
 * current task has the capability available for use, false if not.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool ns_capable_setid(struct user_namespace *ns, int cap)
{
        const unsigned int opts = CAP_OPT_INSETID;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable_setid);

/**
 * capable - Determine if the current task has a superior capability in effect
 * @cap: The capability to be tested for
 *
 * Shorthand for ns_capable() against init_user_ns.  Returns true if
 * the current task has the capability available for use, false if not.
 *
 * This sets PF_SUPERPRIV on the task if the capability is available on the
 * assumption that it's about to be used.
 */
bool capable(int cap)
{
        struct user_namespace *root_ns = &init_user_ns;

        return ns_capable(root_ns, cap);
}
EXPORT_SYMBOL(capable);
#endif /* CONFIG_MULTIUSER */

/**
 * file_ns_capable - Determine if the file's opener had a capability in effect
 * @file:  The file we want to check
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * The check is made against the credentials stored in @file->f_cred.
 * Returns true if they carry @cap in @ns, false otherwise; an invalid
 * @cap triggers a one-time warning and yields false.
 *
 * This does not set PF_SUPERPRIV because the caller may not
 * actually be privileged.
 */
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
                     int cap)
{
        if (WARN_ON_ONCE(!cap_valid(cap)))
                return false;

        return security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0;
}
EXPORT_SYMBOL(file_ns_capable);

/**
 * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
 * @ns: The user namespace in question
 * @idmap: idmap of the mount @inode was found from
 * @inode: The inode in question
 *
 * Return true if the inode uid and gid are within the namespace.
 */
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
                                 struct mnt_idmap *idmap,
                                 const struct inode *inode)
{
        return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) &&
               vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));
}

/**
 * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
 * @idmap: idmap of the mount @inode was found from
 * @inode: The inode in question
 * @cap: The capability in question
 *
 * Return true if the current task has the given capability targeted at
 * its own user namespace and that the given inode's uid and gid are
 * mapped into the current user namespace.
 */
bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
                              const struct inode *inode, int cap)
{
        struct user_namespace *ns = current_user_ns();

        return ns_capable(ns, cap) &&
               privileged_wrt_inode_uidgid(ns, idmap, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);

/**
 * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
 * @tsk: The task that may be ptraced
 * @ns: The user namespace to search for CAP_SYS_PTRACE in
 *
 * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE
 * in the specified user namespace.
 */
bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
{
        int ret = 0;  /* An absent tracer adds no restrictions */
        const struct cred *cred;

        rcu_read_lock();
        cred = rcu_dereference(tsk->ptracer_cred);
        if (cred)
                ret = security_capable(cred, ns, CAP_SYS_PTRACE,
                                       CAP_OPT_NOAUDIT);
        rcu_read_unlock();
        return (ret == 0);
}










































   51 










    1 























   51 

















    1 






































   16 
   51 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X86_IRQFLAGS_H_
#define _X86_IRQFLAGS_H_

#include <asm/processor-flags.h>

#ifndef __ASSEMBLY__

#include <asm/nospec-branch.h>

/*
 * Interrupt control:
 */

/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
extern inline unsigned long native_save_fl(void);
/*
 * Read the CPU flags register: "pushf" pushes it onto the stack and
 * "pop" stores it into a local, which is returned to the caller.
 */
extern __always_inline unsigned long native_save_fl(void)
{
        unsigned long flags;

        /*
         * "=rm" is safe here, because "pop" adjusts the stack before
         * it evaluates its effective address -- this is part of the
         * documented behavior of the "pop" instruction.
         */
        asm volatile("# __raw_save_flags\n\t"
                     "pushf ; pop %0"
                     : "=rm" (flags)
                     : /* no input */
                     : "memory");

        return flags;
}

/*
 * Execute the x86 "cli" instruction. The "memory" clobber keeps the
 * compiler from moving memory accesses across it.
 */
static __always_inline void native_irq_disable(void)
{
        asm volatile("cli": : :"memory");
}

/*
 * Execute the x86 "sti" instruction. The "memory" clobber keeps the
 * compiler from moving memory accesses across it.
 */
static __always_inline void native_irq_enable(void)
{
        asm volatile("sti": : :"memory");
}

/*
 * Call mds_idle_clear_cpu_buffers(), then issue "sti; hlt" as one asm
 * statement so the two instructions are emitted back to back.
 */
static __always_inline void native_safe_halt(void)
{
        mds_idle_clear_cpu_buffers();
        asm volatile("sti; hlt": : :"memory");
}

/*
 * Call mds_idle_clear_cpu_buffers(), then execute "hlt". Unlike
 * native_safe_halt(), no "sti" is issued first.
 */
static __always_inline void native_halt(void)
{
        mds_idle_clear_cpu_buffers();
        asm volatile("hlt": : :"memory");
}

#endif

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#ifndef __ASSEMBLY__
#include <linux/types.h>

/* Non-paravirt build: return the flags register via native_save_fl(). */
static __always_inline unsigned long arch_local_save_flags(void)
{
        return native_save_fl();
}

/* Non-paravirt build: forwards directly to native_irq_disable(). */
static __always_inline void arch_local_irq_disable(void)
{
        native_irq_disable();
}

/* Non-paravirt build: forwards directly to native_irq_enable(). */
static __always_inline void arch_local_irq_enable(void)
{
        native_irq_enable();
}

/*
 * Used in the idle loop; sti takes one instruction cycle
 * to complete:
 *
 * Non-paravirt build: forwards directly to native_safe_halt().
 */
static __always_inline void arch_safe_halt(void)
{
        native_safe_halt();
}

/*
 * Used when interrupts are already enabled or to
 * shutdown the processor:
 *
 * Non-paravirt build: forwards directly to native_halt().
 */
static __always_inline void halt(void)
{
        native_halt();
}

/*
 * For spinlocks, etc:
 *
 * Capture the current flags, then disable interrupts. The returned
 * value is the flags as they were before the disable, suitable for
 * passing to arch_local_irq_restore().
 */
static __always_inline unsigned long arch_local_irq_save(void)
{
        /* Must be read before disabling so the prior state is preserved. */
        unsigned long flags = arch_local_save_flags();
        arch_local_irq_disable();
        return flags;
}
#else

#ifdef CONFIG_X86_64
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS                pushfq; popq %rax
#endif

#endif

#endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT_XXL */

#ifndef __ASSEMBLY__
/*
 * Return 1 when the X86_EFLAGS_IF bit is clear in @flags (interrupts
 * disabled), 0 when it is set.
 */
static __always_inline int arch_irqs_disabled_flags(unsigned long flags)
{
        return (flags & X86_EFLAGS_IF) == 0;
}

/* Sample the live flags and report whether interrupts are disabled. */
static __always_inline int arch_irqs_disabled(void)
{
        return arch_irqs_disabled_flags(arch_local_save_flags());
}

/*
 * Re-enable interrupts if the saved @flags show they were enabled.
 * When @flags show them disabled, nothing is done.
 */
static __always_inline void arch_local_irq_restore(unsigned long flags)
{
        if (arch_irqs_disabled_flags(flags))
                return;

        arch_local_irq_enable();
}
#endif /* !__ASSEMBLY__ */

#endif














































































































    1 




    1 

































































    1 
    1 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
// SPDX-License-Identifier: GPL-2.0
/*
 *  Floating proportions with flexible aging period
 *
 *   Copyright (C) 2011, SUSE, Jan Kara <jack@suse.cz>
 *
 * The goal of this code is: Given different types of event, measure proportion
 * of each type of event over time. The proportions are measured with
 * exponentially decaying history to give smooth transitions. A formula
 * expressing proportion of event of type 'j' is:
 *
 *   p_{j} = (\Sum_{i>=0} x_{i,j}/2^{i+1})/(\Sum_{i>=0} x_i/2^{i+1})
 *
 * Where x_{i,j} is j's number of events in i-th last time period and x_i is
 * total number of events in i-th last time period.
 *
 * Note that p_{j}'s are normalised, i.e.
 *
 *   \Sum_{j} p_{j} = 1,
 *
 * This formula can be straightforwardly computed by maintaining denominator
 * (let's call it 'd') and for each event type its numerator (let's call it
 * 'n_j'). When an event of type 'j' happens, we simply need to do:
 *   n_j++; d++;
 *
 * When a new period is declared, we could do:
 *   d /= 2
 *   for each j
 *     n_j /= 2
 *
 * To avoid iteration over all event types, we instead shift numerator of event
 * j lazily when someone asks for a proportion of event j or when event j
 * occurs. This can be trivially implemented by remembering the last period in
 * which something happened with proportion of type j.
 */
#include <linux/flex_proportions.h>

/*
 * Initialise global proportion state: period 0, an event counter
 * seeded with 1, and the sequence counter.  Returns 0 on success or
 * the error from percpu_counter_init().
 */
int fprop_global_init(struct fprop_global *p, gfp_t gfp)
{
        int ret;

        p->period = 0;
        /* Seed with 1 so that a period never has 0 events. */
        ret = percpu_counter_init(&p->events, 1, gfp);
        if (!ret) {
                seqcount_init(&p->sequence);
        }
        return ret;
}

/* Release the percpu event counter set up by fprop_global_init(). */
void fprop_global_destroy(struct fprop_global *p)
{
        percpu_counter_destroy(&p->events);
}

/*
 * Declare @periods new periods. It is up to the caller to make sure
 * period transitions cannot happen in parallel.
 *
 * Returns true if the proportions are still defined and false if
 * aging has zeroed out all events (the summed counter is <= 1, in
 * which case nothing is changed). Callers can use this to detect
 * whether declaring further periods has any effect.
 */
bool fprop_new_period(struct fprop_global *p, int periods)
{
        s64 total = percpu_counter_sum(&p->events);
        s64 drop;

        /* Nothing to age when there are no events. */
        if (total <= 1)
                return false;

        preempt_disable_nested();
        write_seqcount_begin(&p->sequence);
        /* Keep total >> periods; for periods >= 64 drop everything summed. */
        drop = total;
        if (periods < 64)
                drop -= total >> periods;
        /* Subtract rather than set, so events arriving since the sum survive. */
        percpu_counter_add(&p->events, -drop);
        p->period += periods;
        write_seqcount_end(&p->sequence);
        preempt_enable_nested();

        return true;
}

/*
 * ---- PERCPU ----
 */
#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))

/*
 * Initialise a per-type proportion: event counter at 0, period 0 and
 * the lock.  Returns 0 on success or the error from
 * percpu_counter_init(), in which case period and lock are untouched.
 */
int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp)
{
        int ret = percpu_counter_init(&pl->events, 0, gfp);

        if (ret)
                return ret;

        raw_spin_lock_init(&pl->lock);
        pl->period = 0;
        return 0;
}

/* Release the percpu event counter set up by fprop_local_init_percpu(). */
void fprop_local_destroy_percpu(struct fprop_local_percpu *pl)
{
        percpu_counter_destroy(&pl->events);
}

/*
 * Bring @pl up to date with the global period of @p: halve its event
 * count once for every period elapsed since @pl was last updated,
 * then record the new period.  No-op when @pl is already current.
 */
static void fprop_reflect_period_percpu(struct fprop_global *p,
                                        struct fprop_local_percpu *pl)
{
        unsigned int period = p->period;
        unsigned long flags;

        /* Fast path: nothing to do when the period is unchanged. */
        if (pl->period == period)
                return;

        raw_spin_lock_irqsave(&pl->lock, flags);

        /* Another updater may have caught up while we waited for the lock. */
        if (pl->period >= period)
                goto out_unlock;

        if (period - pl->period >= BITS_PER_LONG) {
                /* That many halvings leave nothing behind. */
                percpu_counter_set(&pl->events, 0);
        } else {
                s64 val = percpu_counter_read(&pl->events);

                /* For small values fall back to the full sum. */
                if (val < (nr_cpu_ids * PROP_BATCH))
                        val = percpu_counter_sum(&pl->events);

                percpu_counter_add_batch(&pl->events,
                        -val + (val >> (period-pl->period)), PROP_BATCH);
        }
        pl->period = period;

out_unlock:
        raw_spin_unlock_irqrestore(&pl->lock, flags);
}

/*
 * Event of type pl happened
 *
 * Ages @pl to the current period first, then adds @nr to both the
 * per-type counter and the global counter of @p.
 */
void __fprop_add_percpu(struct fprop_global *p, struct fprop_local_percpu *pl,
                long nr)
{
        /* Age before counting, so the new events are not halved with the old. */
        fprop_reflect_period_percpu(p, pl);
        percpu_counter_add_batch(&pl->events, nr, PROP_BATCH);
        percpu_counter_add(&p->events, nr);
}

/*
 * Report the current proportion of @pl as *numerator / *denominator.
 * Both counters are sampled inside a seqcount read section of @p and
 * the sampling is retried if a writer interfered.  The result always
 * satisfies numerator <= denominator and denominator > 0.
 */
void fprop_fraction_percpu(struct fprop_global *p,
                           struct fprop_local_percpu *pl,
                           unsigned long *numerator, unsigned long *denominator)
{
        unsigned int seq;
        s64 num, den;

        do {
                seq = read_seqcount_begin(&p->sequence);
                fprop_reflect_period_percpu(p, pl);
                num = percpu_counter_read_positive(&pl->events);
                den = percpu_counter_read_positive(&p->events);
        } while (read_seqcount_retry(&p->sequence, seq));

        /*
         * The percpu counter reads can be off; clamp so that the
         * fraction is <= 1 and the denominator is > 0.
         */
        if (den <= num)
                den = num ? num : 1;

        *numerator = num;
        *denominator = den;
}

/*
 * Like __fprop_add_percpu() except that event is counted only if the given
 * type has fraction smaller than @max_frac/FPROP_FRAC_BASE
 *
 * When @max_frac is at least FPROP_FRAC_BASE no limit applies and all
 * @nr events are added. Otherwise nothing is added if the limit is
 * already exceeded, and @nr is reduced when adding all of it would
 * overshoot the limit.
 */
void __fprop_add_percpu_max(struct fprop_global *p,
                struct fprop_local_percpu *pl, int max_frac, long nr)
{
        if (unlikely(max_frac < FPROP_FRAC_BASE)) {
                unsigned long numerator, denominator;
                s64 tmp;

                fprop_fraction_percpu(p, pl, &numerator, &denominator);
                /* Adding 'nr' to fraction exceeds max_frac/FPROP_FRAC_BASE? */
                /* tmp = denominator * max_frac - numerator * 2^FPROP_FRAC_SHIFT */
                tmp = (u64)denominator * max_frac -
                                        ((u64)numerator << FPROP_FRAC_SHIFT);
                if (tmp < 0) {
                        /* Maximum fraction already exceeded? */
                        return;
                } else if (tmp < nr * (FPROP_FRAC_BASE - max_frac)) {
                        /* Add just enough for the fraction to saturate */
                        /* Rounded-up division of tmp by (FPROP_FRAC_BASE - max_frac). */
                        nr = div_u64(tmp + FPROP_FRAC_BASE - max_frac - 1,
                                        FPROP_FRAC_BASE - max_frac);
                }
        }

        __fprop_add_percpu(p, pl, nr);
}







































    1 










    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
// SPDX-License-Identifier: GPL-2.0-or-later
/* 
 * CRC32C
 *@Article{castagnoli-crc,
 * author =       { Guy Castagnoli and Stefan Braeuer and Martin Herrman},
 * title =        {{Optimization of Cyclic Redundancy-Check Codes with 24
 *                 and 32 Parity Bits}},
 * journal =      IEEE Transactions on Communication,
 * year =         {1993},
 * volume =       {41},
 * number =       {6},
 * pages =        {},
 * month =        {June},
 *}
 * Used by the iSCSI driver, possibly others, and derived from
 * the iscsi-crc.c module of the linux-iscsi driver at
 * http://linux-iscsi.sourceforge.net.
 *
 * Following the example of lib/crc32, this function is intended to be
 * flexible and useful for all users.  Modules that currently have their
 * own crc32c, but hopefully may be able to use this one are:
 *  net/sctp (please add all your doco to here if you change to
 *            use this one!)
 *  <endoflist>
 *
 * Copyright (c) 2004 Cisco Systems, Inc.
 */

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/crc32c.h>

static struct crypto_shash *tfm;

/*
 * Continue a CRC32C computation: seed an on-stack shash descriptor for
 * the module-wide "crc32c" transform with @crc, feed it @length bytes
 * from @address and return the updated value.  A failing update is
 * treated as a kernel bug.
 */
u32 crc32c(u32 crc, const void *address, unsigned int length)
{
        SHASH_DESC_ON_STACK(shash, tfm);
        u32 *state = (u32 *)shash_desc_ctx(shash);
        u32 digest;
        int err;

        shash->tfm = tfm;
        *state = crc;

        err = crypto_shash_update(shash, address, length);
        BUG_ON(err);

        digest = *state;
        /* NOTE(review): presumably stops the compiler from eliding accesses to the on-stack context - confirm */
        barrier_data(state);
        return digest;
}

EXPORT_SYMBOL(crc32c);

/*
 * Module init: allocate the "crc32c" shash transform used by crc32c().
 * Returns 0 on success or the error encoded in the returned pointer.
 */
static int __init libcrc32c_mod_init(void)
{
        tfm = crypto_alloc_shash("crc32c", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        return 0;
}

/* Module exit: free the shash transform allocated in libcrc32c_mod_init(). */
static void __exit libcrc32c_mod_fini(void)
{
        crypto_free_shash(tfm);
}

module_init(libcrc32c_mod_init);
module_exit(libcrc32c_mod_fini);

MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>");
MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");



















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_SPECIAL_INSNS_H
#define _ASM_X86_SPECIAL_INSNS_H

#ifdef __KERNEL__
#include <asm/nops.h>
#include <asm/processor-flags.h>

#include <linux/errno.h>
#include <linux/irqflags.h>
#include <linux/jump_label.h>

/*
 * The compiler should not reorder volatile asm statements with respect to each
 * other: they should execute in program order. However GCC 4.9.x and 5.x have
 * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder
 * volatile asm. The write functions are not affected since they have memory
 * clobbers preventing reordering. To prevent reads from being reordered with
 * respect to writes, use a dummy memory operand.
 */

#define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL)

void native_write_cr0(unsigned long val);

/*
 * Read and return the current value of CR0.
 *
 * The __FORCE_ORDER dummy memory input keeps the compiler from reordering
 * this read with respect to the CR write helpers, which carry "memory"
 * clobbers.
 */
static inline unsigned long native_read_cr0(void)
{
        unsigned long val;
        asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER);
        return val;
}

/*
 * Read and return the current value of CR2.
 *
 * Uses the __FORCE_ORDER dummy memory input so the read is ordered
 * against the "memory"-clobbering CR write helpers.
 */
static __always_inline unsigned long native_read_cr2(void)
{
        unsigned long val;
        asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER);
        return val;
}

/*
 * Load @val into CR2. The "memory" clobber acts as a compiler barrier,
 * so surrounding memory accesses are not moved across the write.
 */
static __always_inline void native_write_cr2(unsigned long val)
{
        asm volatile("mov %0,%%cr2": : "r" (val) : "memory");
}

/*
 * Read and return the raw value of CR3 (the whole register, not just an
 * address), ordered against CR writes via the __FORCE_ORDER dummy input.
 */
static inline unsigned long __native_read_cr3(void)
{
        unsigned long val;
        asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER);
        return val;
}

/*
 * Load @val into CR3. The "memory" clobber acts as a compiler barrier,
 * so surrounding memory accesses are not moved across the write.
 */
static inline void native_write_cr3(unsigned long val)
{
        asm volatile("mov %0,%%cr3": : "r" (val) : "memory");
}

/*
 * Read and return the current value of CR4.
 *
 * On 32-bit kernels the read is covered by an exception-table entry and
 * the output register is pre-loaded with 0, so a CPU without CR4 yields 0
 * instead of faulting.
 */
static inline unsigned long native_read_cr4(void)
{
        unsigned long val;
#ifdef CONFIG_X86_32
        /*
         * This could fault if CR4 does not exist.  Non-existent CR4
         * is functionally equivalent to CR4 == 0.  Keep it simple and pretend
         * that CR4 == 0 on CPUs that don't have CR4.
         *
         * The "0" (0) input ties a zero to the output operand; if the mov
         * at label 1 faults, the fixup resumes at label 2 with that zero
         * still in place.
         */
        asm volatile("1: mov %%cr4, %0\n"
                     "2:\n"
                     _ASM_EXTABLE(1b, 2b)
                     : "=r" (val) : "0" (0), __FORCE_ORDER);
#else
        /* CR4 always exists on x86_64. */
        asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER);
#endif
        return val;
}

void native_write_cr4(unsigned long val);

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
/* Read and return the PKRU register using the hand-encoded RDPKRU opcode. */
static inline u32 rdpkru(void)
{
        u32 ecx = 0;
        u32 edx, pkru;

        /*
         * "rdpkru" instruction.  Places PKRU contents into EAX,
         * clears EDX and requires that ecx=0.
         *
         * EDX is listed as an output only so the compiler knows it is
         * overwritten; its value is not used.
         */
        asm volatile(".byte 0x0f,0x01,0xee\n\t"
                     : "=a" (pkru), "=d" (edx)
                     : "c" (ecx));
        return pkru;
}

/* Write @pkru to the PKRU register using the hand-encoded WRPKRU opcode. */
static inline void wrpkru(u32 pkru)
{
        u32 ecx = 0, edx = 0;

        /*
         * "wrpkru" instruction.  Loads contents in EAX to PKRU,
         * requires that ecx = edx = 0.
         */
        asm volatile(".byte 0x0f,0x01,0xef\n\t"
                     : : "a" (pkru), "c"(ecx), "d"(edx));
}

#else
/* Protection keys not configured: PKRU always reads as 0. */
static inline u32 rdpkru(void)
{
        return 0;
}

/* Protection keys not configured: writes to PKRU are ignored. */
static inline void wrpkru(u32 pkru)
{
}
#endif

/*
 * Execute the WBINVD instruction. The "memory" clobber keeps the compiler
 * from moving memory accesses across it.
 */
static __always_inline void native_wbinvd(void)
{
        asm volatile("wbinvd": : :"memory");
}

/*
 * Return the current CR4 value. Defined outside the CONFIG_PARAVIRT_XXL
 * conditional below, so it always forwards to native_read_cr4().
 */
static inline unsigned long __read_cr4(void)
{
        return native_read_cr4();
}

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else

/*
 * Non-paravirt build: each accessor below forwards directly to its
 * native_*() implementation.
 */

/* Return the current CR0 value. */
static inline unsigned long read_cr0(void)
{
        return native_read_cr0();
}

/* Write @x to CR0. */
static inline void write_cr0(unsigned long x)
{
        native_write_cr0(x);
}

/* Return the current CR2 value. */
static __always_inline unsigned long read_cr2(void)
{
        return native_read_cr2();
}

/* Write @x to CR2. */
static __always_inline void write_cr2(unsigned long x)
{
        native_write_cr2(x);
}

/*
 * Careful!  CR3 contains more than just an address.  You probably want
 * read_cr3_pa() instead.
 */
static inline unsigned long __read_cr3(void)
{
        return __native_read_cr3();
}

/* Write @x to CR3. */
static inline void write_cr3(unsigned long x)
{
        native_write_cr3(x);
}

/* Write @x to CR4. */
static inline void __write_cr4(unsigned long x)
{
        native_write_cr4(x);
}

/* Execute WBINVD. */
static __always_inline void wbinvd(void)
{
        native_wbinvd();
}

#endif /* CONFIG_PARAVIRT_XXL */

/*
 * Execute CLFLUSH on the address @__p.
 *
 * The byte at @__p is passed as a "+m" (read-write memory) operand so the
 * compiler treats the instruction as accessing that location.
 */
static __always_inline void clflush(volatile void *__p)
{
        asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
}

/*
 * Flush the address @__p, using the 0x66-prefixed encoding when the CPU
 * has X86_FEATURE_CLFLUSHOPT and a 0x3e-prefixed CLFLUSH otherwise.
 *
 * NOTE(review): the 0x3e prefix on the default variant presumably exists
 * only to keep both alternatives the same length - confirm.
 */
static inline void clflushopt(volatile void *__p)
{
        alternative_io(".byte 0x3e; clflush %0",
                       ".byte 0x66; clflush %0",
                       X86_FEATURE_CLFLUSHOPT,
                       "+m" (*(volatile char __force *)__p));
}

/*
 * Write back the data at @__p using the best instruction available.
 *
 * Three alternatives are provided: a 0x3e-prefixed CLFLUSH by default,
 * the CLFLUSHOPT encoding when X86_FEATURE_CLFLUSHOPT is set, and the
 * CLWB encoding when X86_FEATURE_CLWB is set.
 */
static inline void clwb(volatile void *__p)
{
        /* View the target as a 64-byte object for the "+m" operand below. */
        volatile struct { char x[64]; } *p = __p;

        /*
         * The pointer is also passed in the "a" register: CLWB is emitted
         * as raw bytes whose operand is fixed to (%rax), so the address
         * must be in that register.
         */
        asm volatile(ALTERNATIVE_2(
                ".byte 0x3e; clflush (%[pax])",
                ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
                X86_FEATURE_CLFLUSHOPT,
                ".byte 0x66, 0x0f, 0xae, 0x30",  /* clwb (%%rax) */
                X86_FEATURE_CLWB)
                : [p] "+m" (*p)
                : [pax] "a" (p));
}

#ifdef CONFIG_X86_USER_SHADOW_STACK
/*
 * Write the 64-bit value @val to the user address @addr with WRUSSQ.
 *
 * Returns 0 on success. If the instruction faults, the exception-table
 * entry redirects execution to the "fail" label and -EFAULT is returned.
 */
static inline int write_user_shstk_64(u64 __user *addr, u64 val)
{
        asm goto("1: wrussq %[val], %[addr]\n"
                          _ASM_EXTABLE(1b, %l[fail])
                          :: [addr] "m" (*addr), [val] "r" (val)
                          :: fail);
        return 0;
fail:
        return -EFAULT;
}
#endif /* CONFIG_X86_USER_SHADOW_STACK */

#define nop() asm volatile ("nop")

/*
 * Execute the SERIALIZE instruction, emitted as raw bytes so that older
 * assemblers can build it. The "memory" clobber makes it a compiler
 * barrier as well.
 */
static inline void serialize(void)
{
        /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */
        asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory");
}

/*
 * Move 64 bytes from @src to @dst with the hand-encoded MOVDIR64B
 * instruction.
 *
 * The dst parameter must be 64-byte aligned.
 */
static inline void movdir64b(void *dst, const void *src)
{
        /* Typed 64-byte views so the memory operands carry the right size. */
        const struct { char _[64]; } *__src = src;
        struct { char _[64]; } *__dst = dst;

        /*
         * MOVDIR64B %(rdx), rax.
         *
         * Both __src and __dst must be memory constraints in order to tell the
         * compiler that no other memory accesses should be reordered around
         * this one.
         *
         * Also, both must be supplied as lvalues because this tells
         * the compiler what the object is (its size) the instruction accesses.
         * I.e., not the pointers but what they point to, thus the deref'ing '*'.
         *
         * The pointers themselves are additionally passed in RAX (dst) and
         * RDX (src) because the raw opcode bytes hard-code those registers.
         */
        asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
                     : "+m" (*__dst)
                     :  "m" (*__src), "a" (__dst), "d" (__src));
}

/*
 * movdir64b() for an __iomem destination: strips the __iomem annotation
 * with a __force cast and forwards to movdir64b().
 */
static inline void movdir64b_io(void __iomem *dst, const void *src)
{
        movdir64b((void __force *)dst, src);
}

/**
 * enqcmds - Enqueue a command in supervisor (CPL0) mode
 * @dst: destination, in MMIO space (must be 512-bit aligned)
 * @src: 512 bits memory operand
 *
 * The ENQCMDS instruction allows software to write a 512-bit command to
 * a 512-bit-aligned special MMIO region that supports the instruction.
 * A return status is loaded into the ZF flag in the RFLAGS register.
 * ZF = 0 equates to success, and ZF = 1 indicates retry or error.
 *
 * This function issues the ENQCMDS instruction to submit data from
 * kernel space to MMIO space, in a unit of 512 bits. Order of data access
 * is not guaranteed, nor is a memory barrier performed afterwards. It
 * returns 0 on success and -EAGAIN on failure.
 *
 * Warning: Do not use this helper unless your driver has checked that the
 * ENQCMDS instruction is supported on the platform and the device accepts
 * ENQCMDS.
 */
static inline int enqcmds(void __iomem *dst, const void *src)
{
        /* Typed 64-byte (512-bit) views so the memory operands carry the size. */
        const struct { char _[64]; } *__src = src;
        struct { char _[64]; } __iomem *__dst = dst;
        bool zf;

        /*
         * ENQCMDS %(rdx), rax
         *
         * See movdir64b()'s comment on operand specification.
         *
         * CC_SET(z)/CC_OUT(z) capture the resulting ZF into zf.
         * NOTE(review): the trailing 0x66, 0x90 bytes are not part of the
         * ENQCMDS encoding described above; their purpose is not evident
         * from this file - confirm before changing.
         */
        asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90"
                     CC_SET(z)
                     : CC_OUT(z) (zf), "+m" (*__dst)
                     : "m" (*__src), "a" (__dst), "d" (__src));

        /* Submission failure is indicated via EFLAGS.ZF=1 */
        if (zf)
                return -EAGAIN;

        return 0;
}

/*
 * Execute the TILERELEASE instruction, emitted as raw bytes so that
 * assemblers without support for the mnemonic can still build it.
 */
static __always_inline void tile_release(void)
{
        /*
         * Instruction opcode for TILERELEASE; supported in binutils
         * version >= 2.36.
         */
        asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0");
}

#endif /* __KERNEL__ */

#endif /* _ASM_X86_SPECIAL_INSNS_H */
























































































































































































































































































   26 



   24 












































































































































































































































































































































































































































































































































































































































































































































































































































































   16 















































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/timer.h>
#include <linux/acpi_pmtmr.h>
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/clocksource.h>
#include <linux/percpu.h>
#include <linux/timex.h>
#include <linux/static_key.h>
#include <linux/static_call.h>

#include <asm/hpet.h>
#include <asm/timer.h>
#include <asm/vgtod.h>
#include <asm/time.h>
#include <asm/delay.h>
#include <asm/hypervisor.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
#include <asm/geode.h>
#include <asm/apic.h>
#include <asm/cpu_device_id.h>
#include <asm/i8259.h>
#include <asm/uv/uv.h>

unsigned int __read_mostly cpu_khz;        /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);

unsigned int __read_mostly tsc_khz;
EXPORT_SYMBOL(tsc_khz);

#define KHZ        1000

/*
 * TSC can be unstable due to cpufreq or due to unsynced TSCs
 */
static int __read_mostly tsc_unstable;
static unsigned int __initdata tsc_early_khz;

static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc);

int tsc_clocksource_reliable;

static int __read_mostly tsc_force_recalibrate;

static u32 art_to_tsc_numerator;
static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
static bool have_art;

/*
 * Per-CPU cycles-to-nanoseconds conversion state: two copies of the
 * conversion parameters plus a latch sequence counter. Readers select
 * data[seq & 1] (see __cyc2ns_read()).
 */
struct cyc2ns {
        struct cyc2ns_data data[2];        /*  0 + 2*16 = 32 */
        seqcount_latch_t   seq;                /* 32 + 4    = 36 */

}; /* fits one cacheline */

static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);

/*
 * Handler for the "tsc_early_khz" early parameter: parse @buf as an
 * unsigned integer (base auto-detected) into tsc_early_khz and hand the
 * parser's status back to the caller.
 */
static int __init tsc_early_khz_setup(char *buf)
{
        int status = kstrtouint(buf, 0, &tsc_early_khz);

        return status;
}
early_param("tsc_early_khz", tsc_early_khz_setup);

/*
 * Copy this CPU's current cycles->ns conversion parameters into @data.
 *
 * Lockless read: sample the latch sequence, read the slot selected by its
 * low bit, and retry if the sequence changed while the fields were copied.
 * Uses this_cpu accessors, so the caller is presumably expected to stay on
 * one CPU for the duration (cyc2ns_read_begin() disables preemption first).
 */
__always_inline void __cyc2ns_read(struct cyc2ns_data *data)
{
        int seq, idx;

        do {
                seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
                /* The low bit of the sequence picks which copy to read. */
                idx = seq & 1;

                data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
                data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
                data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);

        } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
}

/*
 * Disable preemption and fill @data with this CPU's conversion
 * parameters. Preemption stays disabled on return; pair every call with
 * cyc2ns_read_end().
 */
__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
{
        preempt_disable_notrace();
        __cyc2ns_read(data);
}

/* Re-enable preemption; the counterpart of cyc2ns_read_begin(). */
__always_inline void cyc2ns_read_end(void)
{
        preempt_enable_notrace();
}

/*
 * Accelerators for sched_clock()
 * convert from cycles(64bits) => nanoseconds (64bits)
 *  basic equation:
 *              ns = cycles / (freq / ns_per_sec)
 *              ns = cycles * (ns_per_sec / freq)
 *              ns = cycles * (10^9 / (cpu_khz * 10^3))
 *              ns = cycles * (10^6 / cpu_khz)
 *
 *      Then we use scaling math (suggested by george@mvista.com) to get:
 *              ns = cycles * (10^6 * SC / cpu_khz) / SC
 *              ns = cycles * cyc2ns_scale / SC
 *
 *      And since SC is a constant power of two, we can convert the div
 *  into a shift. The larger SC is, the more accurate the conversion, but
 *  cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
 *  (64-bit result) can be used.
 *
 *  We can use khz divisor instead of mhz to keep a better precision.
 *  (mathieu.desnoyers@polymtl.ca)
 *
 *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
 */

static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
        struct cyc2ns_data data;
        unsigned long long ns;

        __cyc2ns_read(&data);

        ns = data.cyc2ns_offset;
        ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);

        return ns;
}

/*
 * Convert the cycle count @cyc to nanoseconds, with preemption disabled
 * around the per-CPU parameter read and conversion.
 */
static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
        unsigned long long ns;
        preempt_disable_notrace();
        ns = __cycles_2_ns(cyc);
        preempt_enable_notrace();
        return ns;
}

/*
 * Install a new cycles->ns conversion for @cpu.
 *
 * @khz:     new frequency in kHz used to derive the multiplier and shift
 * @cpu:     CPU whose per-CPU cyc2ns state is updated
 * @tsc_now: cycle count at which the old and new conversions must agree
 */
static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
        unsigned long long ns_now;
        struct cyc2ns_data data;
        struct cyc2ns *c2n;

        /* Nanosecond value of tsc_now under the conversion currently in use. */
        ns_now = cycles_2_ns(tsc_now);

        /*
         * Compute a new multiplier as per the above comment and ensure our
         * time function is continuous; see the comment near struct
         * cyc2ns_data.
         */
        clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
                               NSEC_PER_MSEC, 0);

        /*
         * cyc2ns_shift is exported via arch_perf_update_userpage() where it is
         * not expected to be greater than 31 due to the original published
         * conversion algorithm shifting a 32-bit value (now specifies a 64-bit
         * value) - refer perf_event_mmap_page documentation in perf_event.h.
         */
        if (data.cyc2ns_shift == 32) {
                data.cyc2ns_shift = 31;
                data.cyc2ns_mul >>= 1;
        }

        /*
         * Pick the offset so that the new parameters map tsc_now to ns_now,
         * i.e. the clock does not jump at the switch-over.
         */
        data.cyc2ns_offset = ns_now -
                mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);

        c2n = per_cpu_ptr(&cyc2ns, cpu);

        /*
         * Publish through the latch: each latch write is followed by an
         * update of one copy. Presumably each raw_write_seqcount_latch()
         * steers readers to the copy that is not being written - confirm
         * against the seqcount_latch documentation.
         */
        raw_write_seqcount_latch(&c2n->seq);
        c2n->data[0] = data;
        raw_write_seqcount_latch(&c2n->seq);
        c2n->data[1] = data;
}

/*
 * Update the cycles->ns conversion of @cpu to @khz, anchored at @tsc_now.
 *
 * Runs with local interrupts disabled and brackets the update with
 * sched_clock_idle_sleep_event() / sched_clock_idle_wakeup_event().
 * A @khz of 0 skips the update, but the two sched_clock events still run.
 */
static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
        unsigned long flags;

        local_irq_save(flags);
        sched_clock_idle_sleep_event();

        if (khz)
                __set_cyc2ns_scale(khz, cpu, tsc_now);

        sched_clock_idle_wakeup_event();
        local_irq_restore(flags);
}

/*
 * Initialize cyc2ns for boot cpu
 *
 * Initializes the calling CPU's latch sequence counter, then installs a
 * conversion based on tsc_khz anchored at the current TSC value.
 */
static void __init cyc2ns_init_boot_cpu(void)
{
        struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);

        seqcount_latch_init(&c2n->seq);
        __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc());
}

/*
 * Secondary CPUs do not run through tsc_init(), so set up
 * all the scale factors for all CPUs, assuming the same
 * speed as the bootup CPU.
 *
 * Every possible CPU other than the calling one gets its own latch
 * sequence counter initialized and both conversion slots copied from
 * the calling CPU's slots.
 */
static void __init cyc2ns_init_secondary_cpus(void)
{
        unsigned int cpu, this_cpu = smp_processor_id();
        struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
        struct cyc2ns_data *data = c2n->data;

        for_each_possible_cpu(cpu) {
                if (cpu == this_cpu)
                        continue;

                /*
                 * Look up the target CPU's instance before touching its
                 * sequence counter. Initializing first would act on the
                 * instance left over from the previous iteration (the
                 * calling CPU's on the first pass) and would never
                 * initialize the last CPU's counter.
                 */
                c2n = per_cpu_ptr(&cyc2ns, cpu);
                seqcount_latch_init(&c2n->seq);
                c2n->data[0] = data[0];
                c2n->data[1] = data[1];
        }
}

/*
 * Scheduler clock - returns current time in nanosec units.
 *
 * When the __use_tsc static key is enabled, the current TSC value is
 * converted with this CPU's cyc2ns parameters; otherwise the value is
 * derived from jiffies_64.
 */
noinstr u64 native_sched_clock(void)
{
        if (static_branch_likely(&__use_tsc)) {
                u64 tsc_now = rdtsc();

                /* return the value in ns */
                return __cycles_2_ns(tsc_now);
        }

        /*
         * Fall back to jiffies if there's no TSC available:
         * ( But note that we still use it if the TSC is marked
         *   unstable. We do this because unlike Time Of Day,
         *   the scheduler clock tolerates small errors and it's
         *   very important for it to be as fast as the platform
         *   can achieve it. )
         */

        /* No locking but a rare wrong value is not a big deal: */
        return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
}

/*
 * Translate an already-sampled TSC value into sched_clock nanoseconds,
 * using the calling CPU's conversion parameters.
 */
u64 native_sched_clock_from_tsc(u64 tsc)
{
        u64 ns = cycles_2_ns(tsc);

        return ns;
}

/*
 * We need to define a real function for sched_clock, to override the
 * weak default version.
 */
#ifdef CONFIG_PARAVIRT
/* Paravirt build: dispatch to whichever sched clock paravirt has installed. */
noinstr u64 sched_clock_noinstr(void)
{
        return paravirt_sched_clock();
}

/* True when the pv_sched_clock static call still targets native_sched_clock(). */
bool using_native_sched_clock(void)
{
        return static_call_query(pv_sched_clock) == native_sched_clock;
}
#else
/* Non-paravirt build: sched_clock_noinstr() is simply an alias of native_sched_clock(). */
u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));

/* Non-paravirt build: the native sched clock is the only option. */
bool using_native_sched_clock(void) { return true; }
#endif

/*
 * Return the scheduler clock in nanoseconds. Wraps sched_clock_noinstr()
 * with preemption disabled, since the underlying conversion reads per-CPU
 * state.
 */
notrace u64 sched_clock(void)
{
        u64 now;
        preempt_disable_notrace();
        now = sched_clock_noinstr();
        preempt_enable_notrace();
        return now;
}

/* Return the tsc_unstable flag (non-zero once the TSC has been marked unstable). */
int check_tsc_unstable(void)
{
        return tsc_unstable;
}
EXPORT_SYMBOL_GPL(check_tsc_unstable);

#ifdef CONFIG_X86_TSC
/*
 * "notsc" boot parameter with CONFIG_X86_TSC set: the TSC is only marked
 * unstable rather than having its CPU feature flag cleared. Returns 1 to
 * report the option as handled.
 */
int __init notsc_setup(char *str)
{
        mark_tsc_unstable("boot parameter notsc");
        return 1;
}
#else
/*
 * disable flag for tsc. Takes effect by clearing the TSC cpu flag
 * in cpu/common.c
 */
int __init notsc_setup(char *str)
{
        setup_clear_cpu_cap(X86_FEATURE_TSC);
        return 1;
}
#endif

__setup("notsc", notsc_setup);

static int no_sched_irq_time;
static int no_tsc_watchdog;
static int tsc_as_watchdog;

/*
 * Parse the value of the "tsc=" boot parameter.
 *
 * Recognized values: "reliable", "noirqtime" (prefix match), "unstable",
 * "nowatchdog", "recalibrate" and "watchdog". "nowatchdog" always wins
 * over "watchdog", whichever order they are given in; the conflict is
 * reported with pr_alert(). Always returns 1.
 */
static int __init tsc_setup(char *str)
{
        if (strcmp(str, "reliable") == 0)
                tsc_clocksource_reliable = 1;

        if (strncmp(str, "noirqtime", 9) == 0)
                no_sched_irq_time = 1;

        if (strcmp(str, "unstable") == 0)
                mark_tsc_unstable("boot parameter");

        if (strcmp(str, "nowatchdog") == 0) {
                no_tsc_watchdog = 1;
                /* An earlier tsc=watchdog is revoked, with a notice. */
                if (tsc_as_watchdog)
                        pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n",
                                 __func__);
                tsc_as_watchdog = 0;
        }

        if (strcmp(str, "recalibrate") == 0)
                tsc_force_recalibrate = 1;

        if (strcmp(str, "watchdog") == 0) {
                /* Only honoured when tsc=nowatchdog was not seen before. */
                if (!no_tsc_watchdog)
                        tsc_as_watchdog = 1;
                else
                        pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n",
                                 __func__);
        }

        return 1;
}

__setup("tsc=", tsc_setup);

#define MAX_RETRIES                5
#define TSC_DEFAULT_THRESHOLD        0x20000

/*
 * Read TSC and the reference counters. Take care of any disturbances
 *
 * @p:    receives the reference counter value (HPET counter or ACPI PM timer)
 * @hpet: non-zero to read the HPET, zero to read the ACPI PM timer
 *
 * The reference read is bracketed by two TSC reads. It is accepted only
 * if the two TSC reads are less than a threshold apart; otherwise the
 * read is retried, up to MAX_RETRIES times.
 *
 * Returns the second TSC value on success, ULLONG_MAX if every attempt
 * was disturbed.
 */
static u64 tsc_read_refs(u64 *p, int hpet)
{
        u64 t1, t2;
        /* Scale the threshold with the TSC frequency once it is known. */
        u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD;
        int i;

        for (i = 0; i < MAX_RETRIES; i++) {
                t1 = get_cycles();
                if (hpet)
                        *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
                else
                        *p = acpi_pm_read_early();
                t2 = get_cycles();
                if ((t2 - t1) < thresh)
                        return t2;
        }
        return ULLONG_MAX;
}

/*
 * Derive the TSC frequency from an HPET reference interval.
 *
 * @deltatsc: TSC delta over the interval (scaling is up to the caller)
 * @hpet1:    HPET counter value at the start of the interval
 * @hpet2:    HPET counter value at the end of the interval
 *
 * Returns @deltatsc divided by the elapsed HPET time, i.e. the tick count
 * multiplied by the HPET_PERIOD register and divided by 10^6.
 * NOTE(review): HPET_PERIOD is presumably femtoseconds per tick, which
 * would make the divisor nanoseconds - confirm.
 */
static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
{
        u64 ticks, elapsed;

        /* The counter is treated as 32 bits wide: undo a single wrap. */
        if (hpet2 >= hpet1)
                ticks = hpet2 - hpet1;
        else
                ticks = hpet2 + 0x100000000ULL - hpet1;

        elapsed = ticks * hpet_readl(HPET_PERIOD);
        do_div(elapsed, 1000000);

        return (unsigned long) div64_u64(deltatsc, elapsed);
}

/*
 * Calculate the TSC frequency from PMTimer reference
 *
 * @deltatsc: TSC delta over the interval (scaling is up to the caller)
 * @pm1:      ACPI PM timer value at the start of the interval
 * @pm2:      ACPI PM timer value at the end of the interval
 *
 * Returns @deltatsc divided by the elapsed PM timer time in nanoseconds,
 * or ULONG_MAX when no usable reference interval is available (both
 * readings zero, or a zero-length interval).
 */
static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
{
        u64 tmp;

        if (!pm1 && !pm2)
                return ULONG_MAX;

        /* Undo a single wrap of the PM timer. */
        if (pm2 < pm1)
                pm2 += (u64)ACPI_PM_OVRRUN;
        pm2 -= pm1;

        /* Convert PM timer ticks to nanoseconds. */
        tmp = pm2 * 1000000000LL;
        do_div(tmp, PMTMR_TICKS_PER_SEC);

        /* A zero-length interval cannot be used as a divisor. */
        if (!tmp)
                return ULONG_MAX;

        /*
         * tmp is a u64 that is not guaranteed to fit in 32 bits, so use a
         * full 64-by-64 division (as calc_hpet_ref() does) instead of
         * do_div(), which only honours a 32-bit divisor.
         */
        deltatsc = div64_u64(deltatsc, tmp);

        return (unsigned long) deltatsc;
}

#define CAL_MS                10
#define CAL_LATCH        (PIT_TICK_RATE / (1000 / CAL_MS))
#define CAL_PIT_LOOPS        1000

#define CAL2_MS                50
#define CAL2_LATCH        (PIT_TICK_RATE / (1000 / CAL2_MS))
#define CAL2_PIT_LOOPS        5000


/*
 * Try to calibrate the TSC against the Programmable
 * Interrupt Timer and return the frequency of the TSC
 * in kHz.
 *
 * @latch:   16-bit start value programmed into PIT channel 2
 * @ms:      duration in milliseconds that @latch corresponds to
 * @loopmin: minimum number of PIT status reads required for the
 *           measurement to be trusted
 *
 * Return ULONG_MAX on failure to calibrate.
 */
static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
{
        u64 tsc, t1, t2, delta;
        unsigned long tscmin, tscmax;
        int pitcnt;

        if (!has_legacy_pic()) {
                /*
                 * Relies on tsc_early_delay_calibrate() to have given us semi
                 * usable udelay(), wait for the same 50ms we would have with
                 * the PIT loop below.
                 */
                udelay(10 * USEC_PER_MSEC);
                udelay(10 * USEC_PER_MSEC);
                udelay(10 * USEC_PER_MSEC);
                udelay(10 * USEC_PER_MSEC);
                udelay(10 * USEC_PER_MSEC);
                return ULONG_MAX;
        }

        /* Set the Gate high, disable speaker */
        outb((inb(0x61) & ~0x02) | 0x01, 0x61);

        /*
         * Setup CTC channel 2 for mode 0, (interrupt on terminal
         * count mode), binary count. Set the latch register to the
         * caller-supplied value (LSB then MSB) to begin countdown.
         */
        outb(0xb0, 0x43);
        outb(latch & 0xff, 0x42);
        outb(latch >> 8, 0x42);

        tsc = t1 = t2 = get_cycles();

        pitcnt = 0;
        tscmax = 0;
        tscmin = ULONG_MAX;
        /* Poll until bit 5 of port 0x61 signals that the countdown ended. */
        while ((inb(0x61) & 0x20) == 0) {
                t2 = get_cycles();
                delta = t2 - tsc;
                tsc = t2;
                /*
                 * Store with the same width as the comparison. Narrowing
                 * to unsigned int here would truncate a very large delta
                 * on 64-bit and let it slip past the sanity check below.
                 */
                if ((unsigned long) delta < tscmin)
                        tscmin = (unsigned long) delta;
                if ((unsigned long) delta > tscmax)
                        tscmax = (unsigned long) delta;
                pitcnt++;
        }

        /*
         * Sanity checks:
         *
         * If we were not able to read the PIT more than loopmin
         * times, then we have been hit by a massive SMI
         *
         * If the maximum is 10 times larger than the minimum,
         * then we got hit by an SMI as well.
         */
        if (pitcnt < loopmin || tscmax > 10 * tscmin)
                return ULONG_MAX;

        /* Calculate the PIT value */
        delta = t2 - t1;
        do_div(delta, ms);
        return delta;
}

/*
 * This reads the current MSB of the PIT counter, and
 * checks if we are running on sufficiently fast and
 * non-virtualized hardware.
 *
 * Our expectations are:
 *
 *  - the PIT is running at roughly 1.19MHz
 *
 *  - each IO is going to take about 1us on real hardware,
 *    but we allow it to be much faster (by a factor of 10) or
 *    _slightly_ slower (ie we allow up to a 2us read+counter
 *    update) - anything else implies an unacceptably slow CPU
 *    or PIT for the fast calibration to work.
 *
 *  - with 256 PIT ticks to read the value, we have 214us to
 *    see the same MSB (and overhead like doing a single TSC
 *    read per MSB value etc).
 *
 *  - We're doing 2 reads per loop (LSB, MSB), and we expect
 *    them each to take about a microsecond on real hardware.
 *    So we expect a count value of around 100. But we'll be
 *    generous, and accept anything over 50.
 *
 *  - if the PIT is stuck, and we see *many* more reads, we
 *    return early (and the next caller of pit_expect_msb()
 *    will then consider it a failure when it doesn't see the
 *    next expected value).
 *
 * These expectations mean that we know that we have seen the
 * transition from one expected value to another with a fairly
 * high accuracy, and we didn't miss any events. We can thus
 * use the TSC value at the transitions to calculate a pretty
 * good value for the TSC frequency.
 */
/*
 * Read the 16-bit counter from PIT port 0x42 (two byte reads, LSB first)
 * and return non-zero if its MSB equals @val.
 */
static inline int pit_verify_msb(unsigned char val)
{
        /* Ignore LSB */
        inb(0x42);
        return inb(0x42) == val;
}

/*
 * Poll the PIT for as long as its MSB reads back as @val (at most
 * 50000 polls), taking a TSC sample after every matching poll.
 *
 * On return *tscp holds the last TSC sample taken while the MSB still
 * matched (0 if it never matched), and *deltap holds the distance from
 * the sample before that one to a fresh TSC read done after the loop;
 * callers use it as the error term of the observed transition.
 *
 * Returns non-zero when more than 5 polls matched.
 */
static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
{
        u64 last = 0, before_last = 0;
        int polls = 0;

        while (polls < 50000 && pit_verify_msb(val)) {
                before_last = last;
                last = get_cycles();
                polls++;
        }

        *deltap = get_cycles() - before_last;
        *tscp = last;

        /*
         * Only a minimal number of successful polls is demanded here;
         * the real quality check is done by the caller on the TSC
         * error terms.
         */
        return polls > 5;
}

/*
 * How many MSB values do we want to see? We aim for
 * a maximum error rate of 500ppm (in practice the
 * real error is much smaller), but refuse to spend
 * more than 50ms on it.
 *
 * MAX_QUICK_PIT_MS is that time budget in milliseconds.
 * MAX_QUICK_PIT_ITERATIONS converts it into a number of MSB
 * steps: PIT ticks within the budget, divided by the 256 ticks
 * that make up one MSB step.
 */
#define MAX_QUICK_PIT_MS 50
#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)

/*
 * Fast TSC calibration against PIT counter 2.
 *
 * Programs the counter to count down from 0xffff and watches the MSB
 * of the count step down, recording the TSC around each step. As soon
 * as the accumulated TSC delta is large compared to the measurement
 * error (d1 + d2), the frequency is computed from the number of MSB
 * steps seen.
 *
 * Returns the computed frequency in kHz, or 0 when there is no legacy
 * PIC or the calibration did not reach the required accuracy.
 *
 * The caller visible in this file, native_calibrate_cpu_early(),
 * invokes this with local interrupts disabled.
 */
static unsigned long quick_pit_calibrate(void)
{
        int i;
        u64 tsc, delta;
        unsigned long d1, d2;

        if (!has_legacy_pic())
                return 0;

        /* Set the Gate high, disable speaker */
        outb((inb(0x61) & ~0x02) | 0x01, 0x61);

        /*
         * Counter 2, mode 0 (one-shot), binary count
         *
         * NOTE! Mode 2 decrements by two (and then the
         * output is flipped each time, giving the same
         * final output frequency as a decrement-by-one),
         * so mode 0 is much better when looking at the
         * individual counts.
         */
        outb(0xb0, 0x43);

        /* Start at 0xffff */
        outb(0xff, 0x42);
        outb(0xff, 0x42);

        /*
         * The PIT starts counting at the next edge, so we
         * need to delay for a microsecond. The easiest way
         * to do that is to just read back the 16-bit counter
         * once from the PIT.
         */
        pit_verify_msb(0);

        /*
         * 'tsc' and 'd1' describe the starting point (MSB == 0xff);
         * every further iteration waits for the next lower MSB value
         * and yields the end point in 'delta' with error term 'd2'.
         */
        if (pit_expect_msb(0xff, &tsc, &d1)) {
                for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
                        if (!pit_expect_msb(0xff-i, &delta, &d2))
                                break;

                        /* Turn the end timestamp into elapsed TSC cycles */
                        delta -= tsc;

                        /*
                         * Extrapolate the error and fail fast if the error will
                         * never be below 500 ppm.
                         */
                        if (i == 1 &&
                            d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
                                return 0;

                        /*
                         * Iterate until the error is less than 500 ppm
                         */
                        if (d1+d2 >= delta >> 11)
                                continue;

                        /*
                         * Check the PIT one more time to verify that
                         * all TSC reads were stable wrt the PIT.
                         *
                         * This also guarantees serialization of the
                         * last cycle read ('d2') in pit_expect_msb.
                         */
                        if (!pit_verify_msb(0xfe - i))
                                break;
                        goto success;
                }
        }
        pr_info("Fast TSC calibration failed\n");
        return 0;

success:
        /*
         * Ok, if we get here, then we've seen the
         * MSB of the PIT decrement 'i' times, and the
         * error has shrunk to less than 500 ppm.
         *
         * As a result, we can depend on there not being
         * any odd delays anywhere, and the TSC reads are
         * reliable (within the error).
         *
         * kHz = ticks / time-in-seconds / 1000;
         * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
         * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
         */
        delta *= PIT_TICK_RATE;
        do_div(delta, i*256*1000);
        pr_info("Fast TSC calibration using PIT\n");
        return delta;
}

/**
 * native_calibrate_tsc - determine TSC frequency
 *
 * Derive the TSC frequency from the crystal clock and the TSC/crystal
 * ratio enumerated by CPUID leaf 0x15 on Intel CPUs.
 *
 * Return: the crystal frequency (in kHz) scaled by the enumerated
 * ratio, or 0 if the frequency cannot be determined via CPUID.
 */
unsigned long native_calibrate_tsc(void)
{
        unsigned int eax_denominator = 0, ebx_numerator = 0, ecx_hz = 0, edx = 0;
        unsigned int crystal_khz;

        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
            boot_cpu_data.cpuid_level < 0x15)
                return 0;

        /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
        cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);

        /* Without a complete ratio there is nothing to compute */
        if (!ebx_numerator || !eax_denominator)
                return 0;

        crystal_khz = ecx_hz / 1000;

        /*
         * Denverton SoCs report no crystal clock and have no CPUID.0x16
         * to fall back on either, so their 25MHz crystal is hardcoded.
         */
        if (!crystal_khz && boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
                crystal_khz = 25000;

        if (crystal_khz) {
                /*
                 * A crystal frequency obtained this way is "hardware
                 * reported" and the most accurate value available, so
                 * it is treated as a known frequency.
                 */
                setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
        } else if (boot_cpu_data.cpuid_level >= 0x16) {
                /*
                 * Some Intel SoCs like Skylake and Kabylake don't report
                 * the crystal clock; reconstruct it from the base CPU
                 * speed and the crystal ratio instead.
                 */
                unsigned int base_mhz, ebx16, ecx16, edx16;

                cpuid(0x16, &base_mhz, &ebx16, &ecx16, &edx16);
                crystal_khz = base_mhz * 1000 *
                        eax_denominator / ebx_numerator;
        }

        if (!crystal_khz)
                return 0;

        /*
         * For Atom SoCs TSC is the only reliable clocksource.
         * Mark TSC reliable so no watchdog on it.
         */
        if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT)
                setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);

#ifdef CONFIG_X86_LOCAL_APIC
        /*
         * The local APIC appears to be fed by the core crystal clock,
         * so lapic_timer_period can be set right here, which avoids
         * having to calibrate the APIC timer later.
         */
        lapic_timer_period = crystal_khz * 1000 / HZ;
#endif

        return crystal_khz * ebx_numerator / eax_denominator;
}

/*
 * Report the CPU base frequency from CPUID leaf 0x16.
 *
 * Returns EAX of that leaf multiplied by 1000 (MHz -> kHz), or 0 when
 * the boot CPU is not an Intel part or does not implement the leaf.
 */
static unsigned long cpu_khz_from_cpuid(void)
{
        unsigned int base_mhz = 0, max_mhz = 0, bus_mhz = 0, reserved = 0;

        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
            boot_cpu_data.cpuid_level < 0x16)
                return 0;

        /* Only the base frequency is consumed; the rest is ignored */
        cpuid(0x16, &base_mhz, &max_mhz, &bus_mhz, &reserved);

        return base_mhz * 1000;
}

/*
 * calibrate cpu using pit, hpet, and ptimer methods. They are available
 * later in boot after acpi is initialized.
 */
static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
{
        u64 tsc1, tsc2, delta, ref1, ref2;
        unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
        unsigned long flags, latch, ms;
        int hpet = is_hpet_enabled(), i, loopmin;

        /*
         * Run up to 3 calibration loops to get the lowest frequency
         * value (the best estimate). We use two different calibration
         * modes here:
         *
         * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
         * load a timeout of 50ms. We read the time right after we
         * started the timer and wait until the PIT count down reaches
         * zero. In each wait loop iteration we read the TSC and check
         * the delta to the previous read. We keep track of the min
         * and max values of that delta. The delta is mostly defined
         * by the IO time of the PIT access, so we can detect when
         * any disturbance happened between the two reads. If the
         * maximum time is significantly larger than the minimum time,
         * then we discard the result and have another try.
         *
         * 2) Reference counter. If available we use the HPET or the
         * PMTIMER as a reference to check the sanity of that value.
         * We use separate TSC readouts and check inside of the
         * reference read for any possible disturbance. We discard
         * disturbed values here as well. We do that around the PIT
         * calibration delay loop as we have to wait for a certain
         * amount of time anyway.
         *
         * The return value is the selected frequency estimate, or 0
         * when neither the PIT nor the reference produced a usable
         * result.
         */

        /* Preset PIT loop values */
        latch = CAL_LATCH;
        ms = CAL_MS;
        loopmin = CAL_PIT_LOOPS;

        for (i = 0; i < 3; i++) {
                unsigned long tsc_pit_khz;

                /*
                 * Read the start value and the reference count of
                 * hpet/pmtimer when available. Then do the PIT
                 * calibration, which will take at least 50ms, and
                 * read the end value.
                 */
                local_irq_save(flags);
                tsc1 = tsc_read_refs(&ref1, hpet);
                tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
                tsc2 = tsc_read_refs(&ref2, hpet);
                local_irq_restore(flags);

                /* Pick the lowest PIT TSC calibration so far */
                tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);

                /* hpet or pmtimer available ? */
                if (ref1 == ref2)
                        continue;

                /* Check, whether the sampling was disturbed */
                if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
                        continue;

                /*
                 * From here on tsc2 is reused: first for the scaled TSC
                 * delta, then for the frequency computed against the
                 * reference counter.
                 */
                tsc2 = (tsc2 - tsc1) * 1000000LL;
                if (hpet)
                        tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
                else
                        tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);

                tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);

                /* Check the reference deviation */
                delta = ((u64) tsc_pit_min) * 100;
                do_div(delta, tsc_ref_min);

                /*
                 * If both calibration results are inside a 10% window
                 * then we can be sure, that the calibration
                 * succeeded. We break out of the loop right away. We
                 * use the reference value, as it is more precise.
                 */
                if (delta >= 90 && delta <= 110) {
                        pr_info("PIT calibration matches %s. %d loops\n",
                                hpet ? "HPET" : "PMTIMER", i + 1);
                        return tsc_ref_min;
                }

                /*
                 * Check whether PIT failed more than once. This
                 * happens in virtualized environments. We need to
                 * give the virtual PC a slightly longer timeframe for
                 * the HPET/PMTIMER to make the result precise.
                 */
                if (i == 1 && tsc_pit_min == ULONG_MAX) {
                        latch = CAL2_LATCH;
                        ms = CAL2_MS;
                        loopmin = CAL2_PIT_LOOPS;
                }
        }

        /*
         * Now check the results.
         */
        if (tsc_pit_min == ULONG_MAX) {
                /* PIT gave no useful value */
                pr_warn("Unable to calibrate against PIT\n");

                /* We don't have an alternative source, disable TSC */
                if (!hpet && !ref1 && !ref2) {
                        pr_notice("No reference (HPET/PMTIMER) available\n");
                        return 0;
                }

                /* The alternative source failed as well, disable TSC */
                if (tsc_ref_min == ULONG_MAX) {
                        pr_warn("HPET/PMTIMER calibration failed\n");
                        return 0;
                }

                /* Use the alternative source */
                pr_info("using %s reference calibration\n",
                        hpet ? "HPET" : "PMTIMER");

                return tsc_ref_min;
        }

        /* We don't have an alternative source, use the PIT calibration value */
        if (!hpet && !ref1 && !ref2) {
                pr_info("Using PIT calibration value\n");
                return tsc_pit_min;
        }

        /* The alternative source failed, use the PIT calibration value */
        if (tsc_ref_min == ULONG_MAX) {
                pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
                return tsc_pit_min;
        }

        /*
         * The calibration values differ too much. In doubt, we use
         * the PIT value as we know that there are PMTIMERs around
         * running at double speed. At least we let the user know:
         */
        pr_warn("PIT calibration deviates from %s: %lu %lu\n",
                hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
        pr_info("Using PIT calibration value\n");
        return tsc_pit_min;
}

/**
 * native_calibrate_cpu_early - can calibrate the cpu early in boot
 *
 * Tries, in this order, CPUID, the MSR based method and the quick PIT
 * calibration, and stops at the first one that yields a non-zero value.
 *
 * Return: the value of the first successful method, or 0 if all failed.
 */
unsigned long native_calibrate_cpu_early(void)
{
        unsigned long khz, flags;

        khz = cpu_khz_from_cpuid();
        if (khz)
                return khz;

        khz = cpu_khz_from_msr();
        if (khz)
                return khz;

        /* The quick PIT calibration runs with local interrupts off */
        local_irq_save(flags);
        khz = quick_pit_calibrate();
        local_irq_restore(flags);

        return khz;
}


/**
 * native_calibrate_cpu - calibrate the cpu
 *
 * Return: the result of native_calibrate_cpu_early() if it is non-zero,
 * otherwise whatever pit_hpet_ptimer_calibrate_cpu() produces.
 */
static unsigned long native_calibrate_cpu(void)
{
        unsigned long early_khz = native_calibrate_cpu_early();

        /* Fall back to the slower calibration only when needed */
        return early_khz ? early_khz : pit_hpet_ptimer_calibrate_cpu();
}

void recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
        unsigned long cpu_khz_old = cpu_khz;

        if (!boot_cpu_has(X86_FEATURE_TSC))
                return;

        cpu_khz = x86_platform.calibrate_cpu();
        tsc_khz = x86_platform.calibrate_tsc();
        if (tsc_khz == 0)
                tsc_khz = cpu_khz;
        else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
                cpu_khz = tsc_khz;
        cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy,
                                                    cpu_khz_old, cpu_khz);
#endif
}
EXPORT_SYMBOL_GPL(recalibrate_cpu_khz);


/* sched_clock() value captured by tsc_save_sched_clock_state() */
static unsigned long long cyc2ns_suspend;

/*
 * Remember the current sched_clock() reading in cyc2ns_suspend.
 * Does nothing unless the sched clock is stable.
 */
void tsc_save_sched_clock_state(void)
{
        if (sched_clock_stable())
                cyc2ns_suspend = sched_clock();
}

/*
 * Even on processors with invariant TSC, the TSC gets reset in some of the
 * ACPI system sleep states. And on some systems the BIOS seems to reinit the
 * TSC to an arbitrary value (still sync'd across cpu's) during resume from
 * such sleep states. To cope with this, recompute the cyc2ns_offset for each
 * cpu so that sched_clock() continues from the point where it was left off
 * during suspend.
 */
void tsc_restore_sched_clock_state(void)
{
        unsigned long long resume_offset;
        unsigned long irqflags;
        int i;

        if (!sched_clock_stable())
                return;

        local_irq_save(irqflags);

        /*
         * Coming out of suspend there is no concurrency yet, so the
         * RCU niceties are skipped and both data fields are simply
         * written directly.
         *
         * Step 1: clear this CPU's offsets so that the sched_clock()
         * call below is not affected by the old offset.
         */
        this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
        this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);

        /* Step 2: distance between the saved and the current reading */
        resume_offset = cyc2ns_suspend - sched_clock();

        /* Step 3: install that distance as the offset on every CPU */
        for_each_possible_cpu(i) {
                per_cpu(cyc2ns.data[0].cyc2ns_offset, i) = resume_offset;
                per_cpu(cyc2ns.data[1].cyc2ns_offset, i) = resume_offset;
        }

        local_irq_restore(irqflags);
}

#ifdef CONFIG_CPU_FREQ
/*
 * Frequency scaling support. Adjust the TSC based timer when the CPU frequency
 * changes.
 *
 * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC
 * as unstable and give up in those cases.
 *
 * Should fix up last_tsc too. Currently gettimeofday in the
 * first tick after the change will be slightly wrong.
 */

/*
 * Reference values latched by time_cpufreq_notifier() the first time it
 * runs (while ref_freq is still 0); all later scaling is done relative
 * to them.
 */
/* freq->old seen at that first notification */
static unsigned int  ref_freq;
/* boot_cpu_data.loops_per_jiffy at that time */
static unsigned long loops_per_jiffy_ref;
/* tsc_khz at that time */
static unsigned long tsc_khz_ref;

static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                                void *data)
{
        struct cpufreq_freqs *freq = data;

        if (num_online_cpus() > 1) {
                mark_tsc_unstable("cpufreq changes on SMP");
                return 0;
        }

        if (!ref_freq) {
                ref_freq = freq->old;
                loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
                tsc_khz_ref = tsc_khz;
        }

        if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
            (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
                boot_cpu_data.loops_per_jiffy =
                        cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);

                tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
                if (!(freq->flags & CPUFREQ_CONST_LOOPS))
                        mark_tsc_unstable("cpufreq changes");

                set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc());
        }

        return 0;
}

/* Notifier block registered by cpufreq_register_tsc_scaling() */
static struct notifier_block time_cpufreq_notifier_block = {
        .notifier_call = time_cpufreq_notifier,
};

/*
 * Hook time_cpufreq_notifier() into the cpufreq transition chain, but
 * only on CPUs that have a TSC which is not flagged as constant.
 * Always returns 0.
 */
static int __init cpufreq_register_tsc_scaling(void)
{
        if (boot_cpu_has(X86_FEATURE_TSC) &&
            !boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                cpufreq_register_notifier(&time_cpufreq_notifier_block,
                                          CPUFREQ_TRANSITION_NOTIFIER);

        return 0;
}

core_initcall(cpufreq_register_tsc_scaling);

#endif /* CONFIG_CPU_FREQ */

/* CPUID leaf queried by detect_art() for the ART/TSC ratio */
#define ART_CPUID_LEAF (0x15)
/* Smallest ratio denominator that detect_art() accepts */
#define ART_MIN_DENOMINATOR (1)


/*
 * If ART is present detect the numerator:denominator to convert to TSC
 */
static void __init detect_art(void)
{
        unsigned int ecx_unused, edx_unused;

        if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
                return;

        /*
         * ART is not enabled in a VM, it requires a non-stop TSC as
         * well as TSC_ADJUST, and the TSC counter must not be subject
         * to asynchronous resets.
         */
        if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
                return;
        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                return;
        if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
                return;
        if (tsc_async_resets)
                return;

        /* EAX/EBX carry the ratio, the other two registers are ignored */
        cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
              &art_to_tsc_numerator, &ecx_unused, &edx_unused);

        if (art_to_tsc_denominator >= ART_MIN_DENOMINATOR) {
                rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset);

                /* Make this sticky over multiple CPU init calls */
                setup_force_cpu_cap(X86_FEATURE_ART);
        }
}


/* clocksource code */

/*
 * Clocksource ->resume callback shared by both TSC clocksources in this
 * file: re-runs the TSC_ADJUST verification. @cs is not used.
 */
static void tsc_resume(struct clocksource *cs)
{
        tsc_verify_tsc_adjust(true);
}

/*
 * We used to compare the TSC to the cycle_last value in the clocksource
 * structure to avoid a nasty time-warp. This can be observed in a
 * very small window right after one CPU updated cycle_last under
 * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
 * is smaller than the cycle_last reference value due to a TSC which
 * is slightly behind. This delta is nowhere else observable, but in
 * that case it results in a forward time jump in the range of hours
 * due to the unsigned delta calculation of the time keeping core
 * code, which is necessary to support wrapping clocksources like pm
 * timer.
 *
 * This sanity check is now done in the core timekeeping code.
 * checking the result of read_tsc() - cycle_last for being negative.
 * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
 */
static u64 read_tsc(struct clocksource *cs)
{
        /* @cs is unused; the value comes straight from rdtsc_ordered() */
        return (u64)rdtsc_ordered();
}

/*
 * Clocksource ->mark_unstable callback: flag the TSC as unstable once
 * and log it. Calls made after the flag is already set do nothing.
 * @cs is not used.
 */
static void tsc_cs_mark_unstable(struct clocksource *cs)
{
        if (!tsc_unstable) {
                tsc_unstable = 1;

                if (using_native_sched_clock())
                        clear_sched_clock_stable();

                disable_sched_clock_irqtime();
                pr_info("Marking TSC unstable due to clocksource watchdog\n");
        }
}

/*
 * Clocksource ->tick_stable callback: forward the event to
 * sched_clock_tick_stable(), but only while the TSC is not marked
 * unstable and the native sched clock is in use. @cs is not used.
 */
static void tsc_cs_tick_stable(struct clocksource *cs)
{
        if (!tsc_unstable && using_native_sched_clock())
                sched_clock_tick_stable();
}

/*
 * Clocksource ->enable callback: record that the TSC vDSO clock mode
 * is in use. @cs is not used. Always returns 0.
 */
static int tsc_cs_enable(struct clocksource *cs)
{
        vclocks_set_used(VDSO_CLOCKMODE_TSC);
        return 0;
}

/*
 * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
 */
static struct clocksource clocksource_tsc_early = {
        /* Identification; rated one below the final "tsc" clocksource */
        .name = "tsc-early",
        .id = CSID_X86_TSC_EARLY,
        .rating = 299,

        /* Counter properties */
        .mask = CLOCKSOURCE_MASK(64),
        .uncertainty_margin = 32 * NSEC_PER_MSEC,
        .flags = CLOCK_SOURCE_IS_CONTINUOUS |
                 CLOCK_SOURCE_MUST_VERIFY,
        .vdso_clock_mode = VDSO_CLOCKMODE_TSC,

        /* Callbacks */
        .read = read_tsc,
        .enable = tsc_cs_enable,
        .resume = tsc_resume,
        .mark_unstable = tsc_cs_mark_unstable,
        .tick_stable = tsc_cs_tick_stable,

        .list = LIST_HEAD_INIT(clocksource_tsc_early.list),
};

/*
 * Must mark VALID_FOR_HRES early such that when we unregister tsc_early
 * this one will immediately take over. We will only register if TSC has
 * been found good.
 */
static struct clocksource clocksource_tsc = {
        /* Identification */
        .name = "tsc",
        .id = CSID_X86_TSC,
        .rating = 300,

        /* Counter properties; .mask has to stay CLOCKSOURCE_MASK(64) */
        .mask = CLOCKSOURCE_MASK(64),
        .flags = CLOCK_SOURCE_IS_CONTINUOUS |
                 CLOCK_SOURCE_VALID_FOR_HRES |
                 CLOCK_SOURCE_MUST_VERIFY |
                 CLOCK_SOURCE_VERIFY_PERCPU,
        .vdso_clock_mode = VDSO_CLOCKMODE_TSC,

        /* Callbacks */
        .read = read_tsc,
        .enable = tsc_cs_enable,
        .resume = tsc_resume,
        .mark_unstable = tsc_cs_mark_unstable,
        .tick_stable = tsc_cs_tick_stable,

        .list = LIST_HEAD_INIT(clocksource_tsc.list),
};

/*
 * Flag the TSC as unstable, log @reason and report both TSC
 * clocksources as unstable to the clocksource core. Only the first
 * call has any effect.
 */
void mark_tsc_unstable(char *reason)
{
        if (!tsc_unstable) {
                tsc_unstable = 1;

                if (using_native_sched_clock())
                        clear_sched_clock_stable();

                disable_sched_clock_irqtime();
                pr_info("Marking TSC unstable due to %s\n", reason);

                clocksource_mark_unstable(&clocksource_tsc_early);
                clocksource_mark_unstable(&clocksource_tsc);
        }
}

EXPORT_SYMBOL_GPL(mark_tsc_unstable);

/*
 * Clear CLOCK_SOURCE_MUST_VERIFY on both the early and the final TSC
 * clocksource.
 */
static void __init tsc_disable_clocksource_watchdog(void)
{
        clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
        clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}

bool tsc_clocksource_watchdog_disabled(void)
{
        return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) &&
               tsc_as_watchdog && !no_tsc_watchdog;
}

/*
 * Decide at boot whether the TSC can be treated as reliable
 * (tsc_clocksource_reliable) and whether the clocksource watchdog can
 * be switched off for the TSC clocksources.
 */
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
        if (is_geode_lx()) {
                /* RTSC counts during suspend */
#define RTSC_SUSP 0x100
                unsigned long res_low, res_high;

                rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
                /* Geode_LX - the OLPC CPU has a very reliable TSC */
                if (res_low & RTSC_SUSP)
                        tsc_clocksource_reliable = 1;
        }
#endif
        if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
                tsc_clocksource_reliable = 1;

        /*
         * Disable the clocksource watchdog when the system has:
         *  - TSC running at constant frequency
         *  - TSC which does not stop in C-States
         *  - the TSC_ADJUST register which allows to detect even minimal
         *    modifications
         *  - not more than four sockets. As the number of sockets cannot be
         *    evaluated at the early boot stage where this has to be
         *    invoked, check the number of online memory nodes as a
         *    fallback solution which is a reasonable estimate.
         */
        if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
            boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
            boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
            nr_online_nodes <= 4)
                tsc_disable_clocksource_watchdog();
}

/*
 * Make an educated guess if the TSC is trustworthy and synchronized
 * over all CPUs.
 */
int unsynchronized_tsc(void)
{
        /* No TSC at all, or one already known to be bad */
        if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable)
                return 1;

#ifdef CONFIG_SMP
        if (apic_is_clustered_box())
                return 1;
#endif

        /* A constant or explicitly reliable TSC is trusted as is */
        if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) || tsc_clocksource_reliable)
                return 0;

        /*
         * Intel systems are normally all synchronized; exceptions must
         * mark the TSC as unstable. For every other vendor a system
         * with more than one possible CPU is assumed not to be
         * synchronized.
         */
        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
            num_possible_cpus() > 1)
                return 1;

        return 0;
}

/*
 * Convert ART to TSC given numerator/denominator found in detect_art()
 */
struct system_counterval_t convert_art_to_tsc(u64 art)
{
        u64 whole = art, frac, rem;

        /*
         * Scale the quotient and the remainder of art / denominator
         * separately; this way art is never multiplied by the
         * numerator as a whole.
         */
        rem = do_div(whole, art_to_tsc_denominator);

        frac = rem * art_to_tsc_numerator;
        do_div(frac, art_to_tsc_denominator);

        return (struct system_counterval_t) {
                .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC,
                .cycles = whole * art_to_tsc_numerator + frac +
                          art_to_tsc_offset,
        };
}
EXPORT_SYMBOL(convert_art_to_tsc);

/**
 * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC.
 * @art_ns: ART (Always Running Timer) in unit of nanoseconds
 *
 * PTM requires all timestamps to be in units of nanoseconds. When user
 * software requests a cross-timestamp, this function converts system timestamp
 * to TSC.
 *
 * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set
 * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check
 * that this flag is set before conversion to TSC is attempted.
 *
 * Return:
 * struct system_counterval_t - system counter value with the ID of the
 *        corresponding clocksource:
 *        cycles:                System counter value
 *        cs_id:                The clocksource ID for validating comparability
 */

struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns)
{
        u64 whole = art_ns, frac, rem;

        /*
         * Split off the part below USEC_PER_SEC and scale both parts
         * by tsc_khz separately, so that art_ns is never multiplied by
         * tsc_khz as a whole.
         */
        rem = do_div(whole, USEC_PER_SEC);

        frac = rem * tsc_khz;
        do_div(frac, USEC_PER_SEC);

        return (struct system_counterval_t) {
                .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC,
                .cycles = whole * tsc_khz + frac,
        };
}
EXPORT_SYMBOL(convert_art_ns_to_tsc);


/* Forward declaration: the work item below refers to its handler */
static void tsc_refine_calibration_work(struct work_struct *work);
/* Delayed work item that runs, and re-arms, the refined calibration */
static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
/**
 * tsc_refine_calibration_work - Further refine tsc freq calibration
 * @work: ignored.
 *
 * This function uses delayed work over a period of a
 * second to further refine the TSC freq value. Since this is
 * timer based, instead of loop based, we don't block the boot
 * process while this longer calibration is done.
 *
 * If there are any calibration anomalies (too many SMIs, etc),
 * or the refined calibration is off by 1% of the fast early
 * calibration, we throw out the new calibration and use the
 * early calibration.
 */
static void tsc_refine_calibration_work(struct work_struct *work)
{
        /*
         * State kept across invocations: the start sample of the
         * measurement window. tsc_start == ULLONG_MAX means that no
         * window has been opened yet.
         */
        static u64 tsc_start = ULLONG_MAX, ref_start;
        static int hpet;
        u64 tsc_stop, ref_stop, delta;
        unsigned long freq;
        int cpu;

        /* Don't bother refining TSC on unstable systems */
        if (tsc_unstable)
                goto unreg;

        /*
         * Since the work is started early in boot, we may be
         * delayed the first time we expire. So set the workqueue
         * again once we know timers are working.
         */
        if (tsc_start == ULLONG_MAX) {
restart:
                /*
                 * Only set hpet once, to avoid mixing hardware
                 * if the hpet becomes enabled later.
                 */
                hpet = is_hpet_enabled();
                tsc_start = tsc_read_refs(&ref_start, hpet);
                /* Come back in HZ jiffies to take the end sample */
                schedule_delayed_work(&tsc_irqwork, HZ);
                return;
        }

        tsc_stop = tsc_read_refs(&ref_stop, hpet);

        /* hpet or pmtimer available ? */
        if (ref_start == ref_stop)
                goto out;

        /* Check, whether the sampling was disturbed */
        if (tsc_stop == ULLONG_MAX)
                goto restart;

        delta = tsc_stop - tsc_start;
        delta *= 1000000LL;
        if (hpet)
                freq = calc_hpet_ref(delta, ref_start, ref_stop);
        else
                freq = calc_pmtimer_ref(delta, ref_start, ref_stop);

        /* Will hit this only if tsc_force_recalibrate has been set */
        if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {

                /* Warn if the deviation exceeds 500 ppm */
                if (abs(tsc_khz - freq) > (tsc_khz >> 11)) {
                        pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n");
                        pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n",
                                (unsigned long)tsc_khz / 1000,
                                (unsigned long)tsc_khz % 1000);
                }

                pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n",
                        hpet ? "HPET" : "PM_TIMER",
                        (unsigned long)freq / 1000,
                        (unsigned long)freq % 1000);

                /*
                 * Report only: tsc_khz is left untouched and no
                 * clocksource is (un)registered here, since
                 * init_tsc_clocksource() has already done that for
                 * the known-frequency case.
                 */
                return;
        }

        /* Make sure we're within 1% */
        if (abs(tsc_khz - freq) > tsc_khz/100)
                goto out;

        tsc_khz = freq;
        pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
                (unsigned long)tsc_khz / 1000,
                (unsigned long)tsc_khz % 1000);

        /* Inform the TSC deadline clockevent devices about the recalibration */
        lapic_update_tsc_freq();

        /* Update the sched_clock() rate to match the clocksource one */
        for_each_possible_cpu(cpu)
                set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);

out:
        /* Reached with either the refined or the unchanged tsc_khz */
        if (tsc_unstable)
                goto unreg;

        if (boot_cpu_has(X86_FEATURE_ART))
                have_art = true;
        clocksource_register_khz(&clocksource_tsc, tsc_khz);
unreg:
        /* In every case that gets here the early clocksource is dropped */
        clocksource_unregister(&clocksource_tsc_early);
}


/*
 * Replace the early TSC clocksource with the final one, or hand the job to
 * the refined-calibration work when the frequency is not yet known to be
 * exact. Always returns 0.
 */
static int __init init_tsc_clocksource(void)
{
        /* No TSC, or no calibrated rate: nothing was registered early. */
        if (!boot_cpu_has(X86_FEATURE_TSC))
                return 0;
        if (!tsc_khz)
                return 0;

        /* An unstable TSC only needs its early clocksource removed. */
        if (tsc_unstable) {
                clocksource_unregister(&clocksource_tsc_early);
                return 0;
        }

        if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
                clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;

        /* Unknown frequency: let the delayed work refine and register it. */
        if (!boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ))
                goto refine;

        /*
         * The TSC frequency is known (retrieved via MSR or CPUID), so the
         * refined calibration is skipped and the clocksource is registered
         * right away.
         */
        if (boot_cpu_has(X86_FEATURE_ART))
                have_art = true;
        clocksource_register_khz(&clocksource_tsc, tsc_khz);
        clocksource_unregister(&clocksource_tsc_early);

        /* Recalibration is only run on explicit request in this case. */
        if (!tsc_force_recalibrate)
                return 0;

refine:
        schedule_delayed_work(&tsc_irqwork, 0);
        return 0;
}
/*
 * We use device_initcall here, to ensure we run after the hpet
 * is fully initialized, which may occur at fs_initcall time.
 */
device_initcall(init_tsc_clocksource);

/*
 * determine_cpu_tsc_frequencies - establish cpu_khz and tsc_khz
 * @early: true for the early-boot attempt (platform calibrate_cpu() and
 *         calibrate_tsc() hooks, or the tsc_early_khz override), false for
 *         the later retry that uses pit_hpet_ptimer_calibrate_cpu().
 *
 * Both globals are expected to be zero on entry; a warning is raised
 * otherwise.
 *
 * Return: true when a non-zero tsc_khz has been established, false when no
 * frequency could be determined.
 */
static bool __init determine_cpu_tsc_frequencies(bool early)
{
        /* Make sure that cpu and tsc are not already calibrated */
        WARN_ON(cpu_khz || tsc_khz);

        if (early) {
                cpu_khz = x86_platform.calibrate_cpu();
                if (tsc_early_khz)
                        tsc_khz = tsc_early_khz;
                else
                        tsc_khz = x86_platform.calibrate_tsc();
        } else {
                /* We should not be here with non-native cpu calibration */
                WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
                cpu_khz = pit_hpet_ptimer_calibrate_cpu();
        }

        /*
         * Trust non-zero tsc_khz as authoritative,
         * and use it to sanity check cpu_khz,
         * which will be off if system timer is off.
         * A cpu_khz more than 10% away from tsc_khz is overridden.
         */
        if (tsc_khz == 0)
                tsc_khz = cpu_khz;
        else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
                cpu_khz = tsc_khz;

        if (tsc_khz == 0)
                return false;

        pr_info("Detected %lu.%03lu MHz processor\n",
                (unsigned long)cpu_khz / KHZ,
                (unsigned long)cpu_khz % KHZ);

        /* Only report the TSC rate separately when it differs. */
        if (cpu_khz != tsc_khz) {
                /* Terminate the record with '\n' like every other message here. */
                pr_info("Detected %lu.%03lu MHz TSC\n",
                        (unsigned long)tsc_khz / KHZ,
                        (unsigned long)tsc_khz % KHZ);
        }
        return true;
}

/*
 * Convert tsc_khz into a loops-per-jiffy figure: tsc_khz * KHZ / HZ,
 * computed in 64 bits so the multiplication cannot wrap.
 */
static unsigned long __init get_loops_per_jiffy(void)
{
        u64 loops = tsc_khz;

        loops *= KHZ;
        do_div(loops, HZ);

        return loops;
}

/*
 * Called once tsc_khz is known: seeds loops_per_jiffy from it, selects the
 * TSC delay implementation, initializes cyc2ns for the boot CPU and finally
 * enables the __use_tsc static key.
 */
static void __init tsc_enable_sched_clock(void)
{
        /* loops_per_jiffy is derived from tsc_khz, not measured */
        loops_per_jiffy = get_loops_per_jiffy();
        use_tsc_delay();

        /* Sanitize TSC ADJUST before cyc2ns gets initialized */
        tsc_store_and_check_tsc_adjust(true);
        cyc2ns_init_boot_cpu();
        /* Flip the key last, after the boot CPU's cyc2ns data is set up */
        static_branch_enable(&__use_tsc);
}

/*
 * Early-boot TSC setup: try to determine the frequencies with the early
 * calibration methods and, on success, enable the TSC based sched clock.
 */
void __init tsc_early_init(void)
{
        /*
         * Nothing to do without a TSC. UV systems are left alone as well:
         * don't change UV TSC multi-chassis synchronization.
         */
        if (!boot_cpu_has(X86_FEATURE_TSC) || is_early_uv_system())
                return;

        if (determine_cpu_tsc_frequencies(true))
                tsc_enable_sched_clock();
}

/*
 * Main TSC initialization. Retries frequency calibration if the early
 * attempt left tsc_khz at zero, sets up cyc2ns for the secondary CPUs and
 * registers the early TSC clocksource unless the TSC is missing or has
 * been marked unstable along the way.
 */
void __init tsc_init(void)
{
        /* Without a TSC the TSC deadline timer cannot be used either */
        if (!cpu_feature_enabled(X86_FEATURE_TSC)) {
                setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
                return;
        }

        /*
         * native_calibrate_cpu_early can only calibrate using methods that are
         * available early in boot.
         */
        if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
                x86_platform.calibrate_cpu = native_calibrate_cpu;

        if (!tsc_khz) {
                /* We failed to determine frequencies earlier, try again */
                if (!determine_cpu_tsc_frequencies(false)) {
                        mark_tsc_unstable("could not calculate TSC khz");
                        setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
                        return;
                }
                /* The early path did not get this far, so do it now */
                tsc_enable_sched_clock();
        }

        cyc2ns_init_secondary_cpus();

        if (!no_sched_irq_time)
                enable_sched_clock_irqtime();

        lpj_fine = get_loops_per_jiffy();

        check_system_tsc_reliable();

        /* An unsynchronized TSC is not registered as a clocksource */
        if (unsynchronized_tsc()) {
                mark_tsc_unstable("TSCs unsynchronized");
                return;
        }

        if (tsc_clocksource_reliable || no_tsc_watchdog)
                tsc_disable_clocksource_watchdog();

        /* Only the early clocksource is registered from this function */
        clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
        detect_art();
}

#ifdef CONFIG_SMP
/*
 * Check whether existing calibration data can be reused.
 */
unsigned long calibrate_delay_is_known(void)
{
        int this_cpu = smp_processor_id();
        const struct cpumask *core_mask = topology_core_cpumask(this_cpu);
        int other;

        /* Without a constant-frequency TSC nothing can be reused. */
        if (!cpu_has(&cpu_data(this_cpu), X86_FEATURE_CONSTANT_TSC))
                return 0;

        /*
         * Constant frequency and a TSC that is synchronized across
         * sockets: reuse the CPU0 calibration.
         */
        if (!tsc_unstable)
                return cpu_data(0).loops_per_jiffy;

        /*
         * Constant frequency but not synchronized across sockets: if this
         * is not the first CPU in the socket, reuse the calibration value
         * of another CPU in the same core mask.
         *
         * This assumes that CONSTANT_TSC is consistent for all CPUs in a
         * socket.
         */
        if (!core_mask)
                return 0;

        other = cpumask_any_but(core_mask, this_cpu);

        return other < nr_cpu_ids ? cpu_data(other).loops_per_jiffy : 0;
}
#endif












































































































































































































    3 


















































































































































































































































































































































































































































    3 








    2 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef MM_SLAB_H
#define MM_SLAB_H

#include <linux/reciprocal_div.h>
#include <linux/list_lru.h>
#include <linux/local_lock.h>
#include <linux/random.h>
#include <linux/kobject.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>
#include <linux/kfence.h>
#include <linux/kasan.h>

/*
 * Internal slab definitions
 */

/*
 * Select the double-word cmpxchg primitives used on the freelist/counter
 * pair: the 128-bit variants on 64-bit builds, the 64-bit variants
 * otherwise. system_has_freelist_aba() and try_cmpxchg_freelist are only
 * defined when the architecture provides the matching system_has_cmpxchg*
 * macro; freelist_full_t is defined in either case.
 */
#ifdef CONFIG_64BIT
# ifdef system_has_cmpxchg128
# define system_has_freelist_aba()        system_has_cmpxchg128()
# define try_cmpxchg_freelist                try_cmpxchg128
# endif
#define this_cpu_try_cmpxchg_freelist        this_cpu_try_cmpxchg128
typedef u128 freelist_full_t;
#else /* CONFIG_64BIT */
# ifdef system_has_cmpxchg64
# define system_has_freelist_aba()        system_has_cmpxchg64()
# define try_cmpxchg_freelist                try_cmpxchg64
# endif
#define this_cpu_try_cmpxchg_freelist        this_cpu_try_cmpxchg64
typedef u64 freelist_full_t;
#endif /* CONFIG_64BIT */

/* The double-word cmpxchg path is dropped without CONFIG_HAVE_ALIGNED_STRUCT_PAGE */
#if defined(system_has_freelist_aba) && !defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
#undef system_has_freelist_aba
#endif

/*
 * Freelist pointer and counter to cmpxchg together, avoids the typical ABA
 * problems with cmpxchg of just a pointer.
 *
 * The anonymous struct gives access to the two halves; @full overlays both
 * of them as one freelist_full_t value.
 */
typedef union {
        struct {
                void *freelist;
                unsigned long counter;
        };
        freelist_full_t full;
} freelist_aba_t;

/*
 * Reuses the bits in struct page.
 *
 * The layout must stay in step with struct page; the SLAB_MATCH()
 * static assertions that follow this definition check the shared offsets
 * and that struct slab is no larger than struct page.
 */
struct slab {
        unsigned long __page_flags;        /* overlays page->flags */

        struct kmem_cache *slab_cache;        /* overlays page->compound_head */
        union {
                struct {
                        union {
                                struct list_head slab_list;
#ifdef CONFIG_SLUB_CPU_PARTIAL
                                struct {
                                        struct slab *next;
                                        int slabs;        /* Nr of slabs left */
                                };
#endif
                        };
                        /* Double-word boundary */
                        union {
                                struct {
                                        void *freelist;                /* first free object */
                                        union {
                                                /* all three bitfields viewed as one word */
                                                unsigned long counters;
                                                struct {
                                                        unsigned inuse:16;
                                                        unsigned objects:15;
                                                        unsigned frozen:1;
                                                };
                                        };
                                };
#ifdef system_has_freelist_aba
                                /* freelist + counters as a single cmpxchg unit */
                                freelist_aba_t freelist_counter;
#endif
                        };
                };
                struct rcu_head rcu_head;
        };

        unsigned int __page_type;
        atomic_t __page_refcount;        /* overlays page->_refcount */
#ifdef CONFIG_SLAB_OBJ_EXT
        unsigned long obj_exts;                /* overlays page->memcg_data */
#endif
};

/*
 * Compile-time layout checks: each SLAB_MATCH(pg, sl) asserts that field
 * @pg of struct page and field @sl of struct slab sit at the same offset.
 */
#define SLAB_MATCH(pg, sl)                                                \
        static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
SLAB_MATCH(flags, __page_flags);
SLAB_MATCH(compound_head, slab_cache);        /* Ensure bit 0 is clear */
SLAB_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_SLAB_OBJ_EXT
SLAB_MATCH(memcg_data, obj_exts);
#endif
#undef SLAB_MATCH
/* struct slab must fit inside the struct page it overlays */
static_assert(sizeof(struct slab) <= sizeof(struct page));
#if defined(system_has_freelist_aba)
/* The freelist/counters pair must be aligned to the size of freelist_aba_t */
static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)));
#endif

/**
 * folio_slab - Converts from folio to slab.
 * @folio: The folio.
 *
 * Currently struct slab is a different representation of a folio where
 * folio_test_slab() is true.
 *
 * The conversion is a plain pointer cast; _Generic keeps the constness of
 * the argument and rejects any other pointer type at compile time.
 *
 * Return: The slab which contains this folio.
 */
#define folio_slab(folio)        (_Generic((folio),                        \
        const struct folio *:        (const struct slab *)(folio),                \
        struct folio *:                (struct slab *)(folio)))

/**
 * slab_folio - The folio allocated for a slab
 * @s: The slab.
 *
 * Slabs are allocated as folios that contain the individual objects and are
 * using some fields in the first struct page of the folio - those fields are
 * now accessed by struct slab. It is occasionally necessary to convert back to
 * a folio in order to communicate with the rest of the mm.  Please use this
 * helper function instead of casting yourself, as the implementation may change
 * in the future.
 *
 * The argument is parenthesized inside the casts so that an expression
 * argument such as "slab + 1" is converted as a whole, consistent with
 * folio_slab() and page_slab(). The constness of @s is preserved.
 */
#define slab_folio(s)                (_Generic((s),                                \
        const struct slab *:        (const struct folio *)(s),                \
        struct slab *:                (struct folio *)(s)))

/**
 * page_slab - Converts from first struct page to slab.
 * @p: The first (either head of compound or single) page of slab.
 *
 * A temporary wrapper to convert struct page to struct slab in situations where
 * we know the page is the compound head, or single order-0 page.
 *
 * Long-term ideally everything would work with struct slab directly or go
 * through folio to struct slab.
 *
 * The macro is a plain cast and performs no check that @p really is a head
 * or order-0 page; the constness of @p is preserved.
 *
 * Return: The slab which contains this page
 */
#define page_slab(p)                (_Generic((p),                                \
        const struct page *:        (const struct slab *)(p),                \
        struct page *:                (struct slab *)(p)))

/**
 * slab_page - The first struct page allocated for a slab
 * @s: The slab.
 *
 * A convenience wrapper for converting slab to the first struct page of the
 * underlying folio, to communicate with code not yet converted to folio or
 * struct slab.
 */
#define slab_page(s) folio_page(slab_folio(s), 0)

/*
 * If network-based swap is enabled, sl*b must keep track of whether pages
 * were allocated from pfmemalloc reserves.
 */
/* Report whether the slab is flagged pfmemalloc (kept in the folio active flag). */
static inline bool slab_test_pfmemalloc(const struct slab *slab)
{
        /* The flag test helper takes a non-const folio, hence the cast. */
        struct folio *folio = (struct folio *)slab_folio(slab);

        return folio_test_active(folio);
}

/* Flag the slab as pfmemalloc by setting the folio active flag. */
static inline void slab_set_pfmemalloc(struct slab *slab)
{
        struct folio *folio = slab_folio(slab);

        folio_set_active(folio);
}

/* Drop the pfmemalloc flag by clearing the folio active flag. */
static inline void slab_clear_pfmemalloc(struct slab *slab)
{
        struct folio *folio = slab_folio(slab);

        folio_clear_active(folio);
}

/* Same as slab_clear_pfmemalloc(), but through __folio_clear_active(). */
static inline void __slab_clear_pfmemalloc(struct slab *slab)
{
        struct folio *folio = slab_folio(slab);

        __folio_clear_active(folio);
}

/* Address of the slab's memory, as reported by folio_address(). */
static inline void *slab_address(const struct slab *slab)
{
        const struct folio *folio = slab_folio(slab);

        return folio_address(folio);
}

/* Node id of the folio backing the slab. */
static inline int slab_nid(const struct slab *slab)
{
        const struct folio *folio = slab_folio(slab);

        return folio_nid(folio);
}

/* pg_data_t of the folio backing the slab. */
static inline pg_data_t *slab_pgdat(const struct slab *slab)
{
        const struct folio *folio = slab_folio(slab);

        return folio_pgdat(folio);
}

/*
 * Look up the slab that contains @addr. Returns NULL when the folio behind
 * @addr is not a slab folio.
 */
static inline struct slab *virt_to_slab(const void *addr)
{
        struct folio *folio = virt_to_folio(addr);

        return folio_test_slab(folio) ? folio_slab(folio) : NULL;
}

/* Allocation order of the folio backing the slab. */
static inline int slab_order(const struct slab *slab)
{
        /* folio_order() is called with a non-const folio, hence the cast. */
        struct folio *folio = (struct folio *)slab_folio(slab);

        return folio_order(folio);
}

/* Size of the slab in bytes: PAGE_SIZE shifted left by the slab order. */
static inline size_t slab_size(const struct slab *slab)
{
        int order = slab_order(slab);

        return PAGE_SIZE << order;
}

/*
 * Accessors for the per-cpu partial list. With CONFIG_SLUB_CPU_PARTIAL they
 * operate on (c)->partial; without it the getters evaluate to NULL and the
 * setter expands to nothing.
 */
#ifdef CONFIG_SLUB_CPU_PARTIAL
#define slub_percpu_partial(c)                        ((c)->partial)

/* Advance the per-cpu partial list head to the slab after @p */
#define slub_set_percpu_partial(c, p)                \
({                                                \
        slub_percpu_partial(c) = (p)->next;        \
})

#define slub_percpu_partial_read_once(c)        READ_ONCE(slub_percpu_partial(c))
#else
#define slub_percpu_partial(c)                        NULL

#define slub_set_percpu_partial(c, p)

#define slub_percpu_partial_read_once(c)        NULL
#endif // CONFIG_SLUB_CPU_PARTIAL

/*
 * Word size structure that can be atomically updated or read and that
 * contains both the order and the number of objects that a slab of the
 * given order would contain.
 *
 * NOTE(review): how the two values are packed into @x is not visible in
 * this header; see the users of this struct.
 */
struct kmem_cache_order_objects {
        unsigned int x;
};

/*
 * Slab cache management.
 *
 * One instance describes one slab cache. Several members exist only under
 * the configuration option that guards them.
 */
struct kmem_cache {
#ifndef CONFIG_SLUB_TINY
        /* Per-cpu data; not present with CONFIG_SLUB_TINY */
        struct kmem_cache_cpu __percpu *cpu_slab;
#endif
        /* Used for retrieving partial slabs, etc. */
        slab_flags_t flags;
        unsigned long min_partial;
        unsigned int size;                /* Object size including metadata */
        unsigned int object_size;        /* Object size without metadata */
        /* Consumed by __obj_to_index() to turn an offset into an index */
        struct reciprocal_value reciprocal_size;
        unsigned int offset;                /* Free pointer offset */
#ifdef CONFIG_SLUB_CPU_PARTIAL
        /* Number of per cpu partial objects to keep around */
        unsigned int cpu_partial;
        /* Number of per cpu partial slabs to keep around */
        unsigned int cpu_partial_slabs;
#endif
        struct kmem_cache_order_objects oo;

        /* Allocation and freeing of slabs */
        struct kmem_cache_order_objects min;
        gfp_t allocflags;                /* gfp flags to use on each alloc */
        int refcount;                        /* Refcount for slab cache destroy */
        void (*ctor)(void *object);        /* Object constructor */
        unsigned int inuse;                /* Offset to metadata */
        unsigned int align;                /* Alignment */
        unsigned int red_left_pad;        /* Left redzone padding size */
        const char *name;                /* Name (only for display!) */
        struct list_head list;                /* List of slab caches */
#ifdef CONFIG_SYSFS
        struct kobject kobj;                /* For sysfs */
#endif
#ifdef CONFIG_SLAB_FREELIST_HARDENED
        /* NOTE(review): presumably the freelist hardening secret - confirm */
        unsigned long random;
#endif

#ifdef CONFIG_NUMA
        /*
         * Defragmentation by allocating from a remote node.
         */
        unsigned int remote_node_defrag_ratio;
#endif

#ifdef CONFIG_SLAB_FREELIST_RANDOM
        /* Managed by cache_random_seq_create()/cache_random_seq_destroy() */
        unsigned int *random_seq;
#endif

#ifdef CONFIG_KASAN_GENERIC
        struct kasan_cache kasan_info;
#endif

#ifdef CONFIG_HARDENED_USERCOPY
        unsigned int useroffset;        /* Usercopy region offset */
        unsigned int usersize;                /* Usercopy region size */
#endif

        /* One slot per possible NUMA node */
        struct kmem_cache_node *node[MAX_NUMNODES];
};

/*
 * sysfs hooks for slab caches. They are real functions only when sysfs is
 * built in and CONFIG_SLUB_TINY is off; otherwise they are empty inline
 * stubs so callers need no #ifdefs.
 */
#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY)
#define SLAB_SUPPORTS_SYSFS
void sysfs_slab_unlink(struct kmem_cache *s);
void sysfs_slab_release(struct kmem_cache *s);
#else
static inline void sysfs_slab_unlink(struct kmem_cache *s) { }
static inline void sysfs_slab_release(struct kmem_cache *s) { }
#endif

void *fixup_red_left(struct kmem_cache *s, void *p);

/*
 * Round @x down to the start of the object slot it falls into, clamp the
 * result to the last object of @slab, and pass it through fixup_red_left().
 */
static inline void *nearest_obj(struct kmem_cache *cache,
                                const struct slab *slab, void *x)
{
        void *base = slab_address(slab);
        void *obj = x - (x - base) % cache->size;
        void *last = base + (slab->objects - 1) * cache->size;

        /* Addresses past the last object map to the last object. */
        if (unlikely(obj > last))
                obj = last;

        return fixup_red_left(cache, obj);
}

/*
 * Determine object index from a given position: the distance of the
 * tag-stripped @obj from @addr, divided by the object size through the
 * cache's precomputed reciprocal.
 */
static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
                                          void *addr, void *obj)
{
        void *untagged = kasan_reset_tag(obj);

        return reciprocal_divide(untagged - addr, cache->reciprocal_size);
}

/*
 * Index of @obj within @slab. KFENCE addresses always yield index 0.
 */
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
                                        const struct slab *slab, void *obj)
{
        return is_kfence_address(obj) ?
                0 : __obj_to_index(cache, slab_address(slab), obj);
}

/*
 * Number of objects in @slab, read from slab->objects. @cache is not used
 * by this implementation.
 */
static inline int objs_per_slab(const struct kmem_cache *cache,
                                const struct slab *slab)
{
        return slab->objects;
}

/*
 * State of the slab allocator.
 *
 * This is used to describe the states of the allocator during bootup.
 * Allocators use this to gradually bootstrap themselves. Most allocators
 * have the problem that the structures used for managing slab caches are
 * allocated from slab caches themselves.
 *
 * The enumerators are listed from least to most functional, so their
 * numeric values increase in that order.
 */
enum slab_state {
        DOWN,                        /* No slab functionality yet */
        PARTIAL,                /* SLUB: kmem_cache_node available */
        UP,                        /* Slab caches usable but not all extras yet */
        FULL                        /* Everything is working */
};

extern enum slab_state slab_state;

/* The slab cache mutex protects the management structures during changes */
extern struct mutex slab_mutex;

/* The list of all slab caches on the system */
extern struct list_head slab_caches;

/* The slab cache that manages slab cache information */
extern struct kmem_cache *kmem_cache;

/* A table of kmalloc cache names and sizes */
extern const struct kmalloc_info_struct {
        const char *name[NR_KMALLOC_TYPES];
        unsigned int size;
} kmalloc_info[];

/* Kmalloc array related functions */
void setup_kmalloc_cache_index_table(void);
void create_kmalloc_caches(void);

extern u8 kmalloc_size_index[24];

/*
 * Map an allocation size in bytes to its slot in kmalloc_size_index[]:
 * one slot per 8 bytes, with sizes 1..8 in slot 0.
 */
static inline unsigned int size_index_elem(unsigned int bytes)
{
        const unsigned int zero_based = bytes - 1;

        return zero_based >> 3;
}

/*
 * Find the kmem_cache structure that serves a given size of
 * allocation
 *
 * Sizes up to 192 bytes go through the kmalloc_size_index[] lookup table;
 * larger sizes use fls(size - 1) as the cache index.
 *
 * This assumes size is larger than zero and not larger than
 * KMALLOC_MAX_CACHE_SIZE and the caller must check that.
 */
static inline struct kmem_cache *
kmalloc_slab(size_t size, gfp_t flags, unsigned long caller)
{
        unsigned int index = (size <= 192) ?
                kmalloc_size_index[size_index_elem(size)] : fls(size - 1);

        return kmalloc_caches[kmalloc_type(flags, caller)][index];
}

gfp_t kmalloc_fix_flags(gfp_t flags);

/* Functions provided by the slab allocators */
int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);

void __init kmem_cache_init(void);
extern void create_boot_cache(struct kmem_cache *, const char *name,
                        unsigned int size, slab_flags_t flags,
                        unsigned int useroffset, unsigned int usersize);

int slab_unmergeable(struct kmem_cache *s);
struct kmem_cache *find_mergeable(unsigned size, unsigned align,
                slab_flags_t flags, const char *name, void (*ctor)(void *));
struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
                   slab_flags_t flags, void (*ctor)(void *));

slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name);

/* True when the cache carries the SLAB_KMALLOC flag. */
static inline bool is_kmalloc_cache(struct kmem_cache *s)
{
        return !!(s->flags & SLAB_KMALLOC);
}

/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
                         SLAB_CACHE_DMA32 | SLAB_PANIC | \
                         SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )

/* Debug flags collapse to 0 when CONFIG_SLUB_DEBUG is not set */
#ifdef CONFIG_SLUB_DEBUG
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
                          SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
#else
#define SLAB_DEBUG_FLAGS (0)
#endif

#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
                          SLAB_TEMPORARY | SLAB_ACCOUNT | \
                          SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)

/* Common flags available with current configuration */
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)

/*
 * Common flags permitted for kmem_cache_create.
 *
 * Unlike CACHE_CREATE_MASK this list spells the debug flags out directly,
 * so it does not depend on CONFIG_SLUB_DEBUG.
 */
#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
                              SLAB_RED_ZONE | \
                              SLAB_POISON | \
                              SLAB_STORE_USER | \
                              SLAB_TRACE | \
                              SLAB_CONSISTENCY_CHECKS | \
                              SLAB_NOLEAKTRACE | \
                              SLAB_RECLAIM_ACCOUNT | \
                              SLAB_TEMPORARY | \
                              SLAB_ACCOUNT | \
                              SLAB_KMALLOC | \
                              SLAB_NO_MERGE | \
                              SLAB_NO_USER_FLAGS)

bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
void slab_kmem_cache_release(struct kmem_cache *);

struct seq_file;
struct file;

/*
 * Per-cache statistics record; get_slabinfo() takes a pointer to one.
 * NOTE(review): the exact meaning of each counter is defined by the code
 * that fills the structure in, which is not part of this header.
 */
struct slabinfo {
        unsigned long active_objs;
        unsigned long num_objs;
        unsigned long active_slabs;
        unsigned long num_slabs;
        unsigned long shared_avail;
        unsigned int limit;
        unsigned int batchcount;
        unsigned int shared;
        unsigned int objects_per_slab;
        unsigned int cache_order;
};

void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);

/*
 * slub_debug_enabled is a static key that defaults to true with
 * CONFIG_SLUB_DEBUG_ON and to false otherwise. Without CONFIG_SLUB_DEBUG
 * the key does not exist: __slub_debug_enabled() is constant false and
 * print_tracking() is an empty stub.
 */
#ifdef CONFIG_SLUB_DEBUG
#ifdef CONFIG_SLUB_DEBUG_ON
DECLARE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
extern void print_tracking(struct kmem_cache *s, void *object);
long validate_slab_cache(struct kmem_cache *s);
/* Test of the static key, hinted as unlikely to be enabled */
static inline bool __slub_debug_enabled(void)
{
        return static_branch_unlikely(&slub_debug_enabled);
}
#else
static inline void print_tracking(struct kmem_cache *s, void *object)
{
}
static inline bool __slub_debug_enabled(void)
{
        return false;
}
#endif

/*
 * Returns true if any of the specified slab_debug flags is enabled for the
 * cache. Use only for flags parsed by setup_slub_debug() as it also enables
 * the static key.
 *
 * With CONFIG_SLUB_DEBUG a one-time warning fires when @flags contains none
 * of SLAB_DEBUG_FLAGS.
 */
static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags)
{
        if (IS_ENABLED(CONFIG_SLUB_DEBUG))
                VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));

        /* The cache flags are only consulted while the static key is on. */
        return __slub_debug_enabled() && (s->flags & flags);
}

#ifdef CONFIG_SLAB_OBJ_EXT

/*
 * slab_obj_exts - get the pointer to the slab object extension vector
 * associated with a slab.
 * @slab: a pointer to the slab struct
 *
 * Returns a pointer to the object extension vector associated with the slab,
 * or NULL if no such vector has been associated yet.
 */
static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
{
        /* Single READ_ONCE() snapshot; all checks below use this value */
        unsigned long obj_exts = READ_ONCE(slab->obj_exts);

#ifdef CONFIG_MEMCG
        /* A non-zero value must carry MEMCG_DATA_OBJEXTS ... */
        VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS),
                                                        slab_page(slab));
        /* ... and must not carry MEMCG_DATA_KMEM */
        VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
#endif
        /* Strip the flag bits to recover the vector pointer */
        return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK);
}

int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
                        gfp_t gfp, bool new_slab);

#else /* CONFIG_SLAB_OBJ_EXT */

/* Stub for builds without CONFIG_SLAB_OBJ_EXT: there is never a vector. */
static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
{
        return NULL;
}

#endif /* CONFIG_SLAB_OBJ_EXT */

/*
 * Pick the node stat item for a cache: the reclaimable counter for caches
 * with SLAB_RECLAIM_ACCOUNT, the unreclaimable counter for all others.
 */
static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
{
        if (s->flags & SLAB_RECLAIM_ACCOUNT)
                return NR_SLAB_RECLAIMABLE_B;

        return NR_SLAB_UNRECLAIMABLE_B;
}

#ifdef CONFIG_MEMCG_KMEM
bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
                                  gfp_t flags, size_t size, void **p);
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
                            void **p, int objects, struct slabobj_ext *obj_exts);
#endif

size_t __ksize(const void *objp);

static inline size_t slab_ksize(const struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
        /*
         * Debugging requires use of the padding between object
         * and whatever may come after it.
         */
        if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
                return s->object_size;
#endif
        if (s->flags & SLAB_KASAN)
                return s->object_size;
        /*
         * If we have the need to store the freelist pointer
         * back there or track user information then we can
         * only use the space before that information.
         */
        if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER))
                return s->inuse;
        /*
         * Else we can use all the padding etc for the allocation
         */
        return s->size;
}

/* Only implemented with CONFIG_SLUB_DEBUG; an empty stub otherwise. */
#ifdef CONFIG_SLUB_DEBUG
void dump_unreclaimable_slab(void);
#else
static inline void dump_unreclaimable_slab(void)
{
}
#endif

void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);

/*
 * Freelist randomization helpers. Without CONFIG_SLAB_FREELIST_RANDOM the
 * create stub reports success (0) and the destroy stub does nothing.
 */
#ifdef CONFIG_SLAB_FREELIST_RANDOM
int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
                        gfp_t gfp);
void cache_random_seq_destroy(struct kmem_cache *cachep);
#else
static inline int cache_random_seq_create(struct kmem_cache *cachep,
                                        unsigned int count, gfp_t gfp)
{
        return 0;
}
static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
#endif /* CONFIG_SLAB_FREELIST_RANDOM */

/*
 * Decide whether an object allocated from @c with @flags has to be zeroed.
 *
 * An explicit __GFP_ZERO is honoured whenever init_on_alloc is off. With
 * init_on_alloc on, caches with a constructor are never zeroed, RCU-typesafe
 * and poisoned caches are zeroed only on explicit request, and every other
 * cache always is.
 */
static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
{
        const bool zero_requested = flags & __GFP_ZERO;

        if (!static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
                                 &init_on_alloc))
                return zero_requested;

        if (c->ctor)
                return false;

        if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))
                return zero_requested;

        return true;
}

/*
 * Decide whether objects of @c are zeroed when freed: only with
 * init_on_free enabled, and never for caches that have a constructor or
 * that are RCU-typesafe or poisoned.
 */
static inline bool slab_want_init_on_free(struct kmem_cache *c)
{
        if (!static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
                                 &init_on_free))
                return false;

        if (c->ctor)
                return false;

        return !(c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON));
}

/* Needs both debugfs and SLUB debugging; an empty stub otherwise. */
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
void debugfs_slab_release(struct kmem_cache *);
#else
static inline void debugfs_slab_release(struct kmem_cache *s) { }
#endif

#ifdef CONFIG_PRINTK
/* Capacity of each of the two stack arrays in struct kmem_obj_info */
#define KS_ADDRS_COUNT 16
/*
 * Record filled in by __kmem_obj_info() for one object.
 * NOTE(review): the field meanings are inferred from their names only
 * (queried pointer, owning slab and cache, object start, data offset,
 * return address, allocation and free stacks) - confirm against the
 * implementation.
 */
struct kmem_obj_info {
        void *kp_ptr;
        struct slab *kp_slab;
        void *kp_objp;
        unsigned long kp_data_offset;
        struct kmem_cache *kp_slab_cache;
        void *kp_ret;
        void *kp_stack[KS_ADDRS_COUNT];
        void *kp_free_stack[KS_ADDRS_COUNT];
};
void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab);
#endif

void __check_heap_object(const void *ptr, unsigned long n,
                         const struct slab *slab, bool to_user);

#ifdef CONFIG_SLUB_DEBUG
void skip_orig_size_check(struct kmem_cache *s, const void *object);
#endif

#endif /* MM_SLAB_H */











































































































    3 






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Statically sized hash table implementation
 * (C) 2012  Sasha Levin <levinsasha928@gmail.com>
 */

#ifndef _LINUX_HASHTABLE_H
#define _LINUX_HASHTABLE_H

#include <linux/list.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/rculist.h>

/*
 * DEFINE_HASHTABLE - define a hashtable and statically initialise it
 * @name: identifier of the array of buckets to define
 * @bits: log2 of the number of buckets; the table has (1 << @bits) entries
 *
 * Every bucket is initialised to HLIST_HEAD_INIT using a range designator.
 */
#define DEFINE_HASHTABLE(name, bits)                                                \
        struct hlist_head name[1 << (bits)] =                                        \
                        { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }

/*
 * DEFINE_READ_MOSTLY_HASHTABLE - like DEFINE_HASHTABLE(), but the array is
 * additionally annotated with __read_mostly
 * @name: identifier of the array of buckets to define
 * @bits: log2 of the number of buckets; the table has (1 << @bits) entries
 */
#define DEFINE_READ_MOSTLY_HASHTABLE(name, bits)                                \
        struct hlist_head name[1 << (bits)] __read_mostly =                        \
                        { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }

/*
 * DECLARE_HASHTABLE - declare a hashtable without an initialiser
 * @name: identifier of the array of buckets
 * @bits: log2 of the number of buckets; the table has (1 << @bits) entries
 *
 * Unlike DEFINE_HASHTABLE() no initialiser is emitted, so the buckets must be
 * set up separately (e.g. with hash_init()) unless the storage is already
 * zeroed.
 */
#define DECLARE_HASHTABLE(name, bits)                                           \
        struct hlist_head name[1 << (bits)]

/*
 * HASH_SIZE - number of buckets in hashtable @name (ARRAY_SIZE of the array)
 * HASH_BITS - ilog2() of that bucket count
 *
 * Both need @name to be the array itself; they cannot work on a pointer to
 * its first bucket.
 */
#define HASH_SIZE(name) (ARRAY_SIZE(name))
#define HASH_BITS(name) ilog2(HASH_SIZE(name))

/*
 * Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels.
 *
 * The choice is made on sizeof(val): values of at most 4 bytes go through
 * hash_32(), anything wider through hash_long(). @val and @bits appear in
 * both arms of the conditional expression.
 */
#define hash_min(val, bits)                                                        \
        (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits))

/*
 * __hash_init - put every bucket of a hashtable into the empty state
 * @ht: first bucket of the table
 * @sz: number of buckets reachable through @ht
 *
 * Buckets are visited in ascending index order and each one is handed to
 * INIT_HLIST_HEAD(). Nothing is touched when @sz is 0.
 */
static inline void __hash_init(struct hlist_head *ht, unsigned int sz)
{
        unsigned int idx = 0;

        while (idx < sz) {
                INIT_HLIST_HEAD(&ht[idx]);
                idx++;
        }
}

/**
 * hash_init - initialize a hash table
 * @hashtable: hashtable to be initialized
 *
 * Derives the number of buckets from @hashtable with HASH_SIZE() and passes
 * the table and that count to __hash_init(), which initializes every bucket.
 *
 * This has to be a macro since HASH_SIZE() will not work on pointers: it
 * expands to ARRAY_SIZE(), which needs the array itself to compute the
 * number of elements.
 */
#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable))

/**
 * hash_add - add an object to a hashtable
 * @hashtable: hashtable to add to
 * @node: the &struct hlist_node of the object to be added
 * @key: the key of the object to be added
 *
 * The bucket index is hash_min(@key, HASH_BITS(@hashtable)); @node is passed
 * to hlist_add_head() together with the address of that bucket.
 *
 * @hashtable is parenthesised before it is indexed so that an argument such
 * as a dereferenced pointer-to-array ("*tbl") selects a bucket of the table
 * instead of being re-associated by operator precedence ("*(tbl[idx])").
 */
#define hash_add(hashtable, node, key)                                                \
        hlist_add_head(node, &(hashtable)[hash_min(key, HASH_BITS(hashtable))])

/**
 * hash_add_rcu - add an object to a rcu enabled hashtable
 * @hashtable: hashtable to add to
 * @node: the &struct hlist_node of the object to be added
 * @key: the key of the object to be added
 *
 * Same bucket selection as hash_add(), i.e.
 * hash_min(@key, HASH_BITS(@hashtable)), but the insertion is done with
 * hlist_add_head_rcu().
 *
 * @hashtable is parenthesised before it is indexed so that an argument such
 * as a dereferenced pointer-to-array ("*tbl") selects a bucket of the table
 * instead of being re-associated by operator precedence ("*(tbl[idx])").
 */
#define hash_add_rcu(hashtable, node, key)                                        \
        hlist_add_head_rcu(node, &(hashtable)[hash_min(key, HASH_BITS(hashtable))])

/**
 * hash_hashed - check whether an object is in any hashtable
 * @node: the &struct hlist_node of the object to be checked
 *
 * Return: the logical negation of hlist_unhashed(@node), i.e. true when the
 * node is not reported as unhashed and false otherwise.
 */
static inline bool hash_hashed(struct hlist_node *node)
{
        if (hlist_unhashed(node))
                return false;

        return true;
}

/*
 * __hash_empty - test whether all buckets of a hashtable are empty
 * @ht: first bucket of the table
 * @sz: number of buckets reachable through @ht
 *
 * Walks the buckets in ascending order and stops at the first one for which
 * hlist_empty() is false.
 *
 * Return: true if every bucket is empty (also when @sz is 0), false as soon
 * as a non-empty bucket is found.
 */
static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz)
{
        unsigned int idx = 0;

        while (idx < sz && hlist_empty(&ht[idx]))
                idx++;

        return idx == sz;
}

/**
 * hash_empty - check whether a hashtable is empty
 * @hashtable: hashtable to check
 *
 * Evaluates to the result of __hash_empty() over all HASH_SIZE(@hashtable)
 * buckets.
 *
 * This has to be a macro since HASH_SIZE() will not work on pointers: it
 * expands to ARRAY_SIZE(), which needs the array itself to compute the
 * number of elements.
 */
#define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable))

/**
 * hash_del - remove an object from a hashtable
 * @node: &struct hlist_node of the object to remove
 *
 * Forwards to hlist_del_init(); the table itself is not needed because the
 * node is unlinked through its own list pointers.
 */
static inline void hash_del(struct hlist_node *node)
{
        hlist_del_init(node);
}

/**
 * hash_del_rcu - remove an object from a rcu enabled hashtable
 * @node: &struct hlist_node of the object to remove
 *
 * Forwards to hlist_del_init_rcu(), the RCU counterpart of the helper used by
 * hash_del().
 */
static inline void hash_del_rcu(struct hlist_node *node)
{
        hlist_del_init_rcu(node);
}

/**
 * hash_for_each - iterate over a hashtable
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Expands to an outer loop over the HASH_SIZE(@name) buckets wrapped around
 * hlist_for_each_entry() for the current bucket. The outer loop only
 * advances while @obj is NULL.
 * NOTE(review): this presumably relies on hlist_for_each_entry() leaving @obj
 * NULL when a bucket is exhausted, so that a "break" in the loop body (with
 * @obj still set) also ends the bucket loop - confirm against list.h.
 *
 * @name and @obj are parenthesised in the loop header so that expression
 * arguments (e.g. a dereferenced pointer-to-array) are not re-associated by
 * operator precedence.
 */
#define hash_for_each(name, bkt, obj, member)                                \
        for ((bkt) = 0, (obj) = NULL;                                        \
             (obj) == NULL && (bkt) < HASH_SIZE(name);                        \
             (bkt)++)                                                        \
                hlist_for_each_entry(obj, &(name)[bkt], member)

/**
 * hash_for_each_rcu - iterate over a rcu enabled hashtable
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Same structure as hash_for_each(): an outer loop over the
 * HASH_SIZE(@name) buckets that only advances while @obj is NULL, wrapped
 * around hlist_for_each_entry_rcu() for the current bucket.
 *
 * @name and @obj are parenthesised in the loop header so that expression
 * arguments (e.g. a dereferenced pointer-to-array) are not re-associated by
 * operator precedence.
 */
#define hash_for_each_rcu(name, bkt, obj, member)                        \
        for ((bkt) = 0, (obj) = NULL;                                        \
             (obj) == NULL && (bkt) < HASH_SIZE(name);                        \
             (bkt)++)                                                        \
                hlist_for_each_entry_rcu(obj, &(name)[bkt], member)

/**
 * hash_for_each_safe - iterate over a hashtable safe against removal of
 * hash entry
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @tmp: a &struct hlist_node used for temporary storage
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Same structure as hash_for_each(): an outer loop over the
 * HASH_SIZE(@name) buckets that only advances while @obj is NULL, wrapped
 * around hlist_for_each_entry_safe() (which receives @tmp) for the current
 * bucket.
 *
 * @name and @obj are parenthesised in the loop header so that expression
 * arguments (e.g. a dereferenced pointer-to-array) are not re-associated by
 * operator precedence.
 */
#define hash_for_each_safe(name, bkt, tmp, obj, member)                        \
        for ((bkt) = 0, (obj) = NULL;                                        \
             (obj) == NULL && (bkt) < HASH_SIZE(name);                        \
             (bkt)++)                                                        \
                hlist_for_each_entry_safe(obj, tmp, &(name)[bkt], member)

/**
 * hash_for_each_possible - iterate over all possible objects hashing to the
 * same bucket
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * Only the bucket hash_min(@key, HASH_BITS(@name)) is walked, using
 * hlist_for_each_entry(). Every entry of that bucket is visited, so callers
 * still have to compare keys themselves.
 *
 * @name is parenthesised before it is indexed so that expression arguments
 * (e.g. a dereferenced pointer-to-array) are not re-associated by operator
 * precedence.
 */
#define hash_for_each_possible(name, obj, member, key)                        \
        hlist_for_each_entry(obj,                                        \
                &(name)[hash_min(key, HASH_BITS(name))], member)

/**
 * hash_for_each_possible_rcu - iterate over all possible objects hashing to the
 * same bucket in an rcu enabled hashtable
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 * @cond: optional extra argument(s), forwarded unchanged to
 *        hlist_for_each_entry_rcu() after @member; may be omitted.
 *        NOTE(review): presumably the lockdep condition accepted by
 *        hlist_for_each_entry_rcu() - confirm against rculist.h.
 *
 * Only the bucket hash_min(@key, HASH_BITS(@name)) is walked.
 *
 * @name is parenthesised before it is indexed so that expression arguments
 * (e.g. a dereferenced pointer-to-array) are not re-associated by operator
 * precedence.
 */
#define hash_for_each_possible_rcu(name, obj, member, key, cond...)        \
        hlist_for_each_entry_rcu(obj,                                        \
                &(name)[hash_min(key, HASH_BITS(name))],                \
                member, ## cond)

/**
 * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing
 * to the same bucket in an rcu enabled hashtable
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * This is the same as hash_for_each_possible_rcu() except that it does
 * not do any RCU debugging or tracing: the bucket
 * hash_min(@key, HASH_BITS(@name)) is walked with
 * hlist_for_each_entry_rcu_notrace().
 *
 * @name is parenthesised before it is indexed so that expression arguments
 * (e.g. a dereferenced pointer-to-array) are not re-associated by operator
 * precedence.
 */
#define hash_for_each_possible_rcu_notrace(name, obj, member, key) \
        hlist_for_each_entry_rcu_notrace(obj, \
                &(name)[hash_min(key, HASH_BITS(name))], member)

/**
 * hash_for_each_possible_safe - iterate over all possible objects hashing to the
 * same bucket safe against removals
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @tmp: a &struct hlist_node used for temporary storage
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * Only the bucket hash_min(@key, HASH_BITS(@name)) is walked, using
 * hlist_for_each_entry_safe() with @tmp as its scratch cursor.
 *
 * @name is parenthesised before it is indexed so that expression arguments
 * (e.g. a dereferenced pointer-to-array) are not re-associated by operator
 * precedence.
 */
#define hash_for_each_possible_safe(name, obj, tmp, member, key)        \
        hlist_for_each_entry_safe(obj, tmp,                                \
                &(name)[hash_min(key, HASH_BITS(name))], member)


#endif





































































































































































































































































































































































































































































    3 

    2 
    3 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate_wait.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/ksm.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "mm_slot.h"

/*
 * Outcome codes, going by the names, for a khugepaged scan/collapse attempt.
 *
 * The enumerators carry no explicit values, so they number consecutively from
 * SCAN_FAIL == 0 in declaration order.
 * NOTE(review): <trace/events/huge_memory.h> is included with
 * CREATE_TRACE_POINTS right after this enum, so these values are presumably
 * reported through trace events - confirm that any table there stays in sync
 * before inserting or reordering entries.
 */
enum scan_result {
        SCAN_FAIL,
        SCAN_SUCCEED,
        SCAN_PMD_NULL,
        SCAN_PMD_NONE,
        SCAN_PMD_MAPPED,
        SCAN_EXCEED_NONE_PTE,
        SCAN_EXCEED_SWAP_PTE,
        SCAN_EXCEED_SHARED_PTE,
        SCAN_PTE_NON_PRESENT,
        SCAN_PTE_UFFD_WP,
        SCAN_PTE_MAPPED_HUGEPAGE,
        SCAN_PAGE_RO,
        SCAN_LACK_REFERENCED_PAGE,
        SCAN_PAGE_NULL,
        SCAN_SCAN_ABORT,
        SCAN_PAGE_COUNT,
        SCAN_PAGE_LRU,
        SCAN_PAGE_LOCK,
        SCAN_PAGE_ANON,
        SCAN_PAGE_COMPOUND,
        SCAN_ANY_PROCESS,
        SCAN_VMA_NULL,
        SCAN_VMA_CHECK,
        SCAN_ADDRESS_RANGE,
        SCAN_DEL_PAGE_LRU,
        SCAN_ALLOC_HUGE_PAGE_FAIL,
        SCAN_CGROUP_CHARGE_FAIL,
        SCAN_TRUNCATED,
        SCAN_PAGE_HAS_PRIVATE,
        SCAN_STORE_FAILED,
        SCAN_COPY_MC,
        SCAN_PAGE_FILLED,
};

#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>

/* The khugepaged kernel thread, and the mutex declared alongside it. */
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);

/*
 * default scan 8*512 pte (or vmas) every 10 seconds
 * (khugepaged_scan_sleep_millisecs below defaults to 10000 ms).
 * NOTE(review): khugepaged_pages_to_scan has no initialiser here, so the
 * 8*512 default is presumably assigned during initialisation - confirm.
 */
static unsigned int khugepaged_pages_to_scan __read_mostly;
/* Counters exported read-only through sysfs (pages_collapsed, full_scans). */
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
/* Reset to 0 by the sysfs sleep-interval writers before they wake khugepaged_wait. */
static unsigned long khugepaged_sleep_expire;
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * default collapse hugepages if there is at least one pte mapped like
 * it would have happened if the vma was large enough during page
 * fault.
 *
 * Note that these are only respected if collapse was initiated by khugepaged.
 *
 * The sysfs writers below reject any value above HPAGE_PMD_NR - 1 for all
 * three limits.
 */
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;

/* The mm_slots hash table has 1 << MM_SLOTS_HASH_BITS (1024) buckets. */
#define MM_SLOTS_HASH_BITS 10
static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

/*
 * Slab cache pointer, read-only after init.
 * NOTE(review): presumably the cache khugepaged_mm_slot objects are allocated
 * from - confirm where it is created.
 */
static struct kmem_cache *mm_slot_cache __ro_after_init;

/*
 * Per-collapse state shared by the scan and collapse steps.
 * NOTE(review): is_khugepaged presumably tells whether the collapse was
 * initiated by the khugepaged thread (as opposed to another caller); the
 * max_ptes_* limits are documented as only applying in that case - confirm
 * against the users of this struct.
 */
struct collapse_control {
        bool is_khugepaged;

        /* Num pages scanned per node */
        u32 node_load[MAX_NUMNODES];

        /* nodemask for allocation fallback */
        nodemask_t alloc_nmask;
};

/**
 * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 *
 * Currently a plain wrapper: the generic &struct mm_slot is its only member.
 */
struct khugepaged_mm_slot {
        struct mm_slot slot;
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
        struct list_head mm_head;
        struct khugepaged_mm_slot *mm_slot;
        unsigned long address;
};

/*
 * The single cursor instance. Only @mm_head is initialised explicitly (as an
 * empty list pointing at itself); @mm_slot and @address start out as
 * NULL / 0 because the object has static storage duration.
 */
static struct khugepaged_scan khugepaged_scan = {
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};

#ifdef CONFIG_SYSFS
/*
 * sysfs read handler for scan_sleep_millisecs: prints the current value of
 * khugepaged_scan_sleep_millisecs into @buf as "%u\n".
 *
 * Returns whatever sysfs_emit() returns.
 */
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
                                         struct kobj_attribute *attr,
                                         char *buf)
{
        const unsigned int cur_msecs = khugepaged_scan_sleep_millisecs;

        return sysfs_emit(buf, "%u\n", cur_msecs);
}

/*
 * sysfs write handler for scan_sleep_millisecs.
 *
 * @buf is parsed as a base-10 unsigned int; any parse failure is reported as
 * -EINVAL. On success the new interval is stored, khugepaged_sleep_expire is
 * reset to 0 and the waiters on khugepaged_wait are woken.
 *
 * Returns @count on success.
 */
static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
                                          struct kobj_attribute *attr,
                                          const char *buf, size_t count)
{
        unsigned int new_msecs;

        if (kstrtouint(buf, 10, &new_msecs))
                return -EINVAL;

        khugepaged_scan_sleep_millisecs = new_msecs;
        khugepaged_sleep_expire = 0;
        wake_up_interruptible(&khugepaged_wait);

        return count;
}
/* Read/write sysfs attribute backed by the two handlers above. */
static struct kobj_attribute scan_sleep_millisecs_attr =
        __ATTR_RW(scan_sleep_millisecs);

/*
 * sysfs read handler for alloc_sleep_millisecs: prints the current value of
 * khugepaged_alloc_sleep_millisecs into @buf as "%u\n".
 *
 * Returns whatever sysfs_emit() returns.
 */
static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
                                          struct kobj_attribute *attr,
                                          char *buf)
{
        const unsigned int cur_msecs = khugepaged_alloc_sleep_millisecs;

        return sysfs_emit(buf, "%u\n", cur_msecs);
}

/*
 * sysfs write handler for alloc_sleep_millisecs.
 *
 * @buf is parsed as a base-10 unsigned int; any parse failure is reported as
 * -EINVAL. On success the new interval is stored, khugepaged_sleep_expire is
 * reset to 0 and the waiters on khugepaged_wait are woken.
 *
 * Returns @count on success.
 */
static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
                                           struct kobj_attribute *attr,
                                           const char *buf, size_t count)
{
        unsigned int new_msecs;

        if (kstrtouint(buf, 10, &new_msecs))
                return -EINVAL;

        khugepaged_alloc_sleep_millisecs = new_msecs;
        khugepaged_sleep_expire = 0;
        wake_up_interruptible(&khugepaged_wait);

        return count;
}
/* Read/write sysfs attribute backed by the two handlers above. */
static struct kobj_attribute alloc_sleep_millisecs_attr =
        __ATTR_RW(alloc_sleep_millisecs);

/*
 * sysfs read handler for pages_to_scan: prints the current value of
 * khugepaged_pages_to_scan into @buf as "%u\n".
 *
 * Returns whatever sysfs_emit() returns.
 */
static ssize_t pages_to_scan_show(struct kobject *kobj,
                                  struct kobj_attribute *attr,
                                  char *buf)
{
        const unsigned int cur_pages = khugepaged_pages_to_scan;

        return sysfs_emit(buf, "%u\n", cur_pages);
}
/*
 * sysfs write handler for pages_to_scan.
 *
 * @buf is parsed as a base-10 unsigned int. A parse failure or a value of 0
 * is rejected with -EINVAL; otherwise the value is stored in
 * khugepaged_pages_to_scan.
 *
 * Returns @count on success.
 */
static ssize_t pages_to_scan_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        unsigned int nr_pages;

        if (kstrtouint(buf, 10, &nr_pages) || nr_pages == 0)
                return -EINVAL;

        khugepaged_pages_to_scan = nr_pages;

        return count;
}
/* Read/write sysfs attribute backed by pages_to_scan_show()/_store(). */
static struct kobj_attribute pages_to_scan_attr =
        __ATTR_RW(pages_to_scan);

/*
 * sysfs read handler for pages_collapsed: prints the current value of
 * khugepaged_pages_collapsed into @buf as "%u\n".
 *
 * Returns whatever sysfs_emit() returns.
 */
static ssize_t pages_collapsed_show(struct kobject *kobj,
                                    struct kobj_attribute *attr,
                                    char *buf)
{
        const unsigned int nr_collapsed = khugepaged_pages_collapsed;

        return sysfs_emit(buf, "%u\n", nr_collapsed);
}
/* Read-only sysfs attribute: there is no matching store handler. */
static struct kobj_attribute pages_collapsed_attr =
        __ATTR_RO(pages_collapsed);

/*
 * sysfs read handler for full_scans: prints the current value of
 * khugepaged_full_scans into @buf as "%u\n".
 *
 * Returns whatever sysfs_emit() returns.
 */
static ssize_t full_scans_show(struct kobject *kobj,
                               struct kobj_attribute *attr,
                               char *buf)
{
        const unsigned int nr_scans = khugepaged_full_scans;

        return sysfs_emit(buf, "%u\n", nr_scans);
}
/* Read-only sysfs attribute: there is no matching store handler. */
static struct kobj_attribute full_scans_attr =
        __ATTR_RO(full_scans);

/*
 * sysfs read handler for khugepaged's defrag knob: delegates to
 * single_hugepage_flag_show() for
 * TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG and returns its result.
 */
static ssize_t defrag_show(struct kobject *kobj,
                           struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = single_hugepage_flag_show(kobj, attr, buf,
                                TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
        return len;
}
/*
 * sysfs write handler for khugepaged's defrag knob: delegates to
 * single_hugepage_flag_store() for
 * TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG and returns its result.
 */
static ssize_t defrag_store(struct kobject *kobj,
                            struct kobj_attribute *attr,
                            const char *buf, size_t count)
{
        ssize_t ret;

        ret = single_hugepage_flag_store(kobj, attr, buf, count,
                                TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
        return ret;
}
/* Read/write sysfs attribute named "defrag", backed by the handlers above. */
static struct kobj_attribute khugepaged_defrag_attr =
        __ATTR_RW(defrag);

/*
 * max_ptes_none decides whether khugepaged may collapse a range into a
 * hugepage even though some of its ptes are unmapped, which can grow the
 * memory footprint of the vmas involved. With max_ptes_none set to 0,
 * khugepaged does not reduce the free memory available in the system
 * while it runs; raising max_ptes_none lets the khugepaged scan
 * potentially reduce the system's free memory instead.
 *
 * This is the sysfs read handler: it prints the current value of
 * khugepaged_max_ptes_none into @buf as "%u\n".
 */
static ssize_t max_ptes_none_show(struct kobject *kobj,
                                  struct kobj_attribute *attr,
                                  char *buf)
{
        const unsigned int cur_limit = khugepaged_max_ptes_none;

        return sysfs_emit(buf, "%u\n", cur_limit);
}
/*
 * sysfs write handler for max_ptes_none.
 *
 * @buf is parsed as a base-10 unsigned long. A parse failure or a value
 * above HPAGE_PMD_NR - 1 is rejected with -EINVAL; otherwise the value is
 * stored in khugepaged_max_ptes_none.
 *
 * Returns @count on success.
 */
static ssize_t max_ptes_none_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        unsigned long limit;

        if (kstrtoul(buf, 10, &limit))
                return -EINVAL;
        if (limit > HPAGE_PMD_NR - 1)
                return -EINVAL;

        khugepaged_max_ptes_none = limit;

        return count;
}
/* Read/write sysfs attribute named "max_ptes_none". */
static struct kobj_attribute khugepaged_max_ptes_none_attr =
        __ATTR_RW(max_ptes_none);

/*
 * sysfs read handler for "max_ptes_swap": prints the current value of
 * khugepaged_max_ptes_swap as an unsigned decimal followed by a newline.
 */
static ssize_t max_ptes_swap_show(struct kobject *kobj,
                                  struct kobj_attribute *attr,
                                  char *buf)
{
        unsigned int limit = khugepaged_max_ptes_swap;

        return sysfs_emit(buf, "%u\n", limit);
}

/*
 * sysfs write handler for "max_ptes_swap".
 *
 * Parses @buf as a base-10 unsigned long. Input that fails to parse, or
 * a value above HPAGE_PMD_NR - 1, is rejected with -EINVAL. Otherwise
 * the value is stored in khugepaged_max_ptes_swap and @count is
 * returned.
 */
static ssize_t max_ptes_swap_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        unsigned long val;

        if (kstrtoul(buf, 10, &val))
                return -EINVAL;
        if (val > HPAGE_PMD_NR - 1)
                return -EINVAL;

        khugepaged_max_ptes_swap = val;
        return count;
}

/* Read/write "max_ptes_swap" sysfs attribute (max_ptes_swap_show()/_store()). */
static struct kobj_attribute khugepaged_max_ptes_swap_attr =
        __ATTR_RW(max_ptes_swap);

/*
 * sysfs read handler for "max_ptes_shared": prints the current value of
 * khugepaged_max_ptes_shared as an unsigned decimal followed by a
 * newline.
 */
static ssize_t max_ptes_shared_show(struct kobject *kobj,
                                    struct kobj_attribute *attr,
                                    char *buf)
{
        unsigned int limit = khugepaged_max_ptes_shared;

        return sysfs_emit(buf, "%u\n", limit);
}

/*
 * sysfs write handler for "max_ptes_shared".
 *
 * Parses @buf as a base-10 unsigned long. Input that fails to parse, or
 * a value above HPAGE_PMD_NR - 1, is rejected with -EINVAL. Otherwise
 * the value is stored in khugepaged_max_ptes_shared and @count is
 * returned.
 */
static ssize_t max_ptes_shared_store(struct kobject *kobj,
                                     struct kobj_attribute *attr,
                                     const char *buf, size_t count)
{
        unsigned long val;

        if (kstrtoul(buf, 10, &val))
                return -EINVAL;
        if (val > HPAGE_PMD_NR - 1)
                return -EINVAL;

        khugepaged_max_ptes_shared = val;
        return count;
}

/* Read/write "max_ptes_shared" sysfs attribute (max_ptes_shared_show()/_store()). */
static struct kobj_attribute khugepaged_max_ptes_shared_attr =
        __ATTR_RW(max_ptes_shared);

/*
 * NULL-terminated list of the khugepaged sysfs attributes; referenced by
 * khugepaged_attr_group below.
 */
static struct attribute *khugepaged_attr[] = {
        &khugepaged_defrag_attr.attr,
        &khugepaged_max_ptes_none_attr.attr,
        &khugepaged_max_ptes_swap_attr.attr,
        &khugepaged_max_ptes_shared_attr.attr,
        &pages_to_scan_attr.attr,
        &pages_collapsed_attr.attr,
        &full_scans_attr.attr,
        &scan_sleep_millisecs_attr.attr,
        &alloc_sleep_millisecs_attr.attr,
        NULL,
};

/*
 * Attribute group named "khugepaged" that bundles the attributes listed
 * in khugepaged_attr[]. Non-static, so it is visible outside this file.
 */
struct attribute_group khugepaged_attr_group = {
        .attrs = khugepaged_attr,
        .name = "khugepaged",
};
#endif /* CONFIG_SYSFS */

/*
 * hugepage_madvise - apply MADV_HUGEPAGE / MADV_NOHUGEPAGE to a vma's flags
 * @vma: the vma the advice applies to
 * @vm_flags: in/out flag word that is updated in place
 * @advice: MADV_HUGEPAGE or MADV_NOHUGEPAGE; any other value is a no-op
 *
 * Always returns 0.
 */
int hugepage_madvise(struct vm_area_struct *vma,
                     unsigned long *vm_flags, int advice)
{
        if (advice == MADV_NOHUGEPAGE) {
                /*
                 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
                 * this vma even if we leave the mm registered in khugepaged if
                 * it got registered before VM_NOHUGEPAGE was set.
                 */
                *vm_flags = (*vm_flags & ~VM_HUGEPAGE) | VM_NOHUGEPAGE;
                return 0;
        }

        if (advice != MADV_HUGEPAGE)
                return 0;

#ifdef CONFIG_S390
        /*
         * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
         * can't handle this properly after s390_enable_sie, so we simply
         * ignore the madvise to prevent qemu from causing a SIGSEGV.
         */
        if (mm_has_pgste(vma->vm_mm))
                return 0;
#endif
        *vm_flags = (*vm_flags & ~VM_NOHUGEPAGE) | VM_HUGEPAGE;

        /*
         * If the vma become good for khugepaged to scan,
         * register it here without waiting a page fault that
         * may not happen any time soon.
         */
        khugepaged_enter_vma(vma, *vm_flags);
        return 0;
}

/*
 * khugepaged_init - create the mm_slot cache and set the default tunables
 *
 * Returns 0 on success, or -ENOMEM if the "khugepaged_mm_slot" cache could
 * not be created (in which case the tunables are left untouched).
 */
int __init khugepaged_init(void)
{
        mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
                                          sizeof(struct khugepaged_mm_slot),
                                          __alignof__(struct khugepaged_mm_slot),
                                          0, NULL);
        if (mm_slot_cache == NULL)
                return -ENOMEM;

        /* Defaults are all expressed relative to the number of ptes per PMD. */
        khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
        khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
        khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
        khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;

        return 0;
}

/* Tear down the mm_slot cache created by khugepaged_init(). */
void __init khugepaged_destroy(void)
{
        kmem_cache_destroy(mm_slot_cache);
}

/* Returns 1 when @mm has no users left (mm_users dropped to zero), else 0. */
static inline int hpage_collapse_test_exit(struct mm_struct *mm)
{
        return !atomic_read(&mm->mm_users);
}

/*
 * Returns 1 when @mm has no users left or has MMF_DISABLE_THP set in its
 * flags, else 0.
 */
static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
        if (hpage_collapse_test_exit(mm))
                return 1;

        return test_bit(MMF_DISABLE_THP, &mm->flags) ? 1 : 0;
}

/*
 * __khugepaged_enter - register @mm with khugepaged
 * @mm: the mm to add to the khugepaged scan list
 *
 * Atomically claims MMF_VM_HUGEPAGE; if the bit was already set the mm is
 * treated as registered and nothing is done. Otherwise a slot is allocated,
 * hashed and appended to khugepaged_scan.mm_head, a reference on the mm is
 * taken with mmgrab(), and khugepaged_wait is woken if the scan list was
 * empty before the insertion.
 *
 * If the slot allocation fails, MMF_VM_HUGEPAGE is cleared again so that a
 * later call can retry the registration. Leaving the bit set without a slot
 * would make khugepaged_enter_vma() skip this mm from then on.
 */
void __khugepaged_enter(struct mm_struct *mm)
{
        struct khugepaged_mm_slot *mm_slot;
        struct mm_slot *slot;
        int wakeup;

        /* __khugepaged_exit() must not run from under us */
        VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
        if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
                return;

        mm_slot = mm_slot_alloc(mm_slot_cache);
        if (!mm_slot) {
                /*
                 * Nothing was registered: drop the claim taken above,
                 * otherwise this mm could never be registered again.
                 */
                clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
                return;
        }

        slot = &mm_slot->slot;

        spin_lock(&khugepaged_mm_lock);
        mm_slot_insert(mm_slots_hash, mm, slot);
        /*
         * Insert just behind the scanning cursor, to let the area settle
         * down a little.
         */
        wakeup = list_empty(&khugepaged_scan.mm_head);
        list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
        spin_unlock(&khugepaged_mm_lock);

        mmgrab(mm);
        if (wakeup)
                wake_up_interruptible(&khugepaged_wait);
}

/*
 * khugepaged_enter_vma - register vma->vm_mm with khugepaged if eligible
 * @vma: vma whose mm may be registered
 * @vm_flags: flags to evaluate the vma against
 *
 * Does nothing when the mm already has MMF_VM_HUGEPAGE set, when
 * hugepage_flags_enabled() is false, or when the vma does not allow a
 * PMD_ORDER THP under TVA_ENFORCE_SYSFS.
 */
void khugepaged_enter_vma(struct vm_area_struct *vma,
                          unsigned long vm_flags)
{
        if (test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
                return;
        if (!hugepage_flags_enabled())
                return;
        if (!thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
                                     PMD_ORDER))
                return;

        __khugepaged_enter(vma->vm_mm);
}

/*
 * __khugepaged_exit - unregister @mm from khugepaged
 * @mm: the mm being torn down
 *
 * If @mm has a slot that is not the one khugepaged is currently scanning,
 * the slot is unhashed, unlinked and freed, MMF_VM_HUGEPAGE is cleared and
 * the mm reference is dropped. If the slot is the one currently being
 * scanned, the function instead cycles the mmap write lock. If no slot
 * exists, nothing is done.
 */
void __khugepaged_exit(struct mm_struct *mm)
{
        struct khugepaged_mm_slot *entry;
        struct mm_slot *node;
        int unlinked = 0;

        spin_lock(&khugepaged_mm_lock);
        node = mm_slot_lookup(mm_slots_hash, mm);
        entry = mm_slot_entry(node, struct khugepaged_mm_slot, slot);
        if (entry && khugepaged_scan.mm_slot != entry) {
                hash_del(&node->hash);
                list_del(&node->mm_node);
                unlinked = 1;
        }
        spin_unlock(&khugepaged_mm_lock);

        if (!entry)
                return;

        if (unlinked) {
                clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
                mm_slot_free(mm_slot_cache, entry);
                mmdrop(mm);
                return;
        }

        /*
         * This is required to serialize against
         * hpage_collapse_test_exit() (which is guaranteed to run
         * under mmap sem read mode). Stop here (after we return all
         * pagetables will be destroyed) until khugepaged has finished
         * working on the pagetables under the mmap_lock.
         */
        mmap_write_lock(mm);
        mmap_write_unlock(mm);
}

/*
 * Undo the isolation of one folio: subtract its pages from the
 * NR_ISOLATED_ANON/NR_ISOLATED_FILE node counter, unlock it and put it
 * back on the LRU.
 */
static void release_pte_folio(struct folio *folio)
{
        long nr = folio_nr_pages(folio);

        node_stat_mod_folio(folio,
                            NR_ISOLATED_ANON + folio_is_file_lru(folio),
                            -nr);
        folio_unlock(folio);
        folio_putback_lru(folio);
}

/*
 * release_pte_pages - release the folios mapped by the ptes in [pte, _pte)
 * @pte: first pte of the range
 * @_pte: one past the last pte to process
 * @compound_pagelist: large folios collected during isolation
 *
 * Walks the range backwards and releases every small folio that is mapped
 * by a present, non-zero-page pte. Large folios are skipped in the walk and
 * released once each from @compound_pagelist, which is emptied on return.
 */
static void release_pte_pages(pte_t *pte, pte_t *_pte,
                struct list_head *compound_pagelist)
{
        struct folio *large, *next;
        pte_t *cur = _pte;

        while (cur > pte) {
                struct folio *small;
                unsigned long pfn;
                pte_t entry;

                cur--;
                entry = ptep_get(cur);
                if (pte_none(entry))
                        continue;

                pfn = pte_pfn(entry);
                if (is_zero_pfn(pfn))
                        continue;

                small = pfn_folio(pfn);
                if (!folio_test_large(small))
                        release_pte_folio(small);
        }

        list_for_each_entry_safe(large, next, compound_pagelist, lru) {
                list_del(&large->lru);
                release_pte_folio(large);
        }
}

/*
 * Returns true when the folio's reference count equals its map count, plus
 * one reference per page if the folio is in the swap cache. Any other
 * value means somebody else holds an extra reference.
 */
static bool is_refcount_suitable(struct folio *folio)
{
        int want = folio_mapcount(folio);

        if (folio_test_swapcache(folio))
                want += folio_nr_pages(folio);

        return want == folio_ref_count(folio);
}

/*
 * __collapse_huge_page_isolate - lock and isolate every folio under a PMD range
 * @vma: vma covering the range
 * @address: virtual address mapped by the first pte
 * @pte: first of the HPAGE_PMD_NR ptes to examine
 * @cc: collapse control; the khugepaged_max_ptes_* limits and the
 *      "referenced" requirement are only enforced when cc->is_khugepaged
 * @compound_pagelist: list that receives each large folio once
 *
 * Walks the HPAGE_PMD_NR ptes starting at @pte. None/zero-page entries are
 * skipped (subject to userfaultfd and the max_ptes_none limit); every other
 * entry must be present, not uffd-wp, and map a normal, non-device page.
 * Each folio is trylocked, checked for unexpected references, isolated from
 * the LRU and accounted in the isolated node counter.
 *
 * Returns SCAN_SUCCEED with all folios left locked and isolated. On any
 * other result, everything isolated so far has been released again through
 * release_pte_pages() before returning.
 */
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pte_t *pte,
                                        struct collapse_control *cc,
                                        struct list_head *compound_pagelist)
{
        struct page *page = NULL;
        struct folio *folio = NULL;
        pte_t *_pte;
        int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
        bool writable = false;

        for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = ptep_get(_pte);
                if (pte_none(pteval) || (pte_present(pteval) &&
                                is_zero_pfn(pte_pfn(pteval)))) {
                        ++none_or_zero;
                        if (!userfaultfd_armed(vma) &&
                            (!cc->is_khugepaged ||
                             none_or_zero <= khugepaged_max_ptes_none)) {
                                continue;
                        } else {
                                result = SCAN_EXCEED_NONE_PTE;
                                count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
                                goto out;
                        }
                }
                if (!pte_present(pteval)) {
                        result = SCAN_PTE_NON_PRESENT;
                        goto out;
                }
                if (pte_uffd_wp(pteval)) {
                        result = SCAN_PTE_UFFD_WP;
                        goto out;
                }
                page = vm_normal_page(vma, address, pteval);
                if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
                        result = SCAN_PAGE_NULL;
                        goto out;
                }

                folio = page_folio(page);
                VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);

                /* See hpage_collapse_scan_pmd(). */
                if (folio_likely_mapped_shared(folio)) {
                        ++shared;
                        if (cc->is_khugepaged &&
                            shared > khugepaged_max_ptes_shared) {
                                result = SCAN_EXCEED_SHARED_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
                                goto out;
                        }
                }

                if (folio_test_large(folio)) {
                        struct folio *f;

                        /*
                         * Check if we have dealt with the compound page
                         * already
                         */
                        list_for_each_entry(f, compound_pagelist, lru) {
                                if (folio == f)
                                        goto next;
                        }
                }

                /*
                 * We can do it before isolate_lru_page because the
                 * page can't be freed from under us. NOTE: PG_lock
                 * is needed to serialize against split_huge_page
                 * when invoked from the VM.
                 */
                if (!folio_trylock(folio)) {
                        result = SCAN_PAGE_LOCK;
                        goto out;
                }

                /*
                 * Check if the page has any GUP (or other external) pins.
                 *
                 * The page table that maps the page has been already unlinked
                 * from the page table tree and this process cannot get
                 * an additional pin on the page.
                 *
                 * New pins can come later if the page is shared across fork,
                 * but not from this process. The other process cannot write to
                 * the page, only trigger CoW.
                 */
                if (!is_refcount_suitable(folio)) {
                        folio_unlock(folio);
                        result = SCAN_PAGE_COUNT;
                        goto out;
                }

                /*
                 * Isolate the page to avoid collapsing an hugepage
                 * currently in use by the VM.
                 */
                if (!folio_isolate_lru(folio)) {
                        folio_unlock(folio);
                        result = SCAN_DEL_PAGE_LRU;
                        goto out;
                }
                node_stat_mod_folio(folio,
                                NR_ISOLATED_ANON + folio_is_file_lru(folio),
                                folio_nr_pages(folio));
                VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
                VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

                if (folio_test_large(folio))
                        list_add_tail(&folio->lru, compound_pagelist);
next:
                /*
                 * If collapse was initiated by khugepaged, check that there is
                 * enough young pte to justify collapsing the page
                 */
                if (cc->is_khugepaged &&
                    (pte_young(pteval) || folio_test_young(folio) ||
                     folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
                                                                     address)))
                        referenced++;

                if (pte_write(pteval))
                        writable = true;
        }

        /* The whole range was isolated; apply the range-wide checks. */
        if (unlikely(!writable)) {
                result = SCAN_PAGE_RO;
        } else if (unlikely(cc->is_khugepaged && !referenced)) {
                result = SCAN_LACK_REFERENCED_PAGE;
        } else {
                result = SCAN_SUCCEED;
                trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
                                                    referenced, writable, result);
                return result;
        }
out:
        /* Failure: release everything in [pte, _pte) plus the large folios. */
        release_pte_pages(pte, _pte, compound_pagelist);
        trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
                                            referenced, writable, result);
        return result;
}

/*
 * __collapse_huge_page_copy_succeeded - tear down the small-page mappings
 * after their contents were copied into the new hugepage
 * @pte: first of the HPAGE_PMD_NR source ptes
 * @vma: vma covering the range
 * @address: virtual address mapped by the first pte
 * @ptl: lock taken around each pte clear
 * @compound_pagelist: large folios isolated earlier; emptied here
 *
 * For none/zero-page entries one MM_ANONPAGES is accounted and a zero-page
 * pte is cleared. For every other entry the small folio (if any) is
 * released from isolation, the pte is cleared, the rmap is removed and the
 * page and its swap cache are freed. Finally every large folio on
 * @compound_pagelist is un-accounted, unlocked and put back on the LRU.
 */
static void __collapse_huge_page_copy_succeeded(pte_t *pte,
                                                struct vm_area_struct *vma,
                                                unsigned long address,
                                                spinlock_t *ptl,
                                                struct list_head *compound_pagelist)
{
        struct folio *src, *tmp;
        pte_t *_pte;
        pte_t pteval;

        for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
             _pte++, address += PAGE_SIZE) {
                pteval = ptep_get(_pte);
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
                        if (is_zero_pfn(pte_pfn(pteval))) {
                                /*
                                 * ptl mostly unnecessary.
                                 */
                                spin_lock(ptl);
                                ptep_clear(vma->vm_mm, address, _pte);
                                spin_unlock(ptl);
                                ksm_might_unmap_zero_page(vma->vm_mm, pteval);
                        }
                } else {
                        struct page *src_page = pte_page(pteval);

                        src = page_folio(src_page);
                        /* Large folios are released from compound_pagelist below. */
                        if (!folio_test_large(src))
                                release_pte_folio(src);
                        /*
                         * ptl mostly unnecessary, but preempt has to
                         * be disabled to update the per-cpu stats
                         * inside folio_remove_rmap_pte().
                         */
                        spin_lock(ptl);
                        ptep_clear(vma->vm_mm, address, _pte);
                        folio_remove_rmap_pte(src, src_page, vma);
                        spin_unlock(ptl);
                        free_page_and_swap_cache(src_page);
                }
        }

        list_for_each_entry_safe(src, tmp, compound_pagelist, lru) {
                list_del(&src->lru);
                node_stat_sub_folio(src, NR_ISOLATED_ANON +
                                folio_is_file_lru(src));
                folio_unlock(src);
                free_swap_cache(src);
                folio_putback_lru(src);
        }
}

/*
 * __collapse_huge_page_copy_failed - roll back after a failed content copy
 * @pte: first of the HPAGE_PMD_NR source ptes
 * @pmd: PMD slot to restore
 * @orig_pmd: saved PMD value that points at the original page table
 * @vma: vma covering the range
 * @compound_pagelist: large folios isolated earlier; emptied here
 *
 * Re-installs the original page table under the PMD lock and then releases
 * every folio that was isolated for the collapse.
 */
static void __collapse_huge_page_copy_failed(pte_t *pte,
                                             pmd_t *pmd,
                                             pmd_t orig_pmd,
                                             struct vm_area_struct *vma,
                                             struct list_head *compound_pagelist)
{
        spinlock_t *pmd_ptl;

        /*
         * Re-establish the PMD to point to the original page table
         * entry. Restoring PMD needs to be done prior to releasing
         * pages. Since pages are still isolated and locked here,
         * acquiring anon_vma_lock_write is unnecessary.
         */
        pmd_ptl = pmd_lock(vma->vm_mm, pmd);
        pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
        spin_unlock(pmd_ptl);
        /*
         * Release both raw and compound pages isolated
         * in __collapse_huge_page_isolate.
         */
        release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
}

/*
 * __collapse_huge_page_copy - copy the raw pages' contents into a hugepage.
 *
 * On success the raw pages are cleaned up; on failure the original page
 * table is restored and the isolated raw pages are released.
 * Returns SCAN_SUCCEED if every page was copied, SCAN_COPY_MC otherwise.
 *
 * @pte: starting of the PTEs to copy from
 * @folio: the new hugepage to copy contents to
 * @pmd: pointer to the new hugepage's PMD
 * @orig_pmd: the original raw pages' PMD
 * @vma: the original raw pages' virtual memory area
 * @address: starting address to copy
 * @ptl: lock on raw pages' PTEs
 * @compound_pagelist: list that stores compound pages
 */
static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
                pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
                unsigned long address, spinlock_t *ptl,
                struct list_head *compound_pagelist)
{
        int result = SCAN_SUCCEED;
        unsigned int idx;

        /*
         * Any single page copy can hit poisoned memory, so every
         * iteration has to be checked.
         */
        for (idx = 0; idx < HPAGE_PMD_NR; idx++) {
                unsigned long uaddr = address + idx * PAGE_SIZE;
                struct page *dst = folio_page(folio, idx);
                pte_t entry = ptep_get(pte + idx);

                if (pte_none(entry) || is_zero_pfn(pte_pfn(entry))) {
                        /* Nothing to copy: the destination is simply zeroed. */
                        clear_user_highpage(dst, uaddr);
                } else if (copy_mc_user_highpage(dst, pte_page(entry),
                                                 uaddr, vma) > 0) {
                        result = SCAN_COPY_MC;
                        break;
                }
        }

        if (unlikely(result != SCAN_SUCCEED)) {
                __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
                                                 compound_pagelist);
                return result;
        }

        __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
                                            compound_pagelist);
        return result;
}

/*
 * Sleep for up to khugepaged_alloc_sleep_millisecs while queued on
 * khugepaged_wait, in an interruptible and freezable state. Because the
 * task sits on khugepaged_wait, a wake-up on that queue ends the sleep
 * early.
 */
static void khugepaged_alloc_sleep(void)
{
        DEFINE_WAIT(wait);

        add_wait_queue(&khugepaged_wait, &wait);
        __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
        schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
        remove_wait_queue(&khugepaged_wait, &wait);
}

/*
 * Collapse control with is_khugepaged set, which makes the collapse code
 * enforce the khugepaged_max_ptes_* limits and the "referenced" check.
 * NOTE(review): presumably the instance used by the khugepaged thread
 * itself - confirm against the callers.
 */
struct collapse_control khugepaged_collapse_control = {
        .is_khugepaged = true,
};

/*
 * hpage_collapse_scan_abort - decide whether node @nid rules out a collapse
 * @nid: node to test
 * @cc: collapse control holding the per-node load counts
 *
 * Returns true only when node reclaim is enabled, @nid has no load counted
 * yet, and some node that does have load is further than
 * node_reclaim_distance away from @nid.
 */
static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
{
        int other;

        /*
         * Without node reclaim no extra effort is made to allocate
         * locally, and a node that is already counted must be acceptable.
         */
        if (!node_reclaim_enabled() || cc->node_load[nid])
                return false;

        for (other = 0; other < MAX_NUMNODES; other++) {
                if (cc->node_load[other] &&
                    node_distance(nid, other) > node_reclaim_distance)
                        return true;
        }

        return false;
}

/*
 * Non-zero when the TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG bit is set
 * in transparent_hugepage_flags.
 */
#define khugepaged_defrag()                                        \
        (transparent_hugepage_flags &                                \
         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))

/*
 * GFP mask for khugepaged hugepage allocations: GFP_TRANSHUGE when
 * khugepaged defrag is enabled (the allocation will then enter direct
 * reclaim/compaction if necessary), GFP_TRANSHUGE_LIGHT otherwise.
 */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
        if (khugepaged_defrag())
                return GFP_TRANSHUGE;

        return GFP_TRANSHUGE_LIGHT;
}

#ifdef CONFIG_NUMA
/*
 * hpage_collapse_find_target_node - pick the node to allocate the hugepage on
 * @cc: collapse control holding the per-node load counts
 *
 * Returns the lowest-numbered node with the highest load (node 0 when no
 * node has any load). Every online node whose load equals that maximum is
 * also set in cc->alloc_nmask.
 */
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
        int nid, best_nid = 0, best_load = 0;

        /* Strict comparison keeps the first node that reaches the maximum. */
        for (nid = 0; nid < MAX_NUMNODES; nid++) {
                if (cc->node_load[nid] <= best_load)
                        continue;

                best_load = cc->node_load[nid];
                best_nid = nid;
        }

        for_each_online_node(nid) {
                if (cc->node_load[nid] == best_load)
                        node_set(nid, cc->alloc_nmask);
        }

        return best_nid;
}
#else
/* Without CONFIG_NUMA the target is always node 0. */
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
        return 0;
}
#endif

/*
 * If mmap_lock was temporarily dropped, revalidate the vma
 * after re-taking mmap_lock.
 * Returns enum scan_result value.
 */

/*
 * hugepage_vma_revalidate - look the vma up again and re-check it
 * @mm: address space to search
 * @address: address that must still be collapsible
 * @expect_anon: also require an anonymous vma with an anon_vma attached
 * @vmap: out parameter; receives the result of find_vma(), which may be NULL
 * @cc: collapse control; sysfs settings are enforced only for khugepaged
 *
 * Returns SCAN_SUCCEED when the vma is still usable, otherwise the SCAN_*
 * code describing which check failed. *vmap is not written when the mm is
 * exiting or has THP disabled.
 */
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
                                   bool expect_anon,
                                   struct vm_area_struct **vmap,
                                   struct collapse_control *cc)
{
        struct vm_area_struct *vma;
        unsigned long tva_flags = 0;

        if (cc->is_khugepaged)
                tva_flags = TVA_ENFORCE_SYSFS;

        if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
                return SCAN_ANY_PROCESS;

        vma = find_vma(mm, address);
        *vmap = vma;
        if (!vma)
                return SCAN_VMA_NULL;

        if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
                return SCAN_ADDRESS_RANGE;
        if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
                return SCAN_VMA_CHECK;

        /*
         * Anon VMA expected, the address may be unmapped then
         * remapped to file after khugepaged reacquired the mmap_lock.
         *
         * thp_vma_allowable_order may return true for qualified file
         * vmas.
         */
        if (expect_anon && !(vma->anon_vma && vma_is_anonymous(vma)))
                return SCAN_PAGE_ANON;

        return SCAN_SUCCEED;
}

/*
 * find_pmd_or_thp_or_none - locate the PMD for @address and classify it
 * @mm: address space to search
 * @address: address whose PMD is wanted
 * @pmd: out parameter; receives the result of mm_find_pmd(), may be NULL
 *
 * Returns SCAN_SUCCEED for a present, regular page-table PMD,
 * SCAN_PMD_NONE for an empty PMD, SCAN_PMD_MAPPED for a transparent huge
 * PMD, and SCAN_PMD_NULL when no PMD exists or it is non-present, devmap
 * or bad.
 */
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
                                   unsigned long address,
                                   pmd_t **pmd)
{
        pmd_t *slot = mm_find_pmd(mm, address);
        pmd_t val;

        *pmd = slot;
        if (!slot)
                return SCAN_PMD_NULL;

        /* Classify a lockless snapshot of the entry. */
        val = pmdp_get_lockless(slot);
        if (pmd_none(val))
                return SCAN_PMD_NONE;
        if (!pmd_present(val))
                return SCAN_PMD_NULL;
        if (pmd_trans_huge(val))
                return SCAN_PMD_MAPPED;
        if (pmd_devmap(val) || pmd_bad(val))
                return SCAN_PMD_NULL;

        return SCAN_SUCCEED;
}

/*
 * check_pmd_still_valid - verify that @address still resolves to @pmd
 * @mm: address space to search
 * @address: address to look up again
 * @pmd: PMD pointer obtained earlier
 *
 * Returns the find_pmd_or_thp_or_none() result if the lookup itself
 * fails, SCAN_FAIL if it succeeds but yields a different PMD pointer,
 * and SCAN_SUCCEED otherwise.
 */
static int check_pmd_still_valid(struct mm_struct *mm,
                                 unsigned long address,
                                 pmd_t *pmd)
{
        pmd_t *current_pmd;
        int result;

        result = find_pmd_or_thp_or_none(mm, address, &current_pmd);
        if (result == SCAN_SUCCEED && current_pmd != pmd)
                result = SCAN_FAIL;

        return result;
}

/*
 * Bring missing pages in from swap, to complete THP collapse.
 * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
 *
 * Called and returns without pte mapped or spinlocks held.
 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
 *
 * @mm: address space being collapsed
 * @vma: vma covering the range
 * @haddr: start address of the HPAGE_PMD_NR-page range
 * @pmd: PMD that maps the range
 * @referenced: only passed through to the tracepoint
 */
static int __collapse_huge_page_swapin(struct mm_struct *mm,
                                       struct vm_area_struct *vma,
                                       unsigned long haddr, pmd_t *pmd,
                                       int referenced)
{
        int swapped_in = 0;
        vm_fault_t ret = 0;
        unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
        int result;
        pte_t *pte = NULL;
        spinlock_t *ptl;

        for (address = haddr; address < end; address += PAGE_SIZE) {
                struct vm_fault vmf = {
                        .vma = vma,
                        .address = address,
                        .pgoff = linear_page_index(vma, address),
                        .flags = FAULT_FLAG_ALLOW_RETRY,
                        .pmd = pmd,
                };

                /*
                 * A NULL pte means nothing is mapped (first pass, or
                 * do_swap_page() unmapped it), so map the pte for this
                 * address. Otherwise just step to the next pte of the
                 * existing mapping.
                 */
                if (!pte++) {
                        pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
                        if (!pte) {
                                mmap_read_unlock(mm);
                                result = SCAN_PMD_NULL;
                                goto out;
                        }
                }

                vmf.orig_pte = ptep_get_lockless(pte);
                if (!is_swap_pte(vmf.orig_pte))
                        continue;

                vmf.pte = pte;
                vmf.ptl = ptl;
                ret = do_swap_page(&vmf);
                /* Which unmaps pte (after perhaps re-checking the entry) */
                pte = NULL;

                /*
                 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
                 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
                 * we do not retry here and swap entry will remain in pagetable
                 * resulting in later failure.
                 */
                if (ret & VM_FAULT_RETRY) {
                        /* Likely, but not guaranteed, that page lock failed */
                        result = SCAN_PAGE_LOCK;
                        goto out;
                }
                if (ret & VM_FAULT_ERROR) {
                        mmap_read_unlock(mm);
                        result = SCAN_FAIL;
                        goto out;
                }
                swapped_in++;
        }

        /* Drop the mapping left over from the last non-swap entries. */
        if (pte)
                pte_unmap(pte);

        /* Drain LRU cache to remove extra pin on the swapped in pages */
        if (swapped_in)
                lru_add_drain();

        result = SCAN_SUCCEED;
out:
        trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
        return result;
}

/*
 * alloc_charge_folio - allocate a PMD-order folio and charge it to @mm
 * @foliop: out parameter; the new folio on success, NULL on failure
 * @mm: mm whose memcg is charged
 * @cc: collapse control; selects the gfp mask and the target node
 *
 * Returns SCAN_SUCCEED, SCAN_ALLOC_HUGE_PAGE_FAIL if the allocation failed,
 * or SCAN_CGROUP_CHARGE_FAIL if the charge failed (the folio is then put).
 */
static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
                              struct collapse_control *cc)
{
        struct folio *new_folio;
        gfp_t gfp = GFP_TRANSHUGE;
        int node;

        if (cc->is_khugepaged)
                gfp = alloc_hugepage_khugepaged_gfpmask();
        node = hpage_collapse_find_target_node(cc);

        new_folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
        if (!new_folio) {
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                *foliop = NULL;
                return SCAN_ALLOC_HUGE_PAGE_FAIL;
        }

        count_vm_event(THP_COLLAPSE_ALLOC);
        if (unlikely(mem_cgroup_charge(new_folio, mm, gfp))) {
                folio_put(new_folio);
                *foliop = NULL;
                return SCAN_CGROUP_CHARGE_FAIL;
        }

        count_memcg_folio_events(new_folio, THP_COLLAPSE_ALLOC, 1);
        *foliop = new_folio;

        return SCAN_SUCCEED;
}

/*
 * collapse_huge_page - replace one PMD range of small pages with a hugepage
 * @mm: address space to operate on
 * @address: HPAGE_PMD-aligned start of the range
 * @referenced: passed on to the swap-in step (used there for tracing)
 * @unmapped: non-zero if swap entries must be faulted in first
 * @cc: collapse control
 *
 * Entered with mmap_lock held for read; the lock is dropped for the
 * allocation and is always released by the time the function returns.
 *
 * Sequence: allocate and charge the hugepage, revalidate the vma and PMD
 * under the read lock, optionally swap pages in, then retake mmap_lock for
 * write, revalidate again, flush the PMD, isolate and copy the small pages
 * and finally install the huge PMD. On failure after the PMD was flushed,
 * the original page table is put back.
 *
 * Returns SCAN_SUCCEED or the SCAN_* code of the step that failed.
 */
static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
                              int referenced, int unmapped,
                              struct collapse_control *cc)
{
        LIST_HEAD(compound_pagelist);
        pmd_t *pmd, _pmd;
        pte_t *pte;
        pgtable_t pgtable;
        struct folio *folio;
        spinlock_t *pmd_ptl, *pte_ptl;
        int result = SCAN_FAIL;
        struct vm_area_struct *vma;
        struct mmu_notifier_range range;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        /*
         * Before allocating the hugepage, release the mmap_lock read lock.
         * The allocation can take potentially a long time if it involves
         * sync compaction, and we do not need to hold the mmap_lock during
         * that. We will recheck the vma after taking it again in write mode.
         */
        mmap_read_unlock(mm);

        result = alloc_charge_folio(&folio, mm, cc);
        if (result != SCAN_SUCCEED)
                goto out_nolock;

        mmap_read_lock(mm);
        result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
        if (result != SCAN_SUCCEED) {
                mmap_read_unlock(mm);
                goto out_nolock;
        }

        result = find_pmd_or_thp_or_none(mm, address, &pmd);
        if (result != SCAN_SUCCEED) {
                mmap_read_unlock(mm);
                goto out_nolock;
        }

        if (unmapped) {
                /*
                 * __collapse_huge_page_swapin will return with mmap_lock
                 * released when it fails. So we jump out_nolock directly in
                 * that case.  Continuing to collapse causes inconsistency.
                 */
                result = __collapse_huge_page_swapin(mm, vma, address, pmd,
                                                     referenced);
                if (result != SCAN_SUCCEED)
                        goto out_nolock;
        }

        mmap_read_unlock(mm);
        /*
         * Prevent all access to pagetables with the exception of
         * gup_fast later handled by the ptep_clear_flush and the VM
         * handled by the anon_vma lock + PG_lock.
         *
         * UFFDIO_MOVE is prevented to race as well thanks to the
         * mmap_lock.
         */
        mmap_write_lock(mm);
        result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
        if (result != SCAN_SUCCEED)
                goto out_up_write;
        /* check if the pmd is still valid */
        result = check_pmd_still_valid(mm, address, pmd);
        if (result != SCAN_SUCCEED)
                goto out_up_write;

        vma_start_write(vma);
        anon_vma_lock_write(vma->anon_vma);

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
                                address + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
        /*
         * This removes any huge TLB entry from the CPU so we won't allow
         * huge and small TLB entries for the same virtual address to
         * avoid the risk of CPU bugs in that area.
         *
         * Parallel GUP-fast is fine since GUP-fast will back off when
         * it detects PMD is changed.
         */
        _pmd = pmdp_collapse_flush(vma, address, pmd);
        spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(&range);
        tlb_remove_table_sync_one();

        /* _pmd holds the old PMD value; the page table is reached through it. */
        pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
        if (pte) {
                result = __collapse_huge_page_isolate(vma, address, pte, cc,
                                                      &compound_pagelist);
                spin_unlock(pte_ptl);
        } else {
                result = SCAN_PMD_NULL;
        }

        if (unlikely(result != SCAN_SUCCEED)) {
                if (pte)
                        pte_unmap(pte);
                spin_lock(pmd_ptl);
                BUG_ON(!pmd_none(*pmd));
                /*
                 * We can only use set_pmd_at when establishing
                 * hugepmds and never for establishing regular pmds that
                 * points to regular pagetables. Use pmd_populate for that
                 */
                pmd_populate(mm, pmd, pmd_pgtable(_pmd));
                spin_unlock(pmd_ptl);
                anon_vma_unlock_write(vma->anon_vma);
                goto out_up_write;
        }

        /*
         * All pages are isolated and locked so anon_vma rmap
         * can't run anymore.
         */
        anon_vma_unlock_write(vma->anon_vma);

        /* On failure the copy helper has already restored the old PMD. */
        result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
                                           vma, address, pte_ptl,
                                           &compound_pagelist);
        pte_unmap(pte);
        if (unlikely(result != SCAN_SUCCEED))
                goto out_up_write;

        /*
         * The smp_wmb() inside __folio_mark_uptodate() ensures the
         * copy_huge_page writes become visible before the set_pmd_at()
         * write.
         */
        __folio_mark_uptodate(folio);
        pgtable = pmd_pgtable(_pmd);

        _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
        _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);

        spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
        folio_add_new_anon_rmap(folio, vma, address);
        folio_add_lru_vma(folio, vma);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        update_mmu_cache_pmd(vma, address, pmd);
        spin_unlock(pmd_ptl);

        /* The folio is mapped now; keep the exit path from putting it. */
        folio = NULL;

        result = SCAN_SUCCEED;
out_up_write:
        mmap_write_unlock(mm);
out_nolock:
        if (folio)
                folio_put(folio);
        trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
        return result;
}

/*
 * hpage_collapse_scan_pmd - scan the PTEs under one PMD and, if they all
 * qualify, try to collapse the range into a huge page.
 *
 * @mm: address space being scanned
 * @vma: VMA covering @address
 * @address: start of the range; must be HPAGE_PMD_SIZE aligned
 * @mmap_locked: set to false when collapse_huge_page() has been called,
 *               because that function returns with the mmap_lock released
 * @cc: collapse context; cc->node_load[] and cc->alloc_nmask are reset here
 *      and node_load[] is filled with the node of every page examined
 *
 * Walks all HPAGE_PMD_NR PTEs under the PTE lock and bails out with a
 * specific SCAN_* code on the first PTE or folio that disqualifies the
 * range.  The max_ptes_swap/none/shared limits and the "enough referenced
 * pages" heuristic only apply when cc->is_khugepaged is set.
 *
 * Return: SCAN_SUCCEED if the range was collapsed, otherwise the SCAN_*
 * code describing why it was not (either from the scan itself or from
 * collapse_huge_page()).
 */
static int hpage_collapse_scan_pmd(struct mm_struct *mm,
                                   struct vm_area_struct *vma,
                                   unsigned long address, bool *mmap_locked,
                                   struct collapse_control *cc)
{
        pmd_t *pmd;
        pte_t *pte, *_pte;
        int result = SCAN_FAIL, referenced = 0;
        int none_or_zero = 0, shared = 0;
        struct page *page = NULL;
        struct folio *folio = NULL;
        unsigned long _address;
        spinlock_t *ptl;
        int node = NUMA_NO_NODE, unmapped = 0;
        bool writable = false;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        result = find_pmd_or_thp_or_none(mm, address, &pmd);
        if (result != SCAN_SUCCEED)
                goto out;

        memset(cc->node_load, 0, sizeof(cc->node_load));
        nodes_clear(cc->alloc_nmask);
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!pte) {
                result = SCAN_PMD_NULL;
                goto out;
        }

        for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = ptep_get(_pte);
                if (is_swap_pte(pteval)) {
                        ++unmapped;
                        if (!cc->is_khugepaged ||
                            unmapped <= khugepaged_max_ptes_swap) {
                                /*
                                 * Always be strict with uffd-wp
                                 * enabled swap entries.  Please see
                                 * comment below for pte_uffd_wp().
                                 */
                                if (pte_swp_uffd_wp_any(pteval)) {
                                        result = SCAN_PTE_UFFD_WP;
                                        goto out_unmap;
                                }
                                continue;
                        } else {
                                result = SCAN_EXCEED_SWAP_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
                                goto out_unmap;
                        }
                }
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        ++none_or_zero;
                        if (!userfaultfd_armed(vma) &&
                            (!cc->is_khugepaged ||
                             none_or_zero <= khugepaged_max_ptes_none)) {
                                continue;
                        } else {
                                result = SCAN_EXCEED_NONE_PTE;
                                count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
                                goto out_unmap;
                        }
                }
                if (pte_uffd_wp(pteval)) {
                        /*
                         * Don't collapse the page if any of the small
                         * PTEs are armed with uffd write protection.
                         * Here we can also mark the new huge pmd as
                         * write protected if any of the small ones is
                         * marked but that could bring unknown
                         * userfault messages that falls outside of
                         * the registered range.  So, just be simple.
                         */
                        result = SCAN_PTE_UFFD_WP;
                        goto out_unmap;
                }
                if (pte_write(pteval))
                        writable = true;

                page = vm_normal_page(vma, _address, pteval);
                if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
                        result = SCAN_PAGE_NULL;
                        goto out_unmap;
                }
                folio = page_folio(page);

                if (!folio_test_anon(folio)) {
                        result = SCAN_PAGE_ANON;
                        goto out_unmap;
                }

                /*
                 * We treat a single page as shared if any part of the THP
                 * is shared. "False negatives" from
                 * folio_likely_mapped_shared() are not expected to matter
                 * much in practice.
                 */
                if (folio_likely_mapped_shared(folio)) {
                        ++shared;
                        if (cc->is_khugepaged &&
                            shared > khugepaged_max_ptes_shared) {
                                result = SCAN_EXCEED_SHARED_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
                                goto out_unmap;
                        }
                }

                /*
                 * Record which node the original page is from and save this
                 * information to cc->node_load[].
                 * Khugepaged will allocate hugepage from the node has the max
                 * hit record.
                 */
                node = folio_nid(folio);
                if (hpage_collapse_scan_abort(node, cc)) {
                        result = SCAN_SCAN_ABORT;
                        goto out_unmap;
                }
                cc->node_load[node]++;
                if (!folio_test_lru(folio)) {
                        result = SCAN_PAGE_LRU;
                        goto out_unmap;
                }
                if (folio_test_locked(folio)) {
                        result = SCAN_PAGE_LOCK;
                        goto out_unmap;
                }

                /*
                 * Check if the page has any GUP (or other external) pins.
                 *
                 * Here the check may be racy:
                 * it may see folio_mapcount() > folio_ref_count().
                 * But such case is ephemeral we could always retry collapse
                 * later.  However it may report false positive if the page
                 * has excessive GUP pins (i.e. 512).  Anyway the same check
                 * will be done again later the risk seems low.
                 */
                if (!is_refcount_suitable(folio)) {
                        result = SCAN_PAGE_COUNT;
                        goto out_unmap;
                }

                /*
                 * If collapse was initiated by khugepaged, check that there is
                 * enough young pte to justify collapsing the page.
                 *
                 * The notifier must be asked about the page this PTE maps
                 * (_address), not the start of the PMD range (address);
                 * otherwise every iteration would test the same first page.
                 */
                if (cc->is_khugepaged &&
                    (pte_young(pteval) || folio_test_young(folio) ||
                     folio_test_referenced(folio) ||
                     mmu_notifier_test_young(vma->vm_mm, _address)))
                        referenced++;
        }
        if (!writable) {
                result = SCAN_PAGE_RO;
        } else if (cc->is_khugepaged &&
                   (!referenced ||
                    (unmapped && referenced < HPAGE_PMD_NR / 2))) {
                result = SCAN_LACK_REFERENCED_PAGE;
        } else {
                result = SCAN_SUCCEED;
        }
out_unmap:
        pte_unmap_unlock(pte, ptl);
        if (result == SCAN_SUCCEED) {
                result = collapse_huge_page(mm, address, referenced,
                                            unmapped, cc);
                /* collapse_huge_page will return with the mmap_lock released */
                *mmap_locked = false;
        }
out:
        /*
         * folio is still NULL if we bailed out before looking at any page;
         * do not form a member address from a NULL folio in that case.
         */
        trace_mm_khugepaged_scan_pmd(mm, folio ? &folio->page : NULL,
                                     writable, referenced,
                                     none_or_zero, result, unmapped);
        return result;
}

/*
 * Release @mm_slot if its mm has already exited; otherwise leave it alone.
 * Caller must hold khugepaged_mm_lock.
 */
static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
{
        struct mm_slot *base = &mm_slot->slot;
        struct mm_struct *mm = base->mm;

        lockdep_assert_held(&khugepaged_mm_lock);

        /* Nothing to reclaim while the mm is still alive. */
        if (!hpage_collapse_test_exit(mm))
                return;

        /* Unhook the slot from both the hash table and the scan list. */
        hash_del(&base->hash);
        list_del(&base->mm_node);

        /*
         * MMF_VM_HUGEPAGE is deliberately left set in mm->flags: clearing
         * it is not strictly needed because the mm exited already.
         */

        /* khugepaged_mm_lock is actually not necessary for the below */
        mm_slot_free(mm_slot_cache, mm_slot);
        mmdrop(mm);
}

#ifdef CONFIG_SHMEM
/*
 * Map @hpage at @addr in @vma with a huge PMD via do_set_pmd().
 *
 * hpage must be locked, and mmap_lock must be held.  On success an extra
 * reference is taken on @hpage and SCAN_SUCCEED is returned; if
 * do_set_pmd() reports anything non-zero the result is SCAN_FAIL.
 */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
                        pmd_t *pmdp, struct page *hpage)
{
        /* Members not named here (including .flags) are zero-initialized. */
        struct vm_fault fault = {
                .vma = vma,
                .address = addr,
                .pmd = pmdp,
        };
        int ret = SCAN_FAIL;

        VM_BUG_ON(!PageTransHuge(hpage));
        mmap_assert_locked(vma->vm_mm);

        if (!do_set_pmd(&fault, hpage)) {
                get_page(hpage);
                ret = SCAN_SUCCEED;
        }

        return ret;
}

/**
 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
 * address addr.
 *
 * @mm: process address space where collapse happens
 * @addr: THP collapse address; rounded down to a huge-PMD boundary internally
 * @install_pmd: If a huge PMD should be installed
 *
 * This function checks whether all the PTEs in the PMD are pointing to the
 * right THP. If so, retract the page table so the THP can refault in with
 * as pmd-mapped. Possibly install a huge PMD mapping the THP.
 *
 * The caller must hold the mmap_lock of @mm (asserted below).
 *
 * Return: SCAN_SUCCEED when the page table was retracted (and, if requested,
 * the huge PMD installed); SCAN_PMD_MAPPED if the range is already
 * PMD-mapped; otherwise a SCAN_* code describing why the collapse was not
 * done (SCAN_VMA_CHECK, SCAN_PTE_UFFD_WP, SCAN_PAGE_NULL, SCAN_PAGE_COMPOUND,
 * SCAN_PTE_NON_PRESENT, SCAN_FAIL, or whatever find_pmd_or_thp_or_none() or
 * set_huge_pmd() reported).
 */
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
                            bool install_pmd)
{
        struct mmu_notifier_range range;
        /* true once invalidate_range_start() ran, so abort must pair it */
        bool notified = false;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
        struct vm_area_struct *vma = vma_lookup(mm, haddr);
        struct folio *folio;
        pte_t *start_pte, *pte;
        pmd_t *pmd, pgt_pmd;
        /* pml stays NULL until the pmd lock is actually taken */
        spinlock_t *pml = NULL, *ptl;
        /* nr_ptes counts the PTEs cleared in step 2 */
        int nr_ptes = 0, result = SCAN_FAIL;
        int i;

        mmap_assert_locked(mm);

        /* First check VMA found, in case page tables are being torn down */
        if (!vma || !vma->vm_file ||
            !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
                return SCAN_VMA_CHECK;

        /* Fast check before locking page if already PMD-mapped */
        result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
        if (result == SCAN_PMD_MAPPED)
                return result;

        /*
         * If we are here, we've succeeded in replacing all the native pages
         * in the page cache with a single hugepage. If a mm were to fault-in
         * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
         * and map it by a PMD, regardless of sysfs THP settings. As such, let's
         * analogously elide sysfs THP settings here.
         */
        if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
                return SCAN_VMA_CHECK;

        /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
        if (userfaultfd_wp(vma))
                return SCAN_PTE_UFFD_WP;

        /* Look up and lock the page-cache folio backing haddr in this file. */
        folio = filemap_lock_folio(vma->vm_file->f_mapping,
                               linear_page_index(vma, haddr));
        if (IS_ERR(folio))
                return SCAN_PAGE_NULL;

        /* Only a folio of exactly PMD order can be mapped by a huge PMD. */
        if (folio_order(folio) != HPAGE_PMD_ORDER) {
                result = SCAN_PAGE_COMPOUND;
                goto drop_folio;
        }

        /* Re-check the pmd now that the folio is locked. */
        result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
        switch (result) {
        case SCAN_SUCCEED:
                break;
        case SCAN_PMD_NONE:
                /*
                 * All pte entries have been removed and pmd cleared.
                 * Skip all the pte checks and just update the pmd mapping.
                 */
                goto maybe_install_pmd;
        default:
                goto drop_folio;
        }

        result = SCAN_FAIL;
        start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
        if (!start_pte)                /* mmap_lock + page lock should prevent this */
                goto drop_folio;

        /* step 1: check all mapped PTEs are to the right huge page */
        for (i = 0, addr = haddr, pte = start_pte;
             i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
                struct page *page;
                pte_t ptent = ptep_get(pte);

                /* empty pte, skip */
                if (pte_none(ptent))
                        continue;

                /* page swapped out, abort */
                if (!pte_present(ptent)) {
                        result = SCAN_PTE_NON_PRESENT;
                        goto abort;
                }

                page = vm_normal_page(vma, addr, ptent);
                if (WARN_ON_ONCE(page && is_zone_device_page(page)))
                        page = NULL;
                /*
                 * Note that uprobe, debugger, or MAP_PRIVATE may change the
                 * page table, but the new page will not be a subpage of hpage.
                 */
                if (folio_page(folio, i) != page)
                        goto abort;
        }

        /*
         * Drop ptl around the notifier call; step 2 retakes it and
         * re-validates every PTE.
         */
        pte_unmap_unlock(start_pte, ptl);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                haddr, haddr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);
        notified = true;

        /*
         * pmd_lock covers a wider range than ptl, and (if split from mm's
         * page_table_lock) ptl nests inside pml. The less time we hold pml,
         * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
         * inserts a valid as-if-COWed PTE without even looking up page cache.
         * So page lock of folio does not protect from it, so we must not drop
         * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
         */
        if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
                pml = pmd_lock(mm, pmd);

        start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
        if (!start_pte)                /* mmap_lock + page lock should prevent this */
                goto abort;
        /*
         * Take ptl unless it is the very lock already held as pml; when pml
         * is held and distinct, ptl nests inside it.
         */
        if (!pml)
                spin_lock(ptl);
        else if (ptl != pml)
                spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

        /* step 2: clear page table and adjust rmap */
        for (i = 0, addr = haddr, pte = start_pte;
             i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
                struct page *page;
                pte_t ptent = ptep_get(pte);

                if (pte_none(ptent))
                        continue;
                /*
                 * We dropped ptl after the first scan, to do the mmu_notifier:
                 * page lock stops more PTEs of the folio being faulted in, but
                 * does not stop write faults COWing anon copies from existing
                 * PTEs; and does not stop those being swapped out or migrated.
                 */
                if (!pte_present(ptent)) {
                        result = SCAN_PTE_NON_PRESENT;
                        goto abort;
                }
                page = vm_normal_page(vma, addr, ptent);
                if (folio_page(folio, i) != page)
                        goto abort;

                /*
                 * Must clear entry, or a racing truncate may re-remove it.
                 * TLB flush can be left until pmdp_collapse_flush() does it.
                 * PTE dirty? Shmem page is already dirty; file is read-only.
                 */
                ptep_clear(mm, addr, pte);
                folio_remove_rmap_pte(folio, page, vma);
                nr_ptes++;
        }

        pte_unmap(start_pte);
        /* In the uffd-private case (pml held) ptl is kept until step 4. */
        if (!pml)
                spin_unlock(ptl);

        /* step 3: set proper refcount and mm_counters. */
        if (nr_ptes) {
                folio_ref_sub(folio, nr_ptes);
                add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
        }

        /* step 4: remove empty page table */
        if (!pml) {
                pml = pmd_lock(mm, pmd);
                if (ptl != pml)
                        spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
        }
        /* Both pml and ptl are held here, whichever path took them. */
        pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
        pmdp_get_lockless_sync();
        if (ptl != pml)
                spin_unlock(ptl);
        spin_unlock(pml);

        mmu_notifier_invalidate_range_end(&range);

        /* Account for and release the now-detached page table. */
        mm_dec_nr_ptes(mm);
        page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
        pte_free_defer(mm, pmd_pgtable(pgt_pmd));

maybe_install_pmd:
        /* step 5: install pmd entry */
        result = install_pmd
                        ? set_huge_pmd(vma, haddr, pmd, &folio->page)
                        : SCAN_SUCCEED;
        goto drop_folio;
abort:
        /*
         * Step 2 may have cleared some PTEs before bailing out: flush the
         * TLB for them here (the pmdp_collapse_flush() that would have done
         * it is not reached) and fix up refcount and counters as step 3 does.
         */
        if (nr_ptes) {
                flush_tlb_mm(mm);
                folio_ref_sub(folio, nr_ptes);
                add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
        }
        if (start_pte)
                pte_unmap_unlock(start_pte, ptl);
        /* If pml is the same lock as ptl, it is not unlocked a second time. */
        if (pml && pml != ptl)
                spin_unlock(pml);
        if (notified)
                mmu_notifier_invalidate_range_end(&range);
drop_folio:
        /* Undo filemap_lock_folio(): unlock and drop its reference. */
        folio_unlock(folio);
        folio_put(folio);
        return result;
}

/*
 * retract_page_tables - remove the PTE page tables that map the huge-page
 * sized extent starting at @pgoff of @mapping.
 *
 * Walks every VMA in mapping->i_mmap that covers @pgoff, under
 * i_mmap_lock_read().  For each VMA where the extent is PMD-aligned, fully
 * contained, has a regular page table, and is neither anon_vma-backed nor
 * uffd-wp registered, the pmd entry is cleared and flushed and the page
 * table it pointed to is handed to pte_free_defer().  VMAs that do not
 * qualify are skipped.
 */
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
        struct vm_area_struct *vma;

        i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                struct mmu_notifier_range range;
                struct mm_struct *mm;
                unsigned long addr;
                pmd_t *pmd, pgt_pmd;
                spinlock_t *pml;
                spinlock_t *ptl;
                /*
                 * Set when the recheck under the page table locks finds an
                 * anon_vma or uffd-wp; pgt_pmd is then never assigned and
                 * must not be used.
                 */
                bool skipped_uffd = false;

                /*
                 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
                 * got written to. These VMAs are likely not worth removing
                 * page tables from, as PMD-mapping is likely to be split later.
                 */
                if (READ_ONCE(vma->anon_vma))
                        continue;

                /*
                 * Virtual address of @pgoff in this VMA; skip unless it is
                 * PMD-aligned and the whole huge-page range fits in the VMA.
                 */
                addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
                if (addr & ~HPAGE_PMD_MASK ||
                    vma->vm_end < addr + HPAGE_PMD_SIZE)
                        continue;

                mm = vma->vm_mm;
                /* Only a pmd that points to a regular page table qualifies. */
                if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
                        continue;

                if (hpage_collapse_test_exit(mm))
                        continue;
                /*
                 * When a vma is registered with uffd-wp, we cannot recycle
                 * the page table because there may be pte markers installed.
                 * Other vmas can still have the same file mapped hugely, but
                 * skip this one: it will always be mapped in small page size
                 * for uffd-wp registered ranges.
                 */
                if (userfaultfd_wp(vma))
                        continue;

                /* PTEs were notified when unmapped; but now for the PMD? */
                mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                        addr, addr + HPAGE_PMD_SIZE);
                mmu_notifier_invalidate_range_start(&range);

                /*
                 * Take the pmd lock first, then the pte lock nested inside
                 * it when the two are different locks.
                 */
                pml = pmd_lock(mm, pmd);
                ptl = pte_lockptr(mm, pmd);
                if (ptl != pml)
                        spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

                /*
                 * Huge page lock is still held, so normally the page table
                 * must remain empty; and we have already skipped anon_vma
                 * and userfaultfd_wp() vmas.  But since the mmap_lock is not
                 * held, it is still possible for a racing userfaultfd_ioctl()
                 * to have inserted ptes or markers.  Now that we hold ptlock,
                 * repeating the anon_vma check protects from one category,
                 * and repeating the userfaultfd_wp() check from another.
                 */
                if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
                        skipped_uffd = true;
                } else {
                        pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
                        pmdp_get_lockless_sync();
                }

                if (ptl != pml)
                        spin_unlock(ptl);
                spin_unlock(pml);

                /* Pairs with invalidate_range_start() above on both paths. */
                mmu_notifier_invalidate_range_end(&range);

                /* Account for and release the detached page table. */
                if (!skipped_uffd) {
                        mm_dec_nr_ptes(mm);
                        page_table_check_pte_clear_range(mm, addr, pgt_pmd);
                        pte_free_defer(mm, pmd_pgtable(pgt_pmd));
                }
        }
        i_mmap_unlock_read(mapping);
}

/**
 * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
 *
 * @mm: process address space where collapse happens
 * @addr: virtual collapse start address
 * @file: file whose page cache is being collapsed
 * @start: collapse start offset in @file, as a page index (HPAGE_PMD_NR aligned)
 * @cc: collapse context and scratchpad
 *
 * Basic scheme is simple, details are more complex:
 *  - allocate and lock a new huge page;
 *  - scan page cache, locking old pages
 *    + swap/gup in pages if necessary;
 *  - copy data to new page
 *  - handle shmem holes
 *    + re-validate that holes weren't filled by someone else
 *    + check for userfaultfd
 *  - finalize updates to the page cache;
 *  - if replacing succeeds:
 *    + unlock huge page;
 *    + free old pages;
 *  - if replacing failed;
 *    + unlock old pages
 *    + unlock and free huge page;
 */
static int collapse_file(struct mm_struct *mm, unsigned long addr,
                         struct file *file, pgoff_t start,
                         struct collapse_control *cc)
{
        struct address_space *mapping = file->f_mapping;
        struct page *dst;
        struct folio *folio, *tmp, *new_folio;
        pgoff_t index = 0, end = start + HPAGE_PMD_NR;
        LIST_HEAD(pagelist);
        XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
        int nr_none = 0, result = SCAN_SUCCEED;
        bool is_shmem = shmem_file(file);

        VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
        VM_BUG_ON(start & (HPAGE_PMD_NR - 1));

        result = alloc_charge_folio(&new_folio, mm, cc);
        if (result != SCAN_SUCCEED)
                goto out;

        __folio_set_locked(new_folio);
        if (is_shmem)
                __folio_set_swapbacked(new_folio);
        new_folio->index = start;
        new_folio->mapping = mapping;

        /*
         * Ensure we have slots for all the pages in the range.  This is
         * almost certainly a no-op because most of the pages must be present
         */
        do {
                xas_lock_irq(&xas);
                xas_create_range(&xas);
                if (!xas_error(&xas))
                        break;
                xas_unlock_irq(&xas);
                if (!xas_nomem(&xas, GFP_KERNEL)) {
                        result = SCAN_FAIL;
                        goto rollback;
                }
        } while (1);

        for (index = start; index < end; index++) {
                xas_set(&xas, index);
                folio = xas_load(&xas);

                VM_BUG_ON(index != xas.xa_index);
                if (is_shmem) {
                        if (!folio) {
                                /*
                                 * Stop if extent has been truncated or
                                 * hole-punched, and is now completely
                                 * empty.
                                 */
                                if (index == start) {
                                        if (!xas_next_entry(&xas, end - 1)) {
                                                result = SCAN_TRUNCATED;
                                                goto xa_locked;
                                        }
                                }
                                nr_none++;
                                continue;
                        }

                        if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
                                xas_unlock_irq(&xas);
                                /* swap in or instantiate fallocated page */
                                if (shmem_get_folio(mapping->host, index,
                                                &folio, SGP_NOALLOC)) {
                                        result = SCAN_FAIL;
                                        goto xa_unlocked;
                                }
                                /* drain lru cache to help isolate_lru_page() */
                                lru_add_drain();
                        } else if (folio_trylock(folio)) {
                                folio_get(folio);
                                xas_unlock_irq(&xas);
                        } else {
                                result = SCAN_PAGE_LOCK;
                                goto xa_locked;
                        }
                } else {        /* !is_shmem */
                        if (!folio || xa_is_value(folio)) {
                                xas_unlock_irq(&xas);
                                page_cache_sync_readahead(mapping, &file->f_ra,
                                                          file, index,
                                                          end - index);
                                /* drain lru cache to help isolate_lru_page() */
                                lru_add_drain();
                                folio = filemap_lock_folio(mapping, index);
                                if (IS_ERR(folio)) {
                                        result = SCAN_FAIL;
                                        goto xa_unlocked;
                                }
                        } else if (folio_test_dirty(folio)) {
                                /*
                                 * khugepaged only works on read-only fd,
                                 * so this page is dirty because it hasn't
                                 * been flushed since first write. There
                                 * won't be new dirty pages.
                                 *
                                 * Trigger async flush here and hope the
                                 * writeback is done when khugepaged
                                 * revisits this page.
                                 *
                                 * This is a one-off situation. We are not
                                 * forcing writeback in loop.
                                 */
                                xas_unlock_irq(&xas);
                                filemap_flush(mapping);
                                result = SCAN_FAIL;
                                goto xa_unlocked;
                        } else if (folio_test_writeback(folio)) {
                                xas_unlock_irq(&xas);
                                result = SCAN_FAIL;
                                goto xa_unlocked;
                        } else if (folio_trylock(folio)) {
                                folio_get(folio);
                                xas_unlock_irq(&xas);
                        } else {
                                result = SCAN_PAGE_LOCK;
                                goto xa_locked;
                        }
                }

                /*
                 * The folio must be locked, so we can drop the i_pages lock
                 * without racing with truncate.
                 */
                VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

                /* make sure the folio is up to date */
                if (unlikely(!folio_test_uptodate(folio))) {
                        result = SCAN_FAIL;
                        goto out_unlock;
                }

                /*
                 * If file was truncated then extended, or hole-punched, before
                 * we locked the first folio, then a THP might be there already.
                 * This will be discovered on the first iteration.
                 */
                if (folio_test_large(folio)) {
                        result = folio_order(folio) == HPAGE_PMD_ORDER &&
                                        folio->index == start
                                        /* Maybe PMD-mapped */
                                        ? SCAN_PTE_MAPPED_HUGEPAGE
                                        : SCAN_PAGE_COMPOUND;
                        goto out_unlock;
                }

                if (folio_mapping(folio) != mapping) {
                        result = SCAN_TRUNCATED;
                        goto out_unlock;
                }

                if (!is_shmem && (folio_test_dirty(folio) ||
                                  folio_test_writeback(folio))) {
                        /*
                         * khugepaged only works on read-only fd, so this
                         * folio is dirty because it hasn't been flushed
                         * since first write.
                         */
                        result = SCAN_FAIL;
                        goto out_unlock;
                }

                if (!folio_isolate_lru(folio)) {
                        result = SCAN_DEL_PAGE_LRU;
                        goto out_unlock;
                }

                if (!filemap_release_folio(folio, GFP_KERNEL)) {
                        result = SCAN_PAGE_HAS_PRIVATE;
                        folio_putback_lru(folio);
                        goto out_unlock;
                }

                if (folio_mapped(folio))
                        try_to_unmap(folio,
                                        TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);

                xas_lock_irq(&xas);

                VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio);

                /*
                 * We control three references to the folio:
                 *  - we hold a pin on it;
                 *  - one reference from page cache;
                 *  - one from lru_isolate_folio;
                 * If those are the only references, then any new usage
                 * of the folio will have to fetch it from the page
                 * cache. That requires locking the folio to handle
                 * truncate, so any new usage will be blocked until we
                 * unlock folio after collapse/during rollback.
                 */
                if (folio_ref_count(folio) != 3) {
                        result = SCAN_PAGE_COUNT;
                        xas_unlock_irq(&xas);
                        folio_putback_lru(folio);
                        goto out_unlock;
                }

                /*
                 * Accumulate the folios that are being collapsed.
                 */
                list_add_tail(&folio->lru, &pagelist);
                continue;
out_unlock:
                folio_unlock(folio);
                folio_put(folio);
                goto xa_unlocked;
        }

        if (!is_shmem) {
                filemap_nr_thps_inc(mapping);
                /*
                 * Paired with smp_mb() in do_dentry_open() to ensure
                 * i_writecount is up to date and the update to nr_thps is
                 * visible. Ensures the page cache will be truncated if the
                 * file is opened writable.
                 */
                smp_mb();
                if (inode_is_open_for_write(mapping->host)) {
                        result = SCAN_FAIL;
                        filemap_nr_thps_dec(mapping);
                }
        }

xa_locked:
        xas_unlock_irq(&xas);
xa_unlocked:

        /*
         * If collapse is successful, flush must be done now before copying.
         * If collapse is unsuccessful, does flush actually need to be done?
         * Do it anyway, to clear the state.
         */
        try_to_unmap_flush();

        if (result == SCAN_SUCCEED && nr_none &&
            !shmem_charge(mapping->host, nr_none))
                result = SCAN_FAIL;
        if (result != SCAN_SUCCEED) {
                nr_none = 0;
                goto rollback;
        }

        /*
         * The old folios are locked, so they won't change anymore.
         */
        index = start;
        dst = folio_page(new_folio, 0);
        list_for_each_entry(folio, &pagelist, lru) {
                while (index < folio->index) {
                        clear_highpage(dst);
                        index++;
                        dst++;
                }
                if (copy_mc_highpage(dst, folio_page(folio, 0)) > 0) {
                        result = SCAN_COPY_MC;
                        goto rollback;
                }
                index++;
                dst++;
        }
        while (index < end) {
                clear_highpage(dst);
                index++;
                dst++;
        }

        if (nr_none) {
                struct vm_area_struct *vma;
                int nr_none_check = 0;

                i_mmap_lock_read(mapping);
                xas_lock_irq(&xas);

                xas_set(&xas, start);
                for (index = start; index < end; index++) {
                        if (!xas_next(&xas)) {
                                xas_store(&xas, XA_RETRY_ENTRY);
                                if (xas_error(&xas)) {
                                        result = SCAN_STORE_FAILED;
                                        goto immap_locked;
                                }
                                nr_none_check++;
                        }
                }

                if (nr_none != nr_none_check) {
                        result = SCAN_PAGE_FILLED;
                        goto immap_locked;
                }

                /*
                 * If userspace observed a missing page in a VMA with
                 * a MODE_MISSING userfaultfd, then it might expect a
                 * UFFD_EVENT_PAGEFAULT for that page. If so, we need to
                 * roll back to avoid suppressing such an event. Since
                 * wp/minor userfaultfds don't give userspace any
                 * guarantees that the kernel doesn't fill a missing
                 * page with a zero page, so they don't matter here.
                 *
                 * Any userfaultfds registered after this point will
                 * not be able to observe any missing pages due to the
                 * previously inserted retry entries.
                 */
                vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
                        if (userfaultfd_missing(vma)) {
                                result = SCAN_EXCEED_NONE_PTE;
                                goto immap_locked;
                        }
                }

immap_locked:
                i_mmap_unlock_read(mapping);
                if (result != SCAN_SUCCEED) {
                        xas_set(&xas, start);
                        for (index = start; index < end; index++) {
                                if (xas_next(&xas) == XA_RETRY_ENTRY)
                                        xas_store(&xas, NULL);
                        }

                        xas_unlock_irq(&xas);
                        goto rollback;
                }
        } else {
                xas_lock_irq(&xas);
        }

        if (is_shmem)
                __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
        else
                __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);

        if (nr_none) {
                __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
                /* nr_none is always 0 for non-shmem. */
                __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
        }

        /*
         * Mark new_folio as uptodate before inserting it into the
         * page cache so that it isn't mistaken for an fallocated but
         * unwritten page.
         */
        folio_mark_uptodate(new_folio);
        folio_ref_add(new_folio, HPAGE_PMD_NR - 1);

        if (is_shmem)
                folio_mark_dirty(new_folio);
        folio_add_lru(new_folio);

        /* Join all the small entries into a single multi-index entry. */
        xas_set_order(&xas, start, HPAGE_PMD_ORDER);
        xas_store(&xas, new_folio);
        WARN_ON_ONCE(xas_error(&xas));
        xas_unlock_irq(&xas);

        /*
         * Remove pte page tables, so we can re-fault the page as huge.
         * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
         */
        retract_page_tables(mapping, start);
        if (cc && !cc->is_khugepaged)
                result = SCAN_PTE_MAPPED_HUGEPAGE;
        folio_unlock(new_folio);

        /*
         * The collapse has succeeded, so free the old folios.
         */
        list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
                list_del(&folio->lru);
                folio->mapping = NULL;
                folio_clear_active(folio);
                folio_clear_unevictable(folio);
                folio_unlock(folio);
                folio_put_refs(folio, 3);
        }

        goto out;

rollback:
        /* Something went wrong: roll back page cache changes */
        if (nr_none) {
                xas_lock_irq(&xas);
                mapping->nrpages -= nr_none;
                xas_unlock_irq(&xas);
                shmem_uncharge(mapping->host, nr_none);
        }

        list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
                list_del(&folio->lru);
                folio_unlock(folio);
                folio_putback_lru(folio);
                folio_put(folio);
        }
        /*
         * Undo the updates of filemap_nr_thps_inc for non-SHMEM
         * file only. This undo is not needed unless failure is
         * due to SCAN_COPY_MC.
         */
        if (!is_shmem && result == SCAN_COPY_MC) {
                filemap_nr_thps_dec(mapping);
                /*
                 * Paired with smp_mb() in do_dentry_open() to
                 * ensure the update to nr_thps is visible.
                 */
                smp_mb();
        }

        new_folio->mapping = NULL;

        folio_unlock(new_folio);
        folio_put(new_folio);
out:
        VM_BUG_ON(!list_empty(&pagelist));
        trace_mm_khugepaged_collapse_file(mm, new_folio, index, is_shmem, addr, file, HPAGE_PMD_NR, result);
        return result;
}

/*
 * hpage_collapse_scan_file - decide whether a file range is worth collapsing
 * @mm:    mm on whose behalf the scan is done (passed to tracing and to
 *         collapse_file())
 * @addr:  user address of the range (forwarded to collapse_file())
 * @file:  file whose page cache (file->f_mapping) is scanned
 * @start: first page-cache index of the range
 * @cc:    collapse control; its node_load[] and alloc_nmask are reset here
 *
 * Walks the HPAGE_PMD_NR page-cache slots starting at @start under
 * rcu_read_lock(), counting value entries in 'swap' and accepted small
 * folios in 'present'.  The walk stops at the first entry that rules the
 * range out (too many value entries for khugepaged, a large folio, a node
 * rejected by hpage_collapse_scan_abort(), a folio that is not on the LRU,
 * or an unexpected reference count).
 *
 * If the walk finishes with SCAN_SUCCEED, khugepaged additionally requires
 * at least HPAGE_PMD_NR - khugepaged_max_ptes_none present folios;
 * otherwise collapse_file() is called and its result is returned.
 *
 * Return: a SCAN_* result code.
 */
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
                                    struct file *file, pgoff_t start,
                                    struct collapse_control *cc)
{
        struct folio *folio = NULL;
        struct address_space *mapping = file->f_mapping;
        XA_STATE(xas, &mapping->i_pages, start);
        int present, swap;
        int node = NUMA_NO_NODE;
        int result = SCAN_SUCCEED;

        present = 0;
        swap = 0;
        /* Start every scan with clean per-node statistics. */
        memset(cc->node_load, 0, sizeof(cc->node_load));
        nodes_clear(cc->alloc_nmask);
        rcu_read_lock();
        xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
                if (xas_retry(&xas, folio))
                        continue;

                /* Value entries are accounted as 'swap', not 'present'. */
                if (xa_is_value(folio)) {
                        ++swap;
                        if (cc->is_khugepaged &&
                            swap > khugepaged_max_ptes_swap) {
                                result = SCAN_EXCEED_SWAP_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
                                break;
                        }
                        continue;
                }

                /*
                 * TODO: khugepaged should compact smaller compound pages
                 * into a PMD sized page
                 */
                if (folio_test_large(folio)) {
                        result = folio_order(folio) == HPAGE_PMD_ORDER &&
                                        folio->index == start
                                        /* Maybe PMD-mapped */
                                        ? SCAN_PTE_MAPPED_HUGEPAGE
                                        : SCAN_PAGE_COMPOUND;
                        /*
                         * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
                         * by the caller won't touch the page cache, and so
                         * it's safe to skip LRU and refcount checks before
                         * returning.
                         */
                        break;
                }

                node = folio_nid(folio);
                if (hpage_collapse_scan_abort(node, cc)) {
                        result = SCAN_SCAN_ABORT;
                        break;
                }
                cc->node_load[node]++;

                if (!folio_test_lru(folio)) {
                        result = SCAN_PAGE_LRU;
                        break;
                }

                /*
                 * Expected references: one base reference plus one per
                 * mapping plus one if the folio has private data; anything
                 * else fails the scan with SCAN_PAGE_COUNT.
                 */
                if (folio_ref_count(folio) !=
                    1 + folio_mapcount(folio) + folio_test_private(folio)) {
                        result = SCAN_PAGE_COUNT;
                        break;
                }

                /*
                 * We probably should check if the folio is referenced
                 * here, but nobody would transfer pte_young() to
                 * folio_test_referenced() for us.  And rmap walk here
                 * is just too costly...
                 */

                present++;

                /* Pause the xarray walk before possibly rescheduling. */
                if (need_resched()) {
                        xas_pause(&xas);
                        cond_resched_rcu();
                }
        }
        rcu_read_unlock();

        if (result == SCAN_SUCCEED) {
                /* Only khugepaged is subject to the max_ptes_none limit. */
                if (cc->is_khugepaged &&
                    present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
                        result = SCAN_EXCEED_NONE_PTE;
                        count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
                } else {
                        result = collapse_file(mm, addr, file, start, cc);
                }
        }

        trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
        return result;
}
#else
/*
 * Stub used in the #else configuration: the body is only BUILD_BUG(), so
 * this variant is not meant to be reachable at run time.
 * NOTE(review): the controlling #if is outside this view; callers visible
 * here guard the call with IS_ENABLED(CONFIG_SHMEM) - confirm that this is
 * the matching condition.
 */
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
                                    struct file *file, pgoff_t start,
                                    struct collapse_control *cc)
{
        BUILD_BUG();
}
#endif

/*
 * khugepaged_scan_mm_slot - scan part of one mm for collapse candidates
 * @pages:  scan budget; must be non-zero
 * @result: out parameter, set to SCAN_FAIL on entry and then to the result
 *          of the last range that was scanned
 * @cc:     collapse control passed down to the per-range scanners
 *
 * Continues from the cursor kept in khugepaged_scan (mm_slot + address), or
 * starts at the first mm on khugepaged_scan.mm_head when no slot is saved.
 *
 * Must be called with khugepaged_mm_lock held.  The lock is dropped while
 * the mm is scanned and re-taken before returning, as the
 * __releases/__acquires annotations state.
 *
 * Return: the amount of progress made, in the same unit as @pages.
 */
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
                                            struct collapse_control *cc)
        __releases(&khugepaged_mm_lock)
        __acquires(&khugepaged_mm_lock)
{
        struct vma_iterator vmi;
        struct khugepaged_mm_slot *mm_slot;
        struct mm_slot *slot;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        int progress = 0;

        VM_BUG_ON(!pages);
        lockdep_assert_held(&khugepaged_mm_lock);
        *result = SCAN_FAIL;

        /* Resume from the saved slot, or begin with the head of the list. */
        if (khugepaged_scan.mm_slot) {
                mm_slot = khugepaged_scan.mm_slot;
                slot = &mm_slot->slot;
        } else {
                slot = list_entry(khugepaged_scan.mm_head.next,
                                     struct mm_slot, mm_node);
                mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
                khugepaged_scan.address = 0;
                khugepaged_scan.mm_slot = mm_slot;
        }
        spin_unlock(&khugepaged_mm_lock);

        mm = slot->mm;
        /*
         * Don't wait for semaphore (to avoid long wait times).  Just move to
         * the next mm on the list.
         */
        vma = NULL;
        if (unlikely(!mmap_read_trylock(mm)))
                goto breakouterloop_mmap_lock;

        progress++;
        if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
                goto breakouterloop;

        vma_iter_init(&vmi, mm, khugepaged_scan.address);
        for_each_vma(vmi, vma) {
                unsigned long hstart, hend;

                cond_resched();
                if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
                        progress++;
                        break;
                }
                if (!thp_vma_allowable_order(vma, vma->vm_flags,
                                        TVA_ENFORCE_SYSFS, PMD_ORDER)) {
                        /* Also the target of 'goto skip' further down. */
skip:
                        progress++;
                        continue;
                }
                /* PMD-aligned sub-range of this VMA. */
                hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
                hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
                if (khugepaged_scan.address > hend)
                        goto skip;
                if (khugepaged_scan.address < hstart)
                        khugepaged_scan.address = hstart;
                VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);

                while (khugepaged_scan.address < hend) {
                        bool mmap_locked = true;

                        cond_resched();
                        if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
                                goto breakouterloop;

                        VM_BUG_ON(khugepaged_scan.address < hstart ||
                                  khugepaged_scan.address + HPAGE_PMD_SIZE >
                                  hend);
                        if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
                                /*
                                 * Hold a file reference so the file can be
                                 * scanned after mmap_lock is dropped.
                                 */
                                struct file *file = get_file(vma->vm_file);
                                pgoff_t pgoff = linear_page_index(vma,
                                                khugepaged_scan.address);

                                mmap_read_unlock(mm);
                                mmap_locked = false;
                                *result = hpage_collapse_scan_file(mm,
                                        khugepaged_scan.address, file, pgoff, cc);
                                fput(file);
                                if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
                                        /*
                                         * Re-take mmap_lock for the PTE-mapped
                                         * collapse; the exit path through
                                         * breakouterloop releases it.
                                         */
                                        mmap_read_lock(mm);
                                        if (hpage_collapse_test_exit_or_disable(mm))
                                                goto breakouterloop;
                                        *result = collapse_pte_mapped_thp(mm,
                                                khugepaged_scan.address, false);
                                        /* Already PMD-mapped counts as success. */
                                        if (*result == SCAN_PMD_MAPPED)
                                                *result = SCAN_SUCCEED;
                                        mmap_read_unlock(mm);
                                }
                        } else {
                                *result = hpage_collapse_scan_pmd(mm, vma,
                                        khugepaged_scan.address, &mmap_locked, cc);
                        }

                        if (*result == SCAN_SUCCEED)
                                ++khugepaged_pages_collapsed;

                        /* move to next address */
                        khugepaged_scan.address += HPAGE_PMD_SIZE;
                        progress += HPAGE_PMD_NR;
                        if (!mmap_locked)
                                /*
                                 * We released mmap_lock so break loop.  Note
                                 * that we drop mmap_lock before all hugepage
                                 * allocations, so if allocation fails, we are
                                 * guaranteed to break here and report the
                                 * correct result back to caller.
                                 */
                                goto breakouterloop_mmap_lock;
                        if (progress >= pages)
                                goto breakouterloop;
                }
        }
breakouterloop:
        mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
breakouterloop_mmap_lock:

        spin_lock(&khugepaged_mm_lock);
        VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
        /*
         * Release the current mm_slot if this mm is about to die, or
         * if we scanned all vmas of this mm.
         *
         * Note that vma is also NULL when mmap_read_trylock() failed above,
         * so that case advances to the next mm as well.
         */
        if (hpage_collapse_test_exit(mm) || !vma) {
                /*
                 * Make sure that if mm_users is reaching zero while
                 * khugepaged runs here, khugepaged_exit will find
                 * mm_slot not pointing to the exiting mm.
                 */
                if (slot->mm_node.next != &khugepaged_scan.mm_head) {
                        slot = list_entry(slot->mm_node.next,
                                          struct mm_slot, mm_node);
                        khugepaged_scan.mm_slot =
                                mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
                        khugepaged_scan.address = 0;
                } else {
                        /* End of the list: one full pass is complete. */
                        khugepaged_scan.mm_slot = NULL;
                        khugepaged_full_scans++;
                }

                collect_mm_slot(mm_slot);
        }

        return progress;
}

/*
 * Report whether khugepaged has anything to scan: the mm list must be
 * non-empty and hugepage_flags_enabled() must be true.
 *
 * Return: 1 if there is work, 0 otherwise.
 */
static int khugepaged_has_work(void)
{
        if (list_empty(&khugepaged_scan.mm_head))
                return 0;

        return !!hugepage_flags_enabled();
}

/*
 * Wake-up condition for an idle khugepaged: either an mm has been queued
 * on the scan list or the thread has been asked to stop.
 *
 * Return: 1 if the waiter should wake up, 0 otherwise.
 */
static int khugepaged_wait_event(void)
{
        if (!list_empty(&khugepaged_scan.mm_head))
                return 1;

        return !!kthread_should_stop();
}

/*
 * Run one scan round of khugepaged.
 *
 * The round ends when the budget read from khugepaged_pages_to_scan is
 * used up, when the thread is asked to stop, when there is no work (or the
 * list head has been passed twice), or when a huge page allocation fails
 * for the second time in this round.
 */
static void khugepaged_do_scan(struct collapse_control *cc)
{
        const unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
        unsigned int progress = 0;
        unsigned int pass_through_head = 0;
        bool slept_after_alloc_fail = false;
        int result = SCAN_SUCCEED;

        lru_add_drain_all();

        for (;;) {
                cond_resched();

                if (unlikely(kthread_should_stop()))
                        return;

                spin_lock(&khugepaged_mm_lock);
                if (!khugepaged_scan.mm_slot)
                        pass_through_head++;
                if (!khugepaged_has_work() || pass_through_head >= 2)
                        /* Nothing (more) to do: consume the whole budget. */
                        progress = pages;
                else
                        progress += khugepaged_scan_mm_slot(pages - progress,
                                                            &result, cc);
                spin_unlock(&khugepaged_mm_lock);

                if (progress >= pages)
                        return;

                if (result != SCAN_ALLOC_HUGE_PAGE_FAIL)
                        continue;

                /*
                 * First allocation failure: back off once and retry.
                 * A second failure ends the round.
                 */
                if (slept_after_alloc_fail)
                        return;
                slept_after_alloc_fail = true;
                khugepaged_alloc_sleep();
        }
}

static bool khugepaged_should_wakeup(void)
{
        return kthread_should_stop() ||
               time_after_eq(jiffies, khugepaged_sleep_expire);
}

/*
 * Sleep between scan rounds.
 *
 * With work pending, sleep for khugepaged_scan_sleep_millisecs (no sleep at
 * all if that converts to zero jiffies).  Without work, and only while
 * hugepage_flags_enabled() is true, block until khugepaged_wait_event()
 * holds.
 */
static void khugepaged_wait_work(void)
{
        unsigned long scan_sleep_jiffies;

        if (!khugepaged_has_work()) {
                if (hugepage_flags_enabled())
                        wait_event_freezable(khugepaged_wait,
                                             khugepaged_wait_event());
                return;
        }

        scan_sleep_jiffies = msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
        if (scan_sleep_jiffies == 0)
                return;

        /* Record the deadline that khugepaged_should_wakeup() tests. */
        khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
        wait_event_freezable_timeout(khugepaged_wait,
                                     khugepaged_should_wakeup(),
                                     scan_sleep_jiffies);
}

/*
 * Main function of the khugepaged kernel thread.
 *
 * Alternates scan rounds and sleeps until asked to stop, then detaches the
 * saved scan cursor and hands its mm slot, if any, to collect_mm_slot().
 *
 * @none: unused.
 *
 * Return: always 0.
 */
static int khugepaged(void *none)
{
        struct khugepaged_mm_slot *leftover;

        set_freezable();
        set_user_nice(current, MAX_NICE);

        for (;;) {
                if (kthread_should_stop())
                        break;

                khugepaged_do_scan(&khugepaged_collapse_control);
                khugepaged_wait_work();
        }

        spin_lock(&khugepaged_mm_lock);
        leftover = khugepaged_scan.mm_slot;
        khugepaged_scan.mm_slot = NULL;
        if (leftover != NULL)
                collect_mm_slot(leftover);
        spin_unlock(&khugepaged_mm_lock);

        return 0;
}

/*
 * Raise min_free_kbytes to a value that helps transparent hugepage
 * allocations, then refresh the per-zone watermarks.
 *
 * When hugepage_flags_enabled() is false, min_free_kbytes is recomputed by
 * calculate_min_free_kbytes() instead.  min_free_kbytes is never lowered
 * by the THP recommendation.
 */
static void set_recommended_min_free_kbytes(void)
{
        unsigned long recommended_min;
        struct zone *zone;
        int nr_zones = 0;

        if (!hugepage_flags_enabled()) {
                calculate_min_free_kbytes();
                setup_per_zone_wmarks();
                return;
        }

        for_each_populated_zone(zone) {
                /*
                 * Zones above the GFP_USER zone are not counted:
                 * ZONE_MOVABLE holds only movable pages, so its
                 * fragmentation is not a concern here.
                 */
                if (zone_idx(zone) <= gfp_zone(GFP_USER))
                        nr_zones++;
        }

        /* Two free pageblocks per zone to assist fragmentation avoidance. */
        recommended_min = pageblock_nr_pages * nr_zones * 2;

        /*
         * On average keep at least two pageblocks almost free for every
         * other migratetype: one to fall back to and a second to avoid
         * subsequent fallbacks of other types.  There are 3 MIGRATE_TYPES
         * we care about.
         */
        recommended_min += pageblock_nr_pages * nr_zones *
                           MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

        /* Never reserve more than 5% of lowmem. */
        recommended_min = min(recommended_min,
                              (unsigned long) nr_free_buffer_pages() / 20);
        /* Convert from pages to kilobytes. */
        recommended_min <<= (PAGE_SHIFT-10);

        if (recommended_min > min_free_kbytes) {
                if (user_min_free_kbytes >= 0)
                        pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
                                min_free_kbytes, recommended_min);

                min_free_kbytes = recommended_min;
        }

        setup_per_zone_wmarks();
}

/*
 * Start or stop the khugepaged thread so that it matches
 * hugepage_flags_enabled(), under khugepaged_mutex.
 *
 * When enabled, the thread is created if it does not exist and woken if
 * the mm list is non-empty; when disabled, a running thread is stopped.
 * Unless thread creation failed, min_free_kbytes is re-evaluated afterwards.
 *
 * Return: 0 on success, or the error from kthread_run().
 */
int start_stop_khugepaged(void)
{
        int err = 0;

        mutex_lock(&khugepaged_mutex);

        if (!hugepage_flags_enabled()) {
                if (khugepaged_thread) {
                        kthread_stop(khugepaged_thread);
                        khugepaged_thread = NULL;
                }
        } else {
                if (!khugepaged_thread)
                        khugepaged_thread = kthread_run(khugepaged, NULL,
                                                        "khugepaged");
                if (IS_ERR(khugepaged_thread)) {
                        pr_err("khugepaged: kthread_run(khugepaged) failed\n");
                        err = PTR_ERR(khugepaged_thread);
                        khugepaged_thread = NULL;
                        /* Skip the min_free_kbytes update on failure. */
                        goto out_unlock;
                }

                if (!list_empty(&khugepaged_scan.mm_head))
                        wake_up_interruptible(&khugepaged_wait);
        }

        set_recommended_min_free_kbytes();

out_unlock:
        mutex_unlock(&khugepaged_mutex);
        return err;
}

/*
 * Re-apply the THP recommendation for min_free_kbytes, but only while
 * hugepages are enabled and the khugepaged thread exists.  Serialised by
 * khugepaged_mutex.
 */
void khugepaged_min_free_kbytes_update(void)
{
        mutex_lock(&khugepaged_mutex);
        if (hugepage_flags_enabled()) {
                if (khugepaged_thread)
                        set_recommended_min_free_kbytes();
        }
        mutex_unlock(&khugepaged_mutex);
}

bool current_is_khugepaged(void)
{
        return kthread_func(current) == khugepaged;
}

/*
 * Map a collapse scan result to the errno returned by MADV_COLLAPSE.
 *
 * MADV_COLLAPSE deliberately departs from the usual madvise(2) conventions
 * and returns distinct error codes, so that the caller can pick a fallback
 * that suits the kind of failure.
 */
static int madvise_collapse_errno(enum scan_result r)
{
        int err;

        switch (r) {
        case SCAN_ALLOC_HUGE_PAGE_FAIL:
                err = -ENOMEM;
                break;
        case SCAN_CGROUP_CHARGE_FAIL:
        case SCAN_EXCEED_NONE_PTE:
                err = -EBUSY;
                break;
        /* Resource temporarily unavailable: a retry might succeed. */
        case SCAN_PAGE_COUNT:
        case SCAN_PAGE_LOCK:
        case SCAN_PAGE_LRU:
        case SCAN_DEL_PAGE_LRU:
        case SCAN_PAGE_FILLED:
                err = -EAGAIN;
                break;
        /*
         * Everything else: a retry is unlikely to help, or the error is
         * intrinsic to the requested range.  khugepaged will probably not
         * be able to collapse it either.
         */
        default:
                err = -EINVAL;
                break;
        }

        return err;
}

/*
 * madvise_collapse - synchronously collapse [start, end) into huge pages
 * @vma:   VMA containing the whole range (enforced with BUG_ON)
 * @prev:  out parameter; set to @vma on entry and to NULL once mmap_lock
 *         has been dropped at any point
 * @start: start of the requested range
 * @end:   end of the requested range
 *
 * Only the PMD-aligned part of the range is processed: @start is rounded
 * up and @end rounded down to HPAGE_PMD_SIZE.  Each PMD-sized step is
 * scanned with hpage_collapse_scan_file() or hpage_collapse_scan_pmd().
 * A whitelisted set of failures lets the loop continue with the next step;
 * any other failure ends it.
 *
 * mmap_lock is expected to be held on entry (asserted in the loop) and is
 * held again on return, even if it was dropped in between.
 *
 * Return: 0 if every step was collapsed or found already PMD-mapped,
 * -EINVAL if the VMA does not allow PMD-order THP, -ENOMEM if the collapse
 * control cannot be allocated, otherwise madvise_collapse_errno() of the
 * last failure.
 */
int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
                     unsigned long start, unsigned long end)
{
        struct collapse_control *cc;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long hstart, hend, addr;
        int thps = 0, last_fail = SCAN_FAIL;
        bool mmap_locked = true;

        BUG_ON(vma->vm_start > start);
        BUG_ON(vma->vm_end < end);

        *prev = vma;

        if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
                return -EINVAL;

        cc = kmalloc(sizeof(*cc), GFP_KERNEL);
        if (!cc)
                return -ENOMEM;
        cc->is_khugepaged = false;

        /* Pin the mm_struct for the duration; paired with mmdrop() below. */
        mmgrab(mm);
        lru_add_drain_all();

        /* Round start up and end down to a PMD boundary. */
        hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = end & HPAGE_PMD_MASK;

        for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
                int result = SCAN_FAIL;

                /*
                 * A previous iteration dropped mmap_lock: re-take it and
                 * look the VMA up again before using it.
                 */
                if (!mmap_locked) {
                        cond_resched();
                        mmap_read_lock(mm);
                        mmap_locked = true;
                        result = hugepage_vma_revalidate(mm, addr, false, &vma,
                                                         cc);
                        if (result  != SCAN_SUCCEED) {
                                last_fail = result;
                                goto out_nolock;
                        }

                        /* Do not run past the end of the revalidated VMA. */
                        hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
                }
                mmap_assert_locked(mm);
                memset(cc->node_load, 0, sizeof(cc->node_load));
                nodes_clear(cc->alloc_nmask);
                if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
                        struct file *file = get_file(vma->vm_file);
                        pgoff_t pgoff = linear_page_index(vma, addr);

                        /* The file scan runs without mmap_lock. */
                        mmap_read_unlock(mm);
                        mmap_locked = false;
                        result = hpage_collapse_scan_file(mm, addr, file, pgoff,
                                                          cc);
                        fput(file);
                } else {
                        result = hpage_collapse_scan_pmd(mm, vma, addr,
                                                         &mmap_locked, cc);
                }
                if (!mmap_locked)
                        *prev = NULL;  /* Tell caller we dropped mmap_lock */

handle_result:
                switch (result) {
                case SCAN_SUCCEED:
                case SCAN_PMD_MAPPED:
                        ++thps;
                        break;
                case SCAN_PTE_MAPPED_HUGEPAGE:
                        BUG_ON(mmap_locked);
                        BUG_ON(*prev);
                        mmap_read_lock(mm);
                        result = collapse_pte_mapped_thp(mm, addr, true);
                        mmap_read_unlock(mm);
                        /* Classify the new result with the same switch. */
                        goto handle_result;
                /* Whitelisted set of results where continuing OK */
                case SCAN_PMD_NULL:
                case SCAN_PTE_NON_PRESENT:
                case SCAN_PTE_UFFD_WP:
                case SCAN_PAGE_RO:
                case SCAN_LACK_REFERENCED_PAGE:
                case SCAN_PAGE_NULL:
                case SCAN_PAGE_COUNT:
                case SCAN_PAGE_LOCK:
                case SCAN_PAGE_COMPOUND:
                case SCAN_PAGE_LRU:
                case SCAN_DEL_PAGE_LRU:
                        last_fail = result;
                        break;
                default:
                        last_fail = result;
                        /* Other error, exit */
                        goto out_maybelock;
                }
        }

out_maybelock:
        /* Caller expects us to hold mmap_lock on return */
        if (!mmap_locked)
                mmap_read_lock(mm);
out_nolock:
        /* Reached with mmap_lock held; no locking is needed here. */
        mmap_assert_locked(mm);
        mmdrop(mm);
        kfree(cc);

        /* Success only if every PMD-sized step in [hstart, hend) counted. */
        return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
                        : madvise_collapse_errno(last_fail);
}


















































































































































































































































































































































































































    1 
    7 




    4 

    1 

































    4 


































































    1 



















    2 

    1 






    3 










    3 





    4 




















    3 




















    3 


























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SIGNAL_H
#define _LINUX_SCHED_SIGNAL_H

#include <linux/rculist.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
#include <linux/pid.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>
#include <asm/ptrace.h>

/*
 * Types defining task->signal and task->sighand and APIs using them:
 */

/*
 * Signal-handler state reached through task->sighand.  The inline helpers
 * later in this header take ->siglock around accesses to signal state.
 */
struct sighand_struct {
        spinlock_t                siglock;
        refcount_t                count;
        wait_queue_head_t        signalfd_wqh;
        struct k_sigaction        action[_NSIG];        /* array of _NSIG handler slots */
};

/*
 * Per-process accounting stats.
 *
 * Embedded in struct signal_struct as ->pacct when
 * CONFIG_BSD_PROCESS_ACCT is enabled.
 */
struct pacct_struct {
        int                        ac_flag;
        long                        ac_exitcode;
        unsigned long                ac_mem;
        u64                        ac_utime, ac_stime;
        unsigned long                ac_minflt, ac_majflt;
};

/*
 * One CPU-time interval timer.  struct signal_struct keeps two of these in
 * ->it[] for the ITIMER_PROF and ITIMER_VIRTUAL timers.
 */
struct cpu_itimer {
        u64 expires;
        u64 incr;
};

/*
 * This is the atomic variant of task_cputime, which can be used for
 * storing and updating task_cputime statistics without locking.
 *
 * All three counters are atomic64_t; INIT_CPUTIME_ATOMIC initialises them
 * to zero.
 */
struct task_cputime_atomic {
        atomic64_t utime;
        atomic64_t stime;
        atomic64_t sum_exec_runtime;
};

/*
 * Compound literal of type struct task_cputime_atomic with every counter
 * set to ATOMIC64_INIT(0).
 */
#define INIT_CPUTIME_ATOMIC \
        (struct task_cputime_atomic) {                                \
                .utime = ATOMIC64_INIT(0),                        \
                .stime = ATOMIC64_INIT(0),                        \
                .sum_exec_runtime = ATOMIC64_INIT(0),                \
        }
/**
 * struct thread_group_cputimer - thread group interval timer counts
 * @cputime_atomic:        atomic thread group interval timers.
 *
 * This structure contains the version of task_cputime, above, that is
 * used for thread group CPU timer calculations.
 *
 * It is embedded in struct signal_struct as ->cputimer when
 * CONFIG_POSIX_TIMERS is enabled.
 */
struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
};

/*
 * A signal set plus hlist linkage.
 * NOTE(review): presumably linked on signal_struct::multiprocess ("for
 * collecting multiprocess signals during fork") -- confirm against the users.
 */
struct multiprocess_signals {
        sigset_t signal;
        struct hlist_node node;
};

/* Singly linked list element (via ->next) that refers to one task. */
struct core_thread {
        struct task_struct *task;
        struct core_thread *next;
};

/*
 * Coredump bookkeeping; struct signal_struct points at one of these through
 * ->core_state ("coredumping support").
 */
struct core_state {
        atomic_t nr_threads;
        struct core_thread dumper;        /* embedded list element, see struct core_thread */
        struct completion startup;
};

/*
 * NOTE! "signal_struct" does not have its own
 * locking, because a shared signal_struct always
 * implies a shared sighand_struct, so locking
 * sighand_struct is always a proper superset of
 * the locking of signal_struct.
 */
struct signal_struct {
        refcount_t                sigcnt;
        atomic_t                live;
        int                        nr_threads;        /* returned by get_nr_threads() */
        int                        quick_threads;
        struct list_head        thread_head;        /* walked by __for_each_thread() */

        wait_queue_head_t        wait_chldexit;        /* for wait4() */

        /* current thread group signal load-balancing target: */
        struct task_struct        *curr_target;

        /* shared signal handling: */
        struct sigpending        shared_pending;

        /* For collecting multiprocess signals during fork */
        struct hlist_head        multiprocess;

        /* thread group exit support */
        int                        group_exit_code;
        /* notify group_exec_task when notify_count is less or equal to 0 */
        int                        notify_count;
        struct task_struct        *group_exec_task;

        /* thread group stop support, overloads group_exit_code too */
        int                        group_stop_count;
        unsigned int                flags; /* see SIGNAL_* flags below */

        struct core_state *core_state; /* coredumping support */

        /*
         * PR_SET_CHILD_SUBREAPER marks a process, like a service
         * manager, to re-parent orphan (double-forking) child processes
         * to this process instead of 'init'. The service manager is
         * able to receive SIGCHLD signals and is able to investigate
         * the process until it calls wait(). All children of this
         * process will inherit a flag if they should look for a
         * child_subreaper process at exit.
         */
        unsigned int                is_child_subreaper:1;
        unsigned int                has_child_subreaper:1;

#ifdef CONFIG_POSIX_TIMERS

        /* POSIX.1b Interval Timers */
        unsigned int                next_posix_timer_id;
        struct list_head        posix_timers;

        /* ITIMER_REAL timer for the process */
        struct hrtimer real_timer;
        ktime_t it_real_incr;

        /*
         * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
         * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
         * values are defined to 0 and 1 respectively
         */
        struct cpu_itimer it[2];

        /*
         * Thread group totals for process CPU timers.
         * See thread_group_cputimer(), et al, for details.
         */
        struct thread_group_cputimer cputimer;

#endif
        /* Empty if CONFIG_POSIX_TIMERS=n */
        struct posix_cputimers posix_cputimers;

        /*
         * PID/PID hash table linkage.  Indexed by enum pid_type; read by
         * task_pid_type(), task_tgid(), task_pgrp() and task_session().
         */
        struct pid *pids[PIDTYPE_MAX];

#ifdef CONFIG_NO_HZ_FULL
        atomic_t tick_dep_mask;
#endif

        struct pid *tty_old_pgrp;

        /* boolean value for session group leader */
        int leader;

        struct tty_struct *tty; /* NULL if no tty */

#ifdef CONFIG_SCHED_AUTOGROUP
        struct autogroup *autogroup;
#endif
        /*
         * Cumulative resource counters for dead threads in the group,
         * and for reaped dead child processes forked by this group.
         * Live threads maintain their own counters and add to these
         * in __exit_signal, except for the group leader.
         */
        seqlock_t stats_lock;
        u64 utime, stime, cutime, cstime;
        u64 gtime;
        u64 cgtime;
        struct prev_cputime prev_cputime;
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
        unsigned long inblock, oublock, cinblock, coublock;
        unsigned long maxrss, cmaxrss;
        struct task_io_accounting ioac;

        /*
         * Cumulative ns of scheduled CPU time of dead threads in the
         * group, not including a zombie group leader.  (This only differs
         * from jiffies_to_ns(utime + stime) if sched_clock uses something
         * other than jiffies.)
         */
        unsigned long long sum_sched_runtime;

        /*
         * We don't bother to synchronize most readers of this at all,
         * because there is no reader checking a limit that actually needs
         * to get both rlim_cur and rlim_max atomically, and either one
         * alone is a single word that can safely be read normally.
         * getrlimit/setrlimit use task_lock(current->group_leader) to
         * protect this instead of the siglock, because they really
         * have no need to disable irqs.
         *
         * task_rlimit() and task_rlimit_max() read entries with READ_ONCE().
         */
        struct rlimit rlim[RLIM_NLIMITS];

#ifdef CONFIG_BSD_PROCESS_ACCT
        struct pacct_struct pacct;        /* per-process accounting information */
#endif
#ifdef CONFIG_TASKSTATS
        struct taskstats *stats;
#endif
#ifdef CONFIG_AUDIT
        unsigned audit_tty;
        struct tty_audit_buf *tty_audit_buf;
#endif

        /*
         * Thread is the potential origin of an oom condition; kill first on
         * oom
         */
        bool oom_flag_origin;
        short oom_score_adj;                /* OOM kill score adjustment */
        short oom_score_adj_min;        /* OOM kill score adjustment min value.
                                         * Only settable by CAP_SYS_RESOURCE. */
        struct mm_struct *oom_mm;        /* recorded mm when the thread group got
                                         * killed by the oom killer */

        struct mutex cred_guard_mutex;        /* guard against foreign influences on
                                         * credential calculations
                                         * (notably ptrace).
                                         * Deprecated, do not use in new code.
                                         * Use exec_update_lock instead.
                                         */
        struct rw_semaphore exec_update_lock;        /* Held while task_struct is
                                                 * being updated during exec,
                                                 * and may have inconsistent
                                                 * permissions.
                                                 */
} __randomize_layout;

/*
 * Bits in flags field of signal_struct.
 */
#define SIGNAL_STOP_STOPPED        0x00000001 /* job control stop in effect */
#define SIGNAL_STOP_CONTINUED        0x00000002 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT        0x00000004 /* group exit in progress */
/*
 * Pending notifications to parent.
 */
#define SIGNAL_CLD_STOPPED        0x00000010
#define SIGNAL_CLD_CONTINUED        0x00000020
#define SIGNAL_CLD_MASK                (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)

#define SIGNAL_UNKILLABLE        0x00000040 /* for init: ignore fatal signals */

/*
 * Every stop-related bit: both SIGNAL_CLD_* notification bits plus the two
 * SIGNAL_STOP_* bits.  signal_set_stop_flags() clears exactly this mask
 * before OR-ing in the new flags.
 */
#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \
                          SIGNAL_STOP_CONTINUED)

/*
 * Replace the SIGNAL_STOP_MASK bits of sig->flags with @flags while keeping
 * every other bit.  Emits a warning when SIGNAL_GROUP_EXIT is already set.
 */
static inline void signal_set_stop_flags(struct signal_struct *sig,
                                         unsigned int flags)
{
        unsigned int preserved;

        WARN_ON(sig->flags & SIGNAL_GROUP_EXIT);

        preserved = sig->flags & ~SIGNAL_STOP_MASK;
        sig->flags = preserved | flags;
}

/*
 * Signal queue maintenance entry points, defined outside this header.
 * dequeue_signal() is called by kernel_dequeue_signal() below with
 * sighand->siglock held.
 */
extern void flush_signals(struct task_struct *);
extern void ignore_signals(struct task_struct *);
extern void flush_signal_handlers(struct task_struct *, int force_default);
extern int dequeue_signal(struct task_struct *task, sigset_t *mask,
                          kernel_siginfo_t *info, enum pid_type *type);

static inline int kernel_dequeue_signal(void)
{
        struct task_struct *task = current;
        kernel_siginfo_t __info;
        enum pid_type __type;
        int ret;

        spin_lock_irq(&task->sighand->siglock);
        ret = dequeue_signal(task, &task->blocked, &__info, &__type);
        spin_unlock_irq(&task->sighand->siglock);

        return ret;
}

static inline void kernel_signal_stop(void)
{
        spin_lock_irq(&current->sighand->siglock);
        if (current->jobctl & JOBCTL_STOP_DEQUEUED) {
                current->jobctl |= JOBCTL_STOPPED;
                set_special_state(TASK_STOPPED);
        }
        spin_unlock_irq(&current->sighand->siglock);

        schedule();
}

/*
 * Fault-style signal generation.  force_sig_fault_to_task() and the send_*
 * variants take an explicit target task; the remaining force_* variants
 * take none.
 */
int force_sig_fault_to_task(int sig, int code, void __user *addr,
                            struct task_struct *t);
int force_sig_fault(int sig, int code, void __user *addr);
int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t);

int force_sig_mceerr(int code, void __user *, short);
int send_sig_mceerr(int code, void __user *, short, struct task_struct *);

int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper);
int force_sig_pkuerr(void __user *addr, u32 pkey);
int send_sig_perf(void __user *addr, u32 type, u64 sig_data);

int force_sig_ptrace_errno_trap(int errno, void __user *addr);
int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno);
int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
                        struct task_struct *t);
int force_sig_seccomp(int syscall, int reason, bool force_coredump);

/* General signal sending, killing and sigqueue management entry points. */
extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern void force_sigsegv(int sig);
extern int force_sig_info(struct kernel_siginfo *);
extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *,
                                const struct cred *);
extern int kill_pgrp(struct pid *pid, int sig, int priv);
extern int kill_pid(struct pid *pid, int sig, int priv);
/* The return value of do_notify_parent() must not be ignored (__must_check). */
extern __must_check bool do_notify_parent(struct task_struct *, int);
extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
extern void force_sig(int);
extern void force_fatal_sig(int);
extern void force_exit_sig(int);
extern int send_sig(int, struct task_struct *, int);
extern int zap_other_threads(struct task_struct *p);
extern struct sigqueue *sigqueue_alloc(void);
extern void sigqueue_free(struct sigqueue *);
extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type);
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);

/*
 * Clear TIF_NOTIFY_SIGNAL for the current thread, then issue
 * smp_mb__after_atomic() so the clear is ordered before what follows.
 */
static inline void clear_notify_signal(void)
{
        clear_thread_flag(TIF_NOTIFY_SIGNAL);
        smp_mb__after_atomic();
}

/*
 * Returns 'true' if kick_process() is needed to force a transition from
 * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work.
 */
static inline bool __set_notify_signal(struct task_struct *task)
{
        return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
               !wake_up_state(task, TASK_INTERRUPTIBLE);
}

/*
 * Called to break out of interruptible wait loops, and enter the
 * exit_to_user_mode_loop().
 */
static inline void set_notify_signal(struct task_struct *task)
{
        if (__set_notify_signal(task))
                kick_process(task);
}

/*
 * Set TIF_SIGPENDING on the current task and return -ERESTARTNOINTR for the
 * caller to propagate.
 */
static inline int restart_syscall(void)
{
        const int ret = -ERESTARTNOINTR;

        set_tsk_thread_flag(current, TIF_SIGPENDING);

        return ret;
}

/* Non-zero when TIF_SIGPENDING is set on @p (hinted as the unlikely case). */
static inline int task_sigpending(struct task_struct *p)
{
        int flagged = test_tsk_thread_flag(p, TIF_SIGPENDING);

        return unlikely(flagged);
}

/*
 * Non-zero when @p has either TIF_NOTIFY_SIGNAL or TIF_SIGPENDING set.
 *
 * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same
 * behavior in terms of ensuring that we break out of wait loops
 * so that notify signal callbacks can be processed.
 */
static inline int signal_pending(struct task_struct *p)
{
        return unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL)) ?
                1 : task_sigpending(p);
}

/* Non-zero when SIGKILL is a member of @p's private pending signal set. */
static inline int __fatal_signal_pending(struct task_struct *p)
{
        int has_sigkill = sigismember(&p->pending.signal, SIGKILL);

        return unlikely(has_sigkill);
}

/*
 * 1 when @p has TIF_SIGPENDING set and SIGKILL is in its pending set,
 * otherwise 0.
 */
static inline int fatal_signal_pending(struct task_struct *p)
{
        if (!task_sigpending(p))
                return 0;

        return __fatal_signal_pending(p) != 0;
}

/*
 * Decide whether a pending signal matters for a sleep in @state:
 *  - 0 if @state has neither TASK_INTERRUPTIBLE nor TASK_WAKEKILL,
 *  - 0 if signal_pending(@p) is false,
 *  - 1 if @state includes TASK_INTERRUPTIBLE,
 *  - otherwise 1 only when SIGKILL is pending for @p.
 */
static inline int signal_pending_state(unsigned int state, struct task_struct *p)
{
        const unsigned int wakeable = TASK_INTERRUPTIBLE | TASK_WAKEKILL;

        if ((state & wakeable) == 0 || !signal_pending(p))
                return 0;

        if (state & TASK_INTERRUPTIBLE)
                return 1;

        return __fatal_signal_pending(p) != 0;
}

/*
 * This should only be used in fault handlers to decide whether we
 * should stop the current fault routine to handle the signals
 * instead, especially with the case where we've got interrupted with
 * a VM_FAULT_RETRY.
 *
 * True only when @fault_flags contains VM_FAULT_RETRY and either a fatal
 * signal is pending for current, or @regs is user mode and any signal is
 * pending for current.
 */
static inline bool fault_signal_pending(vm_fault_t fault_flags,
                                        struct pt_regs *regs)
{
        if (likely(!(fault_flags & VM_FAULT_RETRY)))
                return false;

        if (fatal_signal_pending(current))
                return true;

        return user_mode(regs) && signal_pending(current);
}

/*
 * Reevaluate whether the task has signals pending delivery.
 * Wake the task if so.
 * This is required every time the blocked sigset_t changes.
 * callers must hold sighand->siglock.
 */
extern void recalc_sigpending(void);
extern void calculate_sigpending(void);

/*
 * Backend of signal_wake_up() and ptrace_signal_wake_up() below, which pass
 * the extra task-state bits in @state.
 */
extern void signal_wake_up_state(struct task_struct *t, unsigned int state);

/*
 * Wake @t for signal delivery.  For a fatal wakeup of a task that is not
 * JOBCTL_PTRACE_FROZEN, JOBCTL_STOPPED and JOBCTL_TRACED are cleared first
 * and TASK_WAKEKILL | __TASK_TRACED is passed to signal_wake_up_state();
 * in all other cases the extra state is 0.
 */
static inline void signal_wake_up(struct task_struct *t, bool fatal)
{
        if (!fatal || (t->jobctl & JOBCTL_PTRACE_FROZEN)) {
                signal_wake_up_state(t, 0);
                return;
        }

        t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
        signal_wake_up_state(t, TASK_WAKEKILL | __TASK_TRACED);
}
/*
 * Wake @t on behalf of ptrace.  With @resume set, JOBCTL_TRACED is cleared
 * and __TASK_TRACED is passed to signal_wake_up_state(); otherwise the
 * extra state is 0.
 */
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
        if (!resume) {
                signal_wake_up_state(t, 0);
                return;
        }

        t->jobctl &= ~JOBCTL_TRACED;
        signal_wake_up_state(t, __TASK_TRACED);
}

/*
 * Defined outside this header.
 * NOTE(review): presumably makes @task take part in a group stop that is
 * already in progress -- confirm against the definition.
 */
void task_join_group_stop(struct task_struct *task);

#ifdef TIF_RESTORE_SIGMASK
/*
 * Legacy restore_sigmask accessors.  These are inefficient on
 * SMP architectures because they require atomic operations.
 */

/**
 * set_restore_sigmask() - make sure saved_sigmask processing gets done
 *
 * This sets TIF_RESTORE_SIGMASK so that the arch signal code can process
 * the flag before returning to user mode.  TIF_RESTORE_SIGMASK need not be
 * in the set of bits that the arch code will notice on return to user
 * mode, in case those bits are scarce.
 *
 * Note that this helper only sets TIF_RESTORE_SIGMASK; it does NOT set
 * TIF_SIGPENDING itself.
 * NOTE(review): callers are presumably expected to have TIF_SIGPENDING set
 * already (or to arrange for the arch signal code to run some other way)
 * -- confirm against the call sites.
 */
static inline void set_restore_sigmask(void)
{
        set_thread_flag(TIF_RESTORE_SIGMASK);
}

/* Clear TIF_RESTORE_SIGMASK on @task. */
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
        clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}

/* Clear TIF_RESTORE_SIGMASK on the current thread. */
static inline void clear_restore_sigmask(void)
{
        clear_thread_flag(TIF_RESTORE_SIGMASK);
}
/* True when TIF_RESTORE_SIGMASK is set on @task. */
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
        bool armed = test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);

        return armed;
}
/* True when TIF_RESTORE_SIGMASK is set on the current thread. */
static inline bool test_restore_sigmask(void)
{
        bool armed = test_thread_flag(TIF_RESTORE_SIGMASK);

        return armed;
}
/*
 * Clear TIF_RESTORE_SIGMASK on the current thread and report whether it
 * had been set.
 */
static inline bool test_and_clear_restore_sigmask(void)
{
        bool was_armed = test_and_clear_thread_flag(TIF_RESTORE_SIGMASK);

        return was_armed;
}

#else        /* TIF_RESTORE_SIGMASK */

/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */
/* Mark the current task as needing its saved sigmask restored. */
static inline void set_restore_sigmask(void)
{
        current->restore_sigmask = true;
}
/* Reset @task's ->restore_sigmask marker. */
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
        task->restore_sigmask = false;
}
/* Reset the current task's ->restore_sigmask marker. */
static inline void clear_restore_sigmask(void)
{
        current->restore_sigmask = false;
}
/* True when the current task's ->restore_sigmask marker is set. */
static inline bool test_restore_sigmask(void)
{
        bool armed = current->restore_sigmask;

        return armed;
}
/* True when @task's ->restore_sigmask marker is set. */
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
        bool armed = task->restore_sigmask;

        return armed;
}
/*
 * Report whether the current task's ->restore_sigmask marker was set and
 * reset it if so.  The marker is only written when it was set.
 */
static inline bool test_and_clear_restore_sigmask(void)
{
        bool was_armed = current->restore_sigmask;

        if (was_armed)
                current->restore_sigmask = false;

        return was_armed;
}
#endif

/*
 * If a sigmask restore was requested, consume the request and install
 * current->saved_sigmask as the blocked set.
 */
static inline void restore_saved_sigmask(void)
{
        if (!test_and_clear_restore_sigmask())
                return;

        __set_current_blocked(&current->saved_sigmask);
}

/* Defined outside this header; takes a user-space sigset pointer and its size. */
extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);

/*
 * Restore the saved sigmask unless @interrupted is true.  In the
 * interrupted case nothing is restored, and a warning fires if no signal
 * is pending for current.
 */
static inline void restore_saved_sigmask_unless(bool interrupted)
{
        if (!interrupted) {
                restore_saved_sigmask();
                return;
        }

        WARN_ON(!signal_pending(current));
}

/*
 * Pick which of current's masks to save: ->saved_sigmask when a sigmask
 * restore is requested, ->blocked otherwise.
 */
static inline sigset_t *sigmask_to_save(void)
{
        if (unlikely(test_restore_sigmask()))
                return &current->saved_sigmask;

        return &current->blocked;
}

/* Send @sig to cad_pid through kill_pid(), forwarding @priv unchanged. */
static inline int kill_cad_pid(int sig, int priv)
{
        struct pid *target = cad_pid;

        return kill_pid(target, sig, priv);
}

/*
 * These can be the second arg to send_sig_info/send_group_sig_info.
 * They are the integer values 0 and 1 cast to a siginfo pointer, i.e.
 * sentinel values rather than pointers to real objects.
 */
#define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0)
#define SEND_SIG_PRIV        ((struct kernel_siginfo *) 1)

/*
 * Range check of @sp against current's alternate signal stack
 * [sas_ss_sp, sas_ss_sp + sas_ss_size].  Which end of the range is
 * inclusive depends on CONFIG_STACK_GROWSUP.
 */
static inline int __on_sig_stack(unsigned long sp)
{
        unsigned long base = current->sas_ss_sp;
        unsigned long offset = sp - base;

#ifdef CONFIG_STACK_GROWSUP
        return sp >= base && offset < current->sas_ss_size;
#else
        return sp > base && offset <= current->sas_ss_size;
#endif
}

/*
 * True if we are on the alternate signal stack.
 */
static inline int on_sig_stack(unsigned long sp)
{
        /*
         * If the signal stack is SS_AUTODISARM then, by construction, we
         * can't be on the signal stack unless user code deliberately set
         * SS_AUTODISARM when we were already on it.
         *
         * This improves reliability: if user state gets corrupted such that
         * the stack pointer points very close to the end of the signal stack,
         * then this check will enable the signal to be handled anyway.
         */
        if (current->sas_ss_flags & SS_AUTODISARM)
                return 0;

        return __on_sig_stack(sp);
}

/*
 * Alternate-stack status for @sp: SS_DISABLE when current has no alternate
 * stack size configured, SS_ONSTACK when on_sig_stack(@sp) holds, else 0.
 */
static inline int sas_ss_flags(unsigned long sp)
{
        if (current->sas_ss_size == 0)
                return SS_DISABLE;

        if (on_sig_stack(sp))
                return SS_ONSTACK;

        return 0;
}

/* Put @p's alternate signal stack settings back to "disabled, empty". */
static inline void sas_ss_reset(struct task_struct *p)
{
        p->sas_ss_flags = SS_DISABLE;
        p->sas_ss_size = 0;
        p->sas_ss_sp = 0;
}

/*
 * Choose the stack pointer for delivering @ksig.  @sp is returned as is
 * unless the handler asked for SA_ONSTACK and sas_ss_flags(@sp) is 0; in
 * that case the starting address of the alternate stack is returned
 * (its base with CONFIG_STACK_GROWSUP, base + size otherwise).
 */
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
{
        if (likely(!(ksig->ka.sa.sa_flags & SA_ONSTACK)))
                return sp;

        if (sas_ss_flags(sp))
                return sp;

#ifdef CONFIG_STACK_GROWSUP
        return current->sas_ss_sp;
#else
        return current->sas_ss_sp + current->sas_ss_size;
#endif
}

/* Defined outside this header. */
extern void __cleanup_sighand(struct sighand_struct *);
extern void flush_itimer_signals(void);

/* True when init_task's ->tasks list has no other entries. */
#define tasklist_empty() \
        list_empty(&init_task.tasks)

/* The task_struct that follows @p on the ->tasks list (RCU list accessor). */
#define next_task(p) \
        list_entry_rcu((p)->tasks.next, struct task_struct, tasks)

/*
 * Iterate @p over every entry of the ->tasks list, starting after init_task
 * and stopping when the walk comes back to it; init_task itself is never
 * visited by the loop body.
 */
#define for_each_process(p) \
        for (p = &init_task ; (p = next_task(p)) != &init_task ; )

/* Defined outside this header. */
extern bool current_is_single_threaded(void);

/*
 * Advance @t with next_thread() until it comes back to @g.
 *
 * Without tasklist/siglock it is only rcu-safe if g can't exit/exec,
 * otherwise next_thread(t) will never reach g after list_del_rcu(g).
 *
 * Both arguments are parenthesized so that an expression argument cannot
 * re-associate with the comparison or the assignment.
 */
#define while_each_thread(g, t) \
        while (((t) = next_thread(t)) != (g))

/*
 * Iterate @t over every thread reachable from @p via next_thread(), not
 * including @p itself.  Arguments are parenthesized so that an expression
 * argument cannot re-associate with the comparison or the assignment.
 */
#define for_other_threads(p, t)        \
        for ((t) = (p); ((t) = next_thread(t)) != (p); )

/*
 * RCU list walk over @signal's ->thread_head, with @t linked through
 * ->thread_node.  Holding tasklist_lock is passed as the lockdep condition.
 */
#define __for_each_thread(signal, t)        \
        list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
                lockdep_is_held(&tasklist_lock))

/* Walk every thread on the thread list of @p's signal_struct. */
#define for_each_thread(p, t)                \
        __for_each_thread((p)->signal, t)

/*
 * Visit every thread @t of every process @p.
 * Careful: this is a double loop, 'break' won't work as expected
 * (it only leaves the inner for_each_thread() loop).
 */
#define for_each_process_thread(p, t)        \
        for_each_process(p) for_each_thread(p, t)

/*
 * Callback type taken by walk_process_tree(): receives a task and an opaque
 * data pointer and returns an int.
 * NOTE(review): the meaning of the return value is defined by
 * walk_process_tree(), which is not visible here -- confirm there.
 */
typedef int (*proc_visitor)(struct task_struct *p, void *data);
void walk_process_tree(struct task_struct *top, proc_visitor, void *);

/*
 * Look up @task's struct pid of the given @type: PIDTYPE_PID comes from
 * task_pid(), every other type from the shared signal_struct's ->pids[].
 */
static inline
struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
{
        if (type == PIDTYPE_PID)
                return task_pid(task);

        return task->signal->pids[type];
}

static inline struct pid *task_tgid(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_TGID];
}

/*
 * Without tasklist or RCU lock it is not safe to dereference
 * the result of task_pgrp/task_session even if task == current,
 * we can race with another thread doing sys_setsid/sys_setpgid.
 */
static inline struct pid *task_pgrp(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_PGID];
}

static inline struct pid *task_session(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_SID];
}

static inline int get_nr_threads(struct task_struct *task)
{
        return task->signal->nr_threads;
}

/* True when @p's ->exit_signal is not negative. */
static inline bool thread_group_leader(struct task_struct *p)
{
        return !(p->exit_signal < 0);
}

/* True when @p1 and @p2 point at the same signal_struct. */
static inline
bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
{
        struct signal_struct *first = p1->signal;
        struct signal_struct *second = p2->signal;

        return first == second;
}

/*
 * Next entry after @p on its signal_struct's thread list.
 * returns NULL if p is the last thread in the thread group
 */
static inline struct task_struct *__next_thread(struct task_struct *p)
{
        struct list_head *head = &p->signal->thread_head;
        struct list_head *pos = &p->thread_node;

        return list_next_or_null_rcu(head, pos, struct task_struct,
                                     thread_node);
}

/*
 * Like __next_thread(), but falls back to @p's ->group_leader when @p is
 * the last thread on the list.
 */
static inline struct task_struct *next_thread(struct task_struct *p)
{
        struct task_struct *following = __next_thread(p);

        if (following)
                return following;

        return p->group_leader;
}

/*
 * Non-zero when @p is a thread group leader and its ->thread_node is the
 * last entry on the signal_struct's thread list.
 */
static inline int thread_group_empty(struct task_struct *p)
{
        if (!thread_group_leader(p))
                return 0;

        return list_is_last(&p->thread_node, &p->signal->thread_head) != 0;
}

/* True when @p is a thread group leader whose group is not empty. */
#define delay_group_leader(p) \
                (thread_group_leader(p) && !thread_group_empty(p))

/*
 * Out-of-line worker behind lock_task_sighand(); defined outside this
 * header.
 */
extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
                                                        unsigned long *flags);

/*
 * Wrapper around __lock_task_sighand() that additionally carries the
 * conditional-lock annotation for task->sighand->siglock.  Returns whatever
 * __lock_task_sighand() returned.
 */
static inline struct sighand_struct *lock_task_sighand(struct task_struct *task,
                                                       unsigned long *flags)
{
        struct sighand_struct *sighand = __lock_task_sighand(task, flags);

        (void)__cond_lock(&task->sighand->siglock, sighand);
        return sighand;
}

/*
 * Release task->sighand->siglock and restore the interrupt state saved in
 * *@flags.
 */
static inline void unlock_task_sighand(struct task_struct *task,
                                                unsigned long *flags)
{
        spin_unlock_irqrestore(&task->sighand->siglock, *flags);
}

/*
 * With CONFIG_LOCKDEP the assertion is implemented out of line; without it
 * the helper is an empty inline.
 */
#ifdef CONFIG_LOCKDEP
extern void lockdep_assert_task_sighand_held(struct task_struct *task);
#else
static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { }
#endif

static inline unsigned long task_rlimit(const struct task_struct *task,
                unsigned int limit)
{
        return READ_ONCE(task->signal->rlim[limit].rlim_cur);
}

static inline unsigned long task_rlimit_max(const struct task_struct *task,
                unsigned int limit)
{
        return READ_ONCE(task->signal->rlim[limit].rlim_max);
}

/* task_rlimit() applied to the current task. */
static inline unsigned long rlimit(unsigned int limit)
{
        unsigned long soft = task_rlimit(current, limit);

        return soft;
}

/* task_rlimit_max() applied to the current task. */
static inline unsigned long rlimit_max(unsigned int limit)
{
        unsigned long hard = task_rlimit_max(current, limit);

        return hard;
}

#endif /* _LINUX_SCHED_SIGNAL_H */





























    1 






    1 
























































    1 
    1 


    1 










































    1 






    1 

















    1 















































    1 


    1 

    1 



    1 






    1 




    1 


















    1 




    1 



































































    1 



























































    1 





    1 














    1 




    1 








    1 











    2 
































    1 

    1 



    2 












































































































    2 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) Neil Brown 2002
 * Copyright (C) Christoph Hellwig 2007
 *
 * This file contains the code mapping from inodes to NFS file handles,
 * and for mapping back from file handles to dentries.
 *
 * For details on why we do all the strange and hairy things in here
 * take a look at Documentation/filesystems/nfs/exporting.rst.
 */
#include <linux/exportfs.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/sched.h>
#include <linux/cred.h>

/* Debug print helper for this file: forwards straight to pr_debug(). */
#define dprintk(fmt, args...) pr_debug(fmt, ##args)


/*
 * Forward declaration: fallback name lookup used by exportfs_get_name()
 * when the filesystem provides no ->get_name operation.
 */
static int get_name(const struct path *path, char *name, struct dentry *child);


static int exportfs_get_name(struct vfsmount *mnt, struct dentry *dir,
                char *name, struct dentry *child)
{
        const struct export_operations *nop = dir->d_sb->s_export_op;
        struct path path = {.mnt = mnt, .dentry = dir};

        if (nop->get_name)
                return nop->get_name(dir, name, child);
        else
                return get_name(&path, name, child);
}

/*
 * Check if the dentry or any of its aliases is acceptable.
 *
 * @result:     dentry to test first
 * @acceptable: predicate called as acceptable(@context, dentry)
 * @context:    opaque cookie handed to @acceptable
 *
 * Returns @result itself if it is acceptable.  Otherwise the aliases of
 * @result's inode are tried in list order; the first acceptable alias is
 * returned with the reference taken here by dget(), and the reference on
 * @result is dropped.  Returns NULL when nothing is acceptable; in that
 * case @result is not dput() by this function.
 */
static struct dentry *
find_acceptable_alias(struct dentry *result,
                int (*acceptable)(void *context, struct dentry *dentry),
                void *context)
{
        struct dentry *dentry, *toput = NULL;
        struct inode *inode;

        if (acceptable(context, result))
                return result;

        inode = result->d_inode;
        spin_lock(&inode->i_lock);
        hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
                /* Pin this alias, then call @acceptable without i_lock held. */
                dget(dentry);
                spin_unlock(&inode->i_lock);
                /* The previous alias is only released once i_lock is dropped. */
                if (toput)
                        dput(toput);
                if (dentry != result && acceptable(context, dentry)) {
                        dput(result);
                        return dentry;
                }
                spin_lock(&inode->i_lock);
                /* Keep the reference until the next iteration (or loop exit). */
                toput = dentry;
        }
        spin_unlock(&inode->i_lock);

        if (toput)
                dput(toput);
        return NULL;
}

/*
 * Walk from @dentry towards the root for as long as the dentries carry
 * DCACHE_DISCONNECTED.  Returns true as soon as a dentry without that flag
 * is reached, and false if a flagged dentry turns out to be its own parent.
 * Every reference taken during the walk is dropped again.
 */
static bool dentry_connected(struct dentry *dentry)
{
        struct dentry *cur = dentry;
        bool connected = true;

        dget(cur);
        while (cur->d_flags & DCACHE_DISCONNECTED) {
                struct dentry *up = dget_parent(cur);
                bool self_parented = (up == cur);

                dput(cur);
                cur = up;
                if (self_parented) {
                        connected = false;
                        break;
                }
        }
        dput(cur);

        return connected;
}

/*
 * Clear DCACHE_DISCONNECTED on @dentry and on each ancestor in turn,
 * stopping at the first dentry that does not have the flag.  The flag is
 * cleared under the dentry's d_lock; a flagged root dentry triggers a
 * one-time warning.
 */
static void clear_disconnected(struct dentry *dentry)
{
        struct dentry *cur = dentry;

        dget(cur);
        for (;;) {
                struct dentry *up;

                if (!(cur->d_flags & DCACHE_DISCONNECTED))
                        break;

                up = dget_parent(cur);

                WARN_ON_ONCE(IS_ROOT(cur));

                spin_lock(&cur->d_lock);
                cur->d_flags &= ~DCACHE_DISCONNECTED;
                spin_unlock(&cur->d_lock);

                dput(cur);
                cur = up;
        }
        dput(cur);
}

/*
 * Reconnect a directory dentry with its parent.
 *
 * @mnt:    mount whose superblock supplies the export operations
 * @dentry: the directory dentry to reconnect
 * @nbuf:   scratch buffer that receives the name of @dentry in its parent
 *
 * This can return a dentry, or NULL, or an error.
 *
 * In the first case the returned dentry is the parent of the given
 * dentry, and may itself need to be reconnected to its parent.  It is
 * returned without being dput() here, unlike on the error paths.
 *
 * In the NULL case, a concurrent VFS operation has either renamed or
 * removed this directory.  The concurrent operation has reconnected our
 * dentry, so we no longer need to.
 *
 * In the error case an ERR_PTR() is returned: -EACCES if the filesystem
 * has no ->get_parent, -ESTALE if the dentry could not be shown to be
 * connected, or whatever error ->get_parent, exportfs_get_name() or the
 * name lookup produced.
 */
static struct dentry *reconnect_one(struct vfsmount *mnt,
                struct dentry *dentry, char *nbuf)
{
        struct dentry *parent;
        struct dentry *tmp;
        int err;

        /* Stays -EACCES if the filesystem does not implement ->get_parent. */
        parent = ERR_PTR(-EACCES);
        /* ->get_parent is invoked with the directory's inode lock held. */
        inode_lock(dentry->d_inode);
        if (mnt->mnt_sb->s_export_op->get_parent)
                parent = mnt->mnt_sb->s_export_op->get_parent(dentry);
        inode_unlock(dentry->d_inode);

        if (IS_ERR(parent)) {
                dprintk("get_parent of %lu failed, err %ld\n",
                        dentry->d_inode->i_ino, PTR_ERR(parent));
                return parent;
        }

        dprintk("%s: find name of %lu in %lu\n", __func__,
                dentry->d_inode->i_ino, parent->d_inode->i_ino);
        /* Find the name under which the parent refers to this directory. */
        err = exportfs_get_name(mnt, parent, nbuf, dentry);
        /* No such entry: handled below as a possible rename/removal race. */
        if (err == -ENOENT)
                goto out_reconnected;
        if (err)
                goto out_err;
        dprintk("%s: found name: %s\n", __func__, nbuf);
        /* Look the name up in the parent and see which dentry comes back. */
        tmp = lookup_one_unlocked(mnt_idmap(mnt), nbuf, parent, strlen(nbuf));
        if (IS_ERR(tmp)) {
                dprintk("lookup failed: %ld\n", PTR_ERR(tmp));
                err = PTR_ERR(tmp);
                goto out_err;
        }
        if (tmp != dentry) {
                /*
                 * Somebody has renamed it since exportfs_get_name();
                 * great, since it could've only been renamed if it
                 * got looked up and thus connected, and it would
                 * remain connected afterwards.  We are done.
                 */
                dput(tmp);
                goto out_reconnected;
        }
        dput(tmp);
        /* The lookup returned our dentry, yet it still has no parent. */
        if (IS_ROOT(dentry)) {
                err = -ESTALE;
                goto out_err;
        }
        return parent;

out_err:
        dput(parent);
        return ERR_PTR(err);
out_reconnected:
        dput(parent);
        /*
         * Someone must have renamed our entry into another parent, in
         * which case it has been reconnected by the rename.
         *
         * Or someone removed it entirely, in which case filehandle
         * lookup will succeed but the directory is now S_DEAD and
         * subsequent operations on it will fail.
         *
         * Alternatively, maybe there was no race at all, and the
         * filesystem is just corrupt and gave us a parent that doesn't
         * actually contain any entry pointing to this inode.  So,
         * double check that this worked and return -ESTALE if not:
         */
        if (!dentry_connected(dentry))
                return ERR_PTR(-ESTALE);
        return NULL;
}

/*
 * Make sure target_dir is fully connected to the dentry tree.
 *
 * On successful return, DCACHE_DISCONNECTED will be cleared on
 * target_dir, and target_dir->d_parent->...->d_parent will reach the
 * root of the filesystem.
 *
 * Whenever DCACHE_DISCONNECTED is unset, target_dir is fully connected.
 * But the converse is not true: target_dir may have DCACHE_DISCONNECTED
 * set but already be connected.  In that case we'll verify the
 * connection to root and then clear the flag.
 *
 * Note that target_dir could be removed by a concurrent operation.  In
 * that case reconnect_path may still succeed with target_dir fully
 * connected, but further operations using the filehandle will fail when
 * necessary (due to S_DEAD being set on the directory).
 */
static int
reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
{
        struct dentry *dentry, *parent;

        dentry = dget(target_dir);

        while (dentry->d_flags & DCACHE_DISCONNECTED) {
                BUG_ON(dentry == mnt->mnt_sb->s_root);

                if (IS_ROOT(dentry))
                        parent = reconnect_one(mnt, dentry, nbuf);
                else
                        parent = dget_parent(dentry);

                if (!parent)
                        break;
                dput(dentry);
                if (IS_ERR(parent))
                        return PTR_ERR(parent);
                dentry = parent;
        }
        dput(dentry);
        clear_disconnected(target_dir);
        return 0;
}

/*
 * State shared between get_name() and its directory actor filldir_one():
 * the inode number being searched for and the buffer for the matching name.
 */
struct getdents_callback {
        struct dir_context ctx;
        char *name;                /* name that was found. It already points to a
                                   buffer NAME_MAX+1 in size */
        u64 ino;                /* the inum we are looking for */
        int found;                /* inode matched? */
        int sequence;                /* sequence counter: incremented by filldir_one()
                                   for every entry it is handed */
};

/*
 * Directory actor used by get_name(): remember the name of the first
 * entry whose inode number equals the one being searched for.
 *
 * Each call bumps the sequence counter.  Entries with a different inode
 * number, a name longer than NAME_MAX, or the name "." / ".." are passed
 * over and iteration continues (returns true).  On a match the name is
 * copied NUL-terminated into the caller's buffer, ->found is set and
 * iteration is stopped (returns false).
 */
static bool filldir_one(struct dir_context *ctx, const char *name, int len,
                        loff_t pos, u64 ino, unsigned int d_type)
{
        struct getdents_callback *cb =
                container_of(ctx, struct getdents_callback, ctx);

        cb->sequence++;

        /* not the entry we want: keep iterating */
        if (cb->ino != ino || len > NAME_MAX || is_dot_dotdot(name, len))
                return true;

        memcpy(cb->name, name, len);
        cb->name[len] = '\0';
        cb->found = 1;
        return false;        // no more
}

/**
 * get_name - default export_operations->get_name function
 * @path:   the directory in which to find a name
 * @name:   a pointer to a %NAME_MAX+1 char buffer to store the name
 * @child:  the dentry for the child directory.
 *
 * Reads the directory at @path until an entry turns up whose inode
 * number equals that of @child, and stores that entry's name in @name.
 *
 * Returns 0 when a name was found, -ENOTDIR if @path is not a directory,
 * -EINVAL if the directory cannot be iterated, -ENOENT if a readdir pass
 * delivered no entries before a match was seen, or the error reported by
 * the getattr, open or iterate step.
 */
static int get_name(const struct path *path, char *name, struct dentry *child)
{
        struct inode *dir = path->dentry->d_inode;
        struct getdents_callback buffer = {
                .ctx.actor = filldir_one,
                .name = name,
        };
        struct path child_path = {
                .mnt = path->mnt,
                .dentry = child,
        };
        const struct cred *cred = current_cred();
        struct kstat stat;
        struct file *file;
        int seen;
        int error;

        if (!dir || !S_ISDIR(dir->i_mode))
                return -ENOTDIR;
        if (!dir->i_fop)
                return -EINVAL;

        /*
         * kstat->ino is u64 whereas inode->i_ino is only unsigned long,
         * which is too narrow on 32-bit hosts for filesystems with
         * 64-bit inode numbers.  Hence ask ->getattr for the number
         * rather than reading i_ino directly.
         */
        error = vfs_getattr_nosec(&child_path, &stat,
                                  STATX_INO, AT_STATX_SYNC_AS_STAT);
        if (error)
                return error;
        buffer.ino = stat.ino;

        /* Open the directory so that it can be iterated. */
        file = dentry_open(path, O_RDONLY, cred);
        if (IS_ERR(file))
                return PTR_ERR(file);

        if (!file->f_op->iterate_shared) {
                error = -EINVAL;
                goto out_close;
        }

        /*
         * Keep calling iterate_dir() until the entry is found, an error
         * is reported, or a pass hands no entry at all to the actor.
         */
        buffer.sequence = 0;
        do {
                seen = buffer.sequence;

                error = iterate_dir(file, &buffer.ctx);
                if (buffer.found) {
                        error = 0;
                        break;
                }
                if (error < 0)
                        break;

                error = -ENOENT;
        } while (seen != buffer.sequence);

out_close:
        fput(file);
        return error;
}

#define FILEID_INO64_GEN_LEN 3

/**
 * exportfs_encode_ino64_fid - encode non-decodeable 64bit ino file id
 * @inode:   the object to encode
 * @fid:     where to store the file handle fragment
 * @max_len: maximum length to store there (in 4 byte units)
 *
 * Generic encoder of a non-decodeable file id, used for fanotify on
 * filesystems that do not support NFS export.
 *
 * @max_len is always updated to %FILEID_INO64_GEN_LEN.  Returns
 * %FILEID_INO64_GEN on success, or %FILEID_INVALID without touching @fid
 * when the space announced in @max_len was too small.
 */
static int exportfs_encode_ino64_fid(struct inode *inode, struct fid *fid,
                                     int *max_len)
{
        int room = *max_len;

        /* report the required length in both the success and failure case */
        *max_len = FILEID_INO64_GEN_LEN;
        if (room < FILEID_INO64_GEN_LEN)
                return FILEID_INVALID;

        fid->i64.ino = inode->i_ino;
        fid->i64.gen = inode->i_generation;

        return FILEID_INO64_GEN;
}

/**
 * exportfs_encode_inode_fh - encode a file handle from inode
 * @inode:   the object to encode
 * @fid:     where to store the file handle fragment
 * @max_len: maximum length to store there
 * @parent:  parent directory inode, if wanted
 * @flags:   properties of the requested file handle
 *
 * Uses the superblock's ->encode_fh when export operations are present.
 * Without export operations, a request carrying %EXPORT_FH_FID is served
 * by the generic ino64 encoder instead.
 *
 * Returns an enum fid_type or a negative errno (-EOPNOTSUPP when
 * exportfs_can_encode_fh() rejects the request).
 */
int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
                             int *max_len, struct inode *parent, int flags)
{
        const struct export_operations *ops = inode->i_sb->s_export_op;

        if (!exportfs_can_encode_fh(ops, flags))
                return -EOPNOTSUPP;

        if (ops || !(flags & EXPORT_FH_FID))
                return ops->encode_fh(inode, fid->raw, max_len, parent);

        /* no export operations, but only a non-decodeable id was asked for */
        return exportfs_encode_ino64_fid(inode, fid, max_len);
}
EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh);

/**
 * exportfs_encode_fh - encode a file handle from dentry
 * @dentry:  the object to encode
 * @fid:     where to store the file handle fragment
 * @max_len: maximum length to store there
 * @flags:   properties of the requested file handle
 *
 * For a non-directory with %EXPORT_FH_CONNECTABLE set in @flags, the
 * parent directory's inode is passed along to the encoder; otherwise no
 * parent is supplied.
 *
 * Returns an enum fid_type or a negative errno.
 */
int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
                       int flags)
{
        struct inode *inode = dentry->d_inode;
        struct inode *dir = NULL;
        struct dentry *pinned = NULL;
        int type;

        if (!S_ISDIR(inode->i_mode) && (flags & EXPORT_FH_CONNECTABLE)) {
                pinned = dget_parent(dentry);
                /*
                 * By now this dentry may no longer be our parent, but the
                 * reference we hold keeps it pinned, and it is still
                 * positive.
                 */
                dir = pinned->d_inode;
        }

        type = exportfs_encode_inode_fh(inode, fid, max_len, dir, flags);
        dput(pinned);

        return type;
}
EXPORT_SYMBOL_GPL(exportfs_encode_fh);

/*
 * exportfs_decode_fh_raw - turn a file handle fragment into a dentry
 * @mnt:         mount whose superblock's export operations do the decoding
 * @fid:         the file handle fragment to decode
 * @fh_len:      length of @fid, handed unchanged to the filesystem
 * @fileid_type: type of @fid, handed unchanged to the filesystem
 * @acceptable:  optional callback deciding whether a dentry may be returned
 * @context:     opaque argument passed to @acceptable
 *
 * Returns ERR_PTR(-ESTALE) if exportfs_can_decode_fh() rejects the
 * filesystem.  If ->fh_to_dentry yields NULL or an error pointer, that
 * value is returned as is.  Without @acceptable the decoded dentry is
 * returned directly, even if it is disconnected.  Otherwise a dentry
 * approved by @acceptable is returned, or an ERR_PTR() on failure.
 */
struct dentry *
exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
                       int fileid_type,
                       int (*acceptable)(void *, struct dentry *),
                       void *context)
{
        const struct export_operations *nop = mnt->mnt_sb->s_export_op;
        struct dentry *result, *alias;
        char nbuf[NAME_MAX+1];
        int err;

        /*
         * Try to get any dentry for the given file handle from the filesystem.
         */
        if (!exportfs_can_decode_fh(nop))
                return ERR_PTR(-ESTALE);
        result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
        if (IS_ERR_OR_NULL(result))
                return result;

        /*
         * If no acceptance criteria was specified by caller, a disconnected
         * dentry is also acceptable. Callers may use this mode to query if
         * file handle is stale or to get a reference to an inode without
         * risking the high overhead caused by directory reconnect.
         */
        if (!acceptable)
                return result;

        if (d_is_dir(result)) {
                /*
                 * This request is for a directory.
                 *
                 * On the positive side there is only one dentry for each
                 * directory inode.  On the negative side this implies that we
                 * need to ensure our dentry is connected all the way up to the
                 * filesystem root.
                 */
                if (result->d_flags & DCACHE_DISCONNECTED) {
                        err = reconnect_path(mnt, result, nbuf);
                        if (err)
                                goto err_result;
                }

                if (!acceptable(context, result)) {
                        err = -EACCES;
                        goto err_result;
                }

                return result;
        } else {
                /*
                 * It's not a directory.  Life is a little more complicated.
                 */
                struct dentry *target_dir, *nresult;

                /*
                 * See if either the dentry we just got from the filesystem
                 * or any alias for it is acceptable.  This is always true
                 * if this filesystem is exported without the subtreecheck
                 * option.  If the filesystem is exported with the subtree
                 * check option there's a fair chance we need to look at
                 * the parent directory in the file handle and make sure
                 * it's connected to the filesystem root.
                 */
                alias = find_acceptable_alias(result, acceptable, context);
                if (alias)
                        return alias;

                /*
                 * Try to extract a dentry for the parent directory from the
                 * file handle.  If this fails we'll have to give up.
                 */
                err = -ESTALE;
                if (!nop->fh_to_parent)
                        goto err_result;

                target_dir = nop->fh_to_parent(mnt->mnt_sb, fid,
                                fh_len, fileid_type);
                /* A NULL parent is reported as -ESTALE (err set above). */
                if (!target_dir)
                        goto err_result;
                err = PTR_ERR(target_dir);
                if (IS_ERR(target_dir))
                        goto err_result;

                /*
                 * And as usual we need to make sure the parent directory is
                 * connected to the filesystem root.  The VFS really doesn't
                 * like disconnected directories..
                 */
                err = reconnect_path(mnt, target_dir, nbuf);
                if (err) {
                        dput(target_dir);
                        goto err_result;
                }

                /*
                 * Now that we've got both a well-connected parent and a
                 * dentry for the inode we're after, make sure that our
                 * inode is actually connected to the parent.
                 */
                err = exportfs_get_name(mnt, target_dir, nbuf, result);
                if (err) {
                        dput(target_dir);
                        goto err_result;
                }

                /*
                 * Look the name up again under the parent's inode lock and
                 * check that it still refers to the inode we decoded.
                 */
                inode_lock(target_dir->d_inode);
                nresult = lookup_one(mnt_idmap(mnt), nbuf,
                                     target_dir, strlen(nbuf));
                if (!IS_ERR(nresult)) {
                        if (unlikely(nresult->d_inode != result->d_inode)) {
                                dput(nresult);
                                nresult = ERR_PTR(-ESTALE);
                        }
                }
                inode_unlock(target_dir->d_inode);
                /*
                 * At this point we are done with the parent, but it's pinned
                 * by the child dentry anyway.
                 */
                dput(target_dir);

                if (IS_ERR(nresult)) {
                        err = PTR_ERR(nresult);
                        goto err_result;
                }
                /* Continue with the dentry found through the parent. */
                dput(result);
                result = nresult;

                /*
                 * And finally make sure the dentry is actually acceptable
                 * to NFSD.
                 */
                alias = find_acceptable_alias(result, acceptable, context);
                if (!alias) {
                        err = -EACCES;
                        goto err_result;
                }

                return alias;
        }

 err_result:
        /* Common failure exit: drop the decoded dentry and return err. */
        dput(result);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(exportfs_decode_fh_raw);

/*
 * Wrapper around exportfs_decode_fh_raw() that narrows the failure values:
 * a valid dentry and ERR_PTR(-ENOMEM) are passed through unchanged, while
 * NULL and every other error are reported as ERR_PTR(-ESTALE).
 */
struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
                                  int fh_len, int fileid_type,
                                  int (*acceptable)(void *, struct dentry *),
                                  void *context)
{
        struct dentry *found = exportfs_decode_fh_raw(mnt, fid, fh_len,
                                                      fileid_type, acceptable,
                                                      context);

        if (!IS_ERR_OR_NULL(found) || found == ERR_PTR(-ENOMEM))
                return found;

        return ERR_PTR(-ESTALE);
}
EXPORT_SYMBOL_GPL(exportfs_decode_fh);

MODULE_LICENSE("GPL");



















































    1 






































    1 
    1 



















    2 


















    3 









    3 










    1 















    2 











    3 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/fsync.c
 *
 *  Copyright (C) 1993  Stephen Tweedie (sct@redhat.com)
 *  from
 *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
 *                      Laboratoire MASI - Institut Blaise Pascal
 *                      Universite Pierre et Marie Curie (Paris VI)
 *  from
 *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4fs fsync primitive
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *
 *  Removed unnecessary code duplication for little endian machines
 *  and excessive __inline__s.
 *        Andi Kleen, 1997
 *
 * Major simplifications and cleanup - we only need to do the metadata, because
 * we can depend on generic_block_fdatasync() to sync the data blocks.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>

#include "ext4.h"
#include "ext4_jbd2.h"

#include <trace/events/ext4.h>

/*
 * If we're not journaling and this is a just-created file, we have to
 * sync our parent directory (if it was freshly created) since
 * otherwise it will only be written by writeback, leaving a huge
 * window during which a crash may lose the file.  This may apply for
 * the parent directory's parent as well, and so on recursively, if
 * they are also freshly created.
 */
static int ext4_sync_parent(struct inode *inode)
{
        struct dentry *dentry, *next;
        int ret = 0;

        if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
                return 0;
        dentry = d_find_any_alias(inode);
        if (!dentry)
                return 0;
        while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
                ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);

                next = dget_parent(dentry);
                dput(dentry);
                dentry = next;
                inode = dentry->d_inode;

                /*
                 * The directory inode may have gone through rmdir by now. But
                 * the inode itself and its blocks are still allocated (we hold
                 * a reference to the inode via its dentry), so it didn't go
                 * through ext4_evict_inode()) and so we are safe to flush
                 * metadata blocks and the inode.
                 */
                ret = sync_mapping_buffers(inode->i_mapping);
                if (ret)
                        break;
                ret = sync_inode_metadata(inode, 1);
                if (ret)
                        break;
        }
        dput(dentry);
        return ret;
}

/*
 * fsync path for a filesystem without a journal: write out the file's
 * buffers without issuing a flush, then sync freshly created parent
 * directories.  *needs_barrier is set when the BARRIER mount option is
 * enabled, whether or not the sync itself succeeded.
 *
 * Returns 0 or the first error encountered.
 */
static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end,
                                int datasync, bool *needs_barrier)
{
        struct inode *inode = file->f_inode;
        int err = generic_buffers_fsync_noflush(file, start, end, datasync);

        if (err == 0)
                err = ext4_sync_parent(inode);

        if (test_opt(inode->i_sb, BARRIER))
                *needs_barrier = true;

        return err;
}

/*
 * fsync path for a journalled filesystem.
 *
 * Anything that is not a regular file gets a full forced commit.  For a
 * regular file the transaction to commit is the inode's datasync tid when
 * @datasync is set and its sync tid otherwise.  *needs_barrier is set
 * when the journal uses barriers but committing that transaction will not
 * send a data barrier itself.
 *
 * Returns the result of the commit.
 */
static int ext4_fsync_journal(struct inode *inode, bool datasync,
                             bool *needs_barrier)
{
        journal_t *journal;
        tid_t commit_tid;

        /*
         * Fast commit has no real support for fsync on directories or
         * other special files, so fall back to a full commit for them.
         */
        if (!S_ISREG(inode->i_mode))
                return ext4_force_commit(inode->i_sb);

        journal = EXT4_SB(inode->i_sb)->s_journal;
        if (datasync)
                commit_tid = EXT4_I(inode)->i_datasync_tid;
        else
                commit_tid = EXT4_I(inode)->i_sync_tid;

        if ((journal->j_flags & JBD2_BARRIER) &&
            !jbd2_trans_will_send_data_barrier(journal, commit_tid))
                *needs_barrier = true;

        return ext4_fc_commit(journal, commit_tid);
}

/*
 * akpm: A new design for ext4_sync_file().
 *
 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
 * There cannot be a transaction open by this task.
 * Another task could have dirtied this inode.  Its data can be in any
 * state in the journalling system.
 *
 * What we do is just kick off a commit and wait on it.  This will snapshot the
 * inode to disk.
 *
 * @file:     the file being synced
 * @start:    start of the byte range to write out
 * @end:      end of the byte range to write out
 * @datasync: non-zero for fdatasync semantics; selects which transaction
 *            id is committed in the journalled case
 *
 * Returns 0 on success or a negative errno.  The first error encountered
 * wins; errors from the flush and from the writeback error check are only
 * reported when nothing failed earlier.
 */
int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
        int ret = 0, err;
        bool needs_barrier = false;
        struct inode *inode = file->f_mapping->host;

        /* Fails immediately; neither tracepoint is emitted on this path. */
        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        ASSERT(ext4_journal_current_handle() == NULL);

        trace_ext4_sync_file_enter(file, datasync);

        /*
         * Read-only filesystem: no data or metadata is written out here.
         * -EROFS is reported only if a forced shutdown is seen after the
         * barrier.
         */
        if (sb_rdonly(inode->i_sb)) {
                /* Make sure that we read updated s_ext4_flags value */
                smp_rmb();
                if (ext4_forced_shutdown(inode->i_sb))
                        ret = -EROFS;
                goto out;
        }

        /* No journal: the helper syncs buffers and fresh parent directories. */
        if (!EXT4_SB(inode->i_sb)->s_journal) {
                ret = ext4_fsync_nojournal(file, start, end, datasync,
                                           &needs_barrier);
                if (needs_barrier)
                        goto issue_flush;
                goto out;
        }

        ret = file_write_and_wait_range(file, start, end);
        if (ret)
                goto out;

        /*
         *  The caller's filemap_fdatawrite()/wait will sync the data.
         *  Metadata is in the journal, we wait for proper transaction to
         *  commit here.
         */
        ret = ext4_fsync_journal(inode, datasync, &needs_barrier);

issue_flush:
        /* Flush the block device if requested; keep any earlier error. */
        if (needs_barrier) {
                err = blkdev_issue_flush(inode->i_sb->s_bdev);
                if (!ret)
                        ret = err;
        }
out:
        /* Always check the writeback error state; report it only if ret is 0. */
        err = file_check_and_advance_wb_err(file);
        if (ret == 0)
                ret = err;
        trace_ext4_sync_file_exit(inode, ret);
        return ret;
}




















































   17 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2005-2010 IBM Corporation
 *
 * Authors:
 * Mimi Zohar <zohar@us.ibm.com>
 * Kylene Hall <kjhall@us.ibm.com>
 *
 * File: evm.h
 */

#ifndef __INTEGRITY_EVM_H
#define __INTEGRITY_EVM_H

#include <linux/xattr.h>
#include <linux/security.h>

#include "../integrity.h"

#define EVM_INIT_HMAC        0x0001
#define EVM_INIT_X509        0x0002
#define EVM_ALLOW_METADATA_WRITES        0x0004
#define EVM_SETUP_COMPLETE 0x80000000 /* userland has signaled key load */

#define EVM_KEY_MASK (EVM_INIT_HMAC | EVM_INIT_X509)
#define EVM_INIT_MASK (EVM_INIT_HMAC | EVM_INIT_X509 | EVM_SETUP_COMPLETE | \
                       EVM_ALLOW_METADATA_WRITES)

/*
 * One named extended attribute kept on a linked list.
 * NOTE(review): presumably an element of evm_config_xattrnames - confirm.
 */
struct xattr_list {
        struct list_head list;        /* linkage into the containing list */
        char *name;                /* name of the xattr */
        bool enabled;                /* NOTE(review): presumably whether this
                                   entry is currently in effect - confirm */
};

#define EVM_NEW_FILE                        0x00000001
#define EVM_IMMUTABLE_DIGSIG                0x00000002

/* EVM integrity metadata associated with an inode */
struct evm_iint_cache {
        /* NOTE(review): presumably a mask of EVM_NEW_FILE / EVM_IMMUTABLE_DIGSIG - confirm */
        unsigned long flags;
        /* integrity status of the inode, stored in a 4-bit field */
        enum integrity_status evm_status:4;
        /* NOTE(review): looks like a saved copy of inode attributes - confirm against users */
        struct integrity_inode_attributes metadata_inode;
};

extern struct lsm_blob_sizes evm_blob_sizes;

/*
 * Locate the EVM metadata inside an inode's security blob.
 *
 * Returns NULL when the inode has no security blob; otherwise the address
 * evm_blob_sizes.lbs_inode bytes into that blob.
 */
static inline struct evm_iint_cache *evm_iint_inode(const struct inode *inode)
{
        void *blob = inode->i_security;

        return unlikely(!blob) ? NULL : blob + evm_blob_sizes.lbs_inode;
}

extern int evm_initialized;

#define EVM_ATTR_FSUUID                0x0001

extern int evm_hmac_attrs;

/* List of EVM protected security xattrs */
extern struct list_head evm_config_xattrnames;

/*
 * A digest header followed by room for a digest of up to
 * IMA_MAX_DIGEST_SIZE bytes.  Declared __packed, so no padding is
 * inserted between the header and the digest bytes.
 */
struct evm_digest {
        struct ima_digest_data_hdr hdr;
        char digest[IMA_MAX_DIGEST_SIZE];
} __packed;

int evm_protected_xattr(const char *req_xattr_name);

int evm_init_key(void);
int evm_update_evmxattr(struct dentry *dentry,
                        const char *req_xattr_name,
                        const char *req_xattr_value,
                        size_t req_xattr_value_len);
int evm_calc_hmac(struct dentry *dentry, const char *req_xattr_name,
                  const char *req_xattr_value,
                  size_t req_xattr_value_len, struct evm_digest *data,
                  struct evm_iint_cache *iint);
int evm_calc_hash(struct dentry *dentry, const char *req_xattr_name,
                  const char *req_xattr_value,
                  size_t req_xattr_value_len, char type,
                  struct evm_digest *data, struct evm_iint_cache *iint);
int evm_init_hmac(struct inode *inode, const struct xattr *xattrs,
                  char *hmac_val);
int evm_init_secfs(void);

#endif
























































    1 




















    1 


    1 



    1 




























    4 


    6 
    7 






    7 














    3 



    3 
















    5 











    1 















    5 


    5 







    1 


    5 










    5 


































    5 



































































































    3 




























































    2 


    2 




















    1 


    1 




















    3 















    3 












    3 








    3 


































    6 




    2 
    1 














    1 













    1 





    1 






























































































































































































    1 




































































    2 






































    5 














    5 












    6 







    1 
    1 










    5 





    1 
    6 

    6 



    1 
















    1 




    2 

    6 
    2 













































































































































































































































































































































































    1 














    1 
























    1 
    1 












    1 




































    1 





















    1 

















    1 































    5 















    1 




















    2 















    5 














    5 















    1 




















    1 




    2 

    1 






























    1 
    1 



















    2 


    2 




    2 
    1 


    1 




















    1 
























    4 















    4 














    1 







    1 





















    1 





    2 





























    4 








    4 
    2 





    1 


    1 






    1 



















    4 









































    4 








    4 


    2 


    4 














    4 







    5 







    1 
    2 


    1 



    1 






    1 
















    4 
























    4 








    2 





    1 
    2 





    1 








    2 






    1 
    1 






    1 




























    4 






    1 
























    1 









    4 










    4 











    2 


    1 








    2 
    1 




















    2 
    4 







    4 


    4 







































































    4 





















































































































    5 




    5 














    1 


    2 




































































    3 







    3 























    3 


    2 

























    3 



















    3 








































    2 


    1 





    1 
    3 




    3 

    3 
























































    1 



    3 








    2 









    3 



    3 



















    2 





    1 










    3 









    3 
    1 








    3 

















    3 















    1 













    3 





















    3 








    2 

























































    1 





























    1 













    3 






    3 


    3 


    3 






    1 






    1 






    1 





    1 




















    1 







    1 






    1 






    3 






    3 













    1 




    2 








    3 



    3 















































































































    1 



























    1 










    1 



    1 










    1 














    1 


    1 











































































    1 

























    1 


    1 
































    1 










































































































































































































































































































    1 


















































    1 

















    1 













































































































    2 




    1 
















    1 




    1 



    1 



























































    2 























































































































    4 



    4 



































































    5 











    6 


























    6 











    2 


    2 
















    1 








    2 


    2 














    4 

























    5 









    4 















    5 









    5 














    4 







    5 

    4 
    2 
    4 
    2 

    2 


    4 











    3 
    2 



































    1 

















    4 

    4 









    3 

    2 







    6 

    6 





    3 

















    3 




    3 











































































































































































































































































    1 




































    1 







































































    1 







    1 





    1 

    1 





    1 






    1 








    1 



    1 








































































































































































































































    1 



    1 






    1 
















    1 




    1 












    1 




















    1 
































    1 







    1 
























    1 







    1 











    1 


































    1 


    1 





    1 




























    1 


    1 




























































































    1 






























































































































































































































































































































































































    1 












    1 



    1 

























    1 

























    1 









































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fiemap.h>
#include <linux/iomap.h>
#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"

#include <trace/events/ext4.h>

/*
 * used by extent splitting.
 */
#define EXT4_EXT_MAY_ZEROOUT        0x1  /* safe to zeroout if split fails \
                                        due to ENOSPC */
#define EXT4_EXT_MARK_UNWRIT1        0x2  /* mark first half unwritten */
#define EXT4_EXT_MARK_UNWRIT2        0x4  /* mark second half unwritten */

#define EXT4_EXT_DATA_VALID1        0x8  /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2        0x10 /* second half contains valid data */

static __le32 ext4_extent_block_csum(struct inode *inode,
                                     struct ext4_extent_header *eh)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        __u32 csum;

        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
                           EXT4_EXTENT_TAIL_OFFSET(eh));
        return cpu_to_le32(csum);
}

/*
 * Compare the checksum stored in the tail of an extent block with the one
 * computed from its contents.
 *
 * Returns 1 if the filesystem does not use metadata checksums or if the
 * stored checksum matches, 0 on mismatch.
 */
static int ext4_extent_block_csum_verify(struct inode *inode,
                                         struct ext4_extent_header *eh)
{
        struct ext4_extent_tail *tail;

        if (ext4_has_metadata_csum(inode->i_sb)) {
                tail = find_ext4_extent_tail(eh);
                return tail->et_checksum == ext4_extent_block_csum(inode, eh);
        }

        /* No metadata checksums: nothing to verify. */
        return 1;
}

/*
 * Store a freshly computed checksum in the tail of an extent block.
 * Does nothing when the filesystem does not use metadata checksums.
 */
static void ext4_extent_block_csum_set(struct inode *inode,
                                       struct ext4_extent_header *eh)
{
        struct ext4_extent_tail *tail;

        if (ext4_has_metadata_csum(inode->i_sb)) {
                tail = find_ext4_extent_tail(eh);
                tail->et_checksum = ext4_extent_block_csum(inode, eh);
        }
}

/*
 * Forward declaration: ext4_force_split_extent_at() below calls this
 * function before its definition appears.
 */
static int ext4_split_extent_at(handle_t *handle,
                             struct inode *inode,
                             struct ext4_ext_path **ppath,
                             ext4_lblk_t split,
                             int split_flag,
                             int flags);

static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
        /*
         * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
         * moment, get_block can be called only for blocks inside i_size since
         * page cache has been already dropped and writes are blocked by
         * i_rwsem. So we can safely drop the i_data_sem here.
         */
        BUG_ON(EXT4_JOURNAL(inode) == NULL);
        ext4_discard_preallocations(inode);
        up_write(&EXT4_I(inode)->i_data_sem);
        *dropped = 1;
        return 0;
}

/*
 * Release the buffer heads referenced by a path.
 *
 * Walks path[0] .. path[path->p_depth], calls brelse() on each p_bh and
 * clears the pointer.  A NULL @path is accepted and ignored.
 */
static void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
        int last, lvl;

        if (path == NULL)
                return;

        /* The depth is taken from the first element only. */
        last = path->p_depth;
        for (lvl = 0; lvl <= last; lvl++) {
                brelse(path[lvl].p_bh);
                path[lvl].p_bh = NULL;
        }
}

/*
 * Release a path: drop the buffer head references it holds (see
 * ext4_ext_drop_refs()) and free the path array itself.
 * ext4_ext_drop_refs() ignores a NULL @path.
 */
void ext4_free_ext_path(struct ext4_ext_path *path)
{
        ext4_ext_drop_refs(path);
        kfree(path);
}

/*
 * Make sure @handle has at least @check_cred credits.  If it does not, the
 * transaction is restarted with @restart_cred credits (@revoke_cred is
 * handed through to the journal helper).  i_data_sem is dropped while the
 * transaction is restarted and re-acquired before returning.
 *
 * Returns 0 on success, 1 if the transaction had to be restarted, and < 0
 * in case of fatal error.
 */
int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
                                int check_cred, int restart_cred,
                                int revoke_cred)
{
        int sem_released = 0;
        int ret;

        /*
         * NOTE(review): this presumably relies on
         * ext4_journal_ensure_credits_fn() evaluating its last argument only
         * when a restart is actually needed - confirm in ext4_jbd2.h.
         */
        ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
                revoke_cred, ext4_ext_trunc_restart_fn(inode, &sem_released));

        /* The restart callback released i_data_sem: take it back. */
        if (sem_released)
                down_write(&EXT4_I(inode)->i_data_sem);

        return ret;
}

/*
 * Get journal write access to the tree node that @path points to.
 *
 * When the node lives in the inode body (no buffer head) nothing has to be
 * done and 0 is returned.
 *
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
                                struct ext4_ext_path *path)
{
        struct buffer_head *bh = path->p_bh;
        int err;

        /*
         * path points to leaf/index in inode body:
         * we use in-core data, no need to protect them
         */
        if (!bh)
                return 0;

        /* path points to block */
        BUFFER_TRACE(bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
                                            EXT4_JTR_NONE);
        if (err)
                return err;

        /*
         * The extent buffer's verified bit will be set again in
         * __ext4_ext_dirty().  If the extent update procedure breaks off
         * because of an error, the buffer could be left inconsistent, so
         * force it to be checked again.
         */
        clear_buffer_verified(bh);
        return 0;
}

/*
 * Mark the tree node that @path points to as dirty.
 *
 * For a node in a block the checksum is refreshed, the buffer is handed to
 * the journal as dirty metadata and, on success, flagged verified again.
 * For a node in the inode body the inode itself is marked dirty.
 * @where and @line identify the caller for the journal helper.
 *
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
static int __ext4_ext_dirty(const char *where, unsigned int line,
                            handle_t *handle, struct inode *inode,
                            struct ext4_ext_path *path)
{
        struct buffer_head *bh;
        int err;

        WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));

        bh = path->p_bh;
        if (!bh) {
                /* path points to leaf/index in inode body */
                return ext4_mark_inode_dirty(handle, inode);
        }

        /* path points to block */
        ext4_extent_block_csum_set(inode, ext_block_hdr(bh));
        err = __ext4_handle_dirty_metadata(where, line, handle, inode, bh);
        if (err)
                return err;

        /* Extents updating done, re-set verified flag */
        set_buffer_verified(bh);
        return 0;
}

/*
 * Convenience wrapper around __ext4_ext_dirty() that supplies the calling
 * function's name and line number.
 */
#define ext4_ext_dirty(handle, inode, path) \
                __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))

/*
 * Pick a goal physical block for allocating logical block @block.
 *
 * With a path whose deepest level has an extent, the goal is that extent's
 * physical start shifted by the logical distance between @block and the
 * extent's first block.  Without an extent, the block number of the deepest
 * level's buffer is used if there is one.  Otherwise (or with no path at
 * all) the goal derived from the inode is returned.
 */
static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
                              struct ext4_ext_path *path,
                              ext4_lblk_t block)
{
        struct ext4_ext_path *last;
        struct ext4_extent *ex;

        if (!path)
                return ext4_inode_to_goal_block(inode);

        last = &path[path->p_depth];

        /*
         * Try to predict block placement on the assumption that the file
         * being filled in will eventually be non-sparse: for example
         * libbfd writing the sections of an ELF object out of order but
         * ending up with a contiguous object or executable, or a database
         * extending a table space file.  This is somewhat non-ideal for a
         * file that stays sparse, such as a raw image written by qemu or
         * KVM, because it fragments the file system's free space.  Maybe
         * we should have some heuristics, or a way for userspace to pass a
         * hint to the file system, especially if the latter case turns out
         * to be common.
         */
        ex = last->p_ext;
        if (ex) {
                ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
                ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

                return block > ext_block ?
                        ext_pblk + (block - ext_block) :
                        ext_pblk - (ext_block - block);
        }

        /*
         * it looks like index is empty;
         * try to find starting block from index itself
         */
        if (last->p_bh)
                return last->p_bh->b_blocknr;

        /* OK. use inode's group */
        return ext4_inode_to_goal_block(inode);
}

/*
 * Allocate a block for extent tree metadata.
 *
 * The allocation goal is derived from @path and the first logical block of
 * @ex; @flags and @err are passed on to ext4_new_meta_blocks(), whose result
 * is returned.
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
                        struct ext4_ext_path *path,
                        struct ext4_extent *ex, int *err, unsigned int flags)
{
        ext4_fsblk_t goal;

        goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));

        return ext4_new_meta_blocks(handle, inode, goal, flags, NULL, err);
}

/*
 * Number of struct ext4_extent entries that fit into one filesystem block
 * after the extent header.  With AGGRESSIVE_TEST defined and @check zero,
 * the result is capped at 6.
 */
static inline int ext4_ext_space_block(struct inode *inode, int check)
{
        int nr = (inode->i_sb->s_blocksize -
                  sizeof(struct ext4_extent_header)) /
                 sizeof(struct ext4_extent);

#ifdef AGGRESSIVE_TEST
        if (nr > 6 && !check)
                nr = 6;
#endif
        return nr;
}

/*
 * Number of struct ext4_extent_idx entries that fit into one filesystem
 * block after the extent header.  With AGGRESSIVE_TEST defined and @check
 * zero, the result is capped at 5.
 */
static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
        int nr = (inode->i_sb->s_blocksize -
                  sizeof(struct ext4_extent_header)) /
                 sizeof(struct ext4_extent_idx);

#ifdef AGGRESSIVE_TEST
        if (nr > 5 && !check)
                nr = 5;
#endif
        return nr;
}

/*
 * Number of struct ext4_extent entries that fit into the inode's i_data
 * area after the extent header.  With AGGRESSIVE_TEST defined and @check
 * zero, the result is capped at 3.
 */
static inline int ext4_ext_space_root(struct inode *inode, int check)
{
        int nr = (sizeof(EXT4_I(inode)->i_data) -
                  sizeof(struct ext4_extent_header)) /
                 sizeof(struct ext4_extent);

#ifdef AGGRESSIVE_TEST
        if (nr > 3 && !check)
                nr = 3;
#endif
        return nr;
}

/*
 * Number of struct ext4_extent_idx entries that fit into the inode's i_data
 * area after the extent header.  With AGGRESSIVE_TEST defined and @check
 * zero, the result is capped at 4.
 */
static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
        int nr = (sizeof(EXT4_I(inode)->i_data) -
                  sizeof(struct ext4_extent_header)) /
                 sizeof(struct ext4_extent_idx);

#ifdef AGGRESSIVE_TEST
        if (nr > 4 && !check)
                nr = 4;
#endif
        return nr;
}

/*
 * Split the extent at the deepest level of *@ppath at logical block @lblk
 * through ext4_split_extent_at().
 *
 * If that extent is unwritten, both halves are requested to stay unwritten.
 * A non-zero @nofail adds the NOFAIL flags to the request.
 */
static inline int
ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
                           struct ext4_ext_path **ppath, ext4_lblk_t lblk,
                           int nofail)
{
        struct ext4_ext_path *path = *ppath;
        int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
        int split_flag = 0;

        if (ext4_ext_is_unwritten(path[path->p_depth].p_ext))
                split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2;

        if (nofail)
                flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;

        return ext4_split_extent_at(handle, inode, ppath, lblk, split_flag,
                                    flags);
}

/*
 * Largest number of entries a tree node at @depth may hold.
 *
 * A node whose depth equals the tree depth is the root in the inode body,
 * any other node is a block; depth 0 means a leaf holding extents, any
 * other depth an index node.
 */
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
        int at_root = (depth == ext_depth(inode));

        if (depth == 0)
                return at_root ? ext4_ext_space_root(inode, 1) :
                                 ext4_ext_space_block(inode, 1);

        return at_root ? ext4_ext_space_root_idx(inode, 1) :
                         ext4_ext_space_block_idx(inode, 1);
}

/*
 * Sanity-check a single extent.
 *
 * Returns 0 if the extent has zero length or its logical range wraps
 * around; otherwise returns the result of ext4_inode_block_valid() for its
 * physical range.
 */
static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
        ext4_lblk_t first = le32_to_cpu(ext->ee_block);
        int len = ext4_ext_get_actual_len(ext);

        /*
         * We allow neither:
         *  - zero length
         *  - overflow/wrap-around
         */
        if (first + len <= first)
                return 0;

        return ext4_inode_block_valid(inode, ext4_ext_pblock(ext), len);
}

/*
 * Sanity-check a single index entry: the one block it points to must pass
 * ext4_inode_block_valid().
 */
static int ext4_valid_extent_idx(struct inode *inode,
                                struct ext4_extent_idx *ext_idx)
{
        return ext4_inode_block_valid(inode, ext4_idx_pblock(ext_idx), 1);
}

/*
 * Validate all entries of a tree node.
 *
 * @eh:    header of the node
 * @lblk:  logical block recorded for this node in its parent; only compared
 *         when the node is not the root
 * @pblk:  on an overlap, set to the physical block of the offending entry
 * @depth: depth of the node (0 means leaf)
 *
 * Each entry must be valid on its own, must not start before the end of the
 * previous one, and for non-root nodes the first entry must start at @lblk.
 *
 * Returns 1 if the node is acceptable (an empty node always is), 0
 * otherwise.
 */
static int ext4_valid_extent_entries(struct inode *inode,
                                     struct ext4_extent_header *eh,
                                     ext4_lblk_t lblk, ext4_fsblk_t *pblk,
                                     int depth)
{
        unsigned short nr = le16_to_cpu(eh->eh_entries);
        ext4_lblk_t next_free = 0;
        ext4_lblk_t start;
        int is_root;

        if (nr == 0)
                return 1;

        is_root = (depth == ext_depth(inode));

        if (depth == 0) {
                /* leaf entries */
                struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
                struct ext4_extent *end = ex + nr;

                /*
                 * The logical block in the first entry should equal to
                 * the number in the index block.
                 */
                if (!is_root && le32_to_cpu(ex->ee_block) != lblk)
                        return 0;

                for (; ex < end; ex++) {
                        if (!ext4_valid_extent(inode, ex))
                                return 0;

                        /* Check for overlapping extents */
                        start = le32_to_cpu(ex->ee_block);
                        if (start < next_free) {
                                *pblk = ext4_ext_pblock(ex);
                                return 0;
                        }
                        next_free = start + ext4_ext_get_actual_len(ex);
                }
        } else {
                struct ext4_extent_idx *ix = EXT_FIRST_INDEX(eh);
                struct ext4_extent_idx *end = ix + nr;

                /*
                 * The logical block in the first entry should equal to
                 * the number in the parent index block.
                 */
                if (!is_root && le32_to_cpu(ix->ei_block) != lblk)
                        return 0;

                for (; ix < end; ix++) {
                        if (!ext4_valid_extent_idx(inode, ix))
                                return 0;

                        /* Check for overlapping index extents */
                        start = le32_to_cpu(ix->ei_block);
                        if (start < next_free) {
                                *pblk = ext4_idx_pblock(ix);
                                return 0;
                        }
                        next_free = start + 1;
                }
        }

        return 1;
}

/*
 * Validate the header and the entries of one extent tree node.
 *
 * @function, @line: caller identification used in the error report
 * @eh:    header of the node to check
 * @depth: depth the node is expected to have
 * @pblk:  physical block of the node, used for the error report only; it may
 *         be replaced by ext4_valid_extent_entries() when entries overlap
 * @lblk:  logical block passed on to ext4_valid_extent_entries()
 *
 * The checks run in a fixed order and the first failure is the one that gets
 * reported.  Returns 0 if the node is fine, -EFSCORRUPTED for a structural
 * problem and -EFSBADCRC for a checksum mismatch.
 */
static int __ext4_ext_check(const char *function, unsigned int line,
                            struct inode *inode, struct ext4_extent_header *eh,
                            int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
{
        const char *error_msg;
        /* max stays 0 in the report if a check fails before it is computed */
        int max = 0, err = -EFSCORRUPTED;

        if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
                error_msg = "invalid magic";
                goto corrupted;
        }
        if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
                error_msg = "unexpected eh_depth";
                goto corrupted;
        }
        if (unlikely(eh->eh_max == 0)) {
                error_msg = "invalid eh_max";
                goto corrupted;
        }
        max = ext4_ext_max_entries(inode, depth);
        if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
                error_msg = "too large eh_max";
                goto corrupted;
        }
        if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
                error_msg = "invalid eh_entries";
                goto corrupted;
        }
        /* only a leaf (depth 0) is allowed to be empty */
        if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
                error_msg = "eh_entries is 0 but eh_depth is > 0";
                goto corrupted;
        }
        if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
                error_msg = "invalid extent entries";
                goto corrupted;
        }
        if (unlikely(depth > 32)) {
                error_msg = "too large eh_depth";
                goto corrupted;
        }
        /* Verify checksum on non-root extent tree nodes */
        if (ext_depth(inode) != depth &&
            !ext4_extent_block_csum_verify(inode, eh)) {
                error_msg = "extent tree corrupted";
                err = -EFSBADCRC;
                goto corrupted;
        }
        return 0;

corrupted:
        /* err is negative; the error reporting helper is given -err */
        ext4_error_inode_err(inode, function, line, 0, -err,
                             "pblk %llu bad header/extent: %s - magic %x, "
                             "entries %u, max %u(%u), depth %u(%u)",
                             (unsigned long long) pblk, error_msg,
                             le16_to_cpu(eh->eh_magic),
                             le16_to_cpu(eh->eh_entries),
                             le16_to_cpu(eh->eh_max),
                             max, le16_to_cpu(eh->eh_depth), depth);
        return err;
}

/*
 * Wrapper around __ext4_ext_check() that supplies the calling function's
 * name and line number and passes 0 as the logical block.
 */
#define ext4_ext_check(inode, eh, depth, pblk)                        \
        __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)

/*
 * Validate the extent tree root stored in the inode body.
 * Returns the result of ext4_ext_check(): 0 if the root is fine, a negative
 * error code otherwise.
 */
int ext4_ext_check_inode(struct inode *inode)
{
        struct ext4_extent_header *root = ext_inode_hdr(inode);

        return ext4_ext_check(inode, root, ext_depth(inode), 0);
}

/*
 * Feed every extent of the leaf @eh to ext4_es_cache_extent().
 *
 * Each extent is cached as written or unwritten.  A gap between the end of
 * the previous extent and the start of the current one is cached as a hole;
 * nothing is cached for a gap in front of the first extent.
 */
static void ext4_cache_extents(struct inode *inode,
                               struct ext4_extent_header *eh)
{
        struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
        struct ext4_extent *end = ex + le16_to_cpu(eh->eh_entries);
        ext4_lblk_t prev_end = 0;

        for (; ex < end; ex++) {
                ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
                int len = ext4_ext_get_actual_len(ex);
                unsigned int status;

                /* hole between the previous extent and this one */
                if (prev_end && prev_end != lblk)
                        ext4_es_cache_extent(inode, prev_end, lblk - prev_end,
                                             ~0, EXTENT_STATUS_HOLE);

                status = ext4_ext_is_unwritten(ex) ? EXTENT_STATUS_UNWRITTEN :
                                                     EXTENT_STATUS_WRITTEN;
                ext4_es_cache_extent(inode, lblk, len,
                                     ext4_ext_pblock(ex), status);
                prev_end = lblk + len;
        }
}

/*
 * Read and validate the extent tree block that @idx points to.
 *
 * @function, @line: caller identification for error reports
 * @depth: depth the block is expected to have
 * @flags: EXT4_EX_NOFAIL makes the buffer allocation use __GFP_NOFAIL,
 *         EXT4_EX_FORCE_CACHE re-checks a buffer that is already marked
 *         verified, EXT4_EX_NOCACHE suppresses caching of leaf extents
 *
 * Returns the buffer head with a reference held, or an ERR_PTR() on failure
 * (the reference is dropped in that case).
 */
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
                         struct inode *inode, struct ext4_extent_idx *idx,
                         int depth, int flags)
{
        ext4_fsblk_t pblk = ext4_idx_pblock(idx);
        gfp_t gfp = __GFP_MOVABLE | GFP_NOFS;
        struct buffer_head *bh;
        int err;

        if (flags & EXT4_EX_NOFAIL)
                gfp |= __GFP_NOFAIL;

        bh = sb_getblk_gfp(inode->i_sb, pblk, gfp);
        if (unlikely(!bh))
                return ERR_PTR(-ENOMEM);

        if (!bh_uptodate_or_lock(bh)) {
                trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
                err = ext4_read_bh(bh, 0, NULL);
                if (err < 0)
                        goto fail;
        }

        /* Already verified and no forced re-caching requested: done. */
        if (!(flags & EXT4_EX_FORCE_CACHE) && buffer_verified(bh))
                return bh;

        err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
                               depth, pblk, le32_to_cpu(idx->ei_block));
        if (err)
                goto fail;
        set_buffer_verified(bh);

        /*
         * If this is a leaf block, cache all of its entries
         */
        if (depth == 0 && !(flags & EXT4_EX_NOCACHE))
                ext4_cache_extents(inode, ext_block_hdr(bh));

        return bh;

fail:
        put_bh(bh);
        return ERR_PTR(err);
}

/*
 * Wrapper around __read_extent_tree_block() that supplies the calling
 * function's name and line number.
 */
#define read_extent_tree_block(inode, idx, depth, flags)                \
        __read_extent_tree_block(__func__, __LINE__, (inode), (idx),        \
                                 (depth), (flags))

/*
 * This function is called to cache a file's extent information in the
 * extent status tree
 */
int ext4_ext_precache(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_ext_path *path = NULL;
        struct buffer_head *bh;
        int i = 0, depth, ret = 0;

        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return 0;        /* not an extent-mapped inode */

        down_read(&ei->i_data_sem);
        depth = ext_depth(inode);

        /* Don't cache anything if there are no external extent blocks */
        if (!depth) {
                up_read(&ei->i_data_sem);
                return ret;
        }

        path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
                       GFP_NOFS);
        if (path == NULL) {
                up_read(&ei->i_data_sem);
                return -ENOMEM;
        }

        path[0].p_hdr = ext_inode_hdr(inode);
        ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
        if (ret)
                goto out;
        path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
        while (i >= 0) {
                /*
                 * If this is a leaf block or we've reached the end of
                 * the index block, go up
                 */
                if ((i == depth) ||
                    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
                        brelse(path[i].p_bh);
                        path[i].p_bh = NULL;
                        i--;
                        continue;
                }
                bh = read_extent_tree_block(inode, path[i].p_idx++,
                                            depth - i - 1,
                                            EXT4_EX_FORCE_CACHE);
                if (IS_ERR(bh)) {
                        ret = PTR_ERR(bh);
                        break;
                }
                i++;
                path[i].p_bh = bh;
                path[i].p_hdr = ext_block_hdr(bh);
                path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
        }
        ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
        up_read(&ei->i_data_sem);
        ext4_free_ext_path(path);
        return ret;
}

#ifdef EXT_DEBUG
/*
 * Debug helper: print one item per level of @path - the index entry if the
 * level has one, else the extent if it has one, else "[]".
 */
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
        int last = path->p_depth;
        int lvl;

        ext_debug(inode, "path:");
        for (lvl = 0; lvl <= last; lvl++) {
                struct ext4_ext_path *p = &path[lvl];

                if (p->p_idx) {
                        ext_debug(inode, "  %d->%llu",
                                  le32_to_cpu(p->p_idx->ei_block),
                                  ext4_idx_pblock(p->p_idx));
                } else if (p->p_ext) {
                        ext_debug(inode, "  %d:[%d]%d:%llu ",
                                  le32_to_cpu(p->p_ext->ee_block),
                                  ext4_ext_is_unwritten(p->p_ext),
                                  ext4_ext_get_actual_len(p->p_ext),
                                  ext4_ext_pblock(p->p_ext));
                } else {
                        ext_debug(inode, "  []");
                }
        }
        ext_debug(inode, "\n");
}

/*
 * Debug helper: print every extent of the leaf found at
 * path[ext_depth(inode)].  A NULL @path is ignored.
 */
static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
        struct ext4_extent_header *eh;
        struct ext4_extent *ex;
        int depth = ext_depth(inode);
        int n;

        if (!path)
                return;

        eh = path[depth].p_hdr;

        ext_debug(inode, "Displaying leaf extents\n");

        ex = EXT_FIRST_EXTENT(eh);
        for (n = 0; n < le16_to_cpu(eh->eh_entries); n++, ex++) {
                ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
                          ext4_ext_is_unwritten(ex),
                          ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
        }
        ext_debug(inode, "\n");
}

/*
 * Debug helper: print the entries from the current position of
 * path[@level] up to the node's maximum slot, together with @newblock.
 * Index entries are printed when @level is not the tree depth, extents
 * otherwise.
 */
static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
                        ext4_fsblk_t newblock, int level)
{
        int depth = ext_depth(inode);
        struct ext4_extent_idx *idx;
        struct ext4_extent *ex;

        if (level != depth) {
                for (idx = path[level].p_idx;
                     idx <= EXT_MAX_INDEX(path[level].p_hdr); idx++) {
                        ext_debug(inode, "%d: move %d:%llu in new index %llu\n",
                                  level, le32_to_cpu(idx->ei_block),
                                  ext4_idx_pblock(idx), newblock);
                }
                return;
        }

        for (ex = path[depth].p_ext;
             ex <= EXT_MAX_EXTENT(path[depth].p_hdr); ex++) {
                ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n",
                                le32_to_cpu(ex->ee_block),
                                ext4_ext_pblock(ex),
                                ext4_ext_is_unwritten(ex),
                                ext4_ext_get_actual_len(ex),
                                newblock);
        }
}

#else
/* Without EXT_DEBUG the debug helpers expand to nothing. */
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#define ext4_ext_show_move(inode, path, newblock, level)
#endif

/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block: the last index
 * entry whose ei_block is not greater than @block is stored in path->p_idx.
 * The first entry is never compared, so it is the result when @block lies
 * before every other entry.
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
                        struct ext4_ext_path *path, ext4_lblk_t block)
{
        struct ext4_extent_header *eh = path->p_hdr;
        struct ext4_extent_idx *lo, *hi, *mid;

        ext_debug(inode, "binsearch for %u(idx):  ", block);

        lo = EXT_FIRST_INDEX(eh) + 1;
        hi = EXT_LAST_INDEX(eh);
        while (lo <= hi) {
                mid = lo + (hi - lo) / 2;
                ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", lo,
                          le32_to_cpu(lo->ei_block), mid,
                          le32_to_cpu(mid->ei_block),
                          hi, le32_to_cpu(hi->ei_block));

                if (le32_to_cpu(mid->ei_block) > block)
                        hi = mid - 1;
                else
                        lo = mid + 1;
        }

        /* lo is now one past the entry we are looking for */
        path->p_idx = lo - 1;
        ext_debug(inode, "  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
                  ext4_idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
        {
                /* cross-check the result with a linear scan */
                struct ext4_extent_idx *expect, *cur;
                int n;

                expect = cur = EXT_FIRST_INDEX(eh);
                for (n = 0; n < le16_to_cpu(eh->eh_entries); n++, cur++) {
                        if (n != 0 && le32_to_cpu(cur->ei_block) <=
                            le32_to_cpu(cur[-1].ei_block)) {
                                printk(KERN_DEBUG "k=%d, ix=0x%p, "
                                       "first=0x%p\n", n,
                                       cur, EXT_FIRST_INDEX(eh));
                                printk(KERN_DEBUG "%u <= %u\n",
                                       le32_to_cpu(cur->ei_block),
                                       le32_to_cpu(cur[-1].ei_block));
                        }
                        BUG_ON(n && le32_to_cpu(cur->ei_block)
                                           <= le32_to_cpu(cur[-1].ei_block));
                        if (block < le32_to_cpu(cur->ei_block))
                                break;
                        expect = cur;
                }
                BUG_ON(expect != path->p_idx);
        }
#endif

}

/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block: the last extent
 * whose ee_block is not greater than @block is stored in path->p_ext.  The
 * first extent is never compared, so it is the result when @block lies
 * before every other extent.  For an empty leaf path->p_ext is left
 * untouched.
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch(struct inode *inode,
                struct ext4_ext_path *path, ext4_lblk_t block)
{
        struct ext4_extent_header *eh = path->p_hdr;
        struct ext4_extent *lo, *hi, *mid;

        /*
         * this leaf is empty:
         * we get such a leaf in split/add case
         */
        if (eh->eh_entries == 0)
                return;

        ext_debug(inode, "binsearch for %u:  ", block);

        lo = EXT_FIRST_EXTENT(eh) + 1;
        hi = EXT_LAST_EXTENT(eh);

        while (lo <= hi) {
                mid = lo + (hi - lo) / 2;
                ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", lo,
                          le32_to_cpu(lo->ee_block), mid,
                          le32_to_cpu(mid->ee_block),
                          hi, le32_to_cpu(hi->ee_block));

                if (le32_to_cpu(mid->ee_block) > block)
                        hi = mid - 1;
                else
                        lo = mid + 1;
        }

        /* lo is now one past the extent we are looking for */
        path->p_ext = lo - 1;
        ext_debug(inode, "  -> %d:%llu:[%d]%d ",
                        le32_to_cpu(path->p_ext->ee_block),
                        ext4_ext_pblock(path->p_ext),
                        ext4_ext_is_unwritten(path->p_ext),
                        ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
        {
                /* cross-check the result with a linear scan */
                struct ext4_extent *expect, *cur;
                int n;

                expect = cur = EXT_FIRST_EXTENT(eh);
                for (n = 0; n < le16_to_cpu(eh->eh_entries); n++, cur++) {
                        BUG_ON(n && le32_to_cpu(cur->ee_block)
                                          <= le32_to_cpu(cur[-1].ee_block));
                        if (block < le32_to_cpu(cur->ee_block))
                                break;
                        expect = cur;
                }
                BUG_ON(expect != path->p_ext);
        }
#endif

}

/*
 * Initialise the extent tree root in the inode body as an empty leaf
 * (depth 0, no entries) and mark the inode dirty.
 */
void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
        struct ext4_extent_header *root = ext_inode_hdr(inode);

        root->eh_magic = EXT4_EXT_MAGIC;
        root->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
        root->eh_entries = 0;
        root->eh_depth = 0;
        root->eh_generation = 0;

        ext4_mark_inode_dirty(handle, inode);
}

/*
 * ext4_find_extent:
 * walk the extent tree from the root in the inode body down to the leaf
 * that covers logical block @block, recording one path element per level.
 *
 * @orig_path: optional; if it points to an existing path, that path is
 *             reused when it is deep enough (its buffer references are
 *             dropped first) and freed and replaced otherwise
 * @flags:     EXT4_EX_NOFAIL makes the path allocation use __GFP_NOFAIL;
 *             the flags are also handed to read_extent_tree_block();
 *             EXT4_EX_NOCACHE suppresses caching of the root's extents when
 *             the tree has depth 0
 *
 * On success the path is returned and, if @orig_path was given, also stored
 * in *@orig_path, so the caller's pointer never lags behind a reallocated
 * path.  In the last element p_ext is NULL if the leaf is empty.
 *
 * On failure an ERR_PTR() is returned (-EFSCORRUPTED for an invalid depth,
 * -ENOMEM, or the error from reading a tree block); the path has been freed
 * and *@orig_path, if given, is NULL.
 */
struct ext4_ext_path *
ext4_find_extent(struct inode *inode, ext4_lblk_t block,
                 struct ext4_ext_path **orig_path, int flags)
{
        struct ext4_extent_header *eh;
        struct buffer_head *bh;
        struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
        short int depth, i, ppos = 0;
        int ret;
        gfp_t gfp_flags = GFP_NOFS;

        if (flags & EXT4_EX_NOFAIL)
                gfp_flags |= __GFP_NOFAIL;

        eh = ext_inode_hdr(inode);
        depth = ext_depth(inode);
        if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
                EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
                                 depth);
                ret = -EFSCORRUPTED;
                goto err;
        }

        if (path) {
                ext4_ext_drop_refs(path);
                /* the caller's path is too small for the current tree */
                if (depth > path[0].p_maxdepth) {
                        kfree(path);
                        *orig_path = path = NULL;
                }
        }
        if (!path) {
                /* account possible depth increase */
                path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
                                gfp_flags);
                if (unlikely(!path))
                        return ERR_PTR(-ENOMEM);
                path[0].p_maxdepth = depth + 1;
        }
        path[0].p_hdr = eh;
        path[0].p_bh = NULL;

        i = depth;
        if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
                ext4_cache_extents(inode, eh);
        /* walk through the tree */
        while (i) {
                ext_debug(inode, "depth %d: num %d, max %d\n",
                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));

                ext4_ext_binsearch_idx(inode, path + ppos, block);
                path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
                path[ppos].p_depth = i;
                path[ppos].p_ext = NULL;

                /* the child sits one level closer to the leaves */
                bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
                if (IS_ERR(bh)) {
                        ret = PTR_ERR(bh);
                        goto err;
                }

                eh = ext_block_hdr(bh);
                ppos++;
                path[ppos].p_bh = bh;
                path[ppos].p_hdr = eh;
        }

        path[ppos].p_depth = i;
        path[ppos].p_ext = NULL;
        path[ppos].p_idx = NULL;

        /* find extent */
        ext4_ext_binsearch(inode, path + ppos, block);
        /* if not an empty leaf */
        if (path[ppos].p_ext)
                path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);

        ext4_ext_show_path(inode, path);

        /*
         * The path may have been reallocated above, in which case
         * *orig_path was reset to NULL.  Hand the valid path back to the
         * caller so its pointer does not stay NULL (leak or NULL
         * dereference).
         */
        if (orig_path)
                *orig_path = path;
        return path;

err:
        ext4_free_ext_path(path);
        if (orig_path)
                *orig_path = NULL;
        return ERR_PTR(ret);
}

/*
 * ext4_ext_insert_index:
 * add the index entry [@logical;@ptr] to the index block described by @curp.
 * The entry goes directly behind curp->p_idx when @logical is larger than
 * that entry's ei_block, and takes its place otherwise.
 *
 * Returns 0 on success, -EFSCORRUPTED when one of the sanity checks on the
 * block fails, or the error reported by ext4_ext_get_access() /
 * ext4_ext_dirty().
 */
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
                                 struct ext4_ext_path *curp,
                                 int logical, ext4_fsblk_t ptr)
{
        struct ext4_extent_idx *slot;
        int tail, ret;

        ret = ext4_ext_get_access(handle, inode, curp);
        if (ret)
                return ret;

        /* the key must not already be present at the current position */
        if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
                EXT4_ERROR_INODE(inode, "logical %d == ei_block %d!",
                                 logical,
                                 le32_to_cpu(curp->p_idx->ei_block));
                return -EFSCORRUPTED;
        }

        /* the block needs at least one unused slot */
        if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) >=
                     le16_to_cpu(curp->p_hdr->eh_max))) {
                EXT4_ERROR_INODE(inode, "eh_entries %d >= eh_max %d!",
                                 le16_to_cpu(curp->p_hdr->eh_entries),
                                 le16_to_cpu(curp->p_hdr->eh_max));
                return -EFSCORRUPTED;
        }

        /* pick the slot: behind the current entry, or in its place */
        slot = curp->p_idx;
        if (logical > le32_to_cpu(slot->ei_block)) {
                ext_debug(inode, "insert new index %d after: %llu\n",
                          logical, ptr);
                slot++;
        } else {
                ext_debug(inode, "insert new index %d before: %llu\n",
                          logical, ptr);
        }

        if (unlikely(slot > EXT_MAX_INDEX(curp->p_hdr))) {
                EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
                return -EFSCORRUPTED;
        }

        /* shift every entry from the slot onwards one position right */
        tail = EXT_LAST_INDEX(curp->p_hdr) - slot + 1;
        BUG_ON(tail < 0);
        if (tail > 0) {
                ext_debug(inode, "insert new index %d: "
                                "move %d indices from 0x%p to 0x%p\n",
                                logical, tail, slot, slot + 1);
                memmove(slot + 1, slot,
                        tail * sizeof(struct ext4_extent_idx));
        }

        /* fill the freed slot and account for the new entry */
        slot->ei_block = cpu_to_le32(logical);
        ext4_idx_store_pblock(slot, ptr);
        le16_add_cpu(&curp->p_hdr->eh_entries, 1);

        if (unlikely(slot > EXT_LAST_INDEX(curp->p_hdr))) {
                EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
                return -EFSCORRUPTED;
        }

        ret = ext4_ext_dirty(handle, inode, curp);
        ext4_std_error(inode->i_sb, ret);

        return ret;
}

/*
 * ext4_ext_split:
 * inserts new subtree into the path, using free index entry
 * at depth @at:
 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 * - makes decision where to split
 * - moves remaining extents and index entries (right to the split point)
 *   into the newly allocated blocks
 * - initializes subtree
 *
 * @flags is passed on to ext4_ext_new_meta_block(); when it contains
 * EXT4_EX_NOFAIL the bookkeeping array is allocated with __GFP_NOFAIL.
 * @path is the lookup path whose leaf (path[depth]) is being split and
 * @newext is the extent that triggered the split.
 *
 * Returns 0 on success or a negative error code.  On error every block
 * recorded in the bookkeeping array is handed back to ext4_free_blocks().
 */
static int ext4_ext_split(handle_t *handle, struct inode *inode,
                          unsigned int flags,
                          struct ext4_ext_path *path,
                          struct ext4_extent *newext, int at)
{
        struct buffer_head *bh = NULL;
        int depth = ext_depth(inode);
        struct ext4_extent_header *neh;
        struct ext4_extent_idx *fidx;
        int i = at, k, m, a;
        ext4_fsblk_t newblock, oldblock;
        __le32 border;
        ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
        gfp_t gfp_flags = GFP_NOFS;
        int err = 0;
        size_t ext_size = 0;

        if (flags & EXT4_EX_NOFAIL)
                gfp_flags |= __GFP_NOFAIL;

        /* make decision: where to split? */
        /* FIXME: now decision is simplest: at current extent */

        /* if current leaf will be split, then we should use
         * border from split point */
        if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
                EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
                return -EFSCORRUPTED;
        }
        if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
                border = path[depth].p_ext[1].ee_block;
                ext_debug(inode, "leaf will be split."
                                " next leaf starts at %d\n",
                                  le32_to_cpu(border));
        } else {
                border = newext->ee_block;
                ext_debug(inode, "leaf will be added."
                                " next leaf starts at %d\n",
                                le32_to_cpu(border));
        }

        /*
         * Number of intermediate index blocks to create.  Check it before
         * anything is allocated or touched: with @at >= depth the allocation
         * loop below does not run, and ablocks[--a] would then read in front
         * of the array and treat whatever it finds as a freshly allocated
         * block.
         */
        k = depth - at - 1;
        if (unlikely(k < 0)) {
                EXT4_ERROR_INODE(inode, "k %d < 0!", k);
                return -EFSCORRUPTED;
        }

        /*
         * If error occurs, then we break processing
         * and mark filesystem read-only. index won't
         * be inserted and tree will be in consistent
         * state. Next mount will repair buffers too.
         */

        /*
         * Get array to track all allocated blocks.
         * We need this to handle errors and free blocks
         * upon them.
         */
        ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags);
        if (!ablocks)
                return -ENOMEM;

        /* allocate all needed blocks */
        ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at);
        for (a = 0; a < depth - at; a++) {
                newblock = ext4_ext_new_meta_block(handle, inode, path,
                                                   newext, &err, flags);
                if (newblock == 0)
                        goto cleanup;
                ablocks[a] = newblock;
        }

        /* initialize new leaf */
        newblock = ablocks[--a];
        if (unlikely(newblock == 0)) {
                EXT4_ERROR_INODE(inode, "newblock == 0!");
                err = -EFSCORRUPTED;
                goto cleanup;
        }
        bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
        if (unlikely(!bh)) {
                err = -ENOMEM;
                goto cleanup;
        }
        lock_buffer(bh);

        err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
                                             EXT4_JTR_NONE);
        if (err)
                goto cleanup;

        neh = ext_block_hdr(bh);
        neh->eh_entries = 0;
        neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
        neh->eh_magic = EXT4_EXT_MAGIC;
        neh->eh_depth = 0;
        neh->eh_generation = 0;

        /* move remainder of path[depth] to the new leaf */
        if (unlikely(path[depth].p_hdr->eh_entries !=
                     path[depth].p_hdr->eh_max)) {
                /* header fields are little-endian on disk; convert to print */
                EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
                                 le16_to_cpu(path[depth].p_hdr->eh_entries),
                                 le16_to_cpu(path[depth].p_hdr->eh_max));
                err = -EFSCORRUPTED;
                goto cleanup;
        }
        /* start copy from next extent */
        m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
        ext4_ext_show_move(inode, path, newblock, depth);
        if (m) {
                struct ext4_extent *ex;
                ex = EXT_FIRST_EXTENT(neh);
                memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
                le16_add_cpu(&neh->eh_entries, m);
        }

        /* zero out unused area in the extent block */
        ext_size = sizeof(struct ext4_extent_header) +
                sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
        memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
        ext4_extent_block_csum_set(inode, neh);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);

        err = ext4_handle_dirty_metadata(handle, inode, bh);
        if (err)
                goto cleanup;
        brelse(bh);
        bh = NULL;

        /* correct old leaf */
        if (m) {
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
                        goto cleanup;
                le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
                err = ext4_ext_dirty(handle, inode, path + depth);
                if (err)
                        goto cleanup;

        }

        /* create intermediate indexes (k was validated above) */
        if (k)
                ext_debug(inode, "create %d intermediate indices\n", k);
        /* insert new index into current index block */
        /* current depth stored in i var */
        i = depth - 1;
        while (k--) {
                oldblock = newblock;
                newblock = ablocks[--a];
                bh = sb_getblk(inode->i_sb, newblock);
                if (unlikely(!bh)) {
                        err = -ENOMEM;
                        goto cleanup;
                }
                lock_buffer(bh);

                err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
                                                     EXT4_JTR_NONE);
                if (err)
                        goto cleanup;

                neh = ext_block_hdr(bh);
                neh->eh_entries = cpu_to_le16(1);
                neh->eh_magic = EXT4_EXT_MAGIC;
                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
                neh->eh_depth = cpu_to_le16(depth - i);
                neh->eh_generation = 0;
                fidx = EXT_FIRST_INDEX(neh);
                fidx->ei_block = border;
                ext4_idx_store_pblock(fidx, oldblock);

                ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n",
                                i, newblock, le32_to_cpu(border), oldblock);

                /* move remainder of path[i] to the new index block */
                if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
                                        EXT_LAST_INDEX(path[i].p_hdr))) {
                        /*
                         * path[i] is an index level, so only p_idx is valid
                         * here; p_ext is not set for index levels and must
                         * not be dereferenced for the report.
                         */
                        EXT4_ERROR_INODE(inode,
                                         "EXT_MAX_INDEX != EXT_LAST_INDEX ei_block %d!",
                                         le32_to_cpu(path[i].p_idx->ei_block));
                        err = -EFSCORRUPTED;
                        goto cleanup;
                }
                /* start copy indexes */
                m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
                ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
                                EXT_MAX_INDEX(path[i].p_hdr));
                ext4_ext_show_move(inode, path, newblock, i);
                if (m) {
                        memmove(++fidx, path[i].p_idx,
                                sizeof(struct ext4_extent_idx) * m);
                        le16_add_cpu(&neh->eh_entries, m);
                }
                /* zero out unused area in the (index) block */
                ext_size = sizeof(struct ext4_extent_header) +
                   (sizeof(struct ext4_extent_idx) *
                    le16_to_cpu(neh->eh_entries));
                memset(bh->b_data + ext_size, 0,
                        inode->i_sb->s_blocksize - ext_size);
                ext4_extent_block_csum_set(inode, neh);
                set_buffer_uptodate(bh);
                unlock_buffer(bh);

                err = ext4_handle_dirty_metadata(handle, inode, bh);
                if (err)
                        goto cleanup;
                brelse(bh);
                bh = NULL;

                /* correct old index */
                if (m) {
                        err = ext4_ext_get_access(handle, inode, path + i);
                        if (err)
                                goto cleanup;
                        le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
                        err = ext4_ext_dirty(handle, inode, path + i);
                        if (err)
                                goto cleanup;
                }

                i--;
        }

        /* insert new index */
        err = ext4_ext_insert_index(handle, inode, path + at,
                                    le32_to_cpu(border), newblock);

cleanup:
        /* a buffer still held here was abandoned mid-initialization */
        if (bh) {
                if (buffer_locked(bh))
                        unlock_buffer(bh);
                brelse(bh);
        }

        if (err) {
                /* free all allocated blocks in error case */
                for (i = 0; i < depth; i++) {
                        if (!ablocks[i])
                                continue;
                        ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
                                         EXT4_FREE_BLOCKS_METADATA);
                }
        }
        kfree(ablocks);

        return err;
}

/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 *
 * @flags is passed to ext4_new_meta_blocks(); EXT4_MB_HINT_TRY_GOAL is
 * added to it when a goal next to the existing first index block is used.
 *
 * Returns 0 on success or a negative error code.
 */
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
                                 unsigned int flags)
{
        struct ext4_extent_header *neh;
        struct buffer_head *bh;
        ext4_fsblk_t newblock, goal = 0;
        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
        int err = 0;
        size_t ext_size = 0;

        /* Try to prepend new index to old one */
        if (ext_depth(inode))
                goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
        if (goal > le32_to_cpu(es->s_first_data_block)) {
                /* ask for the block directly in front of the first index block */
                flags |= EXT4_MB_HINT_TRY_GOAL;
                goal--;
        } else
                goal = ext4_inode_to_goal_block(inode);
        newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
                                        NULL, &err);
        if (newblock == 0)
                return err;

        /*
         * NOTE(review): on this and the later error paths newblock is not
         * handed back to the allocator by this function - confirm whether
         * that is intended.
         */
        bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
        if (unlikely(!bh))
                return -ENOMEM;
        lock_buffer(bh);

        err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
                                             EXT4_JTR_NONE);
        if (err) {
                unlock_buffer(bh);
                goto out;
        }

        /* the whole in-inode i_data area (header plus entries) is copied */
        ext_size = sizeof(EXT4_I(inode)->i_data);
        /* move top-level index/leaf into new block */
        memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
        /* zero out unused area in the extent block */
        memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);

        /* set size of new block */
        neh = ext_block_hdr(bh);
        /* old root could have indexes or leaves
         * so calculate e_max right way */
        if (ext_depth(inode))
                neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
        else
                neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
        neh->eh_magic = EXT4_EXT_MAGIC;
        ext4_extent_block_csum_set(inode, neh);
        set_buffer_uptodate(bh);
        set_buffer_verified(bh);
        unlock_buffer(bh);

        err = ext4_handle_dirty_metadata(handle, inode, bh);
        if (err)
                goto out;

        /* Update top-level index: num,max,pointer */
        /* from here on neh refers to the root header stored in the inode */
        neh = ext_inode_hdr(inode);
        neh->eh_entries = cpu_to_le16(1);
        ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
        if (neh->eh_depth == 0) {
                /* Root extent block becomes index block */
                neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
                /* the single index entry starts where the first extent did */
                EXT_FIRST_INDEX(neh)->ei_block =
                        EXT_FIRST_EXTENT(neh)->ee_block;
        }
        ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n",
                  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
                  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
                  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));

        /* the tree is now one level deeper */
        le16_add_cpu(&neh->eh_depth, 1);
        err = ext4_mark_inode_dirty(handle, inode);
out:
        brelse(bh);

        return err;
}

/*
 * ext4_ext_create_new_leaf:
 * looks for an index block with a free entry and splits there to add a
 * new leaf.  When no level has room the tree is grown by one level first,
 * and the search is repeated if the leaf is still full afterwards.
 *
 * After each successful split or grow the path is looked up again for
 * newext->ee_block through @ppath.  Returns 0 on success or a negative
 * error code.
 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
                                    unsigned int mb_flags,
                                    unsigned int gb_flags,
                                    struct ext4_ext_path **ppath,
                                    struct ext4_extent *newext)
{
        struct ext4_ext_path *path = *ppath;
        struct ext4_ext_path *curp;
        int depth, level, err = 0;

        for (;;) {
                level = depth = ext_depth(inode);

                /* climb from the leaf until a level with a free slot shows up */
                for (curp = path + depth;
                     level > 0 && !EXT_HAS_FREE_INDEX(curp);
                     level--, curp--)
                        ;

                /* we use already allocated block for index block,
                 * so subsequent data blocks should be contiguous */
                if (EXT_HAS_FREE_INDEX(curp)) {
                        /* room found: build the needed subtree below it
                         * and hang the new leaf there */
                        err = ext4_ext_split(handle, inode, mb_flags, path,
                                             newext, level);
                        if (err)
                                return err;

                        /* refill path */
                        path = ext4_find_extent(inode,
                                    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
                                    ppath, gb_flags);
                        if (IS_ERR(path))
                                err = PTR_ERR(path);
                        return err;
                }

                /* tree is full, time to grow in depth */
                err = ext4_ext_grow_indepth(handle, inode, mb_flags);
                if (err)
                        return err;

                /* refill path */
                path = ext4_find_extent(inode,
                                   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
                                    ppath, gb_flags);
                if (IS_ERR(path)) {
                        err = PTR_ERR(path);
                        return err;
                }

                /*
                 * only first (depth 0 -> 1) produces free space;
                 * in all other cases we have to split the grown tree
                 */
                depth = ext_depth(inode);
                if (path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max)
                        return err;

                /* leaf is still full: go round again and split */
        }
}

/*
 * Search for the closest allocated block to the left of *logical.
 * On a hit, *logical is set to that block and *phys to its physical
 * address.  If *logical lies before the first extent (or the tree holds
 * no extent at all), *phys is set to 0 and *logical is left untouched.
 * Returns 0 on success or a negative error code.
 */
static int ext4_ext_search_left(struct inode *inode,
                                struct ext4_ext_path *path,
                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
        struct ext4_extent_idx *idx;
        struct ext4_extent *ext;
        int level, len;

        if (unlikely(path == NULL)) {
                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
                return -EFSCORRUPTED;
        }
        level = path->p_depth;
        *phys = 0;

        /* empty tree: nothing to the left */
        if (level == 0 && path->p_ext == NULL)
                return 0;

        /* the extent in the path normally starts at or before *logical;
         * the exception is a lookup in front of the file's first extent */
        ext = path[level].p_ext;
        len = ext4_ext_get_actual_len(ext);

        if (*logical >= le32_to_cpu(ext->ee_block)) {
                /* *logical must lie past the end of this extent */
                if (unlikely(*logical < (le32_to_cpu(ext->ee_block) + len))) {
                        EXT4_ERROR_INODE(inode,
                                         "logical %d < ee_block %d + ee_len %d!",
                                         *logical, le32_to_cpu(ext->ee_block),
                                         len);
                        return -EFSCORRUPTED;
                }

                /* report the last block covered by the extent */
                *logical = le32_to_cpu(ext->ee_block) + len - 1;
                *phys = ext4_ext_pblock(ext) + len - 1;
                return 0;
        }

        /*
         * *logical precedes the extent: that is only consistent when the
         * path runs along the left edge of the tree at every level.
         */
        if (unlikely(EXT_FIRST_EXTENT(path[level].p_hdr) != ext)) {
                EXT4_ERROR_INODE(inode,
                                 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
                                 *logical, le32_to_cpu(ext->ee_block));
                return -EFSCORRUPTED;
        }
        for (level--; level >= 0; level--) {
                idx = path[level].p_idx;
                if (likely(idx == EXT_FIRST_INDEX(path[level].p_hdr)))
                        continue;
                EXT4_ERROR_INODE(inode,
                  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
                  idx != NULL ? le32_to_cpu(idx->ei_block) : 0,
                  le32_to_cpu(EXT_FIRST_INDEX(path[level].p_hdr)->ei_block),
                  level);
                return -EFSCORRUPTED;
        }
        return 0;
}

/*
 * Search for the closest allocated block to the right of *logical.
 * Returns 1 when one was found: *logical and *phys then describe the start
 * of that extent and, if @ret_ex is not NULL, the extent is copied there.
 * Returns 0 with *phys set to 0 when nothing lies to the right, or a
 * negative error code.
 */
static int ext4_ext_search_right(struct inode *inode,
                                 struct ext4_ext_path *path,
                                 ext4_lblk_t *logical, ext4_fsblk_t *phys,
                                 struct ext4_extent *ret_ex)
{
        struct buffer_head *bh = NULL;
        struct ext4_extent_header *hdr;
        struct ext4_extent_idx *idx;
        struct ext4_extent *ext;
        int level;        /* counted from the top of the tree, NOT eh_depth */
        int len;

        if (unlikely(path == NULL)) {
                EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
                return -EFSCORRUPTED;
        }
        level = path->p_depth;
        *phys = 0;

        /* empty tree: nothing to the right */
        if (level == 0 && path->p_ext == NULL)
                return 0;

        /* the extent in the path normally starts at or before *logical;
         * the exception is a lookup in front of the file's first extent */
        ext = path[level].p_ext;
        len = ext4_ext_get_actual_len(ext);
        if (*logical < le32_to_cpu(ext->ee_block)) {
                /* only valid along the left edge of the tree */
                if (unlikely(EXT_FIRST_EXTENT(path[level].p_hdr) != ext)) {
                        EXT4_ERROR_INODE(inode,
                                         "first_extent(path[%d].p_hdr) != ex",
                                         level);
                        return -EFSCORRUPTED;
                }
                for (level--; level >= 0; level--) {
                        idx = path[level].p_idx;
                        if (likely(idx == EXT_FIRST_INDEX(path[level].p_hdr)))
                                continue;
                        EXT4_ERROR_INODE(inode,
                                         "ix != EXT_FIRST_INDEX *logical %d!",
                                         *logical);
                        return -EFSCORRUPTED;
                }
                /* the path's own extent is the answer */
                goto found_extent;
        }

        if (unlikely(*logical < (le32_to_cpu(ext->ee_block) + len))) {
                EXT4_ERROR_INODE(inode,
                                 "logical %d < ee_block %d + ee_len %d!",
                                 *logical, le32_to_cpu(ext->ee_block), len);
                return -EFSCORRUPTED;
        }

        if (ext != EXT_LAST_EXTENT(path[level].p_hdr)) {
                /* next allocated block in this leaf */
                ext++;
                goto found_extent;
        }

        /* go up and search for index to the right */
        for (level--; level >= 0; level--) {
                idx = path[level].p_idx;
                if (idx != EXT_LAST_INDEX(path[level].p_hdr))
                        goto got_index;
        }

        /* we've gone up to the root and found no index to the right */
        return 0;

got_index:
        /* step to the right-hand neighbour and descend along the first
         * index of every block below it until the leaf is reached */
        idx++;
        for (level++; level < path->p_depth; level++) {
                /* subtract from p_depth to get proper eh_depth */
                bh = read_extent_tree_block(inode, idx,
                                            path->p_depth - level, 0);
                if (IS_ERR(bh))
                        return PTR_ERR(bh);
                hdr = ext_block_hdr(bh);
                idx = EXT_FIRST_INDEX(hdr);
                put_bh(bh);
        }

        /* read the leaf; its first extent is the closest one to the right */
        bh = read_extent_tree_block(inode, idx, path->p_depth - level, 0);
        if (IS_ERR(bh))
                return PTR_ERR(bh);
        hdr = ext_block_hdr(bh);
        ext = EXT_FIRST_EXTENT(hdr);
found_extent:
        *logical = le32_to_cpu(ext->ee_block);
        *phys = ext4_ext_pblock(ext);
        if (ret_ex)
                *ret_ex = *ext;
        /* bh is only set when the leaf was read above */
        if (bh)
                put_bh(bh);
        return 1;
}

/*
 * ext4_ext_next_allocated_block:
 * returns the starting logical block of the entry that follows the path's
 * current position, or EXT_MAX_BLOCKS when there is none.
 * NOTE: the block number stored in an index entry is treated as an
 * allocated block, so index entries have to be consistent with leaves.
 */
ext4_lblk_t
ext4_ext_next_allocated_block(struct ext4_ext_path *path)
{
        int level;

        BUG_ON(path == NULL);

        /* a tree without any extent has nothing allocated */
        if (path->p_depth == 0 && path->p_ext == NULL)
                return EXT_MAX_BLOCKS;

        /* walk from the leaf towards the root, looking for a right sibling */
        for (level = path->p_depth; level >= 0; level--) {
                struct ext4_ext_path *cur = path + level;

                if (level != path->p_depth) {
                        /* index */
                        if (cur->p_idx != EXT_LAST_INDEX(cur->p_hdr))
                                return le32_to_cpu(cur->p_idx[1].ei_block);
                        continue;
                }

                /* leaf */
                if (cur->p_ext && cur->p_ext != EXT_LAST_EXTENT(cur->p_hdr))
                        return le32_to_cpu(cur->p_ext[1].ee_block);
        }

        return EXT_MAX_BLOCKS;
}

/*
 * ext4_ext_next_leaf_block:
 * returns the first logical block covered by the leaf that follows the
 * path's current leaf, or EXT_MAX_BLOCKS if there is no such leaf.
 */
static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
{
        int level;

        BUG_ON(path == NULL);

        /* zero-tree has no leaf blocks at all */
        if (path->p_depth == 0)
                return EXT_MAX_BLOCKS;

        /* start at the index block above the leaf and climb to the root */
        for (level = path->p_depth - 1; level >= 0; level--) {
                struct ext4_ext_path *cur = path + level;

                if (cur->p_idx != EXT_LAST_INDEX(cur->p_hdr))
                        return (ext4_lblk_t)
                                le32_to_cpu(cur->p_idx[1].ei_block);
        }

        return EXT_MAX_BLOCKS;
}

/*
 * ext4_ext_correct_indexes:
 * when the path's extent is the first one in its leaf, copy its starting
 * block into the index entry above it, and keep going upwards for as long
 * as the entry just below is itself the first one in its block.
 * Returns 0 on success or a negative error code.
 * TODO: do we need to correct tree in all cases?
 */
static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
                                struct ext4_ext_path *path)
{
        int depth = ext_depth(inode);
        struct ext4_extent_header *leaf_hdr = path[depth].p_hdr;
        struct ext4_extent *leaf_ex = path[depth].p_ext;
        __le32 first_block;
        int level, err = 0;

        if (unlikely(leaf_ex == NULL || leaf_hdr == NULL)) {
                EXT4_ERROR_INODE(inode,
                                 "ex %p == NULL or eh %p == NULL",
                                 leaf_ex, leaf_hdr);
                return -EFSCORRUPTED;
        }

        /*
         * Nothing to do when there is no tree above the leaf, or when the
         * extent is not the first one in the leaf.
         */
        if (depth == 0 || leaf_ex != EXT_FIRST_EXTENT(leaf_hdr))
                return 0;

        /*
         * TODO: we need correction if border is smaller than current one
         */
        first_block = leaf_ex->ee_block;

        /* the index entry directly above the leaf is always updated */
        level = depth - 1;
        err = ext4_ext_get_access(handle, inode, path + level);
        if (err)
                return err;
        path[level].p_idx->ei_block = first_block;
        err = ext4_ext_dirty(handle, inode, path + level);
        if (err)
                return err;

        /* change all left-side indexes */
        for (level = depth - 2; level >= 0; level--) {
                struct ext4_ext_path *below = path + level + 1;

                /* stop once the entry below is not first in its block */
                if (below->p_idx != EXT_FIRST_INDEX(below->p_hdr))
                        return err;

                err = ext4_ext_get_access(handle, inode, path + level);
                if (err)
                        return err;
                path[level].p_idx->ei_block = first_block;
                err = ext4_ext_dirty(handle, inode, path + level);
                if (err)
                        return err;
        }

        return err;
}

/*
 * Decide whether @ex2 can be folded into @ex1.  Both must have the same
 * unwritten state, @ex2 must start logically and physically right where
 * @ex1 ends, and the combined length must stay within the limit for that
 * kind of extent.  Returns 1 if they can be merged, 0 otherwise.
 */
static int ext4_can_extents_be_merged(struct inode *inode,
                                      struct ext4_extent *ex1,
                                      struct ext4_extent *ex2)
{
        unsigned short len1, len2;

        /* written and unwritten extents never mix */
        if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
                return 0;

        len1 = ext4_ext_get_actual_len(ex1);
        len2 = ext4_ext_get_actual_len(ex2);

        /* ex2 has to begin at the logical block right after ex1 */
        if (le32_to_cpu(ex1->ee_block) + len1 != le32_to_cpu(ex2->ee_block))
                return 0;

        /* the merged extent must not exceed the length limits */
        if (len1 + len2 > EXT_INIT_MAX_LEN)
                return 0;
        if (ext4_ext_is_unwritten(ex1) &&
            len1 + len2 > EXT_UNWRITTEN_MAX_LEN)
                return 0;
#ifdef AGGRESSIVE_TEST
        if (len1 >= 4)
                return 0;
#endif

        /* finally, the physical ranges have to be back to back as well */
        return ext4_ext_pblock(ex1) + len1 == ext4_ext_pblock(ex2);
}

/*
 * Try to merge the extent @ex with the extent that follows it in the leaf,
 * repeating for as long as the next neighbour is mergeable too.
 * Merging is always towards the right; to merge towards the left, pass
 * "ex - 1" instead of "ex".
 * Returns 1 if at least one merge happened and 0 if none did.
 */
static int ext4_ext_try_to_merge_right(struct inode *inode,
                                 struct ext4_ext_path *path,
                                 struct ext4_extent *ex)
{
        struct ext4_extent_header *leaf;
        unsigned int level, tail_bytes;
        int merged = 0, was_unwritten;

        level = ext_depth(inode);
        BUG_ON(path[level].p_hdr == NULL);
        leaf = path[level].p_hdr;

        while (ex < EXT_LAST_EXTENT(leaf) &&
               ext4_can_extents_be_merged(inode, ex, ex + 1)) {
                /* absorb the neighbour's length; the unwritten state is
                 * sampled first and re-applied after the length is stored */
                was_unwritten = ext4_ext_is_unwritten(ex);
                ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                + ext4_ext_get_actual_len(ex + 1));
                if (was_unwritten)
                        ext4_ext_mark_unwritten(ex);

                /* close the gap left by the absorbed extent */
                if (ex + 1 < EXT_LAST_EXTENT(leaf)) {
                        tail_bytes = (EXT_LAST_EXTENT(leaf) - ex - 1)
                                * sizeof(struct ext4_extent);
                        memmove(ex + 1, ex + 2, tail_bytes);
                }
                le16_add_cpu(&leaf->eh_entries, -1);
                merged = 1;

                /* a merge can never leave the leaf empty */
                WARN_ON(leaf->eh_entries == 0);
                if (!leaf->eh_entries)
                        EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
        }

        return merged;
}

/*
 * This function does a very simple check to see if we can collapse
 * an extent tree with a single extent tree leaf block into the inode.
 *
 * The collapse is attempted only when the tree has depth 1, the root holds
 * exactly one index entry, and the leaf's entries fit into the root.  It is
 * silently skipped when the journal handle cannot be extended.  On success
 * path[0] describes the new in-inode leaf, and the old leaf block has been
 * released and freed.
 */
static void ext4_ext_try_to_merge_up(handle_t *handle,
                                     struct inode *inode,
                                     struct ext4_ext_path *path)
{
        size_t s;
        unsigned max_root = ext4_ext_space_root(inode, 0);
        ext4_fsblk_t blk;

        if ((path[0].p_depth != 1) ||
            (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
            (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
                return;

        /*
         * We need to modify the block allocation bitmap and the block
         * group descriptor to release the extent tree block.  If we
         * can't get the journal credits, give up.
         */
        if (ext4_journal_extend(handle, 2,
                        ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
                return;

        /*
         * Copy the extent data up to the inode.  path[1] is the leaf
         * (p_depth is 1), so its entries are extents, not index entries.
         */
        blk = ext4_idx_pblock(path[0].p_idx);
        s = le16_to_cpu(path[1].p_hdr->eh_entries) *
                sizeof(struct ext4_extent);
        s += sizeof(struct ext4_extent_header);

        path[1].p_maxdepth = path[0].p_maxdepth;
        memcpy(path[0].p_hdr, path[1].p_hdr, s);
        path[0].p_depth = 0;
        /* keep pointing at the same extent, now inside the root */
        path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
                (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
        path[0].p_hdr->eh_max = cpu_to_le16(max_root);

        brelse(path[1].p_bh);
        /*
         * The reference is gone: clear the pointer so that a later release
         * of this path cannot drop the same buffer a second time.
         */
        path[1].p_bh = NULL;
        ext4_free_blocks(handle, inode, NULL, blk, 1,
                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
}

/*
 * Try to merge @ex with its neighbours in the leaf (left neighbour first,
 * then right), and afterwards try to collapse the extent tree into the
 * inode.
 */
static void ext4_ext_try_to_merge(handle_t *handle,
                                  struct inode *inode,
                                  struct ext4_ext_path *path,
                                  struct ext4_extent *ex)
{
        unsigned int depth = ext_depth(inode);
        struct ext4_extent_header *eh;

        BUG_ON(path[depth].p_hdr == NULL);
        eh = path[depth].p_hdr;

        /*
         * Merge the left neighbour into @ex's position when there is one;
         * only if that did not happen is a rightward merge starting at
         * @ex itself attempted.
         */
        if (ex <= EXT_FIRST_EXTENT(eh) ||
            !ext4_ext_try_to_merge_right(inode, path, ex - 1))
                (void) ext4_ext_try_to_merge_right(inode, path, ex);

        ext4_ext_try_to_merge_up(handle, inode, path);
}

/*
 * Check whether part of the "newext" extent overlaps an existing extent.
 *
 * When an overlap is found, the length of newext is trimmed so that the
 * overlap disappears and 1 is returned.  When nothing overlaps, 0 is
 * returned and newext is left alone.
 */
static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
                                           struct inode *inode,
                                           struct ext4_extent *newext,
                                           struct ext4_ext_path *path)
{
        ext4_lblk_t new_start = le32_to_cpu(newext->ee_block);
        unsigned int new_len = ext4_ext_get_actual_len(newext);
        unsigned int depth = ext_depth(inode);
        unsigned int trimmed = 0;
        ext4_lblk_t limit;

        /* no extent in the path: nothing to overlap with */
        if (!path[depth].p_ext)
                return 0;

        limit = EXT4_LBLK_CMASK(sbi,
                                le32_to_cpu(path[depth].p_ext->ee_block));

        /*
         * The extent in the path starts before the requested block(s):
         * what matters is the next allocated block instead.
         */
        if (limit < new_start) {
                limit = ext4_ext_next_allocated_block(path);
                if (limit == EXT_MAX_BLOCKS)
                        return 0;
                limit = EXT4_LBLK_CMASK(sbi, limit);
        }

        /* the logical range must not wrap through zero */
        if (new_start + new_len < new_start) {
                new_len = EXT_MAX_BLOCKS - new_start;
                newext->ee_len = cpu_to_le16(new_len);
                trimmed = 1;
        }

        /* cut newext short where it would run into the existing extent */
        if (new_start + new_len > limit) {
                newext->ee_len = cpu_to_le16(limit - new_start);
                trimmed = 1;
        }

        return trimmed;
}

/*
 * ext4_ext_insert_extent:
 * tries to merge requested extent into the existing extent or
 * inserts requested extent as new one into the tree,
 * creating new leaf in the no-space case.
 *
 * @handle:   journal handle used for all metadata modifications
 * @inode:    inode owning the extent tree
 * @ppath:    address of the caller's path to the leaf; it is handed to
 *            ext4_ext_create_new_leaf(), which is given the chance to
 *            substitute a different path, so the local copy is reloaded
 *            from *ppath after that call
 * @newext:   extent to insert (must have a non-zero length)
 * @gb_flags: EXT4_GET_BLOCKS_* flags; PRE_IO disables all merging
 *
 * Returns -EFSCORRUPTED for a zero-length @newext or a missing leaf header,
 * the error of the first failing helper, or otherwise the result of the
 * final ext4_ext_dirty() call.
 */
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
                                struct ext4_ext_path **ppath,
                                struct ext4_extent *newext, int gb_flags)
{
        struct ext4_ext_path *path = *ppath;
        struct ext4_extent_header *eh;
        struct ext4_extent *ex, *fex;
        struct ext4_extent *nearex; /* nearest extent */
        struct ext4_ext_path *npath = NULL;
        int depth, len, err;
        ext4_lblk_t next;
        int mb_flags = 0, unwritten;

        if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                mb_flags |= EXT4_MB_DELALLOC_RESERVED;
        if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
                EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
                return -EFSCORRUPTED;
        }
        depth = ext_depth(inode);
        ex = path[depth].p_ext;
        eh = path[depth].p_hdr;
        if (unlikely(path[depth].p_hdr == NULL)) {
                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
                return -EFSCORRUPTED;
        }

        /* try to insert block into found extent and return */
        if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {

                /*
                 * Try to see whether we should rather test the extent on
                 * right from ex, or from the left of ex. This is because
                 * ext4_find_extent() can return either extent on the
                 * left, or on the right from the searched position. This
                 * will make merging more effective.
                 */
                if (ex < EXT_LAST_EXTENT(eh) &&
                    (le32_to_cpu(ex->ee_block) +
                    ext4_ext_get_actual_len(ex) <
                    le32_to_cpu(newext->ee_block))) {
                        ex += 1;
                        goto prepend;
                } else if ((ex > EXT_FIRST_EXTENT(eh)) &&
                           (le32_to_cpu(newext->ee_block) +
                           ext4_ext_get_actual_len(newext) <
                           le32_to_cpu(ex->ee_block)))
                        ex -= 1;

                /* Try to append newex to the ex */
                if (ext4_can_extents_be_merged(inode, ex, newext)) {
                        ext_debug(inode, "append [%d]%d block to %u:[%d]%d"
                                  "(from %llu)\n",
                                  ext4_ext_is_unwritten(newext),
                                  ext4_ext_get_actual_len(newext),
                                  le32_to_cpu(ex->ee_block),
                                  ext4_ext_is_unwritten(ex),
                                  ext4_ext_get_actual_len(ex),
                                  ext4_ext_pblock(ex));
                        err = ext4_ext_get_access(handle, inode,
                                                  path + depth);
                        if (err)
                                return err;
                        /* rewriting ee_len drops the unwritten mark */
                        unwritten = ext4_ext_is_unwritten(ex);
                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                        + ext4_ext_get_actual_len(newext));
                        if (unwritten)
                                ext4_ext_mark_unwritten(ex);
                        nearex = ex;
                        goto merge;
                }

prepend:
                /* Try to prepend newex to the ex */
                if (ext4_can_extents_be_merged(inode, newext, ex)) {
                        ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d"
                                  "(from %llu)\n",
                                  le32_to_cpu(newext->ee_block),
                                  ext4_ext_is_unwritten(newext),
                                  ext4_ext_get_actual_len(newext),
                                  le32_to_cpu(ex->ee_block),
                                  ext4_ext_is_unwritten(ex),
                                  ext4_ext_get_actual_len(ex),
                                  ext4_ext_pblock(ex));
                        err = ext4_ext_get_access(handle, inode,
                                                  path + depth);
                        if (err)
                                return err;

                        unwritten = ext4_ext_is_unwritten(ex);
                        ex->ee_block = newext->ee_block;
                        ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                        + ext4_ext_get_actual_len(newext));
                        if (unwritten)
                                ext4_ext_mark_unwritten(ex);
                        nearex = ex;
                        goto merge;
                }
        }

        depth = ext_depth(inode);
        eh = path[depth].p_hdr;
        if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
                goto has_space;

        /* probably next leaf has space for us? */
        fex = EXT_LAST_EXTENT(eh);
        next = EXT_MAX_BLOCKS;
        if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
                next = ext4_ext_next_leaf_block(path);
        if (next != EXT_MAX_BLOCKS) {
                ext_debug(inode, "next leaf block - %u\n", next);
                BUG_ON(npath != NULL);
                npath = ext4_find_extent(inode, next, NULL, gb_flags);
                if (IS_ERR(npath))
                        return PTR_ERR(npath);
                BUG_ON(npath->p_depth != path->p_depth);
                eh = npath[depth].p_hdr;
                if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
                        ext_debug(inode, "next leaf isn't full(%d)\n",
                                  le16_to_cpu(eh->eh_entries));
                        path = npath;
                        goto has_space;
                }
                ext_debug(inode, "next leaf has no free space(%d,%d)\n",
                          le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
        }

        /*
         * There is no free space in the found leaf.
         * We're gonna add a new leaf in the tree.
         */
        if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
                mb_flags |= EXT4_MB_USE_RESERVED;
        err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
                                       ppath, newext);
        if (err)
                goto cleanup;
        /*
         * ext4_ext_create_new_leaf() works on *ppath and may have replaced
         * it; continuing with the old local pointer would dereference a
         * stale path.
         */
        path = *ppath;
        depth = ext_depth(inode);
        eh = path[depth].p_hdr;

has_space:
        nearex = path[depth].p_ext;

        err = ext4_ext_get_access(handle, inode, path + depth);
        if (err)
                goto cleanup;

        if (!nearex) {
                /* there is no extent in this leaf, create first one */
                ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n",
                                le32_to_cpu(newext->ee_block),
                                ext4_ext_pblock(newext),
                                ext4_ext_is_unwritten(newext),
                                ext4_ext_get_actual_len(newext));
                nearex = EXT_FIRST_EXTENT(eh);
        } else {
                if (le32_to_cpu(newext->ee_block)
                           > le32_to_cpu(nearex->ee_block)) {
                        /* Insert after */
                        ext_debug(inode, "insert %u:%llu:[%d]%d before: "
                                        "nearest %p\n",
                                        le32_to_cpu(newext->ee_block),
                                        ext4_ext_pblock(newext),
                                        ext4_ext_is_unwritten(newext),
                                        ext4_ext_get_actual_len(newext),
                                        nearex);
                        nearex++;
                } else {
                        /* Insert before */
                        BUG_ON(newext->ee_block == nearex->ee_block);
                        ext_debug(inode, "insert %u:%llu:[%d]%d after: "
                                        "nearest %p\n",
                                        le32_to_cpu(newext->ee_block),
                                        ext4_ext_pblock(newext),
                                        ext4_ext_is_unwritten(newext),
                                        ext4_ext_get_actual_len(newext),
                                        nearex);
                }
                /* shift everything from nearex onwards one slot right */
                len = EXT_LAST_EXTENT(eh) - nearex + 1;
                if (len > 0) {
                        ext_debug(inode, "insert %u:%llu:[%d]%d: "
                                        "move %d extents from 0x%p to 0x%p\n",
                                        le32_to_cpu(newext->ee_block),
                                        ext4_ext_pblock(newext),
                                        ext4_ext_is_unwritten(newext),
                                        ext4_ext_get_actual_len(newext),
                                        len, nearex, nearex + 1);
                        memmove(nearex + 1, nearex,
                                len * sizeof(struct ext4_extent));
                }
        }

        le16_add_cpu(&eh->eh_entries, 1);
        path[depth].p_ext = nearex;
        nearex->ee_block = newext->ee_block;
        ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
        nearex->ee_len = newext->ee_len;

merge:
        /* try to merge extents */
        if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
                ext4_ext_try_to_merge(handle, inode, path, nearex);


        /* time to correct all indexes above */
        err = ext4_ext_correct_indexes(handle, inode, path);
        if (err)
                goto cleanup;

        err = ext4_ext_dirty(handle, inode, path + path->p_depth);

cleanup:
        /* npath is NULL unless the next leaf was looked up above */
        ext4_free_ext_path(npath);
        return err;
}

/*
 * Report the extent-status cache entries covering the logical range
 * [@block, @block + @num - 1] of @inode through @fieinfo.
 *
 * The walk ends when a lookup misses in the cache, when the lookup reports
 * no following entry (next == 0, which also sets FIEMAP_EXTENT_LAST on the
 * entry being reported), or when fiemap_fill_next_extent() returns 1.
 *
 * Returns 0 on success or the negative error from
 * fiemap_fill_next_extent().  The error is checked before the "last entry"
 * test so that a failure while reporting the final entry is not lost.
 */
static int ext4_fill_es_cache_info(struct inode *inode,
                                   ext4_lblk_t block, ext4_lblk_t num,
                                   struct fiemap_extent_info *fieinfo)
{
        ext4_lblk_t next, end = block + num - 1;
        struct extent_status es;
        unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
        unsigned int flags;
        int err;

        while (block <= end) {
                next = 0;
                flags = 0;
                if (!ext4_es_lookup_extent(inode, block, &next, &es))
                        break;
                if (ext4_es_is_unwritten(&es))
                        flags |= FIEMAP_EXTENT_UNWRITTEN;
                if (ext4_es_is_delayed(&es))
                        flags |= (FIEMAP_EXTENT_DELALLOC |
                                  FIEMAP_EXTENT_UNKNOWN);
                if (ext4_es_is_hole(&es))
                        flags |= EXT4_FIEMAP_EXTENT_HOLE;
                if (next == 0)
                        flags |= FIEMAP_EXTENT_LAST;
                /* delayed entries and holes are reported with pblk 0 */
                if (flags & (FIEMAP_EXTENT_DELALLOC|
                             EXT4_FIEMAP_EXTENT_HOLE))
                        es.es_pblk = 0;
                else
                        es.es_pblk = ext4_es_pblock(&es);
                err = fiemap_fill_next_extent(fieinfo,
                                (__u64)es.es_lblk << blksize_bits,
                                (__u64)es.es_pblk << blksize_bits,
                                (__u64)es.es_len << blksize_bits,
                                flags);
                if (err < 0)
                        return err;
                /* stop on request (err == 1) or after the last entry */
                if (err == 1 || next == 0)
                        break;
                block = next;
        }
        return 0;
}


/*
 * ext4_ext_find_hole - find hole around given block according to the given path
 * @inode:        inode we lookup in
 * @path:        path in extent tree to @lblk
 * @lblk:        pointer to logical block around which we want to determine hole
 *
 * Determine the length (and, where it is cheap, the start) of the hole
 * around the given logical block.  No real effort is made to locate the
 * beginning of the hole, but if @path points to the extent before @lblk
 * the start is known and is stored back through @lblk.
 *
 * Returns the length of the hole starting at (the possibly updated) @lblk.
 */
static ext4_lblk_t ext4_ext_find_hole(struct inode *inode,
                                      struct ext4_ext_path *path,
                                      ext4_lblk_t *lblk)
{
        struct ext4_extent *ex = path[ext_depth(inode)].p_ext;
        ext4_lblk_t ex_start, ex_end, next;

        if (!ex) {
                /* there is no extent yet, so gap is [0;-] */
                *lblk = 0;
                return EXT_MAX_BLOCKS;
        }

        /* hole lies in front of the extent: it runs up to the extent */
        ex_start = le32_to_cpu(ex->ee_block);
        if (*lblk < ex_start)
                return ex_start - *lblk;

        /* @lblk must not fall inside the extent itself */
        ex_end = ex_start + ext4_ext_get_actual_len(ex);
        if (*lblk < ex_end)
                BUG();

        /* hole starts right behind the extent, ends at next allocation */
        *lblk = ex_end;
        next = ext4_ext_next_allocated_block(path);
        BUG_ON(next == *lblk);
        return next - *lblk;
}

/*
 * ext4_ext_rm_idx:
 * drops one entry from the index block at level @depth - 1 of @path and
 * frees the block that entry pointed to.  If the dropped entry was the
 * first one in its block, the start block recorded in the levels above is
 * refreshed on the way up.
 * Returns 0 on success, -EFSCORRUPTED if the index block is already empty,
 * or the error of a failing journal access / dirty call.
 */
static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
                        struct ext4_ext_path *path, int depth)
{
        struct ext4_ext_path *level;
        ext4_fsblk_t victim;
        int lvl = depth - 1;
        int err;

        /* free index block */
        level = path + lvl;
        victim = ext4_idx_pblock(level->p_idx);
        if (unlikely(level->p_hdr->eh_entries == 0)) {
                EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
                return -EFSCORRUPTED;
        }
        err = ext4_ext_get_access(handle, inode, level);
        if (err)
                return err;

        /* pull the entries behind the removed one forward by one slot */
        if (level->p_idx != EXT_LAST_INDEX(level->p_hdr)) {
                int bytes = EXT_LAST_INDEX(level->p_hdr) - level->p_idx;

                bytes *= sizeof(struct ext4_extent_idx);
                memmove(level->p_idx, level->p_idx + 1, bytes);
        }

        le16_add_cpu(&level->p_hdr->eh_entries, -1);
        err = ext4_ext_dirty(handle, inode, level);
        if (err)
                return err;
        ext_debug(inode, "index is empty, remove it, free block %llu\n", victim);
        trace_ext4_ext_rm_idx(inode, victim);

        ext4_free_blocks(handle, inode, NULL, victim, 1,
                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);

        /*
         * While the current position is the first entry of its block, the
         * parent's ei_block has to follow it; stop at the first level
         * where that is not the case.
         */
        for (lvl--; lvl >= 0; lvl--) {
                if (level->p_idx != EXT_FIRST_INDEX(level->p_hdr))
                        break;
                level--;
                err = ext4_ext_get_access(handle, inode, level);
                if (err)
                        break;
                level->p_idx->ei_block = (level + 1)->p_idx->ei_block;
                err = ext4_ext_dirty(handle, inode, level);
                if (err)
                        break;
        }
        return err;
}

/*
 * ext4_ext_calc_credits_for_single_extent:
 * This routine returns max. credits that needed to insert an extent
 * to the extent tree.
 * When pass the actual path, the caller should calculate credits
 * under i_data_sem.
 */
int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
                                                struct ext4_ext_path *path)
{
        if (path) {
                int depth = ext_depth(inode);
                int ret = 0;

                /* probably there is space in leaf? */
                if (le16_to_cpu(path[depth].p_hdr->eh_entries)
                                < le16_to_cpu(path[depth].p_hdr->eh_max)) {

                        /*
                         *  There are some space in the leaf tree, no
                         *  need to account for leaf block credit
                         *
                         *  bitmaps and block group descriptor blocks
                         *  and other metadata blocks still need to be
                         *  accounted.
                         */
                        /* 1 bitmap, 1 block group descriptor */
                        ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
                        return ret;
                }
        }

        return ext4_chunk_trans_blocks(inode, nrblocks);
}

/*
 * How many index/leaf blocks need to change/allocate to add @extents extents?
 *
 * Adding a single extent can, in the worst case, touch the index/leaf at
 * every tree level when the tree splits.
 *
 * Adding several extents could split the whole tree more than once, but
 * that is really rare, so a somewhat larger per-level factor is used.
 */
int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
        /* If we are converting the inline data, only one is needed here. */
        if (ext4_has_inline_data(inode))
                return 1;

        return ext_depth(inode) * (extents <= 1 ? 2 : 3);
}

/*
 * Pick the EXT4_FREE_BLOCKS_* flags to use when freeing blocks of @inode:
 * METADATA | FORGET for directories, symlinks and EA inodes, FORGET alone
 * when the inode's data is journalled, and no flags otherwise.
 */
static inline int get_default_free_blocks_flags(struct inode *inode)
{
        int flags = 0;

        if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
            ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
                flags = EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
        else if (ext4_should_journal_data(inode))
                flags = EXT4_FREE_BLOCKS_FORGET;

        return flags;
}

/*
 * ext4_rereserve_cluster - increment the reserved cluster count when
 *                          freeing a cluster with a pending reservation
 *
 * @inode - file containing the cluster
 * @lblk - logical block in cluster to be reserved
 *
 * On a bigalloc file system, freeing a partial cluster that contains at
 * least one delayed and unwritten block leaves a pending reservation
 * behind.  In that case ext4_free_blocks() is called with the
 * RERESERVE_CLUSTER flag, which defers the reserved and allocated space
 * accounting to this function: it adjusts quota, bumps the reserved and
 * dirty cluster counts, returns the cluster to the free count and drops
 * the pending reservation.
 */
static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
{
        struct ext4_inode_info *info = EXT4_I(inode);
        struct ext4_sb_info *fs = EXT4_SB(inode->i_sb);

        /* quota: one cluster's worth of blocks */
        dquot_reclaim_block(inode, EXT4_C2B(fs, 1));

        /* reserved and dirty counts move together under the lock */
        spin_lock(&info->i_block_reservation_lock);
        info->i_reserved_data_blocks++;
        percpu_counter_add(&fs->s_dirtyclusters_counter, 1);
        spin_unlock(&info->i_block_reservation_lock);

        percpu_counter_add(&fs->s_freeclusters_counter, 1);
        ext4_remove_pending(inode, lblk);
}

/*
 * ext4_remove_blocks - free the blocks backing the tail [@from, @to] of @ex
 *
 * @handle:  journal handle passed through to ext4_free_blocks()
 * @inode:   inode owning the extent
 * @ex:      extent whose tail is being removed
 * @partial: partial-cluster tracking state carried across calls; its
 *           ->state is one of initial, tofree or nofree, and ->pclu /
 *           ->lblk identify the tracked cluster
 * @from:    first logical block to remove
 * @to:      last logical block to remove; must be the last block of @ex
 *
 * Only removal of an extent's tail is supported.  Any other request is
 * reported through ext4_error() and nothing is freed.
 *
 * Every return path visible here returns 0, including the rejected-request
 * path.
 */
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
                              struct ext4_extent *ex,
                              struct partial_cluster *partial,
                              ext4_lblk_t from, ext4_lblk_t to)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        unsigned short ee_len = ext4_ext_get_actual_len(ex);
        ext4_fsblk_t last_pblk, pblk;
        ext4_lblk_t num;
        int flags;

        /* only extent tail removal is allowed */
        if (from < le32_to_cpu(ex->ee_block) ||
            to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
                ext4_error(sbi->s_sb,
                           "strange request: removal(2) %u-%u from %u:%u",
                           from, to, le32_to_cpu(ex->ee_block), ee_len);
                return 0;
        }

#ifdef EXTENTS_STATS
        /* optional statistics: extent count, length range, maximum depth */
        spin_lock(&sbi->s_ext_stats_lock);
        sbi->s_ext_blocks += ee_len;
        sbi->s_ext_extents++;
        if (ee_len < sbi->s_ext_min)
                sbi->s_ext_min = ee_len;
        if (ee_len > sbi->s_ext_max)
                sbi->s_ext_max = ee_len;
        if (ext_depth(inode) > sbi->s_depth_max)
                sbi->s_depth_max = ext_depth(inode);
        spin_unlock(&sbi->s_ext_stats_lock);
#endif

        trace_ext4_remove_blocks(inode, ex, from, to, partial);

        /*
         * if we have a partial cluster, and it's different from the
         * cluster of the last block in the extent, we free it
         */
        last_pblk = ext4_ext_pblock(ex) + ee_len - 1;

        if (partial->state != initial &&
            partial->pclu != EXT4_B2C(sbi, last_pblk)) {
                if (partial->state == tofree) {
                        flags = get_default_free_blocks_flags(inode);
                        /*
                         * A pending reservation on the cluster means the
                         * space accounting is redone by
                         * ext4_rereserve_cluster() after the free.
                         */
                        if (ext4_is_pending(inode, partial->lblk))
                                flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
                        ext4_free_blocks(handle, inode, NULL,
                                         EXT4_C2B(sbi, partial->pclu),
                                         sbi->s_cluster_ratio, flags);
                        if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
                                ext4_rereserve_cluster(inode, partial->lblk);
                }
                /* whether freed or nofree, the old cluster is dealt with */
                partial->state = initial;
        }

        /* num: blocks being removed; pblk: physical block of @from */
        num = le32_to_cpu(ex->ee_block) + ee_len - from;
        pblk = ext4_ext_pblock(ex) + ee_len - num;

        /*
         * We free the partial cluster at the end of the extent (if any),
         * unless the cluster is used by another extent (partial_cluster
         * state is nofree).  If a partial cluster exists here, it must be
         * shared with the last block in the extent.
         */
        flags = get_default_free_blocks_flags(inode);

        /* partial, left end cluster aligned, right end unaligned */
        if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
            (EXT4_LBLK_CMASK(sbi, to) >= from) &&
            (partial->state != nofree)) {
                if (ext4_is_pending(inode, to))
                        flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
                ext4_free_blocks(handle, inode, NULL,
                                 EXT4_PBLK_CMASK(sbi, last_pblk),
                                 sbi->s_cluster_ratio, flags);
                if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
                        ext4_rereserve_cluster(inode, to);
                partial->state = initial;
                /* start from clean flags for the main free below */
                flags = get_default_free_blocks_flags(inode);
        }

        flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;

        /*
         * For bigalloc file systems, we never free a partial cluster
         * at the beginning of the extent.  Instead, we check to see if we
         * need to free it on a subsequent call to ext4_remove_blocks,
         * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
         */
        flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
        ext4_free_blocks(handle, inode, NULL, pblk, num, flags);

        /* reset the partial cluster if we've freed past it */
        if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
                partial->state = initial;

        /*
         * If we've freed the entire extent but the beginning is not left
         * cluster aligned and is not marked as ineligible for freeing we
         * record the partial cluster at the beginning of the extent.  It
         * wasn't freed by the preceding ext4_free_blocks() call, and we
         * need to look farther to the left to determine if it's to be freed
         * (not shared with another extent). Else, reset the partial
         * cluster - we're either  done freeing or the beginning of the
         * extent is left cluster aligned.
         */
        if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
                if (partial->state == initial) {
                        partial->pclu = EXT4_B2C(sbi, pblk);
                        partial->lblk = from;
                        partial->state = tofree;
                }
        } else {
                partial->state = initial;
        }

        return 0;
}

/*
 * ext4_ext_rm_leaf() Removes the extents associated with the
 * blocks appearing between "start" and "end".  Both "start"
 * and "end" must appear in the same extent or EIO is returned.
 *
 * @handle: The journal handle
 * @inode:  The file's inode
 * @path:   The path to the leaf
 * @partial_cluster: The cluster which we'll have to free if all extents
 *                   have been released from it.  However, if this value is
 *                   negative, it's a cluster just to the right of the
 *                   punched region and it must not be freed.
 * @start:  The first block to remove
 * @end:   The last block to remove
 */
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                 struct ext4_ext_path *path,
                 struct partial_cluster *partial,
                 ext4_lblk_t start, ext4_lblk_t end)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int err = 0, correct_index = 0;
        int depth = ext_depth(inode), credits, revoke_credits;
        struct ext4_extent_header *eh;
        ext4_lblk_t a, b;
        unsigned num;
        ext4_lblk_t ex_ee_block;
        unsigned short ex_ee_len;
        unsigned unwritten = 0;
        struct ext4_extent *ex;
        ext4_fsblk_t pblk;

        /* the header must be checked already in ext4_ext_remove_space() */
        ext_debug(inode, "truncate since %u in leaf to %u\n", start, end);
        if (!path[depth].p_hdr)
                path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
        eh = path[depth].p_hdr;
        if (unlikely(path[depth].p_hdr == NULL)) {
                EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
                return -EFSCORRUPTED;
        }
        /* find where to start removing */
        ex = path[depth].p_ext;
        if (!ex)
                ex = EXT_LAST_EXTENT(eh);

        ex_ee_block = le32_to_cpu(ex->ee_block);
        ex_ee_len = ext4_ext_get_actual_len(ex);

        trace_ext4_ext_rm_leaf(inode, start, ex, partial);

        while (ex >= EXT_FIRST_EXTENT(eh) &&
                        ex_ee_block + ex_ee_len > start) {

                if (ext4_ext_is_unwritten(ex))
                        unwritten = 1;
                else
                        unwritten = 0;

                ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block,
                          unwritten, ex_ee_len);
                path[depth].p_ext = ex;

                a = max(ex_ee_block, start);
                b = min(ex_ee_block + ex_ee_len - 1, end);

                ext_debug(inode, "  border %u:%u\n", a, b);

                /* If this extent is beyond the end of the hole, skip it */
                if (end < ex_ee_block) {
                        /*
                         * We're going to skip this extent and move to another,
                         * so note that its first cluster is in use to avoid
                         * freeing it when removing blocks.  Eventually, the
                         * right edge of the truncated/punched region will
                         * be just to the left.
                         */
                        if (sbi->s_cluster_ratio > 1) {
                                pblk = ext4_ext_pblock(ex);
                                partial->pclu = EXT4_B2C(sbi, pblk);
                                partial->state = nofree;
                        }
                        ex--;
                        ex_ee_block = le32_to_cpu(ex->ee_block);
                        ex_ee_len = ext4_ext_get_actual_len(ex);
                        continue;
                } else if (b != ex_ee_block + ex_ee_len - 1) {
                        EXT4_ERROR_INODE(inode,
                                         "can not handle truncate %u:%u "
                                         "on extent %u:%u",
                                         start, end, ex_ee_block,
                                         ex_ee_block + ex_ee_len - 1);
                        err = -EFSCORRUPTED;
                        goto out;
                } else if (a != ex_ee_block) {
                        /* remove tail of the extent */
                        num = a - ex_ee_block;
                } else {
                        /* remove whole extent: excellent! */
                        num = 0;
                }
                /*
                 * 3 for leaf, sb, and inode plus 2 (bmap and group
                 * descriptor) for each block group; assume two block
                 * groups plus ex_ee_len/blocks_per_block_group for
                 * the worst case
                 */
                credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
                if (ex == EXT_FIRST_EXTENT(eh)) {
                        correct_index = 1;
                        credits += (ext_depth(inode)) + 1;
                }
                credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
                /*
                 * We may end up freeing some index blocks and data from the
                 * punched range. Note that partial clusters are accounted for
                 * by ext4_free_data_revoke_credits().
                 */
                revoke_credits =
                        ext4_free_metadata_revoke_credits(inode->i_sb,
                                                          ext_depth(inode)) +
                        ext4_free_data_revoke_credits(inode, b - a + 1);

                err = ext4_datasem_ensure_credits(handle, inode, credits,
                                                  credits, revoke_credits);
                if (err) {
                        if (err > 0)
                                err = -EAGAIN;
                        goto out;
                }

                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
                        goto out;

                err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
                if (err)
                        goto out;

                if (num == 0)
                        /* this extent is removed; mark slot entirely unused */
                        ext4_ext_store_pblock(ex, 0);

                ex->ee_len = cpu_to_le16(num);
                /*
                 * Do not mark unwritten if all the blocks in the
                 * extent have been removed.
                 */
                if (unwritten && num)
                        ext4_ext_mark_unwritten(ex);
                /*
                 * If the extent was completely released,
                 * we need to remove it from the leaf
                 */
                if (num == 0) {
                        if (end != EXT_MAX_BLOCKS - 1) {
                                /*
                                 * For hole punching, we need to scoot all the
                                 * extents up when an extent is removed so that
                                 * we dont have blank extents in the middle
                                 */
                                memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
                                        sizeof(struct ext4_extent));

                                /* Now get rid of the one at the end */
                                memset(EXT_LAST_EXTENT(eh), 0,
                                        sizeof(struct ext4_extent));
                        }
                        le16_add_cpu(&eh->eh_entries, -1);
                }

                err = ext4_ext_dirty(handle, inode, path + depth);
                if (err)
                        goto out;

                ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num,
                                ext4_ext_pblock(ex));
                ex--;
                ex_ee_block = le32_to_cpu(ex->ee_block);
                ex_ee_len = ext4_ext_get_actual_len(ex);
        }

        if (correct_index && eh->eh_entries)
                err = ext4_ext_correct_indexes(handle, inode, path);

        /*
         * If there's a partial cluster and at least one extent remains in
         * the leaf, free the partial cluster if it isn't shared with the
         * current extent.  If it is shared with the current extent
         * we reset the partial cluster because we've reached the start of the
         * truncated/punched region and we're done removing blocks.
         */
        if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
                pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
                if (partial->pclu != EXT4_B2C(sbi, pblk)) {
                        int flags = get_default_free_blocks_flags(inode);

                        if (ext4_is_pending(inode, partial->lblk))
                                flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
                        ext4_free_blocks(handle, inode, NULL,
                                         EXT4_C2B(sbi, partial->pclu),
                                         sbi->s_cluster_ratio, flags);
                        if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
                                ext4_rereserve_cluster(inode, partial->lblk);
                }
                partial->state = initial;
        }

        /* if this leaf is free, then we should
         * remove it from index block above */
        if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
                err = ext4_ext_rm_idx(handle, inode, path, depth);

out:
        return err;
}

/*
 * ext4_ext_more_to_rm:
 * returns 1 if current index has to be freed (even partial)
 */
static int
ext4_ext_more_to_rm(struct ext4_ext_path *path)
{
        BUG_ON(path->p_idx == NULL);

        if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
                return 0;

        /*
         * if truncate on deeper level happened, it wasn't partial,
         * so we have to consider current index for truncation
         */
        if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
                return 0;
        return 1;
}

/*
 * ext4_ext_remove_space() removes the logical block range ["start", "end"]
 * from the extent tree of "inode".
 *
 * A journal handle is started here and stopped before returning.  When
 * "end" is below EXT_MAX_BLOCKS - 1 (a hole is punched inside the tree) the
 * extent containing "end" is first split at "end" + 1, so that
 * ext4_ext_rm_leaf() only ever has to remove whole extents or extent tails.
 * The tree is then walked from its right side: leaves are handled by
 * ext4_ext_rm_leaf() and index blocks that became empty are removed with
 * ext4_ext_rm_idx().  Whenever a step yields -EAGAIN the path is dropped
 * and the whole procedure restarts from the "again" label.
 *
 * Returns 0 on success or a negative error code.
 */
int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
                          ext4_lblk_t end)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int depth = ext_depth(inode);
        struct ext4_ext_path *path = NULL;
        struct partial_cluster partial;
        handle_t *handle;
        int i = 0, err = 0;

        /* no partial cluster is being tracked yet */
        partial.pclu = 0;
        partial.lblk = 0;
        partial.state = initial;

        ext_debug(inode, "truncate since %u to %u\n", start, end);

        /* probably first extent we're gonna free will be last in block */
        handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
                        depth + 1,
                        ext4_free_metadata_revoke_credits(inode->i_sb, depth));
        if (IS_ERR(handle))
                return PTR_ERR(handle);

again:
        trace_ext4_ext_remove_space(inode, start, end, depth);

        /*
         * Check if we are removing extents inside the extent tree. If that
         * is the case, we are going to punch a hole inside the extent tree
         * so we have to check whether we need to split the extent covering
         * the last block to remove so we can easily remove the part of it
         * in ext4_ext_rm_leaf().
         */
        if (end < EXT_MAX_BLOCKS - 1) {
                struct ext4_extent *ex;
                ext4_lblk_t ee_block, ex_end, lblk;
                ext4_fsblk_t pblk;

                /* find extent for or closest extent to this block */
                path = ext4_find_extent(inode, end, NULL,
                                        EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
                if (IS_ERR(path)) {
                        ext4_journal_stop(handle);
                        return PTR_ERR(path);
                }
                depth = ext_depth(inode);
                /* Leaf may not exist only if inode has no blocks at all */
                ex = path[depth].p_ext;
                if (!ex) {
                        /*
                         * With depth == 0 there is nothing to remove and we
                         * leave with err == 0.  NOTE(review): the message
                         * below names p_hdr although the check above is on
                         * p_ext - confirm which one is meant.
                         */
                        if (depth) {
                                EXT4_ERROR_INODE(inode,
                                                 "path[%d].p_hdr == NULL",
                                                 depth);
                                err = -EFSCORRUPTED;
                        }
                        goto out;
                }

                ee_block = le32_to_cpu(ex->ee_block);
                ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;

                /*
                 * See if the last block is inside the extent, if so split
                 * the extent at 'end' block so we can easily remove the
                 * tail of the first part of the split extent in
                 * ext4_ext_rm_leaf().
                 */
                if (end >= ee_block && end < ex_end) {

                        /*
                         * If we're going to split the extent, note that
                         * the cluster containing the block after 'end' is
                         * in use to avoid freeing it when removing blocks.
                         */
                        if (sbi->s_cluster_ratio > 1) {
                                pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
                                partial.pclu = EXT4_B2C(sbi, pblk);
                                partial.state = nofree;
                        }

                        /*
                         * Split the extent in two so that 'end' is the last
                         * block in the first new extent. Also we should not
                         * fail removing space due to ENOSPC so try to use
                         * reserved block if that happens.
                         */
                        err = ext4_force_split_extent_at(handle, inode, &path,
                                                         end + 1, 1);
                        if (err < 0)
                                goto out;

                } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
                           partial.state == initial) {
                        /*
                         * If we're punching, there's an extent to the right.
                         * If the partial cluster hasn't been set, set it to
                         * that extent's first cluster and its state to nofree
                         * so it won't be freed should it contain blocks to be
                         * removed. If it's already set (tofree/nofree), we're
                         * retrying and keep the original partial cluster info
                         * so a cluster marked tofree as a result of earlier
                         * extent removal is not lost.
                         */
                        lblk = ex_end + 1;
                        err = ext4_ext_search_right(inode, path, &lblk, &pblk,
                                                    NULL);
                        if (err < 0)
                                goto out;
                        if (pblk) {
                                partial.pclu = EXT4_B2C(sbi, pblk);
                                partial.state = nofree;
                        }
                }
        }
        /*
         * We start scanning from right side, freeing all the blocks
         * after i_size and walking into the tree depth-wise.
         */
        depth = ext_depth(inode);
        if (path) {
                /*
                 * A path was looked up above: start the walk at the leaf
                 * and preset p_block of the index levels 1 .. depth - 1 to
                 * eh_entries + 1, the same value an untouched index level
                 * is given in the loop below.
                 */
                int k = i = depth;
                while (--k > 0)
                        path[k].p_block =
                                le16_to_cpu(path[k].p_hdr->eh_entries)+1;
        } else {
                /* no path yet: build one that starts at the inode's root */
                path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
                               GFP_NOFS | __GFP_NOFAIL);
                if (path == NULL) {
                        ext4_journal_stop(handle);
                        return -ENOMEM;
                }
                path[0].p_maxdepth = path[0].p_depth = depth;
                path[0].p_hdr = ext_inode_hdr(inode);
                i = 0;

                if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
                        err = -EFSCORRUPTED;
                        goto out;
                }
        }
        err = 0;

        /* i is the tree level being processed: 0 is the root, depth the leaf */
        while (i >= 0 && err == 0) {
                if (i == depth) {
                        /* this is leaf block */
                        err = ext4_ext_rm_leaf(handle, inode, path,
                                               &partial, start, end);
                        /* root level has p_bh == NULL, brelse() eats this */
                        brelse(path[i].p_bh);
                        path[i].p_bh = NULL;
                        i--;
                        continue;
                }

                /* this is index block */
                if (!path[i].p_hdr) {
                        ext_debug(inode, "initialize header\n");
                        path[i].p_hdr = ext_block_hdr(path[i].p_bh);
                }

                if (!path[i].p_idx) {
                        /* this level hasn't been touched yet */
                        path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
                        ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n",
                                  path[i].p_hdr,
                                  le16_to_cpu(path[i].p_hdr->eh_entries));
                } else {
                        /* we were already here, see at next index */
                        path[i].p_idx--;
                }

                ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n",
                                i, EXT_FIRST_INDEX(path[i].p_hdr),
                                path[i].p_idx);
                if (ext4_ext_more_to_rm(path + i)) {
                        struct buffer_head *bh;
                        /* go to the next level */
                        ext_debug(inode, "move to level %d (block %llu)\n",
                                  i + 1, ext4_idx_pblock(path[i].p_idx));
                        /* start the lower level from a clean path entry */
                        memset(path + i + 1, 0, sizeof(*path));
                        bh = read_extent_tree_block(inode, path[i].p_idx,
                                                    depth - i - 1,
                                                    EXT4_EX_NOCACHE);
                        if (IS_ERR(bh)) {
                                /* should we reset i_size? */
                                err = PTR_ERR(bh);
                                break;
                        }
                        /* Yield here to deal with large extent trees.
                         * Should be a no-op if we did IO above. */
                        cond_resched();
                        if (WARN_ON(i + 1 > depth)) {
                                err = -EFSCORRUPTED;
                                break;
                        }
                        path[i + 1].p_bh = bh;

                        /* save actual number of indexes since this
                         * number is changed at the next iteration */
                        path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
                        i++;
                } else {
                        /* we finished processing this index, go up */
                        if (path[i].p_hdr->eh_entries == 0 && i > 0) {
                                /* index is empty, remove it;
                                 * handle must be already prepared by the
                                 * leaf removal (presumably
                                 * ext4_ext_rm_leaf() - confirm) */
                                err = ext4_ext_rm_idx(handle, inode, path, i);
                        }
                        /* root level has p_bh == NULL, brelse() eats this */
                        brelse(path[i].p_bh);
                        path[i].p_bh = NULL;
                        i--;
                        ext_debug(inode, "return to level %d\n", i);
                }
        }

        trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
                                         path->p_hdr->eh_entries);

        /*
         * if there's a partial cluster and we have removed the first extent
         * in the file, then we also free the partial cluster, if any
         */
        if (partial.state == tofree && err == 0) {
                int flags = get_default_free_blocks_flags(inode);

                if (ext4_is_pending(inode, partial.lblk))
                        flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
                ext4_free_blocks(handle, inode, NULL,
                                 EXT4_C2B(sbi, partial.pclu),
                                 sbi->s_cluster_ratio, flags);
                if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
                        ext4_rereserve_cluster(inode, partial.lblk);
                partial.state = initial;
        }

        /*
         * TODO: flexible tree reduction should be here
         *
         * NOTE(review): the block below is entered whatever the value of
         * err, and its first statement overwrites err - confirm that an
         * error from the loop above cannot be lost here.
         */
        if (path->p_hdr->eh_entries == 0) {
                /*
                 * truncate to zero freed all the tree,
                 * so we need to correct eh_depth
                 */
                err = ext4_ext_get_access(handle, inode, path);
                if (err == 0) {
                        ext_inode_hdr(inode)->eh_depth = 0;
                        ext_inode_hdr(inode)->eh_max =
                                cpu_to_le16(ext4_ext_space_root(inode, 0));
                        err = ext4_ext_dirty(handle, inode, path);
                }
        }
out:
        ext4_free_ext_path(path);
        path = NULL;
        /* start over with a fresh path; partial is deliberately kept */
        if (err == -EAGAIN)
                goto again;
        ext4_journal_stop(handle);

        return err;
}

/*
 * called at mount time
 *
 * Does nothing unless the filesystem has the extents feature.  With one of
 * the debugging options compiled in, a banner naming the enabled options
 * is printed; with EXTENTS_STATS the per-superblock statistics are reset.
 */
void ext4_ext_init(struct super_block *sb)
{
        /*
         * possible initialization would be here
         */

        if (!ext4_has_feature_extents(sb))
                return;

#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
        printk(KERN_INFO "EXT4-fs: file extents enabled"
#ifdef AGGRESSIVE_TEST
               ", aggressive tests"
#endif
#ifdef CHECK_BINSEARCH
               ", check binsearch"
#endif
#ifdef EXTENTS_STATS
               ", stats"
#endif
               "\n");
#endif
#ifdef EXTENTS_STATS
        spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
        EXT4_SB(sb)->s_ext_min = 1 << 30;
        EXT4_SB(sb)->s_ext_max = 0;
#endif
}

/*
 * called at umount time
 *
 * Only does something when built with EXTENTS_STATS: the extent
 * statistics collected in the superblock info are printed, provided
 * both the block and the extent counters are non-zero.
 */
void ext4_ext_release(struct super_block *sb)
{
        if (!ext4_has_feature_extents(sb))
                return;

#ifdef EXTENTS_STATS
        {
                struct ext4_sb_info *sbi = EXT4_SB(sb);

                /* the average below divides by s_ext_extents */
                if (!sbi->s_ext_blocks || !sbi->s_ext_extents)
                        return;

                printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
                        sbi->s_ext_blocks, sbi->s_ext_extents,
                        sbi->s_ext_blocks / sbi->s_ext_extents);
                printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
                        sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
        }
#endif
}

/*
 * Insert the range described by @ex into the extent status tree of @inode
 * with status EXTENT_STATUS_WRITTEN.  An extent of length zero is ignored.
 */
static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
{
        unsigned int len = ext4_ext_get_actual_len(ex);

        if (!len)
                return;

        ext4_es_insert_extent(inode, le32_to_cpu(ex->ee_block), len,
                              ext4_ext_pblock(ex), EXTENT_STATUS_WRITTEN);
}

/*
 * Hand the logical start, physical start and actual length of @ex to
 * ext4_issue_zeroout() and return its result.
 *
 * FIXME!! we need to try to merge to left or right after zero-out
 */
static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
{
        return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block),
                                  ext4_ext_pblock(ex),
                                  ext4_ext_get_actual_len(ex));
}

/*
 * ext4_split_extent_at() splits an extent at given block.
 *
 * @handle: the journal handle
 * @inode: the file inode
 * @ppath: pointer to the path to the extent; it is handed on to
 *         ext4_ext_insert_extent()
 * @split: the logical block where the extent is split.
 * @split_flag: indicates if the extent could be zeroed out if split fails,
 *              and the states (init or unwritten) of new extents.
 * @flags: flags used to insert new extent to extent tree.
 *
 *
 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
 * of which are determined by split_flag.
 *
 * There are two cases:
 *  a> the extent is split into two extents.
 *  b> split is not needed, and just mark the extent.
 *
 * If inserting the second extent fails with -ENOSPC, -EDQUOT or -ENOMEM and
 * EXT4_EXT_MAY_ZEROOUT is set, blocks are zeroed out instead and the
 * original extent is restored to its full length.
 *
 * return 0 on success.
 */
static int ext4_split_extent_at(handle_t *handle,
                             struct inode *inode,
                             struct ext4_ext_path **ppath,
                             ext4_lblk_t split,
                             int split_flag,
                             int flags)
{
        struct ext4_ext_path *path = *ppath;
        ext4_fsblk_t newblock;
        ext4_lblk_t ee_block;
        struct ext4_extent *ex, newex, orig_ex, zero_ex;
        struct ext4_extent *ex2 = NULL;
        unsigned int ee_len, depth;
        int err = 0;

        /* the two DATA_VALID flags are mutually exclusive */
        BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
               (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));

        ext_debug(inode, "logical block %llu\n", (unsigned long long)split);

        ext4_ext_show_leaf(inode, path);

        depth = ext_depth(inode);
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        /* physical block that corresponds to logical block @split */
        newblock = split - ee_block + ext4_ext_pblock(ex);

        /* @split has to lie inside the extent */
        BUG_ON(split < ee_block || split >= (ee_block + ee_len));
        /* zeroout and unwritten marking only make sense for unwritten extents */
        BUG_ON(!ext4_ext_is_unwritten(ex) &&
               split_flag & (EXT4_EXT_MAY_ZEROOUT |
                             EXT4_EXT_MARK_UNWRIT1 |
                             EXT4_EXT_MARK_UNWRIT2));

        err = ext4_ext_get_access(handle, inode, path + depth);
        if (err)
                goto out;

        if (split == ee_block) {
                /*
                 * case b: block @split is the block that the extent begins with
                 * then we just change the state of the extent, and splitting
                 * is not needed.
                 */
                if (split_flag & EXT4_EXT_MARK_UNWRIT2)
                        ext4_ext_mark_unwritten(ex);
                else
                        ext4_ext_mark_initialized(ex);

                if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
                        ext4_ext_try_to_merge(handle, inode, path, ex);

                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                goto out;
        }

        /* case a: shorten the original extent to [ee_block, split) */
        memcpy(&orig_ex, ex, sizeof(orig_ex));
        ex->ee_len = cpu_to_le16(split - ee_block);
        if (split_flag & EXT4_EXT_MARK_UNWRIT1)
                ext4_ext_mark_unwritten(ex);

        /*
         * path may lead to new leaf, not to original leaf any more
         * after ext4_ext_insert_extent() returns,
         */
        err = ext4_ext_dirty(handle, inode, path + depth);
        if (err)
                goto fix_extent_len;

        /* build the second half, [split, ee_block + ee_len), in newex */
        ex2 = &newex;
        ex2->ee_block = cpu_to_le32(split);
        ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
        ext4_ext_store_pblock(ex2, newblock);
        if (split_flag & EXT4_EXT_MARK_UNWRIT2)
                ext4_ext_mark_unwritten(ex2);

        /*
         * Success and every error other than ENOSPC/EDQUOT/ENOMEM leave
         * through "out"; only those three errors try the zeroout fallback.
         * NOTE(review): the local "path" and "ex" were taken before this
         * call, which receives ppath - confirm they are still valid in the
         * code below if the callee replaces *ppath.
         */
        err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
        if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM)
                goto out;

        if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
                /*
                 * Pick what to zero out: the second half if DATA_VALID1 is
                 * set, the first half if DATA_VALID2 is set, the whole
                 * original extent if neither is.  zero_ex records the
                 * zeroed range for the extent status tree update below.
                 */
                if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
                        if (split_flag & EXT4_EXT_DATA_VALID1) {
                                err = ext4_ext_zeroout(inode, ex2);
                                zero_ex.ee_block = ex2->ee_block;
                                zero_ex.ee_len = cpu_to_le16(
                                                ext4_ext_get_actual_len(ex2));
                                ext4_ext_store_pblock(&zero_ex,
                                                      ext4_ext_pblock(ex2));
                        } else {
                                err = ext4_ext_zeroout(inode, ex);
                                zero_ex.ee_block = ex->ee_block;
                                zero_ex.ee_len = cpu_to_le16(
                                                ext4_ext_get_actual_len(ex));
                                ext4_ext_store_pblock(&zero_ex,
                                                      ext4_ext_pblock(ex));
                        }
                } else {
                        err = ext4_ext_zeroout(inode, &orig_ex);
                        zero_ex.ee_block = orig_ex.ee_block;
                        zero_ex.ee_len = cpu_to_le16(
                                                ext4_ext_get_actual_len(&orig_ex));
                        ext4_ext_store_pblock(&zero_ex,
                                              ext4_ext_pblock(&orig_ex));
                }

                if (!err) {
                        /* update the extent length and mark as initialized */
                        ex->ee_len = cpu_to_le16(ee_len);
                        ext4_ext_try_to_merge(handle, inode, path, ex);
                        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
                        if (!err)
                                /* update extent status tree */
                                ext4_zeroout_es(inode, &zero_ex);
                        /* If we failed at this point, we don't know in which
                         * state the extent tree exactly is so don't try to fix
                         * length of the original extent as it may do even more
                         * damage.
                         */
                        goto out;
                }
        }

        /*
         * Reached when dirtying the shortened extent failed, or when the
         * insert failed and zeroout was not allowed or failed itself: put
         * the saved length (including its unwritten state) back.
         */
fix_extent_len:
        ex->ee_len = orig_ex.ee_len;
        /*
         * Ignore ext4_ext_dirty return value since we are already in error path
         * and err is a non-zero error code.
         */
        ext4_ext_dirty(handle, inode, path + path->p_depth);
        return err;
out:
        ext4_ext_show_leaf(inode, path);
        return err;
}

/*
 * ext4_split_extent() splits an extent and marks the extent which is covered
 * by @map as split_flag indicates
 *
 * It may result in splitting the extent into multiple extents (up to three)
 * There are three possibilities:
 *   a> There is no split required
 *   b> Splits in two extents: Split is happening at either end of the extent
 *   c> Splits in three extents: Someone is splitting in middle of the extent
 *
 */
static int ext4_split_extent(handle_t *handle,
                              struct inode *inode,
                              struct ext4_ext_path **ppath,
                              struct ext4_map_blocks *map,
                              int split_flag,
                              int flags)
{
        struct ext4_ext_path *path = *ppath;
        ext4_lblk_t ee_block;
        struct ext4_extent *ex;
        unsigned int ee_len, depth;
        int err = 0;
        int unwritten;
        int split_flag1, flags1;
        int allocated = map->m_len;

        depth = ext_depth(inode);
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        unwritten = ext4_ext_is_unwritten(ex);

        if (map->m_lblk + map->m_len < ee_block + ee_len) {
                split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
                flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
                if (unwritten)
                        split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
                                       EXT4_EXT_MARK_UNWRIT2;
                if (split_flag & EXT4_EXT_DATA_VALID2)
                        split_flag1 |= EXT4_EXT_DATA_VALID1;
                err = ext4_split_extent_at(handle, inode, ppath,
                                map->m_lblk + map->m_len, split_flag1, flags1);
                if (err)
                        goto out;
        } else {
                allocated = ee_len - (map->m_lblk - ee_block);
        }
        /*
         * Update path is required because previous ext4_split_extent_at() may
         * result in split of original leaf or extent zeroout.
         */
        path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
        if (IS_ERR(path))
                return PTR_ERR(path);
        depth = ext_depth(inode);
        ex = path[depth].p_ext;
        if (!ex) {
                EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
                                 (unsigned long) map->m_lblk);
                return -EFSCORRUPTED;
        }
        unwritten = ext4_ext_is_unwritten(ex);

        if (map->m_lblk >= ee_block) {
                split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
                if (unwritten) {
                        split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
                        split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
                                                     EXT4_EXT_MARK_UNWRIT2);
                }
                err = ext4_split_extent_at(handle, inode, ppath,
                                map->m_lblk, split_flag1, flags);
                if (err)
                        goto out;
        }

        ext4_ext_show_leaf(inode, path);
out:
        return err ? err : allocated;
}

/*
 * This function is called by ext4_ext_map_blocks() if someone tries to write
 * to an unwritten extent. It may result in splitting the unwritten
 * extent into multiple extents (up to three - one initialized and two
 * unwritten).
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be initialized
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in middle of the extent
 *
 * @handle, @inode: passed to the journal-access, dirty and split helpers.
 * @map: the logical range being written.
 * @ppath: in/out pointer to the extent path; handed to ext4_split_extent().
 * @flags: EXT4_GET_BLOCKS_* flags, passed through to ext4_split_extent().
 *
 * Pre-conditions:
 *  - The extent pointed to by 'path' is unwritten.
 *  - The extent pointed to by 'path' contains a superset
 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
 *
 * Post-conditions on success:
 *  - the returned value is the number of blocks beyond map->m_lblk
 *    that are allocated and initialized.
 *    It is guaranteed to be >= map->m_len.
 *
 * On failure a negative error code is returned.
 */
static int ext4_ext_convert_to_initialized(handle_t *handle,
                                           struct inode *inode,
                                           struct ext4_map_blocks *map,
                                           struct ext4_ext_path **ppath,
                                           int flags)
{
        struct ext4_ext_path *path = *ppath;
        struct ext4_sb_info *sbi;
        struct ext4_extent_header *eh;
        struct ext4_map_blocks split_map;
        struct ext4_extent zero_ex1, zero_ex2;
        struct ext4_extent *ex, *abut_ex;
        ext4_lblk_t ee_block, eof_block;
        unsigned int ee_len, depth, map_len = map->m_len;
        int err = 0;
        int split_flag = EXT4_EXT_DATA_VALID2;
        int allocated = 0;
        unsigned int max_zeroout = 0;

        ext_debug(inode, "logical block %llu, max_blocks %u\n",
                  (unsigned long long)map->m_lblk, map_len);

        sbi = EXT4_SB(inode->i_sb);
        /*
         * eof_block: i_disksize rounded up to whole blocks, but never less
         * than the end of the range being written.
         */
        eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
                        >> inode->i_sb->s_blocksize_bits;
        if (eof_block < map->m_lblk + map_len)
                eof_block = map->m_lblk + map_len;

        depth = ext_depth(inode);
        eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        /* A zero ee_len marks "nothing was zeroed out" for the exit path */
        zero_ex1.ee_len = 0;
        zero_ex2.ee_len = 0;

        trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);

        /* Pre-conditions */
        BUG_ON(!ext4_ext_is_unwritten(ex));
        BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));

        /*
         * Attempt to transfer newly initialized blocks from the currently
         * unwritten extent to its neighbor. This is much cheaper
         * than an insertion followed by a merge as those involve costly
         * memmove() calls. Transferring to the left is the common case in
         * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
         * followed by append writes.
         *
         * Limitations of the current logic:
         *  - L1: we do not deal with writes covering the whole extent.
         *    This would require removing the extent if the transfer
         *    is possible.
         *  - L2: we only attempt to merge with an extent stored in the
         *    same extent tree node.
         */
        if ((map->m_lblk == ee_block) &&
                /* See if we can merge left */
                (map_len < ee_len) &&                /*L1*/
                (ex > EXT_FIRST_EXTENT(eh))) {        /*L2*/
                ext4_lblk_t prev_lblk;
                ext4_fsblk_t prev_pblk, ee_pblk;
                unsigned int prev_len;

                abut_ex = ex - 1;
                prev_lblk = le32_to_cpu(abut_ex->ee_block);
                prev_len = ext4_ext_get_actual_len(abut_ex);
                prev_pblk = ext4_ext_pblock(abut_ex);
                ee_pblk = ext4_ext_pblock(ex);

                /*
                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
                 * upon those conditions:
                 * - C1: abut_ex is initialized,
                 * - C2: abut_ex is logically abutting ex,
                 * - C3: abut_ex is physically abutting ex,
                 * - C4: abut_ex can receive the additional blocks without
                 *   overflowing the (initialized) length limit.
                 */
                if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
                        ((prev_lblk + prev_len) == ee_block) &&                /*C2*/
                        ((prev_pblk + prev_len) == ee_pblk) &&                /*C3*/
                        (prev_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
                        err = ext4_ext_get_access(handle, inode, path + depth);
                        if (err)
                                goto out;

                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
                                map, ex, abut_ex);

                        /* Shift the start of ex by 'map_len' blocks */
                        ex->ee_block = cpu_to_le32(ee_block + map_len);
                        ext4_ext_store_pblock(ex, ee_pblk + map_len);
                        ex->ee_len = cpu_to_le16(ee_len - map_len);
                        ext4_ext_mark_unwritten(ex); /* Restore the flag */

                        /* Extend abut_ex by 'map_len' blocks */
                        abut_ex->ee_len = cpu_to_le16(prev_len + map_len);

                        /* Result: number of initialized blocks past m_lblk */
                        allocated = map_len;
                }
        } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
                   (map_len < ee_len) &&        /*L1*/
                   ex < EXT_LAST_EXTENT(eh)) {        /*L2*/
                /* See if we can merge right */
                ext4_lblk_t next_lblk;
                ext4_fsblk_t next_pblk, ee_pblk;
                unsigned int next_len;

                abut_ex = ex + 1;
                next_lblk = le32_to_cpu(abut_ex->ee_block);
                next_len = ext4_ext_get_actual_len(abut_ex);
                next_pblk = ext4_ext_pblock(abut_ex);
                ee_pblk = ext4_ext_pblock(ex);

                /*
                 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
                 * upon those conditions:
                 * - C1: abut_ex is initialized,
                 * - C2: abut_ex is logically abutting ex,
                 * - C3: abut_ex is physically abutting ex,
                 * - C4: abut_ex can receive the additional blocks without
                 *   overflowing the (initialized) length limit.
                 */
                if ((!ext4_ext_is_unwritten(abut_ex)) &&                /*C1*/
                    ((map->m_lblk + map_len) == next_lblk) &&                /*C2*/
                    ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
                    (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
                        err = ext4_ext_get_access(handle, inode, path + depth);
                        if (err)
                                goto out;

                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
                                map, ex, abut_ex);

                        /* Shift the start of abut_ex by 'map_len' blocks */
                        abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
                        ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
                        ex->ee_len = cpu_to_le16(ee_len - map_len);
                        ext4_ext_mark_unwritten(ex); /* Restore the flag */

                        /* Extend abut_ex by 'map_len' blocks */
                        abut_ex->ee_len = cpu_to_le16(next_len + map_len);

                        /* Result: number of initialized blocks past m_lblk */
                        allocated = map_len;
                }
        }
        /* Non-zero 'allocated' here means one of the fast paths was taken */
        if (allocated) {
                /* Mark the block containing both extents as dirty */
                err = ext4_ext_dirty(handle, inode, path + depth);

                /* Update path to point to the right extent */
                path[depth].p_ext = abut_ex;
                goto out;
        } else
                /* Slow path: blocks from m_lblk to the end of the extent */
                allocated = ee_len - (map->m_lblk - ee_block);

        WARN_ON(map->m_lblk < ee_block);
        /*
         * It is safe to convert extent to initialized via explicit
         * zeroout only if extent is fully inside i_size or new_size.
         */
        split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;

        /* Convert the zeroout limit from KiB to filesystem blocks */
        if (EXT4_EXT_MAY_ZEROOUT & split_flag)
                max_zeroout = sbi->s_extent_max_zeroout_kb >>
                        (inode->i_sb->s_blocksize_bits - 10);

        /*
         * five cases:
         * 1. split the extent into three extents.
         * 2. split the extent into two extents, zeroout the head of the first
         *    extent.
         * 3. split the extent into two extents, zeroout the tail of the second
         *    extent.
         * 4. split the extent into two extents without zeroout.
         * 5. no splitting needed, just possibly zeroout the head and / or the
         *    tail of the extent.
         */
        split_map.m_lblk = map->m_lblk;
        split_map.m_len = map->m_len;

        if (max_zeroout && (allocated > split_map.m_len)) {
                if (allocated <= max_zeroout) {
                        /* case 3 or 5 */
                        /* zero_ex1 describes the tail after the written range */
                        zero_ex1.ee_block =
                                 cpu_to_le32(split_map.m_lblk +
                                             split_map.m_len);
                        zero_ex1.ee_len =
                                cpu_to_le16(allocated - split_map.m_len);
                        ext4_ext_store_pblock(&zero_ex1,
                                ext4_ext_pblock(ex) + split_map.m_lblk +
                                split_map.m_len - ee_block);
                        err = ext4_ext_zeroout(inode, &zero_ex1);
                        if (err)
                                goto fallback;
                        /* Tail is zeroed: grow the split range to cover it */
                        split_map.m_len = allocated;
                }
                if (split_map.m_lblk - ee_block + split_map.m_len <
                                                                max_zeroout) {
                        /* case 2 or 5 */
                        /* zero_ex2 describes the head before the written range */
                        if (split_map.m_lblk != ee_block) {
                                zero_ex2.ee_block = ex->ee_block;
                                zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
                                                        ee_block);
                                ext4_ext_store_pblock(&zero_ex2,
                                                      ext4_ext_pblock(ex));
                                err = ext4_ext_zeroout(inode, &zero_ex2);
                                if (err)
                                        goto fallback;
                        }

                        /* Head is zeroed: start the split range at ee_block */
                        split_map.m_len += split_map.m_lblk - ee_block;
                        split_map.m_lblk = ee_block;
                        allocated = map->m_len;
                }
        }

fallback:
        /*
         * Reached both on the normal path and after a failed zeroout; any
         * earlier value of err is overwritten by the split result.
         */
        err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
                                flags);
        /* A positive return is a block count, not an error */
        if (err > 0)
                err = 0;
out:
        /* If we have gotten a failure, don't zero out status tree */
        if (!err) {
                ext4_zeroout_es(inode, &zero_ex1);
                ext4_zeroout_es(inode, &zero_ex2);
        }
        return err ? err : allocated;
}

/*
 * ext4_split_convert_extents() - split the extent at @ppath around @map
 * ahead of a written <-> unwritten conversion.
 *
 * Per the original notes for this helper, it is called by
 * ext4_ext_map_blocks() from ext4_get_blocks_dio_write() when DIO writes
 * to an unwritten extent.
 *
 * Writing to an unwritten extent may result in splitting the unwritten
 * extent into multiple initialized/unwritten extents (up to three).
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be unwritten
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in middle of the extent
 *
 * This works the same way in the case of initialized -> unwritten conversion.
 *
 * One or more index blocks may be needed if the extent tree grows after
 * the unwritten extent is split. To prevent ENOSPC from occurring at IO
 * completion, the unwritten extent is split before DIO submits the IO.
 * The unwritten extent is split into at most three unwritten extents at
 * this time. After the IO completes, the part that was filled is converted
 * to initialized by the end_io callback via
 * ext4_convert_unwritten_extents().
 *
 * The split itself is delegated to ext4_split_extent(), always with
 * EXT4_GET_BLOCKS_PRE_IO added to @flags; its return value is returned
 * unchanged.
 *
 * Returns the size of unwritten extent to be written on success.
 */
static int ext4_split_convert_extents(handle_t *handle,
                                        struct inode *inode,
                                        struct ext4_map_blocks *map,
                                        struct ext4_ext_path **ppath,
                                        int flags)
{
        struct ext4_ext_path *path = *ppath;
        struct ext4_extent *ex;
        ext4_lblk_t ee_block, ex_end;
        ext4_lblk_t map_end, eof_block;
        unsigned int ee_len;
        int split_flag = 0;

        ext_debug(inode, "logical block %llu, max_blocks %u\n",
                  (unsigned long long)map->m_lblk, map->m_len);

        /* Logical end of the extent currently at the bottom of the path */
        ex = path[ext_depth(inode)].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
        ex_end = ee_block + ee_len;

        /* EOF in blocks: i_disksize rounded up, or the end of @map if later */
        map_end = map->m_lblk + map->m_len;
        eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
                        >> inode->i_sb->s_blocksize_bits;
        if (eof_block < map_end)
                eof_block = map_end;

        if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
                /* Convert to unwritten */
                split_flag = EXT4_EXT_DATA_VALID1;
        } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
                /* Convert to initialized */
                split_flag = EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2;
                /*
                 * It is safe to convert extent to initialized via explicit
                 * zeroout only if extent is fully inside i_size or new_size.
                 */
                if (ex_end <= eof_block)
                        split_flag |= EXT4_EXT_MAY_ZEROOUT;
        }

        return ext4_split_extent(handle, inode, ppath, map, split_flag,
                                 flags | EXT4_GET_BLOCKS_PRE_IO);
}

/*
 * ext4_convert_unwritten_extents_endio() - mark the extent found at @ppath
 * as initialized.
 *
 * If the extent does not start at map->m_lblk or is longer than map->m_len,
 * it is first split with ext4_split_convert_extents() and looked up again.
 * The extent is then marked initialized, merged with its neighbours where
 * ext4_ext_try_to_merge() allows, and the leaf is dirtied.
 *
 * Returns 0 on success or a negative error code.
 */
static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                                                struct inode *inode,
                                                struct ext4_map_blocks *map,
                                                struct ext4_ext_path **ppath)
{
        struct ext4_ext_path *path = *ppath;
        int depth = ext_depth(inode);
        struct ext4_extent *ex = path[depth].p_ext;
        ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
        unsigned int ee_len = ext4_ext_get_actual_len(ex);
        int err;

        ext_debug(inode, "logical block %llu, max_blocks %u\n",
                  (unsigned long long)ee_block, ee_len);

        /*
         * An extent that is larger than the request is a clear sign that
         * some extent state machine issues are still left, so a split is
         * still required here.
         * TODO: once all related issues are fixed this situation should be
         * illegal.
         */
        if (!(ee_block == map->m_lblk && ee_len <= map->m_len)) {
#ifdef CONFIG_EXT4_DEBUG
                ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
                             " len %u; IO logical block %llu, len %u",
                             inode->i_ino, (unsigned long long)ee_block, ee_len,
                             (unsigned long long)map->m_lblk, map->m_len);
#endif
                err = ext4_split_convert_extents(handle, inode, map, ppath,
                                                 EXT4_GET_BLOCKS_CONVERT);
                if (err < 0)
                        return err;

                /* The split may have changed the tree; look the extent up again */
                path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
                if (IS_ERR(path))
                        return PTR_ERR(path);
                depth = ext_depth(inode);
                /*
                 * NOTE(review): unlike convert_initialized_extent(), p_ext is
                 * not checked for NULL after this re-lookup -- confirm that
                 * it cannot be NULL here.
                 */
                ex = path[depth].p_ext;
        }

        err = ext4_ext_get_access(handle, inode, path + depth);
        if (!err) {
                /* first mark the extent as initialized */
                ext4_ext_mark_initialized(ex);

                /*
                 * ext4_ext_correct_indexes() isn't needed here because the
                 * extent borders are not changed
                 */
                ext4_ext_try_to_merge(handle, inode, path, ex);

                /* Mark modified extent as dirty */
                err = ext4_ext_dirty(handle, inode, path + path->p_depth);
        }

        ext4_ext_show_leaf(inode, path);
        return err;
}

/*
 * convert_initialized_extent() - mark the extent found at @ppath as
 * unwritten.
 *
 * map->m_len is clamped first (see below).  If the extent does not start at
 * map->m_lblk or is longer than map->m_len, it is split with
 * ext4_split_convert_extents() and looked up again.  The extent is then
 * marked unwritten, merged where ext4_ext_try_to_merge() allows, and the
 * leaf is dirtied.
 *
 * On success EXT4_MAP_UNWRITTEN is set in map->m_flags, and *allocated and
 * map->m_len are both set to the smaller of the two.
 *
 * Returns 0 on success or a negative error code.
 */
static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map,
                           struct ext4_ext_path **ppath,
                           unsigned int *allocated)
{
        struct ext4_ext_path *path = *ppath;
        struct ext4_extent *ex;
        ext4_lblk_t ee_block;
        unsigned int ee_len;
        int depth;
        int err;

        /*
         * Keep the request within what an unwritten extent can describe;
         * an oversized request is cut down to half of the maximum.
         */
        if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
                map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;

        depth = ext_depth(inode);
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);

        ext_debug(inode, "logical block %llu, max_blocks %u\n",
                  (unsigned long long)ee_block, ee_len);

        /* Split unless the extent already matches the request exactly */
        if (!(ee_block == map->m_lblk && ee_len <= map->m_len)) {
                err = ext4_split_convert_extents(handle, inode, map, ppath,
                                EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
                if (err < 0)
                        return err;

                /* Look the extent up again after the split */
                path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
                if (IS_ERR(path))
                        return PTR_ERR(path);
                depth = ext_depth(inode);
                ex = path[depth].p_ext;
                if (!ex) {
                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
                                         (unsigned long) map->m_lblk);
                        return -EFSCORRUPTED;
                }
        }

        err = ext4_ext_get_access(handle, inode, path + depth);
        if (err)
                return err;

        /* first mark the extent as unwritten */
        ext4_ext_mark_unwritten(ex);

        /*
         * ext4_ext_correct_indexes() isn't needed here because the extent
         * borders are not changed
         */
        ext4_ext_try_to_merge(handle, inode, path, ex);

        /* Mark modified extent as dirty */
        err = ext4_ext_dirty(handle, inode, path + path->p_depth);
        if (err)
                return err;

        ext4_ext_show_leaf(inode, path);
        ext4_update_inode_fsync_trans(handle, inode, 1);

        map->m_flags |= EXT4_MAP_UNWRITTEN;
        /* Bring *allocated and map->m_len down to the smaller of the two */
        if (map->m_len < *allocated)
                *allocated = map->m_len;
        else
                map->m_len = *allocated;

        return 0;
}

/*
 * ext4_ext_handle_unwritten_extents() - handle a mapping request that landed
 * on an unwritten extent.
 *
 * Depending on @flags one of the following is done:
 *  - EXT4_GET_BLOCKS_PRE_IO: split the extent around @map via
 *    ext4_split_convert_extents(); the mapping is flagged UNWRITTEN, NEW
 *    and MAPPED.
 *  - EXT4_GET_BLOCKS_CONVERT: convert the extent to written via
 *    ext4_convert_unwritten_extents_endio(); the mapping is flagged MAPPED.
 *  - EXT4_GET_BLOCKS_UNWRIT_EXT: nothing to convert; the mapping is flagged
 *    UNWRITTEN and MAPPED.
 *  - EXT4_GET_BLOCKS_CREATE not set: lookup only; the mapping is flagged
 *    UNWRITTEN but not MAPPED.
 *  - otherwise: convert to initialized via
 *    ext4_ext_convert_to_initialized(); the mapping is flagged NEW and
 *    MAPPED.
 *
 * @ppath: in/out pointer to the extent path; the helpers called here take
 *         it by reference and may replace it.
 * @allocated: number of blocks already known to be mapped at @newblock.
 * @newblock: physical block stored into map->m_pblk on success.
 *
 * On success map->m_pblk is set to @newblock, map->m_len is clamped to the
 * number of blocks handled, and that number is returned.  On failure a
 * negative error code is returned.
 */
static int
ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map,
                        struct ext4_ext_path **ppath, int flags,
                        unsigned int allocated, ext4_fsblk_t newblock)
{
        int ret = 0;
        int err = 0;

        ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
                  (unsigned long long)map->m_lblk, map->m_len, flags,
                  allocated);
        ext4_ext_show_leaf(inode, *ppath);

        /*
         * When writing into unwritten space, we should not fail to
         * allocate metadata blocks for the new extent block if needed.
         */
        flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;

        trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
                                                    allocated, newblock);

        /* get_block() before submitting IO, split the extent */
        if (flags & EXT4_GET_BLOCKS_PRE_IO) {
                ret = ext4_split_convert_extents(handle, inode, map, ppath,
                                         flags | EXT4_GET_BLOCKS_CONVERT);
                if (ret < 0) {
                        err = ret;
                        goto out2;
                }
                /*
                 * shouldn't get a 0 return when splitting an extent unless
                 * m_len is 0 (bug) or extent has been corrupted
                 */
                if (unlikely(ret == 0)) {
                        EXT4_ERROR_INODE(inode,
                                         "unexpected ret == 0, m_len = %u",
                                         map->m_len);
                        err = -EFSCORRUPTED;
                        goto out2;
                }
                map->m_flags |= EXT4_MAP_UNWRITTEN;
                goto out;
        }
        /* IO end_io complete, convert the filled extent to written */
        if (flags & EXT4_GET_BLOCKS_CONVERT) {
                err = ext4_convert_unwritten_extents_endio(handle, inode, map,
                                                           ppath);
                if (err < 0)
                        goto out2;
                ext4_update_inode_fsync_trans(handle, inode, 1);
                goto map_out;
        }
        /* buffered IO cases */
        /*
         * repeat fallocate creation request
         * we already have an unwritten extent
         */
        if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
                map->m_flags |= EXT4_MAP_UNWRITTEN;
                goto map_out;
        }

        /* buffered READ or buffered write_begin() lookup */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                /*
                 * We have blocks reserved already.  We
                 * return allocated blocks so that delalloc
                 * won't do block reservation for us.  But
                 * the buffer head will be unmapped so that
                 * a read from the block returns 0s.
                 */
                map->m_flags |= EXT4_MAP_UNWRITTEN;
                goto out1;
        }

        /*
         * Default case when EXT4_GET_BLOCKS_CREATE is set in flags.
         * For buffered writes, at writepage time, etc.  Convert a
         * discovered unwritten extent to written.
         */
        ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
        if (ret < 0) {
                err = ret;
                goto out2;
        }
        ext4_update_inode_fsync_trans(handle, inode, 1);
        /*
         * shouldn't get a 0 return when converting an unwritten extent
         * unless m_len is 0 (bug) or extent has been corrupted
         */
        if (unlikely(ret == 0)) {
                EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
                                 map->m_len);
                err = -EFSCORRUPTED;
                goto out2;
        }

out:
        /* The split/convert helper reported how many blocks it handled */
        allocated = ret;
        map->m_flags |= EXT4_MAP_NEW;
map_out:
        map->m_flags |= EXT4_MAP_MAPPED;
out1:
        map->m_pblk = newblock;
        if (allocated > map->m_len)
                allocated = map->m_len;
        map->m_len = allocated;
        /*
         * Dump the leaf via *ppath rather than a pointer saved at entry:
         * the helpers above receive ppath and may have reallocated or
         * released the original path array, so a saved copy could be
         * stale (use-after-free).
         */
        ext4_ext_show_leaf(inode, *ppath);
out2:
        return err ? err : allocated;
}

/*
 * get_implied_cluster_alloc - check to see if the requested
 * allocation (in the map structure) overlaps with a cluster already
 * allocated in an extent.
 *        @sb        The filesystem superblock structure
 *        @map        The requested lblk->pblk mapping
 *        @ex        The extent structure which might contain an implied
 *                        cluster allocation
 *        @path        The extent path, used to look up the next allocated
 *                        block via ext4_ext_next_allocated_block()
 *
 * This function is called by ext4_ext_map_blocks() after we failed to
 * find blocks that were already in the inode's extent tree.  Hence,
 * we know that the beginning of the requested region cannot overlap
 * the extent from the inode's extent tree.  There are three cases we
 * want to catch.  The first is this case:
 *
 *                 |--- cluster # N--|
 *    |--- extent ---|        |---- requested region ---|
 *                        |==========|
 *
 * The second case that we need to test for is this one:
 *
 *   |--------- cluster # N ----------------|
 *           |--- requested region --|   |------- extent ----|
 *           |=======================|
 *
 * The third case is when the requested region lies between two extents
 * within the same cluster:
 *          |------------- cluster # N-------------|
 * |----- ex -----|                  |---- ex_right ----|
 *                  |------ requested region ------|
 *                  |================|
 *
 * In each of the above cases, we need to set map->m_pblk and
 * map->m_len so that they describe the range labelled "|====|" in
 * cluster #N, since that cluster is already in use for data in
 * cluster EXT4_B2C(sbi, map->m_lblk).        We will then return 1 to
 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
 * as a new "allocated" block region.  Otherwise, we will return 0 and
 * ext4_ext_map_blocks() will then allocate one or more new clusters
 * by calling ext4_mb_new_blocks().
 */
static int get_implied_cluster_alloc(struct super_block *sb,
                                     struct ext4_map_blocks *map,
                                     struct ext4_extent *ex,
                                     struct ext4_ext_path *path)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
        ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
        ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
        unsigned short ee_len = ext4_ext_get_actual_len(ex);
        /* Clusters holding the first and the last block of the extent */
        ext4_lblk_t first_cluster = EXT4_B2C(sbi, ee_block);
        ext4_lblk_t last_cluster = EXT4_B2C(sbi, ee_block + ee_len - 1);
        /* Cluster holding the start of the requested region */
        ext4_lblk_t req_cluster = EXT4_B2C(sbi, map->m_lblk);

        /* No shared cluster with either end of the extent: nothing implied */
        if (req_cluster != last_cluster && req_cluster != first_cluster) {
                trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
                return 0;
        }

        /* Sharing the extent's last cluster: base pblk on its last block */
        if (req_cluster == last_cluster)
                ee_start += ee_len - 1;

        /* Same offset within the physical cluster as within the logical one */
        map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
        /* Never map past the end of the cluster */
        map->m_len = min(map->m_len,
                         (unsigned) sbi->s_cluster_ratio - c_offset);

        if (map->m_lblk < ee_block) {
                /*
                 * Check for and handle this case:
                 *
                 *   |--------- cluster # N-------------|
                 *                       |------- extent ----|
                 *           |--- requested region ---|
                 *           |===========|
                 */
                map->m_len = min(map->m_len, ee_block - map->m_lblk);
        } else if (map->m_lblk > ee_block) {
                /*
                 * Check for the case where there is already another allocated
                 * block to the right of 'ex' but before the end of the cluster.
                 *
                 *          |------------- cluster # N-------------|
                 * |----- ex -----|                  |---- ex_right ----|
                 *                  |------ requested region ------|
                 *                  |================|
                 */
                ext4_lblk_t next = ext4_ext_next_allocated_block(path);

                map->m_len = min(map->m_len, next - map->m_lblk);
        }

        trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
        return 1;
}

/*
 * Work out how many blocks of hole follow @lblk.
 *
 * The hole around @lblk is first located from @path with
 * ext4_ext_find_hole().  The extent status tree is then searched for a
 * delayed extent inside that hole and the hole is trimmed accordingly.
 * Unless @lblk itself sits inside a delayed extent, the resulting hole is
 * inserted into the extent status tree as EXTENT_STATUS_HOLE.
 *
 * Returns the length of the determined range, counted from @lblk.
 */
static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode,
                                                  struct ext4_ext_path *path,
                                                  ext4_lblk_t lblk)
{
        struct extent_status es;
        ext4_lblk_t gap_start = lblk;
        ext4_lblk_t gap_len;

        gap_len = ext4_ext_find_hole(inode, path, &gap_start);

        for (;;) {
                ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
                                          gap_start, gap_start + gap_len - 1,
                                          &es);
                /* No delayed extent anywhere in the gap: cache it as is. */
                if (!es.es_len)
                        break;

                if (lblk >= es.es_lblk + es.es_len) {
                        /*
                         * The delayed extent ends in front of lblk.  Drop the
                         * part of the gap that precedes lblk and search again
                         * from the queried block.
                         */
                        gap_len -= lblk - gap_start;
                        gap_start = lblk;
                        continue;
                }

                if (in_range(lblk, es.es_lblk, es.es_len)) {
                        /*
                         * lblk is inside the delayed extent.  It must have
                         * been added after ext4_map_blocks() checked the
                         * extent status tree, so we are not holding i_rwsem
                         * and delalloc info is only stabilized by i_data_sem,
                         * which is about to be released.  Leave the extent
                         * status tree untouched and report a hole that ends
                         * where the delayed extent ends.
                         */
                        return es.es_lblk + es.es_len - lblk;
                }

                /*
                 * The delayed extent begins behind lblk: the hole stops where
                 * the delayed extent starts.
                 */
                gap_len = min(es.es_lblk - gap_start, gap_len);
                break;
        }

        /* Put just found gap into cache to speed up subsequent requests */
        ext_debug(inode, " -> %u:%u\n", gap_start, gap_len);
        ext4_es_insert_extent(inode, gap_start, gap_len, ~0, EXTENT_STATUS_HOLE);

        /* Report only the part of the hole that lies at or after lblk. */
        return gap_len - (lblk - gap_start);
}

/*
 * Block allocation/map/preallocation routine for extent-based files.
 *
 * Looks up the extent covering map->m_lblk.  If one is found, the request
 * is answered from it (converting it first when @flags asks for that).  If
 * none is found and @flags lacks EXT4_GET_BLOCKS_CREATE, the surrounding
 * hole is measured and reported.  Otherwise new blocks are allocated and a
 * new extent is inserted into the tree.
 *
 * @handle: journal handle, passed on to the helpers that modify the tree
 * @inode:  inode whose extent tree is consulted
 * @map:    on entry m_lblk/m_len describe the request; on return m_pblk,
 *          m_len and m_flags describe what was found or allocated
 * @flags:  EXT4_GET_BLOCKS_* flags
 *
 * Needs to be called with
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
 * (i.e. flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem).
 *
 * return > 0, number of blocks already mapped/allocated
 *          if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are
 *                  pre-allocated blocks, buffer head is unmapped
 *          otherwise blocks are mapped
 *
 * return = 0, if plain look up failed (blocks have not been allocated)
 *          buffer head is unmapped
 *
 * return < 0, error case.
 */
int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map, int flags)
{
        struct ext4_ext_path *path = NULL;
        struct ext4_extent newex, *ex, ex2;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_fsblk_t newblock = 0, pblk;
        int err = 0, depth, ret;
        /*
         * 'allocated' is the block count returned on success; 'offset' is
         * the in-cluster offset and stays 0 unless a fresh allocation is made.
         */
        unsigned int allocated = 0, offset = 0;
        /* Non-zero only after ext4_mb_new_blocks() succeeded below. */
        unsigned int allocated_clusters = 0;
        struct ext4_allocation_request ar;
        ext4_lblk_t cluster_offset;

        ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len);
        trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);

        /* find extent for this block */
        path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
        if (IS_ERR(path)) {
                err = PTR_ERR(path);
                /* Clear path so the common exit does not free an error pointer. */
                path = NULL;
                goto out;
        }

        depth = ext_depth(inode);

        /*
         * consistent leaf must not be empty;
         * this situation is possible, though, _during_ tree modification;
         * this is why assert can't be put in ext4_find_extent()
         */
        if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
                EXT4_ERROR_INODE(inode, "bad extent address "
                                 "lblock: %lu, depth: %d pblock %lld",
                                 (unsigned long) map->m_lblk, depth,
                                 path[depth].p_block);
                err = -EFSCORRUPTED;
                goto out;
        }

        ex = path[depth].p_ext;
        if (ex) {
                ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
                unsigned short ee_len;


                /*
                 * unwritten extents are treated as holes, except that
                 * we split out initialized portions during a write.
                 */
                ee_len = ext4_ext_get_actual_len(ex);

                trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);

                /* if found extent covers block, simply return it */
                if (in_range(map->m_lblk, ee_block, ee_len)) {
                        newblock = map->m_lblk - ee_block + ee_start;
                        /* number of remaining blocks in the extent */
                        allocated = ee_len - (map->m_lblk - ee_block);
                        ext_debug(inode, "%u fit into %u:%d -> %llu\n",
                                  map->m_lblk, ee_block, ee_len, newblock);

                        /*
                         * If the extent is initialized check whether the
                         * caller wants to convert it to unwritten.
                         */
                        if ((!ext4_ext_is_unwritten(ex)) &&
                            (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
                                err = convert_initialized_extent(handle,
                                        inode, map, &path, &allocated);
                                goto out;
                        } else if (!ext4_ext_is_unwritten(ex)) {
                                /*
                                 * Plain initialized extent: report the
                                 * mapping, clamped to the requested length.
                                 */
                                map->m_flags |= EXT4_MAP_MAPPED;
                                map->m_pblk = newblock;
                                if (allocated > map->m_len)
                                        allocated = map->m_len;
                                map->m_len = allocated;
                                ext4_ext_show_leaf(inode, path);
                                goto out;
                        }

                        /*
                         * The extent is unwritten: let the helper decide;
                         * a negative result is an error, anything else is
                         * the block count to return.
                         */
                        ret = ext4_ext_handle_unwritten_extents(
                                handle, inode, map, &path, flags,
                                allocated, newblock);
                        if (ret < 0)
                                err = ret;
                        else
                                allocated = ret;
                        goto out;
                }
        }

        /*
         * The requested block isn't allocated yet; we must not try to
         * create blocks unless flags contains EXT4_GET_BLOCKS_CREATE.
         * 'allocated' is still 0 here, so the lookup returns 0.
         */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                ext4_lblk_t len;

                len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk);

                map->m_pblk = 0;
                map->m_len = min_t(unsigned int, map->m_len, len);
                goto out;
        }

        /*
         * Okay, we need to do block allocation.
         */
        newex.ee_block = cpu_to_le32(map->m_lblk);
        cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);

        /*
         * If we are doing bigalloc, check to see if the extent returned
         * by ext4_find_extent() implies a cluster we can use.
         */
        if (cluster_offset && ex &&
            get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
                ar.len = allocated = map->m_len;
                newblock = map->m_pblk;
                goto got_allocated_blocks;
        }

        /* find neighbour allocated blocks */
        ar.lleft = map->m_lblk;
        err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
        if (err)
                goto out;
        ar.lright = map->m_lblk;
        err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
        if (err < 0)
                goto out;

        /*
         * Check if the extent after searching to the right implies a
         * cluster we can use.  A non-zero (positive) err is treated here
         * as "ex2 holds a valid extent".
         */
        if ((sbi->s_cluster_ratio > 1) && err &&
            get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
                ar.len = allocated = map->m_len;
                newblock = map->m_pblk;
                goto got_allocated_blocks;
        }

        /*
         * See if request is beyond maximum number of blocks we can have in
         * a single extent. For an initialized extent this limit is
         * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
         * EXT_UNWRITTEN_MAX_LEN.
         */
        if (map->m_len > EXT_INIT_MAX_LEN &&
            !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
                map->m_len = EXT_INIT_MAX_LEN;
        else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
                 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
                map->m_len = EXT_UNWRITTEN_MAX_LEN;

        /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
        newex.ee_len = cpu_to_le16(map->m_len);
        err = ext4_ext_check_overlap(sbi, inode, &newex, path);
        /*
         * On a non-zero result use the length now stored in newex
         * (presumably trimmed by ext4_ext_check_overlap() -- confirm there).
         */
        if (err)
                allocated = ext4_ext_get_actual_len(&newex);
        else
                allocated = map->m_len;

        /* allocate new block */
        ar.inode = inode;
        ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
        ar.logical = map->m_lblk;
        /*
         * We calculate the offset from the beginning of the cluster
         * for the logical block number, since when we allocate a
         * physical cluster, the physical block should start at the
         * same offset from the beginning of the cluster.  This is
         * needed so that future calls to get_implied_cluster_alloc()
         * work correctly.
         */
        offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
        ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
        ar.goal -= offset;
        ar.logical -= offset;
        if (S_ISREG(inode->i_mode))
                ar.flags = EXT4_MB_HINT_DATA;
        else
                /* disable in-core preallocation for non-regular files */
                ar.flags = 0;
        if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
                ar.flags |= EXT4_MB_HINT_NOPREALLOC;
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                ar.flags |= EXT4_MB_DELALLOC_RESERVED;
        if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
                ar.flags |= EXT4_MB_USE_RESERVED;
        /* err is overwritten by the allocator through its last argument. */
        newblock = ext4_mb_new_blocks(handle, &ar, &err);
        if (!newblock)
                goto out;
        /* ar.len is in clusters here; convert to blocks past 'offset'. */
        allocated_clusters = ar.len;
        ar.len = EXT4_C2B(sbi, ar.len) - offset;
        ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n",
                  ar.goal, newblock, ar.len, allocated);
        if (ar.len > allocated)
                ar.len = allocated;

got_allocated_blocks:
        /*
         * try to insert new extent into found leaf and return.
         * When arriving from an implied-cluster path above, offset is still
         * 0 and allocated_clusters is still 0.
         */
        pblk = newblock + offset;
        ext4_ext_store_pblock(&newex, pblk);
        newex.ee_len = cpu_to_le16(ar.len);
        /* Mark unwritten */
        if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
                ext4_ext_mark_unwritten(&newex);
                map->m_flags |= EXT4_MAP_UNWRITTEN;
        }

        err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
        if (err) {
                /* Only roll back clusters that this call actually allocated. */
                if (allocated_clusters) {
                        int fb_flags = 0;

                        /*
                         * free data blocks we just allocated.
                         * not a good idea to call discard here directly,
                         * but otherwise we'd need to call it every free().
                         */
                        ext4_discard_preallocations(inode);
                        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                                fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
                        ext4_free_blocks(handle, inode, NULL, newblock,
                                         EXT4_C2B(sbi, allocated_clusters),
                                         fb_flags);
                }
                goto out;
        }

        /*
         * Reduce the reserved cluster count to reflect successful deferred
         * allocation of delayed allocated clusters or direct allocation of
         * clusters discovered to be delayed allocated.  Once allocated, a
         * cluster is not included in the reserved count.
         */
        if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
                if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
                        /*
                         * When allocating delayed allocated clusters, simply
                         * reduce the reserved cluster count and claim quota
                         */
                        ext4_da_update_reserve_space(inode, allocated_clusters,
                                                        1);
                } else {
                        ext4_lblk_t lblk, len;
                        unsigned int n;

                        /*
                         * When allocating non-delayed allocated clusters
                         * (from fallocate, filemap, DIO, or clusters
                         * allocated when delalloc has been disabled by
                         * ext4_nonda_switch), reduce the reserved cluster
                         * count by the number of allocated clusters that
                         * have previously been delayed allocated.  Quota
                         * has been claimed by ext4_mb_new_blocks() above,
                         * so release the quota reservations made for any
                         * previously delayed allocated clusters.
                         */
                        lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
                        len = allocated_clusters << sbi->s_cluster_bits;
                        n = ext4_es_delayed_clu(inode, lblk, len);
                        if (n > 0)
                                ext4_da_update_reserve_space(inode, (int) n, 0);
                }
        }

        /*
         * Cache the extent and update transaction to commit on fdatasync only
         * when it is _not_ an unwritten extent.
         */
        if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);
        else
                ext4_update_inode_fsync_trans(handle, inode, 0);

        /* Publish the new mapping to the caller. */
        map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED);
        map->m_pblk = pblk;
        map->m_len = ar.len;
        allocated = map->m_len;
        ext4_ext_show_leaf(inode, path);
out:
        /* Common exit: path may be NULL here (set so after a failed lookup). */
        ext4_free_ext_path(path);

        trace_ext4_ext_map_blocks_exit(inode, flags, map,
                                       err ? err : allocated);
        return err ? err : allocated;
}

/*
 * Remove all extent mappings at or beyond the block that contains the end
 * of the file (i_size rounded up to a block boundary).
 *
 * i_disksize is set to i_size and the inode is marked dirty first; a failure
 * there is returned immediately.  Removal of the on-disk extents is retried
 * for as long as it fails with -ENOMEM.
 *
 * Returns 0 on success or a negative error code.
 */
int ext4_ext_truncate(handle_t *handle, struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        ext4_lblk_t first_gone;
        int rc;

        /*
         * TODO: optimization is possible here.
         * Probably we need not scan at all,
         * because page truncation is enough.
         */

        /* we have to know where to truncate from in crash case */
        EXT4_I(inode)->i_disksize = inode->i_size;
        rc = ext4_mark_inode_dirty(handle, inode);
        if (rc)
                return rc;

        first_gone = (inode->i_size + sb->s_blocksize - 1)
                        >> EXT4_BLOCK_SIZE_BITS(sb);
        ext4_es_remove_extent(inode, first_gone, EXT_MAX_BLOCKS - first_gone);

        /* Keep trying while the only problem is a failed memory allocation. */
        for (;;) {
                rc = ext4_ext_remove_space(inode, first_gone,
                                           EXT_MAX_BLOCKS - 1);
                if (rc != -ENOMEM)
                        break;
                memalloc_retry_wait(GFP_ATOMIC);
        }

        return rc;
}

/*
 * Map @len blocks of @file starting at logical block @offset by calling
 * ext4_map_blocks() with @flags, one journal handle per chunk.
 *
 * After every successful chunk the inode's ctime is updated and, when
 * @new_size is non-zero, the inode size is advanced to the end of the chunk
 * (capped at @new_size).  A -ENOSPC result is retried for as long as
 * ext4_should_retry_alloc() allows.
 *
 * Returns 0 on success, or the first error hit while mapping, dirtying the
 * inode or stopping the journal handle.
 */
static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
                                  ext4_lblk_t len, loff_t new_size,
                                  int flags)
{
        struct inode *inode = file_inode(file);
        struct ext4_map_blocks map;
        handle_t *handle;
        unsigned int credits;
        loff_t epos;
        int tree_depth;
        int retries = 0;
        int ret = 0, dirty_err = 0, stop_err = 0;

        BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
        map.m_lblk = offset;
        map.m_len = len;

        /*
         * A request that fits in one extent is not normalized, so that it
         * does not get split into several extents needlessly.
         */
        if (len <= EXT_UNWRITTEN_MAX_LEN)
                flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;

        /* credits to insert 1 extent into extent tree */
        credits = ext4_chunk_trans_blocks(inode, len);
        tree_depth = ext_depth(inode);

        do {
                while (len) {
                        /* A change in tree depth invalidates the estimate. */
                        if (ext_depth(inode) != tree_depth) {
                                credits = ext4_chunk_trans_blocks(inode, len);
                                tree_depth = ext_depth(inode);
                        }

                        handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
                                                    credits);
                        if (IS_ERR(handle)) {
                                ret = PTR_ERR(handle);
                                break;
                        }

                        ret = ext4_map_blocks(handle, inode, &map, flags);
                        if (ret <= 0) {
                                ext4_debug("inode #%lu: block %u: len %u: "
                                           "ext4_ext_map_blocks returned %d",
                                           inode->i_ino, map.m_lblk,
                                           map.m_len, ret);
                                ext4_mark_inode_dirty(handle, inode);
                                ext4_journal_stop(handle);
                                break;
                        }

                        /* Progress was made: allow a full retry cycle again. */
                        retries = 0;

                        /* Advance the request past the chunk just mapped. */
                        map.m_lblk += ret;
                        len -= ret;
                        map.m_len = len;

                        epos = (loff_t)map.m_lblk << inode->i_blkbits;
                        inode_set_ctime_current(inode);
                        if (new_size) {
                                if (epos > new_size)
                                        epos = new_size;
                                if (ext4_update_inode_size(inode, epos) & 0x1)
                                        inode_set_mtime_to_ts(inode,
                                                        inode_get_ctime(inode));
                        }

                        dirty_err = ext4_mark_inode_dirty(handle, inode);
                        ext4_update_inode_fsync_trans(handle, inode, 1);
                        stop_err = ext4_journal_stop(handle);
                        /* A journal-stop failure takes precedence. */
                        if (stop_err)
                                dirty_err = stop_err;
                        if (unlikely(dirty_err))
                                break;
                }
        } while (ret == -ENOSPC &&
                 ext4_should_retry_alloc(inode->i_sb, &retries));

        return ret > 0 ? dirty_err : ret;
}

/*
 * Forward declarations: ext4_fallocate() below dispatches to these for
 * FALLOC_FL_COLLAPSE_RANGE and FALLOC_FL_INSERT_RANGE requests.
 */
static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);

static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);

/*
 * Zero the byte range [@offset, @offset + @len) of @file; ext4_fallocate()
 * calls this for FALLOC_FL_ZERO_RANGE.
 *
 * The block-aligned interior of the range is converted to unwritten extents
 * after its page cache has been dropped; unaligned head/tail bytes are
 * zeroed explicitly through ext4_zero_partial_blocks().  Unless
 * FALLOC_FL_KEEP_SIZE is set in @mode, the inode size is extended when the
 * range ends beyond i_size or i_disksize.
 *
 * The inode lock is taken after the range arithmetic and held until return.
 *
 * Returns 0 on success; otherwise -EINVAL, -EOPNOTSUPP (inode is not
 * extent-mapped) or the error reported by the failing helper.
 */
static long ext4_zero_range(struct file *file, loff_t offset,
                            loff_t len, int mode)
{
        struct inode *inode = file_inode(file);
        struct address_space *mapping = file->f_mapping;
        handle_t *handle = NULL;
        unsigned int max_blocks;
        loff_t new_size = 0;
        int ret = 0;
        int flags;
        int credits;
        int partial_begin, partial_end;
        loff_t start, end;
        ext4_lblk_t lblk;
        unsigned int blkbits = inode->i_blkbits;

        trace_ext4_zero_range(inode, offset, len, mode);

        /*
         * Round up offset. This is not fallocate, we need to zero out
         * blocks, so convert interior block aligned part of the range to
         * unwritten and possibly manually zero out unaligned parts of the
         * range. Here, start and partial_begin are inclusive, end and
         * partial_end are exclusive.
         */
        start = round_up(offset, 1 << blkbits);
        end = round_down((offset + len), 1 << blkbits);

        if (start < offset || end > offset + len)
                return -EINVAL;
        /* Byte offsets within a block of the range's first and last byte. */
        partial_begin = offset & ((1 << blkbits) - 1);
        partial_end = (offset + len) & ((1 << blkbits) - 1);

        /*
         * lblk/max_blocks describe the fully covered blocks; max_blocks is 0
         * when the range does not contain a whole block.
         */
        lblk = start >> blkbits;
        max_blocks = (end >> blkbits);
        if (max_blocks < lblk)
                max_blocks = 0;
        else
                max_blocks -= lblk;

        inode_lock(inode);

        /*
         * Indirect files do not support unwritten extents
         */
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                ret = -EOPNOTSUPP;
                goto out_mutex;
        }

        /* new_size stays 0 unless the file has to grow. */
        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
            (offset + len > inode->i_size ||
             offset + len > EXT4_I(inode)->i_disksize)) {
                new_size = offset + len;
                ret = inode_newsize_ok(inode, new_size);
                if (ret)
                        goto out_mutex;
        }

        flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;

        /* Wait all existing dio workers, newcomers will block on i_rwsem */
        inode_dio_wait(inode);

        ret = file_modified(file);
        if (ret)
                goto out_mutex;

        /* Preallocate the range including the unaligned edges */
        if (partial_begin || partial_end) {
                ret = ext4_alloc_file_blocks(file,
                                round_down(offset, 1 << blkbits) >> blkbits,
                                (round_up((offset + len), 1 << blkbits) -
                                 round_down(offset, 1 << blkbits)) >> blkbits,
                                new_size, flags);
                if (ret)
                        goto out_mutex;

        }

        /* Zero range excluding the unaligned edges */
        if (max_blocks > 0) {
                flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
                          EXT4_EX_NOCACHE);

                /*
                 * Prevent page faults from reinstantiating pages we have
                 * released from page cache.
                 */
                filemap_invalidate_lock(mapping);

                ret = ext4_break_layouts(inode);
                if (ret) {
                        filemap_invalidate_unlock(mapping);
                        goto out_mutex;
                }

                ret = ext4_update_disksize_before_punch(inode, offset, len);
                if (ret) {
                        filemap_invalidate_unlock(mapping);
                        goto out_mutex;
                }

                /*
                 * For journalled data we need to write (and checkpoint) pages
                 * before discarding page cache to avoid inconsistent data on
                 * disk in case of crash before zeroing trans is committed.
                 */
                if (ext4_should_journal_data(inode)) {
                        ret = filemap_write_and_wait_range(mapping, start,
                                                           end - 1);
                        if (ret) {
                                filemap_invalidate_unlock(mapping);
                                goto out_mutex;
                        }
                }

                /* Now release the pages and zero block aligned part of pages */
                truncate_pagecache_range(inode, start, end - 1);
                inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));

                ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
                                             flags);
                filemap_invalidate_unlock(mapping);
                if (ret)
                        goto out_mutex;
        }
        /* A fully block-aligned range needs no edge zeroing. */
        if (!partial_begin && !partial_end)
                goto out_mutex;

        /*
         * In worst case we have to writeout two nonadjacent unwritten
         * blocks and update the inode
         */
        credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
        if (ext4_should_journal_data(inode))
                credits += 2;
        handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                ext4_std_error(inode->i_sb, ret);
                goto out_mutex;
        }

        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        if (new_size)
                ext4_update_inode_size(inode, new_size);
        ret = ext4_mark_inode_dirty(handle, inode);
        if (unlikely(ret))
                goto out_handle;
        /* Zero out partial block at the edges of the range */
        ret = ext4_zero_partial_blocks(handle, inode, offset, len);
        if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);

        if (file->f_flags & O_SYNC)
                ext4_handle_sync(handle);

out_handle:
        ext4_journal_stop(handle);
out_mutex:
        inode_unlock(inode);
        return ret;
}

/*
 * ext4's fallocate file operation, reached from the sys_fallocate system
 * call.
 *
 * Punch-hole, collapse-range, insert-range and zero-range requests are
 * handed to their dedicated helpers.  Anything else is a plain
 * preallocation: the requested range is backed with unwritten extents.
 *
 * Preallocation works on extent-mapped files only.  For block-mapped files,
 * posix_fallocate should fall back to writing zeroes to the required new
 * blocks (the same behavior which is expected for file systems which do not
 * support the fallocate() system call).
 *
 * Returns 0 on success or a negative error code; -EOPNOTSUPP for unsupported
 * mode bits or unsupported inode types.
 */
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
        const int supported_modes =
                FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
                FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
                FALLOC_FL_INSERT_RANGE;
        struct inode *inode = file_inode(file);
        unsigned int blkbits = inode->i_blkbits;
        unsigned int max_blocks;
        loff_t new_size = 0;
        ext4_lblk_t lblk;
        int ret;

        /*
         * Encrypted inodes can't handle collapse range or insert
         * range since we would need to re-encrypt blocks with a
         * different IV or XTS tweak (which are based on the logical
         * block number).
         */
        if (IS_ENCRYPTED(inode) &&
            (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
                return -EOPNOTSUPP;

        /* Reject any mode bit this implementation does not know about. */
        if (mode & ~supported_modes)
                return -EOPNOTSUPP;

        inode_lock(inode);
        ret = ext4_convert_inline_data(inode);
        inode_unlock(inode);
        if (ret)
                return ret;

        /* Specialised operations each have their own helper. */
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                ret = ext4_punch_hole(file, offset, len);
                return ret;
        }
        if (mode & FALLOC_FL_COLLAPSE_RANGE) {
                ret = ext4_collapse_range(file, offset, len);
                return ret;
        }
        if (mode & FALLOC_FL_INSERT_RANGE) {
                ret = ext4_insert_range(file, offset, len);
                return ret;
        }
        if (mode & FALLOC_FL_ZERO_RANGE) {
                ret = ext4_zero_range(file, offset, len, mode);
                return ret;
        }

        /* Plain preallocation from here on. */
        trace_ext4_fallocate_enter(inode, offset, len, mode);
        lblk = offset >> blkbits;
        max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);

        inode_lock(inode);

        /* Preallocation is supported for extent-based files only. */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ret = -EOPNOTSUPP;
                goto unlock;
        }

        /* Work out whether the file has to grow, and whether it may. */
        if (!(mode & FALLOC_FL_KEEP_SIZE)) {
                loff_t range_end = offset + len;

                if (range_end > inode->i_size ||
                    range_end > EXT4_I(inode)->i_disksize) {
                        new_size = range_end;
                        ret = inode_newsize_ok(inode, new_size);
                        if (ret)
                                goto unlock;
                }
        }

        /* Wait all existing dio workers, newcomers will block on i_rwsem */
        inode_dio_wait(inode);

        ret = file_modified(file);
        if (ret)
                goto unlock;

        ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
                                     EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
        if (ret)
                goto unlock;

        /* For O_SYNC files with a journal, commit before returning. */
        if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal)
                ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
                                     EXT4_I(inode)->i_sync_tid);

unlock:
        inode_unlock(inode);
        trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
        return ret;
}

/*
 * Convert a byte range of an inode to written extents.
 *
 * The caller passes the start @offset and the size @len; every unwritten
 * extent inside that range is converted by repeatedly calling
 * ext4_map_blocks() with EXT4_GET_BLOCKS_IO_CONVERT_EXT.
 *
 * This function is called from the direct IO end io call back function, to
 * convert the fallocated extents after IO is completed.
 *
 * When @handle is NULL a separate journal handle is started and stopped for
 * each chunk; otherwise the caller's handle is used throughout.
 *
 * Returns 0 on success.
 */
int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                                   loff_t offset, ssize_t len)
{
        struct ext4_map_blocks map;
        unsigned int blkbits = inode->i_blkbits;
        unsigned int remaining;
        unsigned int credits = 0;
        int mapped = 0, rc = 0, stop_rc;

        map.m_lblk = offset >> blkbits;
        remaining = EXT4_MAX_BLOCKS(len, offset, blkbits);

        /* credits to insert 1 extent into extent tree */
        if (!handle)
                credits = ext4_chunk_trans_blocks(inode, remaining);

        while (mapped >= 0 && mapped < remaining) {
                /* Step over what the previous pass converted. */
                remaining -= mapped;
                map.m_lblk += mapped;
                map.m_len = remaining;

                if (credits) {
                        handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
                                                    credits);
                        if (IS_ERR(handle)) {
                                mapped = PTR_ERR(handle);
                                break;
                        }
                }

                mapped = ext4_map_blocks(handle, inode, &map,
                                         EXT4_GET_BLOCKS_IO_CONVERT_EXT);
                if (mapped <= 0)
                        ext4_warning(inode->i_sb,
                                     "inode #%lu: block %u: len %u: "
                                     "ext4_ext_map_blocks returned %d",
                                     inode->i_ino, map.m_lblk,
                                     map.m_len, mapped);

                rc = ext4_mark_inode_dirty(handle, inode);
                if (credits) {
                        /* A journal-stop failure overrides the dirty result. */
                        stop_rc = ext4_journal_stop(handle);
                        if (unlikely(stop_rc))
                                rc = stop_rc;
                }

                if (mapped <= 0 || rc)
                        break;
        }

        return mapped > 0 ? rc : mapped;
}

/*
 * Convert every range recorded on @io_end's list_vec via
 * ext4_convert_unwritten_extents().
 *
 * If @handle is non-NULL it is first turned into a running handle with
 * ext4_journal_start_reserved() and stopped here once the walk is done.
 * Returns the first negative conversion result, otherwise the result of
 * stopping the handle (0 when no handle was passed).
 */
int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
{
        struct ext4_io_end_vec *vec;
        int conv_err = 0, stop_err = 0;

        /*
         * With a reserved transaction, all conversions share it.  Without
         * one, NULL is passed down and each extent conversion starts its own
         * smaller transaction.
         */
        if (handle) {
                handle = ext4_journal_start_reserved(handle,
                                                     EXT4_HT_EXT_CONVERT);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);
        }

        list_for_each_entry(vec, &io_end->list_vec, list) {
                conv_err = ext4_convert_unwritten_extents(handle,
                                                          io_end->inode,
                                                          vec->offset,
                                                          vec->size);
                /* Stop at the first non-zero result. */
                if (conv_err)
                        break;
        }

        if (handle)
                stop_err = ext4_journal_stop(handle);

        if (conv_err < 0)
                return conv_err;
        return stop_err;
}

static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap)
{
        __u64 physical = 0;
        __u64 length = 0;
        int blockbits = inode->i_sb->s_blocksize_bits;
        int error = 0;
        u16 iomap_type;

        /* in-inode? */
        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
                struct ext4_iloc iloc;
                int offset;        /* offset of xattr in inode */

                error = ext4_get_inode_loc(inode, &iloc);
                if (error)
                        return error;
                physical = (__u64)iloc.bh->b_blocknr << blockbits;
                offset = EXT4_GOOD_OLD_INODE_SIZE +
                                EXT4_I(inode)->i_extra_isize;
                physical += offset;
                length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
                brelse(iloc.bh);
                iomap_type = IOMAP_INLINE;
        } else if (EXT4_I(inode)->i_file_acl) { /* external block */
                physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
                length = inode->i_sb->s_blocksize;
                iomap_type = IOMAP_MAPPED;
        } else {
                /* no in-inode or external block for xattr, so return -ENOENT */
                error = -ENOENT;
                goto out;
        }

        iomap->addr = physical;
        iomap->offset = 0;
        iomap->length = length;
        iomap->type = iomap_type;
        iomap->flags = 0;
out:
        return error;
}

/*
 * ->iomap_begin for xattr fiemap: describe the xattr area in @iomap and
 * return -ENOENT once @offset is at or past the end of that area.
 * @length, @flags and @srcmap are not used.
 */
static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset,
                                  loff_t length, unsigned flags,
                                  struct iomap *iomap, struct iomap *srcmap)
{
        int error = ext4_iomap_xattr_fiemap(inode, iomap);

        if (error)
                return error;
        if (offset >= iomap->length)
                return -ENOENT;
        return 0;
}

/* iomap operations handed to iomap_fiemap() when FIEMAP_FLAG_XATTR is set */
static const struct iomap_ops ext4_iomap_xattr_ops = {
        .iomap_begin                = ext4_iomap_xattr_begin,
};

/*
 * Validate a fiemap byte range against the size limit that applies to
 * @inode: s_maxbytes for extent-mapped files, s_bitmap_maxbytes otherwise.
 *
 * Returns -EINVAL for a zero *@len, -EFBIG when @start is beyond the limit,
 * and 0 otherwise.  On success *@len may have been reduced so that
 * @start + *@len does not exceed the limit.
 */
static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
{
        u64 limit;

        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                limit = inode->i_sb->s_maxbytes;
        else
                limit = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;

        if (!*len)
                return -EINVAL;
        if (start > limit)
                return -EFBIG;

        /* Clamp the request to what the filesystem can address. */
        if (*len > limit || start > limit - *len)
                *len = limit - start;

        return 0;
}

/*
 * ext4 fiemap entry point.
 *
 * Handles FIEMAP_FLAG_CACHE by precaching the extent tree, range-checks the
 * request, then reports either the xattr location (FIEMAP_FLAG_XATTR) or the
 * file's data mappings through iomap_fiemap().  Both flags are cleared from
 * fi_flags once they have been acted upon.
 */
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                u64 start, u64 len)
{
        const struct iomap_ops *ops = &ext4_iomap_report_ops;
        int error;

        if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
                error = ext4_ext_precache(inode);
                if (error)
                        return error;
                fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
        }

        /*
         * Bitmap-mapped files may have a size limit below s_maxbytes, so the
         * length is checked here as well instead of trusting only the
         * generic check.
         */
        error = ext4_fiemap_check_ranges(inode, start, &len);
        if (error)
                return error;

        if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
                fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
                ops = &ext4_iomap_xattr_ops;
        }

        return iomap_fiemap(inode, fieinfo, start, len, ops);
}

/*
 * Report the extent status cache contents for the byte range
 * [@start, @start + @len) of @inode through @fieinfo.
 *
 * Returns 0 without reporting anything for inline-data inodes, an error from
 * precaching / fiemap_prep() / range checking, or otherwise the result of
 * ext4_fill_es_cache_info().
 */
int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
                      __u64 start, __u64 len)
{
        ext4_lblk_t first_blk, nr_blks;
        __u64 end_blk;
        int error = 0;

        if (ext4_has_inline_data(inode)) {
                int still_inline;

                /* Re-test with xattr_sem held for reading. */
                down_read(&EXT4_I(inode)->xattr_sem);
                still_inline = ext4_has_inline_data(inode);
                up_read(&EXT4_I(inode)->xattr_sem);
                if (still_inline)
                        return 0;
        }

        if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
                error = ext4_ext_precache(inode);
                if (error)
                        return error;
                fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
        }

        error = fiemap_prep(inode, fieinfo, start, &len, 0);
        if (error)
                return error;

        error = ext4_fiemap_check_ranges(inode, start, &len);
        if (error)
                return error;

        /* Convert the byte range into a block range capped at the maximum. */
        first_blk = start >> inode->i_sb->s_blocksize_bits;
        end_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
        if (end_blk >= EXT_MAX_BLOCKS)
                end_blk = EXT_MAX_BLOCKS-1;
        nr_blks = ((ext4_lblk_t) end_blk) - first_blk + 1;

        /* Hand the block range over for reporting to the user. */
        return ext4_fill_es_cache_info(inode, first_blk, nr_blks, fieinfo);
}

/*
 * ext4_ext_shift_path_extents:
 * Shift the extents of a path structure lying between path[depth].p_ext
 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
 * if it is right shift or left shift operation.
 *
 * After the leaf has been shifted, index entries above it are adjusted as
 * well, but only if the shift started at the first extent of the leaf, and
 * only for as long as the adjusted index entry is the first one in its node.
 *
 * Returns 0 on success, -EFSCORRUPTED if the leaf has no current extent,
 * -EAGAIN if ext4_datasem_ensure_credits() returned a positive value, or
 * the error returned by one of the helpers called here.
 */
static int
ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
                            struct inode *inode, handle_t *handle,
                            enum SHIFT_DIRECTION SHIFT)
{
        int depth, err = 0;
        struct ext4_extent *ex_start, *ex_last;
        bool update = false;
        int credits, restart_credits;
        depth = path->p_depth;

        while (depth >= 0) {
                /* Leaf level: only entered on the first pass of the loop. */
                if (depth == path->p_depth) {
                        ex_start = path[depth].p_ext;
                        if (!ex_start)
                                return -EFSCORRUPTED;

                        ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
                        /* leaf + sb + inode */
                        credits = 3;
                        if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
                                /* first extent moves: index levels follow */
                                update = true;
                                /* extent tree + sb + inode */
                                credits = depth + 2;
                        }

                        restart_credits = ext4_writepage_trans_blocks(inode);
                        err = ext4_datasem_ensure_credits(handle, inode, credits,
                                        restart_credits, 0);
                        if (err) {
                                /*
                                 * A positive result is reported as -EAGAIN;
                                 * ext4_ext_shift_extents() restarts its walk
                                 * when it sees that value.
                                 */
                                if (err > 0)
                                        err = -EAGAIN;
                                goto out;
                        }

                        err = ext4_ext_get_access(handle, inode, path + depth);
                        if (err)
                                goto out;

                        /*
                         * Left shifts walk forward from ex_start, right
                         * shifts walk backward from ex_last.
                         */
                        while (ex_start <= ex_last) {
                                if (SHIFT == SHIFT_LEFT) {
                                        le32_add_cpu(&ex_start->ee_block,
                                                -shift);
                                        /* Try to merge to the left. */
                                        /*
                                         * On a non-zero merge result the end
                                         * pointer is pulled back and ex_start
                                         * stays put (presumably because the
                                         * merge removed one entry from the
                                         * leaf - confirm against
                                         * ext4_ext_try_to_merge_right()).
                                         */
                                        if ((ex_start >
                                            EXT_FIRST_EXTENT(path[depth].p_hdr))
                                            &&
                                            ext4_ext_try_to_merge_right(inode,
                                            path, ex_start - 1))
                                                ex_last--;
                                        else
                                                ex_start++;
                                } else {
                                        le32_add_cpu(&ex_last->ee_block, shift);
                                        ext4_ext_try_to_merge_right(inode, path,
                                                ex_last);
                                        ex_last--;
                                }
                        }
                        err = ext4_ext_dirty(handle, inode, path + depth);
                        if (err)
                                goto out;

                        /* Stop here for a depth-0 tree or an untouched first extent. */
                        if (--depth < 0 || !update)
                                break;
                }

                /* Update index too */
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
                        goto out;

                if (SHIFT == SHIFT_LEFT)
                        le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
                else
                        le32_add_cpu(&path[depth].p_idx->ei_block, shift);
                err = ext4_ext_dirty(handle, inode, path + depth);
                if (err)
                        goto out;

                /* we are done if current index is not a starting index */
                if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
                        break;

                depth--;
        }

out:
        return err;
}

/*
 * ext4_ext_shift_extents:
 * All the extents which lie in the range from @start to the last allocated
 * block for the @inode are shifted either towards left or right (depending
 * upon @SHIFT) by @shift blocks.
 *
 * The shift is done leaf by leaf through ext4_ext_shift_path_extents().  If
 * that helper returns -EAGAIN, the walk is restarted from the position that
 * was saved before the failed call.
 *
 * On success, 0 is returned, error otherwise: -EINVAL if the shift does not
 * fit (hole on the left too small, or the last extent would pass
 * EXT_MAX_BLOCKS), -EFSCORRUPTED if a lookup unexpectedly finds no extent,
 * or the error from a helper.  0 is also returned when the tree has no
 * extent at all.
 */
static int
ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
                       ext4_lblk_t start, ext4_lblk_t shift,
                       enum SHIFT_DIRECTION SHIFT)
{
        struct ext4_ext_path *path;
        int ret = 0, depth;
        struct ext4_extent *extent;
        ext4_lblk_t stop, *iterator, ex_start, ex_end;
        /* EXT_MAX_BLOCKS means "no restart position saved yet" */
        ext4_lblk_t tmp = EXT_MAX_BLOCKS;

        /* Let path point to the last extent */
        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
                                EXT4_EX_NOCACHE);
        if (IS_ERR(path))
                return PTR_ERR(path);

        depth = path->p_depth;
        extent = path[depth].p_ext;
        if (!extent)
                goto out;

        stop = le32_to_cpu(extent->ee_block);

       /*
        * For left shifts, make sure the hole on the left is big enough to
        * accommodate the shift.  For right shifts, make sure the last extent
        * won't be shifted beyond EXT_MAX_BLOCKS.
        */
        if (SHIFT == SHIFT_LEFT) {
                path = ext4_find_extent(inode, start - 1, &path,
                                        EXT4_EX_NOCACHE);
                if (IS_ERR(path))
                        return PTR_ERR(path);
                depth = path->p_depth;
                extent =  path[depth].p_ext;
                if (extent) {
                        ex_start = le32_to_cpu(extent->ee_block);
                        ex_end = le32_to_cpu(extent->ee_block) +
                                ext4_ext_get_actual_len(extent);
                } else {
                        ex_start = 0;
                        ex_end = 0;
                }

                if ((start == ex_start && shift > ex_start) ||
                    (shift > start - ex_end)) {
                        ret = -EINVAL;
                        goto out;
                }
        } else {
                if (shift > EXT_MAX_BLOCKS -
                    (stop + ext4_ext_get_actual_len(extent))) {
                        ret = -EINVAL;
                        goto out;
                }
        }

        /*
         * In case of left shift, iterator points to start and it is increased
         * till we reach stop. In case of right shift, iterator points to stop
         * and it is decreased till we reach start.
         */
again:
        ret = 0;
        if (SHIFT == SHIFT_LEFT)
                iterator = &start;
        else
                iterator = &stop;

        /* On a restart, resume from the position saved before the failed call. */
        if (tmp != EXT_MAX_BLOCKS)
                *iterator = tmp;

        /*
         * Its safe to start updating extents.  Start and stop are unsigned, so
         * in case of right shift if extent with 0 block is reached, iterator
         * becomes NULL to indicate the end of the loop.
         */
        while (iterator && start <= stop) {
                path = ext4_find_extent(inode, *iterator, &path,
                                        EXT4_EX_NOCACHE);
                if (IS_ERR(path))
                        return PTR_ERR(path);
                depth = path->p_depth;
                extent = path[depth].p_ext;
                if (!extent) {
                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
                                         (unsigned long) *iterator);
                        /*
                         * Leave through "out" so the path that was just
                         * looked up is released instead of being leaked.
                         */
                        ret = -EFSCORRUPTED;
                        goto out;
                }
                if (SHIFT == SHIFT_LEFT && *iterator >
                    le32_to_cpu(extent->ee_block)) {
                        /* Hole, move to the next extent */
                        if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
                                path[depth].p_ext++;
                        } else {
                                *iterator = ext4_ext_next_allocated_block(path);
                                continue;
                        }
                }

                /* Remember where this leaf started in case of -EAGAIN. */
                tmp = *iterator;
                if (SHIFT == SHIFT_LEFT) {
                        /* Next pass continues right after this leaf's last extent. */
                        extent = EXT_LAST_EXTENT(path[depth].p_hdr);
                        *iterator = le32_to_cpu(extent->ee_block) +
                                        ext4_ext_get_actual_len(extent);
                } else {
                        extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
                        if (le32_to_cpu(extent->ee_block) > start)
                                *iterator = le32_to_cpu(extent->ee_block) - 1;
                        else if (le32_to_cpu(extent->ee_block) == start)
                                iterator = NULL;
                        else {
                                /*
                                 * @start falls inside this leaf: find the
                                 * first extent at or after @start.
                                 */
                                extent = EXT_LAST_EXTENT(path[depth].p_hdr);
                                while (le32_to_cpu(extent->ee_block) >= start)
                                        extent--;

                                /* nothing in this leaf is at or after @start */
                                if (extent == EXT_LAST_EXTENT(path[depth].p_hdr))
                                        break;

                                extent++;
                                iterator = NULL;
                        }
                        path[depth].p_ext = extent;
                }
                ret = ext4_ext_shift_path_extents(path, shift, inode,
                                handle, SHIFT);
                /* iterator can be NULL which means we should break */
                if (ret == -EAGAIN)
                        goto again;
                if (ret)
                        break;
        }
out:
        ext4_free_ext_path(path);
        return ret;
}

/*
 * ext4_collapse_range:
 * This implements the fallocate's collapse range functionality for ext4.
 * The blocks covering [@offset, @offset + @len) are removed, the extents
 * behind them are shifted left by the same amount, and the inode size is
 * reduced by @len.
 *
 * Returns 0 on success and a negative error code otherwise: -EOPNOTSUPP
 * for inodes without the extents flag, -EINVAL if @offset or @len is not
 * cluster-size aligned or the range reaches i_size, or the error from one
 * of the helpers called here.
 */
static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        struct address_space *mapping = inode->i_mapping;
        ext4_lblk_t punch_start, punch_stop;
        handle_t *handle;
        unsigned int credits;
        loff_t new_size, ioffset;
        int ret;

        /*
         * We need to test this early because xfstests assumes that a
         * collapse range of (0, 1) will return EOPNOTSUPP if the file
         * system does not support collapse range.
         */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return -EOPNOTSUPP;

        /* Collapse range works only on fs cluster size aligned regions. */
        if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
                return -EINVAL;

        trace_ext4_collapse_range(inode, offset, len);

        /* Block range to remove: [punch_start, punch_stop) */
        punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
        punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);

        inode_lock(inode);
        /*
         * There is no need to overlap collapse range with EOF, in which case
         * it is effectively a truncate operation
         */
        if (offset + len >= inode->i_size) {
                ret = -EINVAL;
                goto out_mutex;
        }

        /* Currently just for extent based files */
        /* (same test as above, repeated now that the inode lock is held) */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ret = -EOPNOTSUPP;
                goto out_mutex;
        }

        /* Wait for existing dio to complete */
        inode_dio_wait(inode);

        ret = file_modified(file);
        if (ret)
                goto out_mutex;

        /*
         * Prevent page faults from reinstantiating pages we have released from
         * page cache.
         */
        filemap_invalidate_lock(mapping);

        ret = ext4_break_layouts(inode);
        if (ret)
                goto out_mmap;

        /*
         * Need to round down offset to be aligned with page size boundary
         * for page size > block size.
         */
        ioffset = round_down(offset, PAGE_SIZE);
        /*
         * Write tail of the last page before removed range since it will get
         * removed from the page cache below.
         */
        ret = filemap_write_and_wait_range(mapping, ioffset, offset);
        if (ret)
                goto out_mmap;
        /*
         * Write data that will be shifted to preserve them when discarding
         * page cache below. We are also protected from pages becoming dirty
         * by i_rwsem and invalidate_lock.
         */
        ret = filemap_write_and_wait_range(mapping, offset + len,
                                           LLONG_MAX);
        if (ret)
                goto out_mmap;
        truncate_pagecache(inode, ioffset);

        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out_mmap;
        }
        ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);

        /* i_data_sem is held for writing across the remove and the shift. */
        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_discard_preallocations(inode);
        ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);

        /* ext4_ext_remove_space() takes an inclusive end block. */
        ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
        if (ret) {
                up_write(&EXT4_I(inode)->i_data_sem);
                goto out_stop;
        }
        ext4_discard_preallocations(inode);

        /* Close the gap: move everything from punch_stop onward to the left. */
        ret = ext4_ext_shift_extents(inode, handle, punch_stop,
                                     punch_stop - punch_start, SHIFT_LEFT);
        if (ret) {
                up_write(&EXT4_I(inode)->i_data_sem);
                goto out_stop;
        }

        new_size = inode->i_size - len;
        i_size_write(inode, new_size);
        EXT4_I(inode)->i_disksize = new_size;

        up_write(&EXT4_I(inode)->i_data_sem);
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        ret = ext4_mark_inode_dirty(handle, inode);
        ext4_update_inode_fsync_trans(handle, inode, 1);

        /* Unwind in reverse order of acquisition. */
out_stop:
        ext4_journal_stop(handle);
out_mmap:
        filemap_invalidate_unlock(mapping);
out_mutex:
        inode_unlock(inode);
        return ret;
}

/*
 * ext4_insert_range:
 * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
 * The data blocks starting from @offset to the EOF are shifted by @len
 * towards right to create a hole in the @inode. Inode size is increased
 * by len bytes.
 *
 * Returns 0 on success, error otherwise: -EOPNOTSUPP for inodes without the
 * extents flag, -EINVAL if @offset or @len is not cluster-size aligned or
 * @offset is not below i_size, -EFBIG if the grown file would exceed
 * s_maxbytes, or the error from one of the helpers called here (including
 * a failed extent lookup).
 */
static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        struct address_space *mapping = inode->i_mapping;
        handle_t *handle;
        struct ext4_ext_path *path;
        struct ext4_extent *extent;
        ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
        unsigned int credits, ee_len;
        int ret = 0, depth, split_flag = 0;
        loff_t ioffset;

        /*
         * We need to test this early because xfstests assumes that an
         * insert range of (0, 1) will return EOPNOTSUPP if the file
         * system does not support insert range.
         */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return -EOPNOTSUPP;

        /* Insert range works only on fs cluster size aligned regions. */
        if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
                return -EINVAL;

        trace_ext4_insert_range(inode, offset, len);

        offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
        len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);

        inode_lock(inode);
        /* Currently just for extent based files */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ret = -EOPNOTSUPP;
                goto out_mutex;
        }

        /* Check whether the maximum file size would be exceeded */
        if (len > inode->i_sb->s_maxbytes - inode->i_size) {
                ret = -EFBIG;
                goto out_mutex;
        }

        /* Offset must be less than i_size */
        if (offset >= inode->i_size) {
                ret = -EINVAL;
                goto out_mutex;
        }

        /* Wait for existing dio to complete */
        inode_dio_wait(inode);

        ret = file_modified(file);
        if (ret)
                goto out_mutex;

        /*
         * Prevent page faults from reinstantiating pages we have released from
         * page cache.
         */
        filemap_invalidate_lock(mapping);

        ret = ext4_break_layouts(inode);
        if (ret)
                goto out_mmap;

        /*
         * Need to round down to align start offset to page size boundary
         * for page size > block size.
         */
        ioffset = round_down(offset, PAGE_SIZE);
        /* Write out all dirty pages */
        ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
                        LLONG_MAX);
        if (ret)
                goto out_mmap;
        truncate_pagecache(inode, ioffset);

        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out_mmap;
        }
        ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);

        /* Expand file to avoid data loss if there is error while shifting */
        inode->i_size += len;
        EXT4_I(inode)->i_disksize += len;
        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        ret = ext4_mark_inode_dirty(handle, inode);
        if (ret)
                goto out_stop;

        down_write(&EXT4_I(inode)->i_data_sem);
        ext4_discard_preallocations(inode);

        path = ext4_find_extent(inode, offset_lblk, NULL, 0);
        if (IS_ERR(path)) {
                /*
                 * Report the lookup failure; ret is still 0 from the
                 * ext4_mark_inode_dirty() call above at this point.
                 */
                ret = PTR_ERR(path);
                up_write(&EXT4_I(inode)->i_data_sem);
                goto out_stop;
        }

        depth = ext_depth(inode);
        extent = path[depth].p_ext;
        if (extent) {
                ee_start_lblk = le32_to_cpu(extent->ee_block);
                ee_len = ext4_ext_get_actual_len(extent);

                /*
                 * If offset_lblk is not the starting block of extent, split
                 * the extent @offset_lblk
                 */
                if ((offset_lblk > ee_start_lblk) &&
                                (offset_lblk < (ee_start_lblk + ee_len))) {
                        /* keep both halves unwritten if the extent was */
                        if (ext4_ext_is_unwritten(extent))
                                split_flag = EXT4_EXT_MARK_UNWRIT1 |
                                        EXT4_EXT_MARK_UNWRIT2;
                        ret = ext4_split_extent_at(handle, inode, &path,
                                        offset_lblk, split_flag,
                                        EXT4_EX_NOCACHE |
                                        EXT4_GET_BLOCKS_PRE_IO |
                                        EXT4_GET_BLOCKS_METADATA_NOFAIL);
                }

                ext4_free_ext_path(path);
                if (ret < 0) {
                        up_write(&EXT4_I(inode)->i_data_sem);
                        goto out_stop;
                }
        } else {
                ext4_free_ext_path(path);
        }

        ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);

        /*
         * if offset_lblk lies in a hole which is at start of file, use
         * ee_start_lblk to shift extents
         */
        ret = ext4_ext_shift_extents(inode, handle,
                max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);

        up_write(&EXT4_I(inode)->i_data_sem);
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
        if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);

        /* Unwind in reverse order of acquisition. */
out_stop:
        ext4_journal_stop(handle);
out_mmap:
        filemap_invalidate_unlock(mapping);
out_mutex:
        inode_unlock(inode);
        return ret;
}

/**
 * ext4_swap_extents() - Swap extents between two inodes
 * @handle: handle for this transaction
 * @inode1:        First inode
 * @inode2:        Second inode
 * @lblk1:        Start block for first inode
 * @lblk2:        Start block for second inode
 * @count:        Number of blocks to swap
 * @unwritten: Mark second inode's extents as unwritten after swap
 * @erp:        Pointer to save error value
 *
 * This helper routine does exactly what its name promises: "swap extents".
 * All other stuff such as page-cache locking consistency, bh mapping
 * consistency or extent's data copying must be performed by caller.
 * Locking:
 *                i_rwsem is held for both inodes
 *                 i_data_sem is locked for write for both inodes
 * Assumptions:
 *                All pages from requested range are locked for both inodes
 *
 * Return: the number of blocks whose extents were actually swapped.  Errors
 * are reported through @erp, not through the return value, so a short count
 * may be returned together with an error stored in @erp.
 */
int
ext4_swap_extents(handle_t *handle, struct inode *inode1,
                  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
                  ext4_lblk_t count, int unwritten, int *erp)
{
        struct ext4_ext_path *path1 = NULL;
        struct ext4_ext_path *path2 = NULL;
        int replaced_count = 0;

        BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
        BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
        BUG_ON(!inode_is_locked(inode1));
        BUG_ON(!inode_is_locked(inode2));

        ext4_es_remove_extent(inode1, lblk1, count);
        ext4_es_remove_extent(inode2, lblk2, count);

        /*
         * Each pass looks both paths up afresh and either skips a hole,
         * splits an extent at a boundary (then retries), or swaps one pair
         * of equally sized extents.
         */
        while (count) {
                struct ext4_extent *ex1, *ex2, tmp_ex;
                ext4_lblk_t e1_blk, e2_blk;
                int e1_len, e2_len, len;
                int split = 0;

                path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
                if (IS_ERR(path1)) {
                        *erp = PTR_ERR(path1);
                        path1 = NULL;
                /*
                 * Common exit: zeroing count ends the loop after the
                 * cleanup at "repeat".  Reached by goto from below too.
                 */
                finish:
                        count = 0;
                        goto repeat;
                }
                path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
                if (IS_ERR(path2)) {
                        *erp = PTR_ERR(path2);
                        path2 = NULL;
                        goto finish;
                }
                ex1 = path1[path1->p_depth].p_ext;
                ex2 = path2[path2->p_depth].p_ext;
                /* Do we have something to swap ? */
                if (unlikely(!ex2 || !ex1))
                        goto finish;

                e1_blk = le32_to_cpu(ex1->ee_block);
                e2_blk = le32_to_cpu(ex2->ee_block);
                e1_len = ext4_ext_get_actual_len(ex1);
                e2_len = ext4_ext_get_actual_len(ex2);

                /* Hole handling */
                if (!in_range(lblk1, e1_blk, e1_len) ||
                    !in_range(lblk2, e2_blk, e2_len)) {
                        ext4_lblk_t next1, next2;

                        /* if hole after extent, then go to next extent */
                        next1 = ext4_ext_next_allocated_block(path1);
                        next2 = ext4_ext_next_allocated_block(path2);
                        /* If hole before extent, then shift to that extent */
                        if (e1_blk > lblk1)
                                next1 = e1_blk;
                        if (e2_blk > lblk2)
                                next2 = e2_blk;
                        /* Do we have something to swap */
                        if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
                                goto finish;
                        /* Move to the rightmost boundary */
                        len = next1 - lblk1;
                        if (len < next2 - lblk2)
                                len = next2 - lblk2;
                        if (len > count)
                                len = count;
                        /* skipped blocks are not added to replaced_count */
                        lblk1 += len;
                        lblk2 += len;
                        count -= len;
                        goto repeat;
                }

                /* Prepare left boundary */
                if (e1_blk < lblk1) {
                        split = 1;
                        *erp = ext4_force_split_extent_at(handle, inode1,
                                                &path1, lblk1, 0);
                        if (unlikely(*erp))
                                goto finish;
                }
                if (e2_blk < lblk2) {
                        split = 1;
                        *erp = ext4_force_split_extent_at(handle, inode2,
                                                &path2,  lblk2, 0);
                        if (unlikely(*erp))
                                goto finish;
                }
                /* ext4_split_extent_at() may result in leaf extent split,
                 * path must be revalidated. */
                if (split)
                        goto repeat;

                /* Prepare right boundary */
                len = count;
                if (len > e1_blk + e1_len - lblk1)
                        len = e1_blk + e1_len - lblk1;
                if (len > e2_blk + e2_len - lblk2)
                        len = e2_blk + e2_len - lblk2;

                if (len != e1_len) {
                        split = 1;
                        *erp = ext4_force_split_extent_at(handle, inode1,
                                                &path1, lblk1 + len, 0);
                        if (unlikely(*erp))
                                goto finish;
                }
                if (len != e2_len) {
                        split = 1;
                        *erp = ext4_force_split_extent_at(handle, inode2,
                                                &path2, lblk2 + len, 0);
                        if (*erp)
                                goto finish;
                }
                /* ext4_split_extent_at() may result in leaf extent split,
                 * path must be revalidated. */
                if (split)
                        goto repeat;

                BUG_ON(e2_len != e1_len);
                *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
                if (unlikely(*erp))
                        goto finish;
                *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
                if (unlikely(*erp))
                        goto finish;

                /* Both extents are fully inside boundaries. Swap it now */
                /* tmp_ex keeps ex1's original contents for the exchange */
                tmp_ex = *ex1;
                ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
                ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
                ex1->ee_len = cpu_to_le16(e2_len);
                ex2->ee_len = cpu_to_le16(e1_len);
                if (unwritten)
                        ext4_ext_mark_unwritten(ex2);
                if (ext4_ext_is_unwritten(&tmp_ex))
                        ext4_ext_mark_unwritten(ex1);

                ext4_ext_try_to_merge(handle, inode2, path2, ex2);
                ext4_ext_try_to_merge(handle, inode1, path1, ex1);
                *erp = ext4_ext_dirty(handle, inode2, path2 +
                                      path2->p_depth);
                if (unlikely(*erp))
                        goto finish;
                *erp = ext4_ext_dirty(handle, inode1, path1 +
                                      path1->p_depth);
                /*
                 * Looks scary, doesn't it? The second inode already points
                 * to the new blocks and was successfully dirtied. Luckily an
                 * error can only happen here due to a journal error, so the
                 * full transaction will be aborted anyway.
                 */
                if (unlikely(*erp))
                        goto finish;
                lblk1 += len;
                lblk2 += len;
                replaced_count += len;
                count -= len;

        /* Per-pass cleanup: both paths are released before the next lookup. */
        repeat:
                ext4_free_ext_path(path1);
                ext4_free_ext_path(path2);
                path1 = path2 = NULL;
        }
        return replaced_count;
}

/*
 * ext4_clu_mapped - report whether a logical cluster contains a mapped block
 *
 * @inode - file that owns the logical cluster
 * @lclu - logical cluster to examine
 *
 * Looks up the extent closest to the first block of @lclu and checks
 * whether that extent, or the next allocated block after it, lies in
 * @lclu.  Returns 1 if some block of the cluster is mapped (a physical
 * cluster has been allocated for it), 0 if none is, or a negative error
 * code.  Derived from ext4_ext_map_blocks().
 */
int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_ext_path *path;
        struct ext4_extent *ex;
        ext4_lblk_t ex_lblk, ex_first_clu, ex_last_clu;
        int depth, ret = 0;

        /*
         * Inline-capable files have no extents and no allocated physical
         * clusters, so nothing can be mapped.
         */
        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ||
            ext4_has_inline_data(inode))
                return 0;

        /* find the extent nearest to the first block of the cluster */
        path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
        if (IS_ERR(path)) {
                ret = PTR_ERR(path);
                path = NULL;
                goto out;
        }

        depth = ext_depth(inode);
        ex = path[depth].p_ext;

        /*
         * An empty leaf below the root is inconsistent.  It can be seen
         * transiently _during_ tree modification, which is why the check
         * lives here rather than as an assert in ext4_find_extent().
         */
        if (unlikely(ex == NULL && depth != 0)) {
                EXT4_ERROR_INODE(inode,
                    "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
                                 (unsigned long) EXT4_C2B(sbi, lclu),
                                 depth, path[depth].p_block);
                ret = -EFSCORRUPTED;
                goto out;
        }

        /* an empty extent tree maps nothing */
        if (ex == NULL)
                goto out;

        ex_lblk = le32_to_cpu(ex->ee_block);
        ex_first_clu = EXT4_B2C(sbi, ex_lblk);

        /* extent lies entirely to the right of the cluster: not mapped */
        if (lclu < ex_first_clu)
                goto out;

        /* extent spans the target cluster */
        ex_last_clu = EXT4_B2C(sbi, ex_lblk +
                               ext4_ext_get_actual_len(ex) - 1);
        if (lclu <= ex_last_clu) {
                ret = 1;
                goto out;
        }

        /*
         * Extent ends to the left of the cluster; the cluster is still
         * mapped if the next allocated block starts inside it.
         */
        ex_lblk = ext4_ext_next_allocated_block(path);
        ex_first_clu = EXT4_B2C(sbi, ex_lblk);
        if (lclu == ex_first_clu)
                ret = 1;

out:
        ext4_free_ext_path(path);

        return ret;
}

/*
 * Updates the physical block address and unwritten status of the extent
 * starting at lblk start and of length len.  If such an extent doesn't
 * exist, this function splits the extent tree appropriately to create an
 * extent like this.  This function is called in the fast commit replay
 * path.
 *
 * @inode - inode whose extent tree is updated
 * @start - first logical block of the target extent
 * @len - length of the target extent
 * @unwritten - non-zero to mark the extent unwritten, zero to mark it
 *              initialized
 * @pblk - physical block number to store in the extent
 *
 * Returns 0 on success and a negative error code on failure:
 * -EFSCORRUPTED when a lookup yields no extent, otherwise the error
 * reported by the lookup, the split or ext4_ext_dirty().  When a
 * re-lookup after a split fails, the function returns straight away
 * without marking the inode dirty.
 */
int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
                              int len, int unwritten, ext4_fsblk_t pblk)
{
        struct ext4_ext_path *path;
        struct ext4_extent *ex;
        int ret;

        path = ext4_find_extent(inode, start, NULL, 0);
        if (IS_ERR(path))
                return PTR_ERR(path);
        ex = path[path->p_depth].p_ext;
        if (!ex) {
                ret = -EFSCORRUPTED;
                goto out;
        }

        if (le32_to_cpu(ex->ee_block) != start ||
            ext4_ext_get_actual_len(ex) != len) {
                /*
                 * We need to split this extent to match our extent first.
                 * The split receives &path itself (not a copy), so any
                 * update it makes to the caller's pointer is seen here and
                 * a stale pointer is never released below.
                 */
                down_write(&EXT4_I(inode)->i_data_sem);
                ret = ext4_force_split_extent_at(NULL, inode, &path, start, 1);
                up_write(&EXT4_I(inode)->i_data_sem);
                if (ret)
                        goto out;

                /* release via the common helper (NULL-safe), then re-lookup */
                ext4_free_ext_path(path);
                path = ext4_find_extent(inode, start, NULL, 0);
                if (IS_ERR(path))
                        return PTR_ERR(path);
                ex = path[path->p_depth].p_ext;
                if (!ex) {
                        ret = -EFSCORRUPTED;
                        goto out;
                }
                WARN_ON(le32_to_cpu(ex->ee_block) != start);

                if (ext4_ext_get_actual_len(ex) != len) {
                        /* trim the tail so the extent ends at start + len */
                        down_write(&EXT4_I(inode)->i_data_sem);
                        ret = ext4_force_split_extent_at(NULL, inode, &path,
                                                         start + len, 1);
                        up_write(&EXT4_I(inode)->i_data_sem);
                        if (ret)
                                goto out;

                        ext4_free_ext_path(path);
                        path = ext4_find_extent(inode, start, NULL, 0);
                        if (IS_ERR(path))
                                return PTR_ERR(path);
                        ex = path[path->p_depth].p_ext;
                        if (!ex) {
                                ret = -EFSCORRUPTED;
                                goto out;
                        }
                }
        }

        if (unwritten)
                ext4_ext_mark_unwritten(ex);
        else
                ext4_ext_mark_initialized(ex);
        ext4_ext_store_pblock(ex, pblk);

        down_write(&EXT4_I(inode)->i_data_sem);
        ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
        up_write(&EXT4_I(inode)->i_data_sem);
out:
        ext4_free_ext_path(path);
        ext4_mark_inode_dirty(NULL, inode);
        return ret;
}

/*
 * Try to shrink the extent tree: visit each extent found below logical
 * block @end, attempt to merge it with its neighbours and write the leaf
 * back.  Stops silently on a lookup error or when no extent is found.
 */
void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
{
        struct ext4_ext_path *path = NULL;
        struct ext4_extent *ex;
        ext4_lblk_t pos, next;

        for (pos = 0; pos < end; pos = next) {
                path = ext4_find_extent(inode, pos, NULL, 0);
                if (IS_ERR(path))
                        return;

                ex = path[path->p_depth].p_ext;
                if (!ex) {
                        ext4_free_ext_path(path);
                        ext4_mark_inode_dirty(NULL, inode);
                        return;
                }

                /*
                 * Resume after this extent; always advance by at least one
                 * block so the walk makes progress.  The position is taken
                 * before the merge attempt below.
                 */
                next = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
                if (next <= pos)
                        next = pos + 1;

                ext4_ext_try_to_merge(NULL, inode, path, ex);

                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
                up_write(&EXT4_I(inode)->i_data_sem);

                ext4_mark_inode_dirty(NULL, inode);
                ext4_free_ext_path(path);
        }
}

/*
 * If the range starting at *cur is a hole (ext4_map_blocks() reports no
 * mapped blocks), advance *cur past it by the length left in map.m_len.
 * Returns 0, or the negative value returned by ext4_map_blocks().
 */
static int skip_hole(struct inode *inode, ext4_lblk_t *cur)
{
        struct ext4_map_blocks map;
        int mapped;

        /* probe from *cur up to the block count derived from i_size */
        map.m_lblk = *cur;
        map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;

        mapped = ext4_map_blocks(NULL, inode, &map, 0);
        if (mapped < 0)
                return mapped;

        if (mapped == 0)
                *cur += map.m_len;

        return 0;
}

/*
 * Count number of blocks used by this inode and update i_blocks.
 *
 * The count is the sum of the mapped data blocks reported by
 * ext4_map_blocks() and an estimate of the extent tree blocks obtained by
 * comparing the lookup paths of successive extents.
 *
 * Returns the error from the initial ext4_find_extent() lookup if it
 * fails; every other path returns 0.  Errors hit later in the walk only
 * cut the counting short.
 */
int ext4_ext_replay_set_iblocks(struct inode *inode)
{
        struct ext4_ext_path *path = NULL, *path2 = NULL;
        struct ext4_extent *ex;
        ext4_lblk_t cur = 0, end;
        int numblks = 0, i, ret = 0;
        ext4_fsblk_t cmp1, cmp2;
        struct ext4_map_blocks map;

        /* Determine the size of the file first */
        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
                                        EXT4_EX_NOCACHE);
        if (IS_ERR(path))
                return PTR_ERR(path);
        ex = path[path->p_depth].p_ext;
        if (!ex) {
                /* no extent found: i_blocks is written as 0 at "out" */
                ext4_free_ext_path(path);
                goto out;
        }
        /* end = first logical block past the extent returned by the lookup */
        end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
        ext4_free_ext_path(path);

        /* Count the number of data blocks */
        cur = 0;
        while (cur < end) {
                map.m_lblk = cur;
                map.m_len = end - cur;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        break;
                /* ret > 0 is added as a block count; ret == 0 adds nothing */
                if (ret > 0)
                        numblks += ret;
                cur = cur + map.m_len;
        }

        /*
         * Count the number of extent tree blocks. We do it by looking up
         * two successive extents and determining the difference between
         * their paths. When path is different for 2 successive extents
         * we compare the blocks in the path at each level and increment
         * iblocks by total number of differences found.
         */
        cur = 0;
        ret = skip_hole(inode, &cur);
        if (ret < 0)
                goto out;
        path = ext4_find_extent(inode, cur, NULL, 0);
        if (IS_ERR(path))
                goto out;
        /* the first path contributes p_depth blocks */
        numblks += path->p_depth;
        ext4_free_ext_path(path);
        while (cur < end) {
                path = ext4_find_extent(inode, cur, NULL, 0);
                if (IS_ERR(path))
                        break;
                ex = path[path->p_depth].p_ext;
                if (!ex) {
                        /*
                         * NOTE(review): this returns without updating
                         * i_blocks or marking the inode dirty, unlike the
                         * other exits of this loop - confirm intended.
                         */
                        ext4_free_ext_path(path);
                        return 0;
                }
                /* move past this extent, advancing by at least one block */
                cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
                                        ext4_ext_get_actual_len(ex));
                ret = skip_hole(inode, &cur);
                if (ret < 0) {
                        ext4_free_ext_path(path);
                        break;
                }
                path2 = ext4_find_extent(inode, cur, NULL, 0);
                if (IS_ERR(path2)) {
                        ext4_free_ext_path(path);
                        break;
                }
                /*
                 * Compare the two paths level by level; a level with no
                 * buffer head (or beyond a path's depth) compares as 0.
                 * Count a block when path2 has one that differs from path.
                 */
                for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
                        cmp1 = cmp2 = 0;
                        if (i <= path->p_depth)
                                cmp1 = path[i].p_bh ?
                                        path[i].p_bh->b_blocknr : 0;
                        if (i <= path2->p_depth)
                                cmp2 = path2[i].p_bh ?
                                        path2[i].p_bh->b_blocknr : 0;
                        if (cmp1 != cmp2 && cmp2 != 0)
                                numblks++;
                }
                ext4_free_ext_path(path);
                ext4_free_ext_path(path2);
        }

out:
        /* scale the block count by blocksize / 512 (2^9) */
        inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
        ext4_mark_inode_dirty(NULL, inode);
        return 0;
}

/*
 * For every mapped range of @inode, call ext4_mb_mark_bb() with a false
 * state and ext4_fc_record_regions() on both the blocks recorded in the
 * extent path leading to the range and the data blocks themselves.
 * Inodes flagged EXT4_INODE_INLINE_DATA are skipped.
 *
 * Returns the error from the initial extent lookup if it fails, 0
 * otherwise; a mapping error only ends the walk early.
 */
int ext4_ext_clear_bb(struct inode *inode)
{
        struct ext4_ext_path *path = NULL;
        struct ext4_extent *ex;
        struct ext4_map_blocks map;
        ext4_lblk_t lblk, end;
        int level, nr;

        if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                return 0;

        /* Determine the size of the file first */
        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
                                EXT4_EX_NOCACHE);
        if (IS_ERR(path))
                return PTR_ERR(path);
        ex = path[path->p_depth].p_ext;
        if (!ex) {
                ext4_free_ext_path(path);
                return 0;
        }
        end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
        ext4_free_ext_path(path);

        for (lblk = 0; lblk < end; lblk += map.m_len) {
                map.m_lblk = lblk;
                map.m_len = end - lblk;
                nr = ext4_map_blocks(NULL, inode, &map, 0);
                if (nr < 0)
                        break;
                /* nothing mapped here: step over the range */
                if (nr == 0)
                        continue;

                /* blocks recorded in the path levels above the leaf */
                path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
                if (!IS_ERR_OR_NULL(path)) {
                        for (level = 0; level < path->p_depth; level++) {
                                ext4_mb_mark_bb(inode->i_sb,
                                                path[level].p_block, 1, false);
                                ext4_fc_record_regions(inode->i_sb,
                                                inode->i_ino, 0,
                                                path[level].p_block, 1, 1);
                        }
                        ext4_free_ext_path(path);
                }

                /* the data blocks of the mapped range */
                ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
                ext4_fc_record_regions(inode->i_sb, inode->i_ino,
                                       map.m_lblk, map.m_pblk, map.m_len, 1);
        }

        return 0;
}








































































































   18 





   18 





    2 

































































































































































    8 


    9 
























    2 




    2 





















    2 




    2 



    3 






    3 



























    2 



    3 








    3 







    3 









    2 










    3 







    3 
    2 




    3 
    3 











































   11 

   13 




    3 



















    3 

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    9 





    9 




























































































   11 
   13 
    3 
   11 





   12 




















   13 
   11 
    2 


   13 

















   11 





   13 


    1 

   13 



   10 









    1 

    1 













   12 





   12 





































    7 



    7 
    7 





    7 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
// SPDX-License-Identifier: GPL-2.0
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                       Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h"
#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"

/*
 * Forward declaration: __blkg_release() below calls this for every
 * possible cpu to flush the per-cpu lockless stat lists of a blkcg.
 */
static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

/* The root blkcg, exported for use by modules. */
struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

/* Constant pointer to the css embedded in blkcg_root. */
struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

/*
 * Policy table with BLKCG_MAX_POLS slots; blkg_free_workfn() indexes it
 * in parallel with blkg->pd[].
 */
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);                /* protected by blkcg_pol_mutex */

/* NOTE(review): presumably toggles extra debug statistics - not used in this part of the file */
bool blkcg_debug_stats = false;

/* Serializes blkg stat updates (see the comment in __blkg_release()). */
static DEFINE_RAW_SPINLOCK(blkg_stat_lock);

/* NOTE(review): batch size for blkg destruction, judging by the name - confirm at its users */
#define BLKG_DESTROY_BATCH_SIZE  64

/*
 * Lockless lists for tracking IO stats update
 *
 * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
 * There are multiple blkg's (one for each block device) attached to each
 * blkcg. The rstat code keeps track of which cpu has IO stats updated,
 * but it doesn't know which blkg has the updated stats. If there are many
 * block devices in a system, the cost of iterating all the blkg's to flush
 * out the IO stats can be high. To reduce such overhead, a set of percpu
 * lockless lists (lhead) per blkcg are used to track the set of recently
 * updated iostat_cpu's since the last flush. An iostat_cpu will be put
 * onto the lockless list on the update side [blk_cgroup_bio_start()] if
 * not there yet and then removed when being flushed [blkcg_rstat_flush()].
 * References to blkg are gotten and then put back in the process to
 * protect against blkg removal.
 *
 * Return: 0 if successful or -ENOMEM if allocation fails.
 */
static int init_blkcg_llists(struct blkcg *blkcg)
{
        int cpu;

        blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
        if (!blkcg->lhead)
                return -ENOMEM;

        for_each_possible_cpu(cpu)
                init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
        return 0;
}

/**
 * blkcg_css - find the current css
 *
 * Prefer the css returned by kthread_blkcg(); when that is NULL, fall back
 * to the io css of the current task.  The result may be a dying css, so
 * callers must use tryget logic to confirm it is still alive.
 */
static struct cgroup_subsys_state *blkcg_css(void)
{
        struct cgroup_subsys_state *kthread_css = kthread_blkcg();

        return kthread_css ? kthread_css : task_css(current, io_cgrp_id);
}

static bool blkcg_policy_enabled(struct request_queue *q,
                                 const struct blkcg_policy *pol)
{
        return pol && test_bit(pol->plid, q->blkcg_pols);
}

/*
 * Work function scheduled by blkg_free(): performs the final teardown of
 * the blkg that embeds @work and frees it.
 */
static void blkg_free_workfn(struct work_struct *work)
{
        struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
                                             free_work);
        struct request_queue *q = blkg->q;
        int i;

        /*
         * pd_free_fn() can also be called from blkcg_deactivate_policy(),
         * in order to make sure pd_free_fn() is called in order, the deletion
         * of the list blkg->q_node is delayed to here from blkg_destroy(), and
         * blkcg_mutex is used to synchronize blkg_free_workfn() and
         * blkcg_deactivate_policy().
         */
        mutex_lock(&q->blkcg_mutex);
        /* free the policy data of every policy slot that has some */
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (blkg->pd[i])
                        blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
        /* drop the reference held on the parent blkg, if there is one */
        if (blkg->parent)
                blkg_put(blkg->parent);
        /* unlink from the queue's list under queue_lock */
        spin_lock_irq(&q->queue_lock);
        list_del_init(&blkg->q_node);
        spin_unlock_irq(&q->queue_lock);
        mutex_unlock(&q->blkcg_mutex);

        /* release the queue reference, then the blkg's own resources */
        blk_put_queue(q);
        free_percpu(blkg->iostat_cpu);
        percpu_ref_exit(&blkg->refcnt);
        kfree(blkg);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free, may be NULL or only partially allocated
 *
 * Does nothing for a NULL @blkg.  Otherwise the actual teardown is
 * deferred to blkg_free_workfn() through a work item.
 */
static void blkg_free(struct blkcg_gq *blkg)
{
        if (blkg) {
                /*
                 * ->pd_free_fn() and the request queue's release handler
                 * may both sleep, so do the freeing from a work function.
                 */
                INIT_WORK(&blkg->free_work, blkg_free_workfn);
                schedule_work(&blkg->free_work);
        }
}

static void __blkg_release(struct rcu_head *rcu)
{
        struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
        struct blkcg *blkcg = blkg->blkcg;
        int cpu;

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        WARN_ON(!bio_list_empty(&blkg->async_bios));
#endif
        /*
         * Flush all the non-empty percpu lockless lists before releasing
         * us, given these stat belongs to us.
         *
         * blkg_stat_lock is for serializing blkg stat update
         */
        for_each_possible_cpu(cpu)
                __blkcg_rstat_flush(blkcg, cpu);

        /* release the blkcg and parent blkg refs this blkg has been holding */
        css_put(&blkg->blkcg->css);
        blkg_free(blkg);
}

/*
 * A blkg is RCU protected, but holding the rcu lock does not make every
 * field of the blkg valid to access.  For example, don't try to follow
 * throtl_data and request queue links.
 *
 * A reference to a blkg under rcu only allows access to values local to
 * the group, such as group stats and group rate limits.
 *
 * This is the percpu_ref release callback: it defers the real release to
 * __blkg_release() via call_rcu().
 */
static void blkg_release(struct percpu_ref *ref)
{
        struct blkcg_gq *dying = container_of(ref, struct blkcg_gq, refcnt);

        call_rcu(&dying->rcu_head, __blkg_release);
}

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
static struct workqueue_struct *blkcg_punt_bio_wq;

static void blkg_async_bio_workfn(struct work_struct *work)
{
        struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
                                             async_bio_work);
        struct bio_list bios = BIO_EMPTY_LIST;
        struct bio *bio;
        struct blk_plug plug;
        bool need_plug = false;

        /* as long as there are pending bios, @blkg can't go away */
        spin_lock(&blkg->async_bio_lock);
        bio_list_merge_init(&bios, &blkg->async_bios);
        spin_unlock(&blkg->async_bio_lock);

        /* start plug only when bio_list contains at least 2 bios */
        if (bios.head && bios.head->bi_next) {
                need_plug = true;
                blk_start_plug(&plug);
        }
        while ((bio = bio_list_pop(&bios)))
                submit_bio(bio);
        if (need_plug)
                blk_finish_plug(&plug);
}

/*
 * A shared kthread that issues a bio for a cgroup synchronously can get
 * trapped waiting for that cgroup, which leads to priority inversions.  Use
 * this helper instead of submit_bio() to punt the actual issuing to a
 * dedicated per-blkg work item and avoid such inversions.
 */
void blkcg_punt_bio_submit(struct bio *bio)
{
        struct blkcg_gq *blkg = bio->bi_blkg;

        /* never bounce for the root cgroup */
        if (!blkg->parent) {
                submit_bio(bio);
                return;
        }

        spin_lock(&blkg->async_bio_lock);
        bio_list_add(&blkg->async_bios, bio);
        spin_unlock(&blkg->async_bio_lock);

        queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
}
EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit);

/* Allocate the workqueue used for punted bios; -ENOMEM if that fails. */
static int __init blkcg_punt_bio_init(void)
{
        blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
                                            WQ_MEM_RECLAIM | WQ_FREEZABLE |
                                            WQ_UNBOUND | WQ_SYSFS, 0);

        return blkcg_punt_bio_wq ? 0 : -ENOMEM;
}
subsys_initcall(blkcg_punt_bio_init);
#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */

/**
 * bio_blkcg_css - return the blkcg CSS associated with a bio
 * @bio: target bio, may be %NULL
 *
 * Returns the CSS of the blkcg @bio is associated with, or %NULL when @bio
 * is %NULL or has no blkg attached.  Callers must either cope with %NULL or
 * know that the association has already been made.
 */
struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio)
{
        if (bio && bio->bi_blkg)
                return &bio->bi_blkg->blkcg->css;

        return NULL;
}
EXPORT_SYMBOL_GPL(bio_blkcg_css);

/**
 * blkcg_parent - get the parent of a blkcg
 * @blkcg: blkcg of interest
 *
 * Return the blkcg that corresponds to the parent css of @blkcg.  Can be
 * called anytime.
 */
static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
{
        struct cgroup_subsys_state *parent_css = blkcg->css.parent;

        return css_to_blkcg(parent_css);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @disk: gendisk the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate and initialize a new blkg tying @blkcg to @disk, including the
 * policy data of every policy enabled on @disk's queue.  Returns the new
 * blkg, or %NULL if an allocation or taking the queue reference fails.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
                                   gfp_t gfp_mask)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *blkg;
        int plid, cpu;

        /* alloc and init base part */
        blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
        if (!blkg)
                return NULL;
        if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
                goto out_free_blkg;
        blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
        if (!blkg->iostat_cpu)
                goto out_exit_refcnt;
        if (!blk_get_queue(q))
                goto out_free_iostat;

        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
        blkg->iostat.blkg = blkg;
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        spin_lock_init(&blkg->async_bio_lock);
        bio_list_init(&blkg->async_bios);
        INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
#endif

        /* every iostat set, global and per-cpu, points back at its blkg */
        u64_stats_init(&blkg->iostat.sync);
        for_each_possible_cpu(cpu) {
                struct blkg_iostat_set *bis;

                bis = per_cpu_ptr(blkg->iostat_cpu, cpu);
                u64_stats_init(&bis->sync);
                bis->blkg = blkg;
        }

        for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                struct blkcg_policy *pol = blkcg_policy[plid];
                struct blkg_policy_data *pd;

                if (blkcg_policy_enabled(q, pol)) {
                        /* alloc per-policy data and attach it to blkg */
                        pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask);
                        if (!pd)
                                goto out_free_pds;
                        blkg->pd[plid] = pd;
                        pd->blkg = blkg;
                        pd->plid = plid;
                        pd->online = false;
                }
        }

        return blkg;

out_free_pds:
        /* free the policy data attached before the failing slot */
        for (plid--; plid >= 0; plid--) {
                if (blkg->pd[plid])
                        blkcg_policy[plid]->pd_free_fn(blkg->pd[plid]);
        }
        blk_put_queue(q);
out_free_iostat:
        free_percpu(blkg->iostat_cpu);
out_exit_refcnt:
        percpu_ref_exit(&blkg->refcnt);
out_free_blkg:
        kfree(blkg);
        return NULL;
}

/*
 * Create the blkg for the @blkcg - @disk pair and link it into both the
 * blkcg and the queue.  Must be called with @disk->queue->queue_lock held.
 *
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 *
 * Returns the new blkg on success or an ERR_PTR() value on failure:
 * -ENODEV if the queue is dying, @blkcg is not online or the parent blkg
 * is missing, -ENOMEM if the allocation fails, or the error returned by
 * radix_tree_insert().
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
                                    struct blkcg_gq *new_blkg)
{
        struct blkcg_gq *blkg;
        int i, ret;

        lockdep_assert_held(&disk->queue->queue_lock);

        /* request_queue is dying, do not create/recreate a blkg */
        if (blk_queue_dying(disk->queue)) {
                ret = -ENODEV;
                goto err_free_blkg;
        }

        /* blkg holds a reference to blkcg */
        if (!css_tryget_online(&blkcg->css)) {
                ret = -ENODEV;
                goto err_free_blkg;
        }

        /* allocate, unless the caller handed in a preallocated blkg */
        if (!new_blkg) {
                new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto err_put_css;
                }
        }
        blkg = new_blkg;

        /* link parent; a non-root blkg holds a reference on its parent */
        if (blkcg_parent(blkcg)) {
                blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
                if (WARN_ON_ONCE(!blkg->parent)) {
                        ret = -ENODEV;
                        goto err_put_css;
                }
                blkg_get(blkg->parent);
        }

        /* invoke per-policy init */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (blkg->pd[i] && pol->pd_init_fn)
                        pol->pd_init_fn(blkg->pd[i]);
        }

        /* insert into the blkcg's tree and lists under blkcg->lock */
        spin_lock(&blkcg->lock);
        ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg);
        if (likely(!ret)) {
                hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
                list_add(&blkg->q_node, &disk->queue->blkg_list);

                /* bring every attached policy data online */
                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];

                        if (blkg->pd[i]) {
                                if (pol->pd_online_fn)
                                        pol->pd_online_fn(blkg->pd[i]);
                                blkg->pd[i]->online = true;
                        }
                }
        }
        /* NOTE: set whether or not the insert above succeeded */
        blkg->online = true;
        spin_unlock(&blkcg->lock);

        if (!ret)
                return blkg;

        /*
         * The insert failed after @blkg was otherwise fully initialized,
         * so use the usual release path.
         */
        blkg_put(blkg);
        return ERR_PTR(ret);

err_put_css:
        css_put(&blkcg->css);
err_free_blkg:
        /* @new_blkg is consumed on failure as well */
        if (new_blkg)
                blkg_free(new_blkg);
        return ERR_PTR(ret);
}

/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @disk: gendisk of interest
 *
 * Lookup blkg for the @blkcg - @disk pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkg's have access to the parent blkg.  This function
 * should be called under RCU read lock and takes @disk->queue->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
 * down from root.
 */
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *blkg;
        unsigned long flags;

        WARN_ON_ONCE(!rcu_read_lock_held());

        /* fast path: the blkg already exists, no queue_lock needed */
        blkg = blkg_lookup(blkcg, q);
        if (blkg)
                return blkg;

        /* repeat the lookup now that queue_lock is held */
        spin_lock_irqsave(&q->queue_lock, flags);
        blkg = blkg_lookup(blkcg, q);
        if (blkg) {
                /* refresh the lookup hint of a non-root blkcg if it is stale */
                if (blkcg != &blkcg_root &&
                    blkg != rcu_dereference(blkcg->blkg_hint))
                        rcu_assign_pointer(blkcg->blkg_hint, blkg);
                goto found;
        }

        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.  Returns the closest
         * blkg to the intended blkg should blkg_create() fail.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent = blkcg_parent(blkcg);
                struct blkcg_gq *ret_blkg = q->root_blkg;

                /*
                 * Walk up from @blkcg until an ancestor with a blkg is
                 * found; @pos ends up as the topmost blkcg still missing
                 * its blkg.
                 */
                while (parent) {
                        blkg = blkg_lookup(parent, q);
                        if (blkg) {
                                /* remember closest blkg */
                                ret_blkg = blkg;
                                break;
                        }
                        pos = parent;
                        parent = blkcg_parent(parent);
                }

                blkg = blkg_create(pos, disk, NULL);
                if (IS_ERR(blkg)) {
                        /* fall back to the closest existing blkg */
                        blkg = ret_blkg;
                        break;
                }
                /* done once the blkg for @blkcg itself has been created */
                if (pos == blkcg)
                        break;
        }

found:
        spin_unlock_irqrestore(&q->queue_lock, flags);
        return blkg;
}

static void blkg_destroy(struct blkcg_gq *blkg)
{
        struct blkcg *blkcg = blkg->blkcg;
        int i;

        lockdep_assert_held(&blkg->q->queue_lock);
        lockdep_assert_held(&blkcg->lock);

        /*
         * blkg stays on the queue list until blkg_free_workfn(), see details in
         * blkg_free_workfn(), hence this function can be called from
         * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before
         * blkg_free_workfn().
         */
        if (hlist_unhashed(&blkg->blkcg_node))
                return;

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (blkg->pd[i] && blkg->pd[i]->online) {
                        blkg->pd[i]->online = false;
                        if (pol->pd_offline_fn)
                                pol->pd_offline_fn(blkg->pd[i]);
                }
        }

        blkg->online = false;

        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
        hlist_del_init_rcu(&blkg->blkcg_node);

        /*
         * Both setting lookup hint to and clearing it from @blkg are done
         * under queue_lock.  If it's not pointing to @blkg now, it never
         * will.  Hint assignment itself can race safely.
         */
        if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
                rcu_assign_pointer(blkcg->blkg_hint, NULL);

        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
         */
        percpu_ref_kill(&blkg->refcnt);
}

/*
 * Destroy every blkg on @disk's queue, working in batches of
 * BLKG_DESTROY_BATCH_SIZE, then clear the queue's policy bits and its
 * root_blkg pointer.
 */
static void blkg_destroy_all(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *blkg;
        int count = BLKG_DESTROY_BATCH_SIZE;
        int i;

restart:
        spin_lock_irq(&q->queue_lock);
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;

                /* already destroyed, e.g. by an earlier batch */
                if (hlist_unhashed(&blkg->blkcg_node))
                        continue;

                spin_lock(&blkcg->lock);
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);

                /*
                 * in order to avoid holding the spin lock for too long, release
                 * it when a batch of blkgs are destroyed.  The walk then
                 * restarts from the list head with the lock retaken.
                 */
                if (!(--count)) {
                        count = BLKG_DESTROY_BATCH_SIZE;
                        spin_unlock_irq(&q->queue_lock);
                        cond_resched();
                        goto restart;
                }
        }

        /*
         * Mark policy deactivated since policy offline has been done, and
         * the free is scheduled, so future blkcg_deactivate_policy() can
         * be bypassed
         */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (pol)
                        __clear_bit(pol->plid, q->blkcg_pols);
        }

        q->root_blkg = NULL;
        spin_unlock_irq(&q->queue_lock);
}

/* Copy every bytes/ios counter of @src into @dst. */
static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
        int idx = 0;

        while (idx < BLKG_IOSTAT_NR) {
                dst->bytes[idx] = src->bytes[idx];
                dst->ios[idx] = src->ios[idx];
                idx++;
        }
}

static void __blkg_clear_stat(struct blkg_iostat_set *bis)
{
        struct blkg_iostat cur = {0};
        unsigned long flags;

        flags = u64_stats_update_begin_irqsave(&bis->sync);
        blkg_iostat_set(&bis->cur, &cur);
        blkg_iostat_set(&bis->last, &cur);
        u64_stats_update_end_irqrestore(&bis->sync, flags);
}

static void blkg_clear_stat(struct blkcg_gq *blkg)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu);

                __blkg_clear_stat(s);
        }
        __blkg_clear_stat(&blkg->iostat);
}

/*
 * Reset the iostat and per-policy stats of every blkg that belongs to the
 * blkcg of @css.  @cftype and @val are not used.  Always returns 0.
 */
static int blkcg_reset_stats(struct cgroup_subsys_state *css,
                             struct cftype *cftype, u64 val)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg_gq *blkg;
        int plid;

        mutex_lock(&blkcg_pol_mutex);
        spin_lock_irq(&blkcg->lock);

        /*
         * The reset is racy: it is not synchronized against stat updates.
         * This is a debug feature which shouldn't exist anyway.  If you
         * get hit by a race, retry.
         */
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
                blkg_clear_stat(blkg);

                for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                        struct blkcg_policy *pol = blkcg_policy[plid];
                        struct blkg_policy_data *pd = blkg->pd[plid];

                        if (!pd || !pol->pd_reset_stats_fn)
                                continue;

                        pol->pd_reset_stats_fn(pd);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        mutex_unlock(&blkcg_pol_mutex);
        return 0;
}

const char *blkg_dev_name(struct blkcg_gq *blkg)
{
        if (!blkg->q->disk)
                return NULL;
        return bdi_dev_name(blkg->q->disk->bdi);
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * Walk the blkgs of @blkcg and call @prfill for each one whose queue has
 * @pol enabled.  @prfill receives @sf, the blkg's policy data for @pol and
 * @data, and runs with the matching queue lock held.  When @show_total is
 * %true, the sum of the @prfill return values is printed with a "Total"
 * label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total)
{
        struct blkcg_gq *blkg;
        u64 sum = 0;

        rcu_read_lock();
        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                struct request_queue *q = blkg->q;

                spin_lock_irq(&q->queue_lock);
                if (blkcg_policy_enabled(q, pol))
                        sum += prfill(sf, blkg->pd[pol->plid], data);
                spin_unlock_irq(&q->queue_lock);
        }
        rcu_read_unlock();

        if (!show_total)
                return;

        seq_printf(sf, "Total %llu\n", (unsigned long long)sum);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print @v to @sf for the device associated with @pd.  Returns @v, or 0
 * without printing anything when the device has no name.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
        const char *dname = blkg_dev_name(pd->blkg);

        if (dname) {
                seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
                return v;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * blkg_conf_init - initialize a blkg_conf_ctx
 * @ctx: blkg_conf_ctx to initialize
 * @input: input string
 *
 * Initialize @ctx which can be used to parse blkg config input string @input.
 * Once initialized, @ctx can be used with blkg_conf_open_bdev() and
 * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit().
 */
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input)
{
        *ctx = (struct blkg_conf_ctx){ .input = input };
}
EXPORT_SYMBOL_GPL(blkg_conf_init);

/**
 * blkg_conf_open_bdev - parse and open bdev for per-blkg config update
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from
 * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is
 * set to point past the device node prefix.
 *
 * This function may be called multiple times on @ctx and the extra calls become
 * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function
 * explicitly if bdev access is needed without resolving the blkcg / policy part
 * of @ctx->input.
 *
 * On success, returns 0 with the bdev reference and the queue's rq_qos_mutex
 * held; both are released by blkg_conf_exit(). Returns -EINVAL if the input
 * is malformed and -ENODEV if the device does not exist, is a partition or
 * its disk is no longer live.
 */
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
{
        char *input = ctx->input;
        unsigned int major, minor;
        struct block_device *bdev;
        int key_len;

        /* already opened by an earlier call */
        if (ctx->bdev)
                return 0;

        if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
                return -EINVAL;

        /* MAJ:MIN must be followed by whitespace */
        input += key_len;
        if (!isspace(*input))
                return -EINVAL;
        input = skip_spaces(input);

        bdev = blkdev_get_no_open(MKDEV(major, minor));
        if (!bdev)
                return -ENODEV;
        if (bdev_is_partition(bdev)) {
                blkdev_put_no_open(bdev);
                return -ENODEV;
        }

        mutex_lock(&bdev->bd_queue->rq_qos_mutex);
        if (!disk_live(bdev->bd_disk)) {
                /*
                 * Unlock before dropping our reference, as blkg_conf_exit()
                 * does: @bdev must not be dereferenced once the reference
                 * is gone.
                 */
                mutex_unlock(&bdev->bd_queue->rq_qos_mutex);
                blkdev_put_no_open(bdev);
                return -ENODEV;
        }

        ctx->body = input;
        ctx->bdev = bdev;
        return 0;
}

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Parse per-blkg config update from @ctx->input and initialize @ctx
 * accordingly. On success, @ctx->body points to the part of @ctx->input
 * following MAJ:MIN, @ctx->bdev points to the target block device and
 * @ctx->blkg to the blkg being configured.
 *
 * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this
 * function returns with queue lock held and must be followed by
 * blkg_conf_exit().
 *
 * Returns 0 on success and -errno on failure, e.g. -EOPNOTSUPP if @pol is
 * not enabled on the queue or -ENOMEM if an allocation fails. An -EBUSY
 * failure is turned into a syscall restart after a short sleep.
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   struct blkg_conf_ctx *ctx)
        __acquires(&bdev->bd_queue->queue_lock)
{
        struct gendisk *disk;
        struct request_queue *q;
        struct blkcg_gq *blkg;
        int ret;

        ret = blkg_conf_open_bdev(ctx);
        if (ret)
                return ret;

        disk = ctx->bdev->bd_disk;
        q = disk->queue;

        /*
         * blkcg_deactivate_policy() requires queue to be frozen, we can grab
         * q_usage_counter to prevent concurrent with blkcg_deactivate_policy().
         */
        ret = blk_queue_enter(q, 0);
        if (ret)
                goto fail;

        spin_lock_irq(&q->queue_lock);

        if (!blkcg_policy_enabled(q, pol)) {
                ret = -EOPNOTSUPP;
                goto fail_unlock;
        }

        /* fast path: the blkg already exists */
        blkg = blkg_lookup(blkcg, q);
        if (blkg)
                goto success;

        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent;
                struct blkcg_gq *new_blkg;

                /* find the topmost ancestor (or @blkcg) still lacking a blkg */
                parent = blkcg_parent(blkcg);
                while (parent && !blkg_lookup(parent, q)) {
                        pos = parent;
                        parent = blkcg_parent(parent);
                }

                /* Drop locks to do new blkg allocation with GFP_KERNEL. */
                spin_unlock_irq(&q->queue_lock);

                new_blkg = blkg_alloc(pos, disk, GFP_KERNEL);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto fail_exit_queue;
                }

                if (radix_tree_preload(GFP_KERNEL)) {
                        blkg_free(new_blkg);
                        ret = -ENOMEM;
                        goto fail_exit_queue;
                }

                spin_lock_irq(&q->queue_lock);

                /* recheck: the lock was dropped for the allocation */
                if (!blkcg_policy_enabled(q, pol)) {
                        blkg_free(new_blkg);
                        ret = -EOPNOTSUPP;
                        goto fail_preloaded;
                }

                /* the blkg may have been created while the lock was dropped */
                blkg = blkg_lookup(pos, q);
                if (blkg) {
                        blkg_free(new_blkg);
                } else {
                        /* blkg_create() consumes @new_blkg, even on failure */
                        blkg = blkg_create(pos, disk, new_blkg);
                        if (IS_ERR(blkg)) {
                                ret = PTR_ERR(blkg);
                                goto fail_preloaded;
                        }
                }

                radix_tree_preload_end();

                if (pos == blkcg)
                        goto success;
        }
success:
        /*
         * queue_lock stays held on success; blkg_conf_exit() releases it
         * when it finds @ctx->blkg set.
         */
        blk_queue_exit(q);
        ctx->blkg = blkg;
        return 0;

fail_preloaded:
        radix_tree_preload_end();
fail_unlock:
        spin_unlock_irq(&q->queue_lock);
fail_exit_queue:
        blk_queue_exit(q);
fail:
        /*
         * If queue was bypassing, we should retry.  Do so after a
         * short msleep().  It isn't strictly necessary but queue
         * can be bypassing for some time and it's always nice to
         * avoid busy looping.
         */
        if (ret == -EBUSY) {
                msleep(10);
                ret = restart_syscall();
        }
        return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_exit - clean up per-blkg config update
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Undo whatever blkg_conf_open_bdev() and blkg_conf_prep() left held on
 * @ctx: the queue lock if a blkg was resolved, then the rq_qos_mutex and
 * the bdev reference if a bdev was opened.  This function must be called on
 * all blkg_conf_ctx's initialized with blkg_conf_init().
 */
void blkg_conf_exit(struct blkg_conf_ctx *ctx)
        __releases(&ctx->bdev->bd_queue->queue_lock)
        __releases(&ctx->bdev->bd_queue->rq_qos_mutex)
{
        struct block_device *bdev = ctx->bdev;

        if (ctx->blkg) {
                spin_unlock_irq(&bdev_get_queue(bdev)->queue_lock);
                ctx->blkg = NULL;
        }

        if (!bdev)
                return;

        mutex_unlock(&bdev->bd_queue->rq_qos_mutex);
        blkdev_put_no_open(bdev);
        ctx->body = NULL;
        ctx->bdev = NULL;
}
EXPORT_SYMBOL_GPL(blkg_conf_exit);

/* Add every bytes/ios counter of @src to the matching counter of @dst. */
static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
        int idx = 0;

        while (idx < BLKG_IOSTAT_NR) {
                dst->bytes[idx] += src->bytes[idx];
                dst->ios[idx] += src->ios[idx];
                idx++;
        }
}

/* Subtract every bytes/ios counter of @src from the matching one in @dst. */
static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
{
        int idx = 0;

        while (idx < BLKG_IOSTAT_NR) {
                dst->bytes[idx] -= src->bytes[idx];
                dst->ios[idx] -= src->ios[idx];
                idx++;
        }
}

/*
 * Fold the difference between @cur and @last into @blkg->iostat.cur and
 * advance @last by the same amount, so that @last ends up equal to @cur.
 * Everything runs inside the writer section of @blkg->iostat.sync.
 */
static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
                                struct blkg_iostat *last)
{
        struct blkg_iostat delta;
        unsigned long flags;

        /* propagate percpu delta to global */
        flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
        /* delta = cur - last */
        blkg_iostat_set(&delta, cur);
        blkg_iostat_sub(&delta, last);
        /* account the delta, then remember it as already propagated */
        blkg_iostat_add(&blkg->iostat.cur, &delta);
        blkg_iostat_add(last, &delta);
        u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}

/*
 * Drain @blkcg's lockless list for @cpu.  Each queued per-cpu iostat set is
 * folded into its blkg's aggregate counters, and the blkg's aggregate delta
 * is then propagated to the parent blkg unless that parent is the root.
 */
static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
{
        struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
        struct llist_node *lnode;
        struct blkg_iostat_set *bisc, *next_bisc;
        unsigned long flags;

        rcu_read_lock();

        /* detach the whole list at once; nothing to do if it is empty */
        lnode = llist_del_all(lhead);
        if (!lnode)
                goto out;

        /*
         * For covering concurrent parent blkg update from blkg_release().
         *
         * When flushing from cgroup, cgroup_rstat_lock is always held, so
         * this lock won't cause contention most of time.
         */
        raw_spin_lock_irqsave(&blkg_stat_lock, flags);

        /*
         * Iterate only the iostat_cpu's queued in the lockless list.
         */
        llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
                struct blkcg_gq *blkg = bisc->blkg;
                struct blkcg_gq *parent = blkg->parent;
                struct blkg_iostat cur;
                unsigned int seq;

                /*
                 * Order assignment of `next_bisc` from `bisc->lnode.next` in
                 * llist_for_each_entry_safe and clearing `bisc->lqueued` for
                 * avoiding to assign `next_bisc` with new next pointer added
                 * in blk_cgroup_bio_start() in case of re-ordering.
                 *
                 * The pair barrier is implied in llist_add() in blk_cgroup_bio_start().
                 */
                smp_mb();

                WRITE_ONCE(bisc->lqueued, false);
                /*
                 * The blkg's own aggregate set was queued (see the parent
                 * queueing below); there is no per-cpu delta to fold in.
                 */
                if (bisc == &blkg->iostat)
                        goto propagate_up; /* propagate up to parent only */

                /* fetch the current per-cpu values */
                do {
                        seq = u64_stats_fetch_begin(&bisc->sync);
                        blkg_iostat_set(&cur, &bisc->cur);
                } while (u64_stats_fetch_retry(&bisc->sync, seq));

                /* fold the per-cpu delta into the blkg's aggregate counters */
                blkcg_iostat_update(blkg, &cur, &bisc->last);

propagate_up:
                /* propagate global delta to parent (unless that's root) */
                if (parent && parent->parent) {
                        blkcg_iostat_update(parent, &blkg->iostat.cur,
                                            &blkg->iostat.last);
                        /*
                         * Queue parent->iostat to its blkcg's lockless
                         * list to propagate up to the grandparent if the
                         * iostat hasn't been queued yet.
                         */
                        if (!parent->iostat.lqueued) {
                                struct llist_head *plhead;

                                plhead = per_cpu_ptr(parent->blkcg->lhead, cpu);
                                llist_add(&parent->iostat.lnode, plhead);
                                parent->iostat.lqueued = true;
                        }
                }
        }
        raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
out:
        rcu_read_unlock();
}

/* Flush the stats of @css's blkcg for @cpu; a no-op for the root cgroup. */
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
        /* Root-level stats are sourced from system-wide IO stats */
        if (!cgroup_parent(css->cgroup))
                return;

        __blkcg_rstat_flush(css_to_blkcg(css), cpu);
}

/*
 * We source root cgroup stats from the system-wide stats to avoid
 * tracking the same information twice and incurring overhead when no
 * cgroups are defined. For that reason, cgroup_rstat_flush in
 * blkcg_print_stat does not actually fill out the iostat in the root
 * cgroup's blkcg_gq.
 *
 * However, we would like to re-use the printing code between the root and
 * non-root cgroups to the extent possible. For that reason, we simulate
 * flushing the root cgroup's stats by explicitly filling in the iostat
 * with disk level statistics.
 */
static void blkcg_fill_root_iostats(void)
{
        struct class_dev_iter iter;
        struct device *dev;

        class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
        while ((dev = class_dev_iter_next(&iter))) {
                struct block_device *bdev = dev_to_bdev(dev);
                struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg;
                struct blkg_iostat tmp;
                int cpu;
                unsigned long flags;

                /* sum the per-cpu disk stats of this device */
                memset(&tmp, 0, sizeof(tmp));
                for_each_possible_cpu(cpu) {
                        struct disk_stats *cpu_dkstats;

                        cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
                        tmp.ios[BLKG_IOSTAT_READ] +=
                                cpu_dkstats->ios[STAT_READ];
                        tmp.ios[BLKG_IOSTAT_WRITE] +=
                                cpu_dkstats->ios[STAT_WRITE];
                        tmp.ios[BLKG_IOSTAT_DISCARD] +=
                                cpu_dkstats->ios[STAT_DISCARD];
                        // convert sectors to bytes
                        tmp.bytes[BLKG_IOSTAT_READ] +=
                                cpu_dkstats->sectors[STAT_READ] << 9;
                        tmp.bytes[BLKG_IOSTAT_WRITE] +=
                                cpu_dkstats->sectors[STAT_WRITE] << 9;
                        tmp.bytes[BLKG_IOSTAT_DISCARD] +=
                                cpu_dkstats->sectors[STAT_DISCARD] << 9;
                }

                /* publish the totals as the root blkg's current iostat */
                flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
                blkg_iostat_set(&blkg->iostat.cur, &tmp);
                u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
        }
        /*
         * Every class_dev_iter_init() must be paired with
         * class_dev_iter_exit(), otherwise the reference the iterator
         * holds is leaked on each call.
         */
        class_dev_iter_exit(&iter);
}

static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
{
        struct blkg_iostat_set *stats = &blkg->iostat;
        u64 rd_bytes, wr_bytes, dc_bytes;
        u64 rd_ios, wr_ios, dc_ios;
        const char *devname;
        unsigned int start;
        int plid;

        /* Offline blkgs and blkgs without a device name produce no line. */
        if (!blkg->online)
                return;

        devname = blkg_dev_name(blkg);
        if (!devname)
                return;

        seq_printf(s, "%s ", devname);

        /* Snapshot all six counters; go again if the sync point moved. */
        do {
                start = u64_stats_fetch_begin(&stats->sync);

                rd_bytes = stats->cur.bytes[BLKG_IOSTAT_READ];
                rd_ios = stats->cur.ios[BLKG_IOSTAT_READ];
                wr_bytes = stats->cur.bytes[BLKG_IOSTAT_WRITE];
                wr_ios = stats->cur.ios[BLKG_IOSTAT_WRITE];
                dc_bytes = stats->cur.bytes[BLKG_IOSTAT_DISCARD];
                dc_ios = stats->cur.ios[BLKG_IOSTAT_DISCARD];
        } while (u64_stats_fetch_retry(&stats->sync, start));

        /* The counters are only printed once some read/write activity exists. */
        if (rd_bytes || wr_bytes || rd_ios || wr_ios) {
                seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
                           rd_bytes, wr_bytes, rd_ios, wr_ios,
                           dc_bytes, dc_ios);
        }

        /* Delay accounting is debug-only output. */
        if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
                seq_printf(s, " use_delay=%d delay_nsec=%llu",
                           atomic_read(&blkg->use_delay),
                           atomic64_read(&blkg->delay_nsec));
        }

        /* Let every policy with data on this blkg append its own fields. */
        for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                struct blkcg_policy *pol = blkcg_policy[plid];

                if (blkg->pd[plid] && pol->pd_stat_fn)
                        pol->pd_stat_fn(blkg->pd[plid], s);
        }

        seq_puts(s, "\n");
}

static int blkcg_print_stat(struct seq_file *sf, void *v)
{
        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
        struct blkcg_gq *blkg;

        if (!seq_css(sf)->parent)
                blkcg_fill_root_iostats();
        else
                cgroup_rstat_flush(blkcg->css.cgroup);

        rcu_read_lock();
        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                spin_lock_irq(&blkg->q->queue_lock);
                blkcg_print_one_stat(blkg, sf);
                spin_unlock_irq(&blkg->q->queue_lock);
        }
        rcu_read_unlock();
        return 0;
}

/*
 * Interface files provided by the blkcg core itself.  This table is hooked up
 * as io_cgrp_subsys.dfl_cftypes; reading "stat" runs blkcg_print_stat().
 */
static struct cftype blkcg_files[] = {
        {
                .name = "stat",
                .seq_show = blkcg_print_stat,
        },
        { }        /* terminate */
};

/*
 * Interface files provided by the blkcg core for the legacy hierarchy.  This
 * table is hooked up as io_cgrp_subsys.legacy_cftypes; writing "reset_stats"
 * runs blkcg_reset_stats().
 */
static struct cftype blkcg_legacy_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkcg_reset_stats,
        },
        { }        /* terminate */
};

#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css)
{
        return &css_to_blkcg(css)->cgwb_list;
}
#endif

/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback.  Here we tie the next stage of blkg destruction
 *    to the completion of writeback associated with the blkcg.  This lets us
 *    avoid punting potentially large amounts of outstanding writeback to root
 *    while maintaining any ongoing policies.  The next stage is triggered when
 *    the nr_cgwbs count goes to zero.
 *
 * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
 *    and handles the destruction of blkgs.  Here the css reference held by
 *    the blkg is put back eventually allowing blkcg_css_free() to be called.
 *    This work may occur in cgwb_release_workfn() on the cgwb_release
 *    workqueue.  Any submitted ios that fail to get the blkg ref will be
 *    punted to the root_blkg.
 *
 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
 *    This finally frees the blkcg.
 */

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
        /* cond_resched() may be called below */
        might_sleep();

        spin_lock_irq(&blkcg->lock);

        /*
         * Keep taking the first entry until the list is empty (presumably
         * blkg_destroy() unlinks the blkg it is handed - confirm there).
         */
        while (!hlist_empty(&blkcg->blkg_list)) {
                struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
                                                struct blkcg_gq, blkcg_node);
                struct request_queue *q = blkg->q;

                if (need_resched() || !spin_trylock(&q->queue_lock)) {
                        /*
                         * blkcg->lock is already held here, so the queue lock
                         * is only tried, never waited for.  If it cannot be
                         * taken, or a reschedule is due, drop blkcg->lock,
                         * let others run and start over from the list head.
                         *
                         * Given that the system can accumulate a huge number
                         * of blkgs in pathological cases, check to see if we
                         * need to reschedule to avoid softlockup.
                         */
                        spin_unlock_irq(&blkcg->lock);
                        cond_resched();
                        spin_lock_irq(&blkcg->lock);
                        continue;
                }

                /* both the queue lock and blkcg->lock are held at this point */
                blkg_destroy(blkg);
                spin_unlock(&q->queue_lock);
        }

        spin_unlock_irq(&blkcg->lock);
}

/**
 * blkcg_pin_online - pin online state
 * @blkcg_css: blkcg of interest
 *
 * While pinned, a blkcg is kept online.  This is primarily used to
 * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline
 * while an associated cgwb is still active.
 */
void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css)
{
        refcount_inc(&css_to_blkcg(blkcg_css)->online_pin);
}

/**
 * blkcg_unpin_online - unpin online state
 * @blkcg_css: blkcg of interest
 *
 * This is primarily used to impedance-match blkg and cgwb lifetimes so
 * that blkg doesn't go offline while an associated cgwb is still active.
 * When this count goes to zero, all active cgwbs have finished so the
 * blkcg can continue destruction by calling blkcg_destroy_blkgs().
 */
void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
{
        struct blkcg *cur = css_to_blkcg(blkcg_css);

        /*
         * Each time the last pin of a blkcg is dropped, its blkgs are
         * destroyed and the walk moves on to drop a pin on the parent.
         * Stop as soon as a pin count stays above zero or there is no
         * parent left.
         */
        while (refcount_dec_and_test(&cur->online_pin)) {
                blkcg_destroy_blkgs(cur);
                cur = blkcg_parent(cur);
                if (!cur)
                        break;
        }
}

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away.  Here the cgwbs are
 * offlined first and only once writeback associated with the blkcg has
 * finished do we start step 2 (see above).
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
        /*
         * Offline writeback first.
         * this prevents anyone from attaching or migrating to this blkcg
         */
        wb_blkcg_offline(css);

        /*
         * put the base online pin allowing step 2 to be triggered
         * (the base pin is the initial count of 1 set in blkcg_css_alloc();
         * blkg destruction starts once the count reaches zero)
         */
        blkcg_unpin_online(css);
}

/*
 * cgroup css_free callback: unlink the blkcg from the global list, release
 * every per-policy cgroup data it still carries, then free the blkcg itself.
 */
static void blkcg_css_free(struct cgroup_subsys_state *css)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        int plid;

        mutex_lock(&blkcg_pol_mutex);

        list_del(&blkcg->all_blkcgs_node);

        for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                if (!blkcg->cpd[plid])
                        continue;
                blkcg_policy[plid]->cpd_free_fn(blkcg->cpd[plid]);
        }

        mutex_unlock(&blkcg_pol_mutex);

        free_percpu(blkcg->lhead);
        kfree(blkcg);
}

/*
 * cgroup css_alloc callback.  Sets up a blkcg for a new cgroup: picks or
 * allocates the blkcg, allocates per-policy cgroup data for every registered
 * policy that asks for it, initializes the remaining fields and links the
 * blkcg into all_blkcgs.  Runs entirely under blkcg_pol_mutex.
 *
 * Returns the embedded css on success and ERR_PTR(-ENOMEM) on any failure.
 */
static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct blkcg *blkcg;
        int i;

        mutex_lock(&blkcg_pol_mutex);

        /* no parent means the root cgroup, which uses the global blkcg_root */
        if (!parent_css) {
                blkcg = &blkcg_root;
        } else {
                blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
                if (!blkcg)
                        goto unlock;
        }

        /* non-zero return is treated as failure */
        if (init_blkcg_llists(blkcg))
                goto free_blkcg;

        for (i = 0; i < BLKCG_MAX_POLS ; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkcg_policy_data *cpd;

                /*
                 * If the policy hasn't been attached yet, wait for it
                 * to be attached before doing anything else. Otherwise,
                 * check if the policy requires any specific per-cgroup
                 * data: if it does, allocate and initialize it.
                 */
                if (!pol || !pol->cpd_alloc_fn)
                        continue;

                cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                if (!cpd)
                        goto free_pd_blkcg;

                blkcg->cpd[i] = cpd;
                cpd->blkcg = blkcg;
                cpd->plid = i;
        }

        spin_lock_init(&blkcg->lock);
        /* base pin; dropped again by blkcg_css_offline() */
        refcount_set(&blkcg->online_pin, 1);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
        list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

        mutex_unlock(&blkcg_pol_mutex);
        return &blkcg->css;

        /* error unwinding, in reverse order of setup */
free_pd_blkcg:
        /* slot i is the one that failed; free only the slots before it */
        for (i--; i >= 0; i--)
                if (blkcg->cpd[i])
                        blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
        free_percpu(blkcg->lhead);
free_blkcg:
        /* blkcg_root was not allocated here and must not be freed */
        if (blkcg != &blkcg_root)
                kfree(blkcg);
unlock:
        mutex_unlock(&blkcg_pol_mutex);
        return ERR_PTR(-ENOMEM);
}

/* cgroup css_online callback; always succeeds. */
static int blkcg_css_online(struct cgroup_subsys_state *css)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg *parent = blkcg_parent(blkcg);

        /* Nothing to pin when there is no parent. */
        if (!parent)
                return 0;

        /*
         * blkcg_pin_online() postpones blkcg offline so that blkgs stay
         * online while cgwbs are still active on them.  Holding a pin on
         * the parent makes offline always proceed towards the root.
         */
        blkcg_pin_online(&parent->css);
        return 0;
}

/* Initialize the blkcg related fields of @q: its blkg list and blkcg mutex. */
void blkg_init_queue(struct request_queue *q)
{
        INIT_LIST_HEAD(&q->blkg_list);
        mutex_init(&q->blkcg_mutex);
}

/*
 * Set up blkcg state for @disk: create the root blkg for the disk's queue,
 * record it in q->root_blkg and initialize ioprio for the disk.
 *
 * Returns 0 on success, -ENOMEM if the root blkg cannot be allocated, or the
 * error reported by blkg_create() / blk_ioprio_init().
 */
int blkcg_init_disk(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *new_blkg, *blkg;
        bool preloaded;
        int ret;

        /* allocate outside the queue lock so that GFP_KERNEL can be used */
        new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
        if (!new_blkg)
                return -ENOMEM;

        /* a failed preload is tolerated; it is only remembered for cleanup */
        preloaded = !radix_tree_preload(GFP_KERNEL);

        /* Make sure the root blkg exists. */
        /* spin_lock_irq can serve as RCU read-side critical section. */
        spin_lock_irq(&q->queue_lock);
        blkg = blkg_create(&blkcg_root, disk, new_blkg);
        if (IS_ERR(blkg))
                goto err_unlock;
        q->root_blkg = blkg;
        spin_unlock_irq(&q->queue_lock);

        if (preloaded)
                radix_tree_preload_end();

        ret = blk_ioprio_init(disk);
        if (ret)
                goto err_destroy_all;

        return 0;

err_destroy_all:
        /* the root blkg is already installed, tear down via the common path */
        blkg_destroy_all(disk);
        return ret;
err_unlock:
        /*
         * NOTE(review): new_blkg is not freed here; presumably blkg_create()
         * disposes of it on failure - confirm there.
         */
        spin_unlock_irq(&q->queue_lock);
        if (preloaded)
                radix_tree_preload_end();
        return PTR_ERR(blkg);
}

/* Tear down blkcg state for @disk: destroy all its blkgs, then exit throttling. */
void blkcg_exit_disk(struct gendisk *disk)
{
        blkg_destroy_all(disk);
        blk_throtl_exit(disk);
}

/*
 * Subsystem exit callback for a task: release the disk reference that
 * blkcg_schedule_throttle() may have left on @tsk and clear the pointer.
 */
static void blkcg_exit(struct task_struct *tsk)
{
        struct gendisk *pending = tsk->throttle_disk;

        if (pending)
                put_disk(pending);
        tsk->throttle_disk = NULL;
}

/*
 * The io (legacy name "blkio") cgroup subsystem: css lifecycle callbacks,
 * rstat flushing, the core interface files for both hierarchies and the
 * per-task exit hook.
 */
struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
        .css_online = blkcg_css_online,
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .css_rstat_flush = blkcg_rstat_flush,
        .dfl_cftypes = blkcg_files,
        .legacy_cftypes = blkcg_legacy_files,
        .legacy_name = "blkio",
        .exit = blkcg_exit,
#ifdef CONFIG_MEMCG
        /*
         * This ensures that, if available, memcg is automatically enabled
         * together on the default hierarchy so that the owner cgroup can
         * be retrieved from writeback pages.
         */
        .depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a gendisk
 * @disk: gendisk of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @disk.  Requires %GFP_KERNEL context.  @disk goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @disk bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
{
        struct request_queue *q = disk->queue;
        /* policy data allocated with GFP_KERNEL outside the lock for @pinned_blkg */
        struct blkg_policy_data *pd_prealloc = NULL;
        /* @pinned_blkg: blkg the preallocation is meant for, held by a reference */
        struct blkcg_gq *blkg, *pinned_blkg = NULL;
        int ret;

        /* already active on this queue, nothing to do */
        if (blkcg_policy_enabled(q, pol))
                return 0;

        if (queue_is_mq(q))
                blk_mq_freeze_queue(q);
retry:
        /* every retry restarts the walk; blkgs that already have pd are skipped */
        spin_lock_irq(&q->queue_lock);

        /* blkg_list is pushed at the head, reverse walk to initialize parents first */
        list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
                struct blkg_policy_data *pd;

                if (blkg->pd[pol->plid])
                        continue;

                /* If prealloc matches, use it; otherwise try GFP_NOWAIT */
                if (blkg == pinned_blkg) {
                        pd = pd_prealloc;
                        pd_prealloc = NULL;
                } else {
                        pd = pol->pd_alloc_fn(disk, blkg->blkcg,
                                              GFP_NOWAIT | __GFP_NOWARN);
                }

                if (!pd) {
                        /*
                         * GFP_NOWAIT failed.  Free the existing one and
                         * prealloc for @blkg w/ GFP_KERNEL.
                         */
                        /* move the pin from the previous blkg to this one */
                        if (pinned_blkg)
                                blkg_put(pinned_blkg);
                        blkg_get(blkg);
                        pinned_blkg = blkg;

                        /* GFP_KERNEL is only used with the queue lock dropped */
                        spin_unlock_irq(&q->queue_lock);

                        if (pd_prealloc)
                                pol->pd_free_fn(pd_prealloc);
                        pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
                                                       GFP_KERNEL);
                        if (pd_prealloc)
                                goto retry;
                        else
                                goto enomem;
                }

                /* install and bring the pd online under the blkcg lock as well */
                spin_lock(&blkg->blkcg->lock);

                pd->blkg = blkg;
                pd->plid = pol->plid;
                blkg->pd[pol->plid] = pd;

                if (pol->pd_init_fn)
                        pol->pd_init_fn(pd);

                if (pol->pd_online_fn)
                        pol->pd_online_fn(pd);
                pd->online = true;

                spin_unlock(&blkg->blkcg->lock);
        }

        /* every blkg has policy data now; mark @pol enabled on the queue */
        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;

        spin_unlock_irq(&q->queue_lock);
out:
        /* common exit: unfreeze and release whatever is still pinned/preallocated */
        if (queue_is_mq(q))
                blk_mq_unfreeze_queue(q);
        if (pinned_blkg)
                blkg_put(pinned_blkg);
        if (pd_prealloc)
                pol->pd_free_fn(pd_prealloc);
        return ret;

enomem:
        /* alloc failed, take down everything */
        spin_lock_irq(&q->queue_lock);
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;
                struct blkg_policy_data *pd;

                spin_lock(&blkcg->lock);
                pd = blkg->pd[pol->plid];
                if (pd) {
                        if (pd->online && pol->pd_offline_fn)
                                pol->pd_offline_fn(pd);
                        pd->online = false;
                        pol->pd_free_fn(pd);
                        blkg->pd[pol->plid] = NULL;
                }
                spin_unlock(&blkcg->lock);
        }
        spin_unlock_irq(&q->queue_lock);
        ret = -ENOMEM;
        goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk
 * @disk: gendisk of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @disk.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct gendisk *disk,
                             const struct blkcg_policy *pol)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *blkg;

        /* Not active on this queue: nothing to undo. */
        if (!blkcg_policy_enabled(q, pol))
                return;

        if (queue_is_mq(q))
                blk_mq_freeze_queue(q);

        mutex_lock(&q->blkcg_mutex);
        spin_lock_irq(&q->queue_lock);

        /* Clear the enabled bit first, then strip the data off every blkg. */
        __clear_bit(pol->plid, q->blkcg_pols);

        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkcg *owner = blkg->blkcg;
                struct blkg_policy_data *pd;

                spin_lock(&owner->lock);
                pd = blkg->pd[pol->plid];
                if (pd) {
                        /* offline before freeing, if it was ever online */
                        if (pd->online && pol->pd_offline_fn)
                                pol->pd_offline_fn(pd);
                        pol->pd_free_fn(pd);
                        blkg->pd[pol->plid] = NULL;
                }
                spin_unlock(&owner->lock);
        }

        spin_unlock_irq(&q->queue_lock);
        mutex_unlock(&q->blkcg_mutex);

        if (queue_is_mq(q))
                blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/*
 * Free @pol's per-cgroup data on every blkcg in all_blkcgs and clear the
 * slot.  @pol->cpd_free_fn is called unconditionally, so callers only use
 * this when it is set.
 */
static void blkcg_free_all_cpd(struct blkcg_policy *pol)
{
        struct blkcg *blkcg;

        list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
                struct blkcg_policy_data *cpd = blkcg->cpd[pol->plid];

                if (!cpd)
                        continue;

                pol->cpd_free_fn(cpd);
                blkcg->cpd[pol->plid] = NULL;
        }
}

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
        struct blkcg *blkcg;
        int i, ret;

        mutex_lock(&blkcg_pol_register_mutex);
        mutex_lock(&blkcg_pol_mutex);

        /* find an empty slot */
        ret = -ENOSPC;
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (!blkcg_policy[i])
                        break;
        if (i >= BLKCG_MAX_POLS) {
                pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
                goto err_unlock;
        }

        /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
        if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
                (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
                goto err_unlock;

        /* register @pol */
        pol->plid = i;
        blkcg_policy[pol->plid] = pol;

        /* allocate and install cpd's */
        if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
                        struct blkcg_policy_data *cpd;

                        cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                        if (!cpd)
                                goto err_free_cpds;

                        blkcg->cpd[pol->plid] = cpd;
                        cpd->blkcg = blkcg;
                        cpd->plid = pol->plid;
                }
        }

        mutex_unlock(&blkcg_pol_mutex);

        /* everything is in place, add intf files for the new policy */
        if (pol->dfl_cftypes)
                WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
                                               pol->dfl_cftypes));
        if (pol->legacy_cftypes)
                WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
                                                  pol->legacy_cftypes));
        mutex_unlock(&blkcg_pol_register_mutex);
        return 0;

err_free_cpds:
        if (pol->cpd_free_fn)
                blkcg_free_all_cpd(pol);

        blkcg_policy[pol->plid] = NULL;
err_unlock:
        mutex_unlock(&blkcg_pol_mutex);
        mutex_unlock(&blkcg_pol_register_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
        mutex_lock(&blkcg_pol_register_mutex);

        /* warn and bail out if @pol is not what is registered in its slot */
        if (WARN_ON(blkcg_policy[pol->plid] != pol))
                goto out_unlock;

        /* kill the intf files first */
        if (pol->dfl_cftypes)
                cgroup_rm_cftypes(pol->dfl_cftypes);
        if (pol->legacy_cftypes)
                cgroup_rm_cftypes(pol->legacy_cftypes);

        /* remove cpds and unregister */
        mutex_lock(&blkcg_pol_mutex);

        if (pol->cpd_free_fn)
                blkcg_free_all_cpd(pol);

        /* release the slot so that a later registration can reuse it */
        blkcg_policy[pol->plid] = NULL;

        mutex_unlock(&blkcg_pol_mutex);
out_unlock:
        mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

/*
 * Scale the accumulated delay based on how long it has been since we updated
 * the delay.  We only call this when we are adding delay, in case it's been a
 * while since we added delay, and when we are checking to see if we need to
 * delay a task, to account for any delays that may have occurred.
 */
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
        /* start of the current delay window, as last recorded */
        u64 old = atomic64_read(&blkg->delay_start);

        /* negative use_delay means no scaling, see blkcg_set_delay() */
        if (atomic_read(&blkg->use_delay) < 0)
                return;

        /*
         * We only want to scale down every second.  The idea here is that we
         * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
         * time window.  We only want to throttle tasks for recent delay that
         * has occurred, in 1 second time windows since that's the maximum
         * things can be throttled.  We save the current delay window in
         * blkg->last_delay so we know what amount is still left to be charged
         * to the blkg from this point onward.  blkg->last_use keeps track of
         * the use_delay counter.  The idea is if we're unthrottling the blkg we
         * are ok with whatever is happening now, and we can take away more of
         * the accumulated delay as we've already throttled enough that
         * everybody is happy with their IO latencies.
         */
        /*
         * The cmpxchg on delay_start lets only one caller per window do the
         * scaling; anyone who loses the race skips the block.
         */
        if (time_before64(old + NSEC_PER_SEC, now) &&
            atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) {
                u64 cur = atomic64_read(&blkg->delay_nsec);
                u64 sub = min_t(u64, blkg->last_delay, now - old);
                int cur_use = atomic_read(&blkg->use_delay);

                /*
                 * We've been unthrottled, subtract a larger chunk of our
                 * accumulated delay.
                 */
                if (cur_use < blkg->last_use)
                        sub = max_t(u64, sub, blkg->last_delay >> 1);

                /*
                 * This shouldn't happen, but handle it anyway.  Our delay_nsec
                 * should only ever be growing except here where we subtract out
                 * min(last_delay, 1 second), but lord knows bugs happen and I'd
                 * rather not end up with negative numbers.
                 */
                if (unlikely(cur < sub)) {
                        atomic64_set(&blkg->delay_nsec, 0);
                        blkg->last_delay = 0;
                } else {
                        atomic64_sub(sub, &blkg->delay_nsec);
                        blkg->last_delay = cur - sub;
                }
                /* remember use_delay so the next window can detect unthrottling */
                blkg->last_use = cur_use;
        }
}

/*
 * This is called when we want to actually walk up the hierarchy and check to
 * see if we need to throttle, and then actually throttle if there is some
 * accumulated delay.  This should only be called upon return to user space so
 * we're not holding some lock that would induce a priority inversion.
 */
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
        unsigned long pflags;
        /*
         * Only assigned together with delay_nsec below, and only read after
         * the !delay_nsec early return, so it is never used uninitialized.
         */
        bool clamp;
        u64 now = blk_time_get_ns();
        u64 exp;
        u64 delay_nsec = 0;
        int tok;

        /*
         * Walk from @blkg up to, but not including, the blkg without a parent
         * and pick the largest accumulated delay along the way.
         */
        while (blkg->parent) {
                int use_delay = atomic_read(&blkg->use_delay);

                if (use_delay) {
                        u64 this_delay;

                        blkcg_scale_delay(blkg, now);
                        this_delay = atomic64_read(&blkg->delay_nsec);
                        if (this_delay > delay_nsec) {
                                delay_nsec = this_delay;
                                /* negative use_delay: the delay is not clamped */
                                clamp = use_delay > 0;
                        }
                }
                blkg = blkg->parent;
        }

        /* no delay accumulated anywhere on the path, nothing to do */
        if (!delay_nsec)
                return;

        /*
         * Let's not sleep for all eternity if we've amassed a huge delay.
         * Swapping or metadata IO can accumulate 10's of seconds worth of
         * delay, and we want userspace to be able to do _something_ so cap the
         * delays at 0.25s. If there's 10's of seconds worth of delay then the
         * tasks will be delayed for 0.25 second for every syscall. If
         * blkcg_set_delay() was used as indicated by negative use_delay, the
         * caller is responsible for regulating the range.
         */
        if (clamp)
                delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

        /* optionally account the sleep as a memory stall for PSI */
        if (use_memdelay)
                psi_memstall_enter(&pflags);

        /* absolute expiry, measured from the timestamp taken on entry */
        exp = ktime_add_ns(now, delay_nsec);
        tok = io_schedule_prepare();
        /*
         * Sleep killably until schedule_hrtimeout() returns 0 (presumably
         * meaning the timer expired) or a fatal signal is pending.
         */
        do {
                __set_current_state(TASK_KILLABLE);
                if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
                        break;
        } while (!fatal_signal_pending(current));
        io_schedule_finish(tok);

        if (use_memdelay)
                psi_memstall_leave(&pflags);
}

/**
 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
 *
 * This is only called if we've been marked with set_notify_resume().  Obviously
 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
 * check to see if current->throttle_disk is set and if not this doesn't do
 * anything.  This should only ever be called by the resume code, it's not meant
 * to be called by people willy-nilly as it will actually do the work to
 * throttle the task if it is setup for throttling.
 */
void blkcg_maybe_throttle_current(void)
{
        struct gendisk *disk = current->throttle_disk;
        struct blkcg *blkcg;
        struct blkcg_gq *blkg;
        bool use_memdelay = current->use_memdelay;

        /* not marked for throttling */
        if (!disk)
                return;

        /*
         * Consume the request.  From here on this function owns the disk
         * reference that blkcg_schedule_throttle() stored on the task, so
         * every path below has to drop it.
         */
        current->throttle_disk = NULL;
        current->use_memdelay = false;

        /* look up and pin the blkg of the current task for this disk */
        rcu_read_lock();
        blkcg = css_to_blkcg(blkcg_css());
        if (!blkcg)
                goto out;
        blkg = blkg_lookup(blkcg, disk->queue);
        if (!blkg)
                goto out;
        if (!blkg_tryget(blkg))
                goto out;
        rcu_read_unlock();

        /* may sleep, hence done outside the RCU read section */
        blkcg_maybe_throttle_blkg(blkg, use_memdelay);
        blkg_put(blkg);
        put_disk(disk);
        return;
out:
        rcu_read_unlock();
        /* nothing to throttle, but the disk reference still has to go */
        put_disk(disk);
}

/**
 * blkcg_schedule_throttle - this task needs to check for throttling
 * @disk: disk to throttle
 * @use_memdelay: do we charge this to memory delay for PSI
 *
 * This is called by the IO controller when we know there's delay accumulated
 * for the blkg for this task.  We do not pass the blkg because there are places
 * we call this that may not have that information, the swapping code for
 * instance will only have a block_device at that point.  This sets the
 * notify_resume for the task to check and see if it requires throttling before
 * returning to user space.
 *
 * We will only schedule once per syscall.  You can call this over and over
 * again and it will only do the check once upon return to user space, and only
 * throttle once.  If the task needs to be throttled again it'll need to be
 * re-set at the next time we see the task.
 */
void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay)
{
        struct gendisk *prev;

        /* Kernel threads are never marked for throttling. */
        if (unlikely(current->flags & PF_KTHREAD))
                return;

        prev = current->throttle_disk;
        if (prev != disk) {
                /* Do not pin a disk that is already marked dead. */
                if (test_bit(GD_DEAD, &disk->state))
                        return;
                get_device(disk_to_dev(disk));

                /* Swap the pinned disk, dropping the previous one if any. */
                if (prev)
                        put_disk(prev);
                current->throttle_disk = disk;
        }

        /* The memdelay request is sticky: it is only ever switched on here. */
        if (use_memdelay)
                current->use_memdelay = true;
        set_notify_resume(current);
}

/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
        /*
         * A negative use_delay is the no-scaling mode (see the check in
         * blkcg_scale_delay()); adding delay in that mode is refused with a
         * one-time warning.
         */
        if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
                return;
        /* age the accumulated delay first, then charge the new amount */
        blkcg_scale_delay(blkg, now);
        atomic64_add(delta, &blkg->delay_nsec);
}

/**
 * blkg_tryget_closest - try and get a blkg ref on the closest blkg
 * @bio: target bio
 * @css: target css
 *
 * As the failure mode here is to walk up the blkg tree, this ensures that the
 * blkg->parent pointers are always valid.  This returns the blkg that it ended
 * up taking a reference on or %NULL if no reference was taken.
 */
static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
                struct cgroup_subsys_state *css)
{
        struct blkcg_gq *blkg;

        rcu_read_lock();
        /*
         * Start at the blkg for @css on the bio's disk and climb towards the
         * root until a reference can be taken.  The loop leaves @blkg at the
         * blkg that was pinned, or at NULL when none could be.
         */
        for (blkg = blkg_lookup_create(css_to_blkcg(css),
                                       bio->bi_bdev->bd_disk);
             blkg; blkg = blkg->parent) {
                if (blkg_tryget(blkg))
                        break;
        }
        rcu_read_unlock();

        return blkg;
}

/**
 * bio_associate_blkg_from_css - associate a bio with a specified css
 * @bio: target bio
 * @css: target css
 *
 * Associate @bio with the blkg found by combining the css's blkg and the
 * request_queue of the @bio.  An association failure is handled by walking up
 * the blkg tree.  Therefore, the blkg associated can be anything between @blkg
 * and q->root_blkg.  This situation only happens when a cgroup is dying and
 * then the remaining bios will spill to the closest alive blkg.
 *
 * A reference will be taken on the blkg and will be released when @bio is
 * freed.
 */
void bio_associate_blkg_from_css(struct bio *bio,
                                 struct cgroup_subsys_state *css)
{
        struct blkcg_gq *root;

        /* Drop whatever association the bio carried before. */
        if (bio->bi_blkg)
                blkg_put(bio->bi_blkg);

        /* A non-root css resolves to the closest blkg that can be pinned. */
        if (css && css->parent) {
                bio->bi_blkg = blkg_tryget_closest(bio, css);
                return;
        }

        /* No css, or the root css: use the queue's root blkg directly. */
        root = bdev_get_queue(bio->bi_bdev)->root_blkg;
        blkg_get(root);
        bio->bi_blkg = root;
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);

/**
 * bio_associate_blkg - associate a bio with a blkg
 * @bio: target bio
 *
 * Associate @bio with the blkg found from the bio's css and request_queue.
 * If one is not found, bio_lookup_blkg() creates the blkg.  If a blkg is
 * already associated, the css is reused and association redone as the
 * request_queue may have changed.
 */
void bio_associate_blkg(struct bio *bio)
{
        struct cgroup_subsys_state *css;

        /* Passthrough operations are returned without any association. */
        if (blk_op_is_passthrough(bio->bi_opf))
                return;

        rcu_read_lock();

        /* Reuse the css of an existing association, else ask blkcg_css(). */
        css = bio->bi_blkg ? bio_blkcg_css(bio) : blkcg_css();
        bio_associate_blkg_from_css(bio, css);

        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(bio_associate_blkg);

/**
 * bio_clone_blkg_association - clone blkg association from src to dst bio
 * @dst: destination bio
 * @src: source bio
 */
void bio_clone_blkg_association(struct bio *dst, struct bio *src)
{
        /* A source bio without a blkg leaves @dst untouched. */
        if (!src->bi_blkg)
                return;

        bio_associate_blkg_from_css(dst, bio_blkcg_css(src));
}
EXPORT_SYMBOL_GPL(bio_clone_blkg_association);

/*
 * Select the BLKG_IOSTAT_* index for @bio.  Discard is tested before write;
 * anything that is neither is accounted as a read.
 */
static int blk_cgroup_io_type(struct bio *bio)
{
        int type = BLKG_IOSTAT_READ;

        if (op_is_discard(bio->bi_opf))
                type = BLKG_IOSTAT_DISCARD;
        else if (op_is_write(bio->bi_opf))
                type = BLKG_IOSTAT_WRITE;

        return type;
}

/*
 * Account @bio in the per-cpu iostat of its blkg (bio->bi_blkg) and report
 * the update through cgroup_rstat_updated().
 *
 * Nothing is recorded when cgroup_subsys_on_dfl(io_cgrp_subsys) is false or
 * when the blkcg's cgroup has no parent.
 */
void blk_cgroup_bio_start(struct bio *bio)
{
        struct blkcg *blkcg = bio->bi_blkg->blkcg;
        /* rwd indexes the bytes[]/ios[] arrays (a BLKG_IOSTAT_* value) */
        int rwd = blk_cgroup_io_type(bio), cpu;
        struct blkg_iostat_set *bis;
        unsigned long flags;

        if (!cgroup_subsys_on_dfl(io_cgrp_subsys))
                return;

        /* Root-level stats are sourced from system-wide IO stats */
        if (!cgroup_parent(blkcg->css.cgroup))
                return;

        /*
         * get_cpu() here and put_cpu() at the end bracket every per-cpu
         * access below; the same cpu id is passed to cgroup_rstat_updated().
         */
        cpu = get_cpu();
        bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
        flags = u64_stats_update_begin_irqsave(&bis->sync);

        /*
         * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
         * bio and we would have already accounted for the size of the bio.
         */
        if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
                bio_set_flag(bio, BIO_CGROUP_ACCT);
                bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
        }
        /* The IO count is bumped on every call, flagged or not. */
        bis->cur.ios[rwd]++;

        /*
         * If the iostat_cpu isn't in a lockless list, put it into the
         * list to indicate that a stat update is pending.
         */
        if (!READ_ONCE(bis->lqueued)) {
                struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);

                llist_add(&bis->lnode, lhead);
                WRITE_ONCE(bis->lqueued, true);
        }

        u64_stats_update_end_irqrestore(&bis->sync, flags);
        cgroup_rstat_updated(blkcg->css.cgroup, cpu);
        put_cpu();
}

/*
 * Report whether the css returned by blkcg_css(), or any css reached from it
 * through ->parent, belongs to a cgroup with a non-zero congestion_count.
 */
bool blk_cgroup_congested(void)
{
        struct cgroup_subsys_state *pos;
        bool congested = false;

        rcu_read_lock();
        pos = blkcg_css();
        while (pos && !congested) {
                if (atomic_read(&pos->cgroup->congestion_count))
                        congested = true;
                else
                        pos = pos->parent;
        }
        rcu_read_unlock();

        return congested;
}

/* Expose blkcg_debug_stats as a bool module parameter with mode 0644. */
module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");


















































































































































































    2 




















    1 




    2 

















































    4 


    4 




















    1 


















    1 








    1 





    1 
























































    3 






































    1 









    1 

    1 




    1 















    1 











    1 














































































































    1 









    1 































































    1 

    2 



    2 


















































































    1 























    1 












































































































    1 
    2 
    2 




















































































































































































































    1 















    1 









    1 









    1 






    1 











    1 
    1 







    1 

    1 
    1 





































































































































































































































































































































    1 






    2 



    2 

    2 
    1 
    1 

    1 





    1 





    1 





    1 






    1 









    1 

    1 
    1 



















    1 
























    1 







































































    1 















































































































































    1 














    1 





    1 












    1 











































    1 







    1 



    1 
    1 


    1 
















































    1 
    1 









    1 












































    1 



    1 






















    1 




    1 






    1 





















    1 

    1 















    1 


    1 















    1 


















    1 






    1 























    1 


















    1 








    1 









    1 







    1 



    1 










































































































































































































































































































































































































































    3 











    3 




































































    5 



    5 





















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/locks.c
 *
 * We implement four types of file locks: BSD locks, posix locks, open
 * file description locks, and leases.  For details about BSD locks,
 * see the flock(2) man page; for details about the other three, see
 * fcntl(2).
 *
 *
 * Locking conflicts and dependencies:
 * If multiple threads attempt to lock the same byte (or flock the same file)
 * only one can be granted the lock, and the others must wait their turn.
 * The first lock has been "applied" or "granted", the others are "waiting"
 * and are "blocked" by the "applied" lock.
 *
 * Waiting and applied locks are all kept in trees whose properties are:
 *
 *        - the root of a tree may be an applied or waiting lock.
 *        - every other node in the tree is a waiting lock that
 *          conflicts with every ancestor of that node.
 *
 * Every such tree begins life as a waiting singleton which obviously
 * satisfies the above properties.
 *
 * The only ways we modify trees preserve these properties:
 *
 *        1. We may add a new leaf node, but only after first verifying that it
 *           conflicts with all of its ancestors.
 *        2. We may remove the root of a tree, creating a new singleton
 *           tree from the root and N new trees rooted in the immediate
 *           children.
 *        3. If the root of a tree is not currently an applied lock, we may
 *           apply it (if possible).
 *        4. We may upgrade the root of the tree (either extend its range,
 *           or upgrade its entire range from read to write).
 *
 * When an applied lock is modified in a way that reduces or downgrades any
 * part of its range, we remove all its children (2 above).  This particularly
 * happens when a lock is unlocked.
 *
 * For each of those child trees we "wake up" the thread which is
 * waiting for the lock so it can continue handling as follows: if the
 * root of the tree applies, we do so (3).  If it doesn't, it must
 * conflict with some applied lock.  We remove (wake up) all of its children
 * (2), and add it as a new leaf to the tree rooted in the applied
 * lock (1).  We then repeat the process recursively with those
 * children.
 *
 */
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/filelock.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/time.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/hashtable.h>
#include <linux/percpu.h>
#include <linux/sysctl.h>

#define CREATE_TRACE_POINTS
#include <trace/events/filelock.h>

#include <linux/uaccess.h>

/*
 * Convert a pointer to the file_lock_core embedded as member 'c' of a
 * struct file_lock into a pointer to that enclosing file_lock.  The caller
 * must know that @flc really is embedded in a file_lock.
 */
static struct file_lock *file_lock(struct file_lock_core *flc)
{
        return container_of(flc, struct file_lock, c);
}

/*
 * Convert a pointer to the file_lock_core embedded as member 'c' of a
 * struct file_lease into a pointer to that enclosing file_lease.  The caller
 * must know that @flc really is embedded in a file_lease.
 */
static struct file_lease *file_lease(struct file_lock_core *flc)
{
        return container_of(flc, struct file_lease, c);
}

/* True when @fl carries FL_UNLOCK_PENDING or FL_DOWNGRADE_PENDING. */
static bool lease_breaking(struct file_lease *fl)
{
        unsigned int pending = FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING;

        return (fl->c.flc_flags & pending) != 0;
}

/*
 * Lease type implied by the pending flags of @fl: F_UNLCK for a pending
 * unlock (checked first), F_RDLCK for a pending downgrade, otherwise the
 * type currently recorded in the lease.
 */
static int target_leasetype(struct file_lease *fl)
{
        unsigned int flags = fl->c.flc_flags;

        if (flags & FL_UNLOCK_PENDING)
                return F_UNLCK;

        return (flags & FL_DOWNGRADE_PENDING) ? F_RDLCK : fl->c.flc_type;
}

static int leases_enable = 1;
static int lease_break_time = 45;

#ifdef CONFIG_SYSCTL
/*
 * Sysctl entries for file locking, registered under "fs" by
 * init_fs_locks_sysctls().  Both are plain ints handled by proc_dointvec
 * with mode 0644.
 */
static struct ctl_table locks_sysctls[] = {
        {
                /* backed by leases_enable */
                .procname        = "leases-enable",
                .data                = &leases_enable,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#ifdef CONFIG_MMU
        {
                /* backed by lease_break_time; only present with CONFIG_MMU */
                .procname        = "lease-break-time",
                .data                = &lease_break_time,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif /* CONFIG_MMU */
};

/*
 * Register locks_sysctls under the "fs" sysctl directory.  Runs as an
 * early_initcall and always returns 0.
 */
static int __init init_fs_locks_sysctls(void)
{
        register_sysctl_init("fs", locks_sysctls);
        return 0;
}
early_initcall(init_fs_locks_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * The global file_lock_list is only used for displaying /proc/locks, so we
 * keep a list on each CPU, with each list protected by its own spinlock.
 * Global serialization is done using file_rwsem.
 *
 * Note that alterations to the list also require that the relevant flc_lock is
 * held.
 */
/* One instance per CPU: a list head plus the spinlock that protects it. */
struct file_lock_list_struct {
        /* protects hlist */
        spinlock_t                lock;
        /* head of this CPU's list */
        struct hlist_head        hlist;
};
/* The per-CPU lists themselves. */
static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
/* Per-cpu rwsem providing the global serialization mentioned above. */
DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);


/*
 * The blocked_hash is used to find POSIX lock loops for deadlock detection.
 * It is protected by blocked_lock_lock.
 *
 * We hash locks by lockowner in order to optimize searching for the lock a
 * particular lockowner is waiting on.
 *
 * FIXME: make this value scale via some heuristic? We generally will want more
 * buckets when we have more lockowners holding locks, but that's a little
 * difficult to determine without knowing what the workload will look like.
 */
/* Hash size in bits; DEFINE_HASHTABLE sizes the table as 1 << bits (128). */
#define BLOCKED_HASH_BITS        7
static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);

/*
 * This lock protects the blocked_hash. Generally, if you're accessing it, you
 * want to be holding this lock.
 *
 * In addition, it also protects the fl->fl_blocked_requests list, and the
 * fl->fl_blocker pointer for file_lock structures that are acting as lock
 * requests (in contrast to those that are acting as records of acquired locks).
 *
 * Note that when we acquire this lock in order to change the above fields,
 * we often hold the flc_lock as well. In certain cases, when reading the fields
 * protected by this lock, we can skip acquiring it iff we already hold the
 * flc_lock.
 */
static DEFINE_SPINLOCK(blocked_lock_lock);

/* Slab cache for struct file_lock_context (see locks_get_lock_context()). */
static struct kmem_cache *flctx_cache __ro_after_init;
/* Slab cache for struct file_lock (see locks_alloc_lock()). */
static struct kmem_cache *filelock_cache __ro_after_init;
/* Slab cache for struct file_lease (see locks_alloc_lease()). */
static struct kmem_cache *filelease_cache __ro_after_init;

/*
 * Return the file_lock_context of @inode.  If there is none and @type is not
 * F_UNLCK, allocate one and try to install it.  The result is NULL when no
 * context exists for an F_UNLCK request or when the allocation fails.
 */
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
{
        /* this lookup is paired with the cmpxchg() below */
        struct file_lock_context *ctx = locks_inode_context(inode);
        struct file_lock_context *fresh;

        if (unlikely(!ctx) && type != F_UNLCK) {
                fresh = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
                if (fresh) {
                        spin_lock_init(&fresh->flc_lock);
                        INIT_LIST_HEAD(&fresh->flc_flock);
                        INIT_LIST_HEAD(&fresh->flc_posix);
                        INIT_LIST_HEAD(&fresh->flc_lease);

                        /*
                         * Install our context only if the slot is still
                         * empty.  If it was filled in the meantime, discard
                         * ours and use the one that is there.
                         */
                        if (cmpxchg(&inode->i_flctx, NULL, fresh)) {
                                kmem_cache_free(flctx_cache, fresh);
                                ctx = locks_inode_context(inode);
                        } else {
                                ctx = fresh;
                        }
                }
        }

        trace_locks_get_lock_context(inode, type, ctx);
        return ctx;
}

static void
locks_dump_ctx_list(struct list_head *list, char *list_type)
{
        struct file_lock_core *flc;

        list_for_each_entry(flc, list, flc_list)
                pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
                        list_type, flc->flc_owner, flc->flc_flags,
                        flc->flc_type, flc->flc_pid);
}

static void
locks_check_ctx_lists(struct inode *inode)
{
        struct file_lock_context *ctx = inode->i_flctx;

        if (unlikely(!list_empty(&ctx->flc_flock) ||
                     !list_empty(&ctx->flc_posix) ||
                     !list_empty(&ctx->flc_lease))) {
                pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n",
                        MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
                        inode->i_ino);
                locks_dump_ctx_list(&ctx->flc_flock, "FLOCK");
                locks_dump_ctx_list(&ctx->flc_posix, "POSIX");
                locks_dump_ctx_list(&ctx->flc_lease, "LEASE");
        }
}

static void
locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type)
{
        struct file_lock_core *flc;
        struct inode *inode = file_inode(filp);

        list_for_each_entry(flc, list, flc_list)
                if (flc->flc_file == filp)
                        pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx "
                                " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
                                list_type, MAJOR(inode->i_sb->s_dev),
                                MINOR(inode->i_sb->s_dev), inode->i_ino,
                                flc->flc_owner, flc->flc_flags,
                                flc->flc_type, flc->flc_pid);
}

/*
 * Free the lock context attached to @inode, if there is one, after checking
 * its lists for leftover entries.
 */
void
locks_free_lock_context(struct inode *inode)
{
        struct file_lock_context *ctx = locks_inode_context(inode);

        if (likely(!ctx))
                return;

        locks_check_ctx_lists(inode);
        kmem_cache_free(flctx_cache, ctx);
}

/*
 * Initialise the linkage members of @flc: the hlist node, the three list
 * heads and the wait queue.  No other field is touched.
 */
static void locks_init_lock_heads(struct file_lock_core *flc)
{
        INIT_HLIST_NODE(&flc->flc_link);
        INIT_LIST_HEAD(&flc->flc_list);
        INIT_LIST_HEAD(&flc->flc_blocked_requests);
        INIT_LIST_HEAD(&flc->flc_blocked_member);
        init_waitqueue_head(&flc->flc_wait);
}

/* Allocate an empty lock structure. */
struct file_lock *locks_alloc_lock(void)
{
        struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);

        if (fl)
                locks_init_lock_heads(&fl->c);

        return fl;
}
EXPORT_SYMBOL_GPL(locks_alloc_lock);

/* Allocate an empty lock structure. */
struct file_lease *locks_alloc_lease(void)
{
        struct file_lease *fl = kmem_cache_zalloc(filelease_cache, GFP_KERNEL);

        if (fl)
                locks_init_lock_heads(&fl->c);

        return fl;
}
EXPORT_SYMBOL_GPL(locks_alloc_lease);

/*
 * Release the private state hanging off @fl: invoke the fl_ops and fl_lmops
 * release hooks where they exist and clear both operation pointers.
 */
void locks_release_private(struct file_lock *fl)
{
        /* The lock must have no waiters and be off every list by now. */
        BUG_ON(waitqueue_active(&fl->c.flc_wait));
        BUG_ON(!list_empty(&fl->c.flc_list));
        BUG_ON(!list_empty(&fl->c.flc_blocked_requests));
        BUG_ON(!list_empty(&fl->c.flc_blocked_member));
        BUG_ON(!hlist_unhashed(&fl->c.flc_link));

        /* fl_ops: call fl_release_private() if provided, then detach. */
        if (fl->fl_ops) {
                if (fl->fl_ops->fl_release_private)
                        fl->fl_ops->fl_release_private(fl);
                fl->fl_ops = NULL;
        }

        /*
         * fl_lmops: hand the owner to lm_put_owner() if provided, clearing
         * flc_owner only in that case, then detach.
         */
        if (fl->fl_lmops) {
                if (fl->fl_lmops->lm_put_owner) {
                        fl->fl_lmops->lm_put_owner(fl->c.flc_owner);
                        fl->c.flc_owner = NULL;
                }
                fl->fl_lmops = NULL;
        }
}
EXPORT_SYMBOL_GPL(locks_release_private);

/**
 * locks_owner_has_blockers - Check for blocking lock requests
 * @flctx: file lock context
 * @owner: lock owner
 *
 * Return values:
 *   %true: @owner has at least one blocker
 *   %false: @owner has no blockers
 */
bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner)
{
        struct file_lock_core *pos;
        bool found = false;

        spin_lock(&flctx->flc_lock);
        /* Scan the posix list for an @owner entry with blocked requests. */
        list_for_each_entry(pos, &flctx->flc_posix, flc_list) {
                if (pos->flc_owner == owner &&
                    !list_empty(&pos->flc_blocked_requests)) {
                        found = true;
                        break;
                }
        }
        spin_unlock(&flctx->flc_lock);

        return found;
}
EXPORT_SYMBOL_GPL(locks_owner_has_blockers);

/*
 * Free a lock which is not in use: run locks_release_private() on it, then
 * return it to filelock_cache.
 */
void locks_free_lock(struct file_lock *fl)
{
        locks_release_private(fl);
        kmem_cache_free(filelock_cache, fl);
}
EXPORT_SYMBOL(locks_free_lock);

/*
 * Free a lease which is not in use by returning it to filelease_cache.
 * No release hooks are invoked here.
 */
void locks_free_lease(struct file_lease *fl)
{
        kmem_cache_free(filelease_cache, fl);
}
EXPORT_SYMBOL(locks_free_lease);

/*
 * Empty @dispose, freeing every entry on it. Entries flagged as a lease,
 * delegation or layout are freed as file_lease objects; everything else is
 * freed as a file_lock.
 */
static void
locks_dispose_list(struct list_head *dispose)
{
        while (!list_empty(dispose)) {
                struct file_lock_core *victim;

                victim = list_first_entry(dispose, struct file_lock_core, flc_list);
                list_del_init(&victim->flc_list);

                if (!(victim->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))) {
                        locks_free_lock(file_lock(victim));
                        continue;
                }
                locks_free_lease(file_lease(victim));
        }
}

/*
 * Reset @fl to an all-zero state and then run locks_init_lock_heads() on
 * its embedded lock core.
 */
void locks_init_lock(struct file_lock *fl)
{
        memset(fl, 0, sizeof(*fl));
        locks_init_lock_heads(&fl->c);
}
EXPORT_SYMBOL(locks_init_lock);

/*
 * Zero the whole of @fl and then run locks_init_lock_heads() on its
 * embedded lock core.
 */
void locks_init_lease(struct file_lease *fl)
{
        memset(fl, 0, sizeof(*fl));
        locks_init_lock_heads(&fl->c);
}
EXPORT_SYMBOL(locks_init_lease);

/*
 * Initialize a new lock from an existing file_lock structure.
 *
 * Owner, pid, flags, type, byte range and lock manager ops are taken from
 * @fl. The copy gets no file pointer and no fl_ops. When the lock manager
 * provides lm_get_owner, it is invoked on the owner of @fl.
 */
void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
        /* Core fields */
        new->c.flc_owner = fl->c.flc_owner;
        new->c.flc_pid = fl->c.flc_pid;
        new->c.flc_file = NULL;
        new->c.flc_flags = fl->c.flc_flags;
        new->c.flc_type = fl->c.flc_type;

        /* Range and operations */
        new->fl_start = fl->fl_start;
        new->fl_end = fl->fl_end;
        new->fl_lmops = fl->fl_lmops;
        new->fl_ops = NULL;

        if (fl->fl_lmops && fl->fl_lmops->lm_get_owner)
                fl->fl_lmops->lm_get_owner(fl->c.flc_owner);
}
EXPORT_SYMBOL(locks_copy_conflock);

/*
 * Full copy of @fl into @new: everything locks_copy_conflock() copies, plus
 * the file pointer and fl_ops. If fl_ops supplies fl_copy_lock, it is
 * called to let the filesystem copy its own state.
 */
void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
        /* A non-NULL fl_ops means "new" was not freshly initialized. */
        WARN_ON_ONCE(new->fl_ops);

        locks_copy_conflock(new, fl);

        new->fl_ops = fl->fl_ops;
        new->c.flc_file = fl->c.flc_file;

        if (fl->fl_ops && fl->fl_ops->fl_copy_lock)
                fl->fl_ops->fl_copy_lock(new, fl);
}
EXPORT_SYMBOL(locks_copy_lock);

/*
 * Transfer every request blocked on @fl over to @new and repoint each moved
 * waiter's flc_blocker at @new. Does nothing when @fl has no blocked
 * requests.
 */
static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
{
        struct file_lock *f;

        /*
         * As ctx->flc_lock is held, new requests cannot be added to
         * ->flc_blocked_requests, so we don't need a lock to check if it
         * is empty.
         */
        if (list_empty(&fl->c.flc_blocked_requests))
                return;
        /* The splice and the blocker fix-up are done under blocked_lock_lock. */
        spin_lock(&blocked_lock_lock);
        list_splice_init(&fl->c.flc_blocked_requests,
                         &new->c.flc_blocked_requests);
        list_for_each_entry(f, &new->c.flc_blocked_requests,
                            c.flc_blocked_member)
                f->c.flc_blocker = &new->c;
        spin_unlock(&blocked_lock_lock);
}

/*
 * Map a flock() operation (LOCK_SH, LOCK_EX or LOCK_UN) to the matching
 * F_RDLCK / F_WRLCK / F_UNLCK lock type. Any other value yields -EINVAL.
 */
static inline int flock_translate_cmd(int cmd)
{
        if (cmd == LOCK_SH)
                return F_RDLCK;
        if (cmd == LOCK_EX)
                return F_WRLCK;
        if (cmd == LOCK_UN)
                return F_UNLCK;

        return -EINVAL;
}

/*
 * Fill in a file_lock structure with an appropriate FLOCK lock.
 *
 * @fl is reinitialized first (which zeroes it, so fl_start stays 0), then
 * set up as an FL_FLOCK lock of @type with @filp as both file and owner,
 * the current task's tgid as pid, and fl_end set to OFFSET_MAX.
 */
static void flock_make_lock(struct file *filp, struct file_lock *fl, int type)
{
        locks_init_lock(fl);

        fl->c.flc_file = filp;
        fl->c.flc_owner = filp;
        fl->c.flc_pid = current->tgid;
        fl->c.flc_flags = FL_FLOCK;
        fl->c.flc_type = type;
        fl->fl_end = OFFSET_MAX;
}

/*
 * Store @type in @flc if it is one of F_RDLCK, F_WRLCK or F_UNLCK.
 *
 * Returns 0 on success. For any other value @flc is left untouched and
 * -EINVAL is returned.
 */
static int assign_type(struct file_lock_core *flc, int type)
{
        if (type != F_RDLCK && type != F_WRLCK && type != F_UNLCK)
                return -EINVAL;

        flc->flc_type = type;
        return 0;
}

/*
 * Verify a "struct flock64" and copy it to a "struct file_lock" as a POSIX
 * style lock.
 *
 * The start offset is resolved relative to l_whence: the start of the file,
 * the current file position, or the current inode size. The owner is set to
 * current->files and the pid to the current tgid.
 *
 * Returns 0 on success, -EINVAL for an unknown whence, a negative resulting
 * offset or an unrecognized lock type, and -EOVERFLOW when the range would
 * exceed OFFSET_MAX. Note that @fl may already have been partially written
 * when an error is returned.
 */
static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
                                 struct flock64 *l)
{
        /* Establish the base offset that l_start is relative to. */
        switch (l->l_whence) {
        case SEEK_SET:
                fl->fl_start = 0;
                break;
        case SEEK_CUR:
                fl->fl_start = filp->f_pos;
                break;
        case SEEK_END:
                fl->fl_start = i_size_read(file_inode(filp));
                break;
        default:
                return -EINVAL;
        }
        /* Check before adding so that base + l_start cannot pass OFFSET_MAX. */
        if (l->l_start > OFFSET_MAX - fl->fl_start)
                return -EOVERFLOW;
        fl->fl_start += l->l_start;
        if (fl->fl_start < 0)
                return -EINVAL;

        /* POSIX-1996 leaves the case l->l_len < 0 undefined;
           POSIX-2001 defines it. */
        if (l->l_len > 0) {
                /* Positive length: range is [start, start + len - 1]. */
                if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
                        return -EOVERFLOW;
                fl->fl_end = fl->fl_start + (l->l_len - 1);

        } else if (l->l_len < 0) {
                /* Negative length: range is [start + len, start - 1]. */
                if (fl->fl_start + l->l_len < 0)
                        return -EINVAL;
                fl->fl_end = fl->fl_start - 1;
                fl->fl_start += l->l_len;
        } else
                /* Zero length: range extends to OFFSET_MAX. */
                fl->fl_end = OFFSET_MAX;

        fl->c.flc_owner = current->files;
        fl->c.flc_pid = current->tgid;
        fl->c.flc_file = filp;
        fl->c.flc_flags = FL_POSIX;
        fl->fl_ops = NULL;
        fl->fl_lmops = NULL;

        /* The lock type is validated last; it is the final source of -EINVAL. */
        return assign_type(&fl->c, l->l_type);
}

/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
 * style lock.
 *
 * The type, whence, start and length fields of @l are copied into a
 * temporary struct flock64 (remaining fields zeroed by the initializer) and
 * the work is delegated to flock64_to_posix_lock(), whose return value is
 * passed through.
 */
static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
                               struct flock *l)
{
        struct flock64 ll = {
                .l_type = l->l_type,
                .l_whence = l->l_whence,
                .l_start = l->l_start,
                .l_len = l->l_len,
        };

        return flock64_to_posix_lock(filp, fl, &ll);
}

/* default lease lock manager operations */

/*
 * Default lm_break handler: signal the lease's fasync list with
 * SIGIO / POLL_MSG. Always returns false.
 */
static bool
lease_break_callback(struct file_lease *fl)
{
        kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);
        return false;
}

/*
 * Default lm_setup handler.
 *
 * @priv points at a fasync_struct pointer supplied by the caller. The entry
 * is offered to the lease's fasync list, and the current task's thread
 * group is then made the owner of the file via __f_setown().
 */
static void
lease_setup(struct file_lease *fl, void **priv)
{
        struct file *filp = fl->c.flc_file;
        struct fasync_struct *fa = *priv;

        /*
         * fasync_insert_entry() returns the old entry if any. If there was no
         * old entry, then it used "priv" and inserted it into the fasync list.
         * Clear the pointer to indicate that it shouldn't be freed.
         */
        if (!fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa))
                *priv = NULL;

        __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0);
}

/*
 * Default lock manager operations for leases; installed on a lease by
 * lease_init().
 */
static const struct lease_manager_operations lease_manager_ops = {
        .lm_break = lease_break_callback,
        .lm_change = lease_modify,
        .lm_setup = lease_setup,
};

/*
 * Initialize a lease, use the default lock manager operations.
 *
 * @filp becomes both the file and the owner of the lease, and the current
 * tgid is recorded as the pid. Returns 0 on success or -EINVAL when @type
 * is rejected by assign_type().
 */
static int lease_init(struct file *filp, int type, struct file_lease *fl)
{
        if (assign_type(&fl->c, type))
                return -EINVAL;

        fl->fl_lmops = &lease_manager_ops;

        fl->c.flc_flags = FL_LEASE;
        fl->c.flc_file = filp;
        fl->c.flc_owner = filp;
        fl->c.flc_pid = current->tgid;

        return 0;
}

/*
 * Allocate a file_lease and initialise it as a lease of the given type on
 * @filp.
 *
 * Returns the lease on success. On failure an ERR_PTR() is returned:
 * -ENOMEM when allocation fails, otherwise the error from lease_init()
 * (the lease is freed in that case).
 */
static struct file_lease *lease_alloc(struct file *filp, int type)
{
        struct file_lease *lease;
        int err;

        lease = locks_alloc_lease();
        if (!lease)
                return ERR_PTR(-ENOMEM);

        err = lease_init(filp, type, lease);
        if (!err)
                return lease;

        locks_free_lease(lease);
        return ERR_PTR(err);
}

/*
 * Check if two locks overlap each other. Both ends of each range are
 * treated as inclusive. Returns non-zero on overlap, zero otherwise.
 */
static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2)
{
        /* fl1 ends before fl2 begins: no overlap possible. */
        if (fl1->fl_end < fl2->fl_start)
                return 0;

        return fl2->fl_end >= fl1->fl_start;
}

/*
 * Check whether two locks have the same owner.
 *
 * Ownership is decided purely by comparing the flc_owner pointers; returns
 * non-zero when they are equal.
 */
static int posix_same_owner(struct file_lock_core *fl1, struct file_lock_core *fl2)
{
        return fl1->flc_owner == fl2->flc_owner;
}

/*
 * Add @flc to the current CPU's file_lock_list.
 *
 * Must be called with the flc_lock held! file_rwsem must also be held,
 * which is asserted below.
 */
static void locks_insert_global_locks(struct file_lock_core *flc)
{
        struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list);

        percpu_rwsem_assert_held(&file_rwsem);

        spin_lock(&fll->lock);
        /* Remember the CPU so that removal can find the same per-CPU list. */
        flc->flc_link_cpu = smp_processor_id();
        hlist_add_head(&flc->flc_link, &fll->hlist);
        spin_unlock(&fll->lock);
}

/*
 * Remove @flc from the per-CPU file_lock_list it was added to, if it is
 * still hashed there.
 *
 * Must be called with the flc_lock held! file_rwsem must be held as well.
 */
static void locks_delete_global_locks(struct file_lock_core *flc)
{
        percpu_rwsem_assert_held(&file_rwsem);

        /*
         * Only take the per-CPU list lock when there is something to unhash.
         * Testing without that lock is safe because the test runs under the
         * flc_lock, which insertions into the list also require.
         */
        if (!hlist_unhashed(&flc->flc_link)) {
                struct file_lock_list_struct *cpu_list;

                cpu_list = per_cpu_ptr(&file_lock_list, flc->flc_link_cpu);
                spin_lock(&cpu_list->lock);
                hlist_del_init(&flc->flc_link);
                spin_unlock(&cpu_list->lock);
        }
}

/*
 * Hash key for blocked_hash: the lock owner pointer converted to an
 * unsigned long.
 */
static unsigned long
posix_owner_key(struct file_lock_core *flc)
{
        return (unsigned long) flc->flc_owner;
}

/*
 * Add @waiter to blocked_hash, keyed by its owner.
 * Must be called with blocked_lock_lock held (asserted via lockdep).
 */
static void locks_insert_global_blocked(struct file_lock_core *waiter)
{
        lockdep_assert_held(&blocked_lock_lock);

        hash_add(blocked_hash, &waiter->flc_link, posix_owner_key(waiter));
}

/*
 * Remove @waiter from the hash it is linked into through flc_link.
 * Must be called with blocked_lock_lock held (asserted via lockdep).
 */
static void locks_delete_global_blocked(struct file_lock_core *waiter)
{
        lockdep_assert_held(&blocked_lock_lock);

        hash_del(&waiter->flc_link);
}

/* Remove waiter from blocker's block list.
 * When blocker ends up pointing to itself then the list is empty.
 *
 * The waiter is also dropped from the global blocked hash. Its flc_blocker
 * pointer is not touched here; callers clear it themselves.
 *
 * Must be called with blocked_lock_lock held.
 */
static void __locks_unlink_block(struct file_lock_core *waiter)
{
        locks_delete_global_blocked(waiter);
        list_del_init(&waiter->flc_blocked_member);
}

/*
 * Detach and notify every request queued on @blocker's
 * flc_blocked_requests list, in list order.
 *
 * Each waiter is unlinked first, then either handed to the lock manager's
 * lm_notify callback (POSIX/FLOCK waiters that have one) or woken with
 * locks_wake_up(). Every caller in this file invokes this with
 * blocked_lock_lock held.
 */
static void __locks_wake_up_blocks(struct file_lock_core *blocker)
{
        while (!list_empty(&blocker->flc_blocked_requests)) {
                struct file_lock_core *waiter;
                struct file_lock *fl;

                waiter = list_first_entry(&blocker->flc_blocked_requests,
                                          struct file_lock_core, flc_blocked_member);

                fl = file_lock(waiter);
                __locks_unlink_block(waiter);
                if ((waiter->flc_flags & (FL_POSIX | FL_FLOCK)) &&
                    fl->fl_lmops && fl->fl_lmops->lm_notify)
                        fl->fl_lmops->lm_notify(fl);
                else
                        locks_wake_up(fl);

                /*
                 * The setting of flc_blocker to NULL marks the "done"
                 * point in deleting a block. Paired with acquire at the top
                 * of locks_delete_block().
                 */
                smp_store_release(&waiter->flc_blocker, NULL);
        }
}

/*
 * Stop @waiter from waiting: wake anything blocked on it, unlink it from
 * its blocker and clear its flc_blocker pointer.
 *
 * Returns 0 if @waiter was still blocked when blocked_lock_lock was taken,
 * -ENOENT otherwise (including the lockless fast path below).
 */
static int __locks_delete_block(struct file_lock_core *waiter)
{
        int status = -ENOENT;

        /*
         * If flc_blocker is NULL, it won't be set again as this thread "owns"
         * the lock and is the only one that might try to claim the lock.
         *
         * We use acquire/release to manage flc_blocker so that we can
         * optimize away taking the blocked_lock_lock in many cases.
         *
         * The smp_load_acquire guarantees two things:
         *
         * 1/ that flc_blocked_requests can be tested locklessly. If something
         * was recently added to that list it must have been in a locked region
         * *before* the locked region when flc_blocker was set to NULL.
         *
         * 2/ that no other thread is accessing 'waiter', so it is safe to free
         * it.  __locks_wake_up_blocks is careful not to touch waiter after
         * flc_blocker is released.
         *
         * If a lockless check of flc_blocker shows it to be NULL, we know that
         * no new locks can be inserted into its flc_blocked_requests list, and
         * can avoid doing anything further if the list is empty.
         */
        if (!smp_load_acquire(&waiter->flc_blocker) &&
            list_empty(&waiter->flc_blocked_requests))
                return status;

        spin_lock(&blocked_lock_lock);
        /* Re-read under the lock: only a still-blocked waiter reports success. */
        if (waiter->flc_blocker)
                status = 0;
        __locks_wake_up_blocks(waiter);
        __locks_unlink_block(waiter);

        /*
         * The setting of flc_blocker to NULL marks the "done" point in deleting
         * a block. Paired with acquire at the top of this function.
         */
        smp_store_release(&waiter->flc_blocker, NULL);
        spin_unlock(&blocked_lock_lock);
        return status;
}

/**
 *        locks_delete_block - stop waiting for a file lock
 *        @waiter: the lock which was waiting
 *
 *        lockd/nfsd need to disconnect the lock while working on it.
 *
 *        Return: the result of __locks_delete_block() on the embedded lock
 *        core: 0 if @waiter was still blocked, -ENOENT otherwise.
 */
int locks_delete_block(struct file_lock *waiter)
{
        return __locks_delete_block(&waiter->c);
}
EXPORT_SYMBOL(locks_delete_block);

/* Insert waiter into blocker's block list.
 * We use a circular list so that processes can be easily woken up in
 * the order they blocked. The documentation doesn't require this but
 * it seems like the reasonable thing to do.
 *
 * Must be called with both the flc_lock and blocked_lock_lock held. The
 * flc_blocked_requests list itself is protected by the blocked_lock_lock,
 * but by ensuring that the flc_lock is also held on insertions we can avoid
 * taking the blocked_lock_lock in some cases when we see that the
 * flc_blocked_requests list is empty.
 *
 * Rather than just adding to the list, we check for conflicts with any existing
 * waiters, and add beneath any waiter that blocks the new waiter.
 * Thus wakeups don't happen until needed.
 *
 * @conflict is the predicate used to decide whether an existing waiter
 * blocks the new one; it is called as conflict(existing, waiter).
 */
static void __locks_insert_block(struct file_lock_core *blocker,
                                 struct file_lock_core *waiter,
                                 bool conflict(struct file_lock_core *,
                                               struct file_lock_core *))
{
        struct file_lock_core *flc;

        BUG_ON(!list_empty(&waiter->flc_blocked_member));
new_blocker:
        /*
         * Descend: whenever an existing waiter conflicts with the new one,
         * that waiter becomes the blocker and the scan restarts on its list.
         */
        list_for_each_entry(flc, &blocker->flc_blocked_requests, flc_blocked_member)
                if (conflict(flc, waiter)) {
                        blocker =  flc;
                        goto new_blocker;
                }
        waiter->flc_blocker = blocker;
        list_add_tail(&waiter->flc_blocked_member,
                      &blocker->flc_blocked_requests);

        /* Only waiters behind a plain POSIX (non-OFD) blocker are hashed. */
        if ((blocker->flc_flags & (FL_POSIX|FL_OFDLCK)) == FL_POSIX)
                locks_insert_global_blocked(waiter);

        /* The requests in waiter->flc_blocked_requests are known to conflict
         * with waiter, but might not conflict with blocker, or the requests
         * and lock which block it.  So they all need to be woken.
         */
        __locks_wake_up_blocks(waiter);
}

/*
 * Locked wrapper around __locks_insert_block(): takes blocked_lock_lock
 * for the duration of the insertion.
 *
 * Must be called with flc_lock held.
 */
static void locks_insert_block(struct file_lock_core *blocker,
                               struct file_lock_core *waiter,
                               bool conflict(struct file_lock_core *,
                                             struct file_lock_core *))
{
        spin_lock(&blocked_lock_lock);
        __locks_insert_block(blocker, waiter, conflict);
        spin_unlock(&blocked_lock_lock);
}

/*
 * Wake up processes blocked waiting for blocker.
 *
 * Must be called with the inode->flc_lock held!
 */
static void locks_wake_up_blocks(struct file_lock_core *blocker)
{
        /*
         * The global lock is only taken when the list looks non-empty. The
         * unlocked test is safe because blocked requests are only ever added
         * under the flc_lock, which is held here. Removal does not need the
         * flc_lock, so the list may have emptied by the time
         * blocked_lock_lock is acquired; __locks_wake_up_blocks() copes with
         * that by testing list_empty() itself.
         */
        if (!list_empty(&blocker->flc_blocked_requests)) {
                spin_lock(&blocked_lock_lock);
                __locks_wake_up_blocks(blocker);
                spin_unlock(&blocked_lock_lock);
        }
}

/*
 * Link @fl into a lock context list immediately ahead of @before and add
 * it to the global per-CPU lock list.
 */
static void
locks_insert_lock_ctx(struct file_lock_core *fl, struct list_head *before)
{
        list_add_tail(&fl->flc_list, before);
        locks_insert_global_locks(fl);
}

/*
 * Take @fl off the global lock list and off its context list, then wake
 * any requests that were blocked on it. The lock itself is not freed.
 */
static void
locks_unlink_lock_ctx(struct file_lock_core *fl)
{
        locks_delete_global_locks(fl);
        list_del_init(&fl->flc_list);
        locks_wake_up_blocks(fl);
}

/*
 * Unlink @fl from its context and get rid of it: with a @dispose list the
 * lock is queued there for later freeing, without one it is freed
 * immediately.
 */
static void
locks_delete_lock_ctx(struct file_lock_core *fl, struct list_head *dispose)
{
        locks_unlink_lock_ctx(fl);

        if (!dispose) {
                locks_free_lock(file_lock(fl));
                return;
        }
        list_add(&fl->flc_list, dispose);
}

/*
 * Determine if lock sys_flc blocks lock caller_flc. Common functionality
 * checks for shared/exclusive status of overlapping locks: the two
 * conflict as soon as either of them is a write lock.
 */
static bool locks_conflict(struct file_lock_core *caller_flc,
                           struct file_lock_core *sys_flc)
{
        return sys_flc->flc_type == F_WRLCK ||
               caller_flc->flc_type == F_WRLCK;
}

/*
 * Determine if lock sys_flc blocks lock caller_flc, POSIX flavour.
 *
 * Locks with the same owner never conflict with each other, and neither do
 * locks whose ranges do not overlap. Anything left is decided by
 * locks_conflict().
 */
static bool posix_locks_conflict(struct file_lock_core *caller_flc,
                                 struct file_lock_core *sys_flc)
{
        struct file_lock *caller_fl = file_lock(caller_flc);
        struct file_lock *sys_fl = file_lock(sys_flc);

        return !posix_same_owner(caller_flc, sys_flc) &&
               locks_overlap(caller_fl, sys_fl) &&
               locks_conflict(caller_flc, sys_flc);
}

/*
 * Determine if lock sys_fl blocks lock caller_fl. Used on xx_GETLK
 * path so checks for additional GETLK-specific things like F_UNLCK.
 */
static bool posix_test_locks_conflict(struct file_lock *caller_fl,
                                      struct file_lock *sys_fl)
{
        struct file_lock_core *caller = &caller_fl->c;
        struct file_lock_core *sys = &sys_fl->c;

        if (!lock_is_unlock(caller_fl))
                return posix_locks_conflict(caller, sys);

        /*
         * An F_UNLCK query matches locks with the same owner as the caller
         * whose range overlaps the queried range.
         */
        return posix_same_owner(caller, sys) &&
               locks_overlap(caller_fl, sys_fl);
}

/*
 * Determine if lock sys_flc blocks lock caller_flc, FLOCK flavour.
 *
 * Two FLOCK locks that refer to the same filp never conflict with each
 * other; for different filps the answer comes from locks_conflict().
 */
static bool flock_locks_conflict(struct file_lock_core *caller_flc,
                                 struct file_lock_core *sys_flc)
{
        if (caller_flc->flc_file != sys_flc->flc_file)
                return locks_conflict(caller_flc, sys_flc);

        return false;
}

/*
 * Test whether the POSIX lock described by @fl would conflict with a lock
 * already present on the inode behind @filp.
 *
 * On return @fl either has its type set to F_UNLCK (no conflicting lock
 * was found) or has been overwritten, via locks_copy_conflock(), with the
 * details of the first conflicting lock on the list.
 */
void
posix_test_lock(struct file *filp, struct file_lock *fl)
{
        struct file_lock *cfl;
        struct file_lock_context *ctx;
        struct inode *inode = file_inode(filp);
        void *owner;
        void (*func)(void);

        ctx = locks_inode_context(inode);
        /* No lock context, or no POSIX locks at all: nothing can conflict. */
        if (!ctx || list_empty_careful(&ctx->flc_posix)) {
                fl->c.flc_type = F_UNLCK;
                return;
        }

retry:
        spin_lock(&ctx->flc_lock);
        list_for_each_entry(cfl, &ctx->flc_posix, c.flc_list) {
                if (!posix_test_locks_conflict(fl, cfl))
                        continue;
                /*
                 * The lock manager reports the conflicting lock as
                 * expirable: drop flc_lock, run its expire hook with a
                 * reference held on the owning module, then rescan the
                 * list from the top.
                 */
                if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable
                        && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) {
                        owner = cfl->fl_lmops->lm_mod_owner;
                        func = cfl->fl_lmops->lm_expire_lock;
                        __module_get(owner);
                        spin_unlock(&ctx->flc_lock);
                        (*func)();
                        module_put(owner);
                        goto retry;
                }
                locks_copy_conflock(fl, cfl);
                goto out;
        }
        /* Walked the whole list without finding a conflict. */
        fl->c.flc_type = F_UNLCK;
out:
        spin_unlock(&ctx->flc_lock);
        return;
}
EXPORT_SYMBOL(posix_test_lock);

/*
 * Deadlock detection:
 *
 * We attempt to detect deadlocks that are due purely to posix file
 * locks.
 *
 * We assume that a task can be waiting for at most one lock at a time.
 * So for any acquired lock, the process holding that lock may be
 * waiting on at most one other lock.  That lock in turn may be held by
 * someone waiting for at most one other lock.  Given a requested lock
 * caller_fl which is about to wait for a conflicting lock block_fl, we
 * follow this chain of waiters to ensure we are not about to create a
 * cycle.
 *
 * Since we do this before we ever put a process to sleep on a lock, we
 * are ensured that there is never a cycle; that is what guarantees that
 * the while() loop in posix_locks_deadlock() eventually completes.
 *
 * Note: the above assumption may not be true when handling lock
 * requests from a broken NFS client. It may also fail in the presence
 * of tasks (such as posix threads) sharing the same open file table.
 * To handle those cases, we just bail out after a few iterations.
 *
 * For FL_OFDLCK locks, the owner is the filp, not the files_struct.
 * Because the owner is not even nominally tied to a thread of
 * execution, the deadlock detection below can't reasonably work well. Just
 * skip it for those.
 *
 * In principle, we could do a more limited deadlock detection on FL_OFDLCK
 * locks that just checks for the case where two tasks are attempting to
 * upgrade from read to write locks on the same inode.
 */

/*
 * Bound on how many links of the waiter chain posix_locks_deadlock()
 * follows before it gives up and reports "no deadlock".
 */
#define MAX_DEADLK_ITERATIONS 10

/*
 * Find a lock that the owner of the given @blocker is waiting on.
 *
 * Looks in blocked_hash for a waiting request with the same owner as
 * @blocker, follows its flc_blocker links to the end of the chain and
 * returns the lock found there. Returns NULL when the owner has no entry
 * in the hash.
 */
static struct file_lock_core *what_owner_is_waiting_for(struct file_lock_core *blocker)
{
        struct file_lock_core *cur;

        hash_for_each_possible(blocked_hash, cur, flc_link, posix_owner_key(blocker)) {
                /* Same hash bucket, different owner: keep looking. */
                if (!posix_same_owner(cur, blocker))
                        continue;

                while (cur->flc_blocker)
                        cur = cur->flc_blocker;
                return cur;
        }
        return NULL;
}

/*
 * Check whether letting @caller_fl wait on @block_fl would create a cycle
 * of waiters.
 *
 * Returns true when following the chain of "what is this owner waiting
 * for" reaches a lock with the same owner as @caller_fl. Returns false
 * when the chain ends, when the iteration bound is exceeded, or when
 * @caller_fl is an FL_OFDLCK lock.
 *
 * Must be called with the blocked_lock_lock held!
 */
static bool posix_locks_deadlock(struct file_lock *caller_fl,
                                 struct file_lock *block_fl)
{
        struct file_lock_core *caller = &caller_fl->c;
        struct file_lock_core *blocker = &block_fl->c;
        int i = 0;

        lockdep_assert_held(&blocked_lock_lock);

        /*
         * This deadlock detector can't reasonably detect deadlocks with
         * FL_OFDLCK locks, since they aren't owned by a process, per-se.
         */
        if (caller->flc_flags & FL_OFDLCK)
                return false;

        while ((blocker = what_owner_is_waiting_for(blocker))) {
                /* Give up rather than loop forever on an unexpected chain. */
                if (i++ > MAX_DEADLK_ITERATIONS)
                        return false;
                if (posix_same_owner(caller, blocker))
                        return true;
        }
        return false;
}

/* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
 * after any leases, but before any posix locks.
 *
 * Note that if called with an FL_EXISTS argument, the caller may determine
 * whether or not a lock was successfully freed by testing the return
 * value for -ENOENT.
 *
 * Returns 0 on success, -ENOMEM if the lock context or the new lock cannot
 * be allocated, -EAGAIN on a conflict when FL_SLEEP is not set, and
 * FILE_LOCK_DEFERRED when the request has been queued behind a conflicting
 * lock.
 */
static int flock_lock_inode(struct inode *inode, struct file_lock *request)
{
        struct file_lock *new_fl = NULL;
        struct file_lock *fl;
        struct file_lock_context *ctx;
        int error = 0;
        bool found = false;
        LIST_HEAD(dispose);

        ctx = locks_get_lock_context(inode, request->c.flc_type);
        if (!ctx) {
                /* Without a context, an unlock request has nothing to remove. */
                if (request->c.flc_type != F_UNLCK)
                        return -ENOMEM;
                return (request->c.flc_flags & FL_EXISTS) ? -ENOENT : 0;
        }

        /* Allocate up front, before any locks are taken. */
        if (!(request->c.flc_flags & FL_ACCESS) && (request->c.flc_type != F_UNLCK)) {
                new_fl = locks_alloc_lock();
                if (!new_fl)
                        return -ENOMEM;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        /* FL_ACCESS only tests for conflicts; skip the removal pass. */
        if (request->c.flc_flags & FL_ACCESS)
                goto find_conflict;

        /*
         * Look for an existing flock lock on the same filp. One of the same
         * type means there is nothing to do; one of a different type is
         * removed here.
         */
        list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) {
                if (request->c.flc_file != fl->c.flc_file)
                        continue;
                if (request->c.flc_type == fl->c.flc_type)
                        goto out;
                found = true;
                locks_delete_lock_ctx(&fl->c, &dispose);
                break;
        }

        if (lock_is_unlock(request)) {
                if ((request->c.flc_flags & FL_EXISTS) && !found)
                        error = -ENOENT;
                goto out;
        }

find_conflict:
        list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) {
                if (!flock_locks_conflict(&request->c, &fl->c))
                        continue;
                error = -EAGAIN;
                if (!(request->c.flc_flags & FL_SLEEP))
                        goto out;
                /* Caller is willing to wait: queue behind the conflicting lock. */
                error = FILE_LOCK_DEFERRED;
                locks_insert_block(&fl->c, &request->c, flock_locks_conflict);
                goto out;
        }
        if (request->c.flc_flags & FL_ACCESS)
                goto out;
        /* No conflict: install the preallocated lock in place of the request. */
        locks_copy_lock(new_fl, request);
        locks_move_blocks(new_fl, request);
        locks_insert_lock_ctx(&new_fl->c, &ctx->flc_flock);
        /* Ownership passed to the context list; don't free it below. */
        new_fl = NULL;
        error = 0;

out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        /* Freeing is deferred until after the locks have been dropped. */
        if (new_fl)
                locks_free_lock(new_fl);
        locks_dispose_list(&dispose);
        trace_flock_lock_inode(inode, request, error);
        return error;
}

static int posix_lock_inode(struct inode *inode, struct file_lock *request,
                            struct file_lock *conflock)
{
        struct file_lock *fl, *tmp;
        struct file_lock *new_fl = NULL;
        struct file_lock *new_fl2 = NULL;
        struct file_lock *left = NULL;
        struct file_lock *right = NULL;
        struct file_lock_context *ctx;
        int error;
        bool added = false;
        LIST_HEAD(dispose);
        void *owner;
        void (*func)(void);

        ctx = locks_get_lock_context(inode, request->c.flc_type);
        if (!ctx)
                return lock_is_unlock(request) ? 0 : -ENOMEM;

        /*
         * We may need two file_lock structures for this operation,
         * so we get them in advance to avoid races.
         *
         * In some cases we can be sure, that no new locks will be needed
         */
        if (!(request->c.flc_flags & FL_ACCESS) &&
            (request->c.flc_type != F_UNLCK ||
             request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
                new_fl = locks_alloc_lock();
                new_fl2 = locks_alloc_lock();
        }

retry:
        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        /*
         * New lock request. Walk all POSIX locks and look for conflicts. If
         * there are any, either return error or put the request on the
         * blocker's list of waiters and the global blocked_hash.
         */
        if (request->c.flc_type != F_UNLCK) {
                list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) {
                        if (!posix_locks_conflict(&request->c, &fl->c))
                                continue;
                        if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable
                                && (*fl->fl_lmops->lm_lock_expirable)(fl)) {
                                owner = fl->fl_lmops->lm_mod_owner;
                                func = fl->fl_lmops->lm_expire_lock;
                                __module_get(owner);
                                spin_unlock(&ctx->flc_lock);
                                percpu_up_read(&file_rwsem);
                                (*func)();
                                module_put(owner);
                                goto retry;
                        }
                        if (conflock)
                                locks_copy_conflock(conflock, fl);
                        error = -EAGAIN;
                        if (!(request->c.flc_flags & FL_SLEEP))
                                goto out;
                        /*
                         * Deadlock detection and insertion into the blocked
                         * locks list must be done while holding the same lock!
                         */
                        error = -EDEADLK;
                        spin_lock(&blocked_lock_lock);
                        /*
                         * Ensure that we don't find any locks blocked on this
                         * request during deadlock detection.
                         */
                        __locks_wake_up_blocks(&request->c);
                        if (likely(!posix_locks_deadlock(request, fl))) {
                                error = FILE_LOCK_DEFERRED;
                                __locks_insert_block(&fl->c, &request->c,
                                                     posix_locks_conflict);
                        }
                        spin_unlock(&blocked_lock_lock);
                        goto out;
                }
        }

        /* If we're just looking for a conflict, we're done. */
        error = 0;
        if (request->c.flc_flags & FL_ACCESS)
                goto out;

        /* Find the first old lock with the same owner as the new lock */
        list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) {
                if (posix_same_owner(&request->c, &fl->c))
                        break;
        }

        /* Process locks with this owner. */
        list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, c.flc_list) {
                if (!posix_same_owner(&request->c, &fl->c))
                        break;

                /* Detect adjacent or overlapping regions (if same lock type) */
                if (request->c.flc_type == fl->c.flc_type) {
                        /* In all comparisons of start vs end, use
                         * "start - 1" rather than "end + 1". If end
                         * is OFFSET_MAX, end + 1 will become negative.
                         */
                        if (fl->fl_end < request->fl_start - 1)
                                continue;
                        /* If the next lock in the list has entirely bigger
                         * addresses than the new one, insert the lock here.
                         */
                        if (fl->fl_start - 1 > request->fl_end)
                                break;

                        /* If we come here, the new and old lock are of the
                         * same type and adjacent or overlapping. Make one
                         * lock yielding from the lower start address of both
                         * locks to the higher end address.
                         */
                        if (fl->fl_start > request->fl_start)
                                fl->fl_start = request->fl_start;
                        else
                                request->fl_start = fl->fl_start;
                        if (fl->fl_end < request->fl_end)
                                fl->fl_end = request->fl_end;
                        else
                                request->fl_end = fl->fl_end;
                        if (added) {
                                locks_delete_lock_ctx(&fl->c, &dispose);
                                continue;
                        }
                        request = fl;
                        added = true;
                } else {
                        /* Processing for different lock types is a bit
                         * more complex.
                         */
                        if (fl->fl_end < request->fl_start)
                                continue;
                        if (fl->fl_start > request->fl_end)
                                break;
                        if (lock_is_unlock(request))
                                added = true;
                        if (fl->fl_start < request->fl_start)
                                left = fl;
                        /* If the next lock in the list has a higher end
                         * address than the new one, insert the new one here.
                         */
                        if (fl->fl_end > request->fl_end) {
                                right = fl;
                                break;
                        }
                        if (fl->fl_start >= request->fl_start) {
                                /* The new lock completely replaces an old
                                 * one (This may happen several times).
                                 */
                                if (added) {
                                        locks_delete_lock_ctx(&fl->c, &dispose);
                                        continue;
                                }
                                /*
                                 * Replace the old lock with new_fl, and
                                 * remove the old one. It's safe to do the
                                 * insert here since we know that we won't be
                                 * using new_fl later, and that the lock is
                                 * just replacing an existing lock.
                                 */
                                error = -ENOLCK;
                                if (!new_fl)
                                        goto out;
                                locks_copy_lock(new_fl, request);
                                locks_move_blocks(new_fl, request);
                                request = new_fl;
                                new_fl = NULL;
                                locks_insert_lock_ctx(&request->c,
                                                      &fl->c.flc_list);
                                locks_delete_lock_ctx(&fl->c, &dispose);
                                added = true;
                        }
                }
        }

        /*
         * The above code only modifies existing locks in case of merging or
         * replacing. If new lock(s) need to be inserted all modifications are
         * done below this, so it's safe yet to bail out.
         */
        error = -ENOLCK; /* "no luck" */
        if (right && left == right && !new_fl2)
                goto out;

        error = 0;
        if (!added) {
                if (lock_is_unlock(request)) {
                        if (request->c.flc_flags & FL_EXISTS)
                                error = -ENOENT;
                        goto out;
                }

                if (!new_fl) {
                        error = -ENOLCK;
                        goto out;
                }
                locks_copy_lock(new_fl, request);
                locks_move_blocks(new_fl, request);
                locks_insert_lock_ctx(&new_fl->c, &fl->c.flc_list);
                fl = new_fl;
                new_fl = NULL;
        }
        if (right) {
                if (left == right) {
                        /* The new lock breaks the old one in two pieces,
                         * so we have to use the second new lock.
                         */
                        left = new_fl2;
                        new_fl2 = NULL;
                        locks_copy_lock(left, right);
                        locks_insert_lock_ctx(&left->c, &fl->c.flc_list);
                }
                right->fl_start = request->fl_end + 1;
                locks_wake_up_blocks(&right->c);
        }
        if (left) {
                left->fl_end = request->fl_start - 1;
                locks_wake_up_blocks(&left->c);
        }
 out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        trace_posix_lock_inode(inode, request, error);
        /*
         * Free any unused locks.
         */
        if (new_fl)
                locks_free_lock(new_fl);
        if (new_fl2)
                locks_free_lock(new_fl2);
        locks_dispose_list(&dispose);

        return error;
}

/**
 * posix_lock_file - Apply a POSIX-style lock to a file
 * @filp: The file to apply the lock to
 * @fl: The lock to be applied
 * @conflock: Place to return a copy of the conflicting lock, if found.
 *
 * Thin wrapper that resolves @filp to its inode and hands the request to
 * posix_lock_inode(), which adds the lock and merges adjacent and
 * overlapping locks whenever possible.
 * POSIX locks are sorted by owner task, then by starting address.
 *
 * When called with FL_EXISTS set in the request, the caller can tell
 * whether a lock was actually released by checking the return value
 * for -ENOENT.
 */
int posix_lock_file(struct file *filp, struct file_lock *fl,
                        struct file_lock *conflock)
{
        struct inode *inode = file_inode(filp);

        return posix_lock_inode(inode, fl, conflock);
}
EXPORT_SYMBOL(posix_lock_file);

/**
 * posix_lock_inode_wait - Apply a POSIX-style lock to a file
 * @inode: inode of file to which lock request should be applied
 * @fl: The lock to be applied
 *
 * Keeps submitting the request to posix_lock_inode() for as long as it
 * comes back as FILE_LOCK_DEFERRED, sleeping interruptibly in between
 * until @fl is no longer on a blocker's list. Whatever the outcome,
 * @fl is taken off any blocked list before returning.
 *
 * Returns the final posix_lock_inode() result, or the non-zero value
 * from the interrupted wait.
 */
static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        int error;

        might_sleep();
        do {
                error = posix_lock_inode(inode, fl, NULL);
                if (error != FILE_LOCK_DEFERRED)
                        break;
                /* Queued behind a blocker: sleep until we are unblocked. */
                error = wait_event_interruptible(fl->c.flc_wait,
                                                 list_empty(&fl->c.flc_blocked_member));
        } while (!error);

        locks_delete_block(fl);
        return error;
}

/*
 * Clear the "break pending" flags that no longer apply once a lease has
 * been changed to type @arg:
 *   F_UNLCK - clears both FL_UNLOCK_PENDING and FL_DOWNGRADE_PENDING
 *   F_RDLCK - clears FL_DOWNGRADE_PENDING only
 * Any other @arg leaves the flags untouched.
 */
static void lease_clear_pending(struct file_lease *fl, int arg)
{
        if (arg == F_UNLCK)
                fl->c.flc_flags &= ~FL_UNLOCK_PENDING;

        if (arg == F_UNLCK || arg == F_RDLCK)
                fl->c.flc_flags &= ~FL_DOWNGRADE_PENDING;
}

/* We already had a lease on this file; just change its type */
int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
{
        int error = assign_type(&fl->c, arg);

        if (error)
                return error;
        lease_clear_pending(fl, arg);
        locks_wake_up_blocks(&fl->c);
        if (arg == F_UNLCK) {
                struct file *filp = fl->c.flc_file;

                f_delown(filp);
                filp->f_owner.signum = 0;
                fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync);
                if (fl->fl_fasync != NULL) {
                        printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
                        fl->fl_fasync = NULL;
                }
                locks_delete_lock_ctx(&fl->c, dispose);
        }
        return 0;
}
EXPORT_SYMBOL(lease_modify);

/*
 * Report whether the jiffies deadline @then has passed.
 * A deadline of 0 is a special value meaning "this never expires".
 */
static bool past_time(unsigned long then)
{
        return then != 0 && time_after(jiffies, then);
}

/*
 * Apply overdue lease breaks on @inode.
 *
 * Every lease whose downgrade deadline has passed is changed to F_RDLCK,
 * and every lease whose break deadline has passed is changed to F_UNLCK.
 * Leases removed in the process are collected on @dispose.
 *
 * The caller must hold the inode's flc_lock.
 */
static void time_out_leases(struct inode *inode, struct list_head *dispose)
{
        struct file_lock_context *ctx = inode->i_flctx;
        struct file_lease *lease, *next;

        lockdep_assert_held(&ctx->flc_lock);

        /* Safe iteration: an F_UNLCK modify may unlink the current entry. */
        list_for_each_entry_safe(lease, next, &ctx->flc_lease, c.flc_list) {
                trace_time_out_leases(inode, lease);

                if (past_time(lease->fl_downgrade_time))
                        lease_modify(lease, F_RDLCK, dispose);

                if (past_time(lease->fl_break_time))
                        lease_modify(lease, F_UNLCK, dispose);
        }
}

/*
 * Decide whether the existing lease @lc conflicts with the breaker @bc.
 *
 * No conflict is reported when:
 *   - the lease manager says the breaker owns the lease (not traced),
 *   - exactly one of the two is a layout (FL_LAYOUT),
 *   - the breaker only targets delegations (FL_DELEG) and @lc is a plain
 *     lease (FL_LEASE).
 * Otherwise the decision is left to locks_conflict().
 */
static bool leases_conflict(struct file_lock_core *lc, struct file_lock_core *bc)
{
        struct file_lease *lease = file_lease(lc);
        struct file_lease *breaker = file_lease(bc);
        bool rc;

        if (lease->fl_lmops->lm_breaker_owns_lease &&
            lease->fl_lmops->lm_breaker_owns_lease(lease))
                return false;

        if ((bc->flc_flags & FL_LAYOUT) != (lc->flc_flags & FL_LAYOUT))
                rc = false;
        else if ((bc->flc_flags & FL_DELEG) && (lc->flc_flags & FL_LEASE))
                rc = false;
        else
                rc = locks_conflict(bc, lc);

        trace_leases_conflict(rc, lease, breaker);
        return rc;
}

static bool
any_leases_conflict(struct inode *inode, struct file_lease *breaker)
{
        struct file_lock_context *ctx = inode->i_flctx;
        struct file_lock_core *flc;

        lockdep_assert_held(&ctx->flc_lock);

        list_for_each_entry(flc, &ctx->flc_lease, flc_list) {
                if (leases_conflict(flc, &breaker->c))
                        return true;
        }
        return false;
}

/**
 *        __break_lease        -        revoke all outstanding leases on file
 *        @inode: the inode of the file whose leases are to be broken
 *        @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR:
 *            break all leases
 *        @type: FL_LEASE: break leases and delegations; FL_DELEG: break
 *            only delegations
 *
 *        break_lease (inlined for speed) has checked there already is at least
 *        some kind of lock (maybe a lease) on this file.  Leases are broken on
 *        a call to open() or truncate().  This function can sleep unless you
 *        specified %O_NONBLOCK to your open().
 *
 *        Returns 0 when no conflicting lease remains (or the inode has no
 *        lock context), -EWOULDBLOCK when %O_NONBLOCK is set and leases are
 *        still present, a negative value from the interruptible wait, or
 *        the error from lease_alloc().
 */
int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
        int error = 0;
        struct file_lock_context *ctx;
        struct file_lease *new_fl, *fl, *tmp;
        unsigned long break_time;
        int want_write = (mode & O_ACCMODE) != O_RDONLY;
        LIST_HEAD(dispose);

        /*
         * new_fl is a temporary lease describing the breaker. It is only
         * used for conflict checks and as the waiter while blocked, and is
         * always freed before returning.
         */
        new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
        if (IS_ERR(new_fl))
                return PTR_ERR(new_fl);
        new_fl->c.flc_flags = type;

        /* typically we will check that ctx is non-NULL before calling */
        ctx = locks_inode_context(inode);
        if (!ctx) {
                WARN_ON_ONCE(1);
                goto free_lock;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);

        /* Apply any breaks that are already overdue before checking. */
        time_out_leases(inode, &dispose);

        if (!any_leases_conflict(inode, new_fl))
                goto out;

        /* Absolute deadline (in jiffies) handed to the leases we break. */
        break_time = 0;
        if (lease_break_time > 0) {
                break_time = jiffies + lease_break_time * HZ;
                if (break_time == 0)
                        break_time++;        /* so that 0 means no break time */
        }

        /*
         * Mark every conflicting lease as "break pending" and notify its
         * lease manager. Leases that already carry the relevant pending
         * state are skipped so their original deadline is kept.
         */
        list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) {
                if (!leases_conflict(&fl->c, &new_fl->c))
                        continue;
                if (want_write) {
                        if (fl->c.flc_flags & FL_UNLOCK_PENDING)
                                continue;
                        fl->c.flc_flags |= FL_UNLOCK_PENDING;
                        fl->fl_break_time = break_time;
                } else {
                        if (lease_breaking(fl))
                                continue;
                        fl->c.flc_flags |= FL_DOWNGRADE_PENDING;
                        fl->fl_downgrade_time = break_time;
                }
                /* A non-zero lm_break() result means: drop the lease now. */
                if (fl->fl_lmops->lm_break(fl))
                        locks_delete_lock_ctx(&fl->c, &dispose);
        }

        if (list_empty(&ctx->flc_lease))
                goto out;

        if (mode & O_NONBLOCK) {
                trace_break_lease_noblock(inode, new_fl);
                error = -EWOULDBLOCK;
                goto out;
        }

restart:
        /*
         * Block on the first lease in the list. Its absolute break time is
         * converted to a relative timeout; a result of 0 is bumped to 1.
         */
        fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list);
        break_time = fl->fl_break_time;
        if (break_time != 0)
                break_time -= jiffies;
        if (break_time == 0)
                break_time++;
        locks_insert_block(&fl->c, &new_fl->c, leases_conflict);
        trace_break_lease_block(inode, new_fl);
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        /* Both locks are dropped here, so disposing and sleeping is fine. */
        locks_dispose_list(&dispose);
        error = wait_event_interruptible_timeout(new_fl->c.flc_wait,
                                                 list_empty(&new_fl->c.flc_blocked_member),
                                                 break_time);

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        trace_break_lease_unblock(inode, new_fl);
        __locks_delete_block(&new_fl->c);
        if (error >= 0) {
                /*
                 * Wait for the next conflicting lease that has not been
                 * broken yet
                 */
                /* NOTE(review): 0 presumably means the wait timed out. */
                if (error == 0)
                        time_out_leases(inode, &dispose);
                if (any_leases_conflict(inode, new_fl))
                        goto restart;
                error = 0;
        }
out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        locks_dispose_list(&dispose);
free_lock:
        locks_free_lease(new_fl);
        return error;
}
EXPORT_SYMBOL(__break_lease);

/**
 *        lease_get_mtime - update modified time of an inode with exclusive lease
 *        @inode: the inode
 *        @time:  pointer to a timespec which contains the last modified time
 *
 * This is to force NFS clients to flush their caches for files with
 * exclusive leases.  The justification is that if someone has an
 * exclusive lease, then they could be modifying it.
 *
 * @time is overwritten with the current time only when the first lease on
 * the inode is a write lease; otherwise it is left untouched.
 */
void lease_get_mtime(struct inode *inode, struct timespec64 *time)
{
        struct file_lock_context *ctx = locks_inode_context(inode);
        struct file_lock_core *first;
        bool wr_lease;

        /* Lockless fast path: nothing to do without any lease. */
        if (!ctx || list_empty_careful(&ctx->flc_lease))
                return;

        spin_lock(&ctx->flc_lock);
        first = list_first_entry_or_null(&ctx->flc_lease,
                                         struct file_lock_core, flc_list);
        wr_lease = first && first->flc_type == F_WRLCK;
        spin_unlock(&ctx->flc_lock);

        if (wr_lease)
                *time = current_time(inode);
}
EXPORT_SYMBOL(lease_get_mtime);

/**
 *        fcntl_getlease - Enquire what lease is currently active
 *        @filp: the file
 *
 *        The value returned by this function will be one of
 *        (if no lease break is pending):
 *
 *        %F_RDLCK to indicate a shared lease is held.
 *
 *        %F_WRLCK to indicate an exclusive lease is held.
 *
 *        %F_UNLCK to indicate no lease is held.
 *
 *        (if a lease break is pending):
 *
 *        %F_RDLCK to indicate an exclusive lease needs to be
 *                changed to a shared lease (or removed).
 *
 *        %F_UNLCK to indicate the lease needs to be removed.
 *
 *        XXX: sfr & willy disagree over whether F_INPROGRESS
 *        should be returned to userspace.
 */
int fcntl_getlease(struct file *filp)
{
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx = locks_inode_context(inode);
        struct file_lease *lease;
        int type = F_UNLCK;
        LIST_HEAD(dispose);

        /* No context or no leases at all: nothing is held. */
        if (!ctx || list_empty_careful(&ctx->flc_lease))
                return type;

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);

        /* Expire overdue breaks first so the answer reflects them. */
        time_out_leases(inode, &dispose);

        /* Report the first lease that belongs to this open file. */
        list_for_each_entry(lease, &ctx->flc_lease, c.flc_list) {
                if (lease->c.flc_file == filp) {
                        type = target_leasetype(lease);
                        break;
                }
        }

        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        locks_dispose_list(&dispose);
        return type;
}

/**
 * check_conflicting_open - see if the given file points to an inode that has
 *                            an existing open that would conflict with the
 *                            desired lease.
 * @filp:        file to check
 * @arg:        type of lease that we're trying to acquire
 * @flags:        current lock flags
 *
 * Check to see if there's an existing open fd on this file that would
 * conflict with the lease we're trying to set.
 *
 * Returns 0 when there is no conflict (always the case for layouts and
 * delegations, which are not checked here) and -EAGAIN otherwise.
 */
static int
check_conflicting_open(struct file *filp, const int arg, int flags)
{
        struct inode *inode = file_inode(filp);
        int self_wcount = 0, self_rcount = 0;

        /*
         * Layouts need no check; for delegations the checks are left
         * to the caller.
         */
        if (flags & (FL_LAYOUT | FL_DELEG))
                return 0;

        switch (arg) {
        case F_RDLCK:
                /* A read lease only conflicts with writers. */
                return inode_is_open_for_write(inode) ? -EAGAIN : 0;
        case F_WRLCK:
                break;
        default:
                return 0;
        }

        /*
         * Make sure that only read/write count is from lease requestor.
         * Note that this will result in denying write leases when i_writecount
         * is negative, which is what we want.  (We shouldn't grant write leases
         * on files open for execution.)
         */
        if (filp->f_mode & FMODE_WRITE)
                self_wcount = 1;
        else if (filp->f_mode & FMODE_READ)
                self_rcount = 1;

        if (atomic_read(&inode->i_writecount) == self_wcount &&
            atomic_read(&inode->i_readcount) == self_rcount)
                return 0;

        return -EAGAIN;
}

/*
 * generic_add_lease - install a new lease on a file or modify an existing one
 * @filp: file the lease applies to
 * @arg:  requested lease type (never F_UNLCK here)
 * @flp:  in: the lease to insert; set to NULL on return when that lease
 *        was inserted into the inode's lease list
 * @priv: opaque pointer handed to the lease manager's lm_setup()
 *
 * If the same file and owner already hold a lease, that lease is modified
 * through lm_change() and *@flp is left untouched. Otherwise *@flp itself is
 * inserted.
 *
 * Returns 0 on success, -ENOMEM if no lock context could be obtained,
 * -EAGAIN on a conflicting open/lease or when the inode lock cannot be
 * taken for a delegation, -EINVAL when leases are disabled, or the error
 * from lm_change().
 */
static int
generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **priv)
{
        struct file_lease *fl, *my_fl = NULL, *lease;
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx;
        bool is_deleg = (*flp)->c.flc_flags & FL_DELEG;
        int error;
        LIST_HEAD(dispose);

        lease = *flp;
        trace_generic_add_lease(inode, lease);

        /* Note that arg is never F_UNLCK here */
        ctx = locks_get_lock_context(inode, arg);
        if (!ctx)
                return -ENOMEM;

        /*
         * In the delegation case we need mutual exclusion with
         * a number of operations that take the i_mutex.  We trylock
         * because delegations are an optional optimization, and if
         * there's some chance of a conflict--we'd rather not
         * bother, maybe that's a sign this just isn't a good file to
         * hand out a delegation on.
         */
        if (is_deleg && !inode_trylock(inode))
                return -EAGAIN;

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        /* Apply overdue breaks first so stale leases don't block us. */
        time_out_leases(inode, &dispose);
        error = check_conflicting_open(filp, arg, lease->c.flc_flags);
        if (error)
                goto out;

        /*
         * At this point, we know that if there is an exclusive
         * lease on this file, then we hold it on this filp
         * (otherwise our open of this file would have blocked).
         * And if we are trying to acquire an exclusive lease,
         * then the file is not open by anyone (including us)
         * except for this filp.
         */
        error = -EAGAIN;
        list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
                /* Remember our own lease (same file and owner), if any. */
                if (fl->c.flc_file == filp &&
                    fl->c.flc_owner == lease->c.flc_owner) {
                        my_fl = fl;
                        continue;
                }

                /*
                 * No exclusive leases if someone else has a lease on
                 * this file:
                 */
                if (arg == F_WRLCK)
                        goto out;
                /*
                 * Modifying our existing lease is OK, but no getting a
                 * new lease if someone else is opening for write:
                 */
                if (fl->c.flc_flags & FL_UNLOCK_PENDING)
                        goto out;
        }

        /* We already hold a lease: change its type instead of inserting. */
        if (my_fl != NULL) {
                lease = my_fl;
                error = lease->fl_lmops->lm_change(lease, arg, &dispose);
                if (error)
                        goto out;
                goto out_setup;
        }

        error = -EINVAL;
        if (!leases_enable)
                goto out;

        locks_insert_lock_ctx(&lease->c, &ctx->flc_lease);
        /*
         * The check in break_lease() is lockless. It's possible for another
         * open to race in after we did the earlier check for a conflicting
         * open but before the lease was inserted. Check again for a
         * conflicting open and cancel the lease if there is one.
         *
         * We also add a barrier here to ensure that the insertion of the lock
         * precedes these checks.
         */
        smp_mb();
        error = check_conflicting_open(filp, arg, lease->c.flc_flags);
        if (error) {
                locks_unlink_lock_ctx(&lease->c);
                goto out;
        }

out_setup:
        if (lease->fl_lmops->lm_setup)
                lease->fl_lmops->lm_setup(lease, priv);
out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        locks_dispose_list(&dispose);
        if (is_deleg)
                inode_unlock(inode);
        /*
         * A freshly inserted lease now sits on the inode's lease list;
         * clear *flp so the caller can tell it was consumed.
         */
        if (!error && !my_fl)
                *flp = NULL;
        return error;
}

static int generic_delete_lease(struct file *filp, void *owner)
{
        int error = -EAGAIN;
        struct file_lease *fl, *victim = NULL;
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx;
        LIST_HEAD(dispose);

        ctx = locks_inode_context(inode);
        if (!ctx) {
                trace_generic_delete_lease(inode, NULL);
                return error;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
                if (fl->c.flc_file == filp &&
                    fl->c.flc_owner == owner) {
                        victim = fl;
                        break;
                }
        }
        trace_generic_delete_lease(inode, victim);
        if (victim)
                error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        locks_dispose_list(&dispose);
        return error;
}

/**
 *        generic_setlease        -        sets a lease on an open file
 *        @filp:        file pointer
 *        @arg:        type of lease to obtain
 *        @flp:        input - file_lock to use, output - file_lock inserted
 *        @priv:        private data for lm_setup (may be NULL if lm_setup
 *                doesn't require it)
 *
 *        The (input) flp->fl_lmops->lm_break function is required
 *        by break_lease().
 *
 *        For %F_UNLCK, *@priv is used as the owner of the lease to delete.
 *        Returns -EINVAL for an unknown @arg and -ENOLCK (with a warning)
 *        when lm_break is missing.
 */
int generic_setlease(struct file *filp, int arg, struct file_lease **flp,
                        void **priv)
{
        if (arg == F_UNLCK)
                return generic_delete_lease(filp, *priv);

        if (arg != F_RDLCK && arg != F_WRLCK)
                return -EINVAL;

        if (!(*flp)->fl_lmops->lm_break) {
                WARN_ON_ONCE(1);
                return -ENOLCK;
        }

        return generic_add_lease(filp, arg, flp, priv);
}
EXPORT_SYMBOL(generic_setlease);

/*
 * Kernel subsystems can register to be notified on any attempt to set
 * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd
 * to close files that it may have cached when there is an attempt to set a
 * conflicting lease.
 *
 * The chain is an SRCU notifier head. It is set up by
 * lease_notifier_chain_init() and invoked from setlease_notifier() for
 * every lease request other than F_UNLCK.
 */
static struct srcu_notifier_head lease_notifier_chain;

/* Initialise the SRCU notifier head backing lease_notifier_chain. */
static inline void
lease_notifier_chain_init(void)
{
        srcu_init_notifier_head(&lease_notifier_chain);
}

/*
 * Tell everyone registered on lease_notifier_chain that a lease of type
 * @arg is about to be set. Unlock requests (F_UNLCK) are not announced.
 */
static inline void
setlease_notifier(int arg, struct file_lease *lease)
{
        if (arg == F_UNLCK)
                return;

        srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
}

/*
 * Register @nb on lease_notifier_chain so that it is called on attempts
 * to set a lease. Returns the result of srcu_notifier_chain_register().
 */
int lease_register_notifier(struct notifier_block *nb)
{
        return srcu_notifier_chain_register(&lease_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(lease_register_notifier);

/* Remove @nb from lease_notifier_chain. */
void lease_unregister_notifier(struct notifier_block *nb)
{
        srcu_notifier_chain_unregister(&lease_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(lease_unregister_notifier);


/*
 * kernel_setlease - set a lease without the permission checks done by
 * vfs_setlease()
 *
 * Notifies the lease notifier chain when @lease is non-NULL, then hands
 * the request to the file's own ->setlease operation if it has one, and
 * to generic_setlease() otherwise.
 */
int
kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
{
        if (lease)
                setlease_notifier(arg, *lease);

        if (!filp->f_op->setlease)
                return generic_setlease(filp, arg, lease, priv);

        return filp->f_op->setlease(filp, arg, lease, priv);
}
EXPORT_SYMBOL_GPL(kernel_setlease);

/**
 * vfs_setlease        -       sets a lease on an open file
 * @filp:        file pointer
 * @arg:        type of lease to obtain
 * @lease:        file_lock to use when adding a lease
 * @priv:        private info for lm_setup when adding a lease (may be
 *                NULL if lm_setup doesn't require it)
 *
 * Call this to establish a lease on the file. The "lease" argument is not
 * used for F_UNLCK requests and may be NULL. For commands that set or alter
 * an existing lease, the ``(*lease)->fl_lmops->lm_break`` operation must be
 * set; if not, this function will return -ENOLCK (and generate a scary-looking
 * stack trace).
 *
 * The "priv" pointer is passed directly to the lm_setup function as-is. It
 * may be NULL if the lm_setup operation doesn't require it.
 */
int
vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
{
        struct inode *inode = file_inode(filp);
        vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode);
        int error;

        if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE))
                return -EACCES;
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;
        error = security_file_lock(filp, arg);
        if (error)
                return error;
        return kernel_setlease(filp, arg, lease, priv);
}
EXPORT_SYMBOL_GPL(vfs_setlease);

/*
 * Allocate a lease and a fasync entry for @fd and try to install them on
 * @filp through vfs_setlease().  Whatever is still referenced by the local
 * pointers once vfs_setlease() returns is freed here.
 *
 * Returns the lease_alloc() error, -ENOMEM if the fasync entry cannot be
 * allocated, or the vfs_setlease() result.
 */
static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
{
        struct file_lease *lease;
        struct fasync_struct *fa;
        int ret;

        lease = lease_alloc(filp, arg);
        if (IS_ERR(lease))
                return PTR_ERR(lease);

        fa = fasync_alloc();
        if (!fa) {
                ret = -ENOMEM;
                goto out_free_lease;
        }
        fa->fa_fd = fd;

        /*
         * Both objects are passed by reference; presumably the setlease path
         * clears a pointer once it has taken the object over (TODO confirm
         * against generic_setlease()/lm_setup), hence the NULL checks below.
         */
        ret = vfs_setlease(filp, arg, &lease, (void **)&fa);

        if (fa)
                fasync_free(fa);
out_free_lease:
        if (lease)
                locks_free_lease(lease);
        return ret;
}

/**
 *        fcntl_setlease        -        sets a lease on an open file
 *        @fd: open file descriptor
 *        @filp: file pointer
 *        @arg: type of lease to obtain
 *
 *        Call this fcntl to establish a lease on the file.
 *        Note that you also need to call %F_SETSIG to
 *        receive a signal when the lease is broken.
 */
int fcntl_setlease(unsigned int fd, struct file *filp, int arg)
{
        /* Anything other than F_UNLCK installs a new lease. */
        if (arg != F_UNLCK)
                return do_fcntl_add_lease(fd, filp, arg);

        /* Removal needs no lease object; the file itself is passed as priv. */
        return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
}

/**
 * flock_lock_inode_wait - Apply a FLOCK-style lock to a file
 * @inode: inode of the file to apply to
 * @fl: The lock to be applied
 *
 * Apply a FLOCK style lock request to an inode.
 */
static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        int rc;

        might_sleep();

        /*
         * Keep retrying for as long as the request is deferred.  Between
         * attempts, sleep until @fl is no longer on a blocked-member list;
         * a non-zero result from the interruptible wait ends the loop.
         */
        do {
                rc = flock_lock_inode(inode, fl);
                if (rc != FILE_LOCK_DEFERRED)
                        break;
                rc = wait_event_interruptible(fl->c.flc_wait,
                                              list_empty(&fl->c.flc_blocked_member));
        } while (!rc);

        locks_delete_block(fl);
        return rc;
}

/**
 * locks_lock_inode_wait - Apply a lock to an inode
 * @inode: inode of the file to apply to
 * @fl: The lock to be applied
 *
 * Apply a POSIX or FLOCK style lock request to an inode.
 */
int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        /* Exactly one of FL_POSIX / FL_FLOCK is expected to be set. */
        switch (fl->c.flc_flags & (FL_POSIX|FL_FLOCK)) {
        case FL_POSIX:
                return posix_lock_inode_wait(inode, fl);
        case FL_FLOCK:
                return flock_lock_inode_wait(inode, fl);
        default:
                BUG();
        }

        return 0;
}
EXPORT_SYMBOL(locks_lock_inode_wait);

/**
 *        sys_flock: - flock() system call.
 *        @fd: the file descriptor to lock.
 *        @cmd: the type of lock to apply.
 *
 *        Apply a %FL_FLOCK style lock to an open file descriptor.
 *        The @cmd can be one of:
 *
 *        - %LOCK_SH -- a shared lock.
 *        - %LOCK_EX -- an exclusive lock.
 *        - %LOCK_UN -- remove an existing lock.
 *        - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED)
 *
 *        %LOCK_MAND support has been removed from the kernel.
 */
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
        struct file_lock fl;
        struct fd f;
        int blocking, error, type;

        /*
         * LOCK_MAND locks never conflicted with one another and never
         * prevented open, read or write activity, so they were broken for a
         * long time.  To preserve that legacy behavior the request is
         * ignored, with a one-time warning that it does not actually work.
         */
        if (cmd & LOCK_MAND) {
                pr_warn_once("%s(%d): Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n", current->comm, current->pid);
                return 0;
        }

        /* LOCK_NB is a modifier, not a lock type; strip it before translating. */
        type = flock_translate_cmd(cmd & ~LOCK_NB);
        if (type < 0)
                return type;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;

        /* Anything but an unlock needs the file open for read or write. */
        if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE))) {
                error = -EBADF;
                goto out_putf;
        }

        flock_make_lock(f.file, &fl, type);

        error = security_file_lock(f.file, fl.c.flc_type);
        if (error)
                goto out_putf;

        blocking = !(cmd & LOCK_NB);
        if (blocking)
                fl.c.flc_flags |= FL_SLEEP;

        /* Use the filesystem's ->flock() when it has one. */
        if (!f.file->f_op->flock)
                error = locks_lock_file_wait(f.file, &fl);
        else
                error = f.file->f_op->flock(f.file,
                                            blocking ? F_SETLKW : F_SETLK,
                                            &fl);

        locks_release_private(&fl);
 out_putf:
        fdput(f);

        return error;
}

/**
 * vfs_test_lock - test file byte range lock
 * @filp: The file to test lock for
 * @fl: The lock to test; also used to hold result
 *
 * Returns -ERRNO on failure.  Indicates presence of a conflicting lock by
 * setting fl->c.flc_type to something other than F_UNLCK.
 */
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
        WARN_ON_ONCE(filp != fl->c.flc_file);

        /* Without a filesystem ->lock() method, test against the local lock lists. */
        if (!filp->f_op->lock) {
                posix_test_lock(filp, fl);
                return 0;
        }

        return filp->f_op->lock(filp, F_GETLK, fl);
}
EXPORT_SYMBOL_GPL(vfs_test_lock);

/**
 * locks_translate_pid - translate a file_lock_core's flc_pid into a namespace
 * @fl: The file_lock_core whose flc_pid should be translated
 * @ns: The namespace into which the pid should be translated
 *
 * Used to translate a flc_pid into a namespace virtual pid number
 */
static pid_t locks_translate_pid(struct file_lock_core *fl, struct pid_namespace *ns)
{
        struct pid *owner;
        pid_t translated;

        /* OFD locks always report -1. */
        if (fl->flc_flags & FL_OFDLCK)
                return -1;

        /*
         * Two cases return the stored value untranslated:
         *  - remote locks, which report a negative pid value;
         *  - lookups from init_pid_ns: if the owner process is dead and its
         *    pid already freed, the translation below won't work, but the
         *    owner pid number should still be shown in the init pidns.
         */
        if (fl->flc_pid <= 0 || ns == &init_pid_ns)
                return fl->flc_pid;

        rcu_read_lock();
        owner = find_pid_ns(fl->flc_pid, &init_pid_ns);
        translated = pid_nr_ns(owner, ns);
        rcu_read_unlock();

        return translated;
}

/*
 * Fill the userspace-format @flock from the kernel lock @fl.
 *
 * The pid is translated into the caller's active pid namespace.  A lock that
 * extends to OFFSET_MAX is reported with l_len == 0.  On 32-bit builds,
 * returns -EOVERFLOW when the range cannot be represented in the legacy
 * 32-bit struct flock; otherwise returns 0.
 */
static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
{
        flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current));

#if BITS_PER_LONG == 32
        /* Both ends must fit in the legacy 32bit flock (an open end is fine). */
        if (fl->fl_start > OFFT_OFFSET_MAX ||
            (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX))
                return -EOVERFLOW;
#endif

        flock->l_type = fl->c.flc_type;
        flock->l_whence = 0;
        flock->l_start = fl->fl_start;
        if (fl->fl_end == OFFSET_MAX)
                flock->l_len = 0;
        else
                flock->l_len = fl->fl_end - fl->fl_start + 1;

        return 0;
}

#if BITS_PER_LONG == 32
/*
 * 64-bit counterpart of posix_lock_to_flock(): fill @flock from @fl.  No
 * overflow check is made here.  A lock that extends to OFFSET_MAX is
 * reported with l_len == 0.
 */
static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
{
        flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current));
        flock->l_type = fl->c.flc_type;
        flock->l_whence = 0;
        flock->l_start = fl->fl_start;
        if (fl->fl_end == OFFSET_MAX)
                flock->l_len = 0;
        else
                flock->l_len = fl->fl_end - fl->fl_start + 1;
}
#endif

/* Report the first existing lock that would conflict with l.
 * This implements the F_GETLK command of fcntl().
 */
int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
{
        struct file_lock *fl;
        int error;

        fl = locks_alloc_lock();
        if (fl == NULL)
                return -ENOMEM;
        error = -EINVAL;
        if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK
                        && flock->l_type != F_WRLCK)
                goto out;

        error = flock_to_posix_lock(filp, fl, flock);
        if (error)
                goto out;

        if (cmd == F_OFD_GETLK) {
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                fl->c.flc_flags |= FL_OFDLCK;
                fl->c.flc_owner = filp;
        }

        error = vfs_test_lock(filp, fl);
        if (error)
                goto out;

        flock->l_type = fl->c.flc_type;
        if (fl->c.flc_type != F_UNLCK) {
                error = posix_lock_to_flock(flock, fl);
                if (error)
                        goto out;
        }
out:
        locks_free_lock(fl);
        return error;
}

/**
 * vfs_lock_file - file byte range lock
 * @filp: The file to apply the lock to
 * @cmd: type of locking operation (F_SETLK, F_GETLK, etc.)
 * @fl: The lock to be applied
 * @conf: Place to return a copy of the conflicting lock, if found.
 *
 * A caller that doesn't care about the conflicting lock may pass NULL
 * as the final argument.
 *
 * If the filesystem defines a private ->lock() method, then @conf will
 * be left unchanged; so a caller that cares should initialize it to
 * some acceptable default.
 *
 * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
 * locks, the ->lock() interface may return asynchronously, before the lock has
 * been granted or denied by the underlying filesystem, if (and only if)
 * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations
 * flags need to be set.
 *
 * Callers expecting ->lock() to return asynchronously will only use F_SETLK,
 * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
 * blocking lock. When ->lock() does return asynchronously, it must return
 * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes.
 * If the request is for non-blocking lock the file system should return
 * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
 * with the result. If the request timed out the callback routine will return a
 * nonzero return code and the file system should release the lock. The file
 * system is also responsible to keep a corresponding posix lock when it
 * grants a lock so the VFS can find out which locks are locally held and do
 * the correct lock cleanup when required.
 * The underlying filesystem must not drop the kernel lock or call
 * ->lm_grant() before returning to the caller with a FILE_LOCK_DEFERRED
 * return code.
 */
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
{
        WARN_ON_ONCE(filp != fl->c.flc_file);

        /* No private ->lock() method: use the generic POSIX implementation. */
        if (!filp->f_op->lock)
                return posix_lock_file(filp, fl, conf);

        /* @conf is not passed on to a filesystem-private ->lock(). */
        return filp->f_op->lock(filp, cmd, fl);
}
EXPORT_SYMBOL_GPL(vfs_lock_file);

/*
 * Run the security hook for @fl's lock type, then call vfs_lock_file()
 * repeatedly for as long as it reports FILE_LOCK_DEFERRED, sleeping
 * interruptibly between attempts.  locks_delete_block() is always called
 * once the loop ends.
 *
 * Returns the security hook's error, the final vfs_lock_file() result, or
 * the non-zero result of an interrupted wait.
 */
static int do_lock_file_wait(struct file *filp, unsigned int cmd,
                             struct file_lock *fl)
{
        int rc;

        rc = security_file_lock(filp, fl->c.flc_type);
        if (rc)
                return rc;

        do {
                rc = vfs_lock_file(filp, cmd, fl, NULL);
                if (rc != FILE_LOCK_DEFERRED)
                        break;
                /* Sleep until @fl has left the blocked-member list. */
                rc = wait_event_interruptible(fl->c.flc_wait,
                                              list_empty(&fl->c.flc_blocked_member));
        } while (!rc);

        locks_delete_block(fl);

        return rc;
}

/* Ensure that fl->c.flc_file has compatible f_mode for F_SETLK calls */
static int
check_fmode_for_setlk(struct file_lock *fl)
{
        /* A read lock needs the file open for reading... */
        if (fl->c.flc_type == F_RDLCK &&
            !(fl->c.flc_file->f_mode & FMODE_READ))
                return -EBADF;

        /* ...and a write lock needs it open for writing. */
        if (fl->c.flc_type == F_WRLCK &&
            !(fl->c.flc_file->f_mode & FMODE_WRITE))
                return -EBADF;

        /* Any other type is accepted without a mode check. */
        return 0;
}

/* Apply the lock described by l to an open file descriptor.
 * This implements both the F_SETLK and F_SETLKW commands of fcntl().
 */
int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
                struct flock *flock)
{
        struct file_lock *file_lock = locks_alloc_lock();
        struct inode *inode = file_inode(filp);
        struct file *f;
        int error;

        /* Allocation failure is reported as -ENOLCK here (not -ENOMEM). */
        if (file_lock == NULL)
                return -ENOLCK;

        /* Convert the userspace request into the kernel representation. */
        error = flock_to_posix_lock(filp, file_lock, flock);
        if (error)
                goto out;

        /* The file's open mode must permit the requested lock type. */
        error = check_fmode_for_setlk(file_lock);
        if (error)
                goto out;

        /*
         * If the cmd is requesting file-private locks, then set the
         * FL_OFDLCK flag and override the owner.
         *
         * The OFD commands are rewritten to their plain counterparts, and
         * both waiting variants end up with FL_SLEEP set.
         */
        switch (cmd) {
        case F_OFD_SETLK:
                /* OFD requests must come in with l_pid zeroed. */
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLK;
                file_lock->c.flc_flags |= FL_OFDLCK;
                file_lock->c.flc_owner = filp;
                break;
        case F_OFD_SETLKW:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLKW;
                file_lock->c.flc_flags |= FL_OFDLCK;
                file_lock->c.flc_owner = filp;
                fallthrough;
        case F_SETLKW:
                file_lock->c.flc_flags |= FL_SLEEP;
        }

        error = do_lock_file_wait(filp, cmd, file_lock);

        /*
         * Attempt to detect a close/fcntl race and recover by releasing the
         * lock that was just acquired. There is no need to do that when we're
         * unlocking though, or for OFD locks.
         */
        if (!error && file_lock->c.flc_type != F_UNLCK &&
            !(file_lock->c.flc_flags & FL_OFDLCK)) {
                struct files_struct *files = current->files;
                /*
                 * We need that spin_lock here - it prevents reordering between
                 * update of i_flctx->flc_posix and check for it done in
                 * close(). rcu_read_lock() wouldn't do.
                 */
                spin_lock(&files->file_lock);
                f = files_lookup_fd_locked(files, fd);
                spin_unlock(&files->file_lock);
                /*
                 * @fd no longer refers to @filp: reuse the same request as an
                 * unlock to drop the lock, and report -EBADF to the caller.
                 */
                if (f != filp) {
                        file_lock->c.flc_type = F_UNLCK;
                        error = do_lock_file_wait(filp, cmd, file_lock);
                        WARN_ON_ONCE(error);
                        error = -EBADF;
                }
        }
out:
        trace_fcntl_setlk(inode, file_lock, error);
        locks_free_lock(file_lock);
        return error;
}

#if BITS_PER_LONG == 32
/* Report the first existing lock that would conflict with l.
 * This implements the F_GETLK command of fcntl().
 */
int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
{
        struct file_lock *fl;
        int error;

        fl = locks_alloc_lock();
        if (fl == NULL)
                return -ENOMEM;

        error = -EINVAL;
        if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK
                        && flock->l_type != F_WRLCK)
                goto out;

        error = flock64_to_posix_lock(filp, fl, flock);
        if (error)
                goto out;

        if (cmd == F_OFD_GETLK) {
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                fl->c.flc_flags |= FL_OFDLCK;
                fl->c.flc_owner = filp;
        }

        error = vfs_test_lock(filp, fl);
        if (error)
                goto out;

        flock->l_type = fl->c.flc_type;
        if (fl->c.flc_type != F_UNLCK)
                posix_lock_to_flock64(flock, fl);

out:
        locks_free_lock(fl);
        return error;
}

/* Apply the lock described by l to an open file descriptor.
 * This implements both the F_SETLK and F_SETLKW commands of fcntl().
 */
int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
                struct flock64 *flock)
{
        struct file_lock *file_lock = locks_alloc_lock();
        struct file *f;
        int error;

        /* Allocation failure is reported as -ENOLCK here (not -ENOMEM). */
        if (file_lock == NULL)
                return -ENOLCK;

        /* Convert the userspace request into the kernel representation. */
        error = flock64_to_posix_lock(filp, file_lock, flock);
        if (error)
                goto out;

        /* The file's open mode must permit the requested lock type. */
        error = check_fmode_for_setlk(file_lock);
        if (error)
                goto out;

        /*
         * If the cmd is requesting file-private locks, then set the
         * FL_OFDLCK flag and override the owner.
         *
         * The OFD commands are rewritten to their 64-bit plain counterparts,
         * and both waiting variants end up with FL_SLEEP set.
         */
        switch (cmd) {
        case F_OFD_SETLK:
                /* OFD requests must come in with l_pid zeroed. */
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLK64;
                file_lock->c.flc_flags |= FL_OFDLCK;
                file_lock->c.flc_owner = filp;
                break;
        case F_OFD_SETLKW:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLKW64;
                file_lock->c.flc_flags |= FL_OFDLCK;
                file_lock->c.flc_owner = filp;
                fallthrough;
        case F_SETLKW64:
                file_lock->c.flc_flags |= FL_SLEEP;
        }

        error = do_lock_file_wait(filp, cmd, file_lock);

        /*
         * Attempt to detect a close/fcntl race and recover by releasing the
         * lock that was just acquired. There is no need to do that when we're
         * unlocking though, or for OFD locks.
         */
        if (!error && file_lock->c.flc_type != F_UNLCK &&
            !(file_lock->c.flc_flags & FL_OFDLCK)) {
                struct files_struct *files = current->files;
                /*
                 * We need that spin_lock here - it prevents reordering between
                 * update of i_flctx->flc_posix and check for it done in
                 * close(). rcu_read_lock() wouldn't do.
                 */
                spin_lock(&files->file_lock);
                f = files_lookup_fd_locked(files, fd);
                spin_unlock(&files->file_lock);
                /*
                 * @fd no longer refers to @filp: reuse the same request as an
                 * unlock to drop the lock, and report -EBADF to the caller.
                 */
                if (f != filp) {
                        file_lock->c.flc_type = F_UNLCK;
                        error = do_lock_file_wait(filp, cmd, file_lock);
                        WARN_ON_ONCE(error);
                        error = -EBADF;
                }
        }
out:
        locks_free_lock(file_lock);
        return error;
}
#endif /* BITS_PER_LONG == 32 */

/*
 * This function is called when the file is being removed
 * from the task's fd array.  POSIX locks belonging to this task
 * are deleted at this time.
 */
void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx = locks_inode_context(inode);
        struct file_lock unlock;
        int error;

        /*
         * With no POSIX locks recorded for this inode there is no need to
         * call posix_lock_file().  Another process could be setting a lock
         * on the file right now, but that lock would not be removed here
         * anyway.
         */
        if (!ctx || list_empty(&ctx->flc_posix))
                return;

        /* Build a whole-file unlock request on behalf of @owner. */
        locks_init_lock(&unlock);
        unlock.c.flc_type = F_UNLCK;
        unlock.c.flc_flags = FL_POSIX | FL_CLOSE;
        unlock.c.flc_owner = owner;
        unlock.c.flc_pid = current->tgid;
        unlock.c.flc_file = filp;
        unlock.fl_start = 0;
        unlock.fl_end = OFFSET_MAX;
        unlock.fl_ops = NULL;
        unlock.fl_lmops = NULL;

        error = vfs_lock_file(filp, F_SETLK, &unlock, NULL);

        /* Release private state if the call above attached fl_ops. */
        if (unlock.fl_ops && unlock.fl_ops->fl_release_private)
                unlock.fl_ops->fl_release_private(&unlock);
        trace_locks_remove_posix(inode, &unlock, error);
}
EXPORT_SYMBOL(locks_remove_posix);

/* The i_flctx must be valid when calling into here */
static void
locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
{
        struct file_lock fl;
        struct inode *inode = file_inode(filp);

        if (list_empty(&flctx->flc_flock))
                return;

        flock_make_lock(filp, &fl, F_UNLCK);
        fl.c.flc_flags |= FL_CLOSE;

        if (filp->f_op->flock)
                filp->f_op->flock(filp, F_SETLKW, &fl);
        else
                flock_lock_inode(inode, &fl);

        if (fl.fl_ops && fl.fl_ops->fl_release_private)
                fl.fl_ops->fl_release_private(&fl);
}

/* The i_flctx must be valid when calling into here */
static void
locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
{
        struct file_lease *fl, *tmp;
        LIST_HEAD(dispose);

        if (list_empty(&ctx->flc_lease))
                return;

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list)
                if (filp == fl->c.flc_file)
                        lease_modify(fl, F_UNLCK, &dispose);
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        locks_dispose_list(&dispose);
}

/*
 * This function is called on the last close of an open file.
 */
void locks_remove_file(struct file *filp)
{
        struct file_lock_context *ctx = locks_inode_context(file_inode(filp));

        /* No lock context on the inode: nothing to remove. */
        if (!ctx)
                return;

        /* OFD locks use the file itself as owner */
        locks_remove_posix(filp, filp);

        /* then flock locks */
        locks_remove_flock(filp, ctx);

        /* and finally any leases */
        locks_remove_lease(filp, ctx);

        /* Run the per-list checks for @filp on all three lists. */
        spin_lock(&ctx->flc_lock);
        locks_check_ctx_file_list(filp, &ctx->flc_posix, "POSIX");
        locks_check_ctx_file_list(filp, &ctx->flc_flock, "FLOCK");
        locks_check_ctx_file_list(filp, &ctx->flc_lease, "LEASE");
        spin_unlock(&ctx->flc_lock);
}

/**
 * vfs_cancel_lock - file byte range unblock lock
 * @filp: The file to apply the unblock to
 * @fl: The lock to be unblocked
 *
 * Used by lock managers to cancel blocked requests
 */
int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
        WARN_ON_ONCE(filp != fl->c.flc_file);

        /* Without a filesystem ->lock() method there is nothing to cancel. */
        if (!filp->f_op->lock)
                return 0;

        return filp->f_op->lock(filp, F_CANCELLK, fl);
}
EXPORT_SYMBOL_GPL(vfs_cancel_lock);

/**
 * vfs_inode_has_locks - are any file locks held on @inode?
 * @inode: inode to check for locks
 *
 * Return true if there are any FL_POSIX or FL_FLOCK locks currently
 * set on @inode.
 */
bool vfs_inode_has_locks(struct inode *inode)
{
        struct file_lock_context *ctx = locks_inode_context(inode);
        bool held;

        /* An inode without a lock context has no locks. */
        if (!ctx)
                return false;

        spin_lock(&ctx->flc_lock);
        held = !(list_empty(&ctx->flc_posix) && list_empty(&ctx->flc_flock));
        spin_unlock(&ctx->flc_lock);

        return held;
}
EXPORT_SYMBOL_GPL(vfs_inode_has_locks);

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

/* Per-open iteration state for the /proc "locks" seq_file. */
struct locks_iterator {
        /* CPU cursor handed to seq_hlist_start_percpu()/seq_hlist_next_percpu() */
        int        li_cpu;
        /* 1-based position of the current entry; printed as the lock's id */
        loff_t        li_pos;
};

/*
 * Emit one line describing @flc to the seq_file @f.
 *
 * @f:      seq_file being written; its superblock selects the pid namespace
 *          used for pid translation
 * @flc:    lock (or lease) core to describe
 * @id:     number printed at the start of the line
 * @pfx:    prefix string printed before the lock class when @repeat is
 *          non-zero
 * @repeat: nesting level; widens the field in which @pfx is printed
 *
 * The line holds: id, optional prefix, lock class and state, access type,
 * owner pid, device and inode numbers (or "<none>:0"), and the byte range.
 */
static void lock_get_status(struct seq_file *f, struct file_lock_core *flc,
                            loff_t id, char *pfx, int repeat)
{
        struct inode *inode = NULL;
        /*
         * pid_t, not unsigned: locks_translate_pid() returns -1 for OFD
         * locks and negative values for remote locks, and the value is
         * printed with %d below.
         */
        pid_t pid;
        struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
        int type = flc->flc_type;
        struct file_lock *fl = file_lock(flc);

        /*
         * If lock owner is dead (and pid is freed) or not visible in current
         * pidns, zero is shown as a pid value. Check lock info from
         * init_pid_ns to get saved lock pid value.
         */
        pid = locks_translate_pid(flc, proc_pidns);

        if (flc->flc_file != NULL)
                inode = file_inode(flc->flc_file);

        seq_printf(f, "%lld: ", id);

        if (repeat)
                seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx);

        if (flc->flc_flags & FL_POSIX) {
                if (flc->flc_flags & FL_ACCESS)
                        seq_puts(f, "ACCESS");
                else if (flc->flc_flags & FL_OFDLCK)
                        seq_puts(f, "OFDLCK");
                else
                        seq_puts(f, "POSIX ");

                seq_printf(f, " %s ",
                             (inode == NULL) ? "*NOINODE*" : "ADVISORY ");
        } else if (flc->flc_flags & FL_FLOCK) {
                seq_puts(f, "FLOCK  ADVISORY  ");
        } else if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) {
                struct file_lease *lease = file_lease(flc);

                /* For leases, report the type from target_leasetype(). */
                type = target_leasetype(lease);

                if (flc->flc_flags & FL_DELEG)
                        seq_puts(f, "DELEG  ");
                else
                        seq_puts(f, "LEASE  ");

                if (lease_breaking(lease))
                        seq_puts(f, "BREAKING  ");
                else if (flc->flc_file)
                        seq_puts(f, "ACTIVE    ");
                else
                        seq_puts(f, "BREAKER   ");
        } else {
                seq_puts(f, "UNKNOWN UNKNOWN  ");
        }

        seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
                             (type == F_RDLCK) ? "READ" : "UNLCK");
        if (inode) {
                /* userspace relies on this representation of dev_t */
                seq_printf(f, "%d %02x:%02x:%lu ", pid,
                                MAJOR(inode->i_sb->s_dev),
                                MINOR(inode->i_sb->s_dev), inode->i_ino);
        } else {
                seq_printf(f, "%d <none>:0 ", pid);
        }
        /* Only POSIX-style locks carry a byte range; others print "0 EOF". */
        if (flc->flc_flags & FL_POSIX) {
                if (fl->fl_end == OFFSET_MAX)
                        seq_printf(f, "%Ld EOF\n", fl->fl_start);
                else
                        seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end);
        } else {
                seq_puts(f, "0 EOF\n");
        }
}

/*
 * Return the entry that follows @node on its blocker's flc_blocked_requests
 * list, or NULL when @node is NULL, has no blocker, or has no following
 * entry.
 */
static struct file_lock_core *get_next_blocked_member(struct file_lock_core *node)
{
        struct file_lock_core *sibling;

        /* No node, or a root node that nothing blocks. */
        if (!node || !node->flc_blocker)
                return NULL;

        sibling = list_next_entry(node, flc_blocked_member);

        /* Reached the blocker's list head: @node was the last waiter. */
        if (list_entry_is_head(sibling, &node->flc_blocker->flc_blocked_requests,
                               flc_blocked_member))
                return NULL;

        /* The next member in the linked list could be the node itself. */
        if (sibling == node)
                return NULL;

        return sibling;
}

/*
 * seq_file ->show() callback: print the lock at @v followed by every request
 * blocked on it, directly or indirectly.  Blocked requests are prefixed with
 * "-> " and their nesting level is passed to lock_get_status().  The entry
 * is skipped when its pid translates to 0 in the proc mount's pid namespace.
 */
static int locks_show(struct seq_file *f, void *v)
{
        struct locks_iterator *iter = f->private;
        struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
        struct file_lock_core *cur = hlist_entry(v, struct file_lock_core, flc_link);
        struct file_lock_core *next;
        int depth = 0;

        if (locks_translate_pid(cur, proc_pidns) == 0)
                return 0;

        /*
         * View this cross-linked list as a binary tree: the first member of
         * flc_blocked_requests is the left child of the current node, the
         * next sibling in flc_blocked_member is the right child, and the
         * parent is reachable through flc_blocker.  Printing the blocked
         * requests then becomes a traversal of that binary tree.
         */
        while (cur != NULL) {
                lock_get_status(f, cur, iter->li_pos, depth ? "-> " : "", depth);

                if (!list_empty(&cur->flc_blocked_requests)) {
                        /* Turn left: descend to the first blocked request. */
                        cur = list_first_entry_or_null(&cur->flc_blocked_requests,
                                                       struct file_lock_core,
                                                       flc_blocked_member);
                        depth++;
                        continue;
                }

                /* Turn right: move on to the next sibling, if any. */
                next = get_next_blocked_member(cur);

                /* No sibling: climb until an ancestor has one or the root is hit. */
                while (next == NULL && cur->flc_blocker != NULL) {
                        cur = cur->flc_blocker;
                        depth--;
                        next = get_next_blocked_member(cur);
                }
                cur = next;
        }

        return 0;
}

/*
 * Print a "lock:" line for each entry on @head that belongs to @filp and is
 * owned by either @files or @filp.  *@id is incremented before each printed
 * entry and used as its number.
 */
static void __show_fd_locks(struct seq_file *f,
                        struct list_head *head, int *id,
                        struct file *filp, struct files_struct *files)
{
        struct file_lock_core *flc;

        list_for_each_entry(flc, head, flc_list) {
                /* Skip locks on other files or held by other owners. */
                if (flc->flc_file != filp ||
                    (flc->flc_owner != files && flc->flc_owner != filp))
                        continue;

                ++*id;
                seq_puts(f, "lock:\t");
                lock_get_status(f, flc, *id, "", 0);
        }
}

/*
 * Print the flock locks, POSIX locks and leases held through @filp by
 * @files (or by @filp itself) to @f, numbered consecutively from 1.
 * Prints nothing when the inode has no lock context.
 */
void show_fd_locks(struct seq_file *f,
                  struct file *filp, struct files_struct *files)
{
        struct file_lock_context *ctx = locks_inode_context(file_inode(filp));
        int id = 0;

        if (!ctx)
                return;

        /* All three lists are walked under the context spinlock. */
        spin_lock(&ctx->flc_lock);
        __show_fd_locks(f, &ctx->flc_flock, &id, filp, files);
        __show_fd_locks(f, &ctx->flc_posix, &id, filp, files);
        __show_fd_locks(f, &ctx->flc_lease, &id, filp, files);
        spin_unlock(&ctx->flc_lock);
}

/*
 * seq_file ->start() callback: record the 1-based position, take file_rwsem
 * for writing and blocked_lock_lock, then look up the entry at *@pos in the
 * per-cpu file_lock_list.  Both locks are released in locks_stop().
 */
static void *locks_start(struct seq_file *f, loff_t *pos)
        __acquires(&blocked_lock_lock)
{
        struct locks_iterator *iter = f->private;

        iter->li_pos = *pos + 1;
        percpu_down_write(&file_rwsem);
        spin_lock(&blocked_lock_lock);
        return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos);
}

/*
 * seq_file ->next() callback: bump the displayed position and advance to the
 * following entry of the per-cpu file_lock_list.
 */
static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
{
        struct locks_iterator *iter = f->private;

        ++iter->li_pos;
        return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos);
}

/*
 * seq_file ->stop() callback: drop the locks taken in locks_start(), in the
 * reverse order of acquisition.
 */
static void locks_stop(struct seq_file *f, void *v)
        __releases(&blocked_lock_lock)
{
        spin_unlock(&blocked_lock_lock);
        percpu_up_write(&file_rwsem);
}

/* seq_file iterator callbacks for the "locks" proc entry. */
static const struct seq_operations locks_seq_operations = {
        .start        = locks_start,
        .next        = locks_next,
        .stop        = locks_stop,
        .show        = locks_show,
};

/*
 * Register the "locks" proc entry, backed by locks_seq_operations, with a
 * struct locks_iterator of private data per open.  Always returns 0; the
 * result of proc_create_seq_private() is not checked.
 */
static int __init proc_locks_init(void)
{
        proc_create_seq_private("locks", 0, NULL, &locks_seq_operations,
                        sizeof(struct locks_iterator), NULL);
        return 0;
}
fs_initcall(proc_locks_init);
#endif

/*
 * Boot-time setup for file locking: create the slab caches for lock
 * contexts, file locks and file leases, initialise each possible CPU's
 * file_lock_list entry, and set up the lease notifier chain.
 *
 * The caches are created with SLAB_PANIC, so their return values are not
 * checked here.  Always returns 0.
 */
static int __init filelock_init(void)
{
        int i;

        flctx_cache = kmem_cache_create("file_lock_ctx",
                        sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);

        filelock_cache = kmem_cache_create("file_lock_cache",
                        sizeof(struct file_lock), 0, SLAB_PANIC, NULL);

        /*
         * Give the lease cache its own name: sharing "file_lock_cache" with
         * the file_lock cache above makes the two indistinguishable by name.
         */
        filelease_cache = kmem_cache_create("file_lease_cache",
                        sizeof(struct file_lease), 0, SLAB_PANIC, NULL);

        for_each_possible_cpu(i) {
                struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);

                spin_lock_init(&fll->lock);
                INIT_HLIST_HEAD(&fll->hlist);
        }

        lease_notifier_chain_init();
        return 0;
}
core_initcall(filelock_init);





















































    1 





    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TRAPS_H
#define _ASM_X86_TRAPS_H

#include <linux/context_tracking_state.h>
#include <linux/kprobes.h>

#include <asm/debugreg.h>
#include <asm/idtentry.h>
#include <asm/siginfo.h>                        /* TRAP_TRACE, ... */
#include <asm/trap_pf.h>

#ifdef CONFIG_X86_64
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
asmlinkage __visible notrace
struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs);
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
#endif

extern int ibt_selftest(void);
extern int ibt_selftest_noendbr(void);

#ifdef CONFIG_X86_F00F_BUG
/* For handling the F00F bug */
void handle_invalid_op(struct pt_regs *regs);
#endif

/*
 * Translate a debug condition word into a si_code value:
 * DR_STEP set -> TRAP_TRACE, any of DR_TRAP0..DR_TRAP3 set -> TRAP_HWBKPT,
 * anything else -> TRAP_BRKPT. DR_STEP takes precedence.
 */
static inline int get_si_code(unsigned long condition)
{
        if (condition & DR_STEP)
                return TRAP_TRACE;

        if (condition & (DR_TRAP0 | DR_TRAP1 | DR_TRAP2 | DR_TRAP3))
                return TRAP_HWBKPT;

        return TRAP_BRKPT;
}

extern int panic_on_unrecovered_nmi;

void math_emulate(struct math_emu_info *);

bool fault_in_kernel_space(unsigned long address);

#ifdef CONFIG_VMAP_STACK
void __noreturn handle_stack_overflow(struct pt_regs *regs,
                                      unsigned long fault_address,
                                      struct stack_info *info);
#endif

/*
 * Enable local interrupts, but only when the flags saved in @regs have
 * X86_EFLAGS_IF set.
 */
static inline void cond_local_irq_enable(struct pt_regs *regs)
{
        if (!(regs->flags & X86_EFLAGS_IF))
                return;

        local_irq_enable();
}

/*
 * Disable local interrupts, but only when the flags saved in @regs have
 * X86_EFLAGS_IF set.
 */
static inline void cond_local_irq_disable(struct pt_regs *regs)
{
        if (!(regs->flags & X86_EFLAGS_IF))
                return;

        local_irq_disable();
}

#endif /* _ASM_X86_TRAPS_H */































    2 







    1 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_STRUCT_H
#define _LINUX_FS_STRUCT_H

#include <linux/path.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>

/*
 * Filesystem context holding a root path and a working-directory path.
 *
 * @lock is the spinlock get_fs_root()/get_fs_pwd() hold while copying @root
 * or @pwd and taking a reference on the copy. @seq is a spinlock-associated
 * sequence counter (NOTE(review): presumably paired with @lock and bumped by
 * writers of @root/@pwd - confirm). The exact meaning of @users, @umask and
 * @in_exec is not visible in this header.
 *
 * The field order is subject to __randomize_layout.
 */
struct fs_struct {
        int users;
        spinlock_t lock;
        seqcount_spinlock_t seq;
        int umask;
        int in_exec;
        struct path root, pwd;
} __randomize_layout;

extern struct kmem_cache *fs_cachep;

extern void exit_fs(struct task_struct *);
extern void set_fs_root(struct fs_struct *, const struct path *);
extern void set_fs_pwd(struct fs_struct *, const struct path *);
extern struct fs_struct *copy_fs_struct(struct fs_struct *);
extern void free_fs_struct(struct fs_struct *);
extern int unshare_fs_struct(void);

/*
 * Copy fs->root into *root and take a reference on the copy with path_get().
 * Both steps happen under fs->lock, so the path cannot be swapped between
 * the copy and the reference grab by another holder of that lock.
 *
 * NOTE(review): the caller presumably drops the reference with path_put().
 */
static inline void get_fs_root(struct fs_struct *fs, struct path *root)
{
        spin_lock(&fs->lock);
        *root = fs->root;
        path_get(root);
        spin_unlock(&fs->lock);
}

/*
 * Copy fs->pwd into *pwd and take a reference on the copy with path_get().
 * Both steps happen under fs->lock, so the path cannot be swapped between
 * the copy and the reference grab by another holder of that lock.
 *
 * NOTE(review): the caller presumably drops the reference with path_put().
 */
static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
{
        spin_lock(&fs->lock);
        *pwd = fs->pwd;
        path_get(pwd);
        spin_unlock(&fs->lock);
}

extern bool current_chrooted(void);

#endif /* _LINUX_FS_STRUCT_H */




































































































































   11 










   11 

























































   13 



















































    8 









    8 
    3 














































   13 












    1 





























   10 
















    3 
















   12 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef INT_BLK_MQ_H
#define INT_BLK_MQ_H

#include <linux/blk-mq.h>
#include "blk-stat.h"

struct blk_mq_tag_set;

/*
 * Pairs the per-CPU allocation of software queue contexts (@queue_ctx) with
 * a kobject of its own (@kobj).
 */
struct blk_mq_ctxs {
        struct kobject kobj;
        struct blk_mq_ctx __percpu        *queue_ctx;
};

/**
 * struct blk_mq_ctx - State for a software queue facing the submitting CPUs
 *
 * The lock and the request lists live in their own cacheline-aligned
 * anonymous struct. @rq_lists, @index_hw and @hctxs each hold one slot per
 * hardware queue type (HCTX_MAX_TYPES); blk_mq_map_queue() indexes @hctxs by
 * the type derived from the request's operation flags.
 */
struct blk_mq_ctx {
        struct {
                spinlock_t                lock;
                struct list_head        rq_lists[HCTX_MAX_TYPES];
        } ____cacheline_aligned_in_smp;

        /* NOTE(review): presumably the CPU this context belongs to - confirm */
        unsigned int                cpu;
        unsigned short                index_hw[HCTX_MAX_TYPES];
        struct blk_mq_hw_ctx         *hctxs[HCTX_MAX_TYPES];

        struct request_queue        *queue;
        struct blk_mq_ctxs      *ctxs;
        struct kobject                kobj;
} ____cacheline_aligned_in_smp;

/*
 * Tag value limits. BLK_MQ_NO_TAG is the sentinel stored in a request's tag
 * when it holds none (see __blk_mq_put_driver_tag()); the largest usable
 * value, BLK_MQ_TAG_MAX, sits directly below it.
 */
enum {
        BLK_MQ_NO_TAG                = -1U,
        BLK_MQ_TAG_MIN                = 1,
        BLK_MQ_TAG_MAX                = BLK_MQ_NO_TAG - 1,
};

/*
 * Dedicated __bitwise type for request insertion flags, so the flag values
 * are not mixed with plain integers without an explicit __force cast.
 */
typedef unsigned int __bitwise blk_insert_t;
#define BLK_MQ_INSERT_AT_HEAD                ((__force blk_insert_t)0x01)

void blk_mq_submit_bio(struct bio *bio);
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
                unsigned int flags);
void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
                             unsigned int);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
                                        struct blk_mq_ctx *start);
void blk_mq_put_rq_ref(struct request *rq);

/*
 * Internal helpers for allocating/freeing the request map
 */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx);
void blk_mq_free_rq_map(struct blk_mq_tags *tags);
struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
                                unsigned int hctx_idx, unsigned int depth);
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
                             struct blk_mq_tags *tags,
                             unsigned int hctx_idx);

/*
 * CPU -> queue mappings
 */
extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int);

/*
 * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue
 * @q: request queue
 * @type: the hctx type index
 * @cpu: CPU
 */
static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q,
                                                          enum hctx_type type,
                                                          unsigned int cpu)
{
        return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]);
}

/*
 * Pick the hardware queue type for a request from its operation flags:
 * polled requests use HCTX_TYPE_POLL, non-polled reads use HCTX_TYPE_READ,
 * and everything else uses HCTX_TYPE_DEFAULT.
 */
static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf)
{
        /*
         * The caller ensures that polling is enabled whenever REQ_POLLED
         * is set.
         */
        if (opf & REQ_POLLED)
                return HCTX_TYPE_POLL;

        if ((opf & REQ_OP_MASK) == REQ_OP_READ)
                return HCTX_TYPE_READ;

        return HCTX_TYPE_DEFAULT;
}

/*
 * blk_mq_map_queue() - map (opf,ctx) to hardware queue
 * @q: request queue (not used by the lookup itself)
 * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED).
 * @ctx: software queue cpu ctx
 *
 * Returns the hardware queue that @ctx has recorded for the queue type
 * selected by @opf.
 */
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
                                                     blk_opf_t opf,
                                                     struct blk_mq_ctx *ctx)
{
        enum hctx_type type = blk_mq_get_hctx_type(opf);

        return ctx->hctxs[type];
}

/*
 * sysfs helpers
 */
extern void blk_mq_sysfs_init(struct request_queue *q);
extern void blk_mq_sysfs_deinit(struct request_queue *q);
int blk_mq_sysfs_register(struct gendisk *disk);
void blk_mq_sysfs_unregister(struct gendisk *disk);
int blk_mq_sysfs_register_hctxs(struct request_queue *q);
void blk_mq_sysfs_unregister_hctxs(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
void blk_mq_free_plug_rqs(struct blk_plug *plug);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

void blk_mq_cancel_work_sync(struct request_queue *q);

void blk_mq_release(struct request_queue *q);

/* Return the software queue context that @q keeps for the given @cpu. */
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
                                           unsigned int cpu)
{
        return per_cpu_ptr(q->queue_ctx, cpu);
}

/*
 * Return the software queue context of @q for the CPU reported by
 * raw_smp_processor_id().
 *
 * This assumes per-cpu software queueing queues. They could be per-node
 * as well, for instance. For now this is hardcoded as-is. Note that we don't
 * care about preemption, since we know the ctx's are persistent. This does
 * mean that we can't rely on ctx always matching the currently running CPU.
 */
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
        return __blk_mq_get_ctx(q, raw_smp_processor_id());
}

/*
 * Parameter block handed to the tag/request allocation helpers (for example
 * blk_mq_get_tag() and blk_mq_tags_from_data()). The fields are grouped by
 * direction, as marked below.
 */
struct blk_mq_alloc_data {
        /* input parameter */
        struct request_queue *q;
        blk_mq_req_flags_t flags;
        unsigned int shallow_depth;
        blk_opf_t cmd_flags;
        /* RQF_SCHED_TAGS here makes blk_mq_tags_from_data() pick sched_tags */
        req_flags_t rq_flags;

        /* allocate multiple requests/tags in one go */
        unsigned int nr_tags;
        struct request **cached_rq;

        /* input & output parameter */
        struct blk_mq_ctx *ctx;
        struct blk_mq_hw_ctx *hctx;
};

struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
                unsigned int reserved_tags, int node, int alloc_policy);
void blk_mq_free_tags(struct blk_mq_tags *tags);
int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
                struct sbitmap_queue *breserved_tags, unsigned int queue_depth,
                unsigned int reserved, int node, int alloc_policy);

unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
                unsigned int *offset);
void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
                unsigned int tag);
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                struct blk_mq_tags **tags, unsigned int depth, bool can_grow);
void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
                unsigned int size);
void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);

void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
                void *priv);
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
                void *priv);

/*
 * Choose the wait state to use on @bt: without a hardware context the first
 * wait state is used, otherwise the choice is delegated to sbq_wait_ptr()
 * with the hctx's wait index.
 */
static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
                                                 struct blk_mq_hw_ctx *hctx)
{
        return hctx ? sbq_wait_ptr(bt, &hctx->wait_index) : &bt->ws[0];
}

void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);

/*
 * Call __blk_mq_tag_busy() for @hctx, but only when the hctx has
 * BLK_MQ_F_TAG_QUEUE_SHARED set; otherwise do nothing.
 */
static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
                return;

        __blk_mq_tag_busy(hctx);
}

/*
 * Call __blk_mq_tag_idle() for @hctx, but only when the hctx has
 * BLK_MQ_F_TAG_QUEUE_SHARED set; otherwise do nothing.
 */
static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
                return;

        __blk_mq_tag_idle(hctx);
}

/*
 * A tag counts as reserved when its value is below the number of reserved
 * tags recorded in @tags.
 */
static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
                                          unsigned int tag)
{
        return tag < tags->nr_reserved_tags;
}

/* True when @flags has BLK_MQ_F_TAG_HCTX_SHARED set. */
static inline bool blk_mq_is_shared_tags(unsigned int flags)
{
        return flags & BLK_MQ_F_TAG_HCTX_SHARED;
}

static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
        if (data->rq_flags & RQF_SCHED_TAGS)
                return data->hctx->sched_tags;
        return data->hctx->tags;
}

/* True when the BLK_MQ_S_STOPPED bit is set in the hctx state word. */
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
{
        return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
}

static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
{
        return hctx->nr_ctx && hctx->tags;
}

unsigned int blk_mq_in_flight(struct request_queue *q,
                struct block_device *part);
void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
                unsigned int inflight[2]);

/*
 * Hand @budget_token back through the driver's ->put_budget hook. Drivers
 * that do not provide the hook make this a no-op.
 */
static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
                                              int budget_token)
{
        if (!q->mq_ops->put_budget)
                return;

        q->mq_ops->put_budget(q, budget_token);
}

/*
 * Ask the driver's ->get_budget hook for a dispatch budget and return its
 * result. Returns 0 when the driver does not provide the hook.
 */
static inline int blk_mq_get_dispatch_budget(struct request_queue *q)
{
        if (!q->mq_ops->get_budget)
                return 0;

        return q->mq_ops->get_budget(q);
}

/*
 * Record @token on @rq through the driver's ->set_rq_budget_token hook.
 * Negative tokens are ignored, as are drivers without the hook.
 */
static inline void blk_mq_set_rq_budget_token(struct request *rq, int token)
{
        if (token >= 0 && rq->q->mq_ops->set_rq_budget_token)
                rq->q->mq_ops->set_rq_budget_token(rq, token);
}

/*
 * Fetch the budget token of @rq through the driver's ->get_rq_budget_token
 * hook. Returns -1 when the driver does not provide the hook.
 */
static inline int blk_mq_get_rq_budget_token(struct request *rq)
{
        if (!rq->q->mq_ops->get_rq_budget_token)
                return -1;

        return rq->q->mq_ops->get_rq_budget_token(rq);
}

static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
                                                int val)
{
        if (blk_mq_is_shared_tags(hctx->flags))
                atomic_add(val, &hctx->queue->nr_active_requests_shared_tags);
        else
                atomic_add(val, &hctx->nr_active);
}

/* Account one more active request for @hctx (unconditional variant). */
static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
{
        __blk_mq_add_active_requests(hctx, 1);
}

static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
                int val)
{
        if (blk_mq_is_shared_tags(hctx->flags))
                atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags);
        else
                atomic_sub(val, &hctx->nr_active);
}

/* Account one fewer active request for @hctx (unconditional variant). */
static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
{
        __blk_mq_sub_active_requests(hctx, 1);
}

/*
 * Add @val to the active-request accounting of @hctx. Only hctxs with
 * BLK_MQ_F_TAG_QUEUE_SHARED set are accounted; for others this is a no-op.
 */
static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
                                              int val)
{
        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
                return;

        __blk_mq_add_active_requests(hctx, val);
}

/*
 * Count one more active request on @hctx. Only hctxs with
 * BLK_MQ_F_TAG_QUEUE_SHARED set are accounted; for others this is a no-op.
 */
static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
{
        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
                return;

        __blk_mq_inc_active_requests(hctx);
}

/*
 * Subtract @val from the active-request accounting of @hctx. Only hctxs with
 * BLK_MQ_F_TAG_QUEUE_SHARED set are accounted; for others this is a no-op.
 */
static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
                                              int val)
{
        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
                return;

        __blk_mq_sub_active_requests(hctx, val);
}

/*
 * Count one fewer active request on @hctx. Only hctxs with
 * BLK_MQ_F_TAG_QUEUE_SHARED set are accounted; for others this is a no-op.
 */
static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
{
        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
                return;

        __blk_mq_dec_active_requests(hctx);
}

static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
{
        if (blk_mq_is_shared_tags(hctx->flags))
                return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
        return atomic_read(&hctx->nr_active);
}
/*
 * Release the driver tag held by @rq: drop the active-request accounting for
 * @hctx, return rq->tag to hctx->tags, and finally mark the request as
 * holding no tag.
 */
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
                                           struct request *rq)
{
        blk_mq_dec_active_requests(hctx);
        blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
        /* rq->tag must be read by blk_mq_put_tag() above before it is reset */
        rq->tag = BLK_MQ_NO_TAG;
}

/*
 * Release the driver tag of @rq through __blk_mq_put_driver_tag(), but only
 * when both rq->tag and rq->internal_tag hold something other than
 * BLK_MQ_NO_TAG. In every other case the request is left untouched.
 */
static inline void blk_mq_put_driver_tag(struct request *rq)
{
        if (rq->tag != BLK_MQ_NO_TAG && rq->internal_tag != BLK_MQ_NO_TAG)
                __blk_mq_put_driver_tag(rq->mq_hctx, rq);
}

bool __blk_mq_alloc_driver_tag(struct request *rq);

/*
 * Make sure @rq holds a driver tag. A request that already has one succeeds
 * immediately; otherwise the outcome of __blk_mq_alloc_driver_tag() is
 * returned.
 */
static inline bool blk_mq_get_driver_tag(struct request *rq)
{
        return rq->tag != BLK_MQ_NO_TAG || __blk_mq_alloc_driver_tag(rq);
}

static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
{
        int cpu;

        for_each_possible_cpu(cpu)
                qmap->mq_map[cpu] = 0;
}

/*
 * Drain @list: repeatedly take the request at the head, unlink it, and free
 * it with blk_mq_free_request() until the list is empty.
 */
static inline void blk_mq_free_requests(struct list_head *list)
{
        struct request *rq;

        while (!list_empty(list)) {
                rq = list_entry_rq(list->next);
                list_del_init(&rq->queuelist);
                blk_mq_free_request(rq);
        }
}

/*
 * For shared tag users, we track the number of currently active users
 * and attempt to provide a fair share of the tag depth for each of them.
 *
 * Returns true when @hctx may take another tag from @bt. Queueing is always
 * allowed when there is no hctx, the hctx is not flagged
 * BLK_MQ_F_TAG_QUEUE_SHARED, the bitmap depth is 1, the relevant "active"
 * bit is clear, or no active queues are recorded.
 */
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
                                  struct sbitmap_queue *bt)
{
        unsigned int fair_share, nr_users;
        bool active;

        if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
                return true;

        /* A depth of one cannot be split any further. */
        if (bt->sb.depth == 1)
                return true;

        /* The "active" marker is per queue for shared tags, else per hctx. */
        if (blk_mq_is_shared_tags(hctx->flags))
                active = test_bit(QUEUE_FLAG_HCTX_ACTIVE,
                                  &hctx->queue->queue_flags);
        else
                active = test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state);
        if (!active)
                return true;

        nr_users = READ_ONCE(hctx->tags->active_queues);
        if (!nr_users)
                return true;

        /* Depth divided by users, rounded up, but never fewer than 4 tags. */
        fair_share = max((bt->sb.depth + nr_users - 1) / nr_users, 4U);
        return __blk_mq_active_requests(hctx) < fair_share;
}

/*
 * Run the code block in @dispatch_ops with rcu/srcu read lock held.
 *
 * When the queue's tag set has BLK_MQ_F_BLOCKING set, the block runs inside
 * an SRCU read-side section on tag_set->srcu, preceded by
 * might_sleep_if(@check_sleep). Otherwise it runs between rcu_read_lock()
 * and rcu_read_unlock().
 *
 * @dispatch_ops appears in both branches but only one of them executes.
 * @q is evaluated more than once, so it must be free of side effects.
 */
#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops)        \
do {                                                                \
        if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) {                \
                struct blk_mq_tag_set *__tag_set = (q)->tag_set; \
                int srcu_idx;                                        \
                                                                \
                might_sleep_if(check_sleep);                        \
                srcu_idx = srcu_read_lock(__tag_set->srcu);        \
                (dispatch_ops);                                        \
                srcu_read_unlock(__tag_set->srcu, srcu_idx);        \
        } else {                                                \
                rcu_read_lock();                                \
                (dispatch_ops);                                        \
                rcu_read_unlock();                                \
        }                                                        \
} while (0)

/*
 * Convenience form of __blk_mq_run_dispatch_ops() that always passes true
 * for check_sleep.
 *
 * The definition ends on the line below: no trailing line continuation, so
 * whatever follows the macro can never be spliced into its body.
 */
#define blk_mq_run_dispatch_ops(q, dispatch_ops)                \
        __blk_mq_run_dispatch_ops(q, true, dispatch_ops)

#endif





















    3 
   24 






   31 




















    1 

























    1 
























   26 





   27 







   26 






   26 










   26 

























   29 




   26 

   27 

   29 


   28 
   29 

   26 
   29 






   29 
   28 





   27 


   25 
    1 



































   26 



   27 
    1 

   24 







    1 





















    1 














    1 



    1 
    1 
    1 


    1 

    1 

    1 





































    3 




    3 



    3 
    3 








    3 
    3 






    3 



    3 


    3 





















    1 























    1 








    1 
    1 

    1 








    1 
    1 






    1 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/fs_struct.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include "mount.h"
#include "internal.h"

/*
 * Cursor for building a string backwards, from the end of a buffer towards
 * its start.
 *
 * @buf: current write position; data is stored immediately below it
 * @len: bytes still available below @buf; negative once an overflow occurred
 */
struct prepend_buffer {
        char *buf;
        int len;
};

/*
 * Declare a prepend_buffer called @__name that starts out empty, with its
 * cursor at the end of the @__len byte buffer @__buf. The arguments are
 * parenthesized so that arbitrary expressions expand correctly.
 */
#define DECLARE_BUFFER(__name, __buf, __len) \
        struct prepend_buffer __name = {.buf = (__buf) + (__len), .len = (__len)}

/*
 * Finish a prepend sequence: return the start of the assembled string, or
 * ERR_PTR(-ENAMETOOLONG) when the buffer was marked as overflowed
 * (negative length).
 */
static char *extract_string(struct prepend_buffer *p)
{
        return likely(p->len >= 0) ? p->buf : ERR_PTR(-ENAMETOOLONG);
}

/*
 * Prepend the single byte @c. Returns true on success. When no room is left
 * the buffer is marked as overflowed (len = -1) and false is returned.
 */
static bool prepend_char(struct prepend_buffer *p, unsigned char c)
{
        if (likely(p->len > 0)) {
                p->buf -= 1;
                p->buf[0] = c;
                p->len -= 1;
                return true;
        }

        p->len = -1;
        return false;
}

/*
 * Copy @len bytes from @src to @dst, tolerating a faulting source.
 *
 * The source can be an optimistic, lockless load of a dentry name and its
 * length. A concurrent rename may leave the two out of sync, in which case
 * the kernel copy can fault. The rename sequence count check performed by
 * the caller corrects the end result, so here the fault only needs to be
 * survived: the destination is filled with 'x' and false is returned.
 */
static bool prepend_copy(void *dst, const void *src, int len)
{
        long err = copy_from_kernel_nofault(dst, src, len);

        if (unlikely(err)) {
                memset(dst, 'x', len);
                return false;
        }

        return true;
}

/*
 * Prepend the @namelen bytes at @str.
 *
 * Returns false when the buffer had already overflowed, or when the data
 * does not fit. In the latter case as much of the tail of @str as fits is
 * still copied and the buffer is marked as overflowed. When the data fits,
 * the result of the fault-tolerant copy is returned.
 */
static bool prepend(struct prepend_buffer *p, const char *str, int namelen)
{
        int room = p->len;

        /* A previous call already overflowed the buffer. */
        if (room < 0)
                return false;

        if (namelen <= room) {
                /* The whole name fits. */
                p->len = room - namelen;
                p->buf -= namelen;
                return prepend_copy(p->buf, str, namelen);
        }

        /* Too long: keep only the last @room bytes of the name. */
        p->buf -= room;
        prepend_copy(p->buf, str + (namelen - room), room);
        p->len = -1;
        return false;
}

/**
 * prepend_name - prepend a pathname in front of current buffer pointer
 * @p: prepend buffer which contains buffer pointer and allocated length
 * @name: name string and length qstr structure
 *
 * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
 * make sure that either the old or the new name pointer and length are
 * fetched. However, there may be mismatch between length and pointer.
 * But since the length cannot be trusted, we need to copy the name very
 * carefully when doing the prepend_copy(). It also prepends "/" at
 * the beginning of the name. The sequence number check at the caller will
 * retry it again when a d_move() does happen. So any garbage in the buffer
 * due to mismatched pointer and length will be discarded.
 *
 * Load acquire is needed to make sure that we see the new name data even
 * if we might get the length wrong.
 */
static bool prepend_name(struct prepend_buffer *p, const struct qstr *name)
{
        const char *dname = smp_load_acquire(&name->name); /* ^^^ */
        u32 dlen = READ_ONCE(name->len);

        return prepend(p, dname, dlen) && prepend_char(p, '/');
}

/*
 * Walk from (@dentry, @mnt) up towards @root, prepending one "/name"
 * component per step into @p.
 *
 * Returns:
 *   0 - @root was reached, or the walk stopped early because prepend_name()
 *       failed (an overflow is then recorded in @p itself)
 *   1 - ran into the global root of a mounted, non-anonymous namespace
 *       without meeting @root
 *   2 - ran into a global root that is detached or not attached yet
 *   3 - found a dentry that is its own parent without being the root of
 *       @mnt ("escaped")
 *
 * Parent and mount pointers are sampled with READ_ONCE(); the caller is
 * responsible for validating the result against rename_lock/mount_lock.
 */
static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
                          const struct path *root, struct prepend_buffer *p)
{
        while (dentry != root->dentry || &mnt->mnt != root->mnt) {
                const struct dentry *parent = READ_ONCE(dentry->d_parent);

                /* At the root of this mount: cross over to the parent mount. */
                if (dentry == mnt->mnt.mnt_root) {
                        struct mount *m = READ_ONCE(mnt->mnt_parent);
                        struct mnt_namespace *mnt_ns;

                        if (likely(mnt != m)) {
                                dentry = READ_ONCE(mnt->mnt_mountpoint);
                                mnt = m;
                                continue;
                        }
                        /* Global root */
                        mnt_ns = READ_ONCE(mnt->mnt_ns);
                        /* open-coded is_mounted() to use local mnt_ns */
                        if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
                                return 1;        // absolute root
                        else
                                return 2;        // detached or not attached yet
                }

                if (unlikely(dentry == parent))
                        /* Escaped? */
                        return 3;

                prefetch(parent);
                /* Stop walking once the buffer can take no more data. */
                if (!prepend_name(p, &dentry->d_name))
                        break;
                dentry = parent;
        }
        return 0;
}

/**
 * prepend_path - Prepend path string to a buffer
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry
 * @p: prepend buffer which contains buffer pointer and allocated length
 *
 * The function will first try to write out the pathname without taking any
 * lock other than the RCU read lock to make sure that dentries won't go away.
 * It only checks the sequence number of the global rename_lock as any change
 * in the dentry's d_seq will be preceded by changes in the rename_lock
 * sequence number. If the sequence number had been changed, it will restart
 * the whole pathname back-tracing sequence again by taking the rename_lock.
 * In this case, there is no need to take the RCU read lock as the recursive
 * parent pointer references will keep the dentry chain alive as long as no
 * rename operation is performed.
 *
 * Returns the code produced by __prepend_path() (0, 1, 2 or 3). An overflow
 * of the buffer is not reported through the return value; it is recorded in
 * @p and surfaces when the string is extracted.
 */
static int prepend_path(const struct path *path,
                        const struct path *root,
                        struct prepend_buffer *p)
{
        unsigned seq, m_seq = 0;
        struct prepend_buffer b;
        int error;

        /* Paired with the unlock guarded by !(m_seq & 1) below. */
        rcu_read_lock();
restart_mnt:
        read_seqbegin_or_lock(&mount_lock, &m_seq);
        seq = 0;
        /* Paired with the unlock guarded by !(seq & 1) below. */
        rcu_read_lock();
restart:
        /* Work on a copy so that every retry starts from the caller's state. */
        b = *p;
        read_seqbegin_or_lock(&rename_lock, &seq);
        error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b);
        /* Even seq: this was the lockless pass, so leave its RCU section. */
        if (!(seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&rename_lock, seq)) {
                /* Odd seq makes the retry take rename_lock instead. */
                seq = 1;
                goto restart;
        }
        done_seqretry(&rename_lock, seq);

        if (!(m_seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&mount_lock, m_seq)) {
                m_seq = 1;
                goto restart_mnt;
        }
        done_seqretry(&mount_lock, m_seq);

        /* Escaped walk: throw away everything that was prepended. */
        if (unlikely(error == 3))
                b = *p;

        /* Nothing was prepended at all: the path is just "/". */
        if (b.len == p->len)
                prepend_char(&b, '/');

        *p = b;
        return error;
}

/**
 * __d_path - return the path of a dentry
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry
 * @buf: buffer to return value in
 * @buflen: buffer length
 *
 * Convert a dentry into an ASCII path name, relative to @root.
 *
 * Returns a pointer into the buffer, or an error pointer if the path was
 * too long. Returns %NULL if the path is not reachable from the supplied
 * root.
 *
 * "buflen" should be positive.
 */
char *__d_path(const struct path *path,
               const struct path *root,
               char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);
        int err;

        /* The string is built backwards, so the terminator goes in first. */
        prepend_char(&b, 0);
        err = prepend_path(path, root, &b);
        if (unlikely(err > 0))
                return NULL;

        return extract_string(&b);
}

/*
 * Build the path of @path against an empty root, so the walk can only end
 * at a global root. Ending at an absolute root (prepend_path() result 1) is
 * accepted; any higher result yields ERR_PTR(-EINVAL). Otherwise returns a
 * pointer into @buf, or an error pointer if the path was too long.
 */
char *d_absolute_path(const struct path *path,
               char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);
        struct path root = {};
        int err;

        /* The string is built backwards, so the terminator goes in first. */
        prepend_char(&b, 0);
        err = prepend_path(path, &root, &b);
        if (unlikely(err > 1))
                return ERR_PTR(-EINVAL);

        return extract_string(&b);
}

/*
 * Take a consistent snapshot of fs->root without taking fs->lock: the copy
 * is repeated until the fs->seq sequence count shows that no update raced
 * with it.
 *
 * No reference is taken on the copied path here (there is no path_get()).
 * NOTE(review): callers presumably rely on rcu_read_lock() to keep it valid,
 * as d_path() does - confirm for any new caller.
 */
static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
{
        unsigned seq;

        do {
                seq = read_seqcount_begin(&fs->seq);
                *root = fs->root;
        } while (read_seqcount_retry(&fs->seq, seq));
}

/**
 * d_path - return the path of a dentry
 * @path: path to report
 * @buf: buffer to return value in
 * @buflen: buffer length
 *
 * Convert a dentry into an ASCII path name. If the entry has been deleted
 * the string " (deleted)" is appended. Note that this is ambiguous.
 *
 * Returns a pointer into the buffer or an error code if the path was
 * too long. Note: Callers should use the returned pointer, not the passed
 * in buffer, to use the name! The implementation often starts at an offset
 * into the buffer, and may leave 0 bytes at the start.
 *
 * "buflen" should be positive.
 */
char *d_path(const struct path *path, char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);
        struct path root;

        /*
         * We have various synthetic filesystems that never get mounted.  On
         * these filesystems dentries are never used for lookup purposes, and
         * thus don't need to be hashed.  They also don't need a name until a
         * user wants to identify the object in /proc/pid/fd/.  The little hack
         * below allows us to generate a name for these objects on demand:
         *
         * Some pseudo inodes are mountable.  When they are mounted
         * path->dentry == path->mnt->mnt_root.  In that case don't call d_dname
         * and instead have d_path return the mounted path.
         */
        if (path->dentry->d_op && path->dentry->d_op->d_dname &&
            (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
                return path->dentry->d_op->d_dname(path->dentry, buf, buflen);

        rcu_read_lock();
        get_fs_root_rcu(current->fs, &root);
        if (unlikely(d_unlinked(path->dentry)))
                prepend(&b, " (deleted)", 11);
        else
                prepend_char(&b, 0);
        prepend_path(path, &root, &b);
        rcu_read_unlock();

        return extract_string(&b);
}
EXPORT_SYMBOL(d_path);

/*
 * Helper function for dentry_operations.d_dname() members
 */
char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...)
{
        char scratch[64];
        va_list ap;
        int needed;

        /* Format into a bounded scratch area first. */
        va_start(ap, fmt);
        needed = vsnprintf(scratch, sizeof(scratch), fmt, ap);
        va_end(ap);

        /* vsnprintf() does not count the terminating NUL; we copy it too. */
        needed++;

        /* Reject names that fit neither the scratch area nor the caller's buffer. */
        if (needed > sizeof(scratch) || needed > buflen)
                return ERR_PTR(-ENAMETOOLONG);

        /* Place the string right-aligned at the end of the caller's buffer. */
        return memcpy(buffer + (buflen - needed), scratch, needed);
}

/*
 * Produce "/<d_name> (deleted)" for @dentry in @buffer of size @buflen.
 * The " (deleted)" suffix is added unconditionally.  Returns whatever
 * extract_string() yields for the filled buffer.
 */
char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
{
        DECLARE_BUFFER(b, buffer, buflen);
        /* these dentries are never renamed, so d_lock is not needed */
        /* pieces are prepended, hence emitted last-to-first; 11 = 10 chars + NUL */
        prepend(&b, " (deleted)", 11);
        prepend(&b, dentry->d_name.name, dentry->d_name.len);
        prepend_char(&b, '/');
        return extract_string(&b);
}

/*
 * Write full pathname from the root of the filesystem into the buffer.
 *
 * @d: dentry whose path is generated
 * @p: prepend buffer already holding the tail the caller wants (callers in
 *     this file pre-load a NUL or "//deleted"); *p itself is not modified
 *
 * Walks the d_parent chain until IS_ROOT() and prepends each component.
 * The walk operates on a private copy of *p so that it can start over from
 * scratch when rename_lock asks for a retry.  If nothing was prepended
 * (@d is already a root) a single '/' is emitted.  Returns whatever
 * extract_string() yields for the resulting buffer.
 */
static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p)
{
        const struct dentry *dentry;
        struct prepend_buffer b;
        int seq = 0;

        rcu_read_lock();
restart:
        /* (re)start from the original dentry with a fresh copy of the buffer */
        dentry = d;
        b = *p;
        read_seqbegin_or_lock(&rename_lock, &seq);
        while (!IS_ROOT(dentry)) {
                const struct dentry *parent = dentry->d_parent;

                prefetch(parent);
                /* stop walking once prepend_name() reports failure */
                if (!prepend_name(&b, &dentry->d_name))
                        break;
                dentry = parent;
        }
        /*
         * Drop the RCU read lock taken above only while seq is even.  On a
         * retry seq is forced to 1 below, so this is skipped the second time
         * round (the restart label sits after rcu_read_lock()).
         */
        if (!(seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&rename_lock, seq)) {
                /* NOTE(review): odd seq presumably selects the locked slow path - confirm */
                seq = 1;
                goto restart;
        }
        done_seqretry(&rename_lock, seq);
        /* nothing was added by the walk: represent the root as "/" */
        if (b.len == p->len)
                prepend_char(&b, '/');
        return extract_string(&b);
}

/*
 * Build the path of @dentry (via __dentry_path()) into @buf of size
 * @buflen, with no suffix other than the terminating NUL.
 */
char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);

        /* the string is built by prepending, so the NUL goes in first */
        prepend_char(&b, 0);
        return __dentry_path(dentry, &b);
}
EXPORT_SYMBOL(dentry_path_raw);

/*
 * Build the path of @dentry (via __dentry_path()) into @buf of size
 * @buflen.  An unlinked dentry gets "//deleted" as the tail of its path;
 * otherwise the string simply ends after the last component.
 */
char *dentry_path(const struct dentry *dentry, char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);

        /* The tail of the string is prepended before the path itself. */
        if (likely(!d_unlinked(dentry)))
                prepend_char(&b, 0);
        else
                prepend(&b, "//deleted", 10);   /* 9 characters + NUL */

        return __dentry_path(dentry, &b);
}

/*
 * Copy fs->root and fs->pwd into *root and *pwd as one snapshot: both
 * copies are repeated until fs->seq reports that no retry is needed.
 */
static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
                                    struct path *pwd)
{
        unsigned seq;

        for (;;) {
                seq = read_seqcount_begin(&fs->seq);
                *root = fs->root;
                *pwd = fs->pwd;
                if (!read_seqcount_retry(&fs->seq, seq))
                        break;
        }
}

/*
 * NOTE! The user-level library version returns a
 * character pointer. The kernel system call just
 * returns the length of the buffer filled (which
 * includes the ending '\0' character), or a negative
 * error value. So libc would do something like
 *
 *        char *getcwd(char * buf, size_t size)
 *        {
 *                int retval;
 *
 *                retval = sys_getcwd(buf, size);
 *                if (retval >= 0)
 *                        return buf;
 *                errno = -retval;
 *                return NULL;
 *        }
 */
SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
{
        struct path pwd, root;
        char *page;
        int error;

        /* Scratch space in which the path is assembled. */
        page = __getname();
        if (!page)
                return -ENOMEM;

        rcu_read_lock();
        get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);

        if (likely(!d_unlinked(pwd.dentry))) {
                DECLARE_BUFFER(b, page, PATH_MAX);
                unsigned len;

                prepend_char(&b, 0);
                /* A positive prepend_path() result gets an "(unreachable)" prefix. */
                if (unlikely(prepend_path(&pwd, &root, &b) > 0))
                        prepend(&b, "(unreachable)", 13);
                rcu_read_unlock();

                /*
                 * Bytes used, including the NUL.  len is unsigned, so a b.len
                 * below zero (presumably: the path did not fit) shows up as a
                 * value above PATH_MAX.
                 */
                len = PATH_MAX - b.len;
                if (unlikely(len > PATH_MAX)) {
                        error = -ENAMETOOLONG;
                } else if (unlikely(len > size)) {
                        error = -ERANGE;
                } else if (copy_to_user(buf, b.buf, len)) {
                        error = -EFAULT;
                } else {
                        error = len;
                }
        } else {
                /* The current directory has been unlinked. */
                rcu_read_unlock();
                error = -ENOENT;
        }

        __putname(page);
        return error;
}


































































































































































































































































































































































































    1 



    2 

    2 


    2 

















    2 























    3 




































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2007 Casey Schaufler <casey@schaufler-ca.com>
 *
 * Author:
 *      Casey Schaufler <casey@schaufler-ca.com>
 */

#ifndef _SECURITY_SMACK_H
#define _SECURITY_SMACK_H

#include <linux/capability.h>
#include <linux/spinlock.h>
#include <linux/lsm_hooks.h>
#include <linux/in.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <linux/in6.h>
#endif /* CONFIG_IPV6 */
#include <net/netlabel.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/lsm_audit.h>
#include <linux/msg.h>

/*
 * Use IPv6 port labeling if IPv6 is enabled and secmarks
 * are not being used.
 */
#if IS_ENABLED(CONFIG_IPV6) && !defined(CONFIG_SECURITY_SMACK_NETFILTER)
#define SMACK_IPV6_PORT_LABELING 1
#endif

#if IS_ENABLED(CONFIG_IPV6) && defined(CONFIG_SECURITY_SMACK_NETFILTER)
#define SMACK_IPV6_SECMARK_LABELING 1
#endif

/*
 * Smack labels were limited to 23 characters for a long time.
 */
#define SMK_LABELLEN        24
#define SMK_LONGLABEL        256

/*
 * This is the repository for labels seen so that it is
 * not necessary to keep allocating tiny chunks of memory
 * and so that they can be shared.
 *
 * Labels are never modified in place. Anytime a label
 * is imported (e.g. xattrset on a file) the list is checked
 * for it and it is added if it doesn't exist. The address
 * is passed out in either case. Entries are added, but
 * never deleted.
 *
 * Since labels are hanging around anyway it doesn't
 * hurt to maintain a secid for those awkward situations
 * where kernel components that ought to use LSM independent
 * interfaces don't. The secid should go away when all of
 * these components have been repaired.
 *
 * The cipso value associated with the label gets stored here, too.
 *
 * Keep the access rules for this subject label here so that
 * the entire set of rules does not need to be examined every
 * time.
 */
struct smack_known {
        struct list_head                list;                /* list linkage; presumably on smack_known_list - confirm */
        struct hlist_node                smk_hashed;        /* hash linkage; presumably in smack_known_hash - confirm */
        char                                *smk_known;        /* the label text */
        u32                                smk_secid;        /* secid maintained for this label */
        struct netlbl_lsm_secattr        smk_netlabel;        /* on wire labels */
        struct list_head                smk_rules;        /* access rules */
        struct mutex                        smk_rules_lock;        /* lock for rules */
};

/*
 * Maximum number of bytes for the levels in a CIPSO IP option.
 * Why 23? CIPSO is constrained to 30, so a 32 byte buffer is
 * bigger than can be used, and 24 is the next lower multiple
 * of 8, and there are too many issues if there isn't space set
 * aside for the terminating null byte.
 */
#define SMK_CIPSOLEN        24

/*
 * Per-superblock Smack data: four label pointers and a flags word.
 * NOTE(review): the field names mirror the Opt_fsroot/Opt_fsfloor/
 * Opt_fshat/Opt_fsdefault identifiers declared later in this header;
 * presumably each holds the label given by that mount option - confirm.
 */
struct superblock_smack {
        struct smack_known        *smk_root;
        struct smack_known        *smk_floor;
        struct smack_known        *smk_hat;
        struct smack_known        *smk_default;
        int                        smk_flags;        /* presumably SMK_SB_* bits - confirm */
};

/*
 * Superblock flags
 */
#define SMK_SB_INITIALIZED        0x01
#define SMK_SB_UNTRUSTED        0x02

/*
 * Smack data kept per socket: outbound, inbound and TCP peer labels plus
 * the netlabel state of the socket.
 */
struct socket_smack {
        struct smack_known        *smk_out;        /* outbound label */
        struct smack_known        *smk_in;        /* inbound label */
        struct smack_known        *smk_packet;        /* TCP peer label */
        int                        smk_state;        /* netlabel socket states; presumably one of SMK_NETLBL_* */
};
#define        SMK_NETLBL_UNSET        0
#define        SMK_NETLBL_UNLABELED        1
#define        SMK_NETLBL_LABELED        2
#define        SMK_NETLBL_REQSKB        3

/*
 * Inode smack data: the labels associated with an inode plus a flags word
 * (presumably SMK_INODE_* bits - confirm).
 */
struct inode_smack {
        struct smack_known        *smk_inode;        /* label of the fso */
        struct smack_known        *smk_task;        /* label of the task */
        struct smack_known        *smk_mmap;        /* label of the mmap domain */
        int                        smk_flags;        /* smack inode flags */
};

/*
 * Smack data kept per task credential (this is the type smack_cred()
 * returns): the task's labels, its private rule list with the lock for
 * that list, and a list of labels it may transition to.
 */
struct task_smack {
        struct smack_known        *smk_task;        /* label for access control */
        struct smack_known        *smk_forked;        /* label when forked */
        struct smack_known        *smk_transmuted;/* label when transmuted */
        struct list_head        smk_rules;        /* per task access rules */
        struct mutex                smk_rules_lock;        /* lock for the rules */
        struct list_head        smk_relabel;        /* transit allowed labels */
};

#define        SMK_INODE_INSTANT        0x01        /* inode is instantiated */
#define        SMK_INODE_TRANSMUTE        0x02        /* directory is transmuting */
#define        SMK_INODE_CHANGED        0x04        /* smack was transmuted (unused) */
#define        SMK_INODE_IMPURE        0x08        /* involved in an impure transaction */

/*
 * A label access rule: the access mode recorded for one subject label
 * against one object label.
 */
struct smack_rule {
        struct list_head        list;                /* linkage in a rule list */
        struct smack_known        *smk_subject;        /* subject label */
        struct smack_known        *smk_object;        /* object label */
        int                        smk_access;        /* access mode; presumably a mask of MAY_* bits - confirm */
};

/*
 * An entry in the table identifying IPv4 hosts: a network address and
 * mask together with the label assigned to matching hosts.
 */
struct smk_net4addr {
        struct list_head        list;                /* presumably linkage on smk_net4addr_list - confirm */
        struct in_addr                smk_host;        /* network address */
        struct in_addr                smk_mask;        /* network mask */
        int                        smk_masks;        /* mask size */
        struct smack_known        *smk_label;        /* label */
};

/*
 * An entry in the table identifying IPv6 hosts: a network address and
 * mask together with the label assigned to matching hosts.
 */
struct smk_net6addr {
        struct list_head        list;                /* presumably linkage on smk_net6addr_list - confirm */
        struct in6_addr                smk_host;        /* network address */
        struct in6_addr                smk_mask;        /* network mask */
        int                        smk_masks;        /* mask size */
        struct smack_known        *smk_label;        /* label */
};

/*
 * An entry in the table identifying ports: ties a port number and socket
 * to the inbound and outgoing labels used for it.
 */
struct smk_port_label {
        struct list_head        list;                /* linkage in the port table */
        struct sock                *smk_sock;        /* socket initialized on */
        unsigned short                smk_port;        /* the port number */
        struct smack_known        *smk_in;        /* inbound label */
        struct smack_known        *smk_out;        /* outgoing label */
        short                        smk_sock_type;        /* Socket type */
        short                        smk_can_reuse;        /* NOTE(review): presumably marks the entry as reusable - confirm */
};

/*
 * List node that refers to a smack_known label by pointer, so that a
 * label can be a member of a list without embedding the linkage in
 * struct smack_known itself.
 */
struct smack_known_list_elem {
        struct list_head        list;                /* list linkage */
        struct smack_known        *smk_label;        /* the label referred to */
};

/*
 * Option identifiers.
 * NOTE(review): judging by the names these are presumably the tokens for
 * the Smack mount options (fsdefault, fsfloor, fshat, fsroot,
 * fstransmute), with Opt_error for an unrecognised option - confirm
 * against the option parser.
 */
enum {
        Opt_error = -1,
        Opt_fsdefault = 0,
        Opt_fsfloor = 1,
        Opt_fshat = 2,
        Opt_fsroot = 3,
        Opt_fstransmute = 4,
};

#define SMACK_DELETE_OPTION        "-DELETE"
#define SMACK_CIPSO_OPTION         "-CIPSO"

/*
 * CIPSO defaults.
 */
#define SMACK_CIPSO_DOI_DEFAULT                3        /* Historical */
#define SMACK_CIPSO_DOI_INVALID                -1        /* Not a DOI */
#define SMACK_CIPSO_DIRECT_DEFAULT        250        /* Arbitrary */
#define SMACK_CIPSO_MAPPED_DEFAULT        251        /* Also arbitrary */
#define SMACK_CIPSO_MAXLEVEL            255     /* CIPSO 2.2 standard */
/*
 * CIPSO 2.2 standard is 239, but Smack wants to use the
 * categories in a structured way that limits the value to
 * the bits in 23 bytes, hence the unusual number.
 */
#define SMACK_CIPSO_MAXCATNUM           184     /* 23 * 8 */

/*
 * Ptrace rules
 */
#define SMACK_PTRACE_DEFAULT        0
#define SMACK_PTRACE_EXACT        1
#define SMACK_PTRACE_DRACONIAN        2
#define SMACK_PTRACE_MAX        SMACK_PTRACE_DRACONIAN

/*
 * Flags for untraditional access modes.
 * It shouldn't be necessary to avoid conflicts with definitions
 * in fs.h, but do so anyway.
 */
#define MAY_TRANSMUTE        0x00001000        /* Controls directory labeling */
#define MAY_LOCK        0x00002000        /* Locks should be writes, but ... */
#define MAY_BRINGUP        0x00004000        /* Report use of this rule */

/*
 * The policy for delivering signals is configurable.
 * It is usually "write", but can be "append".
 */
#ifdef CONFIG_SECURITY_SMACK_APPEND_SIGNALS
#define MAY_DELIVER        MAY_APPEND        /* Signal delivery requires append */
#else
#define MAY_DELIVER        MAY_WRITE        /* Signal delivery requires write */
#endif

#define SMACK_BRINGUP_ALLOW                1        /* Allow bringup mode */
#define SMACK_UNCONFINED_SUBJECT        2        /* Allow unconfined label */
#define SMACK_UNCONFINED_OBJECT                3        /* Allow unconfined label */

/*
 * Just to make the common cases easier to deal with
 */
#define MAY_ANYREAD        (MAY_READ | MAY_EXEC)
#define MAY_READWRITE        (MAY_READ | MAY_WRITE)
#define MAY_NOT                0

/*
 * Number of access types used by Smack (rwxatlb)
 */
#define SMK_NUM_ACCESS_TYPE 7

/*
 * SMACK data attached to an audit record (smk_ad_init() points
 * common_audit_data at one of these and fills in @function).
 */
struct smack_audit_data {
        const char *function;        /* set by smk_ad_init() from its func argument */
        char *subject;                /* presumably the subject label text - confirm */
        char *object;                /* presumably the object label text - confirm */
        char *request;                /* presumably the requested access as text - confirm */
        int result;                /* presumably the outcome of the access check - confirm */
};

/*
 * Smack audit data; is empty if CONFIG_AUDIT not set
 * to save some stack
 */
struct smk_audit_info {
#ifdef CONFIG_AUDIT
        struct common_audit_data a;
        struct smack_audit_data sad;
#endif
};

/*
 * These functions are in smack_access.c
 */
int smk_access_entry(char *, char *, struct list_head *);
int smk_access(struct smack_known *, struct smack_known *,
               int, struct smk_audit_info *);
int smk_tskacc(struct task_smack *, struct smack_known *,
               u32, struct smk_audit_info *);
int smk_curacc(struct smack_known *, u32, struct smk_audit_info *);
struct smack_known *smack_from_secid(const u32);
char *smk_parse_smack(const char *string, int len);
int smk_netlbl_mls(int, char *, struct netlbl_lsm_secattr *, int);
struct smack_known *smk_import_entry(const char *, int);
void smk_insert_entry(struct smack_known *skp);
struct smack_known *smk_find_entry(const char *);
bool smack_privileged(int cap);
bool smack_privileged_cred(int cap, const struct cred *cred);
void smk_destroy_label_list(struct list_head *list);
int smack_populate_secattr(struct smack_known *skp);

/*
 * Shared data.
 */
extern int smack_enabled __initdata;
extern int smack_cipso_direct;
extern int smack_cipso_mapped;
extern struct smack_known *smack_net_ambient;
extern struct smack_known *smack_syslog_label;
#ifdef CONFIG_SECURITY_SMACK_BRINGUP
extern struct smack_known *smack_unconfined;
#endif
extern int smack_ptrace_rule;
extern struct lsm_blob_sizes smack_blob_sizes;

extern struct smack_known smack_known_floor;
extern struct smack_known smack_known_hat;
extern struct smack_known smack_known_huh;
extern struct smack_known smack_known_star;
extern struct smack_known smack_known_web;

extern struct mutex        smack_known_lock;
extern struct list_head smack_known_list;
extern struct list_head smk_net4addr_list;
extern struct list_head smk_net6addr_list;

extern struct mutex     smack_onlycap_lock;
extern struct list_head smack_onlycap_list;

#define SMACK_HASH_SLOTS 16
extern struct hlist_head smack_known_hash[SMACK_HASH_SLOTS];
extern struct kmem_cache *smack_rule_cache;

/*
 * Return Smack's task_smack for @cred: the credential's security blob
 * offset by smack_blob_sizes.lbs_cred.
 */
static inline struct task_smack *smack_cred(const struct cred *cred)
{
        void *blob = cred->security;

        return blob + smack_blob_sizes.lbs_cred;
}

/*
 * Return the slot holding the Smack label pointer for @file: the file's
 * security blob offset by smack_blob_sizes.lbs_file.
 */
static inline struct smack_known **smack_file(const struct file *file)
{
        int offset = smack_blob_sizes.lbs_file;

        return (struct smack_known **)(file->f_security + offset);
}

/*
 * Return Smack's inode_smack for @inode: the inode's security blob offset
 * by smack_blob_sizes.lbs_inode.
 */
static inline struct inode_smack *smack_inode(const struct inode *inode)
{
        void *blob = inode->i_security;

        return blob + smack_blob_sizes.lbs_inode;
}

/*
 * Return the slot holding the Smack label pointer for @msg: the message's
 * security blob offset by smack_blob_sizes.lbs_msg_msg.
 */
static inline struct smack_known **smack_msg_msg(const struct msg_msg *msg)
{
        void *blob = msg->security;

        return blob + smack_blob_sizes.lbs_msg_msg;
}

/*
 * Return the slot holding the Smack label pointer for @ipc: the IPC
 * object's security blob offset by smack_blob_sizes.lbs_ipc.
 */
static inline struct smack_known **smack_ipc(const struct kern_ipc_perm *ipc)
{
        void *blob = ipc->security;

        return blob + smack_blob_sizes.lbs_ipc;
}

/*
 * Return Smack's superblock_smack for @superblock: the superblock's
 * security blob offset by smack_blob_sizes.lbs_superblock.
 */
static inline struct superblock_smack *smack_superblock(
                                        const struct super_block *superblock)
{
        void *blob = superblock->s_security;

        return blob + smack_blob_sizes.lbs_superblock;
}

/*
 * Is the directory transmuting?
 */
static inline int smk_inode_transmutable(const struct inode *isp)
{
        struct inode_smack *sip = smack_inode(isp);
        return (sip->smk_flags & SMK_INODE_TRANSMUTE) != 0;
}

/*
 * Present a pointer to the smack label entry in an inode blob.
 */
static inline struct smack_known *smk_of_inode(const struct inode *isp)
{
        struct inode_smack *sip = smack_inode(isp);
        return sip->smk_inode;
}

/*
 * Present a pointer to the smack label entry (smk_task) in a task blob.
 */
static inline struct smack_known *smk_of_task(const struct task_smack *tsp)
{
        return tsp->smk_task;
}

/*
 * Return the smack label entry of task @t, read from the credentials that
 * __task_cred() yields.  The credentials are only dereferenced between
 * rcu_read_lock() and rcu_read_unlock(); the label pointer is returned
 * after the unlock.
 */
static inline struct smack_known *smk_of_task_struct_obj(
                                                const struct task_struct *t)
{
        struct smack_known *label;

        rcu_read_lock();
        label = smk_of_task(smack_cred(__task_cred(t)));
        rcu_read_unlock();

        return label;
}

/*
 * Present a pointer to the forked smack label entry (smk_forked) in a
 * task blob.
 */
static inline struct smack_known *smk_of_forked(const struct task_smack *tsp)
{
        return tsp->smk_forked;
}

/*
 * Return the smack label entry of the current task, taken from the Smack
 * blob of current_cred().
 */
static inline struct smack_known *smk_of_current(void)
{
        const struct cred *cred = current_cred();

        return smk_of_task(smack_cred(cred));
}

/*
 * logging functions
 */
#define SMACK_AUDIT_DENIED 0x1
#define SMACK_AUDIT_ACCEPT 0x2
extern int log_policy;

void smack_log(char *subject_label, char *object_label,
                int request,
                int result, struct smk_audit_info *auditdata);

#ifdef CONFIG_AUDIT

/*
 * some inline functions to set up audit data
 * they do nothing if CONFIG_AUDIT is not set
 *
 */
static inline void smk_ad_init(struct smk_audit_info *a, const char *func,
                               char type)
{
        memset(&a->sad, 0, sizeof(a->sad));
        a->a.type = type;
        a->a.smack_audit_data = &a->sad;
        a->a.smack_audit_data->function = func;
}

/*
 * Same as smk_ad_init(), and additionally zero the caller-provided network
 * audit structure @net and attach it to the common audit data.
 */
static inline void smk_ad_init_net(struct smk_audit_info *a, const char *func,
                                   char type, struct lsm_network_audit *net)
{
        smk_ad_init(a, func, type);
        memset(net, 0, sizeof(*net));
        a->a.u.net = net;
}

/* Record task @t in the common audit data of @a. */
static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a,
                                         struct task_struct *t)
{
        a->a.u.tsk = t;
}
/* Record dentry @d in the common audit data of @a. */
static inline void smk_ad_setfield_u_fs_path_dentry(struct smk_audit_info *a,
                                                    struct dentry *d)
{
        a->a.u.dentry = d;
}
/* Record inode @i in the common audit data of @a. */
static inline void smk_ad_setfield_u_fs_inode(struct smk_audit_info *a,
                                              struct inode *i)
{
        a->a.u.inode = i;
}
/* Record path @p (copied by value) in the common audit data of @a. */
static inline void smk_ad_setfield_u_fs_path(struct smk_audit_info *a,
                                             struct path p)
{
        a->a.u.path = p;
}
/*
 * Record socket @sk in the network audit structure of @a.  This writes
 * through a->a.u.net, which smk_ad_init_net() sets up.
 */
static inline void smk_ad_setfield_u_net_sk(struct smk_audit_info *a,
                                            struct sock *sk)
{
        a->a.u.net->sk = sk;
}

#else /* no AUDIT */

/*
 * Without CONFIG_AUDIT struct smk_audit_info is empty, so the helpers
 * below are no-ops with the same signatures as the audit-enabled versions.
 * Note that no stub for smk_ad_init_net() is provided in this branch.
 */
static inline void smk_ad_init(struct smk_audit_info *a, const char *func,
                               char type)
{
}
static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a,
                                         struct task_struct *t)
{
}
static inline void smk_ad_setfield_u_fs_path_dentry(struct smk_audit_info *a,
                                                    struct dentry *d)
{
}
static inline void smk_ad_setfield_u_fs_inode(struct smk_audit_info *a,
                                              struct inode *i)
{
}
static inline void smk_ad_setfield_u_fs_path(struct smk_audit_info *a,
                                             struct path p)
{
}
static inline void smk_ad_setfield_u_net_sk(struct smk_audit_info *a,
                                            struct sock *sk)
{
}
#endif

#endif  /* _SECURITY_SMACK_H */
















































































































    4 










    2 
    2 










    1 
    1 



































    4 

    2 








    4 

















    2 



























































    3 



    2 
    1 
    1 



































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  

  linux/include/linux/rbtree.h

  To use rbtrees you'll have to implement your own insert and search cores.
  This avoids the use of callbacks, which would dramatically reduce
  performance. I know it's not the cleanest way, but in C (not in C++) this
  is how to get both performance and genericity...

  See Documentation/core-api/rbtree.rst for documentation and samples.
*/

#ifndef        _LINUX_RBTREE_H
#define        _LINUX_RBTREE_H

#include <linux/container_of.h>
#include <linux/rbtree_types.h>

#include <linux/stddef.h>
#include <linux/rcupdate.h>

#define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))

#define        rb_entry(ptr, type, member) container_of(ptr, type, member)

#define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)

/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
#define RB_EMPTY_NODE(node)  \
        ((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node)  \
        ((node)->__rb_parent_color = (unsigned long)(node))


extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);


/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);

/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);

/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
                            struct rb_root *root);
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
                                struct rb_root *root);

/*
 * Hook @node into the tree as a leaf: clear both child pointers, store
 * @parent in __rb_parent_color and make the slot @rb_link point at @node.
 * No rebalancing or recolouring is done here.
 */
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
                                struct rb_node **rb_link)
{
        node->rb_left = NULL;
        node->rb_right = NULL;
        node->__rb_parent_color = (unsigned long)parent;

        *rb_link = node;
}

/*
 * Same as rb_link_node(), except that the slot @rb_link is updated with
 * rcu_assign_pointer() after @node has been fully initialised.
 */
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
                                    struct rb_node **rb_link)
{
        node->rb_left = NULL;
        node->rb_right = NULL;
        node->__rb_parent_color = (unsigned long)parent;

        /* Publish only once the node's own fields are set. */
        rcu_assign_pointer(*rb_link, node);
}

#define rb_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           ____ptr ? rb_entry(____ptr, type, member) : NULL; \
        })

/**
 * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
 * given type allowing the backing memory of @pos to be invalidated
 *
 * @pos:        the 'type *' to use as a loop cursor.
 * @n:                another 'type *' to use as temporary storage
 * @root:        'rb_root *' of the rbtree.
 * @field:        the name of the rb_node field within 'type'.
 *
 * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
 * list_for_each_entry_safe() and allows the iteration to continue independent
 * of changes to @pos by the body of the loop.
 *
 * Note, however, that it cannot handle other modifications that re-order the
 * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
 * rb_erase() may rebalance the tree, causing us to miss some nodes.
 */
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
        for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
             pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
                        typeof(*pos), field); 1; }); \
             pos = n)

/* Same as rb_first(), but O(1) */
#define rb_first_cached(root) (root)->rb_leftmost

/*
 * rb_insert_color() for a leftmost-cached tree: when the caller says @node
 * is the new leftmost node, record it in the cache before the insertion is
 * completed on the embedded rb_root.
 */
static inline void rb_insert_color_cached(struct rb_node *node,
                                          struct rb_root_cached *root,
                                          bool leftmost)
{
        struct rb_root *tree = &root->rb_root;

        if (leftmost)
                root->rb_leftmost = node;

        rb_insert_color(node, tree);
}


/*
 * rb_erase() for a leftmost-cached tree.  If @node is the cached leftmost
 * node, the cache is moved to rb_next(@node) before the erase.  Returns
 * the new leftmost node in that case and NULL otherwise.
 */
static inline struct rb_node *
rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
{
        struct rb_node *new_leftmost = NULL;

        if (root->rb_leftmost == node) {
                new_leftmost = rb_next(node);
                root->rb_leftmost = new_leftmost;
        }

        rb_erase(node, &root->rb_root);

        return new_leftmost;
}

/*
 * rb_replace_node() for a leftmost-cached tree: if @victim is the cached
 * leftmost node, @new takes over that role before the replacement is done
 * on the embedded rb_root.
 */
static inline void rb_replace_node_cached(struct rb_node *victim,
                                          struct rb_node *new,
                                          struct rb_root_cached *root)
{
        struct rb_root *tree = &root->rb_root;

        if (root->rb_leftmost == victim)
                root->rb_leftmost = new;

        rb_replace_node(victim, new, tree);
}

/*
 * The below helper functions use 2 operators with 3 different
 * calling conventions. The operators are related like:
 *
 *        comp(a->key,b) < 0  := less(a,b)
 *        comp(a->key,b) > 0  := less(b,a)
 *        comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
 *
 * If these operators define a partial order on the elements we make no
 * guarantee on which of the elements matching the key is found. See
 * rb_find().
 *
 * The reason for this is to allow the find() interface without requiring an
 * on-stack dummy object, which might not be feasible due to object size.
 */

/**
 * rb_add_cached() - insert @node into the leftmost cached tree @tree
 * @node: node to insert
 * @tree: leftmost cached tree to insert @node into
 * @less: operator defining the (partial) node order
 *
 * The descent goes left only while @less(@node, parent) holds; any step to
 * the right means @node cannot be the new leftmost entry.
 *
 * Returns @node when it is the new leftmost, or NULL.
 */
static __always_inline struct rb_node *
rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
              bool (*less)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_root.rb_node;
        struct rb_node *above = NULL;
        bool went_right = false;

        /* Walk down to the empty slot where @node belongs. */
        while (*slot) {
                above = *slot;
                if (less(node, above)) {
                        slot = &above->rb_left;
                        continue;
                }
                slot = &above->rb_right;
                went_right = true;
        }

        rb_link_node(node, above, slot);
        rb_insert_color_cached(node, tree, !went_right);

        if (went_right)
                return NULL;
        return node;
}

/**
 * rb_add() - insert @node into @tree
 * @node: node to insert
 * @tree: tree to insert @node into
 * @less: operator defining the (partial) node order
 *
 * Descends left whenever @less(@node, parent) holds and right otherwise,
 * then links @node into the empty slot that was reached.
 */
static __always_inline void
rb_add(struct rb_node *node, struct rb_root *tree,
       bool (*less)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *above = NULL;

        while (*slot) {
                above = *slot;
                slot = less(node, above) ? &above->rb_left : &above->rb_right;
        }

        rb_link_node(node, above, slot);
        rb_insert_color(node, tree);
}

/**
 * rb_find_add() - find equivalent @node in @tree, or add @node
 * @node: node to look-for / insert
 * @tree: tree to search / modify
 * @cmp: operator defining the node order
 *
 * @tree is left unmodified when an equivalent node (one for which @cmp
 * returns 0) is already present.
 *
 * Returns the rb_node matching @node, or NULL when no match is found and @node
 * is inserted.
 */
static __always_inline struct rb_node *
rb_find_add(struct rb_node *node, struct rb_root *tree,
            int (*cmp)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *above = NULL;

        while (*slot) {
                int c;

                above = *slot;
                c = cmp(node, above);

                /* An equivalent node already exists: report it, insert nothing. */
                if (c == 0)
                        return above;

                slot = (c < 0) ? &above->rb_left : &above->rb_right;
        }

        rb_link_node(node, above, slot);
        rb_insert_color(node, tree);
        return NULL;
}

/**
 * rb_find() - find @key in tree @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining the node order
 *
 * The search stops at the first node for which @cmp returns 0; a negative
 * result continues in the left subtree, a positive one in the right.
 *
 * Returns the rb_node matching @key or NULL.
 */
static __always_inline struct rb_node *
rb_find(const void *key, const struct rb_root *tree,
        int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *cur;

        for (cur = tree->rb_node; cur; ) {
                int c = cmp(key, cur);

                if (c == 0)
                        return cur;

                cur = (c < 0) ? cur->rb_left : cur->rb_right;
        }

        return NULL;
}

/**
 * rb_find_first() - find the first @key in @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining node order
 *
 * Unlike rb_find(), the descent does not stop at the first match: a match
 * is remembered and the search carries on into the left subtree.
 *
 * Returns the leftmost node matching @key, or NULL.
 */
static __always_inline struct rb_node *
rb_find_first(const void *key, const struct rb_root *tree,
              int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *cur = tree->rb_node;
        struct rb_node *best = NULL;

        while (cur) {
                int c = cmp(key, cur);

                if (c > 0) {
                        cur = cur->rb_right;
                        continue;
                }

                /* c <= 0: record a match, then keep looking further left. */
                if (c == 0)
                        best = cur;
                cur = cur->rb_left;
        }

        return best;
}

/**
 * rb_next_match() - find the next node matching @key after @node
 * @key: key to match
 * @node: node to continue from; only rb_next(@node) is examined
 * @cmp: operator defining node order
 *
 * Intended for stepping through consecutive nodes that compare equal to
 * @key, as done by rb_for_each().
 *
 * Returns the next node matching @key, or NULL.
 */
static __always_inline struct rb_node *
rb_next_match(const void *key, struct rb_node *node,
              int (*cmp)(const void *key, const struct rb_node *))
{
        node = rb_next(node);
        /* A non-zero @cmp result means the following node no longer matches. */
        if (node && cmp(key, node))
                node = NULL;
        return node;
}

/**
 * rb_for_each() - iterates a subtree matching @key
 * @node: iterator
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining node order
 *
 * Starts at rb_find_first(@key, @tree, @cmp) and advances with
 * rb_next_match() until that returns NULL.  @node, @key and @cmp appear in
 * both the initial and the step expression, so they are evaluated more
 * than once.
 */
#define rb_for_each(node, key, tree, cmp) \
        for ((node) = rb_find_first((key), (tree), (cmp)); \
             (node); (node) = rb_next_match((key), (node), (cmp)))

#endif        /* _LINUX_RBTREE_H */























































































































































    3 
























































    1 






























































































































    1 
    1 












































































































    3 


















    3 







    3 








    2 


















    2 

























    4 












































    1 




    1 

    1 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/tty.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
#include <linux/stackprotector.h>
#include <linux/user_events.h>
#include <linux/iommu.h>
#include <linux/rseq.h>
#include <uapi/linux/pidfd.h>
#include <linux/pidfs.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include <trace/events/sched.h>

#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK

/*
 * Counters protected by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;        /* Handle normal Linux uptimes. */
int nr_threads;                        /* The idle threads do not count.. */

static int max_threads;                /* tunable limit on nr_threads */

/* Designated initializer: slot [x] holds the stringified name of x. */
#define NAMED_ARRAY_INDEX(x)        [x] = __stringify(x)

/*
 * Names of the MM_* counters listed below; each entry is stored at the
 * index given by the counter constant itself.
 */
static const char * const resident_page_types[] = {
        NAMED_ARRAY_INDEX(MM_FILEPAGES),
        NAMED_ARRAY_INDEX(MM_ANONPAGES),
        NAMED_ARRAY_INDEX(MM_SWAPENTS),
        NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */

#ifdef CONFIG_PROVE_RCU
/*
 * Report the result of lockdep_is_held() for tasklist_lock.  Only built
 * when CONFIG_PROVE_RCU is enabled; exported GPL-only for use outside this
 * file.
 */
int lockdep_tasklist_lock_is_held(void)
{
        return lockdep_is_held(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */

int nr_processes(void)
{
        int cpu;
        int total = 0;

        for_each_possible_cpu(cpu)
                total += per_cpu(process_counts, cpu);

        return total;
}

/*
 * Default no-op implementation.  It is declared __weak, so a non-weak
 * definition provided elsewhere replaces it at link time.
 */
void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

static struct kmem_cache *task_struct_cachep;

/*
 * Allocate a task_struct from task_struct_cachep for node @node using
 * GFP_KERNEL.  Returns whatever kmem_cache_alloc_node() returns.
 */
static inline struct task_struct *alloc_task_struct_node(int node)
{
        return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}

/* Return @tsk to task_struct_cachep, the cache it was allocated from. */
static inline void free_task_struct(struct task_struct *tsk)
{
        kmem_cache_free(task_struct_cachep, tsk);
}

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

#  ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

/*
 * Bookkeeping for the deferred release of a vmap'ed stack.
 * thread_stack_delayed_free() overlays this struct on the memory that
 * tsk->stack points to before passing @rcu to call_rcu().
 */
struct vm_stack {
        /* callback head handed to call_rcu() */
        struct rcu_head rcu;
        /* copy of tsk->stack_vm_area, read back in thread_stack_free_rcu() */
        struct vm_struct *stack_vm_area;
};

/*
 * Try to park @vm in one of this CPU's NR_CACHED_STACKS cache slots.
 *
 * Returns true if an empty slot was found and now holds @vm, false if
 * every slot was already occupied.
 */
static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
{
        unsigned int slot;

        for (slot = 0; slot < NR_CACHED_STACKS; slot++) {
                /* A NULL result means the slot was empty and has been claimed. */
                if (this_cpu_cmpxchg(cached_stacks[slot], NULL, vm) == NULL)
                        return true;
        }

        return false;
}

/*
 * RCU callback queued by thread_stack_delayed_free().  The stack is offered
 * to the per-CPU stack cache first and only vfree()d when no cache slot
 * accepts it.
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
        struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);

        if (!try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
                vfree(vm_stack);
}

/*
 * Queue the release of @tsk's stack through call_rcu().
 *
 * The struct vm_stack bookkeeping is written into the stack memory itself
 * (tsk->stack is reused as the struct), and thread_stack_free_rcu() is
 * registered as the callback that performs the actual release.
 */
static void thread_stack_delayed_free(struct task_struct *tsk)
{
        struct vm_stack *vm_stack = tsk->stack;

        /* Must be stored before call_rcu(): the callback reads it back. */
        vm_stack->stack_vm_area = tsk->stack_vm_area;
        call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
}

static int free_vm_stack_cache(unsigned int cpu)
{
        struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
        int i;

        for (i = 0; i < NR_CACHED_STACKS; i++) {
                struct vm_struct *vm_stack = cached_vm_stacks[i];

                if (!vm_stack)
                        continue;

                vfree(vm_stack->addr);
                cached_vm_stacks[i] = NULL;
        }

        return 0;
}

/*
 * memcg-charge every page backing the stack described by @vm.
 *
 * Returns 0 once all THREAD_SIZE / PAGE_SIZE pages have been charged.  If a
 * charge fails, the pages charged so far are uncharged again and the error
 * from memcg_kmem_charge_page() is returned.
 */
static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
        int idx;
        int undo;
        int err;

        BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);

        for (idx = 0; idx < THREAD_SIZE / PAGE_SIZE; idx++) {
                err = memcg_kmem_charge_page(vm->pages[idx], GFP_KERNEL, 0);
                if (!err)
                        continue;

                /* Roll back pages [0, idx), i.e. those charged before the failure. */
                for (undo = 0; undo < idx; undo++)
                        memcg_kmem_uncharge_page(vm->pages[undo], 0);
                return err;
        }

        return 0;
}

/*
 * Give @tsk a vmalloc-backed kernel stack.
 *
 * A stack parked in this CPU's cached_stacks[] is preferred; otherwise a
 * fresh THREAD_SIZE area is obtained from __vmalloc_node_range() for @node.
 * In both cases the pages are charged with memcg_charge_kernel_stack()
 * before the stack is handed to @tsk.
 *
 * On success tsk->stack and tsk->stack_vm_area are set and 0 is returned.
 * On failure the stack (if any) is released with vfree() and -ENOMEM is
 * returned.
 */
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
        struct vm_struct *vm;
        void *stack;
        int i;

        for (i = 0; i < NR_CACHED_STACKS; i++) {
                struct vm_struct *s;

                s = this_cpu_xchg(cached_stacks[i], NULL);

                if (!s)
                        continue;

                /*
                 * Charge first: when charging fails the stack is freed right
                 * away, so unpoisoning and zeroing it beforehand would be
                 * wasted work.
                 */
                if (memcg_charge_kernel_stack(s)) {
                        vfree(s->addr);
                        return -ENOMEM;
                }

                /* Reset stack metadata. */
                kasan_unpoison_range(s->addr, THREAD_SIZE);

                stack = kasan_reset_tag(s->addr);

                /* Clear stale pointers from reused stack. */
                memset(stack, 0, THREAD_SIZE);

                tsk->stack_vm_area = s;
                tsk->stack = stack;
                return 0;
        }

        /*
         * Allocated stacks are cached and later reused by new threads,
         * so memcg accounting is performed manually on assigning/releasing
         * stacks to tasks. Drop __GFP_ACCOUNT.
         */
        stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
                                     VMALLOC_START, VMALLOC_END,
                                     THREADINFO_GFP & ~__GFP_ACCOUNT,
                                     PAGE_KERNEL,
                                     0, node, __builtin_return_address(0));
        if (!stack)
                return -ENOMEM;

        vm = find_vm_area(stack);
        if (memcg_charge_kernel_stack(vm)) {
                vfree(stack);
                return -ENOMEM;
        }
        /*
         * We can't call find_vm_area() in interrupt context, and
         * free_thread_stack() can be called in interrupt context,
         * so cache the vm_struct.
         */
        tsk->stack_vm_area = vm;
        stack = kasan_reset_tag(stack);
        tsk->stack = stack;
        return 0;
}

static void free_thread_stack(struct task_struct *tsk)
{
        if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
                thread_stack_delayed_free(tsk);

        tsk->stack = NULL;
        tsk->stack_vm_area = NULL;
}

#  else /* !CONFIG_VMAP_STACK */

/*
 * RCU callback: @rh sits at the start of the stack memory (see
 * thread_stack_delayed_free()), so its page is the first page of the
 * order-THREAD_SIZE_ORDER allocation to release.
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
        struct page *first = virt_to_page(rh);

        __free_pages(first, THREAD_SIZE_ORDER);
}

/*
 * Queue @tsk's stack for release through call_rcu().  The rcu_head is
 * placed at tsk->stack, i.e. in the stack memory being freed.
 */
static void thread_stack_delayed_free(struct task_struct *tsk)
{
        struct rcu_head *head;

        head = tsk->stack;
        call_rcu(head, thread_stack_free_rcu);
}

/*
 * Give @tsk a stack made of 2^THREAD_SIZE_ORDER pages taken from @node.
 *
 * Returns 0 with tsk->stack set on success, -ENOMEM when the page
 * allocation fails (tsk->stack is left untouched in that case).
 */
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
        struct page *page;

        page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
        if (unlikely(!page))
                return -ENOMEM;

        tsk->stack = kasan_reset_tag(page_address(page));
        return 0;
}

/*
 * Queue @tsk's stack for an RCU-deferred free and drop @tsk's reference to
 * it.  The pointer is cleared only after queueing because
 * thread_stack_delayed_free() reads tsk->stack.
 */
static void free_thread_stack(struct task_struct *tsk)
{
        thread_stack_delayed_free(tsk);
        tsk->stack = NULL;
}

#  endif /* CONFIG_VMAP_STACK */
# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */

/* Slab cache for thread stacks; created by thread_stack_cache_init(). */
static struct kmem_cache *thread_stack_cache;

/*
 * RCU callback: @rh is the start of a stack object (see
 * thread_stack_delayed_free()), which is returned to thread_stack_cache.
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
        kmem_cache_free(thread_stack_cache, rh);
}

/*
 * Queue @tsk's stack for release through call_rcu().  The rcu_head is
 * placed at tsk->stack, i.e. in the stack object being freed.
 */
static void thread_stack_delayed_free(struct task_struct *tsk)
{
        struct rcu_head *head;

        head = tsk->stack;
        call_rcu(head, thread_stack_free_rcu);
}

/*
 * Give @tsk a stack allocated from thread_stack_cache on @node.
 *
 * tsk->stack is always overwritten, with NULL when the allocation fails.
 * Returns 0 on success, -ENOMEM on failure.
 */
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
        unsigned long *stack;

        stack = kasan_reset_tag(kmem_cache_alloc_node(thread_stack_cache,
                                                      THREADINFO_GFP, node));
        tsk->stack = stack;
        if (!stack)
                return -ENOMEM;

        return 0;
}

/*
 * Queue @tsk's stack for an RCU-deferred free and drop @tsk's reference to
 * it.  The pointer is cleared only after queueing because
 * thread_stack_delayed_free() reads tsk->stack.
 */
static void free_thread_stack(struct task_struct *tsk)
{
        thread_stack_delayed_free(tsk);
        tsk->stack = NULL;
}

/*
 * Create the "thread_stack" slab cache: objects are THREAD_SIZE bytes,
 * THREAD_SIZE aligned, and the usercopy window spans the whole object.
 * Failure to create the cache is fatal (BUG).
 */
void thread_stack_cache_init(void)
{
        struct kmem_cache *cache;

        cache = kmem_cache_create_usercopy("thread_stack", THREAD_SIZE,
                                           THREAD_SIZE, 0, 0, THREAD_SIZE,
                                           NULL);
        thread_stack_cache = cache;
        BUG_ON(thread_stack_cache == NULL);
}

# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */

/* SLAB cache for signal_struct structures (tsk->signal); file-local */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand); external linkage */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files); external linkage */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs); external linkage */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures; file-local */
static struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm); file-local */
static struct kmem_cache *mm_cachep;

#ifdef CONFIG_PER_VMA_LOCK

/* SLAB cache for the per-VMA lock object that vma->vm_lock points to */
static struct kmem_cache *vma_lock_cachep;

/*
 * Allocate and initialise the per-VMA lock of @vma.
 *
 * vma->vm_lock is always overwritten, with NULL when the allocation fails.
 * On success the rwsem is initialised and vma->vm_lock_seq is set to -1.
 *
 * Returns true on success, false when the allocation failed.
 */
static bool vma_lock_alloc(struct vm_area_struct *vma)
{
        vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
        if (vma->vm_lock) {
                init_rwsem(&vma->vm_lock->lock);
                vma->vm_lock_seq = -1;
                return true;
        }

        return false;
}

/* Return @vma's lock object (vma->vm_lock) to vma_lock_cachep. */
static inline void vma_lock_free(struct vm_area_struct *vma)
{
        kmem_cache_free(vma_lock_cachep, vma->vm_lock);
}

#else /* CONFIG_PER_VMA_LOCK */

/* Without CONFIG_PER_VMA_LOCK there is no lock object: always succeed. */
static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
/* Without CONFIG_PER_VMA_LOCK there is nothing to free. */
static inline void vma_lock_free(struct vm_area_struct *vma) {}

#endif /* CONFIG_PER_VMA_LOCK */

/*
 * Allocate a VMA from vm_area_cachep, initialise it for @mm with vma_init()
 * and attach its per-VMA lock.
 *
 * Returns the new VMA, or NULL when either allocation fails.
 */
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
        struct vm_area_struct *vma;

        vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (!vma)
                goto out;

        vma_init(vma, mm);
        if (vma_lock_alloc(vma))
                goto out;

        /* Lock allocation failed: give the VMA back. */
        kmem_cache_free(vm_area_cachep, vma);
        vma = NULL;
out:
        return vma;
}

/*
 * Create a copy of @orig.
 *
 * The new VMA starts as a byte copy of @orig and then gets its own per-VMA
 * lock, an empty anon_vma_chain list, fresh NUMA-balancing state and a
 * duplicated anon VMA name.
 *
 * Returns the copy, or NULL when an allocation fails.
 */
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
        struct vm_area_struct *copy;

        copy = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (!copy)
                return NULL;

        ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
        ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
        /*
         * orig->shared.rb may be modified concurrently, but the clone
         * will be reinitialized.
         */
        data_race(memcpy(copy, orig, sizeof(*copy)));
        if (vma_lock_alloc(copy)) {
                INIT_LIST_HEAD(&copy->anon_vma_chain);
                vma_numab_state_init(copy);
                dup_anon_vma_name(orig, copy);
                return copy;
        }

        /* No lock object: give the copy back. */
        kmem_cache_free(vm_area_cachep, copy);
        return NULL;
}

/*
 * Free @vma immediately: release its NUMA-balancing state, anon VMA name and
 * per-VMA lock, then return the VMA itself to vm_area_cachep.
 */
void __vm_area_free(struct vm_area_struct *vma)
{
        vma_numab_state_free(vma);
        free_anon_vma_name(vma);
        vma_lock_free(vma);
        /* Must come last: everything above dereferences @vma. */
        kmem_cache_free(vm_area_cachep, vma);
}

#ifdef CONFIG_PER_VMA_LOCK
/* RCU callback queued by vm_area_free(): @head is the VMA's vm_rcu member. */
static void vm_area_free_rcu_cb(struct rcu_head *head)
{
        struct vm_area_struct *vma;

        vma = container_of(head, struct vm_area_struct, vm_rcu);

        /* The vma should not be locked while being destroyed. */
        VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
        __vm_area_free(vma);
}
#endif

/*
 * Free @vma.  With CONFIG_PER_VMA_LOCK the free is deferred through
 * call_rcu(); otherwise the VMA is freed immediately.
 */
void vm_area_free(struct vm_area_struct *vma)
{
#ifdef CONFIG_PER_VMA_LOCK
        call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
#else
        __vm_area_free(vma);
#endif
}

/*
 * Adjust the NR_KERNEL_STACK_KB statistic for @tsk's stack.
 *
 * @account is a multiplier: callers in this file pass 1 when a stack is
 * assigned and -1 when it is released.  With CONFIG_VMAP_STACK each stack
 * page is accounted separately; otherwise the whole stack is accounted in
 * one call.
 */
static void account_kernel_stack(struct task_struct *tsk, int account)
{
        struct vm_struct *vm;
        int page_nr;

        if (!IS_ENABLED(CONFIG_VMAP_STACK)) {
                /* All stack pages are in the same node. */
                mod_lruvec_kmem_state(task_stack_page(tsk), NR_KERNEL_STACK_KB,
                                      account * (THREAD_SIZE / 1024));
                return;
        }

        vm = task_stack_vm_area(tsk);
        for (page_nr = 0; page_nr < THREAD_SIZE / PAGE_SIZE; page_nr++)
                mod_lruvec_page_state(vm->pages[page_nr], NR_KERNEL_STACK_KB,
                                      account * (PAGE_SIZE / 1024));
}

/*
 * Undo the accounting done for @tsk's stack: subtract it from
 * NR_KERNEL_STACK_KB and, with CONFIG_VMAP_STACK, uncharge every stack page
 * from its memcg.
 */
void exit_task_stack_account(struct task_struct *tsk)
{
        struct vm_struct *vm;
        int page_nr;

        account_kernel_stack(tsk, -1);

        if (!IS_ENABLED(CONFIG_VMAP_STACK))
                return;

        vm = task_stack_vm_area(tsk);
        for (page_nr = 0; page_nr < THREAD_SIZE / PAGE_SIZE; page_nr++)
                memcg_kmem_uncharge_page(vm->pages[page_nr], 0);
}

/*
 * Free @tsk's stack, but only when the task is in state TASK_DEAD.  Any
 * other state triggers a warning and the stack is deliberately kept.
 */
static void release_task_stack(struct task_struct *tsk)
{
        /* Better to leak the stack than to free prematurely */
        if (!WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
                free_thread_stack(tsk);
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
/* Drop one reference on @tsk's stack; the last reference releases it. */
void put_task_stack(struct task_struct *tsk)
{
        if (!refcount_dec_and_test(&tsk->stack_refcount))
                return;

        release_task_stack(tsk);
}
#endif

/*
 * Final teardown of @tsk: release the remaining per-task resources and then
 * the task_struct itself.
 *
 * Without CONFIG_THREAD_INFO_IN_TASK the stack is released here; with it,
 * the stack is expected to be gone already (stack_refcount == 0).
 */
void free_task(struct task_struct *tsk)
{
#ifdef CONFIG_SECCOMP
        /* A filter still attached at this point indicates a bug elsewhere. */
        WARN_ON_ONCE(tsk->seccomp.filter);
#endif
        release_user_cpus_ptr(tsk);
        scs_release(tsk);

#ifndef CONFIG_THREAD_INFO_IN_TASK
        /*
         * The task is finally done with both the stack and thread_info,
         * so free both.
         */
        release_task_stack(tsk);
#else
        /*
         * If the task had a separate stack allocation, it should be gone
         * by now.
         */
        WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        arch_release_task_struct(tsk);
        /* Only kernel threads carry a kthread struct. */
        if (tsk->flags & PF_KTHREAD)
                free_kthread_struct(tsk);
        bpf_task_storage_free(tsk);
        /* Must come last: everything above dereferences @tsk. */
        free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

/*
 * Make @mm reference the same executable file as @oldmm.
 *
 * The file returned by get_mm_exe_file(oldmm), possibly NULL, is stored in
 * mm->exe_file.  A failing deny_write_access() is only reported once via
 * pr_warn_once(); it does not fail the duplication.
 */
static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
{
        struct file *exe = get_mm_exe_file(oldmm);

        RCU_INIT_POINTER(mm->exe_file, exe);
        if (!exe)
                return;

        /*
         * We depend on the oldmm having properly denied write access to the
         * exe_file already.
         */
        if (deny_write_access(exe))
                pr_warn_once("deny_write_access() failed in %s\n", __func__);
}

#ifdef CONFIG_MMU
/*
 * dup_mmap - populate @mm with copies of the VMAs of @oldmm.
 *
 * Holds @oldmm's mmap lock for writing (acquired killably) and @mm's mmap
 * lock (nested) for the duration.  The maple tree is duplicated wholesale
 * with __mt_dup(); each entry is then replaced by a freshly duplicated VMA,
 * or cleared for VM_DONTCOPY mappings.
 *
 * Returns 0 on success or a negative errno: -EINTR when the lock wait is
 * interrupted or a fatal signal is pending, -ENOMEM on allocation or
 * accounting failure, or the error reported by one of the helpers.
 */
static __latent_entropy int dup_mmap(struct mm_struct *mm,
                                        struct mm_struct *oldmm)
{
        struct vm_area_struct *mpnt, *tmp;
        int retval;
        unsigned long charge = 0;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, 0);

        uprobe_start_dup_mmap();
        if (mmap_write_lock_killable(oldmm)) {
                retval = -EINTR;
                goto fail_uprobe_end;
        }
        flush_cache_dup_mm(oldmm);
        uprobe_dup_mmap(oldmm, mm);
        /*
         * Not linked in yet - no deadlock potential:
         */
        mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);

        /* No ordering required: file already has been exposed. */
        dup_mm_exe_file(mm, oldmm);

        /* Start from the parent's totals; VM_DONTCOPY VMAs are subtracted below. */
        mm->total_vm = oldmm->total_vm;
        mm->data_vm = oldmm->data_vm;
        mm->exec_vm = oldmm->exec_vm;
        mm->stack_vm = oldmm->stack_vm;

        retval = ksm_fork(mm, oldmm);
        if (retval)
                goto out;
        khugepaged_fork(mm, oldmm);

        /* Use __mt_dup() to efficiently build an identical maple tree. */
        retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
        if (unlikely(retval))
                goto out;

        mt_clear_in_rcu(vmi.mas.tree);
        /* vmi walks the new tree, whose entries still point at the parent's VMAs. */
        for_each_vma(vmi, mpnt) {
                struct file *file;

                vma_start_write(mpnt);
                if (mpnt->vm_flags & VM_DONTCOPY) {
                        /* Not inherited: remove the range from the new tree. */
                        retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
                                                    mpnt->vm_end, GFP_KERNEL);
                        if (retval)
                                goto loop_out;

                        vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
                        continue;
                }
                charge = 0;
                /*
                 * Don't duplicate many vmas if we've been oom-killed (for
                 * example)
                 */
                if (fatal_signal_pending(current)) {
                        retval = -EINTR;
                        goto loop_out;
                }
                if (mpnt->vm_flags & VM_ACCOUNT) {
                        unsigned long len = vma_pages(mpnt);

                        if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                goto fail_nomem;
                        /* Remembered so fail_nomem can give the pages back. */
                        charge = len;
                }
                tmp = vm_area_dup(mpnt);
                if (!tmp)
                        goto fail_nomem;
                retval = vma_dup_policy(mpnt, tmp);
                if (retval)
                        goto fail_nomem_policy;
                tmp->vm_mm = mm;
                retval = dup_userfaultfd(tmp, &uf);
                if (retval)
                        goto fail_nomem_anon_vma_fork;
                if (tmp->vm_flags & VM_WIPEONFORK) {
                        /*
                         * VM_WIPEONFORK gets a clean slate in the child.
                         * Don't prepare anon_vma until fault since we don't
                         * copy page for current vma.
                         */
                        tmp->anon_vma = NULL;
                } else if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
                vm_flags_clear(tmp, VM_LOCKED_MASK);
                /*
                 * Copy/update hugetlb private vma information.
                 */
                if (is_vm_hugetlb_page(tmp))
                        hugetlb_dup_vma_private(tmp);

                /*
                 * Link the vma into the MT. After using __mt_dup(), memory
                 * allocation is not necessary here, so it cannot fail.
                 */
                vma_iter_bulk_store(&vmi, tmp);

                mm->map_count++;

                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                file = tmp->vm_file;
                if (file) {
                        struct address_space *mapping = file->f_mapping;

                        get_file(file);
                        i_mmap_lock_write(mapping);
                        if (vma_is_shared_maywrite(tmp))
                                mapping_allow_writable(mapping);
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
                        vma_interval_tree_insert_after(tmp, mpnt,
                                        &mapping->i_mmap);
                        flush_dcache_mmap_unlock(mapping);
                        i_mmap_unlock_write(mapping);
                }

                if (!(tmp->vm_flags & VM_WIPEONFORK))
                        retval = copy_page_range(tmp, mpnt);

                if (retval) {
                        /*
                         * tmp is already linked into the tree, so the failure
                         * marker placed at loop_out must start at the next VMA.
                         */
                        mpnt = vma_next(&vmi);
                        goto loop_out;
                }
        }
        /* a new mm has just been created */
        retval = arch_dup_mmap(oldmm, mm);
loop_out:
        vma_iter_free(&vmi);
        if (!retval) {
                mt_set_in_rcu(vmi.mas.tree);
        } else if (mpnt) {
                /*
                 * The entire maple tree has already been duplicated. If the
                 * mmap duplication fails, mark the failure point with
                 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
                 * stop releasing VMAs that have not been duplicated after this
                 * point.
                 */
                mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
                mas_store(&vmi.mas, XA_ZERO_ENTRY);
        }
out:
        mmap_write_unlock(mm);
        flush_tlb_mm(oldmm);
        mmap_write_unlock(oldmm);
        dup_userfaultfd_complete(&uf);
fail_uprobe_end:
        uprobe_end_dup_mmap();
        return retval;

        /* Error unwinding for a partially set up tmp; falls through downwards. */
fail_nomem_anon_vma_fork:
        mpol_put(vma_policy(tmp));
fail_nomem_policy:
        vm_area_free(tmp);
fail_nomem:
        /* Every path through these labels reports -ENOMEM. */
        retval = -ENOMEM;
        vm_unacct_memory(charge);
        goto loop_out;
}

/*
 * Allocate the page global directory of @mm.
 *
 * mm->pgd is always overwritten.  Returns 0 on success, -ENOMEM when
 * pgd_alloc() returns NULL.
 */
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
        mm->pgd = pgd_alloc(mm);
        return likely(mm->pgd) ? 0 : -ENOMEM;
}

/* Release the page global directory of @mm via pgd_free(). */
static inline void mm_free_pgd(struct mm_struct *mm)
{
        pgd_free(mm, mm->pgd);
}
#else
/*
 * !CONFIG_MMU variant: no VMAs are copied; only the exe_file reference is
 * duplicated, under @oldmm's mmap write lock.  Always returns 0.
 */
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
        mmap_write_lock(oldmm);
        dup_mm_exe_file(mm, oldmm);
        mmap_write_unlock(oldmm);
        return 0;
}
/* !CONFIG_MMU: there is no pgd, so allocation trivially succeeds ... */
#define mm_alloc_pgd(mm)        (0)
/* ... and freeing expands to nothing. */
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

/*
 * Sanity-check @mm before it is freed (called from __mmdrop()).
 *
 * Every RSS counter and the page-table byte count are expected to be zero;
 * a non-zero value is reported with pr_alert() but is otherwise not acted
 * upon.
 */
static void check_mm(struct mm_struct *mm)
{
        int type;

        BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
                         "Please make sure 'struct resident_page_types[]' is updated as well");

        for (type = 0; type < NR_MM_COUNTERS; type++) {
                long count = percpu_counter_sum(&mm->rss_stat[type]);

                if (likely(!count))
                        continue;

                pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
                         mm, resident_page_types[type], count);
        }

        if (mm_pgtables_bytes(mm))
                pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
                                mm_pgtables_bytes(mm));

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

/* Allocate an (uninitialised) mm_struct from mm_cachep; may return NULL. */
#define allocate_mm()        (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
/* Return an mm_struct to mm_cachep. */
#define free_mm(mm)        (kmem_cache_free(mm_cachep, (mm)))

/*
 * Per-CPU check run via on_each_cpu() from cleanup_lazy_tlbs(): warn once
 * if the current task still has @arg (a struct mm_struct *) as active_mm.
 */
static void do_check_lazy_tlb(void *arg)
{
        struct mm_struct *mm = arg;

        WARN_ON_ONCE(current->active_mm == mm);
}

/*
 * Per-CPU callback run from cleanup_lazy_tlbs(): if the current task still
 * uses @arg (a struct mm_struct *) as its active_mm, switch it to init_mm.
 * Such a task is expected to have no mm of its own (current->mm == NULL).
 */
static void do_shoot_lazy_tlb(void *arg)
{
        struct mm_struct *mm = arg;

        if (current->active_mm != mm)
                return;

        WARN_ON_ONCE(current->mm);
        current->active_mm = &init_mm;
        switch_mm(mm, &init_mm, current);
}

/*
 * Make sure no CPU still uses @mm as its lazy tlb mm before @mm is freed.
 * Only does work when CONFIG_MMU_LAZY_TLB_SHOOTDOWN is enabled.
 */
static void cleanup_lazy_tlbs(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
                /*
                 * In this case, lazy tlb mms are refcounted and would not reach
                 * __mmdrop until all CPUs have switched away and mmdrop()ed.
                 */
                return;
        }

        /*
         * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
         * requires lazy mm users to switch to another mm when the refcount
         * drops to zero, before the mm is freed. This requires IPIs here to
         * switch kernel threads to init_mm.
         *
         * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
         * switch with the final userspace teardown TLB flush which leaves the
         * mm lazy on this CPU but no others, reducing the need for additional
         * IPIs here. There are cases where a final IPI is still required here,
         * such as the final mmdrop being performed on a different CPU than the
         * one exiting, or kernel threads using the mm when userspace exits.
         *
         * IPI overheads have not been found to be expensive, but they could be
         * reduced in a number of possible ways, for example (roughly
         * increasing order of complexity):
         * - The last lazy reference created by exit_mm() could instead switch
         *   to init_mm, however it's probable this will run on the same CPU
         *   immediately afterwards, so this may not reduce IPIs much.
         * - A batch of mms requiring IPIs could be gathered and freed at once.
         * - CPUs store active_mm where it can be remotely checked without a
         *   lock, to filter out false-positives in the cpumask.
         * - After mm_users or mm_count reaches zero, switching away from the
         *   mm could clear mm_cpumask to reduce some IPIs, perhaps together
         *   with some batching or delaying of the final IPIs.
         * - A delayed freeing and RCU-like quiescing sequence based on mm
         *   switching to avoid IPIs completely.
         */
        on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
        /* Debug aid: verify on every CPU that the shootdown took effect. */
        if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
                on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
}

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 *
 * @mm must not be init_mm (BUG) and is not expected to be the current
 * task's mm or, after the lazy-tlb cleanup, its active_mm (warnings).
 */
void __mmdrop(struct mm_struct *mm)
{
        BUG_ON(mm == &init_mm);
        WARN_ON_ONCE(mm == current->mm);

        /* Ensure no CPUs are using this as their lazy tlb mm */
        cleanup_lazy_tlbs(mm);

        WARN_ON_ONCE(mm == current->active_mm);
        mm_free_pgd(mm);
        destroy_context(mm);
        mmu_notifier_subscriptions_destroy(mm);
        /* Report leftover RSS / page-table accounting before the mm is gone. */
        check_mm(mm);
        put_user_ns(mm->user_ns);
        mm_pasid_drop(mm);
        mm_destroy_cid(mm);
        percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);

        /* Must come last: everything above dereferences @mm. */
        free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);

/* Work handler for mmdrop_async(): @work is the mm's async_put_work member. */
static void mmdrop_async_fn(struct work_struct *work)
{
        __mmdrop(container_of(work, struct mm_struct, async_put_work));
}

/*
 * Drop one mm_count reference on @mm.  When it was the last one, the actual
 * __mmdrop() is handed to a workqueue instead of running in the caller's
 * context.
 */
static void mmdrop_async(struct mm_struct *mm)
{
        if (likely(!atomic_dec_and_test(&mm->mm_count)))
                return;

        INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
        schedule_work(&mm->async_put_work);
}

/*
 * Free @sig: release its taskstats and autogroup state, drop the reference
 * on sig->oom_mm (if set) and return @sig to signal_cachep.
 */
static inline void free_signal_struct(struct signal_struct *sig)
{
        struct mm_struct *oom_mm;

        taskstats_tgid_free(sig);
        sched_autogroup_exit(sig);

        /*
         * __mmdrop is not safe to call from softirq context on x86 due to
         * pgd_dtor so postpone it to the async context
         */
        oom_mm = sig->oom_mm;
        if (oom_mm)
                mmdrop_async(oom_mm);

        kmem_cache_free(signal_cachep, sig);
}

/* Drop one sigcnt reference on @sig; the last reference frees it. */
static inline void put_signal_struct(struct signal_struct *sig)
{
        if (!refcount_dec_and_test(&sig->sigcnt))
                return;

        free_signal_struct(sig);
}

/*
 * Release @tsk and the subsystem state hanging off it.
 *
 * The warnings document the expected state on entry: the task has an
 * exit_state, its usage count is zero, and it is not the current task.
 */
void __put_task_struct(struct task_struct *tsk)
{
        WARN_ON(!tsk->exit_state);
        WARN_ON(refcount_read(&tsk->usage));
        WARN_ON(tsk == current);

        io_uring_free(tsk);
        cgroup_free(tsk);
        task_numa_free(tsk, true);
        security_task_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
        put_signal_struct(tsk->signal);
        sched_core_free(tsk);
        /* Frees the task_struct itself, so it must come last. */
        free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);

/* RCU callback form of __put_task_struct(): @rhp is the task's rcu member. */
void __put_task_struct_rcu_cb(struct rcu_head *rhp)
{
        __put_task_struct(container_of(rhp, struct task_struct, rcu));
}
EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);

/* Weak no-op default; an architecture may override it. Called by fork_init(). */
void __init __weak arch_task_cache_init(void) { }

/*
 * set_max_threads
 */
static void set_max_threads(unsigned int max_threads_suggested)
{
        u64 threads;
        unsigned long nr_pages = totalram_pages();

        /*
         * The number of threads shall be limited such that the thread
         * structures may only consume a small part of the available memory.
         */
        if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
                threads = MAX_THREADS;
        else
                threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
                                    (u64) THREAD_SIZE * 8UL);

        if (threads > max_threads_suggested)
                threads = max_threads_suggested;

        max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
}

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/*
 * Initialized by the architecture; fork_init() passes it as the object size
 * of the "task_struct" slab cache.
 */
int arch_task_struct_size __read_mostly;
#endif

/*
 * Compute the usercopy window of task_struct.
 *
 * @offset: out, window start relative to the start of task_struct
 * @size:   out, window length in bytes
 *
 * The architecture reports the window relative to thread_struct; it is
 * rebased onto task_struct here.  An empty window is reported as offset 0.
 */
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
        /* Fetch thread_struct whitelist for the architecture. */
        arch_thread_struct_whitelist(offset, size);

        /*
         * Adjust offset to position of thread_struct in task_struct, unless
         * the whitelist is zero-sized (or thread_struct is empty).
         */
        if (likely(*size != 0))
                *offset += offsetof(struct task_struct, thread);
        else
                *offset = 0;
}

/*
 * Boot-time setup for fork: create the task_struct slab cache, compute
 * max_threads, derive init_task's default rlimits and the initial user
 * namespace limits from it, and register the stack-cache hotplug callback.
 */
void __init fork_init(void)
{
        int i;
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN        0
#endif
        /* Align to a cache line, or more if the architecture asks for it. */
        int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
        unsigned long useroffset, usersize;

        /* create a slab on which task_structs can be allocated */
        task_struct_whitelist(&useroffset, &usersize);
        task_struct_cachep = kmem_cache_create_usercopy("task_struct",
                        arch_task_struct_size, align,
                        SLAB_PANIC|SLAB_ACCOUNT,
                        useroffset, usersize, NULL);

        /* do the arch specific task caches init */
        arch_task_cache_init();

        set_max_threads(MAX_THREADS);

        /* Default process and pending-signal limits: half of max_threads. */
        init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];

        for (i = 0; i < UCOUNT_COUNTS; i++)
                init_user_ns.ucount_max[i] = max_threads/2;

        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC,      RLIM_INFINITY);
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE,   RLIM_INFINITY);
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK,    RLIM_INFINITY);

#ifdef CONFIG_VMAP_STACK
        /* Only a teardown callback: it frees the CPU's cached stacks. */
        cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
                          NULL, free_vm_stack_cache);
#endif

        scs_init();

        lockdep_init_task(&init_task);
        uprobes_init();
}

/*
 * Weak default: copy @src into @dst by plain structure assignment.
 * An architecture may override it.  Always returns 0.
 */
int __weak arch_dup_task_struct(struct task_struct *dst,
                                               struct task_struct *src)
{
        *dst = *src;
        return 0;
}

/*
 * Write STACK_END_MAGIC into the word returned by end_of_stack(@tsk) so
 * that a stack overflow can be detected later.
 */
void set_task_stack_end_magic(struct task_struct *tsk)
{
        unsigned long *magic_slot = end_of_stack(tsk);

        /* for overflow detection */
        *magic_slot = STACK_END_MAGIC;
}

/*
 * Allocate a new task_struct as a copy of @orig and give it its own stack.
 *
 * @node is the NUMA node to allocate on; NUMA_NO_NODE lets
 * tsk_fork_get_node(@orig) choose.  After the copy, per-task fields that
 * must not be inherited are reset.
 *
 * Returns the new task, or NULL on failure (everything allocated here has
 * been released again in that case).
 */
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
        struct task_struct *tsk;
        int err;

        if (node == NUMA_NO_NODE)
                node = tsk_fork_get_node(orig);
        tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;

        err = arch_dup_task_struct(tsk, orig);
        if (err)
                goto free_tsk;

        err = alloc_thread_stack_node(tsk, node);
        if (err)
                goto free_tsk;

#ifdef CONFIG_THREAD_INFO_IN_TASK
        refcount_set(&tsk->stack_refcount, 1);
#endif
        account_kernel_stack(tsk, 1);

        err = scs_prepare(tsk, node);
        if (err)
                goto free_stack;

#ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
         * the sighand lock in case orig has changed between now and
         * then. Until then, filter must be NULL to avoid messing up
         * the usage counts on the error path calling free_task.
         */
        tsk->seccomp.filter = NULL;
#endif

        setup_thread_stack(tsk, orig);
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
        set_task_stack_end_magic(tsk);
        clear_syscall_work_syscall_user_dispatch(tsk);

#ifdef CONFIG_STACKPROTECTOR
        tsk->stack_canary = get_random_canary();
#endif
        /*
         * If orig's cpus_ptr points at its own mask, point the copy at its
         * own mask rather than leaving it aimed at orig's.
         */
        if (orig->cpus_ptr == &orig->cpus_mask)
                tsk->cpus_ptr = &tsk->cpus_mask;
        dup_user_cpus_ptr(tsk, orig, node);

        /*
         * One for the user space visible state that goes away when reaped.
         * One for the scheduler.
         */
        refcount_set(&tsk->rcu_users, 2);
        /* One for the rcu users */
        refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
        tsk->btrace_seq = 0;
#endif
        /* The fields below were copied from orig and are reset for the new task. */
        tsk->splice_pipe = NULL;
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
        tsk->worker_private = NULL;

        kcov_task_init(tsk);
        kmsan_task_create(tsk);
        kmap_local_fork(tsk);

#ifdef CONFIG_FAULT_INJECTION
        tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
        tsk->throttle_disk = NULL;
        tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_ARCH_HAS_CPU_PASID
        tsk->pasid_activated = 0;
#endif

#ifdef CONFIG_MEMCG
        tsk->active_memcg = NULL;
#endif

#ifdef CONFIG_CPU_SUP_INTEL
        tsk->reported_split_lock = 0;
#endif

#ifdef CONFIG_SCHED_MM_CID
        tsk->mm_cid = -1;
        tsk->last_mm_cid = -1;
        tsk->mm_cid_active = 0;
        tsk->migrate_from_cpu = -1;
#endif
        return tsk;

        /* Error unwinding, in reverse order of setup. */
free_stack:
        exit_task_stack_account(tsk);
        free_thread_stack(tsk);
free_tsk:
        free_task_struct(tsk);
        return NULL;
}

/* Taken by __mmput() around removing an mm from its mmlist. */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

/*
 * mm->flags value used by mm_init() when the current task has no mm;
 * can be overridden with the "coredump_filter=" boot parameter.
 */
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

/*
 * Handler for the "coredump_filter=" boot parameter.
 *
 * @s is parsed as an unsigned number (base auto-detected), shifted into the
 * dump-filter bit position and masked before being stored in
 * default_dump_filter.  Always returns 1.
 */
static int __init coredump_filter_setup(char *s)
{
        unsigned long filter = simple_strtoul(s, NULL, 0);

        filter <<= MMF_DUMP_FILTER_SHIFT;
        filter &= MMF_DUMP_FILTER_MASK;
        default_dump_filter = filter;

        return 1;
}

__setup("coredump_filter=", coredump_filter_setup);

#include <linux/init_task.h>

/* Reset @mm's AIO state (lock and context table); no-op without CONFIG_AIO. */
static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
        spin_lock_init(&mm->ioctx_lock);
        mm->ioctx_table = NULL;
#endif
}

/*
 * Clear @mm's owner if it is @p; any other owner is left alone.
 * No-op without CONFIG_MEMCG.
 */
static __always_inline void mm_clear_owner(struct mm_struct *mm,
                                           struct task_struct *p)
{
#ifdef CONFIG_MEMCG
        if (mm->owner == p)
                WRITE_ONCE(mm->owner, NULL);
#endif
}

/* Record @p as the owner of @mm; no-op without CONFIG_MEMCG. */
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
        mm->owner = p;
#endif
}

/* Start @mm without an uprobes XOL area; no-op without CONFIG_UPROBES. */
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
        mm->uprobes_state.xol_area = NULL;
#endif
}

/*
 * Initialise the already allocated @mm.
 *
 * @p:       task recorded as the mm's owner and passed to init_new_context()
 * @user_ns: user namespace the mm takes a reference on
 *
 * Starts with mm_users == 1 and mm_count == 1.  Flags are inherited from
 * current->mm when there is one, otherwise default_dump_filter is used.
 *
 * Returns @mm on success.  On failure everything set up so far is undone,
 * @mm itself is freed with free_mm(), and NULL is returned.
 */
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        struct user_namespace *user_ns)
{
        mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
        mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
        seqcount_init(&mm->write_protect_seq);
        mmap_init_lock(mm);
        INIT_LIST_HEAD(&mm->mmlist);
#ifdef CONFIG_PER_VMA_LOCK
        mm->mm_lock_seq = 0;
#endif
        mm_pgtables_bytes_init(mm);
        mm->map_count = 0;
        mm->locked_vm = 0;
        atomic64_set(&mm->pinned_vm, 0);
        memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
        spin_lock_init(&mm->page_table_lock);
        spin_lock_init(&mm->arg_lock);
        mm_init_cpumask(mm);
        mm_init_aio(mm);
        mm_init_owner(mm, p);
        mm_pasid_init(mm);
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_subscriptions_init(mm);
        init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        mm->pmd_huge_pte = NULL;
#endif
        mm_init_uprobes_state(mm);
        hugetlb_count_init(mm);

        /* Inherit (filtered) flags from the creating task's mm, if it has one. */
        if (current->mm) {
                mm->flags = mmf_init_flags(current->mm->flags);
                mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
        } else {
                mm->flags = default_dump_filter;
                mm->def_flags = 0;
        }

        /* From here on each step can fail; the labels below unwind in reverse. */
        if (mm_alloc_pgd(mm))
                goto fail_nopgd;

        if (init_new_context(p, mm))
                goto fail_nocontext;

        if (mm_alloc_cid(mm))
                goto fail_cid;

        if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
                                     NR_MM_COUNTERS))
                goto fail_pcpu;

        mm->user_ns = get_user_ns(user_ns);
        lru_gen_init_mm(mm);
        return mm;

fail_pcpu:
        mm_destroy_cid(mm);
fail_cid:
        destroy_context(mm);
fail_nocontext:
        mm_free_pgd(mm);
fail_nopgd:
        free_mm(mm);
        return NULL;
}

/*
 * Allocate and initialize an mm_struct.
 *
 * The new mm is zeroed and then set up by mm_init() with the current task
 * as owner and the current user namespace.  Returns NULL when either the
 * allocation or the initialisation fails.
 */
struct mm_struct *mm_alloc(void)
{
        struct mm_struct *mm = allocate_mm();

        if (mm) {
                memset(mm, 0, sizeof(*mm));
                mm = mm_init(mm, current, current_user_ns());
        }

        return mm;
}

/*
 * Tear down an mm whose mm_users count has reached zero: run the exit
 * hooks below in order, drop the exe_file reference, unlink the mm from
 * mmlist if it is on it, release the binfmt module reference and finally
 * call mmdrop().
 */
static inline void __mmput(struct mm_struct *mm)
{
        /* Reaching this point with a non-zero mm_users is a bug. */
        VM_BUG_ON(atomic_read(&mm->mm_users));

        uprobe_clear_state(mm);
        exit_aio(mm);
        ksm_exit(mm);
        khugepaged_exit(mm); /* must run before exit_mmap */
        exit_mmap(mm);
        mm_put_huge_zero_folio(mm);
        /* A NULL replacement releases the currently installed exe_file. */
        set_mm_exe_file(mm, NULL);
        /* Removal from mmlist is done under mmlist_lock. */
        if (!list_empty(&mm->mmlist)) {
                spin_lock(&mmlist_lock);
                list_del(&mm->mmlist);
                spin_unlock(&mmlist_lock);
        }
        /* Drop the binfmt module reference (dup_mm() takes one on success). */
        if (mm->binfmt)
                module_put(mm->binfmt->module);
        lru_gen_del_mm(mm);
        mmdrop(mm);
}

/*
 * Drop one mm_users reference. The caller that drops the last one also
 * runs the full teardown in __mmput(), so this may sleep.
 */
void mmput(struct mm_struct *mm)
{
        might_sleep();

        /* Not the last user: nothing more to do. */
        if (!atomic_dec_and_test(&mm->mm_users))
                return;

        __mmput(mm);
}
EXPORT_SYMBOL_GPL(mmput);

#ifdef CONFIG_MMU
/*
 * Work item callback for mmput_async(): recover the mm that embeds @work
 * as its async_put_work member and tear it down.
 */
static void mmput_async_fn(struct work_struct *work)
{
        __mmput(container_of(work, struct mm_struct, async_put_work));
}

/*
 * Like mmput(), but when the last mm_users reference goes away the
 * teardown is queued as a work item (mmput_async_fn) instead of being
 * run in the caller's context.
 */
void mmput_async(struct mm_struct *mm)
{
        if (!atomic_dec_and_test(&mm->mm_users))
                return;

        INIT_WORK(&mm->async_put_work, mmput_async_fn);
        schedule_work(&mm->async_put_work);
}
EXPORT_SYMBOL_GPL(mmput_async);
#endif

/**
 * set_mm_exe_file - change a reference to the mm's executable file
 * @mm: The mm to change.
 * @new_exe_file: The new file to use, or %NULL to only drop the old one.
 *
 * Replaces the file that backs the /proc/[pid]/exe symlink of @mm.
 *
 * Main users are mmput() and sys_execve(). Callers must prevent
 * concurrent invocations: in mmput() nobody is left alive, and in execve
 * this runs before the new mm is made visible to anyone.
 *
 * Return: 0 on success, -EACCES if write access to @new_exe_file could
 * not be denied. Cannot fail when @new_exe_file is %NULL.
 */
int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
        /*
         * A raw dereference without an RCU read-side section is fine
         * here: per the contract above, nobody else can access this mm
         * while the function runs.
         */
        struct file *prev = rcu_dereference_raw(mm->exe_file);

        if (new_exe_file) {
                /*
                 * The caller (i.e., sys_execve) is expected to have
                 * already denied write access, so failure is unlikely.
                 */
                if (unlikely(deny_write_access(new_exe_file)))
                        return -EACCES;
                get_file(new_exe_file);
        }
        rcu_assign_pointer(mm->exe_file, new_exe_file);

        if (!prev)
                return 0;

        /* Undo the write denial and the reference held for the old file. */
        allow_write_access(prev);
        fput(prev);
        return 0;
}

/**
 * replace_mm_exe_file - replace a reference to the mm's executable file
 * @mm: The mm to change.
 * @new_exe_file: The new file to use.
 *
 * Replaces the file that backs the /proc/[pid]/exe symlink of @mm.
 *
 * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
 *
 * Return: 0 on success, -EBUSY if the current executable file is still
 * mapped by some VMA of @mm, -EACCES if write access to @new_exe_file
 * could not be denied.
 */
int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
        struct file *prev_exe;
        struct vm_area_struct *vma;
        int err = 0;

        /* Refuse the change while the old file is still mapped. */
        prev_exe = get_mm_exe_file(mm);
        if (prev_exe) {
                VMA_ITERATOR(vmi, mm, 0);

                mmap_read_lock(mm);
                for_each_vma(vmi, vma) {
                        if (vma->vm_file &&
                            path_equal(&vma->vm_file->f_path,
                                       &prev_exe->f_path)) {
                                err = -EBUSY;
                                break;
                        }
                }
                mmap_read_unlock(mm);
                /* Release the reference taken by get_mm_exe_file(). */
                fput(prev_exe);
                if (err)
                        return err;
        }

        if (deny_write_access(new_exe_file))
                return -EACCES;
        get_file(new_exe_file);

        /* Swap the pointer with the mmap lock held for writing. */
        mmap_write_lock(mm);
        prev_exe = rcu_dereference_raw(mm->exe_file);
        rcu_assign_pointer(mm->exe_file, new_exe_file);
        mmap_write_unlock(mm);

        if (!prev_exe)
                return 0;

        allow_write_access(prev_exe);
        fput(prev_exe);
        return 0;
}

/**
 * get_mm_exe_file - acquire a reference to the mm's executable file
 * @mm: The mm of interest.
 *
 * The lookup is done inside an RCU read-side critical section.
 *
 * Return: the referenced file, or %NULL if @mm has no associated
 * executable file. The caller must release the file with fput().
 */
struct file *get_mm_exe_file(struct mm_struct *mm)
{
        struct file *file;

        rcu_read_lock();
        file = get_file_rcu(&mm->exe_file);
        rcu_read_unlock();

        return file;
}

/**
 * get_task_exe_file - acquire a reference to the task's executable file
 * @task: The task.
 *
 * The task's mm is sampled under task_lock().
 *
 * Return: the referenced file, or %NULL if @task has no mm, if its mm
 * has no associated executable file, or if @task is a kernel thread
 * (PF_KTHREAD) running on a borrowed mm. The caller must release the
 * file with fput().
 */
struct file *get_task_exe_file(struct task_struct *task)
{
        struct file *file = NULL;
        struct mm_struct *mm;

        task_lock(task);
        mm = task->mm;
        if (mm && !(task->flags & PF_KTHREAD))
                file = get_mm_exe_file(mm);
        task_unlock(task);

        return file;
}

/**
 * get_task_mm - acquire a reference to the task's mm
 * @task: The task.
 *
 * Samples @task->mm under task_lock(). A task flagged PF_KTHREAD (a
 * kernel thread that has transiently adopted a user mm) is treated as
 * having no mm. Otherwise the mm's use count is bumped with mmget().
 * Typically used by /proc and ptrace.
 *
 * Return: the mm with an extra user reference, or %NULL. The caller must
 * release a non-NULL mm with mmput() after use.
 */
struct mm_struct *get_task_mm(struct task_struct *task)
{
        struct mm_struct *mm;

        task_lock(task);
        mm = task->mm;
        if (mm && (task->flags & PF_KTHREAD))
                mm = NULL;
        if (mm)
                mmget(mm);
        task_unlock(task);

        return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);

/*
 * Get a reference to @task's mm after a ptrace-style permission check.
 *
 * The lookup and the check run with @task's exec_update_lock held for
 * reading. The check is skipped when the mm is the caller's own.
 *
 * Returns the mm (to be released with mmput()), NULL if the task has no
 * usable mm, ERR_PTR(-EACCES) if access is denied, or the ERR_PTR-encoded
 * error from down_read_killable() if taking the lock was interrupted.
 */
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
        struct mm_struct *mm;
        int ret = down_read_killable(&task->signal->exec_update_lock);

        if (ret)
                return ERR_PTR(ret);

        mm = get_task_mm(task);
        if (mm && mm != current->mm) {
                if (!ptrace_may_access(task, mode)) {
                        mmput(mm);
                        mm = ERR_PTR(-EACCES);
                }
        }
        up_read(&task->signal->exec_update_lock);

        return mm;
}

/*
 * Under task_lock(), detach @tsk's vfork_done completion (if any) and
 * signal it.
 */
static void complete_vfork_done(struct task_struct *tsk)
{
        struct completion *done;

        task_lock(tsk);
        done = tsk->vfork_done;
        if (likely(done)) {
                /* Clear the pointer before completing it. */
                tsk->vfork_done = NULL;
                complete(done);
        }
        task_unlock(tsk);
}

/*
 * Wait (killable, freezable) for @vfork to be completed on behalf of
 * @child. If the wait returns non-zero, @child's vfork_done pointer is
 * cleared under task_lock(). In every case the reference on @child is
 * dropped.
 *
 * Returns the result of wait_for_completion_state().
 */
static int wait_for_vfork_done(struct task_struct *child,
                                struct completion *vfork)
{
        int killed;

        cgroup_enter_frozen();
        killed = wait_for_completion_state(vfork,
                                           TASK_KILLABLE | TASK_FREEZABLE);
        cgroup_leave_frozen(false);

        if (killed) {
                task_lock(child);
                child->vfork_done = NULL;
                task_unlock(child);
        }

        put_task_struct(child);

        return killed;
}

/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * on error or on success alike.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 *
 * In order, this function: frees the task's uprobe state, deactivates
 * the mm for the task, handles the clear_child_tid notification and
 * completes a pending vfork_done.
 */
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        uprobe_free_utask(tsk);

        /* Get rid of any cached register state */
        deactivate_mm(tsk, mm);

        /*
         * Signal userspace if we're not exiting with a core dump
         * because we want to leave the value intact for debugging
         * purposes.
         *
         * NOTE(review): the condition actually tested below is that the
         * mm still has more than one user; presumably that is how the
         * core-dump case is excluded -- confirm.
         */
        if (tsk->clear_child_tid) {
                if (atomic_read(&mm->mm_users) > 1) {
                        /*
                         * We don't check the error code - if userspace has
                         * not set up a proper pointer then tough luck.
                         */
                        put_user(0, tsk->clear_child_tid);
                        /* Wake one waiter on the cleared TID word. */
                        do_futex(tsk->clear_child_tid, FUTEX_WAKE,
                                        1, NULL, NULL, 0, 0);
                }
                /* The pointer is cleared whether or not it was written. */
                tsk->clear_child_tid = NULL;
        }

        /*
         * All done, finally we can wake up parent and return this mm to him.
         * Also kthread_stop() uses this completion for synchronization.
         */
        if (tsk->vfork_done)
                complete_vfork_done(tsk);
}

/*
 * Exit-path variant of mm_release(): run the futex exit hook for @tsk
 * first, then the common mm_release() work.
 */
void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        futex_exit_release(tsk);
        mm_release(tsk, mm);
}

/*
 * Exec-path variant of mm_release(): run the futex exec hook for @tsk
 * first, then the common mm_release() work.
 */
void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        futex_exec_release(tsk);
        mm_release(tsk, mm);
}

/**
 * dup_mm() - duplicates an existing mm structure
 * @tsk: the task_struct with which the new mm will be associated.
 * @oldmm: the mm to duplicate.
 *
 * Allocates a new mm structure, copies @oldmm into it byte for byte,
 * re-initializes it with mm_init() and then duplicates the mappings
 * with dup_mmap(). On success a reference to the binfmt module (if any)
 * is held for the new mm.
 *
 * Return: the duplicated mm or NULL on failure.
 */
static struct mm_struct *dup_mm(struct task_struct *tsk,
                                struct mm_struct *oldmm)
{
        struct mm_struct *mm = allocate_mm();

        if (!mm)
                return NULL;

        memcpy(mm, oldmm, sizeof(*mm));

        /* mm_init() frees the mm itself when it fails. */
        if (!mm_init(mm, tsk, mm->user_ns))
                return NULL;

        if (dup_mmap(mm, oldmm))
                goto out_put_mm;

        mm->hiwater_rss = get_mm_rss(mm);
        mm->hiwater_vm = mm->total_vm;

        if (mm->binfmt && !try_module_get(mm->binfmt->module))
                goto out_put_mm;

        return mm;

out_put_mm:
        /* No module reference was taken, so mmput() must not drop one. */
        mm->binfmt = NULL;
        mm_init_owner(mm, NULL);
        mmput(mm);
        return NULL;
}

/*
 * Give the new task @tsk its address space.
 *
 * Fault and context-switch counters are reset first. If the current task
 * has no mm, @tsk is left without one as well. With CLONE_VM the
 * parent's mm is shared (one more user reference); otherwise it is
 * duplicated with dup_mm().
 *
 * Returns 0 on success or -ENOMEM if the duplication fails.
 */
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
        struct mm_struct *parent_mm, *child_mm;

        tsk->maj_flt = 0;
        tsk->min_flt = 0;
        tsk->nivcsw = 0;
        tsk->nvcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
        tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
        tsk->last_switch_time = 0;
#endif

        tsk->mm = NULL;
        tsk->active_mm = NULL;

        /*
         * Are we cloning a kernel thread?
         *
         * If so there is no mm to copy or share.
         */
        parent_mm = current->mm;
        if (!parent_mm)
                return 0;

        if (!(clone_flags & CLONE_VM)) {
                child_mm = dup_mm(tsk, current->mm);
                if (!child_mm)
                        return -ENOMEM;
        } else {
                mmget(parent_mm);
                child_mm = parent_mm;
        }

        tsk->mm = child_mm;
        tsk->active_mm = child_mm;
        sched_mm_cid_fork(tsk);

        return 0;
}

/*
 * Set up @tsk's fs_struct.
 *
 * Without CLONE_FS the current task's fs_struct is copied. With CLONE_FS
 * it is shared: the user count is bumped under fs->lock, unless an exec
 * is in progress on it.
 *
 * Returns 0 on success, -ENOMEM if the copy fails, or -EAGAIN if the
 * shared fs_struct is marked in_exec.
 */
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
        struct fs_struct *fs = current->fs;
        int ret = 0;

        if (!(clone_flags & CLONE_FS)) {
                tsk->fs = copy_fs_struct(fs);
                return tsk->fs ? 0 : -ENOMEM;
        }

        /* tsk->fs is already what we want */
        spin_lock(&fs->lock);
        /* "users" and "in_exec" locked for check_unsafe_exec() */
        if (fs->in_exec)
                ret = -EAGAIN;
        else
                fs->users++;
        spin_unlock(&fs->lock);

        return ret;
}

static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
                      int no_files)
{
        struct files_struct *oldf, *newf;
        int error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (no_files) {
                tsk->files = NULL;
                goto out;
        }

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }

        newf = dup_fd(oldf, NR_OPEN_MAX, &error);
        if (!newf)
                goto out;

        tsk->files = newf;
        error = 0;
out:
        return error;
}

/*
 * Set up @tsk's signal handler table.
 *
 * With CLONE_SIGHAND the current task's table is shared (refcount
 * bumped). Otherwise a new table is allocated and the current actions
 * are copied into it under the parent's siglock. With
 * CLONE_CLEAR_SIGHAND the handlers of the new task are then flushed.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
        struct sighand_struct *new_sh;

        if (clone_flags & CLONE_SIGHAND) {
                refcount_inc(&current->sighand->count);
                return 0;
        }

        new_sh = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
        /* Stored before the check, so tsk->sighand is NULL on failure. */
        RCU_INIT_POINTER(tsk->sighand, new_sh);
        if (!new_sh)
                return -ENOMEM;

        refcount_set(&new_sh->count, 1);

        spin_lock_irq(&current->sighand->siglock);
        memcpy(new_sh->action, current->sighand->action,
               sizeof(new_sh->action));
        spin_unlock_irq(&current->sighand->siglock);

        /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
        if (clone_flags & CLONE_CLEAR_SIGHAND)
                flush_signal_handlers(tsk, 0);

        return 0;
}

/*
 * Drop one reference on @sighand; the last reference cleans up signalfd
 * state and returns the structure to its slab cache.
 */
void __cleanup_sighand(struct sighand_struct *sighand)
{
        if (!refcount_dec_and_test(&sighand->count))
                return;

        signalfd_cleanup(sighand);
        /*
         * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
         * without an RCU grace period, see __lock_task_sighand().
         */
        kmem_cache_free(sighand_cachep, sighand);
}

/*
 * Initialize POSIX CPU timer handling for a thread group, seeding it
 * with the group's current RLIMIT_CPU soft limit.
 */
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
        unsigned long soft_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);

        posix_cputimers_group_init(&sig->posix_cputimers, soft_limit);
}

/*
 * Allocate and initialize the signal_struct for a new thread group.
 *
 * With CLONE_THREAD nothing is done here and 0 is returned. Otherwise a
 * zeroed signal_struct is allocated, stored in @tsk->signal and
 * initialized, inheriting resource limits and OOM score adjustments from
 * the current task.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
        struct signal_struct *sig;

        if (clone_flags & CLONE_THREAD)
                return 0;

        sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
        /* Stored before the check, so tsk->signal is NULL on failure. */
        tsk->signal = sig;
        if (!sig)
                return -ENOMEM;

        /* The new group starts out with exactly one thread and one ref. */
        sig->nr_threads = 1;
        sig->quick_threads = 1;
        atomic_set(&sig->live, 1);
        refcount_set(&sig->sigcnt, 1);

        /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
        sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
        tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);

        init_waitqueue_head(&sig->wait_chldexit);
        sig->curr_target = tsk;
        init_sigpending(&sig->shared_pending);
        INIT_HLIST_HEAD(&sig->multiprocess);
        seqlock_init(&sig->stats_lock);
        prev_cputime_init(&sig->prev_cputime);

#ifdef CONFIG_POSIX_TIMERS
        INIT_LIST_HEAD(&sig->posix_timers);
        hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        sig->real_timer.function = it_real_fn;
#endif

        /* The parent's rlimits are copied under the group leader's task lock. */
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);

        /* Reads sig->rlim[RLIMIT_CPU], so it has to follow the copy above. */
        posix_cpu_timers_init_group(sig);

        tty_audit_fork(sig);
        sched_autogroup_fork(sig);

        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;

        mutex_init(&sig->cred_guard_mutex);
        init_rwsem(&sig->exec_update_lock);

        return 0;
}

/*
 * Propagate the current task's seccomp state to the new task @p.
 * Compiles to an empty function without CONFIG_SECCOMP.
 */
static void copy_seccomp(struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
        /*
         * Must be called with sighand->siglock held, which is common to
         * all threads in the group. Holding cred_guard_mutex is not
         * needed because this new task is not yet running and cannot
         * be racing exec.
         */
        assert_spin_locked(&current->sighand->siglock);

        /* Ref-count the new filter user, and assign it. */
        get_seccomp_filter(current);
        p->seccomp = current->seccomp;

        /*
         * Explicitly enable no_new_privs here in case it got set
         * between the task_struct being duplicated and holding the
         * sighand lock. The seccomp state and nnp must be in sync.
         */
        if (task_no_new_privs(current))
                task_set_no_new_privs(p);

        /*
         * If the parent gained a seccomp mode after the thread flags
         * were copied but before we took the sighand lock, we have
         * to manually enable the seccomp thread flag here.
         */
        if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
                set_task_syscall_work(p, SECCOMP);
#endif
}

/*
 * set_tid_address() system call: record @tidptr as the calling task's
 * clear_child_tid pointer and return the value of task_pid_vnr() for
 * the caller.
 */
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
        current->clear_child_tid = tidptr;

        return task_pid_vnr(current);
}

/*
 * Initialize @p's pi_lock and, with CONFIG_RT_MUTEXES, reset its
 * priority-inheritance bookkeeping to the empty state.
 */
static void rt_mutex_init_task(struct task_struct *p)
{
        raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
        p->pi_waiters = RB_ROOT_CACHED;
        p->pi_top_task = NULL;
        p->pi_blocked_on = NULL;
#endif
}

static inline void init_task_pid_links(struct task_struct *task)
{
        enum pid_type type;

        for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_NODE(&task->pid_links[type]);
}

/*
 * Record @pid as @task's pid of the given @type. PIDTYPE_PID lives in
 * the task itself (thread_pid); every other type is stored in the
 * task's signal_struct.
 */
static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
        if (type != PIDTYPE_PID) {
                task->signal->pids[type] = pid;
                return;
        }

        task->thread_pid = pid;
}

/*
 * Reset the per-task RCU bookkeeping of the new task @p. Each group of
 * fields is only touched when the corresponding RCU flavor is
 * configured.
 */
static inline void rcu_copy_process(struct task_struct *p)
{
#ifdef CONFIG_PREEMPT_RCU
        /* Start outside of any RCU read-side critical section. */
        p->rcu_read_lock_nesting = 0;
        p->rcu_read_unlock_special.s = 0;
        p->rcu_blocked_node = NULL;
        INIT_LIST_HEAD(&p->rcu_node_entry);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
        p->rcu_tasks_holdout = false;
        INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
        p->rcu_tasks_idle_cpu = -1;
        INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
        p->trc_reader_nesting = 0;
        p->trc_reader_special.s = 0;
        INIT_LIST_HEAD(&p->trc_holdout_list);
        INIT_LIST_HEAD(&p->trc_blkd_node);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/**
 * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
 * @pid:   the struct pid for which to create a pidfd
 * @flags: flags of the new @pidfd
 * @ret: Where to return the file for the pidfd.
 *
 * Reserves a new file descriptor number (O_CLOEXEC) in the caller's file
 * descriptor table and allocates a file that stashes @pid. The pidfd is
 * reserved but not installed yet.
 *
 * No checks are performed on @pid, which makes this helper usable for
 * pidfds created via CLONE_PIDFD where @pid has no task attached at the
 * time the pidfd and its file are prepared.
 *
 * On success the caller must either install the descriptor with
 * fd_install(), passing the returned pidfd and pidfd file, or undo the
 * reservation with put_unused_fd() and fput() respectively. This lets a
 * caller reserve a pidfd early, while later steps can still fail,
 * without leaking it into its file descriptor table.
 *
 * Return: On success, the reserved pidfd; the new pidfd file is stored
 *         in @ret. On error, a negative error code; @ret is left
 *         unchanged.
 */
static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
        struct file *file;
        int fd = get_unused_fd_flags(O_CLOEXEC);

        if (fd < 0)
                return fd;

        file = pidfs_alloc_file(pid, flags | O_RDWR);
        if (IS_ERR(file)) {
                put_unused_fd(fd);
                return PTR_ERR(file);
        }

        /*
         * PIDFD_THREAD is set on the file by hand.
         * NOTE(review): the previous comment attributed this to
         * anon_inode_getfile() ignoring everything outside of the
         * O_ACCMODE | O_NONBLOCK mask; confirm that the same holds for
         * pidfs_alloc_file().
         */
        file->f_flags |= (flags & PIDFD_THREAD);
        *ret = file;

        return fd;
}

/**
 * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
 * @pid:   the struct pid for which to create a pidfd
 * @flags: flags of the new @pidfd
 * @ret: Where to return the pidfd.
 *
 * Allocate a new file that stashes @pid and reserve a new pidfd number in the
 * caller's file descriptor table. The pidfd is reserved but not installed yet.
 *
 * The helper verifies that @pid is still in use, without PIDFD_THREAD the
 * task identified by @pid must be a thread-group leader.
 *
 * If this function returns successfully the caller is responsible to either
 * call fd_install() passing the returned pidfd and pidfd file as arguments in
 * order to install the pidfd into its file descriptor table or they must use
 * put_unused_fd() and fput() on the returned pidfd and pidfd file
 * respectively.
 *
 * This function is useful when a pidfd must already be reserved but there
 * might still be points of failure afterwards and the caller wants to ensure
 * that no pidfd is leaked into its file descriptor table.
 *
 * Return: On success, a reserved pidfd is returned from the function and a new
 *         pidfd file is returned in the last argument to the function. On
 *         error, a negative error code is returned from the function and the
 *         last argument remains unchanged.
 */
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
        bool thread = flags & PIDFD_THREAD;

        if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
                return -EINVAL;

        return __pidfd_prepare(pid, flags, ret);
}

/*
 * RCU callback for delayed_free_task(): free the task_struct that embeds
 * @rhp as its rcu member.
 */
static void __delayed_free_task(struct rcu_head *rhp)
{
        free_task(container_of(rhp, struct task_struct, rcu));
}

/*
 * Free @tsk. With CONFIG_MEMCG the free is deferred through call_rcu();
 * without it the task is freed immediately.
 */
static __always_inline void delayed_free_task(struct task_struct *tsk)
{
        if (!IS_ENABLED(CONFIG_MEMCG)) {
                free_task(tsk);
                return;
        }

        call_rcu(&tsk->rcu, __delayed_free_task);
}

/*
 * For a new task that shares the parent's mm without being a thread of
 * it or a vfork child (CLONE_VM set, CLONE_THREAD and CLONE_VFORK
 * clear), mark the mm MMF_MULTIPROCESS and re-read the parent's OOM
 * score adjustments under oom_adj_mutex.
 *
 * Tasks without an mm (kernel threads) are skipped.
 */
static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
{
        const u64 sharing = clone_flags &
                            (CLONE_VM | CLONE_THREAD | CLONE_VFORK);

        if (!tsk->mm || sharing != CLONE_VM)
                return;

        /* We need to synchronize with __set_oom_adj */
        mutex_lock(&oom_adj_mutex);
        set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
        /* Update the values in case they were changed after copy_signal */
        tsk->signal->oom_score_adj = current->signal->oom_score_adj;
        tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
        mutex_unlock(&oom_adj_mutex);
}

#ifdef CONFIG_RV
/* Mark every per-task runtime-verification monitor slot of @p as not monitoring. */
static void rv_task_fork(struct task_struct *p)
{
        int idx = 0;

        while (idx < RV_PER_TASK_MONITORS) {
                p->rv[idx].da_mon.monitoring = false;
                idx++;
        }
}
#else
#define rv_task_fork(p) do {} while (0)
#endif

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
__latent_entropy struct task_struct *copy_process(
                                        struct pid *pid,
                                        int trace,
                                        int node,
                                        struct kernel_clone_args *args)
{
        int pidfd = -1, retval;
        struct task_struct *p;
        struct multiprocess_signals delayed;
        struct file *pidfile = NULL;
        const u64 clone_flags = args->flags;
        struct nsproxy *nsp = current->nsproxy;

        /*
         * Don't allow sharing the root directory with processes in a different
         * namespace
         */
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);

        if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
                return ERR_PTR(-EINVAL);

        /*
         * Thread groups must share signals as well, and detached threads
         * can only be started up within the thread group.
         */
        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
                return ERR_PTR(-EINVAL);

        /*
         * Shared signal handlers imply shared VM. By way of the above,
         * thread groups also imply shared VM. Blocking this case allows
         * for various simplifications in other code.
         */
        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
                return ERR_PTR(-EINVAL);

        /*
         * Siblings of global init remain as zombies on exit since they are
         * not reaped by their parent (swapper). To solve this and to avoid
         * multi-rooted process trees, prevent global and container-inits
         * from creating siblings.
         */
        if ((clone_flags & CLONE_PARENT) &&
                                current->signal->flags & SIGNAL_UNKILLABLE)
                return ERR_PTR(-EINVAL);

        /*
         * If the new process will be in a different pid or user namespace
         * do not allow it to share a thread group with the forking task.
         */
        if (clone_flags & CLONE_THREAD) {
                if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
                    (task_active_pid_ns(current) != nsp->pid_ns_for_children))
                        return ERR_PTR(-EINVAL);
        }

        if (clone_flags & CLONE_PIDFD) {
                /*
                 * - CLONE_DETACHED is blocked so that we can potentially
                 *   reuse it later for CLONE_PIDFD.
                 */
                if (clone_flags & CLONE_DETACHED)
                        return ERR_PTR(-EINVAL);
        }

        /*
         * Force any signals received before this point to be delivered
         * before the fork happens.  Collect up signals sent to multiple
         * processes that happen during the fork and delay them so that
         * they appear to happen after the fork.
         */
        sigemptyset(&delayed.signal);
        INIT_HLIST_NODE(&delayed.node);

        spin_lock_irq(&current->sighand->siglock);
        if (!(clone_flags & CLONE_THREAD))
                hlist_add_head(&delayed.node, &current->signal->multiprocess);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
        retval = -ERESTARTNOINTR;
        if (task_sigpending(current))
                goto fork_out;

        retval = -ENOMEM;
        p = dup_task_struct(current, node);
        if (!p)
                goto fork_out;
        p->flags &= ~PF_KTHREAD;
        if (args->kthread)
                p->flags |= PF_KTHREAD;
        if (args->user_worker) {
                /*
                 * Mark us a user worker, and block any signal that isn't
                 * fatal or STOP
                 */
                p->flags |= PF_USER_WORKER;
                siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
        }
        if (args->io_thread)
                p->flags |= PF_IO_WORKER;

        if (args->name)
                strscpy_pad(p->comm, args->name, sizeof(p->comm));

        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
        /*
         * Clear TID on mm_release()?
         */
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;

        ftrace_graph_init_task(p);

        rt_mutex_init_task(p);

        lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
        retval = copy_creds(p, clone_flags);
        if (retval < 0)
                goto bad_fork_free;

        retval = -EAGAIN;
        if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                if (p->real_cred->user != INIT_USER &&
                    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                        goto bad_fork_cleanup_count;
        }
        current->flags &= ~PF_NPROC_EXCEEDED;

        /*
         * If multiple threads are within copy_process(), then this check
         * triggers too late. This doesn't hurt, the check is only there
         * to stop root fork bombs.
         */
        retval = -EAGAIN;
        if (data_race(nr_threads >= max_threads))
                goto bad_fork_cleanup_count;

        delayacct_tsk_init(p);        /* Must remain after dup_task_struct() */
        p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
        p->flags |= PF_FORKNOEXEC;
        INIT_LIST_HEAD(&p->children);
        INIT_LIST_HEAD(&p->sibling);
        rcu_copy_process(p);
        p->vfork_done = NULL;
        spin_lock_init(&p->alloc_lock);

        init_sigpending(&p->pending);

        p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
        p->utimescaled = p->stimescaled = 0;
#endif
        prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        seqcount_init(&p->vtime.seqcount);
        p->vtime.starttime = 0;
        p->vtime.state = VTIME_INACTIVE;
#endif

#ifdef CONFIG_IO_URING
        p->io_uring = NULL;
#endif

        p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_PSI
        p->psi_flags = 0;
#endif

        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);

        posix_cputimers_init(&p->posix_cputimers);

        p->io_context = NULL;
        audit_set_context(p, NULL);
        cgroup_fork(p);
        if (args->kthread) {
                if (!set_kthread_struct(p))
                        goto bad_fork_cleanup_delayacct;
        }
#ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
        if (IS_ERR(p->mempolicy)) {
                retval = PTR_ERR(p->mempolicy);
                p->mempolicy = NULL;
                goto bad_fork_cleanup_delayacct;
        }
#endif
#ifdef CONFIG_CPUSETS
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
        seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
        memset(&p->irqtrace, 0, sizeof(p->irqtrace));
        p->irqtrace.hardirq_disable_ip        = _THIS_IP_;
        p->irqtrace.softirq_enable_ip        = _THIS_IP_;
        p->softirqs_enabled                = 1;
        p->softirq_context                = 0;
#endif

        p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
        lockdep_init_task(p);
#endif

#ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
        p->sequential_io        = 0;
        p->sequential_io_avg        = 0;
#endif
#ifdef CONFIG_BPF_SYSCALL
        RCU_INIT_POINTER(p->bpf_storage, NULL);
        p->bpf_ctx = NULL;
#endif

        /* Perform scheduler related setup. Assign this task to a CPU. */
        retval = sched_fork(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_policy;

        retval = perf_event_init_task(p, clone_flags);
        if (retval)
                goto bad_fork_cleanup_policy;
        retval = audit_alloc(p);
        if (retval)
                goto bad_fork_cleanup_perf;
        /* copy all the process information */
        shm_init_task(p);
        retval = security_task_alloc(p, clone_flags);
        if (retval)
                goto bad_fork_cleanup_audit;
        retval = copy_semundo(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_security;
        retval = copy_files(clone_flags, p, args->no_files);
        if (retval)
                goto bad_fork_cleanup_semundo;
        retval = copy_fs(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_files;
        retval = copy_sighand(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_fs;
        retval = copy_signal(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_sighand;
        retval = copy_mm(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_signal;
        retval = copy_namespaces(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_mm;
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
        retval = copy_thread(p, args);
        if (retval)
                goto bad_fork_cleanup_io;

        stackleak_task_init(p);

        if (pid != &init_struct_pid) {
                pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
                                args->set_tid_size);
                if (IS_ERR(pid)) {
                        retval = PTR_ERR(pid);
                        goto bad_fork_cleanup_thread;
                }
        }

        /*
         * This has to happen after we've potentially unshared the file
         * descriptor table (so that the pidfd doesn't leak into the child
         * if the fd table isn't shared).
         */
        if (clone_flags & CLONE_PIDFD) {
                int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;

                /* Note that no task has been attached to @pid yet. */
                retval = __pidfd_prepare(pid, flags, &pidfile);
                if (retval < 0)
                        goto bad_fork_free_pid;
                pidfd = retval;

                retval = put_user(pidfd, args->pidfd);
                if (retval)
                        goto bad_fork_put_pidfd;
        }

#ifdef CONFIG_BLOCK
        p->plug = NULL;
#endif
        futex_init_task(p);

        /*
         * sigaltstack should be cleared when sharing the same VM
         */
        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
                sas_ss_reset(p);

        /*
         * Syscall tracing and stepping should be turned off in the
         * child regardless of CLONE_PTRACE.
         */
        user_disable_single_step(p);
        clear_task_syscall_work(p, SYSCALL_TRACE);
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
        clear_task_syscall_work(p, SYSCALL_EMU);
#endif
        clear_tsk_latency_tracing(p);

        /* ok, now we should be set up.. */
        p->pid = pid_nr(pid);
        if (clone_flags & CLONE_THREAD) {
                p->group_leader = current->group_leader;
                p->tgid = current->tgid;
        } else {
                p->group_leader = p;
                p->tgid = p->pid;
        }

        p->nr_dirtied = 0;
        p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
        p->dirty_paused_when = 0;

        p->pdeath_signal = 0;
        p->task_works = NULL;
        clear_posix_cputimers_work(p);

#ifdef CONFIG_KRETPROBES
        p->kretprobe_instances.first = NULL;
#endif
#ifdef CONFIG_RETHOOK
        p->rethooks.first = NULL;
#endif

        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
         * forked. It should be noted that the new process's css_set can be changed
         * between here and cgroup_post_fork() if an organisation operation is in
         * progress.
         */
        retval = cgroup_can_fork(p, args);
        if (retval)
                goto bad_fork_put_pidfd;

        /*
         * Now that the cgroups are pinned, re-clone the parent cgroup and put
         * the new task on the correct runqueue. All this *before* the task
         * becomes visible.
         *
         * This isn't part of ->can_fork() because while the re-cloning is
         * cgroup specific, it unconditionally needs to place the task on a
         * runqueue.
         */
        sched_cgroup_fork(p, args);

        /*
         * From this point on we must avoid any synchronous user-space
         * communication until we take the tasklist-lock. In particular, we do
         * not want user-space to be able to predict the process start-time by
         * stalling fork(2) after we recorded the start_time but before it is
         * visible to the system.
         */

        p->start_time = ktime_get_ns();
        p->start_boottime = ktime_get_boottime_ns();

        /*
         * Make it visible to the rest of the system, but dont wake it up yet.
         * Need tasklist lock for parent etc handling!
         */
        write_lock_irq(&tasklist_lock);

        /* CLONE_PARENT re-uses the old parent */
        if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
                p->real_parent = current->real_parent;
                p->parent_exec_id = current->parent_exec_id;
                if (clone_flags & CLONE_THREAD)
                        p->exit_signal = -1;
                else
                        p->exit_signal = current->group_leader->exit_signal;
        } else {
                p->real_parent = current;
                p->parent_exec_id = current->self_exec_id;
                p->exit_signal = args->exit_signal;
        }

        klp_copy_process(p);

        sched_core_fork(p);

        spin_lock(&current->sighand->siglock);

        rv_task_fork(p);

        rseq_fork(p, clone_flags);

        /* Don't start children in a dying pid namespace */
        if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
                retval = -ENOMEM;
                goto bad_fork_cancel_cgroup;
        }

        /* Let kill terminate clone/fork in the middle */
        if (fatal_signal_pending(current)) {
                retval = -EINTR;
                goto bad_fork_cancel_cgroup;
        }

        /* No more failure paths after this point. */

        /*
         * Copy seccomp details explicitly here, in case they were changed
         * before holding sighand lock.
         */
        copy_seccomp(p);

        init_task_pid_links(p);
        if (likely(p->pid)) {
                ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

                init_task_pid(p, PIDTYPE_PID, pid);
                if (thread_group_leader(p)) {
                        init_task_pid(p, PIDTYPE_TGID, pid);
                        init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
                        init_task_pid(p, PIDTYPE_SID, task_session(current));

                        if (is_child_reaper(pid)) {
                                ns_of_pid(pid)->child_reaper = p;
                                p->signal->flags |= SIGNAL_UNKILLABLE;
                        }
                        p->signal->shared_pending.signal = delayed.signal;
                        p->signal->tty = tty_kref_get(current->signal->tty);
                        /*
                         * Inherit has_child_subreaper flag under the same
                         * tasklist_lock with adding child to the process tree
                         * for propagate_has_child_subreaper optimization.
                         */
                        p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
                                                         p->real_parent->signal->is_child_subreaper;
                        list_add_tail(&p->sibling, &p->real_parent->children);
                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
                        attach_pid(p, PIDTYPE_TGID);
                        attach_pid(p, PIDTYPE_PGID);
                        attach_pid(p, PIDTYPE_SID);
                        __this_cpu_inc(process_counts);
                } else {
                        current->signal->nr_threads++;
                        current->signal->quick_threads++;
                        atomic_inc(&current->signal->live);
                        refcount_inc(&current->signal->sigcnt);
                        task_join_group_stop(p);
                        list_add_tail_rcu(&p->thread_node,
                                          &p->signal->thread_head);
                }
                attach_pid(p, PIDTYPE_PID);
                nr_threads++;
        }
        total_forks++;
        hlist_del_init(&delayed.node);
        spin_unlock(&current->sighand->siglock);
        syscall_tracepoint_update(p);
        write_unlock_irq(&tasklist_lock);

        if (pidfile)
                fd_install(pidfd, pidfile);

        proc_fork_connector(p);
        sched_post_fork(p);
        cgroup_post_fork(p, args);
        perf_event_fork(p);

        trace_task_newtask(p, clone_flags);
        uprobe_copy_process(p, clone_flags);
        user_events_fork(p, clone_flags);

        copy_oom_score_adj(clone_flags, p);

        return p;

bad_fork_cancel_cgroup:
        sched_core_free(p);
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
        if (clone_flags & CLONE_PIDFD) {
                fput(pidfile);
                put_unused_fd(pidfd);
        }
bad_fork_free_pid:
        if (pid != &init_struct_pid)
                free_pid(pid);
bad_fork_cleanup_thread:
        exit_thread(p);
bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
bad_fork_cleanup_namespaces:
        exit_task_namespaces(p);
bad_fork_cleanup_mm:
        if (p->mm) {
                mm_clear_owner(p->mm, p);
                mmput(p->mm);
        }
bad_fork_cleanup_signal:
        if (!(clone_flags & CLONE_THREAD))
                free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
        __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
        exit_sem(p);
bad_fork_cleanup_security:
        security_task_free(p);
bad_fork_cleanup_audit:
        audit_free(p);
bad_fork_cleanup_perf:
        perf_event_free_task(p);
bad_fork_cleanup_policy:
        lockdep_free_task(p);
#ifdef CONFIG_NUMA
        mpol_put(p->mempolicy);
#endif
bad_fork_cleanup_delayacct:
        delayacct_tsk_free(p);
bad_fork_cleanup_count:
        dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        exit_creds(p);
bad_fork_free:
        WRITE_ONCE(p->__state, TASK_DEAD);
        exit_task_stack_account(p);
        put_task_stack(p);
        delayed_free_task(p);
fork_out:
        spin_lock_irq(&current->sighand->siglock);
        hlist_del_init(&delayed.node);
        spin_unlock_irq(&current->sighand->siglock);
        return ERR_PTR(retval);
}

static inline void init_idle_pids(struct task_struct *idle)
{
        enum pid_type type;

        for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
                INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
                init_task_pid(idle, type, &init_struct_pid);
        }
}

/*
 * Placeholder thread function that fork_idle() stores in its clone
 * arguments. It is never called; it ignores @dummy and returns 0.
 */
static int idle_dummy(void *dummy)
{
        (void)dummy;

        return 0;
}

/*
 * Create a task for @cpu through copy_process(), using init_struct_pid and
 * the memory node reported by cpu_to_node(@cpu). The clone arguments mark
 * the task as both a kthread and an idle task.
 *
 * Returns the new task, or the error pointer produced by copy_process().
 * On success the task is passed to init_idle_pids() and init_idle() before
 * it is returned.
 */
struct task_struct * __init fork_idle(int cpu)
{
        struct kernel_clone_args args = {
                .idle           = 1,
                .kthread        = 1,
                .fn             = &idle_dummy,
                .fn_arg         = NULL,
                .flags          = CLONE_VM,
        };
        struct task_struct *task;

        task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
        if (IS_ERR(task))
                return task;

        init_idle_pids(task);
        init_idle(task, cpu);

        return task;
}

/*
 * A shaved-down relative of kernel_clone(), tailored to creating io_uring
 * workers.
 *
 * @fn:   function stored in the clone arguments for the new task
 * @arg:  argument stored alongside @fn
 * @node: node passed through to copy_process()
 *
 * Returns the created task or an error pointer. The returned task is
 * inactive; the caller must start it with wake_up_new_task(p). All signals
 * are blocked in the created task.
 */
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
        unsigned long io_flags = CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
                                 CLONE_THREAD | CLONE_IO;
        struct kernel_clone_args args = {
                .io_thread      = 1,
                .user_worker    = 1,
                .fn             = fn,
                .fn_arg         = arg,
                /* The CSIGNAL bits travel in exit_signal, not in flags. */
                .exit_signal    = (lower_32_bits(io_flags) & CSIGNAL),
                .flags          = ((lower_32_bits(io_flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
        };

        return copy_process(NULL, 0, node, &args);
}

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 *
 * Returns the new task's pid number as computed by pid_vnr() on success.
 * On failure it returns -EINVAL when CLONE_PIDFD and CLONE_PARENT_SETTID
 * are both set and point at the same user address, or the error encoded
 * in the pointer returned by copy_process().
 */
pid_t kernel_clone(struct kernel_clone_args *args)
{
        u64 clone_flags = args->flags;
        struct completion vfork;
        struct pid *pid;
        struct task_struct *p;
        int trace = 0;
        pid_t nr;

        /*
         * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
         * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
         * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
         * field in struct clone_args and it still doesn't make sense to have
         * them both point at the same memory location. Performing this check
         * here has the advantage that we don't need to have a separate helper
         * to check for legacy clone().
         */
        if ((clone_flags & CLONE_PIDFD) &&
            (clone_flags & CLONE_PARENT_SETTID) &&
            (args->pidfd == args->parent_tid))
                return -EINVAL;

        /*
         * Determine whether and which event to report to ptracer.  When
         * called from kernel_thread or CLONE_UNTRACED is explicitly
         * requested, no event is reported; otherwise, report if the event
         * for the type of forking is enabled.
         */
        if (!(clone_flags & CLONE_UNTRACED)) {
                if (clone_flags & CLONE_VFORK)
                        trace = PTRACE_EVENT_VFORK;
                else if (args->exit_signal != SIGCHLD)
                        trace = PTRACE_EVENT_CLONE;
                else
                        trace = PTRACE_EVENT_FORK;

                /* Drop the event again unless it is enabled for current. */
                if (likely(!ptrace_event_enabled(current, trace)))
                        trace = 0;
        }

        /* add_latent_entropy() runs whether or not copy_process() succeeded. */
        p = copy_process(NULL, trace, NUMA_NO_NODE, args);
        add_latent_entropy();

        if (IS_ERR(p))
                return PTR_ERR(p);

        /*
         * Do this prior waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
         */
        trace_sched_process_fork(current, p);

        /* Paired with the put_pid() at the end of this function. */
        pid = get_task_pid(p, PIDTYPE_PID);
        nr = pid_vnr(pid);

        /* The result of put_user() is not checked here. */
        if (clone_flags & CLONE_PARENT_SETTID)
                put_user(nr, args->parent_tid);

        /*
         * The vfork completion is set up before the child is woken below.
         * NOTE(review): the task reference taken by get_task_struct() is
         * not released in this function; presumably wait_for_vfork_done()
         * drops it - confirm.
         */
        if (clone_flags & CLONE_VFORK) {
                p->vfork_done = &vfork;
                init_completion(&vfork);
                get_task_struct(p);
        }

        /* Only done when the child did not share the parent's VM. */
        if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
                /* lock the task to synchronize with memcg migration */
                task_lock(p);
                lru_gen_add_mm(p->mm);
                task_unlock(p);
        }

        wake_up_new_task(p);

        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
                ptrace_event_pid(trace, pid);

        /*
         * PTRACE_EVENT_VFORK_DONE is reported only when
         * wait_for_vfork_done() returns zero.
         */
        if (clone_flags & CLONE_VFORK) {
                if (!wait_for_vfork_done(p, &vfork))
                        ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
        }

        put_pid(pid);
        return nr;
}

/*
 * Create a kernel thread.
 *
 * @fn:    thread function stored in the clone arguments
 * @arg:   argument stored alongside @fn
 * @name:  name stored in the clone arguments
 * @flags: clone flags; only the lower 32 bits are used. The CSIGNAL bits
 *         become the exit signal, and CLONE_VM and CLONE_UNTRACED are
 *         always added.
 *
 * Returns whatever kernel_clone() returns.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
                    unsigned long flags)
{
        struct kernel_clone_args args = {
                .kthread        = 1,
                .name           = name,
                .fn             = fn,
                .fn_arg         = arg,
                .exit_signal    = (lower_32_bits(flags) & CSIGNAL),
                .flags          = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
        };

        return kernel_clone(&args);
}

/*
 * Create a user mode thread.
 *
 * @fn:    thread function stored in the clone arguments
 * @arg:   argument stored alongside @fn
 * @flags: clone flags; only the lower 32 bits are used. The CSIGNAL bits
 *         become the exit signal, and CLONE_VM and CLONE_UNTRACED are
 *         always added.
 *
 * Unlike kernel_thread(), the clone arguments set neither a name nor the
 * kthread marker. Returns whatever kernel_clone() returns.
 */
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
        struct kernel_clone_args args = {
                .fn             = fn,
                .fn_arg         = arg,
                .exit_signal    = (lower_32_bits(flags) & CSIGNAL),
                .flags          = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
        };

        return kernel_clone(&args);
}

#ifdef __ARCH_WANT_SYS_FORK
/*
 * fork(): clone with no flags set and SIGCHLD as the exit signal.
 * Returns -EINVAL when built without CONFIG_MMU.
 */
SYSCALL_DEFINE0(fork)
{
#ifndef CONFIG_MMU
        /* can not support in nommu mode */
        return -EINVAL;
#else
        struct kernel_clone_args args = {
                .exit_signal = SIGCHLD,
        };

        return kernel_clone(&args);
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
/*
 * vfork(): clone with CLONE_VM and CLONE_VFORK set and SIGCHLD as the exit
 * signal.
 */
SYSCALL_DEFINE0(vfork)
{
        struct kernel_clone_args args = {
                .exit_signal    = SIGCHLD,
                .flags          = CLONE_VM | CLONE_VFORK,
        };

        return kernel_clone(&args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE
/*
 * Legacy clone(). The argument order depends on which CONFIG_CLONE_BACKWARDS*
 * option is selected; the body is shared by all variants.
 */
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 unsigned long, tls,
                 int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
                 unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
                int, stack_size,
                int __user *, parent_tidptr,
                int __user *, child_tidptr,
                unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
                 unsigned long, tls)
#endif
{
        struct kernel_clone_args args = {
                /*
                 * Only the lower 32 bits of clone_flags are used: the
                 * CSIGNAL bits become the exit signal, the rest the flags.
                 */
                .exit_signal    = (lower_32_bits(clone_flags) & CSIGNAL),
                .flags          = (lower_32_bits(clone_flags) & ~CSIGNAL),
                .stack          = newsp,
                .tls            = tls,
                .child_tid      = child_tidptr,
                /* pidfd and parent_tid are both fed from parent_tidptr. */
                .parent_tid     = parent_tidptr,
                .pidfd          = parent_tidptr,
        };

        return kernel_clone(&args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE3

/*
 * Copy a userspace struct clone_args into @kargs and sanity-check it.
 *
 * @kargs: destination; on entry kargs->set_tid must point at the buffer that
 *         receives the set_tid array. @kargs is overwritten wholesale.
 * @uargs: userspace argument structure
 * @usize: size of @uargs as claimed by userspace
 *
 * Returns 0 on success, -E2BIG if @usize exceeds PAGE_SIZE, -EINVAL on
 * inconsistent arguments, -EFAULT if the set_tid array cannot be copied,
 * or the error returned by copy_struct_from_user().
 */
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
                                              struct clone_args __user *uargs,
                                              size_t usize)
{
        /* Saved up front: the struct assignment below resets kargs->set_tid. */
        pid_t *tid_buf = kargs->set_tid;
        struct clone_args uc;
        int ret;

        /* The struct layout has to match the published size versions. */
        BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
                     CLONE_ARGS_SIZE_VER0);
        BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
                     CLONE_ARGS_SIZE_VER1);
        BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
                     CLONE_ARGS_SIZE_VER2);
        BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);

        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;
        if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
                return -EINVAL;

        ret = copy_struct_from_user(&uc, sizeof(uc), uargs, usize);
        if (ret)
                return ret;

        if (unlikely(uc.set_tid_size > MAX_PID_NS_LEVEL))
                return -EINVAL;

        /* set_tid and set_tid_size must be both zero or both non-zero. */
        if (unlikely(!uc.set_tid != !uc.set_tid_size))
                return -EINVAL;

        /* The higher 32 bits of exit_signal must be unset ... */
        if (unlikely(uc.exit_signal & ~((u64)CSIGNAL)))
                return -EINVAL;
        /* ... and it must be a valid signal. */
        if (unlikely(!valid_signal(uc.exit_signal)))
                return -EINVAL;

        if (uc.flags & CLONE_INTO_CGROUP) {
                if (uc.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)
                        return -EINVAL;
        }

        *kargs = (struct kernel_clone_args){
                .flags          = uc.flags,
                .exit_signal    = uc.exit_signal,
                .pidfd          = u64_to_user_ptr(uc.pidfd),
                .child_tid      = u64_to_user_ptr(uc.child_tid),
                .parent_tid     = u64_to_user_ptr(uc.parent_tid),
                .stack          = uc.stack,
                .stack_size     = uc.stack_size,
                .tls            = uc.tls,
                .set_tid_size   = uc.set_tid_size,
                .cgroup         = uc.cgroup,
        };

        if (uc.set_tid) {
                if (copy_from_user(tid_buf, u64_to_user_ptr(uc.set_tid),
                                   (kargs->set_tid_size * sizeof(pid_t))))
                        return -EFAULT;
        }

        kargs->set_tid = tid_buf;

        return 0;
}

/**
 * clone3_stack_valid - check and prepare stack
 * @kargs: kernel clone args
 *
 * Verify that the stack arguments userspace gave us are sane: stack and
 * stack_size must either both be zero or both be non-zero, and a non-zero
 * range has to pass access_ok().
 *
 * In addition, set the stack direction for userspace since it's easy for us
 * to determine: unless CONFIG_STACK_GROWSUP is defined, @kargs->stack is
 * advanced by @kargs->stack_size.
 *
 * Return: true if the stack arguments are acceptable, false otherwise.
 */
static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
{
        /* No stack given: then no size may be given either. */
        if (kargs->stack == 0)
                return !(kargs->stack_size > 0);

        if (kargs->stack_size == 0)
                return false;

        if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
                return false;

#ifndef CONFIG_STACK_GROWSUP
        kargs->stack += kargs->stack_size;
#endif

        return true;
}

/*
 * Validate the flag combination and stack arguments of a clone3() request.
 * Returns true when @kargs is acceptable, false otherwise. The final stack
 * check may adjust @kargs->stack (see clone3_stack_valid()).
 */
static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
        /* Verify that no unknown flags are passed along. */
        if (kargs->flags &
            ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
                return false;

        /* Make the CLONE_DETACHED bit reusable for clone3. */
        if (kargs->flags & CLONE_DETACHED)
                return false;

        /* Make the CSIGNAL bits (other than CLONE_NEWTIME) reusable too. */
        if (kargs->flags & (CSIGNAL & (~CLONE_NEWTIME)))
                return false;

        /* CLONE_SIGHAND and CLONE_CLEAR_SIGHAND cannot be combined. */
        if ((kargs->flags & CLONE_SIGHAND) &&
            (kargs->flags & CLONE_CLEAR_SIGHAND))
                return false;

        /* No exit signal together with CLONE_THREAD or CLONE_PARENT. */
        if (kargs->flags & (CLONE_THREAD | CLONE_PARENT)) {
                if (kargs->exit_signal)
                        return false;
        }

        return clone3_stack_valid(kargs);
}

/**
 * sys_clone3 - create a new process with specific properties
 * @uargs: argument structure
 * @size:  size of @uargs
 *
 * clone3() is the extensible successor to clone()/clone2().
 * It takes a struct as argument that is versioned by its size.
 *
 * Return: On success, a positive PID for the child process.
 *         On error, a negative errno number.
 */
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
{
        /* On-stack buffer that receives the set_tid array copied from userspace. */
        pid_t set_tid[MAX_PID_NS_LEVEL];
        struct kernel_clone_args kargs;
        int ret;

        kargs.set_tid = set_tid;

        ret = copy_clone_args_from_user(&kargs, uargs, size);
        if (ret)
                return ret;

        return clone3_args_valid(&kargs) ? kernel_clone(&kargs) : -EINVAL;
}
#endif

/*
 * Walk the children of @top's thread group, calling @visitor on each child
 * with @data, while holding tasklist_lock for reading.
 *
 * The return value of @visitor steers the walk:
 *   < 0  stop the whole walk,
 *   > 0  descend into the children of the child just visited,
 *   0    continue with the next sibling.
 *
 * The walk is iterative: "down" restarts the loops one level deeper, and
 * "up" jumps back into the loop bodies to resume after a finished subtree.
 */
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
        struct task_struct *leader, *parent, *child;
        int res;

        read_lock(&tasklist_lock);
        /* Always start from the group leader of @top. */
        leader = top = top->group_leader;
down:
        /* Visit the children of every thread in leader's thread group. */
        for_each_thread(leader, parent) {
                list_for_each_entry(child, &parent->children, sibling) {
                        res = visitor(child, data);
                        if (res) {
                                if (res < 0)
                                        goto out;
                                /* Descend: child becomes the new leader. */
                                leader = child;
                                goto down;
                        }
up:
                        /*
                         * Re-entry point when coming back up: child, parent
                         * and leader were restored below, so both loops
                         * carry on from the next sibling.
                         */
                        ;
                }
        }

        /*
         * This level is exhausted. Unless we are back at the top, step up
         * one level through real_parent and resume inside the loops.
         */
        if (leader != top) {
                child = leader;
                parent = child->real_parent;
                leader = parent->group_leader;
                goto up;
        }
out:
        read_unlock(&tasklist_lock);
}

/*
 * Fallback of 0 when nothing earlier has defined ARCH_MIN_MMSTRUCT_ALIGN.
 * The value is passed as the alignment argument when mm_cache_init()
 * creates the "mm_struct" cache.
 */
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif

/*
 * Constructor passed to kmem_cache_create() for "sighand_cache" in
 * proc_caches_init(): initializes the siglock spinlock and the signalfd
 * wait queue head of the sighand_struct at @data.
 *
 * NOTE(review): the two init helpers may derive debug names from the
 * spelling of their argument - confirm before renaming the local.
 */
static void sighand_ctor(void *data)
{
        struct sighand_struct *sighand = data;

        spin_lock_init(&sighand->siglock);
        init_waitqueue_head(&sighand->signalfd_wqh);
}

/*
 * Create the "mm_struct" slab cache and store it in mm_cachep.
 */
void __init mm_cache_init(void)
{
        /*
         * The mm_cpumask is located at the end of mm_struct, and is
         * dynamically sized based on the maximum CPU number this system
         * can have, taking hotplug into account (nr_cpu_ids). The object
         * size therefore is the struct itself plus cpumask_size() plus
         * mm_cid_size().
         */
        unsigned int size = sizeof(struct mm_struct) + cpumask_size() +
                            mm_cid_size();

        /* The saved_auxv field's offset and size give the usercopy region. */
        mm_cachep = kmem_cache_create_usercopy("mm_struct", size,
                        ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
                        offsetof(struct mm_struct, saved_auxv),
                        sizeof_field(struct mm_struct, saved_auxv),
                        NULL);
}

/*
 * Create the slab caches for sighand, signal, files, fs and vm_area
 * structures (plus the per-VMA lock cache under CONFIG_PER_VMA_LOCK), then
 * call mmap_init() and nsproxy_cache_init(). Every cache is created with
 * SLAB_PANIC.
 */
void __init proc_caches_init(void)
{
        /* The only cache here with a constructor and SLAB_TYPESAFE_BY_RCU. */
        sighand_cachep = kmem_cache_create("sighand_cache",
                                           sizeof(struct sighand_struct), 0,
                                           SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                           SLAB_TYPESAFE_BY_RCU | SLAB_ACCOUNT,
                                           sighand_ctor);

        signal_cachep = kmem_cache_create("signal_cache",
                                          sizeof(struct signal_struct), 0,
                                          SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                          SLAB_ACCOUNT,
                                          NULL);

        files_cachep = kmem_cache_create("files_cache",
                                         sizeof(struct files_struct), 0,
                                         SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                         SLAB_ACCOUNT,
                                         NULL);

        fs_cachep = kmem_cache_create("fs_cache",
                                      sizeof(struct fs_struct), 0,
                                      SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                      SLAB_ACCOUNT,
                                      NULL);

        vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC | SLAB_ACCOUNT);
#ifdef CONFIG_PER_VMA_LOCK
        vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC | SLAB_ACCOUNT);
#endif

        mmap_init();
        nsproxy_cache_init();
}

/*
 * Check constraints on flags passed to the unshare system call.
 *
 * Returns 0 when @unshare_flags is acceptable for the current task and
 * -EINVAL otherwise.
 */
static int check_unshare_flags(unsigned long unshare_flags)
{
        /* Reject any flag outside the supported set. */
        if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
                                CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
                                CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
                                CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
                                CLONE_NEWTIME))
                return -EINVAL;

        /*
         * Not implemented, but pretend it works if there is nothing
         * to unshare.  Note that unsharing the address space or the
         * signal handlers also need to unshare the signal queues (aka
         * CLONE_THREAD).
         */
        if ((unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) &&
            !thread_group_empty(current))
                return -EINVAL;

        /* The sighand must not have more than one reference. */
        if ((unshare_flags & (CLONE_SIGHAND | CLONE_VM)) &&
            refcount_read(&current->sighand->count) > 1)
                return -EINVAL;

        if ((unshare_flags & CLONE_VM) && !current_is_single_threaded())
                return -EINVAL;

        return 0;
}

/*
 * Unshare the filesystem structure if it is being shared.
 *
 * When CLONE_FS is requested, the current task has an fs_struct, and its
 * users count is not 1, a copy is made and stored in *@new_fsp. Otherwise
 * *@new_fsp is left untouched.
 *
 * Returns 0 on success or when nothing had to be done, and -ENOMEM if the
 * copy could not be made.
 */
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
        struct fs_struct *cur_fs = current->fs;

        /*
         * The users check needs no lock; in the worst case we'll do a
         * useless copy.
         */
        if ((unshare_flags & CLONE_FS) && cur_fs && cur_fs->users != 1) {
                *new_fsp = copy_fs_struct(cur_fs);
                if (!*new_fsp)
                        return -ENOMEM;
        }

        return 0;
}

/*
 * Unshare file descriptor table if it is being shared
 */
int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
               struct files_struct **new_fdp)
{
        struct files_struct *fd = current->files;
        int error = 0;

        if ((unshare_flags & CLONE_FILES) &&
            (fd && atomic_read(&fd->count) > 1)) {
                *new_fdp = dup_fd(fd, max_fds, &error);
                if (!*new_fdp)
                        return error;
        }

        return 0;
}

/*
 * unshare allows a process to 'unshare' part of the process
 * context which was originally shared using clone.  copy_*
 * functions used by kernel_clone() cannot be used here directly
 * because they modify an inactive task_struct that is being
 * constructed. Here we are modifying the current, active,
 * task_struct.
 *
 * The function works in two phases.  First every replacement object
 * (fs_struct, files_struct, cred, nsproxy) is prepared; a failure there
 * jumps to the matching bad_unshare_* label, which releases whatever was
 * already allocated.  Only when all preparations succeeded are the new
 * objects installed into current.
 *
 * Returns 0 on success, otherwise the error code of the helper that failed.
 */
int ksys_unshare(unsigned long unshare_flags)
{
        struct fs_struct *fs, *new_fs = NULL;
        struct files_struct *new_fd = NULL;
        struct cred *new_cred = NULL;
        struct nsproxy *new_nsproxy = NULL;
        int do_sysvsem = 0;
        int err;

        /*
         * If unsharing a user namespace must also unshare the thread group
         * and unshare the filesystem root and working directories.
         */
        if (unshare_flags & CLONE_NEWUSER)
                unshare_flags |= CLONE_THREAD | CLONE_FS;
        /*
         * If unsharing vm, must also unshare signal handlers.
         */
        if (unshare_flags & CLONE_VM)
                unshare_flags |= CLONE_SIGHAND;
        /*
         * If unsharing signal handlers, must also unshare the signal queues.
         */
        if (unshare_flags & CLONE_SIGHAND)
                unshare_flags |= CLONE_THREAD;
        /*
         * If unsharing namespace, must also unshare filesystem information.
         */
        if (unshare_flags & CLONE_NEWNS)
                unshare_flags |= CLONE_FS;

        /* Validate the flag set after the implied flags have been added. */
        err = check_unshare_flags(unshare_flags);
        if (err)
                goto bad_unshare_out;
        /*
         * CLONE_NEWIPC must also detach from the undolist: after switching
         * to a new ipc namespace, the semaphore arrays from the old
         * namespace are unreachable.
         */
        if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
                do_sysvsem = 1;

        /*
         * Preparation phase: each helper leaves its output pointer NULL when
         * nothing needs replacing.
         */
        err = unshare_fs(unshare_flags, &new_fs);
        if (err)
                goto bad_unshare_out;
        err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
        if (err)
                goto bad_unshare_cleanup_fs;
        err = unshare_userns(unshare_flags, &new_cred);
        if (err)
                goto bad_unshare_cleanup_fd;
        err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
                                         new_cred, new_fs);
        if (err)
                goto bad_unshare_cleanup_cred;

        if (new_cred) {
                err = set_cred_ucounts(new_cred);
                if (err)
                        goto bad_unshare_cleanup_cred;
        }

        /* Commit phase: only entered when there is something to install. */
        if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
                if (do_sysvsem) {
                        /*
                         * CLONE_SYSVSEM is equivalent to sys_exit().
                         */
                        exit_sem(current);
                }
                if (unshare_flags & CLONE_NEWIPC) {
                        /* Orphan segments in old ns (see sem above). */
                        exit_shm(current);
                        shm_init_task(current);
                }

                if (new_nsproxy)
                        switch_task_namespaces(current, new_nsproxy);

                task_lock(current);

                if (new_fs) {
                        /*
                         * Install the copy and drop current's use of the old
                         * fs_struct.  If that was the last user, park the old
                         * structure in new_fs so the cleanup below frees it;
                         * otherwise clear new_fs so nothing is freed.
                         */
                        fs = current->fs;
                        spin_lock(&fs->lock);
                        current->fs = new_fs;
                        if (--fs->users)
                                new_fs = NULL;
                        else
                                new_fs = fs;
                        spin_unlock(&fs->lock);
                }

                /*
                 * After the swap new_fd refers to the old table, which is
                 * released at bad_unshare_cleanup_fd.
                 */
                if (new_fd)
                        swap(current->files, new_fd);

                task_unlock(current);

                if (new_cred) {
                        /* Install the new user namespace */
                        commit_creds(new_cred);
                        /* Cleared so the cleanup path does not put it. */
                        new_cred = NULL;
                }
        }

        perf_event_namespaces(current);

        /*
         * The success path falls through the labels below as well; by then
         * each pointer is either NULL or refers to the object being retired.
         */
bad_unshare_cleanup_cred:
        if (new_cred)
                put_cred(new_cred);
bad_unshare_cleanup_fd:
        if (new_fd)
                put_files_struct(new_fd);

bad_unshare_cleanup_fs:
        if (new_fs)
                free_fs_struct(new_fs);

bad_unshare_out:
        return err;
}

/* unshare system call entry point: forwards its flags to ksys_unshare(). */
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
        return ksys_unshare(unshare_flags);
}

/*
 * Helper to unshare the files of the current task.
 * We don't want to expose copy_files internals to
 * the exec layer of the kernel.
 *
 * Returns the error from unshare_fd(), or 0 both when a private copy was
 * installed and when no copy was needed.
 */
int unshare_files(void)
{
        struct task_struct *tsk = current;
        struct files_struct *fresh = NULL;
        struct files_struct *stale;
        int err;

        err = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &fresh);
        if (err)
                return err;
        if (!fresh)
                return 0;

        /* Swap in the private table under the task lock, then drop the old one. */
        stale = tsk->files;
        task_lock(tsk);
        tsk->files = fresh;
        task_unlock(tsk);
        put_files_struct(stale);

        return 0;
}

/*
 * sysctl handler for max_threads.
 *
 * proc_dointvec_minmax() is run on a stack copy of @table whose data pointer
 * is redirected to a local initialised from max_threads and whose bounds are
 * 1 and MAX_THREADS.  The local is stored back to max_threads only after a
 * successful write.  Returns the result of proc_dointvec_minmax().
 */
int sysctl_max_threads(struct ctl_table *table, int write,
                       void *buffer, size_t *lenp, loff_t *ppos)
{
        int lo = 1;
        int hi = MAX_THREADS;
        int val = max_threads;
        struct ctl_table shadow = *table;
        int rc;

        shadow.data = &val;
        shadow.extra1 = &lo;
        shadow.extra2 = &hi;

        rc = proc_dointvec_minmax(&shadow, write, buffer, lenp, ppos);
        if (rc)
                return rc;

        if (write)
                max_threads = val;

        return 0;
}












































































































































































































































































































































    2 
   21 


















    5 








    3 














    9 
































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_DCACHE_H
#define __LINUX_DCACHE_H

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/math.h>
#include <linux/rculist.h>
#include <linux/rculist_bl.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/cache.h>
#include <linux/rcupdate.h>
#include <linux/lockref.h>
#include <linux/stringhash.h>
#include <linux/wait.h>

struct path;
struct file;
struct vfsmount;

/*
 * linux/include/linux/dcache.h
 *
 * Dirent cache data structures
 *
 * (C) Copyright 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/* True when @x is its own parent.  Note that the argument is evaluated twice. */
#define IS_ROOT(x) ((x) == (x)->d_parent)

/*
 * The hash is always the low bits of hash_len.
 *
 * HASH_LEN_DECLARE emits the two u32 members in the order that places hash
 * in the low half of the overlaying u64 for the configured byte order.
 *
 * bytemask_from_count(cnt) builds an unsigned long mask with cnt*8 bits set:
 * the low-order bits on little-endian, the high-order bits on big-endian.
 * The shift count is cnt*8, so cnt has to stay below sizeof(unsigned long)
 * for the shift to be well defined.
 */
#ifdef __LITTLE_ENDIAN
 #define HASH_LEN_DECLARE u32 hash; u32 len
 #define bytemask_from_count(cnt)        (~(~0ul << (cnt)*8))
#else
 #define HASH_LEN_DECLARE u32 len; u32 hash
 #define bytemask_from_count(cnt)        (~(~0ul >> (cnt)*8))
#endif

/*
 * "quick string" -- eases parameter passing, but more importantly
 * saves "metadata" about the string (ie length and the hash).
 *
 * hash comes first so it snuggles against d_parent in the
 * dentry.
 *
 * The hash/len pair and hash_len share storage, so both values can be
 * read or written together as one u64.
 */
struct qstr {
        union {
                struct {
                        HASH_LEN_DECLARE;        /* u32 hash and u32 len, endian-ordered */
                };
                u64 hash_len;                        /* both of the above as one word */
        };
        const unsigned char *name;                /* the string the metadata describes */
};

/*
 * Static initializer for a struct qstr with name @n and length @l.  Only
 * .len and .name are named, so the hash member is zero-initialized.
 */
#define QSTR_INIT(n,l) { { { .len = l } }, .name = n }

extern const struct qstr empty_name;
extern const struct qstr slash_name;
extern const struct qstr dotdot_name;

/*
 * Try to keep struct dentry aligned on 64 byte cachelines (this will
 * give reasonable cacheline footprint with larger lines without the
 * large memory footprint increase).
 *
 * DNAME_INLINE_LEN is the size of the d_iname[] array embedded in struct
 * dentry (and of inline_name[] in struct name_snapshot).
 * NOTE(review): the byte counts in the trailing comments presumably give the
 * resulting sizeof(struct dentry) for each configuration -- confirm.
 */
#ifdef CONFIG_64BIT
# define DNAME_INLINE_LEN 40 /* 192 bytes */
#else
# ifdef CONFIG_SMP
#  define DNAME_INLINE_LEN 40 /* 128 bytes */
# else
#  define DNAME_INLINE_LEN 44 /* 128 bytes */
# endif
#endif

/* d_lock is shorthand for the spinlock embedded in the dentry's d_lockref. */
#define d_lock        d_lockref.lock

/*
 * Directory cache entry.  The members are grouped by which lookup path
 * touches them, as the section comments inside the structure indicate.
 */
struct dentry {
        /* RCU lookup touched fields */
        unsigned int d_flags;                /* protected by d_lock */
        seqcount_spinlock_t d_seq;        /* per dentry seqlock */
        struct hlist_bl_node d_hash;        /* lookup hash list */
        struct dentry *d_parent;        /* parent directory */
        struct qstr d_name;                /* name with its length and hash */
        struct inode *d_inode;                /* Where the name belongs to - NULL is
                                         * negative */
        unsigned char d_iname[DNAME_INLINE_LEN];        /* small names */

        /* Ref lookup also touches following */
        struct lockref d_lockref;        /* per-dentry lock and refcount */
        const struct dentry_operations *d_op;        /* callback table, see d_real() */
        struct super_block *d_sb;        /* The root of the dentry tree */
        unsigned long d_time;                /* used by d_revalidate */
        void *d_fsdata;                        /* fs-specific data */

        union {
                struct list_head d_lru;                /* LRU list */
                wait_queue_head_t *d_wait;        /* in-lookup ones only */
        };
        struct hlist_node d_sib;        /* child of parent list */
        struct hlist_head d_children;        /* our children */
        /*
         * d_alias and d_rcu can share memory
         */
        union {
                struct hlist_node d_alias;        /* inode alias list */
                struct hlist_bl_node d_in_lookup_hash;        /* only for in-lookup ones */
                 struct rcu_head d_rcu;        /* presumably used for RCU-deferred freeing -- confirm */
        } d_u;
};

/*
 * dentry->d_lock spinlock nesting subclasses:
 *
 * 0: normal
 * 1: nested
 *
 * NOTE(review): the nested value is presumably what callers pass as the
 * subclass when taking a second d_lock -- confirm against users.
 */
enum dentry_d_lock_class
{
        DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */
        DENTRY_D_LOCK_NESTED
};

/*
 * Selector passed to d_real() and to the ->d_real() callback: which kind of
 * real dentry the caller is asking for (data or metadata).
 */
enum d_real_type {
        D_REAL_DATA,
        D_REAL_METADATA,
};

/*
 * Callback table a dentry refers to through ->d_op.  The structure is
 * cacheline aligned.
 *
 * NOTE(review): the DCACHE_OP_* bits in d_flags presumably record which of
 * these callbacks are provided; within this header only DCACHE_OP_REAL is
 * visibly used that way (see d_real()) -- confirm for the others.
 */
struct dentry_operations {
        int (*d_revalidate)(struct dentry *, unsigned int);
        int (*d_weak_revalidate)(struct dentry *, unsigned int);
        int (*d_hash)(const struct dentry *, struct qstr *);
        int (*d_compare)(const struct dentry *,
                        unsigned int, const char *, const struct qstr *);
        int (*d_delete)(const struct dentry *);
        int (*d_init)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_prune)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
        char *(*d_dname)(struct dentry *, char *, int);
        struct vfsmount *(*d_automount)(struct path *);
        int (*d_manage)(const struct path *, bool);
        /* invoked by d_real() when DCACHE_OP_REAL is set in d_flags */
        struct dentry *(*d_real)(struct dentry *, enum d_real_type type);
} ____cacheline_aligned;

/*
 * Locking rules for dentry_operations callbacks are to be found in
 * Documentation/filesystems/locking.rst. Keep it updated!
 *
 * Further descriptions are found in Documentation/filesystems/vfs.rst.
 * Keep it updated too!
 */

/*
 * d_flags entries
 *
 * Bit assignments for dentry->d_flags.  Bits 20..22 form a small integer
 * field (the entry type) rather than independent flags.  Bits 13, 23, 24,
 * 27 and 31 are not assigned in this header.
 */
#define DCACHE_OP_HASH                        BIT(0)
#define DCACHE_OP_COMPARE                BIT(1)
#define DCACHE_OP_REVALIDATE                BIT(2)
#define DCACHE_OP_DELETE                BIT(3)
#define DCACHE_OP_PRUNE                        BIT(4)

#define        DCACHE_DISCONNECTED                BIT(5)
     /* This dentry is possibly not currently connected to the dcache tree, in
      * which case its parent will either be itself, or will have this flag as
      * well.  nfsd will not use a dentry with this bit set, but will first
      * endeavour to clear the bit either by discovering that it is connected,
      * or by performing lookup operations.   Any filesystem which supports
      * nfsd_operations MUST have a lookup function which, if it finds a
      * directory inode with a DCACHE_DISCONNECTED dentry, will d_move that
      * dentry into place and return that dentry rather than the passed one,
      * typically using d_splice_alias. */

#define DCACHE_REFERENCED                BIT(6) /* Recently used, don't discard. */

#define DCACHE_DONTCACHE                BIT(7) /* Purge from memory on final dput() */

#define DCACHE_CANT_MOUNT                BIT(8) /* set by dont_mount(), tested by cant_mount() */
#define DCACHE_GENOCIDE                        BIT(9)
#define DCACHE_SHRINK_LIST                BIT(10)

#define DCACHE_OP_WEAK_REVALIDATE        BIT(11)

#define DCACHE_NFSFS_RENAMED                BIT(12)
     /* this dentry has been "silly renamed" and has to be deleted on the last
      * dput() */
#define DCACHE_FSNOTIFY_PARENT_WATCHED        BIT(14)
     /* Parent inode is watched by some fsnotify listener */

#define DCACHE_DENTRY_KILLED                BIT(15)

#define DCACHE_MOUNTED                        BIT(16) /* is a mountpoint */
#define DCACHE_NEED_AUTOMOUNT                BIT(17) /* handle automount on this dir */
#define DCACHE_MANAGE_TRANSIT                BIT(18) /* manage transit from this dirent */
/* Union of the three bits above; tested by d_managed(). */
#define DCACHE_MANAGED_DENTRY \
        (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)

#define DCACHE_LRU_LIST                        BIT(19)

#define DCACHE_ENTRY_TYPE                (7 << 20) /* bits 20..22 are for storing type: */
#define DCACHE_MISS_TYPE                (0 << 20) /* Negative dentry */
#define DCACHE_WHITEOUT_TYPE                (1 << 20) /* Whiteout dentry (stop pathwalk) */
#define DCACHE_DIRECTORY_TYPE                (2 << 20) /* Normal directory */
#define DCACHE_AUTODIR_TYPE                (3 << 20) /* Lookupless directory (presumed automount) */
#define DCACHE_REGULAR_TYPE                (4 << 20) /* Regular file type */
#define DCACHE_SPECIAL_TYPE                (5 << 20) /* Other file type */
#define DCACHE_SYMLINK_TYPE                (6 << 20) /* Symlink */

#define DCACHE_NOKEY_NAME                BIT(25) /* Encrypted name encoded without key */
#define DCACHE_OP_REAL                        BIT(26) /* d_real() calls ->d_op->d_real() when set */

#define DCACHE_PAR_LOOKUP                BIT(28) /* being looked up (with parent locked shared) */
#define DCACHE_DENTRY_CURSOR                BIT(29)
#define DCACHE_NORCU                        BIT(30) /* No RCU delay for freeing */

extern seqlock_t rename_lock;

/*
 * These are the low-level FS interfaces to the dcache..
 */
extern void d_instantiate(struct dentry *, struct inode *);
extern void d_instantiate_new(struct dentry *, struct inode *);
extern void __d_drop(struct dentry *dentry);
extern void d_drop(struct dentry *dentry);
extern void d_delete(struct dentry *);
extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op);

/* allocate/de-allocate */
extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
extern struct dentry * d_alloc_anon(struct super_block *);
extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
                                        wait_queue_head_t *);
extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
                        const struct qstr *name);
extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
extern struct dentry *d_find_any_alias(struct inode *inode);
extern struct dentry * d_obtain_alias(struct inode *);
extern struct dentry * d_obtain_root(struct inode *);
extern void shrink_dcache_sb(struct super_block *);
extern void shrink_dcache_parent(struct dentry *);
extern void d_invalidate(struct dentry *);

/* only used at mount-time */
extern struct dentry * d_make_root(struct inode *);

extern void d_mark_tmpfile(struct file *, struct inode *);
extern void d_tmpfile(struct file *, struct inode *);

extern struct dentry *d_find_alias(struct inode *);
extern void d_prune_aliases(struct inode *);

extern struct dentry *d_find_alias_rcu(struct inode *);

/* test whether we have any submounts in a subdir tree */
extern int path_has_submounts(const struct path *);

/*
 * This adds the entry to the hash queues.
 */
extern void d_rehash(struct dentry *);
 
extern void d_add(struct dentry *, struct inode *);

/* used for rename() and baskets */
extern void d_move(struct dentry *, struct dentry *);
extern void d_exchange(struct dentry *, struct dentry *);
extern struct dentry *d_ancestor(struct dentry *, struct dentry *);

extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);

/* Current value of the reference count kept in @dentry's d_lockref. */
static inline unsigned d_count(const struct dentry *dentry)
{
        unsigned refs = dentry->d_lockref.count;

        return refs;
}

/*
 * helper function for dentry_operations.d_dname() members
 */
extern __printf(3, 4)
char *dynamic_dname(char *, int, const char *, ...);

extern char *__d_path(const struct path *, const struct path *, char *, int);
extern char *d_absolute_path(const struct path *, char *, int);
extern char *d_path(const struct path *, char *, int);
extern char *dentry_path_raw(const struct dentry *, char *, int);
extern char *dentry_path(const struct dentry *, char *, int);

/* Allocation counts.. */

/**
 * dget_dlock - get a reference to a dentry
 * @dentry: dentry to get a reference to
 *
 * Bump the reference count of a live dentry and hand the dentry back.
 * The caller must hold @dentry->d_lock and is responsible for making sure
 * the dentry is alive.  Many conditions are sufficient to guarantee that;
 * e.g. anything with non-negative refcount is alive, and so is anything
 * hashed, anything positive, anyone's parent, etc.
 */
static inline struct dentry *dget_dlock(struct dentry *dentry)
{
        dentry->d_lockref.count += 1;

        return dentry;
}


/**
 * dget - get a reference to a dentry
 * @dentry: dentry to get a reference to, may be %NULL
 *
 * Take an additional reference on @dentry (if it is not %NULL) and return
 * it.  A dentry is not destroyed while it has references; one without any
 * can disappear for any number of reasons, starting with memory pressure.
 * This primitive is therefore meant for cloning an existing reference, and
 * using it on something with zero refcount is a bug.
 *
 * NOTE: it will spin if @dentry->d_lock is held.  From the deadlock
 * avoidance point of view it is equivalent to spin_lock()/increment
 * refcount/spin_unlock(), so calling it under @dentry->d_lock is always a
 * bug, and so is calling it under ->d_lock on any of its descendants.
 */
static inline struct dentry *dget(struct dentry *dentry)
{
        if (!dentry)
                return NULL;

        lockref_get(&dentry->d_lockref);

        return dentry;
}

extern struct dentry *dget_parent(struct dentry *dentry);

/**
 * d_unhashed - is dentry unhashed
 * @dentry: entry to check
 *
 * Returns nonzero if the dentry passed is not currently hashed, as reported
 * by hlist_bl_unhashed() on its d_hash node.
 */
static inline int d_unhashed(const struct dentry *dentry)
{
        int unhashed = hlist_bl_unhashed(&dentry->d_hash);

        return unhashed;
}

/* Returns 1 when @dentry is unhashed and is not its own parent, else 0. */
static inline int d_unlinked(const struct dentry *dentry)
{
        if (!d_unhashed(dentry))
                return 0;

        return !IS_ROOT(dentry);
}

/*
 * Returns the DCACHE_CANT_MOUNT bit of @dentry's flags: nonzero when the
 * bit is set, zero otherwise.
 */
static inline int cant_mount(const struct dentry *dentry)
{
        unsigned int flags = dentry->d_flags;

        return flags & DCACHE_CANT_MOUNT;
}

/* Set DCACHE_CANT_MOUNT on @dentry, holding d_lock around the update. */
static inline void dont_mount(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        dentry->d_flags = dentry->d_flags | DCACHE_CANT_MOUNT;
        spin_unlock(&dentry->d_lock);
}

extern void __d_lookup_unhash_wake(struct dentry *dentry);

/*
 * Returns the DCACHE_PAR_LOOKUP bit of @dentry's flags: nonzero while the
 * bit is set, zero otherwise.
 */
static inline int d_in_lookup(const struct dentry *dentry)
{
        unsigned int flags = dentry->d_flags;

        return flags & DCACHE_PAR_LOOKUP;
}

/*
 * If @dentry is still marked as being looked up, call
 * __d_lookup_unhash_wake() on it; otherwise do nothing.
 */
static inline void d_lookup_done(struct dentry *dentry)
{
        /* Expected case: the dentry is not in the middle of a lookup. */
        if (!unlikely(d_in_lookup(dentry)))
                return;

        __d_lookup_unhash_wake(dentry);
}

extern void dput(struct dentry *);

/* True when any of the DCACHE_MANAGED_DENTRY bits is set on @dentry. */
static inline bool d_managed(const struct dentry *dentry)
{
        return (dentry->d_flags & DCACHE_MANAGED_DENTRY) != 0;
}

/* True when DCACHE_MOUNTED is set on @dentry. */
static inline bool d_mountpoint(const struct dentry *dentry)
{
        return (dentry->d_flags & DCACHE_MOUNTED) != 0;
}

/*
 * Directory cache entry type accessor functions.
 */
/* Extract the entry-type field (DCACHE_ENTRY_TYPE bits) from @dentry's flags. */
static inline unsigned __d_entry_type(const struct dentry *dentry)
{
        unsigned type = dentry->d_flags & DCACHE_ENTRY_TYPE;

        return type;
}

/* True when @dentry's entry type is DCACHE_MISS_TYPE. */
static inline bool d_is_miss(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_MISS_TYPE;
}

/* True when @dentry's entry type is DCACHE_WHITEOUT_TYPE. */
static inline bool d_is_whiteout(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_WHITEOUT_TYPE;
}

/* True when @dentry's entry type is DCACHE_DIRECTORY_TYPE. */
static inline bool d_can_lookup(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_DIRECTORY_TYPE;
}

/* True when @dentry's entry type is DCACHE_AUTODIR_TYPE. */
static inline bool d_is_autodir(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_AUTODIR_TYPE;
}

/* True for either directory flavour: d_can_lookup() or d_is_autodir(). */
static inline bool d_is_dir(const struct dentry *dentry)
{
        if (d_can_lookup(dentry))
                return true;

        return d_is_autodir(dentry);
}

/* True when @dentry's entry type is DCACHE_SYMLINK_TYPE. */
static inline bool d_is_symlink(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_SYMLINK_TYPE;
}

/* True when @dentry's entry type is DCACHE_REGULAR_TYPE. */
static inline bool d_is_reg(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_REGULAR_TYPE;
}

/* True when @dentry's entry type is DCACHE_SPECIAL_TYPE. */
static inline bool d_is_special(const struct dentry *dentry)
{
        unsigned type = __d_entry_type(dentry);

        return type == DCACHE_SPECIAL_TYPE;
}

/* True for a regular or a special file: d_is_reg() or d_is_special(). */
static inline bool d_is_file(const struct dentry *dentry)
{
        if (d_is_reg(dentry))
                return true;

        return d_is_special(dentry);
}

/* A dentry counts as negative here exactly when d_is_miss() says so. */
static inline bool d_is_negative(const struct dentry *dentry)
{
        // TODO: check d_is_whiteout(dentry) also.
        bool miss = d_is_miss(dentry);

        return miss;
}

/* Same test as d_is_miss(), applied to an already-fetched d_flags value. */
static inline bool d_flags_negative(unsigned flags)
{
        unsigned type = flags & DCACHE_ENTRY_TYPE;

        return type == DCACHE_MISS_TYPE;
}

/* Logical complement of d_is_negative(). */
static inline bool d_is_positive(const struct dentry *dentry)
{
        if (d_is_negative(dentry))
                return false;

        return true;
}

/**
 * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs)
 * @dentry: The dentry in question
 *
 * Returns true if ->d_inode is NULL, i.e. the dentry represents either an
 * absent name or a name that doesn't map to an inode.  Such a dentry could
 * be a true miss, a whiteout that isn't represented by a 0,0 chardev, or a
 * fallthrough marker in an opaque directory.
 *
 * Note!  (1) This should be used *only* by a filesystem to examine its own
 * dentries.  It should not be used to look at some other filesystem's
 * dentries.  (2) It should also be used in combination with d_inode() to get
 * the inode.  (3) The dentry may have something attached to ->d_lower and the
 * type field of the flags may be set to something other than miss or whiteout.
 */
static inline bool d_really_is_negative(const struct dentry *dentry)
{
        return !dentry->d_inode;
}

/**
 * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs)
 * @dentry: The dentry in question
 *
 * Returns true if ->d_inode is not NULL, i.e. the dentry represents a name
 * that maps to an inode.  The dentry might still represent a whiteout if
 * that is represented on medium as a 0,0 chardev.
 *
 * Note!  (1) This should be used *only* by a filesystem to examine its own
 * dentries.  It should not be used to look at some other filesystem's
 * dentries.  (2) It should also be used in combination with d_inode() to get
 * the inode.
 */
static inline bool d_really_is_positive(const struct dentry *dentry)
{
        return !!dentry->d_inode;
}

/* Returns 1 when @dentry has an inode attached and is hashed, else 0. */
static inline int simple_positive(const struct dentry *dentry)
{
        if (!d_really_is_positive(dentry))
                return 0;

        return !d_unhashed(dentry);
}

extern int sysctl_vfs_cache_pressure;

/*
 * Scale @val by sysctl_vfs_cache_pressure percent, i.e.
 * val * sysctl_vfs_cache_pressure / 100 as computed by mult_frac().
 */
static inline unsigned long vfs_pressure_ratio(unsigned long val)
{
        unsigned long scaled = mult_frac(val, sysctl_vfs_cache_pressure, 100);

        return scaled;
}

/**
 * d_inode - Get the actual inode of this dentry
 * @dentry: The dentry to query
 *
 * Returns @dentry->d_inode.  This is the helper normal filesystems should
 * use to get at their own inodes in their own dentries and ignore the
 * layering superimposed upon them.
 */
static inline struct inode *d_inode(const struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;

        return inode;
}

/**
 * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE()
 * @dentry: The dentry to query
 *
 * Same purpose as d_inode(), but the ->d_inode pointer is loaded through
 * READ_ONCE().  This is the helper normal filesystems should use to get at
 * their own inodes in their own dentries and ignore the layering
 * superimposed upon them.
 */
static inline struct inode *d_inode_rcu(const struct dentry *dentry)
{
        struct inode *inode = READ_ONCE(dentry->d_inode);

        return inode;
}

/**
 * d_backing_inode - Get upper or lower inode we should be using
 * @upper: The upper layer
 *
 * This is the helper that should be used to get at the inode that will be used
 * if this dentry were to be opened as a file.  The inode may be on the upper
 * dentry or it may be on a lower dentry pinned by the upper.  As implemented
 * here it simply returns @upper->d_inode.
 *
 * Normal filesystems should not use this to access their own inodes.
 */
static inline struct inode *d_backing_inode(const struct dentry *upper)
{
        return upper->d_inode;
}

/**
 * d_real - Return the real dentry
 * @dentry: the dentry to query
 * @type: the type of real dentry (data or metadata)
 *
 * If dentry is on a union/overlay, then return the underlying, real dentry.
 * Otherwise return the dentry itself.  The decision is made on the
 * DCACHE_OP_REAL flag: when it is set, the answer comes from the dentry's
 * ->d_op->d_real() callback.
 *
 * See also: Documentation/filesystems/vfs.rst
 */
static inline struct dentry *d_real(struct dentry *dentry, enum d_real_type type)
{
        /* Expected case: no ->d_real() callback to consult. */
        if (!unlikely(dentry->d_flags & DCACHE_OP_REAL))
                return dentry;

        return dentry->d_op->d_real(dentry, type);
}

/**
 * d_real_inode - Return the real inode hosting the data
 * @dentry: The dentry to query
 *
 * If dentry is on a union/overlay, then return the underlying, real inode.
 * Otherwise return d_inode().  Implemented as d_inode() of the dentry
 * returned by d_real() with D_REAL_DATA.
 */
static inline struct inode *d_real_inode(const struct dentry *dentry)
{
        /* d_real() takes a non-const dentry, hence the cast. */
        struct dentry *real = d_real((struct dentry *) dentry, D_REAL_DATA);

        return d_inode(real);
}

/*
 * A struct qstr together with an inline buffer of the same size as a
 * dentry's d_iname[].
 * NOTE(review): presumably filled by take_dentry_name_snapshot() with a
 * stable copy of a dentry's name, short names landing in inline_name --
 * confirm against the implementation.
 */
struct name_snapshot {
        struct qstr name;
        unsigned char inline_name[DNAME_INLINE_LEN];
};
void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
void release_dentry_name_snapshot(struct name_snapshot *);

/*
 * First entry of @dentry's d_children list, converted from its d_sib link
 * to the containing dentry by hlist_entry_safe().
 */
static inline struct dentry *d_first_child(const struct dentry *dentry)
{
        struct dentry *child;

        child = hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib);

        return child;
}

/*
 * Entry following @dentry on its d_sib list, converted from the d_sib link
 * to the containing dentry by hlist_entry_safe().
 */
static inline struct dentry *d_next_sibling(const struct dentry *dentry)
{
        struct dentry *sibling;

        sibling = hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib);

        return sibling;
}

#endif        /* __LINUX_DCACHE_H */



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   21 
   21 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   28 






   26 








































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

/*
 * Static keys used by the cpuset code; all three are defined in the
 * initially-false state.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/*
 * There could be abnormal cpuset configurations for cpu or memory
 * node binding; this key provides a quick, low-cost check for that
 * situation.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);

/* See "Frequency meter" comments, below. */

/*
 * State of one frequency meter.  struct cpuset embeds one of these as
 * its memory_pressure filter.  All fields other than the lock itself
 * are guarded by @lock.
 */
struct fmeter {
        int cnt;                /* count of events not yet processed */
        int val;                /* most recently computed output value */
        time64_t time;                /* clock (in seconds) when val was computed */
        spinlock_t lock;        /* guards reads and writes of the fields above */
};

/*
 * Invalid partition error code
 *
 * Each code other than PERR_NONE has a matching human-readable message
 * in perr_strings[], indexed by the code's value.
 */
enum prs_errcode {
        PERR_NONE = 0,          /* no message in perr_strings[] */
        PERR_INVCPUS,           /* invalid cpu list in cpuset.cpus.exclusive */
        PERR_INVPARENT,         /* parent is an invalid partition root */
        PERR_NOTPART,           /* parent is not a partition root */
        PERR_NOTEXCL,           /* cpu list in cpuset.cpus not exclusive */
        PERR_NOCPUS,            /* parent unable to distribute cpu downstream */
        PERR_HOTPLUG,           /* no cpu available due to hotplug */
        PERR_CPUSEMPTY,         /* cpuset.cpus is empty */
        PERR_HKEEPING,          /* partition config conflicts with housekeeping */
};

/*
 * Human-readable message for each enum prs_errcode value, indexed by the
 * error code itself.  PERR_NONE (0) has no initializer, so
 * perr_strings[PERR_NONE] is NULL.
 */
static const char * const perr_strings[] = {
        [PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus.exclusive",
        [PERR_INVPARENT] = "Parent is an invalid partition root",
        [PERR_NOTPART]   = "Parent is not a partition root",
        [PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
        [PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
        [PERR_HOTPLUG]   = "No cpu available due to hotplug",
        [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
        [PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
};

/*
 * Per-cgroup cpuset state: the configured and effective CPU / memory node
 * masks, partition bookkeeping, and SCHED_DEADLINE accounting for one
 * cpuset.  The cgroup_subsys_state is embedded so that css_cs() can map a
 * css pointer back to its containing cpuset.
 */
struct cpuset {
        /* embedded cgroup subsystem state; see css_cs() */
        struct cgroup_subsys_state css;

        /* CS_* flag bits; "unsigned long" so bitops work */
        unsigned long flags;                /* "unsigned long" so bitops work */

        /*
         * On default hierarchy:
         *
         * The user-configured masks can only be changed by writing to
         * cpuset.cpus and cpuset.mems, and won't be limited by the
         * parent masks.
         *
         * The effective masks are the real masks that apply to the tasks
         * in the cpuset. They may be changed if the configured masks are
         * changed or hotplug happens.
         *
         * effective_mask == configured_mask & parent's effective_mask,
         * and if it ends up empty, it will inherit the parent's mask.
         *
         *
         * On legacy hierarchy:
         *
         * The user-configured masks are always the same as the effective
         * masks.
         */

        /* user-configured CPUs and Memory Nodes allowed to tasks */
        cpumask_var_t cpus_allowed;
        nodemask_t mems_allowed;

        /* effective CPUs and Memory Nodes allowed to tasks */
        cpumask_var_t effective_cpus;
        nodemask_t effective_mems;

        /*
         * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
         *
         * These exclusive CPUs must be a subset of cpus_allowed. A parent
         * cgroup can only grant exclusive CPUs to one of its children.
         *
         * When the cgroup becomes a valid partition root, effective_xcpus
         * defaults to cpus_allowed if not set. The effective_cpus of a valid
         * partition root comes solely from its effective_xcpus and some of the
         * effective_xcpus may be distributed to sub-partitions below & hence
         * excluded from its effective_cpus.
         */
        cpumask_var_t effective_xcpus;

        /*
         * Exclusive CPUs as requested by the user (default hierarchy only)
         */
        cpumask_var_t exclusive_cpus;

        /*
         * These are the old Memory Nodes that tasks took on.
         *
         * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
         * - A new cpuset's old_mems_allowed is initialized when some
         *   task is moved into it.
         * - old_mems_allowed is used in cpuset_migrate_mm() when we change
         *   cpuset.mems_allowed and have tasks' nodemask updated, and
         *   then old_mems_allowed is updated to mems_allowed.
         */
        nodemask_t old_mems_allowed;

        struct fmeter fmeter;                /* memory_pressure filter */

        /*
         * Tasks are being attached to this cpuset.  Used to prevent
         * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
         */
        int attach_in_progress;

        /* partition number for rebuild_sched_domains() */
        int pn;

        /* for custom sched domain */
        int relax_domain_level;

        /* number of valid sub-partitions */
        int nr_subparts;

        /* partition root state (one of the PRS_* values) */
        int partition_root_state;

        /*
         * Default hierarchy only:
         * use_parent_ecpus - set if using parent's effective_cpus
         * child_ecpus_count - # of children with use_parent_ecpus set
         */
        int use_parent_ecpus;
        int child_ecpus_count;

        /*
         * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
         * know when to rebuild associated root domain bandwidth information.
         */
        int nr_deadline_tasks;
        int nr_migrate_dl_tasks;
        u64 sum_migrate_dl_bw;

        /* Invalid partition error code, not lock protected */
        enum prs_errcode prs_err;

        /* Handle for cpuset.cpus.partition */
        struct cgroup_file partition_file;

        /* Remote partition sibling list anchored at remote_children */
        struct list_head remote_sibling;
};

/*
 * Legacy hierarchy call to cgroup_transfer_tasks() is handled
 * asynchronously.  This bundles the work item with the cpuset it
 * refers to.
 */
struct cpuset_remove_tasks_struct {
        struct work_struct work;        /* work item for the deferred call */
        struct cpuset *cs;              /* cpuset the work applies to */
};

/*
 * Exclusive CPUs distributed out to sub-partitions of top_cpuset
 */
static cpumask_var_t        subpartitions_cpus;

/*
 * Exclusive CPUs in isolated partitions
 */
static cpumask_var_t        isolated_cpus;

/*
 * List of remote partition root children.  Entries are linked through
 * the remote_sibling member of struct cpuset.
 */
static struct list_head remote_children;

/*
 * Partition root states:
 *
 *   0 - member (not a partition root)
 *   1 - partition root
 *   2 - partition root without load balancing (isolated)
 *  -1 - invalid partition root
 *  -2 - invalid isolated partition root
 *
 * Each invalid state is the arithmetic negation of its valid counterpart;
 * make_partition_invalid() relies on that.  Negative values are wrapped in
 * parentheses so the macros expand safely inside any expression.
 */
#define PRS_MEMBER                0
#define PRS_ROOT                1
#define PRS_ISOLATED                2
#define PRS_INVALID_ROOT        (-1)
#define PRS_INVALID_ISOLATED        (-2)

/* Every negative partition root state denotes an invalid partition root. */
static inline bool is_prs_invalid(int prs_state)
{
        return (prs_state >= 0) ? false : true;
}

/*
 * Temporary cpumasks for working with partitions that are passed among
 * functions to avoid memory allocation in inner functions.
 *
 * alloc_cpumasks(NULL, tmp) allocates all three masks and
 * free_cpumasks(NULL, tmp) releases them.
 */
struct tmpmasks {
        cpumask_var_t addmask, delmask;        /* For partition root */
        cpumask_var_t new_cpus;                /* For update_cpumasks_hier() */
};

/* Convert a css pointer into the cpuset embedding it; NULL maps to NULL. */
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct cpuset, css);
}

/* Look up the cpuset a task belongs to through its cpuset css */
static inline struct cpuset *task_cs(struct task_struct *task)
{
        struct cgroup_subsys_state *task_cpuset_css;

        task_cpuset_css = task_css(task, cpuset_cgrp_id);
        return css_cs(task_cpuset_css);
}

/* Return the parent cpuset of @cs, or NULL when @cs has no parent css. */
static inline struct cpuset *parent_cs(struct cpuset *cs)
{
        struct cgroup_subsys_state *parent_css = cs->css.parent;

        return css_cs(parent_css);
}

void inc_dl_tasks_cs(struct task_struct *p)
{
        struct cpuset *cs = task_cs(p);

        cs->nr_deadline_tasks++;
}

void dec_dl_tasks_cs(struct task_struct *p)
{
        struct cpuset *cs = task_cs(p);

        cs->nr_deadline_tasks--;
}

/*
 * bits in struct cpuset flags field
 *
 * Each enumerator is a bit number passed to test_bit() (see the is_*()
 * helpers below) or to BIT() (see the top_cpuset initializer).
 */
typedef enum {
        CS_ONLINE,                /* tested by is_cpuset_online() */
        CS_CPU_EXCLUSIVE,        /* tested by is_cpu_exclusive() */
        CS_MEM_EXCLUSIVE,        /* tested by is_mem_exclusive() */
        CS_MEM_HARDWALL,        /* tested by is_mem_hardwall() */
        CS_MEMORY_MIGRATE,        /* tested by is_memory_migrate() */
        CS_SCHED_LOAD_BALANCE,        /* tested by is_sched_load_balance() */
        CS_SPREAD_PAGE,                /* tested by is_spread_page() */
        CS_SPREAD_SLAB,                /* tested by is_spread_slab() */
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
        return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
        const int cpu_excl = test_bit(CS_CPU_EXCLUSIVE, &cs->flags);

        return cpu_excl;
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
        const int mem_excl = test_bit(CS_MEM_EXCLUSIVE, &cs->flags);

        return mem_excl;
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
        const int hardwall = test_bit(CS_MEM_HARDWALL, &cs->flags);

        return hardwall;
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
        const int balanced = test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);

        return balanced;
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
        const int migrate = test_bit(CS_MEMORY_MIGRATE, &cs->flags);

        return migrate;
}

static inline int is_spread_page(const struct cpuset *cs)
{
        const int spread = test_bit(CS_SPREAD_PAGE, &cs->flags);

        return spread;
}

static inline int is_spread_slab(const struct cpuset *cs)
{
        const int spread = test_bit(CS_SPREAD_SLAB, &cs->flags);

        return spread;
}

/* A strictly positive partition_root_state means a valid partition root. */
static inline int is_partition_valid(const struct cpuset *cs)
{
        const int prs = cs->partition_root_state;

        return prs > 0;
}

/* A negative partition_root_state means an invalid partition root. */
static inline int is_partition_invalid(const struct cpuset *cs)
{
        const int prs = cs->partition_root_state;

        return prs < 0;
}

/*
 * Flip a valid partition root state to its invalid (negated) counterpart.
 * Member and already-invalid states are left untouched.
 *
 * Callers should hold callback_lock to modify partition_root_state.
 */
static inline void make_partition_invalid(struct cpuset *cs)
{
        const int prs = cs->partition_root_state;

        if (prs <= 0)
                return;

        cs->partition_root_state = -prs;
}

/*
 * Send a notification event on the cpuset.cpus.partition file whenever
 * partition_root_state differs from @old_prs.  prs_err is cleared only when
 * the new state is a valid partition root.
 */
static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{
        if (cs->partition_root_state != old_prs) {
                cgroup_file_notify(&cs->partition_file);

                /* A valid partition carries no error code */
                if (is_partition_valid(cs))
                        WRITE_ONCE(cs->prs_err, PERR_NONE);
        }
}

/*
 * The statically allocated root cpuset.  It starts out online, CPU- and
 * memory-exclusive, load balanced, and as a valid partition root, with an
 * empty (self-linked) remote_sibling list head.
 */
static struct cpuset top_cpuset = {
        .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
                 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
        .partition_root_state = PRS_ROOT,
        .relax_domain_level = -1,
        .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 *
 * The macro expands to css_for_each_child() followed by an unbraced "if"
 * that skips children for which is_cpuset_online() is false, so do not use
 * it where a trailing "else" could bind to that hidden "if".
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)                \
        css_for_each_child((pos_css), &(parent_cs)->css)                \
                if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 * iteration and the first node to be visited.
 *
 * The macro expands to css_for_each_descendant_pre() followed by an unbraced
 * "if" that skips cpusets for which is_cpuset_online() is false, so do not
 * use it where a trailing "else" could bind to that hidden "if".
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)        \
        css_for_each_descendant_pre((pos_css), &(root_cs)->css)                \
                if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. We also require taking task_lock() when dereferencing a
 * task's cpuset pointer. See "The task_lock() exception", at the end of this
 * comment.  The cpuset code uses only cpuset_mutex. Other kernel subsystems
 * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
 * structures. Note that cpuset_mutex needs to be a mutex as it is used in
 * paths that rely on priority inheritance (e.g. scheduler - on RT) for
 * correctness.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, it blocks others, ensuring that it is the only task able to
 * also acquire callback_lock and be able to modify cpusets.  It can perform
 * various checks on the cpuset structure first, knowing nothing will change.
 * It can also allocate memory while just holding cpuset_mutex.  While it is
 * performing these checks, various callback routines can briefly acquire
 * callback_lock to query cpusets.  Once it is ready to make the changes, it
 * takes callback_lock, blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by another task, so we use alloc_lock in the task_struct to protect
 * them.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

/* The cpuset_mutex described in the locking rules above */
static DEFINE_MUTEX(cpuset_mutex);

/*
 * cpuset_lock - acquire cpuset_mutex
 *
 * Non-static wrapper around mutex_lock(&cpuset_mutex); pair every call with
 * cpuset_unlock().
 */
void cpuset_lock(void)
{
        mutex_lock(&cpuset_mutex);
}

/*
 * cpuset_unlock - release cpuset_mutex
 *
 * Non-static wrapper around mutex_unlock(&cpuset_mutex); undoes a prior
 * cpuset_lock().
 */
void cpuset_unlock(void)
{
        mutex_unlock(&cpuset_mutex);
}

/* The callback_lock spinlock described in the locking rules above */
static DEFINE_SPINLOCK(callback_lock);

/*
 * NOTE(review): presumably the workqueue on which mm migration work is
 * queued; its users are outside this part of the file - confirm.
 */
static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * NOTE(review): wait queue presumably tied to attach_in_progress; its
 * waiters and wakers are outside this part of the file - confirm.
 */
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

/*
 * Enable cpusets_insane_config_key and log a message when @nodes passes the
 * movable_only_nodes() test.  Does nothing once cpusets_insane_config()
 * already reports true.
 */
static inline void check_insane_mems_config(nodemask_t *nodes)
{
        if (cpusets_insane_config())
                return;

        if (!movable_only_nodes(nodes))
                return;

        static_branch_enable(&cpusets_insane_config_key);
        pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
                "Cpuset allocations might fail even with a lot of memory available.\n",
                nodemask_pr_args(nodes));
}

/*
 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
 * With v2 behavior, "cpus" and "mems" are always what the users have
 * requested and won't be changed by hotplug events. Only the effective
 * cpus or mems will be affected.
 */
static inline bool is_in_v2_mode(void)
{
        return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
              (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}

/**
 * partition_is_populated - check if partition has tasks
 * @cs: partition root to be checked
 * @excluded_child: a child cpuset to be excluded in task checking
 * Return: true if there are tasks, false otherwise
 *
 * It is assumed that @cs is a valid partition root. @excluded_child should
 * be non-NULL when this cpuset is going to become a partition itself.
 *
 * Children that are themselves valid partition roots are not counted.
 */
static inline bool partition_is_populated(struct cpuset *cs,
                                          struct cpuset *excluded_child)
{
        struct cgroup_subsys_state *pos;
        struct cpuset *kid;
        bool populated = false;

        /* Populated css_sets directly on @cs's own cgroup */
        if (cs->css.cgroup->nr_populated_csets)
                return true;

        /* Nothing to exclude: the cgroup-wide answer is good enough */
        if (!excluded_child && !cs->nr_subparts)
                return cgroup_is_populated(cs->css.cgroup);

        rcu_read_lock();
        cpuset_for_each_child(kid, pos, cs) {
                if (kid == excluded_child || is_partition_valid(kid))
                        continue;
                if (cgroup_is_populated(kid->css.cgroup)) {
                        populated = true;
                        break;
                }
        }
        rcu_read_unlock();

        return populated;
}

/*
 * Return in pmask the portion of a task's cpuset's effective CPUs that
 * are online and are capable of running the task.  If none are found,
 * walk up the cpuset hierarchy until we find one that does have some
 * appropriate cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct task_struct *tsk,
                                  struct cpumask *pmask)
{
        const struct cpumask *runnable_on = task_cpu_possible_mask(tsk);
        struct cpuset *cs;

        /* Seed with the online CPUs @tsk can run on; warn if there are none */
        if (WARN_ON(!cpumask_and(pmask, runnable_on, cpu_online_mask)))
                cpumask_copy(pmask, cpu_online_mask);

        rcu_read_lock();

        /* Climb towards the root until some effective CPU overlaps pmask */
        for (cs = task_cs(tsk);
             !cpumask_intersects(cs->effective_cpus, pmask);
             cs = parent_cs(cs))
                ;

        cpumask_and(pmask, pmask, cs->effective_cpus);
        rcu_read_unlock();
}

/*
 * Return in *pmask the portion of a cpuset's effective_mems that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
        for (;;) {
                if (nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
                        break;
                cs = parent_cs(cs);
        }

        nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * Make the task's page/slab spread flags mirror those of the cpuset.
 *
 * Call with callback_lock or cpuset_mutex held.  Nothing is done on the
 * default hierarchy.
 */
static void cpuset_update_task_spread_flags(struct cpuset *cs,
                                        struct task_struct *tsk)
{
        if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
                if (!is_spread_page(cs))
                        task_clear_spread_page(tsk);
                else
                        task_set_spread_page(tsk);

                if (!is_spread_slab(cs))
                        task_clear_spread_slab(tsk);
                else
                        task_set_spread_slab(tsk);
        }
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * p is a subset of q when p's cpus_allowed and mems_allowed are subsets of
 * q's, and each exclusive flag of p is set only if the same flag of q is
 * set.  Returns 1 if so, 0 otherwise.  Call holding cpuset_mutex.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
        if (!cpumask_subset(p->cpus_allowed, q->cpus_allowed))
                return 0;

        if (!nodes_subset(p->mems_allowed, q->mems_allowed))
                return 0;

        if (is_cpu_exclusive(p) > is_cpu_exclusive(q))
                return 0;

        return is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_cpumasks - allocate the cpumasks of a cpuset or of a tmpmasks
 * @cs:  the cpuset that have cpumasks to be allocated.
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * With @cs non-NULL its four masks (cpus_allowed, effective_cpus,
 * effective_xcpus, exclusive_cpus) are allocated; otherwise the three masks
 * of @tmp are.  All masks are zeroed.  On failure nothing stays allocated.
 *
 * Only one of the two input arguments should be non-NULL.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        cpumask_var_t *masks[4];
        int nr_masks, i;

        if (cs) {
                masks[0] = &cs->cpus_allowed;
                masks[1] = &cs->effective_cpus;
                masks[2] = &cs->effective_xcpus;
                masks[3] = &cs->exclusive_cpus;
                nr_masks = 4;
        } else {
                masks[0] = &tmp->new_cpus;
                masks[1] = &tmp->addmask;
                masks[2] = &tmp->delmask;
                nr_masks = 3;
        }

        for (i = 0; i < nr_masks; i++) {
                if (!zalloc_cpumask_var(masks[i], GFP_KERNEL))
                        goto unwind;
        }

        return 0;

unwind:
        /* Release, newest first, whatever was allocated before the failure */
        while (--i >= 0)
                free_cpumask_var(*masks[i]);
        return -ENOMEM;
}

/**
 * free_cpumasks - free cpumasks in a tmpmasks structure
 * @cs:  the cpuset that have cpumasks to be free.
 * @tmp: the tmpmasks structure pointer
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        if (cs) {
                free_cpumask_var(cs->cpus_allowed);
                free_cpumask_var(cs->effective_cpus);
                free_cpumask_var(cs->effective_xcpus);
                free_cpumask_var(cs->exclusive_cpus);
        }
        if (tmp) {
                free_cpumask_var(tmp->new_cpus);
                free_cpumask_var(tmp->addmask);
                free_cpumask_var(tmp->delmask);
        }
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 *
 * Return: a copy of @cs carrying its own freshly allocated copies of the
 * four cpumasks, or NULL if any allocation failed.
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
        struct cpuset *dup = kmemdup(cs, sizeof(*cs), GFP_KERNEL);

        /* The byte copy still refers to @cs's masks; give it its own */
        if (dup && alloc_cpumasks(dup, NULL)) {
                kfree(dup);
                dup = NULL;
        }
        if (!dup)
                return NULL;

        cpumask_copy(dup->cpus_allowed, cs->cpus_allowed);
        cpumask_copy(dup->effective_cpus, cs->effective_cpus);
        cpumask_copy(dup->effective_xcpus, cs->effective_xcpus);
        cpumask_copy(dup->exclusive_cpus, cs->exclusive_cpus);

        return dup;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 *
 * Releases the four cpumasks of @cs through free_cpumasks() and then the
 * structure itself with kfree().
 */
static inline void free_cpuset(struct cpuset *cs)
{
        free_cpumasks(cs, NULL);
        kfree(cs);
}

/*
 * Pick the mask used for exclusivity checks: the first non-empty one of
 * exclusive_cpus and effective_xcpus, falling back to cpus_allowed.
 */
static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
{
        if (!cpumask_empty(cs->exclusive_cpus))
                return cs->exclusive_cpus;

        if (!cpumask_empty(cs->effective_xcpus))
                return cs->effective_xcpus;

        return cs->cpus_allowed;
}

/*
 * cpusets_are_exclusive() - check if two cpusets are exclusive
 *
 * Two cpusets are exclusive when the masks picked by fetch_xcpus() for
 * each of them do not intersect.
 *
 * Return true if exclusive, false if not
 */
static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
{
        return !cpumask_intersects(fetch_xcpus(cs1), fetch_xcpus(cs2));
}

/*
 * validate_change_legacy() - Validate conditions specific to legacy (v1)
 *                            behavior.
 */
static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
{
        struct cgroup_subsys_state *css;
        struct cpuset *c, *par;
        int ret;

        WARN_ON_ONCE(!rcu_read_lock_held());

        /* Each of our child cpusets must be a subset of us */
        ret = -EBUSY;
        cpuset_for_each_child(c, css, cur)
                if (!is_cpuset_subset(c, trial))
                        goto out;

        /* On legacy hierarchy, we must be a subset of our parent cpuset. */
        ret = -EACCES;
        par = parent_cs(cur);
        if (par && !is_cpuset_subset(trial, par))
                goto out;

        ret = 0;
out:
        return ret;
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *                       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
        struct cgroup_subsys_state *pos;
        struct cpuset *sibling, *parent;
        int ret = 0;

        rcu_read_lock();

        /* Legacy-only structural rules */
        if (!is_in_v2_mode()) {
                ret = validate_change_legacy(cur, trial);
                if (ret)
                        goto out;
        }

        /* Remaining checks don't apply to root cpuset */
        if (cur == &top_cpuset)
                goto out;

        parent = parent_cs(cur);

        /*
         * Cpusets with tasks - existing or newly being attached - can't
         * be changed to have empty cpus_allowed or mems_allowed.
         */
        ret = -ENOSPC;
        if (cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress) {
                if (cpumask_empty(trial->cpus_allowed) &&
                    !cpumask_empty(cur->cpus_allowed))
                        goto out;
                if (nodes_empty(trial->mems_allowed) &&
                    !nodes_empty(cur->mems_allowed))
                        goto out;
        }

        /*
         * We can't shrink if we won't have enough room for SCHED_DEADLINE
         * tasks.
         */
        ret = -EBUSY;
        if (is_cpu_exclusive(cur) &&
            !cpuset_cpumask_can_shrink(cur->cpus_allowed,
                                       trial->cpus_allowed))
                goto out;

        /*
         * If either I or some sibling (!= me) is exclusive, we can't
         * overlap
         */
        ret = -EINVAL;
        cpuset_for_each_child(sibling, pos, parent) {
                if (sibling == cur)
                        continue;

                if ((is_cpu_exclusive(trial) || is_cpu_exclusive(sibling)) &&
                    !cpusets_are_exclusive(trial, sibling))
                        goto out;

                if ((is_mem_exclusive(trial) || is_mem_exclusive(sibling)) &&
                    nodes_intersects(trial->mems_allowed,
                                     sibling->mems_allowed))
                        goto out;
        }

        ret = 0;
out:
        rcu_read_unlock();
        return ret;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Returns nonzero when the effective_cpus masks of @a and @b share at
 * least one CPU.
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
        const int shared = cpumask_intersects(a->effective_cpus,
                                              b->effective_cpus);

        return shared;
}

/* Raise @dattr's relax_domain_level to @c's level when @c's is larger. */
static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
        if (c->relax_domain_level > dattr->relax_domain_level)
                dattr->relax_domain_level = c->relax_domain_level;
}

/*
 * Fold the relax_domain_level of every load-balanced cpuset in the subtree
 * rooted at @root_cs into @dattr.  Subtrees whose root has an empty
 * cpus_allowed are skipped entirely.
 */
static void update_domain_attr_tree(struct sched_domain_attr *dattr,
                                    struct cpuset *root_cs)
{
        struct cgroup_subsys_state *pos_css;
        struct cpuset *des;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(des, pos_css, root_cs) {
                if (cpumask_empty(des->cpus_allowed)) {
                        /* no CPU here: jump past the whole subtree */
                        pos_css = css_rightmost_descendant(pos_css);
                } else if (is_sched_load_balance(des)) {
                        update_domain_attr(dattr, des);
                }
        }
        rcu_read_unlock();
}

/* Must be called with cpuset_mutex held.  */
static inline int nr_cpusets(void)
{
        /* jump label reference count ... */
        int nr = static_key_count(&cpusets_enabled_key.key);

        /* ... plus the top-level cpuset */
        return nr + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the systems CPUs
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to kernel/sched/core.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    cp - cpuset pointer, used (together with pos_css) to perform a
 *           top-down scan of all cpusets. For our purposes, rebuilding
 *           the schedulers sched domains, we can ignore !is_sched_load_
 *           balance cpusets.
 *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *           that need to be load balanced, for convenient iterative
 *           access by the subsequent code that finds the best partition,
 *           i.e the set of domains (subsets) of CPUs such that the
 *           cpus_allowed of every cpuset marked is_sched_load_balance
 *           is a subset of one of these domains, while there are as
 *           many such domains as possible, each as small as possible.
 * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 *           the kernel/sched/core.c routine partition_sched_domains() in a
 *           convenient format, that can be easily compared to the prior
 *           value to determine what partition elements (sched domains)
 *           were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *        The triple nested loops below over i, j, k scan over the
 *        load balanced cpusets (using the array of cpuset pointers in
 *        csa[]) looking for pairs of cpusets that have overlapping
 *        cpus_allowed, but which don't have the same 'pn' partition
 *        number, and gives them the same partition number.  It keeps
 *        looping on the 'restart' label until it can no longer find
 *        any such pairs.
 *
 *        The union of the cpus_allowed masks from the set of
 *        all cpusets having the same 'pn' value then form the one
 *        element of the partition (one sched domain) to be passed to
 *        partition_sched_domains().
 */
static int generate_sched_domains(cpumask_var_t **domains,
                        struct sched_domain_attr **attributes)
{
        struct cpuset *cp;        /* top-down scan of cpusets */
        struct cpuset **csa;        /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
        int i, j, k;                /* indices for partition finding loops */
        cpumask_var_t *doms;        /* resulting partition; i.e. sched domains */
        struct sched_domain_attr *dattr;  /* attributes for custom domains */
        int ndoms = 0;                /* number of sched domains in result */
        int nslot;                /* next empty doms[] struct cpumask slot */
        struct cgroup_subsys_state *pos_css;
        bool root_load_balance = is_sched_load_balance(&top_cpuset);

        doms = NULL;
        dattr = NULL;
        csa = NULL;

        /* Special case for the 99% of systems with one, full, sched domain */
        if (root_load_balance && !top_cpuset.nr_subparts) {
                ndoms = 1;
                doms = alloc_sched_domains(ndoms);
                if (!doms)
                        goto done;

                dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
                if (dattr) {
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
                cpumask_and(doms[0], top_cpuset.effective_cpus,
                            housekeeping_cpumask(HK_TYPE_DOMAIN));

                goto done;
        }

        csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
        if (!csa)
                goto done;
        csn = 0;

        rcu_read_lock();
        if (root_load_balance)
                csa[csn++] = &top_cpuset;
        cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
                if (cp == &top_cpuset)
                        continue;
                /*
                 * Continue traversing beyond @cp iff @cp has some CPUs and
                 * isn't load balancing.  The former is obvious.  The
                 * latter: All child cpusets contain a subset of the
                 * parent's cpus, so just skip them, and then we call
                 * update_domain_attr_tree() to calc relax_domain_level of
                 * the corresponding sched domain.
                 *
                 * If root is load-balancing, we can skip @cp if it
                 * is a subset of the root's effective_cpus.
                 */
                if (!cpumask_empty(cp->cpus_allowed) &&
                    !(is_sched_load_balance(cp) &&
                      cpumask_intersects(cp->cpus_allowed,
                                         housekeeping_cpumask(HK_TYPE_DOMAIN))))
                        continue;

                if (root_load_balance &&
                    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
                        continue;

                if (is_sched_load_balance(cp) &&
                    !cpumask_empty(cp->effective_cpus))
                        csa[csn++] = cp;

                /* skip @cp's subtree if not a partition root */
                if (!is_partition_valid(cp))
                        pos_css = css_rightmost_descendant(pos_css);
        }
        rcu_read_unlock();

        /* Start with every collected cpuset in a partition of its own */
        for (i = 0; i < csn; i++)
                csa[i]->pn = i;
        ndoms = csn;

restart:
        /* Find the best partition (set of sched domains) */
        for (i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
                int apn = a->pn;

                for (j = 0; j < csn; j++) {
                        struct cpuset *b = csa[j];
                        int bpn = b->pn;

                        if (apn != bpn && cpusets_overlap(a, b)) {
                                /* Merge b's partition into a's */
                                for (k = 0; k < csn; k++) {
                                        struct cpuset *c = csa[k];

                                        if (c->pn == bpn)
                                                c->pn = apn;
                                }
                                ndoms--;        /* one less element */
                                goto restart;
                        }
                }
        }

        /*
         * Now we know how many domains to create.
         * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
         */
        doms = alloc_sched_domains(ndoms);
        if (!doms)
                goto done;

        /*
         * The rest of the code, including the scheduler, can deal with
         * dattr==NULL case. No need to abort if alloc fails.
         */
        dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
                              GFP_KERNEL);

        for (nslot = 0, i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
                struct cpumask *dp;
                int apn = a->pn;

                if (apn < 0) {
                        /* Skip completed partitions */
                        continue;
                }

                if (nslot == ndoms) {
                        static int warnings = 10;
                        if (warnings) {
                                pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
                                        nslot, ndoms, csn, i, apn);
                                warnings--;
                        }
                        continue;
                }

                /*
                 * Only index doms[] once nslot is known to be in range;
                 * doms[] holds exactly ndoms entries.
                 */
                dp = doms[nslot];

                cpumask_clear(dp);
                if (dattr)
                        *(dattr + nslot) = SD_ATTR_INIT;
                for (j = i; j < csn; j++) {
                        struct cpuset *b = csa[j];

                        if (apn == b->pn) {
                                cpumask_or(dp, dp, b->effective_cpus);
                                cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
                                if (dattr)
                                        update_domain_attr_tree(dattr + nslot, b);

                                /* Done with this partition */
                                b->pn = -1;
                        }
                }
                nslot++;
        }
        BUG_ON(nslot != ndoms);

done:
        kfree(csa);

        /*
         * Fallback to the default domain if kmalloc() failed.
         * See comments in partition_sched_domains().
         */
        if (doms == NULL)
                ndoms = 1;

        *domains    = doms;
        *attributes = dattr;
        return ndoms;
}

static void dl_update_tasks_root_domain(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;

        if (cs->nr_deadline_tasks == 0)
                return;

        css_task_iter_start(&cs->css, 0, &it);

        while ((task = css_task_iter_next(&it)))
                dl_add_task_root_domain(task);

        css_task_iter_end(&it);
}

/*
 * Recompute root domain DL accounting: clear the default root domain and
 * then run dl_update_tasks_root_domain() on every online cpuset with a
 * non-empty effective_cpus.
 *
 * Caller must hold cpuset_mutex, the CPU hotplug lock and
 * sched_domains_mutex (asserted below).
 */
static void dl_rebuild_rd_accounting(void)
{
        struct cpuset *cs = NULL;
        struct cgroup_subsys_state *pos_css;

        lockdep_assert_held(&cpuset_mutex);
        lockdep_assert_cpus_held();
        lockdep_assert_held(&sched_domains_mutex);

        rcu_read_lock();

        /*
         * Clear default root domain DL accounting, it will be computed again
         * if a task belongs to it.
         */
        dl_clear_root_domain(&def_root_domain);

        cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {

                /* No effective CPUs: skip this cpuset and its whole subtree */
                if (cpumask_empty(cs->effective_cpus)) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

                /*
                 * Take a css reference before leaving the RCU read section,
                 * presumably so @cs stays valid while the task iteration
                 * runs without RCU protection.
                 */
                css_get(&cs->css);

                rcu_read_unlock();

                dl_update_tasks_root_domain(cs);

                /* Re-enter the RCU read section before dropping the reference */
                rcu_read_lock();
                css_put(&cs->css);
        }
        rcu_read_unlock();
}

/*
 * Repartition the scheduler domains and then redo the deadline root
 * domain accounting, both under sched_domains_mutex.
 */
static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
                                    struct sched_domain_attr *dattr_new)
{
        mutex_lock(&sched_domains_mutex);

        partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
        dl_rebuild_rd_accounting();

        mutex_unlock(&sched_domains_mutex);
}

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes cpus_read_lock().
 */
static void rebuild_sched_domains_locked(void)
{
        struct cgroup_subsys_state *pos_css;
        struct sched_domain_attr *attr;
        cpumask_var_t *doms;
        struct cpuset *cs;
        int ndoms;

        lockdep_assert_cpus_held();
        lockdep_assert_held(&cpuset_mutex);

        /*
         * If we have raced with CPU hotplug, return early to avoid
         * passing doms with offlined cpu to partition_sched_domains().
         * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
         *
         * With no CPUs in any subpartitions, top_cpuset's effective CPUs
         * should be the same as the active CPUs, so checking only top_cpuset
         * is enough to detect racing CPU offlines.
         */
        if (cpumask_empty(subpartitions_cpus) &&
            !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
                return;

        /*
         * With subpartition CPUs, however, the effective CPUs of a partition
         * root should be only a subset of the active CPUs.  Since a CPU in any
         * partition root could be offlined, all must be checked.
         */
        if (top_cpuset.nr_subparts) {
                rcu_read_lock();
                cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
                        if (!is_partition_valid(cs)) {
                                pos_css = css_rightmost_descendant(pos_css);
                                continue;
                        }
                        if (!cpumask_subset(cs->effective_cpus,
                                            cpu_active_mask)) {
                                rcu_read_unlock();
                                return;
                        }
                }
                rcu_read_unlock();
        }

        /* Generate domain masks and attrs */
        ndoms = generate_sched_domains(&doms, &attr);

        /* Have scheduler rebuild the domains */
        partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
/* !CONFIG_SMP variant: deliberately an empty stub. */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

/*
 * Run rebuild_sched_domains_locked() with cpuset_mutex taken here.
 * NOTE(review): the name suggests the caller already holds the CPU
 * hotplug lock; this function does not take it itself.
 */
static void rebuild_sched_domains_cpuslocked(void)
{
        mutex_lock(&cpuset_mutex);

        rebuild_sched_domains_locked();

        mutex_unlock(&cpuset_mutex);
}

/*
 * Rebuild the scheduler domains: takes the CPU hotplug read lock around
 * rebuild_sched_domains_cpuslocked(), which in turn takes cpuset_mutex.
 */
void rebuild_sched_domains(void)
{
        cpus_read_lock();

        rebuild_sched_domains_cpuslocked();

        cpus_read_unlock();
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @new_cpus: the temp variable for the new effective_cpus mask
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
 * is used instead of effective_cpus to make sure all offline CPUs are also
 * included as hotplug code won't update cpumasks for tasks in top_cpuset.
 */
static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
        struct css_task_iter it;
        struct task_struct *task;
        bool top_cs = cs == &top_cpuset;

        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it))) {
                const struct cpumask *possible_mask = task_cpu_possible_mask(task);

                if (top_cs) {
                        /*
                         * Percpu kthreads in top_cpuset are ignored
                         */
                        if (kthread_is_per_cpu(task))
                                continue;
                        cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
                } else {
                        cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
                }
                set_cpus_allowed_ptr(task, new_cpus);
        }
        css_task_iter_end(&it);
}

/**
 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 * @new_cpus: output mask receiving the new effective_cpus value
 * @cs: the cpuset whose effective_cpus is being recomputed
 * @parent: the parent cpuset
 *
 * The result is cs->cpus_allowed restricted to parent->effective_cpus.
 * It is valid only if the given cpuset isn't a partition root.
 */
static void compute_effective_cpumask(struct cpumask *new_cpus,
                                      struct cpuset *cs, struct cpuset *parent)
{
        const struct cpumask *allowed = cs->cpus_allowed;
        const struct cpumask *available = parent->effective_cpus;

        cpumask_and(new_cpus, allowed, available);
}

/*
 * Commands for update_parent_effective_cpumask
 */
enum partition_cmd {
        /* Enable partition root */
        partcmd_enable,
        /* Enable isolated partition root */
        partcmd_enablei,
        /* Disable partition root */
        partcmd_disable,
        /* Update parent's effective_cpus */
        partcmd_update,
        /* Make partition invalid */
        partcmd_invalidate,
};

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
                       int turning_on);
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
                                    struct tmpmasks *tmp);

/*
 * Update partition exclusive flag
 *
 * CS_CPU_EXCLUSIVE is made to follow @new_prs: it is wanted when
 * @new_prs > 0 and unwanted otherwise. Nothing is done when the flag
 * already has the wanted value.
 *
 * Return: 0 if successful, PERR_NOTEXCL if the flag could not be turned on
 */
static int update_partition_exclusive(struct cpuset *cs, int new_prs)
{
        const bool want_excl = new_prs > 0;
        const bool have_excl = is_cpu_exclusive(cs);

        if (want_excl == have_excl)
                return 0;

        if (want_excl)
                return update_flag(CS_CPU_EXCLUSIVE, cs, 1) ? PERR_NOTEXCL : 0;

        /* Turning off CS_CPU_EXCLUSIVE will not return error */
        update_flag(CS_CPU_EXCLUSIVE, cs, 0);
        return 0;
}

/*
 * Update partition load balance flag and/or rebuild sched domain
 *
 * The sched domains are rebuilt, via rebuild_sched_domains_locked(), when
 * either the old or the new partition state is a valid partition root, or
 * when the load balance flag of @cs has to change.
 * This function is for cgroup v2 only.
 */
static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
{
        const int new_prs = cs->partition_root_state;
        bool need_rebuild = (old_prs > 0) || (new_prs > 0);
        bool want_lb;

        /*
         * A valid partition root balances load unless it is isolated.
         * Anything else follows the load balance state of its parent.
         */
        if (new_prs > 0)
                want_lb = (new_prs != PRS_ISOLATED);
        else
                want_lb = is_sched_load_balance(parent_cs(cs));

        if (want_lb != !!is_sched_load_balance(cs)) {
                need_rebuild = true;
                if (want_lb)
                        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
                else
                        clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
        }

        if (!need_rebuild)
                return;

        rebuild_sched_domains_locked();
}

/*
 * tasks_nocpu_error - Return true if tasks will have no effective_cpus
 */
static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
                              struct cpumask *xcpus)
{
        /*
         * A populated partition (cs or parent) can't have empty effective_cpus
         */
        return (cpumask_subset(parent->effective_cpus, xcpus) &&
                partition_is_populated(parent, cs)) ||
               (!cpumask_intersects(xcpus, cpu_active_mask) &&
                partition_is_populated(cs, NULL));
}

/*
 * Reset the partition related data of @cs. Only acts when the cpuset
 * controller is on the default hierarchy; callback_lock must be held.
 */
static void reset_partition_data(struct cpuset *cs)
{
        struct cpuset *parent = parent_cs(cs);

        if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
                return;

        lockdep_assert_held(&callback_lock);

        cs->nr_subparts = 0;

        /* Without exclusive_cpus set, drop effective_xcpus and exclusivity */
        if (cpumask_empty(cs->exclusive_cpus)) {
                cpumask_clear(cs->effective_xcpus);
                if (is_cpu_exclusive(cs))
                        clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
        }

        /* effective_cpus = parent's effective_cpus & cpus_allowed ... */
        if (cpumask_and(cs->effective_cpus,
                        parent->effective_cpus, cs->cpus_allowed))
                return;

        /* ... and if that is empty, use the parent's effective_cpus */
        cs->use_parent_ecpus = true;
        parent->child_ecpus_count++;
        cpumask_copy(cs->effective_cpus, parent->effective_cpus);
}

/*
 * partition_xcpus_newstate - Exclusive CPUs state change
 * @old_prs: old partition_root_state
 * @new_prs: new partition_root_state
 * @xcpus: exclusive CPUs with state change
 *
 * @xcpus are added to isolated_cpus when the new state is PRS_ISOLATED
 * and removed from it for any other new state.
 */
static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
{
        WARN_ON_ONCE(old_prs == new_prs);

        if (new_prs != PRS_ISOLATED) {
                cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
                return;
        }

        cpumask_or(isolated_cpus, isolated_cpus, xcpus);
}

/*
 * partition_xcpus_add - Add new exclusive CPUs to partition
 * @new_prs: new partition_root_state
 * @parent: parent cpuset
 * @xcpus: exclusive CPUs to be added
 * Return: true if isolated_cpus modified, false otherwise
 *
 * Remote partition if parent == NULL, in which case top_cpuset acts as
 * the parent. callback_lock must be held.
 */
static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
                                struct cpumask *xcpus)
{
        bool state_differs;

        WARN_ON_ONCE(new_prs < 0);
        lockdep_assert_held(&callback_lock);

        if (!parent)
                parent = &top_cpuset;

        /* CPUs taken from top_cpuset are tracked in subpartitions_cpus */
        if (parent == &top_cpuset)
                cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);

        state_differs = (parent->partition_root_state != new_prs);
        if (state_differs)
                partition_xcpus_newstate(parent->partition_root_state, new_prs,
                                         xcpus);

        /* The CPUs leave the parent's effective_cpus */
        cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
        return state_differs;
}

/*
 * partition_xcpus_del - Remove exclusive CPUs from partition
 * @old_prs: old partition_root_state
 * @parent: parent cpuset
 * @xcpus: exclusive CPUs to be removed
 * Return: true if isolated_cpus modified, false otherwise
 *
 * Remote partition if parent == NULL, in which case top_cpuset acts as
 * the parent. callback_lock must be held. Note that @xcpus is modified:
 * it is reduced to its active CPUs before being returned to the parent.
 */
static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
                                struct cpumask *xcpus)
{
        bool state_differs;

        WARN_ON_ONCE(old_prs < 0);
        lockdep_assert_held(&callback_lock);

        if (!parent)
                parent = &top_cpuset;

        /* CPUs given back to top_cpuset leave subpartitions_cpus */
        if (parent == &top_cpuset)
                cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);

        state_differs = (parent->partition_root_state != old_prs);
        if (state_differs)
                partition_xcpus_newstate(old_prs, parent->partition_root_state,
                                         xcpus);

        /* Only active CPUs go back into the parent's effective_cpus */
        cpumask_and(xcpus, xcpus, cpu_active_mask);
        cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
        return state_differs;
}

/*
 * Pass isolated_cpus to workqueue_unbound_exclude_cpumask() when
 * @isolcpus_updated is set; a negative return value triggers a one-time
 * warning. The CPU hotplug lock must be held.
 */
static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
{
        lockdep_assert_cpus_held();

        if (isolcpus_updated) {
                int ret = workqueue_unbound_exclude_cpumask(isolated_cpus);

                WARN_ON_ONCE(ret < 0);
        }
}

/**
 * cpuset_cpu_is_isolated - Check if the given CPU is isolated
 * @cpu: the CPU number to be checked
 * Return: true if CPU is used in an isolated partition, false otherwise
 */
bool cpuset_cpu_is_isolated(int cpu)
{
        return cpumask_test_cpu(cpu, isolated_cpus);
}
EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);

/*
 * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
 * @cs: cpuset
 * @xcpus: effective exclusive CPUs value to be set; when NULL the result
 *         is written to cs->effective_xcpus instead
 * Return: true if xcpus is not empty, false otherwise.
 *
 * The starting point is exclusive_cpus restricted to cpus_allowed, or
 * cpus_allowed alone when exclusive_cpus is not set. The result is then
 * restricted to the parent's effective_xcpus.
 */
static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
                                                struct cpumask *xcpus)
{
        struct cpuset *parent = parent_cs(cs);

        if (!xcpus)
                xcpus = cs->effective_xcpus;

        if (cpumask_empty(cs->exclusive_cpus))
                cpumask_copy(xcpus, cs->cpus_allowed);
        else
                cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);

        return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
}

/* A cpuset is a remote partition while it is linked on a remote_sibling list */
static inline bool is_remote_partition(struct cpuset *cs)
{
        bool unlinked = list_empty(&cs->remote_sibling);

        return !unlinked;
}

static inline bool is_local_partition(struct cpuset *cs)
{
        return is_partition_valid(cs) && !is_remote_partition(cs);
}

/*
 * remote_partition_enable - Enable current cpuset as a remote partition root
 * @cs: the cpuset to update
 * @new_prs: new partition_root_state
 * @tmp: temparary masks
 * Return: 1 if successful, 0 if error
 *
 * Enable the current cpuset to become a remote partition root taking CPUs
 * directly from the top cpuset. cpuset_mutex must be held by the caller.
 */
static int remote_partition_enable(struct cpuset *cs, int new_prs,
                                   struct tmpmasks *tmp)
{
        bool isolcpus_updated;

        /*
         * The user must have sysadmin privilege.
         */
        if (!capable(CAP_SYS_ADMIN))
                return 0;

        /*
         * The requested exclusive_cpus must not be allocated to other
         * partitions and it can't use up all the root's effective_cpus.
         *
         * Note that if there is any local partition root above it or
         * remote partition root underneath it, its exclusive_cpus must
         * have overlapped with subpartitions_cpus.
         */
        compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
        if (cpumask_empty(tmp->new_cpus) ||
            cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
            cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
                return 0;

        spin_lock_irq(&callback_lock);
        isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
        list_add(&cs->remote_sibling, &remote_children);
        if (cs->use_parent_ecpus) {
                struct cpuset *parent = parent_cs(cs);

                cs->use_parent_ecpus = false;
                parent->child_ecpus_count--;
        }
        spin_unlock_irq(&callback_lock);
        update_unbound_workqueue_cpumask(isolcpus_updated);

        /*
         * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
         */
        update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
        update_sibling_cpumasks(&top_cpuset, NULL, tmp);
        return 1;
}

/*
 * remote_partition_disable - Remove current cpuset from remote partition list
 * @cs: the cpuset to update
 * @tmp: temporary masks
 *
 * The partition is made invalid (its partition_root_state is negated and
 * prs_err is set to PERR_INVCPUS unless an error is already recorded), its
 * exclusive CPUs are handed back and the effective_cpus is also updated.
 *
 * cpuset_mutex must be held by the caller.
 */
static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{
        struct cpumask *xcpus = tmp->new_cpus;
        bool isolcpus_updated;

        compute_effective_exclusive_cpumask(cs, xcpus);
        WARN_ON_ONCE(!is_remote_partition(cs));
        WARN_ON_ONCE(!cpumask_subset(xcpus, subpartitions_cpus));

        spin_lock_irq(&callback_lock);
        list_del_init(&cs->remote_sibling);
        isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
                                               NULL, xcpus);
        cs->partition_root_state = -cs->partition_root_state;
        if (!cs->prs_err)
                cs->prs_err = PERR_INVCPUS;
        reset_partition_data(cs);
        spin_unlock_irq(&callback_lock);
        update_unbound_workqueue_cpumask(isolcpus_updated);

        /*
         * Propagate changes in top_cpuset's effective_cpus down the
         * hierarchy.
         */
        update_tasks_cpumask(&top_cpuset, xcpus);
        update_sibling_cpumasks(&top_cpuset, NULL, tmp);
}

/*
 * remote_cpus_update - cpus_exclusive change of remote partition
 * @cs: the cpuset to be updated
 * @newmask: the new effective_xcpus mask
 * @tmp: temporary masks
 *
 * top_cpuset and subpartitions_cpus will be updated or partition can be
 * invalidated.
 */
static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
                               struct tmpmasks *tmp)
{
        const int prs = cs->partition_root_state;
        int isolcpus_updated = 0;
        bool adding, deleting;

        if (WARN_ON_ONCE(!is_remote_partition(cs)))
                return;

        WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));

        /* An empty new mask invalidates the partition */
        if (cpumask_empty(newmask)) {
                remote_partition_disable(cs, tmp);
                return;
        }

        adding   = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
        deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);

        /*
         * Additions of remote CPUs is only allowed if those CPUs are
         * not allocated to other partitions and there are effective_cpus
         * left in the top cpuset. Otherwise the partition is invalidated.
         */
        if (adding) {
                if (!capable(CAP_SYS_ADMIN) ||
                    cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
                    cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) {
                        remote_partition_disable(cs, tmp);
                        return;
                }
        }

        spin_lock_irq(&callback_lock);
        if (adding)
                isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
        if (deleting)
                isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
        spin_unlock_irq(&callback_lock);
        update_unbound_workqueue_cpumask(isolcpus_updated);

        /*
         * Propagate changes in top_cpuset's effective_cpus down the
         * hierarchy.
         */
        update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
        update_sibling_cpumasks(&top_cpuset, NULL, tmp);
}

/*
 * remote_partition_check - check if a child remote partition needs update
 * @cs: the cpuset to be updated
 * @newmask: the new effective_xcpus mask
 * @delmask: temporary mask for deletion (not in tmp)
 * @tmp: temparary masks
 *
 * This should be called before the given cs has updated its cpus_allowed
 * and/or effective_xcpus.
 */
static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
                                   struct cpumask *delmask, struct tmpmasks *tmp)
{
        struct cpuset *child, *next;
        int disable_cnt = 0;

        /*
         * Compute the effective exclusive CPUs that will be deleted.
         */
        if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
            !cpumask_intersects(delmask, subpartitions_cpus))
                return;        /* No deletion of exclusive CPUs in partitions */

        /*
         * Searching the remote children list to look for those that will
         * be impacted by the deletion of exclusive CPUs.
         *
         * Since a cpuset must be removed from the remote children list
         * before it can go offline and holding cpuset_mutex will prevent
         * any change in cpuset status. RCU read lock isn't needed.
         */
        lockdep_assert_held(&cpuset_mutex);
        list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
                if (cpumask_intersects(child->effective_cpus, delmask)) {
                        remote_partition_disable(child, tmp);
                        disable_cnt++;
                }
        if (disable_cnt)
                rebuild_sched_domains_locked();
}

/*
 * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
 * @prstate: partition root state to be checked
 * @new_cpus: cpu mask
 * Return: true if there is conflict, false otherwise
 *
 * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in
 * an isolated partition, so a conflict exists when @new_cpus is not fully
 * inside that mask and @prstate is anything other than PRS_ISOLATED.
 */
static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
{
        const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN);

        return !cpumask_subset(new_cpus, hk_domain) &&
               (prstate != PRS_ISOLATED);
}

/**
 * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
 * @cs:      The cpuset that requests change in partition root state
 * @cmd:     Partition root state change command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp:     Temporary addmask and delmask
 * Return:   0 or a partition root state error code
 *
 * For partcmd_enable*, the cpuset is being transformed from a non-partition
 * root to a partition root. The effective_xcpus (cpus_allowed if
 * effective_xcpus not set) mask of the given cpuset will be taken away from
 * parent's effective_cpus. The function will return 0 if all the CPUs listed
 * in effective_xcpus can be granted or an error code will be returned.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root. Any CPUs in effective_xcpus will be
 * given back to parent's effective_cpus. 0 will always be returned.
 *
 * For partcmd_update, if the optional newmask is specified, the cpu list is
 * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
 * assumed to remain the same. The cpuset should either be a valid or invalid
 * partition root. The partition root state may change from valid to invalid
 * or vice versa. An error code will be returned if transitioning from
 * invalid to valid violates the exclusivity rule.
 *
 * For partcmd_invalidate, the current partition will be made invalid.
 *
 * The partcmd_enable* and partcmd_disable commands are used by
 * update_prstate(). An error code may be returned and the caller will check
 * for error.
 *
 * The partcmd_update command is used by update_cpumasks_hier() with newmask
 * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
 * by update_cpumask() with NULL newmask. In both cases, the callers won't
 * check for error and so partition_root_state and prs_error will be updated
 * directly.
 */
static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
                                           struct cpumask *newmask,
                                           struct tmpmasks *tmp)
{
        struct cpuset *parent = parent_cs(cs);
        int adding;        /* Adding cpus to parent's effective_cpus        */
        int deleting;        /* Deleting cpus from parent's effective_cpus        */
        int old_prs, new_prs;
        int part_error = PERR_NONE;        /* Partition error? */
        int subparts_delta = 0;
        struct cpumask *xcpus;                /* cs effective_xcpus */
        int isolcpus_updated = 0;
        bool nocpu;

        lockdep_assert_held(&cpuset_mutex);

        /*
         * new_prs will only be changed for the partcmd_update and
         * partcmd_invalidate commands.
         */
        adding = deleting = false;
        old_prs = new_prs = cs->partition_root_state;
        xcpus = !cpumask_empty(cs->exclusive_cpus)
                ? cs->effective_xcpus : cs->cpus_allowed;

        if (cmd == partcmd_invalidate) {
                if (is_prs_invalid(old_prs))
                        return 0;

                /*
                 * Make the current partition invalid.
                 */
                if (is_partition_valid(parent))
                        adding = cpumask_and(tmp->addmask,
                                             xcpus, parent->effective_xcpus);
                if (old_prs > 0) {
                        new_prs = -old_prs;
                        subparts_delta--;
                }
                goto write_error;
        }

        /*
         * The parent must be a partition root.
         * The new cpumask, if present, or the current cpus_allowed must
         * not be empty.
         */
        if (!is_partition_valid(parent)) {
                return is_partition_invalid(parent)
                       ? PERR_INVPARENT : PERR_NOTPART;
        }
        if (!newmask && cpumask_empty(cs->cpus_allowed))
                return PERR_CPUSEMPTY;

        nocpu = tasks_nocpu_error(parent, cs, xcpus);

        if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
                /*
                 * Enabling partition root is not allowed if its
                 * effective_xcpus is empty or doesn't overlap with
                 * parent's effective_xcpus.
                 */
                if (cpumask_empty(xcpus) ||
                    !cpumask_intersects(xcpus, parent->effective_xcpus))
                        return PERR_INVCPUS;

                if (prstate_housekeeping_conflict(new_prs, xcpus))
                        return PERR_HKEEPING;

                /*
                 * A parent can be left with no CPU as long as there is no
                 * task directly associated with the parent partition.
                 */
                if (nocpu)
                        return PERR_NOCPUS;

                cpumask_copy(tmp->delmask, xcpus);
                deleting = true;
                subparts_delta++;
                new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
        } else if (cmd == partcmd_disable) {
                /*
                 * May need to add cpus to parent's effective_cpus for
                 * valid partition root.
                 */
                adding = !is_prs_invalid(old_prs) &&
                          cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
                if (adding)
                        subparts_delta--;
                new_prs = PRS_MEMBER;
        } else if (newmask) {
                /*
                 * Empty cpumask is not allowed
                 */
                if (cpumask_empty(newmask)) {
                        part_error = PERR_CPUSEMPTY;
                        goto write_error;
                }

                /*
                 * partcmd_update with newmask:
                 *
                 * Compute add/delete mask to/from effective_cpus
                 *
                 * For valid partition:
                 *   addmask = exclusive_cpus & ~newmask
                 *                              & parent->effective_xcpus
                 *   delmask = newmask & ~exclusive_cpus
                 *                       & parent->effective_xcpus
                 *
                 * For invalid partition:
                 *   delmask = newmask & parent->effective_xcpus
                 */
                if (is_prs_invalid(old_prs)) {
                        adding = false;
                        deleting = cpumask_and(tmp->delmask,
                                        newmask, parent->effective_xcpus);
                } else {
                        cpumask_andnot(tmp->addmask, xcpus, newmask);
                        adding = cpumask_and(tmp->addmask, tmp->addmask,
                                             parent->effective_xcpus);

                        cpumask_andnot(tmp->delmask, newmask, xcpus);
                        deleting = cpumask_and(tmp->delmask, tmp->delmask,
                                               parent->effective_xcpus);
                }
                /*
                 * Make partition invalid if parent's effective_cpus could
                 * become empty and there are tasks in the parent.
                 */
                if (nocpu && (!adding ||
                    !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
                        part_error = PERR_NOCPUS;
                        deleting = false;
                        adding = cpumask_and(tmp->addmask,
                                             xcpus, parent->effective_xcpus);
                }
        } else {
                /*
                 * partcmd_update w/o newmask
                 *
                 * delmask = effective_xcpus & parent->effective_cpus
                 *
                 * This can be called from:
                 * 1) update_cpumasks_hier()
                 * 2) cpuset_hotplug_update_tasks()
                 *
                 * Check to see if it can be transitioned from valid to
                 * invalid partition or vice versa.
                 *
                 * A partition error happens when parent has tasks and all
                 * its effective CPUs will have to be distributed out.
                 */
                WARN_ON_ONCE(!is_partition_valid(parent));
                if (nocpu) {
                        part_error = PERR_NOCPUS;
                        if (is_partition_valid(cs))
                                adding = cpumask_and(tmp->addmask,
                                                xcpus, parent->effective_xcpus);
                } else if (is_partition_invalid(cs) &&
                           cpumask_subset(xcpus, parent->effective_xcpus)) {
                        struct cgroup_subsys_state *css;
                        struct cpuset *child;
                        bool exclusive = true;

                        /*
                         * Convert invalid partition to valid has to
                         * pass the cpu exclusivity test.
                         */
                        rcu_read_lock();
                        cpuset_for_each_child(child, css, parent) {
                                if (child == cs)
                                        continue;
                                if (!cpusets_are_exclusive(cs, child)) {
                                        exclusive = false;
                                        break;
                                }
                        }
                        rcu_read_unlock();
                        if (exclusive)
                                deleting = cpumask_and(tmp->delmask,
                                                xcpus, parent->effective_cpus);
                        else
                                part_error = PERR_NOTEXCL;
                }
        }

write_error:
        if (part_error)
                WRITE_ONCE(cs->prs_err, part_error);

        if (cmd == partcmd_update) {
                /*
                 * Check for possible transition between valid and invalid
                 * partition root.
                 */
                switch (cs->partition_root_state) {
                case PRS_ROOT:
                case PRS_ISOLATED:
                        if (part_error) {
                                new_prs = -old_prs;
                                subparts_delta--;
                        }
                        break;
                case PRS_INVALID_ROOT:
                case PRS_INVALID_ISOLATED:
                        if (!part_error) {
                                new_prs = -old_prs;
                                subparts_delta++;
                        }
                        break;
                }
        }

        if (!adding && !deleting && (new_prs == old_prs))
                return 0;

        /*
         * Transitioning between invalid to valid or vice versa may require
         * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
         * validate_change() has already been successfully called and
         * CPU lists in cs haven't been updated yet. So defer it to later.
         */
        if ((old_prs != new_prs) && (cmd != partcmd_update))  {
                int err = update_partition_exclusive(cs, new_prs);

                if (err)
                        return err;
        }

        /*
         * Change the parent's effective_cpus & effective_xcpus (top cpuset
         * only).
         *
         * Newly added CPUs will be removed from effective_cpus and
         * newly deleted ones will be added back to effective_cpus.
         */
        spin_lock_irq(&callback_lock);
        if (old_prs != new_prs) {
                cs->partition_root_state = new_prs;
                if (new_prs <= 0)
                        cs->nr_subparts = 0;
        }
        /*
         * Adding to parent's effective_cpus means deletion CPUs from cs
         * and vice versa.
         */
        if (adding)
                isolcpus_updated += partition_xcpus_del(old_prs, parent,
                                                        tmp->addmask);
        if (deleting)
                isolcpus_updated += partition_xcpus_add(new_prs, parent,
                                                        tmp->delmask);

        if (is_partition_valid(parent)) {
                parent->nr_subparts += subparts_delta;
                WARN_ON_ONCE(parent->nr_subparts < 0);
        }
        spin_unlock_irq(&callback_lock);
        update_unbound_workqueue_cpumask(isolcpus_updated);

        if ((old_prs != new_prs) && (cmd == partcmd_update))
                update_partition_exclusive(cs, new_prs);

        if (adding || deleting) {
                update_tasks_cpumask(parent, tmp->addmask);
                update_sibling_cpumasks(parent, cs, tmp);
        }

        /*
         * For partcmd_update without newmask, it is being called from
         * cpuset_handle_hotplug(). Update the load balance flag and
         * scheduling domain accordingly.
         */
        if ((cmd == partcmd_update) && !newmask)
                update_partition_sd_lb(cs, old_prs);

        notify_partition_change(cs, old_prs);
        return 0;
}

/**
 * compute_partition_effective_cpumask - compute effective_cpus for partition
 * @cs: partition root cpuset
 * @new_ecpus: previously computed effective_cpus to be updated
 *
 * Compute the effective_cpus of a partition root by scanning effective_xcpus
 * of child partition roots and excluding their effective_xcpus.
 *
 * This has the side effect of invalidating valid child partition roots,
 * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
 * or update_cpumasks_hier() where parent and children are modified
 * successively, we don't need to call update_parent_effective_cpumask()
 * and the child's effective_cpus will be updated in later iterations.
 *
 * The walk over the children is done under rcu_read_lock(), which is taken
 * and released by this function itself.
 */
static void compute_partition_effective_cpumask(struct cpuset *cs,
                                                struct cpumask *new_ecpus)
{
        struct cgroup_subsys_state *css;
        struct cpuset *child;
        bool populated = partition_is_populated(cs, NULL);

        /* Start from the active part of the effective exclusive CPUs */
        compute_effective_exclusive_cpumask(cs, new_ecpus);
        cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);

        /*
         * Check child partition roots to see if they should be
         * invalidated when
         *  1) child effective_xcpus not a subset of new
         *     exclusive_cpus
         *  2) All the effective_cpus will be used up and cs
         *     has tasks
         */
        rcu_read_lock();
        cpuset_for_each_child(child, css, cs) {
                int old_prs;

                if (!is_partition_valid(child))
                        continue;

                child->prs_err = 0;
                if (!cpumask_subset(child->effective_xcpus,
                                    cs->effective_xcpus))
                        child->prs_err = PERR_INVCPUS;
                else if (populated &&
                         cpumask_subset(new_ecpus, child->effective_xcpus))
                        child->prs_err = PERR_NOCPUS;

                if (!child->prs_err) {
                        /* Child stays valid: its CPUs are not ours */
                        cpumask_andnot(new_ecpus, new_ecpus,
                                       child->effective_xcpus);
                        continue;
                }

                /*
                 * Invalidate child partition
                 */
                old_prs = child->partition_root_state;
                spin_lock_irq(&callback_lock);
                make_partition_invalid(child);
                cs->nr_subparts--;
                child->nr_subparts = 0;
                spin_unlock_irq(&callback_lock);
                notify_partition_change(child, old_prs);
        }
        rcu_read_unlock();
}

/*
 * Flag bits accepted by update_cpumasks_hier() in its @flags argument.
 */
#define HIER_CHECKALL                0x01        /* Check all cpusets; disables the unchanged-subtree skip */
#define HIER_NO_SD_REBUILD        0x02        /* Don't rebuild sched domains at the end of the walk */

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs:  the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 * @flags: bitmask of HIER_* flags. HIER_CHECKALL disables the
 *         unchanged-subtree skip; HIER_NO_SD_REBUILD suppresses the final
 *         rebuild_sched_domains_locked() call.
 *
 * When configured cpumask is changed, the effective cpumasks of this cpuset
 * and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * The descendants are walked in pre-order under rcu_read_lock(). For each
 * cpuset that needs updating, a css reference is taken so that the RCU read
 * lock can be dropped while the cpuset and its tasks are updated.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
                                 int flags)
{
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
        bool need_rebuild_sched_domains = false;
        int old_prs, new_prs;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(cp, pos_css, cs) {
                struct cpuset *parent = parent_cs(cp);
                bool remote = is_remote_partition(cp);
                bool update_parent = false;

                /*
                 * Skip descendent remote partition that acquires CPUs
                 * directly from top cpuset unless it is cs.
                 */
                if (remote && (cp != cs)) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

                /*
                 * Update effective_xcpus if exclusive_cpus set.
                 * The case when exclusive_cpus isn't set is handled later.
                 */
                if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
                        spin_lock_irq(&callback_lock);
                        compute_effective_exclusive_cpumask(cp, NULL);
                        spin_unlock_irq(&callback_lock);
                }

                /*
                 * Compute the candidate effective_cpus into tmp->new_cpus.
                 * Remote partitions and valid partitions under a valid
                 * parent partition use the partition-specific computation.
                 */
                old_prs = new_prs = cp->partition_root_state;
                if (remote || (is_partition_valid(parent) &&
                               is_partition_valid(cp)))
                        compute_partition_effective_cpumask(cp, tmp->new_cpus);
                else
                        compute_effective_cpumask(tmp->new_cpus, cp, parent);

                /*
                 * A partition with no effective_cpus is allowed as long as
                 * there is no task associated with it. Call
                 * update_parent_effective_cpumask() to check it.
                 */
                if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
                        update_parent = true;
                        goto update_parent_effective;
                }

                /*
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some CPUs unless
                 * it is a partition root that has explicitly distributed
                 * out all its CPUs.
                 *
                 * use_parent_ecpus and the parent's child_ecpus_count are
                 * kept in step: the count is bumped when the flag is first
                 * set and dropped when the flag is cleared.
                 */
                if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) {
                        cpumask_copy(tmp->new_cpus, parent->effective_cpus);
                        if (!cp->use_parent_ecpus) {
                                cp->use_parent_ecpus = true;
                                parent->child_ecpus_count++;
                        }
                } else if (cp->use_parent_ecpus) {
                        cp->use_parent_ecpus = false;
                        WARN_ON_ONCE(!parent->child_ecpus_count);
                        parent->child_ecpus_count--;
                }

                /* Remote partitions bypass the skip and parent checks. */
                if (remote)
                        goto get_css;

                /*
                 * Skip the whole subtree if
                 * 1) the cpumask remains the same,
                 * 2) has no partition root state,
                 * 3) HIER_CHECKALL flag not set, and
                 * 4) for v2 load balance state same as its parent.
                 */
                if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
                    cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
                    (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
                    (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

update_parent_effective:
                /*
                 * update_parent_effective_cpumask() should have been called
                 * for cs already in update_cpumask(). We should also call
                 * update_tasks_cpumask() again for tasks in the parent
                 * cpuset if the parent's effective_cpus changes.
                 */
                if ((cp != cs) && old_prs) {
                        switch (parent->partition_root_state) {
                        case PRS_ROOT:
                        case PRS_ISOLATED:
                                update_parent = true;
                                break;

                        default:
                                /*
                                 * When parent is not a partition root or is
                                 * invalid, child partition roots become
                                 * invalid too.
                                 */
                                if (is_partition_valid(cp))
                                        new_prs = -cp->partition_root_state;
                                WRITE_ONCE(cp->prs_err,
                                           is_partition_invalid(parent)
                                           ? PERR_INVPARENT : PERR_NOTPART);
                                break;
                        }
                }
get_css:
                /*
                 * Pin cp with a css reference so that the RCU read lock can
                 * be dropped for the updates below; the reference is put
                 * again once the RCU read lock has been retaken.
                 */
                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();

                if (update_parent) {
                        update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
                        /*
                         * The cpuset partition_root_state may become
                         * invalid. Capture it.
                         */
                        new_prs = cp->partition_root_state;
                }

                /* Publish the new mask and partition state. */
                spin_lock_irq(&callback_lock);
                cpumask_copy(cp->effective_cpus, tmp->new_cpus);
                cp->partition_root_state = new_prs;
                /*
                 * Make sure effective_xcpus is properly set for a valid
                 * partition root.
                 */
                if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
                        cpumask_and(cp->effective_xcpus,
                                    cp->cpus_allowed, parent->effective_xcpus);
                else if (new_prs < 0)
                        reset_partition_data(cp);
                spin_unlock_irq(&callback_lock);

                notify_partition_change(cp, old_prs);

                WARN_ON(!is_in_v2_mode() &&
                        !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

                update_tasks_cpumask(cp, cp->effective_cpus);

                /*
                 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
                 * from parent if current cpuset isn't a valid partition root
                 * and their load balance states differ.
                 */
                if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
                    !is_partition_valid(cp) &&
                    (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
                        if (is_sched_load_balance(parent))
                                set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
                        else
                                clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
                }

                /*
                 * On legacy hierarchy, if the effective cpumask of any non-
                 * empty cpuset is changed, we need to rebuild sched domains.
                 * On default hierarchy, the cpuset needs to be a partition
                 * root as well.
                 */
                if (!cpumask_empty(cp->cpus_allowed) &&
                    is_sched_load_balance(cp) &&
                   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
                    is_partition_valid(cp)))
                        need_rebuild_sched_domains = true;

                rcu_read_lock();
                css_put(&cp->css);
        }
        rcu_read_unlock();

        /* A single rebuild at the end, unless the caller opted out. */
        if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
                rebuild_sched_domains_locked();
}

/**
 * update_sibling_cpumasks - Update siblings cpumasks
 * @parent:  Parent cpuset
 * @cs:      Current cpuset
 * @tmp:     Temp variables
 *
 * Walk every child of @parent other than @cs and run update_cpumasks_hier()
 * on those whose effective_cpus may need to change.
 *
 * With effective_xcpus being a subset of cpus_allowed, a change in the
 * parent's effective_cpus caused by one child partition's effective_xcpus
 * can affect siblings even when they do not inherit the parent's
 * effective_cpus directly.
 *
 * Must be called with cpuset_mutex held.
 */
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
                                    struct tmpmasks *tmp)
{
        struct cgroup_subsys_state *pos_css;
        struct cpuset *sibling;

        lockdep_assert_held(&cpuset_mutex);

        rcu_read_lock();
        cpuset_for_each_child(sibling, pos_css, parent) {
                bool can_precheck;

                if (sibling == cs)
                        continue;

                /*
                 * For a sibling that neither borrows the parent's
                 * effective_cpus nor is a valid partition, the new mask can
                 * be computed up front; skip it when nothing would change.
                 */
                can_precheck = !sibling->use_parent_ecpus &&
                               !is_partition_valid(sibling);
                if (can_precheck) {
                        compute_effective_cpumask(tmp->new_cpus, sibling,
                                                  parent);
                        if (cpumask_equal(tmp->new_cpus,
                                          sibling->effective_cpus))
                                continue;
                }

                if (!css_tryget_online(&sibling->css))
                        continue;

                /*
                 * update_cpumasks_hier() may sleep, so the RCU read lock is
                 * released around the call while the css reference keeps
                 * the sibling pinned. HIER_NO_SD_REBUILD suppresses the
                 * sched domain rebuild, which the callers take care of.
                 */
                rcu_read_unlock();
                update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
                rcu_read_lock();

                css_put(&sibling->css);
        }
        rcu_read_unlock();
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 *
 * Return: 0 on success or when the mask did not change; -EACCES if @cs is
 * top_cpuset; the cpulist_parse() error if @buf cannot be parsed; -EINVAL if
 * the new mask is not a subset of top_cpuset.cpus_allowed; -ENOMEM if the
 * temporary cpumasks cannot be allocated; otherwise the negative error from
 * validate_change(). On the default hierarchy a -EINVAL from
 * validate_change() is not returned: the partition is invalidated instead
 * and the change proceeds.
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                          const char *buf)
{
        int retval;
        struct tmpmasks tmp;
        struct cpuset *parent = parent_cs(cs);
        bool invalidate = false;
        int hier_flags = 0;
        int old_prs = cs->partition_root_state;

        /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
        if (cs == &top_cpuset)
                return -EACCES;

        /*
         * An empty cpus_allowed is ok only if the cpuset has no tasks.
         * Since cpulist_parse() fails on an empty mask, we special case
         * that parsing.  The validate_change() call ensures that cpusets
         * with tasks have cpus.
         */
        if (!*buf) {
                cpumask_clear(trialcs->cpus_allowed);
                cpumask_clear(trialcs->effective_xcpus);
        } else {
                retval = cpulist_parse(buf, trialcs->cpus_allowed);
                if (retval < 0)
                        return retval;

                if (!cpumask_subset(trialcs->cpus_allowed,
                                    top_cpuset.cpus_allowed))
                        return -EINVAL;

                /*
                 * When exclusive_cpus isn't explicitly set, it is constrained
                 * by cpus_allowed and parent's effective_xcpus. Otherwise,
                 * trialcs->effective_xcpus is used as a temporary cpumask
                 * for checking validity of the partition root.
                 */
                if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
                        compute_effective_exclusive_cpumask(trialcs, NULL);
        }

        /* Nothing to do if the cpus didn't change */
        if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
                return 0;

        /* From here on every exit must go through out_free. */
        if (alloc_cpumasks(NULL, &tmp))
                return -ENOMEM;

        /*
         * For a cpuset with a partition root state, check whether the new
         * mask would make the partition invalid and record the reason.
         */
        if (old_prs) {
                if (is_partition_valid(cs) &&
                    cpumask_empty(trialcs->effective_xcpus)) {
                        invalidate = true;
                        cs->prs_err = PERR_INVCPUS;
                } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
                        invalidate = true;
                        cs->prs_err = PERR_HKEEPING;
                } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
                        invalidate = true;
                        cs->prs_err = PERR_NOCPUS;
                }
        }

        /*
         * Check all the descendants in update_cpumasks_hier() if
         * effective_xcpus is to be changed.
         */
        if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
                hier_flags = HIER_CHECKALL;

        retval = validate_change(cs, trialcs);

        if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
                struct cgroup_subsys_state *css;
                struct cpuset *cp;

                /*
                 * The -EINVAL error code indicates that partition sibling
                 * CPU exclusivity rule has been violated. We still allow
                 * the cpumask change to proceed while invalidating the
                 * partition. However, any conflicting sibling partitions
                 * have to be marked as invalid too.
                 */
                invalidate = true;
                rcu_read_lock();
                cpuset_for_each_child(cp, css, parent) {
                        struct cpumask *xcpus = fetch_xcpus(trialcs);

                        if (is_partition_valid(cp) &&
                            cpumask_intersects(xcpus, cp->effective_xcpus)) {
                                /* RCU is dropped around the invalidation. */
                                rcu_read_unlock();
                                update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
                                rcu_read_lock();
                        }
                }
                rcu_read_unlock();
                retval = 0;
        }

        if (retval < 0)
                goto out_free;

        if (is_partition_valid(cs) ||
           (is_partition_invalid(cs) && !invalidate)) {
                struct cpumask *xcpus = trialcs->effective_xcpus;

                /*
                 * An invalid partition with no effective_xcpus falls back
                 * to the new cpus_allowed.
                 */
                if (cpumask_empty(xcpus) && is_partition_invalid(cs))
                        xcpus = trialcs->cpus_allowed;

                /*
                 * Call remote_cpus_update() to handle valid remote partition
                 */
                if (is_remote_partition(cs))
                        remote_cpus_update(cs, xcpus, &tmp);
                else if (invalidate)
                        update_parent_effective_cpumask(cs, partcmd_invalidate,
                                                        NULL, &tmp);
                else
                        update_parent_effective_cpumask(cs, partcmd_update,
                                                        xcpus, &tmp);
        } else if (!cpumask_empty(cs->exclusive_cpus)) {
                /*
                 * Use trialcs->effective_cpus as a temp cpumask
                 */
                remote_partition_check(cs, trialcs->effective_xcpus,
                                       trialcs->effective_cpus, &tmp);
        }

        /* Commit the new masks to cs under callback_lock. */
        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
        cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
        if ((old_prs > 0) && !is_partition_valid(cs))
                reset_partition_data(cs);
        spin_unlock_irq(&callback_lock);

        /* effective_cpus/effective_xcpus will be updated here */
        update_cpumasks_hier(cs, &tmp, hier_flags);

        /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
        if (cs->partition_root_state)
                update_partition_sd_lb(cs, old_prs);
out_free:
        free_cpumasks(NULL, &tmp);
        return retval;
}

/**
 * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 *
 * The tasks' cpumask will be updated if cs is a valid partition root.
 *
 * Return: 0 on success or when the mask did not change; the cpulist_parse()
 * error if @buf cannot be parsed; the non-zero result of validate_change()
 * if the change is rejected; -ENOMEM if the temporary cpumasks cannot be
 * allocated.
 */
static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
                                    const char *buf)
{
        int retval;
        struct tmpmasks tmp;
        struct cpuset *parent = parent_cs(cs);
        bool invalidate = false;
        int hier_flags = 0;
        int old_prs = cs->partition_root_state;

        /* An empty string clears exclusive_cpus and effective_xcpus. */
        if (!*buf) {
                cpumask_clear(trialcs->exclusive_cpus);
                cpumask_clear(trialcs->effective_xcpus);
        } else {
                retval = cpulist_parse(buf, trialcs->exclusive_cpus);
                if (retval < 0)
                        return retval;
                if (!is_cpu_exclusive(cs))
                        set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
        }

        /* Nothing to do if the CPUs didn't change */
        if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
                return 0;

        if (*buf)
                compute_effective_exclusive_cpumask(trialcs, NULL);

        /*
         * Check all the descendants in update_cpumasks_hier() if
         * effective_xcpus is to be changed.
         */
        if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
                hier_flags = HIER_CHECKALL;

        /* Validation happens before the allocation, so plain returns are fine. */
        retval = validate_change(cs, trialcs);
        if (retval)
                return retval;

        if (alloc_cpumasks(NULL, &tmp))
                return -ENOMEM;

        if (old_prs) {
                /*
                 * Check whether the new mask would make the partition
                 * invalid and record the reason.
                 */
                if (cpumask_empty(trialcs->effective_xcpus)) {
                        invalidate = true;
                        cs->prs_err = PERR_INVCPUS;
                } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
                        invalidate = true;
                        cs->prs_err = PERR_HKEEPING;
                } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
                        invalidate = true;
                        cs->prs_err = PERR_NOCPUS;
                }

                /* Remote and local partitions use different update paths. */
                if (is_remote_partition(cs)) {
                        if (invalidate)
                                remote_partition_disable(cs, &tmp);
                        else
                                remote_cpus_update(cs, trialcs->effective_xcpus,
                                                   &tmp);
                } else if (invalidate) {
                        update_parent_effective_cpumask(cs, partcmd_invalidate,
                                                        NULL, &tmp);
                } else {
                        update_parent_effective_cpumask(cs, partcmd_update,
                                                trialcs->effective_xcpus, &tmp);
                }
        } else if (!cpumask_empty(trialcs->exclusive_cpus)) {
                /*
                 * Use trialcs->effective_cpus as a temp cpumask
                 */
                remote_partition_check(cs, trialcs->effective_xcpus,
                                       trialcs->effective_cpus, &tmp);
        }
        /* Commit the new masks to cs under callback_lock. */
        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
        cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
        if ((old_prs > 0) && !is_partition_valid(cs))
                reset_partition_data(cs);
        spin_unlock_irq(&callback_lock);

        /*
         * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
         * of the subtree when it is a valid partition root or effective_xcpus
         * is updated.
         */
        if (is_partition_valid(cs) || hier_flags)
                update_cpumasks_hier(cs, &tmp, hier_flags);

        /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
        if (cs->partition_root_state)
                update_partition_sd_lb(cs, old_prs);

        free_cpumasks(NULL, &tmp);
        return 0;
}

/*
 * Migrate memory region from one set of nodes to another.  This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management.  All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 *
 * One cpuset_migrate_mm_work is allocated per request by cpuset_migrate_mm()
 * and freed by cpuset_migrate_mm_workfn() once the migration has run.
 */

struct cpuset_migrate_mm_work {
        struct work_struct        work;        /* work item queued on cpuset_migrate_mm_wq */
        struct mm_struct        *mm;        /* mm to migrate; released with mmput() by the work function */
        nodemask_t                from;        /* nodes to migrate pages from */
        nodemask_t                to;        /* nodes to migrate pages to */
};

/*
 * Work function for a queued mm migration request: migrate the pages,
 * then release the mm reference and the request itself.
 */
static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
        struct cpuset_migrate_mm_work *req;

        req = container_of(work, struct cpuset_migrate_mm_work, work);

        /* on a wq worker, no need to worry about %current's mems_allowed */
        do_migrate_pages(req->mm, &req->from, &req->to, MPOL_MF_MOVE_ALL);

        mmput(req->mm);
        kfree(req);
}

/*
 * Queue an asynchronous migration of @mm's pages from nodes @from to
 * nodes @to on cpuset_migrate_mm_wq.
 *
 * The caller's reference on @mm is always consumed: it is handed over to
 * the work item when one is queued, and dropped right here when the two
 * node sets are equal or the work item cannot be allocated.
 */
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
                                                        const nodemask_t *to)
{
        struct cpuset_migrate_mm_work *req;

        /* Nothing to move. */
        if (nodes_equal(*from, *to))
                goto drop_mm;

        req = kzalloc(sizeof(*req), GFP_KERNEL);
        if (!req)
                goto drop_mm;

        req->mm = mm;
        req->from = *from;
        req->to = *to;
        INIT_WORK(&req->work, cpuset_migrate_mm_workfn);
        queue_work(cpuset_migrate_mm_wq, &req->work);
        return;

drop_mm:
        mmput(mm);
}

/*
 * Wait for all mm migrations queued on cpuset_migrate_mm_wq to complete.
 */
static void cpuset_post_attach(void)
{
        flush_workqueue(cpuset_migrate_mm_wq);
}

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
 * and rebind an eventual tasks' mempolicy. If the task is allocating in
 * parallel, it might temporarily see an empty intersection, which results in
 * a seqlock check and retry before OOM or allocation failure.
 *
 * The update is done with the task lock held and local interrupts disabled
 * for the duration of the seqcount write section.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
                                        nodemask_t *newmems)
{
        task_lock(tsk);

        local_irq_disable();
        write_seqcount_begin(&tsk->mems_allowed_seq);

        /*
         * Widen mems_allowed to the union of the old and new nodes first,
         * rebind the mempolicy to the new nodes, then narrow mems_allowed
         * down to exactly the new set.
         */
        nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
        mpol_rebind_task(tsk, newmems);
        tsk->mems_allowed = *newmems;

        write_seqcount_end(&tsk->mems_allowed_seq);
        local_irq_enable();

        task_unlock(tsk);
}

/*
 * The cpuset whose tasks are currently being rebound by
 * update_tasks_nodemask(), or NULL when no rebind is in progress.
 * current_cpuset_is_being_rebound() compares the current task's cpuset
 * against this pointer.
 */
static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 *
 * For every task that has an mm, the mm's mempolicies are rebound as well
 * and, if @cs has memory migration enabled, a page migration from
 * cs->old_mems_allowed to the new nodes is queued. cs->old_mems_allowed is
 * updated to the new nodes once all tasks have been processed.
 */
static void update_tasks_nodemask(struct cpuset *cs)
{
        static nodemask_t newmems;        /* protected by cpuset_mutex */
        struct css_task_iter it;
        struct task_struct *task;

        cpuset_being_rebound = cs;                /* causes mpol_dup() rebind */

        guarantee_online_mems(cs, &newmems);

        /*
         * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
         * take while holding tasklist_lock.  Forks can happen - the
         * mpol_dup() cpuset_being_rebound check will catch such forks,
         * and rebind their vma mempolicies too.  Because we still hold
         * the global cpuset_mutex, we know that no other rebind effort
         * will be contending for the global variable cpuset_being_rebound.
         * It's ok if we rebind the same mm twice; mpol_rebind_mm()
         * is idempotent.  Also migrate pages in each mm to new nodes.
         */
        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it))) {
                struct mm_struct *mm;
                bool migrate;

                cpuset_change_task_nodemask(task, &newmems);

                /* Tasks without an mm only need the nodemask update above. */
                mm = get_task_mm(task);
                if (!mm)
                        continue;

                migrate = is_memory_migrate(cs);

                mpol_rebind_mm(mm, &cs->mems_allowed);
                /*
                 * The mm reference taken above is either consumed by
                 * cpuset_migrate_mm() or dropped here.
                 */
                if (migrate)
                        cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
                else
                        mmput(mm);
        }
        css_task_iter_end(&it);

        /*
         * All the tasks' nodemasks have been updated, update
         * cs->old_mems_allowed.
         */
        cs->old_mems_allowed = newmems;

        /* We're done rebinding vmas to this cpuset's new mems_allowed. */
        cpuset_being_rebound = NULL;
}

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When configured nodemask is changed, the effective nodemasks of this cpuset
 * and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * The descendants are walked in pre-order under rcu_read_lock(). For each
 * cpuset whose effective_mems changes, a css reference is taken so that the
 * RCU read lock can be dropped while the cpuset and its tasks are updated.
 *
 * Called with cpuset_mutex held
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(cp, pos_css, cs) {
                struct cpuset *parent = parent_cs(cp);

                /* Candidate mask: configured nodes limited by the parent. */
                nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);

                /*
                 * If it becomes empty, inherit the effective mask of the
                 * parent, which is guaranteed to have some MEMs.
                 */
                if (is_in_v2_mode() && nodes_empty(*new_mems))
                        *new_mems = parent->effective_mems;

                /* Skip the whole subtree if the nodemask remains the same. */
                if (nodes_equal(*new_mems, cp->effective_mems)) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

                /*
                 * Pin cp so that the RCU read lock can be dropped for the
                 * updates below; the reference is put again once the RCU
                 * read lock has been retaken.
                 */
                if (!css_tryget_online(&cp->css))
                        continue;
                rcu_read_unlock();

                spin_lock_irq(&callback_lock);
                cp->effective_mems = *new_mems;
                spin_unlock_irq(&callback_lock);

                WARN_ON(!is_in_v2_mode() &&
                        !nodes_equal(cp->mems_allowed, cp->effective_mems));

                update_tasks_nodemask(cp);

                rcu_read_lock();
                css_put(&cp->css);
        }
        rcu_read_unlock();
}

/*
 * update_nodemask - handle a user request to change a cpuset's 'mems'
 * @cs: the cpuset to update
 * @trialcs: trial cpuset used to stage and validate the new mems_allowed
 * @buf: node list written by the user; an empty string clears the mask
 *
 * Validates the request, updates the cpuset's mems_allowed and then, for
 * each task in the cpuset, updates mems_allowed and rebinds the task's
 * mempolicy and any vma mempolicies.  If the cpuset is marked
 * 'memory_migrate', the tasks' pages are migrated to the new memory.
 *
 * Call with cpuset_mutex held. May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_lock, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 *
 * Returns -EACCES for top_cpuset, the nodelist_parse() error for a
 * malformed @buf, -EINVAL if the new mask is not a subset of
 * top_cpuset.mems_allowed, 0 if the mask is unchanged, and otherwise the
 * result of validate_change().
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
                           const char *buf)
{
        int retval;

        /*
         * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
         * it's read-only
         */
        if (cs == &top_cpuset)
                return -EACCES;

        /*
         * An empty mems_allowed is ok iff there are no tasks in the cpuset.
         * nodelist_parse() fails on an empty mask, so the empty string is
         * handled separately.  validate_change() makes sure that cpusets
         * with tasks have memory.
         */
        if (*buf) {
                retval = nodelist_parse(buf, trialcs->mems_allowed);
                if (retval < 0)
                        return retval;

                if (!nodes_subset(trialcs->mems_allowed,
                                  top_cpuset.mems_allowed))
                        return -EINVAL;
        } else {
                nodes_clear(trialcs->mems_allowed);
        }

        /* Unchanged mask: nothing to do. */
        if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed))
                return 0;

        retval = validate_change(cs, trialcs);
        if (retval < 0)
                return retval;

        check_insane_mems_config(&trialcs->mems_allowed);

        spin_lock_irq(&callback_lock);
        cs->mems_allowed = trialcs->mems_allowed;
        spin_unlock_irq(&callback_lock);

        /* use trialcs->mems_allowed as a temp variable */
        update_nodemasks_hier(cs, &trialcs->mems_allowed);

        return retval;
}

/*
 * Report whether the cpuset of the current task is the one recorded in
 * cpuset_being_rebound.  The lookup is done inside an RCU read-side
 * critical section.
 */
bool current_cpuset_is_being_rebound(void)
{
        bool rebinding;

        rcu_read_lock();
        rebinding = (cpuset_being_rebound == task_cs(current));
        rcu_read_unlock();

        return rebinding;
}

/*
 * Set the relax domain level of @cs to @val.
 *
 * On SMP builds @val must lie in [-1, sched_domain_level_max + 1],
 * otherwise -EINVAL is returned.  When the value actually changes and
 * @cs has CPUs and load balancing enabled, the sched domains are rebuilt.
 * Returns 0 on success.
 */
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
        if (val < -1 || val > sched_domain_level_max + 1)
                return -EINVAL;
#endif

        /* Unchanged value: nothing to update. */
        if (val == cs->relax_domain_level)
                return 0;

        cs->relax_domain_level = val;

        if (cpumask_empty(cs->cpus_allowed))
                return 0;
        if (is_sched_load_balance(cs))
                rebuild_sched_domains_locked();

        return 0;
}

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
static void update_tasks_flags(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
                cpuset_update_task_spread_flags(cs, task);
        css_task_iter_end(&it);
}

/*
 * update_flag - set or clear one flag bit of a cpuset
 * bit:                the bit to update (see cpuset_flagbits_t)
 * cs:                the cpuset to update
 * turning_on:         non-zero to set the flag, zero to clear it
 *
 * The change is first applied to a trial copy and checked with
 * validate_change(); only then are the flags copied into @cs.  Sched
 * domains are rebuilt when the load balance flag changed on a cpuset
 * with CPUs, and task spread flags are refreshed when either spread
 * flag changed.
 *
 * Returns -ENOMEM if the trial cpuset cannot be allocated, otherwise the
 * result of validate_change().
 *
 * Call with cpuset_mutex held.
 */

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
                       int turning_on)
{
        struct cpuset *trialcs;
        int lb_changed;
        int spread_changed;
        int err;

        trialcs = alloc_trial_cpuset(cs);
        if (!trialcs)
                return -ENOMEM;

        if (turning_on)
                set_bit(bit, &trialcs->flags);
        else
                clear_bit(bit, &trialcs->flags);

        err = validate_change(cs, trialcs);
        if (err < 0)
                goto out_free;

        /* Work out what changed before the new flags are committed. */
        lb_changed = is_sched_load_balance(cs) !=
                     is_sched_load_balance(trialcs);

        spread_changed = is_spread_slab(cs) != is_spread_slab(trialcs) ||
                         is_spread_page(cs) != is_spread_page(trialcs);

        spin_lock_irq(&callback_lock);
        cs->flags = trialcs->flags;
        spin_unlock_irq(&callback_lock);

        if (lb_changed && !cpumask_empty(trialcs->cpus_allowed))
                rebuild_sched_domains_locked();

        if (spread_changed)
                update_tasks_flags(cs);

out_free:
        free_cpuset(trialcs);
        return err;
}

/**
 * update_prstate - update partition_root_state
 * @cs: the cpuset to update
 * @new_prs: new partition root state
 * Return: 0 if the state already matched or the update was carried out,
 *         -ENOMEM if the temporary cpumasks could not be allocated.
 *
 * A partition error does not produce a non-zero return value: the error
 * code is stored in cs->prs_err, the stored state is the negated
 * @new_prs, and 0 is returned.
 *
 * Call with cpuset_mutex held.
 */
static int update_prstate(struct cpuset *cs, int new_prs)
{
        int err = PERR_NONE, old_prs = cs->partition_root_state;
        struct cpuset *parent = parent_cs(cs);
        struct tmpmasks tmpmask;
        bool new_xcpus_state = false;

        if (old_prs == new_prs)
                return 0;

        /*
         * Treat a previously invalid partition root as if it is a "member".
         */
        if (new_prs && is_prs_invalid(old_prs))
                old_prs = PRS_MEMBER;

        if (alloc_cpumasks(NULL, &tmpmask))
                return -ENOMEM;

        /*
         * Setup effective_xcpus if not properly set yet, it will be cleared
         * later if partition becomes invalid.
         */
        if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
                spin_lock_irq(&callback_lock);
                cpumask_and(cs->effective_xcpus,
                            cs->cpus_allowed, parent->effective_xcpus);
                spin_unlock_irq(&callback_lock);
        }

        err = update_partition_exclusive(cs, new_prs);
        if (err)
                goto out;

        if (!old_prs) {
                /* member (or formerly invalid) -> partition root */
                enum partition_cmd cmd = (new_prs == PRS_ROOT)
                                       ? partcmd_enable : partcmd_enablei;

                /*
                 * cpus_allowed cannot be empty.
                 */
                if (cpumask_empty(cs->cpus_allowed)) {
                        err = PERR_CPUSEMPTY;
                        goto out;
                }

                err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
                /*
                 * If an attempt to become local partition root fails,
                 * try to become a remote partition root instead.
                 */
                if (err && remote_partition_enable(cs, new_prs, &tmpmask))
                        err = 0;
        } else if (old_prs && new_prs) {
                /*
                 * A change in load balance state only, no change in cpumasks.
                 */
                new_xcpus_state = true;
        } else {
                /*
                 * Switching back to member is always allowed even if it
                 * disables child partitions.
                 */
                if (is_remote_partition(cs))
                        remote_partition_disable(cs, &tmpmask);
                else
                        update_parent_effective_cpumask(cs, partcmd_disable,
                                                        NULL, &tmpmask);

                /*
                 * Invalidation of child partitions will be done in
                 * update_cpumasks_hier().
                 */
        }
out:
        /*
         * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
         * happens.
         */
        if (err) {
                /*
                 * NOTE(review): presumably the invalid states are defined as
                 * the negated valid ones - confirm against the PRS_* values.
                 */
                new_prs = -new_prs;
                update_partition_exclusive(cs, new_prs);
        }

        /* Publish the new state and error code under callback_lock. */
        spin_lock_irq(&callback_lock);
        cs->partition_root_state = new_prs;
        WRITE_ONCE(cs->prs_err, err);
        if (!is_partition_valid(cs))
                reset_partition_data(cs);
        else if (new_xcpus_state)
                partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
        spin_unlock_irq(&callback_lock);
        update_unbound_workqueue_cpumask(new_xcpus_state);

        /* Force update if switching back to member */
        update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);

        /* Update sched domains and load balance flag */
        update_partition_sd_lb(cs, old_prs);

        notify_partition_change(cs, old_prs);
        free_cpumasks(NULL, &tmpmask);
        /* Errors are reported through cs->prs_err, not the return value. */
        return 0;
}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

/* Frequency meter tunables; values are scaled by FM_SCALE (fixed point). */
#define FM_COEF 933                /* decay coefficient per tick: half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* cap on decay iterations per update */
#define FM_MAXCNT 1000000        /* limit cnt to avoid overflow */
#define FM_SCALE 1000                /* faux fixed point scale */

/* Put a frequency meter into its idle state: lock ready, all fields zero. */
static void fmeter_init(struct fmeter *fmp)
{
        spin_lock_init(&fmp->lock);
        fmp->time = 0;
        fmp->val = 0;
        fmp->cnt = 0;
}

/*
 * Internal meter update: decay the filtered value once per elapsed second
 * (at most FM_MAXTICKS times), then fold the pending event count into it.
 * Does nothing when no full second has passed since the last update.
 */
static void fmeter_update(struct fmeter *fmp)
{
        time64_t now = ktime_get_seconds();
        u32 elapsed = now - fmp->time;
        u32 i;

        if (!elapsed)
                return;

        elapsed = min(FM_MAXTICKS, elapsed);
        for (i = 0; i < elapsed; i++)
                fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
        fmp->time = now;

        fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
        fmp->cnt = 0;
}

/*
 * Record one event: bring the meter up to date, then add FM_SCALE to the
 * pending count, saturating at FM_MAXCNT.
 */
static void fmeter_markevent(struct fmeter *fmp)
{
        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        fmp->cnt += FM_SCALE;
        if (fmp->cnt > FM_MAXCNT)
                fmp->cnt = FM_MAXCNT;
        spin_unlock(&fmp->lock);
}

/* Bring the meter up to date and return its current filtered value. */
static int fmeter_getrate(struct fmeter *fmp)
{
        int rate;

        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        rate = fmp->val;
        spin_unlock(&fmp->lock);

        return rate;
}

/*
 * Source cpuset of the first task in the taskset; recorded by
 * cpuset_can_attach() and read back by cpuset_attach().
 */
static struct cpuset *cpuset_attach_old_cs;

/*
 * Decide whether a cpuset is able to take on a new task.
 *
 * effective_cpus must never be empty.  Outside v2 mode, mems_allowed must
 * not be empty either.  (In v1, effective_cpus = cpus_allowed.)
 *
 * Returns 0 if the cpuset is usable, -ENOSPC otherwise.
 */
static int cpuset_can_attach_check(struct cpuset *cs)
{
        if (cpumask_empty(cs->effective_cpus))
                return -ENOSPC;

        if (!is_in_v2_mode() && nodes_empty(cs->mems_allowed))
                return -ENOSPC;

        return 0;
}

/* Clear the count and summed bandwidth of deadline tasks being migrated. */
static void reset_migrate_dl_data(struct cpuset *cs)
{
        cs->sum_migrate_dl_bw = 0;
        cs->nr_migrate_dl_tasks = 0;
}

/*
 * cpuset_can_attach - check whether the tasks in @tset may join a cpuset
 * @tset: the set of tasks being migrated
 *
 * Called by cgroups to determine if a cpuset is usable.  Takes and
 * releases cpuset_mutex itself.
 *
 * The source cpuset of the first task is remembered in
 * cpuset_attach_old_cs for cpuset_attach().  Deadline tasks in @tset are
 * counted in the destination cpuset and, when source and destination
 * share no effective CPU, their summed bandwidth is reserved with
 * dl_bw_alloc().
 *
 * Returns 0 on success, in which case cs->attach_in_progress has been
 * incremented.  On any failure a non-zero error is returned and the
 * deadline migration accounting gathered so far is discarded, so that no
 * stale count or bandwidth is left behind on the destination cpuset.
 */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
        struct cgroup_subsys_state *css;
        struct cpuset *cs, *oldcs;
        struct task_struct *task;
        bool cpus_updated, mems_updated;
        int ret;

        /* used later by cpuset_attach() */
        cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
        oldcs = cpuset_attach_old_cs;
        cs = css_cs(css);

        mutex_lock(&cpuset_mutex);

        /* Check to see if task is allowed in the cpuset */
        ret = cpuset_can_attach_check(cs);
        if (ret)
                goto out_unlock;

        cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
        mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);

        cgroup_taskset_for_each(task, css, tset) {
                ret = task_can_attach(task);
                if (ret)
                        goto out_unlock;

                /*
                 * Skip rights over task check in v2 when nothing changes,
                 * migration permission derives from hierarchy ownership in
                 * cgroup_procs_write_permission()).
                 */
                if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
                    (cpus_updated || mems_updated)) {
                        ret = security_task_setscheduler(task);
                        if (ret)
                                goto out_unlock;
                }

                if (dl_task(task)) {
                        cs->nr_migrate_dl_tasks++;
                        cs->sum_migrate_dl_bw += task->dl.dl_bw;
                }
        }

        if (!cs->nr_migrate_dl_tasks)
                goto out_success;

        /* Bandwidth is only reserved when moving to disjoint CPUs. */
        if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
                int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);

                if (unlikely(cpu >= nr_cpu_ids)) {
                        ret = -EINVAL;
                        goto out_unlock;
                }

                ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
                if (ret)
                        goto out_unlock;
        }

out_success:
        /*
         * Mark attach is in progress.  This makes validate_change() fail
         * changes which zero cpus/mems_allowed.
         */
        cs->attach_in_progress++;
out_unlock:
        /*
         * Every failure exit drops the partial deadline accounting.  This
         * covers a failure in the task loop after an earlier deadline task
         * has already been counted.
         */
        if (ret)
                reset_migrate_dl_data(cs);
        mutex_unlock(&cpuset_mutex);
        return ret;
}

/*
 * Undo the bookkeeping done for the destination cpuset of @tset: drop
 * the attach_in_progress count (waking waiters when it reaches zero) and,
 * if deadline tasks were counted, free their bandwidth and clear the
 * migration data.  Takes cpuset_mutex.
 */
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
        struct cgroup_subsys_state *dst_css;
        struct cpuset *cs;

        cgroup_taskset_first(tset, &dst_css);
        cs = css_cs(dst_css);

        mutex_lock(&cpuset_mutex);

        if (!--cs->attach_in_progress)
                wake_up(&cpuset_attach_wq);

        if (cs->nr_migrate_dl_tasks) {
                int dl_cpu = cpumask_any(cs->effective_cpus);

                dl_bw_free(dl_cpu, cs->sum_migrate_dl_bw);
                reset_migrate_dl_data(cs);
        }

        mutex_unlock(&cpuset_mutex);
}

/*
 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
 * but we can't allocate it dynamically there.  Define it global and
 * allocate from cpuset_init().
 *
 * cpuset_attach_nodemask_to is filled in by cpuset_attach() and read by
 * cpuset_attach_task() as the nodemask applied to each attached task.
 */
static cpumask_var_t cpus_attach;
static nodemask_t cpuset_attach_nodemask_to;

/*
 * Apply the settings of @cs to a single @task: CPU affinity, memory
 * nodemask (taken from cpuset_attach_nodemask_to) and spread flags.
 * cpuset_mutex must be held; cpus_attach is used as scratch space.
 */
static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
{
        lockdep_assert_held(&cpuset_mutex);

        if (cs == &top_cpuset)
                /* top cpuset: possible CPUs minus those in subpartitions */
                cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
                               subpartitions_cpus);
        else
                guarantee_online_cpus(task, cpus_attach);

        /*
         * A prior can_attach is expected to have ruled out failure here.
         * TODO: have a better way to handle failure here
         */
        WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

        cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
        cpuset_update_task_spread_flags(cs, task);
}

/*
 * cpuset_attach - apply the destination cpuset's settings to migrated tasks
 * @tset: the set of tasks being migrated
 *
 * Uses cpuset_attach_old_cs, recorded by cpuset_can_attach(), as the source
 * cpuset.  Each task gets its CPU mask, nodemask and spread flags updated
 * through cpuset_attach_task(); threadgroup leaders additionally get their
 * mm's mempolicies rebound and, with memory migration enabled, their pages
 * migrated.  Finally the deadline task counters are moved from the source
 * to the destination cpuset and attach_in_progress is dropped.
 *
 * Takes cpuset_mutex; expects the CPU hotplug read lock to be held already.
 */
static void cpuset_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct task_struct *leader;
        struct cgroup_subsys_state *css;
        struct cpuset *cs;
        struct cpuset *oldcs = cpuset_attach_old_cs;
        bool cpus_updated, mems_updated;

        cgroup_taskset_first(tset, &css);
        cs = css_cs(css);

        lockdep_assert_cpus_held();        /* see cgroup_attach_lock() */
        mutex_lock(&cpuset_mutex);
        cpus_updated = !cpumask_equal(cs->effective_cpus,
                                      oldcs->effective_cpus);
        mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);

        /*
         * In the default hierarchy, enabling cpuset in the child cgroups
         * will trigger a number of cpuset_attach() calls with no change
         * in effective cpus and mems. In that case, we can optimize out
         * by skipping the task iteration and update.
         */
        if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
            !cpus_updated && !mems_updated) {
                cpuset_attach_nodemask_to = cs->effective_mems;
                goto out;
        }

        /* Nodemask handed to each task by cpuset_attach_task(). */
        guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

        cgroup_taskset_for_each(task, css, tset)
                cpuset_attach_task(cs, task);

        /*
         * Change mm for all threadgroup leaders. This is expensive and may
         * sleep and should be moved outside migration path proper. Skip it
         * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
         * not set.
         */
        cpuset_attach_nodemask_to = cs->effective_mems;
        if (!is_memory_migrate(cs) && !mems_updated)
                goto out;

        cgroup_taskset_for_each_leader(leader, css, tset) {
                struct mm_struct *mm = get_task_mm(leader);

                if (mm) {
                        mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

                        /*
                         * old_mems_allowed is the same with mems_allowed
                         * here, except if this task is being moved
                         * automatically due to hotplug.  In that case
                         * @mems_allowed has been updated and is empty, so
                         * @old_mems_allowed is the right nodesets that we
                         * migrate mm from.
                         */
                        /*
                         * NOTE(review): only the non-migrate branch drops the
                         * mm reference here; presumably cpuset_migrate_mm()
                         * takes over the reference - confirm.
                         */
                        if (is_memory_migrate(cs))
                                cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
                                                  &cpuset_attach_nodemask_to);
                        else
                                mmput(mm);
                }
        }

out:
        /* Remember the nodemask that was just applied. */
        cs->old_mems_allowed = cpuset_attach_nodemask_to;

        /* Move the deadline task accounting from source to destination. */
        if (cs->nr_migrate_dl_tasks) {
                cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
                oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
                reset_migrate_dl_data(cs);
        }

        /* Attach finished: wake waiters once no attach is pending. */
        cs->attach_in_progress--;
        if (!cs->attach_in_progress)
                wake_up(&cpuset_attach_wq);

        mutex_unlock(&cpuset_mutex);
}

/*
 * The various types of files and directories in a cpuset file system.
 *
 * Each control file stores one of these values in its cftype ->private
 * field; the shared read/write/show handlers switch on it to select the
 * field or flag to operate on.
 */

typedef enum {
        FILE_MEMORY_MIGRATE,
        FILE_CPULIST,
        FILE_MEMLIST,
        FILE_EFFECTIVE_CPULIST,
        FILE_EFFECTIVE_MEMLIST,
        FILE_SUBPARTS_CPULIST,
        FILE_EXCLUSIVE_CPULIST,
        FILE_EFFECTIVE_XCPULIST,
        FILE_ISOLATED_CPULIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
        FILE_MEM_HARDWALL,
        FILE_SCHED_LOAD_BALANCE,
        FILE_PARTITION_ROOT,
        FILE_SCHED_RELAX_DOMAIN_LEVEL,
        FILE_MEMORY_PRESSURE_ENABLED,
        FILE_MEMORY_PRESSURE,
        FILE_SPREAD_PAGE,
        FILE_SPREAD_SLAB,
} cpuset_filetype_t;

/*
 * cpuset_write_u64 - handle a write to one of the u64 flag files
 * @css: css of the cpuset being written to
 * @cft: the control file; ->private holds its cpuset_filetype_t
 * @val: the value written; zero clears the flag, any non-zero value sets it
 *
 * Runs under cpus_read_lock() and cpuset_mutex.  Returns -ENODEV if the
 * cpuset is no longer online, -EINVAL for a file type that is not handled
 * here, otherwise the result of update_flag() (or 0 for
 * memory_pressure_enabled).
 */
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
                            u64 val)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        /*
         * Collapse to 0/1 before handing the value to update_flag(), whose
         * parameter is an int: a non-zero u64 with all-zero low 32 bits
         * would otherwise be narrowed to 0 and clear the flag.
         */
        int turning_on = !!val;
        int retval = 0;

        cpus_read_lock();
        mutex_lock(&cpuset_mutex);
        if (!is_cpuset_online(cs)) {
                retval = -ENODEV;
                goto out_unlock;
        }

        switch (type) {
        case FILE_CPU_EXCLUSIVE:
                retval = update_flag(CS_CPU_EXCLUSIVE, cs, turning_on);
                break;
        case FILE_MEM_EXCLUSIVE:
                retval = update_flag(CS_MEM_EXCLUSIVE, cs, turning_on);
                break;
        case FILE_MEM_HARDWALL:
                retval = update_flag(CS_MEM_HARDWALL, cs, turning_on);
                break;
        case FILE_SCHED_LOAD_BALANCE:
                retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, turning_on);
                break;
        case FILE_MEMORY_MIGRATE:
                retval = update_flag(CS_MEMORY_MIGRATE, cs, turning_on);
                break;
        case FILE_MEMORY_PRESSURE_ENABLED:
                cpuset_memory_pressure_enabled = turning_on;
                break;
        case FILE_SPREAD_PAGE:
                retval = update_flag(CS_SPREAD_PAGE, cs, turning_on);
                break;
        case FILE_SPREAD_SLAB:
                retval = update_flag(CS_SPREAD_SLAB, cs, turning_on);
                break;
        default:
                retval = -EINVAL;
                break;
        }
out_unlock:
        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
        return retval;
}

/*
 * Handle a write to a signed 64-bit control file.  Only the relax domain
 * level file is handled; any other file type yields -EINVAL.  Returns
 * -ENODEV when the cpuset is no longer online.  Runs under
 * cpus_read_lock() and cpuset_mutex.
 */
static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
                            s64 val)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        int retval;

        cpus_read_lock();
        mutex_lock(&cpuset_mutex);

        if (!is_cpuset_online(cs))
                retval = -ENODEV;
        else if (type == FILE_SCHED_RELAX_DOMAIN_LEVEL)
                retval = update_relax_domain_level(cs, val);
        else
                retval = -EINVAL;

        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
        return retval;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 *
 * The stripped input is dispatched on the file's ->private type to
 * update_cpumask(), update_exclusive_cpumask() or update_nodemask(), each
 * working on a freshly allocated trial cpuset.
 *
 * Returns @nbytes on success, -ENODEV if the cpuset is offline, -ENOMEM if
 * the trial cpuset cannot be allocated, -EINVAL for an unhandled file type,
 * or the error returned by the update function.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        struct cpuset *cs = css_cs(of_css(of));
        struct cpuset *trialcs;
        int retval = -ENODEV;

        buf = strstrip(buf);

        /*
         * CPU or memory hotunplug may leave @cs w/o any execution
         * resources, in which case the hotplug code asynchronously updates
         * configuration and transfers all tasks to the nearest ancestor
         * which can execute.
         *
         * As writes to "cpus" or "mems" may restore @cs's execution
         * resources, wait for the previously scheduled operations before
         * proceeding, so that we don't keep removing tasks that were added
         * after execution capability is restored.
         *
         * cpuset_handle_hotplug may call back into cgroup core asynchronously
         * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
         * operation like this one can lead to a deadlock through kernfs
         * active_ref protection.  Let's break the protection.  Losing the
         * protection is okay as we check whether @cs is online after
         * grabbing cpuset_mutex anyway.  This only happens on the legacy
         * hierarchies.
         */
        /* Pin @cs while the kernfs active protection is broken. */
        css_get(&cs->css);
        kernfs_break_active_protection(of->kn);

        cpus_read_lock();
        mutex_lock(&cpuset_mutex);
        if (!is_cpuset_online(cs))
                goto out_unlock;

        trialcs = alloc_trial_cpuset(cs);
        if (!trialcs) {
                retval = -ENOMEM;
                goto out_unlock;
        }

        switch (of_cft(of)->private) {
        case FILE_CPULIST:
                retval = update_cpumask(cs, trialcs, buf);
                break;
        case FILE_EXCLUSIVE_CPULIST:
                retval = update_exclusive_cpumask(cs, trialcs, buf);
                break;
        case FILE_MEMLIST:
                retval = update_nodemask(cs, trialcs, buf);
                break;
        default:
                retval = -EINVAL;
                break;
        }

        free_cpuset(trialcs);
out_unlock:
        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
        kernfs_unbreak_active_protection(of->kn);
        css_put(&cs->css);
        /* Runs after all locks are dropped, on every exit path. */
        flush_workqueue(cpuset_migrate_mm_wq);
        /* 0 from the update means success: report the bytes consumed. */
        return retval ?: nbytes;
}

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
        struct cpuset *cs = css_cs(seq_css(sf));
        cpuset_filetype_t type = seq_cft(sf)->private;
        int ret = 0;

        spin_lock_irq(&callback_lock);

        switch (type) {
        case FILE_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
                break;
        case FILE_MEMLIST:
                seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
                break;
        case FILE_EFFECTIVE_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
                break;
        case FILE_EFFECTIVE_MEMLIST:
                seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
                break;
        case FILE_EXCLUSIVE_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
                break;
        case FILE_EFFECTIVE_XCPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
                break;
        case FILE_SUBPARTS_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
                break;
        case FILE_ISOLATED_CPULIST:
                seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
                break;
        default:
                ret = -EINVAL;
        }

        spin_unlock_irq(&callback_lock);
        return ret;
}

/*
 * Return the current value behind one of the u64 control files, selected
 * by the file type stored in cft->private.  An unexpected file type is a
 * BUG().
 */
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        u64 val = 0;

        switch (type) {
        case FILE_CPU_EXCLUSIVE:
                val = is_cpu_exclusive(cs);
                break;
        case FILE_MEM_EXCLUSIVE:
                val = is_mem_exclusive(cs);
                break;
        case FILE_MEM_HARDWALL:
                val = is_mem_hardwall(cs);
                break;
        case FILE_SCHED_LOAD_BALANCE:
                val = is_sched_load_balance(cs);
                break;
        case FILE_MEMORY_MIGRATE:
                val = is_memory_migrate(cs);
                break;
        case FILE_MEMORY_PRESSURE_ENABLED:
                val = cpuset_memory_pressure_enabled;
                break;
        case FILE_MEMORY_PRESSURE:
                val = fmeter_getrate(&cs->fmeter);
                break;
        case FILE_SPREAD_PAGE:
                val = is_spread_page(cs);
                break;
        case FILE_SPREAD_SLAB:
                val = is_spread_slab(cs);
                break;
        default:
                BUG();
        }

        return val;
}

/*
 * Return the value behind a signed 64-bit control file.  Only the relax
 * domain level file is supported; any other file type is a BUG().
 */
static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;

        if (type == FILE_SCHED_RELAX_DOMAIN_LEVEL)
                return cs->relax_domain_level;

        BUG();

        /* Not reached; keeps the compiler quiet */
        return 0;
}

/*
 * Print the partition root state of a cpuset: "root", "isolated" or
 * "member" for valid states, or "<type> invalid" for an invalid partition,
 * followed by the reason in parentheses when perr_strings[] has an entry
 * for the recorded error.  Always returns 0.
 */
static int sched_partition_show(struct seq_file *seq, void *v)
{
        struct cpuset *cs = css_cs(seq_css(seq));
        const char *kind, *reason;

        switch (cs->partition_root_state) {
        case PRS_ROOT:
                seq_puts(seq, "root\n");
                return 0;
        case PRS_ISOLATED:
                seq_puts(seq, "isolated\n");
                return 0;
        case PRS_MEMBER:
                seq_puts(seq, "member\n");
                return 0;
        case PRS_INVALID_ROOT:
                kind = "root";
                break;
        case PRS_INVALID_ISOLATED:
                kind = "isolated";
                break;
        default:
                /* unknown state: print nothing */
                return 0;
        }

        reason = perr_strings[READ_ONCE(cs->prs_err)];
        if (reason)
                seq_printf(seq, "%s invalid (%s)\n", kind, reason);
        else
                seq_printf(seq, "%s invalid\n", kind);

        return 0;
}

/*
 * Handle a write to the partition file.  Accepts exactly "root", "member"
 * or "isolated" (surrounding whitespace is stripped); anything else is
 * -EINVAL.  Returns -ENODEV if the cpuset is offline, a non-zero result of
 * update_prstate() if there is one, otherwise @nbytes.
 */
static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
                                     size_t nbytes, loff_t off)
{
        struct cpuset *cs = css_cs(of_css(of));
        int retval = -ENODEV;
        int new_prs;

        buf = strstrip(buf);

        if (strcmp(buf, "root") == 0)
                new_prs = PRS_ROOT;
        else if (strcmp(buf, "member") == 0)
                new_prs = PRS_MEMBER;
        else if (strcmp(buf, "isolated") == 0)
                new_prs = PRS_ISOLATED;
        else
                return -EINVAL;

        css_get(&cs->css);
        cpus_read_lock();
        mutex_lock(&cpuset_mutex);

        if (is_cpuset_online(cs))
                retval = update_prstate(cs, new_prs);

        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
        css_put(&cs->css);

        return retval ?: nbytes;
}

/*
 * for the common functions, 'private' gives the type of file
 *
 * Control file table named for the legacy hierarchy.  List files use
 * cpuset_common_seq_show() and cpuset_write_resmask(); flag files use the
 * u64 handlers; the relax domain level uses the s64 handlers.  Entries
 * without a write handler ("effective_cpus", "effective_mems",
 * "memory_pressure") are read-only.
 */

static struct cftype legacy_files[] = {
        {
                .name = "cpus",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
                .private = FILE_CPULIST,
        },

        {
                .name = "mems",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * MAX_NUMNODES),
                .private = FILE_MEMLIST,
        },

        {
                .name = "effective_cpus",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_CPULIST,
        },

        {
                .name = "effective_mems",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_MEMLIST,
        },

        {
                .name = "cpu_exclusive",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_CPU_EXCLUSIVE,
        },

        {
                .name = "mem_exclusive",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEM_EXCLUSIVE,
        },

        {
                .name = "mem_hardwall",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEM_HARDWALL,
        },

        {
                .name = "sched_load_balance",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SCHED_LOAD_BALANCE,
        },

        {
                .name = "sched_relax_domain_level",
                .read_s64 = cpuset_read_s64,
                .write_s64 = cpuset_write_s64,
                .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
        },

        {
                .name = "memory_migrate",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_MIGRATE,
        },

        {
                .name = "memory_pressure",
                .read_u64 = cpuset_read_u64,
                .private = FILE_MEMORY_PRESSURE,
        },

        {
                .name = "memory_spread_page",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SPREAD_PAGE,
        },

        {
                /* obsolete, may be removed in the future */
                .name = "memory_spread_slab",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SPREAD_SLAB,
        },

        {
                /* flagged CFTYPE_ONLY_ON_ROOT */
                .name = "memory_pressure_enabled",
                .flags = CFTYPE_ONLY_ON_ROOT,
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_PRESSURE_ENABLED,
        },

        { }        /* terminate */
};

/*
 * Control files for the default hierarchy.  This is currently a minimal
 * set; it can be expanded later on by migrating more features and control
 * files from v1.
 *
 * Entries without a .write handler can only be read.
 */
static struct cftype dfl_files[] = {
        {
                .name = "cpus",
                .flags = CFTYPE_NOT_ON_ROOT,
                .private = FILE_CPULIST,
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
        },

        {
                .name = "mems",
                .flags = CFTYPE_NOT_ON_ROOT,
                .private = FILE_MEMLIST,
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * MAX_NUMNODES),
        },

        {
                .name = "cpus.effective",
                .private = FILE_EFFECTIVE_CPULIST,
                .seq_show = cpuset_common_seq_show,
        },

        {
                .name = "mems.effective",
                .private = FILE_EFFECTIVE_MEMLIST,
                .seq_show = cpuset_common_seq_show,
        },

        {
                .name = "cpus.partition",
                .flags = CFTYPE_NOT_ON_ROOT,
                .private = FILE_PARTITION_ROOT,
                .seq_show = sched_partition_show,
                .write = sched_partition_write,
                .file_offset = offsetof(struct cpuset, partition_file),
        },

        {
                .name = "cpus.exclusive",
                .flags = CFTYPE_NOT_ON_ROOT,
                .private = FILE_EXCLUSIVE_CPULIST,
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
        },

        {
                .name = "cpus.exclusive.effective",
                .flags = CFTYPE_NOT_ON_ROOT,
                .private = FILE_EFFECTIVE_XCPULIST,
                .seq_show = cpuset_common_seq_show,
        },

        {
                .name = "cpus.subpartitions",
                .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
                .private = FILE_SUBPARTS_CPULIST,
                .seq_show = cpuset_common_seq_show,
        },

        {
                .name = "cpus.isolated",
                .flags = CFTYPE_ONLY_ON_ROOT,
                .private = FILE_ISOLATED_CPULIST,
                .seq_show = cpuset_common_seq_show,
        },

        { }        /* terminate */
};


/**
 * cpuset_css_alloc - Allocate a cpuset css
 * @parent_css: Parent css of the control group that the new cpuset will be
 *              part of, or NULL
 *
 * For a NULL @parent_css the css of the statically defined top_cpuset is
 * returned and nothing is allocated.  Otherwise a new cpuset is allocated
 * and initialized with scheduler load balancing enabled, empty memory node
 * masks and a relax domain level of -1.  On the default hierarchy
 * CS_MEMORY_MIGRATE is set as well.
 *
 * Return: cpuset css on success, ERR_PTR(-ENOMEM) on allocation failure.
 */
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct cpuset *new_cs;

        if (parent_css == NULL)
                return &top_cpuset.css;

        new_cs = kzalloc(sizeof(*new_cs), GFP_KERNEL);
        if (new_cs == NULL)
                return ERR_PTR(-ENOMEM);

        if (alloc_cpumasks(new_cs, NULL) != 0)
                goto err_free;

        /* Memory node masks start out empty. */
        nodes_clear(new_cs->mems_allowed);
        nodes_clear(new_cs->effective_mems);

        fmeter_init(&new_cs->fmeter);
        INIT_LIST_HEAD(&new_cs->remote_sibling);
        new_cs->relax_domain_level = -1;

        __set_bit(CS_SCHED_LOAD_BALANCE, &new_cs->flags);
        /* Set CS_MEMORY_MIGRATE for default hierarchy */
        if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
                __set_bit(CS_MEMORY_MIGRATE, &new_cs->flags);

        return &new_cs->css;

err_free:
        kfree(new_cs);
        return ERR_PTR(-ENOMEM);
}

static int cpuset_css_online(struct cgroup_subsys_state *css)
{
        struct cpuset *cs = css_cs(css);
        struct cpuset *parent = parent_cs(cs);
        struct cpuset *tmp_cs;
        struct cgroup_subsys_state *pos_css;

        if (!parent)
                return 0;

        cpus_read_lock();
        mutex_lock(&cpuset_mutex);

        set_bit(CS_ONLINE, &cs->flags);
        if (is_spread_page(parent))
                set_bit(CS_SPREAD_PAGE, &cs->flags);
        if (is_spread_slab(parent))
                set_bit(CS_SPREAD_SLAB, &cs->flags);

        cpuset_inc();

        spin_lock_irq(&callback_lock);
        if (is_in_v2_mode()) {
                cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                cs->effective_mems = parent->effective_mems;
                cs->use_parent_ecpus = true;
                parent->child_ecpus_count++;
        }

        /*
         * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
         */
        if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
            !is_sched_load_balance(parent))
                clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);

        spin_unlock_irq(&callback_lock);

        if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
                goto out_unlock;

        /*
         * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
         * set.  This flag handling is implemented in cgroup core for
         * historical reasons - the flag may be specified during mount.
         *
         * Currently, if any sibling cpusets have exclusive cpus or mem, we
         * refuse to clone the configuration - thereby refusing the task to
         * be entered, and as a result refusing the sys_unshare() or
         * clone() which initiated it.  If this becomes a problem for some
         * users who wish to allow that scenario, then this could be
         * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
         * (and likewise for mems) to the new cgroup.
         */
        rcu_read_lock();
        cpuset_for_each_child(tmp_cs, pos_css, parent) {
                if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
                        rcu_read_unlock();
                        goto out_unlock;
                }
        }
        rcu_read_unlock();

        spin_lock_irq(&callback_lock);
        cs->mems_allowed = parent->mems_allowed;
        cs->effective_mems = parent->mems_allowed;
        cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
        cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
        spin_unlock_irq(&callback_lock);
out_unlock:
        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
        return 0;
}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked(). That is not needed
 * in the default hierarchy where only changes in partition
 * will cause repartitioning.
 *
 * If the cpuset has the 'sched.partition' flag enabled, simulate
 * turning 'sched.partition" off.
 */

static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
        struct cpuset *cs = css_cs(css);

        cpus_read_lock();
        mutex_lock(&cpuset_mutex);

        if (is_partition_valid(cs))
                update_prstate(cs, 0);

        if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
            is_sched_load_balance(cs))
                update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

        if (cs->use_parent_ecpus) {
                struct cpuset *parent = parent_cs(cs);

                cs->use_parent_ecpus = false;
                parent->child_ecpus_count--;
        }

        cpuset_dec();
        clear_bit(CS_ONLINE, &cs->flags);

        mutex_unlock(&cpuset_mutex);
        cpus_read_unlock();
}

/* Release the cpuset that embeds @css. */
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
        free_cpuset(css_cs(css));
}

/*
 * Reset the allowed masks of top_cpuset for the mode currently in effect.
 *
 * In v2 mode the allowed cpus, the effective exclusive cpus and the allowed
 * mems cover everything that is possible; otherwise the allowed masks are
 * copied from the effective ones.  @root_css is not used.
 */
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
        mutex_lock(&cpuset_mutex);
        spin_lock_irq(&callback_lock);

        if (!is_in_v2_mode()) {
                cpumask_copy(top_cpuset.cpus_allowed,
                             top_cpuset.effective_cpus);
                top_cpuset.mems_allowed = top_cpuset.effective_mems;
        } else {
                cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
                cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
                top_cpuset.mems_allowed = node_possible_map;
        }

        spin_unlock_irq(&callback_lock);
        mutex_unlock(&cpuset_mutex);
}

/*
 * In case the child is cloned into a cpuset different from its parent,
 * additional checks are done to see if the move is allowed.
 *
 * Returns 0 if the fork may proceed, otherwise the error reported by the
 * first failing check.  On success with a different target cpuset, the
 * target's attach_in_progress count is raised.
 */
static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
{
        struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
        bool unchanged;
        int ret;

        rcu_read_lock();
        unchanged = (cs == task_cs(current));
        rcu_read_unlock();

        /* Forking into the forker's own cpuset needs no extra checks. */
        if (unchanged)
                return 0;

        lockdep_assert_held(&cgroup_mutex);
        mutex_lock(&cpuset_mutex);

        /* Check to see if task is allowed in the cpuset */
        ret = cpuset_can_attach_check(cs);
        if (!ret)
                ret = task_can_attach(task);
        if (!ret)
                ret = security_task_setscheduler(task);

        /*
         * Mark attach is in progress.  This makes validate_change() fail
         * changes which zero cpus/mems_allowed.
         */
        if (!ret)
                cs->attach_in_progress++;

        mutex_unlock(&cpuset_mutex);
        return ret;
}

/*
 * Undo the attach_in_progress accounting done by cpuset_can_fork() when
 * the target cpuset differs from the cpuset of the forking task, waking
 * waiters on cpuset_attach_wq once the count drops to zero.
 */
static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
{
        struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
        bool unchanged;

        rcu_read_lock();
        unchanged = (cs == task_cs(current));
        rcu_read_unlock();

        if (unchanged)
                return;

        mutex_lock(&cpuset_mutex);
        if (--cs->attach_in_progress == 0)
                wake_up(&cpuset_attach_wq);
        mutex_unlock(&cpuset_mutex);
}

/*
 * Make sure the new task conform to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
 * state from the parent and before it sits on the cgroup's task list.
 *
 * A task forked into a different cpuset is attached to that cpuset
 * instead, and the attach_in_progress count is dropped again.
 */
static void cpuset_fork(struct task_struct *task)
{
        struct cpuset *child_cs;
        bool inherited;

        rcu_read_lock();
        child_cs = task_cs(task);
        inherited = (child_cs == task_cs(current));
        rcu_read_unlock();

        if (!inherited) {
                /* CLONE_INTO_CGROUP */
                mutex_lock(&cpuset_mutex);
                guarantee_online_mems(child_cs, &cpuset_attach_nodemask_to);
                cpuset_attach_task(child_cs, task);

                if (--child_cs->attach_in_progress == 0)
                        wake_up(&cpuset_attach_wq);

                mutex_unlock(&cpuset_mutex);
                return;
        }

        /* Nothing is copied for tasks staying in the top cpuset. */
        if (child_cs == &top_cpuset)
                return;

        set_cpus_allowed_ptr(task, current->cpus_ptr);
        task->mems_allowed = current->mems_allowed;
}

/* Callbacks and control file tables of the cpuset cgroup subsystem. */
struct cgroup_subsys cpuset_cgrp_subsys = {
        /* css lifetime */
        .css_alloc      = cpuset_css_alloc,
        .css_online     = cpuset_css_online,
        .css_offline    = cpuset_css_offline,
        .css_free       = cpuset_css_free,
        .bind           = cpuset_bind,

        /* task migration */
        .can_attach     = cpuset_can_attach,
        .cancel_attach  = cpuset_cancel_attach,
        .attach         = cpuset_attach,
        .post_attach    = cpuset_post_attach,

        /* fork */
        .can_fork       = cpuset_can_fork,
        .cancel_fork    = cpuset_cancel_fork,
        .fork           = cpuset_fork,

        /* control files */
        .legacy_cftypes = legacy_files,
        .dfl_cftypes    = dfl_files,

        .early_init     = true,
        .threaded       = true,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset.  Its cpu and memory node masks start
 * out with every bit set, while subpartitions_cpus and isolated_cpus start
 * out empty.  Any allocation failure is fatal (BUG_ON).
 *
 * Return: always 0.
 */
int __init cpuset_init(void)
{
        BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
        cpumask_setall(top_cpuset.cpus_allowed);

        BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
        cpumask_setall(top_cpuset.effective_cpus);

        BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
        cpumask_setall(top_cpuset.effective_xcpus);

        BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
        cpumask_setall(top_cpuset.exclusive_cpus);

        /* Zero-allocated: no CPUs are in sub-partitions or isolated yet. */
        BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
        BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));

        nodes_setall(top_cpuset.mems_allowed);
        nodes_setall(top_cpuset.effective_mems);

        fmeter_init(&top_cpuset.fmeter);
        INIT_LIST_HEAD(&remote_children);

        BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));

        return 0;
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 *
 * A failed transfer is only reported through the kernel log.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
        struct cpuset *dest;

        /*
         * Find its next-highest non-empty parent, (top cpuset
         * has online cpus, so can't be empty).
         */
        for (dest = parent_cs(cs);
             cpumask_empty(dest->cpus_allowed) ||
             nodes_empty(dest->mems_allowed);
             dest = parent_cs(dest))
                ;

        if (!cgroup_transfer_tasks(dest->css.cgroup, cs->css.cgroup))
                return;

        pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
        pr_cont_cgroup_name(cs->css.cgroup);
        pr_cont("\n");
}

/*
 * Work item handler queued by hotplug_update_tasks_legacy(): moves the
 * tasks out of the emptied cpuset, then drops the css reference taken when
 * the work was queued and frees the request.
 */
static void cpuset_migrate_tasks_workfn(struct work_struct *work)
{
        struct cpuset_remove_tasks_struct *req =
                container_of(work, struct cpuset_remove_tasks_struct, work);

        remove_tasks_in_empty_cpuset(req->cs);
        css_put(&req->cs->css);
        kfree(req);
}

/*
 * Apply hotplug-adjusted masks to @cs outside of v2 mode.
 *
 * Both the allowed and the effective masks of @cs are set to @new_cpus and
 * @new_mems.  Tasks are updated for whichever resource changed, unless the
 * cpuset lost that resource entirely.  A populated cpuset left without
 * CPUs or without memory nodes has its tasks moved to an ancestor from a
 * work item.
 */
static void
hotplug_update_tasks_legacy(struct cpuset *cs,
                            struct cpumask *new_cpus, nodemask_t *new_mems,
                            bool cpus_updated, bool mems_updated)
{
        struct cpuset_remove_tasks_struct *req;

        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->cpus_allowed, new_cpus);
        cpumask_copy(cs->effective_cpus, new_cpus);
        cs->mems_allowed = *new_mems;
        cs->effective_mems = *new_mems;
        spin_unlock_irq(&callback_lock);

        /*
         * Don't call update_tasks_cpumask() if the cpuset becomes empty,
         * as the tasks will be migrated to an ancestor.
         */
        if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
                update_tasks_cpumask(cs, new_cpus);
        if (mems_updated && !nodes_empty(cs->mems_allowed))
                update_tasks_nodemask(cs);

        /* Done unless the cpuset ran out of CPUs or memory nodes. */
        if (!cpumask_empty(cs->cpus_allowed) &&
            !nodes_empty(cs->mems_allowed))
                return;

        /* No tasks to move, or the css is already on its way out. */
        if (!cs->css.cgroup->nr_populated_csets)
                return;
        if (!css_tryget_online(&cs->css))
                return;

        /*
         * Move tasks to the nearest ancestor with execution resources,
         * This is full cgroup operation which will also call back into
         * cpuset. Execute it asynchronously using workqueue.
         */
        req = kzalloc(sizeof(*req), GFP_KERNEL);
        if (WARN_ON_ONCE(!req)) {
                css_put(&cs->css);
                return;
        }

        req->cs = cs;
        INIT_WORK(&req->work, cpuset_migrate_tasks_workfn);
        schedule_work(&req->work);
}

/*
 * Apply hotplug-adjusted effective masks to @cs in v2 mode.
 *
 * An empty @new_cpus (unless @cs is a valid partition root) or an empty
 * @new_mems is replaced in place by the parent's effective mask before it
 * is stored, so the caller's buffers may be modified.  Tasks are then
 * updated for whichever resource the caller flagged as changed.
 */
static void
hotplug_update_tasks(struct cpuset *cs,
                     struct cpumask *new_cpus, nodemask_t *new_mems,
                     bool cpus_updated, bool mems_updated)
{
        /* A partition root is allowed to have empty effective cpus */
        if (!is_partition_valid(cs) && cpumask_empty(new_cpus))
                cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);

        if (nodes_empty(*new_mems))
                *new_mems = parent_cs(cs)->effective_mems;

        spin_lock_irq(&callback_lock);
        cs->effective_mems = *new_mems;
        cpumask_copy(cs->effective_cpus, new_cpus);
        spin_unlock_irq(&callback_lock);

        if (cpus_updated)
                update_tasks_cpumask(cs, new_cpus);

        if (mems_updated)
                update_tasks_nodemask(cs);
}

/*
 * Set to request a sched domain rebuild from the next hotplug pass even if
 * the cpu masks did not change; consumed and cleared by
 * cpuset_handle_hotplug().
 */
static bool force_rebuild;

/**
 * cpuset_force_rebuild - request a sched domain rebuild
 *
 * Only raises the force_rebuild flag; the rebuild itself is carried out
 * by cpuset_handle_hotplug().
 */
void cpuset_force_rebuild(void)
{
        force_rebuild = true;
}

/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 * @tmp: the tmpmasks structure pointer; when NULL, the partition root
 *       handling below is skipped
 *
 * Recompute @cs's effective cpu and mem masks from its own configuration
 * and its parent's effective masks.  If they differ from the current
 * effective masks, update @cs and its tasks accordingly.  Outside of v2
 * mode, if @cs ends up with no CPU or memory, all its tasks are moved to
 * the nearest ancestor with both resources.
 *
 * Takes cpuset_mutex and may sleep waiting for in-flight attaches to
 * finish.
 */
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
        /* Static scratch masks; only accessed while cpuset_mutex is held. */
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated;
        bool mems_updated;
        bool remote;
        int partcmd = -1;        /* < 0 means no partition state change */
        struct cpuset *parent;
retry:
        /* Sleep until no task attach into @cs is in progress. */
        wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

        mutex_lock(&cpuset_mutex);

        /*
         * We have raced with task attaching. We wait until attaching
         * is finished, so we won't attach a task to an empty cpuset.
         */
        if (cs->attach_in_progress) {
                mutex_unlock(&cpuset_mutex);
                goto retry;
        }

        parent = parent_cs(cs);
        compute_effective_cpumask(&new_cpus, cs, parent);
        nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);

        /* Plain cpusets (or no temporary masks) need no partition handling. */
        if (!tmp || !cs->partition_root_state)
                goto update_tasks;

        /*
         * Compute effective_cpus for valid partition root, may invalidate
         * child partition roots if necessary.
         */
        remote = is_remote_partition(cs);
        if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
                compute_partition_effective_cpumask(cs, &new_cpus);

        /*
         * A populated remote partition left without any CPU is disabled,
         * and its effective cpus are recomputed as for a regular cpuset.
         */
        if (remote && cpumask_empty(&new_cpus) &&
            partition_is_populated(cs, NULL)) {
                remote_partition_disable(cs, tmp);
                compute_effective_cpumask(&new_cpus, cs, parent);
                remote = false;
                cpuset_force_rebuild();
        }

        /*
         * Force the partition to become invalid if either one of
         * the following conditions hold:
         * 1) empty effective cpus but not valid empty partition.
         * 2) parent is invalid or doesn't grant any cpus to child
         *    partitions.
         */
        if (is_local_partition(cs) && (!is_partition_valid(parent) ||
                                tasks_nocpu_error(parent, cs, &new_cpus)))
                partcmd = partcmd_invalidate;
        /*
         * On the other hand, an invalid partition root may be transitioned
         * back to a regular one.
         */
        else if (is_partition_valid(parent) && is_partition_invalid(cs))
                partcmd = partcmd_update;

        if (partcmd >= 0) {
                update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
                if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
                        compute_partition_effective_cpumask(cs, &new_cpus);
                        cpuset_force_rebuild();
                }
        }

update_tasks:
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
        mems_updated = !nodes_equal(new_mems, cs->effective_mems);
        if (!cpus_updated && !mems_updated)
                goto unlock;        /* Hotplug doesn't affect this cpuset */

        if (mems_updated)
                check_insane_mems_config(&new_mems);

        if (is_in_v2_mode())
                hotplug_update_tasks(cs, &new_cpus, &new_mems,
                                     cpus_updated, mems_updated);
        else
                hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
                                            cpus_updated, mems_updated);

unlock:
        mutex_unlock(&cpuset_mutex);
}

/**
 * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no affect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 *
 * CPU / memory hotplug is handled synchronously.  The caller must hold the
 * CPU hotplug lock (see lockdep_assert_cpus_held() below).
 */
static void cpuset_handle_hotplug(void)
{
        /* Static scratch masks; written only while cpuset_mutex is held. */
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
        bool on_dfl = is_in_v2_mode();
        struct tmpmasks tmp, *ptmp = NULL;

        /*
         * Temporary masks are only set up in v2 mode.  If the allocation
         * fails, ptmp stays NULL and cpuset_hotplug_update_tasks() skips
         * its partition root handling.
         */
        if (on_dfl && !alloc_cpumasks(NULL, &tmp))
                ptmp = &tmp;

        lockdep_assert_cpus_held();
        mutex_lock(&cpuset_mutex);

        /* fetch the available cpus/mems and find out which changed how */
        cpumask_copy(&new_cpus, cpu_active_mask);
        new_mems = node_states[N_MEMORY];

        /*
         * If subpartitions_cpus is populated, it is likely that the check
         * below will produce a false positive on cpus_updated when the cpu
         * list isn't changed. It is extra work, but it is better to be safe.
         */
        cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
                       !cpumask_empty(subpartitions_cpus);
        mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

        /*
         * In the rare case that hotplug removes all the cpus in
         * subpartitions_cpus, we assumed that cpus are updated.
         */
        if (!cpus_updated && top_cpuset.nr_subparts)
                cpus_updated = true;

        /* For v1, synchronize cpus_allowed to cpu_active_mask */
        if (cpus_updated) {
                spin_lock_irq(&callback_lock);
                if (!on_dfl)
                        cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
                /*
                 * Make sure that CPUs allocated to child partitions
                 * do not show up in effective_cpus. If no CPU is left,
                 * we clear the subpartitions_cpus & let the child partitions
                 * fight for the CPUs again.
                 */
                if (!cpumask_empty(subpartitions_cpus)) {
                        if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
                                top_cpuset.nr_subparts = 0;
                                cpumask_clear(subpartitions_cpus);
                        } else {
                                cpumask_andnot(&new_cpus, &new_cpus,
                                               subpartitions_cpus);
                        }
                }
                cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
                spin_unlock_irq(&callback_lock);
                /* we don't mess with cpumasks of tasks in top_cpuset */
        }

        /* synchronize mems_allowed to N_MEMORY */
        if (mems_updated) {
                spin_lock_irq(&callback_lock);
                if (!on_dfl)
                        top_cpuset.mems_allowed = new_mems;
                top_cpuset.effective_mems = new_mems;
                spin_unlock_irq(&callback_lock);
                update_tasks_nodemask(&top_cpuset);
        }

        mutex_unlock(&cpuset_mutex);

        /* if cpus or mems changed, we need to propagate to descendants */
        if (cpus_updated || mems_updated) {
                struct cpuset *cs;
                struct cgroup_subsys_state *pos_css;

                /*
                 * Each descendant is pinned with css_tryget_online() so the
                 * RCU read lock can be dropped around
                 * cpuset_hotplug_update_tasks(), which takes cpuset_mutex
                 * and may sleep.
                 */
                rcu_read_lock();
                cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
                        if (cs == &top_cpuset || !css_tryget_online(&cs->css))
                                continue;
                        rcu_read_unlock();

                        cpuset_hotplug_update_tasks(cs, ptmp);

                        rcu_read_lock();
                        css_put(&cs->css);
                }
                rcu_read_unlock();
        }

        /* rebuild sched domains if cpus_allowed has changed */
        if (cpus_updated || force_rebuild) {
                force_rebuild = false;
                rebuild_sched_domains_cpuslocked();
        }

        /* NOTE(review): ptmp may be NULL here; assumes free_cpumasks() accepts that - confirm */
        free_cpumasks(NULL, ptmp);
}

/*
 * Entry point for propagating a CPU configuration change into cpusets.
 *
 * Hotplug processing is no longer bounced to a work item: it runs
 * synchronously in the caller's context via cpuset_handle_hotplug(),
 * which asserts that the CPU hotplug lock is held.
 */
void cpuset_update_active_cpus(void)
{
        cpuset_handle_hotplug();
}

/*
 * Memory hotplug notifier callback, registered in cpuset_init_smp().
 *
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 *
 * @self, @action and @arg are not looked at; every notification triggers
 * a full cpuset_handle_hotplug() pass.  Always returns NOTIFY_OK.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
                                unsigned long action, void *arg)
{
        cpuset_handle_hotplug();

        return NOTIFY_OK;
}

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized.
 * The effective masks of top_cpuset are set from cpu_active_mask and
 * node_states[N_MEMORY], the memory hotplug notifier is registered and the
 * ordered workqueue "cpuset_migrate_mm" is created (failure is fatal).
 */
void __init cpuset_init_smp(void)
{
        /*
         * cpus_allowed/mems_allowed set to v2 values in the initial
         * cpuset_bind() call will be reset to v1 values in another
         * cpuset_bind() call when v1 cpuset is mounted.
         */
        top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

        /* Effective resources are what is currently active / has memory. */
        cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
        top_cpuset.effective_mems = node_states[N_MEMORY];

        hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);

        cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
        BUG_ON(!cpuset_migrate_mm_wq);
}

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset, except when the task is in the top cpuset.
 **/

void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
        unsigned long flags;
        struct cpuset *cs;

        spin_lock_irqsave(&callback_lock, flags);
        rcu_read_lock();

        cs = task_cs(tsk);
        if (cs != &top_cpuset)
                guarantee_online_cpus(tsk, pmask);
        /*
         * Tasks in the top cpuset won't get update to their cpumasks
         * when a hotplug online/offline event happens. So we include all
         * offline cpus in the allowed cpu list.
         */
        if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
                const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);

                /*
                 * We first exclude cpus allocated to partitions. If there is no
                 * allowable online cpu left, we fall back to all possible cpus.
                 */
                cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
                if (!cpumask_intersects(pmask, cpu_online_mask))
                        cpumask_copy(pmask, possible_mask);
        }

        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, flags);
}

/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed cpu in
 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
 * which will not contain a sane cpumask during cases such as cpu hotplugging.
 * This is the absolute last resort for the scheduler and it is only used if
 * _every_ other avenue has been traveled.
 *
 * The affinity is only changed in v2 mode, and only when the cpuset's
 * cpus_allowed is a subset of the CPUs possible for @tsk.
 *
 * Returns true if the affinity of @tsk was changed, false otherwise.
 */
bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
        const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
        const struct cpumask *cs_mask;
        bool changed;

        rcu_read_lock();
        cs_mask = task_cs(tsk)->cpus_allowed;
        changed = is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask);
        if (changed)
                do_set_cpus_allowed(tsk, cs_mask);
        rcu_read_unlock();

        /*
         * We own tsk->cpus_allowed, nobody can change it under us.
         *
         * But we used cs && cs->cpus_allowed lockless and thus can
         * race with cgroup_attach_task() or update_cpumask() and get
         * the wrong tsk->cpus_allowed. However, both cases imply the
         * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
         * which takes task_rq_lock().
         *
         * If we are called after it dropped the lock we must see all
         * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
         * set any mask even if it is not right from task_cs() pov,
         * the pending set_cpus_allowed_ptr() will fix things.
         *
         * select_fallback_rq() will fix things up and set cpu_possible_mask
         * if required.
         */
        return changed;
}

/* Boot-time helper: let the current task use every memory node. */
void __init cpuset_init_current_mems_allowed(void)
{
        nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 *
 * The mask is returned by value; it is computed with callback_lock held
 * and interrupts disabled.
 */
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
        unsigned long irq_flags;
        nodemask_t allowed;

        spin_lock_irqsave(&callback_lock, irq_flags);
        rcu_read_lock();
        guarantee_online_mems(task_cs(tsk), &allowed);
        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, irq_flags);

        return allowed;
}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 *
 * Return: the result of nodes_intersects() on the two masks, i.e. nonzero
 * if they share at least one node.
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
        return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
        for (;;) {
                struct cpuset *parent;

                /* Stop at the first cpuset that is mem_exclusive or mem_hardwall. */
                if (is_mem_exclusive(cs) || is_mem_hardwall(cs))
                        break;

                /* No parent left to climb to: return the cpuset we reached. */
                parent = parent_cs(cs);
                if (!parent)
                        break;

                cs = parent;
        }

        return cs;
}

/*
 * cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If @node is set in
 * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
 * yes.  If current has access to memory reserves as an oom victim, yes.
 * Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags.  That logic and the checks below have the combined
 * effect that:
 *        in_interrupt - any node ok (current task context irrelevant)
 *        GFP_ATOMIC   - any node ok
 *        tsk_is_oom_victim   - any node ok
 *        GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *        GFP_USER     - only nodes in current tasks mems allowed ok.
 */
bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
        struct cpuset *ancestor;        /* nearest hardwalled ancestor of current's cpuset */
        unsigned long irq_flags;
        bool permitted;

        /* Interrupt context, or a node already in current's mask: always ok. */
        if (in_interrupt() || node_isset(node, current->mems_allowed))
                return true;

        /*
         * Allow tasks that have access to memory reserves because they have
         * been OOM killed to get memory anywhere.
         */
        if (unlikely(tsk_is_oom_victim(current)))
                return true;

        /* If hardwall request, stop here */
        if (gfp_mask & __GFP_HARDWALL)
                return false;

        /* Let dying task have memory */
        if (current->flags & PF_EXITING)
                return true;

        /* Not hardwall and node outside mems_allowed: scan up cpusets */
        spin_lock_irqsave(&callback_lock, irq_flags);
        rcu_read_lock();

        ancestor = nearest_hardwall_ancestor(task_cs(current));
        permitted = node_isset(node, ancestor->mems_allowed);

        rcu_read_unlock();
        spin_unlock_irqrestore(&callback_lock, irq_flags);

        return permitted;
}

/**
 * cpuset_spread_node() - On which node to begin search for a page
 * @rotor: round robin rotor
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online.  So it
 * should not be possible for the following code to return an
 * offline node.  But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start.  The zonelist passed to
 * __alloc_pages() will include all nodes.  If the slab allocator
 * is passed an offline node, it will fall back to the local node.
 * See kmem_cache_alloc_node().
 */
static int cpuset_spread_node(int *rotor)
{
        /* Advance from the stored position to the next node in mems_allowed. */
        int next = next_node_in(*rotor, current->mems_allowed);

        /* Store it back so the following call continues from here. */
        *rotor = next;
        return next;
}

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 */
int cpuset_mem_spread_node(void)
{
        int *rotor = &current->cpuset_mem_spread_rotor;

        /* Rotor not seeded yet: start from a node chosen by node_random(). */
        if (*rotor == NUMA_NO_NODE)
                *rotor = node_random(&current->mems_allowed);

        return cpuset_spread_node(rotor);
}

/**
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 */
int cpuset_slab_spread_node(void)
{
        int *rotor = &current->cpuset_slab_spread_rotor;

        /* Rotor not seeded yet: start from a node chosen by node_random(). */
        if (*rotor == NUMA_NO_NODE)
                *rotor = node_random(&current->mems_allowed);

        return cpuset_spread_node(rotor);
}
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/

int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                   const struct task_struct *tsk2)
{
        /* Compares the two tasks' own mems_allowed fields directly. */
        int shared = nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);

        return shared;
}

/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
void cpuset_print_current_mems_allowed(void)
{
        struct cgroup *cpuset_cgrp;

        /* task_cs(current) and its cgroup are only used inside the RCU section. */
        rcu_read_lock();

        cpuset_cgrp = task_cs(current)->css.cgroup;

        /* Continuation fragments: ",cpuset=<name>,mems_allowed=<node list>" */
        pr_cont(",cpuset=");
        pr_cont_cgroup_name(cpuset_cgrp);
        pr_cont(",mems_allowed=%*pbl",
                nodemask_pr_args(&current->mems_allowed));

        rcu_read_unlock();
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernels page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */

void __cpuset_memory_pressure_bump(void)
{
        rcu_read_lock();
        fmeter_markevent(&task_cs(current)->fmeter);
        rcu_read_unlock();
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk)
{
        struct cgroup_subsys_state *css;
        char *path;
        int err;

        path = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!path)
                return -ENOMEM;

        /* Hold the css reference only while the path is being built. */
        css = task_get_css(tsk, cpuset_cgrp_id);
        err = cgroup_path_ns(css->cgroup, path, PATH_MAX,
                             current->nsproxy->cgroup_ns);
        css_put(css);

        /* Translate -E2BIG from the path helper into -ENAMETOOLONG. */
        if (err == -E2BIG)
                err = -ENAMETOOLONG;

        if (err >= 0) {
                seq_puts(m, path);
                seq_putc(m, '\n');
                err = 0;
        }

        kfree(path);
        return err;
}
#endif /* CONFIG_PROC_PID_CPUSET */

/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
        seq_printf(m, "Mems_allowed:\t%*pb\n",
                   nodemask_pr_args(&task->mems_allowed));
        seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
                   nodemask_pr_args(&task->mems_allowed));
}































    1 
























    1 








    1 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
// SPDX-License-Identifier: LGPL-2.0+
/* Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Paul Eggert (eggert@twinsun.com). */

/*
 * dgb 10/02/98: ripped this from glibc source to help convert timestamps
 *               to unix time
 *     10/04/98: added new table-based lookup after seeing how ugly
 *               the gnu code is
 * blf 09/27/99: ripped out all the old code and inserted new table from
 *                 John Brockmeyer (without leap second corrections)
 *                 rewrote udf_stamp_to_time and fixed timezone accounting in
 *                 udf_time_to_stamp.
 */

/*
 * We don't take into account leap seconds. This may be correct or incorrect.
 * For more NIST information (especially dealing with leap seconds), see:
 * http://www.boulder.nist.gov/timefreq/pubs/bulletin/leapsecond.htm
 */

#include "udfdecl.h"

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/time.h>

void
udf_disk_stamp_to_time(struct timespec64 *dest, struct timestamp src)
{
        u16 type_and_tz = le16_to_cpu(src.typeAndTimezone);
        u16 year = le16_to_cpu(src.year);
        uint8_t type = type_and_tz >> 12;
        int16_t offset = 0;

        /* Only type 1 stamps contribute an offset, held in the low 12 bits. */
        if (type == 1) {
                /* Shift the field to the top and back down to sign extend it. */
                offset = type_and_tz << 4;
                offset = (offset >> 4);
                if (offset == -2047) /* unspecified offset */
                        offset = 0;
        }

        /* The offset is applied in minutes, hence the factor of 60. */
        dest->tv_sec = mktime64(year, src.month, src.day, src.hour, src.minute,
                                src.second) - offset * 60;

        /*
         * Sanitize nanosecond field since reportedly some filesystems are
         * recorded with bogus sub-second values.
         */
        if (src.centiseconds >= 100 || src.hundredsOfMicroseconds >= 100 ||
            src.microseconds >= 100) {
                dest->tv_nsec = 0;
                return;
        }

        dest->tv_nsec = 1000 * (src.centiseconds * 10000 +
                src.hundredsOfMicroseconds * 100 + src.microseconds);
}

void
udf_time_to_disk_stamp(struct timestamp *dest, struct timespec64 ts)
{
        time64_t seconds;
        int16_t offset;
        struct tm tm;

        offset = -sys_tz.tz_minuteswest;

        dest->typeAndTimezone = cpu_to_le16(0x1000 | (offset & 0x0FFF));

        seconds = ts.tv_sec + offset * 60;
        time64_to_tm(seconds, 0, &tm);
        dest->year = cpu_to_le16(tm.tm_year + 1900);
        dest->month = tm.tm_mon + 1;
        dest->day = tm.tm_mday;
        dest->hour = tm.tm_hour;
        dest->minute = tm.tm_min;
        dest->second = tm.tm_sec;
        dest->centiseconds = ts.tv_nsec / 10000000;
        dest->hundredsOfMicroseconds = (ts.tv_nsec / 1000 -
                                        dest->centiseconds * 10000) / 100;
        dest->microseconds = (ts.tv_nsec / 1000 - dest->centiseconds * 10000 -
                              dest->hundredsOfMicroseconds * 100);
}

/* EOF */






































































































































    7 











































   12 















































    1 






















































































































































































































    7 







































































































































































































































































































   10 















   10 




















































































































































































































    2 








































































































































































































































































































    2 

















    2 































































































































































































































































































































   11 



   11 

























    1 





    1 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H

#include <linux/mem_encrypt.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>

/*
 * Macro to mark a page protection value as UC-
 */
#define pgprot_noncached(prot)                                                \
        ((boot_cpu_data.x86 > 3)                                        \
         ? (__pgprot(pgprot_val(prot) |                                        \
                     cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)))        \
         : (prot))

#ifndef __ASSEMBLY__
#include <linux/spinlock.h>
#include <asm/x86_init.h>
#include <asm/pkru.h>
#include <asm/fpu/api.h>
#include <asm/coco.h>
#include <asm-generic/pgtable_uffd.h>
#include <linux/page_table_check.h>

extern pgd_t early_top_pgt[PTRS_PER_PGD];
bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);

struct seq_file;
void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
                                   bool user);
bool ptdump_walk_pgd_level_checkwx(void);
#define ptdump_check_wx ptdump_walk_pgd_level_checkwx
void ptdump_walk_user_pgd_level_checkwx(void);

/*
 * Macros to add or remove encryption attribute
 */
#define pgprot_encrypted(prot)        __pgprot(cc_mkenc(pgprot_val(prot)))
#define pgprot_decrypted(prot)        __pgprot(cc_mkdec(pgprot_val(prot)))

#ifdef CONFIG_DEBUG_WX
#define debug_checkwx_user()        ptdump_walk_user_pgd_level_checkwx()
#else
#define debug_checkwx_user()        do { } while (0)
#endif

/*
 * ZERO_PAGE is a global shared page that is always zero: used
 * for zero-mapped memory areas etc..
 */
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
        __visible;
#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page))

extern spinlock_t pgd_lock;
extern struct list_head pgd_list;

extern struct mm_struct *pgd_page_get_mm(struct page *page);

extern pmdval_t early_pmd_flags;

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else  /* !CONFIG_PARAVIRT_XXL */
#define set_pte(ptep, pte)                native_set_pte(ptep, pte)

#define set_pte_atomic(ptep, pte)                                        \
        native_set_pte_atomic(ptep, pte)

#define set_pmd(pmdp, pmd)                native_set_pmd(pmdp, pmd)

#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd)                native_set_pgd(pgdp, pgd)
#define pgd_clear(pgd)                        (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0)
#endif

#ifndef set_p4d
# define set_p4d(p4dp, p4d)                native_set_p4d(p4dp, p4d)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define p4d_clear(p4d)                        native_p4d_clear(p4d)
#endif

#ifndef set_pud
# define set_pud(pudp, pud)                native_set_pud(pudp, pud)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define pud_clear(pud)                        native_pud_clear(pud)
#endif

#define pte_clear(mm, addr, ptep)        native_pte_clear(mm, addr, ptep)
#define pmd_clear(pmd)                        native_pmd_clear(pmd)

#define pgd_val(x)        native_pgd_val(x)
#define __pgd(x)        native_make_pgd(x)

#ifndef __PAGETABLE_P4D_FOLDED
#define p4d_val(x)        native_p4d_val(x)
#define __p4d(x)        native_make_p4d(x)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
#define pud_val(x)        native_pud_val(x)
#define __pud(x)        native_make_pud(x)
#endif

#ifndef __PAGETABLE_PMD_FOLDED
#define pmd_val(x)        native_pmd_val(x)
#define __pmd(x)        native_make_pmd(x)
#endif

#define pte_val(x)        native_pte_val(x)
#define __pte(x)        native_make_pte(x)

#define arch_end_context_switch(prev)        do {} while(0)
#endif        /* CONFIG_PARAVIRT_XXL */

/*
 * The following only work if pte_present() is true.
 * Undefined behaviour if not..
 */
static inline bool pte_dirty(pte_t pte)
{
        pteval_t flags = pte_flags(pte);

        /* True when any bit covered by _PAGE_DIRTY_BITS is set. */
        return flags & _PAGE_DIRTY_BITS;
}

static inline bool pte_shstk(pte_t pte)
{
        /* Without the shadow stack feature no PTE counts as shadow stack. */
        if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
                return false;

        /* Shadow stack encoding: _PAGE_RW clear, _PAGE_DIRTY set. */
        return (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
}

static inline int pte_young(pte_t pte)
{
        pteval_t flags = pte_flags(pte);

        /* Non-zero when _PAGE_ACCESSED is set. */
        return flags & _PAGE_ACCESSED;
}

#define pmd_dirty pmd_dirty
static inline bool pmd_dirty(pmd_t pmd)
{
        pmdval_t flags = pmd_flags(pmd);

        /* True when any bit covered by _PAGE_DIRTY_BITS is set. */
        return flags & _PAGE_DIRTY_BITS;
}

static inline bool pmd_shstk(pmd_t pmd)
{
        /* Without the shadow stack feature no PMD counts as shadow stack. */
        if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
                return false;

        /* Requires _PAGE_PSE and _PAGE_DIRTY set with _PAGE_RW clear. */
        return (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) ==
               (_PAGE_DIRTY | _PAGE_PSE);
}

#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
        pmdval_t flags = pmd_flags(pmd);

        /* Non-zero when _PAGE_ACCESSED is set. */
        return flags & _PAGE_ACCESSED;
}

static inline bool pud_dirty(pud_t pud)
{
        /* True when any bit covered by _PAGE_DIRTY_BITS is set. */
        bool dirty = pud_flags(pud) & _PAGE_DIRTY_BITS;

        return dirty;
}

static inline int pud_young(pud_t pud)
{
        /* Non-zero when _PAGE_ACCESSED is set. */
        int young = pud_flags(pud) & _PAGE_ACCESSED;

        return young;
}

static inline int pte_write(pte_t pte)
{
        /* The ordinary case: _PAGE_RW is set. */
        if (pte_flags(pte) & _PAGE_RW)
                return 1;

        /*
         * Shadow stack pages are logically writable, but do not have
         * _PAGE_RW, so they are checked for separately.
         */
        return pte_shstk(pte);
}

#define pmd_write pmd_write
static inline int pmd_write(pmd_t pmd)
{
        /* The ordinary case: _PAGE_RW is set. */
        if (pmd_flags(pmd) & _PAGE_RW)
                return 1;

        /*
         * Shadow stack pages are logically writable, but do not have
         * _PAGE_RW, so they are checked for separately.
         */
        return pmd_shstk(pmd);
}

#define pud_write pud_write
static inline int pud_write(pud_t pud)
{
        /* Non-zero when _PAGE_RW is set; no shadow stack check at this level. */
        int writable = pud_flags(pud) & _PAGE_RW;

        return writable;
}

static inline int pte_huge(pte_t pte)
{
        pteval_t flags = pte_flags(pte);

        /* Non-zero when _PAGE_PSE is set. */
        return flags & _PAGE_PSE;
}

static inline int pte_global(pte_t pte)
{
        pteval_t flags = pte_flags(pte);

        /* Non-zero when _PAGE_GLOBAL is set. */
        return flags & _PAGE_GLOBAL;
}

static inline int pte_exec(pte_t pte)
{
        /* Executable means the _PAGE_NX bit is not set. */
        return (pte_flags(pte) & _PAGE_NX) == 0;
}

static inline int pte_special(pte_t pte)
{
        pteval_t flags = pte_flags(pte);

        /* Non-zero when _PAGE_SPECIAL is set. */
        return flags & _PAGE_SPECIAL;
}

/* Entries that were set to PROT_NONE are inverted */

static inline u64 protnone_mask(u64 val);

#define PFN_PTE_SHIFT        PAGE_SHIFT

static inline unsigned long pte_pfn(pte_t pte)
{
        phys_addr_t raw = pte_val(pte);
        /* Apply protnone_mask() to undo the inversion of PROT_NONE entries. */
        phys_addr_t val = raw ^ protnone_mask(raw);

        return (val & PTE_PFN_MASK) >> PAGE_SHIFT;
}

static inline unsigned long pmd_pfn(pmd_t pmd)
{
        phys_addr_t raw = pmd_val(pmd);
        /* Apply protnone_mask() to undo the inversion of PROT_NONE entries. */
        phys_addr_t val = raw ^ protnone_mask(raw);

        return (val & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
}

#define pud_pfn pud_pfn
static inline unsigned long pud_pfn(pud_t pud)
{
        phys_addr_t raw = pud_val(pud);
        /* Apply protnone_mask() to undo the inversion of PROT_NONE entries. */
        phys_addr_t val = raw ^ protnone_mask(raw);

        return (val & pud_pfn_mask(pud)) >> PAGE_SHIFT;
}

/*
 * Extract the page frame number from a p4d entry: mask the raw value with
 * p4d_pfn_mask() and shift out the low PAGE_SHIFT bits.  Unlike the
 * pte/pmd/pud variants, no protnone_mask() correction is applied here.
 */
static inline unsigned long p4d_pfn(p4d_t p4d)
{
        return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
}

/*
 * Extract the page frame number from a pgd entry: mask the raw value with
 * PTE_PFN_MASK and shift out the low PAGE_SHIFT bits.
 */
static inline unsigned long pgd_pfn(pgd_t pgd)
{
        return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
}

#define p4d_leaf p4d_leaf
static inline bool p4d_leaf(p4d_t p4d)
{
        /* No 512 GiB pages yet, so a p4d entry is never reported as a leaf. */
        return false;
}

#define pte_page(pte)        pfn_to_page(pte_pfn(pte))

#define pmd_leaf pmd_leaf
static inline bool pmd_leaf(pmd_t pte)
{
        pmdval_t flags = pmd_flags(pte);

        /* A PMD is treated as a leaf when _PAGE_PSE is set. */
        return flags & _PAGE_PSE;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* NOTE: when testing for a huge page, also consider pmd_devmap, or use pmd_leaf */
static inline int pmd_trans_huge(pmd_t pmd)
{
        /* Of the two bits, _PAGE_PSE must be set and _PAGE_DEVMAP clear. */
        pmdval_t bits = pmd_val(pmd) & (_PAGE_PSE | _PAGE_DEVMAP);

        return bits == _PAGE_PSE;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static inline int pud_trans_huge(pud_t pud)
{
        /* Of the two bits, _PAGE_PSE must be set and _PAGE_DEVMAP clear. */
        int huge = (pud_val(pud) & (_PAGE_PSE | _PAGE_DEVMAP)) == _PAGE_PSE;

        return huge;
}
#endif

#define has_transparent_hugepage has_transparent_hugepage
static inline int has_transparent_hugepage(void)
{
        /* Reports whether the boot CPU has X86_FEATURE_PSE. */
        int pse = boot_cpu_has(X86_FEATURE_PSE);

        return pse;
}

#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
static inline int pmd_devmap(pmd_t pmd)
{
        /* Normalised to 0 or 1 depending on _PAGE_DEVMAP. */
        return (pmd_val(pmd) & _PAGE_DEVMAP) != 0;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static inline int pud_devmap(pud_t pud)
{
        /* Normalised to 0 or 1 depending on _PAGE_DEVMAP. */
        return (pud_val(pud) & _PAGE_DEVMAP) != 0;
}
#else
/*
 * Variant used when CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD is not set:
 * always returns 0.
 */
static inline int pud_devmap(pud_t pud)
{
        return 0;
}
#endif

/* Always returns 0; the pgd argument is not inspected. */
static inline int pgd_devmap(pgd_t pgd)
{
        return 0;
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
{
        /* Return a copy of @pte with every bit of @set OR-ed in. */
        return native_make_pte(native_pte_val(pte) | set);
}

static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
{
        /* Return a copy of @pte with every bit of @clear masked off. */
        return native_make_pte(native_pte_val(pte) & ~clear);
}

/*
 * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the
 * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So
 * when creating dirty, write-protected memory, a software bit is used:
 * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the
 * Dirty bit to SavedDirty, and vice-versa.
 *
 * This shifting is only done if needed. In the case of shifting
 * Dirty->SavedDirty, the condition is if the PTE is Write=0. In the case of
 * shifting SavedDirty->Dirty, the condition is Write=1.
 */
static inline pgprotval_t mksaveddirty_shift(pgprotval_t v)
{
        /* 1 when Write=0, i.e. when the move is needed; otherwise 0. */
        pgprotval_t wr_clear = (~v >> _PAGE_BIT_RW) & 1;
        /* Current value of the Dirty bit, as 0 or 1. */
        pgprotval_t dirty = (v >> _PAGE_BIT_DIRTY) & 1;

        /* Copy Dirty into SavedDirty, then drop Dirty, only if Write=0. */
        v |= (dirty & wr_clear) << _PAGE_BIT_SAVED_DIRTY;
        v &= ~(wr_clear << _PAGE_BIT_DIRTY);

        return v;
}

static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v)
{
        /* 1 when Write=1, i.e. when the move is needed; otherwise 0. */
        pgprotval_t wr_set = (v >> _PAGE_BIT_RW) & 1;
        /* Current value of the SavedDirty bit, as 0 or 1. */
        pgprotval_t saved = (v >> _PAGE_BIT_SAVED_DIRTY) & 1;

        /* Copy SavedDirty into Dirty, then drop SavedDirty, only if Write=1. */
        v |= (saved & wr_set) << _PAGE_BIT_DIRTY;
        v &= ~(wr_set << _PAGE_BIT_SAVED_DIRTY);

        return v;
}

static inline pte_t pte_mksaveddirty(pte_t pte)
{
        /* Run the raw PTE value through the Dirty->SavedDirty shift. */
        return native_make_pte(mksaveddirty_shift(native_pte_val(pte)));
}

static inline pte_t pte_clear_saveddirty(pte_t pte)
{
        /* Run the raw PTE value through the SavedDirty->Dirty shift. */
        return native_make_pte(clear_saveddirty_shift(native_pte_val(pte)));
}

/*
 * Clear _PAGE_RW in @pte.
 *
 * Dropping _PAGE_RW alone could leave a Write=0,Dirty=1 value, which
 * denotes a shadow stack PTE, so the result is passed through
 * pte_mksaveddirty() to move a set hardware Dirty bit to the software bit.
 */
static inline pte_t pte_wrprotect(pte_t pte)
{
        pte_t readonly = pte_clear_flags(pte, _PAGE_RW);

        return pte_mksaveddirty(readonly);
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Return the _PAGE_UFFD_WP bit of pte_flags(@pte), converted to int. */
static inline int pte_uffd_wp(pte_t pte)
{
        int uffd_wp = pte_flags(pte) & _PAGE_UFFD_WP;

        return uffd_wp;
}

/* Set _PAGE_UFFD_WP in @pte, then write-protect the result. */
static inline pte_t pte_mkuffd_wp(pte_t pte)
{
        pte_t marked = pte_set_flags(pte, _PAGE_UFFD_WP);

        return pte_wrprotect(marked);
}

/* Return @pte with _PAGE_UFFD_WP cleared. */
static inline pte_t pte_clear_uffd_wp(pte_t pte)
{
        pte_t cleared = pte_clear_flags(pte, _PAGE_UFFD_WP);

        return cleared;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/* Return @pte with all of _PAGE_DIRTY_BITS cleared. */
static inline pte_t pte_mkclean(pte_t pte)
{
        pte_t clean = pte_clear_flags(pte, _PAGE_DIRTY_BITS);

        return clean;
}

/* Return @pte with _PAGE_ACCESSED cleared. */
static inline pte_t pte_mkold(pte_t pte)
{
        pte_t old = pte_clear_flags(pte, _PAGE_ACCESSED);

        return old;
}

/* Return @pte with _PAGE_NX cleared. */
static inline pte_t pte_mkexec(pte_t pte)
{
        pte_t exec = pte_clear_flags(pte, _PAGE_NX);

        return exec;
}

/*
 * Set _PAGE_DIRTY and _PAGE_SOFT_DIRTY in @pte, then run the result
 * through pte_mksaveddirty().
 */
static inline pte_t pte_mkdirty(pte_t pte)
{
        return pte_mksaveddirty(pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY));
}

/* Return @pte with _PAGE_RW cleared and _PAGE_DIRTY set (Write=0,Dirty=1). */
static inline pte_t pte_mkwrite_shstk(pte_t pte)
{
        pte_t readonly = pte_clear_flags(pte, _PAGE_RW);

        return pte_set_flags(readonly, _PAGE_DIRTY);
}

/* Return @pte with _PAGE_ACCESSED set. */
static inline pte_t pte_mkyoung(pte_t pte)
{
        pte_t young = pte_set_flags(pte, _PAGE_ACCESSED);

        return young;
}

/* Return @pte with _PAGE_RW set; no other bit is touched. */
static inline pte_t pte_mkwrite_novma(pte_t pte)
{
        pte_t writable = pte_set_flags(pte, _PAGE_RW);

        return writable;
}

struct vm_area_struct;
pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma);
#define pte_mkwrite pte_mkwrite

/* Return @pte with _PAGE_PSE set. */
static inline pte_t pte_mkhuge(pte_t pte)
{
        pte_t huge = pte_set_flags(pte, _PAGE_PSE);

        return huge;
}

/* Return @pte with _PAGE_PSE cleared. */
static inline pte_t pte_clrhuge(pte_t pte)
{
        pte_t small = pte_clear_flags(pte, _PAGE_PSE);

        return small;
}

/* Return @pte with _PAGE_GLOBAL set. */
static inline pte_t pte_mkglobal(pte_t pte)
{
        pte_t global = pte_set_flags(pte, _PAGE_GLOBAL);

        return global;
}

/* Return @pte with _PAGE_GLOBAL cleared. */
static inline pte_t pte_clrglobal(pte_t pte)
{
        pte_t local = pte_clear_flags(pte, _PAGE_GLOBAL);

        return local;
}

/* Return @pte with _PAGE_SPECIAL set. */
static inline pte_t pte_mkspecial(pte_t pte)
{
        pte_t special = pte_set_flags(pte, _PAGE_SPECIAL);

        return special;
}

/* Return @pte with both _PAGE_SPECIAL and _PAGE_DEVMAP set. */
static inline pte_t pte_mkdevmap(pte_t pte)
{
        pte_t devmap = pte_set_flags(pte, _PAGE_SPECIAL | _PAGE_DEVMAP);

        return devmap;
}

/* Return a copy of @pmd whose raw value has every bit of @set OR-ed in. */
static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
{
        return native_make_pmd(native_pmd_val(pmd) | set);
}

/* Return a copy of @pmd whose raw value has every bit of @clear masked off. */
static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
{
        return native_make_pmd(native_pmd_val(pmd) & ~clear);
}

/*
 * PMD flavour of the Dirty->SavedDirty transition; the rules are described
 * in the comment above mksaveddirty_shift().
 */
static inline pmd_t pmd_mksaveddirty(pmd_t pmd)
{
        pmdval_t shifted = mksaveddirty_shift(native_pmd_val(pmd));

        return native_make_pmd(shifted);
}

/*
 * PMD flavour of the SavedDirty->Dirty transition; the rules are described
 * in the comment above mksaveddirty_shift().
 */
static inline pmd_t pmd_clear_saveddirty(pmd_t pmd)
{
        pmdval_t shifted = clear_saveddirty_shift(native_pmd_val(pmd));

        return native_make_pmd(shifted);
}

/*
 * Clear _PAGE_RW in @pmd.
 *
 * Dropping _PAGE_RW alone could leave a RW=0,Dirty=1 value, which denotes
 * a shadow stack PMD, so the result is passed through pmd_mksaveddirty()
 * to move the hardware dirty value to the software bit.
 */
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
        pmd_t readonly = pmd_clear_flags(pmd, _PAGE_RW);

        return pmd_mksaveddirty(readonly);
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Return the _PAGE_UFFD_WP bit of pmd_flags(@pmd), converted to int. */
static inline int pmd_uffd_wp(pmd_t pmd)
{
        int uffd_wp = pmd_flags(pmd) & _PAGE_UFFD_WP;

        return uffd_wp;
}

/* Set _PAGE_UFFD_WP in @pmd, then write-protect the result. */
static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
{
        pmd_t marked = pmd_set_flags(pmd, _PAGE_UFFD_WP);

        return pmd_wrprotect(marked);
}

/* Return @pmd with _PAGE_UFFD_WP cleared. */
static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
{
        pmd_t cleared = pmd_clear_flags(pmd, _PAGE_UFFD_WP);

        return cleared;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/* Return @pmd with _PAGE_ACCESSED cleared. */
static inline pmd_t pmd_mkold(pmd_t pmd)
{
        pmd_t old = pmd_clear_flags(pmd, _PAGE_ACCESSED);

        return old;
}

/* Return @pmd with all of _PAGE_DIRTY_BITS cleared. */
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
        pmd_t clean = pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);

        return clean;
}

/*
 * Set _PAGE_DIRTY and _PAGE_SOFT_DIRTY in @pmd, then run the result
 * through pmd_mksaveddirty().
 */
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
        return pmd_mksaveddirty(pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY));
}

/* Return @pmd with _PAGE_RW cleared and _PAGE_DIRTY set (Write=0,Dirty=1). */
static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
{
        pmd_t readonly = pmd_clear_flags(pmd, _PAGE_RW);

        return pmd_set_flags(readonly, _PAGE_DIRTY);
}

/* Return @pmd with _PAGE_DEVMAP set. */
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
{
        pmd_t devmap = pmd_set_flags(pmd, _PAGE_DEVMAP);

        return devmap;
}

/* Return @pmd with _PAGE_PSE set. */
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
        pmd_t huge = pmd_set_flags(pmd, _PAGE_PSE);

        return huge;
}

/* Return @pmd with _PAGE_ACCESSED set. */
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
        pmd_t young = pmd_set_flags(pmd, _PAGE_ACCESSED);

        return young;
}

/* Return @pmd with _PAGE_RW set; no other bit is touched. */
static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
{
        pmd_t writable = pmd_set_flags(pmd, _PAGE_RW);

        return writable;
}

pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
#define pmd_mkwrite pmd_mkwrite

/* Return a copy of @pud whose raw value has every bit of @set OR-ed in. */
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
{
        return native_make_pud(native_pud_val(pud) | set);
}

/* Return a copy of @pud whose raw value has every bit of @clear masked off. */
static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
{
        return native_make_pud(native_pud_val(pud) & ~clear);
}

/*
 * PUD flavour of the Dirty->SavedDirty transition; the rules are described
 * in the comment above mksaveddirty_shift().
 */
static inline pud_t pud_mksaveddirty(pud_t pud)
{
        pudval_t shifted = mksaveddirty_shift(native_pud_val(pud));

        return native_make_pud(shifted);
}

/*
 * PUD flavour of the SavedDirty->Dirty transition; the rules are described
 * in the comment above mksaveddirty_shift().
 */
static inline pud_t pud_clear_saveddirty(pud_t pud)
{
        pudval_t shifted = clear_saveddirty_shift(native_pud_val(pud));

        return native_make_pud(shifted);
}

/* Return @pud with _PAGE_ACCESSED cleared. */
static inline pud_t pud_mkold(pud_t pud)
{
        pud_t old = pud_clear_flags(pud, _PAGE_ACCESSED);

        return old;
}

/* Return @pud with all of _PAGE_DIRTY_BITS cleared. */
static inline pud_t pud_mkclean(pud_t pud)
{
        pud_t clean = pud_clear_flags(pud, _PAGE_DIRTY_BITS);

        return clean;
}

/*
 * Clear _PAGE_RW in @pud.
 *
 * Dropping _PAGE_RW alone could leave a RW=0,Dirty=1 value, which denotes
 * a shadow stack PUD, so the result is passed through pud_mksaveddirty()
 * to move the hardware dirty value to the software bit.
 */
static inline pud_t pud_wrprotect(pud_t pud)
{
        pud_t readonly = pud_clear_flags(pud, _PAGE_RW);

        return pud_mksaveddirty(readonly);
}

/*
 * Set _PAGE_DIRTY and _PAGE_SOFT_DIRTY in @pud, then run the result
 * through pud_mksaveddirty().
 */
static inline pud_t pud_mkdirty(pud_t pud)
{
        return pud_mksaveddirty(pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY));
}

/* Return @pud with _PAGE_DEVMAP set. */
static inline pud_t pud_mkdevmap(pud_t pud)
{
        pud_t devmap = pud_set_flags(pud, _PAGE_DEVMAP);

        return devmap;
}

/* Return @pud with _PAGE_PSE set. */
static inline pud_t pud_mkhuge(pud_t pud)
{
        pud_t huge = pud_set_flags(pud, _PAGE_PSE);

        return huge;
}

/* Return @pud with _PAGE_ACCESSED set. */
static inline pud_t pud_mkyoung(pud_t pud)
{
        pud_t young = pud_set_flags(pud, _PAGE_ACCESSED);

        return young;
}

/*
 * Set _PAGE_RW in @pud, then run the result through
 * pud_clear_saveddirty() (SavedDirty->Dirty, since Write is now 1).
 */
static inline pud_t pud_mkwrite(pud_t pud)
{
        return pud_clear_saveddirty(pud_set_flags(pud, _PAGE_RW));
}

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
/* Return the _PAGE_SOFT_DIRTY bit of pte_flags(@pte), converted to int. */
static inline int pte_soft_dirty(pte_t pte)
{
        int soft_dirty = pte_flags(pte) & _PAGE_SOFT_DIRTY;

        return soft_dirty;
}

/* Return the _PAGE_SOFT_DIRTY bit of pmd_flags(@pmd), converted to int. */
static inline int pmd_soft_dirty(pmd_t pmd)
{
        int soft_dirty = pmd_flags(pmd) & _PAGE_SOFT_DIRTY;

        return soft_dirty;
}

/* Return the _PAGE_SOFT_DIRTY bit of pud_flags(@pud), converted to int. */
static inline int pud_soft_dirty(pud_t pud)
{
        int soft_dirty = pud_flags(pud) & _PAGE_SOFT_DIRTY;

        return soft_dirty;
}

/* Return @pte with _PAGE_SOFT_DIRTY set. */
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
        pte_t marked = pte_set_flags(pte, _PAGE_SOFT_DIRTY);

        return marked;
}

/* Return @pmd with _PAGE_SOFT_DIRTY set. */
static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
        pmd_t marked = pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);

        return marked;
}

/* Return @pud with _PAGE_SOFT_DIRTY set. */
static inline pud_t pud_mksoft_dirty(pud_t pud)
{
        pud_t marked = pud_set_flags(pud, _PAGE_SOFT_DIRTY);

        return marked;
}

/* Return @pte with _PAGE_SOFT_DIRTY cleared. */
static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
        pte_t cleared = pte_clear_flags(pte, _PAGE_SOFT_DIRTY);

        return cleared;
}

/* Return @pmd with _PAGE_SOFT_DIRTY cleared. */
static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
        pmd_t cleared = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);

        return cleared;
}

/* Return @pud with _PAGE_SOFT_DIRTY cleared. */
static inline pud_t pud_clear_soft_dirty(pud_t pud)
{
        pud_t cleared = pud_clear_flags(pud, _PAGE_SOFT_DIRTY);

        return cleared;
}

#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */

/*
 * Restrict a present pgprot to the bits in __supported_pte_mask.
 * A pgprot without _PAGE_PRESENT is returned unchanged, because
 * non-present pgprots can use those bits for other purposes.
 */
static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
{
        pgprotval_t protval = pgprot_val(pgprot);

        if (!(protval & _PAGE_PRESENT))
                return protval;

        return protval & __supported_pte_mask;
}

/*
 * Return massage_pgprot(@pgprot).  With CONFIG_DEBUG_VM, additionally warn
 * once if massaging changed the value, i.e. unsupported bits were requested.
 */
static inline pgprotval_t check_pgprot(pgprot_t pgprot)
{
        pgprotval_t supported_val = massage_pgprot(pgprot);

#ifdef CONFIG_DEBUG_VM
        /* mmdebug.h can not be included here because of dependencies */
        WARN_ONCE(pgprot_val(pgprot) != supported_val,
                  "attempted to set unsupported pgprot: %016llx "
                  "bits: %016llx supported: %016llx\n",
                  (u64)pgprot_val(pgprot),
                  (u64)pgprot_val(pgprot) ^ supported_val,
                  (u64)__supported_pte_mask);
#endif

        return supported_val;
}

/*
 * Build a PTE from page frame number @page_nr and protection @pgprot.
 * The frame number is shifted by PAGE_SHIFT, XOR-ed with
 * protnone_mask() of the protection value, masked with PTE_PFN_MASK and
 * combined with the checked protection bits.
 */
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t paddr = (phys_addr_t)page_nr << PAGE_SHIFT;

        paddr ^= protnone_mask(pgprot_val(pgprot));
        paddr &= PTE_PFN_MASK;

        return __pte(paddr | check_pgprot(pgprot));
}

/*
 * Build a PMD from page frame number @page_nr and protection @pgprot.
 * Same steps as pfn_pte(), but the address is masked with
 * PHYSICAL_PMD_PAGE_MASK.
 */
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t paddr = (phys_addr_t)page_nr << PAGE_SHIFT;

        paddr ^= protnone_mask(pgprot_val(pgprot));
        paddr &= PHYSICAL_PMD_PAGE_MASK;

        return __pmd(paddr | check_pgprot(pgprot));
}

/*
 * Build a PUD from page frame number @page_nr and protection @pgprot.
 * Same steps as pfn_pte(), but the address is masked with
 * PHYSICAL_PUD_PAGE_MASK.
 */
static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
{
        phys_addr_t paddr = (phys_addr_t)page_nr << PAGE_SHIFT;

        paddr ^= protnone_mask(pgprot_val(pgprot));
        paddr &= PHYSICAL_PUD_PAGE_MASK;

        return __pud(paddr | check_pgprot(pgprot));
}

/*
 * Rebuild @pmd from its own pfn and flags, with _PAGE_PRESENT and
 * _PAGE_PROTNONE removed from the flags.
 */
static inline pmd_t pmd_mkinvalid(pmd_t pmd)
{
        pmd_t invalid;

        invalid = pfn_pmd(pmd_pfn(pmd),
                          __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT | _PAGE_PROTNONE)));

        return invalid;
}

static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);

/*
 * Apply @newprot to @pte: bits inside _PAGE_CHG_MASK are kept from @pte,
 * bits outside it are taken from the checked @newprot.
 */
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
        pteval_t oldval = pte_val(pte);
        pteval_t val;

        /*
         * Chop off the NX bit (if present), and add the NX portion of
         * the newprot (if present):
         */
        val = (oldval & _PAGE_CHG_MASK) |
              (check_pgprot(newprot) & ~_PAGE_CHG_MASK);
        val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);

        /*
         * To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid:
         *  1. Marking Write=0 PTEs Dirty=1
         *  2. Marking Dirty=1 PTEs Write=0
         *
         * The first case cannot happen because the _PAGE_CHG_MASK will filter
         * out any Dirty bit passed in newprot. Handle the second case by
         * going through the mksaveddirty exercise. Only do this if the old
         * value was Write=1 to avoid doing this on Shadow Stack PTEs.
         */
        if (oldval & _PAGE_RW)
                return pte_mksaveddirty(__pte(val));

        return pte_clear_saveddirty(__pte(val));
}

/*
 * Apply @newprot to @pmd: bits inside _HPAGE_CHG_MASK (except _PAGE_DIRTY)
 * are kept from @pmd, bits outside _HPAGE_CHG_MASK are taken from the
 * checked @newprot.
 */
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
        pmdval_t oldval = pmd_val(pmd);
        pmdval_t val;

        val = (oldval & (_HPAGE_CHG_MASK & ~_PAGE_DIRTY)) |
              (check_pgprot(newprot) & ~_HPAGE_CHG_MASK);
        val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);

        /*
         * To avoid creating Write=0,Dirty=1 PMDs, pmd_modify() needs to avoid:
         *  1. Marking Write=0 PMDs Dirty=1
         *  2. Marking Dirty=1 PMDs Write=0
         *
         * The first case cannot happen because the _HPAGE_CHG_MASK will
         * filter out any Dirty bit passed in newprot. Handle the second case
         * by going through the mksaveddirty exercise. Only do this if the
         * old value was Write=1 to avoid doing this on Shadow Stack PMDs.
         */
        if (oldval & _PAGE_RW)
                return pmd_mksaveddirty(__pmd(val));

        return pmd_clear_saveddirty(__pmd(val));
}

/*
 * mprotect needs to preserve PAT and encryption bits when updating
 * vm_page_prot
 */
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
        /* Bits inside _PAGE_CHG_MASK come from @oldprot, the rest from @newprot. */
        return __pgprot((pgprot_val(oldprot) & _PAGE_CHG_MASK) |
                        (pgprot_val(newprot) & ~_PAGE_CHG_MASK));
}

/* Wrap the flag bits of an entry at each page-table level as a pgprot_t. */
#define pte_pgprot(x) __pgprot(pte_flags(x))
#define pmd_pgprot(x) __pgprot(pmd_flags(x))
#define pud_pgprot(x) __pgprot(pud_flags(x))
#define p4d_pgprot(x) __pgprot(p4d_flags(x))

/* pgprot_t built from massage_pgprot(p). */
#define canon_pgprot(p) __pgprot(massage_pgprot(p))

/*
 * Decide whether the memtype @new_pcm may be returned for a request of
 * memtype @pcm over [@paddr, @paddr + @size).  Returns 1 if allowed,
 * 0 if not.
 */
static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
                                         enum page_cache_mode pcm,
                                         enum page_cache_mode new_pcm)
{
        /*
         * PAT type is always WB for untracked ranges, so no need to check.
         */
        if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
                return 1;

        /*
         * Rejected combinations, keyed on the requested memtype:
         * - UC_MINUS or WC requested: write-back is refused
         * - WT requested: write-back and write-combine are refused
         * Every other combination is allowed.
         */
        switch (pcm) {
        case _PAGE_CACHE_MODE_UC_MINUS:
        case _PAGE_CACHE_MODE_WC:
                return new_pcm != _PAGE_CACHE_MODE_WB;
        case _PAGE_CACHE_MODE_WT:
                return new_pcm != _PAGE_CACHE_MODE_WB &&
                       new_pcm != _PAGE_CACHE_MODE_WC;
        default:
                return 1;
        }
}

pmd_t *populate_extra_pmd(unsigned long vaddr);
pte_t *populate_extra_pte(unsigned long vaddr);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd);

/*
 * Take a PGD location (pgdp) and a pgd value that needs to be set there.
 * Populates the user and returns the resulting PGD that must be set in
 * the kernel copy of the page tables.
 */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
        /* Only call into __pti_set_user_pgtbl() when the CPU has X86_FEATURE_PTI. */
        if (static_cpu_has(X86_FEATURE_PTI))
                return __pti_set_user_pgtbl(pgdp, pgd);

        /* Otherwise hand @pgd back untouched. */
        return pgd;
}
#else   /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
/* Stub without CONFIG_MITIGATION_PAGE_TABLE_ISOLATION: returns @pgd as is. */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
        return pgd;
}
#endif  /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */

#endif        /* __ASSEMBLY__ */


#ifdef CONFIG_X86_32
# include <asm/pgtable_32.h>
#else
# include <asm/pgtable_64.h>
#endif

#ifndef __ASSEMBLY__
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/log2.h>
#include <asm/fixmap.h>

/* 1 when @pte has no bit set outside _PAGE_KNL_ERRATUM_MASK, else 0. */
static inline int pte_none(pte_t pte)
{
        return (pte.pte & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
}

#define __HAVE_ARCH_PTE_SAME
/* 1 when the raw values of @a and @b are equal, else 0. */
static inline int pte_same(pte_t a, pte_t b)
{
        int same = (a.pte == b.pte);

        return same;
}

/*
 * Move the pfn stored in @pte forward by @nr.  When
 * __pte_needs_invert() reports the value as inverted, the step is
 * subtracted from the raw value instead of added.
 */
static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
        unsigned long step = nr << PFN_PTE_SHIFT;

        return __pte_needs_invert(pte_val(pte)) ? __pte(pte_val(pte) - step)
                                                : __pte(pte_val(pte) + step);
}
#define pte_advance_pfn        pte_advance_pfn

/* Return the _PAGE_PRESENT and _PAGE_PROTNONE bits of pte_flags(@a) as int. */
static inline int pte_present(pte_t a)
{
        int present = pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);

        return present;
}

#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
/* 1 when every bit of _PAGE_DEVMAP is set in pte_flags(@a), else 0. */
static inline int pte_devmap(pte_t a)
{
        int devmap = (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;

        return devmap;
}
#endif

#define pte_accessible pte_accessible
/*
 * True when @a has _PAGE_PRESENT set, or when it has _PAGE_PROTNONE set
 * while @mm->tlb_flush_pending is non-zero.
 */
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
        if (pte_flags(a) & _PAGE_PRESENT)
                return true;

        if (!(pte_flags(a) & _PAGE_PROTNONE))
                return false;

        return atomic_read(&mm->tlb_flush_pending) != 0;
}

/*
 * Return the _PAGE_PRESENT, _PAGE_PROTNONE and _PAGE_PSE bits of
 * pmd_flags(@pmd) as int.
 */
static inline int pmd_present(pmd_t pmd)
{
        /*
         * Checking for _PAGE_PSE is needed too because
         * split_huge_page will temporarily clear the present bit (but
         * the _PAGE_PSE flag will remain set at all times while the
         * _PAGE_PRESENT bit is clear).
         */
        int present = pmd_flags(pmd) &
                      (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);

        return present;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * These work without NUMA balancing but the kernel does not care. See the
 * comment in include/linux/pgtable.h
 */
/* 1 when, of PROTNONE and PRESENT, exactly _PAGE_PROTNONE is set in @pte. */
static inline int pte_protnone(pte_t pte)
{
        int protnone = (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT))
                        == _PAGE_PROTNONE;

        return protnone;
}

/* 1 when, of PROTNONE and PRESENT, exactly _PAGE_PROTNONE is set in @pmd. */
static inline int pmd_protnone(pmd_t pmd)
{
        int protnone = (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT))
                        == _PAGE_PROTNONE;

        return protnone;
}
#endif /* CONFIG_NUMA_BALANCING */

/* 1 when @pmd has no bit set outside _PAGE_KNL_ERRATUM_MASK, else 0. */
static inline int pmd_none(pmd_t pmd)
{
        /*
         * Only check low word on 32-bit platforms, since it might be
         * out of sync with upper half.
         */
        unsigned long low_word = native_pmd_val(pmd);

        return !(low_word & ~_PAGE_KNL_ERRATUM_MASK);
}

/* Kernel virtual address (via __va()) of the pfn-masked value of @pmd. */
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
{
        unsigned long vaddr;

        vaddr = (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd));
        return vaddr;
}

/*
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pmd_page(pmd)        pfn_to_page(pmd_pfn(pmd))

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 *
 * (Currently stuck as a macro because of indirect forward reference
 * to linux/mm.h:page_to_nid())
 */
/*
 * mk_pte(page, pgprot): evaluates to pfn_pte(page_to_pfn(page), pgprot).
 * @pgprot is evaluated once into a local; a one-time warning fires if it
 * has _PAGE_DIRTY set without _PAGE_RW.
 */
#define mk_pte(page, pgprot)                                                  \
({                                                                          \
        pgprot_t __pgprot = pgprot;                                          \
                                                                          \
        WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \
                    _PAGE_DIRTY);                                          \
        pfn_pte(page_to_pfn(page), __pgprot);                                  \
})

/*
 * 1 unless the flags of @pmd, ignoring _PAGE_USER and _PAGE_ACCESSED,
 * equal _KERNPG_TABLE without _PAGE_ACCESSED.
 */
static inline int pmd_bad(pmd_t pmd)
{
        int bad = (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
                  (_KERNPG_TABLE & ~_PAGE_ACCESSED);

        return bad;
}

/* Convert a page count to MiB by shifting right by (20 - PAGE_SHIFT). */
static inline unsigned long pages_to_mb(unsigned long npg)
{
        unsigned long mb = npg >> (20 - PAGE_SHIFT);

        return mb;
}

#if CONFIG_PGTABLE_LEVELS > 2
/* 1 when @pud has no bit set outside _PAGE_KNL_ERRATUM_MASK, else 0. */
static inline int pud_none(pud_t pud)
{
        return !(native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK));
}

/* Return the _PAGE_PRESENT bit of pud_flags(@pud), converted to int. */
static inline int pud_present(pud_t pud)
{
        int present = pud_flags(pud) & _PAGE_PRESENT;

        return present;
}

/* __va() of the pfn-masked value of @pud, typed as a pmd_t pointer. */
static inline pmd_t *pud_pgtable(pud_t pud)
{
        pmd_t *table = (pmd_t *)__va(pud_val(pud) & pud_pfn_mask(pud));

        return table;
}

/*
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pud_page(pud)        pfn_to_page(pud_pfn(pud))

#define pud_leaf pud_leaf
/* True when both _PAGE_PSE and _PAGE_PRESENT are set in @pud. */
static inline bool pud_leaf(pud_t pud)
{
        bool leaf = (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
                    (_PAGE_PSE | _PAGE_PRESENT);

        return leaf;
}

/* 1 when @pud has a flag set outside _KERNPG_TABLE | _PAGE_USER, else 0. */
static inline int pud_bad(pud_t pud)
{
        return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) ? 1 : 0;
}
#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3
/* 1 when @p4d has no bit set outside _PAGE_KNL_ERRATUM_MASK, else 0. */
static inline int p4d_none(p4d_t p4d)
{
        return !(native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK));
}

/* Return the _PAGE_PRESENT bit of p4d_flags(@p4d), converted to int. */
static inline int p4d_present(p4d_t p4d)
{
        int present = p4d_flags(p4d) & _PAGE_PRESENT;

        return present;
}

/* __va() of the pfn-masked value of @p4d, typed as a pud_t pointer. */
static inline pud_t *p4d_pgtable(p4d_t p4d)
{
        pud_t *table = (pud_t *)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));

        return table;
}

/*
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define p4d_page(p4d)        pfn_to_page(p4d_pfn(p4d))

/*
 * 1 when @p4d has a flag set outside the tolerated set, else 0.  The
 * tolerated set is _KERNPG_TABLE | _PAGE_USER, plus _PAGE_NX when
 * CONFIG_MITIGATION_PAGE_TABLE_ISOLATION is enabled.
 */
static inline int p4d_bad(p4d_t p4d)
{
        unsigned long tolerated = _KERNPG_TABLE | _PAGE_USER;

        if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
                tolerated |= _PAGE_NX;

        return (p4d_flags(p4d) & ~tolerated) ? 1 : 0;
}
#endif  /* CONFIG_PGTABLE_LEVELS > 3 */

/* Index of @address at the P4D level: (address >> P4D_SHIFT) modulo PTRS_PER_P4D. */
static inline unsigned long p4d_index(unsigned long address)
{
        unsigned long index = address >> P4D_SHIFT;

        return index & (PTRS_PER_P4D - 1);
}

#if CONFIG_PGTABLE_LEVELS > 4
/*
 * With 5-level paging enabled, return the _PAGE_PRESENT bit of
 * pgd_flags(@pgd) as int; otherwise always 1.
 */
static inline int pgd_present(pgd_t pgd)
{
        if (pgtable_l5_enabled())
                return pgd_flags(pgd) & _PAGE_PRESENT;

        return 1;
}

/* Kernel virtual address (via __va()) of @pgd's value masked with PTE_PFN_MASK. */
static inline unsigned long pgd_page_vaddr(pgd_t pgd)
{
        unsigned long vaddr;

        vaddr = (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);
        return vaddr;
}

/*
 * Currently stuck as a macro due to indirect forward reference to
 * linux/mmzone.h's __section_mem_map_addr() definition:
 */
#define pgd_page(pgd)        pfn_to_page(pgd_pfn(pgd))

/* to find an entry in a page-table-directory. */
static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
        /* With 5-level paging, index into the table that *@pgd points to. */
        if (pgtable_l5_enabled())
                return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);

        /* Otherwise the pgd pointer itself is returned, retyped. */
        return (p4d_t *)pgd;
}

/*
 * Without 5-level paging: always 0.  Otherwise 1 unless the flags of
 * @pgd, ignoring _PAGE_USER (and _PAGE_NX when
 * CONFIG_MITIGATION_PAGE_TABLE_ISOLATION is enabled), equal _KERNPG_TABLE.
 */
static inline int pgd_bad(pgd_t pgd)
{
        unsigned long ignored = _PAGE_USER;

        if (!pgtable_l5_enabled())
                return 0;

        if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
                ignored |= _PAGE_NX;

        return _KERNPG_TABLE != (pgd_flags(pgd) & ~ignored);
}

/* Without 5-level paging: always 0.  Otherwise 1 when @pgd's raw value is 0. */
static inline int pgd_none(pgd_t pgd)
{
        /*
         * There is no need to do a workaround for the KNL stray
         * A/D bit erratum here.  PGDs only point to page tables
         * except on 32-bit non-PAE which is not supported on
         * KNL.
         */
        if (pgtable_l5_enabled())
                return !native_pgd_val(pgd);

        return 0;
}
#endif        /* CONFIG_PGTABLE_LEVELS > 4 */

#endif        /* __ASSEMBLY__ */

#define KERNEL_PGD_BOUNDARY        pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS                (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)

#ifndef __ASSEMBLY__

extern int direct_gbpages;
void init_mem_mapping(void);
void early_alloc_pgt_buf(void);
void __init poking_init(void);
unsigned long init_memory_mapping(unsigned long start,
                                  unsigned long end, pgprot_t prot);

#ifdef CONFIG_X86_64
extern pgd_t trampoline_pgd_entry;
#endif

/* local pte updates need not use xchg for locking */
/* Read *@ptep, clear the entry with native_pte_clear(), return the old value. */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
{
        pte_t old = *ptep;

        /* Pure native function needs no input for mm, addr */
        native_pte_clear(NULL, 0, ptep);

        return old;
}

/* Read *@pmdp, clear the entry with native_pmd_clear(), return the old value. */
static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
{
        pmd_t old = *pmdp;

        native_pmd_clear(pmdp);

        return old;
}

/* Read *@pudp, clear the entry with native_pud_clear(), return the old value. */
static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
{
        pud_t old = *pudp;

        native_pud_clear(pudp);

        return old;
}

/*
 * Store @pmd at @pmdp.  page_table_check_pmd_set() is called with the new
 * value before the store.  @addr is not used.
 */
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                              pmd_t *pmdp, pmd_t pmd)
{
        page_table_check_pmd_set(mm, pmdp, pmd);
        set_pmd(pmdp, pmd);
}

/*
 * Store @pud at @pudp using native_set_pud().  page_table_check_pud_set()
 * is called with the new value before the store.  @addr is not used.
 */
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
                              pud_t *pudp, pud_t pud)
{
        page_table_check_pud_set(mm, pudp, pud);
        native_set_pud(pudp, pud);
}

/*
 * We only update the dirty/accessed state if we set
 * the dirty bit by hand in the kernel, since the hardware
 * will do the accessed bit for us, and we don't want to
 * race with other CPU's that might be updating the dirty
 * bit at the same time.
 */
struct vm_area_struct;

#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
                                 pte_t entry, int dirty);

#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pte_t *ptep);

#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
extern int ptep_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pte_t *ptep);

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * Fetch and clear *@ptep via native_ptep_get_and_clear(), report the old
 * value to page_table_check_pte_clear(), and return it.  @addr is not used.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                                       pte_t *ptep)
{
        pte_t old = native_ptep_get_and_clear(ptep);

        page_table_check_pte_clear(mm, old);

        return old;
}

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
/*
 * Like ptep_get_and_clear(), but when @full is non-zero the entry is
 * cleared with the local (non-xchg) helper instead.
 */
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long addr, pte_t *ptep,
                                            int full)
{
        pte_t old;

        if (!full)
                return ptep_get_and_clear(mm, addr, ptep);

        /*
         * Full address destruction in progress; paravirt does not
         * care about updates and native needs no locking
         */
        old = native_local_ptep_get_and_clear(ptep);
        page_table_check_pte_clear(mm, old);

        return old;
}

#define __HAVE_ARCH_PTEP_SET_WRPROTECT
/*
 * Write-protect the entry at @ptep in place, retrying until the exchange
 * succeeds.  @mm and @addr are not used.
 */
static inline void ptep_set_wrprotect(struct mm_struct *mm,
                                      unsigned long addr, pte_t *ptep)
{
        /*
         * Avoid accidentally creating shadow stack PTEs
         * (Write=0,Dirty=1).  Use cmpxchg() to prevent races with
         * the hardware setting Dirty=1.
         */
        pte_t old_pte, new_pte;

        /* Snapshot the entry; it is the expected value for the exchange. */
        old_pte = READ_ONCE(*ptep);
        do {
                /*
                 * Recompute from old_pte on every pass; presumably
                 * try_cmpxchg() refreshes old_pte on failure (confirm
                 * against its definition).
                 */
                new_pte = pte_wrprotect(old_pte);
        } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte));
}

#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)

/* PMD built by pfn_pmd() from page_to_pfn(page) and pgprot. */
#define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))

#define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pud_t *pudp,
                                 pud_t entry, int dirty);

#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pmd_t *pmdp);
extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
                                     unsigned long addr, pud_t *pudp);

#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);


#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
/*
 * Fetch and clear *@pmdp via native_pmdp_get_and_clear(), report the old
 * value to page_table_check_pmd_clear(), and return it.  @addr is not used.
 */
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
                                       pmd_t *pmdp)
{
        pmd_t old = native_pmdp_get_and_clear(pmdp);

        page_table_check_pmd_clear(mm, old);
        return old;
}

#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
/*
 * Fetch and clear *@pudp via native_pudp_get_and_clear(), report the old
 * value to page_table_check_pud_clear(), and return it.  @addr is not used.
 */
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
                                        unsigned long addr, pud_t *pudp)
{
        pud_t old = native_pudp_get_and_clear(pudp);

        page_table_check_pud_clear(mm, old);
        return old;
}

#define __HAVE_ARCH_PMDP_SET_WRPROTECT
/*
 * Write-protect the entry at @pmdp in place, retrying until the exchange
 * succeeds.  @mm and @addr are not used.
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long addr, pmd_t *pmdp)
{
        /*
         * Avoid accidentally creating shadow stack PMDs
         * (Write=0,Dirty=1).  Use cmpxchg() to prevent races with
         * the hardware setting Dirty=1.
         */
        pmd_t old_pmd, new_pmd;

        /* Snapshot the entry; it is the expected value for the exchange. */
        old_pmd = READ_ONCE(*pmdp);
        do {
                /*
                 * Recompute from old_pmd on every pass; presumably
                 * try_cmpxchg() refreshes old_pmd on failure (confirm
                 * against its definition).
                 */
                new_pmd = pmd_wrprotect(old_pmd);
        } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd));
}

#ifndef pmdp_establish
#define pmdp_establish pmdp_establish
/*
 * Install @pmd at @pmdp and return the previous entry.  With CONFIG_SMP
 * the swap is done with xchg(); otherwise with a plain read followed by
 * WRITE_ONCE().  @address is not used.
 */
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        pmd_t old;

        page_table_check_pmd_set(vma->vm_mm, pmdp, pmd);

        if (IS_ENABLED(CONFIG_SMP))
                return xchg(pmdp, pmd);

        old = *pmdp;
        WRITE_ONCE(*pmdp, pmd);
        return old;
}
#endif

#define __HAVE_ARCH_PMDP_INVALIDATE_AD
extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
                                unsigned long address, pmd_t *pmdp);

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 *
 * Returns true for parts of the PGD that map userspace and
 * false for the parts that map the kernel.
 */
static inline bool pgdp_maps_userspace(void *__ptr)
{
        /* Byte offset of the pointer within its page ... */
        unsigned long offset = (unsigned long)__ptr & ~PAGE_MASK;
        /* ... converted to an entry index within the PGD page. */
        unsigned long index = offset / sizeof(pgd_t);

        return index < PGD_KERNEL_START;
}

#define pgd_leaf        pgd_leaf
/* Always false; @pgd is not inspected. */
static inline bool pgd_leaf(pgd_t pgd)
{
        return false;
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
 * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages
 * (8k-aligned and 8k in size).  The kernel one is at the beginning 4k and
 * the user one is in the last 4k.  To switch between them, you
 * just need to flip the 12th bit in their addresses.
 */
#define PTI_PGTABLE_SWITCH_BIT        PAGE_SHIFT

/*
 * This generates better code than the inline assembly in
 * __set_bit().
 */
static inline void *ptr_set_bit(void *ptr, int bit)
{
        /* Treat the pointer as an integer and OR in BIT(bit). */
        return (void *)((unsigned long)ptr | BIT(bit));
}
/* Return @ptr with bit number @bit of its address cleared. */
static inline void *ptr_clear_bit(void *ptr, int bit)
{
        /* Treat the pointer as an integer and mask off BIT(bit). */
        return (void *)((unsigned long)ptr & ~BIT(bit));
}

/* Return @pgdp with PTI_PGTABLE_SWITCH_BIT set in the address. */
static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
{
        pgd_t *user_pgdp = ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);

        return user_pgdp;
}

/* Return @pgdp with PTI_PGTABLE_SWITCH_BIT cleared in the address. */
static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
{
        pgd_t *kernel_pgdp = ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);

        return kernel_pgdp;
}

/* Return @p4dp with PTI_PGTABLE_SWITCH_BIT set in the address. */
static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
{
        p4d_t *user_p4dp = ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);

        return user_p4dp;
}

/* Return @p4dp with PTI_PGTABLE_SWITCH_BIT cleared in the address. */
static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
{
        p4d_t *kernel_p4dp = ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);

        return kernel_p4dp;
}
#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */

/*
 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
 *
 *  dst - pointer to pgd range anywhere on a pgd page
 *  src - ""
 *  count - the number of pgds to copy.
 *
 * dst and src can be on the same page, but the range must not overlap,
 * and must not cross a page boundary.
 */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
        /* Copy the kernel-side entries. */
        memcpy(dst, src, count * sizeof(pgd_t));

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
        /* Clone the user space pgd as well, but only when PTI is active */
        if (static_cpu_has(X86_FEATURE_PTI))
                memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
                       count * sizeof(pgd_t));
#endif
}

#define PTE_SHIFT ilog2(PTRS_PER_PTE)
/* Shift for @level: (PAGE_SHIFT - PTE_SHIFT) plus PTE_SHIFT per level step. */
static inline int page_level_shift(enum pg_level level)
{
        int base = PAGE_SHIFT - PTE_SHIFT;

        return base + level * PTE_SHIFT;
}
/* 1UL shifted left by page_level_shift(@level). */
static inline unsigned long page_level_size(enum pg_level level)
{
        int shift = page_level_shift(level);

        return 1UL << shift;
}
/* Complement of (page_level_size(@level) - 1). */
static inline unsigned long page_level_mask(enum pg_level level)
{
        unsigned long size = page_level_size(level);

        return ~(size - 1);
}

/*
 * The x86 doesn't have any external MMU info: the kernel page
 * tables contain all the necessary information.
 */
/* Intentionally empty: there is no external MMU state to update. */
static inline void update_mmu_cache(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep)
{
}
/* Intentionally empty; every argument is ignored. */
static inline void update_mmu_cache_range(struct vm_fault *vmf,
                struct vm_area_struct *vma, unsigned long addr,
                pte_t *ptep, unsigned int nr)
{
}
/* Intentionally empty; every argument is ignored. */
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmd)
{
}
/* Intentionally empty; every argument is ignored. */
static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
                unsigned long addr, pud_t *pud)
{
}
/* Return a copy of @pte with _PAGE_SWP_EXCLUSIVE set in its flags. */
static inline pte_t pte_swp_mkexclusive(pte_t pte)
{
        pte_t marked = pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE);

        return marked;
}

/*
 * Nonzero when _PAGE_SWP_EXCLUSIVE is set in the flags of @pte.  The
 * masked flag value itself is returned (as int), not a normalized 0/1.
 */
static inline int pte_swp_exclusive(pte_t pte)
{
        const int masked = pte_flags(pte) & _PAGE_SWP_EXCLUSIVE;

        return masked;
}

/* Return a copy of @pte with _PAGE_SWP_EXCLUSIVE cleared from its flags. */
static inline pte_t pte_swp_clear_exclusive(pte_t pte)
{
        pte_t cleared = pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE);

        return cleared;
}

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
/* Return a copy of @pte with _PAGE_SWP_SOFT_DIRTY set in its flags. */
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
        pte_t marked = pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);

        return marked;
}

/*
 * Nonzero when _PAGE_SWP_SOFT_DIRTY is set in the flags of @pte.  The
 * masked flag value itself is returned (as int), not a normalized 0/1.
 */
static inline int pte_swp_soft_dirty(pte_t pte)
{
        const int masked = pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;

        return masked;
}

/* Return a copy of @pte with _PAGE_SWP_SOFT_DIRTY cleared from its flags. */
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
        pte_t cleared = pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);

        return cleared;
}

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* Return a copy of @pmd with _PAGE_SWP_SOFT_DIRTY set in its flags. */
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        pmd_t marked = pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);

        return marked;
}

/*
 * Nonzero when _PAGE_SWP_SOFT_DIRTY is set in the flags of @pmd.  The
 * masked flag value itself is returned (as int), not a normalized 0/1.
 */
static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        const int masked = pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY;

        return masked;
}

/* Return a copy of @pmd with _PAGE_SWP_SOFT_DIRTY cleared from its flags. */
static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        pmd_t cleared = pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY);

        return cleared;
}
#endif
#endif

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Return a copy of @pte with _PAGE_SWP_UFFD_WP set in its flags. */
static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
{
        pte_t marked = pte_set_flags(pte, _PAGE_SWP_UFFD_WP);

        return marked;
}

/*
 * Nonzero when _PAGE_SWP_UFFD_WP is set in the flags of @pte.  The
 * masked flag value itself is returned (as int), not a normalized 0/1.
 */
static inline int pte_swp_uffd_wp(pte_t pte)
{
        const int masked = pte_flags(pte) & _PAGE_SWP_UFFD_WP;

        return masked;
}

/* Return a copy of @pte with _PAGE_SWP_UFFD_WP cleared from its flags. */
static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
{
        pte_t cleared = pte_clear_flags(pte, _PAGE_SWP_UFFD_WP);

        return cleared;
}

/* Return a copy of @pmd with _PAGE_SWP_UFFD_WP set in its flags. */
static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd)
{
        pmd_t marked = pmd_set_flags(pmd, _PAGE_SWP_UFFD_WP);

        return marked;
}

/*
 * Nonzero when _PAGE_SWP_UFFD_WP is set in the flags of @pmd.  The
 * masked flag value itself is returned (as int), not a normalized 0/1.
 */
static inline int pmd_swp_uffd_wp(pmd_t pmd)
{
        const int masked = pmd_flags(pmd) & _PAGE_SWP_UFFD_WP;

        return masked;
}

/* Return a copy of @pmd with _PAGE_SWP_UFFD_WP cleared from its flags. */
static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
{
        pmd_t cleared = pmd_clear_flags(pmd, _PAGE_SWP_UFFD_WP);

        return cleared;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/*
 * Extract the protection key from a set of PTE flags.  Without
 * CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS the result is always 0.
 */
static inline u16 pte_flags_pkey(unsigned long pte_flags)
{
        u16 pkey = 0;

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        /* ifdef to avoid doing 59-bit shift on 32-bit values */
        pkey = (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
#endif
        return pkey;
}

/*
 * Check the current PKRU value (read_pkru()) against @pkey: reads must
 * always be allowed, and writes must be allowed too when @write is set.
 */
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
        const u32 pkru = read_pkru();

        /* The write permission is only consulted for write accesses. */
        return __pkru_allows_read(pkru, pkey) &&
               (!write || __pkru_allows_write(pkru, pkey));
}

/*
 * 'pteval' may be the raw value of a PTE, PMD or PUD.  Only
 * _PAGE_PRESENT, _PAGE_USER and _PAGE_RW are examined here, and those
 * bits have the same value for all 3 entry types.
 */
static inline bool __pte_access_permitted(unsigned long pteval, bool write)
{
        /*
         * Shadow stack PTEs are Write=0,Dirty=1.  The kernel shouldn't
         * generally allow access to them, but because they are already
         * Write=0 the check below covers both cases.
         */
        const unsigned long required = write ?
                (_PAGE_PRESENT | _PAGE_USER | _PAGE_RW) :
                (_PAGE_PRESENT | _PAGE_USER);

        /* Any required bit missing from pteval means no access. */
        if (required & ~pteval)
                return false;

        return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
}

#define pte_access_permitted pte_access_permitted
/* PTE front end for __pte_access_permitted(): passes the raw pte value. */
static inline bool pte_access_permitted(pte_t pte, bool write)
{
        const unsigned long raw = pte_val(pte);

        return __pte_access_permitted(raw, write);
}

#define pmd_access_permitted pmd_access_permitted
/* PMD front end for __pte_access_permitted(): passes the raw pmd value. */
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
{
        const unsigned long raw = pmd_val(pmd);

        return __pte_access_permitted(raw, write);
}

#define pud_access_permitted pud_access_permitted
/* PUD front end for __pte_access_permitted(): passes the raw pud value. */
static inline bool pud_access_permitted(pud_t pud, bool write)
{
        const unsigned long raw = pud_val(pud);

        return __pte_access_permitted(raw, write);
}

#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);

/* True when the boot CPU is flagged with X86_BUG_L1TF. */
static inline bool arch_has_pfn_modify_check(void)
{
        const bool has_l1tf = boot_cpu_has_bug(X86_BUG_L1TF);

        return has_l1tf;
}

#define arch_check_zapped_pte arch_check_zapped_pte
void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte);

#define arch_check_zapped_pmd arch_check_zapped_pmd
void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd);

#ifdef CONFIG_XEN_PV
#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
/* Reports false when X86_FEATURE_XENPV is enabled, true otherwise. */
static inline bool arch_has_hw_nonleaf_pmd_young(void)
{
        if (cpu_feature_enabled(X86_FEATURE_XENPV))
                return false;

        return true;
}
#endif

#ifdef CONFIG_PAGE_TABLE_CHECK
/* True when @pte has both _PAGE_PRESENT and _PAGE_USER set. */
static inline bool pte_user_accessible_page(pte_t pte)
{
        if (!(pte_val(pte) & _PAGE_PRESENT))
                return false;

        return (pte_val(pte) & _PAGE_USER) != 0;
}

/* True when @pmd is a leaf entry with both _PAGE_PRESENT and _PAGE_USER set. */
static inline bool pmd_user_accessible_page(pmd_t pmd)
{
        if (!pmd_leaf(pmd))
                return false;
        if (!(pmd_val(pmd) & _PAGE_PRESENT))
                return false;

        return (pmd_val(pmd) & _PAGE_USER) != 0;
}

/* True when @pud is a leaf entry with both _PAGE_PRESENT and _PAGE_USER set. */
static inline bool pud_user_accessible_page(pud_t pud)
{
        if (!pud_leaf(pud))
                return false;
        if (!(pud_val(pud) & _PAGE_PRESENT))
                return false;

        return (pud_val(pud) & _PAGE_USER) != 0;
}
#endif

#ifdef CONFIG_X86_SGX
int arch_memory_failure(unsigned long pfn, int flags);
#define arch_memory_failure arch_memory_failure

bool arch_is_platform_page(u64 paddr);
#define arch_is_platform_page arch_is_platform_page
#endif

#endif        /* __ASSEMBLY__ */

#endif /* _ASM_X86_PGTABLE_H */


































































































































    1 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright 2019 Google LLC
 */

#ifndef __LINUX_BLK_CRYPTO_H
#define __LINUX_BLK_CRYPTO_H

#include <linux/types.h>

/*
 * Numeric identifiers for inline-encryption modes.
 * BLK_ENCRYPTION_MODE_INVALID is the first enumerator (value 0), and
 * BLK_ENCRYPTION_MODE_MAX comes last, so it equals the number of
 * enumerators that precede it.
 */
enum blk_crypto_mode_num {
        BLK_ENCRYPTION_MODE_INVALID,
        BLK_ENCRYPTION_MODE_AES_256_XTS,
        BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
        BLK_ENCRYPTION_MODE_ADIANTUM,
        BLK_ENCRYPTION_MODE_SM4_XTS,
        BLK_ENCRYPTION_MODE_MAX,
};

#define BLK_CRYPTO_MAX_KEY_SIZE                64
/**
 * struct blk_crypto_config - an inline encryption key's crypto configuration
 * @crypto_mode: encryption algorithm this key is for
 * @data_unit_size: the data unit size for all encryptions/decryptions with
 *        this key.  This is the size in bytes of each individual plaintext
 *        and ciphertext.  This is always a power of 2.  It might be e.g. the
 *        filesystem block size or the disk sector size.
 * @dun_bytes: the maximum number of bytes of DUN used when using this key
 */
struct blk_crypto_config {
        enum blk_crypto_mode_num crypto_mode;
        unsigned int data_unit_size;
        unsigned int dun_bytes;
};

/**
 * struct blk_crypto_key - an inline encryption key
 * @crypto_cfg: the crypto configuration (crypto_mode, data_unit_size and
 *                dun_bytes) for this key
 * @data_unit_size_bits: log2 of data_unit_size
 * @size: size of this key in bytes (determined by @crypto_cfg.crypto_mode)
 * @raw: the raw bytes of this key.  Only the first @size bytes are used;
 *         the array holds at most BLK_CRYPTO_MAX_KEY_SIZE bytes.
 *
 * A blk_crypto_key is immutable once created, and many bios can reference it at
 * the same time.  It must not be freed until all bios using it have completed
 * and it has been evicted from all devices on which it may have been used.
 */
struct blk_crypto_key {
        struct blk_crypto_config crypto_cfg;
        unsigned int data_unit_size_bits;
        unsigned int size;
        u8 raw[BLK_CRYPTO_MAX_KEY_SIZE];
};

#define BLK_CRYPTO_MAX_IV_SIZE                32
#define BLK_CRYPTO_DUN_ARRAY_SIZE        (BLK_CRYPTO_MAX_IV_SIZE / sizeof(u64))

/**
 * struct bio_crypt_ctx - an inline encryption context
 * @bc_key: the key, algorithm, and data unit size to use
 * @bc_dun: the data unit number (starting IV) to use, stored as
 *        BLK_CRYPTO_DUN_ARRAY_SIZE u64 words (BLK_CRYPTO_MAX_IV_SIZE bytes)
 *
 * A bio_crypt_ctx specifies that the contents of the bio will be encrypted (for
 * write requests) or decrypted (for read requests) inline by the storage device
 * or controller, or by the crypto API fallback.
 */
struct bio_crypt_ctx {
        const struct blk_crypto_key        *bc_key;
        u64                                bc_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
};

#include <linux/blk_types.h>
#include <linux/blkdev.h>

#ifdef CONFIG_BLK_INLINE_ENCRYPTION

/* True when @bio has a non-NULL bi_crypt_context attached. */
static inline bool bio_has_crypt_ctx(struct bio *bio)
{
        return bio->bi_crypt_context != NULL;
}

void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key,
                       const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
                       gfp_t gfp_mask);

bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc,
                                 unsigned int bytes,
                                 const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]);

int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
                        enum blk_crypto_mode_num crypto_mode,
                        unsigned int dun_bytes,
                        unsigned int data_unit_size);

int blk_crypto_start_using_key(struct block_device *bdev,
                               const struct blk_crypto_key *key);

void blk_crypto_evict_key(struct block_device *bdev,
                          const struct blk_crypto_key *key);

bool blk_crypto_config_supported_natively(struct block_device *bdev,
                                          const struct blk_crypto_config *cfg);
bool blk_crypto_config_supported(struct block_device *bdev,
                                 const struct blk_crypto_config *cfg);

#else /* CONFIG_BLK_INLINE_ENCRYPTION */

/*
 * Stub used when CONFIG_BLK_INLINE_ENCRYPTION is disabled: @bio is not
 * inspected and the answer is always false.
 */
static inline bool bio_has_crypt_ctx(struct bio *bio)
{
        return false;
}

#endif /* CONFIG_BLK_INLINE_ENCRYPTION */

int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask);
/**
 * bio_crypt_clone - clone bio encryption context
 * @dst: destination bio
 * @src: source bio
 * @gfp_mask: memory allocation flags
 *
 * Copies the encryption context of @src to @dst via __bio_crypt_clone().
 * When @src carries no encryption context, nothing is done.
 *
 * Return: 0 on success, -ENOMEM if out of memory.  -ENOMEM is only possible if
 *           @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM.
 */
static inline int bio_crypt_clone(struct bio *dst, struct bio *src,
                                  gfp_t gfp_mask)
{
        /* No context on the source: trivially successful. */
        if (!bio_has_crypt_ctx(src))
                return 0;

        return __bio_crypt_clone(dst, src, gfp_mask);
}

#endif /* __LINUX_BLK_CRYPTO_H */









































































































   32 
































































































































    3 










































    3 



































    3 



    1 

    2 








    3 
    2 



    1 
























    3 




    8 
    7 
    7 

    1 






    1 

    6 









    4 



    4 
    4 




    4 
    4 


























   11 
   10 


    8 

   10 

























































    6 









    6 


    6 
    6 




    6 
    2 








































    1 

    7 



















    2 

    5 






    8 












    8 


    8 




    8 

    1 

    7 

    8 




    1 




    8 




    7 
    8 


















    8 





    8 





































    6 









    2 












   16 



    3 


    9 






























   33 


















   35 
































   36 


































   33 
   27 


   35 

   34 





    7 























    3 
    3 

    2 








    3 








    3 



    3 

    3 





























    5 


























































    1 


    1 



































































    5 
    5 





































































































































































    2 






    2 




    2 












    2 
































    2 

    2 


















    2 





    2 




























































    2 



    4 









    4 


    3 


































    2 



    2 























































    2 



























    2 












































































































   22 
















    3 


   19 








    3 





























   23 
























   10 









   16 






































    7 






    5 
    2 















   22 
   22 
   22 






    8 

   18 
    3 
   17 
    3 
   11 
   11 
   20 

   15 
    5 
   17 

   21 


















    6 




    6 
    3 



    7 


    6 



   17 





   12 
   13 
   13 





   11 

   12 



   12 



















   11 


   11 














    5 






    5 








    2 





    3 








    1 










    1 
    2 










    1 

    1 






    1 
    1 





































    2 





























































































    3 















    3 

    3 



    3 



























































    9 


    9 











    2 














    8 




















    1 

























   11 






    6 

   10 




















   30 

   31 
























   32 

   30 

    3 








   26 









   28 


















































    3 








    3 










   15 
   16 


   13 
   17 


















    9 
   10 






    8 


























    8 



    9 







    9 








































   10 







































    9 



    9 
    8 














   10 



    9 




   10 


    9 
















   15 




    8 
    8 



   10 
    5 
    4 



    6 


    8 
    8 

   10 
    5 












    3 
    3 



































































































    4 
    1 

    1 


    4 




    5 
















    5 





    5 



    5 

    1 
    1 









    4 






    3 
    2 




    4 
    5 


    1 
    4 
    2 
    3 




    5 


    1 
    4 









    1 
    4 
    5 


    5 




    3 
    2 

    4 
    1 
    1 
    4 














    3 





































    5 
    6 















    1 


















































   15 





    8 




    1 
    4 




    2 










    1 


    1 





    1 





























    1 













    1 
    1 



    1 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dcache.c
 *
 * Complete reimplementation
 * (C) 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/*
 * Notes on the allocation strategy:
 *
 * The dcache is a master of the icache - whenever a dcache entry
 * exists, the inode will always exist. "iput()" is done either when
 * the dcache entry is deleted or garbage collected.
 */

#include <linux/ratelimit.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/memblock.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
#include "internal.h"
#include "mount.h"

/*
 * Usage:
 * dcache->d_inode->i_lock protects:
 *   - i_dentry, d_u.d_alias, d_inode of aliases
 * dcache_hash_bucket lock protects:
 *   - the dcache hash table
 * s_roots bl list spinlock protects:
 *   - the s_roots list (see __d_drop)
 * dentry->d_sb->s_dentry_lru_lock protects:
 *   - the dcache lru lists and counters
 * d_lock protects:
 *   - d_flags
 *   - d_name
 *   - d_lru
 *   - d_count
 *   - d_unhashed()
 *   - d_parent and d_children
 *   - children's d_sib and d_parent
 *   - d_u.d_alias, d_inode
 *
 * Ordering:
 * dentry->d_inode->i_lock
 *   dentry->d_lock
 *     dentry->d_sb->s_dentry_lru_lock
 *     dcache_hash_bucket lock
 *     s_roots lock
 *
 * If there is an ancestor relationship:
 * dentry->d_parent->...->d_parent->d_lock
 *   ...
 *     dentry->d_parent->d_lock
 *       dentry->d_lock
 *
 * If no ancestor relationship:
 * arbitrary, since it's serialized on rename_lock
 */
/* VFS cache pressure tunable; initialized to 100 and exported (GPL-only). */
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);

/*
 * Global seqlock, cacheline-aligned on SMP builds.  The locking notes above
 * rely on it to serialize d_lock ordering between unrelated dentries.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);

EXPORT_SYMBOL(rename_lock);

/* Slab cache for dentries; written during init only (__ro_after_init). */
static struct kmem_cache *dentry_cache __ro_after_init;

/* Exported constant names: "" (length 0), "/" (length 1) and ".." (length 2). */
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
const struct qstr slash_name = QSTR_INIT("/", 1);
EXPORT_SYMBOL(slash_name);
const struct qstr dotdot_name = QSTR_INIT("..", 2);
EXPORT_SYMBOL(dotdot_name);

/*
 * This is the single most critical data structure when it comes
 * to the dcache: the hashtable for lookups. Somebody should try
 * to make this good - I've just made it work.
 *
 * This hash-function tries to avoid losing too many bits of hash
 * information, yet avoid using a prime hash-size or similar.
 */

/* Right shift applied to a name hash to obtain its bucket index (see d_hash()). */
static unsigned int d_hash_shift __ro_after_init;

/* Base of the bucket array that d_hash() indexes into. */
static struct hlist_bl_head *dentry_hashtable __ro_after_init;

/* Map a name hash to its bucket: the bits left after the shift pick the slot. */
static inline struct hlist_bl_head *d_hash(unsigned int hash)
{
        unsigned int slot = hash >> d_hash_shift;

        return &dentry_hashtable[slot];
}

/* log2 of the number of buckets in in_lookup_hashtable. */
#define IN_LOOKUP_SHIFT 10
/* Fixed-size bucket array indexed by in_lookup_hash(). */
static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT];

/*
 * Pick the in-lookup bucket for a (parent, name hash) pair.  The parent
 * pointer, scaled down by the cache line size, is mixed into the hash
 * before the result is folded to IN_LOOKUP_SHIFT bits.
 */
static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent,
                                        unsigned int hash)
{
        unsigned long parent_bits = (unsigned long) parent / L1_CACHE_BYTES;
        unsigned int mixed = hash + parent_bits;

        return &in_lookup_hashtable[hash_32(mixed, IN_LOOKUP_SHIFT)];
}

/*
 * Dcache statistics record.  It consists of six longs and is exported as
 * a vector through the "dentry-state" sysctl entry, so the field order is
 * visible to user space.
 */
struct dentry_stat_t {
        long nr_dentry;                /* filled from get_nr_dentry() */
        long nr_unused;                /* filled from get_nr_dentry_unused() */
        long age_limit;                /* age in seconds */
        long want_pages;        /* pages requested by system */
        long nr_negative;        /* # of unused negative dentries */
        long dummy;                /* Reserved for future use */
};

/*
 * Per-CPU dcache counters.  nr_dentry is decremented when a dentry is
 * killed, nr_dentry_unused follows the DCACHE_LRU_LIST bit, and
 * nr_dentry_negative counts negative dentries on the per-superblock LRU.
 */
static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
static DEFINE_PER_CPU(long, nr_dentry_negative);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
/* Statistics gathering. */
static struct dentry_stat_t dentry_stat = {
        /* in seconds (see struct dentry_stat_t); all other fields start at 0 */
        .age_limit = 45,
};

/*
 * Here we resort to our own counters instead of using generic per-cpu counters
 * for consistency with what the vfs inode code does. We are expected to harvest
 * better code and performance by having our own specialized counters.
 *
 * Please note that the loop is done over all possible CPUs, not over all online
 * CPUs. The reason for this is that we don't want to play games with CPUs going
 * on and off. If one of them goes off, we will just keep their counters.
 *
 * glommer: See cffbc8a for details, and if you ever intend to change this,
 * please update all vfs counters to match.
 */
static long get_nr_dentry(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry, i);
        return sum < 0 ? 0 : sum;
}

static long get_nr_dentry_unused(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry_unused, i);
        return sum < 0 ? 0 : sum;
}

static long get_nr_dentry_negative(void)
{
        int i;
        long sum = 0;

        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry_negative, i);
        return sum < 0 ? 0 : sum;
}

/*
 * Handler for "dentry-state": refresh the three summed counters in
 * dentry_stat, then let proc_doulongvec_minmax() perform the transfer.
 */
static int proc_nr_dentry(struct ctl_table *table, int write, void *buffer,
                          size_t *lenp, loff_t *ppos)
{
        struct dentry_stat_t *stat = &dentry_stat;

        stat->nr_dentry = get_nr_dentry();
        stat->nr_unused = get_nr_dentry_unused();
        stat->nr_negative = get_nr_dentry_negative();

        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/*
 * sysctl table for "dentry-state": a read-only (0444) vector of longs backed
 * by dentry_stat and refreshed on every access by proc_nr_dentry().
 */
static struct ctl_table fs_dcache_sysctls[] = {
        {
                .procname        = "dentry-state",
                .data                = &dentry_stat,
                /*
                 * Export the whole structure.  Taking the size from the
                 * object keeps the vector length in step with
                 * struct dentry_stat_t instead of a hand-counted "6".
                 */
                .maxlen                = sizeof(dentry_stat),
                .mode                = 0444,
                .proc_handler        = proc_nr_dentry,
        },
};

/* Register fs_dcache_sysctls under "fs" at fs_initcall time; always returns 0. */
static int __init init_fs_dcache_sysctls(void)
{
        register_sysctl_init("fs", fs_dcache_sysctls);
        return 0;
}
fs_initcall(init_fs_dcache_sysctls);
#endif

/*
 * Compare 2 name strings, return 0 if they match, otherwise non-zero.
 * The strings are both count bytes long, and count is non-zero.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>
/*
 * NOTE! 'cs' comes from a dentry, so it has an
 * aligned allocation for this particular component. We don't
 * strictly need the load_unaligned_zeropad() safety, but it
 * doesn't hurt either.
 *
 * In contrast, 'ct' and 'tcount' can be from a pathname, and do
 * need the careful unaligned handling.
 */
/*
 * Word-at-a-time comparison of the first tcount bytes of cs and ct.
 * Returns 0 when they match and non-zero otherwise.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
        unsigned long a,b,mask;

        for (;;) {
                /*
                 * Both words are loaded before tcount is examined so that
                 * the partial-word compare after the loop can reuse them.
                 */
                a = read_word_at_a_time(cs);
                b = load_unaligned_zeropad(ct);
                if (tcount < sizeof(unsigned long))
                        break;
                if (unlikely(a != b))
                        return 1;
                cs += sizeof(unsigned long);
                ct += sizeof(unsigned long);
                tcount -= sizeof(unsigned long);
                /* Length was an exact multiple of the word size: all equal. */
                if (!tcount)
                        return 0;
        }
        /* Less than a word left: compare only the bytes the mask selects. */
        mask = bytemask_from_count(tcount);
        return unlikely(!!((a ^ b) & mask));
}

#else

/*
 * Byte-wise fallback: walk both strings in lock step, returning 1 at the
 * first mismatch and 0 once tcount bytes have compared equal.  The first
 * byte is always examined, so tcount must be non-zero.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
        for (;;) {
                if (*cs++ != *ct++)
                        return 1;
                if (--tcount == 0)
                        return 0;
        }
}

#endif

/*
 * Compare a dentry's name against the tcount bytes at ct; 0 means equal.
 *
 * This can run under RCU walk while a rename is in flight, so the name
 * pointer is fetched exactly once with READ_ONCE().  The length may not
 * have been sampled atomically with the pointer, and that is fine:
 *
 *  - the RCU walk checks the sequence count eventually and catches it;
 *  - the buffer is not overrun, because the pointer itself is read
 *    atomically and a dentry name is guaranteed to be NUL-terminated;
 *  - with a wrong length the compare simply fails early, since the
 *    ct/tcount data cannot contain a NUL.
 */
static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
{
        return dentry_string_cmp(READ_ONCE(dentry->d_name.name), ct, tcount);
}

/*
 * Out-of-line storage for a dentry name, found again via container_of()
 * on the name pointer.
 */
struct external_name {
        union {
                atomic_t count;                /* references held on the name */
                struct rcu_head head;        /* used by kfree_rcu() after the last put */
        } u;
        unsigned char name[];
};

/* Recover the struct external_name whose name[] holds this dentry's name. */
static inline struct external_name *external_name(struct dentry *dentry)
{
        struct external_name *ext;

        ext = container_of(dentry->d_name.name, struct external_name, name[0]);
        return ext;
}

/* RCU callback: hand the dentry that embeds @head back to dentry_cache. */
static void __d_free(struct rcu_head *head)
{
        kmem_cache_free(dentry_cache,
                        container_of(head, struct dentry, d_u.d_rcu));
}

/*
 * RCU callback for a dentry with an external name: release the name
 * storage first, then the dentry itself.
 */
static void __d_free_external(struct rcu_head *head)
{
        struct dentry *victim;
        struct external_name *ext;

        victim = container_of(head, struct dentry, d_u.d_rcu);
        ext = external_name(victim);

        kfree(ext);
        kmem_cache_free(dentry_cache, victim);
}

/*
 * Non-zero when the dentry's name pointer does not point at its own
 * inline d_iname buffer, i.e. the name is stored out of line.
 */
static inline int dname_external(const struct dentry *dentry)
{
        return dentry->d_name.name != dentry->d_iname;
}

/*
 * Capture the current name of @dentry into @name under d_lock.  An inline
 * name is copied into the snapshot's own buffer; an external name is
 * shared by taking a reference on it.  Pair with
 * release_dentry_name_snapshot().
 */
void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        name->name = dentry->d_name;
        if (likely(!dname_external(dentry))) {
                /* Copy len + 1 bytes and point the snapshot at its own buffer. */
                memcpy(name->inline_name, dentry->d_iname,
                       dentry->d_name.len + 1);
                name->name.name = name->inline_name;
        } else {
                /* Keep the external storage alive for the snapshot's lifetime. */
                atomic_inc(&external_name(dentry)->u.count);
        }
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(take_dentry_name_snapshot);

/*
 * Drop a snapshot taken by take_dentry_name_snapshot().  Only snapshots
 * that reference an external name hold anything; the last reference frees
 * the name after an RCU grace period.
 */
void release_dentry_name_snapshot(struct name_snapshot *name)
{
        struct external_name *ext;

        /* Inline snapshots own no external storage. */
        if (likely(name->name.name == name->inline_name))
                return;

        ext = container_of(name->name.name, struct external_name, name[0]);
        if (unlikely(atomic_dec_and_test(&ext->u.count)))
                kfree_rcu(ext, u.head);
}
EXPORT_SYMBOL(release_dentry_name_snapshot);

/*
 * Attach @inode to @dentry and replace the DCACHE_ENTRY_TYPE bits of
 * d_flags with @type_flags.  d_inode is written first; the new flags are
 * then published with a release store.
 */
static inline void __d_set_inode_and_type(struct dentry *dentry,
                                          struct inode *inode,
                                          unsigned type_flags)
{
        unsigned new_flags;

        dentry->d_inode = inode;
        new_flags = (READ_ONCE(dentry->d_flags) & ~DCACHE_ENTRY_TYPE) | type_flags;
        smp_store_release(&dentry->d_flags, new_flags);
}

static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
        unsigned flags = READ_ONCE(dentry->d_flags);

        flags &= ~DCACHE_ENTRY_TYPE;
        WRITE_ONCE(dentry->d_flags, flags);
        dentry->d_inode = NULL;
        if (flags & DCACHE_LRU_LIST)
                this_cpu_inc(nr_dentry_negative);
}

static void dentry_free(struct dentry *dentry)
{
        WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
        if (unlikely(dname_external(dentry))) {
                struct external_name *p = external_name(dentry);
                if (likely(atomic_dec_and_test(&p->u.count))) {
                        call_rcu(&dentry->d_u.d_rcu, __d_free_external);
                        return;
                }
        }
        /* if dentry was never visible to RCU, immediate free is OK */
        if (dentry->d_flags & DCACHE_NORCU)
                __d_free(&dentry->d_u.d_rcu);
        else
                call_rcu(&dentry->d_u.d_rcu, __d_free);
}

/*
 * Release the dentry's inode, using the filesystem
 * d_iput() operation if defined.
 */
static void dentry_unlink_inode(struct dentry * dentry)
        __releases(dentry->d_lock)
        __releases(dentry->d_inode->i_lock)
{
        /* Saved up front: dentry->d_inode is cleared below. */
        struct inode *inode = dentry->d_inode;

        /* The switch to negative happens inside a d_seq write section. */
        raw_write_seqcount_begin(&dentry->d_seq);
        __d_clear_type_and_inode(dentry);
        hlist_del_init(&dentry->d_u.d_alias);
        raw_write_seqcount_end(&dentry->d_seq);
        /* Both locks were taken by the caller (see the annotations above). */
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);
        /* No links left on the inode: notify fsnotify. */
        if (!inode->i_nlink)
                fsnotify_inoderemove(inode);
        /* Let the filesystem drop the inode if it provides ->d_iput(). */
        if (dentry->d_op && dentry->d_op->d_iput)
                dentry->d_op->d_iput(dentry, inode);
        else
                iput(inode);
}

/*
 * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
 * is in use - which includes both the "real" per-superblock
 * LRU list _and_ the DCACHE_SHRINK_LIST use.
 *
 * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
 * on the shrink list (ie not on the superblock LRU list).
 *
 * The per-cpu "nr_dentry_unused" counters are updated with
 * the DCACHE_LRU_LIST bit.
 *
 * The per-cpu "nr_dentry_negative" counters are only updated
 * when deleted from or added to the per-superblock LRU list, not
 * from/to the shrink list. That is to avoid an unneeded dec/inc
 * pair when moving from LRU to shrink list in select_collect().
 *
 * These helper functions make sure we always follow the
 * rules. d_lock must be held by the caller.
 */
/* Warn once if the LRU/shrink-list bits of d_flags are not exactly @x. */
#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
/*
 * Put @dentry on its superblock's LRU.  Expects neither list bit to be set;
 * bumps the unused counter and, for a negative dentry, the negative counter.
 */
static void d_lru_add(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, 0);
        dentry->d_flags |= DCACHE_LRU_LIST;
        this_cpu_inc(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_inc(nr_dentry_negative);
        /* Warn if the list code reports that nothing was added. */
        WARN_ON_ONCE(!list_lru_add_obj(
                        &dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

/*
 * Take @dentry off its superblock's LRU.  Expects DCACHE_LRU_LIST set and
 * DCACHE_SHRINK_LIST clear; undoes the accounting done by d_lru_add().
 */
static void d_lru_del(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags &= ~DCACHE_LRU_LIST;
        this_cpu_dec(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);
        /* Warn if the list code reports that nothing was removed. */
        WARN_ON_ONCE(!list_lru_del_obj(
                        &dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

/*
 * Remove @dentry from a shrink list.  Expects both list bits set and clears
 * both.  Only the unused counter is adjusted; the negative counter is not
 * touched on this path.
 */
static void d_shrink_del(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
        list_del_init(&dentry->d_lru);
        dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
        this_cpu_dec(nr_dentry_unused);
}

/*
 * Add @dentry to the private shrink list @list.  Expects neither list bit
 * set and sets both.  Only the unused counter is adjusted.
 */
static void d_shrink_add(struct dentry *dentry, struct list_head *list)
{
        D_FLAG_VERIFY(dentry, 0);
        list_add(&dentry->d_lru, list);
        dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
        this_cpu_inc(nr_dentry_unused);
}

/*
 * These can only be called under the global LRU lock, ie during the
 * callback for freeing the LRU list. "isolate" removes it from the
 * LRU lists entirely, while shrink_move moves it to the indicated
 * private list.
 */
/*
 * Drop @dentry from the LRU entirely: same flag and counter updates as
 * d_lru_del(), but the list removal goes through list_lru_isolate() on the
 * list_lru_one supplied by the caller.
 */
static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags &= ~DCACHE_LRU_LIST;
        this_cpu_dec(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);
        list_lru_isolate(lru, &dentry->d_lru);
}

/*
 * Move @dentry from the LRU to the private list @list.  DCACHE_LRU_LIST
 * stays set and DCACHE_SHRINK_LIST is added, so the unused counter is left
 * alone.  The negative counter is dropped because the dentry leaves the
 * per-superblock LRU.
 */
static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
                              struct list_head *list)
{
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags |= DCACHE_SHRINK_LIST;
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);
        list_lru_isolate_move(lru, &dentry->d_lru, list);
}

/*
 * Unlink @dentry from whichever hash chain it is on, under that chain's
 * bit lock.
 */
static void ___d_drop(struct dentry *dentry)
{
        struct hlist_bl_head *bucket;

        /*
         * Hashed dentries normally live on the dentry hashtable.  The
         * exception is those newly allocated by d_obtain_root, which are
         * always IS_ROOT and sit on the superblock's s_roots list.
         */
        bucket = unlikely(IS_ROOT(dentry)) ? &dentry->d_sb->s_roots
                                           : d_hash(dentry->d_name.hash);

        hlist_bl_lock(bucket);
        __hlist_bl_del(&dentry->d_hash);
        hlist_bl_unlock(bucket);
}

/*
 * Unhash @dentry if it is currently hashed, mark it unhashed and
 * invalidate its d_seq.  Does nothing for an already unhashed dentry.
 */
void __d_drop(struct dentry *dentry)
{
        if (d_unhashed(dentry))
                return;

        ___d_drop(dentry);
        /* ___d_drop() does not mark the dentry unhashed; do that here. */
        dentry->d_hash.pprev = NULL;
        write_seqcount_invalidate(&dentry->d_seq);
}
EXPORT_SYMBOL(__d_drop);

/**
 * d_drop - drop a dentry
 * @dentry: dentry to drop
 *
 * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
 * be found through a VFS lookup any more. Note that this is different from
 * deleting the dentry - d_delete will try to mark the dentry negative if
 * possible, giving a successful _negative_ lookup, while d_drop will
 * just make the cache lookup fail.
 *
 * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
 * reason (NFS timeouts or autofs deletes).
 *
 * __d_drop requires dentry->d_lock
 *
 * ___d_drop doesn't mark dentry as "unhashed"
 * (dentry->d_hash.pprev will be LIST_POISON2, not NULL).
 */
void d_drop(struct dentry *dentry)
{
        /* __d_drop() must be called with d_lock held. */
        spin_lock(&dentry->d_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(d_drop);

/*
 * Mark @dentry as killed and unlink it from its sibling list, leaving
 * ->d_sib.next pointing at the first following sibling that is not a
 * cursor (or NULL).
 */
static inline void dentry_unlist(struct dentry *dentry)
{
        struct dentry *next;
        /*
         * Inform d_walk() and shrink_dentry_list() that we are no longer
         * attached to the dentry tree
         */
        dentry->d_flags |= DCACHE_DENTRY_KILLED;
        /* Not on any sibling list: nothing to unlink. */
        if (unlikely(hlist_unhashed(&dentry->d_sib)))
                return;
        __hlist_del(&dentry->d_sib);
        /*
         * Cursors can move around the list of children.  While we'd been
         * a normal list member, it didn't matter - ->d_sib.next would've
         * been updated.  However, from now on it won't be and for the
         * things like d_walk() it might end up with a nasty surprise.
         * Normally d_walk() doesn't care about cursors moving around -
         * ->d_lock on parent prevents that and since a cursor has no children
         * of its own, we get through it without ever unlocking the parent.
         * There is one exception, though - if we ascend from a child that
         * gets killed as soon as we unlock it, the next sibling is found
         * using the value left in its ->d_sib.next.  And if _that_
         * pointed to a cursor, and cursor got moved (e.g. by lseek())
         * before d_walk() regains parent->d_lock, we'll end up skipping
         * everything the cursor had been moved past.
         *
         * Solution: make sure that the pointer left behind in ->d_sib.next
         * points to something that won't be moving around.  I.e. skip the
         * cursors.
         */
        while (dentry->d_sib.next) {
                next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib);
                if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
                        break;
                /* Hop over the cursor; only our stale forward pointer changes. */
                dentry->d_sib.next = next->d_sib.next;
        }
}

/*
 * Tear down @dentry.  Entered with dentry->d_lock held (and, when the
 * dentry has an inode, that inode's i_lock, which dentry_unlink_inode()
 * releases).  The locks are dropped on the way.
 *
 * Returns the parent, with its d_lock still held, if dropping our
 * reference brought the parent's count to zero; returns NULL otherwise
 * (including when @dentry has no parent).
 */
static struct dentry *__dentry_kill(struct dentry *dentry)
{
        struct dentry *parent = NULL;
        bool can_free = true;

        /*
         * The dentry is now unrecoverably dead to the world.
         */
        lockref_mark_dead(&dentry->d_lockref);

        /*
         * inform the fs via d_prune that this dentry is about to be
         * unhashed and destroyed.
         */
        if (dentry->d_flags & DCACHE_OP_PRUNE)
                dentry->d_op->d_prune(dentry);

        /* On the superblock LRU (not a shrink list): take it off here. */
        if (dentry->d_flags & DCACHE_LRU_LIST) {
                if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
                        d_lru_del(dentry);
        }
        /* if it was on the hash then remove it */
        __d_drop(dentry);
        /* Either branch releases dentry->d_lock. */
        if (dentry->d_inode)
                dentry_unlink_inode(dentry);
        else
                spin_unlock(&dentry->d_lock);
        this_cpu_dec(nr_dentry);
        if (dentry->d_op && dentry->d_op->d_release)
                dentry->d_op->d_release(dentry);

        cond_resched();
        /* now that it's negative, ->d_parent is stable */
        if (!IS_ROOT(dentry)) {
                parent = dentry->d_parent;
                spin_lock(&parent->d_lock);
        }
        /* Parent first, then child, matching the documented lock ordering. */
        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        dentry_unlist(dentry);
        /*
         * Still on somebody's shrink list: leave the freeing to
         * shrink_dentry_list(), which checks DCACHE_DENTRY_KILLED.
         */
        if (dentry->d_flags & DCACHE_SHRINK_LIST)
                can_free = false;
        spin_unlock(&dentry->d_lock);
        if (likely(can_free))
                dentry_free(dentry);
        /* Drop the reference held on the parent. */
        if (parent && --parent->d_lockref.count) {
                spin_unlock(&parent->d_lock);
                return NULL;
        }
        return parent;
}

/*
 * Lock a dentry for feeding it to __dentry_kill().
 * Called under rcu_read_lock() and dentry->d_lock; the former
 * guarantees that nothing we access will be freed under us.
 * Note that dentry is *not* protected from concurrent dentry_kill(),
 * d_delete(), etc.
 *
 * Return false if dentry is busy.  Otherwise, return true and have
 * that dentry's inode locked.
 */

static bool lock_for_kill(struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;

        /* Somebody holds a reference: busy. */
        if (unlikely(dentry->d_lockref.count))
                return false;

        /* Fast path: no inode to lock, or i_lock taken without dropping d_lock. */
        if (!inode || likely(spin_trylock(&inode->i_lock)))
                return true;

        /*
         * i_lock ranks above d_lock in the documented ordering, so drop
         * d_lock, take both in order, and check that d_inode did not change
         * while d_lock was released.  Retry with the new inode if it did.
         */
        do {
                spin_unlock(&dentry->d_lock);
                spin_lock(&inode->i_lock);
                spin_lock(&dentry->d_lock);
                if (likely(inode == dentry->d_inode))
                        break;
                spin_unlock(&inode->i_lock);
                inode = dentry->d_inode;
        } while (inode);
        /* d_lock was dropped above, so the refcount has to be rechecked. */
        if (likely(!dentry->d_lockref.count))
                return true;
        if (inode)
                spin_unlock(&inode->i_lock);
        return false;
}

/*
 * Decide if dentry is worth retaining.  Usually this is called with dentry
 * locked; if not locked, we are more limited and might not be able to tell
 * without a lock.  False in this case means "punt to locked path and recheck".
 *
 * In case we aren't locked, these predicates are not "stable". However, it is
 * sufficient that at some point after we dropped the reference the dentry was
 * hashed and the flags had the proper value. Other dentry users may have
 * re-gotten a reference to the dentry and change that, but our work is done -
 * we can leave the dentry around with a zero refcount.
 */
static inline bool retain_dentry(struct dentry *dentry, bool locked)
{
        unsigned int d_flags;

        /*
         * NOTE(review): presumably orders the d_flags load below after the
         * caller's refcount update - confirm against the lockref code.
         */
        smp_rmb();
        d_flags = READ_ONCE(dentry->d_flags);

        // Unreachable? Nobody would be able to look it up, no point retaining
        if (unlikely(d_unhashed(dentry)))
                return false;

        // Same if it's disconnected
        if (unlikely(d_flags & DCACHE_DISCONNECTED))
                return false;

        // ->d_delete() might tell us not to bother, but that requires
        // ->d_lock; can't decide without it
        if (unlikely(d_flags & DCACHE_OP_DELETE)) {
                if (!locked || dentry->d_op->d_delete(dentry))
                        return false;
        }

        // Explicitly told not to bother
        if (unlikely(d_flags & DCACHE_DONTCACHE))
                return false;

        // At this point it looks like we ought to keep it.  We also might
        // need to do something - put it on LRU if it wasn't there already
        // and mark it referenced if it was on LRU, but not marked yet.
        // Unfortunately, both actions require ->d_lock, so in lockless
        // case we'd have to punt rather than doing those.
        if (unlikely(!(d_flags & DCACHE_LRU_LIST))) {
                if (!locked)
                        return false;
                d_lru_add(dentry);
        } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) {
                if (!locked)
                        return false;
                dentry->d_flags |= DCACHE_REFERENCED;
        }
        // Keep it: already on the LRU and marked referenced, or just made so
        return true;
}

/*
 * Set DCACHE_DONTCACHE on every alias of @inode and I_DONTCACHE on the
 * inode itself, all under i_lock.  retain_dentry() refuses to keep
 * dentries that carry DCACHE_DONTCACHE.
 */
void d_mark_dontcache(struct inode *inode)
{
        struct dentry *de;

        spin_lock(&inode->i_lock);
        hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) {
                /* d_flags is protected by d_lock. */
                spin_lock(&de->d_lock);
                de->d_flags |= DCACHE_DONTCACHE;
                spin_unlock(&de->d_lock);
        }
        inode->i_state |= I_DONTCACHE;
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_mark_dontcache);

/*
 * Try to do a lockless dput(), and return whether that was successful.
 *
 * If unsuccessful, we return false, having already taken the dentry lock.
 * In that case refcount is guaranteed to be zero and we have already
 * decided that it's not worth keeping around.
 *
 * The caller needs to hold the RCU read lock, so that the dentry is
 * guaranteed to stay around even if the refcount goes down to zero!
 */
static inline bool fast_dput(struct dentry *dentry)
{
        int ret;

        /*
         * try to decrement the lockref optimistically.
         */
        ret = lockref_put_return(&dentry->d_lockref);

        /*
         * If the lockref_put_return() failed due to the lock being held
         * by somebody else, the fast path has failed. We will need to
         * get the lock, and then check the count again.
         */
        if (unlikely(ret < 0)) {
                spin_lock(&dentry->d_lock);
                /* A count that is already <= 0 is a bug: warn and back out. */
                if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
                        spin_unlock(&dentry->d_lock);
                        return true;
                }
                /* Do the decrement by hand now that d_lock is held. */
                dentry->d_lockref.count--;
                goto locked;
        }

        /*
         * If we weren't the last ref, we're done.
         */
        if (ret)
                return true;

        /*
         * Can we decide that decrement of refcount is all we needed without
         * taking the lock?  There's a very common case when it's all we need -
         * dentry looks like it ought to be retained and there's nothing else
         * to do.
         */
        if (retain_dentry(dentry, false))
                return true;

        /*
         * Either not worth retaining or we can't tell without the lock.
         * Get the lock, then.  We've already decremented the refcount to 0,
         * but we'll need to re-check the situation after getting the lock.
         */
        spin_lock(&dentry->d_lock);

        /*
         * Did somebody else grab a reference to it in the meantime, and
         * we're no longer the last user after all? Alternatively, somebody
         * else could have killed it and marked it dead. Either way, we
         * don't need to do anything else.
         */
locked:
        if (dentry->d_lockref.count || retain_dentry(dentry, true)) {
                spin_unlock(&dentry->d_lock);
                return true;
        }
        /* Count is zero and it is not worth keeping; d_lock stays held. */
        return false;
}


/* 
 * This is dput
 *
 * This is complicated by the fact that we do not want to put
 * dentries that are no longer on any hash chain on the unused
 * list: we'd much rather just get rid of them immediately.
 *
 * However, that implies that we have to traverse the dentry
 * tree upwards to the parents which might _also_ now be
 * scheduled for deletion (it may have been only waiting for
 * its last child to go away).
 *
 * This tail recursion is done by hand as we don't want to depend
 * on the compiler to always get this right (gcc generally doesn't).
 * Real recursion would eat up our stack space.
 */

/*
 * dput - release a dentry
 * @dentry: dentry to release 
 *
 * Release a dentry. This will drop the usage count and if appropriate
 * call the dentry unlink method as well as removing it from the queues and
 * releasing its resources. If the parent dentries were scheduled for release
 * they too may now get deleted.
 */
void dput(struct dentry *dentry)
{
        /* A NULL dentry is accepted and ignored. */
        if (!dentry)
                return;
        might_sleep();
        rcu_read_lock();
        if (likely(fast_dput(dentry))) {
                rcu_read_unlock();
                return;
        }
        /*
         * fast_dput() returned false: d_lock is held and the count is zero.
         * Kill the dentry, then walk up for as long as __dentry_kill() hands
         * back a parent (locked) whose count also dropped to zero.
         */
        while (lock_for_kill(dentry)) {
                rcu_read_unlock();
                dentry = __dentry_kill(dentry);
                if (!dentry)
                        return;
                /* The parent is worth keeping: stop here. */
                if (retain_dentry(dentry, true)) {
                        spin_unlock(&dentry->d_lock);
                        return;
                }
                rcu_read_lock();
        }
        /* lock_for_kill() found the dentry busy; just drop d_lock. */
        rcu_read_unlock();
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(dput);

/*
 * Make sure @dentry is on a shrink list: if it is not on one already, pull
 * it off the superblock LRU when necessary and add it to @list.
 */
static void to_shrink_list(struct dentry *dentry, struct list_head *list)
__must_hold(&dentry->d_lock)
{
        /* Already on some shrink list: leave it where it is. */
        if (dentry->d_flags & DCACHE_SHRINK_LIST)
                return;

        if (dentry->d_flags & DCACHE_LRU_LIST)
                d_lru_del(dentry);
        d_shrink_add(dentry, list);
}

void dput_to_list(struct dentry *dentry, struct list_head *list)
{
        rcu_read_lock();
        if (likely(fast_dput(dentry))) {
                rcu_read_unlock();
                return;
        }
        rcu_read_unlock();
        to_shrink_list(dentry, list);
        spin_unlock(&dentry->d_lock);
}

/*
 * Return the parent of @dentry with a reference taken on it.  A lockless
 * attempt is made first; if that fails, the parent's d_lock is used.
 */
struct dentry *dget_parent(struct dentry *dentry)
{
        int gotref;
        struct dentry *ret;
        unsigned seq;

        /*
         * Do optimistic parent lookup without any
         * locking.
         */
        rcu_read_lock();
        seq = raw_seqcount_begin(&dentry->d_seq);
        ret = READ_ONCE(dentry->d_parent);
        gotref = lockref_get_not_zero(&ret->d_lockref);
        rcu_read_unlock();
        if (likely(gotref)) {
                /* d_seq unchanged: the parent we pinned is the right one. */
                if (!read_seqcount_retry(&dentry->d_seq, seq))
                        return ret;
                /* d_seq changed under us: undo and take the locked path. */
                dput(ret);
        }

repeat:
        /*
         * Don't need rcu_dereference because we re-check it was correct under
         * the lock.
         */
        rcu_read_lock();
        ret = dentry->d_parent;
        spin_lock(&ret->d_lock);
        /* The parent changed before we got its lock: try again. */
        if (unlikely(ret != dentry->d_parent)) {
                spin_unlock(&ret->d_lock);
                rcu_read_unlock();
                goto repeat;
        }
        rcu_read_unlock();
        /* A parent reached here is expected to have a non-zero count. */
        BUG_ON(!ret->d_lockref.count);
        ret->d_lockref.count++;
        spin_unlock(&ret->d_lock);
        return ret;
}
EXPORT_SYMBOL(dget_parent);

/*
 * Take a reference on the first alias of @inode, if there is one, and
 * return it; NULL when the alias list is empty.  The exported caller in
 * this file wraps the call in i_lock.
 */
static struct dentry * __d_find_any_alias(struct inode *inode)
{
        struct dentry *alias = NULL;

        if (!hlist_empty(&inode->i_dentry)) {
                alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
                lockref_get(&alias->d_lockref);
        }
        return alias;
}

/**
 * d_find_any_alias - find any alias for a given inode
 * @inode: inode to find an alias for
 *
 * If any aliases exist for the given inode, take and return a
 * reference for one of them.  If no aliases exist, return %NULL.
 */
struct dentry *d_find_any_alias(struct inode *inode)
{
        struct dentry *de;

        /* The alias list is protected by i_lock. */
        spin_lock(&inode->i_lock);
        de = __d_find_any_alias(inode);
        spin_unlock(&inode->i_lock);
        return de;
}
EXPORT_SYMBOL(d_find_any_alias);

/*
 * Find an alias of @inode and return it with a reference taken.  For a
 * directory any alias will do; otherwise the first alias that is still
 * hashed is picked.  Returns NULL if nothing qualifies.
 */
static struct dentry *__d_find_alias(struct inode *inode)
{
        struct dentry *candidate;

        if (S_ISDIR(inode->i_mode))
                return __d_find_any_alias(inode);

        hlist_for_each_entry(candidate, &inode->i_dentry, d_u.d_alias) {
                bool hashed;

                /* Hashed state and refcount are both examined under d_lock. */
                spin_lock(&candidate->d_lock);
                hashed = !d_unhashed(candidate);
                if (hashed)
                        dget_dlock(candidate);
                spin_unlock(&candidate->d_lock);

                if (hashed)
                        return candidate;
        }
        return NULL;
}

/**
 * d_find_alias - grab a hashed alias of inode
 * @inode: inode in question
 *
 * If inode has a hashed alias, or is a directory and has any alias,
 * acquire the reference to alias and return it. Otherwise return NULL.
 * Notice that if inode is a directory there can be only one alias and
 * it can be unhashed only if it has no children, or if it is the root
 * of a filesystem, or if the directory was renamed and d_revalidate
 * was the first vfs operation to notice.
 *
 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
 * any other hashed alias over that one.
 */
struct dentry *d_find_alias(struct inode *inode)
{
        struct dentry *found;

        /* Unlocked peek: nothing to do when the alias list looks empty. */
        if (hlist_empty(&inode->i_dentry))
                return NULL;

        spin_lock(&inode->i_lock);
        found = __d_find_alias(inode);
        spin_unlock(&inode->i_lock);
        return found;
}
EXPORT_SYMBOL(d_find_alias);

/*
 *  Caller MUST be holding rcu_read_lock() and be guaranteed
 *  that inode won't get freed until rcu_read_unlock().
 */
/*
 * Like d_find_alias(), but no reference is taken on the dentry that is
 * returned.  Returns NULL if the inode is being freed or has no suitable
 * alias.
 */
struct dentry *d_find_alias_rcu(struct inode *inode)
{
        struct hlist_head *l = &inode->i_dentry;
        struct dentry *de = NULL;

        spin_lock(&inode->i_lock);
        // ->i_dentry and ->i_rcu are colocated, but the latter won't be
        // used without having I_FREEING set, which means no aliases left
        if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) {
                if (S_ISDIR(inode->i_mode)) {
                        // directory: take the first alias, hashed or not
                        de = hlist_entry(l->first, struct dentry, d_u.d_alias);
                } else {
                        // first hashed alias; relies on the iterator leaving
                        // de NULL when the list is exhausted
                        hlist_for_each_entry(de, l, d_u.d_alias)
                                if (!d_unhashed(de))
                                        break;
                }
        }
        spin_unlock(&inode->i_lock);
        return de;
}

/*
 *        Try to kill dentries associated with this inode.
 * WARNING: you must own a reference to inode.
 */
void d_prune_aliases(struct inode *inode)
{
        /* Private list of aliases to be killed once i_lock is dropped. */
        LIST_HEAD(dispose);
        struct dentry *dentry;

        spin_lock(&inode->i_lock);
        hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
                spin_lock(&dentry->d_lock);
                /* Only aliases nobody holds a reference to are collected. */
                if (!dentry->d_lockref.count)
                        to_shrink_list(dentry, &dispose);
                spin_unlock(&dentry->d_lock);
        }
        spin_unlock(&inode->i_lock);
        shrink_dentry_list(&dispose);
}
EXPORT_SYMBOL(d_prune_aliases);

/*
 * Kill @victim, then keep killing whatever __dentry_kill() hands back for
 * as long as lock_for_kill() succeeds on it.
 *
 * The very first step is rcu_read_unlock(), so this must be entered inside
 * an RCU read-side section; the callers in this file get here right after a
 * successful lock_for_kill() with victim->d_lock held.  On return the RCU
 * read-side section has been left.
 */
static inline void shrink_kill(struct dentry *victim)
{
        do {
                rcu_read_unlock();
                victim = __dentry_kill(victim);
                rcu_read_lock();
        } while (victim && lock_for_kill(victim));
        rcu_read_unlock();
        /*
         * A non-NULL victim here means lock_for_kill() refused it; release
         * the ->d_lock still held on it (presumably taken inside
         * __dentry_kill() before returning the dentry - verify there).
         */
        if (victim)
                spin_unlock(&victim->d_lock);
}

/*
 * Dispose of every dentry on @list (linked through ->d_lru).  Entries are
 * taken from the tail of the list until it is empty.
 */
void shrink_dentry_list(struct list_head *list)
{
        while (!list_empty(list)) {
                struct dentry *dentry;

                /* last entry on the list */
                dentry = list_entry(list->prev, struct dentry, d_lru);
                spin_lock(&dentry->d_lock);
                rcu_read_lock();
                if (!lock_for_kill(dentry)) {
                        bool can_free;
                        /*
                         * Not ours to kill: only take it off the list.  If it
                         * is already marked DCACHE_DENTRY_KILLED the final
                         * free is done here; the flag is sampled before
                         * ->d_lock is dropped.
                         */
                        rcu_read_unlock();
                        d_shrink_del(dentry);
                        can_free = dentry->d_flags & DCACHE_DENTRY_KILLED;
                        spin_unlock(&dentry->d_lock);
                        if (can_free)
                                dentry_free(dentry);
                        continue;
                }
                /* locked for kill: unlink from the list and kill it */
                d_shrink_del(dentry);
                shrink_kill(dentry);
        }
}

/*
 * list_lru walk callback used by prune_dcache_sb(): decide what happens to
 * one dentry found on the LRU.  @arg is the caller's private dispose list.
 *
 * Returns LRU_SKIP when ->d_lock cannot be taken, LRU_ROTATE when the dentry
 * was marked DCACHE_REFERENCED (the mark is cleared), and LRU_REMOVED when
 * the dentry left the LRU - either because it is in use or because it was
 * moved to the dispose list.
 */
static enum lru_status dentry_lru_isolate(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        struct dentry *dentry = container_of(item, struct dentry, d_lru);
        struct list_head *dispose = arg;
        enum lru_status verdict;

        /*
         * we are inverting the lru lock/dentry->d_lock here,
         * so use a trylock. If we fail to get the lock, just skip
         * it
         */
        if (!spin_trylock(&dentry->d_lock))
                return LRU_SKIP;

        if (dentry->d_lockref.count) {
                /*
                 * Referenced dentries are still in use. If they have active
                 * counts, just remove them from the LRU.
                 */
                d_lru_isolate(lru, dentry);
                verdict = LRU_REMOVED;
        } else if (dentry->d_flags & DCACHE_REFERENCED) {
                /*
                 * Otherwise give them another pass through the LRU.
                 *
                 * The list move itself will be made by the common LRU code
                 * once we return. By then dentry->d_lock has been dropped
                 * but the lru lock is kept. This is safe to do, since every
                 * list movement is protected by the lru lock even if both
                 * locks are held.
                 *
                 * This is guaranteed by the fact that all LRU management
                 * functions are intermediated by the LRU API calls like
                 * list_lru_add_obj and list_lru_del_obj. List movement in
                 * this file only ever occurs through these functions or
                 * through callbacks like this one, that are called from the
                 * LRU API.
                 *
                 * The only exceptions to this are functions like
                 * shrink_dentry_list, and code that first checks for the
                 * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
                 * operating only with stack provided lists after they are
                 * properly isolated from the main list.  It is thus always a
                 * local access.
                 */
                dentry->d_flags &= ~DCACHE_REFERENCED;
                verdict = LRU_ROTATE;
        } else {
                /* unused and not recently referenced: queue for disposal */
                d_lru_shrink_move(lru, dentry, dispose);
                verdict = LRU_REMOVED;
        }
        spin_unlock(&dentry->d_lock);

        return verdict;
}

/**
 * prune_dcache_sb - shrink the dcache
 * @sb: superblock
 * @sc: shrink control, passed to list_lru_shrink_walk()
 *
 * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
 * is done when we need more memory and called from the superblock shrinker
 * function.
 *
 * This function may fail to free any resources if all the dentries are in
 * use.
 */
long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
{
        LIST_HEAD(dispose);
        long freed;

        freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
                                     dentry_lru_isolate, &dispose);
        shrink_dentry_list(&dispose);
        return freed;
}

/*
 * list_lru walk callback used by shrink_dcache_sb(): move the dentry from
 * the LRU to the dispose list passed in @arg, unconditionally, provided its
 * ->d_lock can be taken without waiting.
 */
static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        struct dentry *dentry = container_of(item, struct dentry, d_lru);
        struct list_head *dispose = arg;

        /*
         * we are inverting the lru lock/dentry->d_lock here,
         * so use a trylock. If we fail to get the lock, just skip
         * it
         */
        if (spin_trylock(&dentry->d_lock)) {
                d_lru_shrink_move(lru, dentry, dispose);
                spin_unlock(&dentry->d_lock);
                return LRU_REMOVED;
        }

        return LRU_SKIP;
}


/**
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
 * Shrink the dcache for the specified super block. This is used to free
 * the dcache before unmounting a file system.
 */
void shrink_dcache_sb(struct super_block *sb)
{
        do {
                LIST_HEAD(dispose);

                list_lru_walk(&sb->s_dentry_lru,
                        dentry_lru_isolate_shrink, &dispose, 1024);
                shrink_dentry_list(&dispose);
        } while (list_lru_count(&sb->s_dentry_lru) > 0);
}
EXPORT_SYMBOL(shrink_dcache_sb);

/**
 * enum d_walk_ret - action to take during tree walk
 * @D_WALK_CONTINUE:        continue walk
 * @D_WALK_QUIT:        quit walk
 * @D_WALK_NORETRY:        quit when retry is needed
 * @D_WALK_SKIP:        skip this dentry and its children
 */
enum d_walk_ret {
        D_WALK_CONTINUE,
        D_WALK_QUIT,
        D_WALK_NORETRY,
        D_WALK_SKIP,
};

/**
 * d_walk - walk the dentry tree
 * @parent:        start of walk
 * @data:        data passed to @enter()
 * @enter:        callback when first entering the dentry
 *
 * The @enter() callbacks are called with d_lock held.
 *
 * @enter() is called for @parent first and then for every dentry found
 * below it, except those flagged DCACHE_DENTRY_CURSOR.  Its return value
 * steers the walk:
 *
 *  D_WALK_CONTINUE - keep walking;
 *  D_WALK_QUIT     - stop the walk right away;
 *  D_WALK_SKIP     - do not descend into this dentry (when returned for
 *                    @parent itself the walk ends);
 *  D_WALK_NORETRY  - keep walking, but give up instead of starting over if
 *                    a rename is detected later on.
 */
static void d_walk(struct dentry *parent, void *data,
                   enum d_walk_ret (*enter)(void *, struct dentry *))
{
        struct dentry *this_parent, *dentry;
        unsigned seq = 0;
        enum d_walk_ret ret;
        bool retry = true;

again:
        /*
         * First pass runs with seq == 0; a restart comes back here with
         * seq == 1 (see rename_retry below).
         */
        read_seqbegin_or_lock(&rename_lock, &seq);
        this_parent = parent;
        spin_lock(&this_parent->d_lock);

        ret = enter(data, this_parent);
        switch (ret) {
        case D_WALK_CONTINUE:
                break;
        case D_WALK_QUIT:
        case D_WALK_SKIP:
                /* nothing below the starting point is to be visited */
                goto out_unlock;
        case D_WALK_NORETRY:
                retry = false;
                break;
        }
repeat:
        /* start on the children of this_parent (whose d_lock we hold) */
        dentry = d_first_child(this_parent);
resume:
        hlist_for_each_entry_from(dentry, d_sib) {
                if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
                        continue;

                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);

                ret = enter(data, dentry);
                switch (ret) {
                case D_WALK_CONTINUE:
                        break;
                case D_WALK_QUIT:
                        spin_unlock(&dentry->d_lock);
                        goto out_unlock;
                case D_WALK_NORETRY:
                        retry = false;
                        break;
                case D_WALK_SKIP:
                        spin_unlock(&dentry->d_lock);
                        continue;
                }

                if (!hlist_empty(&dentry->d_children)) {
                        /*
                         * Descend: drop the old parent's lock and keep the
                         * child's, which becomes the new this_parent.  The
                         * spin_release()/spin_acquire() pair only touches
                         * the lockdep map, re-annotating the lock we keep
                         * holding from the nested subclass to subclass 0.
                         */
                        spin_unlock(&this_parent->d_lock);
                        spin_release(&dentry->d_lock.dep_map, _RET_IP_);
                        this_parent = dentry;
                        spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                        goto repeat;
                }
                spin_unlock(&dentry->d_lock);
        }
        /*
         * All done at this level ... ascend and resume the search.
         */
        /*
         * The climb runs inside an RCU read-side section; NOTE(review):
         * presumably so the dentries stepped over while ->d_lock is dropped
         * cannot be freed under us - confirm against dentry freeing rules.
         */
        rcu_read_lock();
ascend:
        if (this_parent != parent) {
                dentry = this_parent;
                this_parent = dentry->d_parent;

                /* swap the child's lock for its parent's */
                spin_unlock(&dentry->d_lock);
                spin_lock(&this_parent->d_lock);

                /* might go back up the wrong parent if we have had a rename. */
                if (need_seqretry(&rename_lock, seq))
                        goto rename_retry;
                /* go into the first sibling still alive */
                hlist_for_each_entry_continue(dentry, d_sib) {
                        if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) {
                                rcu_read_unlock();
                                goto resume;
                        }
                }
                /* no live sibling left at this level, go up once more */
                goto ascend;
        }
        /* back at the starting point: the walk is complete unless renamed */
        if (need_seqretry(&rename_lock, seq))
                goto rename_retry;
        rcu_read_unlock();

out_unlock:
        spin_unlock(&this_parent->d_lock);
        done_seqretry(&rename_lock, seq);
        return;

rename_retry:
        spin_unlock(&this_parent->d_lock);
        rcu_read_unlock();
        /* a retry is only ever requested from a pass with an even seq */
        BUG_ON(seq & 1);
        if (!retry)
                return;
        /*
         * Start over with an odd seq; NOTE(review): this presumably makes
         * read_seqbegin_or_lock() take rename_lock rather than just sample
         * it, so the second pass cannot be disturbed - see seqlock.h.
         */
        seq = 1;
        goto again;
}

/* d_walk() payload for path_has_submounts() / path_check_mount(). */
struct check_mount {
        struct vfsmount *mnt;        /* mount paired with each visited dentry */
        unsigned int mounted;        /* set to 1 once a mountpoint is found */
};

static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
        struct check_mount *info = data;
        struct path path = { .mnt = info->mnt, .dentry = dentry };

        if (likely(!d_mountpoint(dentry)))
                return D_WALK_CONTINUE;
        if (__path_is_mountpoint(&path)) {
                info->mounted = 1;
                return D_WALK_QUIT;
        }
        return D_WALK_CONTINUE;
}

/**
 * path_has_submounts - check for mounts over a dentry in the
 *                      current namespace.
 * @parent: path to check.
 *
 * Return true if the parent or its subdirectories contain
 * a mount point in the current namespace.
 */
int path_has_submounts(const struct path *parent)
{
        struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };

        read_seqlock_excl(&mount_lock);
        d_walk(parent->dentry, &data, path_check_mount);
        read_sequnlock_excl(&mount_lock);

        return data.mounted;
}
EXPORT_SYMBOL(path_has_submounts);

/*
 * Called by mount code to set a mountpoint and check if the mountpoint is
 * reachable (e.g. NFS can unhash a directory dentry and then the complete
 * subtree can become unreachable).
 *
 * Only one of d_invalidate() and d_set_mounted() must succeed.  For
 * this reason take rename_lock and d_lock on dentry and ancestors.
 */
int d_set_mounted(struct dentry *dentry)
{
        struct dentry *p;
        int ret = -ENOENT;
        write_seqlock(&rename_lock);
        for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
                /* Need exclusion wrt. d_invalidate() */
                spin_lock(&p->d_lock);
                if (unlikely(d_unhashed(p))) {
                        spin_unlock(&p->d_lock);
                        goto out;
                }
                spin_unlock(&p->d_lock);
        }
        spin_lock(&dentry->d_lock);
        if (!d_unlinked(dentry)) {
                ret = -EBUSY;
                if (!d_mountpoint(dentry)) {
                        dentry->d_flags |= DCACHE_MOUNTED;
                        ret = 0;
                }
        }
         spin_unlock(&dentry->d_lock);
out:
        write_sequnlock(&rename_lock);
        return ret;
}

/*
 * Search the dentry child list of the specified parent,
 * and move any unused dentries to the end of the unused
 * list for prune_dcache(). We descend to the next level
 * whenever the d_children list is non-empty and continue
 * searching.
 *
 * It returns zero iff there are no unused children,
 * otherwise it returns the number of children moved to
 * the end of the unused list. This may not be the total
 * number of unused children, because select_parent can
 * drop the lock and return early due to latency
 * constraints.
 *
 * NOTE(review): the text above names select_parent() and prune_dcache(),
 * neither of which is defined near here; it looks like it predates the
 * d_walk() callbacks select_collect()/select_collect2() that fill in the
 * structure below - confirm and refresh.
 */

/* State shared between shrink_dcache_parent() and its d_walk() callbacks. */
struct select_data {
        struct dentry *start;        /* root of the walk; never collected itself */
        union {
                long found;        /* select_collect(): count of candidates seen */
                struct dentry *victim;        /* select_collect2(): dentry picked for killing */
        };
        struct list_head dispose;        /* dentries gathered via to_shrink_list() */
};

/*
 * d_walk() callback, first pass of shrink_dcache_parent(): count dentries
 * that are already on somebody's shrink list or have a non-positive
 * reference count, and move those with a zero count to data->dispose.
 */
static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
{
        struct select_data *sel = _data;

        /* the starting point itself is never a candidate */
        if (sel->start == dentry)
                return D_WALK_CONTINUE;

        if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                sel->found++;
        } else if (dentry->d_lockref.count <= 0) {
                /* only a count of exactly zero is ours to collect */
                if (dentry->d_lockref.count == 0)
                        to_shrink_list(dentry, &sel->dispose);
                sel->found++;
        }

        if (list_empty(&sel->dispose))
                return D_WALK_CONTINUE;

        /*
         * We can return to the caller if we have found some (this
         * ensures forward progress). We'll be coming back to find
         * the rest.
         */
        return need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
}

/*
 * d_walk() callback, second pass of shrink_dcache_parent(): collect
 * zero-refcount dentries as before, but when one of them is already on a
 * shrink list, hand it back in data->victim and quit the walk.
 *
 * Note that in the victim case this returns with rcu_read_lock() taken;
 * the caller is responsible for the matching unlock.
 */
static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
{
        struct select_data *sel = _data;

        /* the starting point itself is never a candidate */
        if (sel->start == dentry)
                return D_WALK_CONTINUE;

        if (dentry->d_lockref.count == 0) {
                if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                        rcu_read_lock();
                        sel->victim = dentry;
                        return D_WALK_QUIT;
                }
                to_shrink_list(dentry, &sel->dispose);
        }

        if (list_empty(&sel->dispose))
                return D_WALK_CONTINUE;

        /*
         * We can return to the caller if we have found some (this
         * ensures forward progress). We'll be coming back to find
         * the rest.
         */
        return need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
}

/**
 * shrink_dcache_parent - prune dcache
 * @parent: parent of entries to prune
 *
 * Prune the dcache to remove unused children of the parent dentry.
 *
 * Loops until a walk with select_collect() finds no candidates at all.
 */
void shrink_dcache_parent(struct dentry *parent)
{
        for (;;) {
                struct select_data data = {.start = parent};

                INIT_LIST_HEAD(&data.dispose);
                /* pass 1: gather unused dentries and count the candidates */
                d_walk(parent, &data, select_collect);

                if (!list_empty(&data.dispose)) {
                        /* got some - dispose of them and start over */
                        shrink_dentry_list(&data.dispose);
                        continue;
                }

                cond_resched();
                /* nothing gathered and nothing counted: we are done */
                if (!data.found)
                        break;
                /*
                 * Candidates exist but none could be gathered.  ->victim
                 * shares storage with ->found, so this also resets the count.
                 */
                data.victim = NULL;
                /* pass 2: may pick a dentry sitting on another shrink list */
                d_walk(parent, &data, select_collect2);
                if (data.victim) {
                        /*
                         * select_collect2() returned with rcu_read_lock()
                         * taken when it set ->victim; it is dropped either
                         * right here or inside shrink_kill().
                         */
                        spin_lock(&data.victim->d_lock);
                        if (!lock_for_kill(data.victim)) {
                                spin_unlock(&data.victim->d_lock);
                                rcu_read_unlock();
                        } else {
                                shrink_kill(data.victim);
                        }
                }
                /* pass 2 may have gathered ordinary unused dentries as well */
                if (!list_empty(&data.dispose))
                        shrink_dentry_list(&data.dispose);
        }
}
EXPORT_SYMBOL(shrink_dcache_parent);

/*
 * d_walk() callback used by do_one_tree(): warn about every childless
 * dentry that is still around at this point.  @_data is the root of the
 * tree being torn down; it is exempt while its reference count is 1.
 * Always lets the walk continue.
 */
static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
{
        struct dentry *root = _data;
        unsigned long ino;

        /* it has busy descendents; complain about those instead */
        if (!hlist_empty(&dentry->d_children))
                return D_WALK_CONTINUE;

        /* root with refcount 1 is fine */
        if (dentry == root && dentry->d_lockref.count == 1)
                return D_WALK_CONTINUE;

        /* negative dentries are reported with inode number 0 */
        ino = dentry->d_inode ? dentry->d_inode->i_ino : 0UL;

        WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} "
                        " still in use (%d) [unmount of %s %s]\n",
             dentry, ino, dentry,
             dentry->d_lockref.count,
             dentry->d_sb->s_type->name,
             dentry->d_sb->s_id);

        return D_WALK_CONTINUE;
}

/*
 * Tear down the tree rooted at @dentry: prune its unused descendants,
 * warn (via umount_check()) about anything left over, then unhash the
 * root and drop a reference to it.
 */
static void do_one_tree(struct dentry *dentry)
{
        shrink_dcache_parent(dentry);
        /* the root is passed as callback data so umount_check() can exempt it */
        d_walk(dentry, dentry, umount_check);
        d_drop(dentry);
        dput(dentry);
}

/*
 * destroy the dentries attached to a superblock on unmounting
 *
 * sb->s_root is detached from the superblock and torn down first, then
 * every dentry still linked on sb->s_roots, one at a time.  Warns if
 * s_umount turns out not to be held.
 */
void shrink_dcache_for_umount(struct super_block *sb)
{
        struct dentry *root;

        WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");

        root = sb->s_root;
        sb->s_root = NULL;
        do_one_tree(root);

        while (!hlist_bl_empty(&sb->s_roots)) {
                struct hlist_bl_node *first = hlist_bl_first(&sb->s_roots);
                struct dentry *anon = hlist_bl_entry(first, struct dentry, d_hash);

                /* do_one_tree() ends with a dput(), hence the extra reference */
                do_one_tree(dget(anon));
        }
}

/*
 * d_walk() callback for d_invalidate(): stop at the first dentry flagged
 * as a mountpoint and return it, with a reference taken, through the
 * struct dentry pointer that @_data points to.
 */
static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
{
        struct dentry **victim = _data;

        if (!d_mountpoint(dentry))
                return D_WALK_CONTINUE;

        *victim = dget_dlock(dentry);
        return D_WALK_QUIT;
}

/**
 * d_invalidate - detach submounts, prune dcache, and drop
 * @dentry: dentry to invalidate (aka detach, prune and drop)
 *
 * Does nothing if @dentry is already unhashed.  Otherwise the dentry is
 * unhashed and, unless it is negative, its subtree is pruned and every
 * mountpoint found below it is detached.
 */
void d_invalidate(struct dentry *dentry)
{
        bool had_submounts = false;
        struct dentry *victim;

        spin_lock(&dentry->d_lock);
        if (d_unhashed(dentry)) {
                spin_unlock(&dentry->d_lock);
                return;
        }
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);

        /* Negative dentries can be dropped without further checks */
        if (!dentry->d_inode)
                return;

        shrink_dcache_parent(dentry);

        /* detach mountpoints one by one until the walk finds none */
        for (;;) {
                victim = NULL;
                d_walk(dentry, &victim, find_submount);
                if (!victim)
                        break;

                had_submounts = true;
                detach_mounts(victim);
                dput(victim);
        }

        /* prune once more if any mounts were detached */
        if (had_submounts)
                shrink_dcache_parent(dentry);
}
EXPORT_SYMBOL(d_invalidate);

/**
 * __d_alloc        -        allocate a dcache entry
 * @sb: filesystem it will belong to
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available. On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
 *
 * A %NULL @name gives the dentry the name held in slash_name.  %NULL is
 * also returned when the filesystem's ->d_init() reports an error.  The new
 * dentry starts out with a reference count of 1, no inode, and is its own
 * parent.
 */
 
static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
        struct dentry *dentry;
        char *dname;
        int err;

        dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru,
                                      GFP_KERNEL);
        if (!dentry)
                return NULL;

        /*
         * We guarantee that the inline name is always NUL-terminated.
         * This way the memcpy() done by the name switching in rename
         * will still always have a NUL at the end, even if we might
         * be overwriting an internal NUL character
         */
        dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
        if (unlikely(!name)) {
                /* anonymous dentry: use the default name, stored inline */
                name = &slash_name;
                dname = dentry->d_iname;
        } else if (name->len > DNAME_INLINE_LEN-1) {
                /* too long for the inline buffer: allocate an external name */
                size_t size = offsetof(struct external_name, name[1]);
                struct external_name *p = kmalloc(size + name->len,
                                                  GFP_KERNEL_ACCOUNT |
                                                  __GFP_RECLAIMABLE);
                if (!p) {
                        kmem_cache_free(dentry_cache, dentry); 
                        return NULL;
                }
                atomic_set(&p->u.count, 1);
                dname = p->name;
        } else  {
                dname = dentry->d_iname;
        }        

        dentry->d_name.len = name->len;
        dentry->d_name.hash = name->hash;
        memcpy(dname, name->name, name->len);
        dname[name->len] = 0;

        /* Make sure we always see the terminating NUL character */
        smp_store_release(&dentry->d_name.name, dname); /* ^^^ */

        dentry->d_lockref.count = 1;
        dentry->d_flags = 0;
        spin_lock_init(&dentry->d_lock);
        seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock);
        dentry->d_inode = NULL;
        /* no parent yet: the dentry points to itself */
        dentry->d_parent = dentry;
        dentry->d_sb = sb;
        dentry->d_op = NULL;
        dentry->d_fsdata = NULL;
        INIT_HLIST_BL_NODE(&dentry->d_hash);
        INIT_LIST_HEAD(&dentry->d_lru);
        INIT_HLIST_HEAD(&dentry->d_children);
        INIT_HLIST_NODE(&dentry->d_u.d_alias);
        INIT_HLIST_NODE(&dentry->d_sib);
        /* start out with the superblock's default dentry operations */
        d_set_d_op(dentry, dentry->d_sb->s_d_op);

        if (dentry->d_op && dentry->d_op->d_init) {
                err = dentry->d_op->d_init(dentry);
                if (err) {
                        /* undo both allocations; the error code is not passed on */
                        if (dname_external(dentry))
                                kfree(external_name(dentry));
                        kmem_cache_free(dentry_cache, dentry);
                        return NULL;
                }
        }

        /* per-cpu count of allocated dentries */
        this_cpu_inc(nr_dentry);

        return dentry;
}

/**
 * d_alloc        -        allocate a dcache entry
 * @parent: parent of entry to allocate
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available. On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
 *
 * The new dentry takes a reference on @parent and is added at the head of
 * @parent's list of children.
 */
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
        struct dentry *child = __d_alloc(parent->d_sb, name);

        if (child) {
                spin_lock(&parent->d_lock);
                /*
                 * don't need child lock because it is not subject
                 * to concurrency here
                 */
                child->d_parent = dget_dlock(parent);
                hlist_add_head(&child->d_sib, &parent->d_children);
                spin_unlock(&parent->d_lock);
        }

        return child;
}
EXPORT_SYMBOL(d_alloc);

/*
 * Allocate a dentry on @sb without supplying a name (__d_alloc() is given
 * a NULL qstr).  Returns NULL on allocation failure.
 */
struct dentry *d_alloc_anon(struct super_block *sb)
{
        const struct qstr *no_name = NULL;

        return __d_alloc(sb, no_name);
}
EXPORT_SYMBOL(d_alloc_anon);

struct dentry *d_alloc_cursor(struct dentry * parent)
{
        struct dentry *dentry = d_alloc_anon(parent->d_sb);
        if (dentry) {
                dentry->d_flags |= DCACHE_DENTRY_CURSOR;
                dentry->d_parent = dget(parent);
        }
        return dentry;
}

/**
 * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
 * @sb: the superblock
 * @name: qstr of the name
 *
 * For a filesystem that just pins its dentries in memory and never
 * performs lookups at all, return an unhashed IS_ROOT dentry.
 * This is used for pipes, sockets et.al. - the stuff that should
 * never be anyone's children or parents.  Unlike all other
 * dentries, these will not have RCU delay between dropping the
 * last reference and freeing them.
 *
 * The only user is alloc_file_pseudo() and that's what should
 * be considered a public interface.  Don't use directly.
 */
struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
{
        static const struct dentry_operations anon_ops = {
                .d_dname = simple_dname
        };
        struct dentry *dentry = __d_alloc(sb, name);
        if (likely(dentry)) {
                dentry->d_flags |= DCACHE_NORCU;
                if (!sb->s_d_op)
                        d_set_d_op(dentry, &anon_ops);
        }
        return dentry;
}

struct dentry *d_alloc_name(struct dentry *parent, const char *name)
{
        struct qstr q;

        q.name = name;
        q.hash_len = hashlen_string(parent, name);
        return d_alloc(parent, &q);
}
EXPORT_SYMBOL(d_alloc_name);

/**
 * d_set_d_op - install the dentry operations of a dentry
 * @dentry: dentry to set up; it must not have operations installed yet
 * @op: operations table to install, may be %NULL
 *
 * Stores @op in @dentry->d_op and sets one DCACHE_OP_* bit in
 * @dentry->d_flags for each of the methods below that @op provides.
 *
 * Warns (once) if the dentry already has operations, or if any of the
 * DCACHE_OP_* bits managed here is already set.
 */
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
{
        WARN_ON_ONCE(dentry->d_op);
        /*
         * Every flag set further down must be clear on entry; that
         * includes DCACHE_OP_PRUNE, which is set below just like the rest.
         */
        WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH        |
                                DCACHE_OP_COMPARE        |
                                DCACHE_OP_REVALIDATE        |
                                DCACHE_OP_WEAK_REVALIDATE        |
                                DCACHE_OP_DELETE        |
                                DCACHE_OP_PRUNE        |
                                DCACHE_OP_REAL));
        dentry->d_op = op;
        if (!op)
                return;
        if (op->d_hash)
                dentry->d_flags |= DCACHE_OP_HASH;
        if (op->d_compare)
                dentry->d_flags |= DCACHE_OP_COMPARE;
        if (op->d_revalidate)
                dentry->d_flags |= DCACHE_OP_REVALIDATE;
        if (op->d_weak_revalidate)
                dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;
        if (op->d_delete)
                dentry->d_flags |= DCACHE_OP_DELETE;
        if (op->d_prune)
                dentry->d_flags |= DCACHE_OP_PRUNE;
        if (op->d_real)
                dentry->d_flags |= DCACHE_OP_REAL;
}
EXPORT_SYMBOL(d_set_d_op);

static unsigned d_flags_for_inode(struct inode *inode)
{
        unsigned add_flags = DCACHE_REGULAR_TYPE;

        if (!inode)
                return DCACHE_MISS_TYPE;

        if (S_ISDIR(inode->i_mode)) {
                add_flags = DCACHE_DIRECTORY_TYPE;
                if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
                        if (unlikely(!inode->i_op->lookup))
                                add_flags = DCACHE_AUTODIR_TYPE;
                        else
                                inode->i_opflags |= IOP_LOOKUP;
                }
                goto type_determined;
        }

        if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
                if (unlikely(inode->i_op->get_link)) {
                        add_flags = DCACHE_SYMLINK_TYPE;
                        goto type_determined;
                }
                inode->i_opflags |= IOP_NOFOLLOW;
        }

        if (unlikely(!S_ISREG(inode->i_mode)))
                add_flags = DCACHE_SPECIAL_TYPE;

type_determined:
        if (unlikely(IS_AUTOMOUNT(inode)))
                add_flags |= DCACHE_NEED_AUTOMOUNT;
        return add_flags;
}

/*
 * Attach @inode to @dentry: link the dentry into the inode's alias list
 * and set its inode pointer and type flags.  Both callers in this file
 * (d_instantiate(), d_instantiate_new()) hold inode->i_lock around the
 * call; dentry->d_lock is taken here.
 */
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
        unsigned add_flags = d_flags_for_inode(inode);
        WARN_ON(d_in_lookup(dentry));

        spin_lock(&dentry->d_lock);
        /*
         * Decrement negative dentry count if it was in the LRU list.
         */
        if (dentry->d_flags & DCACHE_LRU_LIST)
                this_cpu_dec(nr_dentry_negative);
        hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
        /* the inode/type update is wrapped in a ->d_seq write section */
        raw_write_seqcount_begin(&dentry->d_seq);
        __d_set_inode_and_type(dentry, inode, add_flags);
        raw_write_seqcount_end(&dentry->d_seq);
        fsnotify_update_flags(dentry);
        spin_unlock(&dentry->d_lock);
}

/**
 * d_instantiate - fill in inode information for a dentry
 * @entry: dentry to complete
 * @inode: inode to attach to this dentry
 *
 * Fill in inode information in the entry.
 *
 * This turns negative dentries into productive full members
 * of society.
 *
 * A %NULL @inode leaves the dentry as it is.  It is a BUG for @entry to
 * be on an alias list already.
 *
 * NOTE! This assumes that the inode count has been incremented
 * (or otherwise set) by the caller to indicate that it is now
 * in use by the dcache.
 */
void d_instantiate(struct dentry *entry, struct inode * inode)
{
        BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));

        if (!inode)
                return;

        security_d_instantiate(entry, inode);

        spin_lock(&inode->i_lock);
        __d_instantiate(entry, inode);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate);

/*
 * This should be equivalent to d_instantiate() + unlock_new_inode(),
 * with lockdep-related part of unlock_new_inode() done before
 * anything else.  Use that instead of open-coding d_instantiate()/
 * unlock_new_inode() combinations.
 *
 * Unlike d_instantiate(), a NULL @inode is a BUG here.
 */
void d_instantiate_new(struct dentry *entry, struct inode *inode)
{
        BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
        BUG_ON(!inode);
        lockdep_annotate_inode_mutex_key(inode);
        security_d_instantiate(entry, inode);
        spin_lock(&inode->i_lock);
        __d_instantiate(entry, inode);
        /* the inode is expected to still be marked new at this point */
        WARN_ON(!(inode->i_state & I_NEW));
        inode->i_state &= ~I_NEW & ~I_CREATING;
        /* full barrier between clearing the bits and waking waiters on __I_NEW */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate_new);

/*
 * Allocate an anonymous dentry for @root_inode and attach the inode to it.
 *
 * Returns the new dentry, or NULL if @root_inode is NULL or the allocation
 * fails; in the latter case the reference on @root_inode is dropped with
 * iput().
 */
struct dentry *d_make_root(struct inode *root_inode)
{
        struct dentry *root;

        if (!root_inode)
                return NULL;

        root = d_alloc_anon(root_inode->i_sb);
        if (!root) {
                iput(root_inode);
                return NULL;
        }

        d_instantiate(root, root_inode);
        return root;
}
EXPORT_SYMBOL(d_make_root);

/*
 * Common helper behind d_obtain_alias() and d_obtain_root(): return a
 * dentry for @inode, reusing an existing alias when there is one and
 * attaching a freshly allocated anonymous dentry otherwise.
 *
 * @disconnected selects how a new dentry is set up: true marks it
 * DCACHE_DISCONNECTED, false links it on sb->s_roots instead.
 *
 * A NULL @inode yields ERR_PTR(-ESTALE), an IS_ERR() @inode is passed
 * through as the returned error, and allocation failure yields
 * ERR_PTR(-ENOMEM).  For a real inode, the caller's reference is either
 * taken over by the new dentry or dropped via the iput() at "out".
 */
static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
{
        struct super_block *sb;
        struct dentry *new, *res;

        if (!inode)
                return ERR_PTR(-ESTALE);
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        sb = inode->i_sb;

        res = d_find_any_alias(inode); /* existing alias? */
        if (res)
                goto out;

        new = d_alloc_anon(sb);
        if (!new) {
                res = ERR_PTR(-ENOMEM);
                goto out;
        }

        security_d_instantiate(new, inode);
        spin_lock(&inode->i_lock);
        res = __d_find_any_alias(inode); /* recheck under lock */
        if (likely(!res)) { /* still no alias, attach a disconnected dentry */
                unsigned add_flags = d_flags_for_inode(inode);

                if (disconnected)
                        add_flags |= DCACHE_DISCONNECTED;

                spin_lock(&new->d_lock);
                __d_set_inode_and_type(new, inode, add_flags);
                hlist_add_head(&new->d_u.d_alias, &inode->i_dentry);
                if (!disconnected) {
                        /* root-style dentries are tracked on sb->s_roots */
                        hlist_bl_lock(&sb->s_roots);
                        hlist_bl_add_head(&new->d_hash, &sb->s_roots);
                        hlist_bl_unlock(&sb->s_roots);
                }
                spin_unlock(&new->d_lock);
                spin_unlock(&inode->i_lock);
                /* the final iput() is then passed NULL */
                inode = NULL; /* consumed by new->d_inode */
                res = new;
        } else {
                /* somebody attached an alias meanwhile: discard ours */
                spin_unlock(&inode->i_lock);
                dput(new);
        }

 out:
        iput(inode);
        return res;
}

/**
 * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
 * similar open by handle operations.  The returned dentry may be anonymous,
 * or may have a full name (if the inode was already in the cache).
 *
 * When called on a directory inode, we must ensure that the inode only ever
 * has one dentry.  If a dentry is found, that is returned instead of
 * allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is released.
 * To make it easier to use in export operations a %NULL or IS_ERR inode may
 * be passed in and the error will be propagated to the return value,
 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_alias(struct inode *inode)
{
        return __d_obtain_alias(inode, true);
}
EXPORT_SYMBOL(d_obtain_alias);

/**
 * d_obtain_root - find or allocate a dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain an IS_ROOT dentry for the root of a filesystem.
 *
 * We must ensure that directory inodes only ever have one dentry.  If a
 * dentry is found, that is returned instead of allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is
 * released.  A %NULL or IS_ERR inode may be passed in and will be the
 * error will be propagate to the return value, with a %NULL @inode
 * replaced by ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_root(struct inode *inode)
{
        /*
         * Shares its implementation with d_obtain_alias(); the two entry
         * points differ only in the boolean handed to __d_obtain_alias()
         * (false here, true there).
         */
        return __d_obtain_alias(inode, false);
}
EXPORT_SYMBOL(d_obtain_root);

/**
 * d_add_ci - lookup or allocate new dentry with case-exact name
 * @inode:  the inode case-insensitive lookup has found
 * @dentry: the negative dentry that was passed to the parent's lookup func
 * @name:   the case-exact name to be associated with the returned dentry
 *
 * This is to avoid filling the dcache with case-insensitive names to the
 * same inode, only the actual correct case is stored in the dcache for
 * case-insensitive filesystems.
 *
 * For a case-insensitive lookup match and if the case-exact dentry
 * already exists in the dcache, use it and return it.
 *
 * If no entry exists with the exact case name, allocate new dentry with
 * the exact case, and return the spliced entry.
 */
struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
                        struct qstr *name)
{
        struct dentry *exact, *spliced;

        /*
         * A dentry carrying the case-exact name may already be cached
         * (or the name may be rejected, giving an ERR_PTR).  Either way
         * the caller's inode reference is not needed any more.
         */
        exact = d_hash_and_lookup(dentry->d_parent, name);
        if (exact)
                goto drop_inode;

        /* Nothing cached: allocate a dentry for the case-exact name. */
        if (!d_in_lookup(dentry)) {
                exact = d_alloc(dentry->d_parent, name);
                if (!exact) {
                        exact = ERR_PTR(-ENOMEM);
                        goto drop_inode;
                }
        } else {
                exact = d_alloc_parallel(dentry->d_parent, name,
                                         dentry->d_wait);
                /*
                 * An error, or a dentry that is no longer in-lookup
                 * (i.e. not the one freshly set up for us), is handed
                 * straight back to the caller.
                 */
                if (IS_ERR(exact) || !d_in_lookup(exact))
                        goto drop_inode;
        }

        /* The inode reference is passed on to d_splice_alias() here. */
        spliced = d_splice_alias(inode, exact);
        if (!spliced)
                return exact;

        /* A different dentry was spliced in; ours is not needed. */
        d_lookup_done(exact);
        dput(exact);
        return spliced;

drop_inode:
        iput(inode);
        return exact;
}
EXPORT_SYMBOL(d_add_ci);

/**
 * d_same_name - compare dentry name with case-exact name
 * @dentry: the dentry whose name is compared
 * @parent: parent dentry; its ->d_compare() is used when it has one
 * @name:   the name to compare against
 *
 * Return: true if names are same, or false
 */
bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
                 const struct qstr *name)
{
        /*
         * Common case: the parent has no comparison method of its own,
         * so the names match iff lengths agree and the bytes are equal.
         */
        if (likely(!(parent->d_flags & DCACHE_OP_COMPARE)))
                return dentry->d_name.len == name->len &&
                       !dentry_cmp(dentry, name->name, name->len);

        /* Otherwise ->d_compare() decides; it returns 0 on a match. */
        return !parent->d_op->d_compare(dentry, dentry->d_name.len,
                                        dentry->d_name.name, name);
}
EXPORT_SYMBOL_GPL(d_same_name);

/*
 * This is __d_lookup_rcu() when the parent dentry has
 * DCACHE_OP_COMPARE, which makes things much nastier.
 */
static noinline struct dentry *__d_lookup_rcu_op_compare(
        const struct dentry *parent,
        const struct qstr *name,
        unsigned *seqp)
{
        u64 hashlen = name->hash_len;
        struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
        struct hlist_bl_node *node;
        struct dentry *dentry;

        /*
         * Walk the hash chain picked by the hash part of name->hash_len.
         * On a match the sampled d_seq value is stored in *seqp and the
         * dentry is returned; NULL is returned if nothing matches.
         */
        hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
                int tlen;
                const char *tname;
                unsigned seq;

seqretry:
                /* Sample d_seq before reading the fields it covers. */
                seq = raw_seqcount_begin(&dentry->d_seq);
                if (dentry->d_parent != parent)
                        continue;
                if (d_unhashed(dentry))
                        continue;
                /*
                 * Only the hash is compared here, not the length; the
                 * name and length are left to ->d_compare() below.
                 */
                if (dentry->d_name.hash != hashlen_hash(hashlen))
                        continue;
                tlen = dentry->d_name.len;
                tname = dentry->d_name.name;
                /* we want a consistent (name,len) pair */
                if (read_seqcount_retry(&dentry->d_seq, seq)) {
                        cpu_relax();
                        goto seqretry;
                }
                /* ->d_compare() returns 0 when the names match. */
                if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0)
                        continue;
                *seqp = seq;
                return dentry;
        }
        return NULL;
}

/**
 * __d_lookup_rcu - search for a dentry (racy, store-free)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * @seqp: returns d_seq value at the point where the dentry was found
 * Returns: dentry, or NULL
 *
 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
 * resolution (store-free path walking) design described in
 * Documentation/filesystems/path-lookup.txt.
 *
 * This is not to be used outside core vfs.
 *
 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
 * held, and rcu_read_lock held. The returned dentry must not be stored into
 * without taking d_lock and checking d_seq sequence count against the
 * value returned in @seqp here.
 *
 * A refcount may be taken on the found dentry with the d_rcu_to_refcount
 * function.
 *
 * Alternatively, __d_lookup_rcu may be called again to look up the child of
 * the returned dentry, so long as its parent's seqlock is checked after the
 * child is looked up. Thus, an interlocking stepping of sequence lock checks
 * is formed, giving integrity down the path walk.
 *
 * NOTE! The caller *has* to check the resulting dentry against the sequence
 * number we've returned before using any of the resulting dentry state!
 */
struct dentry *__d_lookup_rcu(const struct dentry *parent,
                                const struct qstr *name,
                                unsigned *seqp)
{
        u64 hashlen = name->hash_len;
        const unsigned char *str = name->name;
        struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
        struct hlist_bl_node *node;
        struct dentry *dentry;

        /*
         * Note: There is significant duplication with __d_lookup which is
         * required to prevent single threaded performance regressions
         * especially on architectures where smp_rmb (in seqcounts) are costly.
         * Keep the two functions in sync.
         */

        /* Parents with their own ->d_compare() take the out-of-line path. */
        if (unlikely(parent->d_flags & DCACHE_OP_COMPARE))
                return __d_lookup_rcu_op_compare(parent, name, seqp);

        /*
         * The hash list is protected using RCU.
         *
         * Carefully use d_seq when comparing a candidate dentry, to avoid
         * races with d_move().
         *
         * It is possible that concurrent renames can mess up our list
         * walk here and result in missing our dentry, resulting in the
         * false-negative result. d_lookup() protects against concurrent
         * renames using rename_lock seqlock.
         *
         * See Documentation/filesystems/path-lookup.txt for more details.
         */
        hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
                unsigned seq;

                /*
                 * The dentry sequence count protects us from concurrent
                 * renames, and thus protects parent and name fields.
                 *
                 * The caller must perform a seqcount check in order
                 * to do anything useful with the returned dentry.
                 *
                 * NOTE! We do a "raw" seqcount_begin here. That means that
                 * we don't wait for the sequence count to stabilize if it
                 * is in the middle of a sequence change. If we do the slow
                 * dentry compare, we will do seqretries until it is stable,
                 * and if we end up with a successful lookup, we actually
                 * want to exit RCU lookup anyway.
                 *
                 * Note that raw_seqcount_begin still *does* smp_rmb(), so
                 * we are still guaranteed NUL-termination of ->d_name.name.
                 */
                seq = raw_seqcount_begin(&dentry->d_seq);
                if (dentry->d_parent != parent)
                        continue;
                if (d_unhashed(dentry))
                        continue;
                /* hash and length are checked together via hash_len */
                if (dentry->d_name.hash_len != hashlen)
                        continue;
                if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
                        continue;
                /* hand the sampled sequence value back for validation */
                *seqp = seq;
                return dentry;
        }
        return NULL;
}

/**
 * d_lookup - search for a dentry
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * Returns: dentry, or NULL
 *
 * d_lookup searches the children of the parent dentry for the name in
 * question. If the dentry is found its reference count is incremented and the
 * dentry is returned. The caller must use dput to free the entry when it has
 * finished using it. %NULL is returned if the dentry does not exist.
 */
struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
{
        struct dentry *dentry;
        unsigned seq;

        do {
                seq = read_seqbegin(&rename_lock);
                dentry = __d_lookup(parent, name);
                if (dentry)
                        break;
        } while (read_seqretry(&rename_lock, seq));
        return dentry;
}
EXPORT_SYMBOL(d_lookup);

/**
 * __d_lookup - search for a dentry (racy)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * Returns: dentry, or NULL
 *
 * __d_lookup is like d_lookup, however it may (rarely) return a
 * false-negative result due to unrelated rename activity.
 *
 * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
 * however it must be used carefully, eg. with a following d_lookup in
 * the case of failure.
 *
 * __d_lookup callers must be commented.
 */
struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
{
        unsigned int want = name->hash;
        struct hlist_bl_head *chain = d_hash(want);
        struct hlist_bl_node *pos;
        struct dentry *cand;
        struct dentry *hit = NULL;

        /*
         * Note: much of this is duplicated in __d_lookup_rcu, which is
         * required to prevent single threaded performance regressions
         * especially on architectures where smp_rmb (in seqcounts) are
         * costly.  Keep the two functions in sync.
         */

        /*
         * The hash list is protected using RCU.
         *
         * Each candidate is examined under its d_lock, to avoid races
         * with d_move().
         *
         * Concurrent renames can still disturb the list walk and make us
         * miss our dentry, giving a false-negative result.  d_lookup()
         * protects against that using the rename_lock seqlock.
         *
         * See Documentation/filesystems/path-lookup.txt for more details.
         */
        rcu_read_lock();

        hlist_bl_for_each_entry_rcu(cand, pos, chain, d_hash) {
                /* cheap unlocked filter before taking the lock */
                if (cand->d_name.hash != want)
                        continue;

                spin_lock(&cand->d_lock);
                if (cand->d_parent == parent && !d_unhashed(cand) &&
                    d_same_name(cand, parent, name)) {
                        /* take the reference while d_lock is held */
                        cand->d_lockref.count++;
                        hit = cand;
                }
                spin_unlock(&cand->d_lock);

                if (hit)
                        break;
        }

        rcu_read_unlock();

        return hit;
}

/**
 * d_hash_and_lookup - hash the qstr then search for a dentry
 * @dir: Directory to search in
 * @name: qstr of name we wish to find
 *
 * On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
 */
struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
{
        int err = 0;

        /*
         * The standard hash has to be computed first: a filesystem's
         * ->d_hash() is allowed to leave the hash value untouched.
         */
        name->hash = full_name_hash(dir, name->name, name->len);

        /* Give a filesystem-specific hash function its say, if any. */
        if (dir->d_flags & DCACHE_OP_HASH)
                err = dir->d_op->d_hash(dir, name);
        if (unlikely(err < 0))
                return ERR_PTR(err);

        return d_lookup(dir, name);
}
EXPORT_SYMBOL(d_hash_and_lookup);

/*
 * When a file is deleted, we have two options:
 * - turn this dentry into a negative dentry
 * - unhash this dentry and free it.
 *
 * Usually, we want to just turn this into
 * a negative dentry, but if anybody else is
 * currently using the dentry or the inode
 * we can't do that and we fall back on removing
 * it from the hash queues and waiting for
 * it to be deleted later when it has no users
 */
 
/**
 * d_delete - delete a dentry
 * @dentry: The dentry to delete
 *
 * Turn the dentry into a negative dentry if possible, otherwise
 * remove it from the hash queues so it can be deleted later
 */
 
void d_delete(struct dentry * dentry)
{
        struct inode *inode = dentry->d_inode;

        /* Lock order: the inode's i_lock first, then the dentry. */
        spin_lock(&inode->i_lock);
        spin_lock(&dentry->d_lock);

        if (dentry->d_lockref.count != 1) {
                /*
                 * Somebody else holds a reference as well: just take
                 * the dentry off the hash and let go of both locks.
                 */
                __d_drop(dentry);
                spin_unlock(&dentry->d_lock);
                spin_unlock(&inode->i_lock);
                return;
        }

        /*
         * We are the only user, so the dentry can go negative.
         * NOTE(review): nothing is unlocked on this path here, so
         * dentry_unlink_inode() presumably releases both locks - confirm.
         */
        dentry->d_flags &= ~DCACHE_CANT_MOUNT;
        dentry_unlink_inode(dentry);
}
EXPORT_SYMBOL(d_delete);

/*
 * Insert @entry at the head of the dcache hash chain selected by its
 * current name hash.  The insertion uses the RCU list primitive and is
 * done with the chain's own lock held.
 */
static void __d_rehash(struct dentry *entry)
{
        struct hlist_bl_head *b = d_hash(entry->d_name.hash);

        hlist_bl_lock(b);
        hlist_bl_add_head_rcu(&entry->d_hash, b);
        hlist_bl_unlock(b);
}

/**
 * d_rehash        - add an entry back to the hash
 * @entry: dentry to add to the hash
 *
 * Adds a dentry to the hash according to its name.
 */
 
void d_rehash(struct dentry * entry)
{
        /* hold the dentry's own lock across the hash insertion */
        spin_lock(&entry->d_lock);
        __d_rehash(entry);
        spin_unlock(&entry->d_lock);
}
EXPORT_SYMBOL(d_rehash);

/*
 * Open a modification window on @dir: spin until ->i_dir_seq is even and
 * we manage to bump it to the next (odd) value with cmpxchg().  Returns
 * the even value that was replaced, for handing to end_dir_add().
 *
 * preempt_disable_nested() is called on entry and not undone here; the
 * matching preempt_enable_nested() is in end_dir_add().
 */
static inline unsigned start_dir_add(struct inode *dir)
{
        preempt_disable_nested();
        for (;;) {
                unsigned n = dir->i_dir_seq;
                /* odd means another add is in flight; keep spinning */
                if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
                        return n;
                cpu_relax();
        }
}

/*
 * Close the window opened by start_dir_add(): @n is the value it returned.
 * Publishes n + 2 (even again) as the new ->i_dir_seq with release
 * semantics, re-enables preemption and wakes everybody sleeping on @d_wait.
 */
static inline void end_dir_add(struct inode *dir, unsigned int n,
                               wait_queue_head_t *d_wait)
{
        smp_store_release(&dir->i_dir_seq, n + 2);
        preempt_enable_nested();
        wake_up_all(d_wait);
}

/*
 * Sleep until @dentry is no longer in-lookup.
 *
 * Entered with dentry->d_lock held; the lock is dropped around each
 * schedule() and re-taken before d_in_lookup() is tested again, so it is
 * held on return.  Does nothing if the dentry is not in-lookup on entry.
 */
static void d_wait_lookup(struct dentry *dentry)
{
        if (d_in_lookup(dentry)) {
                DECLARE_WAITQUEUE(wait, current);
                add_wait_queue(dentry->d_wait, &wait);
                do {
                        /*
                         * Set the task state before dropping d_lock, so
                         * the state is already set when a waker can run.
                         */
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        spin_unlock(&dentry->d_lock);
                        schedule();
                        spin_lock(&dentry->d_lock);
                } while (d_in_lookup(dentry));
        }
}

/*
 * d_alloc_parallel - get a dentry for a lookup of @name in @parent
 *
 * A fresh dentry is allocated up front.  After that:
 *  - if the dcache already holds a match (__d_lookup_rcu()), a reference
 *    is taken on it, the fresh dentry is dropped and the match returned;
 *  - else if a dentry with the same parent and name sits on the in-lookup
 *    hash chain, we wait until it is no longer in-lookup and return it if
 *    it is still a hashed match; otherwise everything is retried;
 *  - else the fresh dentry is flagged DCACHE_PAR_LOOKUP, @wq is recorded
 *    as its ->d_wait, it is linked into the in-lookup hash and returned.
 *
 * Returns ERR_PTR(-ENOMEM) if the allocation fails.
 */
struct dentry *d_alloc_parallel(struct dentry *parent,
                                const struct qstr *name,
                                wait_queue_head_t *wq)
{
        unsigned int hash = name->hash;
        struct hlist_bl_head *b = in_lookup_hash(parent, hash);
        struct hlist_bl_node *node;
        struct dentry *new = d_alloc(parent, name);
        struct dentry *dentry;
        unsigned seq, r_seq, d_seq;

        if (unlikely(!new))
                return ERR_PTR(-ENOMEM);

retry:
        rcu_read_lock();
        /* sample the directory's add counter and rename_lock up front */
        seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
        r_seq = read_seqbegin(&rename_lock);
        dentry = __d_lookup_rcu(parent, name, &d_seq);
        if (unlikely(dentry)) {
                /* found in the dcache: pin it, then validate against d_seq */
                if (!lockref_get_not_dead(&dentry->d_lockref)) {
                        rcu_read_unlock();
                        goto retry;
                }
                if (read_seqcount_retry(&dentry->d_seq, d_seq)) {
                        rcu_read_unlock();
                        dput(dentry);
                        goto retry;
                }
                rcu_read_unlock();
                dput(new);
                return dentry;
        }
        /* a miss is only meaningful if no rename ran concurrently */
        if (unlikely(read_seqretry(&rename_lock, r_seq))) {
                rcu_read_unlock();
                goto retry;
        }

        /* odd counter: start_dir_add() was in progress when we sampled */
        if (unlikely(seq & 1)) {
                rcu_read_unlock();
                goto retry;
        }

        hlist_bl_lock(b);
        /* counter moved since the sample: the dcache miss may be stale */
        if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
                hlist_bl_unlock(b);
                rcu_read_unlock();
                goto retry;
        }
        /*
         * No changes for the parent since the beginning of d_lookup().
         * Since all removals from the chain happen with hlist_bl_lock(),
         * any potential in-lookup matches are going to stay here until
         * we unlock the chain.  All fields are stable in everything
         * we encounter.
         */
        hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) {
                if (dentry->d_name.hash != hash)
                        continue;
                if (dentry->d_parent != parent)
                        continue;
                if (!d_same_name(dentry, parent, name))
                        continue;
                hlist_bl_unlock(b);
                /* now we can try to grab a reference */
                if (!lockref_get_not_dead(&dentry->d_lockref)) {
                        rcu_read_unlock();
                        goto retry;
                }

                rcu_read_unlock();
                /*
                 * somebody is likely to be still doing lookup for it;
                 * wait for them to finish
                 */
                spin_lock(&dentry->d_lock);
                d_wait_lookup(dentry);
                /*
                 * it's not in-lookup anymore; in principle we should repeat
                 * everything from dcache lookup, but it's likely to be what
                 * d_lookup() would've found anyway.  If it is, just return it;
                 * otherwise we really have to repeat the whole thing.
                 */
                if (unlikely(dentry->d_name.hash != hash))
                        goto mismatch;
                if (unlikely(dentry->d_parent != parent))
                        goto mismatch;
                if (unlikely(d_unhashed(dentry)))
                        goto mismatch;
                if (unlikely(!d_same_name(dentry, parent, name)))
                        goto mismatch;
                /* OK, it *is* a hashed match; return it */
                spin_unlock(&dentry->d_lock);
                dput(new);
                return dentry;
        }
        rcu_read_unlock();
        /* we can't take ->d_lock here; it's OK, though. */
        new->d_flags |= DCACHE_PAR_LOOKUP;
        new->d_wait = wq;
        /* publish the new dentry on the in-lookup chain, still locked */
        hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b);
        hlist_bl_unlock(b);
        return new;
mismatch:
        /* the dentry we waited for is not what we want; drop it and redo */
        spin_unlock(&dentry->d_lock);
        dput(dentry);
        goto retry;
}
EXPORT_SYMBOL(d_alloc_parallel);

/*
 * - Unhash the dentry
 * - Retrieve and clear the waitqueue head in dentry
 * - Return the waitqueue head
 */
static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry)
{
        wait_queue_head_t *d_wait;
        struct hlist_bl_head *b;

        /* caller must hold dentry->d_lock */
        lockdep_assert_held(&dentry->d_lock);

        b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash);
        /* flag, chain membership and ->d_wait all change under the chain lock */
        hlist_bl_lock(b);
        dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
        __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
        d_wait = dentry->d_wait;
        dentry->d_wait = NULL;
        hlist_bl_unlock(b);
        /*
         * NOTE(review): d_u.d_alias presumably shares storage with
         * d_u.d_in_lookup_hash (hence the re-initialisation once the
         * dentry is off the in-lookup chain) - confirm against the
         * struct dentry definition.
         */
        INIT_HLIST_NODE(&dentry->d_u.d_alias);
        INIT_LIST_HEAD(&dentry->d_lru);
        return d_wait;
}

void __d_lookup_unhash_wake(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        wake_up_all(__d_lookup_unhash(dentry));
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(__d_lookup_unhash_wake);

/* inode->i_lock held if inode is non-NULL */

/*
 * Hash @dentry, attaching @inode to it first when @inode is non-NULL.
 * If the dentry is still in-lookup it is taken off the in-lookup hash,
 * with the change bracketed by start_dir_add()/end_dir_add() on the
 * parent directory.  Drops inode->i_lock (when @inode is non-NULL)
 * before returning.
 */
static inline void __d_add(struct dentry *dentry, struct inode *inode)
{
        wait_queue_head_t *d_wait;
        struct inode *dir = NULL;
        unsigned n;
        spin_lock(&dentry->d_lock);
        if (unlikely(d_in_lookup(dentry))) {
                /* n and d_wait are only set - and only used - when dir is */
                dir = dentry->d_parent->d_inode;
                n = start_dir_add(dir);
                d_wait = __d_lookup_unhash(dentry);
        }
        if (inode) {
                unsigned add_flags = d_flags_for_inode(inode);
                hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
                /* inode and type change inside a d_seq write section */
                raw_write_seqcount_begin(&dentry->d_seq);
                __d_set_inode_and_type(dentry, inode, add_flags);
                raw_write_seqcount_end(&dentry->d_seq);
                fsnotify_update_flags(dentry);
        }
        __d_rehash(dentry);
        /* close the directory window only after the dentry is hashed */
        if (dir)
                end_dir_add(dir, n, d_wait);
        spin_unlock(&dentry->d_lock);
        if (inode)
                spin_unlock(&inode->i_lock);
}

/**
 * d_add - add dentry to hash queues
 * @entry: dentry to add
 * @inode: The inode to attach to this dentry
 *
 * This adds the entry to the hash queues and initializes @inode.
 * The entry was actually filled in earlier during d_alloc().
 */

void d_add(struct dentry *entry, struct inode *inode)
{
        /* Negative dentry: nothing to instantiate, nothing to lock. */
        if (!inode) {
                __d_add(entry, inode);
                return;
        }

        security_d_instantiate(entry, inode);
        /* __d_add() expects i_lock held and releases it itself. */
        spin_lock(&inode->i_lock);
        __d_add(entry, inode);
}
EXPORT_SYMBOL(d_add);

/**
 * d_exact_alias - find and hash an exact unhashed alias
 * @entry: dentry to add
 * @inode: The inode to go with this dentry
 *
 * If an unhashed dentry with the same name/parent and desired
 * inode already exists, hash and return it.  Otherwise, return
 * NULL.
 *
 * Parent directory should be locked.
 */
struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
{
        unsigned int hash = entry->d_name.hash;
        struct dentry *alias;
        struct dentry *res = NULL;

        spin_lock(&inode->i_lock);
        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                /*
                 * alias->d_lock is not needed for these checks: aliases
                 * with d_parent == entry->d_parent are not subject to
                 * name or parent changes, because the parent inode
                 * i_mutex is held.
                 */
                if (alias->d_name.hash != hash ||
                    alias->d_parent != entry->d_parent ||
                    !d_same_name(alias, entry->d_parent, &entry->d_name))
                        continue;

                /*
                 * First exact match settles it: an unhashed one gets a
                 * reference and is hashed again, a hashed one means the
                 * answer is NULL.
                 */
                spin_lock(&alias->d_lock);
                if (d_unhashed(alias)) {
                        dget_dlock(alias);
                        __d_rehash(alias);
                        res = alias;
                }
                spin_unlock(&alias->d_lock);
                break;
        }
        spin_unlock(&inode->i_lock);

        return res;
}
EXPORT_SYMBOL(d_exact_alias);

/*
 * Exchange the names of @dentry and @target.  Each side may keep its name
 * either inline (in d_iname) or in external storage, so all four
 * combinations are handled; hash_len is swapped at the end in every case.
 */
static void swap_names(struct dentry *dentry, struct dentry *target)
{
        if (unlikely(dname_external(target))) {
                if (unlikely(dname_external(dentry))) {
                        /*
                         * Both external: swap the pointers
                         */
                        swap(target->d_name.name, dentry->d_name.name);
                } else {
                        /*
                         * dentry:internal, target:external.  Steal target's
                         * storage and make target internal.
                         */
                        /* len + 1 bytes: d_name.len plus one trailing byte */
                        memcpy(target->d_iname, dentry->d_name.name,
                                        dentry->d_name.len + 1);
                        dentry->d_name.name = target->d_name.name;
                        target->d_name.name = target->d_iname;
                }
        } else {
                if (unlikely(dname_external(dentry))) {
                        /*
                         * dentry:external, target:internal.  Give dentry's
                         * storage to target and make dentry internal
                         */
                        memcpy(dentry->d_iname, target->d_name.name,
                                        target->d_name.len + 1);
                        target->d_name.name = dentry->d_name.name;
                        dentry->d_name.name = dentry->d_iname;
                } else {
                        /*
                         * Both are internal.
                         */
                        unsigned int i;
                        /* the inline buffers are swapped one long at a time */
                        BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
                        for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
                                swap(((long *) &dentry->d_iname)[i],
                                     ((long *) &target->d_iname)[i]);
                        }
                }
        }
        swap(dentry->d_name.hash_len, target->d_name.hash_len);
}

/*
 * Give @dentry the same name as @target, leaving @target untouched.
 * An external name is shared by taking another reference on it; an inline
 * name is copied into @dentry's own d_iname.  The reference @dentry held
 * on its previous external name (if any) is dropped afterwards.
 */
static void copy_name(struct dentry *dentry, struct dentry *target)
{
        struct external_name *old_name = NULL;
        /* remember the old external name before d_name is overwritten */
        if (unlikely(dname_external(dentry)))
                old_name = external_name(dentry);
        if (unlikely(dname_external(target))) {
                atomic_inc(&external_name(target)->u.count);
                dentry->d_name = target->d_name;
        } else {
                memcpy(dentry->d_iname, target->d_name.name,
                                target->d_name.len + 1);
                dentry->d_name.name = dentry->d_iname;
                dentry->d_name.hash_len = target->d_name.hash_len;
        }
        /* last reference gone: free the old name via kfree_rcu() */
        if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
                kfree_rcu(old_name, u.head);
}

/*
 * __d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 * @exchange: exchange the two dentries
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way. Caller must hold
 * rename_lock, the i_mutex of the source and target directories,
 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
 */
static void __d_move(struct dentry *dentry, struct dentry *target,
                     bool exchange)
{
        struct dentry *old_parent, *p;
        wait_queue_head_t *d_wait;
        struct inode *dir = NULL;
        unsigned n;

        /* negative dentries are not supposed to be moved */
        WARN_ON(!dentry->d_inode);
        if (WARN_ON(dentry == target))
                return;

        BUG_ON(d_ancestor(target, dentry));
        old_parent = dentry->d_parent;
        p = d_ancestor(old_parent, target);
        /*
         * Lock the parents.  Which ones, and in which order, depends on
         * whether dentry is a root (it is then its own parent, so only
         * the target's parent is taken) and on whether target lives
         * below dentry's old parent.
         */
        if (IS_ROOT(dentry)) {
                BUG_ON(p);
                spin_lock(&target->d_parent->d_lock);
        } else if (!p) {
                /* target is not a descendent of dentry->d_parent */
                spin_lock(&target->d_parent->d_lock);
                spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED);
        } else {
                BUG_ON(p == dentry);
                spin_lock(&old_parent->d_lock);
                /* p == target means both share old_parent: one lock only */
                if (p != target)
                        spin_lock_nested(&target->d_parent->d_lock,
                                        DENTRY_D_LOCK_NESTED);
        }
        spin_lock_nested(&dentry->d_lock, 2);
        spin_lock_nested(&target->d_lock, 3);

        /*
         * A target that is still in-lookup is taken off the in-lookup
         * hash; dir/n/d_wait are kept for the end_dir_add() below.
         */
        if (unlikely(d_in_lookup(target))) {
                dir = target->d_parent->d_inode;
                n = start_dir_add(dir);
                d_wait = __d_lookup_unhash(target);
        }

        write_seqcount_begin(&dentry->d_seq);
        write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);

        /* unhash both */
        if (!d_unhashed(dentry))
                ___d_drop(dentry);
        if (!d_unhashed(target))
                ___d_drop(target);

        /* ... and switch them in the tree */
        dentry->d_parent = target->d_parent;
        if (!exchange) {
                copy_name(dentry, target);
                /*
                 * NOTE(review): presumably this makes target look
                 * unhashed to d_unhashed() from here on - confirm.
                 */
                target->d_hash.pprev = NULL;
                /* the new parent gains a reference, the old one loses it */
                dentry->d_parent->d_lockref.count++;
                if (dentry != old_parent) /* wasn't IS_ROOT */
                        WARN_ON(!--old_parent->d_lockref.count);
        } else {
                target->d_parent = old_parent;
                swap_names(dentry, target);
                /* move target onto its new parent's list of children */
                if (!hlist_unhashed(&target->d_sib))
                        __hlist_del(&target->d_sib);
                hlist_add_head(&target->d_sib, &target->d_parent->d_children);
                __d_rehash(target);
                fsnotify_update_flags(target);
        }
        /* move dentry onto its new parent's list of children and rehash */
        if (!hlist_unhashed(&dentry->d_sib))
                __hlist_del(&dentry->d_sib);
        hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children);
        __d_rehash(dentry);
        fsnotify_update_flags(dentry);
        fscrypt_handle_d_move(dentry);

        write_seqcount_end(&target->d_seq);
        write_seqcount_end(&dentry->d_seq);

        if (dir)
                end_dir_add(dir, n, d_wait);

        /*
         * Undo exactly the parent locks taken above: dentry->d_parent is
         * by now the target's original parent, and old_parent was only
         * locked if dentry was not a root.
         */
        if (dentry->d_parent != old_parent)
                spin_unlock(&dentry->d_parent->d_lock);
        if (dentry != old_parent)
                spin_unlock(&old_parent->d_lock);
        spin_unlock(&target->d_lock);
        spin_unlock(&dentry->d_lock);
}

/*
 * d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way. See the locking
 * requirements for __d_move.
 */
void d_move(struct dentry *dentry, struct dentry *target)
{
        /*
         * Writing rename_lock around the move is what makes a concurrent
         * d_lookup() retry after a miss.
         */
        write_seqlock(&rename_lock);
        __d_move(dentry, target, false);
        write_sequnlock(&rename_lock);
}
EXPORT_SYMBOL(d_move);

/*
 * d_exchange - exchange two dentries
 * @dentry1: first dentry
 * @dentry2: second dentry
 */
void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
{
        /* same rename_lock write section as d_move() */
        write_seqlock(&rename_lock);

        /* both dentries are expected to be positive and not roots */
        WARN_ON(!dentry1->d_inode);
        WARN_ON(!dentry2->d_inode);
        WARN_ON(IS_ROOT(dentry1));
        WARN_ON(IS_ROOT(dentry2));

        /* exchange == true: the dentries swap parents and names */
        __d_move(dentry1, dentry2, true);

        write_sequnlock(&rename_lock);
}

/**
 * d_ancestor - search for an ancestor
 * @p1: ancestor dentry
 * @p2: child dentry
 *
 * Returns the ancestor dentry of p2 which is a child of p1, if p1 is
 * an ancestor of p2, else NULL.
 */
struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
{
        struct dentry *cur = p2;

        /* Climb from p2 towards the root, one parent at a time. */
        while (!IS_ROOT(cur)) {
                /* cur sits directly below p1: this is the one we want */
                if (cur->d_parent == p1)
                        return cur;
                cur = cur->d_parent;
        }

        /* Reached a root without meeting p1 on the way. */
        return NULL;
}

/*
 * This helper attempts to cope with remotely renamed directories
 *
 * It assumes that the caller is already holding
 * dentry->d_parent->d_inode->i_mutex, and rename_lock
 *
 * Note: If ever the locking in lock_rename() changes, then please
 * remember to update this too...
 */
static int __d_unalias(struct dentry *dentry, struct dentry *alias)
{
        /* Locks taken so far; NULL means "not held", so out_err drops only what we own. */
        struct mutex *m1 = NULL;
        struct rw_semaphore *m2 = NULL;
        /* Returned if either trylock below fails. */
        int ret = -ESTALE;

        /* If alias and dentry share a parent, then no extra locks required */
        if (alias->d_parent == dentry->d_parent)
                goto out_unalias;

        /* See lock_rename() */
        /* Only trylocks are used here: on contention we give up with -ESTALE. */
        if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
                goto out_err;
        m1 = &dentry->d_sb->s_vfs_rename_mutex;
        if (!inode_trylock_shared(alias->d_parent->d_inode))
                goto out_err;
        m2 = &alias->d_parent->d_inode->i_rwsem;
out_unalias:
        /* Move alias into the place of dentry (plain move, not an exchange). */
        __d_move(alias, dentry, false);
        ret = 0;
out_err:
        /* Release in reverse order of acquisition. */
        if (m2)
                up_read(m2);
        if (m1)
                mutex_unlock(m1);
        return ret;
}

/**
 * d_splice_alias - splice a disconnected dentry into the tree if one exists
 * @inode:  the inode which may have a disconnected dentry
 * @dentry: a negative dentry which we want to point to the inode.
 *
 * If inode is a directory and has an IS_ROOT alias, then d_move that in
 * place of the given dentry and return it, else simply d_add the inode
 * to the dentry and return NULL.
 *
 * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
 * we should error out: directories can't have multiple aliases.
 *
 * This is needed in the lookup routine of any filesystem that is exportable
 * (via knfsd) so that we can build dcache paths to directories effectively.
 *
 * If a dentry was found and moved, then it is returned.  Otherwise NULL
 * is returned.  This matches the expected return value of ->lookup.
 *
 * Cluster filesystems may call this function with a negative, hashed dentry.
 * In that case, we know that the inode will be a regular file, and also this
 * will only occur during atomic_open. So we need to check for the dentry
 * being already hashed only in the final case.
 */
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
        /* An error-encoded inode is passed straight back as an error dentry. */
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        BUG_ON(!d_unhashed(dentry));

        /* NULL inode: nothing to splice, go straight to __d_add() below. */
        if (!inode)
                goto out;

        security_d_instantiate(dentry, inode);
        spin_lock(&inode->i_lock);
        /* Only directories get the alias treatment; other inodes fall through to "out". */
        if (S_ISDIR(inode->i_mode)) {
                struct dentry *new = __d_find_any_alias(inode);
                if (unlikely(new)) {
                        /* The reference to new ensures it remains an alias */
                        spin_unlock(&inode->i_lock);
                        write_seqlock(&rename_lock);
                        if (unlikely(d_ancestor(new, dentry))) {
                                /*
                                 * The existing alias is an ancestor of
                                 * dentry: refuse with -ELOOP and drop our
                                 * reference to the alias.
                                 */
                                write_sequnlock(&rename_lock);
                                dput(new);
                                new = ERR_PTR(-ELOOP);
                                pr_warn_ratelimited(
                                        "VFS: Lookup of '%s' in %s %s"
                                        " would have caused loop\n",
                                        dentry->d_name.name,
                                        inode->i_sb->s_type->name,
                                        inode->i_sb->s_id);
                        } else if (!IS_ROOT(new)) {
                                /*
                                 * The alias is attached somewhere else:
                                 * pin its current parent across the move
                                 * and try to relocate it.
                                 */
                                struct dentry *old_parent = dget(new->d_parent);
                                int err = __d_unalias(dentry, new);
                                write_sequnlock(&rename_lock);
                                if (err) {
                                        dput(new);
                                        new = ERR_PTR(err);
                                }
                                dput(old_parent);
                        } else {
                                /* IS_ROOT alias: move it into place of dentry. */
                                __d_move(new, dentry, false);
                                write_sequnlock(&rename_lock);
                        }
                        /* Every path through this branch drops one inode reference. */
                        iput(inode);
                        return new;
                }
        }
out:
        /* No alias was spliced: attach inode (possibly NULL) to dentry itself. */
        __d_add(dentry, inode);
        return NULL;
}
EXPORT_SYMBOL(d_splice_alias);

/*
 * Test whether new_dentry is a subdirectory of old_dentry.
 *
 * Trivially implemented using the dcache structure
 */

/**
 * is_subdir - is new dentry a subdirectory of old_dentry
 * @new_dentry: new dentry
 * @old_dentry: old dentry
 *
 * Returns true if new_dentry is old_dentry itself or a subdirectory of
 * old_dentry (at any depth).
 * Returns false otherwise.
 * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
 */
  
bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
        bool result;
        unsigned seq;

        if (new_dentry == old_dentry)
                return true;

        do {
                /* for restarting inner loop in case of seq retry */
                seq = read_seqbegin(&rename_lock);
                /*
                 * Need rcu_readlock to protect against the d_parent trashing
                 * due to d_move
                 */
                rcu_read_lock();
                if (d_ancestor(old_dentry, new_dentry))
                        result = true;
                else
                        result = false;
                rcu_read_unlock();
        } while (read_seqretry(&rename_lock, seq));

        return result;
}
EXPORT_SYMBOL(is_subdir);

/*
 * d_walk() callback used by d_genocide().  @data is the root of the walk,
 * which is itself left untouched.  Unhashed or negative dentries are
 * skipped; every other dentry is flagged DCACHE_GENOCIDE and has its
 * lockref count decremented, once only.
 */
static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
{
        struct dentry *walk_root = data;

        if (dentry == walk_root)
                return D_WALK_CONTINUE;

        if (d_unhashed(dentry) || !dentry->d_inode)
                return D_WALK_SKIP;

        /* Already flagged: the count was dropped on an earlier visit. */
        if (dentry->d_flags & DCACHE_GENOCIDE)
                return D_WALK_CONTINUE;

        dentry->d_flags |= DCACHE_GENOCIDE;
        dentry->d_lockref.count--;
        return D_WALK_CONTINUE;
}

/*
 * Walk the tree starting at @parent with d_genocide_kill() as the callback.
 * @parent is also passed as the callback's data, which makes the callback
 * leave @parent itself untouched.
 */
void d_genocide(struct dentry *parent)
{
        d_walk(parent, parent, d_genocide_kill);
}

/*
 * Rename the dentry behind @file to "#<inode number>" using @inode->i_ino.
 * The new name is written into the dentry's inline name buffer.
 */
void d_mark_tmpfile(struct file *file, struct inode *inode)
{
        struct dentry *dentry = file->f_path.dentry;

        /*
         * Preconditions: the dentry must use its inline name buffer, must
         * not be on an alias list, and must be unlinked.
         */
        BUG_ON(dentry->d_name.name != dentry->d_iname ||
                !hlist_unhashed(&dentry->d_u.d_alias) ||
                !d_unlinked(dentry));
        /* Parent's d_lock first, then the dentry's own (nested class). */
        spin_lock(&dentry->d_parent->d_lock);
        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        /* sprintf() returns the number of characters written, i.e. the name length. */
        dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
                                (unsigned long long)inode->i_ino);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dentry->d_parent->d_lock);
}
EXPORT_SYMBOL(d_mark_tmpfile);

/*
 * Set up the dentry behind @file as a tmpfile for @inode: decrement the
 * inode's link count, give the dentry its "#<inode number>" name via
 * d_mark_tmpfile(), then instantiate the dentry with @inode.
 */
void d_tmpfile(struct file *file, struct inode *inode)
{
        struct dentry *dentry = file->f_path.dentry;

        inode_dec_link_count(inode);
        d_mark_tmpfile(file, inode);
        d_instantiate(dentry, inode);
}
EXPORT_SYMBOL(d_tmpfile);

/*
 * Value of the "dhash_entries=" boot parameter; stays 0 when the parameter
 * is absent.  Passed to alloc_large_system_hash() when the dentry hash
 * table is allocated.
 */
static __initdata unsigned long dhash_entries;
/*
 * Handler for the "dhash_entries=" boot parameter.  Returns 0 when no
 * string was supplied, otherwise stores the parsed number (base
 * auto-detected by simple_strtoul()) and returns 1.
 */
static int __init set_dhash_entries(char *str)
{
        if (!str)
                return 0;
        dhash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("dhash_entries=", set_dhash_entries);

/*
 * Early allocation of the dentry hash table.  Does nothing when hashdist
 * is set; in that case dcache_init() performs the allocation later.
 */
static void __init dcache_init_early(void)
{
        /* If hashes are distributed across NUMA nodes, defer
         * hash allocation until vmalloc space is available.
         */
        if (hashdist)
                return;

        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
                                        sizeof(struct hlist_bl_head),
                                        dhash_entries,
                                        13,
                                        HASH_EARLY | HASH_ZERO,
                                        &d_hash_shift,
                                        NULL,
                                        0,
                                        0);
        /*
         * Replace the value stored by alloc_large_system_hash() with
         * 32 minus that value.
         * NOTE(review): presumably this turns a log2 table size into the
         * right-shift applied to a 32-bit hash - confirm against d_hash().
         */
        d_hash_shift = 32 - d_hash_shift;
}

/*
 * Create the dentry slab cache and, when hashdist is set, allocate the
 * dentry hash table that dcache_init_early() deliberately skipped.
 */
static void __init dcache_init(void)
{
        /*
         * A constructor could be added for stable state like the lists,
         * but it is probably not worth it because of the cache nature
         * of the dcache.
         */
        /* SLAB_PANIC is set, so the result is not checked for NULL here. */
        dentry_cache = KMEM_CACHE_USERCOPY(dentry,
                SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
                d_iname);

        /* Hash may have been set up in dcache_init_early */
        if (!hashdist)
                return;

        /* Same call as in dcache_init_early(), but without HASH_EARLY. */
        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
                                        sizeof(struct hlist_bl_head),
                                        dhash_entries,
                                        13,
                                        HASH_ZERO,
                                        &d_hash_shift,
                                        NULL,
                                        0,
                                        0);
        /* As in dcache_init_early(): keep 32 minus the reported value. */
        d_hash_shift = 32 - d_hash_shift;
}

/*
 * SLAB cache for __getname() consumers.  Created in vfs_caches_init() with
 * PATH_MAX-sized objects and not written again after init (__ro_after_init).
 */
struct kmem_cache *names_cachep __ro_after_init;
EXPORT_SYMBOL(names_cachep);

/*
 * Early VFS cache setup: initialise every bucket of in_lookup_hashtable,
 * then run the early dcache and inode initialisers.
 */
void __init vfs_caches_init_early(void)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
                INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);

        dcache_init_early();
        inode_init_early();
}

/*
 * Main VFS cache setup: create the names_cache slab (PATH_MAX-sized
 * objects, fully whitelisted for usercopy), then initialise the dcache,
 * inode, file, mount, block-device and character-device subsystems in
 * that order.
 */
void __init vfs_caches_init(void)
{
        /* SLAB_PANIC is set, so the result is not checked for NULL here. */
        names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL);

        dcache_init();
        inode_init();
        files_init();
        files_maxfiles_init();
        mnt_init();
        bdev_cache_init();
        chrdev_init();
}
































    1 

































































    1 


















































































    1 

    1 

    1 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
// SPDX-License-Identifier: GPL-2.0
/*
 * mm/fadvise.c
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 11Jan2003        Andrew Morton
 *                Initial version.
 */

#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/fadvise.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <linux/swap.h>

#include <asm/unistd.h>

#include "internal.h"

/*
 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
 * deactivate the pages and clear PG_Referenced.
 */

/*
 * generic_fadvise - default handling of POSIX_FADV_* advice for a file
 * @file:   file the advice applies to
 * @offset: start of the byte range
 * @len:    length of the byte range; 0 means "to the end"
 * @advice: one of the POSIX_FADV_* values
 *
 * Returns 0 on success, -ESPIPE if the file is a FIFO, and -EINVAL if the
 * file has no mapping, @len is negative or @advice is not recognised.
 */
int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
        struct inode *inode;
        struct address_space *mapping;
        struct backing_dev_info *bdi;
        loff_t endbyte;                        /* inclusive */
        pgoff_t start_index;
        pgoff_t end_index;
        unsigned long nrpages;

        inode = file_inode(file);
        if (S_ISFIFO(inode->i_mode))
                return -ESPIPE;

        mapping = file->f_mapping;
        if (!mapping || len < 0)
                return -EINVAL;

        bdi = inode_to_bdi(mapping->host);

        /*
         * DAX inodes and the no-op backing device: only validate the
         * advice value, then return without acting on it.
         */
        if (IS_DAX(inode) || (bdi == &noop_backing_dev_info)) {
                switch (advice) {
                case POSIX_FADV_NORMAL:
                case POSIX_FADV_RANDOM:
                case POSIX_FADV_SEQUENTIAL:
                case POSIX_FADV_WILLNEED:
                case POSIX_FADV_NOREUSE:
                case POSIX_FADV_DONTNEED:
                        /* no bad return value, but ignore advice */
                        break;
                default:
                        return -EINVAL;
                }
                return 0;
        }

        /*
         * Careful about overflows. Len == 0 means "as much as possible".  Use
         * unsigned math because signed overflows are undefined and UBSan
         * complains.
         */
        endbyte = (u64)offset + (u64)len;
        /* A sum smaller than len means the addition wrapped: clamp to the maximum. */
        if (!len || endbyte < len)
                endbyte = LLONG_MAX;
        else
                endbyte--;                /* inclusive */

        switch (advice) {
        case POSIX_FADV_NORMAL:
                /* Restore the device's default readahead and clear both hint flags. */
                file->f_ra.ra_pages = bdi->ra_pages;
                spin_lock(&file->f_lock);
                file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE);
                spin_unlock(&file->f_lock);
                break;
        case POSIX_FADV_RANDOM:
                spin_lock(&file->f_lock);
                file->f_mode |= FMODE_RANDOM;
                spin_unlock(&file->f_lock);
                break;
        case POSIX_FADV_SEQUENTIAL:
                /* Double the device's default readahead and clear the random hint. */
                file->f_ra.ra_pages = bdi->ra_pages * 2;
                spin_lock(&file->f_lock);
                file->f_mode &= ~FMODE_RANDOM;
                spin_unlock(&file->f_lock);
                break;
        case POSIX_FADV_WILLNEED:
                /* First and last PARTIAL page! */
                start_index = offset >> PAGE_SHIFT;
                end_index = endbyte >> PAGE_SHIFT;

                /* Careful about overflow on the "+1" */
                nrpages = end_index - start_index + 1;
                if (!nrpages)
                        nrpages = ~0UL;

                force_page_cache_readahead(mapping, file, start_index, nrpages);
                break;
        case POSIX_FADV_NOREUSE:
                spin_lock(&file->f_lock);
                file->f_mode |= FMODE_NOREUSE;
                spin_unlock(&file->f_lock);
                break;
        case POSIX_FADV_DONTNEED:
                /* Start non-waiting (WB_SYNC_NONE) writeback of the range first. */
                __filemap_fdatawrite_range(mapping, offset, endbyte,
                                           WB_SYNC_NONE);

                /*
                 * First and last FULL page! Partial pages are deliberately
                 * preserved on the expectation that it is better to preserve
                 * needed memory than to discard unneeded memory.
                 */
                start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
                end_index = (endbyte >> PAGE_SHIFT);
                /*
                 * invalidate_mapping_pages() treats end_index as inclusive,
                 * so the page at end_index would be discarded too;
                 * subtracting 1 from end_index makes us skip that last
                 * page.  But if endbyte is the last byte of a page or is at
                 * the end of file, we should not skip that page -
                 * discarding the last page is safe enough.
                 */
                if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK &&
                                endbyte != inode->i_size - 1) {
                        /* First page is tricky as 0 - 1 = -1, but pgoff_t
                         * is unsigned, so the end_index >= start_index
                         * check below would be true and we'll discard the whole
                         * file cache which is not what was asked.
                         */
                        if (end_index == 0)
                                break;

                        end_index--;
                }

                if (end_index >= start_index) {
                        unsigned long nr_failed = 0;

                        /*
                         * It's common to FADV_DONTNEED right after
                         * the read or write that instantiates the
                         * pages, in which case there will be some
                         * sitting on the local LRU cache. Try to
                         * avoid the expensive remote drain and the
                         * second cache tree walk below by flushing
                         * them out right away.
                         */
                        lru_add_drain();

                        mapping_try_invalidate(mapping, start_index, end_index,
                                        &nr_failed);

                        /*
                         * The failures may be due to the folio being
                         * in the LRU cache of a remote CPU. Drain all
                         * caches and try again.
                         */
                        if (nr_failed) {
                                lru_add_drain_all();
                                invalidate_mapping_pages(mapping, start_index,
                                                end_index);
                        }
                }
                break;
        default:
                return -EINVAL;
        }
        return 0;
}
EXPORT_SYMBOL(generic_fadvise);

/*
 * Dispatch fadvise advice for @file: use the file's own ->fadvise
 * operation when it provides one, otherwise generic_fadvise().
 */
int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
        /* No file-specific hook installed: take the generic path. */
        if (!file->f_op->fadvise)
                return generic_fadvise(file, offset, len, advice);

        return file->f_op->fadvise(file, offset, len, advice);
}
EXPORT_SYMBOL(vfs_fadvise);

#ifdef CONFIG_ADVISE_SYSCALLS

/*
 * Look up the file behind descriptor @fd and apply the advice to it with
 * vfs_fadvise().  Returns -EBADF if @fd does not resolve to a file,
 * otherwise whatever vfs_fadvise() returns.
 */
int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
        struct fd f = fdget(fd);
        int ret;

        if (!f.file)
                return -EBADF;

        ret = vfs_fadvise(f.file, offset, len, advice);

        /* Pairs with the fdget() above. */
        fdput(f);
        return ret;
}

/* fadvise64_64 system call: forwards all four arguments to ksys_fadvise64_64(). */
SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
{
        return ksys_fadvise64_64(fd, offset, len, advice);
}

#ifdef __ARCH_WANT_SYS_FADVISE64

/*
 * fadvise64 system call: same as fadvise64_64 except that @len arrives as
 * a size_t and is converted to loff_t by the call below.
 */
SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice)
{
        return ksys_fadvise64_64(fd, offset, len, advice);
}

#endif

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FADVISE64_64)

/*
 * Compat fadvise64_64 entry point: @offset and @len each arrive as a pair
 * of arguments (compat_arg_u64_dual) and are glued back into single values
 * (compat_arg_u64_glue) before calling ksys_fadvise64_64().
 */
COMPAT_SYSCALL_DEFINE6(fadvise64_64, int, fd, compat_arg_u64_dual(offset),
                       compat_arg_u64_dual(len), int, advice)
{
        return ksys_fadvise64_64(fd, compat_arg_u64_glue(offset),
                                 compat_arg_u64_glue(len), advice);
}

#endif
#endif
































































































































































































    1 

















    1 


















    1 






    1 





















































    1 


















































































    2 
    2 






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FAT_H
#define _FAT_H

#include <linux/buffer_head.h>
#include <linux/nls.h>
#include <linux/hash.h>
#include <linux/ratelimit.h>
#include <linux/msdos_fs.h>

/*
 * vfat shortname flags
 */
#define VFAT_SFN_DISPLAY_LOWER        0x0001 /* convert to lowercase for display */
#define VFAT_SFN_DISPLAY_WIN95        0x0002 /* emulate win95 rule for display */
#define VFAT_SFN_DISPLAY_WINNT        0x0004 /* emulate winnt rule for display */
#define VFAT_SFN_CREATE_WIN95        0x0100 /* emulate win95 rule for create */
#define VFAT_SFN_CREATE_WINNT        0x0200 /* emulate winnt rule for create */

#define FAT_ERRORS_CONT                1      /* ignore error and continue */
#define FAT_ERRORS_PANIC        2      /* panic on error */
#define FAT_ERRORS_RO                3      /* remount r/o on error */

#define FAT_NFS_STALE_RW        1      /* NFS RW support, can cause ESTALE */
#define FAT_NFS_NOSTALE_RO        2      /* NFS RO support, no ESTALE issue */

/*
 * Mount options of a FAT file system instance.  An instance of this
 * structure is embedded in struct msdos_sb_info as ->options.
 */
struct fat_mount_options {
        kuid_t fs_uid;
        kgid_t fs_gid;
        unsigned short fs_fmask;   /* mode bits masked off for files (see fat_make_mode()) */
        unsigned short fs_dmask;   /* mode bits masked off for directories (see fat_make_mode()) */
        unsigned short codepage;   /* Codepage for shortname conversions */
        int time_offset;           /* Offset of timestamps from UTC (in minutes) */
        char *iocharset;           /* Charset used for filename input/display */
        unsigned short shortname;  /* flags for shortname display/create rule */
        unsigned char name_check;  /* r = relaxed, n = normal, s = strict */
        unsigned char errors;           /* On error: continue, panic, remount-ro */
        unsigned char nfs;          /* NFS support: nostale_ro, stale_rw */
        unsigned short allow_utime;/* permission for setting the [am]time */
        unsigned quiet:1,          /* set = fake successful chmods and chowns */
                 showexec:1,       /* set = only set x bit for com/exe/bat */
                 sys_immutable:1,  /* set = system files are immutable */
                 dotsOK:1,         /* set = hidden and system files are named '.filename' */
                 isvfat:1,         /* 0=no vfat long filename support, 1=vfat support */
                 utf8:1,           /* Use of UTF-8 character set (Default) */
                 unicode_xlate:1,  /* create escape sequences for unhandled Unicode */
                 numtail:1,        /* Does first alias have a numeric '~1' type tail? */
                 flush:1,           /* write things quickly */
                 nocase:1,           /* Does this need case conversion? 0 = need case conversion */
                 usefree:1,           /* Use free_clusters for FAT32 */
                 tz_set:1,           /* Filesystem timestamps' offset set */
                 rodir:1,           /* allow ATTR_RO for directory */
                 discard:1,           /* Issue discard requests on deletions */
                 dos1xfloppy:1;           /* Assume default BPB for DOS 1.x floppies */
};

#define FAT_HASH_BITS        8
#define FAT_HASH_SIZE        (1UL << FAT_HASH_BITS)

/*
 * MS-DOS file system in-core superblock data
 */
struct msdos_sb_info {
        unsigned short sec_per_clus;  /* sectors/cluster */
        unsigned short cluster_bits;  /* log2(cluster_size) */
        unsigned int cluster_size;    /* cluster size */
        unsigned char fats, fat_bits; /* number of FATs, FAT bits (12,16 or 32) */
        unsigned short fat_start;     /* FAT start (sec.) */
        unsigned long fat_length;     /* FAT length (sec.) */
        unsigned long dir_start;      /* root dir start */
        unsigned short dir_entries;   /* root dir entries */
        unsigned long data_start;     /* first data sector */
        unsigned long max_cluster;    /* maximum cluster number */
        unsigned long root_cluster;   /* first cluster of the root directory */
        unsigned long fsinfo_sector;  /* sector number of FAT32 fsinfo */
        struct mutex fat_lock;
        struct mutex nfs_build_inode_lock;
        struct mutex s_lock;
        unsigned int prev_free;      /* previously allocated cluster number */
        unsigned int free_clusters;  /* -1 if undefined */
        unsigned int free_clus_valid; /* is free_clusters valid? */
        struct fat_mount_options options;
        struct nls_table *nls_disk;   /* Codepage used on disk */
        struct nls_table *nls_io;     /* Charset used for input and display */
        const void *dir_ops;              /* Opaque; default directory operations */
        int dir_per_block;              /* dir entries per block */
        int dir_per_block_bits;              /* log2(dir_per_block) */
        unsigned int vol_id;                /* volume ID */

        int fatent_shift;
        const struct fatent_operations *fatent_ops;
        struct inode *fat_inode;
        struct inode *fsinfo_inode;

        struct ratelimit_state ratelimit;

        /* Hash table of FAT_HASH_SIZE buckets and its lock. */
        spinlock_t inode_hash_lock;
        struct hlist_head inode_hashtable[FAT_HASH_SIZE];

        /* Second hash table of FAT_HASH_SIZE buckets and its lock. */
        spinlock_t dir_hash_lock;
        struct hlist_head dir_hashtable[FAT_HASH_SIZE];

        unsigned int dirty;           /* fs state before mount */
        struct rcu_head rcu;
};

#define FAT_CACHE_VALID        0        /* special case for valid cache */

/*
 * MS-DOS file system inode data in memory
 */
struct msdos_inode_info {
        spinlock_t cache_lru_lock;
        struct list_head cache_lru;
        int nr_caches;
        /* for avoiding the race between fat_free() and fat_get_cluster() */
        unsigned int cache_valid_id;

        /* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */
        loff_t mmu_private;        /* physically allocated size */

        int i_start;                /* first cluster or 0 */
        int i_logstart;                /* logical first cluster */
        int i_attrs;                /* unused attribute bits */
        loff_t i_pos;                /* on-disk position of directory entry or 0 */
        struct hlist_node i_fat_hash;        /* hash by i_location */
        struct hlist_node i_dir_hash;        /* hash by i_logstart */
        struct rw_semaphore truncate_lock; /* protect bmap against truncate */
        struct timespec64 i_crtime;        /* File creation (birth) time */
        /* Embedded VFS inode; MSDOS_I() maps a pointer to it back to this structure. */
        struct inode vfs_inode;
};

/*
 * Describes a directory entry and its slots; filled in through the
 * struct fat_slot_info * arguments of the fat/dir.c functions declared
 * in this header.
 */
struct fat_slot_info {
        loff_t i_pos;                /* on-disk position of directory entry */
        loff_t slot_off;        /* offset for slot or de start */
        int nr_slots;                /* number of slots + 1(de) in filename */
        struct msdos_dir_entry *de;
        struct buffer_head *bh;
};

/* Return the FAT-private superblock data stored in sb->s_fs_info. */
static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb)
{
        return sb->s_fs_info;
}

/*
 * Functions that determine the variant of the FAT file system (i.e.,
 * whether this is FAT12, FAT16 or FAT32).
 */
/* True when the file system uses 12-bit FAT entries. */
static inline bool is_fat12(const struct msdos_sb_info *sbi)
{
        return sbi->fat_bits == 12;
}

/* True when the file system uses 16-bit FAT entries. */
static inline bool is_fat16(const struct msdos_sb_info *sbi)
{
        return sbi->fat_bits == 16;
}

/* True when the file system uses 32-bit FAT entries. */
static inline bool is_fat32(const struct msdos_sb_info *sbi)
{
        return sbi->fat_bits == 32;
}

/* Maximum number of clusters */
static inline u32 max_fat(struct super_block *sb)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);

        /* Pick the limit matching the FAT variant; FAT12 is the fallback. */
        if (is_fat32(sbi))
                return MAX_FAT32;
        if (is_fat16(sbi))
                return MAX_FAT16;
        return MAX_FAT12;
}

/*
 * Map a VFS inode back to the msdos_inode_info that embeds it as
 * ->vfs_inode.
 */
static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
{
        return container_of(inode, struct msdos_inode_info, vfs_inode);
}

/*
 * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to
 * save ATTR_RO instead of ->i_mode.
 *
 * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only
 * bit, it's just used as flag for app.
 */
static inline int fat_mode_can_hold_ro(struct inode *inode)
{
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        umode_t allowed;

        if (!S_ISDIR(inode->i_mode))
                allowed = ~sbi->options.fs_fmask;
        else if (sbi->options.rodir)
                allowed = ~sbi->options.fs_dmask;
        else
                return 0;        /* directory without the rodir option */

        /* The mode can carry ATTR_RO only if some write bit survives the mask. */
        return (allowed & S_IWUGO) ? 1 : 0;
}

/* Convert attribute bits and a mask to the UNIX mode. */
static inline umode_t fat_make_mode(struct msdos_sb_info *sbi,
                                   u8 attrs, umode_t mode)
{
        const bool is_dir = attrs & ATTR_DIR;

        /*
         * ATTR_RO strips the write bits, except on a directory when the
         * rodir option is off.
         */
        if ((attrs & ATTR_RO) && (!is_dir || sbi->options.rodir))
                mode &= ~S_IWUGO;

        /* Apply the matching mask and file-type bits. */
        if (is_dir)
                return (mode & ~sbi->options.fs_dmask) | S_IFDIR;

        return (mode & ~sbi->options.fs_fmask) | S_IFREG;
}

/* Return the FAT attribute byte for this inode */
static inline u8 fat_make_attrs(struct inode *inode)
{
        /* Start from the attribute bits saved in the FAT inode. */
        u8 result = MSDOS_I(inode)->i_attrs;

        if (S_ISDIR(inode->i_mode))
                result |= ATTR_DIR;

        /* ATTR_RO comes from the mode only when the mode can represent it. */
        if (fat_mode_can_hold_ro(inode)) {
                if (!(inode->i_mode & S_IWUGO))
                        result |= ATTR_RO;
        }

        return result;
}

/*
 * Store the attribute bits that ->i_mode cannot represent in ->i_attrs.
 * ATTR_RO is kept there only when the mode cannot hold it.
 */
static inline void fat_save_attrs(struct inode *inode, u8 attrs)
{
        int keep = ATTR_UNUSED;

        if (!fat_mode_can_hold_ro(inode))
                keep |= ATTR_RO;

        MSDOS_I(inode)->i_attrs = attrs & keep;
}

/*
 * Checksum over the 11 bytes name[0]..name[10]: for each further byte the
 * running 8-bit sum is rotated right by one bit and the byte is added.
 */
static inline unsigned char fat_checksum(const __u8 *name)
{
        unsigned char sum = name[0];
        int i;

        for (i = 1; i <= 10; i++)
                sum = (sum << 7) + (sum >> 1) + name[i];

        return sum;
}

static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus)
{
        return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus
                + sbi->data_start;
}

/*
 * Split directory-entry position @i_pos into a block number (*@blknr) and
 * the entry index inside that block (*@offset).  The mask below relies on
 * dir_per_block being a power of two (1 << dir_per_block_bits).
 */
static inline void fat_get_blknr_offset(struct msdos_sb_info *sbi,
                                loff_t i_pos, sector_t *blknr, int *offset)
{
        *blknr = i_pos >> sbi->dir_per_block_bits;
        *offset = i_pos & (sbi->dir_per_block - 1);
}

/*
 * Read the i_pos of @inode.  On 32-bit builds the read is done under
 * sbi->inode_hash_lock; on other builds no lock is taken.
 * NOTE(review): presumably the lock guards against a torn read of the
 * 64-bit value on 32-bit machines - confirm against the writers of i_pos.
 */
static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
                                        struct inode *inode)
{
        loff_t i_pos;
#if BITS_PER_LONG == 32
        spin_lock(&sbi->inode_hash_lock);
#endif
        i_pos = MSDOS_I(inode)->i_pos;
#if BITS_PER_LONG == 32
        spin_unlock(&sbi->inode_hash_lock);
#endif
        return i_pos;
}

/*
 * Convert @len 16-bit units stored low byte first at @src into wchar_t
 * values at @dst.  Big-endian builds assemble each value from its two
 * bytes; other builds copy the len * 2 bytes directly.
 */
static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len)
{
#ifdef __BIG_ENDIAN
        while (len--) {
                /* src[0] is the low byte, src[1] the high byte. */
                *dst++ = src[0] | (src[1] << 8);
                src += 2;
        }
#else
        memcpy(dst, src, len * 2);
#endif
}

/*
 * Return the start cluster recorded in directory entry @de.  The high 16
 * bits (de->starthi) are used only on FAT32.
 */
static inline int fat_get_start(const struct msdos_sb_info *sbi,
                                const struct msdos_dir_entry *de)
{
        int low = le16_to_cpu(de->start);

        if (!is_fat32(sbi))
                return low;

        return low | (le16_to_cpu(de->starthi) << 16);
}

/*
 * Store @cluster into directory entry @de as two little-endian 16-bit
 * halves: the low half in ->start, the high half in ->starthi.
 */
static inline void fat_set_start(struct msdos_dir_entry *de, int cluster)
{
        de->start   = cpu_to_le16(cluster);
        de->starthi = cpu_to_le16(cluster >> 16);
}

/*
 * Write @len characters from @src into @dst as two-byte little-endian
 * units.
 *
 * With __BIG_ENDIAN defined each character is split into its low and
 * high byte explicitly; otherwise 2 * @len bytes are copied verbatim.
 */
static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
{
#ifdef __BIG_ENDIAN
        size_t i;

        for (i = 0; i < len; i++) {
                dst[2 * i] = src[i] & 0x00FF;
                dst[2 * i + 1] = (src[i] & 0xFF00) >> 8;
        }
#else
        memcpy(dst, src, len * 2);
#endif
}

/* fat/cache.c */
extern void fat_cache_inval_inode(struct inode *inode);
extern int fat_get_cluster(struct inode *inode, int cluster,
                           int *fclus, int *dclus);
extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
                                  sector_t last_block,
                                  unsigned long *mapped_blocks, sector_t *bmap);
extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
                    unsigned long *mapped_blocks, int create, bool from_bmap);

/* fat/dir.c */
extern const struct file_operations fat_dir_operations;
extern int fat_search_long(struct inode *inode, const unsigned char *name,
                           int name_len, struct fat_slot_info *sinfo);
extern int fat_dir_empty(struct inode *dir);
extern int fat_subdirs(struct inode *dir);
extern int fat_scan(struct inode *dir, const unsigned char *name,
                    struct fat_slot_info *sinfo);
extern int fat_scan_logstart(struct inode *dir, int i_logstart,
                             struct fat_slot_info *sinfo);
extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
                                struct msdos_dir_entry **de);
extern int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts);
extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
                           struct fat_slot_info *sinfo);
extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);

/* fat/fatent.c */
/*
 * State for working with one FAT entry.  Initialised by fatent_init(),
 * retargeted by fatent_set_entry() and released by fatent_brelse().
 */
struct fat_entry {
        int entry;                /* entry index; set by fatent_set_entry() */
        union {
                /* NOTE(review): presumably one pointer form per FAT width (12/16/32) - confirm */
                u8 *ent12_p[2];
                __le16 *ent16_p;
                __le32 *ent32_p;
        } u;
        int nr_bhs;                /* number of bhs[] slots fatent_brelse() releases */
        struct buffer_head *bhs[2];
        struct inode *fat_inode;
};

/*
 * Put @fatent into its empty state: entry 0, no entry pointer, no
 * buffer heads and no inode.
 */
static inline void fatent_init(struct fat_entry *fatent)
{
        fatent->entry = 0;
        fatent->u.ent32_p = NULL;

        fatent->nr_bhs = 0;
        fatent->bhs[0] = NULL;
        fatent->bhs[1] = NULL;

        fatent->fat_inode = NULL;
}

/*
 * Point @fatent at entry number @entry and clear the cached entry
 * pointer.
 */
static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
{
        fatent->u.ent32_p = NULL;
        fatent->entry = entry;
}

/*
 * Release every buffer head recorded in @fatent (the first nr_bhs slots
 * of bhs[]) and reset the pointer, buffer-head and inode fields.  The
 * entry number itself is left untouched.
 */
static inline void fatent_brelse(struct fat_entry *fatent)
{
        int idx = 0;

        fatent->u.ent32_p = NULL;

        while (idx < fatent->nr_bhs) {
                brelse(fatent->bhs[idx]);
                idx++;
        }

        fatent->nr_bhs = 0;
        fatent->bhs[0] = NULL;
        fatent->bhs[1] = NULL;
        fatent->fat_inode = NULL;
}

/*
 * Return true when @entry lies in [FAT_START_ENT, sbi->max_cluster).
 */
static inline bool fat_valid_entry(struct msdos_sb_info *sbi, int entry)
{
        if (entry < FAT_START_ENT)
                return false;

        return entry < sbi->max_cluster;
}

extern void fat_ent_access_init(struct super_block *sb);
extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent,
                        int entry);
extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
                         int new, int wait);
extern int fat_alloc_clusters(struct inode *inode, int *cluster,
                              int nr_cluster);
extern int fat_free_clusters(struct inode *inode, int cluster);
extern int fat_count_free_clusters(struct super_block *sb);
extern int fat_trim_fs(struct inode *inode, struct fstrim_range *range);

/* fat/file.c */
extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
                              unsigned long arg);
extern const struct file_operations fat_file_operations;
extern const struct inode_operations fat_file_inode_operations;
extern int fat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                       struct iattr *attr);
extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
extern int fat_getattr(struct mnt_idmap *idmap,
                       const struct path *path, struct kstat *stat,
                       u32 request_mask, unsigned int flags);
extern int fat_file_fsync(struct file *file, loff_t start, loff_t end,
                          int datasync);

/* fat/inode.c */
extern int fat_block_truncate_page(struct inode *inode, loff_t from);
extern void fat_attach(struct inode *inode, loff_t i_pos);
extern void fat_detach(struct inode *inode);
extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
extern struct inode *fat_build_inode(struct super_block *sb,
                        struct msdos_dir_entry *de, loff_t i_pos);
extern int fat_sync_inode(struct inode *inode);
extern int fat_fill_super(struct super_block *sb, void *data, int silent,
                          int isvfat, void (*setup)(struct super_block *));
extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de);

extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
                            struct inode *i2);
/* Hash @logstart down to FAT_HASH_BITS bits using hash_32(). */
static inline unsigned long fat_dir_hash(int logstart)
{
        unsigned long bucket = hash_32(logstart, FAT_HASH_BITS);

        return bucket;
}
extern int fat_add_cluster(struct inode *inode);

/* fat/misc.c */
/*
 * __fat_fs_error() takes a printf-style format (format is argument 3,
 * variadic arguments start at 4) plus a "report" flag.
 */
extern __printf(3, 4) __cold
void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...);
/* Call __fat_fs_error() with report fixed to 1. */
#define fat_fs_error(sb, fmt, args...)                \
        __fat_fs_error(sb, 1, fmt , ## args)
/*
 * Call __fat_fs_error() with report set to the result of __ratelimit()
 * on the super block's FAT-private ratelimit state.
 */
#define fat_fs_error_ratelimit(sb, fmt, args...) \
        __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)

/* Prefix format handed to printk_index_subsys_emit() by fat_msg(). */
#define FAT_PRINTK_PREFIX "%sFAT-fs (%s): "
/*
 * Emit a printk index record for the message, then pass the message to
 * _fat_msg().
 */
#define fat_msg(sb, level, fmt, args...)                                \
do {                                                                        \
        printk_index_subsys_emit(FAT_PRINTK_PREFIX, level, fmt, ##args);\
        _fat_msg(sb, level, fmt, ##args);                                \
} while (0)
/* printf-style: format is argument 3, variadic arguments start at 4. */
__printf(3, 4) __cold
void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...);
/* fat_msg(), issued only when the per-super-block __ratelimit() check passes. */
#define fat_msg_ratelimit(sb, level, fmt, args...)        \
        do {        \
                        if (__ratelimit(&MSDOS_SB(sb)->ratelimit))        \
                                fat_msg(sb, level, fmt, ## args);        \
         } while (0)
extern int fat_clusters_flush(struct super_block *sb);
extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts,
                              __le16 __time, __le16 __date, u8 time_cs);
extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts,
                              __le16 *time, __le16 *date, u8 *time_cs);
extern struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi,
                                            const struct timespec64 *ts);
extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi,
                                            const struct timespec64 *ts);
extern int fat_truncate_time(struct inode *inode, struct timespec64 *now,
                             int flags);
extern int fat_update_time(struct inode *inode, int flags);
extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);

int fat_cache_init(void);
void fat_cache_destroy(void);

/* fat/nfs.c */
extern const struct export_operations fat_export_ops;
extern const struct export_operations fat_export_ops_nostale;

/* helper for printk */
typedef unsigned long long        llu;

#endif /* !_FAT_H */


































































































































































































    3 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAN_H
#define _LINUX_MMAN_H

#include <linux/mm.h>
#include <linux/percpu_counter.h>

#include <linux/atomic.h>
#include <uapi/linux/mman.h>

/*
 * Arrange for legacy / undefined architecture specific flags to be
 * ignored by mmap handling code.
 *
 * Each flag that is not already defined is given the value 0, so OR-ing
 * it into a mask adds no bits and testing a value against it is always
 * false.
 */
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif
#ifndef MAP_ABOVE4G
#define MAP_ABOVE4G 0
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB 0
#endif
#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB 0
#endif
#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0
#endif

/*
 * The historical set of flags that all mmap implementations implicitly
 * support when a ->mmap_validate() op is not provided in file_operations.
 *
 * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the
 * kernel.
 *
 * Note that MAP_SYNC is not part of this mask.
 */
#define LEGACY_MAP_MASK (MAP_SHARED \
                | MAP_PRIVATE \
                | MAP_FIXED \
                | MAP_ANONYMOUS \
                | MAP_DENYWRITE \
                | MAP_EXECUTABLE \
                | MAP_UNINITIALIZED \
                | MAP_GROWSDOWN \
                | MAP_LOCKED \
                | MAP_NORESERVE \
                | MAP_POPULATE \
                | MAP_NONBLOCK \
                | MAP_STACK \
                | MAP_HUGETLB \
                | MAP_32BIT \
                | MAP_ABOVE4G \
                | MAP_HUGE_2MB \
                | MAP_HUGE_1GB)

extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;
extern struct percpu_counter vm_committed_as;

/*
 * With CONFIG_SMP the batch size is an external variable and
 * mm_compute_batch() is an external function.  Without it the batch is
 * the constant 0 and mm_compute_batch() is an empty stub.
 */
#ifdef CONFIG_SMP
extern s32 vm_committed_as_batch;
extern void mm_compute_batch(int overcommit_policy);
#else
#define vm_committed_as_batch 0
static inline void mm_compute_batch(int overcommit_policy)
{
}
#endif

unsigned long vm_memory_committed(void);

/*
 * Add @pages (which may be negative) to the vm_committed_as per-cpu
 * counter, using vm_committed_as_batch as the batch size.
 */
static inline void vm_acct_memory(long pages)
{
        struct percpu_counter *counter = &vm_committed_as;

        percpu_counter_add_batch(counter, pages, vm_committed_as_batch);
}

static inline void vm_unacct_memory(long pages)
{
        vm_acct_memory(-pages);
}

/*
 * Allow architectures to handle additional protection and flag bits. The
 * overriding macros must be defined in the arch-specific asm/mman.h file.
 *
 * When an architecture provides no override, the fallbacks below
 * evaluate to 0, i.e. they contribute no extra bits.
 */

#ifndef arch_calc_vm_prot_bits
#define arch_calc_vm_prot_bits(prot, pkey) 0
#endif

#ifndef arch_calc_vm_flag_bits
#define arch_calc_vm_flag_bits(flags) 0
#endif

#ifndef arch_validate_prot
/*
 * Generic prot validation, used when the architecture does not supply
 * its own arch_validate_prot.
 *
 * Called from mprotect() after PROT_GROWSDOWN and PROT_GROWSUP have been
 * masked out.  @addr is not used here.
 *
 * Returns true when @prot contains no bits other than PROT_READ,
 * PROT_WRITE, PROT_EXEC and PROT_SEM.
 */
static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
{
        const unsigned long known = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM;

        return !(prot & ~known);
}
#define arch_validate_prot arch_validate_prot
#endif

#ifndef arch_validate_flags
/*
 * This is called from mmap() and mprotect() with the updated vma->vm_flags.
 *
 * Generic fallback, compiled only when the architecture has not defined
 * arch_validate_flags itself; it accepts every value of @flags.
 *
 * Returns true if the VM_* flags are valid.
 */
static inline bool arch_validate_flags(unsigned long flags)
{
        return true;
}
#define arch_validate_flags arch_validate_flags
#endif

/*
 * Optimisation macro.  It is equivalent to:
 *      (x & bit1) ? bit2 : 0
 * but this version is faster.
 * ("bit1" and "bit2" must be single bits)
 *
 * If either bit is 0 the result is 0.  Otherwise the masked value
 * (x & bit1) is moved to bit2's position: multiplied by bit2 / bit1 when
 * bit1 <= bit2, divided by bit1 / bit2 when bit1 > bit2.
 */
#define _calc_vm_trans(x, bit1, bit2) \
  ((!(bit1) || !(bit2)) ? 0 : \
  ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
   : ((x) & (bit1)) / ((bit1) / (bit2))))

/*
 * Translate the mmap "prot" argument into the "vm_flags" bits used
 * internally: PROT_READ/PROT_WRITE/PROT_EXEC map to
 * VM_READ/VM_WRITE/VM_EXEC, and any bits computed by
 * arch_calc_vm_prot_bits() for @prot and @pkey are OR-ed in as well.
 */
static inline unsigned long
calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
{
        unsigned long vm = arch_calc_vm_prot_bits(prot, pkey);

        vm |= _calc_vm_trans(prot, PROT_READ,  VM_READ);
        vm |= _calc_vm_trans(prot, PROT_WRITE, VM_WRITE);
        vm |= _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC);

        return vm;
}

/*
 * Translate the mmap "flags" argument into the "vm_flags" bits used
 * internally: MAP_GROWSDOWN, MAP_LOCKED, MAP_SYNC and MAP_STACK map to
 * VM_GROWSDOWN, VM_LOCKED, VM_SYNC and VM_NOHUGEPAGE respectively, and
 * any bits computed by arch_calc_vm_flag_bits() are OR-ed in as well.
 */
static inline unsigned long
calc_vm_flag_bits(unsigned long flags)
{
        unsigned long vm = arch_calc_vm_flag_bits(flags);

        vm |= _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN);
        vm |= _calc_vm_trans(flags, MAP_LOCKED,    VM_LOCKED);
        vm |= _calc_vm_trans(flags, MAP_SYNC,      VM_SYNC);
        vm |= _calc_vm_trans(flags, MAP_STACK,     VM_NOHUGEPAGE);

        return vm;
}

unsigned long vm_commit_limit(void);

#ifndef arch_memory_deny_write_exec_supported
/*
 * Generic fallback, compiled only when the architecture has not defined
 * arch_memory_deny_write_exec_supported itself; it always returns true.
 */
static inline bool arch_memory_deny_write_exec_supported(void)
{
        return true;
}
#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported
#endif

/*
 * Denies creating a writable executable mapping or gaining executable
 * permissions.  Nothing is denied unless MMF_HAS_MDWE is set in the
 * current mm's flags.
 *
 * This denies the following:
 *
 *        a)        mmap(PROT_WRITE | PROT_EXEC)
 *
 *        b)        mmap(PROT_WRITE)
 *                mprotect(PROT_EXEC)
 *
 *        c)        mmap(PROT_WRITE)
 *                mprotect(PROT_READ)
 *                mprotect(PROT_EXEC)
 *
 * But allows the following:
 *
 *        d)        mmap(PROT_READ | PROT_EXEC)
 *                mmap(PROT_READ | PROT_EXEC | PROT_BTI)
 *
 * Returns true when the requested @vm_flags must be refused for @vma.
 */
static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
{
        if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
                return false;

        /* Only requests that ask for VM_EXEC can ever be denied. */
        if (!(vm_flags & VM_EXEC))
                return false;

        /* Executable and writable at once. */
        if (vm_flags & VM_WRITE)
                return true;

        /* Executable requested on a vma that is not executable yet. */
        return !(vma->vm_flags & VM_EXEC);
}

#endif /* _LINUX_MMAN_H */




































































    3 













    3 























    3 






    3 





















    3 









    3 











    3 




    3 
    3 

    3 

























































































































































































































































































































































































































































































































































    1 


















    1 










    1 

















    1 



































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/vfat/namei.c
 *
 *  Written 1992,1993 by Werner Almesberger
 *
 *  Windows95/Windows NT compatible extended MSDOS filesystem
 *    by Gordon Chaffee Copyright (C) 1995.  Send bug reports for the
 *    VFAT filesystem to <chaffee@cs.berkeley.edu>.  Specify
 *    what file operation caused you trouble and if you can duplicate
 *    the problem, send a script that demonstrates it.
 *
 *  Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de>
 *
 *  Support Multibyte characters and cleanup by
 *                                OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
 */

#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/kernel.h>
#include <linux/iversion.h>
#include "fat.h"

/* Return the version number stored in @dentry's d_fsdata pointer. */
static inline unsigned long vfat_d_version(struct dentry *dentry)
{
        void *cookie = dentry->d_fsdata;

        return (unsigned long) cookie;
}

/* Store @version in @dentry's d_fsdata pointer. */
static inline void vfat_d_version_set(struct dentry *dentry,
                                      unsigned long version)
{
        void *cookie = (void *) version;

        dentry->d_fsdata = cookie;
}

/*
 * If a new entry was created in the parent, it could have created an 8.3
 * alias (the shortname of a longname).  So the parent may hold a negative
 * dentry which matches the created 8.3 alias.
 *
 * If that happened, the negative dentry isn't actually negative anymore,
 * so drop it.
 *
 * Returns 1 when the parent inode's i_version still equals the version
 * recorded in @dentry, 0 otherwise.  The comparison is made under
 * dentry->d_lock.
 */
static int vfat_revalidate_shortname(struct dentry *dentry)
{
        bool unchanged;

        spin_lock(&dentry->d_lock);
        unchanged = inode_eq_iversion(d_inode(dentry->d_parent),
                                      vfat_d_version(dentry));
        spin_unlock(&dentry->d_lock);

        return unchanged ? 1 : 0;
}

/*
 * Revalidate a dentry for the case-sensitive dentry operations.
 *
 * Returns -ECHILD under LOOKUP_RCU, 1 for a positive dentry, and the
 * result of vfat_revalidate_shortname() for a negative one.
 */
static int vfat_revalidate(struct dentry *dentry, unsigned int flags)
{
        if (flags & LOOKUP_RCU)
                return -ECHILD;

        /* A positive dentry is always valid; only negative ones are checked. */
        return d_really_is_positive(dentry) ?
                1 : vfat_revalidate_shortname(dentry);
}

/*
 * Revalidate a dentry for the case-insensitive dentry operations.
 *
 * Returns -ECHILD under LOOKUP_RCU, 1 for a positive dentry, 0 when the
 * negative dentry must be dropped, and otherwise the result of
 * vfat_revalidate_shortname().
 */
static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags)
{
        const unsigned int creating = LOOKUP_CREATE | LOOKUP_RENAME_TARGET;

        if (flags & LOOKUP_RCU)
                return -ECHILD;

        /*
         * This is not negative dentry. Always valid.
         *
         * Note, rename() to existing directory entry will have ->d_inode,
         * and will use existing name which isn't specified name by user.
         *
         * We may be able to drop this positive dentry here. But dropping
         * positive dentry isn't good idea. So it's unsupported like
         * rename("filename", "FILENAME") for now.
         */
        if (d_really_is_positive(dentry))
                return 1;

        /*
         * Drop the negative dentry in two cases:
         *
         * - no flags at all: this may be nfsd (or something) and we can't
         *   see the intent, so it could be for creation;
         * - creation or rename target: make sure the case sensitive name
         *   specified by the user is the one that gets used.
         */
        if (!flags || (flags & creating))
                return 0;

        return vfat_revalidate_shortname(dentry);
}

/* Return the length of @name (@len bytes) once trailing dots are ignored. */
static unsigned int __vfat_striptail_len(unsigned int len, const char *name)
{
        for (; len > 0; len--) {
                if (name[len - 1] != '.')
                        break;
        }
        return len;
}

/* Return the length of @qstr's name once trailing dots are ignored. */
static unsigned int vfat_striptail_len(const struct qstr *qstr)
{
        unsigned int full_len = qstr->len;

        return __vfat_striptail_len(full_len, qstr->name);
}

/*
 * Compute the hash for the vfat name in @qstr: the name is hashed with
 * full_name_hash() over its length with trailing dots ignored, and the
 * result is stored in qstr->hash.  Always returns 0.
 */
static int vfat_hash(const struct dentry *dentry, struct qstr *qstr)
{
        unsigned int stripped = vfat_striptail_len(qstr);

        qstr->hash = full_name_hash(dentry, qstr->name, stripped);

        return 0;
}

/*
 * Compute the case-insensitive hash for the vfat name in @qstr: every
 * byte of the name (trailing dots ignored) is lower-cased through the
 * super block's nls_io table before being folded into the hash, and the
 * result is stored in qstr->hash.  Always returns 0.
 */
static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr)
{
        struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
        const unsigned char *name = qstr->name;
        unsigned int len = vfat_striptail_len(qstr);
        unsigned long hash = init_name_hash(dentry);
        unsigned int i;

        for (i = 0; i < len; i++)
                hash = partial_name_hash(nls_tolower(t, name[i]), hash);

        qstr->hash = end_name_hash(hash);
        return 0;
}

/*
 * Case insensitive compare of two vfat names.
 *
 * Returns 0 when @str (@len bytes) and @name are equal under
 * nls_strnicmp() once trailing dots are ignored on both, 1 otherwise.
 */
static int vfat_cmpi(const struct dentry *dentry,
                unsigned int len, const char *str, const struct qstr *name)
{
        struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
        unsigned int alen, blen;

        /* A filename cannot end in '.' or we treat it like it has none */
        alen = vfat_striptail_len(name);
        blen = __vfat_striptail_len(len, str);
        if (alen != blen)
                return 1;

        return nls_strnicmp(t, name->name, str, alen) != 0;
}

/*
 * Case sensitive compare of two vfat names.
 *
 * Returns 0 when @str (@len bytes) and @name are byte-for-byte equal
 * once trailing dots are ignored on both, 1 otherwise.
 */
static int vfat_cmp(const struct dentry *dentry,
                unsigned int len, const char *str, const struct qstr *name)
{
        unsigned int alen, blen;

        /* A filename cannot end in '.' or we treat it like it has none */
        alen = vfat_striptail_len(name);
        blen = __vfat_striptail_len(len, str);
        if (alen != blen)
                return 1;

        return strncmp(name->name, str, alen) != 0;
}

/*
 * Dentry operations built from the case-insensitive helpers:
 * vfat_revalidate_ci(), vfat_hashi() and vfat_cmpi().
 */
static const struct dentry_operations vfat_ci_dentry_ops = {
        .d_revalidate        = vfat_revalidate_ci,
        .d_hash                = vfat_hashi,
        .d_compare        = vfat_cmpi,
};

/*
 * Dentry operations built from the case-sensitive helpers:
 * vfat_revalidate(), vfat_hash() and vfat_cmp().
 */
static const struct dentry_operations vfat_dentry_ops = {
        .d_revalidate        = vfat_revalidate,
        .d_hash                = vfat_hash,
        .d_compare        = vfat_cmp,
};

/*
 * Characters that are undesirable in an MS-DOS file name: anything below
 * 0x0020, and any of  * ? < > | " : / \
 */
static inline bool vfat_bad_char(wchar_t w)
{
        if (w < 0x0020)
                return true;

        switch (w) {
        case '*':
        case '?':
        case '<':
        case '>':
        case '|':
        case '"':
        case ':':
        case '/':
        case '\\':
                return true;
        default:
                return false;
        }
}

/* Return true for the characters  [ ] ; , + = */
static inline bool vfat_replace_char(wchar_t w)
{
        switch (w) {
        case '[':
        case ']':
        case ';':
        case ',':
        case '+':
        case '=':
                return true;
        default:
                return false;
        }
}

/*
 * Return true for the characters '.' and ' '.
 *
 * Declared bool to match the other character predicates in this file;
 * the returned values are still exactly 0 or 1.
 */
static bool vfat_skip_char(wchar_t w)
{
        return (w == '.') || (w == ' ');
}

/*
 * Check the @len characters at @s for characters rejected by
 * vfat_bad_char() and for a trailing space.
 *
 * Returns -EINVAL when a bad character is present or the last character
 * is a space, 0 otherwise.  When @len is zero or negative there is no
 * last character, so nothing is inspected and 0 is returned.
 */
static inline int vfat_is_used_badchars(const wchar_t *s, int len)
{
        int i;

        for (i = 0; i < len; i++)
                if (vfat_bad_char(s[i]))
                        return -EINVAL;

        /*
         * Last character cannot be space.  Guard on i so that an empty
         * name does not read s[-1].
         */
        if (i > 0 && s[i - 1] == ' ')
                return -EINVAL;

        return 0;
}

/*
 * Look @name up in @dir with fat_scan().
 *
 * Returns 0 when fat_scan() succeeds (the buffer head it returned in the
 * slot info is released) and -ENOENT for any fat_scan() failure.
 */
static int vfat_find_form(struct inode *dir, unsigned char *name)
{
        struct fat_slot_info sinfo;

        if (fat_scan(dir, name, &sinfo) != 0)
                return -ENOENT;

        brelse(sinfo.bh);
        return 0;
}

/*
 * 1) Valid characters for the 8.3 format alias are any combination of
 * letters (uppercase alphabets), digits, and any of the following
 * special characters:
 *     $ % ' ` - @ { } ~ ! # ( ) & _ ^
 * In this case the long filename is not stored on disk.
 *
 * WinNT's Extension:
 * The file name and the extension name each contain only uppercase or
 * only lowercase letters.  This is expressed by CASE_LOWER_BASE and
 * CASE_LOWER_EXT.
 *
 * 2) The file name is in 8.3 format, but it contains both uppercase and
 * lowercase chars, multi-byte chars, etc.  In this case the numtail is
 * not added, but the long filename is stored.
 *
 * 3) In any case other than the above, or when any of the following
 * special characters is contained:
 *        .   [ ] ; , + =
 * the numtail is added, and the long filename must be stored on disk.
 */
/*
 * Flags accumulated while converting a long name into a short name.
 * All three start at 1 (see INIT_SHORTNAME_INFO) and are only ever
 * cleared by to_shortname_char().
 */
struct shortname_info {
        unsigned char lower:1,        /* cleared when an already-uppercase letter, a byte >= 0x7F or a multi-byte char is seen */
                      upper:1,        /* cleared when a letter changed by upper-casing, a byte >= 0x7F or a multi-byte char is seen */
                      valid:1;        /* cleared when a char is skipped, replaced by '_' or cannot be converted */
};
/* Set all three flags of the shortname_info pointed to by x to 1. */
#define INIT_SHORTNAME_INFO(x)        do {                \
        (x)->lower = 1;                                \
        (x)->upper = 1;                                \
        (x)->valid = 1;                                \
} while (0)

/*
 * Convert the single character *@src into its short-name form in @buf
 * (at most @buf_size bytes) and update @info accordingly.
 *
 * - Characters matched by vfat_skip_char() produce no output and clear
 *   info->valid.
 * - Characters matched by vfat_replace_char(), and characters that
 *   nls->uni2char() cannot convert, become '_' and clear info->valid.
 * - A single-byte result is upper-cased with nls_toupper(); info->lower
 *   and info->upper record whether the letter was already uppercase or
 *   had to be changed.  Bytes >= 0x7F clear both flags.
 * - A multi-byte result is stored as is and clears both case flags.
 *
 * Returns the number of bytes written to @buf (0 for a skipped char).
 */
static inline int to_shortname_char(struct nls_table *nls,
                                    unsigned char *buf, int buf_size,
                                    wchar_t *src, struct shortname_info *info)
{
        unsigned char before;
        int nbytes;

        if (vfat_skip_char(*src)) {
                info->valid = 0;
                return 0;
        }
        if (vfat_replace_char(*src)) {
                info->valid = 0;
                buf[0] = '_';
                return 1;
        }

        nbytes = nls->uni2char(*src, buf, buf_size);

        /* Not representable in this charset: substitute '_'. */
        if (nbytes <= 0) {
                info->valid = 0;
                buf[0] = '_';
                return 1;
        }

        /* Multi-byte result: no case information can be kept. */
        if (nbytes > 1) {
                info->lower = 0;
                info->upper = 0;
                return nbytes;
        }

        /* Single-byte result. */
        before = buf[0];
        if (before >= 0x7F) {
                info->lower = 0;
                info->upper = 0;
        }

        buf[0] = nls_toupper(nls, before);
        if (!isalpha(buf[0]))
                return 1;

        if (buf[0] == before)
                info->lower = 0;        /* was already uppercase */
        else
                info->upper = 0;        /* upper-casing changed it */

        return 1;
}

/*
 * Given a valid longname, create a unique shortname.  Make sure the
 * shortname does not exist.
 *
 * @dir:      directory checked for collisions through vfat_find_form()
 * @nls:      table used to convert each long-name unit to short-name bytes
 * @uname:    the long name as an array of wchar_t
 * @ulen:     number of units in @uname
 * @name_res: receives the MSDOS_NAME-byte, space-padded 8.3 name
 * @lcase:    receives CASE_LOWER_BASE / CASE_LOWER_EXT (WinNT mode only)
 *
 * Returns negative number on error, 0 for a normal
 * return, and 1 for valid shortname (the caller, vfat_build_slots(), then
 * emits only the 8.3 entry and no long-name slots).
 */
static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
                                 wchar_t *uname, int ulen,
                                 unsigned char *name_res, unsigned char *lcase)
{
        struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
        wchar_t *ip, *ext_start, *end, *name_start;
        unsigned char base[9], ext[4], buf[5], *p;
        unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
        int chl, chi;
        int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
        int is_shortname;
        struct shortname_info base_info, ext_info;

        is_shortname = 1;
        INIT_SHORTNAME_INFO(&base_info);
        INIT_SHORTNAME_INFO(&ext_info);

        /* Now, we need to create a shortname from the long name */
        /* Scan backwards for the last '.'; a trailing dot means no extension. */
        ext_start = end = &uname[ulen];
        while (--ext_start >= uname) {
                if (*ext_start == 0x002E) {        /* is `.' */
                        if (ext_start == end - 1) {
                                sz = ulen;
                                ext_start = NULL;
                        }
                        break;
                }
        }

        /* ext_start == uname - 1: the scan ran off the front, no dot found. */
        if (ext_start == uname - 1) {
                sz = ulen;
                ext_start = NULL;
        } else if (ext_start) {
                /*
                 * Names which start with a dot could be just
                 * an extension eg. "...test".  In this case Win95
                 * uses the extension as the name and sets no extension.
                 */
                name_start = &uname[0];
                while (name_start < ext_start) {
                        if (!vfat_skip_char(*name_start))
                                break;
                        name_start++;
                }
                if (name_start != ext_start) {
                        /* sz = length of the base part; step past the dot. */
                        sz = ext_start - uname;
                        ext_start++;
                } else {
                        sz = ulen;
                        ext_start = NULL;
                }
        }

        /*
         * Build the base (at most 8 bytes).  numtail_baselen and
         * numtail2_baselen remember where to cut the base for the "~N"
         * and "XXXX~N" tails so that a multi-byte character straddling
         * byte 6 (resp. byte 2) is not split.
         */
        numtail_baselen = 6;
        numtail2_baselen = 2;
        for (baselen = i = 0, p = base, ip = uname; i < sz; i++, ip++) {
                chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
                                        ip, &base_info);
                if (chl == 0)
                        continue;

                if (baselen < 2 && (baselen + chl) > 2)
                        numtail2_baselen = baselen;
                if (baselen < 6 && (baselen + chl) > 6)
                        numtail_baselen = baselen;
                for (chi = 0; chi < chl; chi++) {
                        *p++ = charbuf[chi];
                        baselen++;
                        if (baselen >= 8)
                                break;
                }
                if (baselen >= 8) {
                        /* Truncated mid-character or before the base ended. */
                        if ((chi < chl - 1) || (ip + 1) - uname < sz)
                                is_shortname = 0;
                        break;
                }
        }
        if (baselen == 0) {
                return -EINVAL;
        }

        /* Build the extension (at most 3 bytes), never splitting a character. */
        extlen = 0;
        if (ext_start) {
                for (p = ext, ip = ext_start; extlen < 3 && ip < end; ip++) {
                        chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
                                                ip, &ext_info);
                        if (chl == 0)
                                continue;

                        if ((extlen + chl) > 3) {
                                is_shortname = 0;
                                break;
                        }
                        for (chi = 0; chi < chl; chi++) {
                                *p++ = charbuf[chi];
                                extlen++;
                        }
                        if (extlen >= 3) {
                                /* Characters remain past the 3-byte limit. */
                                if (ip + 1 != end)
                                        is_shortname = 0;
                                break;
                        }
                }
        }
        ext[extlen] = '\0';
        base[baselen] = '\0';

        /* Yes, it can happen. ".\xe5" would do it. */
        if (base[0] == DELETED_FLAG)
                base[0] = 0x05;

        /* OK, at this point we know that base is not longer than 8 symbols,
         * ext is not longer than 3, base is nonempty, both don't contain
         * any bad symbols (lowercase transformed to uppercase).
         */

        memset(name_res, ' ', MSDOS_NAME);
        memcpy(name_res, base, baselen);
        memcpy(name_res + 8, ext, extlen);
        *lcase = 0;
        /* The long name fits 8.3 without loss: try to use it as is. */
        if (is_shortname && base_info.valid && ext_info.valid) {
                if (vfat_find_form(dir, name_res) == 0)
                        return -EEXIST;

                if (opts->shortname & VFAT_SFN_CREATE_WIN95) {
                        /* Win95 mode: short name alone only if all upper case. */
                        return (base_info.upper && ext_info.upper);
                } else if (opts->shortname & VFAT_SFN_CREATE_WINNT) {
                        /*
                         * WinNT mode: each part must be uniformly cased;
                         * lower-case parts are recorded in *lcase.
                         */
                        if ((base_info.upper || base_info.lower) &&
                            (ext_info.upper || ext_info.lower)) {
                                if (!base_info.upper && base_info.lower)
                                        *lcase |= CASE_LOWER_BASE;
                                if (!ext_info.upper && ext_info.lower)
                                        *lcase |= CASE_LOWER_EXT;
                                return 1;
                        }
                        return 0;
                } else {
                        BUG();
                }
        }

        /* With numtail disabled, use the plain name if it is not taken. */
        if (opts->numtail == 0)
                if (vfat_find_form(dir, name_res) < 0)
                        return 0;

        /*
         * Try to find a unique extension.  This used to
         * iterate through all possibilities sequentially,
         * but that gave extremely bad performance.  Windows
         * only tries a few cases before using random
         * values for part of the base.
         */

        /* First attempt: BASE~1 .. BASE~9 with the base cut to fit. */
        if (baselen > 6) {
                baselen = numtail_baselen;
                name_res[7] = ' ';
        }
        name_res[baselen] = '~';
        for (i = 1; i < 10; i++) {
                name_res[baselen + 1] = i + '0';
                if (vfat_find_form(dir, name_res) < 0)
                        return 0;
        }

        /*
         * Fallback: BBXXXX~N, where XXXX is four hex digits seeded from
         * jiffies and N is '1' plus three further jiffies bits.
         */
        i = jiffies;
        sz = (jiffies >> 16) & 0x7;
        if (baselen > 2) {
                baselen = numtail2_baselen;
                name_res[7] = ' ';
        }
        name_res[baselen + 4] = '~';
        name_res[baselen + 5] = '1' + sz;
        while (1) {
                snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
                memcpy(&name_res[baselen], buf, 4);
                if (vfat_find_form(dir, name_res) < 0)
                        break;
                /* Collision: step the hex part and retry. */
                i -= 11;
        }
        return 0;
}

/*
 * Translate a string, including coded sequences into Unicode
 *
 * @name/@len: input bytes
 * @outname:   output buffer, written as wchar_t units
 * @longlen:   receives the number of units before padding
 * @outlen:    receives the number of units after padding to a multiple of 13
 * @escape:    when set, ":XXXX" (colon + 4 hex digits) encodes one unit
 * @utf8:      when set, the input is converted with utf8s_to_utf16s()
 * @nls:       table whose char2uni() converts the non-utf8 input
 *
 * Returns 0 on success, -EINVAL on a bad sequence, -ENAMETOOLONG when the
 * name exceeds FAT_LFN_LEN units, or the negative utf8s_to_utf16s() result.
 */
static int
xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
             int *longlen, int *outlen, int escape, int utf8,
             struct nls_table *nls)
{
        const unsigned char *ip;
        unsigned char *op;
        int i, fill;
        int charlen;

        if (utf8) {
                /* Allow FAT_LFN_LEN + 2 units so an over-long name is seen. */
                *outlen = utf8s_to_utf16s(name, len, UTF16_HOST_ENDIAN,
                                (wchar_t *) outname, FAT_LFN_LEN + 2);
                if (*outlen < 0)
                        return *outlen;
                else if (*outlen > FAT_LFN_LEN)
                        return -ENAMETOOLONG;

                op = &outname[*outlen * sizeof(wchar_t)];
        } else {
                /*
                 * One output unit per iteration; op advances by 2 bytes.
                 * NOTE(review): this presumes sizeof(wchar_t) == 2 in this
                 * build - confirm against the compiler flags.
                 */
                for (i = 0, ip = name, op = outname, *outlen = 0;
                         i < len && *outlen < FAT_LFN_LEN;
                         *outlen += 1) {
                        if (escape && (*ip == ':')) {
                                u8 uc[2];

                                /* The escape needs 5 input bytes: ':' + 4 hex. */
                                if (i > len - 5)
                                        return -EINVAL;

                                if (hex2bin(uc, ip + 1, 2) < 0)
                                        return -EINVAL;

                                *(wchar_t *)op = uc[0] << 8 | uc[1];

                                op += 2;
                                ip += 5;
                                i += 5;
                        } else {
                                charlen = nls->char2uni(ip, len - i,
                                                        (wchar_t *)op);
                                if (charlen < 0)
                                        return -EINVAL;
                                ip += charlen;
                                i += charlen;
                                op += 2;
                        }
                }
                /* Input left over after FAT_LFN_LEN units: name too long. */
                if (i < len)
                        return -ENAMETOOLONG;
        }

        /*
         * Pad to a multiple of 13 units: one 0x0000 terminator followed
         * by 0xffff fill units.
         */
        *longlen = *outlen;
        if (*outlen % 13) {
                *op++ = 0;
                *op++ = 0;
                *outlen += 1;
                if (*outlen % 13) {
                        fill = 13 - (*outlen % 13);
                        for (i = 0; i < fill; i++) {
                                *op++ = 0xff;
                                *op++ = 0xff;
                        }
                        *outlen += fill;
                }
        }

        return 0;
}

/*
 * Build the directory slots for a new entry called @name in @dir: zero or
 * more long-name slots followed by one 8.3 entry, written into @slots.
 *
 * @is_dir selects ATTR_DIR vs ATTR_ARCH, @cluster becomes the start
 * cluster, and @ts supplies all timestamps.  *@nr_slots receives the total
 * number of slots produced (it is 0 on failure before the 8.3 entry).
 *
 * Returns 0 on success or a negative errno.
 */
static int vfat_build_slots(struct inode *dir, const unsigned char *name,
                            int len, int is_dir, int cluster,
                            struct timespec64 *ts,
                            struct msdos_dir_slot *slots, int *nr_slots)
{
        struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
        struct fat_mount_options *opts = &sbi->options;
        struct msdos_dir_slot *ps;
        struct msdos_dir_entry *de;
        unsigned char cksum, lcase;
        unsigned char msdos_name[MSDOS_NAME];
        wchar_t *uname;
        __le16 time, date;
        u8 time_cs;
        int err, ulen, usize, i;
        loff_t offset;

        *nr_slots = 0;

        /* Scratch buffer for the converted name; released at out_free. */
        uname = __getname();
        if (!uname)
                return -ENOMEM;

        /* ulen = units in the name, usize = units after padding to 13s. */
        err = xlate_to_uni(name, len, (unsigned char *)uname, &ulen, &usize,
                           opts->unicode_xlate, opts->utf8, sbi->nls_io);
        if (err)
                goto out_free;

        err = vfat_is_used_badchars(uname, ulen);
        if (err)
                goto out_free;

        err = vfat_create_shortname(dir, sbi->nls_disk, uname, ulen,
                                    msdos_name, &lcase);
        if (err < 0)
                goto out_free;
        else if (err == 1) {
                /* The 8.3 name is sufficient: no long-name slots at all. */
                de = (struct msdos_dir_entry *)slots;
                err = 0;
                goto shortname;
        }

        /* build the entry of long file name */
        cksum = fat_checksum(msdos_name);

        /*
         * 13 units per slot.  Slots are emitted highest sequence number
         * first: ps walks forward while the id i counts down to 1.
         */
        *nr_slots = usize / 13;
        for (ps = slots, i = *nr_slots; i > 0; i--, ps++) {
                ps->id = i;
                ps->attr = ATTR_EXT;
                ps->reserved = 0;
                ps->alias_checksum = cksum;
                ps->start = 0;
                offset = (i - 1) * 13;
                fatwchar_to16(ps->name0_4, uname + offset, 5);
                fatwchar_to16(ps->name5_10, uname + offset + 5, 6);
                fatwchar_to16(ps->name11_12, uname + offset + 11, 2);
        }
        /* Flag the first emitted slot (the highest id) with 0x40. */
        slots[0].id |= 0x40;
        /* The 8.3 entry goes right after the last long-name slot. */
        de = (struct msdos_dir_entry *)ps;

shortname:
        /* build the entry of 8.3 alias name */
        (*nr_slots)++;
        memcpy(de->name, msdos_name, MSDOS_NAME);
        de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
        de->lcase = lcase;
        fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
        de->time = de->ctime = time;
        de->date = de->cdate = de->adate = date;
        de->ctime_cs = time_cs;
        fat_set_start(de, cluster);
        de->size = 0;
out_free:
        __putname(uname);
        return err;
}

/*
 * Add a directory entry named @qname to @dir.
 *
 * The slots are built in a temporary array by vfat_build_slots() and
 * written with fat_add_entries(), which receives @sinfo.  On success the
 * directory's ctime/mtime are updated and the inode is synced or marked
 * dirty depending on IS_DIRSYNC().
 *
 * Returns 0 on success, -ENOENT for an empty (stripped) name, -ENOMEM, or
 * the error from the helpers.
 */
static int vfat_add_entry(struct inode *dir, const struct qstr *qname,
                          int is_dir, int cluster, struct timespec64 *ts,
                          struct fat_slot_info *sinfo)
{
        struct msdos_dir_slot *slots;
        unsigned int len = vfat_striptail_len(qname);
        int nr_slots;
        int err;

        if (!len)
                return -ENOENT;

        slots = kmalloc_array(MSDOS_SLOTS, sizeof(*slots), GFP_NOFS);
        if (!slots)
                return -ENOMEM;

        err = vfat_build_slots(dir, qname->name, len, is_dir, cluster, ts,
                               slots, &nr_slots);
        if (!err)
                err = fat_add_entries(dir, slots, nr_slots, sinfo);

        if (!err) {
                /* update timestamp */
                fat_truncate_time(dir, ts, S_CTIME|S_MTIME);
                if (IS_DIRSYNC(dir))
                        (void)fat_sync_inode(dir);
                else
                        mark_inode_dirty(dir);
        }

        kfree(slots);
        return err;
}

/*
 * Search @dir for @qname, using the length returned by
 * vfat_striptail_len().  Returns -ENOENT when that length is zero,
 * otherwise whatever fat_search_long() returns (@sinfo is handed to it).
 */
static int vfat_find(struct inode *dir, const struct qstr *qname,
                     struct fat_slot_info *sinfo)
{
        unsigned int stripped_len;

        stripped_len = vfat_striptail_len(qname);
        if (!stripped_len)
                return -ENOENT;

        return fat_search_long(dir, qname->name, stripped_len, sinfo);
}

static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
                                  unsigned int flags)
{
        struct super_block *sb = dir->i_sb;
        struct fat_slot_info sinfo;
        struct inode *inode;
        struct dentry *alias;
        int err;

        mutex_lock(&MSDOS_SB(sb)->s_lock);

        err = vfat_find(dir, &dentry->d_name, &sinfo);
        if (err) {
                if (err == -ENOENT) {
                        inode = NULL;
                        goto out;
                }
                goto error;
        }

        inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
        brelse(sinfo.bh);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto error;
        }

        alias = d_find_alias(inode);
        /*
         * Checking "alias->d_parent == dentry->d_parent" to make sure
         * FS is not corrupted (especially double linked dir).
         */
        if (alias && alias->d_parent == dentry->d_parent) {
                /*
                 * This inode has non anonymous-DCACHE_DISCONNECTED
                 * dentry. This means, the user did ->lookup() by an
                 * another name (longname vs 8.3 alias of it) in past.
                 *
                 * Switch to new one for reason of locality if possible.
                 */
                if (!S_ISDIR(inode->i_mode))
                        d_move(alias, dentry);
                iput(inode);
                mutex_unlock(&MSDOS_SB(sb)->s_lock);
                return alias;
        } else
                dput(alias);

out:
        mutex_unlock(&MSDOS_SB(sb)->s_lock);
        if (!inode)
                vfat_d_version_set(dentry, inode_query_iversion(dir));
        return d_splice_alias(inode, dentry);
error:
        mutex_unlock(&MSDOS_SB(sb)->s_lock);
        return ERR_PTR(err);
}

/*
 * ->create() for vfat: add a regular-file entry for @dentry in @dir and
 * instantiate the dentry with the new inode.  Runs under s_lock.
 * Returns 0 or a negative errno.
 */
static int vfat_create(struct mnt_idmap *idmap, struct inode *dir,
                       struct dentry *dentry, umode_t mode, bool excl)
{
        struct super_block *sb = dir->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct fat_slot_info sinfo;
        struct timespec64 ts;
        struct inode *inode;
        int err;

        mutex_lock(&sbi->s_lock);

        ts = current_time(dir);
        err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
        if (err)
                goto unlock;
        inode_inc_iversion(dir);

        inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
        brelse(sinfo.bh);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
        } else {
                inode_inc_iversion(inode);
                d_instantiate(dentry, inode);
        }

unlock:
        mutex_unlock(&sbi->s_lock);
        return err;
}

static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        struct super_block *sb = dir->i_sb;
        struct fat_slot_info sinfo;
        int err;

        mutex_lock(&MSDOS_SB(sb)->s_lock);

        err = fat_dir_empty(inode);
        if (err)
                goto out;
        err = vfat_find(dir, &dentry->d_name, &sinfo);
        if (err)
                goto out;

        err = fat_remove_entries(dir, &sinfo);        /* and releases bh */
        if (err)
                goto out;
        drop_nlink(dir);

        clear_nlink(inode);
        fat_truncate_time(inode, NULL, S_ATIME|S_MTIME);
        fat_detach(inode);
        vfat_d_version_set(dentry, inode_query_iversion(dir));
out:
        mutex_unlock(&MSDOS_SB(sb)->s_lock);

        return err;
}

static int vfat_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        struct super_block *sb = dir->i_sb;
        struct fat_slot_info sinfo;
        int err;

        mutex_lock(&MSDOS_SB(sb)->s_lock);

        err = vfat_find(dir, &dentry->d_name, &sinfo);
        if (err)
                goto out;

        err = fat_remove_entries(dir, &sinfo);        /* and releases bh */
        if (err)
                goto out;
        clear_nlink(inode);
        fat_truncate_time(inode, NULL, S_ATIME|S_MTIME);
        fat_detach(inode);
        vfat_d_version_set(dentry, inode_query_iversion(dir));
out:
        mutex_unlock(&MSDOS_SB(sb)->s_lock);

        return err;
}

/*
 * ->mkdir() for vfat.  Under s_lock: allocate the new directory's
 * cluster, add its entry to @dir, then build and instantiate the inode.
 * The cluster is freed again only if adding the entry fails.
 * Returns 0 or a negative errno.
 */
static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                      struct dentry *dentry, umode_t mode)
{
        struct super_block *sb = dir->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct fat_slot_info sinfo;
        struct timespec64 ts;
        struct inode *inode;
        int cluster;
        int err;

        mutex_lock(&sbi->s_lock);

        ts = current_time(dir);
        cluster = fat_alloc_new_dir(dir, &ts);
        if (cluster < 0) {
                err = cluster;
                goto unlock;
        }

        err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo);
        if (err) {
                fat_free_clusters(dir, cluster);
                goto unlock;
        }
        inode_inc_iversion(dir);
        inc_nlink(dir);

        inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
        brelse(sinfo.bh);
        if (IS_ERR(inode)) {
                /* the directory was completed, just return a error */
                err = PTR_ERR(inode);
                goto unlock;
        }
        inode_inc_iversion(inode);
        set_nlink(inode, 2);

        d_instantiate(dentry, inode);
        /* err is still 0 from vfat_add_entry() on this path. */

unlock:
        mutex_unlock(&sbi->s_lock);
        return err;
}

/*
 * Fetch the ".." entry of @inode into *@bh / *@de when @inode is a
 * directory.  Non-directories are left untouched and report success.
 * Returns 0, or -EIO if fat_get_dotdot_entry() fails.
 */
static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh,
                              struct msdos_dir_entry **de)
{
        if (!S_ISDIR(inode->i_mode))
                return 0;

        return fat_get_dotdot_entry(inode, bh, de) ? -EIO : 0;
}

/*
 * Push @inode out after its position changed: synchronously via
 * fat_sync_inode() when @dir is DIRSYNC, otherwise just mark it dirty.
 * Returns fat_sync_inode()'s result, or 0 in the deferred case.
 */
static int vfat_sync_ipos(struct inode *dir, struct inode *inode)
{
        if (!IS_DIRSYNC(dir)) {
                mark_inode_dirty(inode);
                return 0;
        }

        return fat_sync_inode(inode);
}

/*
 * Point the ".." entry @dotdot_de (held in @dotdot_bh, belonging to
 * @inode) at @dir's i_logstart and mark the buffer dirty.  The buffer is
 * written synchronously only when @dir is DIRSYNC.
 * Returns sync_dirty_buffer()'s result in that case, else 0.
 */
static int vfat_update_dotdot_de(struct inode *dir, struct inode *inode,
                                 struct buffer_head *dotdot_bh,
                                 struct msdos_dir_entry *dotdot_de)
{
        int ret = 0;

        fat_set_start(dotdot_de, MSDOS_I(dir)->i_logstart);
        mark_buffer_dirty_inode(dotdot_bh, inode);

        if (IS_DIRSYNC(dir))
                ret = sync_dirty_buffer(dotdot_bh);

        return ret;
}

/*
 * Record a modification of @dir: bump i_version, set ctime/mtime from
 * @ts, then sync the inode (DIRSYNC) or mark it dirty.  A sync failure
 * is deliberately ignored.
 */
static void vfat_update_dir_metadata(struct inode *dir, struct timespec64 *ts)
{
        inode_inc_iversion(dir);
        fat_truncate_time(dir, ts, S_CTIME | S_MTIME);

        if (!IS_DIRSYNC(dir)) {
                mark_inode_dirty(dir);
                return;
        }

        (void)fat_sync_inode(dir);
}

/*
 * Plain (non-exchange) rename of @old_dentry in @old_dir to @new_dentry in
 * @new_dir.  Runs under the superblock's s_lock.
 *
 * If the target exists its directory slot is reused for the moved inode;
 * otherwise a fresh entry is added to @new_dir.  For a cross-directory
 * move of a directory the ".." entry is repointed at @new_dir and the
 * parents' link counts are adjusted.  Finally the old entry is removed.
 *
 * On failure after the inode was re-attached, the error labels undo the
 * in-memory changes; if that undo itself fails the filesystem is reported
 * as corrupted via fat_fs_error().
 *
 * Returns 0 on success or a negative errno.
 */
static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
                       struct inode *new_dir, struct dentry *new_dentry)
{
        struct buffer_head *dotdot_bh;
        struct msdos_dir_entry *dotdot_de = NULL;
        struct inode *old_inode, *new_inode;
        struct fat_slot_info old_sinfo, sinfo;
        struct timespec64 ts;
        loff_t new_i_pos;
        int err, is_dir, corrupt = 0;
        struct super_block *sb = old_dir->i_sb;

        /* NULL the buffer heads so the common brelse() calls at out: are safe. */
        old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
        old_inode = d_inode(old_dentry);
        new_inode = d_inode(new_dentry);
        mutex_lock(&MSDOS_SB(sb)->s_lock);
        err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
        if (err)
                goto out;

        /* dotdot_de stays NULL unless a directory changes parent. */
        if (old_dir != new_dir) {
                err = vfat_get_dotdot_de(old_inode, &dotdot_bh, &dotdot_de);
                if (err)
                        goto out;
        }

        is_dir = S_ISDIR(old_inode->i_mode);
        ts = current_time(old_dir);
        if (new_inode) {
                if (is_dir) {
                        err = fat_dir_empty(new_inode);
                        if (err)
                                goto out;
                }
                /* Take over the existing target's slot. */
                new_i_pos = MSDOS_I(new_inode)->i_pos;
                fat_detach(new_inode);
        } else {
                err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0,
                                     &ts, &sinfo);
                if (err)
                        goto out;
                new_i_pos = sinfo.i_pos;
        }
        inode_inc_iversion(new_dir);

        fat_detach(old_inode);
        fat_attach(old_inode, new_i_pos);
        err = vfat_sync_ipos(new_dir, old_inode);
        if (err)
                goto error_inode;

        if (dotdot_de) {
                err = vfat_update_dotdot_de(new_dir, old_inode, dotdot_bh,
                                            dotdot_de);
                if (err)
                        goto error_dotdot;
                drop_nlink(old_dir);
                if (!new_inode)
                         inc_nlink(new_dir);
        }

        err = fat_remove_entries(old_dir, &old_sinfo);        /* and releases bh */
        old_sinfo.bh = NULL;
        if (err)
                goto error_dotdot;
        vfat_update_dir_metadata(old_dir, &ts);

        if (new_inode) {
                /* The replaced target loses its link(s): two for a directory. */
                drop_nlink(new_inode);
                if (is_dir)
                        drop_nlink(new_inode);
                fat_truncate_time(new_inode, &ts, S_CTIME);
        }
out:
        brelse(sinfo.bh);
        brelse(dotdot_bh);
        brelse(old_sinfo.bh);
        mutex_unlock(&MSDOS_SB(sb)->s_lock);

        return err;

error_dotdot:
        /* data cluster is shared, serious corruption */
        corrupt = 1;

        if (dotdot_de) {
                corrupt |= vfat_update_dotdot_de(old_dir, old_inode, dotdot_bh,
                                                 dotdot_de);
        }
error_inode:
        /* Put old_inode back at its original position. */
        fat_detach(old_inode);
        fat_attach(old_inode, old_sinfo.i_pos);
        if (new_inode) {
                fat_attach(new_inode, new_i_pos);
                if (corrupt)
                        corrupt |= fat_sync_inode(new_inode);
        } else {
                /*
                 * If new entry was not sharing the data cluster, it
                 * shouldn't be serious corruption.
                 */
                int err2 = fat_remove_entries(new_dir, &sinfo);
                if (corrupt)
                        corrupt |= err2;
                sinfo.bh = NULL;
        }
        if (corrupt < 0) {
                /*
                 * Report new_i_pos rather than sinfo.i_pos: sinfo is only
                 * filled in when a new entry was added, so sinfo.i_pos is
                 * uninitialized when an existing target was replaced.
                 * new_i_pos is set on both paths and equals sinfo.i_pos in
                 * the new-entry case.
                 */
                fat_fs_error(new_dir->i_sb,
                             "%s: Filesystem corrupted (i_pos %lld)",
                             __func__, new_i_pos);
        }
        goto out;
}

/*
 * Swap the positions of two inodes: both are detached first, then
 * @old_inode is attached at @new_i_pos and @new_inode at @old_i_pos.
 * Calling it again with the two position arguments swapped undoes the
 * exchange (as vfat_rename_exchange() does on error).
 */
static void vfat_exchange_ipos(struct inode *old_inode, struct inode *new_inode,
                               loff_t old_i_pos, loff_t new_i_pos)
{
        fat_detach(old_inode);
        fat_detach(new_inode);
        fat_attach(old_inode, new_i_pos);
        fat_attach(new_inode, old_i_pos);
}

/* Transfer one link count from @src to @dst. */
static void vfat_move_nlink(struct inode *src, struct inode *dst)
{
        drop_nlink(src);
        inc_nlink(dst);
}

/*
 * RENAME_EXCHANGE implementation: swap the directory positions of the
 * inodes behind @old_dentry and @new_dentry.  Runs under s_lock.
 *
 * For a cross-directory exchange, the ".." entry of each inode that is a
 * directory is repointed at its new parent, and the parents' link counts
 * are adjusted when exactly one of the two is a directory.
 *
 * The error labels undo the steps in reverse order; if an undo step
 * itself returns an error, the filesystem is reported as corrupted.
 *
 * Returns 0 on success or a negative errno.
 */
static int vfat_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
                                struct inode *new_dir, struct dentry *new_dentry)
{
        struct buffer_head *old_dotdot_bh = NULL, *new_dotdot_bh = NULL;
        struct msdos_dir_entry *old_dotdot_de = NULL, *new_dotdot_de = NULL;
        struct inode *old_inode, *new_inode;
        struct timespec64 ts = current_time(old_dir);
        loff_t old_i_pos, new_i_pos;
        int err, corrupt = 0;
        struct super_block *sb = old_dir->i_sb;

        old_inode = d_inode(old_dentry);
        new_inode = d_inode(new_dentry);

        /* Acquire super block lock for the operation to be atomic */
        mutex_lock(&MSDOS_SB(sb)->s_lock);

        /* if directories are not the same, get ".." info to update */
        if (old_dir != new_dir) {
                err = vfat_get_dotdot_de(old_inode, &old_dotdot_bh,
                                         &old_dotdot_de);
                if (err)
                        goto out;

                err = vfat_get_dotdot_de(new_inode, &new_dotdot_bh,
                                         &new_dotdot_de);
                if (err)
                        goto out;
        }

        old_i_pos = MSDOS_I(old_inode)->i_pos;
        new_i_pos = MSDOS_I(new_inode)->i_pos;

        vfat_exchange_ipos(old_inode, new_inode, old_i_pos, new_i_pos);

        /* After the swap, new_inode lives in old_dir and vice versa. */
        err = vfat_sync_ipos(old_dir, new_inode);
        if (err)
                goto error_exchange;
        err = vfat_sync_ipos(new_dir, old_inode);
        if (err)
                goto error_exchange;

        /* update ".." directory entry info */
        if (old_dotdot_de) {
                err = vfat_update_dotdot_de(new_dir, old_inode, old_dotdot_bh,
                                            old_dotdot_de);
                if (err)
                        goto error_old_dotdot;
        }
        if (new_dotdot_de) {
                err = vfat_update_dotdot_de(old_dir, new_inode, new_dotdot_bh,
                                            new_dotdot_de);
                if (err)
                        goto error_new_dotdot;
        }

        /* if cross directory and only one is a directory, adjust nlink */
        if (!old_dotdot_de != !new_dotdot_de) {
                if (old_dotdot_de)
                        vfat_move_nlink(old_dir, new_dir);
                else
                        vfat_move_nlink(new_dir, old_dir);
        }

        vfat_update_dir_metadata(old_dir, &ts);
        /* if directories are not the same, update new_dir as well */
        if (old_dir != new_dir)
                vfat_update_dir_metadata(new_dir, &ts);

out:
        brelse(old_dotdot_bh);
        brelse(new_dotdot_bh);
        mutex_unlock(&MSDOS_SB(sb)->s_lock);

        return err;

        /* Undo: point new_inode's ".." back at its original parent. */
error_new_dotdot:
        if (new_dotdot_de) {
                corrupt |= vfat_update_dotdot_de(new_dir, new_inode,
                                                 new_dotdot_bh, new_dotdot_de);
        }

        /* Undo: point old_inode's ".." back at its original parent. */
error_old_dotdot:
        if (old_dotdot_de) {
                corrupt |= vfat_update_dotdot_de(old_dir, old_inode,
                                                 old_dotdot_bh, old_dotdot_de);
        }

        /* Undo the position swap (arguments reversed) and push it out. */
error_exchange:
        vfat_exchange_ipos(old_inode, new_inode, new_i_pos, old_i_pos);
        corrupt |= vfat_sync_ipos(new_dir, new_inode);
        corrupt |= vfat_sync_ipos(old_dir, old_inode);

        /* Any negative errno OR-ed in above leaves corrupt negative. */
        if (corrupt < 0) {
                fat_fs_error(new_dir->i_sb,
                             "%s: Filesystem corrupted (i_pos %lld, %lld)",
                             __func__, old_i_pos, new_i_pos);
        }
        goto out;
}

/*
 * ->rename() entry point.  Only RENAME_NOREPLACE and RENAME_EXCHANGE are
 * accepted; any other flag yields -EINVAL.  RENAME_EXCHANGE is routed to
 * vfat_rename_exchange(), everything else to vfat_rename().
 */
static int vfat_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
                        struct dentry *old_dentry, struct inode *new_dir,
                        struct dentry *new_dentry, unsigned int flags)
{
        const unsigned int supported = RENAME_NOREPLACE | RENAME_EXCHANGE;

        if (flags & ~supported)
                return -EINVAL;

        /* VFS already handled RENAME_NOREPLACE, handle it as a normal rename */
        if (!(flags & RENAME_EXCHANGE))
                return vfat_rename(old_dir, old_dentry, new_dir, new_dentry);

        return vfat_rename_exchange(old_dir, old_dentry,
                                    new_dir, new_dentry);
}

/*
 * Inode operations for vfat directories; setup() stores this table in the
 * superblock info's dir_ops.  Name-based operations use the vfat_*
 * handlers defined in this file, attribute/time handling uses the shared
 * fat_* helpers.
 */
static const struct inode_operations vfat_dir_inode_operations = {
        .create                = vfat_create,
        .lookup                = vfat_lookup,
        .unlink                = vfat_unlink,
        .mkdir                = vfat_mkdir,
        .rmdir                = vfat_rmdir,
        .rename                = vfat_rename2,
        .setattr        = fat_setattr,
        .getattr        = fat_getattr,
        .update_time        = fat_update_time,
};

static void setup(struct super_block *sb)
{
        MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations;
        if (MSDOS_SB(sb)->options.name_check != 's')
                sb->s_d_op = &vfat_ci_dentry_ops;
        else
                sb->s_d_op = &vfat_dentry_ops;
}

/*
 * Superblock fill callback used by vfat_mount(): defers to the common
 * fat_fill_super(), passing setup() as the per-variant hook.
 * NOTE(review): the literal 1 presumably selects the vfat variant -
 * confirm against fat_fill_super()'s prototype.
 */
static int vfat_fill_super(struct super_block *sb, void *data, int silent)
{
        return fat_fill_super(sb, data, silent, 1, setup);
}

/* ->mount() for vfat: mount via mount_bdev() using vfat_fill_super(). */
static struct dentry *vfat_mount(struct file_system_type *fs_type,
                       int flags, const char *dev_name,
                       void *data)
{
        return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
}

/*
 * Filesystem type descriptor registered by init_vfat_fs() under the name
 * "vfat"; MODULE_ALIAS_FS() declares the matching module alias.
 */
static struct file_system_type vfat_fs_type = {
        .owner                = THIS_MODULE,
        .name                = "vfat",
        .mount                = vfat_mount,
        .kill_sb        = kill_block_super,
        .fs_flags        = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("vfat");

/* Module init: register the vfat filesystem type; returns its result. */
static int __init init_vfat_fs(void)
{
        return register_filesystem(&vfat_fs_type);
}

/* Module exit: unregister the vfat filesystem type. */
static void __exit exit_vfat_fs(void)
{
        unregister_filesystem(&vfat_fs_type);
}

/* Module metadata and the init/exit hooks defined above. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("VFAT filesystem support");
MODULE_AUTHOR("Gordon Chaffee");

module_init(init_vfat_fs)
module_exit(exit_vfat_fs)



















































    3 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_USER_H
#define _LINUX_SCHED_USER_H

#include <linux/uidgid.h>
#include <linux/atomic.h>
#include <linux/percpu_counter.h>
#include <linux/refcount.h>
#include <linux/ratelimit.h>

/*
 * Some day this will be a full-fledged user tracking system..
 *
 * Per-user record, keyed by uid and reference counted through __count
 * (see get_uid()).  Several counters exist only when the corresponding
 * kernel option is configured.
 */
struct user_struct {
        refcount_t __count;        /* reference count */
#ifdef CONFIG_EPOLL
        struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */
#endif
        unsigned long unix_inflight;        /* How many files in flight in unix sockets */
        atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */

        /* Hash table maintenance information */
        struct hlist_node uidhash_node;
        kuid_t uid;

        /* locked_vm is present only if one of the listed options is enabled */
#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
        defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \
        defined(CONFIG_VFIO_PCI_ZDEV_KVM) || IS_ENABLED(CONFIG_IOMMUFD)
        atomic_long_t locked_vm;
#endif
#ifdef CONFIG_WATCH_QUEUE
        atomic_t nr_watches;        /* The number of watches this user currently has */
#endif

        /* Miscellaneous per-user rate limit */
        struct ratelimit_state ratelimit;
};

extern int uids_sysfs_init(void);

extern struct user_struct *find_user(kuid_t);

/* INIT_USER is shorthand for the address of the root_user record. */
extern struct user_struct root_user;
#define INIT_USER (&root_user)


/* per-UID process charging. */
extern struct user_struct * alloc_uid(kuid_t);
/*
 * Take an additional reference on @u (increments u->__count) and return
 * @u so the call can be used inline in an expression.
 */
static inline struct user_struct *get_uid(struct user_struct *u)
{
        refcount_inc(&u->__count);
        return u;
}
extern void free_uid(struct user_struct *);

#endif /* _LINUX_SCHED_USER_H */







































































































































































































































































































































































    7 





    7 






    2 
    5 






    6 





    7 






    4 
    6 








































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
// SPDX-License-Identifier: GPL-2.0-only
/*
 * lib/bitmap.c
 * Helper functions for bitmap.h.
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/ctype.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/slab.h>

/**
 * DOC: bitmap introduction
 *
 * bitmaps provide an array of bits, implemented using an
 * array of unsigned longs.  The number of valid bits in a
 * given bitmap does _not_ need to be an exact multiple of
 * BITS_PER_LONG.
 *
 * The possible unused bits in the last, partially used word
 * of a bitmap are 'don't care'.  The implementation makes
 * no particular effort to keep them zero.  It ensures that
 * their value will not affect the results of any operation.
 * The bitmap operations that return Boolean (bitmap_empty,
 * for example) or scalar (bitmap_weight, for example) results
 * carefully filter out these unused bits from impacting their
 * results.
 *
 * The byte ordering of bitmaps is more natural on little
 * endian architectures.  See the big-endian headers
 * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h
 * for the best explanations of this ordering.
 */

/**
 * __bitmap_equal - test whether two bitmaps hold the same bits
 * @bitmap1: first bitmap
 * @bitmap2: second bitmap
 * @bits: number of valid bits in each bitmap
 *
 * Whole words are compared directly.  A trailing partial word is compared
 * only under BITMAP_LAST_WORD_MASK(@bits), so bits past @bits are ignored.
 *
 * Return: true if the first @bits bits match, false otherwise.
 */
bool __bitmap_equal(const unsigned long *bitmap1,
                    const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx = 0;

        while (idx < full_words) {
                if (bitmap1[idx] != bitmap2[idx])
                        return false;
                idx++;
        }

        /* No partial tail word: everything already matched. */
        if (bits % BITS_PER_LONG == 0)
                return true;

        return !((bitmap1[idx] ^ bitmap2[idx]) & BITMAP_LAST_WORD_MASK(bits));
}
EXPORT_SYMBOL(__bitmap_equal);

/**
 * __bitmap_or_equal - test whether (@bitmap1 | @bitmap2) equals @bitmap3
 * @bitmap1: first OR operand
 * @bitmap2: second OR operand
 * @bitmap3: bitmap the OR result is compared against
 * @bits: number of valid bits in each bitmap
 *
 * A trailing partial word is compared only under
 * BITMAP_LAST_WORD_MASK(@bits).
 *
 * Return: true if the OR of the first two bitmaps matches the third over
 * the first @bits bits, false otherwise.
 */
bool __bitmap_or_equal(const unsigned long *bitmap1,
                       const unsigned long *bitmap2,
                       const unsigned long *bitmap3,
                       unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long diff;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                diff = (bitmap1[idx] | bitmap2[idx]) ^ bitmap3[idx];
                if (diff)
                        return false;
        }

        if (bits % BITS_PER_LONG) {
                diff = (bitmap1[idx] | bitmap2[idx]) ^ bitmap3[idx];
                if (diff & BITMAP_LAST_WORD_MASK(bits))
                        return false;
        }

        return true;
}

/**
 * __bitmap_complement - store the bitwise NOT of a bitmap
 * @dst: destination bitmap
 * @src: source bitmap
 * @bits: number of bits in each bitmap
 *
 * All BITS_TO_LONGS(@bits) words are inverted; bits past @bits in the
 * last word are inverted too and not masked.
 */
void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits)
{
        const unsigned int words = BITS_TO_LONGS(bits);
        unsigned int idx = 0;

        while (idx < words) {
                dst[idx] = ~src[idx];
                idx++;
        }
}
EXPORT_SYMBOL(__bitmap_complement);

/**
 * __bitmap_shift_right - logical right shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * Shifting right (dividing) means moving bits in the MS -> LS bit
 * direction.  Zeros are fed into the vacated MS positions and the
 * LS bits shifted off the bottom are lost.
 */
void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                        unsigned shift, unsigned nbits)
{
        unsigned k, lim = BITS_TO_LONGS(nbits);
        /* off: whole words to shift by; rem: remaining shift within a word. */
        unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
        /* Applied to the last source word so bits past nbits never enter dst. */
        unsigned long mask = BITMAP_LAST_WORD_MASK(nbits);
        /* dst[k] is assembled from src[off + k] and, if rem != 0, src[off + k + 1]. */
        for (k = 0; off + k < lim; ++k) {
                unsigned long upper, lower;

                /*
                 * If shift is not word aligned, take lower rem bits of
                 * word above and make them the top rem bits of result.
                 */
                if (!rem || off + k + 1 >= lim)
                        upper = 0;
                else {
                        upper = src[off + k + 1];
                        if (off + k + 1 == lim - 1)
                                upper &= mask;
                        upper <<= (BITS_PER_LONG - rem);
                }
                lower = src[off + k];
                if (off + k == lim - 1)
                        lower &= mask;
                lower >>= rem;
                dst[k] = lower | upper;
        }
        /*
         * Zero the top 'off' words vacated by the shift.
         * NOTE(review): lim - off is unsigned arithmetic; this presumably
         * relies on callers keeping shift within nbits - confirm.
         */
        if (off)
                memset(&dst[lim - off], 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_right);


/**
 * __bitmap_shift_left - logical left shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * Shifting left (multiplying) means moving bits in the LS -> MS
 * direction.  Zeros are fed into the vacated LS bit positions
 * and those MS bits shifted off the top are lost.
 */

void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                        unsigned int shift, unsigned int nbits)
{
        /* Signed so that the descending loop below can stop once k drops below 0. */
        int k;
        unsigned int lim = BITS_TO_LONGS(nbits);
        /* off: whole words to shift by; rem: remaining shift within a word. */
        unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
        /* Walk from the highest word down; dst[k + off] is built from src[k] and src[k - 1]. */
        for (k = lim - off - 1; k >= 0; --k) {
                unsigned long upper, lower;

                /*
                 * If shift is not word aligned, take upper rem bits of
                 * word below and make them the bottom rem bits of result.
                 */
                if (rem && k > 0)
                        lower = src[k - 1] >> (BITS_PER_LONG - rem);
                else
                        lower = 0;
                upper = src[k] << rem;
                dst[k + off] = lower | upper;
        }
        /* Zero the bottom 'off' words vacated by the shift. */
        if (off)
                memset(dst, 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_left);

/**
 * bitmap_cut() - remove bit region from bitmap and right shift remaining bits
 * @dst: destination bitmap, might overlap with src
 * @src: source bitmap
 * @first: start bit of region to be removed
 * @cut: number of bits to remove
 * @nbits: bitmap size, in bits
 *
 * Set the n-th bit of @dst iff the n-th bit of @src is set and
 * n is less than @first, or the m-th bit of @src is set for any
 * m such that @first <= n < nbits, and m = n + @cut.
 *
 * In pictures, example for a big-endian 32-bit architecture:
 *
 * The @src bitmap is::
 *
 *   31                                   63
 *   |                                    |
 *   10000000 11000001 11110010 00010101  10000000 11000001 01110010 00010101
 *                   |  |              |                                    |
 *                  16  14             0                                   32
 *
 * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is::
 *
 *   31                                   63
 *   |                                    |
 *   10110000 00011000 00110010 00010101  00010000 00011000 00101110 01000010
 *                      |              |                                    |
 *                      14 (bit 17     0                                   32
 *                          from @src)
 *
 * Note that @dst and @src might overlap partially or entirely.
 *
 * This is implemented in the obvious way, with a shift and carry
 * step for each moved bit. Optimisation is left as an exercise
 * for the compiler.
 */
void bitmap_cut(unsigned long *dst, const unsigned long *src,
                unsigned int first, unsigned int cut, unsigned int nbits)
{
        unsigned int len = BITS_TO_LONGS(nbits);
        unsigned long keep = 0, carry;
        int i;

        /*
         * When @first is not word aligned, remember the bits below @first
         * in the word that contains it; the shifting below disturbs them.
         */
        if (first % BITS_PER_LONG) {
                keep = src[first / BITS_PER_LONG] &
                       (~0UL >> (BITS_PER_LONG - first % BITS_PER_LONG));
        }

        /* memmove() rather than memcpy(): @dst and @src may overlap. */
        memmove(dst, src, len * sizeof(*dst));

        /*
         * One pass per removed bit: shift every word from the one holding
         * @first upwards right by one, pulling bit 0 of the next word into
         * the top bit. The last word receives a zero.
         */
        while (cut--) {
                for (i = first / BITS_PER_LONG; i < len; i++) {
                        if (i < len - 1)
                                carry = dst[i + 1] & 1UL;
                        else
                                carry = 0;

                        dst[i] = (dst[i] >> 1) | (carry << (BITS_PER_LONG - 1));
                }
        }

        /* Restore the saved bits below @first in the first affected word. */
        dst[first / BITS_PER_LONG] &= ~0UL << (first % BITS_PER_LONG);
        dst[first / BITS_PER_LONG] |= keep;
}
EXPORT_SYMBOL(bitmap_cut);

/**
 * __bitmap_and - store the bitwise AND of two bitmaps
 * @dst: destination bitmap
 * @bitmap1: first operand
 * @bitmap2: second operand
 * @bits: number of valid bits in each bitmap
 *
 * A trailing partial word is additionally masked with
 * BITMAP_LAST_WORD_MASK(@bits) before it is stored.
 *
 * Return: true if any bit of the stored result is set, false otherwise.
 */
bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long seen = 0;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                unsigned long word = bitmap1[idx] & bitmap2[idx];

                dst[idx] = word;
                seen |= word;
        }

        if (bits % BITS_PER_LONG) {
                unsigned long word = bitmap1[idx] & bitmap2[idx] &
                                     BITMAP_LAST_WORD_MASK(bits);

                dst[idx] = word;
                seen |= word;
        }

        return seen != 0;
}
EXPORT_SYMBOL(__bitmap_and);

/**
 * __bitmap_or - store the bitwise OR of two bitmaps
 * @dst: destination bitmap
 * @bitmap1: first operand
 * @bitmap2: second operand
 * @bits: number of bits in each bitmap
 *
 * All BITS_TO_LONGS(@bits) words are processed; the last word is not
 * masked.
 */
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int words = BITS_TO_LONGS(bits);
        unsigned int idx = 0;

        while (idx < words) {
                dst[idx] = bitmap1[idx] | bitmap2[idx];
                idx++;
        }
}
EXPORT_SYMBOL(__bitmap_or);

/**
 * __bitmap_xor - store the bitwise XOR of two bitmaps
 * @dst: destination bitmap
 * @bitmap1: first operand
 * @bitmap2: second operand
 * @bits: number of bits in each bitmap
 *
 * All BITS_TO_LONGS(@bits) words are processed; the last word is not
 * masked.
 */
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int words = BITS_TO_LONGS(bits);
        unsigned int idx = 0;

        while (idx < words) {
                dst[idx] = bitmap1[idx] ^ bitmap2[idx];
                idx++;
        }
}
EXPORT_SYMBOL(__bitmap_xor);

/**
 * __bitmap_andnot - store @bitmap1 with the bits of @bitmap2 cleared
 * @dst: destination bitmap
 * @bitmap1: bitmap whose bits are kept
 * @bitmap2: bitmap whose set bits are removed from the result
 * @bits: number of valid bits in each bitmap
 *
 * A trailing partial word is additionally masked with
 * BITMAP_LAST_WORD_MASK(@bits) before it is stored.
 *
 * Return: true if any bit of the stored result is set, false otherwise.
 */
bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long seen = 0;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                unsigned long word = bitmap1[idx] & ~bitmap2[idx];

                dst[idx] = word;
                seen |= word;
        }

        if (bits % BITS_PER_LONG) {
                unsigned long word = bitmap1[idx] & ~bitmap2[idx] &
                                     BITMAP_LAST_WORD_MASK(bits);

                dst[idx] = word;
                seen |= word;
        }

        return seen != 0;
}
EXPORT_SYMBOL(__bitmap_andnot);

/**
 * __bitmap_replace - merge two bitmaps under control of a mask
 * @dst: destination bitmap
 * @old: supplies the bits where @mask is clear
 * @new: supplies the bits where @mask is set
 * @mask: selects, bit by bit, between @old and @new
 * @nbits: number of bits in each bitmap
 *
 * All BITS_TO_LONGS(@nbits) words are processed; the last word is not
 * masked.
 */
void __bitmap_replace(unsigned long *dst,
                      const unsigned long *old, const unsigned long *new,
                      const unsigned long *mask, unsigned int nbits)
{
        const unsigned int words = BITS_TO_LONGS(nbits);
        unsigned int idx;

        for (idx = 0; idx < words; idx++) {
                const unsigned long sel = mask[idx];

                dst[idx] = (old[idx] & ~sel) | (new[idx] & sel);
        }
}
EXPORT_SYMBOL(__bitmap_replace);

/**
 * __bitmap_intersects - test whether two bitmaps share a set bit
 * @bitmap1: first bitmap
 * @bitmap2: second bitmap
 * @bits: number of valid bits in each bitmap
 *
 * A trailing partial word is checked only under
 * BITMAP_LAST_WORD_MASK(@bits).
 *
 * Return: true if some bit below @bits is set in both bitmaps.
 */
bool __bitmap_intersects(const unsigned long *bitmap1,
                         const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx = 0;

        while (idx < full_words) {
                if (bitmap1[idx] & bitmap2[idx])
                        return true;
                idx++;
        }

        /* No partial tail word: nothing in common was found. */
        if (bits % BITS_PER_LONG == 0)
                return false;

        return ((bitmap1[idx] & bitmap2[idx]) & BITMAP_LAST_WORD_MASK(bits)) != 0;
}
EXPORT_SYMBOL(__bitmap_intersects);

/**
 * __bitmap_subset - test whether one bitmap is contained in another
 * @bitmap1: candidate subset
 * @bitmap2: candidate superset
 * @bits: number of valid bits in each bitmap
 *
 * A trailing partial word is checked only under
 * BITMAP_LAST_WORD_MASK(@bits).
 *
 * Return: true if every bit below @bits that is set in @bitmap1 is also
 * set in @bitmap2.
 */
bool __bitmap_subset(const unsigned long *bitmap1,
                     const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx = 0;

        while (idx < full_words) {
                if (bitmap1[idx] & ~bitmap2[idx])
                        return false;
                idx++;
        }

        /* No partial tail word: no stray bit was found. */
        if (bits % BITS_PER_LONG == 0)
                return true;

        return !((bitmap1[idx] & ~bitmap2[idx]) & BITMAP_LAST_WORD_MASK(bits));
}
EXPORT_SYMBOL(__bitmap_subset);

/*
 * BITMAP_WEIGHT - sum hweight_long() over the words of a bitmap expression.
 *
 * FETCH is an expression yielding one word; it is re-evaluated for every
 * word and is expected to index with the macro's local variable 'idx'
 * (the callers below pass e.g. bitmap[idx]).  Whole words are counted
 * as-is; a trailing partial word is masked with BITMAP_LAST_WORD_MASK()
 * first.  The statement expression evaluates to the accumulated count.
 */
#define BITMAP_WEIGHT(FETCH, bits)        \
({                                                                                \
        unsigned int __bits = (bits), idx, w = 0;                                \
                                                                                \
        for (idx = 0; idx < __bits / BITS_PER_LONG; idx++)                        \
                w += hweight_long(FETCH);                                        \
                                                                                \
        if (__bits % BITS_PER_LONG)                                                \
                w += hweight_long((FETCH) & BITMAP_LAST_WORD_MASK(__bits));        \
                                                                                \
        w;                                                                        \
})

/**
 * __bitmap_weight - count the set bits in a bitmap
 * @bitmap: bitmap to examine
 * @bits: number of valid bits in @bitmap
 *
 * Return: number of set bits among the first @bits bits.
 */
unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
{
        /* 'idx' is the index variable declared inside BITMAP_WEIGHT(). */
        return BITMAP_WEIGHT(bitmap[idx], bits);
}
EXPORT_SYMBOL(__bitmap_weight);

/**
 * __bitmap_weight_and - count the bits set in both of two bitmaps
 * @bitmap1: first bitmap
 * @bitmap2: second bitmap
 * @bits: number of valid bits in each bitmap
 *
 * Return: number of positions below @bits set in both bitmaps.
 */
unsigned int __bitmap_weight_and(const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        /* 'idx' is the index variable declared inside BITMAP_WEIGHT(). */
        return BITMAP_WEIGHT(bitmap1[idx] & bitmap2[idx], bits);
}
EXPORT_SYMBOL(__bitmap_weight_and);

/**
 * __bitmap_weight_andnot - count the bits set in one bitmap but not another
 * @bitmap1: bitmap whose set bits are counted
 * @bitmap2: bitmap whose set bits are excluded from the count
 * @bits: number of valid bits in each bitmap
 *
 * Return: number of positions below @bits set in @bitmap1 and clear in
 * @bitmap2.
 */
unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        /* 'idx' is the index variable declared inside BITMAP_WEIGHT(). */
        return BITMAP_WEIGHT(bitmap1[idx] & ~bitmap2[idx], bits);
}
EXPORT_SYMBOL(__bitmap_weight_andnot);

/**
 * __bitmap_set - set a run of consecutive bits
 * @map: bitmap to modify
 * @start: index of the first bit to set
 * @len: number of bits to set
 */
void __bitmap_set(unsigned long *map, unsigned int start, int len)
{
        /* Word that contains bit @start. */
        unsigned long *p = map + BIT_WORD(start);
        /* One past the last bit of the run; used to mask the final word. */
        const unsigned int size = start + len;
        /* Number of bits from @start to the end of its word. */
        int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
        unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);

        /*
         * While the remaining run reaches the end of the current word,
         * apply the current mask and move on; every word after the first
         * is covered in full.
         */
        while (len - bits_to_set >= 0) {
                *p |= mask_to_set;
                len -= bits_to_set;
                bits_to_set = BITS_PER_LONG;
                mask_to_set = ~0UL;
                p++;
        }
        /* Leftover bits end inside the current word: trim the mask at 'size'. */
        if (len) {
                mask_to_set &= BITMAP_LAST_WORD_MASK(size);
                *p |= mask_to_set;
        }
}
EXPORT_SYMBOL(__bitmap_set);

/**
 * __bitmap_clear - clear a run of consecutive bits
 * @map: bitmap to modify
 * @start: index of the first bit to clear
 * @len: number of bits to clear
 */
void __bitmap_clear(unsigned long *map, unsigned int start, int len)
{
        /* Word that contains bit @start. */
        unsigned long *p = map + BIT_WORD(start);
        /* One past the last bit of the run; used to mask the final word. */
        const unsigned int size = start + len;
        /* Number of bits from @start to the end of its word. */
        int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
        unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);

        /*
         * While the remaining run reaches the end of the current word,
         * clear under the current mask and move on; every word after the
         * first is covered in full.
         */
        while (len - bits_to_clear >= 0) {
                *p &= ~mask_to_clear;
                len -= bits_to_clear;
                bits_to_clear = BITS_PER_LONG;
                mask_to_clear = ~0UL;
                p++;
        }
        /* Leftover bits end inside the current word: trim the mask at 'size'. */
        if (len) {
                mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
                *p &= ~mask_to_clear;
        }
}
EXPORT_SYMBOL(__bitmap_clear);

/**
 * bitmap_find_next_zero_area_off - find a contiguous aligned zero area
 * @map: The address to base the search on
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 * @nr: The number of zeroed bits we're looking for
 * @align_mask: Alignment mask for zero area
 * @align_offset: Alignment offset for zero area.
 *
 * The @align_mask should be one less than a power of 2; the effect is that
 * the bit offset of all zero areas this function finds plus @align_offset
 * is multiple of that power of 2.
 */
unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
                                             unsigned long size,
                                             unsigned long start,
                                             unsigned int nr,
                                             unsigned long align_mask,
                                             unsigned long align_offset)
{
        unsigned long candidate, area_end, blocker;

        for (;;) {
                candidate = find_next_zero_bit(map, size, start);

                /* Round the candidate up so that it honours the alignment. */
                candidate = __ALIGN_MASK(candidate + align_offset, align_mask) - align_offset;

                area_end = candidate + nr;
                /* The area would run past the bitmap: report a value above @size. */
                if (area_end > size)
                        return area_end;

                /* Is there a set bit inside [candidate, area_end)? */
                blocker = find_next_bit(map, area_end, candidate);
                if (blocker >= area_end)
                        return candidate;

                /* Yes: resume the search just after the bit that got in the way. */
                start = blocker + 1;
        }
}
EXPORT_SYMBOL(bitmap_find_next_zero_area_off);

/**
 * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
 *        @buf: pointer to a bitmap
 *        @pos: a bit position in @buf (0 <= @pos < @nbits)
 *        @nbits: number of valid bit positions in @buf
 *
 * Map the bit at position @pos in @buf (of length @nbits) to the
 * ordinal of which set bit it is.  If it is not set or if @pos
 * is not a valid bit position, map to -1.
 *
 * If for example, just bits 4 through 7 are set in @buf, then @pos
 * values 4 through 7 will get mapped to 0 through 3, respectively,
 * and other @pos values will get mapped to -1.  When @pos value 7
 * gets mapped to (returns) @ord value 3 in this example, that means
 * that bit 7 is the 3rd (starting with 0th) set bit in @buf.
 *
 * The bit positions 0 through @nbits - 1 are valid positions in @buf.
 */
static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits)
{
        /* Positions outside the bitmap have no ordinal. */
        if (pos >= nbits)
                return -1;

        /* Neither do positions whose bit is clear. */
        if (!test_bit(pos, buf))
                return -1;

        /* The ordinal is the number of set bits strictly below @pos. */
        return bitmap_weight(buf, pos);
}

/**
 * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap
 *        @dst: remapped result
 *        @src: subset to be remapped
 *        @old: defines domain of map
 *        @new: defines range of map
 *        @nbits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * If either of the @old and @new bitmaps are empty, or if @src and
 * @dst point to the same location, then this routine copies @src
 * to @dst.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to @src, placing the result in
 * @dst, clearing any bits previously set in @dst.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @src comes into this routine
 * with bits 1, 5 and 7 set, then @dst should leave with bits 1,
 * 13 and 15 set.
 */
void bitmap_remap(unsigned long *dst, const unsigned long *src,
                const unsigned long *old, const unsigned long *new,
                unsigned int nbits)
{
        unsigned int bit, new_weight;

        /* In-place remaps are not handled below; leave the bitmap alone. */
        if (dst == src)
                return;

        bitmap_zero(dst, nbits);
        new_weight = bitmap_weight(new, nbits);

        for_each_set_bit(bit, src, nbits) {
                int ord = bitmap_pos_to_ord(old, bit, nbits);

                if (ord >= 0 && new_weight != 0) {
                        /* @bit is the ord-th set bit of @old: map it into @new. */
                        set_bit(find_nth_bit(new, nbits, ord % new_weight), dst);
                        continue;
                }

                /* Not part of @old, or @new is empty: identity map. */
                set_bit(bit, dst);
        }
}
EXPORT_SYMBOL(bitmap_remap);

/**
 * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit
 *        @oldbit: bit position to be mapped
 *        @old: defines domain of map
 *        @new: defines range of map
 *        @bits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to bit position @oldbit, returning
 * the new bit position.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @oldbit is 5, then this routine
 * returns 13.
 */
int bitmap_bitremap(int oldbit, const unsigned long *old,
                                const unsigned long *new, int bits)
{
        int new_weight = bitmap_weight(new, bits);
        int ord = bitmap_pos_to_ord(old, oldbit, bits);

        /* @oldbit is the ord-th set bit of @old: map it into @new. */
        if (ord >= 0 && new_weight != 0)
                return find_nth_bit(new, bits, ord % new_weight);

        /* Not part of @old, or @new is empty: identity map. */
        return oldbit;
}
EXPORT_SYMBOL(bitmap_bitremap);

#ifdef CONFIG_NUMA
/**
 * bitmap_onto - translate one bitmap relative to another
 *        @dst: resulting translated bitmap
 *         @orig: original untranslated bitmap
 *         @relmap: bitmap relative to which translated
 *        @bits: number of bits in each of these bitmaps
 *
 * Set the n-th bit of @dst iff there exists some m such that the
 * n-th bit of @relmap is set, the m-th bit of @orig is set, and
 * the n-th bit of @relmap is also the m-th _set_ bit of @relmap.
 * (If you understood the previous sentence the first time you
 * read it, you're overqualified for your current job.)
 *
 * In other words, @orig is mapped onto (surjectively) @dst,
 * using the map { <n, m> | the n-th bit of @relmap is the
 * m-th set bit of @relmap }.
 *
 * Any set bits in @orig above bit number W, where W is the
 * weight of (number of set bits in) @relmap are mapped nowhere.
 * In particular, if for all bits m set in @orig, m >= W, then
 * @dst will end up empty.  In situations where the possibility
 * of such an empty result is not desired, one way to avoid it is
 * to use the bitmap_fold() operator, below, to first fold the
 * @orig bitmap over itself so that all its set bits x are in the
 * range 0 <= x < W.  The bitmap_fold() operator does this by
 * setting the bit (m % W) in @dst, for each bit (m) set in @orig.
 *
 * Example [1] for bitmap_onto():
 *  Let's say @relmap has bits 30-39 set, and @orig has bits
 *  1, 3, 5, 7, 9 and 11 set.  Then on return from this routine,
 *  @dst will have bits 31, 33, 35, 37 and 39 set.
 *
 *  When bit 0 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the first bit (if any)
 *  that is turned on in @relmap.  Since bit 0 was off in the
 *  above example, we leave off that bit (bit 30) in @dst.
 *
 *  When bit 1 is set in @orig (as in the above example), it
 *  means turn on the bit in @dst corresponding to whatever
 *  is the second bit that is turned on in @relmap.  The second
 *  bit in @relmap that was turned on in the above example was
 *  bit 31, so we turned on bit 31 in @dst.
 *
 *  Similarly, we turned on bits 33, 35, 37 and 39 in @dst,
 *  because they were the 4th, 6th, 8th and 10th set bits
 *  set in @relmap, and the 4th, 6th, 8th and 10th bits of
 *  @orig (i.e. bits 3, 5, 7 and 9) were also set.
 *
 *  When bit 11 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the twelfth bit that is
 *  turned on in @relmap.  In the above example, there were
 *  only ten bits turned on in @relmap (30..39), so the fact that
 *  bit 11 was set in @orig had no effect on @dst.
 *
 * Example [2] for bitmap_fold() + bitmap_onto():
 *  Let's say @relmap has these ten bits set::
 *
 *                40 41 42 43 45 48 53 61 74 95
 *
 *  (for the curious, that's 40 plus the first ten terms of the
 *  Fibonacci sequence.)
 *
 *  Further lets say we use the following code, invoking
 *  bitmap_fold() then bitmap_onto, as suggested above to
 *  avoid the possibility of an empty @dst result::
 *
 *        unsigned long *tmp;        // a temporary bitmap's bits
 *
 *        bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits);
 *        bitmap_onto(dst, tmp, relmap, bits);
 *
 *  Then this table shows what various values of @dst would be, for
 *  various @orig's.  I list the zero-based positions of each set bit.
 *  The tmp column shows the intermediate result, as computed by
 *  using bitmap_fold() to fold the @orig bitmap modulo ten
 *  (the weight of @relmap):
 *
 *      =============== ============== =================
 *      @orig           tmp            @dst
 *      0                0             40
 *      1                1             41
 *      9                9             95
 *      10               0             40 [#f1]_
 *      1 3 5 7          1 3 5 7       41 43 48 61
 *      0 1 2 3 4        0 1 2 3 4     40 41 42 43 45
 *      0 9 18 27        0 9 8 7       40 61 74 95
 *      0 10 20 30       0             40
 *      0 11 22 33       0 1 2 3       40 41 42 43
 *      0 12 24 36       0 2 4 6       40 42 45 53
 *      78 102 211       1 2 8         41 42 74 [#f1]_
 *      =============== ============== =================
 *
 * .. [#f1]
 *
 *     For these marked lines, if we hadn't first done bitmap_fold()
 *     into tmp, then the @dst result would have been empty.
 *
 * If either of @orig or @relmap is empty (no set bits), then @dst
 * will be returned empty.
 *
 * If (as explained above) the only set bits in @orig are in positions
 * m where m >= W, (where W is the weight of @relmap) then @dst will
 * once again be returned empty.
 *
 * All bits in @dst not set by the above rule are cleared.
 */
void bitmap_onto(unsigned long *dst, const unsigned long *orig,
                        const unsigned long *relmap, unsigned int bits)
{
        unsigned int n, m;        /* same meaning as in above comment */

        if (dst == orig)        /* following doesn't handle inplace mappings */
                return;
        bitmap_zero(dst, bits);

        /*
         * The following code is a more efficient, but less
         * obvious, equivalent to the loop:
         *        for (m = 0; m < bitmap_weight(relmap, bits); m++) {
         *                n = find_nth_bit(relmap, bits, m);
         *                if (test_bit(m, orig))
         *                        set_bit(n, dst);
         *        }
         */

        /* m counts the set bits of @relmap visited so far, i.e. n is the m-th set bit. */
        m = 0;
        for_each_set_bit(n, relmap, bits) {
                /* m == bitmap_pos_to_ord(relmap, n, bits) */
                if (test_bit(m, orig))
                        set_bit(n, dst);
                m++;
        }
}

/**
 * bitmap_fold - fold larger bitmap into smaller, modulo specified size
 *        @dst: resulting smaller bitmap
 *        @orig: original larger bitmap
 *        @sz: specified size
 *        @nbits: number of bits in each of these bitmaps
 *
 * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst.
 * Clear all other bits in @dst.  See further the comment and
 * Example [2] for bitmap_onto() for why and how to use this.
 */
void bitmap_fold(unsigned long *dst, const unsigned long *orig,
                        unsigned int sz, unsigned int nbits)
{
        unsigned int oldbit;

        if (dst == orig)        /* following doesn't handle inplace mappings */
                return;
        /* Start from an empty @dst so only the folded bits end up set. */
        bitmap_zero(dst, nbits);

        /*
         * NOTE(review): oldbit % sz divides by zero when @sz is 0 and @orig
         * has a set bit; callers presumably guarantee @sz != 0 - confirm.
         */
        for_each_set_bit(oldbit, orig, nbits)
                set_bit(oldbit % sz, dst);
}
#endif /* CONFIG_NUMA */

/**
 * bitmap_alloc - allocate storage for a bitmap
 * @nbits: number of bits the bitmap must hold
 * @flags: allocation flags, passed through to kmalloc_array()
 *
 * Requests BITS_TO_LONGS(@nbits) unsigned longs from kmalloc_array().
 *
 * Return: whatever kmalloc_array() returns.
 */
unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags)
{
        return kmalloc_array(BITS_TO_LONGS(nbits), sizeof(unsigned long),
                             flags);
}
EXPORT_SYMBOL(bitmap_alloc);

/**
 * bitmap_zalloc - bitmap_alloc() with __GFP_ZERO added to the flags
 * @nbits: number of bits the bitmap must hold
 * @flags: allocation flags; __GFP_ZERO is OR-ed in
 *
 * Return: the result of bitmap_alloc().
 */
unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags)
{
        return bitmap_alloc(nbits, flags | __GFP_ZERO);
}
EXPORT_SYMBOL(bitmap_zalloc);

/**
 * bitmap_alloc_node - allocate storage for a bitmap with a node argument
 * @nbits: number of bits the bitmap must hold
 * @flags: allocation flags, passed through to kmalloc_array_node()
 * @node: node argument, passed through to kmalloc_array_node()
 *
 * Requests BITS_TO_LONGS(@nbits) unsigned longs from kmalloc_array_node().
 *
 * Return: whatever kmalloc_array_node() returns.
 */
unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node)
{
        return kmalloc_array_node(BITS_TO_LONGS(nbits), sizeof(unsigned long),
                                  flags, node);
}
EXPORT_SYMBOL(bitmap_alloc_node);

/* Same as bitmap_alloc_node(), but with __GFP_ZERO added to @flags. */
unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node)
{
        gfp_t zeroing_flags = flags | __GFP_ZERO;

        return bitmap_alloc_node(nbits, zeroing_flags, node);
}
EXPORT_SYMBOL(bitmap_zalloc_node);

/*
 * Release a bitmap by handing it to kfree().  This is the counterpart of
 * the bitmap_alloc() family above, which allocates with kmalloc_array*().
 */
void bitmap_free(const unsigned long *bitmap)
{
        kfree(bitmap);
}
EXPORT_SYMBOL(bitmap_free);

/*
 * Action callback registered by devm_bitmap_alloc(); @data is the bitmap
 * pointer that was passed at registration time.
 */
static void devm_bitmap_free(void *data)
{
        bitmap_free(data);
}

/*
 * Allocate a bitmap of @nbits bits with bitmap_alloc() and register
 * devm_bitmap_free() as an action on @dev for it.
 *
 * Returns the bitmap, or NULL if either the allocation or the action
 * registration failed.
 */
unsigned long *devm_bitmap_alloc(struct device *dev,
                                 unsigned int nbits, gfp_t flags)
{
        unsigned long *bits = bitmap_alloc(nbits, flags);

        if (!bits)
                return NULL;

        /*
         * NOTE(review): the "_or_reset" variant is expected to invoke the
         * action itself on failure, so no explicit free is done here.
         */
        if (devm_add_action_or_reset(dev, devm_bitmap_free, bits))
                return NULL;

        return bits;
}
EXPORT_SYMBOL_GPL(devm_bitmap_alloc);

/* Same as devm_bitmap_alloc(), but with __GFP_ZERO added to @flags. */
unsigned long *devm_bitmap_zalloc(struct device *dev,
                                  unsigned int nbits, gfp_t flags)
{
        gfp_t zeroing_flags = flags | __GFP_ZERO;

        return devm_bitmap_alloc(dev, nbits, zeroing_flags);
}
EXPORT_SYMBOL_GPL(devm_bitmap_zalloc);

#if BITS_PER_LONG == 64
/**
 * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap
 *        @bitmap: array of unsigned longs, the destination bitmap
 *        @buf: array of u32 (in host byte order), the source bitmap
 *        @nbits: number of bits in @bitmap
 *
 * Built only when BITS_PER_LONG == 64: each destination word is assembled
 * from two consecutive u32 elements, the even-indexed one supplying the
 * low 32 bits and the odd-indexed one (if present) the high 32 bits.
 */
void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits)
{
        unsigned int count = DIV_ROUND_UP(nbits, 32);
        unsigned int lo;

        for (lo = 0; lo < count; lo += 2) {
                /* Low half first; this also clears the high half. */
                bitmap[lo / 2] = (unsigned long) buf[lo];
                if (lo + 1 < count)
                        bitmap[lo / 2] |= ((unsigned long) buf[lo + 1]) << 32;
        }

        /* Clear tail bits in last word beyond nbits. */
        if (nbits % BITS_PER_LONG)
                bitmap[(count - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr32);

/**
 * bitmap_to_arr32 - copy the contents of bitmap to a u32 array of bits
 *        @buf: array of u32 (in host byte order), the dest bitmap
 *        @bitmap: array of unsigned longs, the source bitmap
 *        @nbits: number of bits in @bitmap
 *
 * Built only when BITS_PER_LONG == 64: each source word is split into its
 * low 32 bits (stored at the even index) and, if still within
 * DIV_ROUND_UP(@nbits, 32) elements, its high 32 bits (odd index).
 */
void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits)
{
        unsigned int count = DIV_ROUND_UP(nbits, 32);
        unsigned int lo;

        for (lo = 0; lo < count; lo += 2) {
                buf[lo] = (u32) (bitmap[lo / 2] & UINT_MAX);
                if (lo + 1 < count)
                        buf[lo + 1] = (u32) (bitmap[lo / 2] >> 32);
        }

        /* Clear tail bits in last element of array beyond nbits. */
        if (nbits % BITS_PER_LONG)
                buf[count - 1] &= (u32) (UINT_MAX >> ((-nbits) & 31));
}
EXPORT_SYMBOL(bitmap_to_arr32);
#endif

#if BITS_PER_LONG == 32
/**
 * bitmap_from_arr64 - copy the contents of u64 array of bits to bitmap
 *        @bitmap: array of unsigned longs, the destination bitmap
 *        @buf: array of u64 (in host byte order), the source bitmap
 *        @nbits: number of bits in @bitmap
 *
 * Built only when BITS_PER_LONG == 32: every u64 is written out as its
 * low word followed, when more than 32 bits remain, by its high word.
 */
void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits)
{
        int remaining = nbits;

        while (remaining > 0) {
                u64 chunk = *buf++;

                *bitmap++ = chunk;
                if (remaining > 32)
                        *bitmap++ = chunk >> 32;

                remaining -= 64;
        }

        /*
         * Clear tail bits in the last word beyond nbits.
         *
         * After the loop @bitmap points one word past the last word
         * written, so index -1 is that last word.  nbits == 0 never
         * reaches the access because 0 % BITS_PER_LONG is 0.
         */
        if (nbits % BITS_PER_LONG)
                bitmap[-1] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr64);

/**
 * bitmap_to_arr64 - copy the contents of bitmap to a u64 array of bits
 *        @buf: array of u64 (in host byte order), the dest bitmap
 *        @bitmap: array of unsigned longs, the source bitmap
 *        @nbits: number of bits in @bitmap
 *
 * Built only when BITS_PER_LONG == 32: each u64 is formed from one source
 * word (low half) and, if one is left, the following word (high half).
 */
void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits)
{
        const unsigned long *limit = bitmap + BITS_TO_LONGS(nbits);
        u64 *out = buf;

        for (; bitmap < limit; out++) {
                *out = *bitmap++;
                if (bitmap < limit)
                        *out |= (u64)(*bitmap++) << 32;
        }

        /* Clear tail bits in the last element of array beyond nbits. */
        if (nbits % 64)
                out[-1] &= GENMASK_ULL((nbits - 1) % 64, 0);
}
EXPORT_SYMBOL(bitmap_to_arr64);
#endif




















































































































































    4 













    5 









    3 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_USER_NAMESPACE_H
#define _LINUX_USER_NAMESPACE_H

#include <linux/kref.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/rwsem.h>
#include <linux/sysctl.h>
#include <linux/err.h>

/* Number of extents stored inline in struct uid_gid_map (size of extent[]). */
#define UID_GID_MAP_MAX_BASE_EXTENTS 5
/* NOTE(review): presumably the overall per-map extent limit; not used in this header. */
#define UID_GID_MAP_MAX_EXTENTS 340

/*
 * One entry of a uid/gid/projid map.
 * NOTE(review): field meanings below are inferred from the names only;
 * confirm against the map lookup code.
 */
struct uid_gid_extent {
        u32 first;              /* presumably first ID of the range in this namespace */
        u32 lower_first;        /* presumably first ID of the range in the lower/parent view */
        u32 count;              /* presumably number of consecutive IDs in the range */
};

/*
 * An ID map: a count of extents plus either an inline array of up to
 * UID_GID_MAP_MAX_BASE_EXTENTS extents or a pair of pointers to
 * externally stored extent arrays (the two share storage via the union).
 * NOTE(review): presumably the pointer form is used once nr_extents
 * exceeds the inline capacity -- confirm in the map-writing code.
 */
struct uid_gid_map { /* 64 bytes -- 1 cache line */
        u32 nr_extents;
        union {
                struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
                struct {
                        struct uid_gid_extent *forward;
                        struct uid_gid_extent *reverse;
                };
        };
};

/* Flag bit; NOTE(review): presumably stored in user_namespace.flags. */
#define USERNS_SETGROUPS_ALLOWED 1UL

/* Initial flag value: only USERNS_SETGROUPS_ALLOWED is set. */
#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED

struct ucounts;

/*
 * Indices into the ucount[] / ucount_max[] arrays of struct ucounts and
 * struct user_namespace.  The inotify and fanotify entries exist only
 * when the corresponding config option is enabled, so the numeric values
 * of later entries (and of UCOUNT_COUNTS) depend on the configuration.
 */
enum ucount_type {
        UCOUNT_USER_NAMESPACES,
        UCOUNT_PID_NAMESPACES,
        UCOUNT_UTS_NAMESPACES,
        UCOUNT_IPC_NAMESPACES,
        UCOUNT_NET_NAMESPACES,
        UCOUNT_MNT_NAMESPACES,
        UCOUNT_CGROUP_NAMESPACES,
        UCOUNT_TIME_NAMESPACES,
#ifdef CONFIG_INOTIFY_USER
        UCOUNT_INOTIFY_INSTANCES,
        UCOUNT_INOTIFY_WATCHES,
#endif
#ifdef CONFIG_FANOTIFY
        UCOUNT_FANOTIFY_GROUPS,
        UCOUNT_FANOTIFY_MARKS,
#endif
        UCOUNT_COUNTS,          /* number of entries; used as array size */
};

/*
 * Indices into the rlimit[] / rlimit_max[] arrays of struct ucounts and
 * struct user_namespace.
 */
enum rlimit_type {
        UCOUNT_RLIMIT_NPROC,
        UCOUNT_RLIMIT_MSGQUEUE,
        UCOUNT_RLIMIT_SIGPENDING,
        UCOUNT_RLIMIT_MEMLOCK,
        UCOUNT_RLIMIT_COUNTS,   /* number of entries; used as array size */
};

#if IS_ENABLED(CONFIG_BINFMT_MISC)
struct binfmt_misc;
#endif

/*
 * A user namespace: its uid/gid/projid maps, a link to its parent, the
 * embedded ns_common (whose ->count is the reference count used by
 * get_user_ns()/put_user_ns()), and per-namespace limits indexed by
 * enum ucount_type and enum rlimit_type.
 */
struct user_namespace {
        struct uid_gid_map        uid_map;
        struct uid_gid_map        gid_map;
        struct uid_gid_map        projid_map;
        struct user_namespace        *parent;
        int                        level;
        kuid_t                        owner;
        kgid_t                        group;
        struct ns_common        ns;
        unsigned long                flags;
        /* parent_could_setfcap: true if the creator of this ns had CAP_SETFCAP
         * in its effective capability set at the child ns creation time. */
        bool                        parent_could_setfcap;

#ifdef CONFIG_KEYS
        /* List of joinable keyrings in this namespace.  Modification access of
         * these pointers is controlled by keyring_sem.  Once
         * user_keyring_register is set, it won't be changed, so it can be
         * accessed directly with READ_ONCE().
         */
        struct list_head        keyring_name_list;
        struct key                *user_keyring_register;
        struct rw_semaphore        keyring_sem;
#endif

        /* Register of per-UID persistent keyrings for this namespace */
#ifdef CONFIG_PERSISTENT_KEYRINGS
        struct key                *persistent_keyring_register;
#endif
        struct work_struct        work;
#ifdef CONFIG_SYSCTL
        struct ctl_table_set        set;
        struct ctl_table_header *sysctls;
#endif
        struct ucounts                *ucounts;
        /* Per-type maxima, indexed by enum ucount_type. */
        long ucount_max[UCOUNT_COUNTS];
        /* Per-type maxima, indexed by enum rlimit_type; see get/set_userns_rlimit_max(). */
        long rlimit_max[UCOUNT_RLIMIT_COUNTS];

#if IS_ENABLED(CONFIG_BINFMT_MISC)
        struct binfmt_misc *binfmt_misc;
#endif
} __randomize_layout;

/*
 * Counter set for one (user namespace, uid) pair.
 * NOTE(review): @node presumably links the entry into a hash table and
 * @count is presumably its reference count -- confirm in kernel/ucount.c.
 */
struct ucounts {
        struct hlist_node node;
        struct user_namespace *ns;
        kuid_t uid;
        atomic_t count;
        /* Current values, indexed by enum ucount_type. */
        atomic_long_t ucount[UCOUNT_COUNTS];
        /* Current values, indexed by enum rlimit_type; read by get_rlimit_value(). */
        atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS];
};

extern struct user_namespace init_user_ns;
extern struct ucounts init_ucounts;

bool setup_userns_sysctls(struct user_namespace *ns);
void retire_userns_sysctls(struct user_namespace *ns);
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
struct ucounts * __must_check get_ucounts(struct ucounts *ucounts);
void put_ucounts(struct ucounts *ucounts);

static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type)
{
        return atomic_long_read(&ucounts->rlimit[type]);
}

long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type);
void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type);
bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max);

/* Read the @type rlimit maximum of @ns with READ_ONCE(). */
static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type)
{
        long *slot = &ns->rlimit_max[type];

        return READ_ONCE(*slot);
}

/*
 * Store @max as the @type rlimit maximum of @ns.  The slot is a signed
 * long, so values above LONG_MAX are clamped to LONG_MAX.
 */
static inline void set_userns_rlimit_max(struct user_namespace *ns,
                enum rlimit_type type, unsigned long max)
{
        long clamped = LONG_MAX;

        if (max <= LONG_MAX)
                clamped = max;

        ns->rlimit_max[type] = clamped;
}

#ifdef CONFIG_USER_NS

/*
 * Take a reference on @ns (if non-NULL) by incrementing ns->ns.count.
 * Returns @ns unchanged so the call can be used in an expression.
 */
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
        if (!ns)
                return ns;

        refcount_inc(&ns->ns.count);
        return ns;
}

extern int create_user_ns(struct cred *new);
extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
extern void __put_user_ns(struct user_namespace *ns);

/*
 * Drop a reference on @ns; a NULL @ns is ignored.  When the count reaches
 * zero the namespace is handed to __put_user_ns().
 */
static inline void put_user_ns(struct user_namespace *ns)
{
        if (!ns)
                return;

        if (refcount_dec_and_test(&ns->ns.count))
                __put_user_ns(ns);
}

struct seq_operations;
extern const struct seq_operations proc_uid_seq_operations;
extern const struct seq_operations proc_gid_seq_operations;
extern const struct seq_operations proc_projid_seq_operations;
extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
extern int proc_setgroups_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns);
extern bool in_userns(const struct user_namespace *ancestor,
                       const struct user_namespace *child);
extern bool current_in_userns(const struct user_namespace *target_ns);
struct ns_common *ns_get_owner(struct ns_common *ns);
#else

/*
 * !CONFIG_USER_NS stub: no reference counting is done and the result is
 * always &init_user_ns, regardless of @ns.
 */
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
        return &init_user_ns;
}

/* !CONFIG_USER_NS stub: creating a user namespace always fails with -EINVAL. */
static inline int create_user_ns(struct cred *new)
{
        return -EINVAL;
}

/*
 * !CONFIG_USER_NS stub: a request that includes CLONE_NEWUSER is rejected
 * with -EINVAL; anything else succeeds with 0.  @new_cred is not touched.
 */
static inline int unshare_userns(unsigned long unshare_flags,
                                 struct cred **new_cred)
{
        return (unshare_flags & CLONE_NEWUSER) ? -EINVAL : 0;
}

/* !CONFIG_USER_NS stub: nothing to release. */
static inline void put_user_ns(struct user_namespace *ns)
{
}

/* !CONFIG_USER_NS stub: always reports true. */
static inline bool userns_may_setgroups(const struct user_namespace *ns)
{
        return true;
}

/* !CONFIG_USER_NS stub: always reports true, whatever @ancestor and @child are. */
static inline bool in_userns(const struct user_namespace *ancestor,
                             const struct user_namespace *child)
{
        return true;
}

/* !CONFIG_USER_NS stub: always reports true. */
static inline bool current_in_userns(const struct user_namespace *target_ns)
{
        return true;
}

/* !CONFIG_USER_NS stub: always fails, returning ERR_PTR(-EPERM). */
static inline struct ns_common *ns_get_owner(struct ns_common *ns)
{
        return ERR_PTR(-EPERM);
}
#endif

#endif /* _LINUX_USER_NAMESPACE_H */
















































































































    3 






















    6 












    1 


    2 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/ns_common.h>
#include <linux/fs_pin.h>

/*
 * A mount namespace.  The embedded ns_common carries the reference count
 * (see get_mnt_ns()); ->mounts is the rb-tree that struct mount links
 * into through mnt_node; a ->seq of 0 marks an anonymous namespace (see
 * is_anon_ns()).
 */
struct mnt_namespace {
        struct ns_common        ns;
        struct mount *        root;
        struct rb_root                mounts; /* Protected by namespace_sem */
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        u64                        seq;        /* Sequence number to prevent loops */
        wait_queue_head_t poll;
        u64 event;
        unsigned int                nr_mounts; /* # of mounts in the namespace */
        unsigned int                pending_mounts;
} __randomize_layout;

/*
 * Per-CPU counters of a mount: under CONFIG_SMP struct mount points to a
 * __percpu instance of this; otherwise it embeds the same two ints directly.
 */
struct mnt_pcp {
        int mnt_count;
        int mnt_writers;
};

/*
 * Describes a dentry that is used as a mountpoint.
 * NOTE(review): field roles are inferred from names and from
 * mount.mnt_mp_list ("list mounts with the same mountpoint") -- confirm
 * in fs/namespace.c.
 */
struct mountpoint {
        struct hlist_node m_hash;       /* presumably hash-table linkage */
        struct dentry *m_dentry;        /* the mountpoint dentry */
        struct hlist_head m_list;       /* presumably head of the mounts on this mountpoint */
        int m_count;                    /* presumably a use count */
};

/*
 * Internal representation of a mount.  The public struct vfsmount is
 * embedded as ->mnt; real_mount() converts a vfsmount pointer back to
 * its containing struct mount.
 */
struct mount {
        struct hlist_node mnt_hash;
        struct mount *mnt_parent;       /* equals the mount itself when there is no parent, see mnt_has_parent() */
        struct dentry *mnt_mountpoint;
        struct vfsmount mnt;
        union {
                struct rcu_head mnt_rcu;
                struct llist_node mnt_llist;
        };
#ifdef CONFIG_SMP
        struct mnt_pcp __percpu *mnt_pcp;
#else
        int mnt_count;
        int mnt_writers;
#endif
        struct list_head mnt_mounts;        /* list of children, anchored here */
        struct list_head mnt_child;        /* and going through their mnt_child */
        struct list_head mnt_instance;        /* mount instance on sb->s_mounts */
        const char *mnt_devname;        /* Name of device e.g. /dev/dsk/hda1 */
        /* The two members share storage: a mount is either on the rb-tree or on a list. */
        union {
                struct rb_node mnt_node;        /* Under ns->mounts */
                struct list_head mnt_list;
        };
        struct list_head mnt_expire;        /* link in fs-specific expiry list */
        struct list_head mnt_share;        /* circular list of shared mounts */
        struct list_head mnt_slave_list;/* list of slave mounts */
        struct list_head mnt_slave;        /* slave list entry */
        struct mount *mnt_master;        /* slave is on master->mnt_slave_list */
        struct mnt_namespace *mnt_ns;        /* containing namespace */
        struct mountpoint *mnt_mp;        /* where is it mounted */
        union {
                struct hlist_node mnt_mp_list;        /* list mounts with the same mountpoint */
                struct hlist_node mnt_umount;
        };
        struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
        struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
        __u32 mnt_fsnotify_mask;
#endif
        int mnt_id;                        /* mount identifier, reused */
        u64 mnt_id_unique;                /* mount ID unique until reboot */
        int mnt_group_id;                /* peer group identifier */
        int mnt_expiry_mark;                /* true if marked for expiry */
        struct hlist_head mnt_pins;
        struct hlist_head mnt_stuck_children;
} __randomize_layout;

#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */

/* Map a vfsmount to the struct mount that embeds it as ->mnt. */
static inline struct mount *real_mount(struct vfsmount *mnt)
{
        struct mount *outer = container_of(mnt, struct mount, mnt);

        return outer;
}

/* Non-zero unless @mnt is its own parent. */
static inline int mnt_has_parent(struct mount *mnt)
{
        struct mount *parent = mnt->mnt_parent;

        return parent != mnt;
}

/*
 * Non-zero when the mount's namespace pointer is a real pointer, i.e.
 * neither NULL nor an error pointer such as MNT_NS_INTERNAL.
 */
static inline int is_mounted(struct vfsmount *mnt)
{
        struct mnt_namespace *ns = real_mount(mnt)->mnt_ns;

        /* neither detached nor internal? */
        return !IS_ERR_OR_NULL(ns);
}

extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);

extern int __legitimize_mnt(struct vfsmount *, unsigned);

static inline bool __path_is_mountpoint(const struct path *path)
{
        struct mount *m = __lookup_mnt(path->mnt, path->dentry);
        return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT));
}

extern void __detach_mounts(struct dentry *dentry);

/* Call __detach_mounts() only if @dentry is flagged as a mountpoint. */
static inline void detach_mounts(struct dentry *dentry)
{
        if (d_mountpoint(dentry))
                __detach_mounts(dentry);
}

static inline void get_mnt_ns(struct mnt_namespace *ns)
{
        refcount_inc(&ns->ns.count);
}

extern seqlock_t mount_lock;

/*
 * Bundle of a mount namespace, a root path and a per-mount show callback.
 * NOTE(review): presumably the seq_file private data behind the
 * /proc mounts-style files driven by mounts_op -- confirm in
 * fs/proc_namespace.c.
 */
struct proc_mounts {
        struct mnt_namespace *ns;
        struct path root;
        int (*show)(struct seq_file *, struct vfsmount *);
};

extern const struct seq_operations mounts_op;

extern bool __is_local_mountpoint(struct dentry *dentry);
/*
 * Cheap filter in front of __is_local_mountpoint(): the out-of-line check
 * is made only when @dentry is flagged as a mountpoint at all.
 */
static inline bool is_local_mountpoint(struct dentry *dentry)
{
        return d_mountpoint(dentry) && __is_local_mountpoint(dentry);
}

/* A namespace whose sequence number is 0 is an anonymous one. */
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
        return !ns->seq;
}

static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
{
        WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB));
        mnt->mnt.mnt_flags &= ~MNT_ONRB;
        rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
        list_add_tail(&mnt->mnt_list, dt_list);
}

extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);













































































































































































































































































































































































































































































































































    1 


    1 





















































































































































































    5 




    5 




































    1 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
// SPDX-License-Identifier: GPL-2.0
/*
 * Key setup facility for FS encryption support.
 *
 * Copyright (C) 2015, Google, Inc.
 *
 * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar.
 * Heavily modified since then.
 */

#include <crypto/skcipher.h>
#include <linux/random.h>

#include "fscrypt_private.h"

/*
 * Table of supported encryption modes, indexed by FSCRYPT_MODE_* number
 * (select_encryption_mode() asserts at build time that it has exactly
 * FSCRYPT_MODE_MAX + 1 slots).
 *
 * For each mode: a human-readable name, the crypto API algorithm string
 * passed to crypto_alloc_skcipher(), the key and IV sizes in bytes, and
 * the security strength.  Entries that do not set .blk_crypto_mode leave
 * it zero-initialized.
 */
struct fscrypt_mode fscrypt_modes[] = {
        [FSCRYPT_MODE_AES_256_XTS] = {
                .friendly_name = "AES-256-XTS",
                .cipher_str = "xts(aes)",
                .keysize = 64,
                .security_strength = 32,
                .ivsize = 16,
                .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
        },
        [FSCRYPT_MODE_AES_256_CTS] = {
                .friendly_name = "AES-256-CBC-CTS",
                .cipher_str = "cts(cbc(aes))",
                .keysize = 32,
                .security_strength = 32,
                .ivsize = 16,
        },
        [FSCRYPT_MODE_AES_128_CBC] = {
                .friendly_name = "AES-128-CBC-ESSIV",
                .cipher_str = "essiv(cbc(aes),sha256)",
                .keysize = 16,
                .security_strength = 16,
                .ivsize = 16,
                .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
        },
        [FSCRYPT_MODE_AES_128_CTS] = {
                .friendly_name = "AES-128-CBC-CTS",
                .cipher_str = "cts(cbc(aes))",
                .keysize = 16,
                .security_strength = 16,
                .ivsize = 16,
        },
        [FSCRYPT_MODE_SM4_XTS] = {
                .friendly_name = "SM4-XTS",
                .cipher_str = "xts(sm4)",
                .keysize = 32,
                .security_strength = 16,
                .ivsize = 16,
                .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS,
        },
        [FSCRYPT_MODE_SM4_CTS] = {
                .friendly_name = "SM4-CBC-CTS",
                .cipher_str = "cts(cbc(sm4))",
                .keysize = 16,
                .security_strength = 16,
                .ivsize = 16,
        },
        [FSCRYPT_MODE_ADIANTUM] = {
                .friendly_name = "Adiantum",
                .cipher_str = "adiantum(xchacha12,aes)",
                .keysize = 32,
                .security_strength = 32,
                .ivsize = 32,
                .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM,
        },
        [FSCRYPT_MODE_AES_256_HCTR2] = {
                .friendly_name = "AES-256-HCTR2",
                .cipher_str = "hctr2(aes)",
                .keysize = 32,
                .security_strength = 32,
                .ivsize = 32,
        },
};

static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex);

/*
 * Pick the fscrypt_modes[] entry that applies to @inode under @policy:
 * the policy's contents mode for regular files, its filenames mode for
 * directories and symlinks.  Any other file type triggers a one-time
 * warning and yields ERR_PTR(-EINVAL).
 */
static struct fscrypt_mode *
select_encryption_mode(const union fscrypt_policy *policy,
                       const struct inode *inode)
{
        struct fscrypt_mode *chosen;

        BUILD_BUG_ON(ARRAY_SIZE(fscrypt_modes) != FSCRYPT_MODE_MAX + 1);

        if (S_ISREG(inode->i_mode)) {
                chosen = &fscrypt_modes[fscrypt_policy_contents_mode(policy)];
        } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) {
                chosen = &fscrypt_modes[fscrypt_policy_fnames_mode(policy)];
        } else {
                WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
                          inode->i_ino, (inode->i_mode & S_IFMT));
                chosen = ERR_PTR(-EINVAL);
        }

        return chosen;
}

/*
 * Create a symmetric cipher object for the given encryption mode and key.
 *
 * Returns the keyed transform, or an ERR_PTR: -ENOPKG when the crypto API
 * has no implementation of mode->cipher_str (-ENOENT from the allocator),
 * the allocator's own error otherwise, -EINVAL if the transform's IV size
 * does not match the mode, or the error from setting the key.
 */
static struct crypto_skcipher *
fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
                          const struct inode *inode)
{
        struct crypto_skcipher *tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
        int err;

        if (IS_ERR(tfm)) {
                if (PTR_ERR(tfm) != -ENOENT) {
                        fscrypt_err(inode, "Error allocating '%s' transform: %ld",
                                    mode->cipher_str, PTR_ERR(tfm));
                        return tfm;
                }
                fscrypt_warn(inode,
                             "Missing crypto API support for %s (API name: \"%s\")",
                             mode->friendly_name, mode->cipher_str);
                return ERR_PTR(-ENOPKG);
        }

        /*
         * fscrypt performance can vary greatly depending on which crypto
         * algorithm implementation is used.  To help debug performance
         * problems, log the ->cra_driver_name the first time a mode is
         * used; the xchg() makes sure only one caller does the logging.
         */
        if (!xchg(&mode->logged_cryptoapi_impl, 1))
                pr_info("fscrypt: %s using implementation \"%s\"\n",
                        mode->friendly_name, crypto_skcipher_driver_name(tfm));

        if (WARN_ON_ONCE(crypto_skcipher_ivsize(tfm) != mode->ivsize)) {
                err = -EINVAL;
        } else {
                crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
                err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
        }

        if (!err)
                return tfm;

        /* Validation or keying failed: release the transform. */
        crypto_free_skcipher(tfm);
        return ERR_PTR(err);
}

/*
 * Fill in @prep_key from @raw_key for the inode described by @ci.
 *
 * When inline encryption is in use for @ci the work is delegated to
 * fscrypt_prepare_inline_crypt_key(); otherwise a crypto transform is
 * allocated for @ci->ci_mode and stored in @prep_key->tfm.
 *
 * Returns 0 on success or a negative errno.
 */
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
                        const u8 *raw_key, const struct fscrypt_inode_info *ci)
{
        struct crypto_skcipher *tfm;

        if (!fscrypt_using_inline_encryption(ci)) {
                tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key,
                                                ci->ci_inode);
                if (IS_ERR(tfm))
                        return PTR_ERR(tfm);

                /*
                 * Publish ->tfm with a RELEASE barrier; this pairs with the
                 * smp_load_acquire() in fscrypt_is_key_prepared(), so that
                 * concurrent tasks can ACQUIRE it.  Such concurrency is only
                 * possible for per-mode keys, not for per-file keys.
                 */
                smp_store_release(&prep_key->tfm, tfm);
                return 0;
        }

        return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci);
}

/*
 * Destroy a crypto transform object and/or blk-crypto key.
 *
 * Frees @prep_key->tfm, lets the inline-crypto layer tear down its part
 * for @sb, and finally wipes the whole structure with memzero_explicit().
 */
void fscrypt_destroy_prepared_key(struct super_block *sb,
                                  struct fscrypt_prepared_key *prep_key)
{
        crypto_free_skcipher(prep_key->tfm);
        fscrypt_destroy_inline_crypt_key(sb, prep_key);
        /* Wipe last: the two calls above still read from *prep_key. */
        memzero_explicit(prep_key, sizeof(*prep_key));
}

/*
 * Given a per-file encryption key, set up the file's crypto transform
 * object.  @ci is marked as owning its key before the key is prepared into
 * @ci->ci_enc_key, and the flag stays set even if preparation fails.
 */
int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
                                 const u8 *raw_key)
{
        struct fscrypt_prepared_key *own_key = &ci->ci_enc_key;

        ci->ci_owns_key = true;

        return fscrypt_prepare_key(own_key, raw_key, ci);
}

/*
 * Point @ci at the per-mode key for its encryption mode, deriving and
 * preparing that key first if nobody has done so yet.
 *
 * @keys is an array of prepared keys indexed by mode number.  The key is
 * derived with HKDF from @mk using @hkdf_context and an info string made
 * of the mode number, optionally followed by the filesystem UUID
 * (@include_fs_uuid).
 *
 * Returns 0 on success or a negative errno.
 */
static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
                                  struct fscrypt_master_key *mk,
                                  struct fscrypt_prepared_key *keys,
                                  u8 hkdf_context, bool include_fs_uuid)
{
        const struct inode *inode = ci->ci_inode;
        const struct super_block *sb = inode->i_sb;
        struct fscrypt_mode *mode = ci->ci_mode;
        /* Index of @mode within fscrypt_modes[]. */
        const u8 mode_num = mode - fscrypt_modes;
        struct fscrypt_prepared_key *prep_key;
        u8 mode_key[FSCRYPT_MAX_KEY_SIZE];
        u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)];
        unsigned int hkdf_infolen = 0;
        int err;

        if (WARN_ON_ONCE(mode_num > FSCRYPT_MODE_MAX))
                return -EINVAL;

        /* Fast path: the key already exists, no lock needed. */
        prep_key = &keys[mode_num];
        if (fscrypt_is_key_prepared(prep_key, ci)) {
                ci->ci_enc_key = *prep_key;
                return 0;
        }

        mutex_lock(&fscrypt_mode_key_setup_mutex);

        /* Re-check under the mutex: another task may have prepared it meanwhile. */
        if (fscrypt_is_key_prepared(prep_key, ci))
                goto done_unlock;

        BUILD_BUG_ON(sizeof(mode_num) != 1);
        BUILD_BUG_ON(sizeof(sb->s_uuid) != 16);
        BUILD_BUG_ON(sizeof(hkdf_info) != 17);
        /* HKDF info: 1 byte of mode number, then optionally the 16-byte UUID. */
        hkdf_info[hkdf_infolen++] = mode_num;
        if (include_fs_uuid) {
                memcpy(&hkdf_info[hkdf_infolen], &sb->s_uuid,
                       sizeof(sb->s_uuid));
                hkdf_infolen += sizeof(sb->s_uuid);
        }
        err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
                                  hkdf_context, hkdf_info, hkdf_infolen,
                                  mode_key, mode->keysize);
        if (err)
                goto out_unlock;
        err = fscrypt_prepare_key(prep_key, mode_key, ci);
        /* Wipe the derived raw key from the stack whether or not preparation worked. */
        memzero_explicit(mode_key, mode->keysize);
        if (err)
                goto out_unlock;
done_unlock:
        ci->ci_enc_key = *prep_key;
        err = 0;
out_unlock:
        mutex_unlock(&fscrypt_mode_key_setup_mutex);
        return err;
}

/*
 * Derive a SipHash key from the given fscrypt master key and the given
 * application-specific information string.
 *
 * Note that the KDF produces a byte array, but the SipHash APIs expect the key
 * as a pair of 64-bit words.  Therefore, on big endian CPUs we have to do an
 * endianness swap in order to get the same results as on little endian CPUs.
 */
static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
                                      u8 context, const u8 *info,
                                      unsigned int infolen, siphash_key_t *key)
{
        int ret;

        /* The fix-up below relies on the key being exactly two 64-bit words. */
        BUILD_BUG_ON(sizeof(*key) != 16);
        BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2);

        /* Fill @key with the raw bytes produced by the KDF. */
        ret = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen,
                                  (u8 *)key, sizeof(*key));
        if (ret == 0) {
                /* Treat each half as a little-endian 64-bit value. */
                le64_to_cpus(&key->key[0]);
                le64_to_cpus(&key->key[1]);
        }
        return ret;
}

/*
 * Derive the SipHash key stored in @ci->ci_dirhash_key from the master key
 * @mk, using the file's nonce as the application-specific info.
 *
 * Return: 0 on success, otherwise the error from the key derivation.
 */
int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
                               const struct fscrypt_master_key *mk)
{
        int ret = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY,
                                             ci->ci_nonce,
                                             FSCRYPT_FILE_NONCE_SIZE,
                                             &ci->ci_dirhash_key);

        /* Only mark the key as initialized once derivation has succeeded. */
        if (!ret)
                ci->ci_dirhash_key_initialized = true;
        return ret;
}

void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
                               const struct fscrypt_master_key *mk)
{
        WARN_ON_ONCE(ci->ci_inode->i_ino == 0);
        WARN_ON_ONCE(!mk->mk_ino_hash_key_initialized);

        ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino,
                                              &mk->mk_ino_hash_key);
}

/*
 * Set up the encryption key for an inode that uses the IV_INO_LBLK_32 flag.
 *
 * The per-mode key is set up via setup_per_mode_enc_key().  Then, if
 * @mk->mk_ino_hash_key has not been derived yet, it is derived while holding
 * fscrypt_mode_key_setup_mutex.  Finally the inode number is hashed, provided
 * the inode already has one.
 *
 * Return: 0 on success, otherwise the error code of the helper that failed.
 */
static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci,
                                            struct fscrypt_master_key *mk)
{
        int err;

        err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_32_keys,
                                     HKDF_CONTEXT_IV_INO_LBLK_32_KEY, true);
        if (err)
                return err;

        /* pairs with smp_store_release() below */
        if (!smp_load_acquire(&mk->mk_ino_hash_key_initialized)) {

                mutex_lock(&fscrypt_mode_key_setup_mutex);

                /*
                 * Re-check now that the mutex is held.  err is still 0 here,
                 * so jumping to unlock falls through to the hashing step.
                 */
                if (mk->mk_ino_hash_key_initialized)
                        goto unlock;

                err = fscrypt_derive_siphash_key(mk,
                                                 HKDF_CONTEXT_INODE_HASH_KEY,
                                                 NULL, 0, &mk->mk_ino_hash_key);
                if (err)
                        goto unlock;
                /* pairs with smp_load_acquire() above */
                smp_store_release(&mk->mk_ino_hash_key_initialized, true);
unlock:
                mutex_unlock(&fscrypt_mode_key_setup_mutex);
                if (err)
                        return err;
        }

        /*
         * New inodes may not have an inode number assigned yet.
         * Hashing their inode number is delayed until later.
         */
        if (ci->ci_inode->i_ino)
                fscrypt_hash_inode_number(ci, mk);
        return 0;
}

/*
 * Set up the encryption key of a file that uses a v2 policy, choosing the key
 * derivation scheme from the policy flags, and optionally derive the dirhash
 * key as well.
 *
 * Return: 0 on success, otherwise the error code of the helper that failed.
 */
static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
                                     struct fscrypt_master_key *mk,
                                     bool need_dirhash_key)
{
        const u8 flags = ci->ci_policy.v2.flags;
        int err;

        if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
                /*
                 * DIRECT_KEY: no per-file encryption key is derived; the
                 * per-file nonce goes into every IV instead.  Unlike v1
                 * policies, v2 policies don't encrypt with the master key
                 * itself here but with a derived per-mode key, so that the
                 * master key is only ever used for HKDF and key reuse issues
                 * are avoided.
                 */
                err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_keys,
                                             HKDF_CONTEXT_DIRECT_KEY, false);
        } else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) {
                /*
                 * IV_INO_LBLK_64: the key is derived from (master_key,
                 * mode_num, filesystem_uuid) and the inode number is part of
                 * the IVs.  This format is optimized for inline encryption
                 * hardware compliant with the UFS standard.
                 */
                err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_keys,
                                             HKDF_CONTEXT_IV_INO_LBLK_64_KEY,
                                             true);
        } else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) {
                err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk);
        } else {
                /* Default: derive a per-file key from the file's nonce. */
                u8 file_key[FSCRYPT_MAX_KEY_SIZE];

                err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
                                          HKDF_CONTEXT_PER_FILE_ENC_KEY,
                                          ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
                                          file_key, ci->ci_mode->keysize);
                if (!err) {
                        err = fscrypt_set_per_file_enc_key(ci, file_key);
                        /* Wipe the stack copy whether or not it was accepted. */
                        memzero_explicit(file_key, ci->ci_mode->keysize);
                }
        }
        if (err)
                return err;

        if (!need_dirhash_key)
                return 0;

        /* Derive a secret dirhash key for directories that need it. */
        return fscrypt_derive_dirhash_key(ci, mk);
}

/*
 * Check whether the size of the given master key (@mk) is appropriate for the
 * encryption settings which a particular file will use (@ci).
 *
 * If the file uses a v1 encryption policy, then the master key must be at least
 * as long as the derived key, as this is a requirement of the v1 KDF.
 *
 * Otherwise, the KDF can accept any size key, so we enforce a slightly looser
 * requirement: we require that the size of the master key be at least the
 * maximum security strength of any algorithm whose key will be derived from it
 * (but in practice we only need to consider @ci->ci_mode, since any other
 * possible subkeys such as DIRHASH and INODE_HASH will never increase the
 * required key size over @ci->ci_mode).  This allows AES-256-XTS keys to be
 * derived from a 256-bit master key, which is cryptographically sufficient,
 * rather than requiring a 512-bit master key which is unnecessarily long.  (We
 * still allow 512-bit master keys if the user chooses to use them, though.)
 */
static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
                                          const struct fscrypt_inode_info *ci)
{
        /* v1 needs a full-length key; later versions only the mode's strength. */
        const unsigned int min_keysize =
                (ci->ci_policy.version == FSCRYPT_POLICY_V1) ?
                        ci->ci_mode->keysize :
                        ci->ci_mode->security_strength;

        if (mk->mk_secret.size >= min_keysize)
                return true;

        fscrypt_warn(NULL,
                     "key with %s %*phN is too short (got %u bytes, need %u+ bytes)",
                     master_key_spec_type(&mk->mk_spec),
                     master_key_spec_len(&mk->mk_spec),
                     (u8 *)&mk->mk_spec.u,
                     mk->mk_secret.size, min_keysize);
        return false;
}

/*
 * Find the master key, then set up the inode's actual encryption key.
 *
 * If the master key is found in the filesystem-level keyring, then it is
 * returned in *mk_ret with its semaphore read-locked.  This is needed to ensure
 * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes
 * (as multiple tasks may race to create an fscrypt_inode_info for the same
 * inode), and to synchronize the master key being removed with a new inode
 * starting to use it.
 */
static int setup_file_encryption_key(struct fscrypt_inode_info *ci,
                                     bool need_dirhash_key,
                                     struct fscrypt_master_key **mk_ret)
{
        struct super_block *sb = ci->ci_inode->i_sb;
        struct fscrypt_key_specifier mk_spec;
        struct fscrypt_master_key *mk;
        int err;

        err = fscrypt_select_encryption_impl(ci);
        if (err)
                return err;

        err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec);
        if (err)
                return err;

        mk = fscrypt_find_master_key(sb, &mk_spec);
        if (unlikely(!mk)) {
                const union fscrypt_policy *dummy_policy =
                        fscrypt_get_dummy_policy(sb);

                /*
                 * Add the test_dummy_encryption key on-demand.  In principle,
                 * it should be added at mount time.  Do it here instead so that
                 * the individual filesystems don't need to worry about adding
                 * this key at mount time and cleaning up on mount failure.
                 */
                if (dummy_policy &&
                    fscrypt_policies_equal(dummy_policy, &ci->ci_policy)) {
                        err = fscrypt_add_test_dummy_key(sb, &mk_spec);
                        if (err)
                                return err;
                        mk = fscrypt_find_master_key(sb, &mk_spec);
                }
        }
        /* Still not found, even after the dummy-key attempt above. */
        if (unlikely(!mk)) {
                if (ci->ci_policy.version != FSCRYPT_POLICY_V1)
                        return -ENOKEY;

                /*
                 * As a legacy fallback for v1 policies, search for the key in
                 * the current task's subscribed keyrings too.  Don't move this
                 * to before the search of ->s_master_keys, since users
                 * shouldn't be able to override filesystem-level keys.
                 *
                 * Note that *mk_ret is not written on this path.
                 */
                return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci);
        }
        /* Held across key setup; on success it stays held for the caller. */
        down_read(&mk->mk_sem);

        if (!mk->mk_present) {
                /* FS_IOC_REMOVE_ENCRYPTION_KEY has been executed on this key */
                err = -ENOKEY;
                goto out_release_key;
        }

        if (!fscrypt_valid_master_key_size(mk, ci)) {
                err = -ENOKEY;
                goto out_release_key;
        }

        /* Dispatch on the policy version to the matching key setup routine. */
        switch (ci->ci_policy.version) {
        case FSCRYPT_POLICY_V1:
                err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw);
                break;
        case FSCRYPT_POLICY_V2:
                err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key);
                break;
        default:
                WARN_ON_ONCE(1);
                err = -EINVAL;
                break;
        }
        if (err)
                goto out_release_key;

        /*
         * Success: hand the master key to the caller without calling
         * up_read() or fscrypt_put_master_key() on it here.
         */
        *mk_ret = mk;
        return 0;

out_release_key:
        /* Failure after the key was found: unlock it and put it. */
        up_read(&mk->mk_sem);
        fscrypt_put_master_key(mk);
        return err;
}

static void put_crypt_info(struct fscrypt_inode_info *ci)
{
        struct fscrypt_master_key *mk;

        if (!ci)
                return;

        if (ci->ci_direct_key)
                fscrypt_put_direct_key(ci->ci_direct_key);
        else if (ci->ci_owns_key)
                fscrypt_destroy_prepared_key(ci->ci_inode->i_sb,
                                             &ci->ci_enc_key);

        mk = ci->ci_master_key;
        if (mk) {
                /*
                 * Remove this inode from the list of inodes that were unlocked
                 * with the master key.  In addition, if we're removing the last
                 * inode from an incompletely removed key, then complete the
                 * full removal of the key.
                 */
                spin_lock(&mk->mk_decrypted_inodes_lock);
                list_del(&ci->ci_master_key_link);
                spin_unlock(&mk->mk_decrypted_inodes_lock);
                fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk);
        }
        memzero_explicit(ci, sizeof(*ci));
        kmem_cache_free(fscrypt_inode_info_cachep, ci);
}

/*
 * Allocate an fscrypt_inode_info for @inode, fill it in from @policy and
 * @nonce, set up its encryption key, and try to publish it in
 * inode->i_crypt_info.
 *
 * If another task has already set ->i_crypt_info, the structure built here is
 * freed again and 0 is still returned.
 *
 * Return: 0 on success, -ENOMEM if the allocation fails, or the error code
 *         returned by one of the helpers called here.
 */
static int
fscrypt_setup_encryption_info(struct inode *inode,
                              const union fscrypt_policy *policy,
                              const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
                              bool need_dirhash_key)
{
        struct fscrypt_inode_info *crypt_info;
        struct fscrypt_mode *mode;
        /* Stays NULL unless setup_file_encryption_key() hands back a key. */
        struct fscrypt_master_key *mk = NULL;
        int res;

        res = fscrypt_initialize(inode->i_sb);
        if (res)
                return res;

        crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL);
        if (!crypt_info)
                return -ENOMEM;

        crypt_info->ci_inode = inode;
        crypt_info->ci_policy = *policy;
        memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);

        mode = select_encryption_mode(&crypt_info->ci_policy, inode);
        if (IS_ERR(mode)) {
                res = PTR_ERR(mode);
                goto out;
        }
        WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
        crypt_info->ci_mode = mode;

        crypt_info->ci_data_unit_bits =
                fscrypt_policy_du_bits(&crypt_info->ci_policy, inode);
        crypt_info->ci_data_units_per_block_bits =
                inode->i_blkbits - crypt_info->ci_data_unit_bits;

        res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk);
        if (res)
                goto out;

        /*
         * For existing inodes, multiple tasks may race to set ->i_crypt_info.
         * So use cmpxchg_release().  This pairs with the smp_load_acquire() in
         * fscrypt_get_inode_info().  I.e., here we publish ->i_crypt_info with
         * a RELEASE barrier so that other tasks can ACQUIRE it.
         */
        if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) {
                /*
                 * We won the race and set ->i_crypt_info to our crypt_info.
                 * Now link it into the master key's inode list.
                 */
                if (mk) {
                        crypt_info->ci_master_key = mk;
                        refcount_inc(&mk->mk_active_refs);
                        spin_lock(&mk->mk_decrypted_inodes_lock);
                        list_add(&crypt_info->ci_master_key_link,
                                 &mk->mk_decrypted_inodes);
                        spin_unlock(&mk->mk_decrypted_inodes_lock);
                }
                /* The inode owns it now; don't free it at the out label. */
                crypt_info = NULL;
        }
        /* Losing the race is not an error. */
        res = 0;
out:
        if (mk) {
                up_read(&mk->mk_sem);
                fscrypt_put_master_key(mk);
        }
        /* No-op when crypt_info was published (set to NULL above). */
        put_crypt_info(crypt_info);
        return res;
}

/**
 * fscrypt_get_encryption_info() - set up an inode's encryption key
 * @inode: the inode to set up the key for.  Must be encrypted.
 * @allow_unsupported: if %true, treat an unsupported encryption policy (or
 *                       unrecognized encryption context) the same way as the key
 *                       being unavailable, instead of returning an error.  Use
 *                       %false unless the operation being performed is needed in
 *                       order for files (or directories) to be deleted.
 *
 * Set up ->i_crypt_info, if it hasn't already been done.
 *
 * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe.  So
 * generally this shouldn't be called from within a filesystem transaction.
 *
 * Return: 0 if ->i_crypt_info was set or was already set, *or* if the
 *           encryption key is unavailable.  (Use fscrypt_has_encryption_key() to
 *           distinguish these cases.)  Also can return another -errno code.
 */
int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
{
        union fscrypt_context ctx;
        union fscrypt_policy policy;
        bool need_dirhash_key;
        int ctx_size;
        int res;

        /* Nothing to do if the key has already been set up. */
        if (fscrypt_has_encryption_key(inode))
                return 0;

        /* Fetch the encryption context from the filesystem. */
        ctx_size = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
        if (ctx_size < 0) {
                if (allow_unsupported && ctx_size == -ERANGE)
                        return 0;
                fscrypt_warn(inode, "Error %d getting encryption context",
                             ctx_size);
                return ctx_size;
        }

        /* Turn the context into a policy. */
        res = fscrypt_policy_from_context(&policy, &ctx, ctx_size);
        if (res) {
                if (allow_unsupported)
                        return 0;
                fscrypt_warn(inode,
                             "Unrecognized or corrupt encryption context");
                return res;
        }

        if (!fscrypt_supported_policy(&policy, inode))
                return allow_unsupported ? 0 : -EINVAL;

        /* Only casefolded directories need a dirhash key. */
        need_dirhash_key = IS_CASEFOLDED(inode) && S_ISDIR(inode->i_mode);
        res = fscrypt_setup_encryption_info(inode, &policy,
                                            fscrypt_context_nonce(&ctx),
                                            need_dirhash_key);

        /*
         * A missing key is never reported as an error here, and neither is an
         * unavailable algorithm (-ENOPKG) when unsupported settings are
         * tolerated.
         */
        if (res == -ENOKEY || (res == -ENOPKG && allow_unsupported))
                return 0;
        return res;
}

/**
 * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory
 * @dir: a possibly-encrypted directory
 * @inode: the new inode.  ->i_mode and ->i_blkbits must be set already.
 *           ->i_ino doesn't need to be set yet.
 * @encrypt_ret: (output) set to %true if the new inode will be encrypted
 *
 * If the directory is encrypted, set up its ->i_crypt_info in preparation for
 * encrypting the name of the new file.  Also, if the new inode will be
 * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true.
 *
 * This isn't %GFP_NOFS-safe, and therefore it should be called before starting
 * any filesystem transaction to create the inode.  For this reason, ->i_ino
 * isn't required to be set yet, as the filesystem may not have set it yet.
 *
 * This doesn't persist the new inode's encryption context.  That still needs to
 * be done later by calling fscrypt_set_context().
 *
 * Return: 0 on success, -ENOKEY if the encryption key is missing, or another
 *           -errno code
 */
int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
                              bool *encrypt_ret)
{
        const union fscrypt_policy *inherited;
        u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
        bool encryptable;
        bool need_dirhash_key;

        /* No policy to inherit means the new inode won't be encrypted. */
        inherited = fscrypt_policy_to_inherit(dir);
        if (!inherited)
                return 0;
        if (IS_ERR(inherited))
                return PTR_ERR(inherited);

        if (WARN_ON_ONCE(inode->i_blkbits == 0))
                return -EINVAL;

        if (WARN_ON_ONCE(inode->i_mode == 0))
                return -EINVAL;

        /*
         * Encryption applies to regular files, directories, and symlinks
         * only; special files such as device nodes and named pipes are left
         * alone.
         */
        encryptable = S_ISREG(inode->i_mode) ||
                      S_ISDIR(inode->i_mode) ||
                      S_ISLNK(inode->i_mode);
        if (!encryptable)
                return 0;

        *encrypt_ret = true;

        /* Give the new inode a fresh random nonce. */
        get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE);
        need_dirhash_key = IS_CASEFOLDED(dir) && S_ISDIR(inode->i_mode);
        return fscrypt_setup_encryption_info(inode, inherited, nonce,
                                             need_dirhash_key);
}
EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);

/**
 * fscrypt_put_encryption_info() - free most of an inode's fscrypt data
 * @inode: an inode being evicted
 *
 * Free the inode's fscrypt_inode_info.  Filesystems must call this when the
 * inode is being evicted.  An RCU grace period need not have elapsed yet.
 */
void fscrypt_put_encryption_info(struct inode *inode)
{
        struct fscrypt_inode_info *ci = inode->i_crypt_info;

        /* Release the info first, then clear the inode's pointer to it. */
        put_crypt_info(ci);
        inode->i_crypt_info = NULL;
}
EXPORT_SYMBOL(fscrypt_put_encryption_info);

/**
 * fscrypt_free_inode() - free an inode's fscrypt data requiring RCU delay
 * @inode: an inode being freed
 *
 * Free the inode's cached decrypted symlink target, if any.  Filesystems must
 * call this after an RCU grace period, just before they free the inode.
 */
void fscrypt_free_inode(struct inode *inode)
{
        /* Only encrypted symlinks are handled here. */
        if (!IS_ENCRYPTED(inode) || !S_ISLNK(inode->i_mode))
                return;

        kfree(inode->i_link);
        inode->i_link = NULL;
}
EXPORT_SYMBOL(fscrypt_free_inode);

/**
 * fscrypt_drop_inode() - check whether the inode's master key has been removed
 * @inode: an inode being considered for eviction
 *
 * Filesystems supporting fscrypt must call this from their ->drop_inode()
 * method so that encrypted inodes are evicted as soon as they're no longer in
 * use and their master key has been removed.
 *
 * Return: 1 if fscrypt wants the inode to be evicted now, otherwise 0
 */
int fscrypt_drop_inode(struct inode *inode)
{
        const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode);

        /*
         * If ci is NULL, then the inode doesn't have an encryption key set up
         * so it's irrelevant.  If ci_master_key is NULL, then the master key
         * was provided via the legacy mechanism of the process-subscribed
         * keyrings, so we don't know whether it's been removed or not.
         */
        if (!ci || !ci->ci_master_key)
                return 0;

        /*
         * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes
         * protected by the key were cleaned by sync_filesystem().  But if
         * userspace is still using the files, inodes can be dirtied between
         * then and now.  We mustn't lose any writes, so skip dirty inodes here.
         */
        if (inode->i_state & I_DIRTY_ALL)
                return 0;

        /*
         * We can't take ->mk_sem here, since this runs in atomic context.
         * Therefore, ->mk_present can change concurrently, and our result may
         * immediately become outdated.  But there's no correctness problem with
         * unnecessarily evicting.  Nor is there a correctness problem with not
         * evicting while iput() is racing with the key being removed, since
         * then the thread removing the key will either evict the inode itself
         * or will correctly detect that it wasn't evicted due to the race.
         */
        return !READ_ONCE(ci->ci_master_key->mk_present);
}
EXPORT_SYMBOL_GPL(fscrypt_drop_inode);






















    1 

























































































































































   17 










































































   21 




































    8 







    8 

















    7 


    7 

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/backing-dev.h
 *
 * low-level device information and state which is propagated up through
 * to high-level code.
 */

#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/device.h>
#include <linux/writeback.h>
#include <linux/backing-dev-defs.h>
#include <linux/slab.h>

/* Take an extra reference on @bdi and return the same pointer. */
static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
{
        struct kref *ref = &bdi->refcnt;

        kref_get(ref);
        return bdi;
}

struct backing_dev_info *bdi_get_by_id(u64 id);
void bdi_put(struct backing_dev_info *bdi);

__printf(2, 3)
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
__printf(2, 0)
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
                    va_list args);
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner);
void bdi_unregister(struct backing_dev_info *bdi);

struct backing_dev_info *bdi_alloc(int node_id);

void wb_start_background_writeback(struct bdi_writeback *wb);
void wb_workfn(struct work_struct *work);

void wb_wait_for_completion(struct wb_completion *done);

extern spinlock_t bdi_lock;
extern struct list_head bdi_list;

extern struct workqueue_struct *bdi_wq;

/* Report whether the WB_has_dirty_io bit is set in @wb->state. */
static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
        const bool has_dirty = test_bit(WB_has_dirty_io, &wb->state);

        return has_dirty;
}

/* Report whether @bdi has any dirty IO, judged by its total write bandwidth. */
static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
{
        /*
         * wb_update_write_bandwidth() keeps @bdi->tot_write_bandwidth above
         * zero for as long as any wb is dirty, so a non-zero value is enough.
         */
        return atomic_long_read(&bdi->tot_write_bandwidth) != 0;
}

/* Add @amount to the @item counter of @wb, batching with WB_STAT_BATCH. */
static inline void wb_stat_mod(struct bdi_writeback *wb,
                                 enum wb_stat_item item, s64 amount)
{
        struct percpu_counter *counter = &wb->stat[item];

        percpu_counter_add_batch(counter, amount, WB_STAT_BATCH);
}

static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        wb_stat_mod(wb, item, 1);
}

static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        wb_stat_mod(wb, item, -1);
}

/* Read the @item counter of @wb via percpu_counter_read_positive(). */
static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        struct percpu_counter *counter = &wb->stat[item];

        return percpu_counter_read_positive(counter);
}

/* Read the @item counter of @wb via percpu_counter_sum_positive(). */
static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
{
        struct percpu_counter *counter = &wb->stat[item];

        return percpu_counter_sum_positive(counter);
}

extern void wb_writeout_inc(struct bdi_writeback *wb);

/*
 * maximal error of a stat counter.
 */
static inline unsigned long wb_stat_error(void)
{
        /* Without CONFIG_SMP the result is the constant 1. */
        unsigned long max_error = 1;

#ifdef CONFIG_SMP
        /* With CONFIG_SMP it is WB_STAT_BATCH times the number of CPU ids. */
        max_error = nr_cpu_ids * WB_STAT_BATCH;
#endif
        return max_error;
}

/* BDI ratio is expressed as part per 1000000 for finer granularity. */
#define BDI_RATIO_SCALE 10000

u64 bdi_get_min_bytes(struct backing_dev_info *bdi);
u64 bdi_get_max_bytes(struct backing_dev_info *bdi);
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio);
int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio);
int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes);
int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes);
int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit);

/*
 * Flags in backing_dev_info::capability
 *
 * BDI_CAP_WRITEBACK:                Supports dirty page writeback, and dirty pages
 *                                should contribute to accounting
 * BDI_CAP_WRITEBACK_ACCT:        Automatically account writeback pages
 * BDI_CAP_STRICTLIMIT:                Keep number of dirty pages below bdi threshold
 */
#define BDI_CAP_WRITEBACK                (1 << 0)
#define BDI_CAP_WRITEBACK_ACCT                (1 << 1)
#define BDI_CAP_STRICTLIMIT                (1 << 2)

extern struct backing_dev_info noop_backing_dev_info;

int bdi_init(struct backing_dev_info *bdi);

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @wb: bdi_writeback of interest
 *
 * Determine whether there is writeback waiting to be handled against a
 * bdi_writeback.
 */
static inline bool writeback_in_progress(struct bdi_writeback *wb)
{
        /* Reflects the WB_writeback_running bit of @wb->state. */
        const bool running = test_bit(WB_writeback_running, &wb->state);

        return running;
}

struct backing_dev_info *inode_to_bdi(struct inode *inode);

static inline bool mapping_can_writeback(struct address_space *mapping)
{
        return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK;
}

#ifdef CONFIG_CGROUP_WRITEBACK

struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css);
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css,
                                    gfp_t gfp);
void wb_memcg_offline(struct mem_cgroup *memcg);
void wb_blkcg_offline(struct cgroup_subsys_state *css);

/**
 * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
 * @inode: inode of interest
 *
 * Cgroup writeback requires support from the filesystem.  Also, both memcg and
 * iocg have to be on the default hierarchy.  Test whether all conditions are
 * met.
 *
 * Note that the test result may change dynamically on the same inode
 * depending on how memcg and iocg are configured.
 */
static inline bool inode_cgwb_enabled(struct inode *inode)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);

        return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
                cgroup_subsys_on_dfl(io_cgrp_subsys) &&
                (bdi->capabilities & BDI_CAP_WRITEBACK) &&
                (inode->i_sb->s_iflags & SB_I_CGROUPWB);
}

/**
 * wb_find_current - find wb for %current on a bdi
 * @bdi: bdi of interest
 *
 * Find the wb of @bdi which matches both the memcg and blkcg of %current.
 * Must be called under rcu_read_lock() which protects the returned wb.
 * NULL if not found.
 */
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
        struct cgroup_subsys_state *memcg_css = task_css(current,
                                                         memory_cgrp_id);
        struct bdi_writeback *found;

        /* A memcg css without a parent maps to the bdi's embedded wb. */
        if (!memcg_css->parent)
                return &bdi->wb;

        found = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);

        /*
         * The blkcg of %current is the same as the effective blkcg of its
         * memcg, so the relatively expensive cgroup_get_e_css() can be
         * avoided here.
         */
        if (unlikely(!found ||
                     found->blkcg_css != task_css(current, io_cgrp_id)))
                return NULL;
        return found;
}

/**
 * wb_get_create_current - get or create wb for %current on a bdi
 * @bdi: bdi of interest
 * @gfp: allocation mask
 *
 * Equivalent to wb_get_create() on %current's memcg.  This function is
 * called from a relatively hot path and optimizes the common cases using
 * wb_find_current().
 */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
        struct cgroup_subsys_state *memcg_css;
        struct bdi_writeback *wb;

        /* Fast path: look up an existing wb and try to take a reference. */
        rcu_read_lock();
        wb = wb_find_current(bdi);
        if (wb && unlikely(!wb_tryget(wb)))
                wb = NULL;
        rcu_read_unlock();

        if (likely(wb))
                return wb;

        /* Slow path: go through wb_get_create() with %current's memcg css. */
        memcg_css = task_get_css(current, memory_cgrp_id);
        wb = wb_get_create(bdi, memcg_css, gfp);
        css_put(memcg_css);
        return wb;
}

/**
 * inode_to_wb - determine the wb of an inode
 * @inode: inode of interest
 *
 * Context: the caller must hold at least one of @inode->i_lock, the
 * i_pages lock, or the associated wb's list_lock.  On CONFIG_LOCKDEP
 * builds with debug_locks set, a one-time warning fires when none of the
 * three is held.
 *
 * Return: the wb @inode is currently associated with (@inode->i_wb).
 */
static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
        WARN_ON_ONCE(debug_locks &&
                     !(lockdep_is_held(&inode->i_lock) ||
                       lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) ||
                       lockdep_is_held(&inode->i_wb->list_lock)));
#endif
        return inode->i_wb;
}

/*
 * Select the wb for a writeback pass over @inode: the wb recorded in @wbc
 * when there is one, otherwise the wb embedded in @inode's bdi.
 */
static inline struct bdi_writeback *inode_to_wb_wbc(
                                struct inode *inode,
                                struct writeback_control *wbc)
{
        /*
         * If wbc does not have inode attached, it means cgroup writeback was
         * disabled when wbc started. Just use the default wb in that case.
         */
        if (!wbc->wb)
                return &inode_to_bdi(inode)->wb;

        return wbc->wb;
}

/**
 * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
 * @inode: target inode
 * @cookie: output param, to be passed to the end function
 *
 * The caller wants to access the wb associated with @inode but isn't
 * holding inode->i_lock, the i_pages lock or wb->list_lock.  This
 * function determines the wb associated with @inode and ensures that the
 * association doesn't change until the transaction is finished with
 * unlocked_inode_to_wb_end().
 *
 * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and
 * can't sleep during the transaction.  IRQs may or may not be disabled on
 * return.
 *
 * Return: the wb read from @inode->i_wb.
 */
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
        /* Held for the whole transaction; dropped in unlocked_inode_to_wb_end(). */
        rcu_read_lock();

        /*
         * Paired with store_release in inode_switch_wbs_work_fn() and
         * ensures that we see the new wb if we see cleared I_WB_SWITCH.
         */
        cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;

        /*
         * I_WB_SWITCH was set: take the i_pages lock, saving the IRQ state
         * in cookie->flags so that the end function can restore it.
         */
        if (unlikely(cookie->locked))
                xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags);

        /*
         * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages
         * lock.  inode_to_wb() will bark.  Deref directly.
         */
        return inode->i_wb;
}

/**
 * unlocked_inode_to_wb_end - end inode wb access transaction
 * @inode: target inode
 * @cookie: @cookie from unlocked_inode_to_wb_begin()
 *
 * Undoes unlocked_inode_to_wb_begin() in reverse order: releases the
 * i_pages lock if @cookie records that it was taken, restoring the IRQ
 * state saved in @cookie->flags, and then leaves the RCU read-side
 * section.
 */
static inline void unlocked_inode_to_wb_end(struct inode *inode,
                                            struct wb_lock_cookie *cookie)
{
        /* Only set when the begin function saw I_WB_SWITCH and took the lock. */
        if (unlikely(cookie->locked))
                xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags);

        rcu_read_unlock();
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/* Stub for builds without CONFIG_CGROUP_WRITEBACK: always reports false. */
static inline bool inode_cgwb_enabled(struct inode *inode)
{
        return false;
}

/* Stub: every lookup resolves to the wb embedded in @bdi. */
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
        return &bdi->wb;
}

/*
 * Stub: returns the wb embedded in @bdi.  @gfp is unused; nothing is
 * allocated and no reference is taken here.
 */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
        return &bdi->wb;
}

/* Stub: an inode's wb is the wb embedded in the bdi of that inode. */
static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
        return &inode_to_bdi(inode)->wb;
}

/* Stub: @wbc is ignored; defers to inode_to_wb(). */
static inline struct bdi_writeback *inode_to_wb_wbc(
                                struct inode *inode,
                                struct writeback_control *wbc)
{
        return inode_to_wb(inode);
}


/*
 * Stub: takes no lock and leaves @cookie untouched; simply returns
 * inode_to_wb(@inode).
 */
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
        return inode_to_wb(inode);
}

/* Stub: nothing to undo, see the stub unlocked_inode_to_wb_begin(). */
static inline void unlocked_inode_to_wb_end(struct inode *inode,
                                            struct wb_lock_cookie *cookie)
{
}

/* Stub: no-op when cgroup writeback is compiled out. */
static inline void wb_memcg_offline(struct mem_cgroup *memcg)
{
}

/* Stub: no-op when cgroup writeback is compiled out. */
static inline void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

const char *bdi_dev_name(struct backing_dev_info *bdi);

#endif        /* _LINUX_BACKING_DEV_H */





















































































































































































































































































    1 




















    1 



















































    1 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/kernel/ptrace.c
 *
 * (C) Copyright 1999 Linus Torvalds
 *
 * Common interfaces for "ptrace()" which we do not want
 * to continually duplicate across every architecture.
 */

#include <linux/capability.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/signal.h>
#include <linux/uio.h>
#include <linux/audit.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/regset.h>
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/minmax.h>
#include <linux/syscall_user_dispatch.h>

#include <asm/syscall.h>        /* for syscall_get_* */

/*
 * Read or write another process' address space on behalf of its ptracer.
 *
 * The source/target buffer @buf must live in kernel space.  The page table
 * is never walked directly; the transfer goes through access_remote_vm().
 *
 * Returns the value of access_remote_vm(), or 0 when @tsk has no mm, is
 * not being traced by %current, or its mm is not SUID_DUMP_USER and the
 * tracer lacks the capability checked by ptracer_capable().
 */
int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
                     void *buf, int len, unsigned int gup_flags)
{
        struct mm_struct *mm = get_task_mm(tsk);
        int ret = 0;

        if (!mm)
                return 0;

        /* Only the task currently tracing @tsk may access its memory. */
        if (!tsk->ptrace || current != tsk->parent)
                goto out_put;

        /* A non-dumpable mm additionally requires ptracer capability. */
        if (get_dumpable(mm) != SUID_DUMP_USER &&
            !ptracer_capable(tsk, mm->user_ns))
                goto out_put;

        ret = access_remote_vm(mm, addr, buf, len, gup_flags);

out_put:
        mmput(mm);
        return ret;
}


/*
 * Link @child to @new_parent as its tracer: queue @child on
 * @new_parent->ptraced, point child->parent at @new_parent and store
 * get_cred(@ptracer_cred) in child->ptracer_cred.  The matching put_cred()
 * is done by __ptrace_unlink().
 *
 * BUGs if @child is already queued on a ptrace list.
 */
void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
                   const struct cred *ptracer_cred)
{
        BUG_ON(!list_empty(&child->ptrace_entry));
        list_add(&child->ptrace_entry, &new_parent->ptraced);
        child->parent = new_parent;
        child->ptracer_cred = get_cred(ptracer_cred);
}

/*
 * Start tracing @child: @new_parent becomes its ptrace parent and @child
 * is moved onto @new_parent's ptrace list, with %current's credentials
 * recorded as the tracer credentials.
 *
 * Must be called with the tasklist lock write-held.
 */
static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
        const struct cred *tracer_cred = current_cred();

        __ptrace_link(child, new_parent, tracer_cred);
}

/**
 * __ptrace_unlink - unlink ptracee and restore its execution state
 * @child: ptracee to be unlinked
 *
 * Remove @child from the ptrace list, move it back to the original parent,
 * and restore the execution state so that it conforms to the group stop
 * state.
 *
 * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer
 * exiting.  For PTRACE_DETACH, unless the ptracee has been killed between
 * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED.
 * If the ptracer is exiting, the ptracee can be in any state.
 *
 * After detach, the ptracee should be in a state which conforms to the
 * group stop.  If the group is stopped or in the process of stopping, the
 * ptracee should be put into TASK_STOPPED; otherwise, it should be woken
 * up from TASK_TRACED.
 *
 * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED,
 * it goes through TRACED -> RUNNING -> STOPPED transition which is similar
 * to but in the opposite direction of what happens while attaching to a
 * stopped task.  However, in this direction, the intermediate RUNNING
 * state is not hidden even from the current ptracer and if it immediately
 * re-attaches and performs a WNOHANG wait(2), it may fail.
 *
 * CONTEXT:
 * write_lock_irq(tasklist_lock)
 */
void __ptrace_unlink(struct task_struct *child)
{
        const struct cred *old_cred;
        BUG_ON(!child->ptrace);

        /* Clear the SYSCALL_TRACE (and, where built in, SYSCALL_EMU) work bits. */
        clear_task_syscall_work(child, SYSCALL_TRACE);
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
        clear_task_syscall_work(child, SYSCALL_EMU);
#endif

        /* Hand @child back to its real parent and take it off the ptrace list. */
        child->parent = child->real_parent;
        list_del_init(&child->ptrace_entry);
        /* Release the credentials reference stored by __ptrace_link(). */
        old_cred = child->ptracer_cred;
        child->ptracer_cred = NULL;
        put_cred(old_cred);

        /* ->ptrace and the jobctl bits below are updated under siglock. */
        spin_lock(&child->sighand->siglock);
        child->ptrace = 0;
        /*
         * Clear all pending traps and TRAPPING.  TRAPPING should be
         * cleared regardless of JOBCTL_STOP_PENDING.  Do it explicitly.
         */
        task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK);
        task_clear_jobctl_trapping(child);

        /*
         * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and
         * @child isn't dead.
         */
        if (!(child->flags & PF_EXITING) &&
            (child->signal->flags & SIGNAL_STOP_STOPPED ||
             child->signal->group_stop_count))
                child->jobctl |= JOBCTL_STOP_PENDING;

        /*
         * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
         * @child in the butt.  Note that @resume should be used iff @child
         * is in TASK_TRACED; otherwise, we might unduly disrupt
         * TASK_KILLABLE sleeps.
         */
        if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
                ptrace_signal_wake_up(child, true);

        spin_unlock(&child->sighand->siglock);
}

/*
 * Detect a tracee that is stopped in a PTRACE_EVENT_EXEC trap whose
 * recorded ptrace_message no longer matches its pid.
 */
static bool looks_like_a_spurious_pid(struct task_struct *task)
{
        const int exec_trap = (PTRACE_EVENT_EXEC << 8) | SIGTRAP;

        /*
         * The tracee changed its pid but the PTRACE_EVENT_EXEC event
         * was not wait()'ed, most probably debugger targets the old
         * leader which was destroyed in de_thread().
         */
        return task->exit_code == exec_trap &&
               task_pid_vnr(task) != task->ptrace_message;
}

/*
 * Make sure nothing, not even SIGKILL, can wake @task up.
 *
 * A tracee is put in this state for the duration of a ptrace operation so
 * that the operation cannot be interrupted.
 *
 * Returns true if JOBCTL_PTRACE_FROZEN was set on @task, false if @task is
 * listening, not traced, looks like a spurious pid, or has a fatal signal
 * pending.
 */
static bool ptrace_freeze_traced(struct task_struct *task)
{
        bool frozen = false;

        /* Lockless, nobody but us can set this flag */
        if (task->jobctl & JOBCTL_LISTENING)
                return false;

        spin_lock_irq(&task->sighand->siglock);
        if (!task_is_traced(task) || looks_like_a_spurious_pid(task) ||
            __fatal_signal_pending(task))
                goto unlock;

        task->jobctl |= JOBCTL_PTRACE_FROZEN;
        frozen = true;
unlock:
        spin_unlock_irq(&task->sighand->siglock);

        return frozen;
}

/*
 * Undo ptrace_freeze_traced(): drop JOBCTL_PTRACE_FROZEN and, if a fatal
 * signal arrived in the meantime, let @task leave __TASK_TRACED.
 */
static void ptrace_unfreeze_traced(struct task_struct *task)
{
        unsigned long irqflags;

        /*
         * The child may be awake and may have cleared
         * JOBCTL_PTRACE_FROZEN (see ptrace_resume).  The child will
         * not set JOBCTL_PTRACE_FROZEN or enter __TASK_TRACED anew.
         */
        if (!lock_task_sighand(task, &irqflags))
                return;

        task->jobctl &= ~JOBCTL_PTRACE_FROZEN;
        if (__fatal_signal_pending(task)) {
                task->jobctl &= ~JOBCTL_TRACED;
                wake_up_state(task, __TASK_TRACED);
        }

        unlock_task_sighand(task, &irqflags);
}

/**
 * ptrace_check_attach - check whether ptracee is ready for ptrace operation
 * @child: ptracee to check for
 * @ignore_state: don't check whether @child is currently %TASK_TRACED
 *
 * Check whether @child is being ptraced by %current and ready for further
 * ptrace operations.  If @ignore_state is %false, @child also should be in
 * %TASK_TRACED state and on return the child is guaranteed to be traced
 * and not executing.  If @ignore_state is %true, @child can be in any
 * state.
 *
 * CONTEXT:
 * Grabs and releases tasklist_lock and @child->sighand->siglock.
 *
 * RETURNS:
 * 0 on success, -ESRCH if %child is not ready.
 */
static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
{
        bool ready = false;

        /*
         * Both checks are done under the read lock to close a possible
         * race where someone else was tracing our child and detached
         * between them.  Once this locked check passes, @child is known
         * to be our tracee, and only we can change that, so it cannot
         * change right afterwards.
         */
        read_lock(&tasklist_lock);
        if (child->ptrace && child->parent == current) {
                /*
                 * child->sighand can't be NULL, release_task()
                 * does ptrace_unlink() before __exit_signal().
                 */
                ready = ignore_state || ptrace_freeze_traced(child);
        }
        read_unlock(&tasklist_lock);

        if (!ready)
                return -ESRCH;

        /* The caller does not care about the tracee's state. */
        if (ignore_state)
                return 0;

        /* A frozen tracee is expected to go inactive; warn once if not. */
        if (WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN)))
                return -ESRCH;

        return 0;
}

/*
 * Check for CAP_SYS_PTRACE in @ns.  PTRACE_MODE_NOAUDIT in @mode selects
 * ns_capable_noaudit() instead of ns_capable().
 */
static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
{
        const bool noaudit = mode & PTRACE_MODE_NOAUDIT;

        return noaudit ? ns_capable_noaudit(ns, CAP_SYS_PTRACE)
                       : ns_capable(ns, CAP_SYS_PTRACE);
}

/*
 * Returns 0 on success, -errno on denial.
 *
 * -EPERM is returned when @mode carries both or neither of the
 * PTRACE_MODE_*CREDS flags, when the caller's ids do not all match @task's
 * and the caller lacks CAP_SYS_PTRACE in the target's user namespace, or
 * when @task's mm is not SUID_DUMP_USER and the caller lacks
 * CAP_SYS_PTRACE in the mm's user namespace.  Otherwise the result of
 * security_ptrace_access_check() is returned.
 */
static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
        const struct cred *cred = current_cred(), *tcred;
        struct mm_struct *mm;
        kuid_t caller_uid;
        kgid_t caller_gid;

        /* Exactly one of FSCREDS/REALCREDS must be set; anything else is a caller bug. */
        if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) {
                WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n");
                return -EPERM;
        }

        /* May we inspect the given task?
         * This check is used both for attaching with ptrace
         * and for allowing access to sensitive information in /proc.
         *
         * ptrace_attach denies several cases that /proc allows
         * because setting up the necessary parent/child relationship
         * or halting the specified task is impossible.
         */

        /* Don't let security modules deny introspection */
        if (same_thread_group(task, current))
                return 0;
        /* @task's credentials (tcred) are only dereferenced inside this RCU section. */
        rcu_read_lock();
        if (mode & PTRACE_MODE_FSCREDS) {
                caller_uid = cred->fsuid;
                caller_gid = cred->fsgid;
        } else {
                /*
                 * Using the euid would make more sense here, but something
                 * in userland might rely on the old behavior, and this
                 * shouldn't be a security problem since
                 * PTRACE_MODE_REALCREDS implies that the caller explicitly
                 * used a syscall that requests access to another process
                 * (and not a filesystem syscall to procfs).
                 */
                caller_uid = cred->uid;
                caller_gid = cred->gid;
        }
        tcred = __task_cred(task);
        /* All six target ids matching the caller's passes without a capability. */
        if (uid_eq(caller_uid, tcred->euid) &&
            uid_eq(caller_uid, tcred->suid) &&
            uid_eq(caller_uid, tcred->uid)  &&
            gid_eq(caller_gid, tcred->egid) &&
            gid_eq(caller_gid, tcred->sgid) &&
            gid_eq(caller_gid, tcred->gid))
                goto ok;
        /* Otherwise CAP_SYS_PTRACE in the target's user namespace is required. */
        if (ptrace_has_cap(tcred->user_ns, mode))
                goto ok;
        rcu_read_unlock();
        return -EPERM;
ok:
        rcu_read_unlock();
        /*
         * If a task drops privileges and becomes nondumpable (through a syscall
         * like setresuid()) while we are trying to access it, we must ensure
         * that the dumpability is read after the credentials; otherwise,
         * we may be able to attach to a task that we shouldn't be able to
         * attach to (as if the task had dropped privileges without becoming
         * nondumpable).
         * Pairs with a write barrier in commit_creds().
         */
        smp_rmb();
        mm = task->mm;
        /* A task without an mm skips the dumpability check entirely. */
        if (mm &&
            ((get_dumpable(mm) != SUID_DUMP_USER) &&
             !ptrace_has_cap(mm->user_ns, mode)))
            return -EPERM;

        /* Last word goes to the security module hook. */
        return security_ptrace_access_check(task, mode);
}

/*
 * Boolean wrapper around __ptrace_may_access(): runs the check with
 * @task's task_lock held and returns true when it reported no error.
 */
bool ptrace_may_access(struct task_struct *task, unsigned int mode)
{
        int denied;

        task_lock(task);
        denied = __ptrace_may_access(task, mode);
        task_unlock(task);

        return denied == 0;
}

/*
 * Validate a PTRACE_O_* option mask.
 *
 * Returns 0 if @data is acceptable, -EINVAL for bits outside
 * PTRACE_O_MASK or for PTRACE_O_SUSPEND_SECCOMP on a kernel built without
 * CONFIG_CHECKPOINT_RESTORE or CONFIG_SECCOMP, and -EPERM when the caller
 * may not use PTRACE_O_SUSPEND_SECCOMP.
 */
static int check_ptrace_options(unsigned long data)
{
        if (data & ~(unsigned long)PTRACE_O_MASK)
                return -EINVAL;

        /* Everything below only concerns PTRACE_O_SUSPEND_SECCOMP. */
        if (likely(!(data & PTRACE_O_SUSPEND_SECCOMP)))
                return 0;

        if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
            !IS_ENABLED(CONFIG_SECCOMP))
                return -EINVAL;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        /* The caller must not itself be under seccomp or have it suspended. */
        if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
            current->ptrace & PT_SUSPEND_SECCOMP)
                return -EPERM;

        return 0;
}

/*
 * Put a freshly attached @task on the path to its first ptrace stop.
 * Runs entirely under @task's siglock, which the guard releases on every
 * return path.
 */
static inline void ptrace_set_stopped(struct task_struct *task, bool seize)
{
        guard(spinlock)(&task->sighand->siglock);

        /* SEIZE doesn't trap tracee on attach */
        if (!seize)
                send_signal_locked(SIGSTOP, SEND_SIG_PRIV, task, PIDTYPE_PID);

        /*
         * If the task is already STOPPED, set JOBCTL_TRAP_STOP and
         * TRAPPING, and kick it so that it transits to TRACED.  TRAPPING
         * will be cleared if the child completes the transition or any
         * event which clears the group stop states happens.  The attach
         * path waits for that transition to complete before returning.
         *
         * This hides STOPPED -> RUNNING -> TRACED transition from the
         * attaching thread but a different thread in the same group can
         * still observe the transient RUNNING state.  IOW, if another
         * thread's WNOHANG wait(2) on the stopped tracee races against
         * ATTACH, the wait(2) may fail due to the transient RUNNING.
         *
         * The following task_is_stopped() test is safe as both transitions
         * in and out of STOPPED are protected by siglock.
         */
        if (!task_is_stopped(task))
                return;
        if (!task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
                return;

        task->jobctl &= ~JOBCTL_STOPPED;
        signal_wake_up_state(task, __TASK_STOPPED);
}

/*
 * Attach %current to @task as its tracer.
 *
 * @request: PTRACE_SEIZE selects seize semantics; any other value is
 *           handled as a plain attach.
 * @addr:    must be 0 for PTRACE_SEIZE (-EIO otherwise); not looked at for
 *           a plain attach.
 * @flags:   PTRACE_O_* options for PTRACE_SEIZE; replaced by PT_PTRACED
 *           for a plain attach.
 *
 * Returns 0 on success.  Failures are -EIO or the check_ptrace_options()
 * error for bad PTRACE_SEIZE arguments, -EPERM for kernel threads, threads
 * of the caller's own thread group, exited or already traced tasks,
 * -ERESTARTNOINTR when the cred_guard_mutex guard is not acquired, or the
 * error from __ptrace_may_access().
 */
static int ptrace_attach(struct task_struct *task, long request,
                         unsigned long addr,
                         unsigned long flags)
{
        bool seize = (request == PTRACE_SEIZE);
        int retval;

        if (seize) {
                if (addr != 0)
                        return -EIO;
                /*
                 * This duplicates the check in check_ptrace_options() because
                 * ptrace_attach() and ptrace_setoptions() have historically
                 * used different error codes for unknown ptrace options.
                 */
                if (flags & ~(unsigned long)PTRACE_O_MASK)
                        return -EIO;

                retval = check_ptrace_options(flags);
                if (retval)
                        return retval;
                /* From here on @flags holds the PT_* value stored in task->ptrace. */
                flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
        } else {
                flags = PT_PTRACED;
        }

        audit_ptrace(task);

        /* Kernel threads and the caller's own thread group are refused. */
        if (unlikely(task->flags & PF_KTHREAD))
                return -EPERM;
        if (same_thread_group(task, current))
                return -EPERM;

        /*
         * Protect exec's credential calculations against our interference;
         * SUID, SGID and LSM creds get determined differently
         * under ptrace.
         */
        scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR,
                           &task->signal->cred_guard_mutex) {

                /* Permission check, done with @task's task_lock held. */
                scoped_guard (task_lock, task) {
                        retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS);
                        if (retval)
                                return retval;
                }

                /* Linking the tracee requires tasklist_lock held for writing. */
                scoped_guard (write_lock_irq, &tasklist_lock) {
                        if (unlikely(task->exit_state))
                                return -EPERM;
                        if (task->ptrace)
                                return -EPERM;

                        task->ptrace = flags;
                        ptrace_link(task, current);
                        ptrace_set_stopped(task, seize);
                }
        }

        /*
         * We do not bother to change retval or clear JOBCTL_TRAPPING
         * if wait_on_bit() was interrupted by SIGKILL. The tracer will
         * not return to user-mode, it will exit and clear this bit in
         * __ptrace_unlink() if it wasn't already cleared by the tracee;
         * and until then nobody can ptrace this task.
         */
        wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE);
        proc_ptrace_connector(task, PTRACE_ATTACH);

        return 0;
}

/**
 * ptrace_traceme  --  helper for PTRACE_TRACEME
 *
 * Performs the checks and, when they pass, sets PT_PTRACED on %current and
 * links it to its real parent.  Should be used by all ptrace
 * implementations for PTRACE_TRACEME.
 *
 * Returns -EPERM if %current is already traced, otherwise the result of
 * security_ptrace_traceme().
 */
static int ptrace_traceme(void)
{
        int ret = -EPERM;

        write_lock_irq(&tasklist_lock);

        /* Are we already being traced? */
        if (current->ptrace)
                goto unlock;

        ret = security_ptrace_traceme(current->parent);

        /*
         * Check PF_EXITING to ensure ->real_parent has not passed
         * exit_ptrace(). Otherwise we don't report the error but
         * pretend ->real_parent untraces us right after return.
         */
        if (ret || (current->real_parent->flags & PF_EXITING))
                goto unlock;

        current->ptrace = PT_PTRACED;
        ptrace_link(current, current->real_parent);

unlock:
        write_unlock_irq(&tasklist_lock);

        return ret;
}

/*
 * Called with irqs disabled.  Returns true if children should reap
 * themselves, i.e. SIGCHLD is set to SIG_IGN or carries SA_NOCLDWAIT in
 * @sigh.
 */
static int ignoring_children(struct sighand_struct *sigh)
{
        int ignored;

        spin_lock(&sigh->siglock);
        if (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
                ignored = 1;
        else
                ignored = !!(sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
        spin_unlock(&sigh->siglock);

        return ignored;
}

/*
 * Called with tasklist_lock held for writing.
 * Unlink a traced task, and clean it up if it was a traced zombie.
 * Return true if it needs to be reaped with release_task().
 * (We can't call release_task() here because we already hold tasklist_lock.)
 *
 * If it's a zombie, our attachedness prevented normal parent notification
 * or self-reaping.  Do notification now if it would have happened earlier.
 * If it should reap itself, return true.
 *
 * If it's our own child, there is no notification to do. But if our normal
 * children self-reap, then this child was prevented by ptrace and we must
 * reap it now, in that case we must also wake up sub-threads sleeping in
 * do_wait().
 */
static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
{
        bool reap;

        __ptrace_unlink(p);

        /* Only a zombie tracee needs any further handling. */
        if (p->exit_state != EXIT_ZOMBIE)
                return false;

        if (!thread_group_leader(p)) {
                /* A zombie that is not the group leader is always reaped. */
                reap = true;
        } else if (!thread_group_empty(p)) {
                /* Leader with other threads still in the group: leave it. */
                reap = false;
        } else if (!same_thread_group(p->real_parent, tracer)) {
                /* Not our own child: send the notification held back so far. */
                reap = do_notify_parent(p, p->exit_signal);
        } else if (ignoring_children(tracer->sighand)) {
                /* Our own child, and our children self-reap: wake waiters. */
                __wake_up_parent(p, tracer);
                reap = true;
        } else {
                reap = false;
        }

        /* Mark it as in the process of being reaped. */
        if (reap)
                p->exit_state = EXIT_DEAD;

        return reap;
}

/*
 * Detach %current from its tracee @child (PTRACE_DETACH).
 *
 * @data is checked with valid_signal() and stored as @child's exit_code
 * before the tracee is unlinked.
 *
 * Returns 0 on success, -EIO if @data fails valid_signal().
 */
static int ptrace_detach(struct task_struct *child, unsigned int data)
{
        if (!valid_signal(data))
                return -EIO;

        /* Architecture-specific hardware disable .. */
        ptrace_disable(child);

        write_lock_irq(&tasklist_lock);
        /*
         * We rely on ptrace_freeze_traced(). It can't be killed and
         * untraced by another thread, it can't be a zombie.
         */
        WARN_ON(!child->ptrace || child->exit_state);
        /*
         * tasklist_lock avoids the race with wait_task_stopped(), see
         * the comment in ptrace_resume().
         */
        child->exit_code = data;
        /* Return value is not used here: a zombie tracee is not expected (see WARN_ON above). */
        __ptrace_detach(current, child);
        write_unlock_irq(&tasklist_lock);

        proc_ptrace_connector(child, PTRACE_DETACH);

        return 0;
}

/*
 * Detach every task @tracer was using ptrace on.  Tracees for which
 * __ptrace_detach() returns true are queued on @dead via their
 * ptrace_entry.
 *
 * Called with tasklist held for writing.
 */
void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
{
        struct task_struct *tracee, *next;

        list_for_each_entry_safe(tracee, next, &tracer->ptraced, ptrace_entry) {
                /* PT_EXITKILL tracees get SIGKILL before being detached. */
                if (unlikely(tracee->ptrace & PT_EXITKILL))
                        send_sig_info(SIGKILL, SEND_SIG_PRIV, tracee);

                if (!__ptrace_detach(tracer, tracee))
                        continue;

                list_add(&tracee->ptrace_entry, dead);
        }
}

/*
 * Copy up to @len bytes from address @src in @tsk's address space to the
 * user buffer @dst, going through a small on-stack bounce buffer.
 *
 * Returns the number of bytes copied, -EIO if nothing at all could be read
 * from the tracee, or -EFAULT if writing to @dst failed.
 */
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
{
        char chunk[128];
        int total = 0;

        while (len > 0) {
                int want = len;
                int got;

                /* Clamp each step to the bounce buffer size. */
                if (want > (int)sizeof(chunk))
                        want = sizeof(chunk);

                got = ptrace_access_vm(tsk, src, chunk, want, FOLL_FORCE);
                if (!got)
                        return total ? total : -EIO;

                if (copy_to_user(dst, chunk, got))
                        return -EFAULT;

                total += got;
                src += got;
                dst += got;
                len -= got;
        }

        return total;
}

/*
 * Copy up to @len bytes from the user buffer @src to address @dst in
 * @tsk's address space, going through a small on-stack bounce buffer.
 *
 * Returns the number of bytes written, -EIO if nothing at all could be
 * written to the tracee, or -EFAULT if reading from @src failed.
 */
int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len)
{
        char chunk[128];
        int total = 0;

        while (len > 0) {
                int want = len;
                int put;

                /* Clamp each step to the bounce buffer size. */
                if (want > (int)sizeof(chunk))
                        want = sizeof(chunk);

                if (copy_from_user(chunk, src, want))
                        return -EFAULT;

                put = ptrace_access_vm(tsk, dst, chunk, want,
                                       FOLL_FORCE | FOLL_WRITE);
                if (!put)
                        return total ? total : -EIO;

                total += put;
                src += put;
                dst += put;
                len -= put;
        }

        return total;
}

/*
 * Replace the PTRACE_O_* option bits stored in child->ptrace with @data.
 *
 * Returns 0 on success, or the error reported by check_ptrace_options().
 */
static int ptrace_setoptions(struct task_struct *child, unsigned long data)
{
        unsigned int updated;
        int err = check_ptrace_options(data);

        if (err)
                return err;

        /*
         * Assemble the new flag word in a local and publish it with a
         * single store, so ->ptrace never passes through an intermediate
         * state in which all options are cleared.
         */
        updated = child->ptrace & ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
        updated |= data << PT_OPT_FLAG_SHIFT;
        child->ptrace = updated;

        return 0;
}

/*
 * Copy the child's ->last_siginfo into @info under the sighand lock.
 *
 * Returns 0 on success, -ESRCH if the sighand could not be locked, or
 * -EINVAL if the child has no ->last_siginfo.
 */
static int ptrace_getsiginfo(struct task_struct *child, kernel_siginfo_t *info)
{
        unsigned long flags;
        int error;

        if (!lock_task_sighand(child, &flags))
                return -ESRCH;

        if (likely(child->last_siginfo != NULL)) {
                copy_siginfo(info, child->last_siginfo);
                error = 0;
        } else {
                error = -EINVAL;
        }

        unlock_task_sighand(child, &flags);
        return error;
}

/*
 * Overwrite the child's ->last_siginfo with @info under the sighand lock.
 *
 * Returns 0 on success, -ESRCH if the sighand could not be locked, or
 * -EINVAL if the child has no ->last_siginfo to overwrite.
 */
static int ptrace_setsiginfo(struct task_struct *child, const kernel_siginfo_t *info)
{
        unsigned long flags;
        int error;

        if (!lock_task_sighand(child, &flags))
                return -ESRCH;

        if (likely(child->last_siginfo != NULL)) {
                copy_siginfo(child->last_siginfo, info);
                error = 0;
        } else {
                error = -EINVAL;
        }

        unlock_task_sighand(child, &flags);
        return error;
}

/*
 * Copy queued signals of @child to user space; the pending list is only
 * walked and copied from, not modified.
 *
 * @addr: user pointer to a struct ptrace_peeksiginfo_args (off, flags, nr).
 * @data: user address of the output array of siginfo entries.
 *
 * The shared pending queue is examined when PTRACE_PEEKSIGINFO_SHARED is
 * set in arg.flags, otherwise the per-task one.
 *
 * Returns the number of entries copied if at least one was copied.
 * Otherwise returns 0 when nothing was found, -EFAULT when a user copy
 * failed, or -EINVAL for unknown flags or a negative arg.nr.
 */
static int ptrace_peek_siginfo(struct task_struct *child,
                                unsigned long addr,
                                unsigned long data)
{
        struct ptrace_peeksiginfo_args arg;
        struct sigpending *pending;
        struct sigqueue *q;
        int ret, i;

        ret = copy_from_user(&arg, (void __user *) addr,
                                sizeof(struct ptrace_peeksiginfo_args));
        if (ret)
                return -EFAULT;

        if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED)
                return -EINVAL; /* unknown flags */

        if (arg.nr < 0)
                return -EINVAL;

        /* Ensure arg.off fits in an unsigned long */
        if (arg.off > ULONG_MAX)
                return 0;

        if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
                pending = &child->signal->shared_pending;
        else
                pending = &child->pending;

        /* i is advanced inside the body, only after a successful copy-out. */
        for (i = 0; i < arg.nr; ) {
                kernel_siginfo_t info;
                unsigned long off = arg.off + i;
                bool found = false;

                /*
                 * siglock is dropped between entries, so every iteration
                 * walks the list again from its head to position off.
                 */
                spin_lock_irq(&child->sighand->siglock);
                list_for_each_entry(q, &pending->list, list) {
                        if (!off--) {
                                found = true;
                                copy_siginfo(&info, &q->info);
                                break;
                        }
                }
                spin_unlock_irq(&child->sighand->siglock);

                if (!found) /* beyond the end of the list */
                        break;

#ifdef CONFIG_COMPAT
                if (unlikely(in_compat_syscall())) {
                        compat_siginfo_t __user *uinfo = compat_ptr(data);

                        if (copy_siginfo_to_user32(uinfo, &info)) {
                                ret = -EFAULT;
                                break;
                        }

                } else
#endif
                {
                        siginfo_t __user *uinfo = (siginfo_t __user *) data;

                        if (copy_siginfo_to_user(uinfo, &info)) {
                                ret = -EFAULT;
                                break;
                        }
                }

                /* The output cursor advances by sizeof(siginfo_t) on both paths. */
                data += sizeof(siginfo_t);
                i++;

                /* Stop early, keeping what was copied, if the caller has a signal pending. */
                if (signal_pending(current))
                        break;

                cond_resched();
        }

        /* A partial result takes precedence over a later error. */
        if (i > 0)
                return i;

        return ret;
}

#ifdef CONFIG_RSEQ
/*
 * Report @task's rseq registration (ABI pointer, length and signature) to
 * the user buffer @data, copying at most @size bytes.
 *
 * Returns sizeof(struct ptrace_rseq_configuration) on success, regardless
 * of how many bytes were actually copied, or -EFAULT if the copy failed.
 */
static long ptrace_get_rseq_configuration(struct task_struct *task,
                                          unsigned long size, void __user *data)
{
        struct ptrace_rseq_configuration cfg = {
                .rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
                .rseq_abi_size = task->rseq_len,
                .signature = task->rseq_sig,
                .flags = 0,
        };
        unsigned long nbytes = sizeof(cfg);

        /* Never copy more than the caller said its buffer can hold. */
        if (size < nbytes)
                nbytes = size;

        if (copy_to_user(data, &cfg, nbytes))
                return -EFAULT;

        return sizeof(cfg);
}
#endif

/*
 * Request classifiers used by ptrace_resume().  When an optional request
 * constant is not defined, the matching predicate is the constant 0, so
 * callers can test it without their own #ifdef.
 */
#define is_singlestep(request)                ((request) == PTRACE_SINGLESTEP)

#ifdef PTRACE_SINGLEBLOCK
#define is_singleblock(request)                ((request) == PTRACE_SINGLEBLOCK)
#else
#define is_singleblock(request)                0
#endif

/* Note: keyed on PTRACE_SYSEMU, but tests for PTRACE_SYSEMU_SINGLESTEP. */
#ifdef PTRACE_SYSEMU
#define is_sysemu_singlestep(request)        ((request) == PTRACE_SYSEMU_SINGLESTEP)
#else
#define is_sysemu_singlestep(request)        0
#endif

/*
 * Resume @child for one of the resuming requests (ptrace_request() routes
 * PTRACE_CONT, PTRACE_SYSCALL, the single-step/single-block requests and
 * the SYSEMU variants here).
 *
 * @request: selects syscall tracing, syscall emulation and stepping mode.
 * @data:    value stored in child->exit_code; must pass valid_signal().
 *
 * Returns 0 on success, or -EIO if @data is not a valid signal or the
 * requested stepping mode is not available on this architecture.  Note
 * that on the latter error the syscall-work flags have already been
 * updated.
 */
static int ptrace_resume(struct task_struct *child, long request,
                         unsigned long data)
{
        if (!valid_signal(data))
                return -EIO;

        /* SYSCALL_TRACE is set only for PTRACE_SYSCALL, cleared otherwise. */
        if (request == PTRACE_SYSCALL)
                set_task_syscall_work(child, SYSCALL_TRACE);
        else
                clear_task_syscall_work(child, SYSCALL_TRACE);

#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
        /* SYSCALL_EMU is set only for the two SYSEMU requests. */
        if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
                set_task_syscall_work(child, SYSCALL_EMU);
        else
                clear_task_syscall_work(child, SYSCALL_EMU);
#endif

        /* Pick block-step, single-step, or no stepping for this resume. */
        if (is_singleblock(request)) {
                if (unlikely(!arch_has_block_step()))
                        return -EIO;
                user_enable_block_step(child);
        } else if (is_singlestep(request) || is_sysemu_singlestep(request)) {
                if (unlikely(!arch_has_single_step()))
                        return -EIO;
                user_enable_single_step(child);
        } else {
                user_disable_single_step(child);
        }

        /*
         * Change ->exit_code and ->state under siglock to avoid the race
         * with wait_task_stopped() in between; a non-zero ->exit_code will
         * wrongly look like another report from tracee.
         *
         * Note that we need siglock even if ->exit_code == data and/or this
         * status was not reported yet, the new status must not be cleared by
         * wait_task_stopped() after resume.
         */
        spin_lock_irq(&child->sighand->siglock);
        child->exit_code = data;
        child->jobctl &= ~JOBCTL_TRACED;
        wake_up_state(child, __TASK_TRACED);
        spin_unlock_irq(&child->sighand->siglock);

        return 0;
}

#ifdef CONFIG_HAVE_ARCH_TRACEHOOK

/*
 * Look up the regset in @view whose core_note_type equals @type.
 * Returns a pointer to the first match, or NULL if there is none.
 */
static const struct user_regset *
find_regset(const struct user_regset_view *view, unsigned int type)
{
        int idx = 0;

        while (idx < view->n) {
                const struct user_regset *candidate = &view->regsets[idx];

                if (candidate->core_note_type == type)
                        return candidate;
                idx++;
        }

        return NULL;
}

/*
 * Transfer one regset of @task to or from the user buffer described by
 * @kiov.
 *
 * @req:  PTRACE_GETREGSET copies to user space; any other value copies
 *        from user space.
 * @type: note type used to select the regset in the task's view.
 * @kiov: buffer descriptor; ->iov_len is clamped to the regset's total
 *        size and so tells the caller how much was transferred.
 *
 * Returns -EINVAL if no regset matches @type or ->iov_len is not a
 * multiple of the regset element size, otherwise the result of the copy
 * helper.
 */
static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
                         struct iovec *kiov)
{
        const struct user_regset_view *view = task_user_regset_view(task);
        const struct user_regset *regset = find_regset(view, type);
        int index;

        if (!regset)
                return -EINVAL;
        if ((kiov->iov_len % regset->size) != 0)
                return -EINVAL;

        index = regset - view->regsets;
        kiov->iov_len = min(kiov->iov_len,
                            (__kernel_size_t) (regset->n * regset->size));

        if (req != PTRACE_GETREGSET)
                return copy_regset_from_user(task, view, index, 0,
                                             kiov->iov_len, kiov->iov_base);

        return copy_regset_to_user(task, view, index, 0,
                                   kiov->iov_len, kiov->iov_base);
}

/*
 * This is declared in linux/regset.h and defined in machine-dependent
 * code.  We put the export here, near the primary machine-neutral use,
 * to ensure no machine forgets it.
 */
EXPORT_SYMBOL_GPL(task_user_regset_view);

/*
 * Fill @info for a syscall-entry stop: op, syscall number and arguments.
 * Returns the number of bytes of @info that are valid, i.e. the offset of
 * the end of entry.args.
 */
static unsigned long
ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
                              struct ptrace_syscall_info *info)
{
        unsigned long raw[ARRAY_SIZE(info->entry.args)];
        int idx = 0;

        info->op = PTRACE_SYSCALL_INFO_ENTRY;
        info->entry.nr = syscall_get_nr(child, regs);

        /* Fetch into an unsigned long scratch array, then copy slot by slot. */
        syscall_get_arguments(child, regs, raw);
        while (idx < ARRAY_SIZE(raw)) {
                info->entry.args[idx] = raw[idx];
                idx++;
        }

        /* args is the last field in struct ptrace_syscall_info.entry */
        return offsetofend(struct ptrace_syscall_info, entry.args);
}

/*
 * Fill @info for a seccomp stop: the entry fields plus seccomp.ret_data,
 * taken from child->ptrace_message.  Returns the number of bytes of @info
 * that are valid, i.e. the offset of the end of seccomp.ret_data.
 */
static unsigned long
ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
                                struct ptrace_syscall_info *info)
{
        /*
         * As struct ptrace_syscall_info.entry is currently a subset
         * of struct ptrace_syscall_info.seccomp, it makes sense to
         * initialize that subset using ptrace_get_syscall_info_entry().
         * This can be reconsidered in the future if these structures
         * diverge significantly enough.
         */
        ptrace_get_syscall_info_entry(child, regs, info);
        /* The helper set op to ..._ENTRY; override it after the call. */
        info->op = PTRACE_SYSCALL_INFO_SECCOMP;
        info->seccomp.ret_data = child->ptrace_message;

        /* ret_data is the last field in struct ptrace_syscall_info.seccomp */
        return offsetofend(struct ptrace_syscall_info, seccomp.ret_data);
}

/*
 * Fill @info for a syscall-exit stop.  When syscall_get_error() reports a
 * non-zero value, that value becomes exit.rval and exit.is_error is set;
 * otherwise exit.rval carries syscall_get_return_value().  Returns the
 * number of bytes of @info that are valid, i.e. the offset of the end of
 * exit.is_error.
 */
static unsigned long
ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
                             struct ptrace_syscall_info *info)
{
        long err;

        info->op = PTRACE_SYSCALL_INFO_EXIT;

        err = syscall_get_error(child, regs);
        if (err) {
                info->exit.rval = err;
                info->exit.is_error = 1;
        } else {
                info->exit.is_error = 0;
                info->exit.rval = syscall_get_return_value(child, regs);
        }

        /* is_error is the last field in struct ptrace_syscall_info.exit */
        return offsetofend(struct ptrace_syscall_info, exit.is_error);
}

/*
 * Implement PTRACE_GET_SYSCALL_INFO: describe the stop @child is in and
 * copy at most @user_size bytes of that description to @datavp.
 *
 * Returns the number of valid bytes in the description (which may exceed
 * what was copied), or -EFAULT if the copy to user space failed.
 */
static int
ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
                        void __user *datavp)
{
        struct pt_regs *regs = task_pt_regs(child);
        struct ptrace_syscall_info info = {
                .op = PTRACE_SYSCALL_INFO_NONE,
                .arch = syscall_get_arch(child),
                .instruction_pointer = instruction_pointer(regs),
                .stack_pointer = user_stack_pointer(regs),
        };
        /* Without a recognised stop only the common header is valid. */
        unsigned long filled = offsetof(struct ptrace_syscall_info, entry);
        int stop_code;

        /*
         * This does not need lock_task_sighand() to access
         * child->last_siginfo because ptrace_freeze_traced()
         * called earlier by ptrace_check_attach() ensures that
         * the tracee cannot go away and clear its last_siginfo.
         */
        stop_code = child->last_siginfo ? child->last_siginfo->si_code : 0;

        if (stop_code == (SIGTRAP | 0x80)) {
                unsigned long msg = child->ptrace_message;

                if (msg == PTRACE_EVENTMSG_SYSCALL_ENTRY)
                        filled = ptrace_get_syscall_info_entry(child, regs,
                                                               &info);
                else if (msg == PTRACE_EVENTMSG_SYSCALL_EXIT)
                        filled = ptrace_get_syscall_info_exit(child, regs,
                                                              &info);
        } else if (stop_code == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8))) {
                filled = ptrace_get_syscall_info_seccomp(child, regs, &info);
        }

        if (copy_to_user(datavp, &info, min(filled, user_size)))
                return -EFAULT;

        return filled;
}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */

/*
 * Architecture-independent handling of a ptrace request on @child.
 *
 * @request selects the operation; @addr and @data are interpreted per
 * request.  For most requests @data is a user pointer, accessed below
 * through datavp/datalp.
 *
 * Requests that are not handled here return -EIO (the initial value of
 * ret).  So do PTRACE_INTERRUPT and PTRACE_LISTEN when the tracee was not
 * seized or its sighand could not be locked.
 *
 * Returns the result of the selected handler: zero or a positive value on
 * success, a negative errno on failure.
 */
int ptrace_request(struct task_struct *child, long request,
                   unsigned long addr, unsigned long data)
{
        bool seized = child->ptrace & PT_SEIZED;
        int ret = -EIO;
        kernel_siginfo_t siginfo, *si;
        void __user *datavp = (void __user *) data;
        unsigned long __user *datalp = datavp;
        unsigned long flags;

        switch (request) {
        case PTRACE_PEEKTEXT:
        case PTRACE_PEEKDATA:
                return generic_ptrace_peekdata(child, addr, data);
        case PTRACE_POKETEXT:
        case PTRACE_POKEDATA:
                return generic_ptrace_pokedata(child, addr, data);

#ifdef PTRACE_OLDSETOPTIONS
        case PTRACE_OLDSETOPTIONS:
#endif
        case PTRACE_SETOPTIONS:
                ret = ptrace_setoptions(child, data);
                break;
        case PTRACE_GETEVENTMSG:
                ret = put_user(child->ptrace_message, datalp);
                break;

        case PTRACE_PEEKSIGINFO:
                ret = ptrace_peek_siginfo(child, addr, data);
                break;

        case PTRACE_GETSIGINFO:
                ret = ptrace_getsiginfo(child, &siginfo);
                if (!ret)
                        ret = copy_siginfo_to_user(datavp, &siginfo);
                break;

        case PTRACE_SETSIGINFO:
                ret = copy_siginfo_from_user(&siginfo, datavp);
                if (!ret)
                        ret = ptrace_setsiginfo(child, &siginfo);
                break;

        case PTRACE_GETSIGMASK: {
                sigset_t *mask;

                /* @addr must carry the caller's idea of sizeof(sigset_t). */
                if (addr != sizeof(sigset_t)) {
                        ret = -EINVAL;
                        break;
                }

                if (test_tsk_restore_sigmask(child))
                        mask = &child->saved_sigmask;
                else
                        mask = &child->blocked;

                if (copy_to_user(datavp, mask, sizeof(sigset_t)))
                        ret = -EFAULT;
                else
                        ret = 0;

                break;
        }

        case PTRACE_SETSIGMASK: {
                sigset_t new_set;

                /* @addr must carry the caller's idea of sizeof(sigset_t). */
                if (addr != sizeof(sigset_t)) {
                        ret = -EINVAL;
                        break;
                }

                if (copy_from_user(&new_set, datavp, sizeof(sigset_t))) {
                        ret = -EFAULT;
                        break;
                }

                /* SIGKILL and SIGSTOP are removed from the requested mask. */
                sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));

                /*
                 * Every thread does recalc_sigpending() after resume, so
                 * retarget_shared_pending() and recalc_sigpending() are not
                 * called here.
                 */
                spin_lock_irq(&child->sighand->siglock);
                child->blocked = new_set;
                spin_unlock_irq(&child->sighand->siglock);

                clear_tsk_restore_sigmask(child);

                ret = 0;
                break;
        }

        case PTRACE_INTERRUPT:
                /*
                 * Stop tracee without any side-effect on signal or job
                 * control.  At least one trap is guaranteed to happen
                 * after this request.  If @child is already trapped, the
                 * current trap is not disturbed and another trap will
                 * happen after the current trap is ended with PTRACE_CONT.
                 *
                 * The actual trap might not be PTRACE_EVENT_STOP trap but
                 * the pending condition is cleared regardless.
                 */
                if (unlikely(!seized || !lock_task_sighand(child, &flags)))
                        break;

                /*
                 * INTERRUPT doesn't disturb existing trap sans one
                 * exception.  If ptracer issued LISTEN for the current
                 * STOP, this INTERRUPT should clear LISTEN and re-trap
                 * tracee into STOP.
                 */
                if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
                        ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);

                unlock_task_sighand(child, &flags);
                ret = 0;
                break;

        case PTRACE_LISTEN:
                /*
                 * Listen for events.  Tracee must be in STOP.  It's not
                 * resumed per-se but is not considered to be in TRACED by
                 * wait(2) or ptrace(2).  If an async event (e.g. group
                 * stop state change) happens, tracee will enter STOP trap
                 * again.  Alternatively, ptracer can issue INTERRUPT to
                 * finish listening and re-trap tracee into STOP.
                 */
                if (unlikely(!seized || !lock_task_sighand(child, &flags)))
                        break;

                /* ret stays -EIO unless the tracee is in a PTRACE_EVENT_STOP trap. */
                si = child->last_siginfo;
                if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) {
                        child->jobctl |= JOBCTL_LISTENING;
                        /*
                         * If NOTIFY is set, it means event happened between
                         * start of this trap and now.  Trigger re-trap.
                         */
                        if (child->jobctl & JOBCTL_TRAP_NOTIFY)
                                ptrace_signal_wake_up(child, true);
                        ret = 0;
                }
                unlock_task_sighand(child, &flags);
                break;

        case PTRACE_DETACH:         /* detach a process that was attached. */
                ret = ptrace_detach(child, data);
                break;

#ifdef CONFIG_BINFMT_ELF_FDPIC
        case PTRACE_GETFDPIC: {
                struct mm_struct *mm = get_task_mm(child);
                unsigned long tmp = 0;

                ret = -ESRCH;
                if (!mm)
                        break;

                /* An unrecognised @addr leaves tmp at 0 rather than failing. */
                switch (addr) {
                case PTRACE_GETFDPIC_EXEC:
                        tmp = mm->context.exec_fdpic_loadmap;
                        break;
                case PTRACE_GETFDPIC_INTERP:
                        tmp = mm->context.interp_fdpic_loadmap;
                        break;
                default:
                        break;
                }
                mmput(mm);

                ret = put_user(tmp, datalp);
                break;
        }
#endif

        case PTRACE_SINGLESTEP:
#ifdef PTRACE_SINGLEBLOCK
        case PTRACE_SINGLEBLOCK:
#endif
#ifdef PTRACE_SYSEMU
        case PTRACE_SYSEMU:
        case PTRACE_SYSEMU_SINGLESTEP:
#endif
        case PTRACE_SYSCALL:
        case PTRACE_CONT:
                return ptrace_resume(child, request, data);

        case PTRACE_KILL:
                /* The result of send_sig_info() is not propagated; always 0. */
                send_sig_info(SIGKILL, SEND_SIG_NOINFO, child);
                return 0;

#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        case PTRACE_GETREGSET:
        case PTRACE_SETREGSET: {
                struct iovec kiov;
                struct iovec __user *uiov = datavp;

                /* One access_ok() covers the __get_user()/__put_user() calls below. */
                if (!access_ok(uiov, sizeof(*uiov)))
                        return -EFAULT;

                if (__get_user(kiov.iov_base, &uiov->iov_base) ||
                    __get_user(kiov.iov_len, &uiov->iov_len))
                        return -EFAULT;

                ret = ptrace_regset(child, request, addr, &kiov);
                /* On success, write the possibly clamped length back to the caller. */
                if (!ret)
                        ret = __put_user(kiov.iov_len, &uiov->iov_len);
                break;
        }

        case PTRACE_GET_SYSCALL_INFO:
                ret = ptrace_get_syscall_info(child, addr, datavp);
                break;
#endif

        case PTRACE_SECCOMP_GET_FILTER:
                ret = seccomp_get_filter(child, addr, datavp);
                break;

        case PTRACE_SECCOMP_GET_METADATA:
                ret = seccomp_get_metadata(child, addr, datavp);
                break;

#ifdef CONFIG_RSEQ
        case PTRACE_GET_RSEQ_CONFIGURATION:
                ret = ptrace_get_rseq_configuration(child, addr, datavp);
                break;
#endif

        case PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG:
                ret = syscall_user_dispatch_set_config(child, addr, datavp);
                break;

        case PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG:
                ret = syscall_user_dispatch_get_config(child, addr, datavp);
                break;

        default:
                break;
        }

        return ret;
}

/*
 * ptrace(2) entry point.  PTRACE_TRACEME acts on the caller; every other
 * request looks up the target by @pid and holds a task reference for the
 * duration of the call.
 */
SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
                unsigned long, data)
{
        struct task_struct *child;
        long ret;

        if (request == PTRACE_TRACEME)
                return ptrace_traceme();

        child = find_get_task_by_vpid(pid);
        if (!child)
                return -ESRCH;

        if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
                ret = ptrace_attach(child, request, addr, data);
        } else {
                ret = ptrace_check_attach(child, request == PTRACE_KILL ||
                                          request == PTRACE_INTERRUPT);
                if (ret >= 0) {
                        ret = arch_ptrace(child, request, addr, data);
                        /* Only a successful PTRACE_DETACH skips the unfreeze. */
                        if (ret || request != PTRACE_DETACH)
                                ptrace_unfreeze_traced(child);
                }
        }

        put_task_struct(child);
        return ret;
}

/*
 * Read one unsigned long from address @addr in @tsk's address space and
 * store it at the user address @data.
 *
 * Returns -EIO if a full word could not be read, otherwise the result of
 * put_user().
 */
int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
                            unsigned long data)
{
        unsigned long word;
        int nread = ptrace_access_vm(tsk, addr, &word, sizeof(word), FOLL_FORCE);

        if (nread == sizeof(word))
                return put_user(word, (unsigned long __user *)data);

        return -EIO;
}

/*
 * Write the value @data as one unsigned long to address @addr in @tsk's
 * address space.
 *
 * Returns 0 if the full word was written, -EIO otherwise.
 */
int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
                            unsigned long data)
{
        int nwritten = ptrace_access_vm(tsk, addr, &data, sizeof(data),
                                        FOLL_FORCE | FOLL_WRITE);

        if (nwritten != sizeof(data))
                return -EIO;

        return 0;
}

#if defined CONFIG_COMPAT

/*
 * Compat counterpart of ptrace_request().
 *
 * Handles the requests that here use compat-sized words, compat siginfo
 * or struct compat_iovec; every other request is forwarded unchanged to
 * ptrace_request().
 *
 * Returns 0 on success or a negative errno, or whatever ptrace_request()
 * returns for forwarded requests.
 */
int compat_ptrace_request(struct task_struct *child, compat_long_t request,
                          compat_ulong_t addr, compat_ulong_t data)
{
        compat_ulong_t __user *datap = compat_ptr(data);
        compat_ulong_t word;
        kernel_siginfo_t siginfo;
        int ret;

        switch (request) {
        case PTRACE_PEEKTEXT:
        case PTRACE_PEEKDATA:
                /* Transfer exactly one compat-sized word. */
                ret = ptrace_access_vm(child, addr, &word, sizeof(word),
                                FOLL_FORCE);
                if (ret != sizeof(word))
                        ret = -EIO;
                else
                        ret = put_user(word, datap);
                break;

        case PTRACE_POKETEXT:
        case PTRACE_POKEDATA:
                /* @data itself is the compat-sized word to write. */
                ret = ptrace_access_vm(child, addr, &data, sizeof(data),
                                FOLL_FORCE | FOLL_WRITE);
                ret = (ret != sizeof(data) ? -EIO : 0);
                break;

        case PTRACE_GETEVENTMSG:
                /* The message is truncated to compat_ulong_t by the cast. */
                ret = put_user((compat_ulong_t) child->ptrace_message, datap);
                break;

        case PTRACE_GETSIGINFO:
                ret = ptrace_getsiginfo(child, &siginfo);
                if (!ret)
                        ret = copy_siginfo_to_user32(
                                (struct compat_siginfo __user *) datap,
                                &siginfo);
                break;

        case PTRACE_SETSIGINFO:
                ret = copy_siginfo_from_user32(
                        &siginfo, (struct compat_siginfo __user *) datap);
                if (!ret)
                        ret = ptrace_setsiginfo(child, &siginfo);
                break;
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        case PTRACE_GETREGSET:
        case PTRACE_SETREGSET:
        {
                struct iovec kiov;
                struct compat_iovec __user *uiov =
                        (struct compat_iovec __user *) datap;
                compat_uptr_t ptr;
                compat_size_t len;

                /* One access_ok() covers the __get_user()/__put_user() calls below. */
                if (!access_ok(uiov, sizeof(*uiov)))
                        return -EFAULT;

                if (__get_user(ptr, &uiov->iov_base) ||
                    __get_user(len, &uiov->iov_len))
                        return -EFAULT;

                /* Convert the compat iovec into a native one for ptrace_regset(). */
                kiov.iov_base = compat_ptr(ptr);
                kiov.iov_len = len;

                ret = ptrace_regset(child, request, addr, &kiov);
                /* On success, write the possibly clamped length back to the caller. */
                if (!ret)
                        ret = __put_user(kiov.iov_len, &uiov->iov_len);
                break;
        }
#endif

        default:
                ret = ptrace_request(child, request, addr, data);
        }

        return ret;
}

/*
 * Compat ptrace(2) entry point.  PTRACE_TRACEME acts on the caller; every
 * other request looks up the target by @pid and holds a task reference
 * for the duration of the call.
 */
COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid,
                       compat_long_t, addr, compat_long_t, data)
{
        struct task_struct *child;
        long ret;

        if (request == PTRACE_TRACEME)
                return ptrace_traceme();

        child = find_get_task_by_vpid(pid);
        if (!child)
                return -ESRCH;

        if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
                ret = ptrace_attach(child, request, addr, data);
                goto drop_ref;
        }

        ret = ptrace_check_attach(child, request == PTRACE_KILL ||
                                  request == PTRACE_INTERRUPT);
        if (ret)
                goto drop_ref;

        ret = compat_arch_ptrace(child, request, addr, data);
        /* Only a successful PTRACE_DETACH skips the unfreeze. */
        if (ret || request != PTRACE_DETACH)
                ptrace_unfreeze_traced(child);

drop_ref:
        put_task_struct(child);
        return ret;
}
#endif        /* CONFIG_COMPAT */





















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MSR_H
#define _ASM_X86_MSR_H

#include "msr-index.h"

#ifndef __ASSEMBLY__

#include <asm/asm.h>
#include <asm/errno.h>
#include <asm/cpumask.h>
#include <uapi/asm/msr.h>
#include <asm/shared/msr.h>

#include <linux/percpu.h>

/*
 * Bundle describing one MSR access.
 * NOTE(review): no user of this struct is visible here; the field notes
 * below are inferred from names and types only - confirm against callers.
 */
struct msr_info {
        u32                        msr_no;        /* presumably the MSR index */
        struct msr                reg;        /* single MSR value */
        struct msr __percpu        *msrs;        /* per-CPU array of MSR values */
        int                        err;        /* presumably the access result */
};

/*
 * Register-array variant of an MSR access descriptor.
 * NOTE(review): the expected length and layout of @regs is not visible
 * here - confirm against callers.
 */
struct msr_regs_info {
        u32 *regs;        /* caller-provided register array */
        int err;        /* presumably the access result */
};

/* One saved MSR entry: an msr_info plus a flag saying whether it is valid. */
struct saved_msr {
        bool valid;
        struct msr_info info;
};

/* A counted array of saved_msr entries (presumably @num elements in @array). */
struct saved_msrs {
        unsigned int num;
        struct saved_msr *array;
};

/*
 * Both i386 and x86_64 return a 64-bit value in edx:eax, but gcc's "A"
 * constraint has different meanings. For i386, "A" means exactly
 * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead,
 * it means rax *or* rdx.
 */
#ifdef CONFIG_X86_64
/* Using 64-bit values saves one instruction clearing the high half of low */
#define DECLARE_ARGS(val, low, high)        unsigned long low, high
#define EAX_EDX_VAL(val, low, high)        ((low) | (high) << 32)
#define EAX_EDX_RET(val, low, high)        "=a" (low), "=d" (high)
#else
#define DECLARE_ARGS(val, low, high)        unsigned long long val
#define EAX_EDX_VAL(val, low, high)        (val)
#define EAX_EDX_RET(val, low, high)        "=A" (val)
#endif

/*
 * Be very careful with includes. This header is prone to include loops.
 */
#include <asm/atomic.h>
#include <linux/tracepoint-defs.h>

/*
 * Tracing hooks for MSR reads, MSR writes and rdpmc.  With
 * CONFIG_TRACEPOINTS the do_trace_*() functions are defined out of line;
 * without it they are empty inline stubs, so callers need no #ifdef.
 */
#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(read_msr);
DECLARE_TRACEPOINT(write_msr);
DECLARE_TRACEPOINT(rdpmc);
extern void do_trace_write_msr(unsigned int msr, u64 val, int failed);
extern void do_trace_read_msr(unsigned int msr, u64 val, int failed);
extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed);
#else
static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {}
static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {}
static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {}
#endif

/*
 * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
 * accessors and should not have any tracing or other functionality piggybacking
 * on them - those are *purely* for accessing MSRs and nothing more. So don't even
 * think of extending them - you will be slapped with a stinking trout or a frozen
 * shark will reach you, wherever you are! You've been warned.
 */
static __always_inline unsigned long long __rdmsr(unsigned int msr)
{
        /* Output variable(s) matching the EAX_EDX_RET operand list below. */
        DECLARE_ARGS(val, low, high);

        /*
         * @msr is passed in ECX ("c").  Label 1 (the RDMSR itself) is
         * registered in the exception table with type EX_TYPE_RDMSR and
         * label 2 (right after the instruction) as its fixup target.
         */
        asm volatile("1: rdmsr\n"
                     "2:\n"
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
                     : EAX_EDX_RET(val, low, high) : "c" (msr));

        /* Combine the register halves into the 64-bit return value. */
        return EAX_EDX_VAL(val, low, high);
}

/*
 * Bare WRMSR: @msr goes in ECX, @low in EAX and @high in EDX.  The
 * instruction is covered by an EX_TYPE_WRMSR exception-table entry and the
 * asm carries a "memory" clobber, so the compiler will not move memory
 * accesses across it.
 */
static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high)
{
        asm volatile("1: wrmsr\n"
                     "2:\n"
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
                     : : "c" (msr), "a"(low), "d" (high) : "memory");
}

/*
 * WRMSRNS behaves exactly like WRMSR with the only difference being
 * that it is not a serializing instruction by default.
 */
static __always_inline void __wrmsrns(u32 msr, u32 low, u32 high)
{
        /* Instruction opcode for WRMSRNS; supported in binutils >= 2.40. */
        /*
         * Same operand layout as __wrmsr() (ECX = @msr, EAX = @low,
         * EDX = @high) and the same EX_TYPE_WRMSR exception-table entry.
         * Unlike __wrmsr(), no "memory" clobber is specified here.
         */
        asm volatile("1: .byte 0x0f,0x01,0xc6\n"
                     "2:\n"
                     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
                     : : "c" (msr), "a"(low), "d" (high));
}

/*
 * Read @msr with the untraced __rdmsr() primitive and store the low 32 bits
 * into the lvalue @val1 and the high 32 bits into the lvalue @val2.
 */
#define native_rdmsr(msr, val1, val2)                        \
do {                                                        \
        u64 __val = __rdmsr((msr));                        \
        (void)((val1) = (u32)__val);                        \
        (void)((val2) = (u32)(__val >> 32));                \
} while (0)

/* Write @high:@low to @msr with the untraced __wrmsr() primitive. */
#define native_wrmsr(msr, low, high)                        \
        __wrmsr(msr, low, high)

/*
 * Write the 64-bit @val to @msr with __wrmsr(), split into its low and high
 * 32-bit halves.  Note that @val appears twice in the expansion.
 */
#define native_wrmsrl(msr, val)                                \
        __wrmsr((msr), (u32)((u64)(val)),                \
                       (u32)((u64)(val) >> 32))

/*
 * Read @msr through __rdmsr() and, if the read_msr tracepoint is enabled,
 * report the value to it with a "failed" argument of 0.
 *
 * Returns the 64-bit value produced by __rdmsr().
 */
static inline unsigned long long native_read_msr(unsigned int msr)
{
        unsigned long long msr_val = __rdmsr(msr);

        if (!tracepoint_enabled(read_msr))
                return msr_val;

        do_trace_read_msr(msr, msr_val, 0);
        return msr_val;
}

/*
 * RDMSR variant that reports failure through *@err instead of relying on the
 * plain EX_TYPE_RDMSR handling.
 *
 * On the non-faulting path the "xor %[err],%[err]" following RDMSR zeroes
 * *@err.  The exception-table entry (EX_TYPE_RDMSR_SAFE) is given the err
 * register, so presumably the fixup handler stores the error code there when
 * the instruction faults - NOTE(review): handler not visible here.
 *
 * The value and *@err are also passed to the read_msr tracepoint when it is
 * enabled.  Returns the 64-bit value assembled from EAX/EDX.
 */
static inline unsigned long long native_read_msr_safe(unsigned int msr,
                                                      int *err)
{
        DECLARE_ARGS(val, low, high);

        asm volatile("1: rdmsr ; xor %[err],%[err]\n"
                     "2:\n\t"
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
                     : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
                     : "c" (msr));
        if (tracepoint_enabled(read_msr))
                do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
        return EAX_EDX_VAL(val, low, high);
}

/*
 * Can be uninlined because referenced by paravirt.
 *
 * Writes @high:@low to @msr via __wrmsr(), then reports the combined 64-bit
 * value to the write_msr tracepoint (with "failed" == 0) if it is enabled.
 */
static inline void notrace
native_write_msr(unsigned int msr, u32 low, u32 high)
{
        __wrmsr(msr, low, high);

        if (!tracepoint_enabled(write_msr))
                return;

        do_trace_write_msr(msr, ((u64)high << 32) | low, 0);
}

/* Can be uninlined because referenced by paravirt */
/*
 * WRMSR variant that returns an error indication.
 *
 * @err is bound to EAX ("=a") and @low is fed in through the same register
 * via the matching "0" constraint; @high is in EDX and @msr in ECX.  On the
 * non-faulting path the trailing "xor" zeroes err.  The EX_TYPE_WRMSR_SAFE
 * exception-table entry is given the err register, so presumably the fixup
 * handler stores the error code there - NOTE(review): handler not visible
 * here.
 *
 * The written value and err are passed to the write_msr tracepoint when it
 * is enabled.  Returns err.
 */
static inline int notrace
native_write_msr_safe(unsigned int msr, u32 low, u32 high)
{
        int err;

        asm volatile("1: wrmsr ; xor %[err],%[err]\n"
                     "2:\n\t"
                     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err])
                     : [err] "=a" (err)
                     : "c" (msr), "0" (low), "d" (high)
                     : "memory");
        if (tracepoint_enabled(write_msr))
                do_trace_write_msr(msr, ((u64)high << 32 | low), err);
        return err;
}

/*
 * Register-array MSR accessors, defined outside this header.  Both take an
 * array of eight u32 values and return an int.
 * NOTE(review): presumably the array maps to general-purpose registers and
 * the return value is 0 or a negative errno - confirm at the definition.
 */
extern int rdmsr_safe_regs(u32 regs[8]);
extern int wrmsr_safe_regs(u32 regs[8]);

/**
 * rdtsc() - returns the current TSC without ordering constraints
 *
 * rdtsc() returns the result of RDTSC as a 64-bit integer.  The
 * only ordering constraint it supplies is the ordering implied by
 * "asm volatile": it will put the RDTSC in the place you expect.  The
 * CPU can and will speculatively execute that RDTSC, though, so the
 * results can be non-monotonic if compared on different CPUs.
 */
static __always_inline unsigned long long rdtsc(void)
{
        /* Output variable(s) matching the EAX_EDX_RET operand list below. */
        DECLARE_ARGS(val, low, high);

        /* No inputs and no clobbers: only the EAX/EDX outputs are declared. */
        asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));

        return EAX_EDX_VAL(val, low, high);
}

/**
 * rdtsc_ordered() - read the current TSC in program order
 *
 * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
 * It is ordered like a load to a global in-memory counter.  It should
 * be impossible to observe non-monotonic rdtsc_ordered() behavior
 * across multiple CPUs as long as the TSC is synced.
 */
static __always_inline unsigned long long rdtsc_ordered(void)
{
        DECLARE_ARGS(val, low, high);

        /*
         * The RDTSC instruction is not ordered relative to memory
         * access.  The Intel SDM and the AMD APM are both vague on this
         * point, but empirically an RDTSC instruction can be
         * speculatively executed before prior loads.  An RDTSC
         * immediately after an appropriate barrier appears to be
         * ordered as a normal load, that is, it provides the same
         * ordering guarantees as reading from a global memory location
         * that some other imaginary CPU is updating continuously with a
         * time stamp.
         *
         * Thus, use the preferred barrier on the respective CPU, aiming for
         * RDTSCP as the default.
         *
         * The three alternatives are plain "rdtsc", "lfence; rdtsc" when
         * X86_FEATURE_LFENCE_RDTSC is set, and "rdtscp" when
         * X86_FEATURE_RDTSCP is set.
         */
        asm volatile(ALTERNATIVE_2("rdtsc",
                                   "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC,
                                   "rdtscp", X86_FEATURE_RDTSCP)
                        : EAX_EDX_RET(val, low, high)
                        /* RDTSCP clobbers ECX with MSR_TSC_AUX. */
                        :: "ecx");

        return EAX_EDX_VAL(val, low, high);
}

/*
 * Execute RDPMC with @counter in ECX and return the 64-bit result assembled
 * from EAX/EDX.  When the rdpmc tracepoint is enabled the counter index and
 * value are reported to it with a "failed" argument of 0.  Unlike the MSR
 * accessors above, no exception-table entry is attached to the instruction.
 */
static inline unsigned long long native_read_pmc(int counter)
{
        DECLARE_ARGS(val, low, high);

        asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
        if (tracepoint_enabled(rdpmc))
                do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
        return EAX_EDX_VAL(val, low, high);
}

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#include <linux/errno.h>
/*
 * Access to machine-specific registers (available on 586 and better only)
 * Note: the rd* operations modify the parameters directly (without using
 * pointer indirection), this allows gcc to optimize better
 */

/*
 * Read @msr via native_read_msr() (which may fire the read_msr tracepoint)
 * and assign the low/high 32-bit halves directly to the lvalues @low and
 * @high.
 */
#define rdmsr(msr, low, high)                                        \
do {                                                                \
        u64 __val = native_read_msr((msr));                        \
        (void)((low) = (u32)__val);                                \
        (void)((high) = (u32)(__val >> 32));                        \
} while (0)

/* Write @high:@low to @msr; thin wrapper around native_write_msr(). */
static inline void
wrmsr(unsigned int msr, u32 low, u32 high)
{
        native_write_msr(msr, low, high);
}

/* Read @msr via native_read_msr() and assign the 64-bit result to lvalue @val. */
#define rdmsrl(msr, val)                        \
        ((val) = native_read_msr((msr)))

/*
 * Write the 64-bit @val to @msr: the value is split into its low and high
 * 32-bit halves and handed to native_write_msr().
 */
static inline void wrmsrl(unsigned int msr, u64 val)
{
        u32 lo = (u32)(val & 0xffffffffULL);
        u32 hi = (u32)(val >> 32);

        native_write_msr(msr, lo, hi);
}

/*
 * wrmsr with exception handling: forwards to native_write_msr_safe() and
 * returns whatever error value it reports.
 */
static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high)
{
        int err = native_write_msr_safe(msr, low, high);

        return err;
}

/*
 * rdmsr with exception handling.
 *
 * @low and @high are pointers to the u32 destinations for the low and high
 * halves of the value read by native_read_msr_safe().  The macro evaluates
 * to the error value that native_read_msr_safe() stored.
 *
 * The pointer arguments are parenthesized before being dereferenced so that
 * expressions such as "p + 1" expand to *(p + 1) rather than *p + 1.
 */
#define rdmsr_safe(msr, low, high)                                \
({                                                                \
        int __err;                                                \
        u64 __val = native_read_msr_safe((msr), &__err);        \
        (*(low)) = (u32)__val;                                        \
        (*(high)) = (u32)(__val >> 32);                                \
        __err;                                                        \
})

/*
 * 64-bit rdmsr with exception handling: stores the value returned by
 * native_read_msr_safe() in *@p and returns the error value it reported.
 */
static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p)
{
        int rc;
        unsigned long long v = native_read_msr_safe(msr, &rc);

        *p = v;
        return rc;
}

/*
 * Read performance counter @counter via native_read_pmc() and assign the
 * low/high 32-bit halves to the lvalues @low and @high.
 */
#define rdpmc(counter, low, high)                        \
do {                                                        \
        u64 _l = native_read_pmc((counter));                \
        (low)  = (u32)_l;                                \
        (high) = (u32)(_l >> 32);                        \
} while (0)

/* Read performance counter @counter and assign the 64-bit result to lvalue @val. */
#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))

#endif        /* !CONFIG_PARAVIRT_XXL */

/*
 * Write the 64-bit @val to @msr using the WRMSRNS primitive, passing the
 * low and high 32-bit halves separately.
 */
static __always_inline void wrmsrns(u32 msr, u64 val)
{
        u32 lo = (u32)val;
        u32 hi = (u32)(val >> 32);

        __wrmsrns(msr, lo, hi);
}

/*
 * 64-bit version of wrmsr_safe(): splits @val into 32-bit halves and
 * returns the result of wrmsr_safe().
 */
static inline int wrmsrl_safe(u32 msr, u64 val)
{
        u32 lo = (u32)val;
        u32 hi = (u32)(val >> 32);

        return wrmsr_safe(msr, lo, hi);
}

/*
 * Helpers defined outside this header.
 * NOTE(review): semantics inferred from names/signatures only - presumably
 * msrs_alloc()/msrs_free() manage a per-CPU struct msr array and
 * msr_set_bit()/msr_clear_bit() modify a single bit of @msr; confirm at the
 * definitions.
 */
struct msr __percpu *msrs_alloc(void);
void msrs_free(struct msr __percpu *msrs);
int msr_set_bit(u32 msr, u8 bit);
int msr_clear_bit(u32 msr, u8 bit);

#ifdef CONFIG_SMP
/*
 * SMP: the cross-CPU MSR accessors are out-of-line functions defined
 * elsewhere.  The non-SMP branch below provides inline equivalents that
 * ignore the CPU argument.
 * NOTE(review): presumably each runs the access on the CPU(s) named by
 * @cpu/@mask - confirm at the definitions.
 */
int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs);
void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs);
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
#else  /*  CONFIG_SMP  */
/*
 * Non-SMP variants of the basic cross-CPU accessors: @cpu is ignored, the
 * access is done with the local rdmsr()/wrmsr()/rdmsrl()/wrmsrl() helpers
 * and each function returns 0.
 */
static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
        u32 lo, hi;

        rdmsr(msr_no, lo, hi);
        *l = lo;
        *h = hi;

        return 0;
}
static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
        wrmsr(msr_no, l, h);

        return 0;
}
static inline int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
{
        u64 val;

        rdmsrl(msr_no, val);
        *q = val;

        return 0;
}
static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
{
        wrmsrl(msr_no, q);

        return 0;
}
/*
 * Non-SMP rdmsr_on_cpus(): the mask @m is not consulted.  A single read is
 * done through rdmsr_on_cpu() with CPU 0, storing into the l/h fields of
 * this CPU's instance of @msrs (located with raw_cpu_ptr()).
 */
static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
                                struct msr __percpu *msrs)
{
        rdmsr_on_cpu(0, msr_no, raw_cpu_ptr(&msrs->l), raw_cpu_ptr(&msrs->h));
}
/*
 * Non-SMP wrmsr_on_cpus(): the mask @m is not consulted.  The l/h fields of
 * this CPU's instance of @msrs are read with raw_cpu_read() and written to
 * the MSR through wrmsr_on_cpu() with CPU 0.
 */
static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
                                struct msr __percpu *msrs)
{
        wrmsr_on_cpu(0, msr_no, raw_cpu_read(msrs->l), raw_cpu_read(msrs->h));
}
/*
 * Non-SMP variants of the exception-handling cross-CPU accessors: @cpu is
 * ignored and the result of the corresponding local *_safe helper is
 * returned unchanged.
 */
static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
                                    u32 *l, u32 *h)
{
        int err = rdmsr_safe(msr_no, l, h);

        return err;
}
static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
{
        int err = wrmsr_safe(msr_no, l, h);

        return err;
}
static inline int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
{
        int err = rdmsrl_safe(msr_no, q);

        return err;
}
static inline int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
{
        int err = wrmsrl_safe(msr_no, q);

        return err;
}
static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
        int err = rdmsr_safe_regs(regs);

        return err;
}
static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
        int err = wrmsr_safe_regs(regs);

        return err;
}
#endif  /* CONFIG_SMP */
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_MSR_H */



























































































































































































































































































































































































































































































































































































































































































    1 














    1 





























































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2014 Facebook.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/stacktrace.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "locking.h"
#include "delayed-ref.h"
#include "ref-verify.h"
#include "fs.h"
#include "accessors.h"

/*
 * Used to keep track of the roots and the number of refs each root has for a
 * given bytenr.  This just tracks the number of direct references, no shared
 * references.
 *
 * Entries are linked through @node into a block_entry's roots tree, keyed by
 * @root_objectid.
 */
struct root_entry {
        u64 root_objectid;
        u64 num_refs;
        struct rb_node node;
};

/*
 * These are meant to represent what should exist in the extent tree, these can
 * be used to verify the extent tree is consistent as these should all match
 * what the extent tree says.
 *
 * Entries are linked through @node into a block_entry's refs tree and are
 * ordered by (root_objectid, parent, owner, offset), see comp_refs().  For
 * tree block refs @owner holds the block level and @offset is 0; shared refs
 * carry a non-zero @parent and a zero @root_objectid.
 */
struct ref_entry {
        u64 root_objectid;
        u64 parent;
        u64 owner;
        u64 offset;
        u64 num_refs;
        struct rb_node node;
};

/* Capacity of ref_action.trace, i.e. the most stack entries saved per action. */
#define MAX_TRACE        16

/*
 * Whenever we add/remove a reference we record the action.  The action maps
 * back to the delayed ref action.  We hold the ref we are changing in the
 * action so we can account for the history properly, and we record the root we
 * were called with since it could be different from ref_root.  We also store
 * stack traces because that's how I roll.
 *
 * @list links the action into block_entry.actions.  @trace holds up to
 * MAX_TRACE saved stack entries and @trace_len how many of them are valid
 * (0 means no stack trace was saved).
 */
struct ref_action {
        int action;
        u64 root;
        struct ref_entry ref;
        struct list_head list;
        unsigned long trace[MAX_TRACE];
        unsigned int trace_len;
};

/*
 * One of these for every block we reference, it holds the roots and references
 * to it as well as all of the ref actions that have occurred to it.  We never
 * free it until we unmount the file system in order to make sure re-allocations
 * are happening properly.
 *
 * @node links the entry into fs_info->block_tree, keyed by @bytenr.  @roots
 * is a tree of root_entry, @refs a tree of ref_entry and @actions a list of
 * ref_action; free_block_entry() releases all three.  @metadata and
 * @from_disk are used as 0/1 flags.
 */
struct block_entry {
        u64 bytenr;
        u64 len;
        u64 num_refs;
        int metadata;
        int from_disk;
        struct rb_root roots;
        struct rb_root refs;
        struct rb_node node;
        struct list_head actions;
};

/*
 * Link @be into the block tree @root, keyed by bytenr.
 *
 * Returns NULL when @be was inserted.  If an entry with the same bytenr is
 * already present, that entry is returned and the tree is left untouched.
 */
static struct block_entry *insert_block_entry(struct rb_root *root,
                                              struct block_entry *be)
{
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct block_entry *cur;

                parent = *link;
                cur = rb_entry(parent, struct block_entry, node);
                if (be->bytenr < cur->bytenr)
                        link = &parent->rb_left;
                else if (be->bytenr > cur->bytenr)
                        link = &parent->rb_right;
                else
                        return cur;
        }

        rb_link_node(&be->node, parent, link);
        rb_insert_color(&be->node, root);
        return NULL;
}

/*
 * Find the block entry for @bytenr in the block tree @root.
 *
 * Returns the matching entry, or NULL if there is none.
 */
static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr)
{
        struct rb_node *cur = root->rb_node;

        while (cur) {
                struct block_entry *be;

                be = rb_entry(cur, struct block_entry, node);
                if (be->bytenr == bytenr)
                        return be;
                cur = (be->bytenr < bytenr) ? cur->rb_right : cur->rb_left;
        }
        return NULL;
}

/*
 * Link @re into the roots tree @root, keyed by root_objectid.
 *
 * Returns NULL when @re was inserted.  If an entry with the same
 * root_objectid is already present, that entry is returned and the tree is
 * left untouched.
 */
static struct root_entry *insert_root_entry(struct rb_root *root,
                                            struct root_entry *re)
{
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct root_entry *cur;

                parent = *link;
                cur = rb_entry(parent, struct root_entry, node);
                if (re->root_objectid < cur->root_objectid)
                        link = &parent->rb_left;
                else if (re->root_objectid > cur->root_objectid)
                        link = &parent->rb_right;
                else
                        return cur;
        }

        rb_link_node(&re->node, parent, link);
        rb_insert_color(&re->node, root);
        return NULL;
}

/*
 * Three-way comparison of two ref entries.
 *
 * The fields are compared in the order root_objectid, parent, owner, offset;
 * the first field that differs decides the result.
 *
 * Returns -1 if @ref1 sorts before @ref2, 1 if it sorts after, 0 if all four
 * fields are equal.
 */
static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2)
{
        if (ref1->root_objectid != ref2->root_objectid)
                return ref1->root_objectid < ref2->root_objectid ? -1 : 1;
        if (ref1->parent != ref2->parent)
                return ref1->parent < ref2->parent ? -1 : 1;
        if (ref1->owner != ref2->owner)
                return ref1->owner < ref2->owner ? -1 : 1;
        if (ref1->offset != ref2->offset)
                return ref1->offset < ref2->offset ? -1 : 1;
        return 0;
}

/*
 * Link @ref into the refs tree @root, ordered by comp_refs().
 *
 * Returns NULL when @ref was inserted.  If an entry comparing equal to @ref
 * is already present, that entry is returned and the tree is left untouched.
 */
static struct ref_entry *insert_ref_entry(struct rb_root *root,
                                          struct ref_entry *ref)
{
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct ref_entry *cur;
                int order;

                parent = *link;
                cur = rb_entry(parent, struct ref_entry, node);
                order = comp_refs(cur, ref);
                if (order == 0)
                        return cur;
                link = (order > 0) ? &parent->rb_left : &parent->rb_right;
        }

        rb_link_node(&ref->node, parent, link);
        rb_insert_color(&ref->node, root);
        return NULL;
}

/*
 * Find the root entry for @objectid in the roots tree @root.
 *
 * Returns the matching entry, or NULL if there is none.
 */
static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid)
{
        struct rb_node *cur = root->rb_node;

        while (cur) {
                struct root_entry *re;

                re = rb_entry(cur, struct root_entry, node);
                if (re->root_objectid == objectid)
                        return re;
                cur = (re->root_objectid < objectid) ? cur->rb_right :
                                                       cur->rb_left;
        }
        return NULL;
}

#ifdef CONFIG_STACKTRACE
/*
 * Capture the current call stack into @ra: up to MAX_TRACE entries, with a
 * skip count of 2 passed to stack_trace_save().
 */
static void __save_stack_trace(struct ref_action *ra)
{
        ra->trace_len = stack_trace_save(ra->trace, MAX_TRACE, 2);
}

/*
 * Print the stack trace saved in @ra, or an error message through @fs_info
 * when no trace was recorded.
 */
static void __print_stack_trace(struct btrfs_fs_info *fs_info,
                                struct ref_action *ra)
{
        if (ra->trace_len != 0) {
                stack_trace_print(ra->trace, ra->trace_len, 2);
                return;
        }

        btrfs_err(fs_info, "  ref-verify: no stacktrace");
}
#else
/* Stack traces are not available: saving is a no-op. */
static inline void __save_stack_trace(struct ref_action *ra)
{
}

/* Stack traces are not available: only report that fact. */
static inline void __print_stack_trace(struct btrfs_fs_info *fs_info,
                                       struct ref_action *ra)
{
        btrfs_err(fs_info, "  ref-verify: no stacktrace support");
}
#endif

/*
 * Release @be together with everything hanging off it: all root entries,
 * all ref entries and all recorded actions.
 */
static void free_block_entry(struct block_entry *be)
{
        struct rb_node *cur;

        /* Tear down the roots tree, always removing the first node. */
        for (cur = rb_first(&be->roots); cur; cur = rb_first(&be->roots)) {
                struct root_entry *re;

                re = rb_entry(cur, struct root_entry, node);
                rb_erase(&re->node, &be->roots);
                kfree(re);
        }

        /* Same for the refs tree. */
        for (cur = rb_first(&be->refs); cur; cur = rb_first(&be->refs)) {
                struct ref_entry *ref;

                ref = rb_entry(cur, struct ref_entry, node);
                rb_erase(&ref->node, &be->refs);
                kfree(ref);
        }

        /* Drain the action list from the head. */
        while (!list_empty(&be->actions)) {
                struct ref_action *ra;

                ra = list_first_entry(&be->actions, struct ref_action, list);
                list_del(&ra->list);
                kfree(ra);
        }

        kfree(be);
}

/*
 * Find or create the block entry for @bytenr and make sure it has a root
 * entry for @root_objectid (when that is non-zero).
 *
 * Both a block entry and a root entry are allocated up front, before taking
 * fs_info->ref_verify_lock; whichever of them turns out not to be needed is
 * freed again.
 *
 * Locking: on success this returns with fs_info->ref_verify_lock HELD, both
 * when an existing entry is returned and when a new one was inserted.  The
 * caller must drop it.  On allocation failure the lock is not taken.
 *
 * Returns the block entry, or ERR_PTR(-ENOMEM) if either allocation failed.
 */
static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
                                           u64 bytenr, u64 len,
                                           u64 root_objectid)
{
        struct block_entry *be = NULL, *exist;
        struct root_entry *re = NULL;

        re = kzalloc(sizeof(struct root_entry), GFP_NOFS);
        be = kzalloc(sizeof(struct block_entry), GFP_NOFS);
        if (!be || !re) {
                kfree(re);
                kfree(be);
                return ERR_PTR(-ENOMEM);
        }
        be->bytenr = bytenr;
        be->len = len;

        re->root_objectid = root_objectid;
        re->num_refs = 0;

        spin_lock(&fs_info->ref_verify_lock);
        exist = insert_block_entry(&fs_info->block_tree, be);
        if (exist) {
                /*
                 * The block is already tracked: only the root entry may
                 * still be needed, the freshly allocated block entry is not.
                 */
                if (root_objectid) {
                        struct root_entry *exist_re;

                        exist_re = insert_root_entry(&exist->roots, re);
                        if (exist_re)
                                kfree(re);
                } else {
                        kfree(re);
                }
                kfree(be);
                /* Lock intentionally still held, see function comment. */
                return exist;
        }

        /* New block: finish initialising the entry that was just inserted. */
        be->num_refs = 0;
        be->metadata = 0;
        be->from_disk = 0;
        be->roots = RB_ROOT;
        be->refs = RB_ROOT;
        INIT_LIST_HEAD(&be->actions);
        if (root_objectid)
                insert_root_entry(&be->roots, re);
        else
                kfree(re);
        /* Lock intentionally still held, see function comment. */
        return be;
}

/*
 * Record one reference to the tree block at @bytenr.
 *
 * @ref_root: root owning the reference; must be non-zero when @parent is 0
 * @parent:   bytenr of the parent block for a shared reference, 0 for a
 *            direct (keyed) reference
 * @bytenr:   start of the tree block
 * @level:    level of the tree block, stored as the ref's owner
 *
 * The block entry is marked as metadata read from disk.  For a direct
 * reference the per-root count is bumped as well.  If an identical ref entry
 * already exists its count is incremented instead of inserting a duplicate.
 *
 * add_block_entry() returns with fs_info->ref_verify_lock held; it is
 * released here before returning.
 *
 * Returns 0 on success, -ENOMEM or the error from add_block_entry()
 * otherwise.
 */
static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root,
                          u64 parent, u64 bytenr, int level)
{
        struct block_entry *be;
        struct ref_entry *new_ref;
        struct ref_entry *dup;

        new_ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS);
        if (!new_ref)
                return -ENOMEM;

        /* Shared refs are identified by their parent, not by a root. */
        new_ref->root_objectid = parent ? 0 : ref_root;
        new_ref->parent = parent;
        new_ref->owner = level;
        new_ref->offset = 0;
        new_ref->num_refs = 1;

        be = add_block_entry(fs_info, bytenr, fs_info->nodesize, ref_root);
        if (IS_ERR(be)) {
                kfree(new_ref);
                return PTR_ERR(be);
        }
        be->num_refs++;
        be->from_disk = 1;
        be->metadata = 1;

        if (!parent) {
                struct root_entry *re;

                ASSERT(ref_root);
                re = lookup_root_entry(&be->roots, ref_root);
                ASSERT(re);
                re->num_refs++;
        }

        dup = insert_ref_entry(&be->refs, new_ref);
        if (dup) {
                dup->num_refs++;
                kfree(new_ref);
        }
        spin_unlock(&fs_info->ref_verify_lock);

        return 0;
}

/*
 * Record a shared data reference read from disk.
 *
 * @parent:    bytenr of the block holding the reference
 * @num_refs:  reference count carried by the on-disk item
 * @bytenr:    start of the data extent
 * @num_bytes: length of the data extent
 *
 * The block entry's total is increased by @num_refs and a ref entry keyed
 * only by @parent is inserted.  Finding an equal entry already present is
 * treated as an error.
 *
 * add_block_entry() returns with fs_info->ref_verify_lock held; it is
 * released here on every path after that call succeeded.
 *
 * Returns 0 on success, -ENOMEM, the error from add_block_entry(), or
 * -EINVAL if the ref already existed.
 */
static int add_shared_data_ref(struct btrfs_fs_info *fs_info,
                               u64 parent, u32 num_refs, u64 bytenr,
                               u64 num_bytes)
{
        struct block_entry *be;
        struct ref_entry *new_ref;

        new_ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
        if (!new_ref)
                return -ENOMEM;

        be = add_block_entry(fs_info, bytenr, num_bytes, 0);
        if (IS_ERR(be)) {
                kfree(new_ref);
                return PTR_ERR(be);
        }
        be->num_refs += num_refs;

        new_ref->parent = parent;
        new_ref->num_refs = num_refs;
        if (!insert_ref_entry(&be->refs, new_ref)) {
                spin_unlock(&fs_info->ref_verify_lock);
                return 0;
        }

        spin_unlock(&fs_info->ref_verify_lock);
        btrfs_err(fs_info, "existing shared ref when reading from disk?");
        kfree(new_ref);
        return -EINVAL;
}

/*
 * Record a keyed (non-shared) data reference read from disk.
 *
 * @leaf/@dref: leaf and item location of the extent data ref; root,
 *              objectid, offset and count are read from it
 * @bytenr:     start of the data extent
 * @num_bytes:  length of the data extent
 *
 * The block entry's total and the owning root's count are both increased by
 * the on-disk count, and a ref entry for (root, owner, offset) is inserted.
 * Finding an equal ref already present, or no root entry for the ref's root,
 * is treated as an error.
 *
 * add_block_entry() returns with fs_info->ref_verify_lock held; it is
 * released here on every path after that call succeeded.
 *
 * Returns 0 on success, -ENOMEM, the error from add_block_entry(), or
 * -EINVAL on the inconsistencies described above.
 */
static int add_extent_data_ref(struct btrfs_fs_info *fs_info,
                               struct extent_buffer *leaf,
                               struct btrfs_extent_data_ref *dref,
                               u64 bytenr, u64 num_bytes)
{
        u64 ref_root = btrfs_extent_data_ref_root(leaf, dref);
        u64 owner = btrfs_extent_data_ref_objectid(leaf, dref);
        u64 offset = btrfs_extent_data_ref_offset(leaf, dref);
        u32 num_refs = btrfs_extent_data_ref_count(leaf, dref);
        struct block_entry *be;
        struct ref_entry *new_ref;
        struct root_entry *re;

        new_ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
        if (!new_ref)
                return -ENOMEM;

        be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
        if (IS_ERR(be)) {
                kfree(new_ref);
                return PTR_ERR(be);
        }
        be->num_refs += num_refs;

        new_ref->parent = 0;
        new_ref->owner = owner;
        new_ref->root_objectid = ref_root;
        new_ref->offset = offset;
        new_ref->num_refs = num_refs;
        if (insert_ref_entry(&be->refs, new_ref)) {
                spin_unlock(&fs_info->ref_verify_lock);
                btrfs_err(fs_info, "existing ref when reading from disk?");
                kfree(new_ref);
                return -EINVAL;
        }

        re = lookup_root_entry(&be->roots, ref_root);
        if (re)
                re->num_refs += num_refs;
        spin_unlock(&fs_info->ref_verify_lock);

        if (!re) {
                btrfs_err(fs_info, "missing root in new block entry?");
                return -EINVAL;
        }
        return 0;
}

/*
 * Walk the inline references stored in one extent/metadata item and record
 * each of them through the add_*() helpers.
 *
 * @fs_info:          filesystem the item belongs to
 * @path:             path whose nodes[0] is the leaf holding the item
 * @key:              key of the item; objectid is used as the bytenr and,
 *                    for data refs, offset as the extent length
 * @slot:             slot of the item in the leaf
 * @tree_block_level: updated with the tree block level when the item
 *                    provides one (from the tree block info for EXTENT_ITEM
 *                    keys flagged TREE_BLOCK, or from key->offset for
 *                    METADATA_ITEM keys)
 *
 * Returns 0 on success, including when the item carries no inline refs or
 * only owner refs; otherwise the first error returned by a helper, or
 * -EINVAL for an unknown inline ref type.
 */
static int process_extent_item(struct btrfs_fs_info *fs_info,
                               struct btrfs_path *path, struct btrfs_key *key,
                               int slot, int *tree_block_level)
{
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        struct btrfs_extent_data_ref *dref;
        struct btrfs_shared_data_ref *sref;
        struct extent_buffer *leaf = path->nodes[0];
        u32 item_size = btrfs_item_size(leaf, slot);
        unsigned long end, ptr;
        u64 offset, flags, count;
        int type;
        /*
         * Must start out as 0: the owner-ref case below does not assign it,
         * and the loop body may not run at all when there are no inline refs.
         */
        int ret = 0;

        ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
        flags = btrfs_extent_flags(leaf, ei);

        if ((key->type == BTRFS_EXTENT_ITEM_KEY) &&
            flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                struct btrfs_tree_block_info *info;

                /* Tree block info sits between the item and the inline refs. */
                info = (struct btrfs_tree_block_info *)(ei + 1);
                *tree_block_level = btrfs_tree_block_level(leaf, info);
                iref = (struct btrfs_extent_inline_ref *)(info + 1);
        } else {
                if (key->type == BTRFS_METADATA_ITEM_KEY)
                        *tree_block_level = key->offset;
                iref = (struct btrfs_extent_inline_ref *)(ei + 1);
        }

        ptr = (unsigned long)iref;
        end = (unsigned long)ei + item_size;
        while (ptr < end) {
                iref = (struct btrfs_extent_inline_ref *)ptr;
                type = btrfs_extent_inline_ref_type(leaf, iref);
                offset = btrfs_extent_inline_ref_offset(leaf, iref);
                switch (type) {
                case BTRFS_TREE_BLOCK_REF_KEY:
                        ret = add_tree_block(fs_info, offset, 0, key->objectid,
                                             *tree_block_level);
                        break;
                case BTRFS_SHARED_BLOCK_REF_KEY:
                        ret = add_tree_block(fs_info, 0, offset, key->objectid,
                                             *tree_block_level);
                        break;
                case BTRFS_EXTENT_DATA_REF_KEY:
                        /* The data ref body starts at the iref's offset field. */
                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                        ret = add_extent_data_ref(fs_info, leaf, dref,
                                                  key->objectid, key->offset);
                        break;
                case BTRFS_SHARED_DATA_REF_KEY:
                        sref = (struct btrfs_shared_data_ref *)(iref + 1);
                        count = btrfs_shared_data_ref_count(leaf, sref);
                        ret = add_shared_data_ref(fs_info, offset, count,
                                                  key->objectid, key->offset);
                        break;
                case BTRFS_EXTENT_OWNER_REF_KEY:
                        /* Nothing to record; ret stays 0 from above. */
                        WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
                        break;
                default:
                        btrfs_err(fs_info, "invalid key type in iref");
                        ret = -EINVAL;
                        break;
                }
                if (ret)
                        break;
                ptr += btrfs_extent_inline_ref_size(type);
        }
        return ret;
}

/*
 * Scan every item of the leaf at path->nodes[0] and feed the extent related
 * ones into the ref tree.
 *
 * @bytenr and @num_bytes remember the most recent extent/metadata item seen
 * so that keyed (non-inline) data ref items that follow it can be attributed
 * to that extent.  Items of any other key type are ignored.
 *
 * Returns 0 when all items were handled, or the first non-zero value returned
 * by one of the helpers.
 */
static int process_leaf(struct btrfs_root *root,
                        struct btrfs_path *path, u64 *bytenr, u64 *num_bytes,
                        int *tree_block_level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *eb = path->nodes[0];
        struct btrfs_key key;
        int nr = btrfs_header_nritems(eb);
        int slot;

        for (slot = 0; slot < nr; slot++) {
                int err = 0;

                btrfs_item_key_to_cpu(eb, &key, slot);

                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
                    key.type == BTRFS_METADATA_ITEM_KEY) {
                        /* Only extent items carry the length in the key. */
                        if (key.type == BTRFS_EXTENT_ITEM_KEY)
                                *num_bytes = key.offset;
                        *bytenr = key.objectid;
                        err = process_extent_item(fs_info, path, &key, slot,
                                                  tree_block_level);
                } else if (key.type == BTRFS_TREE_BLOCK_REF_KEY) {
                        err = add_tree_block(fs_info, key.offset, 0,
                                             key.objectid, *tree_block_level);
                } else if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
                        err = add_tree_block(fs_info, 0, key.offset,
                                             key.objectid, *tree_block_level);
                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                        struct btrfs_extent_data_ref *data_ref;

                        data_ref = btrfs_item_ptr(eb, slot,
                                                  struct btrfs_extent_data_ref);
                        err = add_extent_data_ref(fs_info, eb, data_ref,
                                                  *bytenr, *num_bytes);
                } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                        struct btrfs_shared_data_ref *shared_ref;
                        u32 nr_refs;

                        shared_ref = btrfs_item_ptr(eb, slot,
                                                    struct btrfs_shared_data_ref);
                        nr_refs = btrfs_shared_data_ref_count(eb, shared_ref);
                        err = add_shared_data_ref(fs_info, key.offset, nr_refs,
                                                  *bytenr, *num_bytes);
                }

                if (err)
                        return err;
        }
        return 0;
}

/*
 * Descend from @level to level 0, read-locking each child that is read and
 * recording it in @path, then process the leaf that was reached.
 *
 * Returns 0 on success, the error from reading a child node, or whatever
 * non-zero value process_leaf() returned.
 */
static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
                          int level, u64 *bytenr, u64 *num_bytes,
                          int *tree_block_level)
{
        for (; level > 0; level--) {
                struct extent_buffer *child;

                child = btrfs_read_node_slot(path->nodes[level],
                                             path->slots[level]);
                if (IS_ERR(child))
                        return PTR_ERR(child);

                btrfs_tree_read_lock(child);
                path->nodes[level - 1] = child;
                path->slots[level - 1] = 0;
                path->locks[level - 1] = BTRFS_READ_LOCK;
        }

        /* A negative starting level means there is nothing to walk. */
        if (level < 0)
                return 0;

        return process_leaf(root, path, bytenr, num_bytes, tree_block_level);
}

/*
 * Climb from the leaf towards the root looking for the next node slot that
 * still has to be visited.
 *
 * Every level that is exhausted (and the leaf, unconditionally) is unlocked,
 * released and cleared from @path.  When a node with a remaining slot is
 * found its level is stored in @level and 0 is returned; 1 is returned once
 * the whole path has been consumed.
 */
static int walk_up_tree(struct btrfs_path *path, int *level)
{
        int cur = 0;

        while (cur < BTRFS_MAX_LEVEL) {
                struct extent_buffer *eb = path->nodes[cur];

                if (eb) {
                        /* Leaves have no slots to advance, only nodes do. */
                        if (cur > 0 &&
                            ++path->slots[cur] < btrfs_header_nritems(eb)) {
                                *level = cur;
                                return 0;
                        }

                        btrfs_tree_unlock_rw(eb, path->locks[cur]);
                        free_extent_buffer(eb);
                        path->nodes[cur] = NULL;
                        path->slots[cur] = 0;
                        path->locks[cur] = 0;
                }
                cur++;
        }

        return 1;
}

static void dump_ref_action(struct btrfs_fs_info *fs_info,
                            struct ref_action *ra)
{
        btrfs_err(fs_info,
"  Ref action %d, root %llu, ref_root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu",
                  ra->action, ra->root, ra->ref.root_objectid, ra->ref.parent,
                  ra->ref.owner, ra->ref.offset, ra->ref.num_refs);
        __print_stack_trace(fs_info, ra);
}

/*
 * Dumps all the information from the block entry to printk, it's going to be
 * awesome.
 */
static void dump_block_entry(struct btrfs_fs_info *fs_info,
                             struct block_entry *be)
{
        struct ref_entry *ref;
        struct root_entry *re;
        struct ref_action *ra;
        struct rb_node *n;

        btrfs_err(fs_info,
"dumping block entry [%llu %llu], num_refs %llu, metadata %d, from disk %d",
                  be->bytenr, be->len, be->num_refs, be->metadata,
                  be->from_disk);

        for (n = rb_first(&be->refs); n; n = rb_next(n)) {
                ref = rb_entry(n, struct ref_entry, node);
                btrfs_err(fs_info,
"  ref root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu",
                          ref->root_objectid, ref->parent, ref->owner,
                          ref->offset, ref->num_refs);
        }

        for (n = rb_first(&be->roots); n; n = rb_next(n)) {
                re = rb_entry(n, struct root_entry, node);
                btrfs_err(fs_info, "  root entry %llu, num_refs %llu",
                          re->root_objectid, re->num_refs);
        }

        list_for_each_entry(ra, &be->actions, list)
                dump_ref_action(fs_info, ra);
}

/*
 * Called when we modify a ref for a bytenr.
 *
 * This will add an action item to the given bytenr and do sanity checks to make
 * sure we haven't messed something up.  If we are making a new allocation and
 * this block entry has history we will delete all previous actions as long as
 * our sanity checks pass as they are no longer needed.
 *
 * Returns 0 on success or when REF_VERIFY is not enabled.  On any failure
 * (-ENOMEM, the error from add_block_entry(), or -EINVAL for a failed sanity
 * check) the whole ref cache is freed and the REF_VERIFY mount option is
 * cleared before the error is returned.
 */
int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
                       struct btrfs_ref *generic_ref)
{
        struct ref_entry *ref = NULL, *exist;
        struct ref_action *ra = NULL;
        struct block_entry *be = NULL;
        struct root_entry *re = NULL;
        int action = generic_ref->action;
        int ret = 0;
        bool metadata;
        u64 bytenr = generic_ref->bytenr;
        u64 num_bytes = generic_ref->num_bytes;
        u64 parent = generic_ref->parent;
        u64 ref_root = 0;
        u64 owner = 0;
        u64 offset = 0;

        if (!btrfs_test_opt(fs_info, REF_VERIFY))
                return 0;

        if (generic_ref->type == BTRFS_REF_METADATA) {
                if (!parent)
                        ref_root = generic_ref->ref_root;
                owner = generic_ref->tree_ref.level;
        } else if (!parent) {
                ref_root = generic_ref->ref_root;
                owner = generic_ref->data_ref.objectid;
                offset = generic_ref->data_ref.offset;
        }
        metadata = owner < BTRFS_FIRST_FREE_OBJECTID;

        ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
        ra = kmalloc(sizeof(struct ref_action), GFP_NOFS);
        if (!ra || !ref) {
                kfree(ref);
                kfree(ra);
                ret = -ENOMEM;
                goto out;
        }

        ref->parent = parent;
        ref->owner = owner;
        ref->root_objectid = ref_root;
        ref->offset = offset;
        ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1;

        memcpy(&ra->ref, ref, sizeof(struct ref_entry));
        /*
         * Save the extra info from the delayed ref in the ref action to make it
         * easier to figure out what is happening.  The real ref's we add to the
         * ref tree need to reflect what we save on disk so it matches any
         * on-disk refs we pre-loaded.
         */
        ra->ref.owner = owner;
        ra->ref.offset = offset;
        ra->ref.root_objectid = ref_root;
        __save_stack_trace(ra);

        INIT_LIST_HEAD(&ra->list);
        ra->action = action;
        ra->root = generic_ref->real_root;

        /*
         * This is an allocation, preallocate the block_entry in case we haven't
         * used it before.
         */
        ret = -EINVAL;
        if (action == BTRFS_ADD_DELAYED_EXTENT) {
                /*
                 * For subvol_create we'll just pass in whatever the parent root
                 * is and the new root objectid, so let's not treat the passed
                 * in root as if it really has a ref for this bytenr.
                 *
                 * NOTE(review): on success add_block_entry() is expected to
                 * return with fs_info->ref_verify_lock held, since every path
                 * below leaves through out_unlock - confirm against its
                 * definition.
                 */
                be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
                if (IS_ERR(be)) {
                        kfree(ref);
                        kfree(ra);
                        ret = PTR_ERR(be);
                        goto out;
                }
                be->num_refs++;
                if (metadata)
                        be->metadata = 1;

                if (be->num_refs != 1) {
                        btrfs_err(fs_info,
                        "re-allocated a block that still has references to it!");
                        dump_block_entry(fs_info, be);
                        dump_ref_action(fs_info, ra);
                        kfree(ref);
                        kfree(ra);
                        goto out_unlock;
                }

                /* Fresh allocation: the old history is of no use anymore. */
                while (!list_empty(&be->actions)) {
                        struct ref_action *tmp;

                        tmp = list_first_entry(&be->actions, struct ref_action,
                                               list);
                        list_del(&tmp->list);
                        kfree(tmp);
                }
        } else {
                struct root_entry *tmp;

                if (!parent) {
                        re = kmalloc(sizeof(struct root_entry), GFP_NOFS);
                        if (!re) {
                                kfree(ref);
                                kfree(ra);
                                ret = -ENOMEM;
                                goto out;
                        }
                        /*
                         * This is the root that is modifying us, so it's the
                         * one we want to lookup below when we modify the
                         * re->num_refs.
                         */
                        ref_root = generic_ref->real_root;
                        re->root_objectid = generic_ref->real_root;
                        re->num_refs = 0;
                }

                spin_lock(&fs_info->ref_verify_lock);
                be = lookup_block_entry(&fs_info->block_tree, bytenr);
                if (!be) {
                        btrfs_err(fs_info,
"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!",
                                  action, bytenr, num_bytes);
                        dump_ref_action(fs_info, ra);
                        kfree(ref);
                        kfree(ra);
                        kfree(re);
                        goto out_unlock;
                } else if (be->num_refs == 0) {
                        btrfs_err(fs_info,
                "trying to do action %d for a bytenr that has 0 total references",
                                action);
                        dump_block_entry(fs_info, be);
                        dump_ref_action(fs_info, ra);
                        kfree(ref);
                        kfree(ra);
                        kfree(re);
                        goto out_unlock;
                }

                if (!parent) {
                        /* Reuse the root entry already tracked, if any. */
                        tmp = insert_root_entry(&be->roots, re);
                        if (tmp) {
                                kfree(re);
                                re = tmp;
                        }
                }
        }

        exist = insert_ref_entry(&be->refs, ref);
        if (exist) {
                /* An equal ref is already tracked; ours was not inserted. */
                if (action == BTRFS_DROP_DELAYED_REF) {
                        if (exist->num_refs == 0) {
                                btrfs_err(fs_info,
"dropping a ref for a existing root that doesn't have a ref on the block");
                                dump_block_entry(fs_info, be);
                                dump_ref_action(fs_info, ra);
                                kfree(ref);
                                kfree(ra);
                                goto out_unlock;
                        }
                        exist->num_refs--;
                        if (exist->num_refs == 0) {
                                rb_erase(&exist->node, &be->refs);
                                kfree(exist);
                        }
                } else if (!be->metadata) {
                        exist->num_refs++;
                } else {
                        btrfs_err(fs_info,
"attempting to add another ref for an existing ref on a tree block");
                        dump_block_entry(fs_info, be);
                        dump_ref_action(fs_info, ra);
                        kfree(ref);
                        kfree(ra);
                        goto out_unlock;
                }
                kfree(ref);
        } else {
                if (action == BTRFS_DROP_DELAYED_REF) {
                        btrfs_err(fs_info,
"dropping a ref for a root that doesn't have a ref on the block");
                        dump_block_entry(fs_info, be);
                        dump_ref_action(fs_info, ra);
                        /*
                         * The ref was just linked into be->refs.  It must be
                         * unlinked before being freed, otherwise the
                         * btrfs_free_ref_cache() call in the error path below
                         * would walk over (and free again) released memory.
                         */
                        rb_erase(&ref->node, &be->refs);
                        kfree(ref);
                        kfree(ra);
                        goto out_unlock;
                }
        }

        if (!parent && !re) {
                re = lookup_root_entry(&be->roots, ref_root);
                if (!re) {
                        /*
                         * This shouldn't happen because we will add our re
                         * above when we lookup the be with !parent, but just in
                         * case catch this case so we don't panic because I
                         * didn't think of some other corner case.
                         */
                        btrfs_err(fs_info, "failed to find root %llu for %llu",
                                  generic_ref->real_root, be->bytenr);
                        dump_block_entry(fs_info, be);
                        dump_ref_action(fs_info, ra);
                        kfree(ra);
                        goto out_unlock;
                }
        }
        if (action == BTRFS_DROP_DELAYED_REF) {
                if (re)
                        re->num_refs--;
                be->num_refs--;
        } else if (action == BTRFS_ADD_DELAYED_REF) {
                be->num_refs++;
                if (re)
                        re->num_refs++;
        }
        /* The action is now owned by the block entry's history list. */
        list_add_tail(&ra->list, &be->actions);
        ret = 0;
out_unlock:
        spin_unlock(&fs_info->ref_verify_lock);
out:
        if (ret) {
                /* Free the cache first: it bails out once the opt is clear. */
                btrfs_free_ref_cache(fs_info);
                btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
        }
        return ret;
}

/*
 * Drop every block entry from the ref-verify block tree.
 *
 * Does nothing unless the REF_VERIFY mount option is set.  The tree is emptied
 * under fs_info->ref_verify_lock, giving the scheduler a chance to run after
 * each entry.
 */
void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info)
{
        struct rb_root *tree = &fs_info->block_tree;
        struct rb_node *first;

        if (!btrfs_test_opt(fs_info, REF_VERIFY))
                return;

        spin_lock(&fs_info->ref_verify_lock);
        for (first = rb_first(tree); first; first = rb_first(tree)) {
                struct block_entry *victim;

                victim = rb_entry(first, struct block_entry, node);
                rb_erase(first, tree);
                free_block_entry(victim);
                cond_resched_lock(&fs_info->ref_verify_lock);
        }
        spin_unlock(&fs_info->ref_verify_lock);
}

/*
 * Remove from the ref-verify block tree every block entry whose bytenr lies
 * in [@start, @start + @len).
 *
 * Entries that straddle either end of the range are reported with
 * btrfs_err() and dumped; one that starts before @start is kept, one that
 * starts inside the range but extends past its end is still freed.  Does
 * nothing unless the REF_VERIFY mount option is set.
 */
void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
                               u64 len)
{
        struct block_entry *be = NULL, *entry;
        struct rb_node *n;

        if (!btrfs_test_opt(fs_info, REF_VERIFY))
                return;

        spin_lock(&fs_info->ref_verify_lock);
        /*
         * Binary search for an entry at @start; if there is no exact match,
         * remember a nearby entry to begin the in-order scan from.
         */
        n = fs_info->block_tree.rb_node;
        while (n) {
                entry = rb_entry(n, struct block_entry, node);
                if (entry->bytenr < start) {
                        n = n->rb_right;
                } else if (entry->bytenr > start) {
                        n = n->rb_left;
                } else {
                        be = entry;
                        break;
                }
                /* We want to get as close to start as possible */
                if (be == NULL ||
                    (entry->bytenr < start && be->bytenr > start) ||
                    (entry->bytenr < start && entry->bytenr > be->bytenr))
                        be = entry;
        }

        /*
         * Could have an empty block group, maybe have something to check for
         * this case to verify we were actually empty?
         */
        if (!be) {
                spin_unlock(&fs_info->ref_verify_lock);
                return;
        }

        n = &be->node;
        while (n) {
                be = rb_entry(n, struct block_entry, node);
                /* Fetch the successor first, @be may be erased below. */
                n = rb_next(n);
                /* Starts before the range but reaches into it: report only. */
                if (be->bytenr < start && be->bytenr + be->len > start) {
                        btrfs_err(fs_info,
                                "block entry overlaps a block group [%llu,%llu]!",
                                start, len);
                        dump_block_entry(fs_info, be);
                        continue;
                }
                /* Entirely before the range, keep scanning forward. */
                if (be->bytenr < start)
                        continue;
                /* First entry at or past the end of the range, we are done. */
                if (be->bytenr >= start + len)
                        break;
                /* Starts inside the range but runs past its end. */
                if (be->bytenr + be->len > start + len) {
                        btrfs_err(fs_info,
                                "block entry overlaps a block group [%llu,%llu]!",
                                start, len);
                        dump_block_entry(fs_info, be);
                }
                rb_erase(&be->node, &fs_info->block_tree);
                free_block_entry(be);
        }
        spin_unlock(&fs_info->ref_verify_lock);
}

/*
 * Walk the extent tree and build the ref tree from what is on disk, meant to
 * be called at mount.
 *
 * Returns 0 when REF_VERIFY is not enabled, when the extent root is not
 * available (ref-verify is then disabled) or when the walk completed.  On
 * failure the ref cache is freed, REF_VERIFY is cleared and the error is
 * returned.
 */
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *extent_root;
        struct btrfs_path *path;
        struct extent_buffer *eb;
        int tree_block_level = 0;
        u64 bytenr = 0, num_bytes = 0;
        int ret, level;

        if (!btrfs_test_opt(fs_info, REF_VERIFY))
                return 0;

        extent_root = btrfs_extent_root(fs_info, 0);
        /*
         * Without an extent root there is nothing to verify against, so turn
         * the feature off instead of dereferencing a NULL root below.
         */
        if (!extent_root) {
                btrfs_warn(fs_info,
                           "ref-verify: extent tree not available, disabling");
                btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
                return 0;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        eb = btrfs_read_lock_root_node(extent_root);
        level = btrfs_header_level(eb);
        path->nodes[level] = eb;
        path->slots[level] = 0;
        path->locks[level] = BTRFS_READ_LOCK;

        while (1) {
                /*
                 * We have to keep track of the bytenr/num_bytes we last hit
                 * because we could have run out of space for an inline ref, and
                 * would have had to added a ref key item which may appear on a
                 * different leaf from the original extent item.
                 */
                ret = walk_down_tree(extent_root, path, level,
                                     &bytenr, &num_bytes, &tree_block_level);
                if (ret)
                        break;
                ret = walk_up_tree(path, &level);
                if (ret < 0)
                        break;
                /* A positive return means the whole tree has been visited. */
                if (ret > 0) {
                        ret = 0;
                        break;
                }
        }
        if (ret) {
                /* Free the cache first: it bails out once the opt is clear. */
                btrfs_free_ref_cache(fs_info);
                btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
        }
        btrfs_free_path(path);
        return ret;
}






































    2 












































































    1 




















































    1 


    1 


































    1 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Events for filesystem locks
 *
 * Copyright 2013 Jeff Layton <jlayton@poochiereds.net>
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM filelock

#if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FILELOCK_H

#include <linux/tracepoint.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/kdev_t.h>

/*
 * Render a mask of FL_* lock flags as a "|"-separated list of flag names for
 * trace output.
 */
#define show_fl_flags(val)                                                \
        __print_flags(val, "|",                                         \
                { FL_POSIX,                "FL_POSIX" },                        \
                { FL_FLOCK,                "FL_FLOCK" },                        \
                { FL_DELEG,                "FL_DELEG" },                        \
                { FL_ACCESS,                "FL_ACCESS" },                        \
                { FL_EXISTS,                "FL_EXISTS" },                        \
                { FL_LEASE,                "FL_LEASE" },                        \
                { FL_CLOSE,                "FL_CLOSE" },                        \
                { FL_SLEEP,                "FL_SLEEP" },                        \
                { FL_DOWNGRADE_PENDING,        "FL_DOWNGRADE_PENDING" },        \
                { FL_UNLOCK_PENDING,        "FL_UNLOCK_PENDING" },                \
                { FL_OFDLCK,                "FL_OFDLCK" })

/* Render a lock type (F_RDLCK, F_WRLCK or F_UNLCK) by name for trace output. */
#define show_fl_type(val)                                \
        __print_symbolic(val,                                \
                        { F_RDLCK, "F_RDLCK" },                \
                        { F_WRLCK, "F_WRLCK" },                \
                        { F_UNLCK, "F_UNLCK" })

/*
 * locks_get_lock_context: records the inode (device and inode number), the
 * lock type passed in and the file_lock_context pointer.
 */
TRACE_EVENT(locks_get_lock_context,
        TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx),

        TP_ARGS(inode, type, ctx),

        TP_STRUCT__entry(
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(unsigned char, type)
                __field(struct file_lock_context *, ctx)
        ),

        TP_fast_assign(
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->type = type;
                __entry->ctx = ctx;
        ),

        TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p",
                  MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                  __entry->i_ino, show_fl_type(__entry->type), __entry->ctx)
);

/*
 * Event class for operations on a struct file_lock: records the inode, the
 * lock's blocker, owner, pid, flags, type and byte range, plus the result
 * code of the operation.
 *
 * @fl may be NULL, in which case every lock-derived field is recorded as
 * NULL/0.  @inode is always dereferenced.
 */
DECLARE_EVENT_CLASS(filelock_lock,
        TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),

        TP_ARGS(inode, fl, ret),

        TP_STRUCT__entry(
                __field(struct file_lock *, fl)
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(struct file_lock_core *, blocker)
                __field(fl_owner_t, owner)
                __field(unsigned int, pid)
                __field(unsigned int, flags)
                __field(unsigned char, type)
                __field(loff_t, fl_start)
                __field(loff_t, fl_end)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->fl = fl ? fl : NULL;
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->blocker = fl ? fl->c.flc_blocker : NULL;
                __entry->owner = fl ? fl->c.flc_owner : NULL;
                __entry->pid = fl ? fl->c.flc_pid : 0;
                __entry->flags = fl ? fl->c.flc_flags : 0;
                __entry->type = fl ? fl->c.flc_type : 0;
                __entry->fl_start = fl ? fl->fl_start : 0;
                __entry->fl_end = fl ? fl->fl_end : 0;
                __entry->ret = ret;
        ),

        TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d",
                __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino, __entry->blocker, __entry->owner,
                __entry->pid, show_fl_flags(__entry->flags),
                show_fl_type(__entry->type),
                __entry->fl_start, __entry->fl_end, __entry->ret)
);

/*
 * Events of the filelock_lock class; all share its prototype and output
 * format and differ only by name.
 */
DEFINE_EVENT(filelock_lock, posix_lock_inode,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

DEFINE_EVENT(filelock_lock, fcntl_setlk,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

DEFINE_EVENT(filelock_lock, locks_remove_posix,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

DEFINE_EVENT(filelock_lock, flock_lock_inode,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

/*
 * Event class for operations on a struct file_lease: records the inode, the
 * lease's blocker, owner, flags and type, and its break and downgrade times.
 *
 * @fl may be NULL, in which case every lease-derived field is recorded as
 * NULL/0.  @inode is always dereferenced.
 */
DECLARE_EVENT_CLASS(filelock_lease,
        TP_PROTO(struct inode *inode, struct file_lease *fl),

        TP_ARGS(inode, fl),

        TP_STRUCT__entry(
                __field(struct file_lease *, fl)
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(struct file_lock_core *, blocker)
                __field(fl_owner_t, owner)
                __field(unsigned int, flags)
                __field(unsigned char, type)
                __field(unsigned long, break_time)
                __field(unsigned long, downgrade_time)
        ),

        TP_fast_assign(
                __entry->fl = fl ? fl : NULL;
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->blocker = fl ? fl->c.flc_blocker : NULL;
                __entry->owner = fl ? fl->c.flc_owner : NULL;
                __entry->flags = fl ? fl->c.flc_flags : 0;
                __entry->type = fl ? fl->c.flc_type : 0;
                __entry->break_time = fl ? fl->fl_break_time : 0;
                __entry->downgrade_time = fl ? fl->fl_downgrade_time : 0;
        ),

        TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu",
                __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino, __entry->blocker, __entry->owner,
                show_fl_flags(__entry->flags),
                show_fl_type(__entry->type),
                __entry->break_time, __entry->downgrade_time)
);

/*
 * Events of the filelock_lease class; all share its prototype and output
 * format and differ only by name.
 */
DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

/*
 * generic_add_lease: records the inode together with snapshots of its
 * i_writecount, i_readcount and i_count counters, and the lease's owner,
 * flags and type.
 *
 * Unlike the filelock_lease class, @fl is dereferenced unconditionally here,
 * so it must not be NULL.
 */
TRACE_EVENT(generic_add_lease,
        TP_PROTO(struct inode *inode, struct file_lease *fl),

        TP_ARGS(inode, fl),

        TP_STRUCT__entry(
                __field(unsigned long, i_ino)
                __field(int, wcount)
                __field(int, rcount)
                __field(int, icount)
                __field(dev_t, s_dev)
                __field(fl_owner_t, owner)
                __field(unsigned int, flags)
                __field(unsigned char, type)
        ),

        TP_fast_assign(
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                __entry->wcount = atomic_read(&inode->i_writecount);
                __entry->rcount = atomic_read(&inode->i_readcount);
                __entry->icount = atomic_read(&inode->i_count);
                __entry->owner = fl->c.flc_owner;
                __entry->flags = fl->c.flc_flags;
                __entry->type = fl->c.flc_type;
        ),

        TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s",
                MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino, __entry->wcount, __entry->rcount,
                __entry->icount, __entry->owner,
                show_fl_flags(__entry->flags),
                show_fl_type(__entry->type))
);

/*
 * leases_conflict: records the outcome of a conflict check between an
 * existing lease and a breaker, with the pointer, flags and type of each.
 *
 * Both @lease and @breaker are dereferenced unconditionally, so neither may
 * be NULL.
 */
TRACE_EVENT(leases_conflict,
        TP_PROTO(bool conflict, struct file_lease *lease, struct file_lease *breaker),

        TP_ARGS(conflict, lease, breaker),

        TP_STRUCT__entry(
                __field(void *, lease)
                __field(void *, breaker)
                __field(unsigned int, l_fl_flags)
                __field(unsigned int, b_fl_flags)
                __field(unsigned char, l_fl_type)
                __field(unsigned char, b_fl_type)
                __field(bool, conflict)
        ),

        TP_fast_assign(
                __entry->lease = lease;
                __entry->l_fl_flags = lease->c.flc_flags;
                __entry->l_fl_type = lease->c.flc_type;
                __entry->breaker = breaker;
                __entry->b_fl_flags = breaker->c.flc_flags;
                __entry->b_fl_type = breaker->c.flc_type;
                __entry->conflict = conflict;
        ),

        TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s",
                __entry->conflict,
                __entry->lease,
                show_fl_flags(__entry->l_fl_flags),
                show_fl_type(__entry->l_fl_type),
                __entry->breaker,
                show_fl_flags(__entry->b_fl_flags),
                show_fl_type(__entry->b_fl_type))
);

#endif /* _TRACE_FILELOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>







































































































































































































































































































































































































































































































































































































































































































































































































































































    2 







    2 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile conversion, space
 * compaction, and the balance routine that spreads chunks over devices.
 *
 *                 Before                |                After
 * ------------------------------------------------------------------
 *  BG A: 10 data extents        | BG A: deleted
 *  BG B:  2 data extents        | BG B: 10 data extents (2 old + 8 relocated)
 *  BG C:  1 data extent         | BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in target block group will get its
 *      reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is relocate_block_group() function.
 */

/*
 * Number of tree nodes reserved for relocation.
 * NOTE(review): presumably multiplied by the node size to size a metadata
 * reservation; the use site is outside this chunk - confirm.
 */
#define RELOCATION_RESERVED_NODES        256
/*
 * Maps the address of a tree root to its tree.
 *
 * The leading anonymous struct carries the rb-tree linkage and the bytenr it
 * is searched by; @data is an opaque pointer to the mapped object.
 */
struct mapping_node {
        struct {
                struct rb_node rb_node;
                u64 bytenr;
        }; /* Use rb_simple_node for search/insert */
        void *data;
};

/*
 * Red-black tree root paired with a spinlock.
 * NOTE(review): the lock presumably serializes access to rb_root; the locking
 * sites are outside this chunk - confirm.
 */
struct mapping_tree {
        struct rb_root rb_root;
        spinlock_t lock;
};

/*
 * Represents a tree block to process.
 *
 * The leading anonymous struct carries the rb-tree linkage and the block's
 * bytenr, which is what the rb_simple_* helpers search and insert by.
 */
struct tree_block {
        struct {
                struct rb_node rb_node;
                u64 bytenr;
        }; /* Use rb_simple_node for search/insert */
        /* NOTE(review): presumably the objectid of the owning root - confirm */
        u64 owner;
        /* Key associated with the block; meaningful only once key_ready is set (presumably) */
        struct btrfs_key key;
        /* Level of the block within its tree */
        u8 level;
        /* NOTE(review): looks like a "key has been filled in" flag - confirm */
        bool key_ready;
};

/* Capacity of file_extent_cluster::boundary[] */
#define MAX_EXTENTS 128

/*
 * A group of file extents handled together.
 * NOTE(review): the field meanings below are inferred from names and types
 * only; the code that fills them in is outside this chunk - confirm.
 */
struct file_extent_cluster {
        /* presumably the first and last byte covered by the cluster */
        u64 start;
        u64 end;
        /* per-extent boundary values, at most MAX_EXTENTS of them */
        u64 boundary[MAX_EXTENTS];
        /* presumably the number of boundary[] entries in use */
        unsigned int nr;
        /* presumably the root that owns the clustered extents */
        u64 owning_root;
};

/*
 * Stages of data relocation.
 *
 * The current stage is kept in reloc_control::stage.
 */
enum reloc_stage {
        /* NOTE(review): by its name, the stage that moves the data extents - confirm */
        MOVE_DATA_EXTENTS,
        /* NOTE(review): by its name, the stage that updates pointers to the data - confirm */
        UPDATE_DATA_PTRS
};

/*
 * State of one block group relocation.
 *
 * Collects the block group being relocated, the trees and caches used while
 * relocating it, and the bookkeeping counters and flags for the run.
 */
struct reloc_control {
        /* block group to relocate */
        struct btrfs_block_group *block_group;
        /* extent tree */
        struct btrfs_root *extent_root;
        /* inode for moving data */
        struct inode *data_inode;

        /* NOTE(review): presumably the metadata reservation for the run - confirm */
        struct btrfs_block_rsv *block_rsv;

        /* cache of backref nodes (see update_backref_node()) */
        struct btrfs_backref_cache backref_cache;

        /* file extents currently grouped for processing */
        struct file_extent_cluster cluster;
        /* tree blocks have been processed */
        struct extent_io_tree processed_blocks;
        /* map start of tree root to corresponding reloc tree */
        struct mapping_tree reloc_root_tree;
        /* list of reloc trees */
        struct list_head reloc_roots;
        /* list of subvolume trees that get relocated */
        struct list_head dirty_subvol_roots;
        /* size of metadata reservation for merging reloc trees */
        u64 merging_rsv_size;
        /* size of relocated tree nodes */
        u64 nodes_relocated;
        /* reserved size for block group relocation */
        u64 reserved_bytes;

        /* NOTE(review): presumably where the next extent search starts - confirm */
        u64 search_start;
        /* NOTE(review): presumably a count of extents found so far - confirm */
        u64 extents_found;

        /* current stage of data relocation, see enum reloc_stage */
        enum reloc_stage stage;
        /* NOTE(review): the three flags below are set outside this chunk - confirm meaning */
        bool create_reloc_tree;
        bool merge_reloc_tree;
        bool found_file_extent;
};

static void mark_block_processed(struct reloc_control *rc,
                                 struct btrfs_backref_node *node)
{
        u32 blocksize;

        if (node->level == 0 ||
            in_range(node->bytenr, rc->block_group->start,
                     rc->block_group->length)) {
                blocksize = rc->extent_root->fs_info->nodesize;
                set_extent_bit(&rc->processed_blocks, node->bytenr,
                               node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL);
        }
        node->processed = 1;
}

/*
 * Climb from @node through the first edge of every upper list until a node
 * with no upper edges is reached.
 *
 * Each edge followed is appended to @edges starting at slot *@index, and
 * *@index is advanced past the last one stored.  Returns the topmost node,
 * which must not be detached.
 */
static struct btrfs_backref_node *walk_up_backref(
                struct btrfs_backref_node *node,
                struct btrfs_backref_edge *edges[], int *index)
{
        int pos = *index;

        for (;;) {
                struct btrfs_backref_edge *up;

                if (list_empty(&node->upper))
                        break;

                up = list_entry(node->upper.next,
                                struct btrfs_backref_edge, list[LOWER]);
                edges[pos] = up;
                pos++;
                node = up->node[UPPER];
        }

        BUG_ON(node->detached);
        *index = pos;
        return node;
}

/*
 * Step back down the recorded edge path to find where the next reference
 * path starts.
 *
 * Working from the deepest recorded edge, entries that are the last edge in
 * their lower node's upper list are dropped.  The first entry that has a
 * following sibling is replaced by that sibling, *@index is set to the
 * remaining path length, and the sibling's upper node is returned.  If no
 * entry has a sibling left, *@index is set to 0 and NULL is returned.
 */
static struct btrfs_backref_node *walk_down_backref(
                struct btrfs_backref_edge *edges[], int *index)
{
        int pos;

        for (pos = *index; pos > 0; pos--) {
                struct btrfs_backref_edge *cur = edges[pos - 1];
                struct btrfs_backref_node *child = cur->node[LOWER];

                /* No further edge out of this child: pop it and keep descending. */
                if (list_is_last(&cur->list[LOWER], &child->upper))
                        continue;

                cur = list_entry(cur->list[LOWER].next,
                                 struct btrfs_backref_edge, list[LOWER]);
                edges[pos - 1] = cur;
                *index = pos;
                return cur->node[UPPER];
        }

        *index = 0;
        return NULL;
}

/*
 * Re-key @node in the backref cache: remove it from the rb tree, switch its
 * bytenr to @bytenr and insert it again. A collision with an existing entry
 * is reported through btrfs_backref_panic().
 */
static void update_backref_node(struct btrfs_backref_cache *cache,
                                struct btrfs_backref_node *node, u64 bytenr)
{
        struct rb_node *existing;

        rb_erase(&node->rb_node, &cache->rb_root);
        node->bytenr = bytenr;

        existing = rb_simple_insert(&cache->rb_root, bytenr, &node->rb_node);
        if (existing)
                btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
}

/*
 * Update the backref cache after a transaction commit.
 *
 * Return 0 when nothing had to be refreshed: either the cache was not bound
 * to a transaction yet (it gets bound to @trans here) or it already belongs
 * to @trans. Return 1 after the cache was refreshed; cache->last_trans is
 * reset to 0 in that case.
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
                                struct btrfs_backref_cache *cache)
{
        struct btrfs_backref_node *cur;
        int lvl;

        if (!cache->last_trans) {
                cache->last_trans = trans->transid;
                return 0;
        }
        if (trans->transid == cache->last_trans)
                return 0;

        /*
         * Detached nodes only exist to avoid unnecessary backref lookups.
         * A transaction commit changes the extent tree, so they are no
         * longer useful.
         */
        while (!list_empty(&cache->detached)) {
                cur = list_first_entry(&cache->detached,
                                       struct btrfs_backref_node, list);
                btrfs_backref_cleanup_node(cache, cur);
        }

        /* Move every changed node over to its new bytenr. */
        while (!list_empty(&cache->changed)) {
                cur = list_first_entry(&cache->changed,
                                       struct btrfs_backref_node, list);
                list_del_init(&cur->list);
                BUG_ON(cur->pending);
                update_backref_node(cache, cur, cur->new_bytenr);
        }

        /*
         * Nodes can be left on the pending lists if there were errors while
         * the pending nodes were processed.
         */
        for (lvl = 0; lvl < BTRFS_MAX_LEVEL; lvl++) {
                list_for_each_entry(cur, &cache->pending[lvl], list) {
                        BUG_ON(!cur->pending);
                        if (cur->bytenr != cur->new_bytenr)
                                update_backref_node(cache, cur,
                                                    cur->new_bytenr);
                }
        }

        cache->last_trans = 0;
        return 1;
}

/*
 * Report whether BTRFS_ROOT_DEAD_RELOC_TREE is set on @root.
 */
static bool reloc_root_is_dead(const struct btrfs_root *root)
{
        /*
         * Pairs with the set_bit/clear_bit in clean_dirty_subvols and
         * btrfs_update_reloc_root. The updated bit must be visible before
         * anybody tries to access reloc_root.
         */
        smp_rmb();
        return test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after the swap is dead and therefore not counted as valid.
 * Most callers do not need to tell a dead reloc root apart from a missing
 * one, so this is enough for them. btrfs_should_ignore_reloc_root() below is
 * the special case that does.
 */
static bool have_reloc_root(const struct btrfs_root *root)
{
        /* The dead check (and its barrier) must come before the pointer read. */
        return !reloc_root_is_dead(root) && root->reloc_root;
}

/*
 * Decide whether the backref node for the fs tree root @root can be ignored
 * by relocation.
 *
 * Return false for roots that are not shareable, that have no reloc tree, or
 * whose reloc tree commit root carries the generation of the running
 * transaction. Return true when the reloc tree is dead (already merged) or
 * was created in a previous transaction.
 */
bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
{
        struct btrfs_root *reloc_root;

        if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
                return false;

        /* This root has been merged with its reloc tree, we can ignore it */
        if (reloc_root_is_dead(root))
                return true;

        reloc_root = root->reloc_root;
        if (!reloc_root)
                return false;

        /*
         * If there is a reloc tree and it was created in a previous
         * transaction, backref lookup can find the reloc tree, so the
         * backref node for the fs tree root is useless for relocation.
         */
        return btrfs_header_generation(reloc_root->commit_root) !=
               root->fs_info->running_transaction->transid;
}

/*
 * Find the reloc tree by the address of its tree root.
 *
 * Looks @bytenr up in the reloc root mapping tree of the active relocation
 * control and returns whatever btrfs_grab_root() yields for the root stored
 * there (or for NULL when nothing is mapped at @bytenr).
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
        struct reloc_control *rc = fs_info->reloc_ctl;
        struct btrfs_root *found = NULL;
        struct mapping_node *mapping;
        struct rb_node *hit;

        ASSERT(rc);

        spin_lock(&rc->reloc_root_tree.lock);
        hit = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
        if (hit) {
                mapping = rb_entry(hit, struct mapping_node, rb_node);
                found = mapping->data;
        }
        spin_unlock(&rc->reloc_root_tree.lock);

        return btrfs_grab_root(found);
}

/*
 * Drain the backref cache's useless_node list. For every useless node, do
 * two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   Every lower edge is unlinked and freed. If a child node becomes an
 *   orphan (no parent left) during cleanup, it is queued on the same list
 *   and cleaned up as well.
 *
 * - Freeing up leaves (level 0), keeps nodes detached
 *   Level 0 nodes are erased from the rb tree and freed. Higher level nodes
 *   stay cached on the "detached" list.
 *
 * Every node handled here is also passed to mark_block_processed().
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list (including the case
 * where it was queued as an orphaned child during the cleanup).
 */
static bool handle_useless_nodes(struct reloc_control *rc,
                                 struct btrfs_backref_node *node)
{
        struct btrfs_backref_cache *cache = &rc->backref_cache;
        struct list_head *useless_node = &cache->useless_node;
        bool ret = false;

        /* The list may grow while it is processed, so loop until empty. */
        while (!list_empty(useless_node)) {
                struct btrfs_backref_node *cur;

                cur = list_first_entry(useless_node, struct btrfs_backref_node,
                                 list);
                list_del_init(&cur->list);

                /* Only tree root nodes can be added to @useless_nodes */
                ASSERT(list_empty(&cur->upper));

                if (cur == node)
                        ret = true;

                /* The node is the lowest node */
                if (cur->lowest) {
                        list_del_init(&cur->lower);
                        cur->lowest = 0;
                }

                /* Cleanup the lower edges */
                while (!list_empty(&cur->lower)) {
                        struct btrfs_backref_edge *edge;
                        struct btrfs_backref_node *lower;

                        edge = list_entry(cur->lower.next,
                                        struct btrfs_backref_edge, list[UPPER]);
                        /* Unlink the edge from both of its endpoints. */
                        list_del(&edge->list[UPPER]);
                        list_del(&edge->list[LOWER]);
                        /* Remember the child before the edge is freed. */
                        lower = edge->node[LOWER];
                        btrfs_backref_free_edge(cache, edge);

                        /* Child node is also orphan, queue for cleanup */
                        if (list_empty(&lower->upper))
                                list_add(&lower->list, useless_node);
                }
                /* Mark this block processed for relocation */
                mark_block_processed(rc, cur);

                /*
                 * Backref nodes for tree leaves are deleted from the cache.
                 * Backref nodes for upper level tree blocks are left in the
                 * cache to avoid unnecessary backref lookup.
                 */
                if (cur->level > 0) {
                        list_add(&cur->list, &cache->detached);
                        cur->detached = 1;
                } else {
                        rb_erase(&cur->rb_node, &cache->rb_root);
                        btrfs_backref_free_node(cache, cur);
                }
        }
        return ret;
}

/*
 * Build backref tree for a given tree block. Root of the backref tree
 * corresponds the tree block, leaves of the backref tree correspond roots of
 * b-trees that reference the tree block.
 *
 * The basic idea of this function is check backrefs of a given block to find
 * upper level blocks that reference the block, and then check backrefs of
 * these upper level blocks recursively. The recursion stops when tree root is
 * reached or backrefs for the block is cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
 *
 * Return the backref node allocated for @bytenr at @level on success, NULL if
 * handle_useless_nodes() reports that this node ended up on the useless list,
 * or an ERR_PTR() on failure (-ENOMEM for allocation failures, otherwise the
 * negative value returned by btrfs_backref_add_tree_node() or
 * btrfs_backref_finish_upper_links()).
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
                        struct btrfs_trans_handle *trans,
                        struct reloc_control *rc, struct btrfs_key *node_key,
                        int level, u64 bytenr)
{
        struct btrfs_backref_iter *iter;
        struct btrfs_backref_cache *cache = &rc->backref_cache;
        /* For searching parent of TREE_BLOCK_REF */
        struct btrfs_path *path;
        struct btrfs_backref_node *cur;
        struct btrfs_backref_node *node = NULL;
        struct btrfs_backref_edge *edge;
        int ret;

        iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
        if (!iter)
                return ERR_PTR(-ENOMEM);
        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        node = btrfs_backref_alloc_node(cache, bytenr, level);
        if (!node) {
                ret = -ENOMEM;
                goto out;
        }

        /* The block we were asked about is the lowest node of the new tree. */
        node->lowest = 1;
        cur = node;

        /* Breadth-first search to build backref cache */
        do {
                ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
                                                  node_key, cur);
                if (ret < 0)
                        goto out;

                edge = list_first_entry_or_null(&cache->pending_edge,
                                struct btrfs_backref_edge, list[UPPER]);
                /*
                 * The pending list isn't empty, take the first block to
                 * process
                 */
                if (edge) {
                        list_del_init(&edge->list[UPPER]);
                        cur = edge->node[UPPER];
                }
        } while (edge);

        /* Finish the upper linkage of newly added edges/nodes */
        ret = btrfs_backref_finish_upper_links(cache, node);
        if (ret < 0)
                goto out;

        /* The node itself turned out to be useless, report that as NULL. */
        if (handle_useless_nodes(rc, node))
                node = NULL;
out:
        /* Common exit: the iterator and both paths are released either way. */
        btrfs_free_path(iter->path);
        kfree(iter);
        btrfs_free_path(path);
        if (ret) {
                /* @node is still NULL here if an early allocation failed. */
                btrfs_backref_error_cleanup(cache, node);
                return ERR_PTR(ret);
        }
        ASSERT(!node || !node->detached);
        ASSERT(list_empty(&cache->useless_node) &&
               list_empty(&cache->pending_edge));
        return node;
}

/*
 * Helper to add a backref node for a newly created snapshot.
 *
 * The new node is a clone of the cached backref node that corresponds to the
 * root of the source tree @src: it is looked up first by the start of @src's
 * commit root and, failing that (or if that node is detached), by the start
 * of the reloc root's commit root. The clone is keyed by the start of
 * @dest's root node and gets its own edges to the same lower nodes.
 *
 * Return 0 on success or when no source node is cached, -ENOMEM if a node or
 * an edge cannot be allocated.
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
                              struct reloc_control *rc,
                              const struct btrfs_root *src,
                              struct btrfs_root *dest)
{
        struct btrfs_backref_cache *cache = &rc->backref_cache;
        struct btrfs_root *reloc_root = src->reloc_root;
        struct btrfs_backref_node *orig = NULL;
        struct btrfs_backref_node *clone;
        struct btrfs_backref_edge *edge;
        struct btrfs_backref_edge *new_edge;
        struct rb_node *found;

        if (cache->last_trans > 0)
                update_backref_cache(trans, cache);

        /* First choice: the node keyed by the source tree's commit root. */
        found = rb_simple_search(&cache->rb_root, src->commit_root->start);
        if (found) {
                orig = rb_entry(found, struct btrfs_backref_node, rb_node);
                if (orig->detached)
                        orig = NULL;
                else
                        BUG_ON(orig->new_bytenr != reloc_root->node->start);
        }

        /* Fallback: the node keyed by the reloc tree's commit root. */
        if (!orig) {
                found = rb_simple_search(&cache->rb_root,
                                         reloc_root->commit_root->start);
                if (!found)
                        return 0;

                orig = rb_entry(found, struct btrfs_backref_node, rb_node);
                BUG_ON(orig->detached);
        }

        clone = btrfs_backref_alloc_node(cache, dest->node->start,
                                         orig->level);
        if (!clone)
                return -ENOMEM;

        clone->lowest = orig->lowest;
        clone->checked = 1;
        clone->root = btrfs_grab_root(dest);
        ASSERT(clone->root);

        if (orig->lowest) {
                list_add_tail(&clone->lower, &cache->leaves);
        } else {
                /* Give the clone its own edge to each child of the original. */
                list_for_each_entry(edge, &orig->lower, list[UPPER]) {
                        new_edge = btrfs_backref_alloc_edge(cache);
                        if (!new_edge)
                                goto fail;

                        btrfs_backref_link_edge(new_edge, edge->node[LOWER],
                                                clone, LINK_UPPER);
                }
        }

        found = rb_simple_insert(&cache->rb_root, clone->bytenr,
                                 &clone->rb_node);
        if (found)
                btrfs_backref_panic(trans->fs_info, clone->bytenr, -EEXIST);

        /* Only now hook the new edges into the children's upper lists. */
        if (!clone->lowest) {
                list_for_each_entry(new_edge, &clone->lower, list[UPPER]) {
                        list_add_tail(&new_edge->list[LOWER],
                                      &new_edge->node[LOWER]->upper);
                }
        }
        return 0;

fail:
        /* Drop the edges linked so far, then the half-built clone itself. */
        while (!list_empty(&clone->lower)) {
                new_edge = list_first_entry(&clone->lower,
                                            struct btrfs_backref_edge,
                                            list[UPPER]);
                list_del(&new_edge->list[UPPER]);
                btrfs_backref_free_edge(cache, new_edge);
        }
        btrfs_backref_free_node(cache, clone);
        return -ENOMEM;
}

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 *
 * Allocates a mapping node keyed by the start of @root's commit root,
 * inserts it into rc->reloc_root_tree and queues @root on rc->reloc_roots.
 *
 * Return 0 on success, -ENOMEM if the mapping node cannot be allocated, or
 * -EEXIST if a mapping for the same bytenr already exists. On failure no
 * mapping node stays allocated and @root is not added to rc->reloc_roots.
 */
static int __add_reloc_root(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct rb_node *rb_node;
        struct mapping_node *node;
        struct reloc_control *rc = fs_info->reloc_ctl;

        node = kmalloc(sizeof(*node), GFP_NOFS);
        if (!node)
                return -ENOMEM;

        node->bytenr = root->commit_root->start;
        node->data = root;

        spin_lock(&rc->reloc_root_tree.lock);
        rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
                                   node->bytenr, &node->rb_node);
        spin_unlock(&rc->reloc_root_tree.lock);
        if (rb_node) {
                btrfs_err(fs_info,
                            "Duplicate root found for start=%llu while inserting into relocation tree",
                            node->bytenr);
                /*
                 * The insert found an existing entry, so our node was never
                 * linked into the tree and nothing else references it. Free
                 * it here, otherwise it is leaked.
                 */
                kfree(node);
                return -EEXIST;
        }

        list_add_tail(&root->root_list, &rc->reloc_roots);
        return 0;
}

/*
 * Helper to delete the 'address of tree root -> reloc tree' mapping.
 *
 * The mapping node keyed by the start of @root's commit root is removed from
 * the reloc root tree (when relocation control and root->node exist) and
 * freed. If @root is still linked on a list via root_list it is unlinked and
 * one reference on it is dropped.
 */
static void __del_reloc_root(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct reloc_control *rc = fs_info->reloc_ctl;
        struct mapping_node *mapping = NULL;
        bool was_listed;

        if (rc && root->node) {
                struct rb_node *found;

                spin_lock(&rc->reloc_root_tree.lock);
                found = rb_simple_search(&rc->reloc_root_tree.rb_root,
                                         root->commit_root->start);
                if (found) {
                        mapping = rb_entry(found, struct mapping_node, rb_node);
                        rb_erase(&mapping->rb_node,
                                 &rc->reloc_root_tree.rb_root);
                        RB_CLEAR_NODE(&mapping->rb_node);
                }
                spin_unlock(&rc->reloc_root_tree.lock);
                ASSERT(!mapping || (struct btrfs_root *)mapping->data == root);
        }

        /*
         * The reloc root reference is only dropped here if the root is on
         * the list.  Many places splice rc->reloc_roots, process the reloc
         * roots and then add them back onto rc->reloc_roots.  If we get
         * called while the root is off of the list, the reference must stay,
         * because whoever is working on the list is in charge of it.
         */
        spin_lock(&fs_info->trans_lock);
        was_listed = !list_empty(&root->root_list);
        if (was_listed)
                list_del_init(&root->root_list);
        spin_unlock(&fs_info->trans_lock);

        if (was_listed)
                btrfs_put_root(root);
        kfree(mapping);
}

/*
 * Helper to update the 'address of tree root -> reloc tree' mapping.
 *
 * The mapping node keyed by the start of @root's commit root is taken out of
 * the reloc root tree and inserted again keyed by the start of root->node.
 * Nothing is done if no such mapping exists. Always returns 0; a key
 * collision on re-insert goes through btrfs_backref_panic().
 */
static int __update_reloc_root(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct reloc_control *rc = fs_info->reloc_ctl;
        struct mapping_node *mapping = NULL;
        struct rb_node *found;

        /* Step 1: unhook the mapping stored under the old address. */
        spin_lock(&rc->reloc_root_tree.lock);
        found = rb_simple_search(&rc->reloc_root_tree.rb_root,
                                 root->commit_root->start);
        if (found) {
                mapping = rb_entry(found, struct mapping_node, rb_node);
                rb_erase(&mapping->rb_node, &rc->reloc_root_tree.rb_root);
        }
        spin_unlock(&rc->reloc_root_tree.lock);

        if (!mapping)
                return 0;
        BUG_ON((struct btrfs_root *)mapping->data != root);

        /* Step 2: hook it back in under the new address. */
        spin_lock(&rc->reloc_root_tree.lock);
        mapping->bytenr = root->node->start;
        found = rb_simple_insert(&rc->reloc_root_tree.rb_root,
                                 mapping->bytenr, &mapping->rb_node);
        spin_unlock(&rc->reloc_root_tree.lock);
        if (found)
                btrfs_backref_panic(fs_info, mapping->bytenr, -EEXIST);
        return 0;
}

/*
 * Create a reloc tree root for @root.
 *
 * The new tree is keyed (BTRFS_TREE_RELOC_OBJECTID, BTRFS_ROOT_ITEM_KEY,
 * @objectid). Its root block is a copy made with btrfs_copy_root(): of
 * @root's commit root when @objectid equals @root's own id, of @root's
 * current node otherwise. A root item based on @root's root item is inserted
 * into the tree root and the resulting root is read back.
 *
 * Return the new reloc root with BTRFS_ROOT_SHAREABLE set and last_trans set
 * to the current transaction, or an ERR_PTR() on failure. Once the root
 * block has been copied, any failure also aborts the transaction.
 */
static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root, u64 objectid)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_root *reloc_root;
        struct extent_buffer *eb;
        struct btrfs_root_item *root_item;
        struct btrfs_key root_key;
        int ret = 0;
        bool must_abort = false;

        root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
        if (!root_item)
                return ERR_PTR(-ENOMEM);

        root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
        root_key.type = BTRFS_ROOT_ITEM_KEY;
        root_key.offset = objectid;

        if (btrfs_root_id(root) == objectid) {
                u64 commit_root_gen;

                /* called by btrfs_init_reloc_root */
                ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
                                      BTRFS_TREE_RELOC_OBJECTID);
                if (ret)
                        goto fail;

                /*
                 * Set the last_snapshot field to the generation of the commit
                 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
                 * correctly (returns true) when the relocation root is created
                 * either inside the critical section of a transaction commit
                 * (through transaction.c:qgroup_account_snapshot()) and when
                 * it's created before the transaction commit is started.
                 */
                commit_root_gen = btrfs_header_generation(root->commit_root);
                btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
        } else {
                /*
                 * called by btrfs_reloc_post_snapshot_hook.
                 * the source tree is a reloc tree, all tree blocks
                 * modified after it was created have RELOC flag
                 * set in their headers. so it's OK to not update
                 * the 'last_snapshot'.
                 */
                ret = btrfs_copy_root(trans, root, root->node, &eb,
                                      BTRFS_TREE_RELOC_OBJECTID);
                if (ret)
                        goto fail;
        }

        /*
         * We have changed references at this point, we must abort the
         * transaction if anything fails.
         */
        must_abort = true;

        /* Start from the source root item and point it at the copied block. */
        memcpy(root_item, &root->root_item, sizeof(*root_item));
        btrfs_set_root_bytenr(root_item, eb->start);
        btrfs_set_root_level(root_item, btrfs_header_level(eb));
        btrfs_set_root_generation(root_item, trans->transid);

        /* Reloc tree of an fs tree: clear refs and any drop progress state. */
        if (btrfs_root_id(root) == objectid) {
                btrfs_set_root_refs(root_item, 0);
                memset(&root_item->drop_progress, 0,
                       sizeof(struct btrfs_disk_key));
                btrfs_set_root_drop_level(root_item, 0);
        }

        /* Done with the copied root block, unlock it and drop our reference. */
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);

        ret = btrfs_insert_root(trans, fs_info->tree_root,
                                &root_key, root_item);
        if (ret)
                goto fail;

        kfree(root_item);

        reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
        if (IS_ERR(reloc_root)) {
                ret = PTR_ERR(reloc_root);
                /* root_item is already freed, so skip the fail label. */
                goto abort;
        }
        set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
        reloc_root->last_trans = trans->transid;
        return reloc_root;
fail:
        kfree(root_item);
abort:
        if (must_abort)
                btrfs_abort_transaction(trans, ret);
        return ERR_PTR(ret);
}

/*
 * create reloc tree for a given fs tree. reloc tree is just a
 * snapshot of the fs tree with special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_root *reloc_root;
        struct reloc_control *rc = fs_info->reloc_ctl;
        struct btrfs_block_rsv *rsv;
        int clear_rsv = 0;
        int ret;

        if (!rc)
                return 0;

        /*
         * The subvolume has reloc tree but the swap is finished, no need to
         * create/update the dead reloc tree
         */
        if (reloc_root_is_dead(root))
                return 0;

        /*
         * This is subtle but important.  We do not do
         * record_root_in_transaction for reloc roots, instead we record their
         * corresponding fs root, and then here we update the last trans for the
         * reloc root.  This means that we have to do this for the entire life
         * of the reloc root, regardless of which stage of the relocation we are
         * in.
         */
        if (root->reloc_root) {
                reloc_root = root->reloc_root;
                reloc_root->last_trans = trans->transid;
                return 0;
        }

        /*
         * We are merging reloc roots, we do not need new reloc trees.  Also
         * reloc trees never need their own reloc tree.
         */
        if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
                return 0;

        if (!trans->reloc_reserved) {
                rsv = trans->block_rsv;
                trans->block_rsv = rc->block_rsv;
                clear_rsv = 1;
        }
        reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
        if (clear_rsv)
                trans->block_rsv = rsv;
        if (IS_ERR(reloc_root))
                return PTR_ERR(reloc_root);

        ret = __add_reloc_root(reloc_root);
        ASSERT(ret != -EEXIST);
        if (ret) {
                /* Pairs with create_reloc_root */
                btrfs_put_root(reloc_root);
                return ret;
        }
        root->reloc_root = btrfs_grab_root(reloc_root);
        return 0;
}

/*
 * update root item of reloc tree
 *
 * Does nothing and returns 0 when @root has no valid (live) reloc root.
 * Otherwise the reloc root's item is written back with btrfs_update_root()
 * and that call's result is returned. Along the way:
 *
 * - if reloc trees are being merged and the reloc root item has zero refs,
 *   @root is flagged BTRFS_ROOT_DEAD_RELOC_TREE and the reloc root's mapping
 *   is deleted;
 * - if the reloc root's node differs from its commit root, the mapping is
 *   re-keyed and the commit root is replaced.
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_root *reloc_root;
        struct btrfs_root_item *root_item;
        int ret;

        if (!have_reloc_root(root))
                return 0;

        reloc_root = root->reloc_root;
        root_item = &reloc_root->root_item;

        /*
         * We are probably ok here, but __del_reloc_root() will drop its ref of
         * the root.  We have the ref for root->reloc_root, but just in case
         * hold it while we update the reloc root.
         */
        btrfs_grab_root(reloc_root);

        /* root->reloc_root will stay until current relocation finished */
        if (fs_info->reloc_ctl->merge_reloc_tree &&
            btrfs_root_refs(root_item) == 0) {
                set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
                /*
                 * Mark the tree as dead before we change reloc_root so
                 * have_reloc_root will not touch it from now on.
                 */
                smp_wmb();
                __del_reloc_root(reloc_root);
        }

        /*
         * The reloc tree got a new root node: move the mapping over to the
         * new address, update the root item through btrfs_set_root_node() and
         * swap the commit root for the current root node.
         */
        if (reloc_root->commit_root != reloc_root->node) {
                __update_reloc_root(reloc_root);
                btrfs_set_root_node(root_item, reloc_root->node);
                free_extent_buffer(reloc_root->commit_root);
                reloc_root->commit_root = btrfs_root_node(reloc_root);
        }

        ret = btrfs_update_root(trans, fs_info->tree_root,
                                &reloc_root->root_key, root_item);
        /* Pairs with the btrfs_grab_root() above. */
        btrfs_put_root(reloc_root);
        return ret;
}

/*
 * Get the new location of a data extent.
 *
 * @bytenr, reduced by the reloc inode's index_cnt, is used as the file
 * offset to look up a file extent item of @reloc_inode. On success the disk
 * bytenr of that item is stored in *@new_bytenr.
 *
 * Return 0 on success, -ENOMEM if no path can be allocated, -ENOENT if no
 * matching file extent item exists, -EINVAL if the item's disk_num_bytes
 * differs from @num_bytes, or the negative error of the lookup.
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
                            u64 bytenr, u64 num_bytes)
{
        struct btrfs_inode *reloc = BTRFS_I(reloc_inode);
        struct btrfs_root *root = reloc->root;
        struct btrfs_file_extent_item *fi;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        bytenr -= reloc->index_cnt;
        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc),
                                       bytenr, 0);
        /* A positive result means no exact match was found. */
        if (ret > 0)
                ret = -ENOENT;
        if (ret)
                goto out;

        leaf = path->nodes[0];
        fi = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);

        BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
               btrfs_file_extent_compression(leaf, fi) ||
               btrfs_file_extent_encryption(leaf, fi) ||
               btrfs_file_extent_other_encoding(leaf, fi));

        /* ret is 0 here, so only the mismatch case has to set it. */
        if (btrfs_file_extent_disk_num_bytes(leaf, fi) == num_bytes)
                *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
        else
                ret = -EINVAL;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * update file extent items in the tree leaf to point to
 * the new locations.
 *
 * Only acts while rc->stage is UPDATE_DATA_PTRS. Every non-inline
 * EXTENT_DATA item in @leaf whose disk bytenr lies inside the block group
 * being relocated gets its disk bytenr replaced by the location returned by
 * get_new_location(); a reference is added for the new extent and the
 * reference on the old extent is dropped.
 *
 * For leaves of an fs tree (not a reloc tree) the cached extent maps of the
 * affected inode range are dropped first. If the inode's mmap lock or the
 * extent range cannot be taken without blocking, the item is skipped.
 *
 * Return 0 on success or a negative errno. Failures of the reference updates
 * abort the transaction; a get_new_location() failure does not.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
                         struct reloc_control *rc,
                         struct btrfs_root *root,
                         struct extent_buffer *leaf)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        struct btrfs_inode *inode = NULL;
        u64 parent;
        u64 bytenr;
        u64 new_bytenr = 0;
        u64 num_bytes;
        u64 end;
        u32 nritems;
        u32 i;
        int ret = 0;
        int first = 1;
        int dirty = 0;

        if (rc->stage != UPDATE_DATA_PTRS)
                return 0;

        /* reloc trees always use full backref */
        if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
                parent = leaf->start;
        else
                parent = 0;

        nritems = btrfs_header_nritems(leaf);
        for (i = 0; i < nritems; i++) {
                struct btrfs_ref ref = { 0 };

                cond_resched();
                btrfs_item_key_to_cpu(leaf, &key, i);
                if (key.type != BTRFS_EXTENT_DATA_KEY)
                        continue;
                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
                /* Inline extents carry no disk bytenr to rewrite. */
                if (btrfs_file_extent_type(leaf, fi) ==
                    BTRFS_FILE_EXTENT_INLINE)
                        continue;
                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
                num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
                /* Skip items whose disk bytenr is 0. */
                if (bytenr == 0)
                        continue;
                /* Only extents inside the relocated block group matter. */
                if (!in_range(bytenr, rc->block_group->start,
                              rc->block_group->length))
                        continue;

                /*
                 * if we are modifying block in fs tree, wait for read_folio
                 * to complete and drop the extent cache
                 */
                if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
                        /*
                         * Keep one inode reference across items and only look
                         * up again once the keys move past the cached inode.
                         */
                        if (first) {
                                inode = btrfs_find_first_inode(root, key.objectid);
                                first = 0;
                        } else if (inode && btrfs_ino(inode) < key.objectid) {
                                btrfs_add_delayed_iput(inode);
                                inode = btrfs_find_first_inode(root, key.objectid);
                        }
                        if (inode && btrfs_ino(inode) == key.objectid) {
                                struct extent_state *cached_state = NULL;

                                end = key.offset +
                                      btrfs_file_extent_num_bytes(leaf, fi);
                                WARN_ON(!IS_ALIGNED(key.offset,
                                                    fs_info->sectorsize));
                                WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
                                /* Turn the exclusive end into an inclusive one. */
                                end--;
                                /* Take mmap lock to serialize with reflinks. */
                                if (!down_read_trylock(&inode->i_mmap_lock))
                                        continue;
                                /* ret doubles as the trylock result here. */
                                ret = try_lock_extent(&inode->io_tree, key.offset,
                                                      end, &cached_state);
                                if (!ret) {
                                        up_read(&inode->i_mmap_lock);
                                        continue;
                                }

                                btrfs_drop_extent_map_range(inode, key.offset, end, true);
                                unlock_extent(&inode->io_tree, key.offset, end,
                                              &cached_state);
                                up_read(&inode->i_mmap_lock);
                        }
                }

                ret = get_new_location(rc->data_inode, &new_bytenr,
                                       bytenr, num_bytes);
                if (ret) {
                        /*
                         * Don't have to abort since we've not changed anything
                         * in the file extent yet.
                         */
                        break;
                }

                btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
                dirty = 1;

                /* The data ref offset is the file offset minus the extent offset. */
                key.offset -= btrfs_file_extent_offset(leaf, fi);
                /* Add a reference for the new location ... */
                ref.action = BTRFS_ADD_DELAYED_REF;
                ref.bytenr = new_bytenr;
                ref.num_bytes = num_bytes;
                ref.parent = parent;
                ref.owning_root = btrfs_root_id(root);
                ref.ref_root = btrfs_header_owner(leaf);
                btrfs_init_data_ref(&ref, key.objectid, key.offset,
                                    btrfs_root_id(root), false);
                ret = btrfs_inc_extent_ref(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        break;
                }

                /* ... and drop the one held on the old location. */
                ref.action = BTRFS_DROP_DELAYED_REF;
                ref.bytenr = bytenr;
                ref.num_bytes = num_bytes;
                ref.parent = parent;
                ref.owning_root = btrfs_root_id(root);
                ref.ref_root = btrfs_header_owner(leaf);
                btrfs_init_data_ref(&ref, key.objectid, key.offset,
                                    btrfs_root_id(root), false);
                ret = btrfs_free_extent(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        break;
                }
        }
        /* The leaf is marked dirty even if a later item failed. */
        if (dirty)
                btrfs_mark_buffer_dirty(trans, leaf);
        if (inode)
                btrfs_add_delayed_iput(inode);
        return ret;
}

static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
                                               int slot, const struct btrfs_path *path,
                                               int level)
{
        struct btrfs_disk_key key1;
        struct btrfs_disk_key key2;
        btrfs_node_key(eb, &key1, slot);
        btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
        return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * Try to replace tree blocks in the fs tree (@dest) with the new blocks in
 * the reloc tree (@src).  Tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * @path is a path in @src; @lowest_level and @max_level bound the levels of
 * @path that are considered for swapping.  If @next_key is not NULL it is
 * first reset to the maximum key and then updated with the key following the
 * slot visited in each node of @dest on the way down.
 *
 * If a block was replaced, level of the block + 1 is returned.
 * If no block got replaced, 0 is returned.  If there are other
 * errors, a negative error number is returned.
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
                 struct btrfs_root *dest, struct btrfs_root *src,
                 struct btrfs_path *path, struct btrfs_key *next_key,
                 int lowest_level, int max_level)
{
        struct btrfs_fs_info *fs_info = dest->fs_info;
        struct extent_buffer *eb;
        struct extent_buffer *parent;
        struct btrfs_ref ref = { 0 };
        struct btrfs_key key;
        u64 old_bytenr;
        u64 new_bytenr;
        u64 old_ptr_gen;
        u64 new_ptr_gen;
        u64 last_snapshot;
        u32 blocksize;
        /* 0 on the first (read-only) pass, 1 once a swap candidate was seen. */
        int cow = 0;
        int level;
        int ret;
        int slot;

        /* @src must be a reloc tree and @dest must not be one. */
        ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
        ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);

        last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
        /* The search key is the one @path points to at @lowest_level. */
        slot = path->slots[lowest_level];
        btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

        eb = btrfs_lock_root_node(dest);
        level = btrfs_header_level(eb);

        /* The fs tree is shallower than the level we want to swap at. */
        if (level < lowest_level) {
                btrfs_tree_unlock(eb);
                free_extent_buffer(eb);
                return 0;
        }

        if (cow) {
                ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
                                      BTRFS_NESTING_COW);
                if (ret) {
                        btrfs_tree_unlock(eb);
                        free_extent_buffer(eb);
                        return ret;
                }
        }

        /* Start from the maximum key; narrowed while walking down. */
        if (next_key) {
                next_key->objectid = (u64)-1;
                next_key->type = (u8)-1;
                next_key->offset = (u64)-1;
        }

        /* @parent is always held locked and referenced inside the loop. */
        parent = eb;
        while (1) {
                level = btrfs_header_level(parent);
                ASSERT(level >= lowest_level);

                ret = btrfs_bin_search(parent, 0, &key, &slot);
                if (ret < 0)
                        break;
                /* No exact match: use the preceding slot, if there is one. */
                if (ret && slot > 0)
                        slot--;

                if (next_key && slot + 1 < btrfs_header_nritems(parent))
                        btrfs_node_key_to_cpu(parent, next_key, slot + 1);

                old_bytenr = btrfs_node_blockptr(parent, slot);
                blocksize = fs_info->nodesize;
                old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

                /*
                 * Only levels up to @max_level have a counterpart in @path;
                 * above that there is nothing to swap in.
                 */
                if (level <= max_level) {
                        eb = path->nodes[level];
                        new_bytenr = btrfs_node_blockptr(eb,
                                                        path->slots[level]);
                        new_ptr_gen = btrfs_node_ptr_generation(eb,
                                                        path->slots[level]);
                } else {
                        new_bytenr = 0;
                        new_ptr_gen = 0;
                }

                /* Both trees already point at the same block: unexpected. */
                if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
                        ret = level;
                        break;
                }

                /*
                 * This slot cannot be swapped (no counterpart, pointer
                 * generation newer than last_snapshot, or different keys):
                 * descend one level in @dest, or give up at @lowest_level.
                 */
                if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
                    memcmp_node_keys(parent, slot, path, level)) {
                        if (level <= lowest_level) {
                                ret = 0;
                                break;
                        }

                        eb = btrfs_read_node_slot(parent, slot);
                        if (IS_ERR(eb)) {
                                ret = PTR_ERR(eb);
                                break;
                        }
                        btrfs_tree_lock(eb);
                        if (cow) {
                                ret = btrfs_cow_block(trans, dest, eb, parent,
                                                      slot, &eb,
                                                      BTRFS_NESTING_COW);
                                if (ret) {
                                        btrfs_tree_unlock(eb);
                                        free_extent_buffer(eb);
                                        break;
                                }
                        }

                        /* Hand over: the child becomes the new @parent. */
                        btrfs_tree_unlock(parent);
                        free_extent_buffer(parent);

                        parent = eb;
                        continue;
                }

                /*
                 * A swappable slot was found on the non-COW pass: restart
                 * from the root with COW enabled.
                 */
                if (!cow) {
                        btrfs_tree_unlock(parent);
                        free_extent_buffer(parent);
                        cow = 1;
                        goto again;
                }

                /*
                 * Re-search the reloc tree for the same key with COW
                 * requested, stopping at @level, so that the node whose
                 * pointer is rewritten below belongs to this transaction.
                 */
                btrfs_node_key_to_cpu(path->nodes[level], &key,
                                      path->slots[level]);
                btrfs_release_path(path);

                path->lowest_level = level;
                set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
                ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
                clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
                path->lowest_level = 0;
                if (ret) {
                        /* The key must exist; not finding it is an error. */
                        if (ret > 0)
                                ret = -ENOENT;
                        break;
                }

                /*
                 * Inform qgroup to trace both subtrees.
                 *
                 * We must trace both trees.
                 * 1) Tree reloc subtree
                 *    If not traced, we will leak data numbers
                 * 2) Fs subtree
                 *    If not traced, we will double count old data
                 *
                 * We don't scan the subtree right now, but only record
                 * the swapped tree blocks.
                 * The real subtree rescan is delayed until we have new
                 * CoW on the subtree root node before transaction commit.
                 */
                ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
                                rc->block_group, parent, slot,
                                path->nodes[level], path->slots[level],
                                last_snapshot);
                if (ret < 0)
                        break;
                /*
                 * swap blocks in fs tree and reloc tree.
                 */
                btrfs_set_node_blockptr(parent, slot, new_bytenr);
                btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
                btrfs_mark_buffer_dirty(trans, parent);

                btrfs_set_node_blockptr(path->nodes[level],
                                        path->slots[level], old_bytenr);
                btrfs_set_node_ptr_generation(path->nodes[level],
                                              path->slots[level], old_ptr_gen);
                btrfs_mark_buffer_dirty(trans, path->nodes[level]);

                /* Add a ref on the old block, now pointed to by the reloc tree node. */
                ref.action = BTRFS_ADD_DELAYED_REF;
                ref.bytenr = old_bytenr;
                ref.num_bytes = blocksize;
                ref.parent = path->nodes[level]->start;
                ref.owning_root = btrfs_root_id(src);
                ref.ref_root = btrfs_root_id(src);
                btrfs_init_tree_ref(&ref, level - 1, 0, true);
                ret = btrfs_inc_extent_ref(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        break;
                }

                /* Add a ref on the new block, now pointed to by the fs tree. */
                ref.action = BTRFS_ADD_DELAYED_REF;
                ref.bytenr = new_bytenr;
                ref.num_bytes = blocksize;
                ref.parent = 0;
                ref.owning_root = btrfs_root_id(dest);
                ref.ref_root = btrfs_root_id(dest);
                btrfs_init_tree_ref(&ref, level - 1, 0, true);
                ret = btrfs_inc_extent_ref(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        break;
                }

                /* Drop the reloc tree node's former ref on the new block. */
                /* We don't know the real owning_root, use 0. */
                ref.action = BTRFS_DROP_DELAYED_REF;
                ref.bytenr = new_bytenr;
                ref.num_bytes = blocksize;
                ref.parent = path->nodes[level]->start;
                ref.owning_root = 0;
                ref.ref_root = btrfs_root_id(src);
                btrfs_init_tree_ref(&ref, level - 1, 0, true);
                ret = btrfs_free_extent(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        break;
                }

                /* Drop the fs tree's former ref on the old block. */
                /* We don't know the real owning_root, use 0. */
                ref.action = BTRFS_DROP_DELAYED_REF;
                ref.bytenr = old_bytenr;
                ref.num_bytes = blocksize;
                ref.parent = 0;
                ref.owning_root = 0;
                ref.ref_root = btrfs_root_id(dest);
                btrfs_init_tree_ref(&ref, level - 1, 0, true);
                ret = btrfs_free_extent(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        break;
                }

                btrfs_unlock_up_safe(path, 0);

                /* Success: report the level of the node whose pointer was swapped. */
                ret = level;
                break;
        }
        /* Every exit from the loop still holds @parent locked and referenced. */
        btrfs_tree_unlock(parent);
        free_extent_buffer(parent);
        return ret;
}

/*
 * Helper to find the next relocated block in the reloc tree.
 *
 * Releases every node of @path below *@level, then climbs from *@level
 * upwards looking for the next slot whose pointer generation is newer than
 * the root's last_snapshot.  Nodes that are exhausted are released as well.
 *
 * Returns 0 with *@level set to the level where such a slot was found, or 1
 * when no further slot exists.
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
                       int *level)
{
        const u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
        int lvl;

        /* Nothing below the starting level is needed any more. */
        for (lvl = 0; lvl < *level; lvl++) {
                free_extent_buffer(path->nodes[lvl]);
                path->nodes[lvl] = NULL;
        }

        for (lvl = *level; lvl < BTRFS_MAX_LEVEL; lvl++) {
                struct extent_buffer *node = path->nodes[lvl];
                u32 nritems;

                if (!node)
                        break;

                nritems = btrfs_header_nritems(node);
                while (path->slots[lvl] + 1 < nritems) {
                        path->slots[lvl]++;
                        if (btrfs_node_ptr_generation(node, path->slots[lvl]) >
                            last_snapshot) {
                                *level = lvl;
                                return 0;
                        }
                }

                /* This node has no more candidates, move one level up. */
                free_extent_buffer(node);
                path->nodes[lvl] = NULL;
        }
        return 1;
}

/*
 * Walk down the reloc tree to find the relocated block of lowest level.
 *
 * Starting at *@level, advance each node's slot past pointers whose
 * generation is not newer than the root's last_snapshot and read the child
 * below the first newer pointer into @path.
 *
 * Returns 0 with *@level updated when a position was found, 1 when the
 * starting node has no newer pointer left (or *@level is 0), or a negative
 * errno when reading a child node fails.
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
                         int *level)
{
        const u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
        const int top = *level;
        int lvl;

        for (lvl = top; lvl > 0; lvl--) {
                struct extent_buffer *node = path->nodes[lvl];
                const u32 nritems = btrfs_header_nritems(node);
                struct extent_buffer *child;

                /* Skip pointers that are not newer than the last snapshot. */
                while (path->slots[lvl] < nritems &&
                       btrfs_node_ptr_generation(node, path->slots[lvl]) <=
                       last_snapshot)
                        path->slots[lvl]++;

                if (path->slots[lvl] >= nritems) {
                        /* Exhausted the starting node: nothing left here. */
                        if (lvl == top)
                                return 1;
                        /* Exhausted a lower node: report its parent's level. */
                        *level = lvl + 1;
                        return 0;
                }

                if (lvl == 1) {
                        *level = 1;
                        return 0;
                }

                child = btrfs_read_node_slot(node, path->slots[lvl]);
                if (IS_ERR(child))
                        return PTR_ERR(child);
                BUG_ON(btrfs_header_level(child) != lvl - 1);

                path->nodes[lvl - 1] = child;
                path->slots[lvl - 1] = 0;
        }
        return 1;
}

/*
 * invalidate extent cache for file extents whose key in range of
 * [min_key, max_key)
 */
static int invalidate_extent_cache(struct btrfs_root *root,
                                   const struct btrfs_key *min_key,
                                   const struct btrfs_key *max_key)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_inode *inode = NULL;
        u64 objectid;
        u64 start, end;
        u64 ino;

        objectid = min_key->objectid;
        while (1) {
                struct extent_state *cached_state = NULL;

                cond_resched();
                if (inode)
                        iput(&inode->vfs_inode);

                if (objectid > max_key->objectid)
                        break;

                inode = btrfs_find_first_inode(root, objectid);
                if (!inode)
                        break;
                ino = btrfs_ino(inode);

                if (ino > max_key->objectid) {
                        iput(&inode->vfs_inode);
                        break;
                }

                objectid = ino + 1;
                if (!S_ISREG(inode->vfs_inode.i_mode))
                        continue;

                if (unlikely(min_key->objectid == ino)) {
                        if (min_key->type > BTRFS_EXTENT_DATA_KEY)
                                continue;
                        if (min_key->type < BTRFS_EXTENT_DATA_KEY)
                                start = 0;
                        else {
                                start = min_key->offset;
                                WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
                        }
                } else {
                        start = 0;
                }

                if (unlikely(max_key->objectid == ino)) {
                        if (max_key->type < BTRFS_EXTENT_DATA_KEY)
                                continue;
                        if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
                                end = (u64)-1;
                        } else {
                                if (max_key->offset == 0)
                                        continue;
                                end = max_key->offset;
                                WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
                                end--;
                        }
                } else {
                        end = (u64)-1;
                }

                /* the lock_extent waits for read_folio to complete */
                lock_extent(&inode->io_tree, start, end, &cached_state);
                btrfs_drop_extent_map_range(inode, start, end, true);
                unlock_extent(&inode->io_tree, start, end, &cached_state);
        }
        return 0;
}

/*
 * Find the key that follows the current position of @path, starting the
 * search at @level and moving up until a node with a following slot is
 * found.
 *
 * Returns 0 and stores the key in @key on success, 1 if no level of the
 * path has a following slot.
 */
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key)
{
        int lvl;

        for (lvl = level; lvl < BTRFS_MAX_LEVEL && path->nodes[lvl]; lvl++) {
                struct extent_buffer *node = path->nodes[lvl];
                int next_slot = path->slots[lvl] + 1;

                if (next_slot < btrfs_header_nritems(node)) {
                        btrfs_node_key_to_cpu(node, key, next_slot);
                        return 0;
                }
        }
        return 1;
}

/*
 * Insert the current subvolume into reloc_control::dirty_subvol_roots.
 *
 * Clears the drop progress, drop level and refs in the reloc root item of
 * @root, writes it back via btrfs_update_reloc_root(), and queues @root
 * (taking a reference) unless it is already on a dirty list.
 *
 * Returns 0 on success or the error from btrfs_update_reloc_root().
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
                               struct reloc_control *rc,
                               struct btrfs_root *root)
{
        struct btrfs_root *reloc_root = root->reloc_root;
        struct btrfs_root_item *item;
        int ret;

        /* @root must be a subvolume tree root with a valid reloc tree */
        ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
        ASSERT(reloc_root);

        item = &reloc_root->root_item;
        memset(&item->drop_progress, 0, sizeof(item->drop_progress));
        btrfs_set_root_drop_level(item, 0);
        btrfs_set_root_refs(item, 0);

        ret = btrfs_update_reloc_root(trans, root);
        if (ret)
                return ret;

        /* Already queued, nothing more to do. */
        if (!list_empty(&root->reloc_dirty_list))
                return 0;

        btrfs_grab_root(root);
        list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
        return 0;
}

/*
 * Clean up every root queued on rc->dirty_subvol_roots.
 *
 * For a merged subvolume the root is unlinked from the list, detached from
 * its reloc root, and the reloc root is dropped.  An entry that is itself a
 * reloc tree (orphan) is simply dropped.
 *
 * Returns 0, or the first negative error returned by btrfs_drop_snapshot().
 */
static int clean_dirty_subvols(struct reloc_control *rc)
{
        struct btrfs_root *root;
        struct btrfs_root *tmp;
        int first_err = 0;

        list_for_each_entry_safe(root, tmp, &rc->dirty_subvol_roots,
                                 reloc_dirty_list) {
                struct btrfs_root *reloc_root;
                int err;

                if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
                        /* Orphan reloc tree, just clean it up */
                        err = btrfs_drop_snapshot(root, 0, 1);
                        if (err < 0) {
                                btrfs_put_root(root);
                                if (!first_err)
                                        first_err = err;
                        }
                        continue;
                }

                /* Merged subvolume, cleanup its reloc root */
                reloc_root = root->reloc_root;

                list_del_init(&root->reloc_dirty_list);
                root->reloc_root = NULL;
                /*
                 * Need barrier to ensure clear_bit() only happens after
                 * root->reloc_root = NULL. Pairs with have_reloc_root.
                 */
                smp_wmb();
                clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);

                if (reloc_root) {
                        /*
                         * btrfs_drop_snapshot drops our ref we hold for
                         * ->reloc_root.  If it fails however we must
                         * drop the ref ourselves.
                         */
                        err = btrfs_drop_snapshot(reloc_root, 0, 1);
                        if (err < 0) {
                                btrfs_put_root(reloc_root);
                                if (!first_err)
                                        first_err = err;
                        }
                }
                btrfs_put_root(root);
        }
        return first_err;
}

/*
 * Merge the relocated tree blocks in the reloc tree with the corresponding
 * fs tree.
 *
 * @rc:   relocation control; supplies the block reservation and the stage
 * @root: fs tree whose ->reloc_root is merged into it
 *
 * The merge either starts at the reloc root node or, when the reloc root
 * item carries a non-zero drop_progress key, resumes at that key and drop
 * level.  Each loop iteration runs in its own transaction, swaps at most one
 * path via replace_path() and records the progress in the root item.
 *
 * Returns 0 on success or a negative errno.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                                               struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
        struct btrfs_key key;
        struct btrfs_key next_key;
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_root *reloc_root;
        struct btrfs_root_item *root_item;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        int reserve_level;
        int level;
        int max_level;
        int replaced = 0;
        int ret = 0;
        u32 min_reserved;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = READA_FORWARD;

        reloc_root = root->reloc_root;
        root_item = &reloc_root->root_item;

        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
                /* No saved progress: start at the reloc root node, slot 0. */
                level = btrfs_root_level(root_item);
                atomic_inc(&reloc_root->node->refs);
                path->nodes[level] = reloc_root->node;
                path->slots[level] = 0;
        } else {
                /* Resume at the key and level saved in the root item. */
                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

                level = btrfs_root_drop_level(root_item);
                BUG_ON(level == 0);
                path->lowest_level = level;
                ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
                path->lowest_level = 0;
                if (ret < 0) {
                        btrfs_free_path(path);
                        return ret;
                }

                /* The saved key is expected to be found exactly. */
                btrfs_node_key_to_cpu(path->nodes[level], &next_key,
                                      path->slots[level]);
                WARN_ON(memcmp(&key, &next_key, sizeof(key)));

                btrfs_unlock_up_safe(path, 0);
        }

        /*
         * In merge_reloc_root(), we modify the upper level pointer to swap the
         * tree blocks between reloc tree and subvolume tree.  Thus for tree
         * block COW, we COW at most from level 1 to root level for each tree.
         *
         * Thus the needed metadata size is at most root_level * nodesize,
         * and * 2 since we have two trees to COW.
         */
        reserve_level = max_t(int, 1, btrfs_root_level(root_item));
        min_reserved = fs_info->nodesize * reserve_level * 2;
        memset(&next_key, 0, sizeof(next_key));

        while (1) {
                ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
                                             min_reserved,
                                             BTRFS_RESERVE_FLUSH_LIMIT);
                if (ret)
                        goto out;
                trans = btrfs_start_transaction(root, 0);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        /* Keep the cleanup at "out" from touching an ERR_PTR. */
                        trans = NULL;
                        goto out;
                }

                /*
                 * At this point we no longer have a reloc_control, so we can't
                 * depend on btrfs_init_reloc_root to update our last_trans.
                 *
                 * But that's ok, we started the trans handle on our
                 * corresponding fs_root, which means it's been added to the
                 * dirty list.  At commit time we'll still call
                 * btrfs_update_reloc_root() and update our root item
                 * appropriately.
                 */
                reloc_root->last_trans = trans->transid;
                trans->block_rsv = rc->block_rsv;

                replaced = 0;
                max_level = level;

                ret = walk_down_reloc_tree(reloc_root, path, &level);
                if (ret < 0)
                        goto out;
                /* Nothing left to merge: leave the loop with trans still open. */
                if (ret > 0)
                        break;

                /*
                 * Skip the swap when the next key in the reloc tree does not
                 * lie beyond the next_key reported by the previous
                 * replace_path() call.
                 */
                if (!find_next_key(path, level, &key) &&
                    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
                        ret = 0;
                } else {
                        ret = replace_path(trans, rc, root, reloc_root, path,
                                           &next_key, level, max_level);
                }
                if (ret < 0)
                        goto out;
                if (ret > 0) {
                        /* replace_path() returned the level it swapped at. */
                        level = ret;
                        btrfs_node_key_to_cpu(path->nodes[level], &key,
                                              path->slots[level]);
                        replaced = 1;
                }

                ret = walk_up_reloc_tree(reloc_root, path, &level);
                if (ret > 0)
                        break;

                BUG_ON(level == 0);
                /*
                 * save the merging progress in the drop_progress.
                 * this is OK since root refs == 1 in this case.
                 */
                btrfs_node_key(path->nodes[level], &root_item->drop_progress,
                               path->slots[level]);
                btrfs_set_root_drop_level(root_item, level);

                btrfs_end_transaction_throttle(trans);
                trans = NULL;

                btrfs_btree_balance_dirty(fs_info);

                /* Drop cached extent maps for the key range that was swapped. */
                if (replaced && rc->stage == UPDATE_DATA_PTRS)
                        invalidate_extent_cache(root, &key, &next_key);
        }

        /*
         * handle the case only one block in the fs tree need to be
         * relocated and the block is tree root.
         */
        leaf = btrfs_lock_root_node(root);
        ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
                              BTRFS_NESTING_COW);
        btrfs_tree_unlock(leaf);
        free_extent_buffer(leaf);
out:
        btrfs_free_path(path);

        /* On success queue @root so its reloc tree gets cleaned up later. */
        if (ret == 0) {
                ret = insert_dirty_subvol(trans, rc, root);
                if (ret)
                        btrfs_abort_transaction(trans, ret);
        }

        if (trans)
                btrfs_end_transaction_throttle(trans);

        btrfs_btree_balance_dirty(fs_info);

        /* Same invalidation as in the loop, for the last swapped range. */
        if (replaced && rc->stage == UPDATE_DATA_PTRS)
                invalidate_extent_cache(root, &key, &next_key);

        return ret;
}

/*
 * Prepare the reloc roots on rc->reloc_roots for merging.
 *
 * @rc:  relocation control
 * @err: error carried over from the caller; when non-zero no space is
 *       reserved, root refs are not set to 1 and the transaction is ended
 *       instead of committed
 *
 * Reserves rc->merging_rsv_size bytes in rc->block_rsv, sets
 * rc->merge_reloc_tree and updates the root item of every reloc root.
 *
 * Returns @err (possibly replaced by the first error hit here), the result
 * of btrfs_commit_transaction() when no error occurred, or the error from
 * btrfs_join_transaction().
 */
static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
        struct btrfs_root *root = rc->extent_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_root *reloc_root;
        struct btrfs_trans_handle *trans;
        /* Reloc roots already processed; spliced back into rc at the end. */
        LIST_HEAD(reloc_roots);
        u64 num_bytes = 0;
        int ret;

        mutex_lock(&fs_info->reloc_mutex);
        rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
        rc->merging_rsv_size += rc->nodes_relocated * 2;
        mutex_unlock(&fs_info->reloc_mutex);

again:
        if (!err) {
                num_bytes = rc->merging_rsv_size;
                ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
                                          BTRFS_RESERVE_FLUSH_ALL);
                if (ret)
                        err = ret;
        }

        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans)) {
                /* Give back what was reserved above before bailing out. */
                if (!err)
                        btrfs_block_rsv_release(fs_info, rc->block_rsv,
                                                num_bytes, NULL);
                return PTR_ERR(trans);
        }

        if (!err) {
                /*
                 * merging_rsv_size changed since the reservation was made:
                 * release it and retry with the new size.
                 */
                if (num_bytes != rc->merging_rsv_size) {
                        btrfs_end_transaction(trans);
                        btrfs_block_rsv_release(fs_info, rc->block_rsv,
                                                num_bytes, NULL);
                        goto again;
                }
        }

        rc->merge_reloc_tree = true;

        while (!list_empty(&rc->reloc_roots)) {
                reloc_root = list_entry(rc->reloc_roots.next,
                                        struct btrfs_root, root_list);
                list_del_init(&reloc_root->root_list);

                /* The reloc root key's offset identifies the fs root to look up. */
                root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
                                false);
                if (IS_ERR(root)) {
                        /*
                         * Even if we have an error we need this reloc root
                         * back on our list so we can clean up properly.
                         */
                        list_add(&reloc_root->root_list, &reloc_roots);
                        btrfs_abort_transaction(trans, (int)PTR_ERR(root));
                        if (!err)
                                err = PTR_ERR(root);
                        break;
                }

                /* The fs root must point back at exactly this reloc root. */
                if (unlikely(root->reloc_root != reloc_root)) {
                        if (root->reloc_root) {
                                btrfs_err(fs_info,
"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
                                          btrfs_root_id(root),
                                          btrfs_root_id(root->reloc_root),
                                          root->reloc_root->root_key.type,
                                          root->reloc_root->root_key.offset,
                                          btrfs_root_generation(
                                                  &root->reloc_root->root_item),
                                          btrfs_root_id(reloc_root),
                                          reloc_root->root_key.type,
                                          reloc_root->root_key.offset,
                                          btrfs_root_generation(
                                                  &reloc_root->root_item));
                        } else {
                                btrfs_err(fs_info,
"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
                                          btrfs_root_id(root),
                                          btrfs_root_id(reloc_root),
                                          reloc_root->root_key.type,
                                          reloc_root->root_key.offset,
                                          btrfs_root_generation(
                                                  &reloc_root->root_item));
                        }
                        list_add(&reloc_root->root_list, &reloc_roots);
                        btrfs_put_root(root);
                        btrfs_abort_transaction(trans, -EUCLEAN);
                        if (!err)
                                err = -EUCLEAN;
                        break;
                }

                /*
                 * Set reference count to 1, so btrfs_recover_relocation
                 * knows it should resume merging.
                 */
                if (!err)
                        btrfs_set_root_refs(&reloc_root->root_item, 1);
                ret = btrfs_update_reloc_root(trans, root);

                /*
                 * Even if we have an error we need this reloc root back on our
                 * list so we can clean up properly.
                 */
                list_add(&reloc_root->root_list, &reloc_roots);
                btrfs_put_root(root);

                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        if (!err)
                                err = ret;
                        break;
                }
        }

        /* Put the processed reloc roots back onto rc->reloc_roots. */
        list_splice(&reloc_roots, &rc->reloc_roots);

        if (!err)
                err = btrfs_commit_transaction(trans);
        else
                btrfs_end_transaction(trans);
        return err;
}

static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
        struct btrfs_root *reloc_root, *tmp;

        list_for_each_entry_safe(reloc_root, tmp, list, root_list)
                __del_reloc_root(reloc_root);
}

/*
 * Process every reloc root queued on rc->reloc_roots.
 *
 * The queue is spliced onto a private list under fs_info->reloc_mutex and
 * walked; this repeats until a pass finds nothing queued.  For each reloc
 * root the fs root named by reloc_root->root_key.offset is looked up:
 *
 *  - root item refs > 0: the pair is handed to merge_reloc_root();
 *  - root item refs == 0: the reloc root is detached from its fs root (when
 *    one was found) and queued on rc->dirty_subvol_roots for cleanup.
 *
 * On the first failure the error is reported via btrfs_handle_fs_error() and
 * every reloc root still on the private list, plus any queued on
 * rc->reloc_roots in the meantime, is passed to free_reloc_roots().
 */
static noinline_for_stack
void merge_reloc_roots(struct reloc_control *rc)
{
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
        struct btrfs_root *root;
        struct btrfs_root *reloc_root;
        LIST_HEAD(reloc_roots);
        int found = 0;
        int ret = 0;
again:
        root = rc->extent_root;

        /*
         * this serializes us with btrfs_record_root_in_transaction,
         * we have to make sure nobody is in the middle of
         * adding their roots to the list while we are
         * doing this splice
         */
        mutex_lock(&fs_info->reloc_mutex);
        list_splice_init(&rc->reloc_roots, &reloc_roots);
        mutex_unlock(&fs_info->reloc_mutex);

        while (!list_empty(&reloc_roots)) {
                found = 1;
                reloc_root = list_entry(reloc_roots.next,
                                        struct btrfs_root, root_list);

                root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
                                         false);
                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
                        if (WARN_ON(IS_ERR(root))) {
                                /*
                                 * For recovery we read the fs roots on mount,
                                 * and if we didn't find the root then we marked
                                 * the reloc root as a garbage root.  For normal
                                 * relocation obviously the root should exist in
                                 * memory.  However there's no reason we can't
                                 * handle the error properly here just in case.
                                 */
                                ret = PTR_ERR(root);
                                goto out;
                        }
                        if (WARN_ON(root->reloc_root != reloc_root)) {
                                /*
                                 * This can happen if on-disk metadata has some
                                 * corruption, e.g. bad reloc tree key offset.
                                 *
                                 * Drop the reference taken by
                                 * btrfs_get_fs_root() above before bailing
                                 * out, otherwise the fs root is leaked.
                                 */
                                btrfs_put_root(root);
                                ret = -EINVAL;
                                goto out;
                        }
                        ret = merge_reloc_root(rc, root);
                        btrfs_put_root(root);
                        if (ret) {
                                /*
                                 * Make sure the failed reloc root is on the
                                 * private list so the cleanup below sees it.
                                 */
                                if (list_empty(&reloc_root->root_list))
                                        list_add_tail(&reloc_root->root_list,
                                                      &reloc_roots);
                                goto out;
                        }
                } else {
                        if (!IS_ERR(root)) {
                                if (root->reloc_root == reloc_root) {
                                        root->reloc_root = NULL;
                                        btrfs_put_root(reloc_root);
                                }
                                clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
                                          &root->state);
                                btrfs_put_root(root);
                        }

                        list_del_init(&reloc_root->root_list);
                        /* Don't forget to queue this reloc root for cleanup */
                        list_add_tail(&reloc_root->reloc_dirty_list,
                                      &rc->dirty_subvol_roots);
                }
        }

        /* Something was processed, so check whether more roots were queued. */
        if (found) {
                found = 0;
                goto again;
        }
out:
        if (ret) {
                btrfs_handle_fs_error(fs_info, ret, NULL);
                free_reloc_roots(&reloc_roots);

                /* new reloc root may be added */
                mutex_lock(&fs_info->reloc_mutex);
                list_splice_init(&rc->reloc_roots, &reloc_roots);
                mutex_unlock(&fs_info->reloc_mutex);
                free_reloc_roots(&reloc_roots);
        }

        /*
         * We used to have
         *
         * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
         *
         * here, but it's wrong.  If we fail to start the transaction in
         * prepare_to_merge() we will have only 0 ref reloc roots, none of which
         * have actually been removed from the reloc_root_tree rb tree.  This is
         * fine because we're bailing here, and we hold a reference on the root
         * for the list that holds it, so these roots will be cleaned up when we
         * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
         * will be cleaned up on unmount.
         *
         * The remaining nodes will be cleaned up by free_reloc_control.
         */
}

/*
 * Empty the rb-tree @blocks: every node is erased from the tree and the
 * struct tree_block embedding it is kfree()d.
 */
static void free_block_list(struct rb_root *blocks)
{
        struct rb_node *cur;

        for (cur = rb_first(blocks); cur; cur = rb_first(blocks)) {
                struct tree_block *entry;

                entry = rb_entry(cur, struct tree_block, rb_node);
                rb_erase(cur, blocks);
                kfree(entry);
        }
}

/*
 * Record the fs root that owns @reloc_root in transaction @trans.
 *
 * Nothing is done when reloc_root->last_trans already equals trans->transid.
 * Otherwise the fs root is looked up through reloc_root->root_key.offset and
 * passed to btrfs_record_root_in_trans().
 *
 * Return 0 on success, the lookup error if the fs root cannot be found,
 * -EUCLEAN if the fs root does not point back at @reloc_root, or whatever
 * btrfs_record_root_in_trans() returns.
 */
static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *reloc_root)
{
        struct btrfs_fs_info *fs_info = reloc_root->fs_info;
        struct btrfs_root *fs_root;
        int ret;

        if (reloc_root->last_trans == trans->transid)
                return 0;

        /*
         * The lookup is expected to succeed: a reloc root only exists after
         * its fs root has been looked up.  With a corrupted filesystem we may
         * still hold a reloc root without a matching fs root, in which case
         * the lookup can fail (e.g. with ENOENT).
         */
        fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
        if (IS_ERR(fs_root)) {
                ASSERT(0);
                return PTR_ERR(fs_root);
        }

        if (fs_root->reloc_root == reloc_root) {
                ret = btrfs_record_root_in_trans(trans, fs_root);
        } else {
                ASSERT(0);
                btrfs_err(fs_info,
                          "root %llu has two reloc roots associated with it",
                          reloc_root->root_key.offset);
                ret = -EUCLEAN;
        }

        /* Drop the reference obtained by the lookup on both paths. */
        btrfs_put_root(fs_root);
        return ret;
}

/*
 * Pick the reloc root through which the block described by @node should be
 * COWed, and set up rc->backref_cache.path for that walk.
 *
 * Starting from @node, backref edges are followed with walk_up_backref() (the
 * edges taken are stored in @edges).  If the node reached belongs to a reloc
 * tree, that root is recorded in the transaction and selected.  Otherwise the
 * owning root is recorded in the transaction and its ->reloc_root is used; the
 * backref node is then re-pointed at the reloc root, unless its ->new_bytenr
 * already equals the reloc root's node start, in which case a warning is
 * emitted and walk_down_backref() supplies another path to try.
 *
 * Return the selected root, or an ERR_PTR():
 *   -EUCLEAN  a path has no root at its top, ends in a non-shareable root, or
 *             the backref node was unexpectedly already modified
 *   -ENOENT   root->reloc_root is not set, or no usable root was found
 *   other     errors from (btrfs_)record_(reloc_)root_in_trans()
 */
static noinline_for_stack
struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
                                     struct reloc_control *rc,
                                     struct btrfs_backref_node *node,
                                     struct btrfs_backref_edge *edges[])
{
        struct btrfs_backref_node *next;
        struct btrfs_root *root;
        int index = 0;
        int ret;

        next = node;
        while (1) {
                cond_resched();
                next = walk_up_backref(next, edges, &index);
                root = next->root;

                /*
                 * If there is no root, then our references for this block are
                 * incomplete, as we should be able to walk all the way up to a
                 * block that is owned by a root.
                 *
                 * This path is only for SHAREABLE roots, so if we come upon a
                 * non-SHAREABLE root then we have backrefs that resolve
                 * improperly.
                 *
                 * Both of these cases indicate file system corruption, or a bug
                 * in the backref walking code.
                 */
                if (!root) {
                        ASSERT(0);
                        btrfs_err(trans->fs_info,
                "bytenr %llu doesn't have a backref path ending in a root",
                                  node->bytenr);
                        return ERR_PTR(-EUCLEAN);
                }
                if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
                        ASSERT(0);
                        btrfs_err(trans->fs_info,
        "bytenr %llu has multiple refs with one ending in a non-shareable root",
                                  node->bytenr);
                        return ERR_PTR(-EUCLEAN);
                }

                /* The path already ends in a reloc tree: use it as is. */
                if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
                        ret = record_reloc_root_in_trans(trans, root);
                        if (ret)
                                return ERR_PTR(ret);
                        break;
                }

                /* Not a reloc tree: switch to the root's ->reloc_root. */
                ret = btrfs_record_root_in_trans(trans, root);
                if (ret)
                        return ERR_PTR(ret);
                root = root->reloc_root;

                /*
                 * We could have raced with another thread which failed, so
                 * root->reloc_root may not be set, return ENOENT in this case.
                 */
                if (!root)
                        return ERR_PTR(-ENOENT);

                if (next->new_bytenr != root->node->start) {
                        /*
                         * We just created the reloc root, so we shouldn't have
                         * ->new_bytenr set and this shouldn't be in the changed
                         *  list.  If it is then we have multiple roots pointing
                         *  at the same bytenr which indicates corruption, or
                         *  we've made a mistake in the backref walking code.
                         */
                        ASSERT(next->new_bytenr == 0);
                        ASSERT(list_empty(&next->list));
                        if (next->new_bytenr || !list_empty(&next->list)) {
                                btrfs_err(trans->fs_info,
        "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
                                          node->bytenr, next->bytenr);
                                return ERR_PTR(-EUCLEAN);
                        }

                        /*
                         * Re-point the backref node at the reloc root: swap
                         * the root reference it holds and queue it on the
                         * cache's changed list.
                         */
                        next->new_bytenr = root->node->start;
                        btrfs_put_root(next->root);
                        next->root = btrfs_grab_root(root);
                        ASSERT(next->root);
                        list_add_tail(&next->list,
                                      &rc->backref_cache.changed);
                        mark_block_processed(rc, next);
                        break;
                }

                /*
                 * The backref node already records the reloc root's node
                 * start as its new bytenr.  Warn, discard this candidate and
                 * continue from the node walk_down_backref() hands back.
                 */
                WARN_ON(1);
                root = NULL;
                next = walk_down_backref(edges, &index);
                if (!next || next->level <= node->level)
                        break;
        }
        if (!root) {
                /*
                 * This can happen if there's fs corruption or if there's a bug
                 * in the backref lookup code.
                 */
                ASSERT(0);
                return ERR_PTR(-ENOENT);
        }

        next = node;
        /* setup backref node path for btrfs_reloc_cow_block */
        while (1) {
                rc->backref_cache.path[next->level] = next;
                if (--index < 0)
                        break;
                next = edges[index]->node[UPPER];
        }
        return root;
}

/*
 * Select a tree root for relocating the block described by @node.
 *
 * Each iteration inspects the root of the node returned by
 * walk_up_backref():
 *
 *  - no root on that node:      return ERR_PTR(-EUCLEAN)
 *  - non-shareable root:        return that root
 *  - shareable root on a node
 *    other than @node:          return NULL, the caller should use
 *                               do_relocation() in this case
 *  - shareable root on @node:   remember it unless it is a reloc tree root
 *                               and continue with walk_down_backref()
 *
 * Once the walk is exhausted, return the remembered root, or
 * ERR_PTR(-ENOENT) if only reloc tree roots were seen.
 */
static noinline_for_stack
struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
{
        struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
        struct btrfs_backref_node *cur = node;
        struct btrfs_root *candidate = NULL;
        int index = 0;

        for (;;) {
                struct btrfs_root *owner;

                cond_resched();
                cur = walk_up_backref(cur, edges, &index);
                owner = cur->root;

                /*
                 * Incomplete extent refs along this path leave the top node
                 * without a root; report that as corruption.
                 */
                if (!owner)
                        return ERR_PTR(-EUCLEAN);

                /* A non-shareable tree leaves no other choice. */
                if (!test_bit(BTRFS_ROOT_SHAREABLE, &owner->state))
                        return owner;

                if (btrfs_root_id(owner) != BTRFS_TREE_RELOC_OBJECTID)
                        candidate = owner;

                if (cur != node)
                        return NULL;

                cur = walk_down_backref(edges, &index);
                if (!cur || cur->level <= node->level)
                        break;
        }

        if (candidate)
                return candidate;
        return ERR_PTR(-ENOENT);
}

/*
 * Sum up fs_info->nodesize for every not yet processed backref node reached
 * from @node by following upper edges.
 *
 * Each climb stops at a processed node or at a node without upper edges;
 * walk_down_backref() then supplies the next node to continue from, and the
 * walk ends when it returns NULL.  @node itself must not be processed yet.
 */
static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
                                                  struct btrfs_backref_node *node)
{
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
        struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
        struct btrfs_backref_node *cur = node;
        u64 total = 0;
        int index = 0;

        BUG_ON(node->processed);

        while (cur) {
                cond_resched();

                /* Climb as long as the current node still needs processing. */
                while (!cur->processed) {
                        struct btrfs_backref_edge *up;

                        total += fs_info->nodesize;

                        if (list_empty(&cur->upper))
                                break;

                        up = list_entry(cur->upper.next,
                                        struct btrfs_backref_edge, list[LOWER]);
                        edges[index++] = up;
                        cur = up->node[UPPER];
                }

                cur = walk_down_backref(edges, &index);
        }

        return total;
}

/*
 * Refill rc->block_rsv with twice the size computed by calcu_metadata_size()
 * for @node and make it the block reserve of @trans.
 *
 * Return 0 on success.  If the refill fails, rc->block_rsv->size is enlarged
 * and -EAGAIN is returned so the caller can drop the transaction and retry.
 */
static int reserve_metadata_space(struct btrfs_trans_handle *trans,
                                  struct reloc_control *rc,
                                  struct btrfs_backref_node *node)
{
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
        const u64 wanted = calcu_metadata_size(rc, node) * 2;
        u64 new_size;

        trans->block_rsv = rc->block_rsv;
        rc->reserved_bytes += wanted;

        /*
         * We run inside a transaction, so only limited flushing is possible.
         * A failure here is turned into -EAGAIN below, telling the caller to
         * end the transaction and refill where full flushing is allowed.
         */
        if (!btrfs_block_rsv_refill(fs_info, rc->block_rsv, wanted,
                                    BTRFS_RESERVE_FLUSH_LIMIT))
                return 0;

        /* Keep doubling until the size exceeds the total asked for so far. */
        new_size = fs_info->nodesize * RELOCATION_RESERVED_NODES;
        while (new_size <= rc->reserved_bytes)
                new_size <<= 1;

        /*
         * Only one thread can touch block_rsv at this point, hence no lock is
         * taken.  The reservation size is grown so that the retry has enough
         * room for relocation and ENOSPC is detected earlier.
         */
        rc->block_rsv->size = new_size + fs_info->nodesize *
                              RELOCATION_RESERVED_NODES;
        return -EAGAIN;
}

/*
 * Relocate a tree block, and then update the pointers in the upper level
 * blocks that reference the block so they point to the new location.
 *
 * If called by link_to_upper() (@lowest == 0), the block has already been
 * relocated; in that case this function just updates pointers.
 *
 * @trans:  transaction handle
 * @rc:     relocation control, provides the backref cache
 * @node:   backref node of the block being handled
 * @key:    key used to locate the block's slot in each upper block
 * @path:   scratch path, its lowest_level is set on entry and reset to 0 on
 *          exit
 * @lowest: non-zero when this is the first time @node is processed
 *
 * Return 0 on success or a negative errno.  Iteration over the upper edges
 * stops at the first error.
 */
static int do_relocation(struct btrfs_trans_handle *trans,
                         struct reloc_control *rc,
                         struct btrfs_backref_node *node,
                         struct btrfs_key *key,
                         struct btrfs_path *path, int lowest)
{
        struct btrfs_backref_node *upper;
        struct btrfs_backref_edge *edge;
        struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
        struct btrfs_root *root;
        struct extent_buffer *eb;
        u32 blocksize;
        u64 bytenr;
        int slot;
        int ret = 0;

        /*
         * If we are lowest then this is the first time we're processing this
         * block, and thus shouldn't have an eb associated with it yet.
         */
        ASSERT(!lowest || !node->eb);

        path->lowest_level = node->level + 1;
        rc->backref_cache.path[node->level] = node;
        /* Handle every upper block that references @node. */
        list_for_each_entry(edge, &node->upper, list[LOWER]) {
                cond_resched();

                upper = edge->node[UPPER];
                root = select_reloc_root(trans, rc, upper, edges);
                if (IS_ERR(root)) {
                        ret = PTR_ERR(root);
                        goto next;
                }

                /*
                 * The upper node has a cached but unlocked buffer.  When only
                 * linking, skip this edge if the slot already points at the
                 * relocated block; otherwise drop the cached buffer so it is
                 * looked up again below.
                 */
                if (upper->eb && !upper->locked) {
                        if (!lowest) {
                                ret = btrfs_bin_search(upper->eb, 0, key, &slot);
                                if (ret < 0)
                                        goto next;
                                BUG_ON(ret);
                                bytenr = btrfs_node_blockptr(upper->eb, slot);
                                if (node->eb->start == bytenr)
                                        goto next;
                        }
                        btrfs_backref_drop_node_buffer(upper);
                }

                if (!upper->eb) {
                        /* Search with cow=1 down to path->lowest_level. */
                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
                        if (ret) {
                                if (ret > 0)
                                        ret = -ENOENT;

                                btrfs_release_path(path);
                                break;
                        }

                        /*
                         * Take over the buffer found at the upper level from
                         * the path, unless upper->eb got set during the
                         * search, in which case it must be the same buffer.
                         */
                        if (!upper->eb) {
                                upper->eb = path->nodes[upper->level];
                                path->nodes[upper->level] = NULL;
                        } else {
                                BUG_ON(upper->eb != path->nodes[upper->level]);
                        }

                        /*
                         * Clear the path's lock record for this level so that
                         * btrfs_release_path() does not unlock the buffer now
                         * tracked as locked by the upper node.
                         */
                        upper->locked = 1;
                        path->locks[upper->level] = 0;

                        slot = path->slots[upper->level];
                        btrfs_release_path(path);
                } else {
                        ret = btrfs_bin_search(upper->eb, 0, key, &slot);
                        if (ret < 0)
                                goto next;
                        BUG_ON(ret);
                }

                bytenr = btrfs_node_blockptr(upper->eb, slot);
                if (lowest) {
                        /* First pass: the slot must still reference the old block. */
                        if (bytenr != node->bytenr) {
                                btrfs_err(root->fs_info,
                "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
                                          bytenr, node->bytenr, slot,
                                          upper->eb->start);
                                ret = -EIO;
                                goto next;
                        }
                } else {
                        /* Already linked to the relocated block, nothing to do. */
                        if (node->eb->start == bytenr)
                                goto next;
                }

                blocksize = root->fs_info->nodesize;
                eb = btrfs_read_node_slot(upper->eb, slot);
                if (IS_ERR(eb)) {
                        ret = PTR_ERR(eb);
                        goto next;
                }
                btrfs_tree_lock(eb);

                if (!node->eb) {
                        /* The block has not been relocated yet: COW it now. */
                        ret = btrfs_cow_block(trans, root, eb, upper->eb,
                                              slot, &eb, BTRFS_NESTING_COW);
                        btrfs_tree_unlock(eb);
                        free_extent_buffer(eb);
                        if (ret < 0)
                                goto next;
                        /*
                         * We've just COWed this block, it should have updated
                         * the correct backref node entry.
                         */
                        ASSERT(node->eb == eb);
                } else {
                        /*
                         * The block was relocated earlier: point the upper
                         * block's slot at the new copy, add a reference for
                         * it and drop the subtree rooted at the old block.
                         *
                         * NOTE(review): eb is neither unlocked nor freed in
                         * this branch; presumably btrfs_drop_subtree() takes
                         * over the lock and reference - confirm, in
                         * particular for the btrfs_inc_extent_ref() failure
                         * case where it is not called.
                         */
                        struct btrfs_ref ref = {
                                .action = BTRFS_ADD_DELAYED_REF,
                                .bytenr = node->eb->start,
                                .num_bytes = blocksize,
                                .parent = upper->eb->start,
                                .owning_root = btrfs_header_owner(upper->eb),
                                .ref_root = btrfs_header_owner(upper->eb),
                        };

                        btrfs_set_node_blockptr(upper->eb, slot,
                                                node->eb->start);
                        btrfs_set_node_ptr_generation(upper->eb, slot,
                                                      trans->transid);
                        btrfs_mark_buffer_dirty(trans, upper->eb);

                        btrfs_init_tree_ref(&ref, node->level,
                                            btrfs_root_id(root), false);
                        ret = btrfs_inc_extent_ref(trans, &ref);
                        if (!ret)
                                ret = btrfs_drop_subtree(trans, root, eb,
                                                         upper->eb);
                        if (ret)
                                btrfs_abort_transaction(trans, ret);
                }
next:
                /*
                 * Upper nodes that are not pending have their buffer dropped;
                 * pending ones only have it unlocked.
                 */
                if (!upper->pending)
                        btrfs_backref_drop_node_buffer(upper);
                else
                        btrfs_backref_unlock_node_buffer(upper);
                if (ret)
                        break;
        }

        /* All upper links updated: move a pending node to the changed list. */
        if (!ret && node->pending) {
                btrfs_backref_drop_node_buffer(node);
                list_move_tail(&node->list, &rc->backref_cache.changed);
                node->pending = 0;
        }

        path->lowest_level = 0;

        /*
         * We should have allocated all of our space in the block rsv and thus
         * shouldn't ENOSPC.
         */
        ASSERT(ret != -ENOSPC);
        return ret;
}

static int link_to_upper(struct btrfs_trans_handle *trans,
                         struct reloc_control *rc,
                         struct btrfs_backref_node *node,
                         struct btrfs_path *path)
{
        struct btrfs_key key;

        btrfs_node_key_to_cpu(node->eb, &key, 0);
        return do_relocation(trans, rc, node, &key, path, 0);
}

static int finish_pending_nodes(struct btrfs_trans_handle *trans,
                                struct reloc_control *rc,
                                struct btrfs_path *path, int err)
{
        LIST_HEAD(list);
        struct btrfs_backref_cache *cache = &rc->backref_cache;
        struct btrfs_backref_node *node;
        int level;
        int ret;

        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
                while (!list_empty(&cache->pending[level])) {
                        node = list_entry(cache->pending[level].next,
                                          struct btrfs_backref_node, list);
                        list_move_tail(&node->list, &list);
                        BUG_ON(!node->pending);

                        if (!err) {
                                ret = link_to_upper(trans, rc, node, path);
                                if (ret < 0)
                                        err = ret;
                        }
                }
                list_splice_init(&list, &cache->pending[level]);
        }
        return err;
}

/*
 * Mark a block, and all blocks that directly or indirectly reference it, as
 * processed.
 *
 * Starting at @node, upper edges are followed and mark_block_processed() is
 * called on every node that is not processed yet.  A climb stops at an
 * already processed node or at a node without upper edges;
 * walk_down_backref() then supplies the next node to continue from, and the
 * walk ends when it returns NULL.
 */
static void update_processed_blocks(struct reloc_control *rc,
                                    struct btrfs_backref_node *node)
{
        struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
        struct btrfs_backref_node *cur = node;
        int index = 0;

        while (cur) {
                cond_resched();

                while (!cur->processed) {
                        struct btrfs_backref_edge *up;

                        mark_block_processed(rc, cur);

                        if (list_empty(&cur->upper))
                                break;

                        up = list_entry(cur->upper.next,
                                        struct btrfs_backref_edge, list[LOWER]);
                        edges[index++] = up;
                        cur = up->node[UPPER];
                }

                cur = walk_down_backref(edges, &index);
        }
}

/*
 * Check rc->processed_blocks for the nodesize-long range starting at @bytenr.
 *
 * Return 1 if test_range_bit() reports EXTENT_DIRTY for that range, 0
 * otherwise.
 */
static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
{
        const u32 nodesize = rc->extent_root->fs_info->nodesize;
        const u64 last = bytenr + nodesize - 1;

        return test_range_bit(&rc->processed_blocks, bytenr, last,
                              EXTENT_DIRTY, NULL) ? 1 : 0;
}

/*
 * Read the tree block described by @block and store its first key in
 * block->key, then set block->key_ready.
 *
 * The block is read with a parent check built from block->level,
 * block->owner and block->key.offset (passed as the expected transid).
 *
 * Return 0 on success, the read_tree_block() error, or -EIO if the buffer is
 * not uptodate after the read.
 */
static int get_tree_block_key(struct btrfs_fs_info *fs_info,
                              struct tree_block *block)
{
        struct btrfs_tree_parent_check check = {
                .level = block->level,
                .owner_root = block->owner,
                .transid = block->key.offset
        };
        struct extent_buffer *buf;
        int ret = 0;

        buf = read_tree_block(fs_info, block->bytenr, &check);
        if (IS_ERR(buf))
                return PTR_ERR(buf);

        if (!extent_buffer_uptodate(buf)) {
                ret = -EIO;
                goto release;
        }

        /* Leaves store item keys, interior nodes store node keys. */
        if (block->level == 0)
                btrfs_item_key_to_cpu(buf, &block->key, 0);
        else
                btrfs_node_key_to_cpu(buf, &block->key, 0);
        block->key_ready = true;

release:
        free_extent_buffer(buf);
        return ret;
}

/*
 * Helper function to relocate a single tree block.
 *
 * @trans: transaction handle
 * @rc:    relocation control
 * @node:  backref node of the block, may be NULL in which case nothing is done
 * @key:   first key of the block
 * @path:  scratch path
 *
 * After reserving metadata space, select_one_root() decides how the block is
 * handled:
 *  - a shareable root: the backref node is re-pointed at the root's reloc
 *    root and queued on the changed list;
 *  - a non-shareable root: the block is COWed with btrfs_search_slot();
 *  - NULL: do_relocation() is called with lowest == 1;
 *  - -ENOENT: the block is only marked as processed.
 *
 * The backref node is cleaned up on error, for level 0 nodes and for cowonly
 * nodes.
 *
 * Return 0 on success or a negative errno (-EAGAIN when the metadata
 * reservation could not be refilled).
 */
static int relocate_tree_block(struct btrfs_trans_handle *trans,
                                struct reloc_control *rc,
                                struct btrfs_backref_node *node,
                                struct btrfs_key *key,
                                struct btrfs_path *path)
{
        struct btrfs_root *root;
        int ret = 0;

        if (!node)
                return 0;

        /*
         * If we fail here we want to drop our backref_node because we are going
         * to start over and regenerate the tree for it.
         */
        ret = reserve_metadata_space(trans, rc, node);
        if (ret)
                goto out;

        BUG_ON(node->processed);
        root = select_one_root(node);
        if (IS_ERR(root)) {
                ret = PTR_ERR(root);

                /* See explanation in select_one_root for the -EUCLEAN case. */
                ASSERT(ret == -ENOENT);
                if (ret == -ENOENT) {
                        /* Not an error: just mark the block as processed. */
                        ret = 0;
                        update_processed_blocks(rc, node);
                }
                goto out;
        }

        if (root) {
                if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
                        /*
                         * This block was the root block of a root, and this is
                         * the first time we're processing the block and thus it
                         * should not have had the ->new_bytenr modified and
                         * should have not been included on the changed list.
                         *
                         * However in the case of corruption we could have
                         * multiple refs pointing to the same block improperly,
                         * and thus we would trip over these checks.  ASSERT()
                         * for the developer case, because it could indicate a
                         * bug in the backref code, however error out for a
                         * normal user in the case of corruption.
                         */
                        ASSERT(node->new_bytenr == 0);
                        ASSERT(list_empty(&node->list));
                        if (node->new_bytenr || !list_empty(&node->list)) {
                                btrfs_err(root->fs_info,
                                  "bytenr %llu has improper references to it",
                                          node->bytenr);
                                ret = -EUCLEAN;
                                goto out;
                        }
                        ret = btrfs_record_root_in_trans(trans, root);
                        if (ret)
                                goto out;
                        /*
                         * Another thread could have failed, need to check if we
                         * have reloc_root actually set.
                         */
                        if (!root->reloc_root) {
                                ret = -ENOENT;
                                goto out;
                        }
                        /*
                         * Re-point the backref node at the reloc root: swap
                         * the root reference it holds and queue it on the
                         * cache's changed list.
                         */
                        root = root->reloc_root;
                        node->new_bytenr = root->node->start;
                        btrfs_put_root(node->root);
                        node->root = btrfs_grab_root(root);
                        ASSERT(node->root);
                        list_add_tail(&node->list, &rc->backref_cache.changed);
                } else {
                        /*
                         * Non-shareable tree: search with cow=1 down to the
                         * block's level.  Chunk root searches are bracketed
                         * by the chunk metadata reserve/release calls.
                         */
                        path->lowest_level = node->level;
                        if (root == root->fs_info->chunk_root)
                                btrfs_reserve_chunk_metadata(trans, false);
                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
                        btrfs_release_path(path);
                        if (root == root->fs_info->chunk_root)
                                btrfs_trans_release_chunk_metadata(trans);
                        /* A positive result (key not found) is not an error. */
                        if (ret > 0)
                                ret = 0;
                }
                if (!ret)
                        update_processed_blocks(rc, node);
        } else {
                ret = do_relocation(trans, rc, node, key, path, 1);
        }
out:
        if (ret || node->level == 0 || node->cowonly)
                btrfs_backref_cleanup_node(&rc->backref_cache, node);
        return ret;
}

/*
 * relocate a list of blocks
 */
static noinline_for_stack
int relocate_tree_blocks(struct btrfs_trans_handle *trans,
                         struct reloc_control *rc, struct rb_root *blocks)
{
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
        struct btrfs_backref_node *node;
        struct btrfs_path *path;
        struct tree_block *block;
        struct tree_block *next;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out_free_blocks;
        }

        /* Kick in readahead for tree blocks with missing keys */
        rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
                if (!block->key_ready)
                        btrfs_readahead_tree_block(fs_info, block->bytenr,
                                                   block->owner, 0,
                                                   block->level);
        }

        /* Get first keys */
        rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
                if (!block->key_ready) {
                        ret = get_tree_block_key(fs_info, block);
                        if (ret)
                                goto out_free_path;
                }
        }

        /* Do tree relocation */
        rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
                node = build_backref_tree(trans, rc, &block->key,
                                          block->level, block->bytenr);
                if (IS_ERR(node)) {
                        ret = PTR_ERR(node);
                        goto out;
                }

                ret = relocate_tree_block(trans, rc, node, &block->key,
                                          path);
                if (ret < 0)
                        break;
        }
out:
        ret = finish_pending_nodes(trans, rc, path, ret);

out_free_path:
        btrfs_free_path(path);
out_free_blocks:
        free_block_list(blocks);
        return ret;
}

/*
 * Reserve data space for @cluster and preallocate one file range per cluster
 * boundary in the data relocation @inode.
 *
 * File offsets are obtained by subtracting inode->index_cnt from the cluster's
 * values.  Data space for the whole cluster is reserved up front; whatever
 * lies past the last attempted range is released again before returning.
 *
 * Returns 0 on success, or the first error reported by the writeback, the
 * space reservation or the preallocation.
 */
static noinline_for_stack int prealloc_file_extent_cluster(
                                struct btrfs_inode *inode,
                                const struct file_extent_cluster *cluster)
{
        u64 alloc_hint = 0;
        u64 start;
        u64 end;
        u64 offset = inode->index_cnt;
        u64 num_bytes;
        int nr;
        int ret = 0;
        u64 i_size = i_size_read(&inode->vfs_inode);
        u64 prealloc_start = cluster->start - offset;
        u64 prealloc_end = cluster->end - offset;
        u64 cur_offset = prealloc_start;

        /*
         * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
         * This means the range [i_size, PAGE_END + 1) is filled with zeros by
         * btrfs_do_readpage() call of previously relocated file cluster.
         *
         * If the current cluster starts in the above range, btrfs_do_readpage()
         * will skip the read, and relocate_one_folio() will later writeback
         * the padding zeros as new data, causing data corruption.
         *
         * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
         */
        if (!PAGE_ALIGNED(i_size)) {
                struct address_space *mapping = inode->vfs_inode.i_mapping;
                struct btrfs_fs_info *fs_info = inode->root->fs_info;
                const u32 sectorsize = fs_info->sectorsize;
                struct folio *folio;

                ASSERT(sectorsize < PAGE_SIZE);
                ASSERT(IS_ALIGNED(i_size, sectorsize));

                /*
                 * Subpage can't handle page with DIRTY but without UPTODATE
                 * bit as it can lead to the following deadlock:
                 *
                 * btrfs_read_folio()
                 * | Page already *locked*
                 * |- btrfs_lock_and_flush_ordered_range()
                 *    |- btrfs_start_ordered_extent()
                 *       |- extent_write_cache_pages()
                 *          |- lock_page()
                 *             We try to lock the page we already hold.
                 *
                 * Here we just writeback the whole data reloc inode, so that
                 * we will be ensured to have no dirty range in the page, and
                 * are safe to clear the uptodate bits.
                 *
                 * This shouldn't cause too much overhead, as we need to write
                 * the data back anyway.
                 */
                ret = filemap_write_and_wait(mapping);
                if (ret < 0)
                        return ret;

                /* Drop the uptodate state from i_size up to the end of its page. */
                clear_extent_bits(&inode->io_tree, i_size,
                                  round_up(i_size, PAGE_SIZE) - 1,
                                  EXTENT_UPTODATE);
                folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT);
                /*
                 * If page is freed we don't need to do anything then, as we
                 * will re-read the whole page anyway.
                 */
                if (!IS_ERR(folio)) {
                        btrfs_subpage_clear_uptodate(fs_info, folio, i_size,
                                        round_up(i_size, PAGE_SIZE) - i_size);
                        folio_unlock(folio);
                        folio_put(folio);
                }
        }

        BUG_ON(cluster->start != cluster->boundary[0]);
        /* Reserve data space for the whole cluster in one go. */
        ret = btrfs_alloc_data_chunk_ondemand(inode,
                                              prealloc_end + 1 - prealloc_start);
        if (ret)
                return ret;

        btrfs_inode_lock(inode, 0);
        for (nr = 0; nr < cluster->nr; nr++) {
                struct extent_state *cached_state = NULL;

                /* Each range runs from its boundary to just before the next one. */
                start = cluster->boundary[nr] - offset;
                if (nr + 1 < cluster->nr)
                        end = cluster->boundary[nr + 1] - 1 - offset;
                else
                        end = cluster->end - offset;

                lock_extent(&inode->io_tree, start, end, &cached_state);
                num_bytes = end + 1 - start;
                ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
                                                num_bytes, num_bytes,
                                                end + 1, &alloc_hint);
                /*
                 * Advanced before ret is checked, so the release below never
                 * covers the range that was just attempted, even if it failed.
                 * NOTE(review): presumably btrfs_prealloc_file_range() handles
                 * the reservation of a failed range itself - confirm.
                 */
                cur_offset = end + 1;
                unlock_extent(&inode->io_tree, start, end, &cached_state);
                if (ret)
                        break;
        }
        btrfs_inode_unlock(inode, 0);

        /* Give back the reservation for ranges that were never attempted. */
        if (cur_offset < prealloc_end)
                btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
                                               prealloc_end + 1 - cur_offset);
        return ret;
}

/*
 * Install a pinned extent map covering the file range [@start, @end] of the
 * data relocation inode, pointing at disk location @block_start.
 *
 * The extent range is locked only around the replacement of the mapping.
 *
 * Returns 0 on success, -ENOMEM if the extent map cannot be allocated, or the
 * result of btrfs_replace_extent_map_range().
 */
static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
                                u64 start, u64 end, u64 block_start)
{
        struct btrfs_inode *reloc_inode = BTRFS_I(inode);
        struct extent_state *cached = NULL;
        struct extent_map *map = alloc_extent_map();
        int ret;

        if (!map)
                return -ENOMEM;

        /* One mapping for the whole range, same length on disk and in file. */
        map->start = start;
        map->len = end + 1 - start;
        map->block_len = map->len;
        map->block_start = block_start;
        map->flags |= EXTENT_FLAG_PINNED;

        lock_extent(&reloc_inode->io_tree, start, end, &cached);
        ret = btrfs_replace_extent_map_range(reloc_inode, map, false);
        unlock_extent(&reloc_inode->io_tree, start, end, &cached);

        /* Drop our own reference to the map. */
        free_extent_map(map);
        return ret;
}

/*
 * Report whether the running balance/relocation should stop.
 *
 * Returns 1 if a balance cancel request or a relocation cancel request is
 * pending, or if the current task has a fatal signal pending; 0 otherwise.
 *
 * Kept out of line and registered for error injection so that cancellation
 * of balance/relocation can be tested.
 */
noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info)
{
        if (atomic_read(&fs_info->balance_cancel_req))
                return 1;

        if (atomic_read(&fs_info->reloc_cancel_req))
                return 1;

        return fatal_signal_pending(current) ? 1 : 0;
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);

/*
 * Return the inclusive end of extent number @cluster_nr inside @cluster.
 *
 * Every extent but the last one ends one byte before the start of the next
 * boundary; the last extent ends at the cluster end.
 */
static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
                                    int cluster_nr)
{
        /* Not the last extent: it stops right before the next boundary. */
        if (cluster_nr < cluster->nr - 1)
                return cluster->boundary[cluster_nr + 1] - 1;

        /* Last extent, the cluster end is the extent end. */
        return cluster->end;
}

static int relocate_one_folio(struct inode *inode, struct file_ra_state *ra,
                              const struct file_extent_cluster *cluster,
                              int *cluster_nr, unsigned long index)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        u64 offset = BTRFS_I(inode)->index_cnt;
        const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
        struct folio *folio;
        u64 folio_start;
        u64 folio_end;
        u64 cur;
        int ret;

        ASSERT(index <= last_index);
        folio = filemap_lock_folio(inode->i_mapping, index);
        if (IS_ERR(folio)) {
                page_cache_sync_readahead(inode->i_mapping, ra, NULL,
                                          index, last_index + 1 - index);
                folio = __filemap_get_folio(inode->i_mapping, index,
                                            FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
                if (IS_ERR(folio))
                        return PTR_ERR(folio);
        }

        WARN_ON(folio_order(folio));

        if (folio_test_readahead(folio))
                page_cache_async_readahead(inode->i_mapping, ra, NULL,
                                           folio, index,
                                           last_index + 1 - index);

        if (!folio_test_uptodate(folio)) {
                btrfs_read_folio(NULL, folio);
                folio_lock(folio);
                if (!folio_test_uptodate(folio)) {
                        ret = -EIO;
                        goto release_folio;
                }
        }

        /*
         * We could have lost folio private when we dropped the lock to read the
         * folio above, make sure we set_page_extent_mapped here so we have any
         * of the subpage blocksize stuff we need in place.
         */
        ret = set_folio_extent_mapped(folio);
        if (ret < 0)
                goto release_folio;

        folio_start = folio_pos(folio);
        folio_end = folio_start + PAGE_SIZE - 1;

        /*
         * Start from the cluster, as for subpage case, the cluster can start
         * inside the folio.
         */
        cur = max(folio_start, cluster->boundary[*cluster_nr] - offset);
        while (cur <= folio_end) {
                struct extent_state *cached_state = NULL;
                u64 extent_start = cluster->boundary[*cluster_nr] - offset;
                u64 extent_end = get_cluster_boundary_end(cluster,
                                                *cluster_nr) - offset;
                u64 clamped_start = max(folio_start, extent_start);
                u64 clamped_end = min(folio_end, extent_end);
                u32 clamped_len = clamped_end + 1 - clamped_start;

                /* Reserve metadata for this range */
                ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
                                                      clamped_len, clamped_len,
                                                      false);
                if (ret)
                        goto release_folio;

                /* Mark the range delalloc and dirty for later writeback */
                lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
                            &cached_state);
                ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
                                                clamped_end, 0, &cached_state);
                if (ret) {
                        clear_extent_bit(&BTRFS_I(inode)->io_tree,
                                         clamped_start, clamped_end,
                                         EXTENT_LOCKED | EXTENT_BOUNDARY,
                                         &cached_state);
                        btrfs_delalloc_release_metadata(BTRFS_I(inode),
                                                        clamped_len, true);
                        btrfs_delalloc_release_extents(BTRFS_I(inode),
                                                       clamped_len);
                        goto release_folio;
                }
                btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len);

                /*
                 * Set the boundary if it's inside the folio.
                 * Data relocation requires the destination extents to have the
                 * same size as the source.
                 * EXTENT_BOUNDARY bit prevents current extent from being merged
                 * with previous extent.
                 */
                if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) {
                        u64 boundary_start = cluster->boundary[*cluster_nr] -
                                                offset;
                        u64 boundary_end = boundary_start +
                                           fs_info->sectorsize - 1;

                        set_extent_bit(&BTRFS_I(inode)->io_tree,
                                       boundary_start, boundary_end,
                                       EXTENT_BOUNDARY, NULL);
                }
                unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
                              &cached_state);
                btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
                cur += clamped_len;

                /* Crossed extent end, go to next extent */
                if (cur >= extent_end) {
                        (*cluster_nr)++;
                        /* Just finished the last extent of the cluster, exit. */
                        if (*cluster_nr >= cluster->nr)
                                break;
                }
        }
        folio_unlock(folio);
        folio_put(folio);

        balance_dirty_pages_ratelimited(inode->i_mapping);
        btrfs_throttle(fs_info);
        if (btrfs_should_cancel_balance(fs_info))
                ret = -ECANCELED;
        return ret;

release_folio:
        folio_unlock(folio);
        folio_put(folio);
        return ret;
}

/*
 * Relocate all data extents collected in @cluster through the data
 * relocation @inode.
 *
 * The file ranges are preallocated, a pinned extent mapping is installed for
 * the whole cluster, and then every folio covering the cluster is handed to
 * relocate_one_folio().  An empty cluster is a no-op.
 *
 * Returns 0 on success or the first error encountered.
 */
static int relocate_file_extent_cluster(struct inode *inode,
                                        const struct file_extent_cluster *cluster)
{
        u64 offset = BTRFS_I(inode)->index_cnt;
        struct file_ra_state *ra_state;
        unsigned long cur_index;
        unsigned long end_index;
        int extent_nr = 0;
        int ret = 0;

        if (!cluster->nr)
                return 0;

        ra_state = kzalloc(sizeof(*ra_state), GFP_NOFS);
        if (!ra_state)
                return -ENOMEM;

        ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
        if (ret)
                goto free_ra;

        file_ra_state_init(ra_state, inode->i_mapping);

        ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
                                   cluster->end - offset, cluster->start);
        if (ret)
                goto free_ra;

        /* Walk every folio index covered by the cluster, stop on first error. */
        cur_index = (cluster->start - offset) >> PAGE_SHIFT;
        end_index = (cluster->end - offset) >> PAGE_SHIFT;
        while (cur_index <= end_index) {
                ret = relocate_one_folio(inode, ra_state, cluster, &extent_nr,
                                         cur_index);
                if (ret)
                        break;
                cur_index++;
        }

        /* On success every extent of the cluster must have been consumed. */
        if (!ret)
                WARN_ON(extent_nr != cluster->nr);
free_ra:
        kfree(ra_state);
        return ret;
}

/*
 * Queue the data extent described by @extent_key into @cluster, relocating
 * the cluster first whenever the new extent cannot be appended to it.
 *
 * A pending cluster is flushed when the new extent is not directly adjacent to
 * its end, when the extent belongs to a different owning root, and once the
 * cluster has reached MAX_EXTENTS entries.
 *
 * Returns 0 on success or the error from relocate_file_extent_cluster().
 */
static noinline_for_stack int relocate_data_extent(struct inode *inode,
                                const struct btrfs_key *extent_key,
                                struct file_extent_cluster *cluster)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;

        if (cluster->nr > 0) {
                if (extent_key->objectid != cluster->end + 1) {
                        /* Not contiguous with the cluster, flush it first. */
                        ret = relocate_file_extent_cluster(inode, cluster);
                        if (ret)
                                return ret;
                        cluster->nr = 0;
                } else if (cluster->owning_root != root->relocation_src_root) {
                        /*
                         * Under simple quotas, root->relocation_src_root is
                         * set when the extent is found.  Adjacent extents
                         * with different owners can't be merged while
                         * relocating, so the cluster remembers the owning
                         * root that started it and a mismatch breaks cluster
                         * formation, just like a non-adjacent extent does.
                         *
                         * Without simple quotas, relocation_src_root is always
                         * 0, so no mismatch should ever be seen and clusters
                         * are unaffected.
                         */
                        const u64 extent_src_root = root->relocation_src_root;

                        /*
                         * root->relocation_src_root is the state that actually
                         * affects the preallocation done for the flush, so
                         * point it at the root owning the pending cluster.
                         */
                        root->relocation_src_root = cluster->owning_root;
                        ret = relocate_file_extent_cluster(inode, cluster);
                        if (ret)
                                return ret;
                        cluster->nr = 0;
                        /* Back to the owning root of the current extent. */
                        root->relocation_src_root = extent_src_root;
                }
        }

        if (cluster->nr) {
                BUG_ON(cluster->nr >= MAX_EXTENTS);
        } else {
                /* First extent of a fresh cluster. */
                cluster->start = extent_key->objectid;
                cluster->owning_root = root->relocation_src_root;
        }

        /* Append the extent. */
        cluster->end = extent_key->objectid + extent_key->offset - 1;
        cluster->boundary[cluster->nr] = extent_key->objectid;
        cluster->nr++;

        /* A full cluster is relocated right away. */
        if (cluster->nr >= MAX_EXTENTS) {
                ret = relocate_file_extent_cluster(inode, cluster);
                if (ret)
                        return ret;
                cluster->nr = 0;
        }
        return 0;
}

/*
 * Helper to add a tree block to the list.
 *
 * The extent item that @path currently points at is parsed to obtain the
 * level and generation of the block and, when it can be determined, its
 * owner.  A new tree_block keyed by the extent's bytenr is then inserted into
 * @blocks with key_ready cleared.
 *
 * @path is released on the paths that reach btrfs_release_path() below.
 * NOTE(review): the -EINVAL return for an invalid inline ref type happens
 * before that; presumably the caller releases or frees the path - confirm.
 *
 * Returns 0 on success, -EINVAL for an invalid inline ref type, -EUCLEAN for
 * an unrecognized item layout, or -ENOMEM.
 */
static int add_tree_block(struct reloc_control *rc,
                          const struct btrfs_key *extent_key,
                          struct btrfs_path *path,
                          struct rb_root *blocks)
{
        struct extent_buffer *eb;
        struct btrfs_extent_item *ei;
        struct btrfs_tree_block_info *bi;
        struct tree_block *block;
        struct rb_node *rb_node;
        u32 item_size;
        int level = -1;
        u64 generation;
        u64 owner = 0;

        eb =  path->nodes[0];
        item_size = btrfs_item_size(eb, path->slots[0]);

        /*
         * Accept METADATA_ITEM keys, and EXTENT_ITEM keys whose item is large
         * enough to carry a tree block info after the extent item.
         */
        if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
            item_size >= sizeof(*ei) + sizeof(*bi)) {
                unsigned long ptr = 0, end;

                ei = btrfs_item_ptr(eb, path->slots[0],
                                struct btrfs_extent_item);
                end = (unsigned long)ei + item_size;
                if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
                        /* Level is stored in the tree block info after the item. */
                        bi = (struct btrfs_tree_block_info *)(ei + 1);
                        level = btrfs_tree_block_level(eb, bi);
                        ptr = (unsigned long)(bi + 1);
                } else {
                        /* For METADATA_ITEM the key offset is the level. */
                        level = (int)extent_key->offset;
                        ptr = (unsigned long)(ei + 1);
                }
                generation = btrfs_extent_generation(eb, ei);

                /*
                 * We're reading random blocks without knowing their owner ahead
                 * of time.  This is ok most of the time, as all reloc roots and
                 * fs roots have the same lock type.  However normal trees do
                 * not, and the only way to know ahead of time is to read the
                 * inline ref offset.  We know it's an fs root if
                 *
                 * 1. There's more than one ref.
                 * 2. There's a SHARED_DATA_REF_KEY set.
                 * 3. FULL_BACKREF is set on the flags.
                 *
                 * Otherwise it's safe to assume that the ref offset == the
                 * owner of this block, so we can use that when calling
                 * read_tree_block.
                 */
                if (btrfs_extent_refs(eb, ei) == 1 &&
                    !(btrfs_extent_flags(eb, ei) &
                      BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
                    ptr < end) {
                        struct btrfs_extent_inline_ref *iref;
                        int type;

                        /* ptr < end: an inline ref follows the parsed headers. */
                        iref = (struct btrfs_extent_inline_ref *)ptr;
                        type = btrfs_get_extent_inline_ref_type(eb, iref,
                                                        BTRFS_REF_TYPE_BLOCK);
                        if (type == BTRFS_REF_TYPE_INVALID)
                                return -EINVAL;
                        if (type == BTRFS_TREE_BLOCK_REF_KEY)
                                owner = btrfs_extent_inline_ref_offset(eb, iref);
                }
        } else {
                btrfs_print_leaf(eb);
                btrfs_err(rc->block_group->fs_info,
                          "unrecognized tree backref at tree block %llu slot %u",
                          eb->start, path->slots[0]);
                btrfs_release_path(path);
                return -EUCLEAN;
        }

        btrfs_release_path(path);

        BUG_ON(level == -1);

        block = kmalloc(sizeof(*block), GFP_NOFS);
        if (!block)
                return -ENOMEM;

        block->bytenr = extent_key->objectid;
        /*
         * The key is not the block's real first key yet (key_ready is false):
         * objectid holds the node size and offset holds the generation.
         * NOTE(review): presumably consumed when the key is read later by
         * get_tree_block_key() - confirm.
         */
        block->key.objectid = rc->extent_root->fs_info->nodesize;
        block->key.offset = generation;
        block->level = level;
        block->key_ready = false;
        block->owner = owner;

        /* A non-NULL return means the bytenr was already in the tree. */
        rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
        if (rb_node)
                btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
                                    -EEXIST);

        return 0;
}

/*
 * Helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY.
 *
 * Looks up the extent item of the tree block at @bytenr in the commit root of
 * the extent tree and hands it to add_tree_block().  Blocks that were already
 * processed, or that are already present in @blocks, are skipped.
 *
 * With the SKINNY_METADATA feature a METADATA_ITEM is searched first; if no
 * matching item is found the search is repeated for a regular EXTENT_ITEM
 * whose offset is @blocksize.
 *
 * Returns 0 on success or if the block was skipped, -ENOMEM, -EINVAL when no
 * extent item exists for the block, or an error from the search or from
 * add_tree_block().
 */
static int __add_tree_block(struct reloc_control *rc,
                            u64 bytenr, u32 blocksize,
                            struct rb_root *blocks)
{
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;
        bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

        if (tree_block_processed(bytenr, rc))
                return 0;

        if (rb_simple_search(blocks, bytenr))
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
again:
        key.objectid = bytenr;
        if (skinny) {
                /* Highest possible offset, so the match is the previous slot. */
                key.type = BTRFS_METADATA_ITEM_KEY;
                key.offset = (u64)-1;
        } else {
                key.type = BTRFS_EXTENT_ITEM_KEY;
                key.offset = blocksize;
        }

        /* Read-only lookup in the commit root, without taking tree locks. */
        path->search_commit_root = 1;
        path->skip_locking = 1;
        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        if (ret > 0 && skinny) {
                if (path->slots[0]) {
                        /* Check whether the previous item describes this block. */
                        path->slots[0]--;
                        btrfs_item_key_to_cpu(path->nodes[0], &key,
                                              path->slots[0]);
                        if (key.objectid == bytenr &&
                            (key.type == BTRFS_METADATA_ITEM_KEY ||
                             (key.type == BTRFS_EXTENT_ITEM_KEY &&
                              key.offset == blocksize)))
                                ret = 0;
                }

                if (ret) {
                        /* Nothing found, retry with a regular extent item key. */
                        skinny = false;
                        btrfs_release_path(path);
                        goto again;
                }
        }
        if (ret) {
                ASSERT(ret == 1);
                btrfs_print_leaf(path->nodes[0]);
                btrfs_err(fs_info,
             "tree block extent item (%llu) is not found in extent tree",
                     bytenr);
                WARN_ON(1);
                ret = -EINVAL;
                goto out;
        }

        ret = add_tree_block(rc, &key, path, blocks);
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Truncate the free space cache inode of @block_group.
 *
 * @inode may be passed in directly; when it is NULL the inode numbered @ino
 * is looked up in the tree root.  Either way the inode reference is dropped
 * with iput() before returning, except when the lookup itself fails.
 *
 * Returns 0 on success, -ENOENT if the inode lookup fails, or the error from
 * the space check, the transaction join or the truncation.
 */
static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
                                    struct btrfs_block_group *block_group,
                                    struct inode *inode,
                                    u64 ino)
{
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_trans_handle *trans;
        int ret = 0;

        if (!inode) {
                inode = btrfs_iget(fs_info->sb, ino, tree_root);
                if (IS_ERR(inode))
                        return -ENOENT;
        }

        ret = btrfs_check_trunc_cache_free_space(fs_info,
                                                 &fs_info->global_block_rsv);
        if (ret)
                goto put_inode;

        trans = btrfs_join_transaction(tree_root);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto put_inode;
        }

        ret = btrfs_truncate_free_space_cache(trans, block_group, inode);

        /* The transaction is ended whether or not the truncation worked. */
        btrfs_end_transaction(trans);
        btrfs_btree_balance_dirty(fs_info);
put_inode:
        iput(inode);
        return ret;
}

/*
 * Locate the free space cache EXTENT_DATA item in a root tree leaf and delete
 * the cache inode, to avoid the free space cache data extent blocking data
 * relocation.
 *
 * Leaves not owned by the root tree are ignored.  Otherwise the first regular
 * or preallocated file extent whose disk bytenr equals @data_bytenr selects
 * the inode whose cache is truncated.
 *
 * Returns 0 if the leaf is not a root tree leaf, -ENOENT if no matching file
 * extent exists in it, or the result of delete_block_group_cache().
 */
static int delete_v1_space_cache(struct extent_buffer *leaf,
                                 struct btrfs_block_group *block_group,
                                 u64 data_bytenr)
{
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        int slot;

        if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
                return 0;

        for (slot = 0; slot < btrfs_header_nritems(leaf); slot++) {
                u8 extent_type;

                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.type != BTRFS_EXTENT_DATA_KEY)
                        continue;

                fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
                extent_type = btrfs_file_extent_type(leaf, fi);
                if (extent_type != BTRFS_FILE_EXTENT_REG &&
                    extent_type != BTRFS_FILE_EXTENT_PREALLOC)
                        continue;

                if (btrfs_file_extent_disk_bytenr(leaf, fi) != data_bytenr)
                        continue;

                /* First match wins: the key's objectid is the cache inode. */
                return delete_block_group_cache(leaf->fs_info, block_group,
                                                NULL, key.objectid);
        }

        return -ENOENT;
}

/*
 * helper to find all tree blocks that reference a given data extent
 */
static noinline_for_stack int add_data_references(struct reloc_control *rc,
                                                  const struct btrfs_key *extent_key,
                                                  struct btrfs_path *path,
                                                  struct rb_root *blocks)
{
        struct btrfs_backref_walk_ctx ctx = { 0 };
        struct ulist_iterator leaf_uiter;
        struct ulist_node *ref_node = NULL;
        const u32 blocksize = rc->extent_root->fs_info->nodesize;
        int ret = 0;

        btrfs_release_path(path);

        ctx.bytenr = extent_key->objectid;
        ctx.skip_inode_ref_list = true;
        ctx.fs_info = rc->extent_root->fs_info;

        ret = btrfs_find_all_leafs(&ctx);
        if (ret < 0)
                return ret;

        ULIST_ITER_INIT(&leaf_uiter);
        while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) {
                struct btrfs_tree_parent_check check = { 0 };
                struct extent_buffer *eb;

                eb = read_tree_block(ctx.fs_info, ref_node->val, &check);
                if (IS_ERR(eb)) {
                        ret = PTR_ERR(eb);
                        break;
                }
                ret = delete_v1_space_cache(eb, rc->block_group,
                                            extent_key->objectid);
                free_extent_buffer(eb);
                if (ret < 0)
                        break;
                ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
                if (ret < 0)
                        break;
        }
        if (ret < 0)
                free_block_list(blocks);
        ulist_free(ctx.refs);
        return ret;
}

/*
 * Helper to find next unprocessed extent.
 *
 * Searches the commit root of the extent tree, starting at rc->search_start,
 * for the next EXTENT_ITEM or METADATA_ITEM inside the block group that is
 * not covered by an EXTENT_DIRTY range in rc->processed_blocks.
 *
 * On success the key is copied to @extent_key, rc->search_start is moved past
 * the extent, and @path is left pointing at the item (it is not released).
 *
 * Returns 0 when an extent was found, 1 when the end of the block group was
 * reached, or the non-zero result of btrfs_search_slot()/btrfs_next_leaf().
 * @path is released on every non-zero return.
 */
static noinline_for_stack
int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
                     struct btrfs_key *extent_key)
{
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
        struct btrfs_key key;
        struct extent_buffer *leaf;
        u64 start, end, last;
        int ret;

        /* First byte past the block group being relocated. */
        last = rc->block_group->start + rc->block_group->length;
        while (1) {
                bool block_found;

                cond_resched();
                if (rc->search_start >= last) {
                        ret = 1;
                        break;
                }

                key.objectid = rc->search_start;
                key.type = BTRFS_EXTENT_ITEM_KEY;
                key.offset = 0;

                /* Read-only lookup in the commit root, without tree locks. */
                path->search_commit_root = 1;
                path->skip_locking = 1;
                ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
                                        0, 0);
                if (ret < 0)
                        break;
next:
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(rc->extent_root, path);
                        if (ret != 0)
                                break;
                        leaf = path->nodes[0];
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid >= last) {
                        ret = 1;
                        break;
                }

                /* Skip anything that is not an extent or metadata item. */
                if (key.type != BTRFS_EXTENT_ITEM_KEY &&
                    key.type != BTRFS_METADATA_ITEM_KEY) {
                        path->slots[0]++;
                        goto next;
                }

                /* Skip extents that end at or before the search start. */
                if (key.type == BTRFS_EXTENT_ITEM_KEY &&
                    key.objectid + key.offset <= rc->search_start) {
                        path->slots[0]++;
                        goto next;
                }

                /* Same check for metadata items, whose length is nodesize. */
                if (key.type == BTRFS_METADATA_ITEM_KEY &&
                    key.objectid + fs_info->nodesize <=
                    rc->search_start) {
                        path->slots[0]++;
                        goto next;
                }

                block_found = find_first_extent_bit(&rc->processed_blocks,
                                                    key.objectid, &start, &end,
                                                    EXTENT_DIRTY, NULL);

                if (block_found && start <= key.objectid) {
                        /* Already processed, restart the search after the range. */
                        btrfs_release_path(path);
                        rc->search_start = end + 1;
                } else {
                        if (key.type == BTRFS_EXTENT_ITEM_KEY)
                                rc->search_start = key.objectid + key.offset;
                        else
                                rc->search_start = key.objectid +
                                        fs_info->nodesize;
                        memcpy(extent_key, &key, sizeof(key));
                        return 0;
                }
        }
        btrfs_release_path(path);
        return ret;
}

/*
 * Publish @rc as the filesystem's active relocation control, under
 * fs_info->reloc_mutex.
 */
static void set_reloc_control(struct reloc_control *rc)
{
        struct btrfs_fs_info *info = rc->extent_root->fs_info;
        struct mutex *reloc_lock = &info->reloc_mutex;

        mutex_lock(reloc_lock);
        info->reloc_ctl = rc;
        mutex_unlock(reloc_lock);
}

/*
 * Clear the filesystem's active relocation control, under
 * fs_info->reloc_mutex.  @rc is only used to reach the fs_info.
 */
static void unset_reloc_control(struct reloc_control *rc)
{
        struct btrfs_fs_info *info = rc->extent_root->fs_info;
        struct mutex *reloc_lock = &info->reloc_mutex;

        mutex_lock(reloc_lock);
        info->reloc_ctl = NULL;
        mutex_unlock(reloc_lock);
}

static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
        struct btrfs_trans_handle *trans;
        int ret;

        rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
                                              BTRFS_BLOCK_RSV_TEMP);
        if (!rc->block_rsv)
                return -ENOMEM;

        memset(&rc->cluster, 0, sizeof(rc->cluster));
        rc->search_start = rc->block_group->start;
        rc->extents_found = 0;
        rc->nodes_relocated = 0;
        rc->merging_rsv_size = 0;
        rc->reserved_bytes = 0;
        rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
                              RELOCATION_RESERVED_NODES;
        ret = btrfs_block_rsv_refill(rc->extent_root->fs_info,
                                     rc->block_rsv, rc->block_rsv->size,
                                     BTRFS_RESERVE_FLUSH_ALL);
        if (ret)
                return ret;

        rc->create_reloc_tree = true;
        set_reloc_control(rc);

        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans)) {
                unset_reloc_control(rc);
                /*
                 * extent tree is not a ref_cow tree and has no reloc_root to
                 * cleanup.  And callers are responsible to free the above
                 * block rsv.
                 */
                return PTR_ERR(trans);
        }

        ret = btrfs_commit_transaction(trans);
        if (ret)
                unset_reloc_control(rc);

        return ret;
}

/*
 * Run one relocation pass over rc->block_group.
 *
 * After prepare_to_relocate() succeeds, each loop iteration refills
 * rc->block_rsv, starts a transaction and asks find_next_extent() for the next
 * extent item.  Tree blocks (and, in the UPDATE_DATA_PTRS stage, data
 * references) are collected in a local rb-tree and handed to
 * relocate_tree_blocks(); in the MOVE_DATA_EXTENTS stage data extents are
 * passed to relocate_data_extent().  Once the loop ends the function goes
 * through prepare_to_merge() and merge_reloc_roots(), commits a transaction
 * and calls clean_dirty_subvols().
 *
 * Returns the value accumulated in err: 0 when no step reported a failure,
 * otherwise an error code.
 */
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
        struct rb_root blocks = RB_ROOT;
        struct btrfs_key key;
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
        u64 flags;
        int ret;
        int err = 0;
        int progress = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = READA_FORWARD;

        ret = prepare_to_relocate(rc);
        if (ret) {
                /* out_free also releases the block rsv set up by the helper */
                err = ret;
                goto out_free;
        }

        while (1) {
                rc->reserved_bytes = 0;
                ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
                                             rc->block_rsv->size,
                                             BTRFS_RESERVE_FLUSH_ALL);
                if (ret) {
                        err = ret;
                        break;
                }
                /* Counts iterations; checked by the -ENOSPC retry below */
                progress++;
                trans = btrfs_start_transaction(rc->extent_root, 0);
                if (IS_ERR(trans)) {
                        err = PTR_ERR(trans);
                        trans = NULL;
                        break;
                }
                /*
                 * Re-entry point for the -ENOSPC retry after the loop, which
                 * jumps here with the transaction still open.
                 */
restart:
                if (update_backref_cache(trans, &rc->backref_cache)) {
                        btrfs_end_transaction(trans);
                        trans = NULL;
                        continue;
                }

                ret = find_next_extent(rc, path, &key);
                /*
                 * A negative value is recorded as the error; any non-zero
                 * value (including a positive one) ends the loop.
                 */
                if (ret < 0)
                        err = ret;
                if (ret != 0)
                        break;

                rc->extents_found++;

                ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                    struct btrfs_extent_item);
                flags = btrfs_extent_flags(path->nodes[0], ei);

                /*
                 * If we are relocating a simple quota owned extent item, we
                 * need to note the owner on the reloc data root so that when
                 * we allocate the replacement item, we can attribute it to the
                 * correct eventual owner (rather than the reloc data root).
                 */
                if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
                        struct btrfs_root *root = BTRFS_I(rc->data_inode)->root;
                        u64 owning_root_id = btrfs_get_extent_owner_root(fs_info,
                                                                 path->nodes[0],
                                                                 path->slots[0]);

                        root->relocation_src_root = owning_root_id;
                }

                /*
                 * Tree blocks are always queued; data extents are queued only
                 * in the UPDATE_DATA_PTRS stage.  Anything else just drops the
                 * path.
                 */
                if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                        ret = add_tree_block(rc, &key, path, &blocks);
                } else if (rc->stage == UPDATE_DATA_PTRS &&
                           (flags & BTRFS_EXTENT_FLAG_DATA)) {
                        ret = add_data_references(rc, &key, path, &blocks);
                } else {
                        btrfs_release_path(path);
                        ret = 0;
                }
                if (ret < 0) {
                        err = ret;
                        break;
                }

                if (!RB_EMPTY_ROOT(&blocks)) {
                        ret = relocate_tree_blocks(trans, rc, &blocks);
                        if (ret < 0) {
                                if (ret != -EAGAIN) {
                                        err = ret;
                                        break;
                                }
                                /*
                                 * -EAGAIN: undo the count and restart the
                                 * search at this extent so that it is looked
                                 * at again.
                                 */
                                rc->extents_found--;
                                rc->search_start = key.objectid;
                        }
                }

                btrfs_end_transaction_throttle(trans);
                btrfs_btree_balance_dirty(fs_info);
                trans = NULL;

                if (rc->stage == MOVE_DATA_EXTENTS &&
                    (flags & BTRFS_EXTENT_FLAG_DATA)) {
                        rc->found_file_extent = true;
                        ret = relocate_data_extent(rc->data_inode,
                                                   &key, &rc->cluster);
                        if (ret < 0) {
                                err = ret;
                                break;
                        }
                }
                if (btrfs_should_cancel_balance(fs_info)) {
                        err = -ECANCELED;
                        break;
                }
        }
        /*
         * The loop stopped on -ENOSPC with a transaction still open after at
         * least one iteration: try to force a chunk allocation.  If that
         * returns 1, drop the error and jump back into the loop.  progress is
         * reset so the retry is not repeated until another iteration has run.
         */
        if (trans && progress && err == -ENOSPC) {
                ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
                if (ret == 1) {
                        err = 0;
                        progress = 0;
                        goto restart;
                }
        }

        btrfs_release_path(path);
        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);

        /* A transaction is still open only if the loop was left early */
        if (trans) {
                btrfs_end_transaction_throttle(trans);
                btrfs_btree_balance_dirty(fs_info);
        }

        if (!err) {
                ret = relocate_file_extent_cluster(rc->data_inode,
                                                   &rc->cluster);
                if (ret < 0)
                        err = ret;
        }

        rc->create_reloc_tree = false;
        set_reloc_control(rc);

        btrfs_backref_release_cache(&rc->backref_cache);
        btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);

        /*
         * Even in the case when the relocation is cancelled, we should all go
         * through prepare_to_merge() and merge_reloc_roots().
         *
         * For error (including cancelled balance), prepare_to_merge() will
         * mark all reloc trees orphan, then queue them for cleanup in
         * merge_reloc_roots()
         */
        err = prepare_to_merge(rc, err);

        merge_reloc_roots(rc);

        rc->merge_reloc_tree = false;
        unset_reloc_control(rc);
        btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);

        /* get rid of pinned extents */
        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto out_free;
        }
        ret = btrfs_commit_transaction(trans);
        /* Do not overwrite an earlier error with the commit result */
        if (ret && !err)
                err = ret;
out_free:
        ret = clean_dirty_subvols(rc);
        if (ret < 0 && !err)
                err = ret;
        btrfs_free_block_rsv(fs_info, rc->block_rsv);
        btrfs_free_path(path);
        return err;
}

/*
 * Insert an empty inode item for @objectid into @root and initialise it as a
 * zero-sized regular file (mode 0600, generation 1) flagged NOCOMPRESS and
 * PREALLOC.  All other fields of the item are zeroed.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the value
 * reported by btrfs_insert_empty_inode().
 */
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root, u64 objectid)
{
        struct btrfs_path *path = btrfs_alloc_path();
        int ret;

        if (!path)
                return -ENOMEM;

        ret = btrfs_insert_empty_inode(trans, root, path, objectid);
        if (!ret) {
                struct extent_buffer *eb = path->nodes[0];
                struct btrfs_inode_item *ii;

                ii = btrfs_item_ptr(eb, path->slots[0], struct btrfs_inode_item);
                memzero_extent_buffer(eb, (unsigned long)ii, sizeof(*ii));
                btrfs_set_inode_generation(eb, ii, 1);
                btrfs_set_inode_size(eb, ii, 0);
                btrfs_set_inode_mode(eb, ii, S_IFREG | 0600);
                btrfs_set_inode_flags(eb, ii, BTRFS_INODE_NOCOMPRESS |
                                              BTRFS_INODE_PREALLOC);
                btrfs_mark_buffer_dirty(trans, eb);
        }

        btrfs_free_path(path);
        return ret;
}

/*
 * Remove the inode item of @objectid from @root.
 *
 * Nothing is returned; any failure (path allocation, lookup error, item not
 * found, deletion error) aborts @trans with the corresponding error code.
 */
static void delete_orphan_inode(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root, u64 objectid)
{
        struct btrfs_key key = {
                .objectid = objectid,
                .type = BTRFS_INODE_ITEM_KEY,
                .offset = 0,
        };
        struct btrfs_path *path = btrfs_alloc_path();
        int ret;

        if (!path) {
                ret = -ENOMEM;
        } else {
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0)
                        ret = -ENOENT;  /* no exact match for the key */
                else if (ret == 0)
                        ret = btrfs_del_item(trans, root, path);
        }

        if (ret)
                btrfs_abort_transaction(trans, ret);
        btrfs_free_path(path);
}

/*
 * Create the inode used for data relocation of @group.
 *
 * The inode is inserted into the data relocation tree through
 * __insert_orphan_inode() (so its link count is 0) and registered with
 * btrfs_orphan_add().  Its index_cnt is set to the start of @group.
 *
 * Returns the new inode on success or an ERR_PTR() on failure.
 */
static noinline_for_stack struct inode *create_reloc_inode(
                                        struct btrfs_fs_info *fs_info,
                                        const struct btrfs_block_group *group)
{
        struct btrfs_root *reloc_data_root;
        struct btrfs_trans_handle *handle;
        struct inode *new_inode = NULL;
        u64 ino;
        int err;

        reloc_data_root = btrfs_grab_root(fs_info->data_reloc_root);
        handle = btrfs_start_transaction(reloc_data_root, 6);
        if (IS_ERR(handle)) {
                btrfs_put_root(reloc_data_root);
                return ERR_CAST(handle);
        }

        err = btrfs_get_free_objectid(reloc_data_root, &ino);
        if (err)
                goto finish;

        err = __insert_orphan_inode(handle, reloc_data_root, ino);
        if (err)
                goto finish;

        new_inode = btrfs_iget(fs_info->sb, ino, reloc_data_root);
        if (IS_ERR(new_inode)) {
                /* Take the item inserted above back out again. */
                delete_orphan_inode(handle, reloc_data_root, ino);
                err = PTR_ERR(new_inode);
                new_inode = NULL;
                goto finish;
        }
        BTRFS_I(new_inode)->index_cnt = group->start;

        err = btrfs_orphan_add(handle, BTRFS_I(new_inode));
finish:
        btrfs_put_root(reloc_data_root);
        btrfs_end_transaction(handle);
        btrfs_btree_balance_dirty(fs_info);
        if (!err)
                return new_inode;

        /* new_inode is NULL here unless btrfs_orphan_add() failed. */
        iput(new_inode);
        return ERR_PTR(err);
}

/*
 * Mark start of chunk relocation that is cancellable. Check if the cancellation
 * has been requested meanwhile and don't start in that case.
 *
 * Return:
 *   0             success
 *   -EINPROGRESS  operation is already in progress, that's probably a bug
 *   -ECANCELED    cancellation request was set before the operation started
 */
static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
{
        if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
                /* This should not happen */
                btrfs_err(fs_info, "reloc already running, cannot start");
                return -EINPROGRESS;
        }

        if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
                btrfs_info(fs_info, "chunk relocation canceled on start");
                /*
                 * On cancel, clear all requests but let the caller mark
                 * the end after cleanup operations.
                 */
                atomic_set(&fs_info->reloc_cancel_req, 0);
                return -ECANCELED;
        }
        return 0;
}

/*
 * Mark end of chunk relocation that is cancellable and wake any waiters.
 */
static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
{
        /* Requested after start, clear bit first so any waiters can continue */
        if (atomic_read(&fs_info->reloc_cancel_req) > 0)
                btrfs_info(fs_info, "chunk relocation canceled during operation");
        clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
        atomic_set(&fs_info->reloc_cancel_req, 0);
}

/*
 * Allocate a zeroed reloc_control and initialise its list heads, backref
 * cache, reloc root mapping tree (root and lock) and processed_blocks io tree.
 *
 * Returns the new control structure, or NULL if the allocation failed.
 */
static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
{
        struct reloc_control *ctl = kzalloc(sizeof(*ctl), GFP_NOFS);

        if (!ctl)
                return NULL;

        INIT_LIST_HEAD(&ctl->reloc_roots);
        INIT_LIST_HEAD(&ctl->dirty_subvol_roots);
        btrfs_backref_init_cache(fs_info, &ctl->backref_cache, true);

        ctl->reloc_root_tree.rb_root = RB_ROOT;
        spin_lock_init(&ctl->reloc_root_tree.lock);

        extent_io_tree_init(fs_info, &ctl->processed_blocks,
                            IO_TREE_RELOC_BLOCKS);
        return ctl;
}

static void free_reloc_control(struct reloc_control *rc)
{
        struct mapping_node *node, *tmp;

        free_reloc_roots(&rc->reloc_roots);
        rbtree_postorder_for_each_entry_safe(node, tmp,
                        &rc->reloc_root_tree.rb_root, rb_node)
                kfree(node);

        kfree(rc);
}

/*
 * Log an info message naming the block group that is about to be relocated:
 * its start offset and a textual form of its flags.
 */
static void describe_relocation(struct btrfs_fs_info *fs_info,
                                struct btrfs_block_group *block_group)
{
        char flags_str[128] = { 0 };

        btrfs_describe_block_groups(block_group->flags, flags_str,
                                    sizeof(flags_str));
        btrfs_info(fs_info, "relocating block group %llu flags %s",
                   block_group->start, flags_str);
}

/*
 * Map a relocation stage to the text used in log messages.  Values other than
 * the two known stages yield "unknown".
 */
static const char *stage_to_string(enum reloc_stage stage)
{
        switch (stage) {
        case MOVE_DATA_EXTENTS:
                return "move data extents";
        case UPDATE_DATA_PTRS:
                return "update data pointers";
        default:
                return "unknown";
        }
}

/*
 * Relocate all extents in the block group that starts at @group_start.
 *
 * The block group is looked up and passed to btrfs_inc_block_group_ro(), the
 * inode returned by lookup_free_space_inode() (if any) is handed to
 * delete_block_group_cache(), and a data relocation inode is created.
 * relocate_block_group() is then called repeatedly, each time under
 * fs_info->cleaner_mutex, until a pass finds no extents or an error occurs.
 *
 * Returns 0 on success or an error code, among them: -EINTR when the
 * filesystem is closing, -ENOENT when no block group is found for
 * @group_start, -ETXTBSY when the block group is pinned by a swapfile and
 * -ENOMEM on allocation failure.
 */
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
        struct btrfs_block_group *bg;
        struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
        struct reloc_control *rc;
        struct inode *inode;
        struct btrfs_path *path;
        int ret;
        int rw = 0;
        int err = 0;

        /*
         * This only gets set if we had a half-deleted snapshot on mount.  We
         * cannot allow relocation to start while we're still trying to clean up
         * these pending deletions.
         */
        ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
        if (ret)
                return ret;

        /* We may have been woken up by close_ctree, so bail if we're closing. */
        if (btrfs_fs_closing(fs_info))
                return -EINTR;

        bg = btrfs_lookup_block_group(fs_info, group_start);
        if (!bg)
                return -ENOENT;

        /*
         * Relocation of a data block group creates ordered extents.  Without
         * sb_start_write(), we can freeze the filesystem while unfinished
         * ordered extents are left. Such ordered extents can cause a deadlock
         * e.g. when syncfs() is waiting for their completion but they can't
         * finish because they block when joining a transaction, due to the
         * fact that the freeze locks are being held in write mode.
         */
        if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
                ASSERT(sb_write_started(fs_info->sb));

        if (btrfs_pinned_by_swapfile(fs_info, bg)) {
                btrfs_put_block_group(bg);
                return -ETXTBSY;
        }

        rc = alloc_reloc_control(fs_info);
        if (!rc) {
                btrfs_put_block_group(bg);
                return -ENOMEM;
        }

        /*
         * From here on every exit goes through out_put_bg, which also calls
         * reloc_chunk_end() and frees rc.
         */
        ret = reloc_chunk_start(fs_info);
        if (ret < 0) {
                err = ret;
                goto out_put_bg;
        }

        rc->extent_root = extent_root;
        rc->block_group = bg;

        ret = btrfs_inc_block_group_ro(rc->block_group, true);
        if (ret) {
                err = ret;
                goto out;
        }
        /* Remember to undo the read-only accounting if we fail later on */
        rw = 1;

        path = btrfs_alloc_path();
        if (!path) {
                err = -ENOMEM;
                goto out;
        }

        inode = lookup_free_space_inode(rc->block_group, path);
        btrfs_free_path(path);

        if (!IS_ERR(inode))
                ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
        else
                ret = PTR_ERR(inode);

        /* -ENOENT from either step above is tolerated */
        if (ret && ret != -ENOENT) {
                err = ret;
                goto out;
        }

        rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
        if (IS_ERR(rc->data_inode)) {
                err = PTR_ERR(rc->data_inode);
                /* Keep the iput() at out: from seeing an error pointer */
                rc->data_inode = NULL;
                goto out;
        }

        describe_relocation(fs_info, rc->block_group);

        btrfs_wait_block_group_reservations(rc->block_group);
        btrfs_wait_nocow_writers(rc->block_group);
        btrfs_wait_ordered_roots(fs_info, U64_MAX,
                                 rc->block_group->start,
                                 rc->block_group->length);

        ret = btrfs_zone_finish(rc->block_group);
        WARN_ON(ret && ret != -EAGAIN);

        while (1) {
                enum reloc_stage finishes_stage;

                mutex_lock(&fs_info->cleaner_mutex);
                ret = relocate_block_group(rc);
                mutex_unlock(&fs_info->cleaner_mutex);
                if (ret < 0)
                        err = ret;

                /* Stage of the pass that just ran, for the message below */
                finishes_stage = rc->stage;
                /*
                 * We may have gotten ENOSPC after we already dirtied some
                 * extents.  If writeout happens while we're relocating a
                 * different block group we could end up hitting the
                 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
                 * btrfs_reloc_cow_block.  Make sure we write everything out
                 * properly so we don't trip over this problem, and then break
                 * out of the loop if we hit an error.
                 */
                if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
                        ret = btrfs_wait_ordered_range(rc->data_inode, 0,
                                                       (u64)-1);
                        if (ret)
                                err = ret;
                        invalidate_mapping_pages(rc->data_inode->i_mapping,
                                                 0, -1);
                        rc->stage = UPDATE_DATA_PTRS;
                }

                if (err < 0)
                        goto out;

                /* A pass that found nothing means relocation is complete */
                if (rc->extents_found == 0)
                        break;

                btrfs_info(fs_info, "found %llu extents, stage: %s",
                           rc->extents_found, stage_to_string(finishes_stage));
        }

        WARN_ON(rc->block_group->pinned > 0);
        WARN_ON(rc->block_group->reserved > 0);
        WARN_ON(rc->block_group->used > 0);
out:
        /* Only on failure is the read-only accounting dropped again here */
        if (err && rw)
                btrfs_dec_block_group_ro(rc->block_group);
        iput(rc->data_inode);
out_put_bg:
        btrfs_put_block_group(bg);
        reloc_chunk_end(fs_info);
        free_reloc_control(rc);
        return err;
}

static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        int ret, err;

        trans = btrfs_start_transaction(fs_info->tree_root, 0);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        memset(&root->root_item.drop_progress, 0,
                sizeof(root->root_item.drop_progress));
        btrfs_set_root_drop_level(&root->root_item, 0);
        btrfs_set_root_refs(&root->root_item, 0);
        ret = btrfs_update_root(trans, fs_info->tree_root,
                                &root->root_key, &root->root_item);

        err = btrfs_end_transaction(trans);
        if (err)
                return err;
        return ret;
}

/*
 * Recover a relocation that was interrupted by a system crash.
 *
 * This function resumes merging reloc trees with their corresponding fs
 * trees, which is important for keeping the sharing of tree blocks.
 *
 * It first walks the BTRFS_TREE_RELOC_OBJECTID root items in the tree root
 * from the highest offset downwards and collects the reloc roots on a local
 * list.  If any were found, a reloc_control is set up, each reloc root is
 * attached to its fs root (or queued directly when its ref count is 0), and
 * merge_reloc_roots() is run.  When everything succeeded, orphans in the data
 * relocation tree are cleaned up at the end.
 *
 * Returns 0 on success or an error code.
 */
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
        LIST_HEAD(reloc_roots);
        struct btrfs_key key;
        struct btrfs_root *fs_root;
        struct btrfs_root *reloc_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct reloc_control *rc = NULL;
        struct btrfs_trans_handle *trans;
        int ret;
        int err = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = READA_BACK;

        /* Start from the largest possible offset and iterate backwards */
        key.objectid = BTRFS_TREE_RELOC_OBJECTID;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = (u64)-1;

        while (1) {
                ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
                                        path, 0, 0);
                if (ret < 0) {
                        err = ret;
                        goto out;
                }
                if (ret > 0) {
                        /* No exact match: step back to the previous item */
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                btrfs_release_path(path);

                /* Stop once we have left the range of reloc root items */
                if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
                    key.type != BTRFS_ROOT_ITEM_KEY)
                        break;

                reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
                if (IS_ERR(reloc_root)) {
                        err = PTR_ERR(reloc_root);
                        goto out;
                }

                set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
                list_add(&reloc_root->root_list, &reloc_roots);

                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
                        fs_root = btrfs_get_fs_root(fs_info,
                                        reloc_root->root_key.offset, false);
                        if (IS_ERR(fs_root)) {
                                ret = PTR_ERR(fs_root);
                                if (ret != -ENOENT) {
                                        err = ret;
                                        goto out;
                                }
                                /*
                                 * The fs root this reloc root refers to does
                                 * not exist: mark the reloc root as garbage.
                                 */
                                ret = mark_garbage_root(reloc_root);
                                if (ret < 0) {
                                        err = ret;
                                        goto out;
                                }
                        } else {
                                btrfs_put_root(fs_root);
                        }
                }

                if (key.offset == 0)
                        break;

                /* Continue the search just below the item we found */
                key.offset--;
        }
        btrfs_release_path(path);

        /* Nothing to recover */
        if (list_empty(&reloc_roots))
                goto out;

        rc = alloc_reloc_control(fs_info);
        if (!rc) {
                err = -ENOMEM;
                goto out;
        }

        ret = reloc_chunk_start(fs_info);
        if (ret < 0) {
                err = ret;
                goto out_end;
        }

        rc->extent_root = btrfs_extent_root(fs_info, 0);

        set_reloc_control(rc);

        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto out_unset;
        }

        rc->merge_reloc_tree = true;

        /* Move the collected reloc roots from the local list into rc */
        while (!list_empty(&reloc_roots)) {
                reloc_root = list_entry(reloc_roots.next,
                                        struct btrfs_root, root_list);
                list_del(&reloc_root->root_list);

                /* Roots with no refs are only queued on rc->reloc_roots */
                if (btrfs_root_refs(&reloc_root->root_item) == 0) {
                        list_add_tail(&reloc_root->root_list,
                                      &rc->reloc_roots);
                        continue;
                }

                fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
                                            false);
                if (IS_ERR(fs_root)) {
                        err = PTR_ERR(fs_root);
                        /*
                         * Put the root back on the local list so that
                         * free_reloc_roots() at out: takes care of it.
                         */
                        list_add_tail(&reloc_root->root_list, &reloc_roots);
                        btrfs_end_transaction(trans);
                        goto out_unset;
                }

                err = __add_reloc_root(reloc_root);
                ASSERT(err != -EEXIST);
                if (err) {
                        list_add_tail(&reloc_root->root_list, &reloc_roots);
                        btrfs_put_root(fs_root);
                        btrfs_end_transaction(trans);
                        goto out_unset;
                }
                fs_root->reloc_root = btrfs_grab_root(reloc_root);
                btrfs_put_root(fs_root);
        }

        err = btrfs_commit_transaction(trans);
        if (err)
                goto out_unset;

        merge_reloc_roots(rc);

        unset_reloc_control(rc);

        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto out_clean;
        }
        err = btrfs_commit_transaction(trans);
out_clean:
        ret = clean_dirty_subvols(rc);
        /* Keep an earlier error over the cleanup result */
        if (ret < 0 && !err)
                err = ret;
out_unset:
        unset_reloc_control(rc);
out_end:
        reloc_chunk_end(fs_info);
        free_reloc_control(rc);
out:
        /* Releases whatever is still on the local list */
        free_reloc_roots(&reloc_roots);

        btrfs_free_path(path);

        if (err == 0) {
                /* cleanup orphan inode in data relocation tree */
                fs_root = btrfs_grab_root(fs_info->data_reloc_root);
                ASSERT(fs_root);
                err = btrfs_orphan_cleanup(fs_root);
                btrfs_put_root(fs_root);
        }
        return err;
}

/*
 * helper to add ordered checksum for data relocation.
 *
 * cloning checksum properly handles the nodatasum extents.
 * it also saves CPU time to re-calculate the checksum.
 */
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
        struct btrfs_inode *inode = BTRFS_I(ordered->inode);
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        u64 disk_bytenr = ordered->file_offset + inode->index_cnt;
        struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
        LIST_HEAD(list);
        int ret;

        ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
                                      disk_bytenr + ordered->num_bytes - 1,
                                      &list, false);
        if (ret < 0) {
                btrfs_mark_ordered_extent_error(ordered);
                return ret;
        }

        while (!list_empty(&list)) {
                struct btrfs_ordered_sum *sums =
                        list_entry(list.next, struct btrfs_ordered_sum, list);

                list_del_init(&sums->list);

                /*
                 * We need to offset the new_bytenr based on where the csum is.
                 * We need to do this because we will read in entire prealloc
                 * extents but we may have written to say the middle of the
                 * prealloc extent, so we need to make sure the csum goes with
                 * the right disk offset.
                 *
                 * We can do this because the data reloc inode refers strictly
                 * to the on disk bytes, so we don't have to worry about
                 * disk_len vs real len like with real inodes since it's all
                 * disk length.
                 */
                sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
                btrfs_add_ordered_sum(ordered, sums);
        }

        return 0;
}

/*
 * Relocation hook involving a tree block @buf of @root and its copy @cow
 * (NOTE(review): presumably called from the COW path once @cow has been
 * allocated -- confirm against the caller).
 *
 * Does nothing when no relocation control is installed.  For reloc tree roots
 * while rc->create_reloc_tree is set, the backref cache node for this level is
 * switched over to @cow.  For a first-time COW of a leaf during the
 * UPDATE_DATA_PTRS stage, replace_file_extents() is run on @cow.
 *
 * Returns 0, or the result of replace_file_extents() when that is called.
 */
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          const struct extent_buffer *buf,
                          struct extent_buffer *cow)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct reloc_control *rc;
        struct btrfs_backref_node *node;
        int first_cow = 0;
        int level;
        int ret = 0;

        rc = fs_info->reloc_ctl;
        if (!rc)
                return 0;

        BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));

        level = btrfs_header_level(buf);
        /*
         * The block counts as a first COW when its generation is not newer
         * than the root's last snapshot.
         */
        if (btrfs_header_generation(buf) <=
            btrfs_root_last_snapshot(&root->root_item))
                first_cow = 1;

        if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
                WARN_ON(!first_cow && level == 0);

                /* The cached node at this level must describe @buf */
                node = rc->backref_cache.path[level];
                BUG_ON(node->bytenr != buf->start &&
                       node->new_bytenr != buf->start);

                /* Drop the old buffer and take a reference on the new one */
                btrfs_backref_drop_node_buffer(node);
                atomic_inc(&cow->refs);
                node->eb = cow;
                node->new_bytenr = cow->start;

                if (!node->pending) {
                        list_move_tail(&node->list,
                                       &rc->backref_cache.pending[level]);
                        node->pending = 1;
                }

                if (first_cow)
                        mark_block_processed(rc, node);

                /* Only non-leaf blocks are added to nodes_relocated */
                if (first_cow && level > 0)
                        rc->nodes_relocated += buf->len;
        }

        if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
                ret = replace_file_extents(trans, rc, root, cow);
        return ret;
}

/*
 * Called before a snapshot is created.  Adds to *@bytes_to_reserve the
 * metadata reservation needed for relocating tree blocks in the snapshot.
 *
 * Nothing is added unless a relocation control exists, the snapshotted root
 * has a reloc root and relocation is in the tree merging stage.
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
                              u64 *bytes_to_reserve)
{
        struct btrfs_root *snap_src = pending->root;
        struct reloc_control *rc = snap_src->fs_info->reloc_ctl;

        if (!rc || !have_reloc_root(snap_src) || !rc->merge_reloc_tree)
                return;

        BUG_ON(btrfs_root_refs(&snap_src->reloc_root->root_item) == 0);
        /*
         * Relocation is merging trees.  In the worst case merging a reloc
         * tree uses twice the size of the relocated tree nodes: half for
         * cowing the reloc tree, half for cowing the fs tree.  The space used
         * for cowing the reloc tree is freed once that tree is dropped, but
         * after a snapshot is created, cowing the fs tree may use more space
         * than it frees.  Hence the extra reservation.
         */
        *bytes_to_reserve += rc->nodes_relocated;
}

/*
 * Called after a snapshot is created.  Migrates the block reservation and
 * creates a reloc root for the newly created snapshot.
 *
 * This is similar to btrfs_init_reloc_root(), we come out of here with two
 * references held on the reloc_root, one for root->reloc_root and one for
 * rc->reloc_roots.
 *
 * Returns 0 when there is nothing to do (no relocation control, or the
 * snapshotted root has no reloc root) or on success, otherwise the error from
 * the reservation migration, from creating or adding the reloc root, or from
 * clone_backref_node().
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
                               struct btrfs_pending_snapshot *pending)
{
        struct btrfs_root *root = pending->root;
        struct btrfs_root *reloc_root;
        struct btrfs_root *new_root;
        struct reloc_control *rc = root->fs_info->reloc_ctl;
        int ret;

        if (!rc || !have_reloc_root(root))
                return 0;

        /*
         * Keep using the pointer that was NULL-checked above instead of
         * loading fs_info->reloc_ctl a second time.
         */
        rc->merging_rsv_size += rc->nodes_relocated;

        /* Space is only moved over while reloc trees are being merged */
        if (rc->merge_reloc_tree) {
                ret = btrfs_block_rsv_migrate(&pending->block_rsv,
                                              rc->block_rsv,
                                              rc->nodes_relocated, true);
                if (ret)
                        return ret;
        }

        new_root = pending->snap;
        reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
        if (IS_ERR(reloc_root))
                return PTR_ERR(reloc_root);

        ret = __add_reloc_root(reloc_root);
        ASSERT(ret != -EEXIST);
        if (ret) {
                /* Pairs with create_reloc_root */
                btrfs_put_root(reloc_root);
                return ret;
        }
        new_root->reloc_root = btrfs_grab_root(reloc_root);

        if (rc->create_reloc_tree)
                ret = clone_backref_node(trans, rc, root, reloc_root);
        return ret;
}

/*
 * Report the start bytenr of the block group that is currently being
 * relocated.
 *
 * The caller must hold fs_info->reloc_mutex.
 *
 * Return U64_MAX if no relocation is running.
 */
u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
{
        lockdep_assert_held(&fs_info->reloc_mutex);

        /* No relocation control, or no block group attached to it yet. */
        if (!fs_info->reloc_ctl || !fs_info->reloc_ctl->block_group)
                return U64_MAX;

        return fs_info->reloc_ctl->block_group->start;
}































    1 




    1 



    1 










    1 



    1 


































































































































    1 









    1 












































    1 










    1 



























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/crypto/hooks.c
 *
 * Encryption hooks for higher-level filesystem operations.
 */

#include "fscrypt_private.h"

/**
 * fscrypt_file_open() - prepare to open a possibly-encrypted regular file
 * @inode: the inode being opened
 * @filp: the struct file being set up
 *
 * Currently, an encrypted regular file can only be opened if its encryption key
 * is available; access to the raw encrypted contents is not supported.
 * Therefore, we first set up the inode's encryption key (if not already done)
 * and return an error if it's unavailable.
 *
 * We also verify that if the parent directory (from the path via which the file
 * is being opened) is encrypted, then the inode being opened uses the same
 * encryption policy.  This is needed as part of the enforcement that all files
 * in an encrypted directory tree use the same encryption policy, as a
 * protection against certain types of offline attacks.  Note that this check is
 * needed even when opening an *unencrypted* file, since it's forbidden to have
 * an unencrypted file in an encrypted directory.
 *
 * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
 */
int fscrypt_file_open(struct inode *inode, struct file *filp)
{
        struct dentry *dentry, *parent;
        struct inode *parent_inode;
        bool parent_unencrypted;
        int err;

        /* The file's own key must be set up before anything else. */
        err = fscrypt_require_key(inode);
        if (err)
                return err;

        dentry = file_dentry(filp);

        /*
         * Fast path.  Taking a reference to the parent dentry is required for
         * the real policy comparison but is expensive on multi-core systems,
         * and this function also runs for unencrypted files.  So first peek
         * at the parent in RCU mode: if it is unencrypted, the child may be
         * unencrypted or encrypted with any policy and there is nothing more
         * to verify.
         */
        rcu_read_lock();
        parent = READ_ONCE(dentry->d_parent);
        parent_inode = d_inode_rcu(parent);
        parent_unencrypted = parent_inode != NULL &&
                             !IS_ENCRYPTED(parent_inode);
        rcu_read_unlock();
        if (parent_unencrypted)
                return 0;

        /* Slow path: pin the parent and run the full policy check. */
        parent = dget_parent(dentry);
        if (!fscrypt_has_permitted_context(d_inode(parent), inode)) {
                fscrypt_warn(inode,
                             "Inconsistent encryption context (parent directory: %lu)",
                             d_inode(parent)->i_ino);
                err = -EPERM;
        }
        dput(parent);
        return err;
}
EXPORT_SYMBOL_GPL(fscrypt_file_open);

int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
                           struct dentry *dentry)
{
        /*
         * A no-key name cannot be linked.  Conversely, a dentry that is not a
         * no-key name implies the directory inode's key is available, so that
         * is not checked separately.
         */
        if (fscrypt_is_nokey_name(dentry))
                return -ENOKEY;

        /* The inode must be permitted in the target directory. */
        return fscrypt_has_permitted_context(dir, inode) ? 0 : -EXDEV;
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_link);

int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry,
                             unsigned int flags)
{
        /*
         * Neither name may be a no-key name.  Dentries that are not no-key
         * names imply the directory inodes' keys are available, so those are
         * not checked separately.
         */
        if (fscrypt_is_nokey_name(old_dentry) ||
            fscrypt_is_nokey_name(new_dentry))
                return -ENOKEY;

        /* Staying in the same directory needs no context check. */
        if (old_dir == new_dir)
                return 0;

        /* The source must be permitted in an encrypted destination dir. */
        if (IS_ENCRYPTED(new_dir) &&
            !fscrypt_has_permitted_context(new_dir, d_inode(old_dentry)))
                return -EXDEV;

        /* Only an exchange also moves the destination into old_dir. */
        if (!(flags & RENAME_EXCHANGE))
                return 0;

        if (IS_ENCRYPTED(old_dir) &&
            !fscrypt_has_permitted_context(old_dir, d_inode(new_dentry)))
                return -EXDEV;

        return 0;
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename);

int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
                             struct fscrypt_name *fname)
{
        int err = fscrypt_setup_filename(dir, &dentry->d_name, 1, fname);

        /*
         * The dentry is prepared both on success and on -ENOENT; any other
         * error is returned without touching the dentry.
         */
        if (!err || err == -ENOENT)
                fscrypt_prepare_dentry(dentry, fname->is_nokey_name);

        return err;
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);

/**
 * fscrypt_prepare_lookup_partial() - prepare lookup without filename setup
 * @dir: the encrypted directory being searched
 * @dentry: the dentry being looked up in @dir
 *
 * This function should be used by the ->lookup and ->atomic_open methods of
 * filesystems that handle filename encryption and no-key name encoding
 * themselves and thus can't use fscrypt_prepare_lookup().  Like
 * fscrypt_prepare_lookup(), this will try to set up the directory's encryption
 * key and will set DCACHE_NOKEY_NAME on the dentry if the key is unavailable.
 * However, this function doesn't set up a struct fscrypt_name for the filename.
 *
 * Return: 0 on success; -errno on error.  Note that the encryption key being
 *           unavailable is not considered an error.  It is also not an error if
 *           the encryption policy is unsupported by this kernel; that is treated
 *           like the key being unavailable, so that files can still be deleted.
 */
int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry)
{
        bool is_nokey_name = false;
        int err;

        err = fscrypt_get_encryption_info(dir, true);

        /* The name counts as a no-key name only if setup succeeded keyless. */
        if (!err)
                is_nokey_name = !fscrypt_has_encryption_key(dir);

        /* The dentry is prepared even when setup failed. */
        fscrypt_prepare_dentry(dentry, is_nokey_name);

        return err;
}
EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial);

int __fscrypt_prepare_readdir(struct inode *dir)
{
        /*
         * Set up the directory's encryption info before it is read.
         * NOTE(review): the 'true' argument presumably tolerates an
         * unsupported encryption policy -- confirm against
         * fscrypt_get_encryption_info().
         */
        int err = fscrypt_get_encryption_info(dir, true);

        return err;
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_readdir);

int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr)
{
        /* Only a size change requires the inode's key. */
        if (!(attr->ia_valid & ATTR_SIZE))
                return 0;

        return fscrypt_require_key(d_inode(dentry));
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr);

/**
 * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS
 * @inode: the inode on which flags are being changed
 * @oldflags: the old flags
 * @flags: the new flags
 *
 * The caller should be holding i_rwsem for write.
 *
 * Return: 0 on success; -errno if the flags change isn't allowed or if
 *           another error occurs.
 */
int fscrypt_prepare_setflags(struct inode *inode,
                             unsigned int oldflags, unsigned int flags)
{
        struct fscrypt_inode_info *ci;
        struct fscrypt_master_key *mk;
        int err;

        /*
         * Only one transition needs work here: CASEFOLD being newly set on
         * an encrypted inode.  Everything else is allowed as-is.
         */
        if (!IS_ENCRYPTED(inode) || !(flags & ~oldflags & FS_CASEFOLD_FL))
                return 0;

        /*
         * The secret key needed for the dirhash must now be derived, which
         * is only possible if the directory uses a v2 encryption policy.
         */
        err = fscrypt_require_key(inode);
        if (err)
                return err;

        ci = inode->i_crypt_info;
        if (ci->ci_policy.version != FSCRYPT_POLICY_V2)
                return -EINVAL;

        /* Derive under mk_sem, and only while the master key is present. */
        mk = ci->ci_master_key;
        down_read(&mk->mk_sem);
        err = mk->mk_present ? fscrypt_derive_dirhash_key(ci, mk) : -ENOKEY;
        up_read(&mk->mk_sem);

        return err;
}

/**
 * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink
 * @dir: directory in which the symlink is being created
 * @target: plaintext symlink target
 * @len: length of @target excluding null terminator
 * @max_len: space the filesystem has available to store the symlink target
 * @disk_link: (out) the on-disk symlink target being prepared
 *
 * This function computes the size the symlink target will require on-disk,
 * stores it in @disk_link->len, and validates it against @max_len.  An
 * encrypted symlink may be longer than the original.
 *
 * Additionally, @disk_link->name is set to @target if the symlink will be
 * unencrypted, but left NULL if the symlink will be encrypted.  For encrypted
 * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the
 * on-disk target later.  (The reason for the two-step process is that some
 * filesystems need to know the size of the symlink target before creating the
 * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.)
 *
 * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long,
 * -ENOKEY if the encryption key is missing, or another -errno code if a problem
 * occurred while setting up the encryption key.
 */
int fscrypt_prepare_symlink(struct inode *dir, const char *target,
                            unsigned int len, unsigned int max_len,
                            struct fscrypt_str *disk_link)
{
        /*
         * Bytes an encrypted symlink needs on top of the ciphertext: the
         * length prefix (struct fscrypt_symlink_data) plus the counted null
         * terminator.
         */
        const unsigned int overhead = sizeof(struct fscrypt_symlink_data) + 1;
        const union fscrypt_policy *policy;

        /*
         * To calculate the size of the encrypted symlink target we need to know
         * the amount of NUL padding, which is determined by the flags set in
         * the encryption policy which will be inherited from the directory.
         */
        policy = fscrypt_policy_to_inherit(dir);
        if (policy == NULL) {
                /* Not encrypted */
                disk_link->name = (unsigned char *)target;
                disk_link->len = len + 1;
                if (disk_link->len > max_len)
                        return -ENAMETOOLONG;
                return 0;
        }
        if (IS_ERR(policy))
                return PTR_ERR(policy);

        /*
         * If the filesystem cannot even store the fixed overhead, no target
         * fits.  Reject it here: the unsigned subtraction below would
         * otherwise wrap around and turn the limit into a huge value.
         */
        if (max_len < overhead)
                return -ENAMETOOLONG;

        /*
         * Calculate the size of the encrypted symlink and verify it won't
         * exceed max_len.  Note that for historical reasons, encrypted symlink
         * targets are prefixed with the ciphertext length, despite this
         * actually being redundant with i_size.  This decreases by 2 bytes the
         * longest symlink target we can accept.
         *
         * We could recover 1 byte by not counting a null terminator, but
         * counting it (even though it is meaningless for ciphertext) is simpler
         * for now since filesystems will assume it is there and subtract it.
         */
        if (!__fscrypt_fname_encrypted_size(policy, len, max_len - overhead,
                                            &disk_link->len))
                return -ENAMETOOLONG;
        disk_link->len += overhead;

        /* Encrypted: the on-disk target is produced later, so no name yet. */
        disk_link->name = NULL;
        return 0;
}
EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink);

int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
                              unsigned int len, struct fscrypt_str *disk_link)
{
        int err;
        struct qstr iname = QSTR_INIT(target, len);
        struct fscrypt_symlink_data *sd;
        unsigned int ciphertext_len;

        /*
         * fscrypt_prepare_new_inode() should have already set up the new
         * symlink inode's encryption key.  We don't wait until now to do it,
         * since we may be in a filesystem transaction now.
         */
        if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode)))
                return -ENOKEY;

        if (disk_link->name) {
                /* filesystem-provided buffer */
                sd = (struct fscrypt_symlink_data *)disk_link->name;
        } else {
                sd = kmalloc(disk_link->len, GFP_NOFS);
                if (!sd)
                        return -ENOMEM;
        }
        ciphertext_len = disk_link->len - sizeof(*sd) - 1;
        sd->len = cpu_to_le16(ciphertext_len);

        err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path,
                                    ciphertext_len);
        if (err)
                goto err_free_sd;

        /*
         * Null-terminating the ciphertext doesn't make sense, but we still
         * count the null terminator in the length, so we might as well
         * initialize it just in case the filesystem writes it out.
         */
        sd->encrypted_path[ciphertext_len] = '\0';

        /* Cache the plaintext symlink target for later use by get_link() */
        err = -ENOMEM;
        inode->i_link = kmemdup(target, len + 1, GFP_NOFS);
        if (!inode->i_link)
                goto err_free_sd;

        if (!disk_link->name)
                disk_link->name = (unsigned char *)sd;
        return 0;

err_free_sd:
        if (!disk_link->name)
                kfree(sd);
        return err;
}
EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink);

/**
 * fscrypt_get_symlink() - get the target of an encrypted symlink
 * @inode: the symlink inode
 * @caddr: the on-disk contents of the symlink
 * @max_size: size of @caddr buffer
 * @done: if successful, will be set up to free the returned target if needed
 *
 * If the symlink's encryption key is available, we decrypt its target.
 * Otherwise, we encode its target for presentation.
 *
 * This may sleep, so the filesystem must have dropped out of RCU mode already.
 *
 * Return: the presentable symlink target or an ERR_PTR()
 */
const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
                                unsigned int max_size,
                                struct delayed_call *done)
{
        const struct fscrypt_symlink_data *sd;
        struct fscrypt_str cstr, pstr;
        bool has_key;
        int err;

        /* This is for encrypted symlinks only */
        if (WARN_ON_ONCE(!IS_ENCRYPTED(inode)))
                return ERR_PTR(-EINVAL);

        /* If the decrypted target is already cached, just return it. */
        pstr.name = READ_ONCE(inode->i_link);
        if (pstr.name)
                return pstr.name;

        /*
         * Try to set up the symlink's encryption key, but we can continue
         * regardless of whether the key is available or not.
         */
        err = fscrypt_get_encryption_info(inode, false);
        if (err)
                return ERR_PTR(err);
        /* Remembered so that only targets decrypted with the key are cached. */
        has_key = fscrypt_has_encryption_key(inode);

        /*
         * For historical reasons, encrypted symlink targets are prefixed with
         * the ciphertext length, even though this is redundant with i_size.
         */

        /* The buffer must hold at least the length header plus one byte. */
        if (max_size < sizeof(*sd) + 1)
                return ERR_PTR(-EUCLEAN);
        sd = caddr;
        cstr.name = (unsigned char *)sd->encrypted_path;
        cstr.len = le16_to_cpu(sd->len);

        /* An empty ciphertext is treated as on-disk corruption. */
        if (cstr.len == 0)
                return ERR_PTR(-EUCLEAN);

        /* The stored length must not reach past the end of @caddr. */
        if (cstr.len + sizeof(*sd) > max_size)
                return ERR_PTR(-EUCLEAN);

        err = fscrypt_fname_alloc_buffer(cstr.len, &pstr);
        if (err)
                return ERR_PTR(err);

        /* Fills pstr with the target to present (decrypted, or encoded). */
        err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
        if (err)
                goto err_kfree;

        /* A target that starts with a NUL byte is rejected as corrupted. */
        err = -EUCLEAN;
        if (pstr.name[0] == '\0')
                goto err_kfree;

        /*
         * NOTE(review): this writes one byte past pstr.len, so it assumes
         * fscrypt_fname_alloc_buffer() leaves room for the terminator --
         * confirm against that helper.
         */
        pstr.name[pstr.len] = '\0';

        /*
         * Cache decrypted symlink targets in i_link for later use.  Don't cache
         * symlink targets encoded without the key, since those become outdated
         * once the key is added.  This pairs with the READ_ONCE() above and in
         * the VFS path lookup code.
         *
         * If the target is not cached (no key, or i_link was already set by
         * someone else so the cmpxchg returned non-NULL), this call's buffer
         * is registered with @done so that it is freed via kfree_link.
         */
        if (!has_key ||
            cmpxchg_release(&inode->i_link, NULL, pstr.name) != NULL)
                set_delayed_call(done, kfree_link, pstr.name);

        return pstr.name;

err_kfree:
        kfree(pstr.name);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(fscrypt_get_symlink);

/**
 * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks
 * @path: the path for the encrypted symlink being queried
 * @stat: the struct being filled with the symlink's attributes
 *
 * Override st_size of encrypted symlinks to be the length of the decrypted
 * symlink target (or the no-key encoded symlink target, if the key is
 * unavailable) rather than the length of the encrypted symlink target.  This is
 * necessary for st_size to match the symlink target that userspace actually
 * sees.  POSIX requires this, and some userspace programs depend on it.
 *
 * This requires reading the symlink target from disk if needed, setting up the
 * inode's encryption key if possible, and then decrypting or encoding the
 * symlink target.  This makes lstat() more heavyweight than is normally the
 * case.  However, decrypted symlink targets will be cached in ->i_link, so
 * usually the symlink won't have to be read and decrypted again later if/when
 * it is actually followed, readlink() is called, or lstat() is called again.
 *
 * Return: 0 on success, -errno on failure
 */
int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat)
{
        struct inode *inode = d_inode(path->dentry);
        DEFINE_DELAYED_CALL(done);
        const char *target;

        /*
         * Obtain the target exactly as the VFS does during path resolution
         * and readlink(): use the cached ->i_link when present, otherwise
         * ask ->get_link().  Either way this is the string userspace will
         * see, whether decrypted or no-key encoded.
         */
        target = READ_ONCE(inode->i_link);
        if (target == NULL) {
                target = inode->i_op->get_link(path->dentry, inode, &done);
                if (IS_ERR(target))
                        return PTR_ERR(target);
        }

        /* Report the length of the presented target as the size. */
        stat->size = strlen(target);

        /* Run any cleanup that ->get_link() registered. */
        do_delayed_call(&done);
        return 0;
}
EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr);





































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef __SOUND_PCM_H
#define __SOUND_PCM_H

/*
 *  Digital Audio (PCM) abstract layer
 *  Copyright (c) by Jaroslav Kysela <perex@perex.cz>
 *                   Abramo Bagnara <abramo@alsa-project.org>
 */

#include <sound/asound.h>
#include <sound/memalloc.h>
#include <sound/minors.h>
#include <linux/poll.h>
#include <linux/mm.h>
#include <linux/bitops.h>
#include <linux/pm_qos.h>
#include <linux/refcount.h>
#include <linux/uio.h>

/*
 * Shorthand accessors for the ->private_data member of a PCM substream and
 * of a PCM instance, respectively.  The argument is parenthesized, so any
 * pointer expression may be passed.
 */
#define snd_pcm_substream_chip(substream) ((substream)->private_data)
#define snd_pcm_chip(pcm) ((pcm)->private_data)

#if IS_ENABLED(CONFIG_SND_PCM_OSS)
#include <sound/pcm_oss.h>
#endif

/*
 *  Hardware (lowlevel) section
 */

/*
 * Hardware capability description: info flags, supported formats,
 * subformats and rates, channel-count range, and buffer/period/FIFO
 * size limits.
 */
struct snd_pcm_hardware {
        unsigned int info;                /* SNDRV_PCM_INFO_* */
        u64 formats;                        /* SNDRV_PCM_FMTBIT_* */
        u32 subformats;                        /* for S32_LE, SNDRV_PCM_SUBFMTBIT_* */
        unsigned int rates;                /* SNDRV_PCM_RATE_* */
        unsigned int rate_min;                /* min rate */
        unsigned int rate_max;                /* max rate */
        unsigned int channels_min;        /* min channels */
        unsigned int channels_max;        /* max channels */
        size_t buffer_bytes_max;        /* max buffer size */
        size_t period_bytes_min;        /* min period size */
        size_t period_bytes_max;        /* max period size */
        unsigned int periods_min;        /* min # of periods */
        unsigned int periods_max;        /* max # of periods */
        size_t fifo_size;                /* fifo size in bytes */
};

struct snd_pcm_status64;
struct snd_pcm_substream;

struct snd_pcm_audio_tstamp_config; /* definitions further down */
struct snd_pcm_audio_tstamp_report;

/*
 * Table of callbacks operating on a PCM substream.  Every callback takes
 * the substream as its first argument.
 * NOTE(review): which callbacks are mandatory and in which context they
 * are invoked is not visible in this header - confirm in the PCM core.
 */
struct snd_pcm_ops {
        int (*open)(struct snd_pcm_substream *substream);
        int (*close)(struct snd_pcm_substream *substream);
        int (*ioctl)(struct snd_pcm_substream * substream,
                     unsigned int cmd, void *arg);
        int (*hw_params)(struct snd_pcm_substream *substream,
                         struct snd_pcm_hw_params *params);
        int (*hw_free)(struct snd_pcm_substream *substream);
        int (*prepare)(struct snd_pcm_substream *substream);
        /* cmd is presumably one of SNDRV_PCM_TRIGGER_* (TODO confirm) */
        int (*trigger)(struct snd_pcm_substream *substream, int cmd);
        int (*sync_stop)(struct snd_pcm_substream *substream);
        snd_pcm_uframes_t (*pointer)(struct snd_pcm_substream *substream);
        int (*get_time_info)(struct snd_pcm_substream *substream,
                        struct timespec64 *system_ts, struct timespec64 *audio_ts,
                        struct snd_pcm_audio_tstamp_config *audio_tstamp_config,
                        struct snd_pcm_audio_tstamp_report *audio_tstamp_report);
        int (*fill_silence)(struct snd_pcm_substream *substream, int channel,
                            unsigned long pos, unsigned long bytes);
        int (*copy)(struct snd_pcm_substream *substream, int channel,
                    unsigned long pos, struct iov_iter *iter, unsigned long bytes);
        struct page *(*page)(struct snd_pcm_substream *substream,
                             unsigned long offset);
        int (*mmap)(struct snd_pcm_substream *substream, struct vm_area_struct *vma);
        int (*ack)(struct snd_pcm_substream *substream);
};

/*
 *
 */

/*
 * Number of PCM devices: derived from SNDRV_OS_MINORS when dynamic minors
 * are configured, otherwise fixed at 8.
 */
#if defined(CONFIG_SND_DYNAMIC_MINORS)
#define SNDRV_PCM_DEVICES        (SNDRV_OS_MINORS-2)
#else
#define SNDRV_PCM_DEVICES        8
#endif

/*
 * IOCTL1 command numbers; presumably the cmd values passed to
 * snd_pcm_ops.ioctl (TODO confirm against the PCM core).
 */
#define SNDRV_PCM_IOCTL1_RESET                0
/* 1 is absent slot. */
#define SNDRV_PCM_IOCTL1_CHANNEL_INFO        2
/* 3 is absent slot. */
#define SNDRV_PCM_IOCTL1_FIFO_SIZE        4

/*
 * Trigger command codes; presumably the cmd values passed to
 * snd_pcm_ops.trigger (TODO confirm).  Value 2 is not assigned here.
 */
#define SNDRV_PCM_TRIGGER_STOP                0
#define SNDRV_PCM_TRIGGER_START                1
#define SNDRV_PCM_TRIGGER_PAUSE_PUSH        3
#define SNDRV_PCM_TRIGGER_PAUSE_RELEASE        4
#define SNDRV_PCM_TRIGGER_SUSPEND        5
#define SNDRV_PCM_TRIGGER_RESUME        6
#define SNDRV_PCM_TRIGGER_DRAIN                7

/*
 * All-ones frame position.  NOTE(review): by its name this is a sentinel
 * position reporting an XRUN; its consumers are not visible in this header.
 */
#define SNDRV_PCM_POS_XRUN                ((snd_pcm_uframes_t)-1)

/*
 * Sample-rate bit flags, one bit per discrete rate.
 * If you change this don't forget to change rates[] table in pcm_native.c
 */
#define SNDRV_PCM_RATE_5512                (1U<<0)                /* 5512Hz */
#define SNDRV_PCM_RATE_8000                (1U<<1)                /* 8000Hz */
#define SNDRV_PCM_RATE_11025                (1U<<2)                /* 11025Hz */
#define SNDRV_PCM_RATE_16000                (1U<<3)                /* 16000Hz */
#define SNDRV_PCM_RATE_22050                (1U<<4)                /* 22050Hz */
#define SNDRV_PCM_RATE_32000                (1U<<5)                /* 32000Hz */
#define SNDRV_PCM_RATE_44100                (1U<<6)                /* 44100Hz */
#define SNDRV_PCM_RATE_48000                (1U<<7)                /* 48000Hz */
#define SNDRV_PCM_RATE_64000                (1U<<8)                /* 64000Hz */
#define SNDRV_PCM_RATE_88200                (1U<<9)                /* 88200Hz */
#define SNDRV_PCM_RATE_96000                (1U<<10)        /* 96000Hz */
#define SNDRV_PCM_RATE_176400                (1U<<11)        /* 176400Hz */
#define SNDRV_PCM_RATE_192000                (1U<<12)        /* 192000Hz */
#define SNDRV_PCM_RATE_352800                (1U<<13)        /* 352800Hz */
#define SNDRV_PCM_RATE_384000                (1U<<14)        /* 384000Hz */
#define SNDRV_PCM_RATE_705600                (1U<<15)        /* 705600Hz */
#define SNDRV_PCM_RATE_768000                (1U<<16)        /* 768000Hz */

#define SNDRV_PCM_RATE_CONTINUOUS        (1U<<30)        /* continuous range */
#define SNDRV_PCM_RATE_KNOT                (1U<<31)        /* supports more non-continuous rates */

/*
 * Convenience unions of the discrete rate bits from 8000Hz up to the rate
 * named in the macro.  Each one builds on the previous; none of them
 * includes SNDRV_PCM_RATE_5512.
 */
#define SNDRV_PCM_RATE_8000_44100        (SNDRV_PCM_RATE_8000|SNDRV_PCM_RATE_11025|\
                                         SNDRV_PCM_RATE_16000|SNDRV_PCM_RATE_22050|\
                                         SNDRV_PCM_RATE_32000|SNDRV_PCM_RATE_44100)
#define SNDRV_PCM_RATE_8000_48000        (SNDRV_PCM_RATE_8000_44100|SNDRV_PCM_RATE_48000)
#define SNDRV_PCM_RATE_8000_96000        (SNDRV_PCM_RATE_8000_48000|SNDRV_PCM_RATE_64000|\
                                         SNDRV_PCM_RATE_88200|SNDRV_PCM_RATE_96000)
#define SNDRV_PCM_RATE_8000_192000        (SNDRV_PCM_RATE_8000_96000|SNDRV_PCM_RATE_176400|\
                                         SNDRV_PCM_RATE_192000)
#define SNDRV_PCM_RATE_8000_384000        (SNDRV_PCM_RATE_8000_192000|\
                                         SNDRV_PCM_RATE_352800|\
                                         SNDRV_PCM_RATE_384000)
#define SNDRV_PCM_RATE_8000_768000        (SNDRV_PCM_RATE_8000_384000|\
                                         SNDRV_PCM_RATE_705600|\
                                         SNDRV_PCM_RATE_768000)
/*
 * Format bit masks.  _SNDRV_PCM_FMTBIT(fmt) turns the SNDRV_PCM_FORMAT_<fmt>
 * value into a single-bit 64-bit mask (1ULL << format); the SNDRV_PCM_FMTBIT_*
 * names below are one such mask per format.
 */
#define _SNDRV_PCM_FMTBIT(fmt)                (1ULL << (__force int)SNDRV_PCM_FORMAT_##fmt)
#define SNDRV_PCM_FMTBIT_S8                _SNDRV_PCM_FMTBIT(S8)
#define SNDRV_PCM_FMTBIT_U8                _SNDRV_PCM_FMTBIT(U8)
#define SNDRV_PCM_FMTBIT_S16_LE                _SNDRV_PCM_FMTBIT(S16_LE)
#define SNDRV_PCM_FMTBIT_S16_BE                _SNDRV_PCM_FMTBIT(S16_BE)
#define SNDRV_PCM_FMTBIT_U16_LE                _SNDRV_PCM_FMTBIT(U16_LE)
#define SNDRV_PCM_FMTBIT_U16_BE                _SNDRV_PCM_FMTBIT(U16_BE)
#define SNDRV_PCM_FMTBIT_S24_LE                _SNDRV_PCM_FMTBIT(S24_LE)
#define SNDRV_PCM_FMTBIT_S24_BE                _SNDRV_PCM_FMTBIT(S24_BE)
#define SNDRV_PCM_FMTBIT_U24_LE                _SNDRV_PCM_FMTBIT(U24_LE)
#define SNDRV_PCM_FMTBIT_U24_BE                _SNDRV_PCM_FMTBIT(U24_BE)
// For S32/U32 formats, the 'msbits' hardware parameter is often used to report how many of the
// most significant bits carry valid data. This covers so-called 'left-justified' or
// 'right-padded' samples, whose actual width is less than 32 bits.
#define SNDRV_PCM_FMTBIT_S32_LE                _SNDRV_PCM_FMTBIT(S32_LE)
#define SNDRV_PCM_FMTBIT_S32_BE                _SNDRV_PCM_FMTBIT(S32_BE)
#define SNDRV_PCM_FMTBIT_U32_LE                _SNDRV_PCM_FMTBIT(U32_LE)
#define SNDRV_PCM_FMTBIT_U32_BE                _SNDRV_PCM_FMTBIT(U32_BE)
#define SNDRV_PCM_FMTBIT_FLOAT_LE        _SNDRV_PCM_FMTBIT(FLOAT_LE)
#define SNDRV_PCM_FMTBIT_FLOAT_BE        _SNDRV_PCM_FMTBIT(FLOAT_BE)
#define SNDRV_PCM_FMTBIT_FLOAT64_LE        _SNDRV_PCM_FMTBIT(FLOAT64_LE)
#define SNDRV_PCM_FMTBIT_FLOAT64_BE        _SNDRV_PCM_FMTBIT(FLOAT64_BE)
#define SNDRV_PCM_FMTBIT_IEC958_SUBFRAME_LE _SNDRV_PCM_FMTBIT(IEC958_SUBFRAME_LE)
#define SNDRV_PCM_FMTBIT_IEC958_SUBFRAME_BE _SNDRV_PCM_FMTBIT(IEC958_SUBFRAME_BE)
#define SNDRV_PCM_FMTBIT_MU_LAW                _SNDRV_PCM_FMTBIT(MU_LAW)
#define SNDRV_PCM_FMTBIT_A_LAW                _SNDRV_PCM_FMTBIT(A_LAW)
#define SNDRV_PCM_FMTBIT_IMA_ADPCM        _SNDRV_PCM_FMTBIT(IMA_ADPCM)
#define SNDRV_PCM_FMTBIT_MPEG                _SNDRV_PCM_FMTBIT(MPEG)
#define SNDRV_PCM_FMTBIT_GSM                _SNDRV_PCM_FMTBIT(GSM)
#define SNDRV_PCM_FMTBIT_S20_LE        _SNDRV_PCM_FMTBIT(S20_LE)
#define SNDRV_PCM_FMTBIT_U20_LE        _SNDRV_PCM_FMTBIT(U20_LE)
#define SNDRV_PCM_FMTBIT_S20_BE        _SNDRV_PCM_FMTBIT(S20_BE)
#define SNDRV_PCM_FMTBIT_U20_BE        _SNDRV_PCM_FMTBIT(U20_BE)
#define SNDRV_PCM_FMTBIT_SPECIAL        _SNDRV_PCM_FMTBIT(SPECIAL)
#define SNDRV_PCM_FMTBIT_S24_3LE        _SNDRV_PCM_FMTBIT(S24_3LE)
#define SNDRV_PCM_FMTBIT_U24_3LE        _SNDRV_PCM_FMTBIT(U24_3LE)
#define SNDRV_PCM_FMTBIT_S24_3BE        _SNDRV_PCM_FMTBIT(S24_3BE)
#define SNDRV_PCM_FMTBIT_U24_3BE        _SNDRV_PCM_FMTBIT(U24_3BE)
#define SNDRV_PCM_FMTBIT_S20_3LE        _SNDRV_PCM_FMTBIT(S20_3LE)
#define SNDRV_PCM_FMTBIT_U20_3LE        _SNDRV_PCM_FMTBIT(U20_3LE)
#define SNDRV_PCM_FMTBIT_S20_3BE        _SNDRV_PCM_FMTBIT(S20_3BE)
#define SNDRV_PCM_FMTBIT_U20_3BE        _SNDRV_PCM_FMTBIT(U20_3BE)
#define SNDRV_PCM_FMTBIT_S18_3LE        _SNDRV_PCM_FMTBIT(S18_3LE)
#define SNDRV_PCM_FMTBIT_U18_3LE        _SNDRV_PCM_FMTBIT(U18_3LE)
#define SNDRV_PCM_FMTBIT_S18_3BE        _SNDRV_PCM_FMTBIT(S18_3BE)
#define SNDRV_PCM_FMTBIT_U18_3BE        _SNDRV_PCM_FMTBIT(U18_3BE)
#define SNDRV_PCM_FMTBIT_G723_24        _SNDRV_PCM_FMTBIT(G723_24)
#define SNDRV_PCM_FMTBIT_G723_24_1B        _SNDRV_PCM_FMTBIT(G723_24_1B)
#define SNDRV_PCM_FMTBIT_G723_40        _SNDRV_PCM_FMTBIT(G723_40)
#define SNDRV_PCM_FMTBIT_G723_40_1B        _SNDRV_PCM_FMTBIT(G723_40_1B)
#define SNDRV_PCM_FMTBIT_DSD_U8                _SNDRV_PCM_FMTBIT(DSD_U8)
#define SNDRV_PCM_FMTBIT_DSD_U16_LE        _SNDRV_PCM_FMTBIT(DSD_U16_LE)
#define SNDRV_PCM_FMTBIT_DSD_U32_LE        _SNDRV_PCM_FMTBIT(DSD_U32_LE)
#define SNDRV_PCM_FMTBIT_DSD_U16_BE        _SNDRV_PCM_FMTBIT(DSD_U16_BE)
#define SNDRV_PCM_FMTBIT_DSD_U32_BE        _SNDRV_PCM_FMTBIT(DSD_U32_BE)

/*
 * Endian-neutral aliases: each name without an _LE/_BE suffix maps to the
 * _LE mask when SNDRV_LITTLE_ENDIAN is defined and to the _BE mask when
 * SNDRV_BIG_ENDIAN is defined.  If neither is defined the aliases do not
 * exist.
 */
#ifdef SNDRV_LITTLE_ENDIAN
#define SNDRV_PCM_FMTBIT_S16                SNDRV_PCM_FMTBIT_S16_LE
#define SNDRV_PCM_FMTBIT_U16                SNDRV_PCM_FMTBIT_U16_LE
#define SNDRV_PCM_FMTBIT_S24                SNDRV_PCM_FMTBIT_S24_LE
#define SNDRV_PCM_FMTBIT_U24                SNDRV_PCM_FMTBIT_U24_LE
#define SNDRV_PCM_FMTBIT_S32                SNDRV_PCM_FMTBIT_S32_LE
#define SNDRV_PCM_FMTBIT_U32                SNDRV_PCM_FMTBIT_U32_LE
#define SNDRV_PCM_FMTBIT_FLOAT                SNDRV_PCM_FMTBIT_FLOAT_LE
#define SNDRV_PCM_FMTBIT_FLOAT64        SNDRV_PCM_FMTBIT_FLOAT64_LE
#define SNDRV_PCM_FMTBIT_IEC958_SUBFRAME SNDRV_PCM_FMTBIT_IEC958_SUBFRAME_LE
#define SNDRV_PCM_FMTBIT_S20                SNDRV_PCM_FMTBIT_S20_LE
#define SNDRV_PCM_FMTBIT_U20                SNDRV_PCM_FMTBIT_U20_LE
#endif
#ifdef SNDRV_BIG_ENDIAN
#define SNDRV_PCM_FMTBIT_S16                SNDRV_PCM_FMTBIT_S16_BE
#define SNDRV_PCM_FMTBIT_U16                SNDRV_PCM_FMTBIT_U16_BE
#define SNDRV_PCM_FMTBIT_S24                SNDRV_PCM_FMTBIT_S24_BE
#define SNDRV_PCM_FMTBIT_U24                SNDRV_PCM_FMTBIT_U24_BE
#define SNDRV_PCM_FMTBIT_S32                SNDRV_PCM_FMTBIT_S32_BE
#define SNDRV_PCM_FMTBIT_U32                SNDRV_PCM_FMTBIT_U32_BE
#define SNDRV_PCM_FMTBIT_FLOAT                SNDRV_PCM_FMTBIT_FLOAT_BE
#define SNDRV_PCM_FMTBIT_FLOAT64        SNDRV_PCM_FMTBIT_FLOAT64_BE
#define SNDRV_PCM_FMTBIT_IEC958_SUBFRAME SNDRV_PCM_FMTBIT_IEC958_SUBFRAME_BE
#define SNDRV_PCM_FMTBIT_S20                SNDRV_PCM_FMTBIT_S20_BE
#define SNDRV_PCM_FMTBIT_U20                SNDRV_PCM_FMTBIT_U20_BE
#endif

/*
 * Subformat bit masks, built with BIT() from the SNDRV_PCM_SUBFORMAT_<fmt>
 * values (see the subformats field of struct snd_pcm_hardware).
 */
#define _SNDRV_PCM_SUBFMTBIT(fmt)        BIT((__force int)SNDRV_PCM_SUBFORMAT_##fmt)
#define SNDRV_PCM_SUBFMTBIT_STD                _SNDRV_PCM_SUBFMTBIT(STD)
#define SNDRV_PCM_SUBFMTBIT_MSBITS_MAX        _SNDRV_PCM_SUBFMTBIT(MSBITS_MAX)
#define SNDRV_PCM_SUBFMTBIT_MSBITS_20        _SNDRV_PCM_SUBFMTBIT(MSBITS_20)
#define SNDRV_PCM_SUBFMTBIT_MSBITS_24        _SNDRV_PCM_SUBFMTBIT(MSBITS_24)

/*
 * Per-open-file PCM data: the substream it refers to plus per-file flags.
 * NOTE(review): presumably stored as the file's private data - confirm in
 * the PCM core.
 */
struct snd_pcm_file {
        struct snd_pcm_substream *substream;
        int no_compat_mmap;
        unsigned int user_pversion;        /* supported protocol version */
};

struct snd_pcm_hw_rule;
/* Rule callback: receives the hw_params being refined and the rule itself. */
typedef int (*snd_pcm_hw_rule_func_t)(struct snd_pcm_hw_params *params,
                                      struct snd_pcm_hw_rule *rule);

/*
 * One hardware-parameter constraint rule: a callback with its private data,
 * a target variable and up to five dependency slots.
 * NOTE(review): the encoding of cond/var/deps (parameter indices,
 * terminator value) is not visible here - confirm in the rule-handling code.
 */
struct snd_pcm_hw_rule {
        unsigned int cond;
        int var;
        int deps[5];

        snd_pcm_hw_rule_func_t func;
        void *private;
};

/*
 * Constraint set for hardware parameters: one mask per mask-type parameter
 * and one interval per interval-type parameter (both arrays are indexed
 * relative to the respective FIRST_* parameter, see constrs_mask() and
 * constrs_interval()), plus an array of rules.
 * NOTE(review): rules_num / rules_all look like used / allocated counts of
 * the rules array - confirm in the code that grows it.
 */
struct snd_pcm_hw_constraints {
        struct snd_mask masks[SNDRV_PCM_HW_PARAM_LAST_MASK - 
                         SNDRV_PCM_HW_PARAM_FIRST_MASK + 1];
        struct snd_interval intervals[SNDRV_PCM_HW_PARAM_LAST_INTERVAL -
                             SNDRV_PCM_HW_PARAM_FIRST_INTERVAL + 1];
        unsigned int rules_num;
        unsigned int rules_all;
        struct snd_pcm_hw_rule *rules;
};

/*
 * Look up the constraint mask for mask-type parameter @var.  The masks
 * array is indexed relative to SNDRV_PCM_HW_PARAM_FIRST_MASK; no range
 * check is done on @var.
 */
static inline struct snd_mask *constrs_mask(struct snd_pcm_hw_constraints *constrs,
                                            snd_pcm_hw_param_t var)
{
        return constrs->masks + (var - SNDRV_PCM_HW_PARAM_FIRST_MASK);
}

/*
 * Look up the constraint interval for interval-type parameter @var.  The
 * intervals array is indexed relative to SNDRV_PCM_HW_PARAM_FIRST_INTERVAL;
 * no range check is done on @var.
 */
static inline struct snd_interval *constrs_interval(struct snd_pcm_hw_constraints *constrs,
                                                    snd_pcm_hw_param_t var)
{
        return constrs->intervals + (var - SNDRV_PCM_HW_PARAM_FIRST_INTERVAL);
}

/*
 * Rational description with a fixed numerator and a denominator range
 * (min, max, step).
 */
struct snd_ratnum {
        unsigned int num;
        unsigned int den_min, den_max, den_step;
};

/*
 * Rational description with a numerator range (min, max, step) and a fixed
 * denominator.
 */
struct snd_ratden {
        unsigned int num_min, num_max, num_step;
        unsigned int den;
};

/* Array of nrats snd_ratnum entries. */
struct snd_pcm_hw_constraint_ratnums {
        int nrats;
        const struct snd_ratnum *rats;
};

/* Array of nrats snd_ratden entries. */
struct snd_pcm_hw_constraint_ratdens {
        int nrats;
        const struct snd_ratden *rats;
};

/*
 * List of count discrete values.
 * NOTE(review): the meaning of mask (presumably a bitmask selecting which
 * list entries are enabled) is not visible here - confirm in the users.
 */
struct snd_pcm_hw_constraint_list {
        const unsigned int *list;
        unsigned int count;
        unsigned int mask;
};

/*
 * Array of count intervals.
 * NOTE(review): mask presumably plays the same role as in
 * struct snd_pcm_hw_constraint_list - confirm in the users.
 */
struct snd_pcm_hw_constraint_ranges {
        unsigned int count;
        const struct snd_interval *ranges;
        unsigned int mask;
};

/*
 * Audio timestamp configuration passed from user space to the kernel.
 * The structure is for internal use only and is filled by the dedicated
 * unpack routine, snd_pcm_unpack_audio_tstamp_config().
 */
struct snd_pcm_audio_tstamp_config {
        /* 5 of max 16 bits used */
        u32 type_requested:4;
        u32 report_delay:1; /* add total delay to A/D or D/A */
};

/*
 * Decode the packed timestamp configuration word @data into @config:
 * bits 0-3 carry the requested timestamp type and bit 4 the report_delay
 * flag.  Higher bits of @data are ignored.
 */
static inline void snd_pcm_unpack_audio_tstamp_config(__u32 data,
                                                struct snd_pcm_audio_tstamp_config *config)
{
        const __u32 type_bits = data & 0xF;
        const __u32 delay_bit = (data >> 4) & 1;

        config->type_requested = type_bits;
        config->report_delay = delay_bit;
}

/*
 * Audio timestamp report passed from the kernel to user space.
 * The structure is for internal use only and is read by the dedicated
 * pack routine, snd_pcm_pack_audio_tstamp_report().
 */
struct snd_pcm_audio_tstamp_report {
        /* 6 of max 16 bits used for bit-fields */

        /* for backwards compatibility */
        u32 valid:1;

        /* actual type if hardware could not support requested timestamp */
        u32 actual_type:4;

        /* accuracy represented in ns units */
        u32 accuracy_report:1; /* 0 if accuracy unknown, 1 if accuracy field is valid */
        u32 accuracy; /* up to 4.29s, will be packed in separate field  */
};

/*
 * Encode @report into the upper 16 bits of *@data and store the accuracy
 * value in *@accuracy.
 *
 * Layout of the packed field before it is shifted into the upper half:
 * bit 0 = valid, bits 1-4 = actual_type, bit 5 = accuracy_report.
 * The lower 16 bits of *@data are preserved.
 */
static inline void snd_pcm_pack_audio_tstamp_report(__u32 *data, __u32 *accuracy,
                                                const struct snd_pcm_audio_tstamp_report *report)
{
        const u32 packed = ((u32)report->accuracy_report << 5) |
                           ((u32)report->actual_type << 1) |
                           (u32)report->valid;

        /* keep the low half, replace the high half with the packed report */
        *data = (*data & 0xffff) | (packed << 16);
        *accuracy = report->accuracy;
}


/*
 * Runtime data of a PCM substream (reached via snd_pcm_substream.runtime).
 * Grouped into: stream status, hardware parameters, software parameters,
 * auto-silencer bookkeeping, mmap status/control pointers, locking and
 * wait queues, driver-private data, hardware description and constraints,
 * timer, DMA buffer description and audio timestamp state.
 */
struct snd_pcm_runtime {
        /* -- Status -- */
        snd_pcm_state_t state;                /* stream state */
        snd_pcm_state_t suspended_state; /* suspended stream state */
        struct snd_pcm_substream *trigger_master;
        struct timespec64 trigger_tstamp;        /* trigger timestamp */
        bool trigger_tstamp_latched;     /* trigger timestamp latched in low-level driver/hardware */
        int overrange;
        snd_pcm_uframes_t avail_max;
        snd_pcm_uframes_t hw_ptr_base;        /* Position at buffer restart */
        snd_pcm_uframes_t hw_ptr_interrupt; /* Position at interrupt time */
        unsigned long hw_ptr_jiffies;        /* Time when hw_ptr is updated */
        unsigned long hw_ptr_buffer_jiffies; /* buffer time in jiffies */
        snd_pcm_sframes_t delay;        /* extra delay; typically FIFO size */
        u64 hw_ptr_wrap;                /* offset for hw_ptr due to boundary wrap-around */

        /* -- HW params -- */
        snd_pcm_access_t access;        /* access mode */
        snd_pcm_format_t format;        /* SNDRV_PCM_FORMAT_* */
        snd_pcm_subformat_t subformat;        /* subformat */
        unsigned int rate;                /* rate in Hz */
        unsigned int channels;                /* channels */
        snd_pcm_uframes_t period_size;        /* period size */
        unsigned int periods;                /* periods */
        snd_pcm_uframes_t buffer_size;        /* buffer size */
        snd_pcm_uframes_t min_align;        /* Min alignment for the format */
        size_t byte_align;                /* modulus used by frame_aligned() */
        unsigned int frame_bits;        /* divisor/multiplier used by the bytes<->frames helpers */
        unsigned int sample_bits;        /* divisor/multiplier used by the bytes<->samples helpers */
        unsigned int info;
        unsigned int rate_num;
        unsigned int rate_den;
        unsigned int no_period_wakeup: 1;

        /* -- SW params; see struct snd_pcm_sw_params for comments -- */
        int tstamp_mode;
          unsigned int period_step;
        snd_pcm_uframes_t start_threshold;
        snd_pcm_uframes_t stop_threshold;
        snd_pcm_uframes_t silence_threshold;
        snd_pcm_uframes_t silence_size;
        snd_pcm_uframes_t boundary;        /* wrap-around point used by the avail helpers */

        /* internal data of auto-silencer */
        snd_pcm_uframes_t silence_start; /* starting pointer to silence area */
        snd_pcm_uframes_t silence_filled; /* already filled part of silence area */

        union snd_pcm_sync_id sync;        /* hardware synchronization ID */

        /* -- mmap -- */
        struct snd_pcm_mmap_status *status;
        struct snd_pcm_mmap_control *control;

        /* -- locking / scheduling -- */
        snd_pcm_uframes_t twake;         /* do transfer (!poll) wakeup if non-zero */
        wait_queue_head_t sleep;        /* poll sleep */
        wait_queue_head_t tsleep;        /* transfer sleep */
        struct snd_fasync *fasync;
        bool stop_operating;                /* sync_stop will be called */
        struct mutex buffer_mutex;        /* protect for buffer changes */
        atomic_t buffer_accessing;        /* >0: in r/w operation, <0: blocked */

        /* -- private section -- */
        void *private_data;
        void (*private_free)(struct snd_pcm_runtime *runtime);

        /* -- hardware description -- */
        struct snd_pcm_hardware hw;
        struct snd_pcm_hw_constraints hw_constraints;

        /* -- timer -- */
        unsigned int timer_resolution;        /* timer resolution */
        int tstamp_type;                /* timestamp type */

        /* -- DMA -- */
        unsigned char *dma_area;        /* DMA area */
        dma_addr_t dma_addr;                /* physical bus address (not accessible from main CPU) */
        size_t dma_bytes;                /* size of DMA area */

        struct snd_dma_buffer *dma_buffer_p;        /* allocated buffer */
        unsigned int buffer_changed:1;        /* buffer allocation changed; set only in managed mode */

        /* -- audio timestamp config -- */
        struct snd_pcm_audio_tstamp_config audio_tstamp_config;
        struct snd_pcm_audio_tstamp_report audio_tstamp_report;
        struct timespec64 driver_tstamp;

#if IS_ENABLED(CONFIG_SND_PCM_OSS)
        /* -- OSS things -- */
        struct snd_pcm_oss_runtime oss;
#endif
};

/*
 * Group of linked substreams.  Carries both a spinlock and a mutex plus a
 * reference count.
 * NOTE(review): presumably the spinlock is used for atomic PCMs and the
 * mutex for nonatomic ones - confirm in the stream-lock implementation.
 */
struct snd_pcm_group {                /* keep linked substreams */
        spinlock_t lock;
        struct mutex mutex;
        struct list_head substreams;
        refcount_t refs;
};

struct pid;

/*
 * One PCM substream: back-pointers to the owning PCM and stream, the
 * driver callbacks, the runtime data, the timer, the linkage to other
 * substreams (next pointer and link group), and open-file bookkeeping.
 */
struct snd_pcm_substream {
        struct snd_pcm *pcm;
        struct snd_pcm_str *pstr;
        void *private_data;                /* copied from pcm->private_data */
        int number;
        char name[32];                        /* substream name */
        int stream;                        /* stream (direction) */
        struct pm_qos_request latency_pm_qos_req; /* pm_qos request */
        size_t buffer_bytes_max;        /* limit ring buffer size */
        struct snd_dma_buffer dma_buffer;
        size_t dma_max;
        /* -- hardware operations -- */
        const struct snd_pcm_ops *ops;
        /* -- runtime information -- */
        struct snd_pcm_runtime *runtime;
        /* -- timer section -- */
        struct snd_timer *timer;                /* timer */
        unsigned timer_running: 1;        /* time is running */
        long wait_time;        /* time in ms for R/W to wait for avail */
        /* -- next substream -- */
        struct snd_pcm_substream *next;
        /* -- linked substreams -- */
        struct list_head link_list;        /* linked list member */
        struct snd_pcm_group self_group;        /* fake group for non linked substream (with substream lock inside) */
        struct snd_pcm_group *group;                /* pointer to current group */
        /* -- assigned files -- */
        int ref_count;                        /* tested by SUBSTREAM_BUSY() */
        atomic_t mmap_count;
        unsigned int f_flags;
        void (*pcm_release)(struct snd_pcm_substream *);
        struct pid *pid;
#if IS_ENABLED(CONFIG_SND_PCM_OSS)
        /* -- OSS things -- */
        struct snd_pcm_oss_substream oss;
#endif
#ifdef CONFIG_SND_VERBOSE_PROCFS
        struct snd_info_entry *proc_root;
#endif /* CONFIG_SND_VERBOSE_PROCFS */
        /* misc flags */
        unsigned int hw_opened: 1;
        unsigned int managed_buffer_alloc:1;
};

/* True while the substream's ref_count is positive. */
#define SUBSTREAM_BUSY(substream) ((substream)->ref_count > 0)


/*
 * One stream (direction) of a PCM device: substream counters and a pointer
 * to its substreams, plus optional OSS and procfs data.
 * NOTE(review): substream presumably heads a list chained through
 * snd_pcm_substream.next - confirm in the PCM core.
 */
struct snd_pcm_str {
        int stream;                                /* stream (direction) */
        struct snd_pcm *pcm;
        /* -- substreams -- */
        unsigned int substream_count;
        unsigned int substream_opened;
        struct snd_pcm_substream *substream;
#if IS_ENABLED(CONFIG_SND_PCM_OSS)
        /* -- OSS things -- */
        struct snd_pcm_oss_stream oss;
#endif
#ifdef CONFIG_SND_VERBOSE_PROCFS
        struct snd_info_entry *proc_root;
#ifdef CONFIG_SND_PCM_XRUN_DEBUG
        unsigned int xrun_debug;        /* 0 = disabled, 1 = verbose, 2 = stacktrace */
#endif
#endif
        struct snd_kcontrol *chmap_kctl; /* channel-mapping controls */
        struct device *dev;
};

/*
 * A PCM device instance on a sound card.  Holds two streams (streams[2];
 * NOTE(review): presumably indexed by SNDRV_PCM_STREAM_* direction),
 * identification strings, open synchronization and driver-private data.
 */
struct snd_pcm {
        struct snd_card *card;
        struct list_head list;
        int device; /* device number */
        unsigned int info_flags;
        unsigned short dev_class;
        unsigned short dev_subclass;
        char id[64];
        char name[80];
        struct snd_pcm_str streams[2];
        struct mutex open_mutex;
        wait_queue_head_t open_wait;
        void *private_data;
        void (*private_free) (struct snd_pcm *pcm);
        bool internal; /* pcm is for internal use only */
        bool nonatomic; /* whole PCM operations are in non-atomic context */
        bool no_device_suspend; /* don't invoke device PM suspend */
#if IS_ENABLED(CONFIG_SND_PCM_OSS)
        struct snd_pcm_oss oss;
#endif
};

/*
 *  Registering
 */

/* File operations for PCM devices; two entries (presumably one per stream direction - TODO confirm). */
extern const struct file_operations snd_pcm_f_ops[2];

/*
 * PCM creation entry points (implemented elsewhere).  snd_pcm_new() and
 * snd_pcm_new_internal() take identical arguments and return the new
 * instance through @rpcm.
 */
int snd_pcm_new(struct snd_card *card, const char *id, int device,
                int playback_count, int capture_count,
                struct snd_pcm **rpcm);
int snd_pcm_new_internal(struct snd_card *card, const char *id, int device,
                int playback_count, int capture_count,
                struct snd_pcm **rpcm);
int snd_pcm_new_stream(struct snd_pcm *pcm, int stream, int substream_count);

#if IS_ENABLED(CONFIG_SND_PCM_OSS)
/*
 * Notification callbacks for PCM register/disconnect/unregister events;
 * only available when the OSS PCM emulation is enabled.
 */
struct snd_pcm_notify {
        int (*n_register) (struct snd_pcm * pcm);
        int (*n_disconnect) (struct snd_pcm * pcm);
        int (*n_unregister) (struct snd_pcm * pcm);
        struct list_head list;
};
/* NOTE(review): @nfree presumably selects unregistering the notifier - confirm in the implementation. */
int snd_pcm_notify(struct snd_pcm_notify *notify, int nfree);
#endif

/*
 *  Native I/O
 */

/* Native PCM entry points; all are implemented elsewhere. */
int snd_pcm_info(struct snd_pcm_substream *substream, struct snd_pcm_info *info);
int snd_pcm_info_user(struct snd_pcm_substream *substream,
                      struct snd_pcm_info __user *info);
int snd_pcm_status64(struct snd_pcm_substream *substream,
                     struct snd_pcm_status64 *status);
int snd_pcm_start(struct snd_pcm_substream *substream);
int snd_pcm_stop(struct snd_pcm_substream *substream, snd_pcm_state_t status);
int snd_pcm_drain_done(struct snd_pcm_substream *substream);
int snd_pcm_stop_xrun(struct snd_pcm_substream *substream);
#ifdef CONFIG_PM
int snd_pcm_suspend_all(struct snd_pcm *pcm);
#else
/* Without CONFIG_PM there is nothing to suspend: report success. */
static inline int snd_pcm_suspend_all(struct snd_pcm *pcm)
{
        return 0;
}
#endif
int snd_pcm_kernel_ioctl(struct snd_pcm_substream *substream, unsigned int cmd, void *arg);
int snd_pcm_open_substream(struct snd_pcm *pcm, int stream, struct file *file,
                           struct snd_pcm_substream **rsubstream);
void snd_pcm_release_substream(struct snd_pcm_substream *substream);
int snd_pcm_attach_substream(struct snd_pcm *pcm, int stream, struct file *file,
                             struct snd_pcm_substream **rsubstream);
void snd_pcm_detach_substream(struct snd_pcm_substream *substream);
int snd_pcm_mmap_data(struct snd_pcm_substream *substream, struct file *file, struct vm_area_struct *area);


#ifdef CONFIG_SND_DEBUG
/* With CONFIG_SND_DEBUG the function is only declared here. */
void snd_pcm_debug_name(struct snd_pcm_substream *substream,
                           char *name, size_t len);
#else
/*
 * Stub used without CONFIG_SND_DEBUG: stores an empty string in @buf.
 * @substream and @size are not used.
 */
static inline void snd_pcm_debug_name(struct snd_pcm_substream *substream,
                                      char *buf, size_t size)
{
        buf[0] = '\0';
}
#endif

/*
 *  PCM library
 */

/**
 * snd_pcm_stream_linked - Check whether the substream is linked with others
 * @substream: substream to check
 *
 * A substream counts as linked when its current group is not its own
 * embedded self_group.
 *
 * Return: true if the given substream is being linked with others
 */
static inline int snd_pcm_stream_linked(struct snd_pcm_substream *substream)
{
        const struct snd_pcm_group *own_group = &substream->self_group;

        return substream->group != own_group;
}

/*
 * PCM stream lock primitives (implemented elsewhere).  The two
 * underscore-prefixed functions return the saved IRQ flags and are wrapped
 * by the snd_pcm_stream_lock_irqsave*() macros in this header.
 */
void snd_pcm_stream_lock(struct snd_pcm_substream *substream);
void snd_pcm_stream_unlock(struct snd_pcm_substream *substream);
void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream);
void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream);
unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream);
unsigned long _snd_pcm_stream_lock_irqsave_nested(struct snd_pcm_substream *substream);

/**
 * snd_pcm_stream_lock_irqsave - Lock the PCM stream
 * @substream: PCM substream
 * @flags: irq flags
 *
 * This locks the PCM stream like snd_pcm_stream_lock() but with the local
 * IRQ (only when nonatomic is false).  In nonatomic case, this is identical
 * to snd_pcm_stream_lock().
 *
 * @flags must be an unsigned long lvalue; this is enforced at compile time
 * by typecheck().
 */
#define snd_pcm_stream_lock_irqsave(substream, flags)                 \
        do {                                                         \
                typecheck(unsigned long, flags);                 \
                flags = _snd_pcm_stream_lock_irqsave(substream); \
        } while (0)
/* Counterpart of snd_pcm_stream_lock_irqsave(); takes the flags saved there. */
void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
                                      unsigned long flags);

/**
 * snd_pcm_stream_lock_irqsave_nested - Single-nested PCM stream locking
 * @substream: PCM substream
 * @flags: irq flags
 *
 * This locks the PCM stream like snd_pcm_stream_lock_irqsave() but with
 * the single-depth lockdep subclass.
 *
 * @flags must be an unsigned long lvalue; this is enforced at compile time
 * by typecheck().
 */
#define snd_pcm_stream_lock_irqsave_nested(substream, flags)                \
        do {                                                                \
                typecheck(unsigned long, flags);                        \
                flags = _snd_pcm_stream_lock_irqsave_nested(substream); \
        } while (0)

/*
 * definitions for guard(); use like guard(pcm_stream_lock)
 *
 * Three guard classes are defined, one per lock/unlock pair:
 * pcm_stream_lock, pcm_stream_lock_irq and pcm_stream_lock_irqsave.
 * The irqsave variant declares an extra 'unsigned long flags' member that
 * carries the saved IRQ flags from lock to unlock.
 */
DEFINE_LOCK_GUARD_1(pcm_stream_lock, struct snd_pcm_substream,
                    snd_pcm_stream_lock(_T->lock),
                    snd_pcm_stream_unlock(_T->lock))
DEFINE_LOCK_GUARD_1(pcm_stream_lock_irq, struct snd_pcm_substream,
                    snd_pcm_stream_lock_irq(_T->lock),
                    snd_pcm_stream_unlock_irq(_T->lock))
DEFINE_LOCK_GUARD_1(pcm_stream_lock_irqsave, struct snd_pcm_substream,
                    snd_pcm_stream_lock_irqsave(_T->lock, _T->flags),
                    snd_pcm_stream_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

/**
 * snd_pcm_group_for_each_entry - iterate over the linked substreams
 * @s: the iterator
 * @substream: the substream
 *
 * Iterate over all substreams linked to the given @substream.
 * When @substream isn't linked with any others, this yields @substream
 * itself once.
 *
 * @substream is parenthesized in the expansion so that any pointer-valued
 * expression can be passed, not only a plain identifier.
 */
#define snd_pcm_group_for_each_entry(s, substream) \
        list_for_each_entry(s, &(substream)->group->substreams, link_list)

/*
 * for_each_pcm_streams - iterate @stream over all PCM stream directions
 * @stream: integer lvalue used as the loop variable
 *
 * Runs @stream from SNDRV_PCM_STREAM_PLAYBACK up to and including
 * SNDRV_PCM_STREAM_LAST.  @stream is parenthesized so that an lvalue
 * expression such as *ptr is incremented itself (a bare "*ptr++" would
 * advance the pointer instead).
 */
#define for_each_pcm_streams(stream)                        \
        for ((stream) = SNDRV_PCM_STREAM_PLAYBACK;        \
             (stream) <= SNDRV_PCM_STREAM_LAST;                \
             (stream)++)

/**
 * snd_pcm_running - Check whether the substream is in a running state
 * @substream: substream to check
 *
 * Return: true if the given substream is in the state RUNNING, or in the
 * state DRAINING for playback.
 */
static inline int snd_pcm_running(struct snd_pcm_substream *substream)
{
        return (substream->runtime->state == SNDRV_PCM_STATE_RUNNING ||
                (substream->runtime->state == SNDRV_PCM_STATE_DRAINING &&
                 substream->stream == SNDRV_PCM_STREAM_PLAYBACK));
}

/**
 * __snd_pcm_set_state - Change the current PCM state
 * @runtime: PCM runtime to set
 * @state: the current state to set
 *
 * Stores @state both in the runtime and in the mmap status record.
 *
 * Call within the stream lock
 */
static inline void __snd_pcm_set_state(struct snd_pcm_runtime *runtime,
                                       snd_pcm_state_t state)
{
        struct snd_pcm_mmap_status *mmap_status = runtime->status;

        runtime->state = state;
        mmap_status->state = state;        /* mirror into the mmap status copy */
}

/**
 * bytes_to_samples - Convert a size in bytes into a number of samples
 * @runtime: PCM runtime instance
 * @size: size in bytes
 *
 * Computed as @size * 8 / sample_bits.
 *
 * Return: the size in samples
 */
static inline ssize_t bytes_to_samples(struct snd_pcm_runtime *runtime, ssize_t size)
{
        const unsigned int bits_per_sample = runtime->sample_bits;

        return size * 8 / bits_per_sample;
}

/**
 * bytes_to_frames - Convert a size in bytes into a number of frames
 * @runtime: PCM runtime instance
 * @size: size in bytes
 *
 * Computed as @size * 8 / frame_bits.
 *
 * Return: the size in frames
 */
static inline snd_pcm_sframes_t bytes_to_frames(struct snd_pcm_runtime *runtime, ssize_t size)
{
        const unsigned int bits_per_frame = runtime->frame_bits;

        return size * 8 / bits_per_frame;
}

/**
 * samples_to_bytes - Convert a number of samples into a size in bytes
 * @runtime: PCM runtime instance
 * @size: size in samples
 *
 * Computed as @size * sample_bits / 8.
 *
 * Return: the byte size
 */
static inline ssize_t samples_to_bytes(struct snd_pcm_runtime *runtime, ssize_t size)
{
        const unsigned int bits_per_sample = runtime->sample_bits;

        return size * bits_per_sample / 8;
}

/**
 * frames_to_bytes - Convert a number of frames into a size in bytes
 * @runtime: PCM runtime instance
 * @size: size in frames
 *
 * Computed as @size * frame_bits / 8.
 *
 * Return: the byte size
 */
static inline ssize_t frames_to_bytes(struct snd_pcm_runtime *runtime, snd_pcm_sframes_t size)
{
        const unsigned int bits_per_frame = runtime->frame_bits;

        return size * bits_per_frame / 8;
}

/**
 * frame_aligned - Check whether the byte size is aligned to frames
 * @runtime: PCM runtime instance
 * @bytes: size in bytes
 *
 * The check is @bytes modulo the runtime's byte_align.
 *
 * Return: true if aligned, or false if not
 */
static inline int frame_aligned(struct snd_pcm_runtime *runtime, ssize_t bytes)
{
        const size_t align = runtime->byte_align;

        return !(bytes % align);
}

/**
 * snd_pcm_lib_buffer_bytes - Get the buffer size of the current PCM in bytes
 * @substream: PCM substream
 *
 * Converts the runtime's buffer_size (in frames) with frames_to_bytes().
 *
 * Return: buffer byte size
 */
static inline size_t snd_pcm_lib_buffer_bytes(struct snd_pcm_substream *substream)
{
        return frames_to_bytes(substream->runtime,
                               substream->runtime->buffer_size);
}

/**
 * snd_pcm_lib_period_bytes - Period size of the current PCM, expressed in bytes
 * @substream: PCM substream
 *
 * Converts the runtime's period_size via frames_to_bytes().
 *
 * Return: period byte size
 */
static inline size_t snd_pcm_lib_period_bytes(struct snd_pcm_substream *substream)
{
        return frames_to_bytes(substream->runtime,
                               substream->runtime->period_size);
}

/**
 * snd_pcm_playback_avail - Number of frames that can be written for playback
 * @runtime: PCM runtime instance
 *
 * Computed as hw_ptr + buffer_size - appl_ptr and then wrapped by the
 * runtime boundary, so the result is between 0 ... (boundary - 1).
 *
 * Return: available frame size
 */
static inline snd_pcm_uframes_t snd_pcm_playback_avail(struct snd_pcm_runtime *runtime)
{
        snd_pcm_sframes_t avail;

        avail = runtime->status->hw_ptr + runtime->buffer_size -
                runtime->control->appl_ptr;

        /* fold the value back into the boundary range on either side */
        if (avail < 0)
                avail += runtime->boundary;
        else if ((snd_pcm_uframes_t)avail >= runtime->boundary)
                avail -= runtime->boundary;

        return avail;
}

/**
 * snd_pcm_capture_avail - Number of frames that can be read for capture
 * @runtime: PCM runtime instance
 *
 * Computed as hw_ptr - appl_ptr; a negative difference is wrapped by adding
 * the runtime boundary, so the result is between 0 ... (boundary - 1).
 *
 * Return: available frame size
 */
static inline snd_pcm_uframes_t snd_pcm_capture_avail(struct snd_pcm_runtime *runtime)
{
        snd_pcm_sframes_t avail;

        avail = runtime->status->hw_ptr - runtime->control->appl_ptr;

        /* a negative difference wraps around the boundary */
        if (avail < 0)
                avail += runtime->boundary;

        return avail;
}

/**
 * snd_pcm_playback_hw_avail - Get the queued space for playback
 * @runtime: PCM runtime instance
 *
 * Return: available frame size
 */
static inline snd_pcm_sframes_t snd_pcm_playback_hw_avail(struct snd_pcm_runtime *runtime)
{
        return runtime->buffer_size - snd_pcm_playback_avail(runtime);
}

/**
 * snd_pcm_capture_hw_avail - Get the free space for capture
 * @runtime: PCM runtime instance
 *
 * Return: available frame size
 */
static inline snd_pcm_sframes_t snd_pcm_capture_hw_avail(struct snd_pcm_runtime *runtime)
{
        return runtime->buffer_size - snd_pcm_capture_avail(runtime);
}

/**
 * snd_pcm_playback_ready - check whether the playback buffer is available
 * @substream: the pcm substream instance
 *
 * Checks whether enough free space is available on the playback buffer.
 *
 * Return: Non-zero if available, or zero if not.
 */
static inline int snd_pcm_playback_ready(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        return snd_pcm_playback_avail(runtime) >= runtime->control->avail_min;
}

/**
 * snd_pcm_capture_ready - check whether the capture buffer is available
 * @substream: the pcm substream instance
 *
 * Checks whether enough capture data is available on the capture buffer.
 *
 * Return: Non-zero if available, or zero if not.
 */
static inline int snd_pcm_capture_ready(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        return snd_pcm_capture_avail(runtime) >= runtime->control->avail_min;
}

/**
 * snd_pcm_playback_data - Test whether the playback buffer holds any data
 * @substream: the pcm substream instance
 *
 * Data exists when the writable space is smaller than the buffer size.
 *
 * Return: Non-zero if any data exists, or zero if not. If stop_threshold
 * is bigger or equal to boundary, then this function returns always non-zero.
 */
static inline int snd_pcm_playback_data(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        /* the threshold test short-circuits the buffer check */
        return rt->stop_threshold >= rt->boundary ||
               snd_pcm_playback_avail(rt) < rt->buffer_size;
}

/**
 * snd_pcm_playback_empty - check whether the playback buffer is empty
 * @substream: the pcm substream instance
 *
 * Checks whether the playback buffer is empty.
 *
 * Return: Non-zero if empty, or zero if not.
 */
static inline int snd_pcm_playback_empty(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        return snd_pcm_playback_avail(runtime) >= runtime->buffer_size;
}

/**
 * snd_pcm_capture_empty - Test whether the capture buffer is empty
 * @substream: the pcm substream instance
 *
 * The buffer counts as empty when snd_pcm_capture_avail() reports zero frames.
 *
 * Return: Non-zero if empty, or zero if not.
 */
static inline int snd_pcm_capture_empty(struct snd_pcm_substream *substream)
{
        return !snd_pcm_capture_avail(substream->runtime);
}

/**
 * snd_pcm_trigger_done - Mark the master substream
 * @substream: the pcm substream instance
 * @master: the linked master substream
 *
 * When multiple substreams of the same card are linked and the hardware
 * supports the single-shot operation, the driver calls this in the loop
 * in snd_pcm_group_for_each_entry() for marking the substream as "done".
 * Then most of trigger operations are performed only to the given master
 * substream.
 *
 * The trigger_master mark is cleared at timestamp updates at the end
 * of trigger operations.
 */
static inline void snd_pcm_trigger_done(struct snd_pcm_substream *substream, 
                                        struct snd_pcm_substream *master)
{
        substream->runtime->trigger_master = master;
}

/*
 * Non-zero when @var lies within
 * [SNDRV_PCM_HW_PARAM_FIRST_MASK, SNDRV_PCM_HW_PARAM_LAST_MASK].
 */
static inline int hw_is_mask(int var)
{
        if (var < SNDRV_PCM_HW_PARAM_FIRST_MASK)
                return 0;
        return var <= SNDRV_PCM_HW_PARAM_LAST_MASK;
}

/*
 * Non-zero when @var lies within
 * [SNDRV_PCM_HW_PARAM_FIRST_INTERVAL, SNDRV_PCM_HW_PARAM_LAST_INTERVAL].
 */
static inline int hw_is_interval(int var)
{
        if (var < SNDRV_PCM_HW_PARAM_FIRST_INTERVAL)
                return 0;
        return var <= SNDRV_PCM_HW_PARAM_LAST_INTERVAL;
}

/*
 * Address of the mask slot for @var inside @params; the slot index is @var
 * rebased to SNDRV_PCM_HW_PARAM_FIRST_MASK.  No range check is done here.
 */
static inline struct snd_mask *
hw_param_mask(struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var)
{
        return params->masks + (var - SNDRV_PCM_HW_PARAM_FIRST_MASK);
}

/*
 * Address of the interval slot for @var inside @params; the slot index is
 * @var rebased to SNDRV_PCM_HW_PARAM_FIRST_INTERVAL.  No range check is done
 * here.
 */
static inline struct snd_interval *
hw_param_interval(struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var)
{
        return params->intervals + (var - SNDRV_PCM_HW_PARAM_FIRST_INTERVAL);
}

/*
 * Const-qualified counterpart of hw_param_mask(): address of the mask slot
 * for @var inside a read-only @params.
 */
static inline const struct snd_mask *
hw_param_mask_c(const struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var)
{
        return params->masks + (var - SNDRV_PCM_HW_PARAM_FIRST_MASK);
}

/*
 * Const-qualified counterpart of hw_param_interval(): address of the interval
 * slot for @var inside a read-only @params.
 */
static inline const struct snd_interval *
hw_param_interval_c(const struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var)
{
        return params->intervals + (var - SNDRV_PCM_HW_PARAM_FIRST_INTERVAL);
}

/**
 * params_channels - Get the number of channels from the hw params
 * @p: hw params
 *
 * Return: the number of channels
 */
static inline unsigned int params_channels(const struct snd_pcm_hw_params *p)
{
        return hw_param_interval_c(p, SNDRV_PCM_HW_PARAM_CHANNELS)->min;
}

/**
 * params_rate - Get the sample rate from the hw params
 * @p: hw params
 *
 * Return: the sample rate
 */
static inline unsigned int params_rate(const struct snd_pcm_hw_params *p)
{
        return hw_param_interval_c(p, SNDRV_PCM_HW_PARAM_RATE)->min;
}

/**
 * params_period_size - Get the period size (in frames) from the hw params
 * @p: hw params
 *
 * Return: the period size in frames
 */
static inline unsigned int params_period_size(const struct snd_pcm_hw_params *p)
{
        return hw_param_interval_c(p, SNDRV_PCM_HW_PARAM_PERIOD_SIZE)->min;
}

/**
 * params_periods - Get the number of periods from the hw params
 * @p: hw params
 *
 * Return: the number of periods
 */
static inline unsigned int params_periods(const struct snd_pcm_hw_params *p)
{
        return hw_param_interval_c(p, SNDRV_PCM_HW_PARAM_PERIODS)->min;
}

/**
 * params_buffer_size - Get the buffer size (in frames) from the hw params
 * @p: hw params
 *
 * Return: the buffer size in frames
 */
static inline unsigned int params_buffer_size(const struct snd_pcm_hw_params *p)
{
        return hw_param_interval_c(p, SNDRV_PCM_HW_PARAM_BUFFER_SIZE)->min;
}

/**
 * params_buffer_bytes - Get the buffer size (in bytes) from the hw params
 * @p: hw params
 *
 * Return: the buffer size in bytes
 */
static inline unsigned int params_buffer_bytes(const struct snd_pcm_hw_params *p)
{
        return hw_param_interval_c(p, SNDRV_PCM_HW_PARAM_BUFFER_BYTES)->min;
}

/*
 * snd_interval helpers (prototypes only).  Each takes the interval @i to work
 * on as its first argument.
 * NOTE(review): the int results presumably follow the "positive if changed,
 * zero if unchanged, negative error code" convention documented for
 * snd_pcm_hw_constraint_single() -- confirm against the definitions.
 */
int snd_interval_refine(struct snd_interval *i, const struct snd_interval *v);
int snd_interval_list(struct snd_interval *i, unsigned int count,
                      const unsigned int *list, unsigned int mask);
int snd_interval_ranges(struct snd_interval *i, unsigned int count,
                        const struct snd_interval *list, unsigned int mask);
int snd_interval_ratnum(struct snd_interval *i,
                        unsigned int rats_count, const struct snd_ratnum *rats,
                        unsigned int *nump, unsigned int *denp);

/* whole-params helpers (prototypes only) */
void _snd_pcm_hw_params_any(struct snd_pcm_hw_params *params);
void _snd_pcm_hw_param_setempty(struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var);

/* NOTE(review): presumably refines @params for @substream -- confirm */
int snd_pcm_hw_refine(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params);

/*
 * hw_params constraint / rule installers (prototypes only).  All of them take
 * the PCM runtime to act on as the first argument; most also take @var, the
 * hw_params variable being constrained.
 * NOTE(review): @cond is presumably a flag mask deciding when the constraint
 * applies -- confirm against the definitions.
 */
int snd_pcm_hw_constraint_mask64(struct snd_pcm_runtime *runtime, snd_pcm_hw_param_t var,
                                 u_int64_t mask);
/* also used by the inline snd_pcm_hw_constraint_single() with min == max */
int snd_pcm_hw_constraint_minmax(struct snd_pcm_runtime *runtime, snd_pcm_hw_param_t var,
                                 unsigned int min, unsigned int max);
int snd_pcm_hw_constraint_integer(struct snd_pcm_runtime *runtime, snd_pcm_hw_param_t var);
int snd_pcm_hw_constraint_list(struct snd_pcm_runtime *runtime, 
                               unsigned int cond,
                               snd_pcm_hw_param_t var,
                               const struct snd_pcm_hw_constraint_list *l);
int snd_pcm_hw_constraint_ranges(struct snd_pcm_runtime *runtime,
                                 unsigned int cond,
                                 snd_pcm_hw_param_t var,
                                 const struct snd_pcm_hw_constraint_ranges *r);
int snd_pcm_hw_constraint_ratnums(struct snd_pcm_runtime *runtime, 
                                  unsigned int cond,
                                  snd_pcm_hw_param_t var,
                                  const struct snd_pcm_hw_constraint_ratnums *r);
int snd_pcm_hw_constraint_ratdens(struct snd_pcm_runtime *runtime, 
                                  unsigned int cond,
                                  snd_pcm_hw_param_t var,
                                  const struct snd_pcm_hw_constraint_ratdens *r);
int snd_pcm_hw_constraint_msbits(struct snd_pcm_runtime *runtime, 
                                 unsigned int cond,
                                 unsigned int width,
                                 unsigned int msbits);
int snd_pcm_hw_constraint_step(struct snd_pcm_runtime *runtime,
                               unsigned int cond,
                               snd_pcm_hw_param_t var,
                               unsigned long step);
int snd_pcm_hw_constraint_pow2(struct snd_pcm_runtime *runtime,
                               unsigned int cond,
                               snd_pcm_hw_param_t var);
int snd_pcm_hw_rule_noresample(struct snd_pcm_runtime *runtime,
                               unsigned int base_rate);
/* variadic: a list of further arguments follows @dep */
int snd_pcm_hw_rule_add(struct snd_pcm_runtime *runtime,
                        unsigned int cond,
                        int var,
                        snd_pcm_hw_rule_func_t func, void *private,
                        int dep, ...);

/**
 * snd_pcm_hw_constraint_single() - Pin a parameter to exactly one value
 * @runtime: PCM runtime instance
 * @var: The hw_params variable to constrain
 * @val: The value to constrain to
 *
 * Implemented as snd_pcm_hw_constraint_minmax() with @val as both bounds.
 *
 * Return: Positive if the value is changed, zero if it's not changed, or a
 * negative error code.
 */
static inline int
snd_pcm_hw_constraint_single(struct snd_pcm_runtime *runtime,
                             snd_pcm_hw_param_t var, unsigned int val)
{
        /* a range whose min and max coincide admits a single value */
        return snd_pcm_hw_constraint_minmax(runtime, var, val, val);
}

/* PCM format property queries (prototypes only) */
int snd_pcm_format_signed(snd_pcm_format_t format);
int snd_pcm_format_unsigned(snd_pcm_format_t format);
int snd_pcm_format_linear(snd_pcm_format_t format);
int snd_pcm_format_little_endian(snd_pcm_format_t format);
int snd_pcm_format_big_endian(snd_pcm_format_t format);
#if 0 /* just for kernel-doc */
/**
 * snd_pcm_format_cpu_endian - Check the PCM format is CPU-endian
 * @format: the format to check
 *
 * Return: 1 if the given PCM format is CPU-endian, 0 if
 * opposite, or a negative error code if endian not specified.
 */
int snd_pcm_format_cpu_endian(snd_pcm_format_t format);
#endif /* just for kernel-doc */
/*
 * The real snd_pcm_format_cpu_endian() is a macro: it aliases the little- or
 * the big-endian check depending on whether SNDRV_LITTLE_ENDIAN is defined.
 */
#ifdef SNDRV_LITTLE_ENDIAN
#define snd_pcm_format_cpu_endian(format) snd_pcm_format_little_endian(format)
#else
#define snd_pcm_format_cpu_endian(format) snd_pcm_format_big_endian(format)
#endif
int snd_pcm_format_width(snd_pcm_format_t format);                        /* in bits */
int snd_pcm_format_physical_width(snd_pcm_format_t format);                /* in bits */
ssize_t snd_pcm_format_size(snd_pcm_format_t format, size_t samples);
const unsigned char *snd_pcm_format_silence_64(snd_pcm_format_t format);
int snd_pcm_format_set_silence(snd_pcm_format_t format, void *buf, unsigned int frames);

/* PCM library entry points (prototypes only) */
void snd_pcm_set_ops(struct snd_pcm * pcm, int direction,
                     const struct snd_pcm_ops *ops);
void snd_pcm_set_sync(struct snd_pcm_substream *substream);
int snd_pcm_lib_ioctl(struct snd_pcm_substream *substream,
                      unsigned int cmd, void *arg);
/* NOTE(review): by its name this variant expects the stream lock to be held -- confirm */
void snd_pcm_period_elapsed_under_stream_lock(struct snd_pcm_substream *substream);
void snd_pcm_period_elapsed(struct snd_pcm_substream *substream);
/*
 * Common transfer routine behind the snd_pcm_lib_*() and snd_pcm_kernel_*()
 * inline wrappers, which differ only in the @interleaved and @in_kernel
 * flags they pass.
 */
snd_pcm_sframes_t __snd_pcm_lib_xfer(struct snd_pcm_substream *substream,
                                     void *buf, bool interleaved,
                                     snd_pcm_uframes_t frames, bool in_kernel);

/*
 * Write @frames from the user-space buffer @buf: calls __snd_pcm_lib_xfer()
 * with interleaved = true and in_kernel = false.
 */
static inline snd_pcm_sframes_t
snd_pcm_lib_write(struct snd_pcm_substream *substream,
                  const void __user *buf, snd_pcm_uframes_t frames)
{
        void *data = (void __force *)buf;

        return __snd_pcm_lib_xfer(substream, data, true, frames, false);
}

/*
 * Read @frames into the user-space buffer @buf: calls __snd_pcm_lib_xfer()
 * with interleaved = true and in_kernel = false.
 */
static inline snd_pcm_sframes_t
snd_pcm_lib_read(struct snd_pcm_substream *substream,
                 void __user *buf, snd_pcm_uframes_t frames)
{
        void *data = (void __force *)buf;

        return __snd_pcm_lib_xfer(substream, data, true, frames, false);
}

/*
 * Write @frames from the user-space buffer vector @bufs: calls
 * __snd_pcm_lib_xfer() with interleaved = false and in_kernel = false.
 */
static inline snd_pcm_sframes_t
snd_pcm_lib_writev(struct snd_pcm_substream *substream,
                   void __user **bufs, snd_pcm_uframes_t frames)
{
        void *vec = (void *)bufs;

        return __snd_pcm_lib_xfer(substream, vec, false, frames, false);
}

/*
 * Read @frames into the user-space buffer vector @bufs: calls
 * __snd_pcm_lib_xfer() with interleaved = false and in_kernel = false.
 */
static inline snd_pcm_sframes_t
snd_pcm_lib_readv(struct snd_pcm_substream *substream,
                  void __user **bufs, snd_pcm_uframes_t frames)
{
        void *vec = (void *)bufs;

        return __snd_pcm_lib_xfer(substream, vec, false, frames, false);
}

/*
 * Write @frames from the kernel buffer @buf: calls __snd_pcm_lib_xfer()
 * with interleaved = true and in_kernel = true.
 */
static inline snd_pcm_sframes_t
snd_pcm_kernel_write(struct snd_pcm_substream *substream,
                     const void *buf, snd_pcm_uframes_t frames)
{
        /* the transfer routine takes a non-const pointer */
        void *data = (void *)buf;

        return __snd_pcm_lib_xfer(substream, data, true, frames, true);
}

/*
 * Read @frames into the kernel buffer @buf: calls __snd_pcm_lib_xfer()
 * with interleaved = true and in_kernel = true.
 */
static inline snd_pcm_sframes_t
snd_pcm_kernel_read(struct snd_pcm_substream *substream,
                    void *buf, snd_pcm_uframes_t frames)
{
        const bool interleaved = true;
        const bool in_kernel = true;

        return __snd_pcm_lib_xfer(substream, buf, interleaved, frames, in_kernel);
}

/*
 * Write @frames from the kernel buffer vector @bufs: calls
 * __snd_pcm_lib_xfer() with interleaved = false and in_kernel = true.
 */
static inline snd_pcm_sframes_t
snd_pcm_kernel_writev(struct snd_pcm_substream *substream,
                      void **bufs, snd_pcm_uframes_t frames)
{
        const bool interleaved = false;
        const bool in_kernel = true;

        return __snd_pcm_lib_xfer(substream, bufs, interleaved, frames, in_kernel);
}

/*
 * Read @frames into the kernel buffer vector @bufs: calls
 * __snd_pcm_lib_xfer() with interleaved = false and in_kernel = true.
 */
static inline snd_pcm_sframes_t
snd_pcm_kernel_readv(struct snd_pcm_substream *substream,
                     void **bufs, snd_pcm_uframes_t frames)
{
        const bool interleaved = false;
        const bool in_kernel = true;

        return __snd_pcm_lib_xfer(substream, bufs, interleaved, frames, in_kernel);
}

int snd_pcm_hw_limit_rates(struct snd_pcm_hardware *hw);

/*
 * Convenience wrapper: applies snd_pcm_hw_limit_rates() to the hardware
 * description embedded in @runtime and passes its result through.
 */
static inline int snd_pcm_limit_hw_rates(struct snd_pcm_runtime *runtime)
{
        struct snd_pcm_hardware *hw = &runtime->hw;

        return snd_pcm_hw_limit_rates(hw);
}

/*
 * Sample-rate helpers (prototypes only).
 * NOTE(review): judging by the names these convert between rates and rate
 * bits/masks -- confirm the exact semantics against the definitions.
 */
unsigned int snd_pcm_rate_to_rate_bit(unsigned int rate);
unsigned int snd_pcm_rate_bit_to_rate(unsigned int rate_bit);
unsigned int snd_pcm_rate_mask_intersect(unsigned int rates_a,
                                         unsigned int rates_b);
unsigned int snd_pcm_rate_range_to_bits(unsigned int rate_min,
                                        unsigned int rate_max);

/**
 * snd_pcm_set_runtime_buffer - Attach or detach the PCM runtime buffer
 * @substream: PCM substream to set
 * @bufp: the buffer information, NULL to clear
 *
 * With a non-NULL @bufp, the runtime's dma_buffer_p is pointed at it and
 * dma_area, dma_addr and dma_bytes are copied from it.  With a NULL @bufp
 * those four runtime fields are reset to NULL / 0.
 */
static inline void snd_pcm_set_runtime_buffer(struct snd_pcm_substream *substream,
                                              struct snd_dma_buffer *bufp)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        if (!bufp) {
                /* no buffer: wipe the current buffer information */
                rt->dma_buffer_p = NULL;
                rt->dma_area = NULL;
                rt->dma_addr = 0;
                rt->dma_bytes = 0;
                return;
        }

        rt->dma_buffer_p = bufp;
        rt->dma_area = bufp->area;
        rt->dma_addr = bufp->addr;
        rt->dma_bytes = bufp->bytes;
}

/**
 * snd_pcm_gettime - Fetch the current time according to the timestamp mode
 * @runtime: PCM runtime instance
 * @tv: timespec64 to fill
 *
 * The runtime's tstamp_type selects the clock: MONOTONIC uses
 * ktime_get_ts64(), MONOTONIC_RAW uses ktime_get_raw_ts64(), and every other
 * value falls back to ktime_get_real_ts64().
 */
static inline void snd_pcm_gettime(struct snd_pcm_runtime *runtime,
                                   struct timespec64 *tv)
{
        if (runtime->tstamp_type == SNDRV_PCM_TSTAMP_TYPE_MONOTONIC)
                ktime_get_ts64(tv);
        else if (runtime->tstamp_type == SNDRV_PCM_TSTAMP_TYPE_MONOTONIC_RAW)
                ktime_get_raw_ts64(tv);
        else
                ktime_get_real_ts64(tv);
}

/*
 *  Memory
 */

/* buffer pre-allocation helpers (prototypes only) */
void snd_pcm_lib_preallocate_free(struct snd_pcm_substream *substream);
void snd_pcm_lib_preallocate_free_for_all(struct snd_pcm *pcm);
void snd_pcm_lib_preallocate_pages(struct snd_pcm_substream *substream,
                                  int type, struct device *data,
                                  size_t size, size_t max);
/*
 * NOTE(review): @data is a void * here while the sibling prototypes take
 * struct device * -- confirm whether the difference is intentional.
 */
void snd_pcm_lib_preallocate_pages_for_all(struct snd_pcm *pcm,
                                          int type, void *data,
                                          size_t size, size_t max);
int snd_pcm_lib_malloc_pages(struct snd_pcm_substream *substream, size_t size);
int snd_pcm_lib_free_pages(struct snd_pcm_substream *substream);

/*
 * Managed-buffer setup; the inline snd_pcm_set_fixed_buffer*() helpers call
 * these with @max set to 0.
 */
int snd_pcm_set_managed_buffer(struct snd_pcm_substream *substream, int type,
                               struct device *data, size_t size, size_t max);
int snd_pcm_set_managed_buffer_all(struct snd_pcm *pcm, int type,
                                   struct device *data,
                                   size_t size, size_t max);

/**
 * snd_pcm_set_fixed_buffer - Preallocate and set up the fixed size PCM buffer
 * @substream: the pcm substream instance
 * @type: DMA type (SNDRV_DMA_TYPE_*)
 * @data: DMA type dependent data
 * @size: the requested pre-allocation size in bytes
 *
 * A variant of snd_pcm_set_managed_buffer() that passes 0 as the maximum
 * size: only the buffer of the given size is pre-allocated, and neither
 * re-allocation nor dynamic allocation of a larger buffer is allowed, unlike
 * the standard one.
 * The function may return -ENOMEM error, hence the caller must check it.
 *
 * Return: zero if successful, or a negative error code
 */
static inline int __must_check
snd_pcm_set_fixed_buffer(struct snd_pcm_substream *substream, int type,
                         struct device *data, size_t size)
{
        const size_t max = 0;   /* fixed size: no larger buffer later on */

        return snd_pcm_set_managed_buffer(substream, type, data, size, max);
}

/**
 * snd_pcm_set_fixed_buffer_all - Preallocate and set up the fixed size PCM buffer
 * @pcm: the pcm instance
 * @type: DMA type (SNDRV_DMA_TYPE_*)
 * @data: DMA type dependent data
 * @size: the requested pre-allocation size in bytes
 *
 * The all-substreams counterpart of snd_pcm_set_fixed_buffer(): calls
 * snd_pcm_set_managed_buffer_all() with 0 as the maximum size.  If any of
 * allocation fails, it returns -ENOMEM, hence the caller must check the
 * return value.
 *
 * Return: zero if successful, or a negative error code
 */
static inline int __must_check
snd_pcm_set_fixed_buffer_all(struct snd_pcm *pcm, int type,
                             struct device *data, size_t size)
{
        const size_t max = 0;   /* fixed size: no larger buffer later on */

        return snd_pcm_set_managed_buffer_all(pcm, type, data, size, max);
}

/*
 * vmalloc-backed buffer helpers (prototypes only).  The underscore-prefixed
 * allocator takes explicit GFP flags; the inline wrappers
 * snd_pcm_lib_alloc_vmalloc_buffer() and
 * snd_pcm_lib_alloc_vmalloc_32_buffer() supply them.
 */
int _snd_pcm_lib_alloc_vmalloc_buffer(struct snd_pcm_substream *substream,
                                      size_t size, gfp_t gfp_flags);
int snd_pcm_lib_free_vmalloc_buffer(struct snd_pcm_substream *substream);
struct page *snd_pcm_lib_get_vmalloc_page(struct snd_pcm_substream *substream,
                                          unsigned long offset);
/**
 * snd_pcm_lib_alloc_vmalloc_buffer - allocate virtual DMA buffer
 * @substream: the substream to allocate the buffer to
 * @size: the requested buffer size, in bytes
 *
 * Allocates the PCM substream buffer using vmalloc(), i.e., the memory is
 * contiguous in kernel virtual space, but not in physical memory.  Use this
 * if the buffer is accessed by kernel code but not by device DMA.
 * The request is made with GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO.
 *
 * Return: 1 if the buffer was changed, 0 if not changed, or a negative error
 * code.
 */
static inline int
snd_pcm_lib_alloc_vmalloc_buffer(struct snd_pcm_substream *substream,
                                 size_t size)
{
        const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO;

        return _snd_pcm_lib_alloc_vmalloc_buffer(substream, size, gfp);
}

/**
 * snd_pcm_lib_alloc_vmalloc_32_buffer - allocate 32-bit-addressable buffer
 * @substream: the substream to allocate the buffer to
 * @size: the requested buffer size, in bytes
 *
 * This function works like snd_pcm_lib_alloc_vmalloc_buffer(), but uses
 * vmalloc_32(), i.e., the pages are allocated from 32-bit-addressable memory.
 * The request is made with GFP_KERNEL | GFP_DMA32 | __GFP_ZERO.
 *
 * Return: 1 if the buffer was changed, 0 if not changed, or a negative error
 * code.
 */
static inline int
snd_pcm_lib_alloc_vmalloc_32_buffer(struct snd_pcm_substream *substream,
                                    size_t size)
{
        const gfp_t gfp = GFP_KERNEL | GFP_DMA32 | __GFP_ZERO;

        return _snd_pcm_lib_alloc_vmalloc_buffer(substream, size, gfp);
}

/* the dma_buffer_p pointer stored in the substream's runtime (may be NULL) */
#define snd_pcm_get_dma_buf(substream) ((substream)->runtime->dma_buffer_p)

/**
 * snd_pcm_sgbuf_get_addr - Look up the DMA address at a byte offset
 * @substream: PCM substream
 * @ofs: byte offset
 *
 * Forwards to snd_sgbuf_get_addr() on the substream's current DMA buffer.
 *
 * Return: DMA address
 */
static inline dma_addr_t
snd_pcm_sgbuf_get_addr(struct snd_pcm_substream *substream, unsigned int ofs)
{
        struct snd_dma_buffer *dmab = snd_pcm_get_dma_buf(substream);

        return snd_sgbuf_get_addr(dmab, ofs);
}

/**
 * snd_pcm_sgbuf_get_chunk_size - Compute the max size that fits within the
 * contig. page from the given size
 * @substream: PCM substream
 * @ofs: byte offset
 * @size: byte size to examine
 *
 * Forwards to snd_sgbuf_get_chunk_size() on the substream's current DMA
 * buffer.
 *
 * Return: chunk size
 */
static inline unsigned int
snd_pcm_sgbuf_get_chunk_size(struct snd_pcm_substream *substream,
                             unsigned int ofs, unsigned int size)
{
        struct snd_dma_buffer *dmab = snd_pcm_get_dma_buf(substream);

        return snd_sgbuf_get_chunk_size(dmab, ofs, size);
}

/**
 * snd_pcm_mmap_data_open - increase the mmap counter
 * @area: VMA whose vm_private_data holds the PCM substream
 *
 * Atomically increments the substream's mmap_count.
 * PCM mmap callback should handle this counter properly
 */
static inline void snd_pcm_mmap_data_open(struct vm_area_struct *area)
{
        struct snd_pcm_substream *substream;

        substream = (struct snd_pcm_substream *)area->vm_private_data;
        atomic_inc(&substream->mmap_count);
}

/**
 * snd_pcm_mmap_data_close - decrease the mmap counter
 * @area: VMA whose vm_private_data holds the PCM substream
 *
 * Atomically decrements the substream's mmap_count.
 * PCM mmap callback should handle this counter properly
 */
static inline void snd_pcm_mmap_data_close(struct vm_area_struct *area)
{
        struct snd_pcm_substream *substream;

        substream = (struct snd_pcm_substream *)area->vm_private_data;
        atomic_dec(&substream->mmap_count);
}

int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream,
                             struct vm_area_struct *area);
/* mmap for io-memory area */
/*
 * Only on X86, PPC and ALPHA configurations is the iomem mmap helper declared
 * and SNDRV_PCM_INFO_MMAP_IOMEM an alias of SNDRV_PCM_INFO_MMAP.  Elsewhere
 * the flag is 0 and the helper name expands to NULL.
 */
#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_ALPHA)
#define SNDRV_PCM_INFO_MMAP_IOMEM        SNDRV_PCM_INFO_MMAP
int snd_pcm_lib_mmap_iomem(struct snd_pcm_substream *substream, struct vm_area_struct *area);
#else
#define SNDRV_PCM_INFO_MMAP_IOMEM        0
#define snd_pcm_lib_mmap_iomem        NULL
#endif

/**
 * snd_pcm_limit_isa_dma_size - Get the max size fitting with ISA DMA transfer
 * @dma: DMA number
 * @max: pointer to store the max size
 *
 * Stores 64 KiB for DMA numbers below 4 and 128 KiB for all others.
 */
static inline void snd_pcm_limit_isa_dma_size(int dma, size_t *max)
{
        if (dma < 4)
                *max = 64 * 1024;
        else
                *max = 128 * 1024;
}

/*
 *  Misc
 */

/*
 * OR of IEC958 consumer-mode constants: the AES0 value is used as is, the two
 * AES1 values are shifted left by 8 bits and the AES3 value by 24 bits.
 */
#define SNDRV_PCM_DEFAULT_CON_SPDIF        (IEC958_AES0_CON_EMPHASIS_NONE|\
                                         (IEC958_AES1_CON_ORIGINAL<<8)|\
                                         (IEC958_AES1_CON_PCM_CODER<<8)|\
                                         (IEC958_AES3_CON_FS_48000<<24))

/* NOTE(review): presumably returns a printable name for @format -- confirm */
const char *snd_pcm_format_name(snd_pcm_format_t format);

/**
 * snd_pcm_direction_name - Get a string naming the direction of a stream
 * @direction: Stream's direction, one of SNDRV_PCM_STREAM_XXX
 *
 * Return: "Playback" for SNDRV_PCM_STREAM_PLAYBACK, "Capture" for any other
 * value.
 */
static inline const char *snd_pcm_direction_name(int direction)
{
        return direction == SNDRV_PCM_STREAM_PLAYBACK ? "Playback" : "Capture";
}

/**
 * snd_pcm_stream_str - Get a string naming the direction of a substream
 * @substream: the pcm substream instance
 *
 * Looks up the substream's stream field with snd_pcm_direction_name().
 *
 * Return: A string naming the direction of the stream.
 */
static inline const char *snd_pcm_stream_str(struct snd_pcm_substream *substream)
{
        int direction = substream->stream;

        return snd_pcm_direction_name(direction);
}

/*
 * PCM channel-mapping control API
 */
/* array element of channel maps (1 + 15 unsigned chars) */
struct snd_pcm_chmap_elem {
        /* NOTE(review): presumably the channel count this entry describes -- confirm */
        unsigned char channels;
        /* NOTE(review): presumably one position code per channel -- confirm */
        unsigned char map[15];
};

/* channel map information; retrieved via snd_kcontrol_chip() */
struct snd_pcm_chmap {
        struct snd_pcm *pcm;        /* assigned PCM instance */
        int stream;                /* PLAYBACK or CAPTURE */
        /* NOTE(review): presumably the control element exposing this map -- confirm */
        struct snd_kcontrol *kctl;
        /* table of struct snd_pcm_chmap_elem entries (read-only) */
        const struct snd_pcm_chmap_elem *chmap;
        /* NOTE(review): presumably the largest channel count handled -- confirm */
        unsigned int max_channels;
        unsigned int channel_mask;        /* optional: active channels bitmask */
        void *private_data;        /* optional: private data pointer */
};

/**
 * snd_pcm_chmap_substream - get the PCM substream assigned to the given chmap info
 * @info: chmap information
 * @idx: the substream number index
 *
 * Walks the substream list of @info's PCM for the direction given by
 * @info->stream and stops at the first entry whose number equals @idx.
 *
 * Return: the matched PCM substream, or NULL if not found
 */
static inline struct snd_pcm_substream *
snd_pcm_chmap_substream(struct snd_pcm_chmap *info, unsigned int idx)
{
        struct snd_pcm_substream *s;

        s = info->pcm->streams[info->stream].substream;
        /* advance until a match is found or the list runs out (s == NULL) */
        while (s && s->number != idx)
                s = s->next;

        return s;
}

/* ALSA-standard channel maps (RL/RR prior to C/LFE) */
extern const struct snd_pcm_chmap_elem snd_pcm_std_chmaps[];
/* Other world's standard channel maps (C/LFE prior to RL/RR) */
extern const struct snd_pcm_chmap_elem snd_pcm_alt_chmaps[];

/* bit masks to be passed to snd_pcm_chmap.channel_mask field */
/* each digit in the macro name has the bit of that number set (bits 2, 4, 6, 8) */
#define SND_PCM_CHMAP_MASK_24        ((1U << 2) | (1U << 4))
#define SND_PCM_CHMAP_MASK_246        (SND_PCM_CHMAP_MASK_24 | (1U << 6))
#define SND_PCM_CHMAP_MASK_2468        (SND_PCM_CHMAP_MASK_246 | (1U << 8))

/*
 * NOTE(review): presumably creates the channel-map controls for @pcm/@stream
 * and hands the resulting info back through @info_ret -- confirm against the
 * definition.
 */
int snd_pcm_add_chmap_ctls(struct snd_pcm *pcm, int stream,
                           const struct snd_pcm_chmap_elem *chmap,
                           int max_channels,
                           unsigned long private_value,
                           struct snd_pcm_chmap **info_ret);

/**
 * pcm_format_to_bits - Strong-typed conversion of pcm_format to bitwise
 * @pcm_format: PCM format
 *
 * The format value, forced to int, is used as the bit position.
 *
 * Return: 64bit mask corresponding to the given PCM format
 */
static inline u64 pcm_format_to_bits(snd_pcm_format_t pcm_format)
{
        const int bit = (__force int)pcm_format;

        return 1ULL << bit;
}

/**
 * pcm_for_each_format - helper to iterate for each format type
 * @f: the iterator variable in snd_pcm_format_t type
 *
 * Expands to a for-loop header running @f from SNDRV_PCM_FORMAT_FIRST up to
 * and including SNDRV_PCM_FORMAT_LAST in steps of one.  The comparison and
 * the increment are done on (__force int) casts of the values.  @f is
 * evaluated several times, so it must be a plain variable.
 */
#define pcm_for_each_format(f)                                                \
        for ((f) = SNDRV_PCM_FORMAT_FIRST;                                \
             (__force int)(f) <= (__force int)SNDRV_PCM_FORMAT_LAST;        \
             (f) = (__force snd_pcm_format_t)((__force int)(f) + 1))

/* printk helpers */
/*
 * Each forwards to the matching dev_err()/dev_warn()/dev_dbg() call, using
 * the device of the card that @pcm belongs to ((pcm)->card->dev).
 */
#define pcm_err(pcm, fmt, args...) \
        dev_err((pcm)->card->dev, fmt, ##args)
#define pcm_warn(pcm, fmt, args...) \
        dev_warn((pcm)->card->dev, fmt, ##args)
#define pcm_dbg(pcm, fmt, args...) \
        dev_dbg((pcm)->card->dev, fmt, ##args)

/*
 * helpers for copying between iov_iter and iomem
 *
 * copy_to_iter_fromio() takes the iov_iter as destination and the iomem
 * pointer @src as source; copy_from_iter_toio() takes the iomem pointer @dst
 * as destination and the iov_iter as source.  @count is the size to copy.
 * NOTE(review): the int result is presumably zero or a negative error code --
 * confirm against the definitions.
 */
int copy_to_iter_fromio(struct iov_iter *iter, const void __iomem *src,
                        size_t count);
int copy_from_iter_toio(void __iomem *dst, struct iov_iter *iter, size_t count);

/*
 * PCM status record whose timestamps are split into s64 seconds/nanoseconds
 * pairs; it is the argument type of the two *64 ioctl numbers defined right
 * after the struct.
 */
struct snd_pcm_status64 {
        snd_pcm_state_t state;                /* stream state */
        /* NOTE(review): presumably padding so the following s64 is 8-byte aligned -- confirm */
        u8 rsvd[4];
        s64 trigger_tstamp_sec;                /* time when stream was started/stopped/paused */
        s64 trigger_tstamp_nsec;
        s64 tstamp_sec;                        /* reference timestamp */
        s64 tstamp_nsec;
        snd_pcm_uframes_t appl_ptr;        /* appl ptr */
        snd_pcm_uframes_t hw_ptr;        /* hw ptr */
        snd_pcm_sframes_t delay;        /* current delay in frames */
        snd_pcm_uframes_t avail;        /* number of frames available */
        snd_pcm_uframes_t avail_max;        /* max frames available on hw since last status */
        snd_pcm_uframes_t overrange;        /* count of ADC (capture) overrange detections from last status */
        snd_pcm_state_t suspended_state; /* suspended stream state */
        __u32 audio_tstamp_data;         /* needed for 64-bit alignment, used for configs/report to/from userspace */
        s64 audio_tstamp_sec;                /* sample counter, wall clock, PHC or on-demand sync'ed */
        s64 audio_tstamp_nsec;
        s64 driver_tstamp_sec;                /* useful in case reference system tstamp is reported with delay */
        s64 driver_tstamp_nsec;
        __u32 audio_tstamp_accuracy;        /* in ns units, only valid if indicated in audio_tstamp_data */
        /* 52 minus four s64-sized slots, i.e. 20 bytes when sizeof(s64) is 8 */
        unsigned char reserved[52-4*sizeof(s64)]; /* must be filled with zero */
};

/* ioctl numbers 'A' 0x20 (read) and 'A' 0x24 (read/write) with the 64-bit layout */
#define SNDRV_PCM_IOCTL_STATUS64        _IOR('A', 0x20, struct snd_pcm_status64)
#define SNDRV_PCM_IOCTL_STATUS_EXT64        _IOWR('A', 0x24, struct snd_pcm_status64)

struct snd_pcm_status32 {
        snd_pcm_state_t state;                /* stream state */
        s32 trigger_tstamp_sec;        /* time when stream was started/stopped/paused */
        s32 trigger_tstamp_nsec;
        s32 tstamp_sec;                /* reference timestamp */
        s32 tstamp_nsec;
        u32 appl_ptr;                /* appl ptr */
        u32 hw_ptr;                /* hw ptr */
        s32 delay;                /* current delay in frames */
        u32 avail;                /* number of frames available */
        u32 avail_max;                /* max frames available on hw since last status */
        u32 overrange;                /* count of ADC (capture) overrange detections from last status */
        snd_pcm_state_t suspended_state;        /* suspended stream state */
        u32 audio_tstamp_data;        /* needed for 64-bit alignment, used for configs/report to/from userspace */
        s32 audio_tstamp_sec;        /* sample counter, wall clock, PHC or on-demand sync'ed */
        s32 audio_tstamp_nsec;
        s32 driver_tstamp_sec;        /* useful in case reference system tstamp is reported with delay */
        s32 driver_tstamp_nsec;
        u32 audio_tstamp_accuracy;        /* in ns units, only valid if indicated in audio_tstamp_data */
        unsigned char reserved[52-4*sizeof(s32)]; /* must be filled with zero */
};

/*
 * ioctl requests whose argument is a struct snd_pcm_status32.
 * Both use ioctl type 'A': STATUS32 is command 0x20 built with _IOR,
 * STATUS_EXT32 is command 0x24 built with _IOWR.
 * NOTE(review): the type and command numbers match the 64-bit variants in
 * this header; presumably only the encoded argument size differs -- confirm
 * against the ioctl macro definitions.
 */
#define SNDRV_PCM_IOCTL_STATUS32        _IOR('A', 0x20, struct snd_pcm_status32)
#define SNDRV_PCM_IOCTL_STATUS_EXT32        _IOWR('A', 0x24, struct snd_pcm_status32)

#endif /* __SOUND_PCM_H */





















































































































































































    9 
    9 



























    1 

    1 
    1 


    1 



























    1 

    1 



    2 





























    2 
























    4 






























    2 

    2 




    3 




    2 











    2 
    2 





    4 









    4 






    2 
    2 










    4 















    4 






    2 





















    2 




    2 





























    2 



    2 
    2 








































































































    5 





    2 



    2 














































    5 
















































































































































































































































































































































































































    6 











    6 



    6 

















































































































































































































































































































































































    2 































































    2 












    2 


    2 


























    1 





    1 



    2 
    2 






    2 








    2 

































    2 


    1 




    2 



























































    2 





    2 



    2 







    2 
























    3 










    2 



    2 


    2 















    3 







    1 

    3 


    4 

















    4 









    3 




    4 





    4 

    4 







    4 









    3 
    1 

    4 






















    3 




























    4 









    4 



    4 


    4 

    4 
    4 


























































    1 









    1 


    1 



    1 















    1 










    1 
    1 





    1 

    1 



































    1 

    1 






























    1 


    1 








    1 
    1 










    1 



    1 
























    1 





    1 

    1 


    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    8 
    4 















    3 





    3 






    1 









    3 









    3 


    3 

    3 












    3 














    3 


    2 

















    1 






    2 











    2 


    2 












    2 

























    6 


    1 







    1 

































    1 

    1 

    1 
    1 







    1 





















    1 




    1 












    1 



    1 







    1 












    1 
    1 

    1 




























    1 





    1 

    1 

    1 





    1 





























    2 
    2 













    1 








































    2 
    2 








    1 



    1 









    1 












































































    2 






    2 
























    2 







    1 





    2 









    2 


























    1 




    1 
























































































    3 

























    3 











    3 


























































































    8 

    3 
































































































































































































































































































































































































































































    6 

    6 










































































    8 





    3 





    3 







    3 







    6 



    6 





    6 







    6 

    6 


    5 




    6 






















    6 






    8 





    8 














    2 



















    2 






    2 



    2 


















    2 




































































































    3 






    3 






    1 
    2 



    2 






    3 





























    2 




    2 











    2 





    2 


















    2 
    2 
    2 









    2 
















































































    1 





    1 



    1 





























    1 
    1 




































































    2 


    2 








    2 














    2 









































    1 



    2 


    3 

































































































































































































































































































   11 









    2 















    2 
   11 





   11 

    2 












    2 











































   11 












   11 



   11 









   11 
















   11 







   11 
    2 














   11 








































   11 
















    1 



















    1 



    2 

    9 



































    4 




    6 

























   11 






    9 



   11 










    8 
    3 






   11 



    9 
    3 






    3 



   11 










   10 
























































    9 


   10 



    9 






















































    4 



    4 


    1 

















    3 





    1 









































































































































































































































































































































































   46 





   45 















































































































































































































    2 





    2 




    1 





    2 
    2 




    2 
    2 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468

// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *                Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *                (Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/kmsan.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"
#include "swap.h"

/*
 * Emit a build-time warning when LAST_CPUPID_NOT_IN_PAGE_FLAGS is defined,
 * unless this is a CONFIG_COMPILE_TEST build.
 */
#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

/*
 * max_mapnr and mem_map are defined here, and exported to modules, only when
 * CONFIG_NUMA is not set.
 */
#ifndef CONFIG_NUMA
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/* Forward declarations of file-local (static) helpers. */
static vm_fault_t do_fault(struct vm_fault *vmf);
static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
static bool vmf_pte_changed(struct vm_fault *vmf);

/*
 * Tell whether the pte recorded in vmf->orig_pte is a uffd-wp pte marker,
 * meaning the faulting entry had been wr-protected.
 *
 * The answer is always false when userfaultfd_wp() is false for the vma or
 * when vmf->orig_pte has not been flagged valid (FAULT_FLAG_ORIG_PTE_VALID).
 */
static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
{
        /* Short-circuit order: orig_pte is only inspected once it is valid. */
        return userfaultfd_wp(vmf->vma) &&
               (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) &&
               pte_marker_uffd_wp(vmf->orig_pte);
}

/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);

/*
 * Address space randomization level (stacks, mmaps, brk, etc.).
 *
 * The default is 2. With CONFIG_COMPAT_BRK=y the default is 1 instead: brk
 * is then left out of the randomization, because ancient (libc5 based)
 * binaries can segfault.
 */
#ifdef CONFIG_COMPAT_BRK
int randomize_va_space __read_mostly = 1;
#else
int randomize_va_space __read_mostly = 2;
#endif

#ifndef arch_wants_old_prefaulted_pte
/*
 * Fallback used unless arch_wants_old_prefaulted_pte is already defined.
 *
 * Returning false means prefaulted entries will be 'young': moving a PTE
 * from 'old' to 'young' can be expensive on some architectures, even when
 * the transition is performed in hardware.
 */
static inline bool arch_wants_old_prefaulted_pte(void)
{
        return false;
}
#endif

/*
 * Handler registered for the "norandmaps" setup string: switches address
 * space randomization off by setting randomize_va_space to 0.
 *
 * @s: option string handed to the handler; not used.
 *
 * Always returns 1.
 * NOTE(review): for __setup() handlers a return of 1 presumably marks the
 * option as handled - confirm against the __setup() documentation.
 */
static int __init disable_randmaps(char *s)
{
        randomize_va_space = 0;
        return 1;
}
__setup("norandmaps", disable_randmaps);

/* pfn of ZERO_PAGE(0); filled in by init_zero_pfn() and exported to modules. */
unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

/* Upper bound checked by vm_normal_page(): a larger pfn is reported as a bad pte. */
unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 *
 * Record the pfn of ZERO_PAGE(0) in zero_pfn. Registered as an
 * early_initcall; always returns 0.
 */
static int __init init_zero_pfn(void)
{
        zero_pfn = page_to_pfn(ZERO_PAGE(0));
        return 0;
}
early_initcall(init_zero_pfn);

/*
 * Out-of-line wrapper that fires the rss_stat tracepoint for counter
 * @member of @mm.
 */
void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
        trace_rss_stat(mm, member);
}

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 *
 * Detach the pte table referenced by @pmd and hand it to the mmu_gather
 * @tlb for freeing, then drop the mm's pte-table count by one.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                           unsigned long addr)
{
        /* Grab the table before the pmd entry that points to it is cleared. */
        pgtable_t token = pmd_pgtable(*pmd);
        pmd_clear(pmd);
        pte_free_tlb(tlb, token, addr);
        mm_dec_nr_ptes(tlb->mm);
}

/*
 * Free the pte tables referenced by the pmd entries covering [addr, end)
 * and then, when allowed, the pmd table itself.
 *
 * The pmd table is only released if the PUD-aligned region containing @addr
 * does not start below @floor and @end does not extend beyond @ceiling
 * rounded down to a PUD boundary. A @ceiling of 0 never blocks the free:
 * "ceiling - 1" wraps to the largest address in the comparison below.
 */
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long start;

        start = addr;
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                /* "continue" still executes the pmd++/addr = next step below. */
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                free_pte_range(tlb, pmd, addr);
        } while (pmd++, addr = next, addr != end);

        /* Now decide whether the pmd table as a whole can be freed. */
        start &= PUD_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                /* Rounding down produced 0: bail out rather than treat it as "no ceiling". */
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, start);
        mm_dec_nr_pmds(tlb->mm);
}

/*
 * Run free_pmd_range() over every populated pud entry covering [addr, end)
 * and then, when allowed, free the pud table itself.
 *
 * The pud table is only released if the P4D-aligned region containing @addr
 * does not start below @floor and @end does not extend beyond @ceiling
 * rounded down to a P4D boundary. A @ceiling of 0 never blocks the free:
 * "ceiling - 1" wraps to the largest address in the comparison below.
 */
static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        pud_t *pud;
        unsigned long next;
        unsigned long start;

        start = addr;
        pud = pud_offset(p4d, addr);
        do {
                next = pud_addr_end(addr, end);
                /* "continue" still executes the pud++/addr = next step below. */
                if (pud_none_or_clear_bad(pud))
                        continue;
                free_pmd_range(tlb, pud, addr, next, floor, ceiling);
        } while (pud++, addr = next, addr != end);

        /* Now decide whether the pud table as a whole can be freed. */
        start &= P4D_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= P4D_MASK;
                /* Rounding down produced 0: bail out rather than treat it as "no ceiling". */
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(p4d, start);
        p4d_clear(p4d);
        pud_free_tlb(tlb, pud, start);
        mm_dec_nr_puds(tlb->mm);
}

/*
 * Run free_pud_range() over every populated p4d entry covering [addr, end)
 * and then, when allowed, free the p4d table itself.
 *
 * The p4d table is only released if the PGDIR-aligned region containing
 * @addr does not start below @floor and @end does not extend beyond
 * @ceiling rounded down to a PGDIR boundary. A @ceiling of 0 never blocks
 * the free: "ceiling - 1" wraps to the largest address in the comparison
 * below.
 */
static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        p4d_t *p4d;
        unsigned long next;
        unsigned long start;

        start = addr;
        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                /* "continue" still executes the p4d++/addr = next step below. */
                if (p4d_none_or_clear_bad(p4d))
                        continue;
                free_pud_range(tlb, p4d, addr, next, floor, ceiling);
        } while (p4d++, addr = next, addr != end);

        /* Now decide whether the p4d table as a whole can be freed. */
        start &= PGDIR_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                /* Rounding down produced 0: bail out rather than treat it as "no ceiling". */
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        p4d = p4d_offset(pgd, start);
        pgd_clear(pgd);
        p4d_free_tlb(tlb, p4d, start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * @tlb:     mmu_gather collecting the page-table pages to free; its mm is
 *           the address space being torn down.
 * @addr:    start of the range whose page tables are to be freed.
 * @end:     end of that range.
 * @floor:   tables serving addresses below this are left alone.
 * @ceiling: tables serving addresses at or above this are left alone;
 *           0 refers to the top of the address space.
 */
void free_pgd_range(struct mmu_gather *tlb,
                        unsigned long addr, unsigned long end,
                        unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;

        /*
         * The next few lines have given us lots of grief...
         *
         * Why are we testing PMD* at this top level?  Because often
         * there will be no work to do at all, and we'd prefer not to
         * go all the way down to the bottom just to discover that.
         *
         * Why all these "- 1"s?  Because 0 represents both the bottom
         * of the address space and the top of it (using -1 for the
         * top wouldn't help much: the masks would do the wrong thing).
         * The rule is that addr 0 and floor 0 refer to the bottom of
         * the address space, but end 0 and ceiling 0 refer to the top.
         * Comparisons need to use "end - 1" and "ceiling - 1" (though
         * that end 0 case should be mythical).
         *
         * Wherever addr is brought up or ceiling brought down, we must
         * be careful to reject "the opposite 0" before it confuses the
         * subsequent tests.  But what about where end is brought down
         * by PMD_SIZE below? no, end can't go down to 0 there.
         *
         * Whereas we round start (addr) and ceiling down, by different
         * masks at different levels, in order to test whether a table
         * now has no other vmas using it, so can be freed, we don't
         * bother to round floor or end up - the tests don't need that.
         */

        addr &= PMD_MASK;
        if (addr < floor) {
                addr += PMD_SIZE;
                /* Wrapped past the top of the address space: nothing to free. */
                if (!addr)
                        return;
        }
        if (ceiling) {
                ceiling &= PMD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                end -= PMD_SIZE;
        /* After the adjustments above the range may have become empty. */
        if (addr > end - 1)
                return;
        /*
         * We add page table cache pages with PAGE_SIZE,
         * (see pte_free_tlb()), flush the tlb if we need
         */
        tlb_change_page_size(tlb, PAGE_SIZE);
        pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                /* "continue" still executes the pgd++/addr = next step below. */
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
}

/*
 * Unlink a run of vmas from the anon and file reverse maps and free the
 * page tables that served them.
 *
 * @tlb:          mmu_gather collecting the page-table pages to free.
 * @mas:          maple state used to fetch the following vmas with
 *                mas_find(), bounded by @ceiling - 1.
 * @vma:          first vma to process.
 * @floor:        lower bound handed down to the range-freeing helpers.
 * @ceiling:      upper bound used when there is no following vma; may be 0.
 * @mm_wr_locked: when true, vma_start_write() is called on each vma before
 *                it is unlinked.
 */
void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                   struct vm_area_struct *vma, unsigned long floor,
                   unsigned long ceiling, bool mm_wr_locked)
{
        do {
                unsigned long addr = vma->vm_start;
                struct vm_area_struct *next;

                /*
                 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
                 * be 0.  This will underflow and is okay.
                 */
                next = mas_find(mas, ceiling - 1);
                /* A zero entry returned by the tree is treated as "no next vma". */
                if (unlikely(xa_is_zero(next)))
                        next = NULL;

                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
                 * pgtables
                 */
                if (mm_wr_locked)
                        vma_start_write(vma);
                unlink_anon_vmas(vma);
                unlink_file_vma(vma);

                if (is_vm_hugetlb_page(vma)) {
                        /* The next vma's start (or @ceiling) bounds what may be freed. */
                        hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
                                floor, next ? next->vm_start : ceiling);
                } else {
                        /*
                         * Optimization: gather nearby vmas into one call down
                         */
                        while (next && next->vm_start <= vma->vm_end + PMD_SIZE
                               && !is_vm_hugetlb_page(next)) {
                                vma = next;
                                next = mas_find(mas, ceiling - 1);
                                if (unlikely(xa_is_zero(next)))
                                        next = NULL;
                                if (mm_wr_locked)
                                        vma_start_write(vma);
                                unlink_anon_vmas(vma);
                                unlink_file_vma(vma);
                        }
                        /* addr is still the start of the first vma of the gathered run. */
                        free_pgd_range(tlb, addr, vma->vm_end,
                                floor, next ? next->vm_start : ceiling);
                }
                vma = next;
        } while (vma);
}

/*
 * Install the pte table *@pte into @pmd, under the pmd lock, if @pmd is
 * still empty.
 *
 * On installation the mm's pte-table count is incremented and *@pte is set
 * to NULL. If the pmd was already populated, *@pte is left untouched so the
 * caller can tell the table was not consumed.
 */
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
{
        spinlock_t *ptl = pmd_lock(mm, pmd);

        if (likely(pmd_none(*pmd))) {        /* Has another populated it ? */
                mm_inc_nr_ptes(mm);
                /*
                 * Ensure all pte setup (eg. pte page lock and page clearing) are
                 * visible before the pte is made visible to other CPUs by being
                 * put into page tables.
                 *
                 * The other side of the story is the pointer chasing in the page
                 * table walking code (when walking the page table without locking;
                 * ie. most of the time). Fortunately, these data accesses consist
                 * of a chain of data-dependent loads, meaning most CPUs (alpha
                 * being the notable exception) will already guarantee loads are
                 * seen in-order. See the alpha page table accessors for the
                 * smp_rmb() barriers in page table walking code.
                 */
                smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
                pmd_populate(mm, pmd, *pte);
                /* Signal to the caller that the table has been consumed. */
                *pte = NULL;
        }
        spin_unlock(ptl);
}

/*
 * Allocate a pte table for @mm and try to install it into @pmd.
 *
 * Returns -ENOMEM if the table cannot be allocated and 0 otherwise. If
 * @pmd turned out to be populated already, pmd_install() leaves the table
 * with us and it is freed again here.
 */
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
        pgtable_t table;

        table = pte_alloc_one(mm);
        if (!table)
                return -ENOMEM;

        /* pmd_install() resets 'table' to NULL when it consumed the table. */
        pmd_install(mm, pmd, &table);
        if (table)
                pte_free(mm, table);

        return 0;
}

/*
 * Allocate a kernel pte table and install it into @pmd of init_mm unless
 * the pmd has been populated in the meantime.
 *
 * Returns -ENOMEM if the table cannot be allocated and 0 otherwise. A table
 * that was not installed is freed before returning.
 */
int __pte_alloc_kernel(pmd_t *pmd)
{
        pte_t *table;

        table = pte_alloc_one_kernel(&init_mm);
        if (!table)
                return -ENOMEM;

        spin_lock(&init_mm.page_table_lock);
        /* Only populate if nobody else populated the pmd before us. */
        if (likely(pmd_none(*pmd))) {
                smp_wmb(); /* See comment in pmd_install() */
                pmd_populate_kernel(&init_mm, pmd, table);
                table = NULL;
        }
        spin_unlock(&init_mm.page_table_lock);

        /* Still holding the table means it was not installed. */
        if (table)
                pte_free_kernel(&init_mm, table);

        return 0;
}

/* Zero all NR_MM_COUNTERS entries of the rss vector @rss. */
static inline void init_rss_vec(int *rss)
{
        memset(rss, 0, NR_MM_COUNTERS * sizeof(*rss));
}

/*
 * Add every non-zero entry of the rss vector @rss to the matching counter
 * of @mm. Entries that are zero are skipped.
 */
static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
        int member;

        for (member = 0; member < NR_MM_COUNTERS; member++) {
                if (!rss[member])
                        continue;
                add_mm_counter(mm, member, rss[member]);
        }
}

/*
 * This function is called to print an error when a bad pte
 * is found. For example, we might have a PFN-mapped pte in
 * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 *
 * @vma:  vma in which the bad pte was found.
 * @addr: virtual address the pte maps.
 * @pte:  the offending pte value.
 * @page: page to dump along with the report, or NULL if there is none.
 *
 * Besides printing, this dumps the stack and taints the kernel with
 * TAINT_BAD_PAGE. Output is rate limited through the static counters below.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
                          pte_t pte, struct page *page)
{
        pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
        p4d_t *p4d = p4d_offset(pgd, addr);
        pud_t *pud = pud_offset(p4d, addr);
        pmd_t *pmd = pmd_offset(pud, addr);
        struct address_space *mapping;
        pgoff_t index;
        /* Rate-limit state, kept across calls. */
        static unsigned long resume;
        static unsigned long nr_shown;
        static unsigned long nr_unshown;

        /*
         * Allow a burst of 60 reports, then keep quiet for that minute;
         * or allow a steady drip of one report per second.
         */
        if (nr_shown == 60) {
                if (time_before(jiffies, resume)) {
                        nr_unshown++;
                        return;
                }
                if (nr_unshown) {
                        pr_alert("BUG: Bad page map: %lu messages suppressed\n",
                                 nr_unshown);
                        nr_unshown = 0;
                }
                nr_shown = 0;
        }
        /* The first report of a burst starts the 60 second window. */
        if (nr_shown++ == 0)
                resume = jiffies + 60 * HZ;

        mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
        index = linear_page_index(vma, addr);

        pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
                 current->comm,
                 (long long)pte_val(pte), (long long)pmd_val(*pmd));
        if (page)
                dump_page(page, "bad pte");
        pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
                 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
        pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
                 vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->fault : NULL,
                 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
                 mapping ? mapping->a_ops->read_folio : NULL);
        dump_stack();
        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *        pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The disadvantage is that pages are refcounted
 * (which can be slower and simply not an option for some PFNMAP users). The
 * advantage is that we don't have to follow the strict linearity rule of
 * PFNMAP mappings in order to support COWable mappings.
 *
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                            pte_t pte)
{
        unsigned long pfn = pte_pfn(pte);

        if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
                /* Common case: not special, only the pfn range check remains. */
                if (likely(!pte_special(pte)))
                        goto check_pfn;
                /* Special pte: let the vma look up the page itself if it can. */
                if (vma->vm_ops && vma->vm_ops->find_special_page)
                        return vma->vm_ops->find_special_page(vma, addr);
                if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
                        return NULL;
                if (is_zero_pfn(pfn))
                        return NULL;
                if (pte_devmap(pte))
                /*
                 * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
                 * and will have refcounts incremented on their struct pages
                 * when they are inserted into PTEs, thus they are safe to
                 * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
                 * do not have refcounts. Example of legacy ZONE_DEVICE is
                 * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
                 */
                        return NULL;

                /* A special pte that none of the cases above explains is reported. */
                print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }

        /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

        if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
                if (vma->vm_flags & VM_MIXEDMAP) {
                        if (!pfn_valid(pfn))
                                return NULL;
                        /* Valid pfn in a mixed map: skip the remaining checks. */
                        goto out;
                } else {
                        unsigned long off;
                        off = (addr - vma->vm_start) >> PAGE_SHIFT;
                        /* pfn follows the linear vm_pgoff rule: a raw PFNMAP pte. */
                        if (pfn == vma->vm_pgoff + off)
                                return NULL;
                        /* Non-linear pfn can only be normal in a COW mapping. */
                        if (!is_cow_mapping(vma->vm_flags))
                                return NULL;
                }
        }

        if (is_zero_pfn(pfn))
                return NULL;

check_pfn:
        if (unlikely(pfn > highest_memmap_pfn)) {
                print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }

        /*
         * NOTE! We still have PageReserved() pages in the page tables.
         * eg. VDSO mappings can cause them to exist.
         */
out:
        return pfn_to_page(pfn);
}

/*
 * Folio flavour of vm_normal_page(): returns the folio containing the page
 * that vm_normal_page() reports for @pte, or NULL when it reports none.
 */
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
                            pte_t pte)
{
        struct page *page = vm_normal_page(vma, addr, pte);

        return page ? page_folio(page) : NULL;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * pmd counterpart of vm_normal_page(): returns the "struct page" mapped by
 * @pmd at @addr in @vma, or NULL when the mapping has to be treated as
 * special.
 */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd)
{
        unsigned long pfn = pmd_pfn(pmd);

        /*
         * There is no pmd_special() but there may be special pmds, e.g.
         * in a direct-access (dax) mapping, so the rules of the
         * !CONFIG_ARCH_HAS_PTE_SPECIAL case of vm_normal_page() are
         * replicated here.
         */
        if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
                unsigned long linear_pfn;

                /* Mixed map: a valid pfn is normal, with no further checks. */
                if (vma->vm_flags & VM_MIXEDMAP)
                        return pfn_valid(pfn) ? pfn_to_page(pfn) : NULL;

                /*
                 * Pure PFNMAP: a pfn that follows the linear vm_pgoff rule
                 * is special, and so is anything in a non-COW mapping.
                 */
                linear_pfn = vma->vm_pgoff +
                             ((addr - vma->vm_start) >> PAGE_SHIFT);
                if (pfn == linear_pfn || !is_cow_mapping(vma->vm_flags))
                        return NULL;
        }

        if (pmd_devmap(pmd) || is_huge_zero_pmd(pmd))
                return NULL;
        if (unlikely(pfn > highest_memmap_pfn))
                return NULL;

        /*
         * NOTE! We still have PageReserved() pages in the page tables.
         * eg. VDSO mappings can cause them to exist.
         */
        return pfn_to_page(pfn);
}

/*
 * Folio flavour of vm_normal_page_pmd(): returns the folio containing the
 * page that vm_normal_page_pmd() reports for @pmd, or NULL when it reports
 * none.
 */
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
                                  unsigned long addr, pmd_t pmd)
{
        struct page *page = vm_normal_page_pmd(vma, addr, pmd);

        return page ? page_folio(page) : NULL;
}
#endif

/*
 * Replace the device-exclusive swap entry at @ptep with a present pte that
 * maps @page at @address in @vma.
 *
 * The new pte is built from vma->vm_page_prot and marked old. Soft-dirty and
 * uffd-wp state recorded in the swap pte is carried over. If the entry was
 * not uffd-wp protected and was a writable device-exclusive entry, the pte is
 * made dirty and, where maybe_mkwrite() allows it for @vma, writable.
 *
 * try_restore_exclusive_pte() calls this with the page locked.
 */
static void restore_exclusive_pte(struct vm_area_struct *vma,
                                  struct page *page, unsigned long address,
                                  pte_t *ptep)
{
        struct folio *folio = page_folio(page);
        pte_t orig_pte;
        pte_t pte;
        swp_entry_t entry;

        orig_pte = ptep_get(ptep);
        pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
        if (pte_swp_soft_dirty(orig_pte))
                pte = pte_mksoft_dirty(pte);

        entry = pte_to_swp_entry(orig_pte);
        if (pte_swp_uffd_wp(orig_pte))
                pte = pte_mkuffd_wp(pte);
        else if (is_writable_device_exclusive_entry(entry))
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);

        /*
         * A writable pte may only be installed for an anonymous page that is
         * marked exclusive. The negation has to cover the whole conjunction:
         * testing "!anon && PageAnonExclusive" instead would never catch a
         * writable mapping of a non-exclusive anonymous page, and would only
         * ever query PageAnonExclusive() on non-anonymous pages.
         */
        VM_BUG_ON_FOLIO(pte_write(pte) && !(folio_test_anon(folio) &&
                                            PageAnonExclusive(page)), folio);

        /*
         * No need to take a page reference as one was already
         * created when the swap entry was made.
         */
        if (folio_test_anon(folio))
                folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE);
        else
                /*
                 * Currently device exclusive access only supports anonymous
                 * memory so the entry shouldn't point to a filebacked page.
                 */
                WARN_ON_ONCE(1);

        set_pte_at(vma->vm_mm, address, ptep, pte);

        /*
         * No need to invalidate - it was non-present before. However
         * secondary CPUs may have mappings that need invalidating.
         */
        update_mmu_cache(vma, address, ptep);
}

/*
 * Attempt to turn the device-exclusive entry at @src_pte back into a present
 * pte. The page lock is only try-locked, so this never sleeps.
 *
 * Returns 0 when the pte was restored, -EBUSY when the page lock could not be
 * taken.
 */
static int
try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
                        unsigned long addr)
{
        struct page *page;

        page = pfn_swap_entry_to_page(pte_to_swp_entry(ptep_get(src_pte)));
        if (!trylock_page(page))
                return -EBUSY;

        restore_exclusive_pte(vma, page, addr, src_pte);
        unlock_page(page);
        return 0;
}

/*
 * Copy one non-present pte from @src_pte (in @src_mm) to @dst_pte (in
 * @dst_mm) at @addr. Handles genuine swap entries, migration entries,
 * device-private entries, device-exclusive entries and pte markers.
 *
 * Counters for the entries installed in the destination are accumulated in
 * @rss. Where sharing requires it (exclusive swap entries, writable migration
 * or device-private entries in a COW mapping) the source pte is rewritten as
 * well.
 *
 * Returns:
 *   0        the entry was handled; nothing more to do for this pte.
 *   -EIO     swap_duplicate() failed; @dst_pte was not written.
 *   -EBUSY   a device-exclusive entry could not be restored because the page
 *            lock was contended.
 *   -ENOENT  a device-exclusive entry was restored in the source; the caller
 *            is expected to go on and copy it as a present pte.
 *
 * The return type is int, matching the negative errno values returned and
 * the int the caller stores the result in.
 */
static int
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
        unsigned long vm_flags = dst_vma->vm_flags;
        pte_t orig_pte = ptep_get(src_pte);
        pte_t pte = orig_pte;
        struct folio *folio;
        struct page *page;
        swp_entry_t entry = pte_to_swp_entry(orig_pte);

        if (likely(!non_swap_entry(entry))) {
                if (swap_duplicate(entry) < 0)
                        return -EIO;

                /* make sure dst_mm is on swapoff's mmlist. */
                if (unlikely(list_empty(&dst_mm->mmlist))) {
                        spin_lock(&mmlist_lock);
                        /* Re-check under the lock before linking. */
                        if (list_empty(&dst_mm->mmlist))
                                list_add(&dst_mm->mmlist,
                                                &src_mm->mmlist);
                        spin_unlock(&mmlist_lock);
                }
                /* Mark the swap entry as shared. */
                if (pte_swp_exclusive(orig_pte)) {
                        pte = pte_swp_clear_exclusive(orig_pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
                rss[MM_SWAPENTS]++;
        } else if (is_migration_entry(entry)) {
                folio = pfn_swap_entry_folio(entry);

                rss[mm_counter(folio)]++;

                if (!is_readable_migration_entry(entry) &&
                                is_cow_mapping(vm_flags)) {
                        /*
                         * COW mappings require pages in both parent and child
                         * to be set to read. A previously exclusive entry is
                         * now shared.
                         */
                        entry = make_readable_migration_entry(
                                                        swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_soft_dirty(orig_pte))
                                pte = pte_swp_mksoft_dirty(pte);
                        if (pte_swp_uffd_wp(orig_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        } else if (is_device_private_entry(entry)) {
                page = pfn_swap_entry_to_page(entry);
                folio = page_folio(page);

                /*
                 * Update rss count even for unaddressable pages, as
                 * they should be treated just like normal pages in this
                 * respect.
                 *
                 * We will likely want to have some new rss counters
                 * for unaddressable pages, at some point. But for now
                 * keep things as they are.
                 */
                folio_get(folio);
                rss[mm_counter(folio)]++;
                /* Cannot fail as these pages cannot get pinned. */
                folio_try_dup_anon_rmap_pte(folio, page, src_vma);

                /*
                 * We do not preserve soft-dirty information, because so
                 * far, checkpoint/restore is the only feature that
                 * requires that. And checkpoint/restore does not work
                 * when a device driver is involved (you cannot easily
                 * save and restore device driver state).
                 */
                if (is_writable_device_private_entry(entry) &&
                    is_cow_mapping(vm_flags)) {
                        entry = make_readable_device_private_entry(
                                                        swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_uffd_wp(orig_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        } else if (is_device_exclusive_entry(entry)) {
                /*
                 * Make device exclusive entries present by restoring the
                 * original entry then copying as for a present pte. Device
                 * exclusive entries currently only support private writable
                 * (ie. COW) mappings.
                 */
                VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
                if (try_restore_exclusive_pte(src_pte, src_vma, addr))
                        return -EBUSY;
                /* Restored: tell the caller to copy it as a present pte. */
                return -ENOENT;
        } else if (is_pte_marker_entry(entry)) {
                pte_marker marker = copy_pte_marker(entry, dst_vma);

                /* Only install a marker if something is left to propagate. */
                if (marker)
                        set_pte_at(dst_mm, addr, dst_pte,
                                   make_pte_marker(marker));
                return 0;
        }
        /* Drop the uffd-wp bit unless the destination VMA tracks it. */
        if (!userfaultfd_wp(dst_vma))
                pte = pte_swp_clear_uffd_wp(pte);
        set_pte_at(dst_mm, addr, dst_pte, pte);
        return 0;
}

/*
 * Give the child its own copy of a present, normal page.
 *
 * Most of the time this is unnecessary: the caller simply takes another
 * reference on the page and shares the pte. This helper is for the case
 * where sharing is not possible and a private copy must be installed.
 *
 * The copy is made into the folio passed via @prealloc, which is consumed
 * (and *@prealloc cleared) on success. If no preallocated folio is available,
 * -EAGAIN is returned so the caller can allocate one outside the page table
 * lock and retry. Returns 0 once the copy has been mapped at @dst_pte.
 */
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
                  struct folio **prealloc, struct page *page)
{
        struct folio *copy = *prealloc;
        pte_t entry;

        if (!copy)
                return -EAGAIN;

        /* Take ownership of the preallocated folio, then fill and arm it. */
        *prealloc = NULL;
        copy_user_highpage(&copy->page, page, addr, src_vma);
        __folio_mark_uptodate(copy);
        folio_add_new_anon_rmap(copy, dst_vma, addr);
        folio_add_lru_vma(copy, dst_vma);
        rss[MM_ANONPAGES]++;

        /* Build the child's pte for the fresh copy and install it. */
        entry = pte_mkdirty(mk_pte(&copy->page, dst_vma->vm_page_prot));
        entry = maybe_mkwrite(entry, dst_vma);
        /* Uffd-wp protection on the source pte must reach the child too. */
        if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
                entry = pte_mkuffd_wp(entry);
        set_pte_at(dst_vma->vm_mm, addr, dst_pte, entry);

        return 0;
}

/*
 * Install @nr consecutive present ptes, starting from @pte, into the
 * destination page table at @dst_pte. The source ptes are write-protected
 * first when a writable pte lives in a COW mapping.
 */
static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
                pte_t pte, unsigned long addr, int nr)
{
        const bool cow_writable = is_cow_mapping(src_vma->vm_flags) &&
                                  pte_write(pte);

        /* COW mapping: write-protect the pte in both processes. */
        if (cow_writable) {
                wrprotect_ptes(src_vma->vm_mm, addr, src_pte, nr);
                pte = pte_wrprotect(pte);
        }

        /* Shared mapping: the child starts out with a clean pte. */
        if (src_vma->vm_flags & VM_SHARED)
                pte = pte_mkclean(pte);

        /* The child's pte always starts out old. */
        pte = pte_mkold(pte);

        /* Keep the uffd-wp bit only if the destination VMA tracks it. */
        if (!userfaultfd_wp(dst_vma))
                pte = pte_clear_uffd_wp(pte);

        set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
}

/*
 * Copy one present PTE, trying to batch-process subsequent PTEs that map
 * consecutive pages of the same folio by copying them as well.
 *
 * @pte is the value of *@src_pte as read by the caller; @max_nr bounds how
 * many PTEs may be processed in one go. rss counters for the pages mapped
 * into the destination are accumulated in @rss. *@prealloc, if set, is a
 * folio that copy_present_page() may consume when a private copy is needed.
 *
 * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
 * Otherwise, returns the number of copied PTEs (at least 1).
 */
static inline int
copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                 pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
                 int max_nr, int *rss, struct folio **prealloc)
{
        struct page *page;
        struct folio *folio;
        bool any_writable;
        fpb_t flags = 0;
        int err, nr;

        /*
         * No normal page behind this pte: there is no refcount or rmap to
         * maintain, so only the pte itself is copied.
         */
        page = vm_normal_page(src_vma, addr, pte);
        if (unlikely(!page))
                goto copy_pte;

        folio = page_folio(page);

        /*
         * If we likely have to copy, just don't bother with batching. Make
         * sure that the common "small folio" case is as fast as possible
         * by keeping the batching logic separate.
         */
        if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
                /* Pick which pte bits may differ within one batch. */
                if (src_vma->vm_flags & VM_SHARED)
                        flags |= FPB_IGNORE_DIRTY;
                if (!vma_soft_dirty_enabled(src_vma))
                        flags |= FPB_IGNORE_SOFT_DIRTY;

                nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
                                     &any_writable, NULL, NULL);
                /* One folio reference per pte about to be duplicated. */
                folio_ref_add(folio, nr);
                if (folio_test_anon(folio)) {
                        if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
                                                                  nr, src_vma))) {
                                /*
                                 * Sharing was refused: undo the references
                                 * and ask the caller for a preallocated page.
                                 */
                                folio_ref_sub(folio, nr);
                                return -EAGAIN;
                        }
                        rss[MM_ANONPAGES] += nr;
                        VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
                } else {
                        folio_dup_file_rmap_ptes(folio, page, nr);
                        rss[mm_counter_file(folio)] += nr;
                }
                /*
                 * If any pte in the batch was writable, mark the template
                 * writable so __copy_present_ptes() takes its COW
                 * write-protect path for the whole batch.
                 */
                if (any_writable)
                        pte = pte_mkwrite(pte, src_vma);
                __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
                                    addr, nr);
                return nr;
        }

        /* Single-pte path: take the reference the child's mapping will hold. */
        folio_get(folio);
        if (folio_test_anon(folio)) {
                /*
                 * If this page may have been pinned by the parent process,
                 * copy the page immediately for the child so that we'll always
                 * guarantee the pinned page won't be randomly replaced in the
                 * future.
                 */
                if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
                        /* Page may be pinned, we have to copy. */
                        folio_put(folio);
                        err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
                                                addr, rss, prealloc, page);
                        /* 0 from copy_present_page() means one pte was copied. */
                        return err ? err : 1;
                }
                rss[MM_ANONPAGES]++;
                VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
        } else {
                folio_dup_file_rmap_pte(folio, page);
                rss[mm_counter_file(folio)]++;
        }

copy_pte:
        __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
        return 1;
}

/*
 * Allocate an order-0 folio for @addr in @vma and charge it to the memory
 * cgroup of @src_mm. With @need_zero the folio comes from the zeroed-movable
 * allocator; otherwise it is a plain GFP_HIGHUSER_MOVABLE allocation.
 *
 * Returns the charged folio, or NULL if either the allocation or the charge
 * failed (the folio is released again in the latter case).
 */
static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
                struct vm_area_struct *vma, unsigned long addr, bool need_zero)
{
        struct folio *folio;

        if (!need_zero)
                folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
                                        addr, false);
        else
                folio = vma_alloc_zeroed_movable_folio(vma, addr);

        if (!folio)
                return NULL;

        if (!mem_cgroup_charge(folio, src_mm, GFP_KERNEL)) {
                folio_throttle_swaprate(folio, GFP_KERNEL);
                return folio;
        }

        /* Charging failed: give the folio back. */
        folio_put(folio);
        return NULL;
}

/*
 * Copy the ptes covering [addr, end) under @src_pmd into the page table
 * under @dst_pmd.
 *
 * The loop runs with both page table locks held and periodically drops them
 * (the "again" label) when a reschedule or lock break is pending, or when an
 * entry needs work that must happen outside the locks: a swap count
 * continuation (-EIO from copy_nonpresent_pte()) or a preallocated folio
 * (-EAGAIN from copy_present_ptes()).
 *
 * Returns 0 on success (including when the source pte table could not be
 * mapped), -ENOMEM on allocation failure, or -EBUSY when a device-exclusive
 * entry could not be restored.
 */
static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        pte_t *orig_src_pte, *orig_dst_pte;
        pte_t *src_pte, *dst_pte;
        pte_t ptent;
        spinlock_t *src_ptl, *dst_ptl;
        int progress, max_nr, ret = 0;
        int rss[NR_MM_COUNTERS];
        swp_entry_t entry = (swp_entry_t){0};
        struct folio *prealloc = NULL;
        int nr;

again:
        /* Each pass under the locks starts with fresh progress and rss. */
        progress = 0;
        init_rss_vec(rss);

        /*
         * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
         * error handling here, assume that exclusive mmap_lock on dst and src
         * protects anon from unexpected THP transitions; with shmem and file
         * protected by mmap_lock-less collapse skipping areas with anon_vma
         * (whereas vma_needs_copy() skips areas without anon_vma).  A rework
         * can remove such assumptions later, but this is good enough for now.
         */
        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
        if (!dst_pte) {
                ret = -ENOMEM;
                goto out;
        }
        src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
        if (!src_pte) {
                pte_unmap_unlock(dst_pte, dst_ptl);
                /* ret == 0 */
                goto out;
        }
        /* The destination lock is already held; take the source one nested. */
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
        /* Remember the mapped bases: the cursors advance, the unmap needs these. */
        orig_src_pte = src_pte;
        orig_dst_pte = dst_pte;
        arch_enter_lazy_mmu_mode();

        do {
                /* Default stride for the paths that 'continue' below. */
                nr = 1;

                /*
                 * We are holding two locks at this point - either of them
                 * could generate latencies in another task on another CPU.
                 */
                if (progress >= 32) {
                        progress = 0;
                        if (need_resched() ||
                            spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
                                break;
                }
                ptent = ptep_get(src_pte);
                if (pte_none(ptent)) {
                        /* Empty slots count as cheap progress. */
                        progress++;
                        continue;
                }
                if (unlikely(!pte_present(ptent))) {
                        ret = copy_nonpresent_pte(dst_mm, src_mm,
                                                  dst_pte, src_pte,
                                                  dst_vma, src_vma,
                                                  addr, rss);
                        if (ret == -EIO) {
                                /* Keep the entry for the continuation below. */
                                entry = pte_to_swp_entry(ptep_get(src_pte));
                                break;
                        } else if (ret == -EBUSY) {
                                break;
                        } else if (!ret) {
                                progress += 8;
                                continue;
                        }
                        /* Re-read: the source pte was rewritten as present. */
                        ptent = ptep_get(src_pte);
                        VM_WARN_ON_ONCE(!pte_present(ptent));

                        /*
                         * Device exclusive entry restored, continue by copying
                         * the now present pte.
                         */
                        WARN_ON_ONCE(ret != -ENOENT);
                }
                /* copy_present_ptes() will clear `*prealloc' if consumed */
                max_nr = (end - addr) / PAGE_SIZE;
                ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
                                        ptent, addr, max_nr, rss, &prealloc);
                /*
                 * If we need a pre-allocated page for this pte, drop the
                 * locks, allocate, and try again.
                 */
                if (unlikely(ret == -EAGAIN))
                        break;
                if (unlikely(prealloc)) {
                        /*
                         * pre-alloc page cannot be reused by next time so as
                         * to strictly follow mempolicy (e.g., alloc_page_vma()
                         * will allocate page according to address).  This
                         * could only happen if one pinned pte changed.
                         */
                        folio_put(prealloc);
                        prealloc = NULL;
                }
                /* ret is the number of ptes just copied (at least 1). */
                nr = ret;
                progress += 8 * nr;
        } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
                 addr != end);

        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(orig_src_pte, src_ptl);
        /* Fold this pass's counters into dst_mm before dropping its lock. */
        add_mm_rss_vec(dst_mm, rss);
        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();

        /* Resolve, outside the locks, whatever made the loop stop early. */
        if (ret == -EIO) {
                VM_WARN_ON_ONCE(!entry.val);
                if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
                        ret = -ENOMEM;
                        goto out;
                }
                entry.val = 0;
        } else if (ret == -EBUSY) {
                goto out;
        } else if (ret ==  -EAGAIN) {
                /* prealloc is NULL here, so returning directly leaks nothing. */
                prealloc = folio_prealloc(src_mm, src_vma, addr, false);
                if (!prealloc)
                        return -ENOMEM;
        } else if (ret < 0) {
                VM_WARN_ON_ONCE(1);
        }

        /* We've captured and resolved the error. Reset, try again. */
        ret = 0;

        if (addr != end)
                goto again;
out:
        /* Release a preallocated folio that was never consumed. */
        if (unlikely(prealloc))
                folio_put(prealloc);
        return ret;
}

/*
 * Copy the pmd entries of @src_pud covering [addr, end) into @dst_pud,
 * allocating the destination pmd table as needed. Huge, swap and devmap pmds
 * go through copy_huge_pmd(); everything else descends into
 * copy_pte_range(). Returns 0 on success or -ENOMEM on failure.
 */
static inline int
copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *child_mm = dst_vma->vm_mm;
        struct mm_struct *parent_mm = src_vma->vm_mm;
        pmd_t *dst_pmd = pmd_alloc(child_mm, dst_pud, addr);
        pmd_t *src_pmd;
        unsigned long next;

        if (!dst_pmd)
                return -ENOMEM;

        src_pmd = pmd_offset(src_pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) ||
                    pmd_devmap(*src_pmd)) {
                        int err;

                        VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
                        err = copy_huge_pmd(child_mm, parent_mm, dst_pmd,
                                            src_pmd, addr, dst_vma, src_vma);
                        if (err == -ENOMEM)
                                return -ENOMEM;
                        if (!err)
                                continue;
                        /* Any other result: handle as a regular pmd below. */
                }
                if (!pmd_none_or_clear_bad(src_pmd) &&
                    copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_pmd++, src_pmd++, addr = next, addr != end);

        return 0;
}

/*
 * Copy the pud entries of @src_p4d covering [addr, end) into @dst_p4d,
 * allocating the destination pud table as needed. Huge and devmap puds go
 * through copy_huge_pud(); everything else descends into copy_pmd_range().
 * Returns 0 on success or -ENOMEM on failure.
 */
static inline int
copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *child_mm = dst_vma->vm_mm;
        struct mm_struct *parent_mm = src_vma->vm_mm;
        pud_t *dst_pud = pud_alloc(child_mm, dst_p4d, addr);
        pud_t *src_pud;
        unsigned long next;

        if (!dst_pud)
                return -ENOMEM;

        src_pud = pud_offset(src_p4d, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
                        int err;

                        VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
                        err = copy_huge_pud(child_mm, parent_mm,
                                            dst_pud, src_pud, addr, src_vma);
                        if (err == -ENOMEM)
                                return -ENOMEM;
                        if (!err)
                                continue;
                        /* Any other result: handle as a regular pud below. */
                }
                if (!pud_none_or_clear_bad(src_pud) &&
                    copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_pud++, src_pud++, addr = next, addr != end);

        return 0;
}

/*
 * Copy the p4d entries of @src_pgd covering [addr, end) into @dst_pgd,
 * allocating the destination p4d table as needed and descending into
 * copy_pud_range() for every populated entry. Returns 0 on success or
 * -ENOMEM on failure.
 */
static inline int
copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
               unsigned long end)
{
        p4d_t *dst_p4d = p4d_alloc(dst_vma->vm_mm, dst_pgd, addr);
        p4d_t *src_p4d;
        unsigned long next;

        if (!dst_p4d)
                return -ENOMEM;

        src_p4d = p4d_offset(src_pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                /* Empty (or bad, now cleared) entries have nothing to copy. */
                if (!p4d_none_or_clear_bad(src_p4d) &&
                    copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_p4d++, src_p4d++, addr = next, addr != end);

        return 0;
}

/*
 * Decide whether fork() has to copy the page tables of @src_vma right away.
 * Returning false lets fork() skip the copy and rely on page faults to
 * populate the child lazily when it touches the range.
 *
 * A copy is required when any of the following holds:
 *  - @dst_vma has uffd-wp enabled, even for file-backed memory such as
 *    shmem: the page tables then carry write-protection state that cannot
 *    be recovered from the page cache, so skipping the copy would lose it;
 *  - @src_vma is a VM_PFNMAP or VM_MIXEDMAP mapping;
 *  - @src_vma has an anon_vma.
 *
 * Otherwise a later page fault will fill the ptes correctly. Fork becomes
 * much lighter with big shared or private read-only mappings, at the price
 * that copy_page_range() would have been more efficient than faulting.
 */
static bool
vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        return userfaultfd_wp(dst_vma) ||
               (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) ||
               src_vma->anon_vma;
}

/*
 * Copy the page tables covering all of @src_vma (vm_start to vm_end) into
 * the mm of @dst_vma.
 *
 * Nothing is copied when vma_needs_copy() says the range can be faulted in
 * lazily. hugetlb VMAs are handed to copy_hugetlb_page_range(). For COW
 * mappings the walk is bracketed by an MMU notifier invalidation and by the
 * source mm's write_protect_seq write section.
 *
 * Returns 0 on success, the error from track_pfn_copy() for VM_PFNMAP
 * mappings, the result of copy_hugetlb_page_range() for hugetlb VMAs, or
 * -ENOMEM if the page table walk fails.
 */
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        pgd_t *src_pgd, *dst_pgd;
        unsigned long next;
        unsigned long addr = src_vma->vm_start;
        unsigned long end = src_vma->vm_end;
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        struct mmu_notifier_range range;
        bool is_cow;
        int ret;

        if (!vma_needs_copy(dst_vma, src_vma))
                return 0;

        if (is_vm_hugetlb_page(src_vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);

        if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
                /*
                 * We do not free on error cases below as remove_vma
                 * gets called on error from higher level routine
                 */
                ret = track_pfn_copy(src_vma);
                if (ret)
                        return ret;
        }

        /*
         * We need to invalidate the secondary MMU mappings only when
         * there could be a permission downgrade on the ptes of the
         * parent mm. And a permission downgrade will only happen if
         * is_cow_mapping() returns true.
         */
        is_cow = is_cow_mapping(src_vma->vm_flags);

        if (is_cow) {
                mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
                                        0, src_mm, addr, end);
                mmu_notifier_invalidate_range_start(&range);
                /*
                 * Disabling preemption is not needed for the write side, as
                 * the read side doesn't spin, but goes to the mmap_lock.
                 *
                 * Use the raw variant of the seqcount_t write API to avoid
                 * lockdep complaining about preemptibility.
                 */
                vma_assert_write_locked(src_vma);
                raw_write_seqcount_begin(&src_mm->write_protect_seq);
        }

        ret = 0;
        dst_pgd = pgd_offset(dst_mm, addr);
        src_pgd = pgd_offset(src_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(src_pgd))
                        continue;
                if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                                            addr, next))) {
                        /*
                         * Any failure from the lower levels is reported as
                         * -ENOMEM. 'break' rather than 'return' so the COW
                         * teardown below still runs.
                         */
                        untrack_pfn_clear(dst_vma);
                        ret = -ENOMEM;
                        break;
                }
        } while (dst_pgd++, src_pgd++, addr = next, addr != end);

        /* Close the seqcount write section and notifier range opened above. */
        if (is_cow) {
                raw_write_seqcount_end(&src_mm->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
        }
        return ret;
}

/*
 * Whether COWed (private) pages should be zapped as well. Without a
 * zap_details everything is zapped; with one, the caller's even_cows flag
 * decides.
 */
static inline bool should_zap_cows(struct zap_details *details)
{
        return !details || details->even_cows;
}

/*
 * Decide whether @folio should be zapped under @details. When COWed pages
 * are zapped too, the folio does not need to be inspected at all; otherwise
 * only non-anonymous folios are zapped.
 */
static inline bool should_zap_folio(struct zap_details *details,
                                    struct folio *folio)
{
        return should_zap_cows(details) || !folio_test_anon(folio);
}

/*
 * True when the caller supplied @details and asked, via
 * ZAP_FLAG_DROP_MARKER, for markers to be dropped during the zap.
 */
static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
{
        return details && (details->zap_flags & ZAP_FLAG_DROP_MARKER);
}

/*
 * After zapping, make sure each of the @nr none ptes starting at @pte gets
 * replaced by an uffd-wp swap special pte marker when that is necessary.
 * Must be called with the pgtable lock held.
 *
 * Nothing is installed for anonymous VMAs (zapping those always drops
 * everything) or when the caller asked for markers to be dropped.
 * @nr must be at least 1.
 */
static inline void
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *pte, int nr,
                              struct zap_details *details, pte_t pteval)
{
        if (vma_is_anonymous(vma) || zap_drop_file_uffd_wp(details))
                return;

        for (;; pte++, addr += PAGE_SIZE) {
                /* The PFN stored in @pteval plays no role here. */
                pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
                if (--nr == 0)
                        break;
        }
}

/*
 * Zap @nr present ptes, starting at @pte / @addr, that map consecutive pages
 * of @folio beginning with @page.
 *
 * The ptes are cleared, the rss counters in @rss are decremented, the ptes
 * are queued for TLB invalidation in @tlb, uffd-wp markers are installed
 * where needed, and the folio pages are handed to the mmu_gather.
 * *@force_flush and *@force_break are set (never cleared) to tell the caller
 * that a TLB flush is required and/or that it should stop zapping for now.
 */
static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
                struct vm_area_struct *vma, struct folio *folio,
                struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
                unsigned long addr, struct zap_details *details, int *rss,
                bool *force_flush, bool *force_break)
{
        struct mm_struct *mm = tlb->mm;
        bool delay_rmap = false;

        if (!folio_test_anon(folio)) {
                /*
                 * For non-anon folios the dirty/young bits of the cleared
                 * ptes are carried over to the folio.
                 */
                ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                if (pte_dirty(ptent)) {
                        folio_mark_dirty(folio);
                        /*
                         * When rmap removal is being delayed, skip it below
                         * and request a flush instead.
                         */
                        if (tlb_delay_rmap(tlb)) {
                                delay_rmap = true;
                                *force_flush = true;
                        }
                }
                if (pte_young(ptent) && likely(vma_has_recency(vma)))
                        folio_mark_accessed(folio);
                rss[mm_counter(folio)] -= nr;
        } else {
                /* We don't need up-to-date accessed/dirty bits. */
                clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                rss[MM_ANONPAGES] -= nr;
        }
        /* Checking a single PTE in a batch is sufficient. */
        arch_check_zapped_pte(vma, ptent);
        tlb_remove_tlb_entries(tlb, pte, nr, addr);
        if (unlikely(userfaultfd_pte_wp(vma, ptent)))
                zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
                                              ptent);

        if (!delay_rmap) {
                folio_remove_rmap_ptes(folio, page, nr, vma);

                /* A negative mapcount means the rmap accounting is corrupt. */
                if (unlikely(folio_mapcount(folio) < 0))
                        print_bad_pte(vma, addr, ptent, page);
        }
        /*
         * A true return from __tlb_remove_folio_pages() makes the caller
         * flush and stop; presumably the gather batch is full. TODO confirm.
         */
        if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
                *force_flush = true;
                *force_break = true;
        }
}

/*
 * Zap or skip at least one present PTE, trying to batch-process subsequent
 * PTEs that map consecutive pages of the same folio.
 *
 * @ptent is the value of *@pte as read by the caller and @max_nr bounds the
 * batch size. PTEs without a normal page are cleared one at a time; folios
 * that @details says must not be zapped are skipped one PTE at a time.
 *
 * Returns the number of processed (skipped or zapped) PTEs (at least 1).
 */
static inline int zap_present_ptes(struct mmu_gather *tlb,
                struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
                unsigned int max_nr, unsigned long addr,
                struct zap_details *details, int *rss, bool *force_flush,
                bool *force_break)
{
        /* Dirty and soft-dirty differences do not split a batch when zapping. */
        const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
        struct mm_struct *mm = tlb->mm;
        struct folio *folio;
        struct page *page;
        int nr;

        page = vm_normal_page(vma, addr, ptent);
        if (!page) {
                /* We don't need up-to-date accessed/dirty bits. */
                ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
                arch_check_zapped_pte(vma, ptent);
                tlb_remove_tlb_entry(tlb, pte, addr);
                if (userfaultfd_pte_wp(vma, ptent))
                        zap_install_uffd_wp_if_needed(vma, addr, pte, 1,
                                                      details, ptent);
                ksm_might_unmap_zero_page(mm, ptent);
                return 1;
        }

        folio = page_folio(page);
        /* Skipped: the pte is left in place but still counts as processed. */
        if (unlikely(!should_zap_folio(details, folio)))
                return 1;

        /*
         * Make sure that the common "small folio" case is as fast as possible
         * by keeping the batching logic separate.
         */
        if (unlikely(folio_test_large(folio) && max_nr != 1)) {
                nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
                                     NULL, NULL, NULL);

                zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
                                       addr, details, rss, force_flush,
                                       force_break);
                return nr;
        }
        /* Single-pte case: the always-inline helper gets a constant nr of 1. */
        zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
                               details, rss, force_flush, force_break);
        return 1;
}

/*
 * Zap the PTEs that map [addr, end) below @pmd, holding the PTE lock.
 *
 * Present PTEs are handed to zap_present_ptes(); non-present ones are
 * classified by swap entry type and, unless @details says to keep them,
 * cleared. RSS changes are accumulated in a local vector and applied once
 * after the loop.
 *
 * The walk ends early when need_resched() is true or when
 * zap_present_ptes() sets force_break. Returns the address at which the
 * walk stopped, so the caller can compare it against @end. If
 * pte_offset_map_lock() fails, @addr is returned unchanged.
 */
static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        bool force_flush = false, force_break = false;
        struct mm_struct *mm = tlb->mm;
        int rss[NR_MM_COUNTERS];
        spinlock_t *ptl;
        pte_t *start_pte;
        pte_t *pte;
        swp_entry_t entry;
        int nr;

        tlb_change_page_size(tlb, PAGE_SIZE);
        init_rss_vec(rss);
        start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                return addr;

        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        do {
                pte_t ptent = ptep_get(pte);
                struct folio *folio;
                struct page *page;
                int max_nr;

                /* Default step: one PTE, unless a batch is found below. */
                nr = 1;
                if (pte_none(ptent))
                        continue;

                /* Leaves addr/pte pointing at the first unprocessed entry. */
                if (need_resched())
                        break;

                if (pte_present(ptent)) {
                        max_nr = (end - addr) / PAGE_SIZE;
                        nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
                                              addr, details, rss, &force_flush,
                                              &force_break);
                        if (unlikely(force_break)) {
                                /*
                                 * break bypasses the loop's increment
                                 * expression, so account for the PTEs just
                                 * processed before leaving.
                                 */
                                addr += nr * PAGE_SIZE;
                                break;
                        }
                        continue;
                }

                /* Not present: decode the entry and dispatch on its type. */
                entry = pte_to_swp_entry(ptent);
                if (is_device_private_entry(entry) ||
                    is_device_exclusive_entry(entry)) {
                        page = pfn_swap_entry_to_page(entry);
                        folio = page_folio(page);
                        if (unlikely(!should_zap_folio(details, folio)))
                                continue;
                        /*
                         * Both device private/exclusive mappings should only
                         * work with anonymous page so far, so we don't need to
                         * consider uffd-wp bit when zap. For more information,
                         * see zap_install_uffd_wp_if_needed().
                         */
                        WARN_ON_ONCE(!vma_is_anonymous(vma));
                        rss[mm_counter(folio)]--;
                        if (is_device_private_entry(entry))
                                folio_remove_rmap_pte(folio, page, vma);
                        folio_put(folio);
                } else if (!non_swap_entry(entry)) {
                        max_nr = (end - addr) / PAGE_SIZE;
                        nr = swap_pte_batch(pte, max_nr, ptent);
                        /* Genuine swap entries, hence private anon pages */
                        if (!should_zap_cows(details))
                                continue;
                        rss[MM_SWAPENTS] -= nr;
                        free_swap_and_cache_nr(entry, nr);
                } else if (is_migration_entry(entry)) {
                        folio = pfn_swap_entry_folio(entry);
                        if (!should_zap_folio(details, folio))
                                continue;
                        rss[mm_counter(folio)]--;
                } else if (pte_marker_entry_uffd_wp(entry)) {
                        /*
                         * For anon: always drop the marker; for file: only
                         * drop the marker if explicitly requested.
                         */
                        if (!vma_is_anonymous(vma) &&
                            !zap_drop_file_uffd_wp(details))
                                continue;
                } else if (is_hwpoison_entry(entry) ||
                           is_poisoned_swp_entry(entry)) {
                        if (!should_zap_cows(details))
                                continue;
                } else {
                        /* We should have covered all the swap entry types */
                        pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
                        WARN_ON_ONCE(1);
                }
                /* Reached only by branches that did not "continue" above. */
                clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
        } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);

        /* Apply the RSS deltas gathered during the walk in one go. */
        add_mm_rss_vec(mm, rss);
        arch_leave_lazy_mmu_mode();

        /* Do the actual TLB flush before dropping ptl */
        if (force_flush) {
                tlb_flush_mmu_tlbonly(tlb);
                tlb_flush_rmaps(tlb, vma);
        }
        pte_unmap_unlock(start_pte, ptl);

        /*
         * If we forced a TLB flush (either due to running out of
         * batch buffers or because we needed to flush dirty TLB
         * entries before releasing the ptl), free the batched
         * memory too. Come back again if we didn't do everything.
         */
        if (force_flush)
                tlb_flush_mmu(tlb);

        return addr;
}

/*
 * Zap [addr, end) below @pud, one PMD entry at a time.
 *
 * A swap/huge/devmap PMD is zapped whole through zap_huge_pmd() when the
 * sub-range spans exactly HPAGE_PMD_SIZE; otherwise it is split with
 * __split_huge_pmd() and handled by the PTE-level walk. Any other PMD that
 * is not pmd_none() is passed to zap_pte_range().
 *
 * Returns the address reached when the loop terminates.
 */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE)
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
                        else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
                                addr = next;
                                continue;
                        }
                        /* fall through */
                } else if (details && details->single_folio &&
                           folio_test_pmd_mappable(details->single_folio) &&
                           next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
                        spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
                        /*
                         * Take and drop THP pmd lock so that we cannot return
                         * prematurely, while zap_huge_pmd() has cleared *pmd,
                         * but not yet decremented compound_mapcount().
                         */
                        spin_unlock(ptl);
                }
                /* Nothing mapped at this PMD: move on to the next one. */
                if (pmd_none(*pmd)) {
                        addr = next;
                        continue;
                }
                addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
                /*
                 * zap_pte_range() stopped before @next: cancel the pmd++ in
                 * the loop condition so the same PMD is visited again.
                 */
                if (addr != next)
                        pmd--;
        } while (pmd++, cond_resched(), addr != end);

        return addr;
}

/*
 * Zap [addr, end) below @p4d, one PUD entry at a time.
 *
 * A huge/devmap PUD is zapped whole through zap_huge_pud() when the
 * sub-range spans exactly HPAGE_PUD_SIZE; otherwise it is split with
 * split_huge_pud() and the walk descends via zap_pmd_range().
 *
 * Returns the address reached when the loop terminates.
 */
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(p4d, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
                        if (next - addr != HPAGE_PUD_SIZE) {
                                mmap_assert_locked(tlb->mm);
                                split_huge_pud(vma, pud, addr);
                        } else if (zap_huge_pud(tlb, vma, pud, addr))
                                goto next;
                        /* fall through */
                }
                /* "continue" goes straight to the loop condition below. */
                if (pud_none_or_clear_bad(pud))
                        continue;
                /* The lower level reports how far it actually got. */
                next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
                cond_resched();
        } while (pud++, addr = next, addr != end);

        return addr;
}

/*
 * Zap [addr, end) below @pgd, one P4D entry at a time, descending into
 * zap_pud_range() for every entry that is populated and not bad.
 *
 * Returns the address reached when the loop terminates.
 */
static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        p4d_t *p4d = p4d_offset(pgd, addr);
        unsigned long boundary;

        do {
                boundary = p4d_addr_end(addr, end);
                /* Empty or bad entries are simply stepped over. */
                if (!p4d_none_or_clear_bad(p4d))
                        boundary = zap_pud_range(tlb, vma, p4d, addr,
                                                 boundary, details);
        } while (p4d++, addr = boundary, addr != end);

        return addr;
}

/*
 * Zap the page-table entries mapping [addr, end) of @vma, walking from the
 * PGD level downwards, bracketed by tlb_start_vma()/tlb_end_vma().
 *
 * @addr must be strictly below @end.
 */
void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details)
{
        unsigned long boundary;
        pgd_t *pgd;

        BUG_ON(addr >= end);
        tlb_start_vma(tlb, vma);
        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                boundary = pgd_addr_end(addr, end);
                /* Empty or bad entries are simply stepped over. */
                if (!pgd_none_or_clear_bad(pgd))
                        boundary = zap_p4d_range(tlb, vma, pgd, addr,
                                                 boundary, details);
        } while (pgd++, addr = boundary, addr != end);
        tlb_end_vma(tlb, vma);
}


/*
 * Unmap the part of @vma that overlaps [start_addr, end_addr).
 *
 * Does nothing when the two ranges do not intersect. Hugetlb VMAs are
 * handled by __unmap_hugepage_range(), everything else by
 * unmap_page_range().
 */
static void unmap_single_vma(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr,
                struct zap_details *details, bool mm_wr_locked)
{
        unsigned long start = max(vma->vm_start, start_addr);
        unsigned long end;
        zap_flags_t zap_flags;

        if (start >= vma->vm_end)
                return;
        end = min(vma->vm_end, end_addr);
        if (end <= vma->vm_start)
                return;

        if (vma->vm_file)
                uprobe_munmap(vma, start, end);

        if (unlikely(vma->vm_flags & VM_PFNMAP))
                untrack_pfn(vma, 0, 0, mm_wr_locked);

        /* An empty clamped range leaves nothing to zap. */
        if (start == end)
                return;

        if (likely(!is_vm_hugetlb_page(vma))) {
                unmap_page_range(tlb, vma, start, end, details);
                return;
        }

        /*
         * It is undesirable to test vma->vm_file as it should be non-null
         * for valid hugetlb area. However, vm_file will be NULL in the error
         * cleanup path of mmap_region. When hugetlbfs ->mmap method fails,
         * mmap_region() nullifies vma->vm_file before calling this function
         * to clean up. Since no pte has actually been setup, it is safe to
         * do nothing in this case.
         */
        if (!vma->vm_file)
                return;

        zap_flags = details ? details->zap_flags : 0;
        __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags);
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @mas: the maple state
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @tree_end: The maximum index to check
 * @mm_wr_locked: lock flag
 *
 * Starting at @vma, unmap every vma found through @mas below @tree_end.
 *
 * Only addresses between @start_addr and @end_addr will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr, unsigned long tree_end,
                bool mm_wr_locked)
{
        struct zap_details details = {
                .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
                /* Careful - we need to zap private pages too! */
                .even_cows = true,
        };
        struct mmu_notifier_range range;
        unsigned long zap_start, zap_end;

        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
                                start_addr, end_addr);
        mmu_notifier_invalidate_range_start(&range);
        do {
                /* hugetlb_zap_begin() may adjust the per-vma bounds. */
                zap_start = start_addr;
                zap_end = end_addr;
                hugetlb_zap_begin(vma, &zap_start, &zap_end);
                unmap_single_vma(tlb, vma, zap_start, zap_end, &details,
                                 mm_wr_locked);
                hugetlb_zap_end(vma, &details);
                vma = mas_find(mas, tree_end - 1);
        } while (vma && likely(!xa_is_zero(vma)));
        mmu_notifier_invalidate_range_end(&range);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 */
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
                unsigned long size, struct zap_details *details)
{
        const unsigned long end = address + size;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;

        lru_add_drain();
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, end);
        hugetlb_zap_begin(vma, &range.start, &range.end);
        tlb_gather_mmu(&tlb, vma->vm_mm);
        update_hiwater_rss(vma->vm_mm);
        mmu_notifier_invalidate_range_start(&range);
        /*
         * unmap 'address-end' not 'range.start-range.end' as range
         * could have been expanded for hugetlb pmd sharing.
         */
        unmap_single_vma(&tlb, vma, address, end, details, false);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb);
        hugetlb_zap_end(vma, details);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                unsigned long size)
{
        /* Silently ignore ranges outside the vma and non-PFNMAP vmas. */
        if (range_in_vma(vma, address, address + size) &&
            (vma->vm_flags & VM_PFNMAP))
                zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

/*
 * Walk @mm's page tables down to the PMD entry covering @addr, allocating
 * the P4D, PUD and PMD levels on the way as needed.
 *
 * Returns the PMD pointer, or NULL if any of the allocations failed.
 */
static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd = pgd_offset(mm, addr);
        p4d_t *p4d = p4d_alloc(mm, pgd, addr);
        pud_t *pud = p4d ? pud_alloc(mm, p4d, addr) : NULL;
        pmd_t *pmd = pud ? pmd_alloc(mm, pud, addr) : NULL;

        if (!pmd)
                return NULL;

        VM_BUG_ON(pmd_trans_huge(*pmd));
        return pmd;
}

/*
 * Find (allocating page-table levels as needed) the PTE for @addr in @mm
 * and return it mapped and locked, with the lock stored in @ptl.
 *
 * Returns NULL if the PMD could not be reached or pte_alloc_map_lock()
 * fails.
 */
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                        spinlock_t **ptl)
{
        pmd_t *pmd = walk_to_pmd(mm, addr);

        return pmd ? pte_alloc_map_lock(mm, pmd, addr, ptl) : NULL;
}

static int validate_page_before_insert(struct page *page)
{
        struct folio *folio = page_folio(page);

        if (folio_test_anon(folio) || folio_test_slab(folio) ||
            page_has_type(page))
                return -EINVAL;
        flush_dcache_folio(folio);
        return 0;
}

/*
 * Install @page at @pte/@addr in @vma. The name indicates the caller holds
 * the PTE lock; nothing is locked here.
 *
 * Takes a folio reference, bumps the file RSS counter and adds the file
 * rmap before writing the PTE. Returns -EBUSY if the PTE is already
 * populated, 0 on success.
 */
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
{
        struct folio *folio = page_folio(page);
        struct mm_struct *mm = vma->vm_mm;

        if (pte_none(ptep_get(pte))) {
                /* Ok, finally just insert the thing.. */
                folio_get(folio);
                inc_mm_counter(mm, mm_counter_file(folio));
                folio_add_file_rmap_pte(folio, page, vma);
                set_pte_at(mm, addr, pte, mk_pte(page, prot));
                return 0;
        }

        return -EBUSY;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page, pgprot_t prot)
{
        spinlock_t *ptl;
        pte_t *pte;
        int err;

        err = validate_page_before_insert(page);
        if (err)
                return err;

        /* A NULL return here is reported to the caller as -ENOMEM. */
        pte = get_locked_pte(vma->vm_mm, addr, &ptl);
        if (!pte)
                return -ENOMEM;

        err = insert_page_into_pte_locked(vma, pte, addr, page, prot);
        pte_unmap_unlock(pte, ptl);
        return err;
}

/*
 * Per-page step of the batched insert: reject pages with a zero refcount,
 * validate the page, then install it at @pte, which the name indicates the
 * caller has locked.
 *
 * Returns 0 on success or a negative errno.
 */
static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
{
        int err = page_count(page) ? validate_page_before_insert(page)
                                   : -EINVAL;

        if (err)
                return err;

        return insert_page_into_pte_locked(vma, pte, addr, page, prot);
}

/*
 * insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop.
 *
 * Inserts the pages in @pages at consecutive addresses starting at @addr.
 * On entry *@num is the number of pages to insert; on return it holds the
 * number of pages that were NOT inserted (0 when everything succeeded).
 *
 * The work is split per PMD and then into batches of at most 8 PTEs, with
 * the PTE lock taken and released once per batch.
 *
 * Returns 0 on success or a negative errno; pages inserted before a failure
 * stay mapped.
 */
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num, pgprot_t prot)
{
        pmd_t *pmd = NULL;
        pte_t *start_pte, *pte;
        spinlock_t *pte_lock;
        struct mm_struct *const mm = vma->vm_mm;
        unsigned long curr_page_idx = 0;
        unsigned long remaining_pages_total = *num;
        unsigned long pages_to_write_in_pmd;
        int ret;
more:
        /* One pass through here per PMD touched by the request. */
        ret = -EFAULT;
        pmd = walk_to_pmd(mm, addr);
        if (!pmd)
                goto out;

        /* Limit this pass to what is left in the PTE table for @addr. */
        pages_to_write_in_pmd = min_t(unsigned long,
                remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

        /* Allocate the PTE if necessary; takes PMD lock once only. */
        ret = -ENOMEM;
        if (pte_alloc(mm, pmd))
                goto out;

        while (pages_to_write_in_pmd) {
                int pte_idx = 0;
                const int batch_size = min_t(int, pages_to_write_in_pmd, 8);

                start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
                if (!start_pte) {
                        ret = -EFAULT;
                        goto out;
                }
                for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
                        int err = insert_page_in_batch_locked(vma, pte,
                                addr, pages[curr_page_idx], prot);
                        if (unlikely(err)) {
                                pte_unmap_unlock(start_pte, pte_lock);
                                ret = err;
                                /*
                                 * pte_idx pages of this batch went in before
                                 * the failure; count them as done.
                                 */
                                remaining_pages_total -= pte_idx;
                                goto out;
                        }
                        addr += PAGE_SIZE;
                        ++curr_page_idx;
                }
                pte_unmap_unlock(start_pte, pte_lock);
                pages_to_write_in_pmd -= batch_size;
                remaining_pages_total -= batch_size;
        }
        /* More pages left: continue with the PMD covering the new @addr. */
        if (remaining_pages_total)
                goto more;
        ret = 0;
out:
        /* Report back how many pages were left unmapped. */
        *num = remaining_pages_total;
        return ret;
}

/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
 */
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num)
{
        /* Address of the last byte the request would map. */
        const unsigned long last_addr = addr + (*num * PAGE_SIZE) - 1;

        if (addr < vma->vm_start)
                return -EFAULT;
        if (last_addr >= vma->vm_end)
                return -EFAULT;

        if (!(vma->vm_flags & VM_MIXEDMAP)) {
                BUG_ON(mmap_read_trylock(vma->vm_mm));
                BUG_ON(vma->vm_flags & VM_PFNMAP);
                vm_flags_set(vma, VM_MIXEDMAP);
        }

        /* Defer page refcount checking till we're about to map that page. */
        return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_pages);

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma.
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page)
{
        if (addr < vma->vm_start)
                return -EFAULT;
        if (addr >= vma->vm_end)
                return -EFAULT;

        /* Pages with a zero refcount are rejected. */
        if (!page_count(page))
                return -EINVAL;

        if (!(vma->vm_flags & VM_MIXEDMAP)) {
                BUG_ON(mmap_read_trylock(vma->vm_mm));
                BUG_ON(vma->vm_flags & VM_PFNMAP);
                vm_flags_set(vma, VM_MIXEDMAP);
        }

        return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

/*
 * __vm_map_pages - maps range of kernel pages into user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map range of kernel pages into a user vma.
 * Pages @pages[@offset] onwards are inserted one per page of @vma, starting
 * at vma->vm_start. On failure the pages inserted so far are left in place.
 *
 * Return: 0 on success and error code otherwise. -ENXIO is returned when
 * @offset lies beyond the array or the vma is larger than the remaining
 * part of the array.
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num, unsigned long offset)
{
        unsigned long count = vma_pages(vma);
        unsigned long uaddr = vma->vm_start;
        /*
         * The index has the same type as @count: a signed int would be
         * converted for the comparison and could overflow on huge vmas.
         */
        unsigned long i;
        int ret;

        /* Fail if the user requested offset is beyond the end of the object */
        if (offset >= num)
                return -ENXIO;

        /* Fail if the user requested size exceeds available object size */
        if (count > num - offset)
                return -ENXIO;

        for (i = 0; i < count; i++) {
                ret = vm_insert_page(vma, uaddr, pages[offset + i]);
                if (ret < 0)
                        return ret;
                uaddr += PAGE_SIZE;
        }

        return 0;
}

/**
 * vm_map_pages - map a range of kernel pages, honouring the vma's vm_pgoff
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, using the user's requested
 * vm_pgoff as the starting index into @pages.
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num)
{
        const unsigned long first_page = vma->vm_pgoff;

        return __vm_map_pages(vma, pages, num, first_page);
}
EXPORT_SYMBOL(vm_map_pages);

/**
 * vm_map_pages_zero - map a range of kernel pages starting at offset zero
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that the offset into @pages is always
 * 0 and the vma's vm_pgoff is ignored. This function is intended for
 * drivers that did not consider vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num)
{
        const unsigned long first_page = 0;

        return __vm_map_pages(vma, pages, num, first_page);
}
EXPORT_SYMBOL(vm_map_pages_zero);

/*
 * Map @pfn at @addr in @vma with protection @prot.
 *
 * An empty PTE gets a new devmap or special entry. A populated PTE is left
 * untouched unless @mkwrite is set and it already maps @pfn, in which case
 * it is made young, dirty and (where the vma allows) writable.
 *
 * Returns VM_FAULT_OOM if no PTE could be obtained, VM_FAULT_NOPAGE
 * otherwise.
 */
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn, pgprot_t prot, bool mkwrite)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pte_t *pte, entry;

        pte = get_locked_pte(mm, addr, &ptl);
        if (!pte)
                return VM_FAULT_OOM;

        entry = ptep_get(pte);
        if (pte_none(entry)) {
                /* Ok, finally just insert the thing.. */
                if (pfn_t_devmap(pfn))
                        entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
                else
                        entry = pte_mkspecial(pfn_t_pte(pfn, prot));

                if (mkwrite) {
                        entry = pte_mkyoung(entry);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                }

                set_pte_at(mm, addr, pte, entry);
                /* XXX: why not for insert_page? */
                update_mmu_cache(vma, addr, pte);
        } else if (mkwrite) {
                /*
                 * For read faults on private mappings the PFN passed
                 * in may not match the PFN we have mapped if the
                 * mapped PFN is a writeable COW page.  In the mkwrite
                 * case we are creating a writable PTE for a shared
                 * mapping and we expect the PFNs to match. If they
                 * don't match, we are likely racing with block
                 * allocation and mapping invalidation so just skip the
                 * update.
                 */
                if (pte_pfn(entry) == pfn_t_to_pfn(pfn)) {
                        entry = pte_mkyoung(entry);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                        if (ptep_set_access_flags(vma, addr, pte, entry, 1))
                                update_mmu_cache(vma, addr, pte);
                } else {
                        WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
                }
        }

        pte_unmap_unlock(pte, ptl);
        return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings.  In general, using multiple vmas is preferable;
 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 *
 * pgprot typically only differs from @vma->vm_page_prot when drivers set
 * caching- and encryption bits different than those of @vma->vm_page_prot,
 * because the caching- or encryption mode may not be known at mmap() time.
 *
 * This is ok as long as @vma->vm_page_prot is not used by the core vm
 * to set caching and encryption bits for those vmas (except for COW pages).
 * This is ensured by core vm only modifying these page table entries using
 * functions that don't touch caching- or encryption bits, using pte_modify()
 * if needed. (See for example mprotect()).
 *
 * Also when new page-table entries are created, this is only done using the
 * fault() callback, and never using the value of vma->vm_page_prot,
 * except for page-table entries that point to anonymous pages as the result
 * of COW.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot)
{
        const pfn_t dev_pfn = __pfn_to_pfn_t(pfn, PFN_DEV);

        /*
         * Technically, architectures with pte_special can avoid all these
         * restrictions (same for remap_pfn_range).  However we would like
         * consistency in testing and feature parity among all, so we should
         * try to keep these invariants in place for everybody.
         */
        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
        BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

        /* Out-of-vma addresses and disallowed pfn/prot pairs both SIGBUS. */
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;
        if (!pfn_modify_allowed(pfn, pgprot))
                return VM_FAULT_SIGBUS;

        track_pfn_insert(vma, &pgprot, dev_pfn);

        return insert_pfn(vma, addr, dev_pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);

/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn)
{
        return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);

/*
 * Tell whether @pfn is acceptable for insertion into @vma via the mixed-map
 * path: true if the vma is VM_MIXEDMAP, or the pfn is a devmap pfn, a
 * special pfn or the zero pfn; false otherwise.
 */
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
        /* these checks mirror the abort conditions in vm_normal_page */
        return (vma->vm_flags & VM_MIXEDMAP) ||
               pfn_t_devmap(pfn) ||
               pfn_t_special(pfn) ||
               is_zero_pfn(pfn_t_to_pfn(pfn));
}

/*
 * Common implementation behind vmf_insert_mixed() and
 * vmf_insert_mixed_mkwrite(): insert @pfn at @addr in @vma, either as a raw
 * pfn (insert_pfn) or as a refcounted page (insert_page).
 *
 * Returns VM_FAULT_SIGBUS if @addr lies outside the vma or the pfn may not
 * be mapped with the resulting protection; otherwise the result of
 * insert_pfn(), or the insert_page() error translated to a vm_fault_t.
 */
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn, bool mkwrite)
{
        pgprot_t prot = vma->vm_page_prot;
        int rc;

        BUG_ON(!vm_mixed_ok(vma, pfn));

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        track_pfn_insert(vma, &prot, pfn);

        if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), prot))
                return VM_FAULT_SIGBUS;

        /*
         * With pte special available, or for devmap pfns, or for pfns that
         * are not pfn_valid, a plain pfn insertion is sufficient.
         */
        if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) ||
            pfn_t_devmap(pfn) || !pfn_t_valid(pfn))
                return insert_pfn(vma, addr, pfn, prot, mkwrite);

        /*
         * Without pte special we rely on the pfn_valid() based VM_MIXEDMAP
         * scheme (see vm_normal_page), so a pfn_valid page *must* be
         * refcounted: use insert_page rather than insert_pfn.  A zero_pfn
         * inserted into a VM_MIXEDMAP without pte special would therefore
         * be refcounted as a normal page.
         *
         * We are committed to insert_page() here regardless of whether the
         * caller specified flags that result in pfn_t_has_page() == false.
         */
        rc = insert_page(vma, addr, pfn_to_page(pfn_t_to_pfn(pfn)), prot);

        if (rc == -ENOMEM)
                return VM_FAULT_OOM;
        /* Success and -EBUSY both report that no further page is needed. */
        if (rc >= 0 || rc == -EBUSY)
                return VM_FAULT_NOPAGE;

        return VM_FAULT_SIGBUS;
}

vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                pfn_t pfn)
{
        /* Plain variant: the mkwrite handling is not requested. */
        const bool mkwrite = false;

        return __vm_insert_mixed(vma, addr, pfn, mkwrite);
}
EXPORT_SYMBOL(vmf_insert_mixed);

/*
 *  If the insertion of the PTE failed because someone else already added a
 *  different entry in the meantime, we treat that as success, as we assume
 *  the same entry was actually inserted.
 */
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn)
{
        /* Same as vmf_insert_mixed(), but with the mkwrite handling enabled. */
        const bool mkwrite = true;

        return __vm_insert_mixed(vma, addr, pfn, mkwrite);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);

/*
 * maps a range of physical memory into the requested pages. the old
 * mappings are removed. any references to nonexistent pages results
 * in null mappings (currently treated as "copy-on-access")
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        pte_t *pte, *mapped_pte;
        spinlock_t *ptl;
        int err = 0;

        /*
         * Fill the PTEs for [addr, end) under @pmd with special entries for
         * consecutive pfns starting at @pfn.  Returns 0 on success, -ENOMEM
         * if the PTE table cannot be allocated/mapped, or -EACCES if
         * pfn_modify_allowed() rejects a pfn.
         *
         * Remember the first PTE pointer: it is the one handed back to
         * pte_unmap_unlock() once the walk is done.
         */
        mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                return -ENOMEM;
        arch_enter_lazy_mmu_mode();
        do {
                /* Every slot is expected to be empty; anything else is fatal. */
                BUG_ON(!pte_none(ptep_get(pte)));
                if (!pfn_modify_allowed(pfn, prot)) {
                        /*
                         * Stop at the first rejected pfn.  Entries already
                         * written by earlier iterations are not undone here.
                         */
                        err = -EACCES;
                        break;
                }
                set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
                pfn++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(mapped_pte, ptl);
        return err;
}

/*
 * Remap [addr, end) below @pud, one pmd-sized step at a time, delegating
 * each step to remap_pte_range().  Returns 0, -ENOMEM if the pmd table
 * cannot be allocated, or the first error from the lower level.
 */
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        unsigned long boundary;
        pmd_t *pmd;
        int ret;

        /* Bias pfn so that "pfn + (addr >> PAGE_SHIFT)" follows addr. */
        pfn -= addr >> PAGE_SHIFT;

        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                return -ENOMEM;
        VM_BUG_ON(pmd_trans_huge(*pmd));

        for (;;) {
                boundary = pmd_addr_end(addr, end);
                ret = remap_pte_range(mm, pmd, addr, boundary,
                                      pfn + (addr >> PAGE_SHIFT), prot);
                if (ret)
                        return ret;

                pmd++;
                addr = boundary;
                if (addr == end)
                        return 0;
        }
}

/*
 * Remap [addr, end) below @p4d, one pud-sized step at a time, delegating
 * each step to remap_pmd_range().  Returns 0, -ENOMEM if the pud table
 * cannot be allocated, or the first error from the lower level.
 */
static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        unsigned long boundary;
        pud_t *pud;
        int ret;

        /* Bias pfn so that "pfn + (addr >> PAGE_SHIFT)" follows addr. */
        pfn -= addr >> PAGE_SHIFT;

        pud = pud_alloc(mm, p4d, addr);
        if (!pud)
                return -ENOMEM;

        for (;;) {
                boundary = pud_addr_end(addr, end);
                ret = remap_pmd_range(mm, pud, addr, boundary,
                                      pfn + (addr >> PAGE_SHIFT), prot);
                if (ret)
                        return ret;

                pud++;
                addr = boundary;
                if (addr == end)
                        return 0;
        }
}

/*
 * Remap [addr, end) below @pgd, one p4d-sized step at a time, delegating
 * each step to remap_pud_range().  Returns 0, -ENOMEM if the p4d table
 * cannot be allocated, or the first error from the lower level.
 */
static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        unsigned long boundary;
        p4d_t *p4d;
        int ret;

        /* Bias pfn so that "pfn + (addr >> PAGE_SHIFT)" follows addr. */
        pfn -= addr >> PAGE_SHIFT;

        p4d = p4d_alloc(mm, pgd, addr);
        if (!p4d)
                return -ENOMEM;

        for (;;) {
                boundary = p4d_addr_end(addr, end);
                ret = remap_pud_range(mm, p4d, addr, boundary,
                                      pfn + (addr >> PAGE_SHIFT), prot);
                if (ret)
                        return ret;

                p4d++;
                addr = boundary;
                if (addr == end)
                        return 0;
        }
}

/*
 * Do the actual work of remap_pfn_range_notrack(): validate the request,
 * flag the vma as a raw pfn mapping and populate the page tables for
 * [addr, addr + PAGE_ALIGN(size)) with consecutive pfns starting at @pfn.
 *
 * Returns 0 on success.  On failure the page tables may have been partially
 * populated; the caller is responsible for tearing that down.
 */
static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot)
{
        pgd_t *pgd;
        unsigned long next;
        unsigned long end = addr + PAGE_ALIGN(size);
        struct mm_struct *mm = vma->vm_mm;
        int err;

        if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
                return -EINVAL;

        /*
         * Physically remapped pages are special. Tell the
         * rest of the world about it:
         *   VM_IO tells people not to look at these pages
         *        (accesses can have side effects).
         *   VM_PFNMAP tells the core MM that the base pages are just
         *        raw PFN mappings, and do not have a "struct page" associated
         *        with them.
         *   VM_DONTEXPAND
         *      Disable vma merging and expanding with mremap().
         *   VM_DONTDUMP
         *      Omit vma from core dump, even when VM_IO turned off.
         *
         * There's a horrible special case to handle copy-on-write
         * behaviour that some programs depend on. We mark the "original"
         * un-COW'ed pages by matching them up with "vma->vm_pgoff".
         * See vm_normal_page() for details.
         */
        if (is_cow_mapping(vma->vm_flags)) {
                if (addr != vma->vm_start || end != vma->vm_end)
                        return -EINVAL;
                vma->vm_pgoff = pfn;
        }

        vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);

        BUG_ON(addr >= end);
        /* Bias pfn so that "pfn + (addr >> PAGE_SHIFT)" follows addr. */
        pfn -= addr >> PAGE_SHIFT;
        pgd = pgd_offset(mm, addr);
        flush_cache_range(vma, addr, end);
        do {
                next = pgd_addr_end(addr, end);
                err = remap_p4d_range(mm, pgd, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
        } while (pgd++, addr = next, addr != end);

        return 0;
}

/*
 * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
 * must have pre-validated the caching bits of the pgprot_t.
 *
 * Returns 0 on success or a negative error code.  If the remap fails, any
 * part of the range that was already mapped is zapped again before the
 * error is returned, so no partial mapping is left behind.
 */
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot)
{
        int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);

        if (!error)
                return 0;

        /*
         * A partial pfn range mapping is dangerous: it does not
         * maintain page reference counts, and callers may free
         * pages due to the error. So zap it early.
         */
        zap_page_range_single(vma, addr, size, NULL);
        return error;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                    unsigned long pfn, unsigned long size, pgprot_t prot)
{
        const unsigned long aligned_size = PAGE_ALIGN(size);
        int ret;

        /* Any tracking failure is reported to the caller as -EINVAL. */
        if (track_pfn_remap(vma, &prot, pfn, addr, aligned_size))
                return -EINVAL;

        ret = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
        /* Drop the tracking again if the remap itself did not succeed. */
        if (ret)
                untrack_pfn(vma, pfn, aligned_size, true);

        return ret;
}
EXPORT_SYMBOL(remap_pfn_range);

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
        unsigned long map_len, first_pfn, nr_pages;

        /* Reject a physical range whose end wraps around */
        if (start + len < start)
                return -EINVAL;

        /*
         * Mapping things that aren't page-aligned is a bad idea, but it
         * has historically been allowed because IO memory might just
         * have smaller alignment.  Extend the length by the offset of
         * @start within its page and round up to whole pages.
         */
        len += start & ~PAGE_MASK;
        first_pfn = start >> PAGE_SHIFT;
        nr_pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
        if (first_pfn + nr_pages < first_pfn)
                return -EINVAL;

        /* The mapping begins 'vm_pgoff' pages into the area */
        if (vma->vm_pgoff > nr_pages)
                return -EINVAL;
        first_pfn += vma->vm_pgoff;
        nr_pages -= vma->vm_pgoff;

        /* The whole vma must fit into what is left of the area */
        map_len = vma->vm_end - vma->vm_start;
        if ((map_len >> PAGE_SHIFT) > nr_pages)
                return -EINVAL;

        /* All checks passed: map it */
        return io_remap_pfn_range(vma, vma->vm_start, first_pfn, map_len,
                                  vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);

/*
 * Call @fn on the PTE slots covering [addr, end) beneath @pmd.
 *
 * @create: when true, the PTE table is allocated if necessary and @fn is
 *          called for every slot; when false, only an existing table is
 *          walked and @fn is called only for slots that are not pte_none.
 * @mask:   receives PGTBL_PTE_MODIFIED (plus whatever the allocation helper
 *          records).
 *
 * For &init_mm the kernel page-table helpers are used and no PTL is taken;
 * for any other mm the table is mapped and locked for the duration.
 *
 * Returns 0, -ENOMEM/-EINVAL if the PTE table cannot be obtained, or the
 * first non-zero value returned by @fn (which also stops the walk).
 */
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        pte_t *pte, *mapped_pte;
        int err = 0;
        spinlock_t *ptl;

        if (create) {
                mapped_pte = pte = (mm == &init_mm) ?
                        pte_alloc_kernel_track(pmd, addr, mask) :
                        pte_alloc_map_lock(mm, pmd, addr, &ptl);
                if (!pte)
                        return -ENOMEM;
        } else {
                mapped_pte = pte = (mm == &init_mm) ?
                        pte_offset_kernel(pmd, addr) :
                        pte_offset_map_lock(mm, pmd, addr, &ptl);
                if (!pte)
                        return -EINVAL;
        }

        arch_enter_lazy_mmu_mode();

        if (fn) {
                do {
                        if (create || !pte_none(ptep_get(pte))) {
                                err = fn(pte, addr, data);
                                if (err)
                                        break;
                        }
                        /*
                         * pte must advance together with addr on every
                         * iteration, including those that skip an empty
                         * slot; otherwise the walk would stay on the same
                         * entry while addr moves on.
                         */
                } while (pte++, addr += PAGE_SIZE, addr != end);
        }
        *mask |= PGTBL_PTE_MODIFIED;

        arch_leave_lazy_mmu_mode();

        /* Only user mms took the PTL above. */
        if (mm != &init_mm)
                pte_unmap_unlock(mapped_pte, ptl);
        return err;
}

/*
 * Walk the pmd entries covering [addr, end) beneath @pud and descend into
 * apply_to_pte_range() for each.  With @create the pmd table is allocated
 * if needed; without it, empty or bad entries are skipped.  A leaf entry
 * triggers a one-time warning and -EINVAL.
 */
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        unsigned long boundary;
        pmd_t *pmd;
        int ret = 0;

        BUG_ON(pud_leaf(*pud));

        if (!create) {
                pmd = pmd_offset(pud, addr);
        } else {
                pmd = pmd_alloc_track(mm, pud, addr, mask);
                if (!pmd)
                        return -ENOMEM;
        }

        do {
                boundary = pmd_addr_end(addr, end);

                /* Nothing here and not allowed to create: move on. */
                if (pmd_none(*pmd) && !create)
                        continue;

                if (WARN_ON_ONCE(pmd_leaf(*pmd))) {
                        ret = -EINVAL;
                        break;
                }

                if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
                        if (!create)
                                continue;
                        pmd_clear_bad(pmd);
                }

                ret = apply_to_pte_range(mm, pmd, addr, boundary,
                                         fn, data, create, mask);
                if (ret)
                        break;
        } while (pmd++, addr = boundary, addr != end);

        return ret;
}

/*
 * Walk the pud entries covering [addr, end) beneath @p4d and descend into
 * apply_to_pmd_range() for each.  With @create the pud table is allocated
 * if needed; without it, empty or bad entries are skipped.  A leaf entry
 * triggers a one-time warning and -EINVAL.
 */
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        unsigned long boundary;
        pud_t *pud;
        int ret = 0;

        if (!create) {
                pud = pud_offset(p4d, addr);
        } else {
                pud = pud_alloc_track(mm, p4d, addr, mask);
                if (!pud)
                        return -ENOMEM;
        }

        do {
                boundary = pud_addr_end(addr, end);

                /* Nothing here and not allowed to create: move on. */
                if (pud_none(*pud) && !create)
                        continue;

                if (WARN_ON_ONCE(pud_leaf(*pud))) {
                        ret = -EINVAL;
                        break;
                }

                if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
                        if (!create)
                                continue;
                        pud_clear_bad(pud);
                }

                ret = apply_to_pmd_range(mm, pud, addr, boundary,
                                         fn, data, create, mask);
                if (ret)
                        break;
        } while (pud++, addr = boundary, addr != end);

        return ret;
}

/*
 * Walk the p4d entries covering [addr, end) beneath @pgd and descend into
 * apply_to_pud_range() for each.  With @create the p4d table is allocated
 * if needed; without it, empty or bad entries are skipped.  A leaf entry
 * triggers a one-time warning and -EINVAL.
 */
static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        unsigned long boundary;
        p4d_t *p4d;
        int ret = 0;

        if (!create) {
                p4d = p4d_offset(pgd, addr);
        } else {
                p4d = p4d_alloc_track(mm, pgd, addr, mask);
                if (!p4d)
                        return -ENOMEM;
        }

        do {
                boundary = p4d_addr_end(addr, end);

                /* Nothing here and not allowed to create: move on. */
                if (p4d_none(*p4d) && !create)
                        continue;

                if (WARN_ON_ONCE(p4d_leaf(*p4d))) {
                        ret = -EINVAL;
                        break;
                }

                if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
                        if (!create)
                                continue;
                        p4d_clear_bad(p4d);
                }

                ret = apply_to_pud_range(mm, p4d, addr, boundary,
                                         fn, data, create, mask);
                if (ret)
                        break;
        } while (p4d++, addr = boundary, addr != end);

        return ret;
}

/*
 * Walk the page tables of @mm over [addr, addr + size) from the pgd level
 * down and call @fn (with @data) on the PTEs via the apply_to_*_range()
 * helpers.
 *
 * @create: when true, missing page-table levels are allocated; when false,
 *          empty or bad entries are skipped.
 *
 * Returns 0 on success, -EINVAL for an empty/wrapping range or when a leaf
 * entry is found at the pgd level, or the first error from a lower level.
 * Whenever the walk has started, arch_sync_kernel_mappings() is invoked on
 * the way out if the accumulated modification mask asks for it, on the
 * error paths as well.
 */
static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                                 unsigned long size, pte_fn_t fn,
                                 void *data, bool create)
{
        pgd_t *pgd;
        unsigned long start = addr, next;
        unsigned long end = addr + size;
        pgtbl_mod_mask mask = 0;
        int err = 0;

        if (WARN_ON(addr >= end))
                return -EINVAL;

        pgd = pgd_offset(mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none(*pgd) && !create)
                        continue;
                if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
                        /*
                         * Leave through the common exit rather than
                         * returning directly: earlier iterations may have
                         * modified page tables that still need the sync
                         * below.
                         */
                        err = -EINVAL;
                        break;
                }
                if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
                        if (!create)
                                continue;
                        pgd_clear_bad(pgd);
                }
                err = apply_to_p4d_range(mm, pgd, addr, next,
                                         fn, data, create, &mask);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, start + size);

        return err;
}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                        unsigned long size, pte_fn_t fn, void *data)
{
        /* Missing page-table levels are allocated during the walk. */
        const bool create = true;

        return __apply_to_page_range(mm, addr, size, fn, data, create);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * Scan a region of virtual memory, calling a provided function on
 * each leaf page table where it exists.
 *
 * Unlike apply_to_page_range, this does _not_ fill in page tables
 * where they are absent.
 */
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
                                 unsigned long size, pte_fn_t fn, void *data)
{
        /* Absent page-table levels are skipped, never allocated. */
        const bool create = false;

        return __apply_to_page_range(mm, addr, size, fn, data, create);
}
EXPORT_SYMBOL_GPL(apply_to_existing_page_range);

/*
 * handle_pte_fault chooses page fault handler according to an entry which was
 * read non-atomically.  Before making any commitment, on those architectures
 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
 * parts, do_swap_page must check under lock before unmapping the pte and
 * proceeding (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
static inline int pte_unmap_same(struct vm_fault *vmf)
{
        int unchanged = 1;

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
        /*
         * Only a pte wider than a machine word needs re-checking under the
         * lock, as it may have been read as a mix of unmatched parts.
         */
        if (sizeof(pte_t) > sizeof(unsigned long)) {
                pte_t cur;

                spin_lock(vmf->ptl);
                cur = ptep_get(vmf->pte);
                unchanged = pte_same(cur, vmf->orig_pte);
                spin_unlock(vmf->ptl);
        }
#endif
        /* The pte is unmapped and forgotten whatever the outcome. */
        pte_unmap(vmf->pte);
        vmf->pte = NULL;

        return unchanged;
}

/*
 * Fill @dst with the contents of the page being write-protect faulted.
 *
 * When @src is non-NULL the copy goes through copy_mc_user_highpage().
 * When @src is NULL (the source was a PFN mapping without a struct page)
 * the data is read from the faulting user address instead, and @dst is
 * zero-filled if that read still fails while the PTE is unchanged.
 *
 * Return:
 *        0:                copy succeeded
 *        -EHWPOISON:        copy failed due to hwpoison in source page
 *        -EAGAIN:        copy failed (some other reason)
 */
static inline int __wp_page_copy_user(struct page *dst, struct page *src,
                                      struct vm_fault *vmf)
{
        int ret;
        void *kaddr;
        void __user *uaddr;
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long addr = vmf->address;

        if (likely(src)) {
                if (copy_mc_user_highpage(dst, src, addr, vma)) {
                        /* Report the bad source pfn before giving up. */
                        memory_failure_queue(page_to_pfn(src), 0);
                        return -EHWPOISON;
                }
                return 0;
        }

        /*
         * If the source page was a PFN mapping, we don't have
         * a "struct page" for it. We do a best-effort copy by
         * just copying from the original user address. If that
         * fails, we just zero-fill it. Live with it.
         */
        kaddr = kmap_local_page(dst);
        pagefault_disable();
        uaddr = (void __user *)(addr & PAGE_MASK);

        /*
         * On architectures with software "accessed" bits, we would
         * take a double page fault, so mark it accessed here.
         */
        /* vmf->pte doubles as the "PTL is currently held" indicator below. */
        vmf->pte = NULL;
        if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
                pte_t entry;

                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
                if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        /*
                         * Another thread has already handled the fault;
                         * only update the local TLB.
                         */
                        if (vmf->pte)
                                update_mmu_tlb(vma, addr, vmf->pte);
                        ret = -EAGAIN;
                        goto pte_unlock;
                }

                entry = pte_mkyoung(vmf->orig_pte);
                if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
                        update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
        }

        /*
         * This really shouldn't fail, because the page is there
         * in the page tables. But it might just be unreadable,
         * in which case we just give up and fill the result with
         * zeroes.
         */
        if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
                /* The copy failed with the PTL already held: no retry. */
                if (vmf->pte)
                        goto warn;

                /* Re-validate under PTL if the page is still mapped */
                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
                if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        /* The PTE changed under us, update local tlb */
                        if (vmf->pte)
                                update_mmu_tlb(vma, addr, vmf->pte);
                        ret = -EAGAIN;
                        goto pte_unlock;
                }

                /*
                 * The same page can be mapped back since last copy attempt.
                 * Try to copy again under PTL.
                 */
                if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
                        /*
                         * Give a warn in case there can be some obscure
                         * use-case
                         */
warn:
                        WARN_ON_ONCE(1);
                        clear_page(kaddr);
                }
        }

        ret = 0;

        /* Common exit: undo the lock (if taken), pagefault-disable and kmap. */
pte_unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        pagefault_enable();
        kunmap_local(kaddr);
        flush_dcache_page(dst);

        return ret;
}

/*
 * Pick the gfp mask for handling a fault on @vma: the backing file's
 * mapping mask with __GFP_FS and __GFP_IO added, or GFP_KERNEL when the
 * vma has no file.
 */
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;

        /*
         * Special mappings (e.g. VDSO) do not have any file so fake
         * a default GFP_KERNEL for them.
         */
        if (!file)
                return GFP_KERNEL;

        return mapping_gfp_mask(file->f_mapping) | __GFP_FS | __GFP_IO;
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 *
 * vmf->flags is temporarily replaced by FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE
 * around the ->page_mkwrite() call and is always left at its original value
 * when this function returns.
 *
 * Return: VM_FAULT_SIGBUS if the vma is backed by a swapfile; the
 * ->page_mkwrite() result if it contains an error or VM_FAULT_NOPAGE;
 * 0 if the folio lost its mapping while being locked (caller retries);
 * otherwise the ->page_mkwrite() result with VM_FAULT_LOCKED set and the
 * folio locked.
 */
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
{
        vm_fault_t ret;
        unsigned int old_flags = vmf->flags;

        /*
         * Reject swapfile-backed vmas before touching vmf->flags, so that
         * this early exit cannot leave the temporary mkwrite flags behind.
         */
        if (vmf->vma->vm_file &&
            IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
                return VM_FAULT_SIGBUS;

        vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

        ret = vmf->vma->vm_ops->page_mkwrite(vmf);
        /* Restore original flags so that caller is not surprised */
        vmf->flags = old_flags;
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                return ret;
        if (unlikely(!(ret & VM_FAULT_LOCKED))) {
                /* The handler did not lock the folio: do it here. */
                folio_lock(folio);
                if (!folio->mapping) {
                        folio_unlock(folio);
                        return 0; /* retry */
                }
                ret |= VM_FAULT_LOCKED;
        } else
                VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        return ret;
}

/*
 * Handle dirtying of a page in shared file mapping on a write fault.
 *
 * The function expects the page to be locked and unlocks it.
 *
 * Return: VM_FAULT_COMPLETED if the mmap_lock was dropped while throttling
 * dirty pages, 0 otherwise.
 */
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct address_space *mapping;
        struct folio *folio = page_folio(vmf->page);
        bool dirtied;
        bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;

        dirtied = folio_mark_dirty(folio);
        VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
        /*
         * Take a local copy of the address_space - folio.mapping may be zeroed
         * by truncate after folio_unlock().   The address_space itself remains
         * pinned by vma->vm_file's reference.  We rely on folio_unlock()'s
         * release semantics to prevent the compiler from undoing this copying.
         */
        mapping = folio_raw_mapping(folio);
        folio_unlock(folio);

        /* The file time is only updated here when there is no ->page_mkwrite. */
        if (!page_mkwrite)
                file_update_time(vma->vm_file);

        /*
         * Throttle page dirtying rate down to writeback speed.
         *
         * mapping may be NULL here because some device drivers do not
         * set page.mapping but still dirty their pages
         *
         * Drop the mmap_lock before waiting on IO, if we can. The file
         * is pinning the mapping, as per above.
         */
        if ((dirtied || page_mkwrite) && mapping) {
                struct file *fpin;

                fpin = maybe_unlock_mmap_for_io(vmf, NULL);
                balance_dirty_pages_ratelimited(mapping);
                /* A non-NULL fpin is a file reference we must drop again. */
                if (fpin) {
                        fput(fpin);
                        return VM_FAULT_COMPLETED;
                }
        }

        return 0;
}

/*
 * Handle write page faults for pages that can be reused in the current vma
 *
 * This can happen either due to the mapping being with the VM_SHARED flag,
 * or due to us being the last reference standing to the page. In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 *
 * @folio may be NULL, in which case the folio-specific checks and cpupid
 * reset are skipped.  The PTE is unmapped and vmf->ptl released on return.
 */
static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        pte_t entry;

        VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));

        if (folio) {
                /* An anonymous folio is only reused when marked exclusive. */
                VM_BUG_ON(folio_test_anon(folio) &&
                          !PageAnonExclusive(vmf->page));
                /*
                 * Clear the folio's cpupid information as the existing
                 * information potentially belongs to a now completely
                 * unrelated process.
                 */
                folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
        }

        flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
        /* Rebuild the entry as young + dirty, and writable if the vma allows it. */
        entry = pte_mkyoung(vmf->orig_pte);
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
                update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        count_vm_event(PGREUSE);
}

/*
 * We could add a bitflag somewhere, but for now, we know that all
 * vm_ops that have a ->map_pages have been audited and don't need
 * the mmap_lock to be held.
 */
static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
                return 0;
        vma_end_read(vma);
        return VM_FAULT_RETRY;
}

/**
 * vmf_anon_prepare - Prepare to handle an anonymous fault.
 * @vmf: The vm_fault descriptor passed from the fault handler.
 *
 * When preparing to insert an anonymous page into a VMA from a
 * fault handler, call this function rather than anon_vma_prepare().
 * If this vma does not already have an associated anon_vma and we are
 * only protected by the per-VMA lock, the caller must retry with the
 * mmap_lock held.  __anon_vma_prepare() will look at adjacent VMAs to
 * determine if this VMA can share its anon_vma, and that's not safe to
 * do with only the per-VMA lock held for this VMA.
 *
 * Return: 0 if fault handling can proceed.  Any other value should be
 * returned to the caller.
 */
vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret = 0;

        if (likely(vma->anon_vma))
                return 0;
        if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
                if (!mmap_read_trylock(vma->vm_mm)) {
                        vma_end_read(vma);
                        return VM_FAULT_RETRY;
                }
        }
        if (__anon_vma_prepare(vma))
                ret = VM_FAULT_OOM;
        if (vmf->flags & FAULT_FLAG_VMA_LOCK)
                mmap_read_unlock(vma->vm_mm);
        return ret;
}

/*
 * Handle the case of a page which we actually need to copy to a new page,
 * either due to COW or unsharing.
 *
 * Called with mmap_lock locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        struct folio *old_folio = NULL;
        struct folio *new_folio = NULL;
        pte_t entry;
        int page_copied = 0;
        struct mmu_notifier_range range;
        vm_fault_t ret;
        bool pfn_is_zero;

        /* Paired with delayacct_wpcopy_end() on every exit path below. */
        delayacct_wpcopy_start();

        /* old_folio stays NULL when there is no struct page behind the pte. */
        if (vmf->page)
                old_folio = page_folio(vmf->page);
        /* Any non-zero result is handed back to the caller via 'out'. */
        ret = vmf_anon_prepare(vmf);
        if (unlikely(ret))
                goto out;

        /*
         * When the original pte maps the zero pfn there is nothing to copy;
         * the flag is passed on to folio_prealloc() and skips the copy below.
         */
        pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte));
        new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero);
        if (!new_folio)
                goto oom;

        if (!pfn_is_zero) {
                int err;

                err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
                if (err) {
                        /*
                         * COW failed, if the fault was solved by other,
                         * it's fine. If not, userspace would re-fault on
                         * the same address and we will handle the fault
                         * from the second attempt.
                         * The -EHWPOISON case will not be retried.
                         */
                        folio_put(new_folio);
                        if (old_folio)
                                folio_put(old_folio);

                        delayacct_wpcopy_end();
                        return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
                }
                kmsan_copy_page_meta(&new_folio->page, vmf->page);
        }

        __folio_mark_uptodate(new_folio);

        /* The notifier range covers exactly the page containing the fault. */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                vmf->address & PAGE_MASK,
                                (vmf->address & PAGE_MASK) + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        /*
         * Re-check the pte - we dropped the lock
         */
        vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
        if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                /*
                 * Accounting: the new page is always counted as anonymous.
                 * A non-anon old folio gives up its file counter; an anon
                 * old folio leaves the counters unchanged.
                 */
                if (old_folio) {
                        if (!folio_test_anon(old_folio)) {
                                dec_mm_counter(mm, mm_counter_file(old_folio));
                                inc_mm_counter(mm, MM_ANONPAGES);
                        }
                } else {
                        ksm_might_unmap_zero_page(mm, vmf->orig_pte);
                        inc_mm_counter(mm, MM_ANONPAGES);
                }
                flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
                entry = mk_pte(&new_folio->page, vma->vm_page_prot);
                entry = pte_sw_mkyoung(entry);
                if (unlikely(unshare)) {
                        /*
                         * Unsharing: carry over the soft-dirty and uffd-wp
                         * bits of the original pte; the entry is not made
                         * dirty or writable here.
                         */
                        if (pte_soft_dirty(vmf->orig_pte))
                                entry = pte_mksoft_dirty(entry);
                        if (pte_uffd_wp(vmf->orig_pte))
                                entry = pte_mkuffd_wp(entry);
                } else {
                        /* Write fault: mark dirty, writable if the VMA allows. */
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                }

                /*
                 * Clear the pte entry and flush it first, before updating the
                 * pte with the new entry, to keep TLBs on different CPUs in
                 * sync. This code used to set the new PTE then flush TLBs, but
                 * that left a window where the new PTE could be loaded into
                 * some TLBs while the old PTE remains in others.
                 */
                ptep_clear_flush(vma, vmf->address, vmf->pte);
                folio_add_new_anon_rmap(new_folio, vma, vmf->address);
                folio_add_lru_vma(new_folio, vma);
                /* An unshare fault must never end up with a writable pte. */
                BUG_ON(unshare && pte_write(entry));
                set_pte_at(mm, vmf->address, vmf->pte, entry);
                update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
                if (old_folio) {
                        /*
                         * Only after switching the pte to the new page may
                         * we remove the mapcount here. Otherwise another
                         * process may come and find the rmap count decremented
                         * before the pte is switched to the new page, and
                         * "reuse" the old page writing into it while our pte
                         * here still points into it and can be read by other
                         * threads.
                         *
                         * The critical issue is to order this
                         * folio_remove_rmap_pte() with the ptep_clear_flush
                         * above. Those stores are ordered by (if nothing else,)
                         * the barrier present in the atomic_add_negative
                         * in folio_remove_rmap_pte();
                         *
                         * Then the TLB flush in ptep_clear_flush ensures that
                         * no process can access the old page before the
                         * decremented mapcount is visible. And the old page
                         * cannot be reused until after the decremented
                         * mapcount is visible. So transitively, TLBs to
                         * old page will be flushed before it can be reused.
                         */
                        folio_remove_rmap_pte(old_folio, vmf->page, vma);
                }

                /*
                 * Free the old page: new_folio now aliases old_folio (which
                 * may be NULL), so the common folio_put(new_folio) below
                 * drops a reference on the old folio and the freshly mapped
                 * folio keeps its reference.
                 */
                new_folio = old_folio;
                page_copied = 1;
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        } else if (vmf->pte) {
                /*
                 * The pte changed under us: leave it alone. new_folio still
                 * points at the unused copy, which is released below.
                 */
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        }
        /* If pte_offset_map_lock() returned NULL there is no lock to drop. */

        mmu_notifier_invalidate_range_end(&range);

        if (new_folio)
                folio_put(new_folio);
        if (old_folio) {
                /* Only a successfully replaced old folio has its swap cache freed. */
                if (page_copied)
                        free_swap_cache(old_folio);
                folio_put(old_folio);
        }

        delayacct_wpcopy_end();
        return 0;
oom:
        ret = VM_FAULT_OOM;
out:
        /* Error exit: only the old folio reference needs to be dropped. */
        if (old_folio)
                folio_put(old_folio);

        delayacct_wpcopy_end();
        return ret;
}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *                          writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 * @folio: the folio of vmf->page
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
 * we acquired PTE lock.
 */
static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
{
        WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
                                       &vmf->ptl);
        if (!vmf->pte)
                return VM_FAULT_NOPAGE;
        /*
         * We might have raced with another page fault while we released the
         * pte_offset_map_lock.
         */
        if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return VM_FAULT_NOPAGE;
        }
        wp_page_reuse(vmf, folio);
        return 0;
}

/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 */
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        /* No ->pfn_mkwrite() hook: reuse the existing mapping right away. */
        if (!vma->vm_ops || !vma->vm_ops->pfn_mkwrite) {
                wp_page_reuse(vmf, NULL);
                return 0;
        }

        /* The hook is called without the PTE lock held. */
        pte_unmap_unlock(vmf->pte, vmf->ptl);

        ret = vmf_can_call_fault(vmf);
        if (ret)
                return ret;

        vmf->flags |= FAULT_FLAG_MKWRITE;
        ret = vma->vm_ops->pfn_mkwrite(vmf);
        if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
                return ret;

        /* Retake the PTE lock and finish, unless the PTE changed meanwhile. */
        return finish_mkwrite_fault(vmf, NULL);
}

/*
 * Handle a write fault on a page-backed PTE in a shared mapping.
 *
 * A folio reference is held for the duration of the call and dropped on
 * every exit path.  Entered with the PTE mapped and locked; the lock is
 * released either here or by the helpers this function calls.
 */
static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t status;

        folio_get(folio);

        /* No ->page_mkwrite() hook: reuse the mapping, then lock the folio. */
        if (!vma->vm_ops || !vma->vm_ops->page_mkwrite) {
                wp_page_reuse(vmf, folio);
                folio_lock(folio);
                goto dirty;
        }

        /* The hook is called without the PTE lock held. */
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        status = vmf_can_call_fault(vmf);
        if (status)
                goto drop_ref;

        /* A zero result, an error or NOPAGE all end the fault here. */
        status = do_page_mkwrite(vmf, folio);
        if (unlikely(!status ||
                     (status & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))))
                goto drop_ref;

        status = finish_mkwrite_fault(vmf, folio);
        if (unlikely(status & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                /* The folio is still locked at this point; unlock it. */
                folio_unlock(folio);
                goto drop_ref;
        }

dirty:
        /* Reached with the folio locked on both paths. */
        status = fault_dirty_shared_page(vmf);
drop_ref:
        folio_put(folio);
        return status;
}

/*
 * Decide whether an anonymous folio hit by a write fault can be reused in
 * place instead of being copied.
 *
 * Returns true only if, under the folio lock, the folio is not a KSM folio
 * and its reference count is exactly 1; in that case the folio's anon rmap
 * is moved to @vma via folio_move_anon_rmap().  Returns false otherwise.
 * Large folios are never reused.  The folio lock is only taken with
 * folio_trylock() and is always released before returning.
 */
static bool wp_can_reuse_anon_folio(struct folio *folio,
                                    struct vm_area_struct *vma)
{
        /*
         * We could currently only reuse a subpage of a large folio if no
         * other subpages of the large folios are still mapped. However,
         * let's just consistently not reuse subpages even if we could
         * reuse in that scenario, and give back a large folio a bit
         * sooner.
         */
        if (folio_test_large(folio))
                return false;

        /*
         * We have to verify under folio lock: these early checks are
         * just an optimization to avoid locking the folio and freeing
         * the swapcache if there is little hope that we can reuse.
         *
         * KSM doesn't necessarily raise the folio refcount.
         */
        if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
                return false;
        if (!folio_test_lru(folio))
                /*
                 * We cannot easily detect+handle references from
                 * remote LRU caches or references to LRU folios.
                 */
                lru_add_drain();
        /* Allow one reference for the mapping plus one if in the swapcache. */
        if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
                return false;
        /* Never sleep for the lock: a contended folio is simply not reused. */
        if (!folio_trylock(folio))
                return false;
        if (folio_test_swapcache(folio))
                folio_free_swap(folio);
        /* Authoritative re-check now that the folio is locked. */
        if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
                folio_unlock(folio);
                return false;
        }
        /*
         * Ok, we've got the only folio reference from our mapping
         * and the folio is locked, it's dark out, and we're wearing
         * sunglasses. Hit it.
         */
        folio_move_anon_rmap(folio, vma);
        folio_unlock(folio);
        return true;
}

/*
 * This routine handles present pages, when
 * * users try to write to a shared page (FAULT_FLAG_WRITE)
 * * GUP wants to take a R/O pin on a possibly shared anonymous page
 *   (FAULT_FLAG_UNSHARE)
 *
 * It is done by copying the page to a new address and decrementing the
 * shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
 * done any necessary COW.
 *
 * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
 * though the page will change only once the write actually happens. This
 * avoids a few races, and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
        __releases(vmf->ptl)
{
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = NULL;
        pte_t pte;

        /* The userfaultfd write-protect handling is skipped for unshare. */
        if (likely(!unshare)) {
                if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
                        /* Not async uffd-wp: drop the PTL and hand off. */
                        if (!userfaultfd_wp_async(vma)) {
                                pte_unmap_unlock(vmf->pte, vmf->ptl);
                                return handle_userfault(vmf, VM_UFFD_WP);
                        }

                        /*
                         * Nothing needed (cache flush, TLB invalidations,
                         * etc.) because we're only removing the uffd-wp bit,
                         * which is completely invisible to the user.
                         */
                        pte = pte_clear_uffd_wp(ptep_get(vmf->pte));

                        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
                        /*
                         * Update this to be prepared for following up CoW
                         * handling
                         */
                        vmf->orig_pte = pte;
                }

                /*
                 * Userfaultfd write-protect can defer flushes. Ensure the TLB
                 * is flushed in this case before copying.
                 */
                if (unlikely(userfaultfd_wp(vmf->vma) &&
                             mm_tlb_flush_pending(vmf->vma->vm_mm)))
                        flush_tlb_page(vmf->vma, vmf->address);
        }

        /* vmf->page, and therefore folio, may be NULL from here on. */
        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);

        if (vmf->page)
                folio = page_folio(vmf->page);

        /*
         * Shared mapping: we are guaranteed to have VM_WRITE and
         * FAULT_FLAG_WRITE set at this point.
         */
        if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
                /*
                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
                 * VM_PFNMAP VMA.
                 *
                 * We should not cow pages in a shared writeable mapping.
                 * Just mark the pages writable and/or call ops->pfn_mkwrite.
                 */
                if (!vmf->page)
                        return wp_pfn_shared(vmf);
                return wp_page_shared(vmf, folio);
        }

        /*
         * Private mapping: create an exclusive anonymous page copy if reuse
         * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
         *
         * If we encounter a page that is marked exclusive, we must reuse
         * the page without further checks.
         */
        if (folio && folio_test_anon(folio) &&
            (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) {
                /* Reuse granted by wp_can_reuse_anon_folio(): record it. */
                if (!PageAnonExclusive(vmf->page))
                        SetPageAnonExclusive(vmf->page);
                /* Unshare needs no PTE change once the page is exclusive. */
                if (unlikely(unshare)) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        return 0;
                }
                wp_page_reuse(vmf, folio);
                return 0;
        }
        /*
         * Ok, we need to copy. Oh, well..
         */
        /* Taken while the PTL is still held; wp_page_copy() drops this reference. */
        if (folio)
                folio_get(folio);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
#ifdef CONFIG_KSM
        if (folio && folio_test_ksm(folio))
                count_vm_event(COW_KSM);
#endif
        return wp_page_copy(vmf);
}

/*
 * Zap the address range [start_addr, end_addr) of @vma by converting it to
 * the (start, length) form expected by zap_page_range_single().
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
                unsigned long start_addr, unsigned long end_addr,
                struct zap_details *details)
{
        unsigned long length = end_addr - start_addr;

        zap_page_range_single(vma, start_addr, length, details);
}

/*
 * Walk every VMA in @root that overlaps the page-offset range
 * [first_index, last_index] (both inclusive) and zap the overlapping part.
 */
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
                                            pgoff_t first_index,
                                            pgoff_t last_index,
                                            struct zap_details *details)
{
        struct vm_area_struct *vma;

        vma_interval_tree_foreach(vma, root, first_index, last_index) {
                /* First and last (inclusive) page offsets covered by the VMA. */
                pgoff_t vma_first = vma->vm_pgoff;
                pgoff_t vma_last = vma_first + vma_pages(vma) - 1;
                /* Clamp the requested range to what this VMA covers. */
                pgoff_t zap_first = max(first_index, vma_first);
                pgoff_t zap_last = min(last_index, vma_last);
                unsigned long start, end;

                /* Translate page offsets into addresses; 'end' is exclusive. */
                start = ((zap_first - vma_first) << PAGE_SHIFT) + vma->vm_start;
                end = ((zap_last - vma_first + 1) << PAGE_SHIFT) + vma->vm_start;

                unmap_mapping_range_vma(vma, start, end, details);
        }
}

/**
 * unmap_mapping_folio() - Unmap single folio from processes.
 * @folio: The locked folio to be unmapped.
 *
 * Unmap this folio from any userspace process which still has it mmaped.
 * Typically, for efficiency, the range of nearby pages has already been
 * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
 * truncation or invalidation holds the lock on a folio, it may find that
 * the page has been remapped again: and then uses unmap_mapping_folio()
 * to unmap it finally.
 */
void unmap_mapping_folio(struct folio *folio)
{
        struct address_space *mapping = folio->mapping;
        struct zap_details details = { };
        pgoff_t        first_index;
        pgoff_t        last_index;

        VM_BUG_ON(!folio_test_locked(folio));

        first_index = folio->index;
        last_index = folio_next_index(folio) - 1;

        details.even_cows = false;
        details.single_folio = folio;
        details.zap_flags = ZAP_FLAG_DROP_MARKER;

        i_mmap_lock_read(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, first_index,
                                         last_index, &details);
        i_mmap_unlock_read(mapping);
}

/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
                pgoff_t nr, bool even_cows)
{
        struct zap_details details = { .even_cows = even_cows };
        pgoff_t first_index = start;
        pgoff_t last_index = start + nr - 1;

        /*
         * If the computed last index wrapped around to below the first
         * one (nr == 0, or start + nr overflowed), unmap up to the
         * largest possible index instead.
         */
        if (last_index < first_index)
                last_index = ULONG_MAX;

        i_mmap_lock_read(mapping);
        /* Only walk the tree if something maps this address space at all. */
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, first_index,
                                         last_index, &details);
        i_mmap_unlock_read(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows)
{
        /*
         * Convert the byte range to pages.  The casts to pgoff_t happen
         * before the shift, so the shifts operate on an unsigned value.
         */
        pgoff_t first_page = (pgoff_t)(holebegin) >> PAGE_SHIFT;
        pgoff_t nr_pages = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;

        /*
         * Check for overflow: only relevant when loff_t is wider than
         * pgoff_t.  If the end page does not fit, clamp the length so the
         * range runs up to the last representable page index.
         */
        if (sizeof(holelen) > sizeof(nr_pages)) {
                long long end_page =
                        (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

                if (end_page & ~(long long)ULONG_MAX)
                        nr_pages = ULONG_MAX - first_page + 1;
        }

        unmap_mapping_pages(mapping, first_page, nr_pages, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);

/*
 * Restore a potential device exclusive pte to a working pte entry
 */
static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
{
        /* The caller must have set vmf->page to the page behind the entry. */
        struct folio *folio = page_folio(vmf->page);
        struct vm_area_struct *vma = vmf->vma;
        struct mmu_notifier_range range;
        vm_fault_t ret;

        /*
         * We need a reference to lock the folio because we don't hold
         * the PTL so a racing thread can remove the device-exclusive
         * entry and unmap it. If the folio is free the entry must
         * have been removed already. If it happens to have already
         * been re-allocated after being freed all we do is lock and
         * unlock it.
         */
        if (!folio_try_get(folio))
                return 0;

        /* A non-zero result is returned as-is after dropping our reference. */
        ret = folio_lock_or_retry(folio, vmf);
        if (ret) {
                folio_put(folio);
                return ret;
        }
        /* The notifier range covers exactly the page containing the fault. */
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
                                vma->vm_mm, vmf->address & PAGE_MASK,
                                (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
        mmu_notifier_invalidate_range_start(&range);

        /* Only restore the entry if the PTE is still the one we faulted on. */
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                                &vmf->ptl);
        if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);

        /* pte_offset_map_lock() may return NULL; then nothing is locked. */
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        folio_unlock(folio);
        folio_put(folio);

        mmu_notifier_invalidate_range_end(&range);
        return 0;
}

/*
 * Decide whether the swapcache entry of @folio should be freed while
 * handling a fault on @vma with the given @fault_flags.
 */
static inline bool should_try_to_free_swap(struct folio *folio,
                                           struct vm_area_struct *vma,
                                           unsigned int fault_flags)
{
        /* Nothing to free if the folio is not in the swapcache. */
        if (!folio_test_swapcache(folio))
                return false;

        /* Any one of these conditions is enough on its own. */
        if (mem_cgroup_swap_full(folio))
                return true;
        if (vma->vm_flags & VM_LOCKED)
                return true;
        if (folio_test_mlocked(folio))
                return true;

        /*
         * If we want to map a page that's in the swapcache writable, we
         * have to detect via the refcount if we're really the exclusive
         * user. Try freeing the swapcache to get rid of the swapcache
         * reference only in case it's likely that we'll be the exclusive
         * user.
         */
        if (!(fault_flags & FAULT_FLAG_WRITE))
                return false;
        if (folio_test_ksm(folio))
                return false;
        return folio_ref_count(folio) == 2;
}

/*
 * Clear the PTE at the faulting address, provided it still holds the
 * value the fault was taken on.  Always returns 0.
 */
static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
{
        struct mm_struct *mm = vmf->vma->vm_mm;

        vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
        if (vmf->pte) {
                /*
                 * Be careful so that we will only recover a special uffd-wp
                 * pte into a none pte.  Otherwise it means the pte could have
                 * changed, so retry.
                 *
                 * This should also cover the case where e.g. the pte changed
                 * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
                 * So is_pte_marker() check is not enough to safely drop the
                 * pte.
                 */
                if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
                        pte_clear(mm, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        }

        return 0;
}

/*
 * Dispatch a fault on a missing PTE: anonymous VMAs go to
 * do_anonymous_page(), everything else to do_fault().
 */
static vm_fault_t do_pte_missing(struct vm_fault *vmf)
{
        return vma_is_anonymous(vmf->vma) ? do_anonymous_page(vmf)
                                          : do_fault(vmf);
}

/*
 * This is actually a page-missing access, but with uffd-wp special pte
 * installed.  It means this pte was wr-protected before being unmapped.
 */
static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
{
        /* Still registered for uffd-wp: treat it as a missing page. */
        if (likely(userfaultfd_wp(vmf->vma)))
                return do_pte_missing(vmf);

        /*
         * Just in case there're leftover special ptes even after the region
         * got unregistered - we can simply clear them.
         */
        return pte_marker_clear(vmf);
}

/*
 * Handle a fault on a pte marker entry taken from vmf->orig_pte.
 * Returns VM_FAULT_SIGBUS for empty or unknown markers, VM_FAULT_HWPOISON for
 * poisoned ones, and the result of the uffd-wp handler otherwise.
 */
static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
{
        swp_entry_t swp = pte_to_swp_entry(vmf->orig_pte);
        unsigned long bits = pte_marker_get(swp);

        /*
         * A marker without any bit set must never exist.  Should it show up
         * anyway, killing the process along with its mm is the best option.
         */
        if (WARN_ON_ONCE(!bits))
                return VM_FAULT_SIGBUS;

        /* Corrupted data takes precedence over uffd-wp handling */
        if (bits & PTE_MARKER_POISONED)
                return VM_FAULT_HWPOISON;

        /* Anything that is not a uffd-wp marker is unknown to us */
        if (!pte_marker_entry_uffd_wp(swp))
                return VM_FAULT_SIGBUS;

        return pte_marker_handle_uffd_wp(vmf);
}

/*
 * Handle a fault on a pte whose saved value (vmf->orig_pte) decodes to a
 * swap entry.
 *
 * Non-swap entries (migration, device-exclusive, device-private, hwpoison,
 * pte markers) are dispatched to their own handling and return early.  For
 * a genuine swap entry the folio is looked up in the swapcache or read in,
 * locked, re-validated under the PT lock and finally mapped at vmf->address.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_lock locked or unlocked in the same cases
 * as does filemap_fault().
 */
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *swapcache, *folio = NULL;
        struct page *page;
        struct swap_info_struct *si = NULL;
        rmap_t rmap_flags = RMAP_NONE;
        bool need_clear_cache = false;
        bool exclusive = false;
        swp_entry_t entry;
        pte_t pte;
        vm_fault_t ret = 0;
        void *shadow = NULL;

        /* Nothing to do (ret stays 0) when pte_unmap_same() fails */
        if (!pte_unmap_same(vmf))
                goto out;

        entry = pte_to_swp_entry(vmf->orig_pte);
        /*
         * Special entries are handled by type right here; every branch
         * leaves through "out" and never reaches the swap-in code below.
         */
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
                        migration_entry_wait(vma->vm_mm, vmf->pmd,
                                             vmf->address);
                } else if (is_device_exclusive_entry(entry)) {
                        vmf->page = pfn_swap_entry_to_page(entry);
                        ret = remove_device_exclusive_entry(vmf);
                } else if (is_device_private_entry(entry)) {
                        if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
                                /*
                                 * migrate_to_ram is not yet ready to operate
                                 * under VMA lock.
                                 */
                                vma_end_read(vma);
                                ret = VM_FAULT_RETRY;
                                goto out;
                        }

                        vmf->page = pfn_swap_entry_to_page(entry);
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
                        if (unlikely(!vmf->pte ||
                                     !pte_same(ptep_get(vmf->pte),
                                                        vmf->orig_pte)))
                                goto unlock;

                        /*
                         * Get a page reference while we know the page can't be
                         * freed.
                         */
                        get_page(vmf->page);
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
                        put_page(vmf->page);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
                } else if (is_pte_marker_entry(entry)) {
                        ret = handle_pte_marker(vmf);
                } else {
                        print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
                        ret = VM_FAULT_SIGBUS;
                }
                goto out;
        }

        /* Prevent swapoff from happening to us. */
        si = get_swap_device(entry);
        if (unlikely(!si))
                goto out;

        folio = swap_cache_get_folio(entry, vma, vmf->address);
        if (folio)
                page = folio_file_page(folio, swp_offset(entry));
        /*
         * swapcache remembers the folio that came from the swapcache.  It is
         * NULL on a cache miss unless the readahead path below sets it.
         */
        swapcache = folio;

        if (!folio) {
                if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
                    __swap_count(entry) == 1) {
                        /*
                         * Prevent parallel swapin from proceeding with
                         * the cache flag. Otherwise, another thread may
                         * finish swapin first, free the entry, and swapout
                         * reusing the same entry. It's undetectable as
                         * pte_same() returns true due to entry reuse.
                         */
                        if (swapcache_prepare(entry)) {
                                /* Relax a bit to prevent rapid repeated page faults */
                                schedule_timeout_uninterruptible(1);
                                goto out;
                        }
                        /* From here on every exit must call swapcache_clear() */
                        need_clear_cache = true;

                        /* skip swapcache */
                        folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
                                                vma, vmf->address, false);
                        /* page is only used further down when folio is non-NULL */
                        page = &folio->page;
                        if (folio) {
                                __folio_set_locked(folio);
                                __folio_set_swapbacked(folio);

                                if (mem_cgroup_swapin_charge_folio(folio,
                                                        vma->vm_mm, GFP_KERNEL,
                                                        entry)) {
                                        ret = VM_FAULT_OOM;
                                        goto out_page;
                                }
                                mem_cgroup_swapin_uncharge_swap(entry);

                                shadow = get_shadow_from_swap_cache(entry);
                                if (shadow)
                                        workingset_refault(folio, shadow);

                                folio_add_lru(folio);

                                /* To provide entry to swap_read_folio() */
                                folio->swap = entry;
                                swap_read_folio(folio, true, NULL);
                                folio->private = NULL;
                        }
                } else {
                        page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                                vmf);
                        if (page)
                                folio = page_folio(page);
                        swapcache = folio;
                }

                if (!folio) {
                        /*
                         * Back out if somebody else faulted in this pte
                         * while we released the pte lock.
                         */
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
                        if (likely(vmf->pte &&
                                   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
                        goto unlock;
                }

                /* Had to read the page from swap area: Major fault */
                ret = VM_FAULT_MAJOR;
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
        } else if (PageHWPoison(page)) {
                /*
                 * hwpoisoned dirty swapcache pages are kept for killing
                 * owner processes (which may be unknown at hwpoison time)
                 */
                ret = VM_FAULT_HWPOISON;
                goto out_release;
        }

        /* On VM_FAULT_RETRY the folio is not locked: skip the unlock on exit */
        ret |= folio_lock_or_retry(folio, vmf);
        if (ret & VM_FAULT_RETRY)
                goto out_release;

        if (swapcache) {
                /*
                 * Make sure folio_free_swap() or swapoff did not release the
                 * swapcache from under us.  The page pin, and pte_same test
                 * below, are not enough to exclude that.  Even if it is still
                 * swapcache, we need to check that the page's swap has not
                 * changed.
                 */
                if (unlikely(!folio_test_swapcache(folio) ||
                             page_swap_entry(page).val != entry.val))
                        goto out_page;

                /*
                 * KSM sometimes has to copy on read faults, for example, if
                 * page->index of !PageKSM() pages would be nonlinear inside the
                 * anon VMA -- PageKSM() is lost on actual swapout.
                 */
                folio = ksm_might_need_to_copy(folio, vma, vmf->address);
                if (unlikely(!folio)) {
                        ret = VM_FAULT_OOM;
                        /* Restore folio so the exit path releases the swapcache folio */
                        folio = swapcache;
                        goto out_page;
                } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
                        ret = VM_FAULT_HWPOISON;
                        folio = swapcache;
                        goto out_page;
                }
                if (folio != swapcache)
                        page = folio_page(folio, 0);

                /*
                 * If we want to map a page that's in the swapcache writable, we
                 * have to detect via the refcount if we're really the exclusive
                 * owner. Try removing the extra reference from the local LRU
                 * caches if required.
                 */
                if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
                    !folio_test_ksm(folio) && !folio_test_lru(folio))
                        lru_add_drain();
        }

        folio_throttle_swaprate(folio, GFP_KERNEL);

        /*
         * Back out if somebody else already faulted in this pte.
         */
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
        if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                goto out_nomap;

        if (unlikely(!folio_test_uptodate(folio))) {
                ret = VM_FAULT_SIGBUS;
                goto out_nomap;
        }

        /*
         * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
         * must never point at an anonymous page in the swapcache that is
         * PG_anon_exclusive. Sanity check that this holds and especially, that
         * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
         * check after taking the PT lock and making sure that nobody
         * concurrently faulted in this page and set PG_anon_exclusive.
         */
        BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
        BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));

        /*
         * Check under PT lock (to protect against concurrent fork() sharing
         * the swap entry concurrently) for certainly exclusive pages.
         */
        if (!folio_test_ksm(folio)) {
                exclusive = pte_swp_exclusive(vmf->orig_pte);
                if (folio != swapcache) {
                        /*
                         * We have a fresh page that is not exposed to the
                         * swapcache -> certainly exclusive.
                         */
                        exclusive = true;
                } else if (exclusive && folio_test_writeback(folio) &&
                          data_race(si->flags & SWP_STABLE_WRITES)) {
                        /*
                         * This is tricky: not all swap backends support
                         * concurrent page modifications while under writeback.
                         *
                         * So if we stumble over such a page in the swapcache
                         * we must not set the page exclusive, otherwise we can
                         * map it writable without further checks and modify it
                         * while still under writeback.
                         *
                         * For these problematic swap backends, simply drop the
                         * exclusive marker: this is perfectly fine as we start
                         * writeback only if we fully unmapped the page and
                         * there are no unexpected references on the page after
                         * unmapping succeeded. After fully unmapped, no
                         * further GUP references (FOLL_GET and FOLL_PIN) can
                         * appear, so dropping the exclusive marker and mapping
                         * it only R/O is fine.
                         */
                        exclusive = false;
                }
        }

        /*
         * Some architectures may have to restore extra metadata to the page
         * when reading from swap. This metadata may be indexed by swap entry
         * so this must be called before swap_free().
         */
        arch_swap_restore(folio_swap(entry, folio), folio);

        /*
         * Remove the swap entry and conditionally try to free up the swapcache.
         * We're already holding a reference on the page but haven't mapped it
         * yet.
         */
        swap_free(entry);
        if (should_try_to_free_swap(folio, vma, vmf->flags))
                folio_free_swap(folio);

        /* One swap entry is replaced by one mapped anonymous page */
        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
        dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
        pte = mk_pte(page, vma->vm_page_prot);

        /*
         * Same logic as in do_wp_page(); however, optimize for pages that are
         * certainly not shared either because we just allocated them without
         * exposing them to the swapcache or because the swap entry indicates
         * exclusivity.
         */
        if (!folio_test_ksm(folio) &&
            (exclusive || folio_ref_count(folio) == 1)) {
                if (vmf->flags & FAULT_FLAG_WRITE) {
                        pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                        /* Write access granted here: skip do_wp_page() below */
                        vmf->flags &= ~FAULT_FLAG_WRITE;
                }
                rmap_flags |= RMAP_EXCLUSIVE;
        }
        flush_icache_page(vma, page);
        /* Carry the soft-dirty and uffd-wp bits over from the swap pte */
        if (pte_swp_soft_dirty(vmf->orig_pte))
                pte = pte_mksoft_dirty(pte);
        if (pte_swp_uffd_wp(vmf->orig_pte))
                pte = pte_mkuffd_wp(pte);
        vmf->orig_pte = pte;

        /* ksm created a completely new copy */
        if (unlikely(folio != swapcache && swapcache)) {
                folio_add_new_anon_rmap(folio, vma, vmf->address);
                folio_add_lru_vma(folio, vma);
        } else {
                folio_add_anon_rmap_pte(folio, page, vma, vmf->address,
                                        rmap_flags);
        }

        VM_BUG_ON(!folio_test_anon(folio) ||
                        (pte_write(pte) && !PageAnonExclusive(page)));
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
        arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);

        folio_unlock(folio);
        if (folio != swapcache && swapcache) {
                /*
                 * Hold the lock to avoid the swap entry to be reused
                 * until we take the PT lock for the pte_same() check
                 * (to avoid false positives from pte_same). For
                 * further safety release the lock after the swap_free
                 * so that the swap count won't change under a
                 * parallel locked swapcache.
                 */
                folio_unlock(swapcache);
                folio_put(swapcache);
        }

        /* Still a write fault: the page was mapped without write access above */
        if (vmf->flags & FAULT_FLAG_WRITE) {
                ret |= do_wp_page(vmf);
                if (ret & VM_FAULT_ERROR)
                        ret &= VM_FAULT_ERROR;
                goto out;
        }

        /* No need to invalidate - it was non-present before */
        update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
        /* Exit with the PT lock (possibly) held and no folio to release */
unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        /* Exit without the PT lock: drop the swapcache pin and the device ref */
out:
        /* Clear the swap cache pin for direct swapin after PTL unlock */
        if (need_clear_cache)
                swapcache_clear(si, entry);
        if (si)
                put_swap_device(si);
        return ret;
        /*
         * Failure exits, entered at the label matching what is held:
         * out_nomap - PT lock (if the pte was mapped) plus a locked folio,
         * out_page - a locked folio, out_release - a folio reference only.
         */
out_nomap:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
        folio_unlock(folio);
out_release:
        folio_put(folio);
        if (folio != swapcache && swapcache) {
                folio_unlock(swapcache);
                folio_put(swapcache);
        }
        if (need_clear_cache)
                swapcache_clear(si, entry);
        if (si)
                put_swap_device(si);
        return ret;
}

/*
 * Return true when each of the @nr_pages ptes starting at @pte is pte_none().
 * The ptes are read with ptep_get_lockless().
 */
static bool pte_range_none(pte_t *pte, int nr_pages)
{
        int idx = 0;

        while (idx < nr_pages) {
                if (!pte_none(ptep_get_lockless(pte + idx)))
                        return false;
                idx++;
        }

        return true;
}

/*
 * Allocate the folio backing an anonymous fault at vmf->address.
 *
 * With CONFIG_TRANSPARENT_HUGEPAGE the largest enabled order whose aligned
 * pte range is still empty is tried first, then successively smaller ones.
 * If no large order is usable, or every attempt fails, a single page is
 * obtained through folio_prealloc().
 *
 * Returns the folio, ERR_PTR(-EAGAIN) when the page table could not be
 * mapped, or whatever folio_prealloc() returns on the fallback path.
 */
static struct folio *alloc_anon_folio(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        unsigned long candidates;
        unsigned long base;
        struct folio *folio;
        pte_t *ptes;
        gfp_t gfp;
        int order;

        /*
         * An active uffd on the vma requires per-page fault fidelity to
         * keep the uffd semantics, so never go large there.
         */
        if (unlikely(userfaultfd_armed(vma)))
                goto fallback;

        /*
         * Start from all (large) orders below PMD_ORDER enabled for this
         * vma, then keep only those that can be placed over the faulting
         * address while staying fully inside the vma.
         */
        candidates = thp_vma_allowable_orders(vma, vma->vm_flags,
                        TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
        candidates = thp_vma_suitable_orders(vma, vmf->address, candidates);
        if (!candidates)
                goto fallback;

        ptes = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
        if (!ptes)
                return ERR_PTR(-EAGAIN);

        /*
         * Walk down from the highest order until the aligned range is
         * entirely pte_none(); all lower orders are then pte_none() too.
         */
        for (order = highest_order(candidates); candidates;
             order = next_order(&candidates, order)) {
                base = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                if (pte_range_none(ptes + pte_index(base), 1 << order))
                        break;
        }

        pte_unmap(ptes);

        if (!candidates)
                goto fallback;

        /* Try allocating the highest of the remaining orders. */
        gfp = vma_thp_gfp_mask(vma);
        for (; candidates; order = next_order(&candidates, order)) {
                base = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                folio = vma_alloc_folio(gfp, order, vma, base, true);
                if (folio && !mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
                        folio_throttle_swaprate(folio, gfp);
                        clear_huge_page(&folio->page, vmf->address, 1 << order);
                        return folio;
                }

                /* Allocated but not chargeable: account it and give it back */
                if (folio) {
                        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
                        folio_put(folio);
                }
                count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
        }

fallback:
#endif
        return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
}

/*
 * Handle a fault on a missing pte in an anonymous mapping.
 *
 * Read faults map the zero page unless mm_forbids_zeropage() says otherwise;
 * all other faults allocate a fresh folio (possibly spanning several pages)
 * and map it at the folio-aligned address.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long addr = vmf->address;
        struct folio *folio;
        vm_fault_t ret = 0;
        int nr_pages = 1;
        pte_t entry;
        int i;

        /* File mapping without ->vm_ops ? */
        if (vma->vm_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;

        /*
         * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
         * be distinguished from a transient failure of pte_offset_map().
         */
        if (pte_alloc(vma->vm_mm, vmf->pmd))
                return VM_FAULT_OOM;

        /* Use the zero-page for reads */
        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm)) {
                entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
                                                vma->vm_page_prot));
                vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                vmf->address, &vmf->ptl);
                if (!vmf->pte)
                        goto unlock;
                if (vmf_pte_changed(vmf)) {
                        update_mmu_tlb(vma, vmf->address, vmf->pte);
                        goto unlock;
                }
                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        goto unlock;
                /* Deliver the page fault to userland, check inside PT lock */
                if (userfaultfd_missing(vma)) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        return handle_userfault(vmf, VM_UFFD_MISSING);
                }
                /* addr and nr_pages still hold their single-page defaults */
                goto setpte;
        }

        /* Allocate our own private page. */
        ret = vmf_anon_prepare(vmf);
        if (ret)
                return ret;
        /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */
        folio = alloc_anon_folio(vmf);
        if (IS_ERR(folio))
                return 0;
        if (!folio)
                goto oom;

        /* From here on work on the whole folio-aligned range */
        nr_pages = folio_nr_pages(folio);
        addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);

        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * preceding stores to the page contents become visible before
         * the set_pte_at() write.
         */
        __folio_mark_uptodate(folio);

        entry = mk_pte(&folio->page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry), vma);

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
        if (!vmf->pte)
                goto release;
        /* Give up (ret == 0) if any pte in the target range got populated */
        if (nr_pages == 1 && vmf_pte_changed(vmf)) {
                update_mmu_tlb(vma, addr, vmf->pte);
                goto release;
        } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
                for (i = 0; i < nr_pages; i++)
                        update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
                goto release;
        }

        ret = check_stable_address_space(vma->vm_mm);
        if (ret)
                goto release;

        /* Deliver the page fault to userland, check inside PT lock */
        if (userfaultfd_missing(vma)) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                folio_put(folio);
                return handle_userfault(vmf, VM_UFFD_MISSING);
        }

        /*
         * Take nr_pages - 1 additional references; presumably the
         * allocation reference accounts for the first mapped page.
         */
        folio_ref_add(folio, nr_pages - 1);
        add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
#endif
        folio_add_new_anon_rmap(folio, vma, addr);
        folio_add_lru_vma(folio, vma);
        /* Shared by the zero-page and the freshly allocated folio paths */
setpte:
        if (vmf_orig_pte_uffd_wp(vmf))
                entry = pte_mkuffd_wp(entry);
        set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
        /* Drop the PT lock if it was taken, then return ret */
unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
        /* The folio was not mapped: drop our reference before unlocking */
release:
        folio_put(folio);
        goto unlock;
oom:
        return VM_FAULT_OOM;
}

/*
 * Call vma->vm_ops->fault() and hand back its page locked.
 *
 * The mmap_lock must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_retry().
 *
 * Returns the handler's result.  When that result carries none of the
 * error/NOPAGE/RETRY/DONE_COW bits and the page is not hwpoisoned, the folio
 * of vmf->page is locked on return.  A hwpoisoned page is dropped and
 * VM_FAULT_HWPOISON (or VM_FAULT_NOPAGE if it could be evicted) is returned.
 */
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        vm_fault_t ret;

        /*
         * Preallocate pte before we take page_lock because this might lead to
         * deadlocks for memcg reclaim which waits for pages under writeback:
         *                                lock_page(A)
         *                                SetPageWriteback(A)
         *                                unlock_page(A)
         * lock_page(B)
         *                                lock_page(B)
         * pte_alloc_one
         *   shrink_page_list
         *     wait_on_page_writeback(A)
         *                                SetPageWriteback(B)
         *                                unlock_page(B)
         *                                # flush A, B to clear the writeback
         */
        if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
        }

        ret = vma->vm_ops->fault(vmf);

        /* Nothing more to do here if the handler did not produce a page */
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
                            VM_FAULT_DONE_COW)))
                return ret;

        folio = page_folio(vmf->page);

        if (unlikely(PageHWPoison(vmf->page))) {
                vm_fault_t outcome = VM_FAULT_HWPOISON;

                if (ret & VM_FAULT_LOCKED) {
                        if (page_mapped(vmf->page))
                                unmap_mapping_folio(folio);
                        /* Retry if a clean folio was removed from the cache. */
                        if (mapping_evict_folio(folio->mapping, folio))
                                outcome = VM_FAULT_NOPAGE;
                        folio_unlock(folio);
                }
                folio_put(folio);
                vmf->page = NULL;
                return outcome;
        }

        /* Callers get a locked folio whether or not the handler locked it */
        if (likely(ret & VM_FAULT_LOCKED)) {
                VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page);
        } else {
                folio_lock(folio);
        }

        return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Deposit vmf->prealloc_pte for the pmd at vmf->pmd and give up ownership
 * of it by resetting vmf->prealloc_pte to NULL.
 */
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
        struct mm_struct *mm = vmf->vma->vm_mm;

        pgtable_trans_huge_deposit(mm, vmf->pmd, vmf->prealloc_pte);
        /* The preallocated table is consumed here, so count it as nr_ptes. */
        mm_inc_nr_ptes(mm);
        vmf->prealloc_pte = NULL;
}

/*
 * Try to map the folio containing @page with a single PMD entry.
 *
 * Returns 0 when the PMD was installed, VM_FAULT_OOM when a required page
 * table could not be preallocated, and VM_FAULT_FALLBACK when a PMD mapping
 * is not possible (unsuitable vma range, folio not of PMD order, folio with
 * hwpoisoned subpages, or a pmd that is no longer empty).
 */
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
        struct folio *folio = page_folio(page);
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        vm_fault_t ret = VM_FAULT_FALLBACK;
        pmd_t pmdval;

        if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER) ||
            folio_order(folio) != HPAGE_PMD_ORDER)
                return ret;
        page = &folio->page;

        /*
         * Back off when any subpage of the THP is corrupted; a PMD mapping
         * would let the corrupted page slip past the check silently.  Such
         * a THP can only be PTE mapped, and touching the corrupted subpage
         * is expected to raise SIGBUS.
         */
        if (unlikely(folio_test_has_hwpoisoned(folio)))
                return ret;

        /*
         * Archs like ppc64 need additional space to store information
         * related to pte entry. Use the preallocated table for that.
         */
        if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
                vmf->prealloc_pte = pte_alloc_one(mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
        }

        vmf->ptl = pmd_lock(mm, vmf->pmd);
        if (likely(pmd_none(*vmf->pmd))) {
                flush_icache_pages(vma, page, HPAGE_PMD_NR);

                pmdval = mk_huge_pmd(page, vma->vm_page_prot);
                if (vmf->flags & FAULT_FLAG_WRITE)
                        pmdval = maybe_pmd_mkwrite(pmd_mkdirty(pmdval), vma);

                add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
                folio_add_file_rmap_pmd(folio, page, vma);

                /* The deposit has to happen with the pmd lock held */
                if (arch_needs_pgtable_deposit())
                        deposit_prealloc_pte(vmf);

                set_pmd_at(mm, haddr, vmf->pmd, pmdval);

                update_mmu_cache_pmd(vma, haddr, vmf->pmd);

                /* fault is handled */
                ret = 0;
                count_vm_event(THP_FILE_MAPPED);
        }
        spin_unlock(vmf->ptl);

        return ret;
}
#else
/*
 * Stub used when CONFIG_TRANSPARENT_HUGEPAGE is not set: a PMD mapping is
 * never installed, so always report VM_FAULT_FALLBACK.
 */
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
        return VM_FAULT_FALLBACK;
}
#endif

/**
 * set_pte_range - Set a range of PTEs to point to pages in a folio.
 * @vmf: Fault description.
 * @folio: The folio that contains @page.
 * @page: The first page to create a PTE for.
 * @nr: The number of PTEs to create.
 * @addr: The first address to create a PTE for.
 *
 * Builds one pte value from @page and @vma->vm_page_prot, adds the matching
 * reverse mapping (a new anonymous one for a private write fault, a file one
 * otherwise) and installs @nr consecutive ptes starting at @vmf->pte.
 *
 * A range that does not contain the faulting address is a prefault; its
 * ptes are created old on architectures where arch_wants_old_prefaulted_pte()
 * is true.  The range holding the faulting address is always created young.
 */
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
                struct page *page, unsigned int nr, unsigned long addr)
{
        struct vm_area_struct *vma = vmf->vma;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        /*
         * Prefault means the faulting address lies OUTSIDE the range being
         * mapped.  Without the negation the faulting pte would be marked
         * old and the speculatively mapped neighbours young.
         */
        bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE);
        pte_t entry;

        flush_icache_pages(vma, page, nr);
        entry = mk_pte(page, vma->vm_page_prot);

        if (prefault && arch_wants_old_prefaulted_pte())
                entry = pte_mkold(entry);
        else
                entry = pte_sw_mkyoung(entry);

        if (write)
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
                entry = pte_mkuffd_wp(entry);
        /* copy-on-write page */
        if (write && !(vma->vm_flags & VM_SHARED)) {
                /* A private write fault maps exactly one freshly copied page */
                VM_BUG_ON_FOLIO(nr != 1, folio);
                folio_add_new_anon_rmap(folio, vma, addr);
                folio_add_lru_vma(folio, vma);
        } else {
                folio_add_file_rmap_ptes(folio, page, nr, vma);
        }
        set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);

        /* no need to invalidate: a not-present page won't be cached */
        update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
}

static bool vmf_pte_changed(struct vm_fault *vmf)
{
        if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
                return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);

        return !pte_none(ptep_get(vmf->pte));
}

/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
 * given page, adds reverse page mapping, handles memcg charges and LRU
 * addition.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t finish_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page;
        vm_fault_t ret;
        bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
                      !(vma->vm_flags & VM_SHARED);

        /* Did we COW the page? */
        if (is_cow)
                page = vmf->cow_page;
        else
                page = vmf->page;

        /*
         * check even for read faults because we might have lost our CoWed
         * page
         */
        if (!(vma->vm_flags & VM_SHARED)) {
                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        return ret;
        }

        if (pmd_none(*vmf->pmd)) {
                if (PageTransCompound(page)) {
                        ret = do_set_pmd(vmf, page);
                        if (ret != VM_FAULT_FALLBACK)
                                return ret;
                }

                if (vmf->prealloc_pte)
                        pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
                else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
                        return VM_FAULT_OOM;
        }

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                      vmf->address, &vmf->ptl);
        if (!vmf->pte)
                return VM_FAULT_NOPAGE;

        /* Re-check under ptl */
        if (likely(!vmf_pte_changed(vmf))) {
                struct folio *folio = page_folio(page);
                int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);

                set_pte_range(vmf, folio, page, 1, vmf->address);
                add_mm_counter(vma->vm_mm, type, 1);
                ret = 0;
        } else {
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                ret = VM_FAULT_NOPAGE;
        }

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
}

/*
 * Number of pages do_fault_around() tries to map for a single read fault.
 * The default corresponds to 64KiB (65536 bytes) expressed in pages.
 */
static unsigned long fault_around_pages __read_mostly =
        65536 >> PAGE_SHIFT;

#ifdef CONFIG_DEBUG_FS
/* debugfs getter: report the current fault-around window in bytes. */
static int fault_around_bytes_get(void *data, u64 *val)
{
        u64 bytes = fault_around_pages << PAGE_SHIFT;

        *val = bytes;
        return 0;
}

/*
 * debugfs setter for the fault-around window.
 *
 * The byte count is rounded down to a power of two and stored as a page
 * count, because that is what do_fault_around() expects to see.  Values
 * covering more than PTRS_PER_PTE pages are rejected with -EINVAL.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
        u64 bytes;

        if (val / PAGE_SIZE > PTRS_PER_PTE)
                return -EINVAL;

        /*
         * Clamp to at least one page.  A single page results in no
         * fault-around at all, see should_fault_around().
         */
        bytes = max(val, PAGE_SIZE);
        fault_around_pages = rounddown_pow_of_two(bytes) >> PAGE_SHIFT;

        return 0;
}
/*
 * File operations for the "fault_around_bytes" debugfs file, built from the
 * getter/setter above; the value is formatted as an unsigned decimal.
 */
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
                fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

/*
 * Create the "fault_around_bytes" debugfs file (mode 0644) with a NULL
 * parent and no private data.  Always returns 0.
 */
static int __init fault_around_debugfs(void)
{
        debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
                                   &fault_around_bytes_fops);
        return 0;
}
late_initcall(fault_around_debugfs);
#endif

/*
 * do_fault_around() tries to map few pages around the fault address. The hope
 * is that the pages will be needed soon and this will lower the number of
 * faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function doesn't cross VMA or page table boundaries, in order to call
 * map_pages() and acquire a PTE lock only once.
 *
 * fault_around_pages defines how many pages we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or equal
 * to PTRS_PER_PTE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_pages * PAGE_SIZE rounded down to the machine page size
 * (and therefore to page order).  This way it's easier to guarantee
 * that we don't cross page table boundaries.
 */
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
        pgoff_t nr_pages = READ_ONCE(fault_around_pages);
        pgoff_t pte_off = pte_index(vmf->address);
        /* The page offset of vmf->address within the VMA. */
        pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
        pgoff_t from_pte, to_pte;
        vm_fault_t ret;

        /* The PTE offset of the start address, clamped to the VMA. */
        from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
                       pte_off - min(pte_off, vma_off));

        /* The PTE offset of the end address, clamped to the VMA and PTE. */
        to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
                      pte_off + vma_pages(vmf->vma) - vma_off) - 1;

        if (pmd_none(*vmf->pmd)) {
                vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
        }

        rcu_read_lock();
        ret = vmf->vma->vm_ops->map_pages(vmf,
                        vmf->pgoff + from_pte - pte_off,
                        vmf->pgoff + to_pte - pte_off);
        rcu_read_unlock();

        return ret;
}

/* Return true if we should do read fault-around, false otherwise */
static inline bool should_fault_around(struct vm_fault *vmf)
{
        /* No ->map_pages?  No way to fault around... */
        if (!vmf->vma->vm_ops->map_pages)
                return false;

        if (uffd_disable_fault_around(vmf->vma))
                return false;

        /* A single page implies no faulting 'around' at all. */
        return fault_around_pages > 1;
}

/*
 * Handle a read fault: try fault-around first, then fall back to ->fault()
 * through __do_fault() and map the resulting page with finish_fault().
 */
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
        const vm_fault_t stop_mask = VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                     VM_FAULT_RETRY;
        struct folio *mapped;
        vm_fault_t status = 0;

        /*
         * Let's call ->map_pages() first and use ->fault() as fallback
         * if page by the offset is not ready to be mapped (cold cache or
         * something).
         */
        if (should_fault_around(vmf)) {
                status = do_fault_around(vmf);
                if (status)
                        return status;
        }

        status = vmf_can_call_fault(vmf);
        if (status)
                return status;

        status = __do_fault(vmf);
        if (unlikely(status & stop_mask))
                return status;

        status |= finish_fault(vmf);
        mapped = page_folio(vmf->page);
        folio_unlock(mapped);
        /* The page was not mapped, so drop the reference taken for it. */
        if (unlikely(status & stop_mask))
                folio_put(mapped);
        return status;
}

/*
 * Handle a write fault on a private mapping: allocate a new folio, let
 * __do_fault() bring in the source page, copy it into the new folio and map
 * the copy with finish_fault().
 */
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
        const vm_fault_t stop_mask = VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                     VM_FAULT_RETRY;
        struct vm_area_struct *vma = vmf->vma;
        struct folio *copy;
        vm_fault_t status;

        status = vmf_can_call_fault(vmf);
        if (status)
                return status;

        status = vmf_anon_prepare(vmf);
        if (status)
                return status;

        copy = folio_prealloc(vma->vm_mm, vma, vmf->address, false);
        if (!copy)
                return VM_FAULT_OOM;

        vmf->cow_page = &copy->page;

        status = __do_fault(vmf);
        if (unlikely(status & stop_mask)) {
                folio_put(copy);
                return status;
        }
        /* VM_FAULT_DONE_COW: nothing left to copy or map here. */
        if (status & VM_FAULT_DONE_COW)
                return status;

        copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
        __folio_mark_uptodate(copy);

        status |= finish_fault(vmf);
        /* The source page is no longer needed once the copy is handled. */
        unlock_page(vmf->page);
        put_page(vmf->page);
        if (unlikely(status & stop_mask))
                folio_put(copy);

        return status;
}

/*
 * Handle a write fault on a shared mapping: obtain the page via
 * __do_fault(), give ->page_mkwrite() a chance to veto or prepare the write,
 * map the page with finish_fault() and account it as dirtied.
 */
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *target;
        vm_fault_t status, mkwrite_status;

        status = vmf_can_call_fault(vmf);
        if (status)
                return status;

        status = __do_fault(vmf);
        if (unlikely(status & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                               VM_FAULT_RETRY)))
                return status;

        target = page_folio(vmf->page);

        /*
         * Check if the backing address space wants to know that the page is
         * about to become writable
         */
        if (vma->vm_ops->page_mkwrite) {
                folio_unlock(target);
                mkwrite_status = do_page_mkwrite(vmf, target);
                if (unlikely(!mkwrite_status ||
                             (mkwrite_status &
                              (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                        folio_put(target);
                        return mkwrite_status;
                }
        }

        status |= finish_fault(vmf);
        if (unlikely(status & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                               VM_FAULT_RETRY))) {
                folio_unlock(target);
                folio_put(target);
                return status;
        }

        return status | fault_dirty_shared_page(vmf);
}

/*
 * Dispatch a fault on a not-yet-mapped page to the read, private-write (COW)
 * or shared-write handler.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __folio_lock_or_retry().
 * If mmap_lock is released, vma may become invalid (for example
 * by other thread calling munmap()), which is why the mm pointer is read
 * from the vma before any handler runs.
 */
static vm_fault_t do_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *vm_mm = vma->vm_mm;
        vm_fault_t ret;

        if (!vma->vm_ops->fault) {
                /*
                 * The VMA was not fully populated on mmap() or missing
                 * VM_DONTEXPAND
                 */
                vmf->pte = pte_offset_map_lock(vm_mm, vmf->pmd,
                                               vmf->address, &vmf->ptl);
                if (likely(vmf->pte)) {
                        /*
                         * Make sure this is not a temporary clearing of pte
                         * by holding ptl and checking again. A R/M/W update
                         * of pte involves: take ptl, clearing the pte so that
                         * we don't have concurrent modification by hardware
                         * followed by an update.
                         */
                        ret = pte_none(ptep_get(vmf->pte)) ?
                                VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;

                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                } else {
                        ret = VM_FAULT_SIGBUS;
                }
        } else if (!(vmf->flags & FAULT_FLAG_WRITE)) {
                ret = do_read_fault(vmf);
        } else if (vma->vm_flags & VM_SHARED) {
                ret = do_shared_fault(vmf);
        } else {
                ret = do_cow_fault(vmf);
        }

        /* preallocated pagetable is unused: free it */
        if (vmf->prealloc_pte) {
                pte_free(vm_mm, vmf->prealloc_pte);
                vmf->prealloc_pte = NULL;
        }
        return ret;
}

/*
 * Prepare a NUMA hinting fault for a possible migration: take a reference
 * on @folio, update the hinting-fault statistics, flag node-local faults in
 * @flags and return whatever mpol_misplaced() reports for the folio.
 */
int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
                      unsigned long addr, int page_nid, int *flags)
{
        folio_get(folio);

        /* Record the current PID accessing the VMA */
        vma_set_access_pid_bit(vmf->vma);

        count_vm_numa_event(NUMA_HINT_FAULTS);
        if (numa_node_id() == page_nid) {
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
                *flags |= TNF_FAULT_LOCAL;
        }

        return mpol_misplaced(folio, vmf, addr);
}

/*
 * Rewrite one PTE with the VMA's page protection, marked young and, when
 * @writable is set, writable.  The update goes through the
 * ptep_modify_prot_start()/ptep_modify_prot_commit() pair.
 */
static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
                                        unsigned long fault_addr, pte_t *fault_pte,
                                        bool writable)
{
        pte_t prev = ptep_modify_prot_start(vma, fault_addr, fault_pte);
        pte_t updated = pte_mkyoung(pte_modify(prev, vma->vm_page_prot));

        if (writable)
                updated = pte_mkwrite(updated, vma);

        ptep_modify_prot_commit(vma, fault_addr, fault_pte, prev, updated);
        update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
}

/*
 * Rebuild every PTE that still maps @folio with pte_protnone() set, limited
 * to the part of the folio that lies inside both the VMA and the page table
 * holding the faulting PTE.
 */
static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
                                       struct folio *folio, pte_t fault_pte,
                                       bool ignore_writable, bool pte_write_upgrade)
{
        unsigned long fault_addr = vmf->address;
        /* Index of the faulting page within the folio. */
        int page_idx = pte_pfn(fault_pte) - folio_pfn(folio);
        unsigned long folio_start = fault_addr - (page_idx << PAGE_SHIFT);
        unsigned long table_start = ALIGN_DOWN(fault_addr, PMD_SIZE);
        unsigned long first, limit, cur;
        pte_t *ptep;

        /* Stay within the VMA and within the page table. */
        first = max3(folio_start, table_start, vma->vm_start);
        limit = min3(folio_start + folio_size(folio), table_start + PMD_SIZE,
                     vma->vm_end);
        ptep = vmf->pte - ((fault_addr - first) >> PAGE_SHIFT);

        /* Restore all PTEs' mapping of the large folio */
        for (cur = first; cur != limit; cur += PAGE_SIZE, ptep++) {
                pte_t ptent = ptep_get(ptep);
                bool writable = false;

                /* Only touch protnone entries that still map this folio. */
                if (!pte_present(ptent) || !pte_protnone(ptent) ||
                    pfn_folio(pte_pfn(ptent)) != folio)
                        continue;

                if (!ignore_writable) {
                        ptent = pte_modify(ptent, vma->vm_page_prot);
                        writable = pte_write(ptent) ||
                                   (pte_write_upgrade &&
                                    can_change_pte_writable(vma, cur, ptent));
                }

                numa_rebuild_single_mapping(vmf, vma, cur, ptep, writable);
        }
}

/*
 * Handle a fault on a PTE that handle_pte_fault() found to be pte_protnone()
 * in an accessible VMA.
 *
 * The PTE is revalidated under vmf->ptl, the folio it maps is looked up and
 * numa_migrate_prep() is asked for a target node.  When a target is returned
 * the folio is handed to migrate_misplaced_folio(); when there is none, or
 * when migration fails and the PTE is still unchanged, the mapping is
 * rebuilt from vma->vm_page_prot (out_map).
 *
 * Always returns 0.  task_numa_fault() is called on the way out whenever a
 * node id was recorded for the folio.
 */
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = NULL;
        int nid = NUMA_NO_NODE;
        bool writable = false, ignore_writable = false;
        bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
        int last_cpupid;
        int target_nid;
        pte_t pte, old_pte;
        int flags = 0, nr_pages;

        /*
         * The pte cannot be used safely until we verify, while holding the page
         * table lock, that its contents have not changed during fault handling.
         */
        spin_lock(vmf->ptl);
        /* Read the live PTE from the page tables: */
        old_pte = ptep_get(vmf->pte);

        /* The PTE changed under us: nothing to do, and no stats (nid unset). */
        if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                goto out;
        }

        /* The PTE value as it would look with the VMA's normal protection. */
        pte = pte_modify(old_pte, vma->vm_page_prot);

        /*
         * Detect now whether the PTE could be writable; this information
         * is only valid while holding the PT lock.
         */
        writable = pte_write(pte);
        if (!writable && pte_write_upgrade &&
            can_change_pte_writable(vma, vmf->address, pte))
                writable = true;

        folio = vm_normal_folio(vma, vmf->address, pte);
        /* No folio, or a zone-device folio: just restore the mapping. */
        if (!folio || folio_is_zone_device(folio))
                goto out_map;

        /*
         * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
         * much anyway since they can be in shared cache state. This misses
         * the case where a mapping is writable but the process never writes
         * to it but pte_write gets cleared during protection updates and
         * pte_dirty has unpredictable behaviour between PTE scan updates,
         * background writeback, dirty balancing and application behaviour.
         */
        if (!writable)
                flags |= TNF_NO_GROUP;

        /*
         * Flag if the folio is shared between multiple address spaces. This
         * is later used when determining whether to group tasks together
         */
        if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
                flags |= TNF_SHARED;

        /* From here on nid is set, so the exit path reports the fault. */
        nid = folio_nid(folio);
        nr_pages = folio_nr_pages(folio);
        /*
         * For memory tiering mode, cpupid of slow memory page is used
         * to record page access time.  So use default value.
         */
        if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
            !node_is_toptier(nid))
                last_cpupid = (-1 & LAST_CPUPID_MASK);
        else
                last_cpupid = folio_last_cpupid(folio);
        target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
        if (target_nid == NUMA_NO_NODE) {
                /* No migration wanted: drop the reference taken by the prep step. */
                folio_put(folio);
                goto out_map;
        }
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        /*
         * The PT lock has been dropped, so the writability decision made
         * above no longer holds; do not make anything writable below.
         */
        writable = false;
        ignore_writable = true;

        /* Migrate to the requested node */
        if (migrate_misplaced_folio(folio, vma, target_nid)) {
                nid = target_nid;
                flags |= TNF_MIGRATED;
        } else {
                flags |= TNF_MIGRATE_FAIL;
                /* Retake the PT lock and restore the mapping if still unchanged. */
                vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                               vmf->address, &vmf->ptl);
                if (unlikely(!vmf->pte))
                        goto out;
                if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        goto out;
                }
                goto out_map;
        }

out:
        /* Reached without the PT lock held. */
        if (nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, nid, nr_pages, flags);
        return 0;
out_map:
        /*
         * Make it present again, depending on how arch implements
         * non-accessible ptes, some can allow access by kernel mode.
         */
        if (folio && folio_test_large(folio))
                numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
                                           pte_write_upgrade);
        else
                numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
                                            writable);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        goto out;
}

/*
 * Try to satisfy the fault with a PMD-sized mapping: anonymous VMAs go to
 * do_huge_pmd_anonymous_page(), other VMAs to their ->huge_fault() handler
 * if they have one.  Returns VM_FAULT_FALLBACK when neither applies.
 */
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        if (vma_is_anonymous(vma))
                return do_huge_pmd_anonymous_page(vmf);

        if (!vma->vm_ops->huge_fault)
                return VM_FAULT_FALLBACK;

        return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
}

/*
 * Handle a write or unshare fault on a huge PMD that is not writable.
 *
 * Anonymous VMAs are handled by do_huge_pmd_wp_page(), unless the PMD is
 * userfaultfd write-protected (and this is not an unshare request), in
 * which case the fault goes to handle_userfault() or, for async mode, the
 * PMD is split.  Shared file VMAs get a chance via ->huge_fault().
 * Everything else splits the PMD and returns VM_FAULT_FALLBACK.
 *
 * `inline' is required to avoid gcc 4.1.2 build error
 */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        vm_fault_t ret;

        if (vma_is_anonymous(vma)) {
                bool uffd_wp = likely(!unshare) &&
                               userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd);

                if (!uffd_wp)
                        return do_huge_pmd_wp_page(vmf);
                if (!userfaultfd_wp_async(vma))
                        return handle_userfault(vmf, VM_UFFD_WP);
                /* Async uffd-wp: fall through and split the PMD. */
        } else if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
                   vma->vm_ops->huge_fault) {
                ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        }

        /* COW or write-notify handled on pte level: split pmd. */
        __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);

        return VM_FAULT_FALLBACK;
}

/*
 * Try to satisfy the fault with a PUD-sized mapping through the VMA's
 * ->huge_fault() handler.  Returns VM_FAULT_FALLBACK for anonymous VMAs,
 * for VMAs without a handler, and when PUD-sized transparent huge pages are
 * not configured.
 */
static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        struct vm_area_struct *vma = vmf->vma;

        /* No support for anonymous transparent PUD pages yet */
        if (!vma_is_anonymous(vma) && vma->vm_ops->huge_fault)
                return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
        return VM_FAULT_FALLBACK;
}

/*
 * Handle a write fault on a huge PUD that is not writable.  Shared file
 * VMAs get a chance via ->huge_fault(); in every other case, or when the
 * handler asks for fallback, the PUD is split.  Without PUD-sized
 * transparent huge page support this only returns VM_FAULT_FALLBACK.
 */
static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        /* No support for anonymous transparent PUD pages yet */
        if (!vma_is_anonymous(vma) &&
            (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
            vma->vm_ops->huge_fault) {
                ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        }

        /* COW or write-notify not handled on PUD level: split pud.*/
        __split_huge_pud(vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
        return VM_FAULT_FALLBACK;
}

/*
 * Handle a fault at PTE level.
 *
 * The PTE is first sampled without the lock and the fault is dispatched on
 * that snapshot: missing entry -> do_pte_missing(), non-present entry ->
 * do_swap_page(), protnone entry in an accessible VMA -> do_numa_page().
 * Otherwise the PTE is rechecked under vmf->ptl and either handed to
 * do_wp_page() (write/unshare on a non-writable entry) or updated in place
 * with the young (and, for writes, dirty) bits.
 *
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_lock may have been released depending on flags and our return value.
 * See filemap_fault() and __folio_lock_or_retry().
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
        pte_t entry;

        if (unlikely(pmd_none(*vmf->pmd))) {
                /*
                 * Leave __pte_alloc() until later: because vm_ops->fault may
                 * want to allocate huge page, and if we expose page table
                 * for an instant, it will be difficult to retract from
                 * concurrent faults and from rmap lookups.
                 */
                vmf->pte = NULL;
                vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
        } else {
                /*
                 * A regular pmd is established and it can't morph into a huge
                 * pmd by anon khugepaged, since that takes mmap_lock in write
                 * mode; but shmem or file collapse to THP could still morph
                 * it into a huge pmd: just retry later if so.
                 */
                vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
                                                 vmf->address, &vmf->ptl);
                if (unlikely(!vmf->pte))
                        return 0;
                /* Lockless snapshot; it is revalidated under ptl before use. */
                vmf->orig_pte = ptep_get_lockless(vmf->pte);
                vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;

                /* An empty entry is handled like a missing page table. */
                if (pte_none(vmf->orig_pte)) {
                        pte_unmap(vmf->pte);
                        vmf->pte = NULL;
                }
        }

        if (!vmf->pte)
                return do_pte_missing(vmf);

        if (!pte_present(vmf->orig_pte))
                return do_swap_page(vmf);

        if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
                return do_numa_page(vmf);

        spin_lock(vmf->ptl);
        entry = vmf->orig_pte;
        /* Somebody changed the PTE since the lockless read: nothing to do. */
        if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                goto unlock;
        }
        if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
                /*
                 * NOTE(review): this returns with vmf->ptl still held;
                 * do_wp_page() is presumably responsible for releasing it -
                 * confirm against its definition.
                 */
                if (!pte_write(entry))
                        return do_wp_page(vmf);
                else if (likely(vmf->flags & FAULT_FLAG_WRITE))
                        entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
        if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
                                vmf->flags & FAULT_FLAG_WRITE)) {
                update_mmu_cache_range(vmf, vmf->vma, vmf->address,
                                vmf->pte, 1);
        } else {
                /* Skip spurious TLB flush for retried page fault */
                if (vmf->flags & FAULT_FLAG_TRIED)
                        goto unlock;
                /*
                 * This is needed only for protection faults but the arch code
                 * is not yet telling us if this is a protection fault or not.
                 * This still avoids useless tlb flushes for .text page faults
                 * with threads.
                 */
                if (vmf->flags & FAULT_FLAG_WRITE)
                        flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
                                                     vmf->pte);
        }
unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return 0;
}

/*
 * Walk (allocating as needed) the page tables for @address and handle the
 * fault at the first level that can: a huge PUD, a huge PMD, or finally the
 * PTE level via handle_pte_fault().
 *
 * On entry, we hold either the VMA lock or the mmap_lock
 * (FAULT_FLAG_VMA_LOCK tells you which).  If VM_FAULT_RETRY is set in
 * the result, the mmap_lock is not held on exit.  See filemap_fault()
 * and __folio_lock_or_retry().
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
{
        /* .address is page aligned; .real_address keeps the exact address. */
        struct vm_fault vmf = {
                .vma = vma,
                .address = address & PAGE_MASK,
                .real_address = address,
                .flags = flags,
                .pgoff = linear_page_index(vma, address),
                .gfp_mask = __get_fault_gfp_mask(vma),
        };
        struct mm_struct *mm = vma->vm_mm;
        unsigned long vm_flags = vma->vm_flags;
        pgd_t *pgd;
        p4d_t *p4d;
        vm_fault_t ret;

        pgd = pgd_offset(mm, address);
        p4d = p4d_alloc(mm, pgd, address);
        if (!p4d)
                return VM_FAULT_OOM;

        vmf.pud = pud_alloc(mm, p4d, address);
        if (!vmf.pud)
                return VM_FAULT_OOM;
retry_pud:
        /* Empty PUD and a PUD-order THP is allowed: try to create one. */
        if (pud_none(*vmf.pud) &&
            thp_vma_allowable_order(vma, vm_flags,
                                TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
                ret = create_huge_pud(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                /* Work on a local snapshot of the PUD entry. */
                pud_t orig_pud = *vmf.pud;

                barrier();
                if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

                        /*
                         * TODO once we support anonymous PUDs: NUMA case and
                         * FAULT_FLAG_UNSHARE handling.
                         */
                        if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
                                ret = wp_huge_pud(&vmf, orig_pud);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
                                huge_pud_set_accessed(&vmf, orig_pud);
                                return 0;
                        }
                }
        }

        vmf.pmd = pmd_alloc(mm, vmf.pud, address);
        if (!vmf.pmd)
                return VM_FAULT_OOM;

        /* Huge pud page fault raced with pmd_alloc? */
        if (pud_trans_unstable(vmf.pud))
                goto retry_pud;

        /* Empty PMD and a PMD-order THP is allowed: try to create one. */
        if (pmd_none(*vmf.pmd) &&
            thp_vma_allowable_order(vma, vm_flags,
                                TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
                ret = create_huge_pmd(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                /* Lockless snapshot of the PMD, kept in vmf for the handlers. */
                vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);

                if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
                                          !is_pmd_migration_entry(vmf.orig_pmd));
                        /* Wait for the migration; the fault returns 0 either way. */
                        if (is_pmd_migration_entry(vmf.orig_pmd))
                                pmd_migration_entry_wait(mm, vmf.pmd);
                        return 0;
                }
                if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
                        if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
                                return do_huge_pmd_numa_page(&vmf);

                        if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
                            !pmd_write(vmf.orig_pmd)) {
                                ret = wp_huge_pmd(&vmf);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
                                huge_pmd_set_accessed(&vmf);
                                return 0;
                        }
                }
        }

        /* No huge mapping handled the fault: fall back to the PTE level. */
        return handle_pte_fault(&vmf);
}

/**
 * mm_account_fault - Do page fault accounting
 * @mm: mm from which memcg should be extracted. It can be NULL.
 * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
 *        of perf event counters, but we'll still do the per-task accounting to
 *        the task who triggered this page fault.
 * @address: the faulted address.
 * @flags: the fault flags.
 * @ret: the fault retcode.
 *
 * This will take care of most of the page fault accounting.  Meanwhile, it
 * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
 * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
 * still be in per-arch page fault handlers at the entry of page fault.
 */
static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs,
                                    unsigned long address, unsigned int flags,
                                    vm_fault_t ret)
{
        u32 perf_id;

        /* Incomplete faults will be accounted upon completion. */
        if (ret & VM_FAULT_RETRY)
                return;

        /*
         * To preserve the behavior of older kernels, PGFAULT counters record
         * both successful and failed faults, as opposed to perf counters,
         * which ignore failed cases.
         */
        count_vm_event(PGFAULT);
        count_memcg_event_mm(mm, PGFAULT);

        /*
         * Do not account for unsuccessful faults (e.g. when the address wasn't
         * valid).  That includes arch_vma_access_permitted() failing before
         * reaching here. So this is not a "this many hardware page faults"
         * counter.  We should use the hw profiling for that.
         */
        if (ret & VM_FAULT_ERROR)
                return;

        /*
         * We define the fault as a major fault when the final successful fault
         * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
         * handle it immediately previously).
         */
        if ((ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED)) {
                current->maj_flt++;
                perf_id = PERF_COUNT_SW_PAGE_FAULTS_MAJ;
        } else {
                current->min_flt++;
                perf_id = PERF_COUNT_SW_PAGE_FAULTS_MIN;
        }

        /*
         * If the fault is done for GUP, regs will be NULL.  We only do the
         * accounting for the per thread fault counters who triggered the
         * fault, and we skip the perf event updates.
         */
        if (regs)
                perf_sw_event(perf_id, 1, regs, address);
}

#ifndef CONFIG_LRU_GEN
/* Without CONFIG_LRU_GEN both fault hooks are empty. */
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
}

static void lru_gen_exit_fault(void)
{
}
#else
/* Flag the current task as being inside a fault on @vma, if recency applies. */
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
        /* the LRU algorithm only applies to accesses with recency */
        const bool has_recency = vma_has_recency(vma);

        current->in_lru_fault = has_recency;
}

/* Clear the flag set by lru_gen_enter_fault(). */
static void lru_gen_exit_fault(void)
{
        current->in_lru_fault = false;
}
#endif /* !CONFIG_LRU_GEN */

static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
                                       unsigned int *flags)
{
        if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
                if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
                        return VM_FAULT_SIGSEGV;
                /*
                 * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
                 * just treat it like an ordinary read-fault otherwise.
                 */
                if (!is_cow_mapping(vma->vm_flags))
                        *flags &= ~FAULT_FLAG_UNSHARE;
        } else if (*flags & FAULT_FLAG_WRITE) {
                /* Write faults on read-only mappings are impossible ... */
                if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
                        return VM_FAULT_SIGSEGV;
                /* ... and FOLL_FORCE only applies to COW mappings. */
                if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
                                 !is_cow_mapping(vma->vm_flags)))
                        return VM_FAULT_SIGSEGV;
        }
#ifdef CONFIG_PER_VMA_LOCK
        /*
         * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of
         * the assumption that lock is dropped on VM_FAULT_RETRY.
         */
        if (WARN_ON_ONCE((*flags &
                        (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) ==
                        (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)))
                return VM_FAULT_SIGSEGV;
#endif

        return 0;
}

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __folio_lock_or_retry().
 */
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                           unsigned int flags, struct pt_regs *regs)
{
        /* If the fault handler drops the mmap_lock, vma may be freed */
        struct mm_struct *mm = vma->vm_mm;
        vm_fault_t ret;

        __set_current_state(TASK_RUNNING);

        ret = sanitize_fault_flags(vma, &flags);
        if (ret)
                goto out;

        if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                                            flags & FAULT_FLAG_INSTRUCTION,
                                            flags & FAULT_FLAG_REMOTE)) {
                ret = VM_FAULT_SIGSEGV;
                goto out;
        }

        /*
         * Enable the memcg OOM handling for faults triggered in user
         * space.  Kernel faults are handled more gracefully.
         */
        if (flags & FAULT_FLAG_USER)
                mem_cgroup_enter_user_fault();

        lru_gen_enter_fault(vma);

        if (unlikely(is_vm_hugetlb_page(vma)))
                ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
        else
                ret = __handle_mm_fault(vma, address, flags);

        lru_gen_exit_fault();

        if (flags & FAULT_FLAG_USER) {
                mem_cgroup_exit_user_fault();
                /*
                 * The task may have entered a memcg OOM situation but
                 * if the allocation error was handled gracefully (no
                 * VM_FAULT_OOM), there is no need to kill anything.
                 * Just clean up the OOM state peacefully.
                 */
                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
                        mem_cgroup_oom_synchronize(false);
        }
out:
        mm_account_fault(mm, regs, address, flags, ret);

        return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

/*
 * Take the mmap lock for reading.  Returns true with the lock held, false
 * if it was not taken.
 */
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        /* Uncontended case: no need to look at regs at all. */
        if (likely(mmap_read_trylock(mm)))
                return true;

        /*
         * For a fault that did not come from user mode, refuse to block
         * unless the faulting instruction has an exception table entry.
         */
        if (regs && !user_mode(regs) &&
            !search_exception_tables(exception_ip(regs)))
                return false;

        /* Blocking acquisition; a non-zero result means we were killed. */
        return mmap_read_lock_killable(mm) == 0;
}

/*
 * Placeholder for an atomic read->write upgrade of the mmap lock.
 * Always reports failure, so callers must take their drop-and-retake path.
 */
static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
        /*
         * We don't have this operation yet.
         *
         * It should be easy enough to do: it's basically a
         *    atomic_long_try_cmpxchg_acquire()
         * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
         * it also needs the proper lockdep magic etc.
         */
        return false;
}

/*
 * Swap a held mmap read lock for the write lock.  The read lock is always
 * released; returns true only if the write lock was then acquired.
 */
static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        /* Not atomic: the read lock is dropped before the write lock is taken. */
        mmap_read_unlock(mm);

        /*
         * As when first taking the lock, a fault that is not from user mode
         * must have an exception table entry before we are willing to block.
         */
        if (regs && !user_mode(regs) &&
            !search_exception_tables(exception_ip(regs)))
                return false;

        return mmap_write_lock_killable(mm) == 0;
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 *
 * Returns the vma covering @addr with the mmap lock held for reading
 * (either taken directly or downgraded from the write lock), or NULL
 * with the mmap lock not held.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                        unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *vma;

        /* On failure no lock is held, so there is nothing to undo. */
        if (!get_mmap_lock_carefully(mm, regs))
                return NULL;

        /* Common case: a vma exists and already starts at or below addr. */
        vma = find_vma(mm, addr);
        if (likely(vma && (vma->vm_start <= addr)))
                return vma;

        /*
         * Well, dang. We might still be successful, but only
         * if we can extend a vma to do so.
         */
        if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
                mmap_read_unlock(mm);
                return NULL;
        }

        /*
         * We can try to upgrade the mmap lock atomically,
         * in which case we can continue to use the vma
         * we already looked up.
         *
         * Otherwise we'll have to drop the mmap lock and
         * re-take it, and also look up the vma again,
         * re-checking it.
         */
        if (!mmap_upgrade_trylock(mm)) {
                /* The helper drops the read lock itself, even when it fails. */
                if (!upgrade_mmap_lock_carefully(mm, regs))
                        return NULL;

                /* From here on the write lock is held; redo the lookup. */
                vma = find_vma(mm, addr);
                if (!vma)
                        goto fail;
                /* The new lookup covers addr already: no expansion needed. */
                if (vma->vm_start <= addr)
                        goto success;
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto fail;
        }

        /* A non-zero result from the expansion is treated as failure. */
        if (expand_stack_locked(vma, addr))
                goto fail;

success:
        /* Callers get the lock back in read mode. */
        mmap_write_downgrade(mm);
        return vma;

fail:
        mmap_write_unlock(mm);
        return NULL;
}
#endif

#ifdef CONFIG_PER_VMA_LOCK
/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address)
{
        /* Maple tree walk state over mm->mm_mt, positioned at @address. */
        MA_STATE(mas, &mm->mm_mt, address, address);
        struct vm_area_struct *vma;

        rcu_read_lock();
retry:
        vma = mas_walk(&mas);
        if (!vma)
                goto inval;

        /* Could not take the per-VMA read lock: give up without unlocking it. */
        if (!vma_start_read(vma))
                goto inval;

        /* Check since vm_start/vm_end might change before we lock the VMA */
        if (unlikely(address < vma->vm_start || address >= vma->vm_end))
                goto inval_end_read;

        /* Check if the VMA got isolated after we found it */
        if (vma->detached) {
                vma_end_read(vma);
                count_vm_vma_lock_event(VMA_LOCK_MISS);
                /* The area was replaced with another one */
                goto retry;
        }

        /* Success: the VMA read lock stays held after leaving the RCU section. */
        rcu_read_unlock();
        return vma;

inval_end_read:
        /* Release the VMA read lock taken above before reporting failure. */
        vma_end_read(vma);
inval:
        rcu_read_unlock();
        count_vm_vma_lock_event(VMA_LOCK_ABORT);
        return NULL;
}
#endif /* CONFIG_PER_VMA_LOCK */

#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
        p4d_t *table = p4d_alloc_one(mm, address);

        if (!table)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (!pgd_present(*pgd)) {
                smp_wmb(); /* See comment in pmd_install() */
                pgd_populate(mm, pgd, table);
        } else {
                /* Somebody else populated the entry first; drop our table. */
                p4d_free(mm, table);
        }
        spin_unlock(&mm->page_table_lock);

        /* Either way the pgd entry is populated now. */
        return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
        pud_t *table = pud_alloc_one(mm, address);

        if (!table)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (p4d_present(*p4d)) {
                /* Somebody else populated the entry first; drop our table. */
                pud_free(mm, table);
        } else {
                mm_inc_nr_puds(mm);
                smp_wmb(); /* See comment in pmd_install() */
                p4d_populate(mm, p4d, table);
        }
        spin_unlock(&mm->page_table_lock);

        /* Either way the p4d entry is populated now. */
        return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        pmd_t *table = pmd_alloc_one(mm, address);
        spinlock_t *lock;

        if (!table)
                return -ENOMEM;

        lock = pud_lock(mm, pud);
        if (pud_present(*pud)) {
                /* Somebody else populated the entry first; drop our table. */
                pmd_free(mm, table);
        } else {
                mm_inc_nr_pmds(mm);
                smp_wmb(); /* See comment in pmd_install() */
                pud_populate(mm, pud, table);
        }
        spin_unlock(lock);

        /* Either way the pud entry is populated now. */
        return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */

/**
 * follow_pte - look up PTE at a user virtual address
 * @vma: the memory mapping
 * @address: user virtual address
 * @ptepp: location to store found PTE
 * @ptlp: location to store the lock for the PTE
 *
 * On a successful return, the pointer to the PTE is stored in @ptepp;
 * the corresponding lock is taken and its location is stored in @ptlp.
 *
 * The contents of the PTE are only stable until @ptlp is released using
 * pte_unmap_unlock(). This function will fail if the PTE is non-present.
 * Present PTEs may include PTEs that map refcounted pages, such as
 * anonymous folios in COW mappings.
 *
 * Callers must be careful when relying on PTE content after
 * pte_unmap_unlock(). Especially if the PTE maps a refcounted page,
 * callers must protect against invalidation with MMU notifiers; otherwise
 * access to the PFN at a later point in time can trigger use-after-free.
 *
 * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
 * should be taken for read.
 *
 * This function must not be used to modify PTE content.
 *
 * Return: zero on success, -ve otherwise.
 */
int follow_pte(struct vm_area_struct *vma, unsigned long address,
               pte_t **ptepp, spinlock_t **ptlp)
{
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        mmap_assert_locked(mm);

        /* The address has to lie inside the vma we were handed. */
        if (unlikely(address < vma->vm_start || address >= vma->vm_end))
                return -EINVAL;

        /* Only IO and raw PFN mappings are supported. */
        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
                return -EINVAL;

        /* Walk down the page table levels, bailing out on any hole. */
        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
                return -EINVAL;

        p4d = p4d_offset(pgd, address);
        if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
                return -EINVAL;

        pud = pud_offset(p4d, address);
        if (pud_none(*pud) || unlikely(pud_bad(*pud)))
                return -EINVAL;

        pmd = pmd_offset(pud, address);
        VM_BUG_ON(pmd_trans_huge(*pmd));

        /* On success this leaves the PTE mapped and *ptlp locked. */
        pte = pte_offset_map_lock(mm, pmd, address, ptlp);
        if (!pte)
                return -EINVAL;

        if (!pte_present(ptep_get(pte))) {
                /* Non-present entries are a failure; undo the map+lock. */
                pte_unmap_unlock(pte, *ptlp);
                return -EINVAL;
        }

        *ptepp = pte;
        return 0;
}
EXPORT_SYMBOL_GPL(follow_pte);

#ifdef CONFIG_HAVE_IOREMAP_PROT
/**
 * generic_access_phys - generic implementation for iomem mmap access
 * @vma: the vma to access
 * @addr: userspace address, not relative offset within @vma
 * @buf: buffer to read/write
 * @len: length of transfer
 * @write: set to FOLL_WRITE when writing, otherwise reading
 *
 * This is a generic implementation for &vm_operations_struct.access for an
 * iomem mapping. This callback is used by access_process_vm() when the @vma is
 * not page based.
 */
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write)
{
        resource_size_t phys_addr;
        unsigned long prot = 0;
        void __iomem *maddr;
        pte_t *ptep, pte;
        spinlock_t *ptl;
        int offset = offset_in_page(addr);
        /* Result if the second follow_pte() below fails. */
        int ret = -EINVAL;

retry:
        /* Snapshot the PTE, then drop the PTE lock before mapping. */
        if (follow_pte(vma, addr, &ptep, &ptl))
                return -EINVAL;
        pte = ptep_get(ptep);
        pte_unmap_unlock(ptep, ptl);

        /* Derive protection bits and physical address from the snapshot. */
        prot = pgprot_val(pte_pgprot(pte));
        phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;

        /* Refuse to write through a PTE that is not writable. */
        if ((write & FOLL_WRITE) && !pte_write(pte))
                return -EINVAL;

        maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
        if (!maddr)
                return -ENOMEM;

        /* Look the PTE up again, this time keeping the lock for the copy. */
        if (follow_pte(vma, addr, &ptep, &ptl))
                goto out_unmap;

        /* The PTE changed since the snapshot: undo the mapping and restart. */
        if (!pte_same(pte, ptep_get(ptep))) {
                pte_unmap_unlock(ptep, ptl);
                iounmap(maddr);

                goto retry;
        }

        /* Transfer the data while the PTE lock is still held. */
        if (write)
                memcpy_toio(maddr + offset, buf, len);
        else
                memcpy_fromio(buf, maddr + offset, len);
        /* Success is reported as the full requested length. */
        ret = len;
        pte_unmap_unlock(ptep, ptl);
out_unmap:
        iounmap(maddr);

        return ret;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif

/*
 * Access another process' address space as given in mm.
 *
 * Transfers up to @len bytes between @buf and the address range starting
 * at @addr in @mm; the direction is selected by FOLL_WRITE in @gup_flags.
 * Returns the number of bytes actually transferred, which may be less than
 * @len (including 0) if a page could not be accessed.
 */
static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
                              void *buf, int len, unsigned int gup_flags)
{
        /* Start of the caller's buffer, used to compute the byte count. */
        void *old_buf = buf;
        int write = gup_flags & FOLL_WRITE;

        /* Failing to get the lock is reported as "nothing transferred". */
        if (mmap_read_lock_killable(mm))
                return 0;

        /* Untag the address before looking up the VMA */
        addr = untagged_addr_remote(mm, addr);

        /* Avoid triggering the temporary warning in __get_user_pages */
        /*
         * NOTE(review): this returns without unlocking; presumably
         * expand_stack() has already dropped mmap_lock when it fails, as
         * noted in the loop below - confirm against expand_stack().
         */
        if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
                return 0;

        /* ignore errors, just check how much was successfully transferred */
        while (len) {
                int bytes, offset;
                void *maddr;
                struct vm_area_struct *vma = NULL;
                struct page *page = get_user_page_vma_remote(mm, addr,
                                                             gup_flags, &vma);

                if (IS_ERR(page)) {
                        /* We might need to expand the stack to access it */
                        vma = vma_lookup(mm, addr);
                        if (!vma) {
                                vma = expand_stack(mm, addr);

                                /* mmap_lock was dropped on failure */
                                if (!vma)
                                        return buf - old_buf;

                                /* Try again if stack expansion worked */
                                continue;
                        }

                        /*
                         * Check if this is a VM_IO | VM_PFNMAP VMA, which
                         * we can access using slightly different code.
                         */
                        bytes = 0;
#ifdef CONFIG_HAVE_IOREMAP_PROT
                        if (vma->vm_ops && vma->vm_ops->access)
                                bytes = vma->vm_ops->access(vma, addr, buf,
                                                            len, write);
#endif
                        /* No ->access handler, or it made no progress: stop. */
                        if (bytes <= 0)
                                break;
                } else {
                        /* Clamp this round's transfer to the end of the page. */
                        bytes = len;
                        offset = addr & (PAGE_SIZE-1);
                        if (bytes > PAGE_SIZE-offset)
                                bytes = PAGE_SIZE-offset;

                        maddr = kmap_local_page(page);
                        if (write) {
                                copy_to_user_page(vma, page, addr,
                                                  maddr + offset, buf, bytes);
                                set_page_dirty_lock(page);
                        } else {
                                copy_from_user_page(vma, page, addr,
                                                    buf, maddr + offset, bytes);
                        }
                        unmap_and_put_page(page, maddr);
                }
                /* Advance past what was transferred in this round. */
                len -= bytes;
                buf += bytes;
                addr += bytes;
        }
        mmap_read_unlock(mm);

        return buf - old_buf;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:                the mm_struct of the target address space
 * @addr:        start address to access
 * @buf:        source or destination buffer
 * @len:        number of bytes to transfer
 * @gup_flags:        flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags)
{
        /* Thin non-static wrapper: all the work is done by the helper. */
        return __access_remote_vm(mm, addr, buf, len, gup_flags);
}

/*
 * Access another process' address space.
 * Source/target buffer must be kernel space,
 * Do not walk the page table directly, use get_user_pages
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
                void *buf, int len, unsigned int gup_flags)
{
        struct mm_struct *mm = get_task_mm(tsk);
        int copied;

        /* No mm to access: nothing can be transferred. */
        if (!mm)
                return 0;

        copied = __access_remote_vm(mm, addr, buf, len, gup_flags);

        /* Drop the reference obtained by get_task_mm(). */
        mmput(mm);

        return copied;
}
EXPORT_SYMBOL_GPL(access_process_vm);

/*
 * Print the name of a VMA.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;

        /*
         * This may be called from atomic context, so sleeping on the lock
         * is not an option; if it is contended, print nothing.
         */
        if (!mmap_read_trylock(mm))
                return;

        vma = vma_lookup(mm, ip);
        if (vma && vma->vm_file) {
                unsigned long start = vma->vm_start;
                /* Translate ip from a virtual address into a file offset. */
                unsigned long file_off = ip - start +
                                         (vma->vm_pgoff << PAGE_SHIFT);

                printk("%s%pD[%lx,%lx+%lx]", prefix, vma->vm_file, file_off,
                                start,
                                vma->vm_end - start);
        }
        mmap_read_unlock(mm);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
        /* Nothing to check while page faults are disabled. */
        if (!pagefault_disabled()) {
                __might_sleep(file, line);
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
                /* Annotate that the mmap lock may be taken for reading. */
                if (current->mm)
                        might_lock_read(&current->mm->mmap_lock);
#endif
        }
}
EXPORT_SYMBOL(__might_fault);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 *
 * @addr_hint: an address inside the huge page; the subpage containing it
 *             is the target subpage
 * @pages_per_huge_page: number of subpages in the huge page
 * @process_subpage: callback invoked once per subpage with the subpage's
 *                   address, its index and @arg
 * @arg: opaque pointer handed through to @process_subpage
 *
 * Returns 0 when every callback returned 0; otherwise stops at the first
 * non-zero callback result and returns it.
 */
static inline int process_huge_page(
        unsigned long addr_hint, unsigned int pages_per_huge_page,
        int (*process_subpage)(unsigned long addr, int idx, void *arg),
        void *arg)
{
        int i, n, base, l, ret;
        /* Round addr_hint down to the start of the huge page. */
        unsigned long addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

        /* Process target subpage last to keep its cache lines hot */
        might_sleep();
        /* n is the index of the target subpage within the huge page. */
        n = (addr_hint - addr) / PAGE_SIZE;
        if (2 * n <= pages_per_huge_page) {
                /* If target subpage in first half of huge page */
                /* The window [0, 2n) is left for the converging loop below. */
                base = 0;
                l = n;
                /* Process subpages at the end of huge page */
                for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
                        cond_resched();
                        ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
                        if (ret)
                                return ret;
                }
        } else {
                /* If target subpage in second half of huge page */
                /* The window [base, pages_per_huge_page) is left for below. */
                base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
                l = pages_per_huge_page - n;
                /* Process subpages at the begin of huge page */
                for (i = 0; i < base; i++) {
                        cond_resched();
                        ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
                        if (ret)
                                return ret;
                }
        }
        /*
         * Process remaining subpages in left-right-left-right pattern
         * towards the target subpage
         */
        /*
         * The window is [base, base + 2 * l) and base + l == n in both
         * cases above, so the last right_idx visited is the target.
         */
        for (i = 0; i < l; i++) {
                int left_idx = base + i;
                int right_idx = base + 2 * l - 1 - i;

                cond_resched();
                ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
                if (ret)
                        return ret;
                cond_resched();
                ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
                if (ret)
                        return ret;
        }
        return 0;
}

/*
 * Clear every subpage of a huge page in plain ascending order, giving the
 * scheduler a chance to run before each one.
 */
static void clear_gigantic_page(struct page *page,
                                unsigned long addr,
                                unsigned int pages_per_huge_page)
{
        int idx;

        might_sleep();
        for (idx = 0; idx < pages_per_huge_page; idx++) {
                cond_resched();
                clear_user_highpage(nth_page(page, idx),
                                    addr + idx * PAGE_SIZE);
        }
}

/* process_huge_page() callback: clear subpage @idx of the page passed in @arg. */
static int clear_subpage(unsigned long addr, int idx, void *arg)
{
        struct page *first = arg;
        struct page *target = nth_page(first, idx);

        clear_user_highpage(target, addr);

        /* Clearing cannot fail, so the walk always continues. */
        return 0;
}

/*
 * Clear all subpages of the huge page starting at @page.  @addr_hint is an
 * address inside the huge page.
 */
void clear_huge_page(struct page *page,
                     unsigned long addr_hint, unsigned int pages_per_huge_page)
{
        /* Start address of the huge page that contains addr_hint. */
        unsigned long base_addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);

        if (likely(pages_per_huge_page <= MAX_ORDER_NR_PAGES)) {
                /* Usual case: clear so that the hinted subpage comes last. */
                process_huge_page(addr_hint, pages_per_huge_page,
                                  clear_subpage, page);
                return;
        }

        /* More than MAX_ORDER_NR_PAGES subpages: plain linear clear. */
        clear_gigantic_page(page, base_addr, pages_per_huge_page);
}

/*
 * Copy @src to @dst one subpage at a time in ascending order.  Returns 0 on
 * success or -EHWPOISON as soon as copying a subpage fails.
 */
static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
                                     unsigned long addr,
                                     struct vm_area_struct *vma,
                                     unsigned int pages_per_huge_page)
{
        int idx;

        for (idx = 0; idx < pages_per_huge_page; idx++) {
                struct page *to = folio_page(dst, idx);
                struct page *from = folio_page(src, idx);

                cond_resched();
                if (!copy_mc_user_highpage(to, from,
                                           addr + idx*PAGE_SIZE, vma))
                        continue;

                /* The copy failed: queue the source pfn and give up. */
                memory_failure_queue(page_to_pfn(from), 0);
                return -EHWPOISON;
        }
        return 0;
}

/* Context handed to copy_subpage() through process_huge_page()'s arg pointer. */
struct copy_subpage_arg {
        struct page *dst;               /* first page of the destination */
        struct page *src;               /* first page of the source */
        struct vm_area_struct *vma;     /* passed on to copy_mc_user_highpage() */
};

/*
 * process_huge_page() callback: copy subpage @idx from source to destination
 * as described by the struct copy_subpage_arg in @arg.
 */
static int copy_subpage(unsigned long addr, int idx, void *arg)
{
        struct copy_subpage_arg *ctx = arg;
        struct page *to = nth_page(ctx->dst, idx);
        struct page *from = nth_page(ctx->src, idx);

        if (!copy_mc_user_highpage(to, from, addr, ctx->vma))
                return 0;

        /* The copy failed: queue the source pfn and abort the walk. */
        memory_failure_queue(page_to_pfn(from), 0);
        return -EHWPOISON;
}

/*
 * Copy the contents of folio @src into folio @dst.  @addr_hint is an address
 * inside the folio.  Returns 0 on success, or the error reported by the
 * per-subpage copy.
 */
int copy_user_large_folio(struct folio *dst, struct folio *src,
                          unsigned long addr_hint, struct vm_area_struct *vma)
{
        unsigned int pages_per_huge_page = folio_nr_pages(dst);
        /* Start address of the huge page that contains addr_hint. */
        unsigned long base_addr = addr_hint &
                ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
        struct copy_subpage_arg arg = {
                .dst = &dst->page,
                .src = &src->page,
                .vma = vma,
        };

        /* Usual case: copy so that the hinted subpage comes last. */
        if (likely(pages_per_huge_page <= MAX_ORDER_NR_PAGES))
                return process_huge_page(addr_hint, pages_per_huge_page,
                                         copy_subpage, &arg);

        /* More than MAX_ORDER_NR_PAGES subpages: plain linear copy. */
        return copy_user_gigantic_page(dst, src, base_addr, vma,
                                       pages_per_huge_page);
}

/*
 * Fill @dst_folio page by page from the user buffer @usr_src.  When
 * @allow_pagefault is false each copy runs with page faults disabled.
 *
 * Returns the number of bytes that were NOT copied (0 when the whole folio
 * was filled); copying stops at the first page that is only partly copied.
 */
long copy_folio_from_user(struct folio *dst_folio,
                           const void __user *usr_src,
                           bool allow_pagefault)
{
        unsigned int nr_pages = folio_nr_pages(dst_folio);
        unsigned long remaining = nr_pages * PAGE_SIZE;
        unsigned long idx;

        for (idx = 0; idx < nr_pages; idx++) {
                struct page *subpage = folio_page(dst_folio, idx);
                void *kaddr = kmap_local_page(subpage);
                unsigned long uncopied;

                if (!allow_pagefault)
                        pagefault_disable();
                uncopied = copy_from_user(kaddr, usr_src + idx * PAGE_SIZE,
                                          PAGE_SIZE);
                if (!allow_pagefault)
                        pagefault_enable();
                kunmap_local(kaddr);

                /* Credit whatever part of this page did get copied. */
                remaining -= (PAGE_SIZE - uncopied);
                if (uncopied)
                        break;

                flush_dcache_page(subpage);

                cond_resched();
        }
        return remaining;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS

static struct kmem_cache *page_ptl_cachep;

/*
 * Create the slab cache that ptlock_alloc() and ptlock_free() use for
 * spinlock_t objects.  SLAB_PANIC is passed, so the result is not checked here.
 */
void __init ptlock_cache_init(void)
{
        page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
                        SLAB_PANIC, NULL);
}

bool ptlock_alloc(struct ptdesc *ptdesc)
{
        spinlock_t *ptl;

        ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
        if (!ptl)
                return false;
        ptdesc->ptl = ptl;
        return true;
}

/* Return the spinlock attached to @ptdesc to page_ptl_cachep. */
void ptlock_free(struct ptdesc *ptdesc)
{
        kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif

/* Before a page table walk: hugetlb vmas need their vma lock taken for read. */
void vma_pgtable_walk_begin(struct vm_area_struct *vma)
{
        if (!is_vm_hugetlb_page(vma))
                return;

        hugetlb_vma_lock_read(vma);
}

/* After a page table walk: release the read lock taken for hugetlb vmas. */
void vma_pgtable_walk_end(struct vm_area_struct *vma)
{
        if (!is_vm_hugetlb_page(vma))
                return;

        hugetlb_vma_unlock_read(vma);
}























































































































































































































































































































































































































































































    1 
    1 

    1 

    1 

    1 

    1 

    1 













    1 



    1 

    1 

    1 

    1 

    1 

    1 

    1 

























































    1 
























    1 























    1 





    1 


    1 



    1 







    1 





    1 

    1 






















    1 





















    1 

    1 



























































    1 



    1 













































    1 






































































    1 
















    1 












    1 



    1 












































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ioctl.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/compat.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/falloc.h>
#include <linux/sched/signal.h>
#include <linux/fiemap.h>
#include <linux/mount.h>
#include <linux/fscrypt.h>
#include <linux/fileattr.h>

#include "internal.h"

#include <asm/ioctls.h>

/*
 * Largest extent count ioctl_fiemap() accepts from userspace.
 *
 * So that the fiemap access checks can't overflow on 32 bit machines: with
 * at most this many extents, count * sizeof(struct fiemap_extent) stays
 * within UINT_MAX.
 */
#define FIEMAP_MAX_EXTENTS        (UINT_MAX / sizeof(struct fiemap_extent))

/**
 * vfs_ioctl - call filesystem specific ioctl methods
 * @filp:        open file to invoke ioctl method on
 * @cmd:        ioctl command to execute
 * @arg:        command-specific argument for ioctl
 *
 * Dispatches to the file's ->unlocked_ioctl method.  The method's result is
 * held in an int before being returned, and -ENOIOCTLCMD is translated to
 * -ENOTTY.
 *
 * Returns the (translated) result of ->unlocked_ioctl, or -ENOTTY when the
 * file has no such method.
 */
long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        int ret;

        if (!filp->f_op->unlocked_ioctl)
                return -ENOTTY;

        ret = filp->f_op->unlocked_ioctl(filp, cmd, arg);
        return ret == -ENOIOCTLCMD ? -ENOTTY : ret;
}
EXPORT_SYMBOL(vfs_ioctl);

/*
 * FIBMAP: look up a block of the file through bmap().
 *
 * The int at @p is read as the block to look up and is overwritten with the
 * answer.  If bmap() fails, or the answer does not fit in an int, 0 is
 * stored instead and the error is returned.  Requires CAP_SYS_RAWIO.
 */
static int ioctl_fibmap(struct file *filp, int __user *p)
{
        struct inode *inode = file_inode(filp);
        sector_t blk;
        int user_blk;
        int err;

        if (!capable(CAP_SYS_RAWIO))
                return -EPERM;

        err = get_user(user_blk, p);
        if (err)
                return err;
        if (user_blk < 0)
                return -EINVAL;

        blk = user_blk;
        err = bmap(inode, &blk);

        /* The answer travels back through an int, so it has to fit in one. */
        if (blk > INT_MAX) {
                err = -ERANGE;
                pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n",
                                    current->comm, task_pid_nr(current),
                                    inode->i_sb->s_id, filp);
        }

        user_blk = err ? 0 : blk;
        if (put_user(user_blk, p))
                err = -EFAULT;

        return err;
}

/**
 * fiemap_fill_next_extent - Fiemap helper function
 * @fieinfo:        Fiemap context passed into ->fiemap
 * @logical:        Extent logical start offset, in bytes
 * @phys:        Extent physical start offset, in bytes
 * @len:        Extent length, in bytes
 * @flags:        FIEMAP_EXTENT flags that describe this extent
 *
 * Meant to be called from a file system's ->fiemap callback.  When the
 * caller provided room for extents (fi_extents_max != 0) the extent is
 * written to the next free slot of the user array and fi_extents_mapped is
 * advanced; with fi_extents_max == 0 the extent is only counted.
 *
 * Returns 0 on success, -EFAULT if the user array cannot be written, and 1
 * if the user array is (now) full or @flags carries FIEMAP_EXTENT_LAST.
 */
int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
                            u64 phys, u64 len, u32 flags)
{
        struct fiemap_extent __user *slot;
        struct fiemap_extent ext;

        /* No user array: just tally the extent. */
        if (!fieinfo->fi_extents_max) {
                fieinfo->fi_extents_mapped++;
                return !!(flags & FIEMAP_EXTENT_LAST);
        }

        if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
                return 1;

#define SET_UNKNOWN_FLAGS        (FIEMAP_EXTENT_DELALLOC)
#define SET_NO_UNMOUNTED_IO_FLAGS        (FIEMAP_EXTENT_DATA_ENCRYPTED)
#define SET_NOT_ALIGNED_FLAGS        (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)

        /* Some flags imply a more general one; add the implied flag. */
        if (flags & SET_UNKNOWN_FLAGS)
                flags |= FIEMAP_EXTENT_UNKNOWN;
        if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
                flags |= FIEMAP_EXTENT_ENCODED;
        if (flags & SET_NOT_ALIGNED_FLAGS)
                flags |= FIEMAP_EXTENT_NOT_ALIGNED;

        /* Clear the whole record first so unset bytes reach the user as 0. */
        memset(&ext, 0, sizeof(ext));
        ext.fe_flags = flags;
        ext.fe_length = len;
        ext.fe_physical = phys;
        ext.fe_logical = logical;

        slot = fieinfo->fi_extents_start + fieinfo->fi_extents_mapped;
        if (copy_to_user(slot, &ext, sizeof(ext)))
                return -EFAULT;

        if (++fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
                return 1;
        return !!(flags & FIEMAP_EXTENT_LAST);
}
EXPORT_SYMBOL(fiemap_fill_next_extent);

/**
 * fiemap_prep - check validity of requested flags for fiemap
 * @inode:        Inode to operate on
 * @fieinfo:        Fiemap context passed into ->fiemap
 * @start:        Start of the mapped range
 * @len:        Length of the mapped range, can be truncated by this function.
 * @supported_flags:        Set of fiemap flags that the file system understands
 *
 * This function must be called from each ->fiemap instance to validate the
 * fiemap request against the file system parameters.  The range is clamped
 * to the superblock's s_maxbytes, and FIEMAP_FLAG_SYNC is always treated as
 * supported: when requested, the inode's mapping is written back and waited
 * on here.
 *
 * Returns 0 on success, or a negative error on failure.  On -EBADR,
 * @fieinfo->fi_flags is overwritten with the flags that were rejected.
 */
int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,
                u64 start, u64 *len, u32 supported_flags)
{
        const u64 limit = inode->i_sb->s_maxbytes;
        u32 allowed;
        u32 rejected;

        if (!*len)
                return -EINVAL;
        if (start >= limit)
                return -EFBIG;

        /* Trim the request so that it ends no later than the fs limit. */
        if (*len > limit || (limit - *len) < start)
                *len = limit - start;

        allowed = (supported_flags | FIEMAP_FLAG_SYNC) & FIEMAP_FLAGS_COMPAT;
        rejected = fieinfo->fi_flags & ~allowed;
        if (rejected) {
                fieinfo->fi_flags = rejected;
                return -EBADR;
        }

        if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
                return 0;
        return filemap_write_and_wait(inode->i_mapping);
}
EXPORT_SYMBOL(fiemap_prep);

/*
 * FS_IOC_FIEMAP: pass the request in the user's struct fiemap to the inode's
 * ->fiemap method, then copy the header (flags and number of mapped extents)
 * back to userspace whether or not the method succeeded.
 */
static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
{
        struct inode *inode = file_inode(filp);
        struct fiemap_extent_info info = { 0 };
        struct fiemap req;
        int ret;

        if (!inode->i_op->fiemap)
                return -EOPNOTSUPP;
        if (copy_from_user(&req, ufiemap, sizeof(req)))
                return -EFAULT;
        if (req.fm_extent_count > FIEMAP_MAX_EXTENTS)
                return -EINVAL;

        info.fi_extents_start = ufiemap->fm_extents;
        info.fi_extents_max = req.fm_extent_count;
        info.fi_flags = req.fm_flags;

        ret = inode->i_op->fiemap(inode, &info, req.fm_start, req.fm_length);

        req.fm_mapped_extents = info.fi_extents_mapped;
        req.fm_flags = info.fi_flags;
        if (copy_to_user(ufiemap, &req, sizeof(req)))
                ret = -EFAULT;

        return ret;
}

/*
 * Clone @olen bytes at @off of the file behind descriptor @srcfd into
 * @dst_file at @destoff via vfs_clone_file_range().
 *
 * A non-zero @olen that was not cloned in full is reported as -EINVAL.
 */
static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
                             u64 off, u64 olen, u64 destoff)
{
        struct fd src = fdget(srcfd);
        loff_t copied;
        int ret = 0;

        if (!src.file)
                return -EBADF;

        copied = vfs_clone_file_range(src.file, off, dst_file, destoff,
                                      olen, 0);
        if (copied < 0)
                ret = copied;
        else if (olen && copied != olen)
                ret = -EINVAL;

        fdput(src);
        return ret;
}

/*
 * FICLONERANGE: fetch the struct file_clone_range from userspace and clone
 * the range it describes into @file.
 */
static long ioctl_file_clone_range(struct file *file,
                                   struct file_clone_range __user *argp)
{
        struct file_clone_range range;
        long ret;

        if (copy_from_user(&range, argp, sizeof(range)))
                return -EFAULT;

        ret = ioctl_file_clone(file, range.src_fd, range.src_offset,
                               range.src_length, range.dest_offset);
        return ret;
}

/*
 * Compatibility with the legacy XFS pre-allocation ioctls, which predate
 * the fallocate syscall.
 *
 * Of 'struct space_resv' only l_start, l_len and l_whence are looked at.
 * l_whence selects what l_start is relative to (start of file, current
 * position or end of file); the request is then passed to vfs_fallocate()
 * with FALLOC_FL_KEEP_SIZE added to @mode.
 */
static int ioctl_preallocate(struct file *filp, int mode, void __user *argp)
{
        struct space_resv sr;

        if (copy_from_user(&sr, argp, sizeof(sr)))
                return -EFAULT;

        if (sr.l_whence == SEEK_CUR)
                sr.l_start += filp->f_pos;
        else if (sr.l_whence == SEEK_END)
                sr.l_start += i_size_read(file_inode(filp));
        else if (sr.l_whence != SEEK_SET)
                return -EINVAL;

        return vfs_fallocate(filp, mode | FALLOC_FL_KEEP_SIZE, sr.l_start,
                        sr.l_len);
}

/* on ia32 l_start is on a 32-bit boundary */
#if defined CONFIG_COMPAT && defined(CONFIG_X86_64)
/* just account for different alignment */
/*
 * Same job as ioctl_preallocate(), but reads the request through the
 * 'struct space_resv_32' layout.
 */
static int compat_ioctl_preallocate(struct file *file, int mode,
                                    struct space_resv_32 __user *argp)
{
        struct space_resv_32 sr;

        if (copy_from_user(&sr, argp, sizeof(sr)))
                return -EFAULT;

        /* Turn l_start into an absolute offset. */
        switch (sr.l_whence) {
        case SEEK_END:
                sr.l_start += i_size_read(file_inode(file));
                break;
        case SEEK_CUR:
                sr.l_start += file->f_pos;
                break;
        case SEEK_SET:
                break;
        default:
                return -EINVAL;
        }

        return vfs_fallocate(file, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
}
#endif

/*
 * Commands that do_vfs_ioctl() only handles for regular files: FIBMAP and
 * the legacy space reservation ioctls.  Anything else gets -ENOIOCTLCMD.
 */
static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p)
{
        int mode;

        switch (cmd) {
        case FIBMAP:
                return ioctl_fibmap(filp, p);
        case FS_IOC_RESVSP:
        case FS_IOC_RESVSP64:
                mode = 0;
                break;
        case FS_IOC_UNRESVSP:
        case FS_IOC_UNRESVSP64:
                mode = FALLOC_FL_PUNCH_HOLE;
                break;
        case FS_IOC_ZERO_RANGE:
                mode = FALLOC_FL_ZERO_RANGE;
                break;
        default:
                return -ENOIOCTLCMD;
        }

        return ioctl_preallocate(filp, mode, p);
}

/*
 * FIONBIO: set (non-zero int at @argp) or clear (zero) O_NONBLOCK in the
 * file's f_flags, under f_lock.
 */
static int ioctl_fionbio(struct file *filp, int __user *argp)
{
        unsigned int mask = O_NONBLOCK;
        int enable;
        int err;

        err = get_user(enable, argp);
        if (err)
                return err;

#ifdef __sparc__
        /* SunOS compatibility item. */
        if (O_NONBLOCK != O_NDELAY)
                mask |= O_NDELAY;
#endif

        spin_lock(&filp->f_lock);
        if (enable)
                filp->f_flags |= mask;
        else
                filp->f_flags &= ~mask;
        spin_unlock(&filp->f_lock);

        return 0;
}

/*
 * FIOASYNC: ask the file's ->fasync method to switch FASYNC on or off.
 *
 * Nothing is done when the requested state matches what f_flags already
 * says.  A positive return from ->fasync is reported as 0.
 */
static int ioctl_fioasync(unsigned int fd, struct file *filp,
                          int __user *argp)
{
        unsigned int wanted;
        int on, ret;

        ret = get_user(on, argp);
        if (ret)
                return ret;

        wanted = on ? FASYNC : 0;
        if (!((wanted ^ filp->f_flags) & FASYNC))
                return 0;        /* FASYNC state is unchanged */

        if (!filp->f_op->fasync)
                return -ENOTTY;

        /* fasync() adjusts filp->f_flags */
        ret = filp->f_op->fasync(fd, filp, on);
        return ret < 0 ? ret : 0;
}

/*
 * FIFREEZE: freeze the filesystem that @filp lives on.
 *
 * The caller needs CAP_SYS_ADMIN in the superblock's user namespace.  A
 * filesystem-provided ->freeze_super is preferred; otherwise the generic
 * freeze_super() is used, provided the filesystem implements ->freeze_fs.
 */
static int ioctl_fsfreeze(struct file *filp)
{
        struct super_block *sb = file_inode(filp)->i_sb;

        if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (sb->s_op->freeze_super)
                return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);

        /* Neither hook present: the filesystem cannot be frozen. */
        if (!sb->s_op->freeze_fs)
                return -EOPNOTSUPP;

        return freeze_super(sb, FREEZE_HOLDER_USERSPACE);
}

/*
 * FITHAW: thaw the filesystem that @filp lives on, through the filesystem's
 * ->thaw_super if it has one and the generic thaw_super() otherwise.  The
 * caller needs CAP_SYS_ADMIN in the superblock's user namespace.
 */
static int ioctl_fsthaw(struct file *filp)
{
        struct super_block *sb = file_inode(filp)->i_sb;

        if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (!sb->s_op->thaw_super)
                return thaw_super(sb, FREEZE_HOLDER_USERSPACE);

        return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}

/*
 * FIDEDUPERANGE: copy in a struct file_dedupe_range together with its
 * dest_count trailing info[] entries, run vfs_dedupe_file_range() on it and,
 * on success, copy the whole structure back to userspace.
 *
 * Requests whose total size exceeds PAGE_SIZE are refused with -ENOMEM.
 */
static int ioctl_file_dedupe_range(struct file *file,
                                   struct file_dedupe_range __user *argp)
{
        struct file_dedupe_range *req;
        unsigned long size;
        u16 count;
        int ret;

        if (get_user(count, &argp->dest_count))
                return -EFAULT;

        size = offsetof(struct file_dedupe_range, info[count]);
        if (size > PAGE_SIZE)
                return -ENOMEM;

        req = memdup_user(argp, size);
        if (IS_ERR(req))
                return PTR_ERR(req);

        /* Use the count the size was computed from, not the re-read copy. */
        req->dest_count = count;

        ret = vfs_dedupe_file_range(file, req);
        if (!ret && copy_to_user(argp, req, size))
                ret = -EFAULT;

        kfree(req);
        return ret;
}

/**
 * fileattr_fill_xflags - initialize fileattr with xflags
 * @fa:                fileattr pointer
 * @xflags:        FS_XFLAG_* flags
 *
 * Zeroes @fa, then sets ->fsx_valid, stores @xflags in ->fsx_xflags and
 * fills ->flags with the FS_*_FL counterparts of the xflags handled below.
 */
void fileattr_fill_xflags(struct fileattr *fa, u32 xflags)
{
        u32 fl = 0;

        /* Translate each xflag that has an FS_*_FL equivalent. */
        if (xflags & FS_XFLAG_IMMUTABLE)
                fl |= FS_IMMUTABLE_FL;
        if (xflags & FS_XFLAG_APPEND)
                fl |= FS_APPEND_FL;
        if (xflags & FS_XFLAG_SYNC)
                fl |= FS_SYNC_FL;
        if (xflags & FS_XFLAG_NOATIME)
                fl |= FS_NOATIME_FL;
        if (xflags & FS_XFLAG_NODUMP)
                fl |= FS_NODUMP_FL;
        if (xflags & FS_XFLAG_DAX)
                fl |= FS_DAX_FL;
        if (xflags & FS_XFLAG_PROJINHERIT)
                fl |= FS_PROJINHERIT_FL;

        memset(fa, 0, sizeof(*fa));
        fa->fsx_valid = true;
        fa->fsx_xflags = xflags;
        fa->flags = fl;
}
EXPORT_SYMBOL(fileattr_fill_xflags);

/**
 * fileattr_fill_flags - initialize fileattr with flags
 * @fa:                fileattr pointer
 * @flags:        FS_*_FL flags
 *
 * Zeroes @fa, then sets ->flags_valid, stores @flags in ->flags and fills
 * ->fsx_xflags with the FS_XFLAG_* counterparts of the flags handled below.
 */
void fileattr_fill_flags(struct fileattr *fa, u32 flags)
{
        u32 xfl = 0;

        /* Translate each flag that has an FS_XFLAG_* equivalent. */
        if (flags & FS_SYNC_FL)
                xfl |= FS_XFLAG_SYNC;
        if (flags & FS_IMMUTABLE_FL)
                xfl |= FS_XFLAG_IMMUTABLE;
        if (flags & FS_APPEND_FL)
                xfl |= FS_XFLAG_APPEND;
        if (flags & FS_NODUMP_FL)
                xfl |= FS_XFLAG_NODUMP;
        if (flags & FS_NOATIME_FL)
                xfl |= FS_XFLAG_NOATIME;
        if (flags & FS_DAX_FL)
                xfl |= FS_XFLAG_DAX;
        if (flags & FS_PROJINHERIT_FL)
                xfl |= FS_XFLAG_PROJINHERIT;

        memset(fa, 0, sizeof(*fa));
        fa->flags_valid = true;
        fa->flags = flags;
        fa->fsx_xflags = xfl;
}
EXPORT_SYMBOL(fileattr_fill_flags);

/**
 * vfs_fileattr_get - retrieve miscellaneous file attributes
 * @dentry:        the object to retrieve from
 * @fa:                fileattr pointer
 *
 * Forwards to the inode's i_op->fileattr_get() callback.
 *
 * Return: the callback's result, or -ENOIOCTLCMD if the inode has no such
 * callback.
 */
int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);

        if (inode->i_op->fileattr_get)
                return inode->i_op->fileattr_get(dentry, fa);

        return -ENOIOCTLCMD;
}
EXPORT_SYMBOL(vfs_fileattr_get);

/**
 * copy_fsxattr_to_user - copy fsxattr to userspace.
 * @fa:                fileattr pointer
 * @ufa:        fsxattr user pointer
 *
 * Builds a struct fsxattr from the fsx_* members of @fa; everything else in
 * the structure handed to userspace is zero.
 *
 * Return: 0 on success, or -EFAULT on failure.
 */
int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
{
        struct fsxattr out;

        /* Zero the whole record so nothing unset is exposed to userspace. */
        memset(&out, 0, sizeof(out));
        out.fsx_cowextsize = fa->fsx_cowextsize;
        out.fsx_projid = fa->fsx_projid;
        out.fsx_nextents = fa->fsx_nextents;
        out.fsx_extsize = fa->fsx_extsize;
        out.fsx_xflags = fa->fsx_xflags;

        return copy_to_user(ufa, &out, sizeof(out)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(copy_fsxattr_to_user);

/*
 * Read a struct fsxattr from userspace and turn it into a fileattr:
 * fileattr_fill_xflags() initialises @fa from the xflags, after which the
 * remaining fsx_* values are carried over.
 *
 * Returns 0 on success, or -EFAULT if the user structure cannot be read.
 */
static int copy_fsxattr_from_user(struct fileattr *fa,
                                  struct fsxattr __user *ufa)
{
        struct fsxattr in;

        if (copy_from_user(&in, ufa, sizeof(in)))
                return -EFAULT;

        fileattr_fill_xflags(fa, in.fsx_xflags);
        fa->fsx_cowextsize = in.fsx_cowextsize;
        fa->fsx_projid = in.fsx_projid;
        fa->fsx_nextents = in.fsx_nextents;
        fa->fsx_extsize = in.fsx_extsize;

        return 0;
}

/*
 * Generic validation of FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values: compares
 * the requested attributes in @fa with the current ones in @old_ma and
 * rejects invalid configurations.  May clear extent-size xflags in @fa.
 *
 * Note: must be called with inode lock held.
 */
static int fileattr_set_prepare(struct inode *inode,
                              const struct fileattr *old_ma,
                              struct fileattr *fa)
{
        int ret;

        /*
         * Changing IMMUTABLE or APPEND_ONLY requires the relevant
         * capability; the capability is only checked if one of them changes.
         */
        if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
                if (!capable(CAP_LINUX_IMMUTABLE))
                        return -EPERM;
        }

        ret = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags);
        if (ret)
                return ret;

        /*
         * Project quota ID state may only change from within the init
         * namespace; everything else is allowed in user namespaces.  In the
         * init namespace a changed project ID must be a valid one.
         */
        if (current_user_ns() == &init_user_ns) {
                if (old_ma->fsx_projid != fa->fsx_projid &&
                    !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid)))
                        return -EINVAL;
        } else if (old_ma->fsx_projid != fa->fsx_projid ||
                   ((old_ma->fsx_xflags ^ fa->fsx_xflags) &
                                FS_XFLAG_PROJINHERIT)) {
                return -EINVAL;
        }

        /* Extent size hint: regular files only. */
        if (!S_ISREG(inode->i_mode) && (fa->fsx_xflags & FS_XFLAG_EXTSIZE))
                return -EINVAL;

        /* Inherited extent size hint: directories only. */
        if (!S_ISDIR(inode->i_mode) &&
                        (fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT))
                return -EINVAL;

        /* CoW extent size hint: regular files and directories. */
        if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
            !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
                return -EINVAL;

        /*
         * It is only valid to set the DAX flag on regular files and
         * directories on filesystems.
         */
        if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
            !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
                return -EINVAL;

        /* Extent size hints of zero turn off the flags. */
        if (!fa->fsx_extsize)
                fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
        if (!fa->fsx_cowextsize)
                fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;

        return 0;
}

/**
 * vfs_fileattr_set - change miscellaneous file attributes
 * @idmap:        idmap of the mount
 * @dentry:        the object to change
 * @fa:                fileattr pointer
 *
 * After verifying permissions, call i_op->fileattr_set() callback, if
 * exists.
 *
 * Verification starts by fetching the current attributes with
 * i_op->fileattr_get(); whichever half of @fa the caller did not supply
 * (the fsx_* fields for a flags request, the non-common flags for an xflags
 * request) is then filled in from those current values.  The inode lock is
 * held throughout to prevent racing with another instance.
 *
 * Return: 0 on success, or a negative error on failure.
 */
int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
                     struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);
        struct fileattr old_ma = {};
        int err;

        if (!inode->i_op->fileattr_set)
                return -ENOIOCTLCMD;

        if (!inode_owner_or_capable(idmap, inode))
                return -EPERM;

        inode_lock(inode);

        err = vfs_fileattr_get(dentry, &old_ma);
        if (err)
                goto unlock;

        /* initialize missing bits from old_ma */
        if (fa->flags_valid) {
                fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON;
                fa->fsx_extsize = old_ma.fsx_extsize;
                fa->fsx_nextents = old_ma.fsx_nextents;
                fa->fsx_projid = old_ma.fsx_projid;
                fa->fsx_cowextsize = old_ma.fsx_cowextsize;
        } else {
                fa->flags |= old_ma.flags & ~FS_COMMON_FL;
        }

        err = fileattr_set_prepare(inode, &old_ma, fa);
        if (err)
                goto unlock;

        err = inode->i_op->fileattr_set(idmap, dentry, fa);

unlock:
        inode_unlock(inode);
        return err;
}
EXPORT_SYMBOL(vfs_fileattr_set);

/*
 * FS_IOC_GETFLAGS: fetch the file's attributes and store their ->flags word
 * at @argp.
 */
static int ioctl_getflags(struct file *file, unsigned int __user *argp)
{
        struct fileattr fa = { .flags_valid = true }; /* hint only */
        int err;

        err = vfs_fileattr_get(file->f_path.dentry, &fa);
        if (err)
                return err;

        return put_user(fa.flags, argp);
}

/*
 * FS_IOC_SETFLAGS: read the new FS_*_FL word from @argp and apply it with
 * vfs_fileattr_set() while holding write access to the mount.
 */
static int ioctl_setflags(struct file *file, unsigned int __user *argp)
{
        struct mnt_idmap *idmap = file_mnt_idmap(file);
        struct dentry *dentry = file->f_path.dentry;
        struct fileattr fa;
        unsigned int flags;
        int err;

        err = get_user(flags, argp);
        if (err)
                return err;

        err = mnt_want_write_file(file);
        if (err)
                return err;

        fileattr_fill_flags(&fa, flags);
        err = vfs_fileattr_set(idmap, dentry, &fa);
        mnt_drop_write_file(file);

        return err;
}

/*
 * FS_IOC_FSGETXATTR: fetch the file's attributes and copy them to @argp as
 * a struct fsxattr.
 */
static int ioctl_fsgetxattr(struct file *file, void __user *argp)
{
        struct fileattr fa = { .fsx_valid = true }; /* hint only */
        int err;

        err = vfs_fileattr_get(file->f_path.dentry, &fa);
        if (err)
                return err;

        return copy_fsxattr_to_user(&fa, argp);
}

/*
 * FS_IOC_FSSETXATTR: read a struct fsxattr from @argp and apply it with
 * vfs_fileattr_set() while holding write access to the mount.
 */
static int ioctl_fssetxattr(struct file *file, void __user *argp)
{
        struct mnt_idmap *idmap = file_mnt_idmap(file);
        struct dentry *dentry = file->f_path.dentry;
        struct fileattr fa;
        int err;

        err = copy_fsxattr_from_user(&fa, argp);
        if (err)
                return err;

        err = mnt_want_write_file(file);
        if (err)
                return err;

        err = vfs_fileattr_set(idmap, dentry, &fa);
        mnt_drop_write_file(file);

        return err;
}

/*
 * FS_IOC_GETFSUUID: report the superblock's UUID and its length as a struct
 * fsuuid2.  Returns -ENOTTY when the superblock's s_uuid_len is zero.
 */
static int ioctl_getfsuuid(struct file *file, void __user *argp)
{
        struct super_block *sb = file_inode(file)->i_sb;
        struct fsuuid2 out = { .len = sb->s_uuid_len, };

        if (!sb->s_uuid_len)
                return -ENOTTY;

        memcpy(out.uuid, &sb->s_uuid, sb->s_uuid_len);

        if (copy_to_user(argp, &out, sizeof(out)))
                return -EFAULT;
        return 0;
}

/*
 * FS_IOC_GETFSSYSFSPATH: report "<fs type name>/<s_sysfs_name>" and its
 * length as a struct fs_sysfs_path.  Returns -ENOTTY when the superblock's
 * s_sysfs_name is empty.
 */
static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp)
{
        struct super_block *sb = file_inode(file)->i_sb;
        struct fs_sysfs_path out = {};

        if (sb->s_sysfs_name[0] == '\0')
                return -ENOTTY;

        out.len = scnprintf(out.name, sizeof(out.name), "%s/%s",
                            sb->s_type->name, sb->s_sysfs_name);

        if (copy_to_user(argp, &out, sizeof(out)))
                return -EFAULT;
        return 0;
}

/*
 * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d.
 * It's just a simple helper for sys_ioctl and compat_sys_ioctl.
 *
 * It handles the commands listed in the switch below itself and returns
 * -ENOIOCTLCMD for every command it does not recognise, which both callers
 * use as the cue to try the file's own ioctl handler.
 *
 * When you add any new common ioctls to the switches above and below,
 * please ensure they have compatible arguments in compat mode.
 *
 * The LSM mailing list should also be notified of any command additions or
 * changes, as specific LSMs may be affected.
 */
static int do_vfs_ioctl(struct file *filp, unsigned int fd,
                        unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct inode *inode = file_inode(filp);

        switch (cmd) {
        /* FIOCLEX/FIONCLEX act on the descriptor @fd, not on the file. */
        case FIOCLEX:
                set_close_on_exec(fd, 1);
                return 0;

        case FIONCLEX:
                set_close_on_exec(fd, 0);
                return 0;

        case FIONBIO:
                return ioctl_fionbio(filp, argp);

        case FIOASYNC:
                return ioctl_fioasync(fd, filp, argp);

        case FIOQSIZE:
                /* Only directories, regular files and symlinks answer this. */
                if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
                    S_ISLNK(inode->i_mode)) {
                        loff_t res = inode_get_bytes(inode);
                        return copy_to_user(argp, &res, sizeof(res)) ?
                                            -EFAULT : 0;
                }

                return -ENOTTY;

        case FIFREEZE:
                return ioctl_fsfreeze(filp);

        case FITHAW:
                return ioctl_fsthaw(filp);

        case FS_IOC_FIEMAP:
                return ioctl_fiemap(filp, argp);

        case FIGETBSZ:
                /* anon_bdev filesystems may not have a block size */
                if (!inode->i_sb->s_blocksize)
                        return -EINVAL;

                return put_user(inode->i_sb->s_blocksize, (int __user *)argp);

        case FICLONE:
                /* @arg is the source file descriptor itself, not a pointer. */
                return ioctl_file_clone(filp, arg, 0, 0, 0);

        case FICLONERANGE:
                return ioctl_file_clone_range(filp, argp);

        case FIDEDUPERANGE:
                return ioctl_file_dedupe_range(filp, argp);

        case FIONREAD:
                /*
                 * Regular files answer with the distance from f_pos to
                 * i_size; everything else is left to the file's own handler.
                 */
                if (!S_ISREG(inode->i_mode))
                        return vfs_ioctl(filp, cmd, arg);

                return put_user(i_size_read(inode) - filp->f_pos,
                                (int __user *)argp);

        case FS_IOC_GETFLAGS:
                return ioctl_getflags(filp, argp);

        case FS_IOC_SETFLAGS:
                return ioctl_setflags(filp, argp);

        case FS_IOC_FSGETXATTR:
                return ioctl_fsgetxattr(filp, argp);

        case FS_IOC_FSSETXATTR:
                return ioctl_fssetxattr(filp, argp);

        case FS_IOC_GETFSUUID:
                return ioctl_getfsuuid(filp, argp);

        case FS_IOC_GETFSSYSFSPATH:
                return ioctl_get_fs_sysfs_path(filp, argp);

        default:
                /* The file_ioctl() commands exist for regular files only. */
                if (S_ISREG(inode->i_mode))
                        return file_ioctl(filp, cmd, argp);
                break;
        }

        return -ENOIOCTLCMD;
}

/*
 * ioctl(2): after the LSM hook has approved the call, let do_vfs_ioctl()
 * handle the command; if it answers -ENOIOCTLCMD, pass the command on to
 * vfs_ioctl() instead.
 */
SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
        struct fd f = fdget(fd);
        int ret;

        if (!f.file)
                return -EBADF;

        ret = security_file_ioctl(f.file, cmd, arg);
        if (!ret) {
                ret = do_vfs_ioctl(f.file, fd, cmd, arg);
                if (ret == -ENOIOCTLCMD)
                        ret = vfs_ioctl(f.file, cmd, arg);
        }

        fdput(f);
        return ret;
}

#ifdef CONFIG_COMPAT
/**
 * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation
 * @file: The file to operate on.
 * @cmd: The ioctl command number.
 * @arg: The argument to the ioctl.
 *
 * Rather than being called directly, this is normally installed in struct
 * file_operations as
 *
 *     .compat_ioctl = compat_ptr_ioctl,
 *
 * It runs @arg through compat_ptr() and hands the result to the file's
 * ->unlocked_ioctl handler.  On most architectures that leaves the argument
 * unchanged.  The exception is arch/s390, where compat_ptr() clears the top
 * bit of a 32-bit pointer value, so user space pointers to the second 2GB
 * alias the first 2GB, as is the case for native 32-bit s390 user space.
 *
 * For that reason compat_ptr_ioctl() may only be used with ioctl functions
 * that either ignore the argument or pass a pointer to a compatible data
 * type.
 *
 * If any ioctl command handled by fops->unlocked_ioctl passes a plain
 * integer instead of a pointer, or any of the passed data types
 * is incompatible between 32-bit and 64-bit architectures, a proper
 * handler is required instead of compat_ptr_ioctl.
 *
 * Return: the result of ->unlocked_ioctl, or -ENOIOCTLCMD if the file has
 * no such handler.
 */
long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        unsigned long native_arg;

        if (!file->f_op->unlocked_ioctl)
                return -ENOIOCTLCMD;

        native_arg = (unsigned long)compat_ptr(arg);
        return file->f_op->unlocked_ioctl(file, cmd, native_arg);
}
EXPORT_SYMBOL(compat_ptr_ioctl);

/*
 * Compat ioctl(2) entry point.
 *
 * After the compat LSM hook has approved the call, commands whose argument
 * needs special treatment are handled in the switch below.  Everything else
 * goes to do_vfs_ioctl() with the argument converted by compat_ptr(); if
 * that answers -ENOIOCTLCMD the file's ->compat_ioctl method (when present)
 * is tried with the unconverted argument, and a remaining -ENOIOCTLCMD is
 * reported as -ENOTTY.
 */
COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
                       compat_ulong_t, arg)
{
        struct fd f = fdget(fd);
        int error;

        if (!f.file)
                return -EBADF;

        error = security_file_ioctl_compat(f.file, cmd, arg);
        if (error)
                goto out;

        switch (cmd) {
        /* FICLONE takes an int argument, so don't use compat_ptr() */
        case FICLONE:
                error = ioctl_file_clone(f.file, arg, 0, 0, 0);
                break;

#if defined(CONFIG_X86_64)
        /* these get messy on amd64 due to alignment differences */
        case FS_IOC_RESVSP_32:
        case FS_IOC_RESVSP64_32:
                error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg));
                break;
        case FS_IOC_UNRESVSP_32:
        case FS_IOC_UNRESVSP64_32:
                error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE,
                                compat_ptr(arg));
                break;
        case FS_IOC_ZERO_RANGE_32:
                error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE,
                                compat_ptr(arg));
                break;
#endif

        /*
         * These access 32-bit values anyway so no further handling is
         * necessary.
         */
        case FS_IOC32_GETFLAGS:
        case FS_IOC32_SETFLAGS:
                /* Rewrite to the native command number, then share default. */
                cmd = (cmd == FS_IOC32_GETFLAGS) ?
                        FS_IOC_GETFLAGS : FS_IOC_SETFLAGS;
                fallthrough;
        /*
         * everything else in do_vfs_ioctl() takes either a compatible
         * pointer argument or no argument -- call it with a modified
         * argument.
         */
        default:
                error = do_vfs_ioctl(f.file, fd, cmd,
                                     (unsigned long)compat_ptr(arg));
                if (error != -ENOIOCTLCMD)
                        break;

                /* Not a common command: try the file's compat handler. */
                if (f.file->f_op->compat_ioctl)
                        error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
                if (error == -ENOIOCTLCMD)
                        error = -ENOTTY;
                break;
        }

 out:
        fdput(f);

        return error;
}
#endif













































































   11 











   10 




   11 




























    2 


    3 





























































   11 
   12 


   13 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
// SPDX-License-Identifier: GPL-2.0
/*
 * Block rq-qos policy for assigning an I/O priority class to requests.
 *
 * Using an rq-qos policy for assigning I/O priority class has two advantages
 * over using the ioprio_set() system call:
 *
 * - This policy is cgroup based so it has all the advantages of cgroups.
 * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos
 *   controller affects page cache writeback I/O for filesystems that support
 *   associating a cgroup with writeback I/O. See also
 *   Documentation/admin-guide/cgroup-v2.rst.
 */

#include <linux/blk-mq.h>
#include <linux/blk_types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-rq-qos.h"

/**
 * enum prio_policy - I/O priority class policy.
 * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class.
 * @POLICY_PROMOTE_TO_RT: modify non-IOPRIO_CLASS_RT into IOPRIO_CLASS_RT.
 * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into
 *                IOPRIO_CLASS_BE.
 * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE.
 * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT; blkcg_set_ioprio()
 *                handles both values in the same branch.
 *
 * The numeric values are used as indices into policy_name[], and
 * blkcg_set_ioprio() passes POLICY_RESTRICT_TO_BE and POLICY_ALL_TO_IDLE
 * directly as the class argument of IOPRIO_PRIO_VALUE(), so the values must
 * not be renumbered.
 *
 * See also <linux/ioprio.h>.
 */
enum prio_policy {
        POLICY_NO_CHANGE        = 0,
        POLICY_PROMOTE_TO_RT        = 1,
        POLICY_RESTRICT_TO_BE        = 2,
        POLICY_ALL_TO_IDLE        = 3,
        POLICY_NONE_TO_RT        = 4,
};

/*
 * User-visible name of each policy, indexed by enum prio_policy. These are the
 * strings printed by ioprio_show_prio_policy() and matched against user input
 * by ioprio_set_prio_policy().
 */
static const char *policy_name[] = {
        [POLICY_NO_CHANGE]        = "no-change",
        [POLICY_PROMOTE_TO_RT]        = "promote-to-rt",
        [POLICY_RESTRICT_TO_BE]        = "restrict-to-be",
        [POLICY_ALL_TO_IDLE]        = "idle",
        [POLICY_NONE_TO_RT]        = "none-to-rt",
};

/* Forward declaration; the helpers below need its address before it is defined. */
static struct blkcg_policy ioprio_policy;

/**
 * struct ioprio_blkg - Per (cgroup, request queue) data.
 * @pd: blkg_policy_data structure.
 *
 * Carries no policy-specific state of its own; it is allocated by
 * ioprio_alloc_pd() and released by ioprio_free_pd().
 */
struct ioprio_blkg {
        struct blkg_policy_data pd;
};

/**
 * struct ioprio_blkcg - Per cgroup data.
 * @cpd: blkcg_policy_data structure.
 * @prio_policy: One of the enum prio_policy values. Initialized to
 *        POLICY_NO_CHANGE by ioprio_alloc_cpd() and updated by
 *        ioprio_set_prio_policy().
 */
struct ioprio_blkcg {
        struct blkcg_policy_data cpd;
        enum prio_policy         prio_policy;
};

/*
 * Convert a generic blkg_policy_data pointer into the ioprio_blkg that embeds
 * it. A NULL @pd is passed through as NULL.
 */
static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
{
        if (!pd)
                return NULL;

        return container_of(pd, struct ioprio_blkg, pd);
}

/*
 * Look up the ioprio per-cgroup data of @blkcg: fetch this policy's
 * blkcg_policy_data and return the ioprio_blkcg that embeds it.
 */
static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg)
{
        struct blkcg_policy_data *cpd = blkcg_to_cpd(blkcg, &ioprio_policy);

        return container_of(cpd, struct ioprio_blkcg, cpd);
}

/* Resolve a cgroup subsystem state to the ioprio per-cgroup data of its blkcg. */
static struct ioprio_blkcg *
ioprio_blkcg_from_css(struct cgroup_subsys_state *css)
{
        struct blkcg *owner = css_to_blkcg(css);

        return blkcg_to_ioprio_blkcg(owner);
}

/*
 * Find the ioprio per-cgroup data that applies to @bio, going through the
 * policy data of bio->bi_blkg. Returns NULL when blkg_to_pd() yields no policy
 * data for this policy.
 */
static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio)
{
        struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy);

        return pd ? blkcg_to_ioprio_blkcg(pd->blkg->blkcg) : NULL;
}

static int ioprio_show_prio_policy(struct seq_file *sf, void *v)
{
        struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf));

        seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]);
        return 0;
}

/*
 * write handler of the "prio.class" attribute: match @buf against
 * policy_name[] and store the resulting policy in the cgroup.
 *
 * Returns @nbytes on success, -EIO for a write at a non-zero offset, or the
 * negative value returned by sysfs_match_string() when @buf names no policy.
 */
static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
                                      size_t nbytes, loff_t off)
{
        struct ioprio_blkcg *cfg = ioprio_blkcg_from_css(of_css(of));
        int policy;

        if (off)
                return -EIO;

        /* kernfs_fop_write_iter() terminates 'buf' with '\0'. */
        policy = sysfs_match_string(policy_name, buf);
        if (policy >= 0) {
                cfg->prio_policy = policy;
                return nbytes;
        }

        return policy;
}

/*
 * pd_alloc_fn callback: allocate a zeroed ioprio_blkg and hand back its
 * embedded blkg_policy_data, or NULL if the allocation fails. @disk and
 * @blkcg are not used.
 */
static struct blkg_policy_data *
ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp)
{
        struct ioprio_blkg *data = kzalloc(sizeof(*data), gfp);

        return data ? &data->pd : NULL;
}

/* pd_free_fn callback: free the ioprio_blkg that embeds @pd. */
static void ioprio_free_pd(struct blkg_policy_data *pd)
{
        kfree(pd_to_ioprio(pd));
}

/*
 * cpd_alloc_fn callback: allocate the per-cgroup data with the policy set to
 * POLICY_NO_CHANGE. Returns the embedded blkcg_policy_data, or NULL if the
 * allocation fails.
 */
static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp)
{
        struct ioprio_blkcg *cfg = kzalloc(sizeof(*cfg), gfp);

        if (!cfg)
                return NULL;

        cfg->prio_policy = POLICY_NO_CHANGE;
        return &cfg->cpd;
}

static void ioprio_free_cpd(struct blkcg_policy_data *cpd)
{
        struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd);

        kfree(blkcg);
}

/*
 * Initializer list shared by the cgroup v1 and v2 cftype arrays below: one
 * "prio.class" file wired to the show/set handlers above, followed by the
 * empty terminating entry.
 */
#define IOPRIO_ATTRS                                                \
        {                                                        \
                .name                = "prio.class",                        \
                .seq_show        = ioprio_show_prio_policy,        \
                .write                = ioprio_set_prio_policy,        \
        },                                                        \
        { } /* sentinel */

/* cgroup v2 attributes; referenced as .dfl_cftypes of ioprio_policy. */
static struct cftype ioprio_files[] = {
        IOPRIO_ATTRS
};

/* cgroup v1 attributes; referenced as .legacy_cftypes of ioprio_policy. */
static struct cftype ioprio_legacy_files[] = {
        IOPRIO_ATTRS
};

/*
 * The blkcg policy descriptor: the attribute files exposed for both cgroup
 * versions plus the allocation/free callbacks for the per-cgroup (cpd) and
 * per-(cgroup, request queue) (pd) data.
 */
static struct blkcg_policy ioprio_policy = {
        .dfl_cftypes        = ioprio_files,
        .legacy_cftypes = ioprio_legacy_files,

        .cpd_alloc_fn        = ioprio_alloc_cpd,
        .cpd_free_fn        = ioprio_free_cpd,

        .pd_alloc_fn        = ioprio_alloc_pd,
        .pd_free_fn        = ioprio_free_pd,
};

/*
 * Adjust bio->bi_ioprio according to the "prio.class" policy of the cgroup
 * that @bio is associated with. Nothing is changed when no ioprio data is
 * found for the bio or when the policy is POLICY_NO_CHANGE.
 */
void blkcg_set_ioprio(struct bio *bio)
{
        struct ioprio_blkcg *cfg = ioprio_blkcg_from_bio(bio);
        u16 cgroup_prio;

        if (!cfg)
                return;

        switch (cfg->prio_policy) {
        case POLICY_NO_CHANGE:
                return;
        case POLICY_PROMOTE_TO_RT:
        case POLICY_NONE_TO_RT:
                /*
                 * RT threads get priority level 4 by default because
                 * task_nice is 0. Non-RT requests are therefore promoted to
                 * the RT class at level 4; requests that are already RT are
                 * left alone, so a level chosen through ioprio_set() is
                 * preserved.
                 */
                if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT)
                        bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4);
                return;
        default:
                break;
        }

        /*
         * Apart from IOPRIO_CLASS_NONE, a numerically larger I/O priority
         * means a lower priority. Raising bi_ioprio to the cgroup value only
         * when the latter is larger thus keeps the lower of the two
         * priorities, and a bio whose priority is IOPRIO_CLASS_NONE simply
         * receives the cgroup I/O priority.
         */
        cgroup_prio = IOPRIO_PRIO_VALUE(cfg->prio_policy, 0);
        if (cgroup_prio > bio->bi_ioprio)
                bio->bi_ioprio = cgroup_prio;
}

/* Deactivate the ioprio blkcg policy on @disk. */
void blk_ioprio_exit(struct gendisk *disk)
{
        blkcg_deactivate_policy(disk, &ioprio_policy);
}

/*
 * Activate the ioprio blkcg policy on @disk. Returns whatever
 * blkcg_activate_policy() returns.
 */
int blk_ioprio_init(struct gendisk *disk)
{
        return blkcg_activate_policy(disk, &ioprio_policy);
}

/*
 * Module init hook: register ioprio_policy with the blkcg core. Returns the
 * result of blkcg_policy_register().
 */
static int __init ioprio_init(void)
{
        return blkcg_policy_register(&ioprio_policy);
}

/* Module exit hook: unregister ioprio_policy from the blkcg core. */
static void __exit ioprio_exit(void)
{
        blkcg_policy_unregister(&ioprio_policy);
}

/* Hook policy registration and unregistration into module load and unload. */
module_init(ioprio_init);
module_exit(ioprio_exit);






































    2 




    2 












    7 




    7 




    7 











































































































    1 



    1 

























































































   18 

















   16 

   18 
















   15 
    8 









    1 











    9 





    9 

    9 

































    1 
    1 






    1 
    1 






    1 
    1 


    1 
    1 

    1 









    7 








    5 




    1 










    1 








    1 








    1 



















    7 














    7 














    7 





    7 


    1 
    1 




    1 






























    1 






    1 







    6 










    6 





    7 




    1 









    1 






    6 







    6 

















    8 












    8 


    6 

    2 





    7 



    1 




    1 














    7 
    1 
    8 

















    1 


    1 













































    1 







    1 









    1 













































































































































































































    5 



    5 




    1 








    4 




    5 





    5 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/balloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/time.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "mballoc.h"

#include <trace/events/ext4.h>
#include <kunit/static_stub.h>

/*
 * Forward declaration: used by ext4_num_overhead_clusters() and
 * ext4_init_block_bitmap() below, ahead of its definition.
 */
static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
                                            ext4_group_t block_group);
/*
 * balloc.c contains the blocks allocation and deallocation routines
 */

/*
 * Return the number of the block group that contains @block.
 *
 * When the STD_GROUP_SIZE flag is set (tested with test_opt2()) the group is
 * derived with a single shift of the block's distance from
 * s_first_data_block; otherwise the work is delegated to
 * ext4_get_group_no_and_offset().
 */
ext4_group_t ext4_get_group_number(struct super_block *sb,
                                   ext4_fsblk_t block)
{
        ext4_group_t group;

        if (!test_opt2(sb, STD_GROUP_SIZE)) {
                ext4_get_group_no_and_offset(sb, block, &group, NULL);
                return group;
        }

        /*
         * Shift by block-size bits + cluster bits + 3; presumably the 3
         * stands for the 8 bits per bitmap byte -- TODO confirm.
         */
        group = (block -
                 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
                (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
        return group;
}

/*
 * Split a block number into its block group number and its offset inside
 * that group's block/cluster allocation bitmap.
 *
 * @blockgrpp and @offsetp are both optional; a NULL pointer means the caller
 * does not want that result. The offset is expressed in clusters (the block
 * remainder shifted right by s_cluster_bits).
 */
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
                ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_grpblk_t cluster_off;

        /* Make the block number relative to the first data block. */
        blocknr -= le32_to_cpu(sbi->s_es->s_first_data_block);

        /* do_div() leaves the quotient in blocknr and returns the remainder. */
        cluster_off = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
                sbi->s_cluster_bits;

        if (offsetp)
                *offsetp = cluster_off;
        if (blockgrpp)
                *blockgrpp = blocknr;
}

/*
 * Tell whether 'block' belongs to 'block_group': returns 1 when the group
 * computed by ext4_get_group_number() equals 'block_group', 0 otherwise.
 */
static inline int ext4_block_in_group(struct super_block *sb,
                                      ext4_fsblk_t block,
                                      ext4_group_t block_group)
{
        return ext4_get_group_number(sb, block) == block_group;
}

/*
 * Return the number of clusters used for file system metadata; this
 * represents the overhead needed by the file system.
 *
 * The count is built from three parts: the base metadata clusters reported by
 * ext4_num_base_meta_clusters(), the part of the inode table that lies inside
 * @block_group, and the block/inode bitmap blocks when they lie inside
 * @block_group and their cluster has not been counted already.
 *
 * @sb:          super block
 * @block_group: group whose overhead is computed
 * @gdp:         group descriptor of @block_group
 */
static unsigned ext4_num_overhead_clusters(struct super_block *sb,
                                           ext4_group_t block_group,
                                           struct ext4_group_desc *gdp)
{
        unsigned base_clusters, num_clusters;
        /*
         * Cluster indices are relative to the first block of the group; -1
         * means "not located in this group".
         */
        int block_cluster = -1, inode_cluster;
        int itbl_cluster_start = -1, itbl_cluster_end = -1;
        /* First and last block number that a full-size group would cover. */
        ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
        ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1;
        ext4_fsblk_t itbl_blk_start, itbl_blk_end;
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* This is the number of clusters used by the superblock,
         * block group descriptors, and reserved block group
         * descriptor blocks */
        base_clusters = ext4_num_base_meta_clusters(sb, block_group);
        num_clusters = base_clusters;

        /*
         * Account and record inode table clusters if any cluster
         * is in the block group, or inode table cluster range is
         * [-1, -1] and won't overlap with block/inode bitmap cluster
         * accounted below.
         */
        itbl_blk_start = ext4_inode_table(sb, gdp);
        itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1;
        if (itbl_blk_start <= end && itbl_blk_end >= start) {
                /* Clip the inode table range to the blocks of this group. */
                itbl_blk_start = max(itbl_blk_start, start);
                itbl_blk_end = min(itbl_blk_end, end);

                itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start);
                itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start);

                num_clusters += itbl_cluster_end - itbl_cluster_start + 1;
                /* check if border cluster is overlapped */
                if (itbl_cluster_start == base_clusters - 1)
                        num_clusters--;
        }

        /*
         * For the allocation bitmaps, we first need to check to see
         * if the block is in the block group.  If it is, then check
         * to see if the cluster is already accounted for in the clusters
         * used for the base metadata cluster and inode tables cluster.
         * Normally all of these blocks are contiguous, so the special
         * case handling shouldn't be necessary except for *very*
         * unusual file system layouts.
         */
        if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
                block_cluster = EXT4_B2C(sbi,
                                         ext4_block_bitmap(sb, gdp) - start);
                /* Count it only if it is beyond the base metadata clusters
                 * and outside the inode table cluster range. */
                if (block_cluster >= base_clusters &&
                    (block_cluster < itbl_cluster_start ||
                    block_cluster > itbl_cluster_end))
                        num_clusters++;
        }

        if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
                inode_cluster = EXT4_B2C(sbi,
                                         ext4_inode_bitmap(sb, gdp) - start);
                /*
                 * Additional check if inode bitmap is in just accounted
                 * block_cluster
                 */
                if (inode_cluster != block_cluster &&
                    inode_cluster >= base_clusters &&
                    (inode_cluster < itbl_cluster_start ||
                    inode_cluster > itbl_cluster_end))
                        num_clusters++;
        }

        return num_clusters;
}

/*
 * Return the number of clusters covered by @block_group. Every group except
 * the last spans EXT4_BLOCKS_PER_GROUP() blocks; the last one spans whatever
 * remains up to the file system's block count.
 */
static unsigned int num_clusters_in_group(struct super_block *sb,
                                          ext4_group_t block_group)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned int blocks = EXT4_BLOCKS_PER_GROUP(sb);

        /*
         * mke2fs always initializes the first and last group, but another
         * tool might not have, so the size of the last group is computed
         * explicitly to get the free block count right.
         */
        if (block_group == ext4_get_groups_count(sb) - 1)
                blocks = ext4_blocks_count(sbi->s_es) -
                        ext4_group_first_block_no(sb, block_group);

        return EXT4_NUM_B2C(sbi, blocks);
}

/*
 * Initializes an uninitialized block bitmap.
 *
 * Fills the buffer @bh (which must be locked) with the block bitmap of
 * @block_group: everything is cleared, then the bits for the base metadata
 * clusters, for the block bitmap, inode bitmap and inode table blocks that
 * lie inside the group, and for the padding past the end of the group are set.
 *
 * Returns 0 on success, -EFSBADCRC if the group descriptor checksum does not
 * verify (both bitmaps of the group are then marked corrupted), or
 * -EFSCORRUPTED if the base metadata cluster count does not fit the buffer.
 */
static int ext4_init_block_bitmap(struct super_block *sb,
                                   struct buffer_head *bh,
                                   ext4_group_t block_group,
                                   struct ext4_group_desc *gdp)
{
        unsigned int bit, bit_max;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t start, tmp;

        ASSERT(buffer_locked(bh));

        if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT |
                                        EXT4_GROUP_INFO_IBITMAP_CORRUPT);
                return -EFSBADCRC;
        }
        /* Start from an all-free bitmap. */
        memset(bh->b_data, 0, sb->s_blocksize);

        bit_max = ext4_num_base_meta_clusters(sb, block_group);
        /* bit_max bits need bit_max / 8 bytes; refuse to run past the buffer. */
        if ((bit_max >> 3) >= bh->b_size)
                return -EFSCORRUPTED;

        /* Mark the base metadata clusters as in use. */
        for (bit = 0; bit < bit_max; bit++)
                ext4_set_bit(bit, bh->b_data);

        start = ext4_group_first_block_no(sb, block_group);

        /* Set bits for block and inode bitmaps, and inode table */
        tmp = ext4_block_bitmap(sb, gdp);
        if (ext4_block_in_group(sb, tmp, block_group))
                ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);

        tmp = ext4_inode_bitmap(sb, gdp);
        if (ext4_block_in_group(sb, tmp, block_group))
                ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);

        /* Each inode table block is checked individually for membership. */
        tmp = ext4_inode_table(sb, gdp);
        for (; tmp < ext4_inode_table(sb, gdp) +
                     sbi->s_itb_per_group; tmp++) {
                if (ext4_block_in_group(sb, tmp, block_group))
                        ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
        }

        /*
         * Also if the number of blocks within the group is less than
         * the blocksize * 8 ( which is the size of bitmap ), set rest
         * of the block bitmap to 1
         */
        ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
                             sb->s_blocksize * 8, bh->b_data);
        return 0;
}

/*
 * Return the number of free clusters in a block group whose block bitmap is
 * still uninitialized, so that counting bits in the bitmap is not an option:
 * the clusters covered by the group minus its metadata overhead.
 */
unsigned ext4_free_clusters_after_init(struct super_block *sb,
                                       ext4_group_t block_group,
                                       struct ext4_group_desc *gdp)
{
        unsigned int total = num_clusters_in_group(sb, block_group);
        unsigned int overhead = ext4_num_overhead_clusters(sb, block_group,
                                                           gdp);

        return total - overhead;
}

/*
 * The free blocks are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the block.  The descriptors are loaded in memory
 * when a file system is mounted (see ext4_fill_super).
 */

/**
 * ext4_get_group_desc() -- look up the in-memory group descriptor
 * @sb:                        super block
 * @block_group:        given block group
 * @bh:                        optional; when non-NULL, receives the buffer head
 *                        of the block holding the group descriptor
 *
 * Return: pointer to the descriptor of @block_group inside the descriptor
 * block's buffer, or NULL (after reporting through ext4_error()) when
 * @block_group is out of range or the descriptor block is not loaded.
 */
struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
                                             ext4_group_t block_group,
                                             struct buffer_head **bh)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        unsigned int gdb_index;
        unsigned int slot;
        struct buffer_head *gdb_bh;

        KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc,
                                   sb, block_group, bh);

        if (block_group >= ngroups) {
                ext4_error(sb, "block_group >= groups_count - block_group = %u,"
                           " groups_count = %u", block_group, ngroups);

                return NULL;
        }

        /* Which descriptor block, and which slot inside that block. */
        gdb_index = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
        slot = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);

        /*
         * sbi_array_rcu_deref() hands the pointer back with RCU already
         * unlocked. That is fine here: the array slot is not dereferenced
         * again, and going by add_new_gdb() only the array pointer is
         * replaced, not the values stored in it, so the buffer head stays
         * valid.
         */
        gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_index);
        if (!gdb_bh) {
                ext4_error(sb, "Group descriptor not loaded - "
                           "block_group = %u, group_desc = %u, desc = %u",
                           block_group, gdb_index, slot);
                return NULL;
        }

        if (bh)
                *bh = gdb_bh;

        return (struct ext4_group_desc *)((__u8 *)gdb_bh->b_data +
                                          slot * EXT4_DESC_SIZE(sb));
}

/*
 * Check that every bitmap bit past the last cluster of @block_group is set.
 *
 * Returns 0 when the padding is fully set (or when the group fills the whole
 * bitmap), otherwise the index of the first clear padding bit.
 */
static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb,
                                                    ext4_group_t block_group,
                                                    struct buffer_head *bh)
{
        unsigned long bitmap_size = sb->s_blocksize * 8;
        unsigned int offset = num_clusters_in_group(sb, block_group);
        ext4_grpblk_t next_zero_bit;

        /* No padding to inspect. */
        if (offset >= bitmap_size)
                return 0;

        next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset);
        if (next_zero_bit >= bitmap_size)
                return 0;

        return next_zero_bit;
}

/*
 * Return the ext4_group_info of @group from the two-level s_group_info
 * array, or NULL when @group is not below s_groups_count.
 */
struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
                                            ext4_group_t group)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info **row;
        long row_idx, col_idx;

        if (unlikely(group >= sbi->s_groups_count))
                return NULL;

        /* The upper bits select the row, the lower bits the entry within it. */
        row_idx = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
        col_idx = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
        row = sbi_array_rcu_deref(sbi, s_group_info, row_idx);

        return row[col_idx];
}

/*
 * Return the block number which was discovered to be invalid, or 0 if
 * the block bitmap is valid.
 *
 * The bitmap in @bh is considered valid when the group's block bitmap block,
 * inode bitmap block and every inode table block lie inside @block_group and
 * have their bits set. With the flex_bg feature the check is skipped and 0 is
 * returned.
 */
static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
                                            struct ext4_group_desc *desc,
                                            ext4_group_t block_group,
                                            struct buffer_head *bh)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        /* Signed on purpose: a block located before the group start yields a
         * negative offset, which the checks below reject. */
        ext4_grpblk_t offset;
        ext4_grpblk_t next_zero_bit;
        ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb);
        ext4_fsblk_t blk;
        ext4_fsblk_t group_first_block;

        if (ext4_has_feature_flex_bg(sb)) {
                /* with FLEX_BG, the inode/block bitmaps and itable
                 * blocks may not be in the group at all
                 * so the bitmap validation will be skipped for those groups
                 * or it has to also read the block group where the bitmaps
                 * are located to verify they are set.
                 */
                return 0;
        }
        group_first_block = ext4_group_first_block_no(sb, block_group);

        /* check whether block bitmap block number is set */
        blk = ext4_block_bitmap(sb, desc);
        offset = blk - group_first_block;
        if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
            !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
                /* block bitmap block is outside the group or not marked in use */
                return blk;

        /* check whether the inode bitmap block number is set */
        blk = ext4_inode_bitmap(sb, desc);
        offset = blk - group_first_block;
        if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
            !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
                /* inode bitmap block is outside the group or not marked in use */
                return blk;

        /* check whether the inode table block number is set */
        blk = ext4_inode_table(sb, desc);
        offset = blk - group_first_block;
        /* Both the first and the last inode table block must be in range. */
        if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit ||
            EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit)
                return blk;
        /* Look for a clear bit anywhere in the inode table's cluster range. */
        next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
                        EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1,
                        EXT4_B2C(sbi, offset));
        if (next_zero_bit <
            EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1)
                /* bad bitmap for inode tables */
                return blk;
        return 0;
}

/*
 * Validate the block bitmap of @block_group held in @bh and mark the buffer
 * verified on success.
 *
 * Three checks run under the group lock: the bitmap checksum, the metadata
 * bits (ext4_valid_block_bitmap()) and the padding bits
 * (ext4_valid_block_bitmap_padding()). On a failure the group lock is
 * dropped, the error is reported through ext4_error() and the group's block
 * bitmap is marked corrupted.
 *
 * Returns 0 if the bitmap is valid, was verified earlier, or validation is
 * skipped because EXT4_FC_REPLAY is set in s_mount_state; -EFSBADCRC on a
 * checksum mismatch; -EFSCORRUPTED if there is no group info, the bitmap is
 * already flagged corrupt, or a metadata/padding check fails.
 */
static int ext4_validate_block_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *desc,
                                      ext4_group_t block_group,
                                      struct buffer_head *bh)
{
        ext4_fsblk_t        blk;
        struct ext4_group_info *grp;

        if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
                return 0;

        grp = ext4_get_group_info(sb, block_group);

        /* Lockless fast path: already verified. */
        if (buffer_verified(bh))
                return 0;
        if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
                return -EFSCORRUPTED;

        ext4_lock_group(sb, block_group);
        /* Re-check under the lock; the buffer may have been verified meanwhile. */
        if (buffer_verified(bh))
                goto verified;
        if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) ||
                     ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
                ext4_unlock_group(sb, block_group);
                ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return -EFSBADCRC;
        }
        /* A non-zero result is the number of the offending block. */
        blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
        if (unlikely(blk != 0)) {
                ext4_unlock_group(sb, block_group);
                ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
                           block_group, blk);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return -EFSCORRUPTED;
        }
        /* A non-zero result is the index of the first clear padding bit. */
        blk = ext4_valid_block_bitmap_padding(sb, block_group, bh);
        if (unlikely(blk != 0)) {
                ext4_unlock_group(sb, block_group);
                ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set",
                           block_group, blk);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                                 EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return -EFSCORRUPTED;
        }
        set_buffer_verified(bh);
verified:
        ext4_unlock_group(sb, block_group);
        return 0;
}

/**
 * ext4_read_block_bitmap_nowait() - get a group's block bitmap without
 *                                   waiting for read I/O
 * @sb:                        super block
 * @block_group:        given block group
 * @ignore_locked:        if true, give up when the bitmap buffer is locked
 *
 * Look up the block bitmap of @block_group.  If the bitmap is already up
 * to date it is validated before being returned.  If the group descriptor
 * is flagged EXT4_BG_BLOCK_UNINIT (and group descriptor checksums are in
 * use) the bitmap is built in memory instead of being read.  Otherwise a
 * read is submitted and the buffer is returned immediately with buffer_new
 * set and without having been validated here.
 *
 * Return: the buffer_head on success, NULL if @ignore_locked is set and
 * the buffer is currently locked, or an ERR_PTR in case of failure.
 */
struct buffer_head *
ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
                              bool ignore_locked)
{
        struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh;
        ext4_fsblk_t bitmap_blk;
        int err;

        KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait,
                                   sb, block_group, ignore_locked);

        desc = ext4_get_group_desc(sb, block_group, NULL);
        if (!desc)
                return ERR_PTR(-EFSCORRUPTED);
        bitmap_blk = ext4_block_bitmap(sb, desc);
        /*
         * The bitmap block must lie strictly after the first data block
         * and before the end of the filesystem.
         */
        if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
            (bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
                ext4_error(sb, "Invalid block bitmap block %llu in "
                           "block_group %u", bitmap_blk, block_group);
                ext4_mark_group_bitmap_corrupted(sb, block_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                return ERR_PTR(-EFSCORRUPTED);
        }
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
                ext4_warning(sb, "Cannot get buffer for block bitmap - "
                             "block_group = %u, block_bitmap = %llu",
                             block_group, bitmap_blk);
                return ERR_PTR(-ENOMEM);
        }

        if (ignore_locked && buffer_locked(bh)) {
                /* buffer under IO already, return if called for prefetching */
                put_bh(bh);
                return NULL;
        }

        /* Fast path: bitmap already up to date, only validation is left. */
        if (bitmap_uptodate(bh))
                goto verify;

        lock_buffer(bh);
        /* Re-check now that the buffer lock is held. */
        if (bitmap_uptodate(bh)) {
                unlock_buffer(bh);
                goto verify;
        }
        ext4_lock_group(sb, block_group);
        if (ext4_has_group_desc_csum(sb) &&
            (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
                /* Group 0 is never accepted as uninitialized. */
                if (block_group == 0) {
                        ext4_unlock_group(sb, block_group);
                        unlock_buffer(bh);
                        ext4_error(sb, "Block bitmap for bg 0 marked "
                                   "uninitialized");
                        err = -EFSCORRUPTED;
                        goto out;
                }
                err = ext4_init_block_bitmap(sb, bh, block_group, desc);
                if (err) {
                        ext4_unlock_group(sb, block_group);
                        unlock_buffer(bh);
                        ext4_error(sb, "Failed to init block bitmap for group "
                                   "%u: %d", block_group, err);
                        goto out;
                }
                /*
                 * The freshly initialized bitmap is marked up to date and
                 * verified, so it is returned without the verify step.
                 */
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
                set_buffer_verified(bh);
                ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
                return bh;
        }
        ext4_unlock_group(sb, block_group);
        if (buffer_uptodate(bh)) {
                /*
                 * The group is not uninit: if bh is uptodate,
                 * the bitmap is also uptodate.
                 */
                set_bitmap_uptodate(bh);
                unlock_buffer(bh);
                goto verify;
        }
        /*
         * Submit the buffer_head for reading.  buffer_new is set first;
         * ext4_wait_block_bitmap() tests it to decide whether there is a
         * read to wait for.  The buffer is still locked when it is handed
         * to ext4_read_bh_nowait().
         */
        set_buffer_new(bh);
        trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
        ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO |
                            (ignore_locked ? REQ_RAHEAD : 0),
                            ext4_end_bitmap_read);
        return bh;
verify:
        err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
        if (err)
                goto out;
        return bh;
out:
        /* Error exit: drop the reference taken by sb_getblk(). */
        put_bh(bh);
        return ERR_PTR(err);
}

/*
 * Wait for a block bitmap read on @bh to finish and validate the result.
 *
 * A buffer without buffer_new set is accepted as is (ext4_read_block_bitmap_nowait()
 * sets that flag only right before it submits a read).
 *
 * Returns 0 on success, -errno on error.
 */
int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
                           struct buffer_head *bh)
{
        struct ext4_group_desc *gdp;

        KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap,
                                   sb, block_group, bh);

        if (!buffer_new(bh))
                return 0;

        gdp = ext4_get_group_desc(sb, block_group, NULL);
        if (gdp == NULL)
                return -EFSCORRUPTED;

        wait_on_buffer(bh);
        ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);

        if (buffer_uptodate(bh)) {
                clear_buffer_new(bh);
                /* Panic or remount fs read-only if block bitmap is invalid */
                return ext4_validate_block_bitmap(sb, gdp, block_group, bh);
        }

        /* The read did not leave the buffer up to date: report an I/O error. */
        ext4_error_err(sb, EIO, "Cannot read block bitmap - "
                       "block_group = %u, block_bitmap = %llu",
                       block_group, (unsigned long long) bh->b_blocknr);
        ext4_mark_group_bitmap_corrupted(sb, block_group,
                                EXT4_GROUP_INFO_BBITMAP_CORRUPT);
        return -EIO;
}

/*
 * Synchronous variant: fetch the block bitmap of @block_group via
 * ext4_read_block_bitmap_nowait() and then wait for / validate it with
 * ext4_wait_block_bitmap().
 *
 * Returns the buffer_head on success or an ERR_PTR on failure; on a wait
 * failure the buffer reference is dropped before returning.
 */
struct buffer_head *
ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
        struct buffer_head *bitmap_bh;
        int ret;

        bitmap_bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
        if (IS_ERR(bitmap_bh))
                return bitmap_bh;

        ret = ext4_wait_block_bitmap(sb, block_group, bitmap_bh);
        if (!ret)
                return bitmap_bh;

        put_bh(bitmap_bh);
        return ERR_PTR(ret);
}

/**
 * ext4_has_free_clusters() - check whether clusters are available
 * @sbi:        in-core super block structure.
 * @nclusters:        number of clusters needed
 * @flags:        flags from ext4_mb_new_blocks()
 *
 * Check if the filesystem has @nclusters free and available for allocation,
 * taking dirty clusters, the root-reserved blocks and s_resv_clusters into
 * account.
 *
 * Return: 1 if the clusters are available, 0 otherwise.
 */
static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
                                  s64 nclusters, unsigned int flags)
{
        struct percpu_counter *free_ctr = &sbi->s_freeclusters_counter;
        struct percpu_counter *dirty_ctr = &sbi->s_dirtyclusters_counter;
        s64 nfree, ndirty, rsv_total, resv;

        nfree = percpu_counter_read_positive(free_ctr);
        ndirty = percpu_counter_read_positive(dirty_ctr);
        resv = atomic64_read(&sbi->s_resv_clusters);

        /*
         * r_blocks_count should always be a multiple of the cluster ratio,
         * so a plain bit shift converts it to clusters.
         */
        rsv_total = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
                    resv;

        /*
         * Close to the watermark the cheap counter reads are replaced by
         * full sums of both counters.
         */
        if (nfree - (nclusters + rsv_total + ndirty) <
                                        EXT4_FREECLUSTERS_WATERMARK) {
                nfree = percpu_counter_sum_positive(free_ctr);
                ndirty = percpu_counter_sum_positive(dirty_ctr);
        }

        /* Enough space even with dirty and all reserved clusters set aside? */
        if (nfree >= (rsv_total + nclusters + ndirty))
                return 1;

        /* Hm, nope.  Are (enough) root reserved clusters available? */
        if ((uid_eq(sbi->s_resuid, current_fsuid()) ||
             (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
             capable(CAP_SYS_RESOURCE) ||
             (flags & EXT4_MB_USE_ROOT_BLOCKS)) &&
            nfree >= (nclusters + ndirty + resv))
                return 1;

        /* No free blocks. Let's see if we can dip into reserved pool */
        return (flags & EXT4_MB_USE_RESERVED) &&
               nfree >= (nclusters + ndirty);
}

/*
 * Reserve @nclusters by adding them to the dirty clusters counter, provided
 * ext4_has_free_clusters() reports them as available.
 *
 * Returns 0 on success, -ENOSPC if the clusters are not available.
 */
int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
                             s64 nclusters, unsigned int flags)
{
        if (!ext4_has_free_clusters(sbi, nclusters, flags))
                return -ENOSPC;

        percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
        return 0;
}

/**
 * ext4_should_retry_alloc() - check if a block allocation should be retried
 * @sb:                        superblock
 * @retries:                number of retry attempts made so far
 *
 * ext4_should_retry_alloc() is called when ENOSPC is returned while
 * attempting to allocate blocks.  If there's an indication that a pending
 * journal transaction might free some space and allow another attempt to
 * succeed, this function will wait for the current or committing transaction
 * to complete and then return TRUE.
 *
 * *@retries is incremented on every call that gets past the journal check.
 *
 * Return: 0 if there is no journal or if more than 3 retries have been
 * made; the result of ext4_has_free_clusters() for a single cluster if no
 * freed blocks are pending; otherwise 1 after forcing a journal commit.
 */
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (!sbi->s_journal)
                return 0;

        /* Give up after the third retry and count the event. */
        if (++(*retries) > 3) {
                percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit);
                return 0;
        }

        /*
         * if there's no indication that blocks are about to be freed it's
         * possible we just missed a transaction commit that did so
         */
        smp_mb();
        if (sbi->s_mb_free_pending == 0) {
                if (test_opt(sb, DISCARD)) {
                        /*
                         * With the discard option, wait for the discard
                         * work to finish before re-checking free space;
                         * s_retry_alloc_pending is raised for the duration.
                         */
                        atomic_inc(&sbi->s_retry_alloc_pending);
                        flush_work(&sbi->s_discard_work);
                        atomic_dec(&sbi->s_retry_alloc_pending);
                }
                return ext4_has_free_clusters(sbi, 1, 0);
        }

        /*
         * it's possible we've just missed a transaction commit here,
         * so ignore the returned status
         */
        ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id);
        (void) jbd2_journal_force_commit_nested(sbi->s_journal);
        return 1;
}

/**
 * ext4_new_meta_blocks() - allocate block for meta data (indexing) blocks
 * @handle:        handle to this transaction
 * @inode:        file inode
 * @goal:        given target block (filesystem wide)
 * @flags:        allocation flags, passed on in the allocation request
 * @count:        optional; on entry the length to request (1 is requested
 *                when NULL), on return the length recorded in the request
 * @errp:        where the error code is stored
 *
 * Return: the value returned by ext4_mb_new_blocks(), i.e. the first
 * allocated block number on success; the error is reported through @errp.
 */
ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                                  ext4_fsblk_t goal, unsigned int flags,
                                  unsigned long *count, int *errp)
{
        struct ext4_allocation_request req;
        ext4_fsblk_t first_blk;

        memset(&req, 0, sizeof(req));
        req.inode = inode;
        req.goal = goal;
        if (count)
                req.len = *count;
        else
                req.len = 1;
        req.flags = flags;

        first_blk = ext4_mb_new_blocks(handle, &req, errp);
        if (count)
                *count = req.len;

        /* Quota accounting only applies to successful delalloc-reserved requests. */
        if (*errp || !(flags & EXT4_MB_DELALLOC_RESERVED))
                return first_blk;

        /*
         * Account for the allocated meta blocks.  We will never
         * fail EDQUOT for metadata, but we do account for it.
         */
        dquot_alloc_block_nofail(inode,
                        EXT4_C2B(EXT4_SB(inode->i_sb), req.len));
        return first_blk;
}

/**
 * ext4_count_free_clusters() -- count filesystem free clusters
 * @sb:                superblock
 *
 * Adds up the number of free clusters from each block group.
 *
 * Groups without a descriptor are skipped, and groups whose group info
 * flags the block bitmap as corrupt do not contribute to the descriptor
 * total.
 *
 * Return: the descriptor-based total in normal builds.  With EXT4FS_DEBUG
 * defined, the total counted from the on-disk bitmaps is returned instead
 * and both figures are printed.
 */
ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
{
        ext4_fsblk_t desc_count;
        struct ext4_group_desc *gdp;
        ext4_group_t i;
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct ext4_group_info *grp;
#ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        ext4_fsblk_t bitmap_count;
        unsigned int x;
        struct buffer_head *bitmap_bh = NULL;

        es = EXT4_SB(sb)->s_es;
        desc_count = 0;
        bitmap_count = 0;
        gdp = NULL;

        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
                /* Group info is only consulted when s_group_info is set. */
                grp = NULL;
                if (EXT4_SB(sb)->s_group_info)
                        grp = ext4_get_group_info(sb, i);
                if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
                        desc_count += ext4_free_group_clusters(sb, gdp);
                /* Release the previous group's bitmap before reading the next. */
                brelse(bitmap_bh);
                bitmap_bh = ext4_read_block_bitmap(sb, i);
                if (IS_ERR(bitmap_bh)) {
                        /* Reset so the next brelse() is not handed an ERR_PTR. */
                        bitmap_bh = NULL;
                        continue;
                }

                x = ext4_count_free(bitmap_bh->b_data,
                                    EXT4_CLUSTERS_PER_GROUP(sb) / 8);
                printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
                        i, ext4_free_group_clusters(sb, gdp), x);
                bitmap_count += x;
        }
        brelse(bitmap_bh);
        printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
               ", computed = %llu, %llu\n",
               EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
               desc_count, bitmap_count);
        return bitmap_count;
#else
        desc_count = 0;
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
                /* Group info is only consulted when s_group_info is set. */
                grp = NULL;
                if (EXT4_SB(sb)->s_group_info)
                        grp = ext4_get_group_info(sb, i);
                if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
                        desc_count += ext4_free_group_clusters(sb, gdp);
        }

        return desc_count;
#endif
}

/*
 * Return 1 if @a equals @b raised to some power >= 1, otherwise 0.
 * @a is divided by @b for as long as it is larger than @b and divides evenly.
 */
static inline int test_root(ext4_group_t a, int b)
{
        while (a > b) {
                if (a % b)
                        return 0;
                a /= b;
        }
        return a == b;
}

/**
 *        ext4_bg_has_super - number of blocks used by the superblock in group
 *        @sb: superblock for filesystem
 *        @group: group number to check
 *
 *        Return the number of blocks used by the superblock (primary or backup)
 *        in this group.  Currently this will be only 0 or 1.
 *
 *        Group 0 always has one.  With sparse_super2 only the two groups
 *        listed in s_backup_bgs have one.  Otherwise group 1 has one, every
 *        group has one when sparse_super is off, and with sparse_super on
 *        only odd groups that are powers of 3, 5 or 7 do.
 */
int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
{
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;

        if (group == 0)
                return 1;

        if (ext4_has_feature_sparse_super2(sb))
                return group == le32_to_cpu(es->s_backup_bgs[0]) ||
                       group == le32_to_cpu(es->s_backup_bgs[1]);

        if (group <= 1 || !ext4_has_feature_sparse_super(sb))
                return 1;

        /* Even groups never qualify under sparse_super. */
        if ((group & 1) == 0)
                return 0;

        return test_root(group, 3) || test_root(group, 5) ||
               test_root(group, 7);
}

/*
 * Return 1 if @group is the first, second or last group of the run of
 * EXT4_DESC_PER_BLOCK(sb) groups that contains it, otherwise 0.
 */
static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
                                        ext4_group_t group)
{
        unsigned long meta_idx = group / EXT4_DESC_PER_BLOCK(sb);
        ext4_group_t first_grp = meta_idx * EXT4_DESC_PER_BLOCK(sb);
        ext4_group_t last_grp = first_grp + EXT4_DESC_PER_BLOCK(sb) - 1;

        return group == first_grp || group == first_grp + 1 ||
               group == last_grp;
}

/*
 * Descriptor block count for a group outside the meta_bg scheme: 0 when the
 * group holds no superblock copy; otherwise s_first_meta_bg when the meta_bg
 * feature is on, or s_gdb_count when it is off.
 */
static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
                                        ext4_group_t group)
{
        if (!ext4_bg_has_super(sb, group))
                return 0;

        if (!ext4_has_feature_meta_bg(sb))
                return EXT4_SB(sb)->s_gdb_count;

        return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
}

/**
 *        ext4_bg_num_gdb - number of blocks used by the group table in group
 *        @sb: superblock for filesystem
 *        @group: group number to check
 *
 *        Return the number of blocks used by the group descriptor table
 *        (primary or backup) in this group.  Groups at or beyond
 *        s_first_meta_bg on a meta_bg filesystem use the meta_bg rule; all
 *        others use the non-meta_bg rule.
 */
unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
{
        unsigned long first_meta_bg =
                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
        unsigned long meta_idx = group / EXT4_DESC_PER_BLOCK(sb);

        if (ext4_has_feature_meta_bg(sb) && meta_idx >= first_meta_bg)
                return ext4_bg_num_gdb_meta(sb, group);

        return ext4_bg_num_gdb_nometa(sb, group);
}

/*
 * This function returns the number of file system metadata blocks at
 * the beginning of a block group, including the reserved gdt blocks.
 *
 * That is the superblock copy (if any) plus the descriptor blocks; the
 * reserved gdt blocks are only added for non-meta_bg groups that hold a
 * superblock copy.
 */
unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
                                       ext4_group_t block_group)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned num;

        /* Check for superblock and gdt backups in this group */
        num = ext4_bg_has_super(sb, block_group);

        /* For META_BG_BLOCK_GROUPS */
        if (ext4_has_feature_meta_bg(sb) &&
            block_group >= le32_to_cpu(sbi->s_es->s_first_meta_bg) *
                           sbi->s_desc_per_block) {
                num += ext4_bg_num_gdb_meta(sb, block_group);
                return num;
        }

        if (num) {
                num += ext4_bg_num_gdb_nometa(sb, block_group);
                num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
        }
        return num;
}

/*
 * Same as ext4_num_base_meta_blocks(), with the block count converted to
 * clusters through EXT4_NUM_B2C().
 */
static unsigned int ext4_num_base_meta_clusters(struct super_block *sb,
                                                ext4_group_t block_group)
{
        unsigned int nblocks = ext4_num_base_meta_blocks(sb, block_group);

        return EXT4_NUM_B2C(EXT4_SB(sb), nblocks);
}

/**
 *        ext4_inode_to_goal_block - return a hint for block allocation
 *        @inode: inode for block allocation
 *
 *        Return the ideal location to start allocating blocks for a
 *        newly created inode.
 */
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_group_t block_group;
        ext4_grpblk_t colour;
        int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;

        block_group = ei->i_block_group;
        if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
                /*
                 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
                 * block groups per flexgroup, reserve the first block
                 * group for directories and special files.  Regular
                 * files will start at the second block group.  This
                 * tends to speed up directory access and improves
                 * fsck times.
                 */
                block_group &= ~(flex_size-1);
                if (S_ISREG(inode->i_mode))
                        block_group++;
        }
        bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

        /*
         * If we are doing delayed allocation, we don't need take
         * colour into account.
         */
        if (test_opt(inode->i_sb, DELALLOC))
                return bg_start;

        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (task_pid_nr(current) % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
        else
                colour = (task_pid_nr(current) % 16) *
                        ((last_block - bg_start) / 16);
        return bg_start + colour;
}






























































   52 




























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UNWIND_H
#define _ASM_X86_UNWIND_H

#include <linux/sched.h>
#include <linux/ftrace.h>
#include <linux/rethook.h>
#include <asm/ptrace.h>
#include <asm/stacktrace.h>

/*
 * Byte offset of the 'ip' member within struct pt_regs, and the number of
 * bytes from that member to the end of the structure.
 */
#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
#define IRET_FRAME_SIZE   (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)

/*
 * State carried through a stack unwind.  The common members come first;
 * the rest of the layout depends on which unwinder is configured (ORC,
 * frame pointer, or neither).
 */
struct unwind_state {
        /* type == STACK_TYPE_UNKNOWN is what unwind_done() tests for */
        struct stack_info stack_info;
        unsigned long stack_mask;
        struct task_struct *task;
        /* cursor handed to ftrace_graph_ret_addr() */
        int graph_idx;
#if defined(CONFIG_RETHOOK)
        /* cursor handed to rethook_find_ret_addr() */
        struct llist_node *kr_cur;
#endif
        /* value reported by unwind_error() */
        bool error;
#if defined(CONFIG_UNWINDER_ORC)
        /* !full_regs is what unwind_get_entry_regs() reports as "partial" */
        bool signal, full_regs;
        unsigned long sp, bp, ip;
        struct pt_regs *regs, *prev_regs;
#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
        bool got_irq;
        unsigned long *bp, *orig_sp, ip;
        /*
         * If non-NULL: The current frame is incomplete and doesn't contain a
         * valid BP. When looking for the next frame, use this instead of the
         * non-existent saved BP.
         */
        unsigned long *next_bp;
        struct pt_regs *regs;
#else
        unsigned long *sp;
#endif
};

/*
 * Core unwinder interface.  These functions are only declared in this
 * header; unwind_start() below is the inline wrapper around
 * __unwind_start().
 */
void __unwind_start(struct unwind_state *state, struct task_struct *task,
                    struct pt_regs *regs, unsigned long *first_frame);
bool unwind_next_frame(struct unwind_state *state);
unsigned long unwind_get_return_address(struct unwind_state *state);
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state);

static inline bool unwind_done(struct unwind_state *state)
{
        return state->stack_info.type == STACK_TYPE_UNKNOWN;
}

/* Report the error flag stored in the unwind state. */
static inline bool unwind_error(struct unwind_state *state)
{
        return state->error;
}

/*
 * Begin an unwind.  When the caller passes no @first_frame, the result of
 * get_stack_pointer(task, regs) is used in its place.
 */
static inline
void unwind_start(struct unwind_state *state, struct task_struct *task,
                  struct pt_regs *regs, unsigned long *first_frame)
{
        if (!first_frame)
                first_frame = get_stack_pointer(task, regs);

        __unwind_start(state, task, regs, first_frame);
}

#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
/*
 * Return the pt_regs recorded in the unwind state, or NULL once the unwind
 * is done.  @partial may be NULL; when it is not, it is filled in before
 * returning.
 *
 * If 'partial' returns true, only the iret frame registers are valid.
 */
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
                                                    bool *partial)
{
        if (unwind_done(state))
                return NULL;

        if (partial) {
#ifdef CONFIG_UNWINDER_ORC
                *partial = !state->full_regs;
#else
                /* Without the ORC unwinder the regs are never reported as partial. */
                *partial = false;
#endif
        }

        return state->regs;
}
#else
/* No regs are tracked without the ORC or frame pointer unwinder. */
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
                                                    bool *partial)
{
        return NULL;
}
#endif

/*
 * Initialization hooks: real functions (declared here only) with the ORC
 * unwinder, empty inline stubs otherwise.
 */
#ifdef CONFIG_UNWINDER_ORC
void unwind_init(void);
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
                        void *orc, size_t orc_size);
#else
static inline void unwind_init(void) {}
static inline
void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
                        void *orc, size_t orc_size) {}
#endif

/*
 * With CONFIG_RETHOOK, if @addr is the rethook trampoline, return what
 * rethook_find_ret_addr() finds for @addr_p instead.  In every other case
 * @addr is returned unchanged.
 */
static inline
unsigned long unwind_recover_rethook(struct unwind_state *state,
                                     unsigned long addr, unsigned long *addr_p)
{
#ifdef CONFIG_RETHOOK
        if (is_rethook_trampoline(addr))
                return rethook_find_ret_addr(state->task, (unsigned long)addr_p,
                                             &state->kr_cur);
#endif
        return addr;
}

/*
 * Recover the return address modified by rethook and ftrace_graph: the
 * address is first passed through ftrace_graph_ret_addr(), and that result
 * is then passed through unwind_recover_rethook().
 */
static inline
unsigned long unwind_recover_ret_addr(struct unwind_state *state,
                                     unsigned long addr, unsigned long *addr_p)
{
        unsigned long graph_addr = ftrace_graph_ret_addr(state->task,
                                                         &state->graph_idx,
                                                         addr, addr_p);

        return unwind_recover_rethook(state, graph_addr, addr_p);
}

/*
 * This disables KASAN checking when reading a value from another task's stack,
 * since the other task could be running on another CPU and could have poisoned
 * the stack in the meantime.
 *
 * @task: task that owns the stack; reads for 'current' use READ_ONCE(),
 *        reads for any other task use READ_ONCE_NOCHECK().
 * @x:    lvalue to read.
 *
 * Evaluates to the value read, as an unsigned long.  @task is parenthesized
 * so that an expression argument (e.g. a conditional expression) is compared
 * against 'current' as a whole.
 */
#define READ_ONCE_TASK_STACK(task, x)                        \
({                                                        \
        unsigned long val;                                \
        if ((task) == current)                                \
                val = READ_ONCE(x);                        \
        else                                                \
                val = READ_ONCE_NOCHECK(x);                \
        val;                                                \
})

/*
 * With CONFIG_SMP: true when @task is not the current task and its on_cpu
 * field is non-zero.  Without CONFIG_SMP: always false.
 */
static inline bool task_on_another_cpu(struct task_struct *task)
{
#ifdef CONFIG_SMP
        return task != current && task->on_cpu;
#else
        return false;
#endif
}

#endif /* _ASM_X86_UNWIND_H */







































































































































    1 
    1 





















    1 
    1 

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Digital Audio (PCM) abstract layer
 *  Copyright (c) by Jaroslav Kysela <perex@perex.cz>
 */

#include <linux/compat.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/time.h>
#include <linux/pm_qos.h>
#include <linux/io.h>
#include <linux/dma-mapping.h>
#include <linux/vmalloc.h>
#include <sound/core.h>
#include <sound/control.h>
#include <sound/info.h>
#include <sound/pcm.h>
#include <sound/pcm_params.h>
#include <sound/timer.h>
#include <sound/minors.h>
#include <linux/uio.h>
#include <linux/delay.h>

#include "pcm_local.h"

/*
 * hw_params refinement tracing.
 *
 * With CONFIG_SND_DEBUG the tracepoints are pulled in from pcm_param_trace.h.
 * Otherwise the stubs below are used: the *_enabled() checks evaluate to 0
 * and the trace calls themselves expand to nothing.
 */
#ifdef CONFIG_SND_DEBUG
#define CREATE_TRACE_POINTS
#include "pcm_param_trace.h"
#else
#define trace_hw_mask_param_enabled()                0
#define trace_hw_interval_param_enabled()        0
#define trace_hw_mask_param(substream, type, index, prev, curr)
#define trace_hw_interval_param(substream, type, index, prev, curr)
#endif

/*
 *  Compatibility
 */

/*
 * Hardware parameter block as exchanged by the *_OLD hw_refine/hw_params
 * ioctls (see the SNDRV_PCM_IOCTL_HW_*_OLD definitions that use this type).
 * Being an ioctl argument, the member order and sizes must not be changed.
 *
 * NOTE(review): the scalar members presumably carry the same meaning as the
 * equally named members of struct snd_pcm_hw_params - confirm against the
 * conversion code, which is outside this part of the file.
 */
struct snd_pcm_hw_params_old {
        unsigned int flags;
        /* one word per mask parameter, ACCESS through SUBFORMAT */
        unsigned int masks[SNDRV_PCM_HW_PARAM_SUBFORMAT -
                           SNDRV_PCM_HW_PARAM_ACCESS + 1];
        /* one interval per interval parameter, SAMPLE_BITS through TICK_TIME */
        struct snd_interval intervals[SNDRV_PCM_HW_PARAM_TICK_TIME -
                                        SNDRV_PCM_HW_PARAM_SAMPLE_BITS + 1];
        unsigned int rmask;
        unsigned int cmask;
        unsigned int info;
        unsigned int msbits;
        unsigned int rate_num;
        unsigned int rate_den;
        snd_pcm_uframes_t fifo_size;
        unsigned char reserved[64];
};

/*
 * Old-API ioctls: only available with CONFIG_SND_SUPPORT_OLD_API.  Both take
 * a struct snd_pcm_hw_params_old as a read/write (_IOWR) argument.
 */
#ifdef CONFIG_SND_SUPPORT_OLD_API
#define SNDRV_PCM_IOCTL_HW_REFINE_OLD _IOWR('A', 0x10, struct snd_pcm_hw_params_old)
#define SNDRV_PCM_IOCTL_HW_PARAMS_OLD _IOWR('A', 0x11, struct snd_pcm_hw_params_old)

/* Handlers for the two ioctls above; defined later in this file. */
static int snd_pcm_hw_refine_old_user(struct snd_pcm_substream *substream,
                                      struct snd_pcm_hw_params_old __user * _oparams);
static int snd_pcm_hw_params_old_user(struct snd_pcm_substream *substream,
                                      struct snd_pcm_hw_params_old __user * _oparams);
#endif
/* Forward declaration; needed regardless of the old-API configuration. */
static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream);

/*
 * File-local read/write semaphore.
 *
 * NOTE(review): going by its name this serialises PCM stream link/unlink
 * handling; the code taking it is not in this part of the file - confirm
 * against its users.
 */

static DECLARE_RWSEM(snd_pcm_link_rwsem);

/*
 * Initialise a PCM group: a single reference, an empty substream list and
 * both lock flavours (the mutex for nonatomic use, the spinlock otherwise).
 */
void snd_pcm_group_init(struct snd_pcm_group *group)
{
        refcount_set(&group->refs, 1);
        INIT_LIST_HEAD(&group->substreams);
        mutex_init(&group->mutex);
        spin_lock_init(&group->lock);
}

/*
 * Define group lock helpers.
 *
 * DEFINE_PCM_GROUP_LOCK(action, mutex_action) generates
 *
 *   static void snd_pcm_group_<action>(struct snd_pcm_group *group,
 *                                      bool nonatomic);
 *
 * which calls mutex_<mutex_action>() on group->mutex when @nonatomic is
 * true and spin_<action>() on group->lock otherwise.
 */
#define DEFINE_PCM_GROUP_LOCK(action, mutex_action) \
static void snd_pcm_group_ ## action(struct snd_pcm_group *group, bool nonatomic) \
{ \
        if (nonatomic) \
                mutex_ ## mutex_action(&group->mutex); \
        else \
                spin_ ## action(&group->lock); \
}

/*
 * The _irq variants differ from the plain ones only in the spinlock case
 * (spin_lock_irq/spin_unlock_irq); with a mutex they are plain lock/unlock.
 */
DEFINE_PCM_GROUP_LOCK(lock, lock);
DEFINE_PCM_GROUP_LOCK(unlock, unlock);
DEFINE_PCM_GROUP_LOCK(lock_irq, lock);
DEFINE_PCM_GROUP_LOCK(unlock_irq, unlock);

/**
 * snd_pcm_stream_lock - Lock the PCM stream
 * @substream: PCM substream
 *
 * This locks the PCM stream's spinlock or mutex depending on the nonatomic
 * flag of the given substream.  This also takes the global link rw lock
 * (or rw sem), too, for avoiding the race with linked streams.
 */
void snd_pcm_stream_lock(struct snd_pcm_substream *substream)
{
        snd_pcm_group_lock(&substream->self_group, substream->pcm->nonatomic);
}
EXPORT_SYMBOL_GPL(snd_pcm_stream_lock);

/**
 * snd_pcm_stream_unlock - Unlock the PCM stream
 * @substream: PCM substream
 *
 * This unlocks the PCM stream that has been locked via snd_pcm_stream_lock().
 */
void snd_pcm_stream_unlock(struct snd_pcm_substream *substream)
{
        snd_pcm_group_unlock(&substream->self_group, substream->pcm->nonatomic);
}
EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);

/**
 * snd_pcm_stream_lock_irq - Lock the PCM stream
 * @substream: PCM substream
 *
 * This locks the PCM stream like snd_pcm_stream_lock() and disables the local
 * IRQ (only when nonatomic is false).  In nonatomic case, this is identical
 * as snd_pcm_stream_lock().
 */
void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
{
        snd_pcm_group_lock_irq(&substream->self_group,
                               substream->pcm->nonatomic);
}
EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);

/*
 * Lock the substream's self group with the SINGLE_DEPTH_NESTING lockdep
 * subclass: spinlock for atomic PCMs, mutex for nonatomic ones.
 */
static void snd_pcm_stream_lock_nested(struct snd_pcm_substream *substream)
{
        if (!substream->pcm->nonatomic) {
                spin_lock_nested(&substream->self_group.lock,
                                 SINGLE_DEPTH_NESTING);
                return;
        }

        mutex_lock_nested(&substream->self_group.mutex, SINGLE_DEPTH_NESTING);
}

/**
 * snd_pcm_stream_unlock_irq - Unlock the PCM stream
 * @substream: PCM substream
 *
 * This is a counter-part of snd_pcm_stream_lock_irq().
 */
void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
{
        snd_pcm_group_unlock_irq(&substream->self_group,
                                 substream->pcm->nonatomic);
}
EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);

unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
{
        unsigned long flags = 0;
        if (substream->pcm->nonatomic)
                mutex_lock(&substream->self_group.mutex);
        else
                spin_lock_irqsave(&substream->self_group.lock, flags);
        return flags;
}
EXPORT_SYMBOL_GPL(_snd_pcm_stream_lock_irqsave);

unsigned long _snd_pcm_stream_lock_irqsave_nested(struct snd_pcm_substream *substream)
{
        unsigned long flags = 0;
        if (substream->pcm->nonatomic)
                mutex_lock_nested(&substream->self_group.mutex,
                                  SINGLE_DEPTH_NESTING);
        else
                spin_lock_irqsave_nested(&substream->self_group.lock, flags,
                                         SINGLE_DEPTH_NESTING);
        return flags;
}
EXPORT_SYMBOL_GPL(_snd_pcm_stream_lock_irqsave_nested);

/**
 * snd_pcm_stream_unlock_irqrestore - Unlock the PCM stream
 * @substream: PCM substream
 * @flags: irq flags
 *
 * This is a counter-part of snd_pcm_stream_lock_irqsave().
 */
void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
                                      unsigned long flags)
{
        if (substream->pcm->nonatomic)
                mutex_unlock(&substream->self_group.mutex);
        else
                spin_unlock_irqrestore(&substream->self_group.lock, flags);
}
EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);

/*
 * Run the PCM ioctl op: the driver's own ->ioctl callback when it provides
 * one, the generic snd_pcm_lib_ioctl() otherwise.  The callee's return
 * value is passed through unchanged.
 */
static int snd_pcm_ops_ioctl(struct snd_pcm_substream *substream,
                             unsigned cmd, void *arg)
{
        if (!substream->ops->ioctl)
                return snd_pcm_lib_ioctl(substream, cmd, arg);

        return substream->ops->ioctl(substream, cmd, arg);
}

int snd_pcm_info(struct snd_pcm_substream *substream, struct snd_pcm_info *info)
{
        struct snd_pcm *pcm = substream->pcm;
        struct snd_pcm_str *pstr = substream->pstr;

        memset(info, 0, sizeof(*info));
        info->card = pcm->card->number;
        info->device = pcm->device;
        info->stream = substream->stream;
        info->subdevice = substream->number;
        strscpy(info->id, pcm->id, sizeof(info->id));
        strscpy(info->name, pcm->name, sizeof(info->name));
        info->dev_class = pcm->dev_class;
        info->dev_subclass = pcm->dev_subclass;
        info->subdevices_count = pstr->substream_count;
        info->subdevices_avail = pstr->substream_count - pstr->substream_opened;
        strscpy(info->subname, substream->name, sizeof(info->subname));

        return 0;
}

/*
 * Build the info of @substream in a temporary kernel buffer and copy it to
 * the user-space pointer @_info.
 *
 * Returns the result of snd_pcm_info() on success, -ENOMEM when the buffer
 * cannot be allocated and -EFAULT when the copy to user space fails.  The
 * buffer is released automatically (__free(kfree)) on every return path.
 */
int snd_pcm_info_user(struct snd_pcm_substream *substream,
                      struct snd_pcm_info __user * _info)
{
        struct snd_pcm_info *kinfo __free(kfree) =
                kmalloc(sizeof(*kinfo), GFP_KERNEL);
        int ret;

        if (kinfo == NULL)
                return -ENOMEM;

        ret = snd_pcm_info(substream, kinfo);
        if (ret < 0)
                return ret;

        if (copy_to_user(_info, kinfo, sizeof(*kinfo)))
                return -EFAULT;

        return ret;
}

/*
 * macro for simplified cast: turn a hw parameter index into its single-bit
 * mask, as tested against params->rmask and set in params->cmask below.
 * The __force cast lets the index be used as a plain int shift count.
 */
#define PARAM_MASK_BIT(b)        (1U << (__force int)(b))

static bool hw_support_mmap(struct snd_pcm_substream *substream)
{
        struct snd_dma_buffer *dmabuf;

        if (!(substream->runtime->hw.info & SNDRV_PCM_INFO_MMAP))
                return false;

        if (substream->ops->mmap || substream->ops->page)
                return true;

        dmabuf = snd_pcm_get_dma_buf(substream);
        if (!dmabuf)
                dmabuf = &substream->dma_buffer;
        switch (dmabuf->dev.type) {
        case SNDRV_DMA_TYPE_UNKNOWN:
                /* we can't know the device, so just assume that the driver does
                 * everything right
                 */
                return true;
        case SNDRV_DMA_TYPE_CONTINUOUS:
        case SNDRV_DMA_TYPE_VMALLOC:
                return true;
        default:
                return dma_can_mmap(dmabuf->dev.dev);
        }
}

static int constrain_mask_params(struct snd_pcm_substream *substream,
                                 struct snd_pcm_hw_params *params)
{
        struct snd_pcm_hw_constraints *constrs =
                                        &substream->runtime->hw_constraints;
        struct snd_mask *m;
        unsigned int k;
        struct snd_mask old_mask __maybe_unused;
        int changed;

        for (k = SNDRV_PCM_HW_PARAM_FIRST_MASK; k <= SNDRV_PCM_HW_PARAM_LAST_MASK; k++) {
                m = hw_param_mask(params, k);
                if (snd_mask_empty(m))
                        return -EINVAL;

                /* This parameter is not requested to change by a caller. */
                if (!(params->rmask & PARAM_MASK_BIT(k)))
                        continue;

                if (trace_hw_mask_param_enabled())
                        old_mask = *m;

                changed = snd_mask_refine(m, constrs_mask(constrs, k));
                if (changed < 0)
                        return changed;
                if (changed == 0)
                        continue;

                /* Set corresponding flag so that the caller gets it. */
                trace_hw_mask_param(substream, k, 0, &old_mask, m);
                params->cmask |= PARAM_MASK_BIT(k);
        }

        return 0;
}

static int constrain_interval_params(struct snd_pcm_substream *substream,
                                     struct snd_pcm_hw_params *params)
{
        struct snd_pcm_hw_constraints *constrs =
                                        &substream->runtime->hw_constraints;
        struct snd_interval *i;
        unsigned int k;
        struct snd_interval old_interval __maybe_unused;
        int changed;

        for (k = SNDRV_PCM_HW_PARAM_FIRST_INTERVAL; k <= SNDRV_PCM_HW_PARAM_LAST_INTERVAL; k++) {
                i = hw_param_interval(params, k);
                if (snd_interval_empty(i))
                        return -EINVAL;

                /* This parameter is not requested to change by a caller. */
                if (!(params->rmask & PARAM_MASK_BIT(k)))
                        continue;

                if (trace_hw_interval_param_enabled())
                        old_interval = *i;

                changed = snd_interval_refine(i, constrs_interval(constrs, k));
                if (changed < 0)
                        return changed;
                if (changed == 0)
                        continue;

                /* Set corresponding flag so that the caller gets it. */
                trace_hw_interval_param(substream, k, 0, &old_interval, i);
                params->cmask |= PARAM_MASK_BIT(k);
        }

        return 0;
}

static int constrain_params_by_rules(struct snd_pcm_substream *substream,
                                     struct snd_pcm_hw_params *params)
{
        struct snd_pcm_hw_constraints *constrs =
                                        &substream->runtime->hw_constraints;
        unsigned int k;
        unsigned int *rstamps __free(kfree) = NULL;
        unsigned int vstamps[SNDRV_PCM_HW_PARAM_LAST_INTERVAL + 1];
        unsigned int stamp;
        struct snd_pcm_hw_rule *r;
        unsigned int d;
        struct snd_mask old_mask __maybe_unused;
        struct snd_interval old_interval __maybe_unused;
        bool again;
        int changed, err = 0;

        /*
         * Each application of rule has own sequence number.
         *
         * Each member of 'rstamps' array represents the sequence number of
         * recent application of corresponding rule.
         */
        rstamps = kcalloc(constrs->rules_num, sizeof(unsigned int), GFP_KERNEL);
        if (!rstamps)
                return -ENOMEM;

        /*
         * Each member of 'vstamps' array represents the sequence number of
         * recent application of rule in which corresponding parameters were
         * changed.
         *
         * In initial state, elements corresponding to parameters requested by
         * a caller is 1. For unrequested parameters, corresponding members
         * have 0 so that the parameters are never changed anymore.
         */
        for (k = 0; k <= SNDRV_PCM_HW_PARAM_LAST_INTERVAL; k++)
                vstamps[k] = (params->rmask & PARAM_MASK_BIT(k)) ? 1 : 0;

        /* Due to the above design, actual sequence number starts at 2. */
        stamp = 2;
retry:
        /* Apply all rules in order. */
        again = false;
        for (k = 0; k < constrs->rules_num; k++) {
                r = &constrs->rules[k];

                /*
                 * Check condition bits of this rule. When the rule has
                 * some condition bits, parameter without the bits is
                 * never processed. SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP
                 * is an example of the condition bits.
                 */
                if (r->cond && !(r->cond & params->flags))
                        continue;

                /*
                 * The 'deps' array includes maximum four dependencies
                 * to SNDRV_PCM_HW_PARAM_XXXs for this rule. The fifth
                 * member of this array is a sentinel and should be
                 * negative value.
                 *
                 * This rule should be processed in this time when dependent
                 * parameters were changed at former applications of the other
                 * rules.
                 */
                for (d = 0; r->deps[d] >= 0; d++) {
                        if (vstamps[r->deps[d]] > rstamps[k])
                                break;
                }
                if (r->deps[d] < 0)
                        continue;

                if (trace_hw_mask_param_enabled()) {
                        if (hw_is_mask(r->var))
                                old_mask = *hw_param_mask(params, r->var);
                }
                if (trace_hw_interval_param_enabled()) {
                        if (hw_is_interval(r->var))
                                old_interval = *hw_param_interval(params, r->var);
                }

                changed = r->func(params, r);
                if (changed < 0)
                        return changed;

                /*
                 * When the parameter is changed, notify it to the caller
                 * by corresponding returned bit, then preparing for next
                 * iteration.
                 */
                if (changed && r->var >= 0) {
                        if (hw_is_mask(r->var)) {
                                trace_hw_mask_param(substream, r->var,
                                        k + 1, &old_mask,
                                        hw_param_mask(params, r->var));
                        }
                        if (hw_is_interval(r->var)) {
                                trace_hw_interval_param(substream, r->var,
                                        k + 1, &old_interval,
                                        hw_param_interval(params, r->var));
                        }

                        params->cmask |= PARAM_MASK_BIT(r->var);
                        vstamps[r->var] = stamp;
                        again = true;
                }

                rstamps[k] = stamp++;
        }

        /* Iterate to evaluate all rules till no parameters are changed. */
        if (again)
                goto retry;

        return err;
}

static int fixup_unreferenced_params(struct snd_pcm_substream *substream,
                                     struct snd_pcm_hw_params *params)
{
        const struct snd_interval *i;
        const struct snd_mask *m;
        struct snd_mask *m_rw;
        int err;

        if (!params->msbits) {
                i = hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_SAMPLE_BITS);
                if (snd_interval_single(i))
                        params->msbits = snd_interval_value(i);
                m = hw_param_mask_c(params, SNDRV_PCM_HW_PARAM_FORMAT);
                if (snd_mask_single(m)) {
                        snd_pcm_format_t format = (__force snd_pcm_format_t)snd_mask_min(m);
                        params->msbits = snd_pcm_format_width(format);
                }
        }

        if (params->msbits) {
                m = hw_param_mask_c(params, SNDRV_PCM_HW_PARAM_FORMAT);
                if (snd_mask_single(m)) {
                        snd_pcm_format_t format = (__force snd_pcm_format_t)snd_mask_min(m);

                        if (snd_pcm_format_linear(format) &&
                            snd_pcm_format_width(format) != params->msbits) {
                                m_rw = hw_param_mask(params, SNDRV_PCM_HW_PARAM_SUBFORMAT);
                                snd_mask_reset(m_rw,
                                               (__force unsigned)SNDRV_PCM_SUBFORMAT_MSBITS_MAX);
                                if (snd_mask_empty(m_rw))
                                        return -EINVAL;
                        }
                }
        }

        if (!params->rate_den) {
                i = hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_RATE);
                if (snd_interval_single(i)) {
                        params->rate_num = snd_interval_value(i);
                        params->rate_den = 1;
                }
        }

        if (!params->fifo_size) {
                m = hw_param_mask_c(params, SNDRV_PCM_HW_PARAM_FORMAT);
                i = hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_CHANNELS);
                if (snd_mask_single(m) && snd_interval_single(i)) {
                        err = snd_pcm_ops_ioctl(substream,
                                                SNDRV_PCM_IOCTL1_FIFO_SIZE,
                                                params);
                        if (err < 0)
                                return err;
                }
        }

        if (!params->info) {
                params->info = substream->runtime->hw.info;
                params->info &= ~(SNDRV_PCM_INFO_FIFO_IN_FRAMES |
                                  SNDRV_PCM_INFO_DRAIN_TRIGGER);
                if (!hw_support_mmap(substream))
                        params->info &= ~(SNDRV_PCM_INFO_MMAP |
                                          SNDRV_PCM_INFO_MMAP_VALID);
        }

        return 0;
}

/*
 * Refine the configuration space in @params for @substream.
 *
 * The derived fields info and fifo_size are always reset; msbits and
 * rate_num/rate_den are reset only when the parameter they depend on is
 * flagged in rmask.  The mask constraints, interval constraints and rules
 * are then applied in that order.  rmask is cleared on success.
 *
 * Returns 0 on success or the negative error of the failing step.
 */
int snd_pcm_hw_refine(struct snd_pcm_substream *substream,
                      struct snd_pcm_hw_params *params)
{
        int ret;

        params->info = 0;
        params->fifo_size = 0;
        if (params->rmask & PARAM_MASK_BIT(SNDRV_PCM_HW_PARAM_SAMPLE_BITS))
                params->msbits = 0;
        if (params->rmask & PARAM_MASK_BIT(SNDRV_PCM_HW_PARAM_RATE)) {
                params->rate_num = 0;
                params->rate_den = 0;
        }

        /* each stage runs only if the previous one did not fail */
        ret = constrain_mask_params(substream, params);
        if (ret >= 0)
                ret = constrain_interval_params(substream, params);
        if (ret >= 0)
                ret = constrain_params_by_rules(substream, params);
        if (ret < 0)
                return ret;

        params->rmask = 0;

        return 0;
}
EXPORT_SYMBOL(snd_pcm_hw_refine);

/*
 * User-space entry for hw_refine: copy the parameters in, refine them,
 * fill the derived fields and copy the result back to @_params.
 *
 * Returns 0 on success, the memdup_user() error, the error of the
 * refine/fixup step, or -EFAULT when the copy back fails.
 */
static int snd_pcm_hw_refine_user(struct snd_pcm_substream *substream,
                                  struct snd_pcm_hw_params __user * _params)
{
        struct snd_pcm_hw_params *params __free(kfree) = NULL;
        int ret;

        params = memdup_user(_params, sizeof(*params));
        if (IS_ERR(params))
                return PTR_ERR(no_free_ptr(params));

        ret = snd_pcm_hw_refine(substream, params);
        if (ret >= 0)
                ret = fixup_unreferenced_params(substream, params);
        if (ret < 0)
                return ret;

        return copy_to_user(_params, params, sizeof(*params)) ? -EFAULT : 0;
}

/*
 * Return 75% of the period time of @runtime in microseconds, or -1 when
 * no rate is set.
 */
static int period_to_usecs(struct snd_pcm_runtime *runtime)
{
        const unsigned int rate = runtime->rate;
        int deadline;

        if (rate == 0)
                return -1; /* invalid */

        /*
         * 750000 * period_size / rate, evaluated as quotient part plus
         * remainder part.
         */
        deadline = (750000 / rate) * runtime->period_size;
        deadline += ((750000 % rate) * runtime->period_size) / rate;

        return deadline;
}

/*
 * Switch @substream to @state while holding the stream lock.
 * A stream that is already DISCONNECTED is left untouched.
 */
static void snd_pcm_set_state(struct snd_pcm_substream *substream,
                              snd_pcm_state_t state)
{
        guard(pcm_stream_lock_irq)(substream);
        if (substream->runtime->state == SNDRV_PCM_STATE_DISCONNECTED)
                return;
        __snd_pcm_set_state(substream->runtime, state);
}

/*
 * Pass @event and the runtime's trigger timestamp to the timer attached
 * to @substream, if any.  The body is empty when CONFIG_SND_PCM_TIMER is
 * not set.
 */
static inline void snd_pcm_timer_notify(struct snd_pcm_substream *substream,
                                        int event)
{
#ifdef CONFIG_SND_PCM_TIMER
        if (!substream->timer)
                return;
        snd_timer_notify(substream->timer, event,
                         &substream->runtime->trigger_tstamp);
#endif
}

/*
 * Complete a pending stop of @substream.
 *
 * Nothing happens unless the runtime exists and has stop_operating set.
 * Otherwise the flag is cleared and either the driver's sync_stop op is
 * called or, when @sync_irq is true and the card has a positive sync_irq,
 * that interrupt is synchronized.
 */
void snd_pcm_sync_stop(struct snd_pcm_substream *substream, bool sync_irq)
{
        struct snd_pcm_runtime *runtime = substream->runtime;

        if (!runtime || !runtime->stop_operating)
                return;

        runtime->stop_operating = false;
        if (substream->ops && substream->ops->sync_stop) {
                /* the driver callback takes precedence over the IRQ sync */
                substream->ops->sync_stop(substream);
        } else if (sync_irq && substream->pcm->card->sync_irq > 0) {
                synchronize_irq(substream->pcm->card->sync_irq);
        }
}

/**
 * snd_pcm_hw_params_choose - choose a configuration defined by @params
 * @pcm: PCM instance
 * @params: the hw_params instance
 *
 * Choose one configuration from configuration space defined by @params.
 * The configuration chosen is that obtained fixing in this order:
 * first access, first format, first subformat, min channels,
 * min rate, min period time, max buffer size, min tick time
 *
 * Return: Zero if successful, or a negative error code on failure.
 */
static int snd_pcm_hw_params_choose(struct snd_pcm_substream *pcm,
                                    struct snd_pcm_hw_params *params)
{
        static const int vars[] = {
                SNDRV_PCM_HW_PARAM_ACCESS,
                SNDRV_PCM_HW_PARAM_FORMAT,
                SNDRV_PCM_HW_PARAM_SUBFORMAT,
                SNDRV_PCM_HW_PARAM_CHANNELS,
                SNDRV_PCM_HW_PARAM_RATE,
                SNDRV_PCM_HW_PARAM_PERIOD_TIME,
                SNDRV_PCM_HW_PARAM_BUFFER_SIZE,
                SNDRV_PCM_HW_PARAM_TICK_TIME,
                -1
        };
        const int *v;
        struct snd_mask old_mask __maybe_unused;
        struct snd_interval old_interval __maybe_unused;
        int changed;

        for (v = vars; *v != -1; v++) {
                /* Keep old parameter to trace. */
                if (trace_hw_mask_param_enabled()) {
                        if (hw_is_mask(*v))
                                old_mask = *hw_param_mask(params, *v);
                }
                if (trace_hw_interval_param_enabled()) {
                        if (hw_is_interval(*v))
                                old_interval = *hw_param_interval(params, *v);
                }
                if (*v != SNDRV_PCM_HW_PARAM_BUFFER_SIZE)
                        changed = snd_pcm_hw_param_first(pcm, params, *v, NULL);
                else
                        changed = snd_pcm_hw_param_last(pcm, params, *v, NULL);
                if (changed < 0)
                        return changed;
                if (changed == 0)
                        continue;

                /* Trace the changed parameter. */
                if (hw_is_mask(*v)) {
                        trace_hw_mask_param(pcm, *v, 0, &old_mask,
                                            hw_param_mask(params, *v));
                }
                if (hw_is_interval(*v)) {
                        trace_hw_interval_param(pcm, *v, 0, &old_interval,
                                                hw_param_interval(params, *v));
                }
        }

        return 0;
}

/*
 * Take buffer_mutex and block further r/w operations.
 * Returns -EBUSY without locking when the buffer_accessing counter cannot
 * be decremented, i.e. an r/w operation is in progress; returns 0 with
 * buffer_mutex held otherwise.
 */
static int snd_pcm_buffer_access_lock(struct snd_pcm_runtime *runtime)
{
        if (atomic_dec_unless_positive(&runtime->buffer_accessing)) {
                mutex_lock(&runtime->buffer_mutex);
                /* the mutex stays held; snd_pcm_buffer_access_unlock() drops it */
                return 0;
        }
        return -EBUSY;
}

/*
 * Counterpart of snd_pcm_buffer_access_lock(): release buffer_mutex and
 * then increment buffer_accessing again, undoing the decrement done when
 * the lock was taken.
 */
static void snd_pcm_buffer_access_unlock(struct snd_pcm_runtime *runtime)
{
        mutex_unlock(&runtime->buffer_mutex);
        atomic_inc(&runtime->buffer_accessing);
}

/*
 * is_oss_stream() evaluates to the substream's oss.oss field when
 * CONFIG_SND_PCM_OSS is enabled and to constant false otherwise.
 */
#if IS_ENABLED(CONFIG_SND_PCM_OSS)
#define is_oss_stream(substream)        ((substream)->oss.oss)
#else
#define is_oss_stream(substream)        false
#endif

/*
 * Install the hardware configuration described by @params on @substream.
 *
 * All parameters are refined, one concrete configuration is chosen, the
 * derived fields are filled in, the managed buffer is (re)allocated when in
 * use and the driver's hw_params op is called.  The result is then copied
 * into the runtime together with default software parameters and the state
 * becomes SETUP.  If any step after the state check fails, the state is
 * reset to OPEN and the driver's hw_free op and the managed buffer are
 * released.
 *
 * Returns 0 on success, -ENXIO when the runtime check fails, the error of
 * snd_pcm_buffer_access_lock(), -EBADFD when the state does not allow the
 * call (or a non-OSS stream is mmapped), or the negative error of the
 * failing step.
 */
static int snd_pcm_hw_params(struct snd_pcm_substream *substream,
                             struct snd_pcm_hw_params *params)
{
        struct snd_pcm_runtime *runtime;
        int err, usecs;
        unsigned int bits;
        snd_pcm_uframes_t frames;

        if (PCM_RUNTIME_CHECK(substream))
                return -ENXIO;
        runtime = substream->runtime;
        /* refuse to reconfigure while an r/w operation uses the buffer */
        err = snd_pcm_buffer_access_lock(runtime);
        if (err < 0)
                return err;
        /* only OPEN, SETUP and PREPARED streams may be (re)configured */
        scoped_guard(pcm_stream_lock_irq, substream) {
                switch (runtime->state) {
                case SNDRV_PCM_STATE_OPEN:
                case SNDRV_PCM_STATE_SETUP:
                case SNDRV_PCM_STATE_PREPARED:
                        if (!is_oss_stream(substream) &&
                            atomic_read(&substream->mmap_count))
                                err = -EBADFD;
                        break;
                default:
                        err = -EBADFD;
                        break;
                }
        }
        if (err)
                goto unlock;

        snd_pcm_sync_stop(substream, true);

        /* request refinement of every parameter */
        params->rmask = ~0U;
        err = snd_pcm_hw_refine(substream, params);
        if (err < 0)
                goto _error;

        err = snd_pcm_hw_params_choose(substream, params);
        if (err < 0)
                goto _error;

        err = fixup_unreferenced_params(substream, params);
        if (err < 0)
                goto _error;

        if (substream->managed_buffer_alloc) {
                err = snd_pcm_lib_malloc_pages(substream,
                                               params_buffer_bytes(params));
                if (err < 0)
                        goto _error;
                /* a positive return value is recorded as "buffer changed" */
                runtime->buffer_changed = err > 0;
        }

        if (substream->ops->hw_params != NULL) {
                err = substream->ops->hw_params(substream, params);
                if (err < 0)
                        goto _error;
        }

        /* copy the chosen configuration into the runtime */
        runtime->access = params_access(params);
        runtime->format = params_format(params);
        runtime->subformat = params_subformat(params);
        runtime->channels = params_channels(params);
        runtime->rate = params_rate(params);
        runtime->period_size = params_period_size(params);
        runtime->periods = params_periods(params);
        runtime->buffer_size = params_buffer_size(params);
        runtime->info = params->info;
        runtime->rate_num = params->rate_num;
        runtime->rate_den = params->rate_den;
        /* period wakeups are disabled only if both hw and caller agree */
        runtime->no_period_wakeup =
                        (params->info & SNDRV_PCM_INFO_NO_PERIOD_WAKEUP) &&
                        (params->flags & SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP);

        bits = snd_pcm_format_physical_width(runtime->format);
        runtime->sample_bits = bits;
        bits *= runtime->channels;
        runtime->frame_bits = bits;
        /*
         * Double the frame count until the group occupies a whole number
         * of bytes: min_align is that frame count, byte_align its size.
         */
        frames = 1;
        while (bits % 8 != 0) {
                bits *= 2;
                frames *= 2;
        }
        runtime->byte_align = bits / 8;
        runtime->min_align = frames;

        /* Default sw params */
        runtime->tstamp_mode = SNDRV_PCM_TSTAMP_NONE;
        runtime->period_step = 1;
        runtime->control->avail_min = runtime->period_size;
        runtime->start_threshold = 1;
        runtime->stop_threshold = runtime->buffer_size;
        runtime->silence_threshold = 0;
        runtime->silence_size = 0;
        /*
         * boundary: buffer_size doubled as long as the doubled value stays
         * within LONG_MAX - buffer_size
         */
        runtime->boundary = runtime->buffer_size;
        while (runtime->boundary * 2 <= LONG_MAX - runtime->buffer_size)
                runtime->boundary *= 2;

        /* clear the buffer for avoiding possible kernel info leaks */
        if (runtime->dma_area && !substream->ops->copy) {
                size_t size = runtime->dma_bytes;

                /* for mmap-capable streams the cleared size is page-aligned */
                if (runtime->info & SNDRV_PCM_INFO_MMAP)
                        size = PAGE_ALIGN(size);
                memset(runtime->dma_area, 0, size);
        }

        snd_pcm_timer_resolution_change(substream);
        snd_pcm_set_state(substream, SNDRV_PCM_STATE_SETUP);

        /* replace any previous CPU latency request with one for the new period */
        if (cpu_latency_qos_request_active(&substream->latency_pm_qos_req))
                cpu_latency_qos_remove_request(&substream->latency_pm_qos_req);
        usecs = period_to_usecs(runtime);
        if (usecs >= 0)
                cpu_latency_qos_add_request(&substream->latency_pm_qos_req,
                                            usecs);
        err = 0;
 _error:
        /* reached on success as well; the cleanup runs only when err is set */
        if (err) {
                /* hardware might be unusable from this time,
                 * so we force application to retry to set
                 * the correct hardware parameter settings
                 */
                snd_pcm_set_state(substream, SNDRV_PCM_STATE_OPEN);
                if (substream->ops->hw_free != NULL)
                        substream->ops->hw_free(substream);
                if (substream->managed_buffer_alloc)
                        snd_pcm_lib_free_pages(substream);
        }
 unlock:
        snd_pcm_buffer_access_unlock(runtime);
        return err;
}

/*
 * User-space entry for hw_params: copy the parameters in, apply them and
 * copy the resulting parameters back to @_params.
 *
 * Returns the non-negative result of snd_pcm_hw_params() on success, the
 * memdup_user() error, the negative error of snd_pcm_hw_params(), or
 * -EFAULT when the copy back fails.
 */
static int snd_pcm_hw_params_user(struct snd_pcm_substream *substream,
                                  struct snd_pcm_hw_params __user * _params)
{
        struct snd_pcm_hw_params *params __free(kfree) = NULL;
        int ret;

        params = memdup_user(_params, sizeof(*params));
        if (IS_ERR(params))
                return PTR_ERR(no_free_ptr(params));

        ret = snd_pcm_hw_params(substream, params);
        if (ret < 0)
                return ret;

        return copy_to_user(_params, params, sizeof(*params)) ? -EFAULT : ret;
}

/*
 * Release the hardware resources of @substream: finish a pending stop,
 * call the driver's hw_free op if present and free the managed buffer.
 *
 * Returns the result of the hw_free op, or 0 when the driver has none.
 */
static int do_hw_free(struct snd_pcm_substream *substream)
{
        int ret;

        snd_pcm_sync_stop(substream, true);
        ret = substream->ops->hw_free ?
                substream->ops->hw_free(substream) : 0;
        if (substream->managed_buffer_alloc)
                snd_pcm_lib_free_pages(substream);
        return ret;
}

/*
 * Free the hardware configuration of @substream and return it to the
 * OPEN state.
 *
 * Only allowed in the SETUP and PREPARED states while the stream is not
 * mmapped; otherwise -EBADFD is returned.  Also returns -ENXIO when the
 * runtime check fails and the error of snd_pcm_buffer_access_lock().
 * On the normal path the result of do_hw_free() is returned.
 */
static int snd_pcm_hw_free(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *runtime;
        int result = 0;

        if (PCM_RUNTIME_CHECK(substream))
                return -ENXIO;
        runtime = substream->runtime;
        result = snd_pcm_buffer_access_lock(runtime);
        if (result < 0)
                return result;

        scoped_guard(pcm_stream_lock_irq, substream) {
                const snd_pcm_state_t state = runtime->state;

                if ((state != SNDRV_PCM_STATE_SETUP &&
                     state != SNDRV_PCM_STATE_PREPARED) ||
                    atomic_read(&substream->mmap_count))
                        result = -EBADFD;
        }

        if (!result) {
                result = do_hw_free(substream);
                /* the state and the latency request are reset in any case */
                snd_pcm_set_state(substream, SNDRV_PCM_STATE_OPEN);
                cpu_latency_qos_remove_request(&substream->latency_pm_qos_req);
        }

        snd_pcm_buffer_access_unlock(runtime);
        return result;
}

static int snd_pcm_sw_params(struct snd_pcm_substream *substream,
                             struct snd_pcm_sw_params *params)
{
        struct snd_pcm_runtime *runtime;
        int err;

        if (PCM_RUNTIME_CHECK(substream))
                return -ENXIO;
        runtime = substream->runtime;
        scoped_guard(pcm_stream_lock_irq, substream) {
                if (runtime->state == SNDRV_PCM_STATE_OPEN)
                        return -EBADFD;
        }

        if (params->tstamp_mode < 0 ||
            params->tstamp_mode > SNDRV_PCM_TSTAMP_LAST)
                return -EINVAL;
        if (params->proto >= SNDRV_PROTOCOL_VERSION(2, 0, 12) &&
            params->tstamp_type > SNDRV_PCM_TSTAMP_TYPE_LAST)
                return -EINVAL;
        if (params->avail_min == 0)
                return -EINVAL;
        if (params->silence_size >= runtime->boundary) {
                if (params->silence_threshold != 0)
                        return -EINVAL;
        } else {
                if (params->silence_size > params->silence_threshold)
                        return -EINVAL;
                if (params->silence_threshold > runtime->buffer_size)
                        return -EINVAL;
        }
        err = 0;
        scoped_guard(pcm_stream_lock_irq, substream) {
                runtime->tstamp_mode = params->tstamp_mode;
                if (params->proto >= SNDRV_PROTOCOL_VERSION(2, 0, 12))
                        runtime->tstamp_type = params->tstamp_type;
                runtime->period_step = params->period_step;
                runtime->control->avail_min = params->avail_min;
                runtime->start_threshold = params->start_threshold;
                runtime->stop_threshold = params->stop_threshold;
                runtime->silence_threshold = params->silence_threshold;
                runtime->silence_size = params->silence_size;
                params->boundary = runtime->boundary;
                if (snd_pcm_running(substream)) {
                        if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK &&
                            runtime->silence_size > 0)
                                snd_pcm_playback_silence(substream, ULONG_MAX);
                        err = snd_pcm_update_state(substream, runtime);
                }
        }
        return err;
}

/*
 * User-space entry for sw_params: copy the parameters in, apply them and
 * copy the structure back to @_params.
 *
 * The structure is copied back whatever snd_pcm_sw_params() returned.
 * Returns -EFAULT when either copy fails, otherwise the result of
 * snd_pcm_sw_params().
 */
static int snd_pcm_sw_params_user(struct snd_pcm_substream *substream,
                                  struct snd_pcm_sw_params __user * _params)
{
        struct snd_pcm_sw_params sw;
        int ret;

        if (copy_from_user(&sw, _params, sizeof(sw)))
                return -EFAULT;

        ret = snd_pcm_sw_params(substream, &sw);

        return copy_to_user(_params, &sw, sizeof(sw)) ? -EFAULT : ret;
}

static inline snd_pcm_uframes_t
snd_pcm_calc_delay(struct snd_pcm_substream *substream)
{
        snd_pcm_uframes_t delay;

        if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
                delay = snd_pcm_playback_hw_avail(substream->runtime);
        else
                delay = snd_pcm_capture_avail(substream->runtime);
        return delay + substream->runtime->delay;
}

/*
 * Fill @status with the current status of @substream.
 *
 * @status->audio_tstamp_data is an input as well: it is unpacked into the
 * runtime's audio timestamp configuration before anything is reported.
 * The whole function runs under the stream lock.  In the OPEN state only
 * state and suspended_state are filled.  Reading the status resets the
 * runtime's avail_max and overrange counters.
 *
 * Always returns 0.
 */
int snd_pcm_status64(struct snd_pcm_substream *substream,
                     struct snd_pcm_status64 *status)
{
        struct snd_pcm_runtime *runtime = substream->runtime;

        guard(pcm_stream_lock_irq)(substream);

        snd_pcm_unpack_audio_tstamp_config(status->audio_tstamp_data,
                                        &runtime->audio_tstamp_config);

        /* backwards compatible behavior */
        if (runtime->audio_tstamp_config.type_requested ==
                SNDRV_PCM_AUDIO_TSTAMP_TYPE_COMPAT) {
                /* COMPAT maps to LINK with a wall clock, DEFAULT otherwise */
                if (runtime->hw.info & SNDRV_PCM_INFO_HAS_WALL_CLOCK)
                        runtime->audio_tstamp_config.type_requested =
                                SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK;
                else
                        runtime->audio_tstamp_config.type_requested =
                                SNDRV_PCM_AUDIO_TSTAMP_TYPE_DEFAULT;
                runtime->audio_tstamp_report.valid = 0;
        } else
                runtime->audio_tstamp_report.valid = 1;

        status->state = runtime->state;
        status->suspended_state = runtime->suspended_state;
        /* nothing else is reported for a stream that is not configured */
        if (status->state == SNDRV_PCM_STATE_OPEN)
                return 0;
        status->trigger_tstamp_sec = runtime->trigger_tstamp.tv_sec;
        status->trigger_tstamp_nsec = runtime->trigger_tstamp.tv_nsec;
        if (snd_pcm_running(substream)) {
                /* refresh the hw pointer first, then report the stored timestamps */
                snd_pcm_update_hw_ptr(substream);
                if (runtime->tstamp_mode == SNDRV_PCM_TSTAMP_ENABLE) {
                        status->tstamp_sec = runtime->status->tstamp.tv_sec;
                        status->tstamp_nsec =
                                runtime->status->tstamp.tv_nsec;
                        status->driver_tstamp_sec =
                                runtime->driver_tstamp.tv_sec;
                        status->driver_tstamp_nsec =
                                runtime->driver_tstamp.tv_nsec;
                        status->audio_tstamp_sec =
                                runtime->status->audio_tstamp.tv_sec;
                        status->audio_tstamp_nsec =
                                runtime->status->audio_tstamp.tv_nsec;
                        if (runtime->audio_tstamp_report.valid == 1)
                                /* backwards compatibility, no report provided in COMPAT mode */
                                snd_pcm_pack_audio_tstamp_report(&status->audio_tstamp_data,
                                                                &status->audio_tstamp_accuracy,
                                                                &runtime->audio_tstamp_report);

                        goto _tstamp_end;
                }
        } else {
                /* get tstamp only in fallback mode and only if enabled */
                if (runtime->tstamp_mode == SNDRV_PCM_TSTAMP_ENABLE) {
                        struct timespec64 tstamp;

                        snd_pcm_gettime(runtime, &tstamp);
                        status->tstamp_sec = tstamp.tv_sec;
                        status->tstamp_nsec = tstamp.tv_nsec;
                }
        }
 _tstamp_end:
        status->appl_ptr = runtime->control->appl_ptr;
        status->hw_ptr = runtime->status->hw_ptr;
        status->avail = snd_pcm_avail(substream);
        /* a delay is reported only while the stream is running */
        status->delay = snd_pcm_running(substream) ?
                snd_pcm_calc_delay(substream) : 0;
        status->avail_max = runtime->avail_max;
        status->overrange = runtime->overrange;
        /* both counters restart from zero after every status query */
        runtime->avail_max = 0;
        runtime->overrange = 0;
        return 0;
}

/*
 * User-space entry for the 64-bit status query.
 *
 * With @ext set, audio_tstamp_data is first read from @_status; the rest
 * of the user structure is ignored on input.  The status is then gathered
 * into a zeroed structure and copied to @_status.
 *
 * Returns 0 on success, -EFAULT on a failed user access, or the negative
 * error of snd_pcm_status64().
 */
static int snd_pcm_status_user64(struct snd_pcm_substream *substream,
                                 struct snd_pcm_status64 __user * _status,
                                 bool ext)
{
        struct snd_pcm_status64 status;
        int ret;

        memset(&status, 0, sizeof(status));

        /* with extension, the parameters are read/write */
        if (ext) {
                if (get_user(status.audio_tstamp_data,
                             (u32 __user *)(&_status->audio_tstamp_data)))
                        return -EFAULT;
        }

        ret = snd_pcm_status64(substream, &status);
        if (ret < 0)
                return ret;

        return copy_to_user(_status, &status, sizeof(status)) ? -EFAULT : 0;
}

/*
 * User-space entry for the 32-bit status query.
 *
 * The status is gathered in the 64-bit layout and then converted field by
 * field into struct snd_pcm_status32 before being copied to @_status.
 * With @ext set, audio_tstamp_data is first read from @_status; the rest
 * of the user structure is ignored on input.
 *
 * Returns 0 on success, -EFAULT on a failed user access, or the negative
 * error of snd_pcm_status64().
 */
static int snd_pcm_status_user32(struct snd_pcm_substream *substream,
                                 struct snd_pcm_status32 __user * _status,
                                 bool ext)
{
        struct snd_pcm_status64 status64;
        struct snd_pcm_status32 status32;
        int res;

        /* zero both structures so no stale stack data can reach user space */
        memset(&status64, 0, sizeof(status64));
        memset(&status32, 0, sizeof(status32));
        /*
         * with extension, parameters are read/write,
         * get audio_tstamp_data from user,
         * ignore rest of status structure
         */
        if (ext && get_user(status64.audio_tstamp_data,
                            (u32 __user *)(&_status->audio_tstamp_data)))
                return -EFAULT;
        res = snd_pcm_status64(substream, &status64);
        if (res < 0)
                return res;

        status32 = (struct snd_pcm_status32) {
                .state = status64.state,
                .trigger_tstamp_sec = status64.trigger_tstamp_sec,
                .trigger_tstamp_nsec = status64.trigger_tstamp_nsec,
                .tstamp_sec = status64.tstamp_sec,
                .tstamp_nsec = status64.tstamp_nsec,
                .appl_ptr = status64.appl_ptr,
                .hw_ptr = status64.hw_ptr,
                .delay = status64.delay,
                .avail = status64.avail,
                .avail_max = status64.avail_max,
                .overrange = status64.overrange,
                .suspended_state = status64.suspended_state,
                .audio_tstamp_data = status64.audio_tstamp_data,
                .audio_tstamp_sec = status64.audio_tstamp_sec,
                .audio_tstamp_nsec = status64.audio_tstamp_nsec,
                /* the driver timestamp has its own source fields */
                .driver_tstamp_sec = status64.driver_tstamp_sec,
                .driver_tstamp_nsec = status64.driver_tstamp_nsec,
                .audio_tstamp_accuracy = status64.audio_tstamp_accuracy,
        };

        if (copy_to_user(_status, &status32, sizeof(status32)))
                return -EFAULT;

        return 0;
}

/*
 * Query the driver for the layout of one channel.
 *
 * Only info->channel is taken from the caller; the structure is cleared
 * before it is handed to the CHANNEL_INFO ioctl.
 *
 * Returns -EBADFD in the OPEN state, -EINVAL when the channel index is
 * out of range, otherwise the result of the ioctl.
 */
static int snd_pcm_channel_info(struct snd_pcm_substream *substream,
                                struct snd_pcm_channel_info * info)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        const unsigned int channel = info->channel;

        scoped_guard(pcm_stream_lock_irq, substream) {
                if (runtime->state == SNDRV_PCM_STATE_OPEN)
                        return -EBADFD;
        }
        if (channel >= runtime->channels)
                return -EINVAL;

        /* start from a clean structure carrying only the channel index */
        memset(info, 0, sizeof(*info));
        info->channel = channel;
        return snd_pcm_ops_ioctl(substream, SNDRV_PCM_IOCTL1_CHANNEL_INFO, info);
}

/*
 * User-space entry for the channel info query: copy the request in, run
 * snd_pcm_channel_info() and copy the answer back to @_info.
 *
 * Returns 0 on success, -EFAULT on a failed user copy, or the negative
 * error of snd_pcm_channel_info().
 */
static int snd_pcm_channel_info_user(struct snd_pcm_substream *substream,
                                     struct snd_pcm_channel_info __user * _info)
{
        struct snd_pcm_channel_info chinfo;
        int ret;

        if (copy_from_user(&chinfo, _info, sizeof(chinfo)))
                return -EFAULT;

        ret = snd_pcm_channel_info(substream, &chinfo);
        if (ret < 0)
                return ret;

        return copy_to_user(_info, &chinfo, sizeof(chinfo)) ? -EFAULT : 0;
}

/*
 * Set the trigger timestamp of @substream and clear its trigger_master.
 *
 * Nothing is done when no trigger master is set.  A stream that is its
 * own master takes the current time unless the timestamp is already
 * latched; any other stream first lets its master resolve its timestamp
 * and then copies it.
 */
static void snd_pcm_trigger_tstamp(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        struct snd_pcm_substream *master = runtime->trigger_master;

        if (!master)
                return;

        if (master != substream) {
                snd_pcm_trigger_tstamp(master);
                runtime->trigger_tstamp = master->runtime->trigger_tstamp;
        } else if (!runtime->trigger_tstamp_latched) {
                snd_pcm_gettime(runtime, &runtime->trigger_tstamp);
        }
        runtime->trigger_master = NULL;
}

/*
 * Zero cast to snd_pcm_state_t.
 * NOTE(review): presumably passed as the state argument by actions that
 * do not use it - confirm against the callers.
 */
#define ACTION_ARG_IGNORE        (__force snd_pcm_state_t)0

/*
 * Callbacks that make up one stream action, as driven by
 * snd_pcm_action_single() and snd_pcm_action_group().
 */
struct action_ops {
        /* called first; a negative return aborts the action */
        int (*pre_action)(struct snd_pcm_substream *substream,
                          snd_pcm_state_t state);
        /* performs the action itself */
        int (*do_action)(struct snd_pcm_substream *substream,
                         snd_pcm_state_t state);
        /* optional (may be NULL); called when do_action did not succeed */
        void (*undo_action)(struct snd_pcm_substream *substream,
                            snd_pcm_state_t state);
        /* called after do_action succeeded */
        void (*post_action)(struct snd_pcm_substream *substream,
                            snd_pcm_state_t state);
};

/*
 *  This function is the core of the handling of linked streams: it runs
 *  the pre/do/post callbacks of @ops on every stream in the group of
 *  @substream.
 *  Note: the stream state might be changed also on failure
 *  Note2: call with calling stream lock + link lock
 *
 *  With @stream_lock set, each other stream in the group is locked through
 *  its self_group mutex or spinlock (depending on nonatomic); otherwise
 *  through its runtime's buffer_mutex.  @substream itself is never locked
 *  here.
 *
 *  Returns the last callback result: negative when a pre_action or
 *  do_action failed.
 */
static int snd_pcm_action_group(const struct action_ops *ops,
                                struct snd_pcm_substream *substream,
                                snd_pcm_state_t state,
                                bool stream_lock)
{
        struct snd_pcm_substream *s = NULL;
        struct snd_pcm_substream *s1;
        int res = 0, depth = 1;

        /* phase 1: lock every other stream (increasing nesting depth) and run pre_action */
        snd_pcm_group_for_each_entry(s, substream) {
                if (s != substream) {
                        if (!stream_lock)
                                mutex_lock_nested(&s->runtime->buffer_mutex, depth);
                        else if (s->pcm->nonatomic)
                                mutex_lock_nested(&s->self_group.mutex, depth);
                        else
                                spin_lock_nested(&s->self_group.lock, depth);
                        depth++;
                }
                res = ops->pre_action(s, state);
                /* s stays at the failing stream so only the locks taken so far are dropped */
                if (res < 0)
                        goto _unlock;
        }
        /* phase 2: run do_action; on failure undo the streams already done */
        snd_pcm_group_for_each_entry(s, substream) {
                res = ops->do_action(s, state);
                if (res < 0) {
                        if (ops->undo_action) {
                                snd_pcm_group_for_each_entry(s1, substream) {
                                        if (s1 == s) /* failed stream */
                                                break;
                                        ops->undo_action(s1, state);
                                }
                        }
                        s = NULL; /* unlock all */
                        goto _unlock;
                }
        }
        /* phase 3: everything succeeded, run post_action on every stream */
        snd_pcm_group_for_each_entry(s, substream) {
                ops->post_action(s, state);
        }
 _unlock:
        /*
         * unlock streams, stopping after s when a pre_action failed there.
         * NOTE(review): after a completed iteration s presumably matches no
         * group member, so all streams are unlocked - confirm against the
         * iterator macro.
         */
        snd_pcm_group_for_each_entry(s1, substream) {
                if (s1 != substream) {
                        if (!stream_lock)
                                mutex_unlock(&s1->runtime->buffer_mutex);
                        else if (s1->pcm->nonatomic)
                                mutex_unlock(&s1->self_group.mutex);
                        else
                                spin_unlock(&s1->self_group.lock);
                }
                if (s1 == s)        /* end */
                        break;
        }
        return res;
}

/*
 *  Run the callbacks of @ops on a single stream.
 *  Note: call with stream lock
 *
 *  Returns the negative result of pre_action when that fails, otherwise
 *  the result of do_action.  post_action runs only when do_action
 *  returned zero; in every other case undo_action runs if it is set.
 */
static int snd_pcm_action_single(const struct action_ops *ops,
                                 struct snd_pcm_substream *substream,
                                 snd_pcm_state_t state)
{
        int res = ops->pre_action(substream, state);

        if (res < 0)
                return res;

        res = ops->do_action(substream, state);
        if (res != 0) {
                if (ops->undo_action)
                        ops->undo_action(substream, state);
                return res;
        }

        ops->post_action(substream, state);
        return res;
}

/*
 * Make @new_group the group of @substream and move the substream's
 * link_list entry onto that group's list of substreams.
 */
static void snd_pcm_group_assign(struct snd_pcm_substream *substream,
                                 struct snd_pcm_group *new_group)
{
        substream->group = new_group;
        list_move(&substream->link_list, &new_group->substreams);
}

/*
 * Unref and unlock the group, but keep the stream lock;
 * when the group becomes empty and no longer referred, destroy itself
 */
static void snd_pcm_group_unref(struct snd_pcm_group *group,
                                struct snd_pcm_substream *substream)
{
        bool do_free;

        if (!group)
                return;
        do_free = refcount_dec_and_test(&group->refs);
        snd_pcm_group_unlock(group, substream->pcm->nonatomic);
        if (do_free)
                kfree(group);
}

/*
 * Lock the group inside a stream lock and reference it;
 * return the locked group object, or NULL if not linked
 *
 * The group lock is first attempted with a trylock.  When that fails, the
 * stream lock is dropped, the group lock is taken in the blocking way, and
 * the stream lock is re-taken; the whole procedure is repeated if the
 * substream was moved to another group in the meantime.
 *
 * On a non-NULL return the caller holds both the group lock and one extra
 * reference; snd_pcm_group_unref() releases both.
 */
static struct snd_pcm_group *
snd_pcm_stream_group_ref(struct snd_pcm_substream *substream)
{
        bool nonatomic = substream->pcm->nonatomic;
        struct snd_pcm_group *group;
        bool trylock;

        for (;;) {
                /* not linked (any more): nothing to lock */
                if (!snd_pcm_stream_linked(substream))
                        return NULL;
                group = substream->group;
                /* block freeing the group object */
                refcount_inc(&group->refs);

                /* the group uses a mutex for nonatomic PCMs, a spinlock otherwise */
                trylock = nonatomic ? mutex_trylock(&group->mutex) :
                        spin_trylock(&group->lock);
                if (trylock)
                        break; /* OK */

                /* re-lock for avoiding ABBA deadlock */
                snd_pcm_stream_unlock(substream);
                snd_pcm_group_lock(group, nonatomic);
                snd_pcm_stream_lock(substream);

                /* check the group again; the above opens a small race window */
                if (substream->group == group)
                        break; /* OK */
                /* group changed, try again */
                snd_pcm_group_unref(group, substream);
        }
        return group;
}

/*
 * Run an action on the substream, covering the whole group when the
 * substream is linked.
 *
 * Note: call with stream lock
 */
static int snd_pcm_action(const struct action_ops *ops,
                          struct snd_pcm_substream *substream,
                          snd_pcm_state_t state)
{
        struct snd_pcm_group *grp = snd_pcm_stream_group_ref(substream);
        int err;

        if (!grp)
                err = snd_pcm_action_single(ops, substream, state);
        else
                err = snd_pcm_action_group(ops, substream, state, true);

        /* a NULL group is ignored by the unref helper */
        snd_pcm_group_unref(grp, substream);
        return err;
}

/*
 * Take the stream lock (irq variant) by itself and run the action under it.
 *
 * Note: the caller must not hold any locks when calling this
 */
static int snd_pcm_action_lock_irq(const struct action_ops *ops,
                                   struct snd_pcm_substream *substream,
                                   snd_pcm_state_t state)
{
        int err;

        guard(pcm_stream_lock_irq)(substream);
        err = snd_pcm_action(ops, substream, state);
        return err;
}

/*
 * Run an action under snd_pcm_link_rwsem (held for reading) and the
 * runtime's buffer access lock; the group variant is invoked with its
 * stream_lock argument set to false.
 */
static int snd_pcm_action_nonatomic(const struct action_ops *ops,
                                    struct snd_pcm_substream *substream,
                                    snd_pcm_state_t state)
{
        int err;

        /* Guarantee the group members won't change during non-atomic action */
        guard(rwsem_read)(&snd_pcm_link_rwsem);

        err = snd_pcm_buffer_access_lock(substream->runtime);
        if (err < 0)
                return err;

        err = snd_pcm_stream_linked(substream) ?
                snd_pcm_action_group(ops, substream, state, false) :
                snd_pcm_action_single(ops, substream, state);

        snd_pcm_buffer_access_unlock(substream->runtime);
        return err;
}

/*
 * start callbacks
 */
/*
 * Check that the stream can be started and mark it as its own trigger master.
 * Returns -EBADFD unless PREPARED, -EPIPE for a playback stream for which
 * snd_pcm_playback_data() reports nothing to play.
 */
static int snd_pcm_pre_start(struct snd_pcm_substream *substream,
                             snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        bool is_playback = substream->stream == SNDRV_PCM_STREAM_PLAYBACK;

        if (rt->state != SNDRV_PCM_STATE_PREPARED)
                return -EBADFD;
        if (is_playback && !snd_pcm_playback_data(substream))
                return -EPIPE;

        rt->trigger_tstamp_latched = false;
        rt->trigger_master = substream;
        return 0;
}

/* Issue the START trigger, but only for the stream that is the trigger master */
static int snd_pcm_do_start(struct snd_pcm_substream *substream,
                            snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        int ret;

        if (rt->trigger_master != substream)
                return 0;

        ret = substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_START);
        if (ret == -EPIPE) {
                /* XRUN happened during the start */
                __snd_pcm_set_state(rt, SNDRV_PCM_STATE_XRUN);
        }
        return ret;
}

/* Roll back a start: send the STOP trigger for the trigger master only */
static void snd_pcm_undo_start(struct snd_pcm_substream *substream,
                               snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        if (rt->trigger_master != substream)
                return;

        substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_STOP);
        rt->stop_operating = true;
}

/*
 * Finish a start: take the trigger timestamp, initialize the jiffies-based
 * pointer bookkeeping, switch to @state and notify the PCM timer.
 */
static void snd_pcm_post_start(struct snd_pcm_substream *substream,
                               snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        bool is_playback;

        snd_pcm_trigger_tstamp(substream);

        /* buffer_size frames at the given rate, expressed in jiffies */
        rt->hw_ptr_jiffies = jiffies;
        rt->hw_ptr_buffer_jiffies = (rt->buffer_size * HZ) / rt->rate;

        __snd_pcm_set_state(rt, state);

        is_playback = substream->stream == SNDRV_PCM_STREAM_PLAYBACK;
        if (is_playback && rt->silence_size > 0)
                snd_pcm_playback_silence(substream, ULONG_MAX);

        snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MSTART);
}

/* Action table for starting a stream; all four phases are provided */
static const struct action_ops snd_pcm_action_start = {
        .pre_action = snd_pcm_pre_start,
        .do_action = snd_pcm_do_start,
        .undo_action = snd_pcm_undo_start,
        .post_action = snd_pcm_post_start
};

/**
 * snd_pcm_start - start all linked streams
 * @substream: the PCM substream instance
 *
 * Return: Zero if successful, or a negative error code.
 * The stream lock must be acquired before calling this function.
 */
int snd_pcm_start(struct snd_pcm_substream *substream)
{
        return snd_pcm_action(&snd_pcm_action_start, substream,
                              SNDRV_PCM_STATE_RUNNING);
}

/* take the stream lock and start the streams */
static int snd_pcm_start_lock_irq(struct snd_pcm_substream *substream)
{
        return snd_pcm_action_lock_irq(&snd_pcm_action_start, substream,
                                       SNDRV_PCM_STATE_RUNNING);
}

/*
 * stop callbacks
 */
/* A stream in OPEN state cannot be stopped; otherwise claim the trigger */
static int snd_pcm_pre_stop(struct snd_pcm_substream *substream,
                            snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        if (rt->state == SNDRV_PCM_STATE_OPEN)
                return -EBADFD;

        rt->trigger_master = substream;
        return 0;
}

/*
 * Send the STOP trigger when this stream is the trigger master and is
 * running.  Always returns 0, so that all substreams get stopped
 * unconditionally.
 */
static int snd_pcm_do_stop(struct snd_pcm_substream *substream,
                           snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        if (rt->trigger_master != substream || !snd_pcm_running(substream))
                return 0;

        substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_STOP);
        rt->stop_operating = true;
        return 0;
}

/*
 * Finish a stop: move to @state (with timestamp and timer notification)
 * unless already there, then wake up the sleepers on both wait queues.
 */
static void snd_pcm_post_stop(struct snd_pcm_substream *substream,
                              snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        bool state_changes = rt->state != state;

        if (state_changes) {
                snd_pcm_trigger_tstamp(substream);
                __snd_pcm_set_state(rt, state);
                snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MSTOP);
        }

        /* waiters are woken even when the state stayed the same */
        wake_up(&rt->sleep);
        wake_up(&rt->tsleep);
}

/*
 * Action table for stopping a stream.  No undo_action is set;
 * snd_pcm_do_stop() always returns 0.
 */
static const struct action_ops snd_pcm_action_stop = {
        .pre_action = snd_pcm_pre_stop,
        .do_action = snd_pcm_do_stop,
        .post_action = snd_pcm_post_stop
};

/**
 * snd_pcm_stop - try to stop all running streams in the substream group
 * @substream: the PCM substream instance
 * @state: PCM state after stopping the stream
 *
 * The state of each stream is then changed to the given state unconditionally.
 *
 * Return: Zero if successful, or a negative error code.
 */
int snd_pcm_stop(struct snd_pcm_substream *substream, snd_pcm_state_t state)
{
        const struct action_ops *stop_ops = &snd_pcm_action_stop;

        /* covers the whole group when @substream is linked */
        return snd_pcm_action(stop_ops, substream, state);
}
EXPORT_SYMBOL(snd_pcm_stop);

/**
 * snd_pcm_drain_done - stop the DMA only when the given stream is playback
 * @substream: the PCM substream
 *
 * After stopping, the state is changed to SETUP.
 * Unlike snd_pcm_stop(), this affects only the given stream.
 *
 * Return: Zero if successful, or a negative error code.
 */
int snd_pcm_drain_done(struct snd_pcm_substream *substream)
{
        return snd_pcm_action_single(&snd_pcm_action_stop, substream,
                                     SNDRV_PCM_STATE_SETUP);
}

/**
 * snd_pcm_stop_xrun - stop the running streams as XRUN
 * @substream: the PCM substream instance
 *
 * This stops the given running substream (and all linked substreams) as XRUN.
 * Unlike snd_pcm_stop(), this function takes the substream lock by itself.
 *
 * Return: Zero if successful, or a negative error code.
 */
int snd_pcm_stop_xrun(struct snd_pcm_substream *substream)
{
        guard(pcm_stream_lock_irqsave)(substream);

        /* nothing to do without a runtime or when the stream isn't running */
        if (!substream->runtime)
                return 0;
        if (!snd_pcm_running(substream))
                return 0;

        __snd_pcm_xrun(substream);
        return 0;
}
EXPORT_SYMBOL_GPL(snd_pcm_stop_xrun);

/*
 * pause callbacks: pass boolean (to start pause or resume) as state argument
 */
/*
 * Interpret the action's state argument as the "push pause" boolean.
 * The whole expansion is parenthesized so that the cast cannot be split
 * apart by the surrounding expression (e.g. under sizeof or a postfix
 * operator).
 */
#define pause_pushed(state)        ((__force bool)(state))

/*
 * Validate a pause push/release request: the hardware must advertise
 * SNDRV_PCM_INFO_PAUSE (-ENOSYS otherwise), and the stream must be RUNNING
 * for a push or PAUSED for a release (-EBADFD otherwise).
 */
static int snd_pcm_pre_pause(struct snd_pcm_substream *substream,
                             snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        snd_pcm_state_t required;

        if (!(rt->info & SNDRV_PCM_INFO_PAUSE))
                return -ENOSYS;

        required = pause_pushed(state) ? SNDRV_PCM_STATE_RUNNING :
                                         SNDRV_PCM_STATE_PAUSED;
        if (rt->state != required)
                return -EBADFD;

        rt->trigger_master = substream;
        return 0;
}

/* Send PAUSE_PUSH or PAUSE_RELEASE to the driver for the trigger master */
static int snd_pcm_do_pause(struct snd_pcm_substream *substream,
                            snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        int cmd;

        if (rt->trigger_master != substream)
                return 0;

        /*
         * snd_pcm_update_hw_ptr*() checks the delta against the current
         * jiffies; moving the reference far into the past yields a delta
         * large enough to effectively skip that check once.
         */
        rt->hw_ptr_jiffies = jiffies - HZ * 1000;

        cmd = pause_pushed(state) ? SNDRV_PCM_TRIGGER_PAUSE_PUSH :
                                    SNDRV_PCM_TRIGGER_PAUSE_RELEASE;
        return substream->ops->trigger(substream, cmd);
}

/* Roll back a pause request by sending the opposite pause trigger */
static void snd_pcm_undo_pause(struct snd_pcm_substream *substream,
                               snd_pcm_state_t state)
{
        int revert_cmd;

        if (substream->runtime->trigger_master != substream)
                return;

        revert_cmd = pause_pushed(state) ? SNDRV_PCM_TRIGGER_PAUSE_RELEASE :
                                           SNDRV_PCM_TRIGGER_PAUSE_PUSH;
        substream->ops->trigger(substream, revert_cmd);
}

/*
 * Finish a pause push/release: timestamp, new state and timer notification;
 * a push additionally wakes up the sleepers on both wait queues.
 */
static void snd_pcm_post_pause(struct snd_pcm_substream *substream,
                               snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        snd_pcm_trigger_tstamp(substream);

        if (!pause_pushed(state)) {
                /* pause released: the stream continues */
                __snd_pcm_set_state(rt, SNDRV_PCM_STATE_RUNNING);
                snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MCONTINUE);
                return;
        }

        __snd_pcm_set_state(rt, SNDRV_PCM_STATE_PAUSED);
        snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MPAUSE);
        wake_up(&rt->sleep);
        wake_up(&rt->tsleep);
}

/*
 * Action table for pause push/release; the state argument carries the
 * push flag (see pause_pushed()).
 */
static const struct action_ops snd_pcm_action_pause = {
        .pre_action = snd_pcm_pre_pause,
        .do_action = snd_pcm_do_pause,
        .undo_action = snd_pcm_undo_pause,
        .post_action = snd_pcm_post_pause
};

/*
 * Push (@push true) or release (@push false) the pause for the stream and
 * all streams linked to it.  The flag is passed as the action's state
 * argument.
 */
static int snd_pcm_pause(struct snd_pcm_substream *substream, bool push)
{
        snd_pcm_state_t push_arg = (__force snd_pcm_state_t)push;

        return snd_pcm_action(&snd_pcm_action_pause, substream, push_arg);
}

/* Pause push/release variant that acquires the stream lock (irq) by itself */
static int snd_pcm_pause_lock_irq(struct snd_pcm_substream *substream,
                                  bool push)
{
        snd_pcm_state_t push_arg = (__force snd_pcm_state_t)push;

        return snd_pcm_action_lock_irq(&snd_pcm_action_pause, substream,
                                       push_arg);
}

#ifdef CONFIG_PM
/* suspend callback: state argument ignored */

/*
 * Decide whether the stream takes part in the suspend.  -EBUSY is returned
 * (i.e. suspend is skipped) when it is already suspended or in an
 * unresumable state.
 */
static int snd_pcm_pre_suspend(struct snd_pcm_substream *substream,
                               snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        switch (rt->state) {
        case SNDRV_PCM_STATE_SUSPENDED:         /* already suspended */
        case SNDRV_PCM_STATE_OPEN:              /* unresumable PCM states */
        case SNDRV_PCM_STATE_SETUP:
        case SNDRV_PCM_STATE_DISCONNECTED:
                return -EBUSY;
        default:
                break;
        }

        rt->trigger_master = substream;
        return 0;
}

/*
 * Send the SUSPEND trigger when this stream is the trigger master and is
 * running.  Always returns 0: the suspend is unconditional.
 */
static int snd_pcm_do_suspend(struct snd_pcm_substream *substream,
                              snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        if (rt->trigger_master != substream || !snd_pcm_running(substream))
                return 0;

        substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_SUSPEND);
        rt->stop_operating = true;
        return 0;
}

/*
 * Finish a suspend: remember the state the stream was in (both in the
 * runtime and in the status record), switch to SUSPENDED, notify the PCM
 * timer and wake up the sleepers.
 */
static void snd_pcm_post_suspend(struct snd_pcm_substream *substream,
                                 snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        snd_pcm_state_t before;

        snd_pcm_trigger_tstamp(substream);

        before = rt->state;
        rt->suspended_state = before;
        rt->status->suspended_state = before;

        __snd_pcm_set_state(rt, SNDRV_PCM_STATE_SUSPENDED);
        snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MSUSPEND);

        wake_up(&rt->sleep);
        wake_up(&rt->tsleep);
}

/*
 * Action table for suspending a stream.  No undo_action is set;
 * snd_pcm_do_suspend() always returns 0.
 */
static const struct action_ops snd_pcm_action_suspend = {
        .pre_action = snd_pcm_pre_suspend,
        .do_action = snd_pcm_do_suspend,
        .post_action = snd_pcm_post_suspend
};

/*
 * snd_pcm_suspend - trigger SUSPEND to all linked streams
 * @substream: the PCM substream
 *
 * After this call, all streams are changed to SUSPENDED state.
 *
 * Return: Zero if successful, or a negative error code.
 */
static int snd_pcm_suspend(struct snd_pcm_substream *substream)
{
        int err;

        /* the irqsave stream lock is held for the whole action */
        guard(pcm_stream_lock_irqsave)(substream);
        err = snd_pcm_action(&snd_pcm_action_suspend, substream,
                             ACTION_ARG_IGNORE);
        return err;
}

/**
 * snd_pcm_suspend_all - trigger SUSPEND to all substreams in the given pcm
 * @pcm: the PCM instance
 *
 * After this call, all streams are changed to SUSPENDED state.
 *
 * Return: Zero if successful (or @pcm is %NULL), or a negative error code.
 */
int snd_pcm_suspend_all(struct snd_pcm *pcm)
{
        struct snd_pcm_substream *substream;
        int stream, err = 0;

        if (! pcm)
                return 0;

        for_each_pcm_substream(pcm, stream, substream) {
                /* FIXME: the open/close code should lock this as well */
                if (!substream->runtime)
                        continue;

                /*
                 * Skip BE dai link PCM's that are internal and may
                 * not have their substream ops set.
                 */
                if (!substream->ops)
                        continue;

                err = snd_pcm_suspend(substream);
                if (err < 0 && err != -EBUSY)
                        return err;
        }

        for_each_pcm_substream(pcm, stream, substream)
                snd_pcm_sync_stop(substream, false);

        return 0;
}
EXPORT_SYMBOL(snd_pcm_suspend_all);

/* resume callbacks: state argument ignored */

/* Resume requires SNDRV_PCM_INFO_RESUME in the runtime info (-ENOSYS if not) */
static int snd_pcm_pre_resume(struct snd_pcm_substream *substream,
                              snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        bool can_resume = rt->info & SNDRV_PCM_INFO_RESUME;

        if (!can_resume)
                return -ENOSYS;

        rt->trigger_master = substream;
        return 0;
}

/*
 * Send the RESUME trigger for the trigger master, but only when the stream
 * was RUNNING at suspend time, or was a playback stream in DRAINING state.
 */
static int snd_pcm_do_resume(struct snd_pcm_substream *substream,
                             snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        bool was_running, was_draining_playback;

        if (rt->trigger_master != substream)
                return 0;

        was_running = rt->suspended_state == SNDRV_PCM_STATE_RUNNING;
        was_draining_playback =
                rt->suspended_state == SNDRV_PCM_STATE_DRAINING &&
                substream->stream == SNDRV_PCM_STREAM_PLAYBACK;

        /* DMA not running previously? */
        if (!was_running && !was_draining_playback)
                return 0;

        return substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_RESUME);
}

/* Roll back a resume: re-send SUSPEND for a running trigger master */
static void snd_pcm_undo_resume(struct snd_pcm_substream *substream,
                                snd_pcm_state_t state)
{
        if (substream->runtime->trigger_master != substream)
                return;
        if (!snd_pcm_running(substream))
                return;

        substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_SUSPEND);
}

/* Finish a resume: go back to the state that was saved at suspend time */
static void snd_pcm_post_resume(struct snd_pcm_substream *substream,
                                snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        snd_pcm_trigger_tstamp(substream);
        __snd_pcm_set_state(rt, rt->suspended_state);
        snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MRESUME);
}

/* Action table for resuming a stream; all four phases are provided */
static const struct action_ops snd_pcm_action_resume = {
        .pre_action = snd_pcm_pre_resume,
        .do_action = snd_pcm_do_resume,
        .undo_action = snd_pcm_undo_resume,
        .post_action = snd_pcm_post_resume
};

/* Resume the stream(s); takes the stream lock (irq) by itself */
static int snd_pcm_resume(struct snd_pcm_substream *substream)
{
        const struct action_ops *resume_ops = &snd_pcm_action_resume;

        return snd_pcm_action_lock_irq(resume_ops, substream,
                                       ACTION_ARG_IGNORE);
}

#else

/* Stub used when CONFIG_PM is not set: resume is not supported */
static int snd_pcm_resume(struct snd_pcm_substream *substream)
{
        return -ENOSYS;
}

#endif /* CONFIG_PM */

/*
 * xrun ioctl
 *
 * Change the RUNNING stream(s) to XRUN state.
 */
static int snd_pcm_xrun(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        guard(pcm_stream_lock_irq)(substream);

        if (rt->state == SNDRV_PCM_STATE_XRUN)
                return 0;        /* already there */
        if (rt->state != SNDRV_PCM_STATE_RUNNING)
                return -EBADFD;  /* only a running stream can be put to XRUN */

        __snd_pcm_xrun(substream);
        return 0;
}

/*
 * reset ioctl
 */
/* reset callbacks:  state argument ignored */
/* Reset is allowed in RUNNING, PREPARED, PAUSED and SUSPENDED states only */
static int snd_pcm_pre_reset(struct snd_pcm_substream *substream,
                             snd_pcm_state_t state)
{
        snd_pcm_state_t cur = substream->runtime->state;
        bool resettable = cur == SNDRV_PCM_STATE_RUNNING ||
                          cur == SNDRV_PCM_STATE_PREPARED ||
                          cur == SNDRV_PCM_STATE_PAUSED ||
                          cur == SNDRV_PCM_STATE_SUSPENDED;

        return resettable ? 0 : -EBADFD;
}

/*
 * Issue the RESET ioctl op and, under the stream lock, re-initialize the
 * pointer bookkeeping derived from the current hw_ptr.
 */
static int snd_pcm_do_reset(struct snd_pcm_substream *substream,
                            snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        snd_pcm_uframes_t hw_pos;
        int ret;

        ret = snd_pcm_ops_ioctl(substream, SNDRV_PCM_IOCTL1_RESET, NULL);
        if (ret < 0)
                return ret;

        guard(pcm_stream_lock_irq)(substream);
        hw_pos = rt->status->hw_ptr;
        rt->hw_ptr_base = 0;
        /* hw_ptr rounded down to a multiple of the period size */
        rt->hw_ptr_interrupt = hw_pos - hw_pos % rt->period_size;
        rt->silence_start = hw_pos;
        rt->silence_filled = 0;
        return 0;
}

/*
 * Finish a reset under the stream lock: appl_ptr is set to the current
 * hw_ptr, and silence filling is run for playback streams with a non-zero
 * silence_size.
 */
static void snd_pcm_post_reset(struct snd_pcm_substream *substream,
                               snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        bool is_playback;

        guard(pcm_stream_lock_irq)(substream);
        rt->control->appl_ptr = rt->status->hw_ptr;

        is_playback = substream->stream == SNDRV_PCM_STREAM_PLAYBACK;
        if (is_playback && rt->silence_size > 0)
                snd_pcm_playback_silence(substream, ULONG_MAX);
}

/* Action table for the reset ioctl; no undo_action is provided */
static const struct action_ops snd_pcm_action_reset = {
        .pre_action = snd_pcm_pre_reset,
        .do_action = snd_pcm_do_reset,
        .post_action = snd_pcm_post_reset
};

/* Reset the stream(s) via the non-atomic action path */
static int snd_pcm_reset(struct snd_pcm_substream *substream)
{
        const struct action_ops *reset_ops = &snd_pcm_action_reset;

        return snd_pcm_action_nonatomic(reset_ops, substream,
                                        ACTION_ARG_IGNORE);
}

/*
 * prepare ioctl
 */
/* pass f_flags as state argument */
/*
 * Check that the stream can be prepared and store the f_flags that were
 * passed through the state argument.  Returns -EBADFD in OPEN or
 * DISCONNECTED state and -EBUSY while the stream is running.
 */
static int snd_pcm_pre_prepare(struct snd_pcm_substream *substream,
                               snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        switch (rt->state) {
        case SNDRV_PCM_STATE_OPEN:
        case SNDRV_PCM_STATE_DISCONNECTED:
                return -EBADFD;
        default:
                break;
        }

        if (snd_pcm_running(substream))
                return -EBUSY;

        substream->f_flags = (__force int)state;
        return 0;
}

/*
 * Sync a pending stop, call the driver's prepare op and, on success,
 * continue with the reset step.
 */
static int snd_pcm_do_prepare(struct snd_pcm_substream *substream,
                              snd_pcm_state_t state)
{
        int ret;

        snd_pcm_sync_stop(substream, true);

        ret = substream->ops->prepare(substream);
        return ret < 0 ? ret : snd_pcm_do_reset(substream, state);
}

/* Finish a prepare: appl_ptr is set to hw_ptr and the stream becomes PREPARED */
static void snd_pcm_post_prepare(struct snd_pcm_substream *substream,
                                 snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        snd_pcm_uframes_t hw_pos = rt->status->hw_ptr;

        rt->control->appl_ptr = hw_pos;
        snd_pcm_set_state(substream, SNDRV_PCM_STATE_PREPARED);
}

/*
 * Action table for the prepare ioctl; the state argument carries f_flags.
 * No undo_action is provided.
 */
static const struct action_ops snd_pcm_action_prepare = {
        .pre_action = snd_pcm_pre_prepare,
        .do_action = snd_pcm_do_prepare,
        .post_action = snd_pcm_post_prepare
};

/**
 * snd_pcm_prepare - prepare the PCM substream to be triggerable
 * @substream: the PCM substream instance
 * @file: file to refer f_flags
 *
 * Return: Zero if successful, or a negative error code.
 */
static int snd_pcm_prepare(struct snd_pcm_substream *substream,
                           struct file *file)
{
        /* without a file, fall back to the flags stored in the substream */
        int f_flags = file ? file->f_flags : substream->f_flags;

        scoped_guard(pcm_stream_lock_irq, substream) {
                snd_pcm_state_t cur = substream->runtime->state;

                /* a paused stream gets its pause released before the stop */
                if (cur == SNDRV_PCM_STATE_PAUSED)
                        snd_pcm_pause(substream, false);
                if (cur == SNDRV_PCM_STATE_PAUSED ||
                    cur == SNDRV_PCM_STATE_SUSPENDED)
                        snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP);
        }

        /* f_flags travels to the callbacks through the state argument */
        return snd_pcm_action_nonatomic(&snd_pcm_action_prepare, substream,
                                        (__force snd_pcm_state_t)f_flags);
}

/*
 * drain ioctl
 */

/* drain init callbacks: state argument ignored */
/* Draining is refused (-EBADFD) in OPEN, DISCONNECTED and SUSPENDED states */
static int snd_pcm_pre_drain_init(struct snd_pcm_substream *substream,
                                  snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;
        snd_pcm_state_t cur = rt->state;

        if (cur == SNDRV_PCM_STATE_OPEN ||
            cur == SNDRV_PCM_STATE_DISCONNECTED ||
            cur == SNDRV_PCM_STATE_SUSPENDED)
                return -EBADFD;

        rt->trigger_master = substream;
        return 0;
}

/*
 * Move the stream towards draining.
 *
 * Playback: a PREPARED stream is started in DRAINING state when it has
 * data, otherwise it goes to SETUP; RUNNING becomes DRAINING; XRUN becomes
 * SETUP; other states are left alone.
 * Capture: a RUNNING stream is stopped, ending in DRAINING when captured
 * data is still available, otherwise in SETUP.
 *
 * Finally the DRAIN trigger is sent when the stream ended up in DRAINING
 * state, is the trigger master and the hardware sets
 * SNDRV_PCM_INFO_DRAIN_TRIGGER.
 */
static int snd_pcm_do_drain_init(struct snd_pcm_substream *substream,
                                 snd_pcm_state_t state)
{
        struct snd_pcm_runtime *rt = substream->runtime;

        if (substream->stream != SNDRV_PCM_STREAM_PLAYBACK) {
                /* stop running stream */
                if (rt->state == SNDRV_PCM_STATE_RUNNING) {
                        snd_pcm_state_t next = SNDRV_PCM_STATE_DRAINING;

                        if (snd_pcm_capture_avail(rt) <= 0)
                                next = SNDRV_PCM_STATE_SETUP;
                        snd_pcm_do_stop(substream, next);
                        snd_pcm_post_stop(substream, next);
                }
        } else if (rt->state == SNDRV_PCM_STATE_PREPARED) {
                if (snd_pcm_playback_empty(substream)) {
                        __snd_pcm_set_state(rt, SNDRV_PCM_STATE_SETUP);
                } else {
                        /* start playback stream if possible */
                        snd_pcm_do_start(substream, SNDRV_PCM_STATE_DRAINING);
                        snd_pcm_post_start(substream, SNDRV_PCM_STATE_DRAINING);
                }
        } else if (rt->state == SNDRV_PCM_STATE_RUNNING) {
                __snd_pcm_set_state(rt, SNDRV_PCM_STATE_DRAINING);
        } else if (rt->state == SNDRV_PCM_STATE_XRUN) {
                __snd_pcm_set_state(rt, SNDRV_PCM_STATE_SETUP);
        }

        /* the state is evaluated again: it may have been changed above */
        if (rt->state != SNDRV_PCM_STATE_DRAINING)
                return 0;
        if (rt->trigger_master != substream)
                return 0;
        if (!(rt->hw.info & SNDRV_PCM_INFO_DRAIN_TRIGGER))
                return 0;

        return substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_DRAIN);
}

/*
 * Nothing to do after the drain initialization; this empty callback only
 * fills the post_action slot of the drain-init action table.
 */
static void snd_pcm_post_drain_init(struct snd_pcm_substream *substream,
                                    snd_pcm_state_t state)
{
}

/*
 * Action table for the first step of a drain; no undo_action is provided
 * and the post_action is an empty function.
 */
static const struct action_ops snd_pcm_action_drain_init = {
        .pre_action = snd_pcm_pre_drain_init,
        .do_action = snd_pcm_do_drain_init,
        .post_action = snd_pcm_post_drain_init
};

/*
 * Drain the stream(s).
 * When the substream is linked, sync until the draining of all playback streams
 * is finished.
 * After this call, all streams are supposed to be either SETUP or DRAINING
 * (capture only) state.
 */
/*
 * @substream: the substream to drain (together with its linked streams)
 * @file: file whose f_flags decide about O_NONBLOCK; when NULL, the flags
 *        stored in the substream are used instead
 *
 * Return: 0 when no playback stream is left in DRAINING state, -EBADFD in
 * OPEN state, -EAGAIN in non-blocking mode, -ERESTARTSYS on a pending
 * signal, -ENODEV on card shutdown, -ESTRPIPE or -EIO on a wait timeout,
 * or the error from the drain-init action.
 */
static int snd_pcm_drain(struct snd_pcm_substream *substream,
                         struct file *file)
{
        struct snd_card *card;
        struct snd_pcm_runtime *runtime;
        struct snd_pcm_substream *s;
        struct snd_pcm_group *group;
        wait_queue_entry_t wait;
        int result = 0;
        int nonblock = 0;

        card = substream->pcm->card;
        runtime = substream->runtime;

        if (runtime->state == SNDRV_PCM_STATE_OPEN)
                return -EBADFD;

        /* O_NONBLOCK is taken from the file, or from the substream if no file */
        if (file) {
                if (file->f_flags & O_NONBLOCK)
                        nonblock = 1;
        } else if (substream->f_flags & O_NONBLOCK)
                nonblock = 1;

        snd_pcm_stream_lock_irq(substream);
        /* resume pause */
        if (runtime->state == SNDRV_PCM_STATE_PAUSED)
                snd_pcm_pause(substream, false);

        /* pre-start/stop - all running streams are changed to DRAINING state */
        result = snd_pcm_action(&snd_pcm_action_drain_init, substream,
                                ACTION_ARG_IGNORE);
        if (result < 0)
                goto unlock;
        /* in non-blocking, we don't wait in ioctl but let caller poll */
        if (nonblock) {
                result = -EAGAIN;
                goto unlock;
        }

        /* each iteration waits on one playback stream that is still DRAINING */
        for (;;) {
                long tout;
                struct snd_pcm_runtime *to_check;
                if (signal_pending(current)) {
                        result = -ERESTARTSYS;
                        break;
                }
                /* find a substream to drain */
                to_check = NULL;
                group = snd_pcm_stream_group_ref(substream);
                snd_pcm_group_for_each_entry(s, substream) {
                        if (s->stream != SNDRV_PCM_STREAM_PLAYBACK)
                                continue;
                        /* note: "runtime" now refers to the inspected stream */
                        runtime = s->runtime;
                        if (runtime->state == SNDRV_PCM_STATE_DRAINING) {
                                to_check = runtime;
                                break;
                        }
                }
                snd_pcm_group_unref(group, substream);
                if (!to_check)
                        break; /* all drained */
                /* queue ourselves on that stream's wait queue before unlocking */
                init_waitqueue_entry(&wait, current);
                set_current_state(TASK_INTERRUPTIBLE);
                add_wait_queue(&to_check->sleep, &wait);
                snd_pcm_stream_unlock_irq(substream);
                if (runtime->no_period_wakeup)
                        tout = MAX_SCHEDULE_TIMEOUT;
                else {
                        /* in ms: at least 100, or 1.1 times the buffer length */
                        tout = 100;
                        if (runtime->rate) {
                                long t = runtime->buffer_size * 1100 / runtime->rate;
                                tout = max(t, tout);
                        }
                        tout = msecs_to_jiffies(tout);
                }
                tout = schedule_timeout(tout);

                snd_pcm_stream_lock_irq(substream);
                /*
                 * Dequeue only when the waited runtime still belongs to a
                 * stream of the group.
                 * NOTE(review): presumably this guards against the runtime
                 * having gone away while the lock was dropped - confirm.
                 */
                group = snd_pcm_stream_group_ref(substream);
                snd_pcm_group_for_each_entry(s, substream) {
                        if (s->runtime == to_check) {
                                remove_wait_queue(&to_check->sleep, &wait);
                                break;
                        }
                }
                snd_pcm_group_unref(group, substream);

                if (card->shutdown) {
                        result = -ENODEV;
                        break;
                }
                /* schedule_timeout() returned 0: the wait timed out */
                if (tout == 0) {
                        if (substream->runtime->state == SNDRV_PCM_STATE_SUSPENDED)
                                result = -ESTRPIPE;
                        else {
                                dev_dbg(substream->pcm->card->dev,
                                        "playback drain timeout (DMA or IRQ trouble?)\n");
                                snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP);
                                result = -EIO;
                        }
                        break;
                }
        }

 unlock:
        snd_pcm_stream_unlock_irq(substream);

        return result;
}

/*
 * drop ioctl
 *
 * Immediately put all linked substreams into SETUP state.
 */
static int snd_pcm_drop(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *rt;

        if (PCM_RUNTIME_CHECK(substream))
                return -ENXIO;

        rt = substream->runtime;
        if (rt->state == SNDRV_PCM_STATE_OPEN ||
            rt->state == SNDRV_PCM_STATE_DISCONNECTED)
                return -EBADFD;

        guard(pcm_stream_lock_irq)(substream);

        /* a paused stream gets its pause released before being stopped */
        if (rt->state == SNDRV_PCM_STATE_PAUSED)
                snd_pcm_pause(substream, false);

        /* the result of the stop is not propagated: drop reports success */
        snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP);
        return 0;
}


static bool is_pcm_file(struct file *file)
{
        struct inode *inode = file_inode(file);
        struct snd_pcm *pcm;
        unsigned int minor;

        if (!S_ISCHR(inode->i_mode) || imajor(inode) != snd_major)
                return false;
        minor = iminor(inode);
        pcm = snd_lookup_minor_data(minor, SNDRV_DEVICE_TYPE_PCM_PLAYBACK);
        if (!pcm)
                pcm = snd_lookup_minor_data(minor, SNDRV_DEVICE_TYPE_PCM_CAPTURE);
        if (!pcm)
                return false;
        snd_card_unref(pcm->card);
        return true;
}

/*
 * PCM link handling
 */

/*
 * Link @substream with the PCM substream behind file descriptor @fd.
 *
 * If @substream is not linked yet, a freshly allocated group is assigned
 * to it; the other substream is then added to @substream's group and the
 * group reference count is bumped for the new member.
 *
 * Returns 0 on success, or:
 *   -EBADFD   @fd is not an open PCM file, @substream is in OPEN state,
 *             or the two substreams differ in state or nonatomic mode
 *   -EINVAL   @fd refers to @substream itself
 *   -ENOMEM   the group allocation failed
 *   -EALREADY the other substream is already linked
 */
static int snd_pcm_link(struct snd_pcm_substream *substream, int fd)
{
        struct snd_pcm_file *pcm_file;
        struct snd_pcm_substream *substream1;
        /* released via kfree() on every exit path unless reset to NULL */
        struct snd_pcm_group *group __free(kfree) = NULL;
        struct snd_pcm_group *target_group;
        bool nonatomic = substream->pcm->nonatomic;
        /* scope-bound handle for the file behind @fd */
        CLASS(fd, f)(fd);

        if (!f.file)
                return -EBADFD;
        if (!is_pcm_file(f.file))
                return -EBADFD;

        pcm_file = f.file->private_data;
        substream1 = pcm_file->substream;

        if (substream == substream1)
                return -EINVAL;

        /* allocate up front, before any of the locks below are taken */
        group = kzalloc(sizeof(*group), GFP_KERNEL);
        if (!group)
                return -ENOMEM;
        snd_pcm_group_init(group);

        /* held until the function returns */
        guard(rwsem_write)(&snd_pcm_link_rwsem);
        if (substream->runtime->state == SNDRV_PCM_STATE_OPEN ||
            substream->runtime->state != substream1->runtime->state ||
            substream->pcm->nonatomic != substream1->pcm->nonatomic)
                return -EBADFD;
        if (snd_pcm_stream_linked(substream1))
                return -EALREADY;

        scoped_guard(pcm_stream_lock_irq, substream) {
                if (!snd_pcm_stream_linked(substream)) {
                        snd_pcm_group_assign(substream, group);
                        group = NULL; /* assigned, don't free this one below */
                }
                target_group = substream->group;
        }

        /* add substream1 under the group lock and its own (nested) stream lock */
        snd_pcm_group_lock_irq(target_group, nonatomic);
        snd_pcm_stream_lock_nested(substream1);
        snd_pcm_group_assign(substream1, target_group);
        refcount_inc(&target_group->refs);
        snd_pcm_stream_unlock(substream1);
        snd_pcm_group_unlock_irq(target_group, nonatomic);
        return 0;
}

/* put @substream back into its own self_group, under its stream lock */
static void relink_to_local(struct snd_pcm_substream *substream)
{
        struct snd_pcm_group *own_group = &substream->self_group;

        snd_pcm_stream_lock_nested(substream);
        snd_pcm_group_assign(substream, own_group);
        snd_pcm_stream_unlock(substream);
}

/*
 * Remove @substream from the group it is linked into.
 *
 * The substream is moved back to its self_group and its reference on the
 * shared group is dropped.  When exactly one other member remains
 * afterwards, that member is detached as well and the group is freed if
 * this drops the last reference.
 *
 * Returns 0 on success or -EALREADY when @substream is not linked.
 */
static int snd_pcm_unlink(struct snd_pcm_substream *substream)
{
        struct snd_pcm_group *group;
        bool nonatomic = substream->pcm->nonatomic;
        bool do_free = false;

        /* held until the function returns */
        guard(rwsem_write)(&snd_pcm_link_rwsem);

        if (!snd_pcm_stream_linked(substream))
                return -EALREADY;

        group = substream->group;
        snd_pcm_group_lock_irq(group, nonatomic);

        relink_to_local(substream);
        refcount_dec(&group->refs);

        /* detach the last stream, too */
        if (list_is_singular(&group->substreams)) {
                relink_to_local(list_first_entry(&group->substreams,
                                                 struct snd_pcm_substream,
                                                 link_list));
                do_free = refcount_dec_and_test(&group->refs);
        }

        /* the group is freed only after its lock has been released */
        snd_pcm_group_unlock_irq(group, nonatomic);
        if (do_free)
                kfree(group);
        return 0;
}

/*
 * hw configurator
 */
static int snd_pcm_hw_rule_mul(struct snd_pcm_hw_params *params,
                               struct snd_pcm_hw_rule *rule)
{
        struct snd_interval t;
        snd_interval_mul(hw_param_interval_c(params, rule->deps[0]),
                     hw_param_interval_c(params, rule->deps[1]), &t);
        return snd_interval_refine(hw_param_interval(params, rule->var), &t);
}

static int snd_pcm_hw_rule_div(struct snd_pcm_hw_params *params,
                               struct snd_pcm_hw_rule *rule)
{
        struct snd_interval t;
        snd_interval_div(hw_param_interval_c(params, rule->deps[0]),
                     hw_param_interval_c(params, rule->deps[1]), &t);
        return snd_interval_refine(hw_param_interval(params, rule->var), &t);
}

static int snd_pcm_hw_rule_muldivk(struct snd_pcm_hw_params *params,
                                   struct snd_pcm_hw_rule *rule)
{
        struct snd_interval t;
        snd_interval_muldivk(hw_param_interval_c(params, rule->deps[0]),
                         hw_param_interval_c(params, rule->deps[1]),
                         (unsigned long) rule->private, &t);
        return snd_interval_refine(hw_param_interval(params, rule->var), &t);
}

static int snd_pcm_hw_rule_mulkdiv(struct snd_pcm_hw_params *params,
                                   struct snd_pcm_hw_rule *rule)
{
        struct snd_interval t;
        snd_interval_mulkdiv(hw_param_interval_c(params, rule->deps[0]),
                         (unsigned long) rule->private,
                         hw_param_interval_c(params, rule->deps[1]), &t);
        return snd_interval_refine(hw_param_interval(params, rule->var), &t);
}

static int snd_pcm_hw_rule_format(struct snd_pcm_hw_params *params,
                                  struct snd_pcm_hw_rule *rule)
{
        snd_pcm_format_t k;
        const struct snd_interval *i =
                                hw_param_interval_c(params, rule->deps[0]);
        struct snd_mask m;
        struct snd_mask *mask = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT);
        snd_mask_any(&m);
        pcm_for_each_format(k) {
                int bits;
                if (!snd_mask_test_format(mask, k))
                        continue;
                bits = snd_pcm_format_physical_width(k);
                if (bits <= 0)
                        continue; /* ignore invalid formats */
                if ((unsigned)bits < i->min || (unsigned)bits > i->max)
                        snd_mask_reset(&m, (__force unsigned)k);
        }
        return snd_mask_refine(mask, &m);
}

static int snd_pcm_hw_rule_sample_bits(struct snd_pcm_hw_params *params,
                                       struct snd_pcm_hw_rule *rule)
{
        struct snd_interval t;
        snd_pcm_format_t k;

        t.min = UINT_MAX;
        t.max = 0;
        t.openmin = 0;
        t.openmax = 0;
        pcm_for_each_format(k) {
                int bits;
                if (!snd_mask_test_format(hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT), k))
                        continue;
                bits = snd_pcm_format_physical_width(k);
                if (bits <= 0)
                        continue; /* ignore invalid formats */
                if (t.min > (unsigned)bits)
                        t.min = bits;
                if (t.max < (unsigned)bits)
                        t.max = bits;
        }
        t.integer = 1;
        return snd_interval_refine(hw_param_interval(params, rule->var), &t);
}

/*
 * Build-time check for the table below: the 5512 Hz entry sits at
 * index 0 and the 192000 Hz entry at index 12, which is what the bit
 * positions of SNDRV_PCM_RATE_5512 and SNDRV_PCM_RATE_192000 are tested
 * against here.  snd_pcm_hw_rule_rate() passes a rate bit mask together
 * with this list to snd_interval_list().
 */
#if SNDRV_PCM_RATE_5512 != 1 << 0 || SNDRV_PCM_RATE_192000 != 1 << 12
#error "Change this table"
#endif

/* known sample rates in Hz, in ascending order */
static const unsigned int rates[] = {
        5512, 8000, 11025, 16000, 22050, 32000, 44100,
        48000, 64000, 88200, 96000, 176400, 192000, 352800, 384000, 705600, 768000
};

/* the rate table above, wrapped as a hw constraint list */
const struct snd_pcm_hw_constraint_list snd_pcm_known_rates = {
        .count = ARRAY_SIZE(rates),
        .list = rates,
};

static int snd_pcm_hw_rule_rate(struct snd_pcm_hw_params *params,
                                struct snd_pcm_hw_rule *rule)
{
        struct snd_pcm_hardware *hw = rule->private;
        return snd_interval_list(hw_param_interval(params, rule->var),
                                 snd_pcm_known_rates.count,
                                 snd_pcm_known_rates.list, hw->rates);
}                

static int snd_pcm_hw_rule_buffer_bytes_max(struct snd_pcm_hw_params *params,
                                            struct snd_pcm_hw_rule *rule)
{
        struct snd_interval t;
        struct snd_pcm_substream *substream = rule->private;
        t.min = 0;
        t.max = substream->buffer_bytes_max;
        t.openmin = 0;
        t.openmax = 0;
        t.integer = 1;
        return snd_interval_refine(hw_param_interval(params, rule->var), &t);
}                

static int snd_pcm_hw_rule_subformats(struct snd_pcm_hw_params *params,
                                      struct snd_pcm_hw_rule *rule)
{
        struct snd_mask *sfmask = hw_param_mask(params, SNDRV_PCM_HW_PARAM_SUBFORMAT);
        struct snd_mask *fmask = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT);
        u32 *subformats = rule->private;
        snd_pcm_format_t f;
        struct snd_mask m;

        snd_mask_none(&m);
        /* All PCMs support at least the default STD subformat. */
        snd_mask_set(&m, (__force unsigned)SNDRV_PCM_SUBFORMAT_STD);

        pcm_for_each_format(f) {
                if (!snd_mask_test(fmask, (__force unsigned)f))
                        continue;

                if (f == SNDRV_PCM_FORMAT_S32_LE && *subformats)
                        m.bits[0] |= *subformats;
                else if (snd_pcm_format_linear(f))
                        snd_mask_set(&m, (__force unsigned)SNDRV_PCM_SUBFORMAT_MSBITS_MAX);
        }

        return snd_mask_refine(sfmask, &m);
}

/*
 * Register snd_pcm_hw_rule_subformats() on @runtime as a rule for
 * SUBFORMAT depending on FORMAT, with @subformats as its private data.
 * Returns the result of snd_pcm_hw_rule_add().
 */
static int snd_pcm_hw_constraint_subformats(struct snd_pcm_runtime *runtime,
                                           unsigned int cond, u32 *subformats)
{
        void *private = subformats;

        return snd_pcm_hw_rule_add(runtime, cond, -1,
                                   snd_pcm_hw_rule_subformats, private,
                                   SNDRV_PCM_HW_PARAM_SUBFORMAT,
                                   SNDRV_PCM_HW_PARAM_FORMAT, -1);
}

static int snd_pcm_hw_constraints_init(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        struct snd_pcm_hw_constraints *constrs = &runtime->hw_constraints;
        int k, err;

        for (k = SNDRV_PCM_HW_PARAM_FIRST_MASK; k <= SNDRV_PCM_HW_PARAM_LAST_MASK; k++) {
                snd_mask_any(constrs_mask(constrs, k));
        }

        for (k = SNDRV_PCM_HW_PARAM_FIRST_INTERVAL; k <= SNDRV_PCM_HW_PARAM_LAST_INTERVAL; k++) {
                snd_interval_any(constrs_interval(constrs, k));
        }

        snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_CHANNELS));
        snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_BUFFER_SIZE));
        snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_BUFFER_BYTES));
        snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_SAMPLE_BITS));
        snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_FRAME_BITS));

        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FORMAT,
                                   snd_pcm_hw_rule_format, NULL,
                                   SNDRV_PCM_HW_PARAM_SAMPLE_BITS, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, 
                                  snd_pcm_hw_rule_sample_bits, NULL,
                                  SNDRV_PCM_HW_PARAM_FORMAT, 
                                  SNDRV_PCM_HW_PARAM_SAMPLE_BITS, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, 
                                  snd_pcm_hw_rule_div, NULL,
                                  SNDRV_PCM_HW_PARAM_FRAME_BITS, SNDRV_PCM_HW_PARAM_CHANNELS, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FRAME_BITS, 
                                  snd_pcm_hw_rule_mul, NULL,
                                  SNDRV_PCM_HW_PARAM_SAMPLE_BITS, SNDRV_PCM_HW_PARAM_CHANNELS, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FRAME_BITS, 
                                  snd_pcm_hw_rule_mulkdiv, (void*) 8,
                                  SNDRV_PCM_HW_PARAM_PERIOD_BYTES, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FRAME_BITS, 
                                  snd_pcm_hw_rule_mulkdiv, (void*) 8,
                                  SNDRV_PCM_HW_PARAM_BUFFER_BYTES, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, 
                                  snd_pcm_hw_rule_div, NULL,
                                  SNDRV_PCM_HW_PARAM_FRAME_BITS, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, 
                                  snd_pcm_hw_rule_mulkdiv, (void*) 1000000,
                                  SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_PERIOD_TIME, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, 
                                  snd_pcm_hw_rule_mulkdiv, (void*) 1000000,
                                  SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_BUFFER_TIME, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIODS, 
                                  snd_pcm_hw_rule_div, NULL,
                                  SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, 
                                  snd_pcm_hw_rule_div, NULL,
                                  SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_PERIODS, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, 
                                  snd_pcm_hw_rule_mulkdiv, (void*) 8,
                                  SNDRV_PCM_HW_PARAM_PERIOD_BYTES, SNDRV_PCM_HW_PARAM_FRAME_BITS, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, 
                                  snd_pcm_hw_rule_muldivk, (void*) 1000000,
                                  SNDRV_PCM_HW_PARAM_PERIOD_TIME, SNDRV_PCM_HW_PARAM_RATE, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, 
                                  snd_pcm_hw_rule_mul, NULL,
                                  SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_PERIODS, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, 
                                  snd_pcm_hw_rule_mulkdiv, (void*) 8,
                                  SNDRV_PCM_HW_PARAM_BUFFER_BYTES, SNDRV_PCM_HW_PARAM_FRAME_BITS, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, 
                                  snd_pcm_hw_rule_muldivk, (void*) 1000000,
                                  SNDRV_PCM_HW_PARAM_BUFFER_TIME, SNDRV_PCM_HW_PARAM_RATE, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, 
                                  snd_pcm_hw_rule_muldivk, (void*) 8,
                                  SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_FRAME_BITS, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, 
                                  snd_pcm_hw_rule_muldivk, (void*) 8,
                                  SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_FRAME_BITS, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_TIME, 
                                  snd_pcm_hw_rule_mulkdiv, (void*) 1000000,
                                  SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_RATE, -1);
        if (err < 0)
                return err;
        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_TIME, 
                                  snd_pcm_hw_rule_mulkdiv, (void*) 1000000,
                                  SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_RATE, -1);
        if (err < 0)
                return err;
        return 0;
}

static int snd_pcm_hw_constraints_complete(struct snd_pcm_substream *substream)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        struct snd_pcm_hardware *hw = &runtime->hw;
        int err;
        unsigned int mask = 0;

        if (hw->info & SNDRV_PCM_INFO_INTERLEAVED)
                mask |= PARAM_MASK_BIT(SNDRV_PCM_ACCESS_RW_INTERLEAVED);
        if (hw->info & SNDRV_PCM_INFO_NONINTERLEAVED)
                mask |= PARAM_MASK_BIT(SNDRV_PCM_ACCESS_RW_NONINTERLEAVED);
        if (hw_support_mmap(substream)) {
                if (hw->info & SNDRV_PCM_INFO_INTERLEAVED)
                        mask |= PARAM_MASK_BIT(SNDRV_PCM_ACCESS_MMAP_INTERLEAVED);
                if (hw->info & SNDRV_PCM_INFO_NONINTERLEAVED)
                        mask |= PARAM_MASK_BIT(SNDRV_PCM_ACCESS_MMAP_NONINTERLEAVED);
                if (hw->info & SNDRV_PCM_INFO_COMPLEX)
                        mask |= PARAM_MASK_BIT(SNDRV_PCM_ACCESS_MMAP_COMPLEX);
        }
        err = snd_pcm_hw_constraint_mask(runtime, SNDRV_PCM_HW_PARAM_ACCESS, mask);
        if (err < 0)
                return err;

        err = snd_pcm_hw_constraint_mask64(runtime, SNDRV_PCM_HW_PARAM_FORMAT, hw->formats);
        if (err < 0)
                return err;

        err = snd_pcm_hw_constraint_subformats(runtime, 0, &hw->subformats);
        if (err < 0)
                return err;

        err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_CHANNELS,
                                           hw->channels_min, hw->channels_max);
        if (err < 0)
                return err;

        err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_RATE,
                                           hw->rate_min, hw->rate_max);
        if (err < 0)
                return err;

        err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_PERIOD_BYTES,
                                           hw->period_bytes_min, hw->period_bytes_max);
        if (err < 0)
                return err;

        err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_PERIODS,
                                           hw->periods_min, hw->periods_max);
        if (err < 0)
                return err;

        err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_BUFFER_BYTES,
                                           hw->period_bytes_min, hw->buffer_bytes_max);
        if (err < 0)
                return err;

        err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, 
                                  snd_pcm_hw_rule_buffer_bytes_max, substream,
                                  SNDRV_PCM_HW_PARAM_BUFFER_BYTES, -1);
        if (err < 0)
                return err;

        /* FIXME: remove */
        if (runtime->dma_bytes) {
                err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, 0, runtime->dma_bytes);
                if (err < 0)
                        return err;
        }

        if (!(hw->rates & (SNDRV_PCM_RATE_KNOT | SNDRV_PCM_RATE_CONTINUOUS))) {
                err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, 
                                          snd_pcm_hw_rule_rate, hw,
                                          SNDRV_PCM_HW_PARAM_RATE, -1);
                if (err < 0)
                        return err;
        }

        /* FIXME: this belong to lowlevel */
        snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIOD_SIZE);

        return 0;
}

/* release hook: unlink @substream if it is still linked to others */
static void pcm_release_private(struct snd_pcm_substream *substream)
{
        if (!snd_pcm_stream_linked(substream))
                return;
        snd_pcm_unlink(substream);
}

/*
 * Drop one reference on @substream.  When the count reaches zero the
 * stream is dropped, the hardware is freed (unless still in OPEN state)
 * and closed if it had been opened, an active latency QoS request is
 * removed, the pcm_release hook is run once, and the substream is
 * detached.
 */
void snd_pcm_release_substream(struct snd_pcm_substream *substream)
{
        if (--substream->ref_count > 0)
                return;

        snd_pcm_drop(substream);

        if (substream->hw_opened) {
                if (substream->runtime->state != SNDRV_PCM_STATE_OPEN)
                        do_hw_free(substream);
                substream->ops->close(substream);
                substream->hw_opened = 0;
        }

        if (cpu_latency_qos_request_active(&substream->latency_pm_qos_req))
                cpu_latency_qos_remove_request(&substream->latency_pm_qos_req);

        /* the hook is cleared after use so it cannot run twice */
        if (substream->pcm_release) {
                substream->pcm_release(substream);
                substream->pcm_release = NULL;
        }

        snd_pcm_detach_substream(substream);
}
EXPORT_SYMBOL(snd_pcm_release_substream);

/*
 * Attach a substream of @pcm for direction @stream and, when this is
 * its first user (ref_count not above 1), set up the hw constraints and
 * call the driver's open callback.
 *
 * On success the substream is stored in @rsubstream and 0 is returned.
 * On failure after attaching, the substream is released again and the
 * negative error is returned.
 */
int snd_pcm_open_substream(struct snd_pcm *pcm, int stream,
                           struct file *file,
                           struct snd_pcm_substream **rsubstream)
{
        struct snd_pcm_substream *substream;
        int err;

        err = snd_pcm_attach_substream(pcm, stream, file, &substream);
        if (err < 0)
                return err;

        /* already set up by an earlier opener */
        if (substream->ref_count > 1)
                goto done;

        err = snd_pcm_hw_constraints_init(substream);
        if (err < 0) {
                pcm_dbg(pcm, "snd_pcm_hw_constraints_init failed\n");
                goto error;
        }

        err = substream->ops->open(substream);
        if (err < 0)
                goto error;

        /* from here on the release path has to call ops->close() */
        substream->hw_opened = 1;

        err = snd_pcm_hw_constraints_complete(substream);
        if (err < 0) {
                pcm_dbg(pcm, "snd_pcm_hw_constraints_complete failed\n");
                goto error;
        }

        /* automatically set EXPLICIT_SYNC flag in the managed mode whenever
         * the DMA buffer requires it
         */
        if (substream->managed_buffer_alloc &&
            substream->dma_buffer.dev.need_sync)
                substream->runtime->hw.info |= SNDRV_PCM_INFO_EXPLICIT_SYNC;

 done:
        *rsubstream = substream;
        return 0;

 error:
        snd_pcm_release_substream(substream);
        return err;
}
EXPORT_SYMBOL(snd_pcm_open_substream);

/*
 * Open a substream of @pcm for @file and attach a newly allocated
 * struct snd_pcm_file to file->private_data.
 *
 * Returns 0 on success, the error from snd_pcm_open_substream(), or
 * -ENOMEM (after releasing the substream again) when the allocation
 * fails.
 */
static int snd_pcm_open_file(struct file *file,
                             struct snd_pcm *pcm,
                             int stream)
{
        struct snd_pcm_substream *substream;
        struct snd_pcm_file *pcm_file;
        int err;

        err = snd_pcm_open_substream(pcm, stream, file, &substream);
        if (err < 0)
                return err;

        pcm_file = kzalloc(sizeof(*pcm_file), GFP_KERNEL);
        if (!pcm_file) {
                snd_pcm_release_substream(substream);
                return -ENOMEM;
        }

        pcm_file->substream = substream;
        file->private_data = pcm_file;

        /* only the sole user installs the release hook */
        if (substream->ref_count == 1)
                substream->pcm_release = pcm_release_private;

        return 0;
}

static int snd_pcm_playback_open(struct inode *inode, struct file *file)
{
        struct snd_pcm *pcm;
        int err = nonseekable_open(inode, file);
        if (err < 0)
                return err;
        pcm = snd_lookup_minor_data(iminor(inode),
                                    SNDRV_DEVICE_TYPE_PCM_PLAYBACK);
        err = snd_pcm_open(file, pcm, SNDRV_PCM_STREAM_PLAYBACK);
        if (pcm)
                snd_card_unref(pcm->card);
        return err;
}

static int snd_pcm_capture_open(struct inode *inode, struct file *file)
{
        struct snd_pcm *pcm;
        int err = nonseekable_open(inode, file);
        if (err < 0)
                return err;
        pcm = snd_lookup_minor_data(iminor(inode),
                                    SNDRV_DEVICE_TYPE_PCM_CAPTURE);
        err = snd_pcm_open(file, pcm, SNDRV_PCM_STREAM_CAPTURE);
        if (pcm)
                snd_card_unref(pcm->card);
        return err;
}

/*
 * Common open path for both stream directions.
 *
 * Registers @file with the card, takes a reference on the card's module
 * and then tries to open a substream.  When snd_pcm_open_file() returns
 * -EAGAIN, a blocking caller sleeps on pcm->open_wait and retries after
 * being woken up; a caller with O_NONBLOCK gets -EBUSY instead.
 *
 * Returns the non-negative result of snd_pcm_open_file() on success, or:
 *   -ENODEV       @pcm is NULL, or the card was shut down while waiting
 *   -EFAULT       the module reference could not be taken
 *   -EBUSY        see above
 *   -ERESTARTSYS  a signal arrived while waiting
 *   any other error from snd_card_file_add() or snd_pcm_open_file()
 */
static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream)
{
        int err;
        wait_queue_entry_t wait;

        if (pcm == NULL) {
                err = -ENODEV;
                goto __error1;
        }
        err = snd_card_file_add(pcm->card, file);
        if (err < 0)
                goto __error1;
        if (!try_module_get(pcm->card->module)) {
                err = -EFAULT;
                goto __error2;
        }
        /* queue ourselves before the first attempt so a wake-up is not missed */
        init_waitqueue_entry(&wait, current);
        add_wait_queue(&pcm->open_wait, &wait);
        mutex_lock(&pcm->open_mutex);
        while (1) {
                err = snd_pcm_open_file(file, pcm, stream);
                if (err >= 0)
                        break;
                if (err == -EAGAIN) {
                        if (file->f_flags & O_NONBLOCK) {
                                err = -EBUSY;
                                break;
                        }
                } else
                        break;
                /* sleep with open_mutex released, then retry */
                set_current_state(TASK_INTERRUPTIBLE);
                mutex_unlock(&pcm->open_mutex);
                schedule();
                mutex_lock(&pcm->open_mutex);
                if (pcm->card->shutdown) {
                        err = -ENODEV;
                        break;
                }
                if (signal_pending(current)) {
                        err = -ERESTARTSYS;
                        break;
                }
        }
        remove_wait_queue(&pcm->open_wait, &wait);
        mutex_unlock(&pcm->open_mutex);
        if (err < 0)
                goto __error;
        return err;

        /* unwind in reverse order of acquisition */
      __error:
        module_put(pcm->card->module);
      __error2:
              snd_card_file_remove(pcm->card, file);
      __error1:
              return err;
}

/*
 * Release entry: drop the substream attached to @file, free the per-file
 * data, wake up tasks sleeping on pcm->open_wait and undo the module and
 * card-file references.
 *
 * Returns 0, or -ENXIO when the file carries no substream.
 */
static int snd_pcm_release(struct inode *inode, struct file *file)
{
        struct snd_pcm_file *pcm_file = file->private_data;
        struct snd_pcm_substream *substream = pcm_file->substream;
        struct snd_pcm *pcm;

        if (snd_BUG_ON(!substream))
                return -ENXIO;
        pcm = substream->pcm;

        /* block until the device gets woken up as it may touch the hardware */
        snd_power_wait(pcm->card);

        /* release and free under open_mutex */
        scoped_guard(mutex, &pcm->open_mutex) {
                snd_pcm_release_substream(substream);
                kfree(pcm_file);
        }

        wake_up(&pcm->open_wait);
        module_put(pcm->card->module);
        snd_card_file_remove(pcm->card, file);
        return 0;
}

/* check and update PCM state; return 0 or a negative error
 * call this inside PCM lock
 */
static int do_pcm_hwsync(struct snd_pcm_substream *substream)
{
        switch (substream->runtime->state) {
        case SNDRV_PCM_STATE_DRAINING:
                if (substream->stream == SNDRV_PCM_STREAM_CAPTURE)
                        return -EBADFD;
                fallthrough;
        case SNDRV_PCM_STATE_RUNNING:
                return snd_pcm_update_hw_ptr(substream);
        case SNDRV_PCM_STATE_PREPARED:
        case SNDRV_PCM_STATE_PAUSED:
                return 0;
        case SNDRV_PCM_STATE_SUSPENDED:
                return -ESTRPIPE;
        case SNDRV_PCM_STATE_XRUN:
                return -EPIPE;
        default:
                return -EBADFD;
        }
}

/* move the appl_ptr forward by up to @frames, limited to @avail;
 * returns the number of frames moved (0 when @avail is not positive)
 * or a negative error from pcm_lib_apply_appl_ptr()
 */
static snd_pcm_sframes_t forward_appl_ptr(struct snd_pcm_substream *substream,
                                          snd_pcm_uframes_t frames,
                                           snd_pcm_sframes_t avail)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        snd_pcm_sframes_t new_ptr;
        int err;

        if (avail <= 0)
                return 0;
        if (frames > (snd_pcm_uframes_t)avail)
                frames = avail;

        new_ptr = runtime->control->appl_ptr + frames;
        /* wrap around at the boundary */
        if (new_ptr >= (snd_pcm_sframes_t)runtime->boundary)
                new_ptr -= runtime->boundary;

        err = pcm_lib_apply_appl_ptr(substream, new_ptr);
        if (err < 0)
                return err;
        return frames;
}

/* move the appl_ptr backward by up to @frames, limited to @avail;
 * returns the number of frames moved, or zero both when nothing could be
 * moved and when pcm_lib_apply_appl_ptr() failed
 */
static snd_pcm_sframes_t rewind_appl_ptr(struct snd_pcm_substream *substream,
                                         snd_pcm_uframes_t frames,
                                         snd_pcm_sframes_t avail)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        snd_pcm_sframes_t new_ptr;
        int err;

        if (avail <= 0)
                return 0;
        if (frames > (snd_pcm_uframes_t)avail)
                frames = avail;

        new_ptr = runtime->control->appl_ptr - frames;
        /* wrap around below zero */
        if (new_ptr < 0)
                new_ptr += runtime->boundary;

        err = pcm_lib_apply_appl_ptr(substream, new_ptr);
        /* NOTE: we return zero for errors because PulseAudio gets depressed
         * upon receiving an error from rewind ioctl and stops processing
         * any longer.  Returning zero means that no rewind is done, so
         * it's not absolutely wrong to answer like that.
         */
        if (err < 0)
                return 0;
        return frames;
}

/*
 * Rewind the appl_ptr of @substream by up to @frames.
 *
 * Returns the number of frames rewound (possibly 0) or the negative
 * error from do_pcm_hwsync().  The DMA buffer is synced for the device
 * whenever no error is returned.
 */
static snd_pcm_sframes_t snd_pcm_rewind(struct snd_pcm_substream *substream,
                                        snd_pcm_uframes_t frames)
{
        snd_pcm_sframes_t ret;

        if (!frames)
                return 0;

        scoped_guard(pcm_stream_lock_irq, substream) {
                ret = do_pcm_hwsync(substream);
                if (ret == 0)
                        ret = rewind_appl_ptr(substream, frames,
                                              snd_pcm_hw_avail(substream));
        }

        if (ret < 0)
                return ret;
        snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_DEVICE);
        return ret;
}

/*
 * Advance the appl_ptr of @substream by up to @frames.
 *
 * Returns the number of frames advanced (possibly 0) or a negative
 * error from do_pcm_hwsync() or forward_appl_ptr().  The DMA buffer is
 * synced for the device whenever no error is returned.
 */
static snd_pcm_sframes_t snd_pcm_forward(struct snd_pcm_substream *substream,
                                         snd_pcm_uframes_t frames)
{
        snd_pcm_sframes_t ret;

        if (!frames)
                return 0;

        scoped_guard(pcm_stream_lock_irq, substream) {
                ret = do_pcm_hwsync(substream);
                if (ret == 0)
                        ret = forward_appl_ptr(substream, frames,
                                               snd_pcm_avail(substream));
        }

        if (ret < 0)
                return ret;
        snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_DEVICE);
        return ret;
}

/*
 * Sync the hw pointer state under the stream lock and, when @delay is
 * non-NULL and the sync succeeded, store the current delay in *@delay.
 * The DMA buffer is synced for the CPU in every case.
 *
 * Returns the result of do_pcm_hwsync().
 */
static int snd_pcm_delay(struct snd_pcm_substream *substream,
                         snd_pcm_sframes_t *delay)
{
        int err;

        scoped_guard(pcm_stream_lock_irq, substream) {
                err = do_pcm_hwsync(substream);
                if (err == 0 && delay != NULL)
                        *delay = snd_pcm_calc_delay(substream);
        }

        snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_CPU);
        return err;
}
                
/*
 * Sync the hw pointer state without reporting a delay value: same as
 * snd_pcm_delay() with a NULL output pointer, returning its result.
 */
static inline int snd_pcm_hwsync(struct snd_pcm_substream *substream)
{
        return snd_pcm_delay(substream, NULL);
}

/*
 * Exchange the status/control pointers with user space.
 *
 * The flags and the control part are read from @_sync_ptr; the complete
 * record, including a snapshot of the status, is written back.
 *
 * Flag handling:
 *   SNDRV_PCM_SYNC_PTR_HWSYNC     run snd_pcm_hwsync() first
 *   SNDRV_PCM_SYNC_PTR_APPL       set: report the current appl_ptr;
 *                                 clear: apply the appl_ptr from user space
 *   SNDRV_PCM_SYNC_PTR_AVAIL_MIN  set: report the current avail_min;
 *                                 clear: take avail_min from user space
 *
 * Returns 0 on success, -EFAULT when a user copy fails, or the negative
 * error from snd_pcm_hwsync() or pcm_lib_apply_appl_ptr().
 */
static int snd_pcm_sync_ptr(struct snd_pcm_substream *substream,
                            struct snd_pcm_sync_ptr __user *_sync_ptr)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        struct snd_pcm_sync_ptr sync_ptr;
        volatile struct snd_pcm_mmap_status *status;
        volatile struct snd_pcm_mmap_control *control;
        int err;

        /* zeroed first: the whole structure is copied back to user space */
        memset(&sync_ptr, 0, sizeof(sync_ptr));
        if (get_user(sync_ptr.flags, (unsigned __user *)&(_sync_ptr->flags)))
                return -EFAULT;
        if (copy_from_user(&sync_ptr.c.control, &(_sync_ptr->c.control), sizeof(struct snd_pcm_mmap_control)))
                return -EFAULT;
        status = runtime->status;
        control = runtime->control;
        if (sync_ptr.flags & SNDRV_PCM_SYNC_PTR_HWSYNC) {
                err = snd_pcm_hwsync(substream);
                if (err < 0)
                        return err;
        }
        /* pointer exchange and status snapshot happen under the stream lock */
        scoped_guard(pcm_stream_lock_irq, substream) {
                if (!(sync_ptr.flags & SNDRV_PCM_SYNC_PTR_APPL)) {
                        err = pcm_lib_apply_appl_ptr(substream,
                                                     sync_ptr.c.control.appl_ptr);
                        if (err < 0)
                                return err;
                } else {
                        sync_ptr.c.control.appl_ptr = control->appl_ptr;
                }
                if (!(sync_ptr.flags & SNDRV_PCM_SYNC_PTR_AVAIL_MIN))
                        control->avail_min = sync_ptr.c.control.avail_min;
                else
                        sync_ptr.c.control.avail_min = control->avail_min;
                sync_ptr.s.status.state = status->state;
                sync_ptr.s.status.hw_ptr = status->hw_ptr;
                sync_ptr.s.status.tstamp = status->tstamp;
                sync_ptr.s.status.suspended_state = status->suspended_state;
                sync_ptr.s.status.audio_tstamp = status->audio_tstamp;
        }
        /* a new appl_ptr was applied above: sync the buffer for the device */
        if (!(sync_ptr.flags & SNDRV_PCM_SYNC_PTR_APPL))
                snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_DEVICE);
        if (copy_to_user(_sync_ptr, &sync_ptr, sizeof(sync_ptr)))
                return -EFAULT;
        return 0;
}

/*
 * Status record in the layout used by the 32-bit SYNC_PTR ioctl
 * (embedded in struct snd_pcm_sync_ptr32).  The structure is packed, so
 * the members are laid out back to back without implicit padding.
 */
struct snd_pcm_mmap_status32 {
        snd_pcm_state_t state;
        s32 pad1;
        u32 hw_ptr;
        s32 tstamp_sec;
        s32 tstamp_nsec;
        snd_pcm_state_t suspended_state;
        s32 audio_tstamp_sec;
        s32 audio_tstamp_nsec;
} __packed;

/*
 * Control record in the layout used by the 32-bit SYNC_PTR ioctl
 * (embedded in struct snd_pcm_sync_ptr32): both values are 32 bits wide.
 */
struct snd_pcm_mmap_control32 {
        u32 appl_ptr;
        u32 avail_min;
};

/*
 * Argument of __SNDRV_PCM_IOCTL_SYNC_PTR32: a flags word followed by the
 * status and control records.  Each record sits in a union with a 64-byte
 * reserved array, so each union occupies at least 64 bytes.
 */
struct snd_pcm_sync_ptr32 {
        u32 flags;
        union {
                struct snd_pcm_mmap_status32 status;
                unsigned char reserved[64];
        } s;
        union {
                struct snd_pcm_mmap_control32 control;
                unsigned char reserved[64];
        } c;
} __packed;

/*
 * Recalculate the boundary so that it stays within 32 bits.
 *
 * Starting from the buffer size, the value is doubled for as long as the
 * doubled value does not exceed 0x7fffffff minus the buffer size.
 *
 * Return: the resulting boundary, or 0 when the buffer size is zero.
 *
 * NOTE(review): a buffer size above 0x7fffffff would make the subtraction
 * wrap; presumably the hw constraints rule that out - confirm.
 */
static snd_pcm_uframes_t recalculate_boundary(struct snd_pcm_runtime *runtime)
{
        const snd_pcm_uframes_t size = runtime->buffer_size;
        snd_pcm_uframes_t limit, result;

        if (size == 0)
                return 0;

        limit = 0x7fffffffUL - size;
        for (result = size; result * 2 <= limit; result *= 2)
                ;
        return result;
}

/*
 * SYNC_PTR handler for the 32-bit argument layout (struct snd_pcm_sync_ptr32).
 *
 * Reads the flags and the control values from user space, optionally runs
 * snd_pcm_hwsync(), exchanges appl_ptr / avail_min with the runtime control
 * record according to the flags, snapshots the status record, and writes
 * the results back to user space member by member.
 *
 * Return: 0 on success, -EINVAL if the substream has no runtime, -EFAULT on
 * a failed user access, or the negative error returned by snd_pcm_hwsync()
 * or pcm_lib_apply_appl_ptr().
 */
static int snd_pcm_ioctl_sync_ptr_compat(struct snd_pcm_substream *substream,
                                         struct snd_pcm_sync_ptr32 __user *src)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        volatile struct snd_pcm_mmap_status *status;
        volatile struct snd_pcm_mmap_control *control;
        u32 sflags;
        struct snd_pcm_mmap_control scontrol;
        struct snd_pcm_mmap_status sstatus;
        snd_pcm_uframes_t boundary;
        int err;

        if (snd_BUG_ON(!runtime))
                return -EINVAL;

        /* fetch the request: flags plus the caller's control values */
        if (get_user(sflags, &src->flags) ||
            get_user(scontrol.appl_ptr, &src->c.control.appl_ptr) ||
            get_user(scontrol.avail_min, &src->c.control.avail_min))
                return -EFAULT;
        if (sflags & SNDRV_PCM_SYNC_PTR_HWSYNC) {
                err = snd_pcm_hwsync(substream);
                if (err < 0)
                        return err;
        }
        status = runtime->status;
        control = runtime->control;
        /* pointers reported to the caller are reduced modulo this boundary */
        boundary = recalculate_boundary(runtime);
        if (! boundary)
                boundary = 0x7fffffff;
        scoped_guard(pcm_stream_lock_irq, substream) {
                /* FIXME: we should consider the boundary for the sync from app */
                /* APPL flag clear: apply the caller's appl_ptr; set: report ours */
                if (!(sflags & SNDRV_PCM_SYNC_PTR_APPL)) {
                        err = pcm_lib_apply_appl_ptr(substream,
                                                     scontrol.appl_ptr);
                        if (err < 0)
                                return err;
                } else
                        scontrol.appl_ptr = control->appl_ptr % boundary;
                /* AVAIL_MIN flag clear: store the caller's value; set: report ours */
                if (!(sflags & SNDRV_PCM_SYNC_PTR_AVAIL_MIN))
                        control->avail_min = scontrol.avail_min;
                else
                        scontrol.avail_min = control->avail_min;
                /* snapshot the status record inside the guarded section */
                sstatus.state = status->state;
                sstatus.hw_ptr = status->hw_ptr % boundary;
                sstatus.tstamp = status->tstamp;
                sstatus.suspended_state = status->suspended_state;
                sstatus.audio_tstamp = status->audio_tstamp;
        }
        /* only when the caller's appl_ptr was applied above */
        if (!(sflags & SNDRV_PCM_SYNC_PTR_APPL))
                snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_DEVICE);
        /* copy the snapshot and the control values back, one member at a time */
        if (put_user(sstatus.state, &src->s.status.state) ||
            put_user(sstatus.hw_ptr, &src->s.status.hw_ptr) ||
            put_user(sstatus.tstamp.tv_sec, &src->s.status.tstamp_sec) ||
            put_user(sstatus.tstamp.tv_nsec, &src->s.status.tstamp_nsec) ||
            put_user(sstatus.suspended_state, &src->s.status.suspended_state) ||
            put_user(sstatus.audio_tstamp.tv_sec, &src->s.status.audio_tstamp_sec) ||
            put_user(sstatus.audio_tstamp.tv_nsec, &src->s.status.audio_tstamp_nsec) ||
            put_user(scontrol.appl_ptr, &src->c.control.appl_ptr) ||
            put_user(scontrol.avail_min, &src->c.control.avail_min))
                return -EFAULT;

        return 0;
}
/* ioctl number of SYNC_PTR with the 32-bit argument layout */
#define __SNDRV_PCM_IOCTL_SYNC_PTR32 _IOWR('A', 0x23, struct snd_pcm_sync_ptr32)

/*
 * Read an int from user space and store it as the runtime timestamp type.
 *
 * Return: 0 on success, -EFAULT if the value cannot be read, -EINVAL if it
 * is negative or greater than SNDRV_PCM_TSTAMP_TYPE_LAST.
 */
static int snd_pcm_tstamp(struct snd_pcm_substream *substream, int __user *_arg)
{
        int type;

        if (get_user(type, _arg))
                return -EFAULT;
        if (!(type >= 0 && type <= SNDRV_PCM_TSTAMP_TYPE_LAST))
                return -EINVAL;

        substream->runtime->tstamp_type = type;
        return 0;
}

/*
 * Interleaved frame transfer requested through an ioctl.
 *
 * The user-visible result member is cleared first, then the request is
 * copied in and passed to snd_pcm_lib_write() for a playback stream or to
 * snd_pcm_lib_read() otherwise.  The transfer result is written back to
 * the result member.
 *
 * Return: 0 on success, -EBADFD while the stream is in the OPEN state,
 * -EFAULT on a failed user access, or the negative transfer result.
 */
static int snd_pcm_xferi_frames_ioctl(struct snd_pcm_substream *substream,
                                      struct snd_xferi __user *_xferi)
{
        struct snd_xferi req;
        snd_pcm_sframes_t done;

        if (substream->runtime->state == SNDRV_PCM_STATE_OPEN)
                return -EBADFD;

        /* zero the result before fetching the request */
        if (put_user(0, &_xferi->result) ||
            copy_from_user(&req, _xferi, sizeof(req)))
                return -EFAULT;

        if (substream->stream != SNDRV_PCM_STREAM_PLAYBACK)
                done = snd_pcm_lib_read(substream, req.buf, req.frames);
        else
                done = snd_pcm_lib_write(substream, req.buf, req.frames);

        if (put_user(done, &_xferi->result))
                return -EFAULT;
        if (done < 0)
                return done;
        return 0;
}

/*
 * Non-interleaved frame transfer requested through an ioctl.
 *
 * The user-visible result member is cleared first, the request is copied
 * in, and the array of per-channel buffer pointers (one entry per runtime
 * channel) is duplicated from user space.  The transfer then goes through
 * snd_pcm_lib_writev() for a playback stream or snd_pcm_lib_readv()
 * otherwise, and its result is written back to the result member.
 *
 * Return: 0 on success, -EBADFD while the stream is in the OPEN state,
 * -EINVAL if the runtime has more than 128 channels, -EFAULT on a failed
 * user access, the memdup_user() error, or the negative transfer result.
 */
static int snd_pcm_xfern_frames_ioctl(struct snd_pcm_substream *substream,
                                      struct snd_xfern __user *_xfern)
{
        struct snd_pcm_runtime *runtime = substream->runtime;
        void *chan_bufs __free(kfree) = NULL;
        struct snd_xfern req;
        snd_pcm_sframes_t done;

        if (runtime->state == SNDRV_PCM_STATE_OPEN)
                return -EBADFD;
        if (runtime->channels > 128)
                return -EINVAL;

        /* zero the result before fetching the request */
        if (put_user(0, &_xfern->result) ||
            copy_from_user(&req, _xfern, sizeof(req)))
                return -EFAULT;

        chan_bufs = memdup_user(req.bufs, sizeof(void *) * runtime->channels);
        if (IS_ERR(chan_bufs))
                return PTR_ERR(no_free_ptr(chan_bufs));

        if (substream->stream != SNDRV_PCM_STREAM_PLAYBACK)
                done = snd_pcm_lib_readv(substream, chan_bufs, req.frames);
        else
                done = snd_pcm_lib_writev(substream, chan_bufs, req.frames);

        if (put_user(done, &_xfern->result))
                return -EFAULT;
        if (done < 0)
                return done;
        return 0;
}

/*
 * Rewind requested through an ioctl.
 *
 * The frame count is read from user space, the user value is cleared, and
 * the result of snd_pcm_rewind() is then written back to the same place.
 *
 * Return: 0 on success, -EFAULT on a failed user access, or the negative
 * result of snd_pcm_rewind().
 */
static int snd_pcm_rewind_ioctl(struct snd_pcm_substream *substream,
                                snd_pcm_uframes_t __user *_frames)
{
        snd_pcm_uframes_t wanted;
        snd_pcm_sframes_t moved;

        if (get_user(wanted, _frames) || put_user(0, _frames))
                return -EFAULT;

        moved = snd_pcm_rewind(substream, wanted);
        if (put_user(moved, _frames))
                return -EFAULT;
        if (moved < 0)
                return moved;
        return 0;
}

/*
 * Forward requested through an ioctl.
 *
 * The frame count is read from user space, the user value is cleared, and
 * the result of snd_pcm_forward() is then written back to the same place.
 *
 * Return: 0 on success, -EFAULT on a failed user access, or the negative
 * result of snd_pcm_forward().
 */
static int snd_pcm_forward_ioctl(struct snd_pcm_substream *substream,
                                 snd_pcm_uframes_t __user *_frames)
{
        snd_pcm_uframes_t wanted;
        snd_pcm_sframes_t moved;

        if (get_user(wanted, _frames) || put_user(0, _frames))
                return -EFAULT;

        moved = snd_pcm_forward(substream, wanted);
        if (put_user(moved, _frames))
                return -EFAULT;
        if (moved < 0)
                return moved;
        return 0;
}

/*
 * Dispatch a PCM ioctl command to its handler.
 *
 * Before dispatching, the substream must pass PCM_RUNTIME_CHECK(), must not
 * be in the DISCONNECTED state, and snd_power_wait() on the card must
 * succeed.
 *
 * Return: the handler's return value; -ENXIO if the runtime check fails,
 * -EBADFD if the stream is disconnected, the snd_power_wait() error, or
 * -ENOTTY for a command that is not handled here.
 */
static int snd_pcm_common_ioctl(struct file *file,
                                 struct snd_pcm_substream *substream,
                                 unsigned int cmd, void __user *arg)
{
        struct snd_pcm_file *pcm_file = file->private_data;
        int res;

        if (PCM_RUNTIME_CHECK(substream))
                return -ENXIO;

        if (substream->runtime->state == SNDRV_PCM_STATE_DISCONNECTED)
                return -EBADFD;

        res = snd_power_wait(substream->pcm->card);
        if (res < 0)
                return res;

        switch (cmd) {
        case SNDRV_PCM_IOCTL_PVERSION:
                return put_user(SNDRV_PCM_VERSION, (int __user *)arg) ? -EFAULT : 0;
        case SNDRV_PCM_IOCTL_INFO:
                return snd_pcm_info_user(substream, arg);
        case SNDRV_PCM_IOCTL_TSTAMP:        /* just for compatibility */
                return 0;
        case SNDRV_PCM_IOCTL_TTSTAMP:
                return snd_pcm_tstamp(substream, arg);
        case SNDRV_PCM_IOCTL_USER_PVERSION:
                /* remember the protocol version announced by user space */
                if (get_user(pcm_file->user_pversion,
                             (unsigned int __user *)arg))
                        return -EFAULT;
                return 0;
        case SNDRV_PCM_IOCTL_HW_REFINE:
                return snd_pcm_hw_refine_user(substream, arg);
        case SNDRV_PCM_IOCTL_HW_PARAMS:
                return snd_pcm_hw_params_user(substream, arg);
        case SNDRV_PCM_IOCTL_HW_FREE:
                return snd_pcm_hw_free(substream);
        case SNDRV_PCM_IOCTL_SW_PARAMS:
                return snd_pcm_sw_params_user(substream, arg);
        case SNDRV_PCM_IOCTL_STATUS32:
                return snd_pcm_status_user32(substream, arg, false);
        case SNDRV_PCM_IOCTL_STATUS_EXT32:
                return snd_pcm_status_user32(substream, arg, true);
        case SNDRV_PCM_IOCTL_STATUS64:
                return snd_pcm_status_user64(substream, arg, false);
        case SNDRV_PCM_IOCTL_STATUS_EXT64:
                return snd_pcm_status_user64(substream, arg, true);
        case SNDRV_PCM_IOCTL_CHANNEL_INFO:
                return snd_pcm_channel_info_user(substream, arg);
        case SNDRV_PCM_IOCTL_PREPARE:
                return snd_pcm_prepare(substream, file);
        case SNDRV_PCM_IOCTL_RESET:
                return snd_pcm_reset(substream);
        case SNDRV_PCM_IOCTL_START:
                return snd_pcm_start_lock_irq(substream);
        case SNDRV_PCM_IOCTL_LINK:
                /* the argument is passed by value here, not as a user pointer */
                return snd_pcm_link(substream, (int)(unsigned long) arg);
        case SNDRV_PCM_IOCTL_UNLINK:
                return snd_pcm_unlink(substream);
        case SNDRV_PCM_IOCTL_RESUME:
                return snd_pcm_resume(substream);
        case SNDRV_PCM_IOCTL_XRUN:
                return snd_pcm_xrun(substream);
        case SNDRV_PCM_IOCTL_HWSYNC:
                return snd_pcm_hwsync(substream);
        case SNDRV_PCM_IOCTL_DELAY:
        {
                snd_pcm_sframes_t delay = 0;
                /* note: this 'res' shadows the function-level int of the same name */
                snd_pcm_sframes_t __user *res = arg;
                int err;

                err = snd_pcm_delay(substream, &delay);
                if (err)
                        return err;
                if (put_user(delay, res))
                        return -EFAULT;
                return 0;
        }
        case __SNDRV_PCM_IOCTL_SYNC_PTR32:
                return snd_pcm_ioctl_sync_ptr_compat(substream, arg);
        case __SNDRV_PCM_IOCTL_SYNC_PTR64:
                return snd_pcm_sync_ptr(substream, arg);
#ifdef CONFIG_SND_SUPPORT_OLD_API
        case SNDRV_PCM_IOCTL_HW_REFINE_OLD:
                return snd_pcm_hw_refine_old_user(substream, arg);
        case SNDRV_PCM_IOCTL_HW_PARAMS_OLD:
                return snd_pcm_hw_params_old_user(substream, arg);
#endif
        case SNDRV_PCM_IOCTL_DRAIN:
                return snd_pcm_drain(substream, file);
        case SNDRV_PCM_IOCTL_DROP:
                return snd_pcm_drop(substream);
        case SNDRV_PCM_IOCTL_PAUSE:
                /* the argument is passed by value here, not as a user pointer */
                return snd_pcm_pause_lock_irq(substream, (unsigned long)arg);
        case SNDRV_PCM_IOCTL_WRITEI_FRAMES:
        case SNDRV_PCM_IOCTL_READI_FRAMES:
                return snd_pcm_xferi_frames_ioctl(substream, arg);
        case SNDRV_PCM_IOCTL_WRITEN_FRAMES:
        case SNDRV_PCM_IOCTL_READN_FRAMES:
                return snd_pcm_xfern_frames_ioctl(substream, arg);
        case SNDRV_PCM_IOCTL_REWIND:
                return snd_pcm_rewind_ioctl(substream, arg);
        case SNDRV_PCM_IOCTL_FORWARD:
                return snd_pcm_forward_ioctl(substream, arg);
        }
        /* no case matched */
        pcm_dbg(substream->pcm, "unknown ioctl = 0x%x\n", cmd);
        return -ENOTTY;
}

/*
 * ioctl entry point of the PCM file.
 *
 * Only commands whose type byte (bits 8..15 of the command) is 'A' are
 * forwarded to snd_pcm_common_ioctl(); anything else yields -ENOTTY.
 */
static long snd_pcm_ioctl(struct file *file, unsigned int cmd,
                          unsigned long arg)
{
        struct snd_pcm_file *pcm_file = file->private_data;

        if (((cmd >> 8) & 0xff) == 'A')
                return snd_pcm_common_ioctl(file, pcm_file->substream, cmd,
                                             (void __user *)arg);
        return -ENOTTY;
}

/**
 * snd_pcm_kernel_ioctl - Execute PCM ioctl in the kernel-space
 * @substream: PCM substream
 * @cmd: IOCTL cmd
 * @arg: IOCTL argument
 *
 * The function is provided primarily for OSS layer and USB gadget drivers,
 * and it allows only the limited set of ioctls (hw_params, sw_params,
 * prepare, start, drain, drop, forward, delay).  Any other command, and
 * forward on a non-capture stream, is rejected with -EINVAL.  A stream in
 * the DISCONNECTED state is rejected with -EBADFD.
 *
 * Return: zero if successful, or a negative error code
 */
int snd_pcm_kernel_ioctl(struct snd_pcm_substream *substream,
                         unsigned int cmd, void *arg)
{
        snd_pcm_uframes_t *frames = arg;
        snd_pcm_sframes_t moved;

        if (substream->runtime->state == SNDRV_PCM_STATE_DISCONNECTED)
                return -EBADFD;

        switch (cmd) {
        case SNDRV_PCM_IOCTL_HW_PARAMS:
                return snd_pcm_hw_params(substream, arg);
        case SNDRV_PCM_IOCTL_SW_PARAMS:
                return snd_pcm_sw_params(substream, arg);
        case SNDRV_PCM_IOCTL_PREPARE:
                return snd_pcm_prepare(substream, NULL);
        case SNDRV_PCM_IOCTL_START:
                return snd_pcm_start_lock_irq(substream);
        case SNDRV_PCM_IOCTL_DRAIN:
                return snd_pcm_drain(substream, NULL);
        case SNDRV_PCM_IOCTL_DROP:
                return snd_pcm_drop(substream);
        case SNDRV_PCM_IOCTL_DELAY:
                return snd_pcm_delay(substream, frames);
        case SNDRV_PCM_IOCTL_FORWARD:
                /* provided only for OSS; capture-only and no value returned */
                if (substream->stream != SNDRV_PCM_STREAM_CAPTURE)
                        return -EINVAL;
                moved = snd_pcm_forward(substream, *frames);
                if (moved < 0)
                        return moved;
                return 0;
        default:
                break;
        }
        return -EINVAL;
}
EXPORT_SYMBOL(snd_pcm_kernel_ioctl);

/*
 * read() handler of the PCM file.
 *
 * The byte count must be frame aligned; it is converted to frames, handed
 * to snd_pcm_lib_read(), and a positive result is converted back to bytes.
 *
 * Return: number of bytes transferred, -ENXIO if the runtime check fails,
 * -EBADFD in the OPEN or DISCONNECTED state, -EINVAL for a misaligned
 * count, or the non-positive result of snd_pcm_lib_read().
 */
static ssize_t snd_pcm_read(struct file *file, char __user *buf, size_t count,
                            loff_t * offset)
{
        struct snd_pcm_file *pcm_file = file->private_data;
        struct snd_pcm_substream *substream = pcm_file->substream;
        struct snd_pcm_runtime *runtime;
        snd_pcm_sframes_t done;

        if (PCM_RUNTIME_CHECK(substream))
                return -ENXIO;

        runtime = substream->runtime;
        if (runtime->state == SNDRV_PCM_STATE_OPEN ||
            runtime->state == SNDRV_PCM_STATE_DISCONNECTED)
                return -EBADFD;
        if (!frame_aligned(runtime, count))
                return -EINVAL;

        done = snd_pcm_lib_read(substream, buf,
                                bytes_to_frames(runtime, count));
        if (done <= 0)
                return done;
        return frames_to_bytes(runtime, done);
}

/*
 * write() handler of the PCM file.
 *
 * The byte count must be frame aligned; it is converted to frames, handed
 * to snd_pcm_lib_write(), and a positive result is converted back to bytes.
 *
 * Return: number of bytes transferred, -ENXIO if the runtime check fails,
 * -EBADFD in the OPEN or DISCONNECTED state, -EINVAL for a misaligned
 * count, or the non-positive result of snd_pcm_lib_write().
 */
static ssize_t snd_pcm_write(struct file *file, const char __user *buf,
                             size_t count, loff_t * offset)
{
        struct snd_pcm_file *pcm_file = file->private_data;
        struct snd_pcm_substream *substream = pcm_file->substream;
        struct snd_pcm_runtime *runtime;
        snd_pcm_sframes_t done;

        if (PCM_RUNTIME_CHECK(substream))
                return -ENXIO;

        runtime = substream->runtime;
        if (runtime->state == SNDRV_PCM_STATE_OPEN ||
            runtime->state == SNDRV_PCM_STATE_DISCONNECTED)
                return -EBADFD;
        if (!frame_aligned(runtime, count))
                return -EINVAL;

        done = snd_pcm_lib_write(substream, buf,
                                 bytes_to_frames(runtime, count));
        if (done <= 0)
                return done;
        return frames_to_bytes(runtime, done);
}

static ssize_t snd_pcm_readv(struct kiocb *iocb, struct iov_iter *to)
{
        struct snd_pcm_file *pcm_file;
        struct snd_pcm_substream *substream;
        struct snd_pcm_runtime *runtime;
        snd_pcm_sframes_t result;
        unsigned long i;
        void __user **bufs __free(kfree) = NULL;
        snd_pcm_uframes_t frames;
        const struct iovec *iov = iter_iov(to);

        pcm_file = iocb->ki_filp->private_data;
        substream = pcm_file->substream;
        if (PCM_RUNTIME_CHECK(substream))
                return -ENXIO;
        runtime = substream->runtime;
        if (runtime->state == SNDRV_PCM_STATE_OPEN ||
            runtime->state == SNDRV_PCM_STATE_DISCONNECTED)
                return -EBADFD;
        if (!user_backed_iter(to))
                return -EINVAL;
        if (to->nr_segs > 1024 || to->nr_segs != runtime->channels)
                return -EINVAL;
        if (!frame_aligned(runtime, iov->iov_len))
                return -EINVAL;
        frames = bytes_to_samples(runtime, iov->iov_len);
        bufs = kmalloc_array(to->nr_segs, sizeof(void *), GFP_KERNEL);
        if (bufs == NULL)
                return -ENOMEM;
        for (i = 0; i < to->nr_segs; ++i) {
                bufs[i] = iov->iov_base;
                iov++;
        }
        result = snd_pcm_lib_readv(substream, bufs, frames);
        if (result > 0)
                result = frames_to_bytes(runtime, result);
        return result;
}

static ssize_t snd_pcm_writev(struct kiocb *iocb, struct iov_iter *from)
{
        struct snd_pcm_file *pcm_file;
        struct snd_pcm_substream *substream;
        struct snd_pcm_runtime *runtime;
        snd_pcm_sframes_t result;
        unsigned long i;
        void __user **bufs __free(kfree) = NULL;
        snd_pcm_uframes_t frames;
        const struct iovec *iov = iter_iov(from);

        pcm_file = iocb->ki_filp->private_data;
        substream = pcm_file->substream;
        if (PCM_RUNTIME_CHECK(substream))
                return -ENXIO;
        runtime = substream->runtime;
        if (runtime->state == SNDRV_PCM_STATE_OPEN ||
            runtime->state == SNDRV_PCM_STATE_DISCONNECTED)
                return -EBADFD;
        if (!user_backed_iter(from))
                return -EINVAL;
        if (from->nr_segs > 128 || from->nr_segs != runtime->channels ||
            !frame_aligned(runtime, iov->iov_len))
                return -EINVAL;
        frames = bytes_to_samples(runtime, iov->iov_len);
        bufs = kmalloc_array(from->nr_segs, sizeof(void *), GFP_KERNEL);
        if (bufs == NULL)
                return -ENOMEM;
        for (i = 0; i < from->nr_segs; ++i) {
                bufs[i] = iov->iov_base;
                iov++;
        }
        result = snd_pcm_lib_writev(substream, bufs, frames);
        if (result > 0)
                result = frames_to_bytes(runtime, result);
        return result;
}

/*
 * poll() handler of the PCM file.
 *
 * The "ready" event set is EPOLLOUT | EPOLLWRNORM for a playback stream and
 * EPOLLIN | EPOLLRDNORM otherwise.  A failed runtime check or a
 * DISCONNECTED stream reports ready plus EPOLLERR right away.  Otherwise
 * the caller is registered on the runtime sleep queue and the result is
 * derived, under the stream lock guard, from the state and snd_pcm_avail():
 *  - RUNNING / PREPARED / PAUSED: ready once avail reaches avail_min
 *  - DRAINING: capture streams are ready, with EPOLLERR added when nothing
 *    is available; other streams report nothing
 *  - any other state: ready plus EPOLLERR
 */
static __poll_t snd_pcm_poll(struct file *file, poll_table *wait)
{
        struct snd_pcm_file *pcm_file = file->private_data;
        struct snd_pcm_substream *substream = pcm_file->substream;
        struct snd_pcm_runtime *runtime;
        snd_pcm_uframes_t avail;
        __poll_t ready;
        __poll_t mask = 0;

        if (substream->stream != SNDRV_PCM_STREAM_PLAYBACK)
                ready = EPOLLIN | EPOLLRDNORM;
        else
                ready = EPOLLOUT | EPOLLWRNORM;

        if (PCM_RUNTIME_CHECK(substream))
                return ready | EPOLLERR;
        runtime = substream->runtime;
        if (runtime->state == SNDRV_PCM_STATE_DISCONNECTED)
                return ready | EPOLLERR;

        poll_wait(file, &runtime->sleep, wait);

        guard(pcm_stream_lock_irq)(substream);
        avail = snd_pcm_avail(substream);
        switch (runtime->state) {
        case SNDRV_PCM_STATE_RUNNING:
        case SNDRV_PCM_STATE_PREPARED:
        case SNDRV_PCM_STATE_PAUSED:
                if (avail >= runtime->control->avail_min)
                        mask = ready;
                break;
        case SNDRV_PCM_STATE_DRAINING:
                if (substream->stream == SNDRV_PCM_STREAM_CAPTURE)
                        mask = avail ? ready : (ready | EPOLLERR);
                break;
        default:
                mask = ready | EPOLLERR;
                break;
        }
        return mask;
}

/*
 * mmap support
 */

/*
 * Only on coherent architectures, we can mmap the status and the control records
 * for efficient data transfer.  On others, we have to use HWSYNC ioctl...
 */
#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_ALPHA)
/*
 * mmap status record
 */
/*
 * Fault handler of the status record mapping: hands out the page that
 * holds runtime->status, with an extra page reference taken.
 *
 * Return: 0 on success, VM_FAULT_SIGBUS if the VMA carries no substream.
 */
static vm_fault_t snd_pcm_mmap_status_fault(struct vm_fault *vmf)
{
        struct snd_pcm_substream *substream = vmf->vma->vm_private_data;
        struct page *page;

        if (!substream)
                return VM_FAULT_SIGBUS;

        page = virt_to_page(substream->runtime->status);
        get_page(page);
        vmf->page = page;
        return 0;
}

/* VMA operations of the status record mapping: only a fault handler is set. */
static const struct vm_operations_struct snd_pcm_vm_ops_status =
{
        .fault =        snd_pcm_mmap_status_fault,
};

/*
 * Set up a mapping of the status record.
 *
 * The VMA must be readable and span exactly the page-aligned size of
 * struct snd_pcm_mmap_status.  On success the status vm_ops are installed,
 * the substream is stored as private data, VM_DONTEXPAND | VM_DONTDUMP are
 * set and VM_WRITE | VM_MAYWRITE are passed as the flags to clear.
 *
 * Return: 0 on success, -EINVAL for an unsuitable VMA.
 */
static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file *file,
                               struct vm_area_struct *area)
{
        unsigned long len = area->vm_end - area->vm_start;

        if (!(area->vm_flags & VM_READ) ||
            len != PAGE_ALIGN(sizeof(struct snd_pcm_mmap_status)))
                return -EINVAL;

        area->vm_private_data = substream;
        area->vm_ops = &snd_pcm_vm_ops_status;
        vm_flags_mod(area, VM_DONTEXPAND | VM_DONTDUMP,
                     VM_WRITE | VM_MAYWRITE);
        return 0;
}

/*
 * mmap control record
 */
/*
 * Fault handler of the control record mapping: hands out the page that
 * holds runtime->control, with an extra page reference taken.
 *
 * Return: 0 on success, VM_FAULT_SIGBUS if the VMA carries no substream.
 */
static vm_fault_t snd_pcm_mmap_control_fault(struct vm_fault *vmf)
{
        struct snd_pcm_substream *substream = vmf->vma->vm_private_data;
        struct page *page;

        if (!substream)
                return VM_FAULT_SIGBUS;

        page = virt_to_page(substream->runtime->control);
        get_page(page);
        vmf->page = page;
        return 0;
}

/* VMA operations of the control record mapping: only a fault handler is set. */
static const struct vm_operations_struct snd_pcm_vm_ops_control =
{
        .fault =        snd_pcm_mmap_control_fault,
};

/*
 * Set up a mapping of the control record.
 *
 * The VMA must be readable and span exactly the page-aligned size of
 * struct snd_pcm_mmap_control.  On success the control vm_ops are
 * installed, the substream is stored as private data and
 * VM_DONTEXPAND | VM_DONTDUMP are set.
 *
 * Return: 0 on success, -EINVAL for an unsuitable VMA.
 */
static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file *file,
                                struct vm_area_struct *area)
{
        unsigned long len = area->vm_end - area->vm_start;

        if (!(area->vm_flags & VM_READ) ||
            len != PAGE_ALIGN(sizeof(struct snd_pcm_mmap_control)))
                return -EINVAL;

        area->vm_private_data = substream;
        area->vm_ops = &snd_pcm_vm_ops_control;
        vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP);
        return 0;
}

static bool pcm_status_mmap_allowed(struct snd_pcm_file *pcm_file)
{
        /* If drivers require the explicit sync (typically for non-coherent
         * pages), we have to disable the mmap of status and control data
         * to enforce the control via SYNC_PTR ioctl.
         */
        if (pcm_file->substream->runtime->hw.info & SNDRV_PCM_INFO_EXPLICIT_SYNC)
                return false;
        /* See pcm_control_mmap_allowed() below.
         * Since older alsa-lib requires both status and control mmaps to be
         * coupled, we have to disable the status mmap for old alsa-lib, too.
         */
        if (pcm_file->user_pversion < SNDRV_PROTOCOL_VERSION(2, 0, 14) &&
            (pcm_file->substream->runtime->hw.info & SNDRV_PCM_INFO_SYNC_APPLPTR))
                return false;
        return true;
}

static bool pcm_control_mmap_allowed(struct snd_pcm_file *pcm_file)
{
        if (pcm_file->no_compat_mmap)
                return false;
        /* see above */
        if (pcm_file->substream->runtime->hw.info & SNDRV_PCM_INFO_EXPLICIT_SYNC)
                return false;
        /* Disallow the control mmap when SYNC_APPLPTR flag is set;
         * it enforces the user-space to fall back to snd_pcm_sync_ptr(),
         * thus it effectively assures the manual update of appl_ptr.
         */
        if (pcm_file->substream->runtime->hw.info & SNDRV_PCM_INFO_SYNC_APPLPTR)
                return false;
        return true;
}

#else /* ! coherent mmap */
/*
 * don't support mmap for status and control records.
 */
/* Without coherent mmap support neither record may ever be mapped. */
#define pcm_status_mmap_allowed(pcm_file)        false
#define pcm_control_mmap_allowed(pcm_file)        false

/* Stub for the non-coherent build: the status mapping always fails with -ENXIO. */
static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file *file,
                               struct vm_area_struct *area)
{
        return -ENXIO;
}
/* Stub for the non-coherent build: the control mapping always fails with -ENXIO. */
static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file *file,
                                struct vm_area_struct *area)
{
        return -ENXIO;
}
#endif /* coherent mmap */

/*
 * fault callback for mmapping a RAM page
 */
/*
 * Fault handler of the PCM data mapping.
 *
 * The faulting page offset must lie within the page-aligned DMA buffer
 * size.  The page comes from the driver's page callback when there is one,
 * otherwise from snd_sgbuf_get_page() when the substream has a DMA buffer
 * object, otherwise from the linear runtime->dma_area.  A page reference
 * is taken before the page is handed out.
 *
 * Return: 0 on success, VM_FAULT_SIGBUS if the VMA carries no substream,
 * the offset is out of range, or no page could be found.
 */
static vm_fault_t snd_pcm_mmap_data_fault(struct vm_fault *vmf)
{
        struct snd_pcm_substream *substream = vmf->vma->vm_private_data;
        struct snd_pcm_runtime *runtime;
        unsigned long offset;
        struct page *page;

        if (!substream)
                return VM_FAULT_SIGBUS;

        runtime = substream->runtime;
        offset = vmf->pgoff << PAGE_SHIFT;
        if (offset > PAGE_ALIGN(runtime->dma_bytes) - PAGE_SIZE)
                return VM_FAULT_SIGBUS;

        if (substream->ops->page)
                page = substream->ops->page(substream, offset);
        else if (snd_pcm_get_dma_buf(substream))
                page = snd_sgbuf_get_page(snd_pcm_get_dma_buf(substream), offset);
        else
                page = virt_to_page(runtime->dma_area + offset);

        if (!page)
                return VM_FAULT_SIGBUS;
        get_page(page);
        vmf->page = page;
        return 0;
}

/* VMA operations of a PCM data mapping that needs no fault handler. */
static const struct vm_operations_struct snd_pcm_vm_ops_data = {
        .open =                snd_pcm_mmap_data_open,
        .close =        snd_pcm_mmap_data_close,
};

/* VMA operations of a PCM data mapping whose pages are supplied on fault. */
static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = {
        .open =                snd_pcm_mmap_data_open,
        .close =        snd_pcm_mmap_data_close,
        .fault =        snd_pcm_mmap_data_fault,
};

/*
 * mmap the DMA buffer on RAM
 */

/**
 * snd_pcm_lib_default_mmap - Default PCM data mmap function
 * @substream: PCM substream
 * @area: VMA
 *
 * This is the default mmap handler for PCM data.  When mmap pcm_ops is NULL,
 * this function is invoked implicitly.
 *
 * Return: zero if successful, or a negative error code
 */
int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream,
                             struct vm_area_struct *area)
{
        vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP);
        if (!substream->ops->page &&
            !snd_dma_buffer_mmap(snd_pcm_get_dma_buf(substream), area))
                return 0;
        /* mmap with fault handler */
        area->vm_ops = &snd_pcm_vm_ops_data_fault;
        return 0;
}
EXPORT_SYMBOL_GPL(snd_pcm_lib_default_mmap);

/*
 * mmap the DMA buffer on I/O memory area
 */
#if SNDRV_PCM_INFO_MMAP_IOMEM
/**
 * snd_pcm_lib_mmap_iomem - Default PCM data mmap function for I/O mem
 * @substream: PCM substream
 * @area: VMA
 *
 * When your hardware uses the iomapped pages as the hardware buffer and
 * wants to mmap it, pass this function as mmap pcm_ops.  Note that this
 * is supposed to work only on limited architectures.
 *
 * The VMA is switched to non-cached page protection before the runtime's
 * DMA address range is mapped with vm_iomap_memory().
 *
 * Return: zero if successful, or a negative error code
 */
int snd_pcm_lib_mmap_iomem(struct snd_pcm_substream *substream,
                           struct vm_area_struct *area)
{
        area->vm_page_prot = pgprot_noncached(area->vm_page_prot);

        return vm_iomap_memory(area, substream->runtime->dma_addr,
                               substream->runtime->dma_bytes);
}
EXPORT_SYMBOL(snd_pcm_lib_mmap_iomem);
#endif /* SNDRV_PCM_INFO_MMAP_IOMEM */

/*
 * mmap DMA buffer
 */
int snd_pcm_mmap_data(struct snd_pcm_substream *substream, struct file *file,
                      struct vm_area_struct *area)
{
        struct snd_pcm_runtime *runtime;
        long size;
        unsigned long offset;
        size_t dma_bytes;
        int err;

        if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) {
                if (!(area->vm_flags & (VM_WRITE|VM_READ)))
                        return -EINVAL;
        } else {
                if (!(area->vm_flags & VM_READ))
                        return -EINVAL;
        }
        runtime = substream->runtime;
        if (runtime->state == SNDRV_PCM_STATE_OPEN)
                return -EBADFD;
        if (!(runtime->info & SNDRV_PCM_INFO_MMAP))
                return -ENXIO;
        if (runtime->access == SNDRV_PCM_ACCESS_RW_INTERLEAVED ||
            runtime->access == SNDRV_PCM_ACCESS_RW_NONINTERLEAVED)
                return -EINVAL;
        size = area->vm_end - area->vm_start;
        offset = area->vm_pgoff << PAGE_SHIFT;
        dma_bytes = PAGE_ALIGN(runtime->dma_bytes);
        if ((size_t)size > dma_bytes)
                return -EINVAL;
        if (offset > dma_bytes - size)
                return -EINVAL;

        area->vm_ops = &snd_pcm_vm_ops_data;
        area->vm_private_data = substream;
        if (substream->ops->mmap)
                err = substream->ops->mmap(substream, area);
        else
                err = snd_pcm_lib_default_mmap(substream, area);
        if (!err)
                atomic_inc(&substream->mmap_count);
        return err;
}
EXPORT_SYMBOL(snd_pcm_mmap_data);

static int snd_pcm_mmap(struct file *file, struct vm_area_struct *area)
{
        struct snd_pcm_file * pcm_file;
        struct snd_pcm_substream *substream;        
        unsigned long offset;
        
        pcm_file = file->private_data;
        substream = pcm_file->substream;
        if (PCM_RUNTIME_CHECK(substream))
                return -ENXIO;
        if (substream->runtime->state == SNDRV_PCM_STATE_DISCONNECTED)
                return -EBADFD;

        offset = area->vm_pgoff << PAGE_SHIFT;
        switch (offset) {
        case SNDRV_PCM_MMAP_OFFSET_STATUS_OLD:
                if (pcm_file->no_compat_mmap || !IS_ENABLED(CONFIG_64BIT))
                        return -ENXIO;
                fallthrough;
        case SNDRV_PCM_MMAP_OFFSET_STATUS_NEW:
                if (!pcm_status_mmap_allowed(pcm_file))
                        return -ENXIO;
                return snd_pcm_mmap_status(substream, file, area);
        case SNDRV_PCM_MMAP_OFFSET_CONTROL_OLD:
                if (pcm_file->no_compat_mmap || !IS_ENABLED(CONFIG_64BIT))
                        return -ENXIO;
                fallthrough;
        case SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW:
                if (!pcm_control_mmap_allowed(pcm_file))
                        return -ENXIO;
                return snd_pcm_mmap_control(substream, file, area);
        default:
                return snd_pcm_mmap_data(substream, file, area);
        }
        return 0;
}

/*
 * fasync handler of the PCM file: forwards to snd_fasync_helper() with the
 * runtime's fasync member.
 *
 * Return: the helper's result, -ENXIO if the runtime check fails, or
 * -EBADFD if the stream is disconnected.
 */
static int snd_pcm_fasync(int fd, struct file * file, int on)
{
        struct snd_pcm_file *pcm_file = file->private_data;
        struct snd_pcm_substream *substream = pcm_file->substream;

        if (PCM_RUNTIME_CHECK(substream))
                return -ENXIO;
        if (substream->runtime->state == SNDRV_PCM_STATE_DISCONNECTED)
                return -EBADFD;

        return snd_fasync_helper(fd, file, on, &substream->runtime->fasync);
}

/*
 * ioctl32 compat
 */
#ifdef CONFIG_COMPAT
#include "pcm_compat.c"
#else
#define snd_pcm_ioctl_compat        NULL
#endif

/*
 *  Helpers kept only for binary compatibility with the old API;
 *  they are meant to be removed eventually.
 */

#ifdef CONFIG_SND_SUPPORT_OLD_API
/*
 * Convert an rmask/cmask value between the old and the new hw_params bit
 * layout.  Bits 0-2 keep their position in both layouts; the remaining bits
 * sit 5 positions higher in the new layout than in the old one.
 *
 * The argument is fully parenthesized so that an expression argument
 * (e.g. "a | b") is masked as a whole instead of being split up by the
 * higher precedence of '&'.  Note that the argument is still evaluated
 * twice, so it must not have side effects.
 */
#define __OLD_TO_NEW_MASK(x) (((x) & 7) | (((x) & 0x07fffff8) << 5))
#define __NEW_TO_OLD_MASK(x) (((x) & 7) | (((x) & 0xffffff00) >> 5))

/*
 * Fill a new-style hw_params structure from an old-style one.
 *
 * @params is zeroed first.  Each old mask word lands in bits[0] of the
 * corresponding new mask, the intervals are copied verbatim, rmask/cmask are
 * remapped with __OLD_TO_NEW_MASK() and the remaining scalar fields are
 * copied one to one.
 */
static void snd_pcm_hw_convert_from_old_params(struct snd_pcm_hw_params *params,
                                               struct snd_pcm_hw_params_old *oparams)
{
        unsigned int idx = 0;

        memset(params, 0, sizeof(*params));

        /* Scalar fields that carry over unchanged. */
        params->flags = oparams->flags;
        params->info = oparams->info;
        params->msbits = oparams->msbits;
        params->rate_num = oparams->rate_num;
        params->rate_den = oparams->rate_den;
        params->fifo_size = oparams->fifo_size;

        /* Request/change masks use a different bit layout. */
        params->rmask = __OLD_TO_NEW_MASK(oparams->rmask);
        params->cmask = __OLD_TO_NEW_MASK(oparams->cmask);

        /* One old mask word per parameter goes into the first new word. */
        while (idx < ARRAY_SIZE(oparams->masks)) {
                params->masks[idx].bits[0] = oparams->masks[idx];
                idx++;
        }

        memcpy(params->intervals, oparams->intervals, sizeof(oparams->intervals));
}

/*
 * Fill an old-style hw_params structure from a new-style one.
 *
 * @oparams is zeroed first.  Only bits[0] of each new mask is kept, the
 * intervals are copied verbatim (as many bytes as the old array holds),
 * rmask/cmask are remapped with __NEW_TO_OLD_MASK() and the remaining scalar
 * fields are copied one to one.
 */
static void snd_pcm_hw_convert_to_old_params(struct snd_pcm_hw_params_old *oparams,
                                             struct snd_pcm_hw_params *params)
{
        unsigned int idx = 0;

        memset(oparams, 0, sizeof(*oparams));

        /* Scalar fields that carry over unchanged. */
        oparams->flags = params->flags;
        oparams->info = params->info;
        oparams->msbits = params->msbits;
        oparams->rate_num = params->rate_num;
        oparams->rate_den = params->rate_den;
        oparams->fifo_size = params->fifo_size;

        /* Request/change masks use a different bit layout. */
        oparams->rmask = __NEW_TO_OLD_MASK(params->rmask);
        oparams->cmask = __NEW_TO_OLD_MASK(params->cmask);

        /* The old layout only has room for the first word of each mask. */
        while (idx < ARRAY_SIZE(oparams->masks)) {
                oparams->masks[idx] = params->masks[idx].bits[0];
                idx++;
        }

        memcpy(oparams->intervals, params->intervals, sizeof(oparams->intervals));
}

/*
 * Old-API variant of hw_refine operating on a user-space buffer.
 *
 * Copies the old-style parameters in from @_oparams, converts them to the
 * new layout, runs snd_pcm_hw_refine() followed by
 * fixup_unreferenced_params(), converts the result back and copies it out to
 * the same user buffer.
 *
 * Returns 0 on success, -ENOMEM if the temporary buffer cannot be allocated,
 * the memdup_user() error if the copy-in fails, a negative error from either
 * refine step, or -EFAULT if the copy-out fails.  Both temporary buffers are
 * released automatically through their __free(kfree) annotation.
 */
static int snd_pcm_hw_refine_old_user(struct snd_pcm_substream *substream,
                                      struct snd_pcm_hw_params_old __user * _oparams)
{
        struct snd_pcm_hw_params *params __free(kfree) = NULL;
        struct snd_pcm_hw_params_old *oparams __free(kfree) = NULL;
        int err;

        params = kmalloc(sizeof(*params), GFP_KERNEL);
        if (!params)
                return -ENOMEM;

        oparams = memdup_user(_oparams, sizeof(*oparams));
        if (IS_ERR(oparams))
                return PTR_ERR(no_free_ptr(oparams));

        snd_pcm_hw_convert_from_old_params(params, oparams);

        /* The fixup only runs when the refine itself succeeded. */
        err = snd_pcm_hw_refine(substream, params);
        if (err >= 0)
                err = fixup_unreferenced_params(substream, params);
        if (err < 0)
                return err;

        snd_pcm_hw_convert_to_old_params(oparams, params);

        return copy_to_user(_oparams, oparams, sizeof(*oparams)) ? -EFAULT : 0;
}

/*
 * Old-API variant of hw_params operating on a user-space buffer.
 *
 * Copies the old-style parameters in from @_oparams, converts them to the
 * new layout, applies them with snd_pcm_hw_params(), converts the result
 * back and copies it out to the same user buffer.
 *
 * Returns 0 on success, -ENOMEM if the temporary buffer cannot be allocated,
 * the memdup_user() error if the copy-in fails, a negative error from
 * snd_pcm_hw_params(), or -EFAULT if the copy-out fails.  Both temporary
 * buffers are released automatically through their __free(kfree) annotation.
 */
static int snd_pcm_hw_params_old_user(struct snd_pcm_substream *substream,
                                      struct snd_pcm_hw_params_old __user * _oparams)
{
        struct snd_pcm_hw_params *params __free(kfree) = NULL;
        struct snd_pcm_hw_params_old *oparams __free(kfree) = NULL;
        int err;

        params = kmalloc(sizeof(*params), GFP_KERNEL);
        if (!params)
                return -ENOMEM;

        oparams = memdup_user(_oparams, sizeof(*oparams));
        if (IS_ERR(oparams))
                return PTR_ERR(no_free_ptr(oparams));

        snd_pcm_hw_convert_from_old_params(params, oparams);

        err = snd_pcm_hw_params(substream, params);
        if (err < 0)
                return err;

        snd_pcm_hw_convert_to_old_params(oparams, params);

        return copy_to_user(_oparams, oparams, sizeof(*oparams)) ? -EFAULT : 0;
}
#endif /* CONFIG_SND_SUPPORT_OLD_API */

#ifndef CONFIG_MMU
/*
 * get_unmapped_area file operation, only built when CONFIG_MMU is not set.
 *
 * Returns the address of the runtime's status record or control record for
 * the respective *_NEW mmap offsets; any other offset is added to the
 * address of the runtime's dma_area.  @addr, @len and @flags are not used.
 */
static unsigned long snd_pcm_get_unmapped_area(struct file *file,
                                               unsigned long addr,
                                               unsigned long len,
                                               unsigned long pgoff,
                                               unsigned long flags)
{
        struct snd_pcm_file *pcm_file = file->private_data;
        struct snd_pcm_runtime *rt = pcm_file->substream->runtime;
        unsigned long offset = pgoff << PAGE_SHIFT;

        if (offset == SNDRV_PCM_MMAP_OFFSET_STATUS_NEW)
                return (unsigned long)rt->status;

        if (offset == SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW)
                return (unsigned long)rt->control;

        return (unsigned long)rt->dma_area + offset;
}
#else
/* With an MMU no handler is installed in the file operations. */
# define snd_pcm_get_unmapped_area NULL
#endif

/*
 *  Register section
 */

/*
 * File operations for the PCM device files.  Entry 0 carries the playback
 * open/write handlers and entry 1 the capture open/read handlers; all other
 * handlers are identical in both entries.
 */
const struct file_operations snd_pcm_f_ops[2] = {
        {
                .owner = THIS_MODULE,

                /* playback-specific entry points */
                .open = snd_pcm_playback_open,
                .write = snd_pcm_write,
                .write_iter = snd_pcm_writev,

                /* handlers shared with the capture entry */
                .release = snd_pcm_release,
                .llseek = no_llseek,
                .poll = snd_pcm_poll,
                .fasync = snd_pcm_fasync,
                .unlocked_ioctl = snd_pcm_ioctl,
                .compat_ioctl = snd_pcm_ioctl_compat,
                .mmap = snd_pcm_mmap,
                .get_unmapped_area = snd_pcm_get_unmapped_area,
        },
        {
                .owner = THIS_MODULE,

                /* capture-specific entry points */
                .open = snd_pcm_capture_open,
                .read = snd_pcm_read,
                .read_iter = snd_pcm_readv,

                /* handlers shared with the playback entry */
                .release = snd_pcm_release,
                .llseek = no_llseek,
                .poll = snd_pcm_poll,
                .fasync = snd_pcm_fasync,
                .unlocked_ioctl = snd_pcm_ioctl,
                .compat_ioctl = snd_pcm_ioctl_compat,
                .mmap = snd_pcm_mmap,
                .get_unmapped_area = snd_pcm_get_unmapped_area,
        }
};

















   53 






   54 


   55 







































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/*
 * Stack trace management functions
 *
 *  Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 */
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
#include <linux/stacktrace.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>

/*
 * Walk a stack with the unwinder and hand every return address to
 * @consume_entry.
 *
 * When @regs is given, regs->ip is reported first.  The walk ends when the
 * unwinder is done, when a frame yields a zero return address, or as soon as
 * @consume_entry returns false.
 */
void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
                     struct task_struct *task, struct pt_regs *regs)
{
        struct unwind_state state;

        if (regs && !consume_entry(cookie, regs->ip))
                return;

        unwind_start(&state, task, regs, NULL);
        while (!unwind_done(&state)) {
                unsigned long addr = unwind_get_return_address(&state);

                if (!addr)
                        break;
                if (!consume_entry(cookie, addr))
                        break;

                unwind_next_frame(&state);
        }
}

/*
 * Walk @task's stack and report every return address to @consume_entry,
 * failing whenever the trace cannot be trusted.
 *
 * Returns 0 when the walk reaches user-mode entry registers or finishes
 * without an unwinder error.  Returns -EINVAL when kernel-mode registers are
 * found on the stack with CONFIG_FRAME_POINTER enabled, when a frame has no
 * return address, when @consume_entry refuses an entry, or when the unwinder
 * reports an error.
 */
int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
                             void *cookie, struct task_struct *task)
{
        struct unwind_state state;

        unwind_start(&state, task, NULL, NULL);
        while (!unwind_done(&state) && !unwind_error(&state)) {
                struct pt_regs *regs = unwind_get_entry_regs(&state, NULL);
                unsigned long addr;

                if (regs) {
                        /* Reaching user-mode regs is the success path. */
                        if (user_mode(regs))
                                return 0;

                        /*
                         * Kernel-mode regs on the stack mean an in-kernel
                         * interrupt or exception (e.g. preemption or a page
                         * fault), which can make frame pointers unreliable.
                         */
                        if (IS_ENABLED(CONFIG_FRAME_POINTER))
                                return -EINVAL;
                }

                /*
                 * A NULL or invalid return address probably means generated
                 * code that __kernel_text_address() does not know about.
                 */
                addr = unwind_get_return_address(&state);
                if (!addr || !consume_entry(cookie, addr))
                        return -EINVAL;

                unwind_next_frame(&state);
        }

        /* An unwinder error at this point indicates stack corruption. */
        return unwind_error(&state) ? -EINVAL : 0;
}

/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */

/*
 * One frame record as read from a user-space stack by copy_stack_frame():
 * the saved pointer to the next frame followed by the return address.
 */
struct stack_frame_user {
        const void __user        *next_fp;	/* user pointer to the next frame record */
        unsigned long                ret_addr;	/* return address stored in this frame */
};

/*
 * Read one frame record from user memory at @fp into @frame.
 *
 * Returns 1 on success and 0 if @fp fails __access_ok() or either field
 * cannot be read.  The reads are done with page faults disabled.
 */
static int
copy_stack_frame(const struct stack_frame_user __user *fp,
                 struct stack_frame_user *frame)
{
        int ok = 1;

        if (!__access_ok(fp, sizeof(*frame)))
                return 0;

        pagefault_disable();
        if (__get_user(frame->next_fp, &fp->next_fp))
                ok = 0;
        else if (__get_user(frame->ret_addr, &fp->ret_addr))
                ok = 0;
        pagefault_enable();

        return ok;
}

/*
 * Walk a user-space stack by following the frame-pointer chain that starts
 * at regs->bp, reporting regs->ip first and then each frame's return
 * address to @consume_entry.
 *
 * The walk stops when a frame cannot be read, when the frame pointer lies
 * below regs->sp, when a frame holds a zero return address, or when
 * @consume_entry returns false.
 */
void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
                          const struct pt_regs *regs)
{
        const void __user *fp = (const void __user *)regs->bp;

        if (!consume_entry(cookie, regs->ip))
                return;

        for (;;) {
                struct stack_frame_user frame = {
                        .next_fp = NULL,
                        .ret_addr = 0,
                };

                /* The checks run in this order and stop at the first hit. */
                if (!copy_stack_frame(fp, &frame) ||
                    (unsigned long)fp < regs->sp ||
                    !frame.ret_addr ||
                    !consume_entry(cookie, frame.ret_addr))
                        break;

                fp = frame.next_fp;
        }
}









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef IOU_NAPI_H
#define IOU_NAPI_H

#include <linux/kernel.h>
#include <linux/io_uring.h>
#include <net/busy_poll.h>

#ifdef CONFIG_NET_RX_BUSY_POLL

/*
 * Out-of-line NAPI busy-poll interface.  These functions are only declared
 * when CONFIG_NET_RX_BUSY_POLL is set; the #else branch of this header
 * supplies inline stubs for the non-underscore entry points instead.
 */
void io_napi_init(struct io_ring_ctx *ctx);
void io_napi_free(struct io_ring_ctx *ctx);

int io_register_napi(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);

/* Called by io_napi_add() once a request's file resolved to a socket. */
void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock);

/* Called by the inline wrappers below only when io_napi(ctx) is true. */
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx,
                struct io_wait_queue *iowq, struct timespec64 *ts);
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);

/* Returns true when the ring's napi_list holds at least one entry. */
static inline bool io_napi(struct io_ring_ctx *ctx)
{
        if (list_empty(&ctx->napi_list))
                return false;

        return true;
}

/*
 * Forward to __io_napi_adjust_timeout() when io_napi() reports a non-empty
 * napi_list for @ctx; otherwise do nothing.
 */
static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
                                          struct io_wait_queue *iowq,
                                          struct timespec64 *ts)
{
        if (io_napi(ctx))
                __io_napi_adjust_timeout(ctx, iowq, ts);
}

/*
 * Forward to __io_napi_busy_loop() when io_napi() reports a non-empty
 * napi_list for @ctx; otherwise do nothing.
 */
static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
                                     struct io_wait_queue *iowq)
{
        if (io_napi(ctx))
                __io_napi_busy_loop(ctx, iowq);
}

/*
 * io_napi_add() - Add napi id to the busy poll list
 * @req: pointer to io_kiocb request
 *
 * Does nothing while the ring's napi_busy_poll_to is zero.  Otherwise the
 * request's file is resolved with sock_from_file() and, if it is a socket,
 * handed to __io_napi_add() for the request's ring.
 */
static inline void io_napi_add(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;

        if (READ_ONCE(ctx->napi_busy_poll_to)) {
                struct socket *sock = sock_from_file(req->file);

                if (sock)
                        __io_napi_add(ctx, sock);
        }
}

#else

/*
 * Stubs used when CONFIG_NET_RX_BUSY_POLL is not set: the register and
 * unregister calls report -EOPNOTSUPP, io_napi() is always false,
 * io_napi_sqpoll_busy_poll() returns 0 and everything else is a no-op.
 */
static inline void io_napi_init(struct io_ring_ctx *ctx) { }

static inline void io_napi_free(struct io_ring_ctx *ctx) { }

static inline int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
        return -EOPNOTSUPP;
}

static inline int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
{
        return -EOPNOTSUPP;
}

static inline bool io_napi(struct io_ring_ctx *ctx)
{
        return false;
}

static inline void io_napi_add(struct io_kiocb *req) { }

static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
                                          struct io_wait_queue *iowq,
                                          struct timespec64 *ts) { }

static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
                                     struct io_wait_queue *iowq) { }

static inline int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
{
        return 0;
}
#endif /* CONFIG_NET_RX_BUSY_POLL */

#endif
































































    1 






























































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM sched

#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SCHED_H

#include <linux/kthread.h>
#include <linux/sched/numa_balancing.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>

/*
 * Tracepoint for calling kthread_stop, performed to end a kthread.
 *
 * Records the command name and pid of the task @t and prints them as
 * "comm=... pid=...".
 */
TRACE_EVENT(sched_kthread_stop,

        TP_PROTO(struct task_struct *t),

        TP_ARGS(t),

        TP_STRUCT__entry(
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
                __entry->pid        = t->pid;
        ),

        TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid)
);

/*
 * Tracepoint for the return value of the kthread stopping.
 *
 * Records the single integer @ret and prints it as "ret=...".
 */
TRACE_EVENT(sched_kthread_stop_ret,

        TP_PROTO(int ret),

        TP_ARGS(ret),

        TP_STRUCT__entry(
                __field(        int,        ret        )
        ),

        TP_fast_assign(
                __entry->ret        = ret;
        ),

        TP_printk("ret=%d", __entry->ret)
);

/**
 * sched_kthread_work_queue_work - called when a work gets queued
 * @worker:        pointer to the kthread_worker
 * @work:        pointer to struct kthread_work
 *
 * This event occurs when a work is queued immediately or once a
 * delayed work is actually queued (i.e. once the delay has been
 * reached).
 *
 * The record stores the @work pointer, its callback (work->func) and the
 * @worker pointer; the callback is printed symbolically with %ps.
 */
TRACE_EVENT(sched_kthread_work_queue_work,

        TP_PROTO(struct kthread_worker *worker,
                 struct kthread_work *work),

        TP_ARGS(worker, work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
                __field( void *,        worker)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
                __entry->worker                = worker;
        ),

        TP_printk("work struct=%p function=%ps worker=%p",
                  __entry->work, __entry->function, __entry->worker)
);

/**
 * sched_kthread_work_execute_start - called immediately before the work callback
 * @work:        pointer to struct kthread_work
 *
 * Allows tracking of kthread work execution.  The record stores the @work
 * pointer and its callback, read from work->func.
 */
TRACE_EVENT(sched_kthread_work_execute_start,

        TP_PROTO(struct kthread_work *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/**
 * sched_kthread_work_execute_end - called immediately after the work callback
 * @work:        pointer to struct kthread_work
 * @function:   pointer to worker function
 *
 * Allows tracking of kthread work execution.  Unlike the execute_start
 * event, the callback is passed in as @function and is not read from @work;
 * @work is only stored as a pointer value.
 */
TRACE_EVENT(sched_kthread_work_execute_end,

        TP_PROTO(struct kthread_work *work, kthread_work_func_t function),

        TP_ARGS(work, function),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = function;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/*
 * Event class shared by the wakeup tracepoints (sched_waking, sched_wakeup,
 * sched_wakeup_new).
 *
 * Records the command name, pid and prio of task @p together with the CPU
 * returned by task_cpu(p), which is reported as target_cpu.
 */
DECLARE_EVENT_CLASS(sched_wakeup_template,

        TP_PROTO(struct task_struct *p),

        TP_ARGS(__perf_task(p)),

        TP_STRUCT__entry(
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        prio                        )
                __field(        int,        target_cpu                )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
                __entry->pid                = p->pid;
                __entry->prio                = p->prio; /* XXX SCHED_DEADLINE */
                __entry->target_cpu        = task_cpu(p);
        ),

        TP_printk("comm=%s pid=%d prio=%d target_cpu=%03d",
                  __entry->comm, __entry->pid, __entry->prio,
                  __entry->target_cpu)
);

/*
 * Tracepoint called when waking a task; this tracepoint is guaranteed to be
 * called from the waking context.  Uses the sched_wakeup_template record.
 */
DEFINE_EVENT(sched_wakeup_template, sched_waking,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

/*
 * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING.
 * It is not always called from the waking context.  Uses the
 * sched_wakeup_template record.
 */
DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

/*
 * Tracepoint for waking up a new task.  Uses the sched_wakeup_template
 * record.
 */
DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

#ifdef CREATE_TRACE_POINTS
/*
 * Compute the prev_state value stored by the sched_switch tracepoint.
 *
 * Returns TASK_REPORT_MAX when the switch is a preemption.  Otherwise the
 * index from __task_state_index() is turned into a one-bit mask, with
 * index 0 mapping to 0.
 */
static inline long __trace_sched_switch_state(bool preempt,
                                              unsigned int prev_state,
                                              struct task_struct *p)
{
        unsigned int idx;

#ifdef CONFIG_SCHED_DEBUG
        BUG_ON(p != current);
#endif /* CONFIG_SCHED_DEBUG */

        /*
         * Preemption ignores task state, therefore preempted tasks are always
         * RUNNING (we will not have dequeued if state != RUNNING).
         */
        if (preempt)
                return TASK_REPORT_MAX;

        /*
         * The index is 0 for TASK_RUNNING; every other index is shifted
         * down by one before being used as the bit position.
         */
        idx = __task_state_index(prev_state, p->exit_state);
        if (!idx)
                return 0;

        /* The cast keeps the original int -> unsigned int -> long conversion. */
        return (unsigned int)(1 << (idx - 1));
}
#endif /* CREATE_TRACE_POINTS */

/*
 * Tracepoint for task switches, performed by the scheduler.
 *
 * Records comm, pid and prio of both @prev and @next, plus a prev_state
 * value computed by __trace_sched_switch_state().  When printing, the bits
 * below TASK_REPORT_MAX are decoded into state letters joined by "|", or
 * "R" when none of them is set; a "+" is appended when the TASK_REPORT_MAX
 * bit itself is set, which is the value stored for a preemption.
 */
TRACE_EVENT(sched_switch,

        TP_PROTO(bool preempt,
                 struct task_struct *prev,
                 struct task_struct *next,
                 unsigned int prev_state),

        TP_ARGS(preempt, prev, next, prev_state),

        TP_STRUCT__entry(
                __array(        char,        prev_comm,        TASK_COMM_LEN        )
                __field(        pid_t,        prev_pid                        )
                __field(        int,        prev_prio                        )
                __field(        long,        prev_state                        )
                __array(        char,        next_comm,        TASK_COMM_LEN        )
                __field(        pid_t,        next_pid                        )
                __field(        int,        next_prio                        )
        ),

        TP_fast_assign(
                memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
                __entry->prev_pid        = prev->pid;
                __entry->prev_prio        = prev->prio;
                __entry->prev_state        = __trace_sched_switch_state(preempt, prev_state, prev);
                memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                __entry->next_pid        = next->pid;
                __entry->next_prio        = next->prio;
                /* XXX SCHED_DEADLINE */
        ),

        TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
                __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,

                (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
                  __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
                                { TASK_INTERRUPTIBLE, "S" },
                                { TASK_UNINTERRUPTIBLE, "D" },
                                { __TASK_STOPPED, "T" },
                                { __TASK_TRACED, "t" },
                                { EXIT_DEAD, "X" },
                                { EXIT_ZOMBIE, "Z" },
                                { TASK_PARKED, "P" },
                                { TASK_DEAD, "I" }) :
                  "R",

                __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
                __entry->next_comm, __entry->next_pid, __entry->next_prio)
);

/*
 * Tracepoint for a task being migrated.
 *
 * Records comm, pid and prio of @p, the CPU currently returned by
 * task_cpu(p) as orig_cpu, and the caller-supplied @dest_cpu.
 */
TRACE_EVENT(sched_migrate_task,

        TP_PROTO(struct task_struct *p, int dest_cpu),

        TP_ARGS(p, dest_cpu),

        TP_STRUCT__entry(
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        prio                        )
                __field(        int,        orig_cpu                )
                __field(        int,        dest_cpu                )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
                __entry->pid                = p->pid;
                __entry->prio                = p->prio; /* XXX SCHED_DEADLINE */
                __entry->orig_cpu        = task_cpu(p);
                __entry->dest_cpu        = dest_cpu;
        ),

        TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d",
                  __entry->comm, __entry->pid, __entry->prio,
                  __entry->orig_cpu, __entry->dest_cpu)
);

/*
 * Event class shared by sched_process_free, sched_process_exit and
 * sched_wait_task: records comm, pid and prio of task @p.
 */
DECLARE_EVENT_CLASS(sched_process_template,

        TP_PROTO(struct task_struct *p),

        TP_ARGS(p),

        TP_STRUCT__entry(
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        prio                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
                __entry->pid                = p->pid;
                __entry->prio                = p->prio; /* XXX SCHED_DEADLINE */
        ),

        TP_printk("comm=%s pid=%d prio=%d",
                  __entry->comm, __entry->pid, __entry->prio)
);

/*
 * Tracepoint for freeing a task.  Uses the sched_process_template record.
 */
DEFINE_EVENT(sched_process_template, sched_process_free,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

/*
 * Tracepoint for a task exiting.  Uses the sched_process_template record.
 */
DEFINE_EVENT(sched_process_template, sched_process_exit,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));

/*
 * Tracepoint for waiting on a task to unschedule.  Uses the
 * sched_process_template record.
 */
DEFINE_EVENT(sched_process_template, sched_wait_task,
        TP_PROTO(struct task_struct *p),
        TP_ARGS(p));

/*
 * Tracepoint for a waiting task.
 *
 * Note the mixed sources: comm and prio are taken from the current task,
 * while the recorded pid is pid_nr() of the @pid argument.
 */
TRACE_EVENT(sched_process_wait,

        TP_PROTO(struct pid *pid),

        TP_ARGS(pid),

        TP_STRUCT__entry(
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        prio                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
                __entry->pid                = pid_nr(pid);
                __entry->prio                = current->prio; /* XXX SCHED_DEADLINE */
        ),

        TP_printk("comm=%s pid=%d prio=%d",
                  __entry->comm, __entry->pid, __entry->prio)
);

/*
 * Tracepoint for kernel_clone.
 *
 * Records comm and pid of both @parent and @child.  In the printed output
 * the parent's fields appear as plain "comm="/"pid=" and the child's as
 * "child_comm="/"child_pid=".
 */
TRACE_EVENT(sched_process_fork,

        TP_PROTO(struct task_struct *parent, struct task_struct *child),

        TP_ARGS(parent, child),

        TP_STRUCT__entry(
                __array(        char,        parent_comm,        TASK_COMM_LEN        )
                __field(        pid_t,        parent_pid                        )
                __array(        char,        child_comm,        TASK_COMM_LEN        )
                __field(        pid_t,        child_pid                        )
        ),

        TP_fast_assign(
                memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
                __entry->parent_pid        = parent->pid;
                memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
                __entry->child_pid        = child->pid;
        ),

        TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d",
                __entry->parent_comm, __entry->parent_pid,
                __entry->child_comm, __entry->child_pid)
);

/*
 * Tracepoint for exec.
 *
 * Records bprm->filename as a dynamic string, the pid of @p and the
 * caller-supplied @old_pid.
 */
TRACE_EVENT(sched_process_exec,

        TP_PROTO(struct task_struct *p, pid_t old_pid,
                 struct linux_binprm *bprm),

        TP_ARGS(p, old_pid, bprm),

        TP_STRUCT__entry(
                __string(        filename,        bprm->filename        )
                __field(        pid_t,                pid                )
                __field(        pid_t,                old_pid                )
        ),

        TP_fast_assign(
                __assign_str(filename);
                __entry->pid                = p->pid;
                __entry->old_pid        = old_pid;
        ),

        TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename),
                  __entry->pid, __entry->old_pid)
);

/**
 * sched_prepare_exec - called before setting up new exec
 * @task:        pointer to the current task
 * @bprm:        pointer to linux_binprm used for new exec
 *
 * Called before flushing the old exec, where @task is still unchanged, but at
 * the point of no return during switching to the new exec. At the point it is
 * called the exec will either succeed, or on failure terminate the task. Also
 * see the "sched_process_exec" tracepoint, which is called right after @task
 * has successfully switched to the new exec.
 *
 * The record stores bprm->interp, bprm->filename and task->comm as dynamic
 * strings, plus task->pid.
 */
TRACE_EVENT(sched_prepare_exec,

        TP_PROTO(struct task_struct *task, struct linux_binprm *bprm),

        TP_ARGS(task, bprm),

        TP_STRUCT__entry(
                __string(        interp,                bprm->interp        )
                __string(        filename,        bprm->filename        )
                __field(        pid_t,                pid                )
                __string(        comm,                task->comm        )
        ),

        TP_fast_assign(
                __assign_str(interp);
                __assign_str(filename);
                __entry->pid = task->pid;
                __assign_str(comm);
        ),

        TP_printk("interp=%s filename=%s pid=%d comm=%s",
                  __get_str(interp), __get_str(filename),
                  __entry->pid, __get_str(comm))
);

/*
 * The *_SCHEDSTAT wrappers map to the regular event macros only when
 * CONFIG_SCHEDSTATS is set; otherwise they map to the _NOP variants.
 */
#ifdef CONFIG_SCHEDSTATS
#define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT
#define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS
#else
#define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT_NOP
#define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS_NOP
#endif

/*
 * Event class shared by the sched_stat_* delay events: each record holds the
 * task's comm and pid plus a u64 delay, printed with an "[ns]" suffix.
 *
 * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
 *     adding sched_stat support to SCHED_FIFO/RR would be welcome.
 *
 * NOTE(review): TP_ARGS wraps the arguments in __perf_task()/__perf_count(),
 * presumably so perf attributes the sample to @tsk and uses @delay as the
 * count -- confirm against the perf trace macros.
 */
DECLARE_EVENT_CLASS_SCHEDSTAT(sched_stat_template,

        TP_PROTO(struct task_struct *tsk, u64 delay),

        TP_ARGS(__perf_task(tsk), __perf_count(delay)),

        TP_STRUCT__entry(
                __array( char,        comm,        TASK_COMM_LEN        )
                __field( pid_t,        pid                        )
                __field( u64,        delay                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
                __entry->pid        = tsk->pid;
                __entry->delay        = delay;
        ),

        TP_printk("comm=%s pid=%d delay=%Lu [ns]",
                        __entry->comm, __entry->pid,
                        (unsigned long long)__entry->delay)
);

/*
 * Tracepoint for accounting wait time (time the task is runnable
 * but not actually running due to scheduler contention).
 * Instance of the sched_stat_template class.
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_wait,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting sleep time (time the task is not runnable,
 * including iowait, see below).
 * Instance of the sched_stat_template class.
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_sleep,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting iowait time (time the task is not runnable
 * due to waiting on IO to complete).
 * Instance of the sched_stat_template class.
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_iowait,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting blocked time (time the task is in an
 * uninterruptible state).
 * Instance of the sched_stat_template class.
 */
DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));

/*
 * Tracepoint for accounting runtime (time the task is executing
 * on a CPU).
 *
 * Declared with the plain DECLARE_EVENT_CLASS/DEFINE_EVENT macros rather than
 * the *_SCHEDSTAT wrappers used for the sched_stat_template events. Each
 * record holds the task's comm and pid plus the u64 runtime, printed with an
 * "[ns]" suffix.
 */
DECLARE_EVENT_CLASS(sched_stat_runtime,

        TP_PROTO(struct task_struct *tsk, u64 runtime),

        TP_ARGS(tsk, __perf_count(runtime)),

        TP_STRUCT__entry(
                __array( char,        comm,        TASK_COMM_LEN        )
                __field( pid_t,        pid                        )
                __field( u64,        runtime                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
                __entry->pid                = tsk->pid;
                __entry->runtime        = runtime;
        ),

        TP_printk("comm=%s pid=%d runtime=%Lu [ns]",
                        __entry->comm, __entry->pid,
                        (unsigned long long)__entry->runtime)
);

/* The single event of the class above; it shares the class name. */
DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
             TP_PROTO(struct task_struct *tsk, u64 runtime),
             TP_ARGS(tsk, runtime));

/*
 * Tracepoint for showing priority inheritance modifying a task's
 * priority.
 *
 * oldprio is @tsk->prio at the time of the event. newprio is computed here:
 * with a non-NULL @pi_task it is the numerically smaller of
 * @tsk->normal_prio and @pi_task->prio, otherwise it is @tsk->normal_prio.
 */
TRACE_EVENT(sched_pi_setprio,

        TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task),

        TP_ARGS(tsk, pi_task),

        TP_STRUCT__entry(
                __array( char,        comm,        TASK_COMM_LEN        )
                __field( pid_t,        pid                        )
                __field( int,        oldprio                        )
                __field( int,        newprio                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
                __entry->pid                = tsk->pid;
                __entry->oldprio        = tsk->prio;
                __entry->newprio        = pi_task ?
                                min(tsk->normal_prio, pi_task->prio) :
                                tsk->normal_prio;
                /* XXX SCHED_DEADLINE bits missing */
        ),

        TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
                        __entry->comm, __entry->pid,
                        __entry->oldprio, __entry->newprio)
);

#ifdef CONFIG_DETECT_HUNG_TASK
/*
 * Tracepoint recording the comm and pid of @tsk. Only defined when
 * CONFIG_DETECT_HUNG_TASK is enabled.
 */
TRACE_EVENT(sched_process_hang,
        TP_PROTO(struct task_struct *tsk),
        TP_ARGS(tsk),

        TP_STRUCT__entry(
                __array( char,        comm,        TASK_COMM_LEN        )
                __field( pid_t,        pid                        )
        ),

        TP_fast_assign(
                memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
                __entry->pid = tsk->pid;
        ),

        TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid)
);
#endif /* CONFIG_DETECT_HUNG_TASK */

/*
 * Tracks migration of tasks from one runqueue to another. Can be used to
 * detect if automatic NUMA balancing is bouncing between nodes.
 *
 * Records the pid, tgid and NUMA group id of @tsk, plus the source and
 * destination CPUs together with the node each maps to via cpu_to_node().
 */
TRACE_EVENT(sched_move_numa,

        TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),

        TP_ARGS(tsk, src_cpu, dst_cpu),

        TP_STRUCT__entry(
                __field( pid_t,        pid                        )
                __field( pid_t,        tgid                        )
                __field( pid_t,        ngid                        )
                __field( int,        src_cpu                        )
                __field( int,        src_nid                        )
                __field( int,        dst_cpu                        )
                __field( int,        dst_nid                        )
        ),

        TP_fast_assign(
                __entry->pid                = task_pid_nr(tsk);
                __entry->tgid                = task_tgid_nr(tsk);
                __entry->ngid                = task_numa_group_id(tsk);
                __entry->src_cpu        = src_cpu;
                __entry->src_nid        = cpu_to_node(src_cpu);
                __entry->dst_cpu        = dst_cpu;
                __entry->dst_nid        = cpu_to_node(dst_cpu);
        ),

        TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d",
                        __entry->pid, __entry->tgid, __entry->ngid,
                        __entry->src_cpu, __entry->src_nid,
                        __entry->dst_cpu, __entry->dst_nid)
);

/*
 * Event class describing a source/destination task pair and their CPUs.
 *
 * The source side is always recorded from @src_tsk and @src_cpu. The
 * destination side tolerates missing data: a NULL @dst_tsk is recorded with
 * dst_pid, dst_tgid and dst_ngid all 0, and a negative @dst_cpu is recorded
 * with dst_nid -1 instead of being passed to cpu_to_node().
 */
DECLARE_EVENT_CLASS(sched_numa_pair_template,

        TP_PROTO(struct task_struct *src_tsk, int src_cpu,
                 struct task_struct *dst_tsk, int dst_cpu),

        TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu),

        TP_STRUCT__entry(
                __field( pid_t,        src_pid                        )
                __field( pid_t,        src_tgid                )
                __field( pid_t,        src_ngid                )
                __field( int,        src_cpu                        )
                __field( int,        src_nid                        )
                __field( pid_t,        dst_pid                        )
                __field( pid_t,        dst_tgid                )
                __field( pid_t,        dst_ngid                )
                __field( int,        dst_cpu                        )
                __field( int,        dst_nid                        )
        ),

        TP_fast_assign(
                __entry->src_pid        = task_pid_nr(src_tsk);
                __entry->src_tgid        = task_tgid_nr(src_tsk);
                __entry->src_ngid        = task_numa_group_id(src_tsk);
                __entry->src_cpu        = src_cpu;
                __entry->src_nid        = cpu_to_node(src_cpu);
                __entry->dst_pid        = dst_tsk ? task_pid_nr(dst_tsk) : 0;
                __entry->dst_tgid        = dst_tsk ? task_tgid_nr(dst_tsk) : 0;
                __entry->dst_ngid        = dst_tsk ? task_numa_group_id(dst_tsk) : 0;
                __entry->dst_cpu        = dst_cpu;
                __entry->dst_nid        = dst_cpu >= 0 ? cpu_to_node(dst_cpu) : -1;
        ),

        TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d",
                        __entry->src_pid, __entry->src_tgid, __entry->src_ngid,
                        __entry->src_cpu, __entry->src_nid,
                        __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid,
                        __entry->dst_cpu, __entry->dst_nid)
);

/*
 * sched_stick_numa and sched_swap_numa are both instances of the
 * sched_numa_pair_template class, so they share its record layout and
 * print format.
 */
DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa,

        TP_PROTO(struct task_struct *src_tsk, int src_cpu,
                 struct task_struct *dst_tsk, int dst_cpu),

        TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
);

DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,

        TP_PROTO(struct task_struct *src_tsk, int src_cpu,
                 struct task_struct *dst_tsk, int dst_cpu),

        TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
);

#ifdef CONFIG_NUMA_BALANCING
/*
 * Table of skip-reason enum values and the string printed for each.
 * The list is expanded twice below with different EM()/EMe() definitions:
 * first to pass every value to TRACE_DEFINE_ENUM(), then to build the
 * { value, "name" } pairs handed to __print_symbolic(). EMe() marks the
 * last entry, which must not be followed by a comma in the second form.
 */
#define NUMAB_SKIP_REASON                                        \
        EM( NUMAB_SKIP_UNSUITABLE,                "unsuitable" )        \
        EM( NUMAB_SKIP_SHARED_RO,                "shared_ro" )        \
        EM( NUMAB_SKIP_INACCESSIBLE,                "inaccessible" )        \
        EM( NUMAB_SKIP_SCAN_DELAY,                "scan_delay" )        \
        EM( NUMAB_SKIP_PID_INACTIVE,                "pid_inactive" )        \
        EM( NUMAB_SKIP_IGNORE_PID,                "ignore_pid_inactive" )                \
        EMe(NUMAB_SKIP_SEQ_COMPLETED,                "seq_completed" )

/* Redefine for export. */
#undef EM
#undef EMe
#define EM(a, b)        TRACE_DEFINE_ENUM(a);
#define EMe(a, b)        TRACE_DEFINE_ENUM(a);

NUMAB_SKIP_REASON

/* Redefine for symbolic printing. */
#undef EM
#undef EMe
#define EM(a, b)        { a, b },
#define EMe(a, b)        { a, b }

/*
 * Tracepoint recording @mm->numa_scan_offset, the [vm_start, vm_end) bounds
 * of @vma and the skip @reason. The addresses are printed in upper-case hex
 * and the reason is printed by name using the NUMAB_SKIP_REASON table.
 */
TRACE_EVENT(sched_skip_vma_numa,

        TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma,
                 enum numa_vmaskip_reason reason),

        TP_ARGS(mm, vma, reason),

        TP_STRUCT__entry(
                __field(unsigned long, numa_scan_offset)
                __field(unsigned long, vm_start)
                __field(unsigned long, vm_end)
                __field(enum numa_vmaskip_reason, reason)
        ),

        TP_fast_assign(
                __entry->numa_scan_offset        = mm->numa_scan_offset;
                __entry->vm_start                = vma->vm_start;
                __entry->vm_end                        = vma->vm_end;
                __entry->reason                        = reason;
        ),

        TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s",
                  __entry->numa_scan_offset,
                  __entry->vm_start,
                  __entry->vm_end,
                  __print_symbolic(__entry->reason, NUMAB_SKIP_REASON))
);
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Tracepoint for waking a polling cpu without an IPI.
 * The only recorded field is the @cpu number.
 */
TRACE_EVENT(sched_wake_idle_without_ipi,

        TP_PROTO(int cpu),

        TP_ARGS(cpu),

        TP_STRUCT__entry(
                __field(        int,        cpu        )
        ),

        TP_fast_assign(
                __entry->cpu        = cpu;
        ),

        TP_printk("cpu=%d", __entry->cpu)
);

/*
 * The following tracepoints are not exported in tracefs and provide hooking
 * mechanisms only for testing and debugging purposes.
 *
 * They are postfixed with _tp to make them easily identifiable in the code.
 *
 * Each one is declared with DECLARE_TRACE(), i.e. with a prototype and
 * argument list only: there is no TP_STRUCT__entry, TP_fast_assign or
 * TP_printk, and the hooks simply receive the pointers/values listed in
 * TP_PROTO.
 */
DECLARE_TRACE(pelt_cfs_tp,
        TP_PROTO(struct cfs_rq *cfs_rq),
        TP_ARGS(cfs_rq));

DECLARE_TRACE(pelt_rt_tp,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(pelt_dl_tp,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(pelt_hw_tp,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(pelt_irq_tp,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(pelt_se_tp,
        TP_PROTO(struct sched_entity *se),
        TP_ARGS(se));

DECLARE_TRACE(sched_cpu_capacity_tp,
        TP_PROTO(struct rq *rq),
        TP_ARGS(rq));

DECLARE_TRACE(sched_overutilized_tp,
        TP_PROTO(struct root_domain *rd, bool overutilized),
        TP_ARGS(rd, overutilized));

DECLARE_TRACE(sched_util_est_cfs_tp,
        TP_PROTO(struct cfs_rq *cfs_rq),
        TP_ARGS(cfs_rq));

DECLARE_TRACE(sched_util_est_se_tp,
        TP_PROTO(struct sched_entity *se),
        TP_ARGS(se));

DECLARE_TRACE(sched_update_nr_running_tp,
        TP_PROTO(struct rq *rq, int change),
        TP_ARGS(rq, change));

DECLARE_TRACE(sched_compute_energy_tp,
        TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy,
                 unsigned long max_util, unsigned long busy_time),
        TP_ARGS(p, dst_cpu, energy, max_util, busy_time));

#endif /* _TRACE_SCHED_H */

/* This part must be outside protection */
#include <trace/define_trace.h>




























































    2 














    3 



    3 



























































































































































































    1 














    2 











    2 


































































































































































































































































































































































































































































































































































































































    1 









    1 


































    1 
    1 




    1 
    1 




    1 






    1 



    1 















































































































































































































































































































































































































































    1 















    1 

























    2 








    2 






















































    1 















    1 

























    1 






    1 




































    1 


















    1 





    1 








    1 























































































































































































































































































   18 
















   17 




   15 


















    1 
















    1 






















































    9 























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
// SPDX-License-Identifier: GPL-2.0-only
/*
 *        fs/libfs.c
 *        Library for filesystems writers.
 */

#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mount.h>
#include <linux/vfs.h>
#include <linux/quotaops.h>
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
#include <linux/iversion.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>
#include <linux/fsnotify.h>
#include <linux/unicode.h>
#include <linux/fscrypt.h>
#include <linux/pidfs.h>

#include <linux/uaccess.h>

#include "internal.h"

/**
 * simple_getattr - generic ->getattr() for simple filesystems
 * @idmap: idmap of the mount (not consulted; nop_mnt_idmap is passed on)
 * @path: path whose dentry's inode is reported
 * @stat: result structure to fill in
 * @request_mask: statx request mask, forwarded to generic_fillattr()
 * @query_flags: not used by this implementation
 *
 * Fills @stat from the inode via generic_fillattr() and then overrides
 * ->blocks with the number of pages in the inode's mapping, expressed in
 * 512-byte units.
 *
 * Always returns 0.
 */
int simple_getattr(struct mnt_idmap *idmap, const struct path *path,
                   struct kstat *stat, u32 request_mask,
                   unsigned int query_flags)
{
        struct inode *host = d_inode(path->dentry);

        generic_fillattr(&nop_mnt_idmap, request_mask, host, stat);

        /* pages -> 512-byte blocks */
        stat->blocks = host->i_mapping->nrpages << (PAGE_SHIFT - 9);

        return 0;
}
EXPORT_SYMBOL(simple_getattr);

int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        u64 id = huge_encode_dev(dentry->d_sb->s_dev);

        buf->f_fsid = u64_to_fsid(id);
        buf->f_type = dentry->d_sb->s_magic;
        buf->f_bsize = PAGE_SIZE;
        buf->f_namelen = NAME_MAX;
        return 0;
}
EXPORT_SYMBOL(simple_statfs);

/*
 * ->d_delete() helper that unconditionally returns 1.
 *
 * Retaining negative dentries for an in-memory filesystem just wastes
 * memory and lookup time, so this arranges for them to be deleted
 * immediately. @dentry is not examined.
 */
int always_delete_dentry(const struct dentry *dentry)
{
        return 1;
}
EXPORT_SYMBOL(always_delete_dentry);

/*
 * Dentry operations whose only hook is ->d_delete, set to
 * always_delete_dentry().
 */
const struct dentry_operations simple_dentry_operations = {
        .d_delete = always_delete_dentry,
};
EXPORT_SYMBOL(simple_dentry_operations);

/*
 * Lookup the data. This is trivial - if the dentry didn't already
 * exist, we know it is negative.
 *
 * Names longer than NAME_MAX are rejected with -ENAMETOOLONG. Otherwise the
 * dentry is added as a negative entry; when the superblock supplies no
 * default dentry operations, simple_dentry_operations is installed first so
 * that negative dentries are deleted.
 *
 * Returns NULL on success or an ERR_PTR() on failure.
 */
struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        if (dentry->d_name.len > NAME_MAX)
                return ERR_PTR(-ENAMETOOLONG);

        /* only fall back to our ops when the superblock sets none */
        if (dentry->d_sb->s_d_op == NULL)
                d_set_d_op(dentry, &simple_dentry_operations);

        d_add(dentry, NULL);

        return NULL;
}
EXPORT_SYMBOL(simple_lookup);

/*
 * ->open() for dcache-backed directories: allocate a cursor dentry under the
 * directory being opened and stash it in file->private_data.
 *
 * Returns 0 on success or -ENOMEM if the cursor could not be allocated.
 */
int dcache_dir_open(struct inode *inode, struct file *file)
{
        struct dentry *cursor = d_alloc_cursor(file->f_path.dentry);

        file->private_data = cursor;
        if (!cursor)
                return -ENOMEM;

        return 0;
}
EXPORT_SYMBOL(dcache_dir_open);

/*
 * ->release() counterpart of dcache_dir_open(): drop the reference on the
 * dentry stored in file->private_data. Always returns 0.
 */
int dcache_dir_close(struct inode *inode, struct file *file)
{
        struct dentry *cursor = file->private_data;

        dput(cursor);

        return 0;
}
EXPORT_SYMBOL(dcache_dir_close);

/*
 * scan_positives - find the <count>th positive dentry after <p>
 * @cursor: cursor dentry of the open directory; the walk runs under the
 *          d_lock of its d_parent
 * @p: address of the sibling-list link at which scanning starts
 * @count: number of positive entries to advance by
 * @last: dentry whose reference is dropped with dput() before returning
 *
 * The parent is locked at least shared by the caller.
 *
 * Returns an element of siblings' list.
 * We are looking for <count>th positive after <p>; if
 * found, dentry is grabbed and returned to caller.
 * If no such element exists, NULL is returned.
 */
static struct dentry *scan_positives(struct dentry *cursor,
                                        struct hlist_node **p,
                                        loff_t count,
                                        struct dentry *last)
{
        struct dentry *dentry = cursor->d_parent, *found = NULL;

        spin_lock(&dentry->d_lock);
        while (*p) {
                struct dentry *d = hlist_entry(*p, struct dentry, d_sib);
                p = &d->d_sib.next;
                // we must at least skip cursors, to avoid livelocks
                if (d->d_flags & DCACHE_DENTRY_CURSOR)
                        continue;
                if (simple_positive(d) && !--count) {
                        /* recheck under d's own lock before taking a reference */
                        spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                        if (simple_positive(d))
                                found = dget_dlock(d);
                        spin_unlock(&d->d_lock);
                        if (likely(found))
                                break;
                        /* it stopped being positive; settle for the next positive one */
                        count = 1;
                }
                if (need_resched()) {
                        /*
                         * Park the cursor right behind d so the scan can
                         * resume from the cursor after the parent's lock
                         * has been dropped for cond_resched().
                         */
                        if (!hlist_unhashed(&cursor->d_sib))
                                __hlist_del(&cursor->d_sib);
                        hlist_add_behind(&cursor->d_sib, &d->d_sib);
                        p = &cursor->d_sib.next;
                        spin_unlock(&dentry->d_lock);
                        cond_resched();
                        spin_lock(&dentry->d_lock);
                }
        }
        spin_unlock(&dentry->d_lock);
        /* release the caller's previous position only after the walk is done */
        dput(last);
        return found;
}

/*
 * ->llseek() for dcache-backed directories.
 *
 * Only whence values 0 (absolute) and 1 (relative to the current position)
 * are accepted; any other value, or a resulting negative offset, yields
 * -EINVAL. When the position actually changes, the file's cursor dentry is
 * unlinked from the sibling list and, for offsets beyond the two dot
 * entries, re-inserted behind the (offset - 2)th positive child, all under
 * the directory inode's shared lock.
 *
 * Returns the new offset on success.
 */
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
{
        struct dentry *dir = file->f_path.dentry;
        struct dentry *cursor;
        struct dentry *target = NULL;

        if (whence == 1)
                offset += file->f_pos;
        else if (whence != 0)
                return -EINVAL;

        if (offset < 0)
                return -EINVAL;

        /* nothing to reposition */
        if (offset == file->f_pos)
                return offset;

        cursor = file->private_data;

        inode_lock_shared(dir->d_inode);

        /* offsets 0 and 1 are '.' and '..'; real entries start at 2 */
        if (offset > 2)
                target = scan_positives(cursor, &dir->d_children.first,
                                        offset - 2, NULL);

        spin_lock(&dir->d_lock);
        hlist_del_init(&cursor->d_sib);
        if (target)
                hlist_add_behind(&cursor->d_sib, &target->d_sib);
        spin_unlock(&dir->d_lock);
        dput(target);

        file->f_pos = offset;

        inode_unlock_shared(dir->d_inode);

        return offset;
}
EXPORT_SYMBOL(dcache_dir_lseek);

/*
 * dcache_readdir - ->iterate_shared() for dcache-backed directories
 * @file: open directory; file->private_data holds its cursor dentry
 * @ctx: directory context receiving the entries
 *
 * Directory is locked and all positive dentries in it are safe, since
 * for ramfs-type trees they can't go away without unlink() or rmdir(),
 * both impossible due to the lock on directory.
 *
 * Emits '.' and '..' first, then walks the positive children one at a time
 * with scan_positives(), starting from the head of the child list when
 * ctx->pos is 2 and from the cursor otherwise. On exit the cursor is
 * unlinked and, if emission stopped early, re-inserted in front of the entry
 * that could not be emitted.
 *
 * Always returns 0.
 */

int dcache_readdir(struct file *file, struct dir_context *ctx)
{
        struct dentry *dentry = file->f_path.dentry;
        struct dentry *cursor = file->private_data;
        struct dentry *next = NULL;
        struct hlist_node **p;

        if (!dir_emit_dots(file, ctx))
                return 0;

        if (ctx->pos == 2)
                p = &dentry->d_children.first;
        else
                p = &cursor->d_sib.next;

        /*
         * Each call hands the previous entry back as 'last', so
         * scan_positives() drops its reference once the next one is found.
         */
        while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
                if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
                              d_inode(next)->i_ino,
                              fs_umode_to_dtype(d_inode(next)->i_mode)))
                        break;
                ctx->pos++;
                p = &next->d_sib.next;
        }
        spin_lock(&dentry->d_lock);
        hlist_del_init(&cursor->d_sib);
        /* next is non-NULL only when dir_emit() refused the entry */
        if (next)
                hlist_add_before(&cursor->d_sib, &next->d_sib);
        spin_unlock(&dentry->d_lock);
        dput(next);

        return 0;
}
EXPORT_SYMBOL(dcache_readdir);

/*
 * ->read() handler that fails unconditionally with -EISDIR; none of the
 * arguments are examined.
 */
ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
{
        return -EISDIR;
}
EXPORT_SYMBOL(generic_read_dir);

/*
 * File operations for directories that are iterated straight from the
 * dentry child list (dcache_readdir() and the other dcache_dir_* helpers).
 * read() on such a directory fails with -EISDIR via generic_read_dir().
 */
const struct file_operations simple_dir_operations = {
        .open                = dcache_dir_open,
        .release        = dcache_dir_close,
        .llseek                = dcache_dir_lseek,
        .read                = generic_read_dir,
        .iterate_shared        = dcache_readdir,
        .fsync                = noop_fsync,
};
EXPORT_SYMBOL(simple_dir_operations);

/*
 * Minimal directory inode operations: only ->lookup is provided
 * (simple_lookup()); every other operation is left unset.
 */
const struct inode_operations simple_dir_inode_operations = {
        .lookup                = simple_lookup,
};
EXPORT_SYMBOL(simple_dir_inode_operations);

/*
 * 0 is '.', 1 is '..', so always start with offset 2 or more.
 * DIR_OFFSET_MIN is the lower bound used when allocating an offset for a
 * directory entry and the starting point when scanning an offset map.
 */
enum {
        DIR_OFFSET_MIN        = 2,
};

/* Record @offset for @dentry by storing it, cast to a pointer, in d_fsdata. */
static void offset_set(struct dentry *dentry, long offset)
{
        void *encoded = (void *)offset;

        dentry->d_fsdata = encoded;
}

/* Read back the offset kept in @dentry's d_fsdata; 0 means none is set. */
static long dentry2offset(struct dentry *dentry)
{
        void *encoded = dentry->d_fsdata;

        return (long)encoded;
}

/* lockdep class applied to the ma_lock of every map set up by simple_offset_init(). */
static struct lock_class_key simple_offset_lock_class;

/**
 * simple_offset_init - initialize an offset_ctx
 * @octx: directory offset map to be initialized
 *
 * Seeds @octx->next_offset with DIR_OFFSET_MIN, initializes the maple tree
 * with MT_FLAGS_ALLOC_RANGE and puts the tree's lock into the
 * simple_offset lockdep class.
 */
void simple_offset_init(struct offset_ctx *octx)
{
        octx->next_offset = DIR_OFFSET_MIN;

        mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE);
        lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class);
}

/**
 * simple_offset_add - Add an entry to a directory's offset map
 * @octx: directory offset ctx to be updated
 * @dentry: new dentry being added
 *
 * Allocates an offset in the range [DIR_OFFSET_MIN, LONG_MAX] for @dentry
 * and records it on the dentry.
 *
 * Returns zero on success; @octx and the dentry's offset are updated.
 * Returns -EBUSY if @dentry already carries an offset, or the negative
 * errno reported by the allocation.
 */
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
{
        unsigned long assigned;
        int err;

        /* A non-zero stored offset means this dentry is already mapped. */
        if (dentry2offset(dentry))
                return -EBUSY;

        err = mtree_alloc_cyclic(&octx->mt, &assigned, dentry, DIR_OFFSET_MIN,
                                 LONG_MAX, &octx->next_offset, GFP_KERNEL);
        if (err >= 0) {
                /* Any non-negative result counts as success for callers. */
                offset_set(dentry, assigned);
                err = 0;
        }
        return err;
}

/*
 * Store @dentry at @offset in @octx's map and, if that succeeds, record
 * @offset on the dentry.  Returns 0 or the error from mtree_store().
 */
static int simple_offset_replace(struct offset_ctx *octx, struct dentry *dentry,
                                 long offset)
{
        int err = mtree_store(&octx->mt, offset, dentry, GFP_KERNEL);

        if (!err)
                offset_set(dentry, offset);
        return err;
}

/**
 * simple_offset_remove - Remove an entry from a directory's offset map
 * @octx: directory offset ctx to be updated
 * @dentry: dentry being removed
 *
 * Does nothing when @dentry has no offset recorded. Otherwise the map slot
 * is erased and the dentry's stored offset is reset to zero.
 */
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
{
        long offset = dentry2offset(dentry);

        if (offset != 0) {
                mtree_erase(&octx->mt, offset);
                offset_set(dentry, 0);
        }
}

/**
 * simple_offset_empty - Check if a dentry can be unlinked
 * @dentry: dentry to be tested
 *
 * Returns 0 if @dentry is a non-empty directory; otherwise returns 1.
 */
int simple_offset_empty(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        struct offset_ctx *octx;
        struct dentry *child;
        unsigned long index;
        int ret = 1;

        if (!inode || !S_ISDIR(inode->i_mode))
                return ret;

        index = DIR_OFFSET_MIN;
        octx = inode->i_op->get_offset_ctx(inode);
        mt_for_each(&octx->mt, child, index, LONG_MAX) {
                spin_lock(&child->d_lock);
                if (simple_positive(child)) {
                        spin_unlock(&child->d_lock);
                        ret = 0;
                        break;
                }
                spin_unlock(&child->d_lock);
        }

        return ret;
}

/**
 * simple_offset_rename - handle directory offsets for rename
 * @old_dir: parent directory of source entry
 * @old_dentry: dentry of source entry
 * @new_dir: parent_directory of destination entry
 * @new_dentry: dentry of destination
 *
 * Caller provides appropriate serialization.
 *
 * User space expects the directory offset value of the replaced
 * (new) directory entry to be unchanged after a rename. So when
 * @new_dentry already has an offset, @old_dentry takes it over; otherwise
 * a fresh offset is allocated in @new_dir.
 *
 * Returns zero on success, a negative errno value on failure.
 */
int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry)
{
        struct offset_ctx *src_ctx = old_dir->i_op->get_offset_ctx(old_dir);
        struct offset_ctx *dst_ctx = new_dir->i_op->get_offset_ctx(new_dir);
        long target_offset = dentry2offset(new_dentry);

        simple_offset_remove(src_ctx, old_dentry);

        /* Destination had no offset: allocate a new one. */
        if (!target_offset)
                return simple_offset_add(dst_ctx, old_dentry);

        /* Hand the destination's offset over to the renamed entry. */
        offset_set(new_dentry, 0);
        return simple_offset_replace(dst_ctx, old_dentry, target_offset);
}

/**
 * simple_offset_rename_exchange - exchange rename with directory offsets
 * @old_dir: parent of dentry being moved
 * @old_dentry: dentry being moved
 * @new_dir: destination parent
 * @new_dentry: destination dentry
 *
 * This API preserves the directory offset values. Caller provides
 * appropriate serialization.
 *
 * Returns zero on success. Otherwise a negative errno is returned and the
 * rename is rolled back.
 */
int simple_offset_rename_exchange(struct inode *old_dir,
                                  struct dentry *old_dentry,
                                  struct inode *new_dir,
                                  struct dentry *new_dentry)
{
        struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
        struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
        /* Snapshot both offsets: simple_offset_remove() resets them to 0. */
        long old_index = dentry2offset(old_dentry);
        long new_index = dentry2offset(new_dentry);
        int ret;

        simple_offset_remove(old_ctx, old_dentry);
        simple_offset_remove(new_ctx, new_dentry);

        /* Each dentry takes over the other's slot in the other directory. */
        ret = simple_offset_replace(new_ctx, old_dentry, new_index);
        if (ret)
                goto out_restore;

        ret = simple_offset_replace(old_ctx, new_dentry, old_index);
        if (ret) {
                /* Undo the first half of the swap before restoring. */
                simple_offset_remove(new_ctx, old_dentry);
                goto out_restore;
        }

        ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
        if (ret) {
                simple_offset_remove(new_ctx, old_dentry);
                simple_offset_remove(old_ctx, new_dentry);
                goto out_restore;
        }
        return 0;

out_restore:
        /* Put both dentries back at their original offsets; errors are ignored. */
        (void)simple_offset_replace(old_ctx, old_dentry, old_index);
        (void)simple_offset_replace(new_ctx, new_dentry, new_index);
        return ret;
}

/**
 * simple_offset_destroy - Release offset map
 * @octx: directory offset ctx that is about to be destroyed
 *
 * During fs teardown (eg. umount), a directory's offset map might still
 * contain entries. mtree_destroy() cleans out anything that remains.
 */
void simple_offset_destroy(struct offset_ctx *octx)
{
        mtree_destroy(&octx->mt);
}

/**
 * offset_dir_llseek - Advance the read position of a directory descriptor
 * @file: an open directory whose position is to be updated
 * @offset: a byte offset
 * @whence: enumerator describing the starting position for this update
 *
 * Only SEEK_SET and SEEK_CUR are accepted; SEEK_END, SEEK_DATA, and
 * SEEK_HOLE are not supported for directories. A resulting position below
 * zero is rejected as well.
 *
 * Returns the updated read position if successful; otherwise a
 * negative errno is returned and the read position remains unchanged.
 */
static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
{
        if (whence == SEEK_CUR)
                offset += file->f_pos;
        else if (whence != SEEK_SET)
                return -EINVAL;

        if (offset < 0)
                return -EINVAL;

        /* In this case, ->private_data is protected by f_pos_lock */
        file->private_data = NULL;
        return vfs_setpos(file, offset, LONG_MAX);
}

/*
 * Look up the first map entry at or after @offset.  If that entry is still
 * positive, return it with a reference taken under its d_lock; otherwise
 * (or when no entry exists) return NULL.  Only that one entry is examined.
 */
static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
{
        MA_STATE(mas, &octx->mt, offset, offset);
        struct dentry *candidate, *result = NULL;

        rcu_read_lock();
        candidate = mas_find(&mas, LONG_MAX);
        if (candidate) {
                spin_lock(&candidate->d_lock);
                if (simple_positive(candidate))
                        result = dget_dlock(candidate);
                spin_unlock(&candidate->d_lock);
        }
        rcu_read_unlock();

        return result;
}

/*
 * Pass @dentry's name, stored directory offset, inode number and file type
 * to @ctx's actor.  Returns whatever the actor returns.
 */
static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len,
                          dentry2offset(dentry), inode->i_ino,
                          fs_umode_to_dtype(inode->i_mode));
}

static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
{
        struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
        struct dentry *dentry;

        while (true) {
                dentry = offset_find_next(octx, ctx->pos);
                if (!dentry)
                        return ERR_PTR(-ENOENT);

                if (!offset_dir_emit(ctx, dentry)) {
                        dput(dentry);
                        break;
                }

                ctx->pos = dentry2offset(dentry) + 1;
                dput(dentry);
        }
        return NULL;
}

/**
 * offset_readdir - Emit entries starting at offset @ctx->pos
 * @file: an open directory to iterate over
 * @ctx: directory iteration context
 *
 * Caller must hold @file's i_rwsem to prevent insertion or removal of
 * entries during this call.
 *
 * On entry, @ctx->pos contains an offset that represents the first entry
 * to be read from the directory.
 *
 * The operation continues until there are no more entries to read, or
 * until the ctx->actor indicates there is no more space in the caller's
 * output buffer.
 *
 * On return, @ctx->pos contains an offset that will read the next entry
 * in this directory when offset_readdir() is called again with @ctx.
 *
 * Return values:
 *   %0 - Complete
 */
static int offset_readdir(struct file *file, struct dir_context *ctx)
{
        struct dentry *dir = file->f_path.dentry;

        lockdep_assert_held(&d_inode(dir)->i_rwsem);

        if (!dir_emit_dots(file, ctx))
                return 0;

        /* In this case, ->private_data is protected by f_pos_lock */
        if (ctx->pos == DIR_OFFSET_MIN)
                file->private_data = NULL;
        else if (file->private_data == ERR_PTR(-ENOENT))
                return 0;
        file->private_data = offset_iterate_dir(d_inode(dir), ctx);
        return 0;
}

/*
 * File operations for directories whose entries are tracked in an offset
 * map: seeking goes through offset_dir_llseek(), iteration through
 * offset_readdir(), and read() fails with -EISDIR via generic_read_dir().
 */
const struct file_operations simple_offset_dir_operations = {
        .llseek                = offset_dir_llseek,
        .iterate_shared        = offset_readdir,
        .read                = generic_read_dir,
        .fsync                = noop_fsync,
};

/*
 * Return the next positive child of @parent after @prev (or starting from
 * the first child when @prev is NULL) with a reference taken via
 * dget_dlock(), or NULL when none is left.  The walk runs under
 * @parent->d_lock.  @prev is passed to dput() before returning.
 */
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
        struct dentry *child = NULL, *d;

        spin_lock(&parent->d_lock);
        d = prev ? d_next_sibling(prev) : d_first_child(parent);
        hlist_for_each_entry_from(d, d_sib) {
                /* Cheap unlocked test first, then recheck under d->d_lock. */
                if (simple_positive(d)) {
                        spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                        if (simple_positive(d))
                                child = dget_dlock(d);
                        spin_unlock(&d->d_lock);
                        if (likely(child))
                                break;
                }
        }
        spin_unlock(&parent->d_lock);
        dput(prev);
        return child;
}

/*
 * simple_recursive_removal - remove @dentry and everything below it
 * @dentry: root of the subtree to remove
 * @callback: optional hook called on each victim that is still positive,
 *            just before its reference is dropped; may be NULL
 *
 * The subtree is walked without recursion: descend into the next positive
 * child for as long as find_next_child() returns one; once a node has none
 * left, clear its link count, step up to its parent and dispose of the node
 * there under the parent's inode lock.  The walk ends after @dentry itself
 * has been processed.
 */
void simple_recursive_removal(struct dentry *dentry,
                              void (*callback)(struct dentry *))
{
        /* Hold @dentry for the whole walk; released right before returning. */
        struct dentry *this = dget(dentry);
        while (true) {
                struct dentry *victim = NULL, *child;
                struct inode *inode = this->d_inode;

                inode_lock(inode);
                if (d_is_dir(this))
                        inode->i_flags |= S_DEAD;
                while ((child = find_next_child(this, victim)) == NULL) {
                        // kill and ascend
                        // update metadata while it's still locked
                        inode_set_ctime_current(inode);
                        clear_nlink(inode);
                        inode_unlock(inode);
                        victim = this;
                        this = this->d_parent;
                        inode = this->d_inode;
                        /* From here on, inode is the victim's parent directory. */
                        inode_lock(inode);
                        if (simple_positive(victim)) {
                                d_invalidate(victim);        // avoid lost mounts
                                if (d_is_dir(victim))
                                        fsnotify_rmdir(inode, victim);
                                else
                                        fsnotify_unlink(inode, victim);
                                if (callback)
                                        callback(victim);
                                dput(victim);                // unpin it
                        }
                        /* The subtree root has been handled: fix up its parent and stop. */
                        if (victim == dentry) {
                                inode_set_mtime_to_ts(inode,
                                                      inode_set_ctime_current(inode));
                                if (d_is_dir(dentry))
                                        drop_nlink(inode);
                                inode_unlock(inode);
                                dput(dentry);
                                return;
                        }
                }
                /* A positive child remains: descend into it. */
                inode_unlock(inode);
                this = child;
        }
}
EXPORT_SYMBOL(simple_recursive_removal);

/*
 * Default superblock operations used by simple_fill_super() and, when the
 * pseudo-fs context supplies none, by pseudo_fs_fill_super(): only ->statfs
 * is set.
 */
static const struct super_operations simple_super_operations = {
        .statfs                = simple_statfs,
};

/*
 * fill_super callback for pseudo filesystems: copy the settings recorded in
 * the pseudo_fs_context into @s and create the root directory.
 *
 * Returns 0 on success, or -ENOMEM if the root inode or root dentry cannot
 * be allocated.
 */
static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = fc->fs_private;
        struct inode *root_inode;

        s->s_maxbytes = MAX_LFS_FILESIZE;
        s->s_blocksize = PAGE_SIZE;
        s->s_blocksize_bits = PAGE_SHIFT;
        s->s_magic = ctx->magic;
        /* Use the default operations when the context did not set any. */
        s->s_op = ctx->ops ? ctx->ops : &simple_super_operations;
        s->s_xattr = ctx->xattr;
        s->s_time_gran = 1;

        root_inode = new_inode(s);
        if (!root_inode)
                return -ENOMEM;

        /*
         * since this is the first inode, make it number 1. New inodes created
         * after this must take care not to collide with it (by passing
         * max_reserved of 1 to iunique).
         */
        root_inode->i_ino = 1;
        root_inode->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
        simple_inode_init_ts(root_inode);

        s->s_root = d_make_root(root_inode);
        if (!s->s_root)
                return -ENOMEM;

        /* Dentry operations are installed only after the root exists. */
        s->s_d_op = ctx->dops;
        return 0;
}

/* ->get_tree for pseudo filesystems: get_tree_nodev() with pseudo_fs_fill_super(). */
static int pseudo_fs_get_tree(struct fs_context *fc)
{
        return get_tree_nodev(fc, pseudo_fs_fill_super);
}

/* ->free for pseudo filesystems: release the context kept in fc->fs_private. */
static void pseudo_fs_free(struct fs_context *fc)
{
        void *private = fc->fs_private;

        kfree(private);
}

/* fs_context operations that init_pseudo() installs on a context. */
static const struct fs_context_operations pseudo_fs_context_ops = {
        .free                = pseudo_fs_free,
        .get_tree        = pseudo_fs_get_tree,
};

/*
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
 */
struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
                                        unsigned long magic)
{
        struct pseudo_fs_context *ctx;

        ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL);
        if (likely(ctx)) {
                ctx->magic = magic;
                fc->fs_private = ctx;
                fc->ops = &pseudo_fs_context_ops;
                fc->sb_flags |= SB_NOUSER;
                fc->global = true;
        }
        return ctx;
}
EXPORT_SYMBOL(init_pseudo);

/*
 * Generic ->open: when the inode carries private data, make it available
 * through file->private_data; otherwise leave the file untouched.
 * Always returns 0.
 */
int simple_open(struct inode *inode, struct file *file)
{
        void *private = inode->i_private;

        if (private)
                file->private_data = private;
        return 0;
}
EXPORT_SYMBOL(simple_open);

/*
 * simple_link - create a hard link to an existing inode
 * @old_dentry: existing name whose inode is being linked
 * @dir: directory receiving the new name
 * @dentry: new (negative) dentry to instantiate
 *
 * Stamps the target's ctime with the current time, copies that timestamp
 * to @dir's ctime and mtime, bumps the target's link count, and binds
 * @dentry to the inode with an extra inode and dentry reference held.
 * Always returns 0.
 */
int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
        struct inode *target = d_inode(old_dentry);

        inode_set_mtime_to_ts(dir,
                              inode_set_ctime_to_ts(dir, inode_set_ctime_current(target)));

        inc_nlink(target);
        ihold(target);

        dget(dentry);
        d_instantiate(dentry, target);
        return 0;
}
EXPORT_SYMBOL(simple_link);

/*
 * simple_empty - test whether a dentry has any positive children
 * @dentry: dentry whose child list is examined
 *
 * Walks the child list under @dentry->d_lock, checking each child under
 * its own d_lock.  Returns 0 as soon as a positive child is found and 1 if
 * there is none.
 */
int simple_empty(struct dentry *dentry)
{
        struct dentry *child;
        int empty = 1;

        spin_lock(&dentry->d_lock);
        hlist_for_each_entry(child, &dentry->d_children, d_sib) {
                bool positive;

                spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
                positive = simple_positive(child);
                spin_unlock(&child->d_lock);

                if (positive) {
                        empty = 0;
                        break;
                }
        }
        spin_unlock(&dentry->d_lock);

        return empty;
}
EXPORT_SYMBOL(simple_empty);

/*
 * simple_unlink - remove a name from a directory
 * @dir: directory the name lives in
 * @dentry: name being removed
 *
 * Stamps the inode's ctime with the current time, copies that timestamp to
 * @dir's ctime and mtime, drops one link from the inode and releases one
 * reference on @dentry.  Always returns 0.
 */
int simple_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *victim = d_inode(dentry);

        inode_set_mtime_to_ts(dir,
                              inode_set_ctime_to_ts(dir, inode_set_ctime_current(victim)));

        drop_nlink(victim);
        dput(dentry);
        return 0;
}
EXPORT_SYMBOL(simple_unlink);

/*
 * simple_rmdir - remove an empty directory
 * @dir: parent directory
 * @dentry: directory being removed
 *
 * Returns -ENOTEMPTY if simple_empty() reports children.  Otherwise the
 * directory loses two links (one here, one in simple_unlink()), the parent
 * loses one, and 0 is returned.
 */
int simple_rmdir(struct inode *dir, struct dentry *dentry)
{
        if (simple_empty(dentry)) {
                drop_nlink(d_inode(dentry));
                simple_unlink(dir, dentry);
                drop_nlink(dir);
                return 0;
        }

        return -ENOTEMPTY;
}
EXPORT_SYMBOL(simple_rmdir);

/**
 * simple_rename_timestamp - update the various inode timestamps for rename
 * @old_dir: old parent directory
 * @old_dentry: dentry that is being renamed
 * @new_dir: new parent directory
 * @new_dentry: target for rename
 *
 * POSIX mandates that the old and new parent directories have their ctime and
 * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have
 * their ctime updated. When both parents are the same inode it is stamped
 * only once.
 */
void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry)
{
        struct inode *replaced = d_inode(new_dentry);

        inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
        if (old_dir != new_dir)
                inode_set_mtime_to_ts(new_dir,
                                      inode_set_ctime_current(new_dir));

        inode_set_ctime_current(d_inode(old_dentry));

        /* The target may be negative, in which case there is nothing to stamp. */
        if (replaced)
                inode_set_ctime_current(replaced);
}
EXPORT_SYMBOL_GPL(simple_rename_timestamp);

/*
 * simple_rename_exchange - bookkeeping for a RENAME_EXCHANGE
 * @old_dir: parent of @old_dentry
 * @old_dentry: first entry being exchanged
 * @new_dir: parent of @new_dentry
 * @new_dentry: second entry being exchanged
 *
 * When the parents differ and exactly one of the two entries is a
 * directory, one link moves from the parent that loses the directory to
 * the parent that gains it.  Timestamps are then updated through
 * simple_rename_timestamp().  Always returns 0.
 */
int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
                           struct inode *new_dir, struct dentry *new_dentry)
{
        bool old_is_dir = d_is_dir(old_dentry);
        bool new_is_dir = d_is_dir(new_dentry);

        if (old_dir != new_dir && old_is_dir != new_is_dir) {
                struct inode *loser = old_is_dir ? old_dir : new_dir;
                struct inode *gainer = old_is_dir ? new_dir : old_dir;

                drop_nlink(loser);
                inc_nlink(gainer);
        }

        simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
        return 0;
}
EXPORT_SYMBOL_GPL(simple_rename_exchange);

int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir,
                  struct dentry *old_dentry, struct inode *new_dir,
                  struct dentry *new_dentry, unsigned int flags)
{
        int they_are_dirs = d_is_dir(old_dentry);

        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
                return -EINVAL;

        if (flags & RENAME_EXCHANGE)
                return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);

        if (!simple_empty(new_dentry))
                return -ENOTEMPTY;

        if (d_really_is_positive(new_dentry)) {
                simple_unlink(new_dir, new_dentry);
                if (they_are_dirs) {
                        drop_nlink(d_inode(new_dentry));
                        drop_nlink(old_dir);
                }
        } else if (they_are_dirs) {
                drop_nlink(old_dir);
                inc_nlink(new_dir);
        }

        simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
        return 0;
}
EXPORT_SYMBOL(simple_rename);

/**
 * simple_setattr - setattr for simple filesystem
 * @idmap: idmap of the target mount
 * @dentry: dentry
 * @iattr: iattr structure
 *
 * Returns 0 on success, -error on failure.
 *
 * simple_setattr is a simple ->setattr implementation without a proper
 * implementation of size changes.
 *
 * It can either be used for in-memory filesystems or special files
 * on simple regular filesystems.  Anything that needs to change on-disk
 * or wire state on size changes needs its own setattr method.
 */
int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                   struct iattr *iattr)
{
        struct inode *inode = d_inode(dentry);
        int error;

        error = setattr_prepare(idmap, dentry, iattr);
        if (error)
                return error;

        if (iattr->ia_valid & ATTR_SIZE)
                truncate_setsize(inode, iattr->ia_size);
        setattr_copy(idmap, inode, iattr);
        mark_inode_dirty(inode);
        return 0;
}
EXPORT_SYMBOL(simple_setattr);

static int simple_read_folio(struct file *file, struct folio *folio)
{
        folio_zero_range(folio, 0, folio_size(folio));
        flush_dcache_folio(folio);
        folio_mark_uptodate(folio);
        folio_unlock(folio);
        return 0;
}

/*
 * simple_write_begin - ->write_begin helper for pagecache-only filesystems
 * @file: unused
 * @mapping: address space being written
 * @pos: byte position of the write
 * @len: length of the write
 * @pagep: receives the page of the folio covering @pos
 * @fsdata: unused
 *
 * Looks up (FGP_WRITEBEGIN) the folio covering @pos.  If that folio is not
 * up to date and the write does not cover it completely, the parts of the
 * folio outside [@pos, @pos + @len) are zeroed.
 *
 * Returns 0 on success or the error encoded in the folio lookup result.
 */
int simple_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct page **pagep, void **fsdata)
{
        struct folio *folio = __filemap_get_folio(mapping, pos / PAGE_SIZE,
                                                  FGP_WRITEBEGIN,
                                                  mapping_gfp_mask(mapping));
        size_t from;

        if (IS_ERR(folio))
                return PTR_ERR(folio);

        *pagep = &folio->page;

        /* Nothing to zero when the folio is valid or fully overwritten. */
        if (folio_test_uptodate(folio) || len == folio_size(folio))
                return 0;

        from = offset_in_folio(folio, pos);
        folio_zero_segments(folio, 0, from, from + len, folio_size(folio));
        return 0;
}
EXPORT_SYMBOL(simple_write_begin);

/**
 * simple_write_end - .write_end helper for non-block-device FSes
 * @file: See .write_end of address_space_operations
 * @mapping:                 "
 * @pos:                 "
 * @len:                 "
 * @copied:                 "
 * @page:                 "
 * @fsdata:                 "
 *
 * simple_write_end does the minimum needed for updating a page after writing is
 * done. It has the same API signature as the .write_end of
 * address_space_operations vector. So it can just be set onto .write_end for
 * FSes that don't need any other processing. i_mutex is assumed to be held.
 * Block based filesystems should use generic_write_end().
 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
 * is not called, so a filesystem that actually does store data in .write_inode
 * should extend on what's done here with a call to mark_inode_dirty() in the
 * case that i_size has changed.
 *
 * Use *ONLY* with simple_read_folio()
 *
 * Returns @copied.
 */
static int simple_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct folio *folio = page_folio(page);
        struct inode *inode = folio->mapping->host;
        loff_t end_pos = pos + copied;

        if (!folio_test_uptodate(folio)) {
                /* zero the stale part of the folio if we did a short copy */
                if (copied < len)
                        folio_zero_range(folio,
                                         offset_in_folio(folio, pos) + copied,
                                         len - copied);
                folio_mark_uptodate(folio);
        }

        /*
         * No need to use i_size_read() here, the i_size
         * cannot change under us because we hold the i_mutex.
         */
        if (end_pos > inode->i_size)
                i_size_write(inode, end_pos);

        folio_mark_dirty(folio);
        folio_unlock(folio);
        folio_put(folio);

        return copied;
}

/*
 * Provides ramfs-style behavior: data in the pagecache, but no writeback.
 * Reads zero-fill the folio (simple_read_folio()); writes go through
 * simple_write_begin()/simple_write_end().
 */
const struct address_space_operations ram_aops = {
        .read_folio        = simple_read_folio,
        .write_begin        = simple_write_begin,
        .write_end        = simple_write_end,
        .dirty_folio        = noop_dirty_folio,
};
EXPORT_SYMBOL(ram_aops);

/*
 * the inodes created here are not hashed. If you use iunique to generate
 * unique inode values later for this filesystem, then you must take care
 * to pass it an appropriate max_reserved value to avoid collisions.
 */
int simple_fill_super(struct super_block *s, unsigned long magic,
                      const struct tree_descr *files)
{
        struct inode *inode;
        struct dentry *dentry;
        int i;

        s->s_blocksize = PAGE_SIZE;
        s->s_blocksize_bits = PAGE_SHIFT;
        s->s_magic = magic;
        s->s_op = &simple_super_operations;
        s->s_time_gran = 1;

        inode = new_inode(s);
        if (!inode)
                return -ENOMEM;
        /*
         * because the root inode is 1, the files array must not contain an
         * entry at index 1
         */
        inode->i_ino = 1;
        inode->i_mode = S_IFDIR | 0755;
        simple_inode_init_ts(inode);
        inode->i_op = &simple_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
        set_nlink(inode, 2);
        s->s_root = d_make_root(inode);
        if (!s->s_root)
                return -ENOMEM;
        for (i = 0; !files->name || files->name[0]; i++, files++) {
                if (!files->name)
                        continue;

                /* warn if it tries to conflict with the root inode */
                if (unlikely(i == 1))
                        printk(KERN_WARNING "%s: %s passed in a files array"
                                "with an index of 1!\n", __func__,
                                s->s_type->name);

                dentry = d_alloc_name(s->s_root, files->name);
                if (!dentry)
                        return -ENOMEM;
                inode = new_inode(s);
                if (!inode) {
                        dput(dentry);
                        return -ENOMEM;
                }
                inode->i_mode = S_IFREG | files->mode;
                simple_inode_init_ts(inode);
                inode->i_fop = files->ops;
                inode->i_ino = i;
                d_add(dentry, inode);
        }
        return 0;
}
EXPORT_SYMBOL(simple_fill_super);

/* Guards the mount pointer and pin count updated by simple_pin_fs()/simple_release_fs(). */
static DEFINE_SPINLOCK(pin_fs_lock);

/*
 * simple_pin_fs - take a counted pin on a filesystem's internal mount
 * @type: filesystem type to mount when *@mount is not set yet
 * @mount: location of the shared vfsmount pointer
 * @count: location of the pin counter; incremented on success
 *
 * Returns 0 on success, or the error from vfs_kern_mount().
 */
int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
{
        struct vfsmount *mnt = NULL;
        spin_lock(&pin_fs_lock);
        if (unlikely(!*mount)) {
                /* Mounting is done with the spinlock dropped. */
                spin_unlock(&pin_fs_lock);
                mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
                if (IS_ERR(mnt))
                        return PTR_ERR(mnt);
                spin_lock(&pin_fs_lock);
                /* Recheck: *mount may have been set while the lock was not held. */
                if (!*mount)
                        *mount = mnt;
        }
        mntget(*mount);
        ++*count;
        spin_unlock(&pin_fs_lock);
        /* Release the reference returned by vfs_kern_mount(); mnt is NULL if none was made. */
        mntput(mnt);
        return 0;
}
EXPORT_SYMBOL(simple_pin_fs);

/*
 * simple_release_fs - drop a pin taken with simple_pin_fs()
 * @mount: location of the shared vfsmount pointer
 * @count: location of the pin counter
 *
 * Decrements *@count under pin_fs_lock and clears *@mount when the counter
 * reaches zero; the mount reference is released after the lock is dropped.
 */
void simple_release_fs(struct vfsmount **mount, int *count)
{
        struct vfsmount *mnt;

        spin_lock(&pin_fs_lock);
        mnt = *mount;
        *count -= 1;
        if (*count == 0)
                *mount = NULL;
        spin_unlock(&pin_fs_lock);

        mntput(mnt);
}
EXPORT_SYMBOL(simple_release_fs);

/**
 * simple_read_from_buffer - copy data from the buffer to user space
 * @to: the user space buffer to read to
 * @count: the maximum number of bytes to read
 * @ppos: the current position in the buffer
 * @from: the buffer to read from
 * @available: the size of the buffer
 *
 * The simple_read_from_buffer() function reads up to @count bytes from the
 * buffer @from at offset @ppos into the user space address starting at @to.
 *
 * On success, the number of bytes read is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error. A
 * partial copy counts as success; -EFAULT is returned only when nothing
 * could be copied, and -EINVAL when @ppos is negative.
 **/
ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
                                const void *from, size_t available)
{
        loff_t pos = *ppos;
        size_t uncopied;

        if (pos < 0)
                return -EINVAL;
        if (!count || pos >= available)
                return 0;

        /* Never read past the end of the source buffer. */
        if (count > available - pos)
                count = available - pos;

        uncopied = copy_to_user(to, from + pos, count);
        if (uncopied == count)
                return -EFAULT;

        count -= uncopied;
        *ppos = pos + count;
        return count;
}
EXPORT_SYMBOL(simple_read_from_buffer);

/**
 * simple_write_to_buffer - copy data from user space to the buffer
 * @to: the buffer to write to
 * @available: the size of the buffer
 * @ppos: the current position in the buffer
 * @from: the user space buffer to read from
 * @count: the maximum number of bytes to read
 *
 * The simple_write_to_buffer() function reads up to @count bytes from the user
 * space address starting at @from into the buffer @to at offset @ppos.
 *
 * On success, the number of bytes written is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error. A
 * partial copy counts as success; -EFAULT is returned only when nothing
 * could be copied, and -EINVAL when @ppos is negative.
 **/
ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                const void __user *from, size_t count)
{
        loff_t pos = *ppos;
        size_t uncopied;

        if (pos < 0)
                return -EINVAL;
        if (!count || pos >= available)
                return 0;

        /* Never write past the end of the destination buffer. */
        if (count > available - pos)
                count = available - pos;

        uncopied = copy_from_user(to + pos, from, count);
        if (uncopied == count)
                return -EFAULT;

        count -= uncopied;
        *ppos = pos + count;
        return count;
}
EXPORT_SYMBOL(simple_write_to_buffer);

/**
 * memory_read_from_buffer - copy data from the buffer
 * @to: the kernel space buffer to read to
 * @count: the maximum number of bytes to read
 * @ppos: the current position in the buffer
 * @from: the buffer to read from
 * @available: the size of the buffer
 *
 * The memory_read_from_buffer() function reads up to @count bytes from the
 * buffer @from at offset @ppos into the kernel space address starting at @to.
 *
 * On success, the number of bytes read is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error. Reading
 * at or beyond @available returns 0; a negative @ppos yields -EINVAL.
 **/
ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
                                const void *from, size_t available)
{
        loff_t start = *ppos;

        if (start < 0)
                return -EINVAL;
        if (start >= available)
                return 0;

        /* Clamp the request to what is left in the source buffer. */
        if (count > available - start)
                count = available - start;

        memcpy(to, from + start, count);
        *ppos = start + count;

        return count;
}
EXPORT_SYMBOL(memory_read_from_buffer);

/*
 * Transaction based IO.
 * The file expects a single write which triggers the transaction, and then
 * possibly a read which collects the result - which is stored in a
 * file-local buffer.
 */

/*
 * simple_transaction_set - publish the size of the transaction response
 * @file: file whose ->private_data holds the transaction buffer
 * @n: number of response bytes in the buffer
 *
 * BUG()s if @n exceeds SIMPLE_TRANSACTION_LIMIT.
 */
void simple_transaction_set(struct file *file, size_t n)
{
        struct simple_transaction_argresp *ar = file->private_data;

        BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);

        /*
         * The barrier ensures that ar->size will really remain zero until
         * ar->data is ready for reading.
         */
        smp_mb();
        ar->size = n;
}
EXPORT_SYMBOL(simple_transaction_set);

/*
 * Allocate the per-open transaction buffer, attach it to @file and copy the
 * user request into it.  Returns the buffer's data area or an ERR_PTR().
 */
char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
{
        static DEFINE_SPINLOCK(simple_transaction_lock);
        struct simple_transaction_argresp *argresp;
        bool busy;

        /* Reject requests larger than SIMPLE_TRANSACTION_LIMIT - 1 bytes. */
        if (size > SIMPLE_TRANSACTION_LIMIT - 1)
                return ERR_PTR(-EFBIG);

        argresp = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL);
        if (!argresp)
                return ERR_PTR(-ENOMEM);

        /* Claim the file's private_data slot under the lock. */
        spin_lock(&simple_transaction_lock);
        busy = file->private_data != NULL;
        if (!busy)
                file->private_data = argresp;
        spin_unlock(&simple_transaction_lock);

        /* only one write allowed per open */
        if (busy) {
                free_page((unsigned long)argresp);
                return ERR_PTR(-EBUSY);
        }

        /* On a fault the page stays attached to the file. */
        if (copy_from_user(argresp->data, buf, size))
                return ERR_PTR(-EFAULT);

        return argresp->data;
}
EXPORT_SYMBOL(simple_transaction_get);

/*
 * Read back the transaction response stored in file->private_data.
 * Returns 0 when no transaction buffer has been attached to @file.
 */
ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
{
        struct simple_transaction_argresp *argresp = file->private_data;

        if (argresp)
                return simple_read_from_buffer(buf, size, pos,
                                               argresp->data, argresp->size);
        return 0;
}
EXPORT_SYMBOL(simple_transaction_read);

/* Free the transaction page attached to @file; always returns 0. */
int simple_transaction_release(struct inode *inode, struct file *file)
{
        unsigned long page = (unsigned long)file->private_data;

        free_page(page);
        return 0;
}
EXPORT_SYMBOL(simple_transaction_release);

/* Simple attribute files */

/* Per-open state of a simple attribute file (set up by simple_attr_open()). */
struct simple_attr {
        int (*get)(void *, u64 *);        /* reads the value; may be NULL */
        int (*set)(void *, u64);        /* stores a value; may be NULL */
        char get_buf[24];        /* enough to store a u64 and "\n\0" */
        char set_buf[24];        /* holds the user-supplied text to parse */
        void *data;        /* opaque cookie handed to get()/set() */
        const char *fmt;        /* format for read operation */
        struct mutex mutex;        /* protects access to these buffers */
};

/* simple_attr_open is called by an actual attribute open file operation
 * to set the attribute specific access operations. */
int simple_attr_open(struct inode *inode, struct file *file,
                     int (*get)(void *, u64 *), int (*set)(void *, u64),
                     const char *fmt)
{
        struct simple_attr *sa = kzalloc(sizeof(*sa), GFP_KERNEL);

        if (!sa)
                return -ENOMEM;

        /* Bind the accessors, their cookie and the read format. */
        sa->data = inode->i_private;
        sa->fmt = fmt;
        sa->set = set;
        sa->get = get;
        mutex_init(&sa->mutex);

        /* Make the state reachable from the read/write/release handlers. */
        file->private_data = sa;
        return nonseekable_open(inode, file);
}
EXPORT_SYMBOL_GPL(simple_attr_open);

/* Free the simple_attr state stored in file->private_data; returns 0. */
int simple_attr_release(struct inode *inode, struct file *file)
{
        struct simple_attr *sa = file->private_data;

        kfree(sa);
        return 0;
}
EXPORT_SYMBOL_GPL(simple_attr_release);        /* GPL-only?  This?  Really? */

/* read from the buffer that is filled with the get function */
ssize_t simple_attr_read(struct file *file, char __user *buf,
                         size_t len, loff_t *ppos)
{
        struct simple_attr *sa = file->private_data;
        size_t nbytes;
        ssize_t rc;

        /* Attributes without a getter cannot be read. */
        if (!sa->get)
                return -EACCES;

        rc = mutex_lock_interruptible(&sa->mutex);
        if (rc)
                return rc;

        if (!*ppos || !sa->get_buf[0]) {
                /* first read: fetch the value and format it into get_buf */
                u64 value;

                rc = sa->get(sa->data, &value);
                if (rc)
                        goto unlock;

                nbytes = scnprintf(sa->get_buf, sizeof(sa->get_buf),
                                   sa->fmt, (unsigned long long)value);
        } else {
                /* continued read: reuse the text formatted earlier */
                nbytes = strlen(sa->get_buf);
        }

        rc = simple_read_from_buffer(buf, len, ppos, sa->get_buf, nbytes);
unlock:
        mutex_unlock(&sa->mutex);
        return rc;
}
EXPORT_SYMBOL_GPL(simple_attr_read);

/* interpret the buffer as a number to call the set function with */
/*
 * Common implementation of simple_attr_write() and
 * simple_attr_write_signed().
 *
 * Copies at most sizeof(set_buf) - 1 bytes from @buf, parses them as a
 * signed (@is_signed) or unsigned number and passes the result to the
 * attribute's set() callback.  Returns @len on success, -EACCES if the
 * attribute has no setter, -EFAULT if the user copy fails, or the error
 * returned by the lock, the parser or set().
 */
static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos, bool is_signed)
{
        struct simple_attr *attr;
        unsigned long long val;
        size_t size;
        ssize_t ret;

        attr = file->private_data;
        if (!attr->set)
                return -EACCES;

        ret = mutex_lock_interruptible(&attr->mutex);
        if (ret)
                return ret;

        ret = -EFAULT;
        size = min(sizeof(attr->set_buf) - 1, len);
        if (copy_from_user(attr->set_buf, buf, size))
                goto out;

        attr->set_buf[size] = '\0';
        if (is_signed) {
                /*
                 * kstrtoll() takes a long long *, so parse into a signed
                 * local rather than aliasing the unsigned variable.
                 */
                long long sval = 0;

                ret = kstrtoll(attr->set_buf, 0, &sval);
                val = (unsigned long long)sval;
        } else {
                ret = kstrtoull(attr->set_buf, 0, &val);
        }
        if (ret)
                goto out;
        ret = attr->set(attr->data, val);
        if (ret == 0)
                ret = len; /* on success, claim we got the whole input */
out:
        mutex_unlock(&attr->mutex);
        return ret;
}

/* Write handler that parses the user input as an unsigned number. */
ssize_t simple_attr_write(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos)
{
        return simple_attr_write_xsigned(file, buf, len, ppos, false);
}
EXPORT_SYMBOL_GPL(simple_attr_write);

/* Write handler that parses the user input as a signed number. */
ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos)
{
        return simple_attr_write_xsigned(file, buf, len, ppos, true);
}
EXPORT_SYMBOL_GPL(simple_attr_write_signed);

/**
 * generic_encode_ino32_fh - generic export_operations->encode_fh function
 * @inode:   the object to encode
 * @fh:      where to store the file handle fragment
 * @max_len: maximum length to store there (in 4 byte units)
 * @parent:  parent directory inode, if wanted
 *
 * This generic encode_fh function assumes that the 32-bit inode number
 * is suitable for locating an inode, and that the generation number
 * can be used to check that it is still valid.  It places them in the
 * filehandle fragment where export_decode_fh expects to find them.
 */
int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
                            struct inode *parent)
{
        struct fid *fid = (void *)fh;
        /* Two words for the object itself, two more when a parent is encoded. */
        const int needed = parent ? 4 : 2;

        /* Too small: report the required length back to the caller. */
        if (*max_len < needed) {
                *max_len = needed;
                return FILEID_INVALID;
        }

        fid->i32.ino = inode->i_ino;
        fid->i32.gen = inode->i_generation;
        if (parent) {
                fid->i32.parent_ino = parent->i_ino;
                fid->i32.parent_gen = parent->i_generation;
        }

        *max_len = needed;
        return parent ? FILEID_INO32_GEN_PARENT : FILEID_INO32_GEN;
}
EXPORT_SYMBOL_GPL(generic_encode_ino32_fh);

/**
 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
 * @sb:                filesystem to do the file handle conversion on
 * @fid:        file handle to convert
 * @fh_len:        length of the file handle in bytes
 * @fh_type:        type of file handle
 * @get_inode:        filesystem callback to retrieve inode
 *
 * This function decodes @fid as long as it has one of the well-known
 * Linux filehandle types and calls @get_inode on it to retrieve the
 * inode for the object specified in the file handle.
 */
struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type, struct inode *(*get_inode)
                        (struct super_block *sb, u64 ino, u32 gen))
{
        struct inode *found = NULL;

        /* Handles shorter than two units are rejected outright. */
        if (fh_len < 2)
                return NULL;

        /* Only the two 32-bit ino/generation layouts are decoded here. */
        if (fh_type == FILEID_INO32_GEN ||
            fh_type == FILEID_INO32_GEN_PARENT)
                found = get_inode(sb, fid->i32.ino, fid->i32.gen);

        return d_obtain_alias(found);
}
EXPORT_SYMBOL_GPL(generic_fh_to_dentry);

/**
 * generic_fh_to_parent - generic helper for the fh_to_parent export operation
 * @sb:                filesystem to do the file handle conversion on
 * @fid:        file handle to convert
 * @fh_len:        length of the file handle in bytes
 * @fh_type:        type of file handle
 * @get_inode:        filesystem callback to retrieve inode
 *
 * This function decodes @fid as long as it has one of the well-known
 * Linux filehandle types and calls @get_inode on it to retrieve the
 * inode for the _parent_ object specified in the file handle if it
 * is specified in the file handle, or NULL otherwise.
 */
struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type, struct inode *(*get_inode)
                        (struct super_block *sb, u64 ino, u32 gen))
{
        struct inode *found = NULL;

        /* A handle of two units or fewer is rejected. */
        if (fh_len < 3)
                return NULL;

        if (fh_type == FILEID_INO32_GEN_PARENT) {
                /* The parent generation is only read when fh_len exceeds 3. */
                u32 gen = (fh_len > 3) ? fid->i32.parent_gen : 0;

                found = get_inode(sb, fid->i32.parent_ino, gen);
        }

        return d_obtain_alias(found);
}
EXPORT_SYMBOL_GPL(generic_fh_to_parent);

/**
 * __generic_file_fsync - generic fsync implementation for simple filesystems
 *
 * @file:        file to synchronize
 * @start:        start offset in bytes
 * @end:        end offset in bytes (inclusive)
 * @datasync:        only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure.
 */
int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
                                 int datasync)
{
        struct inode *inode = file->f_mapping->host;
        int rc;
        int err;

        /* Flush and wait on the data range first; bail out on failure. */
        err = file_write_and_wait_range(file, start, end);
        if (err)
                return err;

        inode_lock(inode);
        rc = sync_mapping_buffers(inode->i_mapping);

        /*
         * Write the inode itself only when it is dirty and, for a
         * datasync request, when the dirtiness matters for data integrity.
         */
        if ((inode->i_state & I_DIRTY_ALL) &&
            (!datasync || (inode->i_state & I_DIRTY_DATASYNC))) {
                err = sync_inode_metadata(inode, 1);
                if (rc == 0)
                        rc = err;
        }
        inode_unlock(inode);

        /* check and advance again to catch errors after syncing out buffers */
        err = file_check_and_advance_wb_err(file);
        return rc ? rc : err;
}
EXPORT_SYMBOL(__generic_file_fsync);

/**
 * generic_file_fsync - generic fsync implementation for simple filesystems
 *                        with flush
 * @file:        file to synchronize
 * @start:        start offset in bytes
 * @end:        end offset in bytes (inclusive)
 * @datasync:        only synchronize essential metadata if true
 *
 */

int generic_file_fsync(struct file *file, loff_t start, loff_t end,
                       int datasync)
{
        struct inode *inode = file->f_mapping->host;
        int rc = __generic_file_fsync(file, start, end, datasync);

        /* Only issue the block device flush when the sync itself succeeded. */
        return rc ? rc : blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_file_fsync);

/**
 * generic_check_addressable - Check addressability of file system
 * @blocksize_bits:        log of file system block size
 * @num_blocks:                number of blocks in file system
 *
 * Determine whether a file system with @num_blocks blocks (and a
 * block size of 2**@blocksize_bits) is addressable by the sector_t
 * and page cache of the system.
 *
 * Return: 0 if the file system is addressable or @num_blocks is 0,
 * -EINVAL if @blocksize_bits is outside the range [9, PAGE_SHIFT], and
 * -EFBIG if the last block or last page index does not fit.
 */
int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
{
        u64 last_fs_block;
        u64 last_fs_page;

        if (unlikely(num_blocks == 0))
                return 0;

        if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT))
                return -EINVAL;

        /*
         * Derive the indices only after @blocksize_bits has been validated:
         * shifting by (PAGE_SHIFT - blocksize_bits) with an out-of-range
         * value would use an invalid shift count.
         */
        last_fs_block = num_blocks - 1;
        last_fs_page = last_fs_block >> (PAGE_SHIFT - blocksize_bits);

        if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
            (last_fs_page > (pgoff_t)(~0ULL))) {
                return -EFBIG;
        }
        return 0;
}
EXPORT_SYMBOL(generic_check_addressable);

/*
 * No-op implementation of ->fsync for in-memory filesystems.
 */
int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
        /* All arguments are ignored; success is reported unconditionally. */
        return 0;
}
EXPORT_SYMBOL(noop_fsync);

/* Placeholder ->direct_IO method: ignores its arguments, returns -EINVAL. */
ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        /*
         * iomap based filesystems support direct I/O without need for
         * this callback. However, it still needs to be set in
         * inode->a_ops so that open/fcntl know that direct I/O is
         * generally supported.
         */
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(noop_direct_IO);

/* Because kfree isn't assignment-compatible with void(void*) ;-/ */
void kfree_link(void *p)
{
        /* Thin wrapper so kfree() can be used where a void (*)(void *) is needed. */
        kfree(p);
}
EXPORT_SYMBOL(kfree_link);

/*
 * Allocate a private pseudo inode on superblock @s, owned by the caller's
 * fsuid/fsgid.  Returns the inode or ERR_PTR(-ENOMEM).
 */
struct inode *alloc_anon_inode(struct super_block *s)
{
        static const struct address_space_operations anon_aops = {
                .dirty_folio        = noop_dirty_folio,
        };
        struct inode *anon;

        anon = new_inode_pseudo(s);
        if (!anon)
                return ERR_PTR(-ENOMEM);

        anon->i_ino = get_next_ino();
        anon->i_mapping->a_ops = &anon_aops;

        /*
         * Start out dirty: mark_inode_dirty() will then believe the inode
         * is already on the dirty list and never move it there.
         */
        anon->i_state = I_DIRTY;

        anon->i_mode = S_IRUSR | S_IWUSR;
        anon->i_uid = current_fsuid();
        anon->i_gid = current_fsgid();
        anon->i_flags |= S_PRIVATE;
        simple_inode_init_ts(anon);

        return anon;
}
EXPORT_SYMBOL(alloc_anon_inode);

/**
 * simple_nosetlease - generic helper for prohibiting leases
 * @filp: file pointer
 * @arg: type of lease to obtain
 * @flp: new lease supplied for insertion
 * @priv: private data for lm_setup operation
 *
 * Generic helper for filesystems that do not wish to allow leases to be set.
 * All arguments are ignored and it just returns -EINVAL.
 */
int
simple_nosetlease(struct file *filp, int arg, struct file_lease **flp,
                  void **priv)
{
        /* Leases are never granted: refuse regardless of the arguments. */
        return -EINVAL;
}
EXPORT_SYMBOL(simple_nosetlease);

/**
 * simple_get_link - generic helper to get the target of "fast" symlinks
 * @dentry: not used here
 * @inode: the symlink inode
 * @done: not used here
 *
 * Generic helper for filesystems to use for symlink inodes where a pointer to
 * the symlink target is stored in ->i_link.  NOTE: this isn't normally called,
 * since as an optimization the path lookup code uses any non-NULL ->i_link
 * directly, without calling ->get_link().  But ->get_link() still must be set,
 * to mark the inode_operations as being for a symlink.
 *
 * Return: the symlink target
 */
const char *simple_get_link(struct dentry *dentry, struct inode *inode,
                            struct delayed_call *done)
{
        /* The target is returned straight from the inode; @dentry and @done are unused. */
        return inode->i_link;
}
EXPORT_SYMBOL(simple_get_link);

/* Inode operations for symlinks whose target is stored in ->i_link. */
const struct inode_operations simple_symlink_inode_operations = {
        .get_link = simple_get_link,
};
EXPORT_SYMBOL(simple_symlink_inode_operations);

/*
 * Operations for a permanently empty directory.
 */
/* Lookup in an empty directory never finds anything: always -ENOENT. */
static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        return ERR_PTR(-ENOENT);
}

/*
 * Fill @stat from the directory inode.  The attributes are reported through
 * nop_mnt_idmap rather than the @idmap passed in.
 */
static int empty_dir_getattr(struct mnt_idmap *idmap,
                             const struct path *path, struct kstat *stat,
                             u32 request_mask, unsigned int query_flags)
{
        generic_fillattr(&nop_mnt_idmap, request_mask,
                         d_inode(path->dentry), stat);
        return 0;
}

/* Attribute changes are refused unconditionally with -EPERM. */
static int empty_dir_setattr(struct mnt_idmap *idmap,
                             struct dentry *dentry, struct iattr *attr)
{
        return -EPERM;
}

/* Listing extended attributes always fails with -EOPNOTSUPP. */
static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
{
        return -EOPNOTSUPP;
}

/*
 * Inode operations installed by make_empty_dir_inode(); their address is
 * also used by is_empty_dir_inode() to recognise such inodes.
 */
static const struct inode_operations empty_dir_inode_operations = {
        .lookup                = empty_dir_lookup,
        .permission        = generic_permission,
        .setattr        = empty_dir_setattr,
        .getattr        = empty_dir_getattr,
        .listxattr        = empty_dir_listxattr,
};

/* Seek within an empty directory, passing 2 as both size arguments. */
static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
{
        /* An empty directory has two entries . and .. at offsets 0 and 1 */
        return generic_file_llseek_size(file, offset, whence, 2, 2);
}

static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
{
        dir_emit_dots(file, ctx);
        return 0;
}

/*
 * File operations installed by make_empty_dir_inode(); their address is
 * also used by is_empty_dir_inode() to recognise such inodes.
 */
static const struct file_operations empty_dir_operations = {
        .llseek                = empty_dir_llseek,
        .read                = generic_read_dir,
        .iterate_shared        = empty_dir_readdir,
        .fsync                = noop_fsync,
};


/*
 * Turn @inode into a root-owned, read/execute-only directory that uses the
 * empty-directory inode and file operations.
 */
void make_empty_dir_inode(struct inode *inode)
{
        /* Identity: a directory with two links, owned by global root. */
        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
        set_nlink(inode, 2);
        inode->i_uid = GLOBAL_ROOT_UID;
        inode->i_gid = GLOBAL_ROOT_GID;

        /* Geometry: no device, no size, no blocks. */
        inode->i_rdev = 0;
        inode->i_size = 0;
        inode->i_blocks = 0;
        inode->i_blkbits = PAGE_SHIFT;

        /* Behaviour: empty-directory methods, xattr support flag cleared. */
        inode->i_fop = &empty_dir_operations;
        inode->i_op = &empty_dir_inode_operations;
        inode->i_opflags &= ~IOP_XATTR;
}

bool is_empty_dir_inode(struct inode *inode)
{
        return (inode->i_fop == &empty_dir_operations) &&
                (inode->i_op == &empty_dir_inode_operations);
}

#if IS_ENABLED(CONFIG_UNICODE)
/**
 * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
 * @dentry:        dentry whose name we are checking against
 * @len:        len of name of dentry
 * @str:        str pointer to name of dentry
 * @name:        Name to compare against
 *
 * Return: 0 if names match, 1 if mismatch, or -ERRNO
 */
static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
                                const char *str, const struct qstr *name)
{
        const struct dentry *parent;
        const struct inode *dir;
        char strbuf[DNAME_INLINE_LEN];
        struct qstr qstr;

        /*
         * Attempt a case-sensitive match first. It is cheaper and
         * should cover most lookups, including all the sane
         * applications that expect a case-sensitive filesystem.
         *
         * This comparison is safe under RCU because the caller
         * guarantees the consistency between str and len. See
         * __d_lookup_rcu_op_compare() for details.
         */
        if (len == name->len && !memcmp(str, name->name, len))
                return 0;

        /*
         * The exact match failed.  A case-insensitive comparison is only
         * attempted when the parent directory has an inode and that inode
         * is casefolded; otherwise report a mismatch.
         */
        parent = READ_ONCE(dentry->d_parent);
        dir = READ_ONCE(parent->d_inode);
        if (!dir || !IS_CASEFOLDED(dir))
                return 1;

        /*
         * If the dentry name is stored in-line, then it may be concurrently
         * modified by a rename.  If this happens, the VFS will eventually retry
         * the lookup, so it doesn't matter what ->d_compare() returns.
         * However, it's unsafe to call utf8_strncasecmp() with an unstable
         * string.  Therefore, we have to copy the name into a temporary buffer.
         */
        if (len <= DNAME_INLINE_LEN - 1) {
                memcpy(strbuf, str, len);
                strbuf[len] = 0;
                str = strbuf;
                /* prevent compiler from optimizing out the temporary buffer */
                barrier();
        }
        qstr.len = len;
        qstr.name = str;

        /* Compare using the superblock's encoding table. */
        return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
}

/**
 * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
 * @dentry:        dentry of the parent directory
 * @str:        qstr of name whose hash we should fill in
 *
 * Return: 0 if hash was successful or unchanged, and -EINVAL on error
 */
static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
        const struct inode *dir = READ_ONCE(dentry->d_inode);
        struct super_block *sb = dentry->d_sb;
        const struct unicode_map *um = sb->s_encoding;
        int err;

        /* Leave the hash alone unless the directory is casefolded. */
        if (!dir || !IS_CASEFOLDED(dir))
                return 0;

        err = utf8_casefold_hash(um, dentry, str);

        /* A hashing failure is only an error under strict encoding. */
        return (err < 0 && sb_has_strict_encoding(sb)) ? -EINVAL : 0;
}

/*
 * Dentry operations for superblocks with an encoding set; with
 * CONFIG_FS_ENCRYPTION they also carry the fscrypt revalidation hook.
 */
static const struct dentry_operations generic_ci_dentry_ops = {
        .d_hash = generic_ci_d_hash,
        .d_compare = generic_ci_d_compare,
#ifdef CONFIG_FS_ENCRYPTION
        .d_revalidate = fscrypt_d_revalidate,
#endif
};
#endif

#ifdef CONFIG_FS_ENCRYPTION
/* Dentry operations providing only the fscrypt revalidation hook. */
static const struct dentry_operations generic_encrypted_dentry_ops = {
        .d_revalidate = fscrypt_d_revalidate,
};
#endif

/**
 * generic_set_sb_d_ops - helper for choosing the set of
 * filesystem-wide dentry operations for the enabled features
 * @sb: superblock to be configured
 *
 * Filesystems supporting casefolding and/or fscrypt can call this
 * helper at mount-time to configure sb->s_d_op to best set of dentry
 * operations required for the enabled features. The helper must be
 * called after these have been configured, but before the root dentry
 * is created.
 */
void generic_set_sb_d_ops(struct super_block *sb)
{
#if IS_ENABLED(CONFIG_UNICODE)
        /* An encoding on the superblock selects the casefolding operations. */
        if (sb->s_encoding) {
                sb->s_d_op = &generic_ci_dentry_ops;
                return;
        }
#endif
#ifdef CONFIG_FS_ENCRYPTION
        /* Otherwise, fscrypt operations on the superblock select the encrypted set. */
        if (sb->s_cop) {
                sb->s_d_op = &generic_encrypted_dentry_ops;
                return;
        }
#endif
        /* Neither feature applies: sb->s_d_op is left untouched. */
}
EXPORT_SYMBOL(generic_set_sb_d_ops);

/**
 * inode_maybe_inc_iversion - increments i_version
 * @inode: inode with the i_version that should be updated
 * @force: increment the counter even if it's not necessary?
 *
 * Every time the inode is modified, the i_version field must be seen to have
 * changed by any observer.
 *
 * If "force" is set or the QUERIED flag is set, then ensure that we increment
 * the value, and clear the queried flag.
 *
 * In the common case where neither is set, then we can return "false" without
 * updating i_version.
 *
 * If this function returns false, and no other metadata has changed, then we
 * can avoid logging the metadata.
 */
bool inode_maybe_inc_iversion(struct inode *inode, bool force)
{
        u64 cur, new;

        /*
         * The i_version field is not strictly ordered with any other inode
         * information, but the legacy inode_inc_iversion code used a spinlock
         * to serialize increments.
         *
         * Here, we add full memory barriers to ensure that any de-facto
         * ordering with other info is preserved.
         *
         * This barrier pairs with the barrier in inode_query_iversion()
         */
        smp_mb();
        cur = inode_peek_iversion_raw(inode);
        do {
                /* If flag is clear then we needn't do anything */
                if (!force && !(cur & I_VERSION_QUERIED))
                        return false;

                /* Since lowest bit is flag, add 2 to avoid it */
                new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
                /* Retry with the re-evaluated value until the swap succeeds. */
        } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
        return true;
}
EXPORT_SYMBOL(inode_maybe_inc_iversion);

/**
 * inode_query_iversion - read i_version for later use
 * @inode: inode from which i_version should be read
 *
 * Read the inode i_version counter. This should be used by callers that wish
 * to store the returned i_version for later comparison. This will guarantee
 * that a later query of the i_version will result in a different value if
 * anything has changed.
 *
 * In this implementation, we fetch the current value, set the QUERIED flag and
 * then try to swap it into place with a cmpxchg, if it wasn't already set. If
 * that fails, we try again with the newly fetched value from the cmpxchg.
 */
u64 inode_query_iversion(struct inode *inode)
{
        u64 cur, new;

        cur = inode_peek_iversion_raw(inode);
        do {
                /* If flag is already set, then no need to swap */
                if (cur & I_VERSION_QUERIED) {
                        /*
                         * This barrier (and the implicit barrier in the
                         * cmpxchg below) pairs with the barrier in
                         * inode_maybe_inc_iversion().
                         */
                        smp_mb();
                        break;
                }

                new = cur | I_VERSION_QUERIED;
        } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
        /* Shift the flag bit out so only the counter value is returned. */
        return cur >> I_VERSION_QUERIED_SHIFT;
}
EXPORT_SYMBOL(inode_query_iversion);

/*
 * Finish a direct write that fell back to buffered I/O for its tail: write
 * the buffered range out, drop it from the page cache and return the
 * combined byte count (or the best available partial result / error).
 */
ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
                ssize_t direct_written, ssize_t buffered_written)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        const loff_t first = iocb->ki_pos - buffered_written;
        const loff_t last = iocb->ki_pos - 1;
        int rc;

        /*
         * The buffered fallback failed.  Report the bytes that direct I/O
         * managed to write, or the error itself if there were none.
         *
         * Note that this differs from normal direct-io semantics, which will
         * return -EFOO even if some bytes were written.
         */
        if (unlikely(buffered_written < 0))
                return direct_written ? direct_written : buffered_written;

        /*
         * To preserve the expected O_DIRECT semantics the page cache pages
         * must reach disk and then be invalidated.
         */
        rc = filemap_write_and_wait_range(mapping, first, last);
        if (rc < 0) {
                /*
                 * How much was written is unknown, so rewind the position
                 * and only account for the direct-written bytes.
                 */
                iocb->ki_pos -= buffered_written;
                return direct_written ? direct_written : rc;
        }

        invalidate_mapping_pages(mapping, first >> PAGE_SHIFT,
                                 last >> PAGE_SHIFT);
        return direct_written + buffered_written;
}
EXPORT_SYMBOL_GPL(direct_write_fallback);

/**
 * simple_inode_init_ts - initialize the timestamps for a new inode
 * @inode: inode to be initialized
 *
 * When a new inode is created, most filesystems set the timestamps to the
 * current time. Add a helper to do this.
 */
struct timespec64 simple_inode_init_ts(struct inode *inode)
{
        struct timespec64 ts = inode_set_ctime_current(inode);

        inode_set_atime_to_ts(inode, ts);
        inode_set_mtime_to_ts(inode, ts);
        return ts;
}
EXPORT_SYMBOL(simple_inode_init_ts);

/*
 * Try to take a reference on @stashed under RCU.  Returns the dentry with
 * the reference held, or NULL if there is none or it is already dead.
 */
static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
{
        struct dentry *candidate;

        guard(rcu)();
        candidate = READ_ONCE(stashed);
        if (candidate && lockref_get_not_dead(&candidate->d_lockref))
                return candidate;
        return NULL;
}

/*
 * Build a fresh anonymous dentry plus inode on @sb for @data and remember
 * @stashed in the dentry's d_fsdata.  Returns the dentry or an ERR_PTR().
 */
static struct dentry *prepare_anon_dentry(struct dentry **stashed,
                                          struct super_block *sb,
                                          void *data)
{
        const struct stashed_operations *sops = sb->s_fs_info;
        struct dentry *anon;
        struct inode *inode;
        int err;

        inode = new_inode_pseudo(sb);
        if (!inode) {
                /* No inode was created, so drop @data here. */
                sops->put_data(data);
                return ERR_PTR(-ENOMEM);
        }

        inode->i_mode = S_IFREG;
        inode->i_flags |= S_IMMUTABLE;
        simple_inode_init_ts(inode);

        err = sops->init_inode(inode, data);
        if (err < 0)
                goto drop_inode;

        /* Notice when this is changed. */
        WARN_ON_ONCE(!S_ISREG(inode->i_mode));
        WARN_ON_ONCE(!IS_IMMUTABLE(inode));

        anon = d_alloc_anon(sb);
        if (!anon) {
                err = -ENOMEM;
                goto drop_inode;
        }

        /* Store address of location where dentry's supposed to be stashed. */
        anon->d_fsdata = stashed;

        /* @data is now owned by the fs */
        d_instantiate(anon, inode);
        return anon;

drop_inode:
        iput(inode);
        return ERR_PTR(err);
}

/*
 * Install @dentry in *@stashed unless a live dentry is already there.
 *
 * Returns @dentry when it was installed, or the previously stashed dentry
 * (with a reference taken via lockref_get_not_dead()) when that one is
 * still usable.  The loop repeats until one of the two outcomes occurs.
 */
static struct dentry *stash_dentry(struct dentry **stashed,
                                   struct dentry *dentry)
{
        guard(rcu)();
        for (;;) {
                struct dentry *old;

                /* Assume any old dentry was cleared out. */
                old = cmpxchg(stashed, NULL, dentry);
                if (likely(!old))
                        return dentry;

                /* Check if somebody else installed a reusable dentry. */
                if (lockref_get_not_dead(&old->d_lockref))
                        return old;

                /* There's an old dead dentry there, try to take it over. */
                if (likely(try_cmpxchg(stashed, &old, dentry)))
                        return dentry;
        }
}

/**
 * path_from_stashed - create path from stashed or new dentry
 * @stashed:    where to retrieve or stash dentry
 * @mnt:        mnt of the filesystems to use
 * @data:       data to store in inode->i_private
 * @path:       path to create
 *
 * The function tries to retrieve a stashed dentry from @stashed. If the dentry
 * is still valid then it will be reused. If the dentry cannot be reused, the
 * function will allocate a new dentry and inode. It will then check again
 * whether it can reuse an existing dentry in case one has been added in the
 * meantime, or update @stashed with the newly added dentry.
 *
 * Special-purpose helper for nsfs and pidfs.
 *
 * Return: On success zero and on failure a negative error is returned.
 */
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
                      struct path *path)
{
        const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
        struct dentry *fresh;

        /* See if dentry can be reused. */
        path->dentry = get_stashed_dentry(*stashed);
        if (path->dentry) {
                /* The reused dentry is returned, so drop the caller's @data. */
                sops->put_data(data);
        } else {
                /* Allocate a new dentry. */
                fresh = prepare_anon_dentry(stashed, mnt->mnt_sb, data);
                if (IS_ERR(fresh))
                        return PTR_ERR(fresh);

                /* Added a new dentry. @data is now owned by the filesystem. */
                path->dentry = stash_dentry(stashed, fresh);
                /* Somebody else's dentry won: release the one just built. */
                if (path->dentry != fresh)
                        dput(fresh);
        }

        WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
        WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
        path->mnt = mntget(mnt);
        return 0;
}

/*
 * Clear the stash slot recorded in @dentry's d_fsdata, but only if it
 * still points at @dentry.
 */
void stashed_dentry_prune(struct dentry *dentry)
{
        struct dentry **slot = dentry->d_fsdata;

        /* A dentry without a recorded slot is unexpected: warn and bail. */
        if (WARN_ON_ONCE(!slot))
                return;

        /* Nothing to do for a dentry that has no inode. */
        if (!d_inode(dentry))
                return;

        /*
         * Someone else may already have cleared the slot and stashed
         * their own dentry there, so only replace our own @dentry.
         */
        cmpxchg(slot, dentry, NULL);
}







































































    5 





    4 
    4 





































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
// SPDX-License-Identifier: GPL-2.0
/*
 *  Implement mseal() syscall.
 *
 *  Copyright (c) 2023,2024 Google, Inc.
 *
 *  Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_SEALED);
}

/* Mark @vma as sealed by setting VM_SEALED through vm_flags_set(). */
static inline void set_vma_sealed(struct vm_area_struct *vma)
{
        vm_flags_set(vma, VM_SEALED);
}

/*
 * Tell whether @vma may be modified.
 * Returns false for a sealed vma and true otherwise.
 */
static bool can_modify_vma(struct vm_area_struct *vma)
{
        return !unlikely(vma_is_sealed(vma));
}

/*
 * Return true when @behavior is one of the madvise() operations treated as
 * a discard here: MADV_FREE, MADV_DONTNEED, MADV_DONTNEED_LOCKED,
 * MADV_REMOVE, MADV_DONTFORK or MADV_WIPEONFORK.
 *
 * The MADV_* advice values are plain enumerated integers, not bit flags,
 * so they have to be compared one by one. Testing @behavior against an
 * OR-ed mask would also match unrelated advice whose numeric value merely
 * shares a bit with one of the above.
 */
static bool is_madv_discard(int behavior)
{
        switch (behavior) {
        case MADV_FREE:
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
        case MADV_REMOVE:
        case MADV_DONTFORK:
        case MADV_WIPEONFORK:
                return true;
        default:
                return false;
        }
}

/*
 * Return true when @vma is a private anonymous mapping that is not
 * writable: either VM_WRITE is clear, or arch_vma_access_permitted()
 * refuses the access (e.g. PKRU does not allow writing).
 */
static bool is_ro_anon(struct vm_area_struct *vma)
{
        /* File-backed or shared mappings are not what we are looking for. */
        if (vma->vm_file)
                return false;
        if (vma->vm_flags & VM_SHARED)
                return false;

        /* Not writable according to the vma flags. */
        if (!(vma->vm_flags & VM_WRITE))
                return true;

        /* Writable by flags; the architecture check decides. */
        return !arch_vma_access_permitted(vma, true, false, false);
}

/*
 * Check if the vmas of a memory range are allowed to be modified.
 * the memory ranger can have a gap (unallocated memory).
 * return true, if it is allowed.
 */
bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
{
        struct vm_area_struct *vma;

        VMA_ITERATOR(vmi, mm, start);

        /* going through each vma to check. */
        for_each_vma_range(vmi, vma, end) {
                if (unlikely(!can_modify_vma(vma)))
                        return false;
        }

        /* Allow by default. */
        return true;
}

/*
 * Check if the vmas of a memory range are allowed to be modified by madvise.
 * the memory ranger can have a gap (unallocated memory).
 * return true, if it is allowed.
 */
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
                int behavior)
{
        struct vm_area_struct *vma;

        VMA_ITERATOR(vmi, mm, start);

        if (!is_madv_discard(behavior))
                return true;

        /* going through each vma to check. */
        for_each_vma_range(vmi, vma, end)
                if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
                        return false;

        /* Allow by default. */
        return true;
}

/*
 * Seal the part [start, end) of @vma by switching it to @newflags.
 *
 * If @newflags already equals the current flags nothing is done. Otherwise
 * the vma is passed to vma_modify_flags() and the vma it returns is marked
 * sealed. *@prev is updated to the resulting vma pointer on every path,
 * including the error path.
 *
 * Returns 0 on success or the error encoded in the pointer returned by
 * vma_modify_flags().
 */
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
                struct vm_area_struct **prev, unsigned long start,
                unsigned long end, vm_flags_t newflags)
{
        int err = 0;

        if (newflags != vma->vm_flags) {
                vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
                if (IS_ERR(vma))
                        err = PTR_ERR(vma);
                else
                        set_vma_sealed(vma);
        }

        *prev = vma;
        return err;
}

/*
 * Validate an address range for do_mseal():
 * 1> start lies inside a vma.
 * 2> end is covered by a vma.
 * 3> there is no gap (unallocated address) between start and end.
 *
 * Returns 0 when the whole range is covered, -ENOMEM otherwise.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
        struct vm_area_struct *vma;
        unsigned long covered = start;

        VMA_ITERATOR(vmi, current->mm, start);

        for_each_vma_range(vmi, vma, end) {
                /* A vma starting beyond what is covered so far means a hole. */
                if (covered < vma->vm_start)
                        break;

                /* This vma reaches the end of the range: fully covered. */
                if (end <= vma->vm_end)
                        return 0;

                covered = vma->vm_end;
        }

        return -ENOMEM;
}

/*
 * Apply sealing to every vma in [start, end) of the current process.
 *
 * Each vma in the range gets VM_SEALED added to its flags via
 * mseal_fixup(); the piece handed to mseal_fixup() is clamped to @end.
 * The caller is expected to have validated the range with check_mm_seal()
 * first.
 *
 * Returns 0 on success, or the first error reported by mseal_fixup().
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
        unsigned long nstart;
        struct vm_area_struct *vma, *prev;

        VMA_ITERATOR(vmi, current->mm, start);

        vma = vma_iter_load(&vmi);
        /*
         * Note: check_mm_seal() should already have handled the ENOMEM case,
         * so vma should not be NULL here; the same holds for the other
         * ENOMEM cases.
         */
        prev = vma_prev(&vmi);
        /* If @start falls inside the first vma, that vma itself acts as prev. */
        if (start > vma->vm_start)
                prev = vma;

        nstart = start;
        for_each_vma_range(vmi, vma, end) {
                int error;
                unsigned long tmp;
                vm_flags_t newflags;

                newflags = vma->vm_flags | VM_SEALED;
                /* Do not seal past the requested end of the range. */
                tmp = vma->vm_end;
                if (tmp > end)
                        tmp = end;
                error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
                if (error)
                        return error;
                /* NOTE(review): presumably the end of the vma just processed - confirm. */
                nstart = vma_iter_end(&vmi);
        }

        return 0;
}

/*
 * mseal(2) seals the VM's metadata from selected syscalls.
 *
 * start/len_in: VM address range.
 *
 *  The address range must meet:
 *   start must be in a valid VMA.
 *   end (start + len) must be in a valid VMA.
 *   no gap (unallocated memory) between start and end.
 *   start must be page aligned.
 *
 *  len_in: rounded up to a page boundary implicitly.
 *
 *  The following VMA operations are blocked after sealing:
 *   1> Unmapping, moving to another location, and shrinking the size,
 *      via munmap() and mremap(); these can leave an empty space that
 *      could then be replaced with a VMA with a new set of attributes.
 *   2> Moving or expanding a different vma into the current location,
 *      via mremap().
 *   3> Modifying a VMA via mmap(MAP_FIXED).
 *   4> Size expansion, via mremap(). It does not appear to pose any
 *      specific risk to sealed VMAs, but is included anyway because the
 *      use case is unclear. In any case, users can rely on merging to
 *      expand a sealed VMA.
 *   5> mprotect() and pkey_mprotect().
 *   6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) for
 *      anonymous memory, when users don't have write permission to the
 *      memory. Those behaviors can alter region contents by discarding
 *      pages, effectively a memset(0) for anonymous memory.
 *
 *  flags: reserved.
 *
 * Return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   address range (start + len) overflows.
 *  -ENOMEM:
 *   start is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) between start and end.
 *  -EPERM:
 *   on 32 bit architectures, sealing is not supported.
 *  -EINTR:
 *   taking the mmap write lock was interrupted.
 *
 * Note:
 *  mseal(2) can be called multiple times; adding a seal on already sealed
 *  memory is a no-op (no error).
 *
 *  unseal() is not supported.
 */
static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        unsigned long end;
        size_t len;
        int ret;

        ret = can_do_mseal(flags);
        if (ret)
                return ret;

        start = untagged_addr(start);
        if (!PAGE_ALIGNED(start))
                return -EINVAL;

        /* A non-zero length that page-aligns to zero wrapped around. */
        len = PAGE_ALIGN(len_in);
        if (!len && len_in)
                return -EINVAL;

        end = start + len;
        if (end < start)
                return -EINVAL;

        /* Empty range: nothing to seal. */
        if (end == start)
                return 0;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /*
         * Two passes. The first one validates the range so that an error
         * in the input (e.g. ENOMEM) does not leave it partially sealed.
         * The second one applies the seal and should succeed unless
         * vma_modify_flags() fails, e.g. on a merge/split error or when
         * the process reaches the maximum number of VMAs; those cases
         * should be rare.
         */
        ret = check_mm_seal(start, end);
        if (!ret)
                ret = apply_mm_seal(start, end);

        mmap_write_unlock(mm);
        return ret;
}

/* mseal() syscall entry point: forwards its three arguments to do_mseal(). */
SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
                flags)
{
        return do_mseal(start, len, flags);
}




















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MEMREMAP_H_
#define _LINUX_MEMREMAP_H_

#include <linux/mmzone.h>
#include <linux/range.h>
#include <linux/ioport.h>
#include <linux/percpu-refcount.h>

struct resource;
struct device;

/**
 * struct vmem_altmap - pre-allocated storage for vmemmap_populate
 * @base_pfn: base of the entire dev_pagemap mapping
 * @end_pfn: NOTE(review): not documented originally; presumably the last pfn
 *        of the mapping - confirm against the users
 * @reserve: pages mapped, but reserved for driver use (relative to @base_pfn)
 * @free: free pages set aside in the mapping for memmap storage
 * @align: pages reserved to meet allocation alignments
 * @alloc: track pages consumed, private to vmemmap_populate()
 * @inaccessible: NOTE(review): not documented originally and its meaning is
 *        not visible here - confirm against the users
 */
struct vmem_altmap {
        unsigned long base_pfn;
        const unsigned long end_pfn;
        const unsigned long reserve;
        unsigned long free;
        unsigned long align;
        unsigned long alloc;
        bool inaccessible;
};

/*
 * Specialize ZONE_DEVICE memory into multiple types, each with a different
 * usage.
 *
 * MEMORY_DEVICE_PRIVATE:
 * Device memory that is not directly addressable by the CPU: CPU can neither
 * read nor write private memory. In this case, we do still have struct pages
 * backing the device memory. Doing so simplifies the implementation, but it is
 * important to remember that there are certain points at which the struct page
 * must be treated as an opaque object, rather than a "normal" struct page.
 *
 * A more complete discussion of unaddressable memory may be found in
 * include/linux/hmm.h and Documentation/mm/hmm.rst.
 *
 * MEMORY_DEVICE_COHERENT:
 * Device memory that is cache coherent from device and CPU point of view. This
 * is used on platforms that have an advanced system bus (like CAPI or CXL). A
 * driver can hotplug the device memory using ZONE_DEVICE and with that memory
 * type. Any page of a process can be migrated to such memory. However no one
 * should be allowed to pin such memory so that it can always be evicted.
 *
 * MEMORY_DEVICE_FS_DAX:
 * Host memory that has similar access semantics as System RAM i.e. DMA
 * coherent and supports page pinning. In support of coordinating page
 * pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a
 * wakeup event whenever a page is unpinned and becomes idle. This
 * wakeup is used to coordinate physical address space management (ex:
 * fs truncate/hole punch) vs pinned pages (ex: device dma).
 *
 * MEMORY_DEVICE_GENERIC:
 * Host memory that has similar access semantics as System RAM i.e. DMA
 * coherent and supports page pinning. This is for example used by DAX devices
 * that expose memory using a character device.
 *
 * MEMORY_DEVICE_PCI_P2PDMA:
 * Device memory residing in a PCI BAR intended for use with Peer-to-Peer
 * transactions.
 */
enum memory_type {
        /* 0 is reserved to catch uninitialized type fields */
        MEMORY_DEVICE_PRIVATE = 1,
        MEMORY_DEVICE_COHERENT,
        MEMORY_DEVICE_FS_DAX,
        MEMORY_DEVICE_GENERIC,
        MEMORY_DEVICE_PCI_P2PDMA,
};

/* Method table referenced through the @ops member of struct dev_pagemap. */
struct dev_pagemap_ops {
        /*
         * Called once the page refcount reaches 0.  The reference count will be
         * reset to one by the core code after the method is called to prepare
         * for handing out the page again.
         */
        void (*page_free)(struct page *page);

        /*
         * Used for private (un-addressable) device memory only.  Must migrate
         * the page back to a CPU accessible page.
         */
        vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);

        /*
         * Handle a memory failure that happens on a range of pfns.  Notify
         * the processes that are using these pfns, and try to recover the
         * data on them if necessary.  The mf_flags value is passed through
         * the whole notify routine down to the recovery function.
         *
         * When this is not implemented, or it returns -EOPNOTSUPP, the caller
         * will fall back to a common handler called mf_generic_kill_procs().
         */
        int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn,
                              unsigned long nr_pages, int mf_flags);
};

/* dev_pagemap.flags bit: the embedded @altmap is valid (see pgmap_altmap()). */
#define PGMAP_ALTMAP_VALID        (1 << 0)

/**
 * struct dev_pagemap - metadata for ZONE_DEVICE mappings
 * @altmap: pre-allocated/reserved memory for vmemmap allocations
 * @ref: reference count that pins the devm_memremap_pages() mapping
 * @done: completion for @ref
 * @type: memory type: see MEMORY_* above in memremap.h
 * @flags: PGMAP_* flags to specify detailed behavior
 * @vmemmap_shift: structural definition of how the vmemmap page metadata
 *        is populated, specifically the metadata page order.
 *        A zero value (default) uses base pages as the vmemmap metadata
 *        representation. A bigger value will set up compound struct pages
 *        of the requested order value.
 * @ops: method table
 * @owner: an opaque pointer identifying the entity that manages this
 *        instance.  Used by various helpers to make sure that no
 *        foreign ZONE_DEVICE memory is accessed.
 * @nr_range: number of ranges to be mapped
 * @range: range to be mapped when nr_range == 1
 * @ranges: array of ranges to be mapped when nr_range > 1
 */
struct dev_pagemap {
        struct vmem_altmap altmap;
        struct percpu_ref ref;
        struct completion done;
        enum memory_type type;
        unsigned int flags;
        unsigned long vmemmap_shift;
        const struct dev_pagemap_ops *ops;
        void *owner;
        int nr_range;
        union {
                struct range range;
                DECLARE_FLEX_ARRAY(struct range, ranges);
        };
};

static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap)
{
        return pgmap->ops && pgmap->ops->memory_failure;
}

/*
 * Return the altmap embedded in @pgmap, or NULL when PGMAP_ALTMAP_VALID is
 * not set in its flags.
 */
static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
{
        return (pgmap->flags & PGMAP_ALTMAP_VALID) ? &pgmap->altmap : NULL;
}

/*
 * Return 2^vmemmap_shift for @pgmap, i.e. the count implied by the
 * metadata page order stored in @pgmap->vmemmap_shift.
 *
 * The shift is performed on an unsigned long constant so the computation
 * has the same type as the return value instead of being evaluated as a
 * signed int first.
 */
static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
{
        return 1UL << pgmap->vmemmap_shift;
}

static inline bool is_device_private_page(const struct page *page)
{
        return IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
                is_zone_device_page(page) &&
                page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}

/* Folio flavour of is_device_private_page(), applied to @folio->page. */
static inline bool folio_is_device_private(const struct folio *folio)
{
        const struct page *page = &folio->page;

        return is_device_private_page(page);
}

static inline bool is_pci_p2pdma_page(const struct page *page)
{
        return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
                is_zone_device_page(page) &&
                page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
}

static inline bool is_device_coherent_page(const struct page *page)
{
        return is_zone_device_page(page) &&
                page->pgmap->type == MEMORY_DEVICE_COHERENT;
}

/* Folio flavour of is_device_coherent_page(), applied to @folio->page. */
static inline bool folio_is_device_coherent(const struct folio *folio)
{
        const struct page *page = &folio->page;

        return is_device_coherent_page(page);
}

#ifdef CONFIG_ZONE_DEVICE
/*
 * ZONE_DEVICE interfaces. With CONFIG_ZONE_DEVICE enabled these are only
 * declared here; the definitions are not part of this header.
 */
void zone_device_page_init(struct page *page);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
                struct dev_pagemap *pgmap);
bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);

unsigned long memremap_compat_align(void);
#else
/*
 * !CONFIG_ZONE_DEVICE stub: warns once and returns ERR_PTR(-ENXIO).
 */
static inline void *devm_memremap_pages(struct device *dev,
                struct dev_pagemap *pgmap)
{
        /*
         * Fail attempts to call devm_memremap_pages() without
         * ZONE_DEVICE support enabled, this requires callers to fall
         * back to plain devm_memremap() based on config
         */
        WARN_ON_ONCE(1);
        return ERR_PTR(-ENXIO);
}

/* !CONFIG_ZONE_DEVICE stub: nothing to unmap, so this does nothing. */
static inline void devm_memunmap_pages(struct device *dev,
                struct dev_pagemap *pgmap)
{
}

/* !CONFIG_ZONE_DEVICE stub: always returns NULL. */
static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
                struct dev_pagemap *pgmap)
{
        return NULL;
}

/* !CONFIG_ZONE_DEVICE stub: always returns false. */
static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
{
        return false;
}

/*
 * !CONFIG_ZONE_DEVICE stub: when memremap_pages() is disabled, all archs
 * can remap a single page, so the alignment is PAGE_SIZE.
 */
static inline unsigned long memremap_compat_align(void)
{
        return PAGE_SIZE;
}
#endif /* CONFIG_ZONE_DEVICE */

/* Drop one reference on @pgmap->ref; a NULL @pgmap is ignored. */
static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
{
        if (!pgmap)
                return;

        percpu_ref_put(&pgmap->ref);
}

#endif /* _LINUX_MEMREMAP_H_ */









































































































































































































































































































































































































































































































    7 

    9 

















































































    8 





    8 




    9 


    8 

    7 
    7 

    7 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
/*
 *  Generic process-grouping system.
 *
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
 *  Notifications support
 *  Copyright (C) 2009 Nokia Corporation
 *  Author: Kirill A. Shutemov
 *
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  ---------------------------------------------------
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

/*
 * Message-format hook: prepends KBUILD_MODNAME and ": " to the format
 * string it is given.  It is defined ahead of every #include in this file.
 * NOTE(review): presumably so that the printk headers pick it up for the
 * pr_*() helpers -- confirm against <linux/printk.h>.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "cgroup-internal.h"

#include <linux/bpf-cgroup.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/hashtable.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/psi.h>
#include <net/sock.h>

/*
 * CREATE_TRACE_POINTS is defined immediately before the cgroup trace
 * header is included, so the define must stay ahead of the #include.
 * NOTE(review): by kernel tracepoint convention this makes this file the
 * one that instantiates the cgroup tracepoints -- confirm against the
 * tracepoint documentation.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>

/*
 * Size bound for a cgroup file name: the two name-length limits
 * MAX_CGROUP_TYPE_NAMELEN and MAX_CFTYPE_NAME, plus 2.
 * NOTE(review): the extra 2 presumably covers a separator character and
 * the terminating NUL -- confirm against the code that builds the names.
 */
#define CGROUP_FILE_NAME_MAX                (MAX_CGROUP_TYPE_NAMELEN +        \
                                         MAX_CFTYPE_NAME + 2)
/*
 * let's not notify more than 100 times per second
 *
 * The value is HZ / 100 rounded up, so it is never 0 even when HZ < 100.
 */
#define CGROUP_FILE_NOTIFY_MIN_INTV        DIV_ROUND_UP(HZ, 100)

/*
 * True only when at least one cgroup subsystem is configured
 * (CGROUP_SUBSYS_COUNT > 0).
 *
 * To avoid confusing the compiler (and generating warnings) with code
 * that attempts to access what would be a 0-element array (i.e. sized
 * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
 * constant expression can be added.
 */
#define CGROUP_HAS_SUBSYS_CONFIG        (CGROUP_SUBSYS_COUNT > 0)

/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
 * cgroup.h can use them for lockdep annotations.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

/* Exported only when CONFIG_PROVE_RCU is set; see the comment above. */
#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif

/*
 * A path buffer of TRACE_CGROUP_PATH_LEN bytes and the spinlock defined
 * alongside it.  Neither is static, so both are visible outside this file.
 * NOTE(review): presumably the lock serializes use of the shared buffer by
 * the cgroup tracepoints -- confirm against the users of these symbols.
 */
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
/*
 * File-local debug flag; zero-initialized, so false until something sets
 * it.  Its writer and readers are outside this part of the file.
 */
static bool cgroup_debug __read_mostly;

/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

/*
 * Per-cpu reader/writer semaphore with external linkage (not static).
 * NOTE(review): the name suggests it guards thread-group changes against
 * cgroup operations -- its users are outside this part of the file, so
 * confirm the exact rule there.
 */
DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);

/*
 * Lockdep assertion for code that may run under either protection scheme:
 * passes RCU_LOCKDEP_WARN() the condition "neither inside an RCU read-side
 * section nor holding cgroup_mutex", together with the message
 * "cgroup_mutex or RCU read lock required".
 *
 * The definition deliberately carries no trailing semicolon: the caller
 * supplies it, as in
 *
 *        cgroup_assert_mutex_or_rcu_locked();
 *
 * so the invocation stays a single statement and remains usable as the
 * body of an unbraced if/else.
 */
#define cgroup_assert_mutex_or_rcu_locked()                                \
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                        \
                           !lockdep_is_held(&cgroup_mutex),                \
                           "cgroup_mutex or RCU read lock required")

/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 *
 * Only the pointer is defined here (NULL until assigned); the workqueue
 * itself is created elsewhere in the file.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/*
 * The tables below are all generated the same way: SUBSYS() is defined,
 * <linux/cgroup_subsys.h> is included so that every SUBSYS(<name>) entry
 * it contains expands with that definition, and SUBSYS() is then
 * #undef'd so the next table can redefine it.  The define / include /
 * undef order must therefore be kept as is.
 */

/* generate an array of cgroup subsystem pointers, indexed by <name>_cgrp_id */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* array of cgroup subsystem names (the stringified <name>), same indexing */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl():
 * for every subsystem, define <name>_cgrp_subsys_enabled_key and
 * <name>_cgrp_subsys_on_dfl_key with DEFINE_STATIC_KEY_TRUE() and export
 * both to GPL modules.
 */
#define SUBSYS(_x)                                                                \
        DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);                        \
        DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);                        \
        EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);                        \
        EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/* pointers to the per-subsystem "enabled" keys, indexed by <name>_cgrp_id */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* pointers to the per-subsystem "on default hierarchy" keys, same indexing */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/* per-CPU rstat storage wired into cgrp_dfl_root.cgrp.rstat_cpu just below */
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);

/* the default hierarchy */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);

/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
static bool cgrp_dfl_visible;

/*
 * The three masks below carry one bit per subsystem ID (tested elsewhere
 * in this file as 1 << ssid).
 */

/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;

/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;

/* some controllers can be threaded on the default hierarchy */
static u16 cgrp_dfl_threaded_ss_mask;

/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
/* NOTE(review): presumably the number of entries on cgroup_roots - confirm */
static int cgroup_root_count;

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Assign a monotonically increasing serial number to csses.  It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in the ascending serial
 * number order on the list.  Protected by cgroup_mutex.
 *
 * The counter starts at 1.
 */
static u64 css_serial_nr_next = 1;

/*
 * These bitmasks identify subsystems with specific features to avoid
 * having to do iterative checks repeatedly.
 *
 * NOTE(review): presumably one bit per subsystem ID, set when the
 * subsystem provides the corresponding callback - the code filling them in
 * is outside this section.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;

/*
 * Initial value follows CONFIG_CGROUP_FAVOR_DYNMODS; the variable is
 * marked __ro_after_init.
 */
static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);

/*
 * cgroup namespace for init task
 *
 * Statically initialized: owned by init_user_ns, rooted at init_css_set
 * and using the fixed inode number PROC_CGROUP_INIT_INO.  The reference
 * count starts at 2.
 */
struct cgroup_namespace init_cgroup_ns = {
        .ns.count        = REFCOUNT_INIT(2),
        .user_ns        = &init_user_ns,
        .ns.ops                = &cgroupns_operations,
        .ns.inum        = PROC_CGROUP_INIT_INO,
        .root_cset        = &init_css_set,
};

/* forward declarations; the definitions are not part of this section */
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
static struct cftype cgroup_psi_files[];

/*
 * cgroup optional features
 *
 * OPT_FEATURE_COUNT is the number of features compiled in and sizes
 * cgroup_opt_feature_names[].  OPT_FEATURE_PRESSURE only exists when
 * CONFIG_PSI is set.
 */
enum cgroup_opt_features {
#ifdef CONFIG_PSI
        OPT_FEATURE_PRESSURE,
#endif
        OPT_FEATURE_COUNT
};

/* feature names, listed in the same order as enum cgroup_opt_features */
static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
#ifdef CONFIG_PSI
        "pressure",
#endif
};

/* NOTE(review): presumably one bit per cgroup_opt_features value - confirm at the users */
static u16 cgroup_feature_disable_mask __read_mostly;

/*
 * Prototypes of static helpers that are referenced before their
 * definitions (e.g. css_task_iter_skip() is called from
 * css_set_skip_task_iters() below).
 */
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
                               struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
                                              struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
                              struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add);

/*
 * With CONFIG_DEBUG_CGROUP_REF, <linux/cgroup_refcnt.h> is included here
 * with CGROUP_REF_FN_ATTRS set to noinline and CGROUP_REF_EXPORT() mapped
 * to EXPORT_SYMBOL_GPL().
 * NOTE(review): presumably this emits the reference counting helpers as
 * out-of-line, exported functions - confirm in the header.
 */
#ifdef CONFIG_DEBUG_CGROUP_REF
#define CGROUP_REF_FN_ATTRS        noinline
#define CGROUP_REF_EXPORT(fn)        EXPORT_SYMBOL_GPL(fn);
#include <linux/cgroup_refcnt.h>
#endif

/**
 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 * @ssid: subsys ID of interest
 *
 * cgroup_subsys_enabled() can only be used with literal subsys names which
 * is fine for individual subsystems but unsuitable for cgroup core.  This
 * is a slower, static_key_enabled() based test indexed by @ssid.
 */
bool cgroup_ssid_enabled(int ssid)
{
        /*
         * Without any configured subsystem the key table is empty, so the
         * compile-time guard has to short-circuit the lookup.
         */
        return CGROUP_HAS_SUBSYS_CONFIG &&
               static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}

/**
 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 * @cgrp: the cgroup of interest
 *
 * The default hierarchy is the v2 interface of cgroup and this function
 * can be used to test whether a cgroup is on the default hierarchy for
 * cases where a subsystem should behave differently depending on the
 * interface version.
 *
 * List of changed behaviors:
 *
 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 *   and "name" are disallowed.
 *
 * - When mounting an existing superblock, mount options should match.
 *
 * - rename(2) is disallowed.
 *
 * - "tasks" is removed.  Everything should be at process granularity.  Use
 *   "cgroup.procs" instead.
 *
 * - "cgroup.procs" is not sorted.  pids will be unique unless they got
 *   recycled in-between reads.
 *
 * - "release_agent" and "notify_on_release" are removed.  Replacement
 *   notification mechanism will be implemented.
 *
 * - "cgroup.clone_children" is removed.
 *
 * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 *   and its descendants contain no task; otherwise, 1.  The file also
 *   generates kernfs notification which can be monitored through poll and
 *   [di]notify when the value of the file changes.
 *
 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 *   take masks of ancestors with non-empty cpus/mems, instead of being
 *   moved to an ancestor.
 *
 * - cpuset: a task can be moved into an empty cpuset, and again it takes
 *   masks of ancestors.
 *
 * - blkcg: blk-throttle becomes properly hierarchical.
 */
bool cgroup_on_dfl(const struct cgroup *cgrp)
{
        return cgrp->root == &cgrp_dfl_root;
}

/* IDR wrappers which synchronize using cgroup_idr_lock */

/*
 * Allocate an ID for @ptr in [@start, @end) from @idr.  Preloading uses the
 * caller's @gfp_mask; the allocation performed under cgroup_idr_lock has
 * __GFP_DIRECT_RECLAIM masked off.  Returns whatever idr_alloc() returns.
 */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
                            gfp_t gfp_mask)
{
        const gfp_t locked_gfp = gfp_mask & ~__GFP_DIRECT_RECLAIM;
        int id;

        idr_preload(gfp_mask);

        spin_lock_bh(&cgroup_idr_lock);
        id = idr_alloc(idr, ptr, start, end, locked_gfp);
        spin_unlock_bh(&cgroup_idr_lock);

        idr_preload_end();

        return id;
}

/*
 * Replace the pointer stored under @id in @idr with @ptr while holding
 * cgroup_idr_lock.  Returns idr_replace()'s result.
 */
static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
        void *prev;

        spin_lock_bh(&cgroup_idr_lock);
        prev = idr_replace(idr, ptr, id);
        spin_unlock_bh(&cgroup_idr_lock);

        return prev;
}

/* Remove @id from @idr while holding cgroup_idr_lock (taken with spin_lock_bh()). */
static void cgroup_idr_remove(struct idr *idr, int id)
{
        spin_lock_bh(&cgroup_idr_lock);
        idr_remove(idr, id);
        spin_unlock_bh(&cgroup_idr_lock);
}

/* true when at least one populated css_set is accounted to @cgrp itself */
static bool cgroup_has_tasks(struct cgroup *cgrp)
{
        return cgrp->nr_populated_csets != 0;
}

/* @cgrp is threaded when its ->dom_cgrp points at some other cgroup */
static bool cgroup_is_threaded(struct cgroup *cgrp)
{
        return cgrp != cgrp->dom_cgrp;
}

/* can @cgrp host both domain and threaded children? */
static bool cgroup_is_mixable(struct cgroup *cgrp)
{
        /*
         * Root isn't under domain level resource control exempting it from
         * the no-internal-process constraint, so it can serve as a thread
         * root and a parent of resource domains at the same time.
         */
        return !cgroup_parent(cgrp);
}

/* can @cgrp become a thread root? Should always be true for a thread root */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
        /* mixables don't care */
        if (cgroup_is_mixable(cgrp))
                return true;

        /*
         * Everything else must satisfy all of:
         *  - not threaded itself (domain roots can't nest under threaded)
         *  - no populated domain children (either domain or threaded
         *    children, never both)
         *  - no domain controllers enabled in subtree_control
         */
        return !cgroup_is_threaded(cgrp) &&
               !cgrp->nr_populated_domain_children &&
               !(cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask);
}

/* is @cgrp root of a threaded subtree? */
static bool cgroup_is_thread_root(struct cgroup *cgrp)
{
        bool threaded_ctrl_enabled;

        /* thread root should be a domain */
        if (cgroup_is_threaded(cgrp))
                return false;

        /* a domain w/ threaded children is a thread root */
        if (cgrp->nr_threaded_children)
                return true;

        /*
         * Otherwise it takes both tasks in the domain and at least one
         * explicitly enabled threaded controller.
         */
        if (!cgroup_has_tasks(cgrp))
                return false;

        threaded_ctrl_enabled =
                (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask) != 0;
        return threaded_ctrl_enabled;
}

/* a domain which isn't connected to the root w/o breakage can't be used */
static bool cgroup_is_valid_domain(struct cgroup *cgrp)
{
        struct cgroup *ancestor;

        /* @cgrp may be a thread root, but it must not be threaded itself */
        if (cgroup_is_threaded(cgrp))
                return false;

        /*
         * Walk up to the root.  An ancestor breaks the chain if it is a
         * non-mixable thread root or if it is threaded.
         */
        for (ancestor = cgroup_parent(cgrp); ancestor;
             ancestor = cgroup_parent(ancestor)) {
                if ((!cgroup_is_mixable(ancestor) &&
                     cgroup_is_thread_root(ancestor)) ||
                    cgroup_is_threaded(ancestor))
                        return false;
        }

        return true;
}

/* subsystems visibly enabled on a cgroup */
static u16 cgroup_control(struct cgroup *cgrp)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        u16 mask;

        if (!parent) {
                /*
                 * Root: start from the hierarchy's subsystems; on the
                 * default hierarchy hide inhibited and implicit ones.
                 */
                mask = cgrp->root->subsys_mask;
                if (cgroup_on_dfl(cgrp))
                        mask &= ~(cgrp_dfl_inhibit_ss_mask |
                                  cgrp_dfl_implicit_ss_mask);
                return mask;
        }

        /* non-root: whatever the parent hands down via subtree_control */
        mask = parent->subtree_control;

        /* threaded cgroups can only have threaded controllers */
        if (cgroup_is_threaded(cgrp))
                mask &= cgrp_dfl_threaded_ss_mask;

        return mask;
}

/* subsystems enabled on a cgroup */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        u16 mask;

        /* a root has everything attached to its hierarchy */
        if (!parent)
                return cgrp->root->subsys_mask;

        mask = parent->subtree_ss_mask;

        /* threaded cgroups can only have threaded controllers */
        if (cgroup_is_threaded(cgrp))
                mask &= cgrp_dfl_threaded_ss_mask;

        return mask;
}

/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @ss enabled.
 */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
                                              struct cgroup_subsys *ss)
{
        /* no subsystem requested (or none configured): the cgroup's own css */
        if (!CGROUP_HAS_SUBSYS_CONFIG || !ss)
                return &cgrp->self;

        return rcu_dereference_check(cgrp->subsys[ss->id],
                                     lockdep_is_held(&cgroup_mutex));
}

/**
 * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Similar to cgroup_css() but returns the effective css, which is defined
 * as the matching css of the nearest ancestor including self which has @ss
 * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 * function is guaranteed to return non-NULL css.
 */
static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
                                                        struct cgroup_subsys *ss)
{
        struct cgroup *pos = cgrp;

        lockdep_assert_held(&cgroup_mutex);

        if (!ss)
                return &cgrp->self;

        /*
         * This function is used while updating css associations and thus
         * can't test the csses directly.  Climb towards the root until a
         * cgroup whose ss_mask has @ss set is found.
         */
        for (;;) {
                if (cgroup_ss_mask(pos) & (1 << ss->id))
                        return cgroup_css(pos, ss);

                pos = cgroup_parent(pos);
                if (!pos)
                        return NULL;
        }
}

/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find the effective css of @cgrp for @ss without taking a reference on
 * it.  The effective css is defined as the matching css of the nearest
 * ancestor including self which has @ss enabled.  If @ss is not mounted on
 * the hierarchy @cgrp is on, the root css is returned, so this function
 * always returns a valid css (except that %NULL is returned when no
 * subsystems are configured).
 *
 * The returned css is not guaranteed to be online, and therefore it is the
 * caller's responsibility to try to get a reference for it.
 */
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
                                         struct cgroup_subsys *ss)
{
        struct cgroup *pos;

        if (!CGROUP_HAS_SUBSYS_CONFIG)
                return NULL;

        /* nearest cgroup, starting at @cgrp itself, that has a css for @ss */
        pos = cgrp;
        for (;;) {
                struct cgroup_subsys_state *found = cgroup_css(pos, ss);

                if (found)
                        return found;

                pos = cgroup_parent(pos);
                if (!pos)
                        break;
        }

        /* nothing on the way up: use the css recorded in init_css_set */
        return init_css_set.subsys[ss->id];
}

/**
 * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Find and get the effective css of @cgrp for @ss.  The effective css is
 * defined as the matching css of the nearest ancestor including self which
 * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 * the root css is returned, so this function always returns a valid css.
 * The returned css must be put using css_put().
 */
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
                                             struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *css;

        if (!CGROUP_HAS_SUBSYS_CONFIG)
                return NULL;

        rcu_read_lock();

        for (;;) {
                /* first css on the way up that is online and can be pinned */
                css = cgroup_css(cgrp, ss);
                if (css && css_tryget_online(css))
                        break;

                cgrp = cgroup_parent(cgrp);
                if (!cgrp) {
                        /* ran past the root: pin init_css_set's css instead */
                        css = init_css_set.subsys[ss->id];
                        css_get(css);
                        break;
                }
        }

        rcu_read_unlock();
        return css;
}
EXPORT_SYMBOL_GPL(cgroup_get_e_css);

/*
 * Take a reference on @cgrp via cgroup_get(), warning once if @cgrp is
 * already dead.  The reference is taken whether or not the warning fires.
 */
static void cgroup_get_live(struct cgroup *cgrp)
{
        WARN_ON_ONCE(cgroup_is_dead(cgrp));
        cgroup_get(cgrp);
}

/**
 * __cgroup_task_count - count the number of tasks in a cgroup. The caller
 * is responsible for taking the css_set_lock.
 * @cgrp: the cgroup in question
 */
int __cgroup_task_count(const struct cgroup *cgrp)
{
        int count = 0;
        struct cgrp_cset_link *link;

        lockdep_assert_held(&css_set_lock);

        list_for_each_entry(link, &cgrp->cset_links, cset_link)
                count += link->cset->nr_tasks;

        return count;
}

/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 *
 * Locked variant of __cgroup_task_count(): css_set_lock is taken with
 * spin_lock_irq() around the count.
 */
int cgroup_task_count(const struct cgroup *cgrp)
{
        int nr_tasks;

        spin_lock_irq(&css_set_lock);
        nr_tasks = __cgroup_task_count(cgrp);
        spin_unlock_irq(&css_set_lock);

        return nr_tasks;
}

/*
 * of_css - css associated with an open cgroup file
 * @of: the kernfs open file
 *
 * The owning cgroup is taken from the ->priv of the file's parent kernfs
 * node and the cftype from of_cft().  Files without a subsystem resolve to
 * the cgroup's own css (->self).
 */
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
        struct cgroup *cgrp = of->kn->parent->priv;
        struct cftype *cft = of_cft(of);

        if (!CGROUP_HAS_SUBSYS_CONFIG || !cft->ss)
                return &cgrp->self;

        /*
         * This is open and unprotected implementation of cgroup_css().
         * of_css() is only called from a kernfs file operation which has
         * an active reference on the file.  Because all the subsystem
         * files are drained before a css is disassociated with a cgroup,
         * the matching css from the cgroup's subsys table is guaranteed to
         * be and stay valid until the enclosing operation is complete.
         */
        return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
}
EXPORT_SYMBOL_GPL(of_css);

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex.
 *
 * Slots of @cgrp->subsys[] that are NULL are skipped: the macro ends in a
 * dangling "else", so the caller's statement only runs for non-NULL csses.
 */
#define for_each_css(css, ssid, cgrp)                                        \
        for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
                if (!((css) = rcu_dereference_check(                        \
                                (cgrp)->subsys[(ssid)],                        \
                                lockdep_is_held(&cgroup_mutex)))) { }        \
                else

/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.
 *
 * @ss_mask is evaluated once into a local unsigned long.  When no
 * subsystem is configured (!CGROUP_HAS_SUBSYS_CONFIG), @ssid is set to 0
 * and the block never runs.  Every use must be closed with
 * while_each_subsys_mask(), which supplies the closing braces and the
 * terminating "while (false)" of the do-while opened here.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {                        \
        unsigned long __ss_mask = (ss_mask);                                \
        if (!CGROUP_HAS_SUBSYS_CONFIG) {                                \
                (ssid) = 0;                                                \
                break;                                                        \
        }                                                                \
        for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {        \
                (ss) = cgroup_subsys[ssid];                                \
                {

/* closes a block opened by do_each_subsys_mask() */
#define while_each_subsys_mask()                                        \
                }                                                        \
        }                                                                \
} while (false)

/*
 * iterate over child cgrps, lock should be held throughout iteration
 *
 * Walks @cgrp->self.children, asserts on every step that cgroup_mutex is
 * held and skips children for which cgroup_is_dead() is true.  The macro
 * ends in a dangling "else" that takes the caller's statement.
 */
#define cgroup_for_each_live_child(child, cgrp)                                \
        list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
                if (({ lockdep_assert_held(&cgroup_mutex);                \
                       cgroup_is_dead(child); }))                        \
                        ;                                                \
                else

/*
 * walk live descendants in pre order
 *
 * @d_css is the css cursor handed to css_for_each_descendant_pre(), which
 * starts from @cgrp's self css; @dsct receives @d_css->cgroup on every
 * step.  cgroup_mutex is asserted to be held and dead cgroups are skipped.
 */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)                \
        css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))        \
                if (({ lockdep_assert_held(&cgroup_mutex);                \
                       (dsct) = (d_css)->cgroup;                        \
                       cgroup_is_dead(dsct); }))                        \
                        ;                                                \
                else

/*
 * walk live descendants in postorder
 *
 * Same contract as cgroup_for_each_live_descendant_pre() but built on
 * css_for_each_descendant_post(): @dsct receives @d_css->cgroup,
 * cgroup_mutex is asserted to be held and dead cgroups are skipped.
 */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)                \
        css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))        \
                if (({ lockdep_assert_held(&cgroup_mutex);                \
                       (dsct) = (d_css)->cgroup;                        \
                       cgroup_is_dead(dsct); }))                        \
                        ;                                                \
                else

/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 *
 * ->dom_cset points back at the set itself, so css_set_threaded() is
 * false for it, and every list head starts out empty.
 */
struct css_set init_css_set = {
        .refcount                = REFCOUNT_INIT(1),
        .dom_cset                = &init_css_set,
        .tasks                        = LIST_HEAD_INIT(init_css_set.tasks),
        .mg_tasks                = LIST_HEAD_INIT(init_css_set.mg_tasks),
        .dying_tasks                = LIST_HEAD_INIT(init_css_set.dying_tasks),
        .task_iters                = LIST_HEAD_INIT(init_css_set.task_iters),
        .threaded_csets                = LIST_HEAD_INIT(init_css_set.threaded_csets),
        .cgrp_links                = LIST_HEAD_INIT(init_css_set.cgrp_links),
        .mg_src_preload_node        = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
        .mg_dst_preload_node        = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
        .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),

        /*
         * The following field is re-initialized when this cset gets linked
         * in cgroup_init().  However, let's initialize the field
         * statically too so that the default cgroup can be accessed safely
         * early during boot.
         */
        .dfl_cgrp                = &cgrp_dfl_root.cgrp,
};

/* number of css_sets in existence; decremented in put_css_set_locked() */
static int css_set_count        = 1;        /* 1 for init_css_set */

/* @cset is threaded when its ->dom_cset points at some other css_set */
static bool css_set_threaded(struct css_set *cset)
{
        return cset != cset->dom_cset;
}

/**
 * css_set_populated - does a css_set contain any tasks?
 * @cset: target css_set
 *
 * css_set_populated() should be the same as !!cset->nr_tasks at steady
 * state. However, css_set_populated() can be called while a task is being
 * added to or removed from the linked list before the nr_tasks is
 * properly updated. Hence, we can't just look at ->nr_tasks here.
 */
static bool css_set_populated(struct css_set *cset)
{
        lockdep_assert_held(&css_set_lock);

        /* populated unless both the ->tasks and ->mg_tasks lists are empty */
        return !(list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
}

/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
 * count is propagated towards root so that a given cgroup's
 * nr_populated_domain_children and nr_populated_threaded_children are both
 * zero iff none of its descendants contain any tasks.
 *
 * Whether @cgrp counts as populated is decided by cgroup_is_populated().
 * Whenever that state flips, cgroup1_check_for_release() is invoked and
 * userland is notified through @cgrp->events_file that the content of the
 * interface file has changed.  This can be used to detect when @cgrp and
 * its descendants become populated or empty.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
        const int delta = populated ? 1 : -1;
        struct cgroup *from = NULL;        /* child the walk came up from */

        lockdep_assert_held(&css_set_lock);

        do {
                const bool before = cgroup_is_populated(cgrp);

                /*
                 * First iteration: adjust the cgroup's own cset count.
                 * Afterwards: adjust the ancestor's counter that matches
                 * the kind of child whose state just flipped.
                 */
                if (from == NULL)
                        cgrp->nr_populated_csets += delta;
                else if (cgroup_is_threaded(from))
                        cgrp->nr_populated_threaded_children += delta;
                else
                        cgrp->nr_populated_domain_children += delta;

                /* populated state unchanged here: nothing more to propagate */
                if (cgroup_is_populated(cgrp) == before)
                        break;

                cgroup1_check_for_release(cgrp);
                TRACE_CGROUP_PATH(notify_populated, cgrp,
                                  cgroup_is_populated(cgrp));
                cgroup_file_notify(&cgrp->events_file);

                from = cgrp;
                cgrp = cgroup_parent(cgrp);
        } while (cgrp);
}

/**
 * css_set_update_populated - update populated state of a css_set
 * @cset: target css_set
 * @populated: whether @cset is populated or depopulated
 *
 * @cset is either getting the first task or losing the last.  Update the
 * populated counters of all associated cgroups accordingly.
 */
static void css_set_update_populated(struct css_set *cset, bool populated)
{
        struct cgrp_cset_link *pos;

        lockdep_assert_held(&css_set_lock);

        /* forward the change to every cgroup this css_set is linked to */
        list_for_each_entry(pos, &cset->cgrp_links, cgrp_link) {
                cgroup_update_populated(pos->cgrp, populated);
        }
}

/*
 * @task is leaving, advance task iterators which are pointing to it so
 * that they can resume at the next position.  Advancing an iterator might
 * remove it from the list, use safe walk.  See css_task_iter_skip() for
 * details.
 */
static void css_set_skip_task_iters(struct css_set *cset,
                                    struct task_struct *task)
{
        struct css_task_iter *it, *pos;

        list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
                css_task_iter_skip(it, task);
}

/**
 * css_set_move_task - move a task from one css_set to another
 * @task: task being moved
 * @from_cset: css_set @task currently belongs to (may be NULL)
 * @to_cset: new css_set @task is being moved to (may be NULL)
 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 *
 * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 * css_set, @from_cset can be NULL.  If @task is being disassociated
 * instead of moved, @to_cset can be NULL.
 *
 * This function automatically handles populated counter updates and
 * css_task_iter adjustments but the caller is responsible for managing
 * @from_cset and @to_cset's reference counts.
 */
static void css_set_move_task(struct task_struct *task,
                              struct css_set *from_cset, struct css_set *to_cset,
                              bool use_mg_tasks)
{
        struct list_head *dst_list;

        lockdep_assert_held(&css_set_lock);

        /* destination is about to receive its first task */
        if (to_cset && !css_set_populated(to_cset))
                css_set_update_populated(to_cset, true);

        if (!from_cset) {
                /* a task without a css_set must not sit on any list */
                WARN_ON_ONCE(!list_empty(&task->cg_list));
        } else {
                WARN_ON_ONCE(list_empty(&task->cg_list));

                /* move iterators off @task before unlinking it */
                css_set_skip_task_iters(from_cset, task);
                list_del_init(&task->cg_list);
                if (!css_set_populated(from_cset))
                        css_set_update_populated(from_cset, false);
        }

        /* pure disassociation: nothing to link into */
        if (!to_cset)
                return;

        /*
         * We are synchronized through cgroup_threadgroup_rwsem
         * against PF_EXITING setting such that we can't race
         * against cgroup_exit()/cgroup_free() dropping the css_set.
         */
        WARN_ON_ONCE(task->flags & PF_EXITING);

        cgroup_move_task(task, to_cset);

        if (use_mg_tasks)
                dst_list = &to_cset->mg_tasks;
        else
                dst_list = &to_cset->tasks;
        list_add_tail(&task->cg_list, dst_list);
}

/*
 * hash table for cgroup groups. This improves the performance to find
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 *
 * Keys are produced by css_set_hash(); find_existing_css_set() does the
 * lookup and put_css_set_locked() removes entries.
 */
#define CSS_SET_HASH_BITS        7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state **css)
{
        unsigned long key = 0UL;
        struct cgroup_subsys *ss;
        int i;

        for_each_subsys(ss, i)
                key += (unsigned long)css[i];
        key = (key >> 16) ^ key;

        return key;
}

/*
 * put_css_set_locked - drop a reference on a css_set
 * @cset: the css_set to put
 *
 * Must be called with css_set_lock held.  Nothing else happens unless this
 * was the last reference.  On the final put, @cset is unlinked from the
 * per-subsystem e_cset lists and from css_set_table, the css references it
 * held are dropped, its cgroup links are freed and @cset itself is freed
 * through kfree_rcu().  For a threaded css_set, the reference on its
 * ->dom_cset is dropped as well, recursively through this function.
 */
void put_css_set_locked(struct css_set *cset)
{
        struct cgrp_cset_link *link, *tmp_link;
        struct cgroup_subsys *ss;
        int ssid;

        lockdep_assert_held(&css_set_lock);

        if (!refcount_dec_and_test(&cset->refcount))
                return;

        /* a dying css_set is not expected to still have threaded csets */
        WARN_ON_ONCE(!list_empty(&cset->threaded_csets));

        /* This css_set is dead. Unlink it and release cgroup and css refs */
        for_each_subsys(ss, ssid) {
                list_del(&cset->e_cset_node[ssid]);
                css_put(cset->subsys[ssid]);
        }
        hash_del(&cset->hlist);
        css_set_count--;

        list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
                list_del(&link->cset_link);
                list_del(&link->cgrp_link);
                /* the cgroup ref is only put for cgroups that have a parent */
                if (cgroup_parent(link->cgrp))
                        cgroup_put(link->cgrp);
                kfree(link);
        }

        if (css_set_threaded(cset)) {
                /* leave the domain cset's list and drop the ref held on it */
                list_del(&cset->threaded_csets_node);
                put_css_set_locked(cset->dom_cset);
        }

        kfree_rcu(cset, rcu_head);
}

/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
static bool compare_css_sets(struct css_set *cset,
                             struct css_set *old_cset,
                             struct cgroup *new_cgrp,
                             struct cgroup_subsys_state *template[])
{
        struct list_head *const cand_head = &cset->cgrp_links;
        struct list_head *const old_head = &old_cset->cgrp_links;
        struct list_head *cand_pos, *old_pos;
        struct cgroup *dfl_cgrp;

        /*
         * On the default hierarchy, there can be csets which are
         * associated with the same set of cgroups but different csses.
         * Let's first ensure that csses match.
         */
        if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
                return false;

        /* @cset's domain should match the default cgroup's */
        dfl_cgrp = cgroup_on_dfl(new_cgrp) ? new_cgrp : old_cset->dfl_cgrp;
        if (dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
                return false;

        /*
         * Compare cgroup pointers in order to distinguish between
         * different cgroups in hierarchies.  As different cgroups may
         * share the same effective css, this comparison is always
         * necessary.  Both link lists are walked in lockstep.
         */
        for (cand_pos = cand_head->next, old_pos = old_head->next; ;
             cand_pos = cand_pos->next, old_pos = old_pos->next) {
                const bool cand_done = cand_pos == cand_head;
                const bool old_done = old_pos == old_head;
                struct cgroup *cand_cgrp, *old_cgrp, *wanted;

                /* both lists are equal length, so they must end together */
                BUG_ON(cand_done != old_done);
                if (cand_done)
                        break;

                /* Locate the cgroups associated with these links. */
                cand_cgrp = list_entry(cand_pos, struct cgrp_cset_link,
                                       cgrp_link)->cgrp;
                old_cgrp = list_entry(old_pos, struct cgrp_cset_link,
                                      cgrp_link)->cgrp;

                /* Hierarchies should be linked in the same order. */
                BUG_ON(cand_cgrp->root != old_cgrp->root);

                /*
                 * In the hierarchy of the cgroup that's changing, the
                 * candidate has to point to the new cgroup; in any other
                 * hierarchy it has to point to the same cgroup as the old
                 * css_set.
                 */
                wanted = cand_cgrp->root == new_cgrp->root ? new_cgrp : old_cgrp;
                if (cand_cgrp != wanted)
                        return false;
        }

        return true;
}

/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
 */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
                                        struct cgroup *cgrp,
                                        struct cgroup_subsys_state **template)
{
        struct cgroup_root *root = cgrp->root;
        struct cgroup_subsys *ss;
        struct css_set *cset;
        unsigned long key;
        int i;

        /*
         * Build the set of subsystem state objects that we want to see in the
         * new css_set. While subsystems can change globally, the entries here
         * won't change, so no need for locking.
         */
        for_each_subsys(ss, i) {
                if (root->subsys_mask & (1UL << i)) {
                        /*
                         * @ss is in this hierarchy, so we want the
                         * effective css from @cgrp.
                         */
                        template[i] = cgroup_e_css_by_mask(cgrp, ss);
                } else {
                        /*
                         * @ss is not in this hierarchy, so we don't want
                         * to change the css.
                         */
                        template[i] = old_cset->subsys[i];
                }
        }

        key = css_set_hash(template);
        hash_for_each_possible(css_set_table, cset, hlist, key) {
                if (!compare_css_sets(cset, old_cset, cgrp, template))
                        continue;

                /* This css_set matches what we need */
                return cset;
        }

        /* No existing cgroup group matched */
        return NULL;
}

static void free_cgrp_cset_links(struct list_head *links_to_free)
{
        struct cgrp_cset_link *link, *tmp_link;

        list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
                list_del(&link->cset_link);
                kfree(link);
        }
}

/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * (Re)initializes @tmp_links, then allocates @count zeroed cgrp_cset_link
 * structures and chains them on it through ->cset_link.  Returns 0 on
 * success.  On allocation failure everything allocated so far is freed
 * and -ENOMEM is returned.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
        int remaining;

        INIT_LIST_HEAD(tmp_links);

        for (remaining = count; remaining > 0; remaining--) {
                struct cgrp_cset_link *link;

                link = kzalloc(sizeof(*link), GFP_KERNEL);
                if (link) {
                        list_add(&link->cset_link, tmp_links);
                        continue;
                }

                /* Out of memory: release the partially built list. */
                free_cgrp_cset_links(tmp_links);
                return -ENOMEM;
        }
        return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 *
 * Consumes the first link on @tmp_links (which must not be empty) and
 * uses it to tie @cset and @cgrp together.
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
                         struct cgroup *cgrp)
{
        struct cgrp_cset_link *new_link;

        BUG_ON(list_empty(tmp_links));

        /* Take one preallocated link and point it at both ends. */
        new_link = list_first_entry(tmp_links, struct cgrp_cset_link,
                                    cset_link);
        new_link->cgrp = cgrp;
        new_link->cset = cset;

        /* Remember the default-hierarchy cgroup directly in the css_set. */
        if (cgroup_on_dfl(cgrp))
                cset->dfl_cgrp = cgrp;

        /*
         * Links always go to the tail of both lists, which keeps the
         * lists in chronological order.
         */
        list_move_tail(&new_link->cset_link, &cgrp->cset_links);
        list_add_tail(&new_link->cgrp_link, &cset->cgrp_links);

        /* Only cgroups that have a parent get a reference taken here. */
        if (cgroup_parent(cgrp))
                cgroup_get_live(cgrp);
}

/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
 *
 * If a matching css_set already exists, a reference is taken on it and it
 * is returned.  Otherwise a new css_set is allocated, linked to one cgroup
 * per entry of @old_cset->cgrp_links, hashed, and returned with a refcount
 * of 1.  Returns %NULL if any allocation fails.
 *
 * Must be called with cgroup_mutex held.
 */
static struct css_set *find_css_set(struct css_set *old_cset,
                                    struct cgroup *cgrp)
{
        struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
        struct css_set *cset;
        struct list_head tmp_links;
        struct cgrp_cset_link *link;
        struct cgroup_subsys *ss;
        unsigned long key;
        int ssid;

        lockdep_assert_held(&cgroup_mutex);

        /*
         * First see if we already have a css_set that matches the desired
         * set.  The lookup also fills @template, which is reused below
         * if a new css_set has to be created.
         */
        spin_lock_irq(&css_set_lock);
        cset = find_existing_css_set(old_cset, cgrp, template);
        if (cset)
                get_css_set(cset);
        spin_unlock_irq(&css_set_lock);

        if (cset)
                return cset;

        /*
         * No match.  Do all GFP_KERNEL allocations here, before
         * css_set_lock is taken again.
         */
        cset = kzalloc(sizeof(*cset), GFP_KERNEL);
        if (!cset)
                return NULL;

        /* Allocate all the cgrp_cset_link objects that we'll need */
        if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
                kfree(cset);
                return NULL;
        }

        refcount_set(&cset->refcount, 1);
        /* Self-referential by default; replaced below for threaded csets. */
        cset->dom_cset = cset;
        INIT_LIST_HEAD(&cset->tasks);
        INIT_LIST_HEAD(&cset->mg_tasks);
        INIT_LIST_HEAD(&cset->dying_tasks);
        INIT_LIST_HEAD(&cset->task_iters);
        INIT_LIST_HEAD(&cset->threaded_csets);
        INIT_HLIST_NODE(&cset->hlist);
        INIT_LIST_HEAD(&cset->cgrp_links);
        INIT_LIST_HEAD(&cset->mg_src_preload_node);
        INIT_LIST_HEAD(&cset->mg_dst_preload_node);
        INIT_LIST_HEAD(&cset->mg_node);

        /*
         * Copy the set of subsystem state objects generated in
         * find_existing_css_set().
         */
        memcpy(cset->subsys, template, sizeof(cset->subsys));

        spin_lock_irq(&css_set_lock);
        /*
         * Add reference counts and links from the new css_set: mirror
         * every link of @old_cset, substituting @cgrp for the cgroup that
         * sits in @cgrp's hierarchy.
         */
        list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
                struct cgroup *c = link->cgrp;

                if (c->root == cgrp->root)
                        c = cgrp;
                link_css_set(&tmp_links, cset, c);
        }

        /* Every preallocated link must have been consumed. */
        BUG_ON(!list_empty(&tmp_links));

        css_set_count++;

        /* Add @cset to the hash table */
        key = css_set_hash(cset->subsys);
        hash_add(css_set_table, &cset->hlist, key);

        /*
         * Put @cset on the e_csets list of each css's cgroup and take a
         * reference on each css.
         */
        for_each_subsys(ss, ssid) {
                struct cgroup_subsys_state *css = cset->subsys[ssid];

                list_add_tail(&cset->e_cset_node[ssid],
                              &css->cgroup->e_csets[ssid]);
                css_get(css);
        }

        spin_unlock_irq(&css_set_lock);

        /*
         * If @cset should be threaded, look up the matching dom_cset and
         * link them up.  We first fully initialize @cset then look for the
         * dom_cset.  It's simpler this way and safe as @cset is guaranteed
         * to stay empty until we return.
         */
        if (cgroup_is_threaded(cset->dfl_cgrp)) {
                struct css_set *dcset;

                /* Recursive lookup, this time targeting the domain cgroup. */
                dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
                if (!dcset) {
                        /* Drop the only reference to the new css_set. */
                        put_css_set(cset);
                        return NULL;
                }

                spin_lock_irq(&css_set_lock);
                cset->dom_cset = dcset;
                list_add_tail(&cset->threaded_csets_node,
                              &dcset->threaded_csets);
                spin_unlock_irq(&css_set_lock);
        }

        return cset;
}

struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
        struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;

        return root_cgrp->root;
}

/*
 * Switch CGRP_ROOT_FAVOR_DYNMODS on @root on or off according to @favor.
 * Each real transition is paired with rcu_sync_enter()/rcu_sync_exit()
 * on cgroup_threadgroup_rwsem; a request for the current state does
 * nothing.
 */
void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
{
        bool enabled = root->flags & CGRP_ROOT_FAVOR_DYNMODS;

        /* Already in the requested state. */
        if (favor == enabled)
                return;

        /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
        if (favor) {
                rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
                root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
        } else {
                rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
                root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
        }
}

/*
 * Allocate a hierarchy ID for @root from cgroup_hierarchy_idr and store
 * it in @root->hierarchy_id.  Returns 0 on success or the negative value
 * returned by idr_alloc_cyclic().  Must be called with cgroup_mutex held.
 */
static int cgroup_init_root_id(struct cgroup_root *root)
{
        int ret;

        lockdep_assert_held(&cgroup_mutex);

        ret = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
        if (ret >= 0) {
                root->hierarchy_id = ret;
                ret = 0;
        }

        return ret;
}

/*
 * Remove @root->hierarchy_id from cgroup_hierarchy_idr.  Must be called
 * with cgroup_mutex held.
 */
static void cgroup_exit_root_id(struct cgroup_root *root)
{
        lockdep_assert_held(&cgroup_mutex);

        idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}

/* Free @root through kfree_rcu(), using its ->rcu member. */
void cgroup_free_root(struct cgroup_root *root)
{
        kfree_rcu(root, rcu);
}

/*
 * Tear down @root: rebind its subsystems to the default hierarchy, free
 * the css_set links of its root cgroup, take it off the root list,
 * release its hierarchy ID and finally destroy the kernfs root and free
 * the structure.  The root must have no remaining cgroups or children.
 */
static void cgroup_destroy_root(struct cgroup_root *root)
{
        struct cgroup *cgrp = &root->cgrp;
        struct cgrp_cset_link *link, *tmp_link;

        trace_cgroup_destroy_root(root);

        /* Locks cgroup_mutex; the matching cgroup_unlock() is below. */
        cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

        BUG_ON(atomic_read(&root->nr_cgrps));
        BUG_ON(!list_empty(&cgrp->self.children));

        /* Rebind all subsystems back to the default hierarchy */
        WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));

        /*
         * Release all the links from cset_links to this hierarchy's
         * root cgroup
         */
        spin_lock_irq(&css_set_lock);

        list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
                /* Unhook from both the cgroup's and the css_set's list. */
                list_del(&link->cset_link);
                list_del(&link->cgrp_link);
                kfree(link);
        }

        spin_unlock_irq(&css_set_lock);

        WARN_ON_ONCE(list_empty(&root->root_list));
        list_del_rcu(&root->root_list);
        cgroup_root_count--;

        /* Turn favordynmods off unless have_favordynmods is set. */
        if (!have_favordynmods)
                cgroup_favor_dynmods(root, false);

        cgroup_exit_root_id(root);

        cgroup_unlock();

        /* The remaining teardown runs without cgroup_mutex. */
        cgroup_rstat_exit(cgrp);
        kernfs_destroy_root(root->kf_root);
        cgroup_free_root(root);
}

/*
 * Look up the cgroup that @cset is associated with on hierarchy @root.
 *
 * Returned cgroup is without refcount but it's valid as long as cset pins it.
 *
 * The result can be %NULL.  If cgroup_mutex is not held, the
 * cgrp_cset_link will be freed before we remove the cgroup root from the
 * root_list.  Consequently, when accessing a cgroup root, the cset_link
 * may have already been freed, and no cgroup is found.  By holding
 * cgroup_mutex the caller ensures the result can't be %NULL; a caller
 * that does not hold cgroup_mutex must do the %NULL check.
 */
static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
                                            struct cgroup_root *root)
{
        struct cgrp_cset_link *link;

        /* init_css_set maps to the root cgroup of whichever hierarchy. */
        if (cset == &init_css_set)
                return &root->cgrp;

        /* The default hierarchy's cgroup is cached in the css_set. */
        if (root == &cgrp_dfl_root)
                return cset->dfl_cgrp;

        /* Otherwise the link list has to be walked under css_set_lock. */
        lockdep_assert_held(&css_set_lock);

        list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
                if (link->cgrp->root == root)
                        return link->cgrp;
        }

        return NULL;
}

/*
 * Look up the cgroup associated with the current task's cgroup namespace
 * on hierarchy @root.  Must be called with css_set_lock held.
 */
static struct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{
        struct cgroup *ns_cgrp;

        lockdep_assert_held(&css_set_lock);

        rcu_read_lock();
        ns_cgrp = __cset_cgroup_from_root(current->nsproxy->cgroup_ns->root_cset,
                                          root);
        rcu_read_unlock();

        /*
         * The namespace_sem is held by current, so the root cgroup can't
         * be umounted. Therefore, we can ensure that the result is non-NULL.
         */
        WARN_ON_ONCE(!ns_cgrp);

        return ns_cgrp;
}

/*
 * Look up cgroup associated with current task's cgroup namespace on the default
 * hierarchy.
 *
 * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
 * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
 *   pointers.
 * - css_set_lock is not needed because we just read cset->dfl_cgrp.
 * - As a bonus returned cgrp is pinned with the current because it cannot
 *   switch cgroup_ns asynchronously.
 */
static struct cgroup *current_cgns_cgroup_dfl(void)
{
        /*
         * NOTE: This function may be called from bpf_cgroup_from_id()
         * on a task which has already passed exit_task_namespaces() and
         * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
         * cgroups visible for lookups.
         */
        if (!current->nsproxy)
                return &cgrp_dfl_root.cgrp;

        return __cset_cgroup_from_root(current->nsproxy->cgroup_ns->root_cset,
                                       &cgrp_dfl_root);
}

/*
 * Look up the cgroup associated with the given css_set on the specified
 * hierarchy.  Same lookup as __cset_cgroup_from_root(), but always
 * asserts that css_set_lock is held.
 */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
                                            struct cgroup_root *root)
{
        lockdep_assert_held(&css_set_lock);

        return __cset_cgroup_from_root(cset, root);
}

/*
 * Return the cgroup for "task" from the given hierarchy. Must be
 * called with css_set_lock held to prevent task's groups from being modified.
 * Must be called with either cgroup_mutex or rcu read lock to prevent the
 * cgroup root from being destroyed.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                     struct cgroup_root *root)
{
        /*
         * The task itself needs no locking: css_set_lock is held, so it
         * can't change groups underneath us.
         */
        struct css_set *cset = task_css_set(task);

        return cset_cgroup_from_root(cset, root);
}

/*
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing.  However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again.  Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count).  So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero. Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty.  Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), therefore, root cgroup
 * always has either children cgroups and/or using tasks.  So we don't
 * need a special hack to ensure that root cgroup cannot be deleted.
 *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task().
 */

/*
 * Forward declaration so code below can refer to the kernfs syscall ops.
 * NOTE(review): the initialized definition is presumably further down in
 * this file - confirm.
 */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;

/*
 * Build the file name of @cft as it appears in @cgrp's directory and
 * store it in @buf, which must hold at least CGROUP_FILE_NAME_MAX bytes.
 *
 * The name is "<subsys>.<cft->name>" (with ".__DEBUG__." in front for
 * CFTYPE_DEBUG files) unless the cftype has no subsystem, carries
 * CFTYPE_NO_PREFIX, or the hierarchy has CGRP_ROOT_NOPREFIX set; in those
 * cases it is just cft->name.  Returns @buf.
 */
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
                              char *buf)
{
        struct cgroup_subsys *ss = cft->ss;
        const char *dbg_prefix;
        const char *ss_name;

        /* Unprefixed case: the plain cftype name is the file name. */
        if (!ss || (cft->flags & CFTYPE_NO_PREFIX) ||
            (cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
                strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
                return buf;
        }

        dbg_prefix = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
        /* The default hierarchy uses ->name, the others ->legacy_name. */
        ss_name = cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name;

        snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
                 dbg_prefix, ss_name, cft->name);
        return buf;
}

/**
 * cgroup_file_mode - deduce file mode of a control file
 * @cft: the control file in question
 *
 * S_IRUGO for read, S_IWUSR for write.
 */
static umode_t cgroup_file_mode(const struct cftype *cft)
{
        umode_t mode = 0;

        if (cft->read_u64 || cft->read_s64 || cft->seq_show)
                mode |= S_IRUGO;

        if (cft->write_u64 || cft->write_s64 || cft->write) {
                if (cft->flags & CFTYPE_WORLD_WRITABLE)
                        mode |= S_IWUGO;
                else
                        mode |= S_IWUSR;
        }

        return mode;
}

/**
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
 * @subtree_control: the new subtree_control mask to consider
 * @this_ss_mask: available subsystems
 *
 * On the default hierarchy, a subsystem may request other subsystems to be
 * enabled together through its ->depends_on mask.  In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
 * This function calculates which subsystems need to be enabled if
 * @subtree_control is to be applied while restricted to @this_ss_mask.
 * The dependency expansion is repeated until the mask stops changing.
 */
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
        u16 mask = subtree_control;
        struct cgroup_subsys *ss;
        int ssid;

        lockdep_assert_held(&cgroup_mutex);

        /* The implicit controllers are part of the starting set. */
        mask |= cgrp_dfl_implicit_ss_mask;

        for (;;) {
                u16 expanded = mask;

                /* Pull in what each currently selected subsystem needs. */
                do_each_subsys_mask(ss, ssid, mask) {
                        expanded |= ss->depends_on;
                } while_each_subsys_mask();

                /*
                 * Mask out subsystems which aren't available.  This can
                 * happen only if some depended-upon subsystems were bound
                 * to non-default hierarchies.
                 */
                expanded &= this_ss_mask;

                /* Fixed point reached: nothing was added or removed. */
                if (expanded == mask)
                        return mask;

                mask = expanded;
        }
}

/**
 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 *
 * This helper undoes cgroup_kn_lock_live() and should be invoked before
 * the method finishes if locking succeeded.  Note that once this function
 * returns the cgroup returned by cgroup_kn_lock_live() may become
 * inaccessible any time.  If the caller intends to continue to access the
 * cgroup, it should pin it before invoking this function.
 */
void cgroup_kn_unlock(struct kernfs_node *kn)
{
        /* A directory node is the cgroup itself; a file lives under it. */
        struct cgroup *cgrp = kernfs_type(kn) == KERNFS_DIR ?
                kn->priv : kn->parent->priv;

        cgroup_unlock();

        kernfs_unbreak_active_protection(kn);
        cgroup_put(cgrp);
}

/**
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 * @drain_offline: perform offline draining on the cgroup
 *
 * This helper is to be used by a cgroup kernfs method currently servicing
 * @kn.  It breaks the active protection, performs cgroup locking and
 * verifies that the associated cgroup is alive.  Returns the cgroup if
 * alive; otherwise, %NULL.  A successful return should be undone by a
 * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true, the
 * cgroup is drained of offlining csses before return.
 *
 * Any cgroup kernfs method implementation which requires locking the
 * associated cgroup should use this helper.  It avoids nesting cgroup
 * locking under kernfs active protection and allows all kernfs operations
 * including self-removal.
 */
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
        /* A directory node is the cgroup itself; a file lives under it. */
        struct cgroup *cgrp = kernfs_type(kn) == KERNFS_DIR ?
                kn->priv : kn->parent->priv;

        /*
         * We're gonna grab cgroup_mutex which nests outside kernfs
         * active_ref.  cgroup liveliness check alone provides enough
         * protection against removal.  Ensure @cgrp stays accessible and
         * break the active_ref protection.
         */
        if (!cgroup_tryget(cgrp))
                return NULL;
        kernfs_break_active_protection(kn);

        if (drain_offline)
                cgroup_lock_and_drain_offline(cgrp);
        else
                cgroup_lock();

        if (cgroup_is_dead(cgrp)) {
                /* Died while we were getting the lock: undo everything. */
                cgroup_kn_unlock(kn);
                return NULL;
        }

        return cgrp;
}

/*
 * Remove the kernfs file that @cft describes from @cgrp's directory.
 * When the cftype has a ->file_offset, the cgroup_file embedded in the
 * css is first detached from its kernfs node and its notify timer is
 * stopped.  Must be called with cgroup_mutex held.
 */
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
        char name[CGROUP_FILE_NAME_MAX];

        lockdep_assert_held(&cgroup_mutex);

        if (cft->file_offset) {
                struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
                struct cgroup_file *cfile;

                /* The cgroup_file sits ->file_offset bytes into the css. */
                cfile = (void *)css + cft->file_offset;

                spin_lock_irq(&cgroup_file_kn_lock);
                cfile->kn = NULL;
                spin_unlock_irq(&cgroup_file_kn_lock);

                del_timer_sync(&cfile->notify_timer);
        }

        /* cgroup_file_name() fills @name, which is then removed by name. */
        cgroup_file_name(cgrp, cft, name);
        kernfs_remove_by_name(cgrp->kn, name);
}

/**
 * css_clear_dir - remove subsys files in a cgroup directory
 * @css: target css
 *
 * Does nothing unless @css is marked CSS_VISIBLE.  Otherwise the flag is
 * cleared and the files previously created for @css are removed.
 */
static void css_clear_dir(struct cgroup_subsys_state *css)
{
        struct cgroup *cgrp = css->cgroup;
        struct cftype *cfts;

        if (!(css->flags & CSS_VISIBLE))
                return;

        css->flags &= ~CSS_VISIBLE;

        /* A subsystem css: drop every cftype set of the subsystem. */
        if (css->ss) {
                list_for_each_entry(cfts, &css->ss->cfts, node)
                        cgroup_addrm_files(css, cgrp, cfts, false);
                return;
        }

        /* No subsystem: these are the cgroup core files. */
        if (!cgroup_on_dfl(cgrp)) {
                cgroup_addrm_files(css, cgrp, cgroup1_base_files, false);
                return;
        }

        cgroup_addrm_files(css, cgrp, cgroup_base_files, false);
        if (cgroup_psi_enabled())
                cgroup_addrm_files(css, cgrp, cgroup_psi_files, false);
}

/**
 * css_populate_dir - create subsys files in a cgroup directory
 * @css: target css
 *
 * On failure, no file is added.
 */
static int css_populate_dir(struct cgroup_subsys_state *css)
{
        struct cgroup *cgrp = css->cgroup;
        struct cftype *cfts, *failed_cfts;
        int ret;

        if (css->flags & CSS_VISIBLE)
                return 0;

        if (!css->ss) {
                if (cgroup_on_dfl(cgrp)) {
                        ret = cgroup_addrm_files(css, cgrp,
                                                 cgroup_base_files, true);
                        if (ret < 0)
                                return ret;

                        if (cgroup_psi_enabled()) {
                                ret = cgroup_addrm_files(css, cgrp,
                                                         cgroup_psi_files, true);
                                if (ret < 0)
                                        return ret;
                        }
                } else {
                        ret = cgroup_addrm_files(css, cgrp,
                                                 cgroup1_base_files, true);
                        if (ret < 0)
                                return ret;
                }
        } else {
                list_for_each_entry(cfts, &css->ss->cfts, node) {
                        ret = cgroup_addrm_files(css, cgrp, cfts, true);
                        if (ret < 0) {
                                failed_cfts = cfts;
                                goto err;
                        }
                }
        }

        css->flags |= CSS_VISIBLE;

        return 0;
err:
        list_for_each_entry(cfts, &css->ss->cfts, node) {
                if (cfts == failed_cfts)
                        break;
                cgroup_addrm_files(css, cgrp, cfts, false);
        }
        return ret;
}

/*
 * rebind_subsystems - move the subsystems in @ss_mask to @dst_root
 * @dst_root: the hierarchy the subsystems are moved to
 * @ss_mask: bitmask of subsystem IDs to move
 *
 * All subsystems are validated first, so -EBUSY is returned before
 * anything is changed.  After that the function cannot fail: a failing
 * cgroup_apply_control() on the destination is only reported through
 * pr_warn() and 0 is returned.
 *
 * Must be called with cgroup_mutex held.
 */
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
        struct cgroup *dcgrp = &dst_root->cgrp;
        struct cgroup_subsys *ss;
        int ssid, ret;
        u16 dfl_disable_ss_mask = 0;

        lockdep_assert_held(&cgroup_mutex);

        /* Pass 1: validate every requested subsystem. */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                /*
                 * If @ss has non-root csses attached to it, can't move.
                 * If @ss is an implicit controller, it is exempt from this
                 * rule and can be stolen.
                 */
                if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
                    !ss->implicit_on_dfl)
                        return -EBUSY;

                /* can't move between two non-dummy roots either */
                if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
                        return -EBUSY;

                /*
                 * Collect ssid's that need to be disabled from default
                 * hierarchy.
                 */
                if (ss->root == &cgrp_dfl_root)
                        dfl_disable_ss_mask |= 1 << ssid;

        } while_each_subsys_mask();

        if (dfl_disable_ss_mask) {
                struct cgroup *scgrp = &cgrp_dfl_root.cgrp;

                /*
                 * Controllers from default hierarchy that need to be rebound
                 * are all disabled together in one go.
                 */
                cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
                WARN_ON(cgroup_apply_control(scgrp));
                cgroup_finalize_control(scgrp, 0);
        }

        /* Pass 2: actually move each subsystem to @dst_root. */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                struct cgroup_root *src_root = ss->root;
                struct cgroup *scgrp = &src_root->cgrp;
                struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
                struct css_set *cset, *cset_pos;
                struct css_task_iter *it;

                /* The source must have the css and the destination must not. */
                WARN_ON(!css || cgroup_css(dcgrp, ss));

                /* Default-hierarchy sources were disabled in bulk above. */
                if (src_root != &cgrp_dfl_root) {
                        /* disable from the source */
                        src_root->subsys_mask &= ~(1 << ssid);
                        WARN_ON(cgroup_apply_control(scgrp));
                        cgroup_finalize_control(scgrp, 0);
                }

                /* rebind */
                RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
                rcu_assign_pointer(dcgrp->subsys[ssid], css);
                ss->root = dst_root;
                css->cgroup = dcgrp;

                spin_lock_irq(&css_set_lock);
                WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
                list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
                                         e_cset_node[ss->id]) {
                        list_move_tail(&cset->e_cset_node[ss->id],
                                       &dcgrp->e_csets[ss->id]);
                        /*
                         * All css_sets of scgrp are moved to dcgrp together
                         * and in the same order, so in-flight iterators must
                         * be patched to keep iterating correctly.  An
                         * iterator is always advanced right away and finishes
                         * when it->cset_pos meets it->cset_head, so updating
                         * it->cset_head alone is enough here.
                         */
                        list_for_each_entry(it, &cset->task_iters, iters_node)
                                if (it->cset_head == &scgrp->e_csets[ss->id])
                                        it->cset_head = &dcgrp->e_csets[ss->id];
                }
                spin_unlock_irq(&css_set_lock);

                /*
                 * For subsystems with an rstat flush callback, move the css
                 * to the destination's rstat_css_list.  synchronize_rcu()
                 * runs between the RCU removal and the re-insertion.
                 */
                if (ss->css_rstat_flush) {
                        list_del_rcu(&css->rstat_css_node);
                        synchronize_rcu();
                        list_add_rcu(&css->rstat_css_node,
                                     &dcgrp->rstat_css_list);
                }

                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
                if (dst_root == &cgrp_dfl_root) {
                        static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
                } else {
                        dcgrp->subtree_control |= 1 << ssid;
                        static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
                }

                /* A failure here is only reported; the rebind stands. */
                ret = cgroup_apply_control(dcgrp);
                if (ret)
                        pr_warn("partial failure to rebind %s controller (err=%d)\n",
                                ss->name, ret);

                /* Let the subsystem react to its new binding, if it cares. */
                if (ss->bind)
                        ss->bind(css);
        } while_each_subsys_mask();

        kernfs_activate(dcgrp->kn);
        return 0;
}

/*
 * Emit the path of @kf_node into @sf, relative to the cgroup that the
 * current task's cgroup namespace is rooted at on @kf_root's hierarchy.
 *
 * Returns 0 once the escaped path has been written, -ENOMEM if the
 * scratch buffer cannot be allocated, -ERANGE when
 * kernfs_path_from_node() reports -E2BIG, and otherwise whatever
 * non-positive value kernfs_path_from_node() returned.
 */
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
                     struct kernfs_root *kf_root)
{
        struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
        struct cgroup *ns_cgroup;
        char *path;
        int ret;

        path = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!path)
                return -ENOMEM;

        /* The namespace root lookup requires css_set_lock. */
        spin_lock_irq(&css_set_lock);
        ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
        ret = kernfs_path_from_node(kf_node, ns_cgroup->kn, path, PATH_MAX);
        spin_unlock_irq(&css_set_lock);

        if (ret == -E2BIG) {
                ret = -ERANGE;
        } else if (ret > 0) {
                /* Escape whitespace and backslashes in the emitted path. */
                seq_escape(sf, path, " \t\n\\");
                ret = 0;
        }

        kfree(path);
        return ret;
}

/*
 * Identifiers for the cgroup2 mount options.  cgroup2_fs_parameters[]
 * maps each option string to one of these and cgroup2_parse_param()
 * switches on them.
 */
enum cgroup2_param {
        Opt_nsdelegate,
        Opt_favordynmods,
        Opt_memory_localevents,
        Opt_memory_recursiveprot,
        Opt_memory_hugetlb_accounting,
        nr__cgroup2_params      /* number of options; keep last */
};

/*
 * Mount option table handed to fs_parse() by cgroup2_parse_param().
 * Every option is a plain flag; the empty entry terminates the table.
 */
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
        fsparam_flag("nsdelegate",                Opt_nsdelegate),
        fsparam_flag("favordynmods",                Opt_favordynmods),
        fsparam_flag("memory_localevents",        Opt_memory_localevents),
        fsparam_flag("memory_recursiveprot",        Opt_memory_recursiveprot),
        fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
        {}
};

/*
 * Parse one cgroup2 mount parameter and record the matching
 * CGRP_ROOT_* flag in the cgroup fs context of @fc.
 *
 * Returns 0 when the option was recognised, the negative value from
 * fs_parse() when parsing failed, and -EINVAL for an option ID this
 * function does not handle.
 */
static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
        struct fs_parse_result result;
        int opt;

        opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        switch (opt) {
        case Opt_nsdelegate:
                ctx->flags |= CGRP_ROOT_NS_DELEGATE;
                break;
        case Opt_favordynmods:
                ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
                break;
        case Opt_memory_localevents:
                ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
                break;
        case Opt_memory_recursiveprot:
                ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
                break;
        case Opt_memory_hugetlb_accounting:
                ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
                break;
        default:
                return -EINVAL;
        }

        return 0;
}

/*
 * Make the default hierarchy's root flags match @root_flags: each of the
 * handled options is set when present in @root_flags and cleared when
 * absent.  Nothing happens unless the caller is in init_cgroup_ns.
 */
static void apply_cgroup_root_flags(unsigned int root_flags)
{
        /* Only a task in the initial cgroup namespace changes the flags. */
        if (current->nsproxy->cgroup_ns != &init_cgroup_ns)
                return;

        if (root_flags & CGRP_ROOT_NS_DELEGATE)
                cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
        else
                cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;

        /* FAVOR_DYNMODS is not a plain bit flip; it goes via its helper. */
        cgroup_favor_dynmods(&cgrp_dfl_root,
                             root_flags & CGRP_ROOT_FAVOR_DYNMODS);

        if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
        else
                cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;

        if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
                cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
        else
                cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;

        if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
                cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
        else
                cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
}

static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
        if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
                seq_puts(seq, ",nsdelegate");
        if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
                seq_puts(seq, ",favordynmods");
        if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                seq_puts(seq, ",memory_localevents");
        if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
                seq_puts(seq, ",memory_recursiveprot");
        if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
                seq_puts(seq, ",memory_hugetlb_accounting");
        return 0;
}

static int cgroup_reconfigure(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

        apply_cgroup_root_flags(ctx->flags);
        return 0;
}

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
        struct cgroup_subsys *ss;
        int ssid;

        INIT_LIST_HEAD(&cgrp->self.sibling);
        INIT_LIST_HEAD(&cgrp->self.children);
        INIT_LIST_HEAD(&cgrp->cset_links);
        INIT_LIST_HEAD(&cgrp->pidlists);
        mutex_init(&cgrp->pidlist_mutex);
        cgrp->self.cgroup = cgrp;
        cgrp->self.flags |= CSS_ONLINE;
        cgrp->dom_cgrp = cgrp;
        cgrp->max_descendants = INT_MAX;
        cgrp->max_depth = INT_MAX;
        INIT_LIST_HEAD(&cgrp->rstat_css_list);
        prev_cputime_init(&cgrp->prev_cputime);

        for_each_subsys(ss, ssid)
                INIT_LIST_HEAD(&cgrp->e_csets[ssid]);

        init_waitqueue_head(&cgrp->offline_waitq);
        INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}

/*
 * Initialise the cgroup_root referenced by @ctx->root using the settings
 * collected in the fs context @ctx (flags, name, release agent path and
 * the cpuset clone_children request).
 */
void init_cgroup_root(struct cgroup_fs_context *ctx)
{
        struct cgroup_root *root = ctx->root;
        struct cgroup *root_cgrp = &root->cgrp;

        INIT_LIST_HEAD_RCU(&root->root_list);
        atomic_set(&root->nr_cgrps, 1);
        root_cgrp->root = root;
        init_cgroup_housekeeping(root_cgrp);

        /* DYNMODS must be modified through cgroup_favor_dynmods() */
        root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;

        if (ctx->name)
                strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);

        if (ctx->release_agent)
                strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);

        if (ctx->cpuset_clone_children)
                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root_cgrp->flags);
}

/*
 * Bring an initialised cgroup hierarchy root online.
 *
 * In order: set up the root cgroup's percpu refcount, pre-allocate css_set
 * links, assign a root ID, create the kernfs root, populate the root
 * directory, initialise rstat, bind the subsystems in @ss_mask, and finally
 * add @root to cgroup_roots and link its root cgroup into every css_set in
 * css_set_table.
 *
 * Must be called with cgroup_mutex held.  Returns 0 on success.  On failure
 * the error from the failing step is returned and the steps already taken
 * are undone in reverse order via the exit labels.
 */
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
        struct kernfs_syscall_ops *kf_sops;
        struct css_set *cset;
        int i, ret;

        lockdep_assert_held(&cgroup_mutex);

        ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
                              0, GFP_KERNEL);
        if (ret)
                goto out;

        /*
         * We're accessing css_set_count without locking css_set_lock here,
         * but that's OK - it can only be increased by someone holding
         * cgroup_lock, and that's us.  Later rebinding may disable
         * controllers on the default hierarchy and thus create new csets,
         * which can't be more than the existing ones.  Allocate 2x.
         */
        ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
        if (ret)
                goto cancel_ref;

        ret = cgroup_init_root_id(root);
        if (ret)
                goto cancel_ref;

        /*
         * The default root gets cgroup_kf_syscall_ops; every other root
         * gets cgroup1_kf_syscall_ops.
         */
        kf_sops = root == &cgrp_dfl_root ?
                &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

        root->kf_root = kernfs_create_root(kf_sops,
                                           KERNFS_ROOT_CREATE_DEACTIVATED |
                                           KERNFS_ROOT_SUPPORT_EXPORTOP |
                                           KERNFS_ROOT_SUPPORT_USER_XATTR,
                                           root_cgrp);
        if (IS_ERR(root->kf_root)) {
                ret = PTR_ERR(root->kf_root);
                goto exit_root_id;
        }
        root_cgrp->kn = kernfs_root_to_node(root->kf_root);
        WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
        root_cgrp->ancestors[0] = root_cgrp;

        ret = css_populate_dir(&root_cgrp->self);
        if (ret)
                goto destroy_root;

        ret = cgroup_rstat_init(root_cgrp);
        if (ret)
                goto destroy_root;

        ret = rebind_subsystems(root, ss_mask);
        if (ret)
                goto exit_stats;

        /* a failure here is only warned about, it is not unwound */
        ret = cgroup_bpf_inherit(root_cgrp);
        WARN_ON_ONCE(ret);

        trace_cgroup_setup_root(root);

        /*
         * There must be no failure case after here, since rebinding takes
         * care of subsystems' refcounts, which are explicitly dropped in
         * the failure exit path.
         */
        list_add_rcu(&root->root_list, &cgroup_roots);
        cgroup_root_count++;

        /*
         * Link the root cgroup in this hierarchy into all the css_set
         * objects.
         */
        spin_lock_irq(&css_set_lock);
        hash_for_each(css_set_table, i, cset, hlist) {
                link_css_set(&tmp_links, cset, root_cgrp);
                if (css_set_populated(cset))
                        cgroup_update_populated(root_cgrp, true);
        }
        spin_unlock_irq(&css_set_lock);

        BUG_ON(!list_empty(&root_cgrp->self.children));
        BUG_ON(atomic_read(&root->nr_cgrps) != 1);

        /*
         * Success skips the unwind labels but still goes through "out":
         * tmp_links is released on both the success and failure paths.
         */
        ret = 0;
        goto out;

        /* failure unwind, in reverse order of setup */
exit_stats:
        cgroup_rstat_exit(root_cgrp);
destroy_root:
        kernfs_destroy_root(root->kf_root);
        root->kf_root = NULL;
exit_root_id:
        cgroup_exit_root_id(root);
cancel_ref:
        percpu_ref_exit(&root_cgrp->self.refcnt);
out:
        free_cgrp_cset_links(&tmp_links);
        return ret;
}

/*
 * Common ->get_tree() backend: obtain the kernfs superblock for @ctx->root
 * and, when mounting from a non-init cgroup namespace, replace the returned
 * root dentry with the one for the namespace's root cgroup.
 *
 * Returns the result of kernfs_get_tree(), or the error from looking up the
 * namespace root dentry.
 */
int cgroup_do_get_tree(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
        int ret;

        ctx->kfc.root = ctx->root->kf_root;
        ctx->kfc.magic = (fc->fs_type == &cgroup2_fs_type) ?
                         CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC;

        ret = kernfs_get_tree(fc);

        /*
         * In non-init cgroup namespace, instead of root cgroup's dentry,
         * we return the dentry corresponding to the cgroupns->root_cgrp.
         */
        if (ret == 0 && ctx->ns != &init_cgroup_ns) {
                struct super_block *sb = fc->root->d_sb;
                struct cgroup *ns_cgrp;
                struct dentry *ns_root;

                cgroup_lock();
                spin_lock_irq(&css_set_lock);
                ns_cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
                spin_unlock_irq(&css_set_lock);
                cgroup_unlock();

                ns_root = kernfs_node_dentry(ns_cgrp->kn, sb);
                dput(fc->root);

                if (!IS_ERR(ns_root)) {
                        fc->root = ns_root;
                } else {
                        /* lookup failed: give the superblock back */
                        deactivate_locked_super(sb);
                        ret = PTR_ERR(ns_root);
                        fc->root = NULL;
                }
        }

        /* no new superblock was created: drop the root cgroup reference */
        if (!ctx->kfc.new_sb_created)
                cgroup_put(&ctx->root->cgrp);

        return ret;
}

/*
 * ->free() for cgroup fs contexts: release the strings and the cgroup
 * namespace reference held by the context, let kernfs clean up its part,
 * then free the context itself.
 */
static void cgroup_fs_context_free(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);

        kfree(ctx->release_agent);
        kfree(ctx->name);
        put_cgroup_ns(ctx->ns);

        kernfs_free_fs_context(fc);

        kfree(ctx);
}

/*
 * ->get_tree() for cgroup2: mark the default hierarchy visible, pin its
 * root cgroup, mount it, and apply the requested root flags if the mount
 * succeeded.
 */
static int cgroup_get_tree(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
        int err;

        WRITE_ONCE(cgrp_dfl_visible, true);
        cgroup_get_live(&cgrp_dfl_root.cgrp);
        ctx->root = &cgrp_dfl_root;

        err = cgroup_do_get_tree(fc);
        if (err)
                return err;

        apply_cgroup_root_flags(ctx->flags);
        return 0;
}

/* fs_context operations installed for the cgroup2 filesystem type */
static const struct fs_context_operations cgroup_fs_context_ops = {
        .parse_param = cgroup2_parse_param,
        .get_tree = cgroup_get_tree,
        .reconfigure = cgroup_reconfigure,
        .free = cgroup_fs_context_free,
};

/* fs_context operations installed for every type other than cgroup2 */
static const struct fs_context_operations cgroup1_fs_context_ops = {
        .parse_param = cgroup1_parse_param,
        .get_tree = cgroup1_get_tree,
        .reconfigure = cgroup1_reconfigure,
        .free = cgroup_fs_context_free,
};

/*
 * Set up the filesystem creation/reconfiguration context for a cgroup
 * mount.  The cgroup namespace of the calling task is selected and pinned,
 * the ops table matching the filesystem type is installed, and the
 * context's user namespace is switched to the one owning that cgroup
 * namespace.
 *
 * Returns 0 on success or -ENOMEM if the context cannot be allocated.
 */
static int cgroup_init_fs_context(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return -ENOMEM;

        ctx->ns = current->nsproxy->cgroup_ns;
        get_cgroup_ns(ctx->ns);

        fc->fs_private = &ctx->kfc;
        fc->ops = (fc->fs_type == &cgroup2_fs_type) ?
                  &cgroup_fs_context_ops : &cgroup1_fs_context_ops;

        /* swap the context's user_ns for the cgroup namespace's one */
        put_user_ns(fc->user_ns);
        fc->user_ns = get_user_ns(ctx->ns->user_ns);
        fc->global = true;

        if (have_favordynmods)
                ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;

        return 0;
}

static void cgroup_kill_sb(struct super_block *sb)
{
        struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
        struct cgroup_root *root = cgroup_root_from_kf(kf_root);

        /*
         * If @root doesn't have any children, start killing it.
         * This prevents new mounts by disabling percpu_ref_tryget_live().
         *
         * And don't kill the default root.
         */
        if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
            !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
                cgroup_bpf_offline(&root->cgrp);
                percpu_ref_kill(&root->cgrp.self.refcnt);
        }
        cgroup_put(&root->cgrp);
        kernfs_kill_sb(sb);
}

/* the "cgroup" filesystem type; takes the cgroup1 mount parameters */
struct file_system_type cgroup_fs_type = {
        .name = "cgroup",
        .fs_flags = FS_USERNS_MOUNT,
        .parameters = cgroup1_fs_parameters,
        .init_fs_context = cgroup_init_fs_context,
        .kill_sb = cgroup_kill_sb,
};

/* the "cgroup2" filesystem type; takes the cgroup2 mount parameters */
static struct file_system_type cgroup2_fs_type = {
        .name = "cgroup2",
        .fs_flags = FS_USERNS_MOUNT,
        .parameters = cgroup2_fs_parameters,
        .init_fs_context = cgroup_init_fs_context,
        .kill_sb = cgroup_kill_sb,
};

#ifdef CONFIG_CPUSETS
/* fs_context operations for "cpuset" mounts: no param parsing or reconfigure */
static const struct fs_context_operations cpuset_fs_context_ops = {
        .free = cgroup_fs_context_free,
        .get_tree = cgroup1_get_tree,
};

/*
 * Ugly, but it keeps the userspace API working for existing cpuset users:
 * a mount of the "cpuset" filesystem is silently turned into a mount of
 * "cgroup" with the cpuset subsystem selected, the NOPREFIX flag set and
 * "/sbin/cpuset_release_agent" as the release agent.
 *
 * Returns 0 on success or the error from cgroup_init_fs_context().
 */
static int cpuset_init_fs_context(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx;
        char *agent;
        int err;

        agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);

        err = cgroup_init_fs_context(fc);
        if (err) {
                kfree(agent);
                return err;
        }

        fc->ops = &cpuset_fs_context_ops;

        ctx = cgroup_fc2context(fc);
        ctx->release_agent = agent;
        ctx->flags |= CGRP_ROOT_NOPREFIX;
        ctx->subsys_mask = 1 << cpuset_cgrp_id;

        /* retarget the context at the "cgroup" filesystem type */
        get_filesystem(&cgroup_fs_type);
        put_filesystem(fc->fs_type);
        fc->fs_type = &cgroup_fs_type;

        return 0;
}

/* the legacy "cpuset" filesystem type, redirected to "cgroup" at init time */
static struct file_system_type cpuset_fs_type = {
        .name = "cpuset",
        .fs_flags = FS_USERNS_MOUNT,
        .init_fs_context = cpuset_init_fs_context,
};
#endif

/*
 * Write into @buf (at most @buflen bytes) the path of @cgrp relative to the
 * cgroup that @ns's root css_set maps to in @cgrp's hierarchy.  Returns
 * whatever kernfs_path_from_node() returns.
 */
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
                          struct cgroup_namespace *ns)
{
        struct cgroup *ns_root;

        ns_root = cset_cgroup_from_root(ns->root_cset, cgrp->root);

        return kernfs_path_from_node(cgrp->kn, ns_root->kn, buf, buflen);
}

/*
 * Locked wrapper around cgroup_path_ns_locked(): takes cgroup_lock() and
 * css_set_lock around the lookup and returns its result.
 */
int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
                   struct cgroup_namespace *ns)
{
        int len;

        cgroup_lock();
        spin_lock_irq(&css_set_lock);
        len = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
        spin_unlock_irq(&css_set_lock);
        cgroup_unlock();

        return len;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);

/**
 * cgroup_attach_lock - Lock for ->attach()
 * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
 *
 * Migration sometimes has to keep threadgroups stable against forks and
 * exits, which is done by write-locking cgroup_threadgroup_rwsem.  Some
 * ->attach() implementations (e.g. cpuset) additionally need CPU hotplug
 * disabled, but letting ->attach() take cpus_read_lock() itself can
 * deadlock.
 *
 * Bringing up a CPU may create and destroy tasks, which read-locks
 * threadgroup_rwsem, so threadgroup_rwsem nests inside cpus_read_lock().
 * An ->attach() that acquires the cpus lock while threadgroup_rwsem is
 * write-locked reverses that order: it waits for an on-going hotplug
 * operation that is itself waiting for threadgroup_rwsem to be released so
 * that it can create new tasks.  For more details:
 *
 *   http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
 *
 * The fix is to always take cpus_read_lock() first and only then, if
 * requested, write-lock cgroup_threadgroup_rwsem.  ->attach() may therefore
 * assume that CPU hotplug is disabled on entry.
 */
void cgroup_attach_lock(bool lock_threadgroup)
{
        cpus_read_lock();

        if (!lock_threadgroup)
                return;

        percpu_down_write(&cgroup_threadgroup_rwsem);
}

/**
 * cgroup_attach_unlock - Undo cgroup_attach_lock()
 * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
 *
 * Releases in the reverse order of cgroup_attach_lock(): the threadgroup
 * rwsem first (if it was taken), then the cpus read lock.
 */
void cgroup_attach_unlock(bool lock_threadgroup)
{
        if (lock_threadgroup) {
                percpu_up_write(&cgroup_threadgroup_rwsem);
        }

        cpus_read_unlock();
}

/**
 * cgroup_migrate_add_task - add a migration target task to a migration context
 * @task: target task
 * @mgctx: target migration context
 *
 * Add @task, which is a migration target, to @mgctx->tset.  This function
 * becomes noop if @task doesn't need to be migrated.  @task's css_set
 * should have been added as a migration source and @task->cg_list will be
 * moved from the css_set's tasks list to mg_tasks one.
 */
static void cgroup_migrate_add_task(struct task_struct *task,
                                    struct cgroup_mgctx *mgctx)
{
        struct css_set *cset;

        lockdep_assert_held(&css_set_lock);

        /* @task either already exited or can't exit until the end */
        if (task->flags & PF_EXITING)
                return;

        /* cgroup_threadgroup_rwsem protects racing against forks */
        WARN_ON_ONCE(list_empty(&task->cg_list));

        cset = task_css_set(task);
        if (!cset->mg_src_cgrp)
                return;

        mgctx->tset.nr_tasks++;

        list_move_tail(&task->cg_list, &cset->mg_tasks);
        if (list_empty(&cset->mg_node))
                list_add_tail(&cset->mg_node,
                              &mgctx->tset.src_csets);
        if (list_empty(&cset->mg_dst_cset->mg_node))
                list_add_tail(&cset->mg_dst_cset->mg_node,
                              &mgctx->tset.dst_csets);
}

/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * Rewind the iteration state of @tset to the head of its cset list and
 * return whatever cgroup_taskset_next() yields from there.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
                                         struct cgroup_subsys_state **dst_cssp)
{
        tset->cur_task = NULL;
        tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);

        return cgroup_taskset_next(tset, dst_cssp);
}

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * Return the next task in @tset, or NULL once every cset on the list has
 * been walked.  Iteration must have been initialized with
 * cgroup_taskset_first().  @dst_cssp is only written when a task is
 * returned.
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
                                        struct cgroup_subsys_state **dst_cssp)
{
        struct css_set *cset = tset->cur_cset;
        struct task_struct *task = tset->cur_task;

        while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
                /* step within the current cset, or start at its first task */
                task = task ? list_next_entry(task, cg_list) :
                              list_first_entry(&cset->mg_tasks,
                                               struct task_struct, cg_list);

                if (&task->cg_list == &cset->mg_tasks) {
                        /* this cset is exhausted, move on to the next one */
                        cset = list_next_entry(cset, mg_node);
                        task = NULL;
                        continue;
                }

                tset->cur_cset = cset;
                tset->cur_task = task;

                /*
                 * This function may be called both before and
                 * after cgroup_migrate_execute().  The two cases
                 * can be distinguished by looking at whether @cset
                 * has its ->mg_dst_cset set.
                 */
                if (cset->mg_dst_cset)
                        *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
                else
                        *dst_cssp = cset->subsys[tset->ssid];

                return task;
        }

        return NULL;
}

/**
 * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as setup by migration preparation functions.
 * This function fails iff one of the ->can_attach callbacks fails and
 * guarantees that either all or none of the tasks in @mgctx are migrated.
 * @mgctx is consumed regardless of success.
 *
 * Return: 0 on success, or the value returned by the failing ->can_attach().
 */
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
        struct cgroup_taskset *tset = &mgctx->tset;
        struct cgroup_subsys *ss;
        struct task_struct *task, *tmp_task;
        struct css_set *cset, *tmp_cset;
        int ssid, failed_ssid, ret;

        /* check that we can legitimately attach to the cgroup */
        if (tset->nr_tasks) {
                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                        if (ss->can_attach) {
                                /* tset->ssid selects the css handed out by taskset iteration */
                                tset->ssid = ssid;
                                ret = ss->can_attach(tset);
                                if (ret) {
                                        /* remember where to stop cancelling */
                                        failed_ssid = ssid;
                                        goto out_cancel_attach;
                                }
                        }
                } while_each_subsys_mask();
        }

        /*
         * Now that we're guaranteed success, proceed to move all tasks to
         * the new cgroup.  There are no failure cases after here, so this
         * is the commit point.
         */
        spin_lock_irq(&css_set_lock);
        list_for_each_entry(cset, &tset->src_csets, mg_node) {
                list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
                        struct css_set *from_cset = task_css_set(task);
                        struct css_set *to_cset = cset->mg_dst_cset;

                        /* pin the destination before the task is moved onto it */
                        get_css_set(to_cset);
                        to_cset->nr_tasks++;
                        css_set_move_task(task, from_cset, to_cset, true);
                        from_cset->nr_tasks--;
                        /*
                         * If the source or destination cgroup is frozen,
                         * the task might require to change its state.
                         */
                        cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
                                                    to_cset->dfl_cgrp);
                        put_css_set_locked(from_cset);

                }
        }
        spin_unlock_irq(&css_set_lock);

        /*
         * Migration is committed, all target tasks are now on dst_csets.
         * Nothing is sensitive to fork() after this point.  Notify
         * controllers that migration is complete.
         */
        tset->csets = &tset->dst_csets;

        if (tset->nr_tasks) {
                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                        if (ss->attach) {
                                tset->ssid = ssid;
                                ss->attach(tset);
                        }
                } while_each_subsys_mask();
        }

        ret = 0;
        goto out_release_tset;

out_cancel_attach:
        /*
         * Call ->cancel_attach() only for the subsystems visited before the
         * one whose ->can_attach() failed; the walk stops at failed_ssid.
         */
        if (tset->nr_tasks) {
                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                        if (ssid == failed_ssid)
                                break;
                        if (ss->cancel_attach) {
                                tset->ssid = ssid;
                                ss->cancel_attach(tset);
                        }
                } while_each_subsys_mask();
        }
out_release_tset:
        /*
         * Reached on both success and failure: merge dst_csets into
         * src_csets, splice each cset's mg_tasks back onto its tasks list
         * and take the cset off the migration list.
         */
        spin_lock_irq(&css_set_lock);
        list_splice_init(&tset->dst_csets, &tset->src_csets);
        list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
                list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
                list_del_init(&cset->mg_node);
        }
        spin_unlock_irq(&css_set_lock);

        /*
         * Re-initialize the cgroup_taskset structure in case it is reused
         * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
         * iteration.
         */
        tset->nr_tasks = 0;
        tset->csets    = &tset->src_csets;
        return ret;
}

/**
 * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
 * @dst_cgrp: destination cgroup to test
 *
 * On the default hierarchy, except for the mixable, (possible) thread root
 * and threaded cgroups, subtree_control must be zero for migration
 * destination cgroups with tasks so that child cgroups don't compete
 * against tasks.
 *
 * Return: 0 if @dst_cgrp is acceptable, -EOPNOTSUPP if its domain cgroup is
 * not a valid domain, -EBUSY if subtree_control is non-zero.
 */
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
        /* v1 doesn't have any restriction */
        if (!cgroup_on_dfl(dst_cgrp))
                return 0;

        /* verify @dst_cgrp can host resources */
        if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
                return -EOPNOTSUPP;

        /*
         * If @dst_cgrp is already or can become a thread root or is
         * threaded, it doesn't matter.
         */
        if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
                return 0;

        /* apply no-internal-process constraint */
        return dst_cgrp->subtree_control ? -EBUSY : 0;
}

/**
 * cgroup_migrate_finish - cleanup after attach
 * @mgctx: migration context
 *
 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
 * those functions for details.  Every preloaded source and destination
 * css_set has its migration pointers cleared, is taken off its preload
 * list and has a reference dropped.
 *
 * Must be called with cgroup_mutex held.
 */
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
        struct css_set *cur, *next;

        lockdep_assert_held(&cgroup_mutex);

        spin_lock_irq(&css_set_lock);

        /* preloaded sources */
        list_for_each_entry_safe(cur, next, &mgctx->preloaded_src_csets,
                                 mg_src_preload_node) {
                cur->mg_dst_cset = NULL;
                cur->mg_dst_cgrp = NULL;
                cur->mg_src_cgrp = NULL;
                list_del_init(&cur->mg_src_preload_node);
                put_css_set_locked(cur);
        }

        /* preloaded destinations */
        list_for_each_entry_safe(cur, next, &mgctx->preloaded_dst_csets,
                                 mg_dst_preload_node) {
                cur->mg_dst_cset = NULL;
                cur->mg_dst_cgrp = NULL;
                cur->mg_src_cgrp = NULL;
                list_del_init(&cur->mg_dst_preload_node);
                put_css_set_locked(cur);
        }

        spin_unlock_irq(&css_set_lock);
}

/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @mgctx: migration context
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
 * @src_cset and add it to @mgctx->preloaded_src_csets, which should later
 * be cleaned up by cgroup_migrate_finish().
 *
 * This function may be called without holding cgroup_threadgroup_rwsem
 * even if the target is a process.  Threads may be created and destroyed
 * but as long as cgroup_mutex is not dropped, no new css_set can be put
 * into play and the preloaded css_sets are guaranteed to cover all
 * migrations.
 *
 * Must be called with both cgroup_mutex and css_set_lock held.
 */
void cgroup_migrate_add_src(struct css_set *src_cset,
                            struct cgroup *dst_cgrp,
                            struct cgroup_mgctx *mgctx)
{
        lockdep_assert_held(&cgroup_mutex);
        lockdep_assert_held(&css_set_lock);

        /*
         * If ->dead, @src_cset is associated with one or more dead cgroups
         * and doesn't contain any migratable tasks.  Ignore it early so
         * that the rest of migration path doesn't get confused by it.
         * A css_set that is already on a preload list is skipped as well.
         */
        if (src_cset->dead || !list_empty(&src_cset->mg_src_preload_node))
                return;

        WARN_ON(src_cset->mg_src_cgrp);
        WARN_ON(src_cset->mg_dst_cgrp);
        WARN_ON(!list_empty(&src_cset->mg_tasks));
        WARN_ON(!list_empty(&src_cset->mg_node));

        src_cset->mg_src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
        src_cset->mg_dst_cgrp = dst_cgrp;

        get_css_set(src_cset);
        list_add_tail(&src_cset->mg_src_preload_node,
                      &mgctx->preloaded_src_csets);
}

/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
 * @mgctx: migration context
 *
 * Tasks are about to be moved and all the source css_sets have been
 * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
 * pins all destination css_sets, links each to its source, and appends them
 * to @mgctx->preloaded_dst_csets.  Every subsystem whose css differs
 * between a source and its destination is recorded in @mgctx->ss_mask.
 *
 * This function must be called after cgroup_migrate_add_src() has been
 * called on each migration source css_set.  After migration is performed
 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
 * @mgctx.
 *
 * Return: 0 on success, -ENOMEM if a destination css_set cannot be found
 * or created.
 */
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
        struct css_set *src, *next;

        lockdep_assert_held(&cgroup_mutex);

        /* look up the dst cset for each src cset and link it to src */
        list_for_each_entry_safe(src, next, &mgctx->preloaded_src_csets,
                                 mg_src_preload_node) {
                struct cgroup_subsys *ss;
                struct css_set *dst;
                int ssid;

                dst = find_css_set(src, src->mg_dst_cgrp);
                if (!dst)
                        return -ENOMEM;

                WARN_ON_ONCE(src->mg_dst_cset || dst->mg_dst_cset);

                /*
                 * If src cset equals dst, it's noop.  Drop the src.
                 * cgroup_migrate() will skip the cset too.  Note that we
                 * can't handle src == dst as some nodes are used by both.
                 */
                if (src == dst) {
                        src->mg_src_cgrp = NULL;
                        src->mg_dst_cgrp = NULL;
                        list_del_init(&src->mg_src_preload_node);
                        put_css_set(src);
                        put_css_set(dst);
                        continue;
                }

                src->mg_dst_cset = dst;

                /*
                 * A dst that is already on the preload list is not added
                 * twice; the reference just obtained is dropped instead
                 * (presumably taken by find_css_set() - confirm there).
                 */
                if (!list_empty(&dst->mg_dst_preload_node))
                        put_css_set(dst);
                else
                        list_add_tail(&dst->mg_dst_preload_node,
                                      &mgctx->preloaded_dst_csets);

                for_each_subsys(ss, ssid) {
                        if (src->subsys[ssid] != dst->subsys[ssid])
                                mgctx->ss_mask |= 1 << ssid;
                }
        }

        return 0;
}

/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @mgctx: migration context
 *
 * Migrate a process or task denoted by @leader.  If migrating a process,
 * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
 * responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 * actually starting migrating.
 *
 * Return: the result of cgroup_migrate_execute().
 */
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
                   struct cgroup_mgctx *mgctx)
{
        struct task_struct *task;

        /*
         * The following thread iteration should be inside an RCU critical
         * section to prevent tasks from being freed while taking the snapshot.
         * spin_lock_irq() implies RCU critical section here.
         */
        spin_lock_irq(&css_set_lock);
        if (!threadgroup) {
                /* single task: only @leader itself is queued */
                cgroup_migrate_add_task(leader, mgctx);
        } else {
                task = leader;
                do {
                        cgroup_migrate_add_task(task, mgctx);
                } while_each_thread(leader, task);
        }
        spin_unlock_irq(&css_set_lock);

        return cgroup_migrate_execute(mgctx);
}

/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @dst_cgrp: the cgroup to attach to
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 *
 * Return: 0 on success, otherwise the error from preparing the destination
 * css_sets or from the migration itself.
 */
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
                       bool threadgroup)
{
        DEFINE_CGROUP_MGCTX(mgctx);
        struct task_struct *task;
        int ret;

        /* look up all src csets */
        spin_lock_irq(&css_set_lock);
        rcu_read_lock();
        task = leader;
        do {
                cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
                if (!threadgroup)
                        break;
        } while_each_thread(leader, task);
        rcu_read_unlock();
        spin_unlock_irq(&css_set_lock);

        /* prepare dst csets and commit */
        ret = cgroup_migrate_prepare_dst(&mgctx);
        if (ret == 0)
                ret = cgroup_migrate(leader, threadgroup, &mgctx);

        /* the preloaded csets are released whether or not migration worked */
        cgroup_migrate_finish(&mgctx);

        if (ret == 0)
                TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);

        return ret;
}

/**
 * cgroup_procs_write_start - resolve and pin the task named by a PID write
 * @buf: PID string; whitespace is stripped and it is parsed with kstrtoint()
 * @threadgroup: if true, the resolved task is replaced by its group leader
 * @threadgroup_locked: out parameter; receives the value that was passed to
 *                      cgroup_attach_lock(), or false on the error paths
 *                      that undo the lock
 *
 * Must be called with cgroup_mutex held (checked with lockdep).  A PID of 0
 * selects %current.
 *
 * On success the returned task carries an extra reference taken with
 * get_task_struct() and the attach lock is still held as recorded in
 * *@threadgroup_locked; cgroup_procs_write_finish() drops both.
 *
 * On failure an ERR_PTR() is returned: -EINVAL if @buf doesn't parse or the
 * PID is negative (returned before any locking, *@threadgroup_locked is left
 * untouched), -ESRCH if no task has that PID, and -EINVAL if the task has
 * no_cgroup_migration or %PF_NO_SETAFFINITY set.  The latter two paths
 * release the attach lock and reset *@threadgroup_locked to false.
 */
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
                                             bool *threadgroup_locked)
{
        struct task_struct *tsk;
        pid_t pid;

        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
                return ERR_PTR(-EINVAL);

        /*
         * If we migrate a single thread, we don't care about threadgroup
         * stability. If the thread is `current`, it won't exit(2) under our
         * hands or change PID through exec(2). We exclude
         * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
         * callers by cgroup_mutex.
         * Therefore, we can skip the global lock.
         */
        lockdep_assert_held(&cgroup_mutex);
        /* false only for the "current thread, no threadgroup" case above */
        *threadgroup_locked = pid || threadgroup;
        cgroup_attach_lock(*threadgroup_locked);

        /* the PID lookup and the checks on @tsk run under RCU */
        rcu_read_lock();
        if (pid) {
                tsk = find_task_by_vpid(pid);
                if (!tsk) {
                        tsk = ERR_PTR(-ESRCH);
                        goto out_unlock_threadgroup;
                }
        } else {
                tsk = current;
        }

        if (threadgroup)
                tsk = tsk->group_leader;

        /*
         * kthreads may acquire PF_NO_SETAFFINITY during initialization.
         * If userland migrates such a kthread to a non-root cgroup, it can
         * become trapped in a cpuset, or RT kthread may be born in a
         * cgroup with no rt_runtime allocated.  Just say no.
         */
        if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
                tsk = ERR_PTR(-EINVAL);
                goto out_unlock_threadgroup;
        }

        /* pin the task before leaving the RCU section; success path */
        get_task_struct(tsk);
        goto out_unlock_rcu;

out_unlock_threadgroup:
        /* error path: undo cgroup_attach_lock() and tell the caller so */
        cgroup_attach_unlock(*threadgroup_locked);
        *threadgroup_locked = false;
out_unlock_rcu:
        rcu_read_unlock();
        return tsk;
}

/*
 * Counterpart of cgroup_procs_write_start(): drops the task reference taken
 * there, releases the attach lock according to @threadgroup_locked and then
 * invokes the ->post_attach() callback of every subsystem that has one.
 */
void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
{
        struct cgroup_subsys *ss;
        int ssid;

        /* release reference from cgroup_procs_write_start() */
        put_task_struct(task);
        cgroup_attach_unlock(threadgroup_locked);

        for_each_subsys(ss, ssid) {
                if (!ss->post_attach)
                        continue;
                ss->post_attach();
        }
}

/*
 * Emit the names of all subsystems set in @ss_mask to @seq as a single
 * space-separated line.  Nothing at all (not even a newline) is written
 * when no subsystem in the mask is visited.
 */
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
        struct cgroup_subsys *ss;
        int nr_printed = 0;
        int ssid;

        do_each_subsys_mask(ss, ssid, ss_mask) {
                /* separator goes before every name except the first */
                if (nr_printed++)
                        seq_putc(seq, ' ');
                seq_puts(seq, ss->name);
        } while_each_subsys_mask();

        if (nr_printed)
                seq_putc(seq, '\n');
}

/*
 * seq_file show handler: print the controllers which are enabled from the
 * parent, i.e. the cgroup_control() mask of the cgroup behind @seq.
 */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
        cgroup_print_ss_mask(seq, cgroup_control(seq_css(seq)->cgroup));
        return 0;
}

/*
 * seq_file show handler: print the controllers which are enabled for the
 * children of the cgroup behind @seq, i.e. its ->subtree_control mask.
 */
static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
        cgroup_print_ss_mask(seq, seq_css(seq)->cgroup->subtree_control);
        return 0;
}

/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's control masks have changed and its subtree's css associations
 * need to be updated accordingly.  This function looks up all css_sets
 * which are attached to the subtree, creates the matching updated css_sets
 * and migrates the tasks to the new ones.
 *
 * Must be called with cgroup_mutex held (checked with lockdep).
 *
 * Returns 0 on success, otherwise the error from
 * cgroup_migrate_prepare_dst() or cgroup_migrate_execute().
 */
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
        DEFINE_CGROUP_MGCTX(mgctx);
        struct cgroup_subsys_state *d_css;
        struct cgroup *dsct;
        struct css_set *src_cset;
        bool has_tasks;
        int ret;

        lockdep_assert_held(&cgroup_mutex);

        /* look up all csses currently attached to @cgrp's subtree */
        spin_lock_irq(&css_set_lock);
        cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
                struct cgrp_cset_link *link;

                /*
                 * cgroup_update_dfl_csses() is only called by
                 * cgroup_apply_control(), so the csses associated with
                 * the given cgrp itself will not be affected by changes
                 * made to its subtree_control file.  We can skip them.
                 */
                if (dsct == cgrp)
                        continue;

                list_for_each_entry(link, &dsct->cset_links, cset_link)
                        cgroup_migrate_add_src(link->cset, dsct, &mgctx);
        }
        spin_unlock_irq(&css_set_lock);

        /*
         * We need to write-lock threadgroup_rwsem while migrating tasks.
         * However, if there are no source csets for @cgrp, changing its
         * controllers isn't gonna produce any task migrations and the
         * write-locking can be skipped safely.
         */
        has_tasks = !list_empty(&mgctx.preloaded_src_csets);
        cgroup_attach_lock(has_tasks);

        /* NULL dst indicates self on default hierarchy */
        ret = cgroup_migrate_prepare_dst(&mgctx);
        if (ret)
                goto out_finish;

        /* queue every task of every preloaded source cset for migration */
        spin_lock_irq(&css_set_lock);
        list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
                            mg_src_preload_node) {
                struct task_struct *task, *ntask;

                /* all tasks in src_csets need to be migrated */
                list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
                        cgroup_migrate_add_task(task, &mgctx);
        }
        spin_unlock_irq(&css_set_lock);

        ret = cgroup_migrate_execute(&mgctx);
out_finish:
        /* reached on both success and failure; has_tasks mirrors the lock call */
        cgroup_migrate_finish(&mgctx);
        cgroup_attach_unlock(has_tasks);
        return ret;
}

/**
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
 * @cgrp: root of the target subtree
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
 *
 * Returns with cgroup_mutex held.  The mutex is dropped around every sleep
 * and the subtree walk is restarted from the beginning afterwards, so the
 * function only returns once a full walk finds no dying css.
 */
void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
        __acquires(&cgroup_mutex)
{
        struct cgroup *dsct;
        struct cgroup_subsys_state *d_css;
        struct cgroup_subsys *ss;
        int ssid;

restart:
        cgroup_lock();

        cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
                for_each_subsys(ss, ssid) {
                        struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
                        DEFINE_WAIT(wait);

                        /* only csses whose refcnt is dying need to be waited for */
                        if (!css || !percpu_ref_is_dying(&css->refcnt))
                                continue;

                        /*
                         * Take a reference on @dsct for the duration of the
                         * unlocked sleep and queue on its offline_waitq
                         * while cgroup_mutex is still held.
                         */
                        cgroup_get_live(dsct);
                        prepare_to_wait(&dsct->offline_waitq, &wait,
                                        TASK_UNINTERRUPTIBLE);

                        cgroup_unlock();
                        schedule();
                        finish_wait(&dsct->offline_waitq, &wait);

                        /* the mutex was dropped; start the whole walk over */
                        cgroup_put(dsct);
                        goto restart;
                }
        }
}

/**
 * cgroup_save_control - save control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * For every live cgroup in @cgrp's subtree, @cgrp included, copy
 * ->subtree_control, ->subtree_ss_mask and ->dom_cgrp into the matching
 * old_ prefixed fields so that cgroup_restore_control() can roll back.
 */
static void cgroup_save_control(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup *pos;

        cgroup_for_each_live_descendant_pre(pos, pos_css, cgrp) {
                pos->old_dom_cgrp = pos->dom_cgrp;
                pos->old_subtree_ss_mask = pos->subtree_ss_mask;
                pos->old_subtree_control = pos->subtree_control;
        }
}

/**
 * cgroup_propagate_control - refresh control masks of a subtree
 * @cgrp: root of the target subtree
 *
 * Walks @cgrp's subtree in pre-order.  For each live cgroup,
 * ->subtree_control is trimmed to what cgroup_control() reports as
 * available and ->subtree_ss_mask is recomputed from the result, so that
 * descendants don't keep unavailable controllers enabled.
 */
static void cgroup_propagate_control(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup *pos;

        cgroup_for_each_live_descendant_pre(pos, pos_css, cgrp) {
                /* drop whatever is no longer available at this level */
                pos->subtree_control = pos->subtree_control & cgroup_control(pos);

                /* then derive the effective mask from the trimmed one */
                pos->subtree_ss_mask = cgroup_calc_subtree_ss_mask(
                                                pos->subtree_control,
                                                cgroup_ss_mask(pos));
        }
}

/**
 * cgroup_restore_control - restore control masks and dom_cgrp of a subtree
 * @cgrp: root of the target subtree
 *
 * Undo of cgroup_save_control(): for every live cgroup in @cgrp's subtree,
 * @cgrp included, copy the old_ prefixed fields back into
 * ->subtree_control, ->subtree_ss_mask and ->dom_cgrp.
 */
static void cgroup_restore_control(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup *pos;

        cgroup_for_each_live_descendant_post(pos, pos_css, cgrp) {
                pos->dom_cgrp = pos->old_dom_cgrp;
                pos->subtree_ss_mask = pos->old_subtree_ss_mask;
                pos->subtree_control = pos->old_subtree_control;
        }
}

/*
 * Decide whether @css should be visible on its cgroup.  It is visible when
 * its subsystem is set in cgroup_control().  Otherwise it is visible only
 * if the subsystem is set in cgroup_ss_mask(), the cgroup is on the default
 * hierarchy and the subsystem is flagged ->implicit_on_dfl.
 */
static bool css_visible(struct cgroup_subsys_state *css)
{
        struct cgroup *cgrp = css->cgroup;
        struct cgroup_subsys *ss = css->ss;
        const int ss_bit = 1 << ss->id;

        if (cgroup_control(cgrp) & ss_bit)
                return true;

        if (cgroup_ss_mask(cgrp) & ss_bit)
                return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;

        return false;
}

/**
 * cgroup_apply_control_enable - enable or show csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree in pre-order.  For each subsystem set in a cgroup's
 * cgroup_ss_mask(), create the css if it doesn't exist yet and populate its
 * directory entries if css_visible() says it should be visible.  A css
 * which is enabled only implicitly through dependency is therefore created
 * without files and gains them once userland enables it explicitly.
 *
 * Returns 0 on success, -errno on failure.  On failure, csses which have
 * been processed already aren't cleaned up.  The caller is responsible for
 * cleaning up with cgroup_apply_control_disable().
 */
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup_subsys *ss;
        struct cgroup *pos;
        int ssid;

        cgroup_for_each_live_descendant_pre(pos, pos_css, cgrp) {
                for_each_subsys(ss, ssid) {
                        struct cgroup_subsys_state *css = cgroup_css(pos, ss);
                        int err;

                        /* @ss isn't enabled on this cgroup, nothing to do */
                        if (!(cgroup_ss_mask(pos) & (1 << ss->id)))
                                continue;

                        if (!css) {
                                css = css_create(pos, ss);
                                if (IS_ERR(css))
                                        return PTR_ERR(css);
                        }

                        WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));

                        /* invisible csses get no interface files */
                        if (!css_visible(css))
                                continue;

                        err = css_populate_dir(css);
                        if (err)
                                return err;
                }
        }

        return 0;
}

/**
 * cgroup_apply_control_disable - kill or hide csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree in post-order and bring the existing csses in line
 * with cgroup_ss_mask() and css_visible().  A css which has a parent and
 * whose subsystem is no longer in cgroup_ss_mask() is killed.  Any other
 * css which shouldn't be visible has its directory entries removed and, if
 * the subsystem provides ->css_reset(), is reset.
 *
 * A css is hidden rather than killed when the userland requests it to be
 * disabled while other subsystems are still depending on it.  The css must
 * not actively control resources and be in the vanilla state if it's made
 * visible again later.  Controllers which may be depended upon should
 * provide ->css_reset() for this purpose.
 */
static void cgroup_apply_control_disable(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pos_css;
        struct cgroup_subsys *ss;
        struct cgroup *pos;
        int ssid;

        cgroup_for_each_live_descendant_post(pos, pos_css, cgrp) {
                for_each_subsys(ss, ssid) {
                        struct cgroup_subsys_state *css = cgroup_css(pos, ss);

                        if (!css)
                                continue;

                        WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));

                        /* no longer enabled at all: get rid of the css */
                        if (css->parent &&
                            !(cgroup_ss_mask(pos) & (1 << ss->id))) {
                                kill_css(css);
                                continue;
                        }

                        if (css_visible(css))
                                continue;

                        /* still needed but must be hidden and reset */
                        css_clear_dir(css);
                        if (ss->css_reset)
                                ss->css_reset(css);
                }
        }
}

/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * Subsystems can be enabled and disabled in a subtree using the following
 * steps.
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function implements step 3: it propagates the mask changes
 * throughout @cgrp's subtree, creates or shows csses accordingly and
 * performs process migrations.
 *
 * Returns 0 on success, otherwise the error from
 * cgroup_apply_control_enable() or cgroup_update_dfl_csses().
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
        int err;

        cgroup_propagate_control(cgrp);

        err = cgroup_apply_control_enable(cgrp);

        /*
         * Once enabling succeeded, cgroup_e_css_by_mask() results reflect
         * the new csses, so cgroup_update_dfl_csses() can properly update
         * the css associations of all tasks in the subtree.
         */
        if (!err)
                err = cgroup_update_dfl_csses(cgrp);

        return err;
}

/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Last step of a control mask update; see cgroup_apply_control() for the
 * whole sequence.  If @ret indicates failure, the masks saved by
 * cgroup_save_control() are restored and propagated again first.  In both
 * cases csses which don't match the resulting masks are then killed or
 * hidden.
 */
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
        /* the update failed: roll the masks back to the saved state */
        if (ret != 0) {
                cgroup_restore_control(cgrp);
                cgroup_propagate_control(cgrp);
        }

        cgroup_apply_control_disable(cgrp);
}

/*
 * Check whether the controllers in @enable may be turned on in @cgrp's
 * subtree_control.
 *
 * Returns 0 if allowed, -EOPNOTSUPP if @cgrp's domain is invalid or a
 * domain controller is requested inside a thread subtree, and -EBUSY if
 * @cgrp has tasks which would end up competing against child cgroups.
 */
static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
{
        /* if nothing is getting enabled, nothing to worry about */
        if (!enable)
                return 0;

        /* can @cgrp host any resources? */
        if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
                return -EOPNOTSUPP;

        /* mixables don't care */
        if (cgroup_is_mixable(cgrp))
                return 0;

        if (enable & ~cgrp_dfl_threaded_ss_mask) {
                /* can't enable domain controllers inside a thread subtree */
                if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
                        return -EOPNOTSUPP;
        } else if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) {
                /*
                 * Only threaded controllers are requested.  They can handle
                 * internal competitions and are always allowed inside a
                 * (prospective) thread subtree.
                 */
                return 0;
        }

        /*
         * Controllers can't be enabled for a cgroup with tasks to avoid
         * child cgroups competing against tasks.
         */
        return cgroup_has_tasks(cgrp) ? -EBUSY : 0;
}

/*
 * change the enabled child controllers for a cgroup in the default hierarchy
 *
 * @buf holds a space separated list of controller names, each prefixed
 * with '+' (enable) or '-' (disable).  When the same controller appears
 * more than once, the last occurrence wins.
 *
 * Returns @nbytes on success (including when the request is a no-op) or a
 * negative errno: -EINVAL for an unknown controller name or a missing
 * +/- prefix, -ENODEV if the cgroup can't be locked live, -ENOENT when
 * enabling a controller which isn't in cgroup_control() of this cgroup,
 * -EBUSY when disabling a controller which a child still has enabled, or
 * whatever cgroup_vet_subtree_control_enable() / cgroup_apply_control()
 * report.
 */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes,
                                            loff_t off)
{
        u16 enable = 0, disable = 0;
        struct cgroup *cgrp, *child;
        struct cgroup_subsys *ss;
        char *tok;
        int ssid, ret;

        /*
         * Parse input - space separated list of subsystem names prefixed
         * with either + or -.
         */
        buf = strstrip(buf);
        while ((tok = strsep(&buf, " "))) {
                /* consecutive spaces yield empty tokens; ignore them */
                if (tok[0] == '\0')
                        continue;
                do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
                        /* tok + 1 skips the one-character +/- prefix */
                        if (!cgroup_ssid_enabled(ssid) ||
                            strcmp(tok + 1, ss->name))
                                continue;

                        /* a later token for the same controller overrides */
                        if (*tok == '+') {
                                enable |= 1 << ssid;
                                disable &= ~(1 << ssid);
                        } else if (*tok == '-') {
                                disable |= 1 << ssid;
                                enable &= ~(1 << ssid);
                        } else {
                                return -EINVAL;
                        }
                        break;
                } while_each_subsys_mask();
                /*
                 * NOTE(review): presumably the loop macro leaves ssid at
                 * CGROUP_SUBSYS_COUNT when no name matched - confirm
                 * against the macro definition.
                 */
                if (ssid == CGROUP_SUBSYS_COUNT)
                        return -EINVAL;
        }

        cgrp = cgroup_kn_lock_live(of->kn, true);
        if (!cgrp)
                return -ENODEV;

        /* drop no-op requests from the masks and reject impossible ones */
        for_each_subsys(ss, ssid) {
                if (enable & (1 << ssid)) {
                        /* already enabled: nothing to do for this one */
                        if (cgrp->subtree_control & (1 << ssid)) {
                                enable &= ~(1 << ssid);
                                continue;
                        }

                        if (!(cgroup_control(cgrp) & (1 << ssid))) {
                                ret = -ENOENT;
                                goto out_unlock;
                        }
                } else if (disable & (1 << ssid)) {
                        /* already disabled: nothing to do for this one */
                        if (!(cgrp->subtree_control & (1 << ssid))) {
                                disable &= ~(1 << ssid);
                                continue;
                        }

                        /* a child has it enabled? */
                        cgroup_for_each_live_child(child, cgrp) {
                                if (child->subtree_control & (1 << ssid)) {
                                        ret = -EBUSY;
                                        goto out_unlock;
                                }
                        }
                }
        }

        /* everything requested is already in effect */
        if (!enable && !disable) {
                ret = 0;
                goto out_unlock;
        }

        ret = cgroup_vet_subtree_control_enable(cgrp, enable);
        if (ret)
                goto out_unlock;

        /* save and update control masks and prepare csses */
        cgroup_save_control(cgrp);

        cgrp->subtree_control |= enable;
        cgrp->subtree_control &= ~disable;

        /* finalize runs on failure too so that the saved masks are restored */
        ret = cgroup_apply_control(cgrp);
        cgroup_finalize_control(cgrp, ret);
        if (ret)
                goto out_unlock;

        kernfs_activate(cgrp->kn);
out_unlock:
        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
}

/**
 * cgroup_enable_threaded - make @cgrp threaded
 * @cgrp: the target cgroup
 *
 * Called when "threaded" is written to the cgroup.type interface file and
 * tries to make @cgrp threaded and join the parent's resource domain.
 * This function is never called on the root cgroup as cgroup.type doesn't
 * exist on it.
 */
static int cgroup_enable_threaded(struct cgroup *cgrp)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup *dom_cgrp = parent->dom_cgrp;
        struct cgroup *dsct;
        struct cgroup_subsys_state *d_css;
        int ret;

        lockdep_assert_held(&cgroup_mutex);

        /* noop if already threaded */
        if (cgroup_is_threaded(cgrp))
                return 0;

        /*
         * If @cgroup is populated or has domain controllers enabled, it
         * can't be switched.  While the below cgroup_can_be_thread_root()
         * test can catch the same conditions, that's only when @parent is
         * not mixable, so let's check it explicitly.
         */
        if (cgroup_is_populated(cgrp) ||
            cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
                return -EOPNOTSUPP;

        /* we're joining the parent's domain, ensure its validity */
        if (!cgroup_is_valid_domain(dom_cgrp) ||
            !cgroup_can_be_thread_root(dom_cgrp))
                return -EOPNOTSUPP;

        /*
         * The following shouldn't cause actual migrations and should
         * always succeed.
         */
        cgroup_save_control(cgrp);

        cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
                if (dsct == cgrp || cgroup_is_threaded(dsct))
                        dsct->dom_cgrp = dom_cgrp;

        ret = cgroup_apply_control(cgrp);
        if (!ret)
                parent->nr_threaded_children++;

        cgroup_finalize_control(cgrp, ret);
        return ret;
}

/*
 * seq_file show handler reporting the type of the cgroup behind @seq as one
 * of "threaded", "domain invalid", "domain threaded" or "domain".  The
 * checks are made in that order and the first match wins.
 */
static int cgroup_type_show(struct seq_file *seq, void *v)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        const char *type = "domain\n";

        if (cgroup_is_threaded(cgrp))
                type = "threaded\n";
        else if (!cgroup_is_valid_domain(cgrp))
                type = "domain invalid\n";
        else if (cgroup_is_thread_root(cgrp))
                type = "domain threaded\n";

        seq_puts(seq, type);
        return 0;
}

/*
 * Write handler for the cgroup type.  The only accepted input is
 * "threaded" (surrounding whitespace is stripped), which switches the
 * cgroup to threaded mode through cgroup_enable_threaded().
 *
 * Returns @nbytes on success, -EINVAL for any other input, -ENOENT if the
 * cgroup can't be locked live, or the error from cgroup_enable_threaded().
 */
static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
{
        struct cgroup *cgrp;
        int ret;

        /* only switching to threaded mode is supported */
        if (strcmp(strstrip(buf), "threaded") != 0)
                return -EINVAL;

        /* drain dying csses before we re-apply (threaded) subtree control */
        cgrp = cgroup_kn_lock_live(of->kn, true);
        if (!cgrp)
                return -ENOENT;

        /* threaded can only be enabled */
        ret = cgroup_enable_threaded(cgrp);

        cgroup_kn_unlock(of->kn);

        if (ret)
                return ret;
        return nbytes;
}

/*
 * seq_file show handler for ->max_descendants.  The value is read once
 * without locking; INT_MAX is printed as "max", anything else as a decimal
 * number.
 */
static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
{
        int limit = READ_ONCE(seq_css(seq)->cgroup->max_descendants);

        if (limit != INT_MAX)
                seq_printf(seq, "%d\n", limit);
        else
                seq_puts(seq, "max\n");

        return 0;
}

/*
 * Write handler for ->max_descendants.
 *
 * @buf is either "max", which is stored as INT_MAX, or an integer parsed
 * with kstrtoint() (base auto-detected).  Surrounding whitespace is
 * stripped first.
 *
 * Returns @nbytes on success, the kstrtoint() error for unparsable input,
 * -ERANGE for negative values and -ENOENT if the cgroup can't be locked
 * live.
 */
static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes, loff_t off)
{
        struct cgroup *cgrp;
        int descendants;
        ssize_t ret;

        buf = strstrip(buf);
        if (!strcmp(buf, "max")) {
                descendants = INT_MAX;
        } else {
                ret = kstrtoint(buf, 0, &descendants);
                if (ret)
                        return ret;
        }

        if (descendants < 0)
                return -ERANGE;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENOENT;

        /*
         * cgroup_max_descendants_show() reads this field locklessly with
         * READ_ONCE(), so mark the store to pair with it.
         */
        WRITE_ONCE(cgrp->max_descendants, descendants);

        cgroup_kn_unlock(of->kn);

        return nbytes;
}

/*
 * seq_file show handler for ->max_depth.  The value is read once without
 * locking; INT_MAX is printed as "max", anything else as a decimal number.
 */
static int cgroup_max_depth_show(struct seq_file *seq, void *v)
{
        int limit = READ_ONCE(seq_css(seq)->cgroup->max_depth);

        if (limit != INT_MAX)
                seq_printf(seq, "%d\n", limit);
        else
                seq_puts(seq, "max\n");

        return 0;
}

/*
 * Write handler for ->max_depth.
 *
 * @buf is either "max", which is stored as INT_MAX, or an integer parsed
 * with kstrtoint() (base auto-detected).  Surrounding whitespace is
 * stripped first.
 *
 * Returns @nbytes on success, the kstrtoint() error for unparsable input,
 * -ERANGE for negative values and -ENOENT if the cgroup can't be locked
 * live.
 */
static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
                                      char *buf, size_t nbytes, loff_t off)
{
        struct cgroup *cgrp;
        ssize_t ret;
        int depth;

        buf = strstrip(buf);
        if (!strcmp(buf, "max")) {
                depth = INT_MAX;
        } else {
                ret = kstrtoint(buf, 0, &depth);
                if (ret)
                        return ret;
        }

        if (depth < 0)
                return -ERANGE;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENOENT;

        /*
         * cgroup_max_depth_show() reads this field locklessly with
         * READ_ONCE(), so mark the store to pair with it.
         */
        WRITE_ONCE(cgrp->max_depth, depth);

        cgroup_kn_unlock(of->kn);

        return nbytes;
}

/*
 * seq_file show handler printing two "key value" lines for the cgroup
 * behind @seq: "populated" from cgroup_is_populated() and "frozen" from
 * the CGRP_FROZEN flag bit.
 */
static int cgroup_events_show(struct seq_file *seq, void *v)
{
        struct cgroup *target = seq_css(seq)->cgroup;

        seq_printf(seq, "populated %d\n", cgroup_is_populated(target));
        seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &target->flags));

        return 0;
}

/*
 * seq_file show handler printing the ->nr_descendants and
 * ->nr_dying_descendants counters of the cgroup behind @seq, one
 * "key value" pair per line.
 */
static int cgroup_stat_show(struct seq_file *seq, void *v)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;

        seq_printf(seq, "nr_descendants %d\n", cgrp->nr_descendants);
        seq_printf(seq, "nr_dying_descendants %d\n", cgrp->nr_dying_descendants);

        return 0;
}

#ifdef CONFIG_CGROUP_SCHED
/**
 * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Look up @cgrp's css for @ss under RCU and try to take a reference on it
 * with css_tryget_online().  Returns the css with the reference held, or
 * %NULL if the css doesn't exist or the tryget fails (css offline).
 */
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
                                                     struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *found = NULL;
        struct cgroup_subsys_state *css;

        rcu_read_lock();
        css = cgroup_css(cgrp, ss);
        if (css && css_tryget_online(css))
                found = css;
        rcu_read_unlock();

        return found;
}

/*
 * Let subsystem @ssid append its extra stats for the cgroup behind @seq.
 *
 * Returns 0 without printing anything if the subsystem has no
 * ->css_extra_stat_show() callback or no css reference could be obtained;
 * otherwise returns whatever the callback returns.
 */
static int cgroup_extra_stat_show(struct seq_file *seq, int ssid)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        struct cgroup_subsys *ss = cgroup_subsys[ssid];
        struct cgroup_subsys_state *css;
        int ret = 0;

        if (ss->css_extra_stat_show) {
                css = cgroup_tryget_css(cgrp, ss);
                if (css) {
                        ret = ss->css_extra_stat_show(seq, css);
                        /* drop the reference from cgroup_tryget_css() */
                        css_put(css);
                }
        }

        return ret;
}

/*
 * Let subsystem @ssid append its local stats for @cgrp to @seq.
 *
 * Returns 0 without printing anything if the subsystem has no
 * ->css_local_stat_show() callback or no css reference could be obtained;
 * otherwise returns whatever the callback returns.
 */
static int cgroup_local_stat_show(struct seq_file *seq,
                                  struct cgroup *cgrp, int ssid)
{
        struct cgroup_subsys *ss = cgroup_subsys[ssid];
        struct cgroup_subsys_state *css;
        int ret = 0;

        if (ss->css_local_stat_show) {
                css = cgroup_tryget_css(cgrp, ss);
                if (css) {
                        ret = ss->css_local_stat_show(seq, css);
                        /* drop the reference from cgroup_tryget_css() */
                        css_put(css);
                }
        }

        return ret;
}
#endif

/*
 * seq_file show handler for cpu stats: always prints the base cputime
 * stats and, with CONFIG_CGROUP_SCHED, appends the cpu controller's extra
 * stats.  Returns the result of the extra-stat step, or 0 when that step
 * is compiled out.
 */
static int cpu_stat_show(struct seq_file *seq, void *v)
{
        cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
        return cgroup_extra_stat_show(seq, cpu_cgrp_id);
#else
        return 0;
#endif
}

/*
 * seq_file show handler for local cpu stats.  With CONFIG_CGROUP_SCHED the
 * cpu controller's local stats for the cgroup behind @seq are printed and
 * the callback's result is returned; otherwise nothing is printed and 0 is
 * returned.
 */
static int cpu_local_stat_show(struct seq_file *seq, void *v)
{
#ifdef CONFIG_CGROUP_SCHED
        return cgroup_local_stat_show(seq, seq_css(seq)->cgroup, cpu_cgrp_id);
#else
        return 0;
#endif
}

#ifdef CONFIG_PSI
/* seq_file show handler: print PSI_IO pressure of the cgroup behind @seq */
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
        return psi_show(seq, cgroup_psi(seq_css(seq)->cgroup), PSI_IO);
}
/* seq_file show handler: print PSI_MEM pressure of the cgroup behind @seq */
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
        return psi_show(seq, cgroup_psi(seq_css(seq)->cgroup), PSI_MEM);
}
/* seq_file show handler: print PSI_CPU pressure of the cgroup behind @seq */
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
        return psi_show(seq, cgroup_psi(seq_css(seq)->cgroup), PSI_CPU);
}

/*
 * Common write path of the per-resource pressure files: create a PSI
 * trigger for resource @res from the specification in @buf and attach it
 * to this open file.
 *
 * The cgroup is pinned with cgroup_get() so that the kernfs lock can be
 * dropped before the trigger is created; the pin is released on every
 * exit path.
 *
 * Returns @nbytes on success, -ENODEV if the cgroup can't be locked live,
 * -EBUSY if this open file already has a trigger, or the error encoded in
 * the pointer returned by psi_trigger_create().
 */
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
                              size_t nbytes, enum psi_res res)
{
        struct cgroup_file_ctx *ctx = of->priv;
        struct psi_trigger *trigger;
        struct cgroup *cgrp;
        ssize_t ret = nbytes;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENODEV;

        cgroup_get(cgrp);
        cgroup_kn_unlock(of->kn);

        /* Allow only one trigger per file descriptor */
        if (ctx->psi.trigger) {
                ret = -EBUSY;
                goto out_put;
        }

        trigger = psi_trigger_create(cgroup_psi(cgrp), buf, res, of->file, of);
        if (IS_ERR(trigger)) {
                ret = PTR_ERR(trigger);
                goto out_put;
        }

        /* publish the new trigger with release semantics */
        smp_store_release(&ctx->psi.trigger, trigger);
out_put:
        cgroup_put(cgrp);
        return ret;
}

/* Write handler: forwards to pressure_write() for PSI_IO; @off is unused. */
static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
                                          char *buf, size_t nbytes,
                                          loff_t off)
{
        return pressure_write(of, buf, nbytes, PSI_IO);
}

/* Write handler: forwards to pressure_write() for PSI_MEM; @off is unused. */
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
                                          char *buf, size_t nbytes,
                                          loff_t off)
{
        return pressure_write(of, buf, nbytes, PSI_MEM);
}

/* Write handler: forwards to pressure_write() for PSI_CPU; @off is unused. */
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
                                          char *buf, size_t nbytes,
                                          loff_t off)
{
        return pressure_write(of, buf, nbytes, PSI_CPU);
}

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/* seq_file show handler: print PSI_IRQ pressure of the cgroup behind @seq */
static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
{
        return psi_show(seq, cgroup_psi(seq_css(seq)->cgroup), PSI_IRQ);
}

/* Write handler: forwards to pressure_write() for PSI_IRQ; @off is unused. */
static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
                                         char *buf, size_t nbytes,
                                         loff_t off)
{
        return pressure_write(of, buf, nbytes, PSI_IRQ);
}
#endif

/*
 * seq_file show handler: print the ->enabled field of the psi group of the
 * cgroup behind @seq as a decimal number.
 */
static int cgroup_pressure_show(struct seq_file *seq, void *v)
{
        seq_printf(seq, "%d\n", cgroup_psi(seq_css(seq)->cgroup)->enabled);

        return 0;
}

/*
 * Write handler switching pressure accounting of a cgroup on (1) or off (0).
 *
 * When the requested state differs from the current one, all
 * NR_PSI_RESOURCES pressure files are shown or hidden to match,
 * ->enabled is updated and, when enabling, psi_cgroup_restart() is called.
 *
 * Returns @nbytes on success, the kstrtoint() error for unparsable input,
 * -ERANGE for values other than 0 and 1 and -ENOENT if the cgroup can't be
 * locked live.
 */
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
                                     char *buf, size_t nbytes,
                                     loff_t off)
{
        struct psi_group *psi;
        struct cgroup *cgrp;
        ssize_t err;
        int enable;
        int i;

        err = kstrtoint(strstrip(buf), 0, &enable);
        if (err)
                return err;

        if (enable < 0 || enable > 1)
                return -ERANGE;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENOENT;

        psi = cgroup_psi(cgrp);

        /* already in the requested state, nothing to change */
        if (psi->enabled == enable)
                goto out_unlock;

        /* show or hide {cpu,memory,io,irq}.pressure files */
        for (i = 0; i < NR_PSI_RESOURCES; i++)
                cgroup_file_show(&cgrp->psi_files[i], enable);

        psi->enabled = enable;
        if (enable)
                psi_cgroup_restart(psi);

out_unlock:
        cgroup_kn_unlock(of->kn);

        return nbytes;
}

/*
 * Poll handler for the pressure files: forwards to psi_trigger_poll() with
 * the trigger slot kept in this open file's cgroup_file_ctx.
 */
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
                                          poll_table *pt)
{
        struct cgroup_file_ctx *ctx = of->priv;

        return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}

/*
 * Release handler for the pressure files: hands the trigger kept in this
 * open file's cgroup_file_ctx to psi_trigger_destroy().
 */
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
        struct cgroup_file_ctx *ctx = of->priv;

        psi_trigger_destroy(ctx->psi.trigger);
}

/*
 * Report whether cgroup pressure accounting is enabled.  Returns false if
 * the psi_disabled static branch is set or if the OPT_FEATURE_PRESSURE bit
 * is set in cgroup_feature_disable_mask, true otherwise.
 */
bool cgroup_psi_enabled(void)
{
        if (static_branch_likely(&psi_disabled))
                return false;

        return !(cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE));
}

#else /* CONFIG_PSI */
/* !CONFIG_PSI variant: always reports pressure support as disabled. */
bool cgroup_psi_enabled(void)
{
        return false;
}

#endif /* CONFIG_PSI */

/*
 * seq_file show handler: print the cgroup's freezer.freeze value as a
 * decimal followed by a newline.  Always returns 0.
 */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
{
        seq_printf(seq, "%d\n", seq_css(seq)->cgroup->freezer.freeze);
        return 0;
}

/*
 * Write handler for the freeze control file.
 *
 * @buf is stripped and parsed with kstrtoint(); a parse failure is returned
 * as is and any value other than 0 or 1 yields -ERANGE.  -ENOENT is
 * returned when cgroup_kn_lock_live() does not hand back a cgroup.
 * Otherwise the parsed value is passed to cgroup_freeze() and @nbytes is
 * returned.
 */
static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
                                   char *buf, size_t nbytes, loff_t off)
{
        struct cgroup *cg;
        ssize_t err;
        int want;

        err = kstrtoint(strstrip(buf), 0, &want);
        if (err)
                return err;

        if (want < 0 || want > 1)
                return -ERANGE;

        cg = cgroup_kn_lock_live(of->kn, false);
        if (!cg)
                return -ENOENT;

        cgroup_freeze(cg, want);
        cgroup_kn_unlock(of->kn);

        return nbytes;
}

static void __cgroup_kill(struct cgroup *cgrp)
{
        struct css_task_iter it;
        struct task_struct *task;

        lockdep_assert_held(&cgroup_mutex);

        spin_lock_irq(&css_set_lock);
        set_bit(CGRP_KILL, &cgrp->flags);
        spin_unlock_irq(&css_set_lock);

        css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
        while ((task = css_task_iter_next(&it))) {
                /* Ignore kernel threads here. */
                if (task->flags & PF_KTHREAD)
                        continue;

                /* Skip tasks that are already dying. */
                if (__fatal_signal_pending(task))
                        continue;

                send_sig(SIGKILL, task, 0);
        }
        css_task_iter_end(&it);

        spin_lock_irq(&css_set_lock);
        clear_bit(CGRP_KILL, &cgrp->flags);
        spin_unlock_irq(&css_set_lock);
}

static void cgroup_kill(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *css;
        struct cgroup *dsct;

        lockdep_assert_held(&cgroup_mutex);

        cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
                __cgroup_kill(dsct);
}

/*
 * Write handler for the kill control file.
 *
 * Only the value 1 is accepted: a kstrtoint() failure is returned as is and
 * any other number yields -ERANGE.  -ENOENT is returned when
 * cgroup_kn_lock_live() does not hand back a cgroup, and -EOPNOTSUPP when
 * the cgroup is threaded.  Otherwise cgroup_kill() is invoked and @nbytes
 * is returned.
 */
static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
{
        struct cgroup *cgrp;
        ssize_t status;
        int val;

        status = kstrtoint(strstrip(buf), 0, &val);
        if (status)
                return status;

        if (val != 1)
                return -ERANGE;

        cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!cgrp)
                return -ENOENT;

        /*
         * Killing is a process directed operation, i.e. the whole
         * thread-group is taken down, so behave as cgroup.procs does and
         * only allow the write in non-threaded cgroups.
         */
        if (!cgroup_is_threaded(cgrp)) {
                cgroup_kill(cgrp);
                status = nbytes;
        } else {
                status = -EOPNOTSUPP;
        }

        cgroup_kn_unlock(of->kn);

        return status;
}

/*
 * kernfs ->open() for cgroup files.
 *
 * Allocates a zeroed cgroup_file_ctx, records the opener's cgroup namespace
 * (current->nsproxy->cgroup_ns) in it with a reference taken, and stores
 * the context in of->priv.  If the cftype provides ->open() it is called
 * next; should it fail, the namespace reference is dropped, the context is
 * freed and the error is returned.  Returns -ENOMEM when the allocation
 * fails and 0 on success.
 */
static int cgroup_file_open(struct kernfs_open_file *of)
{
        struct cftype *cft = of_cft(of);
        struct cgroup_file_ctx *ctx;
        int err = 0;

        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        ctx->ns = current->nsproxy->cgroup_ns;
        get_cgroup_ns(ctx->ns);
        of->priv = ctx;

        if (cft->open) {
                err = cft->open(of);
                if (err) {
                        put_cgroup_ns(ctx->ns);
                        kfree(ctx);
                }
        }

        return err;
}

/*
 * kernfs ->release() for cgroup files: give the cftype's ->release() hook
 * (if any) a chance to run first, then drop the cgroup namespace reference
 * held by the per-open context and free the context.
 */
static void cgroup_file_release(struct kernfs_open_file *of)
{
        struct cgroup_file_ctx *file_ctx = of->priv;
        struct cftype *cft = of_cft(of);

        if (cft->release)
                cft->release(of);

        put_cgroup_ns(file_ctx->ns);
        kfree(file_ctx);
}

/*
 * kernfs ->write() for cgroup files.
 *
 * A zero-length write returns 0 immediately.  After the namespace
 * delegation check (-EPERM), the write is dispatched to the first handler
 * the cftype provides, in the order ->write(), ->write_u64(),
 * ->write_s64(); the latter two get @buf parsed by kstrtoull()/kstrtoll().
 * Returns -EINVAL when no handler exists, a parse or handler error
 * otherwise, and @nbytes when a numeric handler returns 0.
 */
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
{
        struct cgroup_file_ctx *ctx = of->priv;
        struct cgroup *cgrp = of->kn->parent->priv;
        struct cftype *cft = of_cft(of);
        struct cgroup_subsys_state *css;
        int err;

        if (!nbytes)
                return 0;

        /*
         * If namespaces are delegation boundaries, disallow writes to
         * files in a non-init namespace root from inside the namespace
         * except for the files explicitly marked delegatable -
         * cgroup.procs and cgroup.subtree_control.
         */
        if (cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) {
                if (!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
                    ctx->ns != &init_cgroup_ns &&
                    ctx->ns->root_cset->dfl_cgrp == cgrp)
                        return -EPERM;
        }

        if (cft->write)
                return cft->write(of, buf, nbytes, off);

        /*
         * kernfs guarantees that a file isn't deleted with operations in
         * flight, which means that the matching css is and stays alive and
         * doesn't need to be pinned.  The RCU locking is not necessary
         * either.  It's just for the convenience of using cgroup_css().
         */
        rcu_read_lock();
        css = cgroup_css(cgrp, cft->ss);
        rcu_read_unlock();

        if (cft->write_u64) {
                unsigned long long uval;

                err = kstrtoull(buf, 0, &uval);
                if (err)
                        return err;
                err = cft->write_u64(css, cft, uval);
        } else if (cft->write_s64) {
                long long sval;

                err = kstrtoll(buf, 0, &sval);
                if (err)
                        return err;
                err = cft->write_s64(css, cft, sval);
        } else {
                return -EINVAL;
        }

        if (err)
                return err;
        return nbytes;
}

/*
 * kernfs ->poll() for cgroup files: use the cftype's own ->poll() when it
 * has one, otherwise fall back to kernfs_generic_poll().
 */
static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
{
        struct cftype *cft = of_cft(of);

        return cft->poll ? cft->poll(of, pt) : kernfs_generic_poll(of, pt);
}

/* seq_file ->start(): forward to the cftype's ->seq_start(), called unconditionally. */
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
        struct cftype *cft = seq_cft(seq);

        return cft->seq_start(seq, ppos);
}

/* seq_file ->next(): forward to the cftype's ->seq_next(), called unconditionally. */
static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
        struct cftype *cft = seq_cft(seq);

        return cft->seq_next(seq, v, ppos);
}

static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
        if (seq_cft(seq)->seq_stop)
                seq_cft(seq)->seq_stop(seq, v);
}

/*
 * seq_file ->show() for cgroup files.
 *
 * Prefers the cftype's ->seq_show() and returns whatever it returns.
 * Failing that, the value from ->read_u64() or ->read_s64() (checked in
 * that order) is printed on a line of its own and 0 is returned.  A cftype
 * with none of the three yields -EINVAL.
 */
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
        struct cftype *cft = seq_cft(m);
        struct cgroup_subsys_state *css = seq_css(m);

        if (cft->seq_show)
                return cft->seq_show(m, arg);

        if (cft->read_u64) {
                seq_printf(m, "%llu\n", cft->read_u64(css, cft));
                return 0;
        }

        if (cft->read_s64) {
                seq_printf(m, "%lld\n", cft->read_s64(css, cft));
                return 0;
        }

        return -EINVAL;
}

/*
 * kernfs operations for cgroup files whose cftype has no ->seq_start
 * (see the selection in cgroup_init_cftypes()): only ->seq_show is wired
 * up on the read side.
 */
static struct kernfs_ops cgroup_kf_single_ops = {
        /* open/close */
        .open = cgroup_file_open,
        .release = cgroup_file_release,

        /* write and poll */
        .atomic_write_len = PAGE_SIZE,
        .write = cgroup_file_write,
        .poll = cgroup_file_poll,

        /* read */
        .seq_show = cgroup_seqfile_show,
};

/*
 * kernfs operations for cgroup files whose cftype supplies ->seq_start
 * (see the selection in cgroup_init_cftypes()): the full seq_file
 * start/next/stop/show set is wired up on the read side.
 */
static struct kernfs_ops cgroup_kf_ops = {
        /* open/close */
        .open = cgroup_file_open,
        .release = cgroup_file_release,

        /* write and poll */
        .atomic_write_len = PAGE_SIZE,
        .write = cgroup_file_write,
        .poll = cgroup_file_poll,

        /* read */
        .seq_start = cgroup_seqfile_start,
        .seq_next = cgroup_seqfile_next,
        .seq_stop = cgroup_seqfile_stop,
        .seq_show = cgroup_seqfile_show,
};

/*
 * Timer callback: recover the cgroup_file that embeds @timer as its
 * notify_timer member and run cgroup_file_notify() on it.
 */
static void cgroup_file_notify_timer(struct timer_list *timer)
{
        struct cgroup_file *cfile;

        cfile = container_of(timer, struct cgroup_file, notify_timer);
        cgroup_file_notify(cfile);
}

/*
 * Create the kernfs file described by @cft under @cgrp's directory.
 *
 * The file is created with the current fsuid/fsgid as owner and @cft as its
 * private data.  When @cft->file_offset is non-zero, the cgroup_file found
 * at that byte offset inside @css gets its notify timer initialised and its
 * ->kn pointed at the new node (under cgroup_file_kn_lock).
 *
 * Returns 0 on success or the error encoded in the pointer returned by
 * __kernfs_create_file().
 */
static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
                           struct cftype *cft)
{
        char fname[CGROUP_FILE_NAME_MAX];
        struct lock_class_key *lock_key = NULL;
        struct cgroup_file *cfile;
        struct kernfs_node *node;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        lock_key = &cft->lockdep_key;
#endif
        node = __kernfs_create_file(cgrp->kn,
                                    cgroup_file_name(cgrp, cft, fname),
                                    cgroup_file_mode(cft),
                                    current_fsuid(), current_fsgid(),
                                    0, cft->kf_ops, cft,
                                    NULL, lock_key);
        if (IS_ERR(node))
                return PTR_ERR(node);

        /* nothing more to do for files without an associated cgroup_file */
        if (!cft->file_offset)
                return 0;

        cfile = (void *)css + cft->file_offset;

        timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);

        spin_lock_irq(&cgroup_file_kn_lock);
        cfile->kn = node;
        spin_unlock_irq(&cgroup_file_kn_lock);

        return 0;
}

/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
 * @css: the target css
 * @cgrp: the target cgroup (usually css->cgroup)
 * @cfts: array of cftypes to be added or removed
 * @is_add: whether to add or remove
 *
 * Walk @cfts up to its empty-name terminator and, depending on @is_add,
 * add or remove each file that applies to @cgrp.  Entries whose flags
 * exclude them from @cgrp are skipped.
 *
 * If an addition fails, a warning is printed and the files added so far
 * (the entries preceding the failing one) are removed again before the
 * error is returned.  For removals, this function never fails.
 */
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
                              struct cgroup *cgrp, struct cftype cfts[],
                              bool is_add)
{
        struct cftype *cft, *cft_end = NULL;
        int ret = 0;

        lockdep_assert_held(&cgroup_mutex);

restart:
        for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
                /* does cft->flags tell us to skip this file on @cgrp? */
                if (((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) ||
                    ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) ||
                    ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) ||
                    ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) ||
                    ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug))
                        continue;

                if (!is_add) {
                        cgroup_rm_file(cgrp, cft);
                        continue;
                }

                ret = cgroup_add_file(css, cgrp, cft);
                if (!ret)
                        continue;

                pr_warn("%s: failed to add %s, err=%d\n",
                        __func__, cft->name, ret);

                /* roll back: rerun as a removal bounded by the failing entry */
                cft_end = cft;
                is_add = false;
                goto restart;
        }
        return ret;
}

/*
 * Add (@is_add) or remove the files described by @cfts on every css
 * flagged CSS_VISIBLE in a pre-order walk starting from the css of
 * @cfts[0].ss on its root cgroup.  Must be called with cgroup_mutex held.
 *
 * The walk stops at the first failing cgroup and that error is returned.
 * After a fully successful addition, kernfs_activate() is called on the
 * root's kernfs node.  Returns 0 on success.
 */
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
        struct cgroup_subsys *ss = cfts[0].ss;
        struct cgroup *root = &ss->root->cgrp;
        struct cgroup_subsys_state *css;
        int err;

        lockdep_assert_held(&cgroup_mutex);

        /* add/rm files for all cgroups created before */
        css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
                if (!(css->flags & CSS_VISIBLE))
                        continue;

                err = cgroup_addrm_files(css, css->cgroup, cfts, is_add);
                if (err)
                        return err;
        }

        if (is_add)
                kernfs_activate(root->kn);
        return 0;
}

static void cgroup_exit_cftypes(struct cftype *cfts)
{
        struct cftype *cft;

        for (cft = cfts; cft->name[0] != '\0'; cft++) {
                /* free copy for custom atomic_write_len, see init_cftypes() */
                if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
                        kfree(cft->kf_ops);
                cft->kf_ops = NULL;
                cft->ss = NULL;

                /* revert flags set by cgroup core while adding @cfts */
                cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
                                __CFTYPE_ADDED);
        }
}

/*
 * Prepare every entry of @cfts for registration with @ss: pick the kernfs
 * ops table (iterator-capable when ->seq_start is set, single-show
 * otherwise), record @ss and mark the entry __CFTYPE_ADDED.
 *
 * Returns 0 on success, -EBUSY if an entry already carries __CFTYPE_ADDED,
 * or -ENOMEM if duplicating the ops table fails.  On failure
 * cgroup_exit_cftypes() is run on @cfts before returning.
 */
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
        struct cftype *cft;
        int err;

        for (cft = cfts; cft->name[0] != '\0'; cft++) {
                struct kernfs_ops *ops;

                WARN_ON(cft->ss || cft->kf_ops);

                if (cft->flags & __CFTYPE_ADDED) {
                        err = -EBUSY;
                        goto unwind;
                }

                ops = cft->seq_start ? &cgroup_kf_ops : &cgroup_kf_single_ops;

                /*
                 * Ugh... if @cft wants a custom max_write_len, we need to
                 * make a copy of kf_ops to set its atomic_write_len.
                 */
                if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
                        ops = kmemdup(ops, sizeof(*ops), GFP_KERNEL);
                        if (!ops) {
                                err = -ENOMEM;
                                goto unwind;
                        }
                        ops->atomic_write_len = cft->max_write_len;
                }

                cft->kf_ops = ops;
                cft->ss = ss;
                cft->flags |= __CFTYPE_ADDED;
        }

        return 0;

unwind:
        /*
         * NOTE(review): this resets the whole array from its first entry,
         * i.e. also the entry that tripped -EBUSY and any entries after
         * the failing one - confirm that is intended when @cfts is
         * already registered elsewhere.
         */
        cgroup_exit_cftypes(cfts);
        return err;
}

/*
 * Unregister @cfts with cgroup_mutex held (asserted via lockdep): unlink
 * the array's list node, remove its files through
 * cgroup_apply_cftypes(@cfts, false), then reset the per-entry state with
 * cgroup_exit_cftypes().
 */
static void cgroup_rm_cftypes_locked(struct cftype *cfts)
{
        lockdep_assert_held(&cgroup_mutex);

        list_del(&cfts->node);
        cgroup_apply_cftypes(cfts, false);
        cgroup_exit_cftypes(cfts);
}

/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Unregister @cfts.  Files described by @cfts are removed from all
 * existing cgroups and all future cgroups won't have them either.  This
 * function can be called anytime whether @cfts' subsys is attached or not.
 *
 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
 * registered.
 */
int cgroup_rm_cftypes(struct cftype *cfts)
{
        if (!cfts || cfts[0].name[0] == '\0')
                return 0;

        if (!(cfts[0].flags & __CFTYPE_ADDED))
                return -ENOENT;

        cgroup_lock();
        cgroup_rm_cftypes_locked(cfts);
        cgroup_unlock();
        return 0;
}

/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Nothing is done, and 0 is returned, when @ss is not enabled or @cfts is
 * %NULL or empty.
 *
 * Returns 0 on successful registration, -errno on failure.  If creating
 * the files on existing cgroups fails, @cfts is unregistered again before
 * the error is returned.
 */
static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
        int err;

        if (!cgroup_ssid_enabled(ss->id) ||
            !cfts || cfts[0].name[0] == '\0')
                return 0;

        err = cgroup_init_cftypes(ss, cfts);
        if (err)
                return err;

        cgroup_lock();

        list_add_tail(&cfts->node, &ss->cfts);

        err = cgroup_apply_cftypes(cfts, true);
        if (err)
                cgroup_rm_cftypes_locked(cfts);

        cgroup_unlock();

        return err;
}

/**
 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Tag every entry of @cfts (if non-%NULL) with __CFTYPE_ONLY_ON_DFL and
 * then register the array through cgroup_add_cftypes(), so that the added
 * files are only used for the default hierarchy.
 *
 * Returns the result of cgroup_add_cftypes().
 */
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
        struct cftype *entry = cfts;

        while (entry && entry->name[0] != '\0') {
                entry->flags |= __CFTYPE_ONLY_ON_DFL;
                entry++;
        }

        return cgroup_add_cftypes(ss, cfts);
}

/**
 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Tag every entry of @cfts (if non-%NULL) with __CFTYPE_NOT_ON_DFL and
 * then register the array through cgroup_add_cftypes(), so that the added
 * files are only used for the legacy hierarchies.
 *
 * Returns the result of cgroup_add_cftypes().
 */
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
        struct cftype *entry = cfts;

        while (entry && entry->name[0] != '\0') {
                entry->flags |= __CFTYPE_NOT_ON_DFL;
                entry++;
        }

        return cgroup_add_cftypes(ss, cfts);
}

/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file
 * @cfile: target cgroup_file
 *
 * @cfile must have been obtained by setting cftype->file_offset.
 *
 * Does nothing when @cfile has no kernfs node attached.  Otherwise, if
 * jiffies still lies within CGROUP_FILE_NOTIFY_MIN_INTV of the previous
 * notification, the notify timer is pulled in to the end of that interval
 * via timer_reduce(); else kernfs_notify() is called right away and the
 * notification time is recorded.  Runs under cgroup_file_kn_lock with
 * interrupts saved and restored.
 */
void cgroup_file_notify(struct cgroup_file *cfile)
{
        unsigned long irq_flags;
        unsigned long prev, deadline;

        spin_lock_irqsave(&cgroup_file_kn_lock, irq_flags);

        if (!cfile->kn)
                goto unlock;

        prev = cfile->notified_at;
        deadline = prev + CGROUP_FILE_NOTIFY_MIN_INTV;

        if (!time_in_range(jiffies, prev, deadline)) {
                kernfs_notify(cfile->kn);
                cfile->notified_at = jiffies;
        } else {
                timer_reduce(&cfile->notify_timer, deadline);
        }

unlock:
        spin_unlock_irqrestore(&cgroup_file_kn_lock, irq_flags);
}

/**
 * cgroup_file_show - show or hide a hidden cgroup file
 * @cfile: target cgroup_file obtained by setting cftype->file_offset
 * @show: whether to show or hide
 *
 * The kernfs node is sampled and pinned under cgroup_file_kn_lock, so
 * kernfs_show() itself runs without the spinlock held.  Nothing is shown
 * or hidden when @cfile has no node attached.
 */
void cgroup_file_show(struct cgroup_file *cfile, bool show)
{
        struct kernfs_node *node;

        spin_lock_irq(&cgroup_file_kn_lock);
        node = cfile->kn;
        kernfs_get(node);
        spin_unlock_irq(&cgroup_file_kn_lock);

        if (node)
                kernfs_show(node, show);

        /* drop the reference taken above */
        kernfs_put(node);
}

/**
 * css_next_child - find the next child of a given css
 * @pos: the current position (%NULL to initiate traversal)
 * @parent: css whose children to walk
 *
 * This function returns the next child of @parent and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent and @pos are accessible.  The next sibling is guaranteed to
 * be returned regardless of their states.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Return: the next child, or %NULL once the walk has wrapped around to the
 * head of @parent's children list.
 */
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
                                           struct cgroup_subsys_state *parent)
{
        struct cgroup_subsys_state *next;

        cgroup_assert_mutex_or_rcu_locked();

        /*
         * @pos could already have been unlinked from the sibling list.
         * Once a cgroup is removed, its ->sibling.next is no longer
         * updated when its next sibling changes.  CSS_RELEASED is set when
         * @pos is taken off list, at which time its next pointer is valid,
         * and, as releases are serialized, the one pointed to by the next
         * pointer is guaranteed to not have started release yet.  This
         * implies that if we observe !CSS_RELEASED on @pos in this RCU
         * critical section, the one pointed to by its next pointer is
         * guaranteed to not have finished its RCU grace period even if we
         * have dropped rcu_read_lock() in-between iterations.
         *
         * If @pos has CSS_RELEASED set, its next pointer can't be
         * dereferenced; however, as each css is given a monotonically
         * increasing unique serial number and always appended to the
         * sibling list, the next one can be found by walking the parent's
         * children until the first css with higher serial number than
         * @pos's.  While this path can be slower, it happens iff iteration
         * races against release and the race window is very small.
         */
        if (!pos) {
                /* start of traversal: take the first entry of the list */
                next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
        } else if (likely(!(pos->flags & CSS_RELEASED))) {
                /* common case: @pos is still linked, follow its next pointer */
                next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
        } else {
                /* @pos was released: rescan for the first higher serial number */
                list_for_each_entry_rcu(next, &parent->children, sibling,
                                        lockdep_is_held(&cgroup_mutex))
                        if (next->serial_nr > pos->serial_nr)
                                break;
        }

        /*
         * @next, if not pointing to the head, can be dereferenced and is
         * the next sibling.
         */
        if (&next->sibling != &parent->children)
                return next;
        return NULL;
}

/**
 * css_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_pre().  Find the next descendant
 * to visit for pre-order traversal of @root's descendants.  @root is
 * included in the iteration and the first node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of @root.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Return: the next css in pre-order, or %NULL when the walk is complete.
 */
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
                        struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *candidate;

        cgroup_assert_mutex_or_rcu_locked();

        /* if first iteration, visit @root */
        if (!pos)
                return root;

        /* visit the first child if exists */
        candidate = css_next_child(NULL, pos);
        if (candidate)
                return candidate;

        /* no child, visit my or the closest ancestor's next sibling */
        for (; pos != root; pos = pos->parent) {
                candidate = css_next_child(pos, pos->parent);
                if (candidate)
                        return candidate;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(css_next_descendant_pre);

/**
 * css_rightmost_descendant - return the rightmost descendant of a css
 * @pos: css of interest
 *
 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
 * is returned.  This can be used during pre-order traversal to skip
 * subtree of @pos.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct rightmost descendant as
 * long as @pos is accessible.
 */
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
        struct cgroup_subsys_state *last, *tmp;

        cgroup_assert_mutex_or_rcu_locked();

        do {
                last = pos;
                /* ->prev isn't RCU safe, walk ->next till the end */
                pos = NULL;
                css_for_each_child(tmp, last)
                        pos = tmp;
        } while (pos);

        return last;
}

/*
 * Descend from @pos by repeatedly taking the first child until a css
 * without children is reached, and return that css (@pos itself when it
 * has no children).
 */
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
{
        struct cgroup_subsys_state *first_child;

        while ((first_child = css_next_child(NULL, pos)))
                pos = first_child;

        return pos;
}

/**
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_post().  Find the next descendant
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of
 * @root.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Return: the next css in post-order, or %NULL once @root has been visited.
 */
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
                         struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *sibling;

        cgroup_assert_mutex_or_rcu_locked();

        /* if first iteration, visit leftmost descendant which may be @root */
        if (!pos)
                return css_leftmost_descendant(root);

        /* if we visited @root, we're done */
        if (pos == root)
                return NULL;

        /*
         * An unvisited sibling means its leftmost descendant comes next;
         * with no sibling left, it is the parent's turn.
         */
        sibling = css_next_child(pos, pos->parent);

        return sibling ? css_leftmost_descendant(sibling) : pos->parent;
}

/**
 * css_has_online_children - does a css have online children
 * @css: the target css
 *
 * Returns %true if @css has any online children; otherwise, %false.  This
 * function can be called from any context but the caller is responsible
 * for synchronizing against on/offlining as necessary.
 *
 * The children are scanned under rcu_read_lock() and the scan stops at the
 * first child with CSS_ONLINE set.
 */
bool css_has_online_children(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys_state *pos;
        bool found = false;

        rcu_read_lock();
        css_for_each_child(pos, css) {
                if (!(pos->flags & CSS_ONLINE))
                        continue;

                found = true;
                break;
        }
        rcu_read_unlock();

        return found;
}

/*
 * Return the next css_set @it should walk, or NULL when the cset list is
 * exhausted (in which case it->cset_pos is reset to NULL).  Must be called
 * with css_set_lock held.
 *
 * While a threaded-cset walk is in progress (it->tcset_pos set), the next
 * entry of that list is returned first.  Otherwise it->cset_pos is advanced
 * by one; the css_set is reached through e_cset_node[] when the iterator
 * is bound to a subsystem (it->ss) and through a cgrp_cset_link otherwise.
 * With CSS_TASK_ITER_THREADED, the new css_set is pinned as it->cur_dcset
 * (the previous one, if any, is put) and the threaded-cset cursor is primed
 * with its threaded_csets list.
 */
static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
{
        struct list_head *l;
        struct cgrp_cset_link *link;
        struct css_set *cset;

        lockdep_assert_held(&css_set_lock);

        /* find the next threaded cset */
        if (it->tcset_pos) {
                l = it->tcset_pos->next;

                if (l != it->tcset_head) {
                        it->tcset_pos = l;
                        return container_of(l, struct css_set,
                                            threaded_csets_node);
                }

                /* threaded list exhausted, fall through to the next cset */
                it->tcset_pos = NULL;
        }

        /* find the next cset */
        l = it->cset_pos;
        l = l->next;
        if (l == it->cset_head) {
                it->cset_pos = NULL;
                return NULL;
        }

        if (it->ss) {
                cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
        } else {
                link = list_entry(l, struct cgrp_cset_link, cset_link);
                cset = link->cset;
        }

        it->cset_pos = l;

        /* initialize threaded css_set walking */
        if (it->flags & CSS_TASK_ITER_THREADED) {
                if (it->cur_dcset)
                        put_css_set_locked(it->cur_dcset);
                it->cur_dcset = cset;
                get_css_set(cset);

                it->tcset_head = &cset->threaded_csets;
                it->tcset_pos = &cset->threaded_csets;
        }

        return cset;
}

/**
 * css_task_iter_advance_css_set - advance a task iterator to the next css_set
 * @it: the iterator to advance
 *
 * Advance @it to the next css_set to walk.  Must be called with
 * css_set_lock held.
 *
 * css_sets whose tasks, mg_tasks and dying_tasks lists are all empty are
 * skipped.  If none is left, it->task_pos is set to %NULL.  Otherwise
 * it->task_pos points at the first entry of the first non-empty list, and
 * @it is moved from its previous css_set's task_iters list (dropping that
 * reference) onto the new css_set's, with a reference taken.
 */
static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
        struct css_set *cset;

        lockdep_assert_held(&css_set_lock);

        /* Advance to the next non-empty css_set and find first non-empty tasks list */
        while ((cset = css_task_iter_next_css_set(it))) {
                if (!list_empty(&cset->tasks)) {
                        it->cur_tasks_head = &cset->tasks;
                        break;
                } else if (!list_empty(&cset->mg_tasks)) {
                        it->cur_tasks_head = &cset->mg_tasks;
                        break;
                } else if (!list_empty(&cset->dying_tasks)) {
                        it->cur_tasks_head = &cset->dying_tasks;
                        break;
                }
        }
        if (!cset) {
                /* ran out of css_sets: mark the iteration as finished */
                it->task_pos = NULL;
                return;
        }
        it->task_pos = it->cur_tasks_head->next;

        /*
         * We don't keep css_sets locked across iteration steps and thus
         * need to take steps to ensure that iteration can be resumed after
         * the lock is re-acquired.  Iteration is performed at two levels -
         * css_sets and tasks in them.
         *
         * Once created, a css_set never leaves its cgroup lists, so a
         * pinned css_set is guaranteed to stay put and we can resume
         * iteration afterwards.
         *
         * Tasks may leave @cset across iteration steps.  This is resolved
         * by registering each iterator with the css_set currently being
         * walked and making css_set_move_task() advance iterators whose
         * next task is leaving.
         */
        if (it->cur_cset) {
                list_del(&it->iters_node);
                put_css_set_locked(it->cur_cset);
        }
        get_css_set(cset);
        it->cur_cset = cset;
        list_add(&it->iters_node, &cset->task_iters);
}

/*
 * If @it is currently positioned on @task, step it->task_pos to the next
 * list entry and set CSS_TASK_ITER_SKIPPED in it->flags.  Must be called
 * with css_set_lock held; does nothing for any other position.
 */
static void css_task_iter_skip(struct css_task_iter *it,
                               struct task_struct *task)
{
        lockdep_assert_held(&css_set_lock);

        if (it->task_pos != &task->cg_list)
                return;

        it->task_pos = it->task_pos->next;
        it->flags |= CSS_TASK_ITER_SKIPPED;
}

/*
 * Move @it forward to the next task it should report, or leave
 * it->task_pos NULL when there is nothing further to visit.  The caller
 * must hold css_set_lock.
 *
 * Within the current css_set the tasks, mg_tasks and dying_tasks lists are
 * walked in that order; once all three are consumed,
 * css_task_iter_advance_css_set() is called to pick the next css_set.
 * Entries excluded by the iterator's flags are passed over by jumping back
 * to "repeat".
 */
static void css_task_iter_advance(struct css_task_iter *it)
{
        struct task_struct *task;

        lockdep_assert_held(&css_set_lock);
repeat:
        if (it->task_pos) {
                /*
                 * Advance iterator to find next entry. We go through cset
                 * tasks, mg_tasks and dying_tasks, when consumed we move onto
                 * the next cset.
                 */
                /*
                 * css_task_iter_skip() has already moved task_pos forward
                 * when SKIPPED is set, so only consume the flag then.
                 */
                if (it->flags & CSS_TASK_ITER_SKIPPED)
                        it->flags &= ~CSS_TASK_ITER_SKIPPED;
                else
                        it->task_pos = it->task_pos->next;

                /*
                 * Landing on a list head means that list is exhausted.  The
                 * checks are sequential rather than else-if so that an
                 * empty follow-up list cascades straight to the next one.
                 */
                if (it->task_pos == &it->cur_cset->tasks) {
                        it->cur_tasks_head = &it->cur_cset->mg_tasks;
                        it->task_pos = it->cur_tasks_head->next;
                }
                if (it->task_pos == &it->cur_cset->mg_tasks) {
                        it->cur_tasks_head = &it->cur_cset->dying_tasks;
                        it->task_pos = it->cur_tasks_head->next;
                }
                if (it->task_pos == &it->cur_cset->dying_tasks)
                        css_task_iter_advance_css_set(it);
        } else {
                /* called from start, proceed to the first cset */
                css_task_iter_advance_css_set(it);
        }

        /* no more css_sets with tasks: iteration is finished */
        if (!it->task_pos)
                return;

        task = list_entry(it->task_pos, struct task_struct, cg_list);

        if (it->flags & CSS_TASK_ITER_PROCS) {
                /* if PROCS, skip over tasks which aren't group leaders */
                if (!thread_group_leader(task))
                        goto repeat;

                /* and dying leaders w/o live member threads */
                if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
                    !atomic_read(&task->signal->live))
                        goto repeat;
        } else {
                /* skip all dying ones */
                if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
                        goto repeat;
        }
}

/**
 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
 * @flags: CSS_TASK_ITER_* flags
 * @it: the task iterator to use
 *
 * Reset @it and position it on the first task of @css.  After this call,
 * css_task_iter_next() may be invoked repeatedly until it returns NULL.
 * Every started iteration must be finished with css_task_iter_end().
 */
void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it)
{
        unsigned long irq_state;

        memset(it, 0, sizeof(*it));

        spin_lock_irqsave(&css_set_lock, irq_state);

        it->flags = flags;
        it->ss = css->ss;

        /*
         * Walk the per-subsystem css_set list when a subsystem is attached
         * to @css, otherwise the cgroup's own cset_links list.
         */
        it->cset_pos = (CGROUP_HAS_SUBSYS_CONFIG && it->ss) ?
                &css->cgroup->e_csets[css->ss->id] :
                &css->cgroup->cset_links;

        /* remember where the walk started */
        it->cset_head = it->cset_pos;

        /* move onto the first reportable task */
        css_task_iter_advance(it);

        spin_unlock_irqrestore(&css_set_lock, irq_state);
}

/**
 * css_task_iter_next - return the next task for the iterator
 * @it: the task iterator being iterated
 *
 * Step @it, which must have been set up by css_task_iter_start(), and hand
 * back the task it was positioned on.  The reference taken on the
 * previously returned task is dropped here, and a new one is taken on the
 * task being returned.  Returns NULL once there are no more tasks.
 */
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
        unsigned long irq_state;
        struct task_struct *task;

        /* release the task handed out by the previous call */
        if (it->cur_task) {
                put_task_struct(it->cur_task);
                it->cur_task = NULL;
        }

        spin_lock_irqsave(&css_set_lock, irq_state);

        /* a skip may have left @it half-advanced; complete that step first */
        if (it->flags & CSS_TASK_ITER_SKIPPED)
                css_task_iter_advance(it);

        if (it->task_pos) {
                task = list_entry(it->task_pos, struct task_struct, cg_list);
                it->cur_task = task;
                get_task_struct(task);
                /* pre-position the iterator for the following call */
                css_task_iter_advance(it);
        }

        spin_unlock_irqrestore(&css_set_lock, irq_state);

        return it->cur_task;
}

/**
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start().
 */
void css_task_iter_end(struct css_task_iter *it)
{
        unsigned long irqflags;

        if (it->cur_cset) {
                spin_lock_irqsave(&css_set_lock, irqflags);
                list_del(&it->iters_node);
                put_css_set_locked(it->cur_cset);
                spin_unlock_irqrestore(&css_set_lock, irqflags);
        }

        if (it->cur_dcset)
                put_css_set(it->cur_dcset);

        if (it->cur_task)
                put_task_struct(it->cur_task);
}

/*
 * ->release handler shared by cgroup.procs and cgroup.threads: finish the
 * task iterator kept in the open file's context if one was ever started.
 */
static void cgroup_procs_release(struct kernfs_open_file *of)
{
        struct cgroup_file_ctx *ctx = of->priv;

        /* the file was never read, so there is no iterator to end */
        if (!ctx->procs.started)
                return;

        css_task_iter_end(&ctx->procs.iter);
}

/*
 * seq_file ->next handler: bump *@pos when a position pointer is supplied
 * and return the next task from the iterator stored in the open file's
 * context (NULL when the iterator is exhausted).  @v is unused.
 */
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct kernfs_open_file *of = s->private;
        struct cgroup_file_ctx *ctx = of->priv;
        struct css_task_iter *it = &ctx->procs.iter;

        /* __cgroup_procs_start() calls us with a NULL @pos */
        if (pos != NULL)
                ++*pos;

        return css_task_iter_next(it);
}

/*
 * Common seq_file ->start implementation for cgroup.procs and
 * cgroup.threads.  @iter_flags is passed to css_task_iter_start().
 *
 * Returns the task to show first, NULL if there is none, or
 * ERR_PTR(-EINVAL) if the very first read starts at a non-zero position.
 */
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
                                  unsigned int iter_flags)
{
        struct kernfs_open_file *of = s->private;
        struct cgroup *cgrp = seq_css(s)->cgroup;
        struct cgroup_file_ctx *ctx = of->priv;
        struct css_task_iter *it = &ctx->procs.iter;

        /*
         * A seeked seq_file is always re-traversed sequentially from
         * position 0, so for a non-zero *pos we just carry on from where
         * the iterator already is.
         */
        if (ctx->procs.started) {
                /* continuing: hand back the task we stopped on */
                if (*pos)
                        return it->cur_task;

                /* rewound to the beginning: restart the iterator */
                css_task_iter_end(it);
                css_task_iter_start(&cgrp->self, iter_flags, it);
        } else {
                /* first use must begin at position 0 */
                if (WARN_ON_ONCE((*pos)))
                        return ERR_PTR(-EINVAL);

                css_task_iter_start(&cgrp->self, iter_flags, it);
                ctx->procs.started = true;
        }

        return cgroup_procs_next(s, NULL, NULL);
}

/*
 * seq_file ->start handler for cgroup.procs.  Returns
 * ERR_PTR(-EOPNOTSUPP) for threaded cgroups, otherwise whatever
 * __cgroup_procs_start() returns for a PROCS | THREADED iteration.
 */
static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
{
        const unsigned int iter_flags = CSS_TASK_ITER_PROCS |
                                        CSS_TASK_ITER_THREADED;
        struct cgroup *cgrp = seq_css(s)->cgroup;

        /*
         * Processes of a threaded subtree all belong to the subtree's
         * domain cgroup; only threads may be spread across the subtree.
         * cgroup.procs in the subtree proper would always read empty, so
         * refuse the read outright.
         */
        if (cgroup_is_threaded(cgrp))
                return ERR_PTR(-EOPNOTSUPP);

        return __cgroup_procs_start(s, pos, iter_flags);
}

/*
 * seq_file ->show handler: print the task_pid_vnr() of the task passed in
 * @v on a line of its own.  Always returns 0.
 */
static int cgroup_procs_show(struct seq_file *s, void *v)
{
        struct task_struct *task = v;

        seq_printf(s, "%d\n", task_pid_vnr(task));

        return 0;
}

static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
{
        int ret;
        struct inode *inode;

        lockdep_assert_held(&cgroup_mutex);

        inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
        if (!inode)
                return -ENOMEM;

        ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
        iput(inode);
        return ret;
}

/*
 * Decide whether a migration from @src_cgrp to @dst_cgrp is permitted.
 * Must be called with cgroup_mutex held.
 *
 * Returns 0 if allowed, the cgroup_may_write() error for the common
 * ancestor, or -ENOENT when namespace delegation is enabled and either
 * cgroup lies outside @ns.
 */
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
                                         struct cgroup *dst_cgrp,
                                         struct super_block *sb,
                                         struct cgroup_namespace *ns)
{
        struct cgroup *ancestor;
        int err;

        lockdep_assert_held(&cgroup_mutex);

        /* climb from the source until we reach an ancestor of the destination */
        for (ancestor = src_cgrp;
             !cgroup_is_descendant(dst_cgrp, ancestor);
             ancestor = cgroup_parent(ancestor))
                ;

        /* %current needs write access at the common ancestor */
        err = cgroup_may_write(ancestor, sb);
        if (err)
                return err;

        /* namespaces only act as delegation boundaries when so configured */
        if (!(cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE))
                return 0;

        /* both ends must be visible from %current's cgroup namespace */
        if (cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) &&
            cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))
                return 0;

        return -ENOENT;
}

/*
 * Run the checks required before attaching to @dst_cgrp: write
 * permission, cgroup_migrate_vet_dst(), and - for single-thread moves -
 * that source and destination share the same dom_cgrp.
 *
 * Returns 0 when all checks pass, the first failing check's error, or
 * -EOPNOTSUPP for a thread move across different dom_cgrp's.
 */
static int cgroup_attach_permissions(struct cgroup *src_cgrp,
                                     struct cgroup *dst_cgrp,
                                     struct super_block *sb, bool threadgroup,
                                     struct cgroup_namespace *ns)
{
        int err;

        err = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
        if (!err)
                err = cgroup_migrate_vet_dst(dst_cgrp);
        if (err)
                return err;

        /* whole-process moves are not tied to a single domain */
        if (threadgroup || src_cgrp->dom_cgrp == dst_cgrp->dom_cgrp)
                return 0;

        return -EOPNOTSUPP;
}

/*
 * Common write implementation for cgroup.procs and cgroup.threads.
 * @of: the open kernfs file being written
 * @buf: user-supplied buffer, handed to cgroup_procs_write_start()
 * @threadgroup: true to operate on the whole thread group, false for a
 *               single thread
 *
 * Returns 0 on success, -ENODEV if cgroup_kn_lock_live() returns NULL,
 * otherwise the error from whichever later step failed.
 */
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
                                    bool threadgroup)
{
        struct cgroup_file_ctx *ctx = of->priv;
        struct cgroup *src_cgrp, *dst_cgrp;
        struct task_struct *task;
        const struct cred *saved_cred;
        ssize_t ret;
        bool threadgroup_locked;

        /* the file being written identifies the destination cgroup */
        dst_cgrp = cgroup_kn_lock_live(of->kn, false);
        if (!dst_cgrp)
                return -ENODEV;

        task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
        ret = PTR_ERR_OR_ZERO(task);
        if (ret)
                goto out_unlock;

        /* find the source cgroup */
        spin_lock_irq(&css_set_lock);
        src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
        spin_unlock_irq(&css_set_lock);

        /*
         * Process and thread migrations follow same delegation rule. Check
         * permissions using the credentials from file open to protect against
         * inherited fd attacks.
         */
        saved_cred = override_creds(of->file->f_cred);
        ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
                                        of->file->f_path.dentry->d_sb,
                                        threadgroup, ctx->ns);
        /* restore the caller's credentials before acting on the result */
        revert_creds(saved_cred);
        if (ret)
                goto out_finish;

        ret = cgroup_attach_task(dst_cgrp, task, threadgroup);

        /* labels unwind in reverse order of acquisition */
out_finish:
        cgroup_procs_write_finish(task, threadgroup_locked);
out_unlock:
        cgroup_kn_unlock(of->kn);

        return ret;
}

/*
 * ->write handler for cgroup.procs: perform a thread-group migration and
 * return @nbytes on success or the error from __cgroup_procs_write().
 * @off is unused.
 */
static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
                                  char *buf, size_t nbytes, loff_t off)
{
        ssize_t err = __cgroup_procs_write(of, buf, true);

        return err ? err : nbytes;
}

/*
 * seq_file ->start handler for cgroup.threads: same as
 * __cgroup_procs_start() with no iterator flags set.
 */
static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
{
        const unsigned int iter_flags = 0;

        return __cgroup_procs_start(s, pos, iter_flags);
}

/*
 * ->write handler for cgroup.threads: perform a single-thread migration
 * and return @nbytes on success or the error from __cgroup_procs_write().
 * @off is unused.
 */
static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        ssize_t err = __cgroup_procs_write(of, buf, false);

        return err ? err : nbytes;
}

/*
 * cgroup core interface files for the default hierarchy
 *
 * Each entry names one file and wires up its handlers.  The array is
 * terminated by an empty entry.
 */
static struct cftype cgroup_base_files[] = {
        {
                .name = "cgroup.type",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_type_show,
                .write = cgroup_type_write,
        },
        /*
         * cgroup.procs and cgroup.threads share their release, seq_next
         * and seq_show handlers; they differ in seq_start and write.
         */
        {
                .name = "cgroup.procs",
                .flags = CFTYPE_NS_DELEGATABLE,
                /* cgrp->procs_file is what cgroup_may_write() looks up */
                .file_offset = offsetof(struct cgroup, procs_file),
                .release = cgroup_procs_release,
                .seq_start = cgroup_procs_start,
                .seq_next = cgroup_procs_next,
                .seq_show = cgroup_procs_show,
                .write = cgroup_procs_write,
        },
        {
                .name = "cgroup.threads",
                .flags = CFTYPE_NS_DELEGATABLE,
                .release = cgroup_procs_release,
                .seq_start = cgroup_threads_start,
                .seq_next = cgroup_procs_next,
                .seq_show = cgroup_procs_show,
                .write = cgroup_threads_write,
        },
        /* no .write handler is set for this file */
        {
                .name = "cgroup.controllers",
                .seq_show = cgroup_controllers_show,
        },
        {
                .name = "cgroup.subtree_control",
                .flags = CFTYPE_NS_DELEGATABLE,
                .seq_show = cgroup_subtree_control_show,
                .write = cgroup_subtree_control_write,
        },
        {
                .name = "cgroup.events",
                .flags = CFTYPE_NOT_ON_ROOT,
                .file_offset = offsetof(struct cgroup, events_file),
                .seq_show = cgroup_events_show,
        },
        {
                .name = "cgroup.max.descendants",
                .seq_show = cgroup_max_descendants_show,
                .write = cgroup_max_descendants_write,
        },
        {
                .name = "cgroup.max.depth",
                .seq_show = cgroup_max_depth_show,
                .write = cgroup_max_depth_write,
        },
        {
                .name = "cgroup.stat",
                .seq_show = cgroup_stat_show,
        },
        {
                .name = "cgroup.freeze",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = cgroup_freeze_show,
                .write = cgroup_freeze_write,
        },
        /* no .seq_show handler is set for this file */
        {
                .name = "cgroup.kill",
                .flags = CFTYPE_NOT_ON_ROOT,
                .write = cgroup_kill_write,
        },
        {
                .name = "cpu.stat",
                .seq_show = cpu_stat_show,
        },
        {
                .name = "cpu.stat.local",
                .seq_show = cpu_local_stat_show,
        },
        { }        /* terminate */
};

/*
 * Pressure interface files.  All entries are compiled in only with
 * CONFIG_PSI; without it the array holds just the terminating empty
 * entry.  The per-resource files share the same poll and release handlers
 * and each records its slot in cgrp->psi_files[] through .file_offset.
 */
static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
        {
                .name = "io.pressure",
                .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
                .seq_show = cgroup_io_pressure_show,
                .write = cgroup_io_pressure_write,
                .poll = cgroup_pressure_poll,
                .release = cgroup_pressure_release,
        },
        {
                .name = "memory.pressure",
                .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
                .seq_show = cgroup_memory_pressure_show,
                .write = cgroup_memory_pressure_write,
                .poll = cgroup_pressure_poll,
                .release = cgroup_pressure_release,
        },
        {
                .name = "cpu.pressure",
                .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
                .seq_show = cgroup_cpu_pressure_show,
                .write = cgroup_cpu_pressure_write,
                .poll = cgroup_pressure_poll,
                .release = cgroup_pressure_release,
        },
        /* irq.pressure additionally requires CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
        {
                .name = "irq.pressure",
                .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
                .seq_show = cgroup_irq_pressure_show,
                .write = cgroup_irq_pressure_write,
                .poll = cgroup_pressure_poll,
                .release = cgroup_pressure_release,
        },
#endif
        /* unlike the entries above, no poll/release/file_offset here */
        {
                .name = "cgroup.pressure",
                .seq_show = cgroup_pressure_show,
                .write = cgroup_pressure_write,
        },
#endif /* CONFIG_PSI */
        { }        /* terminate */
};

/*
 * css destruction is a four-stage process.
 *
 * 1. Destruction starts.  Killing of the percpu_ref is initiated.
 *    Implemented in kill_css().
 *
 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
 *    and thus css_tryget_online() is guaranteed to fail, the css can be
 *    offlined by invoking offline_css().  After offlining, the base ref is
 *    put.  Implemented in css_killed_work_fn().
 *
 * 3. When the percpu_ref reaches zero, the only possible remaining
 *    accessors are inside RCU read sections.  css_release() schedules the
 *    RCU callback.
 *
 * 4. After the grace period, the css can be freed.  Implemented in
 *    css_free_rwork_fn().
 *
 * It is actually hairier because both steps 2 and 4 require process
 * context and thus involve punting to css->destroy_work, adding two
 * additional steps to the already complex sequence.
 */
/*
 * RCU work function implementing stage 4.  A css with a subsystem takes
 * the "css free path"; a css without one is a cgroup's own css and takes
 * the "cgroup free path".
 */
static void css_free_rwork_fn(struct work_struct *work)
{
        struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
                                struct cgroup_subsys_state, destroy_rwork);
        struct cgroup_subsys *ss = css->ss;
        struct cgroup *cgrp = css->cgroup;

        percpu_ref_exit(&css->refcnt);

        if (ss) {
                /* css free path */
                /*
                 * parent and id are read before ->css_free() is invoked -
                 * presumably because @css must not be touched afterwards.
                 */
                struct cgroup_subsys_state *parent = css->parent;
                int id = css->id;

                ss->css_free(css);
                cgroup_idr_remove(&ss->css_idr, id);
                /* pairs with cgroup_get_live() in init_and_link_css() */
                cgroup_put(cgrp);

                /* pairs with css_get(css->parent) in init_and_link_css() */
                if (parent)
                        css_put(parent);
        } else {
                /* cgroup free path */
                atomic_dec(&cgrp->root->nr_cgrps);
                if (!cgroup_on_dfl(cgrp))
                        cgroup1_pidlist_destroy_all(cgrp);
                cancel_work_sync(&cgrp->release_agent_work);
                bpf_cgrp_storage_free(cgrp);

                if (cgroup_parent(cgrp)) {
                        /*
                         * We get a ref to the parent, and put the ref when
                         * this cgroup is being freed, so it's guaranteed
                         * that the parent won't be destroyed before its
                         * children.
                         */
                        cgroup_put(cgroup_parent(cgrp));
                        kernfs_put(cgrp->kn);
                        psi_cgroup_free(cgrp);
                        cgroup_rstat_exit(cgrp);
                        kfree(cgrp);
                } else {
                        /*
                         * This is root cgroup's refcnt reaching zero,
                         * which indicates that the root should be
                         * released.
                         */
                        cgroup_destroy_root(cgrp->root);
                }
        }
}

/*
 * Work function queued by css_release().  Under cgroup_lock() it marks
 * the css released, unlinks it from its sibling list and runs either the
 * css or the cgroup release steps.  It then queues css_free_rwork_fn() as
 * RCU work to do the actual freeing.
 */
static void css_release_work_fn(struct work_struct *work)
{
        struct cgroup_subsys_state *css =
                container_of(work, struct cgroup_subsys_state, destroy_work);
        struct cgroup_subsys *ss = css->ss;
        struct cgroup *cgrp = css->cgroup;

        cgroup_lock();

        css->flags |= CSS_RELEASED;
        list_del_rcu(&css->sibling);

        if (ss) {
                /* css release path */
                /* flush once more before dropping off the rstat css list */
                if (!list_empty(&css->rstat_css_node)) {
                        cgroup_rstat_flush(cgrp);
                        list_del_rcu(&css->rstat_css_node);
                }

                /* keep the id allocated but stop it resolving to this css */
                cgroup_idr_replace(&ss->css_idr, NULL, css->id);
                if (ss->css_released)
                        ss->css_released(css);
        } else {
                struct cgroup *tcgrp;

                /* cgroup release path */
                TRACE_CGROUP_PATH(release, cgrp);

                cgroup_rstat_flush(cgrp);

                /* every ancestor has one dying descendant fewer */
                spin_lock_irq(&css_set_lock);
                for (tcgrp = cgroup_parent(cgrp); tcgrp;
                     tcgrp = cgroup_parent(tcgrp))
                        tcgrp->nr_dying_descendants--;
                spin_unlock_irq(&css_set_lock);

                /*
                 * There are two control paths which try to determine
                 * cgroup from dentry without going through kernfs -
                 * cgroupstats_build() and css_tryget_online_from_dir().
                 * Those are supported by RCU protecting clearing of
                 * cgrp->kn->priv backpointer.
                 */
                if (cgrp->kn)
                        RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
                                         NULL);
        }

        cgroup_unlock();

        /* the free itself happens after an RCU grace period */
        INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
        queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
}

/*
 * percpu_ref release callback installed on css refcnts by css_create()
 * and cgroup_create().  It hands the css over to css_release_work_fn() on
 * cgroup_destroy_wq.
 */
static void css_release(struct percpu_ref *ref)
{
        struct cgroup_subsys_state *css;

        css = container_of(ref, struct cgroup_subsys_state, refcnt);

        INIT_WORK(&css->destroy_work, css_release_work_fn);
        queue_work(cgroup_destroy_wq, &css->destroy_work);
}

/*
 * Zero and initialize @css for the @cgrp - @ss pair.  This takes a
 * reference on @cgrp and, if @cgrp has a parent, on the parent's css for
 * @ss.  When @ss has a ->css_rstat_flush() method, @css is also added to
 * @cgrp's rstat css list.  Must be called with cgroup_mutex held.
 */
static void init_and_link_css(struct cgroup_subsys_state *css,
                              struct cgroup_subsys *ss, struct cgroup *cgrp)
{
        struct cgroup *parent;

        lockdep_assert_held(&cgroup_mutex);

        cgroup_get_live(cgrp);

        /* start from a clean slate, then fill in the basics */
        memset(css, 0, sizeof(*css));
        css->cgroup = cgrp;
        css->ss = ss;
        css->id = -1;
        INIT_LIST_HEAD(&css->sibling);
        INIT_LIST_HEAD(&css->children);
        INIT_LIST_HEAD(&css->rstat_css_node);
        css->serial_nr = css_serial_nr_next++;
        atomic_set(&css->online_cnt, 0);

        /* link to the parent's css and pin it */
        parent = cgroup_parent(cgrp);
        if (parent) {
                css->parent = cgroup_css(parent, ss);
                css_get(css->parent);
        }

        if (ss->css_rstat_flush)
                list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);

        /* @cgrp must not already have a css for @ss */
        BUG_ON(cgroup_css(cgrp, ss));
}

/*
 * Bring a new css online: call the subsystem's ->css_online() if it has
 * one and, when that succeeds, mark @css online, install it in its cgroup
 * and bump the online counts of @css and its parent.  Returns 0 on
 * success or the ->css_online() error.  Needs cgroup_mutex.
 */
static int online_css(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys *ss = css->ss;

        lockdep_assert_held(&cgroup_mutex);

        if (ss->css_online) {
                int err = ss->css_online(css);

                /* subsystem refused: leave @css untouched */
                if (err)
                        return err;
        }

        css->flags |= CSS_ONLINE;
        rcu_assign_pointer(css->cgroup->subsys[ss->id], css);

        atomic_inc(&css->online_cnt);
        if (css->parent)
                atomic_inc(&css->parent->online_cnt);

        return 0;
}

/*
 * Take @css offline if it is currently online: call the subsystem's
 * ->css_offline() if it has one, clear CSS_ONLINE, remove @css from its
 * cgroup's subsys[] slot and wake up everyone waiting on the cgroup's
 * offline_waitq.  Needs cgroup_mutex.
 */
static void offline_css(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys *ss = css->ss;

        lockdep_assert_held(&cgroup_mutex);

        if (css->flags & CSS_ONLINE) {
                if (ss->css_offline)
                        ss->css_offline(css);

                css->flags &= ~CSS_ONLINE;
                RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);

                wake_up_all(&css->cgroup->offline_waitq);
        }
}

/**
 * css_create - create a cgroup_subsys_state
 * @cgrp: the cgroup new css will be associated with
 * @ss: the subsys of new css
 *
 * Create a new css associated with @cgrp - @ss pair.  On success, the new
 * css is online and installed in @cgrp.  This function doesn't create the
 * interface files.  Must be called with cgroup_mutex held.
 *
 * Returns the new css on success, an ERR_PTR() value on failure.
 */
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
                                              struct cgroup_subsys *ss)
{
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
        struct cgroup_subsys_state *css;
        int err;

        lockdep_assert_held(&cgroup_mutex);

        /* a NULL return from ->css_alloc() is treated as -ENOMEM */
        css = ss->css_alloc(parent_css);
        if (!css)
                css = ERR_PTR(-ENOMEM);
        if (IS_ERR(css))
                return css;

        init_and_link_css(css, ss, cgrp);

        err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
        if (err)
                goto err_free_css;

        /*
         * Reserve an id with a NULL pointer for now; @css is installed
         * under it by cgroup_idr_replace() below.
         */
        err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
        if (err < 0)
                goto err_free_css;
        css->id = err;

        /* @css is ready to be brought online now, make it visible */
        list_add_tail_rcu(&css->sibling, &parent_css->children);
        cgroup_idr_replace(&ss->css_idr, css, css->id);

        err = online_css(css);
        if (err)
                goto err_list_del;

        return css;

err_list_del:
        list_del_rcu(&css->sibling);
err_free_css:
        /* the rest of the teardown is left to css_free_rwork_fn() */
        list_del_rcu(&css->rstat_css_node);
        INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
        queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
        return ERR_PTR(err);
}

/*
 * Allocate and set up a new child cgroup of @parent, with its kernfs
 * directory created under @parent's using @name and @mode.
 *
 * The returned cgroup is fully initialized including its control mask, but
 * it doesn't have the control mask applied.
 *
 * Returns the new cgroup on success, an ERR_PTR() value on failure.  On
 * failure, everything set up so far is undone in reverse order by the out_*
 * labels at the end of the function.
 */
static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
                                    umode_t mode)
{
        struct cgroup_root *root = parent->root;
        struct cgroup *cgrp, *tcgrp;
        struct kernfs_node *kn;
        int level = parent->level + 1;
        int ret;

        /* allocate the cgroup and its ID, 0 is reserved for the root */
        /* ancestors[] needs level + 1 slots: indices 0 through @level */
        cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
        if (!cgrp)
                return ERR_PTR(-ENOMEM);

        ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
        if (ret)
                goto out_free_cgrp;

        ret = cgroup_rstat_init(cgrp);
        if (ret)
                goto out_cancel_ref;

        /* create the directory */
        kn = kernfs_create_dir_ns(parent->kn, name, mode,
                                  current_fsuid(), current_fsgid(),
                                  cgrp, NULL);
        if (IS_ERR(kn)) {
                ret = PTR_ERR(kn);
                goto out_stat_exit;
        }
        cgrp->kn = kn;

        init_cgroup_housekeeping(cgrp);

        cgrp->self.parent = &parent->self;
        cgrp->root = root;
        cgrp->level = level;

        ret = psi_cgroup_alloc(cgrp);
        if (ret)
                goto out_kernfs_remove;

        ret = cgroup_bpf_inherit(cgrp);
        if (ret)
                goto out_psi_free;

        /*
         * New cgroup inherits effective freeze counter, and
         * if the parent has to be frozen, the child has too.
         */
        cgrp->freezer.e_freeze = parent->freezer.e_freeze;
        if (cgrp->freezer.e_freeze) {
                /*
                 * Set the CGRP_FREEZE flag, so when a process will be
                 * attached to the child cgroup, it will become frozen.
                 * At this point the new cgroup is unpopulated, so we can
                 * consider it frozen immediately.
                 */
                set_bit(CGRP_FREEZE, &cgrp->flags);
                set_bit(CGRP_FROZEN, &cgrp->flags);
        }

        /*
         * Walk from the new cgroup up to the root, recording each cgroup
         * in ancestors[] by level (the new cgroup itself included) and
         * updating the ancestors' descendant counters.
         */
        spin_lock_irq(&css_set_lock);
        for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                cgrp->ancestors[tcgrp->level] = tcgrp;

                if (tcgrp != cgrp) {
                        tcgrp->nr_descendants++;

                        /*
                         * If the new cgroup is frozen, all ancestor cgroups
                         * get a new frozen descendant, but their state can't
                         * change because of this.
                         */
                        if (cgrp->freezer.e_freeze)
                                tcgrp->freezer.nr_frozen_descendants++;
                }
        }
        spin_unlock_irq(&css_set_lock);

        /* these two settings are inherited from the parent */
        if (notify_on_release(parent))
                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

        if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
                set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);

        cgrp->self.serial_nr = css_serial_nr_next++;

        /* allocation complete, commit to creation */
        list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
        atomic_inc(&root->nr_cgrps);
        /* this parent ref is dropped again in css_free_rwork_fn() */
        cgroup_get_live(parent);

        /*
         * On the default hierarchy, a child doesn't automatically inherit
         * subtree_control from the parent.  Each is configured manually.
         */
        if (!cgroup_on_dfl(cgrp))
                cgrp->subtree_control = cgroup_control(cgrp);

        cgroup_propagate_control(cgrp);

        return cgrp;

        /* error unwinding, in reverse order of setup */
out_psi_free:
        psi_cgroup_free(cgrp);
out_kernfs_remove:
        kernfs_remove(cgrp->kn);
out_stat_exit:
        cgroup_rstat_exit(cgrp);
out_cancel_ref:
        percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
        kfree(cgrp);
        return ERR_PTR(ret);
}

/*
 * Check whether one more child may be created under @parent.  Every
 * cgroup from @parent up to the root is checked against its own
 * max_descendants and max_depth.  The depth passed to each check is the
 * new child's distance below that cgroup.
 *
 * Returns true if no limit would be exceeded.  Needs cgroup_mutex.
 */
static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{
        struct cgroup *pos;
        int depth;

        lockdep_assert_held(&cgroup_mutex);

        for (pos = parent, depth = 1; pos;
             pos = cgroup_parent(pos), depth++) {
                /* this subtree is already at its descendant limit */
                if (pos->nr_descendants >= pos->max_descendants)
                        return false;

                /* the new child would sit too deep below @pos */
                if (depth > pos->max_depth)
                        return false;
        }

        return true;
}

/*
 * Create a new cgroup called @name with mode @mode under the cgroup that
 * @parent_kn belongs to.
 *
 * Returns 0 on success, -EINVAL if @name contains a newline, -ENODEV if
 * the parent cannot be locked live, -EAGAIN if a hierarchy limit would be
 * exceeded, or the error from the failing creation step.
 */
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
        struct cgroup *parent, *cgrp;
        int ret;

        /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
        if (strchr(name, '\n'))
                return -EINVAL;

        parent = cgroup_kn_lock_live(parent_kn, false);
        if (!parent)
                return -ENODEV;

        if (!cgroup_check_hierarchy_limits(parent)) {
                ret = -EAGAIN;
                goto out_unlock;
        }

        cgrp = cgroup_create(parent, name, mode);
        if (IS_ERR(cgrp)) {
                ret = PTR_ERR(cgrp);
                goto out_unlock;
        }

        /*
         * This extra ref is put by the kernfs_put() in
         * css_free_rwork_fn() and guarantees that @cgrp->kn is always
         * accessible.
         */
        kernfs_get(cgrp->kn);

        ret = css_populate_dir(&cgrp->self);
        if (ret)
                goto out_destroy;

        /*
         * NOTE(review): presumably this is where the css's get created and
         * onlined - confirm against cgroup_apply_control_enable().
         */
        ret = cgroup_apply_control_enable(cgrp);
        if (ret)
                goto out_destroy;

        TRACE_CGROUP_PATH(mkdir, cgrp);

        /*
         * Everything is set up; activate the new directory.
         * NOTE(review): this comment used to read "let's create and online
         * css's", which does not describe kernfs_activate().
         */
        kernfs_activate(cgrp->kn);

        ret = 0;
        goto out_unlock;

out_destroy:
        cgroup_destroy_locked(cgrp);
out_unlock:
        cgroup_kn_unlock(parent_kn);
        return ret;
}

/*
 * This is called when the refcnt of a css is confirmed to be killed.
 * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
 * initiate destruction and put the css ref from kill_css().
 *
 * Queued by css_killed_ref_fn() once the css's online_cnt drops to zero.
 * The loop then continues upwards through each parent whose online_cnt
 * also reaches zero as a result.
 */
static void css_killed_work_fn(struct work_struct *work)
{
        struct cgroup_subsys_state *css =
                container_of(work, struct cgroup_subsys_state, destroy_work);

        cgroup_lock();

        do {
                offline_css(css);
                css_put(css);
                /* @css can't go away while we're holding cgroup_mutex */
                css = css->parent;
                /* drops the count online_css() added to the parent */
        } while (css && atomic_dec_and_test(&css->online_cnt));

        cgroup_unlock();
}

/*
 * Confirmation callback passed to percpu_ref_kill_and_confirm() by
 * kill_css().  Offlining needs process context, so once this css's
 * online_cnt drops to zero the work is bounced to css_killed_work_fn()
 * on cgroup_destroy_wq.
 */
static void css_killed_ref_fn(struct percpu_ref *ref)
{
        struct cgroup_subsys_state *css;

        css = container_of(ref, struct cgroup_subsys_state, refcnt);

        /* online_cnt is still non-zero: nothing to queue yet */
        if (!atomic_dec_and_test(&css->online_cnt))
                return;

        INIT_WORK(&css->destroy_work, css_killed_work_fn);
        queue_work(cgroup_destroy_wq, &css->destroy_work);
}

/**
 * kill_css - destroy a css
 * @css: css to destroy
 *
 * This function initiates destruction of @css by removing cgroup interface
 * files and putting its base reference.  ->css_offline() will be invoked
 * asynchronously once css_tryget_online() is guaranteed to fail and when
 * the reference count reaches zero, @css will be released.
 *
 * Must be called with cgroup_mutex held.  Calling it again on a css that
 * is already marked CSS_DYING is a no-op.
 */
static void kill_css(struct cgroup_subsys_state *css)
{
        lockdep_assert_held(&cgroup_mutex);

        /* destruction already under way */
        if (css->flags & CSS_DYING)
                return;

        css->flags |= CSS_DYING;

        /*
         * This must happen before css is disassociated with its cgroup.
         * See seq_css() for details.
         */
        css_clear_dir(css);

        /*
         * Killing would put the base ref, but we need to keep it alive
         * until after ->css_offline().
         */
        /* the matching css_put() is in css_killed_work_fn() */
        css_get(css);

        /*
         * cgroup core guarantees that, by the time ->css_offline() is
         * invoked, no new css reference will be given out via
         * css_tryget_online().  We can't simply call percpu_ref_kill() and
         * proceed to offlining css's because percpu_ref_kill() doesn't
         * guarantee that the ref is seen as killed on all CPUs on return.
         *
         * Use percpu_ref_kill_and_confirm() to get notifications as each
         * css is confirmed to be seen as killed on all CPUs.
         */
        percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}

/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * css's make use of percpu refcnts whose killing latency shouldn't be
 * exposed to userland and are RCU protected.  Also, cgroup core needs to
 * guarantee that css_tryget_online() won't succeed by the time
 * ->css_offline() is invoked.  To satisfy all the requirements,
 * destruction is implemented in the following two steps.
 *
 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
 *     userland visible parts and start killing the percpu refcnts of
 *     css's.  Set up so that the next stage will be kicked off once all
 *     the percpu refcnts are confirmed to be killed.
 *
 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
 *     rest of destruction.  Once all cgroup references are gone, the
 *     cgroup is RCU-freed.
 *
 * This function implements s1.  After this step, @cgrp is gone as far as
 * the userland is concerned and a new cgroup with the same name may be
 * created.  As cgroup doesn't care about the names internally, this
 * doesn't cause any problem.
 *
 * Context: the caller must hold cgroup_mutex.
 *
 * Return: 0 on success, -EBUSY if @cgrp is populated or still has online
 * children (in which case nothing has been modified).
 */
static int cgroup_destroy_locked(struct cgroup *cgrp)
        __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
        struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
        struct cgroup_subsys_state *css;
        struct cgrp_cset_link *link;
        int ssid;

        lockdep_assert_held(&cgroup_mutex);

        /*
         * Only migration can raise populated from zero and we're already
         * holding cgroup_mutex.
         */
        if (cgroup_is_populated(cgrp))
                return -EBUSY;

        /*
         * Make sure there's no live children.  We can't test emptiness of
         * ->self.children as dead children linger on it while being
         * drained; otherwise, "rmdir parent/child parent" may fail.
         */
        if (css_has_online_children(&cgrp->self))
                return -EBUSY;

        /*
         * Mark @cgrp and the associated csets dead.  The former prevents
         * further task migration and child creation by disabling
         * cgroup_kn_lock_live().  The latter makes the csets ignored by
         * the migration path.
         */
        cgrp->self.flags &= ~CSS_ONLINE;

        spin_lock_irq(&css_set_lock);
        list_for_each_entry(link, &cgrp->cset_links, cset_link)
                link->cset->dead = true;
        spin_unlock_irq(&css_set_lock);

        /* initiate massacre of all css's */
        for_each_css(css, ssid, cgrp)
                kill_css(css);

        /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
        css_clear_dir(&cgrp->self);
        kernfs_remove(cgrp->kn);

        if (cgroup_is_threaded(cgrp))
                parent->nr_threaded_children--;

        /* move @cgrp from "live" to "dying" in every ancestor's counters */
        spin_lock_irq(&css_set_lock);
        for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                tcgrp->nr_descendants--;
                tcgrp->nr_dying_descendants++;
                /*
                 * If the dying cgroup is frozen, decrease frozen descendants
                 * counters of ancestor cgroups.
                 */
                if (test_bit(CGRP_FROZEN, &cgrp->flags))
                        tcgrp->freezer.nr_frozen_descendants--;
        }
        spin_unlock_irq(&css_set_lock);

        cgroup1_check_for_release(parent);

        cgroup_bpf_offline(cgrp);

        /* put the base reference */
        percpu_ref_kill(&cgrp->self.refcnt);

        return 0;
}

/*
 * Remove the cgroup backing directory node @kn.
 *
 * Returns 0 when the cgroup was destroyed or when no live cgroup could be
 * locked for @kn; otherwise the error from cgroup_destroy_locked().
 */
int cgroup_rmdir(struct kernfs_node *kn)
{
        struct cgroup *cgrp = cgroup_kn_lock_live(kn, false);
        int err;

        /* no live cgroup behind @kn, nothing to destroy */
        if (!cgrp)
                return 0;

        err = cgroup_destroy_locked(cgrp);
        if (err == 0)
                TRACE_CGROUP_PATH(rmdir, cgrp);

        cgroup_kn_unlock(kn);
        return err;
}

/* kernfs syscall callbacks wired to the cgroup implementations */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
        .show_options = cgroup_show_options,
        .mkdir = cgroup_mkdir,
        .rmdir = cgroup_rmdir,
        .show_path = cgroup_show_path,
};

/*
 * Set up the root css of subsystem @ss at boot.
 *
 * Allocates the root css through ->css_alloc(), links it to the default
 * root cgroup, records it in init_css_set and brings it online.  Any
 * failure along the way is fatal (BUG).
 *
 * @ss: subsystem to initialize
 * @early: true when called from early init; the css id is then hard-coded
 *         to 1 instead of being allocated from @ss->css_idr
 */
static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
        struct cgroup_subsys_state *css;

        pr_debug("Initializing cgroup subsys %s\n", ss->name);

        cgroup_lock();

        idr_init(&ss->css_idr);
        INIT_LIST_HEAD(&ss->cfts);

        /* Create the root cgroup state for this subsystem */
        ss->root = &cgrp_dfl_root;
        css = ss->css_alloc(NULL);
        /* We don't handle early failures gracefully */
        BUG_ON(IS_ERR(css));
        init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);

        /*
         * Root csses are never destroyed and we can't initialize
         * percpu_ref during early init.  Disable refcnting.
         */
        css->flags |= CSS_NO_REF;

        if (early) {
                /* allocation can't be done safely during early init */
                css->id = 1;
        } else {
                css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
                BUG_ON(css->id < 0);
        }

        /*
         * Update the init_css_set to contain a subsys pointer to this
         * state - since the subsystem is newly registered, all tasks and
         * hence the init_css_set is in the subsystem's root cgroup.
         */
        init_css_set.subsys[ss->id] = css;

        /* record, one bit per subsystem id, which callbacks @ss provides */
        have_fork_callback |= (bool)ss->fork << ss->id;
        have_exit_callback |= (bool)ss->exit << ss->id;
        have_release_callback |= (bool)ss->release << ss->id;
        have_canfork_callback |= (bool)ss->can_fork << ss->id;

        /*
         * At system boot, before all subsystems have been registered, no
         * tasks have been forked, so we don't need to invoke fork
         * callbacks here.
         */
        BUG_ON(!list_empty(&init_task.tasks));

        /* bringing the root css online must not fail */
        BUG_ON(online_css(css));

        cgroup_unlock();
}

/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Set up the default root and attach init_task to init_css_set, then give
 * every subsystem its id and names.  Subsystems that request early init
 * are initialized right away.
 *
 * Return: always 0.
 */
int __init cgroup_init_early(void)
{
        static struct cgroup_fs_context __initdata ctx;
        struct cgroup_subsys *ss;
        int i;

        ctx.root = &cgrp_dfl_root;
        init_cgroup_root(&ctx);
        cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

        RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

        for_each_subsys(ss, i) {
                const char *subsys_name = cgroup_subsys_name[i];

                /* mandatory callbacks present, id/name not yet assigned */
                WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
                     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
                     i, subsys_name, ss->css_alloc, ss->css_free,
                     ss->id, ss->name);
                WARN(strlen(subsys_name) > MAX_CGROUP_TYPE_NAMELEN,
                     "cgroup_subsys_name %s too long\n", subsys_name);

                ss->id = i;
                ss->name = subsys_name;
                /* the legacy name defaults to the regular name */
                if (!ss->legacy_name)
                        ss->legacy_name = subsys_name;

                if (ss->early_init)
                        cgroup_init_subsys(ss, true);
        }

        return 0;
}

/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 *
 * Return: always 0; failures are reported through BUG_ON()/WARN_ON().
 */
int __init cgroup_init(void)
{
        struct cgroup_subsys *ss;
        int ssid;

        /* compile-time cap on the number of subsystems */
        BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

        cgroup_rstat_boot();

        get_user_ns(init_cgroup_ns.user_ns);

        cgroup_lock();

        /*
         * Add init_css_set to the hash table so that dfl_root can link to
         * it during init.
         */
        hash_add(css_set_table, &init_css_set.hlist,
                 css_set_hash(init_css_set.subsys));

        BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

        cgroup_unlock();

        for_each_subsys(ss, ssid) {
                if (ss->early_init) {
                        /*
                         * Early-initialized subsystems had their css id
                         * hard-coded; allocate it from the idr now.
                         */
                        struct cgroup_subsys_state *css =
                                init_css_set.subsys[ss->id];

                        css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
                                                   GFP_KERNEL);
                        BUG_ON(css->id < 0);
                } else {
                        cgroup_init_subsys(ss, false);
                }

                list_add_tail(&init_css_set.e_cset_node[ssid],
                              &cgrp_dfl_root.cgrp.e_csets[ssid]);

                /*
                 * Setting dfl_root subsys_mask needs to consider the
                 * disabled flag and cftype registration needs kmalloc,
                 * both of which aren't available during early_init.
                 */
                if (!cgroup_ssid_enabled(ssid))
                        continue;

                if (cgroup1_ssid_disabled(ssid))
                        pr_info("Disabling %s control group subsystem in v1 mounts\n",
                                ss->legacy_name);

                cgrp_dfl_root.subsys_mask |= 1 << ss->id;

                /* implicit controllers must be threaded too */
                WARN_ON(ss->implicit_on_dfl && !ss->threaded);

                if (ss->implicit_on_dfl)
                        cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
                else if (!ss->dfl_cftypes)
                        cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

                if (ss->threaded)
                        cgrp_dfl_threaded_ss_mask |= 1 << ss->id;

                /* one shared cftype set is registered once, else separately */
                if (ss->dfl_cftypes == ss->legacy_cftypes) {
                        WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
                } else {
                        WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
                        WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
                }

                if (ss->bind)
                        ss->bind(init_css_set.subsys[ssid]);

                cgroup_lock();
                css_populate_dir(init_css_set.subsys[ssid]);
                cgroup_unlock();
        }

        /* init_css_set.subsys[] has been updated, re-hash */
        hash_del(&init_css_set.hlist);
        hash_add(css_set_table, &init_css_set.hlist,
                 css_set_hash(init_css_set.subsys));

        WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
        WARN_ON(register_filesystem(&cgroup_fs_type));
        WARN_ON(register_filesystem(&cgroup2_fs_type));
        WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
#ifdef CONFIG_CPUSETS
        WARN_ON(register_filesystem(&cpuset_fs_type));
#endif

        return 0;
}

/*
 * Allocate cgroup_destroy_wq, the workqueue css destruction work is queued
 * on.  Allocation failure is fatal.  Always returns 0.
 */
static int __init cgroup_wq_init(void)
{
        struct workqueue_struct *wq;

        /*
         * There isn't much point in executing destruction path in
         * parallel.  Good chunk is serialized with cgroup_mutex anyway.
         * Use 1 for @max_active.
         *
         * We would prefer to do this in cgroup_init() above, but that
         * is called before init_workqueues(): so leave this until after.
         */
        wq = alloc_workqueue("cgroup_destroy", 0, 1);
        cgroup_destroy_wq = wq;
        BUG_ON(!wq);

        return 0;
}
core_initcall(cgroup_wq_init);

/*
 * Look up the kernfs node with @id on the default hierarchy and write its
 * path into @buf (at most @buflen bytes, as handled by kernfs_path()).
 * @buf is left untouched when no such node exists.
 */
void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{
        struct kernfs_node *node =
                kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);

        if (node) {
                kernfs_path(node, buf, buflen);
                /* drop the reference taken by the lookup */
                kernfs_put(node);
        }
}

/*
 * cgroup_get_from_id : get the cgroup associated with cgroup id
 * @id: cgroup id
 *
 * On success, returns the cgroup with a reference held.  Returns
 * ERR_PTR(-ENOENT) when @id matches no node, the node is not a directory,
 * no reference could be obtained, or the cgroup is not a descendant of
 * the root cgroup of the current task's cgroup namespace.
 */
struct cgroup *cgroup_get_from_id(u64 id)
{
        struct cgroup *cgrp = NULL, *ns_root;
        struct kernfs_node *kn;

        kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
        if (!kn)
                return ERR_PTR(-ENOENT);

        /* only directory nodes are looked up for a cgroup */
        if (kernfs_type(kn) == KERNFS_DIR) {
                rcu_read_lock();

                cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
                if (cgrp && !cgroup_tryget(cgrp))
                        cgrp = NULL;

                rcu_read_unlock();
        }

        /* the node reference is no longer needed either way */
        kernfs_put(kn);

        if (!cgrp)
                return ERR_PTR(-ENOENT);

        /* only cgroups inside the caller's cgroup namespace are valid */
        ns_root = current_cgns_cgroup_dfl();
        if (cgroup_is_descendant(cgrp, ns_root))
                return cgrp;

        cgroup_put(cgrp);
        return ERR_PTR(-ENOENT);
}
EXPORT_SYMBOL_GPL(cgroup_get_from_id);

/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 *
 * Each line has the form "<hierarchy id>:<controllers[,name=...]>:<path>".
 * @ns and @pid are not used by this function.
 *
 * Returns 0 on success, -ENOMEM if the path buffer cannot be allocated, or
 * the negative error from cgroup_path_ns_locked() (with -E2BIG reported as
 * -ENAMETOOLONG).
 */
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk)
{
        char *buf;
        int retval;
        struct cgroup_root *root;

        retval = -ENOMEM;
        buf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!buf)
                goto out;

        rcu_read_lock();
        spin_lock_irq(&css_set_lock);

        for_each_root(root) {
                struct cgroup_subsys *ss;
                struct cgroup *cgrp;
                int ssid, count = 0;

                /* skip the default hierarchy until it is marked visible */
                if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
                        continue;

                cgrp = task_cgroup_from_root(tsk, root);
                /* The root has already been unmounted. */
                if (!cgrp)
                        continue;

                seq_printf(m, "%d:", root->hierarchy_id);
                /* comma-separated controller list, non-default roots only */
                if (root != &cgrp_dfl_root)
                        for_each_subsys(ss, ssid)
                                if (root->subsys_mask & (1 << ssid))
                                        seq_printf(m, "%s%s", count++ ? "," : "",
                                                   ss->legacy_name);
                if (strlen(root->name))
                        seq_printf(m, "%sname=%s", count ? "," : "",
                                   root->name);
                seq_putc(m, ':');
                /*
                 * On traditional hierarchies, all zombie tasks show up as
                 * belonging to the root cgroup.  On the default hierarchy,
                 * while a zombie doesn't show up in "cgroup.procs" and
                 * thus can't be migrated, its /proc/PID/cgroup keeps
                 * reporting the cgroup it belonged to before exiting.  If
                 * the cgroup is removed before the zombie is reaped,
                 * " (deleted)" is appended to the cgroup path.
                 */
                if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
                        retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
                                                current->nsproxy->cgroup_ns);
                        if (retval == -E2BIG)
                                retval = -ENAMETOOLONG;
                        if (retval < 0)
                                goto out_unlock;

                        seq_puts(m, buf);
                } else {
                        seq_puts(m, "/");
                }

                if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
                        seq_puts(m, " (deleted)\n");
                else
                        seq_putc(m, '\n');
        }

        retval = 0;
out_unlock:
        spin_unlock_irq(&css_set_lock);
        rcu_read_unlock();
        kfree(buf);
out:
        return retval;
}

/**
 * cgroup_fork - initialize cgroup related fields during copy_process()
 * @child: pointer to task_struct of the new child task being set up.
 *
 * A task is associated with the init_css_set until cgroup_post_fork()
 * attaches it to the target css_set.
 */
void cgroup_fork(struct task_struct *child)
{
        RCU_INIT_POINTER(child->cgroups, &init_css_set);
        /* start with an empty cg_list entry */
        INIT_LIST_HEAD(&child->cg_list);
}

/**
 * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
 * @f: file corresponding to cgroup_dir
 *
 * Resolve the cgroup behind a file that refers to a cgroup directory by
 * pinning the directory's css via css_tryget_online_from_dir().
 *
 * Return: pointer to the cgroup on success, or the ERR_PTR value
 * propagated from css_tryget_online_from_dir() on failure.
 */
static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
{
        struct cgroup_subsys_state *css =
                css_tryget_online_from_dir(f->f_path.dentry, NULL);

        return IS_ERR(css) ? ERR_CAST(css) : css->cgroup;
}

/**
 * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
 * cgroup2.
 * @f: file corresponding to cgroup2_dir
 *
 * Return: the cgroup on success, the error from
 * cgroup_v1v2_get_from_file() if the lookup fails, or ERR_PTR(-EBADF) if
 * the cgroup is not on the default hierarchy (its reference is dropped).
 */
static struct cgroup *cgroup_get_from_file(struct file *f)
{
        struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);

        if (IS_ERR(cgrp))
                return ERR_CAST(cgrp);

        if (cgroup_on_dfl(cgrp))
                return cgrp;

        /* not a cgroup2 directory: release the cgroup and reject it */
        cgroup_put(cgrp);
        return ERR_PTR(-EBADF);
}

/**
 * cgroup_css_set_fork - find or create a css_set for a child process
 * @kargs: the arguments passed to create the child process
 *
 * This functions finds or creates a new css_set which the child
 * process will be attached to in cgroup_post_fork(). By default,
 * the child process will be given the same css_set as its parent.
 *
 * If CLONE_INTO_CGROUP is specified this function will try to find an
 * existing css_set which includes the requested cgroup and if not create
 * a new css_set that the child will be attached to later. If this function
 * succeeds it will hold cgroup_threadgroup_rwsem on return. If
 * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
 * before grabbing cgroup_threadgroup_rwsem and will hold a reference
 * to the target cgroup.
 *
 * Return: 0 on success with @kargs->cset (and, for CLONE_INTO_CGROUP,
 * @kargs->cgrp) set.  On failure a negative errno is returned (-EBADF for
 * a bad file descriptor, -ENODEV for a dead target cgroup, -ENOMEM if no
 * css_set could be found or created, or the error from the lookup and
 * permission checks) and the locks and references taken here are dropped.
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
        __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
        int ret;
        struct cgroup *dst_cgrp = NULL;
        struct css_set *cset;
        struct super_block *sb;
        struct file *f;

        if (kargs->flags & CLONE_INTO_CGROUP)
                cgroup_lock();

        cgroup_threadgroup_change_begin(current);

        /* pin the css_set of the forking task */
        spin_lock_irq(&css_set_lock);
        cset = task_css_set(current);
        get_css_set(cset);
        spin_unlock_irq(&css_set_lock);

        /* default case: the child gets the same css_set */
        if (!(kargs->flags & CLONE_INTO_CGROUP)) {
                kargs->cset = cset;
                return 0;
        }

        f = fget_raw(kargs->cgroup);
        if (!f) {
                ret = -EBADF;
                goto err;
        }
        sb = f->f_path.dentry->d_sb;

        dst_cgrp = cgroup_get_from_file(f);
        if (IS_ERR(dst_cgrp)) {
                ret = PTR_ERR(dst_cgrp);
                /* keep the error path from putting an ERR_PTR value */
                dst_cgrp = NULL;
                goto err;
        }

        if (cgroup_is_dead(dst_cgrp)) {
                ret = -ENODEV;
                goto err;
        }

        /*
         * Verify that the target cgroup is writable for us. This is
         * usually done by the vfs layer but since we're not going through
         * the vfs layer here we need to do it "manually".
         */
        ret = cgroup_may_write(dst_cgrp, sb);
        if (ret)
                goto err;

        /*
         * Spawning a task directly into a cgroup works by passing a file
         * descriptor to the target cgroup directory. This can even be an O_PATH
         * file descriptor. But it can never be a cgroup.procs file descriptor.
         * This was done on purpose so spawning into a cgroup could be
         * conceptualized as an atomic
         *
         *   fd = openat(dfd_cgroup, "cgroup.procs", ...);
         *   write(fd, <child-pid>, ...);
         *
         * sequence, i.e. it's a shorthand for the caller opening and writing
         * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
         * to always use the caller's credentials.
         */
        ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
                                        !(kargs->flags & CLONE_THREAD),
                                        current->nsproxy->cgroup_ns);
        if (ret)
                goto err;

        kargs->cset = find_css_set(cset, dst_cgrp);
        if (!kargs->cset) {
                ret = -ENOMEM;
                goto err;
        }

        /* success: drop the parent's cset ref and the file, keep dst_cgrp */
        put_css_set(cset);
        fput(f);
        kargs->cgrp = dst_cgrp;
        return ret;

err:
        /* only reached with CLONE_INTO_CGROUP, so cgroup_mutex is held */
        cgroup_threadgroup_change_end(current);
        cgroup_unlock();
        if (f)
                fput(f);
        if (dst_cgrp)
                cgroup_put(dst_cgrp);
        put_css_set(cset);
        if (kargs->cset)
                put_css_set(kargs->cset);
        return ret;
}

/**
 * cgroup_css_set_put_fork - drop references we took during fork
 * @kargs: the arguments passed to create the child process
 *
 * End the threadgroup change section and put the prepared css_set, if
 * any.  When CLONE_INTO_CGROUP was requested, additionally release
 * cgroup_mutex and put the target cgroup.  The @kargs fields are cleared
 * as their references are dropped.
 */
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
        __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
        struct css_set *cset = kargs->cset;
        struct cgroup *cgrp = kargs->cgrp;

        cgroup_threadgroup_change_end(current);

        if (cset) {
                put_css_set(cset);
                kargs->cset = NULL;
        }

        /* the mutex and target cgroup only exist for CLONE_INTO_CGROUP */
        if (!(kargs->flags & CLONE_INTO_CGROUP))
                return;

        cgroup_unlock();

        if (!cgrp)
                return;

        cgroup_put(cgrp);
        kargs->cgrp = NULL;
}

/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * This prepares a new css_set for the child process which the child will
 * be attached to in cgroup_post_fork().
 * This calls the subsystem can_fork() callbacks. If a can_fork()
 * callback returns an error, the fork aborts with that error code. This
 * allows for a cgroup subsystem to conditionally allow or deny new forks.
 *
 * Return: 0 on success, otherwise the error from cgroup_css_set_fork() or
 * from the first failing can_fork() callback.
 */
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
        struct cgroup_subsys *ss;
        int i, j, ret;

        ret = cgroup_css_set_fork(kargs);
        if (ret)
                return ret;

        do_each_subsys_mask(ss, i, have_canfork_callback) {
                ret = ss->can_fork(child, kargs->cset);
                if (ret)
                        goto out_revert;
        } while_each_subsys_mask();

        return 0;

out_revert:
        /* undo only the subsystems that come before the failing one (@i) */
        for_each_subsys(ss, j) {
                if (j >= i)
                        break;
                if (ss->cancel_fork)
                        ss->cancel_fork(child, kargs->cset);
        }

        cgroup_css_set_put_fork(kargs);

        return ret;
}

/**
 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeeded and cleans up references we took to
 * prepare a new css_set for the child process in cgroup_can_fork().
 */
void cgroup_cancel_fork(struct task_struct *child,
                        struct kernel_clone_args *kargs)
{
        struct cgroup_subsys *ss;
        int i;

        for_each_subsys(ss, i)
                if (ss->cancel_fork)
                        ss->cancel_fork(child, kargs->cset);

        cgroup_css_set_put_fork(kargs);
}

/**
 * cgroup_post_fork - finalize cgroup setup for the child process
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * Attach the child process to its css_set calling the subsystem fork()
 * callbacks.
 *
 * Ownership of @kargs->cset is taken over here: it is either handed to
 * @child through css_set_move_task() or put for tasks with a zero pid.
 * The remaining fork-time state is dropped through
 * cgroup_css_set_put_fork() before returning.
 */
void cgroup_post_fork(struct task_struct *child,
                      struct kernel_clone_args *kargs)
        __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
        unsigned long cgrp_flags = 0;
        bool kill = false;
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;

        /* take over the css_set so that the final put below skips it */
        cset = kargs->cset;
        kargs->cset = NULL;

        spin_lock_irq(&css_set_lock);

        /* init tasks are special, only link regular threads */
        if (likely(child->pid)) {
                /* snapshot the flags of the cgroup the child ends up in */
                if (kargs->cgrp)
                        cgrp_flags = kargs->cgrp->flags;
                else
                        cgrp_flags = cset->dfl_cgrp->flags;

                WARN_ON_ONCE(!list_empty(&child->cg_list));
                cset->nr_tasks++;
                css_set_move_task(child, NULL, cset, false);
        } else {
                put_css_set(cset);
                cset = NULL;
        }

        if (!(child->flags & PF_KTHREAD)) {
                if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
                        /*
                         * If the cgroup has to be frozen, the new task has
                         * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
                         * get the task into the frozen state.
                         */
                        spin_lock(&child->sighand->siglock);
                        WARN_ON_ONCE(child->frozen);
                        child->jobctl |= JOBCTL_TRAP_FREEZE;
                        spin_unlock(&child->sighand->siglock);

                        /*
                         * Calling cgroup_update_frozen() isn't required here,
                         * because it will be called anyway a bit later from
                         * do_freezer_trap(). So we avoid cgroup's transient
                         * switch from the frozen state and back.
                         */
                }

                /*
                 * If the cgroup is to be killed notice it now and take the
                 * child down right after we finished preparing it for
                 * userspace.
                 */
                kill = test_bit(CGRP_KILL, &cgrp_flags);
        }

        spin_unlock_irq(&css_set_lock);

        /*
         * Call ss->fork().  This must happen after @child is linked on
         * css_set; otherwise, @child might change state between ->fork()
         * and addition to css_set.
         */
        do_each_subsys_mask(ss, i, have_fork_callback) {
                ss->fork(child);
        } while_each_subsys_mask();

        /* Make the new cset the root_cset of the new cgroup namespace. */
        if (kargs->flags & CLONE_NEWCGROUP) {
                struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;

                /* take the new ref before dropping the old root_cset */
                get_css_set(cset);
                child->nsproxy->cgroup_ns->root_cset = cset;
                put_css_set(rcset);
        }

        /* Cgroup has to be killed so take down child immediately. */
        if (unlikely(kill))
                do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);

        cgroup_css_set_put_fork(kargs);
}

/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cgroup from @tsk.
 *
 * Under css_set_lock, @tsk is unlinked from its css_set's task list and
 * queued on the css_set's dying_tasks list instead.  The subsystem exit()
 * callbacks are invoked after the lock has been released.
 */
void cgroup_exit(struct task_struct *tsk)
{
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;

        spin_lock_irq(&css_set_lock);

        WARN_ON_ONCE(list_empty(&tsk->cg_list));
        cset = task_css_set(tsk);
        css_set_move_task(tsk, cset, NULL, false);
        /* keep the task reachable through the dying list */
        list_add_tail(&tsk->cg_list, &cset->dying_tasks);
        cset->nr_tasks--;

        if (dl_task(tsk))
                dec_dl_tasks_cs(tsk);

        WARN_ON_ONCE(cgroup_task_frozen(tsk));
        /* a non-kthread leaving a freezing cgroup may change its state */
        if (unlikely(!(tsk->flags & PF_KTHREAD) &&
                     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
                cgroup_update_frozen(task_dfl_cgroup(tsk));

        spin_unlock_irq(&css_set_lock);

        /* see cgroup_post_fork() for details */
        do_each_subsys_mask(ss, i, have_exit_callback) {
                ss->exit(tsk);
        } while_each_subsys_mask();
}

/*
 * Invoke the release() callback of every subsystem that provides one for
 * @task, then, under css_set_lock, make task iterators skip @task and
 * unlink it from whatever list its cg_list entry is on.
 */
void cgroup_release(struct task_struct *task)
{
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;

        do_each_subsys_mask(ss, i, have_release_callback) {
                ss->release(task);
        } while_each_subsys_mask();

        spin_lock_irq(&css_set_lock);
        cset = task_css_set(task);
        css_set_skip_task_iters(cset, task);
        list_del_init(&task->cg_list);
        spin_unlock_irq(&css_set_lock);
}

/* Drop @task's reference on its css_set. */
void cgroup_free(struct task_struct *task)
{
        put_css_set(task_css_set(task));
}

/*
 * Handler for the "cgroup_disable=" boot parameter.
 *
 * @str is a comma-separated list.  Each non-empty token is matched against
 * the name and legacy name of every subsystem, and against the optional
 * feature names; whatever matches is disabled.  Always returns 1.
 */
static int __init cgroup_disable(char *str)
{
        struct cgroup_subsys *ss;
        char *token;
        int i;

        while ((token = strsep(&str, ",")) != NULL) {
                /* ignore empty list entries such as ",," */
                if (*token == '\0')
                        continue;

                for_each_subsys(ss, i) {
                        if (!strcmp(token, ss->name) ||
                            !strcmp(token, ss->legacy_name)) {
                                static_branch_disable(cgroup_subsys_enabled_key[i]);
                                pr_info("Disabling %s control group subsystem\n",
                                        ss->name);
                        }
                }

                for (i = 0; i < OPT_FEATURE_COUNT; i++) {
                        if (!strcmp(token, cgroup_opt_feature_names[i])) {
                                cgroup_feature_disable_mask |= 1 << i;
                                pr_info("Disabling %s control group feature\n",
                                        cgroup_opt_feature_names[i]);
                                /* only the first matching feature is taken */
                                break;
                        }
                }
        }

        return 1;
}
__setup("cgroup_disable=", cgroup_disable);

/* Weak no-op default; a non-weak definition elsewhere overrides it. */
void __init __weak enable_debug_cgroup(void) { }

/*
 * Handler for the "cgroup_debug" boot parameter: set cgroup_debug and call
 * enable_debug_cgroup().  @str is ignored.  Always returns 1.
 */
static int __init enable_cgroup_debug(char *str)
{
        cgroup_debug = true;
        enable_debug_cgroup();
        return 1;
}
__setup("cgroup_debug", enable_cgroup_debug);

/*
 * Handler for the "cgroup_favordynmods=" boot parameter: parse @str as a
 * boolean into have_favordynmods.  Returns 1 if kstrtobool() accepted the
 * value, 0 otherwise.
 */
static int __init cgroup_favordynmods_setup(char *str)
{
        int err = kstrtobool(str, &have_favordynmods);

        return !err;
}
__setup("cgroup_favordynmods=", cgroup_favordynmods_setup);

/**
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.
 *
 * Return: the pinned css on success, ERR_PTR(-EBADF) if @dentry is not a
 * cgroup directory, or ERR_PTR(-ENOENT) if the css doesn't exist or can't
 * be pinned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
                                                       struct cgroup_subsys *ss)
{
        struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
        struct file_system_type *s_type = dentry->d_sb->s_type;
        struct cgroup_subsys_state *css;
        struct cgroup *cgrp;

        /* is @dentry on a cgroup filesystem at all? */
        if (s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type)
                return ERR_PTR(-EBADF);

        /* ... and does it refer to a directory node? */
        if (!kn || kernfs_type(kn) != KERNFS_DIR)
                return ERR_PTR(-EBADF);

        rcu_read_lock();

        /*
         * This path doesn't originate from kernfs and @kn could already
         * have been or be removed at any point.  @kn->priv is RCU
         * protected for this access.  See css_release_work_fn() for details.
         */
        cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
        css = cgrp ? cgroup_css(cgrp, ss) : NULL;

        if (!css || !css_tryget_online(css))
                css = ERR_PTR(-ENOENT);

        rcu_read_unlock();
        return css;
}

/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Looks @id up in @ss's css_idr.  Must be called under rcu_read_lock(); a
 * one-time warning is emitted otherwise.
 *
 * Return: the css registered under @id, or NULL if there is none.
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *css;

        WARN_ON_ONCE(!rcu_read_lock_held());

        css = idr_find(&ss->css_idr, id);
        return css;
}

/**
 * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
 * @path: path on the default hierarchy
 *
 * Walk @path starting from the cgroup returned by current_cgns_cgroup_dfl()
 * and take a reference on the cgroup found there.
 *
 * Return: the referenced cgroup on success, ERR_PTR(-ENOENT) if @path does
 * not exist or the cgroup could not be pinned, ERR_PTR(-ENOTDIR) if @path
 * names something other than a directory.
 */
struct cgroup *cgroup_get_from_path(const char *path)
{
        struct cgroup *root_cgrp = current_cgns_cgroup_dfl();
        struct kernfs_node *kn;
        struct cgroup *found;

        kn = kernfs_walk_and_get(root_cgrp->kn, path);
        if (!kn)
                return ERR_PTR(-ENOENT);

        if (kernfs_type(kn) == KERNFS_DIR) {
                rcu_read_lock();

                found = rcu_dereference(*(void __rcu __force **)&kn->priv);
                if (!found || !cgroup_tryget(found))
                        found = ERR_PTR(-ENOENT);

                rcu_read_unlock();
        } else {
                found = ERR_PTR(-ENOTDIR);
        }

        /* drop the node reference taken by kernfs_walk_and_get() */
        kernfs_put(kn);
        return found;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);

/**
 * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
 * @fd: fd obtained by open(cgroup_dir)
 *
 * Resolve @fd, which should refer to an opened cgroup directory, to its
 * cgroup by way of cgroup_v1v2_get_from_file().
 *
 * Return: whatever cgroup_v1v2_get_from_file() returns for the file behind
 * @fd, or ERR_PTR(-EBADF) if @fd does not refer to a file.
 */
struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
        struct fd f = fdget_raw(fd);
        struct cgroup *cgrp;

        if (f.file) {
                cgrp = cgroup_v1v2_get_from_file(f.file);
                fdput(f);
        } else {
                cgrp = ERR_PTR(-EBADF);
        }

        return cgrp;
}

/**
 * cgroup_get_from_fd - get a cgroup2 cgroup pointer from a fd
 * @fd: fd obtained by open(cgroup2_dir)
 *
 * Same as cgroup_v1v2_get_from_fd(), except that a cgroup which is not on
 * the default hierarchy is rejected: its reference is dropped again and
 * ERR_PTR(-EBADF) is returned.  Errors from the lookup are passed through.
 */
struct cgroup *cgroup_get_from_fd(int fd)
{
        struct cgroup *cgrp;

        cgrp = cgroup_v1v2_get_from_fd(fd);
        if (IS_ERR(cgrp))
                return ERR_CAST(cgrp);

        if (cgroup_on_dfl(cgrp))
                return cgrp;

        cgroup_put(cgrp);
        return ERR_PTR(-EBADF);
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);

/*
 * Return 10 raised to @power, computed by repeated multiplication starting
 * from 1.  No overflow checking is done.
 */
static u64 power_of_ten(int power)
{
        u64 result = 1;

        for (; power; power--)
                result *= 10;

        return result;
}

/**
 * cgroup_parse_float - parse a floating number
 * @input: input string
 * @dec_shift: number of decimal digits to shift
 * @v: output
 *
 * Parse a decimal floating point number in @input and store the result in
 * @v with the decimal point shifted right by @dec_shift digits.  Fractional
 * digits beyond @dec_shift are rounded to the closest value rather than
 * truncated.  For example, if @input is "12.3456" and @dec_shift is 3, *@v
 * is set to 12346.
 *
 * Return: 0 on success, -EINVAL if no number could be parsed or the
 * fractional part parsed as negative.
 *
 * Nothing in here is cgroup specific; cgroup just happens to be the only
 * user at the moment.
 */
int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
{
        s64 whole, frac = 0;
        int fstart = 0, fend = 0, flen = 0;

        if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend) ||
            frac < 0)
                return -EINVAL;

        /* number of characters consumed for the fractional part, if any */
        if (fend > fstart)
                flen = fend - fstart;

        /* scale the fraction so that it carries exactly @dec_shift digits */
        if (flen >= dec_shift)
                frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
        else
                frac *= power_of_ten(dec_shift - flen);

        *v = whole * power_of_ten(dec_shift) + frac;
        return 0;
}

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

/*
 * Pick the cgroup to record in @skcd, take a reference on it and a
 * cgroup_bpf reference as well.  Runs entirely under rcu_read_lock().
 *
 * In interrupt context the default root cgroup is used.  Otherwise the
 * dfl_cgrp of current's css_set is used, retrying until cgroup_tryget()
 * succeeds.
 */
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgroup;

        rcu_read_lock();

        if (in_interrupt()) {
                /* Don't associate the sock with unrelated interrupted task's cgroup. */
                cgroup = &cgrp_dfl_root.cgrp;
                cgroup_get(cgroup);
        } else {
                for (;;) {
                        struct css_set *cset = task_css_set(current);

                        if (likely(cgroup_tryget(cset->dfl_cgrp))) {
                                cgroup = cset->dfl_cgrp;
                                break;
                        }
                        cpu_relax();
                }
        }

        skcd->cgroup = cgroup;
        cgroup_bpf_get(cgroup);
        rcu_read_unlock();
}

/*
 * Take an additional cgroup reference and cgroup_bpf reference on the
 * cgroup that @skcd points at.
 */
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgrp;

        cgrp = sock_cgroup_ptr(skcd);

        /*
         * The socket being cloned may sit in an empty cgroup that has
         * already been rmdir'd, so cgroup_get_live() must not be used here.
         */
        cgroup_get(cgrp);
        cgroup_bpf_get(cgrp);
}

/*
 * Drop the cgroup_bpf reference and then the cgroup reference on the cgroup
 * that @skcd points at.
 */
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgrp;

        cgrp = sock_cgroup_ptr(skcd);
        cgroup_bpf_put(cgrp);
        cgroup_put(cgrp);
}

#endif        /* CONFIG_SOCK_CGROUP_DATA */

#ifdef CONFIG_SYSFS
/*
 * Print the names of all entries in @files that carry CFTYPE_NS_DELEGATABLE
 * into @buf, one per line.  @files is terminated by an entry with an empty
 * name and may be NULL.  When @prefix is non-NULL each name is emitted as
 * "<prefix>.<name>".
 *
 * @size is the room available in @buf.  If the accumulated length reaches
 * @size a warning is raised and the walk stops.
 *
 * Returns the accumulated snprintf() length.
 */
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
                                      ssize_t size, const char *prefix)
{
        struct cftype *cft;
        ssize_t len = 0;

        if (!files)
                return 0;

        for (cft = files; cft->name[0] != '\0'; cft++) {
                if (cft->flags & CFTYPE_NS_DELEGATABLE) {
                        if (prefix)
                                len += snprintf(buf + len, size - len, "%s.", prefix);

                        len += snprintf(buf + len, size - len, "%s\n", cft->name);

                        if (WARN_ON(len >= size))
                                break;
                }
        }

        return len;
}

static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
                              char *buf)
{
        struct cgroup_subsys *ss;
        int ssid;
        ssize_t ret = 0;

        ret = show_delegatable_files(cgroup_base_files, buf + ret,
                                     PAGE_SIZE - ret, NULL);
        if (cgroup_psi_enabled())
                ret += show_delegatable_files(cgroup_psi_files, buf + ret,
                                              PAGE_SIZE - ret, NULL);

        for_each_subsys(ss, ssid)
                ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
                                              PAGE_SIZE - ret,
                                              cgroup_subsys_name[ssid]);

        return ret;
}
static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);

/*
 * sysfs show handler for the "features" attribute.  Writes a fixed list of
 * feature names, one per line, into @buf and returns the snprintf() length.
 */
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
                             char *buf)
{
        static const char feature_list[] =
                "nsdelegate\n"
                "favordynmods\n"
                "memory_localevents\n"
                "memory_recursiveprot\n"
                "memory_hugetlb_accounting\n";

        return snprintf(buf, PAGE_SIZE, "%s", feature_list);
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);

/* NULL-terminated list of the attributes exposed in the "cgroup" group */
static struct attribute *cgroup_sysfs_attrs[] = {
        &cgroup_delegate_attr.attr,
        &cgroup_features_attr.attr,
        NULL,
};

/* attribute group named "cgroup" that bundles the attributes above */
static const struct attribute_group cgroup_sysfs_attr_group = {
        .name = "cgroup",
        .attrs = cgroup_sysfs_attrs,
};

/*
 * Create the "cgroup" attribute group under kernel_kobj.  Returns the result
 * of sysfs_create_group().
 */
static int __init cgroup_sysfs_init(void)
{
        int err;

        err = sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
        return err;
}
subsys_initcall(cgroup_sysfs_init);

#endif /* CONFIG_SYSFS */








































































































































































































































    1 
    1 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/exit.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
#include <linux/uaccess.h>

#include <uapi/linux/wait.h>

#include <asm/unistd.h>
#include <asm/mmu_context.h>

#include "exit.h"

/*
 * Default oops limit.  It is meant to be large enough not to take down a
 * system whose kernel oopses at random every now and then, yet small enough
 * that 32-bit refcounts or the ldsem writer count cannot be overflowed.
 */
static unsigned int oops_limit = 10000;

#ifdef CONFIG_SYSCTL
/* sysctl entry "oops_limit": read/write (0644) access to oops_limit */
static struct ctl_table kern_exit_table[] = {
        {
                .procname       = "oops_limit",
                .mode           = 0644,
                .data           = &oops_limit,
                .maxlen         = sizeof(oops_limit),
                .proc_handler   = proc_douintvec,
        },
};

/* Register kern_exit_table under "kernel".  Always returns 0. */
static __init int kernel_exit_sysctls_init(void)
{
        register_sysctl_init("kernel", kern_exit_table);

        return 0;
}
late_initcall(kernel_exit_sysctls_init);
#endif

/* oops counter, starts at zero; exported read-only through sysfs below */
static atomic_t oops_count = ATOMIC_INIT(0);

#ifdef CONFIG_SYSFS
/* sysfs show handler: print the current value of oops_count into @page */
static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
                               char *page)
{
        int count = atomic_read(&oops_count);

        return sysfs_emit(page, "%d\n", count);
}

static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);

/*
 * Add the oops_count attribute file to kernel_kobj.  The result of
 * sysfs_add_file_to_group() is not checked; always returns 0.
 */
static __init int kernel_exit_sysfs_init(void)
{
        sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);

        return 0;
}
late_initcall(kernel_exit_sysfs_init);
#endif

/*
 * Unlink @p from the pid tables and task lists and decrement nr_threads.
 *
 * Every task is detached from its PIDTYPE_PID entry and removed from the
 * thread_node list.  When @group_dead is true the TGID/PGID/SID entries are
 * detached as well, @p is removed from the tasks and sibling lists, and the
 * per-cpu process_counts counter is decremented.
 */
static void __unhash_process(struct task_struct *p, bool group_dead)
{
        nr_threads--;
        detach_pid(p, PIDTYPE_PID);
        if (group_dead) {
                detach_pid(p, PIDTYPE_TGID);
                detach_pid(p, PIDTYPE_PGID);
                detach_pid(p, PIDTYPE_SID);

                list_del_rcu(&p->tasks);
                list_del_init(&p->sibling);
                __this_cpu_dec(process_counts);
        }
        list_del_rcu(&p->thread_node);
}

/*
 * Tear down the signal-related state of @tsk: run the POSIX CPU timer exit
 * hooks, fold the task's accounting counters into its signal_struct, unhash
 * the task, flush its pending signals and detach and clean up its sighand.
 *
 * This function expects the tasklist_lock write-locked.
 */
static void __exit_signal(struct task_struct *tsk)
{
        struct signal_struct *sig = tsk->signal;
        /* the group is treated as dead when @tsk is the thread group leader */
        bool group_dead = thread_group_leader(tsk);
        struct sighand_struct *sighand;
        struct tty_struct *tty;
        u64 utime, stime;

        sighand = rcu_dereference_check(tsk->sighand,
                                        lockdep_tasklist_lock_is_held());
        spin_lock(&sighand->siglock);

#ifdef CONFIG_POSIX_TIMERS
        posix_cpu_timers_exit(tsk);
        if (group_dead)
                posix_cpu_timers_exit_group(tsk);
#endif

        if (group_dead) {
                /*
                 * Detach the tty from the signal_struct here; the reference
                 * is only dropped after siglock has been released (below).
                 */
                tty = sig->tty;
                sig->tty = NULL;
        } else {
                /*
                 * If there is any task waiting for the group exit
                 * then notify it:
                 */
                if (sig->notify_count > 0 && !--sig->notify_count)
                        wake_up_process(sig->group_exec_task);

                /* do not leave curr_target pointing at the exiting thread */
                if (tsk == sig->curr_target)
                        sig->curr_target = next_thread(tsk);
        }

        add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
                              sizeof(unsigned long long));

        /*
         * Accumulate here the counters for all threads as they die. We could
         * skip the group leader because it is the last user of signal_struct,
         * but we want to avoid the race with thread_group_cputime() which can
         * see the empty ->thread_head list.
         */
        task_cputime(tsk, &utime, &stime);
        write_seqlock(&sig->stats_lock);
        sig->utime += utime;
        sig->stime += stime;
        sig->gtime += task_gtime(tsk);
        sig->min_flt += tsk->min_flt;
        sig->maj_flt += tsk->maj_flt;
        sig->nvcsw += tsk->nvcsw;
        sig->nivcsw += tsk->nivcsw;
        sig->inblock += task_io_get_inblock(tsk);
        sig->oublock += task_io_get_oublock(tsk);
        task_io_accounting_add(&sig->ioac, &tsk->ioac);
        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
        sig->nr_threads--;
        /* unhashing happens inside the stats_lock write section */
        __unhash_process(tsk, group_dead);
        write_sequnlock(&sig->stats_lock);

        /*
         * Do this under ->siglock, we can race with another thread
         * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
         */
        flush_sigqueue(&tsk->pending);
        tsk->sighand = NULL;
        spin_unlock(&sighand->siglock);

        __cleanup_sighand(sighand);
        clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
        if (group_dead) {
                flush_sigqueue(&sig->shared_pending);
                tty_kref_put(tty);
        }
}

/*
 * RCU callback (queued by put_task_struct_rcu_user()).  Recovers the
 * task_struct that embeds @rhp, runs the kprobe, rethook, perf and tracing
 * teardown hooks for it and finally drops a task_struct reference.
 */
static void delayed_put_task_struct(struct rcu_head *rhp)
{
        struct task_struct *task;

        task = container_of(rhp, struct task_struct, rcu);

        kprobe_flush_task(task);
        rethook_flush_task(task);
        perf_event_delayed_put(task);
        trace_sched_process_free(task);
        put_task_struct(task);
}

/*
 * Drop one reference from @task->rcu_users.  When that was the last one,
 * queue delayed_put_task_struct() to run via call_rcu().
 */
void put_task_struct_rcu_user(struct task_struct *task)
{
        if (!refcount_dec_and_test(&task->rcu_users))
                return;

        call_rcu(&task->rcu, delayed_put_task_struct);
}

/*
 * Default, empty implementation.  Declared __weak, so a non-weak definition
 * elsewhere overrides it at link time.  Called from release_task().
 */
void __weak release_thread(struct task_struct *dead_task)
{
}

/*
 * Release the task @p: uncharge it from the NPROC ucount, release its
 * cgroup and ptrace state, tear down its signal state under tasklist_lock,
 * flush its /proc entries and drop the rcu_users reference.
 *
 * If @p was the last non-leader thread of a group whose leader is a zombie,
 * the leader's parent is notified; when do_notify_parent() returns nonzero
 * the leader is marked EXIT_DEAD and the whole sequence is repeated for it.
 */
void release_task(struct task_struct *p)
{
        struct task_struct *leader;
        struct pid *thread_pid;
        int zap_leader;
repeat:
        /* don't need to get the RCU readlock here - the process is dead and
         * can't be modifying its own credentials. But shut RCU-lockdep up */
        rcu_read_lock();
        dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        rcu_read_unlock();

        cgroup_release(p);

        write_lock_irq(&tasklist_lock);
        ptrace_release_task(p);
        /* keep a pid reference for proc_flush_pid() after the lock is dropped */
        thread_pid = get_pid(p->thread_pid);
        __exit_signal(p);

        /*
         * If we are the last non-leader member of the thread
         * group, and the leader is zombie, then notify the
         * group leader's parent process. (if it wants notification.)
         */
        zap_leader = 0;
        leader = p->group_leader;
        if (leader != p && thread_group_empty(leader)
                        && leader->exit_state == EXIT_ZOMBIE) {
                /*
                 * If we were the last child thread and the leader has
                 * exited already, and the leader's parent ignores SIGCHLD,
                 * then we are the one who should release the leader.
                 */
                zap_leader = do_notify_parent(leader, leader->exit_signal);
                if (zap_leader)
                        leader->exit_state = EXIT_DEAD;
        }

        write_unlock_irq(&tasklist_lock);
        seccomp_filter_release(p);
        proc_flush_pid(thread_pid);
        put_pid(thread_pid);
        release_thread(p);
        put_task_struct_rcu_user(p);

        /* go around once more for the leader if it has to be released too */
        p = leader;
        if (unlikely(zap_leader))
                goto repeat;
}

/*
 * Wake up the task recorded in @w->task, if there is one.
 *
 * Returns the result of wake_up_process() for that task, or 0 when no task
 * is recorded in @w.
 */
int rcuwait_wake_up(struct rcuwait *w)
{
        int ret = 0;
        struct task_struct *task;

        rcu_read_lock();

        /*
         * Order condition vs @task, such that everything prior to the load
         * of @task is visible. This is the condition as to why the user called
         * rcuwait_wake_up() in the first place. Pairs with set_current_state()
         * barrier (A) in rcuwait_wait_event().
         *
         *    WAIT                WAKE
         *    [S] tsk = current          [S] cond = true
         *        MB (A)              MB (B)
         *    [L] cond                  [L] tsk
         */
        smp_mb(); /* (B) */

        task = rcu_dereference(w->task);
        if (task)
                ret = wake_up_process(task);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);

/*
 * Determine if a process group is "orphaned", according to the POSIX
 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 * by terminal-generated stop signals.  Newly orphaned process groups are
 * to receive a SIGHUP and a SIGCONT.
 *
 * Returns 0 as soon as a member of @pgrp is found whose real parent is in a
 * different process group but in the same session; returns 1 otherwise.
 * @ignored_task, exited tasks with an empty thread group, and tasks whose
 * real parent is the global init are not considered.
 *
 * "I ask you, have you ever known what it is to be an orphan?"
 */
static int will_become_orphaned_pgrp(struct pid *pgrp,
                                        struct task_struct *ignored_task)
{
        struct task_struct *p;

        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
                if (p == ignored_task)
                        continue;
                if (p->exit_state && thread_group_empty(p))
                        continue;
                if (is_global_init(p->real_parent))
                        continue;

                /* a parent outside the group but inside the session anchors it */
                if (task_pgrp(p->real_parent) != pgrp &&
                    task_session(p->real_parent) == task_session(p))
                        return 0;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);

        return 1;
}

/*
 * Evaluate will_become_orphaned_pgrp() for current's process group, with no
 * task ignored, while holding tasklist_lock for reading.
 */
int is_current_pgrp_orphaned(void)
{
        int orphaned;

        read_lock(&tasklist_lock);
        orphaned = will_become_orphaned_pgrp(task_pgrp(current), NULL);
        read_unlock(&tasklist_lock);

        return orphaned;
}

/*
 * Return true if any task in @pgrp has SIGNAL_STOP_STOPPED set in its
 * signal flags, false otherwise.
 */
static bool has_stopped_jobs(struct pid *pgrp)
{
        struct task_struct *task;

        do_each_pid_task(pgrp, PIDTYPE_PGID, task) {
                if (task->signal->flags & SIGNAL_STOP_STOPPED)
                        return true;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, task);

        return false;
}

/*
 * Check to see if any process groups have become orphaned as
 * a result of our exiting, and if they have any stopped jobs,
 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 *
 * With @parent == NULL (exit case) @tsk's real parent is used and @tsk
 * itself is ignored in the orphan check.  With @parent given (reparent
 * case) no task is ignored.
 */
static void
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
{
        struct pid *pgrp = task_pgrp(tsk);
        struct task_struct *ignored_task;

        if (parent) {
                /* reparent: our child is in a different pgrp than
                 * we are, and it was the only connection outside.
                 */
                ignored_task = NULL;
        } else {
                /* exit: our father is in a different pgrp than
                 * we are and we were the only connection outside.
                 */
                ignored_task = tsk;
                parent = tsk->real_parent;
        }

        if (task_pgrp(parent) == pgrp)
                return;
        if (task_session(parent) != task_session(tsk))
                return;
        if (!will_become_orphaned_pgrp(pgrp, ignored_task))
                return;
        if (!has_stopped_jobs(pgrp))
                return;

        __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
        __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
}

/*
 * Exit-time rendezvous with a coredump of @tsk's thread group.
 *
 * Sets PF_POSTCOREDUMP on @tsk and samples ->signal->core_state under
 * siglock.  If a core_state is present, the task drops its count in
 * ->nr_threads (completing ->startup when the count hits zero) and then
 * sleeps until its on-stack core_thread has ->task cleared.
 */
static void coredump_task_exit(struct task_struct *tsk)
{
        struct core_state *core_state;

        /*
         * Serialize with any possible pending coredump.
         * We must hold siglock around checking core_state
         * and setting PF_POSTCOREDUMP.  The core-inducing thread
         * will increment ->nr_threads for each thread in the
         * group without PF_POSTCOREDUMP set.
         */
        spin_lock_irq(&tsk->sighand->siglock);
        tsk->flags |= PF_POSTCOREDUMP;
        core_state = tsk->signal->core_state;
        spin_unlock_irq(&tsk->sighand->siglock);
        if (core_state) {
                struct core_thread self;

                self.task = current;
                /*
                 * Only a PF_SIGNALED task links itself behind the dumper;
                 * any other task clears self.task and so falls straight
                 * through the wait loop below.
                 */
                if (self.task->flags & PF_SIGNALED)
                        self.next = xchg(&core_state->dumper.next, &self);
                else
                        self.task = NULL;
                /*
                 * Implies mb(), the result of xchg() must be visible
                 * to core_state->dumper.
                 */
                if (atomic_dec_and_test(&core_state->nr_threads))
                        complete(&core_state->startup);

                /* Sleep (freezable, uninterruptible) until self.task is NULL. */
                for (;;) {
                        set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
                        if (!self.task) /* see coredump_finish() */
                                break;
                        schedule();
                }
                __set_current_state(TASK_RUNNING);
        }
}

#ifdef CONFIG_MEMCG
/*
 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
        /* c: candidate new owner, g: process being scanned, p: the caller */
        struct task_struct *c, *g, *p = current;

retry:
        /*
         * If the exiting or execing task is not the owner, it's
         * someone else's problem.
         */
        if (mm->owner != p)
                return;
        /*
         * The current owner is exiting/execing and there are no other
         * candidates.  Do not leave the mm pointing to a possibly
         * freed task structure.
         */
        if (atomic_read(&mm->mm_users) <= 1) {
                WRITE_ONCE(mm->owner, NULL);
                return;
        }

        read_lock(&tasklist_lock);
        /*
         * Search in the children
         */
        list_for_each_entry(c, &p->children, sibling) {
                if (c->mm == mm)
                        goto assign_new_owner;
        }

        /*
         * Search in the siblings
         */
        list_for_each_entry(c, &p->real_parent->children, sibling) {
                if (c->mm == mm)
                        goto assign_new_owner;
        }

        /*
         * Search through everything else, we should not get here often.
         */
        for_each_process(g) {
                /* Tasks flagged PF_KTHREAD are never candidates. */
                if (g->flags & PF_KTHREAD)
                        continue;
                for_each_thread(g, c) {
                        if (c->mm == mm)
                                goto assign_new_owner;
                        /*
                         * A thread with some other, non-NULL mm ends the
                         * scan of this process (presumably its threads all
                         * share that mm - TODO confirm).
                         */
                        if (c->mm)
                                break;
                }
        }
        read_unlock(&tasklist_lock);
        /*
         * We found no owner yet mm_users > 1: this implies that we are
         * most likely racing with swapoff (try_to_unuse()) or /proc or
         * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
         */
        WRITE_ONCE(mm->owner, NULL);
        return;

assign_new_owner:
        /* The caller itself must never be picked as its own successor. */
        BUG_ON(c == p);
        /* Hold a reference on c until the hand-over below is finished. */
        get_task_struct(c);
        /*
         * The task_lock protects c->mm from changing.
         * We always want mm->owner->mm == mm
         */
        task_lock(c);
        /*
         * Delay read_unlock() till we have the task_lock()
         * to ensure that c does not slip away underneath us
         */
        read_unlock(&tasklist_lock);
        /* c no longer uses this mm: undo and restart the whole search. */
        if (c->mm != mm) {
                task_unlock(c);
                put_task_struct(c);
                goto retry;
        }
        WRITE_ONCE(mm->owner, c);
        lru_gen_migrate_mm(mm);
        task_unlock(c);
        put_task_struct(c);
}
#endif /* CONFIG_MEMCG */

/*
 * Turn us into a lazy TLB process if we
 * aren't already..
 */
static void exit_mm(void)
{
        struct mm_struct *mm = current->mm;

        /* Called unconditionally, i.e. also when current has no mm. */
        exit_mm_release(current, mm);
        if (!mm)
                return;
        mmap_read_lock(mm);
        mmgrab_lazy_tlb(mm);
        /* The mm being detached must be the one currently active. */
        BUG_ON(mm != current->active_mm);
        /* more a memory barrier than a real lock */
        task_lock(current);
        /*
         * When a thread stops operating on an address space, the loop
         * in membarrier_private_expedited() may not observe that
         * tsk->mm, and the loop in membarrier_global_expedited() may
         * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
         * rq->membarrier_state, so those would not issue an IPI.
         * Membarrier requires a memory barrier after accessing
         * user-space memory, before clearing tsk->mm or the
         * rq->membarrier_state.
         */
        smp_mb__after_spinlock();
        /* ->mm is cleared and lazy-TLB mode entered with local irqs off. */
        local_irq_disable();
        current->mm = NULL;
        membarrier_update_current_mm(NULL);
        enter_lazy_tlb(mm, current);
        local_irq_enable();
        task_unlock(current);
        mmap_read_unlock(mm);
        /* Done only after current->mm is NULL and all locks above are dropped. */
        mm_update_next_owner(mm);
        mmput(mm);
        if (test_thread_flag(TIF_MEMDIE))
                exit_oom_victim();
}

/*
 * Return the first thread in @p's thread group that does not have
 * PF_EXITING set, or NULL when every thread is already exiting.
 */
static struct task_struct *find_alive_thread(struct task_struct *p)
{
        struct task_struct *thread;

        for_each_thread(p, thread) {
                if (thread->flags & PF_EXITING)
                        continue;
                return thread;
        }

        return NULL;
}

/*
 * Determine the pid namespace's child reaper while @father is exiting.
 *
 * Returns pid_ns->child_reaper unless that is @father itself.  In that
 * case another thread of @father that is not PF_EXITING is installed as
 * the new child_reaper and returned.  If no such thread exists,
 * tasklist_lock is dropped, every task queued on @dead is released,
 * zap_pid_ns_processes() is run, the lock is taken again and @father is
 * returned.
 *
 * Entered and left with tasklist_lock held for writing.
 */
static struct task_struct *find_child_reaper(struct task_struct *father,
                                                struct list_head *dead)
        __releases(&tasklist_lock)
        __acquires(&tasklist_lock)
{
        struct pid_namespace *pid_ns = task_active_pid_ns(father);
        struct task_struct *reaper = pid_ns->child_reaper;
        struct task_struct *p, *n;

        /* Common case: the exiting task is not the namespace reaper. */
        if (likely(reaper != father))
                return reaper;

        /* The reaper is exiting: hand the role to a surviving thread. */
        reaper = find_alive_thread(father);
        if (reaper) {
                pid_ns->child_reaper = reaper;
                return reaper;
        }

        write_unlock_irq(&tasklist_lock);

        /* Flush the pending @dead list now that the lock is dropped. */
        list_for_each_entry_safe(p, n, dead, ptrace_entry) {
                list_del_init(&p->ptrace_entry);
                release_task(p);
        }

        zap_pid_ns_processes(pid_ns);
        write_lock_irq(&tasklist_lock);

        return father;
}

/*
 * When we die, we re-parent all our children, and try to:
 * 1. give them to another thread in our thread group, if such a member exists
 * 2. give it to the first ancestor process which prctl'd itself as a
 *    child_subreaper for its children (like a service manager)
 * 3. give it to the init process (PID 1) in our pid namespace
 */
static struct task_struct *find_new_reaper(struct task_struct *father,
                                           struct task_struct *child_reaper)
{
        struct task_struct *candidate, *ancestor;
        unsigned int ns_level;

        /* 1. Prefer a surviving thread of the dying thread group. */
        candidate = find_alive_thread(father);
        if (candidate)
                return candidate;

        /* 3. No subreaper anywhere above us: fall back to child_reaper. */
        if (!father->signal->has_child_subreaper)
                return child_reaper;

        /*
         * 2. Find the first ->is_child_subreaper ancestor in our pid_ns.
         * We can't check reaper != child_reaper to ensure we do not
         * cross the namespaces, the exiting parent could be injected
         * by setns() + fork().
         * We check pid->level, this is slightly more efficient than
         * task_active_pid_ns(reaper) != task_active_pid_ns(father).
         */
        ns_level = task_pid(father)->level;
        ancestor = father->real_parent;
        while (task_pid(ancestor)->level == ns_level &&
               ancestor != &init_task) {
                if (ancestor->signal->is_child_subreaper) {
                        candidate = find_alive_thread(ancestor);
                        if (candidate)
                                return candidate;
                }
                ancestor = ancestor->real_parent;
        }

        return child_reaper;
}

/*
 * Any that need to be release_task'd are put on the @dead list.
 */
static void reparent_leader(struct task_struct *father, struct task_struct *p,
                                struct list_head *dead)
{
        /* Already EXIT_DEAD: nothing left to do for this child. */
        if (unlikely(p->exit_state == EXIT_DEAD))
                return;

        /* We don't want people slaying init. */
        p->exit_signal = SIGCHLD;

        /*
         * An untraced zombie whose thread group is empty has already
         * exited: tell the new parent about it.  When do_notify_parent()
         * returns true the child is marked EXIT_DEAD and queued on @dead.
         */
        if (!p->ptrace && p->exit_state == EXIT_ZOMBIE &&
            thread_group_empty(p) && do_notify_parent(p, p->exit_signal)) {
                p->exit_state = EXIT_DEAD;
                list_add(&p->ptrace_entry, dead);
        }

        kill_orphaned_pgrp(p, father);
}

/*
 * This does two things:
 *
 * A.  Make init inherit all the child processes
 * B.  Check to see if any process groups have become orphaned
 *        as a result of our exiting, and if they have any stopped
 *        jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 */
static void forget_original_parent(struct task_struct *father,
                                        struct list_head *dead)
{
        struct task_struct *p, *t, *reaper;

        /* Let go of anything @father was tracing first. */
        if (unlikely(!list_empty(&father->ptraced)))
                exit_ptrace(father, dead);

        /* Can drop and reacquire tasklist_lock */
        reaper = find_child_reaper(father, dead);
        /* No children: nothing to reparent. */
        if (list_empty(&father->children))
                return;

        reaper = find_new_reaper(father, reaper);
        list_for_each_entry(p, &father->children, sibling) {
                /* Re-point every thread of this child at the new reaper. */
                for_each_thread(p, t) {
                        RCU_INIT_POINTER(t->real_parent, reaper);
                        /* Untraced <=> ->parent was @father; anything else is a bug. */
                        BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
                        if (likely(!t->ptrace))
                                t->parent = t->real_parent;
                        /* Send the thread's pdeath_signal if one is set. */
                        if (t->pdeath_signal)
                                group_send_sig_info(t->pdeath_signal,
                                                    SEND_SIG_NOINFO, t,
                                                    PIDTYPE_TGID);
                }
                /*
                 * If this is a threaded reparent there is no need to
                 * notify anyone anything has happened.
                 */
                if (!same_thread_group(reaper, father))
                        reparent_leader(father, p, dead);
        }
        /* Move the whole children list to the tail of the reaper's list. */
        list_splice_tail_init(&father->children, &reaper->children);
}

/*
 * Send signals to all our closest relatives so that they know
 * to properly mourn us..
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
        bool autoreap;
        struct task_struct *p, *n;
        /* Tasks to release_task() once tasklist_lock has been dropped. */
        LIST_HEAD(dead);

        write_lock_irq(&tasklist_lock);
        forget_original_parent(tsk, &dead);

        if (group_dead)
                kill_orphaned_pgrp(tsk->group_leader, NULL);

        tsk->exit_state = EXIT_ZOMBIE;
        /*
         * sub-thread or delay_group_leader(), wake up the
         * PIDFD_THREAD waiters.
         */
        if (!thread_group_empty(tsk))
                do_notify_pidfd(tsk);

        if (unlikely(tsk->ptrace)) {
                /*
                 * Traced: ->exit_signal is used only for a thread group
                 * leader with an empty group that is not
                 * ptrace_reparented(); every other case sends SIGCHLD.
                 */
                int sig = thread_group_leader(tsk) &&
                                thread_group_empty(tsk) &&
                                !ptrace_reparented(tsk) ?
                        tsk->exit_signal : SIGCHLD;
                autoreap = do_notify_parent(tsk, sig);
        } else if (thread_group_leader(tsk)) {
                /* Untraced leader: notify only once the group is empty. */
                autoreap = thread_group_empty(tsk) &&
                        do_notify_parent(tsk, tsk->exit_signal);
        } else {
                /* Untraced sub-thread: always reaped right here. */
                autoreap = true;
        }

        if (autoreap) {
                tsk->exit_state = EXIT_DEAD;
                list_add(&tsk->ptrace_entry, &dead);
        }

        /* mt-exec, de_thread() is waiting for group leader */
        if (unlikely(tsk->signal->notify_count < 0))
                wake_up_process(tsk->signal->group_exec_task);
        write_unlock_irq(&tasklist_lock);

        /* Release everything collected on @dead, now without the lock. */
        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
                list_del_init(&p->ptrace_entry);
                release_task(p);
        }
}

#ifdef CONFIG_DEBUG_STACK_USAGE
/*
 * Log a message whenever the exiting task leaves fewer unused stack
 * bytes (as reported by stack_not_used()) than any task checked before,
 * and remember that value as the new low-water mark.
 */
static void check_stack_usage(void)
{
        static DEFINE_SPINLOCK(low_water_lock);
        /*
         * Same type as 'free': avoids a mixed-sign comparison and a
         * narrowing store when the mark is updated.
         */
        static unsigned long lowest_to_date = THREAD_SIZE;
        unsigned long free;

        free = stack_not_used(current);

        /*
         * Lockless fast path.  The mark is only written under
         * low_water_lock; READ_ONCE() pairs with the WRITE_ONCE() there.
         */
        if (free >= READ_ONCE(lowest_to_date))
                return;

        spin_lock(&low_water_lock);
        /* Re-check under the lock: another task may have lowered the mark. */
        if (free < lowest_to_date) {
                pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
                        current->comm, task_pid_nr(current), free);
                WRITE_ONCE(lowest_to_date, free);
        }
        spin_unlock(&low_water_lock);
}
#else
/* Stack usage checking is compiled out. */
static inline void check_stack_usage(void) {}
#endif

/*
 * Drop @tsk from its signal struct's quick_threads count under siglock.
 * When the count reaches zero and no group exit has been flagged yet,
 * flag one: set SIGNAL_GROUP_EXIT, record @code as the group exit code
 * and reset group_stop_count.
 */
static void synchronize_group_exit(struct task_struct *tsk, long code)
{
        struct signal_struct *sig = tsk->signal;
        struct sighand_struct *sh = tsk->sighand;

        spin_lock_irq(&sh->siglock);
        sig->quick_threads--;
        if (sig->quick_threads != 0 || (sig->flags & SIGNAL_GROUP_EXIT))
                goto out_unlock;

        sig->flags = SIGNAL_GROUP_EXIT;
        sig->group_exit_code = code;
        sig->group_stop_count = 0;
out_unlock:
        spin_unlock_irq(&sh->siglock);
}

/*
 * Terminate the current task with exit code @code.  Does not return:
 * the last call is do_task_dead().
 *
 * The teardown steps run in the order written; several of them carry
 * their own ordering notes below.  @group_dead is non-zero when this
 * task's exit dropped tsk->signal->live to zero.
 */
void __noreturn do_exit(long code)
{
        struct task_struct *tsk = current;
        int group_dead;

        WARN_ON(irqs_disabled());

        synchronize_group_exit(tsk, code);

        WARN_ON(tsk->plug);

        kcov_task_exit(tsk);
        kmsan_task_exit(tsk);

        coredump_task_exit(tsk);
        ptrace_event(PTRACE_EVENT_EXIT, code);
        user_events_exit(tsk);

        io_uring_files_cancel();
        exit_signals(tsk);  /* sets PF_EXITING */

        acct_update_integrals(tsk);
        /* True only for the thread that takes ->live down to zero. */
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
                /*
                 * If the last thread of global init has exited, panic
                 * immediately to get a useable coredump.
                 */
                if (unlikely(is_global_init(tsk)))
                        panic("Attempted to kill init! exitcode=0x%08x\n",
                                tsk->signal->group_exit_code ?: (int)code);

#ifdef CONFIG_POSIX_TIMERS
                hrtimer_cancel(&tsk->signal->real_timer);
                exit_itimers(tsk);
#endif
                if (tsk->mm)
                        setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
        }
        acct_collect(code, group_dead);
        if (group_dead)
                tty_audit_exit();
        audit_free(tsk);

        tsk->exit_code = code;
        taskstats_exit(tsk, group_dead);

        /* After this call current->mm is NULL (see exit_mm()). */
        exit_mm();

        if (group_dead)
                acct_process();
        trace_sched_process_exit(tsk);

        exit_sem(tsk);
        exit_shm(tsk);
        exit_files(tsk);
        exit_fs(tsk);
        if (group_dead)
                disassociate_ctty(1);
        exit_task_namespaces(tsk);
        exit_task_work(tsk);
        exit_thread(tsk);

        /*
         * Flush inherited counters to the parent - before the parent
         * gets woken up by child-exit notifications.
         *
         * because of cgroup mode, must be called before cgroup_exit()
         */
        perf_event_exit_task(tsk);

        sched_autogroup_exit_task(tsk);
        cgroup_exit(tsk);

        /*
         * FIXME: do that only when needed, using sched_exit tracepoint
         */
        flush_ptrace_hw_breakpoint(tsk);

        exit_tasks_rcu_start();
        /* Reparents children and notifies the parent; may release tsk itself. */
        exit_notify(tsk, group_dead);
        proc_exit_connector(tsk);
        mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
        if (unlikely(current->pi_state_cache))
                kfree(current->pi_state_cache);
#endif
        /*
         * Make sure we are holding no locks:
         */
        debug_check_no_locks_held();

        if (tsk->io_context)
                exit_io_context(tsk);

        if (tsk->splice_pipe)
                free_pipe_info(tsk->splice_pipe);

        if (tsk->task_frag.page)
                put_page(tsk->task_frag.page);

        exit_task_stack_account(tsk);

        check_stack_usage();
        /* Preemption is not re-enabled on this path. */
        preempt_disable();
        if (tsk->nr_dirtied)
                __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
        exit_rcu();
        exit_tasks_rcu_finish();

        lockdep_free_task(tsk);
        do_task_dead();
}

/*
 * Kill the current task after a fatal kernel error.
 * @signr: value handed to do_exit() as the exit code.
 *
 * Panics instead when called in interrupt context, for the idle task
 * (pid 0), or once the oops counter reaches a non-zero oops_limit.
 * A task that is already PF_EXITING is parked via do_task_dead()
 * rather than being sent through do_exit() a second time.
 */
void __noreturn make_task_dead(int signr)
{
        /*
         * Take the task off the cpu after something catastrophic has
         * happened.
         *
         * We can get here from a kernel oops, sometimes with preemption off.
         * Start by checking for critical errors.
         * Then fix up important state like USER_DS and preemption.
         * Then do everything else.
         */
        struct task_struct *tsk = current;
        unsigned int limit;

        if (unlikely(in_interrupt()))
                panic("Aiee, killing interrupt handler!");
        if (unlikely(!tsk->pid))
                panic("Attempted to kill the idle task!");

        /* do_exit() warns when entered with irqs off, so re-enable them. */
        if (unlikely(irqs_disabled())) {
                pr_info("note: %s[%d] exited with irqs disabled\n",
                        current->comm, task_pid_nr(current));
                local_irq_enable();
        }
        if (unlikely(in_atomic())) {
                pr_info("note: %s[%d] exited with preempt_count %d\n",
                        current->comm, task_pid_nr(current),
                        preempt_count());
                preempt_count_set(PREEMPT_ENABLED);
        }

        /*
         * Every time the system oopses, if the oops happens while a reference
         * to an object was held, the reference leaks.
         * If the oops doesn't also leak memory, repeated oopsing can cause
         * reference counters to wrap around (if they're not using refcount_t).
         * This means that repeated oopsing can make unexploitable-looking bugs
         * exploitable through repeated oopsing.
         * To make sure this can't happen, place an upper bound on how often the
         * kernel may oops without panic().
         */
        limit = READ_ONCE(oops_limit);
        /* limit == 0 disables the check; limit is unsigned, hence %u. */
        if (atomic_inc_return(&oops_count) >= limit && limit)
                panic("Oopsed too often (kernel.oops_limit is %u)", limit);

        /*
         * We're taking recursive faults here in make_task_dead. Safest is to just
         * leave this task alone and wait for reboot.
         */
        if (unlikely(tsk->flags & PF_EXITING)) {
                pr_alert("Fixing recursive fault but reboot is needed!\n");
                futex_exit_recursive(tsk);
                tsk->exit_state = EXIT_DEAD;
                refcount_inc(&tsk->rcu_users);
                do_task_dead();
        }

        do_exit(signr);
}

/*
 * exit(2): terminate the calling task.  Only the low 8 bits of
 * @error_code are kept; they are passed to do_exit() shifted into
 * bits 8-15.
 */
SYSCALL_DEFINE1(exit, int, error_code)
{
        long code = (error_code & 0xff) << 8;

        do_exit(code);
}

/*
 * Take down every thread in the group.  This is called by fatal signals
 * as well as by sys_exit_group (below).
 */
void __noreturn
do_group_exit(int exit_code)
{
        struct signal_struct *sig = current->signal;
        struct sighand_struct *sighand;

        /* Unlocked checks first: a group exit or exec is already under way. */
        if (sig->flags & SIGNAL_GROUP_EXIT) {
                exit_code = sig->group_exit_code;
                goto out;
        }
        if (sig->group_exec_task) {
                exit_code = 0;
                goto out;
        }

        sighand = current->sighand;
        spin_lock_irq(&sighand->siglock);
        if (sig->flags & SIGNAL_GROUP_EXIT) {
                /* Another thread got here before we took the lock.  */
                exit_code = sig->group_exit_code;
        } else if (sig->group_exec_task) {
                exit_code = 0;
        } else {
                /* We start the group exit: record the code, zap the rest. */
                sig->group_exit_code = exit_code;
                sig->flags = SIGNAL_GROUP_EXIT;
                zap_other_threads(current);
        }
        spin_unlock_irq(&sighand->siglock);

out:
        do_exit(exit_code);
        /* NOTREACHED */
}

/*
 * this kills every thread in the thread group. Note that any externally
 * wait4()-ing process will get the correct exit code - even if this
 * thread is not the thread group leader.
 */
SYSCALL_DEFINE1(exit_group, int, error_code)
{
        /* Low 8 bits of the caller's code, moved into bits 8-15. */
        int status = (error_code & 0xff) << 8;

        do_group_exit(status);
        /* NOTREACHED */
        return 0;
}

/*
 * Does @p match the pid selection in @wo?  A wo_type of PIDTYPE_MAX
 * matches every task; otherwise @p's pid of that type must equal
 * wo->wo_pid.  Returns 1 on a match, 0 otherwise.
 */
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
        if (wo->wo_type == PIDTYPE_MAX)
                return 1;

        return task_pid_type(p, wo->wo_type) == wo->wo_pid;
}

/*
 * Decide whether @p is a child the waiter described by @wo may report.
 * Returns 1 when eligible, 0 when not.
 */
static int
eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
        int is_clone, wants_clone;

        if (!eligible_pid(wo, p))
                return 0;

        /*
         * Wait for all children (clone and not) if __WALL is set or
         * if it is traced by us.
         */
        if (ptrace || (wo->wo_flags & __WALL))
                return 1;

        /*
         * Otherwise, wait for clone children *only* if __WCLONE is set;
         * otherwise, wait for non-clone children *only*.
         *
         * Note: a "clone" child here is one that reports to its parent
         * using a signal other than SIGCHLD, or a non-leader thread which
         * we can only see if it is traced by us.
         */
        is_clone = p->exit_signal != SIGCHLD;
        wants_clone = !!(wo->wo_flags & __WCLONE);

        return is_clone == wants_clone;
}

/*
 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
        int state, status;
        /* Sampled up front: p may be released before they are reported. */
        pid_t pid = task_pid_vnr(p);
        uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
        struct waitid_info *infop;

        /* The caller is not waiting for exited children. */
        if (!likely(wo->wo_flags & WEXITED))
                return 0;

        if (unlikely(wo->wo_flags & WNOWAIT)) {
                /*
                 * Peek only: ->exit_state is left alone and p is not
                 * released.  Note that wo->wo_stat is not written on
                 * this path.
                 */
                status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                        ? p->signal->group_exit_code : p->exit_code;
                get_task_struct(p);
                read_unlock(&tasklist_lock);
                sched_annotate_sleep();
                if (wo->wo_rusage)
                        getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
                put_task_struct(p);
                goto out_info;
        }
        /*
         * Move the task's state to DEAD/TRACE, only one thread can do this.
         */
        state = (ptrace_reparented(p) && thread_group_leader(p)) ?
                EXIT_TRACE : EXIT_DEAD;
        /* Lost the race: p already left EXIT_ZOMBIE; keep searching. */
        if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
                return 0;
        /*
         * We own this thread, nobody else can reap it.
         */
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();

        /*
         * Check thread_group_leader() to exclude the traced sub-threads.
         */
        if (state == EXIT_DEAD && thread_group_leader(p)) {
                struct signal_struct *sig = p->signal;
                struct signal_struct *psig = current->signal;
                unsigned long maxrss;
                u64 tgutime, tgstime;

                /*
                 * The resource counters for the group leader are in its
                 * own task_struct.  Those for dead threads in the group
                 * are in its signal_struct, as are those for the child
                 * processes it has previously reaped.  All these
                 * accumulate in the parent's signal_struct c* fields.
                 *
                 * We don't bother to take a lock here to protect these
                 * p->signal fields because the whole thread group is dead
                 * and nobody can change them.
                 *
                 * psig->stats_lock also protects us from our sub-threads
                 * which can reap other children at the same time.
                 *
                 * We use thread_group_cputime_adjusted() to get times for
                 * the thread group, which consolidates times for all threads
                 * in the group including the group leader.
                 */
                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
                write_seqlock_irq(&psig->stats_lock);
                psig->cutime += tgutime + sig->cutime;
                psig->cstime += tgstime + sig->cstime;
                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
                psig->cmin_flt +=
                        p->min_flt + sig->min_flt + sig->cmin_flt;
                psig->cmaj_flt +=
                        p->maj_flt + sig->maj_flt + sig->cmaj_flt;
                psig->cnvcsw +=
                        p->nvcsw + sig->nvcsw + sig->cnvcsw;
                psig->cnivcsw +=
                        p->nivcsw + sig->nivcsw + sig->cnivcsw;
                psig->cinblock +=
                        task_io_get_inblock(p) +
                        sig->inblock + sig->cinblock;
                psig->coublock +=
                        task_io_get_oublock(p) +
                        sig->oublock + sig->coublock;
                /* cmaxrss is a running maximum, not a sum. */
                maxrss = max(sig->maxrss, sig->cmaxrss);
                if (psig->cmaxrss < maxrss)
                        psig->cmaxrss = maxrss;
                task_io_accounting_add(&psig->ioac, &p->ioac);
                task_io_accounting_add(&psig->ioac, &sig->ioac);
                write_sequnlock_irq(&psig->stats_lock);
        }

        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        /* A group exit code, when flagged, wins over the task's own code. */
        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                ? p->signal->group_exit_code : p->exit_code;
        wo->wo_stat = status;

        if (state == EXIT_TRACE) {
                write_lock_irq(&tasklist_lock);
                /* We dropped tasklist, ptracer could die and untrace */
                ptrace_unlink(p);

                /* If parent wants a zombie, don't release it now */
                state = EXIT_ZOMBIE;
                if (do_notify_parent(p, p->exit_signal))
                        state = EXIT_DEAD;
                p->exit_state = state;
                write_unlock_irq(&tasklist_lock);
        }
        if (state == EXIT_DEAD)
                release_task(p);

out_info:
        infop = wo->wo_info;
        if (infop) {
                /*
                 * Low 7 bits clear: report CLD_EXITED with status >> 8.
                 * Otherwise bit 0x80 selects CLD_DUMPED over CLD_KILLED
                 * and the low 7 bits are reported as the status.
                 */
                if ((status & 0x7f) == 0) {
                        infop->cause = CLD_EXITED;
                        infop->status = status >> 8;
                } else {
                        infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
                        infop->status = status & 0x7f;
                }
                infop->pid = pid;
                infop->uid = uid;
        }

        return pid;
}

/*
 * Return a pointer to the stop code to report for @p, or NULL when @p
 * is not in a reportable stop.
 *
 * For a ptrace wait this is &p->exit_code, provided @p is traced and
 * not JOBCTL_LISTENING.  Otherwise it is &p->signal->group_exit_code,
 * provided the signal struct is flagged SIGNAL_STOP_STOPPED.
 */
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
        if (!ptrace)
                return (p->signal->flags & SIGNAL_STOP_STOPPED) ?
                        &p->signal->group_exit_code : NULL;

        if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
                return &p->exit_code;

        return NULL;
}

/**
 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
 * @wo: wait options
 * @ptrace: is the wait for ptrace
 * @p: task to wait for
 *
 * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
 *
 * CONTEXT:
 * read_lock(&tasklist_lock), which is released if return value is
 * non-zero.  Also, grabs and releases @p->sighand->siglock.
 *
 * RETURNS:
 * 0 if wait condition didn't exist and search for other wait conditions
 * should continue.  Non-zero return, -errno on failure and @p's pid on
 * success, implies that tasklist_lock is released and wait condition
 * search should terminate.
 */
static int wait_task_stopped(struct wait_opts *wo,
                                int ptrace, struct task_struct *p)
{
        struct waitid_info *infop;
        int exit_code, *p_code, why;
        uid_t uid = 0; /* unneeded, required by compiler */
        pid_t pid;

        /*
         * Traditionally we see ptrace'd stopped tasks regardless of options.
         */
        if (!ptrace && !(wo->wo_flags & WUNTRACED))
                return 0;

        /* Cheap unlocked check; repeated under siglock below. */
        if (!task_stopped_code(p, ptrace))
                return 0;

        /* exit_code stays 0 unless a non-zero stop code is found below. */
        exit_code = 0;
        spin_lock_irq(&p->sighand->siglock);

        p_code = task_stopped_code(p, ptrace);
        if (unlikely(!p_code))
                goto unlock_sig;

        exit_code = *p_code;
        if (!exit_code)
                goto unlock_sig;

        /* Consume the stop code unless the caller only peeks (WNOWAIT). */
        if (!unlikely(wo->wo_flags & WNOWAIT))
                *p_code = 0;

        uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
        spin_unlock_irq(&p->sighand->siglock);
        if (!exit_code)
                return 0;

        /*
         * Now we are pretty sure this task is interesting.
         * Make sure it doesn't get reaped out from under us while we
         * give up the lock and then examine it below.  We don't want to
         * keep holding onto the tasklist_lock while we call getrusage and
         * possibly take page faults for user memory.
         */
        get_task_struct(p);
        pid = task_pid_vnr(p);
        why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();
        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        put_task_struct(p);

        /* Stop code goes in bits 8 and up, 0x7f in the low byte. */
        if (likely(!(wo->wo_flags & WNOWAIT)))
                wo->wo_stat = (exit_code << 8) | 0x7f;

        infop = wo->wo_info;
        if (infop) {
                infop->cause = why;
                infop->status = exit_code;
                infop->pid = pid;
                infop->uid = uid;
        }
        return pid;
}

/*
 * Handle do_wait work for one task in a live, non-stopped state.
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 */
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
        struct waitid_info *info;
        pid_t child_pid;
        uid_t child_uid;

        /* The caller did not ask for continued children. */
        if (!unlikely(wo->wo_flags & WCONTINUED))
                return 0;

        /* Unlocked peek; the flag is checked again under siglock. */
        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
                return 0;

        spin_lock_irq(&p->sighand->siglock);
        /* Re-check with the lock held.  */
        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
                goto unlock_miss;

        /* Consume the event unless this is only a WNOWAIT peek. */
        if (!unlikely(wo->wo_flags & WNOWAIT))
                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
        child_uid = from_kuid_munged(current_user_ns(), task_uid(p));
        spin_unlock_irq(&p->sighand->siglock);

        child_pid = task_pid_vnr(p);
        /* Keep p alive while tasklist_lock is dropped for getrusage(). */
        get_task_struct(p);
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();
        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        put_task_struct(p);

        info = wo->wo_info;
        if (info) {
                info->cause = CLD_CONTINUED;
                info->pid = child_pid;
                info->uid = child_uid;
                info->status = SIGCONT;
        } else {
                wo->wo_stat = 0xffff;
        }
        return child_pid;

unlock_miss:
        spin_unlock_irq(&p->sighand->siglock);
        return 0;
}

/*
 * Consider @p for a wait by @parent.
 *
 * @wo:     wait options and result storage
 * @ptrace: nonzero when called for the ptraced list (ptrace_do_wait()),
 *          zero when called for the children list (do_wait_thread())
 * @p:      candidate task
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue;
 * then ->notask_error is 0 if @p is an eligible child,
 * or still -ECHILD.
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
                                struct task_struct *p)
{
        /*
         * We can race with wait_task_zombie() from another thread.
         * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
         * can't confuse the checks below.
         */
        int exit_state = READ_ONCE(p->exit_state);
        int ret;

        /* An EXIT_DEAD task has nothing to report; keep searching. */
        if (unlikely(exit_state == EXIT_DEAD))
                return 0;

        /* A zero result from eligible_child() means @p is skipped. */
        ret = eligible_child(wo, ptrace, p);
        if (!ret)
                return ret;

        if (unlikely(exit_state == EXIT_TRACE)) {
                /*
                 * ptrace == 0 means we are the natural parent. In this case
                 * we should clear notask_error, debugger will notify us.
                 */
                if (likely(!ptrace))
                        wo->notask_error = 0;
                return 0;
        }

        if (likely(!ptrace) && unlikely(p->ptrace)) {
                /*
                 * If it is traced by its real parent's group, just pretend
                 * the caller is ptrace_do_wait() and reap this child if it
                 * is zombie.
                 *
                 * This also hides group stop state from real parent; otherwise
                 * a single stop can be reported twice as group and ptrace stop.
                 * If a ptracer wants to distinguish these two events for its
                 * own children it should create a separate process which takes
                 * the role of real parent.
                 */
                if (!ptrace_reparented(p))
                        ptrace = 1;
        }

        /* slay zombie? */
        if (exit_state == EXIT_ZOMBIE) {
                /* we don't reap group leaders with subthreads */
                if (!delay_group_leader(p)) {
                        /*
                         * A zombie ptracee is only visible to its ptracer.
                         * Notification and reaping will be cascaded to the
                         * real parent when the ptracer detaches.
                         */
                        if (unlikely(ptrace) || likely(!p->ptrace))
                                return wait_task_zombie(wo, p);
                }

                /*
                 * Allow access to stopped/continued state via zombie by
                 * falling through.  Clearing of notask_error is complex.
                 *
                 * When !@ptrace:
                 *
                 * If WEXITED is set, notask_error should naturally be
                 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
                 * so, if there are live subthreads, there are events to
                 * wait for.  If all subthreads are dead, it's still safe
                 * to clear - this function will be called again in a finite
                 * amount of time once all the subthreads are released and
                 * will then return without clearing.
                 *
                 * When @ptrace:
                 *
                 * Stopped state is per-task and thus can't change once the
                 * target task dies.  Only continued and exited can happen.
                 * Clear notask_error if WCONTINUED | WEXITED.
                 */
                if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
                        wo->notask_error = 0;
        } else {
                /*
                 * @p is alive and it's gonna stop, continue or exit, so
                 * there always is something to wait for.
                 */
                wo->notask_error = 0;
        }

        /*
         * Wait for stopped.  Depending on @ptrace, different stopped state
         * is used and the two don't interact with each other.
         */
        ret = wait_task_stopped(wo, ptrace, p);
        if (ret)
                return ret;

        /*
         * Wait for continued.  There's only one continued state and the
         * ptracer can consume it which can confuse the real parent.  Don't
         * use WCONTINUED from ptracer.  You don't need or want it.
         */
        return wait_task_continued(wo, p);
}

/*
 * Run wait_consider_task() over every entry on @tsk's ->children list,
 * as the non-ptrace caller (ptrace == 0).
 *
 * -ECHILD should be in ->notask_error before the first call.
 * A nonzero return is final and means tasklist_lock has been dropped.
 * A zero return means the caller should keep searching; ->notask_error
 * is then 0 if any eligible child was seen, or still -ECHILD.
 */
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
        struct task_struct *child;
        int ret = 0;

        list_for_each_entry(child, &tsk->children, sibling) {
                ret = wait_consider_task(wo, 0, child);
                if (ret)
                        break;
        }

        return ret;
}

/*
 * Run wait_consider_task() over every entry on @tsk's ->ptraced list,
 * as the ptrace caller (ptrace == 1).
 *
 * Returns the first nonzero result from wait_consider_task(), or zero if
 * every entry returned zero (or the list is empty).
 */
static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
{
        struct task_struct *tracee;
        int ret = 0;

        list_for_each_entry(tracee, &tsk->ptraced, ptrace_entry) {
                ret = wait_consider_task(wo, 1, tracee);
                if (ret)
                        break;
        }

        return ret;
}

/*
 * Decide whether the waiter described by @wo should be woken for @p.
 *
 * @p must pass eligible_pid().  When __WNOTHREAD is set, @p->parent must
 * additionally be the task stored in wo->child_wait.private.
 */
bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
{
        if (!eligible_pid(wo, p))
                return false;

        /* Without __WNOTHREAD there is no further restriction. */
        if (!(wo->wo_flags & __WNOTHREAD))
                return true;

        return wo->child_wait.private == p->parent;
}

/*
 * Wake function installed on wo->child_wait by do_wait().
 *
 * @key is the task the wakeup is about.  The wakeup is passed on to
 * default_wake_function() only if pid_child_should_wake() accepts that
 * task for this waiter; otherwise 0 is returned.
 */
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
                                int sync, void *key)
{
        struct task_struct *child = key;
        struct wait_opts *wo;

        wo = container_of(wait, struct wait_opts, child_wait);
        if (!pid_child_should_wake(wo, child))
                return 0;

        return default_wake_function(wait, mode, sync, key);
}

/*
 * Wake TASK_INTERRUPTIBLE waiters on @parent's wait_chldexit queue,
 * passing @p as the wakeup key.
 */
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
        struct wait_queue_head *chldexit = &parent->signal->wait_chldexit;

        __wake_up_sync_key(chldexit, TASK_INTERRUPTIBLE, p);
}

/*
 * Check whether @target counts as a child of the current task.
 *
 * The parent compared against is @target->parent when @ptrace is set and
 * @target->real_parent otherwise.  It matches if it is current itself, or,
 * unless __WNOTHREAD is set, any task in current's thread group.
 */
static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
                                 struct task_struct *target)
{
        struct task_struct *parent;

        if (ptrace)
                parent = target->parent;
        else
                parent = target->real_parent;

        if (current == parent)
                return true;
        if (wo->wo_flags & __WNOTHREAD)
                return false;

        return same_thread_group(current, parent);
}

/*
 * Fast path for waits on PIDTYPE_PID: look the target up directly from
 * wo->wo_pid instead of walking the child and tracee lists.
 *
 * Returns the first nonzero result from wait_consider_task(), or zero if
 * neither lookup produced one.
 */
static int do_wait_pid(struct wait_opts *wo)
{
        struct task_struct *target;
        int retval;

        /* Non-ptrace pass: PIDTYPE_TGID lookup, matched via ->real_parent. */
        target = pid_task(wo->wo_pid, PIDTYPE_TGID);
        if (target && is_effectively_child(wo, false, target)) {
                retval = wait_consider_task(wo, false, target);
                if (retval)
                        return retval;
        }

        /* Ptrace pass: PIDTYPE_PID lookup, only for a task with ->ptrace set. */
        target = pid_task(wo->wo_pid, PIDTYPE_PID);
        if (!target || !target->ptrace)
                return 0;
        if (!is_effectively_child(wo, true, target))
                return 0;

        return wait_consider_task(wo, true, target);
}

/*
 * One pass of the wait search described by @wo.
 *
 * Returns the nonzero result of the per-task helpers when one of them
 * produced a final answer (tasklist_lock has then been released by the
 * helper).  Otherwise returns ->notask_error if it is nonzero or WNOHANG
 * is set, and -ERESTARTSYS when there is something to wait for and the
 * caller may block.
 */
long __do_wait(struct wait_opts *wo)
{
        long retval;

        /*
         * Start from "no child".  ->notask_error is reset to zero as soon
         * as we see any child that might later match our criteria, even
         * if it cannot be reaped yet.  If nothing can match at all, bail
         * out without taking the lock.
         */
        wo->notask_error = -ECHILD;
        if (wo->wo_type < PIDTYPE_MAX) {
                if (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))
                        goto notask;
        }

        read_lock(&tasklist_lock);

        if (wo->wo_type != PIDTYPE_PID) {
                struct task_struct *tsk = current;

                do {
                        retval = do_wait_thread(wo, tsk);
                        if (!retval)
                                retval = ptrace_do_wait(wo, tsk);
                        if (retval)
                                return retval;

                        /* __WNOTHREAD: only the calling thread is searched. */
                        if (wo->wo_flags & __WNOTHREAD)
                                break;
                } while_each_thread(current, tsk);
        } else {
                retval = do_wait_pid(wo);
                if (retval)
                        return retval;
        }
        read_unlock(&tasklist_lock);

notask:
        retval = wo->notask_error;
        if (retval || (wo->wo_flags & WNOHANG))
                return retval;

        return -ERESTARTSYS;
}

/*
 * Wait loop shared by the wait*() entry points.
 *
 * Queues wo->child_wait on current's wait_chldexit queue, then repeats
 * __do_wait() until it returns something other than -ERESTARTSYS or a
 * signal is pending, sleeping in TASK_INTERRUPTIBLE between attempts.
 * Returns the last __do_wait() result.
 */
static long do_wait(struct wait_opts *wo)
{
        int retval;

        trace_sched_process_wait(wo->wo_pid);

        init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
        wo->child_wait.private = current;
        add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);

        for (;;) {
                /* Set the state before scanning, then scan, then sleep. */
                set_current_state(TASK_INTERRUPTIBLE);
                retval = __do_wait(wo);
                if (retval != -ERESTARTSYS || signal_pending(current))
                        break;
                schedule();
        }

        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
        return retval;
}

/*
 * Validate waitid()-style arguments and fill in @wo for do_wait().
 *
 * @which and @upid select the target (P_ALL, P_PID, P_PGID or P_PIDFD).
 * @options must only contain the flags accepted below and must include at
 * least one of WEXITED, WSTOPPED or WCONTINUED.  For P_PIDFD, WNOHANG is
 * added to the stored flags when the pidfd has O_NONBLOCK set.
 *
 * Returns 0 on success, -EINVAL for invalid arguments, or the error
 * returned by pidfd_get_pid().  On success the caller is expected to
 * drop wo->wo_pid with put_pid() when done.
 */
int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
                          struct waitid_info *infop, int options,
                          struct rusage *ru)
{
        unsigned int f_flags = 0;
        struct pid *pid = NULL;
        enum pid_type type;

        if ((options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
                         __WNOTHREAD|__WCLONE|__WALL)) ||
            !(options & (WEXITED|WSTOPPED|WCONTINUED)))
                return -EINVAL;

        switch (which) {
        case P_ALL:
                type = PIDTYPE_MAX;
                break;
        case P_PID:
                if (upid <= 0)
                        return -EINVAL;

                type = PIDTYPE_PID;
                pid = find_get_pid(upid);
                break;
        case P_PGID:
                if (upid < 0)
                        return -EINVAL;

                /* upid == 0 selects the caller's own process group. */
                type = PIDTYPE_PGID;
                pid = upid ? find_get_pid(upid)
                           : get_task_pid(current, PIDTYPE_PGID);
                break;
        case P_PIDFD:
                if (upid < 0)
                        return -EINVAL;

                type = PIDTYPE_PID;
                pid = pidfd_get_pid(upid, &f_flags);
                if (IS_ERR(pid))
                        return PTR_ERR(pid);
                break;
        default:
                return -EINVAL;
        }

        wo->wo_type        = type;
        wo->wo_pid        = pid;
        wo->wo_info        = infop;
        wo->wo_rusage        = ru;
        wo->wo_flags        = (f_flags & O_NONBLOCK) ? (options | WNOHANG)
                                                 : options;

        return 0;
}

/*
 * In-kernel waitid(): prepare a wait_opts from the arguments, run
 * do_wait() and drop the pid reference.
 *
 * Returns the kernel_waitid_prepare() error if preparation fails.
 * Otherwise returns the do_wait() result, except that a zero result is
 * turned into -EAGAIN when WNOHANG is in the prepared flags but was not
 * in @options.
 */
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
                          int options, struct rusage *ru)
{
        struct wait_opts wo;
        long ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);

        if (ret)
                return ret;

        ret = do_wait(&wo);
        /* WNOHANG present in wo but not in @options: report -EAGAIN. */
        if (ret == 0 && (wo.wo_flags & WNOHANG) && !(options & WNOHANG))
                ret = -EAGAIN;

        put_pid(wo.wo_pid);
        return ret;
}

/*
 * waitid() system call: run kernel_waitid() and copy the results out to
 * user space.
 *
 * A positive kernel_waitid() result is reported to the caller as 0 with
 * si_signo set to SIGCHLD, and the rusage is copied to @ru if it was
 * requested.  For any other result si_signo is 0 and the result is
 * returned unchanged.  @infop, when non-NULL, is filled in either way.
 * Returns -EFAULT if @ru or @infop cannot be written.
 */
SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
                infop, int, options, struct rusage __user *, ru)
{
        struct rusage r;
        struct waitid_info info = {.status = 0};
        long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
        int signo = 0;

        if (err > 0) {
                signo = SIGCHLD;
                err = 0;
                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
                        return -EFAULT;
        }
        if (!infop)
                return err;

        /* Open the user-access window for the unsafe_put_user() calls. */
        if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;

        unsafe_put_user(signo, &infop->si_signo, Efault);
        unsafe_put_user(0, &infop->si_errno, Efault);
        unsafe_put_user(info.cause, &infop->si_code, Efault);
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
        user_write_access_end();
        return err;
Efault:
        /* A faulting unsafe_put_user() jumps here; close the window first. */
        user_write_access_end();
        return -EFAULT;
}

/*
 * In-kernel wait4().
 *
 * @upid selects the target: -1 waits for any child, a value below -1
 * waits on process group -@upid, 0 waits on the caller's own process
 * group, and a positive value waits on that pid.  WEXITED is always added
 * to @options.
 *
 * Returns -EINVAL for unsupported option bits, -ESRCH for INT_MIN, and
 * otherwise the do_wait() result.  On a positive result the status is
 * written to @stat_addr if it is non-NULL, and a failing put_user()
 * turns the result into -EFAULT.
 */
long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
                  struct rusage *ru)
{
        struct wait_opts wo;
        struct pid *pid = NULL;
        enum pid_type type;
        long ret;

        if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
                        __WNOTHREAD|__WCLONE|__WALL))
                return -EINVAL;

        /* -INT_MIN is not defined */
        if (upid == INT_MIN)
                return -ESRCH;

        if (upid == -1) {
                type = PIDTYPE_MAX;
        } else if (upid > 0) {
                type = PIDTYPE_PID;
                pid = find_get_pid(upid);
        } else if (upid == 0) {
                type = PIDTYPE_PGID;
                pid = get_task_pid(current, PIDTYPE_PGID);
        } else /* upid < -1 */ {
                type = PIDTYPE_PGID;
                pid = find_get_pid(-upid);
        }

        wo.wo_type        = type;
        wo.wo_pid        = pid;
        wo.wo_flags        = options | WEXITED;
        wo.wo_info        = NULL;
        wo.wo_stat        = 0;
        wo.wo_rusage        = ru;

        ret = do_wait(&wo);
        put_pid(pid);

        if (ret <= 0 || !stat_addr)
                return ret;
        if (put_user(wo.wo_stat, stat_addr))
                return -EFAULT;

        return ret;
}

/*
 * Wait for @pid with WEXITED only (PIDTYPE_PID).
 *
 * Returns the do_wait() result.  *@stat is written only when that result
 * is positive and the collected status is nonzero; otherwise it is left
 * untouched.
 */
int kernel_wait(pid_t pid, int *stat)
{
        struct wait_opts wo = {
                .wo_type        = PIDTYPE_PID,
                .wo_pid                = find_get_pid(pid),
                .wo_flags        = WEXITED,
        };
        int ret = do_wait(&wo);

        put_pid(wo.wo_pid);
        if (ret <= 0 || !wo.wo_stat)
                return ret;

        *stat = wo.wo_stat;
        return ret;
}

/*
 * wait4() system call: call kernel_wait4() and, on a positive result,
 * copy the collected rusage to @ru when the caller supplied one.
 * Returns -EFAULT if that copy fails, else the kernel_wait4() result.
 */
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
                int, options, struct rusage __user *, ru)
{
        struct rusage r;
        long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);

        if (err > 0 && ru && copy_to_user(ru, &r, sizeof(struct rusage)))
                return -EFAULT;

        return err;
}

#ifdef __ARCH_WANT_SYS_WAITPID

/*
 * sys_waitpid() is kept for compatibility only.  It is kernel_wait4()
 * without rusage collection; waitpid() should be implemented by calling
 * sys_wait4() from libc.a.
 */
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
        struct rusage *no_rusage = NULL;

        return kernel_wait4(pid, stat_addr, options, no_rusage);
}

#endif

#ifdef CONFIG_COMPAT
/*
 * Compat wait4() system call: call kernel_wait4() and, on a positive
 * result, write the collected rusage to @ru with put_compat_rusage()
 * when the caller supplied one.  Returns -EFAULT if that write fails,
 * else the kernel_wait4() result.
 */
COMPAT_SYSCALL_DEFINE4(wait4,
        compat_pid_t, pid,
        compat_uint_t __user *, stat_addr,
        int, options,
        struct compat_rusage __user *, ru)
{
        struct rusage r;
        long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);

        if (err > 0 && ru && put_compat_rusage(&r, ru))
                return -EFAULT;

        return err;
}

/*
 * Compat waitid() system call: run kernel_waitid() and copy the results
 * out to user space in compat layout.
 *
 * A positive kernel_waitid() result is reported to the caller as 0 with
 * si_signo set to SIGCHLD, and the rusage is written to @uru if it was
 * requested.  For any other result si_signo is 0 and the result is
 * returned unchanged.  @infop, when non-NULL, is filled in either way.
 * Returns -EFAULT if @uru or @infop cannot be written.
 */
COMPAT_SYSCALL_DEFINE5(waitid,
                int, which, compat_pid_t, pid,
                struct compat_siginfo __user *, infop, int, options,
                struct compat_rusage __user *, uru)
{
        struct rusage ru;
        struct waitid_info info = {.status = 0};
        long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
        int signo = 0;
        if (err > 0) {
                signo = SIGCHLD;
                err = 0;
                if (uru) {
                        /* kernel_waitid() overwrites everything in ru */
                        /* With COMPAT_USE_64BIT_TIME the native struct is copied as-is. */
                        if (COMPAT_USE_64BIT_TIME)
                                err = copy_to_user(uru, &ru, sizeof(ru));
                        else
                                err = put_compat_rusage(&ru, uru);
                        if (err)
                                return -EFAULT;
                }
        }

        if (!infop)
                return err;

        /* Open the user-access window for the unsafe_put_user() calls. */
        if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;

        unsafe_put_user(signo, &infop->si_signo, Efault);
        unsafe_put_user(0, &infop->si_errno, Efault);
        unsafe_put_user(info.cause, &infop->si_code, Efault);
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
        user_write_access_end();
        return err;
Efault:
        /* A faulting unsafe_put_user() jumps here; close the window first. */
        user_write_access_end();
        return -EFAULT;
}
#endif

/*
 * Weak default abort(): hits BUG() and, if execution continues past it,
 * panics.
 *
 * This needs to be __function_aligned as GCC implicitly makes any
 * implementation of abort() cold and drops alignment specified by
 * -falign-functions=N.
 *
 * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
 */
__weak __function_aligned void abort(void)
{
        BUG();

        /* if that doesn't kill us, halt */
        panic("Oops failed to kill thread");
}
EXPORT_SYMBOL(abort);

















































































































    3 








    2 













































    2 









    2 



    2 




    2 
    1 


























    2 





















































































    1 



    2 
























    2 

















    2 



















































    2 


    3 



    3 






































    1 










    2 















    2 





































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
// SPDX-License-Identifier: GPL-2.0
/*
 * Functions to sequence PREFLUSH and FUA writes.
 *
 * Copyright (C) 2011                Max Planck Institute for Gravitational Physics
 * Copyright (C) 2011                Tejun Heo <tj@kernel.org>
 *
 * REQ_{PREFLUSH|FUA} requests are decomposed into sequences consisting of
 * three optional steps - PREFLUSH, DATA and POSTFLUSH - according to the
 * request properties and hardware capability.
 *
 * If a request doesn't have data, only REQ_PREFLUSH makes sense, which
 * indicates a simple flush request.  If there is data, REQ_PREFLUSH indicates
 * that the device cache should be flushed before the data is executed, and
 * REQ_FUA means that the data must be on non-volatile media on request
 * completion.
 *
 * If the device doesn't have writeback cache, PREFLUSH and FUA don't make any
 * difference.  The requests are either completed immediately if there's no data
 * or executed as normal requests otherwise.
 *
 * If the device has writeback cache and supports FUA, REQ_PREFLUSH is
 * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
 *
 * If the device has writeback cache and doesn't support FUA, REQ_PREFLUSH
 * is translated to PREFLUSH and REQ_FUA to POSTFLUSH.
 *
 * The actual execution of flush is double buffered.  Whenever a request
 * needs to execute PRE or POSTFLUSH, it queues at
 * fq->flush_queue[fq->flush_pending_idx].  Once certain criteria are met, a
 * REQ_OP_FLUSH is issued and the pending_idx is toggled.  When the flush
 * completes, all the requests which were pending proceed to the next
 * step.  This allows arbitrary merging of different types of PREFLUSH/FUA
 * requests.
 *
 * Currently, the following conditions are used to determine when to issue
 * flush.
 *
 * C1. At any given time, only one flush shall be in progress.  This makes
 *     double buffering sufficient.
 *
 * C2. Flush is deferred if any request is executing DATA of its sequence.
 *     This avoids issuing separate POSTFLUSHes for requests which shared
 *     PREFLUSH.
 *
 * C3. The second condition is ignored if there is a request which has
 *     waited longer than FLUSH_PENDING_TIMEOUT.  This is to avoid
 *     starvation in the unlikely case where there is a continuous stream of
 *     FUA (without PREFLUSH) requests.
 *
 * For devices which support FUA, it isn't clear whether C2 (and thus C3)
 * is beneficial.
 *
 * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice.
 * Once while executing DATA and again after the whole sequence is
 * complete.  The first completion updates the contained bio but doesn't
 * finish it so that the bio submitter is notified only after the whole
 * sequence is complete.  This is implemented by testing RQF_FLUSH_SEQ in
 * req_bio_endio().
 *
 * The above peculiarity requires that each PREFLUSH/FUA request has only one
 * bio attached to it, which is guaranteed as they aren't allowed to be
 * merged in the usual way.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/part_stat.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"

/*
 * PREFLUSH/FUA sequences.
 *
 * The REQ_FSEQ_* values are single bits.  rq->flush.seq accumulates the
 * bits of the steps a request has completed, and blk_flush_cur_seq() picks
 * the lowest bit that is still clear as the current step.
 */
enum {
        REQ_FSEQ_PREFLUSH        = (1 << 0), /* pre-flushing in progress */
        REQ_FSEQ_DATA                = (1 << 1), /* data write in progress */
        REQ_FSEQ_POSTFLUSH        = (1 << 2), /* post-flushing in progress */
        REQ_FSEQ_DONE                = (1 << 3), /* sequence finished, request is ended */

        /* mask of the three steps that do actual work */
        REQ_FSEQ_ACTIONS        = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
                                  REQ_FSEQ_POSTFLUSH,

        /*
         * If flush has been pending longer than the following timeout,
         * it's issued even if flush_data requests are still in flight.
         */
        FLUSH_PENDING_TIMEOUT        = 5 * HZ,
};

static void blk_kick_flush(struct request_queue *q,
                           struct blk_flush_queue *fq, blk_opf_t flags);

static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
        return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}

/*
 * Work out which flush sequence steps @rq needs, as a mask of REQ_FSEQ_*.
 *
 * @fflags: snapshot of the queue flags
 * @rq:     request being classified
 *
 * DATA is needed whenever @rq has sectors.  PREFLUSH and POSTFLUSH are
 * only considered when QUEUE_FLAG_WC is set in @fflags: PREFLUSH for
 * REQ_PREFLUSH, and POSTFLUSH for REQ_FUA when QUEUE_FLAG_FUA is not set.
 */
static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
{
        unsigned long has_wc = fflags & (1UL << QUEUE_FLAG_WC);
        unsigned long has_fua = fflags & (1UL << QUEUE_FLAG_FUA);
        unsigned int policy = blk_rq_sectors(rq) ? REQ_FSEQ_DATA : 0;

        if (!has_wc)
                return policy;

        if (rq->cmd_flags & REQ_PREFLUSH)
                policy |= REQ_FSEQ_PREFLUSH;
        if (!has_fua && (rq->cmd_flags & REQ_FUA))
                policy |= REQ_FSEQ_POSTFLUSH;

        return policy;
}

/*
 * Return the current step of @rq's flush sequence: the bit at the
 * position of the first zero bit in rq->flush.seq.
 */
static unsigned int blk_flush_cur_seq(struct request *rq)
{
        unsigned long first_clear = ffz(rq->flush.seq);

        return 1 << first_clear;
}

static void blk_flush_restore_request(struct request *rq)
{
        /*
         * After flush data completion, @rq->bio is %NULL but we need to
         * complete the bio again.  @rq->biotail is guaranteed to equal the
         * original @rq->bio.  Restore it.
         */
        rq->bio = rq->biotail;
        if (rq->bio)
                rq->__sector = rq->bio->bi_iter.bi_sector;

        /* make @rq a normal request */
        rq->rq_flags &= ~RQF_FLUSH_SEQ;
        rq->end_io = rq->flush.saved_end_io;
}

/*
 * Account one flush in the statistics of the disk's part0: bump the
 * STAT_FLUSH I/O count and add the time elapsed since rq->start_time_ns
 * to the STAT_FLUSH nanosecond total.
 */
static void blk_account_io_flush(struct request *rq)
{
        struct block_device *part = rq->q->disk->part0;

        part_stat_lock();
        part_stat_inc(part, ios[STAT_FLUSH]);
        part_stat_add(part, nsecs[STAT_FLUSH],
                      blk_time_get_ns() - rq->start_time_ns);
        part_stat_unlock();
}

/**
 * blk_flush_complete_seq - complete flush sequence
 * @rq: PREFLUSH/FUA request being sequenced
 * @fq: flush queue
 * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
 * @error: whether an error occurred
 *
 * @rq just completed @seq part of its flush sequence, record the
 * completion and trigger the next step.
 *
 * CONTEXT:
 * spin_lock_irq(fq->mq_flush_lock)
 */
static void blk_flush_complete_seq(struct request *rq,
                                   struct blk_flush_queue *fq,
                                   unsigned int seq, blk_status_t error)
{
        struct request_queue *q = rq->q;
        struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
        blk_opf_t cmd_flags;

        /* A step must not be completed twice. */
        BUG_ON(rq->flush.seq & seq);
        rq->flush.seq |= seq;
        /* Read now: the REQ_FSEQ_DONE case below ends @rq. */
        cmd_flags = rq->cmd_flags;

        /* From here on @seq is the NEXT step; an error skips straight to DONE. */
        if (likely(!error))
                seq = blk_flush_cur_seq(rq);
        else
                seq = REQ_FSEQ_DONE;

        switch (seq) {
        case REQ_FSEQ_PREFLUSH:
        case REQ_FSEQ_POSTFLUSH:
                /* queue for flush */
                if (list_empty(pending))
                        fq->flush_pending_since = jiffies;
                list_add_tail(&rq->queuelist, pending);
                break;

        case REQ_FSEQ_DATA:
                /* hand the data part to the requeue list and kick it */
                fq->flush_data_in_flight++;
                spin_lock(&q->requeue_lock);
                list_move(&rq->queuelist, &q->requeue_list);
                spin_unlock(&q->requeue_lock);
                blk_mq_kick_requeue_list(q);
                break;

        case REQ_FSEQ_DONE:
                /*
                 * @rq was previously adjusted by blk_insert_flush() for
                 * flush sequencing and may already have gone through the
                 * flush data request completion path.  Restore @rq for
                 * normal completion and end it.
                 */
                list_del_init(&rq->queuelist);
                blk_flush_restore_request(rq);
                blk_mq_end_request(rq, error);
                break;

        default:
                BUG();
        }

        /* Flush state changed; see whether a flush should be issued now. */
        blk_kick_flush(q, fq, cmd_flags);
}

/*
 * Completion handler of the flush request (installed as ->end_io on
 * fq->flush_rq by blk_kick_flush()).
 *
 * If other references to @flush_rq remain after dropping ours, only record
 * @error in fq->rq_status and return.  Otherwise account the flush, mark
 * @flush_rq idle, release the borrowed tag, toggle flush_running_idx and
 * move every request on the list that was running to its next step via
 * blk_flush_complete_seq().  Always returns RQ_END_IO_NONE.
 */
static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
                                       blk_status_t error)
{
        struct request_queue *q = flush_rq->q;
        struct list_head *running;
        struct request *rq, *n;
        unsigned long flags = 0;
        struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);

        /* release the tag's ownership to the req cloned from */
        spin_lock_irqsave(&fq->mq_flush_lock, flags);

        /* Not the last reference: stash the status for the final put. */
        if (!req_ref_put_and_test(flush_rq)) {
                fq->rq_status = error;
                spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
                return RQ_END_IO_NONE;
        }

        blk_account_io_flush(flush_rq);
        /*
         * Flush request has to be marked as IDLE when it is really ended
         * because its .end_io() is called from timeout code path too for
         * avoiding use-after-free.
         */
        WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
        /* A status stashed by an earlier, non-final call takes precedence. */
        if (fq->rq_status != BLK_STS_OK) {
                error = fq->rq_status;
                fq->rq_status = BLK_STS_OK;
        }

        /* Give back the tag borrowed in blk_kick_flush(). */
        if (!q->elevator) {
                flush_rq->tag = BLK_MQ_NO_TAG;
        } else {
                blk_mq_put_driver_tag(flush_rq);
                flush_rq->internal_tag = BLK_MQ_NO_TAG;
        }

        running = &fq->flush_queue[fq->flush_running_idx];
        BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);

        /* account completion of the flush request */
        fq->flush_running_idx ^= 1;

        /* and push the waiting requests to the next stage */
        list_for_each_entry_safe(rq, n, running, queuelist) {
                unsigned int seq = blk_flush_cur_seq(rq);

                BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
                list_del_init(&rq->queuelist);
                blk_flush_complete_seq(rq, fq, seq, error);
        }

        spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
        return RQ_END_IO_NONE;
}

/*
 * Tell whether @rq is a flush request, i.e. whether its completion
 * handler is flush_end_io().
 */
bool is_flush_rq(struct request *rq)
{
        if (rq->end_io == flush_end_io)
                return true;

        return false;
}

/**
 * blk_kick_flush - consider issuing flush request
 * @q: request_queue being kicked
 * @fq: flush queue
 * @flags: cmd_flags of the original request
 *
 * Flush related states of @q have changed, consider issuing flush request.
 * Please read the comment at the top of this file for more info.
 *
 * CONTEXT:
 * spin_lock_irq(fq->mq_flush_lock)
 *
 */
static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
                           blk_opf_t flags)
{
        struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
        /*
         * Computed before the list_empty() check below; it is only
         * dereferenced after that check has passed.
         */
        struct request *first_rq =
                list_first_entry(pending, struct request, queuelist);
        struct request *flush_rq = fq->flush_rq;

        /* C1 described at the top of this file */
        if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
                return;

        /* C2 and C3 */
        if (fq->flush_data_in_flight &&
            time_before(jiffies,
                        fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
                return;

        /*
         * Issue flush and toggle pending_idx.  This makes pending_idx
         * different from running_idx, which means flush is in flight.
         */
        fq->flush_pending_idx ^= 1;

        blk_rq_init(q, flush_rq);

        /*
         * In case of none scheduler, borrow tag from the first request
         * since they can't be in flight at the same time. And acquire
         * the tag's ownership for flush req.
         *
         * In case of IO scheduler, flush rq need to borrow scheduler tag
         * just for cheating put/get driver tag.
         */
        flush_rq->mq_ctx = first_rq->mq_ctx;
        flush_rq->mq_hctx = first_rq->mq_hctx;

        if (!q->elevator)
                flush_rq->tag = first_rq->tag;
        else
                flush_rq->internal_tag = first_rq->internal_tag;

        /* Carry over only REQ_DRV and the failfast bits of the original. */
        flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
        flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
        flush_rq->rq_flags |= RQF_FLUSH_SEQ;
        flush_rq->end_io = flush_end_io;
        /*
         * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
         * implied in refcount_inc_not_zero() called from
         * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref
         * and READ flush_rq->end_io
         */
        smp_wmb();
        req_ref_set(flush_rq, 1);

        /* Queue the flush on q->flush_list and kick the requeue work. */
        spin_lock(&q->requeue_lock);
        list_add_tail(&flush_rq->queuelist, &q->flush_list);
        spin_unlock(&q->requeue_lock);

        blk_mq_kick_requeue_list(q);
}

/*
 * ->end_io handler that blk_rq_init_flush() installs on a flush-sequence
 * request.  It drops the driver tag when an I/O scheduler is attached,
 * accounts the finished data step under fq->mq_flush_lock, reports
 * REQ_FSEQ_DATA completion to blk_flush_complete_seq() and finally calls
 * blk_mq_sched_restart() on the request's hardware context.
 *
 * Always returns RQ_END_IO_NONE.
 */
static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
                                               blk_status_t error)
{
        /* Snapshot the fields of @rq that are used further down. */
        struct request_queue *queue = rq->q;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        struct blk_mq_ctx *sw_ctx = rq->mq_ctx;
        unsigned long irq_state;
        struct blk_flush_queue *fq = blk_get_flush_queue(queue, sw_ctx);

        if (queue->elevator) {
                WARN_ON(rq->tag < 0);
                blk_mq_put_driver_tag(rq);
        }

        /*
         * Populating an empty queue requires a kick afterwards to avoid a
         * stall; see the comment in flush_end_io().
         */
        spin_lock_irqsave(&fq->mq_flush_lock, irq_state);
        fq->flush_data_in_flight--;
        /*
         * rq->queuelist may have been clobbered by rq->rq_next reuse, so
         * re-initialize it before it is used again here.
         */
        INIT_LIST_HEAD(&rq->queuelist);
        blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
        spin_unlock_irqrestore(&fq->mq_flush_lock, irq_state);

        blk_mq_sched_restart(hctx);
        return RQ_END_IO_NONE;
}

/*
 * Prepare @rq for the flush state machine: remember the caller's ->end_io
 * (usually NULL), route completion through mq_flush_data_end_io(), clear the
 * sequence state and mark the request with RQF_FLUSH_SEQ.
 */
static void blk_rq_init_flush(struct request *rq)
{
        /* The original handler has to be saved before it is overwritten. */
        rq->flush.saved_end_io = rq->end_io;
        rq->end_io = mq_flush_data_end_io;

        rq->flush.seq = 0;
        rq->rq_flags |= RQF_FLUSH_SEQ;
}

/*
 * Feed a PREFLUSH/FUA request to the flush state machine.
 *
 * Returns true when the flush machinery has consumed @rq (it was either
 * completed on the spot or handed to the flush sequence), and false when the
 * caller has to keep processing @rq itself.
 */
bool blk_insert_flush(struct request *rq)
{
        struct request_queue *queue = rq->q;
        unsigned long qflags = queue->queue_flags;        /* may change, cache */
        unsigned int todo = blk_flush_policy(qflags, rq);
        struct blk_flush_queue *fq = blk_get_flush_queue(queue, rq->mq_ctx);

        /* FLUSH/FUA request must never be merged */
        WARN_ON_ONCE(rq->bio != rq->biotail);

        /*
         * @todo already records which steps are required, so REQ_PREFLUSH
         * and (when the queue lacks FUA support) REQ_FUA can now be
         * adjusted for the driver.
         */
        rq->cmd_flags &= ~REQ_PREFLUSH;
        if (!(qflags & (1UL << QUEUE_FLAG_FUA)))
                rq->cmd_flags &= ~REQ_FUA;

        /*
         * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC.  Since some of those flags
         * may just have been cleared, set REQ_SYNC explicitly so that the
         * request accounting is not skewed.
         */
        rq->cmd_flags |= REQ_SYNC;

        if (todo == 0) {
                /*
                 * An empty flush handed down from a stacking driver may
                 * translate into nothing if the underlying device does not
                 * advertise a write-back cache.  Simply complete the
                 * request in that case.
                 */
                blk_mq_end_request(rq, 0);
                return true;
        }

        if (todo == REQ_FSEQ_DATA) {
                /*
                 * There is data but no flush is needed, so the request
                 * bypasses the flush machinery and is queued for normal
                 * execution.
                 */
                return false;
        }

        /* Both remaining cases start by setting up the flush fields. */
        blk_rq_init_flush(rq);

        if (todo == (REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH)) {
                /*
                 * With the completion handler in place to trigger the post
                 * flush, the command itself is just passed on.
                 */
                rq->flush.seq |= REQ_FSEQ_PREFLUSH;
                spin_lock_irq(&fq->mq_flush_lock);
                fq->flush_data_in_flight++;
                spin_unlock_irq(&fq->mq_flush_lock);
                return false;
        }

        /*
         * Everything else is submitted to the flush state machine for
         * further processing; the steps that are not part of @todo are
         * reported as already done.
         */
        spin_lock_irq(&fq->mq_flush_lock);
        blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~todo, 0);
        spin_unlock_irq(&fq->mq_flush_lock);
        return true;
}

/**
 * blkdev_issue_flush - queue a flush
 * @bdev:        blockdev to issue flush for
 *
 * Description:
 *    Issue a flush for the block device in question.
 */
int blkdev_issue_flush(struct block_device *bdev)
{
        struct bio bio;

        bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
        return submit_bio_wait(&bio);
}
EXPORT_SYMBOL(blkdev_issue_flush);

/*
 * Allocate and initialize a flush queue on NUMA node @node.
 *
 * The embedded flush request is sized as struct request plus @cmd_size bytes,
 * rounded up to a multiple of the cache line size.  Both allocations are
 * zeroed and use @flags.
 *
 * Returns the new flush queue, or NULL if either allocation fails.
 */
struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
                                              gfp_t flags)
{
        int alloc_sz = sizeof(struct request);
        struct blk_flush_queue *fq;

        fq = kzalloc_node(sizeof(*fq), flags, node);
        if (!fq)
                return NULL;

        spin_lock_init(&fq->mq_flush_lock);

        alloc_sz = round_up(alloc_sz + cmd_size, cache_line_size());
        fq->flush_rq = kzalloc_node(alloc_sz, flags, node);
        if (!fq->flush_rq) {
                /* Undo the first allocation before reporting failure. */
                kfree(fq);
                return NULL;
        }

        INIT_LIST_HEAD(&fq->flush_queue[0]);
        INIT_LIST_HEAD(&fq->flush_queue[1]);

        return fq;
}

/*
 * Release a flush queue together with its embedded flush request.
 * A NULL @fq is accepted and ignored.
 */
void blk_free_flush_queue(struct blk_flush_queue *fq)
{
        /* A bio based request queue has no flush queue, so @fq may be NULL. */
        if (fq) {
                kfree(fq->flush_rq);
                kfree(fq);
        }
}

/*
 * Allow a driver to set its own lock class on fq->mq_flush_lock in order to
 * avoid a lockdep complaint.
 *
 * flush_end_io() may be called recursively by some drivers, such as
 * nvme-loop, so lockdep may complain about 'possible recursive locking'
 * because all 'struct blk_flush_queue' instances share the same
 * mq_flush_lock lock class key.  Such drivers need a different lock class
 * assigned to their fq->mq_flush_lock to avoid the lockdep warning.
 *
 * Using a dynamically allocated lock class key for each 'blk_flush_queue'
 * instance would be overkill and, worse, it introduces a horrible boot delay
 * because synchronize_rcu() is implied in lockdep_unregister_key(), which
 * would be called for each hctx release.  SCSI probing may synchronously
 * create and destroy lots of MQ request_queues for non-existent devices, and
 * some robot test kernels always enable the lockdep option.  More than half
 * an hour has been observed for SCSI MQ probing with a per-fq lock class.
 *
 * @hctx: hardware context whose flush queue lock gets the new class
 * @key:  lock class key supplied by the driver
 */
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
                struct lock_class_key *key)
{
        lockdep_set_class(&hctx->fq->mq_flush_lock, key);
}
EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);



















































   16 
















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Synchronous Cryptographic Hash operations.
 *
 * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
 */

#include <crypto/scatterwalk.h>
#include <linux/cryptouser.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <net/netlink.h>

#include "hash.h"

/*
 * Placeholder ->setkey() that ignores its arguments and always fails with
 * -ENOSYS.  shash_prepare_alg() installs it for algorithms that do not
 * provide a ->setkey() of their own.
 */
int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
                    unsigned int keylen)
{
        return -ENOSYS;
}
EXPORT_SYMBOL_GPL(shash_no_setkey);

/*
 * Flag @tfm with CRYPTO_TFM_NEED_KEY when crypto_shash_alg_needs_key()
 * reports that @alg requires a key; otherwise leave the flags alone.
 */
static void shash_set_needkey(struct crypto_shash *tfm, struct shash_alg *alg)
{
        if (!crypto_shash_alg_needs_key(alg))
                return;

        crypto_shash_set_flags(tfm, CRYPTO_TFM_NEED_KEY);
}

/*
 * Set the key of @tfm through the algorithm's ->setkey().
 *
 * On success CRYPTO_TFM_NEED_KEY is cleared and 0 is returned.  On failure
 * the flag is raised again for algorithms that need a key, and the error
 * from ->setkey() is returned.
 */
int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key,
                        unsigned int keylen)
{
        struct shash_alg *alg = crypto_shash_alg(tfm);
        int ret = alg->setkey(tfm, key, keylen);

        if (likely(!ret)) {
                crypto_shash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY);
                return 0;
        }

        shash_set_needkey(tfm, alg);
        return ret;
}
EXPORT_SYMBOL_GPL(crypto_shash_setkey);

int crypto_shash_update(struct shash_desc *desc, const u8 *data,
                        unsigned int len)
{
        return crypto_shash_alg(desc->tfm)->update(desc, data, len);
}
EXPORT_SYMBOL_GPL(crypto_shash_update);

/*
 * Invoke the ->final() handler of the algorithm behind @desc with @out as
 * the output buffer, and return that handler's result.
 */
int crypto_shash_final(struct shash_desc *desc, u8 *out)
{
        struct shash_alg *alg = crypto_shash_alg(desc->tfm);

        return alg->final(desc, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_final);

/*
 * Fallback ->finup(): run ->update() on the data and, only if that succeeded,
 * ->final().  Returns the first non-zero result, or the result of ->final().
 */
static int shash_default_finup(struct shash_desc *desc, const u8 *data,
                               unsigned int len, u8 *out)
{
        struct shash_alg *alg = crypto_shash_alg(desc->tfm);
        int err;

        err = alg->update(desc, data, len);
        if (err)
                return err;

        return alg->final(desc, out);
}

/*
 * Invoke the ->finup() handler of the algorithm behind @desc on @len bytes
 * at @data with @out as the output buffer, and return the handler's result.
 */
int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
                       unsigned int len, u8 *out)
{
        struct shash_alg *alg = crypto_shash_alg(desc->tfm);

        return alg->finup(desc, data, len, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_finup);

/*
 * Fallback ->digest(): run ->init() and, only if that succeeded, ->finup().
 * Returns the first non-zero result, or the result of ->finup().
 */
static int shash_default_digest(struct shash_desc *desc, const u8 *data,
                                unsigned int len, u8 *out)
{
        struct shash_alg *alg = crypto_shash_alg(desc->tfm);
        int err;

        err = alg->init(desc);
        if (err)
                return err;

        return alg->finup(desc, data, len, out);
}

/*
 * One-shot digest of @len bytes at @data into @out.
 *
 * Returns -ENOKEY while the transform still carries CRYPTO_TFM_NEED_KEY;
 * otherwise returns the result of the algorithm's ->digest().
 */
int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
                        unsigned int len, u8 *out)
{
        struct crypto_shash *tfm = desc->tfm;

        if (!(crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY))
                return crypto_shash_alg(tfm)->digest(desc, data, len, out);

        return -ENOKEY;
}
EXPORT_SYMBOL_GPL(crypto_shash_digest);

/*
 * Digest @len bytes at @data into @out using a descriptor that lives on the
 * stack.  The descriptor is wiped with shash_desc_zero() before returning.
 *
 * Returns the result of crypto_shash_digest().
 */
int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data,
                            unsigned int len, u8 *out)
{
        SHASH_DESC_ON_STACK(sdesc, tfm);
        int ret;

        sdesc->tfm = tfm;
        ret = crypto_shash_digest(sdesc, data, len, out);

        /* Wipe the on-stack descriptor before leaving. */
        shash_desc_zero(sdesc);

        return ret;
}
EXPORT_SYMBOL_GPL(crypto_shash_tfm_digest);

/*
 * Export the hash state of @desc into @out.
 *
 * Without an ->export() handler the descriptor context is copied verbatim
 * (crypto_shash_descsize() bytes) and 0 is returned; otherwise the handler's
 * result is returned.
 */
int crypto_shash_export(struct shash_desc *desc, void *out)
{
        struct crypto_shash *tfm = desc->tfm;
        struct shash_alg *alg = crypto_shash_alg(tfm);

        if (!alg->export) {
                memcpy(out, shash_desc_ctx(desc), crypto_shash_descsize(tfm));
                return 0;
        }

        return alg->export(desc, out);
}
EXPORT_SYMBOL_GPL(crypto_shash_export);

/*
 * Import a previously exported hash state from @in into @desc.
 *
 * Returns -ENOKEY while the transform still carries CRYPTO_TFM_NEED_KEY.
 * Without an ->import() handler the state is copied verbatim into the
 * descriptor context (crypto_shash_descsize() bytes) and 0 is returned;
 * otherwise the handler's result is returned.
 */
int crypto_shash_import(struct shash_desc *desc, const void *in)
{
        struct crypto_shash *tfm = desc->tfm;
        struct shash_alg *alg = crypto_shash_alg(tfm);

        if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
                return -ENOKEY;

        if (!alg->import) {
                memcpy(shash_desc_ctx(desc), in, crypto_shash_descsize(tfm));
                return 0;
        }

        return alg->import(desc, in);
}
EXPORT_SYMBOL_GPL(crypto_shash_import);

static void crypto_shash_exit_tfm(struct crypto_tfm *tfm)
{
        struct crypto_shash *hash = __crypto_shash_cast(tfm);
        struct shash_alg *alg = crypto_shash_alg(hash);

        alg->exit_tfm(hash);
}

/*
 * Frontend ->init_tfm hook for shash transforms.
 *
 * Seeds the descriptor size from the algorithm, raises CRYPTO_TFM_NEED_KEY
 * for algorithms that need a key, hooks up the exit handler and finally runs
 * the algorithm's optional ->init_tfm().
 *
 * Returns 0 on success, the error from ->init_tfm(), or -EINVAL if
 * ->init_tfm() left the descriptor size above HASH_MAX_DESCSIZE.
 */
static int crypto_shash_init_tfm(struct crypto_tfm *tfm)
{
        struct crypto_shash *shash_tfm = __crypto_shash_cast(tfm);
        struct shash_alg *alg = crypto_shash_alg(shash_tfm);

        shash_tfm->descsize = alg->descsize;

        shash_set_needkey(shash_tfm, alg);

        if (alg->exit_tfm)
                tfm->exit = crypto_shash_exit_tfm;

        if (alg->init_tfm) {
                int err = alg->init_tfm(shash_tfm);

                if (err)
                        return err;

                /* ->init_tfm() may have increased the descsize. */
                if (WARN_ON_ONCE(shash_tfm->descsize > HASH_MAX_DESCSIZE)) {
                        if (alg->exit_tfm)
                                alg->exit_tfm(shash_tfm);
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * Frontend ->free hook: convert the generic instance to its shash instance
 * and hand it to that instance's own ->free() callback.
 */
static void crypto_shash_free_instance(struct crypto_instance *inst)
{
        struct shash_instance *sinst = shash_instance(inst);

        sinst->free(sinst);
}

/*
 * Frontend ->report hook: describe @alg as a "shash" with its block and
 * digest sizes in a CRYPTOCFGA_REPORT_HASH attribute appended to @skb.
 *
 * Returns the result of nla_put().
 */
static int __maybe_unused crypto_shash_report(
        struct sk_buff *skb, struct crypto_alg *alg)
{
        struct shash_alg *salg = __crypto_shash_alg(alg);
        struct crypto_report_hash rhash;

        /* rhash is copied out in full below, so start from all zeroes. */
        memset(&rhash, 0, sizeof(rhash));

        rhash.digestsize = salg->digestsize;
        rhash.blocksize = alg->cra_blocksize;
        strscpy(rhash.type, "shash", sizeof(rhash.type));

        return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash);
}

static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
        __maybe_unused;
static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
{
        struct shash_alg *salg = __crypto_shash_alg(alg);

        seq_printf(m, "type         : shash\n");
        seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
        seq_printf(m, "digestsize   : %u\n", salg->digestsize);
}

/*
 * Frontend descriptor for synchronous hashes.  It wires the shash-specific
 * init/free hooks (plus the optional procfs and crypto-user reporting hooks)
 * and sets the algorithm type and masks used for this frontend.
 */
const struct crypto_type crypto_shash_type = {
        /* Algorithm type and type masks. */
        .type = CRYPTO_ALG_TYPE_SHASH,
        .maskset = CRYPTO_ALG_TYPE_MASK,
        .maskclear = ~CRYPTO_ALG_TYPE_MASK,

        /* Transform sizing. */
        .tfmsize = offsetof(struct crypto_shash, base),
        .extsize = crypto_alg_extsize,

        /* Lifetime hooks. */
        .init_tfm = crypto_shash_init_tfm,
        .free = crypto_shash_free_instance,

        /* Optional reporting hooks. */
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        .report = crypto_shash_report,
#endif
#ifdef CONFIG_PROC_FS
        .show = crypto_shash_show,
#endif
};

/*
 * Grab the algorithm @name for @inst through @spawn, restricted to the shash
 * frontend.  @type and @mask are passed through to crypto_grab_spawn().
 *
 * Returns the result of crypto_grab_spawn().
 */
int crypto_grab_shash(struct crypto_shash_spawn *spawn,
                      struct crypto_instance *inst,
                      const char *name, u32 type, u32 mask)
{
        /* The frontend is set before the spawn is grabbed. */
        spawn->base.frontend = &crypto_shash_type;
        return crypto_grab_spawn(&spawn->base, inst, name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_grab_shash);

/*
 * Allocate a shash transform for @alg_name.  @type and @mask are passed
 * through to crypto_alloc_tfm() together with the shash frontend, and its
 * result is returned unchanged.
 */
struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
                                        u32 mask)
{
        return crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_alloc_shash);

/*
 * Query whether an algorithm named @alg_name is available for the shash
 * frontend.  @type and @mask are passed through to crypto_type_has_alg(),
 * whose result is returned unchanged.
 */
int crypto_has_shash(const char *alg_name, u32 type, u32 mask)
{
        return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_has_shash);

/*
 * Clone the transform @hash.
 *
 * For algorithms without a setkey the existing transform is shared: a
 * reference is taken with crypto_tfm_get() and @hash itself is returned.
 * Otherwise a new transform is created with crypto_clone_tfm(), inherits the
 * descriptor size of @hash and is finished by the algorithm's optional
 * ->clone_tfm().
 *
 * Returns the (possibly shared) transform, or an ERR_PTR(): -ENOSYS when the
 * algorithm has an init hook but no ->clone_tfm(), or the error reported by
 * crypto_tfm_get(), crypto_clone_tfm() or ->clone_tfm().
 */
struct crypto_shash *crypto_clone_shash(struct crypto_shash *hash)
{
        struct crypto_tfm *tfm = crypto_shash_tfm(hash);
        struct shash_alg *alg = crypto_shash_alg(hash);
        struct crypto_shash *clone;
        int err;

        if (!crypto_shash_alg_has_setkey(alg)) {
                struct crypto_tfm *ref = crypto_tfm_get(tfm);

                if (IS_ERR(ref))
                        return ERR_CAST(ref);

                return hash;
        }

        if (!alg->clone_tfm) {
                /* An init hook cannot be replayed on the clone. */
                if (alg->init_tfm || alg->base.cra_init)
                        return ERR_PTR(-ENOSYS);
        }

        clone = crypto_clone_tfm(&crypto_shash_type, tfm);
        if (IS_ERR(clone))
                return clone;

        clone->descsize = hash->descsize;

        if (!alg->clone_tfm)
                return clone;

        err = alg->clone_tfm(clone, hash);
        if (!err)
                return clone;

        crypto_free_shash(clone);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(crypto_clone_shash);

/*
 * Validate the common part of a hash algorithm and clear the type bits in
 * its cra_flags.
 *
 * Returns -EINVAL if the digest size exceeds HASH_MAX_DIGESTSIZE or an
 * alignmask is set, 0 otherwise.
 */
int hash_prepare_alg(struct hash_alg_common *alg)
{
        if (alg->digestsize > HASH_MAX_DIGESTSIZE)
                return -EINVAL;

        /* alignmask is not useful for hashes, so it is not supported. */
        if (alg->base.cra_alignmask)
                return -EINVAL;

        alg->base.cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
        return 0;
}

/*
 * Validate @alg and get it ready for registration as a shash: check the
 * descriptor size and the export/import pairing, run the common hash checks,
 * stamp the shash type and fill in defaults for optional handlers.
 *
 * Returns 0 on success or -EINVAL (possibly from hash_prepare_alg()).
 */
static int shash_prepare_alg(struct shash_alg *alg)
{
        struct crypto_alg *base = &alg->halg.base;
        int err;

        if (alg->descsize > HASH_MAX_DESCSIZE)
                return -EINVAL;

        /* ->export and ->import come as a pair: both set or both NULL. */
        if (!alg->export != !alg->import)
                return -EINVAL;

        err = hash_prepare_alg(&alg->halg);
        if (err)
                return err;

        base->cra_type = &crypto_shash_type;
        base->cra_flags |= CRYPTO_ALG_TYPE_SHASH;

        /*
         * Deal with optional functions that were left out.  Each of them
         * can either get a default installed here, or stay NULL and be
         * checked for in crypto_shash_*(), which saves an indirect call
         * whenever the default behavior is wanted.  ->finup and ->digest
         * get defaults, because algorithms should implement them anyway for
         * optimal performance.  ->import and ->export stay NULL, because
         * the common and fastest case is a plain memcpy of the
         * shash_desc_ctx, which is then done without an indirect call.
         */
        if (!alg->setkey)
                alg->setkey = shash_no_setkey;
        if (!alg->digest)
                alg->digest = shash_default_digest;
        if (!alg->finup)
                alg->finup = shash_default_finup;
        if (!alg->export)
                alg->halg.statesize = alg->descsize;

        return 0;
}

/*
 * Prepare @alg with shash_prepare_alg() and register it.
 *
 * Returns the preparation error if there is one, otherwise the result of
 * crypto_register_alg().
 */
int crypto_register_shash(struct shash_alg *alg)
{
        int err = shash_prepare_alg(alg);

        if (err)
                return err;

        return crypto_register_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_register_shash);

/* Unregister @alg by passing its embedded crypto_alg to crypto_unregister_alg(). */
void crypto_unregister_shash(struct shash_alg *alg)
{
        crypto_unregister_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_unregister_shash);

/*
 * Register the first @count entries of @algs, all or nothing.
 *
 * If any registration fails, the entries registered before it are
 * unregistered again in reverse order and the error is returned.
 * Returns 0 when every entry was registered.
 */
int crypto_register_shashes(struct shash_alg *algs, int count)
{
        int idx;

        for (idx = 0; idx < count; idx++) {
                int err = crypto_register_shash(&algs[idx]);

                if (!err)
                        continue;

                /* Roll back what was registered so far, newest first. */
                while (--idx >= 0)
                        crypto_unregister_shash(&algs[idx]);

                return err;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(crypto_register_shashes);

/*
 * Unregister the first @count entries of @algs, walking the array from the
 * last entry down to the first.
 */
void crypto_unregister_shashes(struct shash_alg *algs, int count)
{
        int idx = count;

        while (--idx >= 0)
                crypto_unregister_shash(&algs[idx]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_shashes);

/*
 * Register the shash instance @inst with the template @tmpl.
 *
 * The instance must provide a ->free() callback; a missing one triggers a
 * warning and -EINVAL.  Otherwise returns the error from shash_prepare_alg()
 * or the result of crypto_register_instance().
 */
int shash_register_instance(struct crypto_template *tmpl,
                            struct shash_instance *inst)
{
        int err;

        if (WARN_ON(!inst->free))
                return -EINVAL;

        err = shash_prepare_alg(&inst->alg);
        if (!err)
                err = crypto_register_instance(tmpl,
                                               shash_crypto_instance(inst));

        return err;
}
EXPORT_SYMBOL_GPL(shash_register_instance);

/*
 * Free an instance whose context is a single spawn: drop the spawn found at
 * shash_instance_ctx(), then release the instance memory with kfree().
 */
void shash_free_singlespawn_instance(struct shash_instance *inst)
{
        struct crypto_spawn *spawn = shash_instance_ctx(inst);

        crypto_drop_spawn(spawn);
        kfree(inst);
}
EXPORT_SYMBOL_GPL(shash_free_singlespawn_instance);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Synchronous cryptographic hash type");





















    2 















































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KERNEL_PRINTK__
#define __KERNEL_PRINTK__

#include <linux/stdarg.h>
#include <linux/init.h>
#include <linux/kern_levels.h>
#include <linux/linkage.h>
#include <linux/ratelimit_types.h>
#include <linux/once_lite.h>

/* Banner strings; only declared here. */
extern const char linux_banner[];
extern const char linux_proc_banner[];

extern int oops_in_progress;        /* If set, an oops, panic(), BUG() or die() is in progress */

/*
 * Size of a single level header.  Matches the two bytes (KERN_SOH_ASCII plus
 * one level character) that printk_skip_level() steps over.
 */
#define PRINTK_MAX_SINGLE_HEADER_LEN 2

/*
 * Return the level character of the header at the start of @buffer, or 0 if
 * @buffer does not start with a header.
 *
 * A header is KERN_SOH_ASCII followed by '0'..'7' or by 'c' (KERN_CONT).
 */
static inline int printk_get_level(const char *buffer)
{
        char level;

        if (buffer[0] != KERN_SOH_ASCII)
                return 0;

        /* A NUL here fails both checks below and yields 0 as well. */
        level = buffer[1];
        if ((level >= '0' && level <= '7') || level == 'c')
                return level;

        return 0;
}

/*
 * Return a pointer just past the two-byte level header at the start of
 * @buffer, or @buffer itself when it does not start with a header.
 */
static inline const char *printk_skip_level(const char *buffer)
{
        return printk_get_level(buffer) ? buffer + 2 : buffer;
}

/*
 * Return a pointer past every consecutive level header at the start of
 * @buffer, i.e. to the first position that is not a header.
 */
static inline const char *printk_skip_headers(const char *buffer)
{
        const char *cursor;

        for (cursor = buffer; printk_get_level(cursor); )
                cursor = printk_skip_level(cursor);

        return cursor;
}

/* printk() calls without an explicit loglevel use this one. */
#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT

/*
 * Named console loglevels.  Everything that is MORE important than the
 * selected level is shown.
 */
#define CONSOLE_LOGLEVEL_SILENT  0 /* Mum's the word */
#define CONSOLE_LOGLEVEL_MIN         1 /* Minimum loglevel we let people use */
#define CONSOLE_LOGLEVEL_DEBUG        10 /* issue debug messages */
#define CONSOLE_LOGLEVEL_MOTORMOUTH 15        /* You can't shut this one up */

/*
 * The default used to be hard-coded at 7 and quiet used to be hard-coded
 * at 4; both are now taken from the kernel config.
 */
#define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT
#define CONSOLE_LOGLEVEL_QUIET         CONFIG_CONSOLE_LOGLEVEL_QUIET

int add_preferred_console_match(const char *match, const char *name,
                                const short idx);

extern int console_printk[];

#define console_loglevel (console_printk[0])
#define default_message_loglevel (console_printk[1])
#define minimum_console_loglevel (console_printk[2])
#define default_console_loglevel (console_printk[3])

extern void console_verbose(void);

/* strlen("ratelimit") + 1 */
#define DEVKMSG_STR_MAX_SIZE 10
extern char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE];
struct ctl_table;

extern int suppress_printk;

/*
 * Bundles a format string with a pointer to the va_list holding its
 * arguments, so that both can be passed around as a single object.
 * NOTE(review): presumably consumed by the "%pV" format specifier - confirm
 * against the vsprintf implementation.
 */
struct va_format {
        const char *fmt;        /* format string */
        va_list *va;            /* arguments for @fmt */
};

/*
 * FW_BUG
 * Add this to a message when you are sure the firmware is buggy or behaves
 * really stupidly or out of spec. Be aware that the responsible BIOS
 * developer should be able to fix the issue, or at least get a concrete idea
 * of the problem, by reading your message without needing to look at the
 * kernel code.
 *
 * Use it for definite and high priority BIOS bugs.
 *
 * FW_WARN
 * Use it for less clear-cut cases (e.g. could the kernel have messed things
 * up already?) and medium priority BIOS bugs.
 *
 * FW_INFO
 * Use this one if you want to tell the user or vendor about something
 * suspicious, but generally harmless, related to the firmware.
 *
 * Use it for information or very low priority BIOS bugs.
 */
#define FW_BUG                "[Firmware Bug]: "
#define FW_WARN                "[Firmware Warn]: "
#define FW_INFO                "[Firmware Info]: "

/*
 * HW_ERR
 * Add this to a message for hardware errors, so that the user can report
 * it to the hardware vendor instead of LKML or the software vendor.
 */
#define HW_ERR                "[Hardware Error]: "

/*
 * DEPRECATED
 * Add this to a message whenever you want to warn user space about the use
 * of a deprecated aspect of an API, so that they can stop using it.
 */
#define DEPRECATED        "[Deprecated]: "

/*
 * Dummy printk for disabled debugging statements to use whilst maintaining
 * gcc's format checking.
 *
 * The _printk() call sits behind "if (0)", so it is still compiled (and its
 * format and arguments are still checked) but never executed.  The statement
 * expression as a whole evaluates to 0.
 */
#define no_printk(fmt, ...)                                \
({                                                        \
        if (0)                                                \
                _printk(fmt, ##__VA_ARGS__);                \
        0;                                                \
})

/*
 * early_printk() is a real, externally defined function only when
 * CONFIG_EARLY_PRINTK is set; otherwise it is an empty inline stub that
 * still carries the printf format attribute.
 */
#ifdef CONFIG_EARLY_PRINTK
extern asmlinkage __printf(1, 2)
void early_printk(const char *fmt, ...);
#else
static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif

/* Forward declaration; only pointers to it are used in this header. */
struct dev_printk_info;

/*
 * With CONFIG_PRINTK the printk interfaces below are real functions that are
 * only declared here.  The #else branch provides no-op replacements.
 */
#ifdef CONFIG_PRINTK
asmlinkage __printf(4, 0)
int vprintk_emit(int facility, int level,
                 const struct dev_printk_info *dev_info,
                 const char *fmt, va_list args);

asmlinkage __printf(1, 0)
int vprintk(const char *fmt, va_list args);

asmlinkage __printf(1, 2) __cold
int _printk(const char *fmt, ...);

/*
 * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
 */
__printf(1, 2) __cold int _printk_deferred(const char *fmt, ...);

extern void __printk_safe_enter(void);
extern void __printk_safe_exit(void);
/*
 * The printk_deferred_enter/exit macros are available only as a hack for
 * some code paths that need to defer all printk console printing. Interrupts
 * must be disabled for the deferred duration.
 */
#define printk_deferred_enter __printk_safe_enter
#define printk_deferred_exit __printk_safe_exit

/*
 * Please don't use printk_ratelimit(), because it shares ratelimiting state
 * with all other unrelated printk_ratelimit() callsites.  Instead use
 * printk_ratelimited() or plain old __ratelimit().
 */
extern int __printk_ratelimit(const char *func);
/* The calling function's name (__func__) is passed as @func. */
#define printk_ratelimit() __printk_ratelimit(__func__)
extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                                   unsigned int interval_msec);

extern int printk_delay_msec;
extern int dmesg_restrict;

extern void wake_up_klogd(void);

/* Log buffer accessors and setup. */
char *log_buf_addr_get(void);
u32 log_buf_len_get(void);
void log_buf_vmcoreinfo_setup(void);
void __init setup_log_buf(int early);
/* Stack and register dump helpers. */
__printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
void dump_stack_print_info(const char *log_lvl);
void show_regs_print_info(const char *log_lvl);
extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold;
extern asmlinkage void dump_stack(void) __cold;
void printk_trigger_flush(void);
void console_replay_all(void);
#else
static inline __printf(1, 0)
int vprintk(const char *s, va_list args)
{
        return 0;
}
static inline __printf(1, 2) __cold
int _printk(const char *s, ...)
{
        return 0;
}
static inline __printf(1, 2) __cold
int _printk_deferred(const char *s, ...)
{
        return 0;
}

static inline void printk_deferred_enter(void)
{
}

static inline void printk_deferred_exit(void)
{
}

static inline int printk_ratelimit(void)
{
        return 0;
}
static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                                          unsigned int interval_msec)
{
        return false;
}

static inline void wake_up_klogd(void)
{
}

static inline char *log_buf_addr_get(void)
{
        return NULL;
}

static inline u32 log_buf_len_get(void)
{
        return 0;
}

static inline void log_buf_vmcoreinfo_setup(void)
{
}

static inline void setup_log_buf(int early)
{
}

static inline __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...)
{
}

static inline void dump_stack_print_info(const char *log_lvl)
{
}

static inline void show_regs_print_info(const char *log_lvl)
{
}

static inline void dump_stack_lvl(const char *log_lvl)
{
}

static inline void dump_stack(void)
{
}
static inline void printk_trigger_flush(void)
{
}
/* No-op stub for the #else build (presumably !CONFIG_PRINTK - confirm). */
static inline void console_replay_all(void)
{
}
#endif

bool this_cpu_in_panic(void);

/*
 * Low-level helpers behind printk_cpu_sync_get_irqsave() and
 * printk_cpu_sync_put_irqrestore(). With CONFIG_SMP they are out-of-line
 * functions declared here. Without it they are macros: the try-get always
 * evaluates to true, and wait/put expand to nothing.
 */
#ifdef CONFIG_SMP
extern int __printk_cpu_sync_try_get(void);
extern void __printk_cpu_sync_wait(void);
extern void __printk_cpu_sync_put(void);

#else

#define __printk_cpu_sync_try_get() true
#define __printk_cpu_sync_wait()
#define __printk_cpu_sync_put()
#endif /* CONFIG_SMP */

/**
 * printk_cpu_sync_get_irqsave() - Disable interrupts and acquire the printk
 *                                 cpu-reentrant spinning lock.
 * @flags: Stack-allocated storage for saving local interrupt state,
 *         to be passed to printk_cpu_sync_put_irqrestore().
 *
 * If the lock is owned by another CPU, spin until it becomes available.
 * Interrupts are restored while spinning.
 *
 * Each attempt saves and disables local interrupts, then tries to take the
 * lock. On failure the saved interrupt state is restored before waiting and
 * retrying, so on return interrupts are disabled and @flags holds the state
 * saved by the successful attempt.
 *
 * The expansion is wrapped in do { } while (0) so that it is a single
 * statement and can be used as the unbraced body of an if/else.
 *
 * CAUTION: This function must be used carefully. It does not behave like a
 * typical lock. Here are important things to watch out for...
 *
 *     * This function is reentrant on the same CPU. Therefore the calling
 *       code must not assume exclusive access to data if code accessing the
 *       data can run reentrant or within NMI context on the same CPU.
 *
 *     * If there exists usage of this function from NMI context, it becomes
 *       unsafe to perform any type of locking or spinning to wait for other
 *       CPUs after calling this function from any context. This includes
 *       using spinlocks or any other busy-waiting synchronization methods.
 */
#define printk_cpu_sync_get_irqsave(flags)                        \
        do {                                                        \
                for (;;) {                                        \
                        local_irq_save(flags);                        \
                        if (__printk_cpu_sync_try_get())        \
                                break;                                \
                        local_irq_restore(flags);                \
                        __printk_cpu_sync_wait();                \
                }                                                \
        } while (0)

/**
 * printk_cpu_sync_put_irqrestore() - Release the printk cpu-reentrant spinning
 *                                    lock and restore interrupts.
 * @flags: Caller's saved interrupt state, from printk_cpu_sync_get_irqsave().
 *
 * The lock is released first; the saved interrupt state is restored
 * afterwards, mirroring the acquire order in reverse.
 */
#define printk_cpu_sync_put_irqrestore(flags)        \
        do {                                        \
                __printk_cpu_sync_put();        \
                local_irq_restore(flags);        \
        } while (0)

extern int kptr_restrict;

/**
 * pr_fmt - used by the pr_*() macros to generate the printk format string
 * @fmt: format string passed from a pr_*() macro
 *
 * This macro can be used to generate a unified format string for pr_*()
 * macros. A common use is to prefix all pr_*() messages in a file with a common
 * string. For example, defining this at the top of a source file:
 *
 *        #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 *
 * would prefix all pr_info, pr_emerg... messages in the file with the module
 * name.
 */
#ifndef pr_fmt
#define pr_fmt(fmt) fmt
#endif

struct module;

#ifdef CONFIG_PRINTK_INDEX
/*
 * One printk index record describing a single printk call site.
 * __printk_index_emit() creates one static instance per call site whose
 * format and level are compile-time constants.
 */
struct pi_entry {
        /* Format string of the call site. */
        const char *fmt;
        /* Enclosing function name (__func__ at the call site). */
        const char *func;
        /* Source file (__FILE__ at the call site). */
        const char *file;
        /* Source line (__LINE__ at the call site). */
        unsigned int line;

        /*
         * While printk and pr_* have the level stored in the string at compile
         * time, some subsystems dynamically add it at runtime through the
         * format string. For these dynamic cases, we allow the subsystem to
         * tell us the level at compile time.
         *
         * NULL indicates that the level, if any, is stored in fmt.
         */
        const char *level;

        /*
         * The format string used by various subsystem specific printk()
         * wrappers to prefix the message.
         *
         * Note that the static prefix defined by the pr_fmt() macro is stored
         * directly in the message format (@fmt), not here.
         */
        const char *subsys_fmt_prefix;
} __packed;

/*
 * Record a printk call site in the printk index.
 *
 * @_fmt:               format string of the call site
 * @_level:             log level string, or NULL if the level is part of @_fmt
 * @_subsys_fmt_prefix: subsystem prefix format, or NULL
 *
 * When both @_fmt and @_level are compile-time constants, this defines a
 * static struct pi_entry holding the format, __func__, __FILE__, __LINE__,
 * level and prefix, plus a static pointer to it placed in the
 * ".printk_index" section. Otherwise nothing is emitted. No code runs at
 * the call site either way; the macro only creates static data.
 */
#define __printk_index_emit(_fmt, _level, _subsys_fmt_prefix)                \
        do {                                                                \
                if (__builtin_constant_p(_fmt) && __builtin_constant_p(_level)) { \
                        /*
                         * We check __builtin_constant_p multiple times here
                         * for the same input because GCC will produce an error
                         * if we try to assign a static variable to fmt if it
                         * is not a constant, even with the outer if statement.
                         */                                                \
                        static const struct pi_entry _entry                \
                        __used = {                                        \
                                .fmt = __builtin_constant_p(_fmt) ? (_fmt) : NULL, \
                                .func = __func__,                        \
                                .file = __FILE__,                        \
                                .line = __LINE__,                        \
                                .level = __builtin_constant_p(_level) ? (_level) : NULL, \
                                .subsys_fmt_prefix = _subsys_fmt_prefix,\
                        };                                                \
                        static const struct pi_entry *_entry_ptr        \
                        __used __section(".printk_index") = &_entry;        \
                }                                                        \
        } while (0)

#else /* !CONFIG_PRINTK_INDEX */
#define __printk_index_emit(...) do {} while (0)
#endif /* CONFIG_PRINTK_INDEX */

/*
 * Some subsystems have their own custom printk that applies a va_format to a
 * generic format, for example, to include a device number or other metadata
 * alongside the format supplied by the caller.
 *
 * In order to store these in the way they would be emitted by the printk
 * infrastructure, the subsystem provides us with the start, fixed string, and
 * any subsequent text in the format string.
 *
 * We take a variable argument list as pr_fmt/dev_fmt/etc are sometimes passed
 * as multiple arguments (eg: `"%s: ", "blah"`), and we must only take the
 * first one.
 *
 * subsys_fmt_prefix must be known at compile time, or compilation will fail
 * (since this is a mistake). If fmt or level is not known at compile time, no
 * index entry will be made (since this can legitimately happen).
 */
#define printk_index_subsys_emit(subsys_fmt_prefix, level, fmt, ...) \
        __printk_index_emit(fmt, level, subsys_fmt_prefix)

/*
 * Wrap a printk-style function so that its call site is indexed.
 *
 * @_p_func: function to call with the format and arguments
 * @_fmt:    format string
 *
 * Statement expression: first records an index entry for @_fmt (with no
 * separate level and no subsystem prefix), then calls @_p_func. The value of
 * the whole expression is the return value of @_p_func.
 */
#define printk_index_wrap(_p_func, _fmt, ...)                                \
        ({                                                                \
                __printk_index_emit(_fmt, NULL, NULL);                        \
                _p_func(_fmt, ##__VA_ARGS__);                                \
        })


/**
 * printk - print a kernel message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This is printk(). It can be called from any context. We want it to work.
 *
 * printk() always expands to printk_index_wrap(), which calls _printk().
 * If printk indexing is disabled, the index emission inside that wrapper
 * expands to an empty statement, so only the _printk() call remains.
 *
 * We try to grab the console_lock. If we succeed, it's easy - we log the
 * output and call the console drivers.  If we fail to get the semaphore, we
 * place the output into the log buffer and return. The current holder of
 * the console_sem will notice the new output in console_unlock(); and will
 * send it to the consoles before releasing the lock.
 *
 * One effect of this deferred printing is that code which calls printk() and
 * then changes console_loglevel may break. This is because console_loglevel
 * is inspected when the actual printing occurs.
 *
 * See also:
 * printf(3)
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
#define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
/* Same wrapper as printk(), but the message is handed to _printk_deferred(). */
#define printk_deferred(fmt, ...)                                        \
        printk_index_wrap(_printk_deferred, fmt, ##__VA_ARGS__)

/**
 * pr_emerg - Print an emergency-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_EMERG loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_emerg(fmt, ...) \
        printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_alert - Print an alert-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_ALERT loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_alert(fmt, ...) \
        printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_crit - Print a critical-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_CRIT loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_crit(fmt, ...) \
        printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_err - Print an error-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_ERR loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_err(fmt, ...) \
        printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_warn - Print a warning-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_WARNING loglevel. It uses pr_fmt()
 * to generate the format string.
 */
#define pr_warn(fmt, ...) \
        printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_notice - Print a notice-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_NOTICE loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_notice(fmt, ...) \
        printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_info - Print an info-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_INFO loglevel. It uses pr_fmt() to
 * generate the format string.
 */
#define pr_info(fmt, ...) \
        printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)

/**
 * pr_cont - Continues a previous log message in the same line.
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_CONT loglevel. It should only be
 * used when continuing a log message with no newline ('\n') enclosed. Otherwise
 * it defaults back to KERN_DEFAULT loglevel.
 */
#define pr_cont(fmt, ...) \
        printk(KERN_CONT fmt, ##__VA_ARGS__)

/**
 * pr_devel - Print a debug-level message conditionally
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to a printk with KERN_DEBUG loglevel if DEBUG is
 * defined. Otherwise it does nothing.
 *
 * It uses pr_fmt() to generate the format string.
 */
#ifdef DEBUG
#define pr_devel(fmt, ...) \
        printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif


/* If you are writing a driver, please use dev_dbg instead */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#include <linux/dynamic_debug.h>

/**
 * pr_debug - Print a debug-level message conditionally
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This macro expands to dynamic_pr_debug() if CONFIG_DYNAMIC_DEBUG is set
 * (or if CONFIG_DYNAMIC_DEBUG_CORE is set and DYNAMIC_DEBUG_MODULE is
 * defined). Otherwise, if DEBUG is defined, it's equivalent to a printk with
 * KERN_DEBUG loglevel. If DEBUG is not defined it does nothing.
 *
 * It uses pr_fmt() to generate the format string (dynamic_pr_debug() uses
 * pr_fmt() internally).
 */
#define pr_debug(fmt, ...)                        \
        dynamic_pr_debug(fmt, ##__VA_ARGS__)
#elif defined(DEBUG)
#define pr_debug(fmt, ...) \
        printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * Print a one-time message (analogous to WARN_ONCE() et al):
 */

#ifdef CONFIG_PRINTK
#define printk_once(fmt, ...)                                        \
        DO_ONCE_LITE(printk, fmt, ##__VA_ARGS__)
#define printk_deferred_once(fmt, ...)                                \
        DO_ONCE_LITE(printk_deferred, fmt, ##__VA_ARGS__)
#else
#define printk_once(fmt, ...)                                        \
        no_printk(fmt, ##__VA_ARGS__)
#define printk_deferred_once(fmt, ...)                                \
        no_printk(fmt, ##__VA_ARGS__)
#endif

#define pr_emerg_once(fmt, ...)                                        \
        printk_once(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
#define pr_alert_once(fmt, ...)                                        \
        printk_once(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_crit_once(fmt, ...)                                        \
        printk_once(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err_once(fmt, ...)                                        \
        printk_once(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn_once(fmt, ...)                                        \
        printk_once(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
#define pr_notice_once(fmt, ...)                                \
        printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info_once(fmt, ...)                                        \
        printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
/* no pr_cont_once, don't do that... */

#if defined(DEBUG)
#define pr_devel_once(fmt, ...)                                        \
        printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel_once(fmt, ...)                                        \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * If you are writing a driver, please use dev_dbg instead.
 *
 * One-shot debug message. Unlike pr_debug(), this is not routed through
 * dynamic debug: it depends only on DEBUG. With DEBUG defined it prints once
 * at KERN_DEBUG via printk_once(); otherwise it expands to no_printk().
 */
#if defined(DEBUG)
#define pr_debug_once(fmt, ...)                                        \
        printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug_once(fmt, ...)                                        \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * ratelimited messages with local ratelimit_state,
 * no local ratelimit_state used in the !PRINTK case
 */
#ifdef CONFIG_PRINTK
#define printk_ratelimited(fmt, ...)                                        \
({                                                                        \
        static DEFINE_RATELIMIT_STATE(_rs,                                \
                                      DEFAULT_RATELIMIT_INTERVAL,        \
                                      DEFAULT_RATELIMIT_BURST);                \
                                                                        \
        if (__ratelimit(&_rs))                                                \
                printk(fmt, ##__VA_ARGS__);                                \
})
#else
#define printk_ratelimited(fmt, ...)                                        \
        no_printk(fmt, ##__VA_ARGS__)
#endif

#define pr_emerg_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
#define pr_alert_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_crit_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
#define pr_notice_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
/* no pr_cont_ratelimited, don't do that... */

#if defined(DEBUG)
#define pr_devel_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel_ratelimited(fmt, ...)                                        \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/* If you are writing a driver, please use dev_dbg instead */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* descriptor check is first to prevent flooding with "callbacks suppressed" */
#define pr_debug_ratelimited(fmt, ...)                                        \
do {                                                                        \
        static DEFINE_RATELIMIT_STATE(_rs,                                \
                                      DEFAULT_RATELIMIT_INTERVAL,        \
                                      DEFAULT_RATELIMIT_BURST);                \
        DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt));                \
        if (DYNAMIC_DEBUG_BRANCH(descriptor) &&                                \
            __ratelimit(&_rs))                                                \
                __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__);        \
} while (0)
#elif defined(DEBUG)
#define pr_debug_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug_ratelimited(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

extern const struct file_operations kmsg_fops;

/*
 * Values for the prefix_type argument of the hex dump helpers: selects
 * whether each line is prefixed with nothing, an address, or an offset.
 */
enum {
        DUMP_PREFIX_NONE,
        DUMP_PREFIX_ADDRESS,
        DUMP_PREFIX_OFFSET
};
extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
                              int groupsize, char *linebuf, size_t linebuflen,
                              bool ascii);
#ifdef CONFIG_PRINTK
extern void print_hex_dump(const char *level, const char *prefix_str,
                           int prefix_type, int rowsize, int groupsize,
                           const void *buf, size_t len, bool ascii);
#else
/* No-op stub used when CONFIG_PRINTK is not set; all arguments are ignored. */
static inline void print_hex_dump(const char *level, const char *prefix_str,
                                  int prefix_type, int rowsize, int groupsize,
                                  const void *buf, size_t len, bool ascii)
{
}
/*
 * No-op stub used when CONFIG_PRINTK is not set.
 * NOTE(review): a macro named print_hex_dump_bytes() is defined further down
 * in this header, so later callers expand that macro rather than calling
 * this function - confirm whether this stub is still needed.
 */
static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
                                        const void *buf, size_t len)
{
}

#endif

#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize,        \
                             groupsize, buf, len, ascii)        \
        dynamic_hex_dump(prefix_str, prefix_type, rowsize,        \
                         groupsize, buf, len, ascii)
#elif defined(DEBUG)
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize,                \
                             groupsize, buf, len, ascii)                \
        print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize,        \
                       groupsize, buf, len, ascii)
#else
/*
 * Fallback used when neither dynamic debug nor DEBUG is enabled: a no-op
 * taking the same arguments as the macro variants above, so callers still
 * compile and get their arguments type-checked.
 */
static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
                                        int rowsize, int groupsize,
                                        const void *buf, size_t len, bool ascii)
{
}
#endif

/**
 * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params
 * @prefix_str: string to prefix each line with;
 *  caller supplies trailing spaces for alignment if desired
 * @prefix_type: controls whether prefix of an offset, address, or none
 *  is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
 * @buf: data blob to dump
 * @len: number of bytes in the @buf
 *
 * Calls print_hex_dump_debug() with rowsize of 16, groupsize of 1, and ASCII
 * output included. In DEBUG builds without dynamic debug that is
 * print_hex_dump() with log level of KERN_DEBUG.
 */
#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len)        \
        print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true)

#endif


























































































































































    2 



    2 




























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Flexible mmap layout support
 *
 * Based on code by Ingo Molnar and Andi Kleen, copyrighted
 * as follows:
 *
 * Copyright 2003-2009 Red Hat Inc.
 * All Rights Reserved.
 * Copyright 2005 Andi Kleen, SUSE Labs.
 * Copyright 2007 Jiri Kosina, SUSE Labs.
 */

#include <linux/personality.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/limits.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/compat.h>
#include <linux/elf-randomize.h>
#include <asm/elf.h>
#include <asm/io.h>

#include "physaddr.h"

/*
 * Global va_alignment state, marked __read_mostly. Only .flags is set here,
 * to -1.
 * NOTE(review): -1 is presumably a "not configured yet" marker that CPU
 * setup code overwrites - confirm against the users of va_align.
 */
struct va_alignment __read_mostly va_align = {
        .flags = -1,
};

/* Size of the user address space for 32-bit tasks: IA32_PAGE_OFFSET. */
unsigned long task_size_32bit(void)
{
        return IA32_PAGE_OFFSET;
}

/*
 * Size of the user address space for 64-bit tasks.
 *
 * @full_addr_space: non-zero selects TASK_SIZE_MAX, zero selects
 *                   DEFAULT_MAP_WINDOW.
 */
unsigned long task_size_64bit(int full_addr_space)
{
        if (full_addr_space)
                return TASK_SIZE_MAX;

        return DEFAULT_MAP_WINDOW;
}

/*
 * Upper bound, in bytes, of the random stack offset for the current task.
 *
 * Returns 0 when the task does not have PF_RANDOMIZE set. Otherwise the
 * stack randomization mask (32-bit variant when @task_size equals
 * task_size_32bit()) is converted from pages to bytes.
 */
static unsigned long stack_maxrandom_size(unsigned long task_size)
{
        unsigned long rnd_pages;

        if (!(current->flags & PF_RANDOMIZE))
                return 0;

        rnd_pages = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit());

        return rnd_pages << PAGE_SHIFT;
}

/*
 * Number of random bits used for the 32-bit and 64-bit mmap bases. With
 * CONFIG_COMPAT the 32-bit value comes from mmap_rnd_compat_bits; without it
 * both names map to mmap_rnd_bits.
 */
#ifdef CONFIG_COMPAT
# define mmap32_rnd_bits  mmap_rnd_compat_bits
# define mmap64_rnd_bits  mmap_rnd_bits
#else
# define mmap32_rnd_bits  mmap_rnd_bits
# define mmap64_rnd_bits  mmap_rnd_bits
#endif

#define SIZE_128M    (128 * 1024 * 1024UL)

/*
 * Tell whether the legacy mmap layout applies to the current task.
 *
 * Returns 1 when the task's personality has ADDR_COMPAT_LAYOUT set; otherwise
 * the value of sysctl_legacy_va_layout decides.
 */
static int mmap_is_legacy(void)
{
        return (current->personality & ADDR_COMPAT_LAYOUT)
                ? 1 : sysctl_legacy_va_layout;
}

/*
 * Produce a page-aligned random offset with @rndbits bits of randomness.
 *
 * Returns 0 when the current task does not have PF_RANDOMIZE set; otherwise
 * the low @rndbits bits of a random long, shifted left by PAGE_SHIFT.
 */
static unsigned long arch_rnd(unsigned int rndbits)
{
        unsigned long page_mask;

        if (current->flags & PF_RANDOMIZE) {
                page_mask = (1UL << rndbits) - 1;
                return (get_random_long() & page_mask) << PAGE_SHIFT;
        }

        return 0;
}

/*
 * Random mmap offset for the current task, using the 32-bit bit count when
 * mmap_is_ia32() reports true and the 64-bit bit count otherwise.
 */
unsigned long arch_mmap_rnd(void)
{
        if (mmap_is_ia32())
                return arch_rnd(mmap32_rnd_bits);

        return arch_rnd(mmap64_rnd_bits);
}

/*
 * Compute the top of the mmap area, which sits just below the process stack.
 *
 * @rnd:        random offset subtracted from the result
 * @task_size:  size of the address space being laid out
 * @rlim_stack: stack rlimit; its rlim_cur is the starting size of the hole
 *
 * The hole left for the stack is rlim_cur plus the maximum stack
 * randomization and the stack guard gap, clamped to at least 128 MB and at
 * most 5/6 of @task_size. The result is page aligned.
 */
static unsigned long mmap_base(unsigned long rnd, unsigned long task_size,
                               struct rlimit *rlim_stack)
{
        const unsigned long hole_min = SIZE_128M;
        const unsigned long hole_max = (task_size / 6) * 5;
        unsigned long hole = rlim_stack->rlim_cur;
        unsigned long stack_pad = stack_maxrandom_size(task_size) + stack_guard_gap;
        unsigned long padded = hole + stack_pad;

        /*
         * Limits close to RLIM_INFINITY wrap around when padded; only take
         * the padded size when the addition did not overflow.
         */
        if (padded > hole)
                hole = padded;

        /* Clamp the hole: the lower bound is checked first, as before. */
        if (hole < hole_min)
                hole = hole_min;
        else if (hole > hole_max)
                hole = hole_max;

        return PAGE_ALIGN(task_size - hole - rnd);
}

/*
 * Base address for the legacy layout: the unmapped base derived from
 * @task_size, moved up by the random offset @rnd.
 */
static unsigned long mmap_legacy_base(unsigned long rnd,
                                      unsigned long task_size)
{
        return __TASK_UNMAPPED_BASE(task_size) + rnd;
}

/*
 * Helper for arch_pick_mmap_layout(), which is called very early during the
 * creation of a new process VM image to set up the VM layout.
 *
 * Fills in both bases for one address-space size: *@legacy_base always gets
 * the legacy base, and *@base gets either that same value (legacy layout) or
 * the base computed by mmap_base().
 */
static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
                unsigned long random_factor, unsigned long task_size,
                struct rlimit *rlim_stack)
{
        unsigned long bottom_up = mmap_legacy_base(random_factor, task_size);

        *legacy_base = bottom_up;

        if (!mmap_is_legacy()) {
                *base = mmap_base(random_factor, task_size, rlim_stack);
                return;
        }

        *base = bottom_up;
}

/*
 * Select the mmap layout for @mm.
 *
 * Sets MMF_TOPDOWN in mm->flags unless the legacy layout applies (in which
 * case the bit is cleared), then computes the mmap bases for the 64-bit
 * default window and, when the architecture has separate compat bases, for
 * the 32-bit address space as well.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
        if (!mmap_is_legacy())
                set_bit(MMF_TOPDOWN, &mm->flags);
        else
                clear_bit(MMF_TOPDOWN, &mm->flags);

        /* Bases for the 64-bit path, sized by the default map window. */
        arch_pick_mmap_base(&mm->mmap_base,
                            &mm->mmap_legacy_base,
                            arch_rnd(mmap64_rnd_bits),
                            task_size_64bit(0),
                            rlim_stack);

#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
        /*
         * Which base an mmap syscall uses is decided only by the syscall
         * type (64-bit or compat), for 64-bit and 32-bit applications
         * alike: the 64-bit syscall uses mmap_base, the compat syscall uses
         * mmap_compat_base.
         */
        arch_pick_mmap_base(&mm->mmap_compat_base,
                            &mm->mmap_compat_legacy_base,
                            arch_rnd(mmap32_rnd_bits),
                            task_size_32bit(),
                            rlim_stack);
#endif
}

/*
 * Return the mmap base of the current task's mm.
 *
 * @is_legacy: non-zero selects the legacy base, zero the regular one.
 *
 * When the architecture keeps separate compat bases and the caller is in a
 * 32-bit syscall, the compat pair is used instead of the native pair.
 */
unsigned long get_mmap_base(int is_legacy)
{
        struct mm_struct *mm = current->mm;

#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
        if (in_32bit_syscall()) {
                if (is_legacy)
                        return mm->mmap_compat_legacy_base;
                return mm->mmap_compat_base;
        }
#endif
        if (is_legacy)
                return mm->mmap_legacy_base;

        return mm->mmap_base;
}

/*
 * Architecture hook for naming a VMA. This implementation ignores @vma and
 * always returns NULL.
 * NOTE(review): presumably NULL makes the caller fall back to its generic
 * naming - confirm against the callers.
 */
const char *arch_vma_name(struct vm_area_struct *vma)
{
        return NULL;
}

/**
 * mmap_address_hint_valid - Validate the address hint of mmap
 * @addr:        Address hint
 * @len:        Mapping length
 *
 * Check whether @addr and @addr + @len result in a valid mapping.
 *
 * On 32bit this only checks whether @addr + @len is <= TASK_SIZE.
 *
 * On 64bit with 5-level page tables another sanity check is required
 * because mappings requested by mmap(@addr, 0) which cross the 47-bit
 * virtual address boundary can cause the following theoretical issue:
 *
 *  An application calls mmap(addr, 0), i.e. without MAP_FIXED, where @addr
 *  is below the border of the 47-bit address space and @addr + @len is
 *  above the border.
 *
 *  With 4-level paging this request succeeds, but the resulting mapping
 *  address will always be within the 47-bit virtual address space, because
 *  the hint address does not result in a valid mapping and is
 *  ignored. Hence applications which are not prepared to handle virtual
 *  addresses above 47-bit work correctly.
 *
 *  With 5-level paging this request would be granted and result in a
 *  mapping which crosses the border of the 47-bit virtual address
 *  space. If the application cannot handle addresses above 47-bit this
 *  will lead to misbehaviour and hard to diagnose failures.
 *
 * Therefore ignore address hints which would result in a mapping crossing
 * the 47-bit virtual address boundary.
 *
 * Note, that in the same scenario with MAP_FIXED the behaviour is
 * different. The request with @addr < 47-bit and @addr + @len > 47-bit
 * fails on a 4-level paging machine but succeeds on a 5-level paging
 * machine. It is reasonable to expect that an application does not rely on
 * the failure of such a fixed mapping request, so the restriction is not
 * applied.
 */
bool mmap_address_hint_valid(unsigned long addr, unsigned long len)
{
        bool starts_above, ends_above;

        /* The hinted range has to end at or below TASK_SIZE. */
        if (addr > TASK_SIZE - len)
                return false;

        /*
         * Accept the hint only when the start and the end of the range lie
         * on the same side of DEFAULT_MAP_WINDOW, i.e. the mapping does not
         * cross that boundary.
         */
        starts_above = addr > DEFAULT_MAP_WINDOW;
        ends_above = addr + len > DEFAULT_MAP_WINDOW;

        return starts_above == ends_above;
}

/*
 * Check whether [@addr, @addr + @count) may be accessed for direct reading
 * or writing. The range must be RAM: its last byte may not lie beyond the
 * physical address of the last byte below high_memory.
 */
int valid_phys_addr_range(phys_addr_t addr, size_t count)
{
        if (addr + count - 1 <= __pa(high_memory - 1))
                return 1;

        return 0;
}

/*
 * Check whether @count bytes starting at page frame @pfn may be accessed
 * through mmap: the last byte of the range must be a valid physical address.
 */
int valid_mmap_phys_addr_range(unsigned long pfn, size_t count)
{
        phys_addr_t first_byte = (phys_addr_t)pfn << PAGE_SHIFT;
        phys_addr_t last_byte = first_byte + count - 1;

        return phys_addr_valid(last_byte);
}

/*
 * Decide whether a mapping of @pfn may be set to protection @prot.
 *
 * Only root may set high MMIO mappings to PROT_NONE. This stops an
 * unprivileged user from setting them to PROT_NONE, having them inverted,
 * and ending up pointing at valid memory for L1TF speculation.
 *
 * The change is always allowed when the CPU does not have the L1TF bug, when
 * @prot does not need PTE inversion, or when @pfn is real memory. Otherwise
 * it is refused only for pfns at or above l1tf_pfn_limit() requested without
 * CAP_SYS_ADMIN.
 *
 * Note: for locked down kernels may want to disable the root override.
 */
bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
        if (!boot_cpu_has_bug(X86_BUG_L1TF) ||
            !__pte_needs_invert(pgprot_val(prot)) ||
            pfn_valid(pfn))
                return true;

        /* capable() is consulted only for pfns at or above the limit. */
        return pfn < l1tf_pfn_limit() || capable(CAP_SYS_ADMIN);
}























    1 











    1 














































































    2 

































    4 
    2 







    4 









    2 

















    2 









    2 
    1 



    2 






    2 






    1 










































































































    4 


    4 









































    4 




    4 
    3 
    4 























    4 












    4 
    4 


























    3 












    3 
















    4 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>
#include <linux/rmap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

/*
 * Advance tlb->active to the next batch, allocating a fresh one when the
 * chain has no spare batch behind the current one.
 *
 * Returns true when tlb->active was moved to another batch. Returns false
 * when no switch happened: delayed rmaps are pending and the active batch is
 * not the local one, MAX_GATHER_BATCH_COUNT batches are already allocated, or
 * the page allocation failed.
 */
static bool tlb_next_batch(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *cur = tlb->active;
        struct mmu_gather_batch *fresh;

        /* Limit batching if we have delayed rmaps pending */
        if (tlb->delayed_rmap && cur != &tlb->local)
                return false;

        /* A batch is already chained behind the current one: reuse it. */
        if (cur->next) {
                tlb->active = cur->next;
                return true;
        }

        if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
                return false;

        fresh = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
        if (!fresh)
                return false;

        fresh->next = NULL;
        fresh->nr = 0;
        fresh->max = MAX_GATHER_BATCH;

        /* Link the new batch behind the current one and switch to it. */
        cur->next = fresh;
        tlb->active = fresh;
        tlb->batch_count++;

        return true;
}

#ifdef CONFIG_SMP
/*
 * Walk one batch and remove the rmap of every entry that carries
 * ENCODED_PAGE_BIT_DELAY_RMAP. When such an entry also carries
 * ENCODED_PAGE_BIT_NR_PAGES_NEXT, the following slot holds the page count
 * and is consumed together with it.
 */
static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
        struct encoded_page **pages = batch->encoded_pages;
        int i = 0;

        while (i < batch->nr) {
                struct encoded_page *enc = pages[i];
                unsigned int nr_pages = 1;
                struct page *page;

                i++;
                if (!(encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP))
                        continue;

                page = encoded_page_ptr(enc);
                if (unlikely(encoded_page_flags(enc) &
                             ENCODED_PAGE_BIT_NR_PAGES_NEXT)) {
                        /* The slot after the page holds its page count. */
                        nr_pages = encoded_nr_pages(pages[i]);
                        i++;
                }

                folio_remove_rmap_ptes(page_folio(page), page, nr_pages, vma);
        }
}

/**
 * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
 * @tlb: the current mmu_gather
 * @vma: The memory area from which the pages are being removed.
 *
 * Does nothing unless @tlb has delayed rmaps pending. Otherwise the local
 * batch and, if it is a different one, the active batch are processed and
 * the delayed_rmap flag is cleared.
 *
 * Because of how tlb_next_batch() above works, we never start multiple new
 * batches while delayed rmaps are pending, so walking the original local
 * batch and the current active batch is sufficient.
 */
void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        struct mmu_gather_batch *active = tlb->active;

        if (tlb->delayed_rmap) {
                tlb_flush_rmap_batch(&tlb->local, vma);
                if (active != &tlb->local)
                        tlb_flush_rmap_batch(active, vma);
                tlb->delayed_rmap = 0;
        }
}
#endif

/*
 * We might end up freeing a lot of pages. Reschedule on a regular
 * basis to avoid soft lockups in configurations without full
 * preemption enabled. The magic number of 512 folios seems to work.
 */
#define MAX_NR_FOLIOS_PER_FREE                512

/*
 * Free every page recorded in @batch, handing the entries to
 * free_pages_and_swap_cache() in bounded chunks and calling cond_resched()
 * after each chunk. batch->nr is counted down as chunks are consumed and is
 * zero on return.
 */
static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
{
        struct encoded_page **pages = batch->encoded_pages;
        unsigned int nr, nr_pages;

        while (batch->nr) {
                if (!page_poisoning_enabled_static() && !want_init_on_free()) {
                        /* Cap the chunk by the number of batch entries. */
                        nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);

                        /*
                         * Make sure we cover page + nr_pages, and don't leave
                         * nr_pages behind when capping the number of entries.
                         */
                        if (unlikely(encoded_page_flags(pages[nr - 1]) &
                                     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                                nr++;
                } else {
                        /*
                         * With page poisoning and init_on_free, the time it
                         * takes to free memory grows proportionally with the
                         * actual memory size. Therefore, limit based on the
                         * actual memory size and not the number of involved
                         * folios.
                         *
                         * nr counts the batch entries consumed (two for a
                         * page + nr_pages pair); nr_pages counts the pages
                         * those entries stand for.
                         */
                        for (nr = 0, nr_pages = 0;
                             nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
                             nr++) {
                                if (unlikely(encoded_page_flags(pages[nr]) &
                                             ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                                        nr_pages += encoded_nr_pages(pages[++nr]);
                                else
                                        nr_pages++;
                        }
                }

                /* Free this chunk and step past the entries it used. */
                free_pages_and_swap_cache(pages, nr);
                pages += nr;
                batch->nr -= nr;

                cond_resched();
        }
}

/*
 * Free the pages of each batch, starting at the local one and stopping at
 * the first batch that is empty (or at the end of the chain), then make the
 * local batch the active one again.
 */
static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *batch = &tlb->local;

        while (batch && batch->nr) {
                __tlb_batch_free_encoded_pages(batch);
                batch = batch->next;
        }

        tlb->active = &tlb->local;
}

/*
 * Give back every batch page chained behind the local batch and leave the
 * local batch with an empty chain.
 */
static void tlb_batch_list_free(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *batch = tlb->local.next;

        while (batch) {
                /* Fetch the successor before the page holding it is freed. */
                struct mmu_gather_batch *following = batch->next;

                free_pages((unsigned long)batch, 0);
                batch = following;
        }

        tlb->local.next = NULL;
}

/*
 * Queue @page, standing for @nr_pages pages, on the active batch.
 *
 * A single page takes one slot. Anything else takes two: the page encoded
 * with ENCODED_PAGE_BIT_NR_PAGES_NEXT, followed by the encoded page count.
 * @delay_rmap additionally tags the page with ENCODED_PAGE_BIT_DELAY_RMAP.
 *
 * Returns true when the active batch has no room left for another two-slot
 * entry and tlb_next_batch() could not provide a further batch, i.e. a flush
 * has to be forced; returns false otherwise.
 */
static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb,
                struct page *page, unsigned int nr_pages, bool delay_rmap,
                int page_size)
{
        struct mmu_gather_batch *batch;
        struct encoded_page **slots;
        int flags = 0;

        VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        VM_WARN_ON(tlb->page_size != page_size);
        VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE);
        VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
#endif

        if (delay_rmap)
                flags |= ENCODED_PAGE_BIT_DELAY_RMAP;
        if (unlikely(nr_pages != 1))
                flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT;

        batch = tlb->active;
        slots = batch->encoded_pages;

        /* The page itself, then its page count if it is not a single page. */
        slots[batch->nr++] = encode_page(page, flags);
        if (unlikely(nr_pages != 1))
                slots[batch->nr++] = encode_nr_pages(nr_pages);

        /*
         * Keep room for a further "page" + "nr_pages" pair, which needs two
         * slots rather than one. If that room is gone, move on to the next
         * batch, or report that a flush is needed when there is none.
         */
        if (batch->nr >= batch->max - 1) {
                if (!tlb_next_batch(tlb))
                        return true;
                batch = tlb->active;
        }
        VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page);

        return false;
}

/*
 * Queue @nr_pages pages starting at @page, using PAGE_SIZE as the page size.
 * Returns the result of __tlb_remove_folio_pages_size(): true when a flush
 * has to be forced.
 */
bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
                unsigned int nr_pages, bool delay_rmap)
{
        return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap,
                                             PAGE_SIZE);
}

/*
 * Queue the single page @page with the given @page_size. Returns the result
 * of __tlb_remove_folio_pages_size(): true when a flush has to be forced.
 */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
                bool delay_rmap, int page_size)
{
        return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size);
}

#endif /* MMU_GATHER_NO_GATHER */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

/*
 * Release every table recorded in @batch through __tlb_remove_table(), then
 * free the page that holds the batch itself.
 */
static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
        int idx = 0;

        while (idx < batch->nr) {
                __tlb_remove_table(batch->tables[idx]);
                idx++;
        }

        free_page((unsigned long)batch);
}

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE

/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, free the page. Since the disabling of
 * IRQs delays the completion of the TLB flush we can never observe an already
 * freed page.
 *
 * Architectures that do not have this (PPC) need to delay the freeing by some
 * other means, this is that means.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage, this
 * allocation is deep inside the MM code and can thus easily fail on memory
 * pressure. To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 *
 */

/*
 * Callback handed to smp_call_function() by tlb_remove_table_sync_one().
 * The body is intentionally empty and @arg is unused.
 */
static void tlb_remove_table_smp_sync(void *arg)
{
        /* Simply deliver the interrupt */
}

/*
 * Run the empty tlb_remove_table_smp_sync() callback through
 * smp_call_function().
 */
void tlb_remove_table_sync_one(void)
{
        /*
         * This isn't an RCU grace period and hence the page-tables cannot be
         * assumed to be actually RCU-freed.
         *
         * It is however sufficient for software page-table walkers that rely on
         * IRQ disabling.
         */
        /* NOTE(review): the final argument (1) is presumably the "wait" flag -- confirm. */
        smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}

/* RCU callback: recover the table batch that contains @head and free it. */
static void tlb_remove_table_rcu(struct rcu_head *head)
{
        struct mmu_table_batch *batch;

        batch = container_of(head, struct mmu_table_batch, rcu);
        __tlb_remove_table_free(batch);
}

/*
 * Hand @batch to call_rcu(); tlb_remove_table_rcu() is the callback that
 * then frees the tables and the batch page.
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
        call_rcu(&batch->rcu, tlb_remove_table_rcu);
}

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/* Without CONFIG_MMU_GATHER_RCU_TABLE_FREE the batch is freed right away. */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
        __tlb_remove_table_free(batch);
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * Used where tlb_remove_table() is meant to imply TLB invalidates: the
 * flush below happens only when tlb_needs_table_invalidate() says so.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
        if (!tlb_needs_table_invalidate())
                return;

        /*
         * Invalidate page-table caches used by hardware walkers. Then
         * we still need to RCU-sched wait while freeing the pages
         * because software walkers can still be in-flight.
         */
        tlb_flush_mmu_tlbonly(tlb);
}

/*
 * Free a single page-table page without batching: call
 * tlb_remove_table_sync_one() first, then release @table with
 * __tlb_remove_table(). tlb_remove_table() falls back to this when it cannot
 * allocate a batch page.
 */
static void tlb_remove_table_one(void *table)
{
        tlb_remove_table_sync_one();
        __tlb_remove_table(table);
}

/*
 * If a table batch is pending, invalidate as required, pass the batch on
 * for freeing and leave the gather without a batch.
 */
static void tlb_table_flush(struct mmu_gather *tlb)
{
        if (!tlb->batch)
                return;

        tlb_table_invalidate(tlb);
        tlb_remove_table_free(tlb->batch);
        tlb->batch = NULL;
}

/*
 * Queue the page-table page @table for freeing.
 *
 * A batch page is allocated on demand. If that allocation fails, the table
 * is freed on its own via tlb_remove_table_one() after tlb_table_invalidate().
 * Once the batch holds MAX_TABLE_BATCH tables it is flushed.
 */
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        struct mmu_table_batch *cur = tlb->batch;

        if (!cur) {
                cur = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                tlb->batch = cur;
                if (!cur) {
                        /* No memory for batching: free this one table directly. */
                        tlb_table_invalidate(tlb);
                        tlb_remove_table_one(table);
                        return;
                }
                cur->nr = 0;
        }

        cur->tables[cur->nr++] = table;
        if (cur->nr == MAX_TABLE_BATCH)
                tlb_table_flush(tlb);
}

/* Start the gather with no table batch allocated. */
static inline void tlb_table_init(struct mmu_gather *tlb)
{
        tlb->batch = NULL;
}

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

/* Without CONFIG_MMU_GATHER_TABLE_FREE there is no table batch: both are no-ops. */
static inline void tlb_table_flush(struct mmu_gather *tlb) { }
static inline void tlb_table_init(struct mmu_gather *tlb) { }

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

/*
 * Release what the gather has queued: the pending page-table batch first,
 * then, unless CONFIG_MMU_GATHER_NO_GATHER is set, the batched pages.
 */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
        tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
        tlb_batch_pages_flush(tlb);
#endif
}

/*
 * Flush the TLB through tlb_flush_mmu_tlbonly() and only afterwards free
 * the queued tables and pages.
 */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
        tlb_flush_mmu_tlbonly(tlb);
        tlb_flush_mmu_free(tlb);
}

/*
 * Common initialisation behind tlb_gather_mmu() and tlb_gather_mmu_fullmm().
 * Records @mm and @fullmm, resets the page batching state, the table batch
 * and the range, and finally calls inc_tlb_flush_pending() on @mm.
 */
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
                             bool fullmm)
{
        tlb->mm = mm;
        tlb->fullmm = fullmm;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        /* Start with only the embedded local batch, empty and active. */
        tlb->need_flush_all = 0;
        tlb->local.next = NULL;
        tlb->local.nr   = 0;
        tlb->local.max  = ARRAY_SIZE(tlb->__pages);
        tlb->active     = &tlb->local;
        tlb->batch_count = 0;
#endif
        tlb->delayed_rmap = 0;

        tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        tlb->page_size = 0;
#endif

        __tlb_reset_range(tlb);
        /* Undone by dec_tlb_flush_pending() in tlb_finish_mmu(). */
        inc_tlb_flush_pending(tlb->mm);
}

/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm. The structure's fullmm flag is set to false; use
 * tlb_gather_mmu_fullmm() when the whole address space is being destroyed.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
        __tlb_gather_mmu(tlb, mm, false);
}

/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm. The structure's fullmm flag is set to true.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
        __tlb_gather_mmu(tlb, mm, true);
}

/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
        /*
         * If there are parallel threads doing PTE changes on the same range
         * under a non-exclusive lock (e.g., mmap_lock read-side) but deferring
         * the TLB flush by batching, one thread may end up seeing inconsistent
         * PTEs and be left with stale TLB entries.  So flush the TLB forcefully
         * if we detect parallel PTE batching threads.
         *
         * However, some syscalls, e.g. munmap(), may free page tables; this
         * needs a forced flush of everything in the given range. Otherwise
         * this may leave stale TLB entries on some architectures, e.g.
         * aarch64, which can specify which TLB level to flush.
         */
        if (mm_tlb_flush_nested(tlb->mm)) {
                /*
                 * The aarch64 yields better performance with fullmm by
                 * avoiding multiple CPUs spamming TLBI messages at the
                 * same time.
                 *
                 * On x86 non-fullmm doesn't yield significant difference
                 * against fullmm.
                 */
                tlb->fullmm = 1;
                __tlb_reset_range(tlb);
                tlb->freed_tables = 1;
        }

        /* Flush the TLB and free everything that is still queued. */
        tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        /* Release the extra batch pages chained behind tlb->local. */
        tlb_batch_list_free(tlb);
#endif
        /* Undoes the inc_tlb_flush_pending() done in __tlb_gather_mmu(). */
        dec_tlb_flush_pending(tlb->mm);
}




































































































































































































    5 





































   14 


















































    1 




    1 
















































    6 



    6 

































































































































































    1 


    1 























    2 


    2 























    4 


    4 




















   18 


   11 


   12 

















































    6 



    3 


    2 































































































































































































































































































































































































































































































































































    2 








    2 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commits routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record delta in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK                - records directory entry unlink
 * - EXT4_FC_TAG_LINK                - records directory entry link
 * - EXT4_FC_TAG_CREAT                - records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE        - records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE        - records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE                - record the inode that should be replayed
 *                                  during recovery. Note that iblocks field is
 *                                  not replayed and instead derived during
 *                                  replay.
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(). This makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
 * tag contains CRC of the contents and TID of the transaction after which
 * this fast commit should be applied. Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means, we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commits tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which was
 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 * file named A would be absent when we try to read A. So, this sequence of
 * operations is not idempotent. However, as mentioned above, instead of storing
 * the procedure fast commits store the outcome of each procedure. Thus the fast
 * commit log for above procedure would be as follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 * similarly. Thus, by converting a non-idempotent procedure into a series of
 * idempotent outcomes, fast commits ensured idempotence during the replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that if we crash during fast commit replay, after
 *    trying to do recovery again, we will find a file system where fast commit
 *    area is invalid (because new full commit would be found). In order to deal
 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 *    superblock state is persisted before starting the replay, so that after
 *    the crash, fast commit recovery code can look at that flag and perform
 *    fast commit recovery even if that area is invalidated by later full
 *    commits.
 *
 * 1) Fast commit's commit path locks the entire file system during fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
/*
 * Slab cache used for the struct ext4_fc_dentry_update nodes that
 * __track_dentry_update() allocates and ext4_fc_del() frees.
 */
static struct kmem_cache *ext4_fc_dentry_cachep;

/*
 * I/O completion callback for fast commit buffers: record whether the buffer
 * ended up up-to-date and then unlock it.
 */
static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");

        if (!uptodate) {
                ext4_debug("%s: Block %lld not up-to-date",
                           __func__, bh->b_blocknr);
                clear_buffer_uptodate(bh);
        } else {
                ext4_debug("%s: Block %lld up-to-date",
                           __func__, bh->b_blocknr);
                set_buffer_uptodate(bh);
        }

        unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        ei->i_fc_lblk_start = 0;
        ei->i_fc_lblk_len = 0;
}

/*
 * Put the fast commit fields of @inode into their initial state: no tracked
 * range, not committing, not on any fast commit list, no updates in flight.
 */
void ext4_fc_init_inode(struct inode *inode)
{
        struct ext4_inode_info *info = EXT4_I(inode);

        ext4_fc_reset_inode(inode);
        ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);

        INIT_LIST_HEAD(&info->i_fc_list);
        INIT_LIST_HEAD(&info->i_fc_dilist);

        init_waitqueue_head(&info->i_fc_wait);
        atomic_set(&info->i_fc_updates, 0);
}

/*
 * Sleep on the bit waitqueue associated with the inode's
 * EXT4_STATE_FC_COMMITTING state bit.
 *
 * This function must be called with sbi->s_fc_lock held. The lock is dropped
 * before sleeping and is NOT taken again before returning; the callers in
 * this file re-take it and re-check the inode state after this returns.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
        wait_queue_head_t *wq;
        struct ext4_inode_info *ei = EXT4_I(inode);

        /*
         * The word handed to the bit-wait helpers depends on the word size:
         * i_state_flags when BITS_PER_LONG < 64, i_flags otherwise.
         */
#if (BITS_PER_LONG < 64)
        DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_state_flags,
                                EXT4_STATE_FC_COMMITTING);
#else
        DEFINE_WAIT_BIT(wait, &ei->i_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_flags,
                                EXT4_STATE_FC_COMMITTING);
#endif
        lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
        /*
         * Queue ourselves while still holding s_fc_lock and only then drop
         * it; presumably this keeps a wake-up issued right after the unlock
         * from being missed.
         */
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        schedule();
        finish_wait(wq, &wait.wq_entry);
}

static bool ext4_fc_disabled(struct super_block *sb)
{
        return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
                (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
}

/*
 * Inform Ext4's fast about start of an inode update
 *
 * This function is called by the high level call VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (ext4_fc_disabled(inode->i_sb))
                return;

restart:
        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        if (list_empty(&ei->i_fc_list))
                goto out;

        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                ext4_fc_wait_committing_inode(inode);
                goto restart;
        }
out:
        atomic_inc(&ei->i_fc_updates);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Finish an inode update. When the last in-flight update is gone, wake up
 * everybody sleeping on the inode's i_fc_wait queue.
 */
void ext4_fc_stop_update(struct inode *inode)
{
        struct ext4_inode_info *info = EXT4_I(inode);

        if (ext4_fc_disabled(inode->i_sb))
                return;

        if (!atomic_dec_and_test(&info->i_fc_updates))
                return;

        wake_up_all(&info->i_fc_wait);
}

/*
 * Take the inode off the fast commit list and drop the dentry-create update
 * linked to it, if any. If the inode is currently being committed, wait until
 * that commit is done first.
 */
void ext4_fc_del(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_fc_dentry_update *fc_dentry;

        if (ext4_fc_disabled(inode->i_sb))
                return;

        for (;;) {
                spin_lock(&sbi->s_fc_lock);
                /* Nothing is tracked for this inode. */
                if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist))
                        goto out_unlock;

                if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
                        break;

                /* Drops s_fc_lock; take it again and re-check. */
                ext4_fc_wait_committing_inode(inode);
        }

        if (!list_empty(&ei->i_fc_list))
                list_del_init(&ei->i_fc_list);

        /*
         * Since this inode is getting removed, let's also remove all FC
         * dentry create references, since it is not needed to log it anyways.
         */
        if (list_empty(&ei->i_fc_dilist))
                goto out_unlock;

        fc_dentry = list_first_entry(&ei->i_fc_dilist,
                                     struct ext4_fc_dentry_update, fcd_dilist);
        WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
        list_del_init(&fc_dentry->fcd_list);
        list_del_init(&fc_dentry->fcd_dilist);

        WARN_ON(!list_empty(&ei->i_fc_dilist));
        spin_unlock(&sbi->s_fc_lock);

        /* Only names longer than DNAME_INLINE_LEN were allocated separately. */
        if (fc_dentry->fcd_name.name &&
            fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
                kfree(fc_dentry->fcd_name.name);
        kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
        return;

out_unlock:
        spin_unlock(&sbi->s_fc_lock);
}

/*
 * Mark file system as fast commit ineligible, and record latest
 * ineligible transaction tid. This means until the recorded
 * transaction, commit operation would result in a full jbd2 commit.
 *
 * @sb:     super block to mark
 * @reason: EXT4_FC_REASON_* code; used only to index the per-reason
 *          statistics counter
 * @handle: journal handle whose transaction tid is recorded. If it is NULL or
 *          an error pointer, the tid of the currently running transaction is
 *          used instead (0 when no transaction is running).
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        tid_t tid;

        if (ext4_fc_disabled(sb))
                return;

        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        if (handle && !IS_ERR(handle))
                tid = handle->h_transaction->t_tid;
        else {
                read_lock(&sbi->s_journal->j_state_lock);
                tid = sbi->s_journal->j_running_transaction ?
                                sbi->s_journal->j_running_transaction->t_tid : 0;
                read_unlock(&sbi->s_journal->j_state_lock);
        }
        spin_lock(&sbi->s_fc_lock);
        if (sbi->s_fc_ineligible_tid < tid)
                sbi->s_fc_ineligible_tid = tid;
        spin_unlock(&sbi->s_fc_lock);

        /*
         * An out-of-range reason must never be used as an array index: warn
         * and skip the statistics update instead of writing outside
         * fc_ineligible_reason_count[]. The file system has already been
         * marked ineligible above, so only the counter is lost.
         */
        if (WARN_ON(reason < 0 || reason >= EXT4_FC_REASON_MAX))
                return;
        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called for this inode in the handle's transaction, the inode's fast commit
 * range is reset and __fc_track_fn() is called with update = false. If the
 * inode has already been tracked in this transaction, update = true is passed
 * instead. Based on that, the track function can determine whether it tracks
 * a field for the first time or just updates the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in fast commit list:
 * the staging queue while a full or fast commit is ongoing, the main queue
 * otherwise.
 *
 * Returns whatever __fc_track_fn() returned.
 */
static int ext4_fc_track_template(
        handle_t *handle, struct inode *inode,
        int (*__fc_track_fn)(struct inode *, void *, bool),
        void *args, int enqueue)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        tid_t tid = handle->h_transaction->t_tid;
        struct list_head *queue;
        bool update;
        int ret;

        mutex_lock(&ei->i_fc_lock);
        update = (tid == ei->i_sync_tid);
        if (!update) {
                ext4_fc_reset_inode(inode);
                ei->i_sync_tid = tid;
        }
        ret = __fc_track_fn(inode, args, update);
        mutex_unlock(&ei->i_fc_lock);

        if (!enqueue)
                return ret;

        spin_lock(&sbi->s_fc_lock);
        if (list_empty(&ei->i_fc_list)) {
                if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
                    sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
                        queue = &sbi->s_fc_q[FC_Q_STAGING];
                else
                        queue = &sbi->s_fc_q[FC_Q_MAIN];
                list_add_tail(&ei->i_fc_list, queue);
        }
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

/* Arguments handed to __track_dentry_update() via ext4_fc_track_template(). */
struct __track_dentry_update_args {
        /* Dentry whose name and parent directory are recorded. */
        struct dentry *dentry;
        /* Tag stored in the update node: EXT4_FC_TAG_{UNLINK,LINK,CREAT}. */
        int op;
};

/*
 * __track_fn for directory entry updates. Called with ei->i_fc_lock held;
 * the mutex is dropped while the update node is built and queued, and is
 * taken again before returning on every path.
 *
 * Returns 0 on success, -EOPNOTSUPP if the parent directory is encrypted and
 * -ENOMEM on allocation failure. In the failure cases the file system is
 * marked fast commit ineligible.
 */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
        struct __track_dentry_update_args *dentry_update = arg;
        struct dentry *dentry = dentry_update->dentry;
        struct inode *dir = dentry->d_parent->d_inode;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_dentry_update *node;
        struct list_head *queue;

        mutex_unlock(&ei->i_fc_lock);

        if (IS_ENCRYPTED(dir)) {
                ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
                                        NULL);
                mutex_lock(&ei->i_fc_lock);
                return -EOPNOTSUPP;
        }

        node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
        if (!node) {
                ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
                mutex_lock(&ei->i_fc_lock);
                return -ENOMEM;
        }

        node->fcd_op = dentry_update->op;
        node->fcd_parent = dir->i_ino;
        node->fcd_ino = inode->i_ino;

        /* Short names live inside the node, long ones get their own buffer. */
        if (dentry->d_name.len <= DNAME_INLINE_LEN) {
                memcpy(node->fcd_iname, dentry->d_name.name,
                        dentry->d_name.len);
                node->fcd_name.name = node->fcd_iname;
        } else {
                node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
                if (!node->fcd_name.name) {
                        kmem_cache_free(ext4_fc_dentry_cachep, node);
                        ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
                        mutex_lock(&ei->i_fc_lock);
                        return -ENOMEM;
                }
                memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
                        dentry->d_name.len);
        }
        node->fcd_name.len = dentry->d_name.len;
        INIT_LIST_HEAD(&node->fcd_dilist);

        spin_lock(&sbi->s_fc_lock);
        if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
            sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
                queue = &sbi->s_fc_dentry_q[FC_Q_STAGING];
        else
                queue = &sbi->s_fc_dentry_q[FC_Q_MAIN];
        list_add_tail(&node->fcd_list, queue);

        /*
         * This helps us keep a track of all fc_dentry updates which is part of
         * this ext4 inode. So in case the inode is getting unlinked, before
         * even we get a chance to fsync, we could remove all fc_dentry
         * references while evicting the inode in ext4_fc_del().
         * Also with this, we don't need to loop over all the inodes in
         * sbi->s_fc_q to get the corresponding inode in
         * ext4_fc_commit_dentry_updates().
         */
        if (dentry_update->op == EXT4_FC_TAG_CREAT) {
                WARN_ON(!list_empty(&ei->i_fc_dilist));
                list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
        }
        spin_unlock(&sbi->s_fc_lock);

        mutex_lock(&ei->i_fc_lock);
        return 0;
}

/*
 * Track an EXT4_FC_TAG_UNLINK dentry update for @inode / @dentry without
 * enqueueing the inode, then emit the matching trace event.
 */
void __ext4_fc_track_unlink(handle_t *handle,
                struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args = {
                .dentry = dentry,
                .op = EXT4_FC_TAG_UNLINK,
        };
        int ret;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                     &args, 0);
        trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}

/*
 * Track an unlink of @dentry, unless fast commit is disabled or the file
 * system is already marked fast commit ineligible.
 */
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        struct super_block *sb = inode->i_sb;

        if (ext4_fc_disabled(sb) ||
            ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))
                return;

        __ext4_fc_track_unlink(handle, inode, dentry);
}

/*
 * Track an EXT4_FC_TAG_LINK dentry update for @inode / @dentry without
 * enqueueing the inode, then emit the matching trace event.
 */
void __ext4_fc_track_link(handle_t *handle,
        struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args = {
                .dentry = dentry,
                .op = EXT4_FC_TAG_LINK,
        };
        int ret;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                     &args, 0);
        trace_ext4_fc_track_link(handle, inode, dentry, ret);
}

/*
 * Track a link of @dentry, unless fast commit is disabled or the file system
 * is already marked fast commit ineligible.
 */
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        struct super_block *sb = inode->i_sb;

        if (ext4_fc_disabled(sb) ||
            ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))
                return;

        __ext4_fc_track_link(handle, inode, dentry);
}

/*
 * Track an EXT4_FC_TAG_CREAT dentry update for @inode / @dentry without
 * enqueueing the inode, then emit the matching trace event.
 */
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
                          struct dentry *dentry)
{
        struct __track_dentry_update_args args = {
                .dentry = dentry,
                .op = EXT4_FC_TAG_CREAT,
        };
        int ret;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                     &args, 0);
        trace_ext4_fc_track_create(handle, inode, dentry, ret);
}

/*
 * Track a create of @dentry, unless fast commit is disabled or the file
 * system is already marked fast commit ineligible.
 */
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        struct super_block *sb = inode->i_sb;

        if (ext4_fc_disabled(sb) ||
            ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))
                return;

        __ext4_fc_track_create(handle, inode, dentry);
}

/*
 * __track_fn for inode tracking. On the first call (update == false) the
 * tracked range length is cleared and 0 is returned; on later calls nothing
 * is touched and -EEXIST is returned.
 */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
        if (!update) {
                EXT4_I(inode)->i_fc_lblk_len = 0;
                return 0;
        }

        return -EEXIST;
}

/*
 * Track @inode for fast commit and enqueue it. Directories are never tracked
 * here. An inode that journals its data makes the file system fast commit
 * ineligible instead of being tracked.
 */
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        int ret;

        if (S_ISDIR(inode->i_mode) || ext4_fc_disabled(sb))
                return;

        if (ext4_should_journal_data(inode)) {
                ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_INODE_JOURNAL_DATA,
                                        handle);
                return;
        }

        if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))
                return;

        ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
        trace_ext4_fc_track_inode(handle, inode, ret);
}

/* Arguments handed to __track_range() via ext4_fc_track_template(). */
struct __track_range_args {
        /* First and last logical block of the range; end is inclusive. */
        ext4_lblk_t start, end;
};

/*
 * __track_fn for tracking data updates.
 *
 * Records the inclusive block range [start, end] from @arg in the inode. When
 * a non-empty range is already tracked (update is true and the tracked length
 * is positive), the stored range is widened to the smallest range covering
 * both the old and the new one; otherwise it is simply replaced.
 *
 * Returns 0, or -ECANCELED for inodes below EXT4_FIRST_INO().
 */
static int __track_range(struct inode *inode, void *arg, bool update)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct __track_range_args *range = arg;
        ext4_lblk_t prev_start, prev_last;

        if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
                ext4_debug("Special inode %ld being modified\n", inode->i_ino);
                return -ECANCELED;
        }

        if (update && ei->i_fc_lblk_len > 0) {
                prev_start = ei->i_fc_lblk_start;
                prev_last = prev_start + ei->i_fc_lblk_len - 1;

                ei->i_fc_lblk_start = min(prev_start, range->start);
                ei->i_fc_lblk_len = max(prev_last, range->end) -
                                        ei->i_fc_lblk_start + 1;
                return 0;
        }

        ei->i_fc_lblk_start = range->start;
        ei->i_fc_lblk_len = range->end - range->start + 1;
        return 0;
}

/*
 * Track the inclusive logical block range [start, end] of @inode for fast
 * commit and enqueue the inode. Nothing is done for directories, when fast
 * commit is disabled, or when the file system is already marked ineligible.
 */
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
                         ext4_lblk_t end)
{
        struct __track_range_args args = {
                .start = start,
                .end = end,
        };
        int ret;

        if (S_ISDIR(inode->i_mode) ||
            ext4_fc_disabled(inode->i_sb) ||
            ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
                return;

        ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
        trace_ext4_fc_track_range(handle, inode, start, end, ret);
}

/*
 * Submit the current fast commit buffer (sbi->s_fc_bh) as a synchronous
 * write and detach it from the super block info. @is_tail tells whether this
 * is the block carrying the tail tag.
 */
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh = sbi->s_fc_bh;
        blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC;

        /* Add REQ_FUA | REQ_PREFLUSH only for the tail block. */
        if (is_tail && test_opt(sb, BARRIER))
                opf |= REQ_FUA | REQ_PREFLUSH;

        lock_buffer(bh);
        set_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = ext4_end_buffer_io_sync;
        submit_bh(opf, bh);

        sbi->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, the padded
 * block is submitted, a new block is obtained from jbd2 and *crc is updated
 * to reflect the padded block.
 *
 * @sb:  super block whose fast commit buffer (sbi->s_fc_bh) is used
 * @len: number of bytes to reserve (the callers in this file pass the TLV
 *       header length plus the payload length)
 * @crc: running checksum; only modified when the current block gets padded
 *
 * Returns a pointer to the reserved bytes inside the fast commit buffer, or
 * NULL if len can never fit into a block alongside a PAD tlv or if
 * jbd2_fc_get_buf() fails.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
        struct ext4_fc_tl tl;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh;
        int bsize = sbi->s_journal->j_blocksize;
        /* off: write position inside the current fast commit block. */
        int ret, off = sbi->s_fc_bytes % bsize;
        int remaining;
        u8 *dst;

        /*
         * If 'len' is too long to fit in any block alongside a PAD tlv, then we
         * cannot fulfill the request.
         */
        if (len > bsize - EXT4_FC_TAG_BASE_LEN)
                return NULL;

        /* No buffer attached at the moment: get one from jbd2. */
        if (!sbi->s_fc_bh) {
                ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
                if (ret)
                        return NULL;
                sbi->s_fc_bh = bh;
        }
        dst = sbi->s_fc_bh->b_data + off;

        /*
         * Allocate the bytes in the current block if we can do so while still
         * leaving enough space for a PAD tlv.
         */
        remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
        if (len <= remaining) {
                sbi->s_fc_bytes += len;
                return dst;
        }

        /*
         * Else, terminate the current block with a PAD tlv, then allocate a new
         * block and allocate the bytes at the start of that new block.
         */

        /* The PAD header plus 'remaining' zero bytes end exactly at bsize. */
        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
        tl.fc_len = cpu_to_le16(remaining);
        memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
        memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
        /* Fold the whole, now complete, block into the running checksum. */
        *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);

        /* Submitting also resets sbi->s_fc_bh to NULL. */
        ext4_fc_submit_bh(sb, false);

        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
        if (ret)
                return NULL;
        sbi->s_fc_bh = bh;
        /* Account for the rest of the old block plus len bytes of the new. */
        sbi->s_fc_bytes += bsize - off + len;
        return sbi->s_fc_bh->b_data;
}

/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 *
 * @sb:  super block whose fast commit buffer receives the tail
 * @crc: checksum accumulated so far for this fast commit
 *
 * Returns 0 on success, or -ENOSPC if no space could be reserved for the
 * tail.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_tl tl;
        struct ext4_fc_tail tail;
        int off, bsize = sbi->s_journal->j_blocksize;
        u8 *dst;

        /*
         * ext4_fc_reserve_space takes care of allocating an extra block if
         * there's not enough space on this block for accommodating this tail.
         */
        dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
        if (!dst)
                return -ENOSPC;

        /*
         * s_fc_bytes was already advanced by the reservation above, so off is
         * the position within the block just past the reserved tail bytes.
         */
        off = sbi->s_fc_bytes % bsize;

        /* The tail's length covers its payload plus the rest of the block. */
        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
        tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
        /* Make the next reservation start on a fresh block. */
        sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

        memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
        dst += EXT4_FC_TAG_BASE_LEN;
        /*
         * NOTE(review): j_running_transaction is dereferenced here without a
         * NULL check and without j_state_lock - confirm callers guarantee a
         * running transaction.
         */
        tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
        memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
        dst += sizeof(tail.fc_tid);
        /*
         * The checksum covers this block from its start up to and including
         * fc_tid; the fc_crc field itself is not part of it.
         */
        crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
                          dst - (u8 *)sbi->s_fc_bh->b_data);
        tail.fc_crc = cpu_to_le32(crc);
        memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
        dst += sizeof(tail.fc_crc);
        memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */

        ext4_fc_submit_bh(sb, true);

        return 0;
}

/*
 * Append a tag-length-value record to the fast commit buffer. *crc is handed
 * to ext4_fc_reserve_space(), which updates it if a block has to be padded.
 *
 * Returns true if the tlv was added, false if no space could be reserved.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
                           u32 *crc)
{
        struct ext4_fc_tl hdr = {
                .fc_tag = cpu_to_le16(tag),
                .fc_len = cpu_to_le16(len),
        };
        u8 *out;

        out = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
        if (!out)
                return false;

        memcpy(out, &hdr, EXT4_FC_TAG_BASE_LEN);
        memcpy(out + EXT4_FC_TAG_BASE_LEN, val, len);
        return true;
}

/*
 * Append a dentry tlv for @fc_dentry to the fast commit buffer: the tlv
 * header, then the parent and target inode numbers, then the name bytes.
 *
 * Returns true if the tlv was added, false if no space could be reserved.
 */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
                                   struct ext4_fc_dentry_update *fc_dentry)
{
        int name_len = fc_dentry->fcd_name.len;
        struct ext4_fc_dentry_info fcd;
        struct ext4_fc_tl tl;
        u8 *out;

        out = ext4_fc_reserve_space(sb,
                        EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + name_len, crc);
        if (!out)
                return false;

        tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
        tl.fc_len = cpu_to_le16(sizeof(fcd) + name_len);
        fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
        fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);

        memcpy(out, &tl, EXT4_FC_TAG_BASE_LEN);
        out += EXT4_FC_TAG_BASE_LEN;

        memcpy(out, &fcd, sizeof(fcd));
        out += sizeof(fcd);

        memcpy(out, fc_dentry->fcd_name.name, name_len);
        return true;
}

/*
 * Writes the raw on-disk inode into the fast commit space as an
 * EXT4_FC_TAG_INODE tlv. The tlv value is the inode number followed by the
 * leading inode_len bytes of the raw inode.
 *
 * Returns 0 on success, the error from ext4_get_inode_loc(), or -ECANCELED
 * if no fast commit space could be reserved.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_fc_inode fc_inode;
        struct ext4_iloc iloc;
        struct ext4_fc_tl tl;
        int inode_len;
        u8 *out;
        int ret;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        /* Inline-data inodes are logged whole, others up to i_extra_isize. */
        if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                inode_len = EXT4_INODE_SIZE(inode->i_sb);
        else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
                inode_len = EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
        else
                inode_len = EXT4_GOOD_OLD_INODE_SIZE;

        fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
        tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

        out = ext4_fc_reserve_space(inode->i_sb,
                EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
        if (!out) {
                ret = -ECANCELED;
                goto out_brelse;
        }

        memcpy(out, &tl, EXT4_FC_TAG_BASE_LEN);
        out += EXT4_FC_TAG_BASE_LEN;

        memcpy(out, &fc_inode, sizeof(fc_inode));
        out += sizeof(fc_inode);

        memcpy(out, (u8 *)ext4_raw_inode(&iloc), inode_len);
        ret = 0;

out_brelse:
        brelse(iloc.bh);
        return ret;
}

/*
 * Writes the block range tracked for the inode in question as a sequence of
 * ADD_RANGE (mapped) and DEL_RANGE (unmapped) tlvs. The tracked range is
 * consumed: its length is reset to 0 before anything is written. *crc is
 * passed on to the tlv writer.
 *
 * Returns 0 on success, -ECANCELED if mapping blocks fails and -ENOSPC if a
 * tlv could not be added.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_lblk_t first_lblk, last_lblk, cur_lblk;
        struct ext4_fc_add_range add_range;
        struct ext4_fc_del_range del_range;
        struct ext4_map_blocks map;
        struct ext4_extent *ex;
        int ret;

        /* Snapshot and clear the tracked range under i_fc_lock. */
        mutex_lock(&ei->i_fc_lock);
        if (ei->i_fc_lblk_len == 0) {
                mutex_unlock(&ei->i_fc_lock);
                return 0;
        }
        first_lblk = ei->i_fc_lblk_start;
        last_lblk = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
        ei->i_fc_lblk_len = 0;
        mutex_unlock(&ei->i_fc_lock);

        cur_lblk = first_lblk;
        ext4_debug("will try writing %d to %d for inode %ld\n",
                   cur_lblk, last_lblk, inode->i_ino);

        while (cur_lblk <= last_lblk) {
                map.m_lblk = cur_lblk;
                map.m_len = last_lblk - cur_lblk + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        return -ECANCELED;

                if (map.m_len == 0) {
                        cur_lblk++;
                        continue;
                }

                if (ret > 0) {
                        bool unwritten = map.m_flags & EXT4_MAP_UNWRITTEN;
                        unsigned int max_len = unwritten ?
                                EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

                        /* Limit the number of blocks in one extent */
                        if (map.m_len > max_len)
                                map.m_len = max_len;

                        add_range.fc_ino = cpu_to_le32(inode->i_ino);
                        ex = (struct ext4_extent *)&add_range.fc_ex;
                        ex->ee_block = cpu_to_le32(map.m_lblk);
                        ex->ee_len = cpu_to_le16(map.m_len);
                        ext4_ext_store_pblock(ex, map.m_pblk);
                        if (unwritten)
                                ext4_ext_mark_unwritten(ex);
                        else
                                ext4_ext_mark_initialized(ex);

                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
                                             sizeof(add_range),
                                             (u8 *)&add_range, crc))
                                return -ENOSPC;
                } else {
                        /* Nothing mapped here: log the range as deleted. */
                        del_range.fc_ino = cpu_to_le32(inode->i_ino);
                        del_range.fc_lblk = cpu_to_le32(map.m_lblk);
                        del_range.fc_len = cpu_to_le32(map.m_len);

                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
                                             sizeof(del_range),
                                             (u8 *)&del_range, crc))
                                return -ENOSPC;
                }

                cur_lblk += map.m_len;
        }

        return 0;
}


/*
 * Submit data for all the fast commit inodes.
 *
 * Every inode on the main fast commit queue is flagged
 * EXT4_STATE_FC_COMMITTING; we then wait until it has no updates in flight
 * (i_fc_updates == 0) and submit its data through jbd2.
 *
 * Returns 0 on success or the first error from jbd2_submit_inode_data().
 * s_fc_lock is not held on return, on either path.
 */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei;
        int ret = 0;

        spin_lock(&sbi->s_fc_lock);
        /*
         * NOTE(review): s_fc_lock is dropped inside this list walk.
         * ext4_fc_del() waits while COMMITTING is set before unlinking an
         * inode, which presumably keeps the current entry on the list -
         * confirm.
         */
        list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
                /* Wait for ext4_fc_stop_update() to drain in-flight updates. */
                while (atomic_read(&ei->i_fc_updates)) {
                        DEFINE_WAIT(wait);

                        prepare_to_wait(&ei->i_fc_wait, &wait,
                                                TASK_UNINTERRUPTIBLE);
                        /* Re-check after queueing; sleep with the lock dropped. */
                        if (atomic_read(&ei->i_fc_updates)) {
                                spin_unlock(&sbi->s_fc_lock);
                                schedule();
                                spin_lock(&sbi->s_fc_lock);
                        }
                        finish_wait(&ei->i_fc_wait, &wait);
                }
                /* The submission itself runs without s_fc_lock. */
                spin_unlock(&sbi->s_fc_lock);
                ret = jbd2_submit_inode_data(journal, ei->jinode);
                if (ret)
                        return ret;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

/*
 * Wait for completion of data for all the fast commit inodes that are flagged
 * EXT4_STATE_FC_COMMITTING on the main queue.
 *
 * Returns 0 on success or the first error from jbd2_wait_inode_data().
 * s_fc_lock is not held on return, on either path.
 */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei, *next;
        int err = 0;

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(ei, next, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                if (ext4_test_inode_state(&ei->vfs_inode,
                                          EXT4_STATE_FC_COMMITTING)) {
                        /* The wait itself runs without s_fc_lock. */
                        spin_unlock(&sbi->s_fc_lock);

                        err = jbd2_wait_inode_data(journal, ei->jinode);
                        if (err)
                                return err;

                        spin_lock(&sbi->s_fc_lock);
                }
        }
        spin_unlock(&sbi->s_fc_lock);

        return 0;
}

/*
 * Commit all the directory entry updates queued on the main dentry queue.
 *
 * Must be called with sbi->s_fc_lock held. The lock is dropped around every
 * write into the fast commit area and is held again on return, on both the
 * success and the error paths.
 *
 * Returns 0 on success, -ENOSPC if a dentry tlv could not be added, or the
 * error returned by ext4_fc_write_inode() / ext4_fc_write_inode_data().
 */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
        struct inode *inode;
        struct ext4_inode_info *ei;
        int ret;

        if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
                return 0;
        list_for_each_entry_safe(fc_dentry, fc_dentry_n,
                                 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
                /* Link/unlink updates only need the dentry tlv itself. */
                if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
                        spin_unlock(&sbi->s_fc_lock);
                        if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
                                ret = -ENOSPC;
                                goto lock_and_exit;
                        }
                        spin_lock(&sbi->s_fc_lock);
                        continue;
                }
                /*
                 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
                 * corresponding inode pointer
                 */
                WARN_ON(list_empty(&fc_dentry->fcd_dilist));
                ei = list_first_entry(&fc_dentry->fcd_dilist,
                                struct ext4_inode_info, i_fc_dilist);
                inode = &ei->vfs_inode;
                WARN_ON(inode->i_ino != fc_dentry->fcd_ino);

                spin_unlock(&sbi->s_fc_lock);

                /*
                 * We first write the inode and then the create dirent. This
                 * allows the recovery code to create an unnamed inode first
                 * and then link it to a directory entry. This allows us
                 * to use namei.c routines almost as is and simplifies
                 * the recovery code.
                 */
                ret = ext4_fc_write_inode(inode, crc);
                if (ret)
                        goto lock_and_exit;

                ret = ext4_fc_write_inode_data(inode, crc);
                if (ret)
                        goto lock_and_exit;

                if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
                        ret = -ENOSPC;
                        goto lock_and_exit;
                }

                spin_lock(&sbi->s_fc_lock);
        }
        return 0;
lock_and_exit:
        /* Error paths jump here unlocked; re-take the lock for the caller. */
        spin_lock(&sbi->s_fc_lock);
        return ret;
}

/*
 * Write the contents of one fast commit.
 *
 * Submits and waits for inode data, then writes (in this order) an optional
 * head tag, the queued dentry updates, the data ranges and inode of every
 * inode on the main queue that is marked EXT4_STATE_FC_COMMITTING, and
 * finally the tail.
 *
 * Returns 0 on success or a negative errno. -ENOSPC is returned when the
 * head tag could not be added.
 */
static int ext4_fc_perform_commit(journal_t *journal)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter;
        struct ext4_fc_head head;
        struct inode *inode;
        struct blk_plug plug;
        int ret = 0;
        u32 crc = 0;

        ret = ext4_fc_submit_inode_data_all(journal);
        if (ret)
                return ret;

        ret = ext4_fc_wait_inode_data_all(journal);
        if (ret)
                return ret;

        /*
         * If file system device is different from journal device, issue a cache
         * flush before we start writing fast commit blocks.
         */
        if (journal->j_fs_dev != journal->j_dev)
                blkdev_issue_flush(journal->j_fs_dev);

        /* Everything from here to "out" runs inside the block plug. */
        blk_start_plug(&plug);
        if (sbi->s_fc_bytes == 0) {
                /*
                 * Add a head tag only if this is the first fast commit
                 * in this TID.
                 */
                head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
                head.fc_tid = cpu_to_le32(
                        sbi->s_journal->j_running_transaction->t_tid);
                if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
                        (u8 *)&head, &crc)) {
                        ret = -ENOSPC;
                        goto out;
                }
        }

        /* The dentry update helper is called with s_fc_lock held. */
        spin_lock(&sbi->s_fc_lock);
        ret = ext4_fc_commit_dentry_updates(journal, &crc);
        if (ret) {
                spin_unlock(&sbi->s_fc_lock);
                goto out;
        }

        list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                inode = &iter->vfs_inode;
                if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
                        continue;

                /*
                 * Drop s_fc_lock around the write helpers and retake it
                 * before advancing to the next list entry. On error we jump
                 * to "out" with the lock already released.
                 */
                spin_unlock(&sbi->s_fc_lock);
                ret = ext4_fc_write_inode_data(inode, &crc);
                if (ret)
                        goto out;
                ret = ext4_fc_write_inode(inode, &crc);
                if (ret)
                        goto out;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        ret = ext4_fc_write_tail(sb, crc);

out:
        blk_finish_plug(&plug);
        return ret;
}

/*
 * Account the outcome of one fast commit attempt in the per-superblock
 * fast commit statistics and emit the commit-stop tracepoint.
 *
 * @status is one of the EXT4_FC_STATUS_* values; @commit_time and @nblks are
 * only folded into the statistics for EXT4_FC_STATUS_OK.
 */
static void ext4_fc_update_stats(struct super_block *sb, int status,
                                 u64 commit_time, int nblks, tid_t commit_tid)
{
        struct ext4_fc_stats *fc_stats = &EXT4_SB(sb)->s_fc_stats;

        ext4_debug("Fast commit ended with status = %d for tid %u",
                        status, commit_tid);

        if (status == EXT4_FC_STATUS_OK) {
                fc_stats->fc_num_commits++;
                fc_stats->fc_numblks += nblks;
                /*
                 * Running average weighted 3:1 towards the old value; the
                 * very first sample seeds the average directly.
                 */
                fc_stats->s_fc_avg_commit_time =
                        likely(fc_stats->s_fc_avg_commit_time) ?
                        (commit_time +
                         fc_stats->s_fc_avg_commit_time * 3) / 4 :
                        commit_time;
        } else if (status == EXT4_FC_STATUS_FAILED) {
                /* A failed commit is counted as ineligible as well. */
                fc_stats->fc_failed_commits++;
                fc_stats->fc_ineligible_commits++;
        } else if (status == EXT4_FC_STATUS_INELIGIBLE) {
                fc_stats->fc_ineligible_commits++;
        } else {
                fc_stats->fc_skipped_commits++;
        }

        trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 *
 * When fast commit is not enabled on the mount, this simply completes the
 * transaction through jbd2_complete_transaction().
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int nblks = 0, ret, bsize = journal->j_blocksize;
        int subtid = atomic_read(&sbi->s_fc_subtid);
        int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
        ktime_t start_time, commit_time;

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
                return jbd2_complete_transaction(journal, commit_tid);

        trace_ext4_fc_commit_start(sb, commit_tid);

        start_time = ktime_get();

restart_fc:
        ret = jbd2_fc_begin_commit(journal, commit_tid);
        if (ret == -EALREADY) {
                /*
                 * There was an ongoing commit, check if we need to restart.
                 * Transaction IDs wrap around, so they must be compared with
                 * tid_gt() rather than a plain '>'.
                 */
                if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
                        tid_gt(commit_tid, journal->j_commit_sequence))
                        goto restart_fc;
                ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
                                commit_tid);
                return 0;
        } else if (ret) {
                /*
                 * Commit couldn't start. Just update stats and perform a
                 * full commit.
                 */
                ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
                                commit_tid);
                return jbd2_complete_transaction(journal, commit_tid);
        }

        /*
         * After establishing journal barrier via jbd2_fc_begin_commit(), check
         * if we are fast commit ineligible.
         */
        if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
                status = EXT4_FC_STATUS_INELIGIBLE;
                goto fallback;
        }

        /* Block counts are s_fc_bytes rounded up to whole journal blocks. */
        fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
        ret = ext4_fc_perform_commit(journal);
        if (ret < 0) {
                status = EXT4_FC_STATUS_FAILED;
                goto fallback;
        }
        /* Number of blocks this commit added on top of earlier ones. */
        nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
        ret = jbd2_fc_wait_bufs(journal, nblks);
        if (ret < 0) {
                status = EXT4_FC_STATUS_FAILED;
                goto fallback;
        }
        atomic_inc(&sbi->s_fc_subtid);
        ret = jbd2_fc_end_commit(journal);
        /*
         * weight the commit time higher than the average time so we
         * don't react too strongly to vast changes in the commit time
         */
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
        ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
        return ret;

fallback:
        /* Fast commit was not possible: end it via the fallback path. */
        ret = jbd2_fc_end_commit_fallback(journal);
        ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
        return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 *
 * Empties the main inode and dentry-update queues, moves the staging queues
 * onto the main queues, and clears the ineligible state once @tid has
 * reached the transaction recorded in s_fc_ineligible_tid.
 *
 * Transaction IDs wrap around, so all tid comparisons use tid_geq().
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter, *iter_n;
        struct ext4_fc_dentry_update *fc_dentry;

        if (full && sbi->s_fc_bh)
                sbi->s_fc_bh = NULL;

        trace_ext4_fc_cleanup(journal, full, tid);
        jbd2_fc_release_bufs(journal);

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
                                 i_fc_list) {
                list_del_init(&iter->i_fc_list);
                ext4_clear_inode_state(&iter->vfs_inode,
                                       EXT4_STATE_FC_COMMITTING);
                /* Only reset inodes whose last sync tid is covered by @tid. */
                if (tid_geq(tid, iter->i_sync_tid))
                        ext4_fc_reset_inode(&iter->vfs_inode);
                /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
                smp_mb();
#if (BITS_PER_LONG < 64)
                wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
                wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
        }

        while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
                fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
                                             struct ext4_fc_dentry_update,
                                             fcd_list);
                list_del_init(&fc_dentry->fcd_list);
                list_del_init(&fc_dentry->fcd_dilist);
                /* The entry is unlinked; free it without holding the lock. */
                spin_unlock(&sbi->s_fc_lock);

                /* Only names longer than DNAME_INLINE_LEN are kfree()d. */
                if (fc_dentry->fcd_name.name &&
                        fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
                        kfree(fc_dentry->fcd_name.name);
                kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
                spin_lock(&sbi->s_fc_lock);
        }

        /* Updates staged in the meantime become the next main queues. */
        list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
                                &sbi->s_fc_dentry_q[FC_Q_MAIN]);
        list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
                                &sbi->s_fc_q[FC_Q_MAIN]);

        if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
                sbi->s_fc_ineligible_tid = 0;
                ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        }

        if (full)
                sbi->s_fc_bytes = 0;
        spin_unlock(&sbi->s_fc_lock);
        trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/*
 * Helper struct for dentry replay routines: a dentry TLV decoded into
 * native-endian fields (see tl_to_darg()).
 */
struct dentry_info_args {
        int parent_ino;         /* inode number of the parent directory */
        int dname_len;          /* length of dname in bytes */
        int ino;                /* inode number the entry refers to */
        int inode_len;          /* not filled in by tl_to_darg() */
        char *dname;            /* entry name; points into the TLV value */
};

/*
 * Same as struct ext4_fc_tl, but uses native endianness fields.
 * Filled in from the on-disk little-endian header by ext4_fc_get_tl().
 */
struct ext4_fc_tl_mem {
        u16 fc_tag;     /* tag value, CPU byte order */
        u16 fc_len;     /* length field, CPU byte order */
};

/*
 * Decode the value of a dentry TLV into @darg.
 *
 * The fixed-size struct ext4_fc_dentry_info header is copied out of @val
 * before its fields are read; the name is not copied, darg->dname points
 * into @val right after the header. darg->inode_len is left untouched.
 */
static inline void tl_to_darg(struct dentry_info_args *darg,
                              struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct ext4_fc_dentry_info disk_info;

        memcpy(&disk_info, val, sizeof(disk_info));

        darg->ino = le32_to_cpu(disk_info.fc_ino);
        darg->parent_ino = le32_to_cpu(disk_info.fc_parent_ino);

        /* Whatever follows the fixed header inside the value is the name. */
        darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
        darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
}

/*
 * Read the tag/length header found at @val into @tl, converting both fields
 * from little endian to CPU byte order. EXT4_FC_TAG_BASE_LEN bytes are
 * copied out of @val first.
 */
static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct ext4_fc_tl raw_hdr;

        memcpy(&raw_hdr, val, EXT4_FC_TAG_BASE_LEN);

        tl->fc_tag = le16_to_cpu(raw_hdr.fc_tag);
        tl->fc_len = le16_to_cpu(raw_hdr.fc_len);
}

/*
 * Unlink replay function.
 *
 * Removes the directory entry described by the TLV. A missing inode or
 * parent directory is not treated as an error, and neither is -ENOENT from
 * __ext4_unlink(); any other error from __ext4_unlink() is returned.
 */
static int ext4_fc_replay_unlink(struct super_block *sb,
                                 struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct dentry_info_args darg;
        struct inode *victim, *parent;
        struct qstr name;
        int err = 0;

        tl_to_darg(&darg, tl, val);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        name.name = darg.dname;
        name.len = darg.dname_len;

        victim = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (IS_ERR(victim)) {
                ext4_debug("Inode %d not found", darg.ino);
                return 0;
        }

        parent = ext4_iget(sb, darg.parent_ino,
                                EXT4_IGET_NORMAL);
        if (IS_ERR(parent)) {
                ext4_debug("Dir with inode %d not found", darg.parent_ino);
                goto put_inode;
        }

        err = __ext4_unlink(parent, &name, victim, NULL);
        /* -ENOENT ok coz it might not exist anymore. */
        if (err == -ENOENT)
                err = 0;
        iput(parent);
put_inode:
        iput(victim);
        return err;
}

/*
 * Link @inode into directory darg->parent_ino under the name
 * darg->dname / darg->dname_len.
 *
 * Returns 0 when the link was created or already existed (-EEXIST), and also
 * when the parent directory could not be loaded or no dentry could be
 * obtained for it. Returns -ENOMEM when the child dentry cannot be allocated,
 * otherwise the error from __ext4_link().
 */
static int ext4_fc_replay_link_internal(struct super_block *sb,
                                struct dentry_info_args *darg,
                                struct inode *inode)
{
        struct inode *dir = NULL;
        struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
        struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
        int ret = 0;

        dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
        if (IS_ERR(dir)) {
                ext4_debug("Dir with inode %d not found.", darg->parent_ino);
                dir = NULL;
                goto out;
        }

        dentry_dir = d_obtain_alias(dir);
        if (IS_ERR(dentry_dir)) {
                ext4_debug("Failed to obtain dentry");
                dentry_dir = NULL;
                /*
                 * d_obtain_alias() releases the inode reference on failure,
                 * so the cleanup below must not iput() dir a second time.
                 */
                dir = NULL;
                goto out;
        }

        dentry_inode = d_alloc(dentry_dir, &qstr_dname);
        if (!dentry_inode) {
                ext4_debug("Inode dentry not created.");
                ret = -ENOMEM;
                goto out;
        }

        ret = __ext4_link(dir, inode, dentry_inode);
        /*
         * It's possible that link already existed since data blocks
         * for the dir in question got persisted before we crashed OR
         * we replayed this tag and crashed before the entire replay
         * could complete.
         */
        if (ret && ret != -EEXIST) {
                ext4_debug("Failed to link\n");
                goto out;
        }

        ret = 0;
out:
        if (dentry_dir) {
                /* The dentry owns the reference on dir; dput() drops it. */
                d_drop(dentry_dir);
                dput(dentry_dir);
        } else if (dir) {
                iput(dir);
        }
        if (dentry_inode) {
                d_drop(dentry_inode);
                dput(dentry_inode);
        }

        return ret;
}

/*
 * Link replay function.
 *
 * Decodes the dentry TLV and links the inode it names into its parent via
 * ext4_fc_replay_link_internal(). A missing inode is skipped and reported
 * as success.
 */
static int ext4_fc_replay_link(struct super_block *sb,
                               struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct dentry_info_args darg;
        struct inode *target;
        int err;

        tl_to_darg(&darg, tl, val);
        trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        target = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (!IS_ERR(target)) {
                err = ext4_fc_replay_link_internal(sb, &darg, target);
                iput(target);
                return err;
        }

        ext4_debug("Inode not found.");
        return 0;
}

/*
 * Remember @ino as modified during replay; the recorded set is used later
 * to set up block bitmaps correctly. Each inode number is stored once.
 *
 * Returns 0 on success (including when @ino was already recorded) or
 * -ENOMEM when the array cannot be grown.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
        struct ext4_fc_replay_state *state = &EXT4_SB(sb)->s_fc_replay_state;
        int idx;

        /* Nothing to do if the inode is already in the array. */
        for (idx = 0; idx < state->fc_modified_inodes_used; idx++) {
                if (state->fc_modified_inodes[idx] == ino)
                        return 0;
        }

        if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
                /*
                 * Array is full: grow it by one increment. The result goes
                 * through a temporary so the old array stays reachable if
                 * krealloc() fails.
                 */
                int *grown = krealloc(state->fc_modified_inodes,
                                sizeof(*grown) *
                                (state->fc_modified_inodes_size +
                                 EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                GFP_KERNEL);

                if (!grown)
                        return -ENOMEM;
                state->fc_modified_inodes_size +=
                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
                state->fc_modified_inodes = grown;
        }

        state->fc_modified_inodes[state->fc_modified_inodes_used] = ino;
        state->fc_modified_inodes_used++;
        return 0;
}

/*
 * Inode replay function
 *
 * Copies the raw inode carried in the TLV over the on-disk inode (leaving
 * the i_block area alone except for inline-data inodes), writes it out
 * synchronously, marks the inode as used and then recomputes its block count
 * and checksum.
 *
 * Returns -EFSCORRUPTED if the inode cannot be loaded after it was written.
 * Every other failure is swallowed and 0 is returned; in that case only the
 * final device flush is skipped.
 */
static int ext4_fc_replay_inode(struct super_block *sb,
                                struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct ext4_fc_inode fc_inode;
        struct ext4_inode *raw_inode;
        struct ext4_inode *raw_fc_inode;
        struct inode *inode = NULL;
        struct ext4_iloc iloc;
        int inode_len, ino, ret, tag = tl->fc_tag;
        struct ext4_extent_header *eh;
        size_t off_gen = offsetof(struct ext4_inode, i_generation);

        memcpy(&fc_inode, val, sizeof(fc_inode));

        ino = le32_to_cpu(fc_inode.fc_ino);
        trace_ext4_fc_replay(sb, tag, ino, 0, 0);

        /* If the inode already exists, clear its blocks from the bitmaps. */
        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
        if (!IS_ERR(inode)) {
                ext4_ext_clear_bb(inode);
                iput(inode);
        }
        inode = NULL;

        ret = ext4_fc_record_modified_inode(sb, ino);
        if (ret)
                goto out;

        raw_fc_inode = (struct ext4_inode *)
                (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
        ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
        if (ret)
                goto out;

        /*
         * From here on iloc.bh is held and must be released on every exit
         * path.
         */
        inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
        raw_inode = ext4_raw_inode(&iloc);

        /* Copy everything before i_block, then i_generation onwards. */
        memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
        memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
               inode_len - off_gen);
        if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
                eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
                if (eh->eh_magic != EXT4_EXT_MAGIC) {
                        /* No valid extent header yet: set up an empty one. */
                        memset(eh, 0, sizeof(*eh));
                        eh->eh_magic = EXT4_EXT_MAGIC;
                        eh->eh_max = cpu_to_le16(
                                (sizeof(raw_inode->i_block) -
                                 sizeof(struct ext4_extent_header))
                                 / sizeof(struct ext4_extent));
                }
        } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
                memcpy(raw_inode->i_block, raw_fc_inode->i_block,
                        sizeof(raw_inode->i_block));
        }

        /* Immediately update the inode on disk. */
        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
        if (ret)
                goto out_brelse;
        ret = sync_dirty_buffer(iloc.bh);
        if (ret)
                goto out_brelse;
        ret = ext4_mark_inode_used(sb, ino);
        if (ret)
                goto out_brelse;

        /* Given that we just wrote the inode on disk, this SHOULD succeed. */
        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("Inode not found.");
                /* inode is an error pointer here, so do not iput() it. */
                brelse(iloc.bh);
                return -EFSCORRUPTED;
        }

        /*
         * Our allocator could have made different decisions than before
         * crashing. This should be fixed but until then, we recalculate
         * the number of blocks the inode uses.
         */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                ext4_ext_replay_set_iblocks(inode);

        inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
        ext4_reset_inode_seed(inode);

        ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
        sync_dirty_buffer(iloc.bh);
out_brelse:
        brelse(iloc.bh);
out:
        iput(inode);
        if (!ret)
                blkdev_issue_flush(sb->s_bdev);

        return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 *
 * Returns 0 on success, -EINVAL if the inode cannot be loaded, or the error
 * from ext4_mark_inode_used() / ext4_fc_replay_link_internal(). A missing
 * parent directory or a failure to initialise a new directory is skipped and
 * reported as success.
 */
static int ext4_fc_replay_create(struct super_block *sb,
                                 struct ext4_fc_tl_mem *tl, u8 *val)
{
        int ret = 0;
        struct inode *inode = NULL;
        struct inode *dir = NULL;
        struct dentry_info_args darg;

        tl_to_darg(&darg, tl, val);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
                        darg.parent_ino, darg.dname_len);

        /* This takes care of update group descriptor and other metadata */
        ret = ext4_mark_inode_used(sb, darg.ino);
        if (ret)
                goto out;

        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("inode %d not found.", darg.ino);
                /* out: does iput(inode), so never leave an ERR_PTR here. */
                inode = NULL;
                ret = -EINVAL;
                goto out;
        }

        if (S_ISDIR(inode->i_mode)) {
                /*
                 * If we are creating a directory, we need to make sure that the
                 * dot and dot dot dirents are setup properly.
                 */
                dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
                if (IS_ERR(dir)) {
                        /* Report the parent that was looked up, not the child. */
                        ext4_debug("Dir %d not found.", darg.parent_ino);
                        goto out;
                }
                ret = ext4_init_new_dir(NULL, dir, inode);
                iput(dir);
                if (ret) {
                        ret = 0;
                        goto out;
                }
        }
        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
        if (ret)
                goto out;
        set_nlink(inode, 1);
        ext4_mark_inode_dirty(NULL, inode);
out:
        iput(inode);
        return ret;
}

/*
 * Record a physical disk region that is in use according to the fast commit
 * area and used by an inode during the replay phase. The simple replay-phase
 * allocator excludes these regions from allocation.
 *
 * Returns 0 on success or -ENOMEM when the region array cannot be grown.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
                ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
        struct ext4_fc_replay_state *state = &EXT4_SB(sb)->s_fc_replay_state;
        struct ext4_fc_alloc_region *slot;

        /*
         * During the replay phase fc_regions_valid may differ from
         * fc_regions_used; resynchronise before adding a new entry.
         */
        if (replay) {
                if (state->fc_regions_used != state->fc_regions_valid)
                        state->fc_regions_used = state->fc_regions_valid;
        }

        if (state->fc_regions_used == state->fc_regions_size) {
                /*
                 * Array is full: grow it by one increment, keeping the old
                 * array reachable if krealloc() fails.
                 */
                struct ext4_fc_alloc_region *grown;

                grown = krealloc(state->fc_regions,
                                 sizeof(struct ext4_fc_alloc_region) *
                                 (state->fc_regions_size +
                                  EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                 GFP_KERNEL);
                if (!grown)
                        return -ENOMEM;
                state->fc_regions = grown;
                state->fc_regions_size +=
                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
        }

        slot = &state->fc_regions[state->fc_regions_used];
        state->fc_regions_used++;

        slot->ino = ino;
        slot->lblk = lblk;
        slot->pblk = pblk;
        slot->len = len;

        if (replay)
                state->fc_regions_valid++;

        return 0;
}

/*
 * Replay add range tag
 *
 * Makes the logical range described by the extent in the TLV map to the
 * physical blocks recorded there, with the recorded written/unwritten state.
 * The range is processed piecewise, as returned by ext4_map_blocks():
 *  - unmapped piece: a new extent is inserted;
 *  - piece mapped to different physical blocks: the extent is updated and
 *    the old blocks are marked free;
 *  - piece mapped to the expected blocks: only the state is updated.
 *
 * NOTE(review): this function always returns 0; errors in "ret" end the
 * replay of this tag early but are not propagated to the caller.
 */
static int ext4_fc_replay_add_range(struct super_block *sb,
                                    struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct ext4_fc_add_range fc_add_ex;
        struct ext4_extent newex, *ex;
        struct inode *inode;
        ext4_lblk_t start, cur;
        int remaining, len;
        ext4_fsblk_t start_pblk;
        struct ext4_map_blocks map;
        struct ext4_ext_path *path = NULL;
        int ret;

        /* Work on a local copy of the TLV value. */
        memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
        ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
                le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
                ext4_ext_get_actual_len(ex));

        inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("Inode not found.");
                return 0;
        }

        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
        if (ret)
                goto out;

        start = le32_to_cpu(ex->ee_block);
        start_pblk = ext4_ext_pblock(ex);
        len = ext4_ext_get_actual_len(ex);

        cur = start;
        remaining = len;
        ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
                  start, start_pblk, len, ext4_ext_is_unwritten(ex),
                  inode->i_ino);

        while (remaining > 0) {
                map.m_lblk = cur;
                map.m_len = remaining;
                map.m_pblk = 0;
                ret = ext4_map_blocks(NULL, inode, &map, 0);

                if (ret < 0)
                        goto out;

                if (ret == 0) {
                        /* Range is not mapped */
                        path = ext4_find_extent(inode, cur, NULL, 0);
                        if (IS_ERR(path))
                                goto out;
                        memset(&newex, 0, sizeof(newex));
                        newex.ee_block = cpu_to_le32(cur);
                        /* Physical block that corresponds to "cur". */
                        ext4_ext_store_pblock(
                                &newex, start_pblk + cur - start);
                        newex.ee_len = cpu_to_le16(map.m_len);
                        if (ext4_ext_is_unwritten(ex))
                                ext4_ext_mark_unwritten(&newex);
                        down_write(&EXT4_I(inode)->i_data_sem);
                        ret = ext4_ext_insert_extent(
                                NULL, inode, &path, &newex, 0);
                        up_write((&EXT4_I(inode)->i_data_sem));
                        ext4_free_ext_path(path);
                        if (ret)
                                goto out;
                        goto next;
                }

                if (start_pblk + cur - start != map.m_pblk) {
                        /*
                         * Logical to physical mapping changed. This can happen
                         * if this range was removed and then reallocated to
                         * map to new physical blocks during a fast commit.
                         */
                        ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
                                        ext4_ext_is_unwritten(ex),
                                        start_pblk + cur - start);
                        if (ret)
                                goto out;
                        /*
                         * Mark the old blocks as free since they aren't used
                         * anymore. We maintain an array of all the modified
                         * inodes. In case these blocks are still used at either
                         * a different logical range in the same inode or in
                         * some different inode, we will mark them as allocated
                         * at the end of the FC replay using our array of
                         * modified inodes.
                         */
                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
                        goto next;
                }

                /* Range is mapped and needs a state change */
                ext4_debug("Converting from %ld to %d %lld",
                                map.m_flags & EXT4_MAP_UNWRITTEN,
                        ext4_ext_is_unwritten(ex), map.m_pblk);
                ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
                                        ext4_ext_is_unwritten(ex), map.m_pblk);
                if (ret)
                        goto out;
                /*
                 * We may have split the extent tree while toggling the state.
                 * Try to shrink the extent tree now.
                 */
                ext4_ext_replay_shrink_inode(inode, start + len);
next:
                /* Advance past the piece just handled. */
                cur += map.m_len;
                remaining -= map.m_len;
        }
        /* Final shrink, using i_size converted to a block count. */
        ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
                                        sb->s_blocksize_bits);
out:
        iput(inode);
        return 0;
}

/*
 * Replay DEL_RANGE tag
 *
 * Marks every currently mapped piece of the logical range as free in the
 * block bitmaps, then removes the range from the inode's extent tree.
 * A missing inode is skipped. Always returns 0; errors only stop the replay
 * of this tag early.
 */
static int
ext4_fc_replay_del_range(struct super_block *sb,
                         struct ext4_fc_tl_mem *tl, u8 *val)
{
        struct ext4_fc_del_range lrange;
        struct ext4_map_blocks map;
        struct inode *inode;
        ext4_lblk_t cur, remaining;
        int ret;

        memcpy(&lrange, val, sizeof(lrange));
        cur = le32_to_cpu(lrange.fc_lblk);
        remaining = le32_to_cpu(lrange.fc_len);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
                le32_to_cpu(lrange.fc_ino), cur, remaining);

        inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
                return 0;
        }

        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
        if (ret)
                goto out;

        ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
                        inode->i_ino, le32_to_cpu(lrange.fc_lblk),
                        le32_to_cpu(lrange.fc_len));

        /* Walk the range piece by piece, freeing whatever is mapped. */
        while (remaining > 0) {
                ext4_lblk_t step;

                map.m_lblk = cur;
                map.m_len = remaining;

                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        goto out;

                if (ret > 0) {
                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
                        step = ret;
                } else {
                        step = map.m_len;
                }
                cur += step;
                remaining -= step;
        }

        down_write(&EXT4_I(inode)->i_data_sem);
        ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
                                le32_to_cpu(lrange.fc_lblk) +
                                le32_to_cpu(lrange.fc_len) - 1);
        up_write(&EXT4_I(inode)->i_data_sem);

        if (!ret) {
                ext4_ext_replay_shrink_inode(inode,
                        i_size_read(inode) >> sb->s_blocksize_bits);
                ext4_mark_inode_dirty(NULL, inode);
        }
out:
        iput(inode);
        return 0;
}

/*
 * For every inode recorded in fc_modified_inodes, walk its logical block
 * range [0, EXT_MAX_BLOCKS) and mark the blocks found there as in use via
 * ext4_mb_mark_bb(..., true). Inodes that cannot be loaded and inline-data
 * inodes are skipped. Mapping errors end the walk of that inode only.
 */
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
        struct ext4_fc_replay_state *state;
        struct inode *inode;
        struct ext4_ext_path *path = NULL;
        struct ext4_map_blocks map;
        int i, ret, j;
        ext4_lblk_t cur, end;

        state = &EXT4_SB(sb)->s_fc_replay_state;
        for (i = 0; i < state->fc_modified_inodes_used; i++) {
                inode = ext4_iget(sb, state->fc_modified_inodes[i],
                        EXT4_IGET_NORMAL);
                if (IS_ERR(inode)) {
                        ext4_debug("Inode %d not found.",
                                state->fc_modified_inodes[i]);
                        continue;
                }
                cur = 0;
                end = EXT_MAX_BLOCKS;
                if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
                        iput(inode);
                        continue;
                }
                while (cur < end) {
                        map.m_lblk = cur;
                        map.m_len = end - cur;

                        ret = ext4_map_blocks(NULL, inode, &map, 0);
                        if (ret < 0)
                                break;

                        if (ret > 0) {
                                /*
                                 * Mapped range: first mark the p_block of
                                 * each of the first p_depth entries of the
                                 * extent path, then the data blocks.
                                 */
                                path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
                                if (!IS_ERR(path)) {
                                        for (j = 0; j < path->p_depth; j++)
                                                ext4_mb_mark_bb(inode->i_sb,
                                                        path[j].p_block, 1, true);
                                        ext4_free_ext_path(path);
                                }
                                cur += ret;
                                ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
                                                        map.m_len, true);
                        } else {
                                /*
                                 * Nothing mapped here: skip m_len blocks, or
                                 * one block if m_len is 0, so that the loop
                                 * always advances.
                                 */
                                cur = cur + (map.m_len ? map.m_len : 1);
                        }
                }
                iput(inode);
        }
}

/*
 * Check if a block lies in one of the regions excluded from block
 * allocation. The simple allocator that runs during the replay phase calls
 * this function to see if it is okay to use a block.
 *
 * Returns true if @blk falls inside a valid recorded region. Regions whose
 * inode number or length is zero are ignored.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
        struct ext4_fc_replay_state *state = &EXT4_SB(sb)->s_fc_replay_state;
        struct ext4_fc_alloc_region *region;
        int idx;

        for (idx = 0; idx < state->fc_regions_valid; idx++) {
                region = &state->fc_regions[idx];

                if (region->ino != 0 && region->len != 0 &&
                    in_range(blk, region->pblk, region->len))
                        return true;
        }
        return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        sbi->s_mount_state &= ~EXT4_FC_REPLAY;
        kfree(sbi->s_fc_replay_state.fc_regions);
        kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

/*
 * Check whether @len is an acceptable value length for a fast commit tag of
 * type @tag.
 *
 * Tags with a fixed-size value must match the size of their structure
 * exactly. Dentry and inode tags carry a variable-sized remainder after a
 * fixed header, and that remainder is range checked. Unknown tags are
 * rejected.
 *
 * Returns true if the length is valid for the tag, false otherwise.
 */
static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
                                      int tag, int len)
{
        bool valid = false;
        int remainder;

        switch (tag) {
        case EXT4_FC_TAG_PAD:
                /* padding can have any length */
                valid = true;
                break;
        case EXT4_FC_TAG_HEAD:
                valid = len == sizeof(struct ext4_fc_head);
                break;
        case EXT4_FC_TAG_TAIL:
                /* a tail may be longer than struct ext4_fc_tail, never shorter */
                valid = len >= sizeof(struct ext4_fc_tail);
                break;
        case EXT4_FC_TAG_ADD_RANGE:
                valid = len == sizeof(struct ext4_fc_add_range);
                break;
        case EXT4_FC_TAG_DEL_RANGE:
                valid = len == sizeof(struct ext4_fc_del_range);
                break;
        case EXT4_FC_TAG_CREAT:
        case EXT4_FC_TAG_LINK:
        case EXT4_FC_TAG_UNLINK:
                /* what follows the dentry info must be 1..EXT4_NAME_LEN bytes */
                remainder = len - sizeof(struct ext4_fc_dentry_info);
                valid = remainder >= 1 && remainder <= EXT4_NAME_LEN;
                break;
        case EXT4_FC_TAG_INODE:
                /* what follows the header is bounded by the inode size limits */
                remainder = len - sizeof(struct ext4_fc_inode);
                valid = remainder >= EXT4_GOOD_OLD_INODE_SIZE &&
                        remainder <= sbi->s_inode_size;
                break;
        default:
                break;
        }

        return valid;
}

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that scan has finished and JBD2 can now start replay phase.
 * It returns a negative error to indicate that there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay
 * phase.
 *
 * @journal:      journal whose j_private points at the super block
 * @bh:           buffer holding one fast commit block
 * @off:          index of this block; must equal the expected offset tracked
 *                in the replay state
 * @expected_tid: transaction ID that HEAD and TAIL tags must carry
 */
static int ext4_fc_replay_scan(journal_t *journal,
                                struct buffer_head *bh, int off,
                                tid_t expected_tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_replay_state *state;
        int ret = JBD2_FC_REPLAY_CONTINUE;
        struct ext4_fc_add_range ext;
        struct ext4_fc_tl_mem tl;
        struct ext4_fc_tail tail;
        __u8 *start, *end, *cur, *val;
        struct ext4_fc_head head;
        struct ext4_extent *ex;

        state = &sbi->s_fc_replay_state;

        /* [start, end) spans the one journal block held in @bh */
        start = (u8 *)bh->b_data;
        end = start + journal->j_blocksize;

        /* Expected offset 0 means a new scan is starting: reset the scan state */
        if (state->fc_replay_expected_off == 0) {
                state->fc_cur_tag = 0;
                state->fc_replay_num_tags = 0;
                state->fc_crc = 0;
                state->fc_regions = NULL;
                state->fc_regions_valid = state->fc_regions_used =
                        state->fc_regions_size = 0;
                /* Check if we can stop early */
                /*
                 * The first tag of the first block is not a HEAD tag.
                 * NOTE(review): this returns a literal 0 and bypasses the
                 * tracepoint at out_err; 0 is presumably
                 * JBD2_FC_REPLAY_STOP - confirm against the jbd2 header.
                 */
                if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
                        != EXT4_FC_TAG_HEAD)
                        return 0;
        }

        /* Blocks must be presented in order */
        if (off != state->fc_replay_expected_off) {
                ret = -EFSCORRUPTED;
                goto out_err;
        }

        state->fc_replay_expected_off++;
        /*
         * Walk the tag/length/value records in this block. Each iteration
         * advances by the tag header plus the value length just read.
         */
        for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
             cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
                ext4_fc_get_tl(&tl, cur);
                val = cur + EXT4_FC_TAG_BASE_LEN;
                /*
                 * The value must fit in the rest of the block and have a
                 * length that is valid for its tag. If a complete set of
                 * tags was already accepted (fc_replay_num_tags != 0), stop
                 * scanning instead of failing.
                 */
                if (tl.fc_len > end - val ||
                    !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
                        ret = state->fc_replay_num_tags ?
                                JBD2_FC_REPLAY_STOP : -ECANCELED;
                        goto out_err;
                }
                ext4_debug("Scan phase, tag:%s, blk %lld\n",
                           tag2str(tl.fc_tag), bh->b_blocknr);
                switch (tl.fc_tag) {
                case EXT4_FC_TAG_ADD_RANGE:
                        /* Record the extent's blocks via ext4_fc_record_regions() */
                        memcpy(&ext, val, sizeof(ext));
                        ex = (struct ext4_extent *)&ext.fc_ex;
                        ret = ext4_fc_record_regions(sb,
                                le32_to_cpu(ext.fc_ino),
                                le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
                                ext4_ext_get_actual_len(ex), 0);
                        if (ret < 0)
                                break;
                        /* Restore the default result before the shared accounting */
                        ret = JBD2_FC_REPLAY_CONTINUE;
                        fallthrough;
                case EXT4_FC_TAG_DEL_RANGE:
                case EXT4_FC_TAG_LINK:
                case EXT4_FC_TAG_UNLINK:
                case EXT4_FC_TAG_CREAT:
                case EXT4_FC_TAG_INODE:
                case EXT4_FC_TAG_PAD:
                        /* Count the tag and fold header + value into the running CRC */
                        state->fc_cur_tag++;
                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
                                EXT4_FC_TAG_BASE_LEN + tl.fc_len);
                        break;
                case EXT4_FC_TAG_TAIL:
                        state->fc_cur_tag++;
                        memcpy(&tail, val, sizeof(tail));
                        /* The CRC covers the tail only up to its fc_crc field */
                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
                                                EXT4_FC_TAG_BASE_LEN +
                                                offsetof(struct ext4_fc_tail,
                                                fc_crc));
                        /*
                         * A tail with the expected tid and a matching CRC
                         * makes every tag and region seen so far valid for
                         * replay. Otherwise stop if an earlier tail was
                         * already accepted, or report a bad CRC.
                         */
                        if (le32_to_cpu(tail.fc_tid) == expected_tid &&
                                le32_to_cpu(tail.fc_crc) == state->fc_crc) {
                                state->fc_replay_num_tags = state->fc_cur_tag;
                                state->fc_regions_valid =
                                        state->fc_regions_used;
                        } else {
                                ret = state->fc_replay_num_tags ?
                                        JBD2_FC_REPLAY_STOP : -EFSBADCRC;
                        }
                        /* Start a fresh CRC for whatever follows this tail */
                        state->fc_crc = 0;
                        break;
                case EXT4_FC_TAG_HEAD:
                        memcpy(&head, val, sizeof(head));
                        /* Refuse feature bits outside EXT4_FC_SUPPORTED_FEATURES */
                        if (le32_to_cpu(head.fc_features) &
                                ~EXT4_FC_SUPPORTED_FEATURES) {
                                ret = -EOPNOTSUPP;
                                break;
                        }
                        /* A head carrying a different tid ends the scan */
                        if (le32_to_cpu(head.fc_tid) != expected_tid) {
                                ret = JBD2_FC_REPLAY_STOP;
                                break;
                        }
                        state->fc_cur_tag++;
                        state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
                                EXT4_FC_TAG_BASE_LEN + tl.fc_len);
                        break;
                default:
                        /* Unknown tag: stop if something is replayable, else fail */
                        ret = state->fc_replay_num_tags ?
                                JBD2_FC_REPLAY_STOP : -ECANCELED;
                }
                if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
                        break;
        }

out_err:
        trace_ext4_fc_replay_scan(sb, ret, off);
        return ret;
}

/*
 * Main recovery path entry point.
 * The meaning of return codes is similar to ext4_fc_replay_scan().
 *
 * For PASS_SCAN the block is handed to ext4_fc_replay_scan(). For any other
 * pass the tags in @bh are dispatched to the per-tag replay handlers until
 * the number of tags counted by the scan phase has been consumed.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
                                enum passtype pass, int off, tid_t expected_tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_tl_mem tl;
        __u8 *start, *end, *cur, *val;
        int ret = JBD2_FC_REPLAY_CONTINUE;
        struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
        struct ext4_fc_tail tail;

        if (pass == PASS_SCAN) {
                state->fc_current_pass = PASS_SCAN;
                return ext4_fc_replay_scan(journal, bh, off, expected_tid);
        }

        /* On the first block of a new pass, flag the mount as being in replay */
        if (state->fc_current_pass != pass) {
                state->fc_current_pass = pass;
                sbi->s_mount_state |= EXT4_FC_REPLAY;
        }
        /* No tags left to replay: finalize bitmaps and counters, then return 0 */
        if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
                ext4_debug("Replay stops\n");
                ext4_fc_set_bitmaps_and_counters(sb);
                return 0;
        }

#ifdef CONFIG_EXT4_DEBUG
        /* Debug knob: ignore blocks at or beyond s_fc_debug_max_replay */
        if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
                pr_warn("Dropping fc block %d because max_replay set\n", off);
                return JBD2_FC_REPLAY_STOP;
        }
#endif

        /* [start, end) spans the one journal block held in @bh */
        start = (u8 *)bh->b_data;
        end = start + journal->j_blocksize;

        /* Walk the tag/length/value records in this block */
        for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
             cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
                ext4_fc_get_tl(&tl, cur);
                val = cur + EXT4_FC_TAG_BASE_LEN;

                /* All counted tags are consumed: finalize and stop */
                if (state->fc_replay_num_tags == 0) {
                        ret = JBD2_FC_REPLAY_STOP;
                        ext4_fc_set_bitmaps_and_counters(sb);
                        break;
                }

                ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
                /* Every tag, whatever its type, uses up one counted tag */
                state->fc_replay_num_tags--;
                switch (tl.fc_tag) {
                case EXT4_FC_TAG_LINK:
                        ret = ext4_fc_replay_link(sb, &tl, val);
                        break;
                case EXT4_FC_TAG_UNLINK:
                        ret = ext4_fc_replay_unlink(sb, &tl, val);
                        break;
                case EXT4_FC_TAG_ADD_RANGE:
                        ret = ext4_fc_replay_add_range(sb, &tl, val);
                        break;
                case EXT4_FC_TAG_CREAT:
                        ret = ext4_fc_replay_create(sb, &tl, val);
                        break;
                case EXT4_FC_TAG_DEL_RANGE:
                        ret = ext4_fc_replay_del_range(sb, &tl, val);
                        break;
                case EXT4_FC_TAG_INODE:
                        ret = ext4_fc_replay_inode(sb, &tl, val);
                        break;
                case EXT4_FC_TAG_PAD:
                        /* Nothing to replay for padding; only trace it */
                        trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
                                             tl.fc_len, 0);
                        break;
                case EXT4_FC_TAG_TAIL:
                        trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
                                             0, tl.fc_len, 0);
                        memcpy(&tail, val, sizeof(tail));
                        /* A tid mismatch here only warns; replay continues */
                        WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
                        break;
                case EXT4_FC_TAG_HEAD:
                        break;
                default:
                        trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
                        ret = -ECANCELED;
                        break;
                }
                if (ret < 0)
                        break;
                /* Any non-negative handler result is normalized to CONTINUE */
                ret = JBD2_FC_REPLAY_CONTINUE;
        }
        return ret;
}

/*
 * Install the fast commit callbacks on @journal.
 */
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
        /*
         * The replay callback is set even if fast commit is disabled, because
         * there may still be fast commit blocks that need to be replayed even
         * though fast commit has now been turned off.
         */
        journal->j_fc_replay_callback = ext4_fc_replay;

        /* The cleanup callback is set only when JOURNAL_FAST_COMMIT is set */
        if (test_opt2(sb, JOURNAL_FAST_COMMIT))
                journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

/*
 * Descriptions of the fast commit ineligibility reasons, indexed by
 * EXT4_FC_REASON_* value. ext4_fc_info_show() prints one line per entry.
 */
static const char * const fc_ineligible_reasons[] = {
        [EXT4_FC_REASON_XATTR]                  = "Extended attributes changed",
        [EXT4_FC_REASON_CROSS_RENAME]           = "Cross rename",
        [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE]    = "Journal flag changed",
        [EXT4_FC_REASON_NOMEM]                  = "Insufficient memory",
        [EXT4_FC_REASON_SWAP_BOOT]              = "Swap boot",
        [EXT4_FC_REASON_RESIZE]                 = "Resize",
        [EXT4_FC_REASON_RENAME_DIR]             = "Dir renamed",
        [EXT4_FC_REASON_FALLOC_RANGE]           = "Falloc range op",
        [EXT4_FC_REASON_INODE_JOURNAL_DATA]     = "Data journalling",
        [EXT4_FC_REASON_ENCRYPTED_FILENAME]     = "Encrypted filename",
};

/*
 * seq_file show callback that prints the fast commit statistics of the super
 * block stored in @seq->private: the commit counters, the average commit
 * time, and one counter per ineligibility reason.
 *
 * Output is produced only when @v is SEQ_START_TOKEN. Always returns 0.
 */
int ext4_fc_info_show(struct seq_file *seq, void *v)
{
        struct super_block *sb = seq->private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_stats *stats = &sbi->s_fc_stats;
        int reason;

        if (v != SEQ_START_TOKEN)
                return 0;

        /* s_fc_avg_commit_time is divided by 1000 for the "us" column */
        seq_printf(seq,
                "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
                   stats->fc_num_commits, stats->fc_ineligible_commits,
                   stats->fc_numblks,
                   div_u64(stats->s_fc_avg_commit_time, 1000));

        seq_puts(seq, "Ineligible reasons:\n");
        for (reason = 0; reason < EXT4_FC_REASON_MAX; reason++) {
                seq_printf(seq, "\"%s\":\t%d\n",
                           fc_ineligible_reasons[reason],
                           stats->fc_ineligible_reason_count[reason]);
        }

        return 0;
}

/*
 * Create the slab cache for struct ext4_fc_dentry_update objects and store
 * it in ext4_fc_dentry_cachep.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
int __init ext4_fc_init_dentry_cache(void)
{
        ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
                                           SLAB_RECLAIM_ACCOUNT);

        return ext4_fc_dentry_cachep ? 0 : -ENOMEM;
}

/*
 * Destroy the slab cache referenced by ext4_fc_dentry_cachep, which
 * ext4_fc_init_dentry_cache() sets up.
 */
void ext4_fc_destroy_dentry_cache(void)
{
        kmem_cache_destroy(ext4_fc_dentry_cachep);
}



































    3 





    3 








    3 





























    2 










    2 

















































































    9 
















































































































































































































    3 
























    2 




    2 
















    3 







    4 






















    3 


    4 








































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/bitmap.h>
#include <linux/bug.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/xarray.h>

/**
 * idr_alloc_u32() - Allocate an ID.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @nextid: Pointer to an ID.
 * @max: The maximum ID to allocate (inclusive).
 * @gfp: Memory allocation flags.
 *
 * Allocates an unused ID in the range given by @nextid and @max.  Unlike
 * the @end parameter of idr_alloc(), which is exclusive, @max is
 * inclusive.  @nextid is updated with the new ID before the pointer is
 * inserted into the IDR, so if @nextid points into the object that @ptr
 * points to, a concurrent lookup will not find an uninitialised ID.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: 0 if an ID was allocated, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found.  If an error occurred,
 * @nextid is unchanged.
 */
int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,
                        unsigned long max, gfp_t gfp)
{
        unsigned int base = idr->idr_base;
        unsigned int first = *nextid;
        struct radix_tree_iter iter;
        void __rcu **slot;

        if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR)))
                idr->idr_rt.xa_flags |= IDR_RT_MARKER;

        /* Translate the requested ID to a tree index; below the base means 0 */
        if (first < base)
                first = 0;
        else
                first -= base;

        radix_tree_iter_init(&iter, first);
        slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base);
        if (IS_ERR(slot))
                return PTR_ERR(slot);

        /* Report the ID first, then make the pointer visible */
        *nextid = iter.index + base;
        /* there is a memory barrier inside radix_tree_iter_replace() */
        radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
        radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);

        return 0;
}
EXPORT_SYMBOL_GPL(idr_alloc_u32);

/**
 * idr_alloc() - Allocate an ID.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @start: The minimum ID (inclusive).
 * @end: The maximum ID (exclusive).
 * @gfp: Memory allocation flags.
 *
 * Allocates an unused ID in the range given by @start and @end.  An @end
 * that is <= 0 is treated as one larger than %INT_MAX, which lets callers
 * pass @start + N as @end as long as N is within integer range.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: The newly allocated ID, -EINVAL if @start is negative, -ENOMEM
 * if memory allocation failed, or -ENOSPC if no free IDs could be found.
 */
int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
        int last = end > 0 ? end - 1 : INT_MAX;
        u32 id;
        int err;

        if (WARN_ON_ONCE(start < 0))
                return -EINVAL;

        /* idr_alloc_u32() takes an inclusive upper bound */
        id = start;
        err = idr_alloc_u32(idr, ptr, &id, last, gfp);
        if (err)
                return err;

        return id;
}
EXPORT_SYMBOL_GPL(idr_alloc);

/**
 * idr_alloc_cyclic() - Allocate an ID cyclically.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @start: The minimum ID (inclusive).
 * @end: The maximum ID (exclusive).
 * @gfp: Memory allocation flags.
 *
 * Allocates an unused ID in the range given by @start and @end.  An @end
 * that is <= 0 is treated as one larger than %INT_MAX, which lets callers
 * pass @start + N as @end as long as N is within integer range.
 * The search begins at the ID following the last one allocated and wraps
 * around to @start if no free ID is found before reaching @end.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: The newly allocated ID, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found.
 */
int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
        u32 candidate = idr->idr_next;
        int last = end > 0 ? end - 1 : INT_MAX;
        int rc;

        /* Never begin the search below @start */
        if ((int)candidate < start)
                candidate = start;

        rc = idr_alloc_u32(idr, ptr, &candidate, last, gfp);
        if (rc == -ENOSPC && candidate > start) {
                /* Nothing free from the cursor upwards: retry from @start */
                candidate = start;
                rc = idr_alloc_u32(idr, ptr, &candidate, last, gfp);
        }
        if (rc)
                return rc;

        /* Remember where the next cyclic search should begin */
        idr->idr_next = candidate + 1;
        return candidate;
}
EXPORT_SYMBOL(idr_alloc_cyclic);

/**
 * idr_remove() - Remove an ID from the IDR.
 * @idr: IDR handle.
 * @id: Pointer ID.
 *
 * Removes this ID from the IDR.  If the ID was not previously in the IDR,
 * this function returns %NULL.
 *
 * Because this function modifies the IDR, the caller should provide their
 * own locking to ensure that concurrent modification of the same IDR is
 * not possible.
 *
 * Return: The pointer formerly associated with this ID.
 */
void *idr_remove(struct idr *idr, unsigned long id)
{
        /* The tree is indexed relative to the IDR base */
        unsigned long index = id - idr->idr_base;

        return radix_tree_delete_item(&idr->idr_rt, index, NULL);
}
EXPORT_SYMBOL_GPL(idr_remove);

/**
 * idr_find() - Return pointer for given ID.
 * @idr: IDR handle.
 * @id: Pointer ID.
 *
 * Looks up the pointer associated with this ID.  A %NULL result may mean
 * either that @id is not allocated or that the %NULL pointer was
 * associated with this ID.
 *
 * This function can be called under rcu_read_lock(), given that the leaf
 * pointers lifetimes are correctly managed.
 *
 * Return: The pointer associated with this ID.
 */
void *idr_find(const struct idr *idr, unsigned long id)
{
        /* The tree is indexed relative to the IDR base */
        unsigned long index = id - idr->idr_base;

        return radix_tree_lookup(&idr->idr_rt, index);
}
EXPORT_SYMBOL_GPL(idr_find);

/**
 * idr_for_each() - Iterate through all stored pointers.
 * @idr: IDR handle.
 * @fn: Function to be called for each pointer.
 * @data: Data passed to callback function.
 *
 * Calls @fn once for each entry in @idr, passing it the ID, the entry
 * and @data.
 *
 * If @fn returns anything other than %0, the iteration stops and that
 * value is returned from this function.  The iteration also stops, with a
 * one-time warning and a return value of %0, at the first ID larger than
 * %INT_MAX.
 *
 * idr_for_each() can be called concurrently with idr_alloc() and
 * idr_remove() if protected by RCU.  Newly added entries may not be
 * seen and deleted entries may be seen, but adding and removing entries
 * will not cause other entries to be skipped, nor spurious ones to be seen.
 */
int idr_for_each(const struct idr *idr,
                int (*fn)(int id, void *p, void *data), void *data)
{
        int base = idr->idr_base;
        struct radix_tree_iter iter;
        void __rcu **slot;

        radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) {
                unsigned long id = iter.index + base;
                int rc;

                /* The callback takes an int ID, so larger IDs cannot be reported */
                if (WARN_ON_ONCE(id > INT_MAX))
                        return 0;

                rc = fn(id, rcu_dereference_raw(*slot), data);
                if (rc)
                        return rc;
        }

        return 0;
}
EXPORT_SYMBOL(idr_for_each);

/**
 * idr_get_next_ul() - Find next populated entry.
 * @idr: IDR handle.
 * @nextid: Pointer to an ID.
 *
 * Returns the next populated entry in the tree with an ID greater than
 * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
 * to the ID of the found value.  To use in a loop, the value pointed to by
 * nextid must be incremented by the user.
 *
 * If no entry is found, %NULL is returned and @nextid is left unchanged.
 */
void *idr_get_next_ul(struct idr *idr, unsigned long *nextid)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        void *entry = NULL;
        unsigned long base = idr->idr_base;
        unsigned long id = *nextid;

        /* Translate the ID to a tree index; anything below the base starts at 0 */
        id = (id < base) ? 0 : id - base;
        radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, id) {
                entry = rcu_dereference_raw(*slot);
                /* Skip slots that currently hold no entry */
                if (!entry)
                        continue;
                /* Anything that is not an XArray internal entry is a result */
                if (!xa_is_internal(entry))
                        break;
                /*
                 * Internal entry: accept it unless it sits in the tree head
                 * or is a retry entry.
                 * NOTE(review): presumably internal entries are seen when the
                 * tree changes under a lockless walk - confirm against the
                 * XArray documentation.
                 */
                if (slot != &idr->idr_rt.xa_head && !xa_is_retry(entry))
                        break;
                /* Otherwise retry the lookup for this index */
                slot = radix_tree_iter_retry(&iter);
        }
        /* A NULL slot here means the walk ended without finding an entry */
        if (!slot)
                return NULL;

        /* Convert the tree index back into an ID for the caller */
        *nextid = iter.index + base;
        return entry;
}
EXPORT_SYMBOL(idr_get_next_ul);

/**
 * idr_get_next() - Find next populated entry.
 * @idr: IDR handle.
 * @nextid: Pointer to an ID.
 *
 * Returns the next populated entry in the tree with an ID greater than
 * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
 * to the ID of the found value.  To use in a loop, the value pointed to by
 * nextid must be incremented by the user.
 *
 * If the resulting ID is larger than %INT_MAX, a one-time warning is issued,
 * %NULL is returned and @nextid is not updated.
 */
void *idr_get_next(struct idr *idr, int *nextid)
{
        unsigned long found = *nextid;
        void *entry;

        entry = idr_get_next_ul(idr, &found);

        /* The ID has to fit in the caller's int */
        if (WARN_ON_ONCE(found > INT_MAX))
                return NULL;

        *nextid = found;
        return entry;
}
EXPORT_SYMBOL(idr_get_next);

/**
 * idr_replace() - replace pointer for given ID.
 * @idr: IDR handle.
 * @ptr: New pointer to associate with the ID.
 * @id: ID to change.
 *
 * Replace the pointer registered with an ID and return the old value.
 * This function can be called under the RCU read lock concurrently with
 * idr_alloc() and idr_remove() (as long as the ID being removed is not
 * the one being replaced!).
 *
 * Returns: the old value on success.  %-ENOENT (as an ERR_PTR) indicates
 * that @id was not found.
 */
void *idr_replace(struct idr *idr, void *ptr, unsigned long id)
{
        /* The tree is indexed relative to the IDR base */
        unsigned long index = id - idr->idr_base;
        struct radix_tree_node *node;
        void __rcu **slot = NULL;
        void *old;

        old = __radix_tree_lookup(&idr->idr_rt, index, &node, &slot);

        /* No slot, or a slot still tagged IDR_FREE, means the ID is not in use */
        if (!slot || radix_tree_tag_get(&idr->idr_rt, index, IDR_FREE))
                return ERR_PTR(-ENOENT);

        __radix_tree_replace(&idr->idr_rt, node, slot, ptr);

        return old;
}
EXPORT_SYMBOL(idr_replace);

/**
 * DOC: IDA description
 *
 * The IDA is an ID allocator which does not provide the ability to
 * associate an ID with a pointer.  As such, it only needs to store one
 * bit per ID, and so is more space efficient than an IDR.  To use an IDA,
 * define it using DEFINE_IDA() (or embed a &struct ida in a data structure,
 * then initialise it using ida_init()).  To allocate a new ID, call
 * ida_alloc(), ida_alloc_min(), ida_alloc_max() or ida_alloc_range().
 * To free an ID, call ida_free().
 *
 * ida_destroy() can be used to dispose of an IDA without needing to
 * free the individual IDs in it.  You can use ida_is_empty() to find
 * out whether the IDA has any IDs currently allocated.
 *
 * The IDA handles its own locking.  It is safe to call any of the IDA
 * functions without synchronisation in your code.
 *
 * IDs are currently limited to the range [0-INT_MAX].  If this is an awkward
 * limitation, it should be quite straightforward to raise the maximum.
 */

/*
 * Developer's notes:
 *
 * The IDA uses the functionality provided by the XArray to store bitmaps in
 * each entry.  The XA_FREE_MARK is only cleared when all bits in the bitmap
 * have been set.
 *
 * I considered telling the XArray that each slot is an order-10 node
 * and indexing by bit number, but the XArray can't allow a single multi-index
 * entry in the head, which would significantly increase memory consumption
 * for the IDA.  So instead we divide the index by the number of bits in the
 * leaf bitmap before doing a radix tree lookup.
 *
 * As an optimisation, if there are only a few low bits set in any given
 * leaf, instead of allocating a 128-byte bitmap, we store the bits
 * as a value entry.  Value entries never have the XA_FREE_MARK cleared
 * because we can always convert them into a bitmap entry.
 *
 * It would be possible to optimise further; once we've run out of a
 * single 128-byte bitmap, we currently switch to a 576-byte node, put
 * the 128-byte bitmap in the first entry and then start allocating extra
 * 128-byte entries.  We could instead use the 512 bytes of the node's
 * data as a bitmap before moving to that scheme.  I do not believe this
 * is a worthwhile optimisation; Rasmus Villemoes surveyed the current
 * users of the IDA and almost none of them use more than 1024 entries.
 * Those that do use more than the 8192 IDs that the 512 bytes would
 * provide.
 *
 * The IDA always uses a lock to alloc/free.  If we add a 'test_bit'
 * equivalent, it will still need locking.  Going to RCU lookup would require
 * using RCU to free bitmaps, and that's not trivial without embedding an
 * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte
 * bitmap, which is excessive.
 */

/**
 * ida_alloc_range() - Allocate an unused ID.
 * @ida: IDA handle.
 * @min: Lowest ID to allocate.
 * @max: Highest ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Allocate an ID between @min and @max, inclusive.  The allocated ID will
 * not exceed %INT_MAX, even if @max is larger.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max,
                        gfp_t gfp)
{
        /* The XArray is indexed by chunk: one slot covers IDA_BITMAP_BITS IDs. */
        XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS);
        unsigned bit = min % IDA_BITMAP_BITS;
        unsigned long flags;
        /* 'alloc' is a bitmap allocated outside the lock, kept across retries. */
        struct ida_bitmap *bitmap, *alloc = NULL;

        /* A @min with the sign bit set can never be satisfied. */
        if ((int)min < 0)
                return -ENOSPC;

        /* Clamp @max so the returned ID always fits in a non-negative int. */
        if ((int)max < 0)
                max = INT_MAX;

retry:
        xas_lock_irqsave(&xas, flags);
next:
        /* Look for the next chunk, up to the one holding @max, marked as having free bits. */
        bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK);
        /* Once past the chunk that contains @min, any bit in the chunk is acceptable. */
        if (xas.xa_index > min / IDA_BITMAP_BITS)
                bit = 0;
        if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                goto nospc;

        if (xa_is_value(bitmap)) {
                /* The chunk is stored inline as a value entry of BITS_PER_XA_VALUE bits. */
                unsigned long tmp = xa_to_value(bitmap);

                if (bit < BITS_PER_XA_VALUE) {
                        bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit);
                        if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                                goto nospc;
                        if (bit < BITS_PER_XA_VALUE) {
                                /* A free bit fits in the value entry: update it in place. */
                                tmp |= 1UL << bit;
                                xas_store(&xas, xa_mk_value(tmp));
                                goto out;
                        }
                }
                /*
                 * The wanted bit does not fit in a value entry, so convert the
                 * slot to a full bitmap.  Prefer the bitmap preallocated by an
                 * earlier pass; otherwise try a non-sleeping allocation.
                 */
                bitmap = alloc;
                if (!bitmap)
                        bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
                if (!bitmap)
                        goto alloc;
                /* Carry the bits of the old value entry over into the first word. */
                bitmap->bitmap[0] = tmp;
                xas_store(&xas, bitmap);
                if (xas_error(&xas)) {
                        /* The store failed: undo the copy before leaving via 'out'. */
                        bitmap->bitmap[0] = 0;
                        goto out;
                }
        }

        if (bitmap) {
                /* A full bitmap exists for this chunk: take its next zero bit. */
                bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit);
                if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                        goto nospc;
                /* No zero bit at or after 'bit' in this chunk: move to the next one. */
                if (bit == IDA_BITMAP_BITS)
                        goto next;

                __set_bit(bit, bitmap->bitmap);
                /* A completely full chunk no longer advertises free space. */
                if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS))
                        xas_clear_mark(&xas, XA_FREE_MARK);
        } else {
                /* Nothing is stored at this index yet: create a new entry. */
                if (bit < BITS_PER_XA_VALUE) {
                        bitmap = xa_mk_value(1UL << bit);
                } else {
                        bitmap = alloc;
                        if (!bitmap)
                                bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
                        if (!bitmap)
                                goto alloc;
                        __set_bit(bit, bitmap->bitmap);
                }
                xas_store(&xas, bitmap);
        }
out:
        xas_unlock_irqrestore(&xas, flags);
        /* If xas_nomem() asks for a retry, restart the search from @min. */
        if (xas_nomem(&xas, gfp)) {
                xas.xa_index = min / IDA_BITMAP_BITS;
                bit = min % IDA_BITMAP_BITS;
                goto retry;
        }
        /* Drop the preallocated bitmap unless it is the one now in use. */
        if (bitmap != alloc)
                kfree(alloc);
        if (xas_error(&xas))
                return xas_error(&xas);
        return xas.xa_index * IDA_BITMAP_BITS + bit;
alloc:
        /* GFP_NOWAIT failed under the lock: unlock, allocate with @gfp, start over. */
        xas_unlock_irqrestore(&xas, flags);
        alloc = kzalloc(sizeof(*bitmap), gfp);
        if (!alloc)
                return -ENOMEM;
        xas_set(&xas, min / IDA_BITMAP_BITS);
        bit = min % IDA_BITMAP_BITS;
        goto retry;
nospc:
        /* No free ID in [@min, @max]; release any unused preallocation. */
        xas_unlock_irqrestore(&xas, flags);
        kfree(alloc);
        return -ENOSPC;
}
EXPORT_SYMBOL(ida_alloc_range);

/**
 * ida_free() - Release an allocated ID.
 * @ida: IDA handle.
 * @id: Previously allocated ID.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 */
void ida_free(struct ida *ida, unsigned int id)
{
        /* Locate the chunk holding @id and the bit position inside that chunk. */
        XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS);
        unsigned bit = id % IDA_BITMAP_BITS;
        struct ida_bitmap *bitmap;
        unsigned long flags;

        /* An ID with the sign bit set is silently ignored. */
        if ((int)id < 0)
                return;

        xas_lock_irqsave(&xas, flags);
        bitmap = xas_load(&xas);

        if (xa_is_value(bitmap)) {
                /* Chunk is stored inline as a value entry. */
                unsigned long v = xa_to_value(bitmap);
                /* A value entry cannot hold this bit, so the ID was never allocated. */
                if (bit >= BITS_PER_XA_VALUE)
                        goto err;
                if (!(v & (1UL << bit)))
                        goto err;
                v &= ~(1UL << bit);
                /* Last bit gone: jump into the shared slot-removal path below. */
                if (!v)
                        goto delete;
                xas_store(&xas, xa_mk_value(v));
        } else {
                /* Either no entry at all, or a full bitmap. */
                if (!bitmap || !test_bit(bit, bitmap->bitmap))
                        goto err;
                __clear_bit(bit, bitmap->bitmap);
                /* The chunk now has at least one free bit again. */
                xas_set_mark(&xas, XA_FREE_MARK);
                if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) {
                        kfree(bitmap);
delete:
                        /* Remove the now-empty entry from the XArray. */
                        xas_store(&xas, NULL);
                }
        }
        xas_unlock_irqrestore(&xas, flags);
        return;
 err:
        /* Freeing an ID that is not currently allocated: warn, change nothing. */
        xas_unlock_irqrestore(&xas, flags);
        WARN(1, "ida_free called for id=%d which is not allocated.\n", id);
}
EXPORT_SYMBOL(ida_free);

/**
 * ida_destroy() - Free all IDs.
 * @ida: IDA handle.
 *
 * Calling this function frees all IDs and releases all resources used
 * by an IDA.  When this call returns, the IDA is empty and can be reused
 * or freed.  If the IDA is already empty, there is no need to call this
 * function.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 */
void ida_destroy(struct ida *ida)
{
        XA_STATE(xas, &ida->xa, 0);
        unsigned long irqflags;
        struct ida_bitmap *entry;

        xas_lock_irqsave(&xas, irqflags);
        xas_for_each(&xas, entry, ULONG_MAX) {
                /* Value entries are not pointers; only non-value entries are kfree()d. */
                if (!xa_is_value(entry))
                        kfree(entry);
                /* Erase the slot regardless of what kind of entry it held. */
                xas_store(&xas, NULL);
        }
        xas_unlock_irqrestore(&xas, irqflags);
}
EXPORT_SYMBOL(ida_destroy);

#ifndef __KERNEL__
extern void xa_dump_index(unsigned long index, unsigned int shift);
#define IDA_CHUNK_SHIFT                ilog2(IDA_BITMAP_BITS)

/*
 * Print one entry of the IDA's tree, recursing into interior nodes.
 * @index is the chunk index of @entry; it is scaled by IDA_BITMAP_BITS
 * when printed.
 */
static void ida_dump_entry(void *entry, unsigned long index)
{
        struct ida_bitmap *leaf;
        unsigned long slot;

        if (!entry)
                return;

        /* Interior node: dump it, then walk every slot beneath it. */
        if (xa_is_node(entry)) {
                struct xa_node *node = xa_to_node(entry);
                unsigned int shift = node->shift + IDA_CHUNK_SHIFT +
                        XA_CHUNK_SHIFT;

                xa_dump_index(index * IDA_BITMAP_BITS, shift);
                xa_dump_node(node);
                for (slot = 0; slot < XA_CHUNK_SIZE; slot++)
                        ida_dump_entry(node->slots[slot],
                                        index | (slot << node->shift));
                return;
        }

        /* Inline value entry: the bits are encoded in the entry itself. */
        if (xa_is_value(entry)) {
                xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG));
                pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry);
                return;
        }

        /* Anything else is a pointer to a full bitmap; print every word. */
        leaf = entry;
        xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT);
        pr_cont("bitmap: %p data", leaf);
        for (slot = 0; slot < IDA_BITMAP_LONGS; slot++)
                pr_cont(" %lx", leaf->bitmap[slot]);
        pr_cont("\n");
}

/* Print a summary line for @ida, then dump its tree starting at the head. */
static void ida_dump(struct ida *ida)
{
        struct xarray *array = &ida->xa;

        pr_debug("ida: %p node %p free %d\n", ida, array->xa_head,
                                array->xa_flags >> ROOT_TAG_SHIFT);
        ida_dump_entry(array->xa_head, 0);
}
#endif
























    7 











    7 







    8 



    8 







    8 






    8 


































































































































































    5 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
// SPDX-License-Identifier: GPL-2.0

#include <linux/quotaops.h>
#include <linux/uuid.h>

#include "ext4.h"
#include "xattr.h"
#include "ext4_jbd2.h"

/*
 * Fill @dst from @src: zero the whole ext4_filename first, then copy over
 * the names, the crypto buffer and the two hash values.
 */
static void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
                                         const struct fscrypt_name *src)
{
        memset(dst, 0, sizeof(*dst));

        /* Name views and the buffer backing them. */
        dst->usr_fname = src->usr_fname;
        dst->disk_name = src->disk_name;
        dst->crypto_buf = src->crypto_buf;

        /* Hash values carried in the fscrypt name. */
        dst->hinfo.hash = src->hash;
        dst->hinfo.minor_hash = src->minor_hash;
}

int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
                              int lookup, struct ext4_filename *fname)
{
        struct fscrypt_name name;
        int err;

        err = fscrypt_setup_filename(dir, iname, lookup, &name);
        if (err)
                return err;

        ext4_fname_from_fscrypt_name(fname, &name);

#if IS_ENABLED(CONFIG_UNICODE)
        err = ext4_fname_setup_ci_filename(dir, iname, fname);
        if (err)
                ext4_fname_free_filename(fname);
#endif
        return err;
}

int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
                              struct ext4_filename *fname)
{
        struct fscrypt_name name;
        int err;

        err = fscrypt_prepare_lookup(dir, dentry, &name);
        if (err)
                return err;

        ext4_fname_from_fscrypt_name(fname, &name);

#if IS_ENABLED(CONFIG_UNICODE)
        err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
        if (err)
                ext4_fname_free_filename(fname);
#endif
        return err;
}

void ext4_fname_free_filename(struct ext4_filename *fname)
{
        struct fscrypt_name name;

        name.crypto_buf = fname->crypto_buf;
        fscrypt_free_filename(&name);

        fname->crypto_buf.name = NULL;
        fname->usr_fname = NULL;
        fname->disk_name.name = NULL;

#if IS_ENABLED(CONFIG_UNICODE)
        kfree(fname->cf_name.name);
        fname->cf_name.name = NULL;
#endif
}

/* Return true when all 16 bytes of @u are zero, false otherwise. */
static bool uuid_is_zero(__u8 u[16])
{
        const __u8 *cur = u;
        const __u8 *end = u + 16;

        while (cur < end) {
                if (*cur++)
                        return false;
        }
        return true;
}

/*
 * Copy the 16-byte encryption password salt from the superblock to the
 * user buffer @arg.  If the stored salt is all zero, a random one is
 * generated and written to the superblock under a journal handle first.
 *
 * Returns 0 on success, -EOPNOTSUPP if the filesystem lacks the encrypt
 * feature, -EFAULT if the copy to userspace fails, or an error from taking
 * write access / journalling the superblock update.
 */
int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg)
{
        struct super_block *sb = file_inode(filp)->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int err, err2;
        handle_t *handle;

        if (!ext4_has_feature_encrypt(sb))
                return -EOPNOTSUPP;

        /* No salt yet: generate one and persist it in the superblock. */
        if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
                err = mnt_want_write_file(filp);
                if (err)
                        return err;
                handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
                if (IS_ERR(handle)) {
                        err = PTR_ERR(handle);
                        goto pwsalt_err_exit;
                }
                err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
                                                    EXT4_JTR_NONE);
                if (err)
                        goto pwsalt_err_journal;
                /* Update salt and checksum together while the buffer is locked. */
                lock_buffer(sbi->s_sbh);
                generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
                ext4_superblock_csum_set(sb);
                unlock_buffer(sbi->s_sbh);
                err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
pwsalt_err_journal:
                /* Always stop the handle; keep the first error encountered. */
                err2 = ext4_journal_stop(handle);
                if (err2 && !err)
                        err = err2;
pwsalt_err_exit:
                /* Pairs with mnt_want_write_file() above. */
                mnt_drop_write_file(filp);
                if (err)
                        return err;
        }

        if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16))
                return -EFAULT;
        return 0;
}

/*
 * Read the encryption context xattr of @inode into @ctx (at most @len
 * bytes) and pass back whatever ext4_xattr_get() returns.
 */
static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
{
        int ret;

        ret = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
                             EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
        return ret;
}

/*
 * Store the encryption context @ctx (@len bytes) as an xattr on @inode and
 * flag the inode as encrypted.
 *
 * @fs_data is interpreted as a journal handle.  When it is non-NULL the
 * xattr is set inside that existing handle; when it is NULL this function
 * starts and stops its own handle and retries on -ENOSPC.
 *
 * Returns 0 on success or a negative errno.
 */
static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
                                                        void *fs_data)
{
        handle_t *handle = fs_data;
        int res, res2, credits, retries = 0;

        /*
         * Encrypting the root directory is not allowed because e2fsck expects
         * lost+found to exist and be unencrypted, and encrypting the root
         * directory would imply encrypting the lost+found directory as well as
         * the filename "lost+found" itself.
         */
        if (inode->i_ino == EXT4_ROOT_INO)
                return -EPERM;

        /* A DAX inode that already has data is rejected (and warned about once). */
        if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
                return -EINVAL;

        /* Inodes carrying the on-disk DAX flag are refused as well. */
        if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
                return -EOPNOTSUPP;

        res = ext4_convert_inline_data(inode);
        if (res)
                return res;

        /*
         * If a journal handle was specified, then the encryption context is
         * being set on a new inode via inheritance and is part of a larger
         * transaction to create the inode.  Otherwise the encryption context is
         * being set on an existing inode in its own transaction.  Only in the
         * latter case should the "retry on ENOSPC" logic be used.
         */

        if (handle) {
                res = ext4_xattr_set_handle(handle, inode,
                                            EXT4_XATTR_INDEX_ENCRYPTION,
                                            EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
                                            ctx, len, 0);
                if (!res) {
                        ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
                        ext4_clear_inode_state(inode,
                                        EXT4_STATE_MAY_INLINE_DATA);
                        /*
                         * Update inode->i_flags - S_ENCRYPTED will be enabled,
                         * S_DAX may be disabled
                         */
                        ext4_set_inode_flags(inode, false);
                }
                return res;
        }

        /* Stand-alone path: own transaction, with retry on -ENOSPC. */
        res = dquot_initialize(inode);
        if (res)
                return res;
retry:
        res = ext4_xattr_set_credits(inode, len, false /* is_create */,
                                     &credits);
        if (res)
                return res;

        handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
                                    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
                                    ctx, len, 0);
        if (!res) {
                ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
                /*
                 * Update inode->i_flags - S_ENCRYPTED will be enabled,
                 * S_DAX may be disabled
                 */
                ext4_set_inode_flags(inode, false);
                res = ext4_mark_inode_dirty(handle, inode);
                if (res)
                        EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
        }
        /* The handle is stopped before deciding whether to retry. */
        res2 = ext4_journal_stop(handle);

        if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
        /* Report the journal-stop error only if nothing failed earlier. */
        if (!res)
                res = res2;
        return res;
}

static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
{
        return EXT4_SB(sb)->s_dummy_enc_policy.policy;
}

/* Report whether @sb has the stable_inodes feature, as an fscrypt callback. */
static bool ext4_has_stable_inodes(struct super_block *sb)
{
        bool stable = ext4_has_feature_stable_inodes(sb);

        return stable;
}

/* fscrypt operations table for ext4: callbacks first, then flags and prefix. */
const struct fscrypt_operations ext4_cryptops = {
        /* Callbacks implemented by ext4. */
        .get_context = ext4_get_context,
        .set_context = ext4_set_context,
        .get_dummy_policy = ext4_get_dummy_policy,
        .empty_dir = ext4_empty_dir,
        .has_stable_inodes = ext4_has_stable_inodes,

        /* Flags and legacy key prefix. */
        .needs_bounce_pages = 1,
        .has_32bit_inodes = 1,
        .supports_subblock_data_units = 1,
        .legacy_key_prefix = "ext4:",
};































    1 






    1 
















    1 









    1 









    1 
    1 
    1 



    1 
    1 

    1 













    1 








    1 




    1 




































    1 















    1 















    4 




    3 
    1 
    1 













    2 








    2 
    2 
    2 




    2 

    2 


    1 

    1 




    1 










































    1 







    1 
















    1 



































































    1 



    1 


    1 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
// SPDX-License-Identifier: GPL-2.0
/*
 * High-level sync()-related operations
 */

#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/namei.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <linux/linkage.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/backing-dev.h>
#include "internal.h"

#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
                        SYNC_FILE_RANGE_WAIT_AFTER)

/*
 * Write out and wait upon all dirty data associated with this
 * superblock: filesystem data as well as the underlying block
 * device.  The caller must already hold sb->s_umount; this function
 * only warns if it is not held and does not take the lock itself.
 */
int sync_filesystem(struct super_block *sb)
{
        int err;

        /*
         * s_umount has to be held so the filesystem cannot switch between
         * read-only and read-write while it is being synced.
         */
        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        /* A read-only filesystem has nothing worth syncing. */
        if (sb_rdonly(sb))
                return 0;

        /*
         * First pass, non-waiting.  On simple filesystems
         * writeback_inodes_sb() merely dirties the buffers that hold the
         * inodes, so I/O for those buffers still has to be submitted through
         * the block device.  Doing this up front also makes the waiting pass
         * cheaper, because write_inode() methods then call
         * sync_dirty_buffer() and effectively write one block at a time.
         */
        writeback_inodes_sb(sb, WB_REASON_SYNC);
        if (sb->s_op->sync_fs) {
                err = sb->s_op->sync_fs(sb, 0);
                if (err)
                        return err;
        }
        err = sync_blockdev_nowait(sb->s_bdev);
        if (err)
                return err;

        /* Second pass, waiting for everything to complete. */
        sync_inodes_sb(sb);
        if (sb->s_op->sync_fs) {
                err = sb->s_op->sync_fs(sb, 1);
                if (err)
                        return err;
        }
        return sync_blockdev(sb->s_bdev);
}
EXPORT_SYMBOL(sync_filesystem);

/* iterate_supers() callback: sync the inodes of @sb unless it is read-only. */
static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
        if (sb_rdonly(sb))
                return;

        sync_inodes_sb(sb);
}

/*
 * iterate_supers() callback: invoke ->sync_fs() on @sb with the wait flag
 * that @arg points to.  Skipped for read-only superblocks, for those with
 * SB_I_SKIP_SYNC set, and for filesystems without a ->sync_fs method.  The
 * return value of ->sync_fs() is not examined.
 */
static void sync_fs_one_sb(struct super_block *sb, void *arg)
{
        if (sb_rdonly(sb))
                return;
        if (sb->s_iflags & SB_I_SKIP_SYNC)
                return;
        if (!sb->s_op->sync_fs)
                return;

        sb->s_op->sync_fs(sb, *(int *)arg);
}

/*
 * Sync everything. We start by waking flusher threads so that most of
 * writeback runs on all devices in parallel. Then we sync all inodes reliably
 * which effectively also waits for all flusher threads to finish doing
 * writeback. At this point all data is on disk so metadata should be stable
 * and we tell filesystems to sync their metadata via ->sync_fs() calls.
 * Finally, we writeout all block devices because some filesystems (e.g. ext2)
 * just write metadata (such as inodes or bitmaps) to block device page cache
 * and do not sync it on their own in ->sync_fs().
 */
void ksys_sync(void)
{
        int no_wait = 0;
        int do_wait = 1;

        /* Kick off writeback on all devices. */
        wakeup_flusher_threads(WB_REASON_SYNC);

        /* Sync inodes, then ->sync_fs() without and with waiting. */
        iterate_supers(sync_inodes_one_sb, NULL);
        iterate_supers(sync_fs_one_sb, &no_wait);
        iterate_supers(sync_fs_one_sb, &do_wait);

        /* Finally the block devices themselves: non-waiting, then waiting. */
        sync_bdevs(false);
        sync_bdevs(true);

        if (unlikely(laptop_mode))
                laptop_sync_completion();
}

/* sync(2) entry point: run ksys_sync() and unconditionally report success. */
SYSCALL_DEFINE0(sync)
{
        ksys_sync();
        return 0;
}

/*
 * Work item body for emergency_sync(): run two non-waiting sync passes,
 * log completion, and free the work item that was allocated for this call.
 */
static void do_sync_work(struct work_struct *work)
{
        int nowait = 0;
        int pass;

        /*
         * Two passes lower the chance that inodes or pages were skipped
         * because they happened to be locked during the first one.
         */
        for (pass = 0; pass < 2; pass++) {
                iterate_supers(sync_inodes_one_sb, &nowait);
                iterate_supers(sync_fs_one_sb, &nowait);
                sync_bdevs(false);
        }
        printk("Emergency Sync complete\n");
        kfree(work);
}

/*
 * Queue do_sync_work() on the system workqueue.  The work item is allocated
 * with GFP_ATOMIC; if that allocation fails the request is silently dropped.
 */
void emergency_sync(void)
{
        struct work_struct *item = kmalloc(sizeof(*item), GFP_ATOMIC);

        if (!item)
                return;

        INIT_WORK(item, do_sync_work);
        schedule_work(item);
}

/*
 * sync a single super
 */
SYSCALL_DEFINE1(syncfs, int, fd)
{
        struct fd f = fdget(fd);
        struct super_block *sb;
        int sync_err, wb_err;

        if (!f.file)
                return -EBADF;

        /* The superblock to sync is the one the open file lives on. */
        sb = f.file->f_path.dentry->d_sb;

        down_read(&sb->s_umount);
        sync_err = sync_filesystem(sb);
        up_read(&sb->s_umount);

        /* Always sample and advance the per-file writeback error cursor. */
        wb_err = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);

        fdput(f);

        /* An error from the sync itself takes precedence. */
        if (sync_err)
                return sync_err;
        return wb_err;
}

/**
 * vfs_fsync_range - helper to sync a range of data & metadata to disk
 * @file:                file to sync
 * @start:                offset in bytes of the beginning of data range to sync
 * @end:                offset in bytes of the end of data range (inclusive)
 * @datasync:                perform only datasync
 *
 * Write back data in range @start..@end and metadata for @file to disk.  If
 * @datasync is set only metadata needed to access modified file data is
 * written.
 */
int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
{
        struct inode *inode = file->f_mapping->host;

        /* Files without an ->fsync method cannot be synced this way. */
        if (!file->f_op->fsync)
                return -EINVAL;

        /* A full (non-data) sync also marks an I_DIRTY_TIME inode dirty first. */
        if (!datasync) {
                if (inode->i_state & I_DIRTY_TIME)
                        mark_inode_dirty_sync(inode);
        }

        return file->f_op->fsync(file, start, end, datasync);
}
EXPORT_SYMBOL(vfs_fsync_range);

/**
 * vfs_fsync - perform a fsync or fdatasync on a file
 * @file:                file to sync
 * @datasync:                only perform a fdatasync operation
 *
 * Write back data and metadata for @file to disk.  If @datasync is
 * set only metadata needed to access modified file data is written.
 */
int vfs_fsync(struct file *file, int datasync)
{
        return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
}
EXPORT_SYMBOL(vfs_fsync);

/*
 * Common body of fsync(2) and fdatasync(2): resolve @fd and sync the whole
 * file.  Returns -EBADF when @fd does not refer to an open file, otherwise
 * the result of vfs_fsync().
 */
static int do_fsync(unsigned int fd, int datasync)
{
        struct fd f = fdget(fd);
        int ret;

        if (!f.file)
                return -EBADF;

        ret = vfs_fsync(f.file, datasync);
        fdput(f);
        return ret;
}

/* fsync(2): sync @fd with datasync == 0. */
SYSCALL_DEFINE1(fsync, unsigned int, fd)
{
        return do_fsync(fd, 0);
}

/* fdatasync(2): sync @fd with datasync == 1. */
SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
{
        return do_fsync(fd, 1);
}

/*
 * Validate @flags and the byte range, then perform the requested
 * combination of wait-before, write, and wait-after on @file's mapping.
 *
 * @offset: first byte of the range.
 * @nbytes: length of the range; 0 means "to the end of the file".
 * @flags:  any combination of the SYNC_FILE_RANGE_* bits in VALID_FLAGS.
 *
 * Returns 0 on success, -EINVAL for bad flags or a bad range, -ESPIPE for
 * file types other than regular file, block device, directory or symlink,
 * or a negative error from the wait/write helpers.
 */
int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
                    unsigned int flags)
{
        int ret;
        struct address_space *mapping;
        loff_t endbyte;                        /* inclusive */
        umode_t i_mode;

        ret = -EINVAL;
        if (flags & ~VALID_FLAGS)
                goto out;

        /* Exclusive end for now; turned into an inclusive end further down. */
        endbyte = offset + nbytes;

        /* Reject negative offsets and ranges that wrap around. */
        if ((s64)offset < 0)
                goto out;
        if ((s64)endbyte < 0)
                goto out;
        if (endbyte < offset)
                goto out;

        if (sizeof(pgoff_t) == 4) {
                if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
                        /*
                         * The range starts outside a 32 bit machine's
                         * pagecache addressing capabilities.  Let it "succeed"
                         */
                        ret = 0;
                        goto out;
                }
                if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
                        /*
                         * Out to EOF
                         */
                        nbytes = 0;
                }
        }

        /* nbytes == 0 selects everything from @offset onwards. */
        if (nbytes == 0)
                endbyte = LLONG_MAX;
        else
                endbyte--;                /* inclusive */

        /* Only regular files, block devices, directories and symlinks qualify. */
        i_mode = file_inode(file)->i_mode;
        ret = -ESPIPE;
        if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
                        !S_ISLNK(i_mode))
                goto out;

        mapping = file->f_mapping;
        ret = 0;
        if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
                ret = file_fdatawait_range(file, offset, endbyte);
                if (ret < 0)
                        goto out;
        }

        if (flags & SYNC_FILE_RANGE_WRITE) {
                int sync_mode = WB_SYNC_NONE;

                /* Use WB_SYNC_ALL only when every WRITE_AND_WAIT bit is set. */
                if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
                             SYNC_FILE_RANGE_WRITE_AND_WAIT)
                        sync_mode = WB_SYNC_ALL;

                ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
                                                 sync_mode);
                if (ret < 0)
                        goto out;
        }

        if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
                ret = file_fdatawait_range(file, offset, endbyte);

out:
        return ret;
}

/*
 * ksys_sync_file_range() permits finely controlled syncing over a segment of
 * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
 * zero then ksys_sync_file_range() will operate from offset out to EOF.
 *
 * The flag bits are:
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range
 * before performing the write.
 *
 * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
 * range which are not presently under writeback. Note that this may block for
 * significant periods due to exhaustion of disk request structures.
 *
 * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
 * after performing the write.
 *
 * Useful combinations of the flag bits are:
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
 * in the range which were dirty on entry to ksys_sync_file_range() are placed
 * under writeout.  This is a start-write-for-data-integrity operation.
 *
 * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
 * are not presently under writeout.  This is an asynchronous flush-to-disk
 * operation.  Not suitable for data integrity operations.
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for
 * completion of writeout of all pages in the range.  This will be used after an
 * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
 * for that operation to complete and to return the result.
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER
 * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT):
 * a traditional sync() operation.  This is a write-for-data-integrity operation
 * which will ensure that all pages in the range which were dirty on entry to
 * ksys_sync_file_range() are written to disk.  It should be noted that disk
 * caches are not flushed by this call, so there are no guarantees here that the
 * data will be available on disk after a crash.
 *
 *
 * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
 * I/O errors or ENOSPC conditions and will return those to the caller, after
 * clearing the EIO and ENOSPC flags in the address_space.
 *
 * It should be noted that none of these operations write out the file's
 * metadata.  So unless the application is strictly performing overwrites of
 * already-instantiated disk blocks, there are no guarantees here that the data
 * will be available after a crash.
 */
int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
                         unsigned int flags)
{
        struct fd f = fdget(fd);
        int ret = -EBADF;       /* reported when @fd is not an open file */

        if (f.file)
                ret = sync_file_range(f.file, offset, nbytes, flags);

        /* fdput() is called on both paths, exactly once. */
        fdput(f);
        return ret;
}

/* sync_file_range(2): arguments are passed straight to ksys_sync_file_range(). */
SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
                                unsigned int, flags)
{
        return ksys_sync_file_range(fd, offset, nbytes, flags);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_SYNC_FILE_RANGE)
/*
 * Compat entry point for sync_file_range(2).  @offset and @nbytes each
 * arrive via compat_arg_u64_dual() and are recombined with
 * compat_arg_u64_glue() before the common helper is called.
 * NOTE(review): presumably each 64-bit value is split into two 32-bit
 * halves for the compat ABI; confirm against the macro definitions.
 */
COMPAT_SYSCALL_DEFINE6(sync_file_range, int, fd, compat_arg_u64_dual(offset),
                       compat_arg_u64_dual(nbytes), unsigned int, flags)
{
        return ksys_sync_file_range(fd, compat_arg_u64_glue(offset),
                                    compat_arg_u64_glue(nbytes), flags);
}
#endif

/* It would be nice if people remember that not all the world's an i386
   when they introduce new system calls */
/*
 * sync_file_range2(2): same operation as sync_file_range(2), but @flags is
 * the second argument instead of the last one.
 */
SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags,
                                 loff_t, offset, loff_t, nbytes)
{
        return ksys_sync_file_range(fd, offset, nbytes, flags);
}




























































































































































































































































































































































































































































    7 











































































































































































































































































































































    9 





































































































































   10 









    3 





    7 



































   10 
    9 




















































    6 





    6 













































   15 
   14 









    5 

    5 


    4 


    1 




    2 







    2 



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2005-2010 IBM Corporation
 *
 * Author:
 * Mimi Zohar <zohar@us.ibm.com>
 * Kylene Hall <kjhall@us.ibm.com>
 *
 * File: evm_main.c
 *        implements evm_inode_setxattr, evm_inode_post_setxattr,
 *        evm_inode_removexattr, evm_verifyxattr, and evm_inode_set_acl.
 */

#define pr_fmt(fmt) "EVM: "fmt

#include <linux/init.h>
#include <linux/audit.h>
#include <linux/xattr.h>
#include <linux/integrity.h>
#include <linux/evm.h>
#include <linux/magic.h>
#include <linux/posix_acl_xattr.h>
#include <linux/lsm_hooks.h>

#include <crypto/hash.h>
#include <crypto/hash_info.h>
#include <crypto/utils.h>
#include "evm.h"

/*
 * EVM initialization state.  This file tests it as a bitmask against
 * EVM_KEY_MASK, EVM_INIT_HMAC, EVM_INIT_X509, EVM_SETUP_COMPLETE and
 * EVM_ALLOW_METADATA_WRITES.
 */
int evm_initialized;

/*
 * Printable names for integrity statuses.  The audit calls in this file
 * index the table with an enum integrity_status value.
 */
static const char * const integrity_status_msg[] = {
        "pass", "pass_immutable", "fail", "fail_immutable", "no_label",
        "no_xattrs", "unknown"
};
/*
 * Bitmask; evm_init_config() ORs in EVM_ATTR_FSUUID when
 * CONFIG_EVM_ATTR_FSUUID is defined.
 */
int evm_hmac_attrs;

/*
 * Built-in table of xattr names.  Each entry's ->enabled flag is fixed at
 * build time from the matching kernel config option (security.capability
 * is always enabled).  evm_init_config() appends every entry, enabled or
 * not, to evm_config_xattrnames.
 */
static struct xattr_list evm_config_default_xattrnames[] = {
        {
         .name = XATTR_NAME_SELINUX,
         .enabled = IS_ENABLED(CONFIG_SECURITY_SELINUX)
        },
        {
         .name = XATTR_NAME_SMACK,
         .enabled = IS_ENABLED(CONFIG_SECURITY_SMACK)
        },
        {
         .name = XATTR_NAME_SMACKEXEC,
         .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS)
        },
        {
         .name = XATTR_NAME_SMACKTRANSMUTE,
         .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS)
        },
        {
         .name = XATTR_NAME_SMACKMMAP,
         .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS)
        },
        {
         .name = XATTR_NAME_APPARMOR,
         .enabled = IS_ENABLED(CONFIG_SECURITY_APPARMOR)
        },
        {
         .name = XATTR_NAME_IMA,
         .enabled = IS_ENABLED(CONFIG_IMA_APPRAISE)
        },
        {
         .name = XATTR_NAME_CAPS,
         .enabled = true
        },
};

/* List of xattr entries walked by the lookup helpers in this file. */
LIST_HEAD(evm_config_xattrnames);

/* Set to 1 by the "evm=fix" boot parameter; not written after init. */
static int evm_fixmode __ro_after_init;

/*
 * evm_set_fixmode - handler for the "evm=" kernel command-line parameter
 * @str: text that follows "evm=" on the command line
 *
 * Enables fix mode when @str begins with "fix"; any other value is reported
 * through pr_err().  Always returns 1.
 */
static int __init evm_set_fixmode(char *str)
{
        if (strncmp(str, "fix", 3) == 0)
                evm_fixmode = 1;
        else
                /* Terminate the message so the log record is complete. */
                pr_err("invalid \"%s\" mode\n", str);

        return 1;
}
__setup("evm=", evm_set_fixmode);

/*
 * evm_init_config - register the built-in xattr table and HMAC attribute flags
 *
 * Logs every default xattr name (tagging those that are not enabled) and
 * appends each entry to evm_config_xattrnames.  Then ORs EVM_ATTR_FSUUID
 * into evm_hmac_attrs when CONFIG_EVM_ATTR_FSUUID is defined and logs the
 * resulting mask.
 */
static void __init evm_init_config(void)
{
        const int count = ARRAY_SIZE(evm_config_default_xattrnames);
        int idx;

        pr_info("Initialising EVM extended attributes:\n");
        for (idx = 0; idx < count; idx++) {
                const char *suffix = "";

                if (!evm_config_default_xattrnames[idx].enabled)
                        suffix = " (disabled)";

                pr_info("%s%s\n", evm_config_default_xattrnames[idx].name,
                        suffix);
                list_add_tail(&evm_config_default_xattrnames[idx].list,
                              &evm_config_xattrnames);
        }

#ifdef CONFIG_EVM_ATTR_FSUUID
        evm_hmac_attrs |= EVM_ATTR_FSUUID;
#endif
        pr_info("HMAC attrs: 0x%x\n", evm_hmac_attrs);
}

/* True when any bit covered by EVM_KEY_MASK is set in evm_initialized. */
static bool evm_key_loaded(void)
{
        return (evm_initialized & EVM_KEY_MASK) != 0;
}

/*
 * Decide whether verification errors may safely be ignored because EVM can
 * never calculate an HMAC: the HMAC key is not loaded (EVM_INIT_HMAC clear)
 * and EVM_SETUP_COMPLETE is set, so it cannot be loaded later.  In that
 * state, allowing an operation despite invalid attrs/xattrs will not make
 * them valid.
 *
 * Returns true only when EVM_INIT_HMAC is clear and EVM_SETUP_COMPLETE is set.
 */
static bool evm_hmac_disabled(void)
{
        return !(evm_initialized & EVM_INIT_HMAC) &&
               (evm_initialized & EVM_SETUP_COMPLETE);
}

/*
 * evm_find_protected_xattrs - count the listed xattrs present on a dentry
 * @dentry: dentry whose backing inode is probed
 *
 * Probes each name on evm_config_xattrnames with a zero-length
 * __vfs_getxattr() call.  Returns the number of names found, -EOPNOTSUPP
 * when the inode lacks IOP_XATTR, or the first error other than -ENODATA.
 */
static int evm_find_protected_xattrs(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);
        struct xattr_list *entry;
        int found = 0;

        if (!(inode->i_opflags & IOP_XATTR))
                return -EOPNOTSUPP;

        list_for_each_entry_lockless(entry, &evm_config_xattrnames, list) {
                int rc = __vfs_getxattr(dentry, inode, entry->name, NULL, 0);

                /* A missing xattr is not an error, just not counted. */
                if (rc == -ENODATA)
                        continue;
                if (rc < 0)
                        return rc;

                found++;
        }

        return found;
}

/*
 * is_unsupported_hmac_fs - test the superblock's SB_I_EVM_HMAC_UNSUPPORTED flag
 * @dentry: dentry whose backing inode's superblock is inspected
 *
 * Returns 1 (after logging the filesystem type name once) when the flag is
 * set, 0 otherwise.
 */
static int is_unsupported_hmac_fs(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        if (!(inode->i_sb->s_iflags & SB_I_EVM_HMAC_UNSUPPORTED))
                return 0;

        pr_info_once("%s not supported\n", inode->i_sb->s_type->name);
        return 1;
}

/*
 * evm_verify_hmac - calculate and compare the HMAC with the EVM xattr
 * @dentry: dentry being verified
 * @xattr_name: name of an xattr whose value the caller already holds (may be
 *              NULL)
 * @xattr_value: that xattr's value (may be NULL)
 * @xattr_value_len: length of @xattr_value
 *
 * Compute the HMAC (or hash, for signatures) on the dentry's protected set
 * of extended attributes and compare it against the stored security.evm
 * xattr.
 *
 * For performance:
 * - use the previously retrieved xattr value and length to calculate the
 *   HMAC.
 * - cache the verification result in the iint, when available.
 *
 * Returns integrity status
 */
static enum integrity_status evm_verify_hmac(struct dentry *dentry,
                                             const char *xattr_name,
                                             char *xattr_value,
                                             size_t xattr_value_len)
{
        struct evm_ima_xattr_data *xattr_data = NULL;
        struct signature_v2_hdr *hdr;
        enum integrity_status evm_status = INTEGRITY_PASS;
        struct evm_digest digest;
        struct inode *inode = d_backing_inode(dentry);
        struct evm_iint_cache *iint = evm_iint_inode(inode);
        int rc, xattr_len, evm_immutable = 0;

        /* A cached passing result is returned without re-verifying. */
        if (iint && (iint->evm_status == INTEGRITY_PASS ||
                     iint->evm_status == INTEGRITY_PASS_IMMUTABLE))
                return iint->evm_status;

        /*
         * On unsupported filesystems without EVM_INIT_X509 enabled, skip
         * signature verification.
         */
        if (!(evm_initialized & EVM_INIT_X509) &&
            is_unsupported_hmac_fs(dentry))
                return INTEGRITY_UNKNOWN;

        /* if status is not PASS, try to check again - against -ENOMEM */

        /* first need to know the sig type */
        rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_EVM,
                                (char **)&xattr_data, 0, GFP_NOFS);
        if (rc <= 0) {
                /*
                 * No usable security.evm value.  Default to FAIL, then refine:
                 * -ENODATA with protected xattrs present is NOLABEL, with
                 * none present is NOXATTRS; -EOPNOTSUPP is UNKNOWN.
                 */
                evm_status = INTEGRITY_FAIL;
                if (rc == -ENODATA) {
                        rc = evm_find_protected_xattrs(dentry);
                        if (rc > 0)
                                evm_status = INTEGRITY_NOLABEL;
                        else if (rc == 0)
                                evm_status = INTEGRITY_NOXATTRS; /* new file */
                } else if (rc == -EOPNOTSUPP) {
                        evm_status = INTEGRITY_UNKNOWN;
                }
                goto out;
        }

        xattr_len = rc;

        /* check value type */
        switch (xattr_data->type) {
        case EVM_XATTR_HMAC:
                /* An HMAC xattr must be exactly one struct evm_xattr. */
                if (xattr_len != sizeof(struct evm_xattr)) {
                        evm_status = INTEGRITY_FAIL;
                        goto out;
                }

                digest.hdr.algo = HASH_ALGO_SHA1;
                rc = evm_calc_hmac(dentry, xattr_name, xattr_value,
                                   xattr_value_len, &digest, iint);
                if (rc)
                        break;
                /* Compare SHA1_DIGEST_SIZE bytes with crypto_memneq(). */
                rc = crypto_memneq(xattr_data->data, digest.digest,
                                   SHA1_DIGEST_SIZE);
                if (rc)
                        rc = -EINVAL;
                break;
        case EVM_XATTR_PORTABLE_DIGSIG:
                /* Failures of a portable signature report FAIL_IMMUTABLE. */
                evm_immutable = 1;
                fallthrough;
        case EVM_IMA_XATTR_DIGSIG:
                /* accept xattr with non-empty signature field */
                if (xattr_len <= sizeof(struct signature_v2_hdr)) {
                        evm_status = INTEGRITY_FAIL;
                        goto out;
                }

                /* The hash algorithm comes from the signature header. */
                hdr = (struct signature_v2_hdr *)xattr_data;
                digest.hdr.algo = hdr->hash_algo;
                rc = evm_calc_hash(dentry, xattr_name, xattr_value,
                                   xattr_value_len, xattr_data->type, &digest,
                                   iint);
                if (rc)
                        break;
                rc = integrity_digsig_verify(INTEGRITY_KEYRING_EVM,
                                        (const char *)xattr_data, xattr_len,
                                        digest.digest, digest.hdr.length);
                if (!rc) {
                        if (xattr_data->type == EVM_XATTR_PORTABLE_DIGSIG) {
                                if (iint)
                                        iint->flags |= EVM_IMMUTABLE_DIGSIG;
                                evm_status = INTEGRITY_PASS_IMMUTABLE;
                        } else if (!IS_RDONLY(inode) &&
                                   !(inode->i_sb->s_readonly_remount) &&
                                   !IS_IMMUTABLE(inode) &&
                                   !is_unsupported_hmac_fs(dentry)) {
                                /*
                                 * Valid non-portable signature on a writable
                                 * inode: refresh security.evm.
                                 * NOTE(review): presumably this replaces the
                                 * signature with an HMAC - confirm in
                                 * evm_update_evmxattr().
                                 */
                                evm_update_evmxattr(dentry, xattr_name,
                                                    xattr_value,
                                                    xattr_value_len);
                        }
                }
                break;
        default:
                /* Unknown xattr type. */
                rc = -EINVAL;
                break;
        }

        /* Map a non-zero rc from the switch onto an integrity status. */
        if (rc) {
                if (rc == -ENODATA)
                        evm_status = INTEGRITY_NOXATTRS;
                else if (evm_immutable)
                        evm_status = INTEGRITY_FAIL_IMMUTABLE;
                else
                        evm_status = INTEGRITY_FAIL;
        }
        /*
         * NOTE(review): on the default (unknown type) path nothing in this
         * function writes digest before it is read here; on calc failures it
         * may be only partly filled - confirm this is acceptable for a
         * debug-only message.
         */
        pr_debug("digest: (%d) [%*phN]\n", digest.hdr.length, digest.hdr.length,
                  digest.digest);
out:
        /* Cache the outcome when an iint exists; kfree(NULL) is harmless. */
        if (iint)
                iint->evm_status = evm_status;
        kfree(xattr_data);
        return evm_status;
}

/*
 * evm_protected_xattr_common - look up a name in the EVM xattr list
 * @req_xattr_name: name to look up
 * @all_xattrs: when false, entries whose ->enabled flag is clear are skipped
 *
 * An entry matches when its full name equals @req_xattr_name, or when
 * @req_xattr_name compares equal, over its own length, to the entry name
 * with the first XATTR_SECURITY_PREFIX_LEN characters skipped.
 *
 * Returns 1 on a match, 0 otherwise.
 */
static int evm_protected_xattr_common(const char *req_xattr_name,
                                      bool all_xattrs)
{
        const size_t req_len = strlen(req_xattr_name);
        struct xattr_list *entry;

        list_for_each_entry_lockless(entry, &evm_config_xattrnames, list) {
                if (!entry->enabled && !all_xattrs)
                        continue;

                /* Same length and same bytes: the full names are equal. */
                if (strcmp(req_xattr_name, entry->name) == 0)
                        return 1;

                /* Compare against the entry name with its prefix skipped. */
                if (strncmp(req_xattr_name,
                            entry->name + XATTR_SECURITY_PREFIX_LEN,
                            req_len) == 0)
                        return 1;
        }

        return 0;
}

/*
 * evm_protected_xattr - is @req_xattr_name an enabled EVM-listed xattr?
 *
 * Calls evm_protected_xattr_common() with all_xattrs == false, so list
 * entries whose ->enabled flag is clear are ignored.  Returns 1 on a match,
 * 0 otherwise.
 */
int evm_protected_xattr(const char *req_xattr_name)
{
        return evm_protected_xattr_common(req_xattr_name, false);
}

/*
 * evm_protected_xattr_if_enabled - is @req_xattr_name on the EVM xattr list?
 *
 * Calls evm_protected_xattr_common() with all_xattrs == true, so every list
 * entry is considered regardless of its ->enabled flag.  Returns 1 on a
 * match, 0 otherwise.
 */
int evm_protected_xattr_if_enabled(const char *req_xattr_name)
{
        return evm_protected_xattr_common(req_xattr_name, true);
}

/**
 * evm_read_protected_xattrs - read EVM protected xattr names, lengths, values
 * @dentry: dentry whose xattrs are read
 * @buffer: destination for the names, lengths or values; may be NULL
 * @buffer_size: size of @buffer
 * @type: 'n' for names, 'l' for lengths, 'v' for values
 * @canonical_fmt: for 'l', store lengths little endian instead of native
 *
 * Walks the EVM xattr list and, for each xattr present on @dentry, emits its
 * name (names are joined with '|', the last one keeps its NUL), its length
 * as a u32, or its value.  With a NULL @buffer nothing is copied and only
 * the size is accumulated.  Absent xattrs (-ENODATA) are skipped.
 *
 * Returns the total size on success, a negative value on error (-EINVAL for
 * an unknown @type once a present xattr is found).
 */
int evm_read_protected_xattrs(struct dentry *dentry, u8 *buffer,
                              int buffer_size, char type, bool canonical_fmt)
{
        struct xattr_list *entry;
        int written = 0;

        list_for_each_entry_lockless(entry, &evm_config_xattrnames, list) {
                int chunk;
                /* Zero-length probe: yields the value length or an error. */
                int len = __vfs_getxattr(dentry, d_backing_inode(dentry),
                                         entry->name, NULL, 0);

                if (len == -ENODATA)
                        continue;
                if (len < 0)
                        return len;

                if (type == 'n') {
                        chunk = strlen(entry->name) + 1;
                        if (buffer) {
                                /* Turn the previous name's NUL into '|'. */
                                if (written)
                                        buffer[written - 1] = '|';

                                memcpy(&buffer[written], entry->name, chunk);
                        }
                } else if (type == 'l') {
                        chunk = sizeof(u32);
                        if (buffer) {
                                if (canonical_fmt)
                                        len = (__force int)cpu_to_le32(len);

                                *(u32 *)&buffer[written] = len;
                        }
                } else if (type == 'v') {
                        /* The size counted is the one the probe reported. */
                        chunk = len;
                        if (buffer) {
                                len = __vfs_getxattr(dentry,
                                        d_backing_inode(dentry), entry->name,
                                        &buffer[written],
                                        buffer_size - written);
                                if (len < 0)
                                        return len;
                        }
                } else {
                        return -EINVAL;
                }

                written += chunk;
        }

        return written;
}

/**
 * evm_verifyxattr - verify the integrity of the requested xattr
 * @dentry: object of the verify xattr
 * @xattr_name: requested xattr
 * @xattr_value: requested xattr value
 * @xattr_value_len: requested xattr value length
 *
 * Verify the dentry's protected xattrs against the stored security.evm
 * xattr, reusing the xattr value and length the caller already retrieved.
 * INTEGRITY_UNKNOWN is returned without verifying when no EVM key is loaded
 * or @xattr_name is not an enabled protected xattr.
 *
 * Returns the xattr integrity status.
 *
 * This function requires the caller to lock the inode's i_mutex before it
 * is executed.
 */
enum integrity_status evm_verifyxattr(struct dentry *dentry,
                                      const char *xattr_name,
                                      void *xattr_value, size_t xattr_value_len)
{
        if (!evm_key_loaded())
                return INTEGRITY_UNKNOWN;

        if (!evm_protected_xattr(xattr_name))
                return INTEGRITY_UNKNOWN;

        return evm_verify_hmac(dentry, xattr_name, xattr_value,
                               xattr_value_len);
}
EXPORT_SYMBOL_GPL(evm_verifyxattr);

/*
 * evm_verify_current_integrity - verify the dentry's metadata integrity
 * @dentry: pointer to the affected dentry
 *
 * Returns the result of evm_verify_hmac() for the dentry.  INTEGRITY_PASS is
 * returned without verifying when no EVM key is loaded, when the inode is
 * not a regular file, or when 'fix' mode is active.
 */
static enum integrity_status evm_verify_current_integrity(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        if (!evm_key_loaded())
                return INTEGRITY_PASS;

        if (!S_ISREG(inode->i_mode))
                return INTEGRITY_PASS;

        if (evm_fixmode)
                return INTEGRITY_PASS;

        return evm_verify_hmac(dentry, NULL, NULL, 0);
}

/*
 * evm_xattr_change - check if passed xattr value differs from current value
 * @idmap: idmap of the mount (not used by the body; the current value is
 *         read with nop_mnt_idmap)
 * @dentry: pointer to the affected dentry
 * @xattr_name: requested xattr
 * @xattr_value: requested xattr value
 * @xattr_value_len: requested xattr value length
 *
 * Reads the current value of @xattr_name and compares it, length first and
 * then byte-wise, with the passed value.  A failure to read the current
 * value counts as a change.
 *
 * Returns 1 if passed xattr value differs from current value, 0 otherwise.
 */
static int evm_xattr_change(struct mnt_idmap *idmap,
                            struct dentry *dentry, const char *xattr_name,
                            const void *xattr_value, size_t xattr_value_len)
{
        char *current_value = NULL;
        int changed = 1;
        int len;

        len = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, xattr_name,
                                 &current_value, 0, GFP_NOFS);

        /* Only a successful read of equal length can be "unchanged". */
        if (len >= 0 && len == xattr_value_len)
                changed = !!memcmp(xattr_value, current_value, len);

        kfree(current_value);
        return changed;
}

/*
 * evm_protect_xattr - protect the EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @xattr_name: name of the xattr being set or removed
 * @xattr_value: new value (NULL on the remove path)
 * @xattr_value_len: length of @xattr_value
 *
 * Prevent security.evm from being modified or removed without the
 * necessary permissions or when the existing value is invalid.
 *
 * The posix xattr acls are 'system' prefixed, which normally would not
 * affect security.evm.  An interesting side effect of writing posix xattr
 * acls is their modifying of the i_mode, which is included in security.evm.
 * For posix xattr acls only, permit security.evm, even if it currently
 * doesn't exist, to be updated unless the EVM signature is immutable.
 *
 * Returns 0 when the operation is allowed, -EPERM otherwise.
 */
static int evm_protect_xattr(struct mnt_idmap *idmap,
                             struct dentry *dentry, const char *xattr_name,
                             const void *xattr_value, size_t xattr_value_len)
{
        enum integrity_status evm_status;

        if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) {
                /* security.evm itself: privileged, supported fs only. */
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (is_unsupported_hmac_fs(dentry))
                        return -EPERM;
        } else if (!evm_protected_xattr(xattr_name)) {
                /* Unprotected names are allowed unless they are posix acls. */
                if (!posix_xattr_acl(xattr_name))
                        return 0;
                if (is_unsupported_hmac_fs(dentry))
                        return 0;

                /* posix acl: allowed when metadata is valid or unlabelled. */
                evm_status = evm_verify_current_integrity(dentry);
                if ((evm_status == INTEGRITY_PASS) ||
                    (evm_status == INTEGRITY_NOXATTRS))
                        return 0;
                goto out;
        } else if (is_unsupported_hmac_fs(dentry))
                /* Protected xattr on a filesystem without HMAC support. */
                return 0;

        evm_status = evm_verify_current_integrity(dentry);
        if (evm_status == INTEGRITY_NOXATTRS) {
                struct evm_iint_cache *iint;

                /* Exception if the HMAC is not going to be calculated. */
                if (evm_hmac_disabled())
                        return 0;

                /* Files flagged EVM_NEW_FILE may receive their first xattrs. */
                iint = evm_iint_inode(d_backing_inode(dentry));
                if (iint && (iint->flags & EVM_NEW_FILE))
                        return 0;

                /* exception for pseudo filesystems */
                if (dentry->d_sb->s_magic == TMPFS_MAGIC
                    || dentry->d_sb->s_magic == SYSFS_MAGIC)
                        return 0;

                /*
                 * No exception applied: audit, then fall through to "out",
                 * where the NOXATTRS status is audited again and -EPERM is
                 * returned.
                 */
                integrity_audit_msg(AUDIT_INTEGRITY_METADATA,
                                    dentry->d_inode, dentry->d_name.name,
                                    "update_metadata",
                                    integrity_status_msg[evm_status],
                                    -EPERM, 0);
        }
out:
        /* Exception if the HMAC is not going to be calculated. */
        if (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL ||
            evm_status == INTEGRITY_UNKNOWN))
                return 0;

        /*
         * Writing other xattrs is safe for portable signatures, as portable
         * signatures are immutable and can never be updated.
         */
        if (evm_status == INTEGRITY_FAIL_IMMUTABLE)
                return 0;

        /* Valid immutable signature: allow only a no-op write. */
        if (evm_status == INTEGRITY_PASS_IMMUTABLE &&
            !evm_xattr_change(idmap, dentry, xattr_name, xattr_value,
                              xattr_value_len))
                return 0;

        if (evm_status != INTEGRITY_PASS &&
            evm_status != INTEGRITY_PASS_IMMUTABLE)
                integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry),
                                    dentry->d_name.name, "appraise_metadata",
                                    integrity_status_msg[evm_status],
                                    -EPERM, 0);
        /* Of the statuses that reach this point, only PASS is allowed. */
        return evm_status == INTEGRITY_PASS ? 0 : -EPERM;
}

/**
 * evm_inode_setxattr - protect the EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 * @xattr_value: pointer to the new extended attribute value
 * @xattr_value_len: length of the new extended attribute value
 * @flags: flags to pass into filesystem operations
 *
 * Before allowing the 'security.evm' protected xattr to be updated,
 * verify the existing value is valid.  As only the kernel should have
 * access to the EVM encrypted key needed to calculate the HMAC, prevent
 * userspace from writing HMAC value: a new 'security.evm' value must be
 * non-empty (-EINVAL otherwise) and of a signature type (-EPERM otherwise).
 * Writing 'security.evm' requires CAP_SYS_ADMIN privileges.
 */
static int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
                              const char *xattr_name, const void *xattr_value,
                              size_t xattr_value_len, int flags)
{
        /*
         * Policy permits modification of the protected xattrs even though
         * there's no HMAC key loaded
         */
        if (evm_initialized & EVM_ALLOW_METADATA_WRITES)
                return 0;

        if (!strcmp(xattr_name, XATTR_NAME_EVM)) {
                const struct evm_ima_xattr_data *new_evm = xattr_value;

                if (xattr_value_len == 0)
                        return -EINVAL;

                /* Only the two signature types may come from userspace. */
                switch (new_evm->type) {
                case EVM_IMA_XATTR_DIGSIG:
                case EVM_XATTR_PORTABLE_DIGSIG:
                        break;
                default:
                        return -EPERM;
                }
        }

        return evm_protect_xattr(idmap, dentry, xattr_name, xattr_value,
                                 xattr_value_len);
}

/**
 * evm_inode_removexattr - protect the EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 *
 * Removing 'security.evm' requires CAP_SYS_ADMIN privileges and that
 * the current value is valid.  The decision is delegated to
 * evm_protect_xattr() with no new value.
 */
static int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry,
                                 const char *xattr_name)
{
        /*
         * Unless policy permits modification of the protected xattrs even
         * though there's no HMAC key loaded, run the protection checks.
         */
        if (!(evm_initialized & EVM_ALLOW_METADATA_WRITES))
                return evm_protect_xattr(idmap, dentry, xattr_name, NULL, 0);

        return 0;
}

#ifdef CONFIG_FS_POSIX_ACL
/*
 * evm_inode_set_acl_change - would applying @kacl change the inode mode?
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @name: name of the posix acl (not used by the body)
 * @kacl: acl about to be set, NULL when the acl is being removed
 *
 * Returns 1 when @kacl is NULL, when posix_acl_update_mode() fails, or when
 * the mode it computes differs from the inode's current i_mode; 0 otherwise.
 */
static int evm_inode_set_acl_change(struct mnt_idmap *idmap,
                                    struct dentry *dentry, const char *name,
                                    struct posix_acl *kacl)
{
        struct inode *inode = d_backing_inode(dentry);
        umode_t new_mode;

        if (!kacl)
                return 1;

        /* new_mode is only meaningful when the call succeeds. */
        if (posix_acl_update_mode(idmap, inode, &new_mode, &kacl))
                return 1;

        return inode->i_mode != new_mode;
}
#else
/* Without CONFIG_FS_POSIX_ACL no change is ever reported. */
static inline int evm_inode_set_acl_change(struct mnt_idmap *idmap,
                                           struct dentry *dentry,
                                           const char *name,
                                           struct posix_acl *kacl)
{
        return 0;
}
#endif

/**
 * evm_inode_set_acl - protect the EVM extended attribute from posix acls
 * @idmap: idmap of the idmapped mount
 * @dentry: pointer to the affected dentry
 * @acl_name: name of the posix acl
 * @kacl: pointer to the posix acls
 *
 * Prevent modifying posix acls causing the EVM HMAC to be re-calculated
 * and 'security.evm' xattr updated, unless the existing 'security.evm' is
 * valid.
 *
 * Return: zero on success, -EPERM on failure.
 */
static int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                             const char *acl_name, struct posix_acl *kacl)
{
        enum integrity_status evm_status;

        /*
         * Policy permits modification of the protected xattrs even though
         * there's no HMAC key loaded
         */
        if (evm_initialized & EVM_ALLOW_METADATA_WRITES)
                return 0;

        evm_status = evm_verify_current_integrity(dentry);
        switch (evm_status) {
        case INTEGRITY_PASS:
        case INTEGRITY_NOXATTRS:
                return 0;
        case INTEGRITY_NOLABEL:
        case INTEGRITY_UNKNOWN:
                /* Exception if the HMAC is not going to be calculated. */
                if (evm_hmac_disabled())
                        return 0;
                break;
        case INTEGRITY_FAIL_IMMUTABLE:
                /*
                 * Writing other xattrs is safe for portable signatures, as
                 * portable signatures are immutable and can never be updated.
                 */
                return 0;
        case INTEGRITY_PASS_IMMUTABLE:
                /* Allowed only when the acl leaves the mode untouched. */
                if (!evm_inode_set_acl_change(idmap, dentry, acl_name, kacl))
                        return 0;
                /* Denied without an audit record for this status. */
                return -EPERM;
        default:
                break;
        }

        integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry),
                            dentry->d_name.name, "appraise_metadata",
                            integrity_status_msg[evm_status],
                            -EPERM, 0);
        return -EPERM;
}

/**
 * evm_inode_remove_acl - Protect the EVM extended attribute from posix acls
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @acl_name: name of the posix acl
 *
 * Prevent removing posix acls causing the EVM HMAC to be re-calculated
 * and 'security.evm' xattr updated, unless the existing 'security.evm' is
 * valid.
 *
 * Implemented as evm_inode_set_acl() with a NULL acl.
 *
 * Return: zero on success, -EPERM on failure.
 */
static int evm_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                                const char *acl_name)
{
        /* A NULL kacl stands for "no acl" in the shared set path. */
        return evm_inode_set_acl(idmap, dentry, acl_name, NULL);
}

/*
 * evm_reset_status - drop the cached EVM verification result of an inode
 * @inode: inode whose cached status is reset
 *
 * Sets the cached status to INTEGRITY_UNKNOWN when the inode has an EVM
 * iint cache; does nothing otherwise.
 */
static void evm_reset_status(struct inode *inode)
{
        struct evm_iint_cache *iint = evm_iint_inode(inode);

        if (!iint)
                return;

        iint->evm_status = INTEGRITY_UNKNOWN;
}

/**
 * evm_metadata_changed - Detect changes to the metadata
 * @inode: a file's inode
 * @metadata_inode: metadata inode
 *
 * On a stacked filesystem detect whether the metadata has changed. If this is
 * the case reset the evm_status associated with the inode that represents the
 * file.  A metadata inode for which IS_I_VERSION() is false is always treated
 * as changed.
 *
 * Return: true when a change was detected and the cached status was reset,
 * false otherwise (including when @inode has no EVM iint cache).
 */
bool evm_metadata_changed(struct inode *inode, struct inode *metadata_inode)
{
        struct evm_iint_cache *iint = evm_iint_inode(inode);

        if (!iint)
                return false;

        if (IS_I_VERSION(metadata_inode) &&
            !integrity_inode_attrs_changed(&iint->metadata_inode,
                                           metadata_inode))
                return false;

        iint->evm_status = INTEGRITY_UNKNOWN;
        return true;
}

/**
 * evm_revalidate_status - report whether EVM status re-validation is necessary
 * @xattr_name: pointer to the affected extended attribute name, or NULL
 *
 * Tell callers of evm_verifyxattr() whether the EVM status has to be
 * re-validated. Nothing needs re-validation while no EVM key is loaded.
 * With a key loaded, a NULL name always requires it; a non-NULL name
 * requires it only when it is an EVM-protected xattr, a posix acl xattr,
 * or 'security.evm' itself.
 *
 * Return: true if re-validation is necessary, false otherwise.
 */
bool evm_revalidate_status(const char *xattr_name)
{
        if (!evm_key_loaded())
                return false;

        /* evm_inode_post_setattr() passes NULL */
        if (!xattr_name)
                return true;

        return evm_protected_xattr(xattr_name) ||
               posix_xattr_acl(xattr_name) ||
               !strcmp(xattr_name, XATTR_NAME_EVM);
}

/**
 * evm_inode_post_setxattr - update 'security.evm' to reflect the changes
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 * @xattr_value: pointer to the new extended attribute value
 * @xattr_value_len: length of the new extended attribute value
 * @flags: flags to pass into filesystem operations (unused here)
 *
 * When evm_revalidate_status() says the change is relevant, reset the cached
 * EVM status and then update the HMAC stored in 'security.evm'. The HMAC
 * update is skipped when 'security.evm' itself was written, when HMAC
 * support is not initialized, or when is_unsupported_hmac_fs() reports the
 * filesystem as unsupported.
 *
 * No need to take the i_mutex lock here, as this function is called from
 * __vfs_setxattr_noperm().  The caller of which has taken the inode's
 * i_mutex lock.
 */
static void evm_inode_post_setxattr(struct dentry *dentry,
                                    const char *xattr_name,
                                    const void *xattr_value,
                                    size_t xattr_value_len,
                                    int flags)
{
        if (!evm_revalidate_status(xattr_name))
                return;

        evm_reset_status(dentry->d_inode);

        /* The checks are evaluated in this order; the first hit stops. */
        if (!strcmp(xattr_name, XATTR_NAME_EVM) ||
            !(evm_initialized & EVM_INIT_HMAC) ||
            is_unsupported_hmac_fs(dentry))
                return;

        evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len);
}

/**
 * evm_inode_post_set_acl - Update the EVM extended attribute from posix acls
 * @dentry: pointer to the affected dentry
 * @acl_name: name of the posix acl
 * @kacl: pointer to the posix acls (not used by this function)
 *
 * Update the 'security.evm' xattr with the EVM HMAC re-calculated after setting
 * posix acls. This is done through evm_inode_post_setxattr() with the acl
 * name and no value.
 */
static void evm_inode_post_set_acl(struct dentry *dentry, const char *acl_name,
                                   struct posix_acl *kacl)
{
        evm_inode_post_setxattr(dentry, acl_name, NULL, 0, 0);
}

/**
 * evm_inode_post_removexattr - update 'security.evm' after removing the xattr
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 *
 * When evm_revalidate_status() says the removal is relevant, reset the
 * cached EVM status and then update the HMAC stored in 'security.evm' to
 * reflect removal of the xattr. The HMAC update is skipped when
 * 'security.evm' itself was removed or HMAC support is not initialized.
 *
 * No need to take the i_mutex lock here, as this function is called from
 * vfs_removexattr() which takes the i_mutex.
 */
static void evm_inode_post_removexattr(struct dentry *dentry,
                                       const char *xattr_name)
{
        if (!evm_revalidate_status(xattr_name))
                return;

        evm_reset_status(dentry->d_inode);

        if (!strcmp(xattr_name, XATTR_NAME_EVM) ||
            !(evm_initialized & EVM_INIT_HMAC))
                return;

        evm_update_evmxattr(dentry, xattr_name, NULL, 0);
}

/**
 * evm_inode_post_remove_acl - Update the EVM extended attribute from posix acls
 * @idmap: idmap of the mount (not used by this function)
 * @dentry: pointer to the affected dentry
 * @acl_name: name of the posix acl
 *
 * Update the 'security.evm' xattr with the EVM HMAC re-calculated after
 * removing posix acls. The work is delegated to
 * evm_inode_post_removexattr() with the acl name.
 */
static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap,
                                             struct dentry *dentry,
                                             const char *acl_name)
{
        evm_inode_post_removexattr(dentry, acl_name);
}

/*
 * Report whether @attr would actually change the owner, group or mode of
 * the inode backing @dentry.
 *
 * Returns 1 when the uid or gid needs an update, or when ATTR_MODE is set
 * and the requested mode differs from the current one; 0 otherwise.
 */
static int evm_attr_change(struct mnt_idmap *idmap,
                           struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = d_backing_inode(dentry);

        if (i_uid_needs_update(idmap, attr, inode))
                return 1;

        if (i_gid_needs_update(idmap, attr, inode))
                return 1;

        if ((attr->ia_valid & ATTR_MODE) && attr->ia_mode != inode->i_mode)
                return 1;

        return 0;
}

/**
 * evm_inode_setattr - prevent updating an invalid EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @attr: iattr structure containing the new file attributes
 *
 * Permit update of file attributes when files have a valid EVM signature,
 * except in the case of them having an immutable portable signature.
 */
static int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                             struct iattr *attr)
{
        unsigned int ia_valid = attr->ia_valid;
        enum integrity_status evm_status;

        /* Policy permits modification of the protected attrs even though
         * there's no HMAC key loaded
         */
        if (evm_initialized & EVM_ALLOW_METADATA_WRITES)
                return 0;

        if (is_unsupported_hmac_fs(dentry))
                return 0;

        if (!(ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
                return 0;

        evm_status = evm_verify_current_integrity(dentry);
        /*
         * Writing attrs is safe for portable signatures, as portable signatures
         * are immutable and can never be updated.
         */
        if ((evm_status == INTEGRITY_PASS) ||
            (evm_status == INTEGRITY_NOXATTRS) ||
            (evm_status == INTEGRITY_FAIL_IMMUTABLE) ||
            (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL ||
             evm_status == INTEGRITY_UNKNOWN)))
                return 0;

        if (evm_status == INTEGRITY_PASS_IMMUTABLE &&
            !evm_attr_change(idmap, dentry, attr))
                return 0;

        integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry),
                            dentry->d_name.name, "appraise_metadata",
                            integrity_status_msg[evm_status], -EPERM, 0);
        return -EPERM;
}

/**
 * evm_inode_post_setattr - update 'security.evm' after modifying metadata
 * @idmap: idmap of the idmapped mount (not used by this function)
 * @dentry: pointer to the affected dentry
 * @ia_valid: for the UID and GID status
 *
 * Reset the cached EVM status and, for now, update the HMAC stored in
 * 'security.evm' to reflect mode/UID/GID changes. The HMAC update is
 * skipped when HMAC support is not initialized or the filesystem is
 * reported as unsupported by is_unsupported_hmac_fs().
 *
 * This function is called from notify_change(), which expects the caller
 * to lock the inode's i_mutex.
 */
static void evm_inode_post_setattr(struct mnt_idmap *idmap,
                                   struct dentry *dentry, int ia_valid)
{
        if (!evm_revalidate_status(NULL))
                return;

        evm_reset_status(dentry->d_inode);

        if (!(evm_initialized & EVM_INIT_HMAC) ||
            is_unsupported_hmac_fs(dentry))
                return;

        if (!(ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
                return;

        evm_update_evmxattr(dentry, NULL, NULL, 0);
}

/*
 * evm_inode_copy_up_xattr - decide whether 'security.evm' may be copied up
 * @src: dentry the xattr is read from
 * @name: name of the xattr being considered
 *
 * Only 'security.evm' is handled here. Its current value is read to learn
 * the signature type: portable signatures may be copied up, every other
 * type is discarded.
 *
 * Return: 0 to allow the copy-up, 1 to discard the xattr, -EOPNOTSUPP when
 * @name is not 'security.evm', -EPERM when the xattr cannot be read, is
 * empty, or is too short to contain the type field.
 */
static int evm_inode_copy_up_xattr(struct dentry *src, const char *name)
{
        struct evm_ima_xattr_data *xattr_data = NULL;
        int rc;

        if (strcmp(name, XATTR_NAME_EVM) != 0)
                return -EOPNOTSUPP;

        /* first need to know the sig type */
        rc = vfs_getxattr_alloc(&nop_mnt_idmap, src, XATTR_NAME_EVM,
                                (char **)&xattr_data, 0, GFP_NOFS);
        if (rc <= 0) {
                rc = -EPERM;
                goto out;
        }

        /* The buffer is allocated by now: it must be freed on this path too. */
        if (rc < offsetof(struct evm_ima_xattr_data, type) +
                 sizeof(xattr_data->type)) {
                rc = -EPERM;
                goto out;
        }

        switch (xattr_data->type) {
        case EVM_XATTR_PORTABLE_DIGSIG:
                rc = 0; /* allow copy-up */
                break;
        case EVM_XATTR_HMAC:
        case EVM_IMA_XATTR_DIGSIG:
        default:
                rc = 1; /* discard */
        }

out:
        /* xattr_data starts out NULL, and kfree(NULL) is a no-op. */
        kfree(xattr_data);
        return rc;
}

/*
 * evm_inode_init_security - initializes security.evm HMAC value
 * @inode: inode being initialized, passed to evm_init_hmac()
 * @dir: not used by this function
 * @qstr: not used by this function
 * @xattrs: terminated array of xattrs prepared for the new inode
 * @xattr_count: slot counter handed to lsm_get_xattr_slot()
 *
 * When HMAC support is initialized and at least one xattr in @xattrs is
 * EVM-protected, compute the HMAC and store it in the slot obtained for
 * EVM. On success the slot owns the allocated value.
 *
 * Returns 0 on success or when nothing had to be done, -ENOMEM on
 * allocation failure, or the negative error from evm_init_hmac().
 */
int evm_inode_init_security(struct inode *inode, struct inode *dir,
                            const struct qstr *qstr, struct xattr *xattrs,
                            int *xattr_count)
{
        struct xattr *cursor, *slot;
        struct evm_xattr *hmac;
        bool have_protected = false;
        int rc;

        if (!(evm_initialized & EVM_INIT_HMAC) || !xattrs)
                return 0;

        /*
         * security_inode_init_security() makes sure that the xattrs array is
         * contiguous, there is enough space for security.evm, and that there is
         * a terminator at the end of the array.
         *
         * The walk always runs to the terminator so that @cursor can be
         * compared with the slot handed out below.
         */
        cursor = xattrs;
        while (cursor->name) {
                if (evm_protected_xattr(cursor->name))
                        have_protected = true;
                cursor++;
        }

        /* EVM xattr not needed. */
        if (!have_protected)
                return 0;

        slot = lsm_get_xattr_slot(xattrs, xattr_count);
        /*
         * Array terminator (xattr name = NULL) must be the first non-filled
         * xattr slot.
         */
        WARN_ONCE(slot != cursor,
                  "%s: xattrs terminator is not the first non-filled slot\n",
                  __func__);

        hmac = kzalloc(sizeof(*hmac), GFP_NOFS);
        if (!hmac)
                return -ENOMEM;

        hmac->data.type = EVM_XATTR_HMAC;
        rc = evm_init_hmac(inode, xattrs, hmac->digest);
        if (rc < 0) {
                kfree(hmac);
                return rc;
        }

        slot->name = XATTR_EVM_SUFFIX;
        slot->value_len = sizeof(*hmac);
        slot->value = hmac;
        return 0;
}
EXPORT_SYMBOL_GPL(evm_inode_init_security);

/*
 * Initialize the EVM cache entry of a newly allocated inode: no flags set
 * and an unknown integrity status. Always returns 0.
 */
static int evm_inode_alloc_security(struct inode *inode)
{
        /* Called by security_inode_alloc(), it cannot be NULL. */
        struct evm_iint_cache *cache = evm_iint_inode(inode);

        cache->evm_status = INTEGRITY_UNKNOWN;
        cache->flags = 0UL;

        return 0;
}

/*
 * Clear EVM_NEW_FILE from the inode's EVM cache when a regular file opened
 * for writing is released and i_writecount reads 1.
 */
static void evm_file_release(struct file *file)
{
        struct inode *inode = file_inode(file);
        struct evm_iint_cache *cache = evm_iint_inode(inode);

        if (!S_ISREG(inode->i_mode))
                return;

        if (!(file->f_mode & FMODE_WRITE))
                return;

        if (!cache)
                return;

        if (atomic_read(&inode->i_writecount) == 1)
                cache->flags &= ~EVM_NEW_FILE;
}

/*
 * Flag the inode backing @dentry with EVM_NEW_FILE when it is a regular
 * file and has an EVM cache entry. @idmap is not used.
 */
static void evm_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);
        struct evm_iint_cache *cache = evm_iint_inode(inode);

        if (cache && S_ISREG(inode->i_mode))
                cache->flags |= EVM_NEW_FILE;
}

#ifdef CONFIG_EVM_LOAD_X509
/*
 * Load the X.509 certificate at CONFIG_EVM_X509_PATH into the EVM keyring
 * and, when that succeeds, record it by setting EVM_INIT_X509.
 */
void __init evm_load_x509(void)
{
        if (integrity_load_x509(INTEGRITY_KEYRING_EVM, CONFIG_EVM_X509_PATH))
                return;

        evm_initialized |= EVM_INIT_X509;
}
#endif

/*
 * Late EVM setup: read the configuration, create the EVM keyring and
 * register the securityfs interface.
 *
 * Returns 0 on success. On any failure the entries of
 * evm_config_xattrnames are unlinked from the list and the error is
 * returned.
 */
static int __init init_evm(void)
{
        struct list_head *entry, *next;
        int rc;

        evm_init_config();

        rc = integrity_init_keyring(INTEGRITY_KEYRING_EVM);
        if (!rc) {
                rc = evm_init_secfs();
                if (rc < 0)
                        pr_info("Error registering secfs\n");
        }

        if (rc) {
                /* Iterating an empty list is a no-op, so no emptiness test. */
                list_for_each_safe(entry, next, &evm_config_xattrnames)
                        list_del(entry);
        }

        return rc;
}

/*
 * LSM hooks implemented by EVM. Each entry binds one hook name to its
 * handler; the table is registered by init_evm_lsm().
 */
static struct security_hook_list evm_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(inode_setattr, evm_inode_setattr),
        LSM_HOOK_INIT(inode_post_setattr, evm_inode_post_setattr),
        LSM_HOOK_INIT(inode_copy_up_xattr, evm_inode_copy_up_xattr),
        LSM_HOOK_INIT(inode_setxattr, evm_inode_setxattr),
        LSM_HOOK_INIT(inode_post_setxattr, evm_inode_post_setxattr),
        LSM_HOOK_INIT(inode_set_acl, evm_inode_set_acl),
        LSM_HOOK_INIT(inode_post_set_acl, evm_inode_post_set_acl),
        LSM_HOOK_INIT(inode_remove_acl, evm_inode_remove_acl),
        LSM_HOOK_INIT(inode_post_remove_acl, evm_inode_post_remove_acl),
        LSM_HOOK_INIT(inode_removexattr, evm_inode_removexattr),
        LSM_HOOK_INIT(inode_post_removexattr, evm_inode_post_removexattr),
        LSM_HOOK_INIT(inode_init_security, evm_inode_init_security),
        LSM_HOOK_INIT(inode_alloc_security, evm_inode_alloc_security),
        LSM_HOOK_INIT(file_release, evm_file_release),
        LSM_HOOK_INIT(path_post_mknod, evm_post_path_mknod),
};

/* Identity (name and numeric id) passed along when the EVM hooks are added. */
static const struct lsm_id evm_lsmid = {
        .name = "evm",
        .id = LSM_ID_EVM,
};

/* LSM init callback: register every entry of evm_hooks. Always returns 0. */
static int __init init_evm_lsm(void)
{
        security_add_hooks(evm_hooks, ARRAY_SIZE(evm_hooks), &evm_lsmid);
        return 0;
}

/*
 * Blob requirements declared by EVM: room for one struct evm_iint_cache
 * per inode and a count of one xattr.
 * NOTE(review): the xattr count presumably reserves the 'security.evm'
 * slot filled by evm_inode_init_security() - confirm against the LSM core.
 */
struct lsm_blob_sizes evm_blob_sizes __ro_after_init = {
        .lbs_inode = sizeof(struct evm_iint_cache),
        .lbs_xattr_count = 1,
};

/*
 * LSM descriptor for EVM: hooks are registered through init_evm_lsm() and
 * the blob sizes come from evm_blob_sizes.
 * NOTE(review): LSM_ORDER_LAST presumably places EVM after the other
 * LSMs - confirm against the LSM core.
 */
DEFINE_LSM(evm) = {
        .name = "evm",
        .init = init_evm_lsm,
        .order = LSM_ORDER_LAST,
        .blobs = &evm_blob_sizes,
};

/* The remaining setup (config, keyring, securityfs) runs as a late initcall. */
late_initcall(init_evm);


















































































































































































































    3 



























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
// SPDX-License-Identifier: GPL-2.0
/*
 * Implement the default iomap interfaces
 *
 * (C) Copyright 2004 Linus Torvalds
 */
#include <linux/pci.h>
#include <linux/io.h>
#include <linux/kmsan-checks.h>

#include <linux/export.h>

/*
 * Read/write from/to an (offsettable) iomem cookie. It might be a PIO
 * access or a MMIO access, these functions don't care. The info is
 * encoded in the hardware mapping set up by the mapping functions
 * (or the cookie itself, depending on implementation and hw).
 *
 * The generic routines don't assume any hardware mappings, and just
 * encode the PIO/MMIO as part of the cookie. They coldly assume that
 * the MMIO IO mappings are not in the low address range.
 *
 * Architectures for which this is not true can't use this generic
 * implementation and should do their own copy.
 */

#ifndef HAVE_ARCH_PIO_SIZE
/*
 * We encode the physical PIO addresses (0-0xffff) into the
 * pointer by offsetting them with a constant (0x10000) and
 * assuming that all the low addresses are always PIO. That means
 * we can do some sanity checks on the low bits, and don't
 * need to just take things for granted.
 */
/* Added to a port number by ioport_map() to form the cookie. */
#define PIO_OFFSET        0x10000UL
/* Highest port ioport_map() accepts; also masks a cookie back to its port. */
#define PIO_MASK        0x0ffffUL
/* Cookies at or above this value are treated as MMIO by IO_COND(). */
#define PIO_RESERVED        0x40000UL
#endif

/*
 * Warn about an access through a cookie that is neither a PIO nor an MMIO
 * cookie. @access is the text of the attempted operation. The warning is
 * emitted for the first ten calls only.
 */
static void bad_io_access(unsigned long port, const char *access)
{
        static int count = 10;

        if (count == 0)
                return;

        count--;
        WARN(1, KERN_ERR "Bad IO access at port %#lx (%s)\n", port, access);
}

/*
 * Ugly macros are a way of life.
 *
 * IO_COND(addr, is_pio, is_mmio) dispatches on the iomap cookie @addr:
 *  - cookie >= PIO_RESERVED: run @is_mmio (which uses @addr directly);
 *  - PIO_OFFSET < cookie < PIO_RESERVED: reduce the local variable 'port'
 *    to the port number (cookie & PIO_MASK) and run @is_pio;
 *  - anything else: call bad_io_access() with the source text of @is_pio.
 *
 * @is_pio and @is_mmio are statements and may contain 'return'. Because
 * @is_pio is stringified into the warning, its spelling at the call site
 * is part of the runtime output.
 *
 * NOTE(review): the PIO test is strict, so the cookie ioport_map() builds
 * for port 0 (exactly PIO_OFFSET) ends up in the bad_io_access() branch.
 */
#define IO_COND(addr, is_pio, is_mmio) do {                        \
        unsigned long port = (unsigned long __force)addr;        \
        if (port >= PIO_RESERVED) {                                \
                is_mmio;                                        \
        } else if (port > PIO_OFFSET) {                                \
                port &= PIO_MASK;                                \
                is_pio;                                                \
        } else                                                        \
                bad_io_access(port, #is_pio );                        \
} while (0)

/*
 * Default big-endian read helpers, used unless already defined: perform
 * the regular port read and byte-swap the result.
 */
#ifndef pio_read16be
#define pio_read16be(port) swab16(inw(port))
#define pio_read32be(port) swab32(inl(port))
#endif

/* Same for MMIO: regular readw/readl/readq followed by a byte swap. */
#ifndef mmio_read16be
#define mmio_read16be(addr) swab16(readw(addr))
#define mmio_read32be(addr) swab32(readl(addr))
#define mmio_read64be(addr) swab64(readq(addr))
#endif

/*
 * Here and below, we apply __no_kmsan_checks to functions reading data from
 * hardware, to ensure that KMSAN marks their return values as initialized.
 */
/*
 * Read 8 bits through an iomap cookie: inb() for a PIO cookie, readb() for
 * an MMIO cookie. A bad cookie is reported and 0xff is returned.
 */
__no_kmsan_checks
unsigned int ioread8(const void __iomem *addr)
{
        IO_COND(addr, return inb(port), return readb(addr));
        return 0xff;
}
/*
 * Read 16 bits through an iomap cookie: inw() for a PIO cookie, readw()
 * for an MMIO cookie. A bad cookie is reported and 0xffff is returned.
 */
__no_kmsan_checks
unsigned int ioread16(const void __iomem *addr)
{
        IO_COND(addr, return inw(port), return readw(addr));
        return 0xffff;
}
/*
 * Big-endian variant of ioread16(): uses pio_read16be() or
 * mmio_read16be(). A bad cookie is reported and 0xffff is returned.
 */
__no_kmsan_checks
unsigned int ioread16be(const void __iomem *addr)
{
        IO_COND(addr, return pio_read16be(port), return mmio_read16be(addr));
        return 0xffff;
}
/*
 * Read 32 bits through an iomap cookie: inl() for a PIO cookie, readl()
 * for an MMIO cookie. A bad cookie is reported and 0xffffffff is returned.
 */
__no_kmsan_checks
unsigned int ioread32(const void __iomem *addr)
{
        IO_COND(addr, return inl(port), return readl(addr));
        return 0xffffffff;
}
/*
 * Big-endian variant of ioread32(): uses pio_read32be() or
 * mmio_read32be(). A bad cookie is reported and 0xffffffff is returned.
 */
__no_kmsan_checks
unsigned int ioread32be(const void __iomem *addr)
{
        IO_COND(addr, return pio_read32be(port), return mmio_read32be(addr));
        return 0xffffffff;
}
EXPORT_SYMBOL(ioread8);
EXPORT_SYMBOL(ioread16);
EXPORT_SYMBOL(ioread16be);
EXPORT_SYMBOL(ioread32);
EXPORT_SYMBOL(ioread32be);

#ifdef readq
/*
 * Read a 64-bit value from two consecutive 32-bit ports, accessing the
 * low half (at @port) first and the high half (at @port + 4) second.
 */
static u64 pio_read64_lo_hi(unsigned long port)
{
        u64 low = inl(port);
        u64 high = inl(port + sizeof(u32));

        return (high << 32) | low;
}

/*
 * Read a 64-bit value from two consecutive 32-bit ports, accessing the
 * high half (at @port + 4) first and the low half (at @port) second.
 */
static u64 pio_read64_hi_lo(unsigned long port)
{
        u64 high = inl(port + sizeof(u32));
        u64 low = inl(port);

        return (high << 32) | low;
}

/*
 * Big-endian 64-bit port read: the low half lives at @port + 4 and is read
 * first, the high half lives at @port and is read second.
 */
static u64 pio_read64be_lo_hi(unsigned long port)
{
        u64 low = pio_read32be(port + sizeof(u32));
        u64 high = pio_read32be(port);

        return (high << 32) | low;
}

/*
 * Big-endian 64-bit port read: the high half lives at @port and is read
 * first, the low half lives at @port + 4 and is read second.
 */
static u64 pio_read64be_hi_lo(unsigned long port)
{
        u64 high = pio_read32be(port);
        u64 low = pio_read32be(port + sizeof(u32));

        return (high << 32) | low;
}

/*
 * Read 64 bits through an iomap cookie. A PIO cookie is read as two 32-bit
 * accesses, low half first; an MMIO cookie uses a single readq(). A bad
 * cookie is reported and all-ones is returned.
 */
__no_kmsan_checks
u64 ioread64_lo_hi(const void __iomem *addr)
{
        IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr));
        return 0xffffffffffffffffULL;
}

/*
 * Read 64 bits through an iomap cookie. A PIO cookie is read as two 32-bit
 * accesses, high half first; an MMIO cookie uses a single readq(). A bad
 * cookie is reported and all-ones is returned.
 */
__no_kmsan_checks
u64 ioread64_hi_lo(const void __iomem *addr)
{
        IO_COND(addr, return pio_read64_hi_lo(port), return readq(addr));
        return 0xffffffffffffffffULL;
}

/*
 * Big-endian 64-bit read through an iomap cookie. A PIO cookie is read as
 * two 32-bit accesses, low half first; an MMIO cookie uses
 * mmio_read64be(). A bad cookie is reported and all-ones is returned.
 */
__no_kmsan_checks
u64 ioread64be_lo_hi(const void __iomem *addr)
{
        IO_COND(addr, return pio_read64be_lo_hi(port),
                return mmio_read64be(addr));
        return 0xffffffffffffffffULL;
}

/*
 * Big-endian 64-bit read through an iomap cookie. A PIO cookie is read as
 * two 32-bit accesses, high half first; an MMIO cookie uses
 * mmio_read64be(). A bad cookie is reported and all-ones is returned.
 */
__no_kmsan_checks
u64 ioread64be_hi_lo(const void __iomem *addr)
{
        IO_COND(addr, return pio_read64be_hi_lo(port),
                return mmio_read64be(addr));
        return 0xffffffffffffffffULL;
}

EXPORT_SYMBOL(ioread64_lo_hi);
EXPORT_SYMBOL(ioread64_hi_lo);
EXPORT_SYMBOL(ioread64be_lo_hi);
EXPORT_SYMBOL(ioread64be_hi_lo);

#endif /* readq */

/*
 * Default big-endian write helpers, used unless already defined: byte-swap
 * the value and perform the regular port write.
 */
#ifndef pio_write16be
#define pio_write16be(val,port) outw(swab16(val),port)
#define pio_write32be(val,port) outl(swab32(val),port)
#endif

/* Same for MMIO: byte swap followed by the regular writew/writel/writeq. */
#ifndef mmio_write16be
#define mmio_write16be(val,port) writew(swab16(val),port)
#define mmio_write32be(val,port) writel(swab32(val),port)
#define mmio_write64be(val,port) writeq(swab64(val),port)
#endif

/*
 * Write 8 bits through an iomap cookie: outb() for a PIO cookie, writeb()
 * for an MMIO cookie. A bad cookie is reported and nothing is written.
 */
void iowrite8(u8 val, void __iomem *addr)
{
        /* Make sure uninitialized memory isn't copied to devices. */
        kmsan_check_memory(&val, sizeof(val));
        IO_COND(addr, outb(val,port), writeb(val, addr));
}
/*
 * Write 16 bits through an iomap cookie: outw() for a PIO cookie, writew()
 * for an MMIO cookie. A bad cookie is reported and nothing is written.
 */
void iowrite16(u16 val, void __iomem *addr)
{
        /* Make sure uninitialized memory isn't copied to devices. */
        kmsan_check_memory(&val, sizeof(val));
        IO_COND(addr, outw(val,port), writew(val, addr));
}
/*
 * Big-endian variant of iowrite16(): uses pio_write16be() or
 * mmio_write16be(). A bad cookie is reported and nothing is written.
 */
void iowrite16be(u16 val, void __iomem *addr)
{
        /* Make sure uninitialized memory isn't copied to devices. */
        kmsan_check_memory(&val, sizeof(val));
        IO_COND(addr, pio_write16be(val,port), mmio_write16be(val, addr));
}
/*
 * Write 32 bits through an iomap cookie: outl() for a PIO cookie, writel()
 * for an MMIO cookie. A bad cookie is reported and nothing is written.
 */
void iowrite32(u32 val, void __iomem *addr)
{
        /* Make sure uninitialized memory isn't copied to devices. */
        kmsan_check_memory(&val, sizeof(val));
        IO_COND(addr, outl(val,port), writel(val, addr));
}
/*
 * Big-endian variant of iowrite32(): uses pio_write32be() or
 * mmio_write32be(). A bad cookie is reported and nothing is written.
 */
void iowrite32be(u32 val, void __iomem *addr)
{
        /* Make sure uninitialized memory isn't copied to devices. */
        kmsan_check_memory(&val, sizeof(val));
        IO_COND(addr, pio_write32be(val,port), mmio_write32be(val, addr));
}
EXPORT_SYMBOL(iowrite8);
EXPORT_SYMBOL(iowrite16);
EXPORT_SYMBOL(iowrite16be);
EXPORT_SYMBOL(iowrite32);
EXPORT_SYMBOL(iowrite32be);

#ifdef writeq
/*
 * Write a 64-bit value to two consecutive 32-bit ports: the low half goes
 * to @port first, then the high half to @port + 4.
 */
static void pio_write64_lo_hi(u64 val, unsigned long port)
{
        u32 low = (u32)val;
        u32 high = (u32)(val >> 32);

        outl(low, port);
        outl(high, port + sizeof(u32));
}

/*
 * Write a 64-bit value to two consecutive 32-bit ports: the high half goes
 * to @port + 4 first, then the low half to @port.
 */
static void pio_write64_hi_lo(u64 val, unsigned long port)
{
        u32 low = (u32)val;
        u32 high = (u32)(val >> 32);

        outl(high, port + sizeof(u32));
        outl(low, port);
}

/*
 * Big-endian 64-bit port write: the low half goes to @port + 4 first, then
 * the high half to @port.
 */
static void pio_write64be_lo_hi(u64 val, unsigned long port)
{
        u32 low = (u32)val;
        u32 high = (u32)(val >> 32);

        pio_write32be(low, port + sizeof(u32));
        pio_write32be(high, port);
}

/*
 * Big-endian 64-bit port write: the high half goes to @port first, then
 * the low half to @port + 4.
 */
static void pio_write64be_hi_lo(u64 val, unsigned long port)
{
        u32 low = (u32)val;
        u32 high = (u32)(val >> 32);

        pio_write32be(high, port);
        pio_write32be(low, port + sizeof(u32));
}

/*
 * Write 64 bits through an iomap cookie. A PIO cookie is written as two
 * 32-bit accesses, low half first; an MMIO cookie uses a single writeq().
 * A bad cookie is reported and nothing is written.
 */
void iowrite64_lo_hi(u64 val, void __iomem *addr)
{
        /* Make sure uninitialized memory isn't copied to devices. */
        kmsan_check_memory(&val, sizeof(val));
        IO_COND(addr, pio_write64_lo_hi(val, port),
                writeq(val, addr));
}

/*
 * Write 64 bits through an iomap cookie. A PIO cookie is written as two
 * 32-bit accesses, high half first; an MMIO cookie uses a single writeq().
 * A bad cookie is reported and nothing is written.
 */
void iowrite64_hi_lo(u64 val, void __iomem *addr)
{
        /* Make sure uninitialized memory isn't copied to devices. */
        kmsan_check_memory(&val, sizeof(val));
        IO_COND(addr, pio_write64_hi_lo(val, port),
                writeq(val, addr));
}

/*
 * Big-endian 64-bit write through an iomap cookie. A PIO cookie is written
 * as two 32-bit accesses, low half first; an MMIO cookie uses
 * mmio_write64be(). A bad cookie is reported and nothing is written.
 */
void iowrite64be_lo_hi(u64 val, void __iomem *addr)
{
        /* Make sure uninitialized memory isn't copied to devices. */
        kmsan_check_memory(&val, sizeof(val));
        IO_COND(addr, pio_write64be_lo_hi(val, port),
                mmio_write64be(val, addr));
}

/*
 * Big-endian 64-bit write through an iomap cookie. A PIO cookie is written
 * as two 32-bit accesses, high half first; an MMIO cookie uses
 * mmio_write64be(). A bad cookie is reported and nothing is written.
 */
void iowrite64be_hi_lo(u64 val, void __iomem *addr)
{
        /* Make sure uninitialized memory isn't copied to devices. */
        kmsan_check_memory(&val, sizeof(val));
        IO_COND(addr, pio_write64be_hi_lo(val, port),
                mmio_write64be(val, addr));
}

EXPORT_SYMBOL(iowrite64_lo_hi);
EXPORT_SYMBOL(iowrite64_hi_lo);
EXPORT_SYMBOL(iowrite64be_lo_hi);
EXPORT_SYMBOL(iowrite64be_hi_lo);

#endif /* writeq */

/*
 * These are the "repeat MMIO read/write" functions.
 * Note the "__raw" accesses, since we don't want to
 * convert to CPU byte order. We write in "IO byte
 * order" (we also don't have IO barriers).
 */
#ifndef mmio_insb
/* Read @count bytes from the single MMIO location @addr into @dst. */
static inline void mmio_insb(const void __iomem *addr, u8 *dst, int count)
{
        for (; --count >= 0; dst++)
                *dst = __raw_readb(addr);
}
/* Read @count 16-bit values from the single MMIO location @addr into @dst. */
static inline void mmio_insw(const void __iomem *addr, u16 *dst, int count)
{
        for (; --count >= 0; dst++)
                *dst = __raw_readw(addr);
}
/* Read @count 32-bit values from the single MMIO location @addr into @dst. */
static inline void mmio_insl(const void __iomem *addr, u32 *dst, int count)
{
        for (; --count >= 0; dst++)
                *dst = __raw_readl(addr);
}
#endif

#ifndef mmio_outsb
/* Write @count bytes from @src to the single MMIO location @addr. */
static inline void mmio_outsb(void __iomem *addr, const u8 *src, int count)
{
        for (; --count >= 0; src++)
                __raw_writeb(*src, addr);
}
/* Write @count 16-bit values from @src to the single MMIO location @addr. */
static inline void mmio_outsw(void __iomem *addr, const u16 *src, int count)
{
        for (; --count >= 0; src++)
                __raw_writew(*src, addr);
}
/* Write @count 32-bit values from @src to the single MMIO location @addr. */
static inline void mmio_outsl(void __iomem *addr, const u32 *src, int count)
{
        for (; --count >= 0; src++)
                __raw_writel(*src, addr);
}
#endif

/*
 * Read @count bytes from one iomap cookie into @dst: insb() for a PIO
 * cookie, mmio_insb() for an MMIO cookie. A bad cookie is reported and
 * nothing is read; @dst is unpoisoned for KMSAN in every case.
 */
void ioread8_rep(const void __iomem *addr, void *dst, unsigned long count)
{
        IO_COND(addr, insb(port,dst,count), mmio_insb(addr, dst, count));
        /* KMSAN must treat values read from devices as initialized. */
        kmsan_unpoison_memory(dst, count);
}
/*
 * Read @count 16-bit values from one iomap cookie into @dst: insw() for a
 * PIO cookie, mmio_insw() for an MMIO cookie. A bad cookie is reported and
 * nothing is read; @count * 2 bytes of @dst are unpoisoned in every case.
 */
void ioread16_rep(const void __iomem *addr, void *dst, unsigned long count)
{
        IO_COND(addr, insw(port,dst,count), mmio_insw(addr, dst, count));
        /* KMSAN must treat values read from devices as initialized. */
        kmsan_unpoison_memory(dst, count * 2);
}
/*
 * Read @count 32-bit values from one iomap cookie into @dst: insl() for a
 * PIO cookie, mmio_insl() for an MMIO cookie. A bad cookie is reported and
 * nothing is read; @count * 4 bytes of @dst are unpoisoned in every case.
 */
void ioread32_rep(const void __iomem *addr, void *dst, unsigned long count)
{
        IO_COND(addr, insl(port,dst,count), mmio_insl(addr, dst, count));
        /* KMSAN must treat values read from devices as initialized. */
        kmsan_unpoison_memory(dst, count * 4);
}
EXPORT_SYMBOL(ioread8_rep);
EXPORT_SYMBOL(ioread16_rep);
EXPORT_SYMBOL(ioread32_rep);

/*
 * Write @count bytes from @src to one iomap cookie: outsb() for a PIO
 * cookie, mmio_outsb() for an MMIO cookie. A bad cookie is reported and
 * nothing is written.
 */
void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count)
{
        /* Make sure uninitialized memory isn't copied to devices. */
        kmsan_check_memory(src, count);
        IO_COND(addr, outsb(port, src, count), mmio_outsb(addr, src, count));
}
/*
 * Write @count 16-bit values from @src to one iomap cookie: outsw() for a
 * PIO cookie, mmio_outsw() for an MMIO cookie. A bad cookie is reported
 * and nothing is written.
 */
void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count)
{
        /* Make sure uninitialized memory isn't copied to devices. */
        kmsan_check_memory(src, count * 2);
        IO_COND(addr, outsw(port, src, count), mmio_outsw(addr, src, count));
}
/*
 * Write @count 32-bit values from @src to one iomap cookie: outsl() for a
 * PIO cookie, mmio_outsl() for an MMIO cookie. A bad cookie is reported
 * and nothing is written.
 */
void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count)
{
        /* Make sure uninitialized memory isn't copied to devices. */
        kmsan_check_memory(src, count * 4);
        IO_COND(addr, outsl(port, src,count), mmio_outsl(addr, src, count));
}
EXPORT_SYMBOL(iowrite8_rep);
EXPORT_SYMBOL(iowrite16_rep);
EXPORT_SYMBOL(iowrite32_rep);

#ifdef CONFIG_HAS_IOPORT_MAP
/*
 * Create a virtual mapping cookie for an IO port range.
 *
 * The cookie is simply @port + PIO_OFFSET. Ports above PIO_MASK cannot be
 * encoded and yield NULL. @nr is not used.
 */
void __iomem *ioport_map(unsigned long port, unsigned int nr)
{
        unsigned long cookie;

        if (port > PIO_MASK)
                return NULL;

        cookie = port + PIO_OFFSET;
        return (void __iomem *)cookie;
}

/*
 * Counterpart of ioport_map(). The cookie is only an encoded port number,
 * so there is nothing to release.
 */
void ioport_unmap(void __iomem *addr)
{
        /* Nothing to do */
}
EXPORT_SYMBOL(ioport_map);
EXPORT_SYMBOL(ioport_unmap);
#endif /* CONFIG_HAS_IOPORT_MAP */

#ifdef CONFIG_PCI
/*
 * Hide the details if this is a MMIO or PIO address space and just do what
 * you expect in the correct way: an MMIO cookie is passed to iounmap(), a
 * PIO cookie needs no action, and a bad cookie is reported through
 * bad_io_access(). @dev is not used.
 */
void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
{
        IO_COND(addr, /* nothing */, iounmap(addr));
}
EXPORT_SYMBOL(pci_iounmap);
#endif /* CONFIG_PCI */

















































































































































































































    2 
























    2 
    2 

    2 





























    2 



















































































































































































































    2 



    1 

    1 






















































































































































































































































































































































































































































































































































































































    2 
    2 
    2 

    2 




















    1 








    1 










    1 












    2 











    1 










































































    2 













    2 

    2 








    2 





























    2 









    2 





    2 


















    2 










    2 





    1 














































































































    2 






















































































    2 










    2 


























    2 


    2 





































    2 

    1 



    1 





























































































































































































































































































































































































































































































































































































































































    2 































    2 

    2 











































    2 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
 *
 *  High-resolution kernel timers
 *
 *  In contrast to the low-resolution timeout API, aka timer wheel,
 *  hrtimers provide finer resolution and accuracy depending on system
 *  configuration and capabilities.
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  Credits:
 *        Based on the original timer wheel code
 *
 *        Help, testing, suggestions, bugfixes, improvements were
 *        provided by:
 *
 *        George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
 *        et al.
 */

#include <linux/cpu.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/interrupt.h>
#include <linux/tick.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/timer.h>
#include <linux/freezer.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <trace/events/timer.h>

#include "tick-internal.h"

/*
 * Masks for selecting the soft and hard context timers from
 * cpu_base->active
 */
/* Bit position of the first soft base; all bits below it belong to hard bases. */
#define MASK_SHIFT                (HRTIMER_BASE_MONOTONIC_SOFT)
/* One bit per base with an index below MASK_SHIFT. */
#define HRTIMER_ACTIVE_HARD        ((1U << MASK_SHIFT) - 1)
/* The hard mask shifted up by MASK_SHIFT, i.e. the bits of the soft bases. */
#define HRTIMER_ACTIVE_SOFT        (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
/* Union of the hard and the soft mask. */
#define HRTIMER_ACTIVE_ALL        (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)

/*
 * The timer bases:
 *
 * There are more clockids than hrtimer bases. Thus, we index
 * into the timer bases by the hrtimer_base_type enum. When trying
 * to reach a base using a clockid, hrtimer_clockid_to_base()
 * is used to convert from clockid to the proper hrtimer_base_type.
 */
/*
 * Per-CPU hrtimer state. Each clock base records its own index, the
 * clockid it serves and the function used to read that clock.
 */
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
        .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
        .clock_base =
        {
                /* Hard bases: one per supported clock. */
                {
                        .index = HRTIMER_BASE_MONOTONIC,
                        .clockid = CLOCK_MONOTONIC,
                        .get_time = &ktime_get,
                },
                {
                        .index = HRTIMER_BASE_REALTIME,
                        .clockid = CLOCK_REALTIME,
                        .get_time = &ktime_get_real,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME,
                        .clockid = CLOCK_BOOTTIME,
                        .get_time = &ktime_get_boottime,
                },
                {
                        .index = HRTIMER_BASE_TAI,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
                },
                /*
                 * Soft bases: the same four clocks and accessors again,
                 * stored under the corresponding *_SOFT indices.
                 */
                {
                        .index = HRTIMER_BASE_MONOTONIC_SOFT,
                        .clockid = CLOCK_MONOTONIC,
                        .get_time = &ktime_get,
                },
                {
                        .index = HRTIMER_BASE_REALTIME_SOFT,
                        .clockid = CLOCK_REALTIME,
                        .get_time = &ktime_get_real,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME_SOFT,
                        .clockid = CLOCK_BOOTTIME,
                        .get_time = &ktime_get_boottime,
                },
                {
                        .index = HRTIMER_BASE_TAI_SOFT,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
                },
        }
};

/*
 * Lookup table from clockid to hrtimer base index. Every slot is first
 * filled with HRTIMER_MAX_CLOCK_BASES; the four supported clockids then
 * override their slot with the matching (hard) base index.
 */
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
        /* Make sure we catch unsupported clockids */
        [0 ... MAX_CLOCKS - 1]        = HRTIMER_MAX_CLOCK_BASES,

        [CLOCK_REALTIME]        = HRTIMER_BASE_REALTIME,
        [CLOCK_MONOTONIC]        = HRTIMER_BASE_MONOTONIC,
        [CLOCK_BOOTTIME]        = HRTIMER_BASE_BOOTTIME,
        [CLOCK_TAI]                = HRTIMER_BASE_TAI,
};

/*
 * Functions and macros which are different for UP/SMP systems are kept in a
 * single place
 */
#ifdef CONFIG_SMP

/*
 * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
 * such that hrtimer_callback_running() can unconditionally dereference
 * timer->base->cpu_base
 */
/*
 * Placeholder cpu_base. Only clock_base[0] is initialized explicitly: it
 * points back at this cpu_base and gets a zeroed sequence counter that is
 * associated with this cpu_base's lock.
 */
static struct hrtimer_cpu_base migration_cpu_base = {
        .clock_base = { {
                .cpu_base = &migration_cpu_base,
                .seq      = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
                                                     &migration_cpu_base.lock),
        }, },
};

/* Shorthand for the single initialized clock base of migration_cpu_base. */
#define migration_base        migration_cpu_base.clock_base[0]

/* Tell whether @base is the migration placeholder base. */
static inline bool is_migration_base(struct hrtimer_clock_base *base)
{
        const struct hrtimer_clock_base *placeholder = &migration_base;

        return placeholder == base;
}

/*
 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
 * means that all timers which are tied to this base via timer->base are
 * locked, and the base itself is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found on the lists/queues.
 *
 * When the timer's base is locked, and the timer removed from list, it is
 * possible to set timer->base = &migration_base and drop the lock: the timer
 * remains locked.
 */
/*
 * Lock the cpu_base that @timer currently belongs to.
 *
 * @timer: timer whose base should be locked
 * @flags: storage for the saved interrupt state
 *
 * Returns the timer's clock base with base->cpu_base->lock held and
 * interrupts disabled. Retries until the base sampled before taking the
 * lock is still the timer's base once the lock is held.
 */
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
                                             unsigned long *flags)
        __acquires(&timer->base->cpu_base->lock)
{
        struct hrtimer_clock_base *base;

        for (;;) {
                base = READ_ONCE(timer->base);
                /* While the timer sits on the migration base, just spin. */
                if (likely(!is_migration_base(base))) {
                        raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
                        /* Recheck under the lock: the base may have changed. */
                        if (likely(base == timer->base))
                                return base;
                        /* The timer has migrated to another CPU: */
                        raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
                }
                cpu_relax();
        }
}

/*
 * We do not migrate the timer when it is expiring before the next
 * event on the target cpu. When high resolution is enabled, we cannot
 * reprogram the target cpu hardware and we would cause it to fire
 * late. To keep it simple, we handle the high resolution enabled and
 * disabled cases the same way.
 *
 * Called with cpu_base->lock of target cpu held.
 */
/*
 * Returns non-zero when @timer, with @new_base's offset subtracted from
 * its expiry, would expire before the target cpu_base's expires_next.
 */
static int
hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
{
        ktime_t target_next = new_base->cpu_base->expires_next;
        ktime_t timer_expiry;

        timer_expiry = ktime_sub(hrtimer_get_expires(timer), new_base->offset);

        return timer_expiry < target_next;
}

/*
 * Pick the cpu_base a timer should be queued on. @base is returned for
 * pinned timers, when timer migration is disabled, or when the kernel is
 * built without SMP + NO_HZ_COMMON; otherwise the base of the CPU chosen
 * by get_nohz_timer_target() is returned.
 */
static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
                                         int pinned)
{
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
        if (pinned)
                return base;

        if (static_branch_likely(&timers_migration_enabled))
                return &per_cpu(hrtimer_bases, get_nohz_timer_target());
#endif
        return base;
}

/*
 * We switch the timer base to a power-optimized selected CPU target,
 * if:
 *        - NO_HZ_COMMON is enabled
 *        - timer migration is enabled
 *        - the timer callback is not running
 *        - the timer is not the first expiring timer on the new target
 *
 * If one of the above requirements is not fulfilled we move the timer
 * to the current CPU or leave it on the previously assigned CPU if
 * the timer callback is currently running.
 */
/*
 * @timer:  timer that is about to be (re)queued
 * @base:   clock base the timer currently belongs to; its cpu_base lock
 *          is expected to be held by the caller
 * @pinned: non-zero to keep the timer on the current CPU
 *
 * Returns the clock base the timer ends up on. The cpu_base lock of the
 * returned base is held on return.
 */
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
                    int pinned)
{
        struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
        struct hrtimer_clock_base *new_base;
        int basenum = base->index;

        this_cpu_base = this_cpu_ptr(&hrtimer_bases);
        new_cpu_base = get_target_base(this_cpu_base, pinned);
again:
        /* Same clock type (index) as the old base, on the candidate CPU. */
        new_base = &new_cpu_base->clock_base[basenum];

        if (base != new_base) {
                /*
                 * We are trying to move timer to new_base.
                 * However we can't change timer's base while it is running,
                 * so we keep it on the same CPU. No hassle vs. reprogramming
                 * the event source in the high resolution case. The softirq
                 * code will take care of this when the timer function has
                 * completed. There is no conflict as we hold the lock until
                 * the timer is enqueued.
                 */
                if (unlikely(hrtimer_callback_running(timer)))
                        return base;

                /* See the comment in lock_hrtimer_base() */
                WRITE_ONCE(timer->base, &migration_base);
                raw_spin_unlock(&base->cpu_base->lock);
                raw_spin_lock(&new_base->cpu_base->lock);

                /*
                 * The remote target was rejected by hrtimer_check_target():
                 * undo the lock switch, restore the old base and retry with
                 * the current CPU as target.
                 */
                if (new_cpu_base != this_cpu_base &&
                    hrtimer_check_target(timer, new_base)) {
                        raw_spin_unlock(&new_base->cpu_base->lock);
                        raw_spin_lock(&base->cpu_base->lock);
                        new_cpu_base = this_cpu_base;
                        WRITE_ONCE(timer->base, base);
                        goto again;
                }
                WRITE_ONCE(timer->base, new_base);
        } else {
                /*
                 * The timer is already on the candidate base, so no lock
                 * switch is needed; only fall back to the current CPU if
                 * the remote target is rejected.
                 */
                if (new_cpu_base != this_cpu_base &&
                    hrtimer_check_target(timer, new_base)) {
                        new_cpu_base = this_cpu_base;
                        goto again;
                }
        }
        return new_base;
}

#else /* CONFIG_SMP */

/* !CONFIG_SMP variant: no migration base is defined here, so never true. */
static inline bool is_migration_base(struct hrtimer_clock_base *base)
{
        return false;
}

/*
 * !CONFIG_SMP variant: take the cpu_base lock of @timer's base with
 * interrupts disabled (state saved in @flags) and return that base.
 */
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
        __acquires(&timer->base->cpu_base->lock)
{
        struct hrtimer_clock_base *locked = timer->base;

        raw_spin_lock_irqsave(&locked->cpu_base->lock, *flags);
        return locked;
}

/* !CONFIG_SMP variant: the timer always stays on the base passed in. */
# define switch_hrtimer_base(t, b, p)        (b)

#endif        /* !CONFIG_SMP */

/*
 * Functions for the union type storage format of ktime_t which are
 * too large for inlining:
 */
#if BITS_PER_LONG < 64
/*
 * Divide a ktime value by a nanosecond value
 */
/*
 * @kt:  dividend
 * @div: divisor in nanoseconds
 *
 * The division is done on the magnitude of @kt and the sign is restored
 * afterwards. A divisor that does not fit in 32 bits is halved until it
 * does, and the dividend is halved the same number of times.
 */
s64 __ktime_divns(const ktime_t kt, s64 div)
{
        const s64 ns = ktime_to_ns(kt);
        const bool negative = ns < 0;
        u64 magnitude = negative ? -ns : ns;

        /* Scale divisor and dividend together until the divisor is < 2^32. */
        for (; div >> 32; div >>= 1)
                magnitude >>= 1;

        do_div(magnitude, (u32) div);

        if (negative)
                return -magnitude;

        return magnitude;
}
EXPORT_SYMBOL_GPL(__ktime_divns);
#endif /* BITS_PER_LONG < 64 */

/*
 * Add two ktime values and do a safety check for overflow:
 */
/*
 * Returns @lhs + @rhs, or ktime_set(KTIME_SEC_MAX, 0) when the raw sum is
 * negative or smaller than either operand.
 */
ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
{
        const ktime_t sum = ktime_add_unsafe(lhs, rhs);

        if (sum >= 0 && sum >= lhs && sum >= rhs)
                return sum;

        /*
         * The sum is out of range. Saturate at KTIME_SEC_MAX, the largest
         * timeout that can be handed to user space in a timespec.
         */
        return ktime_set(KTIME_SEC_MAX, 0);
}

EXPORT_SYMBOL_GPL(ktime_add_safe);

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

static const struct debug_obj_descr hrtimer_debug_descr;

/* debugobjects hint: report the callback function of the hrtimer at @addr. */
static void *hrtimer_debug_hint(void *addr)
{
        struct hrtimer *timer = addr;

        return timer->function;
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
/*
 * Returns true when a fixup was applied: an ACTIVE timer is cancelled and
 * then initialized in debugobjects. Any other state is left untouched.
 */
static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
{
        struct hrtimer *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        hrtimer_cancel(timer);
        debug_object_init(timer, &hrtimer_debug_descr);

        return true;
}

/*
 * fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 */
/*
 * No fixup is ever applied here, so this always returns false. Activating
 * an already ACTIVE timer additionally triggers a warning.
 */
static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
{
        if (state == ODEBUG_STATE_ACTIVE)
                WARN_ON(1);

        return false;
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
/*
 * Returns true when a fixup was applied: an ACTIVE timer is cancelled and
 * then released from debugobjects. Any other state is left untouched.
 */
static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
{
        struct hrtimer *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        hrtimer_cancel(timer);
        debug_object_free(timer, &hrtimer_debug_descr);

        return true;
}

/*
 * debugobjects descriptor for hrtimers: the object name, the hint
 * callback and the fixup callbacks defined above.
 */
static const struct debug_obj_descr hrtimer_debug_descr = {
        .name                = "hrtimer",
        .debug_hint        = hrtimer_debug_hint,
        .fixup_init        = hrtimer_fixup_init,
        .fixup_activate        = hrtimer_fixup_activate,
        .fixup_free        = hrtimer_fixup_free,
};

/* Report initialization of @timer to debugobjects. */
static inline void debug_hrtimer_init(struct hrtimer *timer)
{
        debug_object_init(timer, &hrtimer_debug_descr);
}

/* Report activation of @timer to debugobjects. @mode is not used here. */
static inline void debug_hrtimer_activate(struct hrtimer *timer,
                                          enum hrtimer_mode mode)
{
        debug_object_activate(timer, &hrtimer_debug_descr);
}

/* Report deactivation of @timer to debugobjects. */
static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
{
        debug_object_deactivate(timer, &hrtimer_debug_descr);
}

static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
                           enum hrtimer_mode mode);

/*
 * Initialize an hrtimer that lives on the stack: register it with
 * debugobjects as an on-stack object, then run the regular
 * __hrtimer_init() with the given clock and mode.
 */
void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
                           enum hrtimer_mode mode)
{
        debug_object_init_on_stack(timer, &hrtimer_debug_descr);
        __hrtimer_init(timer, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);

static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
                                   clockid_t clock_id, enum hrtimer_mode mode);

/*
 * Initialize an on-stack hrtimer_sleeper: register its embedded timer
 * with debugobjects as an on-stack object, then run
 * __hrtimer_init_sleeper() with the given clock and mode.
 */
void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
                                   clockid_t clock_id, enum hrtimer_mode mode)
{
        debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
        __hrtimer_init_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);

/* Drop the debugobjects tracking of @timer. */
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
        debug_object_free(timer, &hrtimer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);

#else

/* !CONFIG_DEBUG_OBJECTS_TIMERS: the debugobjects hooks are empty stubs. */
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
static inline void debug_hrtimer_activate(struct hrtimer *timer,
                                          enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
#endif

/* Timer initialization hook: debugobjects first, then the init tracepoint. */
static inline void
debug_init(struct hrtimer *timer, clockid_t clockid,
           enum hrtimer_mode mode)
{
        debug_hrtimer_init(timer);
        trace_hrtimer_init(timer, clockid, mode);
}

/* Timer activation hook: debugobjects first, then the start tracepoint. */
static inline void debug_activate(struct hrtimer *timer,
                                  enum hrtimer_mode mode)
{
        debug_hrtimer_activate(timer, mode);
        trace_hrtimer_start(timer, mode);
}

/* Timer deactivation hook: debugobjects first, then the cancel tracepoint. */
static inline void debug_deactivate(struct hrtimer *timer)
{
        debug_hrtimer_deactivate(timer);
        trace_hrtimer_cancel(timer);
}

/*
 * Consume the lowest set bit of *@active and return the clock base of
 * @cpu_base with that index. Returns NULL once *@active is empty.
 */
static struct hrtimer_clock_base *
__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
{
        unsigned int pending = *active;
        unsigned int lowest;

        if (pending == 0)
                return NULL;

        lowest = __ffs(pending);
        *active = pending & ~(1U << lowest);

        return &cpu_base->clock_base[lowest];
}

/*
 * Iterate over the clock bases of @cpu_base whose bits are set in @active,
 * lowest index first. @active is modified: it is zero when the loop ends.
 */
#define for_each_active_base(base, cpu_base, active)        \
        while ((base = __next_base((cpu_base), &(active))))

/*
 * Find the earliest expiry among the first timers of the selected bases.
 *
 * @cpu_base:     cpu_base whose clock bases are scanned
 * @exclude:      timer to skip, or NULL; when set, cpu_base is not updated
 * @active:       bitmask of the clock bases to look at
 * @expires_next: starting value; only earlier expiries replace it
 *
 * Expiries are compared after subtracting the base's offset. When @exclude
 * is NULL, the timer providing a new minimum is stored in
 * cpu_base->softirq_next_timer or cpu_base->next_timer, depending on
 * timer->is_soft. Returns the minimum found, clamped to be non-negative.
 */
static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
                                         const struct hrtimer *exclude,
                                         unsigned int active,
                                         ktime_t expires_next)
{
        struct hrtimer_clock_base *base;
        ktime_t expires;

        for_each_active_base(base, cpu_base, active) {
                struct timerqueue_node *next;
                struct hrtimer *timer;

                /* Head of this base's timer queue. */
                next = timerqueue_getnext(&base->active);
                timer = container_of(next, struct hrtimer, node);
                if (timer == exclude) {
                        /* Get to the next timer in the queue. */
                        next = timerqueue_iterate_next(next);
                        if (!next)
                                continue;

                        timer = container_of(next, struct hrtimer, node);
                }
                expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
                if (expires < expires_next) {
                        expires_next = expires;

                        /* Skip cpu_base update if a timer is being excluded. */
                        if (exclude)
                                continue;

                        if (timer->is_soft)
                                cpu_base->softirq_next_timer = timer;
                        else
                                cpu_base->next_timer = timer;
                }
        }
        /*
         * clock_was_set() might have changed base->offset of any of
         * the clock bases so the result might be negative. Fix it up
         * to prevent a false positive in clockevents_program_event().
         */
        if (expires_next < 0)
                expires_next = 0;
        return expires_next;
}

/*
 * Recomputes cpu_base::*next_timer and returns the earliest expires_next
 * but does not set cpu_base::*expires_next, that is done by
 * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
 * cpu_base::*expires_next right away, reprogramming logic would no longer
 * work.
 *
 * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
 * those timers will get run whenever the softirq gets handled, at the end of
 * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
 *
 * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
 * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
 * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
 *
 * @active_mask must be one of:
 *  - HRTIMER_ACTIVE_ALL,
 *  - HRTIMER_ACTIVE_SOFT, or
 *  - HRTIMER_ACTIVE_HARD.
 */
/*
 * Scan the soft bases (unless the softirq is already activated) and/or the
 * hard bases selected by @active_mask and return the earliest expiry, or
 * KTIME_MAX when nothing was scanned or found. The soft scan resets and
 * recomputes cpu_base->softirq_next_timer; the hard scan seeds
 * cpu_base->next_timer with the soft result before looking for an earlier
 * hard timer.
 */
static ktime_t
__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
        struct hrtimer *soft_first = NULL;
        ktime_t earliest = KTIME_MAX;
        unsigned int bases;

        if ((active_mask & HRTIMER_ACTIVE_SOFT) && !cpu_base->softirq_activated) {
                bases = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
                cpu_base->softirq_next_timer = NULL;
                earliest = __hrtimer_next_event_base(cpu_base, NULL, bases,
                                                     earliest);
                soft_first = cpu_base->softirq_next_timer;
        }

        if (!(active_mask & HRTIMER_ACTIVE_HARD))
                return earliest;

        bases = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
        cpu_base->next_timer = soft_first;

        return __hrtimer_next_event_base(cpu_base, NULL, bases, earliest);
}

/*
 * Recompute the next soft and hard expiry of @cpu_base and return the
 * earlier of the two. Updates cpu_base->softirq_expires_next (unless the
 * softirq is already activated) and leaves cpu_base->next_timer pointing
 * at the timer that expires first.
 */
static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
{
        ktime_t soft_next = KTIME_MAX;
        ktime_t hard_next;

        /*
         * With the soft interrupt already activated the soft bases are
         * ignored: the raised softirq will handle them. Otherwise refresh
         * the soft expiry time, which clock_settime() might have affected.
         */
        if (!cpu_base->softirq_activated) {
                soft_next = __hrtimer_get_next_event(cpu_base,
                                                     HRTIMER_ACTIVE_SOFT);
                cpu_base->softirq_expires_next = soft_next;
        }

        hard_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
        if (hard_next <= soft_next)
                return hard_next;

        /*
         * A softirq timer expires first: make it the next timer and let
         * the caller program the hardware with the soft expiry time.
         */
        cpu_base->next_timer = cpu_base->softirq_next_timer;

        return soft_next;
}

/*
 * Refresh the REALTIME, BOOTTIME and TAI offsets of the hard bases via
 * ktime_get_update_offsets_now(), copy them to the matching soft bases
 * and return the time value that call reported.
 */
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
        struct hrtimer_clock_base *cb = base->clock_base;
        ktime_t now;

        now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
                                           &cb[HRTIMER_BASE_REALTIME].offset,
                                           &cb[HRTIMER_BASE_BOOTTIME].offset,
                                           &cb[HRTIMER_BASE_TAI].offset);

        /* Keep the soft bases in sync with their hard counterparts. */
        cb[HRTIMER_BASE_REALTIME_SOFT].offset = cb[HRTIMER_BASE_REALTIME].offset;
        cb[HRTIMER_BASE_BOOTTIME_SOFT].offset = cb[HRTIMER_BASE_BOOTTIME].offset;
        cb[HRTIMER_BASE_TAI_SOFT].offset = cb[HRTIMER_BASE_TAI].offset;

        return now;
}

/*
 * Is the high resolution mode active ?
 */
/*
 * Returns @cpu_base's hres_active flag, or 0 when the kernel is built
 * without CONFIG_HIGH_RES_TIMERS.
 */
static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
        if (!IS_ENABLED(CONFIG_HIGH_RES_TIMERS))
                return 0;

        return cpu_base->hres_active;
}

/*
 * Record @expires_next in @cpu_base and, if appropriate, program the
 * clock event device with it. @next_timer is not used in this function.
 */
static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
                                struct hrtimer *next_timer,
                                ktime_t expires_next)
{
        cpu_base->expires_next = expires_next;

        /*
         * The hardware is only touched when hres is active and no hang
         * was detected in the last timer interrupt.
         *
         * Without hres the hardware does not have to be reprogrammed yet.
         *
         * After a detected hang the hang delay stays active in the
         * hardware so the system can make progress. This also avoids the
         * following scenario:
         * T1 expires 50ms from now
         * T2 expires 5s from now
         *
         * T1 is removed, this code runs and would reprogram the hardware
         * to 5s from now. Any later hrtimer_start would not reprogram the
         * hardware because hang_detected is set, so all timers would be
         * blocked until the T2 event fires.
         */
        if (hrtimer_hres_active(cpu_base) && !cpu_base->hang_detected)
                tick_program_event(expires_next, 1);
}

/*
 * Reevaluate both queues for the next event and reprogram the event
 * source with the result. With @skip_equal set, nothing is done when the
 * new expiry equals the one already recorded in @cpu_base.
 *
 * Called with interrupts disabled and base->lock held
 */
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
        ktime_t next = hrtimer_update_next_event(cpu_base);

        if (!skip_equal || next != cpu_base->expires_next)
                __hrtimer_reprogram(cpu_base, cpu_base->next_timer, next);
}

/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * High resolution timer enabled ?
 *
 * Defaults to true and can be changed with the "highres=" boot parameter.
 */
static bool hrtimer_hres_enabled __read_mostly  = true;
/*
 * Starts out as LOW_RES_NSEC; hrtimer_switch_to_hres() sets it to
 * HIGH_RES_NSEC.
 */
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);

/*
 * Enable / Disable high resolution mode
 *
 * Parses the "highres=" boot parameter as a boolean into
 * hrtimer_hres_enabled. Returns 1 when parsing succeeded, 0 otherwise.
 */
static int __init setup_hrtimer_hres(char *str)
{
        int err = kstrtobool(str, &hrtimer_hres_enabled);

        return !err;
}

__setup("highres=", setup_hrtimer_hres);

/*
 * hrtimer_is_hres_enabled - query, if the highres mode is enabled
 *
 * Returns the current value of hrtimer_hres_enabled.
 */
static inline int hrtimer_is_hres_enabled(void)
{
        return hrtimer_hres_enabled;
}

static void retrigger_next_event(void *arg);

/*
 * Switch the current CPU to high resolution mode.
 *
 * If tick_init_highres() fails a warning is printed and nothing changes.
 */
static void hrtimer_switch_to_hres(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        int err = tick_init_highres();

        if (err) {
                pr_warn("Could not switch to high resolution mode on CPU %u\n",
                        cpu_base->cpu);
                return;
        }

        cpu_base->hres_active = 1;
        hrtimer_resolution = HIGH_RES_NSEC;

        tick_setup_sched_timer(true);

        /* "Retrigger" the interrupt to get things going */
        retrigger_next_event(NULL);
}

#else

/* Without CONFIG_HIGH_RES_TIMERS high resolution mode is never enabled. */
static inline int hrtimer_is_hres_enabled(void)
{
        return 0;
}

/* Without CONFIG_HIGH_RES_TIMERS there is nothing to switch to. */
static inline void hrtimer_switch_to_hres(void)
{
}

#endif /* CONFIG_HIGH_RES_TIMERS */
/*
 * Retrigger next event is called after clock was set with interrupts
 * disabled through an SMP function call or directly from low level
 * resume code.
 *
 * This is only invoked when:
 *        - CONFIG_HIGH_RES_TIMERS is enabled.
 *        - CONFIG_NOHZ_COMMON is enabled
 *
 * For the other cases this function is empty and because the call sites
 * are optimized out it vanishes as well, i.e. no need for lots of
 * #ifdeffery.
 *
 * @arg is not used.
 */
static void retrigger_next_event(void *arg)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);

        /*
         * The offsets of CLOCK_REALTIME/TAI/BOOTTIME only have to be
         * updated here when high resolution mode or nohz is active.
         * Otherwise the next tick will take care of that.
         */
        if (!(hrtimer_hres_active(cpu_base) || tick_nohz_active))
                return;

        raw_spin_lock(&cpu_base->lock);
        hrtimer_update_base(cpu_base);
        if (!hrtimer_hres_active(cpu_base)) {
                /*
                 * NOHZ only: updating the offsets and reevaluating the
                 * next expiring timer is enough. The return from the SMP
                 * function call will take care of the reprogramming in
                 * case the CPU was in a NOHZ idle sleep.
                 */
                hrtimer_update_next_event(cpu_base);
        } else {
                /*
                 * High resolution mode: reevaluate the next expiring
                 * timer and reprogram the clock event device.
                 */
                hrtimer_force_reprogram(cpu_base, 0);
        }
        raw_spin_unlock(&cpu_base->lock);
}

/*
 * When a timer is enqueued and expires earlier than the already enqueued
 * timers, we have to check, whether it expires earlier than the timer for
 * which the clock event device was armed.
 *
 * @timer:      the timer which was enqueued
 * @reprogram:  for soft timers, whether the hardware may be reprogrammed
 *
 * Called with interrupts disabled and base->cpu_base.lock held
 */
static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
{
        struct hrtimer_cpu_base *this_base = this_cpu_ptr(&hrtimer_bases);
        struct hrtimer_clock_base *clock = timer->base;
        ktime_t expires;

        expires = ktime_sub(hrtimer_get_expires(timer), clock->offset);

        WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);

        /*
         * CLOCK_REALTIME timer might be requested with an absolute
         * expiry time which is less than base->offset. Set it to 0.
         */
        if (expires < 0)
                expires = 0;

        if (timer->is_soft) {
                /*
                 * A soft hrtimer could be started on a remote CPU. In this
                 * case softirq_expires_next needs to be updated on the
                 * remote CPU. The soft hrtimer will not expire before the
                 * first hard hrtimer on the remote CPU -
                 * hrtimer_check_target() prevents this case.
                 */
                struct hrtimer_cpu_base *owner = clock->cpu_base;

                /* Nothing to do if the softirq runs or nothing got earlier. */
                if (owner->softirq_activated ||
                    !ktime_before(expires, owner->softirq_expires_next))
                        return;

                owner->softirq_next_timer = timer;
                owner->softirq_expires_next = expires;

                if (!reprogram || !ktime_before(expires, owner->expires_next))
                        return;
        }

        /*
         * If the timer is not on the current cpu, we cannot reprogram
         * the other cpus clock event device.
         */
        if (clock->cpu_base != this_base)
                return;

        /*
         * Nothing to do when the timer does not expire before the armed
         * event. If the hrtimer interrupt is running, then it will
         * reevaluate the clock bases and reprogram the clock event device.
         */
        if (expires >= this_base->expires_next || this_base->in_hrtirq)
                return;

        this_base->next_timer = timer;
        __hrtimer_reprogram(this_base, timer, expires);
}

/*
 * update_needs_ipi - check whether a clock change requires an IPI
 * @cpu_base:   the per CPU base to examine
 * @active:     mask of the clock bases affected by the clock change
 *
 * Refreshes the base offsets of @cpu_base and returns true when the first
 * expiring timer of an affected clock base now expires before the event
 * (or, for soft bases, the soft expiry) recorded in @cpu_base. Returns
 * false when no IPI is needed.
 *
 * Called from clock_was_set() with cpu_base->lock held.
 */
static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
                             unsigned int active)
{
        struct hrtimer_clock_base *base;
        unsigned int seq;
        ktime_t expires;

        /*
         * Update the base offsets unconditionally so the following
         * checks whether the SMP function call is required works.
         *
         * The update is safe even when the remote CPU is in the hrtimer
         * interrupt or the hrtimer soft interrupt and expiring affected
         * bases. Either it will see the update before handling a base or
         * it will see it when it finishes the processing and reevaluates
         * the next expiring timer.
         */
        seq = cpu_base->clock_was_set_seq;
        hrtimer_update_base(cpu_base);

        /*
         * If the sequence did not change over the update then the
         * remote CPU already handled it.
         */
        if (seq == cpu_base->clock_was_set_seq)
                return false;

        /*
         * If the remote CPU is currently handling an hrtimer interrupt, it
         * will reevaluate the first expiring timer of all clock bases
         * before reprogramming. Nothing to do here.
         */
        if (cpu_base->in_hrtirq)
                return false;

        /*
         * Walk the affected clock bases and check whether the first expiring
         * timer in a clock base is moving ahead of the first expiring timer of
         * @cpu_base. If so, the IPI must be invoked because per CPU clock
         * event devices cannot be remotely reprogrammed.
         */
        active &= cpu_base->active_bases;

        for_each_active_base(base, cpu_base, active) {
                struct timerqueue_node *next;

                next = timerqueue_getnext(&base->active);
                expires = ktime_sub(next->expires, base->offset);
                if (expires < cpu_base->expires_next)
                        return true;

                /*
                 * Extra check for softirq clock bases.
                 *
                 * HRTIMER_BASE_MONOTONIC_SOFT is a clock base index, so it
                 * has to be compared against base->index. base->clockid is
                 * the POSIX clock id, which is the same for a hard base and
                 * its soft counterpart and cannot tell them apart.
                 */
                if (base->index < HRTIMER_BASE_MONOTONIC_SOFT)
                        continue;
                if (cpu_base->softirq_activated)
                        continue;
                if (expires < cpu_base->softirq_expires_next)
                        return true;
        }
        return false;
}

/*
 * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and
 * CLOCK_BOOTTIME (for late sleep time injection).
 *
 * This requires to update the offsets for these clocks
 * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this
 * also requires to eventually reprogram the per CPU clock event devices
 * when the change moves an affected timer ahead of the first expiring
 * timer on that CPU. Obviously remote per CPU clock event devices cannot
 * be reprogrammed. The other reason why an IPI has to be sent is when the
 * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets
 * in the tick, which obviously might be stopped, so this has to bring out
 * the remote CPU which might sleep in idle to get this sorted.
 *
 * @bases is the mask of affected clock bases. timerfd is notified in all
 * cases.
 */
void clock_was_set(unsigned int bases)
{
        struct hrtimer_cpu_base *local = raw_cpu_ptr(&hrtimer_bases);
        cpumask_var_t ipi_mask;
        int cpu;

        if (!(hrtimer_hres_active(local) || tick_nohz_active))
                goto out_timerfd;

        /* Without a mask to collect targets in, interrupt every CPU. */
        if (!zalloc_cpumask_var(&ipi_mask, GFP_KERNEL)) {
                on_each_cpu(retrigger_next_event, NULL, 1);
                goto out_timerfd;
        }

        /* Avoid interrupting CPUs if possible */
        cpus_read_lock();
        for_each_online_cpu(cpu) {
                struct hrtimer_cpu_base *remote = &per_cpu(hrtimer_bases, cpu);
                unsigned long flags;

                raw_spin_lock_irqsave(&remote->lock, flags);
                if (update_needs_ipi(remote, bases))
                        cpumask_set_cpu(cpu, ipi_mask);
                raw_spin_unlock_irqrestore(&remote->lock, flags);
        }

        preempt_disable();
        smp_call_function_many(ipi_mask, retrigger_next_event, NULL, 1);
        preempt_enable();
        cpus_read_unlock();
        free_cpumask_var(ipi_mask);

out_timerfd:
        timerfd_clock_was_set();
}

/* Work callback: run clock_was_set() for CLOCK_SET_WALL. @work is unused. */
static void clock_was_set_work(struct work_struct *work)
{
        clock_was_set(CLOCK_SET_WALL);
}

/* Work item which runs clock_was_set_work() when scheduled. */
static DECLARE_WORK(hrtimer_work, clock_was_set_work);

/*
 * Called from timekeeping code to reprogram the hrtimer interrupt device
 * on all cpus and to notify timerfd.
 *
 * The work is not done in the caller's context: hrtimer_work is
 * scheduled and its callback invokes clock_was_set().
 */
void clock_was_set_delayed(void)
{
        schedule_work(&hrtimer_work);
}

/*
 * Called during resume either directly via timekeeping_resume()
 * or in the case of s2idle from tick_unfreeze() to ensure that the
 * hrtimers are up to date.
 *
 * Must be called with interrupts disabled (checked via lockdep).
 */
void hrtimers_resume_local(void)
{
        lockdep_assert_irqs_disabled();
        /* Retrigger on the local CPU */
        retrigger_next_event(NULL);
}

/*
 * Counterpart to lock_hrtimer_base above:
 *
 * Releases the cpu_base lock of @timer's base and restores the interrupt
 * state from @flags.
 */
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
        __releases(&timer->base->cpu_base->lock)
{
        raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}

/**
 * hrtimer_forward() - forward the timer expiry
 * @timer:        hrtimer to forward
 * @now:        forward past this time
 * @interval:        the interval to forward
 *
 * Forward the timer expiry so it will expire in the future. An @interval
 * smaller than hrtimer_resolution is raised to hrtimer_resolution.
 *
 * .. note::
 *  This only updates the timer expiry value and does not requeue the timer.
 *
 * There is also a variant of the function hrtimer_forward_now().
 *
 * Context: Can be safely called from the callback function of @timer. If called
 *          from other contexts @timer must neither be enqueued nor running the
 *          callback and the caller needs to take care of serialization.
 *
 * Return: The number of overruns. 0 if the expiry is still after @now or
 *         if @timer is enqueued (which also triggers a warning).
 */
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
        ktime_t behind = ktime_sub(now, hrtimer_get_expires(timer));
        u64 overruns = 1;

        /* The timer has not expired yet, nothing to forward. */
        if (behind < 0)
                return 0;

        if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
                return 0;

        if (interval < hrtimer_resolution)
                interval = hrtimer_resolution;

        if (unlikely(behind >= interval)) {
                s64 step = ktime_to_ns(interval);

                /* Skip all whole intervals which were missed in one go. */
                overruns = ktime_divns(behind, step);
                hrtimer_add_expires_ns(timer, step * overruns);
                if (hrtimer_get_expires_tv64(timer) > now)
                        return overruns;
                /*
                 * The expiry landed exactly on @now: this increment and
                 * the addition of @interval below correct for that.
                 */
                overruns++;
        }
        hrtimer_add_expires(timer, interval);

        return overruns;
}
EXPORT_SYMBOL_GPL(hrtimer_forward);

/*
 * enqueue_hrtimer - internal function to (re)start a timer
 *
 * The timer is inserted in expiry order. Insertion into the
 * red black tree is O(log(n)). Must hold the base lock.
 *
 * Returns 1 when the new timer is the leftmost timer in the tree.
 */
static int enqueue_hrtimer(struct hrtimer *timer,
                           struct hrtimer_clock_base *base,
                           enum hrtimer_mode mode)
{
        debug_activate(timer, mode);
        WARN_ON_ONCE(!base->cpu_base->online);

        base->cpu_base->active_bases |= 1 << base->index;

        /* Pairs with the lockless read in hrtimer_is_queued() */
        WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);

        return timerqueue_add(&base->active, &timer->node);
}

/*
 * __remove_hrtimer - internal function to remove a timer
 *
 * Caller must hold the base lock.
 *
 * The timer state is set to @newstate in any case. The timer is only
 * dequeued if it was enqueued before.
 *
 * High resolution timer mode reprograms the clock event device when the
 * timer is the one which expires next. The caller can disable this by setting
 * reprogram to zero. This is useful, when the context does a reprogramming
 * anyway (e.g. timer interrupt)
 */
static void __remove_hrtimer(struct hrtimer *timer,
                             struct hrtimer_clock_base *base,
                             u8 newstate, int reprogram)
{
        struct hrtimer_cpu_base *cpu_base = base->cpu_base;
        u8 old_state = timer->state;
        bool timers_left;

        /* Pairs with the lockless read in hrtimer_is_queued() */
        WRITE_ONCE(timer->state, newstate);
        if (!(old_state & HRTIMER_STATE_ENQUEUED))
                return;

        /* Clear the active bit when the queue has no timers left. */
        timers_left = timerqueue_del(&base->active, &timer->node);
        if (!timers_left)
                cpu_base->active_bases &= ~(1 << base->index);

        /*
         * Note: If reprogram is false we do not update
         * cpu_base->next_timer. This happens when we remove the first
         * timer on a remote cpu. No harm as we never dereference
         * cpu_base->next_timer. So the worst thing that can happen is
         * a superfluous call to hrtimer_force_reprogram() on the
         * remote cpu later on if the same timer gets enqueued again.
         */
        if (!reprogram || timer != cpu_base->next_timer)
                return;

        hrtimer_force_reprogram(cpu_base, 1);
}

/*
 * remove hrtimer, called with base lock held
 *
 * @restart:    the timer is about to be requeued; its state is kept
 * @keep_local: with @restart, suppress reprogramming on removal
 *
 * Returns 1 if the timer was enqueued and got removed, 0 otherwise.
 */
static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
               bool restart, bool keep_local)
{
        u8 newstate = timer->state;
        bool reprogram;

        if (!(newstate & HRTIMER_STATE_ENQUEUED))
                return 0;

        /*
         * Remove the timer and force reprogramming when high
         * resolution mode is active and the timer is on the current
         * CPU. If we remove a timer on another CPU, reprogramming is
         * skipped. The interrupt event on this CPU is fired and
         * reprogramming happens in the interrupt handler. This is a
         * rare case and less expensive than a smp call.
         */
        debug_deactivate(timer);
        reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);

        /*
         * If the timer is not restarted then reprogramming is
         * required if the timer is local. If it is local and about
         * to be restarted, avoid programming it twice (on removal
         * and a moment later when it's requeued).
         */
        if (restart)
                reprogram = reprogram && !keep_local;
        else
                newstate = HRTIMER_STATE_INACTIVE;

        __remove_hrtimer(timer, base, newstate, reprogram);
        return 1;
}

/*
 * Adjust the expiry time @tim of @timer for CONFIG_TIME_LOW_RES systems.
 * Without CONFIG_TIME_LOW_RES @tim is returned unchanged.
 */
static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
                                            const enum hrtimer_mode mode)
{
#ifdef CONFIG_TIME_LOW_RES
        /*
         * CONFIG_TIME_LOW_RES indicates that the system has no way to return
         * granular time values. For relative timers we add hrtimer_resolution
         * (i.e. one jiffie) to prevent short timeouts.
         */
        timer->is_rel = mode & HRTIMER_MODE_REL;
        if (!timer->is_rel)
                return tim;

        return ktime_add_safe(tim, hrtimer_resolution);
#else
        return tim;
#endif
}

/*
 * Find the next expiring soft timer of @cpu_base and hand it to
 * hrtimer_reprogram(). Nothing is done when no soft timer is pending.
 */
static void
hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
        /* Find the next SOFT expiration. */
        ktime_t next_soft = __hrtimer_get_next_event(cpu_base,
                                                     HRTIMER_ACTIVE_SOFT);

        /*
         * Reprogramming needs to be triggered, even if the next soft
         * hrtimer expires at the same time as the next hard
         * hrtimer. cpu_base->softirq_expires_next needs to be updated!
         *
         * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
         * cpu_base->*expires_next is only set by hrtimer_reprogram()
         */
        if (next_soft != KTIME_MAX)
                hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
}

/*
 * Requeue @timer with the new expiry @tim and slack @delta_ns.
 * Must be called with the lock of @base held.
 *
 * Returns the result of enqueue_hrtimer() if the timer was allowed to
 * switch bases, in which case the caller decides about reprogramming.
 * Returns 0 if the timer was kept on the local CPU base and the hardware
 * was already reprogrammed here.
 */
static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                                    u64 delta_ns, const enum hrtimer_mode mode,
                                    struct hrtimer_clock_base *base)
{
        struct hrtimer_clock_base *target;
        bool keep_local, leftmost;

        /*
         * If the timer is on the local cpu base and is the first expiring
         * timer then this might end up reprogramming the hardware twice
         * (on removal and on enqueue). To avoid that, prevent the
         * reprogram on removal, keep the timer local to the current CPU
         * and enforce reprogramming after it is queued no matter whether
         * it is the new first expiring timer again or not.
         */
        keep_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
                     base->cpu_base->next_timer == timer;

        /*
         * Remove an active timer from the queue. In case it is not queued
         * on the current CPU, make sure that remove_hrtimer() updates the
         * remote data correctly.
         *
         * If it's on the current CPU and the first expiring timer, then
         * skip reprogramming, keep the timer local and enforce
         * reprogramming later if it was the first expiring timer.  This
         * avoids programming the underlying clock event twice (once at
         * removal and once after enqueue).
         */
        remove_hrtimer(timer, base, true, keep_local);

        /* Relative expiry times are based on the current time of @base. */
        if (mode & HRTIMER_MODE_REL)
                tim = ktime_add_safe(tim, base->get_time());

        tim = hrtimer_update_lowres(timer, tim, mode);

        hrtimer_set_expires_range_ns(timer, tim, delta_ns);

        /* Switch the timer base, if necessary: */
        target = keep_local ? base :
                 switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);

        leftmost = enqueue_hrtimer(timer, target, mode);
        if (!keep_local)
                return leftmost;

        /*
         * Timer was forced to stay on the current CPU to avoid
         * reprogramming on removal and enqueue. Force reprogram the
         * hardware by evaluating the new first expiring timer.
         */
        hrtimer_force_reprogram(target->cpu_base, 1);
        return 0;
}

/**
 * hrtimer_start_range_ns - (re)start an hrtimer
 * @timer:        the timer to be added
 * @tim:        expiry time
 * @delta_ns:        "slack" range for the timer
 * @mode:        timer mode: absolute (HRTIMER_MODE_ABS) or
 *                relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
 *                softirq based mode is considered for debug purpose only!
 */
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                            u64 delta_ns, const enum hrtimer_mode mode)
{
        struct hrtimer_clock_base *base;
        unsigned long flags;
        int leftmost;

        /*
         * With PREEMPT_RT check the hard expiry mode because unmarked
         * timers are moved to softirq expiry. Otherwise check whether
         * the HRTIMER_MODE_SOFT bit and hrtimer.is_soft match.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
        else
                WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);

        base = lock_hrtimer_base(timer, &flags);

        leftmost = __hrtimer_start_range_ns(timer, tim, delta_ns, mode, base);
        if (leftmost)
                hrtimer_reprogram(timer, true);

        unlock_hrtimer_base(timer, &flags);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);

/**
 * hrtimer_try_to_cancel - try to deactivate a timer
 * @timer:        hrtimer to stop
 *
 * Returns:
 *
 *  *  0 when the timer was not active
 *  *  1 when the timer was active
 *  * -1 when the timer is currently executing the callback function and
 *    cannot be stopped
 */
int hrtimer_try_to_cancel(struct hrtimer *timer)
{
        struct hrtimer_clock_base *base;
        unsigned long flags;
        int ret;

        /*
         * Check lockless first. If the timer is not active (neither
         * enqueued nor running the callback), nothing to do here.  The
         * base lock does not serialize against a concurrent enqueue,
         * so we can avoid taking it.
         */
        if (!hrtimer_active(timer))
                return 0;

        base = lock_hrtimer_base(timer, &flags);

        if (hrtimer_callback_running(timer))
                ret = -1;
        else
                ret = remove_hrtimer(timer, base, false, false);

        unlock_hrtimer_base(timer, &flags);

        return ret;
}
EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);

#ifdef CONFIG_PREEMPT_RT
/* Initialize the softirq expiry lock of @base. */
static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
{
        spin_lock_init(&base->softirq_expiry_lock);
}

/* Acquire the softirq expiry lock of @base. */
static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
{
        spin_lock(&base->softirq_expiry_lock);
}

/* Release the softirq expiry lock of @base. */
static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
{
        spin_unlock(&base->softirq_expiry_lock);
}

/*
 * The counterpart to hrtimer_cancel_wait_running().
 *
 * If there is a waiter for cpu_base->softirq_expiry_lock, then it was
 * waiting for the timer callback to finish. Drop softirq_expiry_lock and
 * reacquire it. That allows the waiter to acquire the lock and make
 * progress.
 *
 * cpu_base->lock is dropped as well (restoring @flags) and taken again
 * afterwards with interrupts disabled.
 */
static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
                                      unsigned long flags)
{
        /* Only bounce the locks when a waiter announced itself. */
        if (atomic_read(&cpu_base->timer_waiters)) {
                raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
                spin_unlock(&cpu_base->softirq_expiry_lock);
                spin_lock(&cpu_base->softirq_expiry_lock);
                raw_spin_lock_irq(&cpu_base->lock);
        }
}

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion: if the soft irq thread is preempted
 * in the middle of a timer callback, then calling del_timer_sync() can
 * lead to two issues:
 *
 *  - If the caller is on a remote CPU then it has to spin wait for the timer
 *    handler to complete. This can result in unbound priority inversion.
 *
 *  - If the caller originates from the task which preempted the timer
 *    handler on the same CPU, then spin waiting for the timer handler to
 *    complete is never going to end.
 *
 * @timer is the timer whose callback is waited for.
 */
void hrtimer_cancel_wait_running(const struct hrtimer *timer)
{
        /* Lockless read. Prevent the compiler from reloading it below */
        struct hrtimer_clock_base *base = READ_ONCE(timer->base);

        /*
         * Just relax if the timer expires in hard interrupt context or if
         * it is currently on the migration base.
         */
        if (!timer->is_soft || is_migration_base(base)) {
                cpu_relax();
                return;
        }

        /*
         * Mark the base as contended and grab the expiry lock, which is
         * held by the softirq across the timer callback. Drop the lock
         * immediately so the softirq can expire the next timer. In theory
         * the timer could already be running again, but that's more than
         * unlikely and just causes another wait loop.
         *
         * The timer_waiters count is what hrtimer_sync_wait_running()
         * checks before it drops and reacquires the expiry lock.
         */
        atomic_inc(&base->cpu_base->timer_waiters);
        spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
        atomic_dec(&base->cpu_base->timer_waiters);
        spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
}
#else
/*
 * !CONFIG_PREEMPT_RT: the expiry lock helpers and
 * hrtimer_sync_wait_running() are empty stubs.
 */
static inline void
hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
                                             unsigned long flags) { }
#endif

/**
 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
 * @timer:        the timer to be cancelled
 *
 * Retries hrtimer_try_to_cancel() for as long as the callback of @timer
 * is running.
 *
 * Returns:
 *  0 when the timer was not active
 *  1 when the timer was active
 */
int hrtimer_cancel(struct hrtimer *timer)
{
        for (;;) {
                int ret = hrtimer_try_to_cancel(timer);

                if (ret >= 0)
                        return ret;

                /* The callback is running: wait for it, then try again. */
                hrtimer_cancel_wait_running(timer);
        }
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);

/**
 * __hrtimer_get_remaining - get remaining time for the timer
 * @timer:        the timer to read
 * @adjust:        adjust relative timers when CONFIG_TIME_LOW_RES=y
 *
 * The remaining time is read with the base of @timer locked.
 */
ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
{
        unsigned long flags;
        ktime_t remaining;

        lock_hrtimer_base(timer, &flags);
        remaining = (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) ?
                        hrtimer_expires_remaining_adjusted(timer) :
                        hrtimer_expires_remaining(timer);
        unlock_hrtimer_base(timer, &flags);

        return remaining;
}
EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);

#ifdef CONFIG_NO_HZ_COMMON
/**
 * hrtimer_get_next_event - get the next expiry time
 *
 * Returns the next expiry time or KTIME_MAX if no timer is pending.
 * KTIME_MAX is also returned when high resolution mode is active on the
 * current CPU, as the timer queues are not evaluated in that case.
 */
u64 hrtimer_get_next_event(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long flags;
        u64 next = KTIME_MAX;

        raw_spin_lock_irqsave(&cpu_base->lock, flags);

        if (!hrtimer_hres_active(cpu_base))
                next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);

        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        return next;
}

/**
 * hrtimer_next_event_without - time until next expiry event w/o one timer
 * @exclude:        timer to exclude
 *
 * Returns the next expiry time over all timers except for the @exclude one or
 * KTIME_MAX if none of them is pending. KTIME_MAX is also returned when
 * high resolution mode is not active on the current CPU.
 */
u64 hrtimer_next_event_without(const struct hrtimer *exclude)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long flags;
        unsigned int hard;
        u64 next = KTIME_MAX;

        raw_spin_lock_irqsave(&cpu_base->lock, flags);

        if (!hrtimer_hres_active(cpu_base))
                goto unlock;

        /* The soft bases only count while the softirq is not activated. */
        if (!cpu_base->softirq_activated) {
                unsigned int soft;

                soft = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
                next = __hrtimer_next_event_base(cpu_base, exclude, soft,
                                                 KTIME_MAX);
        }

        hard = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
        next = __hrtimer_next_event_base(cpu_base, exclude, hard, next);

unlock:
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        return next;
}
#endif

/*
 * Translate @clock_id into the index of the matching hrtimer clock base.
 *
 * Clock ids which are out of range or which have no hrtimer base emit a
 * warning and fall back to HRTIMER_BASE_MONOTONIC.
 */
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
        /*
         * clockid_t is signed, so the lower bound has to be checked as
         * well. A negative id would otherwise index in front of the table.
         */
        if (likely(clock_id >= 0 && clock_id < MAX_CLOCKS)) {
                int base = hrtimer_clock_to_base_table[clock_id];

                /* HRTIMER_MAX_CLOCK_BASES marks ids without a base. */
                if (likely(base != HRTIMER_MAX_CLOCK_BASES))
                        return base;
        }
        WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
        return HRTIMER_BASE_MONOTONIC;
}

/*
 * Zero @timer and attach it to the clock base of the current CPU which
 * matches @clock_id and the hard/soft expiry mode selected by @mode.
 */
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
                           enum hrtimer_mode mode)
{
        struct hrtimer_cpu_base *cpu_base;
        bool soft;
        int idx;

        /*
         * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
         * marked for hard interrupt expiry mode are moved into soft
         * interrupt context for latency reasons and because the callbacks
         * can invoke functions which might sleep on RT, e.g. spin_lock().
         */
        soft = (mode & HRTIMER_MODE_SOFT) ||
               (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD));

        memset(timer, 0, sizeof(*timer));

        cpu_base = raw_cpu_ptr(&hrtimer_bases);

        /*
         * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
         * clock modifications, so they need to become CLOCK_MONOTONIC to
         * ensure POSIX compliance.
         */
        if (clock_id == CLOCK_REALTIME && (mode & HRTIMER_MODE_REL))
                clock_id = CLOCK_MONOTONIC;

        /* Soft timers use the second half of the clock base array. */
        idx = hrtimer_clockid_to_base(clock_id);
        if (soft)
                idx += HRTIMER_MAX_CLOCK_BASES / 2;

        timer->is_soft = soft;
        timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
        timer->base = &cpu_base->clock_base[idx];
        timerqueue_init(&timer->node);
}

/**
 * hrtimer_init - initialize a timer to the given clock
 * @timer:        the timer to be initialized
 * @clock_id:        the clock to be used
 * @mode:       The modes which are relevant for initialization:
 *              HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
 *              HRTIMER_MODE_REL_SOFT
 *
 *              The PINNED variants of the above can be handed in,
 *              but the PINNED bit is ignored as pinning happens
 *              when the hrtimer is started
 *
 * Calls debug_init() for @timer before the actual initialization in
 * __hrtimer_init().
 */
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
                  enum hrtimer_mode mode)
{
        debug_init(timer, clock_id, mode);
        __hrtimer_init(timer, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init);

/*
 * A timer is active, when it is enqueued into the rbtree or the
 * callback function is running or it's in the state of being migrated
 * to another cpu.
 *
 * It is important for this function to not return a false negative.
 *
 * No lock is taken here: the check runs inside a read section of the
 * base's sequence counter and is repeated when either the sequence count
 * changed or timer->base no longer points to the base that was inspected.
 *
 * Returns true when timer->state is not HRTIMER_STATE_INACTIVE or when the
 * timer is the one recorded in base->running, false otherwise.
 */
bool hrtimer_active(const struct hrtimer *timer)
{
        struct hrtimer_clock_base *base;
        unsigned int seq;

        do {
                /* Snapshot the base; the loop condition re-validates it */
                base = READ_ONCE(timer->base);
                seq = raw_read_seqcount_begin(&base->seq);

                /* Either condition is sufficient to report "active" */
                if (timer->state != HRTIMER_STATE_INACTIVE ||
                    base->running == timer)
                        return true;

        } while (read_seqcount_retry(&base->seq, seq) ||
                 base != READ_ONCE(timer->base));

        return false;
}
EXPORT_SYMBOL_GPL(hrtimer_active);

/*
 * The raw_write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
 * distinct sections:
 *
 *  - queued:        the timer is queued
 *  - callback:        the timer is being run
 *  - post:        the timer is inactive or (re)queued
 *
 * On the read side we ensure we observe timer->state and base->running
 * from the same section, if anything changed while we looked at it, we retry.
 * This includes timer->base changing because sequence numbers alone are
 * insufficient for that.
 *
 * The sequence numbers are required because otherwise we could still observe
 * a false negative if the read side got smeared over multiple consecutive
 * __run_hrtimer() invocations.
 */

/*
 * Expire a single timer.
 *
 * The timer is recorded in base->running, taken off the queue through
 * __remove_hrtimer() and its callback is invoked with cpu_base->lock
 * dropped. If the callback returns anything other than HRTIMER_NORESTART
 * and the timer has not been enqueued in the meantime, it is enqueued
 * again on @base.
 *
 * @cpu_base:   per CPU base; its lock must be held on entry (asserted via
 *              lockdep) and is held again on return
 * @base:       clock base the timer is queued on
 * @timer:      the timer to expire
 * @now:        only handed to the expire-entry tracepoint
 * @flags:      interrupt flags used when the lock is dropped for the callback
 */
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
                          struct hrtimer_clock_base *base,
                          struct hrtimer *timer, ktime_t *now,
                          unsigned long flags) __must_hold(&cpu_base->lock)
{
        enum hrtimer_restart (*fn)(struct hrtimer *);
        bool expires_in_hardirq;
        int restart;

        lockdep_assert_held(&cpu_base->lock);

        debug_deactivate(timer);
        base->running = timer;

        /*
         * Separate the ->running assignment from the ->state assignment.
         *
         * As with a regular write barrier, this ensures the read side in
         * hrtimer_active() cannot observe base->running == NULL &&
         * timer->state == INACTIVE.
         */
        raw_write_seqcount_barrier(&base->seq);

        __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
        /* Fetch the callback while the lock is still held */
        fn = timer->function;

        /*
         * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
         * timer is restarted with a period then it becomes an absolute
         * timer. If its not restarted it does not matter.
         */
        if (IS_ENABLED(CONFIG_TIME_LOW_RES))
                timer->is_rel = false;

        /*
         * The timer is marked as running in the CPU base, so it is
         * protected against migration to a different CPU even if the lock
         * is dropped.
         */
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
        trace_hrtimer_expire_entry(timer, now);
        expires_in_hardirq = lockdep_hrtimer_enter(timer);

        restart = fn(timer);

        lockdep_hrtimer_exit(expires_in_hardirq);
        trace_hrtimer_expire_exit(timer);
        /* Note: re-acquired with raw_spin_lock_irq(), @flags is not rewritten */
        raw_spin_lock_irq(&cpu_base->lock);

        /*
         * Note: We clear the running state after enqueue_hrtimer and
         * we do not reprogram the event hardware. Happens either in
         * hrtimer_start_range_ns() or in hrtimer_interrupt()
         *
         * Note: Because we dropped the cpu_base->lock above,
         * hrtimer_start_range_ns() can have popped in and enqueued the timer
         * for us already.
         */
        if (restart != HRTIMER_NORESTART &&
            !(timer->state & HRTIMER_STATE_ENQUEUED))
                enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);

        /*
         * Separate the ->running assignment from the ->state assignment.
         *
         * As with a regular write barrier, this ensures the read side in
         * hrtimer_active() cannot observe base->running == NULL &&
         * timer->state == INACTIVE.
         */
        raw_write_seqcount_barrier(&base->seq);

        /* Nobody else is expected to have changed ->running meanwhile */
        WARN_ON_ONCE(base->running != timer);
        base->running = NULL;
}

/*
 * Expire all due timers on the clock bases of @cpu_base which are both
 * active and selected by @active_mask. @now is the time reference; the
 * per base offset is added to it before comparing against the timers.
 * @flags is passed through to __run_hrtimer(), which asserts that
 * cpu_base->lock is held.
 */
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
                                 unsigned long flags, unsigned int active_mask)
{
        unsigned int pending = cpu_base->active_bases & active_mask;
        struct hrtimer_clock_base *base;

        for_each_active_base(base, cpu_base, pending) {
                ktime_t basenow = ktime_add(now, base->offset);
                struct timerqueue_node *first;

                for (;;) {
                        struct hrtimer *timer;

                        first = timerqueue_getnext(&base->active);
                        if (!first)
                                break;

                        timer = container_of(first, struct hrtimer, node);

                        /*
                         * Only the soft expiry time is compared. The goal
                         * of softexpires is to minimize wakeups, not to
                         * run timers at the earliest interrupt after their
                         * soft expiration. That avoids the need for a
                         * Priority Search Tree (stabbing queries over
                         * overlapping intervals); the simple BST we
                         * already have is sufficient.
                         * Delaying timers which are right-of a not yet
                         * expired timer adds no extra wakeups, because
                         * that timer has to trigger a wakeup anyway.
                         */
                        if (basenow < hrtimer_get_softexpires_tv64(timer))
                                break;

                        __run_hrtimer(cpu_base, base, timer, &basenow, flags);
                        if (active_mask == HRTIMER_ACTIVE_SOFT)
                                hrtimer_sync_wait_running(cpu_base, flags);
                }
        }
}

/*
 * Softirq handler registered for HRTIMER_SOFTIRQ in hrtimers_init(): runs
 * the HRTIMER_ACTIVE_SOFT queues of this CPU, clears softirq_activated and
 * updates the softirq timer afterwards. @h is not used.
 */
static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
{
        struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
        unsigned long irqflags;

        /* The expiry lock is taken outside of the base lock */
        hrtimer_cpu_base_lock_expiry(base);
        raw_spin_lock_irqsave(&base->lock, irqflags);

        __hrtimer_run_queues(base, hrtimer_update_base(base), irqflags,
                             HRTIMER_ACTIVE_SOFT);

        base->softirq_activated = 0;
        hrtimer_update_softirq_timer(base, true);

        raw_spin_unlock_irqrestore(&base->lock, irqflags);
        hrtimer_cpu_base_unlock_expiry(base);
}

#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * High resolution timer interrupt
 * Called with interrupts disabled
 *
 * Runs the HRTIMER_ACTIVE_HARD queues of this CPU, raises HRTIMER_SOFTIRQ
 * when the soft expiry time has been reached and programs the next event
 * through tick_program_event(). When programming fails the whole sequence
 * is retried; after the third failed attempt a hang is recorded and the
 * next event is forced to a later point in time.
 *
 * @dev: the clock event device; only dev->next_event is written here
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        ktime_t expires_next, now, entry_time, delta;
        unsigned long flags;
        int retries = 0;

        BUG_ON(!cpu_base->hres_active);
        cpu_base->nr_events++;
        dev->next_event = KTIME_MAX;

        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        /* entry_time stays fixed across retries, now is refreshed */
        entry_time = now = hrtimer_update_base(cpu_base);
retry:
        cpu_base->in_hrtirq = 1;
        /*
         * We set expires_next to KTIME_MAX here with cpu_base->lock
         * held to prevent that a timer is enqueued in our queue via
         * the migration code. This does not affect enqueueing of
         * timers which run their callback and need to be requeued on
         * this CPU.
         */
        cpu_base->expires_next = KTIME_MAX;

        /* Soft expiry time reached: hand the soft timers to the softirq */
        if (!ktime_before(now, cpu_base->softirq_expires_next)) {
                cpu_base->softirq_expires_next = KTIME_MAX;
                cpu_base->softirq_activated = 1;
                raise_softirq_irqoff(HRTIMER_SOFTIRQ);
        }

        __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);

        /* Reevaluate the clock bases for the [soft] next expiry */
        expires_next = hrtimer_update_next_event(cpu_base);
        /*
         * Store the new expiry value so the migration code can verify
         * against it.
         */
        cpu_base->expires_next = expires_next;
        cpu_base->in_hrtirq = 0;
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        /* Reprogramming necessary ? */
        if (!tick_program_event(expires_next, 0)) {
                cpu_base->hang_detected = 0;
                return;
        }

        /*
         * The next timer was already expired due to:
         * - tracing
         * - long lasting callbacks
         * - being scheduled away when running in a VM
         *
         * We need to prevent that we loop forever in the hrtimer
         * interrupt routine. We give it 3 attempts to avoid
         * overreacting on some spurious event.
         *
         * Acquire base lock for updating the offsets and retrieving
         * the current time.
         */
        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        now = hrtimer_update_base(cpu_base);
        cpu_base->nr_retries++;
        if (++retries < 3)
                goto retry;
        /*
         * Give the system a chance to do something else than looping
         * here. We stored the entry time, so we know exactly how long
         * we spent here. We schedule the next event this amount of
         * time away.
         */
        cpu_base->nr_hangs++;
        cpu_base->hang_detected = 1;
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        delta = ktime_sub(now, entry_time);
        /* Note: delta is truncated to unsigned int for this bookkeeping */
        if ((unsigned int)delta > cpu_base->max_hang_time)
                cpu_base->max_hang_time = (unsigned int) delta;
        /*
         * Limit it to a sensible value as we enforce a longer
         * delay. Give the CPU at least 100ms to catch up.
         */
        if (delta > 100 * NSEC_PER_MSEC)
                expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
        else
                expires_next = ktime_add(now, delta);
        /* Second argument is 1 here, unlike the regular attempt above */
        tick_program_event(expires_next, 1);
        pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
}
#endif /* CONFIG_HIGH_RES_TIMERS */

/*
 * Called from run_local_timers in hardirq context every jiffy.
 *
 * Does nothing when this CPU already runs in high resolution mode.
 * Otherwise it either initiates the switch to high resolution mode or
 * expires the hard context timers and raises HRTIMER_SOFTIRQ when the
 * soft expiry time has been reached.
 */
void hrtimer_run_queues(void)
{
        struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
        unsigned long irqflags;
        ktime_t basetime;

        if (hrtimer_hres_active(base))
                return;

        /*
         * This _is_ ugly: the possibility to switch to highres and / or
         * nohz mode has to be polled periodically. The clocksource switch
         * happens with xtime_lock held, and the notification from there
         * only sets the check bit in the tick_oneshot code, as anything
         * more could deadlock vs. xtime_lock.
         */
        if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
                hrtimer_switch_to_hres();
                return;
        }

        raw_spin_lock_irqsave(&base->lock, irqflags);
        basetime = hrtimer_update_base(base);

        /* Soft expiry time reached: let the softirq handle the soft timers */
        if (!ktime_before(basetime, base->softirq_expires_next)) {
                base->softirq_expires_next = KTIME_MAX;
                base->softirq_activated = 1;
                raise_softirq_irqoff(HRTIMER_SOFTIRQ);
        }

        __hrtimer_run_queues(base, basetime, irqflags, HRTIMER_ACTIVE_HARD);
        raw_spin_unlock_irqrestore(&base->lock, irqflags);
}

/*
 * Sleep related functions:
 */
/*
 * Timer callback installed by __hrtimer_init_sleeper(): clears the task
 * pointer of the enclosing hrtimer_sleeper and wakes up the task it
 * referred to, if there was one. Always returns HRTIMER_NORESTART.
 */
static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
{
        struct hrtimer_sleeper *sleeper;
        struct task_struct *sleeping_task;

        sleeper = container_of(timer, struct hrtimer_sleeper, timer);

        /* Pick up the task before the pointer is cleared */
        sleeping_task = sleeper->task;
        sleeper->task = NULL;

        if (sleeping_task)
                wake_up_process(sleeping_task);

        return HRTIMER_NORESTART;
}

/**
 * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
 * @sl:                sleeper to be started
 * @mode:        timer mode abs/rel
 *
 * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
 * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
 */
void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
                                   enum hrtimer_mode mode)
{
        /*
         * Make the enqueue delivery mode check work on RT. If the sleeper
         * was initialized for hard interrupt delivery, force the mode bit.
         * This is a special case for hrtimer_sleepers because
         * hrtimer_init_sleeper() determines the delivery mode on RT so the
         * fiddling with this decision is avoided at the call sites.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
                mode |= HRTIMER_MODE_HARD;

        hrtimer_start_expires(&sl->timer, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);

/*
 * Initialize the timer embedded in @sl for @clock_id and @mode, install
 * hrtimer_wakeup() as its callback and bind the sleeper to the current
 * task.
 */
static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
                                   clockid_t clock_id, enum hrtimer_mode mode)
{
        /*
         * PREEMPT_RT moves hrtimers which are not explicitly marked for
         * hard interrupt expiry into soft interrupt context, either for
         * latency reasons or because the callback takes regular spinlocks
         * or calls other functions which are unsuitable for hard interrupt
         * context on PREEMPT_RT.
         *
         * The sleeper callback itself is fine in hard interrupt context
         * on RT. The concern is latency: untrusted userspace can spawn
         * many threads which arm timers for the same expiry time on the
         * same CPU, and waking up a gazillion threads at once causes a
         * latency spike.
         *
         * Privileged real-time user space applications on the other hand
         * rely on the low latency of hard interrupt wakeups. So a task in
         * a real-time scheduling class gets hard interrupt expiry, unless
         * soft expiry was requested explicitly.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && task_is_realtime(current) &&
            !(mode & HRTIMER_MODE_SOFT))
                mode |= HRTIMER_MODE_HARD;

        __hrtimer_init(&sl->timer, clock_id, mode);

        sl->timer.function = hrtimer_wakeup;
        sl->task = current;
}

/**
 * hrtimer_init_sleeper - initialize sleeper to the given clock
 * @sl:         sleeper to be initialized
 * @clock_id:   the clock to be used
 * @mode:       timer mode abs/rel
 *
 * Calls debug_init() for the embedded timer and then performs the actual
 * setup in __hrtimer_init_sleeper().
 */
void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
                          enum hrtimer_mode mode)
{
        struct hrtimer *embedded = &sl->timer;

        debug_init(embedded, clock_id, mode);
        __hrtimer_init_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);

/*
 * Write the remaining time @ts to the remainder pointer stored in
 * @restart, using the copy helper which matches restart->nanosleep.type.
 *
 * Returns -EFAULT when the copy helper reports a failure, otherwise
 * -ERESTART_RESTARTBLOCK. Any other type than TT_NATIVE (or TT_COMPAT when
 * CONFIG_COMPAT_32BIT_TIME is set) ends up in BUG().
 */
int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
        int copy_failed = 0;

        switch (restart->nanosleep.type) {
#ifdef CONFIG_COMPAT_32BIT_TIME
        case TT_COMPAT:
                copy_failed = put_old_timespec32(ts,
                                        restart->nanosleep.compat_rmtp);
                break;
#endif
        case TT_NATIVE:
                copy_failed = put_timespec64(ts, restart->nanosleep.rmtp);
                break;
        default:
                BUG();
        }

        return copy_failed ? -EFAULT : -ERESTART_RESTARTBLOCK;
}

/*
 * Sleep until the sleeper's timer has fired or a signal is pending.
 *
 * @t:    sleeper; the visible callers set its expiry before calling
 * @mode: mode for the first timer start; every further start inside the
 *        loop uses HRTIMER_MODE_ABS
 *
 * Returns 0 when t->task has been cleared (hrtimer_wakeup() does that when
 * the timer fires) or when no time remains. Otherwise the sleep ended
 * early: the result of nanosleep_copyout() is returned when a remainder
 * has to be reported (restart type != TT_NONE), else -ERESTART_RESTARTBLOCK.
 */
static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
{
        struct restart_block *restart;

        do {
                /* The task state is set before the timer is started */
                set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
                hrtimer_sleeper_start_expires(t, mode);

                /* t->task == NULL: the timer fired already, do not sleep */
                if (likely(t->task))
                        schedule();

                hrtimer_cancel(&t->timer);
                /* Any further round restarts the timer in absolute mode */
                mode = HRTIMER_MODE_ABS;

        } while (t->task && !signal_pending(current));

        __set_current_state(TASK_RUNNING);

        /* Timer expired */
        if (!t->task)
                return 0;

        restart = &current->restart_block;
        if (restart->nanosleep.type != TT_NONE) {
                ktime_t rem = hrtimer_expires_remaining(&t->timer);
                struct timespec64 rmt;

                /* Nothing left to sleep: treat it as a completed sleep */
                if (rem <= 0)
                        return 0;
                rmt = ktime_to_timespec64(rem);

                return nanosleep_copyout(restart, &rmt);
        }
        return -ERESTART_RESTARTBLOCK;
}

/*
 * Restart function installed by hrtimer_nanosleep(): sleeps again in
 * absolute mode until the expiry time saved in @restart, on the clock
 * saved in @restart. Returns the result of do_nanosleep().
 */
static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
        struct hrtimer_sleeper sleeper;
        int rc;

        hrtimer_init_sleeper_on_stack(&sleeper, restart->nanosleep.clockid,
                                      HRTIMER_MODE_ABS);
        hrtimer_set_expires_tv64(&sleeper.timer, restart->nanosleep.expires);

        rc = do_nanosleep(&sleeper, HRTIMER_MODE_ABS);

        destroy_hrtimer_on_stack(&sleeper.timer);

        return rc;
}

/*
 * Sleep until @rqtp on @clockid, interpreted according to @mode.
 *
 * The timer slack of the current task is applied to the expiry range; it
 * is zero for rt tasks. When do_nanosleep() asks for a restart, absolute
 * sleeps return -ERESTARTNOHAND, while all other modes record clock and
 * expiry time in the restart block of the current task and install
 * hrtimer_nanosleep_restart() as restart function.
 *
 * Returns the (possibly adjusted) result of do_nanosleep().
 */
long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
                       const clockid_t clockid)
{
        struct hrtimer_sleeper sleeper;
        struct restart_block *restart;
        u64 slack;
        int ret = 0;

        slack = rt_task(current) ? 0 : current->timer_slack_ns;

        hrtimer_init_sleeper_on_stack(&sleeper, clockid, mode);
        hrtimer_set_expires_range_ns(&sleeper.timer, rqtp, slack);

        ret = do_nanosleep(&sleeper, mode);
        if (ret == -ERESTART_RESTARTBLOCK) {
                if (mode == HRTIMER_MODE_ABS) {
                        /*
                         * Absolute timers do not update the rmtp value
                         * and restart:
                         */
                        ret = -ERESTARTNOHAND;
                } else {
                        restart = &current->restart_block;
                        restart->nanosleep.clockid = sleeper.timer.base->clockid;
                        restart->nanosleep.expires =
                                hrtimer_get_expires_tv64(&sleeper.timer);
                        set_restart_fn(restart, hrtimer_nanosleep_restart);
                }
        }

        destroy_hrtimer_on_stack(&sleeper.timer);
        return ret;
}

#ifdef CONFIG_64BIT

/*
 * nanosleep syscall entry: relative sleep on CLOCK_MONOTONIC.
 *
 * Returns -EFAULT when @rqtp cannot be read, -EINVAL when the requested
 * time is not a valid timespec64, otherwise the result of
 * hrtimer_nanosleep(). @rmtp may be NULL, in which case the restart block
 * type is set to TT_NONE.
 */
SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
                struct __kernel_timespec __user *, rmtp)
{
        struct restart_block *rb = &current->restart_block;
        struct timespec64 req;

        if (get_timespec64(&req, rqtp))
                return -EFAULT;
        if (!timespec64_valid(&req))
                return -EINVAL;

        rb->fn = do_no_restart_syscall;
        rb->nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
        rb->nanosleep.rmtp = rmtp;

        return hrtimer_nanosleep(timespec64_to_ktime(req), HRTIMER_MODE_REL,
                                 CLOCK_MONOTONIC);
}

#endif

#ifdef CONFIG_COMPAT_32BIT_TIME

/*
 * nanosleep_time32 syscall entry: same as nanosleep, but the request and
 * the remainder use struct old_timespec32.
 *
 * Returns -EFAULT when @rqtp cannot be read, -EINVAL when the requested
 * time is not a valid timespec64, otherwise the result of
 * hrtimer_nanosleep(). @rmtp may be NULL, in which case the restart block
 * type is set to TT_NONE.
 */
SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
                       struct old_timespec32 __user *, rmtp)
{
        struct restart_block *rb = &current->restart_block;
        struct timespec64 req;

        if (get_old_timespec32(&req, rqtp))
                return -EFAULT;
        if (!timespec64_valid(&req))
                return -EINVAL;

        rb->fn = do_no_restart_syscall;
        rb->nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
        rb->nanosleep.compat_rmtp = rmtp;

        return hrtimer_nanosleep(timespec64_to_ktime(req), HRTIMER_MODE_REL,
                                 CLOCK_MONOTONIC);
}
#endif

/*
 * Functions related to boot-time initialization:
 */
/*
 * Set up the per CPU hrtimer base of @cpu: initialize every clock base
 * (back pointer, sequence counter, timer queue head) and reset the
 * bookkeeping fields of the CPU base. Always returns 0.
 */
int hrtimers_prepare_cpu(unsigned int cpu)
{
        struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
        struct hrtimer_clock_base *cb;
        int idx;

        for (idx = 0; idx < HRTIMER_MAX_CLOCK_BASES; idx++) {
                cb = &cpu_base->clock_base[idx];

                cb->cpu_base = cpu_base;
                seqcount_raw_spinlock_init(&cb->seq, &cpu_base->lock);
                timerqueue_init_head(&cb->active);
        }

        cpu_base->cpu = cpu;
        cpu_base->online = 1;

        /* No active bases, no highres mode, no hang recorded */
        cpu_base->active_bases = 0;
        cpu_base->hres_active = 0;
        cpu_base->hang_detected = 0;

        /* Nothing is queued yet, so there is no next expiry either */
        cpu_base->next_timer = NULL;
        cpu_base->softirq_next_timer = NULL;
        cpu_base->expires_next = KTIME_MAX;
        cpu_base->softirq_expires_next = KTIME_MAX;

        hrtimer_cpu_base_init_expiry_lock(cpu_base);

        return 0;
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Move every timer which is queued on @old_base over to @new_base.
 *
 * The only visible caller, hrtimers_cpu_dying(), invokes this with the
 * locks of both CPU bases held. A timer whose callback is currently
 * running is not expected here (BUG_ON).
 */
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
                                struct hrtimer_clock_base *new_base)
{
        struct hrtimer *timer;
        struct timerqueue_node *node;

        /* Drain the old queue head first until it is empty */
        while ((node = timerqueue_getnext(&old_base->active))) {
                timer = container_of(node, struct hrtimer, node);
                BUG_ON(hrtimer_callback_running(timer));
                debug_deactivate(timer);

                /*
                 * Mark it as ENQUEUED not INACTIVE otherwise the
                 * timer could be seen as !active and just vanish away
                 * under us on another CPU
                 */
                __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
                timer->base = new_base;
                /*
                 * Enqueue the timers on the new cpu. This does not
                 * reprogram the event device in case the timer
                 * expires before the earliest on this CPU, but we run
                 * hrtimer_interrupt after we migrated everything to
                 * sort out already expired timers and reprogram the
                 * event device.
                 */
                enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
        }
}

/*
 * Move all timers of the CPU this runs on to another CPU and mark the
 * local base as offline.
 *
 * The target is any CPU which is both in cpu_active_mask and in the
 * HK_TYPE_TIMER housekeeping mask. @dying_cpu is not used in the body; the
 * source base is taken from this_cpu_ptr(). Always returns 0.
 *
 * NOTE(review): the result of cpumask_any_and() is used without a range
 * check - presumably a suitable CPU is guaranteed to exist at this point;
 * confirm against the hotplug callers.
 */
int hrtimers_cpu_dying(unsigned int dying_cpu)
{
        int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
        struct hrtimer_cpu_base *old_base, *new_base;

        old_base = this_cpu_ptr(&hrtimer_bases);
        new_base = &per_cpu(hrtimer_bases, ncpu);

        /*
         * The caller is globally serialized and nobody else
         * takes two locks at once, deadlock is not possible.
         */
        raw_spin_lock(&old_base->lock);
        raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);

        /* Clock bases are migrated pairwise, index by index */
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                migrate_hrtimer_list(&old_base->clock_base[i],
                                     &new_base->clock_base[i]);
        }

        /*
         * The migration might have changed the first expiring softirq
         * timer on this CPU. Update it.
         */
        __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
        /* Tell the other CPU to retrigger the next event */
        smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);

        raw_spin_unlock(&new_base->lock);
        /* Cleared while the lock of the old base is still held */
        old_base->online = 0;
        raw_spin_unlock(&old_base->lock);

        return 0;
}

#endif /* CONFIG_HOTPLUG_CPU */

/*
 * Boot time setup: prepare the hrtimer base of the CPU this runs on and
 * register hrtimer_run_softirq() as handler for HRTIMER_SOFTIRQ.
 */
void __init hrtimers_init(void)
{
        const unsigned int this_cpu = smp_processor_id();

        hrtimers_prepare_cpu(this_cpu);
        open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}

/**
 * schedule_hrtimeout_range_clock - sleep until timeout
 * @expires:    timeout value (ktime_t); NULL means "infinite"
 * @delta:      slack in expires timeout (ktime_t) for SCHED_OTHER tasks
 * @mode:       timer mode
 * @clock_id:   timer clock to be used
 *
 * Return: 0 when the timeout value is zero or when the timer expired,
 * -EINTR otherwise (including the "infinite" case).
 */
int __sched
schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
                               const enum hrtimer_mode mode, clockid_t clock_id)
{
        struct hrtimer_sleeper sleeper;

        /* A NULL parameter means "infinite": no timer is armed at all */
        if (!expires) {
                schedule();
                return -EINTR;
        }

        /*
         * Shortcut for a zero timeout value. Whether it is meant as an
         * absolute or a relative time makes no difference.
         */
        if (*expires == 0) {
                __set_current_state(TASK_RUNNING);
                return 0;
        }

        /* Under rt constraints any slack passed by the user is overridden */
        if (rt_task(current))
                delta = 0;

        hrtimer_init_sleeper_on_stack(&sleeper, clock_id, mode);
        hrtimer_set_expires_range_ns(&sleeper.timer, *expires, delta);
        hrtimer_sleeper_start_expires(&sleeper, mode);

        /* A cleared task pointer means the timer fired already */
        if (likely(sleeper.task))
                schedule();

        hrtimer_cancel(&sleeper.timer);
        destroy_hrtimer_on_stack(&sleeper.timer);

        __set_current_state(TASK_RUNNING);

        return sleeper.task ? -EINTR : 0;
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);

/**
 * schedule_hrtimeout_range - sleep until timeout
 * @expires:    timeout value (ktime_t)
 * @delta:      slack in expires timeout (ktime_t) for SCHED_OTHER tasks
 * @mode:       timer mode
 *
 * Make the current task sleep until the given expiry time has
 * elapsed. The routine will return immediately unless
 * the current task state has been set (see set_current_state()).
 *
 * The @delta argument gives the kernel the freedom to schedule the
 * actual wakeup to a time that is both power and performance friendly
 * for regular (non RT/DL) tasks.
 * The kernel gives the normal best effort behavior for "@expires+@delta",
 * but may decide to fire the timer earlier, but no earlier than @expires.
 *
 * You can set the task state as follows -
 *
 * %TASK_UNINTERRUPTIBLE - at least @expires time is guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this
 * routine returns.
 *
 * This is schedule_hrtimeout_range_clock() with CLOCK_MONOTONIC as clock.
 *
 * Returns 0 when the timer has expired. If the task was woken before the
 * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
 * by an explicit wakeup, it returns -EINTR.
 */
int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
                                     const enum hrtimer_mode mode)
{
        const clockid_t clock = CLOCK_MONOTONIC;

        return schedule_hrtimeout_range_clock(expires, delta, mode, clock);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);

/**
 * schedule_hrtimeout - sleep until timeout
 * @expires:    timeout value (ktime_t)
 * @mode:       timer mode
 *
 * Make the current task sleep until the given expiry time has
 * elapsed. The routine will return immediately unless
 * the current task state has been set (see set_current_state()).
 *
 * You can set the task state as follows -
 *
 * %TASK_UNINTERRUPTIBLE - at least @expires time is guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this
 * routine returns.
 *
 * This is schedule_hrtimeout_range() with a slack of zero.
 *
 * Returns 0 when the timer has expired. If the task was woken before the
 * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
 * by an explicit wakeup, it returns -EINTR.
 */
int __sched schedule_hrtimeout(ktime_t *expires,
                               const enum hrtimer_mode mode)
{
        const u64 no_slack = 0;

        return schedule_hrtimeout_range(expires, no_slack, mode);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout);






























































   16 



   19 

























    1 



    1 
























































    4 



    4 

















































   22 







































































































































    1 


































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_FIND_H_
#define __LINUX_FIND_H_

#ifndef __LINUX_BITMAP_H
#error only <linux/bitmap.h> can be included directly
#endif

#include <linux/bitops.h>

/*
 * Out-of-line search routines. Each inline wrapper further down in this
 * header calls the matching routine here whenever small_const_nbits(size)
 * is false for the bitmap size it was given.
 */
unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits,
                                unsigned long start);
unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start);
unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start);
unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start);
unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
                                         unsigned long start);
extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
/* N'th-set-bit searches; @n counts from 0 (see find_nth_bit() below). */
unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n);
unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                unsigned long size, unsigned long n);
unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long size, unsigned long n);
unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        const unsigned long *addr3, unsigned long size,
                                        unsigned long n);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
                                         const unsigned long *addr2, unsigned long size);
unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                      const unsigned long *addr3, unsigned long size);
extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);

#ifdef __BIG_ENDIAN
/*
 * Out-of-line little-endian searches. They are only declared on big-endian
 * builds, where the *_le() wrappers below use them once the bitmap size fails
 * small_const_nbits().
 */
unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size);
unsigned long _find_next_zero_bit_le(const  unsigned long *addr, unsigned
                                        long size, unsigned long offset);
unsigned long _find_next_bit_le(const unsigned long *addr, unsigned
                                long size, unsigned long offset);
#endif

#ifndef find_next_bit
/**
 * find_next_bit - locate the next set bit in a bitmap
 * @addr: The bitmap to search
 * @size: The bitmap size in bits
 * @offset: The bit number at which the search begins
 *
 * Returns the number of the next set bit, searching from @offset.
 * If there is no such bit, returns @size.
 */
static inline
unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
                            unsigned long offset)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_next_bit(addr, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Only the first word is read; mask it down to bits offset..size-1. */
        word = *addr & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_and_bit
/**
 * find_next_and_bit - locate the next bit that is set in both bitmaps
 * @addr1: The first bitmap to search
 * @addr2: The second bitmap to search
 * @size: The bitmap size in bits
 * @offset: The bit number at which the search begins
 *
 * Returns the number of the next bit set in both bitmaps, searching from
 * @offset. If there is no such bit, returns @size.
 */
static inline
unsigned long find_next_and_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_next_and_bit(addr1, addr2, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Intersect the first words, then mask to bits offset..size-1. */
        word = *addr1 & *addr2 & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_andnot_bit
/**
 * find_next_andnot_bit - locate the next bit set in *addr1 but clear in
 *                        *addr2
 * @addr1: The bitmap whose set bits are wanted
 * @addr2: The bitmap whose set bits are excluded
 * @size: The bitmap size in bits
 * @offset: The bit number at which the search begins
 *
 * Returns the number of the next bit that is set in *@addr1 and not set in
 * *@addr2, searching from @offset. If there is no such bit, returns @size.
 */
static inline
unsigned long find_next_andnot_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_next_andnot_bit(addr1, addr2, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Drop the bits present in *addr2, then mask to bits offset..size-1. */
        word = *addr1 & ~*addr2 & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_or_bit
/**
 * find_next_or_bit - locate the next bit that is set in either bitmap
 * @addr1: The first bitmap to search
 * @addr2: The second bitmap to search
 * @size: The bitmap size in bits
 * @offset: The bit number at which the search begins
 *
 * Returns the number of the next bit set in at least one of the bitmaps,
 * searching from @offset. If there is no such bit, returns @size.
 */
static inline
unsigned long find_next_or_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_next_or_bit(addr1, addr2, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Merge the first words, then mask to bits offset..size-1. */
        word = (*addr1 | *addr2) & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_zero_bit
/**
 * find_next_zero_bit - locate the next cleared bit in a bitmap
 * @addr: The bitmap to search
 * @size: The bitmap size in bits
 * @offset: The bit number at which the search begins
 *
 * Returns the number of the next zero bit, searching from @offset.
 * If there is no zero bit, returns @size.
 */
static inline
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
                                 unsigned long offset)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_next_zero_bit(addr, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* Force every bit outside offset..size-1 to one so it cannot match. */
        word = *addr | ~GENMASK(size - 1, offset);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_first_bit
/**
 * find_first_bit - locate the first set bit in a bitmap
 * @addr: The bitmap to search
 * @size: The maximum number of bits to search
 *
 * Returns the number of the first set bit.
 * If no bit is set, returns @size.
 */
static inline
unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_first_bit(addr, size);

        /* Only the first word is read; ignore bits at @size and above. */
        word = *addr & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

/**
 * find_nth_bit - locate the N'th set bit in a bitmap
 * @addr: The bitmap to search
 * @size: The maximum number of bits to search
 * @n: Which set bit is wanted, counting from 0
 *
 * With @n == 0 this is semantically equivalent to find_first_bit():
 *         idx = find_nth_bit(addr, size, 0);
 *         idx = find_first_bit(addr, size);
 *
 * Returns the bit number of the N'th set bit.
 * If no such, returns >= @size.
 */
static inline
unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n)
{
        unsigned long word;

        /* A bitmap of @size bits cannot hold more than @size set bits. */
        if (n >= size)
                return size;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return __find_nth_bit(addr, size, n);

        word = *addr & GENMASK(size - 1, 0);
        if (!word)
                return size;

        /* fns() result is passed through unchanged. */
        return fns(word, n);
}

/**
 * find_nth_and_bit - find N'th set bit in 2 memory regions
 * @addr1: The 1st address to start the search at
 * @addr2: The 2nd address to start the search at
 * @size: The maximum number of bits to search
 * @n: The number of set bit, which position is needed, counting from 0
 *
 * Only bits that are set in both regions are counted.
 *
 * Returns the bit number of the N'th set bit.
 * If no such, returns >= @size. Callers must therefore test the result with
 * ">= @size", not "== @size": the single-word path hands back the fns()
 * result as-is, exactly like find_nth_bit().
 */
static inline
unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                unsigned long size, unsigned long n)
{
        /* A bitmap of @size bits cannot hold more than @size set bits. */
        if (n >= size)
                return size;

        if (small_const_nbits(size)) {
                /* Only the first words are read; intersect and trim to @size bits. */
                unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0);

                return val ? fns(val, n) : size;
        }

        return __find_nth_and_bit(addr1, addr2, size, n);
}

/**
 * find_nth_andnot_bit - find N'th set bit in 2 memory regions,
 *                         flipping bits in 2nd region
 * @addr1: The 1st address to start the search at
 * @addr2: The 2nd address to start the search at
 * @size: The maximum number of bits to search
 * @n: The number of set bit, which position is needed, counting from 0
 *
 * Only bits that are set in *@addr1 and clear in *@addr2 are counted.
 *
 * Returns the bit number of the N'th set bit.
 * If no such, returns >= @size. Callers must therefore test the result with
 * ">= @size", not "== @size": the single-word path hands back the fns()
 * result as-is, exactly like find_nth_bit().
 */
static inline
unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                unsigned long size, unsigned long n)
{
        /* A bitmap of @size bits cannot hold more than @size set bits. */
        if (n >= size)
                return size;

        if (small_const_nbits(size)) {
                /* Only the first words are read; drop *addr2's bits and trim to @size. */
                unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0);

                return val ? fns(val, n) : size;
        }

        return __find_nth_andnot_bit(addr1, addr2, size, n);
}

/**
 * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions,
 *                             excluding those set in 3rd region
 * @addr1: The 1st address to start the search at
 * @addr2: The 2nd address to start the search at
 * @addr3: The 3rd address to start the search at
 * @size: The maximum number of bits to search
 * @n: The number of set bit, which position is needed, counting from 0
 *
 * Only bits that are set in both *@addr1 and *@addr2 and clear in *@addr3
 * are counted.
 *
 * Returns the bit number of the N'th set bit.
 * If no such, returns >= @size. Callers must therefore test the result with
 * ">= @size", not "== @size": the single-word path hands back the fns()
 * result as-is, exactly like find_nth_bit().
 */
static __always_inline
unsigned long find_nth_and_andnot_bit(const unsigned long *addr1,
                                        const unsigned long *addr2,
                                        const unsigned long *addr3,
                                        unsigned long size, unsigned long n)
{
        /* A bitmap of @size bits cannot hold more than @size set bits. */
        if (n >= size)
                return size;

        if (small_const_nbits(size)) {
                /* Only the first words are read; combine them and trim to @size bits. */
                unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0);

                return val ? fns(val, n) : size;
        }

        return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n);
}

#ifndef find_first_and_bit
/**
 * find_first_and_bit - locate the first bit that is set in both bitmaps
 * @addr1: The first bitmap to search
 * @addr2: The second bitmap to search
 * @size: The bitmap size in bits
 *
 * Returns the number of the first bit set in both bitmaps.
 * If there is no such bit, returns @size.
 */
static inline
unsigned long find_first_and_bit(const unsigned long *addr1,
                                 const unsigned long *addr2,
                                 unsigned long size)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_first_and_bit(addr1, addr2, size);

        /* Intersect the first words and ignore bits at @size and above. */
        word = *addr1 & *addr2 & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

/**
 * find_first_and_and_bit - locate the first bit that is set in all three
 *                          bitmaps
 * @addr1: The first bitmap to search
 * @addr2: The second bitmap to search
 * @addr3: The third bitmap to search
 * @size: The bitmap size in bits
 *
 * Returns the number of the first bit set in all three bitmaps.
 * If there is no such bit, returns @size.
 */
static inline
unsigned long find_first_and_and_bit(const unsigned long *addr1,
                                     const unsigned long *addr2,
                                     const unsigned long *addr3,
                                     unsigned long size)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_first_and_and_bit(addr1, addr2, addr3, size);

        /* Intersect the first words and ignore bits at @size and above. */
        word = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}

#ifndef find_first_zero_bit
/**
 * find_first_zero_bit - locate the first cleared bit in a bitmap
 * @addr: The bitmap to search
 * @size: The maximum number of bits to search
 *
 * Returns the number of the first cleared bit.
 * If no bit is zero, returns @size.
 */
static inline
unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_first_zero_bit(addr, size);

        /* Force the bits at @size and above to one so they cannot match. */
        word = *addr | ~GENMASK(size - 1, 0);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_last_bit
/**
 * find_last_bit - locate the last set bit in a bitmap
 * @addr: The bitmap to search
 * @size: The number of bits to search
 *
 * Returns the number of the last set bit, or @size if no bit is set.
 */
static inline
unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_last_bit(addr, size);

        /* Only the first word is read; ignore bits at @size and above. */
        word = *addr & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __fls(word);
}
#endif

/**
 * find_next_and_bit_wrap - locate the next bit set in both bitmaps, wrapping
 *                          around the end
 * @addr1: The first bitmap to search
 * @addr2: The second bitmap to search
 * @size: The bitmap size in bits
 * @offset: The bit number at which the search begins
 *
 * Searches from @offset to the end of the bitmaps first and, if that finds
 * nothing, from bit 0 up to (not including) @offset.
 *
 * Returns the number of the bit found, or @size if neither pass finds one.
 */
static inline
unsigned long find_next_and_bit_wrap(const unsigned long *addr1,
                                        const unsigned long *addr2,
                                        unsigned long size, unsigned long offset)
{
        unsigned long hit;

        /* First pass: from @offset to the end. */
        hit = find_next_and_bit(addr1, addr2, size, offset);
        if (hit < size)
                return hit;

        /* Starting at 0 means the first pass already covered everything. */
        if (offset == 0)
                return hit;

        /* Second pass: the part in front of @offset. */
        hit = find_first_and_bit(addr1, addr2, offset);
        if (hit >= offset)
                return size;

        return hit;
}

/**
 * find_next_bit_wrap - locate the next set bit, wrapping around the end
 * @addr: The bitmap to search
 * @size: The bitmap size in bits
 * @offset: The bit number at which the search begins
 *
 * Searches from @offset to the end of the bitmap first and, if that finds
 * nothing, from bit 0 up to (not including) @offset.
 *
 * Returns the number of the bit found, or @size if neither pass finds one.
 */
static inline
unsigned long find_next_bit_wrap(const unsigned long *addr,
                                        unsigned long size, unsigned long offset)
{
        unsigned long hit;

        /* First pass: from @offset to the end. */
        hit = find_next_bit(addr, size, offset);
        if (hit < size)
                return hit;

        /* Starting at 0 means the first pass already covered everything. */
        if (offset == 0)
                return hit;

        /* Second pass: the part in front of @offset. */
        hit = find_first_bit(addr, offset);
        if (hit >= offset)
                return size;

        return hit;
}

/*
 * Stepping helper for for_each_set_bit_wrap(): returns the next set bit to
 * visit given the traversal @start and the candidate position @n, or @size
 * once the traversal is complete. Think twice before calling it directly.
 */
static inline
unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size,
                                 unsigned long start, unsigned long n)
{
        unsigned long pos;

        /* Still in the [start, size) part, i.e. not wrapped around yet. */
        if (n > start) {
                pos = find_next_bit(bitmap, size, n);
                if (pos < size)
                        return pos;

                /* Nothing left there: continue from the beginning. */
                n = 0;
        }

        /* Scan the [0, start) part; reaching @start ends the traversal. */
        pos = find_next_bit(bitmap, start, n);
        if (pos >= start)
                return size;

        return pos;
}

/**
 * find_next_clump8 - find next 8-bit clump with set bits in a memory region
 * @clump: location to store copy of found clump
 * @addr: address to base the search on
 * @size: bitmap size in number of bits
 * @offset: bit offset at which to start searching
 *
 * Returns the bit offset for the next set clump; the found clump value is
 * copied to the location pointed by @clump. If no bits are set, returns @size.
 */
extern unsigned long find_next_clump8(unsigned long *clump,
                                      const unsigned long *addr,
                                      unsigned long size, unsigned long offset);

/*
 * find_first_clump8 - find_next_clump8() with the search started at bit
 * offset 0; arguments and return value are those of find_next_clump8().
 */
#define find_first_clump8(clump, bits, size) \
        find_next_clump8((clump), (bits), (size), 0)

#if defined(__LITTLE_ENDIAN)

/* Little-endian build: forwards unchanged to find_next_zero_bit(). */
static inline
unsigned long find_next_zero_bit_le(const void *addr, unsigned long size,
                                    unsigned long offset)
{
        const unsigned long *bitmap = addr;

        return find_next_zero_bit(bitmap, size, offset);
}

/* Little-endian build: forwards unchanged to find_next_bit(). */
static inline
unsigned long find_next_bit_le(const void *addr, unsigned long size,
                               unsigned long offset)
{
        const unsigned long *bitmap = addr;

        return find_next_bit(bitmap, size, offset);
}

/* Little-endian build: forwards unchanged to find_first_zero_bit(). */
static inline
unsigned long find_first_zero_bit_le(const void *addr, unsigned long size)
{
        const unsigned long *bitmap = addr;

        return find_first_zero_bit(bitmap, size);
}

#elif defined(__BIG_ENDIAN)

#ifndef find_next_zero_bit_le
/*
 * Big-endian build: next zero bit of a little-endian bitmap, searching from
 * @offset. Returns @size when there is none. The single-word path runs the
 * word through swab() before masking and searching it.
 */
static inline
unsigned long find_next_zero_bit_le(const void *addr, unsigned
                long size, unsigned long offset)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_next_zero_bit_le(addr, size, offset);

        word = *(const unsigned long *)addr;

        if (unlikely(offset >= size))
                return size;

        /* Force every bit outside offset..size-1 to one so it cannot match. */
        word = swab(word) | ~GENMASK(size - 1, offset);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_first_zero_bit_le
/*
 * Big-endian build: first zero bit of a little-endian bitmap. Returns @size
 * when there is none. The single-word path runs the word through swab()
 * before masking and searching it.
 */
static inline
unsigned long find_first_zero_bit_le(const void *addr, unsigned long size)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_first_zero_bit_le(addr, size);

        /* Force the bits at @size and above to one so they cannot match. */
        word = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_next_bit_le
/*
 * Big-endian build: next set bit of a little-endian bitmap, searching from
 * @offset. Returns @size when there is none. The single-word path runs the
 * word through swab() before masking and searching it.
 */
static inline
unsigned long find_next_bit_le(const void *addr, unsigned
                long size, unsigned long offset)
{
        unsigned long word;

        /* Sizes rejected by small_const_nbits() are searched out of line. */
        if (!small_const_nbits(size))
                return _find_next_bit_le(addr, size, offset);

        word = *(const unsigned long *)addr;

        if (unlikely(offset >= size))
                return size;

        /* Keep only bits offset..size-1 of the swapped word. */
        word = swab(word) & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#else
#error "Please fix <asm/byteorder.h>"
#endif

/*
 * Bit iterators. Each one assigns to @bit, in turn, every matching bit number
 * below @size and runs the loop body for it. @bit must be an lvalue; the
 * macro arguments are evaluated on every iteration. After a complete
 * traversal @bit holds whatever the find_next_*() helper returned for
 * "nothing found".
 */

/* for_each_set_bit - visit every bit that is set in @addr, starting from 0 */
#define for_each_set_bit(bit, addr, size) \
        for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)

/* for_each_and_bit - visit every bit that is set in both @addr1 and @addr2 */
#define for_each_and_bit(bit, addr1, addr2, size) \
        for ((bit) = 0;                                                                        \
             (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
             (bit)++)

/* for_each_andnot_bit - visit every bit set in @addr1 and not set in @addr2 */
#define for_each_andnot_bit(bit, addr1, addr2, size) \
        for ((bit) = 0;                                                                        \
             (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
             (bit)++)

/* for_each_or_bit - visit every bit that is set in @addr1 or in @addr2 */
#define for_each_or_bit(bit, addr1, addr2, size) \
        for ((bit) = 0;                                                                        \
             (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
             (bit)++)

/* same as for_each_set_bit() but use bit as value to start with */
#define for_each_set_bit_from(bit, addr, size) \
        for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)

/* for_each_clear_bit - visit every bit that is zero in @addr, starting from 0 */
#define for_each_clear_bit(bit, addr, size) \
        for ((bit) = 0;                                                                        \
             (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size);                \
             (bit)++)

/* same as for_each_clear_bit() but use bit as value to start with */
#define for_each_clear_bit_from(bit, addr, size) \
        for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++)

/**
 * for_each_set_bitrange - iterate over all set bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first set bit)
 * @e: bit offset of end of current bitrange (first unset bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * @b and @e must be lvalues; both are reassigned on every iteration. The
 * search for the next range resumes at @e + 1, because bit @e itself is
 * already known to be clear (or to be @size).
 */
#define for_each_set_bitrange(b, e, addr, size)                        \
        for ((b) = 0;                                                \
             (b) = find_next_bit((addr), (size), (b)),                \
             (e) = find_next_zero_bit((addr), (size), (b) + 1),        \
             (b) < (size);                                        \
             (b) = (e) + 1)

/**
 * for_each_set_bitrange_from - iterate over all set bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first set bit); must be initialized
 * @e: bit offset of end of current bitrange (first unset bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * Same as for_each_set_bitrange(), except that the search starts at the
 * value @b already holds instead of at bit 0.
 */
#define for_each_set_bitrange_from(b, e, addr, size)                \
        for (;                                                        \
             (b) = find_next_bit((addr), (size), (b)),                \
             (e) = find_next_zero_bit((addr), (size), (b) + 1),        \
             (b) < (size);                                        \
             (b) = (e) + 1)

/**
 * for_each_clear_bitrange - iterate over all unset bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first unset bit)
 * @e: bit offset of end of current bitrange (first set bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * @b and @e must be lvalues; both are reassigned on every iteration. The
 * search for the next range resumes at @e + 1.
 */
#define for_each_clear_bitrange(b, e, addr, size)                \
        for ((b) = 0;                                                \
             (b) = find_next_zero_bit((addr), (size), (b)),        \
             (e) = find_next_bit((addr), (size), (b) + 1),        \
             (b) < (size);                                        \
             (b) = (e) + 1)

/**
 * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first unset bit); must be initialized
 * @e: bit offset of end of current bitrange (first set bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * Same as for_each_clear_bitrange(), except that the search starts at the
 * value @b already holds instead of at bit 0.
 */
#define for_each_clear_bitrange_from(b, e, addr, size)                \
        for (;                                                        \
             (b) = find_next_zero_bit((addr), (size), (b)),        \
             (e) = find_next_bit((addr), (size), (b) + 1),        \
             (b) < (size);                                        \
             (b) = (e) + 1)

/**
 * for_each_set_bit_wrap - iterate over all set bits starting from @start, and
 * wrapping around the end of bitmap.
 * @bit: offset for current iteration
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 * @start: Starting bit for bitmap traversing, wrapping around the bitmap end
 *
 * The first bit comes from find_next_bit_wrap(); every following step is
 * delegated to __for_each_wrap(), which returns @size once the traversal has
 * come back around to @start.
 */
#define for_each_set_bit_wrap(bit, addr, size, start) \
        for ((bit) = find_next_bit_wrap((addr), (size), (start));                \
             (bit) < (size);                                                        \
             (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))

/**
 * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
 * @start: bit offset to start search and to store the current iteration offset
 * @clump: location to store copy of current 8-bit clump
 * @bits: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * @clump is passed by name and its address is taken by the macro, so it must
 * be an unsigned long lvalue. Each step resumes the search 8 bits past the
 * offset of the clump just visited.
 */
#define for_each_set_clump8(start, clump, bits, size) \
        for ((start) = find_first_clump8(&(clump), (bits), (size)); \
             (start) < (size); \
             (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8))

#endif /*__LINUX_FIND_H_ */










































































    2 




    2 






















































    2 
    2 


































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
// SPDX-License-Identifier: GPL-2.0
/*
 * Disk events - monitor disk events like media change and eject request.
 */
#include <linux/export.h>
#include <linux/moduleparam.h>
#include <linux/blkdev.h>
#include "blk.h"

/*
 * Per-disk event state, reached through gendisk->ev.
 *
 * @lock is held by the functions in this file around their updates of
 * @block, @pending and @clearing. @block_mutex additionally serializes
 * disk_block_events() callers.
 */
struct disk_events {
        struct list_head        node;                /* all disk_event's */
        struct gendisk                *disk;                /* the associated disk */
        spinlock_t                lock;

        struct mutex                block_mutex;        /* protects blocking */
        int                        block;                /* event blocking depth */
        unsigned int                pending;        /* events already sent out */
        unsigned int                clearing;        /* events being cleared */

        long                        poll_msecs;        /* interval, -1 for default */
        struct delayed_work        dwork;
};

/*
 * Human-readable event names, indexed by the bit position (ilog2) of the
 * corresponding DISK_EVENT_* flag.
 */
static const char *disk_events_strs[] = {
        [ilog2(DISK_EVENT_MEDIA_CHANGE)]        = "media_change",
        [ilog2(DISK_EVENT_EJECT_REQUEST)]        = "eject_request",
};

/*
 * uevent environment strings, indexed by the bit position (ilog2) of the
 * corresponding DISK_EVENT_* flag. disk_event_uevent() picks entries out of
 * this table when it builds the environment for kobject_uevent_env().
 */
static char *disk_uevents[] = {
        [ilog2(DISK_EVENT_MEDIA_CHANGE)]        = "DISK_MEDIA_CHANGE=1",
        [ilog2(DISK_EVENT_EJECT_REQUEST)]        = "DISK_EJECT_REQUEST=1",
};

/*
 * list of all disk_events
 * NOTE(review): disk_events_mutex presumably guards this list; its users are
 * outside the part of the file visible here - confirm.
 */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);

/*
 * Default poll interval in milliseconds, used by disk_events_poll_jiffies()
 * for disks without a device-specific interval. It starts out as 0, which
 * disables in-kernel polling by default.
 */
static unsigned long disk_events_dfl_poll_msecs;

/*
 * Work out the event polling interval for @disk, in jiffies. A return value
 * of 0 tells the callers not to queue periodic polling.
 */
static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
{
        struct disk_events *ev = disk->ev;
        long msecs;

        if (ev->poll_msecs >= 0) {
                /* A device-specific interval always takes precedence. */
                msecs = ev->poll_msecs;
        } else if (disk->event_flags & DISK_EVENT_FLAG_POLL) {
                /* Otherwise the default applies, but only with the POLL flag. */
                msecs = disk_events_dfl_poll_msecs;
        } else {
                msecs = 0;
        }

        return msecs_to_jiffies(msecs);
}

/**
 * disk_block_events - block and flush disk event checking
 * @disk: disk to block events for
 *
 * On return from this function, it is guaranteed that event checking
 * isn't in progress and won't happen until unblocked by
 * disk_unblock_events().  Events blocking is counted and the actual
 * unblocking happens after the matching number of unblocks are done.
 *
 * Note that this intentionally does not block event checking from
 * disk_clear_events().
 *
 * Does nothing if @disk has no event state (@disk->ev is NULL).
 *
 * CONTEXT:
 * Might sleep.
 */
void disk_block_events(struct gendisk *disk)
{
        struct disk_events *ev = disk->ev;
        unsigned long flags;
        bool cancel;

        if (!ev)
                return;

        /*
         * Outer mutex ensures that the first blocker completes canceling
         * the event work before further blockers are allowed to finish.
         */
        mutex_lock(&ev->block_mutex);

        /* Only the caller that takes the depth from 0 to 1 cancels the work. */
        spin_lock_irqsave(&ev->lock, flags);
        cancel = !ev->block++;
        spin_unlock_irqrestore(&ev->lock, flags);

        /*
         * Use the pointer that was checked above rather than reading
         * disk->ev a second time.
         */
        if (cancel)
                cancel_delayed_work_sync(&ev->dwork);

        mutex_unlock(&ev->block_mutex);
}

/*
 * Drop one level of event blocking on @disk. When the depth reaches zero the
 * event work is queued: immediately if @check_now is set, otherwise after the
 * polling interval, provided that interval is non-zero. The caller must have
 * made sure that @disk->ev is not NULL.
 */
static void __disk_unblock_events(struct gendisk *disk, bool check_now)
{
        struct disk_events *ev = disk->ev;
        unsigned long irq_flags;
        unsigned long delay;

        spin_lock_irqsave(&ev->lock, irq_flags);

        /* Warn once and bail out if the depth is not positive. */
        if (WARN_ON_ONCE(ev->block <= 0))
                goto unlock;

        /* Other blockers remain: nothing to restart yet. */
        ev->block--;
        if (ev->block)
                goto unlock;

        delay = disk_events_poll_jiffies(disk);
        if (check_now)
                delay = 0;
        else if (!delay)
                goto unlock;

        queue_delayed_work(system_freezable_power_efficient_wq,
                        &ev->dwork, delay);
unlock:
        spin_unlock_irqrestore(&ev->lock, irq_flags);
}

/**
 * disk_unblock_events - unblock disk event checking
 * @disk: disk to unblock events for
 *
 * Counterpart of disk_block_events().  Once the block count drops to
 * zero, events polling is started again if it is configured.  Disks
 * without event state are ignored.
 *
 * CONTEXT:
 * Don't care.  Safe to call from irq context.
 */
void disk_unblock_events(struct gendisk *disk)
{
        if (!disk->ev)
                return;

        __disk_unblock_events(disk, false);
}

/**
 * disk_flush_events - schedule immediate event checking and flushing
 * @disk: disk to check and flush events for
 * @mask: events to flush
 *
 * Schedule immediate event checking on @disk if not blocked.  Events in
 * @mask are scheduled to be cleared from the driver.  Note that this
 * doesn't clear the events from @disk->ev.
 *
 * CONTEXT:
 * If @mask is non-zero must be called with disk->open_mutex held.
 */
void disk_flush_events(struct gendisk *disk, unsigned int mask)
{
        struct disk_events *ev = disk->ev;

        if (!ev)
                return;

        spin_lock_irq(&ev->lock);
        ev->clearing |= mask;
        if (!ev->block)
                mod_delayed_work(system_freezable_power_efficient_wq,
                                &ev->dwork, 0);
        spin_unlock_irq(&ev->lock);
}

/*
 * Report new events to userland with a single KOBJ_CHANGE uevent.  An event
 * in @events is only included if it is also listed in @disk->events; if none
 * qualifies, no uevent is sent.  The caller decides whether uevents are
 * enabled for the disk at all (DISK_EVENT_FLAG_UEVENT).
 */
static void disk_event_uevent(struct gendisk *disk, unsigned int events)
{
        /* One spare slot keeps the environment NULL-terminated. */
        char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
        int count = 0;
        int i;

        for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) {
                if (!(events & disk->events & (1 << i)))
                        continue;

                envp[count] = disk_uevents[i];
                count++;
        }

        if (!count)
                return;

        kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
}

/*
 * Ask the driver once for events on @ev's disk, fold the newly seen ones into
 * ev->pending and re-arm the poll work when polling is active.
 *
 * @ev:           events context of the disk to check
 * @clearing_ptr: set of events to clear in the driver; the bits handed to
 *                ->check_events() are removed from it afterwards.  Callers in
 *                this file pass either &ev->clearing or a stack copy.
 */
static void disk_check_events(struct disk_events *ev,
                              unsigned int *clearing_ptr)
{
        struct gendisk *disk = ev->disk;
        /* snapshot: only these bits are removed from *clearing_ptr below */
        unsigned int clearing = *clearing_ptr;
        unsigned int events;
        unsigned long intv;

        /* check events (called without ev->lock held) */
        events = disk->fops->check_events(disk, clearing);

        /* accumulate pending events and schedule next poll if necessary */
        spin_lock_irq(&ev->lock);

        /* keep only events that were not already pending */
        events &= ~ev->pending;
        ev->pending |= events;
        *clearing_ptr &= ~clearing;

        /* re-arm only when unblocked and a non-zero interval is configured */
        intv = disk_events_poll_jiffies(disk);
        if (!ev->block && intv)
                queue_delayed_work(system_freezable_power_efficient_wq,
                                &ev->dwork, intv);

        spin_unlock_irq(&ev->lock);

        /* from here on @events holds only the newly detected events */
        if (events & DISK_EVENT_MEDIA_CHANGE)
                inc_diskseq(disk);

        if (disk->event_flags & DISK_EVENT_FLAG_UEVENT)
                disk_event_uevent(disk, events);
}

/**
 * disk_clear_events - synchronously check, clear and return pending events
 * @disk: disk to fetch and clear events from
 * @mask: mask of events to be fetched and cleared
 *
 * Disk events are synchronously checked and pending events in @mask
 * are cleared and returned.  This ignores the block count.
 *
 * CONTEXT:
 * Might sleep.
 */
static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
{
        struct disk_events *ev = disk->ev;
        unsigned int pending;
        unsigned int clearing = mask;

        /* no events context: nothing can be pending */
        if (!ev)
                return 0;

        /* paired with the __disk_unblock_events() call below */
        disk_block_events(disk);

        /*
         * store the union of mask and ev->clearing on the stack so that the
         * race with disk_flush_events does not cause ambiguity (ev->clearing
         * can still be modified even if events are blocked).
         */
        spin_lock_irq(&ev->lock);
        clearing |= ev->clearing;
        ev->clearing = 0;
        spin_unlock_irq(&ev->lock);

        /* synchronous check; strips the checked bits from the stack copy */
        disk_check_events(ev, &clearing);
        /*
         * if ev->clearing is not 0, the disk_flush_events got called in the
         * middle of this function, so we want to run the workfn without delay.
         */
        __disk_unblock_events(disk, ev->clearing ? true : false);

        /* then, fetch and clear pending events */
        spin_lock_irq(&ev->lock);
        pending = ev->pending & mask;
        ev->pending &= ~mask;
        spin_unlock_irq(&ev->lock);
        /*
         * disk_check_events() removes every bit it passed to the driver from
         * the stack copy, so no bit of @mask is expected to remain set.
         */
        WARN_ON_ONCE(clearing & mask);

        return pending;
}

/**
 * disk_check_media_change - check if a removable media has been changed
 * @disk: gendisk to check
 *
 * Synchronously fetches and clears the media-change and eject-request events.
 *
 * Returns %true and marks the disk for a partition rescan if a removable
 * media has been changed, and %false if the media did not change.
 */
bool disk_check_media_change(struct gendisk *disk)
{
        unsigned int events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
                                                DISK_EVENT_EJECT_REQUEST);

        /* An eject request alone does not count as a media change. */
        if (!(events & DISK_EVENT_MEDIA_CHANGE))
                return false;

        set_bit(GD_NEED_PART_SCAN, &disk->state);
        return true;
}
EXPORT_SYMBOL(disk_check_media_change);

/**
 * disk_force_media_change - force a media change event
 * @disk: the disk which will raise the event
 *
 * Should be called when the media changes for @disk.  Generates a uevent
 * and attempts to free all dentries and inodes and invalidates all block
 * device page cache entries in that case.
 */
void disk_force_media_change(struct gendisk *disk)
{
        /*
         * Emits a uevent only if DISK_EVENT_MEDIA_CHANGE is listed in
         * disk->events (see disk_event_uevent()); unlike the polling path,
         * DISK_EVENT_FLAG_UEVENT is not checked here.
         */
        disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE);
        /* same sequence bump as for a detected media change */
        inc_diskseq(disk);
        /* NOTE(review): presumably the dentry/inode/page-cache teardown - confirm */
        bdev_mark_dead(disk->part0, true);
        /* request a partition rescan, as disk_check_media_change() does */
        set_bit(GD_NEED_PART_SCAN, &disk->state);
}
EXPORT_SYMBOL_GPL(disk_force_media_change);

/*
 * Separate this part out so that a different pointer for clearing_ptr can be
 * passed in for disk_clear_events.
 */
static void disk_events_workfn(struct work_struct *work)
{
        /* Recover the owning disk_events from its embedded delayed work. */
        struct disk_events *ev =
                container_of(to_delayed_work(work), struct disk_events, dwork);

        /* The periodic check consumes the shared ev->clearing set directly. */
        disk_check_events(ev, &ev->clearing);
}

/*
 * A disk events enabled device has the following sysfs nodes under
 * its /sys/block/X/ directory.
 *
 * events                : list of all supported events
 * events_async                : list of events which can be detected w/o polling
 *                          (always empty, only for backwards compatibility)
 * events_poll_msecs        : polling interval, 0: disable, -1: system default
 */
static ssize_t __disk_events_show(unsigned int events, char *buf)
{
        const char *sep = "";
        ssize_t len = 0;
        int bit;

        /* Emit the name of every event whose bit is set in @events. */
        for (bit = 0; bit < ARRAY_SIZE(disk_events_strs); bit++) {
                if (!(events & (1 << bit)))
                        continue;

                len += sprintf(buf + len, "%s%s", sep, disk_events_strs[bit]);
                /* every name after the first is preceded by a space */
                sep = " ";
        }

        /* A non-empty list is newline terminated; an empty one yields 0. */
        if (len)
                len += sprintf(buf + len, "\n");

        return len;
}

static ssize_t disk_events_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        /* Events are listed only for disks that report them via uevents. */
        if (disk->event_flags & DISK_EVENT_FLAG_UEVENT)
                return __disk_events_show(disk->events, buf);

        return 0;
}

/*
 * sysfs "events_async": always reads back empty (zero bytes); the node is
 * kept only for backwards compatibility.
 */
static ssize_t disk_events_async_show(struct device *dev,
                                      struct device_attribute *attr, char *buf)
{
        return 0;
}

static ssize_t disk_events_poll_msecs_show(struct device *dev,
                                           struct device_attribute *attr,
                                           char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        if (!disk->ev)
                return sprintf(buf, "-1\n");
        return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
}

static ssize_t disk_events_poll_msecs_store(struct device *dev,
                                            struct device_attribute *attr,
                                            const char *buf, size_t count)
{
        struct gendisk *disk = dev_to_disk(dev);
        long intv;

        /* Empty writes and non-numeric input are rejected. */
        if (!count)
                return -EINVAL;
        if (sscanf(buf, "%ld", &intv) == 0)
                return -EINVAL;

        /* -1 (system default) is the only negative value accepted. */
        if (intv < -1)
                return -EINVAL;

        if (!disk->ev)
                return -ENODEV;

        /* Update while blocked, then unblock with an immediate check. */
        disk_block_events(disk);
        disk->ev->poll_msecs = intv;
        __disk_unblock_events(disk, true);

        return count;
}

/* Read-only list of supported events. */
DEVICE_ATTR(events, 0444, disk_events_show, NULL);
/* Read-only and always empty, see disk_events_async_show(). */
DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
/* Read-write polling interval in milliseconds. */
DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show,
            disk_events_poll_msecs_store);

/*
 * The default polling interval can be specified by the kernel
 * parameter block.events_dfl_poll_msecs which defaults to 0
 * (disable).  This can also be modified runtime by writing to
 * /sys/module/block/parameters/events_dfl_poll_msecs.
 */
static int disk_events_set_dfl_poll_msecs(const char *val,
                                          const struct kernel_param *kp)
{
        struct disk_events *ev;
        int err = param_set_ulong(val, kp);

        /* Leave all disks alone when the value could not be stored. */
        if (err < 0)
                return err;

        /*
         * Schedule an immediate check (with nothing to clear) on every
         * registered disk that is not blocked.
         * NOTE(review): presumably so the new default interval is picked up
         * when the check re-arms itself - confirm.
         */
        mutex_lock(&disk_events_mutex);
        list_for_each_entry(ev, &disk_events, node)
                disk_flush_events(ev->disk, 0);
        mutex_unlock(&disk_events_mutex);

        return 0;
}

/* Custom setter that also kicks all registered disks; stock ulong getter. */
static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
        .set        = disk_events_set_dfl_poll_msecs,
        .get        = param_get_ulong,
};

/* Make the parameter appear as "block.events_dfl_poll_msecs". */
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX        "block."

module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
                &disk_events_dfl_poll_msecs, 0644);

/*
 * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
 */
int disk_alloc_events(struct gendisk *disk)
{
        struct disk_events *ev;

        /* Events need both a ->check_events() callback and a non-empty set. */
        if (!(disk->fops->check_events && disk->events))
                return 0;

        ev = kzalloc(sizeof(*ev), GFP_KERNEL);
        if (!ev) {
                pr_warn("%s: failed to initialize events\n", disk->disk_name);
                return -ENOMEM;
        }

        ev->disk = disk;
        /* Start blocked; disk_add_events() performs the initial unblock. */
        ev->block = 1;
        /* -1 selects the system default polling interval. */
        ev->poll_msecs = -1;
        spin_lock_init(&ev->lock);
        mutex_init(&ev->block_mutex);
        INIT_LIST_HEAD(&ev->node);
        INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);

        disk->ev = ev;
        return 0;
}

void disk_add_events(struct gendisk *disk)
{
        if (!disk->ev)
                return;

        mutex_lock(&disk_events_mutex);
        list_add_tail(&disk->ev->node, &disk_events);
        mutex_unlock(&disk_events_mutex);

        /*
         * Block count is initialized to 1 and the following initial
         * unblock kicks it into action.
         */
        __disk_unblock_events(disk, true);
}

void disk_del_events(struct gendisk *disk)
{
        if (disk->ev) {
                disk_block_events(disk);

                mutex_lock(&disk_events_mutex);
                list_del_init(&disk->ev->node);
                mutex_unlock(&disk_events_mutex);
        }
}

void disk_release_events(struct gendisk *disk)
{
        /* the block count should be 1 from disk_del_events() */
        WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
        kfree(disk->ev);
}

















































































































    3 





































































































































































































































































































































































































































































































    3 



























































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
// SPDX-License-Identifier: GPL-2.0
/*
 * Detect hard and soft lockups on a system
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
 * Note: Most of this code is borrowed heavily from the original softlockup
 * detector, so thanks to Ingo for the initial implementation.
 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
 * to those contributors as well.
 */

#define pr_fmt(fmt) "watchdog: " fmt

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/kernel_stat.h>
#include <linux/kvm_para.h>
#include <linux/math64.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/tick.h>

#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>

#include <asm/irq_regs.h>

static DEFINE_MUTEX(watchdog_mutex);

/* Initial value of watchdog_hardlockup_user_enabled. */
#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64)
# define WATCHDOG_HARDLOCKUP_DEFAULT        1
#else
# define WATCHDOG_HARDLOCKUP_DEFAULT        0
#endif

/* Number of utilization samples kept per CPU (depth of cpustat_util[]). */
#define NUM_SAMPLE_PERIODS        5

/* Bitmask of WATCHDOG_*_ENABLED, rebuilt by lockup_detector_update_enable(). */
unsigned long __read_mostly watchdog_enabled;
/* Master switch; when 0 neither detector is enabled. */
int __read_mostly watchdog_user_enabled = 1;
/* Per-detector user switches. */
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
static int __read_mostly watchdog_softlockup_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
/* Must be non-zero for the hardlockup detector to be enabled at all. */
static int __read_mostly watchdog_hardlockup_available;

struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);

#ifdef CONFIG_HARDLOCKUP_DETECTOR

# ifdef CONFIG_SMP
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
# endif /* CONFIG_SMP */

/*
 * Should we panic when a hard-lockup occurs:
 * defaults to CONFIG_BOOTPARAM_HARDLOCKUP_PANIC and can be overridden with
 * nmi_watchdog=panic / nmi_watchdog=nopanic.
 */
unsigned int __read_mostly hardlockup_panic =
                        IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
/*
 * We may not want to enable hard lockup detection by default in all cases,
 * for example when running the kernel as a guest on a hypervisor. In these
 * cases this function can be called to disable hard lockup detection. This
 * function should only be executed once by the boot processor before the
 * kernel command line parameters are parsed, because otherwise it is not
 * possible to override this in hardlockup_panic_setup().
 */
void __init hardlockup_detector_disable(void)
{
        /* Only the user-enable switch is cleared; nmi_watchdog=1 can set it again. */
        watchdog_hardlockup_user_enabled = 0;
}

/*
 * Parse the "nmi_watchdog=" boot parameter, a comma-separated option list.
 * Each option is matched by prefix:
 *   "panic"   - set hardlockup_panic
 *   "nopanic" - clear hardlockup_panic
 *   "0" / "1" - clear / set watchdog_hardlockup_user_enabled
 *   "r..."    - pass the text following the 'r' to
 *               hardlockup_config_perf_event()
 * Options matching none of the above are skipped silently.
 *
 * Always returns 1.
 */
static int __init hardlockup_panic_setup(char *str)
{
next:
        if (!strncmp(str, "panic", 5))
                hardlockup_panic = 1;
        else if (!strncmp(str, "nopanic", 7))
                hardlockup_panic = 0;
        else if (!strncmp(str, "0", 1))
                watchdog_hardlockup_user_enabled = 0;
        else if (!strncmp(str, "1", 1))
                watchdog_hardlockup_user_enabled = 1;
        else if (!strncmp(str, "r", 1))
                hardlockup_config_perf_event(str + 1);
        /* walk to the next ',' and parse what follows it; stop at the NUL */
        while (*(str++)) {
                if (*str == ',') {
                        str++;
                        goto next;
                }
        }
        return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);

#endif /* CONFIG_HARDLOCKUP_DETECTOR */

#if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER)

/* Incremented by watchdog_hardlockup_kick() on the owning CPU. */
static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
/* Value of hrtimer_interrupts seen by the previous is_hardlockup() call. */
static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
/* Set once a lockup was reported so it is printed only once. */
static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
/* When set, the next watchdog_hardlockup_check() is skipped. */
static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched);
/* Bit 0 is held while one CPU dumps the backtraces of all CPUs. */
static unsigned long hard_lockup_nmi_warn;

/* Mark the current CPU as touched so that its next hardlockup check is skipped. */
notrace void arch_touch_nmi_watchdog(void)
{
        /*
         * Using the raw_ accessor here because some code paths have
         * preemption enabled.  If preemption is enabled
         * then interrupts should be enabled too, in which
         * case we shouldn't have to worry about the watchdog
         * going off.
         */
        raw_cpu_write(watchdog_hardlockup_touched, true);
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);

/* Like arch_touch_nmi_watchdog(), but for an explicitly named @cpu. */
void watchdog_hardlockup_touch_cpu(unsigned int cpu)
{
        per_cpu(watchdog_hardlockup_touched, cpu) = true;
}

/*
 * Report whether @cpu's hrtimer interrupt count has stood still since the
 * previous call; when it has moved, remember the new count.
 */
static bool is_hardlockup(unsigned int cpu)
{
        int current_count = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
        /*
         * NOTE: hrtimer_interrupts_saved needs no atomic_t or
         * READ_ONCE/WRITE_ONCE because it is written/read by a single CPU.
         */
        int *saved = &per_cpu(hrtimer_interrupts_saved, cpu);

        if (*saved != current_count) {
                *saved = current_count;
                return false;
        }

        return true;
}

/*
 * Bump this CPU's hrtimer interrupt counter (the value is_hardlockup()
 * watches) and pass the new count to the buddy detector hook.
 */
static void watchdog_hardlockup_kick(void)
{
        int new_interrupts;

        new_interrupts = atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts));
        watchdog_buddy_check_hardlockup(new_interrupts);
}

/*
 * Check @cpu for a hard lockup and report it once.
 *
 * @cpu:  CPU to check; it may differ from the CPU running this function.
 * @regs: registers to dump when @cpu is the current CPU; may be NULL, in
 *        which case dump_stack() is used instead.
 *
 * A pending "touched" flag consumes one check without any report.  Panics via
 * nmi_panic() when hardlockup_panic is set.
 */
void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
{
        if (per_cpu(watchdog_hardlockup_touched, cpu)) {
                per_cpu(watchdog_hardlockup_touched, cpu) = false;
                return;
        }

        /*
         * Check for a hardlockup by making sure the CPU's timer
         * interrupt is incrementing. The timer interrupt should have
         * fired multiple times before we overflow'd. If it hasn't
         * then this is a good indication the cpu is stuck
         */
        if (is_hardlockup(cpu)) {
                unsigned int this_cpu = smp_processor_id();
                unsigned long flags;

                /* Only print hardlockups once. */
                if (per_cpu(watchdog_hardlockup_warned, cpu))
                        return;

                /*
                 * Prevent multiple hard-lockup reports if one cpu is already
                 * engaged in dumping all cpu back traces.
                 */
                if (sysctl_hardlockup_all_cpu_backtrace) {
                        if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn))
                                return;
                }

                /*
                 * NOTE: we call printk_cpu_sync_get_irqsave() after printing
                 * the lockup message. While it would be nice to serialize
                 * that printout, we really want to make sure that if some
                 * other CPU somehow locked up while holding the lock associated
                 * with printk_cpu_sync_get_irqsave() that we can still at least
                 * get the message about the lockup out.
                 */
                pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
                printk_cpu_sync_get_irqsave(flags);

                print_modules();
                print_irqtrace_events(current);
                if (cpu == this_cpu) {
                        /* the locked-up CPU is us: dump our own state */
                        if (regs)
                                show_regs(regs);
                        else
                                dump_stack();
                        printk_cpu_sync_put_irqrestore(flags);
                } else {
                        /* drop the printk sync before asking @cpu for its backtrace */
                        printk_cpu_sync_put_irqrestore(flags);
                        trigger_single_cpu_backtrace(cpu);
                }

                if (sysctl_hardlockup_all_cpu_backtrace) {
                        trigger_allbutcpu_cpu_backtrace(cpu);
                        /* the bit stays set when we are about to panic below */
                        if (!hardlockup_panic)
                                clear_bit_unlock(0, &hard_lockup_nmi_warn);
                }

                if (hardlockup_panic)
                        nmi_panic(regs, "Hard LOCKUP");

                per_cpu(watchdog_hardlockup_warned, cpu) = true;
        } else {
                /* counter moved again: allow a future lockup to be reported */
                per_cpu(watchdog_hardlockup_warned, cpu) = false;
        }
}

#else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */

/* No hrtimer-count based hardlockup detector configured: kicking is a no-op. */
static inline void watchdog_hardlockup_kick(void) { }

#endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */

/*
 * These functions can be overridden based on the configured hardlockup detector.
 *
 * watchdog_hardlockup_enable/disable can be implemented to start and stop when
 * softlockup watchdog start and stop. The detector must select the
 * SOFTLOCKUP_DETECTOR Kconfig.
 *
 * The __weak defaults below do nothing.
 */
void __weak watchdog_hardlockup_enable(unsigned int cpu) { }

void __weak watchdog_hardlockup_disable(unsigned int cpu) { }

/*
 * Watchdog-detector specific API.
 *
 * Return 0 when hardlockup watchdog is available, negative value otherwise.
 * Note that the negative value means that a delayed probe might
 * succeed later.
 *
 * The __weak default reports -ENODEV.
 */
int __weak __init watchdog_hardlockup_probe(void)
{
        return -ENODEV;
}

/**
 * watchdog_hardlockup_stop - Stop the watchdog for reconfiguration
 *
 * The reconfiguration steps are:
 * watchdog_hardlockup_stop();
 * update_variables();
 * watchdog_hardlockup_start();
 *
 * The __weak default is a no-op.
 */
void __weak watchdog_hardlockup_stop(void) { }

/**
 * watchdog_hardlockup_start - Start the watchdog after reconfiguration
 *
 * Counterpart to watchdog_hardlockup_stop().
 *
 * The following variables have been updated in update_variables() and
 * contain the currently valid configuration:
 * - watchdog_enabled
 * - watchdog_thresh
 * - watchdog_cpumask
 *
 * The __weak default is a no-op.
 */
void __weak watchdog_hardlockup_start(void) { }

/**
 * lockup_detector_update_enable - Update the sysctl enable bit
 *
 * Caller needs to make sure that the hard watchdogs are off, so this
 * can't race with watchdog_hardlockup_disable().
 */
static void lockup_detector_update_enable(void)
{
        /* Start from "everything off" and add back what is permitted. */
        watchdog_enabled = 0;

        /* The master switch gates both detectors. */
        if (watchdog_user_enabled) {
                /* The hard detector must be both available and requested. */
                if (watchdog_hardlockup_available && watchdog_hardlockup_user_enabled)
                        watchdog_enabled |= WATCHDOG_HARDLOCKUP_ENABLED;

                if (watchdog_softlockup_user_enabled)
                        watchdog_enabled |= WATCHDOG_SOFTOCKUP_ENABLED;
        }
}

#ifdef CONFIG_SOFTLOCKUP_DETECTOR

/*
 * Delay the softlockup report when running known slow code.
 * It does _not_ affect the timestamp of the last successful reschedule.
 */
#define SOFTLOCKUP_DELAY_REPORT        ULONG_MAX

#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#endif

static struct cpumask watchdog_allowed_mask __read_mostly;

/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
                        IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC);

static bool softlockup_initialized __read_mostly;
/* Sampling interval in nanoseconds (print_cpustat() divides it by NSEC_PER_SEC). */
static u64 __read_mostly sample_period;

/* Timestamp taken after the last successful reschedule. */
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
/* Timestamp of the last softlockup report. */
static DEFINE_PER_CPU(unsigned long, watchdog_report_ts);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static unsigned long soft_lockup_nmi_warn;

/*
 * "softlockup_panic=<n>": store the number parsed from @str in
 * softlockup_panic.  The string is not validated.  Always returns 1.
 */
static int __init softlockup_panic_setup(char *str)
{
        softlockup_panic = simple_strtoul(str, NULL, 0);
        return 1;
}
__setup("softlockup_panic=", softlockup_panic_setup);

/* "nowatchdog": clear the master switch; @str is ignored.  Always returns 1. */
static int __init nowatchdog_setup(char *str)
{
        watchdog_user_enabled = 0;
        return 1;
}
__setup("nowatchdog", nowatchdog_setup);

/* "nosoftlockup": clear the softlockup user switch; @str is ignored.  Always returns 1. */
static int __init nosoftlockup_setup(char *str)
{
        watchdog_softlockup_user_enabled = 0;
        return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);

/*
 * "watchdog_thresh=<n>": parse an integer into watchdog_thresh.  The result
 * of get_option() is not checked.  Always returns 1.
 */
static int __init watchdog_thresh_setup(char *str)
{
        get_option(&str, &watchdog_thresh);
        return 1;
}
__setup("watchdog_thresh=", watchdog_thresh_setup);

static void __lockup_detector_cleanup(void);

#ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
/* Column index of each tracked statistic within one utilization sample. */
enum stats_per_group {
        STATS_SYSTEM,
        STATS_SOFTIRQ,
        STATS_HARDIRQ,
        STATS_IDLE,
        NUM_STATS_PER_GROUP,
};

/*
 * cputime counter backing each column; indexed by enum stats_per_group, so
 * the order here must match the enum above.
 */
static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = {
        CPUTIME_SYSTEM,
        CPUTIME_SOFTIRQ,
        CPUTIME_IRQ,
        CPUTIME_IDLE,
};

/* Previous 16-bit reading of each tracked counter. */
static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]);
/* Ring of the last NUM_SAMPLE_PERIODS samples, in percent. */
static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]);
/* Ring slot that the next update_cpustat() call will write. */
static DEFINE_PER_CPU(u8, cpustat_tail);

/*
 * We don't need nanosecond resolution. A granularity of 16ms is
 * sufficient for our precision, allowing us to use u16 to store
 * cpustats, which will roll over roughly every ~1000 seconds.
 * 2^24 ~= 16 * 10^6
 */
/* Scale a nanosecond value down to 2^24 ns units, truncated to 16 bits. */
static u16 get_16bit_precision(u64 data_ns)
{
        u64 units = data_ns >> 24; /* 2^24ns ~= 16.8ms */

        return (u16)units;
}

/*
 * Take one utilization sample for the current CPU.
 *
 * For every tracked cputime counter, the amount accumulated since the
 * previous call is expressed as a percentage of sample_period and stored in
 * the cpustat_util[] ring slot selected by cpustat_tail; the tail is then
 * advanced by one, wrapping at NUM_SAMPLE_PERIODS.
 */
static void update_cpustat(void)
{
        int i;
        u8 util;
        u16 old_stat, new_stat, delta;
        struct kernel_cpustat kcpustat;
        u64 *cpustat = kcpustat.cpustat;
        u8 tail = __this_cpu_read(cpustat_tail);
        u16 sample_period_16 = get_16bit_precision(sample_period);

        kcpustat_cpu_fetch(&kcpustat, smp_processor_id());

        for (i = 0; i < NUM_STATS_PER_GROUP; i++) {
                old_stat = __this_cpu_read(cpustat_old[i]);
                new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
                /*
                 * The 16-bit readings roll over, so the difference must be
                 * taken modulo 2^16.  Subtracting the u16 operands directly
                 * would promote both to int and yield a negative delta
                 * whenever a rollover falls between two samples.
                 */
                delta = new_stat - old_stat;
                util = DIV_ROUND_UP(100 * delta, sample_period_16);
                __this_cpu_write(cpustat_util[tail][i], util);
                __this_cpu_write(cpustat_old[i], new_stat);
        }

        __this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS);
}

/*
 * Print the NUM_SAMPLE_PERIODS utilization samples recorded for the current
 * CPU, starting at the ring slot cpustat_tail points to (the slot that would
 * be overwritten next).
 */
static void print_cpustat(void)
{
        int i, group;
        u8 tail = __this_cpu_read(cpustat_tail);
        u64 sample_period_second = sample_period;

        /* nanoseconds to whole seconds, for the header line only */
        do_div(sample_period_second, NSEC_PER_SEC);

        /*
         * Outputting the "watchdog" prefix on every line is redundant and not
         * concise, and the original alarm information is sufficient for
         * positioning in logs, hence here printk() is used instead of pr_crit().
         */
        printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n",
               smp_processor_id(), sample_period_second);

        for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
                /* walk the ring from the tail, wrapping around */
                group = (tail + i) % NUM_SAMPLE_PERIODS;
                printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t"
                        "%3u%% hardirq,\t%3u%% idle\n", i + 1,
                        __this_cpu_read(cpustat_util[group][STATS_SYSTEM]),
                        __this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]),
                        __this_cpu_read(cpustat_util[group][STATS_HARDIRQ]),
                        __this_cpu_read(cpustat_util[group][STATS_IDLE]));
        }
}

/* hardirq utilization (percent) above which interrupts get counted */
#define HARDIRQ_PERCENT_THRESH          50
/* number of interrupts listed by print_irq_counts() */
#define NUM_HARDIRQ_REPORT              5
/* One row of the "most frequent interrupts" table. */
struct irq_counts {
        int irq;        /* interrupt number, -1 marks an unused row */
        u32 counts;     /* occurrences since the snapshot */
};

/* Set once kstat_snapshot_irqs() has been called on this CPU. */
static DEFINE_PER_CPU(bool, snapshot_taken);

/* Tabulate the most frequent interrupts. */
/*
 * Insert (@irq, @counts) into the first @rank rows of @irq_counts.
 *
 * Relies on the table being in descending order of counts, which holds when
 * rows are only ever inserted through this function starting from an
 * all-zero table.  From the first row that @counts beats onwards, every row
 * is swapped with the carried entry, which shifts the rest down by one and
 * drops the last row.
 */
static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank)
{
        int i;
        struct irq_counts new_count = {irq, counts};

        for (i = 0; i < rank; i++) {
                /* compares against the original @counts, not the carried entry */
                if (counts > irq_counts[i].counts)
                        swap(new_count, irq_counts[i]);
        }
}

/*
 * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period,
 * then the cause of softlockup might be interrupt storm. In this case, it
 * would be useful to start interrupt counting.
 */
static bool need_counting_irqs(void)
{
        u8 util;
        int tail = __this_cpu_read(cpustat_tail);

        tail = (tail + NUM_HARDIRQ_REPORT - 1) % NUM_HARDIRQ_REPORT;
        util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]);
        return util > HARDIRQ_PERCENT_THRESH;
}

/* Take the interrupt-count snapshot once; later calls are no-ops until reset. */
static void start_counting_irqs(void)
{
        if (__this_cpu_read(snapshot_taken))
                return;

        kstat_snapshot_irqs();
        __this_cpu_write(snapshot_taken, true);
}

/* Forget the snapshot so that start_counting_irqs() takes a fresh one. */
static void stop_counting_irqs(void)
{
        __this_cpu_write(snapshot_taken, false);
}

/*
 * Print the up to NUM_HARDIRQ_REPORT interrupts with the highest counts
 * since the snapshot taken by start_counting_irqs(). Does nothing when no
 * snapshot is outstanding on this CPU. Counting is stopped afterwards if
 * need_counting_irqs() no longer reports a high hardirq load.
 */
static void print_irq_counts(void)
{
        unsigned int i, count;
        struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT];

        if (!__this_cpu_read(snapshot_taken))
                return;

        /*
         * Mark every slot as empty. Filling the table in a loop keeps the
         * -1 sentinel valid for any NUM_HARDIRQ_REPORT; a fixed-length
         * initializer list would leave additional slots as irq 0.
         */
        for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
                irq_counts_sorted[i].irq = -1;
                irq_counts_sorted[i].counts = 0;
        }

        for_each_active_irq(i) {
                count = kstat_get_irq_since_snapshot(i);
                tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT);
        }

        /*
         * Outputting the "watchdog" prefix on every line is redundant and not
         * concise, and the original alarm information is sufficient for
         * positioning in logs, hence here printk() is used instead of pr_crit().
         */
        printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n",
               smp_processor_id(), HARDIRQ_PERCENT_THRESH);

        for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
                /* The first empty slot ends the list. */
                if (irq_counts_sorted[i].irq == -1)
                        break;

                printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n",
                       i + 1, irq_counts_sorted[i].counts,
                       irq_counts_sorted[i].irq);
        }

        /*
         * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last
         * sample_period, then we suspect the interrupt storm might be subsiding.
         */
        if (!need_counting_irqs())
                stop_counting_irqs();
}

/*
 * Print the diagnostic details for a soft lockup report: the recent CPU
 * utilization samples followed by the most frequent interrupts.
 */
static void report_cpu_status(void)
{
        print_cpustat();
        print_irq_counts();
}
#else
/*
 * No-op stand-ins so that callers compile unchanged when the code in the
 * preceding #if branch is not built.
 */
static inline void update_cpustat(void) { }
static inline void report_cpu_status(void) { }
static inline bool need_counting_irqs(void) { return false; }
static inline void start_counting_irqs(void) { }
static inline void stop_counting_irqs(void) { }
#endif

/*
 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
 * lockups can have false positives under extreme conditions. So we generally
 * want a higher threshold for soft lockups than for hard lockups. So we couple
 * the thresholds with a factor: we make the soft threshold twice the amount of
 * time the hard threshold is.
 *
 * Return: the soft lockup threshold, i.e. twice watchdog_thresh.
 */
static int get_softlockup_thresh(void)
{
        return watchdog_thresh * 2;
}

/*
 * Returns seconds, approximately.  We don't need nanosecond
 * resolution, and we don't need to waste time with a big divide when
 * 2^30ns == 1.074s.
 *
 * The value is running_clock() shifted right by 30 bits, so one unit is
 * 2^30 ns rather than exactly one second.
 */
static unsigned long get_timestamp(void)
{
        return running_clock() >> 30LL;  /* 2^30 ~= 10^9 */
}

static void set_sample_period(void)
{
        /*
         * convert watchdog_thresh from seconds to ns
         * the divide by 5 is to give hrtimer several chances (two
         * or three with the current relation between the soft
         * and hard thresholds) to increment before the
         * hardlockup detector generates a warning
         */
        sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS);
        watchdog_update_hrtimer_threshold(sample_period);
}

/* Start a new report period on this CPU: store the current timestamp in watchdog_report_ts. */
static void update_report_ts(void)
{
        __this_cpu_write(watchdog_report_ts, get_timestamp());
}

/* Commands for resetting the watchdog */

/*
 * Record "now" as the time this CPU's watchdog was last touched and restart
 * the report period as well.
 */
static void update_touch_ts(void)
{
        __this_cpu_write(watchdog_touch_ts, get_timestamp());
        update_report_ts();
}

/**
 * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls
 *
 * Call when the scheduler may have stalled for legitimate reasons
 * preventing the watchdog task from executing - e.g. the scheduler
 * entering idle state.  This should only be used for scheduler events.
 * Use touch_softlockup_watchdog() for everything else.
 *
 * The touch is recorded by storing SOFTLOCKUP_DELAY_REPORT in
 * watchdog_report_ts; watchdog_timer_fn() restarts the report period when
 * it reads that value.
 */
notrace void touch_softlockup_watchdog_sched(void)
{
        /*
         * Preemption can be enabled.  It doesn't matter which CPU's watchdog
         * report period gets restarted here, so use the raw_ operation.
         */
        raw_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
}

/*
 * Touch the soft lockup watchdog of the current CPU and call
 * wq_watchdog_touch() for the same CPU.
 */
notrace void touch_softlockup_watchdog(void)
{
        touch_softlockup_watchdog_sched();
        wq_watchdog_touch(raw_smp_processor_id());
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

/*
 * Touch the soft lockup watchdog of every CPU in watchdog_allowed_mask and
 * call wq_watchdog_touch() for each of them.
 */
void touch_all_softlockup_watchdogs(void)
{
        int cpu;

        /*
         * watchdog_mutex cannot be taken here, as this might be called
         * from (soft)interrupt context, so the access to
         * watchdog_allowed_mask might race with a concurrent update.
         *
         * The watchdog time stamp can race against a concurrent real
         * update as well, the only side effect might be a cycle delay for
         * the softlockup check.
         */
        for_each_cpu(cpu, &watchdog_allowed_mask) {
                per_cpu(watchdog_report_ts, cpu) = SOFTLOCKUP_DELAY_REPORT;
                wq_watchdog_touch(cpu);
        }
}

/*
 * Touch this CPU's watchdog and additionally set softlockup_touch_sync, which
 * makes watchdog_timer_fn() call sched_clock_tick() before it restarts the
 * report period.
 */
void touch_softlockup_watchdog_sync(void)
{
        __this_cpu_write(softlockup_touch_sync, true);
        __this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
}

/*
 * Decide whether the current CPU looks soft-locked.
 *
 * @touch_ts:  timestamp of the last watchdog touch
 * @period_ts: timestamp at which the current report period started
 * @now:       current timestamp
 *
 * All three are in get_timestamp() units. As a side effect, interrupt
 * counting is started once a fraction of the threshold has elapsed and the
 * hardirq load is high.
 *
 * Return: 0 when the soft lockup detector is disabled or the report period
 * has not exceeded the soft lockup threshold; otherwise now - touch_ts.
 */
static int is_softlockup(unsigned long touch_ts,
                         unsigned long period_ts,
                         unsigned long now)
{
        /* Nothing to check while the soft lockup detector is switched off. */
        if (!(watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) || !watchdog_thresh)
                return 0;

        /*
         * If period_ts has not been updated during a sample_period, then
         * in the subsequent few sample_periods, period_ts might also not
         * be updated, which could indicate a potential softlockup. In
         * this case, if we suspect the cause of the potential softlockup
         * might be interrupt storm, then we need to count the interrupts
         * to find which interrupt is storming.
         */
        if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) &&
            need_counting_irqs())
                start_counting_irqs();

        /* Warn about unreasonable delays. */
        if (time_after(now, period_ts + get_softlockup_thresh()))
                return now - touch_ts;

        return 0;
}

/* watchdog detector functions */

/* Completed by softlockup_fn(); watchdog_timer_fn() only queues a new run once it is done. */
static DEFINE_PER_CPU(struct completion, softlockup_completion);
/* Work item handed to stop_one_cpu_nowait() to run softlockup_fn() on this CPU. */
static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);

/*
 * The watchdog feed function - touches the timestamp.
 *
 * It only runs once every sample_period seconds (4 seconds by
 * default) to reset the softlockup timestamp. If this gets delayed
 * for more than 2*watchdog_thresh seconds then the debug-printout
 * triggers in watchdog_timer_fn().
 *
 * @data: unused
 *
 * Besides refreshing the timestamps it stops interrupt counting and signals
 * softlockup_completion so that the timer may queue the next run.
 *
 * Return: always 0.
 */
static int softlockup_fn(void *data)
{
        update_touch_ts();
        stop_counting_irqs();
        complete(this_cpu_ptr(&softlockup_completion));

        return 0;
}

/* watchdog kicker functions */

/*
 * Per-CPU hrtimer callback, re-armed every sample_period.
 *
 * It kicks the hardlockup detector, queues softlockup_fn() on this CPU with
 * stop_one_cpu_nowait() if the previous run has completed, and then checks
 * whether the report period indicates a soft lockup. On a lockup it prints
 * a report, optionally triggers backtraces on the other CPUs, taints the
 * kernel and panics if softlockup_panic is set.
 *
 * Return: HRTIMER_NORESTART when watchdog_enabled is zero, HRTIMER_RESTART
 * otherwise.
 */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
        unsigned long touch_ts, period_ts, now;
        struct pt_regs *regs = get_irq_regs();
        int duration;
        /* Sample the sysctl once so the lock/unlock decisions below agree. */
        int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
        unsigned long flags;

        if (!watchdog_enabled)
                return HRTIMER_NORESTART;

        watchdog_hardlockup_kick();

        /* kick the softlockup detector */
        if (completion_done(this_cpu_ptr(&softlockup_completion))) {
                reinit_completion(this_cpu_ptr(&softlockup_completion));
                stop_one_cpu_nowait(smp_processor_id(),
                                softlockup_fn, NULL,
                                this_cpu_ptr(&softlockup_stop_work));
        }

        /* .. and repeat */
        hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));

        /*
         * Read the current timestamp first. It might become invalid anytime
         * when a virtual machine is stopped by the host or when the watchdog
         * is touched from NMI.
         */
        now = get_timestamp();
        /*
         * If a virtual machine is stopped by the host it can look to
         * the watchdog like a soft lockup. This function touches the watchdog.
         */
        kvm_check_and_clear_guest_paused();
        /*
         * The stored timestamp is comparable with @now only when not touched.
         * It might get touched anytime from NMI. Make sure that is_softlockup()
         * uses the same (valid) value.
         */
        period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));

        update_cpustat();

        /* Reset the interval when touched by known problematic code. */
        if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
                if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
                        /*
                         * If the time stamp was touched atomically
                         * make sure the scheduler tick is up to date.
                         */
                        __this_cpu_write(softlockup_touch_sync, false);
                        sched_clock_tick();
                }

                update_report_ts();
                return HRTIMER_RESTART;
        }

        /* Check for a softlockup. */
        touch_ts = __this_cpu_read(watchdog_touch_ts);
        duration = is_softlockup(touch_ts, period_ts, now);
        if (unlikely(duration)) {
                /*
                 * Prevent multiple soft-lockup reports if one cpu is already
                 * engaged in dumping all cpu back traces.
                 */
                if (softlockup_all_cpu_backtrace) {
                        if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn))
                                return HRTIMER_RESTART;
                }

                /* Start period for the next softlockup warning. */
                update_report_ts();

                /* Keep the whole report together in the log. */
                printk_cpu_sync_get_irqsave(flags);
                pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
                        smp_processor_id(), duration,
                        current->comm, task_pid_nr(current));
                report_cpu_status();
                print_modules();
                print_irqtrace_events(current);
                if (regs)
                        show_regs(regs);
                else
                        dump_stack();
                printk_cpu_sync_put_irqrestore(flags);

                if (softlockup_all_cpu_backtrace) {
                        trigger_allbutcpu_cpu_backtrace(smp_processor_id());
                        /* When about to panic, keep the bit set so no other CPU starts a report. */
                        if (!softlockup_panic)
                                clear_bit_unlock(0, &soft_lockup_nmi_warn);
                }

                add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
                if (softlockup_panic)
                        panic("softlockup: hung tasks");
        }

        return HRTIMER_RESTART;
}

/*
 * Start the watchdog on the current CPU: prepare the completion, arm the
 * per-CPU hrtimer, initialise the timestamps and, if enabled, the hardlockup
 * detector.
 *
 * @cpu: must be the CPU this runs on; a mismatch triggers a one-time warning.
 */
static void watchdog_enable(unsigned int cpu)
{
        struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
        struct completion *done = this_cpu_ptr(&softlockup_completion);

        WARN_ON_ONCE(cpu != smp_processor_id());

        /*
         * Begin in the "done" state so that the first watchdog_timer_fn()
         * run sees completion_done() and queues softlockup_fn().
         */
        init_completion(done);
        complete(done);

        /*
         * Start the timer first to prevent the hardlockup watchdog triggering
         * before the timer has a chance to fire.
         */
        hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        hrtimer->function = watchdog_timer_fn;
        hrtimer_start(hrtimer, ns_to_ktime(sample_period),
                      HRTIMER_MODE_REL_PINNED_HARD);

        /* Initialize timestamp */
        update_touch_ts();
        /* Enable the hardlockup detector */
        if (watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)
                watchdog_hardlockup_enable(cpu);
}

/*
 * Stop the watchdog on the current CPU: disable the hardlockup detector,
 * cancel the hrtimer and wait until softlockup_completion is signalled.
 *
 * @cpu: must be the CPU this runs on; a mismatch triggers a one-time warning.
 */
static void watchdog_disable(unsigned int cpu)
{
        struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);

        WARN_ON_ONCE(cpu != smp_processor_id());

        /*
         * Disable the hardlockup detector first. That prevents that a large
         * delay between disabling the timer and disabling the hardlockup
         * detector causes a false positive.
         */
        watchdog_hardlockup_disable(cpu);
        hrtimer_cancel(hrtimer);
        /* A softlockup_fn() run queued by the timer may still be outstanding. */
        wait_for_completion(this_cpu_ptr(&softlockup_completion));
}

/*
 * smp_call_on_cpu() callback: disable the watchdog on the CPU it runs on.
 * @data is unused. Always returns 0.
 */
static int softlockup_stop_fn(void *data)
{
        watchdog_disable(smp_processor_id());
        return 0;
}

/*
 * Disable the watchdog on every CPU in watchdog_allowed_mask and then empty
 * the mask. Does nothing before softlockup_initialized has been set.
 */
static void softlockup_stop_all(void)
{
        int cpu;

        if (softlockup_initialized) {
                for_each_cpu(cpu, &watchdog_allowed_mask)
                        smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);

                cpumask_clear(&watchdog_allowed_mask);
        }
}

/*
 * smp_call_on_cpu() callback: enable the watchdog on the CPU it runs on.
 * @data is unused. Always returns 0.
 */
static int softlockup_start_fn(void *data)
{
        watchdog_enable(smp_processor_id());
        return 0;
}

/*
 * Copy watchdog_cpumask into watchdog_allowed_mask and enable the watchdog
 * on every CPU in the resulting mask.
 */
static void softlockup_start_all(void)
{
        int cpu;

        cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
        for_each_cpu(cpu, &watchdog_allowed_mask)
                smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
}

/*
 * Enable the watchdog on @cpu if that CPU is part of watchdog_allowed_mask.
 *
 * Return: always 0.
 */
int lockup_detector_online_cpu(unsigned int cpu)
{
        if (!cpumask_test_cpu(cpu, &watchdog_allowed_mask))
                return 0;

        watchdog_enable(cpu);
        return 0;
}

/*
 * Disable the watchdog on @cpu if that CPU is part of watchdog_allowed_mask.
 *
 * Return: always 0.
 */
int lockup_detector_offline_cpu(unsigned int cpu)
{
        if (!cpumask_test_cpu(cpu, &watchdog_allowed_mask))
                return 0;

        watchdog_disable(cpu);
        return 0;
}

/*
 * Stop both detectors, recompute the sample period and the enable state,
 * and start the detectors again with the new settings. The soft lockup
 * watchdogs are only restarted when the watchdog is enabled and
 * watchdog_thresh is non-zero.
 *
 * The visible callers hold watchdog_mutex around this call.
 */
static void __lockup_detector_reconfigure(void)
{
        cpus_read_lock();
        watchdog_hardlockup_stop();

        softlockup_stop_all();
        set_sample_period();
        lockup_detector_update_enable();
        if (watchdog_enabled && watchdog_thresh)
                softlockup_start_all();

        watchdog_hardlockup_start();
        cpus_read_unlock();
        /*
         * Must be called outside the cpus locked section to prevent
         * recursive locking in the perf code.
         */
        __lockup_detector_cleanup();
}

/* Reconfigure the lockup detectors while holding watchdog_mutex. */
void lockup_detector_reconfigure(void)
{
        mutex_lock(&watchdog_mutex);
        __lockup_detector_reconfigure();
        mutex_unlock(&watchdog_mutex);
}

/*
 * Create the watchdog infrastructure and configure the detector(s).
 *
 * On success of the early-out check below, the detectors are reconfigured
 * under watchdog_mutex and softlockup_initialized is set.
 */
static __init void lockup_detector_setup(void)
{
        lockup_detector_update_enable();

        /*
         * If sysctl is off and watchdog got disabled on the command line,
         * nothing to do here.
         */
        if (!(IS_ENABLED(CONFIG_SYSCTL) ||
              (watchdog_enabled && watchdog_thresh)))
                return;

        mutex_lock(&watchdog_mutex);
        __lockup_detector_reconfigure();
        softlockup_initialized = true;
        mutex_unlock(&watchdog_mutex);
}

#else /* CONFIG_SOFTLOCKUP_DETECTOR */
/*
 * Variant for builds without the soft lockup detector: restart only the
 * hardlockup detector around an update of the enable state.
 */
static void __lockup_detector_reconfigure(void)
{
        cpus_read_lock();
        watchdog_hardlockup_stop();
        lockup_detector_update_enable();
        watchdog_hardlockup_start();
        cpus_read_unlock();
}
/* Reconfigure the hardlockup detector (build without the soft lockup detector). */
void lockup_detector_reconfigure(void)
{
        __lockup_detector_reconfigure();
}
/* Initial setup in this configuration is simply a reconfigure. */
static inline void lockup_detector_setup(void)
{
        __lockup_detector_reconfigure();
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */

/*
 * Run hardlockup_detector_perf_cleanup(). The caller must hold
 * watchdog_mutex; lockdep checks this.
 */
static void __lockup_detector_cleanup(void)
{
        lockdep_assert_held(&watchdog_mutex);
        hardlockup_detector_perf_cleanup();
}

/**
 * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
 *
 * Takes watchdog_mutex for the duration of the cleanup.
 *
 * Caller must not hold the cpu hotplug rwsem.
 */
void lockup_detector_cleanup(void)
{
        mutex_lock(&watchdog_mutex);
        __lockup_detector_cleanup();
        mutex_unlock(&watchdog_mutex);
}

/**
 * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
 *
 * Special interface for parisc. It prevents lockup detector warnings from
 * the default pm_poweroff() function which busy loops forever.
 *
 * Only clears watchdog_enabled; watchdog_timer_fn() then returns
 * HRTIMER_NORESTART the next time it runs.
 */
void lockup_detector_soft_poweroff(void)
{
        watchdog_enabled = 0;
}

#ifdef CONFIG_SYSCTL

/*
 * Propagate any changes to the watchdog infrastructure.
 *
 * The sysctl handlers in this file call this with watchdog_mutex held.
 */
static void proc_watchdog_update(void)
{
        /* Remove impossible cpus to keep sysctl output clean. */
        cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
        __lockup_detector_reconfigure();
}

/*
 * common function for watchdog, nmi_watchdog and soft_watchdog parameter
 *
 * caller             | table->data points to            | 'which'
 * -------------------|----------------------------------|-------------------------------
 * proc_watchdog      | watchdog_user_enabled            | WATCHDOG_HARDLOCKUP_ENABLED |
 *                    |                                  | WATCHDOG_SOFTOCKUP_ENABLED
 * -------------------|----------------------------------|-------------------------------
 * proc_nmi_watchdog  | watchdog_hardlockup_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED
 * -------------------|----------------------------------|-------------------------------
 * proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED
 *
 * Runs entirely under watchdog_mutex.
 *
 * Return: the result of proc_dointvec_minmax().
 */
static int proc_watchdog_common(int which, struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        int *param = table->data;
        int ret, prev;

        mutex_lock(&watchdog_mutex);

        if (write) {
                prev = READ_ONCE(*param);
                ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
                /* Reconfigure only if the write succeeded and changed the value. */
                if (!ret && prev != READ_ONCE(*param))
                        proc_watchdog_update();
        } else {
                /*
                 * On read synchronize the userspace interface. This is a
                 * racy snapshot.
                 */
                *param = (watchdog_enabled & which) != 0;
                ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        }

        mutex_unlock(&watchdog_mutex);
        return ret;
}

/*
 * /proc/sys/kernel/watchdog
 *
 * Covers both the hardlockup and the soft lockup detector.
 */
static int proc_watchdog(struct ctl_table *table, int write,
                         void *buffer, size_t *lenp, loff_t *ppos)
{
        const int which = WATCHDOG_HARDLOCKUP_ENABLED |
                          WATCHDOG_SOFTOCKUP_ENABLED;

        return proc_watchdog_common(which, table, write, buffer, lenp, ppos);
}

/*
 * /proc/sys/kernel/nmi_watchdog
 *
 * Return: -ENOTSUPP for a write while no hardlockup detector is available,
 * otherwise the result of proc_watchdog_common().
 */
static int proc_nmi_watchdog(struct ctl_table *table, int write,
                             void *buffer, size_t *lenp, loff_t *ppos)
{
        /* Reads stay possible; only writes are refused. */
        if (write && !watchdog_hardlockup_available)
                return -ENOTSUPP;

        return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED,
                                    table, write, buffer, lenp, ppos);
}

#ifdef CONFIG_SOFTLOCKUP_DETECTOR
/*
 * /proc/sys/kernel/soft_watchdog
 *
 * Thin wrapper that selects the soft lockup detector bit for
 * proc_watchdog_common() and returns its result.
 */
static int proc_soft_watchdog(struct ctl_table *table, int write,
                              void *buffer, size_t *lenp, loff_t *ppos)
{
        return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED,
                                    table, write, buffer, lenp, ppos);
}
#endif

/*
 * /proc/sys/kernel/watchdog_thresh
 *
 * Runs under watchdog_mutex. A successful write that changes
 * watchdog_thresh triggers a reconfiguration of the detectors.
 *
 * Return: the result of proc_dointvec_minmax().
 */
static int proc_watchdog_thresh(struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret, prev_thresh;

        mutex_lock(&watchdog_mutex);

        prev_thresh = READ_ONCE(watchdog_thresh);
        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        if (write && !ret && prev_thresh != READ_ONCE(watchdog_thresh))
                proc_watchdog_update();

        mutex_unlock(&watchdog_mutex);
        return ret;
}

/*
 * The cpumask is the mask of possible cpus that the watchdog can run
 * on, not the mask of cpus it is actually running on.  This allows the
 * user to specify a mask that will include cpus that have not yet
 * been brought online, if desired.
 *
 * Runs under watchdog_mutex; every successful write reconfigures the
 * detectors.
 *
 * Return: the result of proc_do_large_bitmap().
 */
static int proc_watchdog_cpumask(struct ctl_table *table, int write,
                                 void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret;

        mutex_lock(&watchdog_mutex);

        ret = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
        if (write && !ret)
                proc_watchdog_update();

        mutex_unlock(&watchdog_mutex);
        return ret;
}

/* Referenced as .extra2 of the watchdog_thresh entry below. */
static const int sixty = 60;

/*
 * sysctl entries registered under "kernel" by watchdog_sysctl_init().
 * Which entries exist depends on CONFIG_SOFTLOCKUP_DETECTOR,
 * CONFIG_HARDLOCKUP_DETECTOR and CONFIG_SMP.
 */
static struct ctl_table watchdog_sysctls[] = {
        /* Combined on/off switch, see proc_watchdog(). */
        {
                .procname       = "watchdog",
                .data                = &watchdog_user_enabled,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler   = proc_watchdog,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        /* Hard lockup threshold; the soft threshold is derived from it. */
        {
                .procname        = "watchdog_thresh",
                .data                = &watchdog_thresh,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_watchdog_thresh,
                .extra1                = SYSCTL_ZERO,
                .extra2                = (void *)&sixty,
        },
        /* CPUs the watchdog may run on, see proc_watchdog_cpumask(). */
        {
                .procname        = "watchdog_cpumask",
                .data                = &watchdog_cpumask_bits,
                .maxlen                = NR_CPUS,
                .mode                = 0644,
                .proc_handler        = proc_watchdog_cpumask,
        },
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
        /* Soft lockup detector on/off switch, see proc_soft_watchdog(). */
        {
                .procname       = "soft_watchdog",
                .data                = &watchdog_softlockup_user_enabled,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler   = proc_soft_watchdog,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        /* When set, watchdog_timer_fn() panics after reporting a soft lockup. */
        {
                .procname        = "softlockup_panic",
                .data                = &softlockup_panic,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
#ifdef CONFIG_SMP
        /* When set, a soft lockup report also triggers backtraces on the other CPUs. */
        {
                .procname        = "softlockup_all_cpu_backtrace",
                .data                = &sysctl_softlockup_all_cpu_backtrace,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
#endif /* CONFIG_SMP */
#endif
#ifdef CONFIG_HARDLOCKUP_DETECTOR
        {
                .procname        = "hardlockup_panic",
                .data                = &hardlockup_panic,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
#ifdef CONFIG_SMP
        {
                .procname        = "hardlockup_all_cpu_backtrace",
                .data                = &sysctl_hardlockup_all_cpu_backtrace,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
#endif /* CONFIG_SMP */
#endif
};

/*
 * Kept in a table of its own because its mode is patched at init time:
 * it starts read-only (0444) and watchdog_sysctl_init() makes it writable
 * (0644) when a hardlockup detector is available.
 */
static struct ctl_table watchdog_hardlockup_sysctl[] = {
        {
                .procname       = "nmi_watchdog",
                .data                = &watchdog_hardlockup_user_enabled,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler   = proc_nmi_watchdog,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
};

/*
 * Register both sysctl tables under "kernel". The nmi_watchdog entry is
 * made writable first if a hardlockup detector is available.
 */
static void __init watchdog_sysctl_init(void)
{
        register_sysctl_init("kernel", watchdog_sysctls);

        if (watchdog_hardlockup_available)
                watchdog_hardlockup_sysctl[0].mode = 0644;
        register_sysctl_init("kernel", watchdog_hardlockup_sysctl);
}

#else
/* Without CONFIG_SYSCTL there is nothing to register. */
#define watchdog_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

static void __init lockup_detector_delay_init(struct work_struct *work);
/* Set when the initial hardlockup probe failed; cleared once a retry is no longer allowed. */
static bool allow_lockup_detector_init_retry __initdata;

/* Work item that runs the delayed probe, see lockup_detector_retry_init(). */
static struct work_struct detector_work __initdata =
                __WORK_INITIALIZER(detector_work, lockup_detector_delay_init);

/*
 * Work function for the delayed hardlockup detector init.
 *
 * Probes the hardlockup detector again. On failure it only logs the error;
 * on success it forbids further retries, marks the detector as available
 * and runs lockup_detector_setup().
 *
 * @work: unused
 */
static void __init lockup_detector_delay_init(struct work_struct *work)
{
        int ret = watchdog_hardlockup_probe();

        if (ret) {
                pr_info("Delayed init of the lockup detector failed: %d\n", ret);
                pr_info("Hard watchdog permanently disabled\n");
                return;
        }

        allow_lockup_detector_init_retry = false;

        watchdog_hardlockup_available = true;
        lockup_detector_setup();
}

/*
 * lockup_detector_retry_init - retry init lockup detector if possible.
 *
 * Retry hardlockup detector init. It is useful when it requires some
 * functionality that has to be initialized later on a particular
 * platform.
 *
 * The retry itself runs from detector_work; nothing is scheduled when
 * retries are not allowed.
 */
void __init lockup_detector_retry_init(void)
{
        /* Must be called before late init calls */
        if (allow_lockup_detector_init_retry)
                schedule_work(&detector_work);
}

/*
 * Ensure that the optional delayed hardlockup init has been processed before
 * the init code and memory is freed.
 *
 * Also registers the watchdog sysctls. Return: always 0.
 */
static int __init lockup_detector_check(void)
{
        /* Prevent any later retry. */
        allow_lockup_detector_init_retry = false;

        /* Make sure no work is pending. */
        flush_work(&detector_work);

        watchdog_sysctl_init();

        return 0;

}
late_initcall_sync(lockup_detector_check);

/*
 * Boot-time entry point of the lockup detectors.
 *
 * Seeds watchdog_cpumask from housekeeping_cpumask(HK_TYPE_TIMER), probes
 * the hardlockup detector (allowing a later retry if the probe fails) and
 * runs lockup_detector_setup().
 */
void __init lockup_detector_init(void)
{
        if (tick_nohz_full_enabled())
                pr_info("Disabling watchdog on nohz_full cores by default\n");

        cpumask_copy(&watchdog_cpumask,
                     housekeeping_cpumask(HK_TYPE_TIMER));

        /* A non-zero probe result means the detector is not usable yet. */
        if (watchdog_hardlockup_probe())
                allow_lockup_detector_init_retry = true;
        else
                watchdog_hardlockup_available = true;

        lockup_detector_setup();
}












































































































































































































































    1 


    1 
















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "tctx.h"
#include "poll.h"
#include "timeout.h"
#include "waitid.h"
#include "futex.h"
#include "cancel.h"

/* Per-request command data of an async cancel request, filled by io_async_cancel_prep(). */
struct io_cancel {
        struct file                        *file;
        u64                                addr;        /* copied from sqe->addr */
        u32                                flags;        /* IORING_ASYNC_CANCEL_* bits from sqe->cancel_flags */
        s32                                fd;        /* from sqe->fd, only read with IORING_ASYNC_CANCEL_FD */
        u8                                opcode;        /* from sqe->len, only read with IORING_ASYNC_CANCEL_OP */
};

/* Every cancel flag accepted by io_async_cancel_prep(); any other bit yields -EINVAL. */
#define CANCEL_FLAGS        (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
                         IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \
                         IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP)

/*
 * Returns true if the request matches the criteria outlined by 'cd'.
 *
 * A request from a different ring never matches. With
 * IORING_ASYNC_CANCEL_ANY the file, opcode and user_data checks are skipped.
 * Otherwise each of IORING_ASYNC_CANCEL_FD, IORING_ASYNC_CANCEL_OP and the
 * user_data comparison must pass when it applies. For the ALL and ANY modes
 * the request is additionally run through io_cancel_match_sequence() and
 * rejected when that returns true.
 */
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd)
{
        bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA;

        if (req->ctx != cd->ctx)
                return false;

        /* Without an fd or opcode filter, user_data is the matching key. */
        if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP)))
                match_user_data = true;

        if (cd->flags & IORING_ASYNC_CANCEL_ANY)
                goto check_seq;
        if (cd->flags & IORING_ASYNC_CANCEL_FD) {
                if (req->file != cd->file)
                        return false;
        }
        if (cd->flags & IORING_ASYNC_CANCEL_OP) {
                if (req->opcode != cd->opcode)
                        return false;
        }
        if (match_user_data && req->cqe.user_data != cd->data)
                return false;
        if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
check_seq:
                /* NOTE(review): presumably true means this pass already handled the request - confirm. */
                if (io_cancel_match_sequence(req, cd->seq))
                        return false;
        }

        return true;
}

/*
 * Match callback passed to io_wq_cancel_cb(): map the io-wq work item back
 * to its request and test it against the cancel criteria in @data.
 */
static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
        return io_cancel_req_match(container_of(work, struct io_kiocb, work),
                                   data);
}

/*
 * Try to cancel work matching @cd in the io-wq of @tctx.
 *
 * Return: -ENOENT when @tctx is NULL, has no io-wq, or nothing matched;
 * -EALREADY when io-wq reports the work as running; 0 otherwise.
 */
static int io_async_cancel_one(struct io_uring_task *tctx,
                               struct io_cancel_data *cd)
{
        bool all;

        if (!tctx || !tctx->io_wq)
                return -ENOENT;

        all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);

        /* Translate the io-wq verdict into an errno. */
        switch (io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all)) {
        case IO_WQ_CANCEL_OK:
                return 0;
        case IO_WQ_CANCEL_RUNNING:
                return -EALREADY;
        case IO_WQ_CANCEL_NOTFOUND:
                return -ENOENT;
        }

        return 0;
}

/*
 * Try to cancel a request matching @cd.
 *
 * The io-wq of @tctx is tried first. Unless that returns 0, the poll,
 * waitid and futex cancel paths are tried in turn and the first result
 * other than -ENOENT is returned. Timeouts are tried last, under
 * ctx->completion_lock, and only when IORING_ASYNC_CANCEL_FD is not set.
 *
 * Warns once if called outside an io-wq worker with a @tctx that is not
 * the current task's.
 *
 * Return: 0 when the io-wq cancel succeeded, otherwise the result of the
 * first later stage that did not return -ENOENT, or -ENOENT if none did.
 */
int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
                  unsigned issue_flags)
{
        struct io_ring_ctx *ctx = cd->ctx;
        int ret;

        WARN_ON_ONCE(!io_wq_current_is_worker() && tctx != current->io_uring);

        ret = io_async_cancel_one(tctx, cd);
        /*
         * Fall-through even for -EALREADY, as we may have poll armed
         * that need unarming.
         */
        if (!ret)
                return 0;

        ret = io_poll_cancel(ctx, cd, issue_flags);
        if (ret != -ENOENT)
                return ret;

        ret = io_waitid_cancel(ctx, cd, issue_flags);
        if (ret != -ENOENT)
                return ret;

        ret = io_futex_cancel(ctx, cd, issue_flags);
        if (ret != -ENOENT)
                return ret;

        /* ret is -ENOENT here and stays so when the timeout stage is skipped. */
        spin_lock(&ctx->completion_lock);
        if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
                ret = io_timeout_cancel(ctx, cd);
        spin_unlock(&ctx->completion_lock);
        return ret;
}

/*
 * Validate an async cancel SQE and stash its parameters in the request's
 * io_cancel command data. Returns 0 on success, -EINVAL for unsupported
 * fields or flag combinations.
 */
int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel);

        if (unlikely(req->flags & REQ_F_BUFFER_SELECT) ||
            sqe->off || sqe->splice_fd_in)
                return -EINVAL;

        cancel->addr = READ_ONCE(sqe->addr);
        cancel->flags = READ_ONCE(sqe->cancel_flags);
        if (cancel->flags & ~CANCEL_FLAGS)
                return -EINVAL;

        /* ANY cannot be combined with matching on a file or an opcode */
        if ((cancel->flags & IORING_ASYNC_CANCEL_ANY) &&
            (cancel->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP)))
                return -EINVAL;

        if (cancel->flags & IORING_ASYNC_CANCEL_FD)
                cancel->fd = READ_ONCE(sqe->fd);
        /* The opcode to match is carried in the SQE's len field */
        if (cancel->flags & IORING_ASYNC_CANCEL_OP)
                cancel->opcode = READ_ONCE(sqe->len);

        return 0;
}

/*
 * Cancel requests matching @cd: first through @tctx, then through the
 * io-wq of every task attached to the ring.
 *
 * With ALL/ANY set, returns the number of successful cancel attempts;
 * otherwise returns the result of the first attempt that was not -ENOENT,
 * or -ENOENT if nothing matched anywhere.
 */
static int __io_async_cancel(struct io_cancel_data *cd,
                             struct io_uring_task *tctx,
                             unsigned int issue_flags)
{
        bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
        struct io_ring_ctx *ctx = cd->ctx;
        struct io_tctx_node *node;
        int hits = 0;
        int ret;

        /* Fast path: keep hitting our own context until it runs dry */
        for (;;) {
                ret = io_try_cancel(tctx, cd, issue_flags);
                if (ret == -ENOENT)
                        break;
                if (!all)
                        return ret;
                hits++;
        }

        /* slow path, try all io-wq's */
        io_ring_submit_lock(ctx, issue_flags);
        ret = -ENOENT;
        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                ret = io_async_cancel_one(node->task->io_uring, cd);
                if (ret == -ENOENT)
                        continue;
                if (!all)
                        break;
                hits++;
        }
        io_ring_submit_unlock(ctx, issue_flags);

        if (all)
                return hits;
        return ret;
}

int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel);
        struct io_cancel_data cd = {
                .ctx        = req->ctx,
                .data        = cancel->addr,
                .flags        = cancel->flags,
                .opcode        = cancel->opcode,
                .seq        = atomic_inc_return(&req->ctx->cancel_seq),
        };
        struct io_uring_task *tctx = req->task->io_uring;
        int ret;

        if (cd.flags & IORING_ASYNC_CANCEL_FD) {
                if (req->flags & REQ_F_FIXED_FILE ||
                    cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) {
                        req->flags |= REQ_F_FIXED_FILE;
                        req->file = io_file_get_fixed(req, cancel->fd,
                                                        issue_flags);
                } else {
                        req->file = io_file_get_normal(req, cancel->fd);
                }
                if (!req->file) {
                        ret = -EBADF;
                        goto done;
                }
                cd.file = req->file;
        }

        ret = __io_async_cancel(&cd, tctx, issue_flags);
done:
        if (ret < 0)
                req_set_fail(req);
        io_req_set_res(req, ret, 0);
        return IOU_OK;
}

/*
 * Initialise the first @size buckets of @table: each bucket gets a fresh
 * spinlock and an empty hlist head.
 */
void init_hash_table(struct io_hash_table *table, unsigned size)
{
        unsigned int bucket = 0;

        while (bucket < size) {
                spin_lock_init(&table->hbs[bucket].lock);
                INIT_HLIST_HEAD(&table->hbs[bucket].list);
                bucket++;
        }
}

static int __io_sync_cancel(struct io_uring_task *tctx,
                            struct io_cancel_data *cd, int fd)
{
        struct io_ring_ctx *ctx = cd->ctx;

        /* fixed must be grabbed every time since we drop the uring_lock */
        if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
            (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
                if (unlikely(fd >= ctx->nr_user_files))
                        return -EBADF;
                fd = array_index_nospec(fd, ctx->nr_user_files);
                cd->file = io_file_from_index(&ctx->file_table, fd);
                if (!cd->file)
                        return -EBADF;
        }

        return __io_async_cancel(cd, tctx, 0);
}

/*
 * Synchronous cancelation: copy a struct io_uring_sync_cancel_reg from
 * @arg, validate it, and try to cancel matching requests, waiting (with an
 * optional timeout) while matching requests are still running.
 *
 * Annotated as called with ctx->uring_lock held. The lock is dropped inside
 * the retry loop around sleeping and is always re-taken before returning.
 *
 * Returns -EFAULT if @arg cannot be copied, -EINVAL for unknown flags or
 * non-zero padding, -EBADF if the target file cannot be resolved, -ETIME
 * if the timeout expires, otherwise the outcome of the cancel attempt.
 */
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
        __must_hold(&ctx->uring_lock)
{
        struct io_cancel_data cd = {
                .ctx        = ctx,
                .seq        = atomic_inc_return(&ctx->cancel_seq),
        };
        ktime_t timeout = KTIME_MAX;
        struct io_uring_sync_cancel_reg sc;
        struct file *file = NULL;
        DEFINE_WAIT(wait);
        int ret, i;

        if (copy_from_user(&sc, arg, sizeof(sc)))
                return -EFAULT;
        if (sc.flags & ~CANCEL_FLAGS)
                return -EINVAL;
        /* Both padding areas must be zero */
        for (i = 0; i < ARRAY_SIZE(sc.pad); i++)
                if (sc.pad[i])
                        return -EINVAL;
        for (i = 0; i < ARRAY_SIZE(sc.pad2); i++)
                if (sc.pad2[i])
                        return -EINVAL;

        cd.data = sc.addr;
        cd.flags = sc.flags;
        cd.opcode = sc.opcode;

        /* we can grab a normal file descriptor upfront */
        if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
           !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
                file = fget(sc.fd);
                if (!file)
                        return -EBADF;
                cd.file = file;
        }

        ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);

        /*
         * Anything but -EALREADY is final. Note that this early exit skips
         * the result normalisation done after the wait loop below.
         */
        if (ret != -EALREADY)
                goto out;

        /* A timeout of -1/-1 means wait forever (timeout stays KTIME_MAX) */
        if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) {
                struct timespec64 ts = {
                        .tv_sec                = sc.timeout.tv_sec,
                        .tv_nsec        = sc.timeout.tv_nsec
                };

                /* Convert the relative timeout into an absolute deadline */
                timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
        }

        /*
         * Keep looking until we get -ENOENT. We'll get woken every time a
         * request completes and will retry the cancelation.
         */
        do {
                cd.seq = atomic_inc_return(&ctx->cancel_seq);

                /* Queue on cq_wait before retrying the cancel */
                prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE);

                ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);

                /* Every exit from the loop below leaves uring_lock dropped */
                mutex_unlock(&ctx->uring_lock);
                if (ret != -EALREADY)
                        break;

                ret = io_run_task_work_sig(ctx);
                if (ret < 0)
                        break;
                ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS);
                if (!ret) {
                        ret = -ETIME;
                        break;
                }
                mutex_lock(&ctx->uring_lock);
        } while (1);

        finish_wait(&ctx->cq_wait, &wait);
        /* Re-take the lock dropped on the way out of the loop */
        mutex_lock(&ctx->uring_lock);

        /* Nothing left to cancel, or a positive count: report success */
        if (ret == -ENOENT || ret > 0)
                ret = 0;
out:
        if (file)
                fput(file);
        return ret;
}




















    1 





















    1 











    1 













    1 

    1 




















    1 




















































































































































    1 
















    1 




















    1 





    1 

    1 





    1 





















    1 

    1 
    1 



















    1 




















    1 


















    1 


    1 



    1 







    1 




    1 



    1 



    1 



    1 










    1 



    1 













































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
// SPDX-License-Identifier: GPL-2.0-only
/*
 * directory.c
 *
 * PURPOSE
 *        Directory related functions
 *
 */

#include "udfdecl.h"
#include "udf_i.h"

#include <linux/fs.h>
#include <linux/string.h>
#include <linux/bio.h>
#include <linux/crc-itu-t.h>
#include <linux/iversion.h>

static int udf_verify_fi(struct udf_fileident_iter *iter)
{
        unsigned int len;

        if (iter->fi.descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) {
                udf_err(iter->dir->i_sb,
                        "directory (ino %lu) has entry at pos %llu with incorrect tag %x\n",
                        iter->dir->i_ino, (unsigned long long)iter->pos,
                        le16_to_cpu(iter->fi.descTag.tagIdent));
                return -EFSCORRUPTED;
        }
        len = udf_dir_entry_len(&iter->fi);
        if (le16_to_cpu(iter->fi.lengthOfImpUse) & 3) {
                udf_err(iter->dir->i_sb,
                        "directory (ino %lu) has entry at pos %llu with unaligned length of impUse field\n",
                        iter->dir->i_ino, (unsigned long long)iter->pos);
                return -EFSCORRUPTED;
        }
        /*
         * This is in fact allowed by the spec due to long impUse field but
         * we don't support it. If there is real media with this large impUse
         * field, support can be added.
         */
        if (len > 1 << iter->dir->i_blkbits) {
                udf_err(iter->dir->i_sb,
                        "directory (ino %lu) has too big (%u) entry at pos %llu\n",
                        iter->dir->i_ino, len, (unsigned long long)iter->pos);
                return -EFSCORRUPTED;
        }
        if (iter->pos + len > iter->dir->i_size) {
                udf_err(iter->dir->i_sb,
                        "directory (ino %lu) has entry past directory size at pos %llu\n",
                        iter->dir->i_ino, (unsigned long long)iter->pos);
                return -EFSCORRUPTED;
        }
        if (udf_dir_entry_len(&iter->fi) !=
            sizeof(struct tag) + le16_to_cpu(iter->fi.descTag.descCRCLength)) {
                udf_err(iter->dir->i_sb,
                        "directory (ino %lu) has entry where CRC length (%u) does not match entry length (%u)\n",
                        iter->dir->i_ino,
                        (unsigned)le16_to_cpu(iter->fi.descTag.descCRCLength),
                        (unsigned)(udf_dir_entry_len(&iter->fi) -
                                                        sizeof(struct tag)));
                return -EFSCORRUPTED;
        }
        return 0;
}

/*
 * Copy the file identifier descriptor at iter->pos into iter->fi, verify
 * it, and point iter->name at the entry's name.
 *
 * At or past the directory size, iter->name is set to NULL and 0 is
 * returned. Returns -EFSCORRUPTED if the fixed-size descriptor would
 * straddle EOF or if udf_verify_fi() rejects the entry.
 */
static int udf_copy_fi(struct udf_fileident_iter *iter)
{
        struct udf_inode_info *iinfo = UDF_I(iter->dir);
        u32 blksize = 1 << iter->dir->i_blkbits;
        u32 off, len, nameoff;
        int err;

        /* Skip copying when we are at EOF */
        if (iter->pos >= iter->dir->i_size) {
                iter->name = NULL;
                return 0;
        }
        if (iter->dir->i_size < iter->pos + sizeof(struct fileIdentDesc)) {
                udf_err(iter->dir->i_sb,
                        "directory (ino %lu) has entry straddling EOF\n",
                        iter->dir->i_ino);
                return -EFSCORRUPTED;
        }
        /* In-ICB data is contiguous in i_data, after the extended attributes */
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                memcpy(&iter->fi, iinfo->i_data + iinfo->i_lenEAttr + iter->pos,
                       sizeof(struct fileIdentDesc));
                err = udf_verify_fi(iter);
                if (err < 0)
                        return err;
                /* Name follows the fixed descriptor and the impUse area */
                iter->name = iinfo->i_data + iinfo->i_lenEAttr + iter->pos +
                        sizeof(struct fileIdentDesc) +
                        le16_to_cpu(iter->fi.lengthOfImpUse);
                return 0;
        }

        /* The fixed descriptor may be split between bh[0] and bh[1] */
        off = iter->pos & (blksize - 1);
        len = min_t(u32, sizeof(struct fileIdentDesc), blksize - off);
        memcpy(&iter->fi, iter->bh[0]->b_data + off, len);
        if (len < sizeof(struct fileIdentDesc))
                memcpy((char *)(&iter->fi) + len, iter->bh[1]->b_data,
                       sizeof(struct fileIdentDesc) - len);
        err = udf_verify_fi(iter);
        if (err < 0)
                return err;

        /* Handle directory entry name */
        nameoff = off + sizeof(struct fileIdentDesc) +
                                le16_to_cpu(iter->fi.lengthOfImpUse);
        if (off + udf_dir_entry_len(&iter->fi) <= blksize) {
                /* Whole entry lies in the first block */
                iter->name = iter->bh[0]->b_data + nameoff;
        } else if (nameoff >= blksize) {
                /* Name starts in the second block */
                iter->name = iter->bh[1]->b_data + (nameoff - blksize);
        } else {
                /* Name straddles both blocks: assemble it in namebuf */
                iter->name = iter->namebuf;
                len = blksize - nameoff;
                memcpy(iter->name, iter->bh[0]->b_data + nameoff, len);
                memcpy(iter->name + len, iter->bh[1]->b_data,
                       iter->fi.lengthFileIdent - len);
        }
        return 0;
}

/*
 * Readahead 8k once we are at 8k boundary.
 *
 * The window is clamped to the blocks left in the current extent. Only
 * buffers that are neither up to date nor locked are submitted.
 */
static void udf_readahead_dir(struct udf_fileident_iter *iter)
{
        unsigned int ralen = 16 >> (iter->dir->i_blkbits - 9);
        struct buffer_head *bh, *batch[16];
        int idx, queued = 0;
        udf_pblk_t blk;

        /* Only kick off readahead at the start of a window */
        if (iter->loffset & (ralen - 1))
                return;

        if (iter->loffset + ralen > (iter->elen >> iter->dir->i_blkbits))
                ralen = (iter->elen >> iter->dir->i_blkbits) - iter->loffset;

        for (idx = 0; idx < ralen; idx++) {
                blk = udf_get_lb_pblock(iter->dir->i_sb, &iter->eloc,
                                        iter->loffset + idx);
                bh = sb_getblk(iter->dir->i_sb, blk);
                if (!bh || buffer_uptodate(bh) || buffer_locked(bh)) {
                        brelse(bh);
                        continue;
                }
                batch[queued++] = bh;
        }
        if (!queued)
                return;

        bh_readahead_batch(queued, batch, REQ_RAHEAD);
        for (idx = 0; idx < queued; idx++)
                brelse(batch[idx]);
}

/*
 * Read the directory block at the iterator's current extent offset,
 * triggering directory readahead first. Returns whatever sb_bread()
 * returns for that block.
 */
static struct buffer_head *udf_fiiter_bread_blk(struct udf_fileident_iter *iter)
{
        struct super_block *sb;

        udf_readahead_dir(iter);
        sb = iter->dir->i_sb;
        return sb_bread(sb, udf_get_lb_pblock(sb, &iter->eloc, iter->loffset));
}

/*
 * Move loffset on to the next directory block. When the current extent is
 * exhausted, eloc, elen & epos are advanced to the next extent too.
 *
 * If there is no further recorded extent and the iterator sits exactly at
 * the directory size, elen is zeroed and 0 is returned; otherwise a missing
 * extent is reported as -EFSCORRUPTED.
 */
static int udf_fiiter_advance_blk(struct udf_fileident_iter *iter)
{
        /* Still inside the current extent? */
        if (++iter->loffset < DIV_ROUND_UP(iter->elen, 1<<iter->dir->i_blkbits))
                return 0;

        iter->loffset = 0;
        if (udf_next_aext(iter->dir, &iter->epos, &iter->eloc, &iter->elen, 1)
                        == (EXT_RECORDED_ALLOCATED >> 30))
                return 0;

        if (iter->pos == iter->dir->i_size) {
                iter->elen = 0;
                return 0;
        }

        udf_err(iter->dir->i_sb,
                "extent after position %llu not allocated in directory (ino %lu)\n",
                (unsigned long long)iter->pos, iter->dir->i_ino);
        return -EFSCORRUPTED;
}

/*
 * Make sure the buffer heads needed for the entry at iter->pos are loaded:
 * bh[0] for the block containing the entry start and, when the entry
 * extends past that block, bh[1] for the following block.
 *
 * Returns 0 on success. On failure both buffer heads are released and
 * cleared and a negative errno is returned (-ENOMEM when a block read
 * returns no buffer, -EIO when the buffer is not up to date, or the error
 * from udf_fiiter_advance_blk()).
 */
static int udf_fiiter_load_bhs(struct udf_fileident_iter *iter)
{
        int blksize = 1 << iter->dir->i_blkbits;
        int off = iter->pos & (blksize - 1);
        int err;
        struct fileIdentDesc *fi;

        /* Is there any further extent we can map from? */
        if (!iter->bh[0] && iter->elen) {
                iter->bh[0] = udf_fiiter_bread_blk(iter);
                if (!iter->bh[0]) {
                        err = -ENOMEM;
                        goto out_brelse;
                }
                if (!buffer_uptodate(iter->bh[0])) {
                        err = -EIO;
                        goto out_brelse;
                }
        }
        /* There's no next block so we are done */
        if (iter->pos >= iter->dir->i_size)
                return 0;
        /* Need to fetch next block as well? */
        if (off + sizeof(struct fileIdentDesc) > blksize)
                goto fetch_next;
        /* The fixed header fits in bh[0]; inspect it in place to get the length */
        fi = (struct fileIdentDesc *)(iter->bh[0]->b_data + off);
        /* Need to fetch next block to get name? */
        if (off + udf_dir_entry_len(fi) > blksize) {
fetch_next:
                err = udf_fiiter_advance_blk(iter);
                if (err)
                        goto out_brelse;
                iter->bh[1] = udf_fiiter_bread_blk(iter);
                if (!iter->bh[1]) {
                        err = -ENOMEM;
                        goto out_brelse;
                }
                if (!buffer_uptodate(iter->bh[1])) {
                        err = -EIO;
                        goto out_brelse;
                }
        }
        return 0;
out_brelse:
        /* Drop whatever was loaded so the iterator holds no stale buffers */
        brelse(iter->bh[0]);
        brelse(iter->bh[1]);
        iter->bh[0] = iter->bh[1] = NULL;
        return err;
}

/*
 * Initialise @iter to walk directory @dir starting at byte position @pos
 * and load the entry found there.
 *
 * Returns 0 on success, including the case where @pos equals the directory
 * size and no block is mapped there. On error the iterator is released via
 * udf_fiiter_release() and a negative errno is returned.
 */
int udf_fiiter_init(struct udf_fileident_iter *iter, struct inode *dir,
                    loff_t pos)
{
        struct udf_inode_info *iinfo = UDF_I(dir);
        int err = 0;

        iter->dir = dir;
        iter->bh[0] = iter->bh[1] = NULL;
        iter->pos = pos;
        iter->elen = 0;
        iter->epos.bh = NULL;
        iter->name = NULL;
        /*
         * When directory is verified, we don't expect directory iteration to
         * fail and it can be difficult to undo without corrupting filesystem.
         * So just do not allow memory allocation failures here.
         */
        iter->namebuf = kmalloc(UDF_NAME_LEN_CS0, GFP_KERNEL | __GFP_NOFAIL);

        /* In-ICB directories need no block mapping: copy the entry directly */
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                err = udf_copy_fi(iter);
                goto out;
        }

        if (inode_bmap(dir, iter->pos >> dir->i_blkbits, &iter->epos,
                       &iter->eloc, &iter->elen, &iter->loffset) !=
            (EXT_RECORDED_ALLOCATED >> 30)) {
                /* Starting exactly at EOF with nothing mapped is not an error */
                if (pos == dir->i_size)
                        return 0;
                udf_err(dir->i_sb,
                        "position %llu not allocated in directory (ino %lu)\n",
                        (unsigned long long)pos, dir->i_ino);
                err = -EFSCORRUPTED;
                goto out;
        }
        err = udf_fiiter_load_bhs(iter);
        if (err < 0)
                goto out;
        err = udf_copy_fi(iter);
out:
        if (err < 0)
                udf_fiiter_release(iter);
        return err;
}

/*
 * Step @iter past the current entry and load the next one.
 *
 * For block-backed directories the buffer heads are rotated (or the block
 * position advanced) whenever the old entry reached the end of its block.
 * Returns the result of udf_copy_fi(), or an earlier negative errno.
 */
int udf_fiiter_advance(struct udf_fileident_iter *iter)
{
        int blksize = 1 << iter->dir->i_blkbits;
        unsigned int oldoff = iter->pos & (blksize - 1);
        unsigned int len = udf_dir_entry_len(&iter->fi);
        int err;

        iter->pos += len;

        /* In-ICB directories have no buffer heads to manage */
        if (UDF_I(iter->dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
                return udf_copy_fi(iter);

        if (oldoff + len >= blksize) {
                /* Done with the first block; promote the second if loaded */
                brelse(iter->bh[0]);
                iter->bh[0] = iter->bh[1];
                iter->bh[1] = NULL;
                if (!iter->bh[0]) {
                        err = udf_fiiter_advance_blk(iter);
                        if (err < 0)
                                return err;
                }
        }

        err = udf_fiiter_load_bhs(iter);
        if (err < 0)
                return err;

        return udf_copy_fi(iter);
}

/*
 * Drop everything @iter holds: both buffer heads and the name buffer.
 * All released pointers, and iter->dir, are reset to NULL.
 */
void udf_fiiter_release(struct udf_fileident_iter *iter)
{
        int i;

        iter->dir = NULL;
        for (i = 0; i < 2; i++) {
                brelse(iter->bh[i]);
                iter->bh[i] = NULL;
        }
        kfree(iter->namebuf);
        iter->namebuf = NULL;
}

/*
 * Copy @len bytes from @src to logical offset @off of the region formed by
 * @buf1 (@len1 bytes) followed by @buf2 (@len2 bytes), splitting the copy
 * across the two buffers where needed. Warns once and skips the second
 * part if it would not fit in @buf2 or @buf2 is NULL.
 */
static void udf_copy_to_bufs(void *buf1, int len1, void *buf2, int len2,
                             int off, void *src, int len)
{
        if (off < len1) {
                /* Part (or all) of the data lands in the first buffer */
                int chunk = min(off + len, len1) - off;

                memcpy(buf1 + off, src, chunk);
                src += chunk;
                len -= chunk;
                off = 0;
        } else {
                /* Entirely in the second buffer: rebase the offset */
                off -= len1;
        }

        if (len <= 0)
                return;
        if (WARN_ON_ONCE(off + len > len2 || !buf2))
                return;
        memcpy(buf2 + off, src, len);
}

/*
 * Compute the crc_itu_t() checksum (seeded with 0) of @len bytes starting
 * at logical offset @off of the region formed by @buf1 (@len1 bytes)
 * followed by @buf2 (@len2 bytes). Warns once and returns 0 if the range
 * does not fit in @buf2 or @buf2 is NULL.
 */
static uint16_t udf_crc_fi_bufs(void *buf1, int len1, void *buf2, int len2,
                                int off, int len)
{
        uint16_t crc = 0;

        if (off < len1) {
                /* Checksum the part that lives in the first buffer */
                int chunk = min(off + len, len1) - off;

                crc = crc_itu_t(crc, buf1 + off, chunk);
                len -= chunk;
                off = 0;
        } else {
                off -= len1;
        }

        if (len <= 0)
                return crc;
        if (WARN_ON_ONCE(off + len > len2 || !buf2))
                return 0;
        return crc_itu_t(crc, buf2 + off, len);
}

/*
 * Write directory entry @fi at logical offset @off of the region formed by
 * @buf1 (@len1 bytes) followed by @buf2 (@len2 bytes).
 *
 * The fixed descriptor is always written. @impuse and @name are written
 * only when non-NULL; when @name is written the remainder of the entry is
 * zero-padded. The CRC is then computed over the written entry (everything
 * after the tag), @fi's tag fields are updated, and the tag is written
 * out once more.
 */
static void udf_copy_fi_to_bufs(char *buf1, int len1, char *buf2, int len2,
                                int off, struct fileIdentDesc *fi,
                                uint8_t *impuse, uint8_t *name)
{
        uint16_t crc;
        int fioff = off;
        int crcoff = off + sizeof(struct tag);
        unsigned int crclen = udf_dir_entry_len(fi) - sizeof(struct tag);
        char zeros[UDF_NAME_PAD] = {};
        int endoff = off + udf_dir_entry_len(fi);

        udf_copy_to_bufs(buf1, len1, buf2, len2, off, fi,
                         sizeof(struct fileIdentDesc));
        off += sizeof(struct fileIdentDesc);
        if (impuse)
                udf_copy_to_bufs(buf1, len1, buf2, len2, off, impuse,
                                 le16_to_cpu(fi->lengthOfImpUse));
        /* Skip over the impUse area whether or not it was rewritten */
        off += le16_to_cpu(fi->lengthOfImpUse);
        if (name) {
                udf_copy_to_bufs(buf1, len1, buf2, len2, off, name,
                                 fi->lengthFileIdent);
                off += fi->lengthFileIdent;
                /* Zero the padding between the name and the end of the entry */
                udf_copy_to_bufs(buf1, len1, buf2, len2, off, zeros,
                                 endoff - off);
        }

        /* CRC is taken from the buffers, i.e. over what was actually written */
        crc = udf_crc_fi_bufs(buf1, len1, buf2, len2, crcoff, crclen);
        fi->descTag.descCRC = cpu_to_le16(crc);
        fi->descTag.descCRCLength = cpu_to_le16(crclen);
        fi->descTag.tagChecksum = udf_tag_checksum(&fi->descTag);

        /* Rewrite just the tag now that its CRC and checksum are final */
        udf_copy_to_bufs(buf1, len1, buf2, len2, fioff, fi, sizeof(struct tag));
}

/*
 * Write the iterator's current entry (iter->fi) back to the directory at
 * iter->pos, with optional implementation-use data @impuse, then mark the
 * backing inode or buffers dirty and bump the directory's i_version.
 *
 * The name is rewritten only when it lives in iter->namebuf.
 */
void udf_fiiter_write_fi(struct udf_fileident_iter *iter, uint8_t *impuse)
{
        struct inode *dir = iter->dir;
        struct udf_inode_info *iinfo = UDF_I(dir);
        bool in_icb = iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB;
        int blksize = 1 << dir->i_blkbits;
        int off = iter->pos & (blksize - 1);
        void *first, *second = NULL;
        int first_len, second_len = 0;

        if (in_icb) {
                /* Directory data is embedded in the inode itself */
                first = iinfo->i_data + iinfo->i_lenEAttr;
                first_len = dir->i_size;
        } else {
                first = iter->bh[0]->b_data;
                first_len = blksize;
                if (iter->bh[1]) {
                        second = iter->bh[1]->b_data;
                        second_len = blksize;
                }
        }

        udf_copy_fi_to_bufs(first, first_len, second, second_len, off,
                            &iter->fi, impuse,
                            iter->name == iter->namebuf ? iter->name : NULL);

        if (in_icb) {
                mark_inode_dirty(dir);
        } else {
                mark_buffer_dirty_inode(iter->bh[0], dir);
                if (iter->bh[1])
                        mark_buffer_dirty_inode(iter->bh[1], dir);
        }
        inode_inc_iversion(dir);
}

/*
 * Change the length of the iterator's current extent to @new_elen, rewrite
 * the extent descriptor and adjust the inode's total extent length by the
 * difference. Does nothing when the iterator has no current extent.
 */
void udf_fiiter_update_elen(struct udf_fileident_iter *iter, uint32_t new_elen)
{
        struct udf_inode_info *iinfo = UDF_I(iter->dir);
        int delta;

        /* Skip update when we already went past the last extent */
        if (!iter->elen)
                return;

        delta = new_elen - iter->elen;
        iter->elen = new_elen;

        /* Step epos back by one descriptor so the write hits the current one */
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
                iter->epos.offset -= sizeof(struct short_ad);
        else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
                iter->epos.offset -= sizeof(struct long_ad);

        udf_write_aext(iter->dir, &iter->epos, &iter->eloc, iter->elen, 1);
        iinfo->i_lenExtents += delta;
        mark_inode_dirty(iter->dir);
}

/*
 * Append new block to directory. @iter is expected to point at EOF.
 *
 * The last extent is first rounded up to a block boundary, then a new block
 * is allocated through udf_bread() and the iterator's mapping information
 * is refreshed. The new buffer becomes bh[0] when iter->pos is block
 * aligned (replacing the old bh[0]) and bh[1] otherwise.
 *
 * Returns 0 on success, -EINVAL for in-ICB directories, the error reported
 * by udf_bread() if allocation fails (the extent length is restored in that
 * case), or -EFSCORRUPTED if the new block does not map to a recorded and
 * allocated extent.
 */
int udf_fiiter_append_blk(struct udf_fileident_iter *iter)
{
        struct udf_inode_info *iinfo = UDF_I(iter->dir);
        int blksize = 1 << iter->dir->i_blkbits;
        struct buffer_head *bh;
        sector_t block;
        uint32_t old_elen = iter->elen;
        int err;

        if (WARN_ON_ONCE(iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB))
                return -EINVAL;

        /* Round up last extent in the file */
        udf_fiiter_update_elen(iter, ALIGN(iter->elen, blksize));

        /* Allocate new block and refresh mapping information */
        block = iinfo->i_lenExtents >> iter->dir->i_blkbits;
        bh = udf_bread(iter->dir, block, 1, &err);
        if (!bh) {
                udf_fiiter_update_elen(iter, old_elen);
                return err;
        }
        if (inode_bmap(iter->dir, block, &iter->epos, &iter->eloc, &iter->elen,
                       &iter->loffset) != (EXT_RECORDED_ALLOCATED >> 30)) {
                udf_err(iter->dir->i_sb,
                        "block %llu not allocated in directory (ino %lu)\n",
                        (unsigned long long)block, iter->dir->i_ino);
                /* bh was never handed to the iterator; drop our reference */
                brelse(bh);
                return -EFSCORRUPTED;
        }
        if (!(iter->pos & (blksize - 1))) {
                /* EOF is block aligned: the new block holds the next entry */
                brelse(iter->bh[0]);
                iter->bh[0] = bh;
        } else {
                /* Entry will start in bh[0] and continue into the new block */
                iter->bh[1] = bh;
        }
        return 0;
}

/*
 * Interpret @ptr as a short allocation descriptor.
 *
 * Returns NULL if @ptr or @offset is NULL (logging an error), if a
 * descriptor at *@offset would extend past @maxoffset, or if the
 * descriptor's extLength is zero. Otherwise returns the descriptor and,
 * when @inc is non-zero, advances *@offset past it.
 */
struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
                              int inc)
{
        struct short_ad *sa = (struct short_ad *)ptr;

        if (!ptr || !offset) {
                pr_err("%s: invalidparms\n", __func__);
                return NULL;
        }

        if ((*offset + sizeof(struct short_ad)) > maxoffset ||
            sa->extLength == 0)
                return NULL;

        if (inc)
                *offset += sizeof(struct short_ad);
        return sa;
}

/*
 * Interpret @ptr as a long allocation descriptor.
 *
 * Returns NULL if @ptr or @offset is NULL (logging an error), if a
 * descriptor at *@offset would extend past @maxoffset, or if the
 * descriptor's extLength is zero. Otherwise returns the descriptor and,
 * when @inc is non-zero, advances *@offset past it.
 */
struct long_ad *udf_get_filelongad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc)
{
        struct long_ad *la = (struct long_ad *)ptr;

        if (!ptr || !offset) {
                pr_err("%s: invalidparms\n", __func__);
                return NULL;
        }

        if ((*offset + sizeof(struct long_ad)) > maxoffset ||
            la->extLength == 0)
                return NULL;

        if (inc)
                *offset += sizeof(struct long_ad);
        return la;
}


















































   17 














   17 





    8 
   17 
















































































   15 







    9 



   17 





















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Use the hardware-provided CRC32 instruction to accelerate CRC32C computation.
 * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
 * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2A: Instruction Set Reference, A-M
 *
 * Copyright (C) 2008 Intel Corporation
 * Authors: Austin Zhang <austin_zhang@linux.intel.com>
 *          Kent Liu <kent.liu@intel.com>
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>

#include <asm/cpufeatures.h>
#include <asm/cpu_device_id.h>
#include <asm/simd.h>

/*
 * NOTE(review): presumably the block and digest sizes (in bytes) advertised
 * for this hash; their users are outside this part of the file - confirm.
 */
#define CHKSUM_BLOCK_SIZE        1
#define CHKSUM_DIGEST_SIZE        4

/* Number of bytes consumed by one word-sized CRC32 step */
#define SCALE_F        sizeof(unsigned long)

/* Word-sized CRC32 step: 64-bit operand on x86-64, 32-bit otherwise */
#ifdef CONFIG_X86_64
#define CRC32_INST "crc32q %1, %q0"
#else
#define CRC32_INST "crc32l %1, %0"
#endif

#ifdef CONFIG_X86_64
/*
 * Use the carryless multiply version of crc32c when the buffer size is
 * >= 512 bytes, to account for the FPU state save/restore overhead.
 */
#define CRC32C_PCL_BREAKEVEN        512

/* Carryless-multiply CRC32C routine, defined outside this file */
asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
                                unsigned int crc_init);
#endif /* CONFIG_X86_64 */

/*
 * Fold @length bytes starting at @data into @crc, one byte per crc32b
 * instruction, and return the updated CRC.
 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
        for (; length; length--, data++)
                asm("crc32b %1, %0"
                    : "+r" (crc) : "rm" (*data));

        return crc;
}

/*
 * Fold @len bytes at @p into @crc and return the updated CRC.
 *
 * The buffer is consumed in unsigned-long sized words (SCALE_F bytes each)
 * with CRC32_INST; the remaining len % SCALE_F bytes are handled by
 * crc32c_intel_le_hw_byte().
 *
 * The word and tail counts are kept as size_t so that they cannot be
 * truncated relative to @len, and the buffer is only ever read, so the
 * pointers stay const-qualified.
 */
static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
{
        size_t nwords = len / SCALE_F;
        size_t ntail = len % SCALE_F;
        const unsigned long *wp = (const unsigned long *)p;

        while (nwords--) {
                asm(CRC32_INST
                    : "+r" (crc) : "rm" (*wp));
                wp++;
        }

        /* wp now points at the first byte not covered by a whole word. */
        if (ntail)
                crc = crc32c_intel_le_hw_byte(crc, (const unsigned char *)wp,
                                 ntail);

        return crc;
}

/*
 * Setting the seed allows arbitrary accumulators and flexible XOR policy
 * If your algorithm starts with ~0, then XOR with ~0 before you set
 * the seed.
 */
static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
                        unsigned int keylen)
{
        u32 *seed;

        /* The key is exactly one 32-bit value; reject any other length. */
        if (keylen != sizeof(u32))
                return -EINVAL;

        /* Store the little-endian key in the transform context. */
        seed = crypto_shash_ctx(hash);
        *seed = le32_to_cpup((__le32 *)key);
        return 0;
}

/*
 * Begin a hash operation: copy the seed held in the transform context into
 * the per-descriptor CRC state. Always returns 0.
 */
static int crc32c_intel_init(struct shash_desc *desc)
{
        const u32 *seed = crypto_shash_ctx(desc->tfm);
        u32 *state = shash_desc_ctx(desc);

        *state = *seed;
        return 0;
}

/*
 * Fold @len bytes of @data into the running CRC kept in the descriptor
 * context. Always returns 0.
 */
static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
                               unsigned int len)
{
        u32 *state = shash_desc_ctx(desc);
        u32 crc = crc32c_intel_le_hw(*state, data, len);

        *state = crc;
        return 0;
}

/*
 * Fold the final @len bytes of @data into the CRC read from @crcp and write
 * the bitwise complement of the result, in little-endian form, to @out.
 * *@crcp itself is not modified. Always returns 0.
 */
static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
                                u8 *out)
{
        u32 crc = crc32c_intel_le_hw(*crcp, data, len);

        *(__le32 *)out = ~cpu_to_le32(crc);
        return 0;
}

/* finup: finish from the running CRC stored in the descriptor context. */
static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
                              unsigned int len, u8 *out)
{
        u32 *state = shash_desc_ctx(desc);

        return __crc32c_intel_finup(state, data, len, out);
}

/*
 * Emit the digest: the bitwise complement of the running CRC, stored
 * little-endian at @out. Always returns 0.
 */
static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
{
        const u32 *state = shash_desc_ctx(desc);

        *(__le32 *)out = ~cpu_to_le32(*state);
        return 0;
}

/*
 * One-shot digest: start from the seed in the transform context rather than
 * from the descriptor state, then finish over @data.
 */
static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
                               unsigned int len, u8 *out)
{
        u32 *seed = crypto_shash_ctx(desc->tfm);

        return __crc32c_intel_finup(seed, data, len, out);
}

/*
 * Transform constructor: set the seed in the transform context to all ones.
 * Always returns 0.
 */
static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
{
        u32 *seed = crypto_tfm_ctx(tfm);

        *seed = ~0;
        return 0;
}

#ifdef CONFIG_X86_64
/*
 * update for CPUs with PCLMULQDQ: buffers of at least CRC32C_PCL_BREAKEVEN
 * bytes go through crc_pcl() when SIMD is usable; everything else uses the
 * plain CRC32-instruction path. Always returns 0.
 */
static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
                               unsigned int len)
{
        u32 *state = shash_desc_ctx(desc);

        /* Too small to amortize the FPU save/restore, or SIMD unavailable. */
        if (len < CRC32C_PCL_BREAKEVEN || !crypto_simd_usable()) {
                *state = crc32c_intel_le_hw(*state, data, len);
                return 0;
        }

        kernel_fpu_begin();
        *state = crc_pcl(data, len, *state);
        kernel_fpu_end();
        return 0;
}

/*
 * Fold the final @len bytes of @data into the CRC read from @crcp, choosing
 * crc_pcl() for large buffers when SIMD is usable, and write the complemented
 * little-endian result to @out. *@crcp is not modified. Always returns 0.
 */
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
                                u8 *out)
{
        u32 crc;

        if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
                kernel_fpu_begin();
                crc = crc_pcl(data, len, *crcp);
                kernel_fpu_end();
        } else {
                crc = crc32c_intel_le_hw(*crcp, data, len);
        }

        *(__le32 *)out = ~cpu_to_le32(crc);
        return 0;
}

/* finup (PCL variant): finish from the running CRC in the descriptor context. */
static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
                              unsigned int len, u8 *out)
{
        u32 *state = shash_desc_ctx(desc);

        return __crc32c_pcl_intel_finup(state, data, len, out);
}

/*
 * One-shot digest (PCL variant): start from the seed in the transform
 * context, then finish over @data.
 */
static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
                               unsigned int len, u8 *out)
{
        u32 *seed = crypto_shash_ctx(desc->tfm);

        return __crc32c_pcl_intel_finup(seed, data, len, out);
}
#endif /* CONFIG_X86_64 */

/*
 * shash algorithm descriptor for "crc32c" backed by the CRC32 instruction.
 * The update/finup/digest hooks set here are the plain-instruction versions;
 * crc32c_intel_mod_init() may replace them before registration.
 */
static struct shash_alg alg = {
        /* Both the per-request state and the digest are a single u32. */
        .descsize       = sizeof(u32),
        .digestsize     = CHKSUM_DIGEST_SIZE,

        .setkey         = crc32c_intel_setkey,
        .init           = crc32c_intel_init,
        .update         = crc32c_intel_update,
        .final          = crc32c_intel_final,
        .finup          = crc32c_intel_finup,
        .digest         = crc32c_intel_digest,

        .base = {
                .cra_name         = "crc32c",
                .cra_driver_name  = "crc32c-intel",
                .cra_priority     = 200,
                .cra_flags        = CRYPTO_ALG_OPTIONAL_KEY,
                .cra_blocksize    = CHKSUM_BLOCK_SIZE,
                .cra_ctxsize      = sizeof(u32),
                .cra_module       = THIS_MODULE,
                .cra_init         = crc32c_intel_cra_init,
        }
};

/*
 * CPU match table: the module requires X86_FEATURE_XMM4_2. Checked with
 * x86_match_cpu() at module init; the empty entry terminates the table.
 */
static const struct x86_cpu_id crc32c_cpu_id[] = {
        X86_MATCH_FEATURE(X86_FEATURE_XMM4_2, NULL),
        {}
};
MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id);

/*
 * Module init: bail out with -ENODEV if the CPU does not match
 * crc32c_cpu_id, otherwise register the shash algorithm and return the
 * result of crypto_register_shash().
 */
static int __init crc32c_intel_mod_init(void)
{
        if (!x86_match_cpu(crc32c_cpu_id))
                return -ENODEV;
#ifdef CONFIG_X86_64
        /*
         * With PCLMULQDQ available, switch the data-processing hooks to the
         * variants that can use crc_pcl(). This must happen before the
         * algorithm is registered below.
         */
        if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
                alg.update = crc32c_pcl_intel_update;
                alg.finup = crc32c_pcl_intel_finup;
                alg.digest = crc32c_pcl_intel_digest;
        }
#endif
        return crypto_register_shash(&alg);
}

/* Module exit: unregister the shash algorithm registered at init. */
static void __exit crc32c_intel_mod_fini(void)
{
        crypto_unregister_shash(&alg);
}

module_init(crc32c_intel_mod_init);
module_exit(crc32c_intel_mod_fini);

MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
MODULE_LICENSE("GPL");

MODULE_ALIAS_CRYPTO("crc32c");
MODULE_ALIAS_CRYPTO("crc32c-intel");





































































































































































































































































































































































































































   10 

















    8 



































    1 



















































































    8 

    8 

























































    2 




    1 
    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VMSTAT_H
#define _LINUX_VMSTAT_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/mmzone.h>
#include <linux/vm_event_item.h>
#include <linux/atomic.h>
#include <linux/static_key.h>
#include <linux/mmdebug.h>

extern int sysctl_stat_interval;

#ifdef CONFIG_NUMA
#define ENABLE_NUMA_STAT   1
#define DISABLE_NUMA_STAT   0
extern int sysctl_vm_numa_stat;
DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
                void *buffer, size_t *length, loff_t *ppos);
#endif

/*
 * Counters describing the outcome of a reclaim pass.
 * NOTE(review): the individual field meanings are not defined in this header;
 * see the code that fills the structure in before relying on them.
 */
struct reclaim_stat {
        unsigned int nr_dirty;
        unsigned int nr_unqueued_dirty;
        unsigned int nr_congested;
        unsigned int nr_writeback;
        unsigned int nr_immediate;
        unsigned int nr_pageout;
        /* One slot per LRU type (ANON_AND_FILE entries). */
        unsigned int nr_activate[ANON_AND_FILE];
        unsigned int nr_ref_keep;
        unsigned int nr_unmap_fail;
        unsigned int nr_lazyfree_fail;
};

/*
 * Writeback statistics items. The values index the writeback section of
 * vmstat_text[] (see writeback_stat_name()); NR_VM_WRITEBACK_STAT_ITEMS is
 * the number of items and must stay last.
 */
enum writeback_stat_item {
        NR_DIRTY_THRESHOLD,
        NR_DIRTY_BG_THRESHOLD,
        NR_VM_WRITEBACK_STAT_ITEMS,
};

#ifdef CONFIG_VM_EVENT_COUNTERS
/*
 * Light weight per cpu counter implementation.
 *
 * Counters should only be incremented and no critical kernel component
 * should rely on the counter values.
 *
 * Counters are handled completely inline. On many platforms the code
 * generated will simply be the increment of a global address.
 */

/* Per-cpu array of event counters, one slot per enum vm_event_item value. */
struct vm_event_state {
        unsigned long event[NR_VM_EVENT_ITEMS];
};

DECLARE_PER_CPU(struct vm_event_state, vm_event_states);

/*
 * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the
 * local_irq_disable overhead.
 */
static inline void __count_vm_event(enum vm_event_item item)
{
        /* Increment this CPU's counter for @item by one via raw_cpu_inc(). */
        raw_cpu_inc(vm_event_states.event[item]);
}

/* Increment this CPU's counter for @item by one using this_cpu_inc(). */
static inline void count_vm_event(enum vm_event_item item)
{
        this_cpu_inc(vm_event_states.event[item]);
}

/* Add @delta to this CPU's counter for @item using raw_cpu_add(). */
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
        raw_cpu_add(vm_event_states.event[item], delta);
}

/* Add @delta to this CPU's counter for @item using this_cpu_add(). */
static inline void count_vm_events(enum vm_event_item item, long delta)
{
        this_cpu_add(vm_event_states.event[item], delta);
}

extern void all_vm_events(unsigned long *);

extern void vm_events_fold_cpu(int cpu);

#else

/* Disable counters */
/*
 * !CONFIG_VM_EVENT_COUNTERS: every event-counter entry point is an empty
 * inline, so callers compile unchanged and the calls do nothing.
 */
static inline void count_vm_event(enum vm_event_item item)
{
}
static inline void count_vm_events(enum vm_event_item item, long delta)
{
}
static inline void __count_vm_event(enum vm_event_item item)
{
}
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
}
/* Note: @ret is left untouched by this stub. */
static inline void all_vm_events(unsigned long *ret)
{
}
static inline void vm_events_fold_cpu(int cpu)
{
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

#ifdef CONFIG_NUMA_BALANCING
#define count_vm_numa_event(x)     count_vm_event(x)
#define count_vm_numa_events(x, y) count_vm_events(x, y)
#else
#define count_vm_numa_event(x) do {} while (0)
#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_DEBUG_TLBFLUSH
#define count_vm_tlb_event(x)           count_vm_event(x)
#define count_vm_tlb_events(x, y)  count_vm_events(x, y)
#else
#define count_vm_tlb_event(x)     do {} while (0)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif

#ifdef CONFIG_PER_VMA_LOCK_STATS
#define count_vm_vma_lock_event(x) count_vm_event(x)
#else
#define count_vm_vma_lock_event(x) do {} while (0)
#endif

#define __count_zid_vm_events(item, zid, delta) \
        __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)

/*
 * Zone and node-based page accounting with per cpu differentials.
 */
extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];

#ifdef CONFIG_NUMA
/* Add @x to both the per-zone and the global NUMA event counter for @item. */
static inline void zone_numa_event_add(long x, struct zone *zone,
                                enum numa_stat_item item)
{
        atomic_long_add(x, &zone->vm_numa_event[item]);
        atomic_long_add(x, &vm_numa_event[item]);
}

/* Return the current value of @zone's NUMA event counter for @item. */
static inline unsigned long zone_numa_event_state(struct zone *zone,
                                        enum numa_stat_item item)
{
        return atomic_long_read(&zone->vm_numa_event[item]);
}

/* Return the current value of the global NUMA event counter for @item. */
static inline unsigned long
global_numa_event_state(enum numa_stat_item item)
{
        return atomic_long_read(&vm_numa_event[item]);
}
#endif /* CONFIG_NUMA */

/* Add @x to both the per-zone and the global zone counter for @item. */
static inline void zone_page_state_add(long x, struct zone *zone,
                                 enum zone_stat_item item)
{
        atomic_long_add(x, &zone->vm_stat[item]);
        atomic_long_add(x, &vm_zone_stat[item]);
}

/* Add @x to both the per-node and the global node counter for @item. */
static inline void node_page_state_add(long x, struct pglist_data *pgdat,
                                 enum node_stat_item item)
{
        atomic_long_add(x, &pgdat->vm_stat[item]);
        atomic_long_add(x, &vm_node_stat[item]);
}

/*
 * Read the global zone counter for @item. On CONFIG_SMP builds a negative
 * reading is reported as 0.
 */
static inline unsigned long global_zone_page_state(enum zone_stat_item item)
{
        long nr = atomic_long_read(&vm_zone_stat[item]);

#ifdef CONFIG_SMP
        if (nr < 0)
                return 0;
#endif
        return nr;
}

/*
 * Read the global node counter for @item as stored. On CONFIG_SMP builds a
 * negative reading is reported as 0.
 */
static inline
unsigned long global_node_page_state_pages(enum node_stat_item item)
{
        long nr = atomic_long_read(&vm_node_stat[item]);

#ifdef CONFIG_SMP
        if (nr < 0)
                return 0;
#endif
        return nr;
}

/*
 * Read the global node counter for @item. Warns once if @item is one that
 * vmstat_item_in_bytes() reports as byte-based; such items should be read
 * with global_node_page_state_pages() instead.
 */
static inline unsigned long global_node_page_state(enum node_stat_item item)
{
        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

        return global_node_page_state_pages(item);
}

/*
 * Read @zone's counter for @item. On CONFIG_SMP builds a negative reading is
 * reported as 0.
 */
static inline unsigned long zone_page_state(struct zone *zone,
                                        enum zone_stat_item item)
{
        long nr = atomic_long_read(&zone->vm_stat[item]);

#ifdef CONFIG_SMP
        if (nr < 0)
                return 0;
#endif
        return nr;
}

/*
 * More accurate version that also considers the currently pending
 * deltas. For that we need to loop over all cpus to find the current
 * deltas. There is no synchronization so the result cannot be
 * exactly accurate either.
 */
static inline unsigned long zone_page_state_snapshot(struct zone *zone,
                                        enum zone_stat_item item)
{
        long total = atomic_long_read(&zone->vm_stat[item]);

#ifdef CONFIG_SMP
        int cpu;

        /* Fold in each online CPU's not-yet-applied delta for @item. */
        for_each_online_cpu(cpu) {
                struct per_cpu_zonestat *pzstats =
                        per_cpu_ptr(zone->per_cpu_zonestats, cpu);

                total += pzstats->vm_stat_diff[item];
        }

        /* The sum is unsynchronized and may dip below zero; report 0 then. */
        if (total < 0)
                return 0;
#endif
        return total;
}

#ifdef CONFIG_NUMA
/* See __count_vm_event comment on why raw_cpu_inc is used. */
/* Increment this CPU's NUMA event counter for @item in @zone by one. */
static inline void
__count_numa_event(struct zone *zone, enum numa_stat_item item)
{
        struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;

        raw_cpu_inc(pzstats->vm_numa_event[item]);
}

/* Add @delta to this CPU's NUMA event counter for @item in @zone. */
static inline void
__count_numa_events(struct zone *zone, enum numa_stat_item item, long delta)
{
        struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;

        raw_cpu_add(pzstats->vm_numa_event[item], delta);
}

extern unsigned long sum_zone_node_page_state(int node,
                                              enum zone_stat_item item);
extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
                                                enum node_stat_item item);
extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
                                           enum node_stat_item item);
extern void fold_vm_numa_events(void);
#else
#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
#define node_page_state(node, item) global_node_page_state(item)
#define node_page_state_pages(node, item) global_node_page_state_pages(item)
/* !CONFIG_NUMA: there are no NUMA event counters to fold, so do nothing. */
static inline void fold_vm_numa_events(void)
{
}
#endif /* CONFIG_NUMA */

#ifdef CONFIG_SMP
void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
void __inc_zone_page_state(struct page *, enum zone_stat_item);
void __dec_zone_page_state(struct page *, enum zone_stat_item);

void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long);
void __inc_node_page_state(struct page *, enum node_stat_item);
void __dec_node_page_state(struct page *, enum node_stat_item);

void mod_zone_page_state(struct zone *, enum zone_stat_item, long);
void inc_zone_page_state(struct page *, enum zone_stat_item);
void dec_zone_page_state(struct page *, enum zone_stat_item);

void mod_node_page_state(struct pglist_data *, enum node_stat_item, long);
void inc_node_page_state(struct page *, enum node_stat_item);
void dec_node_page_state(struct page *, enum node_stat_item);

extern void inc_node_state(struct pglist_data *, enum node_stat_item);
extern void __inc_zone_state(struct zone *, enum zone_stat_item);
extern void __inc_node_state(struct pglist_data *, enum node_stat_item);
extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);

void quiet_vmstat(void);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);

struct ctl_table;
int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
                loff_t *ppos);

void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *);

int calculate_pressure_threshold(struct zone *zone);
int calculate_normal_threshold(struct zone *zone);
void set_pgdat_percpu_threshold(pg_data_t *pgdat,
                                int (*calculate_pressure)(struct zone *));
#else /* CONFIG_SMP */

/*
 * We do not maintain differentials in a single processor configuration.
 * The functions directly modify the zone and global counters.
 */
/* UP variant: apply @delta directly to the zone and global counters. */
static inline void __mod_zone_page_state(struct zone *zone,
                        enum zone_stat_item item, long delta)
{
        zone_page_state_add(delta, zone, item);
}

/*
 * UP variant: apply @delta directly to the node and global counters for
 * @item.
 *
 * @delta is a long so that it matches the CONFIG_SMP prototype and so that
 * callers passing a long (e.g. __node_stat_mod_folio()) are not silently
 * truncated to int before the byte-to-page conversion below.
 */
static inline void __mod_node_page_state(struct pglist_data *pgdat,
                        enum node_stat_item item, long delta)
{
        if (vmstat_item_in_bytes(item)) {
                /*
                 * Only cgroups use subpage accounting right now; at
                 * the global level, these items still change in
                 * multiples of whole pages. Store them as pages
                 * internally to keep the per-cpu counters compact.
                 */
                VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
                delta >>= PAGE_SHIFT;
        }

        node_page_state_add(delta, pgdat, item);
}

/* UP variant: increment the zone and global counters for @item by one. */
static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        atomic_long_inc(&zone->vm_stat[item]);
        atomic_long_inc(&vm_zone_stat[item]);
}

/* UP variant: increment the node and global counters for @item by one. */
static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
        atomic_long_inc(&pgdat->vm_stat[item]);
        atomic_long_inc(&vm_node_stat[item]);
}

/* UP variant: decrement the zone and global counters for @item by one. */
static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
        atomic_long_dec(&zone->vm_stat[item]);
        atomic_long_dec(&vm_zone_stat[item]);
}

/* UP variant: decrement the node and global counters for @item by one. */
static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
        atomic_long_dec(&pgdat->vm_stat[item]);
        atomic_long_dec(&vm_node_stat[item]);
}

/* UP variant: increment @item for the zone that @page belongs to. */
static inline void __inc_zone_page_state(struct page *page,
                        enum zone_stat_item item)
{
        struct zone *zone = page_zone(page);

        __inc_zone_state(zone, item);
}

/* UP variant: increment @item for the node that @page belongs to. */
static inline void __inc_node_page_state(struct page *page,
                        enum node_stat_item item)
{
        struct pglist_data *pgdat = page_pgdat(page);

        __inc_node_state(pgdat, item);
}


/* UP variant: decrement @item for the zone that @page belongs to. */
static inline void __dec_zone_page_state(struct page *page,
                        enum zone_stat_item item)
{
        struct zone *zone = page_zone(page);

        __dec_zone_state(zone, item);
}

/* UP variant: decrement @item for the node that @page belongs to. */
static inline void __dec_node_page_state(struct page *page,
                        enum node_stat_item item)
{
        struct pglist_data *pgdat = page_pgdat(page);

        __dec_node_state(pgdat, item);
}


/*
 * We only use atomic operations to update counters. So there is no need to
 * disable interrupts.
 */
#define inc_zone_page_state __inc_zone_page_state
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state

#define inc_node_page_state __inc_node_page_state
#define dec_node_page_state __dec_node_page_state
#define mod_node_page_state __mod_node_page_state

#define inc_zone_state __inc_zone_state
#define inc_node_state __inc_node_state
#define dec_zone_state __dec_zone_state

/*
 * UP variant: there are no per-cpu thresholds to adjust. Expands to a single
 * empty statement so that "if (x) set_pgdat_percpu_threshold(...); else ..."
 * still parses.
 */
#define set_pgdat_percpu_threshold(pgdat, callback) do { } while (0)

/*
 * UP variants: no per-cpu differentials are maintained, so the threshold
 * refresh, per-cpu fold, quiesce and drain operations are all no-ops.
 */
static inline void refresh_zone_stat_thresholds(void) { }
static inline void cpu_vm_stats_fold(int cpu) { }
static inline void quiet_vmstat(void) { }

static inline void drain_zonestat(struct zone *zone,
                        struct per_cpu_zonestat *pzstats) { }
#endif                /* CONFIG_SMP */

/* Adjust @item by @nr for the zone holding @folio (double-underscore variant). */
static inline void __zone_stat_mod_folio(struct folio *folio,
                enum zone_stat_item item, long nr)
{
        struct zone *zone = folio_zone(folio);

        __mod_zone_page_state(zone, item, nr);
}

/* Add @folio's page count to @item for its zone (double-underscore variant). */
static inline void __zone_stat_add_folio(struct folio *folio,
                enum zone_stat_item item)
{
        struct zone *zone = folio_zone(folio);

        __mod_zone_page_state(zone, item, folio_nr_pages(folio));
}

/* Subtract @folio's page count from @item for its zone (double-underscore variant). */
static inline void __zone_stat_sub_folio(struct folio *folio,
                enum zone_stat_item item)
{
        struct zone *zone = folio_zone(folio);

        __mod_zone_page_state(zone, item, -folio_nr_pages(folio));
}

/* Adjust @item by @nr for the zone holding @folio. */
static inline void zone_stat_mod_folio(struct folio *folio,
                enum zone_stat_item item, long nr)
{
        struct zone *zone = folio_zone(folio);

        mod_zone_page_state(zone, item, nr);
}

/* Add @folio's page count to @item for its zone. */
static inline void zone_stat_add_folio(struct folio *folio,
                enum zone_stat_item item)
{
        struct zone *zone = folio_zone(folio);

        mod_zone_page_state(zone, item, folio_nr_pages(folio));
}

/* Subtract @folio's page count from @item for its zone. */
static inline void zone_stat_sub_folio(struct folio *folio,
                enum zone_stat_item item)
{
        struct zone *zone = folio_zone(folio);

        mod_zone_page_state(zone, item, -folio_nr_pages(folio));
}

/* Adjust @item by @nr for the node holding @folio (double-underscore variant). */
static inline void __node_stat_mod_folio(struct folio *folio,
                enum node_stat_item item, long nr)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        __mod_node_page_state(pgdat, item, nr);
}

/* Add @folio's page count to @item for its node (double-underscore variant). */
static inline void __node_stat_add_folio(struct folio *folio,
                enum node_stat_item item)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        __mod_node_page_state(pgdat, item, folio_nr_pages(folio));
}

/* Subtract @folio's page count from @item for its node (double-underscore variant). */
static inline void __node_stat_sub_folio(struct folio *folio,
                enum node_stat_item item)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        __mod_node_page_state(pgdat, item, -folio_nr_pages(folio));
}

/* Adjust @item by @nr for the node holding @folio. */
static inline void node_stat_mod_folio(struct folio *folio,
                enum node_stat_item item, long nr)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        mod_node_page_state(pgdat, item, nr);
}

/* Add @folio's page count to @item for its node. */
static inline void node_stat_add_folio(struct folio *folio,
                enum node_stat_item item)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        mod_node_page_state(pgdat, item, folio_nr_pages(folio));
}

/* Subtract @folio's page count from @item for its node. */
static inline void node_stat_sub_folio(struct folio *folio,
                enum node_stat_item item)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        mod_node_page_state(pgdat, item, -folio_nr_pages(folio));
}

extern const char * const vmstat_text[];

/* Name of a zone stat item; zone items occupy the start of vmstat_text[]. */
static inline const char *zone_stat_name(enum zone_stat_item item)
{
        const char * const *zone_names = vmstat_text;

        return zone_names[item];
}

#ifdef CONFIG_NUMA
/* Name of a NUMA event item; these follow the zone items in vmstat_text[]. */
static inline const char *numa_stat_name(enum numa_stat_item item)
{
        const char * const *numa_names = vmstat_text + NR_VM_ZONE_STAT_ITEMS;

        return numa_names[item];
}
#endif /* CONFIG_NUMA */

/*
 * Name of a node stat item; these follow the zone and NUMA event items in
 * vmstat_text[].
 */
static inline const char *node_stat_name(enum node_stat_item item)
{
        const char * const *node_names = vmstat_text +
                                         NR_VM_ZONE_STAT_ITEMS +
                                         NR_VM_NUMA_EVENT_ITEMS;

        return node_names[item];
}

/*
 * Name of an LRU list: the node stat name of NR_LRU_BASE + @lru with its
 * leading "nr_" prefix skipped.
 */
static inline const char *lru_list_name(enum lru_list lru)
{
        const char *stat_name = node_stat_name(NR_LRU_BASE + lru);

        return stat_name + 3; // skip "nr_"
}

/*
 * Name of a writeback stat item; these follow the zone, NUMA event and node
 * items in vmstat_text[].
 */
static inline const char *writeback_stat_name(enum writeback_stat_item item)
{
        const char * const *wb_names = vmstat_text +
                                       NR_VM_ZONE_STAT_ITEMS +
                                       NR_VM_NUMA_EVENT_ITEMS +
                                       NR_VM_NODE_STAT_ITEMS;

        return wb_names[item];
}

#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
/*
 * Name of a VM event item; these follow the zone, NUMA event, node and
 * writeback items in vmstat_text[].
 */
static inline const char *vm_event_name(enum vm_event_item item)
{
        const char * const *event_names = vmstat_text +
                                          NR_VM_ZONE_STAT_ITEMS +
                                          NR_VM_NUMA_EVENT_ITEMS +
                                          NR_VM_NODE_STAT_ITEMS +
                                          NR_VM_WRITEBACK_STAT_ITEMS;

        return event_names[item];
}
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */

#ifdef CONFIG_MEMCG

void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                        int val);

/*
 * Adjust @idx by @val for @lruvec: calls __mod_lruvec_state() with local
 * interrupts disabled and restores the previous interrupt state afterwards.
 */
static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
{
        unsigned long irq_flags;

        local_irq_save(irq_flags);
        __mod_lruvec_state(lruvec, idx, val);
        local_irq_restore(irq_flags);
}

void __lruvec_stat_mod_folio(struct folio *folio,
                             enum node_stat_item idx, int val);

/*
 * Adjust @idx by @val for @folio: calls __lruvec_stat_mod_folio() with local
 * interrupts disabled and restores the previous interrupt state afterwards.
 */
static inline void lruvec_stat_mod_folio(struct folio *folio,
                                         enum node_stat_item idx, int val)
{
        unsigned long irq_flags;

        local_irq_save(irq_flags);
        __lruvec_stat_mod_folio(folio, idx, val);
        local_irq_restore(irq_flags);
}

/* Page-based wrapper: apply the update to the folio that contains @page. */
static inline void mod_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx, int val)
{
        struct folio *folio = page_folio(page);

        lruvec_stat_mod_folio(folio, idx, val);
}

#else

/* !CONFIG_MEMCG: only the node counter of @lruvec's pgdat is updated. */
static inline void __mod_lruvec_state(struct lruvec *lruvec,
                                      enum node_stat_item idx, int val)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        __mod_node_page_state(pgdat, idx, val);
}

/* !CONFIG_MEMCG: only the node counter of @lruvec's pgdat is updated. */
static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        mod_node_page_state(pgdat, idx, val);
}

/* !CONFIG_MEMCG: only the node counter of @folio's pgdat is updated. */
static inline void __lruvec_stat_mod_folio(struct folio *folio,
                                         enum node_stat_item idx, int val)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        __mod_node_page_state(pgdat, idx, val);
}

/* !CONFIG_MEMCG: only the node counter of @folio's pgdat is updated. */
static inline void lruvec_stat_mod_folio(struct folio *folio,
                                         enum node_stat_item idx, int val)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        mod_node_page_state(pgdat, idx, val);
}

/* !CONFIG_MEMCG: only the node counter of @page's pgdat is updated. */
static inline void mod_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx, int val)
{
        struct pglist_data *pgdat = page_pgdat(page);

        mod_node_page_state(pgdat, idx, val);
}

#endif /* CONFIG_MEMCG */

/* Add @folio's page count to @idx via __lruvec_stat_mod_folio(). */
static inline void __lruvec_stat_add_folio(struct folio *folio,
                                           enum node_stat_item idx)
{
        __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio));
}

/* Subtract @folio's page count from @idx via __lruvec_stat_mod_folio(). */
static inline void __lruvec_stat_sub_folio(struct folio *folio,
                                           enum node_stat_item idx)
{
        __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
}

/* Add @folio's page count to @idx via lruvec_stat_mod_folio(). */
static inline void lruvec_stat_add_folio(struct folio *folio,
                                         enum node_stat_item idx)
{
        lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio));
}

/* Subtract @folio's page count from @idx via lruvec_stat_mod_folio(). */
static inline void lruvec_stat_sub_folio(struct folio *folio,
                                         enum node_stat_item idx)
{
        lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
}
#endif /* _LINUX_VMSTAT_H */

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


    2 































































































    1 




















    1 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork()
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *        Izik Eidus
 *        Andrea Arcangeli
 *        Chris Wright
 *        Hugh Dickins
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/cputime.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/pagewalk.h>

#include <asm/tlbflush.h>
#include "internal.h"
#include "mm_slot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/ksm.h>

/*
 * NUMA-conditional helpers.
 *
 * NUMA(x) expands to (x) when CONFIG_NUMA is set and to the constant 0
 * otherwise.  DO_NUMA(x) evaluates (x) as a statement when CONFIG_NUMA is
 * set and expands to an empty statement otherwise.  In both macros the
 * argument is not evaluated at all in the !CONFIG_NUMA build.
 */
#ifdef CONFIG_NUMA
#define NUMA(x)                (x)
#define DO_NUMA(x)        do { (x); } while (0)
#else
#define NUMA(x)                (0)
#define DO_NUMA(x)        do { } while (0)
#endif

/* Counter type of ksm_rmap_item's @age and @remaining_skips fields. */
typedef u8 rmap_age_t;

/**
 * DOC: Overview
 *
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * The stable tree node includes information required for reverse
 * mapping from a KSM page to virtual addresses that map this page.
 *
 * In order to avoid large latencies of the rmap walks on KSM pages,
 * KSM maintains two types of nodes in the stable tree:
 *
 * * the regular nodes that keep the reverse mapping structures in a
 *   linked list
 * * the "chains" that link nodes ("dups") that represent the same
 *   write protected memory content, but each "dup" corresponds to a
 *   different KSM page copy of that content
 *
 * Internally, the regular nodes, "dups" and "chains" are represented
 * using the same struct ksm_stable_node structure.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 *
 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
 * stable trees and multiple unstable trees: one of each for each NUMA node.
 */

/**
 * struct ksm_mm_slot - ksm information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 *
 * The static instance ksm_mm_head serves as the list head (its
 * slot.mm_node is initialised as an empty list), and ksm_scan.mm_slot
 * points at the slot currently being scanned.
 */
struct ksm_mm_slot {
        struct mm_slot slot;
        /* Chained through ksm_rmap_item->rmap_list. */
        struct ksm_rmap_item *rmap_list;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 * It is statically initialised with @mm_slot pointing at ksm_mm_head and
 * every other field zero.
 */
struct ksm_scan {
        struct ksm_mm_slot *mm_slot;
        unsigned long address;
        /* Pointer to a link field, not to an item (note the double pointer). */
        struct ksm_rmap_item **rmap_list;
        unsigned long seqnr;
};

/**
 * struct ksm_stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 *
 * The same structure represents regular nodes, "dups" and "chains" (see the
 * overview above).  @node shares storage with @head, @hlist_dup and @list,
 * and @kpfn shares storage with @chain_prune_time, so only one member of
 * each union is meaningful at a time.
 */
struct ksm_stable_node {
        union {
                struct rb_node node;        /* when node of stable tree */
                struct {                /* when listed for migration */
                        struct list_head *head;
                        struct {
                                struct hlist_node hlist_dup;
                                struct list_head list;
                        };
                };
        };
        struct hlist_head hlist;
        /*
         * NOTE(review): presumably @chain_prune_time is the live member only
         * when @rmap_hlist_len == STABLE_NODE_CHAIN, and @kpfn otherwise -
         * confirm against the users of these fields.
         */
        union {
                unsigned long kpfn;
                unsigned long chain_prune_time;
        };
        /*
         * STABLE_NODE_CHAIN can be any negative number in
         * rmap_hlist_len negative range, but better not -1 to be able
         * to reliably detect underflows.
         */
#define STABLE_NODE_CHAIN -1024
        int rmap_hlist_len;
#ifdef CONFIG_NUMA
        int nid;
#endif
};

/**
 * struct ksm_rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @age: number of scan iterations since creation
 * @remaining_skips: how many scans to skip
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 *
 * @anon_vma shares storage with @nid, and @node shares storage with
 * @head/@hlist: which member is meaningful depends on whether the item is
 * currently in the unstable tree or listed from the stable tree.
 */
struct ksm_rmap_item {
        struct ksm_rmap_item *rmap_list;
        union {
                struct anon_vma *anon_vma;        /* when stable */
#ifdef CONFIG_NUMA
                int nid;                /* when node of unstable tree */
#endif
        };
        struct mm_struct *mm;
        unsigned long address;                /* + low bits used for flags below */
        unsigned int oldchecksum;        /* when unstable */
        /* Both counters are rmap_age_t (u8), so they wrap at 255. */
        rmap_age_t age;
        rmap_age_t remaining_skips;
        union {
                struct rb_node node;        /* when node of unstable tree */
                struct {                /* when listed from stable tree */
                        struct ksm_stable_node *head;
                        struct hlist_node hlist;
                };
        };
};

/*
 * Bits kept in the low bits of ksm_rmap_item->address, next to the virtual
 * address itself.  Together the three values cover bits 0-9.
 * NOTE(review): this presumably relies on the tracked address being page
 * aligned so that those bits are otherwise zero - confirm.
 */
#define SEQNR_MASK        0x0ff        /* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG        0x100        /* is a node of the unstable tree */
#define STABLE_FLAG        0x200        /* is listed from the stable tree */

/*
 * The stable and unstable tree heads.
 *
 * By default each root pointer refers to a single statically allocated
 * tree.  NOTE(review): when merge_across_nodes is unset the overview above
 * says there is one tree of each kind per NUMA node, so these pointers are
 * presumably redirected to per-node arrays elsewhere - confirm.
 */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
/*
 * Address of migrate_nodes.prev reinterpreted as a list_head pointer: a
 * pointer value distinct from &migrate_nodes itself.
 * NOTE(review): presumably stored in ksm_stable_node->head as a marker for
 * "dup" nodes - confirm against the users of this macro.
 */
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

/* Hash table from mm to its mm_slot, sized by MM_SLOTS_HASH_BITS. */
#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

/* List head for the mm_slots; its mm_node list starts out empty. */
static struct ksm_mm_slot ksm_mm_head = {
        .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node),
};
/* The single scan cursor; starts out pointing at the list head. */
static struct ksm_scan ksm_scan = {
        .mm_slot = &ksm_mm_head,
};

/* Slab caches; NOTE(review): presumably one per structure type named in the identifier - confirm at creation site. */
static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* Default number of pages to scan per batch */
#define DEFAULT_PAGES_TO_SCAN 100

/* The number of pages scanned */
static unsigned long ksm_pages_scanned;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay, in milliseconds, in pruning stale stable_node_dups in the stable_node_chains */
static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;

/*
 * Skip pages that couldn't be de-duplicated previously.
 * Defaults to true at least temporarily, for testing.
 */
static bool ksm_smart_scan = true;

/*
 * The number of zero pages which are placed by KSM.
 * Unlike the counters above this one is atomic and not static.
 */
atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0);

/* The number of pages that have been skipped due to "smart scanning" */
static unsigned long ksm_pages_skipped;

/* Don't scan more than max pages per batch. */
static unsigned long ksm_advisor_max_pages_to_scan = 30000;

/* Min CPU percent for scanning pages per scan */
#define KSM_ADVISOR_MIN_CPU 10

/* Max CPU percent for scanning pages per scan */
static unsigned int ksm_advisor_max_cpu =  70;

/* Target scan time in seconds to analyze all KSM candidate pages. */
static unsigned long ksm_advisor_target_scan_time = 200;

/*
 * Exponentially weighted moving average: weight, in percent, that ewma()
 * gives to the current sample; the previous value gets 100 - EWMA_WEIGHT.
 */
#define EWMA_WEIGHT 30

/**
 * struct advisor_ctx - metadata for KSM advisor
 * @start_scan: start time of the current scan
 * @scan_time: scan time of previous scan
 * @change: change in percent to pages_to_scan parameter
 * @cpu_time: cpu time consumed by the ksmd thread in the previous scan
 *
 * @start_scan is stamped with ktime_get() by advisor_start_scan().  A
 * @scan_time of zero is treated by prev_scan_time() as "no previous scan
 * available".  The whole structure is zeroed by set_advisor_defaults() when
 * the scan-time advisor is selected.
 */
struct advisor_ctx {
        ktime_t start_scan;
        unsigned long scan_time;
        unsigned long change;
        unsigned long long cpu_time;
};
/* The single advisor state instance; zero-initialised as a static object. */
static struct advisor_ctx advisor_ctx;

/* The available advisors */
enum ksm_advisor_type {
        KSM_ADVISOR_NONE,        /* no advisor: default pages_to_scan is used */
        KSM_ADVISOR_SCAN_TIME,        /* scan time advisor, see scan_time_advisor() */
};
/* Currently selected advisor; zero-initialised, i.e. KSM_ADVISOR_NONE. */
static enum ksm_advisor_type ksm_advisor;

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */

/*
 * At least scan this many pages per batch.  Also the starting value that
 * set_advisor_defaults() gives pages_to_scan for the scan-time advisor.
 */
static unsigned long ksm_advisor_min_pages_to_scan = 500;

/*
 * Reset pages_to_scan to the starting value of the currently selected
 * advisor; the scan-time advisor additionally starts from a clean context.
 */
static void set_advisor_defaults(void)
{
        switch (ksm_advisor) {
        case KSM_ADVISOR_NONE:
                ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN;
                break;
        case KSM_ADVISOR_SCAN_TIME:
                /* Drop any history left over from an earlier advisor run. */
                advisor_ctx = (const struct advisor_ctx){ 0 };
                ksm_thread_pages_to_scan = ksm_advisor_min_pages_to_scan;
                break;
        }
}
#endif /* CONFIG_SYSFS */

/* Record when the current scan began; only the scan-time advisor needs it. */
static inline void advisor_start_scan(void)
{
        if (ksm_advisor != KSM_ADVISOR_SCAN_TIME)
                return;

        advisor_ctx.start_scan = ktime_get();
}

/*
 * Return the scan time recorded in @ctx for the previous scan. When none has
 * been recorded yet (the field is still zero), fall back to @scan_time, the
 * duration of the current scan, as an approximation.
 */
static inline unsigned long prev_scan_time(struct advisor_ctx *ctx,
                                           unsigned long scan_time)
{
        if (ctx->scan_time)
                return ctx->scan_time;

        return scan_time;
}

/*
 * Exponentially weighted moving average: the new sample @curr contributes
 * EWMA_WEIGHT percent, the running value @prev the remainder.
 */
static unsigned long ewma(unsigned long prev, unsigned long curr)
{
        unsigned long weighted_prev = (100 - EWMA_WEIGHT) * prev;
        unsigned long weighted_curr = EWMA_WEIGHT * curr;

        return (weighted_prev + weighted_curr) / 100;
}

/*
 * The scan time advisor is based on the current scan rate and the target
 * scan rate.
 *
 *      new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time)
 *
 * To avoid perturbations it calculates a change factor of previous changes.
 * A new change factor is calculated for each iteration and it uses an
 * exponentially weighted moving average. The new pages_to_scan value is
 * multiplied with that change factor:
 *
 *      new_pages_to_scan *= change factor
 *
 * The new_pages_to_scan value is limited by the cpu min and max values. It
 * calculates the cpu percent for the last scan and calculates the new
 * estimated cpu percent cost for the next scan. That value is capped by the
 * cpu min and max setting.
 *
 * In addition the new pages_to_scan value is capped by
 * ksm_advisor_max_pages_to_scan.
 *
 * Called at the end of a scan (see advisor_stop_scan()); updates
 * ksm_thread_pages_to_scan and the advisor context, and emits the
 * ksm_advisor trace event.
 */
static void scan_time_advisor(void)
{
        unsigned int cpu_percent;
        /*
         * Keep the runtime values 64 bit wide: task_sched_runtime() returns
         * a 64-bit counter and advisor_ctx.cpu_time stores one. Holding them
         * in unsigned long would wrap the per-scan delta on 32-bit kernels.
         */
        unsigned long long cpu_time;
        unsigned long long cpu_time_diff;
        unsigned long cpu_time_diff_ms;
        unsigned long pages;
        unsigned long per_page_cost;
        unsigned long factor;
        unsigned long change;
        unsigned long last_scan_time;
        unsigned long scan_time;

        /* Convert scan time to seconds */
        scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan),
                            MSEC_PER_SEC);
        /* Never let a sub-second scan produce a zero divisor below */
        scan_time = scan_time ? scan_time : 1;

        /* Calculate CPU consumption of ksmd background thread */
        cpu_time = task_sched_runtime(current);
        cpu_time_diff = cpu_time - advisor_ctx.cpu_time;
        /* div_u64() keeps the 64-bit division usable on 32-bit kernels */
        cpu_time_diff_ms = div_u64(cpu_time_diff, 1000 * 1000);

        cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000);
        cpu_percent = cpu_percent ? cpu_percent : 1;
        last_scan_time = prev_scan_time(&advisor_ctx, scan_time);

        /* Calculate scan time as percentage of target scan time */
        factor = ksm_advisor_target_scan_time * 100 / scan_time;
        factor = factor ? factor : 1;

        /*
         * Calculate scan time as percentage of last scan time and use
         * exponentially weighted average to smooth it
         */
        change = scan_time * 100 / last_scan_time;
        change = change ? change : 1;
        change = ewma(advisor_ctx.change, change);

        /* Calculate new scan rate based on target scan rate. */
        pages = ksm_thread_pages_to_scan * 100 / factor;
        /* Update pages_to_scan by weighted change percentage. */
        pages = pages * change / 100;

        /* Cap new pages_to_scan value */
        per_page_cost = ksm_thread_pages_to_scan / cpu_percent;
        per_page_cost = per_page_cost ? per_page_cost : 1;

        pages = min(pages, per_page_cost * ksm_advisor_max_cpu);
        pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU);
        pages = min(pages, ksm_advisor_max_pages_to_scan);

        /* Update advisor context */
        advisor_ctx.change = change;
        advisor_ctx.scan_time = scan_time;
        advisor_ctx.cpu_time = cpu_time;

        ksm_thread_pages_to_scan = pages;
        trace_ksm_advisor(scan_time, pages, cpu_percent);
}

/* End-of-scan hook: let the scan-time advisor retune pages_to_scan. */
static void advisor_stop_scan(void)
{
        if (ksm_advisor != KSM_ADVISOR_SCAN_TIME)
                return;

        scan_time_advisor();
}

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
/* Number of per-node tree roots walked by remove_all_stable_nodes() */
static int ksm_nr_node_ids = 1;
#else
/* Without CONFIG_NUMA both knobs collapse to compile-time constants */
#define ksm_merge_across_nodes        1U
#define ksm_nr_node_ids                1
#endif

/* Values stored in ksm_run; ksmd starts out stopped */
#define KSM_RUN_STOP        0
#define KSM_RUN_MERGE        1
#define KSM_RUN_UNMERGE        2
#define KSM_RUN_OFFLINE        4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
/* Held while walking or editing the list of mm slots */
static DEFINE_SPINLOCK(ksm_mmlist_lock);

/*
 * Create a slab cache for 'struct __struct': the cache is named after the
 * struct and uses its size and natural alignment; no constructor is set.
 */
#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
                sizeof(struct __struct), __alignof__(struct __struct),\
                (__flags), NULL)

/*
 * Create the three slab caches KSM allocates from (rmap items, stable nodes
 * and mm slots).
 *
 * Returns 0 on success. If any cache cannot be created, the caches created
 * so far are destroyed again and -ENOMEM is returned.
 */
static int __init ksm_slab_init(void)
{
        rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0);
        if (!rmap_item_cache)
                return -ENOMEM;

        stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0);
        if (!stable_node_cache)
                goto err_destroy_rmap_item;

        mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0);
        if (!mm_slot_cache)
                goto err_destroy_stable_node;

        return 0;

err_destroy_stable_node:
        kmem_cache_destroy(stable_node_cache);
err_destroy_rmap_item:
        kmem_cache_destroy(rmap_item_cache);
        return -ENOMEM;
}

/*
 * Undo ksm_slab_init(): destroy all three slab caches. Only mm_slot_cache
 * is reset to NULL afterwards.
 */
static void __init ksm_slab_free(void)
{
        kmem_cache_destroy(mm_slot_cache);
        mm_slot_cache = NULL;

        kmem_cache_destroy(stable_node_cache);
        kmem_cache_destroy(rmap_item_cache);
}

/* A chain head is recognised by the STABLE_NODE_CHAIN marker in rmap_hlist_len. */
static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain)
{
        if (chain->rmap_hlist_len != STABLE_NODE_CHAIN)
                return false;

        return true;
}

/* A dup (member of a chain) is recognised by head == STABLE_NODE_DUP_HEAD. */
static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup)
{
        if (dup->head != STABLE_NODE_DUP_HEAD)
                return false;

        return true;
}

/*
 * Queue @dup on @chain's hlist and account for it in ksm_stable_node_dups.
 * @dup must not already be a dup and @chain must be a chain head; both are
 * checked with VM_BUG_ON().
 */
static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup,
                                             struct ksm_stable_node *chain)
{
        VM_BUG_ON(is_stable_node_dup(dup));
        /* From here on is_stable_node_dup(dup) is true */
        dup->head = STABLE_NODE_DUP_HEAD;
        VM_BUG_ON(!is_stable_node_chain(chain));
        hlist_add_head(&dup->hlist_dup, &chain->hlist);
        ksm_stable_node_dups++;
}

/*
 * Unlink @dup from its chain's hlist and drop it from the
 * ksm_stable_node_dups count. @dup must be a dup (checked with VM_BUG_ON());
 * its head marker is left untouched here.
 */
static inline void __stable_node_dup_del(struct ksm_stable_node *dup)
{
        VM_BUG_ON(!is_stable_node_dup(dup));
        hlist_del(&dup->hlist_dup);
        ksm_stable_node_dups--;
}

/*
 * Unlink a stable node that is not a chain head from wherever it lives: a
 * plain node sits directly in the stable rb-tree of its nid, a dup hangs off
 * a chain's hlist. With CONFIG_DEBUG_VM the head pointer is cleared
 * afterwards.
 */
static inline void stable_node_dup_del(struct ksm_stable_node *dup)
{
        VM_BUG_ON(is_stable_node_chain(dup));

        if (!is_stable_node_dup(dup))
                rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
        else
                __stable_node_dup_del(dup);

#ifdef CONFIG_DEBUG_VM
        dup->head = NULL;
#endif
}

static inline struct ksm_rmap_item *alloc_rmap_item(void)
{
        struct ksm_rmap_item *rmap_item;

        rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
                                                __GFP_NORETRY | __GFP_NOWARN);
        if (rmap_item)
                ksm_rmap_items++;
        return rmap_item;
}

static inline void free_rmap_item(struct ksm_rmap_item *rmap_item)
{
        ksm_rmap_items--;
        rmap_item->mm->ksm_rmap_items--;
        rmap_item->mm = NULL;        /* debug safety */
        kmem_cache_free(rmap_item_cache, rmap_item);
}

/* Allocate an (uninitialized) stable node; returns NULL on failure. */
static inline struct ksm_stable_node *alloc_stable_node(void)
{
        struct ksm_stable_node *node;

        /*
         * With plain GFP_KERNEL the allocation can take too long when memory
         * is under pressure, which may lead to hung task warnings.
         * __GFP_HIGH grants access to memory reserves and helps avoid that.
         */
        node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);

        return node;
}

/*
 * Return @stable_node to its slab cache. A node that is not a chain head
 * must no longer have any rmap_items accounted to it (rmap_hlist_len == 0);
 * this is checked with VM_BUG_ON().
 */
static inline void free_stable_node(struct ksm_stable_node *stable_node)
{
        VM_BUG_ON(stable_node->rmap_hlist_len &&
                  !is_stable_node_chain(stable_node));
        kmem_cache_free(stable_node_cache, stable_node);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_lock briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary work.
 *
 * Returns true once @mm has no users left.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
        return !atomic_read(&mm->mm_users);
}

/*
 * Page-walk pmd_entry callback used by break_ksm(): look at the single pte
 * mapping @addr and report whether it refers to a KSM page.
 *
 * Returns 1 if the pte maps (or is a migration entry for) a KSM page, or is
 * a KSM-placed zero page; 0 otherwise, including when the pte table could
 * not be mapped. @next is unused.
 */
static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
                        struct mm_walk *walk)
{
        struct page *page = NULL;
        spinlock_t *ptl;
        pte_t *pte;
        pte_t ptent;
        int ret;

        pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        if (!pte)
                return 0;
        ptent = ptep_get(pte);
        if (pte_present(ptent)) {
                page = vm_normal_page(walk->vma, addr, ptent);
        } else if (!pte_none(ptent)) {
                swp_entry_t entry = pte_to_swp_entry(ptent);

                /*
                 * As KSM pages remain KSM pages until freed, no need to wait
                 * here for migration to end.
                 */
                if (is_migration_entry(entry))
                        page = pfn_swap_entry_to_page(entry);
        }
        /* return 1 if the page is a normal ksm page or KSM-placed zero page */
        ret = (page && PageKsm(page)) || is_ksm_zero_pte(ptent);
        pte_unmap_unlock(pte, ptl);
        return ret;
}

/*
 * Two flavours of the same page walk, differing only in the walk_lock mode
 * requested from the page walker; break_ksm() picks one based on its
 * lock_vma argument.
 */
static const struct mm_walk_ops break_ksm_ops = {
        .pmd_entry = break_ksm_pmd_entry,
        .walk_lock = PGWALK_RDLOCK,
};

/* Selected by break_ksm() when lock_vma is true */
static const struct mm_walk_ops break_ksm_lock_vma_ops = {
        .pmd_entry = break_ksm_pmd_entry,
        .walk_lock = PGWALK_WRLOCK,
};

/*
 * We use break_ksm to break COW on a ksm page by triggering unsharing,
 * such that the ksm page will get replaced by an exclusive anonymous page.
 *
 * We take great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem, where we would not want to touch it.
 *
 * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
 * of the process that owns 'vma'.  We also do not want to enforce
 * protection keys here anyway.
 *
 * @vma: the vma containing @addr
 * @addr: the address whose mapping should be unshared
 * @lock_vma: selects the write-locking page-walk ops instead of the
 *            read-locking ones
 *
 * Returns 0 once no ksm page is found at @addr any more (or the fault ended
 * with SIGBUS/SIGSEGV), -ENOMEM if the fault reported VM_FAULT_OOM, or the
 * negative error from the page walk if that failed.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
{
        vm_fault_t ret = 0;
        const struct mm_walk_ops *ops = lock_vma ?
                                &break_ksm_lock_vma_ops : &break_ksm_ops;

        do {
                int ksm_page;

                cond_resched();
                /* Walk just the one page at @addr; result is 1, 0 or -errno */
                ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
                if (WARN_ON_ONCE(ksm_page < 0))
                        return ksm_page;
                if (!ksm_page)
                        return 0;
                ret = handle_mm_fault(vma, addr,
                                      FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
                                      NULL);
        } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
        /*
         * We must loop until we no longer find a KSM page because
         * handle_mm_fault() may back out if there's any difficulty e.g. if
         * pte accessed bit gets updated concurrently.
         *
         * VM_FAULT_SIGBUS could occur if we race with truncation of the
         * backing file, which also invalidates anonymous pages: that's
         * okay, that truncation will have unmapped the PageKsm for us.
         *
         * VM_FAULT_OOM: at the time of writing (late July 2009), setting
         * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
         * current task has TIF_MEMDIE set, and will be OOM killed on return
         * to user; and ksmd, having no mm, would never be chosen for that.
         *
         * But if the mm is in a limited mem_cgroup, then the fault may fail
         * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
         * even ksmd can fail in this way - though it's usually breaking ksm
         * just to undo a merge it made a moment before, so unlikely to oom.
         *
         * That's a pity: we might therefore have more kernel pages allocated
         * than we're counting as nodes in the stable tree; but ksm_do_scan
         * will retry to break_cow on each pass, so should recover the page
         * in due course.  The important thing is to not let VM_MERGEABLE
         * be cleared while any such pages might remain in the area.
         */
        return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

/*
 * Decide whether @vma is of a kind KSM is able to merge pages in. Shared,
 * PFN/IO/mixed-map, non-expandable, hugetlb and DAX mappings are excluded,
 * as are VM_SAO and VM_SPARC_ADI mappings where those flags exist.
 */
static bool vma_ksm_compatible(struct vm_area_struct *vma)
{
        /* just ignore the advice for these mapping types */
        if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO |
                             VM_DONTEXPAND | VM_HUGETLB | VM_MIXEDMAP))
                return false;

#ifdef VM_SAO
        if (vma->vm_flags & VM_SAO)
                return false;
#endif
#ifdef VM_SPARC_ADI
        if (vma->vm_flags & VM_SPARC_ADI)
                return false;
#endif

        return !vma_is_dax(vma);
}

/*
 * Look up the vma covering @addr in @mm and return it only if it is
 * VM_MERGEABLE and has an anon_vma. Returns NULL otherwise, or when @mm is
 * already exiting.
 */
static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
                unsigned long addr)
{
        struct vm_area_struct *vma;

        if (ksm_test_exit(mm))
                return NULL;

        vma = vma_lookup(mm, addr);
        if (vma && (vma->vm_flags & VM_MERGEABLE) && vma->anon_vma)
                return vma;

        return NULL;
}

/*
 * Undo a merge for @rmap_item: drop the anon_vma reference it holds and, if
 * its address still lies in a mergeable vma, unshare the page there via
 * break_ksm(). The result of break_ksm() is not propagated.
 */
static void break_cow(struct ksm_rmap_item *rmap_item)
{
        struct mm_struct *mm = rmap_item->mm;
        unsigned long addr = rmap_item->address;
        struct vm_area_struct *vma;

        /*
         * It is not an accident that whenever we want to break COW
         * to undo, we also need to drop a reference to the anon_vma.
         */
        put_anon_vma(rmap_item->anon_vma);

        /* The vma lookup and the unshare both run under mmap_lock (read) */
        mmap_read_lock(mm);
        vma = find_mergeable_vma(mm, addr);
        if (vma)
                break_ksm(vma, addr, false);
        mmap_read_unlock(mm);
}

/*
 * Look up the page mapped at @rmap_item's address and return it, with a
 * reference held, if it is an anonymous page in a mergeable vma.
 *
 * Returns NULL when there is no mergeable vma, no page, a zone-device page
 * or a non-anonymous page; any reference taken is dropped in those cases.
 */
static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item)
{
        struct mm_struct *mm = rmap_item->mm;
        unsigned long addr = rmap_item->address;
        struct vm_area_struct *vma;
        struct page *found;
        struct page *page = NULL;

        mmap_read_lock(mm);

        vma = find_mergeable_vma(mm, addr);
        if (!vma)
                goto unlock;

        found = follow_page(vma, addr, FOLL_GET);
        if (IS_ERR_OR_NULL(found))
                goto unlock;

        if (!is_zone_device_page(found) && PageAnon(found)) {
                flush_anon_page(vma, found, addr);
                flush_dcache_page(found);
                page = found;
        } else {
                /* Not a candidate: give back the FOLL_GET reference */
                put_page(found);
        }

unlock:
        mmap_read_unlock(mm);
        return page;
}

/*
 * Pick the index into the arrays of tree roots for the page frame @kpfn.
 * When the merge_across_nodes knob is set to 1 there is a single stable and
 * a single unstable rb-tree, both rooted at index 0. Otherwise every node
 * has its own pair of trees and the page's node id selects them.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
        if (ksm_merge_across_nodes)
                return 0;

        return NUMA(pfn_to_nid(kpfn));
}

/*
 * Turn the plain stable node @dup into the first member of a new chain:
 * allocate a chain head, put it into @root in @dup's place and queue @dup
 * on the chain's hlist.
 *
 * Returns the new chain head, or NULL if the allocation failed (in which
 * case the tree and @dup are left untouched).
 */
static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup,
                                                   struct rb_root *root)
{
        struct ksm_stable_node *chain = alloc_stable_node();
        VM_BUG_ON(is_stable_node_chain(dup));
        if (likely(chain)) {
                INIT_HLIST_HEAD(&chain->hlist);
                chain->chain_prune_time = jiffies;
                /* This marker is what is_stable_node_chain() tests for */
                chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
                chain->nid = NUMA_NO_NODE; /* debug */
#endif
                ksm_stable_node_chains++;

                /*
                 * Put the stable node chain in the first dimension of
                 * the stable tree and at the same time remove the old
                 * stable node.
                 */
                rb_replace_node(&dup->node, &chain->node, root);

                /*
                 * Move the old stable node to the second dimension
                 * queued in the hlist_dup. The invariant is that all
                 * dup stable_nodes in the chain->hlist point to pages
                 * that are write protected and have the exact same
                 * content.
                 */
                stable_node_chain_add_dup(dup, chain);
        }
        return chain;
}

/*
 * Remove the chain head @chain from the rb-tree @root, drop it from the
 * ksm_stable_node_chains count and free it.
 */
static inline void free_stable_node_chain(struct ksm_stable_node *chain,
                                          struct rb_root *root)
{
        rb_erase(&chain->node, root);
        ksm_stable_node_chains--;
        free_stable_node(chain);
}

/*
 * Tear down @stable_node: release every rmap_item still hanging off it
 * (fixing up the sharing/shared counters, the per-mm merging count and the
 * anon_vma references), unlink the node from the migrate_nodes list, its
 * chain or the stable rb-tree, and free it.
 *
 * Must not be called on a chain head (rmap_hlist_len would be negative).
 */
static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
{
        struct ksm_rmap_item *rmap_item;

        /* check it's not STABLE_NODE_CHAIN or negative */
        BUG_ON(stable_node->rmap_hlist_len < 0);

        hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
                /*
                 * Every item but the last one on the hlist is counted as
                 * "sharing"; the last one accounts for the "shared" page.
                 */
                if (rmap_item->hlist.next) {
                        ksm_pages_sharing--;
                        trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm);
                } else {
                        ksm_pages_shared--;
                }

                rmap_item->mm->ksm_merging_pages--;

                VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
                stable_node->rmap_hlist_len--;
                put_anon_vma(rmap_item->anon_vma);
                /* Strip the flag bits kept in the low bits of the address */
                rmap_item->address &= PAGE_MASK;
                cond_resched();
        }

        /*
         * We need the second aligned pointer of the migrate_nodes
         * list_head to stay clear from the rb_parent_color union
         * (aligned and different than any node) and also different
         * from &migrate_nodes. This will verify that future list.h changes
         * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
         */
        BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
        BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);

        trace_ksm_remove_ksm_page(stable_node->kpfn);
        /* The node is either queued on migrate_nodes or is a dup/tree node */
        if (stable_node->head == &migrate_nodes)
                list_del(&stable_node->list);
        else
                stable_node_dup_del(stable_node);
        free_stable_node(stable_node);
}

/*
 * Locking mode requested from ksm_get_folio():
 *  KSM_GET_FOLIO_NOLOCK  - return the folio with a reference only
 *  KSM_GET_FOLIO_LOCK    - additionally lock the folio (may sleep in
 *                          folio_lock())
 *  KSM_GET_FOLIO_TRYLOCK - try to lock the folio; ksm_get_folio() returns
 *                          ERR_PTR(-EBUSY) if the lock is not available
 */
enum ksm_get_folio_flags {
        KSM_GET_FOLIO_NOLOCK,
        KSM_GET_FOLIO_LOCK,
        KSM_GET_FOLIO_TRYLOCK
};

/*
 * ksm_get_folio: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unresponsive.  So instead we use a
 * "keyhole reference": access to the ksm page from the stable node peeps
 * out through its keyhole to see if that page still holds the right key,
 * pointing back to this stable node.  This relies on freeing a PageAnon
 * page to reset its page->mapping to NULL, and relies on no other use of
 * a page to put something that might look like our key in page->mapping.
 * Note that a page which still shows the right key may already be on its
 * way to being freed; that is an anomaly to bear in mind.
 *
 * @stable_node: the stable node whose ksm folio is wanted
 * @flags: whether, and how, to lock the folio before returning it
 *
 * Returns the folio with a reference held (and locked, unless @flags is
 * KSM_GET_FOLIO_NOLOCK); ERR_PTR(-EBUSY) if KSM_GET_FOLIO_TRYLOCK could not
 * take the lock; or NULL if the node turned out to be stale, in which case
 * @stable_node has been removed and freed and must not be used again.
 */
static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node,
                                 enum ksm_get_folio_flags flags)
{
        struct folio *folio;
        void *expected_mapping;
        unsigned long kpfn;

        /* The "key": what folio->mapping must hold if it is still ours */
        expected_mapping = (void *)((unsigned long)stable_node |
                                        PAGE_MAPPING_KSM);
again:
        kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
        folio = pfn_folio(kpfn);
        if (READ_ONCE(folio->mapping) != expected_mapping)
                goto stale;

        /*
         * We cannot do anything with the page while its refcount is 0.
         * Usually 0 means free, or tail of a higher-order page: in which
         * case this node is no longer referenced, and should be freed;
         * however, it might mean that the page is under page_ref_freeze().
         * The __remove_mapping() case is easy, again the node is now stale;
         * the same is in reuse_ksm_page() case; but if page is swapcache
         * in folio_migrate_mapping(), it might still be our page,
         * in which case it's essential to keep the node.
         */
        while (!folio_try_get(folio)) {
                /*
                 * Another check for page->mapping != expected_mapping would
                 * work here too.  We have chosen the !PageSwapCache test to
                 * optimize the common case, when the page is or is about to
                 * be freed: PageSwapCache is cleared (under spin_lock_irq)
                 * in the ref_freeze section of __remove_mapping(); but Anon
                 * folio->mapping reset to NULL later, in free_pages_prepare().
                 */
                if (!folio_test_swapcache(folio))
                        goto stale;
                cpu_relax();
        }

        /* Recheck the key now that we hold a reference */
        if (READ_ONCE(folio->mapping) != expected_mapping) {
                folio_put(folio);
                goto stale;
        }

        if (flags == KSM_GET_FOLIO_TRYLOCK) {
                if (!folio_trylock(folio)) {
                        folio_put(folio);
                        return ERR_PTR(-EBUSY);
                }
        } else if (flags == KSM_GET_FOLIO_LOCK)
                folio_lock(folio);

        /* And once more after taking the folio lock */
        if (flags != KSM_GET_FOLIO_NOLOCK) {
                if (READ_ONCE(folio->mapping) != expected_mapping) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto stale;
                }
        }
        return folio;

stale:
        /*
         * We come here from above when page->mapping or !PageSwapCache
         * suggests that the node is stale; but it might be under migration.
         * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
         * before checking whether node->kpfn has been changed.
         */
        smp_rmb();
        if (READ_ONCE(stable_node->kpfn) != kpfn)
                goto again;
        remove_node_from_stable_tree(stable_node);
        return NULL;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 *
 * Which tree (if any) the item is in is encoded by STABLE_FLAG /
 * UNSTABLE_FLAG in the low bits of rmap_item->address; those bits are
 * cleared once the item has been detached. The rmap_item itself is not
 * freed here.
 */
static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item)
{
        if (rmap_item->address & STABLE_FLAG) {
                struct ksm_stable_node *stable_node;
                struct folio *folio;

                stable_node = rmap_item->head;
                /*
                 * NULL means the node was stale and ksm_get_folio() already
                 * tore it down, including this rmap_item's accounting.
                 */
                folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK);
                if (!folio)
                        goto out;

                /* Unlink from the stable node's hlist under the folio lock */
                hlist_del(&rmap_item->hlist);
                folio_unlock(folio);
                folio_put(folio);

                if (!hlist_empty(&stable_node->hlist))
                        ksm_pages_sharing--;
                else
                        ksm_pages_shared--;

                rmap_item->mm->ksm_merging_pages--;

                VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
                stable_node->rmap_hlist_len--;

                put_anon_vma(rmap_item->anon_vma);
                rmap_item->head = NULL;
                rmap_item->address &= PAGE_MASK;

        } else if (rmap_item->address & UNSTABLE_FLAG) {
                unsigned char age;
                /*
                 * Usually ksmd can and must skip the rb_erase, because
                 * root_unstable_tree was already reset to RB_ROOT.
                 * But be careful when an mm is exiting: do the rb_erase
                 * if this rmap_item was inserted by this scan, rather
                 * than left over from before.
                 */
                age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
                BUG_ON(age > 1);
                if (!age)
                        rb_erase(&rmap_item->node,
                                 root_unstable_tree + NUMA(rmap_item->nid));
                ksm_pages_unshared--;
                rmap_item->address &= PAGE_MASK;
        }
out:
        cond_resched();                /* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
{
        while (*rmap_list) {
                struct ksm_rmap_item *rmap_item = *rmap_list;
                *rmap_list = rmap_item->rmap_list;
                remove_rmap_item_from_tree(rmap_item);
                free_rmap_item(rmap_item);
        }
}

/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_lock.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 *
 * Walks [@start, @end) of @vma one page at a time and unshares any ksm page
 * found. Stops early, returning 0, if the mm is exiting. Returns
 * -ERESTARTSYS if a signal is pending, or the first error from break_ksm().
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
                             unsigned long start, unsigned long end, bool lock_vma)
{
        unsigned long addr;

        for (addr = start; addr < end; addr += PAGE_SIZE) {
                int err;

                if (ksm_test_exit(vma->vm_mm))
                        break;
                if (signal_pending(current))
                        return -ERESTARTSYS;

                err = break_ksm(vma, addr, lock_vma);
                if (err)
                        return err;
        }

        return 0;
}

/* Return the stable node @folio points back to, or NULL if it is not a ksm folio. */
static inline struct ksm_stable_node *folio_stable_node(struct folio *folio)
{
        if (!folio_test_ksm(folio))
                return NULL;

        return folio_raw_mapping(folio);
}

/* Page flavour of folio_stable_node(): resolves @page to its folio first. */
static inline struct ksm_stable_node *page_stable_node(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_stable_node(folio);
}

/*
 * Point @folio back at @stable_node by storing the node's address, tagged
 * with PAGE_MAPPING_KSM, in folio->mapping. This is the "key" that
 * ksm_get_folio() compares against. Passing NULL leaves only the tag bits.
 * Warns if the folio is still an exclusive anonymous page.
 */
static inline void folio_set_stable_node(struct folio *folio,
                                         struct ksm_stable_node *stable_node)
{
        VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio);
        folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
/*
 * Remove @stable_node from the stable tree, provided its ksm folio is no
 * longer mapped anywhere.
 *
 * Returns 0 if the node is gone (removed here, or already found stale and
 * removed by ksm_get_folio()), or -EBUSY if the folio is still mapped and
 * the node had to be left in place.
 */
static int remove_stable_node(struct ksm_stable_node *stable_node)
{
        struct folio *folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK);
        int err;

        /* ksm_get_folio did remove_node_from_stable_tree itself. */
        if (!folio)
                return 0;

        if (folio_mapped(folio)) {
                /*
                 * Page could be still mapped if this races with __mmput()
                 * running in between ksm_exit() and exit_mmap(). Just refuse
                 * to let merge_across_nodes/max_page_sharing be switched.
                 */
                err = -EBUSY;
        } else {
                /*
                 * The stable node did not yet appear stale to ksm_get_folio(),
                 * since that allows for an unmapped ksm folio to be recognized
                 * right up until it is freed; but the node is safe to remove.
                 * This folio might be in an LRU cache waiting to be freed,
                 * or it might be in the swapcache (perhaps under writeback),
                 * or it might have been removed from swapcache a moment ago.
                 */
                folio_set_stable_node(folio, NULL);
                remove_node_from_stable_tree(stable_node);
                err = 0;
        }

        folio_unlock(folio);
        folio_put(folio);
        return err;
}

/*
 * Remove @stable_node from @root. A plain node is removed directly; for a
 * chain head every dup on the chain is removed first and then the head
 * itself is freed.
 *
 * Returns nonzero (true) if some node could not be removed because its page
 * is still mapped, zero (false) once everything is gone.
 */
static int remove_stable_node_chain(struct ksm_stable_node *stable_node,
                                    struct rb_root *root)
{
        struct ksm_stable_node *dup;
        struct hlist_node *hlist_safe;

        if (!is_stable_node_chain(stable_node)) {
                VM_BUG_ON(is_stable_node_dup(stable_node));
                return remove_stable_node(stable_node) != 0;
        }

        /* _safe iteration: remove_stable_node() unlinks and frees each dup */
        hlist_for_each_entry_safe(dup, hlist_safe,
                                  &stable_node->hlist, hlist_dup) {
                VM_BUG_ON(!is_stable_node_dup(dup));
                if (remove_stable_node(dup))
                        return true;
        }

        BUG_ON(!hlist_empty(&stable_node->hlist));
        free_stable_node_chain(stable_node, root);
        return false;
}

static int remove_all_stable_nodes(void)
{
        struct ksm_stable_node *stable_node, *next;
        int nid;
        int err = 0;

        for (nid = 0; nid < ksm_nr_node_ids; nid++) {
                while (root_stable_tree[nid].rb_node) {
                        stable_node = rb_entry(root_stable_tree[nid].rb_node,
                                                struct ksm_stable_node, node);
                        if (remove_stable_node_chain(stable_node,
                                                     root_stable_tree + nid)) {
                                err = -EBUSY;
                                break;        /* proceed to next nid */
                        }
                        cond_resched();
                }
        }
        list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
                if (remove_stable_node(stable_node))
                        err = -EBUSY;
                cond_resched();
        }
        return err;
}

/*
 * Walk every mm slot queued after ksm_mm_head, unmerge the KSM pages of each
 * VM_MERGEABLE VMA that has an anon_vma, and drop the slot's rmap items.
 * Slots whose mm is exiting are unhashed, unlinked and freed on the way.
 *
 * On success the stable nodes are removed (failures there are ignored),
 * ksm_scan.seqnr is reset and 0 is returned.  If unmerge_ksm_pages() fails,
 * the scan cursor is rewound to ksm_mm_head and that error is returned.
 */
static int unmerge_and_remove_all_rmap_items(void)
{
        struct ksm_mm_slot *mm_slot;
        struct mm_slot *slot;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        int err = 0;

        /* Point the scan cursor at the first slot following the list head. */
        spin_lock(&ksm_mmlist_lock);
        slot = list_entry(ksm_mm_head.slot.mm_node.next,
                          struct mm_slot, mm_node);
        ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
        spin_unlock(&ksm_mmlist_lock);

        /* The cursor is advanced inside the body; stop once it wraps to the head. */
        for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head;
             mm_slot = ksm_scan.mm_slot) {
                VMA_ITERATOR(vmi, mm_slot->slot.mm, 0);

                mm = mm_slot->slot.mm;
                mmap_read_lock(mm);

                /*
                 * Exit right away if mm is exiting to avoid lockdep issue in
                 * the maple tree
                 */
                if (ksm_test_exit(mm))
                        goto mm_exiting;

                for_each_vma(vmi, vma) {
                        /* Only mergeable VMAs with an anon_vma are unmerged. */
                        if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
                                continue;
                        err = unmerge_ksm_pages(vma,
                                                vma->vm_start, vma->vm_end, false);
                        if (err)
                                goto error;
                }

                /* Reached after the VMA walk and also directly for an exiting mm. */
mm_exiting:
                remove_trailing_rmap_items(&mm_slot->rmap_list);
                mmap_read_unlock(mm);

                /* Advance the cursor before this slot is possibly freed below. */
                spin_lock(&ksm_mmlist_lock);
                slot = list_entry(mm_slot->slot.mm_node.next,
                                  struct mm_slot, mm_node);
                ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
                if (ksm_test_exit(mm)) {
                        /* mm is exiting: unhash and unlink its slot under the lock ... */
                        hash_del(&mm_slot->slot.hash);
                        list_del(&mm_slot->slot.mm_node);
                        spin_unlock(&ksm_mmlist_lock);

                        /* ... then free it, clear the KSM flags and mmdrop() the mm. */
                        mm_slot_free(mm_slot_cache, mm_slot);
                        clear_bit(MMF_VM_MERGEABLE, &mm->flags);
                        clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
                        mmdrop(mm);
                } else
                        spin_unlock(&ksm_mmlist_lock);
        }

        /* Clean up stable nodes, but don't worry if some are still busy */
        remove_all_stable_nodes();
        ksm_scan.seqnr = 0;
        return 0;

error:
        /* mmap_read_lock(mm) from the failed iteration is still held here. */
        mmap_read_unlock(mm);
        spin_lock(&ksm_mmlist_lock);
        ksm_scan.mm_slot = &ksm_mm_head;
        spin_unlock(&ksm_mmlist_lock);
        return err;
}
#endif /* CONFIG_SYSFS */

/*
 * Return the xxhash (seed 0) of the PAGE_SIZE bytes of @page, read through
 * a temporary kmap_local_page() mapping.
 */
static u32 calc_checksum(struct page *page)
{
        void *kaddr = kmap_local_page(page);
        u32 sum = xxhash(kaddr, PAGE_SIZE, 0);

        kunmap_local(kaddr);
        return sum;
}

/*
 * write_protect_page - make the pte mapping @folio in @vma clean and read-only
 * @vma:      vma in which the mapping of @folio is looked up
 * @folio:    folio whose pte is to be write-protected; a large folio is
 *            rejected with a one-time warning
 * @orig_pte: on success, receives the pte value left in the page table
 *
 * Returns 0 on success.  Returns -EFAULT if the folio is large, has no
 * address in @vma, is not mapped by a pte there, or if the reference count
 * check or folio_try_share_anon_rmap_pte() fails; in the last two cases the
 * original pte is put back before returning.
 */
static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
                              pte_t *orig_pte)
{
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0);
        int swapped;
        int err = -EFAULT;
        struct mmu_notifier_range range;
        bool anon_exclusive;
        pte_t entry;

        if (WARN_ON_ONCE(folio_test_large(folio)))
                return err;

        pvmw.address = page_address_in_vma(&folio->page, vma);
        if (pvmw.address == -EFAULT)
                goto out;

        /* Notifier range covers exactly the one page being write-protected. */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address,
                                pvmw.address + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        if (!page_vma_mapped_walk(&pvmw))
                goto out_mn;
        if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
                goto out_unlock;

        anon_exclusive = PageAnonExclusive(&folio->page);
        entry = ptep_get(pvmw.pte);
        /* A pte that passes none of these tests is left untouched. */
        if (pte_write(entry) || pte_dirty(entry) ||
            anon_exclusive || mm_tlb_flush_pending(mm)) {
                swapped = folio_test_swapcache(folio);
                flush_cache_page(vma, pvmw.address, folio_pfn(folio));
                /*
                 * Ok this is tricky, when get_user_pages_fast() run it doesn't
                 * take any lock, therefore the check that we are going to make
                 * with the pagecount against the mapcount is racy and
                 * O_DIRECT can happen right after the check.
                 * So we clear the pte and flush the tlb before the check
                 * this assure us that no O_DIRECT can happen after the check
                 * or in the middle of the check.
                 *
                 * No need to notify as we are downgrading page table to read
                 * only not changing it to point to a new page.
                 *
                 * See Documentation/mm/mmu_notifier.rst
                 */
                entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
                /*
                 * Check that no O_DIRECT or similar I/O is in progress on the
                 * page
                 */
                if (folio_mapcount(folio) + 1 + swapped != folio_ref_count(folio)) {
                        /* Unexpected extra reference: restore the pte, fail. */
                        set_pte_at(mm, pvmw.address, pvmw.pte, entry);
                        goto out_unlock;
                }

                /* See folio_try_share_anon_rmap_pte(): clear PTE first. */
                if (anon_exclusive &&
                    folio_try_share_anon_rmap_pte(folio, &folio->page)) {
                        set_pte_at(mm, pvmw.address, pvmw.pte, entry);
                        goto out_unlock;
                }

                /* Carry the pte dirty bit over to the folio before cleaning it. */
                if (pte_dirty(entry))
                        folio_mark_dirty(folio);
                entry = pte_mkclean(entry);

                if (pte_write(entry))
                        entry = pte_wrprotect(entry);

                set_pte_at(mm, pvmw.address, pvmw.pte, entry);
        }
        /* Report the pte value now present in the page table to the caller. */
        *orig_pte = entry;
        err = 0;

out_unlock:
        page_vma_mapped_walk_done(&pvmw);
out_mn:
        mmu_notifier_invalidate_range_end(&range);
out:
        return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * The replacement is only carried out if the pte found for @page in @vma
 * still equals @orig_pte.  When @kpage is a zero pfn, the new pte is a
 * special, dirty pte and the mm's MM_ANONPAGES counter is decremented;
 * otherwise @kpage gains a reference and an anon rmap.  In both cases the
 * rmap and one reference of the old @page are dropped.
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
                        struct page *kpage, pte_t orig_pte)
{
        struct folio *kfolio = page_folio(kpage);
        struct mm_struct *mm = vma->vm_mm;
        struct folio *folio;
        pmd_t *pmd;
        pmd_t pmde;
        pte_t *ptep;
        pte_t newpte;
        spinlock_t *ptl;
        unsigned long addr;
        int err = -EFAULT;
        struct mmu_notifier_range range;

        addr = page_address_in_vma(page, vma);
        if (addr == -EFAULT)
                goto out;

        pmd = mm_find_pmd(mm, addr);
        if (!pmd)
                goto out;
        /*
         * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
         * without holding anon_vma lock for write.  So when looking for a
         * genuine pmde (in which to find pte), test present and !THP together.
         */
        pmde = pmdp_get_lockless(pmd);
        if (!pmd_present(pmde) || pmd_trans_huge(pmde))
                goto out;

        /* Notifier range covers exactly the one page being replaced. */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
                                addr + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
        if (!ptep)
                goto out_mn;
        /* Bail out if the pte no longer matches the value the caller saw. */
        if (!pte_same(ptep_get(ptep), orig_pte)) {
                pte_unmap_unlock(ptep, ptl);
                goto out_mn;
        }
        VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
        VM_BUG_ON_FOLIO(folio_test_anon(kfolio) && PageAnonExclusive(kpage),
                        kfolio);

        /*
         * No need to check ksm_use_zero_pages here: we can only have a
         * zero_page here if ksm_use_zero_pages was enabled already.
         */
        if (!is_zero_pfn(page_to_pfn(kpage))) {
                /* Regular ksm page: take a reference and add the anon rmap. */
                folio_get(kfolio);
                folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE);
                newpte = mk_pte(kpage, vma->vm_page_prot);
        } else {
                /*
                 * Use pte_mkdirty to mark the zero page mapped by KSM, and then
                 * we can easily track all KSM-placed zero pages by checking if
                 * the dirty bit in zero page's PTE is set.
                 */
                newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
                ksm_map_zero_page(mm);
                /*
                 * We're replacing an anonymous page with a zero page, which is
                 * not anonymous. We need to do proper accounting otherwise we
                 * will get wrong values in /proc, and a BUG message in dmesg
                 * when tearing down the mm.
                 */
                dec_mm_counter(mm, MM_ANONPAGES);
        }

        flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
        /*
         * No need to notify as we are replacing a read only page with another
         * read only page with the same content.
         *
         * See Documentation/mm/mmu_notifier.rst
         */
        ptep_clear_flush(vma, addr, ptep);
        set_pte_at(mm, addr, ptep, newpte);

        /* The old page is no longer mapped here: drop its rmap and a reference. */
        folio = page_folio(page);
        folio_remove_rmap_pte(folio, page, vma);
        if (!folio_mapped(folio))
                folio_free_swap(folio);
        folio_put(folio);

        pte_unmap_unlock(ptep, ptl);
        err = 0;
out_mn:
        mmu_notifier_invalidate_range_end(&range);
out:
        return err;
}

/*
 * try_to_merge_one_page - merge @page into @kpage, or promote @page to ksm
 * @vma:   the vma that holds the pte pointing to @page
 * @page:  the PageAnon page that should be replaced by @kpage
 * @kpage: the PageKsm page to map in place of @page, or NULL on the first
 *         pass, when @page itself is to become the ksm page
 *
 * Returns 0 when the pages were merged (or were the same page to begin
 * with), -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
                                 struct page *page, struct page *kpage)
{
        pte_t orig_pte = __pte(0);
        int ret = -EFAULT;

        /* ksm page forked */
        if (page == kpage)
                return 0;

        if (!PageAnon(page))
                return ret;

        /*
         * write_protect_page() needs the page lock to see a stable
         * PageSwapCache.  Only trylock: rather than sleeping here we move
         * on to other pages and retry this one once it has been unlocked.
         */
        if (!trylock_page(page))
                return ret;

        if (PageTransCompound(page) && split_huge_page(page))
                goto unlock;

        /*
         * A page mapped only here may still need its pte write-protected;
         * a page mapped elsewhere too already has read-only ptes.  Either
         * way the lock is needed and page_count must not be raised.
         */
        if (write_protect_page(vma, page_folio(page), &orig_pte) != 0)
                goto unlock;

        if (kpage) {
                if (pages_identical(page, kpage))
                        ret = replace_page(vma, page, kpage, orig_pte);
                goto unlock;
        }

        /*
         * No kpage yet: with the page lock held, turn the page from
         * PageAnon+anon_vma into PageKsm with a NULL stable_node, which
         * stable_tree_insert() fills in later.
         */
        folio_set_stable_node(page_folio(page), NULL);
        mark_page_accessed(page);
        /*
         * Reclaim simply frees clean pages that have no dirty ptes, so
         * mark the ksm page dirty to make sure it gets swapped instead.
         */
        if (!PageDirty(page))
                SetPageDirty(page);
        ret = 0;

unlock:
        unlock_page(page);
        return ret;
}

/*
 * try_to_merge_with_ksm_page - merge @page of @rmap_item with @kpage
 *
 * Looks up the mergeable vma for @rmap_item under mmap_read_lock and hands
 * the pages to try_to_merge_one_page().  Unlike try_to_merge_two_pages(),
 * no second rmap_item is involved.
 *
 * Returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item,
                                      struct page *page, struct page *kpage)
{
        struct mm_struct *mm = rmap_item->mm;
        struct vm_area_struct *vma;
        int ret = -EFAULT;

        mmap_read_lock(mm);

        vma = find_mergeable_vma(mm, rmap_item->address);
        if (vma)
                ret = try_to_merge_one_page(vma, page, kpage);

        if (!ret) {
                /*
                 * The unstable nid shares a union with the stable anon_vma,
                 * so the item has to leave the tree before anon_vma is set.
                 */
                remove_rmap_item_from_tree(rmap_item);

                /* The anon_vma reference must be taken under mmap_lock. */
                rmap_item->anon_vma = vma->anon_vma;
                get_anon_vma(vma->anon_vma);
        }

        mmap_read_unlock(mm);
        trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page),
                                rmap_item, mm, ret);
        return ret;
}

/*
 * try_to_merge_two_pages - turn two identical pages into one ksm page
 *
 * @page is first upgraded to a ksm page, then @tree_page is merged into it.
 * Because this upgrades @page, try_to_merge_with_ksm_page() is the function
 * to use when one of the pages already is a ksm page.
 *
 * Returns @page (now the ksm page) when both steps succeeded, NULL otherwise.
 */
static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
                                           struct page *page,
                                           struct ksm_rmap_item *tree_rmap_item,
                                           struct page *tree_page)
{
        if (try_to_merge_with_ksm_page(rmap_item, page, NULL))
                return NULL;

        if (try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page)) {
                /*
                 * The second merge failed, leaving a ksm page that only
                 * a single pte points to: break it up again.
                 */
                break_cow(rmap_item);
                return NULL;
        }

        return page;
}

/*
 * Tell whether @stable_node can take further sharers: it must still have at
 * least one mapping, and its mapping count plus @offset must stay below
 * ksm_max_page_sharing.
 */
static __always_inline
bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset)
{
        VM_BUG_ON(stable_node->rmap_hlist_len < 0);

        /*
         * With no mapping left there is not much point in merging with
         * this stable_node: the tree_page behind it is about to be freed.
         */
        if (!stable_node->rmap_hlist_len)
                return false;

        return stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
}

/*
 * Sharing-candidate check with no extra headroom: same test as
 * __is_page_sharing_candidate() with an offset of 0.
 */
static __always_inline
bool is_page_sharing_candidate(struct ksm_stable_node *stable_node)
{
        return __is_page_sharing_candidate(stable_node, 0);
}

/*
 * Walk the dups of the chain *_stable_node and pick one that is still a
 * page sharing candidate.
 *
 * When pruning is requested and the chain's prune interval has elapsed, the
 * whole chain is walked and the candidate with the largest rmap_hlist_len
 * wins; otherwise the walk stops at the first candidate found.
 *
 * *_stable_node_dup receives the chosen dup (NULL if none).  The return
 * value is the folio obtained from ksm_get_folio() for that dup, which is
 * not put here, or NULL if no dup was chosen.  If a pruning walk finds a
 * single dup in the chain, the chain is collapsed and *_stable_node is
 * overwritten with that dup.
 */
static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
                                     struct ksm_stable_node **_stable_node,
                                     struct rb_root *root,
                                     bool prune_stale_stable_nodes)
{
        struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node;
        struct hlist_node *hlist_safe;
        struct folio *folio, *tree_folio = NULL;
        int nr = 0;
        /* Only read once "found" is non-NULL, by which time it has been set. */
        int found_rmap_hlist_len;

        /* Prune only if asked to and the chain's prune interval has elapsed. */
        if (!prune_stale_stable_nodes ||
            time_before(jiffies, stable_node->chain_prune_time +
                        msecs_to_jiffies(
                                ksm_stable_node_chains_prune_millisecs)))
                prune_stale_stable_nodes = false;
        else
                stable_node->chain_prune_time = jiffies;

        hlist_for_each_entry_safe(dup, hlist_safe,
                                  &stable_node->hlist, hlist_dup) {
                cond_resched();
                /*
                 * We must walk all stable_node_dup to prune the stale
                 * stable nodes during lookup.
                 *
                 * ksm_get_folio can drop the nodes from the
                 * stable_node->hlist if they point to freed pages
                 * (that's why we do a _safe walk). The "dup"
                 * stable_node parameter itself will be freed from
                 * under us if it returns NULL.
                 */
                folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK);
                if (!folio)
                        continue;
                nr += 1;
                if (is_page_sharing_candidate(dup)) {
                        if (!found ||
                            dup->rmap_hlist_len > found_rmap_hlist_len) {
                                /* Drop the folio of the candidate being replaced. */
                                if (found)
                                        folio_put(tree_folio);
                                found = dup;
                                found_rmap_hlist_len = found->rmap_hlist_len;
                                tree_folio = folio;

                                /* skip folio_put for found dup */
                                if (!prune_stale_stable_nodes)
                                        break;
                                continue;
                        }
                }
                folio_put(folio);
        }

        if (found) {
                /*
                 * nr is counting all dups in the chain only if
                 * prune_stale_stable_nodes is true, otherwise we may
                 * break the loop at nr == 1 even if there are
                 * multiple entries.
                 */
                if (prune_stale_stable_nodes && nr == 1) {
                        /*
                         * If there's not just one entry it would
                         * corrupt memory, better BUG_ON. In KSM
                         * context with no lock held it's not even
                         * fatal.
                         */
                        BUG_ON(stable_node->hlist.first->next);

                        /*
                         * There's just one entry and it is below the
                         * deduplication limit so drop the chain.
                         */
                        rb_replace_node(&stable_node->node, &found->node,
                                        root);
                        free_stable_node(stable_node);
                        ksm_stable_node_chains--;
                        ksm_stable_node_dups--;
                        /*
                         * NOTE: the caller depends on the stable_node
                         * to be equal to stable_node_dup if the chain
                         * was collapsed.
                         */
                        *_stable_node = found;
                        /*
                         * Just for robustness, as stable_node is
                         * otherwise left as a stable pointer, the
                         * compiler shall optimize it away at build
                         * time.
                         */
                        stable_node = NULL;
                } else if (stable_node->hlist.first != &found->hlist_dup &&
                           __is_page_sharing_candidate(found, 1)) {
                        /*
                         * If the found stable_node dup can accept one
                         * more future merge (in addition to the one
                         * that is underway) and is not at the head of
                         * the chain, put it there so next search will
                         * be quicker in the !prune_stale_stable_nodes
                         * case.
                         *
                         * NOTE: it would be inaccurate to use nr > 1
                         * instead of checking the hlist.first pointer
                         * directly, because in the
                         * prune_stale_stable_nodes case "nr" isn't
                         * the position of the found dup in the chain,
                         * but the total number of dups in the chain.
                         */
                        hlist_del(&found->hlist_dup);
                        hlist_add_head(&found->hlist_dup,
                                       &stable_node->hlist);
                }
        }

        *_stable_node_dup = found;
        return tree_folio;
}

/*
 * Return some stable node holding the content represented by @stable_node:
 * the node itself when it is not a chain, otherwise the first dup of the
 * chain.  An empty chain is freed and NULL is returned.
 */
static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node,
                                               struct rb_root *root)
{
        if (!is_stable_node_chain(stable_node))
                return stable_node;

        if (!hlist_empty(&stable_node->hlist))
                return hlist_entry(stable_node->hlist.first,
                                   typeof(*stable_node), hlist_dup);

        /* Chain without any dup left: get rid of it. */
        free_stable_node_chain(stable_node, root);
        return NULL;
}

/*
 * Pick the stable node dup to merge with and return its folio.
 *
 * As with ksm_get_folio, *_stable_node and *_stable_node_dup may have been
 * freed when the returned folio is NULL, so the caller has to check the
 * return value before dereferencing either of them.
 *
 * If a chain gets collapsed, *_stable_node is freed and overwritten with
 * the dup that was found, so that it equals *_stable_node_dup as if the
 * chain had never existed.
 *
 * *_stable_node_dup is purely an output parameter: it is written on every
 * path and needs no initialization by the caller.
 */
static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup,
                                         struct ksm_stable_node **_stable_node,
                                         struct rb_root *root,
                                         bool prune_stale_stable_nodes)
{
        struct ksm_stable_node *node = *_stable_node;

        if (is_stable_node_chain(node))
                return stable_node_dup(_stable_node_dup, _stable_node, root,
                                       prune_stale_stable_nodes);

        if (!is_page_sharing_candidate(node)) {
                /*
                 * A NULL *_stable_node_dup tells the caller that this
                 * stable_node reached the ksm_max_page_sharing limit.
                 */
                *_stable_node_dup = NULL;
                return NULL;
        }

        *_stable_node_dup = node;
        return ksm_get_folio(node, KSM_GET_FOLIO_NOLOCK);
}

/*
 * __stable_node_chain() with pruning requested.  *s_n may be overwritten by
 * the callee, which is why it is passed by reference here.
 */
static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d,
                                                 struct ksm_stable_node **s_n,
                                                 struct rb_root *root)
{
        return __stable_node_chain(s_n_d, s_n, root, true);
}

/*
 * __stable_node_chain() without pruning.  The stable node is taken by value
 * because, with pruning off, the callee is not expected to replace it.
 */
static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d,
                                           struct ksm_stable_node *s_n,
                                           struct rb_root *root)
{
        struct ksm_stable_node *node = s_n;
        struct folio *folio = __stable_node_chain(s_n_d, &node, root, false);

        /* No dups were pruned, so the node must be the one passed in. */
        VM_BUG_ON(node != s_n);
        return folio;
}

/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * If @page is itself a KSM page parked on the migrate_nodes list, its
 * stable node is additionally moved into the stable tree (or into a chain)
 * of the nid that matches the page.
 *
 * Returns, with a reference held, the page of the stable tree that has
 * identical content (which may be @page itself); NULL if there is no usable
 * match; or ERR_PTR(-EBUSY) if the matching folio could not be locked.
 */
static struct page *stable_tree_search(struct page *page)
{
        int nid;
        struct rb_root *root;
        struct rb_node **new;
        struct rb_node *parent;
        struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
        struct ksm_stable_node *page_node;
        struct folio *folio;

        folio = page_folio(page);
        page_node = folio_stable_node(folio);
        if (page_node && page_node->head != &migrate_nodes) {
                /* ksm page forked */
                folio_get(folio);
                return &folio->page;
        }

        nid = get_kpfn_nid(folio_pfn(folio));
        root = root_stable_tree + nid;
again:
        new = &root->rb_node;
        parent = NULL;

        while (*new) {
                struct folio *tree_folio;
                int ret;

                cond_resched();
                stable_node = rb_entry(*new, struct ksm_stable_node, node);
                stable_node_any = NULL;
                tree_folio = chain_prune(&stable_node_dup, &stable_node, root);
                /*
                 * NOTE: stable_node may have been freed by
                 * chain_prune() if the returned stable_node_dup is
                 * not NULL. stable_node_dup may have been inserted in
                 * the rbtree instead as a regular stable_node (in
                 * order to collapse the stable_node chain if a single
                 * stable_node dup was found in it). In such case the
                 * stable_node is overwritten by the callee to point
                 * to the stable_node_dup that was collapsed in the
                 * stable rbtree and stable_node will be equal to
                 * stable_node_dup like if the chain never existed.
                 */
                if (!stable_node_dup) {
                        /*
                         * Either all stable_node dups were full in
                         * this stable_node chain, or this chain was
                         * empty and should be rb_erased.
                         */
                        stable_node_any = stable_node_dup_any(stable_node,
                                                              root);
                        if (!stable_node_any) {
                                /* rb_erase just run */
                                goto again;
                        }
                        /*
                         * Take any of the stable_node dups page of
                         * this stable_node chain to let the tree walk
                         * continue. All KSM pages belonging to the
                         * stable_node dups in a stable_node chain
                         * have the same content and they're
                         * write protected at all times. Any will work
                         * fine to continue the walk.
                         */
                        tree_folio = ksm_get_folio(stable_node_any,
                                                   KSM_GET_FOLIO_NOLOCK);
                }
                /* Exactly one of stable_node_dup / stable_node_any is set. */
                VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
                if (!tree_folio) {
                        /*
                         * If we walked over a stale stable_node,
                         * ksm_get_folio() will call rb_erase() and it
                         * may rebalance the tree from under us. So
                         * restart the search from scratch. Returning
                         * NULL would be safe too, but we'd generate
                         * false negative insertions just because some
                         * stable_node was stale.
                         */
                        goto again;
                }

                ret = memcmp_pages(page, &tree_folio->page);
                folio_put(tree_folio);

                parent = *new;
                if (ret < 0)
                        new = &parent->rb_left;
                else if (ret > 0)
                        new = &parent->rb_right;
                else {
                        if (page_node) {
                                VM_BUG_ON(page_node->head != &migrate_nodes);
                                /*
                                 * If the mapcount of our migrated KSM folio is
                                 * at most 1, we can merge it with another
                                 * KSM folio where we know that we have space
                                 * for one more mapping without exceeding the
                                 * ksm_max_page_sharing limit: see
                                 * chain_prune(). This way, we can avoid adding
                                 * this stable node to the chain.
                                 */
                                if (folio_mapcount(folio) > 1)
                                        goto chain_append;
                        }

                        if (!stable_node_dup) {
                                /*
                                 * If the stable_node is a chain and
                                 * we got a payload match in memcmp
                                 * but we cannot merge the scanned
                                 * page in any of the existing
                                 * stable_node dups because they're
                                 * all full, we need to wait the
                                 * scanned page to find itself a match
                                 * in the unstable tree to create a
                                 * brand new KSM page to add later to
                                 * the dups of this stable_node.
                                 */
                                return NULL;
                        }

                        /*
                         * Lock and unlock the stable_node's page (which
                         * might already have been migrated) so that page
                         * migration is sure to notice its raised count.
                         * It would be more elegant to return stable_node
                         * than kpage, but that involves more changes.
                         */
                        tree_folio = ksm_get_folio(stable_node_dup,
                                                   KSM_GET_FOLIO_TRYLOCK);

                        if (PTR_ERR(tree_folio) == -EBUSY)
                                return ERR_PTR(-EBUSY);

                        if (unlikely(!tree_folio))
                                /*
                                 * The tree may have been rebalanced,
                                 * so re-evaluate parent and new.
                                 */
                                goto again;
                        folio_unlock(tree_folio);

                        /* The matching KSM page now lives on another nid. */
                        if (get_kpfn_nid(stable_node_dup->kpfn) !=
                            NUMA(stable_node_dup->nid)) {
                                folio_put(tree_folio);
                                goto replace;
                        }
                        return &tree_folio->page;
                }
        }

        /* No match in the tree, and nothing of ours to insert. */
        if (!page_node)
                return NULL;

        /* Move our migrated stable node into this nid's tree. */
        list_del(&page_node->list);
        DO_NUMA(page_node->nid = nid);
        rb_link_node(&page_node->node, parent, new);
        rb_insert_color(&page_node->node, root);
out:
        if (is_page_sharing_candidate(page_node)) {
                folio_get(folio);
                return &folio->page;
        } else
                return NULL;

replace:
        /*
         * If stable_node was a chain and chain_prune collapsed it,
         * stable_node has been updated to be the new regular
         * stable_node. A collapse of the chain is indistinguishable
         * from the case there was no chain in the stable
         * rbtree. Otherwise stable_node is the chain and
         * stable_node_dup is the dup to replace.
         */
        if (stable_node_dup == stable_node) {
                VM_BUG_ON(is_stable_node_chain(stable_node_dup));
                VM_BUG_ON(is_stable_node_dup(stable_node_dup));
                /* there is no chain */
                if (page_node) {
                        VM_BUG_ON(page_node->head != &migrate_nodes);
                        list_del(&page_node->list);
                        DO_NUMA(page_node->nid = nid);
                        rb_replace_node(&stable_node_dup->node,
                                        &page_node->node,
                                        root);
                        if (is_page_sharing_candidate(page_node))
                                folio_get(folio);
                        else
                                folio = NULL;
                } else {
                        rb_erase(&stable_node_dup->node, root);
                        folio = NULL;
                }
        } else {
                VM_BUG_ON(!is_stable_node_chain(stable_node));
                __stable_node_dup_del(stable_node_dup);
                if (page_node) {
                        VM_BUG_ON(page_node->head != &migrate_nodes);
                        list_del(&page_node->list);
                        DO_NUMA(page_node->nid = nid);
                        stable_node_chain_add_dup(page_node, stable_node);
                        if (is_page_sharing_candidate(page_node))
                                folio_get(folio);
                        else
                                folio = NULL;
                } else {
                        folio = NULL;
                }
        }
        /* Park the displaced node on migrate_nodes. */
        stable_node_dup->head = &migrate_nodes;
        list_add(&stable_node_dup->list, stable_node_dup->head);
        /*
         * folio may have been cleared above: return NULL explicitly
         * rather than taking the address of a member of a NULL folio.
         */
        return folio ? &folio->page : NULL;

chain_append:
        /* stable_node_dup could be null if it reached the limit */
        if (!stable_node_dup)
                stable_node_dup = stable_node_any;
        /*
         * If stable_node was a chain and chain_prune collapsed it,
         * stable_node has been updated to be the new regular
         * stable_node. A collapse of the chain is indistinguishable
         * from the case there was no chain in the stable
         * rbtree. Otherwise stable_node is the chain and
         * stable_node_dup is the dup to replace.
         */
        if (stable_node_dup == stable_node) {
                VM_BUG_ON(is_stable_node_dup(stable_node_dup));
                /* chain is missing so create it */
                stable_node = alloc_stable_node_chain(stable_node_dup,
                                                      root);
                if (!stable_node)
                        return NULL;
        }
        /*
         * Add this stable_node dup that was
         * migrated to the stable_node chain
         * of the current nid for this page
         * content.
         */
        VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
        VM_BUG_ON(page_node->head != &migrate_nodes);
        list_del(&page_node->list);
        DO_NUMA(page_node->nid = nid);
        stable_node_chain_add_dup(page_node, stable_node);
        goto out;
}

/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * The tree used is root_stable_tree[nid], with nid derived from the pfn of
 * @kfolio. The tree is walked comparing @kfolio with one folio per visited
 * node using memcmp_pages(). If the walk reaches an empty link, the new node
 * is linked there as a regular rbtree node. If a visited node compares equal,
 * the new node is instead added as a dup of that node's chain; the chain is
 * created first when the existing node is not a chain yet.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise (allocation of the node, or of the chain it needs, failed).
 */
static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio)
{
        int nid;
        unsigned long kpfn;
        struct rb_root *root;
        struct rb_node **new;
        struct rb_node *parent;
        struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
        bool need_chain = false;

        kpfn = folio_pfn(kfolio);
        nid = get_kpfn_nid(kpfn);
        root = root_stable_tree + nid;
again:
        /* (Re)start the walk from the root of the selected tree. */
        parent = NULL;
        new = &root->rb_node;

        while (*new) {
                struct folio *tree_folio;
                int ret;

                cond_resched();
                stable_node = rb_entry(*new, struct ksm_stable_node, node);
                stable_node_any = NULL;
                tree_folio = chain(&stable_node_dup, stable_node, root);
                if (!stable_node_dup) {
                        /*
                         * Either all stable_node dups were full in
                         * this stable_node chain, or this chain was
                         * empty and should be rb_erased.
                         */
                        stable_node_any = stable_node_dup_any(stable_node,
                                                              root);
                        if (!stable_node_any) {
                                /* rb_erase just run */
                                goto again;
                        }
                        /*
                         * Take any of the stable_node dups page of
                         * this stable_node chain to let the tree walk
                         * continue. All KSM pages belonging to the
                         * stable_node dups in a stable_node chain
                         * have the same content and they're
                         * write protected at all times. Any will work
                         * fine to continue the walk.
                         */
                        tree_folio = ksm_get_folio(stable_node_any,
                                                   KSM_GET_FOLIO_NOLOCK);
                }
                /* Exactly one of stable_node_dup / stable_node_any is set. */
                VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
                if (!tree_folio) {
                        /*
                         * If we walked over a stale stable_node,
                         * ksm_get_folio() will call rb_erase() and it
                         * may rebalance the tree from under us. So
                         * restart the search from scratch. Returning
                         * NULL would be safe too, but we'd generate
                         * false negative insertions just because some
                         * stable_node was stale.
                         */
                        goto again;
                }

                /*
                 * tree_folio is only needed for the comparison: it is
                 * released right away and only the result is used below.
                 */
                ret = memcmp_pages(&kfolio->page, &tree_folio->page);
                folio_put(tree_folio);

                parent = *new;
                if (ret < 0)
                        new = &parent->rb_left;
                else if (ret > 0)
                        new = &parent->rb_right;
                else {
                        /*
                         * Pages compare equal: stop here, stable_node is
                         * the node the new dup will be chained to.
                         */
                        need_chain = true;
                        break;
                }
        }

        /* Allocate and initialise the node that will represent kfolio. */
        stable_node_dup = alloc_stable_node();
        if (!stable_node_dup)
                return NULL;

        INIT_HLIST_HEAD(&stable_node_dup->hlist);
        stable_node_dup->kpfn = kpfn;
        stable_node_dup->rmap_hlist_len = 0;
        DO_NUMA(stable_node_dup->nid = nid);
        if (!need_chain) {
                /* No equal node: link at the empty slot the walk ended on. */
                rb_link_node(&stable_node_dup->node, parent, new);
                rb_insert_color(&stable_node_dup->node, root);
        } else {
                if (!is_stable_node_chain(stable_node)) {
                        struct ksm_stable_node *orig = stable_node;
                        /* chain is missing so create it */
                        stable_node = alloc_stable_node_chain(orig, root);
                        if (!stable_node) {
                                /* Release the dup allocated above. */
                                free_stable_node(stable_node_dup);
                                return NULL;
                        }
                }
                stable_node_chain_add_dup(stable_node_dup, stable_node);
        }

        /* Record the new node as kfolio's stable node. */
        folio_set_stable_node(kfolio, stable_node_dup);

        return stable_node_dup;
}

/*
 * unstable_tree_search_insert - look up a page identical to the one being
 * scanned in the unstable tree, inserting rmap_item when there is none.
 *
 * Searching and inserting are done by the same function because both use
 * the same rbtree walk: the walk either stops on an identical page or ends
 * on the empty link where rmap_item has to be inserted.
 *
 * Returns the rmap_item found to be identical to the scanned page, with the
 * matching page stored in *tree_pagep (still holding the reference obtained
 * from get_mergeable_page()). Returns NULL in every other case, including
 * when rmap_item has just been inserted as a new object in the tree.
 */
static
struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item,
                                              struct page *page,
                                              struct page **tree_pagep)
{
        int nid = get_kpfn_nid(page_to_pfn(page));
        struct rb_root *root = root_unstable_tree + nid;
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct ksm_rmap_item *candidate;
                struct page *candidate_page;
                int cmp;

                cond_resched();
                candidate = rb_entry(*link, struct ksm_rmap_item, node);
                candidate_page = get_mergeable_page(candidate);
                if (!candidate_page)
                        return NULL;

                /* Don't substitute a ksm page for a forked page. */
                if (candidate_page == page) {
                        put_page(candidate_page);
                        return NULL;
                }

                cmp = memcmp_pages(page, candidate_page);
                parent = *link;

                if (!cmp) {
                        if (ksm_merge_across_nodes ||
                            page_to_nid(candidate_page) == nid) {
                                /* Match: hand the page over to the caller. */
                                *tree_pagep = candidate_page;
                                return candidate;
                        }
                        /*
                         * If tree_page has been migrated to another NUMA node,
                         * it will be flushed out and put in the right unstable
                         * tree next time: only merge with it when across_nodes.
                         */
                        put_page(candidate_page);
                        return NULL;
                }

                /* Different content: release the page and keep descending. */
                put_page(candidate_page);
                link = (cmp < 0) ? &parent->rb_left : &parent->rb_right;
        }

        /* Nothing identical found: rmap_item becomes a new tree node. */
        rmap_item->address |= UNSTABLE_FLAG | (ksm_scan.seqnr & SEQNR_MASK);
        DO_NUMA(rmap_item->nid = nid);
        rb_link_node(&rmap_item->node, parent, link);
        rb_insert_color(&rmap_item->node, root);

        ksm_pages_unshared++;
        return NULL;
}

/*
 * stable_tree_append - hook one more rmap_item onto the hlist of a stable
 * tree node, so that it shares that node's ksm page with the rmap_items
 * already listed there.
 *
 * @max_page_sharing_bypass suppresses the warning normally issued when the
 * node's rmap_hlist_len grows beyond ksm_max_page_sharing.
 */
static void stable_tree_append(struct ksm_rmap_item *rmap_item,
                               struct ksm_stable_node *stable_node,
                               bool max_page_sharing_bypass)
{
        /*
         * The rmap_item must go into the right stable_node duplicate or
         * rmap will not find this mapping, and page migration could break
         * later on, so crashing here is no worse. Strictly the check needed
         * is rmap_hlist_len == STABLE_NODE_CHAIN, but any other negative
         * value seen here for the first time (rather than while decreasing
         * rmap_hlist_len) would mean the stable_node memory is corrupted.
         */
        BUG_ON(stable_node->rmap_hlist_len < 0);

        stable_node->rmap_hlist_len += 1;
        if (!max_page_sharing_bypass) {
                /* possibly non fatal but unexpected overflow, only warn */
                WARN_ON_ONCE(stable_node->rmap_hlist_len > ksm_max_page_sharing);
        }

        rmap_item->head = stable_node;
        rmap_item->address |= STABLE_FLAG;
        hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

        /* A lone entry on the hlist is "shared"; any further one is "sharing". */
        if (!rmap_item->hlist.next)
                ksm_pages_shared++;
        else
                ksm_pages_sharing++;

        rmap_item->mm->ksm_merging_pages++;
}

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 *
 * Nothing is returned: the outcome is visible only through the trees, the
 * rmap_item and the counters updated by the helpers called from here.
 */
static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
{
        struct mm_struct *mm = rmap_item->mm;
        struct ksm_rmap_item *tree_rmap_item;
        struct page *tree_page = NULL;
        struct ksm_stable_node *stable_node;
        struct page *kpage;
        unsigned int checksum;
        int err;
        bool max_page_sharing_bypass = false;

        stable_node = page_stable_node(page);
        if (stable_node) {
                /*
                 * The node is not on migrate_nodes, but the nid computed
                 * from its kpfn differs from the nid recorded in it: take
                 * it out with stable_node_dup_del() and park it on the
                 * migrate_nodes list.
                 */
                if (stable_node->head != &migrate_nodes &&
                    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
                    NUMA(stable_node->nid)) {
                        stable_node_dup_del(stable_node);
                        stable_node->head = &migrate_nodes;
                        list_add(&stable_node->list, stable_node->head);
                }
                /*
                 * The node is still in place and rmap_item already hangs
                 * off it: there is nothing left to do for this page.
                 */
                if (stable_node->head != &migrate_nodes &&
                    rmap_item->head == stable_node)
                        return;
                /*
                 * If it's a KSM fork, allow it to go over the sharing limit
                 * without warnings.
                 */
                if (!is_page_sharing_candidate(stable_node))
                        max_page_sharing_bypass = true;
        }

        /* We first start with searching the page inside the stable tree */
        kpage = stable_tree_search(page);
        /*
         * The search returned the scanned page itself and rmap_item already
         * points at its stable node: put the page back and stop.
         */
        if (kpage == page && rmap_item->head == stable_node) {
                put_page(kpage);
                return;
        }

        remove_rmap_item_from_tree(rmap_item);

        if (kpage) {
                /*
                 * kpage may be an error pointer carrying -EBUSY rather than
                 * a page: in that case nothing more is attempted here.
                 */
                if (PTR_ERR(kpage) == -EBUSY)
                        return;

                err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
                if (!err) {
                        /*
                         * The page was successfully merged:
                         * add its rmap_item to the stable tree.
                         */
                        lock_page(kpage);
                        stable_tree_append(rmap_item, page_stable_node(kpage),
                                           max_page_sharing_bypass);
                        unlock_page(kpage);
                }
                put_page(kpage);
                return;
        }

        /*
         * If the hash value of the page has changed from the last time
         * we calculated it, this page is changing frequently: therefore we
         * don't want to insert it in the unstable tree, and we don't want
         * to waste our time searching for something identical to it there.
         */
        checksum = calc_checksum(page);
        if (rmap_item->oldchecksum != checksum) {
                /* Remember the new value for the comparison on a later call. */
                rmap_item->oldchecksum = checksum;
                return;
        }

        /*
         * Same checksum as an empty page. We attempt to merge it with the
         * appropriate zero page if the user enabled this via sysfs.
         */
        if (ksm_use_zero_pages && (checksum == zero_checksum)) {
                struct vm_area_struct *vma;

                mmap_read_lock(mm);
                vma = find_mergeable_vma(mm, rmap_item->address);
                if (vma) {
                        err = try_to_merge_one_page(vma, page,
                                        ZERO_PAGE(rmap_item->address));
                        trace_ksm_merge_one_page(
                                page_to_pfn(ZERO_PAGE(rmap_item->address)),
                                rmap_item, mm, err);
                } else {
                        /*
                         * If the vma is out of date, we do not need to
                         * continue.
                         */
                        err = 0;
                }
                mmap_read_unlock(mm);
                /*
                 * In case of failure, the page was not really empty, so we
                 * need to continue. Otherwise we're done.
                 */
                if (!err)
                        return;
        }
        /*
         * Look for an identical page in the unstable tree; when there is
         * none, rmap_item is inserted there and NULL comes back.
         */
        tree_rmap_item =
                unstable_tree_search_insert(rmap_item, page, &tree_page);
        if (tree_rmap_item) {
                bool split;

                kpage = try_to_merge_two_pages(rmap_item, page,
                                                tree_rmap_item, tree_page);
                /*
                 * If both pages we tried to merge belong to the same compound
                 * page, then we actually ended up increasing the reference
                 * count of the same compound page twice, and split_huge_page
                 * failed.
                 * Here we set a flag if that happened, and we use it later to
                 * try split_huge_page again. Since we call put_page right
                 * afterwards, the reference count will be correct and
                 * split_huge_page should succeed.
                 */
                split = PageTransCompound(page)
                        && compound_head(page) == compound_head(tree_page);
                put_page(tree_page);
                if (kpage) {
                        /*
                         * The pages were successfully merged: insert new
                         * node in the stable tree and add both rmap_items.
                         */
                        lock_page(kpage);
                        stable_node = stable_tree_insert(page_folio(kpage));
                        if (stable_node) {
                                stable_tree_append(tree_rmap_item, stable_node,
                                                   false);
                                stable_tree_append(rmap_item, stable_node,
                                                   false);
                        }
                        unlock_page(kpage);

                        /*
                         * If we fail to insert the page into the stable tree,
                         * we will have 2 virtual addresses that are pointing
                         * to a ksm page left outside the stable tree,
                         * in which case we need to break_cow on both.
                         */
                        if (!stable_node) {
                                break_cow(tree_rmap_item);
                                break_cow(rmap_item);
                        }
                } else if (split) {
                        /*
                         * We are here if we tried to merge two pages and
                         * failed because they both belonged to the same
                         * compound page. We will split the page now, but no
                         * merging will take place.
                         * We do not want to add the cost of a full lock; if
                         * the page is locked, it is better to skip it and
                         * perhaps try again later.
                         */
                        if (!trylock_page(page))
                                return;
                        split_huge_page(page);
                        unlock_page(page);
                }
        }
}

static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
                                            struct ksm_rmap_item **rmap_list,
                                            unsigned long addr)
{
        struct ksm_rmap_item *rmap_item;

        while (*rmap_list) {
                rmap_item = *rmap_list;
                if ((rmap_item->address & PAGE_MASK) == addr)
                        return rmap_item;
                if (rmap_item->address > addr)
                        break;
                *rmap_list = rmap_item->rmap_list;
                remove_rmap_item_from_tree(rmap_item);
                free_rmap_item(rmap_item);
        }

        rmap_item = alloc_rmap_item();
        if (rmap_item) {
                /* It has already been zeroed */
                rmap_item->mm = mm_slot->slot.mm;
                rmap_item->mm->ksm_rmap_items++;
                rmap_item->address = addr;
                rmap_item->rmap_list = *rmap_list;
                *rmap_list = rmap_item;
        }
        return rmap_item;
}

/*
 * Map the age of an rmap_item to the number of scans its page may be
 * skipped for. The age grows with the number of unsuccessful attempts to
 * de-duplicate the page; the larger it is, the more scans are skipped.
 *
 * @age: rmap_item age of page
 */
static unsigned int skip_age(rmap_age_t age)
{
        if (age > 8)
                return 8;
        if (age > 5)
                return 4;
        if (age > 3)
                return 2;

        return 1;
}

/*
 * Decide whether a page is to be left out of the current scan.
 *
 * @page: page to check
 * @rmap_item: associated rmap_item of page
 *
 * Returns true when the page is skipped; in that case the rmap_item has
 * also been removed from the tree it was in.
 */
static bool should_skip_rmap_item(struct page *page,
                                  struct ksm_rmap_item *rmap_item)
{
        rmap_age_t age;

        /*
         * Nothing is skipped unless smart scan is enabled. Pages that are
         * already KSM are never skipped either: cmp_and_merge_page() will
         * essentially ignore them, but they still have to be processed
         * properly.
         */
        if (!ksm_smart_scan || PageKsm(page))
                return false;

        /* Work with the age before the increment; the counter saturates. */
        age = rmap_item->age;
        if (age < U8_MAX)
                rmap_item->age++;

        /*
         * Smaller ages are not skipped, they need to get a chance to go
         * through the different phases of the KSM merging.
         */
        if (age < 3)
                return false;

        /*
         * Skip budget used up: scan the page this time and work out how
         * many of the following scans may skip it.
         */
        if (rmap_item->remaining_skips == 0) {
                rmap_item->remaining_skips = skip_age(age);
                return false;
        }

        /* Skip this page */
        rmap_item->remaining_skips--;
        ksm_pages_skipped++;
        remove_rmap_item_from_tree(rmap_item);

        return true;
}

/*
 * scan_get_next_rmap_item - move the scan cursor to the next page to examine.
 *
 * The cursor lives in ksm_scan (mm_slot, address, rmap_list). Starting from
 * it, the VM_MERGEABLE vmas of the current mm are walked page by page; when
 * the mm is exhausted the cursor moves on to the next mm_slot on the list.
 *
 * On finding an anonymous page that is not skipped, the page is stored in
 * *page with the reference taken by follow_page(..., FOLL_GET) still held
 * (ksm_do_scan() drops it with put_page()), the cursor is advanced past it
 * and the page's rmap_item is returned.
 *
 * Returns NULL when the mm list is empty, when no rmap_item could be
 * obtained for the page found (the page is put and the cursor is left on
 * it), or when the whole list has been walked, in which case
 * ksm_scan.seqnr is incremented.
 */
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
        struct mm_struct *mm;
        struct ksm_mm_slot *mm_slot;
        struct mm_slot *slot;
        struct vm_area_struct *vma;
        struct ksm_rmap_item *rmap_item;
        struct vma_iterator vmi;
        int nid;

        if (list_empty(&ksm_mm_head.slot.mm_node))
                return NULL;

        mm_slot = ksm_scan.mm_slot;
        /* Cursor on the list head: a new pass over the whole list begins. */
        if (mm_slot == &ksm_mm_head) {
                advisor_start_scan();
                trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);

                /*
                 * A number of pages can hang around indefinitely in per-cpu
                 * LRU cache, raised page count preventing write_protect_page
                 * from merging them.  Though it doesn't really matter much,
                 * it is puzzling to see some stuck in pages_volatile until
                 * other activity jostles them out, and they also prevented
                 * LTP's KSM test from succeeding deterministically; so drain
                 * them here (here rather than on entry to ksm_do_scan(),
                 * so we don't IPI too often when pages_to_scan is set low).
                 */
                lru_add_drain_all();

                /*
                 * Whereas stale stable_nodes on the stable_tree itself
                 * get pruned in the regular course of stable_tree_search(),
                 * those moved out to the migrate_nodes list can accumulate:
                 * so prune them once before each full scan.
                 */
                if (!ksm_merge_across_nodes) {
                        struct ksm_stable_node *stable_node, *next;
                        struct folio *folio;

                        list_for_each_entry_safe(stable_node, next,
                                                 &migrate_nodes, list) {
                                folio = ksm_get_folio(stable_node,
                                                      KSM_GET_FOLIO_NOLOCK);
                                if (folio)
                                        folio_put(folio);
                                cond_resched();
                        }
                }

                /* Every pass starts with empty unstable trees. */
                for (nid = 0; nid < ksm_nr_node_ids; nid++)
                        root_unstable_tree[nid] = RB_ROOT;

                /* Step the cursor from the list head to the first mm_slot. */
                spin_lock(&ksm_mmlist_lock);
                slot = list_entry(mm_slot->slot.mm_node.next,
                                  struct mm_slot, mm_node);
                mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
                ksm_scan.mm_slot = mm_slot;
                spin_unlock(&ksm_mmlist_lock);
                /*
                 * Although we tested list_empty() above, a racing __ksm_exit
                 * of the last mm on the list may have removed it since then.
                 */
                if (mm_slot == &ksm_mm_head)
                        return NULL;
next_mm:
                /* Also entered by goto from the end of the function. */
                ksm_scan.address = 0;
                ksm_scan.rmap_list = &mm_slot->rmap_list;
        }

        slot = &mm_slot->slot;
        mm = slot->mm;
        vma_iter_init(&vmi, mm, ksm_scan.address);

        mmap_read_lock(mm);
        if (ksm_test_exit(mm))
                goto no_vmas;

        for_each_vma(vmi, vma) {
                if (!(vma->vm_flags & VM_MERGEABLE))
                        continue;
                if (ksm_scan.address < vma->vm_start)
                        ksm_scan.address = vma->vm_start;
                /* No anon_vma: jump to the vma's end so the loop below is skipped. */
                if (!vma->anon_vma)
                        ksm_scan.address = vma->vm_end;

                while (ksm_scan.address < vma->vm_end) {
                        if (ksm_test_exit(mm))
                                break;
                        *page = follow_page(vma, ksm_scan.address, FOLL_GET);
                        /* No page obtained at this address: try the next one. */
                        if (IS_ERR_OR_NULL(*page)) {
                                ksm_scan.address += PAGE_SIZE;
                                cond_resched();
                                continue;
                        }
                        if (is_zone_device_page(*page))
                                goto next_page;
                        if (PageAnon(*page)) {
                                flush_anon_page(vma, *page, ksm_scan.address);
                                flush_dcache_page(*page);
                                rmap_item = get_next_rmap_item(mm_slot,
                                        ksm_scan.rmap_list, ksm_scan.address);
                                if (rmap_item) {
                                        ksm_scan.rmap_list =
                                                        &rmap_item->rmap_list;

                                        if (should_skip_rmap_item(*page, rmap_item))
                                                goto next_page;

                                        /*
                                         * Page handed to the caller with its
                                         * reference held; resume after it.
                                         */
                                        ksm_scan.address += PAGE_SIZE;
                                } else
                                        put_page(*page);
                                mmap_read_unlock(mm);
                                return rmap_item;
                        }
next_page:
                        /* Page not handed out: put it and advance the cursor. */
                        put_page(*page);
                        ksm_scan.address += PAGE_SIZE;
                        cond_resched();
                }
        }

        if (ksm_test_exit(mm)) {
no_vmas:
                /*
                 * Reset the cursor, so that the address == 0 test below
                 * takes this mm_slot off the lists.
                 */
                ksm_scan.address = 0;
                ksm_scan.rmap_list = &mm_slot->rmap_list;
        }
        /*
         * Nuke all the rmap_items that are above this current rmap:
         * because there were no VM_MERGEABLE vmas with such addresses.
         */
        remove_trailing_rmap_items(ksm_scan.rmap_list);

        /* Move the cursor to the next mm_slot before dealing with this one. */
        spin_lock(&ksm_mmlist_lock);
        slot = list_entry(mm_slot->slot.mm_node.next,
                          struct mm_slot, mm_node);
        ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
        if (ksm_scan.address == 0) {
                /*
                 * We've completed a full scan of all vmas, holding mmap_lock
                 * throughout, and found no VM_MERGEABLE: so do the same as
                 * __ksm_exit does to remove this mm from all our lists now.
                 * This applies either when cleaning up after __ksm_exit
                 * (but beware: we can reach here even before __ksm_exit),
                 * or when all VM_MERGEABLE areas have been unmapped (and
                 * mmap_lock then protects against race with MADV_MERGEABLE).
                 */
                hash_del(&mm_slot->slot.hash);
                list_del(&mm_slot->slot.mm_node);
                spin_unlock(&ksm_mmlist_lock);

                mm_slot_free(mm_slot_cache, mm_slot);
                clear_bit(MMF_VM_MERGEABLE, &mm->flags);
                clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
                mmap_read_unlock(mm);
                mmdrop(mm);
        } else {
                mmap_read_unlock(mm);
                /*
                 * mmap_read_unlock(mm) first because after
                 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
                 * already have been freed under us by __ksm_exit()
                 * because the "mm_slot" is still hashed and
                 * ksm_scan.mm_slot doesn't point to it anymore.
                 */
                spin_unlock(&ksm_mmlist_lock);
        }

        /* Repeat until we've completed scanning the whole list */
        mm_slot = ksm_scan.mm_slot;
        if (mm_slot != &ksm_mm_head)
                goto next_mm;

        /* Back on the list head: this pass is over, count it in seqnr. */
        advisor_stop_scan();

        trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
        ksm_scan.seqnr++;
        return NULL;
}

/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages:  number of pages we want to scan before we return.
 *
 * Returns early when the scanner has no further rmap_item to hand out, or
 * when the current task is being frozen.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
        struct ksm_rmap_item *rmap_item;
        struct page *page;

        while (scan_npages > 0 && likely(!freezing(current))) {
                scan_npages--;

                cond_resched();
                rmap_item = scan_get_next_rmap_item(&page);
                if (!rmap_item)
                        return;

                cmp_and_merge_page(page, rmap_item);
                /* Drop the page the scanner handed over with the rmap_item. */
                put_page(page);
                ksm_pages_scanned++;
        }
}

/*
 * Returns 1 when KSM_RUN_MERGE is set in ksm_run and the mm list is not
 * empty, 0 otherwise.
 */
static int ksmd_should_run(void)
{
        if (!(ksm_run & KSM_RUN_MERGE))
                return 0;

        return !list_empty(&ksm_mm_head.slot.mm_node);
}

/*
 * Thread function of the scanner: repeats scan batches under
 * ksm_thread_mutex until kthread_should_stop() reports true, sleeping
 * between batches. @nothing is unused. Always returns 0.
 */
static int ksm_scan_thread(void *nothing)
{
        set_freezable();
        set_user_nice(current, 5);

        while (!kthread_should_stop()) {
                unsigned int sleep_ms;

                mutex_lock(&ksm_thread_mutex);
                wait_while_offlining();
                if (ksmd_should_run())
                        ksm_do_scan(ksm_thread_pages_to_scan);
                mutex_unlock(&ksm_thread_mutex);

                if (!ksmd_should_run()) {
                        /* Nothing to do: wait for work or for a stop request. */
                        wait_event_freezable(ksm_thread_wait,
                                ksmd_should_run() || kthread_should_stop());
                        continue;
                }

                /*
                 * Pause between two batches; the wait ends early if the
                 * sleep time is changed in the meantime.
                 */
                sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
                wait_event_freezable_timeout(ksm_iter_wait,
                        sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
                        msecs_to_jiffies(sleep_ms));
        }

        return 0;
}

/*
 * Set VM_MERGEABLE on @vma, unless the flag is already set or the vma is
 * not compatible with KSM.
 */
static void __ksm_add_vma(struct vm_area_struct *vma)
{
        if (vma->vm_flags & VM_MERGEABLE)
                return;
        if (!vma_ksm_compatible(vma))
                return;

        vm_flags_set(vma, VM_MERGEABLE);
}

/*
 * Clear VM_MERGEABLE from @vma. When the vma has an anon_vma, its whole
 * range is unmerged first; if that fails the flag is left set.
 *
 * Returns 0 on success (including when the flag was not set), otherwise
 * the error reported by unmerge_ksm_pages().
 */
static int __ksm_del_vma(struct vm_area_struct *vma)
{
        if (!(vma->vm_flags & VM_MERGEABLE))
                return 0;

        if (vma->anon_vma) {
                int err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end,
                                            true);

                if (err)
                        return err;
        }

        vm_flags_clear(vma, VM_MERGEABLE);
        return 0;
}
/**
 * ksm_add_vma - Mark vma as mergeable if compatible
 *
 * @vma:  Pointer to vma
 *
 * Does nothing unless MMF_VM_MERGE_ANY is set on the mm owning the vma.
 */
void ksm_add_vma(struct vm_area_struct *vma)
{
        if (!test_bit(MMF_VM_MERGE_ANY, &vma->vm_mm->flags))
                return;

        __ksm_add_vma(vma);
}

static void ksm_add_vmas(struct mm_struct *mm)
{
        struct vm_area_struct *vma;

        VMA_ITERATOR(vmi, mm, 0);
        for_each_vma(vmi, vma)
                __ksm_add_vma(vma);
}

static int ksm_del_vmas(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        int err;

        VMA_ITERATOR(vmi, mm, 0);
        for_each_vma(vmi, vma) {
                err = __ksm_del_vma(vma);
                if (err)
                        return err;
        }
        return 0;
}

/**
 * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all
 *                        compatible VMA's
 *
 * @mm:  Pointer to mm
 *
 * Nothing is done when MMF_VM_MERGE_ANY is already set. __ksm_enter() is
 * only called when MMF_VM_MERGEABLE is not set on the mm yet.
 *
 * Returns 0 on success, otherwise error code
 */
int ksm_enable_merge_any(struct mm_struct *mm)
{
        if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
                return 0;

        if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
                int err = __ksm_enter(mm);

                if (err)
                        return err;
        }

        set_bit(MMF_VM_MERGE_ANY, &mm->flags);
        ksm_add_vmas(mm);

        return 0;
}

/**
 * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm,
 *                           previously enabled via ksm_enable_merge_any().
 *
 * Disabling merging implies unmerging any merged pages, like setting
 * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and
 * merging on all compatible VMA's remains enabled.
 *
 * @mm: Pointer to mm
 *
 * Returns 0 on success, otherwise error code
 */
int ksm_disable_merge_any(struct mm_struct *mm)
{
        int err;

        if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
                return 0;

        err = ksm_del_vmas(mm);
        if (!err) {
                clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
                return 0;
        }

        /* Unmerging failed part-way: mark the compatible vmas again. */
        ksm_add_vmas(mm);
        return err;
}

/*
 * Disable KSM for @mm; the mmap lock must be held for writing.
 *
 * Nothing is done when MMF_VM_MERGEABLE is not set. Otherwise the work is
 * handed to ksm_disable_merge_any() when MMF_VM_MERGE_ANY is set, and to
 * ksm_del_vmas() when it is not.
 *
 * Returns 0 on success, otherwise the error code of the helper used.
 */
int ksm_disable(struct mm_struct *mm)
{
        mmap_assert_write_locked(mm);

        if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
                return 0;

        return test_bit(MMF_VM_MERGE_ANY, &mm->flags) ?
                ksm_disable_merge_any(mm) : ksm_del_vmas(mm);
}

/*
 * Apply MADV_MERGEABLE / MADV_UNMERGEABLE advice to @vma.
 *
 * MADV_MERGEABLE: sets VM_MERGEABLE in *@vm_flags unless the VMA already has
 * it or is not KSM compatible; the mm is entered into KSM first if
 * MMF_VM_MERGEABLE is not yet set.
 *
 * MADV_UNMERGEABLE: if VM_MERGEABLE is set in *@vm_flags, unmerges the KSM
 * pages in [@start, @end) when the VMA has an anon_vma, then clears the flag.
 *
 * Any other @advice value is ignored.
 *
 * Returns 0 on success, otherwise the error from __ksm_enter() or
 * unmerge_ksm_pages().
 */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                unsigned long end, int advice, unsigned long *vm_flags)
{
        struct mm_struct *mm = vma->vm_mm;
        int ret;

        if (advice == MADV_MERGEABLE) {
                if ((vma->vm_flags & VM_MERGEABLE) || !vma_ksm_compatible(vma))
                        return 0;

                if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
                        ret = __ksm_enter(mm);
                        if (ret)
                                return ret;
                }

                *vm_flags |= VM_MERGEABLE;
        } else if (advice == MADV_UNMERGEABLE) {
                /* Not mergeable in the first place: just ignore the advice. */
                if (!(*vm_flags & VM_MERGEABLE))
                        return 0;

                if (vma->anon_vma) {
                        ret = unmerge_ksm_pages(vma, start, end, true);
                        if (ret)
                                return ret;
                }

                *vm_flags &= ~VM_MERGEABLE;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(ksm_madvise);

/*
 * __ksm_enter - register @mm with KSM.
 *
 * Allocates a ksm_mm_slot for @mm, inserts it into mm_slots_hash and links it
 * onto the mm_slot list under ksm_mmlist_lock. Afterwards sets
 * MMF_VM_MERGEABLE on the mm, takes a reference with mmgrab() and, if the
 * list was observed empty before the insertion, wakes up waiters on
 * ksm_thread_wait.
 *
 * Returns 0 on success, or -ENOMEM if the slot allocation fails.
 */
int __ksm_enter(struct mm_struct *mm)
{
        struct ksm_mm_slot *mm_slot;
        struct mm_slot *slot;
        int needs_wakeup;

        mm_slot = mm_slot_alloc(mm_slot_cache);
        if (!mm_slot)
                return -ENOMEM;

        slot = &mm_slot->slot;

        /*
         * Sampled before ksm_mmlist_lock is taken.
         * Check ksm_run too?  Would need tighter locking
         */
        needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node);

        spin_lock(&ksm_mmlist_lock);
        mm_slot_insert(mm_slots_hash, mm, slot);
        /*
         * When KSM_RUN_MERGE (or KSM_RUN_STOP),
         * insert just behind the scanning cursor, to let the area settle
         * down a little; when fork is followed by immediate exec, we don't
         * want ksmd to waste time setting up and tearing down an rmap_list.
         *
         * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
         * scanning cursor, otherwise KSM pages in newly forked mms will be
         * missed: then we might as well insert at the end of the list.
         */
        if (ksm_run & KSM_RUN_UNMERGE)
                list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node);
        else
                list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
        spin_unlock(&ksm_mmlist_lock);

        /* The slot is now on the list: mark the mm and pin it. */
        set_bit(MMF_VM_MERGEABLE, &mm->flags);
        mmgrab(mm);

        if (needs_wakeup)
                wake_up_interruptible(&ksm_thread_wait);

        trace_ksm_enter(mm);
        return 0;
}

/*
 * __ksm_exit - detach an exiting @mm from KSM.
 *
 * Looks up the mm's slot under ksm_mmlist_lock. If a slot exists, is not the
 * current scan cursor and has no rmap_list, it is unhashed, unlinked and
 * freed here, the KSM mm flags are cleared and the mm reference is dropped
 * with mmdrop(). A slot that still has an rmap_list is instead moved next to
 * the scan cursor and left for ksmd; in that case, and when the slot is the
 * cursor itself, the mmap lock is taken and released for writing.
 */
void __ksm_exit(struct mm_struct *mm)
{
        struct ksm_mm_slot *mm_slot;
        struct mm_slot *slot;
        int easy_to_free = 0;

        /*
         * This process is exiting: if it's straightforward (as is the
         * case when ksmd was never running), free mm_slot immediately.
         * But if it's at the cursor or has rmap_items linked to it, use
         * mmap_lock to synchronize with any break_cows before pagetables
         * are freed, and leave the mm_slot on the list for ksmd to free.
         * Beware: ksm may already have noticed it exiting and freed the slot.
         */

        spin_lock(&ksm_mmlist_lock);
        slot = mm_slot_lookup(mm_slots_hash, mm);
        mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
        if (mm_slot && ksm_scan.mm_slot != mm_slot) {
                if (!mm_slot->rmap_list) {
                        /* Nothing hangs off this slot: remove it right away. */
                        hash_del(&slot->hash);
                        list_del(&slot->mm_node);
                        easy_to_free = 1;
                } else {
                        /* Still has rmap_items: requeue at the scan cursor. */
                        list_move(&slot->mm_node,
                                  &ksm_scan.mm_slot->slot.mm_node);
                }
        }
        spin_unlock(&ksm_mmlist_lock);

        if (easy_to_free) {
                mm_slot_free(mm_slot_cache, mm_slot);
                clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
                clear_bit(MMF_VM_MERGEABLE, &mm->flags);
                mmdrop(mm);
        } else if (mm_slot) {
                /* Empty write-locked section: see the comment at the top. */
                mmap_write_lock(mm);
                mmap_write_unlock(mm);
        }

        trace_ksm_exit(mm);
}

/*
 * Decide whether @folio can be mapped at @addr in @vma as it is, or whether
 * a private copy has to be made first.
 *
 * Returns:
 *   @folio itself when no copy is needed (large folio, KSM folio with a
 *     stable node while not unmerging, folio without anon_vma, folio whose
 *     index and anon_vma root already match @vma) or when the folio is not
 *     uptodate;
 *   a new, locked, dirty, uptodate folio holding a copy of the data;
 *   NULL if the new folio could not be allocated or charged;
 *   ERR_PTR(-EHWPOISON) if the source page is hardware poisoned or the copy
 *     failed.
 */
struct folio *ksm_might_need_to_copy(struct folio *folio,
                        struct vm_area_struct *vma, unsigned long addr)
{
        struct page *src = folio_page(folio, 0);
        struct anon_vma *anon_vma = folio_anon_vma(folio);
        struct folio *copy;

        if (folio_test_large(folio))
                return folio;

        if (folio_test_ksm(folio)) {
                if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE))
                        return folio;        /* no need to copy it */
        } else {
                if (!anon_vma)
                        return folio;        /* no need to copy it */
                if (folio->index == linear_page_index(vma, addr) &&
                    anon_vma->root == vma->anon_vma->root)
                        return folio;        /* still no need to copy it */
        }

        if (PageHWPoison(src))
                return ERR_PTR(-EHWPOISON);
        if (!folio_test_uptodate(folio))
                return folio;                /* let do_swap_page report the error */

        copy = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
        if (!copy)
                return NULL;

        if (mem_cgroup_charge(copy, vma->vm_mm, GFP_KERNEL)) {
                folio_put(copy);
                return NULL;
        }

        if (copy_mc_user_highpage(folio_page(copy, 0), src, addr, vma)) {
                folio_put(copy);
                memory_failure_queue(folio_pfn(folio), 0);
                return ERR_PTR(-EHWPOISON);
        }

        folio_set_dirty(copy);
        __folio_mark_uptodate(copy);
        __folio_set_locked(copy);
#ifdef CONFIG_SWAP
        count_vm_event(KSM_SWPIN_COPY);
#endif

        return copy;
}

/*
 * rmap_walk_ksm - reverse-map walk for a KSM folio.
 *
 * For every rmap_item on the folio's stable node, takes the item's anon_vma
 * lock for reading and calls rwc->rmap_one() on each VMA in that anon_vma's
 * interval tree which covers the item's address. The list is walked twice:
 * the first pass only visits VMAs belonging to rmap_item->mm, the second
 * pass only VMAs belonging to other mms.
 *
 * The walk ends early when rwc->rmap_one() returns false or rwc->done()
 * returns true. If rwc->try_lock is set and an anon_vma lock cannot be taken
 * without blocking, rwc->contended is set and the walk is abandoned.
 *
 * @folio must be a locked KSM folio (checked with VM_BUG_ON_FOLIO).
 */
void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
{
        struct ksm_stable_node *stable_node;
        struct ksm_rmap_item *rmap_item;
        int search_new_forks = 0;

        VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);

        /*
         * Rely on the page lock to protect against concurrent modifications
         * to that page's node of the stable tree.
         */
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        stable_node = folio_stable_node(folio);
        if (!stable_node)
                return;
again:
        hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
                struct anon_vma *anon_vma = rmap_item->anon_vma;
                struct anon_vma_chain *vmac;
                struct vm_area_struct *vma;

                cond_resched();
                if (!anon_vma_trylock_read(anon_vma)) {
                        /* Caller asked not to block: report and give up. */
                        if (rwc->try_lock) {
                                rwc->contended = true;
                                return;
                        }
                        anon_vma_lock_read(anon_vma);
                }
                anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
                                               0, ULONG_MAX) {
                        unsigned long addr;

                        cond_resched();
                        vma = vmac->vma;

                        /* Ignore the stable/unstable/sqnr flags */
                        addr = rmap_item->address & PAGE_MASK;

                        if (addr < vma->vm_start || addr >= vma->vm_end)
                                continue;
                        /*
                         * Initially we examine only the vma which covers this
                         * rmap_item; but later, if there is still work to do,
                         * we examine covering vmas in other mms: in case they
                         * were forked from the original since ksmd passed.
                         */
                        if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
                                continue;

                        if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                                continue;

                        if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
                                anon_vma_unlock_read(anon_vma);
                                return;
                        }
                        if (rwc->done && rwc->done(folio)) {
                                anon_vma_unlock_read(anon_vma);
                                return;
                        }
                }
                anon_vma_unlock_read(anon_vma);
        }
        /* Run the loop a second time, now looking at the other mms. */
        if (!search_new_forks++)
                goto again;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Collect the processes to kill when a memory error hits a KSM page.
 *
 * For each rmap_item on the folio's stable node, and for each process for
 * which task_early_kill() returns a task, every VMA in the item's anon_vma
 * interval tree that belongs to that task's mm is passed to
 * add_to_kill_ksm() together with the item's page-aligned address.
 */
void collect_procs_ksm(struct folio *folio, struct page *page,
                struct list_head *to_kill, int force_early)
{
        struct ksm_stable_node *stable_node = folio_stable_node(folio);
        struct ksm_rmap_item *rmap_item;
        struct task_struct *tsk;

        if (!stable_node)
                return;

        hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
                struct anon_vma *anon_vma = rmap_item->anon_vma;

                anon_vma_lock_read(anon_vma);
                rcu_read_lock();
                for_each_process(tsk) {
                        struct task_struct *victim =
                                task_early_kill(tsk, force_early);
                        struct anon_vma_chain *vmac;

                        if (!victim)
                                continue;

                        anon_vma_interval_tree_foreach(vmac,
                                                       &anon_vma->rb_root, 0,
                                                       ULONG_MAX) {
                                struct vm_area_struct *vma = vmac->vma;
                                unsigned long addr;

                                if (vma->vm_mm != victim->mm)
                                        continue;

                                addr = rmap_item->address & PAGE_MASK;
                                add_to_kill_ksm(victim, page, vma, to_kill,
                                                addr);
                        }
                }
                rcu_read_unlock();
                anon_vma_unlock_read(anon_vma);
        }
}
#endif

#ifdef CONFIG_MIGRATION
/*
 * Hand the stable node of @folio over to @newfolio during migration.
 *
 * Both folios must be locked and share the same ->mapping (checked with
 * VM_BUG_ON_FOLIO). If @folio has a stable node, its kpfn is switched to
 * @newfolio's pfn and the stable node pointer of @folio is cleared.
 */
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
{
        struct ksm_stable_node *stable_node;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
        VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);

        stable_node = folio_stable_node(folio);
        if (!stable_node)
                return;

        VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
        stable_node->kpfn = folio_pfn(newfolio);

        /*
         * newfolio->mapping was set in advance; now we need smp_wmb()
         * to make sure that the new stable_node->kpfn is visible
         * to ksm_get_folio() before it can see that folio->mapping
         * has gone stale (or that folio_test_swapcache has been cleared).
         */
        smp_wmb();
        folio_set_stable_node(folio, NULL);
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Block until KSM_RUN_OFFLINE is no longer set in ksm_run.
 *
 * ksm_thread_mutex is released around each wait and re-acquired before the
 * flag is tested again, so the function both expects the mutex to be held on
 * entry and returns with it held.
 */
static void wait_while_offlining(void)
{
        while (ksm_run & KSM_RUN_OFFLINE) {
                /* Drop the mutex so the flag can be cleared while we sleep. */
                mutex_unlock(&ksm_thread_mutex);
                wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
                            TASK_UNINTERRUPTIBLE);
                mutex_lock(&ksm_thread_mutex);
        }
}

/*
 * Remove @stable_node from the stable tree if its kpfn lies in
 * [@start_pfn, @end_pfn).
 *
 * Returns true if the node was removed, false if it was left alone.
 */
static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node,
                                         unsigned long start_pfn,
                                         unsigned long end_pfn)
{
        if (stable_node->kpfn < start_pfn || stable_node->kpfn >= end_pfn)
                return false;

        /*
         * Don't ksm_get_folio, page has already gone:
         * which is why we keep kpfn instead of page*
         */
        remove_node_from_stable_tree(stable_node);
        return true;
}

static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node,
                                           unsigned long start_pfn,
                                           unsigned long end_pfn,
                                           struct rb_root *root)
{
        struct ksm_stable_node *dup;
        struct hlist_node *hlist_safe;

        if (!is_stable_node_chain(stable_node)) {
                VM_BUG_ON(is_stable_node_dup(stable_node));
                return stable_node_dup_remove_range(stable_node, start_pfn,
                                                    end_pfn);
        }

        hlist_for_each_entry_safe(dup, hlist_safe,
                                  &stable_node->hlist, hlist_dup) {
                VM_BUG_ON(!is_stable_node_dup(dup));
                stable_node_dup_remove_range(dup, start_pfn, end_pfn);
        }
        if (hlist_empty(&stable_node->hlist)) {
                free_stable_node_chain(stable_node, root);
                return true; /* notify caller that tree was rebalanced */
        } else
                return false;
}

static void ksm_check_stable_tree(unsigned long start_pfn,
                                  unsigned long end_pfn)
{
        struct ksm_stable_node *stable_node, *next;
        struct rb_node *node;
        int nid;

        for (nid = 0; nid < ksm_nr_node_ids; nid++) {
                node = rb_first(root_stable_tree + nid);
                while (node) {
                        stable_node = rb_entry(node, struct ksm_stable_node, node);
                        if (stable_node_chain_remove_range(stable_node,
                                                           start_pfn, end_pfn,
                                                           root_stable_tree +
                                                           nid))
                                node = rb_first(root_stable_tree + nid);
                        else
                                node = rb_next(node);
                        cond_resched();
                }
        }
        list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
                if (stable_node->kpfn >= start_pfn &&
                    stable_node->kpfn < end_pfn)
                        remove_node_from_stable_tree(stable_node);
                cond_resched();
        }
}

/*
 * Memory hotplug notifier callback.
 *
 * MEM_GOING_OFFLINE sets KSM_RUN_OFFLINE in ksm_run; MEM_OFFLINE prunes the
 * stable nodes that fall in the offlined pfn range and then, like
 * MEM_CANCEL_OFFLINE, clears KSM_RUN_OFFLINE again and wakes anyone waiting
 * on that bit. Other actions are ignored.
 *
 * @arg is interpreted as a struct memory_notify. Always returns NOTIFY_OK.
 */
static int ksm_memory_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;

        switch (action) {
        case MEM_GOING_OFFLINE:
                /*
                 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
                 * and remove_all_stable_nodes() while memory is going offline:
                 * it is unsafe for them to touch the stable tree at this time.
                 * But unmerge_ksm_pages(), rmap lookups and other entry points
                 * which do not need the ksm_thread_mutex are all safe.
                 */
                mutex_lock(&ksm_thread_mutex);
                ksm_run |= KSM_RUN_OFFLINE;
                mutex_unlock(&ksm_thread_mutex);
                break;

        case MEM_OFFLINE:
                /*
                 * Most of the work is done by page migration; but there might
                 * be a few stable_nodes left over, still pointing to struct
                 * pages which have been offlined: prune those from the tree,
                 * otherwise ksm_get_folio() might later try to access a
                 * non-existent struct page.
                 */
                ksm_check_stable_tree(mn->start_pfn,
                                      mn->start_pfn + mn->nr_pages);
                fallthrough;
        case MEM_CANCEL_OFFLINE:
                /* Offlining finished or was cancelled: lift the restriction. */
                mutex_lock(&ksm_thread_mutex);
                ksm_run &= ~KSM_RUN_OFFLINE;
                mutex_unlock(&ksm_thread_mutex);

                smp_mb();        /* wake_up_bit advises this */
                wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
                break;
        }
        return NOTIFY_OK;
}
#else
/* No-op variant used when CONFIG_MEMORY_HOTREMOVE is not set. */
static void wait_while_offlining(void)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_PROC_FS
/*
 * Compute the KSM profit of @mm: the size in bytes of its merged pages plus
 * its KSM zero pages, minus the memory taken by its rmap_items.
 */
long ksm_process_profit(struct mm_struct *mm)
{
        long saved_pages = (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm));

        return saved_pages * PAGE_SIZE -
                mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
}
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

/*
 * KSM_ATTR_RO(name): define a read-only kobj_attribute called name_attr,
 * built with __ATTR_RO(name).
 */
#define KSM_ATTR_RO(_name) \
        static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
/*
 * KSM_ATTR(name): define a read-write kobj_attribute called name_attr,
 * built with __ATTR_RW(name).
 */
#define KSM_ATTR(_name) \
        static struct kobj_attribute _name##_attr = __ATTR_RW(_name)

/* sysfs read handler: print ksm_thread_sleep_millisecs as a decimal number. */
static ssize_t
sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr,
                     char *buf)
{
        return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
}

/*
 * sysfs write handler: parse an unsigned decimal value into
 * ksm_thread_sleep_millisecs and wake up waiters on ksm_iter_wait.
 *
 * Returns @count on success, -EINVAL if the input cannot be parsed.
 */
static ssize_t
sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr,
                      const char *buf, size_t count)
{
        unsigned int msecs;

        if (kstrtouint(buf, 10, &msecs))
                return -EINVAL;

        ksm_thread_sleep_millisecs = msecs;
        wake_up_interruptible(&ksm_iter_wait);

        return count;
}
KSM_ATTR(sleep_millisecs);

/* sysfs read handler: print ksm_thread_pages_to_scan as a decimal number. */
static ssize_t
pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr,
                   char *buf)
{
        return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
}

/*
 * sysfs write handler: set ksm_thread_pages_to_scan from an unsigned decimal
 * value. The write is refused while an advisor other than KSM_ADVISOR_NONE
 * is selected.
 *
 * Returns @count on success, -EINVAL if an advisor is active or the input
 * cannot be parsed.
 */
static ssize_t
pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr,
                    const char *buf, size_t count)
{
        unsigned int nr_pages;

        if (ksm_advisor != KSM_ADVISOR_NONE)
                return -EINVAL;

        if (kstrtouint(buf, 10, &nr_pages))
                return -EINVAL;

        ksm_thread_pages_to_scan = nr_pages;

        return count;
}
KSM_ATTR(pages_to_scan);

/* sysfs read handler: print the current ksm_run value as a decimal number. */
static ssize_t
run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%lu\n", ksm_run);
}

/*
 * sysfs write handler for the run state.
 *
 * Parses an unsigned decimal value no greater than KSM_RUN_UNMERGE and, under
 * ksm_thread_mutex and once no offlining is in progress, stores it in
 * ksm_run if it differs from the current value. Switching to a value with
 * KSM_RUN_UNMERGE set also unmerges everything; if that fails, ksm_run falls
 * back to KSM_RUN_STOP and the error is returned. Waiters on ksm_thread_wait
 * are woken when the written value has KSM_RUN_MERGE set.
 *
 * Returns @count on success, -EINVAL for unparsable or out-of-range input,
 * or the error from unmerge_and_remove_all_rmap_items().
 */
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
                         const char *buf, size_t count)
{
        unsigned int flags;
        int err;

        err = kstrtouint(buf, 10, &flags);
        if (err)
                return -EINVAL;
        if (flags > KSM_RUN_UNMERGE)
                return -EINVAL;

        /*
         * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
         * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
         * breaking COW to free the pages_shared (but leaves mm_slots
         * on the list for when ksmd may be set running again).
         */

        mutex_lock(&ksm_thread_mutex);
        wait_while_offlining();
        if (ksm_run != flags) {
                ksm_run = flags;
                if (flags & KSM_RUN_UNMERGE) {
                        set_current_oom_origin();
                        err = unmerge_and_remove_all_rmap_items();
                        clear_current_oom_origin();
                        if (err) {
                                ksm_run = KSM_RUN_STOP;
                                /* Hand the error back in place of the byte count. */
                                count = err;
                        }
                }
        }
        mutex_unlock(&ksm_thread_mutex);

        if (flags & KSM_RUN_MERGE)
                wake_up_interruptible(&ksm_thread_wait);

        return count;
}
KSM_ATTR(run);

#ifdef CONFIG_NUMA
/* sysfs read handler: print ksm_merge_across_nodes as a decimal number. */
static ssize_t
merge_across_nodes_show(struct kobject *kobj, struct kobj_attribute *attr,
                        char *buf)
{
        return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
}

/*
 * sysfs write handler: switch ksm_merge_across_nodes to 0 or 1.
 *
 * The change is only made, under ksm_thread_mutex and once no offlining is
 * in progress, when there are no shared pages and all stable nodes could be
 * removed. The first switch away from the built-in single-root trees
 * allocates per-node root arrays for the stable and unstable trees.
 *
 * Returns @count on success (including when the value is unchanged), the
 * kstrtoul() error for unparsable input, -EINVAL for values above 1, -EBUSY
 * if pages are still shared, or -ENOMEM if the root arrays cannot be
 * allocated.
 */
static ssize_t
merge_across_nodes_store(struct kobject *kobj, struct kobj_attribute *attr,
                         const char *buf, size_t count)
{
        unsigned long knob;
        int ret;

        ret = kstrtoul(buf, 10, &knob);
        if (ret)
                return ret;
        if (knob > 1)
                return -EINVAL;

        mutex_lock(&ksm_thread_mutex);
        wait_while_offlining();
        if (ksm_merge_across_nodes == knob)
                goto unlock;

        if (ksm_pages_shared || remove_all_stable_nodes()) {
                ret = -EBUSY;
        } else if (root_stable_tree == one_stable_tree) {
                struct rb_root *roots;

                /*
                 * This is the first time that we switch away from the
                 * default of merging across nodes: must now allocate
                 * a buffer to hold as many roots as may be needed.
                 * Allocate stable and unstable together:
                 * MAXSMP NODES_SHIFT 10 will use 16kB.
                 */
                roots = kcalloc(nr_node_ids + nr_node_ids, sizeof(*roots),
                                GFP_KERNEL);
                /* Let us assume that RB_ROOT is NULL is zero */
                if (roots) {
                        root_stable_tree = roots;
                        root_unstable_tree = roots + nr_node_ids;
                        /* Stable tree is empty but not the unstable */
                        root_unstable_tree[0] = one_unstable_tree[0];
                } else {
                        ret = -ENOMEM;
                }
        }

        if (!ret) {
                ksm_merge_across_nodes = knob;
                ksm_nr_node_ids = knob ? 1 : nr_node_ids;
        }
unlock:
        mutex_unlock(&ksm_thread_mutex);

        return ret ? ret : count;
}
KSM_ATTR(merge_across_nodes);
#endif

/* sysfs read handler: print ksm_use_zero_pages as a decimal number. */
static ssize_t
use_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr,
                    char *buf)
{
        return sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
}
/*
 * sysfs write handler: set ksm_use_zero_pages from a boolean string as
 * understood by kstrtobool().
 *
 * Returns @count on success, -EINVAL if the input cannot be parsed.
 */
static ssize_t
use_zero_pages_store(struct kobject *kobj, struct kobj_attribute *attr,
                     const char *buf, size_t count)
{
        bool enable;

        if (kstrtobool(buf, &enable))
                return -EINVAL;

        ksm_use_zero_pages = enable;

        return count;
}
KSM_ATTR(use_zero_pages);

/* sysfs read handler: print ksm_max_page_sharing as a decimal number. */
static ssize_t
max_page_sharing_show(struct kobject *kobj, struct kobj_attribute *attr,
                      char *buf)
{
        return sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
}

/*
 * sysfs write handler: change ksm_max_page_sharing.
 *
 * The new limit must be at least 2. A value equal to the current one is
 * accepted without taking any lock. Otherwise the change is made under
 * ksm_thread_mutex, once no offlining is in progress, and only when there
 * are no shared pages and all stable nodes could be removed.
 *
 * Returns @count on success, the kstrtoint() error for unparsable input,
 * -EINVAL for values below 2, or -EBUSY if pages are still shared.
 */
static ssize_t
max_page_sharing_store(struct kobject *kobj, struct kobj_attribute *attr,
                       const char *buf, size_t count)
{
        int limit;
        int ret;

        ret = kstrtoint(buf, 10, &limit);
        if (ret)
                return ret;

        /*
         * When a KSM page is created it is shared by 2 mappings. This
         * being a signed comparison, it implicitly verifies it's not
         * negative.
         */
        if (limit < 2)
                return -EINVAL;

        if (READ_ONCE(ksm_max_page_sharing) == limit)
                return count;

        mutex_lock(&ksm_thread_mutex);
        wait_while_offlining();
        if (ksm_max_page_sharing != limit) {
                if (ksm_pages_shared || remove_all_stable_nodes())
                        ret = -EBUSY;
                else
                        ksm_max_page_sharing = limit;
        }
        mutex_unlock(&ksm_thread_mutex);

        if (ret)
                return ret;
        return count;
}
KSM_ATTR(max_page_sharing);

/* sysfs read handler: print the ksm_pages_scanned counter. */
static ssize_t
pages_scanned_show(struct kobject *kobj, struct kobj_attribute *attr,
                   char *buf)
{
        return sysfs_emit(buf, "%lu\n", ksm_pages_scanned);
}
KSM_ATTR_RO(pages_scanned);

/* sysfs read handler: print the ksm_pages_shared counter. */
static ssize_t
pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr,
                  char *buf)
{
        return sysfs_emit(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

/* sysfs read handler: print the ksm_pages_sharing counter. */
static ssize_t
pages_sharing_show(struct kobject *kobj, struct kobj_attribute *attr,
                   char *buf)
{
        return sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

/* sysfs read handler: print the ksm_pages_unshared counter. */
static ssize_t
pages_unshared_show(struct kobject *kobj, struct kobj_attribute *attr,
                    char *buf)
{
        return sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf)
{
        long ksm_pages_volatile;

        ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
                                - ksm_pages_sharing - ksm_pages_unshared;
        /*
         * It was not worth any locking to calculate that statistic,
         * but it might therefore sometimes be negative: conceal that.
         */
        if (ksm_pages_volatile < 0)
                ksm_pages_volatile = 0;
        return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

/* sysfs read handler: print the ksm_pages_skipped counter. */
static ssize_t
pages_skipped_show(struct kobject *kobj, struct kobj_attribute *attr,
                   char *buf)
{
        return sysfs_emit(buf, "%lu\n", ksm_pages_skipped);
}
KSM_ATTR_RO(pages_skipped);

/* sysfs read handler: print the current value of the ksm_zero_pages atomic. */
static ssize_t
ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr,
                    char *buf)
{
        return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages));
}
KSM_ATTR_RO(ksm_zero_pages);

/*
 * sysfs read handler: print the overall KSM profit in bytes, i.e. the size
 * of the sharing pages plus KSM zero pages minus the memory used by all
 * rmap_items.
 */
static ssize_t
general_profit_show(struct kobject *kobj, struct kobj_attribute *attr,
                    char *buf)
{
        long profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE -
                                ksm_rmap_items * sizeof(struct ksm_rmap_item);

        return sysfs_emit(buf, "%ld\n", profit);
}
KSM_ATTR_RO(general_profit);

/* sysfs read handler: print the ksm_stable_node_dups counter. */
static ssize_t
stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr,
                      char *buf)
{
        return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
}
KSM_ATTR_RO(stable_node_dups);

/* sysfs read handler: print the ksm_stable_node_chains counter. */
static ssize_t
stable_node_chains_show(struct kobject *kobj, struct kobj_attribute *attr,
                        char *buf)
{
        return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
}
KSM_ATTR_RO(stable_node_chains);

/*
 * sysfs read handler: print ksm_stable_node_chains_prune_millisecs as a
 * decimal number.
 */
static ssize_t
stable_node_chains_prune_millisecs_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
}

/*
 * sysfs write handler: set ksm_stable_node_chains_prune_millisecs from an
 * unsigned decimal value.
 *
 * Returns @count on success, -EINVAL if the input cannot be parsed.
 */
static ssize_t
stable_node_chains_prune_millisecs_store(struct kobject *kobj,
                                         struct kobj_attribute *attr,
                                         const char *buf, size_t count)
{
        unsigned int msecs;

        if (kstrtouint(buf, 10, &msecs))
                return -EINVAL;

        ksm_stable_node_chains_prune_millisecs = msecs;

        return count;
}
KSM_ATTR(stable_node_chains_prune_millisecs);

/* sysfs read handler: print ksm_scan.seqnr as a decimal number. */
static ssize_t
full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

/* sysfs read handler: print ksm_smart_scan as a decimal number. */
static ssize_t
smart_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%u\n", ksm_smart_scan);
}

/*
 * sysfs write handler: set ksm_smart_scan from a boolean string as
 * understood by kstrtobool().
 *
 * Returns @count on success, -EINVAL if the input cannot be parsed.
 */
static ssize_t
smart_scan_store(struct kobject *kobj, struct kobj_attribute *attr,
                 const char *buf, size_t count)
{
        bool enable;

        if (kstrtobool(buf, &enable))
                return -EINVAL;

        ksm_smart_scan = enable;
        return count;
}
KSM_ATTR(smart_scan);

/*
 * sysfs read handler for the advisor mode: prints the known modes on one
 * line with the currently selected one in square brackets.
 *
 * @output starts out as an empty string so that a ksm_advisor value matching
 * neither known mode yields a bare newline rather than handing an
 * uninitialized pointer to sysfs_emit().
 */
static ssize_t advisor_mode_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf)
{
        const char *output = "";

        if (ksm_advisor == KSM_ADVISOR_NONE)
                output = "[none] scan-time";
        else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME)
                output = "none [scan-time]";

        return sysfs_emit(buf, "%s\n", output);
}

/*
 * sysfs write handler: select the advisor. Accepts "scan-time" or "none"
 * (compared with sysfs_streq()); when the selection actually changes,
 * set_advisor_defaults() is called.
 *
 * Returns @count on success, -EINVAL for any other input.
 */
static ssize_t
advisor_mode_store(struct kobject *kobj, struct kobj_attribute *attr,
                   const char *buf, size_t count)
{
        enum ksm_advisor_type previous = ksm_advisor;

        if (sysfs_streq("scan-time", buf)) {
                ksm_advisor = KSM_ADVISOR_SCAN_TIME;
        } else if (sysfs_streq("none", buf)) {
                ksm_advisor = KSM_ADVISOR_NONE;
        } else {
                return -EINVAL;
        }

        /* Set advisor default values */
        if (previous != ksm_advisor)
                set_advisor_defaults();

        return count;
}
KSM_ATTR(advisor_mode);

/* sysfs read handler: print ksm_advisor_max_cpu as a decimal number. */
static ssize_t
advisor_max_cpu_show(struct kobject *kobj, struct kobj_attribute *attr,
                     char *buf)
{
        return sysfs_emit(buf, "%u\n", ksm_advisor_max_cpu);
}

/*
 * sysfs write handler: set ksm_advisor_max_cpu from an unsigned decimal
 * value.
 *
 * The value is parsed with kstrtouint() so that input which does not fit in
 * an unsigned int is rejected instead of being silently truncated on
 * assignment. NOTE(review): this relies on ksm_advisor_max_cpu being an
 * unsigned int, as the "%u" in advisor_max_cpu_show() suggests - confirm
 * against its declaration.
 *
 * Returns @count on success, -EINVAL if the input cannot be parsed or is out
 * of range.
 */
static ssize_t advisor_max_cpu_store(struct kobject *kobj,
                                     struct kobj_attribute *attr,
                                     const char *buf, size_t count)
{
        unsigned int value;
        int err;

        err = kstrtouint(buf, 10, &value);
        if (err)
                return -EINVAL;

        ksm_advisor_max_cpu = value;
        return count;
}
KSM_ATTR(advisor_max_cpu);

/* sysfs read handler: print ksm_advisor_min_pages_to_scan as a decimal number. */
static ssize_t
advisor_min_pages_to_scan_show(struct kobject *kobj,
                               struct kobj_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%lu\n", ksm_advisor_min_pages_to_scan);
}

/*
 * sysfs write handler: set ksm_advisor_min_pages_to_scan from an unsigned
 * decimal value.
 *
 * Returns @count on success, -EINVAL if the input cannot be parsed.
 */
static ssize_t
advisor_min_pages_to_scan_store(struct kobject *kobj,
                                struct kobj_attribute *attr,
                                const char *buf, size_t count)
{
        unsigned long nr_pages;

        if (kstrtoul(buf, 10, &nr_pages))
                return -EINVAL;

        ksm_advisor_min_pages_to_scan = nr_pages;
        return count;
}
KSM_ATTR(advisor_min_pages_to_scan);

/* sysfs read handler: print ksm_advisor_max_pages_to_scan as a decimal number. */
static ssize_t
advisor_max_pages_to_scan_show(struct kobject *kobj,
                               struct kobj_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%lu\n", ksm_advisor_max_pages_to_scan);
}

/*
 * sysfs write handler: set ksm_advisor_max_pages_to_scan from an unsigned
 * decimal value.
 *
 * Returns @count on success, -EINVAL if the input cannot be parsed.
 */
static ssize_t
advisor_max_pages_to_scan_store(struct kobject *kobj,
                                struct kobj_attribute *attr,
                                const char *buf, size_t count)
{
        unsigned long nr_pages;

        if (kstrtoul(buf, 10, &nr_pages))
                return -EINVAL;

        ksm_advisor_max_pages_to_scan = nr_pages;
        return count;
}
KSM_ATTR(advisor_max_pages_to_scan);

/* sysfs read handler: emit ksm_advisor_target_scan_time as "%lu\n". */
static ssize_t advisor_target_scan_time_show(struct kobject *kobj,
                                             struct kobj_attribute *attr, char *buf)
{
        const unsigned long scan_time = ksm_advisor_target_scan_time;

        return sysfs_emit(buf, "%lu\n", scan_time);
}

/*
 * sysfs write handler: parse @buf as a base-10 unsigned long and store it
 * in ksm_advisor_target_scan_time.  Zero is not accepted.  Returns @count,
 * or -EINVAL for unparsable input or a value of zero.
 */
static ssize_t advisor_target_scan_time_store(struct kobject *kobj,
                                              struct kobj_attribute *attr,
                                              const char *buf, size_t count)
{
        unsigned long scan_time;

        /* Short-circuit keeps scan_time from being read when parsing failed. */
        if (kstrtoul(buf, 10, &scan_time) || scan_time == 0)
                return -EINVAL;

        ksm_advisor_target_scan_time = scan_time;
        return count;
}
KSM_ATTR(advisor_target_scan_time);

/*
 * NULL-terminated list of the KSM sysfs attributes.  It is referenced by
 * ksm_attr_group below.  merge_across_nodes is only listed when the kernel
 * is built with CONFIG_NUMA.
 */
static struct attribute *ksm_attrs[] = {
        &sleep_millisecs_attr.attr,
        &pages_to_scan_attr.attr,
        &run_attr.attr,
        &pages_scanned_attr.attr,
        &pages_shared_attr.attr,
        &pages_sharing_attr.attr,
        &pages_unshared_attr.attr,
        &pages_volatile_attr.attr,
        &pages_skipped_attr.attr,
        &ksm_zero_pages_attr.attr,
        &full_scans_attr.attr,
#ifdef CONFIG_NUMA
        &merge_across_nodes_attr.attr,
#endif
        &max_page_sharing_attr.attr,
        &stable_node_chains_attr.attr,
        &stable_node_dups_attr.attr,
        &stable_node_chains_prune_millisecs_attr.attr,
        &use_zero_pages_attr.attr,
        &general_profit_attr.attr,
        &smart_scan_attr.attr,
        /* advisor tunables */
        &advisor_mode_attr.attr,
        &advisor_max_cpu_attr.attr,
        &advisor_min_pages_to_scan_attr.attr,
        &advisor_max_pages_to_scan_attr.attr,
        &advisor_target_scan_time_attr.attr,
        NULL,        /* sentinel */
};

/*
 * Attribute group named "ksm" holding every entry of ksm_attrs.  ksm_init()
 * registers it on mm_kobj with sysfs_create_group().
 */
static const struct attribute_group ksm_attr_group = {
        .attrs = ksm_attrs,
        .name = "ksm",
};
#endif /* CONFIG_SYSFS */

/*
 * ksm_init - one-time KSM setup, run as a subsys_initcall.
 *
 * Steps, in order:
 *   1. compute the checksum of the zero page and disable use_zero_pages;
 *   2. set up the slab caches via ksm_slab_init();
 *   3. start the "ksmd" kernel thread running ksm_scan_thread();
 *   4. with CONFIG_SYSFS, register the "ksm" attribute group on mm_kobj;
 *      without it, set ksm_run to KSM_RUN_MERGE since there is no knob to
 *      start merging later;
 *   5. with CONFIG_MEMORY_HOTREMOVE, register ksm_memory_callback as a
 *      memory hotplug notifier.
 *
 * Returns 0 on success.  On failure the error from the failing step is
 * returned; the slab caches are freed, and the thread is stopped first if
 * it had already been started.
 */
static int __init ksm_init(void)
{
        struct task_struct *ksm_thread;
        int err;

        /* The correct value depends on page size and endianness */
        zero_checksum = calc_checksum(ZERO_PAGE(0));
        /* Default to false for backwards compatibility */
        ksm_use_zero_pages = false;

        err = ksm_slab_init();
        if (err)
                goto out;

        ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
        if (IS_ERR(ksm_thread)) {
                pr_err("ksm: creating kthread failed\n");
                err = PTR_ERR(ksm_thread);
                goto out_free;
        }

#ifdef CONFIG_SYSFS
        err = sysfs_create_group(mm_kobj, &ksm_attr_group);
        if (err) {
                pr_err("ksm: register sysfs failed\n");
                /* Undo the thread creation before releasing the slabs. */
                kthread_stop(ksm_thread);
                goto out_free;
        }
#else
        ksm_run = KSM_RUN_MERGE;        /* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
        /*
         * There is no significance to this priority 100.
         * NOTE(review): any result of hotplug_memory_notifier() is not
         * checked here.
         */
        hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI);
#endif
        return 0;

out_free:
        ksm_slab_free();
out:
        return err;
}
subsys_initcall(ksm_init);


















































































































































































































































































































































































































































   18 

















































































































































































































































































































































































    1 




    1 

















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CGROUP_H
#define _LINUX_CGROUP_H
/*
 *  cgroup interface
 *
 *  Copyright (C) 2003 BULL SA
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/jump_label.h>
#include <linux/types.h>
#include <linux/ns_common.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/kernel_stat.h>

#include <linux/cgroup-defs.h>

struct kernel_clone_args;

#ifdef CONFIG_CGROUPS

/*
 * All weight knobs on the default hierarchy should use the following min,
 * default and max values.  The default value is the logarithmic center of
 * MIN and MAX (100 == sqrt(1 * 10000)) and allows 100x to be expressed in
 * both directions.
 */
#define CGROUP_WEIGHT_MIN                1
#define CGROUP_WEIGHT_DFL                100
#define CGROUP_WEIGHT_MAX                10000

/*
 * Flag bits for css task iteration.  Presumably these are the values
 * accepted by css_task_iter_start()'s @flags and kept in
 * css_task_iter->flags -- confirm against the iterator implementation.
 * The low bits are caller-visible options; bit 16 is marked internal.
 */
enum {
        CSS_TASK_ITER_PROCS    = (1U << 0),  /* walk only threadgroup leaders */
        CSS_TASK_ITER_THREADED = (1U << 1),  /* walk all threaded css_sets in the domain */
        CSS_TASK_ITER_SKIPPED  = (1U << 16), /* internal flags */
};

/*
 * a css_task_iter should be treated as an opaque object
 *
 * Callers only pass a pointer to it to css_task_iter_start(),
 * css_task_iter_next() and css_task_iter_end(); the fields below are
 * cursor state and should not be accessed directly.
 */
struct css_task_iter {
        struct cgroup_subsys                *ss;
        unsigned int                        flags;

        /* position / head pairs for the css_set and threaded css_set walks */
        struct list_head                *cset_pos;
        struct list_head                *cset_head;

        struct list_head                *tcset_pos;
        struct list_head                *tcset_head;

        struct list_head                *task_pos;

        struct list_head                *cur_tasks_head;
        struct css_set                        *cur_cset;
        struct css_set                        *cur_dcset;
        struct task_struct                *cur_task;
        struct list_head                iters_node;        /* css_set->task_iters */
};

extern struct file_system_type cgroup_fs_type;
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;
extern spinlock_t css_set_lock;

/*
 * <linux/cgroup_subsys.h> is included twice with different definitions of
 * SUBSYS().  The first pass declares one "struct cgroup_subsys
 * <name>_cgrp_subsys" per SUBSYS(<name>) entry.
 */
#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/*
 * The second pass declares the two static keys of each subsystem,
 * <name>_cgrp_subsys_enabled_key and <name>_cgrp_subsys_on_dfl_key, which
 * cgroup_subsys_enabled() and cgroup_subsys_on_dfl() test.
 */
#define SUBSYS(_x)                                                                \
        extern struct static_key_true _x ## _cgrp_subsys_enabled_key;                \
        extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/**
 * cgroup_subsys_enabled - fast test on whether a subsys is enabled
 * @ss: subsystem in question
 *
 * @ss is token-pasted with "_enabled_key", so it must be the bare
 * identifier of the subsystem variable (<name>_cgrp_subsys), not an
 * arbitrary expression or pointer.
 */
#define cgroup_subsys_enabled(ss)                                                \
        static_branch_likely(&ss ## _enabled_key)

/**
 * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy
 * @ss: subsystem in question
 *
 * @ss is token-pasted with "_on_dfl_key", so it must be the bare
 * identifier of the subsystem variable (<name>_cgrp_subsys), not an
 * arbitrary expression or pointer.
 */
#define cgroup_subsys_on_dfl(ss)                                                \
        static_branch_likely(&ss ## _on_dfl_key)

bool css_has_online_children(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
                                         struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
                                             struct cgroup_subsys *ss);
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
                                                       struct cgroup_subsys *ss);

struct cgroup *cgroup_get_from_path(const char *path);
struct cgroup *cgroup_get_from_fd(int fd);
struct cgroup *cgroup_v1v2_get_from_fd(int fd);

int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);

int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
void cgroup_file_notify(struct cgroup_file *cfile);
void cgroup_file_show(struct cgroup_file *cfile, bool show);

int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk);

void cgroup_fork(struct task_struct *p);
extern int cgroup_can_fork(struct task_struct *p,
                           struct kernel_clone_args *kargs);
extern void cgroup_cancel_fork(struct task_struct *p,
                               struct kernel_clone_args *kargs);
extern void cgroup_post_fork(struct task_struct *p,
                             struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);

int cgroup_init_early(void);
int cgroup_init(void);

int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v);

/*
 * Iteration helpers and macros.
 */

struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
                                           struct cgroup_subsys_state *parent);
struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos,
                                                    struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos);
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
                                                     struct cgroup_subsys_state *css);

struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
                                         struct cgroup_subsys_state **dst_cssp);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
                                        struct cgroup_subsys_state **dst_cssp);

void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it);
struct task_struct *css_task_iter_next(struct css_task_iter *it);
void css_task_iter_end(struct css_task_iter *it);

/**
 * css_for_each_child - iterate through children of a css
 * @pos: the css * to use as the loop cursor
 * @parent: css whose children to walk
 *
 * Walk @parent's children.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 *
 * Both @pos and @parent are expanded more than once, so they should be
 * free of side effects.
 */
#define css_for_each_child(pos, parent)                                        \
        for ((pos) = css_next_child(NULL, (parent)); (pos);                \
             (pos) = css_next_child((pos), (parent)))

/**
 * css_for_each_descendant_pre - pre-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Walk @css's descendants.  @css is included in the iteration and the
 * first node to be visited.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * For example, the following guarantees that a descendant can't escape
 * state updates of its ancestors.
 *
 * my_online(@css)
 * {
 *        Lock @css's parent and @css;
 *        Inherit state from the parent;
 *        Unlock both.
 * }
 *
 * my_update_state(@css)
 * {
 *        css_for_each_descendant_pre(@pos, @css) {
 *                Lock @pos;
 *                if (@pos == @css)
 *                        Update @css's state;
 *                else
 *                        Verify @pos is alive and inherit state from its parent;
 *                Unlock @pos;
 *        }
 * }
 *
 * As long as the inheriting step, including checking the parent state, is
 * enclosed inside @pos locking, double-locking the parent isn't necessary
 * while inheriting.  The state update to the parent is guaranteed to be
 * visible by walking order and, as long as inheriting operations to the
 * same @pos are atomic to each other, multiple updates racing each other
 * still result in the correct state.  It's guaranteed that at least one
 * inheritance happens for any css after the latest update to its parent.
 *
 * If checking parent's state requires locking the parent, each inheriting
 * iteration should lock and unlock both @pos->parent and @pos.
 *
 * Alternatively, a subsystem may choose to use a single global lock to
 * synchronize ->css_online() and ->css_offline() against tree-walking
 * operations.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 *
 * Both @pos and @css are expanded more than once, so they should be free
 * of side effects.
 */
#define css_for_each_descendant_pre(pos, css)                                \
        for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);        \
             (pos) = css_next_descendant_pre((pos), (css)))

/**
 * css_for_each_descendant_post - post-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Similar to css_for_each_descendant_pre() but performs post-order
 * traversal instead.  @css is included in the iteration and the last
 * node to be visited.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Note that the walk visibility guarantee example described in pre-order
 * walk doesn't apply the same to post-order walks.
 *
 * Both @pos and @css are expanded more than once, so they should be free
 * of side effects.
 */
#define css_for_each_descendant_post(pos, css)                                \
        for ((pos) = css_next_descendant_post(NULL, (css)); (pos);        \
             (pos) = css_next_descendant_post((pos), (css)))

/**
 * cgroup_taskset_for_each - iterate cgroup_taskset
 * @task: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * @tset may contain multiple tasks and they may belong to multiple
 * processes.
 *
 * On the v2 hierarchy, there may be tasks from multiple processes and they
 * may not share the source or destination csses.
 *
 * On traditional hierarchies, when there are multiple tasks in @tset, if a
 * task of a process is in @tset, all tasks of the process are in @tset.
 * Also, all are guaranteed to share the same source and destination csses.
 *
 * Iteration is not in any specific order.
 *
 * The address of @dst_css is passed to cgroup_taskset_first() and
 * cgroup_taskset_next(), so it must be an lvalue of type
 * struct cgroup_subsys_state *.  All three arguments are expanded more
 * than once.
 */
#define cgroup_taskset_for_each(task, dst_css, tset)                        \
        for ((task) = cgroup_taskset_first((tset), &(dst_css));                \
             (task);                                                        \
             (task) = cgroup_taskset_next((tset), &(dst_css)))

/**
 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
 * @leader: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
 * may not contain any.
 *
 * Every task in @tset is visited, but tasks that are not their own
 * ->group_leader hit the empty "if" branch; the statement following the
 * macro becomes the "else" branch and so only runs for leaders.  The
 * address of @dst_css is taken, so it must be an lvalue.
 */
#define cgroup_taskset_for_each_leader(leader, dst_css, tset)                \
        for ((leader) = cgroup_taskset_first((tset), &(dst_css));        \
             (leader);                                                        \
             (leader) = cgroup_taskset_next((tset), &(dst_css)))        \
                if ((leader) != (leader)->group_leader)                        \
                        ;                                                \
                else

/*
 * Inline functions.
 */

/*
 * css reference counting helpers.  With CONFIG_DEBUG_CGROUP_REF only the
 * prototypes are declared here.  Otherwise <linux/cgroup_refcnt.h> is
 * included with CGROUP_REF_FN_ATTRS defined as "static inline" and
 * CGROUP_REF_EXPORT() expanding to nothing.
 */
#ifdef CONFIG_DEBUG_CGROUP_REF
void css_get(struct cgroup_subsys_state *css);
void css_get_many(struct cgroup_subsys_state *css, unsigned int n);
bool css_tryget(struct cgroup_subsys_state *css);
bool css_tryget_online(struct cgroup_subsys_state *css);
void css_put(struct cgroup_subsys_state *css);
void css_put_many(struct cgroup_subsys_state *css, unsigned int n);
#else
#define CGROUP_REF_FN_ATTRS        static inline
#define CGROUP_REF_EXPORT(fn)
#include <linux/cgroup_refcnt.h>
#endif

static inline u64 cgroup_id(const struct cgroup *cgrp)
{
        return cgrp->kn->id;
}

/**
 * css_is_dying - test whether the specified css is dying
 * @css: target css
 *
 * Test whether @css is in the process of offlining or already offline.  In
 * most cases, ->css_online() and ->css_offline() callbacks should be
 * enough; however, the actual offline operations are RCU delayed and this
 * test returns %true also when @css is scheduled to be offlined.
 *
 * This is useful, for example, when the use case requires synchronous
 * behavior with respect to cgroup removal.  cgroup removal schedules css
 * offlining but the css can seem alive while the operation is being
 * delayed.  If the delay affects user visible semantics, this test can be
 * used to resolve the situation.
 */
static inline bool css_is_dying(struct cgroup_subsys_state *css)
{
        return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt);
}

/* Take a reference on @cgrp through its embedded self css. */
static inline void cgroup_get(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self = &cgrp->self;

        css_get(self);
}

/* Try to take a reference on @cgrp; returns the result of css_tryget(). */
static inline bool cgroup_tryget(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self = &cgrp->self;

        return css_tryget(self);
}

/* Drop a reference on @cgrp through its embedded self css. */
static inline void cgroup_put(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self = &cgrp->self;

        css_put(self);
}

extern struct mutex cgroup_mutex;

/* Acquire cgroup_mutex with mutex_lock(); pair with cgroup_unlock(). */
static inline void cgroup_lock(void)
{
        mutex_lock(&cgroup_mutex);
}

/* Release cgroup_mutex taken by cgroup_lock(). */
static inline void cgroup_unlock(void)
{
        mutex_unlock(&cgroup_mutex);
}

/**
 * task_css_set_check - obtain a task's css_set with extra access conditions
 * @task: the task to obtain css_set for
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * A task's css_set is RCU protected, initialized and exited while holding
 * task_lock(), and can only be modified while holding both cgroup_mutex
 * and task_lock() while the task is alive.  This macro verifies that the
 * caller is inside proper critical section and returns @task's css_set.
 *
 * The caller can also specify additional allowed conditions via @__c, such
 * as locks used during the cgroup_subsys::attach() methods.
 *
 * The verification only exists with CONFIG_PROVE_RCU.  Without it the
 * macro is a plain rcu_dereference() and @__c is not expanded at all, so
 * @__c must not carry side effects.
 */
#ifdef CONFIG_PROVE_RCU
#define task_css_set_check(task, __c)                                        \
        rcu_dereference_check((task)->cgroups,                                \
                rcu_read_lock_sched_held() ||                                \
                lockdep_is_held(&cgroup_mutex) ||                        \
                lockdep_is_held(&css_set_lock) ||                        \
                ((task)->flags & PF_EXITING) || (__c))
#else
#define task_css_set_check(task, __c)                                        \
        rcu_dereference((task)->cgroups)
#endif

/**
 * task_css_check - obtain css for (task, subsys) w/ extra access conds
 * @task: the target task
 * @subsys_id: the target subsystem ID
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
 * synchronization rules are the same as task_css_set_check().
 *
 * @subsys_id is used directly as an index into the css_set's subsys[]
 * array; no range check is performed here.
 */
#define task_css_check(task, subsys_id, __c)                                \
        task_css_set_check((task), (__c))->subsys[(subsys_id)]

/**
 * task_css_set - obtain a task's css_set
 * @task: the task to obtain css_set for
 *
 * See task_css_set_check().
 */
static inline struct css_set *task_css_set(struct task_struct *task)
{
        /* No extra access condition beyond the default checks. */
        struct css_set *cset = task_css_set_check(task, false);

        return cset;
}

/**
 * task_css - obtain css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * See task_css_check().
 */
static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
                                                   int subsys_id)
{
        /* No extra access condition beyond the default checks. */
        struct cgroup_subsys_state *css = task_css_check(task, subsys_id, false);

        return css;
}

/**
 * task_get_css - find and get the css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Find the css for the (@task, @subsys_id) combination, increment a
 * reference on and return it.  This function is guaranteed to return a
 * valid css.  The returned css may already have been offlined.
 */
static inline struct cgroup_subsys_state *
task_get_css(struct task_struct *task, int subsys_id)
{
        struct cgroup_subsys_state *css;

        rcu_read_lock();
        css = task_css(task, subsys_id);
        /*
         * css_tryget_online() can't be used here: a task with PF_EXITING
         * set may stay associated with an offline css, and for such a task
         * css_tryget_online() would fail forever.  Retry with a fresh
         * lookup until a reference is obtained.
         */
        while (unlikely(!css_tryget(css))) {
                cpu_relax();
                css = task_css(task, subsys_id);
        }
        rcu_read_unlock();

        return css;
}

/**
 * task_css_is_root - test whether a task belongs to the root css
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Test whether @task belongs to the root css on the specified subsystem.
 * May be invoked in any context.
 */
static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
{
        /* The extra condition is "true", so the lookup is allowed anywhere. */
        struct cgroup_subsys_state *css = task_css_check(task, subsys_id, true);
        struct cgroup_subsys_state *root_css = init_css_set.subsys[subsys_id];

        return css == root_css;
}

static inline struct cgroup *task_cgroup(struct task_struct *task,
                                         int subsys_id)
{
        return task_css(task, subsys_id)->cgroup;
}

static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
{
        return task_css_set(task)->dfl_cgrp;
}

/* Return the parent cgroup of @cgrp, or NULL when its self css has no parent. */
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pcss = cgrp->self.parent;

        return pcss ? container_of(pcss, struct cgroup, self) : NULL;
}

/**
 * cgroup_is_descendant - test ancestry
 * @cgrp: the cgroup to be tested
 * @ancestor: possible ancestor of @cgrp
 *
 * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
 * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp
 * and @ancestor are accessible.
 */
static inline bool cgroup_is_descendant(struct cgroup *cgrp,
                                        struct cgroup *ancestor)
{
        /*
         * Same hierarchy, @cgrp at least as deep as @ancestor, and
         * @ancestor recorded at its own level in @cgrp's ancestor table.
         */
        return cgrp->root == ancestor->root &&
               cgrp->level >= ancestor->level &&
               cgrp->ancestors[ancestor->level] == ancestor;
}

/**
 * cgroup_ancestor - find ancestor of cgroup
 * @cgrp: cgroup to find ancestor of
 * @ancestor_level: level of ancestor to find starting from root
 *
 * Find ancestor of cgroup at specified level starting from root if it exists
 * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at
 * @ancestor_level.
 *
 * This function is safe to call as long as @cgrp is accessible.
 */
static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
                                             int ancestor_level)
{
        /* Only levels 0..cgrp->level index valid ancestor slots. */
        if (ancestor_level >= 0 && ancestor_level <= cgrp->level)
                return cgrp->ancestors[ancestor_level];

        return NULL;
}

/**
 * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
 * @task: the task to be tested
 * @ancestor: possible ancestor of @task's cgroup
 *
 * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
 * It follows all the same rules as cgroup_is_descendant, and only applies
 * to the default hierarchy.
 */
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
{
        /* task_dfl_cgroup() yields task_css_set(task)->dfl_cgrp. */
        return cgroup_is_descendant(task_dfl_cgroup(task), ancestor);
}

/*
 * Report whether any of @cgrp's populated counters is non-zero.  The fields
 * are read without synchronization, so the result is only a hint.
 */
static inline bool cgroup_is_populated(struct cgroup *cgrp)
{
        return (cgrp->nr_populated_csets +
                cgrp->nr_populated_domain_children +
                cgrp->nr_populated_threaded_children) != 0;
}

/* Return the inode number kernfs_ino() reports for @cgrp's kernfs node. */
static inline ino_t cgroup_ino(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        return kernfs_ino(kn);
}

/* cft/css accessors for cftype->write() operation */
static inline struct cftype *of_cft(struct kernfs_open_file *of)
{
        return of->kn->priv;
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);

/*
 * cft/css accessors for cftype->seq_*() operations.
 * @seq->private is handed to of_cft() as the kernfs_open_file.
 */
static inline struct cftype *seq_cft(struct seq_file *seq)
{
        struct kernfs_open_file *of = seq->private;

        return of_cft(of);
}

/* Return of_css() for the kernfs_open_file stored in @seq->private. */
static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
{
        struct kernfs_open_file *of = seq->private;

        return of_css(of);
}

/*
 * Name / path handling functions.  All are thin wrappers around the kernfs
 * counterparts and can be called under any context.
 */

/* Forward to kernfs_name() for @cgrp's kernfs node and return its result. */
static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
{
        struct kernfs_node *kn = cgrp->kn;

        return kernfs_name(kn, buf, buflen);
}

/* Forward to kernfs_path() for @cgrp's kernfs node and return its result. */
static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen)
{
        struct kernfs_node *kn = cgrp->kn;

        return kernfs_path(kn, buf, buflen);
}

/* Forward to pr_cont_kernfs_name() for @cgrp's kernfs node. */
static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        pr_cont_kernfs_name(kn);
}

/* Forward to pr_cont_kernfs_path() for @cgrp's kernfs node. */
static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        pr_cont_kernfs_path(kn);
}

bool cgroup_psi_enabled(void);

static inline void cgroup_init_kthreadd(void)
{
        /*
         * kthreadd is inherited by all kthreads, keep it in the root so
         * that the new kthreads are guaranteed to stay in the root until
         * initialization is finished.
         */
        current->no_cgroup_migration = 1;
}

static inline void cgroup_kthread_ready(void)
{
        /*
         * This kthread finished initialization.  The creator should have
         * set PF_NO_SETAFFINITY if this kthread should stay in the root.
         */
        current->no_cgroup_migration = 0;
}

void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
struct cgroup *cgroup_get_from_id(u64 id);
#else /* !CONFIG_CGROUPS */

/*
 * Stubs for kernels built without CONFIG_CGROUPS.  They are no-ops, or
 * return a fixed value, so that callers compile without #ifdefs.
 */
struct cgroup_subsys_state;
struct cgroup;

/* Every cgroup reports the same fixed id of 1. */
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
                                         struct task_struct *t) { return 0; }
/* Unlike the other stubs, this one reports failure. */
static inline int cgroupstats_build(struct cgroupstats *stats,
                                    struct dentry *dentry) { return -EINVAL; }

/* Fork/exit hooks: nothing to track, and forking is always allowed. */
static inline void cgroup_fork(struct task_struct *p) {}
static inline int cgroup_can_fork(struct task_struct *p,
                                  struct kernel_clone_args *kargs) { return 0; }
static inline void cgroup_cancel_fork(struct task_struct *p,
                                      struct kernel_clone_args *kargs) {}
static inline void cgroup_post_fork(struct task_struct *p,
                                    struct kernel_clone_args *kargs) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_release(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {}

static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_init_kthreadd(void) {}
static inline void cgroup_kthread_ready(void) {}

static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
        return NULL;
}

static inline bool cgroup_psi_enabled(void)
{
        return false;
}

/* Without cgroups every task is reported as being under @ancestor. */
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
{
        return true;
}

/* Leaves @buf untouched. */
static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{}
#endif /* !CONFIG_CGROUPS */

#ifdef CONFIG_CGROUPS
/*
 * cgroup scalable recursive statistics.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
void cgroup_rstat_flush(struct cgroup *cgrp);
void cgroup_rstat_flush_hold(struct cgroup *cgrp);
void cgroup_rstat_flush_release(struct cgroup *cgrp);

/*
 * Basic resource stats.
 */
#ifdef CONFIG_CGROUP_CPUACCT
void cpuacct_charge(struct task_struct *tsk, u64 cputime);
void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
static inline void cpuacct_account_field(struct task_struct *tsk, int index,
                                         u64 val) {}
#endif

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec);

/*
 * Charge @delta_exec of CPU time to @task: first to cpuacct, then to the
 * task's default-hierarchy cgroup unless that cgroup has no parent.
 */
static inline void cgroup_account_cputime(struct task_struct *task,
                                          u64 delta_exec)
{
        struct cgroup *dfl_cgrp;

        cpuacct_charge(task, delta_exec);

        dfl_cgrp = task_dfl_cgroup(task);
        if (!cgroup_parent(dfl_cgrp))
                return;

        __cgroup_account_cputime(dfl_cgrp, delta_exec);
}

/*
 * Charge @delta_exec to the @index usage field for @task: first in cpuacct,
 * then in the task's default-hierarchy cgroup unless that cgroup has no
 * parent.
 */
static inline void cgroup_account_cputime_field(struct task_struct *task,
                                                enum cpu_usage_stat index,
                                                u64 delta_exec)
{
        struct cgroup *dfl_cgrp;

        cpuacct_account_field(task, index, delta_exec);

        dfl_cgrp = task_dfl_cgroup(task);
        if (!cgroup_parent(dfl_cgrp))
                return;

        __cgroup_account_cputime_field(dfl_cgrp, index, delta_exec);
}

#else        /* !CONFIG_CGROUPS */

/* CPU time accounting is a no-op when cgroups are compiled out. */
static inline void cgroup_account_cputime(struct task_struct *task,
                                          u64 delta_exec) {}
static inline void cgroup_account_cputime_field(struct task_struct *task,
                                                enum cpu_usage_stat index,
                                                u64 delta_exec) {}

#endif        /* !CONFIG_CGROUPS */

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);

/* Return the cgroup pointer stored in @skcd. */
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
        return skcd->cgroup;
}

#else        /* !CONFIG_SOCK_CGROUP_DATA */

/* No per-socket cgroup data to manage: all three hooks do nothing. */
static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}

#endif        /* !CONFIG_SOCK_CGROUP_DATA */

/*
 * A cgroup namespace.
 *
 * @ns embeds the common namespace data; its count field is the reference
 * count manipulated by get_cgroup_ns() and put_cgroup_ns().  The other
 * members point at the associated user namespace, ucounts and css_set
 * (NOTE(review): their exact ownership rules are not visible in this
 * header — see the implementation before relying on them).
 */
struct cgroup_namespace {
        struct ns_common        ns;
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        struct css_set          *root_cset;
};

extern struct cgroup_namespace init_cgroup_ns;

#ifdef CONFIG_CGROUPS

void free_cgroup_ns(struct cgroup_namespace *ns);

struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
                                        struct user_namespace *user_ns,
                                        struct cgroup_namespace *old_ns);

int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
                   struct cgroup_namespace *ns);

#else /* !CONFIG_CGROUPS */

/* Without cgroups there is nothing to free. */
static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }

/* Without cgroups no new namespace is created: @old_ns is handed back. */
static inline struct cgroup_namespace *
copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
               struct cgroup_namespace *old_ns)
{
        return old_ns;
}

#endif /* !CONFIG_CGROUPS */

/* Take a reference on @ns; a NULL @ns is silently ignored. */
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
{
        if (!ns)
                return;

        refcount_inc(&ns->ns.count);
}

/*
 * Drop a reference on @ns and free the namespace once the count reaches
 * zero.  A NULL @ns is silently ignored.
 */
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
{
        if (!ns)
                return;

        if (refcount_dec_and_test(&ns->ns.count))
                free_cgroup_ns(ns);
}

#ifdef CONFIG_CGROUPS

void cgroup_enter_frozen(void);
void cgroup_leave_frozen(bool always_leave);
void cgroup_update_frozen(struct cgroup *cgrp);
void cgroup_freeze(struct cgroup *cgrp, bool freeze);
void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
                                 struct cgroup *dst);

/* Report the value of @task's frozen flag. */
static inline bool cgroup_task_frozen(struct task_struct *task)
{
        return task->frozen;
}

#else /* !CONFIG_CGROUPS */

/* Freezer stubs for !CONFIG_CGROUPS: no task is ever reported as frozen. */
static inline void cgroup_enter_frozen(void) { }
static inline void cgroup_leave_frozen(bool always_leave) { }
static inline bool cgroup_task_frozen(struct task_struct *task)
{
        return false;
}

#endif /* !CONFIG_CGROUPS */

#ifdef CONFIG_CGROUP_BPF
/* Take a reference on @cgrp's bpf percpu refcount. */
static inline void cgroup_bpf_get(struct cgroup *cgrp)
{
        percpu_ref_get(&cgrp->bpf.refcnt);
}

/* Drop a reference on @cgrp's bpf percpu refcount. */
static inline void cgroup_bpf_put(struct cgroup *cgrp)
{
        percpu_ref_put(&cgrp->bpf.refcnt);
}

#else /* CONFIG_CGROUP_BPF */

/* Without CONFIG_CGROUP_BPF there is no bpf refcount to adjust. */
static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}

#endif /* CONFIG_CGROUP_BPF */

struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);

#endif /* _LINUX_CGROUP_H */



















    2 







    2 

    2 













    1 


    2 




    2 


    2 




















































































































































































    1 









    1 
























    1 


















    1 




























    2 






    1 



    2 
    2 




























    1 














    1 

    1 





    1 


    1 
    1 








































































    2 










    2 


    2 









    2 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/hfs/bnode.c
 *
 * Copyright (C) 2001
 * Brad Boyer (flar@allandria.com)
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 *
 * Handle basic btree node operations
 */

#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/swap.h>

#include "btree.h"

/*
 * hfs_bnode_read - copy @len bytes starting at node offset @off into @buf.
 *
 * The node-relative offset is turned into a page index and an in-page offset
 * using node->page_offset, and the data is copied page by page.  Copying
 * stops once the page index reaches tree->pages_per_bnode.
 *
 * A non-positive @len copies nothing.  A negative @off is rejected, because
 * it would index before node->page[].  Whatever part of @buf cannot be
 * filled from the node is zeroed, so callers never see uninitialized memory
 * when an offset taken from disk points outside the node.
 */
void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
{
        struct page *page;
        int pagenum;
        int bytes_read;
        int bytes_to_read;

        if (len <= 0)
                return;
        if (off < 0) {
                memset(buf, 0, len);
                return;
        }

        off += node->page_offset;
        pagenum = off >> PAGE_SHIFT;
        off &= ~PAGE_MASK; /* compute page offset for the first page */

        for (bytes_read = 0; bytes_read < len; bytes_read += bytes_to_read) {
                if (pagenum >= node->tree->pages_per_bnode)
                        break;
                page = node->page[pagenum];
                bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off);

                memcpy_from_page(buf + bytes_read, page, off, bytes_to_read);

                pagenum++;
                off = 0; /* page offset only applies to the first page */
        }

        /* Short read: do not leave the tail of @buf uninitialized. */
        if (bytes_read < len)
                memset(buf + bytes_read, 0, len - bytes_read);
}

/*
 * Read the big-endian 16-bit value stored at node offset @off and return it
 * in CPU byte order.
 */
u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
{
        /*
         * Start from zero: hfs_bnode_read() stops copying when the offset
         * lies beyond the node's pages, and bytes it did not write must not
         * be returned as stack garbage.
         */
        __be16 data = 0;

        // optimize later...
        hfs_bnode_read(node, &data, off, 2);
        return be16_to_cpu(data);
}

/* Read the single byte stored at node offset @off. */
u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off)
{
        /*
         * Start from zero: hfs_bnode_read() stops copying when the offset
         * lies beyond the node's pages, and a byte it did not write must not
         * be returned as stack garbage.
         */
        u8 data = 0;

        // optimize later...
        hfs_bnode_read(node, &data, off, 1);
        return data;
}

/*
 * Copy the key found at node offset @off into @key.
 *
 * Leaf nodes, and any node of a tree with HFS_TREE_VARIDXKEYS set, store the
 * key length in the first byte of the key; otherwise the tree-wide
 * max_key_len is used.  In both cases one extra byte is copied to cover the
 * length byte itself.
 */
void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
{
        struct hfs_btree *tree = node->tree;
        int nbytes;

        if (node->type != HFS_NODE_LEAF &&
            !(tree->attributes & HFS_TREE_VARIDXKEYS))
                nbytes = tree->max_key_len + 1;
        else
                nbytes = hfs_bnode_read_u8(node, off) + 1;

        hfs_bnode_read(node, key, off, nbytes);
}

/*
 * Copy @len bytes from @buf to node offset @off and mark the page dirty.
 * Only the node's first page is written.
 */
void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
{
        struct page *first_page = node->page[0];
        int page_off = off + node->page_offset;

        memcpy_to_page(first_page, page_off, buf, len);
        set_page_dirty(first_page);
}

/* Store @data as a big-endian 16-bit value at node offset @off. */
void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data)
{
        __be16 be_val;

        be_val = cpu_to_be16(data);
        hfs_bnode_write(node, &be_val, off, 2);
}

/* Store the single byte @data at node offset @off. */
void hfs_bnode_write_u8(struct hfs_bnode *node, int off, u8 data)
{
        u8 *src = &data;

        hfs_bnode_write(node, src, off, 1);
}

/*
 * Zero @len bytes starting at node offset @off and mark the page dirty.
 * Only the node's first page is touched.
 */
void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
{
        struct page *first_page = node->page[0];
        int page_off = off + node->page_offset;

        memzero_page(first_page, page_off, len);
        set_page_dirty(first_page);
}

/*
 * Copy @len bytes from offset @src of @src_node to offset @dst of @dst_node
 * and mark the destination page dirty.  Only the first page of each node is
 * used; a zero @len is a no-op apart from the debug message.
 */
void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
                struct hfs_bnode *src_node, int src, int len)
{
        struct page *from, *to;
        int from_off, to_off;

        hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
        if (len == 0)
                return;

        from_off = src + src_node->page_offset;
        to_off = dst + dst_node->page_offset;
        from = src_node->page[0];
        to = dst_node->page[0];

        memcpy_page(to, to_off, from, from_off, len);
        set_page_dirty(to);
}

/*
 * Move @len bytes within @node from offset @src to offset @dst (the ranges
 * may overlap) and mark the page dirty.  Only the node's first page is
 * used; a zero @len is a no-op apart from the debug message.
 */
void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
{
        struct page *first_page;
        int from_off, to_off;
        void *base;

        hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
        if (len == 0)
                return;

        from_off = src + node->page_offset;
        to_off = dst + node->page_offset;
        first_page = node->page[0];

        base = kmap_local_page(first_page);
        memmove(base + to_off, base + from_off, len);
        kunmap_local(base);
        set_page_dirty(first_page);
}

/*
 * Debug helper: print a node's descriptor and its record offset table via
 * hfs_dbg()/hfs_dbg_cont() under the BNODE_MOD category.
 *
 * The descriptor is re-read from the node at offset 0.  The offset table is
 * walked downward from the last two bytes of the node (node_size - 2),
 * printing num_recs + 1 entries.  For every entry except the final one
 * (i == 0), index nodes also print the key size, the raw key length byte and
 * the 4-byte value stored right after the key, while leaf nodes print the
 * key length byte.
 */
void hfs_bnode_dump(struct hfs_bnode *node)
{
        struct hfs_bnode_desc desc;
        __be32 cnid;
        int i, off, key_off;

        hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this);
        hfs_bnode_read(node, &desc, 0, sizeof(desc));
        hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n",
                be32_to_cpu(desc.next), be32_to_cpu(desc.prev),
                desc.type, desc.height, be16_to_cpu(desc.num_recs));

        /* The offset table grows backwards from the end of the node. */
        off = node->tree->node_size - 2;
        for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) {
                key_off = hfs_bnode_read_u16(node, off);
                hfs_dbg_cont(BNODE_MOD, " %d", key_off);
                if (i && node->type == HFS_NODE_INDEX) {
                        int tmp;

                        /*
                         * (len | 1) + 1 rounds the key, including its
                         * length byte, up to an even number of bytes.
                         */
                        if (node->tree->attributes & HFS_TREE_VARIDXKEYS)
                                tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1;
                        else
                                tmp = node->tree->max_key_len + 1;
                        hfs_dbg_cont(BNODE_MOD, " (%d,%d",
                                     tmp, hfs_bnode_read_u8(node, key_off));
                        /* The 4-byte value follows the key. */
                        hfs_bnode_read(node, &cnid, key_off + tmp, 4);
                        hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid));
                } else if (i && node->type == HFS_NODE_LEAF) {
                        int tmp;

                        tmp = hfs_bnode_read_u8(node, key_off);
                        hfs_dbg_cont(BNODE_MOD, " (%d)", tmp);
                }
        }
        hfs_dbg_cont(BNODE_MOD, "\n");
}

/*
 * Detach @node from its sibling chain and mark it deleted.
 *
 * The previous sibling's "next" link and the next sibling's "prev" link are
 * updated both in the in-memory node and in the node descriptor (written
 * through hfs_bnode_write()).  For a leaf node that has no previous/next
 * sibling, tree->leaf_head / tree->leaf_tail is updated instead.  A node
 * without a parent resets tree->root and tree->depth to 0.
 *
 * NOTE(review): if hfs_bnode_find() fails for either sibling the function
 * returns early, leaving HFS_BNODE_DELETED unset and any link already
 * rewritten in place — confirm callers tolerate this.
 */
void hfs_bnode_unlink(struct hfs_bnode *node)
{
        struct hfs_btree *tree;
        struct hfs_bnode *tmp;
        __be32 cnid;

        tree = node->tree;
        if (node->prev) {
                /* Point the previous sibling past this node. */
                tmp = hfs_bnode_find(tree, node->prev);
                if (IS_ERR(tmp))
                        return;
                tmp->next = node->next;
                cnid = cpu_to_be32(tmp->next);
                hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, next), 4);
                hfs_bnode_put(tmp);
        } else if (node->type == HFS_NODE_LEAF)
                tree->leaf_head = node->next;

        if (node->next) {
                /* Point the next sibling back past this node. */
                tmp = hfs_bnode_find(tree, node->next);
                if (IS_ERR(tmp))
                        return;
                tmp->prev = node->prev;
                cnid = cpu_to_be32(tmp->prev);
                hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4);
                hfs_bnode_put(tmp);
        } else if (node->type == HFS_NODE_LEAF)
                tree->leaf_tail = node->prev;

        // move down?
        if (!node->prev && !node->next) {
                printk(KERN_DEBUG "hfs_btree_del_level\n");
        }
        if (!node->parent) {
                tree->root = 0;
                tree->depth = 0;
        }
        set_bit(HFS_BNODE_DELETED, &node->flags);
}

/*
 * Map a node number to a bucket of the node hash table by folding the upper
 * bits into the lower ones and masking with NODE_HASH_SIZE - 1.
 */
static inline int hfs_bnode_hash(u32 num)
{
        u32 folded = num + (num >> 16);

        folded += folded >> 8;
        return folded & (NODE_HASH_SIZE - 1);
}

/*
 * Look up node number @cnid in the tree's hash table.
 *
 * Returns the hashed node, or NULL if @cnid is not below tree->node_count or
 * no such node is hashed.  No reference is taken on the returned node.  The
 * callers in this file invoke it with tree->hash_lock held.
 */
struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
{
        struct hfs_bnode *node;

        if (cnid >= tree->node_count) {
                /* cnid is unsigned: print it as such (matches hfs_bnode_create) */
                pr_err("request for non-existent node %u in B*Tree\n", cnid);
                return NULL;
        }

        for (node = tree->node_hash[hfs_bnode_hash(cnid)];
             node; node = node->next_hash) {
                if (node->this == cnid) {
                        return node;
                }
        }
        return NULL;
}

/*
 * Allocate an in-memory node for @cnid, insert it into the tree's hash table
 * and read its backing pages.
 *
 * Returns NULL if @cnid is out of range or the allocation fails.  If a node
 * for @cnid is already hashed, the fresh allocation is freed and the
 * existing node is returned with an extra reference, after waiting for its
 * HFS_BNODE_NEW flag to clear.  Otherwise the new node is returned with a
 * reference count of 1 and HFS_BNODE_NEW still set; HFS_BNODE_ERROR is also
 * set if one of its pages could not be read.
 */
static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
{
        struct hfs_bnode *node, *node2;
        struct address_space *mapping;
        struct page *page;
        int size, block, i, hash;
        loff_t off;

        if (cnid >= tree->node_count) {
                pr_err("request for non-existent node %d in B*Tree\n", cnid);
                return NULL;
        }

        /* The node struct is followed by one page pointer per backing page. */
        size = sizeof(struct hfs_bnode) + tree->pages_per_bnode *
                sizeof(struct page *);
        node = kzalloc(size, GFP_KERNEL);
        if (!node)
                return NULL;
        node->tree = tree;
        node->this = cnid;
        set_bit(HFS_BNODE_NEW, &node->flags);
        atomic_set(&node->refcnt, 1);
        hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n",
                node->tree->cnid, node->this);
        init_waitqueue_head(&node->lock_wq);
        /*
         * Re-check the hash under the lock: the allocation above ran
         * unlocked, so a node for @cnid may have been hashed meanwhile.
         */
        spin_lock(&tree->hash_lock);
        node2 = hfs_bnode_findhash(tree, cnid);
        if (!node2) {
                hash = hfs_bnode_hash(cnid);
                node->next_hash = tree->node_hash[hash];
                tree->node_hash[hash] = node;
                tree->node_hash_cnt++;
        } else {
                /* Already hashed: drop ours and wait for the existing node. */
                hfs_bnode_get(node2);
                spin_unlock(&tree->hash_lock);
                kfree(node);
                wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags));
                return node2;
        }
        spin_unlock(&tree->hash_lock);

        /* Locate the node inside the tree's inode mapping and read its pages. */
        mapping = tree->inode->i_mapping;
        off = (loff_t)cnid * tree->node_size;
        block = off >> PAGE_SHIFT;
        node->page_offset = off & ~PAGE_MASK;
        for (i = 0; i < tree->pages_per_bnode; i++) {
                page = read_mapping_page(mapping, block++, NULL);
                if (IS_ERR(page))
                        goto fail;
                node->page[i] = page;
        }

        return node;
fail:
        /* The node stays hashed; the error is reported through its flags. */
        set_bit(HFS_BNODE_ERROR, &node->flags);
        return node;
}

/*
 * Remove @node from its tree's hash table and decrement the hashed-node
 * count.  BUGs if the node is not found in its hash bucket.
 */
void hfs_bnode_unhash(struct hfs_bnode *node)
{
        struct hfs_bnode **link;

        hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n",
                node->tree->cnid, node->this, atomic_read(&node->refcnt));

        /* Walk the bucket chain until the link pointing at @node is found. */
        link = &node->tree->node_hash[hfs_bnode_hash(node->this)];
        while (*link && *link != node)
                link = &(*link)->next_hash;

        BUG_ON(!*link);
        *link = node->next_hash;
        node->tree->node_hash_cnt--;
}

/*
 * Load a particular node out of a tree.
 *
 * Returns the node numbered @num with a reference held.  A node already in
 * the hash table is returned once its HFS_BNODE_NEW flag clears; otherwise
 * the node is created, its descriptor is parsed from the first page and its
 * record offset table is validated before HFS_BNODE_NEW is cleared.
 *
 * Returns ERR_PTR(-ENOMEM) if the node could not be created, and
 * ERR_PTR(-EIO) if the node is flagged HFS_BNODE_ERROR or fails validation
 * (in which case the reference is dropped again).
 */
struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
{
        struct hfs_bnode *node;
        struct hfs_bnode_desc *desc;
        int i, rec_off, off, next_off;
        int entry_size, key_size;

        spin_lock(&tree->hash_lock);
        node = hfs_bnode_findhash(tree, num);
        if (node) {
                /* Take the reference while the hash lock is still held. */
                hfs_bnode_get(node);
                spin_unlock(&tree->hash_lock);
                wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags));
                if (test_bit(HFS_BNODE_ERROR, &node->flags))
                        goto node_error;
                return node;
        }
        spin_unlock(&tree->hash_lock);
        node = __hfs_bnode_create(tree, num);
        if (!node)
                return ERR_PTR(-ENOMEM);
        if (test_bit(HFS_BNODE_ERROR, &node->flags))
                goto node_error;
        /* Not NEW: __hfs_bnode_create() handed back an existing node. */
        if (!test_bit(HFS_BNODE_NEW, &node->flags))
                return node;

        /* Cache the descriptor fields in CPU byte order. */
        desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) +
                                         node->page_offset);
        node->prev = be32_to_cpu(desc->prev);
        node->next = be32_to_cpu(desc->next);
        node->num_recs = be16_to_cpu(desc->num_recs);
        node->type = desc->type;
        node->height = desc->height;
        kunmap_local(desc);

        /* The height must be consistent with the node type. */
        switch (node->type) {
        case HFS_NODE_HEADER:
        case HFS_NODE_MAP:
                if (node->height != 0)
                        goto node_error;
                break;
        case HFS_NODE_LEAF:
                if (node->height != 1)
                        goto node_error;
                break;
        case HFS_NODE_INDEX:
                if (node->height <= 1 || node->height > tree->depth)
                        goto node_error;
                break;
        default:
                goto node_error;
        }

        /*
         * Validate the record offset table, which grows backwards from the
         * end of the node: the first record must start right after the
         * descriptor, and each following offset must be strictly larger,
         * even, and within the node.
         */
        rec_off = tree->node_size - 2;
        off = hfs_bnode_read_u16(node, rec_off);
        if (off != sizeof(struct hfs_bnode_desc))
                goto node_error;
        for (i = 1; i <= node->num_recs; off = next_off, i++) {
                rec_off -= 2;
                next_off = hfs_bnode_read_u16(node, rec_off);
                if (next_off <= off ||
                    next_off > tree->node_size ||
                    next_off & 1)
                        goto node_error;
                entry_size = next_off - off;
                /* Only index and leaf records start with a key. */
                if (node->type != HFS_NODE_INDEX &&
                    node->type != HFS_NODE_LEAF)
                        continue;
                /* The key (length byte included) must fit inside the record. */
                key_size = hfs_bnode_read_u8(node, off) + 1;
                if (key_size >= entry_size /*|| key_size & 1*/)
                        goto node_error;
        }
        /* Node is valid: release anyone waiting on HFS_BNODE_NEW. */
        clear_bit(HFS_BNODE_NEW, &node->flags);
        wake_up(&node->lock_wq);
        return node;

node_error:
        /* Flag the node as bad, wake waiters and drop our reference. */
        set_bit(HFS_BNODE_ERROR, &node->flags);
        clear_bit(HFS_BNODE_NEW, &node->flags);
        wake_up(&node->lock_wq);
        hfs_bnode_put(node);
        return ERR_PTR(-EIO);
}

/* Release every page still attached to @node, then free the node itself. */
void hfs_bnode_free(struct hfs_bnode *node)
{
        int idx = 0;

        while (idx < node->tree->pages_per_bnode) {
                struct page *pg = node->page[idx++];

                /* Slots may be empty if reading the node's pages failed. */
                if (!pg)
                        continue;
                put_page(pg);
        }
        kfree(node);
}

/*
 * Create a fresh, zero-filled node numbered @num.
 *
 * On success the node is returned with a reference held, its pages zeroed
 * and marked dirty, and HFS_BNODE_NEW cleared.  Returns ERR_PTR(-ENOMEM) if
 * the node could not be allocated and ERR_PTR(-EIO) if its pages could not
 * be read.
 *
 * If a node with this number is unexpectedly already hashed, a warning is
 * emitted and the existing node is returned.  A reference is taken on it
 * while the hash lock is still held, so that, as on every other return
 * path, the caller owns a reference it can drop with hfs_bnode_put().
 */
struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
{
        struct hfs_bnode *node;
        struct page **pagep;
        int i;

        spin_lock(&tree->hash_lock);
        node = hfs_bnode_findhash(tree, num);
        /* hfs_bnode_get() ignores NULL; pin the node before unlocking. */
        hfs_bnode_get(node);
        spin_unlock(&tree->hash_lock);
        if (node) {
                pr_crit("new node %u already hashed?\n", num);
                WARN_ON(1);
                return node;
        }
        node = __hfs_bnode_create(tree, num);
        if (!node)
                return ERR_PTR(-ENOMEM);
        if (test_bit(HFS_BNODE_ERROR, &node->flags)) {
                hfs_bnode_put(node);
                return ERR_PTR(-EIO);
        }

        /* The first page is cleared from the node's offset within it. */
        pagep = node->page;
        memzero_page(*pagep, node->page_offset,
                     min((int)PAGE_SIZE, (int)tree->node_size));
        set_page_dirty(*pagep);
        /* Any further pages belong to the node entirely. */
        for (i = 1; i < tree->pages_per_bnode; i++) {
                memzero_page(*++pagep, 0, PAGE_SIZE);
                set_page_dirty(*pagep);
        }
        clear_bit(HFS_BNODE_NEW, &node->flags);
        wake_up(&node->lock_wq);

        return node;
}

/* Take a reference on @node; a NULL @node is ignored. */
void hfs_bnode_get(struct hfs_bnode *node)
{
        if (!node)
                return;

        atomic_inc(&node->refcnt);
        hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n",
                node->tree->cnid, node->this,
                atomic_read(&node->refcnt));
}

/*
 * Drop a reference on @node; a NULL @node is ignored.
 *
 * When the last reference goes away the node's pages are marked accessed.
 * If the node is flagged HFS_BNODE_DELETED it is also unhashed, released in
 * the allocation bitmap and freed; otherwise it stays in the hash table.
 */
void hfs_bnode_put(struct hfs_bnode *node)
{
        struct hfs_btree *tree;
        int i;

        if (!node)
                return;

        tree = node->tree;
        hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n",
                node->tree->cnid, node->this,
                atomic_read(&node->refcnt));
        BUG_ON(!atomic_read(&node->refcnt));

        /* Returns with hash_lock held only when the count dropped to zero. */
        if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
                return;

        for (i = 0; i < tree->pages_per_bnode; i++) {
                if (node->page[i])
                        mark_page_accessed(node->page[i]);
        }

        if (!test_bit(HFS_BNODE_DELETED, &node->flags)) {
                spin_unlock(&tree->hash_lock);
                return;
        }

        hfs_bnode_unhash(node);
        spin_unlock(&tree->hash_lock);
        hfs_bmap_free(node);
        hfs_bnode_free(node);
}




































































































































































































































   12 

    4 

    8 

    1 























































































































































































































































































































































































































   30 





    2 










    2 


















   26 



   30 
   29 































































































































































































































































   13 






   12 















































   18 






   27 

   29 


   30 














   27 



    8 
   26 




   30 













   26 








   29 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/util.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include <linux/slab.h>
#include <linux/rculist.h>

#include "common.h"

/* Lock for protecting policy. */
DEFINE_MUTEX(tomoyo_policy_lock);

/* Has /sbin/init started? */
bool tomoyo_policy_loaded;

/*
 * Mapping table from "enum tomoyo_mac_index" to
 * "enum tomoyo_mac_category_index".
 *
 * Indexed by the MAC index; each entry names the category (file, network or
 * misc) that the individual operation belongs to.
 */
const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX] = {
        /* CONFIG::file group */
        [TOMOYO_MAC_FILE_EXECUTE]    = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_OPEN]       = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CREATE]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_UNLINK]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_GETATTR]    = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKDIR]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_RMDIR]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKFIFO]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKSOCK]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_TRUNCATE]   = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_SYMLINK]    = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKBLOCK]    = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKCHAR]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_LINK]       = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_RENAME]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CHMOD]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CHOWN]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CHGRP]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_IOCTL]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CHROOT]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MOUNT]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_UMOUNT]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_PIVOT_ROOT] = TOMOYO_MAC_CATEGORY_FILE,
        /* CONFIG::network group */
        [TOMOYO_MAC_NETWORK_INET_STREAM_BIND]       =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN]     =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT]    =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_DGRAM_BIND]        =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_DGRAM_SEND]        =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_RAW_BIND]          =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_RAW_SEND]          =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND]       =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN]     =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT]    =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND]        =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND]        =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND]    =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN]  =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT] =
        TOMOYO_MAC_CATEGORY_NETWORK,
        /* CONFIG::misc group */
        [TOMOYO_MAC_ENVIRON]         = TOMOYO_MAC_CATEGORY_MISC,
};

/**
 * tomoyo_convert_time - Break a timestamp down into calendar fields.
 *
 * @time64: Seconds since 1970/01/01 00:00:00.
 * @stamp:  Pointer to "struct tomoyo_time" receiving the result.
 *
 * The conversion is done with a zero offset.  The month is stored 1-based
 * and the year as a full year.
 *
 * Returns nothing.
 */
void tomoyo_convert_time(time64_t time64, struct tomoyo_time *stamp)
{
        struct tm broken_down;

        time64_to_tm(time64, 0, &broken_down);

        /* Date part: rebase the month by 1 and the year by 1900. */
        stamp->year = broken_down.tm_year + 1900;
        stamp->month = broken_down.tm_mon + 1;
        stamp->day = broken_down.tm_mday;

        /* Time-of-day part is copied as is. */
        stamp->hour = broken_down.tm_hour;
        stamp->min = broken_down.tm_min;
        stamp->sec = broken_down.tm_sec;
}

/**
 * tomoyo_permstr - Find permission keywords.
 *
 * @string:  String representation for permissions in foo/bar/buz format.
 * @keyword: Keyword to find from @string.
 *
 * Returns true if the first occurrence of @keyword in @string starts either
 * at the beginning of @string or immediately after a '/', false otherwise.
 *
 * Only the first occurrence reported by strstr() is examined, so this
 * function assumes that strncmp(w1, w2, strlen(w1)) != 0 if w1 != w2.
 */
bool tomoyo_permstr(const char *string, const char *keyword)
{
        const char *hit = strstr(string, keyword);

        if (!hit)
                return false;
        /* A hit at the very start has no preceding character to inspect. */
        if (hit == string)
                return true;
        return hit[-1] == '/';
}

/**
 * tomoyo_read_token - Read a word from a line.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Splits @param->data at the first ' ' (overwriting it with '\0') and
 * advances @param->data to the character after it. If there is no ' ',
 * @param->data is advanced to the terminating '\0'.
 *
 * Returns the word that starts at the old @param->data. This is "" rather
 * than NULL when there are no more words to read, so that the caller can
 * skip the NULL check.
 */
char *tomoyo_read_token(struct tomoyo_acl_param *param)
{
        char *word = param->data;
        char *sep = strchr(word, ' ');

        if (!sep) {
                /* Last word: leave the cursor on the terminating NUL. */
                param->data = word + strlen(word);
                return word;
        }
        *sep = '\0';
        param->data = sep + 1;
        return word;
}

static bool tomoyo_correct_path2(const char *filename, const size_t len);

/**
 * tomoyo_get_domainname - Read a domainname from a line.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * The domainname extends over every ' ' that is followed by a token accepted
 * by tomoyo_correct_path2(). The first ' ' that is not followed by such a
 * token is overwritten with '\0' and ends the domainname. @param->data is
 * advanced past that ' ' (or to the end of the line if there is none).
 *
 * Returns the result of tomoyo_get_name() if the extracted string passes
 * tomoyo_correct_domain(), NULL otherwise.
 */
const struct tomoyo_path_info *tomoyo_get_domainname
(struct tomoyo_acl_param *param)
{
        char *const head = param->data;
        char *cursor = head;

        while (*cursor) {
                const char current_char = *cursor++;

                if (current_char != ' ')
                        continue;
                /* A pathname after the space still belongs to the domainname. */
                if (tomoyo_correct_path2(cursor,
                                         strchrnul(cursor, ' ') - cursor))
                        continue;
                cursor[-1] = '\0';
                break;
        }
        param->data = cursor;
        if (!tomoyo_correct_domain(head))
                return NULL;
        return tomoyo_get_name(head);
}

/**
 * tomoyo_parse_ulong - Parse an "unsigned long" value.
 *
 * @result: Pointer to "unsigned long" that receives the parsed value.
 * @str:    Pointer to pointer to the string to parse.
 *
 * A leading "0x"/"0X" selects hexadecimal, a leading '0' followed by an
 * octal digit selects octal, anything else is parsed as decimal.
 *
 * Returns one of values in "enum tomoyo_value_type".
 *
 * The @str is updated to point the first character after the value
 * on success. It is left untouched if no digit could be consumed.
 */
u8 tomoyo_parse_ulong(unsigned long *result, char **str)
{
        const char *digits = *str;
        char *end;
        int radix = 10;
        u8 kind = TOMOYO_VALUE_TYPE_DECIMAL;

        if (digits[0] == '0') {
                const char next = digits[1];

                if (next == 'x' || next == 'X') {
                        /* Skip the "0x" prefix. */
                        radix = 16;
                        kind = TOMOYO_VALUE_TYPE_HEXADECIMAL;
                        digits += 2;
                } else if (next >= '0' && next <= '7') {
                        /* Skip the leading '0' that marks octal. */
                        radix = 8;
                        kind = TOMOYO_VALUE_TYPE_OCTAL;
                        digits += 1;
                }
        }
        *result = simple_strtoul(digits, &end, radix);
        /* No progress means there was no digit to parse. */
        if (end == digits)
                return TOMOYO_VALUE_TYPE_INVALID;
        *str = end;
        return kind;
}

/**
 * tomoyo_print_ulong - Print an "unsigned long" value.
 *
 * @buffer:     Pointer to buffer.
 * @buffer_len: Size of @buffer.
 * @value:      An "unsigned long" value.
 * @type:       Type of @value.
 *
 * Decimal values are printed as-is, octal values with a "0" prefix and
 * hexadecimal values with a "0x" prefix and upper-case digits. For any other
 * @type, "type(N)" is printed instead of @value.
 *
 * Returns nothing.
 */
void tomoyo_print_ulong(char *buffer, const int buffer_len,
                        const unsigned long value, const u8 type)
{
        if (type == TOMOYO_VALUE_TYPE_DECIMAL) {
                snprintf(buffer, buffer_len, "%lu", value);
                return;
        }
        if (type == TOMOYO_VALUE_TYPE_OCTAL) {
                snprintf(buffer, buffer_len, "0%lo", value);
                return;
        }
        if (type == TOMOYO_VALUE_TYPE_HEXADECIMAL) {
                snprintf(buffer, buffer_len, "0x%lX", value);
                return;
        }
        /* Unknown type: report the type number rather than the value. */
        snprintf(buffer, buffer_len, "type(%u)", type);
}

/**
 * tomoyo_parse_name_union - Parse a tomoyo_name_union.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @ptr:   Pointer to "struct tomoyo_name_union".
 *
 * A word starting with '@' is looked up as a TOMOYO_PATH_GROUP and stored in
 * @ptr->group. Any other word must pass tomoyo_correct_word() and is stored
 * in @ptr->filename via tomoyo_get_name().
 *
 * Returns true on success, false otherwise.
 */
bool tomoyo_parse_name_union(struct tomoyo_acl_param *param,
                             struct tomoyo_name_union *ptr)
{
        char *word;

        if (param->data[0] != '@') {
                word = tomoyo_read_token(param);
                if (!tomoyo_correct_word(word))
                        return false;
                ptr->filename = tomoyo_get_name(word);
                return ptr->filename != NULL;
        }
        /* Skip the '@' marker; the rest of the word is the group name. */
        param->data++;
        ptr->group = tomoyo_get_group(param, TOMOYO_PATH_GROUP);
        return ptr->group != NULL;
}

/**
 * tomoyo_parse_number_union - Parse a tomoyo_number_union.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @ptr:   Pointer to "struct tomoyo_number_union".
 *
 * @ptr is zeroed first. A word starting with '@' is looked up as a
 * TOMOYO_NUMBER_GROUP. Otherwise the word must be either "value" or
 * "min-max" with min <= max. A single value is stored as both ends of the
 * range.
 *
 * Returns true on success, false otherwise.
 */
bool tomoyo_parse_number_union(struct tomoyo_acl_param *param,
                               struct tomoyo_number_union *ptr)
{
        char *cursor;
        u8 kind;
        unsigned long number;

        memset(ptr, 0, sizeof(*ptr));
        if (param->data[0] == '@') {
                param->data++;
                ptr->group = tomoyo_get_group(param, TOMOYO_NUMBER_GROUP);
                return ptr->group != NULL;
        }

        /* Lower bound (or the only value). */
        cursor = tomoyo_read_token(param);
        kind = tomoyo_parse_ulong(&number, &cursor);
        if (kind == TOMOYO_VALUE_TYPE_INVALID)
                return false;
        ptr->values[0] = number;
        ptr->value_type[0] = kind;
        if (*cursor == '\0') {
                ptr->values[1] = number;
                ptr->value_type[1] = kind;
                return true;
        }

        /* Anything after the first number must be "-" plus an upper bound. */
        if (*cursor != '-')
                return false;
        cursor++;
        kind = tomoyo_parse_ulong(&number, &cursor);
        if (kind == TOMOYO_VALUE_TYPE_INVALID)
                return false;
        if (*cursor)
                return false;
        if (ptr->values[0] > number)
                return false;
        ptr->values[1] = number;
        ptr->value_type[1] = kind;
        return true;
}

/**
 * tomoyo_byte_range - Check whether the string is a \ooo style octal value.
 *
 * @str: Pointer to the three characters that follow the backslash.
 *
 * Returns true if @str starts with a digit in '0'-'3' followed by two digits
 * in '0'-'7' (that is, 000 to 377 in octal), false otherwise.
 *
 * TOMOYO uses \ooo style representation for 0x01 - 0x20 and 0x7F - 0xFF.
 * This function only checks the shape of the digits. Evaluation stops at the
 * first character that fails, so a short string is never read past its
 * terminating '\0'.
 */
static inline bool tomoyo_byte_range(const char *str)
{
        if (str[0] < '0' || str[0] > '3')
                return false;
        if (str[1] < '0' || str[1] > '7')
                return false;
        return str[2] >= '0' && str[2] <= '7';
}

/**
 * tomoyo_alphabet_char - Check whether the character is an alphabet.
 *
 * @c: The character to check.
 *
 * Returns true if @c is in 'A'-'Z' or 'a'-'z', false otherwise.
 */
static inline bool tomoyo_alphabet_char(const char c)
{
        if (c >= 'a' && c <= 'z')
                return true;
        return c >= 'A' && c <= 'Z';
}

/**
 * tomoyo_make_byte - Make byte value from three octal characters.
 *
 * @c1: The first (most significant) octal digit character.
 * @c2: The second octal digit character.
 * @c3: The third (least significant) octal digit character.
 *
 * The characters are not validated here.
 *
 * Returns byte value.
 */
static inline u8 tomoyo_make_byte(const u8 c1, const u8 c2, const u8 c3)
{
        const int high = (c1 - '0') << 6;
        const int middle = (c2 - '0') << 3;
        const int low = c3 - '0';

        return high + middle + low;
}

/**
 * tomoyo_valid - Check whether the character is a valid char.
 *
 * @c: The character to check.
 *
 * Returns true if @c is strictly between ' ' and 127, false otherwise.
 * '\0' is not valid.
 */
static inline bool tomoyo_valid(const unsigned char c)
{
        return !(c <= ' ' || c >= 127);
}

/**
 * tomoyo_invalid - Check whether the character is an invalid char.
 *
 * @c: The character to check.
 *
 * Returns true if @c is not '\0' and is either <= ' ' or >= 127, false
 * otherwise. '\0' is neither valid nor invalid, which lets callers use this
 * as a loop condition that stops at the end of the string.
 */
static inline bool tomoyo_invalid(const unsigned char c)
{
        if (c == '\0')
                return false;
        return !(c > ' ' && c < 127);
}

/**
 * tomoyo_str_starts - Check whether the given string starts with the given keyword.
 *
 * @src:  Pointer to pointer to the string.
 * @find: Pointer to the keyword.
 *
 * Returns true if @src starts with @find, false otherwise.
 *
 * The @src is updated to point the first character after the @find
 * if @src starts with @find. It is left untouched otherwise.
 */
bool tomoyo_str_starts(char **src, const char *find)
{
        /* Keep the length as size_t: that is what strlen()/strncmp() use. */
        const size_t len = strlen(find);
        char *tmp = *src;

        if (strncmp(tmp, find, len))
                return false;
        *src = tmp + len;
        return true;
}

/**
 * tomoyo_normalize_line - Format string.
 *
 * @buffer: The line to normalize. Rewritten in place.
 *
 * Every run of characters rejected by tomoyo_valid() (whitespace, control
 * characters and bytes >= 127) is treated as a word separator. Leading and
 * trailing separators are removed, and each separator run between two words
 * is replaced by a single ' '.
 *
 * Returns nothing.
 */
void tomoyo_normalize_line(unsigned char *buffer)
{
        unsigned char *src = buffer;
        unsigned char *dst = buffer;
        bool need_space = false;

        for (;;) {
                /* Drop the separator run in front of the next word. */
                while (tomoyo_invalid(*src))
                        src++;
                if (!*src)
                        break;
                if (need_space)
                        *dst++ = ' ';
                need_space = true;
                /* Copy one word. dst never overtakes src. */
                while (tomoyo_valid(*src))
                        *dst++ = *src++;
        }
        *dst = '\0';
}

/**
 * tomoyo_correct_word2 - Validate a string.
 *
 * @string: The string to check. Maybe non-'\0'-terminated.
 * @len:    Length of @string.
 *
 * Check whether the given string follows the naming rules:
 *   - it is not empty;
 *   - every unescaped byte is in the printable range ' ' < c < 127;
 *   - a backslash is followed by a known escape or by a three digit octal
 *     value which denotes a byte outside that printable range;
 *   - at most 20 wildcard escapes are used;
 *   - a "\{" directly follows a '/', a "\}" is directly followed by a '/',
 *     they are paired and no '/' appears between them.
 *
 * Only the first @len bytes of @string are examined.
 *
 * Returns true if @string follows the naming rules, false otherwise.
 */
static bool tomoyo_correct_word2(const char *string, size_t len)
{
        u8 recursion = 20;
        const char *const start = string;
        bool in_repetition = false;

        if (!len)
                goto out;
        while (len--) {
                unsigned char c = *string++;

                if (c == '\\') {
                        /* A trailing lone backslash is not allowed. */
                        if (!len--)
                                goto out;
                        c = *string++;
                        if (c >= '0' && c <= '3') {
                                unsigned char d;
                                unsigned char e;

                                /* "\ooo" needs two more digits. */
                                if (!len-- || !len--)
                                        goto out;
                                d = *string++;
                                e = *string++;
                                if (d < '0' || d > '7' || e < '0' || e > '7')
                                        goto out;
                                c = tomoyo_make_byte(c, d, e);
                                /*
                                 * Printable bytes must be written literally;
                                 * only the other bytes may be escaped.
                                 */
                                if (c <= ' ' || c >= 127)
                                        continue;
                                goto out;
                        }
                        /* Escapes which match a fixed number of bytes. */
                        switch (c) {
                        case '\\':  /* "\\" */
                        case '+':   /* "\+" */
                        case '?':   /* "\?" */
                        case 'x':   /* "\x" */
                        case 'a':   /* "\a" */
                        case '-':   /* "\-" */
                                continue;
                        }
                        /* The remaining escapes are limited in number. */
                        if (!recursion--)
                                goto out;
                        switch (c) {
                        case '*':   /* "\*" */
                        case '@':   /* "\@" */
                        case '$':   /* "\$" */
                        case 'X':   /* "\X" */
                        case 'A':   /* "\A" */
                                continue;
                        case '{':   /* "/\{" */
                                /*
                                 * @string is two bytes past the backslash, so
                                 * the byte before the backslash exists only
                                 * if at least three bytes were consumed.
                                 */
                                if (string - start < 3 || *(string - 3) != '/')
                                        goto out;
                                in_repetition = true;
                                continue;
                        case '}':   /* "\}/" */
                                /*
                                 * The '/' must be inside the @len bytes;
                                 * never look at the byte after the word.
                                 */
                                if (!len || *string != '/')
                                        goto out;
                                if (!in_repetition)
                                        goto out;
                                in_repetition = false;
                                continue;
                        }
                        goto out;
                } else if (in_repetition && c == '/') {
                        goto out;
                } else if (c <= ' ' || c >= 127) {
                        goto out;
                }
        }
        /* An unclosed "\{" is an error. */
        if (in_repetition)
                goto out;
        return true;
 out:
        return false;
}

/**
 * tomoyo_correct_word - Validate a string.
 *
 * @string: The '\0'-terminated string to check.
 *
 * Applies tomoyo_correct_word2() to the whole of @string.
 *
 * Returns true if @string follows the naming rules, false otherwise.
 */
bool tomoyo_correct_word(const char *string)
{
        const size_t length = strlen(string);

        return tomoyo_correct_word2(string, length);
}

/**
 * tomoyo_correct_path2 - Check whether the given pathname follows the naming rules.
 *
 * @filename: The pathname to check. Need not be '\0'-terminated.
 * @len:      Length of @filename.
 *
 * A pathname must contain a '/', the first '/' must come before the first
 * '.' (if there is any '.'), and the whole string must pass
 * tomoyo_correct_word2().
 *
 * Returns true if @filename follows the naming rules, false otherwise.
 */
static bool tomoyo_correct_path2(const char *filename, const size_t len)
{
        const char *first_slash = memchr(filename, '/', len);
        const char *first_dot = memchr(filename, '.', len);

        if (!first_slash)
                return false;
        if (first_dot && !(first_slash < first_dot))
                return false;
        return tomoyo_correct_word2(filename, len);
}

/**
 * tomoyo_correct_path - Validate a pathname.
 *
 * @filename: The '\0'-terminated pathname to check.
 *
 * Applies tomoyo_correct_path2() to the whole of @filename.
 *
 * Returns true if @filename follows the naming rules, false otherwise.
 */
bool tomoyo_correct_path(const char *filename)
{
        const size_t length = strlen(filename);

        return tomoyo_correct_path2(filename, length);
}

/**
 * tomoyo_correct_domain - Check whether the given domainname follows the naming rules.
 *
 * @domainname: The domainname to check. Maybe NULL.
 *
 * A domainname is a token accepted by tomoyo_domain_def(), optionally
 * followed by pathnames which are separated by single ' ' characters. Every
 * pathname must pass tomoyo_correct_path2().
 *
 * Returns true if @domainname follows the naming rules, false otherwise.
 */
bool tomoyo_correct_domain(const unsigned char *domainname)
{
        const unsigned char *component;
        const unsigned char *sep;

        if (!domainname)
                return false;
        if (!tomoyo_domain_def(domainname))
                return false;
        sep = strchr(domainname, ' ');
        /* Only the leading token: nothing more to check. */
        if (!sep)
                return true;
        /* Check every pathname which is followed by another one. */
        for (component = sep + 1; (sep = strchr(component, ' ')) != NULL;
             component = sep + 1) {
                if (!tomoyo_correct_path2(component, sep - component))
                        return false;
        }
        /* The last pathname runs to the end of the string. */
        return tomoyo_correct_path(component);
}

/**
 * tomoyo_domain_def - Check whether the given token can be a domainname.
 *
 * @buffer: The token to check.
 *
 * Only the part of @buffer up to the first ' ' (or up to the end if there is
 * no ' ') is examined. That part must start with '<', end with '>', and the
 * text between the two must pass tomoyo_correct_word2().
 *
 * Returns true if @buffer possibly be a domainname, false otherwise.
 */
bool tomoyo_domain_def(const unsigned char *buffer)
{
        const unsigned char *space;
        int len;

        if (buffer[0] != '<')
                return false;
        space = strchr(buffer, ' ');
        if (space)
                len = space - buffer;
        else
                len = strlen(buffer);
        /* len >= 1 here because buffer[0] is '<'. */
        if (buffer[len - 1] != '>')
                return false;
        /* Validate what is between '<' and '>'. */
        return tomoyo_correct_word2(buffer + 1, len - 2);
}

/**
 * tomoyo_find_domain - Find a domain by the given name.
 *
 * @domainname: The domainname to find.
 *
 * Returns pointer to "struct tomoyo_domain_info" if found, NULL otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname)
{
        struct tomoyo_domain_info *domain;
        struct tomoyo_path_info name;

        name.name = domainname;
        tomoyo_fill_path_info(&name);
        list_for_each_entry_rcu(domain, &tomoyo_domain_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                if (!domain->is_deleted &&
                    !tomoyo_pathcmp(&name, domain->domainname))
                        return domain;
        }
        return NULL;
}

/**
 * tomoyo_const_part_length - Evaluate the initial length without a pattern in a token.
 *
 * @filename: The string to evaluate. Maybe NULL.
 *
 * Counts bytes from the start of @filename until the end of the string or
 * the first backslash escape which is neither "\\" nor a well-formed "\ooo"
 * (first digit 0-3, other digits 0-7). "\\" counts as 2 bytes and "\ooo"
 * counts as 4 bytes.
 *
 * Returns the initial length without a pattern in @filename.
 */
static int tomoyo_const_part_length(const char *filename)
{
        const char *p = filename;
        int count = 0;

        if (!p)
                return 0;
        while (*p != '\0') {
                if (*p != '\\') {
                        count++;
                        p++;
                        continue;
                }
                if (p[1] == '\\') {     /* "\\" */
                        count += 2;
                        p += 2;
                        continue;
                }
                /*
                 * "\ooo": each digit is read only after the previous one
                 * was accepted, so the terminating NUL is never passed.
                 */
                if (p[1] >= '0' && p[1] <= '3' &&
                    p[2] >= '0' && p[2] <= '7' &&
                    p[3] >= '0' && p[3] <= '7') {
                        count += 4;
                        p += 4;
                        continue;
                }
                /* Any other escape starts the patterned part. */
                break;
        }
        return count;
}

/**
 * tomoyo_fill_path_info - Fill in "struct tomoyo_path_info" members.
 *
 * @ptr: Pointer to "struct tomoyo_path_info" to fill in.
 *
 * The caller sets "struct tomoyo_path_info"->name. The const_len, is_dir,
 * is_patterned and hash members are derived from that name.
 */
void tomoyo_fill_path_info(struct tomoyo_path_info *ptr)
{
        const char *const text = ptr->name;
        const int total = strlen(text);

        ptr->const_len = tomoyo_const_part_length(text);
        /* A directory is a non-empty name which ends with '/'. */
        ptr->is_dir = total != 0 && text[total - 1] == '/';
        /* Patterned if the pattern-free prefix is not the whole name. */
        ptr->is_patterned = ptr->const_len < total;
        ptr->hash = full_name_hash(NULL, text, total);
}

/**
 * tomoyo_file_matches_pattern2 - Pattern matching without '/' character and "\-" pattern.
 *
 * @filename:     The start of string to check.
 * @filename_end: The end of string to check.
 * @pattern:      The start of pattern to compare.
 * @pattern_end:  The end of pattern to compare.
 *
 * Compares [@filename, @filename_end) against [@pattern, @pattern_end).
 * Fixed-width escapes are consumed in the main loop. Variable-width escapes
 * ("\*", "\@", "\$", "\X", "\A") are handled by calling this function
 * recursively for each possible length and returning on the first success.
 *
 * Returns true if @filename matches @pattern, false otherwise.
 */
static bool tomoyo_file_matches_pattern2(const char *filename,
                                         const char *filename_end,
                                         const char *pattern,
                                         const char *pattern_end)
{
        while (filename < filename_end && pattern < pattern_end) {
                char c;
                int i;
                int j;

                /* Literal character: must be identical. */
                if (*pattern != '\\') {
                        if (*filename++ != *pattern++)
                                return false;
                        continue;
                }
                c = *filename;
                /* Step over the backslash; *pattern is now the escape type. */
                pattern++;
                switch (*pattern) {
                case '?':
                        /*
                         * One character other than '/'. An escaped byte in
                         * @filename ("\\" or "\ooo") counts as one character.
                         */
                        if (c == '/') {
                                return false;
                        } else if (c == '\\') {
                                if (filename[1] == '\\')
                                        filename++;
                                else if (tomoyo_byte_range(filename + 1))
                                        filename += 3;
                                else
                                        return false;
                        }
                        break;
                case '\\':
                        /* "\\" in @pattern needs "\\" in @filename. */
                        if (c != '\\')
                                return false;
                        if (*++filename != '\\')
                                return false;
                        break;
                case '+':
                        /* One decimal digit. */
                        if (!isdigit(c))
                                return false;
                        break;
                case 'x':
                        /* One hexadecimal digit. */
                        if (!isxdigit(c))
                                return false;
                        break;
                case 'a':
                        /* One alphabet character. */
                        if (!tomoyo_alphabet_char(c))
                                return false;
                        break;
                case '0':
                case '1':
                case '2':
                case '3':
                        /*
                         * "\ooo" in @pattern needs the same three digits
                         * after a backslash in @filename. The increments at
                         * the bottom of the loop consume the final digit.
                         */
                        if (c == '\\' && tomoyo_byte_range(filename + 1)
                            && strncmp(filename + 1, pattern, 3) == 0) {
                                filename += 3;
                                pattern += 2;
                                break;
                        }
                        return false; /* Not matched. */
                case '*':
                case '@':
                        /*
                         * Try to match the rest of @pattern at every offset
                         * i, starting with an empty match. Escaped bytes in
                         * @filename are stepped over as a unit, and "\@"
                         * gives up once a '.' would have to be consumed.
                         */
                        for (i = 0; i <= filename_end - filename; i++) {
                                if (tomoyo_file_matches_pattern2(
                                                    filename + i, filename_end,
                                                    pattern + 1, pattern_end))
                                        return true;
                                c = filename[i];
                                if (c == '.' && *pattern == '@')
                                        break;
                                if (c != '\\')
                                        continue;
                                if (filename[i + 1] == '\\')
                                        i++;
                                else if (tomoyo_byte_range(filename + i + 1))
                                        i += 3;
                                else
                                        break; /* Bad pattern. */
                        }
                        return false; /* Not matched. */
                default:
                        /*
                         * "\$", "\X" and "\A": j is the length of the run of
                         * digits / hex digits / alphabet characters at
                         * @filename. Any other escape leaves j at 0, so the
                         * loop below does not run and the match fails.
                         */
                        j = 0;
                        c = *pattern;
                        if (c == '$') {
                                while (isdigit(filename[j]))
                                        j++;
                        } else if (c == 'X') {
                                while (isxdigit(filename[j]))
                                        j++;
                        } else if (c == 'A') {
                                while (tomoyo_alphabet_char(filename[j]))
                                        j++;
                        }
                        /* At least one character must be consumed. */
                        for (i = 1; i <= j; i++) {
                                if (tomoyo_file_matches_pattern2(
                                                    filename + i, filename_end,
                                                    pattern + 1, pattern_end))
                                        return true;
                        }
                        return false; /* Not matched or bad pattern. */
                }
                filename++;
                pattern++;
        }
        /* Trailing "\*" and "\@" can match an empty string. */
        while (*pattern == '\\' &&
               (*(pattern + 1) == '*' || *(pattern + 1) == '@'))
                pattern += 2;
        /* Both sides must be used up completely. */
        return filename == filename_end && pattern == pattern_end;
}

/**
 * tomoyo_file_matches_pattern - Pattern matching without '/' character.
 *
 * @filename:     The start of string to check.
 * @filename_end: The end of string to check.
 * @pattern:      The start of pattern to compare.
 * @pattern_end:  The end of pattern to compare.
 *
 * The pattern is split at each "\-" operator. @filename must match the first
 * piece and must not match any of the following pieces. A pattern without
 * "\-" is simply handed to tomoyo_file_matches_pattern2().
 *
 * Returns true if @filename matches @pattern, false otherwise.
 */
static bool tomoyo_file_matches_pattern(const char *filename,
                                        const char *filename_end,
                                        const char *pattern,
                                        const char *pattern_end)
{
        const char *piece = pattern;
        /* True while the piece being examined is the one that must match. */
        bool want_match = true;
        bool matched;

        while (pattern < pattern_end - 1) {
                /* Split at "\-" pattern. */
                if (*pattern++ != '\\')
                        continue;
                if (*pattern++ != '-')
                        continue;
                /* The piece ends right before the two bytes of "\-". */
                matched = tomoyo_file_matches_pattern2(filename,
                                                       filename_end,
                                                       piece,
                                                       pattern - 2);
                if (matched != want_match)
                        return false;
                want_match = false;
                piece = pattern;
        }
        matched = tomoyo_file_matches_pattern2(filename, filename_end,
                                               piece, pattern_end);
        return matched == want_match;
}

/**
 * tomoyo_path_matches_pattern2 - Do pathname pattern matching.
 *
 * @f: The start of string to check.
 * @p: The start of pattern to compare.
 *
 * Both strings are compared one '/'-delimited component at a time using
 * tomoyo_file_matches_pattern(). A pattern component which starts with "\{"
 * is a repetition and is handled at the "recursive" label: the component
 * must match one or more consecutive components of @f, and the rest of @p
 * is tried recursively after each repetition.
 *
 * Returns true if @f matches @p, false otherwise.
 */
static bool tomoyo_path_matches_pattern2(const char *f, const char *p)
{
        const char *f_delimiter;
        const char *p_delimiter;

        while (*f && *p) {
                /* Find the end of the current component on both sides. */
                f_delimiter = strchr(f, '/');
                if (!f_delimiter)
                        f_delimiter = f + strlen(f);
                p_delimiter = strchr(p, '/');
                if (!p_delimiter)
                        p_delimiter = p + strlen(p);
                if (*p == '\\' && *(p + 1) == '{')
                        goto recursive;
                if (!tomoyo_file_matches_pattern(f, f_delimiter, p,
                                                 p_delimiter))
                        return false;
                /* Step over the '/' unless the end was reached. */
                f = f_delimiter;
                if (*f)
                        f++;
                p = p_delimiter;
                if (*p)
                        p++;
        }
        /* Ignore trailing "\*" and "\@" in @pattern. */
        while (*p == '\\' &&
               (*(p + 1) == '*' || *(p + 1) == '@'))
                p += 2;
        /* Matched only if both strings are used up. */
        return !*f && !*p;
 recursive:
        /*
         * The "\{" pattern is permitted only after '/' character.
         * This guarantees that below "*(p - 1)" is safe.
         * Also, the "\}" pattern is permitted only before '/' character
         * so that "\{" + "\}" pair will not break the "\-" operator.
         */
        if (*(p - 1) != '/' || p_delimiter <= p + 3 || *p_delimiter != '/' ||
            *(p_delimiter - 1) != '}' || *(p_delimiter - 2) != '\\')
                return false; /* Bad pattern. */
        do {
                /* Compare current component with pattern. */
                /* "p + 2" and "p_delimiter - 2" strip the "\{" and "\}". */
                if (!tomoyo_file_matches_pattern(f, f_delimiter, p + 2,
                                                 p_delimiter - 2))
                        break;
                /* Proceed to next component. */
                f = f_delimiter;
                if (!*f)
                        break;
                f++;
                /* Continue comparison. */
                if (tomoyo_path_matches_pattern2(f, p_delimiter + 1))
                        return true;
                /* Otherwise try to repeat the pattern on the next component. */
                f_delimiter = strchr(f, '/');
        } while (f_delimiter);
        return false; /* Not matched. */
}

/**
 * tomoyo_path_matches_pattern - Check whether the given filename matches the given pattern.
 *
 * @filename: The filename to check.
 * @pattern:  The pattern to compare.
 *
 * Returns true if matches, false otherwise.
 *
 * The following patterns are available.
 *   \\     \ itself.
 *   \ooo   Octal representation of a byte.
 *   \*     Zero or more repetitions of characters other than '/'.
 *   \@     Zero or more repetitions of characters other than '/' or '.'.
 *   \?     1 byte character other than '/'.
 *   \$     One or more repetitions of decimal digits.
 *   \+     1 decimal digit.
 *   \X     One or more repetitions of hexadecimal digits.
 *   \x     1 hexadecimal digit.
 *   \A     One or more repetitions of alphabet characters.
 *   \a     1 alphabet character.
 *
 *   \-     Subtraction operator.
 *
 *   /\{dir\}/   '/' + 'One or more repetitions of dir/' (e.g. /dir/ /dir/dir/
 *               /dir/dir/dir/ ).
 */
bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename,
                                 const struct tomoyo_path_info *pattern)
{
        int prefix_len;

        /* A pattern without wildcards only needs an exact comparison. */
        if (!pattern->is_patterned)
                return !tomoyo_pathcmp(filename, pattern);
        /* A directory never matches a non-directory and vice versa. */
        if (filename->is_dir != pattern->is_dir)
                return false;
        /* The pattern-free prefix of @pattern must be identical. */
        prefix_len = pattern->const_len;
        if (strncmp(filename->name, pattern->name, prefix_len))
                return false;
        /* Run the real matcher on what follows the prefix. */
        return tomoyo_path_matches_pattern2(filename->name + prefix_len,
                                            pattern->name + prefix_len);
}

/**
 * tomoyo_get_exe - Get tomoyo_realpath() of current process.
 *
 * Returns the result of tomoyo_realpath_from_path() for the executable file
 * of current process on success, NULL if current process has no mm or no
 * executable file.
 *
 * This function uses kzalloc(), so the caller must call kfree()
 * if this function didn't return NULL.
 */
const char *tomoyo_get_exe(void)
{
        struct mm_struct *mm = current->mm;
        struct file *exe_file;
        const char *path;

        exe_file = mm ? get_mm_exe_file(mm) : NULL;
        if (!exe_file)
                return NULL;

        path = tomoyo_realpath_from_path(&exe_file->f_path);
        /* Drop the reference which get_mm_exe_file() returned. */
        fput(exe_file);
        return path;
}

/**
 * tomoyo_get_mode - Get MAC mode.
 *
 * @ns:      Pointer to "struct tomoyo_policy_namespace".
 * @profile: Profile number.
 * @index:   Index number of functionality.
 *
 * The mode is looked up for @index itself first, then for the category of
 * @index, then in the default of the profile. Each later level is consulted
 * only if the previous one says TOMOYO_CONFIG_USE_DEFAULT.
 *
 * Returns mode (the lower two bits of the selected value), or
 * TOMOYO_CONFIG_DISABLED if the policy is not loaded yet.
 */
int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile,
                    const u8 index)
{
        struct tomoyo_profile *p;
        u8 mode;

        if (!tomoyo_policy_loaded)
                return TOMOYO_CONFIG_DISABLED;
        p = tomoyo_profile(ns, profile);
        mode = p->config[index];
        if (mode == TOMOYO_CONFIG_USE_DEFAULT) {
                /* Category entries are stored after the per-index entries. */
                mode = p->config[tomoyo_index2category[index]
                                 + TOMOYO_MAX_MAC_INDEX];
                if (mode == TOMOYO_CONFIG_USE_DEFAULT)
                        mode = p->default_config;
        }
        return mode & 3;
}

/**
 * tomoyo_init_request_info - Initialize "struct tomoyo_request_info" members.
 *
 * @r:      Pointer to "struct tomoyo_request_info" to initialize.
 * @domain: Pointer to "struct tomoyo_domain_info". NULL for tomoyo_domain().
 * @index:  Index number of functionality.
 *
 * @r is zeroed, then its domain, profile, type and mode members are set.
 *
 * Returns mode.
 */
int tomoyo_init_request_info(struct tomoyo_request_info *r,
                             struct tomoyo_domain_info *domain, const u8 index)
{
        u8 profile;

        memset(r, 0, sizeof(*r));
        r->domain = domain ? domain : tomoyo_domain();
        /* Read the profile once so r->profile and r->mode agree. */
        profile = r->domain->profile;
        r->type = index;
        r->profile = profile;
        r->mode = tomoyo_get_mode(r->domain->ns, profile, index);
        return r->mode;
}

/**
 * tomoyo_domain_quota_is_ok - Check for domain's quota.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Returns true if the domain is not exceeded quota, false otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r)
{
        unsigned int count = 0;
        struct tomoyo_domain_info *domain = r->domain;
        struct tomoyo_acl_info *ptr;

        if (r->mode != TOMOYO_CONFIG_LEARNING)
                return false;
        if (!domain)
                return true;
        if (READ_ONCE(domain->flags[TOMOYO_DIF_QUOTA_WARNED]))
                return false;
        list_for_each_entry_rcu(ptr, &domain->acl_info_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                u16 perm;

                if (ptr->is_deleted)
                        continue;
                /*
                 * Reading perm bitmap might race with tomoyo_merge_*() because
                 * caller does not hold tomoyo_policy_lock mutex. But exceeding
                 * max_learning_entry parameter by a few entries does not harm.
                 */
                switch (ptr->type) {
                case TOMOYO_TYPE_PATH_ACL:
                        perm = data_race(container_of(ptr, struct tomoyo_path_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_PATH2_ACL:
                        perm = data_race(container_of(ptr, struct tomoyo_path2_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_PATH_NUMBER_ACL:
                        perm = data_race(container_of(ptr, struct tomoyo_path_number_acl, head)
                                  ->perm);
                        break;
                case TOMOYO_TYPE_MKDEV_ACL:
                        perm = data_race(container_of(ptr, struct tomoyo_mkdev_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_INET_ACL:
                        perm = data_race(container_of(ptr, struct tomoyo_inet_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_UNIX_ACL:
                        perm = data_race(container_of(ptr, struct tomoyo_unix_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_MANUAL_TASK_ACL:
                        perm = 0;
                        break;
                default:
                        perm = 1;
                }
                count += hweight16(perm);
        }
        if (count < tomoyo_profile(domain->ns, domain->profile)->
            pref[TOMOYO_PREF_MAX_LEARNING_ENTRY])
                return true;
        WRITE_ONCE(domain->flags[TOMOYO_DIF_QUOTA_WARNED], true);
        /* r->granted = false; */
        tomoyo_write_log(r, "%s", tomoyo_dif[TOMOYO_DIF_QUOTA_WARNED]);
#ifndef CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING
        pr_warn("WARNING: Domain '%s' has too many ACLs to hold. Stopped learning mode.\n",
                domain->domainname->name);
#endif
        return false;
}











































































































    1 




    1 







    8 






    3 













































    1 













    1 




    1 










    4 











    4 




























































    1 
    1 






































































   23 












   24 
























   21 



   22 















   16 




   15 















    4 









    1 












    2 




    2 














    6 


   23 












   18 
   17 





    6 
    5 
    3 



    2 
    2 






































































































































   29 



    4 




   28 




















































    7 


    4 
    7 





















    8 




    8 


    8 


    6 





































    1 




    1 







    4 



    4 

    1 



    1 







    3 

















    3 

    4 















    4 
    3 






    2 





















    4 










    1 












    1 
    1 
    1 










    1 














    4 































    4 
    4 
    4 


























    1 



    1 






    1 



























    5 

    5 















    3 







    2 
    2 

    3 

    3 




    3 
    3 







    4 






























    2 









    2 




    3 




    3 































































    1 











    1 

    1 












    1 



    1 



    1 




    1 

    1 





    1 




    1 






















































   34 



   36 











   34 











    1 







































   31 
   29 


   35 







   15 

   16 

























































































































































































    5 





    4 
    4 





    4 




    1 
    1 
    1 





















    1 


























    1 



    1 



    1 


    1 





    1 
    1 



    1 


    1 



    1 
    1 



    1 





    1 










    1 
    1 






































































































































































    2 
































































































    1 



















    1 








    1 


    1 


    1 









    1 



































































    1 






    1 






    1 
























































































































    1 


    1 
    1 
























    3 


    3 


















































































    3 



















    3 




    2 




    1 



    1 





    1 
    1 










    2 



    1 
    1 
    1 




    3 



    2 






































































    4 













    3 










    3 



















    4 





    4 































    2 























































    1 





    1 






    1 




    1 








































































    1 
    1 







    1 





    1 



    1 

























    1 











    1 








    1 





























    1 




























    1 


    2 











































































































    1 

    1 













    1 




    1 
    1 

























































































































































































    1 
































    1 














    1 


    1 













    1 





    1 








    1 
    1 







    1 































    1 









    2 






    2 






































    2 

    1 










    2 












    2 





















    1 


    1 

    1 






    2 







































































































































































































    1 













    1 


















    1 


















    2 








    1 
    1 









    2 

























    2 


























    2 












    2 















    2 
    2 


    2 













    2 





















    2 





    2 
























































































































    2 









    1 


    1 

    1 


    2 

    1 


    2 












































    1 





























    1 


































    1 







    1 

















    1 
    1 














    1 








    1 











    1 
















































    1 
















    1 
    1 








































    1 













































    1 














    1 


    1 


    1 
    1 
    1 








    1 

    1 
















































































    1 










    1 








    1 





    1 





























    1 




















    1 




    1 



    1 



    1 










    1 
















    1 
























    1 







    1 

    1 





    1 















    1 



















































































































    1 



    1 




    1 


    1 



    1 
























    1 




    1 




    1 





    1 


    1 





































































































































































































































































































































































































































































































































































































































































    2 




    3 
































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
#include <linux/init.h>                /* init_rootfs */
#include <linux/fs_struct.h>        /* get_fs_root et.al. */
#include <linux/fsnotify.h>        /* fsnotify_vfsmount_delete */
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/nospec.h>

#include "pnode.h"
#include "internal.h"

/* Maximum number of mounts in a mount namespace */
static unsigned int sysctl_mount_max __read_mostly = 100000;

/*
 * Mask and shift applied by m_hash() when folding a pointer-derived key
 * into an index of mount_hashtable.
 */
static unsigned int m_hash_mask __ro_after_init;
static unsigned int m_hash_shift __ro_after_init;
/* Same pair, used by mp_hash() for mountpoint_hashtable. */
static unsigned int mp_hash_mask __ro_after_init;
static unsigned int mp_hash_shift __ro_after_init;

static __initdata unsigned long mhash_entries;

/*
 * Handler registered via __setup() for the "mhash_entries=" option.
 * Stores the value parsed from @str (base argument 0) in mhash_entries.
 *
 * Returns 1 once a value has been stored, 0 when @str is NULL.
 */
static int __init set_mhash_entries(char *str)
{
        if (str) {
                mhash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }
        return 0;
}
__setup("mhash_entries=", set_mhash_entries);

static __initdata unsigned long mphash_entries;

/*
 * Handler registered via __setup() for the "mphash_entries=" option.
 * Stores the value parsed from @str (base argument 0) in mphash_entries.
 *
 * Returns 1 once a value has been stored, 0 when @str is NULL.
 */
static int __init set_mphash_entries(char *str)
{
        if (str) {
                mphash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }
        return 0;
}
__setup("mphash_entries=", set_mphash_entries);

/* Global event counter; touch_mnt_namespace() bumps it and copies it to ns->event. */
static u64 event;
/* Allocator for mnt->mnt_id, see mnt_alloc_id() / mnt_free_id(). */
static DEFINE_IDA(mnt_id_ida);
/* Allocator for peer group IDs; mnt_alloc_group_id() never hands out 0. */
static DEFINE_IDA(mnt_group_ida);

/* Don't allow confusion with old 32bit mount ID */
static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);

/* Bucket arrays indexed by m_hash() and mp_hash() respectively. */
static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
/* Slab cache that alloc_vfsmnt() / free_vfsmnt() use for struct mount. */
static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted);        /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */

/*
 * Bundle of parameters describing a mount-attribute change.
 *
 * NOTE(review): no user of this structure is visible in this part of the
 * file; the field notes below are inferred from the names only -- confirm
 * against the code that fills and consumes it.
 */
struct mount_kattr {
        unsigned int attr_set;          /* presumably flags to set */
        unsigned int attr_clr;          /* presumably flags to clear */
        unsigned int propagation;
        unsigned int lookup_flags;
        bool recurse;
        struct user_namespace *mnt_userns;
        struct mnt_idmap *mnt_idmap;
};

/* /sys/fs */
struct kobject *fs_kobj __ro_after_init;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, i.e. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 *
 * Writers in this file go through lock_mount_hash() and
 * unlock_mount_hash().
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

/* Take mount_lock for write (write side of the seqlock). */
static inline void lock_mount_hash(void)
{
        write_seqlock(&mount_lock);
}

/* Release the write side of mount_lock taken by lock_mount_hash(). */
static inline void unlock_mount_hash(void)
{
        write_sequnlock(&mount_lock);
}

/*
 * Select the mount_hashtable bucket for the (parent mount, mountpoint
 * dentry) pair.  Both pointers are scaled down by L1_CACHE_BYTES, summed,
 * mixed with their own upper bits and masked to the table size.
 */
static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
        unsigned long key;

        key = (unsigned long)mnt / L1_CACHE_BYTES;
        key += (unsigned long)dentry / L1_CACHE_BYTES;
        key += key >> m_hash_shift;

        return mount_hashtable + (key & m_hash_mask);
}

/*
 * Select the mountpoint_hashtable bucket for @dentry: the pointer is scaled
 * down by L1_CACHE_BYTES, mixed with its own upper bits and masked to the
 * table size.
 */
static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
        unsigned long key = (unsigned long)dentry / L1_CACHE_BYTES;

        key += key >> mp_hash_shift;

        return mountpoint_hashtable + (key & mp_hash_mask);
}

/*
 * Give @mnt its two identifiers: mnt_id from mnt_id_ida and mnt_id_unique
 * from the monotonically incremented mnt_id_ctr.
 *
 * Returns 0 on success or the negative value reported by ida_alloc(); in
 * the failure case @mnt is left untouched.
 */
static int mnt_alloc_id(struct mount *mnt)
{
        int id = ida_alloc(&mnt_id_ida, GFP_KERNEL);

        if (id >= 0) {
                mnt->mnt_id = id;
                mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
                return 0;
        }
        return id;
}

/*
 * Return mnt->mnt_id to mnt_id_ida (counterpart of mnt_alloc_id()).
 * mnt_id_unique is not touched here.
 */
static void mnt_free_id(struct mount *mnt)
{
        ida_free(&mnt_id_ida, mnt->mnt_id);
}

/*
 * Allocate a new peer group ID for @mnt.  IDs start at 1, so a
 * mnt_group_id of 0 is never handed out by this function.
 *
 * Returns 0 on success or the negative value reported by ida_alloc_min().
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
        int id = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);

        if (id >= 0) {
                mnt->mnt_group_id = id;
                return 0;
        }
        return id;
}

/*
 * Release the peer group ID held by @mnt back to mnt_group_ida and reset
 * the field to 0.
 */
void mnt_release_group_id(struct mount *mnt)
{
        int id = mnt->mnt_group_id;

        ida_free(&mnt_group_ida, id);
        mnt->mnt_group_id = 0;
}

/*
 * Adjust the reference count of @mnt by @n (callers in this file pass
 * both +1 and -1).
 *
 * On SMP the adjustment goes to this CPU's slot of mnt->mnt_pcp; otherwise
 * the single mnt->mnt_count field is updated with preemption disabled.
 *
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
        this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
        preempt_disable();
        mnt->mnt_count += n;
        preempt_enable();
#endif
}

/*
 * Return the reference count of @mnt.  On SMP this is the sum of the
 * per-CPU mnt_count slots over every possible CPU.
 *
 * vfsmount lock must be held for write
 */
int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
        int cpu, total = 0;

        for_each_possible_cpu(cpu)
                total += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;

        return total;
#else
        return mnt->mnt_count;
#endif
}

/*
 * Allocate and initialise a struct mount.
 *
 * The object comes zeroed from mnt_cache, gets its IDs from mnt_alloc_id(),
 * an optional copy of @name as mnt_devname, a reference count of one, empty
 * list/hlist linkage and the nop idmap.
 *
 * Returns the new mount, or NULL if any allocation step fails; everything
 * acquired up to the failing step is released again.
 */
static struct mount *alloc_vfsmnt(const char *name)
{
        struct mount *mnt;

        mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
        if (!mnt)
                return NULL;

        if (mnt_alloc_id(mnt))
                goto out_free_cache;

        /* @name may be NULL, in which case mnt_devname stays NULL. */
        if (name) {
                mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL_ACCOUNT);
                if (!mnt->mnt_devname)
                        goto out_free_id;
        }

#ifdef CONFIG_SMP
        mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
        if (!mnt->mnt_pcp)
                goto out_free_devname;

        /* The initial reference is accounted on the current CPU's slot. */
        this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
        mnt->mnt_count = 1;
        mnt->mnt_writers = 0;
#endif

        INIT_HLIST_NODE(&mnt->mnt_hash);
        INIT_LIST_HEAD(&mnt->mnt_child);
        INIT_LIST_HEAD(&mnt->mnt_mounts);
        INIT_LIST_HEAD(&mnt->mnt_list);
        INIT_LIST_HEAD(&mnt->mnt_expire);
        INIT_LIST_HEAD(&mnt->mnt_share);
        INIT_LIST_HEAD(&mnt->mnt_slave_list);
        INIT_LIST_HEAD(&mnt->mnt_slave);
        INIT_HLIST_NODE(&mnt->mnt_mp_list);
        INIT_LIST_HEAD(&mnt->mnt_umounting);
        INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
        mnt->mnt.mnt_idmap = &nop_mnt_idmap;

        return mnt;

#ifdef CONFIG_SMP
out_free_devname:
        kfree_const(mnt->mnt_devname);
#endif
out_free_id:
        mnt_free_id(mnt);
out_free_cache:
        kmem_cache_free(mnt_cache, mnt);
        return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
bool __mnt_is_readonly(struct vfsmount *mnt)
{
        return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

/*
 * Count one more writer on @mnt: this CPU's mnt_pcp slot on SMP, the plain
 * mnt_writers field otherwise.  The callers in this file invoke it with
 * preemption disabled.
 */
static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
        mnt->mnt_writers++;
#endif
}

/*
 * Count one writer less on @mnt: this CPU's mnt_pcp slot on SMP, the plain
 * mnt_writers field otherwise.  The decrement may land on a different CPU
 * slot than the matching increment; only the sum computed by
 * mnt_get_writers() is meaningful.
 */
static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
        mnt->mnt_writers--;
#endif
}

/*
 * Return the number of writers currently counted on @mnt.  On SMP this is
 * the sum of the per-CPU mnt_writers slots over every possible CPU.
 */
static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        unsigned int total = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                total += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;

        return total;
#else
        return mnt->mnt_writers;
#endif
}

/*
 * Read-only check used on the write-access path.
 *
 * Returns 1 as soon as the superblock's s_readonly_remount is observed
 * set; otherwise returns the result of __mnt_is_readonly() after a read
 * barrier.
 */
static int mnt_is_readonly(struct vfsmount *mnt)
{
        if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
                return 1;
        /*
         * The barrier pairs with the barrier in sb_start_ro_state_change()
         * making sure if we don't see s_readonly_remount set yet, we also will
         * not see any superblock / mount flag changes done by remount.
         * It also pairs with the barrier in sb_end_ro_state_change()
         * assuring that if we see s_readonly_remount already cleared, we will
         * see the values of superblock / mount flags updated by remount.
         */
        smp_rmb();
        return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink().  We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * mnt_get_write_access - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt is read-write) before
 * returning success. This operation does not protect against filesystem being
 * frozen. When the write operation is finished, mnt_put_write_access() must be
 * called. This is effectively a refcount.
 */
int mnt_get_write_access(struct vfsmount *m)
{
        struct mount *mnt = real_mount(m);
        int ret = 0;

        /* The writer count is bumped first and taken back below on failure. */
        preempt_disable();
        mnt_inc_writers(mnt);
        /*
         * The store to mnt_inc_writers must be visible before we pass
         * MNT_WRITE_HOLD loop below, so that the slowpath can see our
         * incremented count after it has set MNT_WRITE_HOLD.
         */
        smp_mb();
        might_lock(&mount_lock.lock);
        /* Wait until nobody holds writers on this mount any more. */
        while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
                if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
                        cpu_relax();
                } else {
                        /*
                         * This prevents priority inversion, if the task
                         * setting MNT_WRITE_HOLD got preempted on a remote
                         * CPU, and it prevents livelock if the task setting
                         * MNT_WRITE_HOLD has a lower priority and is bound to
                         * the same CPU as the task that is spinning here.
                         */
                        preempt_enable();
                        lock_mount_hash();
                        unlock_mount_hash();
                        preempt_disable();
                }
        }
        /*
         * The barrier pairs with the barrier sb_start_ro_state_change() making
         * sure that if we see MNT_WRITE_HOLD cleared, we will also see
         * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
         * mnt_is_readonly() and bail in case we are racing with remount
         * read-only.
         */
        smp_rmb();
        if (mnt_is_readonly(m)) {
                /* Undo the optimistic increment from above. */
                mnt_dec_writers(mnt);
                ret = -EROFS;
        }
        preempt_enable();

        return ret;
}
EXPORT_SYMBOL_GPL(mnt_get_write_access);

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success.  When the write operation is
 * finished, mnt_drop_write() must be called.  This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
        int err;

        /* Freeze protection is taken first and dropped again on failure. */
        sb_start_write(m->mnt_sb);
        err = mnt_get_write_access(m);
        if (!err)
                return 0;

        sb_end_write(m->mnt_sb);
        return err;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
 * mnt_get_write_access_file - get write access to a file's mount
 * @file: the file on whose mount to take a write
 *
 * This is like mnt_get_write_access, but if @file is already open for write it
 * skips incrementing mnt_writers (since the open file already has a reference)
 * and instead only does the check for emergency r/o remounts.  This must be
 * paired with mnt_put_write_access_file.
 */
int mnt_get_write_access_file(struct file *file)
{
        struct vfsmount *mnt = file->f_path.mnt;

        /* Not yet a writer: go through the full write-access protocol. */
        if (!(file->f_mode & FMODE_WRITER))
                return mnt_get_write_access(mnt);

        /*
         * Superblock may have become readonly while there are still
         * writable fd's, e.g. due to a fs error with errors=remount-ro
         */
        return __mnt_is_readonly(mnt) ? -EROFS : 0;
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file on whose mount to take a write
 *
 * This is like mnt_want_write, but if the file is already open for writing it
 * skips incrementing mnt_writers (since the open file already has a reference)
 * and instead only does the freeze protection and the check for emergency r/o
 * remounts.  This must be paired with mnt_drop_write_file.
 */
int mnt_want_write_file(struct file *file)
{
        struct super_block *sb = file_inode(file)->i_sb;
        int err;

        /* Freeze protection is taken first and dropped again on failure. */
        sb_start_write(sb);
        err = mnt_get_write_access_file(file);
        if (err)
                sb_end_write(sb);

        return err;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * mnt_put_write_access - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * mnt_get_write_access() call above.
 */
void mnt_put_write_access(struct vfsmount *mnt)
{
        struct mount *m = real_mount(mnt);

        /* The per-CPU decrement runs with preemption disabled. */
        preempt_disable();
        mnt_dec_writers(m);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_put_write_access);

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again.  Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
        struct super_block *sb;

        /* Reverse order of mnt_want_write(): writer count, then freeze protection. */
        mnt_put_write_access(mnt);
        sb = mnt->mnt_sb;
        sb_end_write(sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

/*
 * Counterpart of mnt_get_write_access_file(): drop the writer count on the
 * file's mount unless @file has FMODE_WRITER set, in which case the get
 * side did not take one.
 */
void mnt_put_write_access_file(struct file *file)
{
        if (file->f_mode & FMODE_WRITER)
                return;

        mnt_put_write_access(file->f_path.mnt);
}

/*
 * Counterpart of mnt_want_write_file(): give up write access obtained for
 * @file and end the superblock write section of the file's inode.
 */
void mnt_drop_write_file(struct file *file)
{
        struct super_block *sb;

        mnt_put_write_access_file(file);
        sb = file_inode(file)->i_sb;
        sb_end_write(sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);

/**
 * mnt_hold_writers - prevent write access to the given mount
 * @mnt: mnt to prevent write access to
 *
 * Prevents write access to @mnt if there are no active writers for @mnt.
 * This function needs to be called and return successfully before changing
 * properties of @mnt that need to remain stable for callers with write access
 * to @mnt.
 *
 * After this function has been called successfully, callers must pair it with
 * a call to mnt_unhold_writers() in order to stop preventing write access to
 * @mnt.
 *
 * Context: This function expects lock_mount_hash() to be held serializing
 *          setting MNT_WRITE_HOLD.
 * Return: On success 0 is returned.
 *           On error, -EBUSY is returned.
 */
static inline int mnt_hold_writers(struct mount *mnt)
{
        mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
        /*
         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
         * should be visible before we do.
         */
        smp_mb();

        /*
         * With writers on hold, if this value is zero, then there are
         * definitely no active writers (although held writers may subsequently
         * increment the count, they'll have to wait, and decrement it after
         * seeing MNT_READONLY).
         *
         * It is OK to have counter incremented on one CPU and decremented on
         * another: the sum will add up correctly. The danger would be when we
         * sum up each counter, if we read a counter before it is incremented,
         * but then read another CPU's count which it has been subsequently
         * decremented from -- we would see more decrements than we should.
         * MNT_WRITE_HOLD protects against this scenario, because
         * mnt_want_write first increments count, then smp_mb, then spins on
         * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
         * we're counting up here.
         */
        /*
         * Note that MNT_WRITE_HOLD stays set on the -EBUSY path as well; the
         * callers in this file clear it again regardless of the outcome.
         */
        if (mnt_get_writers(mnt) > 0)
                return -EBUSY;

        return 0;
}

/**
 * mnt_unhold_writers - stop preventing write access to the given mount
 * @mnt: mnt to stop preventing write access to
 *
 * Stop preventing write access to @mnt allowing callers to gain write access
 * to @mnt again.
 *
 * This function can only be called after a successful call to
 * mnt_hold_writers().
 *
 * Context: This function expects lock_mount_hash() to be held.
 */
static inline void mnt_unhold_writers(struct mount *mnt)
{
        /*
         * NOTE(review): mnt_make_readonly() in this file calls this even when
         * mnt_hold_writers() returned -EBUSY, because the flag is set on that
         * path too.
         */
        /*
         * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
         * that become unheld will see MNT_READONLY.
         */
        smp_wmb();
        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
}

/*
 * Try to flip @mnt to read-only.
 *
 * Writers are put on hold; if none are active MNT_READONLY is set.  The
 * hold is lifted in either case, since mnt_hold_writers() sets
 * MNT_WRITE_HOLD before it checks the writer count.
 *
 * Returns 0 on success, or the error from mnt_hold_writers() (-EBUSY) when
 * there are active writers, in which case MNT_READONLY is not set.
 */
static int mnt_make_readonly(struct mount *mnt)
{
        int ret;

        ret = mnt_hold_writers(mnt);
        if (!ret)
                mnt->mnt.mnt_flags |= MNT_READONLY;
        mnt_unhold_writers(mnt);
        return ret;
}

/*
 * Check that @sb can be switched to read-only and, if so, start the
 * read-only state change.
 *
 * Under lock_mount_hash() every mount on sb->s_mounts that is not already
 * MNT_READONLY has its writers put on hold.  The function fails with -EBUSY
 * if any of them has active writers or if sb->s_remove_count is non-zero.
 * On success sb_start_ro_state_change() is called.  MNT_WRITE_HOLD is
 * cleared from all mounts before returning, on success and on failure.
 *
 * Returns 0 on success, -EBUSY otherwise.
 */
int sb_prepare_remount_readonly(struct super_block *sb)
{
        struct mount *mnt;
        int err = 0;

        /* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
        if (atomic_long_read(&sb->s_remove_count))
                return -EBUSY;

        lock_mount_hash();
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
                        err = mnt_hold_writers(mnt);
                        if (err)
                                break;
                }
        }
        /* The recheck announced above, now with writers held. */
        if (!err && atomic_long_read(&sb->s_remove_count))
                err = -EBUSY;

        if (!err)
                sb_start_ro_state_change(sb);
        /* Lift every hold placed above, including the one that failed. */
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
                        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
        }
        unlock_mount_hash();

        return err;
}

/*
 * Release the memory of @mnt: drop its idmap reference, free the device
 * name, the per-CPU counters (SMP only) and finally the struct mount
 * itself.  The mount ID is not released here.
 */
static void free_vfsmnt(struct mount *mnt)
{
        mnt_idmap_put(mnt_idmap(&mnt->mnt));
        kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
        free_percpu(mnt->mnt_pcp);
#endif
        kmem_cache_free(mnt_cache, mnt);
}

/*
 * rcu_head-style callback: recover the struct mount that embeds @head as
 * its mnt_rcu member and free it.
 */
static void delayed_free_vfsmnt(struct rcu_head *head)
{
        struct mount *mnt = container_of(head, struct mount, mnt_rcu);

        free_vfsmnt(mnt);
}

/*
 * Try to turn a mount pointer found under RCU into a counted reference.
 *
 * @bastard: mount found by a lockless lookup, may be NULL
 * @seq:     mount_lock sequence sampled before the lookup
 *
 * Return:
 *   0  - @bastard is NULL, or a reference was taken and @seq is still valid;
 *   1  - @seq is stale and no reference is held on return (either none was
 *        taken, or it was dropped again because the mount is
 *        MNT_SYNC_UMOUNT or MNT_DOOMED);
 *  -1  - @seq is stale but a reference is still held; the caller must
 *        mntput() it.
 *
 * call under rcu_read_lock
 */
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
        struct mount *mnt;
        if (read_seqretry(&mount_lock, seq))
                return 1;
        if (bastard == NULL)
                return 0;
        mnt = real_mount(bastard);
        mnt_add_count(mnt, 1);
        smp_mb();                        // see mntput_no_expire()
        if (likely(!read_seqretry(&mount_lock, seq)))
                return 0;
        if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
                mnt_add_count(mnt, -1);
                return 1;
        }
        /* MNT_DOOMED is tested with mount_lock held for write. */
        lock_mount_hash();
        if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
                mnt_add_count(mnt, -1);
                unlock_mount_hash();
                return 1;
        }
        unlock_mount_hash();
        /* caller will mntput() */
        return -1;
}

/*
 * Wrapper around __legitimize_mnt() that reports plain success/failure.
 * When the helper leaves a reference behind (negative result), it is
 * dropped here with mntput(), stepping out of the RCU read section for the
 * duration of that call.
 *
 * call under rcu_read_lock
 */
static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
        int rc = __legitimize_mnt(bastard, seq);

        if (likely(!rc))
                return true;

        if (unlikely(rc < 0)) {
                rcu_read_unlock();
                mntput(bastard);
                rcu_read_lock();
        }

        return false;
}

/**
 * __lookup_mnt - find first child mount
 * @mnt:        parent mount
 * @dentry:        mountpoint
 *
 * If @mnt has a child mount @c mounted at @dentry, find and return it.
 *
 * Note that the child mount @c need not be unique. There are cases
 * where shadow mounts are created. For example, during mount
 * propagation when a source mount @mnt whose root got overmounted by a
 * mount @o after path lookup but before @namespace_sem could be
 * acquired gets copied and propagated. So @mnt gets copied including
 * @o. When @mnt is propagated to a destination mount @d that already
 * has another mount @n mounted at the same mountpoint then the source
 * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
 * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
 * on @dentry.
 *
 * Return: The first child of @mnt mounted @dentry or NULL.
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
        struct hlist_head *head = m_hash(mnt, dentry);
        struct mount *p;

        hlist_for_each_entry_rcu(p, head, mnt_hash)
                if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
                        return p;
        return NULL;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(const struct path *path)
{
        struct vfsmount *found;
        struct mount *child;
        unsigned seq;

        rcu_read_lock();
        /* Repeat the lockless lookup until its result can be legitimized. */
        for (;;) {
                seq = read_seqbegin(&mount_lock);
                child = __lookup_mnt(path->mnt, path->dentry);
                found = child ? &child->mnt : NULL;
                if (legitimize_mnt(found, seq))
                        break;
        }
        rcu_read_unlock();

        return found;
}

/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 *                         current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in the context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        struct mount *mnt, *next;
        bool found = false;

        /* Scan every mount of the caller's namespace under namespace_sem. */
        down_read(&namespace_sem);
        rbtree_postorder_for_each_entry_safe(mnt, next, &ns->mounts, mnt_node) {
                if (mnt->mnt_mountpoint == dentry) {
                        found = true;
                        break;
                }
        }
        up_read(&namespace_sem);

        return found;
}

/*
 * Find the struct mountpoint registered for @dentry in
 * mountpoint_hashtable.  On a hit its m_count is incremented before it is
 * returned; NULL is returned when there is no entry.
 */
static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
        struct hlist_head *bucket = mp_hash(dentry);
        struct mountpoint *mp;

        hlist_for_each_entry(mp, bucket, m_hash) {
                if (mp->m_dentry != dentry)
                        continue;
                mp->m_count++;
                return mp;
        }

        return NULL;
}

/*
 * Get a counted struct mountpoint for @dentry, creating it if necessary.
 *
 * If @dentry is already marked as a mountpoint the existing entry is looked
 * up (which bumps its m_count).  Otherwise a new entry is allocated, the
 * dentry is marked via d_set_mounted(), and the entry is hashed with
 * m_count == 1 and a reference on the dentry.
 *
 * Return: the mountpoint, or ERR_PTR(-ENOENT) if @dentry is a mountpoint
 * but unlinked, ERR_PTR(-ENOMEM) if allocation fails, or ERR_PTR() of any
 * other error reported by d_set_mounted().
 */
static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
        struct mountpoint *mp, *new = NULL;
        int ret;

        if (d_mountpoint(dentry)) {
                /* might be worth a WARN_ON() */
                if (d_unlinked(dentry))
                        return ERR_PTR(-ENOENT);
mountpoint:
                read_seqlock_excl(&mount_lock);
                mp = lookup_mountpoint(dentry);
                read_sequnlock_excl(&mount_lock);
                if (mp)
                        goto done;
        }

        /* @new may already be allocated if we got here via the retry label. */
        if (!new)
                new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);


        /* Exactly one process may set d_mounted */
        ret = d_set_mounted(dentry);

        /* Someone else set d_mounted? */
        if (ret == -EBUSY)
                goto mountpoint;

        /* The dentry is not available as a mountpoint? */
        mp = ERR_PTR(ret);
        if (ret)
                goto done;

        /* Add the new mountpoint to the hash table */
        read_seqlock_excl(&mount_lock);
        new->m_dentry = dget(dentry);
        new->m_count = 1;
        hlist_add_head(&new->m_hash, mp_hash(dentry));
        INIT_HLIST_HEAD(&new->m_list);
        read_sequnlock_excl(&mount_lock);

        mp = new;
        new = NULL;
done:
        /* Frees an allocation that ended up unused; no-op when new is NULL. */
        kfree(new);
        return mp;
}

/*
 * Drop one count on @mp.  When the last count goes away the dentry loses
 * DCACHE_MOUNTED, its reference is queued on @list via dput_to_list(), and
 * @mp is unhashed and freed.
 *
 * vfsmount lock must be held.  Additionally, the caller is responsible
 * for serializing calls for given disposal list.
 */
static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
{
        struct dentry *dentry;

        if (--mp->m_count)
                return;

        dentry = mp->m_dentry;
        /* No mount may still be linked to a mountpoint that is going away. */
        BUG_ON(!hlist_empty(&mp->m_list));

        spin_lock(&dentry->d_lock);
        dentry->d_flags &= ~DCACHE_MOUNTED;
        spin_unlock(&dentry->d_lock);

        dput_to_list(dentry, list);
        hlist_del(&mp->m_hash);
        kfree(mp);
}

/*
 * Drop one count on @mp, using the global ex_mountpoints list as the
 * disposal list for the dentry reference.
 *
 * called with namespace_lock and vfsmount lock
 */
static void put_mountpoint(struct mountpoint *mp)
{
        __put_mountpoint(mp, &ex_mountpoints);
}

/* Non-zero iff @mnt belongs to the mount namespace of the current task. */
static inline int check_mnt(struct mount *mnt)
{
        struct mnt_namespace *mine = current->nsproxy->mnt_ns;

        return mnt->mnt_ns == mine;
}

/*
 * Advance the global event counter, record the new value in @ns and wake
 * anyone waiting on ns->poll.  A NULL @ns is ignored (and the counter is
 * then left alone).
 *
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
        if (!ns)
                return;

        ns->event = ++event;
        wake_up_interruptible(&ns->poll);
}

/*
 * Bring @ns up to date with the global event counter without advancing it:
 * if ns->event lags behind, copy the current value and wake anyone waiting
 * on ns->poll.  A NULL @ns is ignored.
 *
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
        if (!ns || ns->event == event)
                return;

        ns->event = event;
        wake_up_interruptible(&ns->poll);
}

/*
 * Detach @mnt from the mount tree: it becomes its own parent with its own
 * root as mountpoint, and is unlinked from the parent's child list, the
 * mount hash and the mountpoint's list of mounts.
 *
 * Returns the mountpoint @mnt was attached to (mnt->mnt_mp is cleared).
 * The count on it is NOT dropped here; umount_mnt() in this file hands
 * the returned pointer to put_mountpoint().
 *
 * vfsmount lock must be held for write
 */
static struct mountpoint *unhash_mnt(struct mount *mnt)
{
        struct mountpoint *mp;
        mnt->mnt_parent = mnt;
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
        list_del_init(&mnt->mnt_child);
        hlist_del_init_rcu(&mnt->mnt_hash);
        hlist_del_init(&mnt->mnt_mp_list);
        mp = mnt->mnt_mp;
        mnt->mnt_mp = NULL;
        return mp;
}

/*
 * Detach @mnt from the mount tree and drop the count it held on its
 * mountpoint.
 *
 * vfsmount lock must be held for write
 */
static void umount_mnt(struct mount *mnt)
{
        struct mountpoint *mp = unhash_mnt(mnt);

        put_mountpoint(mp);
}

/*
 * Record that @child_mnt is mounted on @mnt at @mp.
 *
 * Takes a count on @mp and a reference on @mnt, points @child_mnt at its
 * new parent/mountpoint and links it into @mp's list of mounts.  It does
 * not add @child_mnt to the mount hash or to the parent's child list;
 * __attach_mnt() in this file does that.
 *
 * vfsmount lock must be held for write
 */
void mnt_set_mountpoint(struct mount *mnt,
                        struct mountpoint *mp,
                        struct mount *child_mnt)
{
        mp->m_count++;
        mnt_add_count(mnt, 1);        /* essentially, that's mntget */
        child_mnt->mnt_mountpoint = mp->m_dentry;
        child_mnt->mnt_parent = mnt;
        child_mnt->mnt_mp = mp;
        hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

/**
 * mnt_set_mountpoint_beneath - mount a mount beneath another one
 *
 * @new_parent: the source mount
 * @top_mnt:    the mount beneath which @new_parent is mounted
 * @new_mp:     the new mountpoint of @top_mnt on @new_parent
 *
 * First @new_parent takes over @top_mnt's current parent and mountpoint
 * (@top_mnt->mnt_parent and @top_mnt->mnt_mp).  Then @top_mnt is moved
 * off those and re-mounted on top of @new_parent at @new_mp.
 *
 * Context: This function expects namespace_lock() and lock_mount_hash()
 *          to have been acquired in that order.
 */
static void mnt_set_mountpoint_beneath(struct mount *new_parent,
                                       struct mount *top_mnt,
                                       struct mountpoint *new_mp)
{
        /* @new_parent slots in where @top_mnt currently sits ... */
        mnt_set_mountpoint(top_mnt->mnt_parent, top_mnt->mnt_mp, new_parent);

        /* ... and @top_mnt moves on top of @new_parent. */
        mnt_change_mountpoint(new_parent, new_mp, top_mnt);
}


/*
 * Make @mnt visible under @parent: insert it into the mount hash bucket
 * selected by (@parent, @mnt->mnt_mountpoint) and append it to @parent's
 * list of child mounts.  @mnt->mnt_mountpoint must already be set.
 */
static void __attach_mnt(struct mount *mnt, struct mount *parent)
{
        struct hlist_head *bucket = m_hash(&parent->mnt, mnt->mnt_mountpoint);

        hlist_add_head_rcu(&mnt->mnt_hash, bucket);
        list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

/**
 * attach_mnt - mount a mount, attach to @mount_hashtable and parent's
 *              list of child mounts
 * @parent:  the parent
 * @mnt:     the new mount
 * @mp:      the new mountpoint
 * @beneath: whether to mount @mnt beneath or on top of @parent
 *
 * With @beneath false, @mnt is mounted at @mp on @parent and then linked
 * into @parent's child mount list and @mount_hashtable.
 *
 * With @beneath true, @parent is removed from its current parent and
 * mountpoint and mounted at @mp on @mnt, while @mnt takes over @parent's
 * old parent and old mountpoint. @mnt is then linked into
 * @mount_hashtable and the child list of its new parent (@parent's old
 * parent).
 *
 * Note, when __attach_mnt() is called @mnt->mnt_parent already points
 * to the correct parent.
 *
 * Context: This function expects namespace_lock() and lock_mount_hash()
 *          to have been acquired in that order.
 */
static void attach_mnt(struct mount *mnt, struct mount *parent,
                       struct mountpoint *mp, bool beneath)
{
        if (!beneath)
                mnt_set_mountpoint(parent, mp, mnt);
        else
                mnt_set_mountpoint_beneath(mnt, parent, mp);

        /*
         * Attach via @mnt->mnt_parent rather than @parent: after a
         * beneath-mount @mnt hangs off @parent's old parent, so the two
         * are not the same mount.
         */
        __attach_mnt(mnt, mnt->mnt_parent);
}

/*
 * Move @mnt from its current parent and mountpoint to @mp on @parent.
 *
 * @mnt is unlinked from its old parent's child list, its old mountpoint's
 * mount list and the mount hash, then re-attached with attach_mnt().
 * Only after that are the references on the old mountpoint and the old
 * parent dropped.
 *
 * Because this calls attach_mnt(), the caller must satisfy the locking
 * context documented there.
 */
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
        /* Remember what we are leaving; attach_mnt() overwrites both. */
        struct mountpoint *old_mp = mnt->mnt_mp;
        struct mount *old_parent = mnt->mnt_parent;

        list_del_init(&mnt->mnt_child);
        hlist_del_init(&mnt->mnt_mp_list);
        hlist_del_init_rcu(&mnt->mnt_hash);

        attach_mnt(mnt, parent, mp, false);

        /* Release the references taken when @mnt was mounted at the old place. */
        put_mountpoint(old_mp);
        mnt_add_count(old_parent, -1);
}

/* Map an rb-tree node to the mount embedding it; NULL maps to NULL. */
static inline struct mount *node_to_mount(struct rb_node *node)
{
        if (!node)
                return NULL;

        return rb_entry(node, struct mount, mnt_node);
}

/*
 * Insert @mnt into @ns's rb-tree of mounts, ordered by mnt_id_unique,
 * set @mnt->mnt_ns and flag the mount MNT_ONRB.  Warns if @mnt is
 * already flagged as being on a tree.
 */
static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
{
        struct rb_node **slot = &ns->mounts.rb_node;
        struct rb_node *above = NULL;

        WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
        mnt->mnt_ns = ns;

        /* Descend to the empty slot where @mnt belongs. */
        while (*slot) {
                above = *slot;
                slot = (mnt->mnt_id_unique < node_to_mount(above)->mnt_id_unique)
                        ? &above->rb_left
                        : &above->rb_right;
        }

        rb_link_node(&mnt->mnt_node, above, slot);
        rb_insert_color(&mnt->mnt_node, &ns->mounts);
        mnt->mnt.mnt_flags |= MNT_ONRB;
}

/*
 * Make a prepared mount tree rooted at @mnt part of its parent's
 * namespace: every mount chained to @mnt through mnt_list is added to
 * the namespace's rb-tree, the pending mount count is folded into
 * nr_mounts, @mnt is attached to its parent, and namespace pollers are
 * notified.
 *
 * @mnt->mnt_parent must already be set and must differ from @mnt.
 *
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt)
{
        struct mount *parent = mnt->mnt_parent;
        struct mount *m;
        LIST_HEAD(head);
        struct mnt_namespace *n = parent->mnt_ns;

        BUG_ON(parent == mnt);

        /*
         * Link the on-stack head into the mnt_list ring just before @mnt,
         * so that walking from head visits @mnt first and then every
         * other mount on the ring.
         */
        list_add_tail(&head, &mnt->mnt_list);
        while (!list_empty(&head)) {
                m = list_first_entry(&head, typeof(*m), mnt_list);
                list_del(&m->mnt_list);

                mnt_add_to_ns(n, m);
        }
        /* The mounts counted as pending are now real. */
        n->nr_mounts += n->pending_mounts;
        n->pending_mounts = 0;

        __attach_mnt(mnt, parent);
        touch_mnt_namespace(n);
}

/*
 * Pre-order successor of @p within the mount tree rooted at @root:
 * the first child of @p if it has one, otherwise the next sibling of
 * the closest ancestor-or-self (below @root) that has one.  Returns NULL
 * once the walk is complete.
 */
static struct mount *next_mnt(struct mount *p, struct mount *root)
{
        struct list_head *pos = p->mnt_mounts.next;

        /* Descend to the first child when there is one. */
        if (pos != &p->mnt_mounts)
                return list_entry(pos, struct mount, mnt_child);

        /* Otherwise climb until something has a following sibling. */
        for (; p != root; p = p->mnt_parent) {
                pos = p->mnt_child.next;
                if (pos != &p->mnt_parent->mnt_mounts)
                        return list_entry(pos, struct mount, mnt_child);
        }

        return NULL;
}

/*
 * Follow the last child repeatedly, starting at @p, and return the mount
 * reached when there are no more children (@p itself if it has none).
 */
static struct mount *skip_mnt_tree(struct mount *p)
{
        for (;;) {
                struct list_head *last = p->mnt_mounts.prev;

                if (last == &p->mnt_mounts)
                        return p;
                p = list_entry(last, struct mount, mnt_child);
        }
}

/**
 * vfs_create_mount - Create a mount for a configured superblock
 * @fc: The configuration context with the superblock attached
 *
 * Create a mount to an already configured superblock.  If necessary, the
 * caller should invoke vfs_get_tree() before calling this.
 *
 * Note that this does not attach the mount to anything.
 */
struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
        struct mount *mnt;

        if (!fc->root)
                return ERR_PTR(-EINVAL);

        mnt = alloc_vfsmnt(fc->source ?: "none");
        if (!mnt)
                return ERR_PTR(-ENOMEM);

        if (fc->sb_flags & SB_KERNMOUNT)
                mnt->mnt.mnt_flags = MNT_INTERNAL;

        atomic_inc(&fc->root->d_sb->s_active);
        mnt->mnt.mnt_sb                = fc->root->d_sb;
        mnt->mnt.mnt_root        = dget(fc->root);
        mnt->mnt_mountpoint        = mnt->mnt.mnt_root;
        mnt->mnt_parent                = mnt;

        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
        unlock_mount_hash();
        return &mnt->mnt;
}
EXPORT_SYMBOL(vfs_create_mount);

/*
 * Obtain the tree for @fc with vfs_get_tree() and, on success, release
 * the superblock's s_umount (write side) and create a mount for it.
 * Returns the new vfsmount or an ERR_PTR().
 */
struct vfsmount *fc_mount(struct fs_context *fc)
{
        int err = vfs_get_tree(fc);

        if (err)
                return ERR_PTR(err);

        up_write(&fc->root->d_sb->s_umount);
        return vfs_create_mount(fc);
}
EXPORT_SYMBOL(fc_mount);

/*
 * Create a mount of filesystem @type: build a mount context with @flags,
 * feed it @name as the "source" parameter (when non-NULL) and @data as
 * monolithic mount data, then mount it with fc_mount().  The context is
 * always released before returning.
 *
 * Returns the new vfsmount or an ERR_PTR(); a NULL @type yields -EINVAL.
 */
struct vfsmount *vfs_kern_mount(struct file_system_type *type,
                                int flags, const char *name,
                                void *data)
{
        struct fs_context *fc;
        struct vfsmount *mnt;
        int ret;

        if (!type)
                return ERR_PTR(-EINVAL);

        fc = fs_context_for_mount(type, flags);
        if (IS_ERR(fc))
                return ERR_CAST(fc);

        ret = name ? vfs_parse_fs_string(fc, "source", name, strlen(name)) : 0;
        if (ret == 0)
                ret = parse_monolithic_mount_data(fc, data);

        mnt = ret ? ERR_PTR(ret) : fc_mount(fc);

        put_fs_context(fc);
        return mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);

/*
 * Create a submount (SB_SUBMOUNT) of filesystem @type for @mountpoint.
 * Returns the new vfsmount or an ERR_PTR().
 */
struct vfsmount *
vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
             const char *name, void *data)
{
        /* Until it is worked out how to pass the user namespace
         * through from the parent mount to the submount don't support
         * unprivileged mounts with submounts.
         */
        if (mountpoint->d_sb->s_user_ns == &init_user_ns)
                return vfs_kern_mount(type, SB_SUBMOUNT, name, data);

        return ERR_PTR(-EPERM);
}
EXPORT_SYMBOL_GPL(vfs_submount);

/*
 * clone_mnt - allocate a new, unattached mount duplicating @old
 * @old:  mount whose device name, superblock, flags and idmap are copied
 * @root: dentry that becomes the root of the new mount
 * @flag: CL_* bits selecting how peer-group / propagation state is set up
 *
 * The new mount is its own parent, holds an active reference on the
 * superblock and a reference on @root, and is added to the superblock's
 * list of mounts.
 *
 * Returns the new mount, ERR_PTR(-ENOMEM) if allocation fails, or the
 * error from mnt_alloc_group_id() wrapped in ERR_PTR().
 */
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
                                        int flag)
{
        struct super_block *sb = old->mnt.mnt_sb;
        struct mount *mnt;
        int err;

        mnt = alloc_vfsmnt(old->mnt_devname);
        if (!mnt)
                return ERR_PTR(-ENOMEM);

        if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
                mnt->mnt_group_id = 0; /* not a peer of original */
        else
                mnt->mnt_group_id = old->mnt_group_id;

        /* A shared clone without an inherited group needs a fresh group id. */
        if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
                err = mnt_alloc_group_id(mnt);
                if (err)
                        goto out_free;
        }

        /* Inherit the flags, minus the ones that describe @old's own state. */
        mnt->mnt.mnt_flags = old->mnt.mnt_flags;
        mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);

        atomic_inc(&sb->s_active);
        mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));

        mnt->mnt.mnt_sb = sb;
        mnt->mnt.mnt_root = dget(root);
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
        mnt->mnt_parent = mnt;
        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
        unlock_mount_hash();

        if ((flag & CL_SLAVE) ||
            ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
                /* The clone becomes a slave of @old. */
                list_add(&mnt->mnt_slave, &old->mnt_slave_list);
                mnt->mnt_master = old;
                CLEAR_MNT_SHARED(mnt);
        } else if (!(flag & CL_PRIVATE)) {
                /* Mirror @old's propagation: same peer group, same master. */
                if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
                        list_add(&mnt->mnt_share, &old->mnt_share);
                if (IS_MNT_SLAVE(old))
                        list_add(&mnt->mnt_slave, &old->mnt_slave);
                mnt->mnt_master = old->mnt_master;
        } else {
                CLEAR_MNT_SHARED(mnt);
        }
        if (flag & CL_MAKE_SHARED)
                set_mnt_shared(mnt);

        /* stick the duplicate mount on the same expiry list
         * as the original if that was on one */
        if (flag & CL_EXPIRE) {
                if (!list_empty(&old->mnt_expire))
                        list_add(&mnt->mnt_expire, &old->mnt_expire);
        }

        return mnt;

 out_free:
        /* Only reached before any superblock/dentry reference was taken. */
        mnt_free_id(mnt);
        free_vfsmnt(mnt);
        return ERR_PTR(err);
}

/*
 * Final teardown of @mnt: kill any pins, drop the references held on
 * children parked on mnt_stuck_children, notify fsnotify, release the
 * root dentry and the superblock's active reference, free the mount id
 * and queue the structure itself for freeing after an RCU grace period.
 */
static void cleanup_mnt(struct mount *mnt)
{
        struct hlist_node *p;
        struct mount *m;
        /*
         * The warning here probably indicates that somebody messed
         * up a mnt_want/drop_write() pair.  If this happens, the
         * filesystem was probably unable to make r/w->r/o transitions.
         * The locking used to deal with mnt_count decrement provides barriers,
         * so mnt_get_writers() below is safe.
         */
        WARN_ON(mnt_get_writers(mnt));
        if (unlikely(mnt->mnt_pins.first))
                mnt_pin_kill(mnt);
        /* Children moved here by mntput_no_expire() are released now. */
        hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
                hlist_del(&m->mnt_umount);
                mntput(&m->mnt);
        }
        fsnotify_vfsmount_delete(&mnt->mnt);
        dput(mnt->mnt.mnt_root);
        deactivate_super(mnt->mnt.mnt_sb);
        mnt_free_id(mnt);
        call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}

/*
 * Callback form of cleanup_mnt(): @head is the mnt_rcu member embedded
 * in the mount to tear down.
 */
static void __cleanup_mnt(struct rcu_head *head)
{
        struct mount *mnt = container_of(head, struct mount, mnt_rcu);

        cleanup_mnt(mnt);
}

/* Mounts whose cleanup_mnt() has been deferred to the work item below. */
static LLIST_HEAD(delayed_mntput_list);

/* Work handler: take everything queued so far and clean each mount up. */
static void delayed_mntput(struct work_struct *unused)
{
        struct llist_node *batch = llist_del_all(&delayed_mntput_list);
        struct mount *mnt, *next;

        llist_for_each_entry_safe(mnt, next, batch, mnt_llist)
                cleanup_mnt(mnt);
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);

/*
 * Drop one reference on @mnt without touching mnt_expiry_mark.
 *
 * Fast path: while @mnt->mnt_ns is observed non-NULL under RCU, just
 * decrement the count.  Otherwise take the mount hash lock, decrement,
 * and if the count reached zero mark the mount MNT_DOOMED, unlink it
 * from its superblock, park any remaining children on
 * mnt_stuck_children and arrange for cleanup_mnt() to run: via task
 * work for ordinary tasks, via delayed work when that is not possible,
 * or directly for MNT_INTERNAL mounts.
 */
static void mntput_no_expire(struct mount *mnt)
{
        LIST_HEAD(list);
        int count;

        rcu_read_lock();
        if (likely(READ_ONCE(mnt->mnt_ns))) {
                /*
                 * Since we don't do lock_mount_hash() here,
                 * ->mnt_ns can change under us.  However, if it's
                 * non-NULL, then there's a reference that won't
                 * be dropped until after an RCU delay done after
                 * turning ->mnt_ns NULL.  So if we observe it
                 * non-NULL under rcu_read_lock(), the reference
                 * we are dropping is not the final one.
                 */
                mnt_add_count(mnt, -1);
                rcu_read_unlock();
                return;
        }
        lock_mount_hash();
        /*
         * make sure that if __legitimize_mnt() has not seen us grab
         * mount_lock, we'll see their refcount increment here.
         */
        smp_mb();
        mnt_add_count(mnt, -1);
        count = mnt_get_count(mnt);
        if (count != 0) {
                /* Still referenced; a negative count would be a bug. */
                WARN_ON(count < 0);
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        /* Someone else already started tearing this mount down. */
        if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        mnt->mnt.mnt_flags |= MNT_DOOMED;
        rcu_read_unlock();

        list_del(&mnt->mnt_instance);

        /*
         * Detach any children still hanging off this mount; their
         * mountpoint dentries are collected on the local list and the
         * children themselves are released later by cleanup_mnt().
         */
        if (unlikely(!list_empty(&mnt->mnt_mounts))) {
                struct mount *p, *tmp;
                list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
                        __put_mountpoint(unhash_mnt(p), &list);
                        hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
                }
        }
        unlock_mount_hash();
        /* Dentry disposal happens outside the mount hash lock. */
        shrink_dentry_list(&list);

        if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
                struct task_struct *task = current;
                if (likely(!(task->flags & PF_KTHREAD))) {
                        /* Prefer running the cleanup as task work of the caller. */
                        init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
                        if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
                                return;
                }
                /*
                 * Kernel thread, or task work could not be queued: defer
                 * to the delayed work item, scheduling it only when
                 * llist_add() returns true.
                 */
                if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
                        schedule_delayed_work(&delayed_mntput_work, 1);
                return;
        }
        /* MNT_INTERNAL mounts are cleaned up synchronously. */
        cleanup_mnt(mnt);
}

/*
 * Drop a reference on @mnt (NULL is a no-op).  Any expiry mark set on
 * the mount is cleared before the reference is dropped.
 */
void mntput(struct vfsmount *mnt)
{
        struct mount *m;

        if (!mnt)
                return;

        m = real_mount(mnt);
        /* avoid cacheline pingpong: only write when the mark is set */
        if (unlikely(m->mnt_expiry_mark))
                WRITE_ONCE(m->mnt_expiry_mark, 0);
        mntput_no_expire(m);
}
EXPORT_SYMBOL(mntput);

/*
 * Take a reference on @mnt and return it.  A NULL @mnt is passed
 * through unchanged.
 */
struct vfsmount *mntget(struct vfsmount *mnt)
{
        if (!mnt)
                return NULL;

        mnt_add_count(real_mount(mnt), 1);
        return mnt;
}
EXPORT_SYMBOL(mntget);

/*
 * Make a mount point inaccessible to new lookups by clearing its
 * namespace pointer (NULL @mnt is a no-op).
 * Because there may still be current users, the caller MUST WAIT
 * for an RCU grace period before destroying the mount point.
 */
void mnt_make_shortterm(struct vfsmount *mnt)
{
        if (!mnt)
                return;

        real_mount(mnt)->mnt_ns = NULL;
}

/**
 * path_is_mountpoint() - Check if path is a mount in the current namespace.
 * @path: path to check
 *
 *  d_mountpoint() can only be used reliably to establish if a dentry is
 *  not mounted in any namespace and that common case is handled inline.
 *  d_mountpoint() isn't aware of the possibility there may be multiple
 *  mounts using a given dentry in a different namespace. This function
 *  checks if the passed in path is a mountpoint rather than the dentry
 *  alone.
 *
 *  The lookup is repeated under RCU until it completes without a
 *  concurrent change to mount_lock.
 */
bool path_is_mountpoint(const struct path *path)
{
        unsigned int seq;
        bool mounted;

        if (!d_mountpoint(path->dentry))
                return false;

        rcu_read_lock();
        for (;;) {
                seq = read_seqbegin(&mount_lock);
                mounted = __path_is_mountpoint(path);
                if (!read_seqretry(&mount_lock, seq))
                        break;
        }
        rcu_read_unlock();

        return mounted;
}
EXPORT_SYMBOL(path_is_mountpoint);

/*
 * Create a private (CL_PRIVATE) clone of @path's mount rooted at
 * @path->dentry and mark it MNT_INTERNAL.  Returns the clone's vfsmount
 * or the ERR_PTR() from clone_mnt().
 */
struct vfsmount *mnt_clone_internal(const struct path *path)
{
        struct mount *clone = clone_mnt(real_mount(path->mnt), path->dentry,
                                        CL_PRIVATE);

        if (IS_ERR(clone))
                return ERR_CAST(clone);

        clone->mnt.mnt_flags |= MNT_INTERNAL;
        return &clone->mnt;
}

/*
 * Returns the mount in @ns which either has the specified mnt_id, or,
 * failing that, has the smallest id greater than the specified one.
 * Returns NULL when no such mount exists.
 */
static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
{
        struct rb_node *cur = ns->mounts.rb_node;
        struct mount *best = NULL;

        while (cur) {
                struct mount *cand = node_to_mount(cur);

                /* Too small: everything of interest is to the right. */
                if (cand->mnt_id_unique < mnt_id) {
                        cur = cur->rb_right;
                        continue;
                }

                /* Candidate; an exact match cannot be improved on. */
                best = cand;
                if (cand->mnt_id_unique == mnt_id)
                        return best;
                cur = cur->rb_left;
        }

        return best;
}

#ifdef CONFIG_PROC_FS

/*
 * seq_file iterator; it lives here because it needs access to
 * namespace_sem.
 *
 * ->start: take namespace_sem for reading (released in m_stop()) and
 * position on the first mount whose unique id is >= *pos.
 */
static void *m_start(struct seq_file *m, loff_t *pos)
{
        struct proc_mounts *pm = m->private;
        struct mount *first;

        down_read(&namespace_sem);

        first = mnt_find_id_at(pm->ns, *pos);
        return first;
}

/*
 * ->next: step to the in-order successor of mount @v.  *pos is always
 * advanced by one, and is then set to the successor's unique id when a
 * successor exists.  Returns the successor or NULL at the end.
 */
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct mount *cur = v;
        struct rb_node *succ = rb_next(&cur->mnt_node);
        struct mount *following;

        ++*pos;
        if (!succ)
                return NULL;

        following = node_to_mount(succ);
        *pos = following->mnt_id_unique;
        return following;
}

/* ->stop: release the read hold on namespace_sem taken by m_start(). */
static void m_stop(struct seq_file *m, void *v)
{
        up_read(&namespace_sem);
}

/*
 * ->show: emit mount @v through the show callback stored in the
 * proc_mounts private data and return that callback's result.
 */
static int m_show(struct seq_file *m, void *v)
{
        struct proc_mounts *pm = m->private;
        struct mount *mnt = v;

        return pm->show(m, &mnt->mnt);
}

/* seq_file operations that walk the mounts of a namespace in id order. */
const struct seq_operations mounts_op = {
        .start        = m_start,
        .next        = m_next,
        .stop        = m_stop,
        .show        = m_show,
};

#endif  /* CONFIG_PROC_FS */

/**
 * may_umount_tree - check if a mount tree is busy
 * @m: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 *
 * Return: 0 if the summed reference count over the tree exceeds two
 * references per mount (i.e. the tree is busy), 1 otherwise.
 */
int may_umount_tree(struct vfsmount *m)
{
        struct mount *mnt = real_mount(m);
        struct mount *cur;
        int actual_refs = 0;
        int minimum_refs = 0;

        BUG_ON(!m);

        /* write lock needed for mnt_get_count */
        lock_mount_hash();
        cur = mnt;
        while (cur) {
                actual_refs += mnt_get_count(cur);
                minimum_refs += 2;
                cur = next_mnt(cur, mnt);
        }
        unlock_mount_hash();

        return actual_refs > minimum_refs ? 0 : 1;
}

EXPORT_SYMBOL(may_umount_tree);

/**
 * may_umount - check if a mount point is busy
 * @mnt: root of mount
 *
 * This is called to check if a mount point has any
 * open files, pwds, chroots or sub mounts. If the
 * mount has sub mounts this will return busy
 * regardless of whether the sub mounts are busy.
 *
 * Doesn't take quota and stuff into account. IOW, in some cases it will
 * give false negatives. The main reason why it's here is that we need
 * a non-destructive way to look for easily umountable filesystems.
 *
 * Return: 0 if propagate_mount_busy() reports the mount busy, 1 otherwise.
 */
int may_umount(struct vfsmount *mnt)
{
        int busy;

        down_read(&namespace_sem);
        lock_mount_hash();
        busy = propagate_mount_busy(real_mount(mnt), 2);
        unlock_mount_hash();
        up_read(&namespace_sem);

        return busy ? 0 : 1;
}

EXPORT_SYMBOL(may_umount);

/*
 * Release namespace_sem (write side) and finish the work queued while
 * it was held: dispose of the dentries of released mountpoints and drop
 * the references on mounts queued on the "unmounted" list.
 */
static void namespace_unlock(void)
{
        struct hlist_head head;
        struct hlist_node *p;
        struct mount *m;
        LIST_HEAD(list);

        /* Take private ownership of both pending lists before unlocking. */
        hlist_move_list(&unmounted, &head);
        list_splice_init(&ex_mountpoints, &list);

        up_write(&namespace_sem);

        shrink_dentry_list(&list);

        if (likely(hlist_empty(&head)))
                return;

        /* Wait for a grace period before the final mntput() of each mount. */
        synchronize_rcu_expedited();

        hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
                hlist_del(&m->mnt_umount);
                mntput(&m->mnt);
        }
}

/* Take namespace_sem for writing; paired with namespace_unlock(). */
static inline void namespace_lock(void)
{
        down_write(&namespace_sem);
}

/* Bit flags selecting how umount_tree() behaves. */
enum umount_tree_flags {
        /* Mark each mount MNT_SYNC_UMOUNT and always disconnect it. */
        UMOUNT_SYNC = 1,
        /* Also run propagate_mount_unlock() and propagate_umount(). */
        UMOUNT_PROPAGATE = 2,
        /* Keep a mount connected to a parent that is itself MNT_UMOUNT. */
        UMOUNT_CONNECTED = 4,
};

/*
 * Decide whether umount_tree() should detach @mnt from its parent
 * (true) or leave it connected (false), given the umount flags @how.
 */
static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
{
        /*
         * Always disconnect when:
         *  - the umount is synchronous (staying connected is only valid
         *    for lazy umounts),
         *  - the mount has no parent to stay connected to, or
         *  - the parent is not itself unmounted: because the reference
         *    counting rules change when mounts are unmounted and
         *    connected, umounted mounts may not be connected to mounted
         *    mounts.
         */
        if ((how & UMOUNT_SYNC) ||
            !mnt_has_parent(mnt) ||
            !(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
                return true;

        /*
         * Stay connected if that was requested, or if the mount is
         * locked and therefore has to remain connected; disconnect by
         * default.
         */
        return !((how & UMOUNT_CONNECTED) || IS_MNT_LOCKED(mnt));
}

/*
 * Unmount the whole tree rooted at @mnt.
 *
 * Every mount in the tree (plus, with UMOUNT_PROPAGATE, the mounts that
 * propagate_umount() adds) is flagged MNT_UMOUNT, removed from its
 * namespace, made private and, unless disconnect_mount() says
 * otherwise, detached from its parent and queued on the "unmounted"
 * list for release by namespace_unlock().
 *
 * mount_lock must be held
 * namespace_sem must be held for write
 */
static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
{
        LIST_HEAD(tmp_list);
        struct mount *p;

        if (how & UMOUNT_PROPAGATE)
                propagate_mount_unlock(mnt);

        /* Gather the mounts to umount */
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                p->mnt.mnt_flags |= MNT_UMOUNT;
                if (p->mnt.mnt_flags & MNT_ONRB)
                        move_from_ns(p, &tmp_list);
                else
                        list_move(&p->mnt_list, &tmp_list);
        }

        /* Hide the mounts from mnt_mounts */
        list_for_each_entry(p, &tmp_list, mnt_list) {
                list_del_init(&p->mnt_child);
        }

        /* Add propagated mounts to the tmp_list */
        if (how & UMOUNT_PROPAGATE)
                propagate_umount(&tmp_list);

        while (!list_empty(&tmp_list)) {
                struct mnt_namespace *ns;
                bool disconnect;
                p = list_first_entry(&tmp_list, struct mount, mnt_list);
                list_del_init(&p->mnt_expire);
                list_del_init(&p->mnt_list);
                /* Account for the mount leaving its namespace, if it had one. */
                ns = p->mnt_ns;
                if (ns) {
                        ns->nr_mounts--;
                        __touch_mnt_namespace(ns);
                }
                p->mnt_ns = NULL;
                if (how & UMOUNT_SYNC)
                        p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;

                disconnect = disconnect_mount(p, how);
                if (mnt_has_parent(p)) {
                        /* The reference on the parent is dropped either way. */
                        mnt_add_count(p->mnt_parent, -1);
                        if (!disconnect) {
                                /* Don't forget about p */
                                list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
                        } else {
                                umount_mnt(p);
                        }
                }
                change_mnt_propagation(p, MS_PRIVATE);
                /* Disconnected mounts are released by namespace_unlock(). */
                if (disconnect)
                        hlist_add_head(&p->mnt_umount, &unmounted);
        }
}

static void shrink_submounts(struct mount *mnt);

/*
 * "Unmount" the root filesystem by reconfiguring superblock @sb
 * read-only.  Nothing is done if it already is read-only.  Runs with
 * sb->s_umount held for writing.  Returns 0 or a negative errno.
 */
static int do_umount_root(struct super_block *sb)
{
        struct fs_context *fc;
        int ret = 0;

        down_write(&sb->s_umount);
        if (sb_rdonly(sb))
                goto out_unlock;

        fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, SB_RDONLY);
        if (IS_ERR(fc)) {
                ret = PTR_ERR(fc);
                goto out_unlock;
        }

        ret = parse_monolithic_mount_data(fc, NULL);
        if (ret == 0)
                ret = reconfigure_super(fc);
        put_fs_context(fc);

out_unlock:
        up_write(&sb->s_umount);
        return ret;
}

/*
 * Perform the umount of @mnt according to the MNT_* @flags.
 *
 * Handles, in order: the security hook, MNT_EXPIRE two-step expiry,
 * MNT_FORCE notification of the filesystem, the special case of
 * "unmounting" the caller's root (remount read-only), and finally the
 * lazy (MNT_DETACH) or regular unmount of the tree under the namespace
 * and mount hash locks.
 *
 * Returns 0 on success or a negative errno (-EINVAL, -EBUSY, -EAGAIN,
 * -EPERM, or whatever the security hook / do_umount_root() returned).
 */
static int do_umount(struct mount *mnt, int flags)
{
        struct super_block *sb = mnt->mnt.mnt_sb;
        int retval;

        retval = security_sb_umount(&mnt->mnt, flags);
        if (retval)
                return retval;

        /*
         * Allow userspace to request a mountpoint be expired rather than
         * unmounting unconditionally. Unmount only happens if:
         *  (1) the mark is already set (the mark is cleared by mntput())
         *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
         */
        if (flags & MNT_EXPIRE) {
                if (&mnt->mnt == current->fs->root.mnt ||
                    flags & (MNT_FORCE | MNT_DETACH))
                        return -EINVAL;

                /*
                 * probably don't strictly need the lock here if we examined
                 * all race cases, but it's a slowpath.
                 */
                lock_mount_hash();
                if (mnt_get_count(mnt) != 2) {
                        unlock_mount_hash();
                        return -EBUSY;
                }
                unlock_mount_hash();

                /* First request only sets the mark; the next one unmounts. */
                if (!xchg(&mnt->mnt_expiry_mark, 1))
                        return -EAGAIN;
        }

        /*
         * If we may have to abort operations to get out of this
         * mount, and they will themselves hold resources we must
         * allow the fs to do things. In the Unix tradition of
         * 'Gee that's tricky, let's do it in userspace' the umount_begin
         * might fail to complete on the first run through as other tasks
         * must return, and the like. That's for the mount program to worry
         * about for the moment.
         */

        if (flags & MNT_FORCE && sb->s_op->umount_begin) {
                sb->s_op->umount_begin(sb);
        }

        /*
         * No sense to grab the lock for this test, but test itself looks
         * somewhat bogus. Suggestions for better replacement?
         * Ho-hum... In principle, we might treat that as umount + switch
         * to rootfs. GC would eventually take care of the old vfsmount.
         * Actually it makes sense, especially if rootfs would contain a
         * /reboot - static binary that would close all descriptors and
         * call reboot(9). Then init(8) could umount root and exec /reboot.
         */
        if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
                /*
                 * Special case for "unmounting" root ...
                 * we just try to remount it readonly.
                 */
                if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                        return -EPERM;
                return do_umount_root(sb);
        }

        namespace_lock();
        lock_mount_hash();

        /* Recheck MNT_LOCKED with the locks held */
        retval = -EINVAL;
        if (mnt->mnt.mnt_flags & MNT_LOCKED)
                goto out;

        event++;
        if (flags & MNT_DETACH) {
                /* Lazy umount: succeeds even if the mount is already off its lists. */
                if (mnt->mnt.mnt_flags & MNT_ONRB ||
                    !list_empty(&mnt->mnt_list))
                        umount_tree(mnt, UMOUNT_PROPAGATE);
                retval = 0;
        } else {
                /* Regular umount: fails with -EBUSY while the mount is in use. */
                shrink_submounts(mnt);
                retval = -EBUSY;
                if (!propagate_mount_busy(mnt, 2)) {
                        if (mnt->mnt.mnt_flags & MNT_ONRB ||
                            !list_empty(&mnt->mnt_list))
                                umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                        retval = 0;
                }
        }
out:
        unlock_mount_hash();
        namespace_unlock();
        return retval;
}

/*
 * __detach_mounts - lazily unmount all mounts on the specified dentry
 *
 * During unlink, rmdir, and d_drop it is possible to lose the path
 * to an existing mountpoint, and wind up leaking the mount.
 * __detach_mounts allows lazily unmounting those mounts instead of
 * leaking them.
 *
 * The caller may hold dentry->d_inode->i_mutex.
 */
void __detach_mounts(struct dentry *dentry)
{
        struct mountpoint *mp;
        struct mount *mnt;

        namespace_lock();
        lock_mount_hash();

        mp = lookup_mountpoint(dentry);
        if (!mp)
                goto out_unlock;

        event++;
        /* Each pass removes the first mount from mp->m_list. */
        while (!hlist_empty(&mp->m_list)) {
                mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);

                if (!(mnt->mnt.mnt_flags & MNT_UMOUNT)) {
                        umount_tree(mnt, UMOUNT_CONNECTED);
                        continue;
                }

                /* Already unmounted but still connected: just detach it. */
                umount_mnt(mnt);
                hlist_add_head(&mnt->mnt_umount, &unmounted);
        }
        put_mountpoint(mp);

out_unlock:
        unlock_mount_hash();
        namespace_unlock();
}

/*
 * Is the caller allowed to modify his namespace?  True when the caller
 * has CAP_SYS_ADMIN in the user namespace owning its mount namespace.
 */
bool may_mount(void)
{
        struct user_namespace *owner = current->nsproxy->mnt_ns->user_ns;

        return ns_capable(owner, CAP_SYS_ADMIN);
}

/**
 * path_mounted - check whether path is mounted
 * @path: path to check
 *
 * Determine whether @path refers to the root of a mount, i.e. whether
 * its dentry is the root dentry of its vfsmount.
 *
 * Return: true if @path is the root of a mount, false if not.
 */
static inline bool path_mounted(const struct path *path)
{
        const struct dentry *mnt_root = path->mnt->mnt_root;

        return path->dentry == mnt_root;
}

/*
 * Print, at most once per boot, a warning that the deprecated "mand"
 * mount option was requested and is being ignored.
 */
static void warn_mandlock(void)
{
        /* The message previously read "deprecated and / and is ignored". */
        pr_warn_once("=======================================================\n"
                     "WARNING: The mand mount option has been deprecated and\n"
                     "         is ignored by this kernel. Remove the mand\n"
                     "         option from the mount to silence this warning.\n"
                     "=======================================================\n");
}

/*
 * Permission and sanity checks for unmounting @path with @flags.
 *
 * Returns 0 if the umount may proceed, -EPERM if the caller lacks the
 * required capability, or -EINVAL if @path is not the root of a mount,
 * the mount is not in the caller's namespace, or the mount is locked.
 */
static int can_umount(const struct path *path, int flags)
{
        struct mount *mnt = real_mount(path->mnt);

        if (!may_mount())
                return -EPERM;

        /* The MNT_LOCKED test is optimistic; do_umount() rechecks it locked. */
        if (!path_mounted(path) ||
            !check_mnt(mnt) ||
            (mnt->mnt.mnt_flags & MNT_LOCKED))
                return -EINVAL;

        if ((flags & MNT_FORCE) && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        return 0;
}

/*
 * Unmount the mount that @path is the root of, consuming the caller's
 * references on @path whether or not the umount succeeds.
 * The caller is responsible for @flags being sane.
 */
int path_umount(struct path *path, int flags)
{
        struct mount *mnt = real_mount(path->mnt);
        int ret = can_umount(path, flags);

        if (ret == 0)
                ret = do_umount(mnt, flags);

        /* we mustn't call path_put() as that would clear mnt_expiry_mark */
        dput(path->dentry);
        mntput_no_expire(mnt);
        return ret;
}

/*
 * Common implementation of the umount syscalls: validate @flags, look
 * up the user-supplied path @name (following a trailing symlink unless
 * UMOUNT_NOFOLLOW is given) and unmount it.
 */
static int ksys_umount(char __user *name, int flags)
{
        const int allowed = MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW;
        int lookup_flags;
        struct path path;
        int ret;

        // basic validity checks done first
        if (flags & ~allowed)
                return -EINVAL;

        lookup_flags = (flags & UMOUNT_NOFOLLOW)
                        ? LOOKUP_MOUNTPOINT
                        : LOOKUP_MOUNTPOINT | LOOKUP_FOLLOW;

        ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
        if (ret)
                return ret;

        return path_umount(&path, flags);
}

/* umount syscall entry point: thin wrapper around ksys_umount(). */
SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
        return ksys_umount(name, flags);
}

#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 *        The 2.0 compatible umount. No flags.
 *
 *        Only built when __ARCH_WANT_SYS_OLDUMOUNT is defined; behaves like
 *        the umount system call invoked with flags == 0.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
        return ksys_umount(name, 0);
}

#endif

static bool is_mnt_ns_file(struct dentry *dentry)
{
        /* Is this a proxy for a mount namespace? */
        return dentry->d_op == &ns_dentry_operations &&
               dentry->d_fsdata == &mntns_operations;
}

/*
 * Map an embedded ns_common back to the mnt_namespace that contains it
 * (inverse of from_mnt_ns()).
 */
static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
        return container_of(ns, struct mnt_namespace, ns);
}

/* Return the ns_common embedded in the mount namespace @mnt. */
struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
{
        return &mnt->ns;
}

/*
 * Could bind mounting the mount namespace inode behind @dentry cause a
 * mount namespace loop?
 *
 * Returns false when @dentry is not a mount namespace file.  Otherwise
 * returns true when the namespace it refers to has a sequence number that
 * is not greater than that of the caller's own mount namespace.
 */
static bool mnt_ns_loop(struct dentry *dentry)
{
        struct mnt_namespace *target;

        if (!is_mnt_ns_file(dentry))
                return false;

        target = to_mnt_ns(get_proc_ns(dentry->d_inode));

        return target->seq <= current->nsproxy->mnt_ns->seq;
}

/*
 * Clone @mnt (rooted at @dentry) together with the mounts below it and
 * return the root of the copy.
 *
 * @mnt:    mount at the top of the tree to copy
 * @dentry: dentry within @mnt that becomes the root of the copy; only child
 *          mounts whose mountpoint is a subdirectory of @dentry are copied
 * @flag:   CL_* flags, passed on to clone_mnt(); CL_COPY_UNBINDABLE and
 *          CL_COPY_MNT_NS_FILE additionally control what is skipped here
 *
 * Without CL_COPY_UNBINDABLE an unbindable @mnt is rejected with -EINVAL,
 * unbindable subtrees are skipped, and an unbindable mount that is also
 * MNT_LOCKED fails the whole copy with -EPERM.  Without CL_COPY_MNT_NS_FILE
 * a mount namespace file as @dentry is rejected with -EINVAL and subtrees
 * rooted at one are skipped.
 *
 * Returns the new tree, or an ERR_PTR().  On a failure after the root was
 * cloned, the partial copy is torn down with umount_tree(UMOUNT_SYNC).
 */
struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
                                        int flag)
{
        struct mount *res, *p, *q, *r, *parent;

        if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
                return ERR_PTR(-EINVAL);

        if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
                return ERR_PTR(-EINVAL);

        res = q = clone_mnt(mnt, dentry, flag);
        if (IS_ERR(q))
                return q;

        q->mnt_mountpoint = mnt->mnt_mountpoint;

        /* p walks the original tree, q tracks the matching mount in the copy */
        p = mnt;
        list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
                struct mount *s;
                if (!is_subdir(r->mnt_mountpoint, dentry))
                        continue;

                for (s = r; s; s = next_mnt(s, r)) {
                        if (!(flag & CL_COPY_UNBINDABLE) &&
                            IS_MNT_UNBINDABLE(s)) {
                                if (s->mnt.mnt_flags & MNT_LOCKED) {
                                        /* Both unbindable and locked. */
                                        q = ERR_PTR(-EPERM);
                                        goto out;
                                } else {
                                        s = skip_mnt_tree(s);
                                        continue;
                                }
                        }
                        if (!(flag & CL_COPY_MNT_NS_FILE) &&
                            is_mnt_ns_file(s->mnt.mnt_root)) {
                                s = skip_mnt_tree(s);
                                continue;
                        }
                        /* climb both trees in step until p is the parent of s */
                        while (p != s->mnt_parent) {
                                p = p->mnt_parent;
                                q = q->mnt_parent;
                        }
                        p = s;
                        parent = q;
                        q = clone_mnt(p, p->mnt.mnt_root, flag);
                        if (IS_ERR(q))
                                goto out;
                        lock_mount_hash();
                        list_add_tail(&q->mnt_list, &res->mnt_list);
                        attach_mnt(q, parent, p->mnt_mp, false);
                        unlock_mount_hash();
                }
        }
        return res;
out:
        /* q holds the error to return; discard what was copied so far */
        if (res) {
                lock_mount_hash();
                umount_tree(res, UMOUNT_SYNC);
                unlock_mount_hash();
        }
        return q;
}

/*
 * Make a private copy (CL_COPY_ALL | CL_PRIVATE) of the mount tree at @path
 * while holding the namespace lock.
 *
 * Caller should check the returned pointer for errors: -EINVAL is returned
 * when the mount fails check_mnt(), and copy_tree() errors are passed on.
 */
struct vfsmount *collect_mounts(const struct path *path)
{
        struct mount *copy;

        namespace_lock();
        if (check_mnt(real_mount(path->mnt)))
                copy = copy_tree(real_mount(path->mnt), path->dentry,
                                 CL_COPY_ALL | CL_PRIVATE);
        else
                copy = ERR_PTR(-EINVAL);
        namespace_unlock();

        if (IS_ERR(copy))
                return ERR_CAST(copy);

        return &copy->mnt;
}

static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);

void dissolve_on_fput(struct vfsmount *mnt)
{
        struct mnt_namespace *ns;
        namespace_lock();
        lock_mount_hash();
        ns = real_mount(mnt)->mnt_ns;
        if (ns) {
                if (is_anon_ns(ns))
                        umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
                else
                        ns = NULL;
        }
        unlock_mount_hash();
        namespace_unlock();
        if (ns)
                free_mnt_ns(ns);
}

/*
 * Tear down the mount tree rooted at @mnt with umount_tree(), holding the
 * namespace lock and the mount hash lock for the duration.
 */
void drop_collected_mounts(struct vfsmount *mnt)
{
        struct mount *tree = real_mount(mnt);

        namespace_lock();
        lock_mount_hash();

        umount_tree(tree, 0);

        unlock_mount_hash();
        namespace_unlock();
}

/*
 * Return true if any direct child of @mnt whose mountpoint is a
 * subdirectory of @dentry carries MNT_LOCKED.
 */
static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
        struct mount *kid;

        list_for_each_entry(kid, &mnt->mnt_mounts, mnt_child) {
                if (is_subdir(kid->mnt_mountpoint, dentry) &&
                    (kid->mnt.mnt_flags & MNT_LOCKED))
                        return true;
        }

        return false;
}

/**
 * clone_private_mount - create a private clone of a path
 * @path: path to clone
 *
 * This creates a new vfsmount, which will be the clone of @path.  The new mount
 * will not be attached anywhere in the namespace and will be private (i.e.
 * changes to the originating mount won't be propagated into this).
 *
 * The clone is refused with -EINVAL when the source mount is unbindable,
 * fails check_mnt(), or has locked children under @path->dentry.
 *
 * Release with mntput().
 */
struct vfsmount *clone_private_mount(const struct path *path)
{
        struct mount *source = real_mount(path->mnt);
        struct mount *clone = ERR_PTR(-EINVAL);

        down_read(&namespace_sem);
        if (!IS_MNT_UNBINDABLE(source) && check_mnt(source) &&
            !has_locked_children(source, path->dentry))
                clone = clone_mnt(source, path->dentry, CL_PRIVATE);
        up_read(&namespace_sem);

        if (IS_ERR(clone))
                return ERR_CAST(clone);

        /* Longterm mount to be removed by kern_unmount*() */
        clone->mnt_ns = MNT_NS_INTERNAL;

        return &clone->mnt;
}
EXPORT_SYMBOL_GPL(clone_private_mount);

/*
 * Call @f(mount, @arg) on @root and then on every mount linked on @root's
 * mnt_list.  Iteration stops at the first non-zero return value, which is
 * handed back to the caller; 0 is returned if every call returned 0.
 */
int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
                   struct vfsmount *root)
{
        struct mount *cur;
        int rc;

        rc = f(root, arg);
        if (rc)
                return rc;

        list_for_each_entry(cur, &real_mount(root)->mnt_list, mnt_list) {
                rc = f(&cur->mnt, arg);
                if (rc)
                        break;
        }

        return rc;
}

/*
 * Walk the tree rooted at @mnt and set the MNT_LOCK_* bits matching the
 * restrictive flags each mount already has, plus MNT_LOCKED for mounts that
 * are not on an expiry list.
 */
static void lock_mnt_tree(struct mount *mnt)
{
        struct mount *cur = mnt;

        while (cur) {
                /* Don't allow unprivileged users to change mount flags */
                int fl = cur->mnt.mnt_flags | MNT_LOCK_ATIME;

                if (fl & MNT_READONLY)
                        fl |= MNT_LOCK_READONLY;
                if (fl & MNT_NODEV)
                        fl |= MNT_LOCK_NODEV;
                if (fl & MNT_NOSUID)
                        fl |= MNT_LOCK_NOSUID;
                if (fl & MNT_NOEXEC)
                        fl |= MNT_LOCK_NOEXEC;

                /* Don't allow unprivileged users to reveal what is under a mount */
                if (list_empty(&cur->mnt_expire))
                        fl |= MNT_LOCKED;

                cur->mnt.mnt_flags = fl;
                cur = next_mnt(cur, mnt);
        }
}

/*
 * Walk the tree rooted at @mnt, stopping when @end is reached (pass NULL to
 * cover the whole tree), and release the group id of every mount that has
 * one but is not shared.
 */
static void cleanup_group_ids(struct mount *mnt, struct mount *end)
{
        struct mount *cur = mnt;

        while (cur != end) {
                if (cur->mnt_group_id && !IS_MNT_SHARED(cur))
                        mnt_release_group_id(cur);
                cur = next_mnt(cur, mnt);
        }
}

/*
 * Allocate a group id for @mnt (and, with @recurse, for every mount in its
 * tree) wherever a mount has no group id yet and is not shared.
 *
 * Returns 0 on success.  If an allocation fails, the ids handed out to the
 * mounts visited before the failing one are released via cleanup_group_ids()
 * and the allocation error is returned.
 */
static int invent_group_ids(struct mount *mnt, bool recurse)
{
        struct mount *cur = mnt;

        while (cur) {
                if (!cur->mnt_group_id && !IS_MNT_SHARED(cur)) {
                        int err = mnt_alloc_group_id(cur);

                        if (err) {
                                cleanup_group_ids(mnt, cur);
                                return err;
                        }
                }

                /* without @recurse only @mnt itself is processed */
                if (!recurse)
                        break;
                cur = next_mnt(cur, mnt);
        }

        return 0;
}

/*
 * Check that the tree rooted at @mnt still fits into @ns under the
 * sysctl_mount_max limit and, if it does, add the number of mounts in the
 * tree to @ns->pending_mounts.
 *
 * Returns 0 on success or -ENOSPC when the existing, pending and new mounts
 * together would exceed the limit.
 */
int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
{
        unsigned int budget = READ_ONCE(sysctl_mount_max);
        unsigned int wanted = 0;
        struct mount *cur;

        /* take off what the namespace already holds, then what is pending */
        if (budget <= ns->nr_mounts)
                return -ENOSPC;
        budget -= ns->nr_mounts;

        if (budget <= ns->pending_mounts)
                return -ENOSPC;
        budget -= ns->pending_mounts;

        cur = mnt;
        while (cur) {
                wanted++;
                cur = next_mnt(cur, mnt);
        }

        if (wanted > budget)
                return -ENOSPC;

        ns->pending_mounts += wanted;
        return 0;
}

/* Flags telling attach_recursive_mnt() how to attach the source tree. */
enum mnt_tree_flags_t {
        /* the source tree is being moved; attach_recursive_mnt() skips count_mounts() */
        MNT_TREE_MOVE = BIT(0),
        /* attach beneath the top mount, i.e. use the top mount's parent as destination */
        MNT_TREE_BENEATH = BIT(1),
};

/**
 * attach_recursive_mnt - attach a source mount tree
 * @source_mnt: mount tree to be attached
 * @top_mnt:    mount that @source_mnt will be mounted on or mounted beneath
 * @dest_mp:    the mountpoint @source_mnt will be mounted at
 * @flags:      modify how @source_mnt is supposed to be attached
 *
 *  NOTE: the table below explains the semantics when a source mount
 *  of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 *          tree of the destination mount and the cloned mount is added to
 *          the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 *          source mount.
 *
 * ---------------------------------------------------------------------------
 * |                         MOVE MOUNT OPERATION                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)  the mount is moved to the destination. And is then propagated to
 *         all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++)  the mount is moved to the destination and is then propagated to
 *         all the mounts belonging to the destination mount's propagation tree.
 *         the mount is marked as 'shared and slave'.
 * (*)        the mount continues to be a slave at the new location.
 *
 * If the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 *
 * Context: The function expects namespace_lock() to be held.
 * Return: If @source_mnt was successfully attached 0 is returned.
 *         Otherwise a negative error code is returned.
 */
static int attach_recursive_mnt(struct mount *source_mnt,
                                struct mount *top_mnt,
                                struct mountpoint *dest_mp,
                                enum mnt_tree_flags_t flags)
{
        struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
        HLIST_HEAD(tree_list);
        struct mnt_namespace *ns = top_mnt->mnt_ns;
        struct mountpoint *smp;
        struct mount *child, *dest_mnt, *p;
        struct hlist_node *n;
        int err = 0;
        bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;

        /*
         * Preallocate a mountpoint in case the new mounts need to be
         * mounted beneath mounts on the same mountpoint.
         */
        smp = get_mountpoint(source_mnt->mnt.mnt_root);
        if (IS_ERR(smp))
                return PTR_ERR(smp);

        /* Is there space to add these mounts to the mount namespace? */
        if (!moving) {
                err = count_mounts(ns, source_mnt);
                if (err)
                        goto out;
        }

        /* when mounting beneath, the destination is the parent of @top_mnt */
        if (beneath)
                dest_mnt = top_mnt->mnt_parent;
        else
                dest_mnt = top_mnt;

        if (IS_MNT_SHARED(dest_mnt)) {
                err = invent_group_ids(source_mnt, true);
                if (err)
                        goto out;
                err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
        }
        /* a propagate_mnt() failure is unwound with the mount hash lock held */
        lock_mount_hash();
        if (err)
                goto out_cleanup_ids;

        if (IS_MNT_SHARED(dest_mnt)) {
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
                        set_mnt_shared(p);
        }

        if (moving) {
                if (beneath)
                        dest_mp = smp;
                unhash_mnt(source_mnt);
                attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
                touch_mnt_namespace(source_mnt->mnt_ns);
        } else {
                if (source_mnt->mnt_ns) {
                        LIST_HEAD(head);

                        /* move from anon - the caller will destroy */
                        for (p = source_mnt; p; p = next_mnt(p, source_mnt))
                                move_from_ns(p, &head);
                        list_del_init(&head);
                }
                if (beneath)
                        mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
                else
                        mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
                commit_tree(source_mnt);
        }

        /* commit every mount that propagate_mnt() left on tree_list */
        hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
                struct mount *q;
                hlist_del_init(&child->mnt_hash);
                q = __lookup_mnt(&child->mnt_parent->mnt,
                                 child->mnt_mountpoint);
                if (q)
                        mnt_change_mountpoint(child, smp, q);
                /* Notice when we are propagating across user namespaces */
                if (child->mnt_parent->mnt_ns->user_ns != user_ns)
                        lock_mnt_tree(child);
                child->mnt.mnt_flags &= ~MNT_LOCKED;
                commit_tree(child);
        }
        put_mountpoint(smp);
        unlock_mount_hash();

        return 0;

 out_cleanup_ids:
        /* unmount whatever is still queued on tree_list and reset its pending count */
        while (!hlist_empty(&tree_list)) {
                child = hlist_entry(tree_list.first, struct mount, mnt_hash);
                child->mnt_parent->mnt_ns->pending_mounts = 0;
                umount_tree(child, UMOUNT_SYNC);
        }
        unlock_mount_hash();
        cleanup_group_ids(source_mnt, NULL);
 out:
        ns->pending_mounts = 0;

        /* drop the preallocated mountpoint under mount_lock */
        read_seqlock_excl(&mount_lock);
        put_mountpoint(smp);
        read_sequnlock_excl(&mount_lock);

        return err;
}

/**
 * do_lock_mount - lock mount and mountpoint
 * @path:    target path
 * @beneath: whether the intention is to mount beneath @path
 *
 * Follow the mount stack on @path until the top mount @mnt is found. If
 * the initial @path->{mnt,dentry} is a mountpoint lookup the first
 * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
 * until nothing is stacked on top of it anymore.
 *
 * Acquire the inode_lock() on the top mount's ->mnt_root to protect
 * against concurrent removal of the new mountpoint from another mount
 * namespace.
 *
 * If @beneath is requested, the inode_lock() on @mnt's mountpoint
 * @mp on @mnt->mnt_parent is acquired instead. This protects against a
 * concurrent unlink of @mp->mnt_dentry from another mount namespace
 * where @mnt doesn't have a child mount mounted @mp. A concurrent
 * removal of @mnt->mnt_root doesn't matter as nothing will be mounted
 * on top of it for @beneath.
 *
 * In addition, @beneath needs to make sure that @mnt hasn't been
 * unmounted or moved from its current mountpoint in between dropping
 * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
 * being unmounted would be detected later by e.g., calling
 * check_mnt(mnt) in the function it's called from. For the @beneath
 * case however, it's useful to detect it directly in do_lock_mount().
 * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
 * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
 * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
 *
 * On success the namespace lock and the inode lock taken here are still
 * held when the function returns. On failure both have been released
 * (or were never taken) and an ERR_PTR() is returned.
 *
 * Return: Either the target mountpoint on the top mount or the top
 *         mount's mountpoint.
 */
static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
{
        struct vfsmount *mnt = path->mnt;
        struct dentry *dentry;
        struct mountpoint *mp = ERR_PTR(-ENOENT);

        for (;;) {
                struct mount *m;

                if (beneath) {
                        /* pin the current mountpoint dentry under mount_lock */
                        m = real_mount(mnt);
                        read_seqlock_excl(&mount_lock);
                        dentry = dget(m->mnt_mountpoint);
                        read_sequnlock_excl(&mount_lock);
                } else {
                        dentry = path->dentry;
                }

                inode_lock(dentry->d_inode);
                if (unlikely(cant_mount(dentry))) {
                        inode_unlock(dentry->d_inode);
                        goto out;
                }

                namespace_lock();

                /* recheck that @mnt is still mounted on the dentry we locked */
                if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
                        namespace_unlock();
                        inode_unlock(dentry->d_inode);
                        goto out;
                }

                mnt = lookup_mnt(path);
                if (likely(!mnt))
                        break;

                /*
                 * Something is mounted on @path: drop the locks, move @path
                 * to the root of that mount and try again.
                 */
                namespace_unlock();
                inode_unlock(dentry->d_inode);
                if (beneath)
                        dput(dentry);
                path_put(path);
                path->mnt = mnt;
                path->dentry = dget(mnt->mnt_root);
        }

        mp = get_mountpoint(dentry);
        if (IS_ERR(mp)) {
                namespace_unlock();
                inode_unlock(dentry->d_inode);
        }

out:
        /* release the extra dentry reference taken for the @beneath case */
        if (beneath)
                dput(dentry);

        return mp;
}

/* Convenience wrapper: do_lock_mount() for the ordinary (not beneath) case. */
static inline struct mountpoint *lock_mount(struct path *path)
{
        return do_lock_mount(path, false);
}

/*
 * Release what a successful lock_mount()/do_lock_mount() left held: drop the
 * mountpoint reference under mount_lock, then release the namespace lock and
 * the inode lock of the mountpoint's dentry.
 */
static void unlock_mount(struct mountpoint *where)
{
        /* fetch the dentry up front; @where is not used after put_mountpoint() */
        struct dentry *dentry = where->m_dentry;

        read_seqlock_excl(&mount_lock);
        put_mountpoint(where);
        read_sequnlock_excl(&mount_lock);

        namespace_unlock();
        inode_unlock(dentry->d_inode);
}

/*
 * Attach the tree rooted at @mnt onto mount @p at mountpoint @mp.
 *
 * Returns -EINVAL if the superblock of @mnt is flagged SB_NOUSER, -ENOTDIR
 * if exactly one of the mountpoint and the root of @mnt is a directory, and
 * otherwise the result of attach_recursive_mnt().
 */
static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
{
        struct dentry *target = mp->m_dentry;
        struct dentry *root = mnt->mnt.mnt_root;

        if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
                return -EINVAL;

        /* directories go on directories, non-directories on non-directories */
        if (d_is_dir(target) != d_is_dir(root))
                return -ENOTDIR;

        return attach_recursive_mnt(mnt, p, mp, 0);
}

/*
 * Sanity check the flags to change_mnt_propagation.
 *
 * MS_REC and MS_SILENT are ignored.  What remains must be exactly one of
 * MS_SHARED, MS_PRIVATE, MS_SLAVE or MS_UNBINDABLE; that flag is returned.
 * Anything else yields 0.
 */

static int flags_to_propagation_type(int ms_flags)
{
        const int prop_mask = MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE;
        int type = ms_flags & ~(MS_REC | MS_SILENT);

        /* no foreign bits, and exactly one propagation bit */
        if ((type & ~prop_mask) || !is_power_of_2(type))
                return 0;

        return type;
}

/*
 * Change the propagation type of the mount at @path, and of its whole tree
 * when MS_REC is set in @ms_flags.
 *
 * Returns -EINVAL if @path is not a mount root or @ms_flags does not name
 * exactly one propagation type, the invent_group_ids() error if group ids
 * cannot be allocated for MS_SHARED, and 0 otherwise.
 */
static int do_change_type(struct path *path, int ms_flags)
{
        struct mount *root = real_mount(path->mnt);
        int recurse = ms_flags & MS_REC;
        struct mount *cur;
        int err = 0;
        int type;

        if (!path_mounted(path))
                return -EINVAL;

        type = flags_to_propagation_type(ms_flags);
        if (!type)
                return -EINVAL;

        namespace_lock();

        /* shared mounts need group ids before the type is switched */
        if (type == MS_SHARED)
                err = invent_group_ids(root, recurse);

        if (!err) {
                lock_mount_hash();
                cur = root;
                while (cur) {
                        change_mnt_propagation(cur, type);
                        cur = recurse ? next_mnt(cur, root) : NULL;
                }
                unlock_mount_hash();
        }

        namespace_unlock();
        return err;
}

static struct mount *__do_loopback(struct path *old_path, int recurse)
{
        struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);

        if (IS_MNT_UNBINDABLE(old))
                return mnt;

        if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
                return mnt;

        if (!recurse && has_locked_children(old, old_path->dentry))
                return mnt;

        if (recurse)
                mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
        else
                mnt = clone_mnt(old, old_path->dentry, 0);

        if (!IS_ERR(mnt))
                mnt->mnt.mnt_flags &= ~MNT_LOCKED;

        return mnt;
}

/*
 * do loopback mount.
 *
 * Looks up @old_name and attaches a copy of the mount found there (the
 * whole tree when @recurse is set) at @path.
 *
 * Returns 0 on success.  -EINVAL is returned for a NULL or empty @old_name,
 * when mnt_ns_loop() reports that the bind would create a loop, or when the
 * destination mount fails check_mnt(); errors from kern_path(),
 * lock_mount(), __do_loopback() and graft_tree() are passed through.
 */
static int do_loopback(struct path *path, const char *old_name,
                                int recurse)
{
        struct path old_path;
        struct mount *mnt = NULL, *parent;
        struct mountpoint *mp;
        int err;
        if (!old_name || !*old_name)
                return -EINVAL;
        err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
        if (err)
                return err;

        /* default error for the checks below that jump out without setting err */
        err = -EINVAL;
        if (mnt_ns_loop(old_path.dentry))
                goto out;

        mp = lock_mount(path);
        if (IS_ERR(mp)) {
                err = PTR_ERR(mp);
                goto out;
        }

        parent = real_mount(path->mnt);
        if (!check_mnt(parent))
                goto out2;

        mnt = __do_loopback(&old_path, recurse);
        if (IS_ERR(mnt)) {
                err = PTR_ERR(mnt);
                goto out2;
        }

        err = graft_tree(mnt, parent, mp);
        if (err) {
                /* attaching failed: discard the copy made above */
                lock_mount_hash();
                umount_tree(mnt, UMOUNT_SYNC);
                unlock_mount_hash();
        }
out2:
        unlock_mount(mp);
out:
        path_put(&old_path);
        return err;
}

/*
 * Copy the mount at @path (the whole tree when @recursive is set) into a
 * freshly allocated mount namespace and open an O_PATH file on the copy.
 *
 * On the way @path->mnt is switched to the copied mount: the reference on
 * the original mount is dropped with mntput() and an extra reference on the
 * copy is taken with mntget().
 *
 * Returns the opened file, flagged FMODE_NEED_UNMOUNT, or an ERR_PTR().  If
 * dentry_open() fails, the copy is handed to dissolve_on_fput().
 */
static struct file *open_detached_copy(struct path *path, bool recursive)
{
        struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
        struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
        struct mount *mnt, *p;
        struct file *file;

        if (IS_ERR(ns))
                return ERR_CAST(ns);

        namespace_lock();
        mnt = __do_loopback(path, recursive);
        if (IS_ERR(mnt)) {
                namespace_unlock();
                free_mnt_ns(ns);
                return ERR_CAST(mnt);
        }

        /* put every copied mount into the new namespace and count it */
        lock_mount_hash();
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                mnt_add_to_ns(ns, p);
                ns->nr_mounts++;
        }
        ns->root = mnt;
        mntget(&mnt->mnt);
        unlock_mount_hash();
        namespace_unlock();

        /* from here on @path refers to the copy, not the original mount */
        mntput(path->mnt);
        path->mnt = &mnt->mnt;
        file = dentry_open(path, O_PATH, current_cred());
        if (IS_ERR(file))
                dissolve_on_fput(path->mnt);
        else
                file->f_mode |= FMODE_NEED_UNMOUNT;
        return file;
}

/*
 * open_tree system call: open an O_PATH file descriptor on the object named
 * by @dfd/@filename, or, with OPEN_TREE_CLONE, on a detached copy of the
 * mount found there (the whole tree with AT_RECURSIVE).
 *
 * Returns the new file descriptor, or a negative error: -EINVAL for unknown
 * flags or AT_RECURSIVE without OPEN_TREE_CLONE, -EPERM when a clone is
 * requested but may_mount() refuses, or the error from fd allocation, path
 * lookup or opening the file.
 */
SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
{
        bool detached = flags & OPEN_TREE_CLONE;
        int lookup_flags = 0;
        struct file *file;
        struct path path;
        int error;
        int fd;

        BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);

        if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
                      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
                      OPEN_TREE_CLOEXEC))
                return -EINVAL;

        /* AT_RECURSIVE on its own, without OPEN_TREE_CLONE, is rejected */
        if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
                return -EINVAL;

        /* automount and symlink following are on unless explicitly disabled */
        if (!(flags & AT_NO_AUTOMOUNT))
                lookup_flags |= LOOKUP_AUTOMOUNT;
        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        if (detached && !may_mount())
                return -EPERM;

        fd = get_unused_fd_flags(flags & O_CLOEXEC);
        if (fd < 0)
                return fd;

        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (unlikely(error)) {
                file = ERR_PTR(error);
        } else {
                file = detached
                        ? open_detached_copy(&path, flags & AT_RECURSIVE)
                        : dentry_open(&path, O_PATH, current_cred());
                path_put(&path);
        }

        if (IS_ERR(file)) {
                /* give the reserved descriptor back on any failure */
                put_unused_fd(fd);
                return PTR_ERR(file);
        }

        fd_install(fd, file);
        return fd;
}

/*
 * Don't allow locked mount flags to be cleared.
 *
 * Returns false if @mnt_flags would drop a restriction (read-only, nodev,
 * nosuid, noexec) whose MNT_LOCK_* bit is set on @mnt, or would change the
 * atime bits while MNT_LOCK_ATIME is set; true otherwise.
 *
 * No locks need to be held here while testing the various MNT_LOCK
 * flags because those flags can never be cleared once they are set.
 */
static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
{
        const unsigned int cur = mnt->mnt.mnt_flags;
        bool drops_ro, drops_nodev, drops_nosuid, drops_noexec, alters_atime;

        drops_ro = (cur & MNT_LOCK_READONLY) && !(mnt_flags & MNT_READONLY);
        drops_nodev = (cur & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV);
        drops_nosuid = (cur & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID);
        drops_noexec = (cur & MNT_LOCK_NOEXEC) && !(mnt_flags & MNT_NOEXEC);
        alters_atime = (cur & MNT_LOCK_ATIME) &&
                       ((cur & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK));

        return !(drops_ro || drops_nodev || drops_nosuid || drops_noexec ||
                 alters_atime);
}

/*
 * Bring the read-only state of @mnt in line with the MNT_READONLY bit of
 * @mnt_flags.
 *
 * Nothing is done if the state already matches.  Making the mount read-only
 * goes through mnt_make_readonly(), whose result is returned; making it
 * writable just clears MNT_READONLY and returns 0.
 */
static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
{
        const bool want_ro = (mnt_flags & MNT_READONLY);

        if (want_ro == __mnt_is_readonly(&mnt->mnt))
                return 0;

        if (!want_ro) {
                mnt->mnt.mnt_flags &= ~MNT_READONLY;
                return 0;
        }

        return mnt_make_readonly(mnt);
}

/*
 * Install @mnt_flags on @mnt while keeping every current flag that lies
 * outside MNT_USER_SETTABLE_MASK, then touch the mount's namespace.
 */
static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
{
        unsigned int preserved = mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;

        mnt->mnt.mnt_flags = mnt_flags | preserved;
        touch_mnt_namespace(mnt->mnt_ns);
}

/*
 * Warn once per superblock when a writable mount sits on a filesystem whose
 * maximum supported timestamp (sb->s_time_max) is less than
 * TIME_UPTIME_SEC_MAX seconds away from the current time.
 *
 * @mountpoint: path used to print where the filesystem is (re)mounted
 * @mnt:        the mount being checked
 *
 * The warning is suppressed for read-only mounts and for superblocks that
 * already have SB_I_TS_EXPIRY_WARNED set; the flag is set after warning.
 */
static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
{
        struct super_block *sb = mnt->mnt_sb;

        if (!__mnt_is_readonly(mnt) &&
           (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
           (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
                char *buf = (char *)__get_free_page(GFP_KERNEL);
                const char *mntpath;

                if (buf)
                        mntpath = d_path(mountpoint, buf, PAGE_SIZE);
                else
                        mntpath = ERR_PTR(-ENOMEM);

                /*
                 * Both the allocation and d_path() can fail; never hand an
                 * error pointer to "%s", print a placeholder instead.
                 */
                if (IS_ERR(mntpath))
                        mntpath = "(unknown)";

                pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
                        sb->s_type->name,
                        is_mounted(mnt) ? "remounted" : "mounted",
                        mntpath, &sb->s_time_max,
                        (unsigned long long)sb->s_time_max);

                sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
                if (buf)
                        free_page((unsigned long)buf);
        }
}

/*
 * Handle reconfiguration of the mountpoint only without alteration of the
 * superblock it refers to.  This is triggered by specifying MS_REMOUNT|MS_BIND
 * to mount(2).
 *
 * Returns -EINVAL if the mount fails check_mnt() or @path is not a mount
 * root, -EPERM if @mnt_flags would clear a locked flag, and otherwise the
 * result of change_mount_ro_state().  The new attributes are only installed
 * when that result is 0.
 */
static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
{
        struct super_block *sb = path->mnt->mnt_sb;
        struct mount *mnt = real_mount(path->mnt);
        int ret;

        if (!check_mnt(mnt))
                return -EINVAL;

        if (!path_mounted(path))
                return -EINVAL;

        if (!can_change_locked_flags(mnt, mnt_flags))
                return -EPERM;

        /*
         * We're only checking whether the superblock is read-only not
         * changing it, so only take down_read(&sb->s_umount).
         */
        down_read(&sb->s_umount);
        lock_mount_hash();
        ret = change_mount_ro_state(mnt, mnt_flags);
        if (ret == 0)
                set_mount_attributes(mnt, mnt_flags);
        unlock_mount_hash();
        up_read(&sb->s_umount);

        /* called whether or not the reconfiguration succeeded */
        mnt_warn_timestamp_expiry(path, &mnt->mnt);

        return ret;
}

/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 *
 * Returns -EINVAL if the mount fails check_mnt() or @path is not a mount
 * root, -EPERM if @mnt_flags would clear a locked flag or the caller lacks
 * CAP_SYS_ADMIN in the superblock's user namespace, and otherwise the error
 * from creating the fs_context, parsing @data or reconfigure_super().  The
 * mount attributes are only updated after reconfigure_super() succeeds.
 */
static int do_remount(struct path *path, int ms_flags, int sb_flags,
                      int mnt_flags, void *data)
{
        int err;
        struct super_block *sb = path->mnt->mnt_sb;
        struct mount *mnt = real_mount(path->mnt);
        struct fs_context *fc;

        if (!check_mnt(mnt))
                return -EINVAL;

        if (!path_mounted(path))
                return -EINVAL;

        if (!can_change_locked_flags(mnt, mnt_flags))
                return -EPERM;

        fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        /*
         * Indicate to the filesystem that the remount request is coming
         * from the legacy mount system call.
         */
        fc->oldapi = true;

        err = parse_monolithic_mount_data(fc, data);
        if (!err) {
                down_write(&sb->s_umount);
                /* stays -EPERM unless the capability check below passes */
                err = -EPERM;
                if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
                        err = reconfigure_super(fc);
                        if (!err) {
                                lock_mount_hash();
                                set_mount_attributes(mnt, mnt_flags);
                                unlock_mount_hash();
                        }
                }
                up_write(&sb->s_umount);
        }

        /* called whether or not the remount succeeded */
        mnt_warn_timestamp_expiry(path, &mnt->mnt);

        put_fs_context(fc);
        return err;
}

/*
 * Walk every mount in the tree rooted at @mnt and report whether any of
 * them is marked unbindable.  Returns 1 if one is found, 0 otherwise.
 */
static inline int tree_contains_unbindable(struct mount *mnt)
{
        struct mount *cur = mnt;

        while (cur) {
                if (IS_MNT_UNBINDABLE(cur))
                        return 1;
                cur = next_mnt(cur, mnt);
        }

        return 0;
}

/*
 * Check that there aren't references to earlier/same mount namespaces in the
 * specified subtree.  Such references can act as pins for mount namespaces
 * that aren't checked by the mount-cycle checking code, thereby allowing
 * cycles to be made.
 *
 * Returns true when no mount in @subtree trips mnt_ns_loop(), false as soon
 * as one does.  The walk is done under the mount hash lock.
 */
static bool check_for_nsfs_mounts(struct mount *subtree)
{
        struct mount *m;
        bool clean = true;

        lock_mount_hash();
        for (m = subtree; m; m = next_mnt(m, subtree)) {
                if (mnt_ns_loop(m->mnt.mnt_root)) {
                        clean = false;
                        break;
                }
        }
        unlock_mount_hash();

        return clean;
}

/*
 * Give the mount at @to_path the same propagation membership as the mount
 * at @from_path: if @from is a slave, @to becomes a slave of the same
 * master; if @from is shared, @to joins @from's peer group.
 *
 * Runs under namespace_lock().  Returns 0 on success, -EPERM if the caller
 * lacks CAP_SYS_ADMIN over either mount namespace, and -EINVAL for every
 * other failed precondition.
 */
static int do_set_group(struct path *from_path, struct path *to_path)
{
        struct mount *from = real_mount(from_path->mnt);
        struct mount *to = real_mount(to_path->mnt);
        int err;

        namespace_lock();

        /* To and From must be mounted */
        err = -EINVAL;
        if (!is_mounted(&from->mnt) || !is_mounted(&to->mnt))
                goto out;

        /* We should be allowed to modify mount namespaces of both mounts */
        err = -EPERM;
        if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN) ||
            !ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
                goto out;

        /* Everything below fails with -EINVAL. */
        err = -EINVAL;

        /* To and From paths should be mount roots */
        if (!path_mounted(from_path) || !path_mounted(to_path))
                goto out;

        /* Setting sharing groups is only allowed across same superblock */
        if (from->mnt.mnt_sb != to->mnt.mnt_sb)
                goto out;

        /* From mount root should be wider than To mount root */
        if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
                goto out;

        /* From mount should not have locked children in place of To's root */
        if (has_locked_children(from, to->mnt.mnt_root))
                goto out;

        /* Setting sharing groups is only allowed on private mounts */
        if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
                goto out;

        /* From should not be private */
        if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
                goto out;

        /* Slave side: hang @to off the same master as @from. */
        if (IS_MNT_SLAVE(from)) {
                struct mount *master = from->mnt_master;

                list_add(&to->mnt_slave, &master->mnt_slave_list);
                to->mnt_master = master;
        }

        /* Shared side: adopt @from's group id and link into its peer list. */
        if (IS_MNT_SHARED(from)) {
                to->mnt_group_id = from->mnt_group_id;
                list_add(&to->mnt_share, &from->mnt_share);
                lock_mount_hash();
                set_mnt_shared(to);
                unlock_mount_hash();
        }

        err = 0;
out:
        namespace_unlock();
        return err;
}

/**
 * path_overmounted - check if path is overmounted
 * @path: path to check
 *
 * A path is overmounted when some mount sits on top of @path->mnt with
 * @path->dentry as its mountpoint.  The lookup is done inside an RCU
 * read-side critical section.
 *
 * Context: This function expects namespace_lock() to be held.
 * Return: true if @path is overmounted, false otherwise.
 */
static inline bool path_overmounted(const struct path *path)
{
        bool covered;

        rcu_read_lock();
        covered = __lookup_mnt(path->mnt, path->dentry) != NULL;
        rcu_read_unlock();

        return covered;
}

/**
 * can_move_mount_beneath - check that we can mount beneath the top mount
 * @from: mount to mount beneath
 * @to:   mount under which to mount
 * @mp:   mountpoint of @to
 *
 * The following conditions are enforced:
 *
 * - @to->dentry is the root of a mount that itself has a parent, and
 *   that mount is not locked, i.e. it is a mount under which another
 *   mount can be placed.
 * - Nothing is mounted beneath the caller's current root or the rootfs
 *   of the namespace.
 * - Nothing has been mounted on top of @from before @namespace_sem was
 *   taken, to avoid creating pointless shadow mounts.
 * - @to's mount is neither @from's mount nor one of its ancestors.
 * - The propagation relationship between the source mount, the parent
 *   mount and the top mount would not lead to nonsensical mount trees.
 *
 * Context: This function expects namespace_lock() to be held.
 * Return: On success 0, and on error a negative error code is returned.
 */
static int can_move_mount_beneath(const struct path *from,
                                  const struct path *to,
                                  const struct mountpoint *mp)
{
        struct mount *mnt_from = real_mount(from->mnt);
        struct mount *mnt_to = real_mount(to->mnt);
        struct mount *parent_mnt_to = mnt_to->mnt_parent;
        struct mount *ancestor;

        /* @to must be the root of an unlocked mount that has a parent. */
        if (!mnt_has_parent(mnt_to) || !path_mounted(to) ||
            IS_MNT_LOCKED(mnt_to))
                return -EINVAL;

        /* Avoid creating shadow mounts during mount propagation. */
        if (path_overmounted(from))
                return -EINVAL;

        /*
         * Mounting beneath the rootfs only makes sense when the
         * semantics of pivot_root(".", ".") are used.
         */
        if (&mnt_to->mnt == current->fs->root.mnt ||
            parent_mnt_to == current->nsproxy->mnt_ns->root)
                return -EINVAL;

        /*
         * Refuse if @mnt_to is @mnt_from itself or one of its ancestors.
         * The walk stops at the first mount that has no parent.
         */
        ancestor = mnt_from;
        while (mnt_has_parent(ancestor)) {
                if (ancestor == mnt_to)
                        return -EINVAL;
                ancestor = ancestor->mnt_parent;
        }

        /*
         * If the parent mount propagates to the child mount this would
         * mean mounting @mnt_from on @mnt_to->mnt_parent and then
         * propagating a copy @c of @mnt_from on top of @mnt_to. This
         * defeats the whole purpose of mounting beneath another mount.
         */
        if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
                return -EINVAL;

        /*
         * If @mnt_to->mnt_parent propagates to @mnt_from this would
         * mean propagating a copy @c of @mnt_from on top of @mnt_from.
         * Afterwards @mnt_from would be mounted on top of
         * @mnt_to->mnt_parent and @mnt_to would be unmounted from
         * @mnt_to->mnt_parent and remounted on @mnt_from. But since @c
         * is already mounted on @mnt_from, @mnt_to would ultimately be
         * remounted on top of @c. Afterwards, @mnt_from would be
         * covered by a copy @c of @mnt_from and @c would be covered by
         * @mnt_from itself. This defeats the whole purpose of mounting
         * @mnt_from beneath @mnt_to.
         */
        if (propagation_would_overmount(parent_mnt_to, mnt_from, mp))
                return -EINVAL;

        return 0;
}

/*
 * Move the mount whose root is @old_path to @new_path or, when @beneath is
 * set, underneath the mount at @new_path.
 *
 * Returns 0 on success or a negative errno: -EINVAL when one of the
 * structural checks below fails, -ELOOP when check_for_nsfs_mounts()
 * rejects the tree or the destination lies inside the tree being moved,
 * or whatever do_lock_mount(), can_move_mount_beneath() or
 * attach_recursive_mnt() report.
 */
static int do_move_mount(struct path *old_path, struct path *new_path,
                         bool beneath)
{
        struct mnt_namespace *ns;
        struct mount *p;
        struct mount *old;
        struct mount *parent;
        struct mountpoint *mp, *old_mp;
        int err;
        bool attached;
        enum mnt_tree_flags_t flags = 0;

        /* Released again by unlock_mount() at the out: label. */
        mp = do_lock_mount(new_path, beneath);
        if (IS_ERR(mp))
                return PTR_ERR(mp);

        /*
         * Record where @old currently sits.  parent, old_mp and ns are
         * used after the move to release what the old location held.
         */
        old = real_mount(old_path->mnt);
        p = real_mount(new_path->mnt);
        parent = old->mnt_parent;
        attached = mnt_has_parent(old);
        if (attached)
                flags |= MNT_TREE_MOVE;
        old_mp = old->mnt_mp;
        ns = old->mnt_ns;

        err = -EINVAL;
        /* The mountpoint must be in our namespace. */
        if (!check_mnt(p))
                goto out;

        /* The thing moved must be mounted... */
        if (!is_mounted(&old->mnt))
                goto out;

        /* ... and either ours or the root of anon namespace */
        if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
                goto out;

        /* A locked mount may not be moved. */
        if (old->mnt.mnt_flags & MNT_LOCKED)
                goto out;

        /* @old_path must be the root of its mount. */
        if (!path_mounted(old_path))
                goto out;

        /* Source and destination must both be directories or both not. */
        if (d_is_dir(new_path->dentry) !=
            d_is_dir(old_path->dentry))
                goto out;
        /*
         * Don't move a mount residing in a shared parent.
         */
        if (attached && IS_MNT_SHARED(parent))
                goto out;

        if (beneath) {
                err = can_move_mount_beneath(old_path, new_path, mp);
                if (err)
                        goto out;

                /* Restore the default error for the checks that follow. */
                err = -EINVAL;
                /* From here on the checks apply to the destination's parent. */
                p = p->mnt_parent;
                flags |= MNT_TREE_BENEATH;
        }

        /*
         * Don't move a mount tree containing unbindable mounts to a destination
         * mount which is shared.
         */
        if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
                goto out;
        err = -ELOOP;
        if (!check_for_nsfs_mounts(old))
                goto out;
        /* Refuse to move a mount onto itself or into its own subtree. */
        for (; mnt_has_parent(p); p = p->mnt_parent)
                if (p == old)
                        goto out;

        err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
        if (err)
                goto out;

        /* Once the mount has been moved it must no longer expire
         * automatically. */
        list_del_init(&old->mnt_expire);
        if (attached)
                put_mountpoint(old_mp);
out:
        unlock_mount(mp);
        /*
         * On success release what the old location was holding: the former
         * parent for an attached mount, or the anonymous namespace that a
         * detached tree lived in.
         */
        if (!err) {
                if (attached)
                        mntput_no_expire(parent);
                else
                        free_mnt_ns(ns);
        }
        return err;
}

static int do_move_mount_old(struct path *path, const char *old_name)
{
        struct path old_path;
        int err;

        if (!old_name || !*old_name)
                return -EINVAL;

        err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
        if (err)
                return err;

        err = do_move_mount(&old_path, path, false);
        path_put(&old_path);
        return err;
}

/*
 * Add the mount @newmnt into a namespace's mount tree at @path, using the
 * already looked-up mountpoint @mp.
 *
 * @mnt_flags become @newmnt's mount flags once the internal-only bits
 * have been masked off.  Returns the result of graft_tree() on success
 * paths, -EINVAL or -EBUSY when one of the checks below refuses the mount.
 */
static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
                        const struct path *path, int mnt_flags)
{
        struct mount *parent = real_mount(path->mnt);

        mnt_flags &= ~MNT_INTERNAL_FLAGS;

        /*
         * A parent that fails check_mnt() is acceptable only for
         * automounts done in a private ns (MNT_SHRINKABLE), and for those
         * we'd better have the mountpoint still alive.
         */
        if (unlikely(!check_mnt(parent)) &&
            (!(mnt_flags & MNT_SHRINKABLE) || !parent->mnt_ns))
                return -EINVAL;

        /* Refuse the same filesystem on the same mount point */
        if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
                return -EBUSY;

        /* A mount whose root is a symlink cannot be attached. */
        if (d_is_symlink(newmnt->mnt.mnt_root))
                return -EINVAL;

        newmnt->mnt.mnt_flags = mnt_flags;

        return graft_tree(newmnt, parent, mp);
}

static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);

/*
 * Create a new mount using a superblock configuration and request it
 * be added to the namespace tree.
 */
static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
                           unsigned int mnt_flags)
{
        struct vfsmount *mnt;
        struct mountpoint *mp;
        struct super_block *sb = fc->root->d_sb;
        int error;

        error = security_sb_kern_mount(sb);
        if (!error && mount_too_revealing(sb, &mnt_flags))
                error = -EPERM;

        if (unlikely(error)) {
                fc_drop_locked(fc);
                return error;
        }

        up_write(&sb->s_umount);

        mnt = vfs_create_mount(fc);
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);

        mnt_warn_timestamp_expiry(mountpoint, mnt);

        mp = lock_mount(mountpoint);
        if (IS_ERR(mp)) {
                mntput(mnt);
                return PTR_ERR(mp);
        }
        error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
        unlock_mount(mp);
        if (error < 0)
                mntput(mnt);
        return error;
}

/*
 * create a new mount for userspace and request it to be added into the
 * namespace's tree
 *
 * @fstype names the filesystem type.  For types flagged FS_HAS_SUBTYPE,
 * anything after the first '.' in @fstype is passed to the filesystem as
 * the "subtype" option; a trailing '.' with nothing after it is an error.
 * @name, when given, is passed as the "source" option and @data as the
 * monolithic option blob.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
                        int mnt_flags, const char *name, void *data)
{
        struct file_system_type *type;
        struct fs_context *fc;
        const char *subtype = NULL;
        int err = 0;

        if (!fstype)
                return -EINVAL;

        type = get_fs_type(fstype);
        if (!type)
                return -ENODEV;

        if (type->fs_flags & FS_HAS_SUBTYPE) {
                const char *dot = strchr(fstype, '.');

                if (dot) {
                        if (dot[1] == '\0') {
                                put_filesystem(type);
                                return -EINVAL;
                        }
                        subtype = dot + 1;
                }
        }

        fc = fs_context_for_mount(type, sb_flags);
        put_filesystem(type);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        /*
         * Tell the filesystem that this mount request arrived through
         * the legacy mount system call.
         */
        fc->oldapi = true;

        if (subtype) {
                err = vfs_parse_fs_string(fc, "subtype",
                                          subtype, strlen(subtype));
                if (err)
                        goto out;
        }

        if (name) {
                err = vfs_parse_fs_string(fc, "source", name, strlen(name));
                if (err)
                        goto out;
        }

        err = parse_monolithic_mount_data(fc, data);
        if (err)
                goto out;

        if (!mount_capable(fc)) {
                err = -EPERM;
                goto out;
        }

        err = vfs_get_tree(fc);
        if (err)
                goto out;

        err = do_new_mount_fc(fc, path, mnt_flags);
out:
        put_fs_context(fc);
        return err;
}

/*
 * Attach the automounted vfsmount @m at @path, or dispose of it.
 *
 * A NULL @m means there is nothing to do and 0 is returned; an ERR_PTR @m
 * is converted to its error code.  Otherwise @m is expected to carry at
 * least two references: one is dropped on success, two on any path that
 * discards the mount.
 *
 * Returns 0 on success (including the case where something is already
 * mounted on @path and @m is quietly dropped) or a negative errno.
 */
int finish_automount(struct vfsmount *m, const struct path *path)
{
        struct dentry *dentry = path->dentry;
        struct mountpoint *mp;
        struct mount *mnt;
        int err;

        if (!m)
                return 0;
        if (IS_ERR(m))
                return PTR_ERR(m);

        mnt = real_mount(m);
        /* The new mount record should have at least 2 refs to prevent it being
         * expired before we get a chance to add it
         */
        BUG_ON(mnt_get_count(mnt) < 2);

        /* Refuse to mount a directory on top of itself. */
        if (m->mnt_sb == path->mnt->mnt_sb &&
            m->mnt_root == dentry) {
                err = -ELOOP;
                goto discard;
        }

        /*
         * we don't want to use lock_mount() - in this case finding something
         * that overmounts our mountpoint means "quietly drop what we've
         * got", not "try to mount it on top".
         */
        inode_lock(dentry->d_inode);
        namespace_lock();
        if (unlikely(cant_mount(dentry))) {
                err = -ENOENT;
                goto discard_locked;
        }
        if (path_overmounted(path)) {
                err = 0;
                goto discard_locked;
        }
        mp = get_mountpoint(dentry);
        if (IS_ERR(mp)) {
                err = PTR_ERR(mp);
                goto discard_locked;
        }

        err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
        /*
         * NOTE(review): unlock_mount() presumably also releases the
         * namespace lock and inode lock taken above - the failure path
         * below goes to discard, not discard_locked.  Confirm against
         * unlock_mount().
         */
        unlock_mount(mp);
        if (unlikely(err))
                goto discard;
        mntput(m);
        return 0;

discard_locked:
        namespace_unlock();
        inode_unlock(dentry->d_inode);
discard:
        /* remove m from any expiration list it may be on */
        if (!list_empty(&mnt->mnt_expire)) {
                namespace_lock();
                list_del_init(&mnt->mnt_expire);
                namespace_unlock();
        }
        /* Two puts here, against the single put on the success path. */
        mntput(m);
        mntput(m);
        return err;
}

/**
 * mnt_set_expiry - Put a mount on an expiration list
 * @mnt: The mount to list.
 * @expiry_list: The list to add the mount to.
 */
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
        namespace_lock();

        list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);

        namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 *
 * Runs with both the namespace lock and the mount hash lock held.  An empty
 * list is a no-op and takes no locks.
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
        struct mount *mnt, *tmp;
        LIST_HEAD(graveyard);

        if (list_empty(mounts))
                return;

        namespace_lock();
        lock_mount_hash();

        /*
         * Pull off the expiration list every vfsmount that
         * - was still marked for expiry (marked on the last call here;
         *   marks are cleared by mntput()), and
         * - is only referenced by its parent vfsmount.
         * The xchg() (re)marks every mount it visits, whether or not the
         * mount is selected this time round.
         */
        list_for_each_entry_safe(mnt, tmp, mounts, mnt_expire) {
                if (xchg(&mnt->mnt_expiry_mark, 1) &&
                    !propagate_mount_busy(mnt, 1))
                        list_move(&mnt->mnt_expire, &graveyard);
        }

        /*
         * Unmount everything selected.  NOTE(review): this relies on
         * umount_tree() taking the mount off the graveyard list.
         */
        while (!list_empty(&graveyard)) {
                struct mount *victim;

                victim = list_first_entry(&graveyard, struct mount, mnt_expire);
                touch_mnt_namespace(victim->mnt_ns);
                umount_tree(victim, UMOUNT_PROPAGATE|UMOUNT_SYNC);
        }

        unlock_mount_hash();
        namespace_unlock();
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 *
 * The walk is an iterative depth-first traversal of the mounts below
 * @parent.  Mounts without MNT_SHRINKABLE are skipped together with
 * everything beneath them.  A shrinkable mount that still has children is
 * descended into rather than selected on this pass, so only childless,
 * non-busy shrinkable mounts are moved.
 *
 * Returns the number of mounts moved to @graveyard.
 */
static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
        struct mount *this_parent = parent;
        struct list_head *next;
        int found = 0;

repeat:
        /* Start scanning the children of this_parent from the first one. */
        next = this_parent->mnt_mounts.next;
resume:
        while (next != &this_parent->mnt_mounts) {
                struct list_head *tmp = next;
                struct mount *mnt = list_entry(tmp, struct mount, mnt_child);

                /* Advance first: mnt may be moved to the graveyard below. */
                next = tmp->next;
                if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
                        continue;
                /*
                 * Descend a level if this mount has children of its own
                 * (its mnt_mounts list is non-empty).
                 */
                if (!list_empty(&mnt->mnt_mounts)) {
                        this_parent = mnt;
                        goto repeat;
                }

                if (!propagate_mount_busy(mnt, 1)) {
                        list_move_tail(&mnt->mnt_expire, graveyard);
                        found++;
                }
        }
        /*
         * All done at this level ... ascend and resume the search with the
         * sibling that follows the mount we had descended into.
         */
        if (this_parent != parent) {
                next = this_parent->mnt_child.next;
                this_parent = this_parent->mnt_parent;
                goto resume;
        }
        return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
 * select_submounts() is called repeatedly; each round unmounts whatever it
 * selected, and the loop ends once a round selects nothing.
 *
 * mount_lock must be held for write
 */
static void shrink_submounts(struct mount *mnt)
{
        struct mount *victim;
        LIST_HEAD(graveyard);

        for (;;) {
                /* extract submounts of 'mountpoint' from the expiration list */
                if (!select_submounts(mnt, &graveyard))
                        break;

                while (!list_empty(&graveyard)) {
                        victim = list_first_entry(&graveyard, struct mount,
                                                  mnt_expire);
                        touch_mnt_namespace(victim->mnt_ns);
                        umount_tree(victim, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                }
        }
}

/*
 * Copy up to PAGE_SIZE bytes of mount options from user space into a
 * freshly allocated kernel buffer.
 *
 * Returns NULL when @data is NULL, ERR_PTR(-ENOMEM) when the buffer cannot
 * be allocated, and ERR_PTR(-EFAULT) when not a single byte could be read.
 * A partial copy is returned as success.  The caller owns the buffer and
 * releases it with kfree().
 */
static void *copy_mount_options(const void __user * data)
{
        const char __user *src = data;
        unsigned left, offset;
        char *copy;

        if (!data)
                return NULL;

        copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!copy)
                return ERR_PTR(-ENOMEM);

        left = copy_from_user(copy, data, PAGE_SIZE);

        /*
         * Not all architectures have an exact copy_from_user(). Pick up
         * from where it stopped, one byte at a time, until a byte faults
         * or the page is full.
         */
        for (offset = PAGE_SIZE - left; left; left--, offset++) {
                char c;

                if (get_user(c, src + offset))
                        break;
                copy[offset] = c;
        }

        /* Nothing at all was readable. */
        if (left == PAGE_SIZE) {
                kfree(copy);
                return ERR_PTR(-EFAULT);
        }

        return copy;
}

/*
 * Duplicate a user-space string of at most PATH_MAX bytes into kernel
 * memory.  A NULL @data yields NULL; otherwise the result of
 * strndup_user() is returned unchanged.
 */
static char *copy_mount_string(const void __user *data)
{
        if (!data)
                return NULL;

        return strndup_user(data, PATH_MAX);
}

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 *
 * After the permission checks the MS_* request is split into per-mount
 * MNT_* flags and per-superblock SB_* flags and handed to the matching
 * operation: bind-remount, remount, bind, propagation change, move, or a
 * brand new mount.  Returns 0 on success or a negative errno.
 */
int path_mount(const char *dev_name, struct path *path,
                const char *type_page, unsigned long flags, void *data_page)
{
        const unsigned long atime_request = MS_NOATIME | MS_NODIRATIME |
                                            MS_RELATIME | MS_STRICTATIME;
        unsigned int mnt_flags, sb_flags;
        int ret;

        /* Discard magic */
        if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
                flags &= ~MS_MGC_MSK;

        /* Basic sanity checks: force termination of the option page */
        if (data_page) {
                char *opts = data_page;

                opts[PAGE_SIZE - 1] = 0;
        }

        if (flags & MS_NOUSER)
                return -EINVAL;

        ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
        if (ret)
                return ret;
        if (!may_mount())
                return -EPERM;
        if (flags & SB_MANDLOCK)
                warn_mandlock();

        /* Default to relatime unless overridden */
        mnt_flags = (flags & MS_NOATIME) ? 0 : MNT_RELATIME;

        /* Separate the per-mountpoint flags */
        mnt_flags |= (flags & MS_NOSUID) ? MNT_NOSUID : 0;
        mnt_flags |= (flags & MS_NODEV) ? MNT_NODEV : 0;
        mnt_flags |= (flags & MS_NOEXEC) ? MNT_NOEXEC : 0;
        mnt_flags |= (flags & MS_NOATIME) ? MNT_NOATIME : 0;
        mnt_flags |= (flags & MS_NODIRATIME) ? MNT_NODIRATIME : 0;
        /* strictatime overrides both relatime and noatime */
        if (flags & MS_STRICTATIME)
                mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
        mnt_flags |= (flags & MS_RDONLY) ? MNT_READONLY : 0;
        mnt_flags |= (flags & MS_NOSYMFOLLOW) ? MNT_NOSYMFOLLOW : 0;

        /* The default atime for remount is preservation */
        if ((flags & MS_REMOUNT) && !(flags & atime_request))
                mnt_flags = (mnt_flags & ~MNT_ATIME_MASK) |
                            (path->mnt->mnt_flags & MNT_ATIME_MASK);

        sb_flags = flags & (SB_RDONLY |
                            SB_SYNCHRONOUS |
                            SB_MANDLOCK |
                            SB_DIRSYNC |
                            SB_SILENT |
                            SB_POSIXACL |
                            SB_LAZYTIME |
                            SB_I_VERSION);

        /* Remount takes precedence; with MS_BIND only the mount is touched. */
        if (flags & MS_REMOUNT) {
                if (flags & MS_BIND)
                        return do_reconfigure_mnt(path, mnt_flags);
                return do_remount(path, flags, sb_flags, mnt_flags, data_page);
        }

        if (flags & MS_BIND)
                return do_loopback(path, dev_name, flags & MS_REC);

        if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
                return do_change_type(path, flags);

        if (flags & MS_MOVE)
                return do_move_mount_old(path, dev_name);

        return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
                            data_page);
}

/*
 * Resolve the user-supplied @dir_name (relative to the current working
 * directory, following symlinks) and perform the mount operation described
 * by the remaining arguments on it.  Returns the lookup error if resolution
 * fails, otherwise whatever path_mount() returns.
 */
long do_mount(const char *dev_name, const char __user *dir_name,
                const char *type_page, unsigned long flags, void *data_page)
{
        struct path target;
        int err;

        err = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &target);
        if (!err) {
                err = path_mount(dev_name, &target, type_page, flags,
                                 data_page);
                /* Release the reference taken by the lookup. */
                path_put(&target);
        }

        return err;
}

/*
 * Account one more mount namespace in user namespace @ns against the
 * caller's effective uid.  Returns whatever inc_ucount() returns; the
 * visible caller treats a NULL result as failure.
 */
static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
{
        return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
}

/*
 * Undo the UCOUNT_MNT_NAMESPACES accounting recorded in @ucounts by
 * inc_mnt_namespaces().
 */
static void dec_mnt_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
}

/*
 * Release everything alloc_mnt_ns() set up for @ns: the inode number
 * (non-anonymous namespaces only), the ucount charge, the reference on the
 * owning user namespace, and finally the structure itself.
 */
static void free_mnt_ns(struct mnt_namespace *ns)
{
        struct user_namespace *owner = ns->user_ns;
        struct ucounts *charged = ns->ucounts;

        /* Anonymous namespaces never had an inode number allocated. */
        if (!is_anon_ns(ns))
                ns_free_inum(&ns->ns);

        dec_mnt_namespaces(charged);
        put_user_ns(owner);
        kfree(ns);
}

/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops.  A 64-bit
 * number incrementing at 10GHz would take roughly 58 years to wrap
 * (2^64 / 10^10 per second is about 1.8 * 10^9 seconds).  The counter
 * is only bumped when alloc_mnt_ns() sets up a non-anonymous
 * namespace, which happens nowhere near that often, so we can ignore
 * the possibility.
 */
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);

/*
 * Allocate and initialise an empty mount namespace owned by @user_ns.
 *
 * The namespace is charged against the caller's mount-namespace ucount.
 * Unless @anon is set it also gets an inode number and the next sequence
 * number; an anonymous namespace keeps the zero seq left by kzalloc().
 *
 * Returns the new namespace with a reference count of one, or
 * ERR_PTR(-ENOSPC) when the ucount limit is hit, ERR_PTR(-ENOMEM) when
 * allocation fails, or the error from ns_alloc_inum().
 */
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
        struct mnt_namespace *ns;
        struct ucounts *ucounts;
        int err;

        ucounts = inc_mnt_namespaces(user_ns);
        if (!ucounts)
                return ERR_PTR(-ENOSPC);

        err = -ENOMEM;
        ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
        if (!ns)
                goto out_uncharge;

        if (!anon) {
                err = ns_alloc_inum(&ns->ns);
                if (err)
                        goto out_free;
        }

        ns->ns.ops = &mntns_operations;
        if (!anon)
                ns->seq = atomic64_add_return(1, &mnt_ns_seq);
        refcount_set(&ns->ns.count, 1);
        ns->mounts = RB_ROOT;
        init_waitqueue_head(&ns->poll);
        ns->user_ns = get_user_ns(user_ns);
        ns->ucounts = ucounts;

        return ns;

out_free:
        kfree(ns);
out_uncharge:
        dec_mnt_namespaces(ucounts);
        return ERR_PTR(err);
}

/*
 * Produce the mount namespace for a new task.
 *
 * Without CLONE_NEWNS in @flags the existing namespace @ns is shared: a
 * reference is taken and @ns itself is returned.  With CLONE_NEWNS a new
 * namespace owned by @user_ns is allocated and populated with a copy of
 * @ns's mount tree; if @new_fs is given, its root and pwd mounts are
 * redirected to the corresponding mounts in the copy.
 *
 * Returns the namespace to use, or an ERR_PTR from alloc_mnt_ns() or
 * copy_tree() on failure.
 */
__latent_entropy
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
                struct user_namespace *user_ns, struct fs_struct *new_fs)
{
        struct mnt_namespace *new_ns;
        struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
        struct mount *p, *q;
        struct mount *old;
        struct mount *new;
        int copy_flags;

        BUG_ON(!ns);

        /* Common case: no new namespace requested, just share the old one. */
        if (likely(!(flags & CLONE_NEWNS))) {
                get_mnt_ns(ns);
                return ns;
        }

        old = ns->root;

        new_ns = alloc_mnt_ns(user_ns, false);
        if (IS_ERR(new_ns))
                return new_ns;

        namespace_lock();
        /* First pass: copy the tree topology */
        copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
        /* A copy made for a different user namespace gets CL_SHARED_TO_SLAVE. */
        if (user_ns != ns->user_ns)
                copy_flags |= CL_SHARED_TO_SLAVE;
        new = copy_tree(old, old->mnt.mnt_root, copy_flags);
        if (IS_ERR(new)) {
                namespace_unlock();
                free_mnt_ns(new_ns);
                return ERR_CAST(new);
        }
        /* Likewise, only a cross-user-namespace copy gets lock_mnt_tree(). */
        if (user_ns != ns->user_ns) {
                lock_mount_hash();
                lock_mnt_tree(new);
                unlock_mount_hash();
        }
        new_ns->root = new;

        /*
         * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
         * as belonging to new namespace.  We have already acquired a private
         * fs_struct, so tsk->fs->lock is not needed.
         */
        p = old;
        q = new;
        /* Walk the old tree (p) and its copy (q) in step. */
        while (p) {
                mnt_add_to_ns(new_ns, q);
                new_ns->nr_mounts++;
                if (new_fs) {
                        /*
                         * Point new_fs at the copy and remember the old mount
                         * so its reference can be dropped after unlocking.
                         */
                        if (&p->mnt == new_fs->root.mnt) {
                                new_fs->root.mnt = mntget(&q->mnt);
                                rootmnt = &p->mnt;
                        }
                        if (&p->mnt == new_fs->pwd.mnt) {
                                new_fs->pwd.mnt = mntget(&q->mnt);
                                pwdmnt = &p->mnt;
                        }
                }
                p = next_mnt(p, old);
                q = next_mnt(q, new);
                /* The copy is exhausted: nothing left to add. */
                if (!q)
                        break;
                // an mntns binding we'd skipped?
                /*
                 * Skip whole old subtrees until p's root dentry matches q's
                 * again, so the two walks stay aligned.
                 */
                while (p->mnt.mnt_root != q->mnt.mnt_root)
                        p = next_mnt(skip_mnt_tree(p), old);
        }
        namespace_unlock();

        /* Drop the references new_fs used to hold on the old mounts. */
        if (rootmnt)
                mntput(rootmnt);
        if (pwdmnt)
                mntput(pwdmnt);

        return new_ns;
}

/*
 * Look up @name starting from the root of mount @m and return the dentry
 * found.
 *
 * The lookup is done with @m installed as the root of a temporary
 * anonymous mount namespace, which is put again once the lookup has
 * finished.  If that namespace cannot be allocated, the reference on @m
 * is dropped here and the error is returned.
 *
 * On success the returned dentry's superblock has had its s_active count
 * raised and its s_umount taken for write; the caller is responsible for
 * both.  On failure an ERR_PTR is returned.
 */
struct dentry *mount_subtree(struct vfsmount *m, const char *name)
{
        struct mount *mnt = real_mount(m);
        struct mnt_namespace *ns;
        struct super_block *s;
        struct path path;
        int err;

        ns = alloc_mnt_ns(&init_user_ns, true);
        if (IS_ERR(ns)) {
                mntput(m);
                return ERR_CAST(ns);
        }
        /* Make @m the sole mount of the temporary namespace. */
        ns->root = mnt;
        ns->nr_mounts++;
        mnt_add_to_ns(ns, mnt);

        err = vfs_path_lookup(m->mnt_root, m,
                        name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);

        /*
         * NOTE(review): presumably this also disposes of the mount(s) now
         * owned by the namespace - confirm against put_mnt_ns().
         */
        put_mnt_ns(ns);

        if (err)
                return ERR_PTR(err);

        /* trade a vfsmount reference for active sb one */
        s = path.mnt->mnt_sb;
        atomic_inc(&s->s_active);
        mntput(path.mnt);
        /* lock the sucker */
        down_write(&s->s_umount);
        /* ... and return the root of (sub)tree on it */
        return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);

/*
 * mount(2): copy the type string, device name and option page in from user
 * space, hand them to do_mount(), and free the kernel copies again.  The
 * target directory name is passed through as a user pointer.  Returns the
 * result of do_mount(), or the error from whichever copy failed first.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
                char __user *, type, unsigned long, flags, void __user *, data)
{
        char *kernel_type, *kernel_dev;
        void *options;
        int ret;

        kernel_type = copy_mount_string(type);
        if (IS_ERR(kernel_type)) {
                ret = PTR_ERR(kernel_type);
                goto out_type;
        }

        kernel_dev = copy_mount_string(dev_name);
        if (IS_ERR(kernel_dev)) {
                ret = PTR_ERR(kernel_dev);
                goto out_dev;
        }

        options = copy_mount_options(data);
        if (IS_ERR(options)) {
                ret = PTR_ERR(options);
                goto out_data;
        }

        ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);

        /* Unwind in reverse order of allocation. */
        kfree(options);
out_data:
        kfree(kernel_dev);
out_dev:
        kfree(kernel_type);
out_type:
        return ret;
}

/*
 * MOUNT_ATTR_* bits that fsmount() accepts in its attr_flags argument;
 * anything outside this mask is rejected with -EINVAL.
 */
#define FSMOUNT_VALID_FLAGS                                                    \
        (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |            \
         MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME |       \
         MOUNT_ATTR_NOSYMFOLLOW)

/*
 * The fsmount() set plus MOUNT_ATTR_IDMAP.
 * NOTE(review): presumably the attribute mask accepted by mount_setattr();
 * its users are not visible here - confirm.
 */
#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)

/* The MS_* bits that select a mount propagation type. */
#define MOUNT_SETATTR_PROPAGATION_FLAGS \
        (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)

/*
 * Translate the MOUNT_ATTR_* bits that map onto a single mount flag into
 * their MNT_* counterparts.  The atime mode (MOUNT_ATTR__ATIME) is not
 * handled here; the visible callers decode it themselves.
 */
static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
{
        static const struct {
                u64 attr;
                unsigned int mnt;
        } table[] = {
                { MOUNT_ATTR_RDONLY,      MNT_READONLY    },
                { MOUNT_ATTR_NOSUID,      MNT_NOSUID      },
                { MOUNT_ATTR_NODEV,       MNT_NODEV       },
                { MOUNT_ATTR_NOEXEC,      MNT_NOEXEC      },
                { MOUNT_ATTR_NODIRATIME,  MNT_NODIRATIME  },
                { MOUNT_ATTR_NOSYMFOLLOW, MNT_NOSYMFOLLOW },
        };
        unsigned int result = 0;
        unsigned int i;

        for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
                if (attr_flags & table[i].attr)
                        result |= table[i].mnt;
        }

        return result;
}

/*
 * Create a kernel mount representation for a new, prepared superblock
 * (specified by fs_fd) and attach to an open_tree-like file descriptor.
 *
 * @fs_fd:      file descriptor of a filesystem context (it must use
 *              fscontext_fops) that holds a root and is in the
 *              FS_CONTEXT_AWAITING_MOUNT phase
 * @flags:      only FSMOUNT_CLOEXEC is accepted
 * @attr_flags: MOUNT_ATTR_* bits, limited to FSMOUNT_VALID_FLAGS
 *
 * Returns the new file descriptor on success and a negative errno on failure.
 */
SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
                unsigned int, attr_flags)
{
        struct mnt_namespace *ns;
        struct fs_context *fc;
        struct file *file;
        struct path newmount;
        struct mount *mnt;
        struct fd f;
        unsigned int mnt_flags = 0;
        long ret;

        if (!may_mount())
                return -EPERM;

        /* FSMOUNT_CLOEXEC is the only flag understood here. */
        if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
                return -EINVAL;

        if (attr_flags & ~FSMOUNT_VALID_FLAGS)
                return -EINVAL;

        mnt_flags = attr_flags_to_mnt_flags(attr_flags);

        /*
         * The atime mode is not covered by attr_flags_to_mnt_flags(), so
         * decode it here.  Any value other than the three known modes is
         * rejected.
         */
        switch (attr_flags & MOUNT_ATTR__ATIME) {
        case MOUNT_ATTR_STRICTATIME:
                break;
        case MOUNT_ATTR_NOATIME:
                mnt_flags |= MNT_NOATIME;
                break;
        case MOUNT_ATTR_RELATIME:
                mnt_flags |= MNT_RELATIME;
                break;
        default:
                return -EINVAL;
        }

        f = fdget(fs_fd);
        if (!f.file)
                return -EBADF;

        /* fs_fd must be a filesystem context file. */
        ret = -EINVAL;
        if (f.file->f_op != &fscontext_fops)
                goto err_fsfd;

        fc = f.file->private_data;

        /* Take the context's uapi_mutex; give up if the wait is interrupted. */
        ret = mutex_lock_interruptible(&fc->uapi_mutex);
        if (ret < 0)
                goto err_fsfd;

        /* There must be a valid superblock or we can't mount it */
        ret = -EINVAL;
        if (!fc->root)
                goto err_unlock;

        ret = -EPERM;
        if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
                pr_warn("VFS: Mount too revealing\n");
                goto err_unlock;
        }

        ret = -EBUSY;
        if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
                goto err_unlock;

        if (fc->sb_flags & SB_MANDLOCK)
                warn_mandlock();

        /*
         * Build the path for the new mount.  From here on the references
         * held in newmount are released by path_put() at err_path.
         */
        newmount.mnt = vfs_create_mount(fc);
        if (IS_ERR(newmount.mnt)) {
                ret = PTR_ERR(newmount.mnt);
                goto err_unlock;
        }
        newmount.dentry = dget(fc->root);
        newmount.mnt->mnt_flags = mnt_flags;

        /* We've done the mount bit - now move the file context into more or
         * less the same state as if we'd done an fspick().  We don't want to
         * do any memory allocation or anything like that at this point as we
         * don't want to have to handle any errors incurred.
         */
        vfs_clean_context(fc);

        /*
         * Place the new mount as the root and only member of a freshly
         * allocated mount namespace.
         */
        ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
        if (IS_ERR(ns)) {
                ret = PTR_ERR(ns);
                goto err_path;
        }
        mnt = real_mount(newmount.mnt);
        ns->root = mnt;
        ns->nr_mounts = 1;
        mnt_add_to_ns(ns, mnt);
        mntget(newmount.mnt);

        /* Attach to an apparent O_PATH fd with a note that we need to unmount
         * it, not just simply put it.
         */
        file = dentry_open(&newmount, O_PATH, fc->cred);
        if (IS_ERR(file)) {
                dissolve_on_fput(newmount.mnt);
                ret = PTR_ERR(file);
                goto err_path;
        }
        file->f_mode |= FMODE_NEED_UNMOUNT;

        /* On success ret is the new fd, which is also the return value. */
        ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
        if (ret >= 0)
                fd_install(ret, file);
        else
                fput(file);

        /*
         * Success and failure both pass through the labels below to drop the
         * local path references, the mutex and the fs_fd reference.
         */
err_path:
        path_put(&newmount);
err_unlock:
        mutex_unlock(&fc->uapi_mutex);
err_fsfd:
        fdput(f);
        return ret;
}

/*
 * move_mount(2): relocate a mount.  Together with fsopen()/fsmount() this
 * installs a freshly created mount; together with
 * open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it copies a mount subtree.
 *
 * @flags is a combination of MOVE_MOUNT_* values.  MOVE_MOUNT_BENEATH and
 * MOVE_MOUNT_SET_GROUP cannot be combined.
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE5(move_mount,
                int, from_dfd, const char __user *, from_pathname,
                int, to_dfd, const char __user *, to_pathname,
                unsigned int, flags)
{
        struct path from_path, to_path;
        unsigned int from_lookup = 0;
        unsigned int to_lookup = 0;
        int ret;

        if (!may_mount())
                return -EPERM;

        if (flags & ~MOVE_MOUNT__MASK)
                return -EINVAL;

        if ((flags & MOVE_MOUNT_BENEATH) && (flags & MOVE_MOUNT_SET_GROUP))
                return -EINVAL;

        /*
         * If someone gives a pathname, they aren't permitted to move
         * from an fd that requires unmount as we can't get at the flag
         * to clear it afterwards.
         */

        /* Lookup behaviour for the source path. */
        if (flags & MOVE_MOUNT_F_SYMLINKS)
                from_lookup |= LOOKUP_FOLLOW;
        if (flags & MOVE_MOUNT_F_AUTOMOUNTS)
                from_lookup |= LOOKUP_AUTOMOUNT;
        if (flags & MOVE_MOUNT_F_EMPTY_PATH)
                from_lookup |= LOOKUP_EMPTY;

        /* Lookup behaviour for the destination path. */
        if (flags & MOVE_MOUNT_T_SYMLINKS)
                to_lookup |= LOOKUP_FOLLOW;
        if (flags & MOVE_MOUNT_T_AUTOMOUNTS)
                to_lookup |= LOOKUP_AUTOMOUNT;
        if (flags & MOVE_MOUNT_T_EMPTY_PATH)
                to_lookup |= LOOKUP_EMPTY;

        ret = user_path_at(from_dfd, from_pathname, from_lookup, &from_path);
        if (ret < 0)
                return ret;

        ret = user_path_at(to_dfd, to_pathname, to_lookup, &to_path);
        if (ret < 0)
                goto put_from;

        ret = security_move_mount(&from_path, &to_path);
        if (ret < 0)
                goto put_to;

        if (flags & MOVE_MOUNT_SET_GROUP) {
                ret = do_set_group(&from_path, &to_path);
        } else {
                ret = do_move_mount(&from_path, &to_path,
                                    (flags & MOVE_MOUNT_BENEATH));
        }

put_to:
        path_put(&to_path);
put_from:
        path_put(&from_path);
        return ret;
}

/*
 * Return true if path is reachable from root
 *
 * namespace_sem or mount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
                         const struct path *root)
{
        while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
                dentry = mnt->mnt_mountpoint;
                mnt = mnt->mnt_parent;
        }
        return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}

/*
 * Return true if @path1 is reachable from @path2.  The check is done by
 * is_path_reachable() with mount_lock taken exclusively.
 */
bool path_is_under(const struct path *path1, const struct path *path2)
{
        bool reachable;

        read_seqlock_excl(&mount_lock);
        reachable = is_path_reachable(real_mount(path1->mnt), path1->dentry,
                                      path2);
        read_sequnlock_excl(&mount_lock);

        return reachable;
}
EXPORT_SYMBOL(path_is_under);

/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and  must not be on the
 * same file  system as the current process root. The put_old  must  be
 * underneath new_root,  i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
                const char __user *, put_old)
{
        struct path new, old, root;
        struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
        struct mountpoint *old_mp, *root_mp;
        int error;

        if (!may_mount())
                return -EPERM;

        /* Both arguments must resolve to directories; symlinks are followed. */
        error = user_path_at(AT_FDCWD, new_root,
                             LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
        if (error)
                goto out0;

        error = user_path_at(AT_FDCWD, put_old,
                             LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
        if (error)
                goto out1;

        error = security_sb_pivotroot(&old, &new);
        if (error)
                goto out2;

        /*
         * Take the caller's current root (released at out3) and lock the
         * put_old mountpoint (released by unlock_mount() at out4).
         */
        get_fs_root(current->fs, &root);
        old_mp = lock_mount(&old);
        error = PTR_ERR(old_mp);
        if (IS_ERR(old_mp))
                goto out3;

        error = -EINVAL;
        new_mnt = real_mount(new.mnt);
        root_mnt = real_mount(root.mnt);
        old_mnt = real_mount(old.mnt);
        ex_parent = new_mnt->mnt_parent;
        root_parent = root_mnt->mnt_parent;
        /*
         * Refuse if put_old's mount, new_root's parent or the current root's
         * parent is a shared mount.
         */
        if (IS_MNT_SHARED(old_mnt) ||
                IS_MNT_SHARED(ex_parent) ||
                IS_MNT_SHARED(root_parent))
                goto out4;
        if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
                goto out4;
        if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
                goto out4;
        error = -ENOENT;
        if (d_unlinked(new.dentry))
                goto out4;
        error = -EBUSY;
        if (new_mnt == root_mnt || old_mnt == root_mnt)
                goto out4; /* loop, on the same file system  */
        error = -EINVAL;
        if (!path_mounted(&root))
                goto out4; /* not a mountpoint */
        if (!mnt_has_parent(root_mnt))
                goto out4; /* not attached */
        if (!path_mounted(&new))
                goto out4; /* not a mountpoint */
        if (!mnt_has_parent(new_mnt))
                goto out4; /* not attached */
        /* make sure we can reach put_old from new_root */
        if (!is_path_reachable(old_mnt, old.dentry, &new))
                goto out4;
        /* make certain new is below the root */
        if (!is_path_reachable(new_mnt, new.dentry, &root))
                goto out4;
        /*
         * All checks passed: detach new_root and the old root, then
         * re-attach them in swapped positions under lock_mount_hash().
         */
        lock_mount_hash();
        umount_mnt(new_mnt);
        root_mp = unhash_mnt(root_mnt);  /* we'll need its mountpoint */
        /* A lock on the old root moves over to the new root. */
        if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
                new_mnt->mnt.mnt_flags |= MNT_LOCKED;
                root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
        }
        /* mount old root on put_old */
        attach_mnt(root_mnt, old_mnt, old_mp, false);
        /* mount new_root on / */
        attach_mnt(new_mnt, root_parent, root_mp, false);
        mnt_add_count(root_parent, -1);
        touch_mnt_namespace(current->nsproxy->mnt_ns);
        /* A moved mount should not expire automatically */
        list_del_init(&new_mnt->mnt_expire);
        put_mountpoint(root_mp);
        unlock_mount_hash();
        /* Switch over processes whose root/cwd was the old root (see above). */
        chroot_fs_refs(&root, &new);
        error = 0;
out4:
        unlock_mount(old_mp);
        /* Only on success: release a reference on new_root's former parent. */
        if (!error)
                mntput_no_expire(ex_parent);
out3:
        path_put(&root);
out2:
        path_put(&old);
out1:
        path_put(&new);
out0:
        return error;
}

/*
 * Compute the mount flags @mnt would have after applying @kattr: the bits
 * in attr_clr are dropped first, then the bits in attr_set are raised.
 * @mnt itself is not modified.
 */
static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
{
        return (mnt->mnt.mnt_flags & ~kattr->attr_clr) | kattr->attr_set;
}

static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
        struct vfsmount *m = &mnt->mnt;
        struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;

        if (!kattr->mnt_idmap)
                return 0;

        /*
         * Creating an idmapped mount with the filesystem wide idmapping
         * doesn't make sense so block that. We don't allow mushy semantics.
         */
        if (kattr->mnt_userns == m->mnt_sb->s_user_ns)
                return -EINVAL;

        /*
         * Once a mount has been idmapped we don't allow it to change its
         * mapping. It makes things simpler and callers can just create
         * another bind-mount they can idmap if they want to.
         */
        if (is_idmapped_mnt(m))
                return -EPERM;

        /* The underlying filesystem doesn't support idmapped mounts yet. */
        if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
                return -EINVAL;

        /* We're not controlling the superblock. */
        if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
                return -EPERM;

        /* Mount has already been visible in the filesystem hierarchy. */
        if (!is_anon_ns(mnt->mnt_ns))
                return -EINVAL;

        return 0;
}

/**
 * mnt_allow_writers() - check whether the attribute change allows writers
 * @kattr: the new mount attributes
 * @mnt: the mount to which @kattr will be applied
 *
 * Check whether thew new mount attributes in @kattr allow concurrent writers.
 *
 * Return: true if writers need to be held, false if not
 */
static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
                                     const struct mount *mnt)
{
        return (!(kattr->attr_set & MNT_READONLY) ||
                (mnt->mnt.mnt_flags & MNT_READONLY)) &&
               !kattr->mnt_idmap;
}

/*
 * Validate the attribute change described by @kattr against @mnt and, when
 * kattr->recurse is set, against every further mount that next_mnt() yields
 * for @mnt.  Writers are held on each mount for which mnt_allow_writers()
 * returns false.
 *
 * Returns 0 if every visited mount can be changed.  On failure a negative
 * errno is returned and MNT_WRITE_HOLD is dropped again from the mounts
 * visited up to and including the one that failed.
 */
static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
{
        struct mount *m;
        /*
         * NOTE(review): err is only assigned inside the loop, which relies
         * on @mnt being non-NULL so that the body runs at least once.
         */
        int err;

        for (m = mnt; m; m = next_mnt(m, mnt)) {
                /* Check the flags the mount would end up with. */
                if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
                        err = -EPERM;
                        break;
                }

                err = can_idmap_mount(kattr, m);
                if (err)
                        break;

                if (!mnt_allow_writers(kattr, m)) {
                        err = mnt_hold_writers(m);
                        if (err)
                                break;
                }

                /* Non-recursive request: only @mnt itself is checked. */
                if (!kattr->recurse)
                        return 0;
        }

        /* m is the mount that failed, or NULL after a complete walk. */
        if (err) {
                struct mount *p;

                /*
                 * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
                 * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
                 * mounts and needs to take care to include the first mount.
                 */
                for (p = mnt; p; p = next_mnt(p, mnt)) {
                        /* If we had to hold writers unblock them. */
                        if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
                                mnt_unhold_writers(p);

                        /*
                         * We're done once the first mount we changed got
                         * MNT_WRITE_HOLD unset.
                         */
                        if (p == m)
                                break;
                }
        }
        return err;
}

/*
 * Attach the idmapping requested in @kattr to @mnt.  Does nothing when no
 * idmapping was requested.
 */
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
        struct mnt_idmap *idmap = kattr->mnt_idmap;

        if (idmap) {
                /*
                 * Pairs with smp_load_acquire() in mnt_idmap().
                 *
                 * A mount may change its idmapping only once, which
                 * can_idmap_mount() has verified, so the mount still has
                 * @nop_mnt_idmap attached and there is no reference to
                 * drop here.
                 */
                smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(idmap));
        }
}

/*
 * Apply @kattr to @mnt and, when kattr->recurse is set, to every further
 * mount that next_mnt() yields for @mnt.  For each mount the idmapping is
 * attached, the recalculated flags are stored, held writers are released
 * and the propagation type is changed if one was requested.
 */
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
{
        struct mount *cur = mnt;

        while (cur) {
                unsigned int new_flags;

                do_idmap_mount(kattr, cur);

                new_flags = recalc_flags(kattr, cur);
                WRITE_ONCE(cur->mnt.mnt_flags, new_flags);

                /* Release writers that were held for this mount. */
                if (cur->mnt.mnt_flags & MNT_WRITE_HOLD)
                        mnt_unhold_writers(cur);

                if (kattr->propagation)
                        change_mnt_propagation(cur, kattr->propagation);

                /* A non-recursive request stops after the first mount. */
                if (!kattr->recurse)
                        break;

                cur = next_mnt(cur, mnt);
        }

        touch_mnt_namespace(mnt->mnt_ns);
}

/*
 * Apply the attribute change described by @kattr to the mount at @path.
 *
 * @path must refer to the root of a mount (path_mounted()).  If a user
 * namespace was supplied in @kattr, the idmapping is allocated here and
 * stored in kattr->mnt_idmap; it is not released in this function (the
 * visible caller does so through finish_mount_kattr()).
 *
 * Returns 0 on success or a negative errno.
 */
static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
{
        struct mount *mnt = real_mount(path->mnt);
        int err = 0;

        if (!path_mounted(path))
                return -EINVAL;

        if (kattr->mnt_userns) {
                struct mnt_idmap *mnt_idmap;

                mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
                if (IS_ERR(mnt_idmap))
                        return PTR_ERR(mnt_idmap);
                kattr->mnt_idmap = mnt_idmap;
        }

        if (kattr->propagation) {
                /*
                 * Only take namespace_lock() if we're actually changing
                 * propagation.
                 */
                namespace_lock();
                /* Becoming shared needs group ids; allocate them up front. */
                if (kattr->propagation == MS_SHARED) {
                        err = invent_group_ids(mnt, kattr->recurse);
                        if (err) {
                                namespace_unlock();
                                return err;
                        }
                }
        }

        /* Default error for the two sanity checks below. */
        err = -EINVAL;
        lock_mount_hash();

        /* Ensure that this isn't anything purely vfs internal. */
        if (!is_mounted(&mnt->mnt))
                goto out;

        /*
         * If this is an attached mount make sure it's located in the callers
         * mount namespace. If it's not don't let the caller interact with it.
         *
         * If this mount doesn't have a parent it's most often simply a
         * detached mount with an anonymous mount namespace. IOW, something
         * that's simply not attached yet. But there are apparently also users
         * that do change mount properties on the rootfs itself. That obviously
         * neither has a parent nor is it a detached mount so we cannot
         * unconditionally check for detached mounts.
         */
        if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
                goto out;

        /*
         * First, we get the mount tree in a shape where we can change mount
         * properties without failure. If we succeeded to do so we commit all
         * changes and if we failed we clean up.
         */
        err = mount_setattr_prepare(kattr, mnt);
        if (!err)
                mount_setattr_commit(kattr, mnt);

out:
        unlock_mount_hash();

        /* Mirror of the locking above: only taken when propagation changes. */
        if (kattr->propagation) {
                if (err)
                        cleanup_group_ids(mnt, NULL);
                namespace_unlock();
        }

        return err;
}

/*
 * Resolve the user namespace for an idmapped mount request.
 *
 * When MOUNT_ATTR_IDMAP is set in attr->attr_set, attr->userns_fd is looked
 * up and must refer to a user namespace file.  On success a reference to
 * that namespace is stored in kattr->mnt_userns.  @usize and @flags are not
 * used here.
 *
 * Returns 0 on success (including when no idmapping was requested) or a
 * negative errno.
 */
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
                                struct mount_kattr *kattr, unsigned int flags)
{
        struct user_namespace *userns;
        struct ns_common *nsc;
        struct fd f;
        int ret;

        /* No idmapping bit in either set: nothing to do. */
        if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
                return 0;

        /*
         * Clearing an idmapped mount is currently not supported. If it ever
         * becomes a use-case this can be revisited; for now keep it simple
         * and refuse.
         */
        if (attr->attr_clr & MOUNT_ATTR_IDMAP)
                return -EINVAL;

        if (attr->userns_fd > INT_MAX)
                return -EINVAL;

        f = fdget(attr->userns_fd);
        if (!f.file)
                return -EBADF;

        /* The fd has to be a namespace file of the user namespace type. */
        ret = -EINVAL;
        if (!proc_ns_file(f.file))
                goto put_fd;

        nsc = get_proc_ns(file_inode(f.file));
        if (nsc->ops->type != CLONE_NEWUSER)
                goto put_fd;

        /*
         * The initial idmapping cannot be used to create an idmapped
         * mount. It serves as the indicator of a mount that is not
         * idmapped and can be passed into helpers that are aware of
         * idmapped mounts as a convenient shortcut. A user can create a
         * dedicated identity mapping to achieve the same result.
         */
        ret = -EPERM;
        userns = container_of(nsc, struct user_namespace, ns);
        if (userns == &init_user_ns)
                goto put_fd;

        /* We're not controlling the target namespace. */
        if (!ns_capable(userns, CAP_SYS_ADMIN))
                goto put_fd;

        kattr->mnt_userns = get_user_ns(userns);
        ret = 0;

put_fd:
        fdput(f);
        return ret;
}

/*
 * Convert the user supplied @attr and the AT_* @flags into the kernel
 * internal @kattr.
 *
 * @kattr is reinitialised first, then the propagation type, the attribute
 * bits and the atime mode are validated and translated.  The idmapping part
 * is delegated to build_mount_idmapped().
 *
 * Returns 0 on success or a negative errno.
 */
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
                             struct mount_kattr *kattr, unsigned int flags)
{
        unsigned int lookup = 0;

        /* Automounts and symlinks are followed unless the caller opts out. */
        if (!(flags & AT_NO_AUTOMOUNT))
                lookup |= LOOKUP_AUTOMOUNT;
        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup |= LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup |= LOOKUP_EMPTY;

        *kattr = (struct mount_kattr) {
                .lookup_flags = lookup,
                .recurse = !!(flags & AT_RECURSIVE),
        };

        /* Only known propagation types, and at most one of them. */
        if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
                return -EINVAL;
        if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
                return -EINVAL;
        kattr->propagation = attr->propagation;

        if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
                return -EINVAL;

        kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
        kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);

        /*
         * The MOUNT_ATTR_<atime> values are an enum, not a bitmap. To
         * switch to a different atime setting a caller therefore has to
         * put the new setting in @attr_set and all of MOUNT_ATTR__ATIME in
         * @attr_clr. Consequently MOUNT_ATTR__ATIME must never be only
         * partially present in @attr_clr, and @attr_set must not carry
         * atime bits when MOUNT_ATTR__ATIME is absent from @attr_clr.
         */
        if (!(attr->attr_clr & MOUNT_ATTR__ATIME)) {
                if (attr->attr_set & MOUNT_ATTR__ATIME)
                        return -EINVAL;

                return build_mount_idmapped(attr, usize, kattr, flags);
        }

        if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
                return -EINVAL;

        /* The time settings are mutually exclusive: drop the old ones. */
        kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;

        switch (attr->attr_set & MOUNT_ATTR__ATIME) {
        case MOUNT_ATTR_STRICTATIME:
                break;
        case MOUNT_ATTR_NOATIME:
                kattr->attr_set |= MNT_NOATIME;
                break;
        case MOUNT_ATTR_RELATIME:
                kattr->attr_set |= MNT_RELATIME;
                break;
        default:
                return -EINVAL;
        }

        return build_mount_idmapped(attr, usize, kattr, flags);
}

/*
 * Release what @kattr holds: the user namespace reference (the pointer is
 * reset to NULL afterwards) and, if one was set, the idmapping.
 */
static void finish_mount_kattr(struct mount_kattr *kattr)
{
        struct mnt_idmap *idmap = kattr->mnt_idmap;

        put_user_ns(kattr->mnt_userns);
        kattr->mnt_userns = NULL;

        if (!idmap)
                return;

        mnt_idmap_put(idmap);
}

/*
 * mount_setattr(2): change the attributes of the mount at @dfd/@path as
 * described by the user supplied struct mount_attr of size @usize.
 *
 * @flags may contain AT_EMPTY_PATH, AT_RECURSIVE, AT_SYMLINK_NOFOLLOW and
 * AT_NO_AUTOMOUNT.  A request that sets, clears and propagates nothing
 * succeeds without looking up @path.
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
                unsigned int, flags, struct mount_attr __user *, uattr,
                size_t, usize)
{
        const unsigned int valid_flags = AT_EMPTY_PATH | AT_RECURSIVE |
                                         AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT;
        struct mount_kattr kattr;
        struct mount_attr attr;
        struct path target;
        int err;

        BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);

        if (flags & ~valid_flags)
                return -EINVAL;

        /* Bound the user supplied structure size from both sides. */
        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;
        if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
                return -EINVAL;

        if (!may_mount())
                return -EPERM;

        err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
        if (err)
                return err;

        /* Don't bother walking through the mounts if this is a nop. */
        if (!attr.attr_set && !attr.attr_clr && !attr.propagation)
                return 0;

        err = build_mount_kattr(&attr, usize, &kattr, flags);
        if (err)
                return err;

        err = user_path_at(dfd, path, kattr.lookup_flags, &target);
        if (err)
                goto out_finish;

        err = do_mount_setattr(&target, &kattr);
        path_put(&target);

out_finish:
        finish_mount_kattr(&kattr);
        return err;
}

/*
 * Emit the path of @root into @m.  A superblock that provides a show_path
 * operation gets to do it itself; otherwise the dentry path is written with
 * space, tab, newline and backslash escaped.
 *
 * Returns the result of the show_path operation, or 0 for the default case.
 */
int show_path(struct seq_file *m, struct dentry *root)
{
        if (!root->d_sb->s_op->show_path) {
                seq_dentry(m, root, " \t\n\\");
                return 0;
        }

        return root->d_sb->s_op->show_path(m, root);
}

/*
 * Find the mount with unique id @id in @ns.
 *
 * mnt_find_id_at() may hand back a mount whose id differs from @id, so the
 * result only counts when its mnt_id_unique matches exactly.
 *
 * Returns the vfsmount, or NULL if there is no exact match.
 */
static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
{
        struct mount *found = mnt_find_id_at(ns, id);

        if (found && found->mnt_id_unique == id)
                return &found->mnt;

        return NULL;
}

/* Kernel side state of a single statmount request. */
struct kstatmount {
        /* User space destination buffer and its size in bytes. */
        struct statmount __user *buf;
        size_t bufsize;
        /* The mount being reported on. */
        struct vfsmount *mnt;
        /* STATMOUNT_* items the caller asked for. */
        u64 mask;
        /* Root against which reachability and the mount point are evaluated. */
        struct path root;
        /* Result under construction; sm.mask records what was filled in. */
        struct statmount sm;
        /* Collects the string data that is copied out right after @sm. */
        struct seq_file seq;
};

/*
 * Translate the MNT_* flags of @mnt into MOUNT_ATTR_* bits.  Exactly one
 * atime mode is reported, and MOUNT_ATTR_IDMAP is added for an idmapped
 * mount.
 */
static u64 mnt_to_attr_flags(struct vfsmount *mnt)
{
        const unsigned int flags = READ_ONCE(mnt->mnt_flags);
        u64 attrs;

        /* atime mode: noatime takes precedence over relatime. */
        if (flags & MNT_NOATIME)
                attrs = MOUNT_ATTR_NOATIME;
        else if (flags & MNT_RELATIME)
                attrs = MOUNT_ATTR_RELATIME;
        else
                attrs = MOUNT_ATTR_STRICTATIME;

        attrs |= (flags & MNT_READONLY) ? MOUNT_ATTR_RDONLY : 0;
        attrs |= (flags & MNT_NOSUID) ? MOUNT_ATTR_NOSUID : 0;
        attrs |= (flags & MNT_NODEV) ? MOUNT_ATTR_NODEV : 0;
        attrs |= (flags & MNT_NOEXEC) ? MOUNT_ATTR_NOEXEC : 0;
        attrs |= (flags & MNT_NODIRATIME) ? MOUNT_ATTR_NODIRATIME : 0;
        attrs |= (flags & MNT_NOSYMFOLLOW) ? MOUNT_ATTR_NOSYMFOLLOW : 0;

        if (is_idmapped_mnt(mnt))
                attrs |= MOUNT_ATTR_IDMAP;

        return attrs;
}

/*
 * Describe the propagation type of @m as MS_* bits.  Shared, slave and
 * unbindable are reported as found; a mount that is none of them is
 * reported as MS_PRIVATE.
 */
static u64 mnt_to_propagation_flags(struct mount *m)
{
        u64 result = 0;

        if (IS_MNT_SHARED(m))
                result |= MS_SHARED;
        if (IS_MNT_SLAVE(m))
                result |= MS_SLAVE;
        if (IS_MNT_UNBINDABLE(m))
                result |= MS_UNBINDABLE;

        if (result)
                return result;

        return MS_PRIVATE;
}

static void statmount_sb_basic(struct kstatmount *s)
{
        struct super_block *sb = s->mnt->mnt_sb;

        s->sm.mask |= STATMOUNT_SB_BASIC;
        s->sm.sb_dev_major = MAJOR(sb->s_dev);
        s->sm.sb_dev_minor = MINOR(sb->s_dev);
        s->sm.sb_magic = sb->s_magic;
        s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
}

static void statmount_mnt_basic(struct kstatmount *s)
{
        struct mount *m = real_mount(s->mnt);

        s->sm.mask |= STATMOUNT_MNT_BASIC;
        s->sm.mnt_id = m->mnt_id_unique;
        s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
        s->sm.mnt_id_old = m->mnt_id;
        s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
        s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
        s->sm.mnt_propagation = mnt_to_propagation_flags(m);
        s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
        s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
}

/*
 * Fill in the STATMOUNT_PROPAGATE_FROM part of the result.  The field is
 * only written for a slave mount; the mask bit is set in either case.
 */
static void statmount_propagate_from(struct kstatmount *s)
{
        struct mount *m = real_mount(s->mnt);

        s->sm.mask |= STATMOUNT_PROPAGATE_FROM;

        if (!IS_MNT_SLAVE(m))
                return;

        s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
}

/*
 * Append the path of the mount's root to @seq, unescaped.
 *
 * Returns 0 on success, the show_path() error if that fails, or -EAGAIN
 * when the seq buffer turned out to be too small.
 */
static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq)
{
        const size_t begin = seq->count;
        char *str;
        int err;
        int len;

        err = show_path(seq, s->mnt->mnt_root);
        if (err)
                return err;

        if (unlikely(seq_has_overflowed(seq)))
                return -EAGAIN;

        /*
         * Unescape the result. It would be better if the string had not been
         * escaped in the first place, but that's a pretty invasive change.
         *
         * Terminate what show_path() wrote, rewind to where it started and
         * commit only the length that is left after unescaping in place.
         */
        seq->buf[seq->count] = '\0';
        str = seq->buf + begin;
        seq->count = begin;
        len = string_unescape_inplace(str, UNESCAPE_OCTAL);
        seq_commit(seq, len);

        return 0;
}

/*
 * Append the mount point of the mount, relative to s->root, to @seq.
 *
 * Returns the seq_path_root() result, except that SEQ_SKIP is reported as 0.
 */
static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq)
{
        struct vfsmount *mnt = s->mnt;
        struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
        int err = seq_path_root(seq, &mnt_path, &s->root, "");

        if (err == SEQ_SKIP)
                return 0;

        return err;
}

/* Append the filesystem type name of the mount to @seq; always returns 0. */
static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
{
        seq_puts(seq, s->mnt->mnt_sb->s_type->name);

        return 0;
}

/*
 * Emit the string item selected by @flag into the seq buffer and record in
 * the result the offset at which it starts.
 *
 * Returns 0 on success, -EINVAL for an unknown @flag, -EOVERFLOW when the
 * fixed part plus the strings no longer fit the user buffer, -EAGAIN when
 * the seq buffer is too small, or the error of the emitting helper.
 */
static int statmount_string(struct kstatmount *s, u64 flag)
{
        struct statmount *sm = &s->sm;
        struct seq_file *seq = &s->seq;
        size_t needed;
        int err;

        /* The offset has to be noted before the helper advances seq->count. */
        if (flag == STATMOUNT_FS_TYPE) {
                sm->fs_type = seq->count;
                err = statmount_fs_type(s, seq);
        } else if (flag == STATMOUNT_MNT_ROOT) {
                sm->mnt_root = seq->count;
                err = statmount_mnt_root(s, seq);
        } else if (flag == STATMOUNT_MNT_POINT) {
                sm->mnt_point = seq->count;
                err = statmount_mnt_point(s, seq);
        } else {
                WARN_ON_ONCE(true);
                return -EINVAL;
        }

        /* Size checks come first, before the helper's result is looked at. */
        if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &needed)) ||
            needed >= s->bufsize)
                return -EOVERFLOW;

        /* signal a retry */
        if (unlikely(seq_has_overflowed(seq)))
                return -EAGAIN;

        if (err)
                return err;

        /* Terminate the string and mark the item as present. */
        seq->buf[seq->count++] = '\0';
        sm->mask |= flag;
        return 0;
}

static int copy_statmount_to_user(struct kstatmount *s)
{
        struct statmount *sm = &s->sm;
        struct seq_file *seq = &s->seq;
        char __user *str = ((char __user *)s->buf) + sizeof(*sm);
        size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));

        if (seq->count && copy_to_user(str, seq->buf, seq->count))
                return -EFAULT;

        /* Return the number of bytes copied to the buffer */
        sm->size = copysize + seq->count;
        if (copy_to_user(s->buf, sm, copysize))
                return -EFAULT;

        return 0;
}

/*
 * Fill in the parts of the statmount result requested in s->mask.
 *
 * Returns -EPERM when the mount is not reachable from s->root and the
 * caller lacks CAP_SYS_ADMIN in the initial user namespace, the error from
 * security_sb_statfs() if it denies access, the first error returned while
 * emitting a string, or 0 on success.
 */
static int do_statmount(struct kstatmount *s)
{
        struct mount *m = real_mount(s->mnt);
        int err;

        /*
         * Don't trigger audit denials. We just want to determine what
         * mounts to show users.
         */
        if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
            !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        err = security_sb_statfs(s->mnt->mnt_root);
        if (err)
                return err;

        /* Fixed-size sections. */
        if (s->mask & STATMOUNT_SB_BASIC)
                statmount_sb_basic(s);
        if (s->mask & STATMOUNT_MNT_BASIC)
                statmount_mnt_basic(s);
        if (s->mask & STATMOUNT_PROPAGATE_FROM)
                statmount_propagate_from(s);

        /* String sections: stop at the first failure. */
        if (s->mask & STATMOUNT_FS_TYPE) {
                err = statmount_string(s, STATMOUNT_FS_TYPE);
                if (err)
                        return err;
        }

        if (s->mask & STATMOUNT_MNT_ROOT) {
                err = statmount_string(s, STATMOUNT_MNT_ROOT);
                if (err)
                        return err;
        }

        if (s->mask & STATMOUNT_MNT_POINT)
                return statmount_string(s, STATMOUNT_MNT_POINT);

        return 0;
}

/*
 * Decide whether a statmount attempt that returned @ret should be retried
 * with a larger seq buffer.
 *
 * Only -EAGAIN triggers a retry. In that case *@seq_size is doubled in
 * place; the retry is refused if the doubling overflows or the new size
 * exceeds MAX_RW_COUNT.
 */
static inline bool retry_statmount(const long ret, size_t *seq_size)
{
        bool doubled;

        if (likely(ret != -EAGAIN))
                return false;

        doubled = !check_mul_overflow(*seq_size, 2, seq_size);

        return doubled && *seq_size <= MAX_RW_COUNT;
}

/*
 * Initialise @ks for one statmount attempt.
 *
 * @ks is zeroed, the request mask is taken from kreq->param, the user
 * buffer and its size are recorded, and a seq buffer of @seq_size bytes is
 * allocated with kvmalloc().
 *
 * Returns 0 on success, -EFAULT if access_ok() rejects the user buffer
 * range, or -ENOMEM if the seq buffer cannot be allocated.
 */
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
                              struct statmount __user *buf, size_t bufsize,
                              size_t seq_size)
{
        if (!access_ok(buf, bufsize))
                return -EFAULT;

        memset(ks, 0, sizeof(*ks));

        ks->buf = buf;
        ks->bufsize = bufsize;
        ks->mask = kreq->param;

        ks->seq.size = seq_size;
        ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);

        return ks->seq.buf ? 0 : -ENOMEM;
}

/*
 * Copy a struct mnt_id_req from userspace into @kreq and validate it.
 *
 * The user-supplied size field is read first and must lie between
 * MNT_ID_REQ_SIZE_VER0 and PAGE_SIZE; the struct is then copied with
 * copy_struct_from_user().
 *
 * Returns 0 on success, -EFAULT if the size field cannot be read, -E2BIG
 * if the size exceeds PAGE_SIZE, -EINVAL if the size is too small or the
 * spare field is non-zero, or the error from copy_struct_from_user().
 */
static int copy_mnt_id_req(const struct mnt_id_req __user *req,
                           struct mnt_id_req *kreq)
{
        size_t usize;
        int err;

        BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0);

        if (get_user(usize, &req->size))
                return -EFAULT;

        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;
        if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
                return -EINVAL;

        memset(kreq, 0, sizeof(*kreq));
        err = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
        if (err)
                return err;

        /* The spare field must be zero. */
        return kreq->spare ? -EINVAL : 0;
}

/*
 * statmount() system call: report information about the mount identified
 * by req->mnt_id, looked up in the caller's mount namespace.
 *
 * @req:     userspace request, copied in and validated by copy_mnt_id_req()
 * @buf:     userspace buffer that receives struct statmount followed by
 *           the string data
 * @bufsize: size of @buf in bytes
 * @flags:   must be zero, otherwise -EINVAL is returned
 *
 * Returns 0 on success or a negative errno; -ENOENT when the mount id is
 * not found in the current mount namespace.
 */
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
                struct statmount __user *, buf, size_t, bufsize,
                unsigned int, flags)
{
        struct vfsmount *mnt;
        struct mnt_id_req kreq;
        struct kstatmount ks;
        /* We currently support retrieval of 3 strings. */
        size_t seq_size = 3 * PATH_MAX;
        int ret;

        if (flags)
                return -EINVAL;

        ret = copy_mnt_id_req(req, &kreq);
        if (ret)
                return ret;

        /*
         * Each attempt starts from a freshly zeroed kstatmount with a newly
         * allocated seq buffer of seq_size bytes.
         */
retry:
        ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size);
        if (ret)
                return ret;

        /* The lookup and data collection run with namespace_sem held for read. */
        down_read(&namespace_sem);
        mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
        if (!mnt) {
                up_read(&namespace_sem);
                kvfree(ks.seq.buf);
                return -ENOENT;
        }

        ks.mnt = mnt;
        get_fs_root(current->fs, &ks.root);
        ret = do_statmount(&ks);
        path_put(&ks.root);
        up_read(&namespace_sem);

        /* Copy to userspace only after namespace_sem has been dropped. */
        if (!ret)
                ret = copy_statmount_to_user(&ks);
        kvfree(ks.seq.buf);
        /* On -EAGAIN retry_statmount() doubles seq_size, up to MAX_RW_COUNT. */
        if (retry_statmount(ret, &seq_size))
                goto retry;
        return ret;
}

/* Return the mount that follows @curr in rb_next() order of its mnt_node. */
static struct mount *listmnt_next(struct mount *curr)
{
        struct rb_node *next = rb_next(&curr->mnt_node);

        return node_to_mount(next);
}

/*
 * Write the unique IDs of mounts reachable from @orig to @mnt_ids.
 *
 * Iteration starts at @first and follows listmnt_next() until the mounts
 * run out or @nr_mnt_ids entries have been written. The mount whose unique
 * ID equals @mnt_parent_id is skipped, as is any mount not reachable from
 * @orig.
 *
 * Returns the number of IDs written, -EPERM when @orig is not reachable
 * from @root and the caller lacks CAP_SYS_ADMIN in the initial user
 * namespace, the error from security_sb_statfs(), or -EFAULT if storing an
 * ID to userspace fails.
 */
static ssize_t do_listmount(struct mount *first, struct path *orig,
                            u64 mnt_parent_id, u64 __user *mnt_ids,
                            size_t nr_mnt_ids, const struct path *root)
{
        struct mount *cur;
        ssize_t nr_listed = 0;
        int err;

        /*
         * Don't trigger audit denials. We just want to determine what
         * mounts to show users.
         */
        if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root) &&
            !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        err = security_sb_statfs(orig->dentry);
        if (err)
                return err;

        cur = first;
        while (cur && nr_mnt_ids) {
                if (cur->mnt_id_unique != mnt_parent_id &&
                    is_path_reachable(cur, cur->mnt.mnt_root, orig)) {
                        if (put_user(cur->mnt_id_unique, mnt_ids))
                                return -EFAULT;
                        mnt_ids++;
                        nr_mnt_ids--;
                        nr_listed++;
                }
                cur = listmnt_next(cur);
        }

        return nr_listed;
}

/*
 * listmount() system call: store unique mount IDs from the caller's mount
 * namespace into a userspace array.
 *
 * @req:        userspace request; req->mnt_id selects the mount to list
 *              under (or LSMT_ROOT for the caller's root), req->param is
 *              the last mount ID already seen (0 to start from the first)
 * @mnt_ids:    userspace array that receives the IDs
 * @nr_mnt_ids: capacity of @mnt_ids, in entries
 * @flags:      must be zero, otherwise -EINVAL is returned
 *
 * Returns the number of IDs stored or a negative errno.
 */
SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *,
                mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        struct mnt_id_req kreq;
        struct mount *first;
        struct path root, orig;
        u64 mnt_parent_id, last_mnt_id;
        /*
         * Largest count for which nr_mnt_ids * sizeof(u64) below cannot
         * overflow a size_t.
         */
        const size_t maxcount = (size_t)-1 >> 3;
        ssize_t ret;

        if (flags)
                return -EINVAL;

        if (unlikely(nr_mnt_ids > maxcount))
                return -EFAULT;

        if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
                return -EFAULT;

        ret = copy_mnt_id_req(req, &kreq);
        if (ret)
                return ret;
        mnt_parent_id = kreq.mnt_id;
        last_mnt_id = kreq.param;

        down_read(&namespace_sem);
        get_fs_root(current->fs, &root);
        if (mnt_parent_id == LSMT_ROOT) {
                /* Plain struct copy; only root is released with path_put() below. */
                orig = root;
        } else {
                ret = -ENOENT;
                orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
                if (!orig.mnt)
                        goto err;
                orig.dentry = orig.mnt->mnt_root;
        }
        /* Start at the first mount, or resume just past the last ID seen. */
        if (!last_mnt_id)
                first = node_to_mount(rb_first(&ns->mounts));
        else
                first = mnt_find_id_at(ns, last_mnt_id + 1);

        ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root);
        /* Shared exit path: reached on success as well as on lookup failure. */
err:
        path_put(&root);
        up_read(&namespace_sem);
        return ret;
}


/*
 * Create the initial mount tree: mount rootfs, allocate the initial mount
 * namespace with that mount as its root, install the namespace in
 * init_task's nsproxy, and point the current task's pwd and root at the
 * rootfs root. Panics if the mount or the namespace cannot be created.
 */
static void __init init_mount_tree(void)
{
        struct vfsmount *mnt;
        struct mount *m;
        struct mnt_namespace *ns;
        struct path root;

        mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
        if (IS_ERR(mnt))
                panic("Can't create rootfs");

        ns = alloc_mnt_ns(&init_user_ns, false);
        if (IS_ERR(ns))
                panic("Can't allocate initial namespace");
        /* The rootfs mount is the root and the only mount of the new namespace. */
        m = real_mount(mnt);
        ns->root = m;
        ns->nr_mounts = 1;
        mnt_add_to_ns(ns, m);
        init_task.nsproxy->mnt_ns = ns;
        /* Take an additional reference on the namespace. */
        get_mnt_ns(ns);

        root.mnt = mnt;
        root.dentry = mnt->mnt_root;
        mnt->mnt_flags |= MNT_LOCKED;

        set_fs_pwd(current->fs, &root);
        set_fs_root(current->fs, &root);
}

/*
 * Boot-time initialisation of the mount layer: create the struct mount
 * slab cache and the mount and mountpoint hash tables, initialise kernfs
 * and sysfs, create the "fs" kobject, initialise shmem and rootfs, and
 * finally build the initial mount tree.
 */
void __init mnt_init(void)
{
        int err;

        mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

        mount_hashtable = alloc_large_system_hash("Mount-cache",
                                sizeof(struct hlist_head),
                                mhash_entries, 19,
                                HASH_ZERO,
                                &m_hash_shift, &m_hash_mask, 0, 0);
        mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
                                sizeof(struct hlist_head),
                                mphash_entries, 19,
                                HASH_ZERO,
                                &mp_hash_shift, &mp_hash_mask, 0, 0);

        /* Both hash tables are mandatory. */
        if (!mount_hashtable || !mountpoint_hashtable)
                panic("Failed to allocate mount hash table\n");

        kernfs_init();

        /* sysfs and "fs" kobject failures are only warned about; init continues. */
        err = sysfs_init();
        if (err)
                printk(KERN_WARNING "%s: sysfs_init error: %d\n",
                        __func__, err);
        fs_kobj = kobject_create_and_add("fs", NULL);
        if (!fs_kobj)
                printk(KERN_WARNING "%s: kobj create error\n", __func__);
        shmem_init();
        init_rootfs();
        init_mount_tree();
}

/*
 * Drop one reference on @ns. When the last reference goes away, the mounts
 * collected under the namespace root are dropped and the namespace is freed.
 */
void put_mnt_ns(struct mnt_namespace *ns)
{
        if (refcount_dec_and_test(&ns->ns.count)) {
                drop_collected_mounts(&ns->root->mnt);
                free_mnt_ns(ns);
        }
}

/*
 * Create a kernel-internal mount of @type (SB_KERNMOUNT, named after the
 * filesystem type). Returns the new mount, or the ERR_PTR produced by
 * vfs_kern_mount() on failure.
 */
struct vfsmount *kern_mount(struct file_system_type *type)
{
        struct vfsmount *mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);

        if (IS_ERR(mnt))
                return mnt;

        /*
         * This is a long-term mount: it is not released until it is
         * unmounted, before the filesystem is unregistered.
         */
        real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;

        return mnt;
}
EXPORT_SYMBOL_GPL(kern_mount);

/*
 * Release a long-term kernel mount so that the mount point can be released.
 * An ERR_PTR value in @mnt is ignored.
 */
void kern_unmount(struct vfsmount *mnt)
{
        if (IS_ERR(mnt))
                return;

        mnt_make_shortterm(mnt);
        /* Wait for an RCU grace period before dropping the reference. */
        synchronize_rcu();
        mntput(mnt);
}
EXPORT_SYMBOL(kern_unmount);

/*
 * Release @num long-term kernel mounts from the array @mnt. All mounts are
 * made short-term first, so a single expedited RCU grace period covers the
 * whole array before the references are dropped.
 */
void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
{
        unsigned int idx = 0;

        while (idx < num) {
                mnt_make_shortterm(mnt[idx]);
                idx++;
        }

        synchronize_rcu_expedited();

        idx = 0;
        while (idx < num) {
                mntput(mnt[idx]);
                idx++;
        }
}
EXPORT_SYMBOL(kern_unmount_array);

/* Return the result of check_mnt() for the struct mount backing @mnt. */
bool our_mnt(struct vfsmount *mnt)
{
        struct mount *m = real_mount(mnt);

        return check_mnt(m);
}

/*
 * Does the current process have a non-standard root?
 *
 * The namespace root is located by starting at the root mount of the
 * current mount namespace and following mounts stacked on top of it. It is
 * then compared with the root recorded in current->fs.
 *
 * Returns true when the two paths differ.
 */
bool current_chrooted(void)
{
        struct path ns_root;
        struct path fs_root;
        bool same;

        /* Find the namespace root */
        ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
        ns_root.dentry = ns_root.mnt->mnt_root;
        path_get(&ns_root);

        /* Descend while something is mounted on top of the current path. */
        while (d_mountpoint(ns_root.dentry)) {
                if (!follow_down_one(&ns_root))
                        break;
        }

        get_fs_root(current->fs, &fs_root);
        same = path_equal(&fs_root, &ns_root);

        path_put(&fs_root);
        path_put(&ns_root);

        return !same;
}

/*
 * Check whether @ns already contains a fully visible mount of the same
 * filesystem type as @sb whose locked flags are compatible with the flags
 * proposed in *@new_mnt_flags.
 *
 * On a match, the MNT_LOCK_READONLY and MNT_LOCK_ATIME bits of the matching
 * mount are OR-ed into *@new_mnt_flags and true is returned; otherwise
 * false is returned and *@new_mnt_flags is left untouched. The walk runs
 * with namespace_sem held for read.
 */
static bool mnt_already_visible(struct mnt_namespace *ns,
                                const struct super_block *sb,
                                int *new_mnt_flags)
{
        int new_flags = *new_mnt_flags;
        struct mount *mnt, *n;
        bool visible = false;

        down_read(&namespace_sem);
        rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
                struct mount *child;
                int mnt_flags;

                /* Only mounts of the same filesystem type are candidates. */
                if (mnt->mnt.mnt_sb->s_type != sb->s_type)
                        continue;

                /* This mount is not fully visible if its root directory
                 * is not the root directory of the filesystem.
                 */
                if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
                        continue;

                /* A local view of the mount flags */
                mnt_flags = mnt->mnt.mnt_flags;

                /* Don't miss readonly hidden in the superblock flags */
                if (sb_rdonly(mnt->mnt.mnt_sb))
                        mnt_flags |= MNT_LOCK_READONLY;

                /* Verify the mount flags are equal to or more permissive
                 * than the proposed new mount.
                 */
                if ((mnt_flags & MNT_LOCK_READONLY) &&
                    !(new_flags & MNT_READONLY))
                        continue;
                if ((mnt_flags & MNT_LOCK_ATIME) &&
                    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
                        continue;

                /* This mount is not fully visible if there are any
                 * locked child mounts that cover anything except for
                 * empty directories.
                 */
                list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
                        struct inode *inode = child->mnt_mountpoint->d_inode;
                        /* Only worry about locked mounts */
                        if (!(child->mnt.mnt_flags & MNT_LOCKED))
                                continue;
                        /* Is the directory permanently empty? */
                        if (!is_empty_dir_inode(inode))
                                goto next;
                }
                /* Preserve the locked attributes */
                *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
                                               MNT_LOCK_ATIME);
                visible = true;
                goto found;
                /* Reached from the child loop: move on to the next mount. */
        next:        ;
        }
found:
        up_read(&namespace_sem);
        return visible;
}

/*
 * Decide whether mounting @sb in the current mount namespace would be too
 * revealing.
 *
 * Returns false when the namespace is owned by the initial user namespace
 * or @sb does not carry SB_I_USERNS_VISIBLE. Returns true (with a one-time
 * warning) when @sb lacks SB_I_NOEXEC or SB_I_NODEV. Otherwise the answer
 * is the negation of mnt_already_visible(), which may also add locked
 * flags to *@new_mnt_flags.
 */
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
{
        const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        unsigned long s_iflags;

        if (ns->user_ns == &init_user_ns)
                return false;

        /* Can this filesystem be too revealing? */
        s_iflags = sb->s_iflags;
        if (!(s_iflags & SB_I_USERNS_VISIBLE))
                return false;

        if ((s_iflags & required_iflags) == required_iflags)
                return !mnt_already_visible(ns, sb, new_mnt_flags);

        WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
                  required_iflags);
        return true;
}

bool mnt_may_suid(struct vfsmount *mnt)
{
        /*
         * Foreign mounts (accessed via fchdir or through /proc
         * symlinks) are always treated as if they are nosuid.  This
         * prevents namespaces from trusting potentially unsafe
         * suid/sgid bits, file caps, or security labels that originate
         * in other namespaces.
         */
        return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
               current_in_userns(mnt->mnt_sb->s_user_ns);
}

/*
 * Return the ns_common of @task's mount namespace with a reference taken
 * on the namespace, or NULL if @task has no nsproxy. task->nsproxy is read
 * under task_lock().
 */
static struct ns_common *mntns_get(struct task_struct *task)
{
        struct nsproxy *nsproxy;
        struct ns_common *ns;

        task_lock(task);
        nsproxy = task->nsproxy;
        if (!nsproxy) {
                ns = NULL;
        } else {
                ns = &nsproxy->mnt_ns->ns;
                get_mnt_ns(to_mnt_ns(ns));
        }
        task_unlock(task);

        return ns;
}

/* Drop a reference on the mount namespace that embeds @ns. */
static void mntns_put(struct ns_common *ns)
{
        struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

        put_mnt_ns(mnt_ns);
}

/*
 * Install the mount namespace embedding @ns into @nsset and reset the
 * root and working directory of nsset->fs to the root of that namespace.
 *
 * Returns 0 on success, -EPERM if the caller lacks CAP_SYS_ADMIN in the
 * target namespace's user namespace or CAP_SYS_CHROOT/CAP_SYS_ADMIN in the
 * user namespace of nsset->cred, -EINVAL for an anonymous namespace or
 * when nsset->fs has more than one user, or the error from the root lookup.
 */
static int mntns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct nsproxy *nsproxy = nsset->nsproxy;
        struct fs_struct *fs = nsset->fs;
        struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
        struct user_namespace *user_ns = nsset->cred->user_ns;
        struct path root;
        int err;

        if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
            !ns_capable(user_ns, CAP_SYS_CHROOT) ||
            !ns_capable(user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (is_anon_ns(mnt_ns))
                return -EINVAL;

        /* root and pwd are rewritten below, so the fs_struct must have one user. */
        if (fs->users != 1)
                return -EINVAL;

        /* Switch the nsproxy first; the old namespace is kept for a revert. */
        get_mnt_ns(mnt_ns);
        old_mnt_ns = nsproxy->mnt_ns;
        nsproxy->mnt_ns = mnt_ns;

        /* Find the root */
        err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
                                "/", LOOKUP_DOWN, &root);
        if (err) {
                /* revert to old namespace */
                nsproxy->mnt_ns = old_mnt_ns;
                put_mnt_ns(mnt_ns);
                return err;
        }

        /* The switch is final: drop the reference on the old namespace. */
        put_mnt_ns(old_mnt_ns);

        /* Update the pwd and root */
        set_fs_pwd(fs, &root);
        set_fs_root(fs, &root);

        /* Release the reference returned by the lookup. */
        path_put(&root);
        return 0;
}

static struct user_namespace *mntns_owner(struct ns_common *ns)
{
        return to_mnt_ns(ns)->user_ns;
}

/*
 * proc_ns_operations for mount namespaces: named "mnt", typed CLONE_NEWNS,
 * and wired to the mntns_get/put/install/owner callbacks.
 */
const struct proc_ns_operations mntns_operations = {
        .name                = "mnt",
        .type                = CLONE_NEWNS,
        .get                = mntns_get,
        .put                = mntns_put,
        .install        = mntns_install,
        .owner                = mntns_owner,
};

#ifdef CONFIG_SYSCTL
/*
 * Sysctl table with a single entry, "mount-max", backed by
 * sysctl_mount_max (mode 0644, sized as an unsigned int).
 * NOTE(review): extra1 = SYSCTL_ONE presumably sets the minimum accepted
 * value to 1 for proc_dointvec_minmax — confirm against the handler.
 */
static struct ctl_table fs_namespace_sysctls[] = {
        {
                .procname        = "mount-max",
                .data                = &sysctl_mount_max,
                .maxlen                = sizeof(unsigned int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ONE,
        },
};

/*
 * Register fs_namespace_sysctls under the "fs" sysctl path. Runs as an
 * fs_initcall and always returns 0.
 */
static int __init init_fs_namespace_sysctls(void)
{
        register_sysctl_init("fs", fs_namespace_sysctls);
        return 0;
}
fs_initcall(init_fs_namespace_sysctls);

#endif /* CONFIG_SYSCTL */






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_H
#define _LINUX_SCHED_H

/*
 * Define 'struct task_struct' and provide the main scheduler
 * APIs (schedule(), wakeup variants, etc.)
 */

#include <uapi/linux/sched.h>

#include <asm/current.h>
#include <asm/processor.h>
#include <linux/thread_info.h>
#include <linux/preempt.h>
#include <linux/cpumask.h>

#include <linux/cache.h>
#include <linux/irqflags_types.h>
#include <linux/smp_types.h>
#include <linux/pid_types.h>
#include <linux/sem_types.h>
#include <linux/shm.h>
#include <linux/kmsan_types.h>
#include <linux/mutex_types.h>
#include <linux/plist_types.h>
#include <linux/hrtimer_types.h>
#include <linux/timer_types.h>
#include <linux/seccomp_types.h>
#include <linux/nodemask_types.h>
#include <linux/refcount_types.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
#include <linux/syscall_user_dispatch_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
#include <linux/posix-timers_types.h>
#include <linux/restart_block.h>
#include <uapi/linux/rseq.h>
#include <linux/seqlock_types.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
#include <linux/livepatch_sched.h>
#include <linux/uidgid_types.h>
#include <asm/kmap_size.h>

/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
struct bio_list;
struct blk_plug;
struct bpf_local_storage;
struct bpf_run_ctx;
struct capture_control;
struct cfs_rq;
struct fs_struct;
struct futex_pi_state;
struct io_context;
struct io_uring_task;
struct mempolicy;
struct nameidata;
struct nsproxy;
struct perf_event_context;
struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
struct robust_list_head;
struct root_domain;
struct rq;
struct sched_attr;
struct sched_dl_entity;
struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
struct task_struct;
struct user_event_mm;

/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->__state
 * is about runnability, while task->exit_state is
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */

/* Used in tsk->__state: */
#define TASK_RUNNING                        0x00000000
#define TASK_INTERRUPTIBLE                0x00000001
#define TASK_UNINTERRUPTIBLE                0x00000002
#define __TASK_STOPPED                        0x00000004
#define __TASK_TRACED                        0x00000008
/* Used in tsk->exit_state: */
#define EXIT_DEAD                        0x00000010
#define EXIT_ZOMBIE                        0x00000020
/* Both exit_state bits set at once: */
#define EXIT_TRACE                        (EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->__state again: */
#define TASK_PARKED                        0x00000040
#define TASK_DEAD                        0x00000080
#define TASK_WAKEKILL                        0x00000100
#define TASK_WAKING                        0x00000200
#define TASK_NOLOAD                        0x00000400
#define TASK_NEW                        0x00000800
#define TASK_RTLOCK_WAIT                0x00001000
#define TASK_FREEZABLE                        0x00002000
/* Multiplied by IS_ENABLED(), so this is 0 unless CONFIG_LOCKDEP is enabled: */
#define __TASK_FREEZABLE_UNSAFE               (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP))
#define TASK_FROZEN                        0x00008000
/* First bit above all of the state bits defined here: */
#define TASK_STATE_MAX                        0x00010000

/* Mask with every bit below TASK_STATE_MAX set: */
#define TASK_ANY                        (TASK_STATE_MAX-1)

/*
 * DO NOT ADD ANY NEW USERS !
 */
#define TASK_FREEZABLE_UNSAFE                (__TASK_FREEZABLE_UNSAFE | TASK_FREEZABLE)

/* Composite states, handy as arguments to set_current_state(): */
#define TASK_KILLABLE                        (TASK_UNINTERRUPTIBLE | TASK_WAKEKILL)
#define TASK_STOPPED                        (__TASK_STOPPED | TASK_WAKEKILL)
#define TASK_TRACED                        __TASK_TRACED

#define TASK_IDLE                        (TASK_NOLOAD | TASK_UNINTERRUPTIBLE)

/* Composite mask, handy as an argument to wake_up(): */
#define TASK_NORMAL                        (TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE)

/* State bits reported by get_task_state(): */
#define TASK_REPORT                        (TASK_RUNNING                | \
                                         TASK_INTERRUPTIBLE        | \
                                         TASK_UNINTERRUPTIBLE        | \
                                         __TASK_STOPPED                | \
                                         __TASK_TRACED                | \
                                         EXIT_DEAD                | \
                                         EXIT_ZOMBIE                | \
                                         TASK_PARKED)

/*
 * Task state predicates.
 *
 * task_is_running() compares a single read of (task)->__state against
 * TASK_RUNNING; the other three test JOBCTL_* bits in a single read of
 * (task)->jobctl. Each evaluates to 0 or 1.
 *
 * The @task argument is parenthesized everywhere so that an expression
 * argument (e.g. "p + 1" or "cond ? a : b") binds before the "->".
 */
#define task_is_running(task)                (READ_ONCE((task)->__state) == TASK_RUNNING)

#define task_is_traced(task)                ((READ_ONCE((task)->jobctl) & JOBCTL_TRACED) != 0)
#define task_is_stopped(task)                ((READ_ONCE((task)->jobctl) & JOBCTL_STOPPED) != 0)
#define task_is_stopped_or_traced(task)        ((READ_ONCE((task)->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0)

/*
 * Special states are those that do not use the normal wait-loop pattern. See
 * the comment with set_special_state().
 */
#define is_special_task_state(state)                                \
        ((state) & (TASK_DEAD | TASK_PARKED | __TASK_TRACED | __TASK_STOPPED))

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
/*
 * Debug hook for regular state changes: warns once if @state_value is a
 * special state, then stores _THIS_IP_ in current->task_state_change.
 */
# define debug_normal_state_change(state_value)                                \
        do {                                                                \
                WARN_ON_ONCE(is_special_task_state(state_value));        \
                current->task_state_change = _THIS_IP_;                        \
        } while (0)

/*
 * Debug hook for special state changes: the inverse check, it warns once if
 * @state_value is NOT a special state, then stores _THIS_IP_ in
 * current->task_state_change.
 */
# define debug_special_state_change(state_value)                        \
        do {                                                                \
                WARN_ON_ONCE(!is_special_task_state(state_value));        \
                current->task_state_change = _THIS_IP_;                        \
        } while (0)

/*
 * Stash the previous task_state_change in saved_state_change before
 * overwriting it with _THIS_IP_.
 */
# define debug_rtlock_wait_set_state()                                        \
        do {                                                                 \
                current->saved_state_change = current->task_state_change;\
                current->task_state_change = _THIS_IP_;                         \
        } while (0)

/* Put back the value stashed by debug_rtlock_wait_set_state(). */
# define debug_rtlock_wait_restore_state()                                \
        do {                                                                 \
                current->task_state_change = current->saved_state_change;\
        } while (0)

#else
/* Without CONFIG_DEBUG_ATOMIC_SLEEP all four hooks expand to empty statements. */
# define debug_normal_state_change(cond)        do { } while (0)
# define debug_special_state_change(cond)        do { } while (0)
# define debug_rtlock_wait_set_state()                do { } while (0)
# define debug_rtlock_wait_restore_state()        do { } while (0)
#endif

/*
 * set_current_state() includes a barrier so that the write of current->__state
 * is correctly serialised wrt the caller's subsequent test of whether to
 * actually sleep:
 *
 *   for (;;) {
 *        set_current_state(TASK_UNINTERRUPTIBLE);
 *        if (CONDITION)
 *           break;
 *
 *        schedule();
 *   }
 *   __set_current_state(TASK_RUNNING);
 *
 * If the caller does not need such serialisation (because, for instance, the
 * CONDITION test and condition change and wakeup are under the same lock) then
 * use __set_current_state().
 *
 * The above is typically ordered against the wakeup, which does:
 *
 *   CONDITION = 1;
 *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
 *
 * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
 * accessing p->__state.
 *
 * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is,
 * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
 * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
 *
 * However, with slightly different timing the wakeup TASK_RUNNING store can
 * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
 * a problem either because that will result in one extra go around the loop
 * and our CONDITION test will save the day.
 *
 * Also see the comments of try_to_wake_up().
 */
/*
 * Runs the debug hook for normal state changes, then stores @state_value
 * into current->__state with a plain WRITE_ONCE().
 */
#define __set_current_state(state_value)                                \
        do {                                                                \
                debug_normal_state_change((state_value));                \
                WRITE_ONCE(current->__state, (state_value));                \
        } while (0)

/*
 * Same debug hook, but the store to current->__state is done with
 * smp_store_mb() instead of WRITE_ONCE().
 */
#define set_current_state(state_value)                                        \
        do {                                                                \
                debug_normal_state_change((state_value));                \
                smp_store_mb(current->__state, (state_value));                \
        } while (0)

/*
 * set_special_state() should be used for those states when the blocking task
 * can not use the regular condition based wait-loop. In that case we must
 * serialize against wakeups such that any possible in-flight TASK_RUNNING
 * stores will not collide with our state change.
 */
#define set_special_state(state_value)                                        \
        do {                                                                \
                unsigned long flags; /* may shadow */                        \
                                                                        \
                raw_spin_lock_irqsave(&current->pi_lock, flags);        \
                debug_special_state_change((state_value));                \
                WRITE_ONCE(current->__state, (state_value));                \
                raw_spin_unlock_irqrestore(&current->pi_lock, flags);        \
        } while (0)

/*
 * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
 *
 * RT's spin/rwlock substitutions are state preserving. The state of the
 * task when blocking on the lock is saved in task_struct::saved_state and
 * restored after the lock has been acquired.  These operations are
 * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
 * lock related wakeups while the task is blocked on the lock are
 * redirected to operate on task_struct::saved_state to ensure that these
 * are not dropped. On restore task_struct::saved_state is set to
 * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
 *
 * The lock operation looks like this:
 *
 *        current_save_and_set_rtlock_wait_state();
 *        for (;;) {
 *                if (try_lock())
 *                        break;
 *                raw_spin_unlock_irq(&lock->wait_lock);
 *                schedule_rtlock();
 *                raw_spin_lock_irq(&lock->wait_lock);
 *                set_current_state(TASK_RTLOCK_WAIT);
 *        }
 *        current_restore_rtlock_saved_state();
 */
/*
 * Save current->__state into current->saved_state and switch the task to
 * TASK_RTLOCK_WAIT, all under current->pi_lock.
 *
 * Asserts (via lockdep) that interrupts are disabled, which is why the plain
 * raw_spin_lock()/raw_spin_unlock() variants are used.
 *
 * The expansion deliberately has no trailing semicolon: the caller supplies
 * it, so the macro behaves as one statement and is safe in an unbraced
 * if/else.
 */
#define current_save_and_set_rtlock_wait_state()                        \
        do {                                                                \
                lockdep_assert_irqs_disabled();                                \
                raw_spin_lock(&current->pi_lock);                        \
                current->saved_state = current->__state;                \
                debug_rtlock_wait_set_state();                                \
                WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);                \
                raw_spin_unlock(&current->pi_lock);                        \
        } while (0)

/*
 * Counterpart of current_save_and_set_rtlock_wait_state(): under
 * current->pi_lock, write current->saved_state back into current->__state
 * and reset saved_state to TASK_RUNNING.
 *
 * Also asserts that interrupts are disabled, and likewise expands without a
 * trailing semicolon.
 */
#define current_restore_rtlock_saved_state()                                \
        do {                                                                \
                lockdep_assert_irqs_disabled();                                \
                raw_spin_lock(&current->pi_lock);                        \
                debug_rtlock_wait_restore_state();                        \
                WRITE_ONCE(current->__state, current->saved_state);        \
                current->saved_state = TASK_RUNNING;                        \
                raw_spin_unlock(&current->pi_lock);                        \
        } while (0)

/* Single READ_ONCE() load of current->__state. */
#define get_current_state()        READ_ONCE(current->__state)

/*
 * Define the task command name length as enum, then it can be visible to
 * BPF programs.
 */
enum {
        /* Task command name length: */
        TASK_COMM_LEN = 16
};

extern void sched_tick(void);

#define        MAX_SCHEDULE_TIMEOUT                LONG_MAX

extern long schedule_timeout(long timeout);
extern long schedule_timeout_interruptible(long timeout);
extern long schedule_timeout_killable(long timeout);
extern long schedule_timeout_uninterruptible(long timeout);
extern long schedule_timeout_idle(long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);
#ifdef CONFIG_PREEMPT_RT
 extern void schedule_rtlock(void);
#endif

extern int __must_check io_schedule_prepare(void);
extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);

/**
 * struct prev_cputime - snapshot of system and user cputime
 * @utime: time spent in user mode
 * @stime: time spent in system mode
 * @lock: protects the above two fields
 *
 * Stores previous user/system time values such that we can guarantee
 * monotonicity.
 */
struct prev_cputime {
/*
 * All members are compiled out when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is
 * set, leaving an empty struct.
 */
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        u64                                utime;
        u64                                stime;
        raw_spinlock_t                        lock;
#endif
};

/* Per-task vtime accounting state; stored in struct vtime::state. */
enum vtime_state {
        /* Task is sleeping or running in a CPU with VTIME inactive: */
        VTIME_INACTIVE = 0,
        /* Task is idle: */
        VTIME_IDLE,
        /* Task runs in kernelspace in a CPU with VTIME active: */
        VTIME_SYS,
        /* Task runs in userspace in a CPU with VTIME active: */
        VTIME_USER,
        /* Task runs as a guest in a CPU with VTIME active: */
        VTIME_GUEST,
};

/*
 * vtime bookkeeping record.
 *
 * NOTE(review): the per-field notes below are inferred from the member names
 * and from enum vtime_state; confirm against the vtime accounting code.
 */
struct vtime {
        seqcount_t                seqcount;        /* presumably guards the fields below */
        unsigned long long        starttime;
        enum vtime_state        state;                /* one of the VTIME_* values */
        unsigned int                cpu;
        u64                        utime;                /* presumably user time */
        u64                        stime;                /* presumably system time */
        u64                        gtime;                /* presumably guest time */
};

/*
 * Utilization clamp constraints.
 * @UCLAMP_MIN:        Minimum utilization
 * @UCLAMP_MAX:        Maximum utilization
 * @UCLAMP_CNT:        Utilization clamp constraints count
 */
enum uclamp_id {
        UCLAMP_MIN = 0,                /* minimum utilization */
        UCLAMP_MAX,                /* maximum utilization */
        UCLAMP_CNT,                /* number of clamp constraints; keep last */
};

#ifdef CONFIG_SMP
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
#endif

/* Scheduling parameters: a single priority value. */
struct sched_param {
        int sched_priority;
};

/*
 * Per-task scheduling bookkeeping (embedded in task_struct as ->sched_info).
 * Every member is compiled out, leaving an empty struct, unless
 * CONFIG_SCHED_INFO is set.
 */
struct sched_info {
#ifdef CONFIG_SCHED_INFO
        /* Cumulative counters: */

        /* # of times we have run on this CPU: */
        unsigned long                        pcount;

        /* Time spent waiting on a runqueue: */
        unsigned long long                run_delay;

        /* Timestamps: */

        /* When did we last run on a CPU? */
        unsigned long long                last_arrival;

        /* When were we last queued to run? */
        unsigned long long                last_queued;

#endif /* CONFIG_SCHED_INFO */
};

/*
 * Integer metrics need fixed point arithmetic, e.g., sched/fair
 * has a few: load, load_avg, util_avg, freq, and capacity.
 *
 * We define a basic fixed point arithmetic range, and then formalize
 * all these metrics based on that basic range.
 */
/* Number of fractional bits, and the value that represents 1.0 (1024): */
#define SCHED_FIXEDPOINT_SHIFT                10
#define SCHED_FIXEDPOINT_SCALE                (1L << SCHED_FIXEDPOINT_SHIFT)

/* cpu_capacity calculations use the same fixed point resolution: */
#define SCHED_CAPACITY_SHIFT                SCHED_FIXEDPOINT_SHIFT
#define SCHED_CAPACITY_SCALE                (1L << SCHED_CAPACITY_SHIFT)

/*
 * Load weight of a scheduling entity (see struct sched_entity::load).
 *
 * NOTE(review): inv_weight looks like a cached inverse of @weight, judging by
 * its name only; confirm against the code that fills it in.
 */
struct load_weight {
        unsigned long                        weight;
        u32                                inv_weight;
};

/*
 * The load/runnable/util_avg accumulates an infinite geometric series
 * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
 *
 * [load_avg definition]
 *
 *   load_avg = runnable% * scale_load_down(load)
 *
 * [runnable_avg definition]
 *
 *   runnable_avg = runnable% * SCHED_CAPACITY_SCALE
 *
 * [util_avg definition]
 *
 *   util_avg = running% * SCHED_CAPACITY_SCALE
 *
 * where runnable% is the time ratio that a sched_entity is runnable and
 * running% the time ratio that a sched_entity is running.
 *
 * For cfs_rq, they are the aggregated values of all runnable and blocked
 * sched_entities.
 *
 * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
 * capacity scaling. The scaling is done through the rq_clock_pelt that is used
 * for computing those signals (see update_rq_clock_pelt())
 *
 * N.B., the above ratios (runnable% and running%) themselves are in the
 * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
 * to as large a range as necessary. This is for example reflected by
 * util_avg's SCHED_CAPACITY_SCALE.
 *
 * [Overflow issue]
 *
 * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
 * with the highest load (=88761), always runnable on a single cfs_rq,
 * and should not overflow as the number already hits PID_MAX_LIMIT.
 *
 * For all other cases (including 32-bit kernels), struct load_weight's
 * weight will overflow first before we do, because:
 *
 *    Max(load_avg) <= Max(load.weight)
 *
 * Then it is the load_weight's responsibility to consider overflow
 * issues.
 */
/*
 * Aligned to a cache line (____cacheline_aligned); struct sched_entity embeds
 * it as ->avg when CONFIG_SMP is set.
 */
struct sched_avg {
        u64                                last_update_time;
        u64                                load_sum;
        u64                                runnable_sum;
        u32                                util_sum;
        u32                                period_contrib;
        unsigned long                        load_avg;
        unsigned long                        runnable_avg;
        unsigned long                        util_avg;
        /* The top bit may carry the UTIL_AVG_UNCHANGED flag: */
        unsigned int                        util_est;
} ____cacheline_aligned;

/*
 * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
 * updates. When a task is dequeued, its util_est should not be updated if its
 * util_avg has not been updated in the meantime.
 * This information is mapped into the MSB bit of util_est at dequeue time.
 * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
 * it is safe to use MSB.
 */
/* NOTE(review): presumably the shift used when weighting util_est samples; confirm at the users. */
#define UTIL_EST_WEIGHT_SHIFT                2
/* Bit 31 only, i.e. the top bit of a 32-bit value: */
#define UTIL_AVG_UNCHANGED                0x80000000

/*
 * Per-task scheduler statistics (embedded in task_struct as ->stats).
 *
 * Every member is compiled out unless CONFIG_SCHEDSTATS is set;
 * core_forceidle_sum additionally requires CONFIG_SCHED_CORE. The struct is
 * cache line aligned either way.
 */
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
        /* Wait and iowait accounting: */
        u64                                wait_start;
        u64                                wait_max;
        u64                                wait_count;
        u64                                wait_sum;
        u64                                iowait_count;
        u64                                iowait_sum;

        /* Sleep accounting: */
        u64                                sleep_start;
        u64                                sleep_max;
        s64                                sum_sleep_runtime;

        /* Block accounting: */
        u64                                block_start;
        u64                                block_max;
        s64                                sum_block_runtime;

        s64                                exec_max;
        u64                                slice_max;

        /* Migration counters: */
        u64                                nr_migrations_cold;
        u64                                nr_failed_migrations_affine;
        u64                                nr_failed_migrations_running;
        u64                                nr_failed_migrations_hot;
        u64                                nr_forced_migrations;

        /* Wakeup counters: */
        u64                                nr_wakeups;
        u64                                nr_wakeups_sync;
        u64                                nr_wakeups_migrate;
        u64                                nr_wakeups_local;
        u64                                nr_wakeups_remote;
        u64                                nr_wakeups_affine;
        u64                                nr_wakeups_affine_attempts;
        u64                                nr_wakeups_passive;
        u64                                nr_wakeups_idle;

#ifdef CONFIG_SCHED_CORE
        u64                                core_forceidle_sum;
#endif
#endif /* CONFIG_SCHEDSTATS */
} ____cacheline_aligned;

/*
 * Scheduling entity (embedded in task_struct as ->se).
 *
 * With CONFIG_FAIR_GROUP_SCHED an entity also carries its depth, a parent
 * entity and two cfs_rq pointers; with CONFIG_SMP it embeds the per-entity
 * load tracking data in ->avg.
 */
struct sched_entity {
        /* For load-balancing: */
        struct load_weight                load;
        struct rb_node                        run_node;
        u64                                deadline;
        u64                                min_vruntime;

        struct list_head                group_node;
        unsigned int                        on_rq;

        /* Runtime accounting: */
        u64                                exec_start;
        u64                                sum_exec_runtime;
        u64                                prev_sum_exec_runtime;
        u64                                vruntime;
        s64                                vlag;
        u64                                slice;

        u64                                nr_migrations;

#ifdef CONFIG_FAIR_GROUP_SCHED
        int                                depth;
        struct sched_entity                *parent;
        /* rq on which this entity is (to be) queued: */
        struct cfs_rq                        *cfs_rq;
        /* rq "owned" by this entity/group: */
        struct cfs_rq                        *my_q;
        /* cached value of my_q->h_nr_running */
        unsigned long                        runnable_weight;
#endif

#ifdef CONFIG_SMP
        /*
         * Per entity load average tracking.
         *
         * Put into separate cache line so it does not
         * collide with read-mostly values above.
         */
        struct sched_avg                avg;
#endif
};

/*
 * Second scheduling entity type (embedded in task_struct as ->rt).
 *
 * The parent/rt_rq/my_q members only exist with CONFIG_RT_GROUP_SCHED.
 * Member order is not fixed: the struct is marked __randomize_layout.
 */
struct sched_rt_entity {
        struct list_head                run_list;
        unsigned long                        timeout;
        unsigned long                        watchdog_stamp;
        unsigned int                        time_slice;
        unsigned short                        on_rq;
        unsigned short                        on_list;

        struct sched_rt_entity                *back;
#ifdef CONFIG_RT_GROUP_SCHED
        struct sched_rt_entity                *parent;
        /* rq on which this entity is (to be) queued: */
        struct rt_rq                        *rt_rq;
        /* rq "owned" by this entity/group: */
        struct rt_rq                        *my_q;
#endif
} __randomize_layout;

/*
 * Callback types stored in struct sched_dl_entity (->server_has_tasks and
 * ->server_pick). Both take the sched_dl_entity; the first returns a bool,
 * the second a task_struct pointer.
 */
typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);

/*
 * Deadline scheduling entity. task_struct embeds one as ->dl and also holds
 * a pointer to one in ->dl_server.
 */
struct sched_dl_entity {
        struct rb_node                        rb_node;

        /*
         * Original scheduling parameters. Copied here from sched_attr
         * during sched_setattr(), they will remain the same until
         * the next sched_setattr().
         */
        u64                                dl_runtime;        /* Maximum runtime for each instance        */
        u64                                dl_deadline;        /* Relative deadline of each instance        */
        u64                                dl_period;        /* Separation of two instances (period) */
        u64                                dl_bw;                /* dl_runtime / dl_period                */
        u64                                dl_density;        /* dl_runtime / dl_deadline                */

        /*
         * Actual scheduling parameters. Initialized with the values above,
         * they are continuously updated during task execution. Note that
         * the remaining runtime could be < 0 in case we are in overrun.
         */
        s64                                runtime;        /* Remaining runtime for this instance        */
        u64                                deadline;        /* Absolute deadline for this instance        */
        unsigned int                        flags;                /* Specifying the scheduler behaviour        */

        /*
         * Some bool flags:
         *
         * @dl_throttled tells if we exhausted the runtime. If so, the
         * task has to wait for a replenishment to be performed at the
         * next firing of dl_timer.
         *
         * @dl_yielded tells if task gave up the CPU before consuming
         * all its available runtime during the last job.
         *
         * @dl_non_contending tells if the task is inactive while still
         * contributing to the active utilization. In other words, it
         * indicates if the inactive timer has been armed and its handler
         * has not been executed yet. This flag is useful to avoid race
         * conditions between the inactive timer handler and the wakeup
         * code.
         *
         * @dl_overrun tells if the task asked to be informed about runtime
         * overruns.
         *
         * @dl_server: NOTE(review): presumably marks this entity as a
         * DL server (see the DL-server members below) -- confirm.
         */
        unsigned int                        dl_throttled      : 1;
        unsigned int                        dl_yielded        : 1;
        unsigned int                        dl_non_contending : 1;
        unsigned int                        dl_overrun          : 1;
        unsigned int                        dl_server         : 1;

        /*
         * Bandwidth enforcement timer. Each -deadline task has its
         * own bandwidth to be enforced, thus we need one timer per task.
         */
        struct hrtimer                        dl_timer;

        /*
         * Inactive timer, responsible for decreasing the active utilization
         * at the "0-lag time". When a -deadline task blocks, it contributes
         * to GRUB's active utilization until the "0-lag time", hence a
         * timer is needed to decrease the active utilization at the correct
         * time.
         */
        struct hrtimer                        inactive_timer;

        /*
         * Bits for DL-server functionality. Also see the comment near
         * dl_server_update().
         *
         * @rq the runqueue this server is for
         *
         * @server_has_tasks() returns true if @server_pick() would return
         * a runnable task.
         */
        struct rq                        *rq;
        dl_server_has_tasks_f                server_has_tasks;
        dl_server_pick_f                server_pick;

#ifdef CONFIG_RT_MUTEXES
        /*
         * Priority Inheritance. When a DEADLINE scheduling entity is boosted
         * pi_se points to the donor, otherwise points to the dl_se it belongs
         * to (the original one/itself).
         */
        struct sched_dl_entity *pi_se;
#endif
};

#ifdef CONFIG_UCLAMP_TASK
/* Number of utilization clamp buckets (shorter alias) */
#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT

/*
 * Utilization clamp for a scheduling entity
 * @value:                clamp value "assigned" to a se
 * @bucket_id:                bucket index corresponding to the "assigned" value
 * @active:                the se is currently refcounted in a rq's bucket
 * @user_defined:        the requested clamp value comes from user-space
 *
 * The bucket_id is the index of the clamp bucket matching the clamp value
 * which is pre-computed and stored to avoid expensive integer divisions from
 * the fast path.
 *
 * The active bit is set whenever a task has got an "effective" value assigned,
 * which can be different from the clamp value "requested" from user-space.
 * This makes it possible to know whether a task is refcounted in the rq's
 * bucket corresponding to the "effective" bucket_id.
 *
 * The user_defined bit is set whenever a task has got a task-specific clamp
 * value requested from userspace, i.e. the system defaults apply to this task
 * just as a restriction. This makes it possible to relax default clamps when
 * a less restrictive task-specific value has been requested, thus allowing a
 * "nice" semantic to be implemented. For example, a task running with a 20%
 * default boost can still drop its own boosting to 0%.
 *
 * The two multi-bit fields are sized with bits_per() from the largest value
 * each must hold (SCHED_CAPACITY_SCALE and UCLAMP_BUCKETS respectively).
 */
struct uclamp_se {
        unsigned int value                : bits_per(SCHED_CAPACITY_SCALE);
        unsigned int bucket_id                : bits_per(UCLAMP_BUCKETS);
        unsigned int active                : 1;
        unsigned int user_defined        : 1;
};
#endif /* CONFIG_UCLAMP_TASK */

/*
 * Four one-byte flags overlaid with a single u32, so that all of them can be
 * accessed together through ->s or individually through ->b.
 */
union rcu_special {
        struct {
                u8                        blocked;
                u8                        need_qs;
                u8                        exp_hint; /* Hint for performance. */
                u8                        need_mb; /* Readers need smp_mb(). */
        } b; /* Bits. */
        u32 s; /* Set of bits. */
};

/*
 * Valid contexts are numbered from 0, so perf_nr_task_contexts ends up equal
 * to their count (2); perf_invalid_context is the only negative value.
 */
enum perf_event_task_context {
        perf_invalid_context = -1,
        perf_hw_context = 0,
        perf_sw_context,
        perf_nr_task_contexts,
};

/* Singly linked list node: holds nothing but the pointer to the next node. */
struct wake_q_node {
        struct wake_q_node *next;
};

/*
 * An index plus an array of KM_MAX_IDX PTE values.
 * Empty unless CONFIG_KMAP_LOCAL is set.
 */
struct kmap_ctrl {
#ifdef CONFIG_KMAP_LOCAL
        int                                idx;
        pte_t                                pteval[KM_MAX_IDX];
#endif
};

struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /*
         * For reasons of header soup (see current_thread_info()), this
         * must be the first element of task_struct.
         */
        struct thread_info                thread_info;
#endif
        unsigned int                        __state;

        /* saved state for "spinlock sleepers" */
        unsigned int                        saved_state;

        /*
         * This begins the randomizable portion of task_struct. Only
         * scheduling-critical items should be added above here.
         */
        randomized_struct_fields_start

        void                                *stack;
        refcount_t                        usage;
        /* Per task flags (PF_*), defined further below: */
        unsigned int                        flags;
        unsigned int                        ptrace;

#ifdef CONFIG_MEM_ALLOC_PROFILING
        struct alloc_tag                *alloc_tag;
#endif

#ifdef CONFIG_SMP
        int                                on_cpu;
        struct __call_single_node        wake_entry;
        unsigned int                        wakee_flips;
        unsigned long                        wakee_flip_decay_ts;
        struct task_struct                *last_wakee;

        /*
         * recent_used_cpu is initially set as the last CPU used by a task
         * that wakes affine another task. Waker/wakee relationships can
         * push tasks around a CPU where each wakeup moves to the next one.
         * Tracking a recently used CPU allows a quick search for a recently
         * used CPU that may be idle.
         */
        int                                recent_used_cpu;
        int                                wake_cpu;
#endif
        int                                on_rq;

        int                                prio;
        int                                static_prio;
        int                                normal_prio;
        unsigned int                        rt_priority;

        struct sched_entity                se;
        struct sched_rt_entity                rt;
        struct sched_dl_entity                dl;
        struct sched_dl_entity                *dl_server;
        const struct sched_class        *sched_class;

#ifdef CONFIG_SCHED_CORE
        struct rb_node                        core_node;
        unsigned long                        core_cookie;
        unsigned int                        core_occupation;
#endif

#ifdef CONFIG_CGROUP_SCHED
        struct task_group                *sched_task_group;
#endif


#ifdef CONFIG_UCLAMP_TASK
        /*
         * Clamp values requested for a scheduling entity.
         * Must be updated with task_rq_lock() held.
         */
        struct uclamp_se                uclamp_req[UCLAMP_CNT];
        /*
         * Effective clamp values used for a scheduling entity.
         * Must be updated with task_rq_lock() held.
         */
        struct uclamp_se                uclamp[UCLAMP_CNT];
#endif

        struct sched_statistics         stats;

#ifdef CONFIG_PREEMPT_NOTIFIERS
        /* List of struct preempt_notifier: */
        struct hlist_head                preempt_notifiers;
#endif

#ifdef CONFIG_BLK_DEV_IO_TRACE
        unsigned int                        btrace_seq;
#endif

        unsigned int                        policy;
        unsigned long                        max_allowed_capacity;
        int                                nr_cpus_allowed;
        const cpumask_t                        *cpus_ptr;
        cpumask_t                        *user_cpus_ptr;
        cpumask_t                        cpus_mask;
        void                                *migration_pending;
#ifdef CONFIG_SMP
        unsigned short                        migration_disabled;
#endif
        unsigned short                        migration_flags;

#ifdef CONFIG_PREEMPT_RCU
        int                                rcu_read_lock_nesting;
        union rcu_special                rcu_read_unlock_special;
        struct list_head                rcu_node_entry;
        struct rcu_node                        *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_TASKS_RCU
        unsigned long                        rcu_tasks_nvcsw;
        u8                                rcu_tasks_holdout;
        u8                                rcu_tasks_idx;
        int                                rcu_tasks_idle_cpu;
        struct list_head                rcu_tasks_holdout_list;
        int                                rcu_tasks_exit_cpu;
        struct list_head                rcu_tasks_exit_list;
#endif /* #ifdef CONFIG_TASKS_RCU */

#ifdef CONFIG_TASKS_TRACE_RCU
        int                                trc_reader_nesting;
        int                                trc_ipi_to_cpu;
        union rcu_special                trc_reader_special;
        struct list_head                trc_holdout_list;
        struct list_head                trc_blkd_node;
        int                                trc_blkd_cpu;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */

        struct sched_info                sched_info;

        struct list_head                tasks;
#ifdef CONFIG_SMP
        struct plist_node                pushable_tasks;
        struct rb_node                        pushable_dl_tasks;
#endif

        struct mm_struct                *mm;
        struct mm_struct                *active_mm;
        struct address_space                *faults_disabled_mapping;

        int                                exit_state;
        int                                exit_code;
        int                                exit_signal;
        /* The signal sent when the parent dies: */
        int                                pdeath_signal;
        /* JOBCTL_*, siglock protected: */
        unsigned long                        jobctl;

        /* Used for emulating ABI behavior of previous Linux versions: */
        unsigned int                        personality;

        /* Scheduler bits, serialized by scheduler locks: */
        unsigned                        sched_reset_on_fork:1;
        unsigned                        sched_contributes_to_load:1;
        unsigned                        sched_migrated:1;

        /* Force alignment to the next boundary: */
        unsigned                        :0;

        /* Unserialized, strictly 'current' */

        /*
         * This field must not be in the scheduler word above due to wakelist
         * queueing no longer being serialized by p->on_cpu. However:
         *
         * p->XXX = X;                        ttwu()
         * schedule()                          if (p->on_rq && ..) // false
         *   smp_mb__after_spinlock();          if (smp_load_acquire(&p->on_cpu) && //true
         *   deactivate_task()                      ttwu_queue_wakelist())
         *     p->on_rq = 0;                        p->sched_remote_wakeup = Y;
         *
         * guarantees all stores of 'current' are visible before
         * ->sched_remote_wakeup gets used, so it can be in this word.
         */
        unsigned                        sched_remote_wakeup:1;
#ifdef CONFIG_RT_MUTEXES
        unsigned                        sched_rt_mutex:1;
#endif

        /* Bit to tell TOMOYO we're in execve(): */
        unsigned                        in_execve:1;
        unsigned                        in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
        unsigned                        restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
        unsigned                        in_user_fault:1;
#endif
#ifdef CONFIG_LRU_GEN
        /* whether the LRU algorithm may apply to this access */
        unsigned                        in_lru_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
        unsigned                        brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
        /* disallow userland-initiated cgroup migration */
        unsigned                        no_cgroup_migration:1;
        /* task is frozen/stopped (used by the cgroup freezer) */
        unsigned                        frozen:1;
#endif
#ifdef CONFIG_BLK_CGROUP
        unsigned                        use_memdelay:1;
#endif
#ifdef CONFIG_PSI
        /* Stalled due to lack of memory */
        unsigned                        in_memstall:1;
#endif
#ifdef CONFIG_PAGE_OWNER
        /* Used by page_owner=on to detect recursion in page tracking. */
        unsigned                        in_page_owner:1;
#endif
#ifdef CONFIG_EVENTFD
        /* Recursion prevention for eventfd_signal() */
        unsigned                        in_eventfd:1;
#endif
#ifdef CONFIG_ARCH_HAS_CPU_PASID
        unsigned                        pasid_activated:1;
#endif
#ifdef        CONFIG_CPU_SUP_INTEL
        unsigned                        reported_split_lock:1;
#endif
#ifdef CONFIG_TASK_DELAY_ACCT
        /* delay due to memory thrashing */
        unsigned                        in_thrashing:1;
#endif

        unsigned long                        atomic_flags; /* Flags requiring atomic access. */

        struct restart_block                restart_block;

        pid_t                                pid;
        pid_t                                tgid;

#ifdef CONFIG_STACKPROTECTOR
        /* Canary value for the -fstack-protector GCC feature: */
        unsigned long                        stack_canary;
#endif
        /*
         * Pointers to the (original) parent process, youngest child, younger sibling,
         * older sibling, respectively.  (p->father can be replaced with
         * p->real_parent->pid)
         */

        /* Real parent process: */
        struct task_struct __rcu        *real_parent;

        /* Recipient of SIGCHLD, wait4() reports: */
        struct task_struct __rcu        *parent;

        /*
         * Children/sibling form the list of natural children:
         */
        struct list_head                children;
        struct list_head                sibling;
        struct task_struct                *group_leader;

        /*
         * 'ptraced' is the list of tasks this task is using ptrace() on.
         *
         * This includes both natural children and PTRACE_ATTACH targets.
         * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
         */
        struct list_head                ptraced;
        struct list_head                ptrace_entry;

        /* PID/PID hash table linkage. */
        struct pid                        *thread_pid;
        struct hlist_node                pid_links[PIDTYPE_MAX];
        struct list_head                thread_node;

        struct completion                *vfork_done;

        /* CLONE_CHILD_SETTID: */
        int __user                        *set_child_tid;

        /* CLONE_CHILD_CLEARTID: */
        int __user                        *clear_child_tid;

        /* PF_KTHREAD | PF_IO_WORKER */
        void                                *worker_private;

        u64                                utime;
        u64                                stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
        u64                                utimescaled;
        u64                                stimescaled;
#endif
        u64                                gtime;
        struct prev_cputime                prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        struct vtime                        vtime;
#endif

#ifdef CONFIG_NO_HZ_FULL
        atomic_t                        tick_dep_mask;
#endif
        /* Context switch counts: */
        unsigned long                        nvcsw;
        unsigned long                        nivcsw;

        /* Monotonic time in nsecs: */
        u64                                start_time;

        /* Boot based time in nsecs: */
        u64                                start_boottime;

        /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
        unsigned long                        min_flt;
        unsigned long                        maj_flt;

        /* Empty if CONFIG_POSIX_CPUTIMERS=n */
        struct posix_cputimers                posix_cputimers;

#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
        struct posix_cputimers_work        posix_cputimers_work;
#endif

        /* Process credentials: */

        /* Tracer's credentials at attach: */
        const struct cred __rcu                *ptracer_cred;

        /* Objective and real subjective task credentials (COW): */
        const struct cred __rcu                *real_cred;

        /* Effective (overridable) subjective task credentials (COW): */
        const struct cred __rcu                *cred;

#ifdef CONFIG_KEYS
        /* Cached requested key. */
        struct key                        *cached_requested_key;
#endif

        /*
         * executable name, excluding path.
         *
         * - normally initialized by setup_new_exec()
         * - access it with [gs]et_task_comm()
         * - lock it with task_lock()
         */
        char                                comm[TASK_COMM_LEN];

        struct nameidata                *nameidata;

#ifdef CONFIG_SYSVIPC
        struct sysv_sem                        sysvsem;
        struct sysv_shm                        sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
        unsigned long                        last_switch_count;
        unsigned long                        last_switch_time;
#endif
        /* Filesystem information: */
        struct fs_struct                *fs;

        /* Open file information: */
        struct files_struct                *files;

#ifdef CONFIG_IO_URING
        struct io_uring_task                *io_uring;
#endif

        /* Namespaces: */
        struct nsproxy                        *nsproxy;

        /* Signal handlers: */
        struct signal_struct                *signal;
        struct sighand_struct __rcu                *sighand;
        sigset_t                        blocked;
        sigset_t                        real_blocked;
        /* Restored if set_restore_sigmask() was used: */
        sigset_t                        saved_sigmask;
        struct sigpending                pending;
        unsigned long                        sas_ss_sp;
        size_t                                sas_ss_size;
        unsigned int                        sas_ss_flags;

        struct callback_head                *task_works;

#ifdef CONFIG_AUDIT
#ifdef CONFIG_AUDITSYSCALL
        struct audit_context                *audit_context;
#endif
        kuid_t                                loginuid;
        unsigned int                        sessionid;
#endif
        struct seccomp                        seccomp;
        struct syscall_user_dispatch        syscall_dispatch;

        /* Thread group tracking: */
        u64                                parent_exec_id;
        u64                                self_exec_id;

        /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
        spinlock_t                        alloc_lock;

        /* Protection of the PI data structures: */
        raw_spinlock_t                        pi_lock;

        struct wake_q_node                wake_q;

#ifdef CONFIG_RT_MUTEXES
        /* PI waiters blocked on a rt_mutex held by this task: */
        struct rb_root_cached                pi_waiters;
        /* Updated under owner's pi_lock and rq lock */
        struct task_struct                *pi_top_task;
        /* Deadlock detection and priority inheritance handling: */
        struct rt_mutex_waiter                *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
        /* Mutex deadlock detection: */
        struct mutex_waiter                *blocked_on;
#endif

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        int                                non_block_count;
#endif

#ifdef CONFIG_TRACE_IRQFLAGS
        struct irqtrace_events                irqtrace;
        unsigned int                        hardirq_threaded;
        u64                                hardirq_chain_key;
        int                                softirqs_enabled;
        int                                softirq_context;
        int                                irq_config;
#endif
#ifdef CONFIG_PREEMPT_RT
        int                                softirq_disable_cnt;
#endif

#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH                        48UL
        u64                                curr_chain_key;
        int                                lockdep_depth;
        unsigned int                        lockdep_recursion;
        struct held_lock                held_locks[MAX_LOCK_DEPTH];
#endif

#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
        unsigned int                        in_ubsan;
#endif

        /* Journalling filesystem info: */
        void                                *journal_info;

        /* Stacked block device info: */
        struct bio_list                        *bio_list;

        /* Stack plugging: */
        struct blk_plug                        *plug;

        /* VM state: */
        struct reclaim_state                *reclaim_state;

        struct io_context                *io_context;

#ifdef CONFIG_COMPACTION
        struct capture_control                *capture_control;
#endif
        /* Ptrace state: */
        unsigned long                        ptrace_message;
        kernel_siginfo_t                *last_siginfo;

        struct task_io_accounting        ioac;
#ifdef CONFIG_PSI
        /* Pressure stall state */
        unsigned int                        psi_flags;
#endif
#ifdef CONFIG_TASK_XACCT
        /* Accumulated RSS usage: */
        u64                                acct_rss_mem1;
        /* Accumulated virtual memory usage: */
        u64                                acct_vm_mem1;
        /* stime + utime since last update: */
        u64                                acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
        /* Protected by ->alloc_lock: */
        nodemask_t                        mems_allowed;
        /* Sequence number to catch updates: */
        seqcount_spinlock_t                mems_allowed_seq;
        int                                cpuset_mem_spread_rotor;
        int                                cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
        /* Control Group info protected by css_set_lock: */
        struct css_set __rcu                *cgroups;
        /* cg_list protected by css_set_lock and tsk->alloc_lock: */
        struct list_head                cg_list;
#endif
#ifdef CONFIG_X86_CPU_RESCTRL
        u32                                closid;
        u32                                rmid;
#endif
#ifdef CONFIG_FUTEX
        struct robust_list_head __user        *robust_list;
#ifdef CONFIG_COMPAT
        struct compat_robust_list_head __user *compat_robust_list;
#endif
        struct list_head                pi_state_list;
        struct futex_pi_state                *pi_state_cache;
        struct mutex                        futex_exit_mutex;
        unsigned int                        futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
        struct perf_event_context        *perf_event_ctxp;
        struct mutex                        perf_event_mutex;
        struct list_head                perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
        unsigned long                        preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
        /* Protected by alloc_lock: */
        struct mempolicy                *mempolicy;
        short                                il_prev;
        u8                                il_weight;
        short                                pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
        int                                numa_scan_seq;
        unsigned int                        numa_scan_period;
        unsigned int                        numa_scan_period_max;
        int                                numa_preferred_nid;
        unsigned long                        numa_migrate_retry;
        /* Migration stamp: */
        u64                                node_stamp;
        u64                                last_task_numa_placement;
        u64                                last_sum_exec_runtime;
        struct callback_head                numa_work;

        /*
         * This pointer is only modified for current in syscall and
         * pagefault context (and for tasks being destroyed), so it can be read
         * from any of the following contexts:
         *  - RCU read-side critical section
         *  - current->numa_group from everywhere
         *  - task's runqueue locked, task not running
         */
        struct numa_group __rcu                *numa_group;

        /*
         * numa_faults is an array split into four regions:
         * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
         * in this precise order.
         *
         * faults_memory: Exponential decaying average of faults on a per-node
         * basis. Scheduling placement decisions are made based on these
         * counts. The values remain static for the duration of a PTE scan.
         * faults_cpu: Track the nodes the process was running on when a NUMA
         * hinting fault was incurred.
         * faults_memory_buffer and faults_cpu_buffer: Record faults per node
         * during the current scan window. When the scan completes, the counts
         * in faults_memory and faults_cpu decay and these values are copied.
         */
        unsigned long                        *numa_faults;
        unsigned long                        total_numa_faults;

        /*
         * numa_faults_locality tracks if faults recorded during the last
         * scan window were remote/local or failed to migrate. The task scan
         * period is adapted based on the locality of the faults with different
         * weights depending on whether they were shared or private faults
         */
        unsigned long                        numa_faults_locality[3];

        unsigned long                        numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_RSEQ
        struct rseq __user *rseq;
        u32 rseq_len;
        u32 rseq_sig;
        /*
         * RmW on rseq_event_mask must be performed atomically
         * with respect to preemption.
         */
        unsigned long rseq_event_mask;
#endif

#ifdef CONFIG_SCHED_MM_CID
        int                                mm_cid;                /* Current cid in mm */
        int                                last_mm_cid;        /* Most recent cid in mm */
        int                                migrate_from_cpu;
        int                                mm_cid_active;        /* Whether cid bitmap is active */
        struct callback_head                cid_work;
#endif

        struct tlbflush_unmap_batch        tlb_ubc;

        /* Cache last used pipe for splice(): */
        struct pipe_inode_info                *splice_pipe;

        struct page_frag                task_frag;

#ifdef CONFIG_TASK_DELAY_ACCT
        struct task_delay_info                *delays;
#endif

#ifdef CONFIG_FAULT_INJECTION
        int                                make_it_fail;
        unsigned int                        fail_nth;
#endif
        /*
         * When (nr_dirtied >= nr_dirtied_pause), it's time to call
         * balance_dirty_pages() for a dirty throttling pause:
         */
        int                                nr_dirtied;
        int                                nr_dirtied_pause;
        /* Start of a write-and-pause period: */
        unsigned long                        dirty_paused_when;

#ifdef CONFIG_LATENCYTOP
        int                                latency_record_count;
        struct latency_record                latency_record[LT_SAVECOUNT];
#endif
        /*
         * Time slack values; these are used to round up poll() and
         * select() etc timeout values. These are in nanoseconds.
         */
        u64                                timer_slack_ns;
        u64                                default_timer_slack_ns;

#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
        unsigned int                        kasan_depth;
#endif

#ifdef CONFIG_KCSAN
        struct kcsan_ctx                kcsan_ctx;
#ifdef CONFIG_TRACE_IRQFLAGS
        struct irqtrace_events                kcsan_save_irqtrace;
#endif
#ifdef CONFIG_KCSAN_WEAK_MEMORY
        int                                kcsan_stack_depth;
#endif
#endif

#ifdef CONFIG_KMSAN
        struct kmsan_ctx                kmsan_ctx;
#endif

#if IS_ENABLED(CONFIG_KUNIT)
        struct kunit                        *kunit_test;
#endif

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        /* Index of current stored address in ret_stack: */
        int                                curr_ret_stack;
        int                                curr_ret_depth;

        /* Stack of return addresses for return function tracing: */
        struct ftrace_ret_stack                *ret_stack;

        /* Timestamp for last schedule: */
        unsigned long long                ftrace_timestamp;

        /*
         * Number of functions that haven't been traced
         * because of depth overrun:
         */
        atomic_t                        trace_overrun;

        /* Pause tracing: */
        atomic_t                        tracing_graph_pause;
#endif

#ifdef CONFIG_TRACING
        /* Bitmask and counter of trace recursion: */
        unsigned long                        trace_recursion;
#endif /* CONFIG_TRACING */

#ifdef CONFIG_KCOV
        /* See kernel/kcov.c for more details. */

        /* Coverage collection mode enabled for this task (0 if disabled): */
        unsigned int                        kcov_mode;

        /* Size of the kcov_area: */
        unsigned int                        kcov_size;

        /* Buffer for coverage collection: */
        void                                *kcov_area;

        /* KCOV descriptor wired with this task or NULL: */
        struct kcov                        *kcov;

        /* KCOV common handle for remote coverage collection: */
        u64                                kcov_handle;

        /* KCOV sequence number: */
        int                                kcov_sequence;

        /* Collect coverage from softirq context: */
        unsigned int                        kcov_softirq;
#endif

#ifdef CONFIG_MEMCG
        struct mem_cgroup                *memcg_in_oom;

        /* Number of pages to reclaim on returning to userland: */
        unsigned int                        memcg_nr_pages_over_high;

        /* Used by memcontrol for targeted memcg charge: */
        struct mem_cgroup                *active_memcg;
#endif

#ifdef CONFIG_MEMCG_KMEM
        struct obj_cgroup                *objcg;
#endif

#ifdef CONFIG_BLK_CGROUP
        struct gendisk                        *throttle_disk;
#endif

#ifdef CONFIG_UPROBES
        struct uprobe_task                *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
        unsigned int                        sequential_io;
        unsigned int                        sequential_io_avg;
#endif
        struct kmap_ctrl                kmap_ctrl;
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        unsigned long                        task_state_change;
# ifdef CONFIG_PREEMPT_RT
        unsigned long                        saved_state_change;
# endif
#endif
        struct rcu_head                        rcu;
        refcount_t                        rcu_users;
        int                                pagefault_disabled;
#ifdef CONFIG_MMU
        struct task_struct                *oom_reaper_list;
        struct timer_list                oom_reaper_timer;
#endif
#ifdef CONFIG_VMAP_STACK
        struct vm_struct                *stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /* A live task holds one reference: */
        refcount_t                        stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
        int patch_state;
#endif
#ifdef CONFIG_SECURITY
        /* Used by LSM modules for access restriction: */
        void                                *security;
#endif
#ifdef CONFIG_BPF_SYSCALL
        /* Used by BPF task local storage */
        struct bpf_local_storage __rcu        *bpf_storage;
        /* Used for BPF run context */
        struct bpf_run_ctx                *bpf_ctx;
#endif

#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
        unsigned long                        lowest_stack;
        unsigned long                        prev_lowest_stack;
#endif

#ifdef CONFIG_X86_MCE
        void __user                        *mce_vaddr;
        __u64                                mce_kflags;
        u64                                mce_addr;
        __u64                                mce_ripv : 1,
                                        mce_whole_page : 1,
                                        __mce_reserved : 62;
        struct callback_head                mce_kill_me;
        int                                mce_count;
#endif

#ifdef CONFIG_KRETPROBES
        struct llist_head               kretprobe_instances;
#endif
#ifdef CONFIG_RETHOOK
        struct llist_head               rethooks;
#endif

#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
        /*
         * If L1D flush is supported on mm context switch
         * then we use this callback head to queue kill work
         * to kill tasks that are not running on SMT disabled
         * cores
         */
        struct callback_head                l1d_flush_kill;
#endif

#ifdef CONFIG_RV
        /*
         * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS.
         * If we find justification for more monitors, we can think
         * about adding more or developing a dynamic method. So far,
         * none of these are justified.
         */
        union rv_task_monitor                rv[RV_PER_TASK_MONITORS];
#endif

#ifdef CONFIG_USER_EVENTS
        struct user_event_mm                *user_event_mm;
#endif

        /*
         * New fields for task_struct should be added above here, so that
         * they are included in the randomized portion of task_struct.
         */
        randomized_struct_fields_end

        /* CPU-specific state of this task: */
        struct thread_struct                thread;

        /*
         * WARNING: on x86, 'thread_struct' contains a variable-sized
         * structure.  It *MUST* be at the end of 'task_struct'.
         *
         * Do not put anything below here!
         */
};

/*
 * Extra reportable state values layered on top of TASK_REPORT:
 *
 * TASK_REPORT_IDLE is the value __task_state_index() substitutes for tasks
 * whose state has all TASK_IDLE bits set.
 *
 * TASK_REPORT_MAX is TASK_REPORT_IDLE shifted left by one. It is build-time
 * checked to be a power of two, and 1 + ilog2(TASK_REPORT_MAX) must equal
 * the number of state letters known to task_index_to_char().
 */
#define TASK_REPORT_IDLE        (TASK_REPORT + 1)
#define TASK_REPORT_MAX                (TASK_REPORT_IDLE << 1)

/*
 * Translate a task's state word and exit state into the index used for
 * state reporting. The result is fls() of the state value that was picked:
 *
 *  - TASK_RTLOCK_WAIT set:     reported as TASK_UNINTERRUPTIBLE
 *  - all TASK_IDLE bits set:   reported as TASK_REPORT_IDLE
 *  - otherwise:                (tsk_state | tsk_exit_state) & TASK_REPORT
 *
 * The checks are ordered so that TASK_RTLOCK_WAIT takes precedence over
 * TASK_IDLE, which in turn takes precedence over the masked state.
 */
static inline unsigned int __task_state_index(unsigned int tsk_state,
                                              unsigned int tsk_exit_state)
{
        BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);

        /*
         * This is a white lie: instead of exposing an entirely new task
         * state to userspace, make the task look as though it went through
         * an ordinary rt_mutex_lock() call.
         */
        if (tsk_state & TASK_RTLOCK_WAIT)
                return fls(TASK_UNINTERRUPTIBLE);

        if ((tsk_state & TASK_IDLE) == TASK_IDLE)
                return fls(TASK_REPORT_IDLE);

        return fls((tsk_state | tsk_exit_state) & TASK_REPORT);
}

/*
 * Report-state index of @tsk: reads tsk->__state once via READ_ONCE() and
 * combines it with tsk->exit_state through __task_state_index().
 */
static inline unsigned int task_state_index(struct task_struct *tsk)
{
        unsigned int idx;

        idx = __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state);

        return idx;
}

/*
 * Convert a report-state index into its one-letter representation.
 *
 * @state indexes the letter table directly; no range check is done at run
 * time. The build-time check only ties the table length to TASK_REPORT_MAX.
 */
static inline char task_index_to_char(unsigned int state)
{
        static const char letters[] = "RSDTtXZPI";

        /* One letter per index: the table must track TASK_REPORT_MAX. */
        BUILD_BUG_ON(sizeof(letters) - 1 != 1 + ilog2(TASK_REPORT_MAX));

        return letters[state];
}

/*
 * One-letter state of @tsk: task_state_index() followed by
 * task_index_to_char().
 */
static inline char task_state_to_char(struct task_struct *tsk)
{
        unsigned int idx = task_state_index(tsk);

        return task_index_to_char(idx);
}

/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably the pid notified on Ctrl-Alt-Del - confirm
 * against the definition before relying on this.
 */
extern struct pid *cad_pid;

/*
 * Per process flags
 *
 * Bit values for task_struct::flags (tested and modified as
 * current->flags / (child)->flags by the helpers further down).
 */
#define PF_VCPU                        0x00000001        /* I'm a virtual CPU */
#define PF_IDLE                        0x00000002        /* I am an IDLE thread */
#define PF_EXITING                0x00000004        /* Getting shut down */
#define PF_POSTCOREDUMP                0x00000008        /* Coredumps should ignore this task */
#define PF_IO_WORKER                0x00000010        /* Task is an IO worker */
#define PF_WQ_WORKER                0x00000020        /* I'm a workqueue worker */
#define PF_FORKNOEXEC                0x00000040        /* Forked but didn't exec */
#define PF_MCE_PROCESS                0x00000080      /* Process policy on mce errors */
#define PF_SUPERPRIV                0x00000100        /* Used super-user privileges */
#define PF_DUMPCORE                0x00000200        /* Dumped core */
#define PF_SIGNALED                0x00000400        /* Killed by a signal */
#define PF_MEMALLOC                0x00000800        /* Allocating memory to free memory. See memalloc_noreclaim_save() */
#define PF_NPROC_EXCEEDED        0x00001000        /* set_user() noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH                0x00002000        /* If unset the fpu must be initialized before use */
#define PF_USER_WORKER                0x00004000        /* Kernel thread cloned from userspace thread */
#define PF_NOFREEZE                0x00008000        /* This thread should not be frozen */
#define PF__HOLE__00010000        0x00010000
#define PF_KSWAPD                0x00020000        /* I am kswapd */
#define PF_MEMALLOC_NOFS        0x00040000        /* All allocations inherit GFP_NOFS. See memalloc_nofs_save() */
#define PF_MEMALLOC_NOIO        0x00080000        /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */
#define PF_LOCAL_THROTTLE        0x00100000        /* Throttle writes only against the bdi I write to,
                                                 * I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD                0x00200000        /* I am a kernel thread */
#define PF_RANDOMIZE                0x00400000        /* Randomize virtual address space */
#define PF_MEMALLOC_NORECLAIM        0x00800000        /* All allocation requests will clear __GFP_DIRECT_RECLAIM */
#define PF_MEMALLOC_NOWARN        0x01000000        /* All allocation requests will inherit __GFP_NOWARN */
#define PF__HOLE__02000000        0x02000000
#define PF_NO_SETAFFINITY        0x04000000        /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY                0x08000000      /* Early kill for mce process policy */
#define PF_MEMALLOC_PIN                0x10000000        /* Allocations constrained to zones which allow long term pinning.
                                                 * See memalloc_pin_save() */
#define PF_BLOCK_TS                0x20000000        /* plug has ts that needs updating */
#define PF__HOLE__40000000        0x40000000
#define PF_SUSPEND_TASK                0x80000000      /* This thread called freeze_processes() and should not be frozen */

/*
 * Only the _current_ task can read/write to tsk->flags, but other
 * tasks can access tsk->flags in readonly mode for example
 * with tsk_used_math (like during threaded core dumping).
 * There is however an exception to this rule during ptrace
 * or during fork: the ptracer task is allowed to write to the
 * child->flags of its traced child (same goes for fork, the parent
 * can write to the child->flags), because we're guaranteed the
 * child is not running and in turn not changing child->flags
 * at the same time the parent does it.
 */
/*
 * PF_USED_MATH helpers. The *_stopped_child_* forms operate on
 * (child)->flags; the short forms apply the same operation to current.
 * These are plain (non-atomic) read-modify-write updates of ->flags.
 */

/* Clear / set PF_USED_MATH in (child)->flags. */
#define clear_stopped_child_used_math(child)        do { (child)->flags &= ~PF_USED_MATH; } while (0)
#define set_stopped_child_used_math(child)        do { (child)->flags |= PF_USED_MATH; } while (0)
#define clear_used_math()                        clear_stopped_child_used_math(current)
#define set_used_math()                                set_stopped_child_used_math(current)

/*
 * Clear PF_USED_MATH, then set it again iff @condition is true.
 * Note: @child is expanded (and therefore evaluated) twice.
 */
#define conditional_stopped_child_used_math(condition, child) \
        do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)

#define conditional_used_math(condition)        conditional_stopped_child_used_math(condition, current)

/*
 * Make (child)'s PF_USED_MATH bit equal to current's.
 * Note: @child is expanded (and therefore evaluated) twice.
 */
#define copy_to_stopped_child_used_math(child) \
        do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)

/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
#define tsk_used_math(p)                        ((p)->flags & PF_USED_MATH)
#define used_math()                                tsk_used_math(current)

/*
 * Tell whether the current task counts as a per-CPU thread.
 *
 * Without CONFIG_SMP this is always true. With CONFIG_SMP the current task
 * must have PF_NO_SETAFFINITY set and exactly one allowed CPU
 * (nr_cpus_allowed == 1).
 */
static __always_inline bool is_percpu_thread(void)
{
#ifndef CONFIG_SMP
        return true;
#else
        if (!(current->flags & PF_NO_SETAFFINITY))
                return false;

        return current->nr_cpus_allowed == 1;
#endif
}

/*
 * Per-process atomic flags.
 *
 * These are bit numbers (not masks) within task_struct::atomic_flags; the
 * TASK_PFA_* accessor generators below pass them to test_bit(), set_bit()
 * and clear_bit().
 */
#define PFA_NO_NEW_PRIVS                0        /* May not gain new privileges. */
#define PFA_SPREAD_PAGE                        1        /* Spread page cache over cpuset */
#define PFA_SPREAD_SLAB                        2        /* Spread some slab caches over cpuset */
#define PFA_SPEC_SSB_DISABLE                3        /* Speculative Store Bypass disabled */
#define PFA_SPEC_SSB_FORCE_DISABLE        4        /* Speculative Store Bypass force disabled */
#define PFA_SPEC_IB_DISABLE                5        /* Indirect branch speculation restricted */
#define PFA_SPEC_IB_FORCE_DISABLE        6        /* Indirect branch speculation permanently restricted */
#define PFA_SPEC_SSB_NOEXEC                7        /* Speculative Store Bypass clear on execve() */

/*
 * Generators for the per-flag accessors. For a flag PFA_<name> and a
 * suffix <func> they emit, respectively:
 *   task_<func>(p)        - test_bit() on p->atomic_flags
 *   task_set_<func>(p)    - set_bit() on p->atomic_flags
 *   task_clear_<func>(p)  - clear_bit() on p->atomic_flags
 */
#define TASK_PFA_TEST(name, func)                                        \
        static inline bool task_##func(struct task_struct *p)                \
        { return test_bit(PFA_##name, &p->atomic_flags); }

#define TASK_PFA_SET(name, func)                                        \
        static inline void task_set_##func(struct task_struct *p)        \
        { set_bit(PFA_##name, &p->atomic_flags); }

#define TASK_PFA_CLEAR(name, func)                                        \
        static inline void task_clear_##func(struct task_struct *p)        \
        { clear_bit(PFA_##name, &p->atomic_flags); }

/*
 * Accessor instantiations. NO_NEW_PRIVS, SPEC_SSB_FORCE_DISABLE and
 * SPEC_IB_FORCE_DISABLE get test/set helpers only; no clear helper is
 * generated for them here (presumably because they are one-way flags --
 * confirm against their users).
 */
TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)

/* test/set/clear for PFA_SPREAD_PAGE */
TASK_PFA_TEST(SPREAD_PAGE, spread_page)
TASK_PFA_SET(SPREAD_PAGE, spread_page)
TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)

/* test/set/clear for PFA_SPREAD_SLAB */
TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
TASK_PFA_SET(SPREAD_SLAB, spread_slab)
TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)

/* test/set/clear for PFA_SPEC_SSB_DISABLE */
TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)

/* test/set/clear for PFA_SPEC_SSB_NOEXEC */
TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec)

/* test/set only for PFA_SPEC_SSB_FORCE_DISABLE */
TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)

/* test/set/clear for PFA_SPEC_IB_DISABLE */
TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)

/* test/set only for PFA_SPEC_IB_FORCE_DISABLE */
TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)

/*
 * current_restore_flags - restore selected bits of current->flags
 * @orig_flags: flags word holding the values to restore
 * @flags: mask selecting which bits are restored
 *
 * Every bit set in @flags is first cleared in current->flags and then
 * re-set from the matching bit of @orig_flags. Bits outside @flags are
 * left untouched.
 */
static inline void
current_restore_flags(unsigned long orig_flags, unsigned long flags)
{
        const unsigned long restored = orig_flags & flags;

        current->flags &= ~flags;
        current->flags |= restored;
}

/*
 * Declarations only; the implementations are not part of this header.
 * dl_bw_alloc()/dl_bw_free() take a CPU number and a u64 bandwidth value.
 */
extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int task_can_attach(struct task_struct *p);
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);
#ifdef CONFIG_SMP

/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);

/**
 * set_cpus_allowed_ptr - set CPU affinity mask of a task
 * @p: the task
 * @new_mask: CPU affinity mask
 *
 * Return: zero if successful, or a negative error code
 */
extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
/*
 * CONFIG_SMP-only declarations for the remaining affinity helpers. Of
 * these, only dup_user_cpus_ptr(), release_user_cpus_ptr() and
 * dl_task_check_affinity() have inline fallbacks in the !CONFIG_SMP branch
 * below.
 */
extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
extern void release_user_cpus_ptr(struct task_struct *p);
extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
#else
/* !CONFIG_SMP variant: nothing to do, both arguments are ignored. */
static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
}
/*
 * !CONFIG_SMP variant of set_cpus_allowed_ptr(): @p is not touched.
 *
 * Return: 0 when @new_mask contains CPU 0, -EINVAL otherwise.
 */
static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
        return cpumask_test_cpu(0, new_mask) ? 0 : -EINVAL;
}
/*
 * !CONFIG_SMP variant of dup_user_cpus_ptr(): nothing is copied.
 *
 * Return: -EINVAL if @src carries a user_cpus_ptr, 0 otherwise.
 */
static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
{
        return src->user_cpus_ptr ? -EINVAL : 0;
}
/*
 * !CONFIG_SMP variant: nothing is released; only warns if @p unexpectedly
 * has a non-NULL user_cpus_ptr.
 */
static inline void release_user_cpus_ptr(struct task_struct *p)
{
        WARN_ON(p->user_cpus_ptr);
}

/* !CONFIG_SMP variant: no check is performed, always returns 0. */
static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
        return 0;
}
#endif

/* Declarations only; implemented outside this header. */
extern int yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);

/**
 * task_nice - nice value of a task
 * @p: the task to query.
 *
 * Converts @p->static_prio with PRIO_TO_NICE().
 *
 * Return: The nice value [ -20 ... 0 ... 19 ].
 */
static inline int task_nice(const struct task_struct *p)
{
        return PRIO_TO_NICE(p->static_prio);
}

/*
 * Scheduler query/control entry points. Declarations only; semantics are
 * defined by the implementations, which are not part of this header.
 */
extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);
extern void sched_set_fifo_low(struct task_struct *p);
extern void sched_set_normal(struct task_struct *p, int nice);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
extern struct task_struct *idle_task(int cpu);

/**
 * is_idle_task - is the specified task an idle task?
 * @p: the task in question.
 *
 * Return: true if @p has PF_IDLE set in its flags, false otherwise.
 */
static __always_inline bool is_idle_task(const struct task_struct *p)
{
        return (p->flags & PF_IDLE) != 0;
}

/* Declarations only; implemented outside this header. */
extern struct task_struct *curr_task(int cpu);
extern void ia64_set_curr_task(int cpu, struct task_struct *p);

void yield(void);

/*
 * Overlay of a task_struct, (when thread_info is not embedded in
 * task_struct) a thread_info, and a THREAD_SIZE-byte stack area expressed
 * as an array of longs.
 */
union thread_union {
        struct task_struct task;
#ifndef CONFIG_THREAD_INFO_IN_TASK
        struct thread_info thread_info;
#endif
        unsigned long stack[THREAD_SIZE/sizeof(long)];
};

/* init_thread_info is declared only when thread_info lives outside task_struct. */
#ifndef CONFIG_THREAD_INFO_IN_TASK
extern struct thread_info init_thread_info;
#endif

/* THREAD_SIZE bytes of stack, declared as an array of unsigned long. */
extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];

/*
 * task_thread_info(task): pointer to the task's thread_info.
 *  - CONFIG_THREAD_INFO_IN_TASK: address of the member embedded in the task.
 *  - otherwise, unless __HAVE_THREAD_FUNCTIONS is defined: (task)->stack
 *    cast to a thread_info pointer.
 *  - with __HAVE_THREAD_FUNCTIONS no definition is provided here.
 */
#ifdef CONFIG_THREAD_INFO_IN_TASK
# define task_thread_info(task)        (&(task)->thread_info)
#elif !defined(__HAVE_THREAD_FUNCTIONS)
# define task_thread_info(task)        ((struct thread_info *)(task)->stack)
#endif

/*
 * Find a task by one of its numerical ids.
 *
 * find_task_by_pid_ns():
 *      finds a task by its pid in the specified namespace
 * find_task_by_vpid():
 *      finds a task by its virtual pid
 *
 * See also find_vpid() etc. in include/linux/pid.h.
 */

extern struct task_struct *find_task_by_vpid(pid_t nr);
extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns);

/*
 * Find a task by its virtual pid and get the task struct.
 */
extern struct task_struct *find_get_task_by_vpid(pid_t nr);

/* Wake-up entry points. Declarations only; implemented outside this header. */
extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk);

/* kick_process(): real implementation on SMP only; an empty stub otherwise. */
#ifdef CONFIG_SMP
extern void kick_process(struct task_struct *tsk);
#else
static inline void kick_process(struct task_struct *tsk) { }
#endif

extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);

/* set_task_comm - __set_task_comm() with its 'exec' argument fixed to false. */
static inline void set_task_comm(struct task_struct *tsk, const char *from)
{
        __set_task_comm(tsk, from, false);
}

extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
/*
 * get_task_comm(buf, tsk): calls __get_task_comm() with sizeof(buf) as the
 * length. The build fails unless sizeof(buf) == TASK_COMM_LEN, so 'buf'
 * must be an array of exactly that size rather than a pointer of a
 * different size.
 */
#define get_task_comm(buf, tsk) ({                        \
        BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN);        \
        __get_task_comm(buf, sizeof(buf), tsk);                \
})

/* scheduler_ipi(): folds need_resched on SMP; an empty stub otherwise. */
#ifdef CONFIG_SMP
static __always_inline void scheduler_ipi(void)
{
        /*
         * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
         * TIF_NEED_RESCHED remotely (for the first time) will also send
         * this IPI.
         */
        preempt_fold_need_resched();
}
#else
static inline void scheduler_ipi(void) { }
#endif

/* Declaration only; implemented outside this header. */
extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);

/*
 * Thread-flag accessors for an arbitrary task. Each helper resolves the
 * task's thread_info via task_thread_info() and forwards to the matching
 * *_ti_thread_flag() primitive.
 * See asm/thread_info.h for the TIF_xxxx flags available.
 */
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        set_ti_thread_flag(ti, flag);
}

static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        clear_ti_thread_flag(ti, flag);
}

static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag,
                                          bool value)
{
        struct thread_info *ti = task_thread_info(tsk);

        update_ti_thread_flag(ti, flag, value);
}

static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        return test_and_set_ti_thread_flag(ti, flag);
}

static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        return test_and_clear_ti_thread_flag(ti, flag);
}

static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        return test_ti_thread_flag(ti, flag);
}

/* Set TIF_NEED_RESCHED in @tsk's thread flags. */
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
        set_ti_thread_flag(task_thread_info(tsk), TIF_NEED_RESCHED);
}

/* Clear TIF_NEED_RESCHED in @tsk's thread flags. */
static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
        clear_ti_thread_flag(task_thread_info(tsk), TIF_NEED_RESCHED);
}

/* Test TIF_NEED_RESCHED in @tsk's thread flags; hinted as unlikely. */
static inline int test_tsk_need_resched(struct task_struct *tsk)
{
        return unlikely(test_ti_thread_flag(task_thread_info(tsk), TIF_NEED_RESCHED));
}

/*
 * cond_resched() and cond_resched_lock(): latency reduction via
 * explicit rescheduling in places that are safe. The return
 * value indicates whether a reschedule was in fact done.
 * cond_resched_lock() will drop the spinlock before scheduling.
 */
/*
 * _cond_resched() comes in four build-time flavours, selected below:
 *   1. PREEMPT_DYNAMIC + HAVE_PREEMPT_DYNAMIC_CALL: dispatch through the
 *      'cond_resched' static call.
 *   2. PREEMPT_DYNAMIC + HAVE_PREEMPT_DYNAMIC_KEY: forward to
 *      dynamic_cond_resched().
 *   3. !PREEMPTION: klp_sched_try_switch(), then __cond_resched().
 *   4. PREEMPTION without PREEMPT_DYNAMIC: klp_sched_try_switch(), then
 *      return 0.
 */
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
extern int __cond_resched(void);

#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)

void sched_dynamic_klp_enable(void);
void sched_dynamic_klp_disable(void);

DECLARE_STATIC_CALL(cond_resched, __cond_resched);

/* Flavour 1: result comes from whatever the static call currently targets. */
static __always_inline int _cond_resched(void)
{
        return static_call_mod(cond_resched)();
}

#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)

extern int dynamic_cond_resched(void);

/* Flavour 2: thin wrapper around dynamic_cond_resched(). */
static __always_inline int _cond_resched(void)
{
        return dynamic_cond_resched();
}

#else /* !CONFIG_PREEMPTION */

/* Flavour 3: livepatch hook first, then the real __cond_resched(). */
static inline int _cond_resched(void)
{
        klp_sched_try_switch();
        return __cond_resched();
}

#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */

#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */

/* Flavour 4: only the livepatch hook runs; always reports 0. */
static inline int _cond_resched(void)
{
        klp_sched_try_switch();
        return 0;
}

#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */

/*
 * cond_resched(): runs the __might_resched() debug check for the call site
 * (file/line, offsets 0), then evaluates to the result of _cond_resched().
 */
#define cond_resched() ({                        \
        __might_resched(__FILE__, __LINE__, 0);        \
        _cond_resched();                        \
})

/* Out-of-line workers used by the cond_resched_*lock*() macros in this header. */
extern int __cond_resched_lock(spinlock_t *lock);
extern int __cond_resched_rwlock_read(rwlock_t *lock);
extern int __cond_resched_rwlock_write(rwlock_t *lock);

/*
 * Layout of the 'offsets' value handed to __might_resched() in this header:
 * the bits below MIGHT_RESCHED_RCU_SHIFT are covered by
 * MIGHT_RESCHED_PREEMPT_MASK; on PREEMPT_RT a value of
 * (1U << MIGHT_RESCHED_RCU_SHIFT) is added on top of PREEMPT_LOCK_OFFSET.
 */
#define MIGHT_RESCHED_RCU_SHIFT                8
#define MIGHT_RESCHED_PREEMPT_MASK        ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1)

#ifndef CONFIG_PREEMPT_RT
/*
 * Non RT kernels have an elevated preempt count due to the held lock,
 * but are not allowed to be inside a RCU read side critical section
 */
# define PREEMPT_LOCK_RESCHED_OFFSETS        PREEMPT_LOCK_OFFSET
#else
/*
 * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in
 * cond_resched*lock() has to take that into account because it checks for
 * preempt_count() and rcu_preempt_depth().
 */
# define PREEMPT_LOCK_RESCHED_OFFSETS        \
        (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT))
#endif

/*
 * Lock-aware variants of cond_resched(). Each runs the __might_resched()
 * debug check with PREEMPT_LOCK_RESCHED_OFFSETS and then evaluates to the
 * result of the matching __cond_resched_*() worker for @lock.
 */
#define cond_resched_lock(lock) ({                                                \
        __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);        \
        __cond_resched_lock(lock);                                                \
})

#define cond_resched_rwlock_read(lock) ({                                        \
        __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);        \
        __cond_resched_rwlock_read(lock);                                        \
})

#define cond_resched_rwlock_write(lock) ({                                        \
        __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);        \
        __cond_resched_rwlock_write(lock);                                        \
})

/*
 * Preemption-model queries. With CONFIG_PREEMPT_DYNAMIC they are
 * out-of-line functions; otherwise each one is a compile-time constant
 * taken from the matching Kconfig symbol.
 */
#ifdef CONFIG_PREEMPT_DYNAMIC

extern bool preempt_model_none(void);
extern bool preempt_model_voluntary(void);
extern bool preempt_model_full(void);

#else

/* True iff the kernel was built with CONFIG_PREEMPT_NONE. */
static inline bool preempt_model_none(void)
{
        return IS_ENABLED(CONFIG_PREEMPT_NONE);
}
/* True iff the kernel was built with CONFIG_PREEMPT_VOLUNTARY. */
static inline bool preempt_model_voluntary(void)
{
        return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY);
}
/* True iff the kernel was built with CONFIG_PREEMPT. */
static inline bool preempt_model_full(void)
{
        return IS_ENABLED(CONFIG_PREEMPT);
}

#endif

/* True iff the kernel was built with CONFIG_PREEMPT_RT (always a build-time constant). */
static inline bool preempt_model_rt(void)
{
        return IS_ENABLED(CONFIG_PREEMPT_RT);
}

/*
 * Does the preemption model allow non-cooperative preemption?
 *
 * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with
 * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the
 * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the
 * PREEMPT_NONE model.
 */
static inline bool preempt_model_preemptible(void)
{
        return preempt_model_full() || preempt_model_rt();
}

/* need_resched - result of tif_need_resched(), hinted as unlikely. */
static __always_inline bool need_resched(void)
{
        return unlikely(tif_need_resched());
}

/*
 * Wrappers for p->thread_info->cpu access. No-op on UP: task_cpu() is then
 * constant 0 and set_task_cpu() does nothing.
 */
#ifdef CONFIG_SMP

/* Single READ_ONCE() load of the cpu field in @p's thread_info. */
static inline unsigned int task_cpu(const struct task_struct *p)
{
        return READ_ONCE(task_thread_info(p)->cpu);
}

extern void set_task_cpu(struct task_struct *p, unsigned int cpu);

#else

static inline unsigned int task_cpu(const struct task_struct *p)
{
        return 0;
}

static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
}

#endif /* CONFIG_SMP */

/* Declarations only; implemented outside this header. */
extern bool sched_task_on_rq(struct task_struct *p);
extern unsigned long get_wchan(struct task_struct *p);
extern struct task_struct *cpu_curr_snapshot(int cpu);

#include <linux/spinlock.h>

/*
 * In order to reduce various lock holder preemption latencies provide an
 * interface to see if a vCPU is currently running or not.
 *
 * This allows us to terminate optimistic spin loops and block, analogous to
 * the native optimistic spin heuristic of testing if the lock owner task is
 * running or not.
 *
 * This is only the fallback, used when nothing has defined
 * vcpu_is_preempted before this point: it reports false for every @cpu.
 */
#ifndef vcpu_is_preempted
static inline bool vcpu_is_preempted(int cpu)
{
        return false;
}
#endif

/* Set / get the CPU affinity mask of the task identified by @pid. Declarations only. */
extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);

/* Fallback when TASK_SIZE_OF is not already defined: ignore @tsk and use TASK_SIZE. */
#ifndef TASK_SIZE_OF
#define TASK_SIZE_OF(tsk)        TASK_SIZE
#endif

#ifdef CONFIG_SMP
static inline bool owner_on_cpu(struct task_struct *owner)
{
        /*
         * As lock holder preemption issue, we both skip spinning if
         * task is not on cpu or its cpu is preempted
         */
        return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner));
}

/*
 * Returns effective CPU energy utilization of @cpu, as seen by the
 * scheduler. Declared for CONFIG_SMP builds only.
 */
unsigned long sched_cpu_util(int cpu);
#endif /* CONFIG_SMP */

/*
 * Core-scheduling hooks. Without CONFIG_SCHED_CORE, sched_core_free() and
 * sched_core_fork() are empty and sched_core_idle_cpu() is simply
 * idle_cpu(); sched_core_share_pid() has no fallback here.
 */
#ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p);
extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
                                unsigned long uaddr);
extern int sched_core_idle_cpu(int cpu);
#else
static inline void sched_core_free(struct task_struct *tsk) { }
static inline void sched_core_fork(struct task_struct *p) { }
static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
#endif

/* Declaration only; implemented outside this header. */
extern void sched_set_stop_task(int cpu, struct task_struct *stop);

#ifdef CONFIG_MEM_ALLOC_PROFILING
/*
 * alloc_tag_save - install @tag as current->alloc_tag
 * @tag: tag to make current.
 *
 * Return: the tag that was in current->alloc_tag before the call.
 */
static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
{
        struct alloc_tag *prev = current->alloc_tag;

        current->alloc_tag = tag;
        return prev;
}

/*
 * alloc_tag_restore - put a previously saved tag back
 * @tag: the tag expected to be current right now.
 * @old: the tag to reinstall (typically what alloc_tag_save() returned).
 *
 * With CONFIG_MEM_ALLOC_PROFILING_DEBUG a warning is emitted when
 * current->alloc_tag is not @tag; @old is installed either way.
 */
static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
{
#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
        WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
#endif
        current->alloc_tag = old;
}
#else
/* Profiling disabled: saving yields NULL and restoring does nothing. */
#define alloc_tag_save(_tag)                        NULL
#define alloc_tag_restore(_tag, _old)                do {} while (0)
#endif

#endif


































































































































































































































































   11 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H

#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>

#include <trace/events/tlb.h>

#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/gsseg.h>

/* Global counter; init_new_context() increments it to obtain each mm's ctx_id. */
extern atomic64_t last_mm_ctx_id;

/* RDPMC-related static keys and CR4.PCE updater; declared only with perf events. */
#ifdef CONFIG_PERF_EVENTS
DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
void cr4_update_pce(void *ignored);
#endif

#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
 * ldt_structs can be allocated, used, and freed, but they are never
 * modified while live.
 */
struct ldt_struct {
        /*
         * Xen requires page-aligned LDTs with special permissions.  This is
         * needed to prevent us from installing evil descriptors such as
         * call gates.  On native, we could merge the ldt_struct and LDT
         * allocations, but it's not worth trying to optimize.
         */
        struct desc_struct        *entries;
        unsigned int                nr_entries;

        /*
         * If PTI is in use, then the entries array is not mapped while we're
         * in user mode.  The whole array will be aliased at the address
         * given by ldt_slot_va(slot).  We use two slots so that we can allocate
         * and map, and enable a new LDT without invalidating the mapping
         * of an older, still-in-use LDT.
         *
         * slot will be -1 if this LDT doesn't have an alias mapping.
         */
        int                        slot;
};

/*
 * Used for LDT copy/destruction.
 *
 * init_new_context_ldt() starts a new mm with no LDT (context.ldt = NULL)
 * and initialises the context.ldt_usr_sem rwsem.
 */
static inline void init_new_context_ldt(struct mm_struct *mm)
{
        mm->context.ldt = NULL;
        init_rwsem(&mm->context.ldt_usr_sem);
}
/* Out-of-line LDT lifecycle helpers; declarations only. */
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
void ldt_arch_exit_mmap(struct mm_struct *mm);
#else        /* CONFIG_MODIFY_LDT_SYSCALL */
/*
 * !CONFIG_MODIFY_LDT_SYSCALL stubs: there is no LDT state to manage, so
 * every helper is a no-op and ldt_dup_context() always succeeds with 0.
 */
static inline void init_new_context_ldt(struct mm_struct *mm) { }
static inline int ldt_dup_context(struct mm_struct *oldmm,
                                  struct mm_struct *mm)
{
        return 0;
}
static inline void destroy_context_ldt(struct mm_struct *mm) { }
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif

/*
 * load_mm_ldt()/switch_ldt(): out-of-line with CONFIG_MODIFY_LDT_SYSCALL.
 * Without it, load_mm_ldt() just calls clear_LDT() and switch_ldt() only
 * carries a debug check that the caller is not preemptible.
 */
#ifdef CONFIG_MODIFY_LDT_SYSCALL
extern void load_mm_ldt(struct mm_struct *mm);
extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next);
#else
static inline void load_mm_ldt(struct mm_struct *mm)
{
        clear_LDT();
}
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
        DEBUG_LOCKS_WARN_ON(preemptible());
}
#endif

#ifdef CONFIG_ADDRESS_MASKING
/* Accessor for the mm's context.lam_cr3_mask. */
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
        return mm->context.lam_cr3_mask;
}

/* Copy both address-masking fields (lam_cr3_mask, untag_mask) from @oldmm to @mm. */
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
{
        mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
        mm->context.untag_mask = oldmm->context.untag_mask;
}

/* Self-referential define so that other code can detect this override with #ifdef. */
#define mm_untag_mask mm_untag_mask
static inline unsigned long mm_untag_mask(struct mm_struct *mm)
{
        return mm->context.untag_mask;
}

/* Reset the untag mask to all ones (-1UL). */
static inline void mm_reset_untag_mask(struct mm_struct *mm)
{
        mm->context.untag_mask = -1UL;
}

#define arch_pgtable_dma_compat arch_pgtable_dma_compat
static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
{
        return !mm_lam_cr3_mask(mm) ||
                test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags);
}
#else

/*
 * !CONFIG_ADDRESS_MASKING stubs: the LAM mask is always 0 and the
 * dup/reset helpers do nothing.
 */
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
        return 0;
}

static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
{
}

static inline void mm_reset_untag_mask(struct mm_struct *mm)
{
}
#endif

/* Self-referential define marks enter_lazy_tlb() as arch-provided; implemented out of line. */
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);

/*
 * Init a new mm.  Used on mm copies, like at fork()
 * and on mm's that are brand-new, like at execve().
 *
 * @tsk is not used by this implementation. Always returns 0.
 */
#define init_new_context init_new_context
static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
{
        mutex_init(&mm->context.lock);

        /* Take the next value of the global counter as this mm's ctx_id; TLB generation starts at 0. */
        mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
        atomic64_set(&mm->context.tlb_gen, 0);

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* pkey 0 is the default and allocated implicitly */
                mm->context.pkey_allocation_map = 0x1;
                /* -1 means unallocated or invalid */
                mm->context.execute_only_pkey = -1;
        }
#endif
        /* Both helpers compile to no-ops when their config option is off. */
        mm_reset_untag_mask(mm);
        init_new_context_ldt(mm);
        return 0;
}

/* destroy_context - arch teardown for an mm; only the LDT state is handled here. */
#define destroy_context destroy_context
static inline void destroy_context(struct mm_struct *mm)
{
        destroy_context_ldt(mm);
}

/* Out-of-line mm switch routines; declarations only. */
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                      struct task_struct *tsk);

extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                               struct task_struct *tsk);
/* Self-referential define advertises that the arch provides switch_mm_irqs_off(). */
#define switch_mm_irqs_off switch_mm_irqs_off

/*
 * activate_mm - switch from @prev to @next
 *
 * Calls paravirt_enter_mmap() on @next, then switch_mm() from @prev to
 * @next with a NULL task pointer.
 *
 * There is deliberately no semicolon after "while (0)": the caller supplies
 * it, so "activate_mm(a, b);" expands to exactly one statement and stays
 * safe inside an unbraced if/else.
 */
#define activate_mm(prev, next)                        \
do {                                                \
        paravirt_enter_mmap(next);                \
        switch_mm((prev), (next), NULL);        \
} while (0)

/*
 * deactivate_mm(tsk, mm): @mm is unused in both variants.
 *  - 32-bit: load 0 into the gs segment register.
 *  - 64-bit: shstk_free(tsk), then load_gs_index(0) and load 0 into fs.
 */
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm)                        \
do {                                                \
        loadsegment(gs, 0);                        \
} while (0)
#else
#define deactivate_mm(tsk, mm)                        \
do {                                                \
        shstk_free(tsk);                        \
        load_gs_index(0);                        \
        loadsegment(fs, 0);                        \
} while (0)
#endif

/*
 * arch_dup_pkeys - copy protection-key bookkeeping from @oldmm to @mm
 *
 * Does nothing unless the kernel is built with
 * CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS and X86_FEATURE_OSPKE is enabled.
 */
static inline void arch_dup_pkeys(struct mm_struct *oldmm,
                                  struct mm_struct *mm)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* Mirror oldmm's pkey state into mm. */
                mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
                mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
        }
#endif
}

/*
 * arch_dup_mmap - arch part of duplicating @oldmm into @mm
 *
 * Copies the pkey state, notifies paravirt of the new mm, copies the LAM
 * masks and finally duplicates the LDT.
 *
 * Return: the result of ldt_dup_context().
 */
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        arch_dup_pkeys(oldmm, mm);
        paravirt_enter_mmap(mm);
        dup_lam(oldmm, mm);
        return ldt_dup_context(oldmm, mm);
}

/* arch_exit_mmap - run the paravirt hook, then the LDT hook, for an exiting mm. */
static inline void arch_exit_mmap(struct mm_struct *mm)
{
        paravirt_arch_exit_mmap(mm);
        ldt_arch_exit_mmap(mm);
}

#ifdef CONFIG_X86_64
/*
 * is_64bit_mm - 64-bit kernel variant
 *
 * Return: true when IA32 emulation is not built in, or when it is but
 * MM_CONTEXT_UPROBE_IA32 is clear in mm->context.flags.
 */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_IA32_EMULATION))
                return true;

        return !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags);
}
#else
/* 32-bit kernel variant: no mm is ever 64-bit. */
static inline bool is_64bit_mm(struct mm_struct *mm)
{
        return false;
}
#endif

/* arch_unmap - nothing to do here; all arguments are ignored. */
static inline void arch_unmap(struct mm_struct *mm, unsigned long start,
                              unsigned long end)
{
}

/*
 * Protection keys are only enforced for the current process: we have
 * effectively no access to PKRU for other processes, nor any way to tell
 * *which* PKRU in a threaded process we could use.
 *
 * So nothing is enforced if the VMA is not from the current mm, or if we
 * are in a kernel thread.
 *
 * Return: true for instruction fetches (pkeys never affect them) and for
 * foreign accesses; otherwise whatever __pkru_allows_pkey() says for the
 * VMA's pkey and the requested access type.
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
                bool write, bool execute, bool foreign)
{
        /*
         * Nothing to check for instruction fetches, for callers that flag
         * the access as foreign, or for VMAs that are not from this process.
         */
        if (execute || foreign || vma_is_foreign(vma))
                return true;

        return __pkru_allows_pkey(vma_pkey(vma), write);
}

/* Declaration only; implemented outside this header. */
unsigned long __get_current_cr3_fast(void);

#include <asm-generic/mmu_context.h>

#endif /* _ASM_X86_MMU_CONTEXT_H */










































































































































































































































































































































































































































































































































































































    5 




    2 

    1 

    2 

    2 





    1 





















































    5 

    5 



    5 






    5 

















































































































































































































    4 





    4 














    3 




















    4 
    4 
    3 




    3 


    4 

    3 

    4 











    4 
    4 


















    6 









    5 





    5 






    6 









    5 


















    5 
















    5 












    6 















    5 












    5 





























































    5 



    5 


    5 

    5 
















    3 












    5 




























































    3 




















































































    1 


    1 














































































    4 

















    2 

    1 
    1 






















    4 














































    3 









    3 
    3 





























































































































    3 









    3 


    3 





    3 











    3 











































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 














































    3 































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel internal timers
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
 *
 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *              serialize accesses to xtime/lost_ticks).
 *                              Copyright (C) 1998  Andrea Arcangeli
 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
 *  2002-05-31        Move sys_sysinfo here and make its locking sane, Robert Love
 *  2000-10-05  Implemented scalable SMP per-CPU timer handling.
 *                              Copyright (C) 2000, 2001, 2002  Ingo Molnar
 *              Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
 */

#include <linux/kernel_stat.h>
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pid_namespace.h>
#include <linux/notifier.h>
#include <linux/thread_info.h>
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/random.h>
#include <linux/sysctl.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>

#include "tick-internal.h"
#include "timer_migration.h"

#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>

/*
 * The 64-bit jiffies counter. It starts at INITIAL_JIFFIES and is exported
 * so that modules can reference it.
 */
__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;

EXPORT_SYMBOL(jiffies_64);

/*
 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
 * level has a different granularity.
 *
 * The level granularity is:                LVL_CLK_DIV ^ level
 * The level clock frequency is:        HZ / (LVL_CLK_DIV ^ level)
 *
 * The array level of a newly armed timer depends on the relative expiry
 * time. The farther away the expiry time is, the higher the array level and
 * therefore the coarser the granularity becomes.
 *
 * Contrary to the original timer wheel implementation, which aims for 'exact'
 * expiry of the timers, this implementation removes the need for recascading
 * the timers into the lower array levels. The previous 'classic' timer wheel
 * implementation of the kernel already violated the 'exact' expiry by adding
 * slack to the expiry time to provide batched expiration. The granularity
 * levels provide implicit batching.
 *
 * This is an optimization of the original timer wheel implementation for the
 * majority of the timer wheel use cases: timeouts. The vast majority of
 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
 * the timeout expires it indicates that normal operation is disturbed, so it
 * does not matter much whether the timeout comes with a slight delay.
 *
 * The only exception to this are networking timers with a small expiry
 * time. They rely on the granularity. Those fit into the first wheel level,
 * which has HZ granularity.
 *
 * We don't have cascading anymore. Timers with an expiry time above the
 * capacity of the last wheel level are force expired at the maximum timeout
 * value of the last wheel level. From data sampling we know that the maximum
 * value observed is 5 days (network connection tracking), so this should not
 * be an issue.
 *
 * The currently chosen array constants values are a good compromise between
 * array size and granularity.
 *
 * This results in the following granularity and range levels:
 *
 * HZ 1000 steps
 * Level Offset  Granularity            Range
 *  0      0         1 ms                0 ms -         63 ms
 *  1     64         8 ms               64 ms -        511 ms
 *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
 *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
 *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
 *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
 *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
 *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
 *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
 *
 * HZ  300
 * Level Offset  Granularity            Range
 *  0           0         3 ms                0 ms -        210 ms
 *  1          64        26 ms              213 ms -       1703 ms (213ms - ~1s)
 *  2         128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
 *  3         192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
 *  4         256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
 *  5         320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
 *  6         384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
 *  7         448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
 *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
 *
 * HZ  250
 * Level Offset  Granularity            Range
 *  0           0         4 ms                0 ms -        255 ms
 *  1          64        32 ms              256 ms -       2047 ms (256ms - ~2s)
 *  2         128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
 *  3         192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
 *  4         256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
 *  5         320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
 *  6         384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
 *  7         448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
 *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
 *
 * HZ  100
 * Level Offset  Granularity            Range
 *  0           0         10 ms               0 ms -        630 ms
 *  1          64         80 ms             640 ms -       5110 ms (640ms - ~5s)
 *  2         128        640 ms            5120 ms -      40950 ms (~5s - ~40s)
 *  3         192       5120 ms (~5s)     40960 ms -     327670 ms (~40s - ~5m)
 *  4         256      40960 ms (~40s)   327680 ms -    2621430 ms (~5m - ~43m)
 *  5         320     327680 ms (~5m)   2621440 ms -   20971510 ms (~43m - ~5h)
 *  6         384    2621440 ms (~43m) 20971520 ms -  167772150 ms (~5h - ~1d)
 *  7         448   20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
 */

/*
 * Clock divisor for the next level: the clock of each level runs
 * LVL_CLK_DIV (8) times slower than the clock of the level below it.
 *
 * LVL_SHIFT(n) is the shift which converts jiffies into clock units of
 * level n and LVL_GRAN(n) is the granularity of level n in jiffies.
 */
#define LVL_CLK_SHIFT        3
#define LVL_CLK_DIV        (1UL << LVL_CLK_SHIFT)
#define LVL_CLK_MASK        (LVL_CLK_DIV - 1)
#define LVL_SHIFT(n)        ((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)        (1UL << LVL_SHIFT(n))

/*
 * The time start value for each level to select the bucket at enqueue
 * time. We start from the last possible delta of the previous level
 * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
 *
 * Example: LVL_START(1) is 63 jiffies and LVL_START(2) is 63 << 3 = 504
 * jiffies.
 */
#define LVL_START(n)        ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

/*
 * Size of each clock level: every level has LVL_SIZE (64) buckets.
 * LVL_MASK extracts the bucket number inside a level and LVL_OFFS(n) is
 * the index of the first bucket of level n (see calc_index()).
 */
#define LVL_BITS        6
#define LVL_SIZE        (1UL << LVL_BITS)
#define LVL_MASK        (LVL_SIZE - 1)
#define LVL_OFFS(n)        ((n) * LVL_SIZE)

/* Level depth: the number of wheel levels, 9 for HZ > 100 and 8 otherwise */
#if HZ > 100
# define LVL_DEPTH        9
# else
# define LVL_DEPTH        8
#endif

/*
 * The cutoff (max. capacity of the wheel). WHEEL_TIMEOUT_MAX is the cutoff
 * reduced by one granularity step of the last wheel level.
 */
#define WHEEL_TIMEOUT_CUTOFF        (LVL_START(LVL_DEPTH))
#define WHEEL_TIMEOUT_MAX        (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))

/*
 * The resulting wheel size, i.e. the number of buckets of a single timer
 * base. Every timer base has its own wheel. If NOHZ is configured there
 * are NR_BASES (3) bases per CPU instead of one, so we have a separate
 * storage for the deferrable timers.
 */
#define WHEEL_SIZE        (LVL_SIZE * LVL_DEPTH)

/*
 * Number of timer bases per CPU (NR_BASES) and the numbers of the
 * individual bases. Without CONFIG_NO_HZ_COMMON there is only a single
 * base and all three BASE_* names refer to it.
 *
 * NOTE(review): BASE_DEF is presumably the base of the deferrable timers;
 * confirm against the users of BASE_DEF.
 */
#ifdef CONFIG_NO_HZ_COMMON
/*
 * If multiple bases need to be locked, use the base ordering for lock
 * nesting, i.e. lowest number first.
 */
# define NR_BASES        3
# define BASE_LOCAL        0
# define BASE_GLOBAL        1
# define BASE_DEF        2
#else
# define NR_BASES        1
# define BASE_LOCAL        0
# define BASE_GLOBAL        0
# define BASE_DEF        0
#endif

/**
 * struct timer_base - Per CPU timer base (number of base depends on config)
 * @lock:                Lock protecting the timer_base
 * @running_timer:        When expiring timers, the lock is dropped. To make
 *                        sure not to race against deleting/modifying a
 *                        currently running timer, the pointer is set to the
 *                        timer, which expires at the moment. If no timer is
 *                        running, the pointer is NULL.
 * @expiry_lock:        PREEMPT_RT only: Lock is taken in softirq around
 *                        timer expiry callback execution and when trying to
 *                        delete a running timer and it wasn't successful at
 *                        the first attempt. It prevents priority inversion
 *                        when the callback was preempted on a remote CPU
 *                        and a caller tries to delete the running timer. It
 *                        also prevents a live lock, when the task which
 *                        tries to delete a timer preempted the softirq
 *                        thread which is running the timer callback
 *                        function.
 * @timer_waiters:        PREEMPT_RT only: Tells whether there is a waiter
 *                        waiting for the end of the timer callback function
 *                        execution.
 * @clk:                clock of the timer base; is updated before enqueue
 *                        of a timer; during expiry, it is 1 offset ahead of
 *                        jiffies to avoid endless requeuing to current
 *                        jiffies
 * @next_expiry:        expiry value of the first timer; it is updated when
 *                        finding the next timer and during enqueue; the
 *                        value is not valid, when next_expiry_recalc is set
 * @cpu:                Number of CPU the timer base belongs to
 * @next_expiry_recalc: States whether a recalculation of next_expiry is
 *                        required. Value is set true, when a timer was
 *                        deleted.
 * @is_idle:                Is set, when timer_base is idle. It is triggered by
 *                        NOHZ code. This state is only used in the standard
 *                        base. Deferrable timers, which are enqueued
 *                        remotely, never wake up an idle CPU. So there is
 *                        no need to support it for that base.
 * @timers_pending:        Is set, when a timer is pending in the base. It is only
 *                        reliable when next_expiry_recalc is not set.
 * @pending_map:        bitmap of the timer wheel; each bit reflects a
 *                        bucket of the wheel. When a bit is set, at least a
 *                        single timer is enqueued in the related bucket.
 * @vectors:                Array of lists; Each array member reflects a bucket
 *                        of the timer wheel. The list contains all timers
 *                        which are enqueued into a specific bucket.
 */
struct timer_base {
        raw_spinlock_t                lock;
        struct timer_list        *running_timer;
#ifdef CONFIG_PREEMPT_RT
        spinlock_t                expiry_lock;
        atomic_t                timer_waiters;
#endif
        unsigned long                clk;
        unsigned long                next_expiry;
        unsigned int                cpu;
        bool                        next_expiry_recalc;
        bool                        is_idle;
        bool                        timers_pending;
        DECLARE_BITMAP(pending_map, WHEEL_SIZE);
        struct hlist_head        vectors[WHEEL_SIZE];
} ____cacheline_aligned;

/*
 * The timer bases: an array of NR_BASES bases for each CPU.
 * NOTE(review): presumably indexed by the BASE_* constants; confirm
 * against the users of timer_bases.
 */
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);

#ifdef CONFIG_NO_HZ_COMMON

/* Static key, initially false; enabled by timer_update_keys() */
static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
/* Serializes the static key updates done by the sysctl handler and the work */
static DEFINE_MUTEX(timer_keys_mutex);

/* Work item which runs timer_update_keys(); scheduled by timers_update_nohz() */
static void timer_update_keys(struct work_struct *work);
static DECLARE_WORK(timer_update_work, timer_update_keys);

#ifdef CONFIG_SMP
static unsigned int sysctl_timer_migration = 1;

DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);

/*
 * Bring the timers_migration_enabled static key in line with the current
 * configuration: the key is enabled only when both the timer_migration
 * sysctl and tick_nohz_active are set, otherwise it is disabled.
 */
static void timers_update_migration(void)
{
        if (!sysctl_timer_migration || !tick_nohz_active) {
                static_branch_disable(&timers_migration_enabled);
                return;
        }

        static_branch_enable(&timers_migration_enabled);
}

#ifdef CONFIG_SYSCTL
/*
 * proc handler of the "timer_migration" sysctl.
 *
 * The value is read/written through proc_dointvec_minmax(). After a
 * successful write the timers_migration_enabled static key is updated.
 * The whole operation is serialized by timer_keys_mutex.
 *
 * Returns the result of proc_dointvec_minmax().
 */
static int timer_migration_handler(struct ctl_table *table, int write,
                            void *buffer, size_t *lenp, loff_t *ppos)
{
        int err;

        mutex_lock(&timer_keys_mutex);

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        /* Nothing to update on a failure or on a plain read */
        if (err || !write)
                goto unlock;

        timers_update_migration();
unlock:
        mutex_unlock(&timer_keys_mutex);
        return err;
}

/*
 * sysctl table with the "timer_migration" knob. The knob is backed by
 * sysctl_timer_migration, handled by timer_migration_handler() and
 * limited to the range SYSCTL_ZERO..SYSCTL_ONE.
 */
static struct ctl_table timer_sysctl[] = {
        {
                .procname        = "timer_migration",
                .data                = &sysctl_timer_migration,
                .maxlen                = sizeof(unsigned int),
                .mode                = 0644,
                .proc_handler        = timer_migration_handler,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
};

/*
 * Register timer_sysctl under "kernel" at device_initcall time. The
 * result of the registration is not checked; 0 is returned always.
 */
static int __init timer_sysctl_init(void)
{
        register_sysctl("kernel", timer_sysctl);
        return 0;
}
device_initcall(timer_sysctl_init);
#endif /* CONFIG_SYSCTL */
#else /* CONFIG_SMP */
/* !CONFIG_SMP: there is no timer migration static key to update */
static inline void timers_update_migration(void) { }
#endif /* !CONFIG_SMP */

/*
 * Work function of timer_update_work. Updates the timer migration static
 * key and enables the timers_nohz_active static key, both while holding
 * timer_keys_mutex. @work is not used.
 */
static void timer_update_keys(struct work_struct *work)
{
        mutex_lock(&timer_keys_mutex);
        timers_update_migration();
        static_branch_enable(&timers_nohz_active);
        mutex_unlock(&timer_keys_mutex);
}

/*
 * Schedule timer_update_work. The static keys are not touched here; the
 * update is done by the work function timer_update_keys().
 */
void timers_update_nohz(void)
{
        schedule_work(&timer_update_work);
}

/* Returns the state of the timers_nohz_active static key */
static inline bool is_timers_nohz_active(void)
{
        return static_branch_unlikely(&timers_nohz_active);
}
#else
/* !CONFIG_NO_HZ_COMMON: there is no such static key, always false */
static inline bool is_timers_nohz_active(void) { return false; }
#endif /* NO_HZ_COMMON */

/*
 * Common worker of the round_jiffies*() family.
 *
 * @j:                absolute time in jiffies
 * @cpu:        processor number which is used to skew the rounding
 * @force_up:        when set, never round down
 *
 * Returns @j rounded to a multiple of HZ (modulo the per CPU skew), or the
 * unmodified @j when the rounded value would not be in the future.
 */
static unsigned long round_jiffies_common(unsigned long j, int cpu,
                bool force_up)
{
        const unsigned long unrounded = j;
        /*
         * All CPUs firing their timers at the same moment would hit the
         * same locks and cachelines. To avoid that, every additional CPU
         * is skewed by another 3 jiffies; the value 3 originates from the
         * mm/ code which already did this. The skew is added up front and
         * taken away again once the rounding is done.
         */
        const int skew = cpu * 3;
        int rem;

        j += skew;
        rem = j % HZ;

        /* Start out at the whole second at or below the skewed value */
        j -= rem;

        /*
         * A target jiffie just after a whole second (caused e.g. by timer
         * irq delays or long irq off times) is supposed to be rounded
         * down, not up. A quarter of a second serves as an extreme upper
         * bound for that. Everything beyond that bound is rounded up, and
         * so is everything when @force_up is set.
         */
        if (force_up || rem >= HZ/4)
                j += HZ;

        /* Rounding is done, remove the skew */
        j -= skew;

        /* Only hand out the rounded value when it still is in the future */
        if (time_is_after_jiffies(j))
                return j;

        return unrounded;
}

/**
 * __round_jiffies - round an absolute jiffies value to a full second
 * @j: absolute time in jiffies which is to be rounded
 * @cpu: number of the processor on which the timeout is going to happen
 *
 * Rounds an absolute point in time in the future (in jiffies) up or down
 * to (approximately) a full second. Meant for timers which merely have to
 * fire roughly every X seconds and for which the exact expiry time is of
 * little importance.
 *
 * Timers rounded like this all expire at the same time instead of being
 * spread out. As a consequence the CPU has to wake up less often, which
 * saves power.
 *
 * The rounding is skewed for each processor, so that not all processors
 * fire at exactly the same time. That could otherwise result in lock
 * contention or spurious cache line bouncing.
 *
 * Return: the rounded version of @j.
 */
unsigned long __round_jiffies(unsigned long j, int cpu)
{
        const bool force_up = false;

        return round_jiffies_common(j, cpu, force_up);
}
EXPORT_SYMBOL_GPL(__round_jiffies);

/**
 * __round_jiffies_relative - round a relative jiffies value to a full second
 * @j: relative time in jiffies which is to be rounded
 * @cpu: number of the processor on which the timeout is going to happen
 *
 * Rounds a time delta in the future (in jiffies) up or down to
 * (approximately) a full second. Meant for timers which merely have to
 * fire roughly every X seconds and for which the exact expiry time is of
 * little importance.
 *
 * Timers rounded like this all expire at the same time instead of being
 * spread out. As a consequence the CPU has to wake up less often, which
 * saves power.
 *
 * The rounding is skewed for each processor, so that not all processors
 * fire at exactly the same time. That could otherwise result in lock
 * contention or spurious cache line bouncing.
 *
 * Return: the rounded version of @j.
 */
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
        /* jiffies can advance while we run, so sample it exactly once */
        const unsigned long now = jiffies;
        unsigned long rounded;

        rounded = round_jiffies_common(now + j, cpu, false);

        return rounded - now;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);

/**
 * round_jiffies - round an absolute jiffies value to a full second
 * @j: absolute time in jiffies which is to be rounded
 *
 * Rounds an absolute point in time in the future (in jiffies) up or down
 * to (approximately) a full second. Meant for timers which merely have to
 * fire roughly every X seconds and for which the exact expiry time is of
 * little importance. The processor number used for the rounding is the
 * one returned by raw_smp_processor_id().
 *
 * Timers rounded like this all expire at the same time instead of being
 * spread out. As a consequence the CPU has to wake up less often, which
 * saves power.
 *
 * Return: the rounded version of @j.
 */
unsigned long round_jiffies(unsigned long j)
{
        int cpu = raw_smp_processor_id();

        return round_jiffies_common(j, cpu, false);
}
EXPORT_SYMBOL_GPL(round_jiffies);

/**
 * round_jiffies_relative - round a relative jiffies value to a full second
 * @j: relative time in jiffies which is to be rounded
 *
 * Rounds a time delta in the future (in jiffies) up or down to
 * (approximately) a full second. Meant for timers which merely have to
 * fire roughly every X seconds and for which the exact expiry time is of
 * little importance. The processor number used for the rounding is the
 * one returned by raw_smp_processor_id().
 *
 * Timers rounded like this all expire at the same time instead of being
 * spread out. As a consequence the CPU has to wake up less often, which
 * saves power.
 *
 * Return: the rounded version of @j.
 */
unsigned long round_jiffies_relative(unsigned long j)
{
        int cpu = raw_smp_processor_id();

        return __round_jiffies_relative(j, cpu);
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);

/**
 * __round_jiffies_up - round an absolute jiffies value up to a full second
 * @j: absolute time in jiffies which is to be rounded
 * @cpu: number of the processor on which the timeout is going to happen
 *
 * Works like __round_jiffies(), but never rounds down. Meant for timeouts
 * for which the exact expiry time is of little importance, provided that
 * they do not fire too early.
 */
unsigned long __round_jiffies_up(unsigned long j, int cpu)
{
        const bool force_up = true;

        return round_jiffies_common(j, cpu, force_up);
}
EXPORT_SYMBOL_GPL(__round_jiffies_up);

/**
 * __round_jiffies_up_relative - round a relative jiffies value up to a full second
 * @j: relative time in jiffies which is to be rounded
 * @cpu: number of the processor on which the timeout is going to happen
 *
 * Works like __round_jiffies_relative(), but never rounds down. Meant for
 * timeouts for which the exact expiry time is of little importance,
 * provided that they do not fire too early.
 */
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
{
        /* jiffies can advance while we run, so sample it exactly once */
        const unsigned long now = jiffies;
        unsigned long rounded;

        rounded = round_jiffies_common(now + j, cpu, true);

        return rounded - now;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);

/**
 * round_jiffies_up - round an absolute jiffies value up to a full second
 * @j: absolute time in jiffies which is to be rounded
 *
 * Works like round_jiffies(), but never rounds down. Meant for timeouts
 * for which the exact expiry time is of little importance, provided that
 * they do not fire too early.
 */
unsigned long round_jiffies_up(unsigned long j)
{
        int cpu = raw_smp_processor_id();

        return round_jiffies_common(j, cpu, true);
}
EXPORT_SYMBOL_GPL(round_jiffies_up);

/**
 * round_jiffies_up_relative - round a relative jiffies value up to a full second
 * @j: relative time in jiffies which is to be rounded
 *
 * Works like round_jiffies_relative(), but never rounds down. Meant for
 * timeouts for which the exact expiry time is of little importance,
 * provided that they do not fire too early.
 */
unsigned long round_jiffies_up_relative(unsigned long j)
{
        int cpu = raw_smp_processor_id();

        return __round_jiffies_up_relative(j, cpu);
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);


/* Extract the wheel bucket index stored in the upper bits of timer->flags. */
static inline unsigned int timer_get_idx(struct timer_list *timer)
{
        unsigned int idx_bits = timer->flags & TIMER_ARRAYMASK;

        return idx_bits >> TIMER_ARRAYSHIFT;
}

/* Store wheel bucket index @idx in timer->flags, keeping all other bits. */
static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
{
        unsigned int other_bits = timer->flags & ~TIMER_ARRAYMASK;
        unsigned int idx_bits = idx << TIMER_ARRAYSHIFT;

        /* Single store so the flags word is updated in one assignment. */
        timer->flags = other_bits | idx_bits;
}

/*
 * Compute the wheel array index for expiry time @expires in level @lvl.
 * The effective (rounded up) expiry of the selected bucket is stored in
 * @bucket_expiry.
 */
static inline unsigned calc_index(unsigned long expires, unsigned lvl,
                                  unsigned long *bucket_expiry)
{
        unsigned long slot;

        /*
         * A timer must never fire early. That could happen when:
         * - the timer is armed right at the edge of a tick
         * - the expiry time is truncated in the outer wheel levels
         *
         * Always advance to the next slot at this level's granularity
         * to rule both cases out.
         */
        slot = (expires >> LVL_SHIFT(lvl)) + 1;
        *bucket_expiry = slot << LVL_SHIFT(lvl);

        return LVL_OFFS(lvl) + (slot & LVL_MASK);
}

/*
 * Select the wheel level from the distance between @expires and @clk and
 * return the bucket index within the wheel. The effective expiry time of
 * the chosen bucket is stored in @bucket_expiry.
 */
static int calc_wheel_index(unsigned long expires, unsigned long clk,
                            unsigned long *bucket_expiry)
{
        unsigned long delta = expires - clk;

        if (delta < LVL_START(1))
                return calc_index(expires, 0, bucket_expiry);
        if (delta < LVL_START(2))
                return calc_index(expires, 1, bucket_expiry);
        if (delta < LVL_START(3))
                return calc_index(expires, 2, bucket_expiry);
        if (delta < LVL_START(4))
                return calc_index(expires, 3, bucket_expiry);
        if (delta < LVL_START(5))
                return calc_index(expires, 4, bucket_expiry);
        if (delta < LVL_START(6))
                return calc_index(expires, 5, bucket_expiry);
        if (delta < LVL_START(7))
                return calc_index(expires, 6, bucket_expiry);
        if (LVL_DEPTH > 8 && delta < LVL_START(8))
                return calc_index(expires, 7, bucket_expiry);

        /* Expiry lies before @clk: use the bucket of @clk itself. */
        if ((long) delta < 0) {
                *bucket_expiry = clk;
                return clk & LVL_MASK;
        }

        /*
         * Timeouts beyond the capacity of the wheel are clamped so that
         * they expire at the capacity limit.
         */
        if (delta >= WHEEL_TIMEOUT_CUTOFF)
                expires = clk + WHEEL_TIMEOUT_MAX;

        return calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
}

/* Wake the CPU owning @base if it is idle and has to notice @timer. */
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
        if (!is_timers_nohz_active())
                return;

        /*
         * Deferrable timers neither keep a CPU out of dynticks nor are
         * they considered on the idle/nohz_full path. An IPI for a newly
         * enqueued deferrable timer would wake the remote CPU without
         * anything being done with the deferrable base, so skip the
         * remote IPI for them entirely.
         */
        if (timer->flags & TIMER_DEFERRABLE)
                return;

        /*
         * An IPI may be needed when the base is idle and the timer is
         * pinned. A non pinned timer is only queued on the remote CPU
         * when the timer was running during queueing, in which case the
         * remote CPU handles everything anyway. A CPU on its way to idle
         * cannot set base->is_idle as we hold the base lock.
         */
        if (!base->is_idle)
                return;

        WARN_ON_ONCE(!(timer->flags & TIMER_PINNED ||
                       tick_nohz_full_cpu(base->cpu)));
        wake_up_nohz_cpu(base->cpu);
}

/*
 * Queue @timer in wheel bucket @idx of @base: link it into the bucket
 * list, flag the bucket in the pending bitmap and record the index in the
 * timer flags. If the timer becomes the first one to expire on @base,
 * update the base and wake up the target CPU if needed.
 */
static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
                          unsigned int idx, unsigned long bucket_expiry)
{
        struct hlist_head *bucket = base->vectors + idx;

        hlist_add_head(&timer->entry, bucket);
        __set_bit(idx, base->pending_map);
        timer_set_idx(timer, idx);

        trace_timer_start(timer, bucket_expiry);

        /*
         * Done unless this is the new first expiring timer. The check
         * needs the effective expiry time of the timer (bucket_expiry),
         * not timer->expires.
         */
        if (!time_before(bucket_expiry, base->next_expiry))
                return;

        /*
         * Record the new next expiry and kick the CPU so it can
         * reevaluate the wheel.
         */
        base->next_expiry = bucket_expiry;
        base->timers_pending = true;
        base->next_expiry_recalc = false;
        trigger_dyntick_cpu(base, timer);
}

/* Compute the wheel bucket for timer->expires and enqueue @timer there. */
static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
        unsigned long effective_expiry;
        unsigned int bucket;

        bucket = calc_wheel_index(timer->expires, base->clk, &effective_expiry);
        enqueue_timer(base, timer, bucket, effective_expiry);
}

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

static const struct debug_obj_descr timer_debug_descr;

/*
 * Debug hint descriptor: associates a timer callback with the location of
 * another function pointer, expressed relative to the timer's address.
 * Consumed by timer_debug_hint().
 */
struct timer_hint {
        /* Timer callback this entry applies to (compared with timer->function). */
        void        (*function)(struct timer_list *t);
        /* Byte offset added to the timer address to find the hinted pointer. */
        long        offset;
};

/*
 * Build a struct timer_hint initializer for callback @fn. The offset is the
 * distance from member @timr to member @hintfn inside @container, i.e. the
 * value to add to the timer's address to reach @hintfn.
 */
#define TIMER_HINT(fn, container, timr, hintfn)                        \
        {                                                        \
                .function = fn,                                        \
                .offset          = offsetof(container, hintfn) -        \
                            offsetof(container, timr)                \
        }

/*
 * Timer callbacks for which timer_debug_hint() reports the work.func
 * member of the embedding structure instead of the timer callback itself.
 */
static const struct timer_hint timer_hints[] = {
        TIMER_HINT(delayed_work_timer_fn,
                   struct delayed_work, timer, work.func),
        TIMER_HINT(kthread_delayed_work_timer_fn,
                   struct kthread_delayed_work, timer, work.func),
};

static void *timer_debug_hint(void *addr)
{
        struct timer_list *timer = addr;
        int i;

        for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
                if (timer_hints[i].function == timer->function) {
                        void (**fn)(void) = addr + timer_hints[i].offset;

                        return *fn;
                }
        }

        return timer->function;
}

static bool timer_is_static_object(void *addr)
{
        struct timer_list *timer = addr;

        return (timer->entry.pprev == NULL &&
                timer->entry.next == TIMER_ENTRY_STATIC);
}

/*
 * timer_fixup_init is called when:
 * - an active object is initialized
 *
 * Returns true when the state was fixed up, false otherwise.
 */
static bool timer_fixup_init(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        /* Stop the active timer first, then redo the initialization. */
        del_timer_sync(timer);
        debug_object_init(timer, &timer_debug_descr);
        return true;
}

/*
 * Stub timer callback for improperly used timers. Installed by the
 * debugobjects fixup handlers; it only emits a warning when invoked.
 */
static void stub_timer(struct timer_list *unused)
{
        WARN_ON(1);
}

/*
 * timer_fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 *
 * Returns true when the state was fixed up, false otherwise.
 */
static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state == ODEBUG_STATE_NOTAVAILABLE) {
                /* Unknown object: set it up with the warning stub callback. */
                timer_setup(timer, stub_timer, 0);
                return true;
        }

        /* Activating an already active timer is only reported, not fixed. */
        if (state == ODEBUG_STATE_ACTIVE)
                WARN_ON(1);

        return false;
}

/*
 * timer_fixup_free is called when:
 * - an active object is freed
 *
 * Returns true when the state was fixed up, false otherwise.
 */
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        /* Stop the active timer first, then redo the free. */
        del_timer_sync(timer);
        debug_object_free(timer, &timer_debug_descr);
        return true;
}

/*
 * timer_fixup_assert_init is called when:
 * - an untracked/uninit-ed object is found
 *
 * Returns true when the state was fixed up, false otherwise.
 */
static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_NOTAVAILABLE)
                return false;

        /* Set the object up with the warning stub callback. */
        timer_setup(timer, stub_timer, 0);
        return true;
}

/*
 * debugobjects descriptor for struct timer_list: wires up the hint,
 * static-object detection and fixup callbacks defined above.
 */
static const struct debug_obj_descr timer_debug_descr = {
        .name                        = "timer_list",
        .debug_hint                = timer_debug_hint,
        .is_static_object        = timer_is_static_object,
        .fixup_init                = timer_fixup_init,
        .fixup_activate                = timer_fixup_activate,
        .fixup_free                = timer_fixup_free,
        .fixup_assert_init        = timer_fixup_assert_init,
};

/* Report initialization of @timer to debugobjects. */
static inline void debug_timer_init(struct timer_list *timer)
{
        debug_object_init(timer, &timer_debug_descr);
}

/* Report activation of @timer to debugobjects. */
static inline void debug_timer_activate(struct timer_list *timer)
{
        debug_object_activate(timer, &timer_debug_descr);
}

/* Report deactivation of @timer to debugobjects. */
static inline void debug_timer_deactivate(struct timer_list *timer)
{
        debug_object_deactivate(timer, &timer_debug_descr);
}

/* Have debugobjects assert that @timer has been initialized. */
static inline void debug_timer_assert_init(struct timer_list *timer)
{
        debug_object_assert_init(timer, &timer_debug_descr);
}

/* Forward declaration: used by init_timer_on_stack_key(), defined further down. */
static void do_init_timer(struct timer_list *timer,
                          void (*func)(struct timer_list *),
                          unsigned int flags,
                          const char *name, struct lock_class_key *key);

/*
 * Initialize an on-stack timer: register it with debugobjects as an
 * on-stack object, then perform the common timer initialization with the
 * given callback, flags and lockdep name/key.
 */
void init_timer_on_stack_key(struct timer_list *timer,
                             void (*func)(struct timer_list *),
                             unsigned int flags,
                             const char *name, struct lock_class_key *key)
{
        debug_object_init_on_stack(timer, &timer_debug_descr);
        do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);

/* Tell debugobjects that the on-stack @timer is going away. */
void destroy_timer_on_stack(struct timer_list *timer)
{
        debug_object_free(timer, &timer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_timer_on_stack);

#else
/* No-op variants used when CONFIG_DEBUG_OBJECTS_TIMERS is not set. */
static inline void debug_timer_init(struct timer_list *timer) { }
static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
static inline void debug_timer_assert_init(struct timer_list *timer) { }
#endif

/* Timer initialization hook: notify debugobjects and emit the init tracepoint. */
static inline void debug_init(struct timer_list *timer)
{
        debug_timer_init(timer);
        trace_timer_init(timer);
}

/* Timer removal hook: notify debugobjects and emit the cancel tracepoint. */
static inline void debug_deactivate(struct timer_list *timer)
{
        debug_timer_deactivate(timer);
        trace_timer_cancel(timer);
}

/* Check (via debugobjects, when enabled) that @timer has been initialized. */
static inline void debug_assert_init(struct timer_list *timer)
{
        debug_timer_assert_init(timer);
}

static void do_init_timer(struct timer_list *timer,
                          void (*func)(struct timer_list *),
                          unsigned int flags,
                          const char *name, struct lock_class_key *key)
{
        timer->entry.pprev = NULL;
        timer->function = func;
        if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
                flags &= TIMER_INIT_FLAGS;
        timer->flags = flags | raw_smp_processor_id();
        lockdep_init_map(&timer->lockdep_map, name, key, 0);
}

/**
 * init_timer_key - initialize a timer
 * @timer: the timer to be initialized
 * @func: timer callback function
 * @flags: timer flags
 * @name: name of the timer
 * @key: lockdep class key of the fake lock used for tracking timer
 *       sync lock dependencies
 *
 * init_timer_key() must be done to a timer prior to calling *any* of the
 * other timer functions.
 *
 * The debug/trace init hooks are invoked before the timer fields are set up.
 */
void init_timer_key(struct timer_list *timer,
                    void (*func)(struct timer_list *), unsigned int flags,
                    const char *name, struct lock_class_key *key)
{
        debug_init(timer);
        do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);

/*
 * Unlink @timer from its bucket list. With @clear_pending the pprev
 * pointer is reset to NULL as well; the next pointer is always poisoned.
 */
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
        debug_deactivate(timer);

        __hlist_del(&timer->entry);

        if (clear_pending)
                timer->entry.pprev = NULL;
        timer->entry.next = LIST_POISON2;
}

/*
 * Remove @timer from @base if it is pending.
 *
 * Returns 0 when the timer was not pending, 1 when it was detached.
 */
static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
                             bool clear_pending)
{
        struct hlist_head *bucket;
        unsigned idx;

        if (!timer_pending(timer))
                return 0;

        idx = timer_get_idx(timer);
        bucket = base->vectors + idx;

        /*
         * Last timer in this bucket: clear its pending bit and request a
         * recalculation of the next expiry.
         */
        if (hlist_is_singular_node(&timer->entry, bucket)) {
                __clear_bit(idx, base->pending_map);
                base->next_expiry_recalc = true;
        }

        detach_timer(timer, clear_pending);
        return 1;
}

/* Pick the timer base on @cpu that matches the timer flags @tflags. */
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
        int index;

        /*
         * A deferrable timer uses the deferrable base when NO_HZ_COMMON
         * is set. Otherwise pinned timers go to the local base and all
         * others to the global base.
         */
        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
                index = BASE_DEF;
        else if (tflags & TIMER_PINNED)
                index = BASE_LOCAL;
        else
                index = BASE_GLOBAL;

        return per_cpu_ptr(&timer_bases[index], cpu);
}

/* Pick the timer base on the current CPU that matches the flags @tflags. */
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
        int index;

        /*
         * A deferrable timer uses the deferrable base when NO_HZ_COMMON
         * is set. Otherwise pinned timers go to the local base and all
         * others to the global base.
         */
        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
                index = BASE_DEF;
        else if (tflags & TIMER_PINNED)
                index = BASE_LOCAL;
        else
                index = BASE_GLOBAL;

        return this_cpu_ptr(&timer_bases[index]);
}

/* Look up the base for @tflags on the CPU encoded in the flags themselves. */
static inline struct timer_base *get_timer_base(u32 tflags)
{
        u32 cpu = tflags & TIMER_CPUMASK;

        return get_timer_cpu_base(tflags, cpu);
}

/*
 * Advance base->clk towards @basej without ever moving it backwards and
 * without going past base->next_expiry.
 */
static inline void __forward_timer_base(struct timer_base *base,
                                        unsigned long basej)
{
        /*
         * Forwarding is only possible when @basej is past base->clk,
         * otherwise base->clk would be rewound.
         */
        if (time_before_eq(basej, base->clk))
                return;

        /* Next expiry is still ahead of @basej: fast forward to @basej. */
        if (time_after(base->next_expiry, basej)) {
                base->clk = basej;
                return;
        }

        /* Otherwise forward to the next expiry, but never backwards. */
        if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
                return;

        base->clk = base->next_expiry;
}

/* Forward @base using a single snapshot of the current jiffies value. */
static inline void forward_timer_base(struct timer_base *base)
{
        unsigned long now = READ_ONCE(jiffies);

        __forward_timer_base(base, now);
}

/*
 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
 * that all timers which are tied to this base are locked, and the base itself
 * is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found in the base->vectors array.
 *
 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
 * to wait until the migration is done.
 *
 * Returns the base of @timer with base->lock held. The interrupt state
 * saved while taking the lock is stored in @flags for the matching
 * raw_spin_unlock_irqrestore().
 */
static struct timer_base *lock_timer_base(struct timer_list *timer,
                                          unsigned long *flags)
        __acquires(timer->base->lock)
{
        for (;;) {
                struct timer_base *base;
                u32 tf;

                /*
                 * We need to use READ_ONCE() here, otherwise the compiler
                 * might re-read @tf between the check for TIMER_MIGRATING
                 * and spin_lock().
                 */
                tf = READ_ONCE(timer->flags);

                if (!(tf & TIMER_MIGRATING)) {
                        base = get_timer_base(tf);
                        raw_spin_lock_irqsave(&base->lock, *flags);
                        /*
                         * The flags were sampled without the lock held.
                         * Only if they are unchanged is this still the
                         * timer's base; otherwise drop the lock and retry.
                         */
                        if (timer->flags == tf)
                                return base;
                        raw_spin_unlock_irqrestore(&base->lock, *flags);
                }
                /* Migration in progress or lost the race: spin and retry. */
                cpu_relax();
        }
}

/*
 * Option bits for __mod_timer():
 * MOD_TIMER_PENDING_ONLY - do not enqueue the timer if it was not pending
 * MOD_TIMER_REDUCE       - only modify a pending timer to expire earlier
 * MOD_TIMER_NOTPENDING   - skip the lockless "already pending" fast path
 */
#define MOD_TIMER_PENDING_ONLY                0x01
#define MOD_TIMER_REDUCE                0x02
#define MOD_TIMER_NOTPENDING                0x04

/*
 * Common implementation behind mod_timer(), mod_timer_pending(),
 * timer_reduce() and the add_timer*() variants.
 *
 * @timer:   the timer to (re)arm
 * @expires: new absolute expiry time in jiffies
 * @options: combination of MOD_TIMER_* bits
 *
 * Returns 1 if the timer was pending (requeued, updated in place or left
 * untouched), 0 if it was not pending or the operation was discarded
 * because timer->function is NULL.
 */
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
        unsigned long clk = 0, flags, bucket_expiry;
        struct timer_base *base, *new_base;
        unsigned int idx = UINT_MAX;
        int ret = 0;

        debug_assert_init(timer);

        /*
         * This is a common optimization triggered by the networking code - if
         * the timer is re-modified to have the same timeout or ends up in the
         * same array bucket then just return:
         */
        if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
                /*
                 * The downside of this optimization is that it can result in
                 * larger granularity than you would get from adding a new
                 * timer with this expiry.
                 */
                long diff = timer->expires - expires;

                if (!diff)
                        return 1;
                if (options & MOD_TIMER_REDUCE && diff <= 0)
                        return 1;

                /*
                 * We lock timer base and calculate the bucket index right
                 * here. If the timer ends up in the same bucket, then we
                 * just update the expiry time and avoid the whole
                 * dequeue/enqueue dance.
                 */
                base = lock_timer_base(timer, &flags);
                /*
                 * Has @timer been shutdown? This needs to be evaluated
                 * while holding base lock to prevent a race against the
                 * shutdown code.
                 */
                if (!timer->function)
                        goto out_unlock;

                forward_timer_base(base);

                /*
                 * Repeat the REDUCE check under the base lock: the
                 * lockless check above may have raced with a concurrent
                 * modification.
                 */
                if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
                    time_before_eq(timer->expires, expires)) {
                        ret = 1;
                        goto out_unlock;
                }

                clk = base->clk;
                idx = calc_wheel_index(expires, clk, &bucket_expiry);

                /*
                 * Retrieve and compare the array index of the pending
                 * timer. If it matches set the expiry to the new value so a
                 * subsequent call will exit in the expires check above.
                 */
                if (idx == timer_get_idx(timer)) {
                        if (!(options & MOD_TIMER_REDUCE))
                                timer->expires = expires;
                        else if (time_after(timer->expires, expires))
                                timer->expires = expires;
                        ret = 1;
                        goto out_unlock;
                }
        } else {
                base = lock_timer_base(timer, &flags);
                /*
                 * Has @timer been shutdown? This needs to be evaluated
                 * while holding base lock to prevent a race against the
                 * shutdown code.
                 */
                if (!timer->function)
                        goto out_unlock;

                forward_timer_base(base);
        }

        /* Keep entry.pprev intact (clear_pending == false) while requeueing. */
        ret = detach_if_pending(timer, base, false);
        if (!ret && (options & MOD_TIMER_PENDING_ONLY))
                goto out_unlock;

        new_base = get_timer_this_cpu_base(timer->flags);

        if (base != new_base) {
                /*
                 * We are trying to schedule the timer on the new base.
                 * However we can't change timer's base while it is running,
                 * otherwise timer_delete_sync() can't detect that the timer's
                 * handler has not yet finished. This also guarantees that the
                 * timer is serialized wrt itself.
                 */
                if (likely(base->running_timer != timer)) {
                        /* See the comment in lock_timer_base() */
                        timer->flags |= TIMER_MIGRATING;

                        /*
                         * Only the raw locks are switched here; the saved
                         * interrupt state in @flags is restored at
                         * out_unlock.
                         */
                        raw_spin_unlock(&base->lock);
                        base = new_base;
                        raw_spin_lock(&base->lock);
                        WRITE_ONCE(timer->flags,
                                   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
                        forward_timer_base(base);
                }
        }

        debug_timer_activate(timer);

        timer->expires = expires;
        /*
         * If 'idx' was calculated above and the base time did not advance
         * between calculating 'idx' and possibly switching the base, only
         * enqueue_timer() is required. Otherwise we need to (re)calculate
         * the wheel index via internal_add_timer().
         */
        if (idx != UINT_MAX && clk == base->clk)
                enqueue_timer(base, timer, idx, bucket_expiry);
        else
                internal_add_timer(base, timer);

out_unlock:
        raw_spin_unlock_irqrestore(&base->lock, flags);

        return ret;
}

/**
 * mod_timer_pending - Modify a pending timer's timeout
 * @timer:        The pending timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * mod_timer_pending() is the same for pending timers as mod_timer(), but
 * will not activate inactive timers.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and not modified or was in
 *          shutdown state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires
 */
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
        /* Requeue only a pending timer; never arm an inactive one. */
        const unsigned int options = MOD_TIMER_PENDING_ONLY;

        return __mod_timer(timer, expires, options);
}
EXPORT_SYMBOL(mod_timer_pending);

/**
 * mod_timer - Modify a timer's timeout
 * @timer:        The timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * mod_timer(timer, expires) is equivalent to:
 *
 *     del_timer(timer); timer->expires = expires; add_timer(timer);
 *
 * mod_timer() is more efficient than the above open coded sequence. In
 * case that the timer is inactive, the del_timer() part is a NOP. The
 * timer is in any case activated with the new expiry time @expires.
 *
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already running timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded. In this case the return value is 0 and meaningless.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *          state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires did
 *          not change the effective expiry time
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
        /* Plain modification: no special __mod_timer() options. */
        const unsigned int options = 0;

        return __mod_timer(timer, expires, options);
}
EXPORT_SYMBOL(mod_timer);

/**
 * timer_reduce - Modify a timer's timeout if it would reduce the timeout
 * @timer:        The timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * timer_reduce() is very similar to mod_timer(), except that it will only
 * modify an enqueued timer if that would reduce the expiration time. If
 * @timer is not enqueued it starts the timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *          state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires
 *          did not change the effective expiry time such that the
 *          timer would expire earlier than already scheduled
 */
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
        /* Only ever move the expiry of an enqueued timer forward in time. */
        const unsigned int options = MOD_TIMER_REDUCE;

        return __mod_timer(timer, expires, options);
}
EXPORT_SYMBOL(timer_reduce);

/**
 * add_timer - Start a timer
 * @timer:        The timer to be started
 *
 * Start @timer to expire at @timer->expires in the future. @timer->expires
 * is the absolute expiry time measured in 'jiffies'. When the timer expires
 * timer->function(timer) will be invoked from soft interrupt context.
 *
 * The @timer->expires and @timer->function fields must be set prior
 * to calling this function.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * If @timer->expires is already in the past @timer will be queued to
 * expire at the next timer tick.
 *
 * This can only operate on an inactive timer. Attempts to invoke this on
 * an active timer are rejected with a warning.
 */
void add_timer(struct timer_list *timer)
{
        unsigned long expires;

        /* Starting an already pending timer is rejected with a warning. */
        if (WARN_ON_ONCE(timer_pending(timer)))
                return;

        expires = timer->expires;
        __mod_timer(timer, expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);

/**
 * add_timer_local() - Start a timer on the local CPU
 * @timer:        The timer to be started
 *
 * Same as add_timer() except that the timer flag TIMER_PINNED is set.
 *
 * See add_timer() for further details.
 */
void add_timer_local(struct timer_list *timer)
{
        unsigned long expires;

        /* Starting an already pending timer is rejected with a warning. */
        if (WARN_ON_ONCE(timer_pending(timer)))
                return;

        expires = timer->expires;

        /* Mark the timer pinned before it is queued. */
        timer->flags |= TIMER_PINNED;
        __mod_timer(timer, expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer_local);

/**
 * add_timer_global() - Start a timer without TIMER_PINNED flag set
 * @timer:        The timer to be started
 *
 * Same as add_timer() except that the timer flag TIMER_PINNED is unset.
 *
 * See add_timer() for further details.
 */
void add_timer_global(struct timer_list *timer)
{
        unsigned long expires;

        /* Starting an already pending timer is rejected with a warning. */
        if (WARN_ON_ONCE(timer_pending(timer)))
                return;

        expires = timer->expires;

        /* Drop the pinned marker before the timer is queued. */
        timer->flags &= ~TIMER_PINNED;
        __mod_timer(timer, expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer_global);

/**
 * add_timer_on - Start a timer on a particular CPU
 * @timer:        The timer to be started
 * @cpu:        The CPU to start it on
 *
 * Same as add_timer() except that it starts the timer on the given CPU and
 * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in
 * the next round, add_timer_global() should be used instead as it unsets
 * the TIMER_PINNED flag.
 *
 * See add_timer() for further details.
 */
void add_timer_on(struct timer_list *timer, int cpu)
{
        struct timer_base *new_base, *base;
        unsigned long flags;

        debug_assert_init(timer);

        /* Starting an already pending timer is rejected with a warning. */
        if (WARN_ON_ONCE(timer_pending(timer)))
                return;

        /* Make sure timer flags have TIMER_PINNED flag set */
        timer->flags |= TIMER_PINNED;

        new_base = get_timer_cpu_base(timer->flags, cpu);

        /*
         * If @timer was on a different CPU, it should be migrated with the
         * old base locked to prevent other operations proceeding with the
         * wrong base locked.  See lock_timer_base().
         */
        base = lock_timer_base(timer, &flags);
        /*
         * Has @timer been shutdown? This needs to be evaluated while
         * holding base lock to prevent a race against the shutdown code.
         */
        if (!timer->function)
                goto out_unlock;

        if (base != new_base) {
                timer->flags |= TIMER_MIGRATING;

                /*
                 * Only the raw locks are switched here; the saved interrupt
                 * state in @flags is restored at out_unlock.
                 */
                raw_spin_unlock(&base->lock);
                base = new_base;
                raw_spin_lock(&base->lock);
                /* Record the new CPU; this also clears TIMER_MIGRATING. */
                WRITE_ONCE(timer->flags,
                           (timer->flags & ~TIMER_BASEMASK) | cpu);
        }
        /* Bring base->clk up to date before the wheel index is computed. */
        forward_timer_base(base);

        debug_timer_activate(timer);
        internal_add_timer(base, timer);
out_unlock:
        raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);

/**
 * __timer_delete - Internal function: Deactivate a timer
 * @timer:        The timer to be deactivated
 * @shutdown:        If true, this indicates that the timer is about to be
 *                shutdown permanently.
 *
 * If @shutdown is true then @timer->function is set to NULL under the
 * timer base lock which prevents further rearming of the timer. In that
 * case any attempt to rearm @timer after this function returns will be
 * silently ignored.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
static int __timer_delete(struct timer_list *timer, bool shutdown)
{
        struct timer_base *base;
        unsigned long flags;
        int was_pending;

        debug_assert_init(timer);

        /*
         * Without @shutdown a timer which is not pending needs no further
         * work and the lock can be skipped.
         *
         * With @shutdown the lock must be taken regardless of the pending
         * state. A concurrent rearm could otherwise slip in between the
         * lockless pending check and the lock acquisition. Taking the lock
         * ensures that such a newly enqueued timer is dequeued and cannot
         * reach the expiry code with timer->function == NULL.
         *
         * If timer->function is executing right now, this also ensures
         * that the callback cannot requeue the timer.
         */
        if (!timer_pending(timer) && !shutdown)
                return 0;

        base = lock_timer_base(timer, &flags);
        was_pending = detach_if_pending(timer, base, true);
        if (shutdown)
                timer->function = NULL;
        raw_spin_unlock_irqrestore(&base->lock, flags);

        return was_pending;
}

/**
 * timer_delete - Deactivate a timer
 * @timer:        The timer to be deactivated
 *
 * The function only deactivates a pending timer, but contrary to
 * timer_delete_sync() it does not take into account whether the timer's
 * callback function is concurrently executed on a different CPU or not.
 * It neither prevents rearming of the timer.  If @timer can be rearmed
 * concurrently then the return value of this function is meaningless.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int timer_delete(struct timer_list *timer)
{
        return __timer_delete(timer, false);
}
EXPORT_SYMBOL(timer_delete);

/**
 * timer_shutdown - Deactivate a timer and prevent rearming
 * @timer:        The timer to be deactivated
 *
 * The function does not wait for an eventually running timer callback on a
 * different CPU but it prevents rearming of the timer. Any attempt to arm
 * @timer after this function returns will be silently ignored.
 *
 * This function is useful for teardown code and should only be used when
 * timer_shutdown_sync() cannot be invoked due to locking or context constraints.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown(struct timer_list *timer)
{
        return __timer_delete(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown);

/**
 * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
 * @timer:        Timer to deactivate
 * @shutdown:        If true, this indicates that the timer is about to be
 *                shutdown permanently.
 *
 * If @shutdown is true then @timer->function is set to NULL under the
 * timer base lock which prevents further rearming of the timer. Any
 * attempt to rearm @timer after this function returns will be silently
 * ignored.
 *
 * This function cannot guarantee that the timer cannot be rearmed
 * right after dropping the base lock if @shutdown is false. That
 * needs to be prevented by the calling code if necessary.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
{
        struct timer_base *base;
        unsigned long irqflags;
        int result;

        debug_assert_init(timer);

        base = lock_timer_base(timer, &irqflags);

        /*
         * A timer which is recorded as the running timer of its base is
         * not detached here; the caller is told so with -1.
         */
        if (base->running_timer == timer)
                result = -1;
        else
                result = detach_if_pending(timer, base, true);

        /*
         * The callback pointer is cleared while base->lock is still held,
         * independent of whether the detach attempt above was made.
         */
        if (shutdown)
                timer->function = NULL;

        raw_spin_unlock_irqrestore(&base->lock, irqflags);

        return result;
}

/**
 * try_to_del_timer_sync - Try to deactivate a timer
 * @timer:        Timer to deactivate
 *
 * This function tries to deactivate a timer. On success the timer is not
 * queued and the timer callback function is not running on any CPU.
 *
 * This function does not guarantee that the timer cannot be rearmed right
 * after dropping the base lock. That needs to be prevented by the calling
 * code if necessary.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
int try_to_del_timer_sync(struct timer_list *timer)
{
        return __try_to_del_timer_sync(timer, false);
}
EXPORT_SYMBOL(try_to_del_timer_sync);

#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT only: initialize the expiry lock of @base (__init code). */
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
        spin_lock_init(&base->expiry_lock);
}

static inline void timer_base_lock_expiry(struct timer_base *base)
{
        spin_lock(&base->expiry_lock);
}

static inline void timer_base_unlock_expiry(struct timer_base *base)
{
        spin_unlock(&base->expiry_lock);
}

/*
 * The counterpart to del_timer_wait_running().
 *
 * If there is a waiter for base->expiry_lock, then it was waiting for the
 * timer callback to finish. Drop expiry_lock and reacquire it. That allows
 * the waiter to acquire the lock and make progress.
 */
static void timer_sync_wait_running(struct timer_base *base)
{
        /* Without a registered waiter there is nothing to hand over. */
        if (!atomic_read(&base->timer_waiters))
                return;

        /*
         * Let go of base->lock and expiry_lock so that a waiter can get
         * hold of expiry_lock, then take both again in the reverse order
         * of the release.
         */
        raw_spin_unlock_irq(&base->lock);
        spin_unlock(&base->expiry_lock);
        spin_lock(&base->expiry_lock);
        raw_spin_lock_irq(&base->lock);
}

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion, if the softirq thread on a remote CPU
 * got preempted, and it prevents a live lock when the task which tries to
 * delete a timer preempted the softirq thread running the timer callback
 * function.
 */
static void del_timer_wait_running(struct timer_list *timer)
{
        struct timer_base *base;
        u32 tf = READ_ONCE(timer->flags);

        /* Migrating and irqsafe timers do not take the expiry lock path. */
        if (tf & (TIMER_MIGRATING | TIMER_IRQSAFE))
                return;

        base = get_timer_base(tf);

        /*
         * Announce the contention via timer_waiters, then acquire the
         * expiry lock. The softirq holds that lock across the timer
         * callback, so getting it means the callback has finished. The
         * lock is released right away to let the softirq continue with
         * the next timer. The timer might theoretically be running again
         * by then; that is very unlikely and merely results in one more
         * round of the caller's wait loop.
         */
        atomic_inc(&base->timer_waiters);
        spin_lock_bh(&base->expiry_lock);
        atomic_dec(&base->timer_waiters);
        spin_unlock_bh(&base->expiry_lock);
}
#else
/* !CONFIG_PREEMPT_RT: the expiry lock helpers are empty stubs. */
static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
static inline void timer_base_lock_expiry(struct timer_base *base) { }
static inline void timer_base_unlock_expiry(struct timer_base *base) { }
static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif

/**
 * __timer_delete_sync - Internal function: Deactivate a timer and wait
 *                         for the handler to finish.
 * @timer:        The timer to be deactivated
 * @shutdown:        If true, @timer->function will be set to NULL under the
 *                timer base lock which prevents rearming of @timer
 *
 * If @shutdown is not set the timer can be rearmed later. If the timer can
 * be rearmed concurrently, i.e. after dropping the base lock then the
 * return value is meaningless.
 *
 * If @shutdown is set then @timer->function is set to NULL under timer
 * base lock which prevents rearming of the timer. Any attempt to rearm
 * a shutdown timer is silently ignored.
 *
 * If the timer should be reused after shutdown it has to be initialized
 * again.
 *
 * Return:
 * * %0        - The timer was not pending
 * * %1        - The timer was pending and deactivated
 */
static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
{
        int ret;

#ifdef CONFIG_LOCKDEP
        unsigned long flags;

        /*
         * A lockdep backtrace pointing here means the synchronization
         * rules documented above were violated by the caller.
         */
        local_irq_save(flags);
        lock_map_acquire(&timer->lockdep_map);
        lock_map_release(&timer->lockdep_map);
        local_irq_restore(flags);
#endif
        /*
         * Calling this from hardirq context for a timer which is not
         * irqsafe could deadlock, so complain about it.
         */
        WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));

        /*
         * The slowpath in del_timer_wait_running() requires that the
         * caller is able to sleep on PREEMPT_RT.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
                lockdep_assert_preemption_enabled();

        /* Retry for as long as the callback is reported as running. */
        for (;;) {
                ret = __try_to_del_timer_sync(timer, shutdown);
                if (likely(ret >= 0))
                        break;

                del_timer_wait_running(timer);
                cpu_relax();
        }

        return ret;
}

/**
 * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
 * @timer:        The timer to be deactivated
 *
 * Synchronization rules: Callers must prevent restarting of the timer,
 * otherwise this function is meaningless. It must not be called from
 * interrupt contexts unless the timer is an irqsafe one. The caller must
 * not hold locks which would prevent completion of the timer's callback
 * function. The timer's handler must not call add_timer_on(). Upon exit
 * the timer is not queued and the handler is not running on any CPU.
 *
 * For !irqsafe timers, the caller must not hold locks that are held in
 * interrupt context. Even if the lock has nothing to do with the timer in
 * question.  Here's why::
 *
 *    CPU0                             CPU1
 *    ----                             ----
 *                                     <SOFTIRQ>
 *                                       call_timer_fn();
 *                                       base->running_timer = mytimer;
 *    spin_lock_irq(somelock);
 *                                     <IRQ>
 *                                        spin_lock(somelock);
 *    timer_delete_sync(mytimer);
 *    while (base->running_timer == mytimer);
 *
 * Now timer_delete_sync() will never return and never release somelock.
 * The interrupt on the other CPU is waiting to grab somelock but it has
 * interrupted the softirq that CPU0 is waiting to finish.
 *
 * This function cannot guarantee that the timer is not rearmed again by
 * some concurrent or preempting code, right after it dropped the base
 * lock. If there is the possibility of a concurrent rearm then the return
 * value of the function is meaningless.
 *
 * If such a guarantee is needed, e.g. for teardown situations then use
 * timer_shutdown_sync() instead.
 *
 * Return:
 * * %0        - The timer was not pending
 * * %1        - The timer was pending and deactivated
 */
int timer_delete_sync(struct timer_list *timer)
{
        return __timer_delete_sync(timer, false);
}
EXPORT_SYMBOL(timer_delete_sync);

/**
 * timer_shutdown_sync - Shutdown a timer and prevent rearming
 * @timer: The timer to be shutdown
 *
 * When the function returns it is guaranteed that:
 *   - @timer is not queued
 *   - The callback function of @timer is not running
 *   - @timer cannot be enqueued again. Any attempt to rearm
 *     @timer is silently ignored.
 *
 * See timer_delete_sync() for synchronization rules.
 *
 * This function is useful for final teardown of an infrastructure where
 * the timer is subject to a circular dependency problem.
 *
 * A common pattern for this is a timer and a workqueue where the timer can
 * schedule work and work can arm the timer. On shutdown the workqueue must
 * be destroyed and the timer must be prevented from rearming. Unless the
 * code has conditionals like 'if (mything->in_shutdown)' to prevent that
 * there is no way to get this correct with timer_delete_sync().
 *
 * timer_shutdown_sync() is solving the problem. The correct ordering of
 * calls in this case is:
 *
 *        timer_shutdown_sync(&mything->timer);
 *        workqueue_destroy(&mything->workqueue);
 *
 * After this 'mything' can be safely freed.
 *
 * This obviously implies that the timer is not required to be functional
 * for the rest of the shutdown operation.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown_sync(struct timer_list *timer)
{
        return __timer_delete_sync(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown_sync);

/*
 * Run the callback @fn for @timer, bracketed by the expire entry/exit
 * tracepoints and the lockdep map acquire/release. @baseclk is only
 * handed to the entry tracepoint. A preempt count which differs after
 * the callback is reported once and then put back to its old value.
 */
static void call_timer_fn(struct timer_list *timer,
                          void (*fn)(struct timer_list *),
                          unsigned long baseclk)
{
        int entry_count = preempt_count();

#ifdef CONFIG_LOCKDEP
        /*
         * The callback is allowed to free the timer it was invoked for,
         * and lockdep has to cope with that as well. Work on a private
         * copy of timer->lockdep_map to avoid bogus "held lock freed"
         * warnings and to avoid looking into a map which may be gone.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
        /*
         * Acquiring the lock_map around the fn() call, both here and in
         * timer_delete_sync(), ties the two lock chains together.
         */
        lock_map_acquire(&lockdep_map);

        trace_timer_expire_entry(timer, baseclk);
        fn(timer);
        trace_timer_expire_exit(timer);

        lock_map_release(&lockdep_map);

        /* Common case: the callback left the preempt count balanced. */
        if (entry_count == preempt_count())
                return;

        WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
                  fn, entry_count, preempt_count());
        /*
         * Put the preempt count back. This gives a decent chance to
         * survive and to extract information. A lock which the callback
         * left held is bad luck, but no worse than the BUG() which used
         * to be here.
         */
        preempt_count_set(entry_count);
}

/*
 * Detach every timer queued on @head and invoke its callback. base->lock
 * is dropped around each callback and base->running_timer marks the
 * timer whose callback is in progress.
 */
static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
        /*
         * Only needed for tracing. base->clk was incremented right before
         * this function got called, but the expiry refers to the previous
         * base->clk value.
         */
        unsigned long baseclk = base->clk - 1;

        while (!hlist_empty(head)) {
                struct timer_list *timer = hlist_entry(head->first,
                                                       struct timer_list,
                                                       entry);
                void (*fn)(struct timer_list *);

                base->running_timer = timer;
                detach_timer(timer, true);

                fn = timer->function;

                if (WARN_ON_ONCE(!fn)) {
                        /* Not expected to happen; skip the timer. */
                        base->running_timer = NULL;
                        continue;
                }

                if (!(timer->flags & TIMER_IRQSAFE)) {
                        /* Regular timer: callback runs with irqs enabled. */
                        raw_spin_unlock_irq(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock_irq(&base->lock);
                        base->running_timer = NULL;
                        timer_sync_wait_running(base);
                } else {
                        /* irqsafe timer: irq state is left untouched. */
                        raw_spin_unlock(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock(&base->lock);
                        base->running_timer = NULL;
                }
        }
}

static int collect_expired_timers(struct timer_base *base,
                                  struct hlist_head *heads)
{
        unsigned long clk = base->clk = base->next_expiry;
        struct hlist_head *vec;
        int i, levels = 0;
        unsigned int idx;

        for (i = 0; i < LVL_DEPTH; i++) {
                idx = (clk & LVL_MASK) + i * LVL_SIZE;

                if (__test_and_clear_bit(idx, base->pending_map)) {
                        vec = base->vectors + idx;
                        hlist_move_list(vec, heads++);
                        levels++;
                }
                /* Is it time to look at the next level? */
                if (clk & LVL_CLK_MASK)
                        break;
                /* Shift clock for the next level granularity */
                clk >>= LVL_CLK_SHIFT;
        }
        return levels;
}

/*
 * Find the next pending bucket of a level. Search from level start (@offset)
 * + @clk upwards and if nothing there, search from start of the level
 * (@offset) up to @offset + clk.
 */
static int next_pending_bucket(struct timer_base *base, unsigned offset,
                               unsigned clk)
{
        unsigned first = offset + clk;
        unsigned limit = offset + LVL_SIZE;
        unsigned bit;

        /* First pass: from the current position to the end of the level */
        bit = find_next_bit(base->pending_map, limit, first);
        if (bit < limit)
                return bit - first;

        /* Second pass: wrap around and search the start of the level */
        bit = find_next_bit(base->pending_map, first, offset);
        if (bit < first)
                return bit + LVL_SIZE - first;

        /* Nothing pending in this level */
        return -1;
}

/*
 * Search the first expiring timer in the various clock levels. Caller must
 * hold base->lock.
 *
 * Store next expiry time in base->next_expiry.
 */
static void next_expiry_recalc(struct timer_base *base)
{
        unsigned long clk = base->clk;
        unsigned long next = base->clk + NEXT_TIMER_MAX_DELTA;
        unsigned lvl, offset = 0;

        for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
                unsigned long lvl_clk = clk & LVL_CLK_MASK;
                int pos = next_pending_bucket(base, offset, clk & LVL_MASK);

                if (pos >= 0) {
                        unsigned long tmp;

                        tmp = (clk + (unsigned long) pos) << LVL_SHIFT(lvl);
                        if (time_before(tmp, next))
                                next = tmp;

                        /*
                         * An expiry which happens before the next level
                         * is reached makes looking further pointless.
                         */
                        if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
                                break;
                }
                /*
                 * Compute the clock for the next level. When the lower
                 * bits of the current level clock are zero, the next level
                 * is looked at as is. Otherwise it is advanced by one,
                 * because that is the next expiring bucket in that level.
                 * base->clk is the next expiring jiffie. So in case of:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    0
                 *
                 * all levels have to be looked at @index 0. With
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    2
                 *
                 * LVL0 has the next expiring bucket @index 2. The upper
                 * levels have the next expiring bucket @index 1.
                 *
                 * The same rules apply when the propagation wraps the next
                 * level:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    F    2
                 *
                 * After looking at LVL0 this becomes:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1
                 *  0    0    0    1    0
                 *
                 * There is no propagation from LVL1 to LVL2, because that
                 * happened with the add already, but propagation has to
                 * continue from LVL2 to LVL3.
                 *
                 * Checking whether the lower bits of the current level are
                 * zero or not is therefore sufficient for all cases.
                 */
                clk = (clk >> LVL_CLK_SHIFT) + (lvl_clk ? 1 : 0);
        }

        base->next_expiry = next;
        base->next_expiry_recalc = false;
        /* An untouched @next means that no pending bucket was found */
        base->timers_pending = next != base->clk + NEXT_TIMER_MAX_DELTA;
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Check, if the next hrtimer event is before the next timer wheel
 * event:
 */
static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
        /*
         * With high resolution timers enabled hrtimer_get_next_event()
         * returns KTIME_MAX.
         */
        u64 hrt_next = hrtimer_get_next_event();

        /* The timer wheel event is not later than the hrtimer event */
        if (expires <= hrt_next)
                return expires;

        /*
         * The next hrtimer is expired already: hand back the tick base
         * time so that the tick fires immediately.
         */
        if (hrt_next <= basem)
                return basem;

        /*
         * Round up to the next jiffie. High resolution timers are off,
         * so hrtimers are expired from the tick, and this tick really has
         * to expire the timer to avoid a ping pong of the nohz stop code.
         *
         * DIV_ROUND_UP_ULL is used to keep gcc from calling __divdi3.
         */
        return DIV_ROUND_UP_ULL(hrt_next, TICK_NSEC) * TICK_NSEC;
}

/*
 * Return the next expiry of @base, recalculating it first when the base
 * is flagged for recalculation.
 */
static unsigned long next_timer_interrupt(struct timer_base *base,
                                          unsigned long basej)
{
        if (base->next_expiry_recalc)
                next_expiry_recalc(base);

        if (base->timers_pending)
                return base->next_expiry;

        /*
         * The base is empty. Push next_expiry into the future so that the
         * timer softirq is not raised needlessly once the stale value is
         * reached without any timer pending.
         *
         * The update also keeps timer_base::next_expiry values easy to
         * compare when looking for the base with the first pending timer.
         */
        base->next_expiry = basej + NEXT_TIMER_MAX_DELTA;

        return base->next_expiry;
}

/*
 * Determine the next expiry (in jiffies) of @base_local and @base_global,
 * store the matching clock monotonic values in @tevt where applicable and
 * return the earlier of the two expiries.
 */
static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem,
                                                struct timer_base *base_local,
                                                struct timer_base *base_global,
                                                struct timer_events *tevt)
{
        unsigned long next_local = next_timer_interrupt(base_local, basej);
        unsigned long next_global = next_timer_interrupt(base_global, basej);
        bool local_first = time_before_eq(next_local, next_global);
        unsigned long nextevt;

        if (local_first)
                nextevt = next_local;
        else
                nextevt = next_global;

        /*
         * When @nextevt is at most one tick away, it is used and stored
         * as the local expiry value. The next global event does not
         * matter then and can stay at KTIME_MAX.
         */
        if (time_before_eq(nextevt, basej + 1)) {
                /* A tick was missed already: force a delta of 0 */
                if (time_before(nextevt, basej))
                        nextevt = basej;
                tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC;

                /*
                 * Only the remote check needs this, but doing it for both
                 * call sites does no harm:
                 *
                 * * Remote callers take care of global timers only, as the
                 *   CPU handles its local timers itself. Without updating
                 *   tevt->global with the already missed first global
                 *   timer, that timer could be missed completely.
                 *
                 * * Local callers ignore tevt->global anyway when nextevt
                 *   is at most one tick away.
                 */
                if (!local_first)
                        tevt->global = tevt->local;
                return nextevt;
        }

        /*
         * Update the tevt.* values:
         *
         * The global event can be ignored when the local queue expires
         * first. An empty global queue requires no update either.
         */
        if (!local_first && base_global->timers_pending)
                tevt->global = basem + (u64)(next_global - basej) * TICK_NSEC;

        if (base_local->timers_pending)
                tevt->local = basem + (u64)(next_local - basej) * TICK_NSEC;

        return nextevt;
}

# ifdef CONFIG_SMP
/**
 * fetch_next_timer_interrupt_remote() - Store next timers into @tevt
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 * @tevt:        Pointer to the storage for the expiry values
 * @cpu:        Remote CPU
 *
 * Stores the next pending local and global timer expiry values in the
 * struct pointed to by @tevt. If a queue is empty the corresponding
 * field is set to KTIME_MAX. If local event expires before global
 * event, global event is set to KTIME_MAX as well.
 *
 * Caller needs to make sure timer base locks are held (use
 * timer_lock_remote_bases() for this purpose).
 */
void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem,
                                       struct timer_events *tevt,
                                       unsigned int cpu)
{
        struct timer_base *base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
        struct timer_base *base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        /* Both base locks of @cpu must be held by the caller */
        lockdep_assert_held(&base_local->lock);
        lockdep_assert_held(&base_global->lock);

        /* Start out with "no event" for both queues */
        tevt->global = KTIME_MAX;
        tevt->local = KTIME_MAX;

        fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt);
}

/**
 * timer_unlock_remote_bases - unlock timer bases of cpu
 * @cpu:        Remote CPU
 *
 * Unlocks the remote timer bases.
 */
void timer_unlock_remote_bases(unsigned int cpu)
        __releases(timer_bases[BASE_LOCAL]->lock)
        __releases(timer_bases[BASE_GLOBAL]->lock)
{
        struct timer_base *base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
        struct timer_base *base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);

        /* The global base lock is dropped first, then the local one */
        raw_spin_unlock(&base_global->lock);
        raw_spin_unlock(&base_local->lock);
}

/**
 * timer_lock_remote_bases - lock timer bases of cpu
 * @cpu:        Remote CPU
 *
 * Locks the remote timer bases.
 */
void timer_lock_remote_bases(unsigned int cpu)
        __acquires(timer_bases[BASE_LOCAL]->lock)
        __acquires(timer_bases[BASE_GLOBAL]->lock)
{
        struct timer_base *base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
        struct timer_base *base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        /* The plain raw_spin_lock() variants rely on disabled interrupts */
        lockdep_assert_irqs_disabled();

        /* Local base first, global base nested inside of it */
        raw_spin_lock(&base_local->lock);
        raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);
}

/**
 * timer_base_is_idle() - Return whether timer base is set idle
 *
 * Returns the is_idle value of this CPU's local timer base.
 */
bool timer_base_is_idle(void)
{
        /* Only the BASE_LOCAL base of the current CPU is consulted */
        bool idle = __this_cpu_read(timer_bases[BASE_LOCAL].is_idle);

        return idle;
}

static void __run_timer_base(struct timer_base *base);

/**
 * timer_expire_remote() - expire global timers of cpu
 * @cpu:        Remote CPU
 *
 * Expire timers of global base of remote CPU.
 */
void timer_expire_remote(unsigned int cpu)
{
        struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        __run_timer_base(base);
}

/*
 * Hand the first global event in @tevt to the timer migration hierarchy
 * and, when the hierarchy reports an event which is earlier than the
 * local one, update *@nextevt and tevt->local accordingly.
 *
 * Note that @tick_stop_path is only tested for being a non-NULL pointer;
 * the bool it points to is not read here.
 */
static void timer_use_tmigr(unsigned long basej, u64 basem,
                            unsigned long *nextevt, bool *tick_stop_path,
                            bool timer_base_idle, struct timer_events *tevt)
{
        u64 next_tmigr;
        u64 delta_ticks;

        if (timer_base_idle)
                next_tmigr = tmigr_cpu_new_timer(tevt->global);
        else if (tick_stop_path)
                next_tmigr = tmigr_cpu_deactivate(tevt->global);
        else
                next_tmigr = tmigr_quick_check(tevt->global);

        /*
         * When this CPU is the last one going idle in the timer migration
         * hierarchy, it has to wake up in time to handle remote timers.
         * next_tmigr == KTIME_MAX if other CPUs are still active.
         */
        if (next_tmigr >= tevt->local)
                return;

        /* A tick was missed already: force a delta of 0 */
        if (next_tmigr < basem)
                next_tmigr = basem;

        delta_ticks = div_u64(next_tmigr - basem, TICK_NSEC);

        *nextevt = basej + (unsigned long)delta_ticks;
        tevt->local = next_tmigr;
}
# else
static void timer_use_tmigr(unsigned long basej, u64 basem,
                            unsigned long *nextevt, bool *tick_stop_path,
                            bool timer_base_idle, struct timer_events *tevt)
{
        u64 local = tevt->local;
        u64 global = tevt->global;

        /*
         * On !SMP systems the first event, local or global, has to end up
         * in tevt->local so that no timer is missed.
         */
        tevt->local = global < local ? global : local;
}
# endif /* CONFIG_SMP */

/*
 * Common worker for get_next_timer_interrupt() and
 * timer_base_try_to_set_idle().
 *
 * @basej:	base time jiffies
 * @basem:	base time clock monotonic
 * @idle:	NULL, or a pointer which receives base_local->is_idle. Only
 *		with a non-NULL @idle the is_idle state of the bases may be
 *		set here.
 *
 * Takes the local base lock and, nested, the global base lock of the
 * current CPU for the evaluation. Returns the result of
 * cmp_next_hrtimer_event() for the computed local event. On an offline
 * CPU a one-time warning is issued, *@idle (if given) is set to true and
 * the preset KTIME_MAX is returned.
 */
static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
                                             bool *idle)
{
        struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX };
        struct timer_base *base_local, *base_global;
        unsigned long nextevt;
        bool idle_is_possible;

        /*
         * When the CPU is offline, the tick is cancelled and nothing is supposed
         * to try to stop it.
         */
        if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) {
                if (idle)
                        *idle = true;
                return tevt.local;
        }

        base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
        base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);

        /* Lock order: local base first, global base nested inside */
        raw_spin_lock(&base_local->lock);
        raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);

        nextevt = fetch_next_timer_interrupt(basej, basem, base_local,
                                             base_global, &tevt);

        /*
         * If the next event is only one jiffie ahead there is no need to call
         * timer migration hierarchy related functions. The value for the next
         * global timer in @tevt struct equals then KTIME_MAX. This is also
         * true, when the timer base is idle.
         *
         * The proper timer migration hierarchy function depends on the callsite
         * and whether timer base is idle or not. @nextevt will be updated when
         * this CPU needs to handle the first timer migration hierarchy
         * event. See timer_use_tmigr() for detailed information.
         */
        idle_is_possible = time_after(nextevt, basej + 1);
        if (idle_is_possible)
                timer_use_tmigr(basej, basem, &nextevt, idle,
                                base_local->is_idle, &tevt);

        /*
         * We have a fresh next event. Check whether we can forward the
         * base.
         */
        __forward_timer_base(base_local, basej);
        __forward_timer_base(base_global, basej);

        /*
         * Set base->is_idle only when caller is timer_base_try_to_set_idle()
         */
        if (idle) {
                /*
                 * Bases are idle if the next event is more than a tick
                 * away. Caution: @nextevt could have changed by enqueueing a
                 * global timer into timer migration hierarchy. Therefore a new
                 * check is required here.
                 *
                 * If the base is marked idle then any timer add operation must
                 * forward the base clk itself to keep granularity small. This
                 * idle logic is only maintained for the BASE_LOCAL and
                 * BASE_GLOBAL base, deferrable timers may still see large
                 * granularity skew (by design).
                 */
                if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
                        base_local->is_idle = true;
                        /*
                         * Global timers queued locally while running in a task
                         * in nohz_full mode need a self-IPI to kick reprogramming
                         * in IRQ tail.
                         */
                        if (tick_nohz_full_cpu(base_local->cpu))
                                base_global->is_idle = true;
                        trace_timer_base_idle(true, base_local->cpu);
                }
                /* Report the resulting state of the local base to the caller */
                *idle = base_local->is_idle;

                /*
                 * When timer base is not set idle, undo the effect of
                 * tmigr_cpu_deactivate() to prevent inconsistent states - active
                 * timer base but inactive timer migration hierarchy.
                 *
                 * When timer base was already marked idle, nothing will be
                 * changed here.
                 */
                if (!base_local->is_idle && idle_is_possible)
                        tmigr_cpu_activate();
        }

        /* Unlock in the reverse order of locking */
        raw_spin_unlock(&base_global->lock);
        raw_spin_unlock(&base_local->lock);

        return cmp_next_hrtimer_event(basem, tevt.local);
}

/**
 * get_next_timer_interrupt() - return the time (clock mono) of the next timer
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 *
 * Returns the tick aligned clock monotonic time of the next pending timer or
 * KTIME_MAX if no timer is pending. If timer of global base was queued into
 * timer migration hierarchy, first global timer is not taken into account. If
 * it was the last CPU of timer migration hierarchy going idle, first global
 * event is taken into account.
 */
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
        /* A NULL idle pointer: query only, no idle state is reported back */
        bool *no_idle = NULL;

        return __get_next_timer_interrupt(basej, basem, no_idle);
}

/**
 * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 * @idle:        pointer to store the value of timer_base->is_idle on return;
 *                *idle contains the information whether tick was already stopped
 *
 * Returns the tick aligned clock monotonic time of the next pending timer or
 * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is
 * returned as well.
 */
u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
{
        /* With *idle already set there is nothing to evaluate */
        return *idle ? KTIME_MAX :
                       __get_next_timer_interrupt(basej, basem, idle);
}

/**
 * timer_clear_idle - Clear the idle state of the timer base
 *
 * Resets the is_idle flag of this CPU's BASE_LOCAL timer base (and of
 * BASE_GLOBAL as well when this CPU is a nohz_full CPU), emits the
 * timer_base_idle tracepoint and finally calls tmigr_cpu_activate().
 *
 * Called with interrupts disabled
 */
void timer_clear_idle(void)
{
        /*
         * We do this unlocked. The worst outcome is a remote pinned timer
         * enqueue sending a pointless IPI, but taking the lock would just
         * make the window for sending the IPI a few instructions smaller
         * for the cost of taking the lock in the exit from idle
         * path. Required for BASE_LOCAL only.
         */
        __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
        /* BASE_GLOBAL is only touched on nohz_full CPUs. */
        if (tick_nohz_full_cpu(smp_processor_id()))
                __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
        trace_timer_base_idle(false, smp_processor_id());

        /* Activate without holding the timer_base->lock */
        tmigr_cpu_activate();
}
#endif

/**
 * __run_timers - run all expired timers (if any) on this CPU.
 * @base: the timer vector to be processed.
 *
 * Must be called with base->lock held (checked via lockdep). Advances
 * base->clk one step at a time for as long as jiffies has reached both
 * base->clk and base->next_expiry, collecting and expiring the timers
 * found at each step.
 */
static inline void __run_timers(struct timer_base *base)
{
        struct hlist_head heads[LVL_DEPTH];
        int levels;

        lockdep_assert_held(&base->lock);

        /*
         * Nothing to do while base->running_timer is set.
         * NOTE(review): presumably this means expiry is already in progress
         * on this base - confirm against expire_timers().
         */
        if (base->running_timer)
                return;

        while (time_after_eq(jiffies, base->clk) &&
               time_after_eq(jiffies, base->next_expiry)) {
                /* Number of entries in heads[] that were filled in. */
                levels = collect_expired_timers(base, heads);
                /*
                 * The two possible reasons for not finding any expired
                 * timer at this clk are that all matching timers have been
                 * dequeued or no timer has been queued since
                 * base::next_expiry was set to base::clk +
                 * NEXT_TIMER_MAX_DELTA.
                 */
                WARN_ON_ONCE(!levels && !base->next_expiry_recalc
                             && base->timers_pending);
                /*
                 * While executing timers, base->clk is set 1 offset ahead of
                 * jiffies to avoid endless requeuing to current jiffies.
                 */
                base->clk++;
                next_expiry_recalc(base);

                /* Expire the collected lists, last filled entry first. */
                while (levels--)
                        expire_timers(base, heads + levels);
        }
}

/*
 * Expire the pending timers of @base, if any are due.
 *
 * The next_expiry check is done without holding base->lock; only when
 * jiffies has reached it are the expiry lock and base->lock (with
 * interrupts disabled) taken around __run_timers().
 */
static void __run_timer_base(struct timer_base *base)
{
        if (time_after_eq(jiffies, base->next_expiry)) {
                timer_base_lock_expiry(base);
                raw_spin_lock_irq(&base->lock);

                __run_timers(base);

                raw_spin_unlock_irq(&base->lock);
                timer_base_unlock_expiry(base);
        }
}

static void run_timer_base(int index)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[index]);

        __run_timer_base(base);
}

/*
 * TIMER_SOFTIRQ handler: runs the expired timers of this CPU's bases.
 *
 * BASE_LOCAL is always processed. With CONFIG_NO_HZ_COMMON the global and
 * deferrable bases are processed too and, when timers nohz is active,
 * tmigr_handle_remote() is called afterwards.
 */
static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
        run_timer_base(BASE_LOCAL);

        if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
                return;

        run_timer_base(BASE_GLOBAL);
        run_timer_base(BASE_DEF);

        if (is_timers_nohz_active())
                tmigr_handle_remote();
}

/*
 * Called by the local, per-CPU timer interrupt on SMP.
 */
static void run_local_timers(void)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);

        hrtimer_run_queues();

        for (int i = 0; i < NR_BASES; i++, base++) {
                /* Raise the softirq only if required. */
                if (time_after_eq(jiffies, base->next_expiry) ||
                    (i == BASE_DEF && tmigr_requires_handle_remote())) {
                        raise_softirq(TIMER_SOFTIRQ);
                        return;
                }
        }
}

/*
 * Per-tick housekeeping, called from the timer interrupt handler: charges
 * one tick to the current process and drives the local timers, RCU, irq
 * work (when configured and in hard interrupt context), the scheduler tick
 * and POSIX CPU timers (when configured).
 *
 * @user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
        /* Note: this timer irq context must be accounted for as well. */
        account_process_tick(current, user_tick);
        run_local_timers();
        rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
        if (in_irq())
                irq_work_tick();
#endif
        sched_tick();

        if (!IS_ENABLED(CONFIG_POSIX_TIMERS))
                return;

        run_posix_cpu_timers();
}

/*
 * Since schedule_timeout()'s timer is defined on the stack, it must store
 * the target task on the stack as well.
 *
 * @timer: the on-stack timer armed by schedule_timeout()
 * @task:  the task that process_timeout() wakes when @timer fires
 */
struct process_timer {
        struct timer_list timer;
        struct task_struct *task;
};

static void process_timeout(struct timer_list *t)
{
        struct process_timer *timeout = from_timer(timeout, t, timer);

        wake_up_process(timeout->task);
}

/**
 * schedule_timeout - sleep until timeout
 * @timeout: timeout value in jiffies
 *
 * Make the current task sleep until @timeout jiffies have elapsed.
 * The function behavior depends on the current task state
 * (see also set_current_state() description):
 *
 * %TASK_RUNNING - the scheduler is called, but the task does not sleep
 * at all. That happens because sched_submit_work() does nothing for
 * tasks in %TASK_RUNNING state.
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be %TASK_RUNNING when this
 * routine returns.
 *
 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
 * the CPU away without a bound on the timeout. In this case the return
 * value will be %MAX_SCHEDULE_TIMEOUT.
 *
 * Returns 0 when the timer has expired otherwise the remaining time in
 * jiffies will be returned. In all cases the return value is guaranteed
 * to be non-negative.
 */
signed long __sched schedule_timeout(signed long timeout)
{
        struct process_timer timer;
        unsigned long expire;

        /*
         * Unbounded sleep: no timer is armed at all. This special value
         * exists purely for the caller's convenience; the result is still
         * passed through the common non-negative clamp below.
         */
        if (timeout == MAX_SCHEDULE_TIMEOUT) {
                schedule();
                goto out;
        }

        /*
         * Paranoia: a negative timeout should never happen. Callers are not
         * expected to check for a negative return value, so complain loudly
         * (message plus stack dump show what went wrong and where), put the
         * task back into TASK_RUNNING and let the clamp below return 0.
         */
        if (timeout < 0) {
                printk(KERN_ERR "schedule_timeout: wrong timeout "
                        "value %lx\n", timeout);
                dump_stack();
                __set_current_state(TASK_RUNNING);
                goto out;
        }

        expire = timeout + jiffies;

        /* Arm an on-stack timer whose callback wakes up the current task. */
        timer.task = current;
        timer_setup_on_stack(&timer.timer, process_timeout, 0);
        __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);

        schedule();

        del_timer_sync(&timer.timer);
        /* Remove the timer from the object tracker */
        destroy_timer_on_stack(&timer.timer);

        /* Jiffies left until the programmed expiry; negative once passed. */
        timeout = expire - jiffies;

out:
        return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);

/*
 * Sleep for up to @timeout jiffies in TASK_INTERRUPTIBLE state; returns
 * whatever schedule_timeout() returns.
 *
 * We can use __set_current_state() here because schedule_timeout() calls
 * schedule() unconditionally.
 */
signed long __sched schedule_timeout_interruptible(signed long timeout)
{
        __set_current_state(TASK_INTERRUPTIBLE);
        return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_interruptible);

/*
 * Sleep for up to @timeout jiffies in TASK_KILLABLE state; returns whatever
 * schedule_timeout() returns.
 */
signed long __sched schedule_timeout_killable(signed long timeout)
{
        __set_current_state(TASK_KILLABLE);
        return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_killable);

/*
 * Sleep for up to @timeout jiffies in TASK_UNINTERRUPTIBLE state; returns
 * whatever schedule_timeout() returns.
 */
signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
        __set_current_state(TASK_UNINTERRUPTIBLE);
        return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);

/*
 * Like schedule_timeout_uninterruptible(), except this task will not contribute
 * to load average.
 *
 * Sets the task state to TASK_IDLE and returns whatever schedule_timeout()
 * returns.
 */
signed long __sched schedule_timeout_idle(signed long timeout)
{
        __set_current_state(TASK_IDLE);
        return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_idle);

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Drain the hlist @head: every timer on it is detached, has the base bits
 * of its flags replaced by the CPU number of @new_base and is then enqueued
 * on @new_base.
 */
static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
        const int target_cpu = new_base->cpu;

        while (!hlist_empty(head)) {
                struct timer_list *tmr;

                tmr = hlist_entry(head->first, struct timer_list, entry);
                detach_timer(tmr, false);

                /* Retarget the timer: clear the old base bits, set the new CPU. */
                tmr->flags &= ~TIMER_BASEMASK;
                tmr->flags |= target_cpu;

                internal_add_timer(new_base, tmr);
        }
}

/*
 * Reset all timer bases of @cpu: the base clock is set to the current
 * jiffies, next_expiry to clk + NEXT_TIMER_MAX_DELTA, and the pending,
 * recalc and idle flags are cleared. Always returns 0.
 */
int timers_prepare_cpu(unsigned int cpu)
{
        for (int idx = 0; idx < NR_BASES; idx++) {
                struct timer_base *tb = per_cpu_ptr(&timer_bases[idx], cpu);

                tb->clk = jiffies;
                tb->next_expiry = tb->clk + NEXT_TIMER_MAX_DELTA;

                tb->next_expiry_recalc = false;
                tb->timers_pending = false;
                tb->is_idle = false;
        }

        return 0;
}

/*
 * Move all timers queued on the bases of the dead CPU @cpu over to the
 * matching bases of the CPU this function runs on.
 *
 * For each base index the local base lock is taken first (interrupts
 * disabled), then the dead CPU's base lock nested inside it; the locks are
 * dropped in reverse order. Always returns 0.
 */
int timers_dead_cpu(unsigned int cpu)
{
        struct timer_base *old_base;
        struct timer_base *new_base;
        int b, i;

        for (b = 0; b < NR_BASES; b++) {
                old_base = per_cpu_ptr(&timer_bases[b], cpu);
                /* Paired with put_cpu_ptr() at the end of the iteration. */
                new_base = get_cpu_ptr(&timer_bases[b]);
                /*
                 * The caller is globally serialized and nobody else
                 * takes two locks at once, deadlock is not possible.
                 */
                raw_spin_lock_irq(&new_base->lock);
                raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

                /*
                 * The current CPUs base clock might be stale. Update it
                 * before moving the timers over.
                 */
                forward_timer_base(new_base);

                /* No timer is expected to be running on the dead CPU's base. */
                WARN_ON_ONCE(old_base->running_timer);
                old_base->running_timer = NULL;

                /* Drain every wheel bucket of the old base into the new one. */
                for (i = 0; i < WHEEL_SIZE; i++)
                        migrate_timer_list(new_base, old_base->vectors + i);

                raw_spin_unlock(&old_base->lock);
                raw_spin_unlock_irq(&new_base->lock);
                put_cpu_ptr(&timer_bases);
        }
        return 0;
}

#endif /* CONFIG_HOTPLUG_CPU */

/*
 * Boot-time initialisation of all timer bases of @cpu: records the owning
 * CPU, initialises the base lock and the expiry lock, and starts the base
 * clock at the current jiffies with next_expiry NEXT_TIMER_MAX_DELTA ahead.
 */
static void __init init_timer_cpu(int cpu)
{
        for (int idx = 0; idx < NR_BASES; idx++) {
                struct timer_base *tb = per_cpu_ptr(&timer_bases[idx], cpu);

                tb->cpu = cpu;
                raw_spin_lock_init(&tb->lock);

                tb->clk = jiffies;
                tb->next_expiry = tb->clk + NEXT_TIMER_MAX_DELTA;

                timer_base_init_expiry_lock(tb);
        }
}

/* Initialise the timer bases of every possible CPU. */
static void __init init_timer_cpus(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                init_timer_cpu(cpu);
}

/*
 * Boot-time setup of the timer subsystem: initialises the per-CPU timer
 * bases, calls posix_cputimers_init_work() and registers
 * run_timer_softirq() as the TIMER_SOFTIRQ handler.
 */
void __init init_timers(void)
{
        init_timer_cpus();
        posix_cputimers_init_work();
        open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}

/**
 * msleep - sleep safely even with waitqueue interruptions
 * @msecs: Time in milliseconds to sleep for
 *
 * Sleeps uninterruptibly for @msecs converted to jiffies plus one extra
 * jiffy, going back to sleep for the remaining time whenever
 * schedule_timeout_uninterruptible() returns early.
 */
void msleep(unsigned int msecs)
{
        unsigned long remaining = msecs_to_jiffies(msecs) + 1;

        while (remaining != 0)
                remaining = schedule_timeout_uninterruptible(remaining);
}

EXPORT_SYMBOL(msleep);

/**
 * msleep_interruptible - sleep waiting for signals
 * @msecs: Time in milliseconds to sleep for
 *
 * Sleeps interruptibly for @msecs converted to jiffies plus one extra jiffy.
 * The sleep ends early as soon as a signal is pending for the current task.
 *
 * Returns the time that was left to sleep, converted back to milliseconds
 * (0 when the full time has elapsed).
 */
unsigned long msleep_interruptible(unsigned int msecs)
{
        unsigned long remaining = msecs_to_jiffies(msecs) + 1;

        while (remaining) {
                if (signal_pending(current))
                        break;
                remaining = schedule_timeout_interruptible(remaining);
        }

        return jiffies_to_msecs(remaining);
}

EXPORT_SYMBOL(msleep_interruptible);

/**
 * usleep_range_state - Sleep for an approximate time in a given state
 * @min:        Minimum time in usecs to sleep
 * @max:        Maximum time in usecs to sleep
 * @state:        State of the current task that will be while sleeping
 *
 * In non-atomic context where the exact wakeup time is flexible, use
 * usleep_range_state() instead of udelay().  The sleep improves responsiveness
 * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
 * power usage by allowing hrtimers to take advantage of an already-
 * scheduled interrupt instead of scheduling a new one just for this sleep.
 *
 * The absolute expiry is "now + @min"; the slack handed to the hrtimer is
 * "@max - @min". A reversed range (@max < @min) is treated as having no
 * slack at all, i.e. the sleep still lasts at least @min usecs.
 */
void __sched usleep_range_state(unsigned long min, unsigned long max,
                                unsigned int state)
{
        ktime_t exp = ktime_add_us(ktime_get(), min);
        /*
         * Both bounds are unsigned, so "max - min" would wrap around to an
         * enormous slack when the caller passes max < min. Clamp the slack
         * to zero in that case instead of sleeping (almost) unbounded.
         */
        u64 delta = max > min ? (u64)(max - min) * NSEC_PER_USEC : 0;

        for (;;) {
                /* The state has to be set again before every sleep attempt. */
                __set_current_state(state);
                /* Do not return before the requested sleep time has elapsed */
                if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
                        break;
        }
}
EXPORT_SYMBOL(usleep_range_state);




























































































































































    5 
    7 
























































































    1 




    1 




    1 
















































































































































    2 


















































































































































    7 

    1 


















    8 

    8 
































    8 

















    7 

    8 






















   12 



    7 


    7 





    7 
    2 



   13 

   11 



   11 












    7 
    5 


























































    1 
































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
// SPDX-License-Identifier: GPL-2.0
/*
 * buffered writeback throttling. loosely based on CoDel. We can't drop
 * packets for IO scheduling, so the logic is something like this:
 *
 * - Monitor latencies in a defined window of time.
 * - If the minimum latency in the above window exceeds some target, increment
 *   scaling step and scale down queue depth by a factor of 2x. The monitoring
 *   window is then shrunk to 100 / sqrt(scaling step + 1).
 * - For any window where we don't have solid data on what the latencies
 *   look like, retain status quo.
 * - If latencies look good, decrement scaling step.
 * - If we're only doing writes, allow the scaling step to go negative. This
 *   will temporarily boost write performance, snapping back to a stable
 *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
 *   positive scaling steps where we shrink the monitoring window, a negative
 *   scaling step retains the default step==0 window size.
 *
 * Copyright (C) 2016 Jens Axboe
 *
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>

#include "blk-stat.h"
#include "blk-wbt.h"
#include "blk-rq-qos.h"
#include "elevator.h"
#include "blk.h"

#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>

/*
 * Per-request accounting flags, kept in rq->wbt_flags. The values are bit
 * masks and are tested with '&'.
 */
enum wbt_flags {
        WBT_TRACKED                = 1,        /* write, tracked for throttling */
        WBT_READ                = 2,        /* read */
        WBT_KSWAPD                = 4,        /* write, from kswapd */
        WBT_DISCARD                = 8,        /* discard */

        WBT_NR_BITS                = 4,        /* number of bits */
};

/*
 * Indices into rq_wb::rq_wait[]: one wait queue per class of throttled IO.
 * get_rq_wait() maps WBT_KSWAPD and WBT_DISCARD requests to their own
 * queue; everything else uses WBT_RWQ_BG.
 */
enum {
        WBT_RWQ_BG                = 0,
        WBT_RWQ_KSWAPD,
        WBT_RWQ_DISCARD,
        WBT_NUM_RWQ,        /* number of wait queues, sizes rq_wait[] */
};

/*
 * Values of rq_wb::enable_state.
 *
 * If the current state is WBT_STATE_ON/OFF_DEFAULT, it can be changed to any
 * other state; if the current state is WBT_STATE_ON/OFF_MANUAL, it can only
 * be changed to WBT_STATE_OFF/ON_MANUAL.
 */
enum {
        WBT_STATE_ON_DEFAULT        = 1,        /* on by default */
        WBT_STATE_ON_MANUAL        = 2,        /* on manually by sysfs */
        WBT_STATE_OFF_DEFAULT        = 3,        /* off by default */
        WBT_STATE_OFF_MANUAL        = 4,        /* off manually by sysfs */
};

/*
 * Per-queue writeback throttling state. The structure embeds the generic
 * struct rq_qos (see RQWB() for the reverse mapping).
 */
struct rq_wb {
        /*
         * Settings that govern how we throttle
         */
        unsigned int wb_background;                /* background writeback */
        unsigned int wb_normal;                        /* normal writeback */

        short enable_state;                        /* WBT_STATE_* */

        /*
         * Number of consecutive periods where we don't have enough
         * information to make a firm scale up/down decision.
         */
        unsigned int unknown_cnt;

        u64 win_nsec;                                /* default window size */
        u64 cur_win_nsec;                        /* current window size */

        /* Stat callback; re-armed with cur_win_nsec by rwb_arm_timer(). */
        struct blk_stat_callback *cb;

        /*
         * Issue time (compared against blk_time_get_ns()) and identity of
         * the sync request currently being timed; both are cleared by
         * wbt_done() when that request completes.
         */
        u64 sync_issue;
        void *sync_cookie;

        unsigned long last_issue;                /* last non-throttled issue */
        unsigned long last_comp;                /* last non-throttled comp */
        /* Latency target in nsec; 0 makes calc_wb_limits() zero the limits. */
        unsigned long min_lat_nsec;
        struct rq_qos rqos;                        /* embedded, see RQWB() */
        struct rq_wait rq_wait[WBT_NUM_RWQ];        /* indexed by WBT_RWQ_* */
        struct rq_depth rq_depth;                /* depth scaling state */
};

/* Map an embedded struct rq_qos back to the struct rq_wb that contains it. */
static inline struct rq_wb *RQWB(struct rq_qos *rqos)
{
        return container_of(rqos, struct rq_wb, rqos);
}

/* Drop all wbt accounting flags from @rq. */
static inline void wbt_clear_state(struct request *rq)
{
        rq->wbt_flags = 0;
}

/* Return the wbt accounting flags (enum wbt_flags bits) stored in @rq. */
static inline enum wbt_flags wbt_flags(struct request *rq)
{
        return rq->wbt_flags;
}

/* True when @rq carries the WBT_TRACKED flag, i.e. is tracked for throttling. */
static inline bool wbt_is_tracked(struct request *rq)
{
        return (rq->wbt_flags & WBT_TRACKED) != 0;
}

/* True when @rq carries the WBT_READ flag. */
static inline bool wbt_is_read(struct request *rq)
{
        return (rq->wbt_flags & WBT_READ) != 0;
}

/* Tunables of the throttling algorithm. */
enum {
        /*
         * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
         * from here depending on device stats
         */
        RWB_DEF_DEPTH        = 16,

        /*
         * 100msec window
         */
        RWB_WINDOW_NSEC                = 100 * 1000 * 1000ULL,

        /*
         * Disregard stats, if we don't meet this minimum
         * (see stat_sample_valid())
         */
        RWB_MIN_WRITE_SAMPLES        = 3,

        /*
         * If we have this number of consecutive windows with not enough
         * information to scale up or down, wb_timer_fn() steps back towards
         * the center state (scale_step == 0).
         */
        RWB_UNKNOWN_BUMP        = 5,
};

static inline bool rwb_enabled(struct rq_wb *rwb)
{
        return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT &&
                      rwb->enable_state != WBT_STATE_OFF_MANUAL;
}

/*
 * Record the current jiffies in *@var, but only while throttling is enabled
 * for @rwb. The store is skipped when the value would not change.
 */
static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
        unsigned long now;

        if (!rwb_enabled(rwb))
                return;

        now = jiffies;
        if (*var != now)
                *var = now;
}

/*
 * Returns true while we are still within HZ jiffies (one second) of the
 * backing device's last_bdp_sleep timestamp, i.e. a task was rate throttled
 * in balance_dirty_pages() recently. Callers use that as an indication of a
 * higher cleaning rate.
 */
static bool wb_recent_wait(struct rq_wb *rwb)
{
        return time_before(jiffies,
                           rwb->rqos.disk->bdi->last_bdp_sleep + HZ);
}

/*
 * Pick the wait queue matching the accounting flags in @wb_acct: kswapd
 * writes take precedence over discards, everything else uses the background
 * queue.
 */
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
                                          enum wbt_flags wb_acct)
{
        int idx = WBT_RWQ_BG;

        if (wb_acct & WBT_KSWAPD)
                idx = WBT_RWQ_KSWAPD;
        else if (wb_acct & WBT_DISCARD)
                idx = WBT_RWQ_DISCARD;

        return &rwb->rq_wait[idx];
}

static void rwb_wake_all(struct rq_wb *rwb)
{
        int i;

        for (i = 0; i < WBT_NUM_RWQ; i++) {
                struct rq_wait *rqw = &rwb->rq_wait[i];

                if (wq_has_sleeper(&rqw->wait))
                        wake_up_all(&rqw->wait);
        }
}

/*
 * Account the completion of one tracked request on @rqw: drop its inflight
 * count and wake the queue's sleepers once the number of inflight requests
 * is far enough below the limit that applies to @wb_acct.
 */
static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
                         enum wbt_flags wb_acct)
{
        int inflight, limit;

        inflight = atomic_dec_return(&rqw->inflight);

        /*
         * For discards, our limit is always the background. For writes, if
         * the device does write back caching, drop further down before we
         * wake people up.
         */
        if (wb_acct & WBT_DISCARD)
                limit = rwb->wb_background;
        else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) &&
                 !wb_recent_wait(rwb))
                limit = 0;
        else
                limit = rwb->wb_normal;

        /*
         * Don't wake anyone up if we are above the normal limit.
         */
        if (inflight && inflight >= limit)
                return;

        if (wq_has_sleeper(&rqw->wait)) {
                int diff = limit - inflight;

                /*
                 * Wake when nothing is inflight any more, or when the
                 * headroom below the limit is at least half of the
                 * background depth.
                 */
                if (!inflight || diff >= rwb->wb_background / 2)
                        wake_up_all(&rqw->wait);
        }
}

/*
 * Completion accounting for a request with accounting flags @wb_acct.
 * Requests without WBT_TRACKED are ignored; tracked ones are accounted on
 * the wait queue that matches their flags.
 */
static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
{
        if (wb_acct & WBT_TRACKED) {
                struct rq_wb *rwb = RQWB(rqos);

                wbt_rqw_done(rwb, get_rq_wait(rwb, wb_acct), wb_acct);
        }
}

/*
 * Called on completion of a request. Note that it's also called when
 * a request is merged, when the request gets freed.
 *
 * Tracked requests go through the inflight accounting. For untracked ones
 * the sync-issue bookkeeping is reset if @rq is the request being timed,
 * and reads update the last-completion timestamp. In all cases the
 * request's wbt flags are cleared afterwards.
 */
static void wbt_done(struct rq_qos *rqos, struct request *rq)
{
        struct rq_wb *rwb = RQWB(rqos);

        if (wbt_is_tracked(rq)) {
                /* A tracked request is never expected to be the sync cookie. */
                WARN_ON_ONCE(rq == rwb->sync_cookie);
                __wbt_done(rqos, wbt_flags(rq));
        } else {
                if (rwb->sync_cookie == rq) {
                        rwb->sync_issue = 0;
                        rwb->sync_cookie = NULL;
                }

                if (wbt_is_read(rq))
                        wb_timestamp(rwb, &rwb->last_comp);
        }

        wbt_clear_state(rq);
}

static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
        /*
         * We need at least one read sample, and a minimum of
         * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
         * that it's writes impacting us, and not just some sole read on
         * a device that is in a lower power state.
         */
        return (stat[READ].nr_samples >= 1 &&
                stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}

/*
 * Time in nsec since the currently timed sync request was issued, or 0 when
 * no issue time or no sync cookie is recorded.
 */
static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
        const u64 issued_at = READ_ONCE(rwb->sync_issue);

        if (issued_at && rwb->sync_cookie)
                return blk_time_get_ns() - issued_at;

        return 0;
}

/* Sum of the inflight counters of all of @rwb's wait queues. */
static inline unsigned int wbt_inflight(struct rq_wb *rwb)
{
        unsigned int total = 0;
        unsigned int q = 0;

        while (q < WBT_NUM_RWQ) {
                total += atomic_read(&rwb->rq_wait[q].inflight);
                q++;
        }

        return total;
}

/* Verdicts returned by latency_exceeded() and acted upon by wb_timer_fn(). */
enum {
        LAT_OK = 1,                /* valid samples, read min latency within target */
        LAT_UNKNOWN,                /* no valid sample and no sign of write activity */
        LAT_UNKNOWN_WRITES,        /* no valid sample, but writes are going on */
        LAT_EXCEEDED,                /* latency target violated */
};

/*
 * Classify the stat window @stat for @rwb.
 *
 * Returns one of the LAT_* values: LAT_EXCEEDED when the timed sync request
 * or the minimum read latency violates the target, LAT_UNKNOWN_WRITES or
 * LAT_UNKNOWN when the window has no valid read/write sample mix, and
 * LAT_OK otherwise.
 */
static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
        struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
        struct rq_depth *rqd = &rwb->rq_depth;
        u64 thislat;

        /*
         * If our stored sync issue exceeds the window size, or it
         * exceeds our min target AND we haven't logged any entries,
         * flag the latency as exceeded. wbt works off completion latencies,
         * but for a flooded device, a single sync IO can take a long time
         * to complete after being issued. If this time exceeds our
         * monitoring window AND we didn't see any other completions in that
         * window, then count that sync IO as a violation of the latency.
         */
        thislat = rwb_sync_issue_lat(rwb);
        if (thislat > rwb->cur_win_nsec ||
            (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
                trace_wbt_lat(bdi, thislat);
                return LAT_EXCEEDED;
        }

        /*
         * No read/write mix, if stat isn't valid
         */
        if (!stat_sample_valid(stat)) {
                /*
                 * If we had writes in this stat window and the window is
                 * current, we're only doing writes. If a task recently
                 * waited or still has writes in flights, consider us doing
                 * just writes as well.
                 */
                if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
                    wbt_inflight(rwb))
                        return LAT_UNKNOWN_WRITES;
                return LAT_UNKNOWN;
        }

        /*
         * If the 'min' latency exceeds our target, step down.
         */
        if (stat[READ].min > rwb->min_lat_nsec) {
                trace_wbt_lat(bdi, stat[READ].min);
                trace_wbt_stat(bdi, stat);
                return LAT_EXCEEDED;
        }

        /* Only trace the stats while we are away from the center step. */
        if (rqd->scale_step)
                trace_wbt_stat(bdi, stat);

        return LAT_OK;
}

/*
 * Emit the wbt_step tracepoint with @msg and the current scaling state
 * (scale step, window size, background/normal limits, max depth).
 */
static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
        trace_wbt_step(rwb->rqos.disk->bdi, msg, rwb->rq_depth.scale_step,
                        rwb->cur_win_nsec, rwb->wb_background,
                        rwb->wb_normal, rwb->rq_depth.max_depth);
}

static void calc_wb_limits(struct rq_wb *rwb)
{
        if (rwb->min_lat_nsec == 0) {
                rwb->wb_normal = rwb->wb_background = 0;
        } else if (rwb->rq_depth.max_depth <= 2) {
                rwb->wb_normal = rwb->rq_depth.max_depth;
                rwb->wb_background = 1;
        } else {
                rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
                rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
        }
}

/*
 * Try to scale the queue depth up by one step. Only when
 * rq_depth_scale_up() reports a change are the limits recalculated, the
 * unknown-window counter reset, all waiters woken and the step traced.
 */
static void scale_up(struct rq_wb *rwb)
{
        if (rq_depth_scale_up(&rwb->rq_depth)) {
                calc_wb_limits(rwb);
                rwb->unknown_cnt = 0;
                rwb_wake_all(rwb);
                rwb_trace_step(rwb, tracepoint_string("scale up"));
        }
}

/*
 * Try to scale the queue depth down by one step; @hard_throttle is passed
 * through to rq_depth_scale_down(). Only when that reports a change are the
 * limits recalculated, the unknown-window counter reset and the step traced.
 */
static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
        if (rq_depth_scale_down(&rwb->rq_depth, hard_throttle)) {
                calc_wb_limits(rwb);
                rwb->unknown_cnt = 0;
                rwb_trace_step(rwb, tracepoint_string("scale down"));
        }
}

static void rwb_arm_timer(struct rq_wb *rwb)
{
        struct rq_depth *rqd = &rwb->rq_depth;

        if (rqd->scale_step > 0) {
                /*
                 * We should speed this up, using some variant of a fast
                 * integer inverse square root calculation. Since we only do
                 * this for every window expiration, it's not a huge deal,
                 * though.
                 */
                rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
                                        int_sqrt((rqd->scale_step + 1) << 8));
        } else {
                /*
                 * For step < 0, we don't want to increase/decrease the
                 * window size.
                 */
                rwb->cur_win_nsec = rwb->win_nsec;
        }

        blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}

/*
 * Stats window callback: classify the latency seen in the window that just
 * ended and step the throttling depth accordingly, then re-arm if needed.
 */
static void wb_timer_fn(struct blk_stat_callback *cb)
{
        struct rq_wb *rwb = cb->data;
        struct rq_depth *rqd = &rwb->rq_depth;
        unsigned int nr_inflight = wbt_inflight(rwb);
        int status;

        /* Nothing to do without an attached disk. */
        if (!rwb->rqos.disk)
                return;

        status = latency_exceeded(rwb, cb->stat);

        trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step,
                        nr_inflight);

        switch (status) {
        case LAT_EXCEEDED:
                /* Latency target missed: throttle harder. */
                scale_down(rwb, true);
                break;
        case LAT_OK:
        case LAT_UNKNOWN_WRITES:
                /*
                 * Either the target was met, or there is no valid
                 * read/write sample while writes are going on. In the
                 * latter case the step may go negative to improve write
                 * performance.
                 */
                scale_up(rwb);
                break;
        case LAT_UNKNOWN:
                /*
                 * No valid sample. Once RWB_UNKNOWN_BUMP such windows have
                 * accumulated, drift one step back toward the center
                 * state (step == 0) from whichever side we are on.
                 */
                if (++rwb->unknown_cnt >= RWB_UNKNOWN_BUMP) {
                        if (rqd->scale_step > 0)
                                scale_up(rwb);
                        else if (rqd->scale_step < 0)
                                scale_down(rwb, false);
                }
                break;
        default:
                break;
        }

        /* Keep the timer going while IO is in flight or we are off-center. */
        if (nr_inflight || rqd->scale_step)
                rwb_arm_timer(rwb);
}

/*
 * Reset scaling to the center step, recompute the maximum depth and the
 * derived writeback limits, then wake all waiters.
 */
static void wbt_update_limits(struct rq_wb *rwb)
{
        rwb->rq_depth.scale_step = 0;
        rwb->rq_depth.scaled_max = false;

        rq_depth_calc_max_depth(&rwb->rq_depth);
        calc_wb_limits(rwb);

        rwb_wake_all(rwb);
}

bool wbt_disabled(struct request_queue *q)
{
        struct rq_qos *rqos = wbt_rq_qos(q);

        return !rqos || !rwb_enabled(RQWB(rqos));
}

/* Return the queue's min_lat_nsec, or 0 when no wbt rq_qos is attached. */
u64 wbt_get_min_lat(struct request_queue *q)
{
        struct rq_qos *rqos = wbt_rq_qos(q);

        return rqos ? RQWB(rqos)->min_lat_nsec : 0;
}

/*
 * Set min_lat_nsec for the queue. A non-zero value marks wbt as manually
 * enabled, zero as manually disabled; the limits are then recomputed.
 * Does nothing when no wbt rq_qos is attached.
 */
void wbt_set_min_lat(struct request_queue *q, u64 val)
{
        struct rq_qos *rqos = wbt_rq_qos(q);
        struct rq_wb *rwb;

        if (!rqos)
                return;

        rwb = RQWB(rqos);
        rwb->min_lat_nsec = val;
        rwb->enable_state = val ? WBT_STATE_ON_MANUAL : WBT_STATE_OFF_MANUAL;

        wbt_update_limits(rwb);
}


static bool close_io(struct rq_wb *rwb)
{
        const unsigned long now = jiffies;

        return time_before(now, rwb->last_issue + HZ / 10) ||
                time_before(now, rwb->last_comp + HZ / 10);
}

/* Request flags that qualify a write for the full depth in get_limit(). */
#define REQ_HIPRIO        (REQ_SYNC | REQ_META | REQ_PRIO)

/*
 * Select the inflight limit that applies to a request with flags @opf.
 *
 * Discards always get the background limit. Otherwise:
 *  - a REQ_HIPRIO flag, a recent wait (wb_recent_wait()) or kswapd as the
 *    submitter gets the full max_depth;
 *  - REQ_BACKGROUND, or other IO issued/completed within the last 100ms
 *    (close_io()), gets the background limit;
 *  - everything else gets the normal limit.
 */
static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
{
        if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
                return rwb->wb_background;

        if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
                return rwb->rq_depth.max_depth;

        if ((opf & REQ_BACKGROUND) || close_io(rwb))
                return rwb->wb_background;

        return rwb->wb_normal;
}

/*
 * Context that __wbt_wait() hands to rq_qos_wait(); read back by the
 * wbt_inflight_cb() and wbt_cleanup_cb() callbacks.
 */
struct wbt_wait_data {
        struct rq_wb *rwb;              /* throttling state to check against */
        enum wbt_flags wb_acct;         /* accounting flags, passed to wbt_rqw_done() */
        blk_opf_t opf;                  /* request op and flags, passed to get_limit() */
};

/*
 * rq_qos_wait() acquire callback: attempt to take an inflight slot on @rqw
 * under the limit that applies to the request described by @private_data.
 */
static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
{
        struct wbt_wait_data *wd = private_data;
        unsigned int limit = get_limit(wd->rwb, wd->opf);

        return rq_wait_inc_below(rqw, limit);
}

/* rq_qos_wait() cleanup callback: forward to wbt_rqw_done() for @rqw. */
static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
        struct wbt_wait_data *wd = private_data;

        wbt_rqw_done(wd->rwb, rqw, wd->wb_acct);
}

/*
 * Wait, via rq_qos_wait(), on the rq_wait selected by @wb_acct until
 * wbt_inflight_cb() manages to take a slot under the applicable limit.
 */
static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
                       blk_opf_t opf)
{
        struct wbt_wait_data wait_ctx = {
                .opf = opf,
                .wb_acct = wb_acct,
                .rwb = rwb,
        };

        rq_qos_wait(get_rq_wait(rwb, wb_acct), &wait_ctx, wbt_inflight_cb,
                    wbt_cleanup_cb);
}

/*
 * Decide whether @bio is subject to throttling: discards always are, writes
 * are unless they carry both REQ_SYNC and REQ_IDLE (the WRITE_ODIRECT
 * combination), and every other operation is not.
 */
static inline bool wbt_should_throttle(struct bio *bio)
{
        if (bio_op(bio) == REQ_OP_DISCARD)
                return true;

        if (bio_op(bio) != REQ_OP_WRITE)
                return false;

        /* Don't throttle WRITE_ODIRECT */
        return (bio->bi_opf & (REQ_SYNC | REQ_IDLE)) !=
               (REQ_SYNC | REQ_IDLE);
}

/*
 * Translate @bio into wbt accounting flags: 0 when wbt is not enabled or
 * the bio is neither a read nor throttled, WBT_READ for reads, and
 * WBT_TRACKED (plus WBT_KSWAPD / WBT_DISCARD as applicable) for throttled
 * writes and discards.
 */
static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
{
        enum wbt_flags flags;

        if (!rwb_enabled(rwb))
                return 0;

        if (bio_op(bio) == REQ_OP_READ)
                return WBT_READ;

        if (!wbt_should_throttle(bio))
                return 0;

        flags = WBT_TRACKED;
        if (current_is_kswapd())
                flags |= WBT_KSWAPD;
        if (bio_op(bio) == REQ_OP_DISCARD)
                flags |= WBT_DISCARD;

        return flags;
}

/*
 * rq_qos ->cleanup hook: recompute the flags @bio maps to and hand them to
 * __wbt_done().
 */
static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
{
        __wbt_done(rqos, bio_to_wbt_flags(RQWB(rqos), bio));
}

/*
 * May sleep, if we have exceeded the writeback limits. Caller can pass
 * in an irq held spinlock, if it holds one when calling this function.
 * If we do sleep, we'll release and re-grab it.
 */
static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
{
        struct rq_wb *rwb = RQWB(rqos);
        enum wbt_flags flags;

        flags = bio_to_wbt_flags(rwb, bio);
        if (!(flags & WBT_TRACKED)) {
                if (flags & WBT_READ)
                        wb_timestamp(rwb, &rwb->last_issue);
                return;
        }

        __wbt_wait(rwb, flags, bio->bi_opf);

        if (!blk_stat_is_active(rwb->cb))
                rwb_arm_timer(rwb);
}

/* rq_qos ->track hook: OR the flags @bio maps to into @rq->wbt_flags. */
static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
        rq->wbt_flags |= bio_to_wbt_flags(RQWB(rqos), bio);
}

/*
 * rq_qos ->issue hook.
 *
 * Remember the first read issued while no sync issue is being tracked, so
 * that a read which takes a long time to complete can be reacted to
 * sooner. This is only a hint: the request may go away once it completes,
 * so the stored pointer is a cookie to compare against and must never be
 * dereferenced. That is also why the issue time is copied out here.
 */
static void wbt_issue(struct rq_qos *rqos, struct request *rq)
{
        struct rq_wb *rwb = RQWB(rqos);

        if (!rwb_enabled(rwb))
                return;

        if (!wbt_is_read(rq) || rwb->sync_issue)
                return;

        rwb->sync_cookie = rq;
        rwb->sync_issue = rq->io_start_time_ns;
}

/*
 * rq_qos ->requeue hook: if @rq is the request currently tracked as the
 * sync issue, drop that tracking.
 */
static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
{
        struct rq_wb *rwb = RQWB(rqos);

        if (!rwb_enabled(rwb) || rq != rwb->sync_cookie)
                return;

        rwb->sync_issue = 0;
        rwb->sync_cookie = NULL;
}

/*
 * Enable wbt if defaults are configured that way
 */
void wbt_enable_default(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct rq_qos *rqos;
        bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);

        if (q->elevator &&
            test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags))
                enable = false;

        /* Throttling already enabled? */
        rqos = wbt_rq_qos(q);
        if (rqos) {
                if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
                        RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
                return;
        }

        /* Queue not registered? Maybe shutting down... */
        if (!blk_queue_registered(q))
                return;

        if (queue_is_mq(q) && enable)
                wbt_init(disk);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);

/*
 * Default latency target in nanoseconds: 2msec for non-rotational storage,
 * 75msec for rotational storage.
 */
u64 wbt_default_latency_nsec(struct request_queue *q)
{
        return blk_queue_nonrot(q) ? 2000000ULL : 75000000ULL;
}

/*
 * Map @rq to a stats bucket: READ for reads, WRITE for any write-type
 * operation, and -1 for everything else (not accounted).
 */
static int wbt_data_dir(const struct request *rq)
{
        const enum req_op op = req_op(rq);

        if (op == REQ_OP_READ)
                return READ;

        return op_is_write(op) ? WRITE : -1;
}

static void wbt_queue_depth_changed(struct rq_qos *rqos)
{
        RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue);
        wbt_update_limits(RQWB(rqos));
}

/*
 * rq_qos ->exit hook: unhook and free the stats callback, then free the
 * rq_wb itself.
 */
static void wbt_exit(struct rq_qos *rqos)
{
        struct rq_wb *rwb = RQWB(rqos);
        struct blk_stat_callback *cb = rwb->cb;

        blk_stat_remove_callback(rqos->disk->queue, cb);
        blk_stat_free_callback(cb);
        kfree(rwb);
}

/*
 * Disable wbt on @disk, but only if it is currently on by default; any
 * other enable state is left untouched.
 */
void wbt_disable_default(struct gendisk *disk)
{
        struct rq_qos *rqos = wbt_rq_qos(disk->queue);
        struct rq_wb *rwb;

        if (!rqos)
                return;

        rwb = RQWB(rqos);
        if (rwb->enable_state != WBT_STATE_ON_DEFAULT)
                return;

        blk_stat_deactivate(rwb->cb);
        rwb->enable_state = WBT_STATE_OFF_DEFAULT;
}
EXPORT_SYMBOL_GPL(wbt_disable_default);

#ifdef CONFIG_BLK_DEBUG_FS
/* debugfs: print the current monitoring window length (cur_win_nsec). */
static int wbt_curr_win_nsec_show(void *data, struct seq_file *m)
{
        struct rq_wb *rwb = RQWB(data);

        seq_printf(m, "%llu\n", rwb->cur_win_nsec);
        return 0;
}

/* debugfs: print the raw enable_state value. */
static int wbt_enabled_show(void *data, struct seq_file *m)
{
        struct rq_wb *rwb = RQWB(data);

        seq_printf(m, "%d\n", rwb->enable_state);
        return 0;
}

/* debugfs: print the rq_qos id. */
static int wbt_id_show(void *data, struct seq_file *m)
{
        struct rq_qos *qos = data;

        seq_printf(m, "%u\n", qos->id);
        return 0;
}

/* debugfs: print the inflight count of each of the WBT_NUM_RWQ wait queues. */
static int wbt_inflight_show(void *data, struct seq_file *m)
{
        struct rq_wb *rwb = RQWB(data);
        int idx = 0;

        while (idx < WBT_NUM_RWQ) {
                seq_printf(m, "%d: inflight %d\n", idx,
                           atomic_read(&rwb->rq_wait[idx].inflight));
                idx++;
        }
        return 0;
}

/* debugfs: print the configured latency target (min_lat_nsec). */
static int wbt_min_lat_nsec_show(void *data, struct seq_file *m)
{
        struct rq_wb *rwb = RQWB(data);

        seq_printf(m, "%lu\n", rwb->min_lat_nsec);
        return 0;
}

/* debugfs: print unknown_cnt, the counter bumped on LAT_UNKNOWN windows. */
static int wbt_unknown_cnt_show(void *data, struct seq_file *m)
{
        struct rq_wb *rwb = RQWB(data);

        seq_printf(m, "%u\n", rwb->unknown_cnt);
        return 0;
}

/* debugfs: print the normal writeback limit (wb_normal). */
static int wbt_normal_show(void *data, struct seq_file *m)
{
        struct rq_wb *rwb = RQWB(data);

        seq_printf(m, "%u\n", rwb->wb_normal);
        return 0;
}

/* debugfs: print the background writeback limit (wb_background). */
static int wbt_background_show(void *data, struct seq_file *m)
{
        struct rq_wb *rwb = RQWB(data);

        seq_printf(m, "%u\n", rwb->wb_background);
        return 0;
}

/*
 * debugfs attribute table for wbt. Each entry gives a file name, its mode
 * (0400 for all of them) and the show handler that prints the value. The
 * trailing empty entry is presumably the table terminator.
 */
static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = {
        {"curr_win_nsec", 0400, wbt_curr_win_nsec_show},
        {"enabled", 0400, wbt_enabled_show},
        {"id", 0400, wbt_id_show},
        {"inflight", 0400, wbt_inflight_show},
        {"min_lat_nsec", 0400, wbt_min_lat_nsec_show},
        {"unknown_cnt", 0400, wbt_unknown_cnt_show},
        {"wb_normal", 0400, wbt_normal_show},
        {"wb_background", 0400, wbt_background_show},
        {},
};
#endif

/*
 * rq_qos operations implemented by wbt; registered through rq_qos_add()
 * in wbt_init(). The debugfs attribute table is only hooked up when
 * CONFIG_BLK_DEBUG_FS is enabled.
 */
static const struct rq_qos_ops wbt_rqos_ops = {
        .throttle = wbt_wait,
        .issue = wbt_issue,
        .track = wbt_track,
        .requeue = wbt_requeue,
        .done = wbt_done,
        .cleanup = wbt_cleanup,
        .queue_depth_changed = wbt_queue_depth_changed,
        .exit = wbt_exit,
#ifdef CONFIG_BLK_DEBUG_FS
        .debugfs_attrs = wbt_debugfs_attrs,
#endif
};

/*
 * Allocate and set up writeback throttling state for @disk, register it as
 * the RQ_QOS_WBT policy and add its stats callback to the queue.
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, or the error
 * returned by rq_qos_add(). Nothing stays allocated on failure.
 */
int wbt_init(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct rq_wb *rwb;
        int ret;
        int i;

        rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
        if (!rwb)
                return -ENOMEM;

        rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
        if (!rwb->cb) {
                ret = -ENOMEM;
                goto err_free_rwb;
        }

        for (i = 0; i < WBT_NUM_RWQ; i++)
                rq_wait_init(&rwb->rq_wait[i]);

        /* Both timestamps start from the same jiffies reading. */
        rwb->last_issue = jiffies;
        rwb->last_comp = rwb->last_issue;
        rwb->win_nsec = RWB_WINDOW_NSEC;
        rwb->enable_state = WBT_STATE_ON_DEFAULT;
        rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
        rwb->min_lat_nsec = wbt_default_latency_nsec(q);
        rwb->rq_depth.queue_depth = blk_queue_depth(q);
        wbt_update_limits(rwb);

        /* Register the policy under rq_qos_mutex, then add the stats callback. */
        mutex_lock(&q->rq_qos_mutex);
        ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
        mutex_unlock(&q->rq_qos_mutex);
        if (ret)
                goto err_free_cb;

        blk_stat_add_callback(q, rwb->cb);

        return 0;

err_free_cb:
        blk_stat_free_callback(rwb->cb);
err_free_rwb:
        kfree(rwb);
        return ret;
}




















































































































































































































































































































































































































    1 


    1 











    2 

















    1 















































































































































    2 
















    2 




    2 

    2 











    2 





























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
#include <linux/suspend.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/module.h>
#include "blk.h"

static inline struct inode *bdev_file_inode(struct file *file)
{
        return file->f_mapping->host;
}

/*
 * Build the op/flags word for a direct I/O write bio: always
 * REQ_OP_WRITE | REQ_SYNC | REQ_IDLE, plus REQ_FUA for dsync iocbs, which
 * avoids the need for an I/O completion work item.
 */
static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
{
        if (iocb_is_dsync(iocb))
                return REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_FUA;

        return REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
}

/*
 * Return true if a direct I/O at @pos with buffers @iter cannot be issued
 * to @bdev: either @pos is not a multiple of the logical block size, or
 * bdev_iter_is_aligned() rejects the iterator.
 */
static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
                              struct iov_iter *iter)
{
        if (pos & (bdev_logical_block_size(bdev) - 1))
                return true;

        return !bdev_iter_is_aligned(bdev, iter);
}

/* Largest bvec count served from the on-stack array in the simple path. */
#define DIO_INLINE_BIO_VECS 4

/*
 * Direct I/O using one on-stack bio that is submitted and waited for here.
 *
 * @nr_pages sizes the bvec table: up to DIO_INLINE_BIO_VECS entries live
 * on the stack, larger tables are kmalloc'ed and freed before returning.
 *
 * Returns the number of bytes mapped into the bio on success, -ENOMEM if
 * the bvec table cannot be allocated, the error from
 * bio_iov_iter_get_pages(), or the errno derived from the bio status.
 * iocb->ki_pos is read but not advanced in this function.
 */
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
                struct iov_iter *iter, struct block_device *bdev,
                unsigned int nr_pages)
{
        struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
        loff_t pos = iocb->ki_pos;
        bool should_dirty = false;
        struct bio bio;
        ssize_t ret;

        if (nr_pages <= DIO_INLINE_BIO_VECS)
                vecs = inline_vecs;
        else {
                vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
                                     GFP_KERNEL);
                if (!vecs)
                        return -ENOMEM;
        }

        if (iov_iter_rw(iter) == READ) {
                bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ);
                /* Reads into user-backed buffers get should_dirty on release. */
                if (user_backed_iter(iter))
                        should_dirty = true;
        } else {
                bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
        }
        bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
        bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
        bio.bi_ioprio = iocb->ki_ioprio;

        ret = bio_iov_iter_get_pages(&bio, iter);
        if (unlikely(ret))
                goto out;
        /* Capture the byte count before submission; it is the success return. */
        ret = bio.bi_iter.bi_size;

        if (iov_iter_rw(iter) == WRITE)
                task_io_account_write(ret);

        if (iocb->ki_flags & IOCB_NOWAIT)
                bio.bi_opf |= REQ_NOWAIT;

        submit_bio_wait(&bio);

        bio_release_pages(&bio, should_dirty);
        if (unlikely(bio.bi_status))
                ret = blk_status_to_errno(bio.bi_status);

out:
        /* Only a heap-allocated bvec table needs freeing. */
        if (vecs != inline_vecs)
                kfree(vecs);

        bio_uninit(&bio);

        return ret;
}

/* Bits stored in blkdev_dio.flags. */
enum {
        DIO_SHOULD_DIRTY        = 1,    /* set for reads into user-backed iterators */
        DIO_IS_SYNC                = 2,    /* set when the kiocb is synchronous */
};

/*
 * Per-request direct I/O state. The structure embeds the first bio of the
 * request, so it is recovered from that bio with container_of().
 */
struct blkdev_dio {
        union {
                struct kiocb                *iocb;          /* async: iocb to complete */
                struct task_struct        *waiter;        /* sync: task to wake (DIO_IS_SYNC) */
        };
        size_t                        size;           /* bytes covered by the request's bios */
        atomic_t                ref;            /* bio count in the multi-bio path */
        unsigned int                flags;          /* DIO_* bits */
        struct bio                bio ____cacheline_aligned_in_smp; /* embedded first bio */
};

/*
 * Bio set from which the first bio of a direct I/O request is allocated.
 * Callers recover the enclosing struct blkdev_dio from such a bio with
 * container_of(). NOTE(review): the pool's initialisation is not in this
 * part of the file; it presumably reserves offsetof(struct blkdev_dio, bio)
 * bytes of front padding. Confirm.
 */
static struct bio_set blkdev_dio_pool;

/*
 * Completion handler for every bio issued by __blkdev_direct_IO().
 *
 * The first error seen is recorded in the embedded dio->bio. When the last
 * reference on the dio is dropped, an async request completes its iocb
 * (advancing ki_pos on success) and a sync request wakes its waiter.
 * Afterwards the pages of this bio are released, or checked for dirtying
 * when DIO_SHOULD_DIRTY is set.
 */
static void blkdev_bio_end_io(struct bio *bio)
{
        struct blkdev_dio *dio = bio->bi_private;
        /*
         * Read the flag up front. NOTE(review): presumably because the dio
         * may no longer be safe to touch after the final reference drop
         * below. Confirm.
         */
        bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;

        /* Keep only the first error. */
        if (bio->bi_status && !dio->bio.bi_status)
                dio->bio.bi_status = bio->bi_status;

        if (atomic_dec_and_test(&dio->ref)) {
                if (!(dio->flags & DIO_IS_SYNC)) {
                        struct kiocb *iocb = dio->iocb;
                        ssize_t ret;

                        WRITE_ONCE(iocb->private, NULL);

                        if (likely(!dio->bio.bi_status)) {
                                ret = dio->size;
                                iocb->ki_pos += ret;
                        } else {
                                ret = blk_status_to_errno(dio->bio.bi_status);
                        }

                        dio->iocb->ki_complete(iocb, ret);
                        /* Pairs with the extra bio_get() taken at submission. */
                        bio_put(&dio->bio);
                } else {
                        struct task_struct *waiter = dio->waiter;

                        /* The submitter polls dio->waiter; clear it, then wake. */
                        WRITE_ONCE(dio->waiter, NULL);
                        blk_wake_io_task(waiter);
                }
        }

        if (should_dirty) {
                bio_check_pages_dirty(bio);
        } else {
                bio_release_pages(bio, false);
                bio_put(bio);
        }
}

/*
 * Direct I/O path for requests that may need more than one bio.
 *
 * The first bio comes from blkdev_dio_pool and embeds the blkdev_dio that
 * tracks the whole request; further bios come from bio_alloc(). All bios
 * complete through blkdev_bio_end_io().
 *
 * Returns -EIOCBQUEUED for async iocbs once everything is submitted. For
 * sync iocbs it waits for completion and returns the byte count or a
 * negative errno. Returns -EAGAIN for IOCB_NOWAIT requests that would
 * need more than one bio.
 */
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                struct block_device *bdev, unsigned int nr_pages)
{
        struct blk_plug plug;
        struct blkdev_dio *dio;
        struct bio *bio;
        bool is_read = (iov_iter_rw(iter) == READ), is_sync;
        blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
        loff_t pos = iocb->ki_pos;
        int ret = 0;

        if (iocb->ki_flags & IOCB_ALLOC_CACHE)
                opf |= REQ_ALLOC_CACHE;
        bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
                               &blkdev_dio_pool);
        dio = container_of(bio, struct blkdev_dio, bio);
        atomic_set(&dio->ref, 1);
        /*
         * Grab an extra reference to ensure the dio structure which is embedded
         * into the first bio stays around.
         */
        bio_get(bio);

        is_sync = is_sync_kiocb(iocb);
        if (is_sync) {
                dio->flags = DIO_IS_SYNC;
                dio->waiter = current;
        } else {
                dio->flags = 0;
                dio->iocb = iocb;
        }

        dio->size = 0;
        if (is_read && user_backed_iter(iter))
                dio->flags |= DIO_SHOULD_DIRTY;

        blk_start_plug(&plug);

        /* One iteration per bio; the loop ends when the iterator is drained. */
        for (;;) {
                bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
                bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
                bio->bi_private = dio;
                bio->bi_end_io = blkdev_bio_end_io;
                bio->bi_ioprio = iocb->ki_ioprio;

                ret = bio_iov_iter_get_pages(bio, iter);
                if (unlikely(ret)) {
                        /* Fail this bio through the normal completion path. */
                        bio->bi_status = BLK_STS_IOERR;
                        bio_endio(bio);
                        break;
                }
                if (iocb->ki_flags & IOCB_NOWAIT) {
                        /*
                         * This is nonblocking IO, and we need to allocate
                         * another bio if we have data left to map. As we
                         * cannot guarantee that one of the sub bios will not
                         * fail getting issued FOR NOWAIT and as error results
                         * are coalesced across all of them, be safe and ask for
                         * a retry of this from blocking context.
                         */
                        if (unlikely(iov_iter_count(iter))) {
                                bio_release_pages(bio, false);
                                bio_clear_flag(bio, BIO_REFFED);
                                bio_put(bio);
                                blk_finish_plug(&plug);
                                return -EAGAIN;
                        }
                        bio->bi_opf |= REQ_NOWAIT;
                }

                if (is_read) {
                        if (dio->flags & DIO_SHOULD_DIRTY)
                                bio_set_pages_dirty(bio);
                } else {
                        task_io_account_write(bio->bi_iter.bi_size);
                }
                dio->size += bio->bi_iter.bi_size;
                pos += bio->bi_iter.bi_size;

                nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
                if (!nr_pages) {
                        /* Last bio: it completes against the initial dio->ref. */
                        submit_bio(bio);
                        break;
                }
                /* More bios follow: take the reference before submitting. */
                atomic_inc(&dio->ref);
                submit_bio(bio);
                bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL);
        }

        blk_finish_plug(&plug);

        if (!is_sync)
                return -EIOCBQUEUED;

        /* Sync: sleep until blkdev_bio_end_io() clears dio->waiter. */
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (!READ_ONCE(dio->waiter))
                        break;
                blk_io_schedule();
        }
        __set_current_state(TASK_RUNNING);

        /* A page-mapping error takes precedence over the bio status. */
        if (!ret)
                ret = blk_status_to_errno(dio->bio.bi_status);
        if (likely(!ret))
                ret = dio->size;

        /* Drop the extra reference taken on the embedded bio above. */
        bio_put(&dio->bio);
        return ret;
}

/*
 * Completion handler for the single bio issued by
 * __blkdev_direct_IO_async(). Completes the iocb with the byte count
 * (advancing ki_pos) or the errno derived from the bio status, then
 * releases the bio's pages, or checks them for dirtying when
 * DIO_SHOULD_DIRTY is set.
 */
static void blkdev_bio_end_io_async(struct bio *bio)
{
        /* The bio is embedded in its blkdev_dio. */
        struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio);
        struct kiocb *iocb = dio->iocb;
        ssize_t ret;

        WRITE_ONCE(iocb->private, NULL);

        if (likely(!bio->bi_status)) {
                ret = dio->size;
                iocb->ki_pos += ret;
        } else {
                ret = blk_status_to_errno(bio->bi_status);
        }

        iocb->ki_complete(iocb, ret);

        if (dio->flags & DIO_SHOULD_DIRTY) {
                bio_check_pages_dirty(bio);
        } else {
                bio_release_pages(bio, false);
                bio_put(bio);
        }
}

/*
 * Async direct I/O that fits in a single bio.
 *
 * The bio comes from blkdev_dio_pool and completes through
 * blkdev_bio_end_io_async(). Returns -EIOCBQUEUED once the bio has been
 * submitted, or the error from bio_iov_iter_get_pages().
 */
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
                                        struct iov_iter *iter,
                                        struct block_device *bdev,
                                        unsigned int nr_pages)
{
        bool is_read = iov_iter_rw(iter) == READ;
        blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
        struct blkdev_dio *dio;
        struct bio *bio;
        loff_t pos = iocb->ki_pos;
        int ret = 0;

        if (iocb->ki_flags & IOCB_ALLOC_CACHE)
                opf |= REQ_ALLOC_CACHE;
        bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
                               &blkdev_dio_pool);
        dio = container_of(bio, struct blkdev_dio, bio);
        dio->flags = 0;
        dio->iocb = iocb;
        bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
        bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
        bio->bi_end_io = blkdev_bio_end_io_async;
        bio->bi_ioprio = iocb->ki_ioprio;

        if (iov_iter_is_bvec(iter)) {
                /*
                 * Users don't rely on the iterator being in any particular
                 * state for async I/O returning -EIOCBQUEUED, hence we can
                 * avoid expensive iov_iter_advance(). Bypass
                 * bio_iov_iter_get_pages() and set the bvec directly.
                 */
                bio_iov_bvec_set(bio, iter);
        } else {
                ret = bio_iov_iter_get_pages(bio, iter);
                if (unlikely(ret)) {
                        /* Nothing was submitted; just drop the bio. */
                        bio_put(bio);
                        return ret;
                }
        }
        /* Byte count reported to the iocb on successful completion. */
        dio->size = bio->bi_iter.bi_size;

        if (is_read) {
                if (user_backed_iter(iter)) {
                        dio->flags |= DIO_SHOULD_DIRTY;
                        bio_set_pages_dirty(bio);
                }
        } else {
                task_io_account_write(bio->bi_iter.bi_size);
        }

        if (iocb->ki_flags & IOCB_NOWAIT)
                bio->bi_opf |= REQ_NOWAIT;

        if (iocb->ki_flags & IOCB_HIPRI) {
                /* Polled I/O: the bio is published in iocb->private after submit. */
                bio->bi_opf |= REQ_POLLED;
                submit_bio(bio);
                WRITE_ONCE(iocb->private, bio);
        } else {
                submit_bio(bio);
        }
        return -EIOCBQUEUED;
}

/*
 * Direct I/O entry point for block devices.
 *
 * Returns 0 for an empty iterator and -EINVAL for a misaligned request.
 * Requests needing more than BIO_MAX_VECS vectors go to the multi-bio
 * path; the rest use the simple on-stack path for sync iocbs and the
 * single-bio async path otherwise.
 */
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
        unsigned int nr_vecs;

        if (!iov_iter_count(iter))
                return 0;

        if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
                return -EINVAL;

        /* Ask for one more than the maximum to detect the multi-bio case. */
        nr_vecs = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
        if (unlikely(nr_vecs > BIO_MAX_VECS))
                return __blkdev_direct_IO(iocb, iter, bdev,
                                          bio_max_segs(nr_vecs));

        if (is_sync_kiocb(iocb))
                return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_vecs);

        return __blkdev_direct_IO_async(iocb, iter, bdev, nr_vecs);
}

/*
 * iomap_begin for block devices: a single identity mapping from the
 * block-aligned @offset to the end of the device. Returns -EIO when
 * @offset is at or beyond the device size; bdev and offset are already
 * filled in by then.
 */
static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
                unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
        struct block_device *bdev = I_BDEV(inode);
        const loff_t dev_size = i_size_read(inode);

        iomap->bdev = bdev;
        iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
        if (offset >= dev_size)
                return -EIO;

        /* Device address equals file offset. */
        iomap->addr = iomap->offset;
        iomap->length = dev_size - iomap->offset;
        iomap->type = IOMAP_MAPPED;
        iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */

        return 0;
}

/* iomap operations for block devices; only ->iomap_begin is provided. */
static const struct iomap_ops blkdev_iomap_ops = {
        .iomap_begin                = blkdev_iomap_begin,
};

#ifdef CONFIG_BUFFER_HEAD
/*
 * get_block callback for the block device mapping: the buffer is mapped
 * one-to-one, i.e. block @iblock of the inode is block @iblock of the
 * device.  @create is ignored.  Always returns 0.
 */
static int blkdev_get_block(struct inode *inode, sector_t iblock,
                struct buffer_head *bh, int create)
{
        struct block_device *bdev = I_BDEV(inode);

        bh->b_bdev = bdev;
        bh->b_blocknr = iblock;
        set_buffer_mapped(bh);

        return 0;
}

/*
 * Write back dirty folios of the block device mapping inside a plug.
 *
 * mpage_writepages() cannot be used here because it does not take the
 * buffer lock.  block_write_full_folio() holds the buffer lock, and that
 * lock is the synchronisation with writeback that filesystems rely on when
 * they use the blockdev's mapping.
 *
 * Returns the result of write_cache_pages().
 */
static int blkdev_writepages(struct address_space *mapping,
                struct writeback_control *wbc)
{
        struct blk_plug plug;
        int ret;

        blk_start_plug(&plug);
        ret = write_cache_pages(mapping, wbc, block_write_full_folio,
                                blkdev_get_block);
        blk_finish_plug(&plug);

        return ret;
}

/*
 * ->read_folio for the buffer_head build: forwards to
 * block_read_full_folio() with blkdev_get_block as the mapping callback.
 * @file is not used.
 */
static int blkdev_read_folio(struct file *file, struct folio *folio)
{
        return block_read_full_folio(folio, blkdev_get_block);
}

/*
 * ->readahead for the buffer_head build: forwards to mpage_readahead()
 * with blkdev_get_block as the mapping callback.
 */
static void blkdev_readahead(struct readahead_control *rac)
{
        mpage_readahead(rac, blkdev_get_block);
}

/*
 * ->write_begin for the buffer_head build: forwards to block_write_begin()
 * with blkdev_get_block as the mapping callback.  @file and @fsdata are
 * not used.
 */
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
                loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
        return block_write_begin(mapping, pos, len, pagep, blkdev_get_block);
}

/*
 * ->write_end for the buffer_head build: finish the write through
 * block_write_end(), then unlock @page and drop a reference on it
 * regardless of the outcome.  Returns block_write_end()'s result.
 */
static int blkdev_write_end(struct file *file, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned copied, struct page *page,
                void *fsdata)
{
        int result = block_write_end(file, mapping, pos, len, copied, page,
                                     fsdata);

        unlock_page(page);
        put_page(page);
        return result;
}

/*
 * Address space operations for block devices when CONFIG_BUFFER_HEAD is
 * enabled: the blkdev_* callbacks defined above combined with the generic
 * buffer_head helpers.
 */
const struct address_space_operations def_blk_aops = {
        .dirty_folio        = block_dirty_folio,
        .invalidate_folio = block_invalidate_folio,
        .read_folio        = blkdev_read_folio,
        .readahead        = blkdev_readahead,
        .writepages        = blkdev_writepages,
        .write_begin        = blkdev_write_begin,
        .write_end        = blkdev_write_end,
        .migrate_folio        = buffer_migrate_folio_norefs,
        .is_dirty_writeback = buffer_check_dirty_writeback,
};
#else /* CONFIG_BUFFER_HEAD */
/*
 * ->read_folio for the !CONFIG_BUFFER_HEAD build: forwards to
 * iomap_read_folio() using blkdev_iomap_ops.  @file is not used.
 */
static int blkdev_read_folio(struct file *file, struct folio *folio)
{
        return iomap_read_folio(folio, &blkdev_iomap_ops);
}

/*
 * ->readahead for the !CONFIG_BUFFER_HEAD build: forwards to
 * iomap_readahead() using blkdev_iomap_ops.
 */
static void blkdev_readahead(struct readahead_control *rac)
{
        iomap_readahead(rac, &blkdev_iomap_ops);
}

/*
 * ->map_blocks for iomap writeback.  If @offset already lies inside the
 * mapping cached in @wpc the cached mapping is reused; otherwise a fresh
 * one covering @offset up to the inode size is built with
 * blkdev_iomap_begin().  @len is not used.
 *
 * Returns 0 on success, or -EIO (with a one-time warning) when @offset is
 * at or beyond the inode size.
 */
static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
                struct inode *inode, loff_t offset, unsigned int len)
{
        loff_t isize = i_size_read(inode);

        if (WARN_ON_ONCE(offset >= isize))
                return -EIO;

        /* Outside the cached extent: build a new mapping. */
        if (offset < wpc->iomap.offset ||
            offset >= wpc->iomap.offset + wpc->iomap.length)
                return blkdev_iomap_begin(inode, offset, isize - offset,
                                          IOMAP_WRITE, &wpc->iomap, NULL);

        return 0;
}

/* iomap writeback operations for block devices; only ->map_blocks is set. */
static const struct iomap_writeback_ops blkdev_writeback_ops = {
        .map_blocks                = blkdev_map_blocks,
};

/*
 * ->writepages for the !CONFIG_BUFFER_HEAD build: run iomap_writepages()
 * with a zero-initialised writepage context and blkdev_writeback_ops.
 */
static int blkdev_writepages(struct address_space *mapping,
                struct writeback_control *wbc)
{
        struct iomap_writepage_ctx wpc = { };

        return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops);
}

/*
 * Address space operations for block devices when CONFIG_BUFFER_HEAD is
 * disabled: the blkdev_* callbacks defined above combined with the generic
 * iomap and filemap helpers.
 */
const struct address_space_operations def_blk_aops = {
        .dirty_folio        = filemap_dirty_folio,
        .release_folio                = iomap_release_folio,
        .invalidate_folio        = iomap_invalidate_folio,
        .read_folio                = blkdev_read_folio,
        .readahead                = blkdev_readahead,
        .writepages                = blkdev_writepages,
        .is_partially_uptodate  = iomap_is_partially_uptodate,
        .error_remove_folio        = generic_error_remove_folio,
        .migrate_folio                = filemap_migrate_folio,
};
#endif /* CONFIG_BUFFER_HEAD */

/*
 * llseek for block devices.
 *
 * For a block special file, file_inode(file)->i_size is zero, so the size
 * used for bounds checking is read from the inode returned by
 * bdev_file_inode() instead.  The size read and the seek are done under
 * that inode's lock.
 */
static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *bd_inode = bdev_file_inode(file);
        loff_t new_pos;

        inode_lock(bd_inode);
        new_pos = fixed_size_llseek(file, offset, whence,
                                    i_size_read(bd_inode));
        inode_unlock(bd_inode);

        return new_pos;
}

/*
 * fsync for block devices: write out and wait on the page cache range
 * [@start, @end], then issue a flush to the device.  @datasync is not used.
 *
 * Returns 0 on success or a negative errno.  -EOPNOTSUPP from the flush is
 * reported as success.
 */
static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
                int datasync)
{
        struct block_device *bdev = I_BDEV(filp->f_mapping->host);
        int ret;

        ret = file_write_and_wait_range(filp, start, end);
        if (ret)
                return ret;

        /*
         * The flush is deliberately issued without holding i_mutex:
         * serialising blkdev_issue_flush() calls is unnecessary and hurts
         * concurrent O_SYNC writers to a block device.
         */
        ret = blkdev_issue_flush(bdev);

        return ret == -EOPNOTSUPP ? 0 : ret;
}

/**
 * file_to_blk_mode - get block open flags from file flags
 * @file: file whose open flags should be converted
 *
 * Translate the open flags of @file into the corresponding block open flags.
 * This works both for a file that is just being opened (e.g. from the ->open
 * callback) and for a file that is already open, which is why exclusivity is
 * derived from two different places (see the comment in the function).
 *
 * Return: the accumulated BLK_OPEN_* flags.
 */
blk_mode_t file_to_blk_mode(struct file *file)
{
        const unsigned int f_flags = file->f_flags;
        blk_mode_t mode = 0;

        if (file->f_mode & FMODE_READ)
                mode |= BLK_OPEN_READ;
        if (file->f_mode & FMODE_WRITE)
                mode |= BLK_OPEN_WRITE;

        /*
         * do_dentry_open() clears O_EXCL from f_flags, so for an already
         * open file a non-NULL file->private_data is what marks the open as
         * exclusive; while the file is being opened O_EXCL is still visible.
         */
        if (file->private_data || (f_flags & O_EXCL))
                mode |= BLK_OPEN_EXCL;

        if (f_flags & O_NDELAY)
                mode |= BLK_OPEN_NDELAY;

        /*
         * If all bits in O_ACCMODE are set (aka O_RDWR | O_WRONLY), the
         * floppy driver has historically allowed ioctls as if the file was
         * opened for writing, but does not allow any actual reads or writes.
         */
        if ((f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY))
                mode |= BLK_OPEN_WRITE_IOCTL;

        return mode;
}

/*
 * ->open for block device files.
 *
 * Converts the file flags to a block open mode, checks permission, looks
 * up the block device for the inode's device number and opens it.  For an
 * exclusive open the file itself is recorded in ->private_data and passed
 * on as the holder.
 *
 * Returns 0 on success, -ENXIO when no block device is found, or the error
 * from bdev_permission()/bdev_open().
 */
static int blkdev_open(struct inode *inode, struct file *filp)
{
        struct block_device *bdev;
        blk_mode_t mode = file_to_blk_mode(filp);
        int err;

        /* Use the file as the holder. */
        if (mode & BLK_OPEN_EXCL)
                filp->private_data = filp;

        err = bdev_permission(inode->i_rdev, mode, filp->private_data);
        if (err)
                return err;

        bdev = blkdev_get_no_open(inode->i_rdev);
        if (!bdev)
                return -ENXIO;

        err = bdev_open(bdev, mode, filp->private_data, NULL, filp);
        if (!err)
                return 0;

        /* Open failed: undo blkdev_get_no_open(). */
        blkdev_put_no_open(bdev);
        return err;
}

/*
 * ->release for block device files: hands the file to bdev_release().
 * @inode is not used.  Always returns 0.
 */
static int blkdev_release(struct inode *inode, struct file *filp)
{
        bdev_release(filp);
        return 0;
}

/*
 * Perform a direct write of @from at iocb->ki_pos.
 *
 * The affected page cache range is invalidated first; if that reports
 * -EBUSY the function returns 0 without writing anything, and any other
 * invalidation error is returned as is.
 *
 * After a write that made progress, the range is invalidated again and
 * ki_pos is advanced.  Unless the I/O was queued (-EIOCBQUEUED), the
 * iterator is reverted so that it has consumed exactly the bytes written.
 *
 * Returns the number of bytes written, -EIOCBQUEUED, or a negative errno.
 */
static ssize_t
blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
        size_t remaining = iov_iter_count(from);
        ssize_t ret;

        ret = kiocb_invalidate_pages(iocb, remaining);
        if (ret == -EBUSY)
                return 0;
        if (ret)
                return ret;

        ret = blkdev_direct_IO(iocb, from);
        if (ret > 0) {
                /* Invalidate using the full requested length, then account. */
                kiocb_invalidate_post_direct_write(iocb, remaining);
                iocb->ki_pos += ret;
                remaining -= ret;
        }

        if (ret == -EIOCBQUEUED)
                return ret;

        iov_iter_revert(from, remaining - iov_iter_count(from));
        return ret;
}

/*
 * Buffered write path: forwards to iomap_file_buffered_write() using
 * blkdev_iomap_ops and returns its result.
 */
static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
{
        return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
}

/*
 * Write data to the block device.  Only intended for the block device itself
 * and the raw driver which basically is a fake block device.
 *
 * Does not take i_mutex for the write and thus is not for general purpose
 * use.
 *
 * A write that would run past the end of the device is shortened to fit;
 * the iterator is re-expanded by the trimmed amount before returning from
 * the I/O path.
 *
 * Returns the number of bytes written or a negative errno:
 *   -EPERM       the device is read-only
 *   -ETXTBSY     the device is an active swap file (and not the hibernate
 *                resume device)
 *   -ENOSPC      the position is at or beyond the end of the device
 *   -EOPNOTSUPP  IOCB_NOWAIT was requested without IOCB_DIRECT
 */
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct inode *bd_inode = bdev_file_inode(file);
        struct block_device *bdev = I_BDEV(bd_inode);
        loff_t room = bdev_nr_bytes(bdev);
        size_t trimmed = 0;
        ssize_t ret;

        if (bdev_read_only(bdev))
                return -EPERM;
        if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
                return -ETXTBSY;
        if (iov_iter_count(from) == 0)
                return 0;
        if (iocb->ki_pos >= room)
                return -ENOSPC;
        if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
                return -EOPNOTSUPP;

        /* From here on "room" is the space left after the write position. */
        room -= iocb->ki_pos;
        if (iov_iter_count(from) > room) {
                trimmed = iov_iter_count(from) - room;
                iov_iter_truncate(from, room);
        }

        ret = file_update_time(file);
        if (ret)
                return ret;

        if (!(iocb->ki_flags & IOCB_DIRECT)) {
                ret = blkdev_buffered_write(iocb, from);
        } else {
                ret = blkdev_direct_write(iocb, from);
                /* Direct I/O left data behind: finish it through the cache. */
                if (ret >= 0 && iov_iter_count(from)) {
                        ssize_t buffered = blkdev_buffered_write(iocb, from);

                        ret = direct_write_fallback(iocb, from, ret, buffered);
                }
        }

        if (ret > 0)
                ret = generic_write_sync(iocb, ret);

        iov_iter_reexpand(from, iov_iter_count(from) + trimmed);
        return ret;
}

/*
 * Read from the block device into @to.
 *
 * A read that would run past the end of the device is shortened to fit (a
 * read starting at or beyond the end returns 0); the iterator is
 * re-expanded by the trimmed amount on the way out.
 *
 * For IOCB_DIRECT the cached range is written back and waited on, then the
 * direct read is attempted; whatever it did not cover is read through the
 * page cache with filemap_read().
 *
 * Returns the number of bytes read or a negative errno.
 */
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
        loff_t limit = bdev_nr_bytes(bdev);
        loff_t pos = iocb->ki_pos;
        size_t trimmed = 0;
        size_t left;
        ssize_t ret = 0;

        if (unlikely(pos + iov_iter_count(to) > limit)) {
                if (pos >= limit)
                        return 0;
                /* "limit" now holds the bytes available from pos onwards. */
                limit -= pos;
                trimmed = iov_iter_count(to) - limit;
                iov_iter_truncate(to, limit);
        }

        left = iov_iter_count(to);
        if (left == 0)
                goto out; /* skip atime */

        if (iocb->ki_flags & IOCB_DIRECT) {
                ret = kiocb_write_and_wait(iocb, left);
                if (ret < 0)
                        goto out;
                file_accessed(iocb->ki_filp);

                ret = blkdev_direct_IO(iocb, to);
                if (ret >= 0) {
                        iocb->ki_pos += ret;
                        left -= ret;
                }
                /* Make the iterator reflect exactly what was transferred. */
                iov_iter_revert(to, left - iov_iter_count(to));
                if (ret < 0 || left == 0)
                        goto out;
        }

        ret = filemap_read(iocb, to, ret);

out:
        if (unlikely(trimmed))
                iov_iter_reexpand(to, iov_iter_count(to) + trimmed);
        return ret;
}

/*
 * fallocate() mode bits recognised by blkdev_fallocate(); any other bit
 * makes it fail with -EOPNOTSUPP.
 */
#define        BLKDEV_FALLOC_FL_SUPPORTED                                        \
                (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |                \
                 FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)

/*
 * fallocate() for block devices.
 *
 * Supported mode combinations:
 *   ZERO_RANGE [| KEEP_SIZE]                 zero-out with BLKDEV_ZERO_NOUNMAP
 *   PUNCH_HOLE | KEEP_SIZE                   zero-out with BLKDEV_ZERO_NOFALLBACK
 *   PUNCH_HOLE | KEEP_SIZE | NO_HIDE_STALE   discard
 *
 * In each case the page cache for the range is dropped with
 * truncate_bdev_range() first, and everything runs under the mapping's
 * invalidate lock.
 *
 * Returns 0 on success; -EOPNOTSUPP for unknown flags or an unsupported
 * combination; -EINVAL when the range starts beyond the device, extends
 * beyond it without FALLOC_FL_KEEP_SIZE, or is not aligned to the logical
 * block size; otherwise the error from the helper that failed.
 */
static long blkdev_fallocate(struct file *file, int mode, loff_t start,
                             loff_t len)
{
        struct inode *inode = bdev_file_inode(file);
        struct block_device *bdev = I_BDEV(inode);
        loff_t end = start + len - 1;
        loff_t isize;
        int error;

        /* Fail if we don't recognize the flags. */
        if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
                return -EOPNOTSUPP;

        /* Don't go off the end of the device. */
        isize = bdev_nr_bytes(bdev);
        if (start >= isize)
                return -EINVAL;
        if (end >= isize) {
                if (!(mode & FALLOC_FL_KEEP_SIZE))
                        return -EINVAL;
                /* With KEEP_SIZE the range is clamped to the device end. */
                len = isize - start;
                end = start + len - 1;
        }

        /* Don't allow IO that isn't aligned to logical block size. */
        if ((start | len) & (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

        filemap_invalidate_lock(inode->i_mapping);

        /*
         * Invalidate the page cache, including dirty pages, for valid
         * de-allocate mode calls to fallocate().
         */
        switch (mode) {
        case FALLOC_FL_ZERO_RANGE:
        case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
                error = truncate_bdev_range(bdev, file_to_blk_mode(file),
                                            start, end);
                if (!error)
                        error = blkdev_issue_zeroout(bdev,
                                                     start >> SECTOR_SHIFT,
                                                     len >> SECTOR_SHIFT,
                                                     GFP_KERNEL,
                                                     BLKDEV_ZERO_NOUNMAP);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
                error = truncate_bdev_range(bdev, file_to_blk_mode(file),
                                            start, end);
                if (!error)
                        error = blkdev_issue_zeroout(bdev,
                                                     start >> SECTOR_SHIFT,
                                                     len >> SECTOR_SHIFT,
                                                     GFP_KERNEL,
                                                     BLKDEV_ZERO_NOFALLBACK);
                break;
        case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
                error = truncate_bdev_range(bdev, file_to_blk_mode(file),
                                            start, end);
                if (!error)
                        error = blkdev_issue_discard(bdev,
                                                     start >> SECTOR_SHIFT,
                                                     len >> SECTOR_SHIFT,
                                                     GFP_KERNEL);
                break;
        default:
                error = -EOPNOTSUPP;
                break;
        }

        filemap_invalidate_unlock(inode->i_mapping);
        return error;
}

/*
 * ->mmap for block device files: a read-only device gets
 * generic_file_readonly_mmap(), everything else generic_file_mmap().
 */
static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct block_device *bdev = I_BDEV(bdev_file_inode(file));

        if (!bdev_read_only(bdev))
                return generic_file_mmap(file, vma);

        return generic_file_readonly_mmap(file, vma);
}

/*
 * File operations for block device files.  ->compat_ioctl is only wired up
 * when CONFIG_COMPAT is enabled.
 */
const struct file_operations def_blk_fops = {
        .open                = blkdev_open,
        .release        = blkdev_release,
        .llseek                = blkdev_llseek,
        .read_iter        = blkdev_read_iter,
        .write_iter        = blkdev_write_iter,
        .iopoll                = iocb_bio_iopoll,
        .mmap                = blkdev_mmap,
        .fsync                = blkdev_fsync,
        .unlocked_ioctl        = blkdev_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl        = compat_blkdev_ioctl,
#endif
        .splice_read        = filemap_splice_read,
        .splice_write        = iter_file_splice_write,
        .fallocate        = blkdev_fallocate,
        .fop_flags        = FOP_BUFFER_RASYNC,
};

/*
 * Initialise blkdev_dio_pool, the bio set used for block device direct
 * I/O.  Each bio is embedded in a struct blkdev_dio, hence the front
 * padding of offsetof(struct blkdev_dio, bio).  Returns bioset_init()'s
 * result.
 */
static __init int blkdev_init(void)
{
        const int pool_flags = BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE;

        return bioset_init(&blkdev_dio_pool, 4,
                           offsetof(struct blkdev_dio, bio), pool_flags);
}
module_init(blkdev_init);




























































































































































































































































    1 


    1 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
/*
 * userio kernel serio device emulation module
 * Copyright (C) 2015 Red Hat
 * Copyright (C) 2015 Stephen Chandler Paul <thatslyude@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
 * General Public License for more details.
 */

#include <linux/circ_buf.h>
#include <linux/mutex.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/serio.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/sched.h>
#include <linux/poll.h>
#include <uapi/linux/userio.h>

#define USERIO_NAME                "userio"
#define USERIO_BUFSIZE                16

static struct miscdevice userio_misc;

/*
 * Per-open state of /dev/userio; allocated in userio_char_open() and freed
 * in userio_char_release().
 */
struct userio_device {
        /* The emulated serio port; its port_data points back at this struct */
        struct serio *serio;
        /* Held while a command written by userspace is being processed */
        struct mutex mutex;

        /* Set once USERIO_CMD_REGISTER has registered the port */
        bool running;

        /* Ring buffer indices: bytes are stored at head and read from tail */
        u8 head;
        u8 tail;

        /* Taken around accesses to buf/head/tail in the read and write paths */
        spinlock_t buf_lock;
        unsigned char buf[USERIO_BUFSIZE];

        /* Readers sleep here until the ring buffer is non-empty */
        wait_queue_head_t waitq;
};

/**
 * userio_device_write - Write data from serio to a userio device in userspace
 * @id: The serio port for the userio device
 * @val: The data to write to the device
 *
 * Stores @val at the head of the ring buffer under buf_lock, advances the
 * head, and wakes up anyone sleeping on the device's wait queue.
 *
 * If the head catches up with the tail after the store, a warning is logged.
 * In that state head == tail, which the read and poll paths treat as an
 * empty buffer.
 *
 * Return: always 0.
 */
static int userio_device_write(struct serio *id, unsigned char val)
{
        struct userio_device *userio = id->port_data;
        unsigned long flags;

        spin_lock_irqsave(&userio->buf_lock, flags);

        userio->buf[userio->head] = val;
        userio->head = (userio->head + 1) % USERIO_BUFSIZE;

        /* Terminate the message with a newline like the other warnings. */
        if (userio->head == userio->tail)
                dev_warn(userio_misc.this_device,
                         "Buffer overflowed, userio client isn't keeping up\n");

        spin_unlock_irqrestore(&userio->buf_lock, flags);

        wake_up_interruptible(&userio->waitq);

        return 0;
}

static int userio_char_open(struct inode *inode, struct file *file)
{
        struct userio_device *userio;

        userio = kzalloc(sizeof(struct userio_device), GFP_KERNEL);
        if (!userio)
                return -ENOMEM;

        mutex_init(&userio->mutex);
        spin_lock_init(&userio->buf_lock);
        init_waitqueue_head(&userio->waitq);

        userio->serio = kzalloc(sizeof(struct serio), GFP_KERNEL);
        if (!userio->serio) {
                kfree(userio);
                return -ENOMEM;
        }

        userio->serio->write = userio_device_write;
        userio->serio->port_data = userio;

        file->private_data = userio;

        return 0;
}

/*
 * ->release for /dev/userio: tear down the serio port and free the
 * per-open state.  Always returns 0.
 */
static int userio_char_release(struct inode *inode, struct file *file)
{
        struct userio_device *userio = file->private_data;

        if (!userio->running) {
                /* Never registered, so the port is still ours to free. */
                kfree(userio->serio);
        } else {
                /*
                 * Don't free the serio port here, serio_unregister_port()
                 * does it for us.
                 */
                serio_unregister_port(userio->serio);
        }

        kfree(userio);

        return 0;
}

/*
 * ->read for /dev/userio: hand bytes queued by userio_device_write() to
 * userspace.
 *
 * At most one contiguous run of the ring buffer (up to its wrap point) and
 * at most @count bytes are returned per call.  With an empty buffer the
 * call returns -EAGAIN for O_NONBLOCK files, 0 when @count is 0, and
 * otherwise sleeps until data arrives or a signal interrupts the wait.
 *
 * Returns the number of bytes copied, or a negative errno (-EAGAIN,
 * -EFAULT, or the error from the interrupted wait).
 */
static ssize_t userio_char_read(struct file *file, char __user *user_buffer,
                                size_t count, loff_t *ppos)
{
        struct userio_device *userio = file->private_data;
        unsigned char chunk[USERIO_BUFSIZE];
        unsigned long flags;
        size_t avail, ncopy;
        int error;

        /*
         * Another thread may have consumed the data between our wakeup and
         * this point, so the buffer is re-checked under the lock on every
         * iteration and the wait is repeated until something is available.
         */
        while (1) {
                spin_lock_irqsave(&userio->buf_lock, flags);

                avail = CIRC_CNT_TO_END(userio->head, userio->tail,
                                        USERIO_BUFSIZE);
                ncopy = min(avail, count);
                if (ncopy != 0) {
                        /* Copy out under the lock; the user copy happens later. */
                        memcpy(chunk, &userio->buf[userio->tail], ncopy);
                        userio->tail = (userio->tail + ncopy) % USERIO_BUFSIZE;
                }

                spin_unlock_irqrestore(&userio->buf_lock, flags);

                if (avail != 0)
                        break;

                /* buffer was/is empty */
                if (file->f_flags & O_NONBLOCK)
                        return -EAGAIN;

                /*
                 * count == 0 is special - no IO is done but we check
                 * for error conditions (see above).
                 */
                if (count == 0)
                        return 0;

                error = wait_event_interruptible(userio->waitq,
                                                 userio->head != userio->tail);
                if (error)
                        return error;
        }

        if (ncopy != 0 && copy_to_user(user_buffer, chunk, ncopy))
                return -EFAULT;

        return ncopy;
}

/*
 * ->write for /dev/userio: execute one struct userio_cmd from userspace.
 *
 * The write must be exactly sizeof(struct userio_cmd) bytes.  Commands are
 * processed under userio->mutex:
 *   USERIO_CMD_REGISTER        register the serio port (a port type must
 *                              have been set and the port must not already
 *                              be running)
 *   USERIO_CMD_SET_PORT_TYPE   set the port type (only before registering)
 *   USERIO_CMD_SEND_INTERRUPT  pass cmd.data to serio_interrupt() (only
 *                              after registering)
 *
 * Returns @count on success or a negative errno: -EINVAL, -EFAULT, -EBUSY,
 * -ENODEV, -EOPNOTSUPP for an unknown command, or the error from the
 * interrupted mutex acquisition.
 */
static ssize_t userio_char_write(struct file *file, const char __user *buffer,
                                 size_t count, loff_t *ppos)
{
        struct userio_device *userio = file->private_data;
        struct userio_cmd cmd;
        int error;

        if (count != sizeof(cmd)) {
                dev_warn(userio_misc.this_device, "Invalid payload size\n");
                return -EINVAL;
        }

        if (copy_from_user(&cmd, buffer, sizeof(cmd)))
                return -EFAULT;

        /* On success this leaves error == 0 for the paths below. */
        error = mutex_lock_interruptible(&userio->mutex);
        if (error)
                return error;

        switch (cmd.type) {
        case USERIO_CMD_REGISTER:
                if (!userio->serio->id.type) {
                        dev_warn(userio_misc.this_device,
                                 "No port type given on /dev/userio\n");
                        error = -EINVAL;
                } else if (userio->running) {
                        dev_warn(userio_misc.this_device,
                                 "Begin command sent, but we're already running\n");
                        error = -EBUSY;
                } else {
                        userio->running = true;
                        serio_register_port(userio->serio);
                }
                break;

        case USERIO_CMD_SET_PORT_TYPE:
                if (userio->running) {
                        dev_warn(userio_misc.this_device,
                                 "Can't change port type on an already running userio instance\n");
                        error = -EBUSY;
                } else {
                        userio->serio->id.type = cmd.data;
                }
                break;

        case USERIO_CMD_SEND_INTERRUPT:
                if (!userio->running) {
                        dev_warn(userio_misc.this_device,
                                 "The device must be registered before sending interrupts\n");
                        error = -ENODEV;
                } else {
                        serio_interrupt(userio->serio, cmd.data, 0);
                }
                break;

        default:
                error = -EOPNOTSUPP;
                break;
        }

        mutex_unlock(&userio->mutex);
        return error ?: count;
}

/*
 * ->poll for /dev/userio: register on the device's wait queue and report
 * the file as readable whenever the ring buffer is non-empty.  The indices
 * are compared without taking buf_lock.
 */
static __poll_t userio_char_poll(struct file *file, poll_table *wait)
{
        struct userio_device *userio = file->private_data;
        __poll_t mask = 0;

        poll_wait(file, &userio->waitq, wait);

        if (userio->head != userio->tail)
                mask = EPOLLIN | EPOLLRDNORM;

        return mask;
}

/* File operations for /dev/userio; seeking is not supported (no_llseek). */
static const struct file_operations userio_fops = {
        .owner                = THIS_MODULE,
        .open                = userio_char_open,
        .release        = userio_char_release,
        .read                = userio_char_read,
        .write                = userio_char_write,
        .poll                = userio_char_poll,
        .llseek                = no_llseek,
};

/*
 * The userio misc character device.  module_driver() generates the module
 * init/exit functions, which call misc_register()/misc_deregister() on it.
 */
static struct miscdevice userio_misc = {
        .fops        = &userio_fops,
        .minor        = USERIO_MINOR,
        .name        = USERIO_NAME,
};
module_driver(userio_misc, misc_register, misc_deregister);

MODULE_ALIAS_MISCDEV(USERIO_MINOR);
MODULE_ALIAS("devname:" USERIO_NAME);

MODULE_AUTHOR("Stephen Chandler Paul <thatslyude@gmail.com>");
MODULE_DESCRIPTION("Virtual Serio Device Support");
MODULE_LICENSE("GPL");
























    2 



    2 















































































































































    2 






    1 

    2 










































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
/*
 *  linux/fs/hfs/catalog.c
 *
 * Copyright (C) 1995-1997  Paul H. Hargrove
 * (C) 2003 Ardis Technologies <roman@ardistech.com>
 * This file may be distributed under the terms of the GNU General Public License.
 *
 * This file contains the functions related to the catalog B-tree.
 *
 * Cache code shamelessly stolen from
 *     linux/fs/inode.c Copyright (C) 1991, 1992  Linus Torvalds
 *     re-shamelessly stolen Copyright (C) 1997 Linus Torvalds
 */

#include "hfs_fs.h"
#include "btree.h"

/*
 * hfs_cat_build_key()
 *
 * Given the ID of the parent and the name build a search key.
 *
 * With a NULL @name the name part of the key is zeroed and the key length
 * is the fixed 6 bytes; otherwise the name is converted with hfs_asc2mac()
 * and its length is added on top.  (The 6 presumably covers the fixed key
 * fields preceding the name characters -- confirm against the on-disk
 * layout.)
 */
void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, const struct qstr *name)
{
        key->cat.reserved = 0;
        key->cat.ParID = cpu_to_be32(parent);

        if (!name) {
                memset(&key->cat.CName, 0, sizeof(struct hfs_name));
                key->key_len = 6;
                return;
        }

        hfs_asc2mac(sb, &key->cat.CName, name);
        key->key_len = 6 + key->cat.CName.len;
}

/*
 * Fill @rec with a fresh catalog record for @inode, using @cnid as its ID.
 *
 * The record is zeroed first.  A directory inode yields an HFS_CDR_DIR
 * record, anything else an HFS_CDR_FIL record.  Creation and modification
 * dates are both set to the current hfs_mtime().
 *
 * Returns the size of the record variant that was built.
 */
static int hfs_cat_build_record(hfs_cat_rec *rec, u32 cnid, struct inode *inode)
{
        const __be32 now = hfs_mtime();

        memset(rec, 0, sizeof(*rec));

        if (S_ISDIR(inode->i_mode)) {
                rec->type = HFS_CDR_DIR;
                rec->dir.DirID = cpu_to_be32(cnid);
                rec->dir.CrDat = now;
                rec->dir.MdDat = now;
                rec->dir.BkDat = 0;
                rec->dir.UsrInfo.frView = cpu_to_be16(0xff);
                return sizeof(struct hfs_cat_dir);
        }

        /* init some fields for the file record */
        rec->type = HFS_CDR_FIL;
        rec->file.Flags = HFS_FIL_USED | HFS_FIL_THD;
        /* No owner write permission: mark the file as locked. */
        if (!(inode->i_mode & S_IWUSR))
                rec->file.Flags |= HFS_FIL_LOCK;
        rec->file.FlNum = cpu_to_be32(cnid);
        rec->file.CrDat = now;
        rec->file.MdDat = now;
        rec->file.BkDat = 0;
        rec->file.UsrWds.fdType = HFS_SB(inode->i_sb)->s_type;
        rec->file.UsrWds.fdCreator = HFS_SB(inode->i_sb)->s_creator;
        return sizeof(struct hfs_cat_file);
}

/*
 * Fill @rec with a thread record of the given @type that refers to the
 * parent ID @parentid and the converted form of @name.
 *
 * @name is passed to hfs_asc2mac() unconditionally -- presumably callers
 * never pass NULL here; verify against callers.
 *
 * Returns sizeof(struct hfs_cat_thread).
 */
static int hfs_cat_build_thread(struct super_block *sb,
                                hfs_cat_rec *rec, int type,
                                u32 parentid, const struct qstr *name)
{
        rec->type = type;
        /* Clear the reserved bytes of the thread record */
        memset(rec->thread.reserved, 0, sizeof(rec->thread.reserved));
        rec->thread.ParID = cpu_to_be32(parentid);
        hfs_asc2mac(sb, &rec->thread.CName, name);
        return sizeof(struct hfs_cat_thread);
}

/*
 * hfs_cat_create()
 *
 * Add a new file or directory to the catalog B-tree.
 * Returns 0 on success or a negative error code.
 */
int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct inode *inode)
{
        struct hfs_find_data fd;
        struct super_block *sb;
        union hfs_cat_rec entry;
        int entry_size;
        int err;

        hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n",
                str->name, cnid, inode->i_nlink);
        if (dir->i_size >= HFS_MAX_VALENCE)
                return -ENOSPC;

        sb = dir->i_sb;
        err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
        if (err)
                return err;

        /*
         * Fail early and avoid ENOSPC during the btree operations. We may
         * have to split the root node at most once.
         */
        err = hfs_bmap_reserve(fd.tree, 2 * fd.tree->depth);
        if (err)
                goto err2;

        hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
        entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
                        HFS_CDR_THD : HFS_CDR_FTH,
                        dir->i_ino, str);
        err = hfs_brec_find(&fd);
        if (err != -ENOENT) {
                if (!err)
                        err = -EEXIST;
                goto err2;
        }
        err = hfs_brec_insert(&fd, &entry, entry_size);
        if (err)
                goto err2;

        hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str);
        entry_size = hfs_cat_build_record(&entry, cnid, inode);
        err = hfs_brec_find(&fd);
        if (err != -ENOENT) {
                /* panic? */
                if (!err)
                        err = -EEXIST;
                goto err1;
        }
        err = hfs_brec_insert(&fd, &entry, entry_size);
        if (err)
                goto err1;

        dir->i_size++;
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        mark_inode_dirty(dir);
        hfs_find_exit(&fd);
        return 0;

err1:
        hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
        if (!hfs_brec_find(&fd))
                hfs_brec_remove(&fd);
err2:
        hfs_find_exit(&fd);
        return err;
}

/*
 * hfs_cat_keycmp()
 *
 * Description:
 *   Comparison function used for the catalog B-tree.  The parent id is
 *   the most significant field and is compared as an unsigned integer
 *   after conversion from big-endian.  Only when both parent ids are
 *   equal is the name compared, using hfs_strcmp() ("Macintosh lexical
 *   order", see string.c).
 * Input Variable(s):
 *   const btree_key *key1: pointer to the first key to compare
 *   const btree_key *key2: pointer to the second key to compare
 * Output Variable(s):
 *   NONE
 * Returns:
 *   int: negative if key1<key2, positive if key1>key2, and 0 if key1==key2
 * Preconditions:
 *   key1 and key2 point to valid catalog keys.
 * Postconditions:
 *   The keys are only read; neither is modified.
 */
int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2)
{
        const __be32 lhs_parent = key1->cat.ParID;
        const __be32 rhs_parent = key2->cat.ParID;

        /* same parent directory: the name decides */
        if (lhs_parent == rhs_parent)
                return hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len,
                                  key2->cat.CName.name, key2->cat.CName.len);

        if (be32_to_cpu(lhs_parent) < be32_to_cpu(rhs_parent))
                return -1;
        return 1;
}

/*
 * Try to get the catalog entry for a given catalog id.
 *
 * The thread record for @cnid is read first; its parent id and name are
 * then copied into fd->search_key and the real record is looked up with
 * hfs_brec_find().  Returns the lookup result, the error from reading the
 * thread record, or -EIO when the thread record has an unexpected type or
 * an over-long name.
 */
// move to read_super???
int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
                      struct hfs_find_data *fd)
{
        hfs_cat_rec thread_rec;
        int err, rec_type, namelen;

        hfs_cat_build_key(sb, fd->search_key, cnid, NULL);
        err = hfs_brec_read(fd, &thread_rec, sizeof(thread_rec));
        if (err)
                return err;

        rec_type = thread_rec.type;
        if (!(rec_type == HFS_CDR_THD || rec_type == HFS_CDR_FTH)) {
                pr_err("found bad thread record in catalog\n");
                return -EIO;
        }

        /* the search key is updated before the name length is validated */
        fd->search_key->cat.ParID = thread_rec.thread.ParID;
        fd->search_key->cat.CName.len = thread_rec.thread.CName.len;
        namelen = fd->search_key->cat.CName.len;
        if (namelen > HFS_NAMELEN) {
                pr_err("bad catalog namelength\n");
                return -EIO;
        }

        memcpy(fd->search_key->cat.CName.name, thread_rec.thread.CName.name,
               namelen);

        return hfs_brec_find(fd);
}


/*
 * hfs_cat_delete()
 *
 * Delete the catalog record of the indicated file or directory, looked up
 * by (dir->i_ino, str), together with its thread record when one is found
 * under 'cnid'.
 *
 * For a file record whose FlNum matches 'cnid' the resource fork is freed
 * first.  Open directory readers of 'dir' positioned after the deleted key
 * get their f_pos decremented.  On success the parent's entry count
 * (dir->i_size) is decremented and its mtime/ctime are refreshed.
 *
 * 'str' may be NULL (the debug print below allows for it).
 *
 * Returns 0 on success or the error from the B-tree helpers.  A failed
 * lookup of the thread record is not treated as an error.
 */
int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
{
        struct super_block *sb;
        struct hfs_find_data fd;
        struct hfs_readdir_data *rd;
        int res, type;

        hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
        sb = dir->i_sb;
        res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
        if (res)
                return res;

        hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str);
        res = hfs_brec_find(&fd);
        if (res)
                goto out;

        /* the first byte of the record is its type */
        type = hfs_bnode_read_u8(fd.bnode, fd.entryoffset);
        if (type == HFS_CDR_FIL) {
                struct hfs_cat_file file;
                hfs_bnode_read(fd.bnode, &file, fd.entryoffset, sizeof(file));
                if (be32_to_cpu(file.FlNum) == cnid) {
                        /*
                         * Freeing the data fork is compiled out here.
                         * NOTE(review): presumably released elsewhere - confirm.
                         */
#if 0
                        hfs_free_fork(sb, &file, HFS_FK_DATA);
#endif
                        hfs_free_fork(sb, &file, HFS_FK_RSRC);
                }
        }

        /* we only need to take spinlock for exclusion with ->release() */
        spin_lock(&HFS_I(dir)->open_dir_lock);
        list_for_each_entry(rd, &HFS_I(dir)->open_dir_list, list) {
                /* deleted key sorts before this reader's key: shift the reader back */
                if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
                        rd->file->f_pos--;
        }
        spin_unlock(&HFS_I(dir)->open_dir_lock);

        res = hfs_brec_remove(&fd);
        if (res)
                goto out;

        /* remove the thread record too, if there is one; a miss is ignored */
        hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
        res = hfs_brec_find(&fd);
        if (!res) {
                res = hfs_brec_remove(&fd);
                if (res)
                        goto out;
        }

        dir->i_size--;
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        mark_inode_dirty(dir);
        /* discard a non-zero result left over from the thread lookup */
        res = 0;
out:
        hfs_find_exit(&fd);

        return res;
}

/*
 * hfs_cat_move()
 *
 * Rename a file or directory, possibly to a new directory.
 *
 * The record found under (src_dir->i_ino, src_name) is copied to
 * (dst_dir->i_ino, dst_name), the old record is removed, and - unless the
 * entry is a file record without the HFS_FIL_THD flag - the thread record
 * stored under 'cnid' is replaced by one naming the new parent and name.
 * Entry counts (i_size) and mtime/ctime of both directories are updated as
 * each step succeeds.
 *
 * An existing destination is NOT replaced: the function returns -EEXIST.
 * Other return values are 0 on success, -EIO when the source record has an
 * implausible length, or the error from the B-tree helpers.  The steps are
 * not rolled back when a later one fails.
 */
int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
                 struct inode *dst_dir, const struct qstr *dst_name)
{
        struct super_block *sb;
        struct hfs_find_data src_fd, dst_fd;
        union hfs_cat_rec entry;
        int entry_size, type;
        int err;

        hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n",
                cnid, src_dir->i_ino, src_name->name,
                dst_dir->i_ino, dst_name->name);
        sb = src_dir->i_sb;
        err = hfs_find_init(HFS_SB(sb)->cat_tree, &src_fd);
        if (err)
                return err;
        /*
         * Shallow copy: dst_fd is not initialised separately.
         * NOTE(review): both appear to share the same search-key buffer,
         * which would explain why each key is rebuilt right before its
         * lookup - confirm against hfs_find_init().
         */
        dst_fd = src_fd;

        /*
         * Fail early and avoid ENOSPC during the btree operations. We may
         * have to split the root node at most once.
         */
        err = hfs_bmap_reserve(src_fd.tree, 2 * src_fd.tree->depth);
        if (err)
                goto out;

        /* find the old dir entry and read the data */
        hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
        err = hfs_brec_find(&src_fd);
        if (err)
                goto out;
        /* never read more than 'entry' can hold */
        if (src_fd.entrylength > sizeof(entry) || src_fd.entrylength < 0) {
                err = -EIO;
                goto out;
        }

        hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset,
                            src_fd.entrylength);

        /* create new dir entry with the data from the old entry */
        hfs_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name);
        err = hfs_brec_find(&dst_fd);
        if (err != -ENOENT) {
                if (!err)
                        err = -EEXIST;
                goto out;
        }

        err = hfs_brec_insert(&dst_fd, &entry, src_fd.entrylength);
        if (err)
                goto out;
        dst_dir->i_size++;
        inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir));
        mark_inode_dirty(dst_dir);

        /* finally remove the old entry */
        hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
        err = hfs_brec_find(&src_fd);
        if (err)
                goto out;
        err = hfs_brec_remove(&src_fd);
        if (err)
                goto out;
        src_dir->i_size--;
        inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir));
        mark_inode_dirty(src_dir);

        /* a file without the thread flag has no thread record to update; err is 0 here */
        type = entry.type;
        if (type == HFS_CDR_FIL && !(entry.file.Flags & HFS_FIL_THD))
                goto out;

        /* remove old thread entry */
        hfs_cat_build_key(sb, src_fd.search_key, cnid, NULL);
        err = hfs_brec_find(&src_fd);
        if (err)
                goto out;
        err = hfs_brec_remove(&src_fd);
        if (err)
                goto out;

        /* create new thread entry */
        hfs_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
        entry_size = hfs_cat_build_thread(sb, &entry, type == HFS_CDR_FIL ? HFS_CDR_FTH : HFS_CDR_THD,
                                        dst_dir->i_ino, dst_name);
        err = hfs_brec_find(&dst_fd);
        if (err != -ENOENT) {
                if (!err)
                        err = -EEXIST;
                goto out;
        }
        err = hfs_brec_insert(&dst_fd, &entry, entry_size);
out:
        /* dst_fd only needs its node reference dropped; src_fd gets the full exit */
        hfs_bnode_put(dst_fd.bnode);
        hfs_find_exit(&src_fd);
        return err;
}


















    1 





    1 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_PGTABLE_INVERT_H
#define _ASM_PGTABLE_INVERT_H 1

#ifndef __ASSEMBLY__

/*
 * Tell whether a page table entry value is one that gets inverted:
 * it must be non-zero and must not have _PAGE_PRESENT set.
 *
 * A clear pte value is special, and doesn't get inverted.
 *
 * Note that even users that only pass a pgprot_t (rather
 * than a full pte) won't trigger the special zero case,
 * because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED
 * set. So the all zero case really is limited to just the
 * cleared page table entry case.
 */
static inline bool __pte_needs_invert(u64 val)
{
        const u64 present = val & _PAGE_PRESENT;

        return val != 0 && present == 0;
}

/*
 * Get a mask to xor with the page table entry to get the correct pfn:
 * all ones when the entry is an inverted one, zero otherwise.
 */
static inline u64 protnone_mask(u64 val)
{
        if (__pte_needs_invert(val))
                return ~0ull;

        return 0;
}

/*
 * Return @val with the bits selected by @mask inverted if the "needs
 * invert" state differs between @oldval and @val; otherwise return @val
 * unchanged.
 */
static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
{
        const bool was_inverted = __pte_needs_invert(oldval);
        const bool now_inverted = __pte_needs_invert(val);

        /*
         * When a PTE transitions from NONE to !NONE or vice-versa
         * invert the PFN part to stop speculation.
         * pte_pfn undoes this when needed.
         */
        if (was_inverted == now_inverted)
                return val;

        /* flipping exactly the masked bits is an xor with the mask */
        return val ^ mask;
}

#endif /* __ASSEMBLY__ */

#endif







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   47 








































































































   47 




































   12 














   46 
   42 








   44 
   46 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMZONE_H
#define _LINUX_MMZONE_H

#ifndef __ASSEMBLY__
#ifndef __GENERATING_BOUNDS_H

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/numa.h>
#include <linux/init.h>
#include <linux/seqlock.h>
#include <linux/nodemask.h>
#include <linux/pageblock-flags.h>
#include <linux/page-flags-layout.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
#include <linux/zswap.h>
#include <asm/page.h>

/* Free memory management - zoned buddy allocator.  */

/*
 * MAX_PAGE_ORDER: the largest page order; 10 unless the architecture
 * overrides it through CONFIG_ARCH_FORCE_MAX_ORDER.
 */
#ifndef CONFIG_ARCH_FORCE_MAX_ORDER
#define MAX_PAGE_ORDER 10
#else
#define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
#endif
/* Number of pages in a block of order MAX_PAGE_ORDER (2^MAX_PAGE_ORDER). */
#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER)

/* True when pfn is a multiple of MAX_ORDER_NR_PAGES. */
#define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)

/* Number of distinct orders, 0..MAX_PAGE_ORDER inclusive. */
#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1)

/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service.  That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3

/*
 * Migration types of pageblocks.  MIGRATE_TYPES is the number of types
 * and sizes per-type arrays such as free_area.free_list[] and
 * migratetype_names[]; it must stay last.
 */
enum migratetype {
        MIGRATE_UNMOVABLE,
        MIGRATE_MOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_PCPTYPES,        /* the number of types on the pcp lists */
        /* shares the value of MIGRATE_PCPTYPES: first type past the pcp ones */
        MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
        /*
         * MIGRATE_CMA migration type is designed to mimic the way
         * ZONE_MOVABLE works.  Only movable pages can be allocated
         * from MIGRATE_CMA pageblocks and page allocator never
         * implicitly changes migration type of MIGRATE_CMA pageblock.
         *
         * The way to use it is to change migratetype of a range of
         * pageblocks to MIGRATE_CMA which can be done by
         * __free_pageblock_cma() function.
         */
        MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
        MIGRATE_ISOLATE,        /* can't allocate from here */
#endif
        MIGRATE_TYPES
};

/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern const char * const migratetype_names[MIGRATE_TYPES];

/*
 * CMA migratetype predicates.
 *
 * is_migrate_cma(migratetype):      the given migratetype value is MIGRATE_CMA
 * is_migrate_cma_page(_page):       the pageblock of _page is MIGRATE_CMA
 * is_migrate_cma_folio(folio, pfn): the pageblock at pfn, looked up through
 *                                   the folio's page, is MIGRATE_CMA
 *
 * Without CONFIG_CMA all three are the constant false and their arguments
 * are not evaluated.
 *
 * The folio argument is parenthesized so that an expression argument
 * (not just a plain identifier) expands with the intended precedence.
 */
#ifdef CONFIG_CMA
#  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
#  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
#  define is_migrate_cma_folio(folio, pfn)        (MIGRATE_CMA ==                \
        get_pfnblock_flags_mask(&(folio)->page, pfn, MIGRATETYPE_MASK))
#else
#  define is_migrate_cma(migratetype) false
#  define is_migrate_cma_page(_page) false
#  define is_migrate_cma_folio(folio, pfn) false
#endif

/* True when mt is MIGRATE_MOVABLE or, as decided by is_migrate_cma(), CMA. */
static inline bool is_migrate_movable(int mt)
{
        if (mt == MIGRATE_MOVABLE)
                return true;

        return is_migrate_cma(mt);
}

/*
 * Check whether a migratetype can be merged with another migratetype.
 *
 * It is only mergeable when it can fall back to other migratetypes for
 * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c.
 *
 * In terms of the enum this means every type that sorts before
 * MIGRATE_PCPTYPES.
 */
static inline bool migratetype_is_mergeable(int mt)
{
        const bool beyond_pcp_types = mt >= MIGRATE_PCPTYPES;

        return !beyond_pcp_types;
}

/*
 * Nested loop header: iterate 'order' over 0..NR_PAGE_ORDERS-1 and, for
 * each order, 'type' over 0..MIGRATE_TYPES-1.  Both arguments must be
 * assignable variables declared by the caller; the statement that follows
 * the macro is the body of the inner loop.
 */
#define for_each_migratetype_order(order, type) \
        for (order = 0; order < NR_PAGE_ORDERS; order++) \
                for (type = 0; type < MIGRATE_TYPES; type++)

extern int page_group_by_mobility_disabled;

/* Bit mask covering the PB_migratetype_bits low bits of the pageblock flags. */
#define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1)

/* Migratetype of the pageblock that contains 'page'. */
#define get_pageblock_migratetype(page)                                        \
        get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK)

/*
 * Migratetype of the pageblock at the folio's pfn.  The argument is
 * parenthesized so that an expression argument expands with the intended
 * precedence.
 */
#define folio_migratetype(folio)                                \
        get_pfnblock_flags_mask(&(folio)->page, folio_pfn(folio),                \
                        MIGRATETYPE_MASK)
/* Free lists, one list head per migratetype, plus a free counter. */
struct free_area {
        struct list_head        free_list[MIGRATE_TYPES];
        unsigned long                nr_free;
};

struct pglist_data;

/*
 * NUMA allocation event counters.  NR_VM_NUMA_EVENT_ITEMS is the number
 * of items; without CONFIG_NUMA the enum does not exist and the count is
 * defined as 0.
 */
#ifdef CONFIG_NUMA
enum numa_stat_item {
        NUMA_HIT,                /* allocated in intended node */
        NUMA_MISS,                /* allocated in non intended node */
        NUMA_FOREIGN,                /* was intended here, hit elsewhere */
        NUMA_INTERLEAVE_HIT,        /* interleaver preferred this zone */
        NUMA_LOCAL,                /* allocation from local node */
        NUMA_OTHER,                /* allocation from other node */
        NR_VM_NUMA_EVENT_ITEMS
};
#else
#define NR_VM_NUMA_EVENT_ITEMS 0
#endif

/*
 * Indices of the zone statistics counters.  NR_VM_ZONE_STAT_ITEMS is the
 * number of items and must stay last.  The NR_ZONE_* LRU entries follow
 * the same order as enum lru_list.
 */
enum zone_stat_item {
        /* First 128 byte cacheline (assuming 64 bit words) */
        NR_FREE_PAGES,
        NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
        NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
        NR_ZONE_ACTIVE_ANON,
        NR_ZONE_INACTIVE_FILE,
        NR_ZONE_ACTIVE_FILE,
        NR_ZONE_UNEVICTABLE,
        NR_ZONE_WRITE_PENDING,        /* Count of dirty, writeback and unstable pages */
        NR_MLOCK,                /* mlock()ed pages found and moved off LRU */
        /* Second 128 byte cacheline */
        NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
        NR_ZSPAGES,                /* allocated in zsmalloc */
#endif
        NR_FREE_CMA_PAGES,
#ifdef CONFIG_UNACCEPTED_MEMORY
        NR_UNACCEPTED,
#endif
        NR_VM_ZONE_STAT_ITEMS };

/*
 * Indices of the node statistics counters.  NR_VM_NODE_STAT_ITEMS is the
 * number of items and must stay last.  Each *_BASE entry is an alias for
 * the first member of the group that follows it, so it does not consume
 * an index of its own.
 */
enum node_stat_item {
        NR_LRU_BASE,
        NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
        NR_ACTIVE_ANON,                /*  "     "     "   "       "         */
        NR_INACTIVE_FILE,        /*  "     "     "   "       "         */
        NR_ACTIVE_FILE,                /*  "     "     "   "       "         */
        NR_UNEVICTABLE,                /*  "     "     "   "       "         */
        NR_SLAB_RECLAIMABLE_B,
        NR_SLAB_UNRECLAIMABLE_B,
        NR_ISOLATED_ANON,        /* Temporary isolated pages from anon lru */
        NR_ISOLATED_FILE,        /* Temporary isolated pages from file lru */
        WORKINGSET_NODES,
        WORKINGSET_REFAULT_BASE,
        WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE,
        WORKINGSET_REFAULT_FILE,
        WORKINGSET_ACTIVATE_BASE,
        WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE,
        WORKINGSET_ACTIVATE_FILE,
        WORKINGSET_RESTORE_BASE,
        WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE,
        WORKINGSET_RESTORE_FILE,
        WORKINGSET_NODERECLAIM,
        NR_ANON_MAPPED,        /* Mapped anonymous pages */
        NR_FILE_MAPPED,        /* pagecache pages mapped into pagetables.
                           only modified from process context */
        NR_FILE_PAGES,
        NR_FILE_DIRTY,
        NR_WRITEBACK,
        NR_WRITEBACK_TEMP,        /* Writeback using temporary buffers */
        NR_SHMEM,                /* shmem pages (included tmpfs/GEM pages) */
        NR_SHMEM_THPS,
        NR_SHMEM_PMDMAPPED,
        NR_FILE_THPS,
        NR_FILE_PMDMAPPED,
        NR_ANON_THPS,
        NR_VMSCAN_WRITE,
        NR_VMSCAN_IMMEDIATE,        /* Prioritise for reclaim when writeback ends */
        NR_DIRTIED,                /* page dirtyings since bootup */
        NR_WRITTEN,                /* page writings since bootup */
        NR_THROTTLED_WRITTEN,        /* NR_WRITTEN while reclaim throttled */
        NR_KERNEL_MISC_RECLAIMABLE,        /* reclaimable non-slab kernel pages */
        NR_FOLL_PIN_ACQUIRED,        /* via: pin_user_page(), gup flag: FOLL_PIN */
        NR_FOLL_PIN_RELEASED,        /* pages returned via unpin_user_page() */
        NR_KERNEL_STACK_KB,        /* measured in KiB */
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
        NR_KERNEL_SCS_KB,        /* measured in KiB */
#endif
        NR_PAGETABLE,                /* used for pagetables */
        NR_SECONDARY_PAGETABLE, /* secondary pagetables, KVM & IOMMU */
#ifdef CONFIG_IOMMU_SUPPORT
        NR_IOMMU_PAGES,                /* # of pages allocated by IOMMU */
#endif
#ifdef CONFIG_SWAP
        NR_SWAPCACHE,
#endif
#ifdef CONFIG_NUMA_BALANCING
        PGPROMOTE_SUCCESS,        /* promote successfully */
        PGPROMOTE_CANDIDATE,        /* candidate pages to promote */
#endif
        /* PGDEMOTE_*: pages demoted */
        PGDEMOTE_KSWAPD,
        PGDEMOTE_DIRECT,
        PGDEMOTE_KHUGEPAGED,
        NR_VM_NODE_STAT_ITEMS
};

/*
 * Returns true if the item should be printed in THPs (/proc/vmstat
 * currently prints number of anon, file and shmem THPs. But the item
 * is charged in pages).
 *
 * Always false when CONFIG_TRANSPARENT_HUGEPAGE is not enabled.
 */
static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
{
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return false;

        switch (item) {
        case NR_ANON_THPS:
        case NR_FILE_THPS:
        case NR_SHMEM_THPS:
        case NR_SHMEM_PMDMAPPED:
        case NR_FILE_PMDMAPPED:
                return true;
        default:
                return false;
        }
}

/*
 * Returns true if the value is measured in bytes (most vmstat values are
 * measured in pages). This defines the API part, the internal representation
 * might be different.
 *
 * Only the two slab items, NR_SLAB_RECLAIMABLE_B and
 * NR_SLAB_UNRECLAIMABLE_B, qualify.
 */
static __always_inline bool vmstat_item_in_bytes(int idx)
{
        /*
         * Global and per-node slab counters track slab pages.
         * It's expected that changes are multiples of PAGE_SIZE.
         * Internally values are stored in pages.
         *
         * Per-memcg and per-lruvec counters track memory, consumed
         * by individual slab objects. These counters are actually
         * byte-precise.
         */
        switch (idx) {
        case NR_SLAB_RECLAIMABLE_B:
        case NR_SLAB_UNRECLAIMABLE_B:
                return true;
        default:
                return false;
        }
}

/*
 * We do arithmetic on the LRU lists in various places in the code,
 * so it is important to keep the active lists LRU_ACTIVE higher in
 * the array than the corresponding inactive lists, and to keep
 * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
 *
 * This has to be kept in sync with the statistics in zone_stat_item
 * above and the descriptions in vmstat_text in mm/vmstat.c
 */
/* Offsets that are combined below to build the lru_list indices. */
#define LRU_BASE 0
#define LRU_ACTIVE 1
#define LRU_FILE 2

/*
 * Indices of the LRU lists (see struct lruvec's lists[NR_LRU_LISTS]).
 * The four evictable lists are LRU_BASE plus LRU_ACTIVE and/or LRU_FILE, so
 * "inactive -> active" and "anon -> file" are fixed index offsets.
 * NR_LRU_LISTS is the number of lists.
 */
enum lru_list {
        LRU_INACTIVE_ANON = LRU_BASE,
        LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
        LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
        LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
        LRU_UNEVICTABLE,
        NR_LRU_LISTS
};

/*
 * Throttle states; NR_VMSCAN_THROTTLE is the number of states.
 * NOTE(review): the meaning of each state is only suggested by its name
 * here -- confirm against the users of this enum.
 */
enum vmscan_throttle_state {
        VMSCAN_THROTTLE_WRITEBACK,
        VMSCAN_THROTTLE_ISOLATED,
        VMSCAN_THROTTLE_NOPROGRESS,
        VMSCAN_THROTTLE_CONGESTED,
        NR_VMSCAN_THROTTLE,
};

/* Iterate @lru over every lru_list index, LRU_UNEVICTABLE included. */
#define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++)

/* Iterate @lru over indices 0..LRU_ACTIVE_FILE, i.e. skip LRU_UNEVICTABLE. */
#define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++)

/* True for the two file LRU lists (inactive and active), false otherwise. */
static inline bool is_file_lru(enum lru_list lru)
{
        if (lru == LRU_INACTIVE_FILE)
                return true;

        return lru == LRU_ACTIVE_FILE;
}

/* True for the two active LRU lists (anon and file), false otherwise. */
static inline bool is_active_lru(enum lru_list lru)
{
        switch (lru) {
        case LRU_ACTIVE_ANON:
        case LRU_ACTIVE_FILE:
                return true;
        default:
                return false;
        }
}

/*
 * Indices for the anon and file types. ANON_AND_FILE is the number of types
 * and is used to size per-type arrays (e.g. refaults[] in struct lruvec).
 */
#define WORKINGSET_ANON 0
#define WORKINGSET_FILE 1
#define ANON_AND_FILE 2

enum lruvec_flags {
        /*
         * An lruvec has many dirty pages backed by a congested BDI:
         * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim.
         *    It can be cleared by cgroup reclaim or kswapd.
         * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim.
         *    It can only be cleared by kswapd.
         *
         * Essentially, kswapd can unthrottle an lruvec throttled by cgroup
         * reclaim, but not vice versa. This only applies to the root cgroup.
         * The goal is to prevent cgroup reclaim on the root cgroup (e.g.
         * memory.reclaim) to unthrottle an unbalanced node (that was throttled
         * by kswapd).
         */
        LRUVEC_CGROUP_CONGESTED,
        LRUVEC_NODE_CONGESTED,
};

#endif /* !__GENERATING_BOUNDS_H */

/*
 * Evictable pages are divided into multiple generations. The youngest and the
 * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
 * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
 * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
 * corresponding generation. The gen counter in folio->flags stores gen+1 while
 * a page is on one of lrugen->folios[]. Otherwise it stores 0.
 *
 * A page is added to the youngest generation on faulting. The aging needs to
 * check the accessed bit at least twice before handing this page over to the
 * eviction. The first check takes care of the accessed bit set on the initial
 * fault; the second check makes sure this page hasn't been used since then.
 * This process, AKA second chance, requires a minimum of two generations,
 * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
 * LRU, e.g., /proc/vmstat, these two generations are considered active; the
 * rest of generations, if they exist, are considered inactive. See
 * lru_gen_is_active().
 *
 * PG_active is always cleared while a page is on one of lrugen->folios[] so
 * that the aging needs not to worry about it. And it's set again when a page
 * considered active is isolated for non-reclaiming purposes, e.g., migration.
 * See lru_gen_add_folio() and lru_gen_del_folio().
 *
 * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
 * number of categories of the active/inactive LRU when keeping track of
 * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
 * in folio->flags.
 */
/* Lower and upper bound of the generation sliding window size. */
#define MIN_NR_GENS                2U
#define MAX_NR_GENS                4U

/*
 * Each generation is divided into multiple tiers. A page accessed N times
 * through file descriptors is in tier order_base_2(N). A page in the first tier
 * (N=0,1) is marked by PG_referenced unless it was faulted in through page
 * tables or read ahead. A page in any other tier (N>1) is marked by
 * PG_referenced and PG_workingset. This implies a minimum of two tiers is
 * supported without using additional bits in folio->flags.
 *
 * In contrast to moving across generations which requires the LRU lock, moving
 * across tiers only involves atomic operations on folio->flags and therefore
 * has a negligible cost in the buffered access path. In the eviction path,
 * comparisons of refaulted/(evicted+protected) from the first tier and the
 * rest infer whether pages accessed multiple times through file descriptors
 * are statistically hot and thus worth protecting.
 *
 * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
 * number of categories of the active/inactive LRU when keeping track of
 * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
 * folio->flags.
 */
/* Maximum number of tiers within a generation. */
#define MAX_NR_TIERS                4U

#ifndef __GENERATING_BOUNDS_H

struct lruvec;
struct page_vma_mapped_walk;

/*
 * In-place bit masks: LRU_GEN_WIDTH (resp. LRU_REFS_WIDTH) set bits shifted
 * up to LRU_GEN_PGOFF (resp. LRU_REFS_PGOFF), the offset of the field in the
 * page flags layout.
 */
#define LRU_GEN_MASK                ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK                ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

#ifdef CONFIG_LRU_GEN

/* Type indices for the multi-gen LRU: anon is 0, file is 1. */
enum {
        LRU_GEN_ANON,
        LRU_GEN_FILE,
};

/*
 * Multi-gen LRU capabilities; NR_LRU_GEN_CAPS is the number of them.
 * NOTE(review): presumably bit positions in a capability mask -- confirm
 * against the users of these constants.
 */
enum {
        LRU_GEN_CORE,
        LRU_GEN_MM_WALK,
        LRU_GEN_NONLEAF_YOUNG,
        NR_LRU_GEN_CAPS
};

/* Batch size bounds: one machine word's worth of bits, and 64 times that. */
#define MIN_LRU_BATCH                BITS_PER_LONG
#define MAX_LRU_BATCH                (MIN_LRU_BATCH * 64)

/*
 * Whether to keep historical stats from evicted generations: NR_HIST_GENS
 * sizes the first dimension of the per-generation stats arrays, and is
 * MAX_NR_GENS with CONFIG_LRU_GEN_STATS or a single slot without it.
 */
#ifdef CONFIG_LRU_GEN_STATS
#define NR_HIST_GENS                MAX_NR_GENS
#else
#define NR_HIST_GENS                1U
#endif

/*
 * The youngest generation number is stored in max_seq for both anon and file
 * types as they are aged on an equal footing. The oldest generation numbers are
 * stored in min_seq[] separately for anon and file types as clean file pages
 * can be evicted regardless of swap constraints.
 *
 * Normally anon and file min_seq are in sync. But if swapping is constrained,
 * e.g., out of swap space, file min_seq is allowed to advance and leave anon
 * min_seq behind.
 *
 * The number of pages in each generation is eventually consistent and therefore
 * can be transiently negative when reset_batch_size() is pending.
 */
/* Multi-gen LRU state embedded in struct lruvec as "lrugen". */
struct lru_gen_folio {
        /* the aging increments the youngest generation number */
        unsigned long max_seq;
        /* the eviction increments the oldest generation numbers (per type) */
        unsigned long min_seq[ANON_AND_FILE];
        /* the birth time of each generation in jiffies */
        unsigned long timestamps[MAX_NR_GENS];
        /* the multi-gen LRU lists, lazily sorted on eviction; [gen][type][zone] */
        struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* the multi-gen LRU sizes, eventually consistent; [gen][type][zone] */
        long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* the exponential moving average of refaulted; [type][tier] */
        unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
        /* the exponential moving average of evicted+protected; [type][tier] */
        unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
        /* the first tier doesn't need protection, hence the minus one */
        unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
        /* can be modified without holding the LRU lock */
        atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        /* whether the multi-gen LRU is enabled */
        bool enabled;
        /* the memcg generation this lru_gen_folio belongs to */
        u8 gen;
        /* the list segment this lru_gen_folio belongs to */
        u8 seg;
        /* per-node lru_gen_folio list for global reclaim */
        struct hlist_nulls_node list;
};

/*
 * Indices into the mm stats arrays (stats[][] in struct lru_gen_mm_state and
 * mm_stats[] in struct lru_gen_mm_walk); NR_MM_STATS is the number of stats.
 */
enum {
        MM_LEAF_TOTAL,                /* total leaf entries */
        MM_LEAF_OLD,                /* old leaf entries */
        MM_LEAF_YOUNG,                /* young leaf entries */
        MM_NONLEAF_TOTAL,        /* total non-leaf entries */
        MM_NONLEAF_FOUND,        /* non-leaf entries found in Bloom filters */
        MM_NONLEAF_ADDED,        /* non-leaf entries added to Bloom filters */
        NR_MM_STATS
};

/* double-buffering Bloom filters */
#define NR_BLOOM_FILTERS        2

/*
 * Per-lruvec iteration state, embedded in struct lruvec as "mm_state" when
 * CONFIG_LRU_GEN_WALKS_MMU is set.
 */
struct lru_gen_mm_state {
        /* synced with max_seq after each iteration */
        unsigned long seq;
        /* where the current iteration continues after */
        struct list_head *head;
        /* where the last iteration ended before */
        struct list_head *tail;
        /* Bloom filters flip after each iteration */
        unsigned long *filters[NR_BLOOM_FILTERS];
        /* the mm stats for debugging; [hist][stat], see NR_MM_STATS */
        unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
};

/* State of one mm walk, including counters batched during the walk. */
struct lru_gen_mm_walk {
        /* the lruvec under reclaim */
        struct lruvec *lruvec;
        /* max_seq from lru_gen_folio: can be out of date */
        unsigned long seq;
        /* the next address within an mm to scan */
        unsigned long next_addr;
        /* to batch promoted pages; [gen][type][zone] */
        int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* to batch the mm stats; indexed by the MM_* stat constants */
        int mm_stats[NR_MM_STATS];
        /* total batched items */
        int batched;
        /* NOTE(review): presumably whether anon pages may be swapped -- confirm */
        bool can_swap;
        /* NOTE(review): semantics not visible here -- confirm with the walker */
        bool force_scan;
};

/*
 * For each node, memcgs are divided into two generations: the old and the
 * young. For each generation, memcgs are randomly sharded into multiple bins
 * to improve scalability. For each bin, the hlist_nulls is virtually divided
 * into three segments: the head, the tail and the default.
 *
 * An onlining memcg is added to the tail of a random bin in the old generation.
 * The eviction starts at the head of a random bin in the old generation. The
 * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS) indexes
 * the old generation, is incremented when all its bins become empty.
 *
 * There are four operations:
 * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its
 *    current generation (old or young) and updates its "seg" to "head";
 * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its
 *    current generation (old or young) and updates its "seg" to "tail";
 * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old
 *    generation, updates its "gen" to "old" and resets its "seg" to "default";
 * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the
 *    young generation, updates its "gen" to "young" and resets its "seg" to
 *    "default".
 *
 * The events that trigger the above operations are:
 * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
 * 2. The first attempt to reclaim a memcg below low, which triggers
 *    MEMCG_LRU_TAIL;
 * 3. The first attempt to reclaim a memcg offlined or below reclaimable size
 *    threshold, which triggers MEMCG_LRU_TAIL;
 * 4. The second attempt to reclaim a memcg offlined or below reclaimable size
 *    threshold, which triggers MEMCG_LRU_YOUNG;
 * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG;
 * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
 * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
 *
 * Notes:
 * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing
 *    of their max_seq counters ensures the eventual fairness to all eligible
 *    memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
 * 2. There are only two valid generations: old (seq) and young (seq+1).
 *    MEMCG_NR_GENS is set to three so that when reading the generation counter
 *    locklessly, a stale value (seq-1) does not wraparound to young.
 */
/* Number of memcg generations and of bins per generation (sizes of fifo[][]). */
#define MEMCG_NR_GENS        3
#define MEMCG_NR_BINS        8

/* Per-node memcg LRU: memcgs sharded into bins within each generation. */
struct lru_gen_memcg {
        /* the per-node memcg generation counter */
        unsigned long seq;
        /* each memcg has one lru_gen_folio per node */
        unsigned long nr_memcgs[MEMCG_NR_GENS];
        /* per-node lru_gen_folio list for global reclaim; [gen][bin] */
        struct hlist_nulls_head        fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
        /* protects the above */
        spinlock_t lock;
};

/*
 * Multi-gen LRU entry points, declared here for CONFIG_LRU_GEN builds and
 * defined outside this header. The !CONFIG_LRU_GEN branch provides empty
 * inline stubs with identical signatures.
 */
void lru_gen_init_pgdat(struct pglist_data *pgdat);
void lru_gen_init_lruvec(struct lruvec *lruvec);
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);

/* memcg lifecycle hooks and soft-limit reclaim notification */
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
void lru_gen_online_memcg(struct mem_cgroup *memcg);
void lru_gen_offline_memcg(struct mem_cgroup *memcg);
void lru_gen_release_memcg(struct mem_cgroup *memcg);
void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid);

#else /* !CONFIG_LRU_GEN */

/*
 * !CONFIG_LRU_GEN: empty inline stubs with the same signatures as the
 * CONFIG_LRU_GEN declarations; every one of them is a no-op.
 */
static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
{
}

static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}

static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
}

static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
{
}

static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
{
}

#endif /* CONFIG_LRU_GEN */

/*
 * A set of LRU lists plus the reclaim bookkeeping that goes with them.
 * Which members exist depends on CONFIG_LRU_GEN, CONFIG_LRU_GEN_WALKS_MMU
 * and CONFIG_MEMCG.
 */
struct lruvec {
        /* one list head per enum lru_list index */
        struct list_head                lists[NR_LRU_LISTS];
        /* per lruvec lru_lock for memcg */
        spinlock_t                        lru_lock;
        /*
         * These track the cost of reclaiming one LRU - file or anon -
         * over the other. As the observed cost of reclaiming one LRU
         * increases, the reclaim scan balance tips toward the other.
         */
        unsigned long                        anon_cost;
        unsigned long                        file_cost;
        /* Non-resident age, driven by LRU movement */
        atomic_long_t                        nonresident_age;
        /* Refaults at the time of last reclaim cycle */
        unsigned long                        refaults[ANON_AND_FILE];
        /* Various lruvec state flags (enum lruvec_flags) */
        unsigned long                        flags;
#ifdef CONFIG_LRU_GEN
        /* evictable pages divided into generations */
        struct lru_gen_folio                lrugen;
#ifdef CONFIG_LRU_GEN_WALKS_MMU
        /* to concurrently iterate lru_gen_mm_list */
        struct lru_gen_mm_state                mm_state;
#endif
#endif /* CONFIG_LRU_GEN */
#ifdef CONFIG_MEMCG
        /* NOTE(review): presumably the node this lruvec belongs to -- confirm */
        struct pglist_data *pgdat;
#endif
        /* zswap's per-lruvec state; the type is defined outside this chunk */
        struct zswap_lruvec_state zswap_lruvec_state;
};

/*
 * The ISOLATE_* macros name isolate_mode_t before its typedef below; that is
 * fine because macros are only expanded at their use sites.
 */
/* Isolate for asynchronous migration */
#define ISOLATE_ASYNC_MIGRATE        ((__force isolate_mode_t)0x4)
/* Isolate unevictable pages */
#define ISOLATE_UNEVICTABLE        ((__force isolate_mode_t)0x8)

/* LRU Isolation modes: a __bitwise type, so ISOLATE_* values need __force casts. */
typedef unsigned __bitwise isolate_mode_t;

/*
 * Indices into the _watermark[] array of struct zone; NR_WMARK is the number
 * of watermarks. Read them through the *_wmark_pages() macros.
 */
enum zone_watermarks {
        WMARK_MIN,
        WMARK_LOW,
        WMARK_HIGH,
        WMARK_PROMO,
        NR_WMARK
};

/*
 * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists
 * are added for THP. One PCP list is used by GFP_MOVABLE, and the other PCP list
 * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE.
 */
/* Extra PCP lists for THP: two with CONFIG_TRANSPARENT_HUGEPAGE, else none. */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_PCP_THP 2
#else
#define NR_PCP_THP 0
#endif
/* One list per PCP migratetype for each order 0..PAGE_ALLOC_COSTLY_ORDER. */
#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
/* Total number of lists in struct per_cpu_pages. */
#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)

/*
 * Watermark accessors: each evaluates to the selected entry of
 * z->_watermark[] plus z->watermark_boost.
 *
 * @z is parenthesized so that any pointer-valued expression (for example
 * "zones + i" or "&pgdat->node_zones[i]") expands correctly. It is still
 * evaluated twice, so it must not have side effects.
 */
#define min_wmark_pages(z) ((z)->_watermark[WMARK_MIN] + (z)->watermark_boost)
#define low_wmark_pages(z) ((z)->_watermark[WMARK_LOW] + (z)->watermark_boost)
#define high_wmark_pages(z) ((z)->_watermark[WMARK_HIGH] + (z)->watermark_boost)
#define wmark_pages(z, i) ((z)->_watermark[i] + (z)->watermark_boost)

/*
 * Flags used in pcp->flags field.
 *
 * PCPF_PREV_FREE_HIGH_ORDER: a high-order page was freed by the
 * previous page free.  Used to avoid draining the PCP because of a
 * single, accidental high-order page free.
 *
 * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in the PCP before
 * draining it for consecutive high-order page frees without an
 * intervening allocation, if the CPU's data cache slice is large
 * enough.  This reduces zone lock contention and keeps cache-hot
 * pages in reuse.
 */
#define        PCPF_PREV_FREE_HIGH_ORDER        BIT(0)
#define        PCPF_FREE_HIGH_BATCH                BIT(1)

/*
 * Per-CPU page lists of a zone (struct zone points to them through its
 * __percpu per_cpu_pageset member).
 */
struct per_cpu_pages {
        spinlock_t lock;        /* Protects lists field */
        int count;                /* number of pages in the list */
        int high;                /* high watermark, emptying needed */
        int high_min;                /* min high watermark */
        int high_max;                /* max high watermark */
        int batch;                /* chunk size for buddy add/remove */
        u8 flags;                /* PCPF_* flags; protected by pcp->lock */
        u8 alloc_factor;        /* batch scaling factor during allocate */
#ifdef CONFIG_NUMA
        u8 expire;                /* When 0, remote pagesets are drained */
#endif
        short free_count;        /* consecutive free count */

        /* Lists of pages, one per migrate type stored on the pcp-lists */
        struct list_head lists[NR_PCP_LISTS];
} ____cacheline_aligned_in_smp;

/*
 * Per-CPU zone statistics (struct zone points to them through its __percpu
 * per_cpu_zonestats member). The struct is empty when neither CONFIG_SMP nor
 * CONFIG_NUMA is set.
 */
struct per_cpu_zonestat {
#ifdef CONFIG_SMP
        /* small signed per-CPU deltas, one per zone stat item */
        s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
        /* NOTE(review): presumably the delta size that triggers folding -- confirm */
        s8 stat_threshold;
#endif
#ifdef CONFIG_NUMA
        /*
         * Low priority inaccurate counters that are only folded
         * on demand. Use a large type to avoid the overhead of
         * folding during refresh_cpu_vm_stats.
         */
        unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
#endif
};

/* Per-CPU node statistics: small signed deltas, one per node stat item. */
struct per_cpu_nodestat {
        /* NOTE(review): presumably the delta size that triggers folding -- confirm */
        s8 stat_threshold;
        s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
};

#endif /* !__GENERATING_BOUNDS_H */

/*
 * Zone indices. Which zones exist depends on CONFIG_ZONE_DMA,
 * CONFIG_ZONE_DMA32, CONFIG_HIGHMEM and CONFIG_ZONE_DEVICE; __MAX_NR_ZONES
 * is the number of zones in the resulting configuration.
 */
enum zone_type {
        /*
         * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
         * to DMA to all of the addressable memory (ZONE_NORMAL).
         * On architectures where this area covers the whole 32 bit address
         * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
         * DMA addressing constraints. This distinction is important as a 32bit
         * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
         * platforms may need both zones as they support peripherals with
         * different DMA addressing limitations.
         */
#ifdef CONFIG_ZONE_DMA
        ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
        ZONE_DMA32,
#endif
        /*
         * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
         * performed on pages in ZONE_NORMAL if the DMA devices support
         * transfers to all addressable memory.
         */
        ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
        /*
         * A memory area that is only addressable by the kernel through
         * mapping portions into its own address space. This is for example
         * used by i386 to allow the kernel to address the memory beyond
         * 900MB. The kernel will set up special mappings (page
         * table entries on i386) for each page that the kernel needs to
         * access.
         */
        ZONE_HIGHMEM,
#endif
        /*
         * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
         * movable pages with few exceptional cases described below. Main use
         * cases for ZONE_MOVABLE are to make memory offlining/unplug more
         * likely to succeed, and to locally limit unmovable allocations - e.g.,
         * to increase the number of THP/huge pages. Notable special cases are:
         *
         * 1. Pinned pages: (long-term) pinning of movable pages might
         *    essentially turn such pages unmovable. Therefore, we do not allow
         *    pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
         *    faulted, they come from the right zone right away. However, it is
         *    still possible that address space already has pages in
         *    ZONE_MOVABLE at the time when pages are pinned (i.e. user has
         *    touched that memory before pinning). In such case we migrate them
         *    to a different zone. When migration fails - pinning fails.
         * 2. memblock allocations: kernelcore/movablecore setups might create
         *    situations where ZONE_MOVABLE contains unmovable allocations
         *    after boot. Memory offlining and allocations fail early.
         * 3. Memory holes: kernelcore/movablecore setups might create very rare
         *    situations where ZONE_MOVABLE contains memory holes after boot,
         *    for example, if we have sections that are only partially
         *    populated. Memory offlining and allocations fail early.
         * 4. PG_hwpoison pages: while poisoned pages can be skipped during
         *    memory offlining, such pages cannot be allocated.
         * 5. Unmovable PG_offline pages: in paravirtualized environments,
         *    hotplugged memory blocks might only partially be managed by the
         *    buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
         *    parts not managed by the buddy are unmovable PG_offline pages. In
         *    some cases (virtio-mem), such pages can be skipped during
         *    memory offlining, however, cannot be moved/allocated. These
         *    techniques might use alloc_contig_range() to hide previously
         *    exposed pages from the buddy again (e.g., to implement some sort
         *    of memory unplug in virtio-mem).
         * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create
         *    situations where ZERO_PAGE(0) which is allocated differently
         *    on different platforms may end up in a movable zone. ZERO_PAGE(0)
         *    cannot be migrated.
         * 7. Memory-hotplug: when using memmap_on_memory and onlining the
         *    memory to the MOVABLE zone, the vmemmap pages are also placed in
         *    such zone. Such pages cannot be really moved around as they are
         *    self-stored in the range, but they are treated as movable when
         *    the range they describe is about to be offlined.
         *
         * In general, no unmovable allocations that degrade memory offlining
         * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
         * have to expect that migrating pages in ZONE_MOVABLE can fail (even
         * if has_unmovable_pages() states that there are no unmovable pages,
         * there can be false negatives).
         */
        ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
        ZONE_DEVICE,
#endif
        __MAX_NR_ZONES

};

#ifndef __GENERATING_BOUNDS_H

/* Number of slots in compact_cached_migrate_pfn[] of struct zone. */
#define ASYNC_AND_SYNC 2

/*
 * Per-zone state. Members are grouped into read-mostly fields followed by
 * write-intensive groups (page allocator, compaction/vmstats, statistics),
 * separated by CACHELINE_PADDING().
 */
struct zone {
        /* Read-mostly fields */

        /* zone watermarks, access with *_wmark_pages(zone) macros */
        unsigned long _watermark[NR_WMARK];
        /* added to every watermark by the *_wmark_pages() macros */
        unsigned long watermark_boost;

        unsigned long nr_reserved_highatomic;

        /*
         * We don't know if the memory that we're going to allocate will be
         * freeable or/and it will be released eventually, so to avoid totally
         * wasting several GB of ram we must reserve some of the lower zone
         * memory (otherwise we risk to run OOM on the lower zones despite
         * there being tons of freeable ram on the higher zones).  This array is
         * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
         * changes.
         */
        long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
        int node;
#endif
        struct pglist_data        *zone_pgdat;
        struct per_cpu_pages        __percpu *per_cpu_pageset;
        struct per_cpu_zonestat        __percpu *per_cpu_zonestats;
        /*
         * the high and batch values are copied to individual pagesets for
         * faster access
         */
        int pageset_high_min;
        int pageset_high_max;
        int pageset_batch;

#ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
         * In SPARSEMEM, this map is stored in struct mem_section
         */
        unsigned long                *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long                zone_start_pfn;

        /*
         * spanned_pages is the total pages spanned by the zone, including
         * holes, which is calculated as:
         *         spanned_pages = zone_end_pfn - zone_start_pfn;
         *
         * present_pages is physical pages existing within the zone, which
         * is calculated as:
         *        present_pages = spanned_pages - absent_pages(pages in holes);
         *
         * present_early_pages is present pages existing within the zone
         * located on memory available since early boot, excluding hotplugged
         * memory.
         *
         * managed_pages is present pages managed by the buddy system, which
         * is calculated as (reserved_pages includes pages allocated by the
         * bootmem allocator):
         *        managed_pages = present_pages - reserved_pages;
         *
         * cma pages is present pages that are assigned for CMA use
         * (MIGRATE_CMA).
         *
         * So present_pages may be used by memory hotplug or memory power
         * management logic to figure out unmanaged pages by checking
         * (present_pages - managed_pages). And managed_pages should be used
         * by page allocator and vm scanner to calculate all kinds of watermarks
         * and thresholds.
         *
         * Locking rules:
         *
         * zone_start_pfn and spanned_pages are protected by span_seqlock.
         * It is a seqlock because it has to be read outside of zone->lock,
         * and it is done in the main allocator path.  But, it is written
         * quite infrequently.
         *
         * The span_seq lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock.  It's good to
         * give them a chance of being in the same cacheline.
         *
         * Write access to present_pages at runtime should be protected by
         * mem_hotplug_begin/done(). Any reader who can't tolerate drift of
         * present_pages should use get_online_mems() to get a stable value.
         */
        atomic_long_t                managed_pages;
        unsigned long                spanned_pages;
        unsigned long                present_pages;
#if defined(CONFIG_MEMORY_HOTPLUG)
        unsigned long                present_early_pages;
#endif
#ifdef CONFIG_CMA
        unsigned long                cma_pages;
#endif

        const char                *name;

#ifdef CONFIG_MEMORY_ISOLATION
        /*
         * Number of isolated pageblock. It is used to solve incorrect
         * freepage counting problem due to racy retrieving migratetype
         * of pageblock. Protected by zone->lock.
         */
        unsigned long                nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t                span_seqlock;
#endif

        /* read by zone_is_initialized() */
        int initialized;

        /* Write-intensive fields used from the page allocator */
        CACHELINE_PADDING(_pad1_);

        /* free areas of different sizes */
        struct free_area        free_area[NR_PAGE_ORDERS];

#ifdef CONFIG_UNACCEPTED_MEMORY
        /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */
        struct list_head        unaccepted_pages;
#endif

        /* zone flags, see below */
        unsigned long                flags;

        /* Primarily protects free_area */
        spinlock_t                lock;

        /* Write-intensive fields used by compaction and vmstats. */
        CACHELINE_PADDING(_pad2_);

        /*
         * When free pages are below this point, additional steps are taken
         * when reading the number of free pages to avoid per-cpu counter
         * drift allowing watermarks to be breached
         */
        unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* pfn where compaction free scanner should start */
        unsigned long                compact_cached_free_pfn;
        /* pfn where compaction migration scanner should start */
        unsigned long                compact_cached_migrate_pfn[ASYNC_AND_SYNC];
        unsigned long                compact_init_migrate_pfn;
        unsigned long                compact_init_free_pfn;
#endif

#ifdef CONFIG_COMPACTION
        /*
         * On compaction failure, 1<<compact_defer_shift compactions
         * are skipped before trying again. The number attempted since
         * last failure is tracked with compact_considered.
         * compact_order_failed is the minimum compaction failed order.
         */
        unsigned int                compact_considered;
        unsigned int                compact_defer_shift;
        int                        compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* Set to true when the PG_migrate_skip bits should be cleared */
        bool                        compact_blockskip_flush;
#endif

        /* NOTE(review): presumably "zone has no holes" -- confirm with the setter */
        bool                        contiguous;

        CACHELINE_PADDING(_pad3_);
        /* Zone statistics */
        atomic_long_t                vm_stat[NR_VM_ZONE_STAT_ITEMS];
        atomic_long_t                vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
} ____cacheline_internodealigned_in_smp;

/*
 * Per-node (pgdat) state flags.
 * NOTE(review): presumably bit numbers in a pgdat flags word -- confirm.
 */
enum pgdat_flags {
        PGDAT_DIRTY,                        /* reclaim scanning has recently found
                                         * many dirty file pages at the tail
                                         * of the LRU.
                                         */
        PGDAT_WRITEBACK,                /* reclaim scanning has recently found
                                         * many pages under writeback
                                         */
        PGDAT_RECLAIM_LOCKED,                /* prevents concurrent reclaim */
};

enum zone_flags {
        ZONE_BOOSTED_WATERMARK,                /* zone recently boosted watermarks.
                                         * Cleared when kswapd is woken.
                                         */
        ZONE_RECLAIM_ACTIVE,                /* kswapd may be scanning the zone. */
        ZONE_BELOW_HIGH,                /* zone is below high watermark. */
};

/* Atomically read zone->managed_pages and return it as an unsigned long. */
static inline unsigned long zone_managed_pages(struct zone *zone)
{
        long managed = atomic_long_read(&zone->managed_pages);

        return (unsigned long)managed;
}

/* Return zone->cma_pages, or 0 when CONFIG_CMA is not set. */
static inline unsigned long zone_cma_pages(struct zone *zone)
{
        unsigned long nr_cma = 0;

#ifdef CONFIG_CMA
        nr_cma = zone->cma_pages;
#endif
        return nr_cma;
}

/* Return the first pfn past the zone: zone_start_pfn + spanned_pages. */
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
        const unsigned long first_pfn = zone->zone_start_pfn;

        return first_pfn + zone->spanned_pages;
}

/* True if @pfn lies in [zone_start_pfn, zone_end_pfn(zone)). */
static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
        if (pfn < zone->zone_start_pfn)
                return false;

        return pfn < zone_end_pfn(zone);
}

/* True if zone->initialized is non-zero. */
static inline bool zone_is_initialized(struct zone *zone)
{
        return zone->initialized != 0;
}

/* True if the zone spans no pages at all. */
static inline bool zone_is_empty(struct zone *zone)
{
        return !zone->spanned_pages;
}

#ifndef BUILD_VDSO32_64
/*
 * The zone field is never updated after free_area_init_core()
 * sets it, so none of the operations on it need to be atomic.
 */

/*
 * Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS |
 *
 * Fields are packed downward from the top bit of the flags word: each
 * *_PGOFF is the offset of the previous (higher) field minus this field's
 * own width.
 */
#define SECTIONS_PGOFF                ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF                (SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF                (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF        (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF                (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
#define LRU_GEN_PGOFF                (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
#define LRU_REFS_PGOFF                (LRU_GEN_PGOFF - LRU_REFS_WIDTH)

/*
 * Define the bit shifts to access each section.  For non-existent
 * sections we define the shift as 0; that plus a 0 mask ensures
 * the compiler will optimise away reference to them.
 *
 * Each shift is X_PGOFF * (X_WIDTH != 0), i.e. X_PGOFF when the field
 * exists and 0 when its width is 0.
 */
#define SECTIONS_PGSHIFT        (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT                (NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT                (ZONES_PGOFF * (ZONES_WIDTH != 0))
#define LAST_CPUPID_PGSHIFT        (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
#define KASAN_TAG_PGSHIFT        (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))

/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
/*
 * ZONEID_SHIFT is the combined bit count of the two fields; ZONEID_PGOFF is
 * the lower of the two field offsets.
 */
#ifdef NODE_NOT_IN_PAGE_FLAGS
#define ZONEID_SHIFT                (SECTIONS_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF                ((SECTIONS_PGOFF < ZONES_PGOFF) ? \
                                                SECTIONS_PGOFF : ZONES_PGOFF)
#else
#define ZONEID_SHIFT                (NODES_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF                ((NODES_PGOFF < ZONES_PGOFF) ? \
                                                NODES_PGOFF : ZONES_PGOFF)
#endif

/* As with the other *_PGSHIFT macros, this is 0 when the field has no bits. */
#define ZONEID_PGSHIFT                (ZONEID_PGOFF * (ZONEID_SHIFT != 0))

/*
 * Low-bit masks for each field, applied after shifting the field down.
 * NOTE(review): LAST_CPUPID_MASK and ZONEID_MASK are built from *_SHIFT
 * rather than *_WIDTH, unlike the others - presumably intentional; confirm.
 */
#define ZONES_MASK                ((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK                ((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK                ((1UL << SECTIONS_WIDTH) - 1)
#define LAST_CPUPID_MASK        ((1UL << LAST_CPUPID_SHIFT) - 1)
#define KASAN_TAG_MASK                ((1UL << KASAN_TAG_WIDTH) - 1)
#define ZONEID_MASK                ((1UL << ZONEID_SHIFT) - 1)

/* Extract the zone number stored in the ZONES field of page->flags. */
static inline enum zone_type page_zonenum(const struct page *page)
{
        unsigned long zone_bits;

        ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);
        zone_bits = page->flags >> ZONES_PGSHIFT;

        return zone_bits & ZONES_MASK;
}

/* Zone number of a folio, taken from its embedded struct page. */
static inline enum zone_type folio_zonenum(const struct folio *folio)
{
        const struct page *page = &folio->page;

        return page_zonenum(page);
}

#ifdef CONFIG_ZONE_DEVICE
static inline bool is_zone_device_page(const struct page *page)
{
        return page_zonenum(page) == ZONE_DEVICE;
}

/*
 * Zone device pages must not share an sgl or bvec segment with pages of
 * another kind, nor with zone device pages that belong to a different
 * pgmap; otherwise the pgmap of a segment could only be found by scanning
 * the whole segment.
 *
 * Returns true when neither page is a zone device page, or when both are
 * and they share the same pgmap. Returns false in every other case.
 */
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
                                                     const struct page *b)
{
        const bool a_is_dev = is_zone_device_page(a);

        /* One device page and one ordinary page never match. */
        if (a_is_dev != is_zone_device_page(b))
                return false;

        return !a_is_dev || a->pgmap == b->pgmap;
}

extern void memmap_init_zone_device(struct zone *, unsigned long,
                                    unsigned long, struct dev_pagemap *);
#else
/* !CONFIG_ZONE_DEVICE stub: no page is ever a zone device page. */
static inline bool is_zone_device_page(const struct page *page)
{
        return false;
}
/* !CONFIG_ZONE_DEVICE stub: any two pages are considered compatible. */
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
                                                     const struct page *b)
{
        return true;
}
#endif

/* Folio flavour of is_zone_device_page(), applied to the folio's embedded page. */
static inline bool folio_is_zone_device(const struct folio *folio)
{
        const struct page *page = &folio->page;

        return is_zone_device_page(page);
}

static inline bool is_zone_movable_page(const struct page *page)
{
        return page_zonenum(page) == ZONE_MOVABLE;
}

static inline bool folio_is_zone_movable(const struct folio *folio)
{
        return folio_zonenum(folio) == ZONE_MOVABLE;
}
#endif

/*
 * Return true if the PFN range [start_pfn, start_pfn + nr_pages) overlaps
 * the span of the given zone. An empty zone intersects nothing.
 */
static inline bool zone_intersects(struct zone *zone,
                unsigned long start_pfn, unsigned long nr_pages)
{
        const unsigned long range_end = start_pfn + nr_pages;

        if (zone_is_empty(zone))
                return false;

        /* Overlap: the range starts before the zone ends and ends after it starts. */
        return start_pfn < zone_end_pfn(zone) &&
               range_end > zone->zone_start_pfn;
}

/*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
 * queues ("queue_length >> 12") during an aging round.
 */
#define DEF_PRIORITY 12

/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)

/*
 * Zonelist indices. MAX_ZONELISTS is the number of entries defined above
 * it: 1 without CONFIG_NUMA, 2 with it.
 */
enum {
        ZONELIST_FALLBACK,        /* zonelist with fallback */
#ifdef CONFIG_NUMA
        /*
         * The NUMA zonelists are doubled because we need zonelists that
         * restrict the allocations to a single node for __GFP_THISNODE.
         */
        ZONELIST_NOFALLBACK,        /* zonelist without fallback (__GFP_THISNODE) */
#endif
        MAX_ZONELISTS
};

/*
 * This struct contains information about a zone in a zonelist. It is stored
 * here to avoid dereferences into large structures and lookups of tables
 */
struct zoneref {
        struct zone *zone;        /* Pointer to actual zone */
        int zone_idx;                /* zone_idx(zoneref->zone) */
};

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()        - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()        - Return the index of the zone for an entry
 * zonelist_node_idx()        - Return the index of the node for an entry
 */
struct zonelist {
        /*
         * One slot more than MAX_ZONES_PER_ZONELIST.
         * NOTE(review): presumably the extra slot holds the NULL-zone
         * terminator that the zonelist iterators stop on - confirm.
         */
        struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};

/*
 * The array of struct pages for flatmem.
 * It must be declared for SPARSEMEM as well because there are configurations
 * that rely on that.
 */
extern struct page *mem_map;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Deferred split queue bookkeeping (CONFIG_TRANSPARENT_HUGEPAGE only).
 * NOTE(review): the field roles below are inferred from their names -
 * confirm against the users of this struct.
 */
struct deferred_split {
        spinlock_t split_queue_lock;        /* presumably guards the two fields below */
        struct list_head split_queue;        /* list head of the queue */
        unsigned long split_queue_len;        /* presumably the number of queued entries */
};
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Per NUMA node memory failure handling statistics.
 */
struct memory_failure_stats {
        /*
         * Number of raw pages poisoned.
         * Cases not accounted: memory outside kernel control, offline page,
         * arch-specific memory_failure (SGX), hwpoison_filter() filtered
         * error events, and unpoison actions from hwpoison_unpoison.
         */
        unsigned long total;
        /*
         * Recovery results of poisoned raw pages handled by memory_failure,
         * in sync with mf_result.
         * total = ignored + failed + delayed + recovered.
         * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted.
         */
        unsigned long ignored;
        unsigned long failed;
        unsigned long delayed;
        unsigned long recovered;
};
#endif

/*
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * its memory layout. On UMA machines there is a single pglist_data which
 * describes the whole memory.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
typedef struct pglist_data {
        /*
         * node_zones contains just the zones for THIS node. Not all of the
         * zones may be populated, but it is the full list. It is referenced by
         * this node's node_zonelists as well as other node's node_zonelists.
         */
        struct zone node_zones[MAX_NR_ZONES];

        /*
         * node_zonelists contains references to all zones in all nodes.
         * Generally the first zones will be references to this node's
         * node_zones.
         */
        struct zonelist node_zonelists[MAX_ZONELISTS];

        int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLATMEM        /* means !SPARSEMEM */
        struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
        struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
        /*
         * Must be held any time you expect node_start_pfn,
         * node_present_pages, node_spanned_pages or nr_zones to stay constant.
         * Also synchronizes pgdat->first_deferred_pfn during deferred page
         * init.
         *
         * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
         * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
         * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
         *
         * Nests above zone->lock and zone->span_seqlock
         */
        spinlock_t node_size_lock;
#endif
        /* pgdat_end_pfn() is node_start_pfn + node_spanned_pages */
        unsigned long node_start_pfn;
        unsigned long node_present_pages; /* total number of physical pages */
        unsigned long node_spanned_pages; /* total size of physical page
                                             range, including holes */
        int node_id;
        wait_queue_head_t kswapd_wait;
        wait_queue_head_t pfmemalloc_wait;

        /* workqueues for throttling reclaim for different reasons. */
        wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE];

        atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
        unsigned long nr_reclaim_start;        /* nr pages written while throttled
                                         * when throttling started. */
#ifdef CONFIG_MEMORY_HOTPLUG
        struct mutex kswapd_lock;
#endif
        /* Note: kswapd_lock itself only exists under CONFIG_MEMORY_HOTPLUG. */
        struct task_struct *kswapd;        /* Protected by kswapd_lock */
        int kswapd_order;
        enum zone_type kswapd_highest_zoneidx;

        int kswapd_failures;                /* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
        int kcompactd_max_order;
        enum zone_type kcompactd_highest_zoneidx;
        wait_queue_head_t kcompactd_wait;
        struct task_struct *kcompactd;
        bool proactive_compact_trigger;
#endif
        /*
         * This is a per-node reserve of pages that are not available
         * to userspace allocations.
         */
        unsigned long                totalreserve_pages;

#ifdef CONFIG_NUMA
        /*
         * node reclaim becomes active if more unmapped pages exist.
         */
        unsigned long                min_unmapped_pages;
        unsigned long                min_slab_pages;
#endif /* CONFIG_NUMA */

        /* Write-intensive fields used by page reclaim */
        CACHELINE_PADDING(_pad1_);

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        /*
         * If memory initialisation on large machines is deferred then this
         * is the first PFN that needs to be initialised.
         */
        unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct deferred_split deferred_split_queue;
#endif

#ifdef CONFIG_NUMA_BALANCING
        /* start time in ms of current promote rate limit period */
        unsigned int nbp_rl_start;
        /* number of promote candidate pages at start time of current rate limit period */
        unsigned long nbp_rl_nr_cand;
        /* promote threshold in ms */
        unsigned int nbp_threshold;
        /* start time in ms of current promote threshold adjustment period */
        unsigned int nbp_th_start;
        /*
         * number of promote candidate pages at start time of current promote
         * threshold adjustment period
         */
        unsigned long nbp_th_nr_cand;
#endif
        /* Fields commonly accessed by the page reclaim scanner */

        /*
         * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
         *
         * Use mem_cgroup_lruvec() to look up lruvecs.
         */
        struct lruvec                __lruvec;

        unsigned long                flags;

#ifdef CONFIG_LRU_GEN
        /* kswap mm walk data */
        struct lru_gen_mm_walk mm_walk;
        /* lru_gen_folio list */
        struct lru_gen_memcg memcg_lru;
#endif

        CACHELINE_PADDING(_pad2_);

        /* Per-node vmstats */
        struct per_cpu_nodestat __percpu *per_cpu_nodestats;
        atomic_long_t                vm_stat[NR_VM_NODE_STAT_ITEMS];
#ifdef CONFIG_NUMA
        struct memory_tier __rcu *memtier;
#endif
#ifdef CONFIG_MEMORY_FAILURE
        struct memory_failure_stats mf_stats;
#endif
} pg_data_t;

/*
 * Per-node accessors keyed by node id: each one looks the node up with
 * NODE_DATA(nid) and reads the matching pglist_data field (or, for
 * node_end_pfn(), calls pgdat_end_pfn()).
 */
#define node_present_pages(nid)        (NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid)        (NODE_DATA(nid)->node_spanned_pages)

#define node_start_pfn(nid)        (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))

/* Return the first PFN past the node's span (node_start_pfn + node_spanned_pages). */
static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
        unsigned long first_pfn = pgdat->node_start_pfn;

        return first_pfn + pgdat->node_spanned_pages;
}

#include <linux/memory_hotplug.h>

void build_all_zonelists(pg_data_t *pgdat);
void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
                   enum zone_type highest_zoneidx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                         int highest_zoneidx, unsigned int alloc_flags,
                         long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
                unsigned long mark, int highest_zoneidx,
                unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
                unsigned long mark, int highest_zoneidx);
/*
 * Memory initialization context, use to differentiate memory added by
 * the platform statically or via memory hotplug interface.
 */
enum meminit_context {
        MEMINIT_EARLY,
        MEMINIT_HOTPLUG,
};

extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
                                     unsigned long size);

extern void lruvec_init(struct lruvec *lruvec);

/*
 * Map an lruvec back to its node. With CONFIG_MEMCG the lruvec stores the
 * pointer in ->pgdat; otherwise the lruvec is the __lruvec member embedded
 * in struct pglist_data and the node is recovered with container_of().
 */
static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
{
#ifndef CONFIG_MEMCG
        return container_of(lruvec, struct pglist_data, __lruvec);
#else
        return lruvec->pgdat;
#endif
}

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
int local_memory_node(int node_id);
#else
/* !CONFIG_HAVE_MEMORYLESS_NODES: a node is its own local memory node. */
static inline int local_memory_node(int node_id) { return node_id; }
#endif

/*
 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
 */
#define zone_idx(zone)                ((zone) - (zone)->zone_pgdat->node_zones)

#ifdef CONFIG_ZONE_DEVICE
/* Return true if @zone's index within its node is ZONE_DEVICE. */
static inline bool zone_is_zone_device(struct zone *zone)
{
        if (zone_idx(zone) != ZONE_DEVICE)
                return false;

        return true;
}
#else
/* !CONFIG_ZONE_DEVICE stub: no zone is ever ZONE_DEVICE. */
static inline bool zone_is_zone_device(struct zone *zone)
{
        return false;
}
#endif

/*
 * Returns true if the zone has any pages managed by the buddy allocator.
 * Reclaim decisions must be based on this helper, not populated_zone():
 * when a whole zone is reserved it is easy to end up with
 * populated_zone() && !managed_zone().
 */
static inline bool managed_zone(struct zone *zone)
{
        return zone_managed_pages(zone) != 0;
}

/* Returns true if the zone has memory, i.e. present_pages is non-zero. */
static inline bool populated_zone(struct zone *zone)
{
        return zone->present_pages != 0;
}

#ifdef CONFIG_NUMA
/* NUMA: return the node id stored in the zone. */
static inline int zone_to_nid(struct zone *zone)
{
        int nid = zone->node;

        return nid;
}

/* NUMA: store @nid as the zone's node id. */
static inline void zone_set_nid(struct zone *zone, int nid)
{
        zone->node = nid;
}
#else
/* !CONFIG_NUMA stub: every zone reports node 0. */
static inline int zone_to_nid(struct zone *zone)
{
        return 0;
}

/* !CONFIG_NUMA stub: there is no per-zone node id to store. */
static inline void zone_set_nid(struct zone *zone, int nid)
{
}
#endif

extern int movable_zone;

/*
 * Return non-zero if zone index @idx refers to highmem: either ZONE_HIGHMEM
 * itself, or ZONE_MOVABLE while movable_zone is ZONE_HIGHMEM. Always 0
 * without CONFIG_HIGHMEM.
 */
static inline int is_highmem_idx(enum zone_type idx)
{
#ifdef CONFIG_HIGHMEM
        if (idx == ZONE_HIGHMEM)
                return 1;

        return idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM;
#else
        return 0;
#endif
}

/**
 * is_highmem - helper function to quickly check if a struct zone is a
 *              highmem zone or not.  This is an attempt to keep references
 *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
 * @zone: pointer to struct zone variable
 * Return: 1 for a highmem zone, 0 otherwise
 */
static inline int is_highmem(struct zone *zone)
{
        return is_highmem_idx(zone_idx(zone));
}

#ifdef CONFIG_ZONE_DMA
bool has_managed_dma(void);
#else
/* !CONFIG_ZONE_DMA stub: there is no DMA zone, so nothing is managed in it. */
static inline bool has_managed_dma(void)
{
        return false;
}
#endif


#ifndef CONFIG_NUMA

extern struct pglist_data contig_page_data;
/* !CONFIG_NUMA: every node id maps to contig_page_data; @nid is ignored. */
static inline struct pglist_data *NODE_DATA(int nid)
{
        struct pglist_data *only_node = &contig_page_data;

        return only_node;
}

#else /* CONFIG_NUMA */

#include <asm/mmzone.h>

#endif /* !CONFIG_NUMA */

extern struct pglist_data *first_online_pgdat(void);
extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
extern struct zone *next_zone(struct zone *zone);

/**
 * for_each_online_pgdat - helper macro to iterate over all online nodes
 * @pgdat: pointer to a pg_data_t variable
 */
#define for_each_online_pgdat(pgdat)                        \
        for (pgdat = first_online_pgdat();                \
             pgdat;                                        \
             pgdat = next_online_pgdat(pgdat))
/**
 * for_each_zone - helper macro to iterate over all memory zones
 * @zone: pointer to struct zone variable
 *
 * The user only needs to declare the zone variable, for_each_zone
 * fills it in.
 */
#define for_each_zone(zone)                                \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                        \
             zone = next_zone(zone))

/*
 * for_each_populated_zone - like for_each_zone(), but the loop body only
 * runs for zones where populated_zone() is true.
 *
 * The trailing "if (...) ; else" attaches the caller's statement to the
 * else branch, so unpopulated zones are skipped.
 */
#define for_each_populated_zone(zone)                        \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                        \
             zone = next_zone(zone))                        \
                if (!populated_zone(zone))                \
                        ; /* do nothing */                \
                else

/* Return the struct zone pointer stored in a zonelist entry. */
static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
        struct zone *zone = zoneref->zone;

        return zone;
}

/* Return the zone index cached in a zonelist entry. */
static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
        int idx = zoneref->zone_idx;

        return idx;
}

/* Return the node id of the zone referenced by a zonelist entry. */
static inline int zonelist_node_idx(struct zoneref *zoneref)
{
        struct zone *zone = zoneref->zone;

        return zone_to_nid(zone);
}

struct zoneref *__next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes);

/**
 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
 * @z: The cursor used as a starting point for the search
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * Starting at cursor @z, find the next zonelist entry whose zone index does
 * not exceed @highest_zoneidx and which is allowed by @nodes. When @nodes is
 * NULL and the entry under the cursor already qualifies, it is returned
 * directly; every other case goes through __next_zones_zonelist(). The
 * returned zoneref is itself a cursor for the zone being examined and must
 * be advanced by one before next_zones_zonelist is called again.
 *
 * Return: the next zone at or below highest_zoneidx within the allowed
 * nodemask using a cursor within a zonelist as a starting point
 */
static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        if (unlikely(!(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)))
                return __next_zones_zonelist(z, highest_zoneidx, nodes);

        return z;
}

/**
 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
 * @zonelist: The zonelist to search for a suitable zone
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * Runs next_zones_zonelist() from the start of @zonelist's entries. The
 * zoneref returned is a cursor; to continue iterating, advance it by one
 * and pass it to next_zones_zonelist.
 *
 * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
 * never NULL). This may happen either genuinely, or due to concurrent nodemask
 * update due to cpuset modification.
 *
 * Return: Zoneref pointer for the first suitable zone found
 */
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        struct zoneref *start = zonelist->_zonerefs;

        return next_zones_zonelist(start, highest_zoneidx, nodes);
}

/**
 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 * @nodemask: Nodemask allowed by the allocator
 *
 * This iterator iterates through all zones at or below a given zone index and
 * within a given nodemask. The loop ends when zonelist_zone(z) is NULL.
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
        for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);        \
                zone;                                                        \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))

/*
 * for_next_zone_zonelist_nodemask - continue a zonelist walk from an
 * existing cursor.
 *
 * Same stepping as for_each_zone_zonelist_nodemask(), but it starts from the
 * zone already referenced by @z instead of looking up the first eligible
 * zone, so no zonelist argument is needed.
 */
#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
        for (zone = z->zone;        \
                zone;                                                        \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))


/**
 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 *
 * This iterator iterates through all zones at or below a given zone index.
 * It is for_each_zone_zonelist_nodemask() with a NULL nodemask.
 */
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
        for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)

/*
 * Whether the 'nodes' are all movable nodes: true when no zone at or below
 * ZONE_NORMAL is found within the nodemask. An empty nodemask yields false.
 */
static inline bool movable_only_nodes(nodemask_t *nodes)
{
        struct zonelist *fallback;
        struct zoneref *first;
        int nid;

        if (nodes_empty(*nodes))
                return false;

        /*
         * Any node from the nodemask will do as the source of the zonelist,
         * because the zonelists are interlinked. All that matters is whether
         * at least one zone can satisfy kernel allocations.
         */
        nid = first_node(*nodes);
        fallback = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
        first = first_zones_zonelist(fallback, ZONE_NORMAL, nodes);

        return !first->zone;
}


#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#endif

#ifdef CONFIG_FLATMEM
#define pfn_to_nid(pfn)                (0)
#endif

#ifdef CONFIG_SPARSEMEM

/*
 * PA_SECTION_SHIFT                physical address to/from section number
 * PFN_SECTION_SHIFT                pfn to/from section number
 */
#define PA_SECTION_SHIFT        (SECTION_SIZE_BITS)
#define PFN_SECTION_SHIFT        (SECTION_SIZE_BITS - PAGE_SHIFT)

#define NR_MEM_SECTIONS                (1UL << SECTIONS_SHIFT)

#define PAGES_PER_SECTION       (1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK        (~(PAGES_PER_SECTION-1))

#define SECTION_BLOCKFLAGS_BITS \
        ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)

#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE
#endif

/* Convert a PFN to the number of the section that contains it. */
static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
        unsigned long section_nr = pfn >> PFN_SECTION_SHIFT;

        return section_nr;
}
/* Convert a section number to the first PFN of that section. */
static inline unsigned long section_nr_to_pfn(unsigned long sec)
{
        unsigned long first_pfn = sec << PFN_SECTION_SHIFT;

        return first_pfn;
}

#define SECTION_ALIGN_UP(pfn)        (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
#define SECTION_ALIGN_DOWN(pfn)        ((pfn) & PAGE_SECTION_MASK)

/*
 * Subsection geometry. SUBSECTION_SHIFT of 21 makes SUBSECTION_SIZE
 * 1UL << 21 bytes (2 MiB); the PFN_/PAGES_ variants express the same
 * quantity in pages.
 */
#define SUBSECTION_SHIFT 21
#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT)

#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
#define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))

/* A subsection must fit inside a section; reject the build otherwise. */
#if SUBSECTION_SHIFT > SECTION_SIZE_BITS
#error Subsection size exceeds section size
#else
#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
#endif

/* Round a PFN up or down to a subsection boundary. */
#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)

struct mem_section_usage {
        struct rcu_head rcu;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
        DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
        /* See declaration of similar field in struct zone */
        unsigned long pageblock_flags[0];
};

void subsection_map_init(unsigned long pfn, unsigned long nr_pages);

struct page;
struct page_ext;
/* Per-section descriptor for SPARSEMEM. */
struct mem_section {
        /*
         * This is, logically, a pointer to an array of struct
         * pages.  However, it is stored with some other magic.
         * (see sparse.c::sparse_init_one_section())
         *
         * Additionally during early boot we encode node id of
         * the location of the section here to guide allocation.
         * (see sparse.c::memory_present())
         *
         * Making it a UL at least makes someone do a cast
         * before using it wrong.
         */
        unsigned long section_mem_map;

        /* Usage data for this section: pageblock flags and, with VMEMMAP, the subsection map. */
        struct mem_section_usage *usage;
#ifdef CONFIG_PAGE_EXTENSION
        /*
         * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
         * section. (see page_ext.h about this.)
         */
        struct page_ext *page_ext;
        /* NOTE(review): presumably keeps the size a power of 2 (see WARNING below) - confirm. */
        unsigned long pad;
#endif
        /*
         * WARNING: mem_section must be a power-of-2 in size for the
         * calculation and use of SECTION_ROOT_MASK to make sense.
         */
};

#ifdef CONFIG_SPARSEMEM_EXTREME
#define SECTIONS_PER_ROOT       (PAGE_SIZE / sizeof (struct mem_section))
#else
#define SECTIONS_PER_ROOT        1
#endif

#define SECTION_NR_TO_ROOT(sec)        ((sec) / SECTIONS_PER_ROOT)
#define NR_SECTION_ROOTS        DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
#define SECTION_ROOT_MASK        (SECTIONS_PER_ROOT - 1)

#ifdef CONFIG_SPARSEMEM_EXTREME
extern struct mem_section **mem_section;
#else
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif

static inline unsigned long *section_to_usemap(struct mem_section *ms)
{
        return ms->usage->pageblock_flags;
}

/*
 * Look up the mem_section for section number @nr.
 *
 * Returns NULL when the root index is out of range or, with
 * CONFIG_SPARSEMEM_EXTREME, when the root table or the selected root is
 * NULL.
 */
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
        const unsigned long root = SECTION_NR_TO_ROOT(nr);

        if (unlikely(root >= NR_SECTION_ROOTS))
                return NULL;

#ifdef CONFIG_SPARSEMEM_EXTREME
        if (!mem_section)
                return NULL;
        if (!mem_section[root])
                return NULL;
#endif
        return mem_section[root] + (nr & SECTION_ROOT_MASK);
}
extern size_t mem_section_usage_size(void);

/*
 * We use the lower bits of the mem_map pointer to store
 * a little bit of information.  The pointer is calculated
 * as mem_map - section_nr_to_pfn(pnum).  The result is
 * aligned to the minimum alignment of the two values:
 *   1. All mem_map arrays are page-aligned.
 *   2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
 *      lowest bits.  PFN_SECTION_SHIFT is arch-specific
 *      (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
 *      worst combination is powerpc with 256k pages,
 *      which results in PFN_SECTION_SHIFT equal 6.
 * To sum it up, at least 6 bits are available on all architectures.
 * However, we can exceed 6 bits on some other architectures except
 * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
 * with the worst case of 64K pages on arm64) if we make sure the
 * exceeded bit is not applicable to powerpc.
 */
enum {
        SECTION_MARKED_PRESENT_BIT,
        SECTION_HAS_MEM_MAP_BIT,
        SECTION_IS_ONLINE_BIT,
        SECTION_IS_EARLY_BIT,
#ifdef CONFIG_ZONE_DEVICE
        SECTION_TAINT_ZONE_DEVICE_BIT,
#endif
        SECTION_MAP_LAST_BIT,
};

#define SECTION_MARKED_PRESENT                BIT(SECTION_MARKED_PRESENT_BIT)
#define SECTION_HAS_MEM_MAP                BIT(SECTION_HAS_MEM_MAP_BIT)
#define SECTION_IS_ONLINE                BIT(SECTION_IS_ONLINE_BIT)
#define SECTION_IS_EARLY                BIT(SECTION_IS_EARLY_BIT)
#ifdef CONFIG_ZONE_DEVICE
#define SECTION_TAINT_ZONE_DEVICE        BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
#endif
#define SECTION_MAP_MASK                (~(BIT(SECTION_MAP_LAST_BIT) - 1))
#define SECTION_NID_SHIFT                SECTION_MAP_LAST_BIT

/* Mask the low flag bits off section_mem_map and return the result as a struct page pointer. */
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
        return (struct page *)(section->section_mem_map & SECTION_MAP_MASK);
}

/* Return 1 if @section is non-NULL and has SECTION_MARKED_PRESENT set, else 0. */
static inline int present_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return (section->section_mem_map & SECTION_MARKED_PRESENT) != 0;
}

/* present_section() for the section with number @nr. */
static inline int present_section_nr(unsigned long nr)
{
        struct mem_section *ms = __nr_to_section(nr);

        return present_section(ms);
}

/* Return 1 if @section is non-NULL and has SECTION_HAS_MEM_MAP set, else 0. */
static inline int valid_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return (section->section_mem_map & SECTION_HAS_MEM_MAP) != 0;
}

/* Return 1 if @section is non-NULL and has SECTION_IS_EARLY set, else 0. */
static inline int early_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return (section->section_mem_map & SECTION_IS_EARLY) != 0;
}

/* valid_section() for the section with number @nr. */
static inline int valid_section_nr(unsigned long nr)
{
        struct mem_section *ms = __nr_to_section(nr);

        return valid_section(ms);
}

/* Return 1 if @section is non-NULL and has SECTION_IS_ONLINE set, else 0. */
static inline int online_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return (section->section_mem_map & SECTION_IS_ONLINE) != 0;
}

#ifdef CONFIG_ZONE_DEVICE
/*
 * Return 1 if @section is non-NULL and has both SECTION_IS_ONLINE and
 * SECTION_TAINT_ZONE_DEVICE set, else 0.
 */
static inline int online_device_section(struct mem_section *section)
{
        const unsigned long wanted = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;

        if (!section)
                return 0;

        return (section->section_mem_map & wanted) == wanted;
}
#else
/* !CONFIG_ZONE_DEVICE stub: no section is ever an online device section. */
static inline int online_device_section(struct mem_section *section)
{
        return 0;
}
#endif

/* online_section() for the section with number @nr. */
static inline int online_section_nr(unsigned long nr)
{
        struct mem_section *ms = __nr_to_section(nr);

        return online_section(ms);
}

#ifdef CONFIG_MEMORY_HOTPLUG
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
#endif

/* Look up the mem_section containing @pfn; NULL if __nr_to_section() finds none. */
static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
        unsigned long section_nr = pfn_to_section_nr(pfn);

        return __nr_to_section(section_nr);
}

extern unsigned long __highest_present_section_nr;

/* Index of the subsection that contains @pfn, counted within its section. */
static inline int subsection_map_index(unsigned long pfn)
{
        unsigned long pfn_in_section = pfn & ~(PAGE_SECTION_MASK);

        return pfn_in_section / PAGES_PER_SUBSECTION;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
        int idx = subsection_map_index(pfn);

        return test_bit(idx, READ_ONCE(ms->usage)->subsection_map);
}
#else
/* !CONFIG_SPARSEMEM_VMEMMAP stub: every pfn is reported valid. */
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
        return 1;
}
#endif

#ifndef CONFIG_HAVE_ARCH_PFN_VALID
/**
 * pfn_valid - check if there is a valid memory map entry for a PFN
 * @pfn: the page frame number to check
 *
 * Tells whether a memory map entry (a struct page) exists for @pfn.
 * Having a memory map entry does not mean there is usable memory at that
 * @pfn: the struct page may describe a hole or an unusable page frame.
 *
 * Return: 1 for PFNs that have memory map entries and 0 otherwise
 */
static inline int pfn_valid(unsigned long pfn)
{
        struct mem_section *ms;
        int ret = 0;

        /*
         * Reject a pfn that does not survive the round trip through a
         * physical address: if some of its upper PAGE_SHIFT bits are set,
         * the lower bits could otherwise match a valid pfn and produce a
         * false positive.
         */
        if (PHYS_PFN(PFN_PHYS(pfn)) != pfn)
                return 0;

        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;

        ms = __pfn_to_section(pfn);
        rcu_read_lock_sched();
        /*
         * Traditionally early sections always returned pfn_valid() for
         * the entire section-sized span.
         */
        if (valid_section(ms))
                ret = early_section(ms) || pfn_section_valid(ms, pfn);
        rcu_read_unlock_sched();

        return ret;
}
#endif

/*
 * Return present_section() for the section containing @pfn, or 0 when the
 * pfn's section number is outside the [0, NR_MEM_SECTIONS) range.
 */
static inline int pfn_in_present_section(unsigned long pfn)
{
        if (pfn_to_section_nr(pfn) < NR_MEM_SECTIONS)
                return present_section(__pfn_to_section(pfn));

        return 0;
}

/*
 * Find the first present section with a number strictly greater than
 * @section_nr, searching up to and including __highest_present_section_nr.
 *
 * Passing (unsigned long)-1 starts the search at section 0, because the
 * increment wraps around.
 *
 * Return: the section number found, or -1 (converted to unsigned long) if
 * there is none.
 */
static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
        unsigned long nr;

        for (nr = section_nr + 1; nr <= __highest_present_section_nr; nr++) {
                if (present_section_nr(nr))
                        return nr;
        }

        return -1;
}

/*
 * These are _only_ used during initialisation, therefore they
 * can use __initdata ...  They could have names to indicate
 * this restriction.
 */
#ifdef CONFIG_NUMA
/*
 * pfn_to_nid - node id of the page backing @pfn.
 *
 * Statement-expression macro: @pfn is evaluated exactly once into a local,
 * converted with pfn_to_page(), and the result of page_to_nid() on that page
 * is the value of the macro.
 */
#define pfn_to_nid(pfn)                                                        \
({                                                                        \
        unsigned long __pfn_to_nid_pfn = (pfn);                                \
        page_to_nid(pfn_to_page(__pfn_to_nid_pfn));                        \
})
#else
/* !CONFIG_NUMA: constant 0; the @pfn argument is not evaluated. */
#define pfn_to_nid(pfn)                (0)
#endif

void sparse_init(void);
#else
/* !CONFIG_SPARSEMEM: the sparse setup helpers compile away to empty statements. */
#define sparse_init()        do {} while (0)
#define sparse_index_init(_sec, _nid)  do {} while (0)
/* Without sparse sections, the presence check is simply pfn_valid(). */
#define pfn_in_present_section pfn_valid
#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
#endif /* CONFIG_SPARSEMEM */

#endif /* !__GENERATING_BOUNDS.H */
#endif /* !__ASSEMBLY__ */
#endif /* _LINUX_MMZONE_H */























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_FS_H
#define BTRFS_FS_H

#include <linux/blkdev.h>
#include <linux/sizes.h>
#include <linux/time64.h>
#include <linux/compiler.h>
#include <linux/math.h>
#include <linux/atomic.h>
#include <linux/percpu_counter.h>
#include <linux/completion.h>
#include <linux/lockdep.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/rwlock_types.h>
#include <linux/rwsem.h>
#include <linux/semaphore.h>
#include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
#include <linux/wait_bit.h>
#include <linux/sched.h>
#include <linux/rbtree.h>
#include <uapi/linux/btrfs.h>
#include <uapi/linux/btrfs_tree.h>
#include "extent-io-tree.h"
#include "async-thread.h"
#include "block-rsv.h"
#include "fs.h"

struct inode;
struct super_block;
struct kobject;
struct reloc_control;
struct crypto_shash;
struct ulist;
struct btrfs_device;
struct btrfs_block_group;
struct btrfs_root;
struct btrfs_fs_devices;
struct btrfs_transaction;
struct btrfs_delayed_root;
struct btrfs_balance_control;
struct btrfs_subpage_info;
struct btrfs_stripe_hash_table;
struct btrfs_space_info;

#define BTRFS_MAX_EXTENT_SIZE SZ_128M

#define BTRFS_OLDEST_GENERATION        0ULL

#define BTRFS_EMPTY_DIR_SIZE 0

#define BTRFS_DIRTY_METADATA_THRESH                SZ_32M

#define BTRFS_SUPER_INFO_OFFSET                        SZ_64K
#define BTRFS_SUPER_INFO_SIZE                        4096
static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);

/*
 * Number of metadata items necessary for an unlink operation:
 *
 * 1 for the possible orphan item
 * 1 for the dir item
 * 1 for the dir index
 * 1 for the inode ref
 * 1 for the inode
 * 1 for the parent inode
 */
#define BTRFS_UNLINK_METADATA_UNITS                6

/*
 * The reserved space at the beginning of each device.  It covers the primary
 * super block and leaves space for potential use by other tools like
 * bootloaders or to lower potential damage of accidental overwrite.
 */
#define BTRFS_DEVICE_RANGE_RESERVED                        (SZ_1M)
/*
 * Runtime (in-memory) states of filesystem
 *
 * The enumerators carry no explicit values, so they are numbered
 * sequentially from 0.  BTRFS_FS_STATE_COUNT has to stay last: its value is
 * the number of states defined before it.
 */
enum {
        /*
         * Filesystem is being remounted, allow to skip some operations, like
         * defrag
         */
        BTRFS_FS_STATE_REMOUNTING,
        /* Filesystem in RO mode */
        BTRFS_FS_STATE_RO,
        /* Track if a transaction abort has been reported on this filesystem */
        BTRFS_FS_STATE_TRANS_ABORTED,
        /*
         * Bio operations should be blocked on this filesystem because a source
         * or target device is being destroyed as part of a device replace
         */
        BTRFS_FS_STATE_DEV_REPLACING,
        /* The btrfs_fs_info created for self-tests */
        BTRFS_FS_STATE_DUMMY_FS_INFO,

        /*
         * NOTE(review): undocumented in this header; presumably set when
         * checksums are unavailable or deliberately skipped -- confirm
         * against the code that sets it.
         */
        BTRFS_FS_STATE_NO_CSUMS,

        /* Indicates there was an error cleaning up a log tree. */
        BTRFS_FS_STATE_LOG_CLEANUP_ERROR,

        /* Number of states above; not a state itself.  Keep last. */
        BTRFS_FS_STATE_COUNT
};

/*
 * Filesystem-wide runtime flags.
 *
 * The enumerators carry no explicit values, so they are sequential indices
 * starting at 0 -- they are flag numbers, not bit masks.
 * NOTE(review): presumably used as bit numbers in btrfs_fs_info::flags --
 * confirm against the users.
 *
 * The final two entries only exist when BITS_PER_LONG == 32.
 */
enum {
        BTRFS_FS_CLOSING_START,
        BTRFS_FS_CLOSING_DONE,
        BTRFS_FS_LOG_RECOVERING,
        BTRFS_FS_OPEN,
        BTRFS_FS_QUOTA_ENABLED,
        BTRFS_FS_UPDATE_UUID_TREE_GEN,
        BTRFS_FS_CREATING_FREE_SPACE_TREE,
        BTRFS_FS_BTREE_ERR,
        BTRFS_FS_LOG1_ERR,
        BTRFS_FS_LOG2_ERR,
        BTRFS_FS_QUOTA_OVERRIDE,
        /* Used to record internally whether fs has been frozen */
        BTRFS_FS_FROZEN,
        /*
         * Indicate that balance has been set up from the ioctl and is in the
         * main phase. The fs_info::balance_ctl is initialized.
         */
        BTRFS_FS_BALANCE_RUNNING,

        /*
         * Indicate that relocation of a chunk has started, it's set per chunk
         * and is toggled between chunks.
         */
        BTRFS_FS_RELOC_RUNNING,

        /* Indicate that the cleaner thread is awake and doing something. */
        BTRFS_FS_CLEANER_RUNNING,

        /*
         * The checksumming has an optimized version and is considered fast,
         * so we don't need to offload checksums to workqueues.
         */
        BTRFS_FS_CSUM_IMPL_FAST,

        /* Indicate that the discard workqueue can service discards. */
        BTRFS_FS_DISCARD_RUNNING,

        /* Indicate that we need to cleanup space cache v1 */
        BTRFS_FS_CLEANUP_SPACE_CACHE_V1,

        /* Indicate that we can't trust the free space tree for caching yet */
        BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,

        /* Indicate whether there are any tree modification log users */
        BTRFS_FS_TREE_MOD_LOG_USERS,

        /* Indicate that we want the transaction kthread to commit right now. */
        BTRFS_FS_COMMIT_TRANS,

        /* Indicate we have half completed snapshot deletions pending. */
        BTRFS_FS_UNFINISHED_DROPS,

        /* Indicate we have to finish a zone to do next allocation. */
        BTRFS_FS_NEED_ZONE_FINISH,

        /* Indicate that we want to commit the transaction. */
        BTRFS_FS_NEED_TRANS_COMMIT,

        /* This is set when active zone tracking is needed. */
        BTRFS_FS_ACTIVE_ZONE_TRACKING,

        /*
         * Indicate if we have some features changed, this is mostly for
         * cleaner thread to update the sysfs interface.
         */
        BTRFS_FS_FEATURE_CHANGED,

        /*
         * Indicate that we have found a tree block which is only aligned to
         * sectorsize, but not to nodesize.  This should be rare nowadays.
         */
        BTRFS_FS_UNALIGNED_TREE_BLOCK,

#if BITS_PER_LONG == 32
        /* Indicate if we have error/warn message printed on 32bit systems */
        BTRFS_FS_32BIT_ERROR,
        BTRFS_FS_32BIT_WARN,
#endif
};

/*
 * Flags for mount options.
 *
 * Each option is a distinct single-bit mask built as (1UL << n); bits 0
 * through 30 are currently in use.
 * NOTE(review): presumably stored in btrfs_fs_info::mount_opt -- confirm
 * against the users.
 *
 * Note: don't forget to add new options to btrfs_show_options()
 */
enum {
        BTRFS_MOUNT_NODATASUM                        = (1UL << 0),
        BTRFS_MOUNT_NODATACOW                        = (1UL << 1),
        BTRFS_MOUNT_NOBARRIER                        = (1UL << 2),
        BTRFS_MOUNT_SSD                                = (1UL << 3),
        BTRFS_MOUNT_DEGRADED                        = (1UL << 4),
        BTRFS_MOUNT_COMPRESS                        = (1UL << 5),
        BTRFS_MOUNT_NOTREELOG                   = (1UL << 6),
        BTRFS_MOUNT_FLUSHONCOMMIT                = (1UL << 7),
        BTRFS_MOUNT_SSD_SPREAD                        = (1UL << 8),
        BTRFS_MOUNT_NOSSD                        = (1UL << 9),
        BTRFS_MOUNT_DISCARD_SYNC                = (1UL << 10),
        BTRFS_MOUNT_FORCE_COMPRESS              = (1UL << 11),
        BTRFS_MOUNT_SPACE_CACHE                        = (1UL << 12),
        BTRFS_MOUNT_CLEAR_CACHE                        = (1UL << 13),
        BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED        = (1UL << 14),
        BTRFS_MOUNT_ENOSPC_DEBUG                = (1UL << 15),
        BTRFS_MOUNT_AUTO_DEFRAG                        = (1UL << 16),
        BTRFS_MOUNT_USEBACKUPROOT                = (1UL << 17),
        BTRFS_MOUNT_SKIP_BALANCE                = (1UL << 18),
        BTRFS_MOUNT_PANIC_ON_FATAL_ERROR        = (1UL << 19),
        BTRFS_MOUNT_RESCAN_UUID_TREE                = (1UL << 20),
        BTRFS_MOUNT_FRAGMENT_DATA                = (1UL << 21),
        BTRFS_MOUNT_FRAGMENT_METADATA                = (1UL << 22),
        BTRFS_MOUNT_FREE_SPACE_TREE                = (1UL << 23),
        BTRFS_MOUNT_NOLOGREPLAY                        = (1UL << 24),
        BTRFS_MOUNT_REF_VERIFY                        = (1UL << 25),
        BTRFS_MOUNT_DISCARD_ASYNC                = (1UL << 26),
        BTRFS_MOUNT_IGNOREBADROOTS                = (1UL << 27),
        BTRFS_MOUNT_IGNOREDATACSUMS                = (1UL << 28),
        BTRFS_MOUNT_NODISCARD                        = (1UL << 29),
        BTRFS_MOUNT_NOSPACECACHE                = (1UL << 30),
};

/*
 * Compat flags that we support.  If any incompat flags are set other than the
 * ones specified below then we will fail to mount
 */
#define BTRFS_FEATURE_COMPAT_SUPP                0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET                0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR                0ULL

#define BTRFS_FEATURE_COMPAT_RO_SUPP                        \
        (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |        \
         BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
         BTRFS_FEATURE_COMPAT_RO_VERITY |                \
         BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)

#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET        0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR        0ULL

#define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE                \
        (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |                \
         BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |        \
         BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |                \
         BTRFS_FEATURE_INCOMPAT_BIG_METADATA |                \
         BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |                \
         BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD |                \
         BTRFS_FEATURE_INCOMPAT_RAID56 |                \
         BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |                \
         BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |        \
         BTRFS_FEATURE_INCOMPAT_NO_HOLES        |        \
         BTRFS_FEATURE_INCOMPAT_METADATA_UUID        |        \
         BTRFS_FEATURE_INCOMPAT_RAID1C34        |        \
         BTRFS_FEATURE_INCOMPAT_ZONED                |        \
         BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA)

#ifdef CONFIG_BTRFS_DEBUG
        /*
         * Features under development, like raid stripe tree and extent tree
         * v2 support, are enabled only under CONFIG_BTRFS_DEBUG; they are
         * added on top of the stable set.
         */
#define BTRFS_FEATURE_INCOMPAT_SUPP                \
        (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE |        \
         BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \
         BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)

#else

/* Non-debug builds support exactly the stable incompat feature set. */
#define BTRFS_FEATURE_INCOMPAT_SUPP                \
        (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE)

#endif

#define BTRFS_FEATURE_INCOMPAT_SAFE_SET                        \
        (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR                0ULL

#define BTRFS_DEFAULT_COMMIT_INTERVAL        (30)
#define BTRFS_DEFAULT_MAX_INLINE        (2048)

/*
 * State of a device replace: the source and target devices, progress
 * cursors, error counters and the locks/wait queue that go with them.
 *
 * NOTE(review): the "See #define above" remarks below refer to defines that
 * are not in this header; presumably the device-replace state and
 * cont_reading_from_srcdev_mode constants from the included uapi headers --
 * confirm.
 */
struct btrfs_dev_replace {
        /* See #define above */
        u64 replace_state;
        /* Seconds since 1-Jan-1970 */
        time64_t time_started;
        /* Seconds since 1-Jan-1970 */
        time64_t time_stopped;
        atomic64_t num_write_errors;
        atomic64_t num_uncorrectable_read_errors;

        u64 cursor_left;
        u64 committed_cursor_left;
        u64 cursor_left_last_write_of_item;
        u64 cursor_right;

        /* See #define above */
        u64 cont_reading_from_srcdev_mode;

        int is_valid;
        int item_needs_writeback;
        /* Device being replaced and the device replacing it. */
        struct btrfs_device *srcdev;
        struct btrfs_device *tgtdev;

        struct mutex lock_finishing_cancel_unmount;
        struct rw_semaphore rwsem;

        struct btrfs_scrub_progress scrub_progress;

        struct percpu_counter bio_counter;
        wait_queue_head_t replace_wait;
};

/*
 * Free clusters are used to claim free space in relatively large chunks,
 * allowing us to do less seeky writes. They are used for all metadata
 * allocations. In ssd_spread mode they are also used for data allocations.
 */
struct btrfs_free_cluster {
        /*
         * NOTE(review): which fields each of these two locks protects is not
         * stated in this header -- confirm against the users.
         */
        spinlock_t lock;
        spinlock_t refill_lock;
        /*
         * Red-black tree root.
         * NOTE(review): presumably holds the free extents that make up this
         * cluster -- confirm.
         */
        struct rb_root root;

        /* Largest extent in this cluster */
        u64 max_size;

        /* First extent starting offset */
        u64 window_start;

        /* We did a full search and couldn't create a cluster */
        bool fragmented;

        /*
         * NOTE(review): presumably the block group this cluster was
         * allocated from, i.e. the one whose list block_group_list is
         * linked into -- confirm.
         */
        struct btrfs_block_group *block_group;
        /*
         * When a cluster is allocated from a block group, we put the cluster
         * onto a list in the block group so that it can be freed before the
         * block group is freed.
         */
        struct list_head block_group_list;
};

/* Discard control. */
/*
 * Async discard uses multiple lists to differentiate the discard filter
 * parameters.  Index 0 is for completely free block groups where we need to
 * ensure the entire block group is trimmed without being lossy.  Indices
 * afterwards represent monotonically decreasing discard filter sizes to
 * prioritize what should be discarded next.
 */
/* Total number of discard lists, including the index 0 list. */
#define BTRFS_NR_DISCARD_LISTS                3
/* Index 0: the list for completely free block groups described above. */
#define BTRFS_DISCARD_INDEX_UNUSED        0
/* First of the remaining (filtered) list indices. */
#define BTRFS_DISCARD_INDEX_START        1

/*
 * Per-filesystem async discard state: the workqueue and delayed work item,
 * the per-index lists, and limits/statistics.
 * NOTE(review): units of delay_ms, iops_limit and kbps_limit are only implied
 * by the field names, and the scope of 'lock' is not stated here -- confirm
 * against the users.
 */
struct btrfs_discard_ctl {
        struct workqueue_struct *discard_workers;
        struct delayed_work work;
        spinlock_t lock;
        struct btrfs_block_group *block_group;
        /* One list head per discard index (see the defines above). */
        struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
        u64 prev_discard;
        u64 prev_discard_time;
        atomic_t discardable_extents;
        atomic64_t discardable_bytes;
        u64 max_discard_size;
        u64 delay_ms;
        u32 iops_limit;
        u32 kbps_limit;
        u64 discard_extent_bytes;
        u64 discard_bitmap_bytes;
        atomic64_t discard_bytes_saved;
};

/*
 * Exclusive operations (device replace, resize, device add/remove, balance)
 *
 * Values are sequential, starting with BTRFS_EXCLOP_NONE == 0.  Besides the
 * operations named above, the list also covers a paused balance and swap
 * file activation.
 */
enum btrfs_exclusive_operation {
        BTRFS_EXCLOP_NONE,
        BTRFS_EXCLOP_BALANCE_PAUSED,
        BTRFS_EXCLOP_BALANCE,
        BTRFS_EXCLOP_DEV_ADD,
        BTRFS_EXCLOP_DEV_REMOVE,
        BTRFS_EXCLOP_DEV_REPLACE,
        BTRFS_EXCLOP_RESIZE,
        BTRFS_EXCLOP_SWAP_ACTIVATE,
};

/*
 * Store data about transaction commits, exported via sysfs.
 *
 * All durations are u64 values in nanoseconds.
 */
struct btrfs_commit_stats {
        /* Total number of commits */
        u64 commit_count;
        /* The maximum commit duration so far in ns */
        u64 max_commit_dur;
        /* The last commit duration in ns */
        u64 last_commit_dur;
        /* The total commit duration in ns */
        u64 total_commit_dur;
};

struct btrfs_fs_info {
        u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
        unsigned long flags;
        struct btrfs_root *tree_root;
        struct btrfs_root *chunk_root;
        struct btrfs_root *dev_root;
        struct btrfs_root *fs_root;
        struct btrfs_root *quota_root;
        struct btrfs_root *uuid_root;
        struct btrfs_root *data_reloc_root;
        struct btrfs_root *block_group_root;
        struct btrfs_root *stripe_root;

        /* The log root tree is a directory of all the other log roots */
        struct btrfs_root *log_root_tree;

        /* The tree that holds the global roots (csum, extent, etc) */
        rwlock_t global_root_lock;
        struct rb_root global_root_tree;

        spinlock_t fs_roots_radix_lock;
        struct radix_tree_root fs_roots_radix;

        /* Block group cache stuff */
        rwlock_t block_group_cache_lock;
        struct rb_root_cached block_group_cache_tree;

        /* Keep track of unallocated space */
        atomic64_t free_chunk_space;

        /* Track ranges which are used by log trees blocks/logged data extents */
        struct extent_io_tree excluded_extents;

        /* logical->physical extent mapping */
        struct rb_root_cached mapping_tree;
        rwlock_t mapping_tree_lock;

        /*
         * Block reservation for extent, checksum, root tree and delayed dir
         * index item.
         */
        struct btrfs_block_rsv global_block_rsv;
        /* Block reservation for metadata operations */
        struct btrfs_block_rsv trans_block_rsv;
        /* Block reservation for chunk tree */
        struct btrfs_block_rsv chunk_block_rsv;
        /* Block reservation for delayed operations */
        struct btrfs_block_rsv delayed_block_rsv;
        /* Block reservation for delayed refs */
        struct btrfs_block_rsv delayed_refs_rsv;

        struct btrfs_block_rsv empty_block_rsv;

        /*
         * Updated while holding the lock 'trans_lock'. Due to the life cycle of
         * a transaction, it can be directly read while holding a transaction
         * handle, everywhere else must be read with btrfs_get_fs_generation().
         * Should always be updated using btrfs_set_fs_generation().
         */
        u64 generation;
        /*
         * Always use btrfs_get_last_trans_committed() and
         * btrfs_set_last_trans_committed() to read and update this field.
         */
        u64 last_trans_committed;
        /*
         * Generation of the last transaction used for block group relocation
         * since the filesystem was last mounted (or 0 if none happened yet).
         * Must be written and read while holding btrfs_fs_info::commit_root_sem.
         */
        u64 last_reloc_trans;

        /*
         * This is updated to the current trans every time a full commit is
         * required instead of the faster short fsync log commits
         */
        u64 last_trans_log_full_commit;
        unsigned long mount_opt;

        unsigned long compress_type:4;
        unsigned int compress_level;
        u32 commit_interval;
        /*
         * It is a suggestive number, the read side is safe even it gets a
         * wrong number because we will write out the data into a regular
         * extent. The write side(mount/remount) is under ->s_umount lock,
         * so it is also safe.
         */
        u64 max_inline;

        struct btrfs_transaction *running_transaction;
        wait_queue_head_t transaction_throttle;
        wait_queue_head_t transaction_wait;
        wait_queue_head_t transaction_blocked_wait;
        wait_queue_head_t async_submit_wait;

        /*
         * Used to protect the incompat_flags, compat_flags, compat_ro_flags
         * when they are updated.
         *
         * Because we do not clear the flags for ever, so we needn't use
         * the lock on the read side.
         *
         * We also needn't use the lock when we mount the fs, because
         * there is no other task which will update the flag.
         */
        spinlock_t super_lock;
        struct btrfs_super_block *super_copy;
        struct btrfs_super_block *super_for_commit;
        struct super_block *sb;
        struct inode *btree_inode;
        struct mutex tree_log_mutex;
        struct mutex transaction_kthread_mutex;
        struct mutex cleaner_mutex;
        struct mutex chunk_mutex;

        /*
         * This is taken to make sure we don't set block groups ro after the
         * free space cache has been allocated on them.
         */
        struct mutex ro_block_group_mutex;

        /*
         * This is used during read/modify/write to make sure no two ios are
         * trying to mod the same stripe at the same time.
         */
        struct btrfs_stripe_hash_table *stripe_hash_table;

        /*
         * This protects the ordered operations list only while we are
         * processing all of the entries on it.  This way we make sure the
         * commit code doesn't find the list temporarily empty because another
         * function happens to be doing non-waiting preflush before jumping
         * into the main commit.
         */
        struct mutex ordered_operations_mutex;

        struct rw_semaphore commit_root_sem;

        struct rw_semaphore cleanup_work_sem;

        struct rw_semaphore subvol_sem;

        spinlock_t trans_lock;
        /*
         * The reloc mutex goes with the trans lock, it is taken during commit
         * to protect us from the relocation code.
         */
        struct mutex reloc_mutex;

        struct list_head trans_list;
        struct list_head dead_roots;
        struct list_head caching_block_groups;

        spinlock_t delayed_iput_lock;
        struct list_head delayed_iputs;
        atomic_t nr_delayed_iputs;
        wait_queue_head_t delayed_iputs_wait;

        atomic64_t tree_mod_seq;

        /* This protects tree_mod_log and tree_mod_seq_list */
        rwlock_t tree_mod_log_lock;
        struct rb_root tree_mod_log;
        struct list_head tree_mod_seq_list;

        atomic_t async_delalloc_pages;

        /* This is used to protect the following list -- ordered_roots. */
        spinlock_t ordered_root_lock;

        /*
         * All fs/file tree roots in which there are data=ordered extents
         * pending writeback are added into this list.
         *
         * These can span multiple transactions and basically include every
         * dirty data page that isn't from nodatacow.
         */
        struct list_head ordered_roots;

        struct mutex delalloc_root_mutex;
        spinlock_t delalloc_root_lock;
        /* All fs/file tree roots that have delalloc inodes. */
        struct list_head delalloc_roots;

        /*
         * There is a pool of worker threads for checksumming during writes and
         * a pool for checksumming after reads.  This is because readers can
         * run with FS locks held, and the writers may be waiting for those
         * locks.  We don't want ordering in the pending list to cause
         * deadlocks, and so the two are serviced separately.
         *
         * A third pool does submit_bio to avoid deadlocking with the other two.
         */
        struct btrfs_workqueue *workers;
        struct btrfs_workqueue *delalloc_workers;
        struct btrfs_workqueue *flush_workers;
        struct workqueue_struct *endio_workers;
        struct workqueue_struct *endio_meta_workers;
        struct workqueue_struct *rmw_workers;
        struct workqueue_struct *compressed_write_workers;
        struct btrfs_workqueue *endio_write_workers;
        struct btrfs_workqueue *endio_freespace_worker;
        struct btrfs_workqueue *caching_workers;

        /*
         * Fixup workers take dirty pages that didn't properly go through the
         * cow mechanism and make them safe to write.  It happens for the
         * sys_munmap function call path.
         */
        struct btrfs_workqueue *fixup_workers;
        struct btrfs_workqueue *delayed_workers;

        struct task_struct *transaction_kthread;
        struct task_struct *cleaner_kthread;
        u32 thread_pool_size;

        struct kobject *space_info_kobj;
        struct kobject *qgroups_kobj;
        struct kobject *discard_kobj;

        /* Used to keep from writing metadata until there is a nice batch */
        struct percpu_counter dirty_metadata_bytes;
        struct percpu_counter delalloc_bytes;
        struct percpu_counter ordered_bytes;
        s32 dirty_metadata_batch;
        s32 delalloc_batch;

        struct percpu_counter evictable_extent_maps;
        u64 extent_map_shrinker_last_root;
        u64 extent_map_shrinker_last_ino;

        /* Protected by 'trans_lock'. */
        struct list_head dirty_cowonly_roots;

        struct btrfs_fs_devices *fs_devices;

        /*
         * The space_info list is effectively read only after initial setup.
         * It is populated at mount time and cleaned up after all block groups
         * are removed.  RCU is used to protect it.
         */
        struct list_head space_info;

        struct btrfs_space_info *data_sinfo;

        struct reloc_control *reloc_ctl;

        /* data_alloc_cluster is only used in ssd_spread mode */
        struct btrfs_free_cluster data_alloc_cluster;

        /* All metadata allocations go through this cluster. */
        struct btrfs_free_cluster meta_alloc_cluster;

        /* Auto defrag inodes go here. */
        spinlock_t defrag_inodes_lock;
        struct rb_root defrag_inodes;
        atomic_t defrag_running;

        /* Used to protect avail_{data, metadata, system}_alloc_bits */
        seqlock_t profiles_lock;
        /*
         * These three are in extended format (availability of single chunks is
         * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted
         * by corresponding BTRFS_BLOCK_GROUP_* bits)
         */
        u64 avail_data_alloc_bits;
        u64 avail_metadata_alloc_bits;
        u64 avail_system_alloc_bits;

        /* Balance state */
        spinlock_t balance_lock;
        struct mutex balance_mutex;
        atomic_t balance_pause_req;
        atomic_t balance_cancel_req;
        struct btrfs_balance_control *balance_ctl;
        wait_queue_head_t balance_wait_q;

        /* Cancellation requests for chunk relocation */
        atomic_t reloc_cancel_req;

        u32 data_chunk_allocations;
        u32 metadata_ratio;

        void *bdev_holder;

        /* Private scrub information */
        struct mutex scrub_lock;
        atomic_t scrubs_running;
        atomic_t scrub_pause_req;
        atomic_t scrubs_paused;
        atomic_t scrub_cancel_req;
        wait_queue_head_t scrub_pause_wait;
        /*
         * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
         * running.
         */
        refcount_t scrub_workers_refcnt;
        struct workqueue_struct *scrub_workers;
        struct btrfs_subpage_info *subpage_info;

        struct btrfs_discard_ctl discard_ctl;

        /* Is qgroup tracking in a consistent state? */
        u64 qgroup_flags;

        /* Holds configuration and tracking. Protected by qgroup_lock. */
        struct rb_root qgroup_tree;
        spinlock_t qgroup_lock;

        /*
         * Used to avoid frequently calling ulist_alloc()/ulist_free()
         * when doing qgroup accounting, it must be protected by qgroup_lock.
         */
        struct ulist *qgroup_ulist;

        /*
         * Protect user change for quota operations. If a transaction is needed,
         * it must be started before locking this lock.
         */
        struct mutex qgroup_ioctl_lock;

        /* List of dirty qgroups to be written at next commit. */
        struct list_head dirty_qgroups;

        /* Used by qgroup for an efficient tree traversal. */
        u64 qgroup_seq;

        /* Qgroup rescan items. */
        /* Protects the progress item */
        struct mutex qgroup_rescan_lock;
        struct btrfs_key qgroup_rescan_progress;
        struct btrfs_workqueue *qgroup_rescan_workers;
        struct completion qgroup_rescan_completion;
        struct btrfs_work qgroup_rescan_work;
        /* Protected by qgroup_rescan_lock */
        bool qgroup_rescan_running;
        u8 qgroup_drop_subtree_thres;
        u64 qgroup_enable_gen;

        /*
         * If this is not 0, then it indicates a serious filesystem error has
         * happened and it contains that error (negative errno value).
         */
        int fs_error;

        /* Filesystem state */
        unsigned long fs_state;

        struct btrfs_delayed_root *delayed_root;

        /* Extent buffer radix tree */
        spinlock_t buffer_lock;
        /* Entries are eb->start / sectorsize */
        struct radix_tree_root buffer_radix;

        /* Next backup root to be overwritten */
        int backup_root_index;

        /* Device replace state */
        struct btrfs_dev_replace dev_replace;

        struct semaphore uuid_tree_rescan_sem;

        /* Used to reclaim the metadata space in the background. */
        struct work_struct async_reclaim_work;
        struct work_struct async_data_reclaim_work;
        struct work_struct preempt_reclaim_work;

        /* Reclaim partially filled block groups in the background */
        struct work_struct reclaim_bgs_work;
        /* Protected by unused_bgs_lock. */
        struct list_head reclaim_bgs;
        int bg_reclaim_threshold;

        /* Protects the lists unused_bgs and reclaim_bgs. */
        spinlock_t unused_bgs_lock;
        /* Protected by unused_bgs_lock. */
        struct list_head unused_bgs;
        struct mutex unused_bg_unpin_mutex;
        /* Protect block groups that are going to be deleted */
        struct mutex reclaim_bgs_lock;

        /* Cached block sizes */
        u32 nodesize;
        u32 sectorsize;
        /* ilog2 of sectorsize, used to avoid 64bit division */
        u32 sectorsize_bits;
        u32 csum_size;
        u32 csums_per_leaf;
        u32 stripesize;

        /*
         * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
         * filesystem, on zoned it depends on the device constraints.
         */
        u64 max_extent_size;

        /* Block groups and devices containing active swapfiles. */
        spinlock_t swapfile_pins_lock;
        struct rb_root swapfile_pins;

        struct crypto_shash *csum_shash;

        /* Type of exclusive operation running, protected by super_lock */
        enum btrfs_exclusive_operation exclusive_operation;

        /*
         * Zone size > 0 when in ZONED mode, otherwise it's used for a check
         * if the mode is enabled
         */
        u64 zone_size;

        /* Constraints for ZONE_APPEND commands: */
        struct queue_limits limits;
        u64 max_zone_append_size;

        struct mutex zoned_meta_io_lock;
        spinlock_t treelog_bg_lock;
        u64 treelog_bg;

        /*
         * Start of the dedicated data relocation block group, protected by
         * relocation_bg_lock.
         */
        spinlock_t relocation_bg_lock;
        u64 data_reloc_bg;
        struct mutex zoned_data_reloc_io_lock;

        struct btrfs_block_group *active_meta_bg;
        struct btrfs_block_group *active_system_bg;

        u64 nr_global_roots;

        spinlock_t zone_active_bgs_lock;
        struct list_head zone_active_bgs;

        /* Updates are not protected by any lock */
        struct btrfs_commit_stats commit_stats;

        /*
         * Last generation where we dropped a non-relocation root.
         * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
         * to change it and to read it, respectively.
         */
        u64 last_root_drop_gen;

        /*
         * Annotations for transaction events (structures are empty when
         * compiled without lockdep).
         */
        struct lockdep_map btrfs_trans_num_writers_map;
        struct lockdep_map btrfs_trans_num_extwriters_map;
        struct lockdep_map btrfs_state_change_map[4];
        struct lockdep_map btrfs_trans_pending_ordered_map;
        struct lockdep_map btrfs_ordered_extent_map;

#ifdef CONFIG_BTRFS_FS_REF_VERIFY
        spinlock_t ref_verify_lock;
        struct rb_root block_tree;
#endif

#ifdef CONFIG_BTRFS_DEBUG
        struct kobject *debug_kobj;
        struct list_head allocated_roots;

        spinlock_t eb_leak_lock;
        struct list_head allocated_ebs;
#endif
};

/*
 * Helpers that walk from a page, folio or VFS inode to the owning btrfs
 * structures.
 *
 * Every _Generic() selection below lists exactly one type association and no
 * default, so passing any other argument type is rejected at compile time.
 */

/* struct page * -> BTRFS_I(page->mapping->host) */
#define page_to_inode(_page)        (BTRFS_I(_Generic((_page),                        \
                                          struct page *: (_page))->mapping->host))
/* struct folio * -> BTRFS_I(folio->mapping->host) */
#define folio_to_inode(_folio)        (BTRFS_I(_Generic((_folio),                        \
                                          struct folio *: (_folio))->mapping->host))

/* Page / folio -> ->root->fs_info of the inode obtained above. */
#define page_to_fs_info(_page)         (page_to_inode(_page)->root->fs_info)
#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info)

/* struct inode * -> BTRFS_I(inode)->root->fs_info */
#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode),                        \
                                           struct inode *: (_inode)))->root->fs_info)

/*
 * Read fs_info->generation with READ_ONCE().  The matching writer is
 * btrfs_set_fs_generation().
 */
static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
{
        const u64 cur_gen = READ_ONCE(fs_info->generation);

        return cur_gen;
}

/*
 * Store @gen into fs_info->generation with WRITE_ONCE().  The matching reader
 * is btrfs_get_fs_generation().
 */
static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen)
{
        WRITE_ONCE(fs_info->generation, gen);
}

/*
 * Read fs_info->last_trans_committed with READ_ONCE().  The matching writer
 * is btrfs_set_last_trans_committed().
 */
static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info)
{
        const u64 committed = READ_ONCE(fs_info->last_trans_committed);

        return committed;
}

/*
 * Store @gen into fs_info->last_trans_committed with WRITE_ONCE().  The
 * matching reader is btrfs_get_last_trans_committed().
 */
static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen)
{
        WRITE_ONCE(fs_info->last_trans_committed, gen);
}

/*
 * Record @gen as the last generation in which a non-relocation root was
 * dropped.  The store uses WRITE_ONCE(); read the value back with
 * btrfs_get_last_root_drop_gen().
 */
static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
                                                u64 gen)
{
        WRITE_ONCE(fs_info->last_root_drop_gen, gen);
}

/*
 * Return the last generation in which a non-relocation root was dropped, as
 * stored by btrfs_set_last_root_drop_gen().  The load uses READ_ONCE().
 */
static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info)
{
        const u64 drop_gen = READ_ONCE(fs_info->last_root_drop_gen);

        return drop_gen;
}

/*
 * Convert a byte count that needs checksumming into the number of leaves
 * required to hold the checksums for it.
 *
 * @csum_bytes is first turned into a sector count (one checksum per sector,
 * via a shift by sectorsize_bits), which is then divided by csums_per_leaf,
 * rounding up.
 */
static inline u64 btrfs_csum_bytes_to_leaves(const struct btrfs_fs_info *fs_info,
                                             u64 csum_bytes)
{
        const u32 per_leaf = fs_info->csums_per_leaf;
        const u64 nr_sectors = csum_bytes >> fs_info->sectorsize_bits;

        return DIV_ROUND_UP_ULL(nr_sectors, per_leaf);
}

/*
 * Use this if we would be adding new items, as we could split nodes as we cow
 * down the tree.
 */
static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info,
                                                  unsigned num_items)
{
        return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}

/*
 * Doing a truncate or a modification won't result in new nodes or leaves, just
 * what we need for COW.
 */
static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info,
                                                 unsigned num_items)
{
        return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
}

/*
 * Upper bound for the size of an extent item: one sixteenth of the leaf data
 * area (BTRFS_LEAF_DATA_SIZE() >> 4) minus one struct btrfs_item.
 *
 * @r: pointer to an object with an ->fs_info member (presumably a btrfs root;
 *     confirm against callers).  The argument is parenthesized so that any
 *     pointer expression, e.g. "roots + 1", expands correctly.
 */
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE((r)->fs_info) >> 4) - \
                                        sizeof(struct btrfs_item))

static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
{
        return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0;
}

/*
 * Return how many extents of fs_info->max_extent_size are needed to cover
 * @size bytes (a rounding-up division).
 *
 * When built with CONFIG_BTRFS_FS_RUN_SANITY_TESTS, a NULL @fs_info is
 * accepted and BTRFS_MAX_EXTENT_SIZE is used as the extent size instead.
 *
 * NOTE(review): max_extent_size is a u64 but is handed to div_u64() as the
 * divisor; presumably it always fits the divisor type div_u64() expects -
 * confirm against the div_u64() prototype.
 */
static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
{
        u64 extent_size;

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
        if (!fs_info)
                return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
#endif

        extent_size = fs_info->max_extent_size;

        return div_u64(size + extent_size - 1, extent_size);
}

bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
                        enum btrfs_exclusive_operation type);
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
                                 enum btrfs_exclusive_operation type);
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
                          enum btrfs_exclusive_operation op);

int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args);

/* Compatibility and incompatibility defines */
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
                             const char *name);
void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
                               const char *name);
void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
                              const char *name);
void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
                                const char *name);

/*
 * Evaluates to 1 if any bit of @flags is set in the incompat flags of
 * fs_info->super_copy, 0 otherwise.
 */
#define __btrfs_fs_incompat(fs_info, flags)                                \
        (!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags)))

/*
 * Evaluates to 1 if any bit of @flags is set in the compat_ro flags of
 * fs_info->super_copy, 0 otherwise.
 */
#define __btrfs_fs_compat_ro(fs_info, flags)                                \
        (!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags)))

/*
 * Convenience wrappers taking a short feature name @opt.
 *
 * @opt is token-pasted onto BTRFS_FEATURE_INCOMPAT_ or
 * BTRFS_FEATURE_COMPAT_RO_ to form the flag constant.  The set/clear variants
 * additionally pass the stringified @opt as the "name" argument of the
 * underlying __btrfs_* function.
 */
#define btrfs_set_fs_incompat(__fs_info, opt)                                \
        __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt)

#define btrfs_clear_fs_incompat(__fs_info, opt)                                \
        __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt)

/* Test an incompat feature bit; evaluates to 0 or 1. */
#define btrfs_fs_incompat(fs_info, opt)                                        \
        __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)

#define btrfs_set_fs_compat_ro(__fs_info, opt)                                \
        __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt)

#define btrfs_clear_fs_compat_ro(__fs_info, opt)                        \
        __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt)

/* Test a compat_ro feature bit; evaluates to 0 or 1. */
#define btrfs_fs_compat_ro(fs_info, opt)                                \
        __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)

/*
 * Mount option helpers.  @opt is the option name without the BTRFS_MOUNT_
 * prefix.
 *
 * btrfs_clear_opt() and btrfs_set_opt() modify the lvalue @o in place.
 * btrfs_raw_test_opt() tests a raw option word @o, while btrfs_test_opt()
 * tests (fs_info)->mount_opt.  Both test macros yield the masked bits, not a
 * value normalized to 0/1.
 */
#define btrfs_clear_opt(o, opt)                ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt)                ((o) |= BTRFS_MOUNT_##opt)
#define btrfs_raw_test_opt(o, opt)        ((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(fs_info, opt)        ((fs_info)->mount_opt & \
                                         BTRFS_MOUNT_##opt)

/*
 * Report how far closing of the filesystem has progressed.
 *
 * Return: 0 if BTRFS_FS_CLOSING_START is not set, 1 if it is set but
 * BTRFS_FS_CLOSING_DONE is not, 2 if both are set.
 */
static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
{
        /*
         * Check CLOSING_START first so the common (not closing) case costs a
         * single test_bit().
         */
        if (!test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags))
                return 0;

        return test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags) ? 2 : 1;
}

/*
 * Tell the cleaner whether it has nothing to do but sleep, which is the case
 * once the fs has been remounted read-only or is being unmounted.
 *
 * BTRFS_FS_STATE_RO is tested rather than SB_RDONLY in the superblock's flags
 * because setting and checking the latter is not atomic, which would race
 * with a concurrent remount.
 *
 * Return: 1 if the RO state bit is set or btrfs_fs_closing() is non-zero,
 * 0 otherwise.
 */
static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
{
        if (test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state))
                return 1;

        return btrfs_fs_closing(fs_info) != 0;
}

/*
 * Clear BTRFS_FS_UNFINISHED_DROPS in fs_info->flags and wake up waiters on
 * that bit, via clear_and_wake_up_bit().
 */
static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
        clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
}

/*
 * Current value of fs_info->fs_error, read with READ_ONCE().  Non-zero means
 * a serious filesystem error has happened; the value is that error as a
 * negative errno.
 */
#define BTRFS_FS_ERROR(fs_info)        (READ_ONCE((fs_info)->fs_error))

/*
 * True if BTRFS_FS_STATE_LOG_CLEANUP_ERROR is set in fs_info->fs_state.  The
 * test is annotated unlikely().
 */
#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info)                                \
        (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,                \
                           &(fs_info)->fs_state)))

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS

#define EXPORT_FOR_TESTS

static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
        return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
}

void btrfs_test_destroy_inode(struct inode *inode);

#else

#define EXPORT_FOR_TESTS static

static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
        return 0;
}
#endif

#endif






























































































    7 





    7 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_COMPAT_H
#define _ASM_X86_COMPAT_H

/*
 * Architecture specific compatibility types
 */
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <asm/processor.h>
#include <asm/user32.h>
#include <asm/unistd.h>

/*
 * Architecture-specific compat types, all 16 bits wide.
 *
 * Each "#define X X" makes the type name visible to the preprocessor -
 * presumably so that <asm-generic/compat.h>, included further down, can skip
 * its own definition; confirm against that header.
 */
#define compat_mode_t        compat_mode_t
typedef u16                compat_mode_t;

/* NOTE(review): only __compat_uid_t has a marker define; __compat_gid_t has none. */
#define __compat_uid_t        __compat_uid_t
typedef u16                __compat_uid_t;
typedef u16                __compat_gid_t;

#define compat_dev_t        compat_dev_t
typedef u16                compat_dev_t;

#define compat_ipc_pid_t compat_ipc_pid_t
typedef u16                 compat_ipc_pid_t;

#define compat_statfs        compat_statfs

#include <asm-generic/compat.h>

#define COMPAT_UTS_MACHINE        "i686\0\0"

typedef u16                compat_nlink_t;

/*
 * Compat layout of the stat structure.
 *
 * NOTE(review): presumably this mirrors the struct stat seen by 32-bit
 * userspace, in which case field order and widths are ABI and must not be
 * changed - confirm before editing.
 */
struct compat_stat {
        u32                st_dev;
        compat_ino_t        st_ino;
        compat_mode_t        st_mode;
        compat_nlink_t        st_nlink;
        __compat_uid_t        st_uid;
        __compat_gid_t        st_gid;
        u32                st_rdev;
        u32                st_size;
        u32                st_blksize;
        u32                st_blocks;
        /* Each timestamp is a pair of 32-bit values: seconds field, then _nsec. */
        u32                st_atime;
        u32                st_atime_nsec;
        u32                st_mtime;
        u32                st_mtime_nsec;
        u32                st_ctime;
        u32                st_ctime_nsec;
        u32                __unused4;
        u32                __unused5;
};

/*
 * IA32 uses 4 byte alignment for 64 bit quantities, so we need to pack the
 * compat flock64 structure.
 */
#define __ARCH_NEED_COMPAT_FLOCK64_PACKED

/*
 * Compat layout of the statfs structure.  This header announces its own
 * definition via "#define compat_statfs compat_statfs" earlier in the file.
 *
 * NOTE(review): presumably this mirrors the struct statfs seen by 32-bit
 * userspace, making the layout ABI - confirm before editing.
 */
struct compat_statfs {
        int                f_type;
        int                f_bsize;
        int                f_blocks;
        int                f_bfree;
        int                f_bavail;
        int                f_files;
        int                f_ffree;
        compat_fsid_t        f_fsid;
        int                f_namelen;        /* SunOS ignores this field. */
        int                f_frsize;
        int                f_flags;
        int                f_spare[4];
};

#ifdef CONFIG_X86_X32_ABI
/*
 * Evaluates to 1 when the current task's saved orig_ax has __X32_SYSCALL_BIT
 * set, 0 otherwise.  This is the same test in_x32_syscall() performs.
 */
#define COMPAT_USE_64BIT_TIME \
        (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
#endif

/*
 * Report whether the current task is in an x32 system call, i.e. whether
 * __X32_SYSCALL_BIT is set in its saved orig_ax.  Always false when
 * CONFIG_X86_X32_ABI is not enabled.
 */
static inline bool in_x32_syscall(void)
{
#ifdef CONFIG_X86_X32_ABI
        return !!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT);
#else
        return false;
#endif
}

/*
 * Report whether the current system call is either an ia32 one or an x32 one.
 * in_x32_syscall() is only consulted when in_ia32_syscall() is false.
 */
static inline bool in_32bit_syscall(void)
{
        if (in_ia32_syscall())
                return true;

        return in_x32_syscall();
}

#ifdef CONFIG_COMPAT
/*
 * Architecture override of the generic in_compat_syscall(): returns whatever
 * in_32bit_syscall() returns, so it is true for both ia32 and x32 system
 * calls.
 */
static inline bool in_compat_syscall(void)
{
        return in_32bit_syscall();
}
#define in_compat_syscall in_compat_syscall        /* override the generic impl */
/* compat_need_64bit_alignment_fixup is an alias for in_ia32_syscall. */
#define compat_need_64bit_alignment_fixup in_ia32_syscall
#endif

struct compat_siginfo;

#ifdef CONFIG_X86_X32_ABI
int copy_siginfo_to_user32(struct compat_siginfo __user *to,
                const kernel_siginfo_t *from);
#define copy_siginfo_to_user32 copy_siginfo_to_user32
#endif /* CONFIG_X86_X32_ABI */

#endif /* _ASM_X86_COMPAT_H */






























































































































































































































    3 
















    5 































































    8 









































































































































































































































































































































































































































































































































































































































    3 







































    2 
































    3 








































    8 



















    1 

























    4 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * fscrypt.h: declarations for per-file encryption
 *
 * Filesystems that implement per-file encryption must include this header
 * file.
 *
 * Copyright (C) 2015, Google, Inc.
 *
 * Written by Michael Halcrow, 2015.
 * Modified by Jaegeuk Kim, 2015.
 */
#ifndef _LINUX_FSCRYPT_H
#define _LINUX_FSCRYPT_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <uapi/linux/fscrypt.h>

/*
 * The lengths of all file contents blocks must be divisible by this value.
 * This is needed to ensure that all contents encryption modes will work, as
 * some of the supported modes don't support arbitrarily byte-aligned messages.
 *
 * Since the needed alignment is 16 bytes, most filesystems will meet this
 * requirement naturally, as typical block sizes are powers of 2.  However, if a
 * filesystem can generate arbitrarily byte-aligned block lengths (e.g., via
 * compression), then it will need to pad to this alignment before encryption.
 */
#define FSCRYPT_CONTENTS_ALIGNMENT 16

union fscrypt_policy;
struct fscrypt_inode_info;
struct fs_parameter;
struct seq_file;

/*
 * A byte buffer paired with a length.  FSTR_INIT() builds one and
 * FSTR_TO_QSTR() converts one to a struct qstr.
 */
struct fscrypt_str {
        unsigned char *name;        /* start of the buffer */
        u32 len;                /* presumably the length of @name in bytes - confirm */
};

/*
 * A filename as handled by fscrypt.
 *
 * NOTE(review): the per-field notes below are read off the member names and
 * types only; confirm against fs/crypto/ before relying on them.
 */
struct fscrypt_name {
        const struct qstr *usr_fname;        /* presumably the name as supplied by the user */
        struct fscrypt_str disk_name;        /* accessed through fname_name() / fname_len() */
        u32 hash;
        u32 minor_hash;
        struct fscrypt_str crypto_buf;
        bool is_nokey_name;                /* presumably set for no-key names */
};

/* Designated initializer for a struct fscrypt_str with name @n and length @l. */
#define FSTR_INIT(n, l)                { .name = n, .len = l }
/* Build a QSTR_INIT() from the name and length of the fscrypt_str pointed to by @f. */
#define FSTR_TO_QSTR(f)                QSTR_INIT((f)->name, (f)->len)
/* Accessors for the disk_name of the struct fscrypt_name pointed to by @p. */
#define fname_name(p)                ((p)->disk_name.name)
#define fname_len(p)                ((p)->disk_name.len)

/* Maximum value for the third parameter of fscrypt_operations.set_context(). */
#define FSCRYPT_SET_CONTEXT_MAX_SIZE        40

#ifdef CONFIG_FS_ENCRYPTION

/*
 * Crypto operations for filesystems.
 *
 * This structure is how a filesystem describes itself to fs/crypto/.  It
 * consists of three single-bit capability flags, a legacy key-description
 * prefix, and a set of callbacks.  The contract of each member is documented
 * inline below.
 */
struct fscrypt_operations {

        /*
         * If set, then fs/crypto/ will allocate a global bounce page pool the
         * first time an encryption key is set up for a file.  The bounce page
         * pool is required by the following functions:
         *
         * - fscrypt_encrypt_pagecache_blocks()
         * - fscrypt_zeroout_range() for files not using inline crypto
         *
         * If the filesystem doesn't use those, it doesn't need to set this.
         */
        unsigned int needs_bounce_pages : 1;

        /*
         * If set, then fs/crypto/ will allow the use of encryption settings
         * that assume inode numbers fit in 32 bits (i.e.
         * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64}), provided that the other
         * prerequisites for these settings are also met.  This is only useful
         * if the filesystem wants to support inline encryption hardware that is
         * limited to 32-bit or 64-bit data unit numbers and where programming
         * keyslots is very slow.
         */
        unsigned int has_32bit_inodes : 1;

        /*
         * If set, then fs/crypto/ will allow users to select a crypto data unit
         * size that is less than the filesystem block size.  This is done via
         * the log2_data_unit_size field of the fscrypt policy.  This flag is
         * not compatible with filesystems that encrypt variable-length blocks
         * (i.e. blocks that aren't all equal to filesystem's block size), for
         * example as a result of compression.  It's also not compatible with
         * the fscrypt_encrypt_block_inplace() and
         * fscrypt_decrypt_block_inplace() functions.
         */
        unsigned int supports_subblock_data_units : 1;

        /*
         * This field exists only for backwards compatibility reasons and should
         * only be set by the filesystems that are setting it already.  It
         * contains the filesystem-specific key description prefix that is
         * accepted for "logon" keys for v1 fscrypt policies.  This
         * functionality is deprecated in favor of the generic prefix
         * "fscrypt:", which itself is deprecated in favor of the filesystem
         * keyring ioctls such as FS_IOC_ADD_ENCRYPTION_KEY.  Filesystems that
         * are newly adding fscrypt support should not set this field.
         */
        const char *legacy_key_prefix;

        /*
         * Get the fscrypt context of the given inode.
         *
         * @inode: the inode whose context to get
         * @ctx: the buffer into which to get the context
         * @len: length of the @ctx buffer in bytes
         *
         * Return: On success, returns the length of the context in bytes; this
         *           may be less than @len.  On failure, returns -ENODATA if the
         *           inode doesn't have a context, -ERANGE if the context is
         *           longer than @len, or another -errno code.
         */
        int (*get_context)(struct inode *inode, void *ctx, size_t len);

        /*
         * Set an fscrypt context on the given inode.
         *
         * @inode: the inode whose context to set.  The inode won't already have
         *           an fscrypt context.
         * @ctx: the context to set
         * @len: length of @ctx in bytes (at most FSCRYPT_SET_CONTEXT_MAX_SIZE)
         * @fs_data: If called from fscrypt_set_context(), this will be the
         *             value the filesystem passed to fscrypt_set_context().
         *             Otherwise (i.e. when called from
         *             FS_IOC_SET_ENCRYPTION_POLICY) this will be NULL.
         *
         * i_rwsem will be held for write.
         *
         * Return: 0 on success, -errno on failure.
         */
        int (*set_context)(struct inode *inode, const void *ctx, size_t len,
                           void *fs_data);

        /*
         * Get the dummy fscrypt policy in use on the filesystem (if any).
         *
         * Filesystems only need to implement this function if they support the
         * test_dummy_encryption mount option.
         *
         * Return: A pointer to the dummy fscrypt policy, if the filesystem is
         *           mounted with test_dummy_encryption; otherwise NULL.
         */
        const union fscrypt_policy *(*get_dummy_policy)(struct super_block *sb);

        /*
         * Check whether a directory is empty.  i_rwsem will be held for write.
         */
        bool (*empty_dir)(struct inode *inode);

        /*
         * Check whether the filesystem's inode numbers and UUID are stable,
         * meaning that they will never be changed even by offline operations
         * such as filesystem shrinking and therefore can be used in the
         * encryption without the possibility of files becoming unreadable.
         *
         * Filesystems only need to implement this function if they want to
         * support the FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags.  These
         * flags are designed to work around the limitations of UFS and eMMC
         * inline crypto hardware, and they shouldn't be used in scenarios where
         * such hardware isn't being used.
         *
         * Leaving this NULL is equivalent to always returning false.
         */
        bool (*has_stable_inodes)(struct super_block *sb);

        /*
         * Return an array of pointers to the block devices to which the
         * filesystem may write encrypted file contents, NULL if the filesystem
         * only has a single such block device, or an ERR_PTR() on error.
         *
         * On successful non-NULL return, *num_devs is set to the number of
         * devices in the returned array.  The caller must free the returned
         * array using kfree().
         *
         * If the filesystem can use multiple block devices (other than block
         * devices that aren't used for encrypted file contents, such as
         * external journal devices), and wants to support inline encryption,
         * then it must implement this function.  Otherwise it's not needed.
         */
        struct block_device **(*get_devices)(struct super_block *sb,
                                             unsigned int *num_devs);
};

int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags);

/*
 * Return inode->i_crypt_info, loaded with acquire semantics.
 *
 * Another task may publish ->i_crypt_info concurrently using the
 * cmpxchg_release() in fscrypt_setup_encryption_info().  The
 * smp_load_acquire() here pairs with that RELEASE so that the memory the
 * other task published is safely visible to the caller.
 */
static inline struct fscrypt_inode_info *
fscrypt_get_inode_info(const struct inode *inode)
{
        struct fscrypt_inode_info *crypt_info;

        crypt_info = smp_load_acquire(&inode->i_crypt_info);

        return crypt_info;
}

/**
 * fscrypt_needs_contents_encryption() - check whether an inode needs
 *                                         contents encryption
 * @inode: the inode to check
 *
 * Return: %true iff the inode is an encrypted regular file and the kernel was
 * built with fscrypt support.
 *
 * If you need to know whether the encrypt bit is set even when the kernel was
 * built without fscrypt support, you must use IS_ENCRYPTED() directly instead.
 */
static inline bool fscrypt_needs_contents_encryption(const struct inode *inode)
{
        return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode);
}

/*
 * Adding an encryption key can make d_splice_alias() move a directory's
 * no-key alias onto its plaintext alias.  When that happens,
 * DCACHE_NOKEY_NAME has to be cleared, and d_revalidate can possibly be
 * switched off as well.  The inverse operation never needs handling, since
 * fscrypt does not allow a no-key name to be the source or target of a
 * rename().
 */
static inline void fscrypt_handle_d_move(struct dentry *dentry)
{
        /*
         * The VFS calls this helper for non-fscrypt filesystems too;
         * there is nothing to do unless the dentry is a no-key name.
         */
        if (!(dentry->d_flags & DCACHE_NOKEY_NAME))
                return;

        dentry->d_flags &= ~DCACHE_NOKEY_NAME;

        /*
         * Revalidation may only be disabled when fscrypt is its sole
         * user; another filesystem feature may have installed its own
         * d_revalidate, which must stay active.
         */
        if (dentry->d_op->d_revalidate == fscrypt_d_revalidate)
                dentry->d_flags &= ~DCACHE_OP_REVALIDATE;
}

/**
 * fscrypt_is_nokey_name() - test whether a dentry is a no-key name
 * @dentry: the dentry to check
 *
 * A no-key dentry is one that was created in an encrypted directory whose
 * encryption key had not been added yet.  It may be positive or negative.
 *
 * A filesystem that is asked to create a new filename in an encrypted
 * directory must fail with ENOKEY when the new filename's dentry is a no-key
 * dentry.  That applies to ->create(), ->mkdir(), ->mknod(), ->symlink(),
 * ->rename(), and ->link(); the last two are already covered by
 * fscrypt_prepare_rename() and fscrypt_prepare_link().
 *
 * The check is required because creating a filename needs the directory's
 * encryption key, and finding the key on the directory inode at the time of
 * the final filesystem operation does not prove it was available during the
 * earlier dentry lookup.  The key must already have been present at lookup
 * time, both so that the existence of the filename in the directory was
 * actually checked, and so that the new file's dentry does not get invalidated
 * for wrongly carrying the no-key flag.
 *
 * Return: %true if the dentry is a no-key name
 */
static inline bool fscrypt_is_nokey_name(const struct dentry *dentry)
{
        return (dentry->d_flags & DCACHE_NOKEY_NAME) != 0;
}

/*
 * Set up the fscrypt-related ->d_flags of @dentry after a lookup.
 *
 * A no-key name gets DCACHE_NOKEY_NAME set.  Otherwise, if the only
 * revalidation hook on the dentry is fscrypt_d_revalidate,
 * DCACHE_OP_REVALIDATE is cleared.
 *
 * ->d_lock is taken only when ->d_flags actually has to be written.  Strictly
 * speaking DCACHE_OP_REVALIDATE should not be tested without the lock, but if
 * that unlikely race is lost, the worst that can happen is that the flag stays
 * set and one extra d_revalidate call is paid for.
 */
static inline void fscrypt_prepare_dentry(struct dentry *dentry,
                                          bool is_nokey_name)
{
        if (is_nokey_name) {
                spin_lock(&dentry->d_lock);
                dentry->d_flags |= DCACHE_NOKEY_NAME;
                spin_unlock(&dentry->d_lock);
                return;
        }

        /* Nothing to clear if revalidation isn't enabled at all. */
        if (!(dentry->d_flags & DCACHE_OP_REVALIDATE))
                return;

        /* Someone other than fscrypt wants revalidation; leave it on. */
        if (dentry->d_op->d_revalidate != fscrypt_d_revalidate)
                return;

        /*
         * From fscrypt's point of view, unencrypted dentries and encrypted
         * dentries whose key is available are always valid, so calling
         * fscrypt_d_revalidate for them would be wasted work.
         */
        spin_lock(&dentry->d_lock);
        dentry->d_flags &= ~DCACHE_OP_REVALIDATE;
        spin_unlock(&dentry->d_lock);
}

/* crypto.c */
void fscrypt_enqueue_decrypt_work(struct work_struct *);

struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
                                              unsigned int len,
                                              unsigned int offs,
                                              gfp_t gfp_flags);
int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
                                  unsigned int len, unsigned int offs,
                                  u64 lblk_num, gfp_t gfp_flags);

int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
                                     size_t offs);
int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
                                  unsigned int len, unsigned int offs,
                                  u64 lblk_num);

/* Treat @page as a bounce page iff it has no ->mapping. */
static inline bool fscrypt_is_bounce_page(struct page *page)
{
        return !page->mapping;
}

/*
 * Return the page pointer stored in @bounce_page's private field
 * (as read by page_private()).
 */
static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
{
        unsigned long priv = page_private(bounce_page);

        return (struct page *)priv;
}

/* Treat @folio as a bounce folio iff it has no ->mapping. */
static inline bool fscrypt_is_bounce_folio(struct folio *folio)
{
        return !folio->mapping;
}

/* Return the folio pointer stored in @bounce_folio->private. */
static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio)
{
        void *priv = bounce_folio->private;

        return priv;
}

void fscrypt_free_bounce_page(struct page *bounce_page);

/* policy.c */
int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg);
int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg);
int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg);
int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg);
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child);
int fscrypt_context_for_new_inode(void *ctx, struct inode *inode);
int fscrypt_set_context(struct inode *inode, void *fs_data);

/*
 * Holder for a test dummy encryption policy.  ->policy is NULL when no dummy
 * policy is set (see fscrypt_is_dummy_policy_set()) and is released with
 * kfree() by fscrypt_free_dummy_policy().
 */
struct fscrypt_dummy_policy {
        const union fscrypt_policy *policy;
};

int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
                                    struct fscrypt_dummy_policy *dummy_policy);
bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1,
                                  const struct fscrypt_dummy_policy *p2);
void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep,
                                        struct super_block *sb);
/* Return true iff @dummy_policy holds a policy, i.e. ->policy is non-NULL. */
static inline bool
fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy)
{
        const union fscrypt_policy *policy = dummy_policy->policy;

        return policy != NULL;
}
/*
 * Free the policy held by @dummy_policy with kfree() and reset ->policy to
 * NULL, so that the holder reads as "not set" afterwards.
 */
static inline void
fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy)
{
        const union fscrypt_policy *old = dummy_policy->policy;

        dummy_policy->policy = NULL;
        kfree(old);
}

/* keyring.c */
void fscrypt_destroy_keyring(struct super_block *sb);
int fscrypt_ioctl_add_key(struct file *filp, void __user *arg);
int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg);
int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg);
int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg);

/* keysetup.c */
int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
                              bool *encrypt_ret);
void fscrypt_put_encryption_info(struct inode *inode);
void fscrypt_free_inode(struct inode *inode);
int fscrypt_drop_inode(struct inode *inode);

/* fname.c */
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
                          u8 *out, unsigned int olen);
bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
                                  u32 max_len, u32 *encrypted_len_ret);
int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname,
                           int lookup, struct fscrypt_name *fname);

/* Release the buffer referenced by @fname->crypto_buf.name with kfree(). */
static inline void fscrypt_free_filename(struct fscrypt_name *fname)
{
        kfree(fname->crypto_buf.name);
}

int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
                               struct fscrypt_str *crypto_str);
void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str);
int fscrypt_fname_disk_to_usr(const struct inode *inode,
                              u32 hash, u32 minor_hash,
                              const struct fscrypt_str *iname,
                              struct fscrypt_str *oname);
bool fscrypt_match_name(const struct fscrypt_name *fname,
                        const u8 *de_name, u32 de_name_len);
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name);

/* bio.c */
bool fscrypt_decrypt_bio(struct bio *bio);
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
                          sector_t pblk, unsigned int len);

/* hooks.c */
int fscrypt_file_open(struct inode *inode, struct file *filp);
int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
                           struct dentry *dentry);
int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry,
                             unsigned int flags);
int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
                             struct fscrypt_name *fname);
int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry);
int __fscrypt_prepare_readdir(struct inode *dir);
int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr);
int fscrypt_prepare_setflags(struct inode *inode,
                             unsigned int oldflags, unsigned int flags);
int fscrypt_prepare_symlink(struct inode *dir, const char *target,
                            unsigned int len, unsigned int max_len,
                            struct fscrypt_str *disk_link);
int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
                              unsigned int len, struct fscrypt_str *disk_link);
const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
                                unsigned int max_size,
                                struct delayed_call *done);
int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat);
/* Install @s_cop as the fscrypt operations table of @sb (sb->s_cop). */
static inline void fscrypt_set_ops(struct super_block *sb,
                                   const struct fscrypt_operations *s_cop)
{
        sb->s_cop = s_cop;
}
#else  /* !CONFIG_FS_ENCRYPTION */

/* !CONFIG_FS_ENCRYPTION stub: there is never any fscrypt info, so NULL. */
static inline struct fscrypt_inode_info *
fscrypt_get_inode_info(const struct inode *inode)
{
        return NULL;
}

/* !CONFIG_FS_ENCRYPTION stub: always reports false, whatever @inode is. */
static inline bool fscrypt_needs_contents_encryption(const struct inode *inode)
{
        return false;
}

/* !CONFIG_FS_ENCRYPTION stub: no-op. */
static inline void fscrypt_handle_d_move(struct dentry *dentry)
{
}

/* !CONFIG_FS_ENCRYPTION stub: no dentry is ever reported as a no-key name. */
static inline bool fscrypt_is_nokey_name(const struct dentry *dentry)
{
        return false;
}

/* !CONFIG_FS_ENCRYPTION stub: no-op; @dentry's flags are left untouched. */
static inline void fscrypt_prepare_dentry(struct dentry *dentry,
                                          bool is_nokey_name)
{
}

/* crypto.c */
/* !CONFIG_FS_ENCRYPTION stub: no-op; @work is not queued anywhere. */
static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work)
{
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with ERR_PTR(-EOPNOTSUPP). */
static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
                                                            unsigned int len,
                                                            unsigned int offs,
                                                            gfp_t gfp_flags)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_encrypt_block_inplace(const struct inode *inode,
                                                struct page *page,
                                                unsigned int len,
                                                unsigned int offs, u64 lblk_num,
                                                gfp_t gfp_flags)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_decrypt_pagecache_blocks(struct folio *folio,
                                                   size_t len, size_t offs)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_decrypt_block_inplace(const struct inode *inode,
                                                struct page *page,
                                                unsigned int len,
                                                unsigned int offs, u64 lblk_num)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: no page is ever reported as a bounce page. */
static inline bool fscrypt_is_bounce_page(struct page *page)
{
        return false;
}

/*
 * !CONFIG_FS_ENCRYPTION stub: warns once via WARN_ON_ONCE() whenever it is
 * called and returns ERR_PTR(-EINVAL).
 */
static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
{
        WARN_ON_ONCE(1);
        return ERR_PTR(-EINVAL);
}

/* !CONFIG_FS_ENCRYPTION stub: no folio is ever reported as a bounce folio. */
static inline bool fscrypt_is_bounce_folio(struct folio *folio)
{
        return false;
}

/*
 * !CONFIG_FS_ENCRYPTION stub: warns once via WARN_ON_ONCE() whenever it is
 * called and returns ERR_PTR(-EINVAL).
 */
static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio)
{
        WARN_ON_ONCE(1);
        return ERR_PTR(-EINVAL);
}

/* !CONFIG_FS_ENCRYPTION stub: no-op. */
static inline void fscrypt_free_bounce_page(struct page *bounce_page)
{
}

/* policy.c */
/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_ioctl_set_policy(struct file *filp,
                                           const void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_ioctl_get_policy_ex(struct file *filp,
                                              void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always returns 0 without inspecting the inodes. */
static inline int fscrypt_has_permitted_context(struct inode *parent,
                                                struct inode *child)
{
        return 0;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_set_context(struct inode *inode, void *fs_data)
{
        return -EOPNOTSUPP;
}

/*
 * !CONFIG_FS_ENCRYPTION variant: an empty placeholder so that code embedding
 * or passing a struct fscrypt_dummy_policy still has a type to name.
 */
struct fscrypt_dummy_policy {
};

/* !CONFIG_FS_ENCRYPTION stub: always rejects the parameter with -EINVAL. */
static inline int
fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
                                    struct fscrypt_dummy_policy *dummy_policy)
{
        return -EINVAL;
}

/* !CONFIG_FS_ENCRYPTION stub: any two (empty) dummy policies compare equal. */
static inline bool
fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1,
                             const struct fscrypt_dummy_policy *p2)
{
        return true;
}

/* !CONFIG_FS_ENCRYPTION stub: no-op; nothing is written to @seq. */
static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq,
                                                      char sep,
                                                      struct super_block *sb)
{
}

/* !CONFIG_FS_ENCRYPTION stub: a dummy policy is never reported as set. */
static inline bool
fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy)
{
        return false;
}

/* !CONFIG_FS_ENCRYPTION stub: no-op. */
static inline void
fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy)
{
}

/* keyring.c */
/* !CONFIG_FS_ENCRYPTION stub: no-op. */
static inline void fscrypt_destroy_keyring(struct super_block *sb)
{
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_ioctl_remove_key_all_users(struct file *filp,
                                                     void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_ioctl_get_key_status(struct file *filp,
                                               void __user *arg)
{
        return -EOPNOTSUPP;
}

/* keysetup.c */

/*
 * !CONFIG_FS_ENCRYPTION stub: creating an inode inside an encrypted directory
 * fails with -EOPNOTSUPP; anywhere else it succeeds with 0.  @inode and
 * @encrypt_ret are not touched.
 */
static inline int fscrypt_prepare_new_inode(struct inode *dir,
                                            struct inode *inode,
                                            bool *encrypt_ret)
{
        return IS_ENCRYPTED(dir) ? -EOPNOTSUPP : 0;
}

/* !CONFIG_FS_ENCRYPTION stub: no-op. */
static inline void fscrypt_put_encryption_info(struct inode *inode)
{
}

/* !CONFIG_FS_ENCRYPTION stub: no-op. */
static inline void fscrypt_free_inode(struct inode *inode)
{
}

/* !CONFIG_FS_ENCRYPTION stub: always returns 0. */
static inline int fscrypt_drop_inode(struct inode *inode)
{
        return 0;
}

 /* fname.c */
/*
 * !CONFIG_FS_ENCRYPTION stub.  For an unencrypted directory, @fname is zeroed
 * and then filled so that both the user-visible name and the on-disk name
 * refer to @iname, and 0 is returned.  For an encrypted directory the call
 * fails with -EOPNOTSUPP and @fname is left untouched.  @lookup is unused.
 */
static inline int fscrypt_setup_filename(struct inode *dir,
                                         const struct qstr *iname,
                                         int lookup, struct fscrypt_name *fname)
{
        if (!IS_ENCRYPTED(dir)) {
                memset(fname, 0, sizeof(*fname));
                fname->usr_fname = iname;
                fname->disk_name.len = iname->len;
                fname->disk_name.name = (unsigned char *)iname->name;
                return 0;
        }

        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: no-op. */
static inline void fscrypt_free_filename(struct fscrypt_name *fname)
{
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
                                             struct fscrypt_str *crypto_str)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: no-op. */
static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
{
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_fname_disk_to_usr(const struct inode *inode,
                                            u32 hash, u32 minor_hash,
                                            const struct fscrypt_str *iname,
                                            struct fscrypt_str *oname)
{
        return -EOPNOTSUPP;
}

/*
 * !CONFIG_FS_ENCRYPTION stub: with encryption support disabled, a directory
 * entry matches iff its name has the same length and the same bytes as
 * @fname->disk_name.
 */
static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
                                      const u8 *de_name, u32 de_name_len)
{
        return de_name_len == fname->disk_name.len &&
               memcmp(de_name, fname->disk_name.name,
                      fname->disk_name.len) == 0;
}

/*
 * !CONFIG_FS_ENCRYPTION stub: warns once via WARN_ON_ONCE() whenever it is
 * called and returns 0.
 */
static inline u64 fscrypt_fname_siphash(const struct inode *dir,
                                        const struct qstr *name)
{
        WARN_ON_ONCE(1);
        return 0;
}

/* !CONFIG_FS_ENCRYPTION stub: always returns 1. */
static inline int fscrypt_d_revalidate(struct dentry *dentry,
                                       unsigned int flags)
{
        return 1;
}

/* bio.c */
/* !CONFIG_FS_ENCRYPTION stub: does nothing to @bio and returns true. */
static inline bool fscrypt_decrypt_bio(struct bio *bio)
{
        return true;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
                                        sector_t pblk, unsigned int len)
{
        return -EOPNOTSUPP;
}

/* hooks.c */

/*
 * !CONFIG_FS_ENCRYPTION stub: opening an encrypted inode fails with
 * -EOPNOTSUPP; any other inode opens successfully (0).  @filp is unused.
 */
static inline int fscrypt_file_open(struct inode *inode, struct file *filp)
{
        return IS_ENCRYPTED(inode) ? -EOPNOTSUPP : 0;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
                                         struct dentry *dentry)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int __fscrypt_prepare_rename(struct inode *old_dir,
                                           struct dentry *old_dentry,
                                           struct inode *new_dir,
                                           struct dentry *new_dentry,
                                           unsigned int flags)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int __fscrypt_prepare_lookup(struct inode *dir,
                                           struct dentry *dentry,
                                           struct fscrypt_name *fname)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_prepare_lookup_partial(struct inode *dir,
                                                 struct dentry *dentry)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int __fscrypt_prepare_readdir(struct inode *dir)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int __fscrypt_prepare_setattr(struct dentry *dentry,
                                            struct iattr *attr)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always returns 0 without checking the flags. */
static inline int fscrypt_prepare_setflags(struct inode *inode,
                                           unsigned int oldflags,
                                           unsigned int flags)
{
        return 0;
}

/*
 * !CONFIG_FS_ENCRYPTION stub.  Fails with -EOPNOTSUPP if @dir is encrypted.
 * Otherwise @disk_link is pointed straight at @target with a length of
 * @len + 1 (presumably to cover the terminating NUL; not shown here), and the
 * result is -ENAMETOOLONG if that length exceeds @max_len, else 0.  Note that
 * @disk_link is filled in even when -ENAMETOOLONG is returned.
 */
static inline int fscrypt_prepare_symlink(struct inode *dir,
                                          const char *target,
                                          unsigned int len,
                                          unsigned int max_len,
                                          struct fscrypt_str *disk_link)
{
        if (IS_ENCRYPTED(dir))
                return -EOPNOTSUPP;

        disk_link->len = len + 1;
        disk_link->name = (unsigned char *)target;

        return disk_link->len > max_len ? -ENAMETOOLONG : 0;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int __fscrypt_encrypt_symlink(struct inode *inode,
                                            const char *target,
                                            unsigned int len,
                                            struct fscrypt_str *disk_link)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with ERR_PTR(-EOPNOTSUPP). */
static inline const char *fscrypt_get_symlink(struct inode *inode,
                                              const void *caddr,
                                              unsigned int max_size,
                                              struct delayed_call *done)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int fscrypt_symlink_getattr(const struct path *path,
                                          struct kstat *stat)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: no-op; @sb is not modified. */
static inline void fscrypt_set_ops(struct super_block *sb,
                                   const struct fscrypt_operations *s_cop)
{
}

#endif        /* !CONFIG_FS_ENCRYPTION */

/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT

bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode);

void fscrypt_set_bio_crypt_ctx(struct bio *bio,
                               const struct inode *inode, u64 first_lblk,
                               gfp_t gfp_mask);

void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio,
                                  const struct buffer_head *first_bh,
                                  gfp_t gfp_mask);

bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
                           u64 next_lblk);

bool fscrypt_mergeable_bio_bh(struct bio *bio,
                              const struct buffer_head *next_bh);

bool fscrypt_dio_supported(struct inode *inode);

u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks);

#else /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */

/* Stub without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: always returns false. */
static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
        return false;
}

/* Stub without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: no-op; @bio is untouched. */
static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio,
                                             const struct inode *inode,
                                             u64 first_lblk, gfp_t gfp_mask) { }

/* Stub without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: no-op; @bio is untouched. */
static inline void fscrypt_set_bio_crypt_ctx_bh(
                                         struct bio *bio,
                                         const struct buffer_head *first_bh,
                                         gfp_t gfp_mask) { }

/* Stub without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: always returns true. */
static inline bool fscrypt_mergeable_bio(struct bio *bio,
                                         const struct inode *inode,
                                         u64 next_lblk)
{
        return true;
}

/* Stub without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: always returns true. */
static inline bool fscrypt_mergeable_bio_bh(struct bio *bio,
                                            const struct buffer_head *next_bh)
{
        return true;
}

/*
 * Variant without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: direct I/O is reported as
 * supported exactly when the inode does not need contents encryption.
 */
static inline bool fscrypt_dio_supported(struct inode *inode)
{
        return !fscrypt_needs_contents_encryption(inode);
}

/*
 * Stub without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: imposes no limit and returns
 * @nr_blocks unchanged.
 */
static inline u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk,
                                          u64 nr_blocks)
{
        return nr_blocks;
}
#endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */

/**
 * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline
 *                                        encryption
 * @inode: an inode. If encrypted, its key must be set up.
 *
 * Return: true if the inode requires file contents encryption and if the
 *           encryption should be done in the block layer via blk-crypto rather
 *           than in the filesystem layer.
 */
static inline bool fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
        return fscrypt_needs_contents_encryption(inode) &&
               __fscrypt_inode_uses_inline_crypto(inode);
}

/**
 * fscrypt_inode_uses_fs_layer_crypto() - test whether an inode uses fs-layer
 *                                          encryption
 * @inode: an inode. If encrypted, its key must be set up.
 *
 * Return: true if the inode requires file contents encryption and if the
 *           encryption should be done in the filesystem layer rather than in the
 *           block layer via blk-crypto.
 */
static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode)
{
        return fscrypt_needs_contents_encryption(inode) &&
               !__fscrypt_inode_uses_inline_crypto(inode);
}

/**
 * fscrypt_has_encryption_key() - check whether an inode has had its key set up
 * @inode: the inode to check
 *
 * Callers normally invoke fscrypt_get_encryption_info() beforehand, so that an
 * attempt to set up the key has been made before it is tested for here.
 *
 * Return: %true if the inode has had its encryption key set up, else %false.
 */
static inline bool fscrypt_has_encryption_key(const struct inode *inode)
{
        const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode);

        return ci != NULL;
}

/**
 * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted
 *                          directory
 * @old_dentry: an existing dentry for the inode being linked
 * @dir: the target directory
 * @dentry: negative dentry for the target filename
 *
 * Adding a link to an encrypted directory requires the directory's encryption
 * key, because without it the new filename could not be encrypted.
 *
 * The link is also checked against the rule that every file in an encrypted
 * directory tree uses the same encryption policy.
 *
 * Return: 0 on success, -ENOKEY if the directory's encryption key is missing,
 * -EXDEV if the link would result in an inconsistent encryption policy, or
 * another -errno code.
 */
static inline int fscrypt_prepare_link(struct dentry *old_dentry,
                                       struct inode *dir,
                                       struct dentry *dentry)
{
        /* Unencrypted target directory: nothing for fscrypt to verify. */
        if (!IS_ENCRYPTED(dir))
                return 0;

        return __fscrypt_prepare_link(d_inode(old_dentry), dir, dentry);
}

/**
 * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted
 *                            directories
 * @old_dir: source directory
 * @old_dentry: dentry for source file
 * @new_dir: target directory
 * @new_dentry: dentry for target location (may be negative unless exchanging)
 * @flags: rename flags (we care at least about %RENAME_EXCHANGE)
 *
 * Prepare for ->rename() when the source directory, the target directory, or
 * both may be encrypted.  Adding a link to an encrypted directory requires the
 * directory's encryption key, because without it the filename could not be
 * encrypted.  Renaming onto a name that already exists *would* be
 * cryptographically possible without the key, but the conservative choice is
 * made here: every no-key rename is forbidden.
 *
 * The rename is also checked against the rule that every file in an encrypted
 * directory tree uses the same encryption policy.
 *
 * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the
 * rename would cause inconsistent encryption policies, or another -errno code.
 */
static inline int fscrypt_prepare_rename(struct inode *old_dir,
                                         struct dentry *old_dentry,
                                         struct inode *new_dir,
                                         struct dentry *new_dentry,
                                         unsigned int flags)
{
        /* Neither directory is encrypted: nothing for fscrypt to verify. */
        if (!IS_ENCRYPTED(old_dir) && !IS_ENCRYPTED(new_dir))
                return 0;

        return __fscrypt_prepare_rename(old_dir, old_dentry,
                                        new_dir, new_dentry, flags);
}

/**
 * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted
 *                            directory
 * @dir: directory being searched
 * @dentry: filename being looked up
 * @fname: (output) the name to use to search the on-disk directory
 *
 * Prepare for ->lookup() in a directory that may be encrypted, by working out
 * which name will really be used to search the on-disk directory.  The lookup
 * is taken to be by plaintext name when this kernel supports the directory's
 * encryption policy and the directory's key is available; in every other case
 * it is taken to be by no-key name.
 *
 * For a lookup by no-key name, DCACHE_NOKEY_NAME is set on the dentry.  The
 * filesystem must then give the dentry a dentry_operations that contains
 * fscrypt_d_revalidate (or a d_revalidate method which calls
 * fscrypt_d_revalidate), so that the dentry gets invalidated should the
 * directory's encryption key be added later.
 *
 * Return: 0 on success; -ENOENT if the directory's key is unavailable but the
 * filename isn't a valid no-key name, so a negative dentry should be created;
 * or another -errno code.
 */
static inline int fscrypt_prepare_lookup(struct inode *dir,
                                         struct dentry *dentry,
                                         struct fscrypt_name *fname)
{
        if (!IS_ENCRYPTED(dir)) {
                /*
                 * Unencrypted directory: the on-disk name is simply the
                 * dentry's own name, and it can never be a no-key name.
                 */
                memset(fname, 0, sizeof(*fname));
                fname->usr_fname = &dentry->d_name;
                fname->disk_name.name = (unsigned char *)dentry->d_name.name;
                fname->disk_name.len = dentry->d_name.len;

                fscrypt_prepare_dentry(dentry, false);

                return 0;
        }

        return __fscrypt_prepare_lookup(dir, dentry, fname);
}

/**
 * fscrypt_prepare_readdir() - prepare to read a possibly-encrypted directory
 * @dir: the directory inode
 *
 * For an encrypted directory whose encryption key is not set up yet, attempt
 * to set it up, so that filenames get listed in plaintext form instead of in
 * no-key form.
 *
 * Return: 0 on success; -errno on error.  An unavailable encryption key does
 *         not count as an error.  Neither does an encryption policy that this
 *         kernel does not support; that case is handled like an unavailable
 *         key, so that files can still be deleted.
 */
static inline int fscrypt_prepare_readdir(struct inode *dir)
{
        return IS_ENCRYPTED(dir) ? __fscrypt_prepare_readdir(dir) : 0;
}

/**
 * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's
 *                             attributes
 * @dentry: dentry through which the inode is being changed
 * @attr: attributes to change
 *
 * Prepare for ->setattr() on an inode that may be encrypted.  Most attribute
 * changes on an encrypted file are permitted even when the encryption key is
 * absent; truncates are the exception and must be refused without the key.
 * The reason is that the new size need not be a multiple of the filesystem
 * block size, in which case the final block would have to be decrypted, zeroed
 * past i_size, and encrypted again.  (Truncating to a filesystem block
 * boundary *could* be allowed, but refusing every truncate is simpler --- and
 * all other contents modifications are already refused without the key.)
 *
 * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
 * if a problem occurred while setting up the encryption key.
 */
static inline int fscrypt_prepare_setattr(struct dentry *dentry,
                                          struct iattr *attr)
{
        /* Unencrypted inode: fscrypt places no restriction on setattr. */
        if (!IS_ENCRYPTED(d_inode(dentry)))
                return 0;

        return __fscrypt_prepare_setattr(dentry, attr);
}

/**
 * fscrypt_encrypt_symlink() - encrypt the symlink target if needed
 * @inode: symlink inode
 * @target: plaintext symlink target
 * @len: length of @target excluding null terminator
 * @disk_link: (in/out) the on-disk symlink target being prepared
 *
 * If the symlink target needs to be encrypted, then this function encrypts it
 * into @disk_link->name.  fscrypt_prepare_symlink() must have been called
 * previously to compute @disk_link->len.  If the filesystem did not allocate a
 * buffer for @disk_link->name after calling fscrypt_prepare_link(), then one
 * will be kmalloc()'ed and the filesystem will be responsible for freeing it.
 *
 * Return: 0 on success, -errno on failure
 */
static inline int fscrypt_encrypt_symlink(struct inode *inode,
                                          const char *target,
                                          unsigned int len,
                                          struct fscrypt_str *disk_link)
{
        /* Symlinks of unencrypted inodes are left untouched. */
        if (!IS_ENCRYPTED(inode))
                return 0;

        return __fscrypt_encrypt_symlink(inode, target, len, disk_link);
}

/* If *pagep is a bounce page, free it and set *pagep to the pagecache page */
static inline void fscrypt_finalize_bounce_page(struct page **pagep)
{
        struct page *bounce = *pagep;

        /* Not a bounce page: leave *pagep as it is. */
        if (!fscrypt_is_bounce_page(bounce))
                return;

        /* Look up the pagecache page before the bounce page is freed. */
        *pagep = fscrypt_pagecache_page(bounce);
        fscrypt_free_bounce_page(bounce);
}

#endif        /* _LINUX_FSCRYPT_H */






























































































































































































































































































































































































































































































































































































































































































    3 












































    3 




























    3 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Thanks to Ben LaHaise for precious feedback.
 */
#include <linux/highmem.h>
#include <linux/memblock.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/vmalloc.h>
#include <linux/libnvdimm.h>
#include <linux/vmstat.h>
#include <linux/kernel.h>
#include <linux/cc_platform.h>
#include <linux/set_memory.h>
#include <linux/memregion.h>

#include <asm/e820/api.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/memtype.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>

#include "../mm_internal.h"

/*
 * The current flushing context - we pass it instead of 5 arguments.
 *
 * How the target pages are described depends on @flags (see __cpa_addr()):
 *  - CPA_PAGES_ARRAY: @pages[idx] is the struct page of page idx.
 *  - CPA_ARRAY:       @vaddr[idx] is the virtual address of page idx.
 *  - otherwise:       *@vaddr is the base address and page idx lives at
 *                     *@vaddr + idx * PAGE_SIZE.
 *
 * @numpages bounds the per-page loops in __cpa_flush_tlb() and cpa_flush().
 * @force_flush_all makes cpa_flush() do a full flush_tlb_all().
 *
 * NOTE(review): @pgd, @mask_set, @mask_clr, @curpage, @pfn, @force_split and
 * @force_static_prot are consumed further down in the file; presumably
 * mask_set/mask_clr are the protection bits to set/clear - confirm there.
 */
struct cpa_data {
        unsigned long        *vaddr;
        pgd_t                *pgd;
        pgprot_t        mask_set;
        pgprot_t        mask_clr;
        unsigned long        numpages;
        unsigned long        curpage;
        unsigned long        pfn;
        unsigned int        flags;
        unsigned int        force_split                : 1,
                        force_static_prot        : 1,
                        force_flush_all                : 1;
        struct page        **pages;
};

/*
 * Warning levels handed to check_conflict().  A conflict is only printed
 * when the level passed in is not greater than cpa_warn_level.
 */
enum cpa_warn {
        CPA_CONFLICT,
        CPA_PROTECT,
        CPA_DETECT,
};

/* Reporting threshold: with CPA_PROTECT, CPA_DETECT conflicts stay silent. */
static const int cpa_warn_level = CPA_PROTECT;

/*
 * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
 * using cpa_lock, so that no other CPU holding stale large TLB entries can
 * change a page attribute in parallel with a CPU that is splitting a large
 * page entry while changing its attribute.
 */
static DEFINE_SPINLOCK(cpa_lock);

/*
 * Bits for cpa_data::flags.
 *
 * CPA_ARRAY:       cpa_data::vaddr is an array with one address per page.
 * CPA_PAGES_ARRAY: cpa_data::pages is an array of struct page pointers.
 * CPA_FLUSHTLB:    NOTE(review): not used in this part of the file;
 *                  presumably requests a TLB flush - confirm at the users.
 */
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */

/* Wrap the value of cachemode2protval(@pcm) into a pgprot_t. */
static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
{
        return __pgprot(cachemode2protval(pcm));
}

#ifdef CONFIG_PROC_FS
/*
 * Per page-table-level page counters, indexed by PG_LEVEL_*; printed as the
 * DirectMap* lines by arch_report_meminfo().
 */
static unsigned long direct_pages_count[PG_LEVEL_NUM];

/*
 * Add @pages to the page counter of page-table level @level.  The update is
 * done with pgd_lock held.
 */
void update_page_count(int level, unsigned long pages)
{
        /* Protect against CPA */
        spin_lock(&pgd_lock);
        direct_pages_count[level] += pages;
        spin_unlock(&pgd_lock);
}

/*
 * Account for the split of one large page at @level: one page less at
 * @level and PTRS_PER_PTE pages more at the level below.  Nothing is done
 * when no page is counted at @level.
 */
static void split_page_count(int level)
{
        if (!direct_pages_count[level])
                return;

        direct_pages_count[level]--;

        /* Split events are only recorded while system_state is RUNNING. */
        if (system_state == SYSTEM_RUNNING) {
                switch (level) {
                case PG_LEVEL_2M:
                        count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
                        break;
                case PG_LEVEL_1G:
                        count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
                        break;
                default:
                        break;
                }
        }

        direct_pages_count[level - 1] += PTRS_PER_PTE;
}

/*
 * Print the DirectMap* lines into @m.  The counters hold pages of the
 * respective size; the shifts turn them into kB.
 */
void arch_report_meminfo(struct seq_file *m)
{
        const unsigned long kb_4k = direct_pages_count[PG_LEVEL_4K] << 2;

        seq_printf(m, "DirectMap4k:    %8lu kB\n", kb_4k);

#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
        seq_printf(m, "DirectMap2M:    %8lu kB\n",
                        direct_pages_count[PG_LEVEL_2M] << 11);
#else
        seq_printf(m, "DirectMap4M:    %8lu kB\n",
                        direct_pages_count[PG_LEVEL_2M] << 12);
#endif

        /* The 1G line is only emitted when direct_gbpages is set. */
        if (!direct_gbpages)
                return;

        seq_printf(m, "DirectMap1G:    %8lu kB\n",
                        direct_pages_count[PG_LEVEL_1G] << 20);
}
#else
/* !CONFIG_PROC_FS: no page counters are kept, so there is nothing to do. */
static inline void split_page_count(int level) { }
#endif

#ifdef CONFIG_X86_CPA_STATISTICS

/*
 * CONFIG_X86_CPA_STATISTICS counters.  They are bumped by the cpa_inc_*()
 * helpers and printed by cpastats_show().
 */
static unsigned long cpa_1g_checked;
static unsigned long cpa_1g_sameprot;
static unsigned long cpa_1g_preserved;
static unsigned long cpa_2m_checked;
static unsigned long cpa_2m_sameprot;
static unsigned long cpa_2m_preserved;
static unsigned long cpa_4k_install;

/* Bump the "1G pages checked" statistic. */
static inline void cpa_inc_1g_checked(void)
{
        cpa_1g_checked++;
}

/* Bump the "2M pages checked" statistic. */
static inline void cpa_inc_2m_checked(void)
{
        cpa_2m_checked++;
}

/*
 * Bump the "4K pages set-checked" statistic.
 *
 * NOTE(review): unlike the other counters this increment is wrapped in
 * data_race(); presumably it can run without serialization and an inexact
 * count is acceptable - confirm against the callers.
 */
static inline void cpa_inc_4k_install(void)
{
        data_race(cpa_4k_install++);
}

/*
 * Bump the "sameprot" statistic: the 1G counter for PG_LEVEL_1G, the 2M
 * counter for every other @level.
 */
static inline void cpa_inc_lp_sameprot(int level)
{
        unsigned long *counter = &cpa_2m_sameprot;

        if (level == PG_LEVEL_1G)
                counter = &cpa_1g_sameprot;

        (*counter)++;
}

/*
 * Bump the "preserved" statistic: the 1G counter for PG_LEVEL_1G, the 2M
 * counter for every other @level.
 */
static inline void cpa_inc_lp_preserved(int level)
{
        unsigned long *counter = &cpa_2m_preserved;

        if (level == PG_LEVEL_1G)
                counter = &cpa_1g_preserved;

        (*counter)++;
}

/*
 * seq_file show callback: print all CPA statistics counters into @m, one
 * labelled line per counter.  @p is unused.  Always returns 0.
 */
static int cpastats_show(struct seq_file *m, void *p)
{
        seq_printf(m, "1G pages checked:     %16lu\n", cpa_1g_checked);
        seq_printf(m, "1G pages sameprot:    %16lu\n", cpa_1g_sameprot);
        seq_printf(m, "1G pages preserved:   %16lu\n", cpa_1g_preserved);
        seq_printf(m, "2M pages checked:     %16lu\n", cpa_2m_checked);
        seq_printf(m, "2M pages sameprot:    %16lu\n", cpa_2m_sameprot);
        seq_printf(m, "2M pages preserved:   %16lu\n", cpa_2m_preserved);
        seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
        return 0;
}

/* open() handler: bind @file to cpastats_show() with no private data. */
static int cpastats_open(struct inode *inode, struct file *file)
{
        return single_open(file, cpastats_show, NULL);
}

/* File operations of the "cpa_stats" debugfs file (see cpa_stats_init()). */
static const struct file_operations cpastats_fops = {
        .open                = cpastats_open,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = single_release,
};

/*
 * Create the "cpa_stats" file (mode S_IRUSR) below arch_debugfs_dir.  The
 * result of debugfs_create_file() is not checked; 0 is always returned.
 * Registered as a late_initcall.
 */
static int __init cpa_stats_init(void)
{
        debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
                            &cpastats_fops);
        return 0;
}
late_initcall(cpa_stats_init);
#else
/* !CONFIG_X86_CPA_STATISTICS: the statistics helpers compile to nothing. */
static inline void cpa_inc_1g_checked(void) { }
static inline void cpa_inc_2m_checked(void) { }
static inline void cpa_inc_4k_install(void) { }
static inline void cpa_inc_lp_sameprot(int level) { }
static inline void cpa_inc_lp_preserved(int level) { }
#endif


/* Return nonzero iff @addr lies in the half-open range [@start, @end). */
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
        if (addr < start)
                return 0;

        return addr < end;
}

/* Return nonzero iff @addr lies in the closed range [@start, @end]. */
static inline int
within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
{
        if (addr < start)
                return 0;

        return addr <= end;
}

#ifdef CONFIG_X86_64

/*
 * The kernel image is mapped into two places in the virtual address space
 * (addresses without KASLR, of course):
 *
 * 1. The kernel direct map (0xffff880000000000)
 * 2. The "high kernel map" (0xffffffff81000000)
 *
 * We actually execute out of #2. If we get the address of a kernel symbol, it
 * points to #2, but almost all physical-to-virtual translations point to #1.
 *
 * This is so that we can have both a directmap of all physical memory *and*
 * take full advantage of the limited (s32) immediate addressing range (2G)
 * of x86_64.
 *
 * See Documentation/arch/x86/x86_64/mm.rst for more detail.
 */

/* PFN of the physical page that holds _text. */
static inline unsigned long highmap_start_pfn(void)
{
        const unsigned long text_pa = __pa_symbol(_text);

        return text_pa >> PAGE_SHIFT;
}

/* PFN of the last byte below _brk_end rounded up to a PMD_SIZE boundary. */
static inline unsigned long highmap_end_pfn(void)
{
        /*
         * Subtract 1 before translating so that no physical address
         * outside the kernel is referenced.
         */
        const unsigned long last_byte = roundup(_brk_end, PMD_SIZE) - 1;

        return __pa_symbol(last_byte) >> PAGE_SHIFT;
}

/*
 * Kernel text has an alias mapping at a high address, known here as
 * "highmap".  Return true iff @pfn falls inside the PFN range (both ends
 * inclusive) covered by it.
 */
static bool __cpa_pfn_in_highmap(unsigned long pfn)
{
        const unsigned long first_pfn = highmap_start_pfn();
        const unsigned long last_pfn = highmap_end_pfn();

        return within_inclusive(pfn, first_pfn, last_pfn);
}

#else

/* !CONFIG_X86_64 variant: no @pfn is ever part of a highmap. */
static bool __cpa_pfn_in_highmap(unsigned long pfn)
{
        /* There is no highmap on 32-bit */
        return false;
}

#endif

/*
 * See set_mce_nospec().
 *
 * Machine check recovery code needs to change cache mode of poisoned pages to
 * UC to avoid speculative access logging another error. But passing the
 * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a
 * speculative access. So we cheat and flip the top bit of the address. This
 * works fine for the code that updates the page tables. But at the end of the
 * process we need to flush the TLB and cache and the non-canonical address
 * causes a #GP fault when used by the INVLPG and CLFLUSH instructions.
 *
 * But in the common case we already have a canonical address. This code
 * will fix the top bit if needed and is a no-op otherwise.
 */
static inline unsigned long fix_addr(unsigned long addr)
{
#ifdef CONFIG_X86_64
        /*
         * Drop the top bit, then shift back as a signed value so that the
         * top bit is refilled from the bit below it.
         */
        long shifted = (long)(addr << 1);

        return shifted >> 1;
#else
        return addr;
#endif
}

/*
 * Return the virtual address of page @idx of the request described by @cpa.
 *
 * CPA_PAGES_ARRAY: page_address() of cpa->pages[idx], or 0 for a highmem page.
 * CPA_ARRAY:       cpa->vaddr[idx].
 * otherwise:       *cpa->vaddr advanced by @idx pages.
 */
static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
{
        if (cpa->flags & CPA_PAGES_ARRAY) {
                struct page *pg = cpa->pages[idx];

                return unlikely(PageHighMem(pg)) ?
                        0 : (unsigned long)page_address(pg);
        }

        return (cpa->flags & CPA_ARRAY) ?
                cpa->vaddr[idx] : *cpa->vaddr + idx * PAGE_SIZE;
}

/*
 * Flushing functions
 */

/*
 * Issue clflushopt() for every cache line that intersects
 * [@vaddr, @vaddr + @size).  No fencing is done here.
 */
static void clflush_cache_range_opt(void *vaddr, unsigned int size)
{
        const unsigned long line_size = boot_cpu_data.x86_clflush_size;
        void *limit = vaddr + size;
        void *cur;

        /* Round the start down to the beginning of its cache line. */
        cur = (void *)((unsigned long)vaddr & ~(line_size - 1));

        /* An empty range (cur >= limit) never enters the loop. */
        while (cur < limit) {
                clflushopt(cur);
                cur += line_size;
        }
}

/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:        virtual start address
 * @size:        number of bytes to flush
 *
 * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or
 * SFENCE to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
        /* Fence on both sides: the flush helper itself is unordered. */
        mb();
        clflush_cache_range_opt(vaddr, size);
        mb();
}
EXPORT_SYMBOL_GPL(clflush_cache_range);

#ifdef CONFIG_ARCH_HAS_PMEM_API
/*
 * Flush [@addr, @addr + @size) via clflush_cache_range().
 * NOTE(review): @size is a size_t but the callee takes an unsigned int, so
 * larger values would be truncated - confirm callers stay below that.
 */
void arch_invalidate_pmem(void *addr, size_t size)
{
        clflush_cache_range(addr, size);
}
EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
#endif

#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
/* Return true unless X86_FEATURE_HYPERVISOR is enabled on this system. */
bool cpu_cache_has_invalidate_memregion(void)
{
        return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);
}
EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, DEVMEM);

/*
 * Invalidate CPU caches by running wbinvd_on_all_cpus().  @res_desc is not
 * used by this implementation.
 *
 * Return: 0 on success, -ENXIO (with a one-time warning) when
 * cpu_cache_has_invalidate_memregion() reports false.
 */
int cpu_cache_invalidate_memregion(int res_desc)
{
        if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion()))
                return -ENXIO;
        wbinvd_on_all_cpus();
        return 0;
}
EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, DEVMEM);
#endif

/*
 * Per-CPU callback of cpa_flush_all().  @arg carries the "cache" value cast
 * to a pointer; a nonzero value additionally requests wbinvd().
 */
static void __cpa_flush_all(void *arg)
{
        const unsigned long flush_cache = (unsigned long)arg;

        /*
         * Flush all to work around Errata in early athlons regarding
         * large page flushing.
         */
        __flush_tlb_all();

        if (!flush_cache)
                return;

        /* wbinvd() is only issued when boot_cpu_data.x86 is at least 4. */
        if (boot_cpu_data.x86 >= 4)
                wbinvd();
}

/*
 * Run __cpa_flush_all() on each CPU, passing @cache through as the callback
 * argument.  Must not be called with interrupts disabled, except while
 * early_boot_irqs_disabled is set.
 */
static void cpa_flush_all(unsigned long cache)
{
        BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);

        on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}

/*
 * Per-CPU callback of cpa_flush(): flush the kernel TLB entry of every page
 * described by the struct cpa_data passed in @data.
 */
static void __cpa_flush_tlb(void *data)
{
        struct cpa_data *cpa = data;
        unsigned int idx = 0;

        while (idx < cpa->numpages) {
                /* fix_addr() repairs the top bit before the flush. */
                unsigned long vaddr = fix_addr(__cpa_addr(cpa, idx));

                flush_tlb_one_kernel(vaddr);
                idx++;
        }
}

/*
 * Flush TLBs, and when @cache is nonzero also the caches, for the pages
 * described by @data.  Must not be called with interrupts disabled, except
 * while early_boot_irqs_disabled is set.
 */
static void cpa_flush(struct cpa_data *data, int cache)
{
        unsigned int idx;

        BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);

        /* No CLFLUSH available: let cpa_flush_all() handle everything. */
        if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
                cpa_flush_all(cache);
                return;
        }

        /* Flush page by page only for small, non-forced requests. */
        if (!data->force_flush_all &&
            data->numpages <= tlb_single_page_flush_ceiling)
                on_each_cpu(__cpa_flush_tlb, data, 1);
        else
                flush_tlb_all();

        if (!cache)
                return;

        mb();
        for (idx = 0; idx < data->numpages; idx++) {
                unsigned long vaddr = __cpa_addr(data, idx);
                unsigned int level;
                pte_t *ptep = lookup_address(vaddr, &level);

                /* Only flush present addresses. */
                if (!ptep || !(pte_val(*ptep) & _PAGE_PRESENT))
                        continue;

                clflush_cache_range_opt((void *)fix_addr(vaddr), PAGE_SIZE);
        }
        mb();
}

/*
 * Return true iff the closed ranges [@r1_start, @r1_end] and
 * [@r2_start, @r2_end] share at least one value.
 */
static bool overlaps(unsigned long r1_start, unsigned long r1_end,
                     unsigned long r2_start, unsigned long r2_end)
{
        /* Range 1 begins after range 2 has ended. */
        if (r1_start > r2_end)
                return false;

        /* Otherwise they intersect unless range 1 ends before range 2. */
        return r1_end >= r2_start;
}

#ifdef CONFIG_PCI_BIOS
/*
 * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
 * based config access (CONFIG_PCI_GOBIOS) support.
 */
#define BIOS_PFN        PFN_DOWN(BIOS_BEGIN)
#define BIOS_PFN_END        PFN_DOWN(BIOS_END - 1)

/*
 * Return _PAGE_NX as a forbidden bit when pcibios_enabled is set and the
 * PFN range [@spfn, @epfn] touches the BIOS PFN range, 0 otherwise.
 */
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
        if (!pcibios_enabled)
                return 0;

        return overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END) ? _PAGE_NX : 0;
}
#else
/* !CONFIG_PCI_BIOS: there is no BIOS range to protect, nothing is forbidden. */
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
        return 0;
}
#endif

/*
 * The .rodata section needs to be read-only. Using the pfn catches all
 * aliases.  This also includes __ro_after_init, so do not enforce until
 * kernel_set_to_readonly is true.
 */
static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
{
        const unsigned long first_ro = PFN_DOWN(__pa_symbol(__start_rodata));
        /*
         * __end_rodata is page aligned and exclusive, hence the last
         * enforced PFN is the one right before it.
         */
        const unsigned long last_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;

        /* Nothing is enforced before the kernel has been set read-only. */
        if (!kernel_set_to_readonly)
                return 0;

        return overlaps(spfn, epfn, first_ro, last_ro) ? _PAGE_RW : 0;
}

/*
 * Protect kernel text against becoming non executable by forbidding
 * _PAGE_NX.  This protects only the high kernel mapping (_text -> _etext)
 * out of which the kernel actually executes.  Do not protect the low
 * mapping.
 *
 * This does not cover __inittext since that is gone after boot.
 */
static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
{
        /* Closed virtual address range [_text, _etext - 1]. */
        const unsigned long text_first = (unsigned long)_text;
        const unsigned long text_last = (unsigned long)_etext - 1;

        return overlaps(start, end, text_first, text_last) ? _PAGE_NX : 0;
}

#if defined(CONFIG_X86_64)
/*
 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
 * kernel text mappings for the large page aligned text, rodata sections
 * will be always read-only. For the kernel identity mappings covering the
 * holes caused by this alignment can be anything that user asks.
 *
 * This will preserve the large page mappings for kernel text/data at no
 * extra cost.
 */
static pgprotval_t protect_kernel_text_ro(unsigned long start,
                                          unsigned long end)
{
        unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
        unsigned long t_start = (unsigned long)_text;
        unsigned int level;

        if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
                return 0;
        /*
         * Don't enforce the !RW mapping for the kernel text mapping, if
         * the current mapping is already using small page mapping.  No
         * need to work hard to preserve large page mappings in this case.
         *
         * This also fixes the Linux Xen paravirt guest boot failure caused
         * by unexpected read-only mappings for kernel identity
         * mappings. In this paravirt guest case, the kernel text mapping
         * and the kernel identity mapping share the same page-table pages,
         * so the protections for kernel text and identity mappings have to
         * be the same.
         */
        if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
                return _PAGE_RW;
        return 0;
}
#else
/* !CONFIG_X86_64 variant: no bits are forbidden. */
static pgprotval_t protect_kernel_text_ro(unsigned long start,
                                          unsigned long end)
{
        return 0;
}
#endif

/* Return true iff @prot has at least one of the bits in @val set. */
static inline bool conflicts(pgprot_t prot, pgprotval_t val)
{
        return (pgprot_val(prot) & val) != 0;
}

/*
 * Warn when the requested protection @prot contains bits from the forbidden
 * set @val.  @warnlvl selects the label and is compared against
 * cpa_warn_level; @txt names the rule that was violated.  @start, @end and
 * @pfn are only used for the message.
 */
static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
                                  unsigned long start, unsigned long end,
                                  unsigned long pfn, const char *txt)
{
        static const char *lvltxt[] = {
                [CPA_CONFLICT]        = "conflict",
                [CPA_PROTECT]        = "protect",
                [CPA_DETECT]        = "detect",
        };

        /* Levels above the configured threshold are not reported. */
        if (warnlvl > cpa_warn_level)
                return;

        /* No forbidden bit is requested, nothing to report. */
        if (!conflicts(prot, val))
                return;

        pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
                lvltxt[warnlvl], txt, start, end, pfn,
                (unsigned long long)pgprot_val(prot),
                (unsigned long long)val);
}

/*
 * Some x86 memory regions (the BIOS area, kernel text, ...) must carry very
 * specific protection bits no matter what the caller asked for; ioremap()
 * of BIOS memory is a typical offender. Strip from @prot every bit that one
 * of the static protection checks forbids for the given range and return
 * the sanitized value.
 *
 * @start/@npg describe the virtual range, @pfn/@npg the physical one.
 * @lpsize is the size of the large page being examined; it only decides
 * whether the "Text RO" check is applied. Every conflict found is reported
 * through check_conflict() at @warnlvl.
 */
static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
                                          unsigned long pfn, unsigned long npg,
                                          unsigned long lpsize, int warnlvl)
{
        unsigned long end, last_pfn;
        pgprotval_t deny, bits;
        bool whole_large_page;

        /* A !PRESENT request cannot run into RW/NX conflicts: nothing to do. */
        if (!(pgprot_val(prot) & _PAGE_PRESENT))
                return prot;

        /* Inclusive ends of the virtual range and of the PFN range */
        end = start + npg * PAGE_SIZE - 1;
        last_pfn = pfn + npg - 1;

        /* Checked against the virtual address */
        bits = protect_kernel_text(start, end);
        check_conflict(warnlvl, prot, bits, start, end, pfn, "Text NX");
        deny = bits;

        /*
         * Large page preservation special case: when the change spans the
         * complete, aligned large page there is no point in splitting it,
         * so the RO check is skipped. This happens with ftrace and is meant
         * to go away once ftrace has switched to text_poke().
         */
        whole_large_page = lpsize == (npg * PAGE_SIZE) &&
                           !(start & (lpsize - 1));
        if (!whole_large_page) {
                bits = protect_kernel_text_ro(start, end);
                check_conflict(warnlvl, prot, bits, start, end, pfn, "Text RO");
                deny |= bits;
        }

        /* The remaining checks look at the PFN range directly */
        bits = protect_pci_bios(pfn, last_pfn);
        check_conflict(warnlvl, prot, bits, start, end, pfn, "PCIBIOS NX");
        deny |= bits;

        bits = protect_rodata(pfn, last_pfn);
        check_conflict(warnlvl, prot, bits, start, end, pfn, "Rodata RO");
        deny |= bits;

        return __pgprot(pgprot_val(prot) & ~deny);
}

/*
 * Check a protection change against strict W^X semantics.
 *
 * @old and @new are the protections before and after the change; @start,
 * @pfn and @npg describe the affected range and are only used for the
 * diagnostic. @nx and @rw are the effective NX/RW state contributed by the
 * non-leaf page table entries above the mapping.
 *
 * A one-time warning is emitted when the change makes the range writable
 * and executable at the same time. The function always returns @new.
 */
static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
                                  unsigned long pfn, unsigned long npg,
                                  bool nx, bool rw)
{
        const pgprotval_t wx_bits = _PAGE_RW | _PAGE_NX;
        pgprotval_t was = pgprot_val(old);
        pgprotval_t want = pgprot_val(new);
        unsigned long end;

        /*
         * 32-bit has W+X issues that cannot be fixed, e.g. EFI code sharing
         * a page with writeable data, so neither detect nor enforce there.
         */
        if (IS_ENABLED(CONFIG_X86_32))
                return new;

        /* Without NX support there is nothing to verify. */
        if (!(__supported_pte_mask & _PAGE_NX))
                return new;

        /* Neither RW nor NX is being changed. */
        if (((was ^ want) & wx_bits) == 0)
                return new;

        /* Only "RW set and NX clear" is a W+X mapping. */
        if ((want & wx_bits) != _PAGE_RW)
                return new;

        /* A non-leaf entry may already disable writing or execution. */
        if (nx || !rw)
                return new;

        end = start + npg * PAGE_SIZE - 1;
        WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
                  (unsigned long long)was,
                  (unsigned long long)want,
                  start, end, pfn);

        /*
         * Every permission change is still let through for now by handing
         * back the requested protections. Returning 'old' here instead
         * would actively refuse the change at a later time.
         */
        return new;
}

/*
 * Lookup the page table entry for a virtual address in a specific pgd.
 *
 * @pgd:     top-level entry covering @address
 * @address: virtual address to look up
 * @level:   starts as PG_LEVEL_NONE and is raised to the level of each
 *           lower entry the walk reaches (512G, 1G, 2M, 4K)
 * @nx:      set to true if any non-leaf entry walked has _PAGE_NX set
 * @rw:      set to true only if every non-leaf entry walked has _PAGE_RW
 *           set
 *
 * Returns a pointer to the entry where the walk ended, cast to pte_t *:
 * a leaf or non-present P4D/PUD/PMD entry, or the PTE slot for @address.
 * Returns NULL if a "none" entry is met on the way down.
 *
 * The flag tests are normalized to 0/1 before being folded into the bool
 * accumulators. Folding the raw masked value is wrong for @rw: a bool
 * holding true is 1, _PAGE_RW is a different bit, so "true & _PAGE_RW"
 * is 0 and @rw would end up false no matter what the tables contain.
 */
pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
                                  unsigned int *level, bool *nx, bool *rw)
{
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        *level = PG_LEVEL_NONE;
        *nx = false;
        *rw = true;

        if (pgd_none(*pgd))
                return NULL;

        *nx |= !!(pgd_flags(*pgd) & _PAGE_NX);
        *rw &= !!(pgd_flags(*pgd) & _PAGE_RW);

        p4d = p4d_offset(pgd, address);
        if (p4d_none(*p4d))
                return NULL;

        *level = PG_LEVEL_512G;
        /* Leaf and !present entries are returned without descending further */
        if (p4d_leaf(*p4d) || !p4d_present(*p4d))
                return (pte_t *)p4d;

        *nx |= !!(p4d_flags(*p4d) & _PAGE_NX);
        *rw &= !!(p4d_flags(*p4d) & _PAGE_RW);

        pud = pud_offset(p4d, address);
        if (pud_none(*pud))
                return NULL;

        *level = PG_LEVEL_1G;
        if (pud_leaf(*pud) || !pud_present(*pud))
                return (pte_t *)pud;

        *nx |= !!(pud_flags(*pud) & _PAGE_NX);
        *rw &= !!(pud_flags(*pud) & _PAGE_RW);

        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                return NULL;

        *level = PG_LEVEL_2M;
        if (pmd_leaf(*pmd) || !pmd_present(*pmd))
                return (pte_t *)pmd;

        *nx |= !!(pmd_flags(*pmd) & _PAGE_NX);
        *rw &= !!(pmd_flags(*pmd) & _PAGE_RW);

        *level = PG_LEVEL_4K;

        return pte_offset_kernel(pmd, address);
}

/*
 * Same lookup as lookup_address_in_pgd_attr() for callers that only want
 * the entry pointer and the mapping level; the NX/RW results are dropped.
 */
pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
                             unsigned int *level)
{
        bool unused_nx, unused_rw;
        pte_t *entry;

        entry = lookup_address_in_pgd_attr(pgd, address, level,
                                           &unused_nx, &unused_rw);
        return entry;
}

/*
 * Lookup the page table entry for a virtual address. Return a pointer
 * to the entry and the level of the mapping.
 *
 * Note: We return pud and pmd either when the entry is marked large
 * or when the present bit is not set. Otherwise we would return a
 * pointer to a nonexisting mapping.
 */
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
        return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);

/*
 * Attribute-aware lookup for a CPA request: walk the page tables given in
 * cpa->pgd when one is set, the kernel page tables otherwise.
 */
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
                                  unsigned int *level, bool *nx, bool *rw)
{
        pgd_t *top = cpa->pgd ? cpa->pgd + pgd_index(address)
                              : pgd_offset_k(address);

        return lookup_address_in_pgd_attr(top, address, level, nx, rw);
}

/*
 * Find the PMD entry for a kernel virtual address.
 *
 * Returns a pointer to the PMD slot, or NULL when the walk cannot reach
 * the PMD level: the PGD entry is none, or the P4D or PUD entry is none,
 * a leaf, or not present.
 */
pmd_t *lookup_pmd_address(unsigned long address)
{
        pgd_t *pgd = pgd_offset_k(address);
        p4d_t *p4d;
        pud_t *pud;

        if (pgd_none(*pgd))
                return NULL;

        p4d = p4d_offset(pgd, address);
        if (p4d_none(*p4d))
                return NULL;
        if (p4d_leaf(*p4d) || !p4d_present(*p4d))
                return NULL;

        pud = pud_offset(p4d, address);
        if (pud_none(*pud))
                return NULL;
        if (pud_leaf(*pud) || !pud_present(*pud))
                return NULL;

        return pmd_offset(pud, address);
}

/*
 * This is necessary because __pa() does not work on some
 * kinds of memory, like vmalloc() or the alloc_remap()
 * areas on 32-bit NUMA systems.  The percpu areas can
 * end up in this kind of memory, for instance.
 *
 * Note that as long as the PTEs are well-formed with correct PFNs, this
 * works without checking the PRESENT bit in the leaf PTE.  This is unlike
 * the similar vmalloc_to_page() and derivatives.  Callers may depend on
 * this behavior.
 *
 * This could be optimized, but it is only used in paths that are not perf
 * sensitive, and keeping it unoptimized should increase the testing coverage
 * for the more obscure platforms.
 */
phys_addr_t slow_virt_to_phys(void *__virt_addr)
{
        unsigned long virt_addr = (unsigned long)__virt_addr;
        phys_addr_t phys_addr;
        unsigned long offset;
        enum pg_level level;
        pte_t *pte;

        pte = lookup_address(virt_addr, &level);
        BUG_ON(!pte);

        /*
         * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
         * before being left-shifted PAGE_SHIFT bits -- this trick is to
         * make 32-PAE kernel work correctly.
         */
        switch (level) {
        case PG_LEVEL_1G:
                phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
                offset = virt_addr & ~PUD_MASK;
                break;
        case PG_LEVEL_2M:
                phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
                offset = virt_addr & ~PMD_MASK;
                break;
        default:
                phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
                offset = virt_addr & ~PAGE_MASK;
        }

        return (phys_addr_t)(phys_addr | offset);
}
EXPORT_SYMBOL_GPL(slow_virt_to_phys);

/*
 * Install @pte at @kpte in init_mm and, on 32-bit kernels that do not
 * share the kernel PMD, in the matching PMD slot of every page directory
 * on pgd_list.
 */
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
        /* init_mm comes first */
        set_pte_atomic(kpte, pte);
#ifdef CONFIG_X86_32
        if (!SHARED_KERNEL_PMD) {
                struct page *pgd_page;

                /* then every other page directory that is known */
                list_for_each_entry(pgd_page, &pgd_list, lru) {
                        pgd_t *pgd = (pgd_t *)page_address(pgd_page) +
                                     pgd_index(address);
                        p4d_t *p4d = p4d_offset(pgd, address);
                        pud_t *pud = pud_offset(p4d, address);
                        pmd_t *pmd = pmd_offset(pud, address);

                        set_pte_atomic((pte_t *)pmd, pte);
                }
        }
#endif
}

/*
 * Drop _PAGE_GLOBAL from protections that are not present.
 *
 * For present PTEs _PAGE_GLOBAL means "global page", but the same bit
 * marks _PAGE_PROTNONE on non-present PTEs. Clearing it keeps a global
 * PTE that goes from present to non-present from being taken for a
 * _PAGE_PROTNONE one. Present protections are returned unchanged.
 */
static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
{
        if (pgprot_val(prot) & _PAGE_PRESENT)
                return prot;

        pgprot_val(prot) &= ~_PAGE_GLOBAL;
        return prot;
}

/*
 * Decide whether the large page mapping at @kpte, which covers @address,
 * has to be split in order to apply the attribute change described by
 * @cpa.
 *
 * Returns:
 *   1        the caller has to go down the split path. This is also the
 *            answer when a fresh lookup no longer yields @kpte.
 *   0        no split is needed: either the requested protections equal
 *            the current ones, or the whole large page was updated in
 *            place.
 *   -EINVAL  the entry found is neither a 2M nor a 1G mapping.
 *
 * Side effects on @cpa: numpages may be clamped to the end of the large
 * page, pfn is set to the PFN backing @address, force_static_prot is set
 * when the existing mapping violates the static protections, and
 * CPA_FLUSHTLB is ORed into flags when the large page entry was rewritten.
 *
 * should_split_large_page() calls this with pgd_lock held.
 */
static int __should_split_large_page(pte_t *kpte, unsigned long address,
                                     struct cpa_data *cpa)
{
        unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
        pgprot_t old_prot, new_prot, req_prot, chk_prot;
        pte_t new_pte, *tmp;
        enum pg_level level;
        bool nx, rw;

        /*
         * Check for races, another CPU might have split this page
         * up already:
         */
        tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
        if (tmp != kpte)
                return 1;

        /* Fetch current protections and base PFN in the level's own format */
        switch (level) {
        case PG_LEVEL_2M:
                old_prot = pmd_pgprot(*(pmd_t *)kpte);
                old_pfn = pmd_pfn(*(pmd_t *)kpte);
                cpa_inc_2m_checked();
                break;
        case PG_LEVEL_1G:
                old_prot = pud_pgprot(*(pud_t *)kpte);
                old_pfn = pud_pfn(*(pud_t *)kpte);
                cpa_inc_1g_checked();
                break;
        default:
                return -EINVAL;
        }

        psize = page_level_size(level);
        pmask = page_level_mask(level);

        /*
         * Calculate the number of pages, which fit into this large
         * page starting at address:
         */
        lpaddr = (address + psize) & pmask;
        numpages = (lpaddr - address) >> PAGE_SHIFT;
        if (numpages < cpa->numpages)
                cpa->numpages = numpages;

        /*
         * We are safe now. Check whether the new pgprot is the same:
         * Convert protection attributes to 4k-format, as cpa->mask* are set
         * up accordingly.
         */

        /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
        req_prot = pgprot_large_2_4k(old_prot);

        pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
        pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);

        /*
         * req_prot is in format of 4k pages. It must be converted to large
         * page format: the caching mode includes the PAT bit located at
         * different bit positions in the two formats.
         */
        req_prot = pgprot_4k_2_large(req_prot);
        req_prot = pgprot_clear_protnone_bits(req_prot);
        /* Only a present entry is marked as a large page again */
        if (pgprot_val(req_prot) & _PAGE_PRESENT)
                pgprot_val(req_prot) |= _PAGE_PSE;

        /*
         * old_pfn points to the large page base pfn. So we need to add the
         * offset of the virtual address:
         */
        pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
        cpa->pfn = pfn;

        /*
         * Calculate the large page base address and the number of 4K pages
         * in the large page
         */
        lpaddr = address & pmask;
        numpages = psize >> PAGE_SHIFT;

        /*
         * Sanity check that the existing mapping is correct versus the static
         * protections. static_protections() guards against !PRESENT, so no
         * extra conditional required here.
         */
        chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
                                      psize, CPA_CONFLICT);

        if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
                /*
                 * Split the large page and tell the split code to
                 * enforce static protections.
                 */
                cpa->force_static_prot = 1;
                return 1;
        }

        /*
         * Optimization: If the requested pgprot is the same as the current
         * pgprot, then the large page can be preserved and no updates are
         * required independent of alignment and length of the requested
         * range. The above already established that the current pgprot is
         * correct, which in consequence makes the requested pgprot correct
         * as well if it is the same. The static protection scan below will
         * not come to a different conclusion.
         */
        if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
                cpa_inc_lp_sameprot(level);
                return 0;
        }

        /*
         * If the requested range does not cover the full page, split it up
         */
        if (address != lpaddr || cpa->numpages != numpages)
                return 1;

        /*
         * Check whether the requested pgprot is conflicting with a static
         * protection requirement in the large page.
         */
        new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
                                      psize, CPA_DETECT);

        /* Warns on a W^X violation; the visible verify_rwx() returns new_prot as is */
        new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages,
                              nx, rw);

        /*
         * If there is a conflict, split the large page.
         *
         * There used to be a 4k wise evaluation trying really hard to
         * preserve the large pages, but experimentation has shown, that this
         * does not help at all. There might be corner cases which would
         * preserve one large page occasionally, but it's really not worth the
         * extra code and cycles for the common case.
         */
        if (pgprot_val(req_prot) != pgprot_val(new_prot))
                return 1;

        /* All checks passed. Update the large page mapping. */
        new_pte = pfn_pte(old_pfn, new_prot);
        __set_pmd_pte(kpte, address, new_pte);
        cpa->flags |= CPA_FLUSHTLB;
        cpa_inc_lp_preserved(level);
        return 0;
}

/*
 * Locked front end for __should_split_large_page().
 *
 * A request with cpa->force_split set is answered with 1 straight away.
 * Otherwise the decision is taken under pgd_lock and the helper's result
 * is passed through unchanged.
 */
static int should_split_large_page(pte_t *kpte, unsigned long address,
                                   struct cpa_data *cpa)
{
        int verdict = 1;

        if (!cpa->force_split) {
                spin_lock(&pgd_lock);
                verdict = __should_split_large_page(kpte, address, cpa);
                spin_unlock(&pgd_lock);
        }

        return verdict;
}

/*
 * Write one entry of the page table that replaces a large page.
 *
 * @pte is the slot to fill, @pfn the frame it maps, @ref_prot the
 * protections inherited from the large page, @address the virtual address
 * covered by the slot and @size the size of the region it maps.
 *
 * When cpa->force_static_prot is set (should_split_large_page() found an
 * inconsistent mapping) the inherited protections are run through
 * static_protections() and fixed up where that is possible.
 */
static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
                          pgprot_t ref_prot, unsigned long address,
                          unsigned long size)
{
        if (cpa->force_static_prot) {
                unsigned int npg = PFN_DOWN(size);
                pgprot_t fixed;

                /* lpsize = 0 enforces the protection mechanism */
                fixed = static_protections(ref_prot, address, pfn, npg, 0,
                                           CPA_PROTECT);

                if (pgprot_val(fixed) != pgprot_val(ref_prot)) {
                        /*
                         * A PMD split produces 4K entries that can be
                         * fixed on the spot. A PUD split cannot be fixed
                         * trivially: the freshly installed PMD mappings
                         * would have to be rescanned after
                         * split_large_page() returns so that a further
                         * split can allocate the PTE pages it needs. Warn
                         * for now and revisit if it ever happens.
                         */
                        if (size == PAGE_SIZE)
                                ref_prot = fixed;
                        else
                                pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
                }
        }

        set_pte(pte, pfn_pte(pfn, ref_prot));
}

/*
 * Replace the large page mapping at @kpte by a page table stored in @base
 * whose PTRS_PER_PTE entries reproduce the original mapping at the next
 * smaller page size (4K entries for a 2M page, 2M entries for a 1G page).
 *
 * Returns 0 once the new table is installed. Returns 1 without installing
 * anything when a fresh lookup no longer yields @kpte or when the entry is
 * neither a 2M nor a 1G mapping; split_large_page() frees @base in that
 * case.
 *
 * pgd_lock is taken here and held across the table install and the global
 * TLB flush.
 */
static int
__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
                   struct page *base)
{
        unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
        pte_t *pbase = (pte_t *)page_address(base);
        unsigned int i, level;
        pgprot_t ref_prot;
        bool nx, rw;
        pte_t *tmp;

        spin_lock(&pgd_lock);
        /*
         * Check for races, another CPU might have split this page
         * up for us already:
         */
        tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
        if (tmp != kpte) {
                spin_unlock(&pgd_lock);
                return 1;
        }

        paravirt_alloc_pte(&init_mm, page_to_pfn(base));

        /*
         * Per level: reference protections and PFN of the large page, its
         * base address, and the address/PFN stride between the new entries
         * (pfninc stays at its initial 1 for the 2M case).
         */
        switch (level) {
        case PG_LEVEL_2M:
                ref_prot = pmd_pgprot(*(pmd_t *)kpte);
                /*
                 * Clear PSE (aka _PAGE_PAT) and move
                 * PAT bit to correct position.
                 */
                ref_prot = pgprot_large_2_4k(ref_prot);
                ref_pfn = pmd_pfn(*(pmd_t *)kpte);
                lpaddr = address & PMD_MASK;
                lpinc = PAGE_SIZE;
                break;

        case PG_LEVEL_1G:
                ref_prot = pud_pgprot(*(pud_t *)kpte);
                ref_pfn = pud_pfn(*(pud_t *)kpte);
                pfninc = PMD_SIZE >> PAGE_SHIFT;
                lpaddr = address & PUD_MASK;
                lpinc = PMD_SIZE;
                /*
                 * Clear the PSE flags if the PRESENT flag is not set
                 * otherwise pmd_present/pmd_huge will return true
                 * even on a non present pmd.
                 */
                if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
                        pgprot_val(ref_prot) &= ~_PAGE_PSE;
                break;

        default:
                spin_unlock(&pgd_lock);
                return 1;
        }

        ref_prot = pgprot_clear_protnone_bits(ref_prot);

        /*
         * Get the target pfn from the original entry:
         */
        pfn = ref_pfn;
        for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
                split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);

        /* Account the split only when the address maps a pfn in a mapped range */
        if (virt_addr_valid(address)) {
                /* NOTE: this pfn shadows the function-scope pfn used above */
                unsigned long pfn = PFN_DOWN(__pa(address));

                if (pfn_range_is_mapped(pfn, pfn + 1))
                        split_page_count(level);
        }

        /*
         * Install the new, split up pagetable.
         *
         * We use the standard kernel pagetable protections for the new
         * pagetable protections, the actual ptes set above control the
         * primary protection behavior:
         */
        __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));

        /*
         * Do a global flush tlb after splitting the large page
         * and before we do the actual change page attribute in the PTE.
         *
         * Without this, we violate the TLB application note, that says:
         * "The TLBs may contain both ordinary and large-page
         *  translations for a 4-KByte range of linear addresses. This
         *  may occur if software modifies the paging structures so that
         *  the page size used for the address range changes. If the two
         *  translations differ with respect to page frame or attributes
         *  (e.g., permissions), processor behavior is undefined and may
         *  be implementation-specific."
         *
         * We do this global tlb flush inside the cpa_lock, so that we
         * don't allow any other cpu, with stale tlb entries change the
         * page attribute in parallel, that also falls into the
         * just split large page entry.
         */
        flush_tlb_all();
        spin_unlock(&pgd_lock);

        return 0;
}

/*
 * Allocate a page table page and use it to split the large page at @kpte.
 *
 * Unless debug_pagealloc is enabled, cpa_lock is dropped around the
 * allocation and re-taken afterwards (presumably because a GFP_KERNEL
 * allocation may sleep).
 *
 * Returns -ENOMEM if no page could be allocated and 0 otherwise. That
 * includes the case where __split_large_page() did not install the new
 * table; the page is simply freed again then.
 */
static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
                            unsigned long address)
{
        struct page *table_page;
        int not_installed;

        if (!debug_pagealloc_enabled())
                spin_unlock(&cpa_lock);
        table_page = alloc_pages(GFP_KERNEL, 0);
        if (!debug_pagealloc_enabled())
                spin_lock(&cpa_lock);

        if (!table_page)
                return -ENOMEM;

        not_installed = __split_large_page(cpa, kpte, address, table_page);
        if (not_installed)
                __free_page(table_page);

        return 0;
}

/*
 * Free the page holding the PTE table at @pte if none of its PTRS_PER_PTE
 * entries is in use. Returns true when the page was freed.
 */
static bool try_to_free_pte_page(pte_t *pte)
{
        pte_t *slot = pte;
        pte_t *stop = pte + PTRS_PER_PTE;

        for (; slot < stop; slot++) {
                if (!pte_none(*slot))
                        return false;
        }

        free_page((unsigned long)pte);
        return true;
}

/*
 * Free the page holding the PMD table at @pmd if none of its PTRS_PER_PMD
 * entries is in use. Returns true when the page was freed.
 */
static bool try_to_free_pmd_page(pmd_t *pmd)
{
        pmd_t *slot = pmd;
        pmd_t *stop = pmd + PTRS_PER_PMD;

        for (; slot < stop; slot++) {
                if (!pmd_none(*slot))
                        return false;
        }

        free_page((unsigned long)pmd);
        return true;
}

/*
 * Clear the PTEs covering [@start, @end) below @pmd, one per page.
 *
 * Afterwards the PTE page is freed if it has become completely empty, in
 * which case @pmd is cleared as well and true is returned.
 */
static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
{
        pte_t *pte = pte_offset_kernel(pmd, start);

        for (; start < end; start += PAGE_SIZE, pte++)
                set_pte(pte, __pte(0));

        if (!try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd)))
                return false;

        pmd_clear(pmd);
        return true;
}

/*
 * Unmap the 4K pages of [@start, @end) below @pmd. If that released the
 * PTE page, also try to release the PMD page and clear @pud on success.
 */
static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
                              unsigned long start, unsigned long end)
{
        if (!unmap_pte_range(pmd, start, end))
                return;

        if (try_to_free_pmd_page(pud_pgtable(*pud)))
                pud_clear(pud);
}

/*
 * Unmap [@start, @end) below @pud in three steps: the part before the
 * first 2M boundary, then whole 2M chunks, then whatever 4K pages remain.
 */
static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, start);

        /* Unaligned head: unmap up to the next 2M boundary, or @end. */
        if (start & (PMD_SIZE - 1)) {
                unsigned long boundary = (start + PMD_SIZE) & PMD_MASK;
                unsigned long head_end = min_t(unsigned long, end, boundary);

                __unmap_pmd_range(pud, pmd, start, head_end);

                start = head_end;
                pmd++;
        }

        /* Whole 2M chunks: a leaf entry is cleared in a single step. */
        for (; end - start >= PMD_SIZE; start += PMD_SIZE, pmd++) {
                if (pmd_leaf(*pmd)) {
                        pmd_clear(pmd);
                        continue;
                }
                __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
        }

        /* 4K leftovers behind the last full chunk */
        if (start < end) {
                __unmap_pmd_range(pud, pmd, start, end);
                return;
        }

        /* One more attempt to free the PMD page if that did not work above. */
        if (!pud_none(*pud) && try_to_free_pmd_page(pud_pgtable(*pud)))
                pud_clear(pud);
}

/*
 * Unmap [@start, @end) below @p4d in three steps: the part before the
 * first 1G boundary, then whole 1G chunks, then the remainder.
 *
 * The PUD page itself is not freed here; that is left to the error path
 * of populate_pgd().
 */
static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
{
        pud_t *pud = pud_offset(p4d, start);

        /* Unaligned head: unmap up to the next 1G boundary, or @end. */
        if (start & (PUD_SIZE - 1)) {
                unsigned long boundary = (start + PUD_SIZE) & PUD_MASK;
                unsigned long head_end = min_t(unsigned long, end, boundary);

                unmap_pmd_range(pud, start, head_end);

                start = head_end;
                pud++;
        }

        /* Whole 1G chunks: a leaf entry is cleared in a single step. */
        for (; end - start >= PUD_SIZE; start += PUD_SIZE, pud++) {
                if (pud_leaf(*pud)) {
                        pud_clear(pud);
                        continue;
                }
                unmap_pmd_range(pud, start, start + PUD_SIZE);
        }

        /* 2M leftovers behind the last full chunk */
        if (start < end)
                unmap_pmd_range(pud, start, end);
}

/*
 * Allocate a zeroed page for a PTE table and hook it into @pmd with
 * _KERNPG_TABLE protections. Returns 0 on success, -1 if the allocation
 * failed.
 */
static int alloc_pte_page(pmd_t *pmd)
{
        unsigned long table = get_zeroed_page(GFP_KERNEL);

        if (!table)
                return -1;

        set_pmd(pmd, __pmd(__pa(table) | _KERNPG_TABLE));
        return 0;
}

/*
 * Allocate a zeroed page for a PMD table and hook it into @pud with
 * _KERNPG_TABLE protections. Returns 0 on success, -1 if the allocation
 * failed.
 */
static int alloc_pmd_page(pud_t *pud)
{
        unsigned long table = get_zeroed_page(GFP_KERNEL);

        if (!table)
                return -1;

        set_pud(pud, __pud(__pa(table) | _KERNPG_TABLE));
        return 0;
}

/*
 * Fill 4K PTEs below @pmd starting at @start, mapping consecutive PFNs
 * taken from cpa->pfn with @pgprot (after pgprot_clear_protnone_bits()).
 *
 * Stops after @num_pages entries or when @end is reached, whichever comes
 * first. cpa->pfn is advanced by one for every entry written.
 */
static void populate_pte(struct cpa_data *cpa,
                         unsigned long start, unsigned long end,
                         unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
{
        pte_t *pte = pte_offset_kernel(pmd, start);

        pgprot = pgprot_clear_protnone_bits(pgprot);

        for (; num_pages && start < end; num_pages--) {
                set_pte(pte, pfn_pte(cpa->pfn, pgprot));

                cpa->pfn++;
                pte++;
                start += PAGE_SIZE;
        }
}

/*
 * Map up to @num_pages pages of [@start, @end) below @pud, taking PFNs
 * from cpa->pfn: 4K PTEs up to the first 2M boundary, a 2M leaf entry for
 * every full 2M chunk, and 4K PTEs again for the tail.
 *
 * Returns -1 if a page table page could not be allocated. Otherwise the
 * result is cur_pages when the unaligned head already covered @num_pages,
 * and @num_pages in all other cases.
 */
static long populate_pmd(struct cpa_data *cpa,
                         unsigned long start, unsigned long end,
                         unsigned num_pages, pud_t *pud, pgprot_t pgprot)
{
        long cur_pages = 0;
        pmd_t *pmd;
        pgprot_t pmd_pgprot;

        /*
         * Not on a 2M boundary?
         */
        if (start & (PMD_SIZE - 1)) {
                unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
                unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;

                /* The head ends at the next 2M boundary or at the request's end */
                pre_end   = min_t(unsigned long, pre_end, next_page);
                cur_pages = (pre_end - start) >> PAGE_SHIFT;
                cur_pages = min_t(unsigned int, num_pages, cur_pages);

                /*
                 * Need a PTE page?
                 */
                pmd = pmd_offset(pud, start);
                if (pmd_none(*pmd))
                        if (alloc_pte_page(pmd))
                                return -1;

                populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);

                start = pre_end;
        }

        /*
         * We mapped them all?
         */
        if (num_pages == cur_pages)
                return cur_pages;

        /* The 2M entries need the protections in large page format */
        pmd_pgprot = pgprot_4k_2_large(pgprot);

        while (end - start >= PMD_SIZE) {

                /*
                 * We cannot use a 1G page so allocate a PMD page if needed.
                 */
                if (pud_none(*pud))
                        if (alloc_pmd_page(pud))
                                return -1;

                pmd = pmd_offset(pud, start);

                set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
                                        canon_pgprot(pmd_pgprot))));

                start          += PMD_SIZE;
                cpa->pfn  += PMD_SIZE >> PAGE_SHIFT;
                cur_pages += PMD_SIZE >> PAGE_SHIFT;
        }

        /*
         * Map trailing 4K pages.
         */
        if (start < end) {
                pmd = pmd_offset(pud, start);
                if (pmd_none(*pmd))
                        if (alloc_pte_page(pmd))
                                return -1;

                populate_pte(cpa, start, end, num_pages - cur_pages,
                             pmd, pgprot);
        }
        return num_pages;
}

/*
 * Map cpa->numpages pages starting at @start below @p4d, taking PFNs from
 * cpa->pfn: smaller pages (via populate_pmd()) up to the first 1G
 * boundary, 1G leaf entries for full 1G chunks when the CPU has
 * X86_FEATURE_GBPAGES, and smaller pages again for the remainder.
 *
 * Returns the number of pages accounted as mapped, or a negative value
 * when an allocation failed in the unaligned head or when allocating the
 * PMD page for the tail.
 */
static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
                        pgprot_t pgprot)
{
        pud_t *pud;
        unsigned long end;
        long cur_pages = 0;
        pgprot_t pud_pgprot;

        end = start + (cpa->numpages << PAGE_SHIFT);

        /*
         * Not on a Gb page boundary? => map everything up to it with
         * smaller pages.
         */
        if (start & (PUD_SIZE - 1)) {
                unsigned long pre_end;
                unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;

                pre_end   = min_t(unsigned long, end, next_page);
                cur_pages = (pre_end - start) >> PAGE_SHIFT;
                cur_pages = min_t(int, (int)cpa->numpages, cur_pages);

                pud = pud_offset(p4d, start);

                /*
                 * Need a PMD page?
                 */
                if (pud_none(*pud))
                        if (alloc_pmd_page(pud))
                                return -1;

                cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
                                         pud, pgprot);
                if (cur_pages < 0)
                        return cur_pages;

                start = pre_end;
        }

        /* We mapped them all? */
        if (cpa->numpages == cur_pages)
                return cur_pages;

        pud = pud_offset(p4d, start);
        /* The 1G entries need the protections in large page format */
        pud_pgprot = pgprot_4k_2_large(pgprot);

        /*
         * Map everything starting from the Gb boundary, possibly with 1G pages
         */
        while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
                set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
                                   canon_pgprot(pud_pgprot))));

                start          += PUD_SIZE;
                cpa->pfn  += PUD_SIZE >> PAGE_SHIFT;
                cur_pages += PUD_SIZE >> PAGE_SHIFT;
                pud++;
        }

        /* Map trailing leftover */
        if (start < end) {
                long tmp;

                pud = pud_offset(p4d, start);
                if (pud_none(*pud))
                        if (alloc_pmd_page(pud))
                                return -1;

                tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
                                   pud, pgprot);
                /*
                 * NOTE(review): a populate_pmd() failure here is reported
                 * as the page count mapped so far instead of as an error,
                 * unlike the head path above -- confirm this is intended.
                 */
                if (tmp < 0)
                        return cur_pages;

                cur_pages += tmp;
        }
        return cur_pages;
}

/*
 * Build mappings for cpa->numpages pages starting at @addr in the
 * alternate page table given by cpa->pgd. The restrictions that hold for
 * the kernel page table do not necessarily apply there.
 *
 * Missing P4D and PUD pages are allocated on the way down. The requested
 * protections are _KERNPG_TABLE with cpa->mask_clr removed and
 * cpa->mask_set added.
 *
 * Returns 0 on success, with cpa->numpages updated to the number of pages
 * reported by populate_pud(), and a negative value on failure.
 */
static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
{
        pgd_t *pgd_entry = cpa->pgd + pgd_index(addr);
        pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
        p4d_t *p4d;
        long mapped;

        if (pgd_none(*pgd_entry)) {
                p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
                if (!p4d)
                        return -1;

                set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
        }

        /* Make sure there is a PUD page that can be handed down for mapping. */
        p4d = p4d_offset(pgd_entry, addr);
        if (p4d_none(*p4d)) {
                pud_t *pud = (pud_t *)get_zeroed_page(GFP_KERNEL);

                if (!pud)
                        return -1;

                set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
        }

        pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
        pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);

        mapped = populate_pud(cpa, addr, p4d, pgprot);
        if (mapped >= 0) {
                cpa->numpages = mapped;
                return 0;
        }

        /*
         * The PUD page stays where it is, since another CPU or thread may
         * have found it already. Only the useless entries just added to
         * it are removed again.
         */
        unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT));
        return mapped;
}

/*
 * Handle the case where no usable PTE was found for @vaddr.
 *
 * @cpa:     request descriptor; numpages (and possibly pfn) are updated
 * @vaddr:   virtual address whose lookup failed
 * @primary: non-zero when called for the primary mapping
 *
 * Returns the result of populate_pgd() when an alternate PGD is in use,
 * 0 when the hole can be ignored, and -EFAULT otherwise.
 */
static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
                               int primary)
{
        /*
         * Right now, we only execute this code path when mapping
         * the EFI virtual memory map regions, no other users
         * provide a ->pgd value. This may change in the future.
         */
        if (cpa->pgd)
                return populate_pgd(cpa, vaddr);

        /* Non-primary (alias) paths simply skip over the missing page. */
        if (!primary) {
                cpa->numpages = 1;
                return 0;
        }

        /*
         * The kernel identity mapping is expected to have holes, so a NULL
         * PTE there is not an error. Account for exactly one virtual page
         * and record its pfn. TBD: numpages can be set based on the initial
         * value and the level returned by lookup_address().
         */
        if (within(vaddr, PAGE_OFFSET,
                   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
                cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
                cpa->numpages = 1;
                return 0;
        }

        /* Faults in the highmap are OK, so only warn outside of it: */
        if (!__cpa_pfn_in_highmap(cpa->pfn))
                WARN(1, KERN_WARNING "CPA: called for zero pte. "
                        "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
                        *cpa->vaddr);

        return -EFAULT;
}

/*
 * __change_page_attr - apply cpa->mask_set / cpa->mask_clr to the mapping of
 * the address selected by cpa->curpage.
 *
 * @cpa:     request descriptor
 * @primary: handed to __cpa_process_fault() when no usable PTE is found
 *
 * For a 4K mapping the PTE is rewritten in place (keeping its pfn),
 * cpa->pfn is recorded, CPA_FLUSHTLB is set in cpa->flags if the PTE value
 * changed, and cpa->numpages is set to 1. For a larger mapping,
 * should_split_large_page() decides whether a split is required; after a
 * successful split_large_page() the lookup is retried.
 *
 * Returns 0 on success; otherwise the result of __cpa_process_fault(), a
 * non-positive result of should_split_large_page(), or the error from
 * split_large_page().
 */
static int __change_page_attr(struct cpa_data *cpa, int primary)
{
        unsigned long address;
        int do_split, err;
        unsigned int level;
        pte_t *kpte, old_pte;
        bool nx, rw;

        address = __cpa_addr(cpa, cpa->curpage);
repeat:
        /* Re-entered after a successful large page split. */
        kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
        if (!kpte)
                return __cpa_process_fault(cpa, address, primary);

        /* An empty entry is treated like a missing one. */
        old_pte = *kpte;
        if (pte_none(old_pte))
                return __cpa_process_fault(cpa, address, primary);

        if (level == PG_LEVEL_4K) {
                pte_t new_pte;
                pgprot_t old_prot = pte_pgprot(old_pte);
                pgprot_t new_prot = pte_pgprot(old_pte);
                unsigned long pfn = pte_pfn(old_pte);

                /* Clear first, then set, so mask_set wins on overlap. */
                pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
                pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

                cpa_inc_4k_install();
                /* Hand in lpsize = 0 to enforce the protection mechanism */
                new_prot = static_protections(new_prot, address, pfn, 1, 0,
                                              CPA_PROTECT);

                new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1,
                                      nx, rw);

                new_prot = pgprot_clear_protnone_bits(new_prot);

                /*
                 * We need to keep the pfn from the existing PTE,
                 * after all we're only going to change its attributes
                 * not the memory it points to
                 */
                new_pte = pfn_pte(pfn, new_prot);
                cpa->pfn = pfn;
                /*
                 * Do we really change anything ?
                 */
                if (pte_val(old_pte) != pte_val(new_pte)) {
                        set_pte_atomic(kpte, new_pte);
                        cpa->flags |= CPA_FLUSHTLB;
                }
                /* Exactly one 4K page was processed. */
                cpa->numpages = 1;
                return 0;
        }

        /*
         * Check, whether we can keep the large page intact
         * and just change the pte:
         */
        do_split = should_split_large_page(kpte, address, cpa);
        /*
         * When the range fits into the existing large page,
         * return. cp->numpages and cpa->tlbflush have been updated in
         * try_large_page:
         *
         * NOTE(review): the names used in this comment look stale; confirm
         * against should_split_large_page(). A negative do_split is
         * returned to the caller as an error code.
         */
        if (do_split <= 0)
                return do_split;

        /*
         * We have to split the large page:
         */
        err = split_large_page(cpa, kpte, address);
        if (!err)
                goto repeat;

        return err;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary);

/*
 * Check the directmap and "high kernel map" 'aliases'.
 *
 * Called after the primary mapping of cpa->pfn has been changed. The same
 * attribute change is applied to the direct mapping of that pfn (unless the
 * primary address already lies in the direct mapping) and, on CONFIG_X86_64,
 * to the high kernel mapping (unless the primary address lies within
 * [_text, _brk_end) or the pfn is not in the highmap). When NX is supported,
 * _PAGE_NX is removed from both masks used for the alias updates.
 *
 * Returns 0 when nothing had to be done or everything succeeded, or the
 * error from the direct-mapping update. Errors from the high-mapping update
 * are deliberately ignored.
 */
static int cpa_process_alias(struct cpa_data *cpa)
{
        struct cpa_data alias_cpa;
        /* Direct-mapping address of the pfn touched by the primary call. */
        unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
        unsigned long vaddr;
        int ret;

        /* Nothing to do when the pfn is not within a mapped pfn range. */
        if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
                return 0;

        /*
         * No need to redo, when the primary call touched the direct
         * mapping already:
         */
        vaddr = __cpa_addr(cpa, cpa->curpage);
        if (!(within(vaddr, PAGE_OFFSET,
                    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {

                /*
                 * Clone the request, but point it at the single alias
                 * address and drop the array flags.
                 */
                alias_cpa = *cpa;
                alias_cpa.vaddr = &laddr;
                alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
                alias_cpa.curpage = 0;

                /* Directmap always has NX set, do not modify. */
                if (__supported_pte_mask & _PAGE_NX) {
                        alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
                        alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
                }

                /* Set on the caller's descriptor, not on the clone. */
                cpa->force_flush_all = 1;

                /* primary == 0: alias updates do not recurse into aliases. */
                ret = __change_page_attr_set_clr(&alias_cpa, 0);
                if (ret)
                        return ret;
        }

#ifdef CONFIG_X86_64
        /*
         * If the primary call didn't touch the high mapping already
         * and the physical address is inside the kernel map, we need
         * to touch the high mapped kernel as well:
         */
        if (!within(vaddr, (unsigned long)_text, _brk_end) &&
            __cpa_pfn_in_highmap(cpa->pfn)) {
                /* High-mapping virtual address corresponding to the pfn. */
                unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
                                               __START_KERNEL_map - phys_base;
                alias_cpa = *cpa;
                alias_cpa.vaddr = &temp_cpa_vaddr;
                alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
                alias_cpa.curpage = 0;

                /*
                 * [_text, _brk_end) also covers data, do not modify NX except
                 * in cases where the highmap is the primary target.
                 */
                if (__supported_pte_mask & _PAGE_NX) {
                        alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
                        alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
                }

                cpa->force_flush_all = 1;
                /*
                 * The high mapping range is imprecise, so ignore the
                 * return value.
                 */
                __change_page_attr_set_clr(&alias_cpa, 0);
        }
#endif

        return 0;
}

/*
 * Walk the request described by @cpa, applying the attribute change chunk by
 * chunk until all cpa->numpages pages have been processed.
 *
 * @cpa:     request descriptor; curpage is advanced, numpages is restored to
 *           its original value before returning from the loop
 * @primary: non-zero for the primary mapping; enables alias processing
 *           unless CPA_NO_CHECK_ALIAS is set
 *
 * Returns 0 on success or the first error from __change_page_attr() or
 * cpa_process_alias().
 */
static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary)
{
        unsigned long total = cpa->numpages;
        unsigned long left = total;
        int ret = 0;

        /* Nothing to set, nothing to clear and no forced split: done. */
        if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) &&
            !cpa->force_split)
                return ret;

        while (left) {
                /*
                 * Publish the remaining page count for the large page
                 * preservation check. Array requests cannot use large
                 * pages, so they are handled one page at a time.
                 */
                if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
                        cpa->numpages = 1;
                else
                        cpa->numpages = left;

                /* cpa_lock is only taken when debug_pagealloc is off. */
                if (!debug_pagealloc_enabled())
                        spin_lock(&cpa_lock);
                ret = __change_page_attr(cpa, primary);
                if (!debug_pagealloc_enabled())
                        spin_unlock(&cpa_lock);
                if (ret)
                        break;

                if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) {
                        ret = cpa_process_alias(cpa);
                        if (ret)
                                break;
                }

                /*
                 * cpa->numpages now holds how many pages this step
                 * covered: either a preserved large page or a single
                 * page. It must be non-zero and must not exceed what
                 * was left.
                 */
                BUG_ON(cpa->numpages > left || !cpa->numpages);
                left -= cpa->numpages;
                cpa->curpage += cpa->numpages;
        }

        /* Restore the original numpages */
        cpa->numpages = total;
        return ret;
}

/*
 * Common entry point for changing page attributes.
 *
 * @addr:        pointer to the start address, or an array of @numpages
 *               addresses when CPA_ARRAY is set in @in_flag
 * @numpages:    number of pages to change
 * @mask_set:    protection bits to set (filtered through canon_pgprot())
 * @mask_clr:    protection bits to clear
 * @force_split: non-zero to process the range even with empty masks
 * @in_flag:     CPA_* flags for the request
 * @pages:       page array stored in the request descriptor
 *
 * Unaligned addresses are rounded down to a page boundary with a one-time
 * warning. After the change, a flush is issued only if CPA_FLUSHTLB was set
 * in the request flags; on error everything is flushed.
 *
 * Returns 0 if there is nothing to do, otherwise the result of
 * __change_page_attr_set_clr().
 */
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
                                    pgprot_t mask_set, pgprot_t mask_clr,
                                    int force_split, int in_flag,
                                    struct page **pages)
{
        struct cpa_data cpa;
        int ret, cache;

        memset(&cpa, 0, sizeof(cpa));

        /*
         * Setting a feature that is not supported is filtered out here;
         * clearing non-supported features is OK.
         */
        mask_set = canon_pgprot(mask_set);

        if (!(pgprot_val(mask_set) || pgprot_val(mask_clr) || force_split))
                return 0;

        /* Ensure we are PAGE_SIZE aligned */
        if (in_flag & CPA_ARRAY) {
                int idx;

                for (idx = 0; idx < numpages; idx++) {
                        if (!(addr[idx] & ~PAGE_MASK))
                                continue;
                        addr[idx] &= PAGE_MASK;
                        WARN_ON_ONCE(1);
                }
        } else if (!(in_flag & CPA_PAGES_ARRAY) && (*addr & ~PAGE_MASK)) {
                /*
                 * in_flag of CPA_PAGES_ARRAY implies it is aligned, so
                 * *addr is only inspected for plain address requests.
                 * People should not be passing in unaligned addresses:
                 */
                *addr &= PAGE_MASK;
                WARN_ON_ONCE(1);
        }

        /* Must avoid aliasing mappings in the highmem code */
        kmap_flush_unused();

        vm_unmap_aliases();

        cpa.vaddr = addr;
        cpa.pages = pages;
        cpa.numpages = numpages;
        cpa.mask_set = mask_set;
        cpa.mask_clr = mask_clr;
        cpa.flags = in_flag;
        cpa.curpage = 0;
        cpa.force_split = force_split;

        ret = __change_page_attr_set_clr(&cpa, 1);

        /* Skip the flush entirely when nothing was really changed. */
        if (!(cpa.flags & CPA_FLUSHTLB))
                return ret;

        /*
         * The cache flag handed to the flush helpers is non-zero only
         * when mask_set carries a caching attribute.
         */
        cache = !!pgprot2cachemode(mask_set);

        /* On error; flush everything to be sure. */
        if (ret)
                cpa_flush_all(cache);
        else
                cpa_flush(&cpa, cache);

        return ret;
}

/*
 * Set the bits of @mask without clearing anything. A non-zero @array marks
 * @addr as an array of addresses (CPA_ARRAY).
 */
static inline int change_page_attr_set(unsigned long *addr, int numpages,
                                       pgprot_t mask, int array)
{
        int flags = 0;

        if (array)
                flags = CPA_ARRAY;

        return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
                                        flags, NULL);
}

/*
 * Clear the bits of @mask without setting anything. A non-zero @array marks
 * @addr as an array of addresses (CPA_ARRAY).
 */
static inline int change_page_attr_clear(unsigned long *addr, int numpages,
                                         pgprot_t mask, int array)
{
        int flags = 0;

        if (array)
                flags = CPA_ARRAY;

        return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
                                        flags, NULL);
}

/*
 * Set the bits of @mask for the @numpages entries of the @pages array
 * (CPA_PAGES_ARRAY request, no address pointer).
 */
static inline int cpa_set_pages_array(struct page **pages, int numpages,
                                       pgprot_t mask)
{
        int rc;

        rc = change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
                                      CPA_PAGES_ARRAY, pages);
        return rc;
}

/*
 * Clear the bits of @mask for the @numpages entries of the @pages array
 * (CPA_PAGES_ARRAY request, no address pointer).
 */
static inline int cpa_clear_pages_array(struct page **pages, int numpages,
                                         pgprot_t mask)
{
        int rc;

        rc = change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
                                      CPA_PAGES_ARRAY, pages);
        return rc;
}

/*
 * __set_memory_prot is an internal helper for callers that have been passed
 * a pgprot_t value from upper layers and for which a reservation has already
 * been taken. To set one specific page protection, use the set_memory_xx()
 * functions instead.
 *
 * Every bit present in @prot is set and every bit absent from @prot is
 * cleared.
 */
int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
{
        pgprot_t clr = __pgprot(~pgprot_val(prot));

        return change_page_attr_set_clr(&addr, numpages, prot, clr, 0, 0,
                                        NULL);
}

/*
 * Set the UC- cache mode bits on @numpages pages at @addr, without touching
 * any memtype reservation.
 */
int _set_memory_uc(unsigned long addr, int numpages)
{
        int rc;

        /*
         * for now UC MINUS. see comments in ioremap()
         * If you really need strong UC use ioremap_uc(), but note
         * that you cannot override IO areas with set_memory_*() as
         * these helpers cannot work with IO memory.
         */
        rc = change_page_attr_set(&addr, numpages,
                                  cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
                                  0);
        return rc;
}

/*
 * Reserve the physical range as UC- and switch its mapping to UC-.
 *
 * Returns 0 on success. If the reservation fails its error is returned; if
 * the attribute change fails the reservation is released and that error is
 * returned.
 */
int set_memory_uc(unsigned long addr, int numpages)
{
        int ret;

        /*
         * for now UC MINUS. see comments in ioremap()
         */
        ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
                              _PAGE_CACHE_MODE_UC_MINUS, NULL);
        if (ret)
                return ret;

        ret = _set_memory_uc(addr, numpages);
        if (!ret)
                return 0;

        /* Undo the reservation taken above. */
        memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
        return ret;
}
EXPORT_SYMBOL(set_memory_uc);

/*
 * Switch @numpages pages at @addr to WC in two steps: first set the UC-
 * bits, then clear _PAGE_CACHE_MASK while setting the WC bits. No memtype
 * reservation is touched.
 *
 * Returns 0 on success or the error of the step that failed.
 */
int _set_memory_wc(unsigned long addr, int numpages)
{
        int ret;

        ret = change_page_attr_set(&addr, numpages,
                                   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
                                   0);
        if (ret)
                return ret;

        return change_page_attr_set_clr(&addr, numpages,
                                        cachemode2pgprot(_PAGE_CACHE_MODE_WC),
                                        __pgprot(_PAGE_CACHE_MASK),
                                        0, 0, NULL);
}

/*
 * Reserve the physical range as WC and switch its mapping to WC.
 *
 * Returns 0 on success. If the reservation fails its error is returned; if
 * the attribute change fails the reservation is released and that error is
 * returned.
 */
int set_memory_wc(unsigned long addr, int numpages)
{
        int ret;

        ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
                _PAGE_CACHE_MODE_WC, NULL);
        if (ret)
                goto out;

        ret = _set_memory_wc(addr, numpages);
        if (!ret)
                goto out;

        /* Attribute change failed: drop the reservation again. */
        memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
out:
        return ret;
}
EXPORT_SYMBOL(set_memory_wc);

/*
 * Set the WT cache mode bits on @numpages pages at @addr, without touching
 * any memtype reservation.
 */
int _set_memory_wt(unsigned long addr, int numpages)
{
        int rc;

        rc = change_page_attr_set(&addr, numpages,
                                  cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
        return rc;
}

/*
 * Return @numpages pages at @addr to WB by clearing _PAGE_CACHE_MASK,
 * without touching any memtype reservation.
 */
int _set_memory_wb(unsigned long addr, int numpages)
{
        int rc;

        /* WB cache mode is hard wired to all cache attribute bits being 0 */
        rc = change_page_attr_clear(&addr, numpages,
                                    __pgprot(_PAGE_CACHE_MASK), 0);
        return rc;
}

/*
 * Switch the mapping back to WB and, only if that succeeded, release the
 * memtype reservation for the physical range.
 *
 * Returns 0 on success or the error from _set_memory_wb().
 */
int set_memory_wb(unsigned long addr, int numpages)
{
        int ret = _set_memory_wb(addr, numpages);

        if (!ret)
                memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);

        return ret;
}
EXPORT_SYMBOL(set_memory_wb);

/* Prevent speculative access to a page by marking it not-present */
#ifdef CONFIG_X86_64
/*
 * Mark the 1:1 mapping of @pfn not-present via set_memory_np().
 *
 * Returns 0 on success (or when @pfn is a platform page and nothing is
 * done), otherwise the error from set_memory_np() after logging a warning.
 */
int set_mce_nospec(unsigned long pfn)
{
        unsigned long decoy_addr;
        int rc;

        /* SGX pages are not in the 1:1 map */
        if (arch_is_platform_page(pfn << PAGE_SHIFT))
                return 0;

        /*
         * The straightforward call would be:
         *      set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
         * but that would radically increase the odds of a speculative
         * access to the poison page, because the virtual address of the
         * kernel 1:1 mapping would be sitting around in registers.
         * So get tricky instead: build a non-canonical address that looks
         * just like the wanted one, but with bit 63 flipped. This relies on
         * set_memory_XX() properly sanitizing any __pa() results with
         * __PHYSICAL_MASK or PTE_PFN_MASK.
         */
        decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));

        rc = set_memory_np(decoy_addr, 1);
        if (!rc)
                return 0;

        pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
        return rc;
}

/*
 * Restore full speculative operation to the pfn by marking the page at
 * pfn_to_kaddr(@pfn) present again. Returns the result of set_memory_p().
 */
int clear_mce_nospec(unsigned long pfn)
{
        return set_memory_p((unsigned long) pfn_to_kaddr(pfn), 1);
}
EXPORT_SYMBOL_GPL(clear_mce_nospec);
#endif /* CONFIG_X86_64 */

/*
 * Make @numpages pages at @addr executable by clearing _PAGE_NX.
 * Does nothing and returns 0 when NX is not in __supported_pte_mask.
 */
int set_memory_x(unsigned long addr, int numpages)
{
        if (__supported_pte_mask & _PAGE_NX)
                return change_page_attr_clear(&addr, numpages,
                                              __pgprot(_PAGE_NX), 0);

        return 0;
}

/*
 * Make @numpages pages at @addr non-executable by setting _PAGE_NX.
 * Does nothing and returns 0 when NX is not in __supported_pte_mask.
 */
int set_memory_nx(unsigned long addr, int numpages)
{
        if (__supported_pte_mask & _PAGE_NX)
                return change_page_attr_set(&addr, numpages,
                                            __pgprot(_PAGE_NX), 0);

        return 0;
}

/* Clear _PAGE_RW and _PAGE_DIRTY on @numpages pages at @addr. */
int set_memory_ro(unsigned long addr, int numpages)
{
        int rc;

        rc = change_page_attr_clear(&addr, numpages,
                                    __pgprot(_PAGE_RW | _PAGE_DIRTY), 0);
        return rc;
}

/*
 * Clear _PAGE_RW and _PAGE_DIRTY on @numpages pages at @addr and, when NX
 * is supported, clear _PAGE_NX as well in the same operation.
 */
int set_memory_rox(unsigned long addr, int numpages)
{
        pgprot_t clr;

        if (__supported_pte_mask & _PAGE_NX)
                clr = __pgprot(_PAGE_RW | _PAGE_DIRTY | _PAGE_NX);
        else
                clr = __pgprot(_PAGE_RW | _PAGE_DIRTY);

        return change_page_attr_clear(&addr, numpages, clr, 0);
}

/* Set _PAGE_RW on @numpages pages at @addr. */
int set_memory_rw(unsigned long addr, int numpages)
{
        int rc;

        rc = change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
        return rc;
}

/* Clear _PAGE_PRESENT on @numpages pages at @addr. */
int set_memory_np(unsigned long addr, int numpages)
{
        int rc;

        rc = change_page_attr_clear(&addr, numpages,
                                    __pgprot(_PAGE_PRESENT), 0);
        return rc;
}

/*
 * Clear _PAGE_PRESENT on @numpages pages at @addr, with alias processing
 * disabled through CPA_NO_CHECK_ALIAS.
 */
int set_memory_np_noalias(unsigned long addr, int numpages)
{
        int rc;

        rc = change_page_attr_set_clr(&addr, numpages, __pgprot(0),
                                      __pgprot(_PAGE_PRESENT), 0,
                                      CPA_NO_CHECK_ALIAS, NULL);
        return rc;
}

/* Set _PAGE_PRESENT on @numpages pages at @addr. */
int set_memory_p(unsigned long addr, int numpages)
{
        int rc;

        rc = change_page_attr_set(&addr, numpages,
                                  __pgprot(_PAGE_PRESENT), 0);
        return rc;
}

/*
 * Issue a change request for @numpages pages at @addr with empty set/clear
 * masks and force_split enabled, so no protection bits are altered.
 */
int set_memory_4k(unsigned long addr, int numpages)
{
        int rc;

        rc = change_page_attr_set_clr(&addr, numpages, __pgprot(0),
                                      __pgprot(0), 1, 0, NULL);
        return rc;
}

/* Clear _PAGE_GLOBAL on @numpages pages at @addr. */
int set_memory_nonglobal(unsigned long addr, int numpages)
{
        int rc;

        rc = change_page_attr_clear(&addr, numpages,
                                    __pgprot(_PAGE_GLOBAL), 0);
        return rc;
}

/* Set _PAGE_GLOBAL on @numpages pages at @addr. */
int set_memory_global(unsigned long addr, int numpages)
{
        int rc;

        rc = change_page_attr_set(&addr, numpages,
                                  __pgprot(_PAGE_GLOBAL), 0);
        return rc;
}

/*
 * __set_memory_enc_pgtable() is used for the hypervisors that get
 * informed about "encryption" status via page tables.
 *
 * @addr:     start of the range; rounded down to a page boundary (with a
 *            one-time warning) if misaligned
 * @numpages: number of pages in the range
 * @enc:      true to apply the encrypted attribute, false for decrypted
 *
 * The sequence is: optional flush, hypervisor "prepare" callback, page
 * table update, TLB flush, hypervisor "finish" callback.
 *
 * Returns 0 on success, the error from __change_page_attr_set_clr() if the
 * page table update fails, or -EIO if either hypervisor callback reports
 * failure.
 */
static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
{
        pgprot_t empty = __pgprot(0);
        struct cpa_data cpa;
        int ret;

        /* Should not be working on unaligned addresses */
        if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
                addr &= PAGE_MASK;

        /*
         * mask_set and mask_clr are derived by applying the encrypted and
         * decrypted conversions to an empty pgprot; which one is set and
         * which one is cleared depends on @enc.
         */
        memset(&cpa, 0, sizeof(cpa));
        cpa.vaddr = &addr;
        cpa.numpages = numpages;
        cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty);
        cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty);
        cpa.pgd = init_mm.pgd;

        /* Must avoid aliasing mappings in the highmem code */
        kmap_flush_unused();
        vm_unmap_aliases();

        /* Flush the caches as needed before changing the encryption attribute. */
        if (x86_platform.guest.enc_tlb_flush_required(enc))
                cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());

        /* Notify hypervisor that we are about to set/clr encryption attribute. */
        if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc))
                goto vmm_fail;

        ret = __change_page_attr_set_clr(&cpa, 1);

        /*
         * After changing the encryption attribute, we need to flush TLBs again
         * in case any speculative TLB caching occurred (but no need to flush
         * caches again).  We could just use cpa_flush_all(), but in case TLB
         * flushing gets optimized in the cpa_flush() path use the same logic
         * as above.
         *
         * Note that this flush is issued before the error check below, so it
         * also runs when the page table update failed.
         */
        cpa_flush(&cpa, 0);

        if (ret)
                return ret;

        /* Notify hypervisor that we have successfully set/clr encryption attribute. */
        if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
                goto vmm_fail;

        return 0;

vmm_fail:
        /* Reached when either hypervisor callback returned false. */
        WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n",
                  (void *)addr, numpages, enc ? "private" : "shared");

        return -EIO;
}

/*
 * Change the encryption attribute of the range only when the platform
 * reports CC_ATTR_MEM_ENCRYPT; otherwise succeed without doing anything.
 */
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
        if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
                return 0;

        return __set_memory_enc_pgtable(addr, numpages, enc);
}

/* Request the encrypted attribute for @numpages pages at @addr. */
int set_memory_encrypted(unsigned long addr, int numpages)
{
        int rc;

        rc = __set_memory_enc_dec(addr, numpages, true);
        return rc;
}
EXPORT_SYMBOL_GPL(set_memory_encrypted);

/* Request the decrypted attribute for @numpages pages at @addr. */
int set_memory_decrypted(unsigned long addr, int numpages)
{
        int rc;

        rc = __set_memory_enc_dec(addr, numpages, false);
        return rc;
}
EXPORT_SYMBOL_GPL(set_memory_decrypted);

/* struct page flavour of set_memory_uc(): operates on page_address(@page). */
int set_pages_uc(struct page *page, int numpages)
{
        return set_memory_uc((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_uc);

static int _set_pages_array(struct page **pages, int numpages,
                enum page_cache_mode new_type)
{
        unsigned long start;
        unsigned long end;
        enum page_cache_mode set_type;
        int i;
        int free_idx;
        int ret;

        for (i = 0; i < numpages; i++) {
                if (PageHighMem(pages[i]))
                        continue;
                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                if (memtype_reserve(start, end, new_type, NULL))
                        goto err_out;
        }

        /* If WC, set to UC- first and then WC */
        set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
                                _PAGE_CACHE_MODE_UC_MINUS : new_type;

        ret = cpa_set_pages_array(pages, numpages,
                                  cachemode2pgprot(set_type));
        if (!ret && new_type == _PAGE_CACHE_MODE_WC)
                ret = change_page_attr_set_clr(NULL, numpages,
                                               cachemode2pgprot(
                                                _PAGE_CACHE_MODE_WC),
                                               __pgprot(_PAGE_CACHE_MASK),
                                               0, CPA_PAGES_ARRAY, pages);
        if (ret)
                goto err_out;
        return 0; /* Success */
err_out:
        free_idx = i;
        for (i = 0; i < free_idx; i++) {
                if (PageHighMem(pages[i]))
                        continue;
                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                memtype_free(start, end);
        }
        return -EINVAL;
}

/* Switch every page of the array to UC- via _set_pages_array(). */
int set_pages_array_uc(struct page **pages, int numpages)
{
        int rc;

        rc = _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS);
        return rc;
}
EXPORT_SYMBOL(set_pages_array_uc);

/* Switch every page of the array to WC via _set_pages_array(). */
int set_pages_array_wc(struct page **pages, int numpages)
{
        int rc;

        rc = _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC);
        return rc;
}
EXPORT_SYMBOL(set_pages_array_wc);

/* struct page flavour of set_memory_wb(): operates on page_address(@page). */
int set_pages_wb(struct page *page, int numpages)
{
        return set_memory_wb((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_wb);

/*
 * Return every page of the array to WB and then release the memtype
 * reservation of each non-highmem page.
 *
 * Returns 0 on success, or the error from cpa_clear_pages_array(), in which
 * case no reservation is released.
 */
int set_pages_array_wb(struct page **pages, int numpages)
{
        unsigned long base;
        int idx;
        int err;

        /* WB cache mode is hard wired to all cache attribute bits being 0 */
        err = cpa_clear_pages_array(pages, numpages,
                        __pgprot(_PAGE_CACHE_MASK));
        if (err)
                return err;

        for (idx = 0; idx < numpages; idx++) {
                if (!PageHighMem(pages[idx])) {
                        base = page_to_pfn(pages[idx]) << PAGE_SHIFT;
                        memtype_free(base, base + PAGE_SIZE);
                }
        }

        return 0;
}
EXPORT_SYMBOL(set_pages_array_wb);

/* struct page flavour of set_memory_ro(): operates on page_address(@page). */
int set_pages_ro(struct page *page, int numpages)
{
        return set_memory_ro((unsigned long)page_address(page), numpages);
}

/* struct page flavour of set_memory_rw(): operates on page_address(@page). */
int set_pages_rw(struct page *page, int numpages)
{
        return set_memory_rw((unsigned long)page_address(page), numpages);
}

/*
 * Set _PAGE_PRESENT and _PAGE_RW for @numpages pages starting at
 * page_address(@page). No flush is issued here.
 *
 * No alias checking needed for setting present flag. otherwise,
 * we may need to break large pages for 64-bit kernel text
 * mappings (this adds to complexity if we want to do this from
 * atomic context especially). Let's keep it simple!
 */
static int __set_pages_p(struct page *page, int numpages)
{
        unsigned long vaddr = (unsigned long) page_address(page);
        struct cpa_data cpa = {
                .vaddr          = &vaddr,
                .pgd            = NULL,
                .numpages       = numpages,
                .mask_set       = __pgprot(_PAGE_PRESENT | _PAGE_RW),
                .mask_clr       = __pgprot(0),
                .flags          = CPA_NO_CHECK_ALIAS,
        };

        return __change_page_attr_set_clr(&cpa, 1);
}

/*
 * Clear _PAGE_PRESENT and _PAGE_RW for @numpages pages starting at
 * page_address(@page). No flush is issued here.
 *
 * No alias checking needed for setting not present flag. otherwise,
 * we may need to break large pages for 64-bit kernel text
 * mappings (this adds to complexity if we want to do this from
 * atomic context especially). Let's keep it simple!
 */
static int __set_pages_np(struct page *page, int numpages)
{
        unsigned long vaddr = (unsigned long) page_address(page);
        struct cpa_data cpa = {
                .vaddr          = &vaddr,
                .pgd            = NULL,
                .numpages       = numpages,
                .mask_set       = __pgprot(0),
                .mask_clr       = __pgprot(_PAGE_PRESENT | _PAGE_RW),
                .flags          = CPA_NO_CHECK_ALIAS,
        };

        return __change_page_attr_set_clr(&cpa, 1);
}

/* Mark the mapping of a single page not-present via __set_pages_np(). */
int set_direct_map_invalid_noflush(struct page *page)
{
        int rc;

        rc = __set_pages_np(page, 1);
        return rc;
}

/* Mark the mapping of a single page present and writable via __set_pages_p(). */
int set_direct_map_default_noflush(struct page *page)
{
        int rc;

        rc = __set_pages_p(page, 1);
        return rc;
}

#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * Map (@enable != 0) or unmap (@enable == 0) @numpages pages starting at
 * @page, then flush the TLB of the current CPU. Highmem pages are ignored.
 */
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
        if (PageHighMem(page))
                return;

        /*
         * The return value is ignored as the calls cannot fail.
         * Large pages for identity mappings are not used at boot time
         * and hence no memory allocations during large page split.
         */
        if (enable) {
                __set_pages_p(page, numpages);
        } else {
                debug_check_no_locks_freed(page_address(page),
                                           numpages * PAGE_SIZE);
                __set_pages_np(page, numpages);
        }

        /*
         * We should perform an IPI and flush all tlbs,
         * but that can deadlock->flush only current cpu.
         * Preemption needs to be disabled around __flush_tlb_all() due to
         * CR3 reload in __native_flush_tlb().
         */
        preempt_disable();
        __flush_tlb_all();
        preempt_enable();

        arch_flush_lazy_mmu_mode();
}
#endif /* CONFIG_DEBUG_PAGEALLOC */

/*
 * Report whether @page is currently mapped present in the kernel page
 * tables.
 *
 * Returns false for highmem pages, false when lookup_address() finds no
 * page-table entry for the page's address, and otherwise whether
 * _PAGE_PRESENT is set in the entry that was found.
 */
bool kernel_page_present(struct page *page)
{
        unsigned int level;
        pte_t *pte;

        if (PageHighMem(page))
                return false;

        pte = lookup_address((unsigned long)page_address(page), &level);
        /*
         * The lookup yields NULL when no entry exists for the address;
         * treat that as "not present" instead of dereferencing NULL.
         */
        if (!pte)
                return false;

        return (pte_val(*pte) & _PAGE_PRESENT);
}

/*
 * Map @numpages pages starting at @pfn to @address in the page tables rooted
 * at @pgd, using _PAGE_PRESENT | @page_flags as protection, then flush the
 * TLB of the current CPU.
 *
 * Returns -EINVAL when NX is not in __supported_pte_mask, otherwise the
 * result of __change_page_attr_set_clr().
 */
int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
                                   unsigned numpages, unsigned long page_flags)
{
        struct cpa_data cpa = {
                .vaddr          = &address,
                .pfn            = pfn,
                .pgd            = pgd,
                .numpages       = numpages,
                .mask_set       = __pgprot(0),
                .mask_clr       = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
                .flags          = CPA_NO_CHECK_ALIAS,
        };
        int retval;

        WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");

        if (!(__supported_pte_mask & _PAGE_NX))
                return -EINVAL;

        /* Without _PAGE_ENC requested, add the encryption bit to mask_clr. */
        if (!(page_flags & _PAGE_ENC))
                cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);

        cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);

        retval = __change_page_attr_set_clr(&cpa, 1);
        __flush_tlb_all();

        return retval;
}

/*
 * Clear _PAGE_PRESENT and _PAGE_RW for @numpages pages at @address in the
 * page tables rooted at @pgd, then flush the TLB of the current CPU.
 *
 * __flush_tlb_all() flushes mappings only on current CPU and hence this
 * function shouldn't be used in an SMP environment. Presently, it's used only
 * during boot (way before smp_init()) by EFI subsystem and hence is ok.
 *
 * Returns the result of __change_page_attr_set_clr().
 */
int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
                                     unsigned long numpages)
{
        /*
         * The typical sequence for unmapping is to find a pte through
         * lookup_address_in_pgd() (ideally, it should never return NULL because
         * the address is already mapped) and change its protections. As pfn is
         * the *target* of a mapping, it's not useful while unmapping.
         */
        struct cpa_data cpa = {
                .vaddr = &address,
                .pfn = 0,
                .pgd = pgd,
                .numpages = numpages,
                .mask_set = __pgprot(0),
                .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
                .flags = CPA_NO_CHECK_ALIAS,
        };
        int rc;

        WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");

        rc = __change_page_attr_set_clr(&cpa, 1);
        __flush_tlb_all();

        return rc;
}

/*
 * The testcases use internal knowledge of the implementation that shouldn't
 * be exposed to the rest of the kernel. Include these directly here.
 */
#ifdef CONFIG_CPA_DEBUG
#include "cpa-test.c"
#endif



















































































































































































































































































































































































































    1 








    1 





















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "inode-item.h"
#include "disk-io.h"
#include "transaction.h"
#include "space-info.h"
#include "accessors.h"
#include "extent-tree.h"
#include "file-item.h"

/*
 * Look for @name among the inode refs packed into the item at @slot of @leaf.
 *
 * The item is walked as a sequence of records, each a struct btrfs_inode_ref
 * header immediately followed by its name bytes.
 *
 * Returns the matching ref, or NULL when no record carries @name. The returned
 * value is built from btrfs_item_ptr_offset() and is only handed to the
 * extent buffer accessors together with @leaf.
 */
struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
                                                   int slot,
                                                   const struct fscrypt_str *name)
{
        const u32 item_size = btrfs_item_size(leaf, slot);
        const unsigned long item_base = btrfs_item_ptr_offset(leaf, slot);
        u32 pos = 0;

        while (pos < item_size) {
                struct btrfs_inode_ref *ref =
                        (struct btrfs_inode_ref *)(item_base + pos);
                const int ref_name_len = btrfs_inode_ref_name_len(leaf, ref);

                /* Step to the next record before deciding on this one. */
                pos += ref_name_len + sizeof(*ref);

                if (ref_name_len == name->len &&
                    memcmp_extent_buffer(leaf, name->name,
                                         (unsigned long)(ref + 1),
                                         name->len) == 0)
                        return ref;
        }

        return NULL;
}

/*
 * Look for an extended inode ref with parent @ref_objectid and name @name in
 * the item at @slot of @leaf.
 *
 * An extref item may hold several records (the key offset is a hash, so
 * different names can land in the same item); each record is a struct
 * btrfs_inode_extref header whose trailing ->name holds the name bytes.
 *
 * Returns the matching extref, or NULL if none of the records match.
 */
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
                struct extent_buffer *leaf, int slot, u64 ref_objectid,
                const struct fscrypt_str *name)
{
        const u32 item_size = btrfs_item_size(leaf, slot);
        const unsigned long item_base = btrfs_item_ptr_offset(leaf, slot);
        struct btrfs_inode_extref *extref;
        int this_len;
        u32 pos;

        for (pos = 0; pos < item_size; pos += this_len + sizeof(*extref)) {
                extref = (struct btrfs_inode_extref *)(item_base + pos);
                this_len = btrfs_inode_extref_name_len(leaf, extref);

                /* Cheap checks first: length, then parent directory. */
                if (this_len != name->len)
                        continue;
                if (btrfs_inode_extref_parent(leaf, extref) != ref_objectid)
                        continue;

                if (memcmp_extent_buffer(leaf, name->name,
                                         (unsigned long)(&extref->name),
                                         name->len) == 0)
                        return extref;
        }

        return NULL;
}

/*
 * Search @root for the extended inode ref of @inode_objectid that names it
 * @name inside directory @ref_objectid.
 *
 * @ins_len and @cow are passed straight through to btrfs_search_slot().
 *
 * Returns the extref on success, NULL if no extref was found (either the key
 * is absent or the item does not contain the name), or an ERR_PTR() carrying
 * the negative error from the tree search.
 */
struct btrfs_inode_extref *
btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          struct btrfs_path *path,
                          const struct fscrypt_str *name,
                          u64 inode_objectid, u64 ref_objectid, int ins_len,
                          int cow)
{
        struct btrfs_key key = {
                .objectid = inode_objectid,
                .type = BTRFS_INODE_EXTREF_KEY,
                .offset = btrfs_extref_hash(ref_objectid, name->name,
                                            name->len),
        };
        int ret;

        ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
        if (ret)
                return ret < 0 ? ERR_PTR(ret) : NULL;

        return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
                                              ref_objectid, name);
}

/*
 * Remove the extended inode ref (@ref_objectid, @name) of @inode_objectid
 * from @root.
 *
 * If @index is not NULL it receives the index stored in the removed extref.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if the
 * extref key does not exist, -EROFS if the key exists but does not contain
 * the name (reported through btrfs_handle_fs_error()), or the error returned
 * by the tree search / item deletion.
 */
static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  const struct fscrypt_str *name,
                                  u64 inode_objectid, u64 ref_objectid,
                                  u64 *index)
{
        struct btrfs_inode_extref *extref;
        const int del_len = name->len + sizeof(*extref);
        struct btrfs_key key = {
                .objectid = inode_objectid,
                .type = BTRFS_INODE_EXTREF_KEY,
                .offset = btrfs_extref_hash(ref_objectid, name->name,
                                            name->len),
        };
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        u32 item_size;
        int slot;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
        if (ret < 0)
                goto out;

        leaf = path->nodes[0];
        slot = path->slots[0];

        /*
         * The key matched, so the name is expected to be in this item. If it
         * is not, report a filesystem error instead of silently continuing.
         */
        extref = btrfs_find_name_in_ext_backref(leaf, slot, ref_objectid, name);
        if (!extref) {
                btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
                ret = -EROFS;
                goto out;
        }

        item_size = btrfs_item_size(leaf, slot);
        if (index)
                *index = btrfs_inode_extref_index(leaf, extref);

        if (del_len == item_size) {
                /* The extref is the only record: drop the whole item. */
                ret = btrfs_del_item(trans, root, path);
        } else {
                const unsigned long victim = (unsigned long)extref;
                const unsigned long item_start =
                        btrfs_item_ptr_offset(leaf, slot);

                /*
                 * Slide the records that follow the victim down over it,
                 * then shrink the item by the size of the removed record.
                 */
                memmove_extent_buffer(leaf, victim, victim + del_len,
                                      item_size - (victim + del_len - item_start));
                btrfs_truncate_item(trans, path, item_size - del_len, 1);
        }

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Remove the back reference (@ref_objectid, @name) of @inode_objectid from
 * @root.
 *
 * The regular inode ref item is tried first. When that item does not exist,
 * or exists but does not contain @name, the extended inode ref is removed
 * instead via btrfs_del_inode_extref().
 *
 * If @index is not NULL it receives the index stored in the removed ref.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or a negative
 * error from the tree search, the item deletion or the extref fallback.
 */
int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, const struct fscrypt_str *name,
                        u64 inode_objectid, u64 ref_objectid, u64 *index)
{
        struct btrfs_inode_ref *ref;
        /* Bytes occupied by the record being removed: header plus name. */
        const u32 ref_len = name->len + sizeof(*ref);
        struct btrfs_key key = {
                .objectid = inode_objectid,
                .type = BTRFS_INODE_REF_KEY,
                .offset = ref_objectid,
        };
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        unsigned long victim;
        unsigned long item_start;
        u32 item_size;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto out;
        if (ret > 0)
                goto try_extref;

        leaf = path->nodes[0];
        ref = btrfs_find_name_in_backref(leaf, path->slots[0], name);
        if (!ref)
                goto try_extref;

        item_size = btrfs_item_size(leaf, path->slots[0]);
        if (index)
                *index = btrfs_inode_ref_index(leaf, ref);

        if (ref_len == item_size) {
                /* Only record in the item: delete the item itself. */
                ret = btrfs_del_item(trans, root, path);
                goto out;
        }

        /* Close the gap left by the record, then shrink the item. */
        victim = (unsigned long)ref;
        item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
        memmove_extent_buffer(leaf, victim, victim + ref_len,
                              item_size - (victim + ref_len - item_start));
        btrfs_truncate_item(trans, path, item_size - ref_len, 1);

out:
        btrfs_free_path(path);
        return ret;

try_extref:
        /*
         * No ref item was found, or the name is not in the ref array: find
         * and remove the extended inode ref instead.
         */
        btrfs_free_path(path);
        return btrfs_del_inode_extref(trans, root, name,
                                      inode_objectid, ref_objectid, index);
}

/*
 * Add an extended inode ref (@ref_objectid, @name, @index) for
 * @inode_objectid to @root.
 *
 * If an item with the same key already exists, the new record is appended to
 * it unless the item already contains this exact parent/name pair.
 *
 * The caller must have checked against BTRFS_LINK_MAX already.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, -EEXIST if the
 * extref is already present, or another negative error from the item insert.
 */
static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     const struct fscrypt_str *name,
                                     u64 inode_objectid, u64 ref_objectid,
                                     u64 index)
{
        struct btrfs_inode_extref *extref;
        const int ins_len = name->len + sizeof(*extref);
        struct btrfs_key key = {
                .objectid = inode_objectid,
                .type = BTRFS_INODE_EXTREF_KEY,
                .offset = btrfs_extref_hash(ref_objectid, name->name,
                                            name->len),
        };
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        unsigned long new_rec;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len);
        if (ret == -EEXIST &&
            !btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
                                            ref_objectid, name)) {
                /* Same key, different name: grow the existing item. */
                btrfs_extend_item(trans, path, ins_len);
                ret = 0;
        }
        if (ret < 0)
                goto out;

        /* The new record occupies the last ins_len bytes of the item. */
        leaf = path->nodes[0];
        new_rec = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
        new_rec += btrfs_item_size(leaf, path->slots[0]) - ins_len;
        extref = (struct btrfs_inode_extref *)new_rec;

        btrfs_set_inode_extref_name_len(leaf, extref, name->len);
        btrfs_set_inode_extref_index(leaf, extref, index);
        btrfs_set_inode_extref_parent(leaf, extref, ref_objectid);
        write_extent_buffer(leaf, name->name, (unsigned long)&extref->name,
                            name->len);
        btrfs_mark_buffer_dirty(trans, leaf);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Add the back reference (@ref_objectid, @name, @index) for @inode_objectid
 * to @root.
 *
 * The ref is stored in the regular inode ref item, creating it or appending
 * to it as needed. When the item insert reports -EOVERFLOW and the name is
 * not already present, the ref is stored as an extended inode ref instead,
 * provided the filesystem has the EXTENDED_IREF incompat feature set.
 *
 * Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path.
 */
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, const struct fscrypt_str *name,
                           u64 inode_objectid, u64 ref_objectid, u64 index)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_inode_ref *ref;
        const int ins_len = name->len + sizeof(*ref);
        struct btrfs_key key = {
                .objectid = inode_objectid,
                .type = BTRFS_INODE_REF_KEY,
                .offset = ref_objectid,
        };
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        /* Offset of the new record inside the item; 0 for a fresh item. */
        u32 rec_offset = 0;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* The path is inspected below after -EEXIST / -EOVERFLOW. */
        path->skip_release_on_error = 1;
        ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len);

        if (ret == -EOVERFLOW) {
                /* No room left in the item: duplicate name or too many links? */
                if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
                                               name))
                        ret = -EEXIST;
                else
                        ret = -EMLINK;
                goto out;
        }

        if (ret == -EEXIST) {
                if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
                                               name))
                        goto out;

                /* Append after the records already stored in the item. */
                rec_offset = btrfs_item_size(path->nodes[0], path->slots[0]);
                btrfs_extend_item(trans, path, ins_len);
                ret = 0;
        } else if (ret < 0) {
                goto out;
        }

        leaf = path->nodes[0];
        ref = (struct btrfs_inode_ref *)
                (btrfs_item_ptr_offset(leaf, path->slots[0]) + rec_offset);
        btrfs_set_inode_ref_name_len(leaf, ref, name->len);
        btrfs_set_inode_ref_index(leaf, ref, index);
        write_extent_buffer(leaf, name->name, (unsigned long)(ref + 1),
                            name->len);
        btrfs_mark_buffer_dirty(trans, leaf);

out:
        btrfs_free_path(path);

        /*
         * We ran out of space in the ref array. Need to add an extended ref,
         * if the filesystem supports them.
         */
        if (ret == -EMLINK &&
            (btrfs_super_incompat_flags(fs_info->super_copy) &
             BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF))
                ret = btrfs_insert_inode_extref(trans, root, name,
                                                inode_objectid,
                                                ref_objectid, index);

        return ret;
}

/*
 * Insert an empty item sized for a struct btrfs_inode_item, keyed as the
 * inode item of @objectid, into @root. @path is left as set up by
 * btrfs_insert_empty_item().
 *
 * Returns whatever btrfs_insert_empty_item() returns.
 */
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid)
{
        struct btrfs_key key = {
                .objectid = objectid,
                .type = BTRFS_INODE_ITEM_KEY,
                .offset = 0,
        };

        return btrfs_insert_empty_item(trans, root, path, &key,
                                       sizeof(struct btrfs_inode_item));
}

int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
                       *root, struct btrfs_path *path,
                       struct btrfs_key *location, int mod)
{
        int ins_len = mod < 0 ? -1 : 0;
        int cow = mod != 0;
        int ret;
        int slot;
        struct extent_buffer *leaf;
        struct btrfs_key found_key;

        ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
        if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY &&
            location->offset == (u64)-1 && path->slots[0] != 0) {
                slot = path->slots[0] - 1;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
                if (found_key.objectid == location->objectid &&
                    found_key.type == location->type) {
                        path->slots[0]--;
                        return 0;
                }
        }
        return ret;
}

/*
 * Emit the truncate tracepoint matching @extent_type for the file extent
 * item @fi. Nothing is traced when @inode is NULL.
 */
static inline void btrfs_trace_truncate(struct btrfs_inode *inode,
                                        struct extent_buffer *leaf,
                                        struct btrfs_file_extent_item *fi,
                                        u64 offset, int extent_type, int slot)
{
        if (!inode)
                return;

        if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
                trace_btrfs_truncate_show_fi_regular(inode, leaf, fi, offset);
                return;
        }

        trace_btrfs_truncate_show_fi_inline(inode, leaf, fi, slot, offset);
}

/*
 * Remove inode items from a given root.
 *
 * @trans:                A transaction handle.
 * @root:                The root from which to remove items.
 * @control:                The btrfs_truncate_control to control how and what we
 *                        are truncating. Fields read here: ->ino, ->inode,
 *                        ->new_size, ->min_type, ->clear_extent_range and
 *                        ->skip_ref_updates. Fields written here:
 *                        ->last_size, ->sub_bytes and ->extents_found.
 *
 * Remove all keys associated with inode number @control->ino from the given
 * root that have a key with a type greater than or equals to
 * @control->min_type. When @control->min_type has a value of
 * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset
 * value greater than or equals to @control->new_size. If a file extent item
 * that starts before @control->new_size and ends after it is found, its length
 * is adjusted.
 *
 * Items are visited from the highest key of the inode downwards. Adjacent
 * slots to delete are batched and removed with a single btrfs_del_items() call.
 *
 * Returns: 0 on success, < 0 on error and BTRFS_NEED_TRUNCATE_BLOCK when an
 * inline extent with special encodings straddles @control->new_size and the
 * caller must truncate the last block. -EAGAIN is returned when the function
 * backs off (transaction should end, or the delayed refs reservation could not
 * be refilled).
 */
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_truncate_control *control)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        struct btrfs_key found_key;
        u64 new_size = control->new_size;
        u64 extent_num_bytes = 0;
        u64 extent_offset = 0;
        u64 item_end = 0;
        u32 found_type = (u8)-1;
        int del_item;
        /* Batch of contiguous slots queued for deletion: count and first slot. */
        int pending_del_nr = 0;
        int pending_del_slot = 0;
        int extent_type = -1;
        int ret;
        u64 bytes_deleted = 0;
        bool be_nice = false;

        /* Clearing the file extent range needs an inode to operate on. */
        ASSERT(control->inode || !control->clear_extent_range);
        /* A non-zero new size only makes sense when truncating file extents. */
        ASSERT(new_size == 0 || control->min_type == BTRFS_EXTENT_DATA_KEY);

        control->last_size = new_size;
        control->sub_bytes = 0;

        /*
         * For shareable roots we want to back off from time to time, this turns
         * out to be subvolume roots, reloc roots, and data reloc roots.
         */
        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
                be_nice = true;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = READA_BACK;

        /* Start from the largest possible key of this inode and walk back. */
        key.objectid = control->ino;
        key.offset = (u64)-1;
        key.type = (u8)-1;

search_again:
        /*
         * With a 16K leaf size and 128MiB extents, you can actually queue up a
         * huge file in a single leaf.  Most of the time that bytes_deleted is
         * > 0, it will be huge by the time we get here
         */
        if (be_nice && bytes_deleted > SZ_32M &&
            btrfs_should_end_transaction(trans)) {
                ret = -EAGAIN;
                goto out;
        }

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = 0;
                /* There are no items in the tree for us to truncate, we're done */
                if (path->slots[0] == 0)
                        goto out;
                /* No exact match: step back to the item preceding the search key. */
                path->slots[0]--;
        }

        while (1) {
                u64 clear_start = 0, clear_len = 0, extent_start = 0;
                bool refill_delayed_refs_rsv = false;

                fi = NULL;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                found_type = found_key.type;

                /* Walked past the items that belong to this inode. */
                if (found_key.objectid != control->ino)
                        break;

                /* Item types below min_type are kept. */
                if (found_type < control->min_type)
                        break;

                item_end = found_key.offset;
                if (found_type == BTRFS_EXTENT_DATA_KEY) {
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        extent_type = btrfs_file_extent_type(leaf, fi);
                        if (extent_type != BTRFS_FILE_EXTENT_INLINE)
                                item_end +=
                                    btrfs_file_extent_num_bytes(leaf, fi);
                        else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
                                item_end += btrfs_file_extent_ram_bytes(leaf, fi);

                        btrfs_trace_truncate(control->inode, leaf, fi,
                                             found_key.offset, extent_type,
                                             path->slots[0]);
                        /* Make item_end the offset of the last byte covered. */
                        item_end--;
                }
                if (found_type > control->min_type) {
                        del_item = 1;
                } else {
                        /* The item ends before the new size: nothing left to do. */
                        if (item_end < new_size)
                                break;
                        /*
                         * Delete items that start at or beyond the new size,
                         * shorten the one that straddles it.
                         */
                        if (found_key.offset >= new_size)
                                del_item = 1;
                        else
                                del_item = 0;
                }

                /* FIXME, shrink the extent if the ref count is only 1 */
                if (found_type != BTRFS_EXTENT_DATA_KEY)
                        goto delete;

                control->extents_found++;

                if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
                        u64 num_dec;

                        clear_start = found_key.offset;
                        extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
                        if (!del_item) {
                                /* Shorten the extent to end at the aligned new size. */
                                u64 orig_num_bytes =
                                        btrfs_file_extent_num_bytes(leaf, fi);
                                extent_num_bytes = ALIGN(new_size -
                                                found_key.offset,
                                                fs_info->sectorsize);
                                clear_start = ALIGN(new_size, fs_info->sectorsize);

                                btrfs_set_file_extent_num_bytes(leaf, fi,
                                                         extent_num_bytes);
                                num_dec = (orig_num_bytes - extent_num_bytes);
                                /*
                                 * NOTE(review): a disk_bytenr of 0 presumably
                                 * denotes a hole, which is not accounted in
                                 * sub_bytes - confirm.
                                 */
                                if (extent_start != 0)
                                        control->sub_bytes += num_dec;
                                btrfs_mark_buffer_dirty(trans, leaf);
                        } else {
                                /*
                                 * Whole item goes away: remember the on-disk
                                 * extent size and data ref offset for the
                                 * btrfs_free_extent() call below.
                                 */
                                extent_num_bytes =
                                        btrfs_file_extent_disk_num_bytes(leaf, fi);
                                extent_offset = found_key.offset -
                                        btrfs_file_extent_offset(leaf, fi);

                                /* FIXME blocksize != 4096 */
                                num_dec = btrfs_file_extent_num_bytes(leaf, fi);
                                if (extent_start != 0)
                                        control->sub_bytes += num_dec;
                        }
                        clear_len = num_dec;
                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                        /*
                         * We can't truncate inline items that have had
                         * special encodings
                         */
                        if (!del_item &&
                            btrfs_file_extent_encryption(leaf, fi) == 0 &&
                            btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
                            btrfs_file_extent_compression(leaf, fi) == 0) {
                                /* Plain inline data: shrink the item in place. */
                                u32 size = (u32)(new_size - found_key.offset);

                                btrfs_set_file_extent_ram_bytes(leaf, fi, size);
                                size = btrfs_file_extent_calc_inline_size(size);
                                btrfs_truncate_item(trans, path, size, 1);
                        } else if (!del_item) {
                                /*
                                 * We have to bail so the last_size is set to
                                 * just before this extent.
                                 */
                                ret = BTRFS_NEED_TRUNCATE_BLOCK;
                                break;
                        } else {
                                /*
                                 * Inline extents are special, we just treat
                                 * them as a full sector worth in the file
                                 * extent tree just for simplicity sake.
                                 */
                                clear_len = fs_info->sectorsize;
                        }

                        control->sub_bytes += item_end + 1 - new_size;
                }
delete:
                /*
                 * We only want to clear the file extent range if we're
                 * modifying the actual inode's mapping, which is just the
                 * normal truncate path.
                 */
                if (control->clear_extent_range) {
                        ret = btrfs_inode_clear_file_extent_range(control->inode,
                                                  clear_start, clear_len);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                break;
                        }
                }

                if (del_item) {
                        /* A pending batch must be directly after the current slot. */
                        ASSERT(!pending_del_nr ||
                               ((path->slots[0] + 1) == pending_del_slot));

                        control->last_size = found_key.offset;
                        if (!pending_del_nr) {
                                /* No pending yet, add ourselves */
                                pending_del_slot = path->slots[0];
                                pending_del_nr = 1;
                        } else if (path->slots[0] + 1 == pending_del_slot) {
                                /* Hop on the pending chunk */
                                pending_del_nr++;
                                pending_del_slot = path->slots[0];
                        }
                } else {
                        /* The straddling item was shortened; nothing earlier changes. */
                        control->last_size = new_size;
                        break;
                }

                /* Drop the data extent reference of a deleted, non-zero bytenr extent. */
                if (del_item && extent_start != 0 && !control->skip_ref_updates) {
                        struct btrfs_ref ref = {
                                .action = BTRFS_DROP_DELAYED_REF,
                                .bytenr = extent_start,
                                .num_bytes = extent_num_bytes,
                                .owning_root = btrfs_root_id(root),
                                .ref_root = btrfs_header_owner(leaf),
                        };

                        bytes_deleted += extent_num_bytes;

                        btrfs_init_data_ref(&ref, control->ino, extent_offset,
                                            btrfs_root_id(root), false);
                        ret = btrfs_free_extent(trans, &ref);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                break;
                        }
                        if (be_nice && btrfs_check_space_for_delayed_refs(fs_info))
                                refill_delayed_refs_rsv = true;
                }

                /* The inode item is the last item type handled for this inode. */
                if (found_type == BTRFS_INODE_ITEM_KEY)
                        break;

                /*
                 * Flush the pending batch and restart the search when we are
                 * at the first slot of the leaf, when the current slot is not
                 * the head of the batch, or when the delayed refs reservation
                 * needs a refill. Otherwise just move to the previous slot.
                 */
                if (path->slots[0] == 0 ||
                    path->slots[0] != pending_del_slot ||
                    refill_delayed_refs_rsv) {
                        if (pending_del_nr) {
                                ret = btrfs_del_items(trans, root, path,
                                                pending_del_slot,
                                                pending_del_nr);
                                if (ret) {
                                        btrfs_abort_transaction(trans, ret);
                                        break;
                                }
                                pending_del_nr = 0;
                        }
                        btrfs_release_path(path);

                        /*
                         * We can generate a lot of delayed refs, so we need to
                         * throttle every once in a while and make sure we're
                         * adding enough space to keep up with the work we are
                         * generating.  Since we hold a transaction here we
                         * can't flush, and we don't want to FLUSH_LIMIT because
                         * we could have generated too many delayed refs to
                         * actually allocate, so just bail if we're short and
                         * let the normal reservation dance happen higher up.
                         */
                        if (refill_delayed_refs_rsv) {
                                ret = btrfs_delayed_refs_rsv_refill(fs_info,
                                                        BTRFS_RESERVE_NO_FLUSH);
                                if (ret) {
                                        ret = -EAGAIN;
                                        break;
                                }
                        }
                        goto search_again;
                } else {
                        path->slots[0]--;
                }
        }
out:
        /* Delete whatever is still batched, unless we are failing with an error. */
        if (ret >= 0 && pending_del_nr) {
                int err;

                err = btrfs_del_items(trans, root, path, pending_del_slot,
                                      pending_del_nr);
                if (err) {
                        btrfs_abort_transaction(trans, err);
                        ret = err;
                }
        }

        /* On full success report exactly the requested size as the last size. */
        ASSERT(control->last_size >= new_size);
        if (!ret && control->last_size > new_size)
                control->last_size = new_size;

        btrfs_free_path(path);
        return ret;
}


























































   51 





































   52 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 */

#ifndef _ASM_X86_STACKTRACE_H
#define _ASM_X86_STACKTRACE_H

#include <linux/uaccess.h>
#include <linux/ptrace.h>

#include <asm/cpu_entry_area.h>
#include <asm/switch_to.h>

/*
 * Kinds of stack an address can be classified as belonging to.
 *
 * The exception stacks occupy a contiguous range of values:
 * STACK_TYPE_EXCEPTION is the first and STACK_TYPE_EXCEPTION_LAST the last
 * of N_EXCEPTION_STACKS consecutive values.
 */
enum stack_type {
        STACK_TYPE_UNKNOWN,
        STACK_TYPE_TASK,
        STACK_TYPE_IRQ,
        STACK_TYPE_SOFTIRQ,
        STACK_TYPE_ENTRY,
        STACK_TYPE_EXCEPTION,
        STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
};

/*
 * Describes one stack: its type and the pointers delimiting it.
 * on_stack() accepts addresses in the half-open range [begin, end).
 * NOTE(review): next_sp presumably points at the stack to continue on
 * when unwinding past this one - confirm against the users.
 */
struct stack_info {
        enum stack_type type;
        unsigned long *begin, *end, *next_sp;
};

bool in_task_stack(unsigned long *stack, struct task_struct *task,
                   struct stack_info *info);

bool in_entry_stack(unsigned long *stack, struct stack_info *info);

int get_stack_info(unsigned long *stack, struct task_struct *task,
                   struct stack_info *info, unsigned long *visit_mask);
bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
                            struct stack_info *info);

/*
 * Report whether @stack points into the page directly below one of the
 * current task's stacks, i.e. into a guard page. On success @info describes
 * the stack found one page above @stack.
 */
static __always_inline
bool get_stack_guard_info(unsigned long *stack, struct stack_info *info)
{
        void *one_page_up = (void *)stack + PAGE_SIZE;

        /* An address that is on a stack proper is not a guard-page hit. */
        if (get_stack_info_noinstr(stack, current, info))
                return false;

        /* It is a guard hit iff the address one page higher is on a stack. */
        return get_stack_info_noinstr(one_page_up, current, info);
}

const char *stack_type_name(enum stack_type type);

/*
 * Return true when @info has a known type, @addr lies in [begin, end) and
 * @addr + @len lies in (begin, end]; false otherwise.
 */
static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
{
        void *lo = info->begin;
        void *hi = info->end;
        void *tail;

        if (info->type == STACK_TYPE_UNKNOWN)
                return false;

        /* The first byte must be inside the stack. */
        if (addr < lo || addr >= hi)
                return false;

        /* One past the last byte may equal the end but not the beginning. */
        tail = addr + len;
        return tail > lo && tail <= hi;
}

#ifdef CONFIG_X86_32
#define STACKSLOTS_PER_LINE 8
#else
#define STACKSLOTS_PER_LINE 4
#endif

/*
 * Pick the frame pointer to start unwinding from.
 *
 * With CONFIG_FRAME_POINTER: use @regs->bp when @regs is given, the caller's
 * own frame address when @task is current, and otherwise the address of the
 * bp slot in the inactive task frame at @task->thread.sp.
 * Without CONFIG_FRAME_POINTER: always NULL.
 */
static inline unsigned long *
get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
{
#ifdef CONFIG_FRAME_POINTER
        struct inactive_task_frame *frame;

        if (regs)
                return (unsigned long *)regs->bp;

        if (task == current)
                return __builtin_frame_address(0);

        frame = (struct inactive_task_frame *)task->thread.sp;
        return &frame->bp;
#else
        return NULL;
#endif /* CONFIG_FRAME_POINTER */
}

/*
 * Pick the stack pointer to start walking from: @regs->sp when @regs is
 * given, the caller's frame address when @task is current, and the saved
 * @task->thread.sp otherwise.
 */
static inline unsigned long *
get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
{
        if (regs)
                return (unsigned long *)regs->sp;

        return task == current ? (unsigned long *)__builtin_frame_address(0)
                               : (unsigned long *)task->thread.sp;
}

/*
 * The form of the top of the frame on the stack: a link to the next frame
 * followed by the return address.
 */
struct stack_frame {
        struct stack_frame *next_frame;
        unsigned long return_address;
};

/*
 * Same layout as struct stack_frame but with fixed 32-bit fields.
 * NOTE(review): presumably used for frames of 32-bit (ia32) tasks - confirm.
 */
struct stack_frame_ia32 {
    u32 next_frame;
    u32 return_address;
};

void show_opcodes(struct pt_regs *regs, const char *loglvl);
void show_ip(struct pt_regs *regs, const char *loglvl);
#endif /* _ASM_X86_STACKTRACE_H */



















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H

#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/bio.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include "volumes.h"

struct page;
struct sector_ptr;
struct btrfs_fs_info;

/*
 * Kind of work a RAID56 bio performs; stored in btrfs_raid_bio::operation.
 * NOTE(review): the per-value meanings are inferred from the names only
 * (write, rebuild for a read, parity scrub) - confirm against raid56.c.
 */
enum btrfs_rbio_ops {
        BTRFS_RBIO_WRITE,
        BTRFS_RBIO_READ_REBUILD,
        BTRFS_RBIO_PARITY_SCRUB,
};

/*
 * Per-operation state of a RAID56 bio ("rbio").  The kind of operation is
 * recorded in ->operation; the counts below describe the full stripe the
 * rbio covers, including P/Q.
 */
struct btrfs_raid_bio {
        struct btrfs_io_context *bioc;

        /*
         * While we're doing RMW on a stripe we put it into a hash table so we
         * can lock the stripe and merge more rbios into it.
         */
        struct list_head hash_list;

        /* LRU list for the stripe cache */
        struct list_head stripe_cache;

        /* For scheduling work in the helper threads */
        struct work_struct work;

        /*
         * bio_list and bio_list_lock are used to add more bios into the stripe
         * in hopes of avoiding the full RMW
         */
        struct bio_list bio_list;
        spinlock_t bio_list_lock;

        /*
         * Also protected by the bio_list_lock, the plug list is used by the
         * plugging code to collect partial bios while plugged.  The stripe
         * locking code also uses it to hand off the stripe lock to the next
         * pending IO.
         */
        struct list_head plug_list;

        /* Flags that tell us if it is safe to merge with this bio. */
        unsigned long flags;

        /*
         * Set if we're doing a parity rebuild for a read from higher up, which
         * is handled differently from a parity rebuild as part of RMW.
         */
        enum btrfs_rbio_ops operation;

        /* How many pages there are for the full stripe including P/Q */
        u16 nr_pages;

        /* How many sectors there are for the full stripe including P/Q */
        u16 nr_sectors;

        /* Number of data stripes (no p/q) */
        u8 nr_data;

        /* Number of all stripes (including P/Q) */
        u8 real_stripes;

        /* How many pages there are for each stripe */
        u8 stripe_npages;

        /* How many sectors there are for each stripe */
        u8 stripe_nsectors;

        /* Stripe number that we're scrubbing */
        u8 scrubp;

        /*
         * Size of all the bios in the bio_list.  This helps us decide if the
         * rbio maps to a full stripe or not.
         */
        int bio_list_bytes;

        refcount_t refs;

        atomic_t stripes_pending;

        wait_queue_head_t io_wait;

        /* Bitmap to record which horizontal stripe has data */
        unsigned long dbitmap;

        /* Allocated with stripe_nsectors-many bits for finish_*() calls */
        unsigned long finish_pbitmap;

        /*
         * These are two arrays of pointers.  We allocate the rbio big enough
         * to hold them both and setup their locations when the rbio is
         * allocated.
         */

        /*
         * Pointers to pages that we allocated for reading/writing stripes
         * directly from the disk (including P/Q).
         */
        struct page **stripe_pages;

        /* Pointers to the sectors in the bio_list, for faster lookup */
        struct sector_ptr *bio_sectors;

        /*
         * For subpage support, we need to map each sector to above
         * stripe_pages.
         */
        struct sector_ptr *stripe_sectors;

        /* Allocated with real_stripes-many pointers for finish_*() calls */
        void **finish_pointers;

        /*
         * The bitmap recording where IO errors happened.
         * Each bit corresponds to one sector in either the bio_sectors[] or
         * the stripe_sectors[] array.
         *
         * The reason we don't use another bit in sector_ptr is that we have
         * two arrays of sectors, and a lot of IO can use sectors in both
         * arrays, which would make them much harder to iterate.
         */
        unsigned long *error_bitmap;

        /*
         * Checksum buffer if the rbio is for data.  The buffer should cover
         * all data sectors (excluding P/Q sectors).
         */
        u8 *csum_buf;

        /*
         * Each bit represents if the corresponding sector has data csum found.
         * Should only cover data sectors (excluding P/Q sectors).
         */
        unsigned long *csum_bitmap;
};

/*
 * For trace event usage only. Records useful debug info for each bio submitted
 * by RAID56 to each physical device.
 *
 * Whether a field is read as signed or unsigned, (-1) is always the value
 * indicating that the proper stripe number could not be determined.
 */
struct raid56_bio_trace_info {
        u64 devid;

        /* The offset inside the stripe. (<= STRIPE_LEN) */
        u32 offset;

        /*
         * Stripe number.
         * 0 is the first data stripe, and nr_data for P stripe,
         * nr_data + 1 for Q stripe.
         * NOTE(review): the original comment ended mid-sentence at
         * ">= real_stripes for"; presumably such values mean the stripe
         * number could not be determined (see the (-1) note above) - confirm.
         */
        u8 stripe_nr;
};

/* Number of data stripes in @map: all of its stripes minus the parity ones. */
static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
{
        const int nr_parity = btrfs_nr_parity_stripes(map->type);

        return map->num_stripes - nr_parity;
}

/* Number of data stripes in @bioc: all of its stripes minus the parity ones. */
static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
{
        const int nr_parity = btrfs_nr_parity_stripes(bioc->map_type);

        return bioc->num_stripes - nr_parity;
}

/*
 * Sentinel u64 values standing for the P and Q parity stripes, and a test
 * for either of them.  Note that is_parity_stripe() may evaluate its
 * argument twice, so do not pass an expression with side effects.
 */
#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)

#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||                \
                             ((x) == RAID6_Q_STRIPE))

struct btrfs_device;

void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
                           int mirror_num);
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);

struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
                                struct btrfs_io_context *bioc,
                                struct btrfs_device *scrub_dev,
                                unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);

void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
                                    struct page **data_pages, u64 data_logical);

int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);

#endif













































    5 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  include/linux/eventpoll.h ( Efficient event polling implementation )
 *  Copyright (C) 2001,...,2006         Davide Libenzi
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 */
#ifndef _LINUX_EVENTPOLL_H
#define _LINUX_EVENTPOLL_H

#include <uapi/linux/eventpoll.h>
#include <uapi/linux/kcmp.h>


/* Forward declarations to avoid compiler errors */
struct file;


#ifdef CONFIG_EPOLL

#ifdef CONFIG_KCMP
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
#endif

/* Used to release the epoll bits inside the "struct file" */
void eventpoll_release_file(struct file *file);

/*
 * This is called from inside fs/file_table.c:__fput() to unlink files
 * from the eventpoll interface. We need this facility to correctly clean
 * up files that are closed without being removed from the eventpoll
 * interface.
 */
static inline void eventpoll_release(struct file *file)
{

        /*
         * Fast check to avoid the get/release of the semaphore. Since
         * we're doing this outside the semaphore lock, it might return
         * false negatives, but we don't care. It'll help in 99.99% of cases
         * to avoid the semaphore lock. False positives simply cannot happen
         * because the file is on its way to being removed and nobody (but
         * eventpoll) still has a reference to this file.
         */
        if (likely(!file->f_ep))
                return;

        /*
         * The file is being closed while it is still linked to an epoll
         * descriptor. We need to handle this by correctly unlinking it
         * from its containers.
         */
        eventpoll_release_file(file);
}

int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                 bool nonblock);

/*
 * Tells if the epoll_ctl(2) operation needs an event copy from userspace:
 * every operation except EPOLL_CTL_DEL does.
 */
static inline int ep_op_has_event(int op)
{
        if (op == EPOLL_CTL_DEL)
                return 0;

        return 1;
}

#else

/* Stub for builds without CONFIG_EPOLL: does nothing. */
static inline void eventpoll_release(struct file *file) {}

#endif

#if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT)
/* ARM OABI has an incompatible struct layout and needs a special handler */
extern struct epoll_event __user *
epoll_put_uevent(__poll_t revents, __u64 data,
                 struct epoll_event __user *uevent);
#else
/*
 * Write one event (@revents, @data) to the user slot @uevent.
 * Returns the slot following @uevent, or NULL if either __put_user()
 * reports failure.
 */
static inline struct epoll_event __user *
epoll_put_uevent(__poll_t revents, __u64 data,
                 struct epoll_event __user *uevent)
{
        /* The event mask goes first; stop at the first failing store. */
        if (__put_user(revents, &uevent->events))
                return NULL;

        if (__put_user(data, &uevent->data))
                return NULL;

        return uevent + 1;
}
#endif

#endif /* #ifndef _LINUX_EVENTPOLL_H */




























    2 


















    2 


    1 
    2 

    2 












    2 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
// SPDX-License-Identifier: GPL-2.0-only
/*
 * ratelimit.c - Do something with rate limit.
 *
 * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com>
 *
 * 2008-05-01 rewrite the function and use a ratelimit_state data struct as
 * parameter. Now every user can use their own standalone ratelimit_state.
 */

#include <linux/ratelimit.h>
#include <linux/jiffies.h>
#include <linux/export.h>

/*
 * ___ratelimit - rate limiting
 * @rs: ratelimit_state data
 * @func: name of calling function
 *
 * Enforces a rate limit of at most @rs->burst callbacks in every
 * @rs->interval.  An interval of zero disables limiting altogether.
 *
 * RETURNS:
 * 0 means callbacks will be suppressed.
 * 1 means go ahead and do it.
 */
int ___ratelimit(struct ratelimit_state *rs, const char *func)
{
        /* Paired with WRITE_ONCE() in .proc_handler().
         * Changing two values separately could be inconsistent
         * and some message could be lost.  (See: net_ratelimit_state).
         */
        int interval = READ_ONCE(rs->interval);
        int burst = READ_ONCE(rs->burst);
        unsigned long flags;
        int allowed;

        if (!interval)
                return 1;

        /*
         * Contending on this state's lock means, almost by definition,
         * that we are too busy to print yet another message on top of
         * the one the current lock holder is about to print.
         */
        if (!raw_spin_trylock_irqsave(&rs->lock, flags))
                return 0;

        /* First use: open the window now. */
        if (!rs->begin)
                rs->begin = jiffies;

        /* The window has expired: report what was dropped and start anew. */
        if (time_is_before_jiffies(rs->begin + interval)) {
                if (rs->missed && !(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
                        printk_deferred(KERN_WARNING
                                        "%s: %d callbacks suppressed\n",
                                        func, rs->missed);
                        rs->missed = 0;
                }
                rs->begin   = jiffies;
                rs->printed = 0;
        }

        /* Allow the call only while the burst budget is not used up. */
        allowed = burst && burst > rs->printed;
        if (allowed)
                rs->printed++;
        else
                rs->missed++;

        raw_spin_unlock_irqrestore(&rs->lock, flags);

        return allowed;
}
EXPORT_SYMBOL(___ratelimit);
















    1 




    1 





    1 







    3 




    3 






    3 




    1 

    1 




    1 





    1 


    1 


    1 
    1 
    1 

    1 








    1 








































































    5 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/fs.h>
#include <linux/path.h>
#include <linux/slab.h>
#include <linux/fs_struct.h>
#include "internal.h"

/*
 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
 * It can block.
 */
void set_fs_root(struct fs_struct *fs, const struct path *path)
{
        struct path old_root;

        path_get(path);
        spin_lock(&fs->lock);
        write_seqcount_begin(&fs->seq);
        old_root = fs->root;
        fs->root = *path;
        write_seqcount_end(&fs->seq);
        spin_unlock(&fs->lock);
        if (old_root.dentry)
                path_put(&old_root);
}

/*
 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
 * It can block.
 */
void set_fs_pwd(struct fs_struct *fs, const struct path *path)
{
        struct path old_pwd;

        path_get(path);
        spin_lock(&fs->lock);
        write_seqcount_begin(&fs->seq);
        old_pwd = fs->pwd;
        fs->pwd = *path;
        write_seqcount_end(&fs->seq);
        spin_unlock(&fs->lock);

        if (old_pwd.dentry)
                path_put(&old_pwd);
}

/*
 * If *p refers to the same dentry and mount as *old, overwrite it with *new.
 * Returns 1 when a replacement was made, 0 otherwise.  No references are
 * taken or dropped here.
 */
static inline int replace_path(struct path *p, const struct path *old, const struct path *new)
{
        const int differs = p->dentry != old->dentry || p->mnt != old->mnt;

        if (likely(differs))
                return 0;

        *p = *new;
        return 1;
}

/*
 * For every thread whose fs root or pwd equals @old_root, switch that path
 * to @new_root.  One reference on @new_root is taken per replacement, and
 * the matching references on @old_root are dropped once tasklist_lock has
 * been released.
 */
void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
{
        struct task_struct *g, *p;
        struct fs_struct *fs;
        int replaced = 0;

        read_lock(&tasklist_lock);
        for_each_process_thread(g, p) {
                task_lock(p);
                fs = p->fs;
                if (fs) {
                        int hits = 0;

                        spin_lock(&fs->lock);
                        write_seqcount_begin(&fs->seq);
                        hits += replace_path(&fs->root, old_root, new_root);
                        hits += replace_path(&fs->pwd, old_root, new_root);
                        write_seqcount_end(&fs->seq);

                        /* One new reference per path that was switched. */
                        replaced += hits;
                        for (; hits > 0; hits--)
                                path_get(new_root);
                        spin_unlock(&fs->lock);
                }
                task_unlock(p);
        }
        read_unlock(&tasklist_lock);

        /* Release the references the replaced paths held on the old root. */
        for (; replaced > 0; replaced--)
                path_put(old_root);
}

/*
 * Drop the references @fs holds on its root and pwd paths, then return
 * @fs itself to fs_cachep.
 */
void free_fs_struct(struct fs_struct *fs)
{
        path_put(&fs->root);
        path_put(&fs->pwd);
        kmem_cache_free(fs_cachep, fs);
}

/*
 * Detach @tsk from its fs_struct (if any): clear tsk->fs, drop one user,
 * and free the fs_struct when that was the last user.
 */
void exit_fs(struct task_struct *tsk)
{
        struct fs_struct *fs = tsk->fs;
        int last_user;

        if (!fs)
                return;

        task_lock(tsk);
        spin_lock(&fs->lock);
        tsk->fs = NULL;
        last_user = !--fs->users;
        spin_unlock(&fs->lock);
        task_unlock(tsk);

        /* Free only after both locks have been released. */
        if (last_user)
                free_fs_struct(fs);
}

/*
 * Allocate a new fs_struct with one user that copies @old's umask, root
 * and pwd, taking a reference on each copied path.
 * Returns the new fs_struct, or NULL if the allocation fails.
 */
struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
        struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);

        if (!fs)
                return NULL;

        /*
         * The new fs needs no locking: it was allocated just above and has
         * not been handed to anyone yet.
         */
        fs->users = 1;
        fs->in_exec = 0;
        spin_lock_init(&fs->lock);
        seqcount_spinlock_init(&fs->seq, &fs->lock);
        fs->umask = old->umask;

        /* Copy both paths, and take their references, under old's lock. */
        spin_lock(&old->lock);
        fs->root = old->root;
        path_get(&fs->root);
        fs->pwd = old->pwd;
        path_get(&fs->pwd);
        spin_unlock(&old->lock);

        return fs;
}

/*
 * Give the current task a private copy of its fs_struct.  The old
 * fs_struct loses one user and is freed if that was the last one.
 * Returns 0 on success or -ENOMEM if the copy cannot be allocated.
 */
int unshare_fs_struct(void)
{
        struct fs_struct *fs = current->fs;
        struct fs_struct *new_fs = copy_fs_struct(fs);
        int last_user;

        if (!new_fs)
                return -ENOMEM;

        task_lock(current);
        spin_lock(&fs->lock);
        last_user = (--fs->users == 0);
        current->fs = new_fs;
        spin_unlock(&fs->lock);
        task_unlock(current);

        /* Free the old fs_struct only after both locks are released. */
        if (last_user)
                free_fs_struct(fs);

        return 0;
}
EXPORT_SYMBOL_GPL(unshare_fs_struct);

/* Return the umask stored in the current task's fs_struct. */
int current_umask(void)
{
        return current->fs->umask;
}
EXPORT_SYMBOL(current_umask);

/*
 * Statically initialized fs_struct: one user, unlocked lock, zeroed
 * seqcount and a umask of 0022.
 * To be mentioned only in INIT_TASK.
 */
struct fs_struct init_fs = {
        .users                = 1,
        .lock                = __SPIN_LOCK_UNLOCKED(init_fs.lock),
        .seq                = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock),
        .umask                = 0022,
};



















































    1 








    1 

    1 







    1 


    1 
    1 



























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
// SPDX-License-Identifier: GPL-2.0
/*
 * Block stat tracking code
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/rculist.h>

#include "blk-stat.h"
#include "blk-mq.h"
#include "blk.h"

/*
 * Per-queue statistics bookkeeping.
 * @callbacks:  RCU-traversed list of registered blk_stat_callback entries.
 * @lock:       taken around updates to @callbacks and @accounting.
 * @accounting: count of blk_stat_enable_accounting() calls not yet undone
 *              by blk_stat_disable_accounting().
 */
struct blk_queue_stats {
        struct list_head callbacks;
        spinlock_t lock;
        int accounting;
};

/*
 * Reset @stat to the empty state: min at its largest possible value so the
 * first sample always replaces it, everything else at zero.
 */
void blk_rq_stat_init(struct blk_rq_stat *stat)
{
        stat->min = -1ULL;
        stat->mean = 0;
        stat->nr_samples = 0;
        stat->max = 0;
        stat->batch = 0;
}

/*
 * Merge @src into @dst.  @src is a per-cpu stat whose mean isn't
 * initialized, so its running total (batch) is used instead.
 * Nothing is done when adding @src's samples would not increase the sample
 * count (no samples in @src, or the count would wrap).
 */
void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
{
        if (dst->nr_samples + src->nr_samples > dst->nr_samples) {
                dst->min = min(dst->min, src->min);
                dst->max = max(dst->max, src->max);

                /* New mean = (src total + dst mean * dst count) / combined count. */
                dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples,
                                    dst->nr_samples + src->nr_samples);

                dst->nr_samples += src->nr_samples;
        }
}

/*
 * Record one sample @value in @stat: widen min/max as needed, add the value
 * to the running total and bump the sample count.
 */
void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value)
{
        if (value < stat->min)
                stat->min = value;
        if (value > stat->max)
                stat->max = value;

        stat->batch += value;
        stat->nr_samples++;
}

/*
 * Account request @rq, finishing at time @now, to every active statistics
 * callback on its queue.  The sample is @now minus the request's IO start
 * time, or 0 if @now is earlier than that; it is added to the current CPU's
 * bucket chosen by each callback's bucket_fn (negative buckets are skipped).
 */
void blk_stat_add(struct request *rq, u64 now)
{
        struct request_queue *q = rq->q;
        struct blk_stat_callback *cb;
        struct blk_rq_stat *stat;
        u64 value = 0;
        int cpu;

        /* Clamp to zero rather than letting the subtraction wrap. */
        if (now >= rq->io_start_time_ns)
                value = now - rq->io_start_time_ns;

        rcu_read_lock();
        cpu = get_cpu();
        list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
                int bucket;

                if (!blk_stat_is_active(cb))
                        continue;

                bucket = cb->bucket_fn(rq);
                if (bucket >= 0) {
                        stat = &per_cpu_ptr(cb->cpu_stat, cpu)[bucket];
                        blk_rq_stat_add(stat, value);
                }
        }
        put_cpu();
        rcu_read_unlock();
}

/*
 * Timer handler for a statistics callback: rebuild the aggregate buckets
 * from the per-cpu ones of all online CPUs, reset the per-cpu buckets, then
 * invoke the callback's timer_fn.
 */
static void blk_stat_timer_fn(struct timer_list *t)
{
        struct blk_stat_callback *cb = from_timer(cb, t, timer);
        struct blk_rq_stat *pcpu;
        unsigned int i;
        int cpu;

        /* Start from empty aggregate buckets. */
        for (i = 0; i < cb->buckets; i++)
                blk_rq_stat_init(&cb->stat[i]);

        /* Fold each CPU's buckets into the aggregate and clear them. */
        for_each_online_cpu(cpu) {
                pcpu = per_cpu_ptr(cb->cpu_stat, cpu);
                for (i = 0; i < cb->buckets; i++) {
                        blk_rq_stat_sum(&cb->stat[i], &pcpu[i]);
                        blk_rq_stat_init(&pcpu[i]);
                }
        }

        cb->timer_fn(cb);
}

/*
 * Allocate a statistics callback with @buckets aggregate buckets and
 * @buckets per-cpu buckets, and set up its timer to run
 * blk_stat_timer_fn().
 *
 * @timer_fn:  stored in the callback; called by blk_stat_timer_fn().
 * @bucket_fn: stored in the callback; maps a request to a bucket index.
 * @buckets:   number of buckets to allocate.
 * @data:      opaque pointer stored in the callback.
 *
 * Returns the new callback, or NULL if any allocation fails (everything
 * already allocated is freed again).
 */
struct blk_stat_callback *
blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
                        int (*bucket_fn)(const struct request *),
                        unsigned int buckets, void *data)
{
        struct blk_stat_callback *cb;

        cb = kmalloc(sizeof(*cb), GFP_KERNEL);
        if (!cb)
                return NULL;

        cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat),
                                 GFP_KERNEL);
        if (!cb->stat)
                goto err_free_cb;

        cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat),
                                      __alignof__(struct blk_rq_stat));
        if (!cb->cpu_stat)
                goto err_free_stat;

        cb->timer_fn = timer_fn;
        cb->bucket_fn = bucket_fn;
        cb->data = data;
        cb->buckets = buckets;
        timer_setup(&cb->timer, blk_stat_timer_fn, 0);

        return cb;

err_free_stat:
        kfree(cb->stat);
err_free_cb:
        kfree(cb);
        return NULL;
}

/*
 * Register @cb on @q: reset every possible CPU's buckets, then add the
 * callback to the queue's list and set QUEUE_FLAG_STATS, both under the
 * stats lock.
 */
void blk_stat_add_callback(struct request_queue *q,
                           struct blk_stat_callback *cb)
{
        struct blk_rq_stat *pcpu;
        unsigned long flags;
        unsigned int i;
        int cpu;

        for_each_possible_cpu(cpu) {
                pcpu = per_cpu_ptr(cb->cpu_stat, cpu);
                for (i = 0; i < cb->buckets; i++)
                        blk_rq_stat_init(&pcpu[i]);
        }

        spin_lock_irqsave(&q->stats->lock, flags);
        list_add_tail_rcu(&cb->list, &q->stats->callbacks);
        blk_queue_flag_set(QUEUE_FLAG_STATS, q);
        spin_unlock_irqrestore(&q->stats->lock, flags);
}

/*
 * Unregister @cb from @q.  Under the stats lock the callback is unlinked
 * from the RCU list, and QUEUE_FLAG_STATS is cleared if no callbacks remain
 * and accounting is not enabled.  The callback's timer is then deleted
 * synchronously, outside the lock.
 */
void blk_stat_remove_callback(struct request_queue *q,
                              struct blk_stat_callback *cb)
{
        unsigned long flags;

        spin_lock_irqsave(&q->stats->lock, flags);
        list_del_rcu(&cb->list);
        if (list_empty(&q->stats->callbacks) && !q->stats->accounting)
                blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
        spin_unlock_irqrestore(&q->stats->lock, flags);

        del_timer_sync(&cb->timer);
}

/*
 * RCU callback: free the per-cpu buckets, the aggregate buckets and the
 * blk_stat_callback that embeds @head.
 */
static void blk_stat_free_callback_rcu(struct rcu_head *head)
{
        struct blk_stat_callback *cb =
                container_of(head, struct blk_stat_callback, rcu);

        free_percpu(cb->cpu_stat);
        kfree(cb->stat);
        kfree(cb);
}

/*
 * Schedule @cb to be freed through call_rcu().  A NULL @cb is ignored.
 */
void blk_stat_free_callback(struct blk_stat_callback *cb)
{
        if (!cb)
                return;

        call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
}

/*
 * Drop one accounting user of @q.  When the count reaches zero and no
 * callbacks are registered, QUEUE_FLAG_STATS is cleared.
 */
void blk_stat_disable_accounting(struct request_queue *q)
{
        unsigned long flags;

        spin_lock_irqsave(&q->stats->lock, flags);
        q->stats->accounting--;
        if (q->stats->accounting == 0 && list_empty(&q->stats->callbacks))
                blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
        spin_unlock_irqrestore(&q->stats->lock, flags);
}
EXPORT_SYMBOL_GPL(blk_stat_disable_accounting);

/*
 * Add one accounting user to @q.  If this is the first user and no
 * callbacks are registered, QUEUE_FLAG_STATS is set.
 */
void blk_stat_enable_accounting(struct request_queue *q)
{
        unsigned long flags;
        int was_idle;

        spin_lock_irqsave(&q->stats->lock, flags);
        was_idle = (q->stats->accounting == 0);
        q->stats->accounting++;
        if (was_idle && list_empty(&q->stats->callbacks))
                blk_queue_flag_set(QUEUE_FLAG_STATS, q);
        spin_unlock_irqrestore(&q->stats->lock, flags);
}
EXPORT_SYMBOL_GPL(blk_stat_enable_accounting);

/*
 * Allocate a blk_queue_stats with an empty callback list, an initialized
 * lock and accounting at zero.  Returns NULL if the allocation fails.
 */
struct blk_queue_stats *blk_alloc_queue_stats(void)
{
        struct blk_queue_stats *stats = kmalloc(sizeof(*stats), GFP_KERNEL);

        if (stats) {
                INIT_LIST_HEAD(&stats->callbacks);
                spin_lock_init(&stats->lock);
                stats->accounting = 0;
        }

        return stats;
}

/*
 * Free @stats (NULL is ignored).  Warns if callbacks are still registered.
 */
void blk_free_queue_stats(struct blk_queue_stats *stats)
{
        if (stats) {
                WARN_ON(!list_empty(&stats->callbacks));

                kfree(stats);
        }
}

































































    3 










    4 









    4 









    4 































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "%s: " fmt, __func__

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/percpu-refcount.h>

/*
 * A percpu refcount starts out as just a set of percpu counters. Initially, we
 * don't try to detect the ref hitting 0 - which means that get/put can just
 * increment or decrement the local counter. Note that the counter on a
 * particular cpu can (and will) wrap - this is fine: when we go to shut down,
 * the percpu counters will all sum to the correct value.
 *
 * (More precisely: because modular arithmetic is commutative the sum of all the
 * percpu_count vars will be equal to what it would have been if all the gets
 * and puts were done to a single integer, even if some of the percpu integers
 * overflow or underflow).
 *
 * The real trick to implementing percpu refcounts is shutdown. We can't detect
 * the ref hitting 0 on every put - this would require global synchronization
 * and defeat the whole purpose of using percpu refs.
 *
 * What we do is require the user to keep track of the initial refcount; we know
 * the ref can't hit 0 before the user drops the initial ref, so as long as we
 * convert to non percpu mode before the initial ref is dropped everything
 * works.
 *
 * Converting to non percpu mode is done with some RCUish stuff in
 * percpu_ref_kill. Additionally, we need a bias value so that the
 * atomic_long_t can't hit 0 before we've added up all the percpu refs.
 */

#define PERCPU_COUNT_BIAS        (1LU << (BITS_PER_LONG - 1))

static DEFINE_SPINLOCK(percpu_ref_switch_lock);
static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);

/*
 * Return the percpu counter pointer held in @ref->percpu_count_ptr with the
 * __PERCPU_REF_ATOMIC_DEAD flag bits masked off.
 */
static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
{
        unsigned long raw = ref->percpu_count_ptr;

        raw &= ~__PERCPU_REF_ATOMIC_DEAD;

        return (unsigned long __percpu *)raw;
}

/**
 * percpu_ref_init - initialize a percpu refcount
 * @ref: percpu_ref to initialize
 * @release: function which will be called when refcount hits 0
 * @flags: PERCPU_REF_INIT_* flags
 * @gfp: allocation mask to use
 *
 * Initializes @ref.  @ref starts out in percpu mode with a refcount of 1 unless
 * @flags contains PERCPU_REF_INIT_ATOMIC or PERCPU_REF_INIT_DEAD.  These flags
 * change the start state to atomic with the latter setting the initial refcount
 * to 0.  See the definitions of PERCPU_REF_INIT_* flags for flag behaviors.
 *
 * Note that @release must not sleep - it may potentially be called from RCU
 * callback context by percpu_ref_kill().
 */
int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
                    unsigned int flags, gfp_t gfp)
{
        const bool init_dead = flags & PERCPU_REF_INIT_DEAD;
        const bool init_atomic_or_dead =
                flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD);
        size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS,
                             __alignof__(unsigned long));
        struct percpu_ref_data *data;
        unsigned long start_count;

        /* The low bits of the pointer carry flags, hence the alignment. */
        ref->percpu_count_ptr = (unsigned long)
                __alloc_percpu_gfp(sizeof(unsigned long), align, gfp);
        if (!ref->percpu_count_ptr)
                return -ENOMEM;

        data = kzalloc(sizeof(*ref->data), gfp);
        if (!data) {
                /* Undo the percpu allocation and leave @ref without storage. */
                free_percpu((void __percpu *)ref->percpu_count_ptr);
                ref->percpu_count_ptr = 0;
                return -ENOMEM;
        }

        data->force_atomic = flags & PERCPU_REF_INIT_ATOMIC;
        data->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT;

        /*
         * Atomic/dead starts skip the bias and always permit reinit;
         * a percpu start carries PERCPU_COUNT_BIAS in the atomic counter.
         */
        if (init_atomic_or_dead) {
                ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
                data->allow_reinit = true;
                start_count = 0;
        } else {
                start_count = PERCPU_COUNT_BIAS;
        }

        /* A dead ref starts without the initial reference. */
        if (init_dead)
                ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
        else
                start_count += 1;

        atomic_long_set(&data->count, start_count);

        data->release = release;
        data->confirm_switch = NULL;
        data->ref = ref;

        /* Attach the fully initialized data last. */
        ref->data = data;

        return 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_init);

/*
 * Free the percpu counter storage of @ref, if it still has any.
 *
 * Afterwards @ref->percpu_count_ptr holds only the __PERCPU_REF_ATOMIC_DEAD
 * flag bits, so percpu_count_ptr() returns NULL and calling this again does
 * nothing.
 */
static void __percpu_ref_exit(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count = percpu_count_ptr(ref);

        if (percpu_count) {
                /* non-NULL confirm_switch indicates switching in progress */
                WARN_ON_ONCE(ref->data && ref->data->confirm_switch);
                free_percpu(percpu_count);
                /* drop the pointer bits, keep only the ATOMIC and DEAD flags */
                ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
        }
}

/**
 * percpu_ref_exit - undo percpu_ref_init()
 * @ref: percpu_ref to exit
 *
 * This function exits @ref.  The caller is responsible for ensuring that
 * @ref is no longer in active use.  The usual places to invoke this
 * function from are the @ref->release() callback or in init failure path
 * where percpu_ref_init() succeeded but other parts of the initialization
 * of the embedding object failed.
 */
void percpu_ref_exit(struct percpu_ref *ref)
{
        struct percpu_ref_data *data = ref->data;
        unsigned long flags;

        /* release the percpu counter storage (no-op if already released) */
        __percpu_ref_exit(ref);

        /* nothing further to tear down when there is no ->data */
        if (!data)
                return;

        /*
         * Stash the final atomic count in the bits of ->percpu_count_ptr
         * above the flag bits before detaching ->data.  This is the value
         * percpu_ref_is_zero() reads when it finds ->data == NULL, and it
         * takes the same percpu_ref_switch_lock around that read.
         */
        spin_lock_irqsave(&percpu_ref_switch_lock, flags);
        ref->percpu_count_ptr |= atomic_long_read(&ref->data->count) <<
                __PERCPU_REF_FLAG_BITS;
        ref->data = NULL;
        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);

        /* @data is no longer reachable through @ref; free it outside the lock */
        kfree(data);
}
EXPORT_SYMBOL_GPL(percpu_ref_exit);

/*
 * Final step of a switch to atomic mode.  @rcu is the rcu_head embedded in
 * the percpu_ref_data being switched; in this file the function is called
 * from percpu_ref_switch_to_atomic_rcu().
 *
 * Runs the pending ->confirm_switch callback, clears it to mark the switch
 * as finished, wakes everyone sleeping on percpu_ref_switch_waitq, and drops
 * the reference taken when the switch was started.
 */
static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
{
        struct percpu_ref_data *data = container_of(rcu,
                        struct percpu_ref_data, rcu);
        struct percpu_ref *ref = data->ref;

        data->confirm_switch(ref);
        /* NULL ->confirm_switch means no switch is in progress any more */
        data->confirm_switch = NULL;
        wake_up_all(&percpu_ref_switch_waitq);

        /* the percpu counters are only kept if the ref may be reinitialized */
        if (!data->allow_reinit)
                __percpu_ref_exit(ref);

        /* drop ref taken in __percpu_ref_switch_to_atomic() */
        percpu_ref_put(ref);
}

/*
 * RCU callback queued by __percpu_ref_switch_to_atomic().
 *
 * Sums the percpu counter of every possible CPU, folds that sum into the
 * atomic @data->count while removing PERCPU_COUNT_BIAS, warns if the
 * resulting count is not positive, and then completes the switch through
 * percpu_ref_call_confirm_rcu().
 */
static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
{
        struct percpu_ref_data *data = container_of(rcu,
                        struct percpu_ref_data, rcu);
        struct percpu_ref *ref = data->ref;
        unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
        /* counts underflow reports across all refs; limits the dump below */
        static atomic_t underflows;
        unsigned long count = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                count += *per_cpu_ptr(percpu_count, cpu);

        pr_debug("global %lu percpu %lu\n",
                 atomic_long_read(&data->count), count);

        /*
         * It's crucial that we sum the percpu counters _before_ adding the sum
         * to &data->count; since gets could be happening on one cpu while puts
         * happen on another, adding a single cpu's count could cause
         * @data->count to hit 0 before we've got a consistent value - but the
         * sum of all the counts will be consistent and correct.
         *
         * Subtracting the bias value then has to happen _after_ adding count to
         * &data->count; we need the bias value to prevent &data->count from
         * reaching 0 before we add the percpu counts. But doing it at the same
         * time is equivalent and saves us atomic operations:
         */
        atomic_long_add((long)count - PERCPU_COUNT_BIAS, &data->count);

        /*
         * Warn once if the count is not positive; the extra pr_err() and
         * object dump are emitted only while fewer than four such events
         * have been counted.
         */
        if (WARN_ONCE(atomic_long_read(&data->count) <= 0,
                      "percpu ref (%ps) <= 0 (%ld) after switching to atomic",
                      data->release, atomic_long_read(&data->count)) &&
            atomic_inc_return(&underflows) < 4) {
                pr_err("%s(): percpu_ref underflow", __func__);
                mem_dump_obj(data);
        }

        /* @ref is viewed as dead on all CPUs, send out switch confirmation */
        percpu_ref_call_confirm_rcu(rcu);
}

/*
 * Do-nothing confirmation callback.  __percpu_ref_switch_to_atomic() installs
 * it when the caller passes no callback, because a non-NULL ->confirm_switch
 * is what marks a switch as being in progress.
 */
static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref)
{
}

/*
 * Start switching @ref from percpu to atomic mode.
 *
 * If @ref is already flagged atomic, only @confirm_switch (if any) is called,
 * synchronously.  Otherwise the ATOMIC flag is set, the confirmation callback
 * is recorded, an extra reference is taken and the counter summation is
 * deferred to percpu_ref_switch_to_atomic_rcu() via call_rcu_hurry().
 */
static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                          percpu_ref_func_t *confirm_switch)
{
        if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) {
                /* already atomic: nothing to switch, just confirm */
                if (confirm_switch)
                        confirm_switch(ref);
                return;
        }

        /* switching from percpu to atomic */
        ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;

        /*
         * Non-NULL ->confirm_switch is used to indicate that switching is
         * in progress.  Use noop one if unspecified.
         */
        ref->data->confirm_switch = confirm_switch ?:
                percpu_ref_noop_confirm_switch;

        percpu_ref_get(ref);        /* put after confirmation */
        call_rcu_hurry(&ref->data->rcu,
                       percpu_ref_switch_to_atomic_rcu);
}

/*
 * Switch @ref from atomic back to percpu mode.
 *
 * Does nothing if @ref is not flagged atomic.  Warns and bails out if the
 * ref was not set up to allow reinit.  Otherwise the bias is added back to
 * the atomic counter, every possible CPU's counter is zeroed, and only then
 * is the ATOMIC flag cleared.
 */
static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
        int cpu;

        /* the percpu storage must not have been freed by __percpu_ref_exit() */
        BUG_ON(!percpu_count);

        if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
                return;

        if (WARN_ON_ONCE(!ref->data->allow_reinit))
                return;

        /* restore the bias that was removed when switching to atomic mode */
        atomic_long_add(PERCPU_COUNT_BIAS, &ref->data->count);

        /*
         * Restore per-cpu operation.  smp_store_release() is paired
         * with READ_ONCE() in __ref_is_percpu() and guarantees that the
         * zeroing is visible to all percpu accesses which can see the
         * following __PERCPU_REF_ATOMIC clearing.
         */
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(percpu_count, cpu) = 0;

        smp_store_release(&ref->percpu_count_ptr,
                          ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
}

/*
 * Bring @ref into the mode its current state asks for: atomic if
 * ->force_atomic is set or the ref is dying, percpu otherwise.
 *
 * Must be called with percpu_ref_switch_lock held.  The lock is handed to
 * wait_event_lock_irq() while waiting for a previous switch to finish.
 * @confirm_switch is only used on the atomic path.
 */
static void __percpu_ref_switch_mode(struct percpu_ref *ref,
                                     percpu_ref_func_t *confirm_switch)
{
        struct percpu_ref_data *data = ref->data;

        lockdep_assert_held(&percpu_ref_switch_lock);

        /*
         * If the previous ATOMIC switching hasn't finished yet, wait for
         * its completion.  If the caller ensures that ATOMIC switching
         * isn't in progress, this function can be called from any context.
         */
        wait_event_lock_irq(percpu_ref_switch_waitq, !data->confirm_switch,
                            percpu_ref_switch_lock);

        if (data->force_atomic || percpu_ref_is_dying(ref))
                __percpu_ref_switch_to_atomic(ref, confirm_switch);
        else
                __percpu_ref_switch_to_percpu(ref);
}

/**
 * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
 * @ref: percpu_ref to switch to atomic mode
 * @confirm_switch: optional confirmation callback
 *
 * There's no reason to use this function for the usual reference counting.
 * Use percpu_ref_kill[_and_confirm]().
 *
 * Schedule switching of @ref to atomic mode.  All its percpu counts will
 * be collected to the main atomic counter.  On completion, when all CPUs
 * are guaranteed to be in atomic mode, @confirm_switch, which may not
 * block, is invoked.  This function may be invoked concurrently with all
 * the get/put operations and can safely be mixed with kill and reinit
 * operations.  Note that @ref will stay in atomic mode across kill/reinit
 * cycles until percpu_ref_switch_to_percpu() is called.
 *
 * This function may block if @ref is in the process of switching to atomic
 * mode.  If the caller ensures that @ref is not in the process of
 * switching to atomic mode, this function can be called from any context.
 */
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_switch)
{
        unsigned long flags;

        spin_lock_irqsave(&percpu_ref_switch_lock, flags);

        /* record the request; __percpu_ref_switch_mode() acts on this flag */
        ref->data->force_atomic = true;
        __percpu_ref_switch_mode(ref, confirm_switch);

        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);

/**
 * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode
 * @ref: percpu_ref to switch to atomic mode
 *
 * Schedule switching the ref to atomic mode, and wait for the
 * switch to complete.  Caller must ensure that no other thread
 * will switch back to percpu mode.
 */
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref)
{
        /* no caller callback; a noop one is used if a switch is started */
        percpu_ref_switch_to_atomic(ref, NULL);
        /* ->confirm_switch goes back to NULL once the switch has completed */
        wait_event(percpu_ref_switch_waitq, !ref->data->confirm_switch);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);

/**
 * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
 * @ref: percpu_ref to switch to percpu mode
 *
 * There's no reason to use this function for the usual reference counting.
 * To re-use an expired ref, use percpu_ref_reinit().
 *
 * Switch @ref to percpu mode.  This function may be invoked concurrently
 * with all the get/put operations and can safely be mixed with kill and
 * reinit operations.  This function reverses the sticky atomic state set
 * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic().  If @ref is
 * dying or dead, the actual switching takes place on the following
 * percpu_ref_reinit().
 *
 * This function may block if @ref is in the process of switching to atomic
 * mode.  If the caller ensures that @ref is not in the process of
 * switching to atomic mode, this function can be called from any context.
 */
void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
{
        unsigned long flags;

        spin_lock_irqsave(&percpu_ref_switch_lock, flags);

        /*
         * Clear the sticky atomic request.  __percpu_ref_switch_mode() still
         * keeps the ref atomic while it is dying.
         */
        ref->data->force_atomic = false;
        __percpu_ref_switch_mode(ref, NULL);

        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);

/**
 * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation
 * @ref: percpu_ref to kill
 * @confirm_kill: optional confirmation callback
 *
 * Equivalent to percpu_ref_kill() but also schedules kill confirmation if
 * @confirm_kill is not NULL.  @confirm_kill, which may not block, will be
 * called after @ref is seen as dead from all CPUs at which point all
 * further invocations of percpu_ref_tryget_live() will fail.  See
 * percpu_ref_tryget_live() for details.
 *
 * This function normally doesn't block and can be called from any context
 * but it may block if @confirm_kill is specified and @ref is in the
 * process of switching to atomic mode by percpu_ref_switch_to_atomic().
 *
 * There are no implied RCU grace periods between kill and release.
 */
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_kill)
{
        unsigned long flags;

        spin_lock_irqsave(&percpu_ref_switch_lock, flags);

        /* killing a ref that is already dying is a caller bug; warn once */
        WARN_ONCE(percpu_ref_is_dying(ref),
                  "%s called more than once on %ps!", __func__,
                  ref->data->release);

        /*
         * Flag the ref dead before choosing the mode.  NOTE(review): this is
         * presumably the bit percpu_ref_is_dying() tests, which would make
         * __percpu_ref_switch_mode() take the atomic path - confirm against
         * the header.
         */
        ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
        __percpu_ref_switch_mode(ref, confirm_kill);
        /* drop the initial reference */
        percpu_ref_put(ref);

        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);

/**
 * percpu_ref_is_zero - test whether a percpu refcount reached zero
 * @ref: percpu_ref to test
 *
 * Returns %true if @ref reached zero.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
bool percpu_ref_is_zero(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count;
        unsigned long count, flags;

        if (__ref_is_percpu(ref, &percpu_count))
                return false;

        /* protect us from being destroyed */
        spin_lock_irqsave(&percpu_ref_switch_lock, flags);
        if (ref->data)
                count = atomic_long_read(&ref->data->count);
        else
                count = ref->percpu_count_ptr >> __PERCPU_REF_FLAG_BITS;
        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);

        return count == 0;
}
EXPORT_SYMBOL_GPL(percpu_ref_is_zero);

/**
 * percpu_ref_reinit - re-initialize a percpu refcount
 * @ref: percpu_ref to re-initialize
 *
 * Re-initialize @ref so that it's in the same state as when it finished
 * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD.  @ref must have been
 * initialized successfully and reached 0 but not exited.
 *
 * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
 * this function is in progress.
 */
void percpu_ref_reinit(struct percpu_ref *ref)
{
        WARN_ON_ONCE(!percpu_ref_is_zero(ref));

        percpu_ref_resurrect(ref);
}
EXPORT_SYMBOL_GPL(percpu_ref_reinit);

/**
 * percpu_ref_resurrect - modify a percpu refcount from dead to live
 * @ref: percpu_ref to resurrect
 *
 * Modify @ref so that it's in the same state as before percpu_ref_kill() was
 * called. @ref must be dead but must not yet have exited.
 *
 * If @ref->release() frees @ref then the caller is responsible for
 * guaranteeing that @ref->release() does not get called while this
 * function is in progress.
 *
 * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
 * this function is in progress.
 */
void percpu_ref_resurrect(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count;
        unsigned long flags;

        spin_lock_irqsave(&percpu_ref_switch_lock, flags);

        /* sanity checks: the ref is expected to be dying and in atomic mode */
        WARN_ON_ONCE(!percpu_ref_is_dying(ref));
        WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count));

        /*
         * Undo the kill: clear DEAD, take back the reference that
         * percpu_ref_kill_and_confirm() dropped, then let
         * __percpu_ref_switch_mode() pick the mode.
         */
        ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
        percpu_ref_get(ref);
        __percpu_ref_switch_mode(ref, NULL);

        spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
}
EXPORT_SYMBOL_GPL(percpu_ref_resurrect);


























    1 























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
// SPDX-License-Identifier: GPL-2.0-only
/*
 * ialloc.c
 *
 * PURPOSE
 *        Inode allocation handling routines for the OSTA-UDF(tm) filesystem.
 *
 * COPYRIGHT
 *  (C) 1998-2001 Ben Fennema
 *
 * HISTORY
 *
 *  02/24/99 blf  Created.
 *
 */

#include "udfdecl.h"
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/slab.h>

#include "udf_i.h"
#include "udf_sb.h"

void udf_free_inode(struct inode *inode)
{
        udf_free_blocks(inode->i_sb, NULL, &UDF_I(inode)->i_location, 0, 1);
}

/*
 * Allocate and set up a new in-core inode below @dir.
 *
 * Returns the new inode, inserted locked into the inode hash and marked
 * dirty, or an ERR_PTR(): -ENOMEM when the inode or its i_data buffer cannot
 * be allocated, the error reported by udf_new_block() when no block can be
 * obtained, or -EIO when insert_inode_locked() fails.
 */
struct inode *udf_new_inode(struct inode *dir, umode_t mode)
{
        struct super_block *sb = dir->i_sb;
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_inode_info *dinfo = UDF_I(dir);
        uint32_t start = dinfo->i_location.logicalBlockNum;
        struct udf_inode_info *iinfo;
        struct inode *inode;
        udf_pblk_t block;
        size_t fe_size;
        int err;

        inode = new_inode(sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);

        iinfo = UDF_I(inode);

        /* Pick the file entry flavour; i_data is the block minus its header. */
        if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
                iinfo->i_efe = 1;
                if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
                        sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
                fe_size = sizeof(struct extendedFileEntry);
        } else {
                iinfo->i_efe = 0;
                fe_size = sizeof(struct fileEntry);
        }

        iinfo->i_data = kzalloc(inode->i_sb->s_blocksize - fe_size,
                                GFP_KERNEL);
        if (!iinfo->i_data) {
                err = -ENOMEM;
                goto out_bad_inode;
        }

        /* Ask for a block in the directory's partition, starting at its block. */
        err = -ENOSPC;
        block = udf_new_block(dir->i_sb, NULL,
                              dinfo->i_location.partitionReferenceNum,
                              start, &err);
        if (err)
                goto out_bad_inode;

        iinfo->i_unique = lvid_get_unique_id(sb);
        inode->i_generation = iinfo->i_unique;

        /* Ownership follows @dir and @mode unless the mount forces uid/gid. */
        inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET))
                inode->i_uid = sbi->s_uid;
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET))
                inode->i_gid = sbi->s_gid;

        /* The inode number is derived from the location set just above it. */
        iinfo->i_location.logicalBlockNum = block;
        iinfo->i_location.partitionReferenceNum =
                                dinfo->i_location.partitionReferenceNum;
        inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0);

        inode->i_blocks = 0;
        iinfo->i_lenEAttr = 0;
        iinfo->i_lenAlloc = 0;
        iinfo->i_use = 0;
        iinfo->i_checkpoint = 1;
        iinfo->i_extraPerms = FE_PERM_U_CHATTR;
        udf_update_extra_perms(inode, mode);

        /* Allocation descriptor type: in-ICB, then short, else long. */
        if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) {
                iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
        } else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) {
                iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
        } else {
                iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
        }

        simple_inode_init_ts(inode);
        iinfo->i_crtime = inode_get_mtime(inode);

        if (unlikely(insert_inode_locked(inode) < 0)) {
                err = -EIO;
                goto out_bad_inode;
        }

        mark_inode_dirty(inode);

        return inode;

out_bad_inode:
        /* Common failure exit: mark the inode bad and drop our reference. */
        make_bad_inode(inode);
        iput(inode);
        return ERR_PTR(err);
}


































































































    6 

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/*
 * Linux Security Module interfaces
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2001 James Morris <jmorris@intercode.com.au>
 * Copyright (C) 2001 Silicon Graphics, Inc. (Trust Technology Group)
 * Copyright (C) 2015 Intel Corporation.
 * Copyright (C) 2015 Casey Schaufler <casey@schaufler-ca.com>
 * Copyright (C) 2016 Mellanox Techonologies
 *
 *        This program is free software; you can redistribute it and/or modify
 *        it under the terms of the GNU General Public License as published by
 *        the Free Software Foundation; either version 2 of the License, or
 *        (at your option) any later version.
 *
 *        Due to this file being licensed under the GPL there is controversy over
 *        whether this permits you to write a module that #includes this file
 *        without placing your module under the GPL.  Please consult a lawyer for
 *        advice before doing this.
 *
 */

#ifndef __LINUX_LSM_HOOKS_H
#define __LINUX_LSM_HOOKS_H

#include <uapi/linux/lsm.h>
#include <linux/security.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/xattr.h>

/*
 * One function-pointer member per hook.  Each LSM_HOOK() entry in
 * lsm_hook_defs.h expands here to "RET (*NAME)(args);", so the union can hold
 * a pointer to any single hook implementation, stored in the member named
 * after the hook.
 */
union security_list_options {
        #define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__);
        #include "lsm_hook_defs.h"
        #undef LSM_HOOK
};

/*
 * One hlist_head per hook.  Each LSM_HOOK() entry in lsm_hook_defs.h expands
 * here to "struct hlist_head NAME;".  LSM_HOOK_INIT() takes the address of
 * the member that matches the hook name.
 */
struct security_hook_heads {
        #define LSM_HOOK(RET, DEFAULT, NAME, ...) struct hlist_head NAME;
        #include "lsm_hook_defs.h"
        #undef LSM_HOOK
} __randomize_layout;

/**
 * struct lsm_id - Identify a Linux Security Module.
 * @name: name of the LSM, must be approved by the LSM maintainers
 * @id: LSM ID number from uapi/linux/lsm.h
 *
 * Contains the information that identifies the LSM.
 */
struct lsm_id {
        /* name of the LSM */
        const char        *name;
        /* LSM ID number, one of the values from uapi/linux/lsm.h */
        u64                id;
};

/*
 * Security module hook list structure.
 * For use with generic list macros for common operations.
 */
struct security_hook_list {
        /* list linkage; not set by LSM_HOOK_INIT() */
        struct hlist_node                list;
        /* the security_hook_heads member this entry belongs to */
        struct hlist_head                *head;
        /* the callback, stored in the union member named after the hook */
        union security_list_options        hook;
        /*
         * Identity of the owning LSM; not set by LSM_HOOK_INIT().
         * NOTE(review): presumably filled in by security_add_hooks(), which
         * takes the lsm_id as an argument - confirm.
         */
        const struct lsm_id                *lsmid;
} __randomize_layout;

/*
 * Security blob size or offset data.
 */
struct lsm_blob_sizes {
        /*
         * NOTE(review): the lbs_* fields below are presumably one size/offset
         * per kind of kernel object named in the field (cred, file, inode,
         * ...) - confirm against the code that consumes this structure.
         */
        int        lbs_cred;
        int        lbs_file;
        int        lbs_inode;
        int        lbs_superblock;
        int        lbs_ipc;
        int        lbs_msg_msg;
        int        lbs_task;
        int        lbs_xattr_count; /* number of xattr slots in new_xattrs array */
};

/**
 * lsm_get_xattr_slot - Return the next available slot and increment the index
 * @xattrs: array storing LSM-provided xattrs
 * @xattr_count: number of already stored xattrs (updated)
 *
 * Retrieve the first available slot in the @xattrs array to fill with an xattr,
 * and increment @xattr_count.
 *
 * Return: The slot to fill in @xattrs if non-NULL, NULL otherwise.
 */
static inline struct xattr *lsm_get_xattr_slot(struct xattr *xattrs,
                                               int *xattr_count)
{
        struct xattr *slot;

        /* Without an array there is no slot and the counter is untouched. */
        if (unlikely(!xattrs))
                return NULL;

        /* Hand out the slot at the current index, then advance the index. */
        slot = xattrs + *xattr_count;
        *xattr_count += 1;

        return slot;
}

/*
 * LSM_RET_VOID is used as the default value in LSM_HOOK definitions for void
 * LSM hooks (in include/linux/lsm_hook_defs.h).
 */
#define LSM_RET_VOID ((void) 0)

/*
 * Initializing a security_hook_list structure takes
 * up a lot of space in a source file. This macro takes
 * care of the common case and reduces the amount of
 * text involved.
 *
 * HEAD names both the security_hook_heads member that .head points at and
 * the security_list_options member that receives HOOK.  The .list and .lsmid
 * members are left zero-initialized.
 */
#define LSM_HOOK_INIT(HEAD, HOOK) \
        { .head = &security_hook_heads.HEAD, .hook = { .HEAD = HOOK } }

extern struct security_hook_heads security_hook_heads;
extern char *lsm_names;

extern void security_add_hooks(struct security_hook_list *hooks, int count,
                               const struct lsm_id *lsmid);

#define LSM_FLAG_LEGACY_MAJOR        BIT(0)
#define LSM_FLAG_EXCLUSIVE        BIT(1)

/* Ordering class of an LSM; stored in the order member of struct lsm_info. */
enum lsm_order {
        LSM_ORDER_FIRST = -1,        /* This is only for capabilities. */
        LSM_ORDER_MUTABLE = 0,
        LSM_ORDER_LAST = 1,        /* This is only for integrity. */
};

/*
 * Descriptor for one LSM.  DEFINE_LSM() and DEFINE_EARLY_LSM() declare
 * objects of this type; the per-field comments say which members a
 * definition must provide.
 */
struct lsm_info {
        const char *name;        /* Required. */
        enum lsm_order order;        /* Optional: default is LSM_ORDER_MUTABLE */
        unsigned long flags;        /* Optional: flags describing LSM */
        int *enabled;                /* Optional: controlled by CONFIG_LSM */
        int (*init)(void);        /* Required. */
        struct lsm_blob_sizes *blobs; /* Optional: for blob sharing. */
};

extern struct lsm_info __start_lsm_info[], __end_lsm_info[];
extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];

/*
 * Declare a static struct lsm_info named __lsm_<lsm>, kept by the compiler
 * (__used), placed in the ".lsm_info.init" section and aligned to
 * sizeof(unsigned long).  The macro ends at the declarator, so the user
 * supplies the initializer after it.
 * NOTE(review): __start_lsm_info/__end_lsm_info presumably delimit this
 * section - confirm against the linker script.
 */
#define DEFINE_LSM(lsm)                                                        \
        static struct lsm_info __lsm_##lsm                                \
                __used __section(".lsm_info.init")                        \
                __aligned(sizeof(unsigned long))

/*
 * Same shape as DEFINE_LSM(), but the object is named __early_lsm_<lsm> and
 * is placed in the ".early_lsm_info.init" section.
 * NOTE(review): __start_early_lsm_info/__end_early_lsm_info presumably
 * delimit that section - confirm against the linker script.
 */
#define DEFINE_EARLY_LSM(lsm)                                                \
        static struct lsm_info __early_lsm_##lsm                        \
                __used __section(".early_lsm_info.init")                \
                __aligned(sizeof(unsigned long))

extern int lsm_inode_alloc(struct inode *inode);

#endif /* ! __LINUX_LSM_HOOKS_H */













































































































































































































































































































































































































































































































































































































































































































































    4 

    5 























































    4 

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
/* SPDX-License-Identifier: GPL-2.0+ */
#ifndef _LINUX_MAPLE_TREE_H
#define _LINUX_MAPLE_TREE_H
/*
 * Maple Tree - An RCU-safe adaptive tree for storing ranges
 * Copyright (c) 2018-2022 Oracle
 * Authors:     Liam R. Howlett <Liam.Howlett@Oracle.com>
 *              Matthew Wilcox <willy@infradead.org>
 */

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
/* #define CONFIG_MAPLE_RCU_DISABLED */

/*
 * Allocated nodes are mutable until they have been inserted into the tree,
 * at which time they cannot change their type until they have been removed
 * from the tree and an RCU grace period has passed.
 *
 * Removed nodes have their ->parent set to point to themselves.  RCU readers
 * check ->parent before relying on the value that they loaded from the
 * slots array.  This lets us reuse the slots array for the RCU head.
 *
 * Nodes in the tree point to their parent unless bit 0 is set.
 */
#if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64)
/* 64bit sizes */
#define MAPLE_NODE_SLOTS        31        /* 256 bytes including ->parent */
#define MAPLE_RANGE64_SLOTS        16        /* 256 bytes */
#define MAPLE_ARANGE64_SLOTS        10        /* 240 bytes */
/* Length of the slot[] array in struct maple_alloc */
#define MAPLE_ALLOC_SLOTS        (MAPLE_NODE_SLOTS - 1)
#else
/* 32bit sizes */
#define MAPLE_NODE_SLOTS        63        /* 256 bytes including ->parent */
#define MAPLE_RANGE64_SLOTS        32        /* 256 bytes */
#define MAPLE_ARANGE64_SLOTS        21        /* 240 bytes */
/* Length of the slot[] array in struct maple_alloc */
#define MAPLE_ALLOC_SLOTS        (MAPLE_NODE_SLOTS - 2)
#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */

/* Selects the low 8 bits (255 == 0xFF) of a value */
#define MAPLE_NODE_MASK                255UL

/*
 * The node->parent of the root node has bit 0 set and the rest of the pointer
 * is a pointer to the tree itself.  No more bits are available in this pointer
 * (on m68k, the data structure may only be 2-byte aligned).
 *
 * Internal non-root nodes can only have maple_range_* nodes as parents.  The
 * parent pointer is 256B aligned like all other tree nodes.  When storing 32
 * or 64 bit values, the offset can fit into 4 bits.  The 16 bit values need an
 * extra bit to store the offset.  This extra bit comes from a reuse of the last
 * bit in the node type.  This is possible by using bit 1 to indicate if bit 2
 * is part of the type or the slot.
 *
 * Once the type is decided, the decision of an allocation range type or a range
 * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE
 * flag.
 *
 *  Node types:
 *   0x??1 = Root
 *   0x?00 = 16 bit nodes
 *   0x010 = 32 bit nodes
 *   0x110 = 64 bit nodes
 *
 *  Slot size and location in the parent pointer:
 *   type  : slot location
 *   0x??1 : Root
 *   0x?00 : 16 bit values, type in 0-1, slot in 2-6
 *   0x010 : 32 bit values, type in 0-2, slot in 3-6
 *   0x110 : 64 bit values, type in 0-2, slot in 3-6
 */

/*
 * This metadata is used to optimize the gap updating code and in reverse
 * searching for gaps or any other code that needs to find the end of the data.
 */
struct maple_metadata {
        unsigned char end;        /* NOTE(review): presumably where the data in the node ends - confirm */
        unsigned char gap;        /* NOTE(review): presumably locates the gap tracked for the node - confirm */
};

/*
 * Leaf nodes do not store pointers to nodes, they store user data.  Users may
 * store almost any bit pattern.  As noted above, the optimisation of storing an
 * entry at 0 in the root pointer cannot be done for data which have the bottom
 * two bits set to '10'.  We also reserve values with the bottom two bits set to
 * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use.  Some APIs
 * return errnos as a negative errno shifted right by two bits and the bottom
 * two bits set to '10', and while choosing to store these values in the array
 * is not an error, it may lead to confusion if you're testing for an error with
 * mas_is_err().
 *
 * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits
 * 3-6), bit 2 is reserved.  That leaves bits 0-1 unused for now.
 *
 * In regular B-Tree terms, pivots are called keys.  The term pivot is used to
 * indicate that the tree is specifying ranges.  Pivots may appear in the
 * subtree with an entry attached to the value whereas keys are unique to a
 * specific position of a B-tree.  Pivot values are inclusive of the slot with
 * the same index.
 */

/*
 * Range node layout: a parent pointer, MAPLE_RANGE64_SLOTS - 1 pivots and
 * MAPLE_RANGE64_SLOTS slots.  The anonymous struct in the union places the
 * metadata on the storage of the final slot.
 */
struct maple_range_64 {
        struct maple_pnode *parent;
        unsigned long pivot[MAPLE_RANGE64_SLOTS - 1];
        union {
                void __rcu *slot[MAPLE_RANGE64_SLOTS];
                struct {
                        /* Skips over every slot except the last one */
                        void __rcu *pad[MAPLE_RANGE64_SLOTS - 1];
                        /* Shares storage with slot[MAPLE_RANGE64_SLOTS - 1] */
                        struct maple_metadata meta;
                };
        };
};

/*
 * At tree creation time, the user can specify that they're willing to trade off
 * storing fewer entries in a tree in return for storing more information in
 * each node.
 *
 * The maple tree supports recording the largest range of NULL entries available
 * in this node, also called gaps.  This optimises the tree for allocating a
 * range.
 */
struct maple_arange_64 {
        struct maple_pnode *parent;
        unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1];        /* One fewer pivot than slots */
        void __rcu *slot[MAPLE_ARANGE64_SLOTS];
        unsigned long gap[MAPLE_ARANGE64_SLOTS];        /* One gap value per slot */
        struct maple_metadata meta;        /* Separate field here, not overlaid on a slot */
};

/*
 * Node used to hold nodes allocated for an operation (see the description of
 * state->alloc in the ma_state comment in this header).
 */
struct maple_alloc {
        unsigned long total;                /* Total number of nodes allocated */
        unsigned char node_count;        /* Number of allocated nodes held in this node */
        unsigned int request_count;        /* NOTE(review): presumably the number of nodes still requested - confirm */
        struct maple_alloc *slot[MAPLE_ALLOC_SLOTS];        /* Further allocated nodes */
};

/* Node view made of the parent pointer followed by a single next link. */
struct maple_topiary {
        struct maple_pnode *parent;
        struct maple_enode *next; /* Overlaps the pivot */
};

/*
 * Node types.  The comments in this header place the type in bits 3-6 of an
 * encoded node pointer.
 */
enum maple_type {
        maple_dense,        /* NOTE(review): no matching struct is visible in this header - confirm layout */
        maple_leaf_64,        /* NOTE(review): presumably a leaf using the maple_range_64 layout - confirm */
        maple_range_64,        /* See struct maple_range_64 */
        maple_arange_64,        /* See struct maple_arange_64 */
};


/**
 * DOC: Maple tree flags
 *
 * * MT_FLAGS_ALLOC_RANGE        - Track gaps in this tree
 * * MT_FLAGS_USE_RCU                - Operate in RCU mode
 * * MT_FLAGS_HEIGHT_OFFSET        - The position of the tree height in the flags
 * * MT_FLAGS_HEIGHT_MASK        - The mask for the maple tree height value
 * * MT_FLAGS_LOCK_MASK                - How the mt_lock is used
 * * MT_FLAGS_LOCK_IRQ                - Acquired irq-safe
 * * MT_FLAGS_LOCK_BH                - Acquired bh-safe
 * * MT_FLAGS_LOCK_EXTERN        - mt_lock is not used
 *
 * MAPLE_HEIGHT_MAX        The largest height that can be stored
 */
#define MT_FLAGS_ALLOC_RANGE        0x01
#define MT_FLAGS_USE_RCU        0x02
/* A bit position (shift count), not a flag bit: the height mask starts at bit 2 */
#define MT_FLAGS_HEIGHT_OFFSET        0x02
#define MT_FLAGS_HEIGHT_MASK        0x7C        /* Bits 2-6 */
#define MT_FLAGS_LOCK_MASK        0x300        /* Bits 8-9 */
#define MT_FLAGS_LOCK_IRQ        0x100
#define MT_FLAGS_LOCK_BH        0x200
#define MT_FLAGS_LOCK_EXTERN        0x300        /* Both lock bits set */
/* NOTE(review): not listed in the DOC block above; presumably records that a cyclic allocation wrapped - confirm */
#define MT_FLAGS_ALLOC_WRAPPED        0x0800

/* Equals MT_FLAGS_HEIGHT_MASK >> MT_FLAGS_HEIGHT_OFFSET */
#define MAPLE_HEIGHT_MAX        31


/* Four bits of type, located at bit 3 (i.e. bits 3-6) */
#define MAPLE_NODE_TYPE_MASK        0x0F
#define MAPLE_NODE_TYPE_SHIFT        0x03

/* Values below this with the bottom two bits '10' are reserved for internal use */
#define MAPLE_RESERVED_RANGE        4096

#ifdef CONFIG_LOCKDEP
/* With lockdep, the tree can record the lockdep map of an external lock. */
typedef struct lockdep_map *lockdep_map_p;
/* True when no external lock map is recorded or lock_is_held() reports it held. */
#define mt_lock_is_held(mt)                                             \
        (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock))

/*
 * As above, but queries lock_is_held_type() with type 0.
 * NOTE(review): 0 presumably selects a write (exclusive) hold - confirm.
 */
#define mt_write_lock_is_held(mt)                                        \
        (!(mt)->ma_external_lock ||                                        \
         lock_is_held_type((mt)->ma_external_lock, 0))

/* Record the dep_map of @lock as the tree's external lock. */
#define mt_set_external_lock(mt, lock)                                        \
        (mt)->ma_external_lock = &(lock)->dep_map

/* Clears the external lock map; note @mt is an object here, not a pointer. */
#define mt_on_stack(mt)                        (mt).ma_external_lock = NULL
#else
/* Without lockdep the type is empty and every check reports the lock as held. */
typedef struct { /* nothing */ } lockdep_map_p;
#define mt_lock_is_held(mt)                1
#define mt_write_lock_is_held(mt)        1
#define mt_set_external_lock(mt, lock)        do { } while (0)
#define mt_on_stack(mt)                        do { } while (0)
#endif

/*
 * If the tree contains a single entry at index 0, it is usually stored in
 * tree->ma_root.  To optimise for the page cache, an entry which ends in '00',
 * '01' or '11' is stored in the root, but an entry which ends in '10' will be
 * stored in a node.  Bits 3-6 are used to store enum maple_type.
 *
 * The flags are used both to store some immutable information about this tree
 * (set at tree creation time) and dynamic information set under the spinlock.
 *
 * Another use of flags is to indicate global states of the tree.  This is the
 * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in
 * RCU mode.  This mode was added to allow the tree to reuse nodes instead of
 * re-allocating and RCU freeing nodes when there is a single user.
 */
struct maple_tree {
        /* The internal spinlock and the external lock map share storage */
        union {
                spinlock_t        ma_lock;
                lockdep_map_p        ma_external_lock;
        };
        unsigned int        ma_flags;        /* MT_FLAGS_* bits, including the tree height */
        void __rcu      *ma_root;        /* NULL when the tree is empty (see mtree_empty()) */
};

/**
 * MTREE_INIT() - Initialize a maple tree
 * @name: The maple tree name
 * @__flags: The maple tree flags
 *
 * Expands to a designated initializer for a struct maple_tree: the internal
 * spinlock starts unlocked, the flags are stored as given and the root pointer
 * starts out NULL.  @__flags is parenthesized, matching MTREE_INIT_EXT().
 */
#define MTREE_INIT(name, __flags) {                                        \
        .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock),                \
        .ma_flags = (__flags),                                                \
        .ma_root = NULL,                                                \
}

/**
 * MTREE_INIT_EXT() - Initialize a maple tree with an external lock.
 * @name: The tree name
 * @__flags: The maple tree flags
 * @__lock: The external lock
 *
 * With CONFIG_LOCKDEP the dep_map of @__lock is recorded in the tree.
 * Without it, @__lock is ignored and this is the same as MTREE_INIT().
 */
#ifdef CONFIG_LOCKDEP
#define MTREE_INIT_EXT(name, __flags, __lock) {                                \
        .ma_external_lock = &(__lock).dep_map,                                \
        .ma_flags = (__flags),                                                \
        .ma_root = NULL,                                                \
}
#else
#define MTREE_INIT_EXT(name, __flags, __lock)        MTREE_INIT(name, __flags)
#endif

/* Define a struct maple_tree called @name, initialized by MTREE_INIT() with no flags set. */
#define DEFINE_MTREE(name)                                                \
        struct maple_tree name = MTREE_INIT(name, 0)

/*
 * Lock helpers for the spinlock embedded in the tree.  @mt is a pointer to a
 * struct maple_tree.
 */
#define mtree_lock(mt)                spin_lock((&(mt)->ma_lock))
/*
 * The parameter must be named mt because the body refers to mt; with any
 * other name the expansion would pick up whatever "mt" is in scope at the
 * call site instead of the argument.  @subclass is handed to
 * spin_lock_nested() unchanged.
 */
#define mtree_lock_nested(mt, subclass) \
                spin_lock_nested((&(mt)->ma_lock), subclass)
#define mtree_unlock(mt)        spin_unlock((&(mt)->ma_lock))

/*
 * The Maple Tree squeezes various bits in at various points which aren't
 * necessarily obvious.  Usually, this is done by observing that pointers are
 * N-byte aligned and thus the bottom log_2(N) bits are available for use.  We
 * don't use the high bits of pointers to store additional information because
 * we don't know what bits are unused on any given architecture.
 *
 * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8
 * low bits for our own purposes.  Nodes are currently of 4 types:
 * 1. Single pointer (Range is 0-0)
 * 2. Non-leaf Allocation Range nodes
 * 3. Non-leaf Range nodes
 * 4. Leaf Range nodes
 *
 * All nodes consist of a number of node slots, pivots, and a parent pointer.
 */

/*
 * A tree node.  Every member of the union is a different view of the same
 * storage.
 */
struct maple_node {
        union {
                /* Generic view: parent pointer followed by the slots */
                struct {
                        struct maple_pnode *parent;
                        void __rcu *slot[MAPLE_NODE_SLOTS];
                };
                /*
                 * View that places an RCU head after the parent pointer, on
                 * storage the generic view uses for slots.
                 */
                struct {
                        void *pad;        /* Occupies the position of ->parent */
                        struct rcu_head rcu;
                        struct maple_enode *piv_parent;
                        unsigned char parent_slot;
                        enum maple_type type;
                        unsigned char slot_len;
                        unsigned int ma_flags;
                };
                struct maple_range_64 mr64;
                struct maple_arange_64 ma64;
                struct maple_alloc alloc;
        };
};

/*
 * More complicated stores can cause two nodes to become one or three and
 * potentially alter the height of the tree.  Either half of the tree may need
 * to be rebalanced against the other.  The ma_topiary struct is used to track
 * which nodes have been 'cut' from the tree so that the change can be done
 * safely at a later date.  This is done to support RCU.
 */
struct ma_topiary {
        struct maple_enode *head;        /* NOTE(review): presumably the first cut node - confirm */
        struct maple_enode *tail;        /* NOTE(review): presumably the last cut node - confirm */
        struct maple_tree *mtree;        /* Tree supplied to MA_TOPIARY() */
};

/*
 * Interface operating directly on a struct maple_tree.  Only the declarations
 * are visible in this header.
 */
void *mtree_load(struct maple_tree *mt, unsigned long index);

/* Insert variants; the range forms take an inclusive first/last pair. */
int mtree_insert(struct maple_tree *mt, unsigned long index,
                void *entry, gfp_t gfp);
int mtree_insert_range(struct maple_tree *mt, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp);
/* Allocation variants; each takes a pointer (@startp) for the chosen start. */
int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp);
int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp);
int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp);

int mtree_store_range(struct maple_tree *mt, unsigned long first,
                      unsigned long last, void *entry, gfp_t gfp);
int mtree_store(struct maple_tree *mt, unsigned long index,
                void *entry, gfp_t gfp);
void *mtree_erase(struct maple_tree *mt, unsigned long index);

/* NOTE(review): the __mt_ forms presumably expect the caller to handle locking - confirm */
int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);

void mtree_destroy(struct maple_tree *mt);
void __mt_destroy(struct maple_tree *mt);

/**
 * mtree_empty() - Check whether a maple tree holds no entries.
 * @mt: Maple Tree.
 *
 * Only the root pointer is inspected.
 *
 * Context: Any context.
 * Return: %true if the root pointer is NULL, %false otherwise.
 */
static inline bool mtree_empty(const struct maple_tree *mt)
{
        return !mt->ma_root;
}

/* Advanced API */

/*
 * Maple State Status
 * ma_active means the maple state is pointing to a node and offset and can
 * continue operating on the tree.
 * ma_start means we have not searched the tree.
 * ma_root means we have searched the tree and the entry we found lives in
 * the root of the tree (ie it has index 0, length 1 and is the only entry in
 * the tree).
 * ma_none means we have searched the tree and there is no node in the
 * tree for this entry.  For example, we searched for index 1 in an empty
 * tree.  Or we have a tree which points to a full leaf node and we
 * searched for an entry which is larger than can be contained in that
 * leaf node.
 * ma_pause means the data within the maple state may be stale, restart the
 * operation
 * ma_overflow means the search has reached the upper limit of the search
 * ma_underflow means the search has reached the lower limit of the search
 * ma_error means there was an error, check the node for the error number.
 */
enum maple_status {
        ma_active,        /* Pointing to a node and offset; can continue operating */
        ma_start,        /* The tree has not been searched yet */
        ma_root,        /* The entry found lives in the root of the tree */
        ma_none,        /* Searched, but no node in the tree covers the entry */
        ma_pause,        /* State may be stale; restart the operation */
        ma_overflow,        /* The search reached its upper limit */
        ma_underflow,        /* The search reached its lower limit */
        ma_error,        /* An error occurred; the error number is in the node */
};

/*
 * The maple state is defined in the struct ma_state and is used to keep track
 * of information during operations, and even between operations when using the
 * advanced API.
 *
 * If state->node has bit 0 set then it references a tree location which is not
 * a node (eg the root).  If bit 1 is set, the rest of the bits are a negative
 * errno.  Bit 2 (the 'unallocated slots' bit) is clear.  Bits 3-6 indicate the
 * node type.
 *
 * state->alloc either has a request number of nodes or an allocated node.  If
 * state->alloc has a requested number of nodes, the first bit will be set (0x1)
 * and the remaining bits are the value.  If state->alloc is a node, then the
 * node will be of type maple_alloc.  maple_alloc has MAPLE_NODE_SLOTS - 1 for
 * storing more allocated nodes, a total number of nodes allocated, and the
 * node_count in this node.  node_count is the number of allocated nodes in this
 * node.  The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further
 * nodes into state->alloc->slot[0]'s node.  Nodes are taken from state->alloc
 * by removing a node from the state->alloc node until state->alloc->node_count
 * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted
 * to state->alloc.  Nodes are pushed onto state->alloc by putting the current
 * state->alloc into the pushed node's slot[0].
 *
 * The state also contains the implied min/max of the state->node, the depth of
 * this search, and the offset. The implied min/max are either from the parent
 * node or are 0-oo for the root node.  The depth is incremented or decremented
 * every time a node is walked down or up.  The offset is the slot/pivot of
 * interest in the node - either for reading or writing.
 *
 * When returning a value the maple state index and last respectively contain
 * the start and end of the range for the entry.  Ranges are inclusive in the
 * Maple Tree.
 *
 * The status of the state is used to determine how the next action should treat
 * the state.  For instance, if the status is ma_start then the next action
 * should start at the root of the tree and walk down.  If the status is
 * ma_pause then the node may be stale data and should be discarded.  If the
 * status is ma_overflow, then the last action hit the upper limit.
 *
 */
struct ma_state {
        struct maple_tree *tree;        /* The tree we're operating in */
        unsigned long index;                /* The index we're operating on - range start */
        unsigned long last;                /* The last index we're operating on - range end */
        struct maple_enode *node;        /* The node containing this entry */
        unsigned long min;                /* The minimum index of this node - implied pivot min */
        unsigned long max;                /* The maximum index of this node - implied pivot max */
        struct maple_alloc *alloc;        /* Allocated nodes for this operation */
        enum maple_status status;        /* The status of the state (active, start, none, etc) */
        unsigned char depth;                /* depth of tree descent during write */
        unsigned char offset;                /* The slot/pivot of interest in the node */
        unsigned char mas_flags;        /* NOTE(review): meaning not visible in this header - confirm */
        unsigned char end;                /* The end of the node */
};

/*
 * Write state: wraps a struct ma_state together with details of the node
 * being written and the entry involved.
 */
struct ma_wr_state {
        struct ma_state *mas;                /* The maple state this write operates on */
        struct maple_node *node;        /* Decoded mas->node */
        unsigned long r_min;                /* range min */
        unsigned long r_max;                /* range max */
        enum maple_type type;                /* mas->node type */
        unsigned char offset_end;        /* The offset where the write ends */
        unsigned long *pivots;                /* mas->node->pivots pointer */
        unsigned long end_piv;                /* The pivot at the offset end */
        void __rcu **slots;                /* mas->node->slots pointer */
        void *entry;                        /* The entry to write */
        void *content;                        /* The existing entry that is being overwritten */
};

/*
 * Lock helpers that reach the tree's internal spinlock through a maple state;
 * @mas is a pointer to a struct ma_state whose ->tree is set.
 */
#define mas_lock(mas)           spin_lock(&((mas)->tree->ma_lock))
#define mas_lock_nested(mas, subclass) \
                spin_lock_nested(&((mas)->tree->ma_lock), subclass)
#define mas_unlock(mas)         spin_unlock(&((mas)->tree->ma_lock))

/*
 * Special values for ma_state.node.
 * MA_ERROR represents an errno.  After dropping the lock and attempting
 * to resolve the error, the walk would have to be restarted from the
 * top of the tree as the tree may have been modified.
 *
 * The value is the errno shifted left by two bits with the bottom two bits
 * set to '10'.  @err is parenthesized so the cast and the shift apply to the
 * whole argument even when it is a compound expression.
 */
#define MA_ERROR(err) \
                ((struct maple_enode *)(((unsigned long)(err) << 2) | 2UL))

/*
 * Declare a struct ma_state called @name for tree @mt covering the range
 * @first to @end.  The state starts in ma_start with the widest possible
 * node limits (0 to ULONG_MAX); members not named here are zero-initialized.
 */
#define MA_STATE(name, mt, first, end)                                        \
        struct ma_state name = {                                        \
                .tree = mt,                                                \
                .index = first,                                                \
                .last = end,                                                \
                .node = NULL,                                                \
                .status = ma_start,                                        \
                .min = 0,                                                \
                .max = ULONG_MAX,                                        \
                .alloc = NULL,                                                \
                .mas_flags = 0,                                                \
        }

/*
 * Declare a struct ma_wr_state called @name that refers to the maple state
 * pointer @ma_state and carries @wr_entry as the entry to write.  Members not
 * named here are zero-initialized.
 */
#define MA_WR_STATE(name, ma_state, wr_entry)                                \
        struct ma_wr_state name = {                                        \
                .mas = ma_state,                                        \
                .content = NULL,                                        \
                .entry = wr_entry,                                        \
        }

/* Declare a struct ma_topiary called @name for @tree with NULL head and tail. */
#define MA_TOPIARY(name, tree)                                                \
        struct ma_topiary name = {                                        \
                .head = NULL,                                                \
                .tail = NULL,                                                \
                .mtree = tree,                                                \
        }

/*
 * Advanced API entry points operating on a struct ma_state.  Only the
 * declarations are visible in this header.
 */
void *mas_walk(struct ma_state *mas);
void *mas_store(struct ma_state *mas, void *entry);
void *mas_erase(struct ma_state *mas);
int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp);
void mas_store_prealloc(struct ma_state *mas, void *entry);
void *mas_find(struct ma_state *mas, unsigned long max);
void *mas_find_range(struct ma_state *mas, unsigned long max);
void *mas_find_rev(struct ma_state *mas, unsigned long min);
/* NOTE(review): the limit is named max here but min in mas_find_rev() - confirm which bound is meant */
void *mas_find_range_rev(struct ma_state *mas, unsigned long max);
int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp);

bool mas_nomem(struct ma_state *mas, gfp_t gfp);
void mas_pause(struct ma_state *mas);
void maple_tree_init(void);
void mas_destroy(struct ma_state *mas);
int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries);

void *mas_prev(struct ma_state *mas, unsigned long min);
/* NOTE(review): the limit is named max here but min in mas_prev() - confirm which bound is meant */
void *mas_prev_range(struct ma_state *mas, unsigned long max);
void *mas_next(struct ma_state *mas, unsigned long max);
void *mas_next_range(struct ma_state *mas, unsigned long max);

int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max,
                   unsigned long size);
/*
 * This finds an empty area from the highest address to the lowest.
 * AKA "Topdown" version.
 */
int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
                       unsigned long max, unsigned long size);

/*
 * mas_init() - Prepare a maple state for a single index.
 * @mas: Maple state to initialise.
 * @tree: Tree the state will operate on.
 * @addr: Index used for both the start and the end of the range.
 *
 * The state is cleared first, so every member not assigned below is zero.
 * The status is ma_start and the upper node limit is ULONG_MAX.
 */
static inline void mas_init(struct ma_state *mas, struct maple_tree *tree,
                            unsigned long addr)
{
        memset(mas, 0, sizeof(*mas));

        mas->node = NULL;
        mas->status = ma_start;
        mas->max = ULONG_MAX;
        mas->last = addr;
        mas->index = addr;
        mas->tree = tree;
}

static inline bool mas_is_active(struct ma_state *mas)
{
        return mas->status == ma_active;
}

static inline bool mas_is_err(struct ma_state *mas)
{
        return mas->status == ma_error;
}

/**
 * mas_reset() - Reset a Maple Tree operation state.
 * @mas: Maple Tree operation state.
 *
 * Drops the node reference and puts the status back to ma_start, clearing
 * any error or walk position, so the next walk begins at the root.  Use this
 * if you have dropped the lock and want to reuse the ma_state.
 *
 * Context: Any context.
 */
static __always_inline void mas_reset(struct ma_state *mas)
{
        mas->node = NULL;
        mas->status = ma_start;
}

/**
 * mas_for_each() - Iterate over a range of the maple tree.
 * @__mas: Maple Tree operation state (pointer to struct ma_state)
 * @__entry: Entry retrieved from the tree
 * @__max: maximum index to retrieve from the tree
 *
 * Expands to a while loop that assigns the result of mas_find() to @__entry
 * on every pass and stops when mas_find() returns NULL.
 *
 * When returned, mas->index and mas->last will hold the entire range for the
 * entry.
 *
 * Note: may return the zero entry.
 */
#define mas_for_each(__mas, __entry, __max) \
        while (((__entry) = mas_find((__mas), (__max))) != NULL)

#ifdef CONFIG_DEBUG_MAPLE_TREE
/* Output format selector passed to mt_dump(). */
enum mt_dump_format {
        mt_dump_dec,        /* NOTE(review): presumably decimal output - confirm */
        mt_dump_hex,        /* NOTE(review): presumably hexadecimal output - confirm */
};

/* Counters updated by the MT_/MAS_ BUG_ON and WARN_ON debug macros below. */
extern atomic_t maple_tree_tests_run;
extern atomic_t maple_tree_tests_passed;

/* Debug helpers; only declared when CONFIG_DEBUG_MAPLE_TREE is set. */
void mt_dump(const struct maple_tree *mt, enum mt_dump_format format);
void mas_dump(const struct ma_state *mas);
void mas_wr_dump(const struct ma_wr_state *wr_mas);
void mt_validate(struct maple_tree *mt);
void mt_cache_shrink(void);
/*
 * Debug variant of MT_BUG_ON(): counts the check as run, and when @__x is
 * true logs the location, dumps @__tree and the counters and prints a stack
 * trace; otherwise counts the check as passed.  It does not call BUG_ON().
 * NOTE(review): @__x is evaluated twice when true (condition and pr_info
 * argument), so it should be free of side effects.
 */
#define MT_BUG_ON(__tree, __x) do {                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__x) {                                                        \
                pr_info("BUG at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mt_dump(__tree, mt_dump_hex);                                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
} while (0)

/*
 * Debug variant of MAS_BUG_ON(): like the debug MT_BUG_ON() but takes a
 * maple state, dumping both the state and its tree on failure.  It does not
 * call BUG_ON().
 * NOTE(review): @__x is evaluated twice when true, so it should be free of
 * side effects.
 */
#define MAS_BUG_ON(__mas, __x) do {                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__x) {                                                        \
                pr_info("BUG at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_dump(__mas);                                        \
                mt_dump((__mas)->tree, mt_dump_hex);                        \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
} while (0)

/*
 * Debug variant of MAS_WR_BUG_ON(): takes a write state and on failure dumps
 * the write state, its maple state and the tree.  It does not call BUG_ON().
 * NOTE(review): @__x is evaluated twice when true, so it should be free of
 * side effects.
 */
#define MAS_WR_BUG_ON(__wrmas, __x) do {                                \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__x) {                                                        \
                pr_info("BUG at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_wr_dump(__wrmas);                                        \
                mas_dump((__wrmas)->mas);                                \
                mt_dump((__wrmas)->mas->tree, mt_dump_hex);                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
} while (0)

/*
 * Debug variant of MT_WARN_ON(): counts the check as run, and when @__x is
 * true logs the location, dumps @__tree and the counters and prints a stack
 * trace; otherwise counts the check as passed.  Evaluates to 1 when @__x is
 * true and 0 otherwise.
 *
 * The local holding the result has a name unlikely to clash with the caller's
 * variables: a plain "ret" would shadow a caller's "ret" used inside @__x
 * and initialise itself from its own indeterminate value.
 * NOTE(review): @__x is still evaluated a second time for the log message.
 */
#define MT_WARN_ON(__tree, __x)  ({                                        \
        int __mt_warn_ret = !!(__x);                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__mt_warn_ret) {                                                \
                pr_info("WARN at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mt_dump(__tree, mt_dump_hex);                                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
        unlikely(__mt_warn_ret);                                        \
})

/*
 * Debug variant of MAS_WARN_ON(): counts the check as run, and when @__x is
 * true logs the location, dumps the maple state, its tree and the counters
 * and prints a stack trace; otherwise counts the check as passed.  Evaluates
 * to 1 when @__x is true and 0 otherwise.
 *
 * The local holding the result has a name unlikely to clash with the caller's
 * variables: a plain "ret" would shadow a caller's "ret" used inside @__x
 * and initialise itself from its own indeterminate value.
 * NOTE(review): @__x is still evaluated a second time for the log message.
 */
#define MAS_WARN_ON(__mas, __x) ({                                        \
        int __mas_warn_ret = !!(__x);                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__mas_warn_ret) {                                                \
                pr_info("WARN at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_dump(__mas);                                        \
                mt_dump((__mas)->tree, mt_dump_hex);                        \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
        unlikely(__mas_warn_ret);                                        \
})

/*
 * Debug variant of MAS_WR_WARN_ON(): counts the check as run, and when @__x
 * is true logs the location, dumps the write state, its maple state, the tree
 * and the counters and prints a stack trace; otherwise counts the check as
 * passed.  Evaluates to 1 when @__x is true and 0 otherwise.
 *
 * The local holding the result has a name unlikely to clash with the caller's
 * variables: a plain "ret" would shadow a caller's "ret" used inside @__x
 * and initialise itself from its own indeterminate value.
 * NOTE(review): @__x is still evaluated a second time for the log message.
 */
#define MAS_WR_WARN_ON(__wrmas, __x) ({                                        \
        int __wr_warn_ret = !!(__x);                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__wr_warn_ret) {                                                \
                pr_info("WARN at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_wr_dump(__wrmas);                                        \
                mas_dump((__wrmas)->mas);                                \
                mt_dump((__wrmas)->mas->tree, mt_dump_hex);                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
        unlikely(__wr_warn_ret);                                        \
})
#else
/*
 * Without CONFIG_DEBUG_MAPLE_TREE the checks map straight onto BUG_ON() and
 * WARN_ON(); the tree / state argument is not used.
 */
#define MT_BUG_ON(__tree, __x)                BUG_ON(__x)
#define MAS_BUG_ON(__mas, __x)                BUG_ON(__x)
#define MAS_WR_BUG_ON(__mas, __x)        BUG_ON(__x)
#define MT_WARN_ON(__tree, __x)                WARN_ON(__x)
#define MAS_WARN_ON(__mas, __x)                WARN_ON(__x)
#define MAS_WR_WARN_ON(__mas, __x)        WARN_ON(__x)
#endif /* CONFIG_DEBUG_MAPLE_TREE */

/**
 * __mas_set_range() - Narrow the Maple Tree operation state to a sub-range of
 * the current location.
 * @mas: Maple Tree operation state.
 * @start: New start of range in the Maple Tree.
 * @last: New end of range in the Maple Tree.
 *
 * Only the index and last values of the state are rewritten; the state is not
 * reset.  Please use mas_set_range() if you do not know where you are in the
 * tree.
 */
static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
                unsigned long last)
{
        /* An active state must already cover the new start */
        MAS_WARN_ON(mas, mas_is_active(mas) &&
                   !(mas->index <= start && start <= mas->last));
        mas->last = last;
        mas->index = start;
}

/**
 * mas_set_range() - Set up Maple Tree operation state for a different index.
 * @mas: Maple Tree operation state.
 * @start: New start of range in the Maple Tree.
 * @last: New end of range in the Maple Tree.
 *
 * Move the operation state to refer to a different range.  This will
 * have the effect of starting a walk from the top; see mas_next()
 * to move to an adjacent index.
 *
 * The state is reset with mas_reset() before the new range is stored via
 * __mas_set_range().
 */
static inline
void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last)
{
        /* Reset first, then store the new range. */
        mas_reset(mas);
        __mas_set_range(mas, start, last);
}

/**
 * mas_set() - Set up Maple Tree operation state for a different index.
 * @mas: Maple Tree operation state.
 * @index: New index into the Maple Tree.
 *
 * Move the operation state to refer to a different index.  This will
 * have the effect of starting a walk from the top; see mas_next()
 * to move to an adjacent index.
 *
 * Equivalent to mas_set_range() with @index as both start and end.
 */
static inline void mas_set(struct ma_state *mas, unsigned long index)
{

        /* A single index is the degenerate range [index, index]. */
        mas_set_range(mas, index, index);
}

/*
 * Report whether the lock-type bits of @mt->ma_flags select
 * MT_FLAGS_LOCK_EXTERN.
 */
static inline bool mt_external_lock(const struct maple_tree *mt)
{
        unsigned int lock_bits = mt->ma_flags & MT_FLAGS_LOCK_MASK;

        return lock_bits == MT_FLAGS_LOCK_EXTERN;
}

/**
 * mt_init_flags() - Initialise an empty maple tree with flags.
 * @mt: Maple Tree
 * @flags: maple tree flags.
 *
 * If you need to initialise a Maple Tree with special flags (eg, an
 * allocation tree), use this function.
 *
 * The embedded spinlock is only initialised when @flags do not select an
 * external lock; the root pointer is always cleared.
 *
 * Context: Any context.
 */
static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags)
{
        /* Flags must be stored first: mt_external_lock() reads them back. */
        mt->ma_flags = flags;
        if (!mt_external_lock(mt))
                spin_lock_init(&mt->ma_lock);
        /* An empty tree is represented by a NULL root. */
        rcu_assign_pointer(mt->ma_root, NULL);
}

/**
 * mt_init() - Initialise an empty maple tree.
 * @mt: Maple Tree
 *
 * An empty Maple Tree.  Equivalent to mt_init_flags() with no flags set.
 *
 * Context: Any context.
 */
static inline void mt_init(struct maple_tree *mt)
{
        /* No flags requested. */
        mt_init_flags(mt, 0);
}

/*
 * Report whether @mt has MT_FLAGS_USE_RCU set.  Always false when the
 * kernel is built with CONFIG_MAPLE_RCU_DISABLED.
 */
static inline bool mt_in_rcu(struct maple_tree *mt)
{
#ifdef CONFIG_MAPLE_RCU_DISABLED
        return false;
#else
        return (mt->ma_flags & MT_FLAGS_USE_RCU) != 0;
#endif
}

/**
 * mt_clear_in_rcu() - Switch the tree to non-RCU mode.
 * @mt: The Maple Tree
 *
 * Does nothing if the tree is not in RCU mode.  With an internal lock the
 * flag is cleared under mtree_lock(); with an external lock a warning is
 * raised if mt_lock_is_held() reports that the lock is not held.
 */
static inline void mt_clear_in_rcu(struct maple_tree *mt)
{
        if (!mt_in_rcu(mt))
                return;

        if (!mt_external_lock(mt)) {
                /* Internal lock: take it around the flag update. */
                mtree_lock(mt);
                mt->ma_flags &= ~MT_FLAGS_USE_RCU;
                mtree_unlock(mt);
                return;
        }

        /* External lock: the caller is expected to hold it already. */
        WARN_ON(!mt_lock_is_held(mt));
        mt->ma_flags &= ~MT_FLAGS_USE_RCU;
}

/**
 * mt_set_in_rcu() - Switch the tree to RCU safe mode.
 * @mt: The Maple Tree
 *
 * Does nothing if the tree is already in RCU mode.  With an internal lock
 * the flag is set under mtree_lock(); with an external lock a warning is
 * raised if mt_lock_is_held() reports that the lock is not held.
 */
static inline void mt_set_in_rcu(struct maple_tree *mt)
{
        if (mt_in_rcu(mt))
                return;

        if (!mt_external_lock(mt)) {
                /* Internal lock: take it around the flag update. */
                mtree_lock(mt);
                mt->ma_flags |= MT_FLAGS_USE_RCU;
                mtree_unlock(mt);
                return;
        }

        /* External lock: the caller is expected to hold it already. */
        WARN_ON(!mt_lock_is_held(mt));
        mt->ma_flags |= MT_FLAGS_USE_RCU;
}

/* Extract the height field stored in the flag word of @mt. */
static inline unsigned int mt_height(const struct maple_tree *mt)
{
        unsigned int height_bits = mt->ma_flags & MT_FLAGS_HEIGHT_MASK;

        return height_bits >> MT_FLAGS_HEIGHT_OFFSET;
}

void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max);
void *mt_find_after(struct maple_tree *mt, unsigned long *index,
                    unsigned long max);
void *mt_prev(struct maple_tree *mt, unsigned long index,  unsigned long min);
void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);

/**
 * mt_for_each - Iterate over each entry starting at index until max.
 * @__tree: The Maple Tree
 * @__entry: The current entry
 * @__index: The index to start the search from. Subsequently used as iterator.
 * @__max: The maximum limit for @__index
 *
 * The first entry comes from mt_find() and every following one from
 * mt_find_after(); both are handed the address of @__index, so it must be
 * an lvalue.  The loop ends as soon as a lookup returns NULL.
 *
 * This iterator skips all entries which resolve to a NULL pointer,
 * e.g. entries which have been reserved with XA_ZERO_ENTRY.
 */
#define mt_for_each(__tree, __entry, __index, __max) \
        for (__entry = mt_find(__tree, &(__index), __max); \
                __entry; __entry = mt_find_after(__tree, &(__index), __max))

#endif /*_LINUX_MAPLE_TREE_H */





















































































































    1 






    1 









































































































































































    1 




    1 




















   50 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
 *
 * Based on the original implementation which is:
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Parts of the original code have been moved to arch/x86/vdso/vma.c
 *
 * This file implements vsyscall emulation.  vsyscalls are a legacy ABI:
 * Userspace can request certain kernel services by calling fixed
 * addresses.  This concept is problematic:
 *
 * - It interferes with ASLR.
 * - It's awkward to write code that lives in kernel addresses but is
 *   callable by userspace at fixed addresses.
 * - The whole concept is impossible for 32-bit compat userspace.
 * - UML cannot easily virtualize a vsyscall.
 *
 * As of mid-2014, I believe that there is no new userspace code that
 * will use a vsyscall if the vDSO is present.  I hope that there will
 * soon be no new userspace code that will ever use a vsyscall.
 *
 * The code in this file emulates vsyscalls when notified of a page
 * fault to a vsyscall address.
 */

#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/sched/signal.h>
#include <linux/mm_types.h>
#include <linux/syscalls.h>
#include <linux/ratelimit.h>

#include <asm/vsyscall.h>
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/traps.h>
#include <asm/paravirt.h>

#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"

/*
 * Current vsyscall handling mode.  The build-time default is NONE or XONLY
 * depending on the CONFIG_LEGACY_VSYSCALL_* option; a build with neither
 * option set is rejected.  EMULATE can only be selected at boot through the
 * "vsyscall=" parameter (see vsyscall_setup()).
 */
static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
        NONE;
#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
        XONLY;
#else
        #error VSYSCALL config is broken
#endif

/*
 * Parse the "vsyscall=" early parameter.  Accepted values are "emulate",
 * "xonly" and "none"; a missing or unrecognised value yields -EINVAL and
 * leaves vsyscall_mode untouched.
 */
static int __init vsyscall_setup(char *str)
{
        if (!str)
                return -EINVAL;

        if (strcmp(str, "emulate") == 0)
                vsyscall_mode = EMULATE;
        else if (strcmp(str, "xonly") == 0)
                vsyscall_mode = XONLY;
        else if (strcmp(str, "none") == 0)
                vsyscall_mode = NONE;
        else
                return -EINVAL;

        return 0;
}
early_param("vsyscall", vsyscall_setup);

/*
 * Emit a rate-limited log line describing a rejected vsyscall: the current
 * task's name and pid, @message, and a few registers taken from @regs.
 * Nothing is printed unless show_unhandled_signals is set.
 */
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
                              const char *message)
{
        if (show_unhandled_signals) {
                printk_ratelimited("%s%s[%d] %s ip:%lx cs:%x sp:%lx ax:%lx si:%lx di:%lx\n",
                                   level, current->comm, task_pid_nr(current),
                                   message, regs->ip, regs->cs,
                                   regs->sp, regs->ax, regs->si, regs->di);
        }
}

/*
 * Translate a faulting address into a vsyscall slot number.
 *
 * Only bits 10 and 11 may differ from VSYSCALL_ADDR, i.e. valid entry points
 * are spaced 1024 bytes apart.  Slots 0, 1 and 2 are accepted; anything else
 * returns -EINVAL.
 */
static int addr_to_vsyscall_nr(unsigned long addr)
{
        const unsigned long slot_mask = 0xC00UL;
        unsigned long slot;

        if ((addr & ~slot_mask) != VSYSCALL_ADDR)
                return -EINVAL;

        slot = (addr & slot_mask) >> 10;

        return slot < 3 ? (int)slot : -EINVAL;
}

/*
 * Check that [@ptr, @ptr + @size) passes access_ok().
 *
 * On failure the current thread's fault bookkeeping is filled in as for a
 * user-mode write page fault at @ptr, SIGSEGV (SEGV_MAPERR) is forced on the
 * task, and false is returned.  Returns true when the range is acceptable.
 */
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
        struct thread_struct *thread;

        if (access_ok((void __user *)ptr, size))
                return true;

        thread = &current->thread;
        thread->error_code        = X86_PF_USER | X86_PF_WRITE;
        thread->cr2                = ptr;
        thread->trap_nr                = X86_TRAP_PF;

        force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr);
        return false;
}

/*
 * emulate_vsyscall() - Handle a page fault on a vsyscall address.
 * @error_code: page-fault error code (X86_PF_* bits).
 * @regs: register state of the faulting task; updated on emulation.
 * @address: faulting address.
 *
 * Returns false when the fault is not handled here (write or kernel-mode
 * faults, data reads of the page, or vsyscall_mode == NONE).  Returns true
 * when the fault was consumed: either the call was emulated and @regs now
 * describe the state after a "ret", or a signal was forced on the task.
 */
bool emulate_vsyscall(unsigned long error_code,
                      struct pt_regs *regs, unsigned long address)
{
        unsigned long caller;
        int vsyscall_nr, syscall_nr, tmp;
        long ret;
        unsigned long orig_dx;

        /* Write faults or kernel-privilege faults never get fixed up. */
        if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
                return false;

        if (!(error_code & X86_PF_INSTR)) {
                /* Failed vsyscall read */
                if (vsyscall_mode == EMULATE)
                        return false;

                /*
                 * User code tried and failed to read the vsyscall page.
                 */
                warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
                return false;
        }

        /*
         * No point in checking CS -- the only way to get here is a user mode
         * trap to a high address, which means that we're in 64-bit user code.
         */

        WARN_ON_ONCE(address != regs->ip);

        if (vsyscall_mode == NONE) {
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall attempted with vsyscall=none");
                return false;
        }

        /* Negative result means the address is not a valid entry point. */
        vsyscall_nr = addr_to_vsyscall_nr(address);

        trace_emulate_vsyscall(vsyscall_nr);

        if (vsyscall_nr < 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
                                  "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
                goto sigsegv;
        }

        /* Fetch the return address the emulated "ret" will jump to. */
        if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
                warn_bad_vsyscall(KERN_WARNING, regs,
                                  "vsyscall with bad stack (exploit attempt?)");
                goto sigsegv;
        }

        /*
         * Check for access_ok violations and find the syscall nr.
         *
         * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
         * 64-bit, so we don't need to special-case it here.  For all the
         * vsyscalls, NULL means "don't write anything" not "write it at
         * address 0".
         */
        switch (vsyscall_nr) {
        case 0:
                if (!write_ok_or_segv(regs->di, sizeof(struct __kernel_old_timeval)) ||
                    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
                        ret = -EFAULT;
                        goto check_fault;
                }

                syscall_nr = __NR_gettimeofday;
                break;

        case 1:
                if (!write_ok_or_segv(regs->di, sizeof(__kernel_old_time_t))) {
                        ret = -EFAULT;
                        goto check_fault;
                }

                syscall_nr = __NR_time;
                break;

        case 2:
                if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
                    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
                        ret = -EFAULT;
                        goto check_fault;
                }

                syscall_nr = __NR_getcpu;
                break;
        }

        /*
         * Handle seccomp.  regs->ip must be the original value.
         * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
         *
         * We could optimize the seccomp disabled case, but performance
         * here doesn't matter.
         */
        regs->orig_ax = syscall_nr;
        regs->ax = -ENOSYS;
        tmp = secure_computing();
        if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
                warn_bad_vsyscall(KERN_DEBUG, regs,
                                  "seccomp tried to change syscall nr or ip");
                force_exit_sig(SIGSYS);
                return true;
        }
        regs->orig_ax = -1;
        /*
         * On a skip, regs->ax is not overwritten below; it keeps whatever
         * value it holds at this point.
         */
        if (tmp)
                goto do_ret;  /* skip requested */

        /*
         * With a real vsyscall, page faults cause SIGSEGV.
         */
        ret = -EFAULT;
        switch (vsyscall_nr) {
        case 0:
                /* this decodes regs->di and regs->si on its own */
                ret = __x64_sys_gettimeofday(regs);
                break;

        case 1:
                /* this decodes regs->di on its own */
                ret = __x64_sys_time(regs);
                break;

        case 2:
                /* while we could clobber regs->dx, we didn't in the past... */
                orig_dx = regs->dx;
                regs->dx = 0;
                /* this decodes regs->di, regs->si and regs->dx on its own */
                ret = __x64_sys_getcpu(regs);
                regs->dx = orig_dx;
                break;
        }

check_fault:
        /* Also reached directly when a write_ok_or_segv() check failed. */
        if (ret == -EFAULT) {
                /* Bad news -- userspace fed a bad pointer to a vsyscall. */
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall fault (exploit attempt?)");
                goto sigsegv;
        }

        regs->ax = ret;

do_ret:
        /* Emulate a ret instruction. */
        regs->ip = caller;
        regs->sp += 8;
        return true;

sigsegv:
        /* The fault is reported as handled; the task receives SIGSEGV. */
        force_sig(SIGSEGV);
        return true;
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static const char *gate_vma_name(struct vm_area_struct *vma)
{
        return "[vsyscall]";
}
/* Operations table for the gate VMA; only the name callback is provided. */
static const struct vm_operations_struct gate_vma_ops = {
        .name = gate_vma_name,
};
/*
 * The pseudo VMA itself: one page at VSYSCALL_ADDR, readable and executable
 * by default.  map_vsyscall() reduces the flags to VM_EXEC in XONLY mode.
 */
static struct vm_area_struct gate_vma __ro_after_init = {
        .vm_start        = VSYSCALL_ADDR,
        .vm_end                = VSYSCALL_ADDR + PAGE_SIZE,
        .vm_page_prot        = PAGE_READONLY_EXEC,
        .vm_flags        = VM_READ | VM_EXEC,
        .vm_ops                = &gate_vma_ops,
};

/*
 * Return the vsyscall pseudo VMA for @mm, or NULL when there is none.
 *
 * With CONFIG_COMPAT, a NULL @mm or an mm without MM_CONTEXT_HAS_VSYSCALL
 * set in its context flags has no gate VMA.  Independently of that, no gate
 * VMA exists when vsyscall_mode is NONE.
 */
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_COMPAT
        if (!(mm && test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags)))
                return NULL;
#endif
        return vsyscall_mode == NONE ? NULL : &gate_vma;
}

/*
 * Return 1 if @addr lies inside the gate VMA of @mm (start inclusive, end
 * exclusive), 0 otherwise or when @mm has no gate VMA.
 */
int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *gate = get_gate_vma(mm);

        if (gate && addr >= gate->vm_start && addr < gate->vm_end)
                return 1;

        return 0;
}

/*
 * Use this when you have no reliable mm, typically from interrupt
 * context. It is less reliable than using a task's mm and may give
 * false positives.
 */
int in_gate_area_no_mm(unsigned long addr)
{
        /* With vsyscalls disabled there is no gate area at all. */
        if (vsyscall_mode == NONE)
                return 0;

        /* Otherwise match on the page containing @addr. */
        return (addr & PAGE_MASK) == VSYSCALL_ADDR;
}

/*
 * The VSYSCALL page is the only user-accessible page in the kernel address
 * range.  Normally, the kernel page tables can have _PAGE_USER clear, but
 * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
 * are enabled.
 *
 * Some day we may create a "minimal" vsyscall mode in which we emulate
 * vsyscalls but leave the page not present.  If so, we skip calling
 * this.
 */
/*
 * Walk the page-table hierarchy rooted at @root down to the PMD entry that
 * covers VSYSCALL_ADDR, OR-ing _PAGE_USER into each entry on the way.
 */
void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
        set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
        p4d = p4d_offset(pgd, VSYSCALL_ADDR);
        /*
         * The P4D entry is only rewritten when 5-level paging is compiled in.
         * NOTE(review): presumably the p4d level is folded into the pgd
         * otherwise, making a second write redundant -- confirm.
         */
#if CONFIG_PGTABLE_LEVELS >= 5
        set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
#endif
        pud = pud_offset(p4d, VSYSCALL_ADDR);
        set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
        pmd = pmd_offset(pud, VSYSCALL_ADDR);
        set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
}

/*
 * Boot-time setup of the vsyscall page according to vsyscall_mode.
 *
 * EMULATE installs the real page in the fixmap and marks the covering
 * page-table entries user-accessible; XONLY only narrows the gate VMA flags
 * to VM_EXEC; NONE does nothing.
 */
void __init map_vsyscall(void)
{
        extern char __vsyscall_page;
        unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);

        /* The fixmap slot must land exactly on the ABI-defined address. */
        BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
                     (unsigned long)VSYSCALL_ADDR);

        switch (vsyscall_mode) {
        case EMULATE:
                /*
                 * For full emulation, the page needs to exist for real.
                 */
                __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
                             PAGE_KERNEL_VVAR);
                set_vsyscall_pgtable_user_bits(swapper_pg_dir);
                break;
        case XONLY:
                /*
                 * In execute-only mode, there is no PTE at all backing the
                 * vsyscall page.
                 */
                vm_flags_init(&gate_vma, VM_EXEC);
                break;
        case NONE:
                break;
        }
}



































































    3 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * x86 KFENCE support.
 *
 * Copyright (C) 2020, Google LLC.
 */

#ifndef _ASM_X86_KFENCE_H
#define _ASM_X86_KFENCE_H

#ifndef MODULE

#include <linux/bug.h>
#include <linux/kfence.h>

#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/tlbflush.h>

/*
 * Force 4K pages for __kfence_pool.
 *
 * Steps through the pool one page at a time for as long as
 * is_kfence_address() accepts the address, splitting any mapping that is not
 * already at PG_LEVEL_4K.  Returns false if lookup_address() finds no entry
 * for a page, true once the whole pool has been visited.
 */
static inline bool arch_kfence_init_pool(void)
{
        unsigned long addr = (unsigned long)__kfence_pool;

        while (is_kfence_address((void *)addr)) {
                unsigned int level;

                if (!lookup_address(addr, &level))
                        return false;

                if (level != PG_LEVEL_4K)
                        set_memory_4k(addr, 1);

                addr += PAGE_SIZE;
        }

        return true;
}

/*
 * Protect or unprotect the page at @addr and flush the local TLB entry.
 *
 * @protect true clears _PAGE_PRESENT in the PTE, false sets it.  Returns
 * false (with a warning) if no PTE is found or the mapping is not 4K.
 */
static inline bool kfence_protect_page(unsigned long addr, bool protect)
{
        unsigned int level;
        pte_t *pte = lookup_address(addr, &level);

        if (WARN_ON(!pte || level != PG_LEVEL_4K))
                return false;

        /*
         * We need to avoid IPIs, as we may get KFENCE allocations or faults
         * with interrupts disabled. Therefore, the below is best-effort, and
         * does not flush TLBs on all CPUs. We can tolerate some inaccuracy;
         * lazy fault handling takes care of faults after the page is PRESENT.
         */
        set_pte(pte, __pte(protect ? (pte_val(*pte) & ~_PAGE_PRESENT)
                                   : (pte_val(*pte) | _PAGE_PRESENT)));

        /*
         * Flush this CPU's TLB, assuming whoever did the allocation/free is
         * likely to continue running on this CPU.
         */
        preempt_disable();
        flush_tlb_one_kernel(addr);
        preempt_enable();

        return true;
}

#endif /* !MODULE */

#endif /* _ASM_X86_KFENCE_H */


















































































































    3 
    3 
















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* fs/ internal definitions
 *
 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

struct super_block;
struct file_system_type;
struct iomap;
struct iomap_ops;
struct linux_binprm;
struct path;
struct mount;
struct shrink_control;
struct fs_context;
struct pipe_inode_info;
struct iov_iter;
struct mnt_idmap;

/*
 * block/bdev.c
 */
/*
 * bdev_cache_init() is only declared when CONFIG_BLOCK is enabled; otherwise
 * an empty inline stub lets callers invoke it unconditionally.
 */
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
#else
static inline void bdev_cache_init(void)
{
}
#endif /* CONFIG_BLOCK */

/*
 * buffer.c
 */
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
                get_block_t *get_block, const struct iomap *iomap);

/*
 * char_dev.c
 */
extern void __init chrdev_init(void);

/*
 * fs_context.c
 */
extern const struct fs_context_operations legacy_fs_context_ops;
extern int parse_monolithic_mount_data(struct fs_context *, void *);
extern void vfs_clean_context(struct fs_context *fc);
extern int finish_clean_context(struct fs_context *fc);

/*
 * namei.c
 */
extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
                           struct path *path, struct path *root);
int do_rmdir(int dfd, struct filename *name);
int do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct mnt_idmap *idmap, const struct path *link);
int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
                 struct filename *newname, unsigned int flags);
int do_mkdirat(int dfd, struct filename *name, umode_t mode);
int do_symlinkat(struct filename *from, int newdfd, struct filename *to);
int do_linkat(int olddfd, struct filename *old, int newdfd,
                        struct filename *new, int flags);
int vfs_tmpfile(struct mnt_idmap *idmap,
                const struct path *parentpath,
                struct file *file, umode_t mode);

/*
 * namespace.c
 */
extern struct vfsmount *lookup_mnt(const struct path *);
extern int finish_automount(struct vfsmount *, const struct path *);

extern int sb_prepare_remount_readonly(struct super_block *);

extern void __init mnt_init(void);

int mnt_get_write_access_file(struct file *file);
void mnt_put_write_access_file(struct file *file);

extern void dissolve_on_fput(struct vfsmount *);
extern bool may_mount(void);

int path_mount(const char *dev_name, struct path *path,
                const char *type_page, unsigned long flags, void *data_page);
int path_umount(struct path *path, int flags);

int show_path(struct seq_file *m, struct dentry *root);

/*
 * fs_struct.c
 */
extern void chroot_fs_refs(const struct path *, const struct path *);

/*
 * file_table.c
 */
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);

/*
 * Release the write access @file holds: on its inode, on the mount of its
 * path, and -- for files with FMODE_BACKING set -- on the mount of the path
 * returned by backing_file_user_path().
 */
static inline void file_put_write_access(struct file *file)
{
        put_write_access(file->f_inode);
        mnt_put_write_access(file->f_path.mnt);
        /* Backing files are flagged as the uncommon case. */
        if (unlikely(file->f_mode & FMODE_BACKING))
                mnt_put_write_access(backing_file_user_path(file)->mnt);
}

/*
 * Release the access accounting held by @file.
 *
 * A file whose mode has FMODE_READ but not FMODE_WRITE drops the inode read
 * count; otherwise, a file with FMODE_WRITER set gives up its write access.
 * Any other mode combination needs no action.
 */
static inline void put_file_access(struct file *file)
{
        if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
                i_readcount_dec(file->f_inode);
                return;
        }

        if (file->f_mode & FMODE_WRITER)
                file_put_write_access(file);
}

/*
 * super.c
 */
extern int reconfigure_super(struct fs_context *);
extern bool super_trylock_shared(struct super_block *sb);
struct super_block *user_get_super(dev_t, bool excl);
void put_super(struct super_block *sb);
extern bool mount_capable(struct fs_context *);
int sb_init_dio_done_wq(struct super_block *sb);

/*
 * Prepare superblock for changing its read-only state (i.e., either remount
 * read-write superblock read-only or vice versa). After this function returns
 * mnt_is_readonly() will return true for any mount of the superblock if its
 * caller is able to observe any changes done by the remount. This holds until
 * sb_end_ro_state_change() is called.
 */
static inline void sb_start_ro_state_change(struct super_block *sb)
{
        /* Publish "remount in progress" before any state change follows. */
        WRITE_ONCE(sb->s_readonly_remount, 1);
        /*
         * For RO->RW transition, the barrier pairs with the barrier in
         * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
         * cleared, it will see s_readonly_remount set.
         * For RW->RO transition, the barrier pairs with the barrier in
         * mnt_get_write_access() before the mnt_is_readonly() check.
         * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD
         * already cleared, it will see s_readonly_remount set.
         */
        smp_wmb();
}

/*
 * Ends section changing read-only state of the superblock. After this function
 * returns if mnt_is_readonly() returns false, the caller will be able to
 * observe all the changes remount did to the superblock.
 */
static inline void sb_end_ro_state_change(struct super_block *sb)
{
        /*
         * This barrier provides release semantics that pair with
         * the smp_rmb() acquire semantics in mnt_is_readonly().
         * This barrier pair ensures that when mnt_is_readonly() sees
         * 0 for sb->s_readonly_remount, it will also see all the
         * preceding flag changes that were made during the RO state
         * change.
         */
        smp_wmb();
        /* Clear the marker only after the barrier above. */
        WRITE_ONCE(sb->s_readonly_remount, 0);
}

/*
 * open.c
 */
/*
 * Parameter bundle handed to do_filp_open() and do_file_open_root().
 * NOTE(review): presumably filled in by build_open_flags() from a
 * struct open_how -- confirm against fs/open.c.
 */
struct open_flags {
        int open_flag;
        umode_t mode;
        int acc_mode;
        int intent;
        int lookup_flags;
};
extern struct file *do_filp_open(int dfd, struct filename *pathname,
                const struct open_flags *op);
extern struct file *do_file_open_root(const struct path *,
                const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);

long do_ftruncate(struct file *file, loff_t length, int small);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
                int flag);
int chown_common(const struct path *path, uid_t user, gid_t group);
extern int vfs_open(const struct path *, struct file *);

/*
 * inode.c
 */
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry);
bool in_group_or_capable(struct mnt_idmap *idmap,
                         const struct inode *inode, vfsgid_t vfsgid);

/*
 * fs-writeback.c
 */
extern long get_nr_dirty_inodes(void);
void invalidate_inodes(struct super_block *sb);

/*
 * dcache.c
 */
extern int d_set_mounted(struct dentry *dentry);
extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
extern struct dentry *d_alloc_cursor(struct dentry *);
extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
extern char *simple_dname(struct dentry *, char *, int);
extern void dput_to_list(struct dentry *, struct list_head *);
extern void shrink_dentry_list(struct list_head *);
extern void shrink_dcache_for_umount(struct super_block *);
extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
                                const struct qstr *name, unsigned *seq);
extern void d_genocide(struct dentry *);

/*
 * pipe.c
 */
extern const struct file_operations pipefifo_fops;

/*
 * fs_pin.c
 */
extern void group_pin_kill(struct hlist_head *p);
extern void mnt_pin_kill(struct mount *m);

/*
 * fs/nsfs.c
 */
extern const struct dentry_operations ns_dentry_operations;

/*
 * fs/stat.c:
 */

int getname_statx_lookup_flags(int flags);
int do_statx(int dfd, struct filename *filename, unsigned int flags,
             unsigned int mask, struct statx __user *buffer);

/*
 * fs/splice.c:
 */
ssize_t splice_file_to_pipe(struct file *in,
                            struct pipe_inode_info *opipe,
                            loff_t *offset,
                            size_t len, unsigned int flags);

/*
 * fs/xattr.c:
 */
/* Fixed-size buffer for an attribute name: XATTR_NAME_MAX bytes plus one. */
struct xattr_name {
        /* NOTE(review): the extra byte is presumably for a NUL terminator. */
        char name[XATTR_NAME_MAX + 1];
};

/*
 * Context passed to do_getxattr(), setxattr_copy() and do_setxattr().
 */
struct xattr_ctx {
        /* Value of attribute */
        /* Userspace pointer; const view and writable view share storage. */
        union {
                const void __user *cvalue;
                void __user *value;
        };
        /* NOTE(review): presumably a kernel-side copy of the value -- confirm. */
        void *kvalue;
        size_t size;
        /* Attribute name */
        struct xattr_name *kname;
        unsigned int flags;
};


ssize_t do_getxattr(struct mnt_idmap *idmap,
                    struct dentry *d,
                    struct xattr_ctx *ctx);

int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
                struct xattr_ctx *ctx);
int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode);

/*
 * ACL helpers.  Without CONFIG_FS_POSIX_ACL both are replaced by inline
 * stubs that always fail with -EOPNOTSUPP.
 */
#ifdef CONFIG_FS_POSIX_ACL
int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
               const char *acl_name, const void *kvalue, size_t size);
ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                   const char *acl_name, void *kvalue, size_t size);
#else
/* Stub: POSIX ACL support is compiled out. */
static inline int do_set_acl(struct mnt_idmap *idmap,
                             struct dentry *dentry, const char *acl_name,
                             const void *kvalue, size_t size)
{
        return -EOPNOTSUPP;
}
/* Stub: POSIX ACL support is compiled out. */
static inline ssize_t do_get_acl(struct mnt_idmap *idmap,
                                 struct dentry *dentry, const char *acl_name,
                                 void *kvalue, size_t size)
{
        return -EOPNOTSUPP;
}
#endif

ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);

/*
 * fs/attr.c
 */
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
void mnt_idmap_put(struct mnt_idmap *idmap);
/*
 * Callback table with two hooks, both of which receive an opaque data pointer.
 * NOTE(review): presumably used together with path_from_stashed() - confirm.
 */
struct stashed_operations {
        /* takes only the data pointer; by its name it releases it - confirm */
        void (*put_data)(void *data);
        /* takes an inode plus the data pointer and returns an int status */
        int (*init_inode)(struct inode *inode, void *data);
};
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
                      struct path *path);
void stashed_dentry_prune(struct dentry *dentry);

















































































































    1 

    1 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    1 








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/exec.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * #!-checking implemented by tytso.
 */
/*
 * Demand-loading implemented 01.12.91 - no need to read anything but
 * the header into memory. The inode of the executable is put into
 * "current->executable", and page faults do the actual loading. Clean.
 *
 * Once more I can proudly say that linux stood up to being changed: it
 * was less than 2 hours work to get demand-loading completely implemented.
 *
 * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
 * current->executable is only used by the procfs.  This allows a dispatch
 * table to check for several different types  of binary formats.  We keep
 * trying until we recognize the file or we run out of supported binary
 * formats.
 */

#include <linux/kernel_read_file.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/signal.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/coredump.h>
#include <linux/time_namespace.h>
#include <linux/user_events.h>
#include <linux/rseq.h>
#include <linux/ksm.h>

#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>

#include <trace/events/task.h>
#include "internal.h"

#include <trace/events/sched.h>

static int bprm_creds_from_file(struct linux_binprm *bprm);

int suid_dumpable = 0;

static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);

/*
 * Add a binary format handler to the global list of formats.
 *
 * A non-zero @insert places @fmt at the head of the list, zero appends it
 * to the tail.  The list is modified with binfmt_lock held for writing.
 */
void __register_binfmt(struct linux_binfmt *fmt, int insert)
{
        write_lock(&binfmt_lock);
        if (insert)
                list_add(&fmt->lh, &formats);
        else
                list_add_tail(&fmt->lh, &formats);
        write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(__register_binfmt);

/*
 * Remove a binary format handler from the list of formats.  The entry is
 * unlinked with binfmt_lock held for writing.
 */
void unregister_binfmt(struct linux_binfmt * fmt)
{
        write_lock(&binfmt_lock);
        list_del(&fmt->lh);
        write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(unregister_binfmt);

/* Drop a reference on the module that provides binary format @fmt. */
static inline void put_binfmt(struct linux_binfmt * fmt)
{
        module_put(fmt->module);
}

bool path_noexec(const struct path *path)
{
        return (path->mnt->mnt_flags & MNT_NOEXEC) ||
               (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
}

#ifdef CONFIG_USELIB
/*
 * Note that a shared library must be both readable and executable due to
 * security reasons.
 *
 * Also note that we take the address to load from the file itself.
 */
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
        struct linux_binfmt *fmt;
        struct file *file;
        struct filename *tmp = getname(library);
        /* Pre-load the result; it is only returned as-is if getname() failed. */
        int error = PTR_ERR(tmp);
        /* Open read-only, asking for both read and exec permission. */
        static const struct open_flags uselib_flags = {
                .open_flag = O_LARGEFILE | O_RDONLY,
                .acc_mode = MAY_READ | MAY_EXEC,
                .intent = LOOKUP_OPEN,
                .lookup_flags = LOOKUP_FOLLOW,
        };

        if (IS_ERR(tmp))
                goto out;

        file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
        /* The name is released whether or not the open succeeded. */
        putname(tmp);
        error = PTR_ERR(file);
        if (IS_ERR(file))
                goto out;

        /*
         * may_open() has already checked for this, so it should be
         * impossible to trip now. But we need to be extra cautious
         * and check again at the very end too.
         */
        error = -EACCES;
        if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
                         path_noexec(&file->f_path)))
                goto exit;

        /* Result if no registered format accepts the file. */
        error = -ENOEXEC;

        read_lock(&binfmt_lock);
        list_for_each_entry(fmt, &formats, lh) {
                /* Skip formats without a shared-library loader. */
                if (!fmt->load_shlib)
                        continue;
                /* Skip formats whose module reference cannot be taken. */
                if (!try_module_get(fmt->module))
                        continue;
                /*
                 * The loader runs with binfmt_lock dropped; the module
                 * reference taken above is held across the call and is
                 * released only after the lock has been re-taken.
                 */
                read_unlock(&binfmt_lock);
                error = fmt->load_shlib(file);
                read_lock(&binfmt_lock);
                put_binfmt(fmt);
                /* Only -ENOEXEC moves on to the next format. */
                if (error != -ENOEXEC)
                        break;
        }
        read_unlock(&binfmt_lock);
exit:
        fput(file);
out:
        return error;
}
#endif /* #ifdef CONFIG_USELIB */

#ifdef CONFIG_MMU
/*
 * The nascent bprm->mm is not visible until exec_mmap() but it can
 * use a lot of memory, so account these pages in current->mm temporarily
 * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
 * change the counter back via acct_arg_size(0).
 */
static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
        struct mm_struct *cur_mm = current->mm;
        long delta;

        /* Nothing to charge against when the current task has no mm. */
        if (!cur_mm)
                return;

        /* Change relative to what was accounted by the previous call. */
        delta = (long)(pages - bprm->vma_pages);
        if (delta == 0)
                return;

        bprm->vma_pages = pages;
        add_mm_counter(cur_mm, MM_ANONPAGES, delta);
}

/*
 * Look up the page backing address @pos of the new mm's stack vma
 * (bprm->vma), for writing when @write is non-zero.
 *
 * Returns the page obtained from get_user_pages_remote(), or NULL if the
 * stack could not be expanded or no page was obtained.
 */
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
{
        struct page *page;
        struct vm_area_struct *vma = bprm->vma;
        struct mm_struct *mm = bprm->mm;
        int ret;

        /*
         * Avoid relying on expanding the stack down in GUP (which
         * does not work for STACK_GROWSUP anyway), and just do it
         * by hand ahead of time.
         */
        if (write && pos < vma->vm_start) {
                mmap_write_lock(mm);
                ret = expand_downwards(vma, pos);
                if (unlikely(ret < 0)) {
                        mmap_write_unlock(mm);
                        return NULL;
                }
                /* Both branches reach the lookup holding the lock for read. */
                mmap_write_downgrade(mm);
        } else
                mmap_read_lock(mm);

        /*
         * We are doing an exec().  'current' is the process
         * doing the exec and 'mm' is the new process's mm.
         */
        ret = get_user_pages_remote(mm, pos, 1,
                        write ? FOLL_WRITE : 0,
                        &page, NULL);
        mmap_read_unlock(mm);
        /* Anything but a positive return means no page was obtained. */
        if (ret <= 0)
                return NULL;

        /* After a write access, re-account the current size of the vma. */
        if (write)
                acct_arg_size(bprm, vma_pages(vma));

        return page;
}

/* Drop the reference on a page handed out by get_arg_page(). */
static void put_arg_page(struct page *page)
{
        put_page(page);
}

/* Intentionally empty in the CONFIG_MMU build: there is nothing to free here. */
static void free_arg_pages(struct linux_binprm *bprm)
{
}

/* Flush the cache for @page, mapped at address @pos in the stack vma bprm->vma. */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
                struct page *page)
{
        flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}

/*
 * Allocate the temporary one-page stack vma for bprm->mm and insert it at
 * the top of the address space (STACK_TOP_MAX).
 *
 * Returns 0 on success with bprm->vma and bprm->p set up.  On failure it
 * returns -ENOMEM, -EINTR or the error from ksm_execve() or
 * insert_vm_struct(); if the vma had been allocated it is freed and
 * bprm->vma reset to NULL.
 */
static int __bprm_mm_init(struct linux_binprm *bprm)
{
        int err;
        struct vm_area_struct *vma = NULL;
        struct mm_struct *mm = bprm->mm;

        bprm->vma = vma = vm_area_alloc(mm);
        if (!vma)
                return -ENOMEM;
        vma_set_anonymous(vma);

        if (mmap_write_lock_killable(mm)) {
                err = -EINTR;
                goto err_free;
        }

        /*
         * Need to be called with mmap write lock
         * held, to avoid race with ksmd.
         */
        err = ksm_execve(mm);
        if (err)
                goto err_ksm;

        /*
         * Place the stack at the largest stack address the architecture
         * supports. Later, we'll move this to an appropriate place. We don't
         * use STACK_TOP because that can depend on attributes which aren't
         * configured yet.
         */
        BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
        vma->vm_end = STACK_TOP_MAX;
        vma->vm_start = vma->vm_end - PAGE_SIZE;
        vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        err = insert_vm_struct(mm, vma);
        if (err)
                goto err;

        /* The mm now accounts for exactly one page, which is stack. */
        mm->stack_vm = mm->total_vm = 1;
        mmap_write_unlock(mm);
        /* Start one pointer-size below the top of the temporary stack. */
        bprm->p = vma->vm_end - sizeof(void *);
        return 0;
err:
        /* Insertion failed after ksm_execve() succeeded: undo it first. */
        ksm_exit(mm);
err_ksm:
        mmap_write_unlock(mm);
err_free:
        bprm->vma = NULL;
        vm_area_free(vma);
        return err;
}

/* CONFIG_MMU variant: a string length is valid if it does not exceed MAX_ARG_STRLEN. */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
        return len <= MAX_ARG_STRLEN;
}

#else

/* !CONFIG_MMU variant: argument page accounting is a no-op. */
static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
}

/*
 * !CONFIG_MMU variant: return the page of bprm->page[] that covers @pos.
 *
 * A missing page is allocated (zeroed) and stored in the array only when
 * @write is set; otherwise, or if the allocation fails, NULL is returned.
 */
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
{
        unsigned long idx = pos / PAGE_SIZE;
        struct page *pg = bprm->page[idx];

        /* Already present, or a read-only lookup: hand back what is there. */
        if (pg || !write)
                return pg;

        pg = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
        if (pg)
                bprm->page[idx] = pg;

        return pg;
}

/* !CONFIG_MMU variant: nothing to release, the body is empty. */
static void put_arg_page(struct page *page)
{
}

/* Free the page in slot @i of bprm->page[], if any, and clear the slot. */
static void free_arg_page(struct linux_binprm *bprm, int i)
{
        if (!bprm->page[i])
                return;

        __free_page(bprm->page[i]);
        bprm->page[i] = NULL;
}

/* Release every slot of bprm->page[], from index 0 up to MAX_ARG_PAGES - 1. */
static void free_arg_pages(struct linux_binprm *bprm)
{
        int slot = 0;

        while (slot < MAX_ARG_PAGES) {
                free_arg_page(bprm, slot);
                slot++;
        }
}

/* !CONFIG_MMU variant: no cache flush is performed, the body is empty. */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
                struct page *page)
{
}

/*
 * !CONFIG_MMU variant: only initialise bprm->p, one pointer-size below the
 * end of a MAX_ARG_PAGES-page area.  Always returns 0.
 */
static int __bprm_mm_init(struct linux_binprm *bprm)
{
        bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
        return 0;
}

/* !CONFIG_MMU variant: a string length is valid if it does not exceed bprm->p. */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
        return len <= bprm->p;
}

#endif /* CONFIG_MMU */

/*
 * Create a new mm_struct and populate it with a temporary stack
 * vm_area_struct.  We don't have enough context at this point to set the stack
 * flags, permissions, and offset, so we use temporary values.  We'll update
 * them later in setup_arg_pages().
 */
static int bprm_mm_init(struct linux_binprm *bprm)
{
        struct mm_struct *new_mm = mm_alloc();
        int rc;

        bprm->mm = new_mm;
        if (!new_mm)
                return -ENOMEM;

        /* Save current stack limit for all calculations made during exec. */
        task_lock(current->group_leader);
        bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
        task_unlock(current->group_leader);

        rc = __bprm_mm_init(bprm);
        if (!rc)
                return 0;

        /* Setting up the temporary stack failed: give the new mm back. */
        bprm->mm = NULL;
        mmdrop(new_mm);

        return rc;
}

/*
 * User-space pointer to an array of string pointers, in either the native
 * or (with CONFIG_COMPAT) the compat pointer layout.
 */
struct user_arg_ptr {
#ifdef CONFIG_COMPAT
        /* true when ptr.compat is the member to read */
        bool is_compat;
#endif
        union {
                /* array of native-width user pointers */
                const char __user *const __user *native;
#ifdef CONFIG_COMPAT
                /* array of compat_uptr_t entries */
                const compat_uptr_t __user *compat;
#endif
        } ptr;
};

/*
 * Fetch entry @nr of the user-space pointer array described by @argv.
 *
 * Returns the user pointer stored in that entry (converted with
 * compat_ptr() for a compat array), or ERR_PTR(-EFAULT) if the entry
 * could not be read.
 */
static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
{
        const char __user *entry;

#ifdef CONFIG_COMPAT
        if (unlikely(argv.is_compat)) {
                compat_uptr_t raw;

                if (get_user(raw, argv.ptr.compat + nr))
                        return ERR_PTR(-EFAULT);

                return compat_ptr(raw);
        }
#endif

        if (get_user(entry, argv.ptr.native + nr))
                return ERR_PTR(-EFAULT);

        return entry;
}

/*
 * count() counts the number of strings in array ARGV.
 */
static int count(struct user_arg_ptr argv, int max)
{
        int nr = 0;

        /* A NULL array counts as zero strings. */
        if (argv.ptr.native == NULL)
                return 0;

        while (1) {
                const char __user *p = get_user_arg_ptr(argv, nr);

                /* The NULL entry terminates the array. */
                if (!p)
                        return nr;

                if (IS_ERR(p))
                        return -EFAULT;

                /* Another string beyond the permitted maximum. */
                if (nr >= max)
                        return -E2BIG;
                nr++;

                if (fatal_signal_pending(current))
                        return -ERESTARTNOHAND;
                cond_resched();
        }
}

/*
 * Count the strings in the NULL-terminated kernel array @argv.
 *
 * Returns the count (0 for a NULL @argv), -E2BIG when an entry is found at
 * index MAX_ARG_STRINGS or beyond, or -ERESTARTNOHAND if a fatal signal is
 * pending.
 */
static int count_strings_kernel(const char *const *argv)
{
        int n = 0;

        if (!argv)
                return 0;

        while (argv[n]) {
                if (n >= MAX_ARG_STRINGS)
                        return -E2BIG;
                if (fatal_signal_pending(current))
                        return -ERESTARTNOHAND;
                cond_resched();
                n++;
        }
        return n;
}

/*
 * Compute bprm->argmin, the lowest address the argument and environment
 * strings may reach.  Returns 0 on success, or -E2BIG when the argv/envp
 * pointers alone would use up the whole budget.
 */
static int bprm_stack_limits(struct linux_binprm *bprm)
{
        unsigned long budget, ptr_bytes;

        /*
         * The argv+env strings get the smaller of 3/4 of _STK_LIM and 1/4
         * of the stack size limit.  That way:
         *  - the remaining binfmt code will not run out of stack space,
         *  - the program keeps a reasonable amount of stack to work from.
         */
        budget = _STK_LIM / 4 * 3;
        budget = min(budget, bprm->rlim_stack.rlim_cur / 4);

        /*
         * Historically up to 32 pages (ARG_MAX) of argument strings have
         * been supported even with small stacks, so never go below that.
         */
        budget = max_t(unsigned long, budget, ARG_MAX);

        /*
         * The argv and envp pointers themselves also end up on the stack,
         * but they are only stored much later, at a point where it is no
         * longer possible to tell the parent that the child ran out of
         * stack space.  Charge for them here so the failure is graceful.
         *
         * With argc == 0, still reserve room for one pointer: an empty
         * string will be added (bumping argc to 1) so that confused
         * userspace programs which assume argc can never be 0 do not start
         * processing from argv[1] and walk into envp by accident.
         * See do_execveat_common().
         */
        ptr_bytes = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
        if (ptr_bytes >= budget)
                return -E2BIG;

        bprm->argmin = bprm->p - (budget - ptr_bytes);
        return 0;
}

/*
 * 'copy_strings()' copies argument/environment strings from the old
 * process's memory to the new process's stack.  The call to get_user_pages()
 * ensures the destination page is created and not swapped out.
 */
static int copy_strings(int argc, struct user_arg_ptr argv,
                        struct linux_binprm *bprm)
{
        /* Page currently mapped, its kernel mapping, and its page-aligned pos. */
        struct page *kmapped_page = NULL;
        char *kaddr = NULL;
        unsigned long kpos = 0;
        int ret;

        /* Strings are taken from the last array entry down to entry 0. */
        while (argc-- > 0) {
                const char __user *str;
                int len;
                unsigned long pos;

                ret = -EFAULT;
                str = get_user_arg_ptr(argv, argc);
                if (IS_ERR(str))
                        goto out;

                /* A zero return is treated as a fault (ret is still -EFAULT). */
                len = strnlen_user(str, MAX_ARG_STRLEN);
                if (!len)
                        goto out;

                ret = -E2BIG;
                if (!valid_arg_len(bprm, len))
                        goto out;

                /* We're going to work our way backwards. */
                pos = bprm->p;
                str += len;
                bprm->p -= len;
#ifdef CONFIG_MMU
                /* Fail with -E2BIG once the strings would go below argmin. */
                if (bprm->p < bprm->argmin)
                        goto out;
#endif

                while (len > 0) {
                        int offset, bytes_to_copy;

                        if (fatal_signal_pending(current)) {
                                ret = -ERESTARTNOHAND;
                                goto out;
                        }
                        cond_resched();

                        /*
                         * Copy at most down to the start of the page that
                         * holds the byte just below pos; an offset of 0
                         * means a whole page is available below pos.
                         */
                        offset = pos % PAGE_SIZE;
                        if (offset == 0)
                                offset = PAGE_SIZE;

                        bytes_to_copy = offset;
                        if (bytes_to_copy > len)
                                bytes_to_copy = len;

                        /* Step source, destination and remaining length back. */
                        offset -= bytes_to_copy;
                        pos -= bytes_to_copy;
                        str -= bytes_to_copy;
                        len -= bytes_to_copy;

                        /* Switch mappings only when pos lands in another page. */
                        if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
                                struct page *page;

                                page = get_arg_page(bprm, pos, 1);
                                if (!page) {
                                        ret = -E2BIG;
                                        goto out;
                                }

                                /* Flush, unmap and release the previous page. */
                                if (kmapped_page) {
                                        flush_dcache_page(kmapped_page);
                                        kunmap_local(kaddr);
                                        put_arg_page(kmapped_page);
                                }
                                kmapped_page = page;
                                kaddr = kmap_local_page(kmapped_page);
                                kpos = pos & PAGE_MASK;
                                flush_arg_page(bprm, kpos, kmapped_page);
                        }
                        if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
                                ret = -EFAULT;
                                goto out;
                        }
                }
        }
        ret = 0;
out:
        /* Success and failure both release the page that is still mapped. */
        if (kmapped_page) {
                flush_dcache_page(kmapped_page);
                kunmap_local(kaddr);
                put_arg_page(kmapped_page);
        }
        return ret;
}

/*
 * Copy an argument/environment string from the kernel to the process's stack.
 */
int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
{
        /*
         * strnlen() cannot return a negative value, so after adding the
         * terminating NUL len is always at least 1.  The former
         * "len == 0" check could never trigger and has been dropped.
         */
        int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */;
        unsigned long pos = bprm->p;

        /* Rejects over-long strings, returning -E2BIG. */
        if (!valid_arg_len(bprm, len))
                return -E2BIG;

        /* We're going to work our way backwards. */
        arg += len;
        bprm->p -= len;
        if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin)
                return -E2BIG;

        while (len > 0) {
                /*
                 * Copy at most down to the start of the page holding the
                 * byte just below pos; when pos is page aligned a whole
                 * page is available.
                 */
                unsigned int bytes_to_copy = min_t(unsigned int, len,
                                min_not_zero(offset_in_page(pos), PAGE_SIZE));
                struct page *page;

                pos -= bytes_to_copy;
                arg -= bytes_to_copy;
                len -= bytes_to_copy;

                page = get_arg_page(bprm, pos, 1);
                if (!page)
                        return -E2BIG;
                flush_arg_page(bprm, pos & PAGE_MASK, page);
                memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy);
                put_arg_page(page);
        }

        return 0;
}
EXPORT_SYMBOL(copy_string_kernel);

/*
 * Push the kernel strings argv[argc - 1] down to argv[0], in that order,
 * onto the argument area of @bprm via copy_string_kernel().
 *
 * Returns 0 when every string was copied, the negative error reported by
 * copy_string_kernel(), or -ERESTARTNOHAND if a fatal signal is pending
 * after a string has been copied.
 */
static int copy_strings_kernel(int argc, const char *const *argv,
                               struct linux_binprm *bprm)
{
        for (; argc > 0; argc--) {
                int err = copy_string_kernel(argv[argc - 1], bprm);

                if (err < 0)
                        return err;
                if (fatal_signal_pending(current))
                        return -ERESTARTNOHAND;
                /* Give other tasks a chance between strings. */
                cond_resched();
        }

        return 0;
}

#ifdef CONFIG_MMU

/*
 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
 * the binfmt code determines where the new stack should reside, we shift it to
 * its final location.  The process proceeds as follows:
 *
 * 1) Use shift to calculate the new vma endpoints.
 * 2) Extend vma to cover both the old and new ranges.  This ensures the
 *    arguments passed to subsequent functions are consistent.
 * 3) Move vma's page tables to the new range.
 * 4) Free up any cleared pgd range.
 * 5) Shrink the vma to cover only the new range.
 *
 * @vma:   the temporary stack vma to relocate.
 * @shift: number of bytes by which the vma is moved towards lower addresses.
 *
 * Returns 0 on success, -EFAULT if the first vma found from new_start is not
 * @vma itself, -ENOMEM if expanding the vma or moving the page tables fails,
 * or whatever vma_shrink() returns.
 */
static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long old_start = vma->vm_start;
        unsigned long old_end = vma->vm_end;
        unsigned long length = old_end - old_start;
        unsigned long new_start = old_start - shift;
        unsigned long new_end = old_end - shift;
        VMA_ITERATOR(vmi, mm, new_start);
        struct vm_area_struct *next;
        struct mmu_gather tlb;

        /* The subtraction of shift must not have wrapped the range. */
        BUG_ON(new_start > new_end);

        /*
         * ensure there are no vmas between where we want to go
         * and where we are
         */
        if (vma != vma_next(&vmi))
                return -EFAULT;

        vma_iter_prev_range(&vmi);
        /*
         * cover the whole range: [new_start, old_end)
         */
        if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL))
                return -ENOMEM;

        /*
         * move the page tables downwards, on failure we rely on
         * process cleanup to remove whatever mess we made.
         */
        if (length != move_page_tables(vma, old_start,
                                       vma, new_start, length, false, true))
                return -ENOMEM;

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm);
        /* The vma following ours, if any, bounds the page-table ceiling. */
        next = vma_next(&vmi);
        if (new_end > old_start) {
                /*
                 * when the old and new regions overlap clear from new_end.
                 */
                free_pgd_range(&tlb, new_end, old_end, new_end,
                        next ? next->vm_start : USER_PGTABLES_CEILING);
        } else {
                /*
                 * otherwise, clean from old_start; this is done to not touch
                 * the address space in [new_end, old_start) some architectures
                 * have constraints on va-space that make this illegal (IA64) -
                 * for the others it's just a little faster.
                 */
                free_pgd_range(&tlb, old_start, old_end, new_end,
                        next ? next->vm_start : USER_PGTABLES_CEILING);
        }
        tlb_finish_mmu(&tlb);

        vma_prev(&vmi);
        /* Shrink the vma to just the new range */
        return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
}

/*
 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 * the stack is optionally relocated, and some extra space is added.
 *
 * @bprm:             binprm whose ->vma is the temporary stack; ->p, ->exec
 *                    and (when set) ->loader are adjusted by the stack shift.
 * @stack_top:        requested top-of-stack address.
 * @executable_stack: EXSTACK_ENABLE_X forces VM_EXEC on, EXSTACK_DISABLE_X
 *                    forces it off, any other value keeps the VM_STACK_FLAGS
 *                    default.
 *
 * Returns 0 on success.  Fails with -ENOMEM when the argument area does not
 * fit the computed limits, -EINTR when the mmap write lock cannot be taken,
 * -EFAULT when expand_stack_locked() fails, or with the error reported by
 * mprotect_fixup() or shift_arg_pages().
 */
int setup_arg_pages(struct linux_binprm *bprm,
                    unsigned long stack_top,
                    int executable_stack)
{
        unsigned long ret;
        unsigned long stack_shift;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = bprm->vma;
        struct vm_area_struct *prev = NULL;
        unsigned long vm_flags;
        unsigned long stack_base;
        unsigned long stack_size;
        unsigned long stack_expand;
        unsigned long rlim_stack;
        struct mmu_gather tlb;
        struct vma_iterator vmi;

#ifdef CONFIG_STACK_GROWSUP
        /* Limit stack size */
        stack_base = bprm->rlim_stack.rlim_max;

        stack_base = calc_max_stack_size(stack_base);

        /* Add space for stack randomization. */
        stack_base += (STACK_RND_MASK << PAGE_SHIFT);

        /* Make sure we didn't let the argument array grow too large. */
        if (vma->vm_end - vma->vm_start > stack_base)
                return -ENOMEM;

        stack_base = PAGE_ALIGN(stack_top - stack_base);

        /* Distance the vma has to move down so that it starts at stack_base. */
        stack_shift = vma->vm_start - stack_base;
        mm->arg_start = bprm->p - stack_shift;
        bprm->p = vma->vm_end - stack_shift;
#else
        stack_top = arch_align_stack(stack_top);
        stack_top = PAGE_ALIGN(stack_top);

        /* The stack must fit between mmap_min_addr and stack_top. */
        if (unlikely(stack_top < mmap_min_addr) ||
            unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
                return -ENOMEM;

        /* Distance the vma has to move down so that it ends at stack_top. */
        stack_shift = vma->vm_end - stack_top;

        bprm->p -= stack_shift;
        mm->arg_start = bprm->p;
#endif

        /* Addresses recorded against the temporary stack move with it. */
        if (bprm->loader)
                bprm->loader -= stack_shift;
        bprm->exec -= stack_shift;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        vm_flags = VM_STACK_FLAGS;

        /*
         * Adjust stack execute permissions; explicitly enable for
         * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
         * (arch default) otherwise.
         */
        if (unlikely(executable_stack == EXSTACK_ENABLE_X))
                vm_flags |= VM_EXEC;
        else if (executable_stack == EXSTACK_DISABLE_X)
                vm_flags &= ~VM_EXEC;
        vm_flags |= mm->def_flags;
        /* Cleared again below once the stack has reached its final place. */
        vm_flags |= VM_STACK_INCOMPLETE_SETUP;

        vma_iter_init(&vmi, mm, vma->vm_start);

        tlb_gather_mmu(&tlb, mm);
        ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end,
                        vm_flags);
        tlb_finish_mmu(&tlb);

        if (ret)
                goto out_unlock;
        BUG_ON(prev != vma);

        if (unlikely(vm_flags & VM_EXEC)) {
                pr_warn_once("process '%pD4' started with executable stack\n",
                             bprm->file);
        }

        /* Move stack pages down in memory. */
        if (stack_shift) {
                ret = shift_arg_pages(vma, stack_shift);
                if (ret)
                        goto out_unlock;
        }

        /* mprotect_fixup is overkill to remove the temporary stack flags */
        vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP);

        stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
        stack_size = vma->vm_end - vma->vm_start;
        /*
         * Align this down to a page boundary as expand_stack
         * will align it up.
         */
        rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;

        /* Never grow beyond the (page aligned) current stack rlimit. */
        stack_expand = min(rlim_stack, stack_size + stack_expand);

#ifdef CONFIG_STACK_GROWSUP
        stack_base = vma->vm_start + stack_expand;
#else
        stack_base = vma->vm_end - stack_expand;
#endif
        current->mm->start_stack = bprm->p;
        ret = expand_stack_locked(vma, stack_base);
        /* Any expansion failure is reported to the caller as -EFAULT. */
        if (ret)
                ret = -EFAULT;

out_unlock:
        mmap_write_unlock(mm);
        return ret;
}
EXPORT_SYMBOL(setup_arg_pages);

#else

/*
 * Transfer the program arguments and environment from the holding pages
 * onto the stack. The provided stack pointer is adjusted accordingly.
 *
 * @bprm:        binprm whose ->page[] array holds the strings, starting at
 *               offset bprm->p and running to the end of the last page.
 * @sp_location: on entry, the stack pointer to copy below; on success it is
 *               updated to the lowest address written.  It is left untouched
 *               on failure.
 *
 * Returns 0 on success or -EFAULT if copy_to_user() fails.
 */
int transfer_args_to_stack(struct linux_binprm *bprm,
                           unsigned long *sp_location)
{
        unsigned long index, stop, sp;
        int ret = 0;

        stop = bprm->p >> PAGE_SHIFT;
        sp = *sp_location;

        /*
         * Walk pages MAX_ARG_PAGES - 1 down to stop.  index is unsigned, so
         * the loop tests before decrementing: an "index >= stop" condition
         * can never become false when stop is 0 and would run off the start
         * of bprm->page[].
         */
        for (index = MAX_ARG_PAGES; index-- > stop; ) {
                /* Only the lowest page is entered part-way in. */
                unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
                char *src = kmap_local_page(bprm->page[index]) + offset;

                sp -= PAGE_SIZE - offset;
                if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
                        ret = -EFAULT;
                /* Unmap before bailing out so the mapping is never leaked. */
                kunmap_local(src);
                if (ret)
                        goto out;
        }

        bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE;
        *sp_location = sp;

out:
        return ret;
}
EXPORT_SYMBOL(transfer_args_to_stack);

#endif /* CONFIG_MMU */

/*
 * Open @name relative to @fd for execution.
 *
 * Only AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH are accepted in @flags; any
 * other bit yields ERR_PTR(-EINVAL).  The file is opened read-only with
 * __FMODE_EXEC, must be a regular file on a mount that permits exec, and
 * has write access denied for as long as it stays open.
 *
 * Returns the opened file or an ERR_PTR().  On success, caller must call
 * do_close_execat() on the returned struct file to close it.
 */
static struct file *do_open_execat(int fd, struct filename *name, int flags)
{
        struct open_flags open_exec_flags = {
                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
                .acc_mode = MAY_EXEC,
                .intent = LOOKUP_OPEN,
                .lookup_flags = LOOKUP_FOLLOW,
        };
        struct file *file;
        int err;

        if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return ERR_PTR(-EINVAL);

        if (flags & AT_SYMLINK_NOFOLLOW)
                open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                open_exec_flags.lookup_flags |= LOOKUP_EMPTY;

        file = do_filp_open(fd, name, &open_exec_flags);
        if (IS_ERR(file))
                return file;

        /*
         * may_open() has already checked for this, so it should be
         * impossible to trip now. But we need to be extra cautious
         * and check again at the very end too.
         */
        if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
                         path_noexec(&file->f_path))) {
                err = -EACCES;
                goto drop_file;
        }

        err = deny_write_access(file);
        if (err)
                goto drop_file;

        return file;

drop_file:
        fput(file);
        return ERR_PTR(err);
}

/**
 * open_exec - Open a path name for execution
 *
 * @name: kernel path name to open with the intent of executing it.
 *
 * The name is looked up relative to AT_FDCWD with no extra lookup flags.
 *
 * Returns ERR_PTR on failure or allocated struct file on success.
 *
 * As this is a wrapper for the internal do_open_execat(), callers
 * must call allow_write_access() before fput() on release. Also see
 * do_close_execat().
 */
struct file *open_exec(const char *name)
{
        struct filename *filename = getname_kernel(name);
        struct file *f;

        /* Propagate a failed name lookup as the file error. */
        if (IS_ERR(filename))
                return ERR_CAST(filename);

        f = do_open_execat(AT_FDCWD, filename, 0);
        putname(filename);

        return f;
}
EXPORT_SYMBOL(open_exec);

#if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC)
/*
 * Read @len bytes of @file, starting at file offset @pos, to the user
 * address @addr.  When anything was read, the instruction cache is flushed
 * for the whole requested range [addr, addr + len).
 *
 * Returns whatever vfs_read() returned.
 */
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
        ssize_t nread = vfs_read(file, (void __user *)addr, len, &pos);

        if (nread <= 0)
                return nread;

        flush_icache_user_range(addr, addr + len);
        return nread;
}
EXPORT_SYMBOL(read_code);
#endif

/*
 * Maps the mm_struct mm into the current task struct.
 * On success, this function returns with exec_update_lock
 * held for writing.
 *
 * @mm: the new address space to install as both ->mm and ->active_mm of
 *      the current task.
 *
 * Returns 0 on success, or the error from down_write_killable() /
 * mmap_read_lock_killable(); in both failure cases exec_update_lock is not
 * held on return.
 */
static int exec_mmap(struct mm_struct *mm)
{
        struct task_struct *tsk;
        struct mm_struct *old_mm, *active_mm;
        int ret;

        /* Notify parent that we're no longer interested in the old VM */
        tsk = current;
        old_mm = current->mm;
        exec_mm_release(tsk, old_mm);

        ret = down_write_killable(&tsk->signal->exec_update_lock);
        if (ret)
                return ret;

        if (old_mm) {
                /*
                 * If there is a pending fatal signal perhaps a signal
                 * whose default action is to create a coredump get
                 * out and die instead of going through with the exec.
                 */
                ret = mmap_read_lock_killable(old_mm);
                if (ret) {
                        up_write(&tsk->signal->exec_update_lock);
                        return ret;
                }
        }

        task_lock(tsk);
        membarrier_exec_mmap(mm);

        local_irq_disable();
        /* Remember the previous active_mm for activate_mm() and the drop below. */
        active_mm = tsk->active_mm;
        tsk->active_mm = mm;
        tsk->mm = mm;
        mm_init_cid(mm);
        /*
         * This prevents preemption while active_mm is being loaded and
         * it and mm are being updated, which could cause problems for
         * lazy tlb mm refcounting when these are updated by context
         * switches. Not all architectures can handle irqs off over
         * activate_mm yet.
         */
        if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
                local_irq_enable();
        activate_mm(active_mm, mm);
        if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
                local_irq_enable();
        lru_gen_add_mm(mm);
        task_unlock(tsk);
        lru_gen_use_mm(mm);
        if (old_mm) {
                /* Drop the read lock taken above, then release the old mm. */
                mmap_read_unlock(old_mm);
                BUG_ON(active_mm != old_mm);
                setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
                mm_update_next_owner(old_mm);
                mmput(old_mm);
                return 0;
        }
        /* No old mm: only the lazy-tlb reference on active_mm is dropped. */
        mmdrop_lazy_tlb(active_mm);
        return 0;
}

/*
 * Make @tsk the only thread in its thread group, and the group leader.
 *
 * All other threads are zapped and waited for.  If @tsk is not the leader,
 * it then waits for the old leader to exit, takes over its pids and
 * start time, and releases it.
 *
 * Returns 0 on success, or -EAGAIN if another group exit/exec is already in
 * progress or a fatal signal arrives while waiting.
 */
static int de_thread(struct task_struct *tsk)
{
        struct signal_struct *sig = tsk->signal;
        struct sighand_struct *oldsighand = tsk->sighand;
        spinlock_t *lock = &oldsighand->siglock;

        if (thread_group_empty(tsk))
                goto no_thread_group;

        /*
         * Kill all other threads in the thread group.
         */
        spin_lock_irq(lock);
        if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) {
                /*
                 * Another group action in progress, just
                 * return so that the signal is processed.
                 */
                spin_unlock_irq(lock);
                return -EAGAIN;
        }

        sig->group_exec_task = tsk;
        sig->notify_count = zap_other_threads(tsk);
        /* A non-leader does not wait for the leader in the loop below. */
        if (!thread_group_leader(tsk))
                sig->notify_count--;

        /* Sleep (killable) until notify_count has dropped to zero. */
        while (sig->notify_count) {
                __set_current_state(TASK_KILLABLE);
                spin_unlock_irq(lock);
                schedule();
                if (__fatal_signal_pending(tsk))
                        goto killed;
                spin_lock_irq(lock);
        }
        spin_unlock_irq(lock);

        /*
         * At this point all other threads have exited, all we have to
         * do is to wait for the thread group leader to become inactive,
         * and to assume its PID:
         */
        if (!thread_group_leader(tsk)) {
                struct task_struct *leader = tsk->group_leader;

                /*
                 * Leaves the loop via break with tasklist_lock write-held
                 * and the cgroup threadgroup change still open.
                 */
                for (;;) {
                        cgroup_threadgroup_change_begin(tsk);
                        write_lock_irq(&tasklist_lock);
                        /*
                         * Do this under tasklist_lock to ensure that
                         * exit_notify() can't miss ->group_exec_task
                         */
                        sig->notify_count = -1;
                        if (likely(leader->exit_state))
                                break;
                        __set_current_state(TASK_KILLABLE);
                        write_unlock_irq(&tasklist_lock);
                        cgroup_threadgroup_change_end(tsk);
                        schedule();
                        if (__fatal_signal_pending(tsk))
                                goto killed;
                }

                /*
                 * The only record we have of the real-time age of a
                 * process, regardless of execs it's done, is start_time.
                 * All the past CPU time is accumulated in signal_struct
                 * from sister threads now dead.  But in this non-leader
                 * exec, nothing survives from the original leader thread,
                 * whose birth marks the true age of this process now.
                 * When we take on its identity by switching to its PID, we
                 * also take its birthdate (always earlier than our own).
                 */
                tsk->start_time = leader->start_time;
                tsk->start_boottime = leader->start_boottime;

                BUG_ON(!same_thread_group(leader, tsk));
                /*
                 * An exec() starts a new thread group with the
                 * TGID of the previous thread group. Rehash the
                 * two threads with a switched PID, and release
                 * the former thread group leader:
                 */

                /* Become a process group leader with the old leader's pid.
                 * The old leader becomes a thread of this thread group.
                 */
                exchange_tids(tsk, leader);
                transfer_pid(leader, tsk, PIDTYPE_TGID);
                transfer_pid(leader, tsk, PIDTYPE_PGID);
                transfer_pid(leader, tsk, PIDTYPE_SID);

                /* Take the old leader's place on the tasks and sibling lists. */
                list_replace_rcu(&leader->tasks, &tsk->tasks);
                list_replace_init(&leader->sibling, &tsk->sibling);

                tsk->group_leader = tsk;
                leader->group_leader = tsk;

                tsk->exit_signal = SIGCHLD;
                leader->exit_signal = -1;

                BUG_ON(leader->exit_state != EXIT_ZOMBIE);
                leader->exit_state = EXIT_DEAD;
                /*
                 * We are going to release_task()->ptrace_unlink() silently,
                 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
                 * the tracer won't block again waiting for this thread.
                 */
                if (unlikely(leader->ptrace))
                        __wake_up_parent(leader, leader->parent);
                write_unlock_irq(&tasklist_lock);
                cgroup_threadgroup_change_end(tsk);

                release_task(leader);
        }

        /* The group exec is over: clear the in-progress markers. */
        sig->group_exec_task = NULL;
        sig->notify_count = 0;

no_thread_group:
        /* we have changed execution domain */
        tsk->exit_signal = SIGCHLD;

        BUG_ON(!thread_group_leader(tsk));
        return 0;

killed:
        /* protects against exit_notify() and __exit_signal() */
        read_lock(&tasklist_lock);
        sig->group_exec_task = NULL;
        sig->notify_count = 0;
        read_unlock(&tasklist_lock);
        return -EAGAIN;
}


/*
 * Give the task @me a signal handler table of its own, so that
 * flush_signal_handlers can later reset the handlers without disturbing
 * other processes.  (Other processes might share the signal table via the
 * CLONE_SIGHAND option to clone().)
 *
 * Returns 0 on success (including when the table was not shared to begin
 * with) or -ENOMEM if a new table cannot be allocated.
 */
static int unshare_sighand(struct task_struct *me)
{
        struct sighand_struct *shared = me->sighand;
        struct sighand_struct *private;

        /* A reference count of one means nobody else uses this table. */
        if (refcount_read(&shared->count) == 1)
                return 0;

        /*
         * This ->sighand is shared with the CLONE_SIGHAND
         * but not CLONE_THREAD task, switch to the new one.
         */
        private = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
        if (!private)
                return -ENOMEM;

        refcount_set(&private->count, 1);

        /* Copy the actions and publish the new table under both locks. */
        write_lock_irq(&tasklist_lock);
        spin_lock(&shared->siglock);
        memcpy(private->action, shared->action, sizeof(private->action));
        rcu_assign_pointer(me->sighand, private);
        spin_unlock(&shared->siglock);
        write_unlock_irq(&tasklist_lock);

        /* Drop this task's reference on the table it used to share. */
        __cleanup_sighand(shared);
        return 0;
}

/*
 * Copy the comm (name) of @tsk into @buf under task_lock().
 *
 * @buf:      destination buffer.
 * @buf_size: size of @buf in bytes.
 * @tsk:      task whose ->comm is read.
 *
 * Returns @buf.
 */
char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
        task_lock(tsk);
        /* Always NUL terminated and zero-padded */
        strscpy_pad(buf, tsk->comm, buf_size);
        task_unlock(tsk);
        return buf;
}
EXPORT_SYMBOL_GPL(__get_task_comm);

/*
 * These functions flush out all traces of the currently running executable
 * so that a new one can be started.
 */

/*
 * Set the comm (name) of @tsk to @buf.
 *
 * @tsk:  task to rename.
 * @buf:  new name; copied into the fixed-size tsk->comm with strscpy_pad().
 * @exec: passed through to perf_event_comm().
 *
 * The rename tracepoint fires with the new name before ->comm is
 * overwritten; both happen under task_lock().  perf is notified after the
 * lock has been dropped.
 */
void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
        task_lock(tsk);
        trace_task_rename(tsk, buf);
        strscpy_pad(tsk->comm, buf, sizeof(tsk->comm));
        task_unlock(tsk);
        perf_event_comm(tsk, exec);
}

/*
 * Calling this is the point of no return. None of the failures will be
 * seen by userspace since either the process is already taking a fatal
 * signal (via de_thread() or coredump), or will have SEGV raised
 * (after exec_mmap()) by search_binary_handler (see below).
 *
 * @bprm: the binprm describing the new program.  On success ->mm and ->cred
 *        have been consumed (set to NULL) and, when ->have_execfd is set,
 *        ->executable has been installed as file descriptor ->execfd.
 *
 * Returns 0 on success or a negative error.  Failures after exec_mmap()
 * release exec_update_lock before returning; cred_guard_mutex is released
 * here only if the new credentials were already committed.
 */
int begin_new_exec(struct linux_binprm * bprm)
{
        struct task_struct *me = current;
        int retval;

        /* Once we are committed compute the creds */
        retval = bprm_creds_from_file(bprm);
        if (retval)
                return retval;

        /*
         * This tracepoint marks the point before flushing the old exec where
         * the current task is still unchanged, but errors are fatal (point of
         * no return). The later "sched_process_exec" tracepoint is called after
         * the current task has successfully switched to the new exec.
         */
        trace_sched_prepare_exec(current, bprm);

        /*
         * Ensure all future errors are fatal.
         */
        bprm->point_of_no_return = true;

        /*
         * Make this the only thread in the thread group.
         */
        retval = de_thread(me);
        if (retval)
                goto out;

        /*
         * Cancel any io_uring activity across execve
         */
        io_uring_task_cancel();

        /* Ensure the files table is not shared. */
        retval = unshare_files();
        if (retval)
                goto out;

        /*
         * Must be called _before_ exec_mmap() as bprm->mm is
         * not visible until then. Doing it here also ensures
         * we don't race against replace_mm_exe_file().
         */
        retval = set_mm_exe_file(bprm->mm, bprm->file);
        if (retval)
                goto out;

        /* If the binary is not readable then enforce mm->dumpable=0 */
        would_dump(bprm, bprm->file);
        if (bprm->have_execfd)
                would_dump(bprm, bprm->executable);

        /*
         * Release all of the old mmap stuff
         */
        acct_arg_size(bprm, 0);
        retval = exec_mmap(bprm->mm);
        if (retval)
                goto out;

        /* The mm now belongs to the task; bprm no longer owns it. */
        bprm->mm = NULL;

        /* From here on exec_update_lock is held, so failures use out_unlock. */
        retval = exec_task_namespaces();
        if (retval)
                goto out_unlock;

#ifdef CONFIG_POSIX_TIMERS
        spin_lock_irq(&me->sighand->siglock);
        posix_cpu_timers_exit(me);
        spin_unlock_irq(&me->sighand->siglock);
        exit_itimers(me);
        flush_itimer_signals();
#endif

        /*
         * Make the signal table private.
         */
        retval = unshare_sighand(me);
        if (retval)
                goto out_unlock;

        me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC |
                                        PF_NOFREEZE | PF_NO_SETAFFINITY);
        flush_thread();
        me->personality &= ~bprm->per_clear;

        clear_syscall_work_syscall_user_dispatch(me);

        /*
         * We have to apply CLOEXEC before we change whether the process is
         * dumpable (in setup_new_exec) to avoid a race with a process in userspace
         * trying to access the should-be-closed file descriptors of a process
         * undergoing exec(2).
         */
        do_close_on_exec(me->files);

        if (bprm->secureexec) {
                /* Make sure parent cannot signal privileged process. */
                me->pdeath_signal = 0;

                /*
                 * For secureexec, reset the stack limit to sane default to
                 * avoid bad behavior from the prior rlimits. This has to
                 * happen before arch_pick_mmap_layout(), which examines
                 * RLIMIT_STACK, but after the point of no return to avoid
                 * needing to clean up the change on failure.
                 */
                if (bprm->rlim_stack.rlim_cur > _STK_LIM)
                        bprm->rlim_stack.rlim_cur = _STK_LIM;
        }

        /* The alternate signal stack does not survive exec. */
        me->sas_ss_sp = me->sas_ss_size = 0;

        /*
         * Figure out dumpability. Note that this checking only of current
         * is wrong, but userspace depends on it. This should be testing
         * bprm->secureexec instead.
         */
        if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
            !(uid_eq(current_euid(), current_uid()) &&
              gid_eq(current_egid(), current_gid())))
                set_dumpable(current->mm, suid_dumpable);
        else
                set_dumpable(current->mm, SUID_DUMP_USER);

        perf_event_exec();
        __set_task_comm(me, kbasename(bprm->filename), true);

        /* An exec changes our domain. We are no longer part of the thread
           group */
        WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
        flush_signal_handlers(me, 0);

        retval = set_cred_ucounts(bprm->cred);
        if (retval < 0)
                goto out_unlock;

        /*
         * install the new credentials for this executable
         */
        security_bprm_committing_creds(bprm);

        commit_creds(bprm->cred);
        /* Committed: bprm no longer owns the credentials. */
        bprm->cred = NULL;

        /*
         * Disable monitoring for regular users
         * when executing setuid binaries. Must
         * wait until new credentials are committed
         * by commit_creds() above
         */
        if (get_dumpable(me->mm) != SUID_DUMP_USER)
                perf_event_exit_task(me);
        /*
         * cred_guard_mutex must be held at least to this point to prevent
         * ptrace_attach() from altering our determination of the task's
         * credentials; any time after this it may be unlocked.
         */
        security_bprm_committed_creds(bprm);

        /* Pass the opened binary to the interpreter. */
        if (bprm->have_execfd) {
                retval = get_unused_fd_flags(0);
                if (retval < 0)
                        goto out_unlock;
                fd_install(retval, bprm->executable);
                /* The fd table owns the file reference from now on. */
                bprm->executable = NULL;
                bprm->execfd = retval;
        }
        return 0;

out_unlock:
        up_write(&me->signal->exec_update_lock);
        /*
         * While bprm->cred is still set, free_bprm() drops cred_guard_mutex;
         * once the creds have been committed it has to be dropped here.
         */
        if (!bprm->cred)
                mutex_unlock(&me->signal->cred_guard_mutex);

out:
        return retval;
}
EXPORT_SYMBOL(begin_new_exec);

/*
 * If @file is not readable according to inode_permission(), flag @bprm with
 * BINPRM_FLAGS_ENFORCE_NONDUMP and move bprm->mm->user_ns up the namespace
 * hierarchy until it is privileged over the file's inode (or init_user_ns
 * is reached).
 */
void would_dump(struct linux_binprm *bprm, struct file *file)
{
        struct inode *inode = file_inode(file);
        struct mnt_idmap *idmap = file_mnt_idmap(file);
        struct user_namespace *old, *user_ns;

        /* Readable binaries need no special treatment. */
        if (inode_permission(idmap, inode, MAY_READ) >= 0)
                return;

        bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;

        /* Ensure mm->user_ns contains the executable */
        old = bprm->mm->user_ns;
        for (user_ns = old; user_ns != &init_user_ns;
             user_ns = user_ns->parent) {
                if (privileged_wrt_inode_uidgid(user_ns, idmap, inode))
                        break;
        }

        /* Swap the reference only when the namespace actually changed. */
        if (user_ns != old) {
                bprm->mm->user_ns = get_user_ns(user_ns);
                put_user_ns(old);
        }
}
EXPORT_SYMBOL(would_dump);

/*
 * Finish the personality-dependent setup of the new program for the current
 * task, then release exec_update_lock and cred_guard_mutex.
 *
 * @bprm: binprm of the program being started; its ->rlim_stack is handed to
 *        arch_pick_mmap_layout().
 */
void setup_new_exec(struct linux_binprm * bprm)
{
        /* Setup things that can depend upon the personality */
        struct task_struct *me = current;

        arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);

        arch_setup_new_exec();

        /* Set the new mm task size. We have to do that late because it may
         * depend on TIF_32BIT which is only updated in flush_thread() on
         * some architectures like powerpc
         */
        me->mm->task_size = TASK_SIZE;
        up_write(&me->signal->exec_update_lock);
        mutex_unlock(&me->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(setup_new_exec);

/*
 * Runs immediately before start_thread() takes over.
 *
 * Copies bprm->rlim_stack into the current task's RLIMIT_STACK slot while
 * holding the group leader's task_lock().
 */
void finalize_exec(struct linux_binprm *bprm)
{
        /* Store any stack rlimit changes before starting thread. */
        task_lock(current->group_leader);
        current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
        task_unlock(current->group_leader);
}
EXPORT_SYMBOL(finalize_exec);

/*
 * Prepare credentials and lock ->cred_guard_mutex.
 *
 * On success the mutex stays locked and bprm->cred holds the new
 * credentials.  begin_new_exec() commits them, and setup_new_exec() drops
 * the lock.  If exec fails before that, free_bprm() aborts ->cred and
 * unlocks.
 *
 * Returns 0 on success, -ERESTARTNOINTR if taking the mutex was interrupted,
 * or -ENOMEM if the credentials cannot be allocated (the mutex is released
 * again in that case).
 */
static int prepare_bprm_creds(struct linux_binprm *bprm)
{
        if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
                return -ERESTARTNOINTR;

        bprm->cred = prepare_exec_creds();
        if (unlikely(!bprm->cred)) {
                mutex_unlock(&current->signal->cred_guard_mutex);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Matches do_open_execat(): re-allow write access to @file and drop the
 * reference.  A NULL @file is ignored.
 */
static void do_close_execat(struct file *file)
{
        if (file) {
                allow_write_access(file);
                fput(file);
        }
}

/*
 * Release everything still owned by @bprm, then @bprm itself.
 *
 * Fields that were handed over elsewhere are expected to have been set to
 * NULL (as begin_new_exec() does for ->mm, ->cred and ->executable), so only
 * what is still attached is torn down here.
 */
static void free_bprm(struct linux_binprm *bprm)
{
        if (bprm->mm) {
                acct_arg_size(bprm, 0);
                mmput(bprm->mm);
        }
        free_arg_pages(bprm);
        /* Uncommitted creds mean cred_guard_mutex is still held by us. */
        if (bprm->cred) {
                mutex_unlock(&current->signal->cred_guard_mutex);
                abort_creds(bprm->cred);
        }
        do_close_execat(bprm->file);
        if (bprm->executable)
                fput(bprm->executable);
        /* If a binfmt changed the interp, free it. */
        if (bprm->interp != bprm->filename)
                kfree(bprm->interp);
        kfree(bprm->fdpath);
        kfree(bprm);
}

/*
 * Open the executable named by @fd / @filename / @flags and allocate a
 * linux_binprm for it.
 *
 * bprm->filename is the given name when @fd is AT_FDCWD or the name is
 * absolute; otherwise it is a "/dev/fd/..." path built from @fd, stored in
 * bprm->fdpath.  bprm->interp starts out equal to bprm->filename.
 *
 * Returns the new bprm, or an ERR_PTR() carrying the error from
 * do_open_execat(), -ENOMEM, or the error from bprm_mm_init().
 */
static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags)
{
        struct linux_binprm *bprm;
        struct file *file;
        int retval;

        file = do_open_execat(fd, filename, flags);
        if (IS_ERR(file))
                return ERR_CAST(file);

        bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
        if (!bprm) {
                do_close_execat(file);
                return ERR_PTR(-ENOMEM);
        }

        /* From here on free_bprm() is responsible for closing the file. */
        bprm->file = file;

        if (fd == AT_FDCWD || filename->name[0] == '/') {
                bprm->filename = filename->name;
        } else {
                if (filename->name[0] == '\0')
                        bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
                else
                        bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
                                                  fd, filename->name);
                if (!bprm->fdpath) {
                        retval = -ENOMEM;
                        goto out_free;
                }

                /*
                 * Record that a name derived from an O_CLOEXEC fd will be
                 * inaccessible after exec.  This allows the code in exec to
                 * choose to fail when the executable is not mmaped into the
                 * interpreter and an open file descriptor is not passed to
                 * the interpreter.  This makes for a better user experience
                 * than having the interpreter start and then immediately fail
                 * when it finds the executable is inaccessible.
                 */
                if (get_close_on_exec(fd))
                        bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;

                bprm->filename = bprm->fdpath;
        }
        bprm->interp = bprm->filename;

        retval = bprm_mm_init(bprm);
        if (retval)
                goto out_free;

        return bprm;

out_free:
        free_bprm(bprm);
        return ERR_PTR(retval);
}

/*
 * Replace bprm->interp with a freshly allocated copy of @interp.
 *
 * A previous interp is freed first, unless it is still the original
 * bprm->filename pointer.
 *
 * Returns 0 on success or -ENOMEM if the copy cannot be allocated, in which
 * case bprm->interp is left NULL.
 */
int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
{
        /* If a binfmt changed the interp, free it first. */
        if (bprm->interp != bprm->filename)
                kfree(bprm->interp);

        bprm->interp = kstrdup(interp, GFP_KERNEL);

        return bprm->interp ? 0 : -ENOMEM;
}
EXPORT_SYMBOL(bprm_change_interp);

/*
 * determine how safe it is to execute the proposed program
 * - the caller must hold ->cred_guard_mutex to protect against
 *   PTRACE_ATTACH or seccomp thread-sync
 *
 * Sets LSM_UNSAFE_* bits in bprm->unsafe for a traced task, a task with
 * no_new_privs set, and a task whose fs_struct has more users than the
 * threads of this group account for.  In the last case's absence,
 * current->fs->in_exec is set to 1.
 */
static void check_unsafe_exec(struct linux_binprm *bprm)
{
        struct task_struct *p = current, *t;
        unsigned n_fs;

        if (p->ptrace)
                bprm->unsafe |= LSM_UNSAFE_PTRACE;

        /*
         * This isn't strictly necessary, but it makes it harder for LSMs to
         * mess up.
         */
        if (task_no_new_privs(current))
                bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;

        /*
         * If another task is sharing our fs, we cannot safely
         * suid exec because the differently privileged task
         * will be able to manipulate the current directory, etc.
         * It would be nice to force an unshare instead...
         */
        /* Count ourselves, then every other thread using the same fs. */
        n_fs = 1;
        spin_lock(&p->fs->lock);
        rcu_read_lock();
        for_other_threads(p, t) {
                if (t->fs == p->fs)
                        n_fs++;
        }
        rcu_read_unlock();

        /* "users" and "in_exec" locked for copy_fs() */
        if (p->fs->users > n_fs)
                bprm->unsafe |= LSM_UNSAFE_SHARE;
        else
                p->fs->in_exec = 1;
        spin_unlock(&p->fs->lock);
}

/*
 * Apply the set-user-ID / set-group-ID bits of @file to bprm->cred.
 *
 * Nothing is changed when the mount does not allow suid
 * (mnt_may_suid() fails), when the task has no_new_privs set, when neither
 * S_ISUID nor S_ISGID is set on the inode, or when the file's owner or
 * group has no mapping in bprm->cred->user_ns.
 *
 * Otherwise S_ISUID sets bprm->cred->euid, and S_ISGID together with
 * S_IXGRP sets bprm->cred->egid; either case also ORs PER_CLEAR_ON_SETID
 * into bprm->per_clear.
 */
static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
{
        /* Handle suid and sgid on files */
        struct mnt_idmap *idmap;
        struct inode *inode = file_inode(file);
        unsigned int mode;
        vfsuid_t vfsuid;
        vfsgid_t vfsgid;

        if (!mnt_may_suid(file->f_path.mnt))
                return;

        if (task_no_new_privs(current))
                return;

        /* Lockless peek: skip taking the inode lock when no setid bit is set. */
        mode = READ_ONCE(inode->i_mode);
        if (!(mode & (S_ISUID|S_ISGID)))
                return;

        idmap = file_mnt_idmap(file);

        /* Be careful if suid/sgid is set */
        inode_lock(inode);

        /* reload atomically mode/uid/gid now that lock held */
        mode = inode->i_mode;
        vfsuid = i_uid_into_vfsuid(idmap, inode);
        vfsgid = i_gid_into_vfsgid(idmap, inode);
        inode_unlock(inode);

        /* We ignore suid/sgid if there are no mappings for them in the ns */
        if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) ||
            !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid))
                return;

        if (mode & S_ISUID) {
                bprm->per_clear |= PER_CLEAR_ON_SETID;
                bprm->cred->euid = vfsuid_into_kuid(vfsuid);
        }

        /* S_ISGID is honoured only when the group-execute bit is also set. */
        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
                bprm->per_clear |= PER_CLEAR_ON_SETID;
                bprm->cred->egid = vfsgid_into_kgid(vfsgid);
        }
}

/*
 * Compute bprm->cred based upon the final binary.
 *
 * The file consulted is bprm->executable when bprm->execfd_creds is set,
 * and bprm->file otherwise.  Returns whatever
 * security_bprm_creds_from_file() returns.
 */
static int bprm_creds_from_file(struct linux_binprm *bprm)
{
        struct file *file;

        /* Compute creds based on which file? */
        if (bprm->execfd_creds)
                file = bprm->executable;
        else
                file = bprm->file;

        bprm_fill_uid(bprm, file);

        return security_bprm_creds_from_file(bprm, file);
}

/*
 * Fill the binprm structure from the inode.
 * Read the first BINPRM_BUF_SIZE bytes
 *
 * This may be called multiple times for binary chains (scripts for example).
 */
static int prepare_binprm(struct linux_binprm *bprm)
{
        loff_t pos = 0;

        memset(bprm->buf, 0, BINPRM_BUF_SIZE);
        return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
}

/*
 * Arguments are '\0' separated strings found at the location bprm->p
 * points to; chop off the first by relocating bprm->p to right after
 * the first '\0' encountered.
 *
 * Does nothing when bprm->argc is 0.  Returns 0 on success, or -EFAULT
 * when an argument page cannot be obtained.  On success bprm->argc is
 * decremented.
 */
int remove_arg_zero(struct linux_binprm *bprm)
{
        unsigned long offset;
        char *kaddr;
        struct page *page;

        if (!bprm->argc)
                return 0;

        /* Scan one page at a time until the terminating '\0' is found. */
        do {
                offset = bprm->p & ~PAGE_MASK;
                page = get_arg_page(bprm, bprm->p, 0);
                if (!page)
                        return -EFAULT;
                kaddr = kmap_local_page(page);

                /* Advance bprm->p in step with offset over non-NUL bytes. */
                for (; offset < PAGE_SIZE && kaddr[offset];
                                offset++, bprm->p++)
                        ;

                kunmap_local(kaddr);
                put_arg_page(page);
        /* offset == PAGE_SIZE: the string continues into the next page. */
        } while (offset == PAGE_SIZE);

        /* Step over the terminating '\0' itself. */
        bprm->p++;
        bprm->argc--;

        return 0;
}
EXPORT_SYMBOL(remove_arg_zero);

/*
 * Return true when @c is a tab, a newline, or a byte in the range
 * 0x20..0x7e inclusive.
 *
 * Implemented as a function rather than a macro so that the argument is
 * evaluated exactly once, even if it has side effects.
 */
static inline bool printable(int c)
{
        return c == '\t' || c == '\n' || (c >= 0x20 && c <= 0x7e);
}
/*
 * cycle the list of binary formats handler, until one recognizes the image
 *
 * Reads the file header via prepare_binprm() and runs security_bprm_check()
 * first; a failure of either is returned directly.  Each registered format's
 * ->load_binary() is then tried in list order.  The walk stops at the first
 * handler that returns anything other than -ENOEXEC, or as soon as
 * bprm->point_of_no_return is set.
 *
 * If no handler accepted the image and CONFIG_MODULES is enabled, one
 * request_module("binfmt-%04x") attempt is made and the list is walked a
 * second time.
 */
static int search_binary_handler(struct linux_binprm *bprm)
{
        bool need_retry = IS_ENABLED(CONFIG_MODULES);
        struct linux_binfmt *fmt;
        int retval;

        retval = prepare_binprm(bprm);
        if (retval < 0)
                return retval;

        retval = security_bprm_check(bprm);
        if (retval)
                return retval;

        /* Result when the format list is empty or every module get fails. */
        retval = -ENOENT;
 retry:
        read_lock(&binfmt_lock);
        list_for_each_entry(fmt, &formats, lh) {
                if (!try_module_get(fmt->module))
                        continue;
                /* binfmt_lock is dropped across ->load_binary(). */
                read_unlock(&binfmt_lock);

                retval = fmt->load_binary(bprm);

                read_lock(&binfmt_lock);
                put_binfmt(fmt);
                if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
                        read_unlock(&binfmt_lock);
                        return retval;
                }
        }
        read_unlock(&binfmt_lock);

        if (need_retry) {
                /*
                 * Four printable leading bytes: no module is requested.
                 * NOTE(review): presumably because such a file is text
                 * rather than a binary format -- confirm.
                 */
                if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
                    printable(bprm->buf[2]) && printable(bprm->buf[3]))
                        return retval;
                /* The module name is built from the 16-bit value at buf[2..3]. */
                if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
                        return retval;
                /* Retry the list walk exactly once. */
                need_retry = false;
                goto retry;
        }

        return retval;
}

/*
 * binfmt handlers will call back into begin_new_exec() on success.
 *
 * Runs search_binary_handler() repeatedly, following each interpreter a
 * handler leaves in bprm->interpreter, until a handler finishes without
 * requesting another one.  Returns 0 on success, a negative error from
 * search_binary_handler(), -ELOOP when the chain is too deep, or -ENOEXEC
 * when a second file replacement happens while bprm->have_execfd is set.
 * On success the audit, trace, ptrace and proc-connector notifications
 * are issued.
 */
static int exec_binprm(struct linux_binprm *bprm)
{
        pid_t old_pid, old_vpid;
        int ret, depth;

        /* Need to fetch pid before load_binary changes it */
        old_pid = current->pid;
        rcu_read_lock();
        old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
        rcu_read_unlock();

        /*
         * The loop body runs for depth 0..5, so up to 5 interpreter
         * substitutions are followed; a 6th fails hard with -ELOOP.
         */
        for (depth = 0;; depth++) {
                struct file *exec;
                if (depth > 5)
                        return -ELOOP;

                ret = search_binary_handler(bprm);
                if (ret < 0)
                        return ret;
                /* No interpreter requested: the image has been loaded. */
                if (!bprm->interpreter)
                        break;

                /* Continue the search with the interpreter as the new file. */
                exec = bprm->file;
                bprm->file = bprm->interpreter;
                bprm->interpreter = NULL;

                allow_write_access(exec);
                if (unlikely(bprm->have_execfd)) {
                        /* Only one replaced file can be kept as ->executable. */
                        if (bprm->executable) {
                                fput(exec);
                                return -ENOEXEC;
                        }
                        bprm->executable = exec;
                } else
                        fput(exec);
        }

        audit_bprm(bprm);
        trace_sched_process_exec(current, old_pid, bprm);
        ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
        proc_exec_connector(current);
        return 0;
}

/*
 * Run the exec described by a fully populated @bprm.
 *
 * Prepares credentials, records unsafe-exec conditions, asks the LSM for
 * the exec credentials and then calls exec_binprm().  Returns the result
 * of exec_binprm() on success or the first error encountered.
 *
 * On both the success and the failure path current->fs->in_exec and
 * current->in_execve are cleared and sched_mm_cid_after_execve() is called.
 * If the failure happened after bprm->point_of_no_return was set, the task
 * is sent SIGSEGV unless a fatal signal is already pending.
 */
static int bprm_execve(struct linux_binprm *bprm)
{
        int retval;

        retval = prepare_bprm_creds(bprm);
        if (retval)
                return retval;

        /*
         * Check for unsafe execution states before exec_binprm(), which
         * will call back into begin_new_exec(), into bprm_creds_from_file(),
         * where setuid-ness is evaluated.
         */
        check_unsafe_exec(bprm);
        current->in_execve = 1;
        sched_mm_cid_before_execve(current);

        sched_exec();

        /* Set the unchanging part of bprm->cred */
        retval = security_bprm_creds_for_exec(bprm);
        if (retval)
                goto out;

        retval = exec_binprm(bprm);
        if (retval < 0)
                goto out;

        sched_mm_cid_after_execve(current);
        /* execve succeeded */
        current->fs->in_exec = 0;
        current->in_execve = 0;
        rseq_execve(current);
        user_events_execve(current);
        acct_update_integrals(current);
        task_numa_free(current, false);
        return retval;

out:
        /*
         * If past the point of no return ensure the code never
         * returns to the userspace process.  Use an existing fatal
         * signal if present otherwise terminate the process with
         * SIGSEGV.
         */
        if (bprm->point_of_no_return && !fatal_signal_pending(current))
                force_fatal_sig(SIGSEGV);

        /* Undo the in-exec markers set before the attempt. */
        sched_mm_cid_after_execve(current);
        current->fs->in_exec = 0;
        current->in_execve = 0;

        return retval;
}

/*
 * Common implementation behind the native and compat execve()/execveat()
 * entry points.
 *
 * @fd, @filename and @flags are handed to alloc_bprm(); @argv and @envp
 * are the user-space argument and environment vectors.
 *
 * @filename may be an ERR_PTR, in which case its error is returned
 * immediately.  Otherwise putname(filename) is called on every return
 * path, so the caller must not release it.
 *
 * Returns the result of bprm_execve(), or a negative error from an earlier
 * step (-EAGAIN when the RLIMIT_NPROC recheck fails).
 */
static int do_execveat_common(int fd, struct filename *filename,
                              struct user_arg_ptr argv,
                              struct user_arg_ptr envp,
                              int flags)
{
        struct linux_binprm *bprm;
        int retval;

        if (IS_ERR(filename))
                return PTR_ERR(filename);

        /*
         * We move the actual failure in case of RLIMIT_NPROC excess from
         * set*uid() to execve() because too many poorly written programs
         * don't check setuid() return code.  Here we additionally recheck
         * whether NPROC limit is still exceeded.
         */
        if ((current->flags & PF_NPROC_EXCEEDED) &&
            is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                retval = -EAGAIN;
                goto out_ret;
        }

        /* We're below the limit (still or again), so we don't want to make
         * further execve() calls fail. */
        current->flags &= ~PF_NPROC_EXCEEDED;

        bprm = alloc_bprm(fd, filename, flags);
        if (IS_ERR(bprm)) {
                retval = PTR_ERR(bprm);
                goto out_ret;
        }

        retval = count(argv, MAX_ARG_STRINGS);
        /* An empty argv is tolerated (argv[0] is synthesized below) but logged once. */
        if (retval == 0)
                pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
                             current->comm, bprm->filename);
        if (retval < 0)
                goto out_free;
        bprm->argc = retval;

        retval = count(envp, MAX_ARG_STRINGS);
        if (retval < 0)
                goto out_free;
        bprm->envc = retval;

        retval = bprm_stack_limits(bprm);
        if (retval < 0)
                goto out_free;

        /* Strings are copied in this order: filename, envp, then argv. */
        retval = copy_string_kernel(bprm->filename, bprm);
        if (retval < 0)
                goto out_free;
        bprm->exec = bprm->p;

        retval = copy_strings(bprm->envc, envp, bprm);
        if (retval < 0)
                goto out_free;

        retval = copy_strings(bprm->argc, argv, bprm);
        if (retval < 0)
                goto out_free;

        /*
         * When argv is empty, add an empty string ("") as argv[0] to
         * ensure confused userspace programs that start processing
         * from argv[1] won't end up walking envp. See also
         * bprm_stack_limits().
         */
        if (bprm->argc == 0) {
                retval = copy_string_kernel("", bprm);
                if (retval < 0)
                        goto out_free;
                bprm->argc = 1;
        }

        retval = bprm_execve(bprm);
        /* Success falls through: bprm and filename are released either way. */
out_free:
        free_bprm(bprm);

out_ret:
        putname(filename);
        return retval;
}

/*
 * Exec a program from kernel context using kernel-space strings.
 *
 * @kernel_filename is the path to execute, resolved relative to AT_FDCWD;
 * @argv and @envp are kernel-space string vectors.
 *
 * Returns -EINVAL (with a one-time warning) when called from a task with
 * PF_KTHREAD set or when @argv contains no strings, a negative error from
 * any setup step, or the result of bprm_execve().
 */
int kernel_execve(const char *kernel_filename,
                  const char *const *argv, const char *const *envp)
{
        struct filename *filename;
        struct linux_binprm *bprm;
        int fd = AT_FDCWD;
        int retval;

        /* It is non-sense for kernel threads to call execve */
        if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
                return -EINVAL;

        filename = getname_kernel(kernel_filename);
        if (IS_ERR(filename))
                return PTR_ERR(filename);

        bprm = alloc_bprm(fd, filename, 0);
        if (IS_ERR(bprm)) {
                retval = PTR_ERR(bprm);
                goto out_ret;
        }

        retval = count_strings_kernel(argv);
        /* Unlike the user-space path, an empty argv is rejected here. */
        if (WARN_ON_ONCE(retval == 0))
                retval = -EINVAL;
        if (retval < 0)
                goto out_free;
        bprm->argc = retval;

        retval = count_strings_kernel(envp);
        if (retval < 0)
                goto out_free;
        bprm->envc = retval;

        retval = bprm_stack_limits(bprm);
        if (retval < 0)
                goto out_free;

        /* Strings are copied in this order: filename, envp, then argv. */
        retval = copy_string_kernel(bprm->filename, bprm);
        if (retval < 0)
                goto out_free;
        bprm->exec = bprm->p;

        retval = copy_strings_kernel(bprm->envc, envp, bprm);
        if (retval < 0)
                goto out_free;

        retval = copy_strings_kernel(bprm->argc, argv, bprm);
        if (retval < 0)
                goto out_free;

        retval = bprm_execve(bprm);
        /* Success falls through: bprm and filename are released either way. */
out_free:
        free_bprm(bprm);
out_ret:
        putname(filename);
        return retval;
}

/*
 * Native execve(): wrap the user pointers as native user_arg_ptr values
 * and exec relative to AT_FDCWD with no flags.
 */
static int do_execve(struct filename *filename,
        const char __user *const __user *__argv,
        const char __user *const __user *__envp)
{
        struct user_arg_ptr argv = {
                .ptr = { .native = __argv },
        };
        struct user_arg_ptr envp = {
                .ptr = { .native = __envp },
        };

        return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}

/*
 * Native execveat(): wrap the user pointers as native user_arg_ptr values
 * and pass @fd and @flags through unchanged.
 */
static int do_execveat(int fd, struct filename *filename,
                const char __user *const __user *__argv,
                const char __user *const __user *__envp,
                int flags)
{
        struct user_arg_ptr argv = {
                .ptr = { .native = __argv },
        };
        struct user_arg_ptr envp = {
                .ptr = { .native = __envp },
        };

        return do_execveat_common(fd, filename, argv, envp, flags);
}

#ifdef CONFIG_COMPAT
/*
 * Compat execve(): wrap the compat user pointers, marked is_compat, and
 * exec relative to AT_FDCWD with no flags.
 */
static int compat_do_execve(struct filename *filename,
        const compat_uptr_t __user *__argv,
        const compat_uptr_t __user *__envp)
{
        struct user_arg_ptr argv = {
                .is_compat = true,
                .ptr = { .compat = __argv },
        };
        struct user_arg_ptr envp = {
                .is_compat = true,
                .ptr = { .compat = __envp },
        };

        return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}

/*
 * Compat execveat(): wrap the compat user pointers, marked is_compat, and
 * pass @fd and @flags through unchanged.
 */
static int compat_do_execveat(int fd, struct filename *filename,
                              const compat_uptr_t __user *__argv,
                              const compat_uptr_t __user *__envp,
                              int flags)
{
        struct user_arg_ptr argv = {
                .is_compat = true,
                .ptr = { .compat = __argv },
        };
        struct user_arg_ptr envp = {
                .is_compat = true,
                .ptr = { .compat = __envp },
        };

        return do_execveat_common(fd, filename, argv, envp, flags);
}
#endif

/*
 * Install @new as the binary format of current->mm.
 *
 * The module reference held for the previous format (if any) is dropped,
 * and a reference is taken on @new's module when @new is non-NULL.
 */
void set_binfmt(struct linux_binfmt *new)
{
        struct mm_struct *mm = current->mm;
        struct linux_binfmt *old = mm->binfmt;

        if (old)
                module_put(old->module);

        mm->binfmt = new;
        if (!new)
                return;

        __module_get(new->module);
}
EXPORT_SYMBOL(set_binfmt);

/*
 * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
 *
 * A @value above SUID_DUMP_ROOT (including any negative value, via the
 * unsigned comparison) triggers a warning and leaves mm->flags untouched.
 */
void set_dumpable(struct mm_struct *mm, int value)
{
        if (!WARN_ON((unsigned)value > SUID_DUMP_ROOT))
                set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
}

/* execve(2): copy in the path name and hand off to do_execve(). */
SYSCALL_DEFINE3(execve,
                const char __user *, filename,
                const char __user *const __user *, argv,
                const char __user *const __user *, envp)
{
        struct filename *name = getname(filename);

        /* do_execve() copes with an ERR_PTR name and releases it itself. */
        return do_execve(name, argv, envp);
}

/*
 * execveat(2): copy in the path name (honouring @flags via
 * getname_uflags()) and hand off to do_execveat().
 */
SYSCALL_DEFINE5(execveat,
                int, fd, const char __user *, filename,
                const char __user *const __user *, argv,
                const char __user *const __user *, envp,
                int, flags)
{
        struct filename *name = getname_uflags(filename, flags);

        return do_execveat(fd, name, argv, envp, flags);
}

#ifdef CONFIG_COMPAT
/* Compat execve(2): copy in the path name and hand off to compat_do_execve(). */
COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
        const compat_uptr_t __user *, argv,
        const compat_uptr_t __user *, envp)
{
        struct filename *name = getname(filename);

        return compat_do_execve(name, argv, envp);
}

/*
 * Compat execveat(2): copy in the path name (honouring @flags via
 * getname_uflags()) and hand off to compat_do_execveat().
 */
COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
                       const char __user *, filename,
                       const compat_uptr_t __user *, argv,
                       const compat_uptr_t __user *, envp,
                       int,  flags)
{
        struct filename *name = getname_uflags(filename, flags);

        return compat_do_execveat(fd, name, argv, envp, flags);
}
#endif

#ifdef CONFIG_SYSCTL

/*
 * sysctl handler: defer to proc_dointvec_minmax() and, when that succeeds,
 * call validate_coredump_safety().  Returns the proc_dointvec_minmax()
 * result.
 */
static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int error;

        error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (error)
                return error;

        validate_coredump_safety();
        return 0;
}

/*
 * sysctl entries owned by this file; registered under "fs" by
 * init_fs_exec_sysctls().
 */
static struct ctl_table fs_exec_sysctls[] = {
        {
                /* Integer knob backed by suid_dumpable, limited to 0..2. */
                .procname        = "suid_dumpable",
                .data                = &suid_dumpable,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax_coredump,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
};

/*
 * Register fs_exec_sysctls under the "fs" sysctl directory.  Always
 * returns 0; the result of register_sysctl_init() is not inspected.
 */
static int __init init_fs_exec_sysctls(void)
{
        register_sysctl_init("fs", fs_exec_sysctls);
        return 0;
}

fs_initcall(init_fs_exec_sysctls);
#endif /* CONFIG_SYSCTL */
























































































































































































































































































    1 
























































































































































































































































































    1 























    1 







    1 













































































































































    1 




































































    1 






































































































    1 







    1 

































































































    1 








    1 






































































    1 


































    4 














































































































    1 













    2 



























    2 





























    2 















































































    2 





































































































    2 






















    2 



































































    1 









































































































































































    1 












































































































































































































































    1 













































































    2 
































    1 











































































































































































































    5 





    1 




































    1 
    5 






























    4 







    1 





























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
/* SPDX-License-Identifier: GPL-2.0 */
/* Events declared in this header are grouped under the "btrfs" trace system. */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM btrfs

/*
 * Besides the usual once-only include guard, the body is processed again
 * whenever TRACE_HEADER_MULTI_READ is defined.
 */
#if !defined(_TRACE_BTRFS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BTRFS_H

#include <linux/writeback.h>
#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

/*
 * Forward declarations of the structure types referenced in this header.
 * Only the type names are introduced here; no layout is defined.
 */
struct btrfs_root;
struct btrfs_fs_info;
struct btrfs_inode;
struct extent_map;
struct btrfs_file_extent_item;
struct btrfs_ordered_extent;
struct btrfs_delayed_ref_node;
struct btrfs_delayed_ref_head;
struct btrfs_block_group;
struct btrfs_free_cluster;
struct btrfs_chunk_map;
struct extent_buffer;
struct btrfs_work;
struct btrfs_workqueue;
struct btrfs_qgroup_extent_record;
struct btrfs_qgroup;
struct extent_io_tree;
struct prelim_ref;
struct btrfs_space_info;
struct btrfs_raid_bio;
struct raid56_bio_trace_info;
struct find_free_extent_ctl;

/*
 * Translate a reference key type into a short name for trace output.
 * Expands to __print_symbolic() over the four BTRFS_*_REF_KEY values below.
 */
#define show_ref_type(type)                                                \
        __print_symbolic(type,                                                \
                { BTRFS_TREE_BLOCK_REF_KEY,         "TREE_BLOCK_REF" },        \
                { BTRFS_EXTENT_DATA_REF_KEY,         "EXTENT_DATA_REF" },        \
                { BTRFS_SHARED_BLOCK_REF_KEY,         "SHARED_BLOCK_REF" },        \
                { BTRFS_SHARED_DATA_REF_KEY,         "SHARED_DATA_REF" })

/*
 * Symbol table mapping a root objectid to the name of the tree it denotes.
 * Expands to __print_symbolic_u64(); it is used through show_root_type(),
 * which decides whether the lookup is attempted at all.
 */
#define __show_root_type(obj)                                                \
        __print_symbolic_u64(obj,                                        \
                { BTRFS_ROOT_TREE_OBJECTID,         "ROOT_TREE"        },        \
                { BTRFS_EXTENT_TREE_OBJECTID,         "EXTENT_TREE"        },        \
                { BTRFS_CHUNK_TREE_OBJECTID,         "CHUNK_TREE"        },        \
                { BTRFS_DEV_TREE_OBJECTID,         "DEV_TREE"        },        \
                { BTRFS_FS_TREE_OBJECTID,         "FS_TREE"        },        \
                { BTRFS_ROOT_TREE_DIR_OBJECTID, "ROOT_TREE_DIR"        },        \
                { BTRFS_CSUM_TREE_OBJECTID,         "CSUM_TREE"        },        \
                { BTRFS_TREE_LOG_OBJECTID,        "TREE_LOG"        },        \
                { BTRFS_QUOTA_TREE_OBJECTID,        "QUOTA_TREE"        },        \
                { BTRFS_TREE_RELOC_OBJECTID,        "TREE_RELOC"        },        \
                { BTRFS_UUID_TREE_OBJECTID,        "UUID_TREE"        },        \
                { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },        \
                { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },\
                { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" })

/*
 * Expands to TWO printk arguments matching a "%llu(%s)" format pair: the raw
 * root objectid, followed by either its symbolic name or "-".
 *
 * The name lookup is attempted for two ranges of objectids:
 *   - BTRFS_ROOT_TREE_OBJECTID .. BTRFS_BLOCK_GROUP_TREE_OBJECTID
 *   - anything >= BTRFS_DATA_RELOC_TREE_OBJECTID
 * Everything else prints "-".
 *
 * The first range ends at BLOCK_GROUP_TREE (not QUOTA_TREE) so that the
 * UUID_TREE, FREE_SPACE_TREE and BLOCK_GROUP_TREE entries listed in
 * __show_root_type() are actually reachable; their objectids follow
 * QUOTA_TREE in the on-disk format.
 *
 * Note: obj is evaluated several times, so pass a side-effect free expression.
 */
#define show_root_type(obj)                                                \
        obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) ||                \
              (obj >= BTRFS_ROOT_TREE_OBJECTID &&                        \
               obj <= BTRFS_BLOCK_GROUP_TREE_OBJECTID)) ? __show_root_type(obj) : "-"

/*
 * Table pairing each BTRFS_RESERVE_* flush mode with the string used for it.
 * The entries are written with EM()/EMe(), which this header defines further
 * down (EMe marks the final entry).
 */
#define FLUSH_ACTIONS                                                                \
        EM( BTRFS_RESERVE_NO_FLUSH,                "BTRFS_RESERVE_NO_FLUSH")        \
        EM( BTRFS_RESERVE_FLUSH_LIMIT,                "BTRFS_RESERVE_FLUSH_LIMIT")        \
        EM( BTRFS_RESERVE_FLUSH_ALL,                "BTRFS_RESERVE_FLUSH_ALL")        \
        EMe(BTRFS_RESERVE_FLUSH_ALL_STEAL,        "BTRFS_RESERVE_FLUSH_ALL_STEAL")

/*
 * Table pairing each BTRFS_FILE_EXTENT_* type with its display string.
 * Used below as the symbol list for __print_symbolic() on extent_type.
 */
#define FI_TYPES                                                        \
        EM( BTRFS_FILE_EXTENT_INLINE,                "INLINE")                \
        EM( BTRFS_FILE_EXTENT_REG,                "REG")                        \
        EMe(BTRFS_FILE_EXTENT_PREALLOC,                "PREALLOC")

/* Table pairing each BTRFS_QGROUP_RSV_* value with its display string. */
#define QGROUP_RSV_TYPES                                                \
        EM( BTRFS_QGROUP_RSV_DATA,                "DATA")                        \
        EM( BTRFS_QGROUP_RSV_META_PERTRANS,        "META_PERTRANS")        \
        EMe(BTRFS_QGROUP_RSV_META_PREALLOC,        "META_PREALLOC")

/* Table pairing each IO_TREE_* owner value with its display string. */
#define IO_TREE_OWNER                                                    \
        EM( IO_TREE_FS_PINNED_EXTENTS,           "PINNED_EXTENTS")            \
        EM( IO_TREE_FS_EXCLUDED_EXTENTS,  "EXCLUDED_EXTENTS")            \
        EM( IO_TREE_BTREE_INODE_IO,          "BTREE_INODE_IO")            \
        EM( IO_TREE_INODE_IO,                  "INODE_IO")                    \
        EM( IO_TREE_RELOC_BLOCKS,          "RELOC_BLOCKS")            \
        EM( IO_TREE_TRANS_DIRTY_PAGES,          "TRANS_DIRTY_PAGES")      \
        EM( IO_TREE_ROOT_DIRTY_LOG_PAGES, "ROOT_DIRTY_LOG_PAGES")   \
        EM( IO_TREE_INODE_FILE_EXTENT,          "INODE_FILE_EXTENT")      \
        EM( IO_TREE_LOG_CSUM_RANGE,          "LOG_CSUM_RANGE")         \
        EMe(IO_TREE_SELFTEST,                  "SELFTEST")

/* Table pairing each flush state value with its display string. */
#define FLUSH_STATES                                                        \
        EM( FLUSH_DELAYED_ITEMS_NR,        "FLUSH_DELAYED_ITEMS_NR")        \
        EM( FLUSH_DELAYED_ITEMS,        "FLUSH_DELAYED_ITEMS")                \
        EM( FLUSH_DELALLOC,                "FLUSH_DELALLOC")                \
        EM( FLUSH_DELALLOC_WAIT,        "FLUSH_DELALLOC_WAIT")                \
        EM( FLUSH_DELALLOC_FULL,        "FLUSH_DELALLOC_FULL")                \
        EM( FLUSH_DELAYED_REFS_NR,        "FLUSH_DELAYED_REFS_NR")        \
        EM( FLUSH_DELAYED_REFS,                "FLUSH_DELAYED_REFS")                \
        EM( ALLOC_CHUNK,                "ALLOC_CHUNK")                        \
        EM( ALLOC_CHUNK_FORCE,                "ALLOC_CHUNK_FORCE")                \
        EM( RUN_DELAYED_IPUTS,                "RUN_DELAYED_IPUTS")                \
        EMe(COMMIT_TRANS,                "COMMIT_TRANS")

/*
 * First define the enums in the above macros to be exported to userspace via
 * TRACE_DEFINE_ENUM().
 */

/* Pass 1: both helpers ignore the string and emit TRACE_DEFINE_ENUM(value). */
#undef EM
#undef EMe
#define EM(a, b)        TRACE_DEFINE_ENUM(a);
#define EMe(a, b)        TRACE_DEFINE_ENUM(a);

/* Expand every table once so that each listed enumerator gets exported. */
FLUSH_ACTIONS
FI_TYPES
QGROUP_RSV_TYPES
IO_TREE_OWNER
FLUSH_STATES

/*
 * Now redefine the EM and EMe macros to map the enums to the strings that will
 * be printed in the output
 */

/*
 * Pass 2: each entry becomes a {value, string} initializer. EM() appends a
 * comma; EMe() does not, so it must be used for the last entry of a table.
 */
#undef EM
#undef EMe
#define EM(a, b)        {a, b},
#define EMe(a, b)       {a, b}


/*
 * Comma-separated { flag, name } initializers for the BTRFS_BLOCK_GROUP_*
 * bits. No user appears in this part of the header; presumably this is the
 * table handed to __print_flags() by later events -- confirm at the use site.
 */
#define BTRFS_GROUP_FLAGS        \
        { BTRFS_BLOCK_GROUP_DATA,        "DATA"},        \
        { BTRFS_BLOCK_GROUP_SYSTEM,        "SYSTEM"},        \
        { BTRFS_BLOCK_GROUP_METADATA,        "METADATA"},        \
        { BTRFS_BLOCK_GROUP_RAID0,        "RAID0"},         \
        { BTRFS_BLOCK_GROUP_RAID1,        "RAID1"},         \
        { BTRFS_BLOCK_GROUP_DUP,        "DUP"},         \
        { BTRFS_BLOCK_GROUP_RAID10,        "RAID10"},         \
        { BTRFS_BLOCK_GROUP_RAID5,        "RAID5"},        \
        { BTRFS_BLOCK_GROUP_RAID6,        "RAID6"}

/*
 * Comma-separated { flag, name } initializers for the EXTENT_* state bits.
 * No user appears in this part of the header; presumably consumed by a
 * flags-printing helper in later events -- confirm at the use site.
 */
#define EXTENT_FLAGS                                                \
        { EXTENT_DIRTY,                        "DIRTY"},                \
        { EXTENT_UPTODATE,                "UPTODATE"},                \
        { EXTENT_LOCKED,                "LOCKED"},                \
        { EXTENT_NEW,                        "NEW"},                        \
        { EXTENT_DELALLOC,                "DELALLOC"},                \
        { EXTENT_DEFRAG,                "DEFRAG"},                \
        { EXTENT_BOUNDARY,                "BOUNDARY"},                \
        { EXTENT_NODATASUM,                "NODATASUM"},                \
        { EXTENT_CLEAR_META_RESV,        "CLEAR_META_RESV"},        \
        { EXTENT_NEED_WAIT,                "NEED_WAIT"},                \
        { EXTENT_NORESERVE,                "NORESERVE"},                \
        { EXTENT_QGROUP_RESERVED,        "QGROUP_RESERVED"},        \
        { EXTENT_CLEAR_DATA_RESV,        "CLEAR_DATA_RESV"},        \
        { EXTENT_DELALLOC_NEW,                "DELALLOC_NEW"}

/* Number of fsid bytes stored in each trace record. */
#define BTRFS_FSID_SIZE 16
/* Record field holding the fsid: a u8 array of BTRFS_FSID_SIZE bytes. */
#define TP_STRUCT__entry_fsid __array(u8, fsid, BTRFS_FSID_SIZE)

/*
 * Fill __entry->fsid for the current record.
 *
 * With a non-NULL fs_info, BTRFS_FSID_SIZE bytes are copied from
 * fs_info->fs_devices->fsid; with a NULL fs_info the field is zeroed instead.
 * fs_info is expanded more than once, so it should be free of side effects.
 */
#define TP_fast_assign_fsid(fs_info)                                        \
({                                                                        \
        if (fs_info)                                                        \
                memcpy(__entry->fsid, fs_info->fs_devices->fsid,        \
                       BTRFS_FSID_SIZE);                                \
        else                                                                \
                memset(__entry->fsid, 0, BTRFS_FSID_SIZE);                \
})

/*
 * TP_STRUCT__entry() wrapper: places the fsid array first in the record and
 * then appends the caller-supplied field declarations.
 */
#define TP_STRUCT__entry_btrfs(args...)                                        \
        TP_STRUCT__entry(                                                \
                TP_STRUCT__entry_fsid                                        \
                args)
/*
 * TP_fast_assign() wrapper: stores the fsid derived from fs_info first, then
 * runs the caller-supplied assignment statements.
 */
#define TP_fast_assign_btrfs(fs_info, args...)                                \
        TP_fast_assign(                                                        \
                TP_fast_assign_fsid(fs_info);                                \
                args)
/*
 * TP_printk() wrapper: prefixes the caller's format with "%pU: " and supplies
 * __entry->fsid as the matching argument. args follows a plain comma, so
 * callers are expected to pass at least one argument after fmt.
 */
#define TP_printk_btrfs(fmt, args...) \
        TP_printk("%pU: " fmt, __entry->fsid, args)

/*
 * btrfs_transaction_commit: records fs_info->generation.
 *
 * root_objectid is not taken from the caller; it is always stored as
 * BTRFS_ROOT_TREE_OBJECTID.
 */
TRACE_EVENT(btrfs_transaction_commit,

        TP_PROTO(const struct btrfs_fs_info *fs_info),

        TP_ARGS(fs_info),

        TP_STRUCT__entry_btrfs(
                __field(        u64,  generation                )
                __field(        u64,  root_objectid                )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->generation        = fs_info->generation;
                __entry->root_objectid        = BTRFS_ROOT_TREE_OBJECTID;
        ),

        TP_printk_btrfs("root=%llu(%s) gen=%llu",
                  show_root_type(__entry->root_objectid),
                  __entry->generation)
);

/*
 * Event class for per-inode events.
 *
 * Takes a VFS inode; the fsid comes from btrfs_sb(inode->i_sb) and the
 * remaining fields are read from the inode and its BTRFS_I() counterpart:
 * inode number, i_blocks, disk_i_size, generation, last_trans, logged_trans
 * and the objectid of the owning root.
 */
DECLARE_EVENT_CLASS(btrfs__inode,

        TP_PROTO(const struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry_btrfs(
                __field(        u64,  ino                        )
                __field(        u64,  blocks                        )
                __field(        u64,  disk_i_size                )
                __field(        u64,  generation                )
                __field(        u64,  last_trans                )
                __field(        u64,  logged_trans                )
                __field(        u64,  root_objectid                )
        ),

        TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
                __entry->ino        = btrfs_ino(BTRFS_I(inode));
                __entry->blocks        = inode->i_blocks;
                __entry->disk_i_size  = BTRFS_I(inode)->disk_i_size;
                __entry->generation = BTRFS_I(inode)->generation;
                __entry->last_trans = BTRFS_I(inode)->last_trans;
                __entry->logged_trans = BTRFS_I(inode)->logged_trans;
                __entry->root_objectid =
                                BTRFS_I(inode)->root->root_key.objectid;
        ),

        TP_printk_btrfs("root=%llu(%s) gen=%llu ino=%llu blocks=%llu "
                  "disk_i_size=%llu last_trans=%llu logged_trans=%llu",
                  show_root_type(__entry->root_objectid),
                  __entry->generation,
                  __entry->ino,
                  __entry->blocks,
                  __entry->disk_i_size,
                  __entry->last_trans,
                  __entry->logged_trans)
);

/* btrfs_inode_new: instance of the btrfs__inode class. */
DEFINE_EVENT(btrfs__inode, btrfs_inode_new,

        TP_PROTO(const struct inode *inode),

        TP_ARGS(inode)
);

/* btrfs_inode_request: instance of the btrfs__inode class. */
DEFINE_EVENT(btrfs__inode, btrfs_inode_request,

        TP_PROTO(const struct inode *inode),

        TP_ARGS(inode)
);

/* btrfs_inode_evict: instance of the btrfs__inode class. */
DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,

        TP_PROTO(const struct inode *inode),

        TP_ARGS(inode)
);

/*
 * Symbol table for the special EXTENT_MAP_* block_start values.
 * Expands to __print_symbolic_u64(); used through show_map_type().
 */
#define __show_map_type(type)                                                \
        __print_symbolic_u64(type,                                        \
                { EXTENT_MAP_LAST_BYTE, "LAST_BYTE"         },                \
                { EXTENT_MAP_HOLE,         "HOLE"                 },                \
                { EXTENT_MAP_INLINE,        "INLINE"        })

/*
 * Expands to TWO printk arguments matching a "%llu(%s)" format pair: the raw
 * block_start value, followed by either its symbolic name or "-".
 *
 * Only values at or above EXTENT_MAP_LAST_BYTE can match an entry of
 * __show_map_type() (the HOLE and INLINE markers sit above LAST_BYTE), so the
 * lookup is done for that range and ordinary block numbers print "-". This is
 * the same shape as show_root_type(). The previous form had the two branches
 * swapped, which made the symbolic names unreachable.
 *
 * Note: type is evaluated several times, so pass a side-effect free
 * expression.
 */
#define show_map_type(type)                        \
        type, (type >= EXTENT_MAP_LAST_BYTE) ? __show_map_type(type) : "-"

/*
 * Render extent map flags as a "|"-separated list of names.
 * Expands to __print_flags() over the EXTENT_FLAG_* values below.
 */
#define show_map_flags(flag)                                                \
        __print_flags(flag, "|",                                        \
                { EXTENT_FLAG_PINNED,                "PINNED"        },\
                { EXTENT_FLAG_COMPRESS_ZLIB,        "COMPRESS_ZLIB"        },\
                { EXTENT_FLAG_COMPRESS_LZO,        "COMPRESS_LZO"        },\
                { EXTENT_FLAG_COMPRESS_ZSTD,        "COMPRESS_ZSTD"        },\
                { EXTENT_FLAG_PREALLOC,                "PREALLOC"        },\
                { EXTENT_FLAG_LOGGING,                "LOGGING"        })

/*
 * btrfs_get_extent: snapshot of an extent map.
 *
 * Conditional event: TP_CONDITION(map) restricts it to calls where map is
 * non-NULL, which is what makes the map-> dereferences below safe.
 * Records the root objectid, inode number, the map's start/len/orig_start,
 * block_start/block_len, flags and current reference count.
 */
TRACE_EVENT_CONDITION(btrfs_get_extent,

        TP_PROTO(const struct btrfs_root *root, const struct btrfs_inode *inode,
                 const struct extent_map *map),

        TP_ARGS(root, inode, map),

        TP_CONDITION(map),

        TP_STRUCT__entry_btrfs(
                __field(        u64,  root_objectid        )
                __field(        u64,  ino                )
                __field(        u64,  start                )
                __field(        u64,  len                )
                __field(        u64,  orig_start        )
                __field(        u64,  block_start        )
                __field(        u64,  block_len                )
                __field(        u32,  flags                )
                __field(        int,  refs                )
        ),

        TP_fast_assign_btrfs(root->fs_info,
                __entry->root_objectid        = root->root_key.objectid;
                __entry->ino                = btrfs_ino(inode);
                __entry->start                = map->start;
                __entry->len                = map->len;
                __entry->orig_start        = map->orig_start;
                __entry->block_start        = map->block_start;
                __entry->block_len        = map->block_len;
                __entry->flags                = map->flags;
                __entry->refs                = refcount_read(&map->refs);
        ),

        TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu len=%llu "
                  "orig_start=%llu block_start=%llu(%s) "
                  "block_len=%llu flags=%s refs=%u",
                  show_root_type(__entry->root_objectid),
                  __entry->ino,
                  __entry->start,
                  __entry->len,
                  __entry->orig_start,
                  show_map_type(__entry->block_start),
                  __entry->block_len,
                  show_map_flags(__entry->flags),
                  __entry->refs)
);

/*
 * btrfs_handle_em_exist: records three ranges side by side -- the start/len
 * passed by the caller, the start/len of the existing extent map, and the
 * start/len of the extent map given as map. Both extent map pointers are
 * dereferenced unconditionally.
 */
TRACE_EVENT(btrfs_handle_em_exist,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                const struct extent_map *existing, const struct extent_map *map,
                u64 start, u64 len),

        TP_ARGS(fs_info, existing, map, start, len),

        TP_STRUCT__entry_btrfs(
                __field(        u64,  e_start                )
                __field(        u64,  e_len                )
                __field(        u64,  map_start                )
                __field(        u64,  map_len                )
                __field(        u64,  start                )
                __field(        u64,  len                )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->e_start        = existing->start;
                __entry->e_len                = existing->len;
                __entry->map_start        = map->start;
                __entry->map_len        = map->len;
                __entry->start                = start;
                __entry->len                = len;
        ),

        TP_printk_btrfs("start=%llu len=%llu "
                  "existing(start=%llu len=%llu) "
                  "em(start=%llu len=%llu)",
                  __entry->start,
                  __entry->len,
                  __entry->e_start,
                  __entry->e_len,
                  __entry->map_start,
                  __entry->map_len)
);

/* file extent item */
/*
 * Event class for non-inline file extent items.
 *
 * bi is the owning inode, l the extent buffer holding the item, fi the item
 * itself and start the file offset the extent begins at. The item fields are
 * read through the btrfs_file_extent_*() accessors on (l, fi), and
 * extent_end is computed as start + num_bytes.
 *
 * NOTE(review): the format string opens a "(" before num_bytes that is never
 * closed. Left untouched here because the text is user-visible trace output.
 */
DECLARE_EVENT_CLASS(btrfs__file_extent_item_regular,

        TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l,
                 const struct btrfs_file_extent_item *fi, u64 start),

        TP_ARGS(bi, l, fi, start),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        root_obj        )
                __field(        u64,        ino                )
                __field(        loff_t,        isize                )
                __field(        u64,        disk_isize        )
                __field(        u64,        num_bytes        )
                __field(        u64,        ram_bytes        )
                __field(        u64,        disk_bytenr        )
                __field(        u64,        disk_num_bytes        )
                __field(        u64,        extent_offset        )
                __field(        u8,        extent_type        )
                __field(        u8,        compression        )
                __field(        u64,        extent_start        )
                __field(        u64,        extent_end        )
        ),

        TP_fast_assign_btrfs(bi->root->fs_info,
                __entry->root_obj        = bi->root->root_key.objectid;
                __entry->ino                = btrfs_ino(bi);
                __entry->isize                = bi->vfs_inode.i_size;
                __entry->disk_isize        = bi->disk_i_size;
                __entry->num_bytes        = btrfs_file_extent_num_bytes(l, fi);
                __entry->ram_bytes        = btrfs_file_extent_ram_bytes(l, fi);
                __entry->disk_bytenr        = btrfs_file_extent_disk_bytenr(l, fi);
                __entry->disk_num_bytes        = btrfs_file_extent_disk_num_bytes(l, fi);
                __entry->extent_offset        = btrfs_file_extent_offset(l, fi);
                __entry->extent_type        = btrfs_file_extent_type(l, fi);
                __entry->compression        = btrfs_file_extent_compression(l, fi);
                __entry->extent_start        = start;
                __entry->extent_end        = (start + __entry->num_bytes);
        ),

        TP_printk_btrfs(
                "root=%llu(%s) inode=%llu size=%llu disk_isize=%llu "
                "file extent range=[%llu %llu] "
                "(num_bytes=%llu ram_bytes=%llu disk_bytenr=%llu "
                "disk_num_bytes=%llu extent_offset=%llu type=%s "
                "compression=%u",
                show_root_type(__entry->root_obj), __entry->ino,
                __entry->isize,
                __entry->disk_isize, __entry->extent_start,
                __entry->extent_end, __entry->num_bytes, __entry->ram_bytes,
                __entry->disk_bytenr, __entry->disk_num_bytes,
                __entry->extent_offset, __print_symbolic(__entry->extent_type, FI_TYPES),
                __entry->compression)
);

/*
 * Event class for inline file extent items.
 *
 * Same arguments as the regular class plus slot. Only the inode sizes, the
 * extent type/compression and the file range are recorded; extent_end is
 * computed as start + btrfs_file_extent_ram_bytes(l, fi).
 *
 * slot is part of the prototype but is not stored in the record.
 */
DECLARE_EVENT_CLASS(
        btrfs__file_extent_item_inline,

        TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l,
                 const struct btrfs_file_extent_item *fi, int slot, u64 start),

        TP_ARGS(bi, l, fi, slot,  start),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        root_obj        )
                __field(        u64,        ino                )
                __field(        loff_t,        isize                )
                __field(        u64,        disk_isize        )
                __field(        u8,        extent_type        )
                __field(        u8,        compression        )
                __field(        u64,        extent_start        )
                __field(        u64,        extent_end        )
        ),

        TP_fast_assign_btrfs(
                bi->root->fs_info,
                __entry->root_obj        = bi->root->root_key.objectid;
                __entry->ino                = btrfs_ino(bi);
                __entry->isize                = bi->vfs_inode.i_size;
                __entry->disk_isize        = bi->disk_i_size;
                __entry->extent_type        = btrfs_file_extent_type(l, fi);
                __entry->compression        = btrfs_file_extent_compression(l, fi);
                __entry->extent_start        = start;
                __entry->extent_end        = (start + btrfs_file_extent_ram_bytes(l, fi));
        ),

        TP_printk_btrfs(
                "root=%llu(%s) inode=%llu size=%llu disk_isize=%llu "
                "file extent range=[%llu %llu] "
                "extent_type=%s compression=%u",
                show_root_type(__entry->root_obj), __entry->ino, __entry->isize,
                __entry->disk_isize, __entry->extent_start,
                __entry->extent_end, __print_symbolic(__entry->extent_type, FI_TYPES),
                __entry->compression)
);

/*
 * btrfs_get_extent_show_fi_regular: instance of the
 * btrfs__file_extent_item_regular class.
 */
DEFINE_EVENT(
        btrfs__file_extent_item_regular, btrfs_get_extent_show_fi_regular,

        TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l,
                 const struct btrfs_file_extent_item *fi, u64 start),

        TP_ARGS(bi, l, fi, start)
);

/*
 * btrfs_truncate_show_fi_regular: instance of the
 * btrfs__file_extent_item_regular class.
 */
DEFINE_EVENT(
        btrfs__file_extent_item_regular, btrfs_truncate_show_fi_regular,

        TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l,
                 const struct btrfs_file_extent_item *fi, u64 start),

        TP_ARGS(bi, l, fi, start)
);

/*
 * btrfs_get_extent_show_fi_inline: instance of the
 * btrfs__file_extent_item_inline class.
 */
DEFINE_EVENT(
        btrfs__file_extent_item_inline, btrfs_get_extent_show_fi_inline,

        TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l,
                 const struct btrfs_file_extent_item *fi, int slot, u64 start),

        TP_ARGS(bi, l, fi, slot, start)
);

/*
 * btrfs_truncate_show_fi_inline: instance of the
 * btrfs__file_extent_item_inline class.
 */
DEFINE_EVENT(
        btrfs__file_extent_item_inline, btrfs_truncate_show_fi_inline,

        TP_PROTO(const struct btrfs_inode *bi, const struct extent_buffer *l,
                 const struct btrfs_file_extent_item *fi, int slot, u64 start),

        TP_ARGS(bi, l, fi, slot, start)
);

/*
 * Render ordered extent flags as a "|"-separated list of names.
 * Each mask is built as (1 << BTRFS_ORDERED_*), i.e. the BTRFS_ORDERED_*
 * constants are used as bit numbers rather than as masks.
 */
#define show_ordered_flags(flags)                                           \
        __print_flags(flags, "|",                                           \
                { (1 << BTRFS_ORDERED_REGULAR),         "REGULAR"         }, \
                { (1 << BTRFS_ORDERED_NOCOW),                 "NOCOW"         }, \
                { (1 << BTRFS_ORDERED_PREALLOC),         "PREALLOC"         }, \
                { (1 << BTRFS_ORDERED_COMPRESSED),         "COMPRESSED"         }, \
                { (1 << BTRFS_ORDERED_DIRECT),                 "DIRECT"         }, \
                { (1 << BTRFS_ORDERED_IO_DONE),         "IO_DONE"         }, \
                { (1 << BTRFS_ORDERED_COMPLETE),         "COMPLETE"         }, \
                { (1 << BTRFS_ORDERED_IOERR),                 "IOERR"         }, \
                { (1 << BTRFS_ORDERED_TRUNCATED),         "TRUNCATED"        })


/*
 * Event class for ordered extent events.
 *
 * Records the inode number and owning root objectid together with a copy of
 * the ordered extent's state. Note the field naming: start holds
 * ordered->disk_bytenr, len holds ordered->num_bytes and disk_len holds
 * ordered->disk_num_bytes. refs is sampled with refcount_read().
 */
DECLARE_EVENT_CLASS(btrfs__ordered_extent,

        TP_PROTO(const struct btrfs_inode *inode,
                 const struct btrfs_ordered_extent *ordered),

        TP_ARGS(inode, ordered),

        TP_STRUCT__entry_btrfs(
                __field(        u64,  ino                )
                __field(        u64,  file_offset        )
                __field(        u64,  start                )
                __field(        u64,  len                )
                __field(        u64,  disk_len                )
                __field(        u64,  bytes_left        )
                __field(        unsigned long,  flags        )
                __field(        int,  compress_type        )
                __field(        int,  refs                )
                __field(        u64,  root_objectid        )
                __field(        u64,  truncated_len        )
        ),

        TP_fast_assign_btrfs(inode->root->fs_info,
                __entry->ino                 = btrfs_ino(inode);
                __entry->file_offset        = ordered->file_offset;
                __entry->start                = ordered->disk_bytenr;
                __entry->len                = ordered->num_bytes;
                __entry->disk_len        = ordered->disk_num_bytes;
                __entry->bytes_left        = ordered->bytes_left;
                __entry->flags                = ordered->flags;
                __entry->compress_type        = ordered->compress_type;
                __entry->refs                = refcount_read(&ordered->refs);
                __entry->root_objectid        = inode->root->root_key.objectid;
                __entry->truncated_len        = ordered->truncated_len;
        ),

        TP_printk_btrfs("root=%llu(%s) ino=%llu file_offset=%llu "
                  "start=%llu len=%llu disk_len=%llu "
                  "truncated_len=%llu "
                  "bytes_left=%llu flags=%s compress_type=%d "
                  "refs=%d",
                  show_root_type(__entry->root_objectid),
                  __entry->ino,
                  __entry->file_offset,
                  __entry->start,
                  __entry->len,
                  __entry->disk_len,
                  __entry->truncated_len,
                  __entry->bytes_left,
                  show_ordered_flags(__entry->flags),
                  __entry->compress_type, __entry->refs)
);

/* btrfs_ordered_extent_add: instance of the btrfs__ordered_extent class. */
DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add,

        TP_PROTO(const struct btrfs_inode *inode,
                 const struct btrfs_ordered_extent *ordered),

        TP_ARGS(inode, ordered)
);

/* btrfs_ordered_extent_remove: instance of the btrfs__ordered_extent class. */
DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove,

        TP_PROTO(const struct btrfs_inode *inode,
                 const struct btrfs_ordered_extent *ordered),

        TP_ARGS(inode, ordered)
);

/* btrfs_ordered_extent_start: instance of the btrfs__ordered_extent class. */
DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start,

        TP_PROTO(const struct btrfs_inode *inode,
                 const struct btrfs_ordered_extent *ordered),

        TP_ARGS(inode, ordered)
);

/* btrfs_ordered_extent_put: instance of the btrfs__ordered_extent class. */
DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put,

        TP_PROTO(const struct btrfs_inode *inode,
                 const struct btrfs_ordered_extent *ordered),

        TP_ARGS(inode, ordered)
);

/* btrfs_ordered_extent_lookup: instance of the btrfs__ordered_extent class. */
DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup,

             TP_PROTO(const struct btrfs_inode *inode,
                      const struct btrfs_ordered_extent *ordered),

             TP_ARGS(inode, ordered)
);

/*
 * btrfs_ordered_extent_lookup_range: instance of the btrfs__ordered_extent
 * class.
 */
DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_range,

             TP_PROTO(const struct btrfs_inode *inode,
                      const struct btrfs_ordered_extent *ordered),

             TP_ARGS(inode, ordered)
);

/*
 * btrfs_ordered_extent_lookup_first_range: instance of the
 * btrfs__ordered_extent class.
 */
DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_first_range,

             TP_PROTO(const struct btrfs_inode *inode,
                      const struct btrfs_ordered_extent *ordered),

             TP_ARGS(inode, ordered)
);

/*
 * btrfs_ordered_extent_lookup_for_logging: instance of the
 * btrfs__ordered_extent class.
 */
DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_for_logging,

             TP_PROTO(const struct btrfs_inode *inode,
                      const struct btrfs_ordered_extent *ordered),

             TP_ARGS(inode, ordered)
);

/*
 * btrfs_ordered_extent_lookup_first: instance of the btrfs__ordered_extent
 * class.
 */
DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_first,

             TP_PROTO(const struct btrfs_inode *inode,
                      const struct btrfs_ordered_extent *ordered),

             TP_ARGS(inode, ordered)
);

/* btrfs_ordered_extent_split: instance of the btrfs__ordered_extent class. */
DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_split,

             TP_PROTO(const struct btrfs_inode *inode,
                      const struct btrfs_ordered_extent *ordered),

             TP_ARGS(inode, ordered)
);

/*
 * btrfs_ordered_extent_dec_test_pending: instance of the
 * btrfs__ordered_extent class.
 */
DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_dec_test_pending,

             TP_PROTO(const struct btrfs_inode *inode,
                      const struct btrfs_ordered_extent *ordered),

             TP_ARGS(inode, ordered)
);

/*
 * btrfs_ordered_extent_mark_finished: instance of the btrfs__ordered_extent
 * class.
 */
DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_mark_finished,

             TP_PROTO(const struct btrfs_inode *inode,
                      const struct btrfs_ordered_extent *ordered),

             TP_ARGS(inode, ordered)
);

/*
 * btrfs_finish_ordered_extent: records the inode number, owning root
 * objectid and the start/len/uptodate arguments as passed by the caller.
 * uptodate is stored as bool and printed normalised to 0/1.
 */
TRACE_EVENT(btrfs_finish_ordered_extent,

        TP_PROTO(const struct btrfs_inode *inode, u64 start, u64 len,
                 bool uptodate),

        TP_ARGS(inode, start, len, uptodate),

        TP_STRUCT__entry_btrfs(
                __field(        u64,         ino                )
                __field(        u64,         start                )
                __field(        u64,         len                )
                __field(        bool,         uptodate        )
                __field(        u64,         root_objectid        )
        ),

        TP_fast_assign_btrfs(inode->root->fs_info,
                __entry->ino        = btrfs_ino(inode);
                __entry->start        = start;
                __entry->len        = len;
                __entry->uptodate = uptodate;
                __entry->root_objectid = inode->root->root_key.objectid;
        ),

        TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu len=%llu uptodate=%d",
                  show_root_type(__entry->root_objectid),
                  __entry->ino, __entry->start,
                  __entry->len, !!__entry->uptodate)
);

/*
 * Event class for page writeback events.
 *
 * Records the inode number and owning root objectid, the page's index, the
 * mapping's writeback_index, and a copy of these writeback_control members:
 * nr_to_write, pages_skipped, range_start, range_end, for_kupdate,
 * for_reclaim and range_cyclic. The fsid comes from btrfs_sb(inode->i_sb).
 */
DECLARE_EVENT_CLASS(btrfs__writepage,

        TP_PROTO(const struct page *page, const struct inode *inode,
                 const struct writeback_control *wbc),

        TP_ARGS(page, inode, wbc),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        ino                        )
                __field(        pgoff_t,  index                        )
                __field(        long,   nr_to_write                )
                __field(        long,   pages_skipped                )
                __field(        loff_t, range_start                )
                __field(        loff_t, range_end                )
                __field(        char,   for_kupdate                )
                __field(        char,   for_reclaim                )
                __field(        char,   range_cyclic                )
                __field(        unsigned long,  writeback_index        )
                __field(        u64,    root_objectid                )
        ),

        TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
                __entry->ino                = btrfs_ino(BTRFS_I(inode));
                __entry->index                = page->index;
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->range_start        = wbc->range_start;
                __entry->range_end        = wbc->range_end;
                __entry->for_kupdate        = wbc->for_kupdate;
                __entry->for_reclaim        = wbc->for_reclaim;
                __entry->range_cyclic        = wbc->range_cyclic;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->root_objectid        =
                                 BTRFS_I(inode)->root->root_key.objectid;
        ),

        TP_printk_btrfs("root=%llu(%s) ino=%llu page_index=%lu "
                  "nr_to_write=%ld pages_skipped=%ld range_start=%llu "
                  "range_end=%llu for_kupdate=%d "
                  "for_reclaim=%d range_cyclic=%d writeback_index=%lu",
                  show_root_type(__entry->root_objectid),
                  __entry->ino, __entry->index,
                  __entry->nr_to_write, __entry->pages_skipped,
                  __entry->range_start, __entry->range_end,
                  __entry->for_kupdate,
                  __entry->for_reclaim, __entry->range_cyclic,
                  __entry->writeback_index)
);

/* __extent_writepage: instance of the btrfs__writepage class. */
DEFINE_EVENT(btrfs__writepage, __extent_writepage,

        TP_PROTO(const struct page *page, const struct inode *inode,
                 const struct writeback_control *wbc),

        TP_ARGS(page, inode, wbc)
);

/*
 * btrfs_writepage_end_io_hook: records the inode number, the caller-supplied
 * @start/@end/@uptodate values and the objectid of the root that @inode
 * belongs to.
 *
 * @inode:    btrfs inode; also the source of the fs_info handed to
 *            TP_fast_assign_btrfs() via inode->root->fs_info
 * @start:    stored and printed as "start="
 * @end:      stored and printed as "end="
 * @uptodate: stored and printed as "uptodate="
 */
TRACE_EVENT(btrfs_writepage_end_io_hook,

        TP_PROTO(const struct btrfs_inode *inode, u64 start, u64 end,
                 int uptodate),

        TP_ARGS(inode, start, end, uptodate),

        TP_STRUCT__entry_btrfs(
                __field(        u64,         ino                )
                __field(        u64,         start                )
                __field(        u64,         end                )
                __field(        int,         uptodate        )
                __field(        u64,    root_objectid        )
        ),

        TP_fast_assign_btrfs(inode->root->fs_info,
                __entry->ino        = btrfs_ino(inode);
                __entry->start        = start;
                __entry->end        = end;
                __entry->uptodate = uptodate;
                __entry->root_objectid = inode->root->root_key.objectid;
        ),

        /*
         * show_root_type() (defined elsewhere) has to supply two arguments:
         * the format has six conversions but only five expressions are
         * listed, so it fills both the %llu and the %s of "root=".
         */
        TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu end=%llu uptodate=%d",
                  show_root_type(__entry->root_objectid),
                  __entry->ino, __entry->start,
                  __entry->end, __entry->uptodate)
);

/*
 * btrfs_sync_file: records the inode number behind @file, the inode number of
 * the dentry's parent, the @datasync argument and the objectid of the root
 * owning the inode.
 *
 * @file:     file whose dentry/inode are inspected
 * @datasync: stored and printed as "datasync="
 */
TRACE_EVENT(btrfs_sync_file,

        TP_PROTO(const struct file *file, int datasync),

        TP_ARGS(file, datasync),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        ino                )
                __field(        u64,        parent                )
                __field(        int,    datasync        )
                __field(        u64,    root_objectid        )
        ),

        TP_fast_assign(
                const struct dentry *dentry = file->f_path.dentry;
                const struct inode *inode = d_inode(dentry);
                const struct btrfs_inode *binode = BTRFS_I(inode);

                /* The fsid is taken from the superblock the dentry lives on. */
                TP_fast_assign_fsid(btrfs_sb(dentry->d_sb));
                __entry->ino = btrfs_ino(binode);
                __entry->parent = btrfs_ino(BTRFS_I(d_inode(dentry->d_parent)));
                __entry->datasync = datasync;
                __entry->root_objectid = binode->root->root_key.objectid;
        ),

        TP_printk_btrfs("root=%llu(%s) ino=%llu parent=%llu datasync=%d",
                  show_root_type(__entry->root_objectid),
                  __entry->ino,
                  __entry->parent,
                  __entry->datasync)
);

/*
 * btrfs_sync_fs: records only the caller-supplied @wait value.
 *
 * @fs_info: handed to TP_fast_assign_btrfs() (defined elsewhere; presumably
 *           used to tag the record with the filesystem id - confirm)
 * @wait:    stored and printed as "wait="
 */
TRACE_EVENT(btrfs_sync_fs,

        TP_PROTO(const struct btrfs_fs_info *fs_info, int wait),

        TP_ARGS(fs_info, wait),

        TP_STRUCT__entry_btrfs(
                __field(        int,  wait                )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->wait        = wait;
        ),

        TP_printk_btrfs("wait=%d", __entry->wait)
);

/*
 * btrfs_add_block_group: records a block group's start, length, flags, used
 * bytes and bytes_super, together with the caller-supplied @create value.
 *
 * @fs_info:     handed to TP_fast_assign_btrfs()
 * @block_group: source of offset/size/flags/bytes_used/bytes_super
 * @create:      stored and printed as "create="
 */
TRACE_EVENT(btrfs_add_block_group,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_block_group *block_group, int create),

        TP_ARGS(fs_info, block_group, create),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        offset                        )
                __field(        u64,        size                        )
                __field(        u64,        flags                        )
                __field(        u64,        bytes_used                )
                __field(        u64,        bytes_super                )
                __field(        int,        create                        )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->offset                = block_group->start;
                __entry->size                = block_group->length;
                __entry->flags                = block_group->flags;
                __entry->bytes_used        = block_group->used;
                __entry->bytes_super        = block_group->bytes_super;
                __entry->create                = create;
        ),

        /*
         * flags is printed twice: raw as %llu and decoded through
         * __print_flags().  NOTE(review): the (unsigned long) cast narrows the
         * u64 value wherever unsigned long is 32 bits wide.
         */
        TP_printk_btrfs("block_group offset=%llu size=%llu "
                  "flags=%llu(%s) bytes_used=%llu bytes_super=%llu "
                  "create=%d",
                  __entry->offset,
                  __entry->size,
                  __entry->flags,
                  __print_flags((unsigned long)__entry->flags, "|",
                                BTRFS_GROUP_FLAGS),
                  __entry->bytes_used,
                  __entry->bytes_super, __entry->create)
);

/*
 * show_ref_action - map a delayed-ref action value to its symbolic name.
 * Used as a %s argument by the delayed-ref events below.
 */
#define show_ref_action(action)                                                \
        __print_symbolic(action,                                        \
                { BTRFS_ADD_DELAYED_REF,    "ADD_DELAYED_REF" },        \
                { BTRFS_DROP_DELAYED_REF,   "DROP_DELAYED_REF" },        \
                { BTRFS_ADD_DELAYED_EXTENT, "ADD_DELAYED_EXTENT" },         \
                { BTRFS_UPDATE_DELAYED_HEAD, "UPDATE_DELAYED_HEAD" })
                        

/*
 * btrfs_delayed_tree_ref: event class recording a delayed ref node as a tree
 * reference.  Everything is copied from @ref; the level comes from
 * ref->tree_ref.level.
 *
 * @fs_info: handed to TP_fast_assign_btrfs()
 * @ref:     delayed ref node being described
 */
DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_delayed_ref_node *ref),

        TP_ARGS(fs_info, ref),

        TP_STRUCT__entry_btrfs(
                __field(        u64,  bytenr                )
                __field(        u64,  num_bytes                )
                __field(        int,  action                ) 
                __field(        u64,  parent                )
                __field(        u64,  ref_root                )
                __field(        int,  level                )
                __field(        int,  type                )
                __field(        u64,  seq                )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->bytenr                = ref->bytenr;
                __entry->num_bytes        = ref->num_bytes;
                __entry->action                = ref->action;
                __entry->parent                = ref->parent;
                __entry->ref_root        = ref->ref_root;
                __entry->level                = ref->tree_ref.level;
                __entry->type                = ref->type;
                __entry->seq                = ref->seq;
        ),

        /*
         * parent and ref_root both go through show_root_type(), which fills
         * the "%llu(%s)" pair for each; action and type are printed by name
         * through show_ref_action() and show_ref_type().
         */
        TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s "
                  "parent=%llu(%s) ref_root=%llu(%s) level=%d "
                  "type=%s seq=%llu",
                  __entry->bytenr,
                  __entry->num_bytes,
                  show_ref_action(__entry->action),
                  show_root_type(__entry->parent),
                  show_root_type(__entry->ref_root),
                  __entry->level, show_ref_type(__entry->type),
                  __entry->seq)
);

/*
 * add_delayed_tree_ref: event instance of the btrfs_delayed_tree_ref class,
 * taking the same (fs_info, ref) arguments.
 */
DEFINE_EVENT(btrfs_delayed_tree_ref,  add_delayed_tree_ref,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_delayed_ref_node *ref),

        TP_ARGS(fs_info, ref)
);

/*
 * run_delayed_tree_ref: event instance of the btrfs_delayed_tree_ref class,
 * taking the same (fs_info, ref) arguments.
 */
DEFINE_EVENT(btrfs_delayed_tree_ref,  run_delayed_tree_ref,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_delayed_ref_node *ref),

        TP_ARGS(fs_info, ref)
);

/*
 * btrfs_delayed_data_ref: event class recording a delayed ref node as a data
 * reference.  Everything is copied from @ref; owner and offset come from
 * ref->data_ref.objectid and ref->data_ref.offset.
 *
 * @fs_info: handed to TP_fast_assign_btrfs()
 * @ref:     delayed ref node being described
 */
DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_delayed_ref_node *ref),

        TP_ARGS(fs_info, ref),

        TP_STRUCT__entry_btrfs(
                __field(        u64,  bytenr                )
                __field(        u64,  num_bytes                )
                __field(        int,  action                ) 
                __field(        u64,  parent                )
                __field(        u64,  ref_root                )
                __field(        u64,  owner                )
                __field(        u64,  offset                )
                __field(        int,  type                )
                __field(        u64,  seq                )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->bytenr                = ref->bytenr;
                __entry->num_bytes        = ref->num_bytes;
                __entry->action                = ref->action;
                __entry->parent                = ref->parent;
                __entry->ref_root        = ref->ref_root;
                __entry->owner                = ref->data_ref.objectid;
                __entry->offset                = ref->data_ref.offset;
                __entry->type                = ref->type;
                __entry->seq                = ref->seq;
        ),

        /*
         * parent and ref_root both go through show_root_type(), which fills
         * the "%llu(%s)" pair for each; action and type are printed by name.
         */
        TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s "
                  "parent=%llu(%s) ref_root=%llu(%s) owner=%llu "
                  "offset=%llu type=%s seq=%llu",
                  __entry->bytenr,
                  __entry->num_bytes,
                  show_ref_action(__entry->action),
                  show_root_type(__entry->parent),
                  show_root_type(__entry->ref_root),
                  __entry->owner,
                  __entry->offset,
                  show_ref_type(__entry->type),
                  __entry->seq)
);

/*
 * add_delayed_data_ref: event instance of the btrfs_delayed_data_ref class,
 * taking the same (fs_info, ref) arguments.
 */
DEFINE_EVENT(btrfs_delayed_data_ref,  add_delayed_data_ref,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_delayed_ref_node *ref),

        TP_ARGS(fs_info, ref)
);

/*
 * run_delayed_data_ref: event instance of the btrfs_delayed_data_ref class,
 * taking the same (fs_info, ref) arguments.
 */
DEFINE_EVENT(btrfs_delayed_data_ref,  run_delayed_data_ref,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_delayed_ref_node *ref),

        TP_ARGS(fs_info, ref)
);

/*
 * btrfs_delayed_ref_head: event class recording a delayed ref head.
 *
 * @fs_info:  handed to TP_fast_assign_btrfs()
 * @head_ref: source of bytenr, num_bytes and is_data
 * @action:   supplied by the caller (not read from @head_ref); printed by
 *            name through show_ref_action()
 */
DECLARE_EVENT_CLASS(btrfs_delayed_ref_head,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_delayed_ref_head *head_ref,
                 int action),

        TP_ARGS(fs_info, head_ref, action),

        TP_STRUCT__entry_btrfs(
                __field(        u64,  bytenr                )
                __field(        u64,  num_bytes                )
                __field(        int,  action                ) 
                __field(        int,  is_data                )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->bytenr                = head_ref->bytenr;
                __entry->num_bytes        = head_ref->num_bytes;
                __entry->action                = action;
                __entry->is_data        = head_ref->is_data;
        ),

        TP_printk_btrfs("bytenr=%llu num_bytes=%llu action=%s is_data=%d",
                  __entry->bytenr,
                  __entry->num_bytes,
                  show_ref_action(__entry->action),
                  __entry->is_data)
);

/*
 * add_delayed_ref_head: event instance of the btrfs_delayed_ref_head class,
 * taking the same (fs_info, head_ref, action) arguments.
 */
DEFINE_EVENT(btrfs_delayed_ref_head,  add_delayed_ref_head,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_delayed_ref_head *head_ref,
                 int action),

        TP_ARGS(fs_info, head_ref, action)
);

/*
 * run_delayed_ref_head: event instance of the btrfs_delayed_ref_head class,
 * taking the same (fs_info, head_ref, action) arguments.
 */
DEFINE_EVENT(btrfs_delayed_ref_head,  run_delayed_ref_head,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_delayed_ref_head *head_ref,
                 int action),

        TP_ARGS(fs_info, head_ref, action)
);

/*
 * show_chunk_type - decode a block-group type bitmask into a "|"-separated
 * list of names (DATA, SYSTEM, METADATA and the RAID/DUP profiles listed
 * below).  Used as a %s argument by the btrfs__chunk event class.
 */
#define show_chunk_type(type)                                        \
        __print_flags(type, "|",                                \
                { BTRFS_BLOCK_GROUP_DATA,         "DATA"        },        \
                { BTRFS_BLOCK_GROUP_SYSTEM,         "SYSTEM"},        \
                { BTRFS_BLOCK_GROUP_METADATA,         "METADATA"},        \
                { BTRFS_BLOCK_GROUP_RAID0,         "RAID0" },        \
                { BTRFS_BLOCK_GROUP_RAID1,         "RAID1" },        \
                { BTRFS_BLOCK_GROUP_DUP,         "DUP"        },        \
                { BTRFS_BLOCK_GROUP_RAID10,         "RAID10"},        \
                { BTRFS_BLOCK_GROUP_RAID5,         "RAID5"        },        \
                { BTRFS_BLOCK_GROUP_RAID6,         "RAID6"        })

/*
 * btrfs__chunk: event class describing a chunk.
 *
 * @fs_info: handed to TP_fast_assign_btrfs(); also the source of the recorded
 *           root objectid (fs_info->chunk_root)
 * @map:     chunk map supplying num_stripes, type and sub_stripes
 * @offset:  stored and printed as "offset="
 * @size:    stored and printed as "size="
 */
DECLARE_EVENT_CLASS(btrfs__chunk,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_chunk_map *map, u64 offset, u64 size),

        TP_ARGS(fs_info, map, offset, size),

        TP_STRUCT__entry_btrfs(
                __field(        int,  num_stripes                )
                __field(        u64,  type                        )
                __field(        int,  sub_stripes                )
                __field(        u64,  offset                        )
                __field(        u64,  size                        )
                __field(        u64,  root_objectid                )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->num_stripes        = map->num_stripes;
                __entry->type                = map->type;
                __entry->sub_stripes        = map->sub_stripes;
                __entry->offset                = offset;
                __entry->size                = size;
                __entry->root_objectid        = fs_info->chunk_root->root_key.objectid;
        ),

        /* The type is printed by name only, through show_chunk_type(). */
        TP_printk_btrfs("root=%llu(%s) offset=%llu size=%llu "
                  "num_stripes=%d sub_stripes=%d type=%s",
                  show_root_type(__entry->root_objectid),
                  __entry->offset,
                  __entry->size,
                  __entry->num_stripes, __entry->sub_stripes,
                  show_chunk_type(__entry->type))
);

/*
 * btrfs_chunk_alloc: event instance of the btrfs__chunk class, taking the
 * same (fs_info, map, offset, size) arguments.
 */
DEFINE_EVENT(btrfs__chunk,  btrfs_chunk_alloc,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_chunk_map *map, u64 offset, u64 size),

        TP_ARGS(fs_info, map, offset, size)
);

/*
 * btrfs_chunk_free: event instance of the btrfs__chunk class, taking the
 * same (fs_info, map, offset, size) arguments.
 */
DEFINE_EVENT(btrfs__chunk,  btrfs_chunk_free,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_chunk_map *map, u64 offset, u64 size),

        TP_ARGS(fs_info, map, offset, size)
);

/*
 * btrfs_cow_block: records a pair of extent buffers, @buf and @cow, under
 * @root: the start and header level of each, plus the reference count of
 * @buf.
 *
 * @root: supplies the root objectid and, via root->fs_info, the argument to
 *        TP_fast_assign_btrfs()
 * @buf:  buffer printed as "orig_buf"/"orig_level"
 * @cow:  buffer printed as "cow_buf"/"cow_level"
 */
TRACE_EVENT(btrfs_cow_block,

        TP_PROTO(const struct btrfs_root *root, const struct extent_buffer *buf,
                 const struct extent_buffer *cow),

        TP_ARGS(root, buf, cow),

        TP_STRUCT__entry_btrfs(
                __field(        u64,  root_objectid                )
                __field(        u64,  buf_start                        )
                __field(        int,  refs                        )
                __field(        u64,  cow_start                        )
                __field(        int,  buf_level                        )
                __field(        int,  cow_level                        )
        ),

        TP_fast_assign_btrfs(root->fs_info,
                __entry->root_objectid        = root->root_key.objectid;
                __entry->buf_start        = buf->start;
                /* refs is sampled from @buf only, not from @cow. */
                __entry->refs                = atomic_read(&buf->refs);
                __entry->cow_start        = cow->start;
                __entry->buf_level        = btrfs_header_level(buf);
                __entry->cow_level        = btrfs_header_level(cow);
        ),

        TP_printk_btrfs("root=%llu(%s) refs=%d orig_buf=%llu "
                  "(orig_level=%d) cow_buf=%llu (cow_level=%d)",
                  show_root_type(__entry->root_objectid),
                  __entry->refs,
                  __entry->buf_start,
                  __entry->buf_level,
                  __entry->cow_start,
                  __entry->cow_level)
);

/*
 * btrfs_space_reservation: records a string tag plus two u64 values and
 * whether the operation is a reserve or a release.
 *
 * @fs_info: handed to TP_fast_assign_btrfs()
 * @type:    string copied into the record; printed first, before the colon
 * @val:     printed right after the type string
 * @bytes:   printed last
 * @reserve: non-zero prints "reserve", zero prints "release"
 */
TRACE_EVENT(btrfs_space_reservation,

        TP_PROTO(const struct btrfs_fs_info *fs_info, const char *type, u64 val,
                 u64 bytes, int reserve),

        TP_ARGS(fs_info, type, val, bytes, reserve),

        TP_STRUCT__entry_btrfs(
                __string(        type,        type                        )
                __field(        u64,        val                        )
                __field(        u64,        bytes                        )
                __field(        int,        reserve                        )
        ),

        TP_fast_assign_btrfs(fs_info,
                __assign_str(type);
                __entry->val                = val;
                __entry->bytes                = bytes;
                __entry->reserve        = reserve;
        ),

        TP_printk_btrfs("%s: %llu %s %llu", __get_str(type), __entry->val,
                        __entry->reserve ? "reserve" : "release",
                        __entry->bytes)
);

/*
 * btrfs_trigger_flush: records the flags, byte count, flush value and a
 * reason string supplied by the caller.
 *
 * @fs_info: handed to TP_fast_assign_btrfs()
 * @flags:   printed raw and decoded with BTRFS_GROUP_FLAGS
 * @bytes:   stored and printed as "bytes="
 * @flush:   printed raw and by name via FLUSH_ACTIONS (defined elsewhere)
 * @reason:  string copied into the record; printed first
 */
TRACE_EVENT(btrfs_trigger_flush,

        TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 bytes,
                 int flush, const char *reason),

        TP_ARGS(fs_info, flags, bytes, flush, reason),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        flags                        )
                __field(        u64,        bytes                        )
                __field(        int,        flush                        )
                __string(        reason,        reason                        )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->flags        = flags;
                __entry->bytes        = bytes;
                __entry->flush        = flush;
                __assign_str(reason);
        ),

        TP_printk_btrfs("%s: flush=%d(%s) flags=%llu(%s) bytes=%llu",
                  __get_str(reason), __entry->flush,
                  __print_symbolic(__entry->flush, FLUSH_ACTIONS),
                  __entry->flags,
                  __print_flags((unsigned long)__entry->flags, "|",
                                BTRFS_GROUP_FLAGS),
                  __entry->bytes)
);


/*
 * btrfs_flush_space: records the flags, byte count, state, return value and
 * for_preempt flag supplied by the caller.
 *
 * @fs_info:     handed to TP_fast_assign_btrfs()
 * @flags:       printed raw and decoded with BTRFS_GROUP_FLAGS
 * @num_bytes:   stored and printed as "num_bytes="
 * @state:       printed raw and by name via FLUSH_STATES (defined elsewhere)
 * @ret:         stored and printed as "ret="
 * @for_preempt: bool, printed with %d as "for_preempt="
 */
TRACE_EVENT(btrfs_flush_space,

        TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes,
                 int state, int ret, bool for_preempt),

        TP_ARGS(fs_info, flags, num_bytes, state, ret, for_preempt),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        flags                        )
                __field(        u64,        num_bytes                )
                __field(        int,        state                        )
                __field(        int,        ret                        )
                __field(       bool,        for_preempt                )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->flags                =        flags;
                __entry->num_bytes        =        num_bytes;
                __entry->state                =        state;
                __entry->ret                =        ret;
                __entry->for_preempt        =        for_preempt;
        ),

        TP_printk_btrfs("state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d for_preempt=%d",
                  __entry->state,
                  __print_symbolic(__entry->state, FLUSH_STATES),
                  __entry->flags,
                  __print_flags((unsigned long)__entry->flags, "|",
                                BTRFS_GROUP_FLAGS),
                  __entry->num_bytes, __entry->ret, __entry->for_preempt)
);

/*
 * btrfs__reserved_extent: event class recording an extent as (start, len).
 *
 * @fs_info: handed to TP_fast_assign_btrfs()
 * @start:   stored and printed as "start="
 * @len:     stored and printed as "len="
 */
DECLARE_EVENT_CLASS(btrfs__reserved_extent,

        TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 len),

        TP_ARGS(fs_info, start, len),

        TP_STRUCT__entry_btrfs(
                __field(        u64,  start                        )
                __field(        u64,  len                        )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->start                = start;
                __entry->len                = len;
        ),

        /*
         * The "root=" part is not recorded per event: it is always the
         * constant BTRFS_EXTENT_TREE_OBJECTID.
         */
        TP_printk_btrfs("root=%llu(%s) start=%llu len=%llu",
                  show_root_type(BTRFS_EXTENT_TREE_OBJECTID),
                  __entry->start,
                  __entry->len)
);

/*
 * btrfs_reserved_extent_alloc: event instance of the btrfs__reserved_extent
 * class, taking the same (fs_info, start, len) arguments.
 */
DEFINE_EVENT(btrfs__reserved_extent,  btrfs_reserved_extent_alloc,

        TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 len),

        TP_ARGS(fs_info, start, len)
);

/*
 * btrfs_reserved_extent_free: event instance of the btrfs__reserved_extent
 * class, taking the same (fs_info, start, len) arguments.
 */
DEFINE_EVENT(btrfs__reserved_extent,  btrfs_reserved_extent_free,

        TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 len),

        TP_ARGS(fs_info, start, len)
);

/*
 * find_free_extent: records the request parameters held in @ffe_ctl
 * (num_bytes, empty_size, flags) together with the objectid of @root.
 *
 * @root:    supplies the root objectid and, via root->fs_info, the argument
 *           to TP_fast_assign_btrfs()
 * @ffe_ctl: control structure the request parameters are copied from
 */
TRACE_EVENT(find_free_extent,

        TP_PROTO(const struct btrfs_root *root,
                 const struct find_free_extent_ctl *ffe_ctl),

        TP_ARGS(root, ffe_ctl),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        root_objectid                )
                __field(        u64,        num_bytes                )
                __field(        u64,        empty_size                )
                __field(        u64,        flags                        )
        ),

        TP_fast_assign_btrfs(root->fs_info,
                __entry->root_objectid        = root->root_key.objectid;
                __entry->num_bytes        = ffe_ctl->num_bytes;
                __entry->empty_size        = ffe_ctl->empty_size;
                __entry->flags                = ffe_ctl->flags;
        ),

        /* num_bytes is printed under the label "len=". */
        TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)",
                  show_root_type(__entry->root_objectid),
                  __entry->num_bytes, __entry->empty_size, __entry->flags,
                  __print_flags((unsigned long)__entry->flags, "|",
                                 BTRFS_GROUP_FLAGS))
);

/*
 * find_free_extent_search_loop: same data as find_free_extent plus the
 * ffe_ctl->loop value.
 *
 * @root:    supplies the root objectid and, via root->fs_info, the argument
 *           to TP_fast_assign_btrfs()
 * @ffe_ctl: control structure num_bytes, empty_size, flags and loop are
 *           copied from
 */
TRACE_EVENT(find_free_extent_search_loop,

        TP_PROTO(const struct btrfs_root *root,
                 const struct find_free_extent_ctl *ffe_ctl),

        TP_ARGS(root, ffe_ctl),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        root_objectid                )
                __field(        u64,        num_bytes                )
                __field(        u64,        empty_size                )
                __field(        u64,        flags                        )
                __field(        u64,        loop                        )
        ),

        TP_fast_assign_btrfs(root->fs_info,
                __entry->root_objectid        = root->root_key.objectid;
                __entry->num_bytes        = ffe_ctl->num_bytes;
                __entry->empty_size        = ffe_ctl->empty_size;
                __entry->flags                = ffe_ctl->flags;
                __entry->loop                = ffe_ctl->loop;
        ),

        /* num_bytes is printed under the label "len=". */
        TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu",
                  show_root_type(__entry->root_objectid),
                  __entry->num_bytes, __entry->empty_size, __entry->flags,
                  __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS),
                  __entry->loop)
);

/*
 * find_free_extent_have_block_group: records the request parameters from
 * @ffe_ctl (including loop and hinted) together with the start and flags of
 * @block_group.
 *
 * @root:        supplies the root objectid and, via root->fs_info, the
 *               argument to TP_fast_assign_btrfs()
 * @ffe_ctl:     control structure the request parameters are copied from
 * @block_group: block group whose start/flags are printed as
 *               "block_group="/"bg_flags="
 */
TRACE_EVENT(find_free_extent_have_block_group,

        TP_PROTO(const struct btrfs_root *root,
                 const struct find_free_extent_ctl *ffe_ctl,
                 const struct btrfs_block_group *block_group),

        TP_ARGS(root, ffe_ctl, block_group),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        root_objectid                )
                __field(        u64,        num_bytes                )
                __field(        u64,        empty_size                )
                __field(        u64,        flags                        )
                __field(        u64,        loop                        )
                __field(        bool,        hinted                        )
                __field(        u64,        bg_start                )
                __field(        u64,        bg_flags                )
        ),

        TP_fast_assign_btrfs(root->fs_info,
                __entry->root_objectid        = root->root_key.objectid;
                __entry->num_bytes        = ffe_ctl->num_bytes;
                __entry->empty_size        = ffe_ctl->empty_size;
                __entry->flags                = ffe_ctl->flags;
                __entry->loop                = ffe_ctl->loop;
                __entry->hinted                = ffe_ctl->hinted;
                __entry->bg_start        = block_group->start;
                __entry->bg_flags        = block_group->flags;
        ),

        /*
         * Two independent flag words are decoded with BTRFS_GROUP_FLAGS: the
         * requested flags from ffe_ctl and the block group's own flags.
         */
        TP_printk_btrfs(
"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%llu hinted=%d block_group=%llu bg_flags=%llu(%s)",
                  show_root_type(__entry->root_objectid),
                  __entry->num_bytes, __entry->empty_size, __entry->flags,
                  __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS),
                  __entry->loop, __entry->hinted,
                  __entry->bg_start, __entry->bg_flags,
                  __print_flags((unsigned long)__entry->bg_flags, "|",
                                 BTRFS_GROUP_FLAGS))
);

/*
 * btrfs__reserve_extent: event class pairing a block group with the search
 * state held in @ffe_ctl.
 *
 * @block_group: supplies start (recorded as bg_objectid), flags and
 *               size_class, plus the fs_info for TP_fast_assign_btrfs()
 * @ffe_ctl:     supplies search_start (recorded as start), num_bytes
 *               (recorded as len), loop, hinted and size_class
 */
DECLARE_EVENT_CLASS(btrfs__reserve_extent,

        TP_PROTO(const struct btrfs_block_group *block_group,
                 const struct find_free_extent_ctl *ffe_ctl),

        TP_ARGS(block_group, ffe_ctl),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        bg_objectid                )
                __field(        u64,        flags                        )
                __field(        int,        bg_size_class                )
                __field(        u64,        start                        )
                __field(        u64,        len                        )
                __field(        u64,        loop                        )
                __field(        bool,        hinted                        )
                __field(        int,        size_class                )
        ),

        TP_fast_assign_btrfs(block_group->fs_info,
                __entry->bg_objectid        = block_group->start;
                __entry->flags                = block_group->flags;
                __entry->bg_size_class        = block_group->size_class;
                __entry->start                = ffe_ctl->search_start;
                __entry->len                = ffe_ctl->num_bytes;
                __entry->loop                = ffe_ctl->loop;
                __entry->hinted                = ffe_ctl->hinted;
                __entry->size_class        = ffe_ctl->size_class;
        ),

        /*
         * The "root=" part is not recorded per event: it is always the
         * constant BTRFS_EXTENT_TREE_OBJECTID.
         */
        TP_printk_btrfs(
"root=%llu(%s) block_group=%llu flags=%llu(%s) bg_size_class=%d start=%llu len=%llu loop=%llu hinted=%d size_class=%d",
                  show_root_type(BTRFS_EXTENT_TREE_OBJECTID),
                  __entry->bg_objectid,
                  __entry->flags, __print_flags((unsigned long)__entry->flags,
                                                "|", BTRFS_GROUP_FLAGS),
                  __entry->bg_size_class, __entry->start, __entry->len,
                  __entry->loop, __entry->hinted, __entry->size_class)
);

/*
 * btrfs_reserve_extent: event instance of the btrfs__reserve_extent class,
 * taking the same (block_group, ffe_ctl) arguments.
 */
DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent,

        TP_PROTO(const struct btrfs_block_group *block_group,
                 const struct find_free_extent_ctl *ffe_ctl),

        TP_ARGS(block_group, ffe_ctl)
);

/*
 * btrfs_reserve_extent_cluster: event instance of the btrfs__reserve_extent
 * class, taking the same (block_group, ffe_ctl) arguments.
 */
DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster,

        TP_PROTO(const struct btrfs_block_group *block_group,
                 const struct find_free_extent_ctl *ffe_ctl),

        TP_ARGS(block_group, ffe_ctl)
);

/*
 * btrfs_find_cluster: records a block group's start and flags together with
 * the caller-supplied search parameters.
 *
 * @block_group: supplies start (recorded as bg_objectid), flags and the
 *               fs_info for TP_fast_assign_btrfs()
 * @start:       stored and printed as "start="
 * @bytes:       stored as bytes, printed under the label "len="
 * @empty_size:  stored and printed as "empty_size="
 * @min_bytes:   stored and printed as "min_bytes="
 */
TRACE_EVENT(btrfs_find_cluster,

        TP_PROTO(const struct btrfs_block_group *block_group, u64 start,
                 u64 bytes, u64 empty_size, u64 min_bytes),

        TP_ARGS(block_group, start, bytes, empty_size, min_bytes),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        bg_objectid                )
                __field(        u64,        flags                        )
                __field(        u64,        start                        )
                __field(        u64,        bytes                        )
                __field(        u64,        empty_size                )
                __field(        u64,        min_bytes                )
        ),

        TP_fast_assign_btrfs(block_group->fs_info,
                __entry->bg_objectid        = block_group->start;
                __entry->flags                = block_group->flags;
                __entry->start                = start;
                __entry->bytes                = bytes;
                __entry->empty_size        = empty_size;
                __entry->min_bytes        = min_bytes;
        ),

        TP_printk_btrfs("block_group=%llu flags=%llu(%s) start=%llu len=%llu "
                  "empty_size=%llu min_bytes=%llu", __entry->bg_objectid,
                  __entry->flags,
                  __print_flags((unsigned long)__entry->flags, "|",
                                BTRFS_GROUP_FLAGS), __entry->start,
                  __entry->bytes, __entry->empty_size,  __entry->min_bytes)
);

/*
 * btrfs_failed_cluster_setup: records only the start of @block_group, printed
 * as "block_group=".
 *
 * @block_group: supplies start and the fs_info for TP_fast_assign_btrfs()
 */
TRACE_EVENT(btrfs_failed_cluster_setup,

        TP_PROTO(const struct btrfs_block_group *block_group),

        TP_ARGS(block_group),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        bg_objectid                )
        ),

        TP_fast_assign_btrfs(block_group->fs_info,
                __entry->bg_objectid        = block_group->start;
        ),

        TP_printk_btrfs("block_group=%llu", __entry->bg_objectid)
);

/*
 * btrfs_setup_cluster: records a block group's start and flags together with
 * the window start and max_size of @cluster and the caller-supplied @size
 * and @bitmap values.
 *
 * @block_group: supplies start (recorded as bg_objectid), flags and the
 *               fs_info for TP_fast_assign_btrfs()
 * @cluster:     supplies window_start (recorded as start) and max_size
 * @size:        stored and printed as "size="
 * @bitmap:      stored and printed as "bitmap="
 */
TRACE_EVENT(btrfs_setup_cluster,

        TP_PROTO(const struct btrfs_block_group *block_group,
                 const struct btrfs_free_cluster *cluster,
                 u64 size, int bitmap),

        TP_ARGS(block_group, cluster, size, bitmap),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        bg_objectid                )
                __field(        u64,        flags                        )
                __field(        u64,        start                        )
                __field(        u64,        max_size                )
                __field(        u64,        size                        )
                __field(        int,        bitmap                        )
        ),

        TP_fast_assign_btrfs(block_group->fs_info,
                __entry->bg_objectid        = block_group->start;
                __entry->flags                = block_group->flags;
                __entry->start                = cluster->window_start;
                __entry->max_size        = cluster->max_size;
                __entry->size                = size;
                __entry->bitmap                = bitmap;
        ),

        /*
         * Output order differs from field order: size is printed before
         * max_size although max_size is stored first.
         */
        TP_printk_btrfs("block_group=%llu flags=%llu(%s) window_start=%llu "
                  "size=%llu max_size=%llu bitmap=%d",
                  __entry->bg_objectid,
                  __entry->flags,
                  __print_flags((unsigned long)__entry->flags, "|",
                                BTRFS_GROUP_FLAGS), __entry->start,
                  __entry->size, __entry->max_size, __entry->bitmap)
);

struct extent_state;

/*
 * Records the extent_state pointer, the gfp mask (stored as an unsigned long)
 * and the caller address IP, which is printed symbolically via %pS.
 *
 * The assignments are written as ordinary ';'-terminated statements, matching
 * the other events in this file, instead of one comma-operator expression.
 */
TRACE_EVENT(alloc_extent_state,

        TP_PROTO(const struct extent_state *state,
                 gfp_t mask, unsigned long IP),

        TP_ARGS(state, mask, IP),

        TP_STRUCT__entry(
                __field(const struct extent_state *, state)
                __field(unsigned long, mask)
                __field(const void*, ip)
        ),

        TP_fast_assign(
                __entry->state        = state;
                __entry->mask        = (__force unsigned long)mask;
                __entry->ip        = (const void *)IP;
        ),

        TP_printk("state=%p mask=%s caller=%pS", __entry->state,
                  show_gfp_flags(__entry->mask), __entry->ip)
);

/*
 * Records the extent_state pointer and the caller address IP, which is
 * printed symbolically via %pS.
 *
 * The assignments are written as ordinary ';'-terminated statements, matching
 * the other events in this file, instead of one comma-operator expression.
 */
TRACE_EVENT(free_extent_state,

        TP_PROTO(const struct extent_state *state, unsigned long IP),

        TP_ARGS(state, IP),

        TP_STRUCT__entry(
                __field(const struct extent_state *, state)
                __field(const void*, ip)
        ),

        TP_fast_assign(
                __entry->state        = state;
                __entry->ip        = (const void *)IP;
        ),

        TP_printk("state=%p caller=%pS", __entry->state, __entry->ip)
);

/*
 * Event class for work items: stores the work pointer itself plus its wq,
 * func and ordered_func members and the address of its normal_work member.
 */
DECLARE_EVENT_CLASS(btrfs__work,

        TP_PROTO(const struct btrfs_work *work),

        TP_ARGS(work),

        TP_STRUCT__entry_btrfs(
                __field(const void *, work)
                __field(const void *, wq)
                __field(const void *, func)
                __field(const void *, ordered_func)
                __field(const void *, normal_work)
        ),

        TP_fast_assign_btrfs(btrfs_work_owner(work),
                __entry->work = work;
                __entry->wq = work->wq;
                __entry->func = work->func;
                __entry->ordered_func = work->ordered_func;
                __entry->normal_work = &work->normal_work;
        ),

        TP_printk_btrfs("work=%p (normal_work=%p) wq=%p func=%ps ordered_func=%p",
                  __entry->work,
                  __entry->normal_work,
                  __entry->wq,
                  __entry->func,
                  __entry->ordered_func)
);

/*
 * Event class used once the work structure may already have been freed.
 * The caller supplies fs_info and a tag equal to the address of the work
 * structure, so the event can be matched with the earlier scheduling event.
 * The tag is only ever stored and printed: DO NOT add anything here that
 * dereferences wtag.
 */
DECLARE_EVENT_CLASS(btrfs__work__done,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const void *wtag),

        TP_ARGS(fs_info, wtag),

        TP_STRUCT__entry_btrfs(
                __field(const void *, wtag)
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->wtag = wtag;
        ),

        TP_printk_btrfs("work->%p", __entry->wtag)
);

/* Defines the btrfs_work_queued event as an instance of the btrfs__work class. */
DEFINE_EVENT(btrfs__work, btrfs_work_queued,
        TP_PROTO(const struct btrfs_work *work),
        TP_ARGS(work)
);

/* Defines the btrfs_work_sched event as an instance of the btrfs__work class. */
DEFINE_EVENT(btrfs__work, btrfs_work_sched,
        TP_PROTO(const struct btrfs_work *work),
        TP_ARGS(work)
);

/*
 * Defines the btrfs_all_work_done event as an instance of the
 * btrfs__work__done class; wtag is passed through as an opaque pointer.
 */
DEFINE_EVENT(btrfs__work__done, btrfs_all_work_done,
        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const void *wtag),
        TP_ARGS(fs_info, wtag)
);

/* Defines the btrfs_ordered_sched event as an instance of the btrfs__work class. */
DEFINE_EVENT(btrfs__work, btrfs_ordered_sched,
        TP_PROTO(const struct btrfs_work *work),
        TP_ARGS(work)
);

/*
 * Event class for workqueues: stores the workqueue pointer and a copy of the
 * name string passed by the caller.
 */
DECLARE_EVENT_CLASS(btrfs_workqueue,

        TP_PROTO(const struct btrfs_workqueue *wq,
                 const char *name),

        TP_ARGS(wq, name),

        TP_STRUCT__entry_btrfs(
                __field(const void *, wq)
                __string(name, name)
        ),

        TP_fast_assign_btrfs(btrfs_workqueue_owner(wq),
                __entry->wq = wq;
                __assign_str(name);
        ),

        TP_printk_btrfs("name=%s wq=%p",
                  __get_str(name),
                  __entry->wq)
);

/* Defines the btrfs_workqueue_alloc event as an instance of the btrfs_workqueue class. */
DEFINE_EVENT(btrfs_workqueue, btrfs_workqueue_alloc,
        TP_PROTO(const struct btrfs_workqueue *wq,
                 const char *name),
        TP_ARGS(wq, name)
);

/* Event class that stores and prints only the workqueue pointer. */
DECLARE_EVENT_CLASS(btrfs_workqueue_done,

        TP_PROTO(const struct btrfs_workqueue *wq),

        TP_ARGS(wq),

        TP_STRUCT__entry_btrfs(
                __field(const void *, wq)
        ),

        TP_fast_assign_btrfs(btrfs_workqueue_owner(wq),
                __entry->wq = wq;
        ),

        TP_printk_btrfs("wq=%p", __entry->wq)
);

/*
 * Defines the btrfs_workqueue_destroy event as an instance of the
 * btrfs_workqueue_done class.
 */
DEFINE_EVENT(btrfs_workqueue_done, btrfs_workqueue_destroy,
        TP_PROTO(const struct btrfs_workqueue *wq),
        TP_ARGS(wq)
);

/* Value/name pairs for the qgroup operation codes, consumed by __print_flags(). */
#define BTRFS_QGROUP_OPERATIONS \
        { QGROUP_RESERVE, "reserve" }, \
        { QGROUP_RELEASE, "release" }, \
        { QGROUP_FREE, "free" }

/*
 * Event class for qgroup data reservation changes on an inode: stores the
 * inode's root objectid and inode number along with the start, len, reserved
 * and op values passed in. op is printed by name via BTRFS_QGROUP_OPERATIONS.
 */
DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,

        TP_PROTO(const struct inode *inode,
                 u64 start,
                 u64 len,
                 u64 reserved,
                 int op),

        TP_ARGS(inode, start, len, reserved, op),

        TP_STRUCT__entry_btrfs(
                __field(u64, rootid)
                __field(u64, ino)
                __field(u64, start)
                __field(u64, len)
                __field(u64, reserved)
                __field(int, op)
        ),

        TP_fast_assign_btrfs(btrfs_sb(inode->i_sb),
                __entry->rootid = BTRFS_I(inode)->root->root_key.objectid;
                __entry->ino = btrfs_ino(BTRFS_I(inode));
                __entry->start = start;
                __entry->len = len;
                __entry->reserved = reserved;
                __entry->op = op;
        ),

        TP_printk_btrfs("root=%llu ino=%llu start=%llu len=%llu reserved=%llu op=%s",
                  __entry->rootid,
                  __entry->ino,
                  __entry->start,
                  __entry->len,
                  __entry->reserved,
                  __print_flags((unsigned long)__entry->op, "",
                                BTRFS_QGROUP_OPERATIONS)
        )
);

/*
 * Defines the btrfs_qgroup_reserve_data event as an instance of the
 * btrfs__qgroup_rsv_data class.
 */
DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_reserve_data,
        TP_PROTO(const struct inode *inode,
                 u64 start,
                 u64 len,
                 u64 reserved,
                 int op),
        TP_ARGS(inode, start, len, reserved, op)
);

/*
 * Defines the btrfs_qgroup_release_data event as an instance of the
 * btrfs__qgroup_rsv_data class.
 */
DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
        TP_PROTO(const struct inode *inode,
                 u64 start,
                 u64 len,
                 u64 reserved,
                 int op),
        TP_ARGS(inode, start, len, reserved, op)
);

/*
 * Event class for a qgroup extent record: stores the record's bytenr and
 * num_bytes.
 *
 * Each assignment is its own ';'-terminated statement; the first one used to
 * end in ',' and only worked through the comma operator.
 */
DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_qgroup_extent_record *rec),

        TP_ARGS(fs_info, rec),

        TP_STRUCT__entry_btrfs(
                __field(        u64,  bytenr                )
                __field(        u64,  num_bytes                )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->bytenr                = rec->bytenr;
                __entry->num_bytes        = rec->num_bytes;
        ),

        TP_printk_btrfs("bytenr=%llu num_bytes=%llu",
                  __entry->bytenr, __entry->num_bytes)
);

/*
 * Defines the btrfs_qgroup_account_extents event as an instance of the
 * btrfs_qgroup_extent class.
 */
DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents,
        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_qgroup_extent_record *rec),
        TP_ARGS(fs_info, rec)
);

/*
 * Defines the btrfs_qgroup_trace_extent event as an instance of the
 * btrfs_qgroup_extent class.
 */
DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent,
        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_qgroup_extent_record *rec),
        TP_ARGS(fs_info, rec)
);

/* Records the transid and num_dirty_extents values passed by the caller. */
TRACE_EVENT(qgroup_num_dirty_extents,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 u64 transid,
                 u64 num_dirty_extents),

        TP_ARGS(fs_info, transid, num_dirty_extents),

        TP_STRUCT__entry_btrfs(
                __field(u64, transid)
                __field(u64, num_dirty_extents)
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->transid = transid;
                __entry->num_dirty_extents = num_dirty_extents;
        ),

        TP_printk_btrfs("transid=%llu num_dirty_extents=%llu",
                __entry->transid,
                __entry->num_dirty_extents)
);

/*
 * Records the transid, the extent's bytenr and num_bytes, and the
 * nr_old_roots and nr_new_roots values passed by the caller.
 */
TRACE_EVENT(btrfs_qgroup_account_extent,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 u64 transid,
                 u64 bytenr,
                 u64 num_bytes,
                 u64 nr_old_roots,
                 u64 nr_new_roots),

        TP_ARGS(fs_info, transid, bytenr, num_bytes, nr_old_roots,
                nr_new_roots),

        TP_STRUCT__entry_btrfs(
                __field(u64, transid)
                __field(u64, bytenr)
                __field(u64, num_bytes)
                __field(u64, nr_old_roots)
                __field(u64, nr_new_roots)
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->transid = transid;
                __entry->bytenr = bytenr;
                __entry->num_bytes = num_bytes;
                __entry->nr_old_roots = nr_old_roots;
                __entry->nr_new_roots = nr_new_roots;
        ),

        TP_printk_btrfs(
                "transid=%llu bytenr=%llu num_bytes=%llu nr_old_roots=%llu nr_new_roots=%llu",
                __entry->transid, __entry->bytenr, __entry->num_bytes,
                __entry->nr_old_roots, __entry->nr_new_roots)
);

/*
 * Records the qgroup's id with its rfer and excl values as read when the
 * event fires (stored as old_rfer/old_excl), plus the cur_old_count and
 * cur_new_count values passed by the caller.
 */
TRACE_EVENT(qgroup_update_counters,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_qgroup *qgroup,
                 u64 cur_old_count,
                 u64 cur_new_count),

        TP_ARGS(fs_info, qgroup, cur_old_count, cur_new_count),

        TP_STRUCT__entry_btrfs(
                __field(u64, qgid)
                __field(u64, old_rfer)
                __field(u64, old_excl)
                __field(u64, cur_old_count)
                __field(u64, cur_new_count)
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->qgid = qgroup->qgroupid;
                __entry->old_rfer = qgroup->rfer;
                __entry->old_excl = qgroup->excl;
                __entry->cur_old_count = cur_old_count;
                __entry->cur_new_count = cur_new_count;
        ),

        TP_printk_btrfs("qgid=%llu old_rfer=%llu old_excl=%llu cur_old_count=%llu cur_new_count=%llu",
                  __entry->qgid,
                  __entry->old_rfer,
                  __entry->old_excl,
                  __entry->cur_old_count,
                  __entry->cur_new_count)
);

/*
 * Records the qgroup's id, the current value of rsv.values[type], and the
 * diff and type passed by the caller. type is printed by name via
 * QGROUP_RSV_TYPES.
 */
TRACE_EVENT(qgroup_update_reserve,

        TP_PROTO(struct btrfs_fs_info *fs_info,
                 struct btrfs_qgroup *qgroup,
                 s64 diff,
                 int type),

        TP_ARGS(fs_info, qgroup, diff, type),

        TP_STRUCT__entry_btrfs(
                __field(u64, qgid)
                __field(u64, cur_reserved)
                __field(s64, diff)
                __field(int, type)
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->qgid = qgroup->qgroupid;
                __entry->cur_reserved = qgroup->rsv.values[type];
                __entry->diff = diff;
                __entry->type = type;
        ),

        TP_printk_btrfs("qgid=%llu type=%s cur_reserved=%llu diff=%lld",
                __entry->qgid,
                __print_symbolic(__entry->type, QGROUP_RSV_TYPES),
                __entry->cur_reserved,
                __entry->diff)
);

/*
 * Records the root's objectid (as refroot) and the diff and type passed by
 * the caller. type is printed by name via QGROUP_RSV_TYPES.
 */
TRACE_EVENT(qgroup_meta_reserve,

        TP_PROTO(struct btrfs_root *root,
                 s64 diff,
                 int type),

        TP_ARGS(root, diff, type),

        TP_STRUCT__entry_btrfs(
                __field(u64, refroot)
                __field(s64, diff)
                __field(int, type)
        ),

        TP_fast_assign_btrfs(root->fs_info,
                __entry->refroot = root->root_key.objectid;
                __entry->diff = diff;
                __entry->type = type;
        ),

        TP_printk_btrfs("refroot=%llu(%s) type=%s diff=%lld",
                show_root_type(__entry->refroot),
                __print_symbolic(__entry->type, QGROUP_RSV_TYPES),
                __entry->diff)
);

/*
 * Records the root's objectid (as refroot) and the diff passed by the caller.
 * The printed type transition is fixed: the name of
 * BTRFS_QGROUP_RSV_META_PREALLOC followed by that of
 * BTRFS_QGROUP_RSV_META_PERTRANS.
 */
TRACE_EVENT(qgroup_meta_convert,

        TP_PROTO(struct btrfs_root *root,
                 s64 diff),

        TP_ARGS(root, diff),

        TP_STRUCT__entry_btrfs(
                __field(u64, refroot)
                __field(s64, diff)
        ),

        TP_fast_assign_btrfs(root->fs_info,
                __entry->refroot = root->root_key.objectid;
                __entry->diff = diff;
        ),

        TP_printk_btrfs("refroot=%llu(%s) type=%s->%s diff=%lld",
                show_root_type(__entry->refroot),
                __print_symbolic(BTRFS_QGROUP_RSV_META_PREALLOC,
                                 QGROUP_RSV_TYPES),
                __print_symbolic(BTRFS_QGROUP_RSV_META_PERTRANS,
                                 QGROUP_RSV_TYPES),
                __entry->diff)
);

/*
 * Records the root's objectid (as refroot) together with the negated value of
 * root->qgroup_meta_rsv_pertrans as diff. The type is always
 * BTRFS_QGROUP_RSV_META_PERTRANS.
 */
TRACE_EVENT(qgroup_meta_free_all_pertrans,

        TP_PROTO(struct btrfs_root *root),

        TP_ARGS(root),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        refroot                        )
                __field(        s64,        diff                        )
                __field(        int,        type                        )
        ),

        TP_fast_assign_btrfs(root->fs_info,
                __entry->refroot        = root->root_key.objectid;
                /* The pertrans counter is read under qgroup_meta_rsv_lock. */
                spin_lock(&root->qgroup_meta_rsv_lock);
                __entry->diff                = -(s64)root->qgroup_meta_rsv_pertrans;
                spin_unlock(&root->qgroup_meta_rsv_lock);
                __entry->type                = BTRFS_QGROUP_RSV_META_PERTRANS;
        ),

        TP_printk_btrfs("refroot=%llu(%s) type=%s diff=%lld",
                show_root_type(__entry->refroot),
                __print_symbolic(__entry->type, QGROUP_RSV_TYPES), __entry->diff)
);

/*
 * Event class for preliminary references. All key, level, count, parent and
 * wanted_disk_byte fields are taken from oldref. newref may be NULL, in which
 * case mod_count is recorded as 0; otherwise it is newref->count. The printed
 * count triple is old_count, mod_count and their sum.
 *
 * TP_ARGS lists the arguments in the same order as TP_PROTO (oldref before
 * newref), matching the DEFINE_EVENT instances of this class.
 */
DECLARE_EVENT_CLASS(btrfs__prelim_ref,
        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct prelim_ref *oldref,
                 const struct prelim_ref *newref, u64 tree_size),
        TP_ARGS(fs_info, oldref, newref, tree_size),

        TP_STRUCT__entry_btrfs(
                __field(        u64,  root_id                )
                __field(        u64,  objectid                )
                __field(         u8,  type                )
                __field(        u64,  offset                )
                __field(        int,  level                )
                __field(        int,  old_count                )
                __field(        u64,  parent                )
                __field(        u64,  bytenr                )
                __field(        int,  mod_count                )
                __field(        u64,  tree_size                )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->root_id        = oldref->root_id;
                __entry->objectid        = oldref->key_for_search.objectid;
                __entry->type                = oldref->key_for_search.type;
                __entry->offset                = oldref->key_for_search.offset;
                __entry->level                = oldref->level;
                __entry->old_count        = oldref->count;
                __entry->parent                = oldref->parent;
                __entry->bytenr                = oldref->wanted_disk_byte;
                __entry->mod_count        = newref ? newref->count : 0;
                __entry->tree_size        = tree_size;
        ),

        TP_printk_btrfs("root_id=%llu key=[%llu,%u,%llu] level=%d count=[%d+%d=%d] parent=%llu wanted_disk_byte=%llu nodes=%llu",
                        __entry->root_id,
                        __entry->objectid, __entry->type,
                        __entry->offset, __entry->level,
                        __entry->old_count, __entry->mod_count,
                        __entry->old_count + __entry->mod_count,
                        __entry->parent,
                        __entry->bytenr,
                        __entry->tree_size)
);

/*
 * Defines the btrfs_prelim_ref_merge event as an instance of the
 * btrfs__prelim_ref class.
 */
DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_merge,
        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct prelim_ref *oldref,
                 const struct prelim_ref *newref,
                 u64 tree_size),

        TP_ARGS(fs_info, oldref, newref, tree_size)
);

/*
 * Defines the btrfs_prelim_ref_insert event as an instance of the
 * btrfs__prelim_ref class.
 */
DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert,
        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct prelim_ref *oldref,
                 const struct prelim_ref *newref,
                 u64 tree_size),

        TP_ARGS(fs_info, oldref, newref, tree_size)
);

/*
 * Records the root's objectid and the ino, mod and outstanding values passed
 * by the caller.
 */
TRACE_EVENT(btrfs_inode_mod_outstanding_extents,

        TP_PROTO(const struct btrfs_root *root,
                 u64 ino,
                 int mod,
                 unsigned outstanding),

        TP_ARGS(root, ino, mod, outstanding),

        TP_STRUCT__entry_btrfs(
                __field(u64, root_objectid)
                __field(u64, ino)
                __field(int, mod)
                __field(unsigned, outstanding)
        ),

        TP_fast_assign_btrfs(root->fs_info,
                __entry->root_objectid = root->root_key.objectid;
                __entry->ino = ino;
                __entry->mod = mod;
                __entry->outstanding = outstanding;
        ),

        TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d outstanding=%u",
                        show_root_type(__entry->root_objectid),
                        __entry->ino,
                        __entry->mod,
                        __entry->outstanding)
);

/*
 * Event class for block groups: stores the block group's start (as bytenr),
 * length, used and flags. flags is printed both numerically and by name via
 * BTRFS_GROUP_FLAGS.
 *
 * Each assignment is its own ';'-terminated statement; the first two used to
 * end in ',' and only worked through the comma operator.
 */
DECLARE_EVENT_CLASS(btrfs__block_group,
        TP_PROTO(const struct btrfs_block_group *bg_cache),

        TP_ARGS(bg_cache),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        bytenr                )
                __field(        u64,        len                )
                __field(        u64,        used                )
                __field(        u64,        flags                )
        ),

        TP_fast_assign_btrfs(bg_cache->fs_info,
                __entry->bytenr = bg_cache->start;
                __entry->len        = bg_cache->length;
                __entry->used        = bg_cache->used;
                __entry->flags        = bg_cache->flags;
        ),

        TP_printk_btrfs("bg bytenr=%llu len=%llu used=%llu flags=%llu(%s)",
                __entry->bytenr, __entry->len, __entry->used, __entry->flags,
                __print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS))
);

/* Defines the btrfs_remove_block_group event as an instance of the btrfs__block_group class. */
DEFINE_EVENT(btrfs__block_group, btrfs_remove_block_group,

        TP_PROTO(const struct btrfs_block_group *bg_cache),

        TP_ARGS(bg_cache)
);

/* Defines the btrfs_add_unused_block_group event as an instance of the btrfs__block_group class. */
DEFINE_EVENT(btrfs__block_group, btrfs_add_unused_block_group,

        TP_PROTO(const struct btrfs_block_group *bg_cache),

        TP_ARGS(bg_cache)
);

/* Defines the btrfs_add_reclaim_block_group event as an instance of the btrfs__block_group class. */
DEFINE_EVENT(btrfs__block_group, btrfs_add_reclaim_block_group,

        TP_PROTO(const struct btrfs_block_group *bg_cache),

        TP_ARGS(bg_cache)
);

/* Defines the btrfs_reclaim_block_group event as an instance of the btrfs__block_group class. */
DEFINE_EVENT(btrfs__block_group, btrfs_reclaim_block_group,

        TP_PROTO(const struct btrfs_block_group *bg_cache),

        TP_ARGS(bg_cache)
);

/* Defines the btrfs_skip_unused_block_group event as an instance of the btrfs__block_group class. */
DEFINE_EVENT(btrfs__block_group, btrfs_skip_unused_block_group,

        TP_PROTO(const struct btrfs_block_group *bg_cache),

        TP_ARGS(bg_cache)
);

/*
 * Records the io tree's owner, the range (start, len) and the bits being set.
 * ino and rootid come from the inode returned by
 * extent_io_tree_to_inode_const(); both are stored as 0 when that is NULL.
 */
TRACE_EVENT(btrfs_set_extent_bit,

        TP_PROTO(const struct extent_io_tree *tree,
                 u64 start,
                 u64 len,
                 unsigned set_bits),

        TP_ARGS(tree, start, len, set_bits),

        TP_STRUCT__entry_btrfs(
                __field(unsigned, owner)
                __field(u64, ino)
                __field(u64, rootid)
                __field(u64, start)
                __field(u64, len)
                __field(unsigned, set_bits)
        ),

        TP_fast_assign_btrfs(extent_io_tree_to_fs_info(tree),
                const struct btrfs_inode *inode = extent_io_tree_to_inode_const(tree);

                __entry->owner = tree->owner;
                __entry->ino = inode ? btrfs_ino(inode) : 0;
                __entry->rootid = inode ? inode->root->root_key.objectid : 0;
                __entry->start = start;
                __entry->len = len;
                __entry->set_bits = set_bits;
        ),

        TP_printk_btrfs(
                "io_tree=%s ino=%llu root=%llu start=%llu len=%llu set_bits=%s",
                __print_symbolic(__entry->owner, IO_TREE_OWNER),
                __entry->ino,
                __entry->rootid,
                __entry->start,
                __entry->len,
                __print_flags(__entry->set_bits, "|", EXTENT_FLAGS))
);

/*
 * Records the io tree's owner, the range (start, len) and the bits being
 * cleared. ino and rootid come from the inode returned by
 * extent_io_tree_to_inode_const(); both are stored as 0 when that is NULL.
 */
TRACE_EVENT(btrfs_clear_extent_bit,

        TP_PROTO(const struct extent_io_tree *tree,
                 u64 start,
                 u64 len,
                 unsigned clear_bits),

        TP_ARGS(tree, start, len, clear_bits),

        TP_STRUCT__entry_btrfs(
                __field(unsigned, owner)
                __field(u64, ino)
                __field(u64, rootid)
                __field(u64, start)
                __field(u64, len)
                __field(unsigned, clear_bits)
        ),

        TP_fast_assign_btrfs(extent_io_tree_to_fs_info(tree),
                const struct btrfs_inode *inode = extent_io_tree_to_inode_const(tree);

                __entry->owner = tree->owner;
                __entry->ino = inode ? btrfs_ino(inode) : 0;
                __entry->rootid = inode ? inode->root->root_key.objectid : 0;
                __entry->start = start;
                __entry->len = len;
                __entry->clear_bits = clear_bits;
        ),

        TP_printk_btrfs(
                "io_tree=%s ino=%llu root=%llu start=%llu len=%llu clear_bits=%s",
                __print_symbolic(__entry->owner, IO_TREE_OWNER),
                __entry->ino,
                __entry->rootid,
                __entry->start,
                __entry->len,
                __print_flags(__entry->clear_bits, "|", EXTENT_FLAGS))
);

/*
 * Records the io tree's owner, the range (start, len) and both the bits being
 * set and the bits being cleared. ino and rootid come from the inode returned
 * by extent_io_tree_to_inode_const(); both are stored as 0 when that is NULL.
 */
TRACE_EVENT(btrfs_convert_extent_bit,

        TP_PROTO(const struct extent_io_tree *tree,
                 u64 start,
                 u64 len,
                 unsigned set_bits,
                 unsigned clear_bits),

        TP_ARGS(tree, start, len, set_bits, clear_bits),

        TP_STRUCT__entry_btrfs(
                __field(unsigned, owner)
                __field(u64, ino)
                __field(u64, rootid)
                __field(u64, start)
                __field(u64, len)
                __field(unsigned, set_bits)
                __field(unsigned, clear_bits)
        ),

        TP_fast_assign_btrfs(extent_io_tree_to_fs_info(tree),
                const struct btrfs_inode *inode = extent_io_tree_to_inode_const(tree);

                __entry->owner = tree->owner;
                __entry->ino = inode ? btrfs_ino(inode) : 0;
                __entry->rootid = inode ? inode->root->root_key.objectid : 0;
                __entry->start = start;
                __entry->len = len;
                __entry->set_bits = set_bits;
                __entry->clear_bits = clear_bits;
        ),

        TP_printk_btrfs(
                "io_tree=%s ino=%llu root=%llu start=%llu len=%llu set_bits=%s clear_bits=%s",
                __print_symbolic(__entry->owner, IO_TREE_OWNER),
                __entry->ino,
                __entry->rootid,
                __entry->start,
                __entry->len,
                __print_flags(__entry->set_bits, "|", EXTENT_FLAGS),
                __print_flags(__entry->clear_bits, "|", EXTENT_FLAGS))
);

/*
 * Event class that captures a space_info's flags, byte counters, reclaim_size
 * and clamp, plus filesystem-wide values read from fs_info: the reserved
 * amounts of the global, trans, delayed-refs and delayed block reserves,
 * free_chunk_space, and the positive sums of the delalloc_bytes and
 * ordered_bytes per-cpu counters.
 */
DECLARE_EVENT_CLASS(btrfs_dump_space_info,

        TP_PROTO(struct btrfs_fs_info *fs_info,
                 const struct btrfs_space_info *sinfo),

        TP_ARGS(fs_info, sinfo),

        TP_STRUCT__entry_btrfs(
                __field(u64, flags)
                __field(u64, total_bytes)
                __field(u64, bytes_used)
                __field(u64, bytes_pinned)
                __field(u64, bytes_reserved)
                __field(u64, bytes_may_use)
                __field(u64, bytes_readonly)
                __field(u64, reclaim_size)
                __field(int, clamp)
                __field(u64, global_reserved)
                __field(u64, trans_reserved)
                __field(u64, delayed_refs_reserved)
                __field(u64, delayed_reserved)
                __field(u64, free_chunk_space)
                __field(u64, delalloc_bytes)
                __field(u64, ordered_bytes)
        ),

        TP_fast_assign_btrfs(fs_info,
                /* Values taken from the space_info itself. */
                __entry->flags = sinfo->flags;
                __entry->total_bytes = sinfo->total_bytes;
                __entry->bytes_used = sinfo->bytes_used;
                __entry->bytes_pinned = sinfo->bytes_pinned;
                __entry->bytes_reserved = sinfo->bytes_reserved;
                __entry->bytes_may_use = sinfo->bytes_may_use;
                __entry->bytes_readonly = sinfo->bytes_readonly;
                __entry->reclaim_size = sinfo->reclaim_size;
                __entry->clamp = sinfo->clamp;
                /* Values taken from fs_info. */
                __entry->global_reserved = fs_info->global_block_rsv.reserved;
                __entry->trans_reserved = fs_info->trans_block_rsv.reserved;
                __entry->delayed_refs_reserved = fs_info->delayed_refs_rsv.reserved;
                __entry->delayed_reserved = fs_info->delayed_block_rsv.reserved;
                __entry->free_chunk_space = atomic64_read(&fs_info->free_chunk_space);
                __entry->delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
                __entry->ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
        ),

        TP_printk_btrfs("flags=%s total_bytes=%llu bytes_used=%llu "
                        "bytes_pinned=%llu bytes_reserved=%llu "
                        "bytes_may_use=%llu bytes_readonly=%llu "
                        "reclaim_size=%llu clamp=%d global_reserved=%llu "
                        "trans_reserved=%llu delayed_refs_reserved=%llu "
                        "delayed_reserved=%llu chunk_free_space=%llu "
                        "delalloc_bytes=%llu ordered_bytes=%llu",
                        __print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS),
                        __entry->total_bytes,
                        __entry->bytes_used,
                        __entry->bytes_pinned,
                        __entry->bytes_reserved,
                        __entry->bytes_may_use,
                        __entry->bytes_readonly,
                        __entry->reclaim_size,
                        __entry->clamp,
                        __entry->global_reserved,
                        __entry->trans_reserved,
                        __entry->delayed_refs_reserved,
                        __entry->delayed_reserved,
                        __entry->free_chunk_space,
                        __entry->delalloc_bytes,
                        __entry->ordered_bytes)
);

/*
 * Defines the btrfs_done_preemptive_reclaim event as an instance of the
 * btrfs_dump_space_info class.
 */
DEFINE_EVENT(btrfs_dump_space_info, btrfs_done_preemptive_reclaim,

        TP_PROTO(struct btrfs_fs_info *fs_info,
                 const struct btrfs_space_info *sinfo),

        TP_ARGS(fs_info, sinfo)
);

/*
 * Defines the btrfs_fail_all_tickets event as an instance of the
 * btrfs_dump_space_info class.
 */
DEFINE_EVENT(btrfs_dump_space_info, btrfs_fail_all_tickets,

        TP_PROTO(struct btrfs_fs_info *fs_info,
                 const struct btrfs_space_info *sinfo),

        TP_ARGS(fs_info, sinfo)
);

/*
 * Records the flags, bytes, start_ns, flush and error values passed by the
 * caller. flags is printed by name via BTRFS_GROUP_FLAGS and flush via
 * FLUSH_ACTIONS.
 */
TRACE_EVENT(btrfs_reserve_ticket,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 u64 flags,
                 u64 bytes,
                 u64 start_ns,
                 int flush,
                 int error),

        TP_ARGS(fs_info, flags, bytes, start_ns, flush, error),

        TP_STRUCT__entry_btrfs(
                __field(u64, flags)
                __field(u64, bytes)
                __field(u64, start_ns)
                __field(int, flush)
                __field(int, error)
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->flags = flags;
                __entry->bytes = bytes;
                __entry->start_ns = start_ns;
                __entry->flush = flush;
                __entry->error = error;
        ),

        TP_printk_btrfs("flags=%s bytes=%llu start_ns=%llu flush=%s error=%d",
                        __print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS),
                        __entry->bytes,
                        __entry->start_ns,
                        __print_symbolic(__entry->flush, FLUSH_ACTIONS),
                        __entry->error)
);

/*
 * Event class for tree lock acquisitions that carry a start timestamp.
 * end_ns is sampled with ktime_get_ns() while the record is filled in, and
 * diff_ns is end_ns minus the caller's start_ns. is_log_tree is 1 when the
 * extent buffer's log_index is >= 0.
 */
DECLARE_EVENT_CLASS(btrfs_sleep_tree_lock,

        TP_PROTO(const struct extent_buffer *eb,
                 u64 start_ns),

        TP_ARGS(eb, start_ns),

        TP_STRUCT__entry_btrfs(
                __field(u64, block)
                __field(u64, generation)
                __field(u64, start_ns)
                __field(u64, end_ns)
                __field(u64, diff_ns)
                __field(u64, owner)
                __field(int, is_log_tree)
        ),

        TP_fast_assign_btrfs(eb->fs_info,
                __entry->block = eb->start;
                __entry->generation = btrfs_header_generation(eb);
                __entry->start_ns = start_ns;
                __entry->end_ns = ktime_get_ns();
                __entry->diff_ns = __entry->end_ns - start_ns;
                __entry->owner = btrfs_header_owner(eb);
                __entry->is_log_tree = (eb->log_index >= 0);
        ),

        TP_printk_btrfs(
                "block=%llu generation=%llu start_ns=%llu end_ns=%llu diff_ns=%llu owner=%llu is_log_tree=%d",
                __entry->block,
                __entry->generation,
                __entry->start_ns,
                __entry->end_ns,
                __entry->diff_ns,
                __entry->owner,
                __entry->is_log_tree)
);

/* Defines the btrfs_tree_read_lock event as an instance of the btrfs_sleep_tree_lock class. */
DEFINE_EVENT(btrfs_sleep_tree_lock, btrfs_tree_read_lock,

        TP_PROTO(const struct extent_buffer *eb,
                 u64 start_ns),

        TP_ARGS(eb, start_ns)
);

/* Defines the btrfs_tree_lock event as an instance of the btrfs_sleep_tree_lock class. */
DEFINE_EVENT(btrfs_sleep_tree_lock, btrfs_tree_lock,

        TP_PROTO(const struct extent_buffer *eb,
                 u64 start_ns),

        TP_ARGS(eb, start_ns)
);

/*
 * Event class for extent buffer locking events without timing data: stores
 * the buffer's start (as block), header generation and owner. is_log_tree is
 * 1 when the extent buffer's log_index is >= 0.
 */
DECLARE_EVENT_CLASS(btrfs_locking_events,

        TP_PROTO(const struct extent_buffer *eb),

        TP_ARGS(eb),

        TP_STRUCT__entry_btrfs(
                __field(u64, block)
                __field(u64, generation)
                __field(u64, owner)
                __field(int, is_log_tree)
        ),

        TP_fast_assign_btrfs(eb->fs_info,
                __entry->block = eb->start;
                __entry->generation = btrfs_header_generation(eb);
                __entry->owner = btrfs_header_owner(eb);
                __entry->is_log_tree = (eb->log_index >= 0);
        ),

        TP_printk_btrfs("block=%llu generation=%llu owner=%llu is_log_tree=%d",
                __entry->block,
                __entry->generation,
                __entry->owner,
                __entry->is_log_tree)
);

/*
 * DEFINE_BTRFS_LOCK_EVENT(name) - define tracepoint @name as an instance of
 * the btrfs_locking_events class, taking a single extent buffer argument.
 * The expansion has no trailing semicolon; the user supplies it.
 */
#define DEFINE_BTRFS_LOCK_EVENT(name)                                \
DEFINE_EVENT(btrfs_locking_events, name,                        \
                TP_PROTO(const struct extent_buffer *eb),        \
                                                                \
                TP_ARGS(eb)                                        \
)

/*
 * Lock-related tracepoints that share the btrfs_locking_events layout.
 * Each one takes only the extent buffer.
 */
DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_unlock);
DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_unlock);
DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_unlock_blocking);
DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_read);
DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_write);
DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock);
DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock);
DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic);

/*
 * Event class btrfs__space_info_update: shared layout for events that report
 * a change to a space_info value.
 *
 * Recorded fields:
 *   type - sinfo->flags, printed symbolically through BTRFS_GROUP_FLAGS
 *   old  - the value passed as @old
 *   diff - the signed change passed as @diff
 */
DECLARE_EVENT_CLASS(btrfs__space_info_update,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_space_info *sinfo, u64 old, s64 diff),

        TP_ARGS(fs_info, sinfo, old, diff),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        type                )
                __field(        u64,        old                )
                __field(        s64,        diff                )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->type        = sinfo->flags;
                __entry->old        = old;
                __entry->diff        = diff;
        ),
        TP_printk_btrfs("type=%s old=%llu diff=%lld",
                __print_flags(__entry->type, "|", BTRFS_GROUP_FLAGS),
                __entry->old, __entry->diff)
);

/*
 * Tracepoint update_bytes_may_use: an instance of btrfs__space_info_update.
 * NOTE(review): by its name this reports changes to a "bytes_may_use"
 * counter; the call site is outside this chunk - confirm.
 */
DEFINE_EVENT(btrfs__space_info_update, update_bytes_may_use,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_space_info *sinfo, u64 old, s64 diff),

        TP_ARGS(fs_info, sinfo, old, diff)
);

/*
 * Tracepoint update_bytes_pinned: an instance of btrfs__space_info_update.
 * NOTE(review): by its name this reports changes to a "bytes_pinned"
 * counter; the call site is outside this chunk - confirm.
 */
DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned,

        TP_PROTO(const struct btrfs_fs_info *fs_info,
                 const struct btrfs_space_info *sinfo, u64 old, s64 diff),

        TP_ARGS(fs_info, sinfo, old, diff)
);

/*
 * Event class btrfs_raid56_bio: shared layout for events describing one bio
 * issued on behalf of a RAID56 rbio.
 *
 * Recorded fields:
 *   full_stripe   - rbio->bioc->full_stripe_logical
 *   physical      - bio->bi_iter.bi_sector shifted left by SECTOR_SHIFT
 *   len           - bio->bi_iter.bi_size
 *   opf           - bio_op(bio)
 *   devid, offset, stripe_nr - copied from @trace_info
 *   total_stripes - rbio->bioc->num_stripes
 *   real_stripes  - rbio->real_stripes
 *   nr_data       - rbio->nr_data
 *
 * NOTE(review): devid and offset are unsigned fields printed with signed
 * conversions (%lld / %d); presumably so an "unknown" value of -1 set by the
 * caller stays readable - confirm against the code filling @trace_info.
 */
DECLARE_EVENT_CLASS(btrfs_raid56_bio,

        TP_PROTO(const struct btrfs_raid_bio *rbio,
                 const struct bio *bio,
                 const struct raid56_bio_trace_info *trace_info),

        TP_ARGS(rbio, bio, trace_info),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        full_stripe        )
                __field(        u64,        physical        )
                __field(        u64,        devid                )
                __field(        u32,        offset                )
                __field(        u32,        len                )
                __field(        u8,        opf                )
                __field(        u8,        total_stripes        )
                __field(        u8,        real_stripes        )
                __field(        u8,        nr_data                )
                __field(        u8,        stripe_nr        )
        ),

        TP_fast_assign_btrfs(rbio->bioc->fs_info,
                __entry->full_stripe        = rbio->bioc->full_stripe_logical;
                __entry->physical        = bio->bi_iter.bi_sector << SECTOR_SHIFT;
                __entry->len                = bio->bi_iter.bi_size;
                __entry->opf                = bio_op(bio);
                __entry->devid                = trace_info->devid;
                __entry->offset                = trace_info->offset;
                __entry->stripe_nr        = trace_info->stripe_nr;
                __entry->total_stripes        = rbio->bioc->num_stripes;
                __entry->real_stripes        = rbio->real_stripes;
                __entry->nr_data        = rbio->nr_data;
        ),
        /*
         * The "type=%s%d" output is a name followed by a number:
         *   stripe_nr < nr_data       -> "DATA" and stripe_nr + 1
         *                                ("DATA1" is the first data stripe,
         *                                "DATA2" the second, ...)
         *   stripe_nr < real_stripes  -> "PQ" and stripe_nr - nr_data + 1
         *                                ("PQ1" is the P stripe, "PQ2" the
         *                                Q stripe)
         *   otherwise                 -> "REPLACE" and 0
         *                                ("REPLACE0", the replace target
         *                                device)
         */
        TP_printk_btrfs(
"full_stripe=%llu devid=%lld type=%s%d offset=%d opf=0x%x physical=%llu len=%u",
                __entry->full_stripe, __entry->devid,
                (__entry->stripe_nr < __entry->nr_data) ? "DATA" :
                        ((__entry->stripe_nr < __entry->real_stripes) ? "PQ" :
                         "REPLACE"),
                (__entry->stripe_nr < __entry->nr_data) ?
                        (__entry->stripe_nr + 1) :
                        ((__entry->stripe_nr < __entry->real_stripes) ?
                         (__entry->stripe_nr - __entry->nr_data + 1) : 0),
                __entry->offset, __entry->opf, __entry->physical, __entry->len)
);

/*
 * Tracepoint raid56_read: an instance of the btrfs_raid56_bio class.
 * NOTE(review): by its name, emitted for read bios - confirm at call sites.
 */
DEFINE_EVENT(btrfs_raid56_bio, raid56_read,
        TP_PROTO(const struct btrfs_raid_bio *rbio,
                 const struct bio *bio,
                 const struct raid56_bio_trace_info *trace_info),

        TP_ARGS(rbio, bio, trace_info)
);

/*
 * Tracepoint raid56_write: an instance of the btrfs_raid56_bio class.
 * NOTE(review): by its name, emitted for write bios - confirm at call sites.
 */
DEFINE_EVENT(btrfs_raid56_bio, raid56_write,
        TP_PROTO(const struct btrfs_raid_bio *rbio,
                 const struct bio *bio,
                 const struct raid56_bio_trace_info *trace_info),

        TP_ARGS(rbio, bio, trace_info)
);

/*
 * Tracepoint btrfs_insert_one_raid_extent: records the logical address,
 * length and number of stripes passed by the caller, unchanged.
 */
TRACE_EVENT(btrfs_insert_one_raid_extent,

        TP_PROTO(const struct btrfs_fs_info *fs_info, u64 logical, u64 length,
                 int num_stripes),

        TP_ARGS(fs_info, logical, length, num_stripes),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        logical                )
                __field(        u64,        length                )
                __field(        int,        num_stripes        )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->logical        = logical;
                __entry->length                = length;
                __entry->num_stripes        = num_stripes;
        ),

        TP_printk_btrfs("logical=%llu length=%llu num_stripes=%d",
                        __entry->logical, __entry->length,
                        __entry->num_stripes)
);

/*
 * Tracepoint btrfs_raid_extent_delete: records the requested [start, end]
 * values together with the found_start/found_end values passed by the
 * caller, unchanged.
 */
TRACE_EVENT(btrfs_raid_extent_delete,

        TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 end,
                 u64 found_start, u64 found_end),

        TP_ARGS(fs_info, start, end, found_start, found_end),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        start                )
                __field(        u64,        end                )
                __field(        u64,        found_start        )
                __field(        u64,        found_end        )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->start                = start;
                __entry->end                = end;
                __entry->found_start        = found_start;
                __entry->found_end        = found_end;
        ),

        TP_printk_btrfs("start=%llu end=%llu found_start=%llu found_end=%llu",
                        __entry->start, __entry->end, __entry->found_start,
                        __entry->found_end)
);

/*
 * Tracepoint btrfs_get_raid_extent_offset: records the logical address,
 * length, physical address and device id passed by the caller, unchanged.
 */
TRACE_EVENT(btrfs_get_raid_extent_offset,

        TP_PROTO(const struct btrfs_fs_info *fs_info, u64 logical, u64 length,
                 u64 physical, u64 devid),

        TP_ARGS(fs_info, logical, length, physical, devid),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        logical                )
                __field(        u64,        length                )
                __field(        u64,        physical        )
                __field(        u64,        devid                )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->logical        = logical;
                __entry->length                = length;
                __entry->physical        = physical;
                __entry->devid                = devid;
        ),

        TP_printk_btrfs("logical=%llu length=%llu physical=%llu devid=%llu",
                        __entry->logical, __entry->length, __entry->physical,
                        __entry->devid)
);

/*
 * Tracepoint btrfs_extent_map_shrinker_count: records the single count @nr
 * passed by the caller.
 */
TRACE_EVENT(btrfs_extent_map_shrinker_count,

        TP_PROTO(const struct btrfs_fs_info *fs_info, long nr),

        TP_ARGS(fs_info, nr),

        TP_STRUCT__entry_btrfs(
                __field(        long,        nr        )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->nr                = nr;
        ),

        TP_printk_btrfs("nr=%ld", __entry->nr)
);

/*
 * Tracepoint btrfs_extent_map_shrinker_scan_enter: records @nr_to_scan and
 * @nr from the caller, plus the shrinker position read from @fs_info
 * (extent_map_shrinker_last_root and extent_map_shrinker_last_ino) at the
 * time the event fires.
 *
 * NOTE(review): show_root_type() is defined outside this chunk; it is
 * presumably a macro yielding both the root id and its symbolic name, which
 * is what the "%llu(%s)" pair in the format expects - confirm.
 */
TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,

        TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr),

        TP_ARGS(fs_info, nr_to_scan, nr),

        TP_STRUCT__entry_btrfs(
                __field(        long,        nr_to_scan        )
                __field(        long,        nr                )
                __field(        u64,        last_root_id        )
                __field(        u64,        last_ino        )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->nr_to_scan        = nr_to_scan;
                __entry->nr                = nr;
                __entry->last_root_id        = fs_info->extent_map_shrinker_last_root;
                __entry->last_ino        = fs_info->extent_map_shrinker_last_ino;
        ),

        TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
                        __entry->nr_to_scan, __entry->nr,
                        show_root_type(__entry->last_root_id), __entry->last_ino)
);

/*
 * Tracepoint btrfs_extent_map_shrinker_scan_exit: records @nr_dropped and
 * @nr from the caller, plus the shrinker position read from @fs_info
 * (extent_map_shrinker_last_root and extent_map_shrinker_last_ino) at the
 * time the event fires.
 *
 * NOTE(review): show_root_type() is defined outside this chunk; it is
 * presumably a macro yielding both the root id and its symbolic name, which
 * is what the "%llu(%s)" pair in the format expects - confirm.
 */
TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,

        TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr),

        TP_ARGS(fs_info, nr_dropped, nr),

        TP_STRUCT__entry_btrfs(
                __field(        long,        nr_dropped        )
                __field(        long,        nr                )
                __field(        u64,        last_root_id        )
                __field(        u64,        last_ino        )
        ),

        TP_fast_assign_btrfs(fs_info,
                __entry->nr_dropped        = nr_dropped;
                __entry->nr                = nr;
                __entry->last_root_id        = fs_info->extent_map_shrinker_last_root;
                __entry->last_ino        = fs_info->extent_map_shrinker_last_ino;
        ),

        TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
                        __entry->nr_dropped, __entry->nr,
                        show_root_type(__entry->last_root_id), __entry->last_ino)
);

/*
 * Tracepoint btrfs_extent_map_shrinker_remove_em: describes one extent map
 * of an inode.
 *
 * Recorded fields:
 *   ino         - btrfs_ino(inode)
 *   root_id     - inode->root->root_key.objectid
 *   start, len, block_start, flags - copied from @em
 *
 * NOTE(review): show_root_type(), show_map_type() and show_map_flags() are
 * defined outside this chunk. The first two are presumably macros that yield
 * a value plus its symbolic name, matching the "%llu(%s)" pairs in the
 * format - confirm.
 */
TRACE_EVENT(btrfs_extent_map_shrinker_remove_em,

        TP_PROTO(const struct btrfs_inode *inode, const struct extent_map *em),

        TP_ARGS(inode, em),

        TP_STRUCT__entry_btrfs(
                __field(        u64,        ino                )
                __field(        u64,        root_id                )
                __field(        u64,        start                )
                __field(        u64,        len                )
                __field(        u64,        block_start        )
                __field(        u32,        flags                )
        ),

        TP_fast_assign_btrfs(inode->root->fs_info,
                __entry->ino                = btrfs_ino(inode);
                __entry->root_id        = inode->root->root_key.objectid;
                __entry->start                = em->start;
                __entry->len                = em->len;
                __entry->block_start        = em->block_start;
                __entry->flags                = em->flags;
        ),

        TP_printk_btrfs(
"ino=%llu root=%llu(%s) start=%llu len=%llu block_start=%llu(%s) flags=%s",
                        __entry->ino, show_root_type(__entry->root_id),
                        __entry->start, __entry->len,
                        show_map_type(__entry->block_start),
                        show_map_flags(__entry->flags))
);

#endif /* _TRACE_BTRFS_H */

/* This part must be outside protection */
#include <trace/define_trace.h>


















































































































































































































































































































































































































































































































































































    1 





    1 




























































































































































































































    1 












    1 









    1 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 *
 *                 terminology
 *
 * cluster - allocation unit     - 512,1K,2K,4K,...,2M
 * vcn - virtual cluster number  - Offset inside the file in clusters.
 * vbo - virtual byte offset     - Offset inside the file in bytes.
 * lcn - logical cluster number  - 0 based cluster in clusters heap.
 * lbo - logical byte offset     - Absolute position inside volume.
 * run - maps VCN to LCN         - Stored in attributes in packed form.
 * attr - attribute segment      - std/name/data etc records inside MFT.
 * mi  - MFT inode               - One MFT record(usually 1024 bytes or 4K), consists of attributes.
 * ni  - NTFS inode              - Extends linux inode. consists of one or more mft inodes.
 * index - unit inside directory - 2K, 4K, <=page size, does not depend on cluster size.
 *
 * WSL - Windows Subsystem for Linux
 * https://docs.microsoft.com/en-us/windows/wsl/file-permissions
 * It stores uid/gid/mode/dev in xattr
 *
 * NTFS allows up to 2^64 clusters per volume,
 * so a 64-bit lcn is needed to address every cluster of such a volume.
 * The ntfs.sys implementation uses only a 32-bit lcn.
 * By default ntfs3 uses a 32-bit lcn too.
 * ntfs3 built with CONFIG_NTFS3_64BIT_CLUSTER (ntfs3_64) uses a 64-bit lcn.
 *
 *
 *     ntfs limits, cluster size is 4K (2^12)
 * -----------------------------------------------------------------------------
 * | Volume size   | Clusters | ntfs.sys | ntfs3  | ntfs3_64 | mkntfs | chkdsk |
 * -----------------------------------------------------------------------------
 * | < 16T, 2^44   |  < 2^32  |  yes     |  yes   |   yes    |  yes   |  yes   |
 * | > 16T, 2^44   |  > 2^32  |  no      |  no    |   yes    |  yes   |  yes   |
 * ----------------------------------------------------------|------------------
 *
 * To mount large volumes as ntfs one should use large cluster size (up to 2M)
 * The maximum volume size in this case is 2^32 * 2^21 = 2^53 = 8P
 *
 *     ntfs limits, cluster size is 2M (2^21)
 * -----------------------------------------------------------------------------
 * | < 8P, 2^53    |  < 2^32  |  yes     |  yes   |   yes    |  yes   |  yes   |
 * | > 8P, 2^53    |  > 2^32  |  no      |  no    |   yes    |  yes   |  yes   |
 * ----------------------------------------------------------|------------------
 *
 */

#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/log2.h>
#include <linux/minmax.h>
#include <linux/module.h>
#include <linux/nls.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/statfs.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"
#ifdef CONFIG_NTFS3_LZX_XPRESS
#include "lib/lib.h"
#endif

#ifdef CONFIG_PRINTK
/*
 * ntfs_printk - Trace warnings/notices/errors.
 *
 * Thanks Joe Perches <joe@perches.com> for implementation
 */
void ntfs_printk(const struct super_block *sb, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;
        int level;
        struct ntfs_sb_info *sbi = sb->s_fs_info;

        /* Should we use different ratelimits for warnings/notices/errors? */
        if (!___ratelimit(&sbi->msg_ratelimit, "ntfs3"))
                return;

        va_start(args, fmt);

        level = printk_get_level(fmt);
        vaf.fmt = printk_skip_level(fmt);
        vaf.va = &args;
        printk("%c%cntfs3: %s: %pV\n", KERN_SOH_ASCII, level, sb->s_id, &vaf);

        va_end(args);
}

/*
 * Scratch buffer for the file name printed by ntfs_inode_printk(), and the
 * counter that arbitrates it: the caller whose atomic_dec_and_test() brings
 * the counter to zero uses the static buffer, everyone else falls back to
 * kmalloc(). Every caller increments the counter again when done.
 */
static char s_name_buf[512];
static atomic_t s_name_buf_cnt = ATOMIC_INIT(1); // 1 means 'free s_name_buf'.

/*
 * ntfs_inode_printk
 *
 * Print warnings/notices/errors about inode using name or inode number.
 */
void ntfs_inode_printk(struct inode *inode, const char *fmt, ...)
{
        struct super_block *sb = inode->i_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        char *name;
        va_list args;
        struct va_format vaf;
        int level;

        if (!___ratelimit(&sbi->msg_ratelimit, "ntfs3"))
                return;

        /* Use static allocated buffer, if possible. */
        name = atomic_dec_and_test(&s_name_buf_cnt) ?
                       s_name_buf :
                       kmalloc(sizeof(s_name_buf), GFP_NOFS);

        if (name) {
                struct dentry *de = d_find_alias(inode);

                if (de) {
                        spin_lock(&de->d_lock);
                        snprintf(name, sizeof(s_name_buf), " \"%s\"",
                                 de->d_name.name);
                        spin_unlock(&de->d_lock);
                } else {
                        name[0] = 0;
                }
                dput(de); /* Cocci warns if placed in branch "if (de)" */
        }

        va_start(args, fmt);

        level = printk_get_level(fmt);
        vaf.fmt = printk_skip_level(fmt);
        vaf.va = &args;

        printk("%c%cntfs3: %s: ino=%lx,%s %pV\n", KERN_SOH_ASCII, level,
               sb->s_id, inode->i_ino, name ? name : "", &vaf);

        va_end(args);

        atomic_inc(&s_name_buf_cnt);
        if (name != s_name_buf)
                kfree(name);
}
#endif

/*
 * Shared memory struct.
 *
 * The on-disk ntfs 'upcase' table is created by the ntfs formatter.
 * The 'upcase' table takes 128K bytes of memory.
 * It has to be read into memory when mounting.
 * Several ntfs volumes are likely to use the same 'upcase' table,
 * so it is a good idea to share the in-memory 'upcase' table between
 * different volumes.
 * Unfortunately winxp/vista/win7 use different upcase tables.
 */
static DEFINE_SPINLOCK(s_shared_lock);

/*
 * Table of shared buffers; all accesses in ntfs_set_shared() and
 * ntfs_put_shared() are made with s_shared_lock held.
 */
static struct {
        void *ptr; /* The shared buffer. */
        u32 len;   /* Size of the buffer in bytes. */
        int cnt;   /* Number of users; 0 marks the slot as free. */
} s_shared[8];

/*
 * ntfs_set_shared - Register a buffer in the shared table or reuse a match.
 * @ptr:   Buffer to share.
 * @bytes: Size of @ptr in bytes.
 *
 * If a used slot already holds a buffer of the same size and content, its
 * user count is bumped and that buffer is returned. Otherwise @ptr is stored
 * in a free slot, if there is one.
 *
 * Return:
 * * @ptr  - If @ptr itself was saved in shared memory.
 * * other - Pointer to an identical buffer that was already shared.
 * * NULL  - If nothing matched and no slot was free (@ptr is not shared).
 */
void *ntfs_set_shared(void *ptr, u32 bytes)
{
        void *shared = NULL;
        int free_slot = -1;
        int idx;

        spin_lock(&s_shared_lock);
        for (idx = 0; idx < ARRAY_SIZE(s_shared); idx++) {
                if (!s_shared[idx].cnt) {
                        /* Remember a free slot but keep looking for a match. */
                        free_slot = idx;
                        continue;
                }

                if (bytes != s_shared[idx].len)
                        continue;

                if (memcmp(s_shared[idx].ptr, ptr, bytes))
                        continue;

                s_shared[idx].cnt += 1;
                shared = s_shared[idx].ptr;
                break;
        }

        if (!shared && free_slot != -1) {
                s_shared[free_slot].ptr = ptr;
                s_shared[free_slot].len = bytes;
                s_shared[free_slot].cnt = 1;
                shared = ptr;
        }
        spin_unlock(&s_shared_lock);

        return shared;
}

/*
 * ntfs_put_shared
 *
 * Return:
 * * @ptr - If pointer is not shared anymore.
 * * NULL - If pointer is still shared.
 */
void *ntfs_put_shared(void *ptr)
{
        void *ret = ptr;
        int i;

        spin_lock(&s_shared_lock);
        for (i = 0; i < ARRAY_SIZE(s_shared); i++) {
                if (s_shared[i].cnt && s_shared[i].ptr == ptr) {
                        if (--s_shared[i].cnt)
                                ret = NULL;
                        break;
                }
        }
        spin_unlock(&s_shared_lock);

        return ret;
}

/*
 * put_mount_options - Release a mount options object.
 *
 * Unloads the nls table, frees the nls name string and then the options
 * structure itself. @options must not be NULL (it is dereferenced).
 */
static inline void put_mount_options(struct ntfs_mount_options *options)
{
        unload_nls(options->nls);
        kfree(options->nls_name);

        kfree(options);
}

enum Opt {
        Opt_uid,
        Opt_gid,
        Opt_umask,
        Opt_dmask,
        Opt_fmask,
        Opt_immutable,
        Opt_discard,
        Opt_force,
        Opt_sparse,
        Opt_nohidden,
        Opt_hide_dot_files,
        Opt_windows_names,
        Opt_showmeta,
        Opt_acl,
        Opt_iocharset,
        Opt_prealloc,
        Opt_nocase,
        Opt_err,
};

/*
 * Mount option table handed to fs_parse() by ntfs_fs_parse_param().
 * It maps option names to enum Opt values and to the expected value type:
 * u32, octal u32, string, or a flag. For the flag options the parser
 * reports a negated form through result.negated, which
 * ntfs_fs_parse_param() turns into 0/1.
 */
// clang-format off
static const struct fs_parameter_spec ntfs_fs_parameters[] = {
        fsparam_u32("uid",                        Opt_uid),
        fsparam_u32("gid",                        Opt_gid),
        fsparam_u32oct("umask",                        Opt_umask),
        fsparam_u32oct("dmask",                        Opt_dmask),
        fsparam_u32oct("fmask",                        Opt_fmask),
        fsparam_flag_no("sys_immutable",        Opt_immutable),
        fsparam_flag_no("discard",                Opt_discard),
        fsparam_flag_no("force",                Opt_force),
        fsparam_flag_no("sparse",                Opt_sparse),
        fsparam_flag_no("hidden",                Opt_nohidden),
        fsparam_flag_no("hide_dot_files",        Opt_hide_dot_files),
        fsparam_flag_no("windows_names",        Opt_windows_names),
        fsparam_flag_no("showmeta",                Opt_showmeta),
        fsparam_flag_no("acl",                        Opt_acl),
        fsparam_string("iocharset",                Opt_iocharset),
        fsparam_flag_no("prealloc",                Opt_prealloc),
        fsparam_flag_no("nocase",                Opt_nocase),
        {} /* Empty entry terminates the table. */
};
// clang-format on

/*
 * ntfs_load_nls - Resolve a charset name to an nls table.
 * @nls: Charset name, or NULL to select CONFIG_NLS_DEFAULT.
 *
 * The parameter would ideally be "const char *nls",
 * but load_nls() accepts "char *".
 *
 * Return:
 * * NULL             - The charset is "utf8" (no table is loaded for it).
 * * table            - The default table when the name equals
 *                      CONFIG_NLS_DEFAULT, otherwise whatever load_nls()
 *                      found.
 * * ERR_PTR(-EINVAL) - load_nls() could not provide the requested table.
 */
static struct nls_table *ntfs_load_nls(char *nls)
{
        struct nls_table *table;

        if (!nls)
                nls = CONFIG_NLS_DEFAULT;

        if (!strcmp(nls, "utf8"))
                return NULL;

        if (!strcmp(nls, CONFIG_NLS_DEFAULT))
                return load_nls_default();

        table = load_nls(nls);
        if (!table)
                return ERR_PTR(-EINVAL);

        return table;
}

/*
 * ntfs_fs_parse_param - Parse a single mount option.
 *
 * @fc:    filesystem context; fc->fs_private is the ntfs_mount_options
 *         being filled in and fc->sb_flags receives SB_POSIXACL.
 * @param: the option to parse against ntfs_fs_parameters.
 *
 * Return: 0 on success, the negative result of fs_parse() if the option
 * is not recognised, or a negative error for a rejected value.
 */
static int ntfs_fs_parse_param(struct fs_context *fc,
                               struct fs_parameter *param)
{
        struct ntfs_mount_options *opts = fc->fs_private;
        struct fs_parse_result result;
        int opt;

        opt = fs_parse(fc, ntfs_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        switch (opt) {
        case Opt_uid:
                opts->fs_uid = make_kuid(current_user_ns(), result.uint_32);
                if (!uid_valid(opts->fs_uid))
                        return invalf(fc, "ntfs3: Invalid value for uid.");
                break;
        case Opt_gid:
                opts->fs_gid = make_kgid(current_user_ns(), result.uint_32);
                if (!gid_valid(opts->fs_gid))
                        return invalf(fc, "ntfs3: Invalid value for gid.");
                break;
        case Opt_umask:
                /* umask applies to both files and directories. */
                if (result.uint_32 & ~07777)
                        return invalf(fc, "ntfs3: Invalid value for umask.");
                opts->fs_fmask_inv = ~result.uint_32;
                opts->fs_dmask_inv = ~result.uint_32;
                opts->fmask = 1;
                opts->dmask = 1;
                break;
        case Opt_dmask:
                if (result.uint_32 & ~07777)
                        return invalf(fc, "ntfs3: Invalid value for dmask.");
                opts->fs_dmask_inv = ~result.uint_32;
                opts->dmask = 1;
                break;
        case Opt_fmask:
                if (result.uint_32 & ~07777)
                        return invalf(fc, "ntfs3: Invalid value for fmask.");
                opts->fs_fmask_inv = ~result.uint_32;
                opts->fmask = 1;
                break;
        case Opt_immutable:
                opts->sys_immutable = result.negated ? 0 : 1;
                break;
        case Opt_discard:
                opts->discard = result.negated ? 0 : 1;
                break;
        case Opt_force:
                opts->force = result.negated ? 0 : 1;
                break;
        case Opt_sparse:
                opts->sparse = result.negated ? 0 : 1;
                break;
        case Opt_nohidden:
                /*
                 * The option is registered as "hidden", so the negated
                 * form ("nohidden") is the one that sets the flag.
                 */
                opts->nohidden = result.negated ? 1 : 0;
                break;
        case Opt_hide_dot_files:
                opts->hide_dot_files = result.negated ? 0 : 1;
                break;
        case Opt_windows_names:
                opts->windows_names = result.negated ? 0 : 1;
                break;
        case Opt_showmeta:
                opts->showmeta = result.negated ? 0 : 1;
                break;
        case Opt_acl:
                if (!result.negated)
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
                        fc->sb_flags |= SB_POSIXACL;
#else
                        return invalf(
                                fc, "ntfs3: Support for ACL not compiled in!");
#endif
                else
                        fc->sb_flags &= ~SB_POSIXACL;
                break;
        case Opt_iocharset:
                /* Take ownership of the string; drop any earlier value. */
                kfree(opts->nls_name);
                opts->nls_name = param->string;
                param->string = NULL;
                break;
        case Opt_prealloc:
                opts->prealloc = result.negated ? 0 : 1;
                break;
        case Opt_nocase:
                /*
                 * The option is registered under the name "nocase", so
                 * the plain form must set the flag.  It was inverted
                 * before: "nocase" cleared opts->nocase and only
                 * "nonocase" set it, which disagreed with what
                 * ntfs_show_options() prints.
                 */
                opts->nocase = result.negated ? 0 : 1;
                break;
        default:
                /* Should not be here unless we forget add case. */
                return -EINVAL;
        }
        return 0;
}

static int ntfs_fs_reconfigure(struct fs_context *fc)
{
        struct super_block *sb = fc->root->d_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        struct ntfs_mount_options *new_opts = fc->fs_private;
        int ro_rw;

        /* If ntfs3 is used as legacy ntfs enforce read-only mode. */
        if (is_legacy_ntfs(sb)) {
                fc->sb_flags |= SB_RDONLY;
                goto out;
        }

        ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY);
        if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) {
                errorf(fc,
                       "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n");
                return -EINVAL;
        }

        new_opts->nls = ntfs_load_nls(new_opts->nls_name);
        if (IS_ERR(new_opts->nls)) {
                new_opts->nls = NULL;
                errorf(fc, "ntfs3: Cannot load iocharset %s",
                       new_opts->nls_name);
                return -EINVAL;
        }
        if (new_opts->nls != sbi->options->nls)
                return invalf(
                        fc,
                        "ntfs3: Cannot use different iocharset when remounting!");

        if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) &&
            !new_opts->force) {
                errorf(fc,
                       "ntfs3: Volume is dirty and \"force\" flag is not set!");
                return -EINVAL;
        }

out:
        sync_filesystem(sb);
        swap(sbi->options, fc->fs_private);

        return 0;
}

#ifdef CONFIG_PROC_FS
/*
 * Parent proc directory of the per-volume entries; ntfs_put_super()
 * removes the entry named sb->s_id from it.
 */
static struct proc_dir_entry *proc_info_root;

/*
 * ntfs3_volinfo - seq_file show callback for /proc/fs/ntfs3/<dev>/volinfo.
 *
 * @m: seq_file whose private pointer is the super_block.
 * @o: unused.
 *
 * Prints one value per line:
 *   ntfs<major>.<minor>
 *   cluster size
 *   number of clusters
 *   total number of mft records
 *   number of used mft records ~= number of files + folders
 *   real state of ntfs "dirty"/"clean"
 *   current state of ntfs "dirty"/"clean"
 *
 * Return: always 0.
 */
static int ntfs3_volinfo(struct seq_file *m, void *o)
{
        struct super_block *sb = m->private;
        struct ntfs_sb_info *sbi = sb->s_fs_info;

        /*
         * The format used to contain "\%zu".  "\%" is not a valid C
         * escape sequence, so the stray backslash has been dropped.
         */
        seq_printf(m, "ntfs%d.%d\n%u\n%zu\n%zu\n%zu\n%s\n%s\n",
                   sbi->volume.major_ver, sbi->volume.minor_ver,
                   sbi->cluster_size, sbi->used.bitmap.nbits,
                   sbi->mft.bitmap.nbits,
                   sbi->mft.bitmap.nbits - wnd_zeroes(&sbi->mft.bitmap),
                   sbi->volume.real_dirty ? "dirty" : "clean",
                   (sbi->volume.flags & VOLUME_FLAG_DIRTY) ? "dirty" : "clean");

        return 0;
}

/* Open handler for the volinfo proc entry: binds ntfs3_volinfo() to the file. */
static int ntfs3_volinfo_open(struct inode *inode, struct file *file)
{
        void *data = pde_data(inode);

        return single_open(file, ntfs3_volinfo, data);
}

/*
 * ntfs3_label_show - seq_file show callback for reading
 * /proc/fs/ntfs3/<dev>/label.  Prints the cached volume label followed
 * by a newline.  Always returns 0.
 */
static int ntfs3_label_show(struct seq_file *m, void *o)
{
        struct super_block *sb = m->private;
        struct ntfs_sb_info *info = sb->s_fs_info;

        seq_printf(m, "%s\n", info->volume.label);
        return 0;
}

/*
 * ntfs3_label_write - Handle a write to /proc/fs/ntfs3/<dev>/label.
 *
 * Copies @count bytes from user space, strips trailing newlines and
 * passes the result to ntfs_set_label().
 *
 * Return: @count on success, -EROFS if the superblock is read-only,
 * -ENOMEM or -EFAULT on copy problems, or the negative error from
 * ntfs_set_label().
 */
static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer,
                                 size_t count, loff_t *ppos)
{
        struct super_block *sb = pde_data(file_inode(file));
        ssize_t len = count;
        u8 *buf;
        int err;

        if (sb_rdonly(sb))
                return -EROFS;

        buf = kmalloc(count, GFP_NOFS);
        if (!buf)
                return -ENOMEM;

        if (copy_from_user(buf, buffer, len)) {
                len = -EFAULT;
                goto free_buf;
        }

        /* Drop any trailing newlines (e.g. from "echo"). */
        for (; len > 0 && buf[len - 1] == '\n'; len--)
                ;

        err = ntfs_set_label(sb->s_fs_info, buf, len);
        if (err < 0) {
                ntfs_err(sb, "failed (%d) to write label", err);
                len = err;
                goto free_buf;
        }

        /* Report the whole input as consumed, newlines included. */
        *ppos += count;
        len = count;

free_buf:
        kfree(buf);
        return len;
}

/* Open handler for the label proc entry: binds ntfs3_label_show() to the file. */
static int ntfs3_label_open(struct inode *inode, struct file *file)
{
        void *data = pde_data(inode);

        return single_open(file, ntfs3_label_show, data);
}

/* proc operations of the read-only "volinfo" entry. */
static const struct proc_ops ntfs3_volinfo_fops = {
        .proc_open = ntfs3_volinfo_open,
        .proc_read = seq_read,
        .proc_lseek = seq_lseek,
        .proc_release = single_release,
};

/* proc operations of the readable and writable "label" entry. */
static const struct proc_ops ntfs3_label_fops = {
        .proc_open = ntfs3_label_open,
        .proc_read = seq_read,
        .proc_write = ntfs3_label_write,
        .proc_lseek = seq_lseek,
        .proc_release = single_release,
};

#endif

/* Cache that ntfs_alloc_inode()/ntfs_free_inode() use for struct ntfs_inode. */
static struct kmem_cache *ntfs_inode_cachep;

/*
 * ntfs_alloc_inode - super_operations::alloc_inode
 *
 * Allocates an ntfs_inode from ntfs_inode_cachep, zeroes everything
 * that precedes the embedded vfs_inode and initialises ni_lock.
 *
 * Return: the embedded VFS inode, or NULL if the allocation failed.
 */
static struct inode *ntfs_alloc_inode(struct super_block *sb)
{
        struct ntfs_inode *ni;

        ni = alloc_inode_sb(sb, ntfs_inode_cachep, GFP_NOFS);
        if (ni) {
                /* Only the ntfs-private part is cleared here. */
                memset(ni, 0, offsetof(struct ntfs_inode, vfs_inode));
                mutex_init(&ni->ni_lock);
                return &ni->vfs_inode;
        }

        return NULL;
}

/*
 * ntfs_free_inode - super_operations::free_inode
 *
 * Destroys ni_lock and returns the containing ntfs_inode to the cache.
 */
static void ntfs_free_inode(struct inode *inode)
{
        struct ntfs_inode *owner = ntfs_i(inode);

        mutex_destroy(&owner->ni_lock);
        kmem_cache_free(ntfs_inode_cachep, owner);
}

/* One-time initialiser for an ntfs_inode object: sets up its embedded VFS inode. */
static void init_once(void *foo)
{
        inode_init_once(&((struct ntfs_inode *)foo)->vfs_inode);
}

/*
 * Noinline to reduce binary size.
 */
static noinline void ntfs3_put_sbi(struct ntfs_sb_info *sbi)
{
        wnd_close(&sbi->mft.bitmap);
        wnd_close(&sbi->used.bitmap);

        if (sbi->mft.ni) {
                iput(&sbi->mft.ni->vfs_inode);
                sbi->mft.ni = NULL;
        }

        if (sbi->security.ni) {
                iput(&sbi->security.ni->vfs_inode);
                sbi->security.ni = NULL;
        }

        if (sbi->reparse.ni) {
                iput(&sbi->reparse.ni->vfs_inode);
                sbi->reparse.ni = NULL;
        }

        if (sbi->objid.ni) {
                iput(&sbi->objid.ni->vfs_inode);
                sbi->objid.ni = NULL;
        }

        if (sbi->volume.ni) {
                iput(&sbi->volume.ni->vfs_inode);
                sbi->volume.ni = NULL;
        }

        ntfs_update_mftmirr(sbi, 0);

        indx_clear(&sbi->security.index_sii);
        indx_clear(&sbi->security.index_sdh);
        indx_clear(&sbi->reparse.index_r);
        indx_clear(&sbi->objid.index_o);
}

/*
 * ntfs3_free_sbi - Free the buffers owned by @sbi and then @sbi itself.
 *
 * The upcase table goes through ntfs_put_shared() first and whatever
 * that returns is passed to kvfree().
 */
static void ntfs3_free_sbi(struct ntfs_sb_info *sbi)
{
        void *upcase;

        kfree(sbi->new_rec);

        upcase = ntfs_put_shared(sbi->upcase);
        kvfree(upcase);

        kvfree(sbi->def_table);
        kfree(sbi->compress.lznt);
#ifdef CONFIG_NTFS3_LZX_XPRESS
        xpress_free_decompressor(sbi->compress.xpress);
        lzx_free_decompressor(sbi->compress.lzx);
#endif
        /* Must stay last: everything above dereferences sbi. */
        kfree(sbi);
}

/*
 * ntfs_put_super - super_operations::put_super
 *
 * Removes the per-volume proc entries (if any), tries to mark the
 * volume clean and releases what the superblock info holds.
 */
static void ntfs_put_super(struct super_block *sb)
{
        struct ntfs_sb_info *sbi = sb->s_fs_info;

#ifdef CONFIG_PROC_FS
        /* Tear down /proc/fs/ntfs3/<dev>/ and the files inside it. */
        if (sbi->procdir) {
                remove_proc_entry("label", sbi->procdir);
                remove_proc_entry("volinfo", sbi->procdir);
                remove_proc_entry(sb->s_id, proc_info_root);
                sbi->procdir = NULL;
        }
#endif

        /* Mark rw ntfs as clear, if possible. */
        ntfs_set_state(sbi, NTFS_DIRTY_CLEAR);

        ntfs3_put_sbi(sbi);
}

/*
 * ntfs_statfs - super_operations::statfs
 *
 * Reports sizes in clusters: block size is the cluster size, block
 * count is the number of bits in the cluster bitmap and free space is
 * the number of zero bits in it.  The fsid is built from the volume
 * serial number.  Always returns 0.
 */
static int ntfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct super_block *sb = dentry->d_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        struct wnd_bitmap *clusters = &sbi->used.bitmap;

        buf->f_type = sb->s_magic;
        buf->f_bsize = sbi->cluster_size;
        buf->f_blocks = clusters->nbits;

        /* Free and available space are reported as the same value. */
        buf->f_bavail = wnd_zeroes(clusters);
        buf->f_bfree = buf->f_bavail;

        /* Low and high halves of the serial number. */
        buf->f_fsid.val[0] = sbi->volume.ser_num;
        buf->f_fsid.val[1] = (sbi->volume.ser_num >> 32);

        buf->f_namelen = NTFS_NAME_LEN;

        return 0;
}

/*
 * ntfs_show_options - super_operations::show_options
 *
 * Emits the active mount options as ",name" / ",name=value" items.
 * uid, gid and iocharset are always printed; the rest only when set.
 * Always returns 0.
 */
static int ntfs_show_options(struct seq_file *m, struct dentry *root)
{
        struct super_block *sb = root->d_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        struct ntfs_mount_options *o = sbi->options;
        struct user_namespace *ns = seq_user_ns(m);

        /* Ownership, translated into the reader's user namespace. */
        seq_printf(m, ",uid=%u", from_kuid_munged(ns, o->fs_uid));
        seq_printf(m, ",gid=%u", from_kgid_munged(ns, o->fs_gid));

        /* Masks are stored inverted; flip them back for display. */
        if (o->dmask)
                seq_printf(m, ",dmask=%04o", o->fs_dmask_inv ^ 0xffff);
        if (o->fmask)
                seq_printf(m, ",fmask=%04o", o->fs_fmask_inv ^ 0xffff);

        /* Boolean flags. */
        if (o->sys_immutable)
                seq_puts(m, ",sys_immutable");
        if (o->discard)
                seq_puts(m, ",discard");
        if (o->force)
                seq_puts(m, ",force");
        if (o->sparse)
                seq_puts(m, ",sparse");
        if (o->nohidden)
                seq_puts(m, ",nohidden");
        if (o->hide_dot_files)
                seq_puts(m, ",hide_dot_files");
        if (o->windows_names)
                seq_puts(m, ",windows_names");
        if (o->showmeta)
                seq_puts(m, ",showmeta");
        if (sb->s_flags & SB_POSIXACL)
                seq_puts(m, ",acl");

        /* A missing nls table stands for utf8. */
        if (!o->nls)
                seq_puts(m, ",iocharset=utf8");
        else
                seq_printf(m, ",iocharset=%s", o->nls->charset);

        if (o->prealloc)
                seq_puts(m, ",prealloc");
        if (o->nocase)
                seq_puts(m, ",nocase");

        return 0;
}

/*
 * ntfs_shutdown - super_operations::shutdown
 */
static void ntfs_shutdown(struct super_block *sb)
{
        set_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags);
}

/*
 * ntfs_sync_fs - super_operations::sync_fs
 */
static int ntfs_sync_fs(struct super_block *sb, int wait)
{
        int err = 0, err2;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        struct ntfs_inode *ni;
        struct inode *inode;

        if (unlikely(ntfs3_forced_shutdown(sb)))
                return -EIO;

        ni = sbi->security.ni;
        if (ni) {
                inode = &ni->vfs_inode;
                err2 = _ni_write_inode(inode, wait);
                if (err2 && !err)
                        err = err2;
        }

        ni = sbi->objid.ni;
        if (ni) {
                inode = &ni->vfs_inode;
                err2 = _ni_write_inode(inode, wait);
                if (err2 && !err)
                        err = err2;
        }

        ni = sbi->reparse.ni;
        if (ni) {
                inode = &ni->vfs_inode;
                err2 = _ni_write_inode(inode, wait);
                if (err2 && !err)
                        err = err2;
        }

        if (!err)
                ntfs_set_state(sbi, NTFS_DIRTY_CLEAR);

        ntfs_update_mftmirr(sbi, wait);

        return err;
}

/* Superblock operations; installed as sb->s_op when the superblock is filled. */
static const struct super_operations ntfs_sops = {
        /* Inode lifetime. */
        .alloc_inode = ntfs_alloc_inode,
        .free_inode = ntfs_free_inode,
        .write_inode = ntfs3_write_inode,
        .evict_inode = ntfs_evict_inode,
        /* Superblock lifetime and state. */
        .put_super = ntfs_put_super,
        .sync_fs = ntfs_sync_fs,
        .shutdown = ntfs_shutdown,
        /* Reporting. */
        .statfs = ntfs_statfs,
        .show_options = ntfs_show_options,
};

/*
 * ntfs_export_get_inode - Look up an inode for a file handle.
 *
 * Builds an MFT reference from @ino and @generation and loads it with
 * ntfs_iget5().
 *
 * Return: the inode, the error pointer from ntfs_iget5(), or
 * ERR_PTR(-ESTALE) if the inode that was found is bad.
 */
static struct inode *ntfs_export_get_inode(struct super_block *sb, u64 ino,
                                           u32 generation)
{
        struct inode *found;
        struct MFT_REF ref = {
                .low = cpu_to_le32(ino),
#ifdef CONFIG_NTFS3_64BIT_CLUSTER
                .high = cpu_to_le16(ino >> 32),
#else
                .high = 0,
#endif
                .seq = cpu_to_le16(generation),
        };

        found = ntfs_iget5(sb, &ref, NULL);
        if (IS_ERR(found) || !is_bad_inode(found))
                return found;

        /* A bad inode is reported as a stale handle. */
        iput(found);
        return ERR_PTR(-ESTALE);
}

/* export_operations::fh_to_dentry - generic helper with the ntfs inode lookup. */
static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
                                        int fh_len, int fh_type)
{
        struct dentry *de;

        de = generic_fh_to_dentry(sb, fid, fh_len, fh_type,
                                  ntfs_export_get_inode);
        return de;
}

/* export_operations::fh_to_parent - generic helper with the ntfs inode lookup. */
static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
                                        int fh_len, int fh_type)
{
        struct dentry *de;

        de = generic_fh_to_parent(sb, fid, fh_len, fh_type,
                                  ntfs_export_get_inode);
        return de;
}

/*
 * export_operations::commit_metadata - write the inode with the "sync"
 * argument set.
 *
 * TODO: == ntfs_sync_inode
 */
static int ntfs_nfs_commit_metadata(struct inode *inode)
{
        const int sync = 1;

        return _ni_write_inode(inode, sync);
}

/* Export operations; installed as sb->s_export_op when the superblock is filled. */
static const struct export_operations ntfs_export_ops = {
        .encode_fh = generic_encode_ino32_fh,
        .fh_to_parent = ntfs_fh_to_parent,
        .fh_to_dentry = ntfs_fh_to_dentry,
        .get_parent = ntfs3_get_parent,
        .commit_metadata = ntfs_nfs_commit_metadata,
};

/*
 * format_size_gb - Split a byte count for printing with "%u.%02u Gb".
 *
 * @bytes: size in bytes.
 * @mb:    receives the fractional part in hundredths of a Gb, rounded
 *         and capped at 99.
 *
 * Return: the whole number of Gb (bytes >> 30, truncated to 32 bits).
 */
static u32 format_size_gb(const u64 bytes, u32 *mb)
{
        /* Work in Kb so that each half fits into 32 bits. */
        const u64 kb = bytes >> 10;
        const u32 kb_lo = (u32)kb;
        const u32 kb_hi = (u32)(kb >> 32);
        u32 hundredths;

        /* The low 20 bits of the Kb count are the fraction of a Gb. */
        hundredths = (100 * (kb_lo & 0xfffff) + 0x7ffff) >> 20;
        if (hundredths >= 100)
                hundredths = 99;
        *mb = hundredths;

        /* Stitch together the two halves of (kb >> 20). */
        return (kb_hi << 12) | (kb_lo >> 20);
}

/*
 * true_sectors_per_clst - Decode the boot sector's sectors-per-cluster byte.
 *
 * Values up to 0x80 are the sector count itself.  Values from 0xf4 up
 * encode the count as a power of two: the shift is the negated value
 * read as a signed byte.  Anything in between is invalid.
 *
 * Return: the sector count, or -EINVAL converted to u32; callers must
 * check the result as a signed value.
 */
static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot)
{
        /* Neither a direct count nor an allowed shift encoding. */
        if (boot->sectors_per_clusters > 0x80 &&
            boot->sectors_per_clusters < 0xf4)
                return -EINVAL;

        if (boot->sectors_per_clusters <= 0x80)
                return boot->sectors_per_clusters;

        /* limit shift to 2MB max */
        return 1U << (-(s8)boot->sectors_per_clusters);
}

/*
 * ntfs_init_from_boot - Init internal info from on-disk boot sector.
 *
 * @sb:          superblock being mounted; sb->s_fs_info is filled in.
 * @sector_size: media sector size passed in by the caller.
 * @dev_size:    device size in bytes.
 * @boot2:       set to a copy of the boot sector when the boot that was
 *               accepted is not at block 0 and the mount is not
 *               read-only (the copy may be NULL if kmemdup() fails).
 *
 * NTFS mount begins from boot - special formatted 512 bytes.
 * There are two boots: the first and the last 512 bytes of volume.
 * The content of boot is not changed during ntfs life.
 *
 * The primary boot is tried first. If it fails validation with -EINVAL,
 * the function retries once with the boot located at the end of the
 * device ("Alternative boot").
 *
 * NOTE: ntfs.sys checks only first (primary) boot.
 * chkdsk checks both boots.
 *
 * Return: 0 on success, -EIO if the primary boot block cannot be read,
 * -ENOMEM if the template MFT record cannot be allocated, -EINVAL for
 * any validation failure.
 */
static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
                               u64 dev_size, struct NTFS_BOOT **boot2)
{
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        int err;
        u32 mb, gb, boot_sector_size, sct_per_clst, record_size;
        u64 sectors, clusters, mlcn, mlcn2, dev_size0;
        struct NTFS_BOOT *boot;
        struct buffer_head *bh;
        struct MFT_REC *rec;
        u16 fn, ao;
        u8 cluster_bits;
        u32 boot_off = 0;
        sector_t boot_block = 0;
        const char *hint = "Primary boot";

        /* Save original dev_size. Used with alternative boot. */
        dev_size0 = dev_size;

        /* Provisional value; recomputed below once the block size is known. */
        sbi->volume.blocks = dev_size >> PAGE_SHIFT;

read_boot:
        bh = ntfs_bread(sb, boot_block);
        if (!bh)
                return boot_block ? -EINVAL : -EIO;

        /* Every "goto out" below is a validation failure unless err is changed. */
        err = -EINVAL;

        /* Corrupted image; do not read OOB */
        if (bh->b_size - sizeof(*boot) < boot_off)
                goto out;

        boot = (struct NTFS_BOOT *)Add2Ptr(bh->b_data, boot_off);

        if (memcmp(boot->system_id, "NTFS    ", sizeof("NTFS    ") - 1)) {
                ntfs_err(sb, "%s signature is not NTFS.", hint);
                goto out;
        }

        /* 0x55AA is not mandatory. Thanks Maxim Suhanov */
        /*if (0x55 != boot->boot_magic[0] || 0xAA != boot->boot_magic[1])
         *        goto out;
         */

        /* bytes_per_sector is stored as two separate bytes, low byte first. */
        boot_sector_size = ((u32)boot->bytes_per_sector[1] << 8) |
                           boot->bytes_per_sector[0];
        if (boot_sector_size < SECTOR_SIZE ||
            !is_power_of_2(boot_sector_size)) {
                ntfs_err(sb, "%s: invalid bytes per sector %u.", hint,
                         boot_sector_size);
                goto out;
        }

        /* cluster size: 512, 1K, 2K, 4K, ... 2M */
        sct_per_clst = true_sectors_per_clst(boot);
        if ((int)sct_per_clst < 0 || !is_power_of_2(sct_per_clst)) {
                ntfs_err(sb, "%s: invalid sectors per cluster %u.", hint,
                         sct_per_clst);
                goto out;
        }

        sbi->cluster_size = boot_sector_size * sct_per_clst;
        sbi->cluster_bits = cluster_bits = blksize_bits(sbi->cluster_size);
        sbi->cluster_mask = sbi->cluster_size - 1;
        sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask;

        mlcn = le64_to_cpu(boot->mft_clst);
        mlcn2 = le64_to_cpu(boot->mft2_clst);
        sectors = le64_to_cpu(boot->sectors_per_volume);

        /* Both $MFT and its mirror must start inside the volume. */
        if (mlcn * sct_per_clst >= sectors || mlcn2 * sct_per_clst >= sectors) {
                ntfs_err(
                        sb,
                        "%s: start of MFT 0x%llx (0x%llx) is out of volume 0x%llx.",
                        hint, mlcn, mlcn2, sectors);
                goto out;
        }

        /*
         * record_size >= 0: size in clusters.
         * record_size <  0: size is 2^(-record_size) bytes.
         */
        if (boot->record_size >= 0) {
                record_size = (u32)boot->record_size << cluster_bits;
        } else if (-boot->record_size <= MAXIMUM_SHIFT_BYTES_PER_MFT) {
                record_size = 1u << (-boot->record_size);
        } else {
                ntfs_err(sb, "%s: invalid record size %d.", hint,
                         boot->record_size);
                goto out;
        }

        /* NOTE(review): these are stored before record_size is validated below. */
        sbi->record_size = record_size;
        sbi->record_bits = blksize_bits(record_size);
        sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes

        /* Check MFT record size. */
        if (record_size < SECTOR_SIZE || !is_power_of_2(record_size)) {
                ntfs_err(sb, "%s: invalid bytes per MFT record %u (%d).", hint,
                         record_size, boot->record_size);
                goto out;
        }

        if (record_size > MAXIMUM_BYTES_PER_MFT) {
                ntfs_err(sb, "Unsupported bytes per MFT record %u.",
                         record_size);
                goto out;
        }

        /* index_size uses the same encoding as record_size. */
        if (boot->index_size >= 0) {
                sbi->index_size = (u32)boot->index_size << cluster_bits;
        } else if (-boot->index_size <= MAXIMUM_SHIFT_BYTES_PER_INDEX) {
                sbi->index_size = 1u << (-boot->index_size);
        } else {
                ntfs_err(sb, "%s: invalid index size %d.", hint,
                         boot->index_size);
                goto out;
        }

        /* Check index record size. */
        if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) {
                ntfs_err(sb, "%s: invalid bytes per index %u(%d).", hint,
                         sbi->index_size, boot->index_size);
                goto out;
        }

        if (sbi->index_size > MAXIMUM_BYTES_PER_INDEX) {
                ntfs_err(sb, "%s: unsupported bytes per index %u.", hint,
                         sbi->index_size);
                goto out;
        }

        sbi->volume.size = sectors * boot_sector_size;

        /* Size including one extra boot sector; used only in messages below. */
        gb = format_size_gb(sbi->volume.size + boot_sector_size, &mb);

        /*
         * - Volume formatted and mounted with the same sector size.
         * - Volume formatted 4K and mounted as 512.
         * - Volume formatted 512 and mounted as 4K.
         */
        if (boot_sector_size != sector_size) {
                ntfs_warn(
                        sb,
                        "Different NTFS sector size (%u) and media sector size (%u).",
                        boot_sector_size, sector_size);
                dev_size += sector_size - 1;
        }

        sbi->mft.lbo = mlcn << cluster_bits;
        sbi->mft.lbo2 = mlcn2 << cluster_bits;

        /* Compare boot's cluster and sector. */
        if (sbi->cluster_size < boot_sector_size) {
                ntfs_err(sb, "%s: invalid bytes per cluster (%u).", hint,
                         sbi->cluster_size);
                goto out;
        }

        /* Compare boot's cluster and media sector. */
        if (sbi->cluster_size < sector_size) {
                /* No way to use ntfs_get_block in this case. */
                ntfs_err(
                        sb,
                        "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u).",
                        sbi->cluster_size, sector_size);
                goto out;
        }

        /*
         * Record size minus the aligned fixup offset, the aligned fixup
         * array (one short per sector) and one aligned attribute type.
         */
        sbi->max_bytes_per_attr =
                record_size - ALIGN(MFTRECORD_FIXUP_OFFSET, 8) -
                ALIGN(((record_size >> SECTOR_SHIFT) * sizeof(short)), 8) -
                ALIGN(sizeof(enum ATTR_TYPE), 8);

        sbi->volume.ser_num = le64_to_cpu(boot->serial_num);

        /* Warning if RAW volume. */
        if (dev_size < sbi->volume.size + boot_sector_size) {
                u32 mb0, gb0;

                gb0 = format_size_gb(dev_size, &mb0);
                ntfs_warn(
                        sb,
                        "RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only.",
                        gb, mb, gb0, mb0);
                sb->s_flags |= SB_RDONLY;
        }

        clusters = sbi->volume.size >> cluster_bits;
#ifndef CONFIG_NTFS3_64BIT_CLUSTER
        /* 32 bits per cluster. */
        if (clusters >> 32) {
                ntfs_notice(
                        sb,
                        "NTFS %u.%02u Gb is too big to use 32 bits per cluster.",
                        gb, mb);
                goto out;
        }
#elif BITS_PER_LONG < 64
#error "CONFIG_NTFS3_64BIT_CLUSTER incompatible in 32 bit OS"
#endif

        sbi->used.bitmap.nbits = clusters;

        rec = kzalloc(record_size, GFP_NOFS);
        if (!rec) {
                err = -ENOMEM;
                goto out;
        }

        /*
         * Fill in a template MFT record: header, fixup array description
         * and a single ATTR_END attribute.
         */
        sbi->new_rec = rec;
        rec->rhdr.sign = NTFS_FILE_SIGNATURE;
        rec->rhdr.fix_off = cpu_to_le16(MFTRECORD_FIXUP_OFFSET);
        fn = (sbi->record_size >> SECTOR_SHIFT) + 1;
        rec->rhdr.fix_num = cpu_to_le16(fn);
        ao = ALIGN(MFTRECORD_FIXUP_OFFSET + sizeof(short) * fn, 8);
        rec->attr_off = cpu_to_le16(ao);
        rec->used = cpu_to_le32(ao + ALIGN(sizeof(enum ATTR_TYPE), 8));
        rec->total = cpu_to_le32(sbi->record_size);
        ((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END;

        /* Block size is the cluster size, capped at PAGE_SIZE. */
        sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE));

        sbi->block_mask = sb->s_blocksize - 1;
        sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits;
        sbi->volume.blocks = sbi->volume.size >> sb->s_blocksize_bits;

        /* Maximum size for normal files. */
        sbi->maxbytes = (clusters << cluster_bits) - 1;

#ifdef CONFIG_NTFS3_64BIT_CLUSTER
        if (clusters >= (1ull << (64 - cluster_bits)))
                sbi->maxbytes = -1;
        sbi->maxbytes_sparse = -1;
        sb->s_maxbytes = MAX_LFS_FILESIZE;
#else
        /* Maximum size for sparse file. */
        sbi->maxbytes_sparse = (1ull << (cluster_bits + 32)) - 1;
        sb->s_maxbytes = 0xFFFFFFFFull << cluster_bits;
#endif

        /*
         * Compute the MFT zone at two steps.
         * It would be nice if we are able to allocate 1/8 of
         * total clusters for MFT but not more then 512 MB.
         */
        sbi->zone_max = min_t(CLST, 0x20000000 >> cluster_bits, clusters >> 3);

        err = 0;

        if (bh->b_blocknr && !sb_rdonly(sb)) {
                /*
                 * Alternative boot is ok but primary is not ok.
                 * Do not update primary boot here 'cause it may be faked boot.
                 * Let ntfs to be mounted and update boot later.
                 */
                *boot2 = kmemdup(boot, sizeof(*boot), GFP_NOFS | __GFP_NOWARN);
        }

out:
        brelse(bh);

        /*
         * Retry with the alternative boot only once: only after a
         * validation failure of the boot at block 0.
         * NOTE(review): dev_size0 is compared against PAGE_SHIFT, not
         * PAGE_SIZE - confirm which one is intended.
         */
        if (err == -EINVAL && !boot_block && dev_size0 > PAGE_SHIFT) {
                u32 block_size = min_t(u32, sector_size, PAGE_SIZE);
                u64 lbo = dev_size0 - sizeof(*boot);

                /* Locate the last sizeof(*boot) bytes of the device. */
                boot_block = lbo >> blksize_bits(block_size);
                boot_off = lbo & (block_size - 1);
                if (boot_block && block_size >= boot_off + sizeof(*boot)) {
                        /*
                         * Try alternative boot (last sector)
                         */
                        sb_set_blocksize(sb, block_size);
                        hint = "Alternative boot";
                        dev_size = dev_size0; /* restore original size. */
                        goto read_boot;
                }
        }

        return err;
}

/*
 * ntfs_fill_super - Try to mount.
 */
static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
        int err;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        struct block_device *bdev = sb->s_bdev;
        struct ntfs_mount_options *options;
        struct inode *inode;
        struct ntfs_inode *ni;
        size_t i, tt, bad_len, bad_frags;
        CLST vcn, lcn, len;
        struct ATTRIB *attr;
        const struct VOLUME_INFO *info;
        u32 idx, done, bytes;
        struct ATTR_DEF_ENTRY *t;
        u16 *shared;
        struct MFT_REF ref;
        bool ro = sb_rdonly(sb);
        struct NTFS_BOOT *boot2 = NULL;

        ref.high = 0;

        sbi->sb = sb;
        sbi->options = options = fc->fs_private;
        fc->fs_private = NULL;
        sb->s_flags |= SB_NODIRATIME;
        sb->s_magic = 0x7366746e; // "ntfs"
        sb->s_op = &ntfs_sops;
        sb->s_export_op = &ntfs_export_ops;
        sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec
        sb->s_xattr = ntfs_xattr_handlers;
        sb->s_d_op = options->nocase ? &ntfs_dentry_ops : NULL;

        options->nls = ntfs_load_nls(options->nls_name);
        if (IS_ERR(options->nls)) {
                options->nls = NULL;
                errorf(fc, "Cannot load nls %s", options->nls_name);
                err = -EINVAL;
                goto out;
        }

        if (bdev_max_discard_sectors(bdev) && bdev_discard_granularity(bdev)) {
                sbi->discard_granularity = bdev_discard_granularity(bdev);
                sbi->discard_granularity_mask_inv =
                        ~(u64)(sbi->discard_granularity - 1);
        }

        /* Parse boot. */
        err = ntfs_init_from_boot(sb, bdev_logical_block_size(bdev),
                                  bdev_nr_bytes(bdev), &boot2);
        if (err)
                goto out;

        /*
         * Load $Volume. This should be done before $LogFile
         * 'cause 'sbi->volume.ni' is used 'ntfs_set_state'.
         */
        ref.low = cpu_to_le32(MFT_REC_VOL);
        ref.seq = cpu_to_le16(MFT_REC_VOL);
        inode = ntfs_iget5(sb, &ref, &NAME_VOLUME);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                ntfs_err(sb, "Failed to load $Volume (%d).", err);
                goto out;
        }

        ni = ntfs_i(inode);

        /* Load and save label (not necessary). */
        attr = ni_find_attr(ni, NULL, NULL, ATTR_LABEL, NULL, 0, NULL, NULL);

        if (!attr) {
                /* It is ok if no ATTR_LABEL */
        } else if (!attr->non_res && !is_attr_ext(attr)) {
                /* $AttrDef allows labels to be up to 128 symbols. */
                err = utf16s_to_utf8s(resident_data(attr),
                                      le32_to_cpu(attr->res.data_size) >> 1,
                                      UTF16_LITTLE_ENDIAN, sbi->volume.label,
                                      sizeof(sbi->volume.label));
                if (err < 0)
                        sbi->volume.label[0] = 0;
        } else {
                /* Should we break mounting here? */
                //err = -EINVAL;
                //goto put_inode_out;
        }

        attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL);
        if (!attr || is_attr_ext(attr) ||
            !(info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO))) {
                ntfs_err(sb, "$Volume is corrupted.");
                err = -EINVAL;
                goto put_inode_out;
        }

        sbi->volume.major_ver = info->major_ver;
        sbi->volume.minor_ver = info->minor_ver;
        sbi->volume.flags = info->flags;
        sbi->volume.ni = ni;
        if (info->flags & VOLUME_FLAG_DIRTY) {
                sbi->volume.real_dirty = true;
                ntfs_info(sb, "It is recommened to use chkdsk.");
        }

        /* Load $MFTMirr to estimate recs_mirr. */
        ref.low = cpu_to_le32(MFT_REC_MIRR);
        ref.seq = cpu_to_le16(MFT_REC_MIRR);
        inode = ntfs_iget5(sb, &ref, &NAME_MIRROR);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                ntfs_err(sb, "Failed to load $MFTMirr (%d).", err);
                goto out;
        }

        sbi->mft.recs_mirr = ntfs_up_cluster(sbi, inode->i_size) >>
                             sbi->record_bits;

        iput(inode);

        /* Load LogFile to replay. */
        ref.low = cpu_to_le32(MFT_REC_LOG);
        ref.seq = cpu_to_le16(MFT_REC_LOG);
        inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                ntfs_err(sb, "Failed to load \x24LogFile (%d).", err);
                goto out;
        }

        ni = ntfs_i(inode);

        err = ntfs_loadlog_and_replay(ni, sbi);
        if (err)
                goto put_inode_out;

        iput(inode);

        if ((sbi->flags & NTFS_FLAGS_NEED_REPLAY) && !ro) {
                ntfs_warn(sb, "failed to replay log file. Can't mount rw!");
                err = -EINVAL;
                goto out;
        }

        if ((sbi->volume.flags & VOLUME_FLAG_DIRTY) && !ro && !options->force) {
                ntfs_warn(sb, "volume is dirty and \"force\" flag is not set!");
                err = -EINVAL;
                goto out;
        }

        /* Load $MFT. */
        ref.low = cpu_to_le32(MFT_REC_MFT);
        ref.seq = cpu_to_le16(1);

        inode = ntfs_iget5(sb, &ref, &NAME_MFT);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                ntfs_err(sb, "Failed to load $MFT (%d).", err);
                goto out;
        }

        ni = ntfs_i(inode);

        sbi->mft.used = ni->i_valid >> sbi->record_bits;
        tt = inode->i_size >> sbi->record_bits;
        sbi->mft.next_free = MFT_REC_USER;

        err = wnd_init(&sbi->mft.bitmap, sb, tt);
        if (err)
                goto put_inode_out;

        err = ni_load_all_mi(ni);
        if (err) {
                ntfs_err(sb, "Failed to load $MFT's subrecords (%d).", err);
                goto put_inode_out;
        }

        sbi->mft.ni = ni;

        /* Load $Bitmap. */
        ref.low = cpu_to_le32(MFT_REC_BITMAP);
        ref.seq = cpu_to_le16(MFT_REC_BITMAP);
        inode = ntfs_iget5(sb, &ref, &NAME_BITMAP);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                ntfs_err(sb, "Failed to load $Bitmap (%d).", err);
                goto out;
        }

#ifndef CONFIG_NTFS3_64BIT_CLUSTER
        if (inode->i_size >> 32) {
                err = -EINVAL;
                goto put_inode_out;
        }
#endif

        /* Check bitmap boundary. */
        tt = sbi->used.bitmap.nbits;
        if (inode->i_size < ntfs3_bitmap_size(tt)) {
                ntfs_err(sb, "$Bitmap is corrupted.");
                err = -EINVAL;
                goto put_inode_out;
        }

        err = wnd_init(&sbi->used.bitmap, sb, tt);
        if (err) {
                ntfs_err(sb, "Failed to initialize $Bitmap (%d).", err);
                goto put_inode_out;
        }

        iput(inode);

        /* Compute the MFT zone. */
        err = ntfs_refresh_zone(sbi);
        if (err) {
                ntfs_err(sb, "Failed to initialize MFT zone (%d).", err);
                goto out;
        }

        /* Load $BadClus. */
        ref.low = cpu_to_le32(MFT_REC_BADCLUST);
        ref.seq = cpu_to_le16(MFT_REC_BADCLUST);
        inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                ntfs_err(sb, "Failed to load $BadClus (%d).", err);
                goto out;
        }

        ni = ntfs_i(inode);
        bad_len = bad_frags = 0;
        for (i = 0; run_get_entry(&ni->file.run, i, &vcn, &lcn, &len); i++) {
                if (lcn == SPARSE_LCN)
                        continue;

                bad_len += len;
                bad_frags += 1;
                if (ro)
                        continue;

                if (wnd_set_used_safe(&sbi->used.bitmap, lcn, len, &tt) || tt) {
                        /* Bad blocks marked as free in bitmap. */
                        ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
                }
        }
        if (bad_len) {
                /*
                 * Notice about bad blocks.
                 * In normal cases these blocks are marked as used in bitmap.
                 * And we never allocate space in it.
                 */
                ntfs_notice(sb,
                            "Volume contains %zu bad blocks in %zu fragments.",
                            bad_len, bad_frags);
        }
        iput(inode);

        /* Load $AttrDef. */
        ref.low = cpu_to_le32(MFT_REC_ATTR);
        ref.seq = cpu_to_le16(MFT_REC_ATTR);
        inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                ntfs_err(sb, "Failed to load $AttrDef (%d)", err);
                goto out;
        }

        /*
         * Typical $AttrDef contains up to 20 entries.
         * Check for extremely large/small size.
         */
        if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY) ||
            inode->i_size > 100 * sizeof(struct ATTR_DEF_ENTRY)) {
                ntfs_err(sb, "Looks like $AttrDef is corrupted (size=%llu).",
                         inode->i_size);
                err = -EINVAL;
                goto put_inode_out;
        }

        bytes = inode->i_size;
        sbi->def_table = t = kvmalloc(bytes, GFP_KERNEL);
        if (!t) {
                err = -ENOMEM;
                goto put_inode_out;
        }

        for (done = idx = 0; done < bytes; done += PAGE_SIZE, idx++) {
                unsigned long tail = bytes - done;
                struct page *page = ntfs_map_page(inode->i_mapping, idx);

                if (IS_ERR(page)) {
                        err = PTR_ERR(page);
                        ntfs_err(sb, "Failed to read $AttrDef (%d).", err);
                        goto put_inode_out;
                }
                memcpy(Add2Ptr(t, done), page_address(page),
                       min(PAGE_SIZE, tail));
                ntfs_unmap_page(page);

                if (!idx && ATTR_STD != t->type) {
                        ntfs_err(sb, "$AttrDef is corrupted.");
                        err = -EINVAL;
                        goto put_inode_out;
                }
        }

        t += 1;
        sbi->def_entries = 1;
        done = sizeof(struct ATTR_DEF_ENTRY);
        sbi->reparse.max_size = MAXIMUM_REPARSE_DATA_BUFFER_SIZE;
        sbi->ea_max_size = 0x10000; /* default formatter value */

        while (done + sizeof(struct ATTR_DEF_ENTRY) <= bytes) {
                u32 t32 = le32_to_cpu(t->type);
                u64 sz = le64_to_cpu(t->max_sz);

                if ((t32 & 0xF) || le32_to_cpu(t[-1].type) >= t32)
                        break;

                if (t->type == ATTR_REPARSE)
                        sbi->reparse.max_size = sz;
                else if (t->type == ATTR_EA)
                        sbi->ea_max_size = sz;

                done += sizeof(struct ATTR_DEF_ENTRY);
                t += 1;
                sbi->def_entries += 1;
        }
        iput(inode);

        /* Load $UpCase. */
        ref.low = cpu_to_le32(MFT_REC_UPCASE);
        ref.seq = cpu_to_le16(MFT_REC_UPCASE);
        inode = ntfs_iget5(sb, &ref, &NAME_UPCASE);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                ntfs_err(sb, "Failed to load $UpCase (%d).", err);
                goto out;
        }

        if (inode->i_size != 0x10000 * sizeof(short)) {
                err = -EINVAL;
                ntfs_err(sb, "$UpCase is corrupted.");
                goto put_inode_out;
        }

        for (idx = 0; idx < (0x10000 * sizeof(short) >> PAGE_SHIFT); idx++) {
                const __le16 *src;
                u16 *dst = Add2Ptr(sbi->upcase, idx << PAGE_SHIFT);
                struct page *page = ntfs_map_page(inode->i_mapping, idx);

                if (IS_ERR(page)) {
                        err = PTR_ERR(page);
                        ntfs_err(sb, "Failed to read $UpCase (%d).", err);
                        goto put_inode_out;
                }

                src = page_address(page);

#ifdef __BIG_ENDIAN
                for (i = 0; i < PAGE_SIZE / sizeof(u16); i++)
                        *dst++ = le16_to_cpu(*src++);
#else
                memcpy(dst, src, PAGE_SIZE);
#endif
                ntfs_unmap_page(page);
        }

        shared = ntfs_set_shared(sbi->upcase, 0x10000 * sizeof(short));
        if (shared && sbi->upcase != shared) {
                kvfree(sbi->upcase);
                sbi->upcase = shared;
        }

        iput(inode);

        if (is_ntfs3(sbi)) {
                /* Load $Secure. */
                err = ntfs_security_init(sbi);
                if (err) {
                        ntfs_err(sb, "Failed to initialize $Secure (%d).", err);
                        goto out;
                }

                /* Load $Extend. */
                err = ntfs_extend_init(sbi);
                if (err) {
                        ntfs_warn(sb, "Failed to initialize $Extend.");
                        goto load_root;
                }

                /* Load $Extend/$Reparse. */
                err = ntfs_reparse_init(sbi);
                if (err) {
                        ntfs_warn(sb, "Failed to initialize $Extend/$Reparse.");
                        goto load_root;
                }

                /* Load $Extend/$ObjId. */
                err = ntfs_objid_init(sbi);
                if (err) {
                        ntfs_warn(sb, "Failed to initialize $Extend/$ObjId.");
                        goto load_root;
                }
        }

load_root:
        /* Load root. */
        ref.low = cpu_to_le32(MFT_REC_ROOT);
        ref.seq = cpu_to_le16(MFT_REC_ROOT);
        inode = ntfs_iget5(sb, &ref, &NAME_ROOT);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                ntfs_err(sb, "Failed to load root (%d).", err);
                goto out;
        }

        /*
         * Final check. Looks like this case should never occurs.
         */
        if (!inode->i_op) {
                err = -EINVAL;
                ntfs_err(sb, "Failed to load root (%d).", err);
                goto put_inode_out;
        }

        sb->s_root = d_make_root(inode);
        if (!sb->s_root) {
                err = -ENOMEM;
                goto put_inode_out;
        }

        if (boot2) {
                /*
                  * Alternative boot is ok but primary is not ok.
                  * Volume is recognized as NTFS. Update primary boot.
                 */
                struct buffer_head *bh0 = sb_getblk(sb, 0);
                if (bh0) {
                        if (buffer_locked(bh0))
                                __wait_on_buffer(bh0);

                        lock_buffer(bh0);
                        memcpy(bh0->b_data, boot2, sizeof(*boot2));
                        set_buffer_uptodate(bh0);
                        mark_buffer_dirty(bh0);
                        unlock_buffer(bh0);
                        if (!sync_dirty_buffer(bh0))
                                ntfs_warn(sb, "primary boot is updated");
                        put_bh(bh0);
                }

                kfree(boot2);
        }

#ifdef CONFIG_PROC_FS
        /* Create /proc/fs/ntfs3/.. */
        if (proc_info_root) {
                struct proc_dir_entry *e = proc_mkdir(sb->s_id, proc_info_root);
                static_assert((S_IRUGO | S_IWUSR) == 0644);
                if (e) {
                        proc_create_data("volinfo", S_IRUGO, e,
                                         &ntfs3_volinfo_fops, sb);
                        proc_create_data("label", S_IRUGO | S_IWUSR, e,
                                         &ntfs3_label_fops, sb);
                        sbi->procdir = e;
                }
        }
#endif

        if (is_legacy_ntfs(sb))
                sb->s_flags |= SB_RDONLY;
        return 0;

put_inode_out:
        iput(inode);
out:
        ntfs3_put_sbi(sbi);
        kfree(boot2);
        ntfs3_put_sbi(sbi);
        return err;
}

/*
 * ntfs_unmap_meta - Run clean_bdev_aliases() over a cluster range.
 * @sb:  super block; supplies the block device and the per-volume info.
 * @lcn: first cluster of the range.
 * @len: number of clusters in the range.
 *
 * The range is converted to device blocks via sbi->blocks_per_cluster and
 * walked one block at a time. The block device is synced each time the
 * number of blocks handled since the last sync exceeds a budget derived
 * from the current count of free pages.
 */
void ntfs_unmap_meta(struct super_block *sb, CLST lcn, CLST len)
{
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        struct block_device *bdev = sb->s_bdev;
        sector_t blk = (u64)lcn * sbi->blocks_per_cluster;
        unsigned long remaining = (u64)len * sbi->blocks_per_cluster;
        unsigned long since_sync = 0;
        unsigned long budget;

        /* Free pages expressed in units of filesystem blocks. */
        budget = global_zone_page_state(NR_FREE_PAGES)
                 << (PAGE_SHIFT - sb->s_blocksize_bits);

        /* Clamp: at least 32, half when small, minus 0x1000 when large. */
        if (budget < 32)
                budget = 32;
        else if (budget < 0x2000)
                budget >>= 1;
        else
                budget -= 0x1000;

        for (; remaining; remaining--, blk++) {
                clean_bdev_aliases(bdev, blk, 1);

                if (since_sync < budget) {
                        since_sync++;
                        continue;
                }

                sync_blockdev(bdev);
                since_sync = 0;
        }
}

/*
 * ntfs_discard - Issue a discard request (trim for SSD).
 * @sbi: per-volume info.
 * @lcn: first cluster of the range.
 * @len: number of clusters in the range.
 *
 * If the range ends exactly at sbi->used.next_free_lcn, that marker is moved
 * back to @lcn, whether or not a discard is actually issued.
 *
 * Only the part of the range that is aligned to the discard granularity is
 * passed to blkdev_issue_discard(); a range too small to contain an aligned
 * chunk is silently accepted.
 *
 * Return: 0 on success or when nothing is left after alignment, -EOPNOTSUPP
 * when discard is disabled (mount option off or NTFS_FLAGS_NODISCARD set),
 * otherwise the result of blkdev_issue_discard(). A -EOPNOTSUPP result from
 * the block layer sets NTFS_FLAGS_NODISCARD so later calls return early.
 */
int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len)
{
        u64 from, to, first, last;
        int ret;

        if (lcn + len == sbi->used.next_free_lcn)
                sbi->used.next_free_lcn = lcn;

        if ((sbi->flags & NTFS_FLAGS_NODISCARD) || !sbi->options->discard)
                return -EOPNOTSUPP;

        /* Byte offsets of the range on the volume. */
        from = (u64)lcn << sbi->cluster_bits;
        to = from + ((u64)len << sbi->cluster_bits);

        /* Round the start up and the end down to discard_granularity. */
        first = (from + sbi->discard_granularity - 1) &
                sbi->discard_granularity_mask_inv;
        last = to & sbi->discard_granularity_mask_inv;

        if (first >= last)
                return 0;

        /* blkdev_issue_discard() is given the range shifted right by 9. */
        ret = blkdev_issue_discard(sbi->sb->s_bdev, first >> 9,
                                   (last - first) >> 9, GFP_NOFS);
        if (ret == -EOPNOTSUPP)
                sbi->flags |= NTFS_FLAGS_NODISCARD;

        return ret;
}

/*
 * ntfs_fs_get_tree - .get_tree callback of ntfs_context_ops.
 *
 * Delegates to get_tree_bdev() with ntfs_fill_super as the fill callback and
 * returns its result unchanged.
 */
static int ntfs_fs_get_tree(struct fs_context *fc)
{
        return get_tree_bdev(fc, ntfs_fill_super);
}

/*
 * ntfs_fs_free - Free fs_context.
 *
 * Releases whatever is still attached to @fc: the sb info in fc->s_fs_info
 * and the mount options in fc->fs_private. Either may be NULL.
 *
 * Note that this is called after fill_super and reconfigure even when they
 * succeed, so on success those callbacks have to take over the pointers
 * they want to keep.
 * NOTE(review): presumably they do so by clearing the fc fields - confirm
 * against ntfs_fill_super() / ntfs_fs_reconfigure().
 */
static void ntfs_fs_free(struct fs_context *fc)
{
        struct ntfs_mount_options *opts = fc->fs_private;
        struct ntfs_sb_info *sbi = fc->s_fs_info;

        if (sbi) {
                /* Drop what sbi holds first, then free sbi itself. */
                ntfs3_put_sbi(sbi);
                ntfs3_free_sbi(sbi);
        }

        if (opts)
                put_mount_options(opts);
}

// clang-format off
/*
 * fs_context callbacks shared by the "ntfs3" and legacy "ntfs" types;
 * installed into fc->ops by __ntfs_init_fs_context().
 */
static const struct fs_context_operations ntfs_context_ops = {
        .parse_param        = ntfs_fs_parse_param,
        .get_tree        = ntfs_fs_get_tree,
        .reconfigure        = ntfs_fs_reconfigure,
        .free                = ntfs_fs_free,
};
// clang-format on

/*
 * __ntfs_init_fs_context - Initialize sbi and opts
 *
 * This is called on mount and on remount. The options are initialized
 * first so that a remount can use just those.
 */
/*
 * Allocate the mount options (always) and the sb info (only when @fc is not
 * for reconfigure), then attach them and ntfs_context_ops to @fc.
 *
 * Return: 0 on success, -ENOMEM if any allocation fails (in which case
 * nothing is attached to @fc and everything allocated here is freed).
 */
static int __ntfs_init_fs_context(struct fs_context *fc)
{
        struct ntfs_sb_info *sbi;
        struct ntfs_mount_options *opts = kzalloc(sizeof(*opts), GFP_NOFS);

        if (!opts)
                return -ENOMEM;

        /* Defaults come from the credentials and umask of the caller. */
        opts->fs_uid = current_uid();
        opts->fs_gid = current_gid();
        opts->fs_fmask_inv = ~current_umask();
        opts->fs_dmask_inv = ~current_umask();

        /* A reconfigure context only needs the options, not a new sbi. */
        if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) {
                sbi = kzalloc(sizeof(*sbi), GFP_NOFS);
                if (!sbi)
                        goto err_free_opts;

                /* Room for 0x10000 16-bit upcase entries. */
                sbi->upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL);
                if (!sbi->upcase)
                        goto err_free_sbi;

                ratelimit_state_init(&sbi->msg_ratelimit,
                                     DEFAULT_RATELIMIT_INTERVAL,
                                     DEFAULT_RATELIMIT_BURST);

                mutex_init(&sbi->compress.mtx_lznt);
#ifdef CONFIG_NTFS3_LZX_XPRESS
                mutex_init(&sbi->compress.mtx_xpress);
                mutex_init(&sbi->compress.mtx_lzx);
#endif

                fc->s_fs_info = sbi;
        }

        fc->fs_private = opts;
        fc->ops = &ntfs_context_ops;

        return 0;

err_free_sbi:
        kfree(sbi);
err_free_opts:
        kfree(opts);
        return -ENOMEM;
}

/*
 * .init_fs_context callback of the "ntfs3" filesystem type; a plain
 * pass-through to __ntfs_init_fs_context().
 */
static int ntfs_init_fs_context(struct fs_context *fc)
{
        return __ntfs_init_fs_context(fc);
}

/*
 * ntfs3_kill_sb - .kill_sb callback for both filesystem types.
 *
 * Tears the super block down with kill_block_super(), then drops the mount
 * options (if any) and frees the sb info.
 */
static void ntfs3_kill_sb(struct super_block *sb)
{
        /*
         * sbi is read from @sb before kill_block_super() runs.
         * NOTE(review): presumably sb->s_fs_info must not be relied on
         * afterwards - confirm.
         */
        struct ntfs_sb_info *sbi = sb->s_fs_info;

        kill_block_super(sb);

        if (sbi->options)
                put_mount_options(sbi->options);
        ntfs3_free_sbi(sbi);
}

// clang-format off
/* The primary filesystem type, registered as "ntfs3" by init_ntfs_fs(). */
static struct file_system_type ntfs_fs_type = {
        .owner                        = THIS_MODULE,
        .name                        = "ntfs3",
        .init_fs_context        = ntfs_init_fs_context,
        .parameters                = ntfs_fs_parameters,
        .kill_sb                = ntfs3_kill_sb,
        .fs_flags                = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};

#if IS_ENABLED(CONFIG_NTFS_FS)
/*
 * .init_fs_context callback of the legacy "ntfs" filesystem type.
 *
 * Same as ntfs_init_fs_context() but additionally sets SB_RDONLY in
 * fc->sb_flags. The flag is set regardless of whether
 * __ntfs_init_fs_context() succeeded; its return value is passed through.
 */
static int ntfs_legacy_init_fs_context(struct fs_context *fc)
{
        int ret;

        ret = __ntfs_init_fs_context(fc);
        /* If ntfs3 is used as legacy ntfs enforce read-only mode. */
        fc->sb_flags |= SB_RDONLY;
        return ret;
}

/*
 * Legacy filesystem type, registered as "ntfs". Differs from ntfs_fs_type
 * only in its name and in using ntfs_legacy_init_fs_context().
 */
static struct file_system_type ntfs_legacy_fs_type = {
        .owner                        = THIS_MODULE,
        .name                        = "ntfs",
        .init_fs_context        = ntfs_legacy_init_fs_context,
        .parameters                = ntfs_fs_parameters,
        .kill_sb                = ntfs3_kill_sb,
        .fs_flags                = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ntfs");

/*
 * Register the legacy "ntfs" filesystem type. Failure is not propagated:
 * it is only reported with a warning.
 */
static inline void register_as_ntfs_legacy(void)
{
        int err = register_filesystem(&ntfs_legacy_fs_type);
        if (err)
                pr_warn("ntfs3: Failed to register legacy ntfs filesystem driver: %d\n", err);
}

/*
 * Unregister the legacy "ntfs" filesystem type. The result of
 * unregister_filesystem() is ignored.
 */
static inline void unregister_as_ntfs_legacy(void)
{
        unregister_filesystem(&ntfs_legacy_fs_type);
}
/* Return true if @sb was mounted through the legacy "ntfs" filesystem type. */
bool is_legacy_ntfs(struct super_block *sb)
{
        return sb->s_type == &ntfs_legacy_fs_type;
}
#else
/* CONFIG_NTFS_FS disabled: no legacy type exists, so these are no-ops. */
static inline void register_as_ntfs_legacy(void) {}
static inline void unregister_as_ntfs_legacy(void) {}
bool is_legacy_ntfs(struct super_block *sb) { return false; }
#endif


// clang-format on

/*
 * init_ntfs_fs - Module entry point.
 *
 * Logs the compiled-in features, creates "/proc/fs/ntfs3" (when procfs is
 * enabled), initializes the bitmap helper and the inode slab cache, then
 * registers the legacy "ntfs" type followed by "ntfs3".
 *
 * Return: 0 on success, negative errno on failure. Every failure path
 * undoes all earlier steps, including the proc directory and the legacy
 * filesystem registration, so a failed init leaves nothing behind.
 */
static int __init init_ntfs_fs(void)
{
        int err;

        if (IS_ENABLED(CONFIG_NTFS3_FS_POSIX_ACL))
                pr_info("ntfs3: Enabled Linux POSIX ACLs support\n");
        if (IS_ENABLED(CONFIG_NTFS3_64BIT_CLUSTER))
                pr_notice(
                        "ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n");
        if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS))
                pr_info("ntfs3: Read-only LZX/Xpress compression included\n");

#ifdef CONFIG_PROC_FS
        /* Create "/proc/fs/ntfs3" */
        proc_info_root = proc_mkdir("fs/ntfs3", NULL);
#endif

        err = ntfs3_init_bitmap();
        if (err)
                goto out_proc;

        ntfs_inode_cachep = kmem_cache_create(
                "ntfs_inode_cache", sizeof(struct ntfs_inode), 0,
                (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT),
                init_once);
        if (!ntfs_inode_cachep) {
                err = -ENOMEM;
                goto out_bitmap;
        }

        /* A failure to register the legacy type is only warned about. */
        register_as_ntfs_legacy();
        err = register_filesystem(&ntfs_fs_type);
        if (err)
                goto out_legacy;

        return 0;

out_legacy:
        /*
         * The legacy type may already be registered; it must not outlive a
         * failed init. Its result is ignored, as in exit_ntfs_fs().
         */
        unregister_as_ntfs_legacy();
        kmem_cache_destroy(ntfs_inode_cachep);
out_bitmap:
        ntfs3_exit_bitmap();
out_proc:
#ifdef CONFIG_PROC_FS
        /* Do not leave "/proc/fs/ntfs3" behind when init fails. */
        if (proc_info_root) {
                remove_proc_entry("fs/ntfs3", NULL);
                proc_info_root = NULL;
        }
#endif
        return err;
}

/*
 * exit_ntfs_fs - Module exit point; releases what init_ntfs_fs() set up:
 * the inode cache, both filesystem registrations, the bitmap helper and
 * the "/proc/fs/ntfs3" directory.
 */
static void __exit exit_ntfs_fs(void)
{
        /*
         * NOTE(review): presumably waits for RCU-deferred inode frees before
         * the cache they belong to is destroyed - confirm.
         */
        rcu_barrier();
        kmem_cache_destroy(ntfs_inode_cachep);
        unregister_filesystem(&ntfs_fs_type);
        unregister_as_ntfs_legacy();
        ntfs3_exit_bitmap();

#ifdef CONFIG_PROC_FS
        /* proc_info_root is NULL if proc_mkdir() failed at init time. */
        if (proc_info_root)
                remove_proc_entry("fs/ntfs3", NULL);
#endif
}

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("ntfs3 read/write filesystem");
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
MODULE_INFO(behaviour, "Enabled Linux POSIX ACLs support");
#endif
#ifdef CONFIG_NTFS3_64BIT_CLUSTER
MODULE_INFO(
        cluster,
        "Warning: Activated 64 bits per cluster. Windows does not support this");
#endif
#ifdef CONFIG_NTFS3_LZX_XPRESS
MODULE_INFO(compression, "Read-only lzx/xpress compression included");
#endif

MODULE_AUTHOR("Konstantin Komarov");
MODULE_ALIAS_FS("ntfs3");

module_init(init_ntfs_fs);
module_exit(exit_ntfs_fs);



























   54 





   12 





















   11 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H

#define HAVE_JUMP_LABEL_BATCH

#include <asm/asm.h>
#include <asm/nops.h>

#ifndef __ASSEMBLY__

#include <linux/stringify.h>
#include <linux/types.h>

/*
 * Inline-asm fragment that emits one record into the __jump_table section.
 * The record holds three position-relative values, in order:
 *   - a 32-bit offset to local label "1" (placed by the user of this macro
 *     just before the patchable instruction),
 *   - a 32-bit offset to the C label l_yes,
 *   - a pointer-sized offset to (operand 0 + operand 1); the users below
 *     pass the static key as operand 0 and the branch value as operand 1.
 * Must be used inside an asm goto that declares l_yes and those operands.
 */
#define JUMP_TABLE_ENTRY                                \
        ".pushsection __jump_table,  \"aw\" \n\t"        \
        _ASM_ALIGN "\n\t"                                \
        ".long 1b - . \n\t"                                \
        ".long %l[l_yes] - . \n\t"                        \
        _ASM_PTR "%c0 + %c1 - .\n\t"                        \
        ".popsection \n\t"

#ifdef CONFIG_HAVE_JUMP_LABEL_HACK

/*
 * CONFIG_HAVE_JUMP_LABEL_HACK variant: the compiler emits a real "jmp" to
 * l_yes and, per the asm comment, objtool turns it into a NOP afterwards.
 * The jump table entry is emitted with (2 | branch) as the second operand.
 * NOTE(review): bit 1 presumably marks the entry for objtool - confirm.
 *
 * Returns false on the fall-through path and true when control arrives at
 * l_yes.
 */
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
        asm goto("1:"
                "jmp %l[l_yes] # objtool NOPs this \n\t"
                JUMP_TABLE_ENTRY
                : :  "i" (key), "i" (2 | branch) : : l_yes);

        return false;
l_yes:
        return true;
}

#else /* !CONFIG_HAVE_JUMP_LABEL_HACK */

/*
 * Default variant: emits the BYTES_NOP5 byte sequence at label "1" together
 * with a jump table entry for @key / @branch. As emitted, execution falls
 * through and the function returns false; it returns true only when control
 * is transferred to l_yes.
 */
static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
{
        asm goto("1:"
                ".byte " __stringify(BYTES_NOP5) "\n\t"
                JUMP_TABLE_ENTRY
                : :  "i" (key), "i" (branch) : : l_yes);

        return false;
l_yes:
        return true;
}

#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */

/*
 * Counterpart of arch_static_branch() whose initial instruction at label
 * "1" is a "jmp" to l_yes rather than a NOP, so as emitted the function
 * returns true. A jump table entry for @key / @branch is recorded as well.
 */
static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
{
        asm goto("1:"
                "jmp %l[l_yes]\n\t"
                JUMP_TABLE_ENTRY
                : :  "i" (key), "i" (branch) : : l_yes);

        return false;
l_yes:
        return true;
}

extern int arch_jump_entry_size(struct jump_entry *entry);

#endif        /* __ASSEMBLY__ */

#endif















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   24 








    8 

   22 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/blkdev.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>
#include "internal.h"

struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static const char *bdi_unknown_name = "(unknown)";

/*
 * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
 * reader side locking.
 */
DEFINE_SPINLOCK(bdi_lock);
static u64 bdi_id_cursor;
static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

/*
 * Snapshot of writeback statistics for the debugfs files. Filled by
 * collect_wb_stats(), which adds to every field except dirty_thresh.
 */
struct wb_stats {
        unsigned long nr_dirty;         /* inodes on wb->b_dirty */
        unsigned long nr_io;            /* inodes on wb->b_io */
        unsigned long nr_more_io;       /* inodes on wb->b_more_io */
        unsigned long nr_dirty_time;    /* b_dirty_time inodes with I_DIRTY_TIME */
        unsigned long nr_writeback;     /* wb_stat(WB_WRITEBACK) */
        unsigned long nr_reclaimable;   /* wb_stat(WB_RECLAIMABLE) */
        unsigned long nr_dirtied;       /* wb_stat(WB_DIRTIED) */
        unsigned long nr_written;       /* wb_stat(WB_WRITTEN) */
        unsigned long dirty_thresh;     /* input: set by the caller beforehand */
        unsigned long wb_thresh;        /* wb_calc_thresh() result(s) */
};

static struct dentry *bdi_debug_root;

/* Create the top-level "bdi" debugfs directory and remember it. */
static void bdi_debug_init(void)
{
        bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

static void collect_wb_stats(struct wb_stats *stats,
                             struct bdi_writeback *wb)
{
        struct inode *inode;

        spin_lock(&wb->list_lock);
        list_for_each_entry(inode, &wb->b_dirty, i_io_list)
                stats->nr_dirty++;
        list_for_each_entry(inode, &wb->b_io, i_io_list)
                stats->nr_io++;
        list_for_each_entry(inode, &wb->b_more_io, i_io_list)
                stats->nr_more_io++;
        list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
                if (inode->i_state & I_DIRTY_TIME)
                        stats->nr_dirty_time++;
        spin_unlock(&wb->list_lock);

        stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
        stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
        stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
        stats->nr_written += wb_stat(wb, WB_WRITTEN);
        stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
}

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * CONFIG_CGROUP_WRITEBACK variant: accumulate the stats of every wb on
 * bdi->wb_list into @stats. The list is walked under the RCU read lock and
 * a wb is only examined while a reference obtained with wb_tryget() is
 * held; wbs for which that fails are skipped.
 */
static void bdi_collect_stats(struct backing_dev_info *bdi,
                              struct wb_stats *stats)
{
        struct bdi_writeback *cur;

        rcu_read_lock();
        list_for_each_entry_rcu(cur, &bdi->wb_list, bdi_node) {
                if (wb_tryget(cur)) {
                        collect_wb_stats(stats, cur);
                        wb_put(cur);
                }
        }
        rcu_read_unlock();
}
#else
/*
 * Variant without CONFIG_CGROUP_WRITEBACK: only the wb embedded in @bdi
 * (bdi->wb) is collected.
 */
static void bdi_collect_stats(struct backing_dev_info *bdi,
                              struct wb_stats *stats)
{
        collect_wb_stats(stats, &bdi->wb);
}
#endif

/*
 * seq_file show callback printing the aggregated writeback statistics of
 * the bdi stored in m->private.
 *
 * The global dirty limits are sampled first; the dirty threshold is handed
 * to the collector through stats.dirty_thresh. Always returns 0.
 *
 * NOTE(review): K() is defined outside this chunk - presumably converts
 * pages to kB; confirm.
 */
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
        struct backing_dev_info *bdi = m->private;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        struct wb_stats stats;
        unsigned long tot_bw;

        global_dirty_limits(&background_thresh, &dirty_thresh);

        /* collect_wb_stats() only adds to the fields, so start from zero. */
        memset(&stats, 0, sizeof(stats));
        stats.dirty_thresh = dirty_thresh;
        bdi_collect_stats(bdi, &stats);
        tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);

        /* The argument order below must match the format lines one to one. */
        seq_printf(m,
                   "BdiWriteback:       %10lu kB\n"
                   "BdiReclaimable:     %10lu kB\n"
                   "BdiDirtyThresh:     %10lu kB\n"
                   "DirtyThresh:        %10lu kB\n"
                   "BackgroundThresh:   %10lu kB\n"
                   "BdiDirtied:         %10lu kB\n"
                   "BdiWritten:         %10lu kB\n"
                   "BdiWriteBandwidth:  %10lu kBps\n"
                   "b_dirty:            %10lu\n"
                   "b_io:               %10lu\n"
                   "b_more_io:          %10lu\n"
                   "b_dirty_time:       %10lu\n"
                   "bdi_list:           %10u\n"
                   "state:              %10lx\n",
                   K(stats.nr_writeback),
                   K(stats.nr_reclaimable),
                   K(stats.wb_thresh),
                   K(dirty_thresh),
                   K(background_thresh),
                   K(stats.nr_dirtied),
                   K(stats.nr_written),
                   K(tot_bw),
                   stats.nr_dirty,
                   stats.nr_io,
                   stats.nr_more_io,
                   stats.nr_dirty_time,
                   /* "bdi_list" is 1 when bdi->bdi_list is non-empty, else 0. */
                   !list_empty(&bdi->bdi_list), bdi->wb.state);

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);

/*
 * Print one per-wb record (terminated by an empty line) to @m.
 * @wb supplies the cgroup inode number, average write bandwidth and state;
 * everything else comes from the already collected @stats.
 *
 * Without CONFIG_CGROUP_WRITEBACK the "WbCgIno" field is printed as 1.
 */
static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb,
                          struct wb_stats *stats)
{

        /* The argument order below must match the format lines one to one. */
        seq_printf(m,
                   "WbCgIno:           %10lu\n"
                   "WbWriteback:       %10lu kB\n"
                   "WbReclaimable:     %10lu kB\n"
                   "WbDirtyThresh:     %10lu kB\n"
                   "WbDirtied:         %10lu kB\n"
                   "WbWritten:         %10lu kB\n"
                   "WbWriteBandwidth:  %10lu kBps\n"
                   "b_dirty:           %10lu\n"
                   "b_io:              %10lu\n"
                   "b_more_io:         %10lu\n"
                   "b_dirty_time:      %10lu\n"
                   "state:             %10lx\n\n",
#ifdef CONFIG_CGROUP_WRITEBACK
                   cgroup_ino(wb->memcg_css->cgroup),
#else
                   1ul,
#endif
                   K(stats->nr_writeback),
                   K(stats->nr_reclaimable),
                   K(stats->wb_thresh),
                   K(stats->nr_dirtied),
                   K(stats->nr_written),
                   K(wb->avg_write_bandwidth),
                   stats->nr_dirty,
                   stats->nr_io,
                   stats->nr_more_io,
                   stats->nr_dirty_time,
                   wb->state);
}

/*
 * seq_file show handler behind the debugfs "wb_stats" file: print one
 * wb_stats_show() record for every bdi_writeback on the bdi's wb_list that
 * a reference can be taken on.  Always returns 0.
 */
static int cgwb_debug_stats_show(struct seq_file *m, void *v)
{
        struct backing_dev_info *bdi = m->private;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        struct bdi_writeback *wb;

        global_dirty_limits(&background_thresh, &dirty_thresh);

        rcu_read_lock();
        list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
                struct wb_stats stats = { .dirty_thresh = dirty_thresh };

                /* Skip entries whose reference can no longer be taken. */
                if (!wb_tryget(wb))
                        continue;

                collect_wb_stats(&stats, wb);

                /*
                 * Calculate thresh of wb in writeback cgroup which is min of
                 * thresh in global domain and thresh in cgroup domain. Drop
                 * rcu lock because cgwb_calc_thresh may sleep in
                 * cgroup_rstat_flush. We can do so here because we have a ref.
                 */
                if (mem_cgroup_wb_domain(wb)) {
                        rcu_read_unlock();
                        stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
                        rcu_read_lock();
                }

                wb_stats_show(m, wb, &stats);

                /* Drop the reference taken by wb_tryget() above. */
                wb_put(wb);
        }
        rcu_read_unlock();

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);

/*
 * Create the debugfs directory @name under bdi_debug_root for @bdi and
 * populate it with the read-only "stats" and "wb_stats" files.  Both files
 * receive @bdi as their private data.
 */
static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
        bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);

        debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
                            &bdi_debug_stats_fops);
        debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
                            &cgwb_debug_stats_fops);
}

/* Remove @bdi's debugfs directory together with everything below it. */
static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
        debugfs_remove_recursive(bdi->debug_dir);
}
#else /* CONFIG_DEBUG_FS */
/* Without CONFIG_DEBUG_FS the debug hooks below compile to empty stubs. */
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
                                      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif /* CONFIG_DEBUG_FS */

/*
 * sysfs store handler for read_ahead_kb: parse a decimal kilobyte value from
 * @buf and store it, converted to pages, in bdi->ra_pages.  Returns @count on
 * success or the negative error from kstrtoul().
 */
static ssize_t read_ahead_kb_store(struct device *dev,
                                  struct device_attribute *attr,
                                  const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned long kb;
        ssize_t err = kstrtoul(buf, 10, &kb);

        if (err < 0)
                return err;

        /* kilobytes -> pages */
        bdi->ra_pages = kb >> (PAGE_SHIFT - 10);

        return count;
}

/*
 * BDI_SHOW - define name##_show() plus a read/write device attribute @name.
 *
 * The generated show handler prints @expr, cast to long long, as a decimal
 * number followed by a newline.  @expr may use the local variable 'bdi',
 * which is the backing_dev_info stored as the device's driver data.  @expr
 * is parenthesized so the cast applies to the whole expression rather than
 * to its first operand only.  Every use in this file is paired with a
 * name##_store() handler defined next to it.
 */
#define BDI_SHOW(name, expr)                                                \
static ssize_t name##_show(struct device *dev,                                \
                           struct device_attribute *attr, char *buf)        \
{                                                                        \
        struct backing_dev_info *bdi = dev_get_drvdata(dev);                \
                                                                        \
        return sysfs_emit(buf, "%lld\n", (long long)(expr));                \
}                                                                        \
static DEVICE_ATTR_RW(name);

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

/*
 * sysfs store handler for min_ratio: parse a decimal unsigned value from @buf
 * and hand it to bdi_set_min_ratio().  Returns @count on success, otherwise
 * the error from parsing or from the setter.
 */
static ssize_t min_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        ssize_t err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_min_ratio(bdi, val);
        if (err)
                return err;

        return count;
}
BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE)

/*
 * sysfs store handler for min_ratio_fine: parse a decimal unsigned value from
 * @buf and hand it to bdi_set_min_ratio_no_scale().  Returns @count on
 * success, otherwise the error from parsing or from the setter.
 */
static ssize_t min_ratio_fine_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        ssize_t err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_min_ratio_no_scale(bdi, val);
        if (err)
                return err;

        return count;
}
BDI_SHOW(min_ratio_fine, bdi->min_ratio)

/*
 * sysfs store handler for max_ratio: parse a decimal unsigned value from @buf
 * and hand it to bdi_set_max_ratio().  Returns @count on success, otherwise
 * the error from parsing or from the setter.
 */
static ssize_t max_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        ssize_t err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_max_ratio(bdi, val);
        if (err)
                return err;

        return count;
}
BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE)

/*
 * sysfs store handler for max_ratio_fine: parse a decimal unsigned value from
 * @buf and hand it to bdi_set_max_ratio_no_scale().  Returns @count on
 * success, otherwise the error from parsing or from the setter.
 */
static ssize_t max_ratio_fine_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        ssize_t err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_max_ratio_no_scale(bdi, val);
        if (err)
                return err;

        return count;
}
BDI_SHOW(max_ratio_fine, bdi->max_ratio)

static ssize_t min_bytes_show(struct device *dev,
                              struct device_attribute *attr,
                              char *buf)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);

        return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi));
}

/*
 * sysfs store handler for min_bytes: parse a decimal 64-bit value from @buf
 * and hand it to bdi_set_min_bytes().  Returns @count on success, otherwise
 * the error from parsing or from the setter.
 */
static ssize_t min_bytes_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        u64 val;
        ssize_t err;

        err = kstrtoull(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_min_bytes(bdi, val);
        if (err)
                return err;

        return count;
}
static DEVICE_ATTR_RW(min_bytes);

static ssize_t max_bytes_show(struct device *dev,
                              struct device_attribute *attr,
                              char *buf)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);

        return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi));
}

/*
 * sysfs store handler for max_bytes: parse a decimal 64-bit value from @buf
 * and hand it to bdi_set_max_bytes().  Returns @count on success, otherwise
 * the error from parsing or from the setter.
 */
static ssize_t max_bytes_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        u64 val;
        ssize_t err;

        err = kstrtoull(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_max_bytes(bdi, val);
        if (err)
                return err;

        return count;
}
static DEVICE_ATTR_RW(max_bytes);

/*
 * sysfs show handler kept for the removed stable_pages_required attribute:
 * warns once per device that the attribute is gone and always reports 0.
 */
static ssize_t stable_pages_required_show(struct device *dev,
                                          struct device_attribute *attr,
                                          char *buf)
{
        dev_warn_once(dev,
                "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
        return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);

/*
 * sysfs store handler for strict_limit: parse a decimal unsigned value from
 * @buf and hand it to bdi_set_strict_limit().  Returns @count on success,
 * otherwise the error from parsing or from the setter.
 */
static ssize_t strict_limit_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        ssize_t err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_strict_limit(bdi, val);
        if (err)
                return err;

        return count;
}

/*
 * sysfs show handler for strict_limit: print 1 when BDI_CAP_STRICTLIMIT is
 * set in the bdi's capabilities, 0 otherwise.
 */
static ssize_t strict_limit_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        int strict = (bdi->capabilities & BDI_CAP_STRICTLIMIT) != 0;

        return sysfs_emit(buf, "%d\n", strict);
}
static DEVICE_ATTR_RW(strict_limit);

/*
 * NULL-terminated list of the device attributes defined above;
 * ATTRIBUTE_GROUPS() turns it into the bdi_dev_groups used by bdi_class.
 */
static struct attribute *bdi_dev_attrs[] = {
        &dev_attr_read_ahead_kb.attr,
        &dev_attr_min_ratio.attr,
        &dev_attr_min_ratio_fine.attr,
        &dev_attr_max_ratio.attr,
        &dev_attr_max_ratio_fine.attr,
        &dev_attr_min_bytes.attr,
        &dev_attr_max_bytes.attr,
        &dev_attr_stable_pages_required.attr,
        &dev_attr_strict_limit.attr,
        NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);

/* Device class "bdi"; devices created in it get the bdi_dev attribute groups. */
static const struct class bdi_class = {
        .name                = "bdi",
        .dev_groups        = bdi_dev_groups,
};

/*
 * Register the "bdi" device class and, if that worked, run the debug
 * initialisation.  Returns 0 or the class_register() error.
 */
static __init int bdi_class_init(void)
{
        int err = class_register(&bdi_class);

        if (!err)
                bdi_debug_init();

        return err;
}
postcore_initcall(bdi_class_init);

/*
 * Allocate the "writeback" workqueue used for bdi work items.
 * Returns 0 on success or -ENOMEM if the allocation failed.
 */
static int __init default_bdi_init(void)
{
        bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
                                 WQ_SYSFS, 0);

        return bdi_wq ? 0 : -ENOMEM;
}
subsys_initcall(default_bdi_init);

/*
 * Work function of wb->bw_dwork: recover the owning bdi_writeback from @work
 * and refresh its bandwidth estimate.
 */
static void wb_update_bandwidth_workfn(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct bdi_writeback *wb;

        wb = container_of(dwork, struct bdi_writeback, bw_dwork);
        wb_update_bandwidth(wb);
}

/*
 * Initial write bandwidth: 100 MB/s, expressed in pages
 * (100 << 20 bytes divided by the page size).
 */
#define INIT_BW                (100 << (20 - PAGE_SHIFT))

/*
 * Initialize @wb as a writeback context belonging to @bdi.
 *
 * Zeroes the structure, sets up its inode lists, locks, timestamps and
 * delayed works, seeds all bandwidth/ratelimit fields with INIT_BW and then
 * allocates the percpu completions and stat counters using @gfp.
 *
 * Returns 0 on success or the error of the failing percpu allocation.  If the
 * stat counters cannot be set up, the completions are torn down again.
 */
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
                   gfp_t gfp)
{
        int err;

        memset(wb, 0, sizeof(*wb));

        wb->bdi = bdi;
        wb->last_old_flush = jiffies;
        INIT_LIST_HEAD(&wb->b_dirty);
        INIT_LIST_HEAD(&wb->b_io);
        INIT_LIST_HEAD(&wb->b_more_io);
        INIT_LIST_HEAD(&wb->b_dirty_time);
        spin_lock_init(&wb->list_lock);

        atomic_set(&wb->writeback_inodes, 0);
        wb->bw_time_stamp = jiffies;
        wb->balanced_dirty_ratelimit = INIT_BW;
        wb->dirty_ratelimit = INIT_BW;
        wb->write_bandwidth = INIT_BW;
        wb->avg_write_bandwidth = INIT_BW;

        spin_lock_init(&wb->work_lock);
        INIT_LIST_HEAD(&wb->work_list);
        INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
        INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);

        err = fprop_local_init_percpu(&wb->completions, gfp);
        if (err)
                return err;

        err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS);
        if (err)
                fprop_local_destroy_percpu(&wb->completions);

        return err;
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);

/*
 * Unregister @wb: clear WB_registered, unlink it from its bdi's wb list and
 * flush its pending delayed work.  Does nothing if @wb was not registered
 * (or has already been shut down).
 */
static void wb_shutdown(struct bdi_writeback *wb)
{
        /* Make sure nobody queues further work */
        spin_lock_irq(&wb->work_lock);
        if (!test_and_clear_bit(WB_registered, &wb->state)) {
                spin_unlock_irq(&wb->work_lock);
                return;
        }
        spin_unlock_irq(&wb->work_lock);

        cgwb_remove_from_bdi_list(wb);
        /*
         * Drain work list and shutdown the delayed_work.  !WB_registered
         * tells wb_workfn() that @wb is dying and its work_list needs to
         * be drained no matter what.
         */
        mod_delayed_work(bdi_wq, &wb->dwork, 0);
        flush_delayed_work(&wb->dwork);
        WARN_ON(!list_empty(&wb->work_list));
        flush_delayed_work(&wb->bw_dwork);
}

/*
 * Release the percpu resources allocated by wb_init().  Warns if wb->dwork is
 * still pending at this point.
 */
static void wb_exit(struct bdi_writeback *wb)
{
        WARN_ON(delayed_work_pending(&wb->dwork));
        percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS);
        fprop_local_destroy_percpu(&wb->completions);
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/memcontrol.h>

/*
 * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
 * memcg->cgwb_list.  bdi->cgwb_tree is also RCU protected.
 */
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;

static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);

/*
 * RCU callback queued by cgwb_release_workfn(): tear down the wb's percpu
 * refcount and free the wb itself.
 */
static void cgwb_free_rcu(struct rcu_head *rcu_head)
{
        struct bdi_writeback *wb;

        wb = container_of(rcu_head, struct bdi_writeback, rcu);

        percpu_ref_exit(&wb->refcnt);
        kfree(wb);
}

/*
 * Work function queued by cgwb_release(): shut the wb down, drop the css and
 * bdi references it holds, release its percpu state and free it after an RCU
 * grace period.
 */
static void cgwb_release_workfn(struct work_struct *work)
{
        struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
                                                release_work);
        /* Cached because the bdi is still needed for bdi_put() after wb_exit(). */
        struct backing_dev_info *bdi = wb->bdi;

        mutex_lock(&wb->bdi->cgwb_release_mutex);
        wb_shutdown(wb);

        css_put(wb->memcg_css);
        css_put(wb->blkcg_css);
        mutex_unlock(&wb->bdi->cgwb_release_mutex);

        /* triggers blkg destruction if no online users left */
        blkcg_unpin_online(wb->blkcg_css);

        fprop_local_destroy_percpu(&wb->memcg_completions);

        /* offline_cgwbs is protected by cgwb_lock. */
        spin_lock_irq(&cgwb_lock);
        list_del(&wb->offline_node);
        spin_unlock_irq(&cgwb_lock);

        wb_exit(wb);
        bdi_put(bdi);
        WARN_ON_ONCE(!list_empty(&wb->b_attached));
        call_rcu(&wb->rcu, cgwb_free_rcu);
}

/*
 * Release callback of wb->refcnt (see percpu_ref_init() in cgwb_create()):
 * hand the actual teardown to cgwb_release_wq.
 */
static void cgwb_release(struct percpu_ref *refcnt)
{
        struct bdi_writeback *wb;

        wb = container_of(refcnt, struct bdi_writeback, refcnt);
        queue_work(cgwb_release_wq, &wb->release_work);
}

/*
 * Unlink @wb from its bdi's cgwb_tree and from the memcg and blkcg lists, put
 * it on offline_cgwbs and kill its percpu refcount.  The caller must hold
 * cgwb_lock.
 */
static void cgwb_kill(struct bdi_writeback *wb)
{
        lockdep_assert_held(&cgwb_lock);

        WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
        list_del(&wb->memcg_node);
        list_del(&wb->blkcg_node);
        list_add(&wb->offline_node, &offline_cgwbs);
        percpu_ref_kill(&wb->refcnt);
}

/* Unlink @wb from its bdi's RCU-traversed wb_list under cgwb_lock. */
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
        spin_lock_irq(&cgwb_lock);
        list_del_rcu(&wb->bdi_node);
        spin_unlock_irq(&cgwb_lock);
}

/*
 * Create and link the wb for @memcg_css on @bdi unless a matching one is
 * already present in bdi->cgwb_tree.
 *
 * An existing wb whose blkcg no longer matches is killed and replaced.
 * Returns 0 if a wb was linked or another one was already there (including a
 * lost insertion race, -EEXIST), -ENODEV if the bdi, memcg or blkcg is no
 * longer accepting new wbs, or the error of a failed allocation.
 */
static int cgwb_create(struct backing_dev_info *bdi,
                       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
        struct mem_cgroup *memcg;
        struct cgroup_subsys_state *blkcg_css;
        struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
        struct bdi_writeback *wb;
        unsigned long flags;
        int ret = 0;

        memcg = mem_cgroup_from_css(memcg_css);
        /* This reference is dropped at out_put on every path. */
        blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
        memcg_cgwb_list = &memcg->cgwb_list;
        blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);

        /* look up again under lock and discard on blkcg mismatch */
        spin_lock_irqsave(&cgwb_lock, flags);
        wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
        if (wb && wb->blkcg_css != blkcg_css) {
                cgwb_kill(wb);
                wb = NULL;
        }
        spin_unlock_irqrestore(&cgwb_lock, flags);
        if (wb)
                goto out_put;

        /* need to create a new one */
        wb = kmalloc(sizeof(*wb), gfp);
        if (!wb) {
                ret = -ENOMEM;
                goto out_put;
        }

        ret = wb_init(wb, bdi, gfp);
        if (ret)
                goto err_free;

        ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
        if (ret)
                goto err_wb_exit;

        ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
        if (ret)
                goto err_ref_exit;

        wb->memcg_css = memcg_css;
        wb->blkcg_css = blkcg_css;
        INIT_LIST_HEAD(&wb->b_attached);
        INIT_WORK(&wb->release_work, cgwb_release_workfn);
        set_bit(WB_registered, &wb->state);
        bdi_get(bdi);

        /*
         * The root wb determines the registered state of the whole bdi and
         * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
         * whether they're still online.  Don't link @wb if any is dead.
         * See wb_memcg_offline() and wb_blkcg_offline().
         */
        ret = -ENODEV;
        spin_lock_irqsave(&cgwb_lock, flags);
        if (test_bit(WB_registered, &bdi->wb.state) &&
            blkcg_cgwb_list->next && memcg_cgwb_list->next) {
                /* we might have raced another instance of this function */
                ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
                if (!ret) {
                        list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
                        list_add(&wb->memcg_node, memcg_cgwb_list);
                        list_add(&wb->blkcg_node, blkcg_cgwb_list);
                        blkcg_pin_online(blkcg_css);
                        css_get(memcg_css);
                        css_get(blkcg_css);
                }
        }
        spin_unlock_irqrestore(&cgwb_lock, flags);
        if (ret) {
                /* Losing the insertion race is not an error for the caller. */
                if (ret == -EEXIST)
                        ret = 0;
                goto err_fprop_exit;
        }
        goto out_put;

        /* Unwind in reverse order of setup. */
err_fprop_exit:
        bdi_put(bdi);
        fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
        percpu_ref_exit(&wb->refcnt);
err_wb_exit:
        wb_exit(wb);
err_free:
        kfree(wb);
out_put:
        css_put(blkcg_css);
        return ret;
}

/**
 * wb_get_lookup - get wb for a given memcg
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 *
 * Try to get the wb for @memcg_css on @bdi.  The returned wb has its
 * refcount incremented.
 *
 * This function uses css_get() on @memcg_css and thus expects its refcnt
 * to be positive on invocation.  IOW, rcu_read_lock() protection on
 * @memcg_css isn't enough.  try_get it before calling this function.
 *
 * A wb is keyed by its associated memcg.  As blkcg implicitly enables
 * memcg on the default hierarchy, memcg association is guaranteed to be
 * more specific (equal or descendant to the associated blkcg) and thus can
 * identify both the memcg and blkcg associations.
 *
 * Because the blkcg associated with a memcg may change as blkcg is enabled
 * and disabled closer to root in the hierarchy, each wb keeps track of
 * both the memcg and blkcg associated with it and verifies the blkcg on
 * each lookup.  On mismatch, the existing wb is discarded and a new one is
 * created.
 */
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css)
{
        struct bdi_writeback *wb;

        /* A css without a parent maps to the wb embedded in the bdi. */
        if (!memcg_css->parent)
                return &bdi->wb;

        rcu_read_lock();
        wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
        if (wb) {
                struct cgroup_subsys_state *blkcg_css;

                /* see whether the blkcg association has changed */
                blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
                /* Report "not found" on a blkcg mismatch or if no ref can be taken. */
                if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
                        wb = NULL;
                css_put(blkcg_css);
        }
        rcu_read_unlock();

        return wb;
}

/**
 * wb_get_create - get wb for a given memcg, create if necessary
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 * @gfp: allocation mask to use
 *
 * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
 * create one.  See wb_get_lookup() for more details.
 */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css,
                                    gfp_t gfp)
{
        struct bdi_writeback *wb;

        might_alloc(gfp);

        /*
         * Alternate between lookup and creation until a lookup succeeds or
         * creation reports an error; in the latter case NULL is returned.
         */
        for (;;) {
                wb = wb_get_lookup(bdi, memcg_css);
                if (wb)
                        break;
                if (cgwb_create(bdi, memcg_css, gfp))
                        break;
        }

        return wb;
}

/*
 * Set up the cgroup writeback state of @bdi (cgwb tree, release mutex,
 * wb_switch rwsem) and initialize the embedded wb, associating it with the
 * root memcg and root blkcg.  Returns 0 or the wb_init() error.
 */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
        int err;

        INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
        mutex_init(&bdi->cgwb_release_mutex);
        init_rwsem(&bdi->wb_switch_rwsem);

        err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
        if (err)
                return err;

        bdi->wb.memcg_css = &root_mem_cgroup->css;
        bdi->wb.blkcg_css = blkcg_root_css;

        return 0;
}

/*
 * Kill every wb in @bdi's cgwb_tree and shut down all wbs still linked on
 * bdi->wb_list.  Expects the embedded wb to be unregistered already (warns
 * otherwise).
 */
static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
        struct radix_tree_iter iter;
        void **slot;
        struct bdi_writeback *wb;

        WARN_ON(test_bit(WB_registered, &bdi->wb.state));

        spin_lock_irq(&cgwb_lock);
        radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
                cgwb_kill(*slot);
        spin_unlock_irq(&cgwb_lock);

        /* cgwb_release_workfn() takes the same mutex around wb_shutdown(). */
        mutex_lock(&bdi->cgwb_release_mutex);
        spin_lock_irq(&cgwb_lock);
        while (!list_empty(&bdi->wb_list)) {
                wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
                                      bdi_node);
                /* wb_shutdown() flushes work, so drop the spinlock around it. */
                spin_unlock_irq(&cgwb_lock);
                wb_shutdown(wb);
                spin_lock_irq(&cgwb_lock);
        }
        spin_unlock_irq(&cgwb_lock);
        mutex_unlock(&bdi->cgwb_release_mutex);
}

/*
 * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
 *
 * Try to release dying cgwbs by switching attached inodes to the nearest
 * living ancestor's writeback. Processed wbs are placed at the end
 * of the list to guarantee the forward progress.
 */
static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
{
        struct bdi_writeback *wb;
        LIST_HEAD(processed);

        spin_lock_irq(&cgwb_lock);

        while (!list_empty(&offline_cgwbs)) {
                wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
                                      offline_node);
                /* Park the wb on a private list so it is visited once per run. */
                list_move(&wb->offline_node, &processed);

                /*
                 * If wb is dirty, cleaning up the writeback by switching
                 * attached inodes will result in an effective removal of any
                 * bandwidth restrictions, which isn't the goal.  Instead,
                 * it can be postponed until the next time, when all io
                 * will be likely completed.  If in the meantime some inodes
                 * will get re-dirtied, they should be eventually switched to
                 * a new cgwb.
                 */
                if (wb_has_dirty_io(wb))
                        continue;

                if (!wb_tryget(wb))
                        continue;

                /*
                 * cgwb_lock is dropped while switching inodes; the reference
                 * taken by wb_tryget() is held across the unlocked section.
                 */
                spin_unlock_irq(&cgwb_lock);
                while (cleanup_offline_cgwb(wb))
                        cond_resched();
                spin_lock_irq(&cgwb_lock);

                wb_put(wb);
        }

        /* Put everything visited back at the tail of offline_cgwbs. */
        if (!list_empty(&processed))
                list_splice_tail(&processed, &offline_cgwbs);

        spin_unlock_irq(&cgwb_lock);
}

/**
 * wb_memcg_offline - kill all wb's associated with a memcg being offlined
 * @memcg: memcg being offlined
 *
 * Also prevents creation of any new wb's associated with @memcg.
 */
void wb_memcg_offline(struct mem_cgroup *memcg)
{
        struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
        struct bdi_writeback *wb, *next;

        spin_lock_irq(&cgwb_lock);
        /* cgwb_kill() unlinks wb->memcg_node, hence the _safe iterator. */
        list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
                cgwb_kill(wb);
        /* cgwb_create() treats a NULL ->next as "offline" and refuses to link. */
        memcg_cgwb_list->next = NULL;        /* prevent new wb's */
        spin_unlock_irq(&cgwb_lock);

        /* Kick the worker that tries to release the now-offline wbs. */
        queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
}

/**
 * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
 * @css: blkcg being offlined
 *
 * Also prevents creation of any new wb's associated with @blkcg.
 */
void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
        struct bdi_writeback *wb, *next;
        struct list_head *list = blkcg_get_cgwb_list(css);

        spin_lock_irq(&cgwb_lock);
        /* cgwb_kill() unlinks wb->blkcg_node, hence the _safe iterator. */
        list_for_each_entry_safe(wb, next, list, blkcg_node)
                cgwb_kill(wb);
        /* cgwb_create() treats a NULL ->next as "offline" and refuses to link. */
        list->next = NULL;        /* prevent new wb's */
        spin_unlock_irq(&cgwb_lock);
}

/* Link @bdi's embedded wb onto bdi->wb_list under cgwb_lock. */
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
        spin_lock_irq(&cgwb_lock);
        list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
        spin_unlock_irq(&cgwb_lock);
}

/*
 * Allocate the workqueue that runs cgwb release work items.
 * Returns 0 on success or -ENOMEM if the allocation failed.
 */
static int __init cgwb_init(void)
{
        /*
         * Release work items can arrive in large numbers and would swamp
         * system_wq, so they get a dedicated workqueue with limited
         * concurrency; running many of them in parallel gains nothing.
         */
        cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);

        return cgwb_release_wq ? 0 : -ENOMEM;
}
subsys_initcall(cgwb_init);

#else        /* CONFIG_CGROUP_WRITEBACK */

/*
 * !CONFIG_CGROUP_WRITEBACK variants: only the wb embedded in the bdi exists,
 * and the wb_list helpers below link/unlink without taking a lock.
 */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
        return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
        list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
        list_del_rcu(&wb->bdi_node);
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * Initialize the generic fields of @bdi (refcount, ratio limits, lists, wait
 * queue) and then its writeback state via cgwb_bdi_init().  Returns the
 * result of cgwb_bdi_init().
 */
int bdi_init(struct backing_dev_info *bdi)
{
        bdi->dev = NULL;

        kref_init(&bdi->refcnt);
        bdi->min_ratio = 0;
        bdi->max_ratio = 100 * BDI_RATIO_SCALE;
        bdi->max_prop_frac = FPROP_FRAC_BASE;
        INIT_LIST_HEAD(&bdi->bdi_list);
        INIT_LIST_HEAD(&bdi->wb_list);
        init_waitqueue_head(&bdi->wb_waitq);
        bdi->last_bdp_sleep = jiffies;

        return cgwb_bdi_init(bdi);
}

/*
 * Allocate a zeroed backing_dev_info on NUMA node @node_id, initialize it and
 * apply the default capabilities, readahead/io sizes and laptop-mode timer.
 * Returns the new bdi, or NULL if allocation or initialization failed.
 */
struct backing_dev_info *bdi_alloc(int node_id)
{
        struct backing_dev_info *bdi;

        bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
        if (!bdi)
                return NULL;

        if (bdi_init(bdi))
                goto err_free;

        bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
        bdi->ra_pages = VM_READAHEAD_PAGES;
        bdi->io_pages = VM_READAHEAD_PAGES;
        timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
        return bdi;

err_free:
        kfree(bdi);
        return NULL;
}
EXPORT_SYMBOL(bdi_alloc);

/*
 * Walk bdi_tree looking for @id.  Returns the link that either points at the
 * matching node or is the empty slot where a node with @id would be inserted.
 * If @parentp is not NULL it receives the last node visited (NULL for an
 * empty tree).  The caller must hold bdi_lock.
 */
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
        struct rb_node **link = &bdi_tree.rb_node;
        struct rb_node *parent = NULL;

        lockdep_assert_held(&bdi_lock);

        while (*link) {
                struct backing_dev_info *cur;

                parent = *link;
                cur = rb_entry(parent, struct backing_dev_info, rb_node);

                if (cur->id == id)
                        break;

                link = (cur->id > id) ? &parent->rb_left : &parent->rb_right;
        }

        if (parentp)
                *parentp = parent;
        return link;
}

/**
 * bdi_get_by_id - lookup and get bdi from its id
 * @id: bdi id to lookup
 *
 * Find bdi matching @id and get it.  Returns NULL if the matching bdi
 * doesn't exist or is already unregistered.
 */
struct backing_dev_info *bdi_get_by_id(u64 id)
{
        struct backing_dev_info *found = NULL;
        struct rb_node **link;

        spin_lock_bh(&bdi_lock);

        link = bdi_lookup_rb_node(id, NULL);
        if (*link) {
                /* Take the reference while still holding bdi_lock. */
                found = rb_entry(*link, struct backing_dev_info, rb_node);
                bdi_get(found);
        }

        spin_unlock_bh(&bdi_lock);

        return found;
}

/*
 * Register @bdi under the name produced by @fmt/@args: create its device in
 * bdi_class, link the embedded wb, create the debug entries, mark the wb
 * registered and insert the bdi into bdi_tree and bdi_list with a fresh id.
 *
 * Returns 0 on success, or immediately if the bdi already has a device;
 * otherwise the error from device_create().
 */
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
        struct device *dev;
        struct rb_node *parent, **p;

        if (bdi->dev)        /* The driver needs to use separate queues per device */
                return 0;

        vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
        dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
        if (IS_ERR(dev))
                return PTR_ERR(dev);

        cgwb_bdi_register(bdi);
        bdi->dev = dev;

        bdi_debug_register(bdi, dev_name(dev));
        set_bit(WB_registered, &bdi->wb.state);

        spin_lock_bh(&bdi_lock);

        /* Ids come from a monotonically incremented cursor. */
        bdi->id = ++bdi_id_cursor;

        p = bdi_lookup_rb_node(bdi->id, &parent);
        rb_link_node(&bdi->rb_node, parent, p);
        rb_insert_color(&bdi->rb_node, &bdi_tree);

        list_add_tail_rcu(&bdi->bdi_list, &bdi_list);

        spin_unlock_bh(&bdi_lock);

        trace_writeback_bdi_register(bdi);
        return 0;
}

/*
 * Variadic front end of bdi_register_va(): register @bdi under the name
 * formatted from @fmt and the following arguments.  Returns whatever
 * bdi_register_va() returns.
 */
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
        va_list ap;
        int err;

        va_start(ap, fmt);
        err = bdi_register_va(bdi, fmt, ap);
        va_end(ap);

        return err;
}
EXPORT_SYMBOL(bdi_register);

/*
 * Record @owner as the owning device of @bdi and take a reference on it;
 * bdi_unregister() drops that reference.  Warns once if an owner is already
 * set.
 */
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
        WARN_ON_ONCE(bdi->owner);
        bdi->owner = owner;
        get_device(owner);
}

/*
 * Remove bdi from bdi_tree and bdi_list under bdi_lock, then wait for an
 * expedited RCU grace period so that it is no longer visible to RCU readers
 * of bdi_list.
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
        spin_lock_bh(&bdi_lock);
        rb_erase(&bdi->rb_node, &bdi_tree);
        list_del_rcu(&bdi->bdi_list);
        spin_unlock_bh(&bdi_lock);

        synchronize_rcu_expedited();
}

/*
 * Undo bdi_register(): stop the laptop-mode timer, unlink @bdi from the
 * global lookup structures, shut down its embedded wb and any cgroup wbs,
 * reset a non-zero min ratio, and release the device and owner references.
 */
void bdi_unregister(struct backing_dev_info *bdi)
{
        del_timer_sync(&bdi->laptop_mode_wb_timer);

        /* make sure nobody finds us on the bdi_list anymore */
        bdi_remove_from_list(bdi);
        wb_shutdown(&bdi->wb);
        cgwb_bdi_unregister(bdi);

        /*
         * If this BDI's min ratio has been set, use bdi_set_min_ratio() to
         * update the global bdi_min_ratio.
         */
        if (bdi->min_ratio)
                bdi_set_min_ratio(bdi, 0);

        if (bdi->dev) {
                bdi_debug_unregister(bdi);
                device_unregister(bdi->dev);
                bdi->dev = NULL;
        }

        /* Drop the reference taken in bdi_set_owner(). */
        if (bdi->owner) {
                put_device(bdi->owner);
                bdi->owner = NULL;
        }
}
EXPORT_SYMBOL(bdi_unregister);

/*
 * kref release callback used by bdi_put(): free the embedded wb's resources
 * and the bdi itself.  Warns once if the bdi still looks registered or still
 * has a device.
 */
static void release_bdi(struct kref *ref)
{
        struct backing_dev_info *bdi;

        bdi = container_of(ref, struct backing_dev_info, refcnt);

        WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
        WARN_ON_ONCE(bdi->dev);

        wb_exit(&bdi->wb);
        kfree(bdi);
}

/* Drop a reference on @bdi; the last put frees it through release_bdi(). */
void bdi_put(struct backing_dev_info *bdi)
{
        kref_put(&bdi->refcnt, release_bdi);
}
EXPORT_SYMBOL(bdi_put);

/*
 * Return the backing_dev_info for @inode: noop_backing_dev_info for a NULL
 * inode, the disk's bdi for an inode on the block device superblock (only
 * with CONFIG_BLOCK), and the superblock's s_bdi otherwise.
 */
struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
        struct super_block *sb;

        if (!inode)
                return &noop_backing_dev_info;

        sb = inode->i_sb;

#ifdef CONFIG_BLOCK
        if (sb_is_blkdev_sb(sb)) {
                /* Block device inodes carry their own disk's bdi. */
                return I_BDEV(inode)->bd_disk->bdi;
        }
#endif

        return sb->s_bdi;
}
EXPORT_SYMBOL(inode_to_bdi);

/*
 * Return the registered name of @bdi, or bdi_unknown_name when @bdi is NULL
 * or has no device attached.
 */
const char *bdi_dev_name(struct backing_dev_info *bdi)
{
        if (bdi && bdi->dev)
                return bdi->dev_name;

        return bdi_unknown_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);











































































































































































































    1 

    5 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#ifndef BTRFS_LOCKING_H
#define BTRFS_LOCKING_H

#include <linux/atomic.h>
#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu_counter.h>
#include "extent_io.h"
#include "locking.h"

struct extent_buffer;
struct btrfs_path;
struct btrfs_root;

#define BTRFS_WRITE_LOCK 1
#define BTRFS_READ_LOCK 2

/*
 * Lockdep subclasses used when taking tree locks.
 *
 * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at
 * the time of this patch is 8, which is how many we use.  Keep this in mind if
 * you decide you want to add another subclass.
 */
enum btrfs_lock_nesting {
        BTRFS_NESTING_NORMAL,

        /*
         * When we COW a block we are holding the lock on the original block,
         * and since our lockdep maps are rootid+level, this confuses lockdep
         * when we lock the newly allocated COW'd block.  Handle this by having
         * a subclass for COW'ed blocks so that lockdep doesn't complain.
         */
        BTRFS_NESTING_COW,

        /*
         * Oftentimes we need to lock adjacent nodes on the same level while
         * still holding the lock on the original node we searched to, such as
         * for searching forward or for split/balance.
         *
         * Because of this we need to indicate to lockdep that this is
         * acceptable by having a different subclass for each of these
         * operations.
         */
        BTRFS_NESTING_LEFT,
        BTRFS_NESTING_RIGHT,

        /*
         * When splitting we will be holding a lock on the left/right node when
         * we need to cow that node, thus we need a new set of subclasses for
         * these two operations.
         */
        BTRFS_NESTING_LEFT_COW,
        BTRFS_NESTING_RIGHT_COW,

        /*
         * When splitting we may push nodes to the left or right, but still use
         * the subsequent nodes in our path, keeping our locks on those adjacent
         * blocks.  Thus when we go to allocate a new split block we've already
         * used up all of our available subclasses, so this subclass exists to
         * handle this case where we need to allocate a new split block.
         */
        BTRFS_NESTING_SPLIT,

        /*
         * When promoting a new block to a root we need to have a special
         * subclass so we don't confuse lockdep, as it will appear that we are
         * locking a higher level node before a lower level one.  Copying also
         * has this problem as it appears we're locking the same block again
         * when we make a snapshot of an existing root.
         */
        BTRFS_NESTING_NEW_ROOT,

        /*
         * We are limited to MAX_LOCKDEP_SUBCLASSES number of subclasses, so
         * add this in here and add a static_assert to keep us from going over
         * the limit.  As of this writing we're limited to 8, and we're
         * definitely using 8, hence this check to keep us from messing up in
         * the future.
         */
        BTRFS_NESTING_MAX,
};

/*
 * Transaction states that have a lockdep annotation.
 *
 * NOTE(review): presumably these values are the index passed as @i / @state
 * to the btrfs_state_change_map[] macros in this header - confirm against
 * the callers.
 */
enum btrfs_lockdep_trans_states {
        BTRFS_LOCKDEP_TRANS_COMMIT_PREP,
        BTRFS_LOCKDEP_TRANS_UNBLOCKED,
        BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
        BTRFS_LOCKDEP_TRANS_COMPLETED,
};

/*
 * Lockdep annotation for wait events.
 *
 * @owner:  The struct where the lockdep map is defined
 * @lock:   The lockdep map corresponding to a wait event
 *
 * This macro is used to annotate a wait event. In this case a thread acquires
 * the lockdep map as writer (exclusive lock) because it has to block until all
 * the threads that hold the lock as readers signal the condition for the wait
 * event and release their locks.
 *
 * @owner is parenthesized in the expansion so that any pointer expression
 * (not only a plain identifier or member access) binds correctly to '->'.
 * Note that @owner is still evaluated twice.
 */
#define btrfs_might_wait_for_event(owner, lock)                                 \
        do {                                                                    \
                rwsem_acquire(&(owner)->lock##_map, 0, 0, _THIS_IP_);           \
                rwsem_release(&(owner)->lock##_map, _THIS_IP_);                 \
        } while (0)

/*
 * Protection for the resource/condition of a wait event.
 *
 * @owner:  The struct where the lockdep map is defined
 * @lock:   The lockdep map corresponding to a wait event
 *
 * Many threads can modify the condition for the wait event at the same time
 * and signal the threads that block on the wait event. The threads that modify
 * the condition and do the signaling acquire the lock as readers (shared
 * lock).
 *
 * @owner is parenthesized in the expansion so that any pointer expression
 * binds correctly to '->'.
 */
#define btrfs_lockdep_acquire(owner, lock)                                      \
        rwsem_acquire_read(&(owner)->lock##_map, 0, 0, _THIS_IP_)

/*
 * Used after signaling the condition for a wait event to release the lockdep
 * map held by a reader thread.
 *
 * @owner is parenthesized in the expansion so that any pointer expression
 * binds correctly to '->'.
 */
#define btrfs_lockdep_release(owner, lock)                                      \
        rwsem_release(&(owner)->lock##_map, _THIS_IP_)

/*
 * Macros for the transaction states wait events, similar to the generic wait
 * event macros.
 *
 * @owner:  The struct holding the btrfs_state_change_map[] array
 * @i:      Index of the lockdep map inside btrfs_state_change_map[]
 *
 * @owner is parenthesized in every expansion so that any pointer expression
 * binds correctly to '->'.  btrfs_might_wait_for_state() evaluates its
 * arguments twice.
 */
#define btrfs_might_wait_for_state(owner, i)                                    \
        do {                                                                    \
                rwsem_acquire(&(owner)->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \
                rwsem_release(&(owner)->btrfs_state_change_map[i], _THIS_IP_);  \
        } while (0)

#define btrfs_trans_state_lockdep_acquire(owner, i)                             \
        rwsem_acquire_read(&(owner)->btrfs_state_change_map[i], 0, 0, _THIS_IP_)

#define btrfs_trans_state_lockdep_release(owner, i)                             \
        rwsem_release(&(owner)->btrfs_state_change_map[i], _THIS_IP_)

/*
 * Initialization of the lockdep map.
 *
 * @owner:  The struct where the lockdep map is defined
 * @lock:   Name of the map; <lock>_map is the member initialized, "<lock>"
 *          is the name string handed to lockdep_init_map(), and a static
 *          <lock>_key is declared as its class key
 *
 * @owner is parenthesized in the expansion so that any pointer expression
 * binds correctly to '->'.
 */
#define btrfs_lockdep_init_map(owner, lock)                                     \
        do {                                                                    \
                static struct lock_class_key lock##_key;                        \
                lockdep_init_map(&(owner)->lock##_map, #lock, &lock##_key, 0);  \
        } while (0)

/*
 * Initialization of the transaction states lockdep maps.
 *
 * @owner:  The struct holding the btrfs_state_change_map[] array
 * @lock:   Name used for the lockdep map ("<lock>") and for the static
 *          <lock>_key class key declared by the macro
 * @state:  Index of the map inside btrfs_state_change_map[]
 *
 * @owner is parenthesized in the expansion so that any pointer expression
 * binds correctly to '->'.
 */
#define btrfs_state_lockdep_init_map(owner, lock, state)                        \
        do {                                                                    \
                static struct lock_class_key lock##_key;                        \
                lockdep_init_map(&(owner)->btrfs_state_change_map[state], #lock, \
                                 &lock##_key, 0);                               \
        } while (0)

/*
 * Compile-time check that the subclasses in enum btrfs_lock_nesting do not
 * exceed MAX_LOCKDEP_SUBCLASSES.
 */
static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
              "too many lock subclasses defined");

void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest);

/* Shorthand for btrfs_tree_lock_nested() with the BTRFS_NESTING_NORMAL subclass. */
static inline void btrfs_tree_lock(struct extent_buffer *eb)
{
        btrfs_tree_lock_nested(eb, BTRFS_NESTING_NORMAL);
}

void btrfs_tree_unlock(struct extent_buffer *eb);

void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest);

/* Shorthand for btrfs_tree_read_lock_nested() with the BTRFS_NESTING_NORMAL subclass. */
static inline void btrfs_tree_read_lock(struct extent_buffer *eb)
{
        btrfs_tree_read_lock_nested(eb, BTRFS_NESTING_NORMAL);
}

void btrfs_tree_read_unlock(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);

/*
 * With CONFIG_BTRFS_DEBUG, assert through lockdep that @eb->lock is held for
 * write; without it the helper compiles to nothing.
 */
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
{
        lockdep_assert_held_write(&eb->lock);
}
#else
static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
#endif

void btrfs_unlock_up_safe(struct btrfs_path *path, int level);

/*
 * Release the tree lock on @eb according to how it was taken.
 *
 * @rw must be BTRFS_WRITE_LOCK (calls btrfs_tree_unlock()) or
 * BTRFS_READ_LOCK (calls btrfs_tree_read_unlock()); any other value hits
 * BUG().
 */
static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
{
        switch (rw) {
        case BTRFS_WRITE_LOCK:
                btrfs_tree_unlock(eb);
                break;
        case BTRFS_READ_LOCK:
                btrfs_tree_read_unlock(eb);
                break;
        default:
                BUG();
        }
}

/*
 * State for the btrfs_drew_*() lock API declared below: two atomic counters
 * and one wait queue for each side.
 *
 * NOTE(review): the exact exclusion rules live in the implementation, which
 * is not visible from this header - confirm there before relying on them.
 */
struct btrfs_drew_lock {
        atomic_t readers;
        atomic_t writers;
        wait_queue_head_t pending_writers;
        wait_queue_head_t pending_readers;
};

void btrfs_drew_lock_init(struct btrfs_drew_lock *lock);
void btrfs_drew_write_lock(struct btrfs_drew_lock *lock);
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock);
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock);
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock);
void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock);

/*
 * Lockdep class helpers.  They are real functions only when
 * CONFIG_DEBUG_LOCK_ALLOC is set; otherwise they are empty inline stubs so
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level);
void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb);
#else
static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
                                        struct extent_buffer *eb, int level)
{
}
static inline void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root,
                                                   struct extent_buffer *eb)
{
}
#endif

#endif























































    1 


    1 
    1 
    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































    1 








    1 

























    2 




















    1 








    1 







    1 












    1 














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
// SPDX-License-Identifier: GPL-2.0
/*
 * Quota code necessary even when VFS quota support is not compiled
 * into the kernel.  The interesting stuff is over in dquot.c, here
 * we have symbols for initial quotactl(2) handling, the sysctl(2)
 * variables, etc - things needed even when quota support disabled.
 */

#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <asm/current.h>
#include <linux/blkdev.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/types.h>
#include <linux/mount.h>
#include <linux/writeback.h>
#include <linux/nospec.h>
#include "compat.h"
#include "../internal.h"

/*
 * Permission check for a quotactl command.
 *
 * Returns -EPERM when the command needs CAP_SYS_ADMIN and the caller lacks
 * it; otherwise returns whatever security_quotactl() returns.
 */
static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
                                     qid_t id)
{
        int need_admin = 1;

        switch (cmd) {
        /* these commands do not require any special privileges */
        case Q_GETFMT:
        case Q_SYNC:
        case Q_GETINFO:
        case Q_XGETQSTAT:
        case Q_XGETQSTATV:
        case Q_XQUOTASYNC:
                need_admin = 0;
                break;
        /* allow to query information for dquots we "own" */
        case Q_GETQUOTA:
        case Q_XGETQUOTA:
                if (type == USRQUOTA)
                        need_admin = !uid_eq(current_euid(),
                                             make_kuid(current_user_ns(), id));
                else if (type == GRPQUOTA)
                        need_admin = !in_egroup_p(make_kgid(current_user_ns(), id));
                break;
        default:
                break;
        }

        if (need_admin && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        return security_quotactl(cmd, type, id, sb);
}

/*
 * Sync one quota type on one superblock.  @arg points to the int quota type.
 * Superblocks without a ->quota_sync operation, or without that type set in
 * s_quota_types, are left alone.
 */
static void quota_sync_one(struct super_block *sb, void *arg)
{
        int type = *(int *)arg;

        if (!sb->s_qcop || !sb->s_qcop->quota_sync)
                return;
        if (!(sb->s_quota_types & (1 << type)))
                return;

        sb->s_qcop->quota_sync(sb, type);
}

/*
 * Sync quota @type on every superblock via iterate_supers(), provided
 * security_quotactl() allows Q_SYNC.  Returns the security hook's error, or 0.
 */
static int quota_sync_all(int type)
{
        int err = security_quotactl(Q_SYNC, type, 0, NULL);

        if (err)
                return err;

        iterate_supers(quota_sync_one, &type);
        return 0;
}

/*
 * Translate a quota type into its FS_QUOTA_*_ENFD flag.  Returns 0 for a type
 * other than USRQUOTA, GRPQUOTA or PRJQUOTA.
 */
unsigned int qtype_enforce_flag(int type)
{
        if (type == USRQUOTA)
                return FS_QUOTA_UDQ_ENFD;
        if (type == GRPQUOTA)
                return FS_QUOTA_GDQ_ENFD;
        if (type == PRJQUOTA)
                return FS_QUOTA_PDQ_ENFD;
        return 0;
}

/*
 * Turn quota on for @type.
 *
 * ->quota_enable is preferred when the filesystem provides it, and @path is
 * then not looked at.  Otherwise ->quota_on is used, after propagating an
 * error encoded in @path.  Returns -ENOSYS when neither operation exists.
 */
static int quota_quotaon(struct super_block *sb, int type, qid_t id,
                         const struct path *path)
{
        if (sb->s_qcop->quota_enable)
                return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type));

        if (!sb->s_qcop->quota_on)
                return -ENOSYS;
        if (IS_ERR(path))
                return PTR_ERR(path);

        return sb->s_qcop->quota_on(sb, type, id, path);
}

/*
 * Turn quota off for @type, preferring ->quota_disable over ->quota_off.
 * Returns -ENOSYS when the filesystem provides neither.
 */
static int quota_quotaoff(struct super_block *sb, int type)
{
        if (sb->s_qcop->quota_disable)
                return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type));

        if (!sb->s_qcop->quota_off)
                return -ENOSYS;

        return sb->s_qcop->quota_off(sb, type);
}

/*
 * Copy the format id (qf_fmt_id) of the active quota @type to user space.
 * Returns -ESRCH when that quota type is not active and -EFAULT when the copy
 * to @addr fails.
 */
static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
{
        __u32 fmt_id;

        if (!sb_has_quota_active(sb, type))
                return -ESRCH;

        fmt_id = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;

        return copy_to_user(addr, &fmt_id, sizeof(fmt_id)) ? -EFAULT : 0;
}

/*
 * Fill a struct if_dqinfo for quota @type from the filesystem's ->get_state
 * and copy it to @addr.
 *
 * Returns -ENOSYS without ->get_state, the error from ->get_state, -ESRCH
 * when accounting is not enabled for @type, or -EFAULT on a failed copy.
 */
static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
{
        struct qc_state qstate;
        struct qc_type_state *ts;
        struct if_dqinfo out;
        int err;

        if (!sb->s_qcop->get_state)
                return -ENOSYS;

        err = sb->s_qcop->get_state(sb, &qstate);
        if (err)
                return err;

        ts = &qstate.s_state[type];
        if (!(ts->flags & QCI_ACCT_ENABLED))
                return -ESRCH;

        /* Zero the whole structure before it is copied out to user space. */
        memset(&out, 0, sizeof(out));
        out.dqi_valid = IIF_ALL;
        out.dqi_bgrace = ts->spc_timelimit;
        out.dqi_igrace = ts->ino_timelimit;
        if (ts->flags & QCI_SYSFILE)
                out.dqi_flags |= DQF_SYS_FILE;
        if (ts->flags & QCI_ROOT_SQUASH)
                out.dqi_flags |= DQF_ROOT_SQUASH;

        return copy_to_user(addr, &out, sizeof(out)) ? -EFAULT : 0;
}

/*
 * Read a struct if_dqinfo from @addr, translate the fields marked valid into
 * a struct qc_info and hand it to ->set_info.
 *
 * Returns -EFAULT on a failed copy, -ENOSYS without ->set_info, and -EINVAL
 * for unknown bits in dqi_valid or (when IIF_FLAGS is set) in dqi_flags.
 */
static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
{
        struct if_dqinfo uinfo;
        struct qc_info kinfo;

        if (copy_from_user(&uinfo, addr, sizeof(uinfo)))
                return -EFAULT;
        if (!sb->s_qcop->set_info)
                return -ENOSYS;
        if (uinfo.dqi_valid & ~(IIF_FLAGS | IIF_BGRACE | IIF_IGRACE))
                return -EINVAL;
        if ((uinfo.dqi_valid & IIF_FLAGS) &&
            (uinfo.dqi_flags & ~DQF_SETINFO_MASK))
                return -EINVAL;

        memset(&kinfo, 0, sizeof(kinfo));

        if (uinfo.dqi_valid & IIF_FLAGS) {
                kinfo.i_fieldmask |= QC_FLAGS;
                if (uinfo.dqi_flags & DQF_ROOT_SQUASH)
                        kinfo.i_flags |= QCI_ROOT_SQUASH;
        }
        if (uinfo.dqi_valid & IIF_BGRACE) {
                kinfo.i_fieldmask |= QC_SPC_TIMER;
                kinfo.i_spc_timelimit = uinfo.dqi_bgrace;
        }
        if (uinfo.dqi_valid & IIF_IGRACE) {
                kinfo.i_fieldmask |= QC_INO_TIMER;
                kinfo.i_ino_timelimit = uinfo.dqi_igrace;
        }

        return sb->s_qcop->set_info(sb, type, &kinfo);
}

/*
 * Scale a count of quota blocks up by QIF_DQBLKSIZE_BITS (a left shift).
 * NOTE(review): presumably the result is a byte count - confirm against the
 * definition of QIF_DQBLKSIZE_BITS.
 */
static inline qsize_t qbtos(qsize_t blocks)
{
        return blocks << QIF_DQBLKSIZE_BITS;
}

/*
 * Inverse of qbtos(): convert @space to quota blocks, rounding up to the
 * next whole block of QIF_DQBLKSIZE.
 */
static inline qsize_t stoqb(qsize_t space)
{
        qsize_t rounded = space + QIF_DQBLKSIZE - 1;

        return rounded >> QIF_DQBLKSIZE_BITS;
}

/*
 * Translate a struct qc_dqblk into a struct if_dqblk.
 *
 * @dst is zeroed first, the two space limits are converted with stoqb(), the
 * remaining fields are copied as they are, and every field is marked valid
 * (QIF_ALL).
 */
static void copy_to_if_dqblk(struct if_dqblk *dst, struct qc_dqblk *src)
{
        memset(dst, 0, sizeof(*dst));

        /* Space: limits go through stoqb(), current usage is copied as is. */
        dst->dqb_bhardlimit = stoqb(src->d_spc_hardlimit);
        dst->dqb_bsoftlimit = stoqb(src->d_spc_softlimit);
        dst->dqb_curspace = src->d_space;
        dst->dqb_btime = src->d_spc_timer;

        /* Inodes. */
        dst->dqb_ihardlimit = src->d_ino_hardlimit;
        dst->dqb_isoftlimit = src->d_ino_softlimit;
        dst->dqb_curinodes = src->d_ino_count;
        dst->dqb_itime = src->d_ino_timer;

        dst->dqb_valid = QIF_ALL;
}

/*
 * Fetch the quota block for (@type, @id) through ->get_dqblk and copy it to
 * @addr as a struct if_dqblk.
 *
 * Returns -ENOSYS without ->get_dqblk, -EINVAL when the id has no mapping in
 * the superblock's user namespace, the error from ->get_dqblk, or -EFAULT on
 * a failed copy.
 */
static int quota_getquota(struct super_block *sb, int type, qid_t id,
                          void __user *addr)
{
        struct compat_if_dqblk __user *compat_dqblk;
        struct qc_dqblk kblk;
        struct if_dqblk ublk;
        struct kqid qid;
        int err;

        if (!sb->s_qcop->get_dqblk)
                return -ENOSYS;

        qid = make_kqid(current_user_ns(), type, id);
        if (!qid_has_mapping(sb->s_user_ns, qid))
                return -EINVAL;

        err = sb->s_qcop->get_dqblk(sb, qid, &kblk);
        if (err)
                return err;
        copy_to_if_dqblk(&ublk, &kblk);

        if (!compat_need_64bit_alignment_fixup())
                return copy_to_user(addr, &ublk, sizeof(ublk)) ? -EFAULT : 0;

        /*
         * Compat layout: copy sizeof(struct compat_if_dqblk) bytes, then
         * store dqb_valid separately at its position in the compat struct.
         */
        compat_dqblk = addr;
        if (copy_to_user(compat_dqblk, &ublk, sizeof(*compat_dqblk)))
                return -EFAULT;
        if (put_user(ublk.dqb_valid, &compat_dqblk->dqb_valid))
                return -EFAULT;
        return 0;
}

/*
 * Return quota for next active quota >= this id, if any exists,
 * otherwise return -ENOENT via ->get_nextdqblk
 *
 * ->get_nextdqblk receives a pointer to the kqid; the id copied out to user
 * space in dqb_id is read back from that kqid after the call.
 */
static int quota_getnextquota(struct super_block *sb, int type, qid_t id,
                          void __user *addr)
{
        struct if_nextdqblk ublk;
        struct qc_dqblk kblk;
        struct kqid qid;
        int err;

        if (!sb->s_qcop->get_nextdqblk)
                return -ENOSYS;

        qid = make_kqid(current_user_ns(), type, id);
        if (!qid_has_mapping(sb->s_user_ns, qid))
                return -EINVAL;

        err = sb->s_qcop->get_nextdqblk(sb, &qid, &kblk);
        if (err)
                return err;

        /* struct if_nextdqblk is a superset of struct if_dqblk */
        copy_to_if_dqblk((struct if_dqblk *)&ublk, &kblk);
        ublk.dqb_id = from_kqid(current_user_ns(), qid);

        return copy_to_user(addr, &ublk, sizeof(ublk)) ? -EFAULT : 0;
}

/*
 * Translate a struct if_dqblk into a struct qc_dqblk.
 *
 * The two space limits are converted with qbtos(); the other values are
 * copied as they are.  d_fieldmask is built from the QIF_* bits set in
 * src->dqb_valid.
 */
static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src)
{
        int mask = 0;

        if (src->dqb_valid & QIF_BLIMITS)
                mask |= QC_SPC_SOFT | QC_SPC_HARD;
        if (src->dqb_valid & QIF_SPACE)
                mask |= QC_SPACE;
        if (src->dqb_valid & QIF_ILIMITS)
                mask |= QC_INO_SOFT | QC_INO_HARD;
        if (src->dqb_valid & QIF_INODES)
                mask |= QC_INO_COUNT;
        if (src->dqb_valid & QIF_BTIME)
                mask |= QC_SPC_TIMER;
        if (src->dqb_valid & QIF_ITIME)
                mask |= QC_INO_TIMER;

        /* Space. */
        dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit);
        dst->d_spc_softlimit = qbtos(src->dqb_bsoftlimit);
        dst->d_space = src->dqb_curspace;
        dst->d_spc_timer = src->dqb_btime;

        /* Inodes. */
        dst->d_ino_hardlimit = src->dqb_ihardlimit;
        dst->d_ino_softlimit = src->dqb_isoftlimit;
        dst->d_ino_count = src->dqb_curinodes;
        dst->d_ino_timer = src->dqb_itime;

        dst->d_fieldmask = mask;
}

/*
 * Read a struct if_dqblk from @addr and apply it to (@type, @id) through
 * ->set_dqblk.
 *
 * The user copy happens first, so -EFAULT takes precedence over -ENOSYS
 * (no ->set_dqblk) and -EINVAL (id has no mapping in the superblock's user
 * namespace).
 */
static int quota_setquota(struct super_block *sb, int type, qid_t id,
                          void __user *addr)
{
        struct qc_dqblk kblk;
        struct if_dqblk ublk;
        struct kqid qid;

        if (!compat_need_64bit_alignment_fixup()) {
                if (copy_from_user(&ublk, addr, sizeof(ublk)))
                        return -EFAULT;
        } else {
                struct compat_if_dqblk __user *compat_dqblk = addr;

                /*
                 * Compat layout: read sizeof(struct compat_if_dqblk) bytes,
                 * then fetch dqb_valid from its position in the compat struct.
                 */
                if (copy_from_user(&ublk, compat_dqblk, sizeof(*compat_dqblk)) ||
                    get_user(ublk.dqb_valid, &compat_dqblk->dqb_valid))
                        return -EFAULT;
        }

        if (!sb->s_qcop->set_dqblk)
                return -ENOSYS;

        qid = make_kqid(current_user_ns(), type, id);
        if (!qid_has_mapping(sb->s_user_ns, qid))
                return -EINVAL;

        copy_from_if_dqblk(&kblk, &ublk);
        return sb->s_qcop->set_dqblk(sb, qid, &kblk);
}

/*
 * Read a __u32 flag word from @addr and pass it to ->quota_enable.
 * Returns -EFAULT on a failed copy, then -ENOSYS when the operation is
 * missing.
 */
static int quota_enable(struct super_block *sb, void __user *addr)
{
        __u32 uflags;

        if (copy_from_user(&uflags, addr, sizeof(uflags)))
                return -EFAULT;

        if (sb->s_qcop->quota_enable)
                return sb->s_qcop->quota_enable(sb, uflags);

        return -ENOSYS;
}

/*
 * Read a __u32 flag word from @addr and pass it to ->quota_disable.
 * Returns -EFAULT on a failed copy, then -ENOSYS when the operation is
 * missing.
 */
static int quota_disable(struct super_block *sb, void __user *addr)
{
        __u32 uflags;

        if (copy_from_user(&uflags, addr, sizeof(uflags)))
                return -EFAULT;

        if (sb->s_qcop->quota_disable)
                return sb->s_qcop->quota_disable(sb, uflags);

        return -ENOSYS;
}

static int quota_state_to_flags(struct qc_state *state)
{
        int flags = 0;

        if (state->s_state[USRQUOTA].flags & QCI_ACCT_ENABLED)
                flags |= FS_QUOTA_UDQ_ACCT;
        if (state->s_state[USRQUOTA].flags & QCI_LIMITS_ENFORCED)
                flags |= FS_QUOTA_UDQ_ENFD;
        if (state->s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED)
                flags |= FS_QUOTA_GDQ_ACCT;
        if (state->s_state[GRPQUOTA].flags & QCI_LIMITS_ENFORCED)
                flags |= FS_QUOTA_GDQ_ENFD;
        if (state->s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED)
                flags |= FS_QUOTA_PDQ_ACCT;
        if (state->s_state[PRJQUOTA].flags & QCI_LIMITS_ENFORCED)
                flags |= FS_QUOTA_PDQ_ENFD;
        return flags;
}

static int quota_getstate(struct super_block *sb, int type,
                          struct fs_quota_stat *fqs)
{
        struct qc_state state;
        int ret;

        memset(&state, 0, sizeof (struct qc_state));
        ret = sb->s_qcop->get_state(sb, &state);
        if (ret < 0)
                return ret;

        memset(fqs, 0, sizeof(*fqs));
        fqs->qs_version = FS_QSTAT_VERSION;
        fqs->qs_flags = quota_state_to_flags(&state);
        /* No quota enabled? */
        if (!fqs->qs_flags)
                return -ENOSYS;
        fqs->qs_incoredqs = state.s_incoredqs;

        fqs->qs_btimelimit = state.s_state[type].spc_timelimit;
        fqs->qs_itimelimit = state.s_state[type].ino_timelimit;
        fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
        fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit;
        fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit;

        /* Inodes may be allocated even if inactive; copy out if present */
        if (state.s_state[USRQUOTA].ino) {
                fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino;
                fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks;
                fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents;
        }
        if (state.s_state[GRPQUOTA].ino) {
                fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino;
                fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks;
                fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents;
        }
        if (state.s_state[PRJQUOTA].ino) {
                /*
                 * Q_XGETQSTAT doesn't have room for both group and project
                 * quotas.  So, allow the project quota values to be copied out
                 * only if there is no group quota information available.
                 */
                if (!(state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED)) {
                        fqs->qs_gquota.qfs_ino = state.s_state[PRJQUOTA].ino;
                        fqs->qs_gquota.qfs_nblks =
                                        state.s_state[PRJQUOTA].blocks;
                        fqs->qs_gquota.qfs_nextents =
                                        state.s_state[PRJQUOTA].nextents;
                }
        }
        return 0;
}

/*
 * Copy one fs_qfilestat to its compat layout in user space: a bulk copy of
 * sizeof(*to) bytes followed by an explicit store of qfs_nextents.
 * Returns 0 or -EFAULT.
 */
static int compat_copy_fs_qfilestat(struct compat_fs_qfilestat __user *to,
                struct fs_qfilestat *from)
{
        if (copy_to_user(to, from, sizeof(*to)))
                return -EFAULT;
        if (put_user(from->qfs_nextents, &to->qfs_nextents))
                return -EFAULT;
        return 0;
}

/*
 * Copy a struct fs_quota_stat to its compat layout in user space, field by
 * field.  Stops at the first failing store and returns -EFAULT; returns 0
 * when every field was written.
 */
static int compat_copy_fs_quota_stat(struct compat_fs_quota_stat __user *to,
                struct fs_quota_stat *from)
{
        /* Header. */
        if (put_user(from->qs_version, &to->qs_version))
                return -EFAULT;
        if (put_user(from->qs_flags, &to->qs_flags))
                return -EFAULT;
        if (put_user(from->qs_pad, &to->qs_pad))
                return -EFAULT;

        /* Per-file information. */
        if (compat_copy_fs_qfilestat(&to->qs_uquota, &from->qs_uquota))
                return -EFAULT;
        if (compat_copy_fs_qfilestat(&to->qs_gquota, &from->qs_gquota))
                return -EFAULT;

        /* Counters and limits. */
        if (put_user(from->qs_incoredqs, &to->qs_incoredqs))
                return -EFAULT;
        if (put_user(from->qs_btimelimit, &to->qs_btimelimit))
                return -EFAULT;
        if (put_user(from->qs_itimelimit, &to->qs_itimelimit))
                return -EFAULT;
        if (put_user(from->qs_rtbtimelimit, &to->qs_rtbtimelimit))
                return -EFAULT;
        if (put_user(from->qs_bwarnlimit, &to->qs_bwarnlimit))
                return -EFAULT;
        if (put_user(from->qs_iwarnlimit, &to->qs_iwarnlimit))
                return -EFAULT;

        return 0;
}

/*
 * Collect a struct fs_quota_stat with quota_getstate() and copy it to @addr,
 * using the compat layout when compat_need_64bit_alignment_fixup() says so.
 * Returns -ENOSYS without ->get_state, the error from quota_getstate(), or
 * -EFAULT on a failed copy.
 */
static int quota_getxstate(struct super_block *sb, int type, void __user *addr)
{
        struct fs_quota_stat fqs;
        int err;

        if (!sb->s_qcop->get_state)
                return -ENOSYS;

        err = quota_getstate(sb, type, &fqs);
        if (err)
                return err;

        if (!compat_need_64bit_alignment_fixup())
                return copy_to_user(addr, &fqs, sizeof(fqs)) ? -EFAULT : 0;

        return compat_copy_fs_quota_stat(addr, &fqs);
}

static int quota_getstatev(struct super_block *sb, int type,
                           struct fs_quota_statv *fqs)
{
        struct qc_state state;
        int ret;

        memset(&state, 0, sizeof (struct qc_state));
        ret = sb->s_qcop->get_state(sb, &state);
        if (ret < 0)
                return ret;

        memset(fqs, 0, sizeof(*fqs));
        fqs->qs_version = FS_QSTAT_VERSION;
        fqs->qs_flags = quota_state_to_flags(&state);
        /* No quota enabled? */
        if (!fqs->qs_flags)
                return -ENOSYS;
        fqs->qs_incoredqs = state.s_incoredqs;

        fqs->qs_btimelimit = state.s_state[type].spc_timelimit;
        fqs->qs_itimelimit = state.s_state[type].ino_timelimit;
        fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
        fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit;
        fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit;
        fqs->qs_rtbwarnlimit = state.s_state[type].rt_spc_warnlimit;

        /* Inodes may be allocated even if inactive; copy out if present */
        if (state.s_state[USRQUOTA].ino) {
                fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino;
                fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks;
                fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents;
        }
        if (state.s_state[GRPQUOTA].ino) {
                fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino;
                fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks;
                fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents;
        }
        if (state.s_state[PRJQUOTA].ino) {
                fqs->qs_pquota.qfs_ino = state.s_state[PRJQUOTA].ino;
                fqs->qs_pquota.qfs_nblks = state.s_state[PRJQUOTA].blocks;
                fqs->qs_pquota.qfs_nextents = state.s_state[PRJQUOTA].nextents;
        }
        return 0;
}

/*
 * Q_XGETQSTATV handler: read the requested structure version from @addr,
 * fill a struct fs_quota_statv via quota_getstatev() and copy it back.
 *
 * Returns 0 on success, -ENOSYS without a ->get_state() operation, -EFAULT
 * when user memory cannot be accessed, -EINVAL for an unsupported version,
 * or the error reported by quota_getstatev().
 */
static int quota_getxstatev(struct super_block *sb, int type, void __user *addr)
{
        struct fs_quota_statv reply;
        int err;

        if (!sb->s_qcop->get_state)
                return -ENOSYS;

        memset(&reply, 0, sizeof(reply));
        /* Just read qs_version: only the first byte is fetched */
        if (copy_from_user(&reply, addr, 1))
                return -EFAULT;

        /* If this kernel doesn't support user specified version, fail */
        if (reply.qs_version != FS_QSTATV_VERSION1)
                return -EINVAL;

        err = quota_getstatev(sb, type, &reply);
        if (err)
                return err;
        if (copy_to_user(addr, &reply, sizeof(reply)))
                return -EFAULT;
        return 0;
}

/*
 * XFS defines the BBTOB and BTOBB macros inside fs/xfs/ and we cannot move
 * them out of there as xfsprogs relies on the definitions being in that
 * header file. So just define the same functions here for quota purposes.
 *
 * XFS_BB_SHIFT is the shift used by quota_bbtob() and quota_btobb() below
 * to convert between bytes and units of 1 << 9 = 512 bytes.
 */
#define XFS_BB_SHIFT 9

/* Convert a count of (1 << XFS_BB_SHIFT)-byte blocks into bytes. */
static inline u64 quota_bbtob(u64 blocks)
{
        u64 bytes = blocks << XFS_BB_SHIFT;

        return bytes;
}

/*
 * Convert a byte count into (1 << XFS_BB_SHIFT)-byte blocks, rounding up.
 *
 * The round-up is done without adding to @bytes first, so byte counts
 * within one block of the u64 maximum do not wrap around to a tiny result.
 */
static inline u64 quota_btobb(u64 bytes)
{
        const u64 bb_mask = ((u64)1 << XFS_BB_SHIFT) - 1;

        return (bytes >> XFS_BB_SHIFT) + ((bytes & bb_mask) != 0);
}

/*
 * Build a 64-bit timer value from the split fields of @d.
 *
 * Without FS_DQ_BIGTIME in d->d_fieldmask only @timer is used; with it,
 * @timer_hi supplies the bits above the low 32.
 */
static inline s64 copy_from_xfs_dqblk_ts(const struct fs_disk_quota *d,
                __s32 timer, __s8 timer_hi)
{
        if (!(d->d_fieldmask & FS_DQ_BIGTIME))
                return timer;

        return (u32)timer | (s64)timer_hi << 32;
}

static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src)
{
        dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit);
        dst->d_spc_softlimit = quota_bbtob(src->d_blk_softlimit);
        dst->d_ino_hardlimit = src->d_ino_hardlimit;
        dst->d_ino_softlimit = src->d_ino_softlimit;
        dst->d_space = quota_bbtob(src->d_bcount);
        dst->d_ino_count = src->d_icount;
        dst->d_ino_timer = copy_from_xfs_dqblk_ts(src, src->d_itimer,
                                                  src->d_itimer_hi);
        dst->d_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_btimer,
                                                  src->d_btimer_hi);
        dst->d_ino_warns = src->d_iwarns;
        dst->d_spc_warns = src->d_bwarns;
        dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit);
        dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit);
        dst->d_rt_space = quota_bbtob(src->d_rtbcount);
        dst->d_rt_spc_timer = copy_from_xfs_dqblk_ts(src, src->d_rtbtimer,
                                                     src->d_rtbtimer_hi);
        dst->d_rt_spc_warns = src->d_rtbwarns;
        dst->d_fieldmask = 0;
        if (src->d_fieldmask & FS_DQ_ISOFT)
                dst->d_fieldmask |= QC_INO_SOFT;
        if (src->d_fieldmask & FS_DQ_IHARD)
                dst->d_fieldmask |= QC_INO_HARD;
        if (src->d_fieldmask & FS_DQ_BSOFT)
                dst->d_fieldmask |= QC_SPC_SOFT;
        if (src->d_fieldmask & FS_DQ_BHARD)
                dst->d_fieldmask |= QC_SPC_HARD;
        if (src->d_fieldmask & FS_DQ_RTBSOFT)
                dst->d_fieldmask |= QC_RT_SPC_SOFT;
        if (src->d_fieldmask & FS_DQ_RTBHARD)
                dst->d_fieldmask |= QC_RT_SPC_HARD;
        if (src->d_fieldmask & FS_DQ_BTIMER)
                dst->d_fieldmask |= QC_SPC_TIMER;
        if (src->d_fieldmask & FS_DQ_ITIMER)
                dst->d_fieldmask |= QC_INO_TIMER;
        if (src->d_fieldmask & FS_DQ_RTBTIMER)
                dst->d_fieldmask |= QC_RT_SPC_TIMER;
        if (src->d_fieldmask & FS_DQ_BWARNS)
                dst->d_fieldmask |= QC_SPC_WARNS;
        if (src->d_fieldmask & FS_DQ_IWARNS)
                dst->d_fieldmask |= QC_INO_WARNS;
        if (src->d_fieldmask & FS_DQ_RTBWARNS)
                dst->d_fieldmask |= QC_RT_SPC_WARNS;
        if (src->d_fieldmask & FS_DQ_BCOUNT)
                dst->d_fieldmask |= QC_SPACE;
        if (src->d_fieldmask & FS_DQ_ICOUNT)
                dst->d_fieldmask |= QC_INO_COUNT;
        if (src->d_fieldmask & FS_DQ_RTBCOUNT)
                dst->d_fieldmask |= QC_RT_SPACE;
}

/*
 * Build a struct qc_info from the timer and warning fields of a
 * struct fs_disk_quota: @dst is zeroed, the six limit fields are copied
 * and the FS_DQ_*TIMER / FS_DQ_*WARNS bits of src->d_fieldmask are mapped
 * to the matching QC_* bits in dst->i_fieldmask.
 */
static void copy_qcinfo_from_xfs_dqblk(struct qc_info *dst,
                                       struct fs_disk_quota *src)
{
        memset(dst, 0, sizeof(*dst));

        /* Time limits */
        dst->i_spc_timelimit = src->d_btimer;
        dst->i_ino_timelimit = src->d_itimer;
        dst->i_rt_spc_timelimit = src->d_rtbtimer;
        if (src->d_fieldmask & FS_DQ_BTIMER)
                dst->i_fieldmask |= QC_SPC_TIMER;
        if (src->d_fieldmask & FS_DQ_ITIMER)
                dst->i_fieldmask |= QC_INO_TIMER;
        if (src->d_fieldmask & FS_DQ_RTBTIMER)
                dst->i_fieldmask |= QC_RT_SPC_TIMER;

        /* Warning limits */
        dst->i_spc_warnlimit = src->d_bwarns;
        dst->i_ino_warnlimit = src->d_iwarns;
        dst->i_rt_spc_warnlimit = src->d_rtbwarns;
        if (src->d_fieldmask & FS_DQ_BWARNS)
                dst->i_fieldmask |= QC_SPC_WARNS;
        if (src->d_fieldmask & FS_DQ_IWARNS)
                dst->i_fieldmask |= QC_INO_WARNS;
        if (src->d_fieldmask & FS_DQ_RTBWARNS)
                dst->i_fieldmask |= QC_RT_SPC_WARNS;
}

/*
 * Q_XSETQLIM handler: read a struct fs_disk_quota from @addr and apply it
 * to quota @id of type @type through ->set_dqblk().
 *
 * When @id maps to 0 in the superblock's user namespace and timer or
 * warning fields are selected, those are first applied through
 * ->set_info() and then removed from the field mask.
 *
 * Returns -EFAULT, -ENOSYS, -EINVAL, or the result of ->set_info() /
 * ->set_dqblk().
 */
static int quota_setxquota(struct super_block *sb, int type, qid_t id,
                           void __user *addr)
{
        struct fs_disk_quota xfs_dq;
        struct qc_dqblk generic_dq;
        struct kqid qid;

        if (copy_from_user(&xfs_dq, addr, sizeof(xfs_dq)))
                return -EFAULT;
        if (!sb->s_qcop->set_dqblk)
                return -ENOSYS;

        qid = make_kqid(current_user_ns(), type, id);
        if (!qid_has_mapping(sb->s_user_ns, qid))
                return -EINVAL;

        /* Are we actually setting timer / warning limits for all users? */
        if (from_kqid(sb->s_user_ns, qid) == 0 &&
            (xfs_dq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) != 0) {
                struct qc_info info;
                int err;

                if (!sb->s_qcop->set_info)
                        return -EINVAL;

                copy_qcinfo_from_xfs_dqblk(&info, &xfs_dq);
                err = sb->s_qcop->set_info(sb, type, &info);
                if (err)
                        return err;

                /* These are already done */
                xfs_dq.d_fieldmask &= ~(FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK);
        }

        copy_from_xfs_dqblk(&generic_dq, &xfs_dq);
        return sb->s_qcop->set_dqblk(sb, qid, &generic_dq);
}

/*
 * Store @timer into the split timer fields of a struct fs_disk_quota.
 * *@timer_lo always receives the value; *@timer_hi is written only when
 * FS_DQ_BIGTIME is set in d->d_fieldmask.
 */
static inline void copy_to_xfs_dqblk_ts(const struct fs_disk_quota *d,
                __s32 *timer_lo, __s8 *timer_hi, s64 timer)
{
        *timer_lo = timer;

        if (!(d->d_fieldmask & FS_DQ_BIGTIME))
                return;

        *timer_hi = timer >> 32;
}

/* Return true when @timer cannot be represented as a signed 32-bit value. */
static inline bool want_bigtime(s64 timer)
{
        return !(timer >= S32_MIN && timer <= S32_MAX);
}

/*
 * Translate the generic struct qc_dqblk @src into the struct fs_disk_quota
 * @dst returned through the XFS quotactl interface.
 *
 * @dst is zeroed first. FS_DQ_BIGTIME is set in its field mask when any of
 * the three timers does not fit in 32 bits, which in turn makes
 * copy_to_xfs_dqblk_ts() fill the *_hi timer fields. Space values are
 * converted to blocks with quota_btobb().
 */
static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src,
                              int type, qid_t id)
{
        memset(dst, 0, sizeof(*dst));

        /* Must be decided before any timer is stored below */
        if (want_bigtime(src->d_ino_timer) ||
            want_bigtime(src->d_spc_timer) ||
            want_bigtime(src->d_rt_spc_timer))
                dst->d_fieldmask |= FS_DQ_BIGTIME;

        dst->d_version = FS_DQUOT_VERSION;
        dst->d_id = id;
        switch (type) {
        case USRQUOTA:
                dst->d_flags = FS_USER_QUOTA;
                break;
        case PRJQUOTA:
                dst->d_flags = FS_PROJ_QUOTA;
                break;
        default:
                dst->d_flags = FS_GROUP_QUOTA;
                break;
        }

        /* Regular space and inodes */
        dst->d_blk_hardlimit = quota_btobb(src->d_spc_hardlimit);
        dst->d_blk_softlimit = quota_btobb(src->d_spc_softlimit);
        dst->d_ino_hardlimit = src->d_ino_hardlimit;
        dst->d_ino_softlimit = src->d_ino_softlimit;
        dst->d_bcount = quota_btobb(src->d_space);
        dst->d_icount = src->d_ino_count;
        copy_to_xfs_dqblk_ts(dst, &dst->d_itimer, &dst->d_itimer_hi,
                             src->d_ino_timer);
        copy_to_xfs_dqblk_ts(dst, &dst->d_btimer, &dst->d_btimer_hi,
                             src->d_spc_timer);
        dst->d_iwarns = src->d_ino_warns;
        dst->d_bwarns = src->d_spc_warns;

        /* Realtime space */
        dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit);
        dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit);
        dst->d_rtbcount = quota_btobb(src->d_rt_space);
        copy_to_xfs_dqblk_ts(dst, &dst->d_rtbtimer, &dst->d_rtbtimer_hi,
                             src->d_rt_spc_timer);
        dst->d_rtbwarns = src->d_rt_spc_warns;
}

/*
 * Q_XGETQUOTA handler: fetch quota @id of type @type through ->get_dqblk()
 * and copy it to @addr as a struct fs_disk_quota.
 *
 * Returns 0 on success, -ENOSYS without ->get_dqblk(), -EINVAL when @id has
 * no mapping in the superblock's user namespace, the error returned by
 * ->get_dqblk(), or -EFAULT when the copy to user space fails.
 */
static int quota_getxquota(struct super_block *sb, int type, qid_t id,
                           void __user *addr)
{
        struct fs_disk_quota xfs_dq;
        struct qc_dqblk generic_dq;
        struct kqid qid;
        int err;

        if (!sb->s_qcop->get_dqblk)
                return -ENOSYS;

        qid = make_kqid(current_user_ns(), type, id);
        if (!qid_has_mapping(sb->s_user_ns, qid))
                return -EINVAL;

        err = sb->s_qcop->get_dqblk(sb, qid, &generic_dq);
        if (err)
                return err;

        copy_to_xfs_dqblk(&xfs_dq, &generic_dq, type, id);
        if (copy_to_user(addr, &xfs_dq, sizeof(xfs_dq)))
                return -EFAULT;

        return 0;
}

/*
 * Return quota for next active quota >= this id, if any exists,
 * otherwise return -ENOENT via ->get_nextdqblk.
 *
 * ->get_nextdqblk() receives the kqid by reference; the id reported back to
 * user space is that kqid translated into the caller's user namespace after
 * the call.
 */
static int quota_getnextxquota(struct super_block *sb, int type, qid_t id,
                            void __user *addr)
{
        struct fs_disk_quota xfs_dq;
        struct qc_dqblk generic_dq;
        struct kqid qid;
        qid_t found_id;
        int err;

        if (!sb->s_qcop->get_nextdqblk)
                return -ENOSYS;

        qid = make_kqid(current_user_ns(), type, id);
        if (!qid_has_mapping(sb->s_user_ns, qid))
                return -EINVAL;

        err = sb->s_qcop->get_nextdqblk(sb, &qid, &generic_dq);
        if (err)
                return err;

        found_id = from_kqid(current_user_ns(), qid);
        copy_to_xfs_dqblk(&xfs_dq, &generic_dq, type, found_id);
        if (copy_to_user(addr, &xfs_dq, sizeof(xfs_dq)))
                return -EFAULT;

        return 0;
}

/*
 * Q_XQUOTARM handler: read a 32-bit flags word from @addr and hand it to
 * ->rm_xquota().
 *
 * Returns -EFAULT when the flags cannot be read, -ENOSYS when the
 * filesystem has no ->rm_xquota(), otherwise the operation's result.
 */
static int quota_rmxquota(struct super_block *sb, void __user *addr)
{
        __u32 rm_flags;

        if (copy_from_user(&rm_flags, addr, sizeof(rm_flags)))
                return -EFAULT;

        if (sb->s_qcop->rm_xquota)
                return sb->s_qcop->rm_xquota(sb, rm_flags);

        return -ENOSYS;
}

/*
 * Copy parameters and call proper function.
 *
 * Checks that the filesystem has quota operations and supports quota type
 * @type, runs check_quotactl_permission(), then dispatches @cmd to its
 * handler. Unknown commands yield -EINVAL.
 */
static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
                       void __user *addr, const struct path *path)
{
        int ret;

        type = array_index_nospec(type, MAXQUOTAS);
        /*
         * Quota not supported on this fs? Check this before s_quota_types
         * since they needn't be set if quota is not supported at all.
         */
        if (!sb->s_qcop)
                return -ENOSYS;
        if (!(sb->s_quota_types & (1 << type)))
                return -EINVAL;

        ret = check_quotactl_permission(sb, type, cmd, id);
        if (ret < 0)
                return ret;

        switch (cmd) {
        case Q_QUOTAON:
                ret = quota_quotaon(sb, type, id, path);
                break;
        case Q_QUOTAOFF:
                ret = quota_quotaoff(sb, type);
                break;
        case Q_GETFMT:
                ret = quota_getfmt(sb, type, addr);
                break;
        case Q_GETINFO:
                ret = quota_getinfo(sb, type, addr);
                break;
        case Q_SETINFO:
                ret = quota_setinfo(sb, type, addr);
                break;
        case Q_GETQUOTA:
                ret = quota_getquota(sb, type, id, addr);
                break;
        case Q_GETNEXTQUOTA:
                ret = quota_getnextquota(sb, type, id, addr);
                break;
        case Q_SETQUOTA:
                ret = quota_setquota(sb, type, id, addr);
                break;
        case Q_SYNC:
                if (sb->s_qcop->quota_sync)
                        ret = sb->s_qcop->quota_sync(sb, type);
                else
                        ret = -ENOSYS;
                break;
        case Q_XQUOTAON:
                ret = quota_enable(sb, addr);
                break;
        case Q_XQUOTAOFF:
                ret = quota_disable(sb, addr);
                break;
        case Q_XQUOTARM:
                ret = quota_rmxquota(sb, addr);
                break;
        case Q_XGETQSTAT:
                ret = quota_getxstate(sb, type, addr);
                break;
        case Q_XGETQSTATV:
                ret = quota_getxstatev(sb, type, addr);
                break;
        case Q_XSETQLIM:
                ret = quota_setxquota(sb, type, id, addr);
                break;
        case Q_XGETQUOTA:
                ret = quota_getxquota(sb, type, id, addr);
                break;
        case Q_XGETNEXTQUOTA:
                ret = quota_getnextxquota(sb, type, id, addr);
                break;
        case Q_XQUOTASYNC:
                /* XFS quotas are fully coherent now, making this call a noop */
                ret = sb_rdonly(sb) ? -EROFS : 0;
                break;
        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

/*
 * Return 1 if 'cmd' will block on frozen filesystem, 0 for the commands
 * listed below.
 *
 * We cannot allow Q_GETQUOTA and Q_GETNEXTQUOTA without write access
 * as dquot_acquire() may allocate space for new structure and OCFS2
 * needs to increment on-disk use count.
 */
static int quotactl_cmd_write(int cmd)
{
        switch (cmd) {
        case Q_GETFMT:
        case Q_GETINFO:
        case Q_SYNC:
        case Q_XGETQSTAT:
        case Q_XGETQSTATV:
        case Q_XGETQUOTA:
        case Q_XGETNEXTQUOTA:
        case Q_XQUOTASYNC:
                return 0;
        default:
                return 1;
        }
}

/* Return true if quotactl command is manipulating quota on/off state */
static bool quotactl_cmd_onoff(int cmd)
{
        switch (cmd) {
        case Q_QUOTAON:
        case Q_QUOTAOFF:
        case Q_XQUOTAON:
        case Q_XQUOTAOFF:
                return true;
        default:
                return false;
        }
}

/*
 * look up a superblock on which quota ops will be performed
 * - use the name of a block device to find the superblock thereon
 *
 * @special: user-space pointer to the block device name
 * @cmd:     quotactl subcommand; it selects the mode passed to
 *           user_get_super() and whether a frozen filesystem is waited for
 *
 * Returns the superblock obtained from user_get_super(), or an ERR_PTR():
 * the error from getname() or lookup_bdev(), or -ENODEV when no superblock
 * is found for the device or the kernel is built without CONFIG_BLOCK.
 */
static struct super_block *quotactl_block(const char __user *special, int cmd)
{
#ifdef CONFIG_BLOCK
        struct super_block *sb;
        struct filename *tmp = getname(special);
        bool excl = false, thawed = false;
        int error;
        dev_t dev;

        if (IS_ERR(tmp))
                return ERR_CAST(tmp);
        /* The name is only needed to resolve the device number */
        error = lookup_bdev(tmp->name, &dev);
        putname(tmp);
        if (error)
                return ERR_PTR(error);

        /*
         * On/off commands request exclusive mode and an unfrozen fs; other
         * commands flagged by quotactl_cmd_write() only need it unfrozen.
         */
        if (quotactl_cmd_onoff(cmd)) {
                excl = true;
                thawed = true;
        } else if (quotactl_cmd_write(cmd)) {
                thawed = true;
        }

retry:
        sb = user_get_super(dev, excl);
        if (!sb)
                return ERR_PTR(-ENODEV);
        if (thawed && sb->s_writers.frozen != SB_UNFROZEN) {
                /* Release s_umount in the mode matching excl before waiting */
                if (excl)
                        up_write(&sb->s_umount);
                else
                        up_read(&sb->s_umount);
                /* Wait for sb to unfreeze */
                sb_start_write(sb);
                sb_end_write(sb);
                /* Drop this reference and look the superblock up again */
                put_super(sb);
                goto retry;
        }
        return sb;

#else
        return ERR_PTR(-ENODEV);
#endif
}

/*
 * This is the system call interface. This communicates with
 * the user-level programs. Currently this only supports diskquota
 * calls. Maybe we need to add the process quotas etc. in the future,
 * but we probably should use rlimits for that.
 *
 * @cmd:     subcommand in the bits above SUBCMDSHIFT, quota type in the
 *           bits selected by SUBCMDMASK
 * @special: block device name, or NULL (only accepted for Q_SYNC)
 * @id:      quota id the subcommand operates on
 * @addr:    subcommand-specific user buffer; for Q_QUOTAON it is the path
 *           handed to user_path_at()
 *
 * Returns -EINVAL for a quota type >= MAXQUOTAS, -ENODEV when @special is
 * NULL for anything but Q_SYNC, the error from quotactl_block(), or the
 * result of quota_sync_all() / do_quotactl().
 */
SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
                qid_t, id, void __user *, addr)
{
        uint cmds, type;
        struct super_block *sb = NULL;
        struct path path, *pathp = NULL;
        int ret;

        cmds = cmd >> SUBCMDSHIFT;
        type = cmd & SUBCMDMASK;

        if (type >= MAXQUOTAS)
                return -EINVAL;

        /*
         * As a special case Q_SYNC can be called without a specific device.
         * It will iterate all superblocks that have quota enabled and call
         * the sync action on each of them.
         */
        if (!special) {
                if (cmds == Q_SYNC)
                        return quota_sync_all(type);
                return -ENODEV;
        }

        /*
         * Path for quotaon has to be resolved before grabbing superblock
         * because that gets s_umount sem which is also possibly needed by path
         * resolution (think about autofs) and thus deadlocks could arise.
         */
        if (cmds == Q_QUOTAON) {
                ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
                /* A lookup failure is forwarded to do_quotactl() as an ERR_PTR */
                if (ret)
                        pathp = ERR_PTR(ret);
                else
                        pathp = &path;
        }

        sb = quotactl_block(special, cmds);
        if (IS_ERR(sb)) {
                ret = PTR_ERR(sb);
                goto out;
        }

        ret = do_quotactl(sb, type, cmds, id, addr, pathp);

        /* Release the superblock in the mode quotactl_block() requested */
        if (!quotactl_cmd_onoff(cmds))
                drop_super(sb);
        else
                drop_super_exclusive(sb);
out:
        /* Only a successfully resolved Q_QUOTAON path holds a reference */
        if (pathp && !IS_ERR(pathp))
                path_put(pathp);
        return ret;
}

/*
 * quotactl variant that identifies the filesystem by an open file
 * descriptor instead of a block device name.
 *
 * @fd:   file descriptor on the target filesystem
 * @cmd:  subcommand in the bits above SUBCMDSHIFT, quota type in the bits
 *        selected by SUBCMDMASK
 * @id:   quota id the subcommand operates on
 * @addr: subcommand-specific user buffer
 *
 * Commands flagged by quotactl_cmd_write() take write access to the mount
 * for the duration of the call; s_umount is taken for writing by on/off
 * commands and for reading by everything else. do_quotactl() is always
 * given ERR_PTR(-EINVAL) as its path argument.
 *
 * Returns -EBADF for an invalid @fd, -EINVAL for a quota type >= MAXQUOTAS,
 * the error from mnt_want_write(), or the result of do_quotactl().
 */
SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
                qid_t, id, void __user *, addr)
{
        unsigned int cmds = cmd >> SUBCMDSHIFT;
        unsigned int type = cmd & SUBCMDMASK;
        struct super_block *sb;
        int needs_write, is_onoff;
        struct fd f;
        int ret;

        f = fdget_raw(fd);
        if (!f.file)
                return -EBADF;

        if (type >= MAXQUOTAS) {
                ret = -EINVAL;
                goto out;
        }

        needs_write = quotactl_cmd_write(cmds);
        if (needs_write) {
                ret = mnt_want_write(f.file->f_path.mnt);
                if (ret)
                        goto out;
        }

        sb = f.file->f_path.mnt->mnt_sb;
        is_onoff = quotactl_cmd_onoff(cmds);
        if (is_onoff)
                down_write(&sb->s_umount);
        else
                down_read(&sb->s_umount);

        ret = do_quotactl(sb, type, cmds, id, addr, ERR_PTR(-EINVAL));

        if (is_onoff)
                up_write(&sb->s_umount);
        else
                up_read(&sb->s_umount);

        if (needs_write)
                mnt_drop_write(f.file->f_path.mnt);
out:
        fdput(f);
        return ret;
}



















































































































































   36 





    9 




   34 




















   27 





























   28 




   27 

   35 










   15 














   35 

























   16 



   15 










   16 














   16 











   16 




   37 









   35 


   35 
   38 





































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
#include <linux/jump_label.h>
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
#include <linux/sched/clock.h>

#include "internal.h"

/*
 * Number of stack entries captured by save_stack().
 *
 * TODO: teach PAGE_OWNER_STACK_DEPTH users (__dump_page_owner and
 * save_stack) to use off-stack temporary storage
 */
#define PAGE_OWNER_STACK_DEPTH (16)

/*
 * Owner record kept for a page; its size is registered via page_owner_ops.
 * The allocation-side fields are written by __update_page_owner_handle(),
 * the free-side fields by __update_page_owner_free_handle().
 */
struct page_owner {
        /* order the owner was recorded with */
        unsigned short order;
        /* migrate reason; __set_page_owner() stores -1 */
        short last_migrate_reason;
        /* gfp mask recorded with the allocation */
        gfp_t gfp_mask;
        /* stack depot handle recorded at allocation */
        depot_stack_handle_t handle;
        /* stack depot handle recorded at free */
        depot_stack_handle_t free_handle;
        /* timestamp recorded at allocation (local_clock() in __set_page_owner()) */
        u64 ts_nsec;
        /* timestamp recorded at free (local_clock() in __reset_page_owner()) */
        u64 free_ts_nsec;
        /* task name recorded at allocation */
        char comm[TASK_COMM_LEN];
        /* pid / tgid recorded at allocation */
        pid_t pid;
        pid_t tgid;
        /* pid / tgid recorded at free */
        pid_t free_pid;
        pid_t free_tgid;
};

/* Node of the singly linked list of stack records headed by stack_list */
struct stack {
        struct stack_record *stack_record;
        struct stack *next;
};
/* Static nodes for the dummy and failure stacks, linked in init_page_owner() */
static struct stack dummy_stack;
static struct stack failure_stack;
/* List head; add_stack_record_to_list() pushes new nodes at the front */
static struct stack *stack_list;
/* Held by add_stack_record_to_list() while it updates stack_list */
static DEFINE_SPINLOCK(stack_list_lock);

/* Set from the "page_owner" early parameter, see early_page_owner_param() */
static bool page_owner_enabled __initdata;
/* Enabled as the last step of init_page_owner() */
DEFINE_STATIC_KEY_FALSE(page_owner_inited);

/* Handles filled in by the register_*_stack() helpers below */
static depot_stack_handle_t dummy_handle;
static depot_stack_handle_t failure_handle;
static depot_stack_handle_t early_handle;

/* Called from init_page_owner(); defined later in this file */
static void init_early_allocated_pages(void);

/*
 * Mark the current task as executing page_owner code. save_stack() checks
 * this flag and returns dummy_handle instead of recording another stack.
 */
static inline void set_current_in_page_owner(void)
{
        /*
         * Avoid recursion.
         *
         * We might need to allocate more memory from page_owner code, so make
         * sure to signal it in order to avoid recursion.
         */
        current->in_page_owner = 1;
}

/* Clear the flag set by set_current_in_page_owner(). */
static inline void unset_current_in_page_owner(void)
{
        current->in_page_owner = 0;
}

/*
 * Handler for the "page_owner" early parameter: parse @buf as a boolean
 * into page_owner_enabled and, when that flag is set, request early
 * initialisation of the stack depot.
 *
 * Returns the result of kstrtobool().
 */
static int __init early_page_owner_param(char *buf)
{
        int err;

        err = kstrtobool(buf, &page_owner_enabled);
        if (page_owner_enabled)
                stack_depot_request_early_init();

        return err;
}
early_param("page_owner", early_page_owner_param);

/* ->need callback of page_owner_ops: report the page_owner_enabled flag. */
static __init bool need_page_owner(void)
{
        return page_owner_enabled;
}

/*
 * Capture up to four entries of the current stack trace and store them in
 * the stack depot with GFP_KERNEL, returning the resulting handle.
 *
 * NOTE(review): presumably __always_inline so that the trace starts in the
 * noinline register_*_stack() caller, giving each caller a distinct handle
 * -- confirm.
 */
static __always_inline depot_stack_handle_t create_dummy_stack(void)
{
        unsigned long trace[4];
        unsigned int depth = stack_trace_save(trace, ARRAY_SIZE(trace), 0);

        return stack_depot_save(trace, depth, GFP_KERNEL);
}

/* Create the handle save_stack() returns when called recursively. */
static noinline void register_dummy_stack(void)
{
        dummy_handle = create_dummy_stack();
}

/* Create the handle save_stack() falls back to when stack_depot_save() fails. */
static noinline void register_failure_stack(void)
{
        failure_handle = create_dummy_stack();
}

/*
 * Create early_handle; __reset_page_owner() skips the refcount decrement for
 * pages whose allocation handle equals it.
 */
static noinline void register_early_stack(void)
{
        early_handle = create_dummy_stack();
}

/*
 * ->init callback of page_owner_ops. Does nothing unless page_owner was
 * enabled on the command line. Otherwise it registers the three special
 * stack handles, processes the early allocated pages, puts the dummy and
 * failure stacks on stack_list and finally enables the page_owner_inited
 * static key.
 */
static __init void init_page_owner(void)
{
        if (!page_owner_enabled)
                return;

        register_dummy_stack();
        register_failure_stack();
        register_early_stack();
        init_early_allocated_pages();
        /* Initialize dummy and failure stacks and link them to stack_list */
        dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle);
        failure_stack.stack_record = __stack_depot_get_stack_record(failure_handle);
        /* Either lookup may have returned NULL; only touch valid records */
        if (dummy_stack.stack_record)
                refcount_set(&dummy_stack.stack_record->count, 1);
        if (failure_stack.stack_record)
                refcount_set(&failure_stack.stack_record->count, 1);
        /* Resulting list: stack_list -> dummy_stack -> failure_stack */
        dummy_stack.next = &failure_stack;
        stack_list = &dummy_stack;
        static_branch_enable(&page_owner_inited);
}

/*
 * page_ext client description for page_owner: each page gets room for one
 * struct page_owner, need_page_owner() decides whether the client is used
 * and init_page_owner() initialises it.
 */
struct page_ext_operations page_owner_ops = {
        .size = sizeof(struct page_owner),
        .need = need_page_owner,
        .init = init_page_owner,
        .need_shared_flags = true,
};

/* Return the struct page_owner stored in @page_ext for page_owner_ops. */
static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
{
        return page_ext_data(page_ext, &page_owner_ops);
}

/*
 * Capture the current stack trace (skipping two entries, at most
 * PAGE_OWNER_STACK_DEPTH deep) and store it in the stack depot using
 * allocation flags @flags.
 *
 * Returns dummy_handle when the task is already inside page_owner code,
 * failure_handle when the depot could not store the trace, and the depot
 * handle otherwise.
 */
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
        unsigned long trace[PAGE_OWNER_STACK_DEPTH];
        depot_stack_handle_t saved;
        unsigned int depth;

        /* Recursive entry from an allocation made by page_owner itself */
        if (current->in_page_owner)
                return dummy_handle;

        set_current_in_page_owner();
        depth = stack_trace_save(trace, ARRAY_SIZE(trace), 2);
        saved = stack_depot_save(trace, depth, flags);
        unset_current_in_page_owner();

        return saved ? saved : failure_handle;
}

/*
 * Allocate a list node for @stack_record and push it at the head of
 * stack_list. The node is allocated with gfp_nested_mask(@gfp_mask) while
 * the task is flagged as being inside page_owner; if the allocation fails
 * the record is silently left off the list.
 */
static void add_stack_record_to_list(struct stack_record *stack_record,
                                     gfp_t gfp_mask)
{
        struct stack *node;
        unsigned long irq_flags;

        set_current_in_page_owner();
        node = kmalloc(sizeof(*node), gfp_nested_mask(gfp_mask));
        unset_current_in_page_owner();
        if (!node)
                return;

        node->stack_record = stack_record;
        node->next = NULL;

        spin_lock_irqsave(&stack_list_lock, irq_flags);
        node->next = stack_list;
        /*
         * This pairs with smp_load_acquire() from function
         * stack_start(). This guarantees that stack_start()
         * will see an updated stack_list before starting to
         * traverse the list.
         */
        smp_store_release(&stack_list, node);
        spin_unlock_irqrestore(&stack_list_lock, irq_flags);
}

/*
 * Add @nr_base_pages to the refcount of the stack record behind @handle.
 * Does nothing when no record is found for the handle.
 *
 * New stack_record's that do not use STACK_DEPOT_FLAG_GET start with
 * REFCOUNT_SATURATED to catch spurious increments of their refcount.
 * Since we do not use STACK_DEPOT_FLAG_GET API, let us set a refcount of 1
 * ourselves; whoever wins that switch also adds the record to stack_list.
 */
static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
                                   int nr_base_pages)
{
        struct stack_record *rec = __stack_depot_get_stack_record(handle);
        int expected = REFCOUNT_SATURATED;

        if (!rec)
                return;

        /* The cmpxchg is only attempted when the count still looks fresh */
        if (refcount_read(&rec->count) == REFCOUNT_SATURATED &&
            atomic_try_cmpxchg_relaxed(&rec->count.refs, &expected, 1))
                add_stack_record_to_list(rec, gfp_mask);

        refcount_add(nr_base_pages, &rec->count);
}

/*
 * Subtract @nr_base_pages from the refcount of the stack record behind
 * @handle and warn if that brings the count down to zero. Does nothing when
 * no record is found for the handle.
 */
static void dec_stack_record_count(depot_stack_handle_t handle,
                                   int nr_base_pages)
{
        struct stack_record *rec;

        rec = __stack_depot_get_stack_record(handle);
        if (!rec)
                return;

        if (!refcount_sub_and_test(nr_base_pages, &rec->count))
                return;

        pr_warn("%s: refcount went to 0 for %u handle\n", __func__, handle);
}

/*
 * Record allocation-side owner information in the 1 << @order consecutive
 * page_ext entries starting at @page_ext, and mark each of them with
 * PAGE_EXT_OWNER and PAGE_EXT_OWNER_ALLOCATED.
 */
static inline void __update_page_owner_handle(struct page_ext *page_ext,
                                              depot_stack_handle_t handle,
                                              unsigned short order,
                                              gfp_t gfp_mask,
                                              short last_migrate_reason, u64 ts_nsec,
                                              pid_t pid, pid_t tgid, char *comm)
{
        struct page_owner *owner;
        int remaining = 1 << order;

        while (remaining-- > 0) {
                owner = get_page_owner(page_ext);

                owner->order = order;
                owner->gfp_mask = gfp_mask;
                owner->handle = handle;
                owner->last_migrate_reason = last_migrate_reason;
                owner->ts_nsec = ts_nsec;
                owner->pid = pid;
                owner->tgid = tgid;
                strscpy(owner->comm, comm, sizeof(owner->comm));

                __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
                __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);

                page_ext = page_ext_next(page_ext);
        }
}

/*
 * Record free-side owner information in the 1 << @order consecutive
 * page_ext entries starting at @page_ext.
 *
 * @handle:       stack handle of the free path. When non-zero it is stored
 *                as free_handle and PAGE_EXT_OWNER_ALLOCATED is cleared;
 *                when zero both are left untouched.
 * @pid, @tgid:   ids stored as free_pid / free_tgid
 * @free_ts_nsec: timestamp stored as free_ts_nsec
 *
 * The ids come from the parameters rather than from 'current', so a caller
 * that passes previously recorded values gets exactly those stored.
 */
static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
                                                   depot_stack_handle_t handle,
                                                   unsigned short order,
                                                   pid_t pid, pid_t tgid,
                                                   u64 free_ts_nsec)
{
        int i;
        struct page_owner *page_owner;

        for (i = 0; i < (1 << order); i++) {
                page_owner = get_page_owner(page_ext);
                /* Only __reset_page_owner() wants to clear the bit */
                if (handle) {
                        __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
                        page_owner->free_handle = handle;
                }
                page_owner->free_ts_nsec = free_ts_nsec;
                page_owner->free_pid = pid;
                page_owner->free_tgid = tgid;
                page_ext = page_ext_next(page_ext);
        }
}

void __reset_page_owner(struct page *page, unsigned short order)
{
        struct page_ext *page_ext;
        depot_stack_handle_t handle;
        depot_stack_handle_t alloc_handle;
        struct page_owner *page_owner;
        u64 free_ts_nsec = local_clock();

        page_ext = page_ext_get(page);
        if (unlikely(!page_ext))
                return;

        page_owner = get_page_owner(page_ext);
        alloc_handle = page_owner->handle;

        handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
        __update_page_owner_free_handle(page_ext, handle, order, current->pid,
                                        current->tgid, free_ts_nsec);
        page_ext_put(page_ext);

        if (alloc_handle != early_handle)
                /*
                 * early_handle is being set as a handle for all those
                 * early allocated pages. See init_pages_in_zone().
                 * Since their refcount is not being incremented because
                 * the machinery is not ready yet, we cannot decrement
                 * their refcount either.
                 */
                dec_stack_record_count(alloc_handle, 1 << order);
}

/*
 * Record the current task and stack as the owner of the order-@order block
 * starting at @page, then account the block's pages to that stack.
 *
 * The timestamp and the stack are captured before the page_ext lookup; if
 * the lookup fails nothing is recorded or accounted.
 */
noinline void __set_page_owner(struct page *page, unsigned short order,
                                        gfp_t gfp_mask)
{
        const u64 allocated_at = local_clock();
        const depot_stack_handle_t alloc_stack = save_stack(gfp_mask);
        struct page_ext *ext = page_ext_get(page);

        if (unlikely(!ext))
                return;

        /* -1 means "never migrated". */
        __update_page_owner_handle(ext, alloc_stack, order, gfp_mask, -1,
                                   allocated_at, current->pid,
                                   current->tgid, current->comm);
        page_ext_put(ext);

        inc_stack_record_count(alloc_stack, gfp_mask, 1 << order);
}

void __set_page_owner_migrate_reason(struct page *page, int reason)
{
        struct page_ext *page_ext = page_ext_get(page);
        struct page_owner *page_owner;

        if (unlikely(!page_ext))
                return;

        page_owner = get_page_owner(page_ext);
        page_owner->last_migrate_reason = reason;
        page_ext_put(page_ext);
}

void __split_page_owner(struct page *page, int old_order, int new_order)
{
        int i;
        struct page_ext *page_ext = page_ext_get(page);
        struct page_owner *page_owner;

        if (unlikely(!page_ext))
                return;

        for (i = 0; i < (1 << old_order); i++) {
                page_owner = get_page_owner(page_ext);
                page_owner->order = new_order;
                page_ext = page_ext_next(page_ext);
        }
        page_ext_put(page_ext);
}

void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
        int i;
        struct page_ext *old_ext;
        struct page_ext *new_ext;
        struct page_owner *old_page_owner;
        struct page_owner *new_page_owner;
        depot_stack_handle_t migrate_handle;

        old_ext = page_ext_get(&old->page);
        if (unlikely(!old_ext))
                return;

        new_ext = page_ext_get(&newfolio->page);
        if (unlikely(!new_ext)) {
                page_ext_put(old_ext);
                return;
        }

        old_page_owner = get_page_owner(old_ext);
        new_page_owner = get_page_owner(new_ext);
        migrate_handle = new_page_owner->handle;
        __update_page_owner_handle(new_ext, old_page_owner->handle,
                                   old_page_owner->order, old_page_owner->gfp_mask,
                                   old_page_owner->last_migrate_reason,
                                   old_page_owner->ts_nsec, old_page_owner->pid,
                                   old_page_owner->tgid, old_page_owner->comm);
        /*
         * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
         * will be freed after migration. Keep them until then as they may be
         * useful.
         */
        __update_page_owner_free_handle(new_ext, 0, old_page_owner->order,
                                        old_page_owner->free_pid,
                                        old_page_owner->free_tgid,
                                        old_page_owner->free_ts_nsec);
        /*
         * We linked the original stack to the new folio, we need to do the same
         * for the new one and the old folio otherwise there will be an imbalance
         * when subtracting those pages from the stack.
         */
        for (i = 0; i < (1 << new_page_owner->order); i++) {
                old_page_owner->handle = migrate_handle;
                old_ext = page_ext_next(old_ext);
                old_page_owner = get_page_owner(old_ext);
        }

        page_ext_put(new_ext);
        page_ext_put(old_ext);
}

/*
 * Count the pageblocks of @zone that contain at least one allocated page
 * whose migratetype (derived from the gfp_mask recorded by page_owner)
 * differs from the migratetype of the pageblock itself, and print one row of
 * per-migratetype counts to @m.
 *
 * A block is counted at most once: the scan of a block stops at the first
 * mismatching page. CMA pageblocks are accounted under MIGRATE_MOVABLE.
 */
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
                                       pg_data_t *pgdat, struct zone *zone)
{
        struct page *page;
        struct page_ext *page_ext;
        struct page_owner *page_owner;
        unsigned long pfn, block_end_pfn;
        unsigned long end_pfn = zone_end_pfn(zone);
        unsigned long count[MIGRATE_TYPES] = { 0, };
        int pageblock_mt, page_mt;
        int i;

        /* Scan block by block. First and last block may be incomplete */
        pfn = zone->zone_start_pfn;

        /*
         * Walk the zone in pageblock_nr_pages steps. If a page block spans
         * a zone boundary, it will be double counted between zones. This does
         * not matter as the mixed block count will still be correct
         */
        for (; pfn < end_pfn; ) {
                page = pfn_to_online_page(pfn);
                if (!page) {
                        /* No online page here: jump to the next MAX_ORDER_NR_PAGES boundary. */
                        pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
                        continue;
                }

                block_end_pfn = pageblock_end_pfn(pfn);
                block_end_pfn = min(block_end_pfn, end_pfn);

                pageblock_mt = get_pageblock_migratetype(page);

                for (; pfn < block_end_pfn; pfn++) {
                        /* The pageblock is online, no need to recheck. */
                        page = pfn_to_page(pfn);

                        if (page_zone(page) != zone)
                                continue;

                        if (PageBuddy(page)) {
                                unsigned long freepage_order;

                                /*
                                 * Free buddy block: skip over it. The order is
                                 * only trusted when it is within range; the
                                 * loop's pfn++ supplies the final step.
                                 */
                                freepage_order = buddy_order_unsafe(page);
                                if (freepage_order <= MAX_PAGE_ORDER)
                                        pfn += (1UL << freepage_order) - 1;
                                continue;
                        }

                        if (PageReserved(page))
                                continue;

                        page_ext = page_ext_get(page);
                        if (unlikely(!page_ext))
                                continue;

                        /* Only pages page_owner currently tracks as allocated matter. */
                        if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
                                goto ext_put_continue;

                        page_owner = get_page_owner(page_ext);
                        page_mt = gfp_migratetype(page_owner->gfp_mask);
                        if (pageblock_mt != page_mt) {
                                if (is_migrate_cma(pageblock_mt))
                                        count[MIGRATE_MOVABLE]++;
                                else
                                        count[pageblock_mt]++;

                                /* Block is mixed: count it once and move to the next block. */
                                pfn = block_end_pfn;
                                page_ext_put(page_ext);
                                break;
                        }
                        /* Skip the remaining pages of this allocation. */
                        pfn += (1UL << page_owner->order) - 1;
ext_put_continue:
                        page_ext_put(page_ext);
                }
        }

        /* Print counts */
        seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
        for (i = 0; i < MIGRATE_TYPES; i++)
                seq_printf(m, "%12lu ", count[i]);
        seq_putc(m, '\n');
}

/*
 * Append memory-cgroup information about @page to @kbuf.
 *
 * @kbuf:  output buffer
 * @count: size of @kbuf
 * @ret:   number of bytes already used in @kbuf
 * @page:  page to describe
 *
 * Returns the updated number of bytes used. Without CONFIG_MEMCG, or when
 * the page carries no memcg data, @ret is returned unchanged.
 */
static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
                                         struct page *page)
{
#ifdef CONFIG_MEMCG
        unsigned long data;
        struct mem_cgroup *owner_cg;
        char cg_name[80];

        rcu_read_lock();
        data = READ_ONCE(page->memcg_data);
        if (data) {
                if (data & MEMCG_DATA_OBJEXTS)
                        ret += scnprintf(kbuf + ret, count - ret,
                                        "Slab cache page\n");

                owner_cg = page_memcg_check(page);
                if (owner_cg) {
                        const bool offline =
                                !(owner_cg->css.flags & CSS_ONLINE);

                        cgroup_name(owner_cg->css.cgroup, cg_name,
                                    sizeof(cg_name));
                        ret += scnprintf(kbuf + ret, count - ret,
                                        "Charged %sto %smemcg %s\n",
                                        PageMemcgKmem(page) ? "(via objcg) " : "",
                                        offline ? "offline " : "",
                                        cg_name);
                }
        }
        rcu_read_unlock();
#endif /* CONFIG_MEMCG */

        return ret;
}

/*
 * Format the owner record of one page and copy it to user space.
 *
 * @buf:        user buffer to fill
 * @count:      size of @buf; at most PAGE_SIZE bytes are produced
 * @pfn:        page frame number of @page
 * @page:       page being described
 * @page_owner: owner record to print
 * @handle:     allocation stack handle to print
 *
 * Returns the number of bytes copied, -ENOMEM if the bounce buffer cannot be
 * allocated or the record does not fit in it, or -EFAULT if the copy to user
 * space fails.
 */
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
                struct page *page, struct page_owner *page_owner,
                depot_stack_handle_t handle)
{
        int len, block_mt, alloc_mt;
        int status = -ENOMEM;
        char *kbuf;

        count = min_t(size_t, count, PAGE_SIZE);
        kbuf = kmalloc(count, GFP_KERNEL);
        if (!kbuf)
                return -ENOMEM;

        len = scnprintf(kbuf, count,
                        "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns\n",
                        page_owner->order, page_owner->gfp_mask,
                        &page_owner->gfp_mask, page_owner->pid,
                        page_owner->tgid, page_owner->comm,
                        page_owner->ts_nsec);

        /* Mobility grouping: the page's own type versus its pageblock's. */
        block_mt = get_pageblock_migratetype(page);
        alloc_mt = gfp_migratetype(page_owner->gfp_mask);
        len += scnprintf(kbuf + len, count - len,
                        "PFN 0x%lx type %s Block %lu type %s Flags %pGp\n",
                        pfn,
                        migratetype_names[alloc_mt],
                        pfn >> pageblock_order,
                        migratetype_names[block_mt],
                        &page->flags);

        len += stack_depot_snprint(handle, kbuf + len, count - len, 0);
        if (len >= count)
                goto out_free;

        if (page_owner->last_migrate_reason != -1)
                len += scnprintf(kbuf + len, count - len,
                        "Page has been migrated, last migrate reason: %s\n",
                        migrate_reason_names[page_owner->last_migrate_reason]);

        len = print_page_owner_memcg(kbuf, count, len, page);

        /* Blank line that terminates the record. */
        len += snprintf(kbuf + len, count - len, "\n");
        if (len >= count)
                goto out_free;

        status = copy_to_user(buf, kbuf, len) ? -EFAULT : len;

out_free:
        kfree(kbuf);
        return status;
}

/*
 * Print the page_owner record of @page with pr_alert(): whether the page is
 * tracked as allocated or freed, the allocation details, the allocation and
 * free stack traces when present, and the last migrate reason if any.
 */
void __dump_page_owner(const struct page *page)
{
        struct page_ext *ext = page_ext_get((void *)page);
        struct page_owner *owner;
        depot_stack_handle_t stack;
        gfp_t gfp_mask;
        int mt;

        if (unlikely(!ext)) {
                pr_alert("There is not page extension available.\n");
                return;
        }

        owner = get_page_owner(ext);
        gfp_mask = owner->gfp_mask;
        mt = gfp_migratetype(gfp_mask);

        if (!test_bit(PAGE_EXT_OWNER, &ext->flags)) {
                pr_alert("page_owner info is not present (never set?)\n");
                goto out_put;
        }

        if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &ext->flags))
                pr_alert("page_owner tracks the page as freed\n");
        else
                pr_alert("page_owner tracks the page as allocated\n");

        pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n",
                 owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
                 owner->pid, owner->tgid, owner->comm,
                 owner->ts_nsec, owner->free_ts_nsec);

        /* Allocation stack. */
        stack = READ_ONCE(owner->handle);
        if (stack)
                stack_depot_print(stack);
        else
                pr_alert("page_owner allocation stack trace missing\n");

        /* Free stack. */
        stack = READ_ONCE(owner->free_handle);
        if (stack) {
                pr_alert("page last free pid %d tgid %d stack trace:\n",
                          owner->free_pid, owner->free_tgid);
                stack_depot_print(stack);
        } else {
                pr_alert("page_owner free stack trace missing\n");
        }

        if (owner->last_migrate_reason != -1)
                pr_alert("page has been migrated, last migrate reason: %s\n",
                        migrate_reason_names[owner->last_migrate_reason]);

out_put:
        page_ext_put(ext);
}

/*
 * read() handler: emit the owner record of the next allocated page.
 *
 * *ppos is used as a PFN cursor, not as a byte offset: the scan starts at
 * the PFN stored in *ppos (or at min_low_pfn when *ppos is 0) and, once a
 * page is chosen, *ppos is set to the PFN following it. Each call therefore
 * returns at most one record.
 *
 * Returns whatever print_page_owner() returns for the chosen page, 0 when
 * no further page up to max_pfn qualifies, or -EINVAL when page_owner has
 * not been initialised.
 */
static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        unsigned long pfn;
        struct page *page;
        struct page_ext *page_ext;
        struct page_owner *page_owner;
        depot_stack_handle_t handle;

        if (!static_branch_unlikely(&page_owner_inited))
                return -EINVAL;

        page = NULL;
        if (*ppos == 0)
                pfn = min_low_pfn;
        else
                pfn = *ppos;
        /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
        while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
                pfn++;

        /* Find an allocated page */
        for (; pfn < max_pfn; pfn++) {
                /*
                 * This temporary page_owner is required so
                 * that we can avoid the context switches while holding
                 * the rcu lock and copying the page owner information to
                 * user through copy_to_user() or GFP_KERNEL allocations.
                 */
                struct page_owner page_owner_tmp;

                /*
                 * If the new page is in a new MAX_ORDER_NR_PAGES area,
                 * validate the area as existing, skip it if not
                 */
                if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) {
                        pfn += MAX_ORDER_NR_PAGES - 1;
                        continue;
                }

                page = pfn_to_page(pfn);
                if (PageBuddy(page)) {
                        unsigned long freepage_order = buddy_order_unsafe(page);

                        /* Skip the free block only if the order read is in range. */
                        if (freepage_order <= MAX_PAGE_ORDER)
                                pfn += (1UL << freepage_order) - 1;
                        continue;
                }

                page_ext = page_ext_get(page);
                if (unlikely(!page_ext))
                        continue;

                /*
                 * Some pages could be missed by concurrent allocation or free,
                 * because we don't hold the zone lock.
                 */
                if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
                        goto ext_put_continue;

                /*
                 * Although we do have the info about past allocation of free
                 * pages, it's not relevant for current memory usage.
                 */
                if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
                        goto ext_put_continue;

                page_owner = get_page_owner(page_ext);

                /*
                 * Don't print "tail" pages of high-order allocations as that
                 * would inflate the stats.
                 */
                if (!IS_ALIGNED(pfn, 1 << page_owner->order))
                        goto ext_put_continue;

                /*
                 * Access to page_ext->handle isn't synchronous so we should
                 * be careful to access it.
                 */
                handle = READ_ONCE(page_owner->handle);
                if (!handle)
                        goto ext_put_continue;

                /* Record the next PFN to read in the file offset */
                *ppos = pfn + 1;

                /*
                 * Copy the record to the stack and drop the page_ext
                 * reference before print_page_owner() is called.
                 */
                page_owner_tmp = *page_owner;
                page_ext_put(page_ext);
                return print_page_owner(buf, count, pfn, page,
                                &page_owner_tmp, handle);
ext_put_continue:
                page_ext_put(page_ext);
        }

        return 0;
}

/*
 * llseek handler supporting only SEEK_SET and SEEK_CUR. The position is
 * stored without any range check; any other @orig yields -EINVAL and leaves
 * the position untouched.
 */
static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig)
{
        if (orig == SEEK_SET)
                file->f_pos = offset;
        else if (orig == SEEK_CUR)
                file->f_pos += offset;
        else
                return -EINVAL;

        return file->f_pos;
}

/*
 * Tag every page of @zone that is neither free (buddy) nor reserved and has
 * no page_owner record yet with early_handle, then log how many pages were
 * tagged.
 */
static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
        const unsigned long end_pfn = zone_end_pfn(zone);
        unsigned long pfn = zone->zone_start_pfn;
        unsigned long nr_early = 0;

        /* Walk the zone one pageblock at a time. */
        while (pfn < end_pfn) {
                unsigned long block_end;

                if (!pfn_valid(pfn)) {
                        pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
                        continue;
                }

                block_end = pageblock_end_pfn(pfn);
                block_end = min(block_end, end_pfn);

                for (; pfn < block_end; pfn++) {
                        struct page *page = pfn_to_page(pfn);
                        struct page_ext *ext;

                        if (page_zone(page) != zone)
                                continue;

                        /*
                         * zone->lock is not taken, so the buddy order is
                         * read with care. The worst case is skipping too
                         * far and missing some early allocated pages, which
                         * beats heavy lock contention.
                         */
                        if (PageBuddy(page)) {
                                unsigned long order = buddy_order_unsafe(page);

                                if (order > 0 && order <= MAX_PAGE_ORDER)
                                        pfn += (1UL << order) - 1;
                                continue;
                        }

                        if (PageReserved(page))
                                continue;

                        ext = page_ext_get(page);
                        if (unlikely(!ext))
                                continue;

                        /*
                         * An entry that already has an owner (possible with
                         * overlapping zones) is left alone; anything else is
                         * an early allocated page.
                         */
                        if (!test_bit(PAGE_EXT_OWNER, &ext->flags)) {
                                __update_page_owner_handle(ext, early_handle, 0, 0,
                                                           -1, local_clock(), current->pid,
                                                           current->tgid, current->comm);
                                nr_early++;
                        }
                        page_ext_put(ext);
                }
                cond_resched();
        }

        pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
                pgdat->node_id, zone->name, nr_early);
}

/* Run init_pages_in_zone() on every populated zone of node @pgdat. */
static void init_zones_in_node(pg_data_t *pgdat)
{
        int idx;

        for (idx = 0; idx < MAX_NR_ZONES; idx++) {
                struct zone *zone = &pgdat->node_zones[idx];

                if (populated_zone(zone))
                        init_pages_in_zone(pgdat, zone);
        }
}

static void init_early_allocated_pages(void)
{
        pg_data_t *pgdat;

        for_each_online_pgdat(pgdat)
                init_zones_in_node(pgdat);
}

/* File operations of the "page_owner" debugfs file created in pageowner_init(). */
static const struct file_operations proc_page_owner_operations = {
        .llseek = lseek_page_owner,
        .read   = read_page_owner,
};

/*
 * seq_file ->start: return the stack to resume from.
 *
 * A position of -1UL (set by stack_next() at the end of the list) ends the
 * iteration. Position 0 (re)starts from the head of stack_list; any other
 * position resumes from the element cached in m->private.
 */
static void *stack_start(struct seq_file *m, loff_t *ppos)
{
        struct stack *first;

        if (*ppos == -1UL)
                return NULL;

        if (*ppos != 0)
                return m->private;

        /*
         * Pairs with the smp_store_release() in add_stack_record_to_list(),
         * so that a consistent value of stack_list is observed.
         */
        first = smp_load_acquire(&stack_list);
        m->private = first;

        return first;
}

/*
 * seq_file ->next: step to the element after @v.
 *
 * The position is incremented while elements remain and set to -1UL once
 * the end of the list is reached; the new element (or NULL) is cached in
 * m->private for stack_start().
 */
static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
{
        struct stack *following = ((struct stack *)v)->next;

        if (following)
                *ppos = *ppos + 1;
        else
                *ppos = -1UL;

        m->private = following;

        return following;
}

/*
 * Minimum nr_base_pages a stack needs for stack_print() to show it. Read and
 * written through page_owner_threshold_get()/page_owner_threshold_set().
 */
static unsigned long page_owner_pages_threshold;

/*
 * seq_file ->show: print one stack and the number of base pages accounted
 * to it.
 *
 * Nothing is printed when the element has no stack record, when fewer than
 * one page is accounted to the stack, or when the page count is below
 * page_owner_pages_threshold. Always returns 0.
 */
static int stack_print(struct seq_file *m, void *v)
{
        int i, nr_base_pages;
        struct stack *stack = v;
        unsigned long *entries;
        unsigned long nr_entries;
        struct stack_record *stack_record = stack->stack_record;

        if (!stack_record)
                return 0;

        nr_entries = stack_record->size;
        entries = stack_record->entries;
        /*
         * NOTE(review): the "- 1" presumably discounts a baseline reference
         * held on the record itself; confirm where the count is initialised.
         */
        nr_base_pages = refcount_read(&stack_record->count) - 1;

        /*
         * The threshold can be rewritten concurrently by
         * page_owner_threshold_set(), which uses WRITE_ONCE(); pair it with
         * a marked read here as page_owner_threshold_get() already does.
         */
        if (nr_base_pages < 1 ||
            nr_base_pages < READ_ONCE(page_owner_pages_threshold))
                return 0;

        for (i = 0; i < nr_entries; i++)
                seq_printf(m, " %pS\n", (void *)entries[i]);
        seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages);

        return 0;
}

/*
 * seq_file ->stop: intentionally empty. stack_start() and stack_next() only
 * read list pointers and cache them in m->private, so there is nothing to
 * release here.
 */
static void stack_stop(struct seq_file *m, void *v)
{
}

/* seq_file iterator over the list of recorded stacks. */
static const struct seq_operations page_owner_stack_op = {
        .start = stack_start,
        .show  = stack_print,
        .next  = stack_next,
        .stop  = stack_stop,
};

/*
 * open() handler of the "show_stacks" file: attach the stack iterator to
 * @file, requesting zero bytes of private data.
 */
static int page_owner_stack_open(struct inode *inode, struct file *file)
{
        const int private_size = 0;

        return seq_open_private(file, &page_owner_stack_op, private_size);
}

/* File operations of the "show_stacks" debugfs file. */
static const struct file_operations page_owner_stack_operations = {
        .open    = page_owner_stack_open,
        .release = seq_release,
        .llseek  = seq_lseek,
        .read    = seq_read,
};

/* Attribute getter: report the current page-count threshold in *@val. */
static int page_owner_threshold_get(void *data, u64 *val)
{
        const unsigned long threshold = READ_ONCE(page_owner_pages_threshold);

        *val = threshold;

        return 0;
}

/* Attribute setter: replace the page-count threshold with @val. */
static int page_owner_threshold_set(void *data, u64 val)
{
        const unsigned long threshold = val;

        WRITE_ONCE(page_owner_pages_threshold, threshold);

        return 0;
}

/*
 * Attribute file operations for the threshold, built from the getter and
 * setter above; the value is formatted with "%llu".
 */
DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get,
                        &page_owner_threshold_set, "%llu");


/*
 * Late initcall: when page_owner is enabled, create its debugfs files:
 *   page_owner                          per-page owner records
 *   page_owner_stacks/show_stacks       recorded stacks with page counts
 *   page_owner_stacks/count_threshold   threshold used by show_stacks
 * Otherwise only log that page_owner is disabled. Always returns 0.
 */
static int __init pageowner_init(void)
{
        struct dentry *stacks_dir;

        if (static_branch_unlikely(&page_owner_inited)) {
                debugfs_create_file("page_owner", 0400, NULL, NULL,
                                    &proc_page_owner_operations);

                stacks_dir = debugfs_create_dir("page_owner_stacks", NULL);
                debugfs_create_file("show_stacks", 0400, stacks_dir, NULL,
                                    &page_owner_stack_operations);
                debugfs_create_file("count_threshold", 0600, stacks_dir, NULL,
                                    &proc_page_owner_threshold);
        } else {
                pr_info("page_owner is disabled\n");
        }

        return 0;
}
late_initcall(pageowner_init)


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



















    1 




























    1 








    1 

















    1 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
/*
 * linux/fs/nls/nls_cp936.c
 *
 * Charset cp936 translation tables.
 * This translation table was generated automatically; the
 * original table can be downloaded from the Microsoft website.
 * (http://www.microsoft.com/typography/unicode/unicodecp.htm)
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/nls.h>
#include <linux/errno.h>

/*
 * c2u_81 - 256-entry table of 16-bit values for the "81" page.
 *
 * Slots 0x00-0x3F are not listed and are therefore zero-initialised;
 * slots 0x7F and 0xFF are explicitly 0x0000.  All other slots hold
 * values in the range 0x4E02-0x4FA2.
 *
 * NOTE(review): judging by the name and the file header, this is
 * presumably indexed by the second byte of a cp936 sequence whose first
 * byte is 0x81, yielding a Unicode code point, with 0x0000 meaning
 * "no mapping" -- confirm against the lookup code.
 *
 * Each row below starts with the index of its first element and holds
 * eight consecutive entries.
 */
static const wchar_t c2u_81[256] = {
	[0x40] = 0x4E02, 0x4E04, 0x4E05, 0x4E06, 0x4E0F, 0x4E12, 0x4E17, 0x4E1F,
	[0x48] = 0x4E20, 0x4E21, 0x4E23, 0x4E26, 0x4E29, 0x4E2E, 0x4E2F, 0x4E31,
	[0x50] = 0x4E33, 0x4E35, 0x4E37, 0x4E3C, 0x4E40, 0x4E41, 0x4E42, 0x4E44,
	[0x58] = 0x4E46, 0x4E4A, 0x4E51, 0x4E55, 0x4E57, 0x4E5A, 0x4E5B, 0x4E62,
	[0x60] = 0x4E63, 0x4E64, 0x4E65, 0x4E67, 0x4E68, 0x4E6A, 0x4E6B, 0x4E6C,
	[0x68] = 0x4E6D, 0x4E6E, 0x4E6F, 0x4E72, 0x4E74, 0x4E75, 0x4E76, 0x4E77,
	[0x70] = 0x4E78, 0x4E79, 0x4E7A, 0x4E7B, 0x4E7C, 0x4E7D, 0x4E7F, 0x4E80,
	[0x78] = 0x4E81, 0x4E82, 0x4E83, 0x4E84, 0x4E85, 0x4E87, 0x4E8A, 0x0000,

	[0x80] = 0x4E90, 0x4E96, 0x4E97, 0x4E99, 0x4E9C, 0x4E9D, 0x4E9E, 0x4EA3,
	[0x88] = 0x4EAA, 0x4EAF, 0x4EB0, 0x4EB1, 0x4EB4, 0x4EB6, 0x4EB7, 0x4EB8,
	[0x90] = 0x4EB9, 0x4EBC, 0x4EBD, 0x4EBE, 0x4EC8, 0x4ECC, 0x4ECF, 0x4ED0,
	[0x98] = 0x4ED2, 0x4EDA, 0x4EDB, 0x4EDC, 0x4EE0, 0x4EE2, 0x4EE6, 0x4EE7,
	[0xA0] = 0x4EE9, 0x4EED, 0x4EEE, 0x4EEF, 0x4EF1, 0x4EF4, 0x4EF8, 0x4EF9,
	[0xA8] = 0x4EFA, 0x4EFC, 0x4EFE, 0x4F00, 0x4F02, 0x4F03, 0x4F04, 0x4F05,
	[0xB0] = 0x4F06, 0x4F07, 0x4F08, 0x4F0B, 0x4F0C, 0x4F12, 0x4F13, 0x4F14,
	[0xB8] = 0x4F15, 0x4F16, 0x4F1C, 0x4F1D, 0x4F21, 0x4F23, 0x4F28, 0x4F29,
	[0xC0] = 0x4F2C, 0x4F2D, 0x4F2E, 0x4F31, 0x4F33, 0x4F35, 0x4F37, 0x4F39,
	[0xC8] = 0x4F3B, 0x4F3E, 0x4F3F, 0x4F40, 0x4F41, 0x4F42, 0x4F44, 0x4F45,
	[0xD0] = 0x4F47, 0x4F48, 0x4F49, 0x4F4A, 0x4F4B, 0x4F4C, 0x4F52, 0x4F54,
	[0xD8] = 0x4F56, 0x4F61, 0x4F62, 0x4F66, 0x4F68, 0x4F6A, 0x4F6B, 0x4F6D,
	[0xE0] = 0x4F6E, 0x4F71, 0x4F72, 0x4F75, 0x4F77, 0x4F78, 0x4F79, 0x4F7A,
	[0xE8] = 0x4F7D, 0x4F80, 0x4F81, 0x4F82, 0x4F85, 0x4F86, 0x4F87, 0x4F8A,
	[0xF0] = 0x4F8C, 0x4F8E, 0x4F90, 0x4F92, 0x4F93, 0x4F95, 0x4F96, 0x4F98,
	[0xF8] = 0x4F99, 0x4F9A, 0x4F9C, 0x4F9E, 0x4F9F, 0x4FA1, 0x4FA2, 0x0000,
};

/*
 * c2u_82 - 256-entry table of 16-bit values for the "82" page.
 *
 * Slots 0x00-0x3F are not listed and are therefore zero-initialised;
 * slots 0x7F and 0xFF are explicitly 0x0000.  All other slots hold
 * values in the range 0x4FA4-0x50BC.
 *
 * NOTE(review): judging by the name and the file header, this is
 * presumably indexed by the second byte of a cp936 sequence whose first
 * byte is 0x82, yielding a Unicode code point, with 0x0000 meaning
 * "no mapping" -- confirm against the lookup code.
 *
 * Each row below starts with the index of its first element and holds
 * eight consecutive entries.
 */
static const wchar_t c2u_82[256] = {
	[0x40] = 0x4FA4, 0x4FAB, 0x4FAD, 0x4FB0, 0x4FB1, 0x4FB2, 0x4FB3, 0x4FB4,
	[0x48] = 0x4FB6, 0x4FB7, 0x4FB8, 0x4FB9, 0x4FBA, 0x4FBB, 0x4FBC, 0x4FBD,
	[0x50] = 0x4FBE, 0x4FC0, 0x4FC1, 0x4FC2, 0x4FC6, 0x4FC7, 0x4FC8, 0x4FC9,
	[0x58] = 0x4FCB, 0x4FCC, 0x4FCD, 0x4FD2, 0x4FD3, 0x4FD4, 0x4FD5, 0x4FD6,
	[0x60] = 0x4FD9, 0x4FDB, 0x4FE0, 0x4FE2, 0x4FE4, 0x4FE5, 0x4FE7, 0x4FEB,
	[0x68] = 0x4FEC, 0x4FF0, 0x4FF2, 0x4FF4, 0x4FF5, 0x4FF6, 0x4FF7, 0x4FF9,
	[0x70] = 0x4FFB, 0x4FFC, 0x4FFD, 0x4FFF, 0x5000, 0x5001, 0x5002, 0x5003,
	[0x78] = 0x5004, 0x5005, 0x5006, 0x5007, 0x5008, 0x5009, 0x500A, 0x0000,

	[0x80] = 0x500B, 0x500E, 0x5010, 0x5011, 0x5013, 0x5015, 0x5016, 0x5017,
	[0x88] = 0x501B, 0x501D, 0x501E, 0x5020, 0x5022, 0x5023, 0x5024, 0x5027,
	[0x90] = 0x502B, 0x502F, 0x5030, 0x5031, 0x5032, 0x5033, 0x5034, 0x5035,
	[0x98] = 0x5036, 0x5037, 0x5038, 0x5039, 0x503B, 0x503D, 0x503F, 0x5040,
	[0xA0] = 0x5041, 0x5042, 0x5044, 0x5045, 0x5046, 0x5049, 0x504A, 0x504B,
	[0xA8] = 0x504D, 0x5050, 0x5051, 0x5052, 0x5053, 0x5054, 0x5056, 0x5057,
	[0xB0] = 0x5058, 0x5059, 0x505B, 0x505D, 0x505E, 0x505F, 0x5060, 0x5061,
	[0xB8] = 0x5062, 0x5063, 0x5064, 0x5066, 0x5067, 0x5068, 0x5069, 0x506A,
	[0xC0] = 0x506B, 0x506D, 0x506E, 0x506F, 0x5070, 0x5071, 0x5072, 0x5073,
	[0xC8] = 0x5074, 0x5075, 0x5078, 0x5079, 0x507A, 0x507C, 0x507D, 0x5081,
	[0xD0] = 0x5082, 0x5083, 0x5084, 0x5086, 0x5087, 0x5089, 0x508A, 0x508B,
	[0xD8] = 0x508C, 0x508E, 0x508F, 0x5090, 0x5091, 0x5092, 0x5093, 0x5094,
	[0xE0] = 0x5095, 0x5096, 0x5097, 0x5098, 0x5099, 0x509A, 0x509B, 0x509C,
	[0xE8] = 0x509D, 0x509E, 0x509F, 0x50A0, 0x50A1, 0x50A2, 0x50A4, 0x50A6,
	[0xF0] = 0x50AA, 0x50AB, 0x50AD, 0x50AE, 0x50AF, 0x50B0, 0x50B1, 0x50B3,
	[0xF8] = 0x50B4, 0x50B5, 0x50B6, 0x50B7, 0x50B8, 0x50B9, 0x50BC, 0x0000,
};

static const wchar_t c2u_83[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x50BD,0x50BE,0x50BF,0x50C0,0x50C1,0x50C2,0x50C3,0x50C4,/* 0x40-0x47 */
        0x50C5,0x50C6,0x50C7,0x50C8,0x50C9,0x50CA,0x50CB,0x50CC,/* 0x48-0x4F */
        0x50CD,0x50CE,0x50D0,0x50D1,0x50D2,0x50D3,0x50D4,0x50D5,/* 0x50-0x57 */
        0x50D7,0x50D8,0x50D9,0x50DB,0x50DC,0x50DD,0x50DE,0x50DF,/* 0x58-0x5F */
        0x50E0,0x50E1,0x50E2,0x50E3,0x50E4,0x50E5,0x50E8,0x50E9,/* 0x60-0x67 */
        0x50EA,0x50EB,0x50EF,0x50F0,0x50F1,0x50F2,0x50F4,0x50F6,/* 0x68-0x6F */
        0x50F7,0x50F8,0x50F9,0x50FA,0x50FC,0x50FD,0x50FE,0x50FF,/* 0x70-0x77 */
        0x5100,0x5101,0x5102,0x5103,0x5104,0x5105,0x5108,0x0000,/* 0x78-0x7F */

        0x5109,0x510A,0x510C,0x510D,0x510E,0x510F,0x5110,0x5111,/* 0x80-0x87 */
        0x5113,0x5114,0x5115,0x5116,0x5117,0x5118,0x5119,0x511A,/* 0x88-0x8F */
        0x511B,0x511C,0x511D,0x511E,0x511F,0x5120,0x5122,0x5123,/* 0x90-0x97 */
        0x5124,0x5125,0x5126,0x5127,0x5128,0x5129,0x512A,0x512B,/* 0x98-0x9F */
        0x512C,0x512D,0x512E,0x512F,0x5130,0x5131,0x5132,0x5133,/* 0xA0-0xA7 */
        0x5134,0x5135,0x5136,0x5137,0x5138,0x5139,0x513A,0x513B,/* 0xA8-0xAF */
        0x513C,0x513D,0x513E,0x5142,0x5147,0x514A,0x514C,0x514E,/* 0xB0-0xB7 */
        0x514F,0x5150,0x5152,0x5153,0x5157,0x5158,0x5159,0x515B,/* 0xB8-0xBF */
        0x515D,0x515E,0x515F,0x5160,0x5161,0x5163,0x5164,0x5166,/* 0xC0-0xC7 */
        0x5167,0x5169,0x516A,0x516F,0x5172,0x517A,0x517E,0x517F,/* 0xC8-0xCF */
        0x5183,0x5184,0x5186,0x5187,0x518A,0x518B,0x518E,0x518F,/* 0xD0-0xD7 */
        0x5190,0x5191,0x5193,0x5194,0x5198,0x519A,0x519D,0x519E,/* 0xD8-0xDF */
        0x519F,0x51A1,0x51A3,0x51A6,0x51A7,0x51A8,0x51A9,0x51AA,/* 0xE0-0xE7 */
        0x51AD,0x51AE,0x51B4,0x51B8,0x51B9,0x51BA,0x51BE,0x51BF,/* 0xE8-0xEF */
        0x51C1,0x51C2,0x51C3,0x51C5,0x51C8,0x51CA,0x51CD,0x51CE,/* 0xF0-0xF7 */
        0x51D0,0x51D2,0x51D3,0x51D4,0x51D5,0x51D6,0x51D7,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_84[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x51D8,0x51D9,0x51DA,0x51DC,0x51DE,0x51DF,0x51E2,0x51E3,/* 0x40-0x47 */
        0x51E5,0x51E6,0x51E7,0x51E8,0x51E9,0x51EA,0x51EC,0x51EE,/* 0x48-0x4F */
        0x51F1,0x51F2,0x51F4,0x51F7,0x51FE,0x5204,0x5205,0x5209,/* 0x50-0x57 */
        0x520B,0x520C,0x520F,0x5210,0x5213,0x5214,0x5215,0x521C,/* 0x58-0x5F */
        0x521E,0x521F,0x5221,0x5222,0x5223,0x5225,0x5226,0x5227,/* 0x60-0x67 */
        0x522A,0x522C,0x522F,0x5231,0x5232,0x5234,0x5235,0x523C,/* 0x68-0x6F */
        0x523E,0x5244,0x5245,0x5246,0x5247,0x5248,0x5249,0x524B,/* 0x70-0x77 */
        0x524E,0x524F,0x5252,0x5253,0x5255,0x5257,0x5258,0x0000,/* 0x78-0x7F */

        0x5259,0x525A,0x525B,0x525D,0x525F,0x5260,0x5262,0x5263,/* 0x80-0x87 */
        0x5264,0x5266,0x5268,0x526B,0x526C,0x526D,0x526E,0x5270,/* 0x88-0x8F */
        0x5271,0x5273,0x5274,0x5275,0x5276,0x5277,0x5278,0x5279,/* 0x90-0x97 */
        0x527A,0x527B,0x527C,0x527E,0x5280,0x5283,0x5284,0x5285,/* 0x98-0x9F */
        0x5286,0x5287,0x5289,0x528A,0x528B,0x528C,0x528D,0x528E,/* 0xA0-0xA7 */
        0x528F,0x5291,0x5292,0x5294,0x5295,0x5296,0x5297,0x5298,/* 0xA8-0xAF */
        0x5299,0x529A,0x529C,0x52A4,0x52A5,0x52A6,0x52A7,0x52AE,/* 0xB0-0xB7 */
        0x52AF,0x52B0,0x52B4,0x52B5,0x52B6,0x52B7,0x52B8,0x52B9,/* 0xB8-0xBF */
        0x52BA,0x52BB,0x52BC,0x52BD,0x52C0,0x52C1,0x52C2,0x52C4,/* 0xC0-0xC7 */
        0x52C5,0x52C6,0x52C8,0x52CA,0x52CC,0x52CD,0x52CE,0x52CF,/* 0xC8-0xCF */
        0x52D1,0x52D3,0x52D4,0x52D5,0x52D7,0x52D9,0x52DA,0x52DB,/* 0xD0-0xD7 */
        0x52DC,0x52DD,0x52DE,0x52E0,0x52E1,0x52E2,0x52E3,0x52E5,/* 0xD8-0xDF */
        0x52E6,0x52E7,0x52E8,0x52E9,0x52EA,0x52EB,0x52EC,0x52ED,/* 0xE0-0xE7 */
        0x52EE,0x52EF,0x52F1,0x52F2,0x52F3,0x52F4,0x52F5,0x52F6,/* 0xE8-0xEF */
        0x52F7,0x52F8,0x52FB,0x52FC,0x52FD,0x5301,0x5302,0x5303,/* 0xF0-0xF7 */
        0x5304,0x5307,0x5309,0x530A,0x530B,0x530C,0x530E,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_85[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x5311,0x5312,0x5313,0x5314,0x5318,0x531B,0x531C,0x531E,/* 0x40-0x47 */
        0x531F,0x5322,0x5324,0x5325,0x5327,0x5328,0x5329,0x532B,/* 0x48-0x4F */
        0x532C,0x532D,0x532F,0x5330,0x5331,0x5332,0x5333,0x5334,/* 0x50-0x57 */
        0x5335,0x5336,0x5337,0x5338,0x533C,0x533D,0x5340,0x5342,/* 0x58-0x5F */
        0x5344,0x5346,0x534B,0x534C,0x534D,0x5350,0x5354,0x5358,/* 0x60-0x67 */
        0x5359,0x535B,0x535D,0x5365,0x5368,0x536A,0x536C,0x536D,/* 0x68-0x6F */
        0x5372,0x5376,0x5379,0x537B,0x537C,0x537D,0x537E,0x5380,/* 0x70-0x77 */
        0x5381,0x5383,0x5387,0x5388,0x538A,0x538E,0x538F,0x0000,/* 0x78-0x7F */

        0x5390,0x5391,0x5392,0x5393,0x5394,0x5396,0x5397,0x5399,/* 0x80-0x87 */
        0x539B,0x539C,0x539E,0x53A0,0x53A1,0x53A4,0x53A7,0x53AA,/* 0x88-0x8F */
        0x53AB,0x53AC,0x53AD,0x53AF,0x53B0,0x53B1,0x53B2,0x53B3,/* 0x90-0x97 */
        0x53B4,0x53B5,0x53B7,0x53B8,0x53B9,0x53BA,0x53BC,0x53BD,/* 0x98-0x9F */
        0x53BE,0x53C0,0x53C3,0x53C4,0x53C5,0x53C6,0x53C7,0x53CE,/* 0xA0-0xA7 */
        0x53CF,0x53D0,0x53D2,0x53D3,0x53D5,0x53DA,0x53DC,0x53DD,/* 0xA8-0xAF */
        0x53DE,0x53E1,0x53E2,0x53E7,0x53F4,0x53FA,0x53FE,0x53FF,/* 0xB0-0xB7 */
        0x5400,0x5402,0x5405,0x5407,0x540B,0x5414,0x5418,0x5419,/* 0xB8-0xBF */
        0x541A,0x541C,0x5422,0x5424,0x5425,0x542A,0x5430,0x5433,/* 0xC0-0xC7 */
        0x5436,0x5437,0x543A,0x543D,0x543F,0x5441,0x5442,0x5444,/* 0xC8-0xCF */
        0x5445,0x5447,0x5449,0x544C,0x544D,0x544E,0x544F,0x5451,/* 0xD0-0xD7 */
        0x545A,0x545D,0x545E,0x545F,0x5460,0x5461,0x5463,0x5465,/* 0xD8-0xDF */
        0x5467,0x5469,0x546A,0x546B,0x546C,0x546D,0x546E,0x546F,/* 0xE0-0xE7 */
        0x5470,0x5474,0x5479,0x547A,0x547E,0x547F,0x5481,0x5483,/* 0xE8-0xEF */
        0x5485,0x5487,0x5488,0x5489,0x548A,0x548D,0x5491,0x5493,/* 0xF0-0xF7 */
        0x5497,0x5498,0x549C,0x549E,0x549F,0x54A0,0x54A1,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_86[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x54A2,0x54A5,0x54AE,0x54B0,0x54B2,0x54B5,0x54B6,0x54B7,/* 0x40-0x47 */
        0x54B9,0x54BA,0x54BC,0x54BE,0x54C3,0x54C5,0x54CA,0x54CB,/* 0x48-0x4F */
        0x54D6,0x54D8,0x54DB,0x54E0,0x54E1,0x54E2,0x54E3,0x54E4,/* 0x50-0x57 */
        0x54EB,0x54EC,0x54EF,0x54F0,0x54F1,0x54F4,0x54F5,0x54F6,/* 0x58-0x5F */
        0x54F7,0x54F8,0x54F9,0x54FB,0x54FE,0x5500,0x5502,0x5503,/* 0x60-0x67 */
        0x5504,0x5505,0x5508,0x550A,0x550B,0x550C,0x550D,0x550E,/* 0x68-0x6F */
        0x5512,0x5513,0x5515,0x5516,0x5517,0x5518,0x5519,0x551A,/* 0x70-0x77 */
        0x551C,0x551D,0x551E,0x551F,0x5521,0x5525,0x5526,0x0000,/* 0x78-0x7F */

        0x5528,0x5529,0x552B,0x552D,0x5532,0x5534,0x5535,0x5536,/* 0x80-0x87 */
        0x5538,0x5539,0x553A,0x553B,0x553D,0x5540,0x5542,0x5545,/* 0x88-0x8F */
        0x5547,0x5548,0x554B,0x554C,0x554D,0x554E,0x554F,0x5551,/* 0x90-0x97 */
        0x5552,0x5553,0x5554,0x5557,0x5558,0x5559,0x555A,0x555B,/* 0x98-0x9F */
        0x555D,0x555E,0x555F,0x5560,0x5562,0x5563,0x5568,0x5569,/* 0xA0-0xA7 */
        0x556B,0x556F,0x5570,0x5571,0x5572,0x5573,0x5574,0x5579,/* 0xA8-0xAF */
        0x557A,0x557D,0x557F,0x5585,0x5586,0x558C,0x558D,0x558E,/* 0xB0-0xB7 */
        0x5590,0x5592,0x5593,0x5595,0x5596,0x5597,0x559A,0x559B,/* 0xB8-0xBF */
        0x559E,0x55A0,0x55A1,0x55A2,0x55A3,0x55A4,0x55A5,0x55A6,/* 0xC0-0xC7 */
        0x55A8,0x55A9,0x55AA,0x55AB,0x55AC,0x55AD,0x55AE,0x55AF,/* 0xC8-0xCF */
        0x55B0,0x55B2,0x55B4,0x55B6,0x55B8,0x55BA,0x55BC,0x55BF,/* 0xD0-0xD7 */
        0x55C0,0x55C1,0x55C2,0x55C3,0x55C6,0x55C7,0x55C8,0x55CA,/* 0xD8-0xDF */
        0x55CB,0x55CE,0x55CF,0x55D0,0x55D5,0x55D7,0x55D8,0x55D9,/* 0xE0-0xE7 */
        0x55DA,0x55DB,0x55DE,0x55E0,0x55E2,0x55E7,0x55E9,0x55ED,/* 0xE8-0xEF */
        0x55EE,0x55F0,0x55F1,0x55F4,0x55F6,0x55F8,0x55F9,0x55FA,/* 0xF0-0xF7 */
        0x55FB,0x55FC,0x55FF,0x5602,0x5603,0x5604,0x5605,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_87[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x5606,0x5607,0x560A,0x560B,0x560D,0x5610,0x5611,0x5612,/* 0x40-0x47 */
        0x5613,0x5614,0x5615,0x5616,0x5617,0x5619,0x561A,0x561C,/* 0x48-0x4F */
        0x561D,0x5620,0x5621,0x5622,0x5625,0x5626,0x5628,0x5629,/* 0x50-0x57 */
        0x562A,0x562B,0x562E,0x562F,0x5630,0x5633,0x5635,0x5637,/* 0x58-0x5F */
        0x5638,0x563A,0x563C,0x563D,0x563E,0x5640,0x5641,0x5642,/* 0x60-0x67 */
        0x5643,0x5644,0x5645,0x5646,0x5647,0x5648,0x5649,0x564A,/* 0x68-0x6F */
        0x564B,0x564F,0x5650,0x5651,0x5652,0x5653,0x5655,0x5656,/* 0x70-0x77 */
        0x565A,0x565B,0x565D,0x565E,0x565F,0x5660,0x5661,0x0000,/* 0x78-0x7F */

        0x5663,0x5665,0x5666,0x5667,0x566D,0x566E,0x566F,0x5670,/* 0x80-0x87 */
        0x5672,0x5673,0x5674,0x5675,0x5677,0x5678,0x5679,0x567A,/* 0x88-0x8F */
        0x567D,0x567E,0x567F,0x5680,0x5681,0x5682,0x5683,0x5684,/* 0x90-0x97 */
        0x5687,0x5688,0x5689,0x568A,0x568B,0x568C,0x568D,0x5690,/* 0x98-0x9F */
        0x5691,0x5692,0x5694,0x5695,0x5696,0x5697,0x5698,0x5699,/* 0xA0-0xA7 */
        0x569A,0x569B,0x569C,0x569D,0x569E,0x569F,0x56A0,0x56A1,/* 0xA8-0xAF */
        0x56A2,0x56A4,0x56A5,0x56A6,0x56A7,0x56A8,0x56A9,0x56AA,/* 0xB0-0xB7 */
        0x56AB,0x56AC,0x56AD,0x56AE,0x56B0,0x56B1,0x56B2,0x56B3,/* 0xB8-0xBF */
        0x56B4,0x56B5,0x56B6,0x56B8,0x56B9,0x56BA,0x56BB,0x56BD,/* 0xC0-0xC7 */
        0x56BE,0x56BF,0x56C0,0x56C1,0x56C2,0x56C3,0x56C4,0x56C5,/* 0xC8-0xCF */
        0x56C6,0x56C7,0x56C8,0x56C9,0x56CB,0x56CC,0x56CD,0x56CE,/* 0xD0-0xD7 */
        0x56CF,0x56D0,0x56D1,0x56D2,0x56D3,0x56D5,0x56D6,0x56D8,/* 0xD8-0xDF */
        0x56D9,0x56DC,0x56E3,0x56E5,0x56E6,0x56E7,0x56E8,0x56E9,/* 0xE0-0xE7 */
        0x56EA,0x56EC,0x56EE,0x56EF,0x56F2,0x56F3,0x56F6,0x56F7,/* 0xE8-0xEF */
        0x56F8,0x56FB,0x56FC,0x5700,0x5701,0x5702,0x5705,0x5707,/* 0xF0-0xF7 */
        0x570B,0x570C,0x570D,0x570E,0x570F,0x5710,0x5711,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_88[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x5712,0x5713,0x5714,0x5715,0x5716,0x5717,0x5718,0x5719,/* 0x40-0x47 */
        0x571A,0x571B,0x571D,0x571E,0x5720,0x5721,0x5722,0x5724,/* 0x48-0x4F */
        0x5725,0x5726,0x5727,0x572B,0x5731,0x5732,0x5734,0x5735,/* 0x50-0x57 */
        0x5736,0x5737,0x5738,0x573C,0x573D,0x573F,0x5741,0x5743,/* 0x58-0x5F */
        0x5744,0x5745,0x5746,0x5748,0x5749,0x574B,0x5752,0x5753,/* 0x60-0x67 */
        0x5754,0x5755,0x5756,0x5758,0x5759,0x5762,0x5763,0x5765,/* 0x68-0x6F */
        0x5767,0x576C,0x576E,0x5770,0x5771,0x5772,0x5774,0x5775,/* 0x70-0x77 */
        0x5778,0x5779,0x577A,0x577D,0x577E,0x577F,0x5780,0x0000,/* 0x78-0x7F */

        0x5781,0x5787,0x5788,0x5789,0x578A,0x578D,0x578E,0x578F,/* 0x80-0x87 */
        0x5790,0x5791,0x5794,0x5795,0x5796,0x5797,0x5798,0x5799,/* 0x88-0x8F */
        0x579A,0x579C,0x579D,0x579E,0x579F,0x57A5,0x57A8,0x57AA,/* 0x90-0x97 */
        0x57AC,0x57AF,0x57B0,0x57B1,0x57B3,0x57B5,0x57B6,0x57B7,/* 0x98-0x9F */
        0x57B9,0x57BA,0x57BB,0x57BC,0x57BD,0x57BE,0x57BF,0x57C0,/* 0xA0-0xA7 */
        0x57C1,0x57C4,0x57C5,0x57C6,0x57C7,0x57C8,0x57C9,0x57CA,/* 0xA8-0xAF */
        0x57CC,0x57CD,0x57D0,0x57D1,0x57D3,0x57D6,0x57D7,0x57DB,/* 0xB0-0xB7 */
        0x57DC,0x57DE,0x57E1,0x57E2,0x57E3,0x57E5,0x57E6,0x57E7,/* 0xB8-0xBF */
        0x57E8,0x57E9,0x57EA,0x57EB,0x57EC,0x57EE,0x57F0,0x57F1,/* 0xC0-0xC7 */
        0x57F2,0x57F3,0x57F5,0x57F6,0x57F7,0x57FB,0x57FC,0x57FE,/* 0xC8-0xCF */
        0x57FF,0x5801,0x5803,0x5804,0x5805,0x5808,0x5809,0x580A,/* 0xD0-0xD7 */
        0x580C,0x580E,0x580F,0x5810,0x5812,0x5813,0x5814,0x5816,/* 0xD8-0xDF */
        0x5817,0x5818,0x581A,0x581B,0x581C,0x581D,0x581F,0x5822,/* 0xE0-0xE7 */
        0x5823,0x5825,0x5826,0x5827,0x5828,0x5829,0x582B,0x582C,/* 0xE8-0xEF */
        0x582D,0x582E,0x582F,0x5831,0x5832,0x5833,0x5834,0x5836,/* 0xF0-0xF7 */
        0x5837,0x5838,0x5839,0x583A,0x583B,0x583C,0x583D,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_89[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x583E,0x583F,0x5840,0x5841,0x5842,0x5843,0x5845,0x5846,/* 0x40-0x47 */
        0x5847,0x5848,0x5849,0x584A,0x584B,0x584E,0x584F,0x5850,/* 0x48-0x4F */
        0x5852,0x5853,0x5855,0x5856,0x5857,0x5859,0x585A,0x585B,/* 0x50-0x57 */
        0x585C,0x585D,0x585F,0x5860,0x5861,0x5862,0x5863,0x5864,/* 0x58-0x5F */
        0x5866,0x5867,0x5868,0x5869,0x586A,0x586D,0x586E,0x586F,/* 0x60-0x67 */
        0x5870,0x5871,0x5872,0x5873,0x5874,0x5875,0x5876,0x5877,/* 0x68-0x6F */
        0x5878,0x5879,0x587A,0x587B,0x587C,0x587D,0x587F,0x5882,/* 0x70-0x77 */
        0x5884,0x5886,0x5887,0x5888,0x588A,0x588B,0x588C,0x0000,/* 0x78-0x7F */

        0x588D,0x588E,0x588F,0x5890,0x5891,0x5894,0x5895,0x5896,/* 0x80-0x87 */
        0x5897,0x5898,0x589B,0x589C,0x589D,0x58A0,0x58A1,0x58A2,/* 0x88-0x8F */
        0x58A3,0x58A4,0x58A5,0x58A6,0x58A7,0x58AA,0x58AB,0x58AC,/* 0x90-0x97 */
        0x58AD,0x58AE,0x58AF,0x58B0,0x58B1,0x58B2,0x58B3,0x58B4,/* 0x98-0x9F */
        0x58B5,0x58B6,0x58B7,0x58B8,0x58B9,0x58BA,0x58BB,0x58BD,/* 0xA0-0xA7 */
        0x58BE,0x58BF,0x58C0,0x58C2,0x58C3,0x58C4,0x58C6,0x58C7,/* 0xA8-0xAF */
        0x58C8,0x58C9,0x58CA,0x58CB,0x58CC,0x58CD,0x58CE,0x58CF,/* 0xB0-0xB7 */
        0x58D0,0x58D2,0x58D3,0x58D4,0x58D6,0x58D7,0x58D8,0x58D9,/* 0xB8-0xBF */
        0x58DA,0x58DB,0x58DC,0x58DD,0x58DE,0x58DF,0x58E0,0x58E1,/* 0xC0-0xC7 */
        0x58E2,0x58E3,0x58E5,0x58E6,0x58E7,0x58E8,0x58E9,0x58EA,/* 0xC8-0xCF */
        0x58ED,0x58EF,0x58F1,0x58F2,0x58F4,0x58F5,0x58F7,0x58F8,/* 0xD0-0xD7 */
        0x58FA,0x58FB,0x58FC,0x58FD,0x58FE,0x58FF,0x5900,0x5901,/* 0xD8-0xDF */
        0x5903,0x5905,0x5906,0x5908,0x5909,0x590A,0x590B,0x590C,/* 0xE0-0xE7 */
        0x590E,0x5910,0x5911,0x5912,0x5913,0x5917,0x5918,0x591B,/* 0xE8-0xEF */
        0x591D,0x591E,0x5920,0x5921,0x5922,0x5923,0x5926,0x5928,/* 0xF0-0xF7 */
        0x592C,0x5930,0x5932,0x5933,0x5935,0x5936,0x593B,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_8A[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x593D,0x593E,0x593F,0x5940,0x5943,0x5945,0x5946,0x594A,/* 0x40-0x47 */
        0x594C,0x594D,0x5950,0x5952,0x5953,0x5959,0x595B,0x595C,/* 0x48-0x4F */
        0x595D,0x595E,0x595F,0x5961,0x5963,0x5964,0x5966,0x5967,/* 0x50-0x57 */
        0x5968,0x5969,0x596A,0x596B,0x596C,0x596D,0x596E,0x596F,/* 0x58-0x5F */
        0x5970,0x5971,0x5972,0x5975,0x5977,0x597A,0x597B,0x597C,/* 0x60-0x67 */
        0x597E,0x597F,0x5980,0x5985,0x5989,0x598B,0x598C,0x598E,/* 0x68-0x6F */
        0x598F,0x5990,0x5991,0x5994,0x5995,0x5998,0x599A,0x599B,/* 0x70-0x77 */
        0x599C,0x599D,0x599F,0x59A0,0x59A1,0x59A2,0x59A6,0x0000,/* 0x78-0x7F */

        0x59A7,0x59AC,0x59AD,0x59B0,0x59B1,0x59B3,0x59B4,0x59B5,/* 0x80-0x87 */
        0x59B6,0x59B7,0x59B8,0x59BA,0x59BC,0x59BD,0x59BF,0x59C0,/* 0x88-0x8F */
        0x59C1,0x59C2,0x59C3,0x59C4,0x59C5,0x59C7,0x59C8,0x59C9,/* 0x90-0x97 */
        0x59CC,0x59CD,0x59CE,0x59CF,0x59D5,0x59D6,0x59D9,0x59DB,/* 0x98-0x9F */
        0x59DE,0x59DF,0x59E0,0x59E1,0x59E2,0x59E4,0x59E6,0x59E7,/* 0xA0-0xA7 */
        0x59E9,0x59EA,0x59EB,0x59ED,0x59EE,0x59EF,0x59F0,0x59F1,/* 0xA8-0xAF */
        0x59F2,0x59F3,0x59F4,0x59F5,0x59F6,0x59F7,0x59F8,0x59FA,/* 0xB0-0xB7 */
        0x59FC,0x59FD,0x59FE,0x5A00,0x5A02,0x5A0A,0x5A0B,0x5A0D,/* 0xB8-0xBF */
        0x5A0E,0x5A0F,0x5A10,0x5A12,0x5A14,0x5A15,0x5A16,0x5A17,/* 0xC0-0xC7 */
        0x5A19,0x5A1A,0x5A1B,0x5A1D,0x5A1E,0x5A21,0x5A22,0x5A24,/* 0xC8-0xCF */
        0x5A26,0x5A27,0x5A28,0x5A2A,0x5A2B,0x5A2C,0x5A2D,0x5A2E,/* 0xD0-0xD7 */
        0x5A2F,0x5A30,0x5A33,0x5A35,0x5A37,0x5A38,0x5A39,0x5A3A,/* 0xD8-0xDF */
        0x5A3B,0x5A3D,0x5A3E,0x5A3F,0x5A41,0x5A42,0x5A43,0x5A44,/* 0xE0-0xE7 */
        0x5A45,0x5A47,0x5A48,0x5A4B,0x5A4C,0x5A4D,0x5A4E,0x5A4F,/* 0xE8-0xEF */
        0x5A50,0x5A51,0x5A52,0x5A53,0x5A54,0x5A56,0x5A57,0x5A58,/* 0xF0-0xF7 */
        0x5A59,0x5A5B,0x5A5C,0x5A5D,0x5A5E,0x5A5F,0x5A60,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_8B[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x5A61,0x5A63,0x5A64,0x5A65,0x5A66,0x5A68,0x5A69,0x5A6B,/* 0x40-0x47 */
        0x5A6C,0x5A6D,0x5A6E,0x5A6F,0x5A70,0x5A71,0x5A72,0x5A73,/* 0x48-0x4F */
        0x5A78,0x5A79,0x5A7B,0x5A7C,0x5A7D,0x5A7E,0x5A80,0x5A81,/* 0x50-0x57 */
        0x5A82,0x5A83,0x5A84,0x5A85,0x5A86,0x5A87,0x5A88,0x5A89,/* 0x58-0x5F */
        0x5A8A,0x5A8B,0x5A8C,0x5A8D,0x5A8E,0x5A8F,0x5A90,0x5A91,/* 0x60-0x67 */
        0x5A93,0x5A94,0x5A95,0x5A96,0x5A97,0x5A98,0x5A99,0x5A9C,/* 0x68-0x6F */
        0x5A9D,0x5A9E,0x5A9F,0x5AA0,0x5AA1,0x5AA2,0x5AA3,0x5AA4,/* 0x70-0x77 */
        0x5AA5,0x5AA6,0x5AA7,0x5AA8,0x5AA9,0x5AAB,0x5AAC,0x0000,/* 0x78-0x7F */

        0x5AAD,0x5AAE,0x5AAF,0x5AB0,0x5AB1,0x5AB4,0x5AB6,0x5AB7,/* 0x80-0x87 */
        0x5AB9,0x5ABA,0x5ABB,0x5ABC,0x5ABD,0x5ABF,0x5AC0,0x5AC3,/* 0x88-0x8F */
        0x5AC4,0x5AC5,0x5AC6,0x5AC7,0x5AC8,0x5ACA,0x5ACB,0x5ACD,/* 0x90-0x97 */
        0x5ACE,0x5ACF,0x5AD0,0x5AD1,0x5AD3,0x5AD5,0x5AD7,0x5AD9,/* 0x98-0x9F */
        0x5ADA,0x5ADB,0x5ADD,0x5ADE,0x5ADF,0x5AE2,0x5AE4,0x5AE5,/* 0xA0-0xA7 */
        0x5AE7,0x5AE8,0x5AEA,0x5AEC,0x5AED,0x5AEE,0x5AEF,0x5AF0,/* 0xA8-0xAF */
        0x5AF2,0x5AF3,0x5AF4,0x5AF5,0x5AF6,0x5AF7,0x5AF8,0x5AF9,/* 0xB0-0xB7 */
        0x5AFA,0x5AFB,0x5AFC,0x5AFD,0x5AFE,0x5AFF,0x5B00,0x5B01,/* 0xB8-0xBF */
        0x5B02,0x5B03,0x5B04,0x5B05,0x5B06,0x5B07,0x5B08,0x5B0A,/* 0xC0-0xC7 */
        0x5B0B,0x5B0C,0x5B0D,0x5B0E,0x5B0F,0x5B10,0x5B11,0x5B12,/* 0xC8-0xCF */
        0x5B13,0x5B14,0x5B15,0x5B18,0x5B19,0x5B1A,0x5B1B,0x5B1C,/* 0xD0-0xD7 */
        0x5B1D,0x5B1E,0x5B1F,0x5B20,0x5B21,0x5B22,0x5B23,0x5B24,/* 0xD8-0xDF */
        0x5B25,0x5B26,0x5B27,0x5B28,0x5B29,0x5B2A,0x5B2B,0x5B2C,/* 0xE0-0xE7 */
        0x5B2D,0x5B2E,0x5B2F,0x5B30,0x5B31,0x5B33,0x5B35,0x5B36,/* 0xE8-0xEF */
        0x5B38,0x5B39,0x5B3A,0x5B3B,0x5B3C,0x5B3D,0x5B3E,0x5B3F,/* 0xF0-0xF7 */
        0x5B41,0x5B42,0x5B43,0x5B44,0x5B45,0x5B46,0x5B47,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_8C[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x5B48,0x5B49,0x5B4A,0x5B4B,0x5B4C,0x5B4D,0x5B4E,0x5B4F,/* 0x40-0x47 */
        0x5B52,0x5B56,0x5B5E,0x5B60,0x5B61,0x5B67,0x5B68,0x5B6B,/* 0x48-0x4F */
        0x5B6D,0x5B6E,0x5B6F,0x5B72,0x5B74,0x5B76,0x5B77,0x5B78,/* 0x50-0x57 */
        0x5B79,0x5B7B,0x5B7C,0x5B7E,0x5B7F,0x5B82,0x5B86,0x5B8A,/* 0x58-0x5F */
        0x5B8D,0x5B8E,0x5B90,0x5B91,0x5B92,0x5B94,0x5B96,0x5B9F,/* 0x60-0x67 */
        0x5BA7,0x5BA8,0x5BA9,0x5BAC,0x5BAD,0x5BAE,0x5BAF,0x5BB1,/* 0x68-0x6F */
        0x5BB2,0x5BB7,0x5BBA,0x5BBB,0x5BBC,0x5BC0,0x5BC1,0x5BC3,/* 0x70-0x77 */
        0x5BC8,0x5BC9,0x5BCA,0x5BCB,0x5BCD,0x5BCE,0x5BCF,0x0000,/* 0x78-0x7F */

        0x5BD1,0x5BD4,0x5BD5,0x5BD6,0x5BD7,0x5BD8,0x5BD9,0x5BDA,/* 0x80-0x87 */
        0x5BDB,0x5BDC,0x5BE0,0x5BE2,0x5BE3,0x5BE6,0x5BE7,0x5BE9,/* 0x88-0x8F */
        0x5BEA,0x5BEB,0x5BEC,0x5BED,0x5BEF,0x5BF1,0x5BF2,0x5BF3,/* 0x90-0x97 */
        0x5BF4,0x5BF5,0x5BF6,0x5BF7,0x5BFD,0x5BFE,0x5C00,0x5C02,/* 0x98-0x9F */
        0x5C03,0x5C05,0x5C07,0x5C08,0x5C0B,0x5C0C,0x5C0D,0x5C0E,/* 0xA0-0xA7 */
        0x5C10,0x5C12,0x5C13,0x5C17,0x5C19,0x5C1B,0x5C1E,0x5C1F,/* 0xA8-0xAF */
        0x5C20,0x5C21,0x5C23,0x5C26,0x5C28,0x5C29,0x5C2A,0x5C2B,/* 0xB0-0xB7 */
        0x5C2D,0x5C2E,0x5C2F,0x5C30,0x5C32,0x5C33,0x5C35,0x5C36,/* 0xB8-0xBF */
        0x5C37,0x5C43,0x5C44,0x5C46,0x5C47,0x5C4C,0x5C4D,0x5C52,/* 0xC0-0xC7 */
        0x5C53,0x5C54,0x5C56,0x5C57,0x5C58,0x5C5A,0x5C5B,0x5C5C,/* 0xC8-0xCF */
        0x5C5D,0x5C5F,0x5C62,0x5C64,0x5C67,0x5C68,0x5C69,0x5C6A,/* 0xD0-0xD7 */
        0x5C6B,0x5C6C,0x5C6D,0x5C70,0x5C72,0x5C73,0x5C74,0x5C75,/* 0xD8-0xDF */
        0x5C76,0x5C77,0x5C78,0x5C7B,0x5C7C,0x5C7D,0x5C7E,0x5C80,/* 0xE0-0xE7 */
        0x5C83,0x5C84,0x5C85,0x5C86,0x5C87,0x5C89,0x5C8A,0x5C8B,/* 0xE8-0xEF */
        0x5C8E,0x5C8F,0x5C92,0x5C93,0x5C95,0x5C9D,0x5C9E,0x5C9F,/* 0xF0-0xF7 */
        0x5CA0,0x5CA1,0x5CA4,0x5CA5,0x5CA6,0x5CA7,0x5CA8,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_8D[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x5CAA,0x5CAE,0x5CAF,0x5CB0,0x5CB2,0x5CB4,0x5CB6,0x5CB9,/* 0x40-0x47 */
        0x5CBA,0x5CBB,0x5CBC,0x5CBE,0x5CC0,0x5CC2,0x5CC3,0x5CC5,/* 0x48-0x4F */
        0x5CC6,0x5CC7,0x5CC8,0x5CC9,0x5CCA,0x5CCC,0x5CCD,0x5CCE,/* 0x50-0x57 */
        0x5CCF,0x5CD0,0x5CD1,0x5CD3,0x5CD4,0x5CD5,0x5CD6,0x5CD7,/* 0x58-0x5F */
        0x5CD8,0x5CDA,0x5CDB,0x5CDC,0x5CDD,0x5CDE,0x5CDF,0x5CE0,/* 0x60-0x67 */
        0x5CE2,0x5CE3,0x5CE7,0x5CE9,0x5CEB,0x5CEC,0x5CEE,0x5CEF,/* 0x68-0x6F */
        0x5CF1,0x5CF2,0x5CF3,0x5CF4,0x5CF5,0x5CF6,0x5CF7,0x5CF8,/* 0x70-0x77 */
        0x5CF9,0x5CFA,0x5CFC,0x5CFD,0x5CFE,0x5CFF,0x5D00,0x0000,/* 0x78-0x7F */

        0x5D01,0x5D04,0x5D05,0x5D08,0x5D09,0x5D0A,0x5D0B,0x5D0C,/* 0x80-0x87 */
        0x5D0D,0x5D0F,0x5D10,0x5D11,0x5D12,0x5D13,0x5D15,0x5D17,/* 0x88-0x8F */
        0x5D18,0x5D19,0x5D1A,0x5D1C,0x5D1D,0x5D1F,0x5D20,0x5D21,/* 0x90-0x97 */
        0x5D22,0x5D23,0x5D25,0x5D28,0x5D2A,0x5D2B,0x5D2C,0x5D2F,/* 0x98-0x9F */
        0x5D30,0x5D31,0x5D32,0x5D33,0x5D35,0x5D36,0x5D37,0x5D38,/* 0xA0-0xA7 */
        0x5D39,0x5D3A,0x5D3B,0x5D3C,0x5D3F,0x5D40,0x5D41,0x5D42,/* 0xA8-0xAF */
        0x5D43,0x5D44,0x5D45,0x5D46,0x5D48,0x5D49,0x5D4D,0x5D4E,/* 0xB0-0xB7 */
        0x5D4F,0x5D50,0x5D51,0x5D52,0x5D53,0x5D54,0x5D55,0x5D56,/* 0xB8-0xBF */
        0x5D57,0x5D59,0x5D5A,0x5D5C,0x5D5E,0x5D5F,0x5D60,0x5D61,/* 0xC0-0xC7 */
        0x5D62,0x5D63,0x5D64,0x5D65,0x5D66,0x5D67,0x5D68,0x5D6A,/* 0xC8-0xCF */
        0x5D6D,0x5D6E,0x5D70,0x5D71,0x5D72,0x5D73,0x5D75,0x5D76,/* 0xD0-0xD7 */
        0x5D77,0x5D78,0x5D79,0x5D7A,0x5D7B,0x5D7C,0x5D7D,0x5D7E,/* 0xD8-0xDF */
        0x5D7F,0x5D80,0x5D81,0x5D83,0x5D84,0x5D85,0x5D86,0x5D87,/* 0xE0-0xE7 */
        0x5D88,0x5D89,0x5D8A,0x5D8B,0x5D8C,0x5D8D,0x5D8E,0x5D8F,/* 0xE8-0xEF */
        0x5D90,0x5D91,0x5D92,0x5D93,0x5D94,0x5D95,0x5D96,0x5D97,/* 0xF0-0xF7 */
        0x5D98,0x5D9A,0x5D9B,0x5D9C,0x5D9E,0x5D9F,0x5DA0,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_8E[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x5DA1,0x5DA2,0x5DA3,0x5DA4,0x5DA5,0x5DA6,0x5DA7,0x5DA8,/* 0x40-0x47 */
        0x5DA9,0x5DAA,0x5DAB,0x5DAC,0x5DAD,0x5DAE,0x5DAF,0x5DB0,/* 0x48-0x4F */
        0x5DB1,0x5DB2,0x5DB3,0x5DB4,0x5DB5,0x5DB6,0x5DB8,0x5DB9,/* 0x50-0x57 */
        0x5DBA,0x5DBB,0x5DBC,0x5DBD,0x5DBE,0x5DBF,0x5DC0,0x5DC1,/* 0x58-0x5F */
        0x5DC2,0x5DC3,0x5DC4,0x5DC6,0x5DC7,0x5DC8,0x5DC9,0x5DCA,/* 0x60-0x67 */
        0x5DCB,0x5DCC,0x5DCE,0x5DCF,0x5DD0,0x5DD1,0x5DD2,0x5DD3,/* 0x68-0x6F */
        0x5DD4,0x5DD5,0x5DD6,0x5DD7,0x5DD8,0x5DD9,0x5DDA,0x5DDC,/* 0x70-0x77 */
        0x5DDF,0x5DE0,0x5DE3,0x5DE4,0x5DEA,0x5DEC,0x5DED,0x0000,/* 0x78-0x7F */

        0x5DF0,0x5DF5,0x5DF6,0x5DF8,0x5DF9,0x5DFA,0x5DFB,0x5DFC,/* 0x80-0x87 */
        0x5DFF,0x5E00,0x5E04,0x5E07,0x5E09,0x5E0A,0x5E0B,0x5E0D,/* 0x88-0x8F */
        0x5E0E,0x5E12,0x5E13,0x5E17,0x5E1E,0x5E1F,0x5E20,0x5E21,/* 0x90-0x97 */
        0x5E22,0x5E23,0x5E24,0x5E25,0x5E28,0x5E29,0x5E2A,0x5E2B,/* 0x98-0x9F */
        0x5E2C,0x5E2F,0x5E30,0x5E32,0x5E33,0x5E34,0x5E35,0x5E36,/* 0xA0-0xA7 */
        0x5E39,0x5E3A,0x5E3E,0x5E3F,0x5E40,0x5E41,0x5E43,0x5E46,/* 0xA8-0xAF */
        0x5E47,0x5E48,0x5E49,0x5E4A,0x5E4B,0x5E4D,0x5E4E,0x5E4F,/* 0xB0-0xB7 */
        0x5E50,0x5E51,0x5E52,0x5E53,0x5E56,0x5E57,0x5E58,0x5E59,/* 0xB8-0xBF */
        0x5E5A,0x5E5C,0x5E5D,0x5E5F,0x5E60,0x5E63,0x5E64,0x5E65,/* 0xC0-0xC7 */
        0x5E66,0x5E67,0x5E68,0x5E69,0x5E6A,0x5E6B,0x5E6C,0x5E6D,/* 0xC8-0xCF */
        0x5E6E,0x5E6F,0x5E70,0x5E71,0x5E75,0x5E77,0x5E79,0x5E7E,/* 0xD0-0xD7 */
        0x5E81,0x5E82,0x5E83,0x5E85,0x5E88,0x5E89,0x5E8C,0x5E8D,/* 0xD8-0xDF */
        0x5E8E,0x5E92,0x5E98,0x5E9B,0x5E9D,0x5EA1,0x5EA2,0x5EA3,/* 0xE0-0xE7 */
        0x5EA4,0x5EA8,0x5EA9,0x5EAA,0x5EAB,0x5EAC,0x5EAE,0x5EAF,/* 0xE8-0xEF */
        0x5EB0,0x5EB1,0x5EB2,0x5EB4,0x5EBA,0x5EBB,0x5EBC,0x5EBD,/* 0xF0-0xF7 */
        0x5EBF,0x5EC0,0x5EC1,0x5EC2,0x5EC3,0x5EC4,0x5EC5,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_8F[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x5EC6,0x5EC7,0x5EC8,0x5ECB,0x5ECC,0x5ECD,0x5ECE,0x5ECF,/* 0x40-0x47 */
        0x5ED0,0x5ED4,0x5ED5,0x5ED7,0x5ED8,0x5ED9,0x5EDA,0x5EDC,/* 0x48-0x4F */
        0x5EDD,0x5EDE,0x5EDF,0x5EE0,0x5EE1,0x5EE2,0x5EE3,0x5EE4,/* 0x50-0x57 */
        0x5EE5,0x5EE6,0x5EE7,0x5EE9,0x5EEB,0x5EEC,0x5EED,0x5EEE,/* 0x58-0x5F */
        0x5EEF,0x5EF0,0x5EF1,0x5EF2,0x5EF3,0x5EF5,0x5EF8,0x5EF9,/* 0x60-0x67 */
        0x5EFB,0x5EFC,0x5EFD,0x5F05,0x5F06,0x5F07,0x5F09,0x5F0C,/* 0x68-0x6F */
        0x5F0D,0x5F0E,0x5F10,0x5F12,0x5F14,0x5F16,0x5F19,0x5F1A,/* 0x70-0x77 */
        0x5F1C,0x5F1D,0x5F1E,0x5F21,0x5F22,0x5F23,0x5F24,0x0000,/* 0x78-0x7F */

        0x5F28,0x5F2B,0x5F2C,0x5F2E,0x5F30,0x5F32,0x5F33,0x5F34,/* 0x80-0x87 */
        0x5F35,0x5F36,0x5F37,0x5F38,0x5F3B,0x5F3D,0x5F3E,0x5F3F,/* 0x88-0x8F */
        0x5F41,0x5F42,0x5F43,0x5F44,0x5F45,0x5F46,0x5F47,0x5F48,/* 0x90-0x97 */
        0x5F49,0x5F4A,0x5F4B,0x5F4C,0x5F4D,0x5F4E,0x5F4F,0x5F51,/* 0x98-0x9F */
        0x5F54,0x5F59,0x5F5A,0x5F5B,0x5F5C,0x5F5E,0x5F5F,0x5F60,/* 0xA0-0xA7 */
        0x5F63,0x5F65,0x5F67,0x5F68,0x5F6B,0x5F6E,0x5F6F,0x5F72,/* 0xA8-0xAF */
        0x5F74,0x5F75,0x5F76,0x5F78,0x5F7A,0x5F7D,0x5F7E,0x5F7F,/* 0xB0-0xB7 */
        0x5F83,0x5F86,0x5F8D,0x5F8E,0x5F8F,0x5F91,0x5F93,0x5F94,/* 0xB8-0xBF */
        0x5F96,0x5F9A,0x5F9B,0x5F9D,0x5F9E,0x5F9F,0x5FA0,0x5FA2,/* 0xC0-0xC7 */
        0x5FA3,0x5FA4,0x5FA5,0x5FA6,0x5FA7,0x5FA9,0x5FAB,0x5FAC,/* 0xC8-0xCF */
        0x5FAF,0x5FB0,0x5FB1,0x5FB2,0x5FB3,0x5FB4,0x5FB6,0x5FB8,/* 0xD0-0xD7 */
        0x5FB9,0x5FBA,0x5FBB,0x5FBE,0x5FBF,0x5FC0,0x5FC1,0x5FC2,/* 0xD8-0xDF */
        0x5FC7,0x5FC8,0x5FCA,0x5FCB,0x5FCE,0x5FD3,0x5FD4,0x5FD5,/* 0xE0-0xE7 */
        0x5FDA,0x5FDB,0x5FDC,0x5FDE,0x5FDF,0x5FE2,0x5FE3,0x5FE5,/* 0xE8-0xEF */
        0x5FE6,0x5FE8,0x5FE9,0x5FEC,0x5FEF,0x5FF0,0x5FF2,0x5FF3,/* 0xF0-0xF7 */
        0x5FF4,0x5FF6,0x5FF7,0x5FF9,0x5FFA,0x5FFC,0x6007,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_90[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x6008,0x6009,0x600B,0x600C,0x6010,0x6011,0x6013,0x6017,/* 0x40-0x47 */
        0x6018,0x601A,0x601E,0x601F,0x6022,0x6023,0x6024,0x602C,/* 0x48-0x4F */
        0x602D,0x602E,0x6030,0x6031,0x6032,0x6033,0x6034,0x6036,/* 0x50-0x57 */
        0x6037,0x6038,0x6039,0x603A,0x603D,0x603E,0x6040,0x6044,/* 0x58-0x5F */
        0x6045,0x6046,0x6047,0x6048,0x6049,0x604A,0x604C,0x604E,/* 0x60-0x67 */
        0x604F,0x6051,0x6053,0x6054,0x6056,0x6057,0x6058,0x605B,/* 0x68-0x6F */
        0x605C,0x605E,0x605F,0x6060,0x6061,0x6065,0x6066,0x606E,/* 0x70-0x77 */
        0x6071,0x6072,0x6074,0x6075,0x6077,0x607E,0x6080,0x0000,/* 0x78-0x7F */

        0x6081,0x6082,0x6085,0x6086,0x6087,0x6088,0x608A,0x608B,/* 0x80-0x87 */
        0x608E,0x608F,0x6090,0x6091,0x6093,0x6095,0x6097,0x6098,/* 0x88-0x8F */
        0x6099,0x609C,0x609E,0x60A1,0x60A2,0x60A4,0x60A5,0x60A7,/* 0x90-0x97 */
        0x60A9,0x60AA,0x60AE,0x60B0,0x60B3,0x60B5,0x60B6,0x60B7,/* 0x98-0x9F */
        0x60B9,0x60BA,0x60BD,0x60BE,0x60BF,0x60C0,0x60C1,0x60C2,/* 0xA0-0xA7 */
        0x60C3,0x60C4,0x60C7,0x60C8,0x60C9,0x60CC,0x60CD,0x60CE,/* 0xA8-0xAF */
        0x60CF,0x60D0,0x60D2,0x60D3,0x60D4,0x60D6,0x60D7,0x60D9,/* 0xB0-0xB7 */
        0x60DB,0x60DE,0x60E1,0x60E2,0x60E3,0x60E4,0x60E5,0x60EA,/* 0xB8-0xBF */
        0x60F1,0x60F2,0x60F5,0x60F7,0x60F8,0x60FB,0x60FC,0x60FD,/* 0xC0-0xC7 */
        0x60FE,0x60FF,0x6102,0x6103,0x6104,0x6105,0x6107,0x610A,/* 0xC8-0xCF */
        0x610B,0x610C,0x6110,0x6111,0x6112,0x6113,0x6114,0x6116,/* 0xD0-0xD7 */
        0x6117,0x6118,0x6119,0x611B,0x611C,0x611D,0x611E,0x6121,/* 0xD8-0xDF */
        0x6122,0x6125,0x6128,0x6129,0x612A,0x612C,0x612D,0x612E,/* 0xE0-0xE7 */
        0x612F,0x6130,0x6131,0x6132,0x6133,0x6134,0x6135,0x6136,/* 0xE8-0xEF */
        0x6137,0x6138,0x6139,0x613A,0x613B,0x613C,0x613D,0x613E,/* 0xF0-0xF7 */
        0x6140,0x6141,0x6142,0x6143,0x6144,0x6145,0x6146,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_91[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x6147,0x6149,0x614B,0x614D,0x614F,0x6150,0x6152,0x6153,/* 0x40-0x47 */
        0x6154,0x6156,0x6157,0x6158,0x6159,0x615A,0x615B,0x615C,/* 0x48-0x4F */
        0x615E,0x615F,0x6160,0x6161,0x6163,0x6164,0x6165,0x6166,/* 0x50-0x57 */
        0x6169,0x616A,0x616B,0x616C,0x616D,0x616E,0x616F,0x6171,/* 0x58-0x5F */
        0x6172,0x6173,0x6174,0x6176,0x6178,0x6179,0x617A,0x617B,/* 0x60-0x67 */
        0x617C,0x617D,0x617E,0x617F,0x6180,0x6181,0x6182,0x6183,/* 0x68-0x6F */
        0x6184,0x6185,0x6186,0x6187,0x6188,0x6189,0x618A,0x618C,/* 0x70-0x77 */
        0x618D,0x618F,0x6190,0x6191,0x6192,0x6193,0x6195,0x0000,/* 0x78-0x7F */

        0x6196,0x6197,0x6198,0x6199,0x619A,0x619B,0x619C,0x619E,/* 0x80-0x87 */
        0x619F,0x61A0,0x61A1,0x61A2,0x61A3,0x61A4,0x61A5,0x61A6,/* 0x88-0x8F */
        0x61AA,0x61AB,0x61AD,0x61AE,0x61AF,0x61B0,0x61B1,0x61B2,/* 0x90-0x97 */
        0x61B3,0x61B4,0x61B5,0x61B6,0x61B8,0x61B9,0x61BA,0x61BB,/* 0x98-0x9F */
        0x61BC,0x61BD,0x61BF,0x61C0,0x61C1,0x61C3,0x61C4,0x61C5,/* 0xA0-0xA7 */
        0x61C6,0x61C7,0x61C9,0x61CC,0x61CD,0x61CE,0x61CF,0x61D0,/* 0xA8-0xAF */
        0x61D3,0x61D5,0x61D6,0x61D7,0x61D8,0x61D9,0x61DA,0x61DB,/* 0xB0-0xB7 */
        0x61DC,0x61DD,0x61DE,0x61DF,0x61E0,0x61E1,0x61E2,0x61E3,/* 0xB8-0xBF */
        0x61E4,0x61E5,0x61E7,0x61E8,0x61E9,0x61EA,0x61EB,0x61EC,/* 0xC0-0xC7 */
        0x61ED,0x61EE,0x61EF,0x61F0,0x61F1,0x61F2,0x61F3,0x61F4,/* 0xC8-0xCF */
        0x61F6,0x61F7,0x61F8,0x61F9,0x61FA,0x61FB,0x61FC,0x61FD,/* 0xD0-0xD7 */
        0x61FE,0x6200,0x6201,0x6202,0x6203,0x6204,0x6205,0x6207,/* 0xD8-0xDF */
        0x6209,0x6213,0x6214,0x6219,0x621C,0x621D,0x621E,0x6220,/* 0xE0-0xE7 */
        0x6223,0x6226,0x6227,0x6228,0x6229,0x622B,0x622D,0x622F,/* 0xE8-0xEF */
        0x6230,0x6231,0x6232,0x6235,0x6236,0x6238,0x6239,0x623A,/* 0xF0-0xF7 */
        0x623B,0x623C,0x6242,0x6244,0x6245,0x6246,0x624A,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_92[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x624F,0x6250,0x6255,0x6256,0x6257,0x6259,0x625A,0x625C,/* 0x40-0x47 */
        0x625D,0x625E,0x625F,0x6260,0x6261,0x6262,0x6264,0x6265,/* 0x48-0x4F */
        0x6268,0x6271,0x6272,0x6274,0x6275,0x6277,0x6278,0x627A,/* 0x50-0x57 */
        0x627B,0x627D,0x6281,0x6282,0x6283,0x6285,0x6286,0x6287,/* 0x58-0x5F */
        0x6288,0x628B,0x628C,0x628D,0x628E,0x628F,0x6290,0x6294,/* 0x60-0x67 */
        0x6299,0x629C,0x629D,0x629E,0x62A3,0x62A6,0x62A7,0x62A9,/* 0x68-0x6F */
        0x62AA,0x62AD,0x62AE,0x62AF,0x62B0,0x62B2,0x62B3,0x62B4,/* 0x70-0x77 */
        0x62B6,0x62B7,0x62B8,0x62BA,0x62BE,0x62C0,0x62C1,0x0000,/* 0x78-0x7F */

        0x62C3,0x62CB,0x62CF,0x62D1,0x62D5,0x62DD,0x62DE,0x62E0,/* 0x80-0x87 */
        0x62E1,0x62E4,0x62EA,0x62EB,0x62F0,0x62F2,0x62F5,0x62F8,/* 0x88-0x8F */
        0x62F9,0x62FA,0x62FB,0x6300,0x6303,0x6304,0x6305,0x6306,/* 0x90-0x97 */
        0x630A,0x630B,0x630C,0x630D,0x630F,0x6310,0x6312,0x6313,/* 0x98-0x9F */
        0x6314,0x6315,0x6317,0x6318,0x6319,0x631C,0x6326,0x6327,/* 0xA0-0xA7 */
        0x6329,0x632C,0x632D,0x632E,0x6330,0x6331,0x6333,0x6334,/* 0xA8-0xAF */
        0x6335,0x6336,0x6337,0x6338,0x633B,0x633C,0x633E,0x633F,/* 0xB0-0xB7 */
        0x6340,0x6341,0x6344,0x6347,0x6348,0x634A,0x6351,0x6352,/* 0xB8-0xBF */
        0x6353,0x6354,0x6356,0x6357,0x6358,0x6359,0x635A,0x635B,/* 0xC0-0xC7 */
        0x635C,0x635D,0x6360,0x6364,0x6365,0x6366,0x6368,0x636A,/* 0xC8-0xCF */
        0x636B,0x636C,0x636F,0x6370,0x6372,0x6373,0x6374,0x6375,/* 0xD0-0xD7 */
        0x6378,0x6379,0x637C,0x637D,0x637E,0x637F,0x6381,0x6383,/* 0xD8-0xDF */
        0x6384,0x6385,0x6386,0x638B,0x638D,0x6391,0x6393,0x6394,/* 0xE0-0xE7 */
        0x6395,0x6397,0x6399,0x639A,0x639B,0x639C,0x639D,0x639E,/* 0xE8-0xEF */
        0x639F,0x63A1,0x63A4,0x63A6,0x63AB,0x63AF,0x63B1,0x63B2,/* 0xF0-0xF7 */
        0x63B5,0x63B6,0x63B9,0x63BB,0x63BD,0x63BF,0x63C0,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_93[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x63C1,0x63C2,0x63C3,0x63C5,0x63C7,0x63C8,0x63CA,0x63CB,/* 0x40-0x47 */
        0x63CC,0x63D1,0x63D3,0x63D4,0x63D5,0x63D7,0x63D8,0x63D9,/* 0x48-0x4F */
        0x63DA,0x63DB,0x63DC,0x63DD,0x63DF,0x63E2,0x63E4,0x63E5,/* 0x50-0x57 */
        0x63E6,0x63E7,0x63E8,0x63EB,0x63EC,0x63EE,0x63EF,0x63F0,/* 0x58-0x5F */
        0x63F1,0x63F3,0x63F5,0x63F7,0x63F9,0x63FA,0x63FB,0x63FC,/* 0x60-0x67 */
        0x63FE,0x6403,0x6404,0x6406,0x6407,0x6408,0x6409,0x640A,/* 0x68-0x6F */
        0x640D,0x640E,0x6411,0x6412,0x6415,0x6416,0x6417,0x6418,/* 0x70-0x77 */
        0x6419,0x641A,0x641D,0x641F,0x6422,0x6423,0x6424,0x0000,/* 0x78-0x7F */

        0x6425,0x6427,0x6428,0x6429,0x642B,0x642E,0x642F,0x6430,/* 0x80-0x87 */
        0x6431,0x6432,0x6433,0x6435,0x6436,0x6437,0x6438,0x6439,/* 0x88-0x8F */
        0x643B,0x643C,0x643E,0x6440,0x6442,0x6443,0x6449,0x644B,/* 0x90-0x97 */
        0x644C,0x644D,0x644E,0x644F,0x6450,0x6451,0x6453,0x6455,/* 0x98-0x9F */
        0x6456,0x6457,0x6459,0x645A,0x645B,0x645C,0x645D,0x645F,/* 0xA0-0xA7 */
        0x6460,0x6461,0x6462,0x6463,0x6464,0x6465,0x6466,0x6468,/* 0xA8-0xAF */
        0x646A,0x646B,0x646C,0x646E,0x646F,0x6470,0x6471,0x6472,/* 0xB0-0xB7 */
        0x6473,0x6474,0x6475,0x6476,0x6477,0x647B,0x647C,0x647D,/* 0xB8-0xBF */
        0x647E,0x647F,0x6480,0x6481,0x6483,0x6486,0x6488,0x6489,/* 0xC0-0xC7 */
        0x648A,0x648B,0x648C,0x648D,0x648E,0x648F,0x6490,0x6493,/* 0xC8-0xCF */
        0x6494,0x6497,0x6498,0x649A,0x649B,0x649C,0x649D,0x649F,/* 0xD0-0xD7 */
        0x64A0,0x64A1,0x64A2,0x64A3,0x64A5,0x64A6,0x64A7,0x64A8,/* 0xD8-0xDF */
        0x64AA,0x64AB,0x64AF,0x64B1,0x64B2,0x64B3,0x64B4,0x64B6,/* 0xE0-0xE7 */
        0x64B9,0x64BB,0x64BD,0x64BE,0x64BF,0x64C1,0x64C3,0x64C4,/* 0xE8-0xEF */
        0x64C6,0x64C7,0x64C8,0x64C9,0x64CA,0x64CB,0x64CC,0x64CF,/* 0xF0-0xF7 */
        0x64D1,0x64D3,0x64D4,0x64D5,0x64D6,0x64D9,0x64DA,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_94[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x64DB,0x64DC,0x64DD,0x64DF,0x64E0,0x64E1,0x64E3,0x64E5,/* 0x40-0x47 */
        0x64E7,0x64E8,0x64E9,0x64EA,0x64EB,0x64EC,0x64ED,0x64EE,/* 0x48-0x4F */
        0x64EF,0x64F0,0x64F1,0x64F2,0x64F3,0x64F4,0x64F5,0x64F6,/* 0x50-0x57 */
        0x64F7,0x64F8,0x64F9,0x64FA,0x64FB,0x64FC,0x64FD,0x64FE,/* 0x58-0x5F */
        0x64FF,0x6501,0x6502,0x6503,0x6504,0x6505,0x6506,0x6507,/* 0x60-0x67 */
        0x6508,0x650A,0x650B,0x650C,0x650D,0x650E,0x650F,0x6510,/* 0x68-0x6F */
        0x6511,0x6513,0x6514,0x6515,0x6516,0x6517,0x6519,0x651A,/* 0x70-0x77 */
        0x651B,0x651C,0x651D,0x651E,0x651F,0x6520,0x6521,0x0000,/* 0x78-0x7F */

        0x6522,0x6523,0x6524,0x6526,0x6527,0x6528,0x6529,0x652A,/* 0x80-0x87 */
        0x652C,0x652D,0x6530,0x6531,0x6532,0x6533,0x6537,0x653A,/* 0x88-0x8F */
        0x653C,0x653D,0x6540,0x6541,0x6542,0x6543,0x6544,0x6546,/* 0x90-0x97 */
        0x6547,0x654A,0x654B,0x654D,0x654E,0x6550,0x6552,0x6553,/* 0x98-0x9F */
        0x6554,0x6557,0x6558,0x655A,0x655C,0x655F,0x6560,0x6561,/* 0xA0-0xA7 */
        0x6564,0x6565,0x6567,0x6568,0x6569,0x656A,0x656D,0x656E,/* 0xA8-0xAF */
        0x656F,0x6571,0x6573,0x6575,0x6576,0x6578,0x6579,0x657A,/* 0xB0-0xB7 */
        0x657B,0x657C,0x657D,0x657E,0x657F,0x6580,0x6581,0x6582,/* 0xB8-0xBF */
        0x6583,0x6584,0x6585,0x6586,0x6588,0x6589,0x658A,0x658D,/* 0xC0-0xC7 */
        0x658E,0x658F,0x6592,0x6594,0x6595,0x6596,0x6598,0x659A,/* 0xC8-0xCF */
        0x659D,0x659E,0x65A0,0x65A2,0x65A3,0x65A6,0x65A8,0x65AA,/* 0xD0-0xD7 */
        0x65AC,0x65AE,0x65B1,0x65B2,0x65B3,0x65B4,0x65B5,0x65B6,/* 0xD8-0xDF */
        0x65B7,0x65B8,0x65BA,0x65BB,0x65BE,0x65BF,0x65C0,0x65C2,/* 0xE0-0xE7 */
        0x65C7,0x65C8,0x65C9,0x65CA,0x65CD,0x65D0,0x65D1,0x65D3,/* 0xE8-0xEF */
        0x65D4,0x65D5,0x65D8,0x65D9,0x65DA,0x65DB,0x65DC,0x65DD,/* 0xF0-0xF7 */
        0x65DE,0x65DF,0x65E1,0x65E3,0x65E4,0x65EA,0x65EB,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_95[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x65F2,0x65F3,0x65F4,0x65F5,0x65F8,0x65F9,0x65FB,0x65FC,/* 0x40-0x47 */
        0x65FD,0x65FE,0x65FF,0x6601,0x6604,0x6605,0x6607,0x6608,/* 0x48-0x4F */
        0x6609,0x660B,0x660D,0x6610,0x6611,0x6612,0x6616,0x6617,/* 0x50-0x57 */
        0x6618,0x661A,0x661B,0x661C,0x661E,0x6621,0x6622,0x6623,/* 0x58-0x5F */
        0x6624,0x6626,0x6629,0x662A,0x662B,0x662C,0x662E,0x6630,/* 0x60-0x67 */
        0x6632,0x6633,0x6637,0x6638,0x6639,0x663A,0x663B,0x663D,/* 0x68-0x6F */
        0x663F,0x6640,0x6642,0x6644,0x6645,0x6646,0x6647,0x6648,/* 0x70-0x77 */
        0x6649,0x664A,0x664D,0x664E,0x6650,0x6651,0x6658,0x0000,/* 0x78-0x7F */

        0x6659,0x665B,0x665C,0x665D,0x665E,0x6660,0x6662,0x6663,/* 0x80-0x87 */
        0x6665,0x6667,0x6669,0x666A,0x666B,0x666C,0x666D,0x6671,/* 0x88-0x8F */
        0x6672,0x6673,0x6675,0x6678,0x6679,0x667B,0x667C,0x667D,/* 0x90-0x97 */
        0x667F,0x6680,0x6681,0x6683,0x6685,0x6686,0x6688,0x6689,/* 0x98-0x9F */
        0x668A,0x668B,0x668D,0x668E,0x668F,0x6690,0x6692,0x6693,/* 0xA0-0xA7 */
        0x6694,0x6695,0x6698,0x6699,0x669A,0x669B,0x669C,0x669E,/* 0xA8-0xAF */
        0x669F,0x66A0,0x66A1,0x66A2,0x66A3,0x66A4,0x66A5,0x66A6,/* 0xB0-0xB7 */
        0x66A9,0x66AA,0x66AB,0x66AC,0x66AD,0x66AF,0x66B0,0x66B1,/* 0xB8-0xBF */
        0x66B2,0x66B3,0x66B5,0x66B6,0x66B7,0x66B8,0x66BA,0x66BB,/* 0xC0-0xC7 */
        0x66BC,0x66BD,0x66BF,0x66C0,0x66C1,0x66C2,0x66C3,0x66C4,/* 0xC8-0xCF */
        0x66C5,0x66C6,0x66C7,0x66C8,0x66C9,0x66CA,0x66CB,0x66CC,/* 0xD0-0xD7 */
        0x66CD,0x66CE,0x66CF,0x66D0,0x66D1,0x66D2,0x66D3,0x66D4,/* 0xD8-0xDF */
        0x66D5,0x66D6,0x66D7,0x66D8,0x66DA,0x66DE,0x66DF,0x66E0,/* 0xE0-0xE7 */
        0x66E1,0x66E2,0x66E3,0x66E4,0x66E5,0x66E7,0x66E8,0x66EA,/* 0xE8-0xEF */
        0x66EB,0x66EC,0x66ED,0x66EE,0x66EF,0x66F1,0x66F5,0x66F6,/* 0xF0-0xF7 */
        0x66F8,0x66FA,0x66FB,0x66FD,0x6701,0x6702,0x6703,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_96[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x6704,0x6705,0x6706,0x6707,0x670C,0x670E,0x670F,0x6711,/* 0x40-0x47 */
        0x6712,0x6713,0x6716,0x6718,0x6719,0x671A,0x671C,0x671E,/* 0x48-0x4F */
        0x6720,0x6721,0x6722,0x6723,0x6724,0x6725,0x6727,0x6729,/* 0x50-0x57 */
        0x672E,0x6730,0x6732,0x6733,0x6736,0x6737,0x6738,0x6739,/* 0x58-0x5F */
        0x673B,0x673C,0x673E,0x673F,0x6741,0x6744,0x6745,0x6747,/* 0x60-0x67 */
        0x674A,0x674B,0x674D,0x6752,0x6754,0x6755,0x6757,0x6758,/* 0x68-0x6F */
        0x6759,0x675A,0x675B,0x675D,0x6762,0x6763,0x6764,0x6766,/* 0x70-0x77 */
        0x6767,0x676B,0x676C,0x676E,0x6771,0x6774,0x6776,0x0000,/* 0x78-0x7F */

        0x6778,0x6779,0x677A,0x677B,0x677D,0x6780,0x6782,0x6783,/* 0x80-0x87 */
        0x6785,0x6786,0x6788,0x678A,0x678C,0x678D,0x678E,0x678F,/* 0x88-0x8F */
        0x6791,0x6792,0x6793,0x6794,0x6796,0x6799,0x679B,0x679F,/* 0x90-0x97 */
        0x67A0,0x67A1,0x67A4,0x67A6,0x67A9,0x67AC,0x67AE,0x67B1,/* 0x98-0x9F */
        0x67B2,0x67B4,0x67B9,0x67BA,0x67BB,0x67BC,0x67BD,0x67BE,/* 0xA0-0xA7 */
        0x67BF,0x67C0,0x67C2,0x67C5,0x67C6,0x67C7,0x67C8,0x67C9,/* 0xA8-0xAF */
        0x67CA,0x67CB,0x67CC,0x67CD,0x67CE,0x67D5,0x67D6,0x67D7,/* 0xB0-0xB7 */
        0x67DB,0x67DF,0x67E1,0x67E3,0x67E4,0x67E6,0x67E7,0x67E8,/* 0xB8-0xBF */
        0x67EA,0x67EB,0x67ED,0x67EE,0x67F2,0x67F5,0x67F6,0x67F7,/* 0xC0-0xC7 */
        0x67F8,0x67F9,0x67FA,0x67FB,0x67FC,0x67FE,0x6801,0x6802,/* 0xC8-0xCF */
        0x6803,0x6804,0x6806,0x680D,0x6810,0x6812,0x6814,0x6815,/* 0xD0-0xD7 */
        0x6818,0x6819,0x681A,0x681B,0x681C,0x681E,0x681F,0x6820,/* 0xD8-0xDF */
        0x6822,0x6823,0x6824,0x6825,0x6826,0x6827,0x6828,0x682B,/* 0xE0-0xE7 */
        0x682C,0x682D,0x682E,0x682F,0x6830,0x6831,0x6834,0x6835,/* 0xE8-0xEF */
        0x6836,0x683A,0x683B,0x683F,0x6847,0x684B,0x684D,0x684F,/* 0xF0-0xF7 */
        0x6852,0x6856,0x6857,0x6858,0x6859,0x685A,0x685B,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_97[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x685C,0x685D,0x685E,0x685F,0x686A,0x686C,0x686D,0x686E,/* 0x40-0x47 */
        0x686F,0x6870,0x6871,0x6872,0x6873,0x6875,0x6878,0x6879,/* 0x48-0x4F */
        0x687A,0x687B,0x687C,0x687D,0x687E,0x687F,0x6880,0x6882,/* 0x50-0x57 */
        0x6884,0x6887,0x6888,0x6889,0x688A,0x688B,0x688C,0x688D,/* 0x58-0x5F */
        0x688E,0x6890,0x6891,0x6892,0x6894,0x6895,0x6896,0x6898,/* 0x60-0x67 */
        0x6899,0x689A,0x689B,0x689C,0x689D,0x689E,0x689F,0x68A0,/* 0x68-0x6F */
        0x68A1,0x68A3,0x68A4,0x68A5,0x68A9,0x68AA,0x68AB,0x68AC,/* 0x70-0x77 */
        0x68AE,0x68B1,0x68B2,0x68B4,0x68B6,0x68B7,0x68B8,0x0000,/* 0x78-0x7F */

        0x68B9,0x68BA,0x68BB,0x68BC,0x68BD,0x68BE,0x68BF,0x68C1,/* 0x80-0x87 */
        0x68C3,0x68C4,0x68C5,0x68C6,0x68C7,0x68C8,0x68CA,0x68CC,/* 0x88-0x8F */
        0x68CE,0x68CF,0x68D0,0x68D1,0x68D3,0x68D4,0x68D6,0x68D7,/* 0x90-0x97 */
        0x68D9,0x68DB,0x68DC,0x68DD,0x68DE,0x68DF,0x68E1,0x68E2,/* 0x98-0x9F */
        0x68E4,0x68E5,0x68E6,0x68E7,0x68E8,0x68E9,0x68EA,0x68EB,/* 0xA0-0xA7 */
        0x68EC,0x68ED,0x68EF,0x68F2,0x68F3,0x68F4,0x68F6,0x68F7,/* 0xA8-0xAF */
        0x68F8,0x68FB,0x68FD,0x68FE,0x68FF,0x6900,0x6902,0x6903,/* 0xB0-0xB7 */
        0x6904,0x6906,0x6907,0x6908,0x6909,0x690A,0x690C,0x690F,/* 0xB8-0xBF */
        0x6911,0x6913,0x6914,0x6915,0x6916,0x6917,0x6918,0x6919,/* 0xC0-0xC7 */
        0x691A,0x691B,0x691C,0x691D,0x691E,0x6921,0x6922,0x6923,/* 0xC8-0xCF */
        0x6925,0x6926,0x6927,0x6928,0x6929,0x692A,0x692B,0x692C,/* 0xD0-0xD7 */
        0x692E,0x692F,0x6931,0x6932,0x6933,0x6935,0x6936,0x6937,/* 0xD8-0xDF */
        0x6938,0x693A,0x693B,0x693C,0x693E,0x6940,0x6941,0x6943,/* 0xE0-0xE7 */
        0x6944,0x6945,0x6946,0x6947,0x6948,0x6949,0x694A,0x694B,/* 0xE8-0xEF */
        0x694C,0x694D,0x694E,0x694F,0x6950,0x6951,0x6952,0x6953,/* 0xF0-0xF7 */
        0x6955,0x6956,0x6958,0x6959,0x695B,0x695C,0x695F,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_98[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x6961,0x6962,0x6964,0x6965,0x6967,0x6968,0x6969,0x696A,/* 0x40-0x47 */
        0x696C,0x696D,0x696F,0x6970,0x6972,0x6973,0x6974,0x6975,/* 0x48-0x4F */
        0x6976,0x697A,0x697B,0x697D,0x697E,0x697F,0x6981,0x6983,/* 0x50-0x57 */
        0x6985,0x698A,0x698B,0x698C,0x698E,0x698F,0x6990,0x6991,/* 0x58-0x5F */
        0x6992,0x6993,0x6996,0x6997,0x6999,0x699A,0x699D,0x699E,/* 0x60-0x67 */
        0x699F,0x69A0,0x69A1,0x69A2,0x69A3,0x69A4,0x69A5,0x69A6,/* 0x68-0x6F */
        0x69A9,0x69AA,0x69AC,0x69AE,0x69AF,0x69B0,0x69B2,0x69B3,/* 0x70-0x77 */
        0x69B5,0x69B6,0x69B8,0x69B9,0x69BA,0x69BC,0x69BD,0x0000,/* 0x78-0x7F */

        0x69BE,0x69BF,0x69C0,0x69C2,0x69C3,0x69C4,0x69C5,0x69C6,/* 0x80-0x87 */
        0x69C7,0x69C8,0x69C9,0x69CB,0x69CD,0x69CF,0x69D1,0x69D2,/* 0x88-0x8F */
        0x69D3,0x69D5,0x69D6,0x69D7,0x69D8,0x69D9,0x69DA,0x69DC,/* 0x90-0x97 */
        0x69DD,0x69DE,0x69E1,0x69E2,0x69E3,0x69E4,0x69E5,0x69E6,/* 0x98-0x9F */
        0x69E7,0x69E8,0x69E9,0x69EA,0x69EB,0x69EC,0x69EE,0x69EF,/* 0xA0-0xA7 */
        0x69F0,0x69F1,0x69F3,0x69F4,0x69F5,0x69F6,0x69F7,0x69F8,/* 0xA8-0xAF */
        0x69F9,0x69FA,0x69FB,0x69FC,0x69FE,0x6A00,0x6A01,0x6A02,/* 0xB0-0xB7 */
        0x6A03,0x6A04,0x6A05,0x6A06,0x6A07,0x6A08,0x6A09,0x6A0B,/* 0xB8-0xBF */
        0x6A0C,0x6A0D,0x6A0E,0x6A0F,0x6A10,0x6A11,0x6A12,0x6A13,/* 0xC0-0xC7 */
        0x6A14,0x6A15,0x6A16,0x6A19,0x6A1A,0x6A1B,0x6A1C,0x6A1D,/* 0xC8-0xCF */
        0x6A1E,0x6A20,0x6A22,0x6A23,0x6A24,0x6A25,0x6A26,0x6A27,/* 0xD0-0xD7 */
        0x6A29,0x6A2B,0x6A2C,0x6A2D,0x6A2E,0x6A30,0x6A32,0x6A33,/* 0xD8-0xDF */
        0x6A34,0x6A36,0x6A37,0x6A38,0x6A39,0x6A3A,0x6A3B,0x6A3C,/* 0xE0-0xE7 */
        0x6A3F,0x6A40,0x6A41,0x6A42,0x6A43,0x6A45,0x6A46,0x6A48,/* 0xE8-0xEF */
        0x6A49,0x6A4A,0x6A4B,0x6A4C,0x6A4D,0x6A4E,0x6A4F,0x6A51,/* 0xF0-0xF7 */
        0x6A52,0x6A53,0x6A54,0x6A55,0x6A56,0x6A57,0x6A5A,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_99[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x6A5C,0x6A5D,0x6A5E,0x6A5F,0x6A60,0x6A62,0x6A63,0x6A64,/* 0x40-0x47 */
        0x6A66,0x6A67,0x6A68,0x6A69,0x6A6A,0x6A6B,0x6A6C,0x6A6D,/* 0x48-0x4F */
        0x6A6E,0x6A6F,0x6A70,0x6A72,0x6A73,0x6A74,0x6A75,0x6A76,/* 0x50-0x57 */
        0x6A77,0x6A78,0x6A7A,0x6A7B,0x6A7D,0x6A7E,0x6A7F,0x6A81,/* 0x58-0x5F */
        0x6A82,0x6A83,0x6A85,0x6A86,0x6A87,0x6A88,0x6A89,0x6A8A,/* 0x60-0x67 */
        0x6A8B,0x6A8C,0x6A8D,0x6A8F,0x6A92,0x6A93,0x6A94,0x6A95,/* 0x68-0x6F */
        0x6A96,0x6A98,0x6A99,0x6A9A,0x6A9B,0x6A9C,0x6A9D,0x6A9E,/* 0x70-0x77 */
        0x6A9F,0x6AA1,0x6AA2,0x6AA3,0x6AA4,0x6AA5,0x6AA6,0x0000,/* 0x78-0x7F */

        0x6AA7,0x6AA8,0x6AAA,0x6AAD,0x6AAE,0x6AAF,0x6AB0,0x6AB1,/* 0x80-0x87 */
        0x6AB2,0x6AB3,0x6AB4,0x6AB5,0x6AB6,0x6AB7,0x6AB8,0x6AB9,/* 0x88-0x8F */
        0x6ABA,0x6ABB,0x6ABC,0x6ABD,0x6ABE,0x6ABF,0x6AC0,0x6AC1,/* 0x90-0x97 */
        0x6AC2,0x6AC3,0x6AC4,0x6AC5,0x6AC6,0x6AC7,0x6AC8,0x6AC9,/* 0x98-0x9F */
        0x6ACA,0x6ACB,0x6ACC,0x6ACD,0x6ACE,0x6ACF,0x6AD0,0x6AD1,/* 0xA0-0xA7 */
        0x6AD2,0x6AD3,0x6AD4,0x6AD5,0x6AD6,0x6AD7,0x6AD8,0x6AD9,/* 0xA8-0xAF */
        0x6ADA,0x6ADB,0x6ADC,0x6ADD,0x6ADE,0x6ADF,0x6AE0,0x6AE1,/* 0xB0-0xB7 */
        0x6AE2,0x6AE3,0x6AE4,0x6AE5,0x6AE6,0x6AE7,0x6AE8,0x6AE9,/* 0xB8-0xBF */
        0x6AEA,0x6AEB,0x6AEC,0x6AED,0x6AEE,0x6AEF,0x6AF0,0x6AF1,/* 0xC0-0xC7 */
        0x6AF2,0x6AF3,0x6AF4,0x6AF5,0x6AF6,0x6AF7,0x6AF8,0x6AF9,/* 0xC8-0xCF */
        0x6AFA,0x6AFB,0x6AFC,0x6AFD,0x6AFE,0x6AFF,0x6B00,0x6B01,/* 0xD0-0xD7 */
        0x6B02,0x6B03,0x6B04,0x6B05,0x6B06,0x6B07,0x6B08,0x6B09,/* 0xD8-0xDF */
        0x6B0A,0x6B0B,0x6B0C,0x6B0D,0x6B0E,0x6B0F,0x6B10,0x6B11,/* 0xE0-0xE7 */
        0x6B12,0x6B13,0x6B14,0x6B15,0x6B16,0x6B17,0x6B18,0x6B19,/* 0xE8-0xEF */
        0x6B1A,0x6B1B,0x6B1C,0x6B1D,0x6B1E,0x6B1F,0x6B25,0x6B26,/* 0xF0-0xF7 */
        0x6B28,0x6B29,0x6B2A,0x6B2B,0x6B2C,0x6B2D,0x6B2E,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_9A[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x6B2F,0x6B30,0x6B31,0x6B33,0x6B34,0x6B35,0x6B36,0x6B38,/* 0x40-0x47 */
        0x6B3B,0x6B3C,0x6B3D,0x6B3F,0x6B40,0x6B41,0x6B42,0x6B44,/* 0x48-0x4F */
        0x6B45,0x6B48,0x6B4A,0x6B4B,0x6B4D,0x6B4E,0x6B4F,0x6B50,/* 0x50-0x57 */
        0x6B51,0x6B52,0x6B53,0x6B54,0x6B55,0x6B56,0x6B57,0x6B58,/* 0x58-0x5F */
        0x6B5A,0x6B5B,0x6B5C,0x6B5D,0x6B5E,0x6B5F,0x6B60,0x6B61,/* 0x60-0x67 */
        0x6B68,0x6B69,0x6B6B,0x6B6C,0x6B6D,0x6B6E,0x6B6F,0x6B70,/* 0x68-0x6F */
        0x6B71,0x6B72,0x6B73,0x6B74,0x6B75,0x6B76,0x6B77,0x6B78,/* 0x70-0x77 */
        0x6B7A,0x6B7D,0x6B7E,0x6B7F,0x6B80,0x6B85,0x6B88,0x0000,/* 0x78-0x7F */

        0x6B8C,0x6B8E,0x6B8F,0x6B90,0x6B91,0x6B94,0x6B95,0x6B97,/* 0x80-0x87 */
        0x6B98,0x6B99,0x6B9C,0x6B9D,0x6B9E,0x6B9F,0x6BA0,0x6BA2,/* 0x88-0x8F */
        0x6BA3,0x6BA4,0x6BA5,0x6BA6,0x6BA7,0x6BA8,0x6BA9,0x6BAB,/* 0x90-0x97 */
        0x6BAC,0x6BAD,0x6BAE,0x6BAF,0x6BB0,0x6BB1,0x6BB2,0x6BB6,/* 0x98-0x9F */
        0x6BB8,0x6BB9,0x6BBA,0x6BBB,0x6BBC,0x6BBD,0x6BBE,0x6BC0,/* 0xA0-0xA7 */
        0x6BC3,0x6BC4,0x6BC6,0x6BC7,0x6BC8,0x6BC9,0x6BCA,0x6BCC,/* 0xA8-0xAF */
        0x6BCE,0x6BD0,0x6BD1,0x6BD8,0x6BDA,0x6BDC,0x6BDD,0x6BDE,/* 0xB0-0xB7 */
        0x6BDF,0x6BE0,0x6BE2,0x6BE3,0x6BE4,0x6BE5,0x6BE6,0x6BE7,/* 0xB8-0xBF */
        0x6BE8,0x6BE9,0x6BEC,0x6BED,0x6BEE,0x6BF0,0x6BF1,0x6BF2,/* 0xC0-0xC7 */
        0x6BF4,0x6BF6,0x6BF7,0x6BF8,0x6BFA,0x6BFB,0x6BFC,0x6BFE,/* 0xC8-0xCF */
        0x6BFF,0x6C00,0x6C01,0x6C02,0x6C03,0x6C04,0x6C08,0x6C09,/* 0xD0-0xD7 */
        0x6C0A,0x6C0B,0x6C0C,0x6C0E,0x6C12,0x6C17,0x6C1C,0x6C1D,/* 0xD8-0xDF */
        0x6C1E,0x6C20,0x6C23,0x6C25,0x6C2B,0x6C2C,0x6C2D,0x6C31,/* 0xE0-0xE7 */
        0x6C33,0x6C36,0x6C37,0x6C39,0x6C3A,0x6C3B,0x6C3C,0x6C3E,/* 0xE8-0xEF */
        0x6C3F,0x6C43,0x6C44,0x6C45,0x6C48,0x6C4B,0x6C4C,0x6C4D,/* 0xF0-0xF7 */
        0x6C4E,0x6C4F,0x6C51,0x6C52,0x6C53,0x6C56,0x6C58,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_9B[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x6C59,0x6C5A,0x6C62,0x6C63,0x6C65,0x6C66,0x6C67,0x6C6B,/* 0x40-0x47 */
        0x6C6C,0x6C6D,0x6C6E,0x6C6F,0x6C71,0x6C73,0x6C75,0x6C77,/* 0x48-0x4F */
        0x6C78,0x6C7A,0x6C7B,0x6C7C,0x6C7F,0x6C80,0x6C84,0x6C87,/* 0x50-0x57 */
        0x6C8A,0x6C8B,0x6C8D,0x6C8E,0x6C91,0x6C92,0x6C95,0x6C96,/* 0x58-0x5F */
        0x6C97,0x6C98,0x6C9A,0x6C9C,0x6C9D,0x6C9E,0x6CA0,0x6CA2,/* 0x60-0x67 */
        0x6CA8,0x6CAC,0x6CAF,0x6CB0,0x6CB4,0x6CB5,0x6CB6,0x6CB7,/* 0x68-0x6F */
        0x6CBA,0x6CC0,0x6CC1,0x6CC2,0x6CC3,0x6CC6,0x6CC7,0x6CC8,/* 0x70-0x77 */
        0x6CCB,0x6CCD,0x6CCE,0x6CCF,0x6CD1,0x6CD2,0x6CD8,0x0000,/* 0x78-0x7F */

        0x6CD9,0x6CDA,0x6CDC,0x6CDD,0x6CDF,0x6CE4,0x6CE6,0x6CE7,/* 0x80-0x87 */
        0x6CE9,0x6CEC,0x6CED,0x6CF2,0x6CF4,0x6CF9,0x6CFF,0x6D00,/* 0x88-0x8F */
        0x6D02,0x6D03,0x6D05,0x6D06,0x6D08,0x6D09,0x6D0A,0x6D0D,/* 0x90-0x97 */
        0x6D0F,0x6D10,0x6D11,0x6D13,0x6D14,0x6D15,0x6D16,0x6D18,/* 0x98-0x9F */
        0x6D1C,0x6D1D,0x6D1F,0x6D20,0x6D21,0x6D22,0x6D23,0x6D24,/* 0xA0-0xA7 */
        0x6D26,0x6D28,0x6D29,0x6D2C,0x6D2D,0x6D2F,0x6D30,0x6D34,/* 0xA8-0xAF */
        0x6D36,0x6D37,0x6D38,0x6D3A,0x6D3F,0x6D40,0x6D42,0x6D44,/* 0xB0-0xB7 */
        0x6D49,0x6D4C,0x6D50,0x6D55,0x6D56,0x6D57,0x6D58,0x6D5B,/* 0xB8-0xBF */
        0x6D5D,0x6D5F,0x6D61,0x6D62,0x6D64,0x6D65,0x6D67,0x6D68,/* 0xC0-0xC7 */
        0x6D6B,0x6D6C,0x6D6D,0x6D70,0x6D71,0x6D72,0x6D73,0x6D75,/* 0xC8-0xCF */
        0x6D76,0x6D79,0x6D7A,0x6D7B,0x6D7D,0x6D7E,0x6D7F,0x6D80,/* 0xD0-0xD7 */
        0x6D81,0x6D83,0x6D84,0x6D86,0x6D87,0x6D8A,0x6D8B,0x6D8D,/* 0xD8-0xDF */
        0x6D8F,0x6D90,0x6D92,0x6D96,0x6D97,0x6D98,0x6D99,0x6D9A,/* 0xE0-0xE7 */
        0x6D9C,0x6DA2,0x6DA5,0x6DAC,0x6DAD,0x6DB0,0x6DB1,0x6DB3,/* 0xE8-0xEF */
        0x6DB4,0x6DB6,0x6DB7,0x6DB9,0x6DBA,0x6DBB,0x6DBC,0x6DBD,/* 0xF0-0xF7 */
        0x6DBE,0x6DC1,0x6DC2,0x6DC3,0x6DC8,0x6DC9,0x6DCA,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_9C[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x6DCD,0x6DCE,0x6DCF,0x6DD0,0x6DD2,0x6DD3,0x6DD4,0x6DD5,/* 0x40-0x47 */
        0x6DD7,0x6DDA,0x6DDB,0x6DDC,0x6DDF,0x6DE2,0x6DE3,0x6DE5,/* 0x48-0x4F */
        0x6DE7,0x6DE8,0x6DE9,0x6DEA,0x6DED,0x6DEF,0x6DF0,0x6DF2,/* 0x50-0x57 */
        0x6DF4,0x6DF5,0x6DF6,0x6DF8,0x6DFA,0x6DFD,0x6DFE,0x6DFF,/* 0x58-0x5F */
        0x6E00,0x6E01,0x6E02,0x6E03,0x6E04,0x6E06,0x6E07,0x6E08,/* 0x60-0x67 */
        0x6E09,0x6E0B,0x6E0F,0x6E12,0x6E13,0x6E15,0x6E18,0x6E19,/* 0x68-0x6F */
        0x6E1B,0x6E1C,0x6E1E,0x6E1F,0x6E22,0x6E26,0x6E27,0x6E28,/* 0x70-0x77 */
        0x6E2A,0x6E2C,0x6E2E,0x6E30,0x6E31,0x6E33,0x6E35,0x0000,/* 0x78-0x7F */

        0x6E36,0x6E37,0x6E39,0x6E3B,0x6E3C,0x6E3D,0x6E3E,0x6E3F,/* 0x80-0x87 */
        0x6E40,0x6E41,0x6E42,0x6E45,0x6E46,0x6E47,0x6E48,0x6E49,/* 0x88-0x8F */
        0x6E4A,0x6E4B,0x6E4C,0x6E4F,0x6E50,0x6E51,0x6E52,0x6E55,/* 0x90-0x97 */
        0x6E57,0x6E59,0x6E5A,0x6E5C,0x6E5D,0x6E5E,0x6E60,0x6E61,/* 0x98-0x9F */
        0x6E62,0x6E63,0x6E64,0x6E65,0x6E66,0x6E67,0x6E68,0x6E69,/* 0xA0-0xA7 */
        0x6E6A,0x6E6C,0x6E6D,0x6E6F,0x6E70,0x6E71,0x6E72,0x6E73,/* 0xA8-0xAF */
        0x6E74,0x6E75,0x6E76,0x6E77,0x6E78,0x6E79,0x6E7A,0x6E7B,/* 0xB0-0xB7 */
        0x6E7C,0x6E7D,0x6E80,0x6E81,0x6E82,0x6E84,0x6E87,0x6E88,/* 0xB8-0xBF */
        0x6E8A,0x6E8B,0x6E8C,0x6E8D,0x6E8E,0x6E91,0x6E92,0x6E93,/* 0xC0-0xC7 */
        0x6E94,0x6E95,0x6E96,0x6E97,0x6E99,0x6E9A,0x6E9B,0x6E9D,/* 0xC8-0xCF */
        0x6E9E,0x6EA0,0x6EA1,0x6EA3,0x6EA4,0x6EA6,0x6EA8,0x6EA9,/* 0xD0-0xD7 */
        0x6EAB,0x6EAC,0x6EAD,0x6EAE,0x6EB0,0x6EB3,0x6EB5,0x6EB8,/* 0xD8-0xDF */
        0x6EB9,0x6EBC,0x6EBE,0x6EBF,0x6EC0,0x6EC3,0x6EC4,0x6EC5,/* 0xE0-0xE7 */
        0x6EC6,0x6EC8,0x6EC9,0x6ECA,0x6ECC,0x6ECD,0x6ECE,0x6ED0,/* 0xE8-0xEF */
        0x6ED2,0x6ED6,0x6ED8,0x6ED9,0x6EDB,0x6EDC,0x6EDD,0x6EE3,/* 0xF0-0xF7 */
        0x6EE7,0x6EEA,0x6EEB,0x6EEC,0x6EED,0x6EEE,0x6EEF,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_9D[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x6EF0,0x6EF1,0x6EF2,0x6EF3,0x6EF5,0x6EF6,0x6EF7,0x6EF8,/* 0x40-0x47 */
        0x6EFA,0x6EFB,0x6EFC,0x6EFD,0x6EFE,0x6EFF,0x6F00,0x6F01,/* 0x48-0x4F */
        0x6F03,0x6F04,0x6F05,0x6F07,0x6F08,0x6F0A,0x6F0B,0x6F0C,/* 0x50-0x57 */
        0x6F0D,0x6F0E,0x6F10,0x6F11,0x6F12,0x6F16,0x6F17,0x6F18,/* 0x58-0x5F */
        0x6F19,0x6F1A,0x6F1B,0x6F1C,0x6F1D,0x6F1E,0x6F1F,0x6F21,/* 0x60-0x67 */
        0x6F22,0x6F23,0x6F25,0x6F26,0x6F27,0x6F28,0x6F2C,0x6F2E,/* 0x68-0x6F */
        0x6F30,0x6F32,0x6F34,0x6F35,0x6F37,0x6F38,0x6F39,0x6F3A,/* 0x70-0x77 */
        0x6F3B,0x6F3C,0x6F3D,0x6F3F,0x6F40,0x6F41,0x6F42,0x0000,/* 0x78-0x7F */

        0x6F43,0x6F44,0x6F45,0x6F48,0x6F49,0x6F4A,0x6F4C,0x6F4E,/* 0x80-0x87 */
        0x6F4F,0x6F50,0x6F51,0x6F52,0x6F53,0x6F54,0x6F55,0x6F56,/* 0x88-0x8F */
        0x6F57,0x6F59,0x6F5A,0x6F5B,0x6F5D,0x6F5F,0x6F60,0x6F61,/* 0x90-0x97 */
        0x6F63,0x6F64,0x6F65,0x6F67,0x6F68,0x6F69,0x6F6A,0x6F6B,/* 0x98-0x9F */
        0x6F6C,0x6F6F,0x6F70,0x6F71,0x6F73,0x6F75,0x6F76,0x6F77,/* 0xA0-0xA7 */
        0x6F79,0x6F7B,0x6F7D,0x6F7E,0x6F7F,0x6F80,0x6F81,0x6F82,/* 0xA8-0xAF */
        0x6F83,0x6F85,0x6F86,0x6F87,0x6F8A,0x6F8B,0x6F8F,0x6F90,/* 0xB0-0xB7 */
        0x6F91,0x6F92,0x6F93,0x6F94,0x6F95,0x6F96,0x6F97,0x6F98,/* 0xB8-0xBF */
        0x6F99,0x6F9A,0x6F9B,0x6F9D,0x6F9E,0x6F9F,0x6FA0,0x6FA2,/* 0xC0-0xC7 */
        0x6FA3,0x6FA4,0x6FA5,0x6FA6,0x6FA8,0x6FA9,0x6FAA,0x6FAB,/* 0xC8-0xCF */
        0x6FAC,0x6FAD,0x6FAE,0x6FAF,0x6FB0,0x6FB1,0x6FB2,0x6FB4,/* 0xD0-0xD7 */
        0x6FB5,0x6FB7,0x6FB8,0x6FBA,0x6FBB,0x6FBC,0x6FBD,0x6FBE,/* 0xD8-0xDF */
        0x6FBF,0x6FC1,0x6FC3,0x6FC4,0x6FC5,0x6FC6,0x6FC7,0x6FC8,/* 0xE0-0xE7 */
        0x6FCA,0x6FCB,0x6FCC,0x6FCD,0x6FCE,0x6FCF,0x6FD0,0x6FD3,/* 0xE8-0xEF */
        0x6FD4,0x6FD5,0x6FD6,0x6FD7,0x6FD8,0x6FD9,0x6FDA,0x6FDB,/* 0xF0-0xF7 */
        0x6FDC,0x6FDD,0x6FDF,0x6FE2,0x6FE3,0x6FE4,0x6FE5,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_9E[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x6FE6,0x6FE7,0x6FE8,0x6FE9,0x6FEA,0x6FEB,0x6FEC,0x6FED,/* 0x40-0x47 */
        0x6FF0,0x6FF1,0x6FF2,0x6FF3,0x6FF4,0x6FF5,0x6FF6,0x6FF7,/* 0x48-0x4F */
        0x6FF8,0x6FF9,0x6FFA,0x6FFB,0x6FFC,0x6FFD,0x6FFE,0x6FFF,/* 0x50-0x57 */
        0x7000,0x7001,0x7002,0x7003,0x7004,0x7005,0x7006,0x7007,/* 0x58-0x5F */
        0x7008,0x7009,0x700A,0x700B,0x700C,0x700D,0x700E,0x700F,/* 0x60-0x67 */
        0x7010,0x7012,0x7013,0x7014,0x7015,0x7016,0x7017,0x7018,/* 0x68-0x6F */
        0x7019,0x701C,0x701D,0x701E,0x701F,0x7020,0x7021,0x7022,/* 0x70-0x77 */
        0x7024,0x7025,0x7026,0x7027,0x7028,0x7029,0x702A,0x0000,/* 0x78-0x7F */

        0x702B,0x702C,0x702D,0x702E,0x702F,0x7030,0x7031,0x7032,/* 0x80-0x87 */
        0x7033,0x7034,0x7036,0x7037,0x7038,0x703A,0x703B,0x703C,/* 0x88-0x8F */
        0x703D,0x703E,0x703F,0x7040,0x7041,0x7042,0x7043,0x7044,/* 0x90-0x97 */
        0x7045,0x7046,0x7047,0x7048,0x7049,0x704A,0x704B,0x704D,/* 0x98-0x9F */
        0x704E,0x7050,0x7051,0x7052,0x7053,0x7054,0x7055,0x7056,/* 0xA0-0xA7 */
        0x7057,0x7058,0x7059,0x705A,0x705B,0x705C,0x705D,0x705F,/* 0xA8-0xAF */
        0x7060,0x7061,0x7062,0x7063,0x7064,0x7065,0x7066,0x7067,/* 0xB0-0xB7 */
        0x7068,0x7069,0x706A,0x706E,0x7071,0x7072,0x7073,0x7074,/* 0xB8-0xBF */
        0x7077,0x7079,0x707A,0x707B,0x707D,0x7081,0x7082,0x7083,/* 0xC0-0xC7 */
        0x7084,0x7086,0x7087,0x7088,0x708B,0x708C,0x708D,0x708F,/* 0xC8-0xCF */
        0x7090,0x7091,0x7093,0x7097,0x7098,0x709A,0x709B,0x709E,/* 0xD0-0xD7 */
        0x709F,0x70A0,0x70A1,0x70A2,0x70A3,0x70A4,0x70A5,0x70A6,/* 0xD8-0xDF */
        0x70A7,0x70A8,0x70A9,0x70AA,0x70B0,0x70B2,0x70B4,0x70B5,/* 0xE0-0xE7 */
        0x70B6,0x70BA,0x70BE,0x70BF,0x70C4,0x70C5,0x70C6,0x70C7,/* 0xE8-0xEF */
        0x70C9,0x70CB,0x70CC,0x70CD,0x70CE,0x70CF,0x70D0,0x70D1,/* 0xF0-0xF7 */
        0x70D2,0x70D3,0x70D4,0x70D5,0x70D6,0x70D7,0x70DA,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_9F[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x70DC,0x70DD,0x70DE,0x70E0,0x70E1,0x70E2,0x70E3,0x70E5,/* 0x40-0x47 */
        0x70EA,0x70EE,0x70F0,0x70F1,0x70F2,0x70F3,0x70F4,0x70F5,/* 0x48-0x4F */
        0x70F6,0x70F8,0x70FA,0x70FB,0x70FC,0x70FE,0x70FF,0x7100,/* 0x50-0x57 */
        0x7101,0x7102,0x7103,0x7104,0x7105,0x7106,0x7107,0x7108,/* 0x58-0x5F */
        0x710B,0x710C,0x710D,0x710E,0x710F,0x7111,0x7112,0x7114,/* 0x60-0x67 */
        0x7117,0x711B,0x711C,0x711D,0x711E,0x711F,0x7120,0x7121,/* 0x68-0x6F */
        0x7122,0x7123,0x7124,0x7125,0x7127,0x7128,0x7129,0x712A,/* 0x70-0x77 */
        0x712B,0x712C,0x712D,0x712E,0x7132,0x7133,0x7134,0x0000,/* 0x78-0x7F */

        0x7135,0x7137,0x7138,0x7139,0x713A,0x713B,0x713C,0x713D,/* 0x80-0x87 */
        0x713E,0x713F,0x7140,0x7141,0x7142,0x7143,0x7144,0x7146,/* 0x88-0x8F */
        0x7147,0x7148,0x7149,0x714B,0x714D,0x714F,0x7150,0x7151,/* 0x90-0x97 */
        0x7152,0x7153,0x7154,0x7155,0x7156,0x7157,0x7158,0x7159,/* 0x98-0x9F */
        0x715A,0x715B,0x715D,0x715F,0x7160,0x7161,0x7162,0x7163,/* 0xA0-0xA7 */
        0x7165,0x7169,0x716A,0x716B,0x716C,0x716D,0x716F,0x7170,/* 0xA8-0xAF */
        0x7171,0x7174,0x7175,0x7176,0x7177,0x7179,0x717B,0x717C,/* 0xB0-0xB7 */
        0x717E,0x717F,0x7180,0x7181,0x7182,0x7183,0x7185,0x7186,/* 0xB8-0xBF */
        0x7187,0x7188,0x7189,0x718B,0x718C,0x718D,0x718E,0x7190,/* 0xC0-0xC7 */
        0x7191,0x7192,0x7193,0x7195,0x7196,0x7197,0x719A,0x719B,/* 0xC8-0xCF */
        0x719C,0x719D,0x719E,0x71A1,0x71A2,0x71A3,0x71A4,0x71A5,/* 0xD0-0xD7 */
        0x71A6,0x71A7,0x71A9,0x71AA,0x71AB,0x71AD,0x71AE,0x71AF,/* 0xD8-0xDF */
        0x71B0,0x71B1,0x71B2,0x71B4,0x71B6,0x71B7,0x71B8,0x71BA,/* 0xE0-0xE7 */
        0x71BB,0x71BC,0x71BD,0x71BE,0x71BF,0x71C0,0x71C1,0x71C2,/* 0xE8-0xEF */
        0x71C4,0x71C5,0x71C6,0x71C7,0x71C8,0x71C9,0x71CA,0x71CB,/* 0xF0-0xF7 */
        0x71CC,0x71CD,0x71CF,0x71D0,0x71D1,0x71D2,0x71D3,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_A0[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x71D6,0x71D7,0x71D8,0x71D9,0x71DA,0x71DB,0x71DC,0x71DD,/* 0x40-0x47 */
        0x71DE,0x71DF,0x71E1,0x71E2,0x71E3,0x71E4,0x71E6,0x71E8,/* 0x48-0x4F */
        0x71E9,0x71EA,0x71EB,0x71EC,0x71ED,0x71EF,0x71F0,0x71F1,/* 0x50-0x57 */
        0x71F2,0x71F3,0x71F4,0x71F5,0x71F6,0x71F7,0x71F8,0x71FA,/* 0x58-0x5F */
        0x71FB,0x71FC,0x71FD,0x71FE,0x71FF,0x7200,0x7201,0x7202,/* 0x60-0x67 */
        0x7203,0x7204,0x7205,0x7207,0x7208,0x7209,0x720A,0x720B,/* 0x68-0x6F */
        0x720C,0x720D,0x720E,0x720F,0x7210,0x7211,0x7212,0x7213,/* 0x70-0x77 */
        0x7214,0x7215,0x7216,0x7217,0x7218,0x7219,0x721A,0x0000,/* 0x78-0x7F */

        0x721B,0x721C,0x721E,0x721F,0x7220,0x7221,0x7222,0x7223,/* 0x80-0x87 */
        0x7224,0x7225,0x7226,0x7227,0x7229,0x722B,0x722D,0x722E,/* 0x88-0x8F */
        0x722F,0x7232,0x7233,0x7234,0x723A,0x723C,0x723E,0x7240,/* 0x90-0x97 */
        0x7241,0x7242,0x7243,0x7244,0x7245,0x7246,0x7249,0x724A,/* 0x98-0x9F */
        0x724B,0x724E,0x724F,0x7250,0x7251,0x7253,0x7254,0x7255,/* 0xA0-0xA7 */
        0x7257,0x7258,0x725A,0x725C,0x725E,0x7260,0x7263,0x7264,/* 0xA8-0xAF */
        0x7265,0x7268,0x726A,0x726B,0x726C,0x726D,0x7270,0x7271,/* 0xB0-0xB7 */
        0x7273,0x7274,0x7276,0x7277,0x7278,0x727B,0x727C,0x727D,/* 0xB8-0xBF */
        0x7282,0x7283,0x7285,0x7286,0x7287,0x7288,0x7289,0x728C,/* 0xC0-0xC7 */
        0x728E,0x7290,0x7291,0x7293,0x7294,0x7295,0x7296,0x7297,/* 0xC8-0xCF */
        0x7298,0x7299,0x729A,0x729B,0x729C,0x729D,0x729E,0x72A0,/* 0xD0-0xD7 */
        0x72A1,0x72A2,0x72A3,0x72A4,0x72A5,0x72A6,0x72A7,0x72A8,/* 0xD8-0xDF */
        0x72A9,0x72AA,0x72AB,0x72AE,0x72B1,0x72B2,0x72B3,0x72B5,/* 0xE0-0xE7 */
        0x72BA,0x72BB,0x72BC,0x72BD,0x72BE,0x72BF,0x72C0,0x72C5,/* 0xE8-0xEF */
        0x72C6,0x72C7,0x72C9,0x72CA,0x72CB,0x72CC,0x72CF,0x72D1,/* 0xF0-0xF7 */
        0x72D3,0x72D4,0x72D5,0x72D6,0x72D8,0x72DA,0x72DB,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_A1[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */

        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
        0x0000,0x3000,0x3001,0x3002,0x00B7,0x02C9,0x02C7,0x00A8,/* 0xA0-0xA7 */
        0x3003,0x3005,0x2014,0xFF5E,0x2016,0x2026,0x2018,0x2019,/* 0xA8-0xAF */
        0x201C,0x201D,0x3014,0x3015,0x3008,0x3009,0x300A,0x300B,/* 0xB0-0xB7 */
        0x300C,0x300D,0x300E,0x300F,0x3016,0x3017,0x3010,0x3011,/* 0xB8-0xBF */
        0x00B1,0x00D7,0x00F7,0x2236,0x2227,0x2228,0x2211,0x220F,/* 0xC0-0xC7 */
        0x222A,0x2229,0x2208,0x2237,0x221A,0x22A5,0x2225,0x2220,/* 0xC8-0xCF */
        0x2312,0x2299,0x222B,0x222E,0x2261,0x224C,0x2248,0x223D,/* 0xD0-0xD7 */
        0x221D,0x2260,0x226E,0x226F,0x2264,0x2265,0x221E,0x2235,/* 0xD8-0xDF */
        0x2234,0x2642,0x2640,0x00B0,0x2032,0x2033,0x2103,0xFF04,/* 0xE0-0xE7 */
        0x00A4,0xFFE0,0xFFE1,0x2030,0x00A7,0x2116,0x2606,0x2605,/* 0xE8-0xEF */
        0x25CB,0x25CF,0x25CE,0x25C7,0x25C6,0x25A1,0x25A0,0x25B3,/* 0xF0-0xF7 */
        0x25B2,0x203B,0x2192,0x2190,0x2191,0x2193,0x3013,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_A2[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */

        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
        0x0000,0x2170,0x2171,0x2172,0x2173,0x2174,0x2175,0x2176,/* 0xA0-0xA7 */
        0x2177,0x2178,0x2179,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA8-0xAF */
        0x0000,0x2488,0x2489,0x248A,0x248B,0x248C,0x248D,0x248E,/* 0xB0-0xB7 */
        0x248F,0x2490,0x2491,0x2492,0x2493,0x2494,0x2495,0x2496,/* 0xB8-0xBF */
        0x2497,0x2498,0x2499,0x249A,0x249B,0x2474,0x2475,0x2476,/* 0xC0-0xC7 */
        0x2477,0x2478,0x2479,0x247A,0x247B,0x247C,0x247D,0x247E,/* 0xC8-0xCF */
        0x247F,0x2480,0x2481,0x2482,0x2483,0x2484,0x2485,0x2486,/* 0xD0-0xD7 */
        0x2487,0x2460,0x2461,0x2462,0x2463,0x2464,0x2465,0x2466,/* 0xD8-0xDF */
        0x2467,0x2468,0x2469,0x0000,0x0000,0x3220,0x3221,0x3222,/* 0xE0-0xE7 */
        0x3223,0x3224,0x3225,0x3226,0x3227,0x3228,0x3229,0x0000,/* 0xE8-0xEF */
        0x0000,0x2160,0x2161,0x2162,0x2163,0x2164,0x2165,0x2166,/* 0xF0-0xF7 */
        0x2167,0x2168,0x2169,0x216A,0x216B,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_A3[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */

        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
        0x0000,0xFF01,0xFF02,0xFF03,0xFFE5,0xFF05,0xFF06,0xFF07,/* 0xA0-0xA7 */
        0xFF08,0xFF09,0xFF0A,0xFF0B,0xFF0C,0xFF0D,0xFF0E,0xFF0F,/* 0xA8-0xAF */
        0xFF10,0xFF11,0xFF12,0xFF13,0xFF14,0xFF15,0xFF16,0xFF17,/* 0xB0-0xB7 */
        0xFF18,0xFF19,0xFF1A,0xFF1B,0xFF1C,0xFF1D,0xFF1E,0xFF1F,/* 0xB8-0xBF */
        0xFF20,0xFF21,0xFF22,0xFF23,0xFF24,0xFF25,0xFF26,0xFF27,/* 0xC0-0xC7 */
        0xFF28,0xFF29,0xFF2A,0xFF2B,0xFF2C,0xFF2D,0xFF2E,0xFF2F,/* 0xC8-0xCF */
        0xFF30,0xFF31,0xFF32,0xFF33,0xFF34,0xFF35,0xFF36,0xFF37,/* 0xD0-0xD7 */
        0xFF38,0xFF39,0xFF3A,0xFF3B,0xFF3C,0xFF3D,0xFF3E,0xFF3F,/* 0xD8-0xDF */
        0xFF40,0xFF41,0xFF42,0xFF43,0xFF44,0xFF45,0xFF46,0xFF47,/* 0xE0-0xE7 */
        0xFF48,0xFF49,0xFF4A,0xFF4B,0xFF4C,0xFF4D,0xFF4E,0xFF4F,/* 0xE8-0xEF */
        0xFF50,0xFF51,0xFF52,0xFF53,0xFF54,0xFF55,0xFF56,0xFF57,/* 0xF0-0xF7 */
        0xFF58,0xFF59,0xFF5A,0xFF5B,0xFF5C,0xFF5D,0xFFE3,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_A4[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */

        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
        0x0000,0x3041,0x3042,0x3043,0x3044,0x3045,0x3046,0x3047,/* 0xA0-0xA7 */
        0x3048,0x3049,0x304A,0x304B,0x304C,0x304D,0x304E,0x304F,/* 0xA8-0xAF */
        0x3050,0x3051,0x3052,0x3053,0x3054,0x3055,0x3056,0x3057,/* 0xB0-0xB7 */
        0x3058,0x3059,0x305A,0x305B,0x305C,0x305D,0x305E,0x305F,/* 0xB8-0xBF */
        0x3060,0x3061,0x3062,0x3063,0x3064,0x3065,0x3066,0x3067,/* 0xC0-0xC7 */
        0x3068,0x3069,0x306A,0x306B,0x306C,0x306D,0x306E,0x306F,/* 0xC8-0xCF */
        0x3070,0x3071,0x3072,0x3073,0x3074,0x3075,0x3076,0x3077,/* 0xD0-0xD7 */
        0x3078,0x3079,0x307A,0x307B,0x307C,0x307D,0x307E,0x307F,/* 0xD8-0xDF */
        0x3080,0x3081,0x3082,0x3083,0x3084,0x3085,0x3086,0x3087,/* 0xE0-0xE7 */
        0x3088,0x3089,0x308A,0x308B,0x308C,0x308D,0x308E,0x308F,/* 0xE8-0xEF */
        0x3090,0x3091,0x3092,0x3093,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */
};

static const wchar_t c2u_A5[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */

        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
        0x0000,0x30A1,0x30A2,0x30A3,0x30A4,0x30A5,0x30A6,0x30A7,/* 0xA0-0xA7 */
        0x30A8,0x30A9,0x30AA,0x30AB,0x30AC,0x30AD,0x30AE,0x30AF,/* 0xA8-0xAF */
        0x30B0,0x30B1,0x30B2,0x30B3,0x30B4,0x30B5,0x30B6,0x30B7,/* 0xB0-0xB7 */
        0x30B8,0x30B9,0x30BA,0x30BB,0x30BC,0x30BD,0x30BE,0x30BF,/* 0xB8-0xBF */
        0x30C0,0x30C1,0x30C2,0x30C3,0x30C4,0x30C5,0x30C6,0x30C7,/* 0xC0-0xC7 */
        0x30C8,0x30C9,0x30CA,0x30CB,0x30CC,0x30CD,0x30CE,0x30CF,/* 0xC8-0xCF */
        0x30D0,0x30D1,0x30D2,0x30D3,0x30D4,0x30D5,0x30D6,0x30D7,/* 0xD0-0xD7 */
        0x30D8,0x30D9,0x30DA,0x30DB,0x30DC,0x30DD,0x30DE,0x30DF,/* 0xD8-0xDF */
        0x30E0,0x30E1,0x30E2,0x30E3,0x30E4,0x30E5,0x30E6,0x30E7,/* 0xE0-0xE7 */
        0x30E8,0x30E9,0x30EA,0x30EB,0x30EC,0x30ED,0x30EE,0x30EF,/* 0xE8-0xEF */
        0x30F0,0x30F1,0x30F2,0x30F3,0x30F4,0x30F5,0x30F6,0x0000,/* 0xF0-0xF7 */
};

static const wchar_t c2u_A6[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */

        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
        0x0000,0x0391,0x0392,0x0393,0x0394,0x0395,0x0396,0x0397,/* 0xA0-0xA7 */
        0x0398,0x0399,0x039A,0x039B,0x039C,0x039D,0x039E,0x039F,/* 0xA8-0xAF */
        0x03A0,0x03A1,0x03A3,0x03A4,0x03A5,0x03A6,0x03A7,0x03A8,/* 0xB0-0xB7 */
        0x03A9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xB8-0xBF */
        0x0000,0x03B1,0x03B2,0x03B3,0x03B4,0x03B5,0x03B6,0x03B7,/* 0xC0-0xC7 */
        0x03B8,0x03B9,0x03BA,0x03BB,0x03BC,0x03BD,0x03BE,0x03BF,/* 0xC8-0xCF */
        0x03C0,0x03C1,0x03C3,0x03C4,0x03C5,0x03C6,0x03C7,0x03C8,/* 0xD0-0xD7 */
        0x03C9,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xD8-0xDF */
        0xFE35,0xFE36,0xFE39,0xFE3A,0xFE3F,0xFE40,0xFE3D,0xFE3E,/* 0xE0-0xE7 */
        0xFE41,0xFE42,0xFE43,0xFE44,0x0000,0x0000,0xFE3B,0xFE3C,/* 0xE8-0xEF */
        0xFE37,0xFE38,0xFE31,0x0000,0xFE33,0xFE34,0x0000,0x0000,/* 0xF0-0xF7 */
};

static const wchar_t c2u_A7[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x40-0x47 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x48-0x4F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x50-0x57 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x58-0x5F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x60-0x67 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x68-0x6F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x70-0x77 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x78-0x7F */

        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x80-0x87 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x90-0x97 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
        0x0000,0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0401,/* 0xA0-0xA7 */
        0x0416,0x0417,0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,/* 0xA8-0xAF */
        0x041E,0x041F,0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,/* 0xB0-0xB7 */
        0x0426,0x0427,0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,/* 0xB8-0xBF */
        0x042E,0x042F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC0-0xC7 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xC8-0xCF */
        0x0000,0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0451,/* 0xD0-0xD7 */
        0x0436,0x0437,0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,/* 0xD8-0xDF */
        0x043E,0x043F,0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,/* 0xE0-0xE7 */
        0x0446,0x0447,0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,/* 0xE8-0xEF */
        0x044E,0x044F,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF0-0xF7 */
};

static const wchar_t c2u_A8[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x02CA,0x02CB,0x02D9,0x2013,0x2015,0x2025,0x2035,0x2105,/* 0x40-0x47 */
        0x2109,0x2196,0x2197,0x2198,0x2199,0x2215,0x221F,0x2223,/* 0x48-0x4F */
        0x2252,0x2266,0x2267,0x22BF,0x2550,0x2551,0x2552,0x2553,/* 0x50-0x57 */
        0x2554,0x2555,0x2556,0x2557,0x2558,0x2559,0x255A,0x255B,/* 0x58-0x5F */
        0x255C,0x255D,0x255E,0x255F,0x2560,0x2561,0x2562,0x2563,/* 0x60-0x67 */
        0x2564,0x2565,0x2566,0x2567,0x2568,0x2569,0x256A,0x256B,/* 0x68-0x6F */
        0x256C,0x256D,0x256E,0x256F,0x2570,0x2571,0x2572,0x2573,/* 0x70-0x77 */
        0x2581,0x2582,0x2583,0x2584,0x2585,0x2586,0x2587,0x0000,/* 0x78-0x7F */

        0x2588,0x2589,0x258A,0x258B,0x258C,0x258D,0x258E,0x258F,/* 0x80-0x87 */
        0x2593,0x2594,0x2595,0x25BC,0x25BD,0x25E2,0x25E3,0x25E4,/* 0x88-0x8F */
        0x25E5,0x2609,0x2295,0x3012,0x301D,0x301E,0x0000,0x0000,/* 0x90-0x97 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
        0x0000,0x0101,0x00E1,0x01CE,0x00E0,0x0113,0x00E9,0x011B,/* 0xA0-0xA7 */
        0x00E8,0x012B,0x00ED,0x01D0,0x00EC,0x014D,0x00F3,0x01D2,/* 0xA8-0xAF */
        0x00F2,0x016B,0x00FA,0x01D4,0x00F9,0x01D6,0x01D8,0x01DA,/* 0xB0-0xB7 */
        0x01DC,0x00FC,0x00EA,0x0251,0x0000,0x0144,0x0148,0x0000,/* 0xB8-0xBF */
        0x0261,0x0000,0x0000,0x0000,0x0000,0x3105,0x3106,0x3107,/* 0xC0-0xC7 */
        0x3108,0x3109,0x310A,0x310B,0x310C,0x310D,0x310E,0x310F,/* 0xC8-0xCF */
        0x3110,0x3111,0x3112,0x3113,0x3114,0x3115,0x3116,0x3117,/* 0xD0-0xD7 */
        0x3118,0x3119,0x311A,0x311B,0x311C,0x311D,0x311E,0x311F,/* 0xD8-0xDF */
        0x3120,0x3121,0x3122,0x3123,0x3124,0x3125,0x3126,0x3127,/* 0xE0-0xE7 */
        0x3128,0x3129,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xE8-0xEF */
};

static const wchar_t c2u_A9[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x3021,0x3022,0x3023,0x3024,0x3025,0x3026,0x3027,0x3028,/* 0x40-0x47 */
        0x3029,0x32A3,0x338E,0x338F,0x339C,0x339D,0x339E,0x33A1,/* 0x48-0x4F */
        0x33C4,0x33CE,0x33D1,0x33D2,0x33D5,0xFE30,0xFFE2,0xFFE4,/* 0x50-0x57 */
        0x0000,0x2121,0x3231,0x0000,0x2010,0x0000,0x0000,0x0000,/* 0x58-0x5F */
        0x30FC,0x309B,0x309C,0x30FD,0x30FE,0x3006,0x309D,0x309E,/* 0x60-0x67 */
        0xFE49,0xFE4A,0xFE4B,0xFE4C,0xFE4D,0xFE4E,0xFE4F,0xFE50,/* 0x68-0x6F */
        0xFE51,0xFE52,0xFE54,0xFE55,0xFE56,0xFE57,0xFE59,0xFE5A,/* 0x70-0x77 */
        0xFE5B,0xFE5C,0xFE5D,0xFE5E,0xFE5F,0xFE60,0xFE61,0x0000,/* 0x78-0x7F */

        0xFE62,0xFE63,0xFE64,0xFE65,0xFE66,0xFE68,0xFE69,0xFE6A,/* 0x80-0x87 */
        0xFE6B,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x88-0x8F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x3007,0x0000,/* 0x90-0x97 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x98-0x9F */
        0x0000,0x0000,0x0000,0x0000,0x2500,0x2501,0x2502,0x2503,/* 0xA0-0xA7 */
        0x2504,0x2505,0x2506,0x2507,0x2508,0x2509,0x250A,0x250B,/* 0xA8-0xAF */
        0x250C,0x250D,0x250E,0x250F,0x2510,0x2511,0x2512,0x2513,/* 0xB0-0xB7 */
        0x2514,0x2515,0x2516,0x2517,0x2518,0x2519,0x251A,0x251B,/* 0xB8-0xBF */
        0x251C,0x251D,0x251E,0x251F,0x2520,0x2521,0x2522,0x2523,/* 0xC0-0xC7 */
        0x2524,0x2525,0x2526,0x2527,0x2528,0x2529,0x252A,0x252B,/* 0xC8-0xCF */
        0x252C,0x252D,0x252E,0x252F,0x2530,0x2531,0x2532,0x2533,/* 0xD0-0xD7 */
        0x2534,0x2535,0x2536,0x2537,0x2538,0x2539,0x253A,0x253B,/* 0xD8-0xDF */
        0x253C,0x253D,0x253E,0x253F,0x2540,0x2541,0x2542,0x2543,/* 0xE0-0xE7 */
        0x2544,0x2545,0x2546,0x2547,0x2548,0x2549,0x254A,0x254B,/* 0xE8-0xEF */
};

static const wchar_t c2u_AA[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x72DC,0x72DD,0x72DF,0x72E2,0x72E3,0x72E4,0x72E5,0x72E6,/* 0x40-0x47 */
        0x72E7,0x72EA,0x72EB,0x72F5,0x72F6,0x72F9,0x72FD,0x72FE,/* 0x48-0x4F */
        0x72FF,0x7300,0x7302,0x7304,0x7305,0x7306,0x7307,0x7308,/* 0x50-0x57 */
        0x7309,0x730B,0x730C,0x730D,0x730F,0x7310,0x7311,0x7312,/* 0x58-0x5F */
        0x7314,0x7318,0x7319,0x731A,0x731F,0x7320,0x7323,0x7324,/* 0x60-0x67 */
        0x7326,0x7327,0x7328,0x732D,0x732F,0x7330,0x7332,0x7333,/* 0x68-0x6F */
        0x7335,0x7336,0x733A,0x733B,0x733C,0x733D,0x7340,0x7341,/* 0x70-0x77 */
        0x7342,0x7343,0x7344,0x7345,0x7346,0x7347,0x7348,0x0000,/* 0x78-0x7F */

        0x7349,0x734A,0x734B,0x734C,0x734E,0x734F,0x7351,0x7353,/* 0x80-0x87 */
        0x7354,0x7355,0x7356,0x7358,0x7359,0x735A,0x735B,0x735C,/* 0x88-0x8F */
        0x735D,0x735E,0x735F,0x7361,0x7362,0x7363,0x7364,0x7365,/* 0x90-0x97 */
        0x7366,0x7367,0x7368,0x7369,0x736A,0x736B,0x736E,0x7370,/* 0x98-0x9F */
        0x7371,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
};

static const wchar_t c2u_AB[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7372,0x7373,0x7374,0x7375,0x7376,0x7377,0x7378,0x7379,/* 0x40-0x47 */
        0x737A,0x737B,0x737C,0x737D,0x737F,0x7380,0x7381,0x7382,/* 0x48-0x4F */
        0x7383,0x7385,0x7386,0x7388,0x738A,0x738C,0x738D,0x738F,/* 0x50-0x57 */
        0x7390,0x7392,0x7393,0x7394,0x7395,0x7397,0x7398,0x7399,/* 0x58-0x5F */
        0x739A,0x739C,0x739D,0x739E,0x73A0,0x73A1,0x73A3,0x73A4,/* 0x60-0x67 */
        0x73A5,0x73A6,0x73A7,0x73A8,0x73AA,0x73AC,0x73AD,0x73B1,/* 0x68-0x6F */
        0x73B4,0x73B5,0x73B6,0x73B8,0x73B9,0x73BC,0x73BD,0x73BE,/* 0x70-0x77 */
        0x73BF,0x73C1,0x73C3,0x73C4,0x73C5,0x73C6,0x73C7,0x0000,/* 0x78-0x7F */

        0x73CB,0x73CC,0x73CE,0x73D2,0x73D3,0x73D4,0x73D5,0x73D6,/* 0x80-0x87 */
        0x73D7,0x73D8,0x73DA,0x73DB,0x73DC,0x73DD,0x73DF,0x73E1,/* 0x88-0x8F */
        0x73E2,0x73E3,0x73E4,0x73E6,0x73E8,0x73EA,0x73EB,0x73EC,/* 0x90-0x97 */
        0x73EE,0x73EF,0x73F0,0x73F1,0x73F3,0x73F4,0x73F5,0x73F6,/* 0x98-0x9F */
        0x73F7,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
};

static const wchar_t c2u_AC[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x73F8,0x73F9,0x73FA,0x73FB,0x73FC,0x73FD,0x73FE,0x73FF,/* 0x40-0x47 */
        0x7400,0x7401,0x7402,0x7404,0x7407,0x7408,0x740B,0x740C,/* 0x48-0x4F */
        0x740D,0x740E,0x7411,0x7412,0x7413,0x7414,0x7415,0x7416,/* 0x50-0x57 */
        0x7417,0x7418,0x7419,0x741C,0x741D,0x741E,0x741F,0x7420,/* 0x58-0x5F */
        0x7421,0x7423,0x7424,0x7427,0x7429,0x742B,0x742D,0x742F,/* 0x60-0x67 */
        0x7431,0x7432,0x7437,0x7438,0x7439,0x743A,0x743B,0x743D,/* 0x68-0x6F */
        0x743E,0x743F,0x7440,0x7442,0x7443,0x7444,0x7445,0x7446,/* 0x70-0x77 */
        0x7447,0x7448,0x7449,0x744A,0x744B,0x744C,0x744D,0x0000,/* 0x78-0x7F */

        0x744E,0x744F,0x7450,0x7451,0x7452,0x7453,0x7454,0x7456,/* 0x80-0x87 */
        0x7458,0x745D,0x7460,0x7461,0x7462,0x7463,0x7464,0x7465,/* 0x88-0x8F */
        0x7466,0x7467,0x7468,0x7469,0x746A,0x746B,0x746C,0x746E,/* 0x90-0x97 */
        0x746F,0x7471,0x7472,0x7473,0x7474,0x7475,0x7478,0x7479,/* 0x98-0x9F */
        0x747A,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
};

static const wchar_t c2u_AD[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x747B,0x747C,0x747D,0x747F,0x7482,0x7484,0x7485,0x7486,/* 0x40-0x47 */
        0x7488,0x7489,0x748A,0x748C,0x748D,0x748F,0x7491,0x7492,/* 0x48-0x4F */
        0x7493,0x7494,0x7495,0x7496,0x7497,0x7498,0x7499,0x749A,/* 0x50-0x57 */
        0x749B,0x749D,0x749F,0x74A0,0x74A1,0x74A2,0x74A3,0x74A4,/* 0x58-0x5F */
        0x74A5,0x74A6,0x74AA,0x74AB,0x74AC,0x74AD,0x74AE,0x74AF,/* 0x60-0x67 */
        0x74B0,0x74B1,0x74B2,0x74B3,0x74B4,0x74B5,0x74B6,0x74B7,/* 0x68-0x6F */
        0x74B8,0x74B9,0x74BB,0x74BC,0x74BD,0x74BE,0x74BF,0x74C0,/* 0x70-0x77 */
        0x74C1,0x74C2,0x74C3,0x74C4,0x74C5,0x74C6,0x74C7,0x0000,/* 0x78-0x7F */

        0x74C8,0x74C9,0x74CA,0x74CB,0x74CC,0x74CD,0x74CE,0x74CF,/* 0x80-0x87 */
        0x74D0,0x74D1,0x74D3,0x74D4,0x74D5,0x74D6,0x74D7,0x74D8,/* 0x88-0x8F */
        0x74D9,0x74DA,0x74DB,0x74DD,0x74DF,0x74E1,0x74E5,0x74E7,/* 0x90-0x97 */
        0x74E8,0x74E9,0x74EA,0x74EB,0x74EC,0x74ED,0x74F0,0x74F1,/* 0x98-0x9F */
        0x74F2,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
};

static const wchar_t c2u_AE[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x74F3,0x74F5,0x74F8,0x74F9,0x74FA,0x74FB,0x74FC,0x74FD,/* 0x40-0x47 */
        0x74FE,0x7500,0x7501,0x7502,0x7503,0x7505,0x7506,0x7507,/* 0x48-0x4F */
        0x7508,0x7509,0x750A,0x750B,0x750C,0x750E,0x7510,0x7512,/* 0x50-0x57 */
        0x7514,0x7515,0x7516,0x7517,0x751B,0x751D,0x751E,0x7520,/* 0x58-0x5F */
        0x7521,0x7522,0x7523,0x7524,0x7526,0x7527,0x752A,0x752E,/* 0x60-0x67 */
        0x7534,0x7536,0x7539,0x753C,0x753D,0x753F,0x7541,0x7542,/* 0x68-0x6F */
        0x7543,0x7544,0x7546,0x7547,0x7549,0x754A,0x754D,0x7550,/* 0x70-0x77 */
        0x7551,0x7552,0x7553,0x7555,0x7556,0x7557,0x7558,0x0000,/* 0x78-0x7F */

        0x755D,0x755E,0x755F,0x7560,0x7561,0x7562,0x7563,0x7564,/* 0x80-0x87 */
        0x7567,0x7568,0x7569,0x756B,0x756C,0x756D,0x756E,0x756F,/* 0x88-0x8F */
        0x7570,0x7571,0x7573,0x7575,0x7576,0x7577,0x757A,0x757B,/* 0x90-0x97 */
        0x757C,0x757D,0x757E,0x7580,0x7581,0x7582,0x7584,0x7585,/* 0x98-0x9F */
        0x7587,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
};

static const wchar_t c2u_AF[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7588,0x7589,0x758A,0x758C,0x758D,0x758E,0x7590,0x7593,/* 0x40-0x47 */
        0x7595,0x7598,0x759B,0x759C,0x759E,0x75A2,0x75A6,0x75A7,/* 0x48-0x4F */
        0x75A8,0x75A9,0x75AA,0x75AD,0x75B6,0x75B7,0x75BA,0x75BB,/* 0x50-0x57 */
        0x75BF,0x75C0,0x75C1,0x75C6,0x75CB,0x75CC,0x75CE,0x75CF,/* 0x58-0x5F */
        0x75D0,0x75D1,0x75D3,0x75D7,0x75D9,0x75DA,0x75DC,0x75DD,/* 0x60-0x67 */
        0x75DF,0x75E0,0x75E1,0x75E5,0x75E9,0x75EC,0x75ED,0x75EE,/* 0x68-0x6F */
        0x75EF,0x75F2,0x75F3,0x75F5,0x75F6,0x75F7,0x75F8,0x75FA,/* 0x70-0x77 */
        0x75FB,0x75FD,0x75FE,0x7602,0x7604,0x7606,0x7607,0x0000,/* 0x78-0x7F */

        0x7608,0x7609,0x760B,0x760D,0x760E,0x760F,0x7611,0x7612,/* 0x80-0x87 */
        0x7613,0x7614,0x7616,0x761A,0x761C,0x761D,0x761E,0x7621,/* 0x88-0x8F */
        0x7623,0x7627,0x7628,0x762C,0x762E,0x762F,0x7631,0x7632,/* 0x90-0x97 */
        0x7636,0x7637,0x7639,0x763A,0x763B,0x763D,0x7641,0x7642,/* 0x98-0x9F */
        0x7644,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
};

static const wchar_t c2u_B0[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7645,0x7646,0x7647,0x7648,0x7649,0x764A,0x764B,0x764E,/* 0x40-0x47 */
        0x764F,0x7650,0x7651,0x7652,0x7653,0x7655,0x7657,0x7658,/* 0x48-0x4F */
        0x7659,0x765A,0x765B,0x765D,0x765F,0x7660,0x7661,0x7662,/* 0x50-0x57 */
        0x7664,0x7665,0x7666,0x7667,0x7668,0x7669,0x766A,0x766C,/* 0x58-0x5F */
        0x766D,0x766E,0x7670,0x7671,0x7672,0x7673,0x7674,0x7675,/* 0x60-0x67 */
        0x7676,0x7677,0x7679,0x767A,0x767C,0x767F,0x7680,0x7681,/* 0x68-0x6F */
        0x7683,0x7685,0x7689,0x768A,0x768C,0x768D,0x768F,0x7690,/* 0x70-0x77 */
        0x7692,0x7694,0x7695,0x7697,0x7698,0x769A,0x769B,0x0000,/* 0x78-0x7F */

        0x769C,0x769D,0x769E,0x769F,0x76A0,0x76A1,0x76A2,0x76A3,/* 0x80-0x87 */
        0x76A5,0x76A6,0x76A7,0x76A8,0x76A9,0x76AA,0x76AB,0x76AC,/* 0x88-0x8F */
        0x76AD,0x76AF,0x76B0,0x76B3,0x76B5,0x76B6,0x76B7,0x76B8,/* 0x90-0x97 */
        0x76B9,0x76BA,0x76BB,0x76BC,0x76BD,0x76BE,0x76C0,0x76C1,/* 0x98-0x9F */
        0x76C3,0x554A,0x963F,0x57C3,0x6328,0x54CE,0x5509,0x54C0,/* 0xA0-0xA7 */
        0x7691,0x764C,0x853C,0x77EE,0x827E,0x788D,0x7231,0x9698,/* 0xA8-0xAF */
        0x978D,0x6C28,0x5B89,0x4FFA,0x6309,0x6697,0x5CB8,0x80FA,/* 0xB0-0xB7 */
        0x6848,0x80AE,0x6602,0x76CE,0x51F9,0x6556,0x71AC,0x7FF1,/* 0xB8-0xBF */
        0x8884,0x50B2,0x5965,0x61CA,0x6FB3,0x82AD,0x634C,0x6252,/* 0xC0-0xC7 */
        0x53ED,0x5427,0x7B06,0x516B,0x75A4,0x5DF4,0x62D4,0x8DCB,/* 0xC8-0xCF */
        0x9776,0x628A,0x8019,0x575D,0x9738,0x7F62,0x7238,0x767D,/* 0xD0-0xD7 */
        0x67CF,0x767E,0x6446,0x4F70,0x8D25,0x62DC,0x7A17,0x6591,/* 0xD8-0xDF */
        0x73ED,0x642C,0x6273,0x822C,0x9881,0x677F,0x7248,0x626E,/* 0xE0-0xE7 */
        0x62CC,0x4F34,0x74E3,0x534A,0x529E,0x7ECA,0x90A6,0x5E2E,/* 0xE8-0xEF */
        0x6886,0x699C,0x8180,0x7ED1,0x68D2,0x78C5,0x868C,0x9551,/* 0xF0-0xF7 */
        0x508D,0x8C24,0x82DE,0x80DE,0x5305,0x8912,0x5265,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_B1[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x76C4,0x76C7,0x76C9,0x76CB,0x76CC,0x76D3,0x76D5,0x76D9,/* 0x40-0x47 */
        0x76DA,0x76DC,0x76DD,0x76DE,0x76E0,0x76E1,0x76E2,0x76E3,/* 0x48-0x4F */
        0x76E4,0x76E6,0x76E7,0x76E8,0x76E9,0x76EA,0x76EB,0x76EC,/* 0x50-0x57 */
        0x76ED,0x76F0,0x76F3,0x76F5,0x76F6,0x76F7,0x76FA,0x76FB,/* 0x58-0x5F */
        0x76FD,0x76FF,0x7700,0x7702,0x7703,0x7705,0x7706,0x770A,/* 0x60-0x67 */
        0x770C,0x770E,0x770F,0x7710,0x7711,0x7712,0x7713,0x7714,/* 0x68-0x6F */
        0x7715,0x7716,0x7717,0x7718,0x771B,0x771C,0x771D,0x771E,/* 0x70-0x77 */
        0x7721,0x7723,0x7724,0x7725,0x7727,0x772A,0x772B,0x0000,/* 0x78-0x7F */

        0x772C,0x772E,0x7730,0x7731,0x7732,0x7733,0x7734,0x7739,/* 0x80-0x87 */
        0x773B,0x773D,0x773E,0x773F,0x7742,0x7744,0x7745,0x7746,/* 0x88-0x8F */
        0x7748,0x7749,0x774A,0x774B,0x774C,0x774D,0x774E,0x774F,/* 0x90-0x97 */
        0x7752,0x7753,0x7754,0x7755,0x7756,0x7757,0x7758,0x7759,/* 0x98-0x9F */
        0x775C,0x8584,0x96F9,0x4FDD,0x5821,0x9971,0x5B9D,0x62B1,/* 0xA0-0xA7 */
        0x62A5,0x66B4,0x8C79,0x9C8D,0x7206,0x676F,0x7891,0x60B2,/* 0xA8-0xAF */
        0x5351,0x5317,0x8F88,0x80CC,0x8D1D,0x94A1,0x500D,0x72C8,/* 0xB0-0xB7 */
        0x5907,0x60EB,0x7119,0x88AB,0x5954,0x82EF,0x672C,0x7B28,/* 0xB8-0xBF */
        0x5D29,0x7EF7,0x752D,0x6CF5,0x8E66,0x8FF8,0x903C,0x9F3B,/* 0xC0-0xC7 */
        0x6BD4,0x9119,0x7B14,0x5F7C,0x78A7,0x84D6,0x853D,0x6BD5,/* 0xC8-0xCF */
        0x6BD9,0x6BD6,0x5E01,0x5E87,0x75F9,0x95ED,0x655D,0x5F0A,/* 0xD0-0xD7 */
        0x5FC5,0x8F9F,0x58C1,0x81C2,0x907F,0x965B,0x97AD,0x8FB9,/* 0xD8-0xDF */
        0x7F16,0x8D2C,0x6241,0x4FBF,0x53D8,0x535E,0x8FA8,0x8FA9,/* 0xE0-0xE7 */
        0x8FAB,0x904D,0x6807,0x5F6A,0x8198,0x8868,0x9CD6,0x618B,/* 0xE8-0xEF */
        0x522B,0x762A,0x5F6C,0x658C,0x6FD2,0x6EE8,0x5BBE,0x6448,/* 0xF0-0xF7 */
        0x5175,0x51B0,0x67C4,0x4E19,0x79C9,0x997C,0x70B3,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_B2[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x775D,0x775E,0x775F,0x7760,0x7764,0x7767,0x7769,0x776A,/* 0x40-0x47 */
        0x776D,0x776E,0x776F,0x7770,0x7771,0x7772,0x7773,0x7774,/* 0x48-0x4F */
        0x7775,0x7776,0x7777,0x7778,0x777A,0x777B,0x777C,0x7781,/* 0x50-0x57 */
        0x7782,0x7783,0x7786,0x7787,0x7788,0x7789,0x778A,0x778B,/* 0x58-0x5F */
        0x778F,0x7790,0x7793,0x7794,0x7795,0x7796,0x7797,0x7798,/* 0x60-0x67 */
        0x7799,0x779A,0x779B,0x779C,0x779D,0x779E,0x77A1,0x77A3,/* 0x68-0x6F */
        0x77A4,0x77A6,0x77A8,0x77AB,0x77AD,0x77AE,0x77AF,0x77B1,/* 0x70-0x77 */
        0x77B2,0x77B4,0x77B6,0x77B7,0x77B8,0x77B9,0x77BA,0x0000,/* 0x78-0x7F */

        0x77BC,0x77BE,0x77C0,0x77C1,0x77C2,0x77C3,0x77C4,0x77C5,/* 0x80-0x87 */
        0x77C6,0x77C7,0x77C8,0x77C9,0x77CA,0x77CB,0x77CC,0x77CE,/* 0x88-0x8F */
        0x77CF,0x77D0,0x77D1,0x77D2,0x77D3,0x77D4,0x77D5,0x77D6,/* 0x90-0x97 */
        0x77D8,0x77D9,0x77DA,0x77DD,0x77DE,0x77DF,0x77E0,0x77E1,/* 0x98-0x9F */
        0x77E4,0x75C5,0x5E76,0x73BB,0x83E0,0x64AD,0x62E8,0x94B5,/* 0xA0-0xA7 */
        0x6CE2,0x535A,0x52C3,0x640F,0x94C2,0x7B94,0x4F2F,0x5E1B,/* 0xA8-0xAF */
        0x8236,0x8116,0x818A,0x6E24,0x6CCA,0x9A73,0x6355,0x535C,/* 0xB0-0xB7 */
        0x54FA,0x8865,0x57E0,0x4E0D,0x5E03,0x6B65,0x7C3F,0x90E8,/* 0xB8-0xBF */
        0x6016,0x64E6,0x731C,0x88C1,0x6750,0x624D,0x8D22,0x776C,/* 0xC0-0xC7 */
        0x8E29,0x91C7,0x5F69,0x83DC,0x8521,0x9910,0x53C2,0x8695,/* 0xC8-0xCF */
        0x6B8B,0x60ED,0x60E8,0x707F,0x82CD,0x8231,0x4ED3,0x6CA7,/* 0xD0-0xD7 */
        0x85CF,0x64CD,0x7CD9,0x69FD,0x66F9,0x8349,0x5395,0x7B56,/* 0xD8-0xDF */
        0x4FA7,0x518C,0x6D4B,0x5C42,0x8E6D,0x63D2,0x53C9,0x832C,/* 0xE0-0xE7 */
        0x8336,0x67E5,0x78B4,0x643D,0x5BDF,0x5C94,0x5DEE,0x8BE7,/* 0xE8-0xEF */
        0x62C6,0x67F4,0x8C7A,0x6400,0x63BA,0x8749,0x998B,0x8C17,/* 0xF0-0xF7 */
        0x7F20,0x94F2,0x4EA7,0x9610,0x98A4,0x660C,0x7316,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_B3[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x77E6,0x77E8,0x77EA,0x77EF,0x77F0,0x77F1,0x77F2,0x77F4,/* 0x40-0x47 */
        0x77F5,0x77F7,0x77F9,0x77FA,0x77FB,0x77FC,0x7803,0x7804,/* 0x48-0x4F */
        0x7805,0x7806,0x7807,0x7808,0x780A,0x780B,0x780E,0x780F,/* 0x50-0x57 */
        0x7810,0x7813,0x7815,0x7819,0x781B,0x781E,0x7820,0x7821,/* 0x58-0x5F */
        0x7822,0x7824,0x7828,0x782A,0x782B,0x782E,0x782F,0x7831,/* 0x60-0x67 */
        0x7832,0x7833,0x7835,0x7836,0x783D,0x783F,0x7841,0x7842,/* 0x68-0x6F */
        0x7843,0x7844,0x7846,0x7848,0x7849,0x784A,0x784B,0x784D,/* 0x70-0x77 */
        0x784F,0x7851,0x7853,0x7854,0x7858,0x7859,0x785A,0x0000,/* 0x78-0x7F */

        0x785B,0x785C,0x785E,0x785F,0x7860,0x7861,0x7862,0x7863,/* 0x80-0x87 */
        0x7864,0x7865,0x7866,0x7867,0x7868,0x7869,0x786F,0x7870,/* 0x88-0x8F */
        0x7871,0x7872,0x7873,0x7874,0x7875,0x7876,0x7878,0x7879,/* 0x90-0x97 */
        0x787A,0x787B,0x787D,0x787E,0x787F,0x7880,0x7881,0x7882,/* 0x98-0x9F */
        0x7883,0x573A,0x5C1D,0x5E38,0x957F,0x507F,0x80A0,0x5382,/* 0xA0-0xA7 */
        0x655E,0x7545,0x5531,0x5021,0x8D85,0x6284,0x949E,0x671D,/* 0xA8-0xAF */
        0x5632,0x6F6E,0x5DE2,0x5435,0x7092,0x8F66,0x626F,0x64A4,/* 0xB0-0xB7 */
        0x63A3,0x5F7B,0x6F88,0x90F4,0x81E3,0x8FB0,0x5C18,0x6668,/* 0xB8-0xBF */
        0x5FF1,0x6C89,0x9648,0x8D81,0x886C,0x6491,0x79F0,0x57CE,/* 0xC0-0xC7 */
        0x6A59,0x6210,0x5448,0x4E58,0x7A0B,0x60E9,0x6F84,0x8BDA,/* 0xC8-0xCF */
        0x627F,0x901E,0x9A8B,0x79E4,0x5403,0x75F4,0x6301,0x5319,/* 0xD0-0xD7 */
        0x6C60,0x8FDF,0x5F1B,0x9A70,0x803B,0x9F7F,0x4F88,0x5C3A,/* 0xD8-0xDF */
        0x8D64,0x7FC5,0x65A5,0x70BD,0x5145,0x51B2,0x866B,0x5D07,/* 0xE0-0xE7 */
        0x5BA0,0x62BD,0x916C,0x7574,0x8E0C,0x7A20,0x6101,0x7B79,/* 0xE8-0xEF */
        0x4EC7,0x7EF8,0x7785,0x4E11,0x81ED,0x521D,0x51FA,0x6A71,/* 0xF0-0xF7 */
        0x53A8,0x8E87,0x9504,0x96CF,0x6EC1,0x9664,0x695A,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_B4[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7884,0x7885,0x7886,0x7888,0x788A,0x788B,0x788F,0x7890,/* 0x40-0x47 */
        0x7892,0x7894,0x7895,0x7896,0x7899,0x789D,0x789E,0x78A0,/* 0x48-0x4F */
        0x78A2,0x78A4,0x78A6,0x78A8,0x78A9,0x78AA,0x78AB,0x78AC,/* 0x50-0x57 */
        0x78AD,0x78AE,0x78AF,0x78B5,0x78B6,0x78B7,0x78B8,0x78BA,/* 0x58-0x5F */
        0x78BB,0x78BC,0x78BD,0x78BF,0x78C0,0x78C2,0x78C3,0x78C4,/* 0x60-0x67 */
        0x78C6,0x78C7,0x78C8,0x78CC,0x78CD,0x78CE,0x78CF,0x78D1,/* 0x68-0x6F */
        0x78D2,0x78D3,0x78D6,0x78D7,0x78D8,0x78DA,0x78DB,0x78DC,/* 0x70-0x77 */
        0x78DD,0x78DE,0x78DF,0x78E0,0x78E1,0x78E2,0x78E3,0x0000,/* 0x78-0x7F */

        0x78E4,0x78E5,0x78E6,0x78E7,0x78E9,0x78EA,0x78EB,0x78ED,/* 0x80-0x87 */
        0x78EE,0x78EF,0x78F0,0x78F1,0x78F3,0x78F5,0x78F6,0x78F8,/* 0x88-0x8F */
        0x78F9,0x78FB,0x78FC,0x78FD,0x78FE,0x78FF,0x7900,0x7902,/* 0x90-0x97 */
        0x7903,0x7904,0x7906,0x7907,0x7908,0x7909,0x790A,0x790B,/* 0x98-0x9F */
        0x790C,0x7840,0x50A8,0x77D7,0x6410,0x89E6,0x5904,0x63E3,/* 0xA0-0xA7 */
        0x5DDD,0x7A7F,0x693D,0x4F20,0x8239,0x5598,0x4E32,0x75AE,/* 0xA8-0xAF */
        0x7A97,0x5E62,0x5E8A,0x95EF,0x521B,0x5439,0x708A,0x6376,/* 0xB0-0xB7 */
        0x9524,0x5782,0x6625,0x693F,0x9187,0x5507,0x6DF3,0x7EAF,/* 0xB8-0xBF */
        0x8822,0x6233,0x7EF0,0x75B5,0x8328,0x78C1,0x96CC,0x8F9E,/* 0xC0-0xC7 */
        0x6148,0x74F7,0x8BCD,0x6B64,0x523A,0x8D50,0x6B21,0x806A,/* 0xC8-0xCF */
        0x8471,0x56F1,0x5306,0x4ECE,0x4E1B,0x51D1,0x7C97,0x918B,/* 0xD0-0xD7 */
        0x7C07,0x4FC3,0x8E7F,0x7BE1,0x7A9C,0x6467,0x5D14,0x50AC,/* 0xD8-0xDF */
        0x8106,0x7601,0x7CB9,0x6DEC,0x7FE0,0x6751,0x5B58,0x5BF8,/* 0xE0-0xE7 */
        0x78CB,0x64AE,0x6413,0x63AA,0x632B,0x9519,0x642D,0x8FBE,/* 0xE8-0xEF */
        0x7B54,0x7629,0x6253,0x5927,0x5446,0x6B79,0x50A3,0x6234,/* 0xF0-0xF7 */
        0x5E26,0x6B86,0x4EE3,0x8D37,0x888B,0x5F85,0x902E,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_B5[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x790D,0x790E,0x790F,0x7910,0x7911,0x7912,0x7914,0x7915,/* 0x40-0x47 */
        0x7916,0x7917,0x7918,0x7919,0x791A,0x791B,0x791C,0x791D,/* 0x48-0x4F */
        0x791F,0x7920,0x7921,0x7922,0x7923,0x7925,0x7926,0x7927,/* 0x50-0x57 */
        0x7928,0x7929,0x792A,0x792B,0x792C,0x792D,0x792E,0x792F,/* 0x58-0x5F */
        0x7930,0x7931,0x7932,0x7933,0x7935,0x7936,0x7937,0x7938,/* 0x60-0x67 */
        0x7939,0x793D,0x793F,0x7942,0x7943,0x7944,0x7945,0x7947,/* 0x68-0x6F */
        0x794A,0x794B,0x794C,0x794D,0x794E,0x794F,0x7950,0x7951,/* 0x70-0x77 */
        0x7952,0x7954,0x7955,0x7958,0x7959,0x7961,0x7963,0x0000,/* 0x78-0x7F */

        0x7964,0x7966,0x7969,0x796A,0x796B,0x796C,0x796E,0x7970,/* 0x80-0x87 */
        0x7971,0x7972,0x7973,0x7974,0x7975,0x7976,0x7979,0x797B,/* 0x88-0x8F */
        0x797C,0x797D,0x797E,0x797F,0x7982,0x7983,0x7986,0x7987,/* 0x90-0x97 */
        0x7988,0x7989,0x798B,0x798C,0x798D,0x798E,0x7990,0x7991,/* 0x98-0x9F */
        0x7992,0x6020,0x803D,0x62C5,0x4E39,0x5355,0x90F8,0x63B8,/* 0xA0-0xA7 */
        0x80C6,0x65E6,0x6C2E,0x4F46,0x60EE,0x6DE1,0x8BDE,0x5F39,/* 0xA8-0xAF */
        0x86CB,0x5F53,0x6321,0x515A,0x8361,0x6863,0x5200,0x6363,/* 0xB0-0xB7 */
        0x8E48,0x5012,0x5C9B,0x7977,0x5BFC,0x5230,0x7A3B,0x60BC,/* 0xB8-0xBF */
        0x9053,0x76D7,0x5FB7,0x5F97,0x7684,0x8E6C,0x706F,0x767B,/* 0xC0-0xC7 */
        0x7B49,0x77AA,0x51F3,0x9093,0x5824,0x4F4E,0x6EF4,0x8FEA,/* 0xC8-0xCF */
        0x654C,0x7B1B,0x72C4,0x6DA4,0x7FDF,0x5AE1,0x62B5,0x5E95,/* 0xD0-0xD7 */
        0x5730,0x8482,0x7B2C,0x5E1D,0x5F1F,0x9012,0x7F14,0x98A0,/* 0xD8-0xDF */
        0x6382,0x6EC7,0x7898,0x70B9,0x5178,0x975B,0x57AB,0x7535,/* 0xE0-0xE7 */
        0x4F43,0x7538,0x5E97,0x60E6,0x5960,0x6DC0,0x6BBF,0x7889,/* 0xE8-0xEF */
        0x53FC,0x96D5,0x51CB,0x5201,0x6389,0x540A,0x9493,0x8C03,/* 0xF0-0xF7 */
        0x8DCC,0x7239,0x789F,0x8776,0x8FED,0x8C0D,0x53E0,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_B6[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7993,0x7994,0x7995,0x7996,0x7997,0x7998,0x7999,0x799B,/* 0x40-0x47 */
        0x799C,0x799D,0x799E,0x799F,0x79A0,0x79A1,0x79A2,0x79A3,/* 0x48-0x4F */
        0x79A4,0x79A5,0x79A6,0x79A8,0x79A9,0x79AA,0x79AB,0x79AC,/* 0x50-0x57 */
        0x79AD,0x79AE,0x79AF,0x79B0,0x79B1,0x79B2,0x79B4,0x79B5,/* 0x58-0x5F */
        0x79B6,0x79B7,0x79B8,0x79BC,0x79BF,0x79C2,0x79C4,0x79C5,/* 0x60-0x67 */
        0x79C7,0x79C8,0x79CA,0x79CC,0x79CE,0x79CF,0x79D0,0x79D3,/* 0x68-0x6F */
        0x79D4,0x79D6,0x79D7,0x79D9,0x79DA,0x79DB,0x79DC,0x79DD,/* 0x70-0x77 */
        0x79DE,0x79E0,0x79E1,0x79E2,0x79E5,0x79E8,0x79EA,0x0000,/* 0x78-0x7F */

        0x79EC,0x79EE,0x79F1,0x79F2,0x79F3,0x79F4,0x79F5,0x79F6,/* 0x80-0x87 */
        0x79F7,0x79F9,0x79FA,0x79FC,0x79FE,0x79FF,0x7A01,0x7A04,/* 0x88-0x8F */
        0x7A05,0x7A07,0x7A08,0x7A09,0x7A0A,0x7A0C,0x7A0F,0x7A10,/* 0x90-0x97 */
        0x7A11,0x7A12,0x7A13,0x7A15,0x7A16,0x7A18,0x7A19,0x7A1B,/* 0x98-0x9F */
        0x7A1C,0x4E01,0x76EF,0x53EE,0x9489,0x9876,0x9F0E,0x952D,/* 0xA0-0xA7 */
        0x5B9A,0x8BA2,0x4E22,0x4E1C,0x51AC,0x8463,0x61C2,0x52A8,/* 0xA8-0xAF */
        0x680B,0x4F97,0x606B,0x51BB,0x6D1E,0x515C,0x6296,0x6597,/* 0xB0-0xB7 */
        0x9661,0x8C46,0x9017,0x75D8,0x90FD,0x7763,0x6BD2,0x728A,/* 0xB8-0xBF */
        0x72EC,0x8BFB,0x5835,0x7779,0x8D4C,0x675C,0x9540,0x809A,/* 0xC0-0xC7 */
        0x5EA6,0x6E21,0x5992,0x7AEF,0x77ED,0x953B,0x6BB5,0x65AD,/* 0xC8-0xCF */
        0x7F0E,0x5806,0x5151,0x961F,0x5BF9,0x58A9,0x5428,0x8E72,/* 0xD0-0xD7 */
        0x6566,0x987F,0x56E4,0x949D,0x76FE,0x9041,0x6387,0x54C6,/* 0xD8-0xDF */
        0x591A,0x593A,0x579B,0x8EB2,0x6735,0x8DFA,0x8235,0x5241,/* 0xE0-0xE7 */
        0x60F0,0x5815,0x86FE,0x5CE8,0x9E45,0x4FC4,0x989D,0x8BB9,/* 0xE8-0xEF */
        0x5A25,0x6076,0x5384,0x627C,0x904F,0x9102,0x997F,0x6069,/* 0xF0-0xF7 */
        0x800C,0x513F,0x8033,0x5C14,0x9975,0x6D31,0x4E8C,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_B7[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7A1D,0x7A1F,0x7A21,0x7A22,0x7A24,0x7A25,0x7A26,0x7A27,/* 0x40-0x47 */
        0x7A28,0x7A29,0x7A2A,0x7A2B,0x7A2C,0x7A2D,0x7A2E,0x7A2F,/* 0x48-0x4F */
        0x7A30,0x7A31,0x7A32,0x7A34,0x7A35,0x7A36,0x7A38,0x7A3A,/* 0x50-0x57 */
        0x7A3E,0x7A40,0x7A41,0x7A42,0x7A43,0x7A44,0x7A45,0x7A47,/* 0x58-0x5F */
        0x7A48,0x7A49,0x7A4A,0x7A4B,0x7A4C,0x7A4D,0x7A4E,0x7A4F,/* 0x60-0x67 */
        0x7A50,0x7A52,0x7A53,0x7A54,0x7A55,0x7A56,0x7A58,0x7A59,/* 0x68-0x6F */
        0x7A5A,0x7A5B,0x7A5C,0x7A5D,0x7A5E,0x7A5F,0x7A60,0x7A61,/* 0x70-0x77 */
        0x7A62,0x7A63,0x7A64,0x7A65,0x7A66,0x7A67,0x7A68,0x0000,/* 0x78-0x7F */

        0x7A69,0x7A6A,0x7A6B,0x7A6C,0x7A6D,0x7A6E,0x7A6F,0x7A71,/* 0x80-0x87 */
        0x7A72,0x7A73,0x7A75,0x7A7B,0x7A7C,0x7A7D,0x7A7E,0x7A82,/* 0x88-0x8F */
        0x7A85,0x7A87,0x7A89,0x7A8A,0x7A8B,0x7A8C,0x7A8E,0x7A8F,/* 0x90-0x97 */
        0x7A90,0x7A93,0x7A94,0x7A99,0x7A9A,0x7A9B,0x7A9E,0x7AA1,/* 0x98-0x9F */
        0x7AA2,0x8D30,0x53D1,0x7F5A,0x7B4F,0x4F10,0x4E4F,0x9600,/* 0xA0-0xA7 */
        0x6CD5,0x73D0,0x85E9,0x5E06,0x756A,0x7FFB,0x6A0A,0x77FE,/* 0xA8-0xAF */
        0x9492,0x7E41,0x51E1,0x70E6,0x53CD,0x8FD4,0x8303,0x8D29,/* 0xB0-0xB7 */
        0x72AF,0x996D,0x6CDB,0x574A,0x82B3,0x65B9,0x80AA,0x623F,/* 0xB8-0xBF */
        0x9632,0x59A8,0x4EFF,0x8BBF,0x7EBA,0x653E,0x83F2,0x975E,/* 0xC0-0xC7 */
        0x5561,0x98DE,0x80A5,0x532A,0x8BFD,0x5420,0x80BA,0x5E9F,/* 0xC8-0xCF */
        0x6CB8,0x8D39,0x82AC,0x915A,0x5429,0x6C1B,0x5206,0x7EB7,/* 0xD0-0xD7 */
        0x575F,0x711A,0x6C7E,0x7C89,0x594B,0x4EFD,0x5FFF,0x6124,/* 0xD8-0xDF */
        0x7CAA,0x4E30,0x5C01,0x67AB,0x8702,0x5CF0,0x950B,0x98CE,/* 0xE0-0xE7 */
        0x75AF,0x70FD,0x9022,0x51AF,0x7F1D,0x8BBD,0x5949,0x51E4,/* 0xE8-0xEF */
        0x4F5B,0x5426,0x592B,0x6577,0x80A4,0x5B75,0x6276,0x62C2,/* 0xF0-0xF7 */
        0x8F90,0x5E45,0x6C1F,0x7B26,0x4F0F,0x4FD8,0x670D,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_B8[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7AA3,0x7AA4,0x7AA7,0x7AA9,0x7AAA,0x7AAB,0x7AAE,0x7AAF,/* 0x40-0x47 */
        0x7AB0,0x7AB1,0x7AB2,0x7AB4,0x7AB5,0x7AB6,0x7AB7,0x7AB8,/* 0x48-0x4F */
        0x7AB9,0x7ABA,0x7ABB,0x7ABC,0x7ABD,0x7ABE,0x7AC0,0x7AC1,/* 0x50-0x57 */
        0x7AC2,0x7AC3,0x7AC4,0x7AC5,0x7AC6,0x7AC7,0x7AC8,0x7AC9,/* 0x58-0x5F */
        0x7ACA,0x7ACC,0x7ACD,0x7ACE,0x7ACF,0x7AD0,0x7AD1,0x7AD2,/* 0x60-0x67 */
        0x7AD3,0x7AD4,0x7AD5,0x7AD7,0x7AD8,0x7ADA,0x7ADB,0x7ADC,/* 0x68-0x6F */
        0x7ADD,0x7AE1,0x7AE2,0x7AE4,0x7AE7,0x7AE8,0x7AE9,0x7AEA,/* 0x70-0x77 */
        0x7AEB,0x7AEC,0x7AEE,0x7AF0,0x7AF1,0x7AF2,0x7AF3,0x0000,/* 0x78-0x7F */

        0x7AF4,0x7AF5,0x7AF6,0x7AF7,0x7AF8,0x7AFB,0x7AFC,0x7AFE,/* 0x80-0x87 */
        0x7B00,0x7B01,0x7B02,0x7B05,0x7B07,0x7B09,0x7B0C,0x7B0D,/* 0x88-0x8F */
        0x7B0E,0x7B10,0x7B12,0x7B13,0x7B16,0x7B17,0x7B18,0x7B1A,/* 0x90-0x97 */
        0x7B1C,0x7B1D,0x7B1F,0x7B21,0x7B22,0x7B23,0x7B27,0x7B29,/* 0x98-0x9F */
        0x7B2D,0x6D6E,0x6DAA,0x798F,0x88B1,0x5F17,0x752B,0x629A,/* 0xA0-0xA7 */
        0x8F85,0x4FEF,0x91DC,0x65A7,0x812F,0x8151,0x5E9C,0x8150,/* 0xA8-0xAF */
        0x8D74,0x526F,0x8986,0x8D4B,0x590D,0x5085,0x4ED8,0x961C,/* 0xB0-0xB7 */
        0x7236,0x8179,0x8D1F,0x5BCC,0x8BA3,0x9644,0x5987,0x7F1A,/* 0xB8-0xBF */
        0x5490,0x5676,0x560E,0x8BE5,0x6539,0x6982,0x9499,0x76D6,/* 0xC0-0xC7 */
        0x6E89,0x5E72,0x7518,0x6746,0x67D1,0x7AFF,0x809D,0x8D76,/* 0xC8-0xCF */
        0x611F,0x79C6,0x6562,0x8D63,0x5188,0x521A,0x94A2,0x7F38,/* 0xD0-0xD7 */
        0x809B,0x7EB2,0x5C97,0x6E2F,0x6760,0x7BD9,0x768B,0x9AD8,/* 0xD8-0xDF */
        0x818F,0x7F94,0x7CD5,0x641E,0x9550,0x7A3F,0x544A,0x54E5,/* 0xE0-0xE7 */
        0x6B4C,0x6401,0x6208,0x9E3D,0x80F3,0x7599,0x5272,0x9769,/* 0xE8-0xEF */
        0x845B,0x683C,0x86E4,0x9601,0x9694,0x94EC,0x4E2A,0x5404,/* 0xF0-0xF7 */
        0x7ED9,0x6839,0x8DDF,0x8015,0x66F4,0x5E9A,0x7FB9,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_B9[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7B2F,0x7B30,0x7B32,0x7B34,0x7B35,0x7B36,0x7B37,0x7B39,/* 0x40-0x47 */
        0x7B3B,0x7B3D,0x7B3F,0x7B40,0x7B41,0x7B42,0x7B43,0x7B44,/* 0x48-0x4F */
        0x7B46,0x7B48,0x7B4A,0x7B4D,0x7B4E,0x7B53,0x7B55,0x7B57,/* 0x50-0x57 */
        0x7B59,0x7B5C,0x7B5E,0x7B5F,0x7B61,0x7B63,0x7B64,0x7B65,/* 0x58-0x5F */
        0x7B66,0x7B67,0x7B68,0x7B69,0x7B6A,0x7B6B,0x7B6C,0x7B6D,/* 0x60-0x67 */
        0x7B6F,0x7B70,0x7B73,0x7B74,0x7B76,0x7B78,0x7B7A,0x7B7C,/* 0x68-0x6F */
        0x7B7D,0x7B7F,0x7B81,0x7B82,0x7B83,0x7B84,0x7B86,0x7B87,/* 0x70-0x77 */
        0x7B88,0x7B89,0x7B8A,0x7B8B,0x7B8C,0x7B8E,0x7B8F,0x0000,/* 0x78-0x7F */

        0x7B91,0x7B92,0x7B93,0x7B96,0x7B98,0x7B99,0x7B9A,0x7B9B,/* 0x80-0x87 */
        0x7B9E,0x7B9F,0x7BA0,0x7BA3,0x7BA4,0x7BA5,0x7BAE,0x7BAF,/* 0x88-0x8F */
        0x7BB0,0x7BB2,0x7BB3,0x7BB5,0x7BB6,0x7BB7,0x7BB9,0x7BBA,/* 0x90-0x97 */
        0x7BBB,0x7BBC,0x7BBD,0x7BBE,0x7BBF,0x7BC0,0x7BC2,0x7BC3,/* 0x98-0x9F */
        0x7BC4,0x57C2,0x803F,0x6897,0x5DE5,0x653B,0x529F,0x606D,/* 0xA0-0xA7 */
        0x9F9A,0x4F9B,0x8EAC,0x516C,0x5BAB,0x5F13,0x5DE9,0x6C5E,/* 0xA8-0xAF */
        0x62F1,0x8D21,0x5171,0x94A9,0x52FE,0x6C9F,0x82DF,0x72D7,/* 0xB0-0xB7 */
        0x57A2,0x6784,0x8D2D,0x591F,0x8F9C,0x83C7,0x5495,0x7B8D,/* 0xB8-0xBF */
        0x4F30,0x6CBD,0x5B64,0x59D1,0x9F13,0x53E4,0x86CA,0x9AA8,/* 0xC0-0xC7 */
        0x8C37,0x80A1,0x6545,0x987E,0x56FA,0x96C7,0x522E,0x74DC,/* 0xC8-0xCF */
        0x5250,0x5BE1,0x6302,0x8902,0x4E56,0x62D0,0x602A,0x68FA,/* 0xD0-0xD7 */
        0x5173,0x5B98,0x51A0,0x89C2,0x7BA1,0x9986,0x7F50,0x60EF,/* 0xD8-0xDF */
        0x704C,0x8D2F,0x5149,0x5E7F,0x901B,0x7470,0x89C4,0x572D,/* 0xE0-0xE7 */
        0x7845,0x5F52,0x9F9F,0x95FA,0x8F68,0x9B3C,0x8BE1,0x7678,/* 0xE8-0xEF */
        0x6842,0x67DC,0x8DEA,0x8D35,0x523D,0x8F8A,0x6EDA,0x68CD,/* 0xF0-0xF7 */
        0x9505,0x90ED,0x56FD,0x679C,0x88F9,0x8FC7,0x54C8,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_BA[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7BC5,0x7BC8,0x7BC9,0x7BCA,0x7BCB,0x7BCD,0x7BCE,0x7BCF,/* 0x40-0x47 */
        0x7BD0,0x7BD2,0x7BD4,0x7BD5,0x7BD6,0x7BD7,0x7BD8,0x7BDB,/* 0x48-0x4F */
        0x7BDC,0x7BDE,0x7BDF,0x7BE0,0x7BE2,0x7BE3,0x7BE4,0x7BE7,/* 0x50-0x57 */
        0x7BE8,0x7BE9,0x7BEB,0x7BEC,0x7BED,0x7BEF,0x7BF0,0x7BF2,/* 0x58-0x5F */
        0x7BF3,0x7BF4,0x7BF5,0x7BF6,0x7BF8,0x7BF9,0x7BFA,0x7BFB,/* 0x60-0x67 */
        0x7BFD,0x7BFF,0x7C00,0x7C01,0x7C02,0x7C03,0x7C04,0x7C05,/* 0x68-0x6F */
        0x7C06,0x7C08,0x7C09,0x7C0A,0x7C0D,0x7C0E,0x7C10,0x7C11,/* 0x70-0x77 */
        0x7C12,0x7C13,0x7C14,0x7C15,0x7C17,0x7C18,0x7C19,0x0000,/* 0x78-0x7F */

        0x7C1A,0x7C1B,0x7C1C,0x7C1D,0x7C1E,0x7C20,0x7C21,0x7C22,/* 0x80-0x87 */
        0x7C23,0x7C24,0x7C25,0x7C28,0x7C29,0x7C2B,0x7C2C,0x7C2D,/* 0x88-0x8F */
        0x7C2E,0x7C2F,0x7C30,0x7C31,0x7C32,0x7C33,0x7C34,0x7C35,/* 0x90-0x97 */
        0x7C36,0x7C37,0x7C39,0x7C3A,0x7C3B,0x7C3C,0x7C3D,0x7C3E,/* 0x98-0x9F */
        0x7C42,0x9AB8,0x5B69,0x6D77,0x6C26,0x4EA5,0x5BB3,0x9A87,/* 0xA0-0xA7 */
        0x9163,0x61A8,0x90AF,0x97E9,0x542B,0x6DB5,0x5BD2,0x51FD,/* 0xA8-0xAF */
        0x558A,0x7F55,0x7FF0,0x64BC,0x634D,0x65F1,0x61BE,0x608D,/* 0xB0-0xB7 */
        0x710A,0x6C57,0x6C49,0x592F,0x676D,0x822A,0x58D5,0x568E,/* 0xB8-0xBF */
        0x8C6A,0x6BEB,0x90DD,0x597D,0x8017,0x53F7,0x6D69,0x5475,/* 0xC0-0xC7 */
        0x559D,0x8377,0x83CF,0x6838,0x79BE,0x548C,0x4F55,0x5408,/* 0xC8-0xCF */
        0x76D2,0x8C89,0x9602,0x6CB3,0x6DB8,0x8D6B,0x8910,0x9E64,/* 0xD0-0xD7 */
        0x8D3A,0x563F,0x9ED1,0x75D5,0x5F88,0x72E0,0x6068,0x54FC,/* 0xD8-0xDF */
        0x4EA8,0x6A2A,0x8861,0x6052,0x8F70,0x54C4,0x70D8,0x8679,/* 0xE0-0xE7 */
        0x9E3F,0x6D2A,0x5B8F,0x5F18,0x7EA2,0x5589,0x4FAF,0x7334,/* 0xE8-0xEF */
        0x543C,0x539A,0x5019,0x540E,0x547C,0x4E4E,0x5FFD,0x745A,/* 0xF0-0xF7 */
        0x58F6,0x846B,0x80E1,0x8774,0x72D0,0x7CCA,0x6E56,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_BB[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7C43,0x7C44,0x7C45,0x7C46,0x7C47,0x7C48,0x7C49,0x7C4A,/* 0x40-0x47 */
        0x7C4B,0x7C4C,0x7C4E,0x7C4F,0x7C50,0x7C51,0x7C52,0x7C53,/* 0x48-0x4F */
        0x7C54,0x7C55,0x7C56,0x7C57,0x7C58,0x7C59,0x7C5A,0x7C5B,/* 0x50-0x57 */
        0x7C5C,0x7C5D,0x7C5E,0x7C5F,0x7C60,0x7C61,0x7C62,0x7C63,/* 0x58-0x5F */
        0x7C64,0x7C65,0x7C66,0x7C67,0x7C68,0x7C69,0x7C6A,0x7C6B,/* 0x60-0x67 */
        0x7C6C,0x7C6D,0x7C6E,0x7C6F,0x7C70,0x7C71,0x7C72,0x7C75,/* 0x68-0x6F */
        0x7C76,0x7C77,0x7C78,0x7C79,0x7C7A,0x7C7E,0x7C7F,0x7C80,/* 0x70-0x77 */
        0x7C81,0x7C82,0x7C83,0x7C84,0x7C85,0x7C86,0x7C87,0x0000,/* 0x78-0x7F */

        0x7C88,0x7C8A,0x7C8B,0x7C8C,0x7C8D,0x7C8E,0x7C8F,0x7C90,/* 0x80-0x87 */
        0x7C93,0x7C94,0x7C96,0x7C99,0x7C9A,0x7C9B,0x7CA0,0x7CA1,/* 0x88-0x8F */
        0x7CA3,0x7CA6,0x7CA7,0x7CA8,0x7CA9,0x7CAB,0x7CAC,0x7CAD,/* 0x90-0x97 */
        0x7CAF,0x7CB0,0x7CB4,0x7CB5,0x7CB6,0x7CB7,0x7CB8,0x7CBA,/* 0x98-0x9F */
        0x7CBB,0x5F27,0x864E,0x552C,0x62A4,0x4E92,0x6CAA,0x6237,/* 0xA0-0xA7 */
        0x82B1,0x54D7,0x534E,0x733E,0x6ED1,0x753B,0x5212,0x5316,/* 0xA8-0xAF */
        0x8BDD,0x69D0,0x5F8A,0x6000,0x6DEE,0x574F,0x6B22,0x73AF,/* 0xB0-0xB7 */
        0x6853,0x8FD8,0x7F13,0x6362,0x60A3,0x5524,0x75EA,0x8C62,/* 0xB8-0xBF */
        0x7115,0x6DA3,0x5BA6,0x5E7B,0x8352,0x614C,0x9EC4,0x78FA,/* 0xC0-0xC7 */
        0x8757,0x7C27,0x7687,0x51F0,0x60F6,0x714C,0x6643,0x5E4C,/* 0xC8-0xCF */
        0x604D,0x8C0E,0x7070,0x6325,0x8F89,0x5FBD,0x6062,0x86D4,/* 0xD0-0xD7 */
        0x56DE,0x6BC1,0x6094,0x6167,0x5349,0x60E0,0x6666,0x8D3F,/* 0xD8-0xDF */
        0x79FD,0x4F1A,0x70E9,0x6C47,0x8BB3,0x8BF2,0x7ED8,0x8364,/* 0xE0-0xE7 */
        0x660F,0x5A5A,0x9B42,0x6D51,0x6DF7,0x8C41,0x6D3B,0x4F19,/* 0xE8-0xEF */
        0x706B,0x83B7,0x6216,0x60D1,0x970D,0x8D27,0x7978,0x51FB,/* 0xF0-0xF7 */
        0x573E,0x57FA,0x673A,0x7578,0x7A3D,0x79EF,0x7B95,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_BC[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7CBF,0x7CC0,0x7CC2,0x7CC3,0x7CC4,0x7CC6,0x7CC9,0x7CCB,/* 0x40-0x47 */
        0x7CCE,0x7CCF,0x7CD0,0x7CD1,0x7CD2,0x7CD3,0x7CD4,0x7CD8,/* 0x48-0x4F */
        0x7CDA,0x7CDB,0x7CDD,0x7CDE,0x7CE1,0x7CE2,0x7CE3,0x7CE4,/* 0x50-0x57 */
        0x7CE5,0x7CE6,0x7CE7,0x7CE9,0x7CEA,0x7CEB,0x7CEC,0x7CED,/* 0x58-0x5F */
        0x7CEE,0x7CF0,0x7CF1,0x7CF2,0x7CF3,0x7CF4,0x7CF5,0x7CF6,/* 0x60-0x67 */
        0x7CF7,0x7CF9,0x7CFA,0x7CFC,0x7CFD,0x7CFE,0x7CFF,0x7D00,/* 0x68-0x6F */
        0x7D01,0x7D02,0x7D03,0x7D04,0x7D05,0x7D06,0x7D07,0x7D08,/* 0x70-0x77 */
        0x7D09,0x7D0B,0x7D0C,0x7D0D,0x7D0E,0x7D0F,0x7D10,0x0000,/* 0x78-0x7F */

        0x7D11,0x7D12,0x7D13,0x7D14,0x7D15,0x7D16,0x7D17,0x7D18,/* 0x80-0x87 */
        0x7D19,0x7D1A,0x7D1B,0x7D1C,0x7D1D,0x7D1E,0x7D1F,0x7D21,/* 0x88-0x8F */
        0x7D23,0x7D24,0x7D25,0x7D26,0x7D28,0x7D29,0x7D2A,0x7D2C,/* 0x90-0x97 */
        0x7D2D,0x7D2E,0x7D30,0x7D31,0x7D32,0x7D33,0x7D34,0x7D35,/* 0x98-0x9F */
        0x7D36,0x808C,0x9965,0x8FF9,0x6FC0,0x8BA5,0x9E21,0x59EC,/* 0xA0-0xA7 */
        0x7EE9,0x7F09,0x5409,0x6781,0x68D8,0x8F91,0x7C4D,0x96C6,/* 0xA8-0xAF */
        0x53CA,0x6025,0x75BE,0x6C72,0x5373,0x5AC9,0x7EA7,0x6324,/* 0xB0-0xB7 */
        0x51E0,0x810A,0x5DF1,0x84DF,0x6280,0x5180,0x5B63,0x4F0E,/* 0xB8-0xBF */
        0x796D,0x5242,0x60B8,0x6D4E,0x5BC4,0x5BC2,0x8BA1,0x8BB0,/* 0xC0-0xC7 */
        0x65E2,0x5FCC,0x9645,0x5993,0x7EE7,0x7EAA,0x5609,0x67B7,/* 0xC8-0xCF */
        0x5939,0x4F73,0x5BB6,0x52A0,0x835A,0x988A,0x8D3E,0x7532,/* 0xD0-0xD7 */
        0x94BE,0x5047,0x7A3C,0x4EF7,0x67B6,0x9A7E,0x5AC1,0x6B7C,/* 0xD8-0xDF */
        0x76D1,0x575A,0x5C16,0x7B3A,0x95F4,0x714E,0x517C,0x80A9,/* 0xE0-0xE7 */
        0x8270,0x5978,0x7F04,0x8327,0x68C0,0x67EC,0x78B1,0x7877,/* 0xE8-0xEF */
        0x62E3,0x6361,0x7B80,0x4FED,0x526A,0x51CF,0x8350,0x69DB,/* 0xF0-0xF7 */
        0x9274,0x8DF5,0x8D31,0x89C1,0x952E,0x7BAD,0x4EF6,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_BD[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7D37,0x7D38,0x7D39,0x7D3A,0x7D3B,0x7D3C,0x7D3D,0x7D3E,/* 0x40-0x47 */
        0x7D3F,0x7D40,0x7D41,0x7D42,0x7D43,0x7D44,0x7D45,0x7D46,/* 0x48-0x4F */
        0x7D47,0x7D48,0x7D49,0x7D4A,0x7D4B,0x7D4C,0x7D4D,0x7D4E,/* 0x50-0x57 */
        0x7D4F,0x7D50,0x7D51,0x7D52,0x7D53,0x7D54,0x7D55,0x7D56,/* 0x58-0x5F */
        0x7D57,0x7D58,0x7D59,0x7D5A,0x7D5B,0x7D5C,0x7D5D,0x7D5E,/* 0x60-0x67 */
        0x7D5F,0x7D60,0x7D61,0x7D62,0x7D63,0x7D64,0x7D65,0x7D66,/* 0x68-0x6F */
        0x7D67,0x7D68,0x7D69,0x7D6A,0x7D6B,0x7D6C,0x7D6D,0x7D6F,/* 0x70-0x77 */
        0x7D70,0x7D71,0x7D72,0x7D73,0x7D74,0x7D75,0x7D76,0x0000,/* 0x78-0x7F */

        0x7D78,0x7D79,0x7D7A,0x7D7B,0x7D7C,0x7D7D,0x7D7E,0x7D7F,/* 0x80-0x87 */
        0x7D80,0x7D81,0x7D82,0x7D83,0x7D84,0x7D85,0x7D86,0x7D87,/* 0x88-0x8F */
        0x7D88,0x7D89,0x7D8A,0x7D8B,0x7D8C,0x7D8D,0x7D8E,0x7D8F,/* 0x90-0x97 */
        0x7D90,0x7D91,0x7D92,0x7D93,0x7D94,0x7D95,0x7D96,0x7D97,/* 0x98-0x9F */
        0x7D98,0x5065,0x8230,0x5251,0x996F,0x6E10,0x6E85,0x6DA7,/* 0xA0-0xA7 */
        0x5EFA,0x50F5,0x59DC,0x5C06,0x6D46,0x6C5F,0x7586,0x848B,/* 0xA8-0xAF */
        0x6868,0x5956,0x8BB2,0x5320,0x9171,0x964D,0x8549,0x6912,/* 0xB0-0xB7 */
        0x7901,0x7126,0x80F6,0x4EA4,0x90CA,0x6D47,0x9A84,0x5A07,/* 0xB8-0xBF */
        0x56BC,0x6405,0x94F0,0x77EB,0x4FA5,0x811A,0x72E1,0x89D2,/* 0xC0-0xC7 */
        0x997A,0x7F34,0x7EDE,0x527F,0x6559,0x9175,0x8F7F,0x8F83,/* 0xC8-0xCF */
        0x53EB,0x7A96,0x63ED,0x63A5,0x7686,0x79F8,0x8857,0x9636,/* 0xD0-0xD7 */
        0x622A,0x52AB,0x8282,0x6854,0x6770,0x6377,0x776B,0x7AED,/* 0xD8-0xDF */
        0x6D01,0x7ED3,0x89E3,0x59D0,0x6212,0x85C9,0x82A5,0x754C,/* 0xE0-0xE7 */
        0x501F,0x4ECB,0x75A5,0x8BEB,0x5C4A,0x5DFE,0x7B4B,0x65A4,/* 0xE8-0xEF */
        0x91D1,0x4ECA,0x6D25,0x895F,0x7D27,0x9526,0x4EC5,0x8C28,/* 0xF0-0xF7 */
        0x8FDB,0x9773,0x664B,0x7981,0x8FD1,0x70EC,0x6D78,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_BE[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7D99,0x7D9A,0x7D9B,0x7D9C,0x7D9D,0x7D9E,0x7D9F,0x7DA0,/* 0x40-0x47 */
        0x7DA1,0x7DA2,0x7DA3,0x7DA4,0x7DA5,0x7DA7,0x7DA8,0x7DA9,/* 0x48-0x4F */
        0x7DAA,0x7DAB,0x7DAC,0x7DAD,0x7DAF,0x7DB0,0x7DB1,0x7DB2,/* 0x50-0x57 */
        0x7DB3,0x7DB4,0x7DB5,0x7DB6,0x7DB7,0x7DB8,0x7DB9,0x7DBA,/* 0x58-0x5F */
        0x7DBB,0x7DBC,0x7DBD,0x7DBE,0x7DBF,0x7DC0,0x7DC1,0x7DC2,/* 0x60-0x67 */
        0x7DC3,0x7DC4,0x7DC5,0x7DC6,0x7DC7,0x7DC8,0x7DC9,0x7DCA,/* 0x68-0x6F */
        0x7DCB,0x7DCC,0x7DCD,0x7DCE,0x7DCF,0x7DD0,0x7DD1,0x7DD2,/* 0x70-0x77 */
        0x7DD3,0x7DD4,0x7DD5,0x7DD6,0x7DD7,0x7DD8,0x7DD9,0x0000,/* 0x78-0x7F */

        0x7DDA,0x7DDB,0x7DDC,0x7DDD,0x7DDE,0x7DDF,0x7DE0,0x7DE1,/* 0x80-0x87 */
        0x7DE2,0x7DE3,0x7DE4,0x7DE5,0x7DE6,0x7DE7,0x7DE8,0x7DE9,/* 0x88-0x8F */
        0x7DEA,0x7DEB,0x7DEC,0x7DED,0x7DEE,0x7DEF,0x7DF0,0x7DF1,/* 0x90-0x97 */
        0x7DF2,0x7DF3,0x7DF4,0x7DF5,0x7DF6,0x7DF7,0x7DF8,0x7DF9,/* 0x98-0x9F */
        0x7DFA,0x5C3D,0x52B2,0x8346,0x5162,0x830E,0x775B,0x6676,/* 0xA0-0xA7 */
        0x9CB8,0x4EAC,0x60CA,0x7CBE,0x7CB3,0x7ECF,0x4E95,0x8B66,/* 0xA8-0xAF */
        0x666F,0x9888,0x9759,0x5883,0x656C,0x955C,0x5F84,0x75C9,/* 0xB0-0xB7 */
        0x9756,0x7ADF,0x7ADE,0x51C0,0x70AF,0x7A98,0x63EA,0x7A76,/* 0xB8-0xBF */
        0x7EA0,0x7396,0x97ED,0x4E45,0x7078,0x4E5D,0x9152,0x53A9,/* 0xC0-0xC7 */
        0x6551,0x65E7,0x81FC,0x8205,0x548E,0x5C31,0x759A,0x97A0,/* 0xC8-0xCF */
        0x62D8,0x72D9,0x75BD,0x5C45,0x9A79,0x83CA,0x5C40,0x5480,/* 0xD0-0xD7 */
        0x77E9,0x4E3E,0x6CAE,0x805A,0x62D2,0x636E,0x5DE8,0x5177,/* 0xD8-0xDF */
        0x8DDD,0x8E1E,0x952F,0x4FF1,0x53E5,0x60E7,0x70AC,0x5267,/* 0xE0-0xE7 */
        0x6350,0x9E43,0x5A1F,0x5026,0x7737,0x5377,0x7EE2,0x6485,/* 0xE8-0xEF */
        0x652B,0x6289,0x6398,0x5014,0x7235,0x89C9,0x51B3,0x8BC0,/* 0xF0-0xF7 */
        0x7EDD,0x5747,0x83CC,0x94A7,0x519B,0x541B,0x5CFB,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_BF[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7DFB,0x7DFC,0x7DFD,0x7DFE,0x7DFF,0x7E00,0x7E01,0x7E02,/* 0x40-0x47 */
        0x7E03,0x7E04,0x7E05,0x7E06,0x7E07,0x7E08,0x7E09,0x7E0A,/* 0x48-0x4F */
        0x7E0B,0x7E0C,0x7E0D,0x7E0E,0x7E0F,0x7E10,0x7E11,0x7E12,/* 0x50-0x57 */
        0x7E13,0x7E14,0x7E15,0x7E16,0x7E17,0x7E18,0x7E19,0x7E1A,/* 0x58-0x5F */
        0x7E1B,0x7E1C,0x7E1D,0x7E1E,0x7E1F,0x7E20,0x7E21,0x7E22,/* 0x60-0x67 */
        0x7E23,0x7E24,0x7E25,0x7E26,0x7E27,0x7E28,0x7E29,0x7E2A,/* 0x68-0x6F */
        0x7E2B,0x7E2C,0x7E2D,0x7E2E,0x7E2F,0x7E30,0x7E31,0x7E32,/* 0x70-0x77 */
        0x7E33,0x7E34,0x7E35,0x7E36,0x7E37,0x7E38,0x7E39,0x0000,/* 0x78-0x7F */

        0x7E3A,0x7E3C,0x7E3D,0x7E3E,0x7E3F,0x7E40,0x7E42,0x7E43,/* 0x80-0x87 */
        0x7E44,0x7E45,0x7E46,0x7E48,0x7E49,0x7E4A,0x7E4B,0x7E4C,/* 0x88-0x8F */
        0x7E4D,0x7E4E,0x7E4F,0x7E50,0x7E51,0x7E52,0x7E53,0x7E54,/* 0x90-0x97 */
        0x7E55,0x7E56,0x7E57,0x7E58,0x7E59,0x7E5A,0x7E5B,0x7E5C,/* 0x98-0x9F */
        0x7E5D,0x4FCA,0x7AE3,0x6D5A,0x90E1,0x9A8F,0x5580,0x5496,/* 0xA0-0xA7 */
        0x5361,0x54AF,0x5F00,0x63E9,0x6977,0x51EF,0x6168,0x520A,/* 0xA8-0xAF */
        0x582A,0x52D8,0x574E,0x780D,0x770B,0x5EB7,0x6177,0x7CE0,/* 0xB0-0xB7 */
        0x625B,0x6297,0x4EA2,0x7095,0x8003,0x62F7,0x70E4,0x9760,/* 0xB8-0xBF */
        0x5777,0x82DB,0x67EF,0x68F5,0x78D5,0x9897,0x79D1,0x58F3,/* 0xC0-0xC7 */
        0x54B3,0x53EF,0x6E34,0x514B,0x523B,0x5BA2,0x8BFE,0x80AF,/* 0xC8-0xCF */
        0x5543,0x57A6,0x6073,0x5751,0x542D,0x7A7A,0x6050,0x5B54,/* 0xD0-0xD7 */
        0x63A7,0x62A0,0x53E3,0x6263,0x5BC7,0x67AF,0x54ED,0x7A9F,/* 0xD8-0xDF */
        0x82E6,0x9177,0x5E93,0x88E4,0x5938,0x57AE,0x630E,0x8DE8,/* 0xE0-0xE7 */
        0x80EF,0x5757,0x7B77,0x4FA9,0x5FEB,0x5BBD,0x6B3E,0x5321,/* 0xE8-0xEF */
        0x7B50,0x72C2,0x6846,0x77FF,0x7736,0x65F7,0x51B5,0x4E8F,/* 0xF0-0xF7 */
        0x76D4,0x5CBF,0x7AA5,0x8475,0x594E,0x9B41,0x5080,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_C0[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7E5E,0x7E5F,0x7E60,0x7E61,0x7E62,0x7E63,0x7E64,0x7E65,/* 0x40-0x47 */
        0x7E66,0x7E67,0x7E68,0x7E69,0x7E6A,0x7E6B,0x7E6C,0x7E6D,/* 0x48-0x4F */
        0x7E6E,0x7E6F,0x7E70,0x7E71,0x7E72,0x7E73,0x7E74,0x7E75,/* 0x50-0x57 */
        0x7E76,0x7E77,0x7E78,0x7E79,0x7E7A,0x7E7B,0x7E7C,0x7E7D,/* 0x58-0x5F */
        0x7E7E,0x7E7F,0x7E80,0x7E81,0x7E83,0x7E84,0x7E85,0x7E86,/* 0x60-0x67 */
        0x7E87,0x7E88,0x7E89,0x7E8A,0x7E8B,0x7E8C,0x7E8D,0x7E8E,/* 0x68-0x6F */
        0x7E8F,0x7E90,0x7E91,0x7E92,0x7E93,0x7E94,0x7E95,0x7E96,/* 0x70-0x77 */
        0x7E97,0x7E98,0x7E99,0x7E9A,0x7E9C,0x7E9D,0x7E9E,0x0000,/* 0x78-0x7F */

        0x7EAE,0x7EB4,0x7EBB,0x7EBC,0x7ED6,0x7EE4,0x7EEC,0x7EF9,/* 0x80-0x87 */
        0x7F0A,0x7F10,0x7F1E,0x7F37,0x7F39,0x7F3B,0x7F3C,0x7F3D,/* 0x88-0x8F */
        0x7F3E,0x7F3F,0x7F40,0x7F41,0x7F43,0x7F46,0x7F47,0x7F48,/* 0x90-0x97 */
        0x7F49,0x7F4A,0x7F4B,0x7F4C,0x7F4D,0x7F4E,0x7F4F,0x7F52,/* 0x98-0x9F */
        0x7F53,0x9988,0x6127,0x6E83,0x5764,0x6606,0x6346,0x56F0,/* 0xA0-0xA7 */
        0x62EC,0x6269,0x5ED3,0x9614,0x5783,0x62C9,0x5587,0x8721,/* 0xA8-0xAF */
        0x814A,0x8FA3,0x5566,0x83B1,0x6765,0x8D56,0x84DD,0x5A6A,/* 0xB0-0xB7 */
        0x680F,0x62E6,0x7BEE,0x9611,0x5170,0x6F9C,0x8C30,0x63FD,/* 0xB8-0xBF */
        0x89C8,0x61D2,0x7F06,0x70C2,0x6EE5,0x7405,0x6994,0x72FC,/* 0xC0-0xC7 */
        0x5ECA,0x90CE,0x6717,0x6D6A,0x635E,0x52B3,0x7262,0x8001,/* 0xC8-0xCF */
        0x4F6C,0x59E5,0x916A,0x70D9,0x6D9D,0x52D2,0x4E50,0x96F7,/* 0xD0-0xD7 */
        0x956D,0x857E,0x78CA,0x7D2F,0x5121,0x5792,0x64C2,0x808B,/* 0xD8-0xDF */
        0x7C7B,0x6CEA,0x68F1,0x695E,0x51B7,0x5398,0x68A8,0x7281,/* 0xE0-0xE7 */
        0x9ECE,0x7BF1,0x72F8,0x79BB,0x6F13,0x7406,0x674E,0x91CC,/* 0xE8-0xEF */
        0x9CA4,0x793C,0x8389,0x8354,0x540F,0x6817,0x4E3D,0x5389,/* 0xF0-0xF7 */
        0x52B1,0x783E,0x5386,0x5229,0x5088,0x4F8B,0x4FD0,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_C1[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7F56,0x7F59,0x7F5B,0x7F5C,0x7F5D,0x7F5E,0x7F60,0x7F63,/* 0x40-0x47 */
        0x7F64,0x7F65,0x7F66,0x7F67,0x7F6B,0x7F6C,0x7F6D,0x7F6F,/* 0x48-0x4F */
        0x7F70,0x7F73,0x7F75,0x7F76,0x7F77,0x7F78,0x7F7A,0x7F7B,/* 0x50-0x57 */
        0x7F7C,0x7F7D,0x7F7F,0x7F80,0x7F82,0x7F83,0x7F84,0x7F85,/* 0x58-0x5F */
        0x7F86,0x7F87,0x7F88,0x7F89,0x7F8B,0x7F8D,0x7F8F,0x7F90,/* 0x60-0x67 */
        0x7F91,0x7F92,0x7F93,0x7F95,0x7F96,0x7F97,0x7F98,0x7F99,/* 0x68-0x6F */
        0x7F9B,0x7F9C,0x7FA0,0x7FA2,0x7FA3,0x7FA5,0x7FA6,0x7FA8,/* 0x70-0x77 */
        0x7FA9,0x7FAA,0x7FAB,0x7FAC,0x7FAD,0x7FAE,0x7FB1,0x0000,/* 0x78-0x7F */

        0x7FB3,0x7FB4,0x7FB5,0x7FB6,0x7FB7,0x7FBA,0x7FBB,0x7FBE,/* 0x80-0x87 */
        0x7FC0,0x7FC2,0x7FC3,0x7FC4,0x7FC6,0x7FC7,0x7FC8,0x7FC9,/* 0x88-0x8F */
        0x7FCB,0x7FCD,0x7FCF,0x7FD0,0x7FD1,0x7FD2,0x7FD3,0x7FD6,/* 0x90-0x97 */
        0x7FD7,0x7FD9,0x7FDA,0x7FDB,0x7FDC,0x7FDD,0x7FDE,0x7FE2,/* 0x98-0x9F */
        0x7FE3,0x75E2,0x7ACB,0x7C92,0x6CA5,0x96B6,0x529B,0x7483,/* 0xA0-0xA7 */
        0x54E9,0x4FE9,0x8054,0x83B2,0x8FDE,0x9570,0x5EC9,0x601C,/* 0xA8-0xAF */
        0x6D9F,0x5E18,0x655B,0x8138,0x94FE,0x604B,0x70BC,0x7EC3,/* 0xB0-0xB7 */
        0x7CAE,0x51C9,0x6881,0x7CB1,0x826F,0x4E24,0x8F86,0x91CF,/* 0xB8-0xBF */
        0x667E,0x4EAE,0x8C05,0x64A9,0x804A,0x50DA,0x7597,0x71CE,/* 0xC0-0xC7 */
        0x5BE5,0x8FBD,0x6F66,0x4E86,0x6482,0x9563,0x5ED6,0x6599,/* 0xC8-0xCF */
        0x5217,0x88C2,0x70C8,0x52A3,0x730E,0x7433,0x6797,0x78F7,/* 0xD0-0xD7 */
        0x9716,0x4E34,0x90BB,0x9CDE,0x6DCB,0x51DB,0x8D41,0x541D,/* 0xD8-0xDF */
        0x62CE,0x73B2,0x83F1,0x96F6,0x9F84,0x94C3,0x4F36,0x7F9A,/* 0xE0-0xE7 */
        0x51CC,0x7075,0x9675,0x5CAD,0x9886,0x53E6,0x4EE4,0x6E9C,/* 0xE8-0xEF */
        0x7409,0x69B4,0x786B,0x998F,0x7559,0x5218,0x7624,0x6D41,/* 0xF0-0xF7 */
        0x67F3,0x516D,0x9F99,0x804B,0x5499,0x7B3C,0x7ABF,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_C2[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x7FE4,0x7FE7,0x7FE8,0x7FEA,0x7FEB,0x7FEC,0x7FED,0x7FEF,/* 0x40-0x47 */
        0x7FF2,0x7FF4,0x7FF5,0x7FF6,0x7FF7,0x7FF8,0x7FF9,0x7FFA,/* 0x48-0x4F */
        0x7FFD,0x7FFE,0x7FFF,0x8002,0x8007,0x8008,0x8009,0x800A,/* 0x50-0x57 */
        0x800E,0x800F,0x8011,0x8013,0x801A,0x801B,0x801D,0x801E,/* 0x58-0x5F */
        0x801F,0x8021,0x8023,0x8024,0x802B,0x802C,0x802D,0x802E,/* 0x60-0x67 */
        0x802F,0x8030,0x8032,0x8034,0x8039,0x803A,0x803C,0x803E,/* 0x68-0x6F */
        0x8040,0x8041,0x8044,0x8045,0x8047,0x8048,0x8049,0x804E,/* 0x70-0x77 */
        0x804F,0x8050,0x8051,0x8053,0x8055,0x8056,0x8057,0x0000,/* 0x78-0x7F */

        0x8059,0x805B,0x805C,0x805D,0x805E,0x805F,0x8060,0x8061,/* 0x80-0x87 */
        0x8062,0x8063,0x8064,0x8065,0x8066,0x8067,0x8068,0x806B,/* 0x88-0x8F */
        0x806C,0x806D,0x806E,0x806F,0x8070,0x8072,0x8073,0x8074,/* 0x90-0x97 */
        0x8075,0x8076,0x8077,0x8078,0x8079,0x807A,0x807B,0x807C,/* 0x98-0x9F */
        0x807D,0x9686,0x5784,0x62E2,0x9647,0x697C,0x5A04,0x6402,/* 0xA0-0xA7 */
        0x7BD3,0x6F0F,0x964B,0x82A6,0x5362,0x9885,0x5E90,0x7089,/* 0xA8-0xAF */
        0x63B3,0x5364,0x864F,0x9C81,0x9E93,0x788C,0x9732,0x8DEF,/* 0xB0-0xB7 */
        0x8D42,0x9E7F,0x6F5E,0x7984,0x5F55,0x9646,0x622E,0x9A74,/* 0xB8-0xBF */
        0x5415,0x94DD,0x4FA3,0x65C5,0x5C65,0x5C61,0x7F15,0x8651,/* 0xC0-0xC7 */
        0x6C2F,0x5F8B,0x7387,0x6EE4,0x7EFF,0x5CE6,0x631B,0x5B6A,/* 0xC8-0xCF */
        0x6EE6,0x5375,0x4E71,0x63A0,0x7565,0x62A1,0x8F6E,0x4F26,/* 0xD0-0xD7 */
        0x4ED1,0x6CA6,0x7EB6,0x8BBA,0x841D,0x87BA,0x7F57,0x903B,/* 0xD8-0xDF */
        0x9523,0x7BA9,0x9AA1,0x88F8,0x843D,0x6D1B,0x9A86,0x7EDC,/* 0xE0-0xE7 */
        0x5988,0x9EBB,0x739B,0x7801,0x8682,0x9A6C,0x9A82,0x561B,/* 0xE8-0xEF */
        0x5417,0x57CB,0x4E70,0x9EA6,0x5356,0x8FC8,0x8109,0x7792,/* 0xF0-0xF7 */
        0x9992,0x86EE,0x6EE1,0x8513,0x66FC,0x6162,0x6F2B,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_C3[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x807E,0x8081,0x8082,0x8085,0x8088,0x808A,0x808D,0x808E,/* 0x40-0x47 */
        0x808F,0x8090,0x8091,0x8092,0x8094,0x8095,0x8097,0x8099,/* 0x48-0x4F */
        0x809E,0x80A3,0x80A6,0x80A7,0x80A8,0x80AC,0x80B0,0x80B3,/* 0x50-0x57 */
        0x80B5,0x80B6,0x80B8,0x80B9,0x80BB,0x80C5,0x80C7,0x80C8,/* 0x58-0x5F */
        0x80C9,0x80CA,0x80CB,0x80CF,0x80D0,0x80D1,0x80D2,0x80D3,/* 0x60-0x67 */
        0x80D4,0x80D5,0x80D8,0x80DF,0x80E0,0x80E2,0x80E3,0x80E6,/* 0x68-0x6F */
        0x80EE,0x80F5,0x80F7,0x80F9,0x80FB,0x80FE,0x80FF,0x8100,/* 0x70-0x77 */
        0x8101,0x8103,0x8104,0x8105,0x8107,0x8108,0x810B,0x0000,/* 0x78-0x7F */

        0x810C,0x8115,0x8117,0x8119,0x811B,0x811C,0x811D,0x811F,/* 0x80-0x87 */
        0x8120,0x8121,0x8122,0x8123,0x8124,0x8125,0x8126,0x8127,/* 0x88-0x8F */
        0x8128,0x8129,0x812A,0x812B,0x812D,0x812E,0x8130,0x8133,/* 0x90-0x97 */
        0x8134,0x8135,0x8137,0x8139,0x813A,0x813B,0x813C,0x813D,/* 0x98-0x9F */
        0x813F,0x8C29,0x8292,0x832B,0x76F2,0x6C13,0x5FD9,0x83BD,/* 0xA0-0xA7 */
        0x732B,0x8305,0x951A,0x6BDB,0x77DB,0x94C6,0x536F,0x8302,/* 0xA8-0xAF */
        0x5192,0x5E3D,0x8C8C,0x8D38,0x4E48,0x73AB,0x679A,0x6885,/* 0xB0-0xB7 */
        0x9176,0x9709,0x7164,0x6CA1,0x7709,0x5A92,0x9541,0x6BCF,/* 0xB8-0xBF */
        0x7F8E,0x6627,0x5BD0,0x59B9,0x5A9A,0x95E8,0x95F7,0x4EEC,/* 0xC0-0xC7 */
        0x840C,0x8499,0x6AAC,0x76DF,0x9530,0x731B,0x68A6,0x5B5F,/* 0xC8-0xCF */
        0x772F,0x919A,0x9761,0x7CDC,0x8FF7,0x8C1C,0x5F25,0x7C73,/* 0xD0-0xD7 */
        0x79D8,0x89C5,0x6CCC,0x871C,0x5BC6,0x5E42,0x68C9,0x7720,/* 0xD8-0xDF */
        0x7EF5,0x5195,0x514D,0x52C9,0x5A29,0x7F05,0x9762,0x82D7,/* 0xE0-0xE7 */
        0x63CF,0x7784,0x85D0,0x79D2,0x6E3A,0x5E99,0x5999,0x8511,/* 0xE8-0xEF */
        0x706D,0x6C11,0x62BF,0x76BF,0x654F,0x60AF,0x95FD,0x660E,/* 0xF0-0xF7 */
        0x879F,0x9E23,0x94ED,0x540D,0x547D,0x8C2C,0x6478,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_C4[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8140,0x8141,0x8142,0x8143,0x8144,0x8145,0x8147,0x8149,/* 0x40-0x47 */
        0x814D,0x814E,0x814F,0x8152,0x8156,0x8157,0x8158,0x815B,/* 0x48-0x4F */
        0x815C,0x815D,0x815E,0x815F,0x8161,0x8162,0x8163,0x8164,/* 0x50-0x57 */
        0x8166,0x8168,0x816A,0x816B,0x816C,0x816F,0x8172,0x8173,/* 0x58-0x5F */
        0x8175,0x8176,0x8177,0x8178,0x8181,0x8183,0x8184,0x8185,/* 0x60-0x67 */
        0x8186,0x8187,0x8189,0x818B,0x818C,0x818D,0x818E,0x8190,/* 0x68-0x6F */
        0x8192,0x8193,0x8194,0x8195,0x8196,0x8197,0x8199,0x819A,/* 0x70-0x77 */
        0x819E,0x819F,0x81A0,0x81A1,0x81A2,0x81A4,0x81A5,0x0000,/* 0x78-0x7F */

        0x81A7,0x81A9,0x81AB,0x81AC,0x81AD,0x81AE,0x81AF,0x81B0,/* 0x80-0x87 */
        0x81B1,0x81B2,0x81B4,0x81B5,0x81B6,0x81B7,0x81B8,0x81B9,/* 0x88-0x8F */
        0x81BC,0x81BD,0x81BE,0x81BF,0x81C4,0x81C5,0x81C7,0x81C8,/* 0x90-0x97 */
        0x81C9,0x81CB,0x81CD,0x81CE,0x81CF,0x81D0,0x81D1,0x81D2,/* 0x98-0x9F */
        0x81D3,0x6479,0x8611,0x6A21,0x819C,0x78E8,0x6469,0x9B54,/* 0xA0-0xA7 */
        0x62B9,0x672B,0x83AB,0x58A8,0x9ED8,0x6CAB,0x6F20,0x5BDE,/* 0xA8-0xAF */
        0x964C,0x8C0B,0x725F,0x67D0,0x62C7,0x7261,0x4EA9,0x59C6,/* 0xB0-0xB7 */
        0x6BCD,0x5893,0x66AE,0x5E55,0x52DF,0x6155,0x6728,0x76EE,/* 0xB8-0xBF */
        0x7766,0x7267,0x7A46,0x62FF,0x54EA,0x5450,0x94A0,0x90A3,/* 0xC0-0xC7 */
        0x5A1C,0x7EB3,0x6C16,0x4E43,0x5976,0x8010,0x5948,0x5357,/* 0xC8-0xCF */
        0x7537,0x96BE,0x56CA,0x6320,0x8111,0x607C,0x95F9,0x6DD6,/* 0xD0-0xD7 */
        0x5462,0x9981,0x5185,0x5AE9,0x80FD,0x59AE,0x9713,0x502A,/* 0xD8-0xDF */
        0x6CE5,0x5C3C,0x62DF,0x4F60,0x533F,0x817B,0x9006,0x6EBA,/* 0xE0-0xE7 */
        0x852B,0x62C8,0x5E74,0x78BE,0x64B5,0x637B,0x5FF5,0x5A18,/* 0xE8-0xEF */
        0x917F,0x9E1F,0x5C3F,0x634F,0x8042,0x5B7D,0x556E,0x954A,/* 0xF0-0xF7 */
        0x954D,0x6D85,0x60A8,0x67E0,0x72DE,0x51DD,0x5B81,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_C5[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x81D4,0x81D5,0x81D6,0x81D7,0x81D8,0x81D9,0x81DA,0x81DB,/* 0x40-0x47 */
        0x81DC,0x81DD,0x81DE,0x81DF,0x81E0,0x81E1,0x81E2,0x81E4,/* 0x48-0x4F */
        0x81E5,0x81E6,0x81E8,0x81E9,0x81EB,0x81EE,0x81EF,0x81F0,/* 0x50-0x57 */
        0x81F1,0x81F2,0x81F5,0x81F6,0x81F7,0x81F8,0x81F9,0x81FA,/* 0x58-0x5F */
        0x81FD,0x81FF,0x8203,0x8207,0x8208,0x8209,0x820A,0x820B,/* 0x60-0x67 */
        0x820E,0x820F,0x8211,0x8213,0x8215,0x8216,0x8217,0x8218,/* 0x68-0x6F */
        0x8219,0x821A,0x821D,0x8220,0x8224,0x8225,0x8226,0x8227,/* 0x70-0x77 */
        0x8229,0x822E,0x8232,0x823A,0x823C,0x823D,0x823F,0x0000,/* 0x78-0x7F */

        0x8240,0x8241,0x8242,0x8243,0x8245,0x8246,0x8248,0x824A,/* 0x80-0x87 */
        0x824C,0x824D,0x824E,0x8250,0x8251,0x8252,0x8253,0x8254,/* 0x88-0x8F */
        0x8255,0x8256,0x8257,0x8259,0x825B,0x825C,0x825D,0x825E,/* 0x90-0x97 */
        0x8260,0x8261,0x8262,0x8263,0x8264,0x8265,0x8266,0x8267,/* 0x98-0x9F */
        0x8269,0x62E7,0x6CDE,0x725B,0x626D,0x94AE,0x7EBD,0x8113,/* 0xA0-0xA7 */
        0x6D53,0x519C,0x5F04,0x5974,0x52AA,0x6012,0x5973,0x6696,/* 0xA8-0xAF */
        0x8650,0x759F,0x632A,0x61E6,0x7CEF,0x8BFA,0x54E6,0x6B27,/* 0xB0-0xB7 */
        0x9E25,0x6BB4,0x85D5,0x5455,0x5076,0x6CA4,0x556A,0x8DB4,/* 0xB8-0xBF */
        0x722C,0x5E15,0x6015,0x7436,0x62CD,0x6392,0x724C,0x5F98,/* 0xC0-0xC7 */
        0x6E43,0x6D3E,0x6500,0x6F58,0x76D8,0x78D0,0x76FC,0x7554,/* 0xC8-0xCF */
        0x5224,0x53DB,0x4E53,0x5E9E,0x65C1,0x802A,0x80D6,0x629B,/* 0xD0-0xD7 */
        0x5486,0x5228,0x70AE,0x888D,0x8DD1,0x6CE1,0x5478,0x80DA,/* 0xD8-0xDF */
        0x57F9,0x88F4,0x8D54,0x966A,0x914D,0x4F69,0x6C9B,0x55B7,/* 0xE0-0xE7 */
        0x76C6,0x7830,0x62A8,0x70F9,0x6F8E,0x5F6D,0x84EC,0x68DA,/* 0xE8-0xEF */
        0x787C,0x7BF7,0x81A8,0x670B,0x9E4F,0x6367,0x78B0,0x576F,/* 0xF0-0xF7 */
        0x7812,0x9739,0x6279,0x62AB,0x5288,0x7435,0x6BD7,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_C6[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x826A,0x826B,0x826C,0x826D,0x8271,0x8275,0x8276,0x8277,/* 0x40-0x47 */
        0x8278,0x827B,0x827C,0x8280,0x8281,0x8283,0x8285,0x8286,/* 0x48-0x4F */
        0x8287,0x8289,0x828C,0x8290,0x8293,0x8294,0x8295,0x8296,/* 0x50-0x57 */
        0x829A,0x829B,0x829E,0x82A0,0x82A2,0x82A3,0x82A7,0x82B2,/* 0x58-0x5F */
        0x82B5,0x82B6,0x82BA,0x82BB,0x82BC,0x82BF,0x82C0,0x82C2,/* 0x60-0x67 */
        0x82C3,0x82C5,0x82C6,0x82C9,0x82D0,0x82D6,0x82D9,0x82DA,/* 0x68-0x6F */
        0x82DD,0x82E2,0x82E7,0x82E8,0x82E9,0x82EA,0x82EC,0x82ED,/* 0x70-0x77 */
        0x82EE,0x82F0,0x82F2,0x82F3,0x82F5,0x82F6,0x82F8,0x0000,/* 0x78-0x7F */

        0x82FA,0x82FC,0x82FD,0x82FE,0x82FF,0x8300,0x830A,0x830B,/* 0x80-0x87 */
        0x830D,0x8310,0x8312,0x8313,0x8316,0x8318,0x8319,0x831D,/* 0x88-0x8F */
        0x831E,0x831F,0x8320,0x8321,0x8322,0x8323,0x8324,0x8325,/* 0x90-0x97 */
        0x8326,0x8329,0x832A,0x832E,0x8330,0x8332,0x8337,0x833B,/* 0x98-0x9F */
        0x833D,0x5564,0x813E,0x75B2,0x76AE,0x5339,0x75DE,0x50FB,/* 0xA0-0xA7 */
        0x5C41,0x8B6C,0x7BC7,0x504F,0x7247,0x9A97,0x98D8,0x6F02,/* 0xA8-0xAF */
        0x74E2,0x7968,0x6487,0x77A5,0x62FC,0x9891,0x8D2B,0x54C1,/* 0xB0-0xB7 */
        0x8058,0x4E52,0x576A,0x82F9,0x840D,0x5E73,0x51ED,0x74F6,/* 0xB8-0xBF */
        0x8BC4,0x5C4F,0x5761,0x6CFC,0x9887,0x5A46,0x7834,0x9B44,/* 0xC0-0xC7 */
        0x8FEB,0x7C95,0x5256,0x6251,0x94FA,0x4EC6,0x8386,0x8461,/* 0xC8-0xCF */
        0x83E9,0x84B2,0x57D4,0x6734,0x5703,0x666E,0x6D66,0x8C31,/* 0xD0-0xD7 */
        0x66DD,0x7011,0x671F,0x6B3A,0x6816,0x621A,0x59BB,0x4E03,/* 0xD8-0xDF */
        0x51C4,0x6F06,0x67D2,0x6C8F,0x5176,0x68CB,0x5947,0x6B67,/* 0xE0-0xE7 */
        0x7566,0x5D0E,0x8110,0x9F50,0x65D7,0x7948,0x7941,0x9A91,/* 0xE8-0xEF */
        0x8D77,0x5C82,0x4E5E,0x4F01,0x542F,0x5951,0x780C,0x5668,/* 0xF0-0xF7 */
        0x6C14,0x8FC4,0x5F03,0x6C7D,0x6CE3,0x8BAB,0x6390,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_C7[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x833E,0x833F,0x8341,0x8342,0x8344,0x8345,0x8348,0x834A,/* 0x40-0x47 */
        0x834B,0x834C,0x834D,0x834E,0x8353,0x8355,0x8356,0x8357,/* 0x48-0x4F */
        0x8358,0x8359,0x835D,0x8362,0x8370,0x8371,0x8372,0x8373,/* 0x50-0x57 */
        0x8374,0x8375,0x8376,0x8379,0x837A,0x837E,0x837F,0x8380,/* 0x58-0x5F */
        0x8381,0x8382,0x8383,0x8384,0x8387,0x8388,0x838A,0x838B,/* 0x60-0x67 */
        0x838C,0x838D,0x838F,0x8390,0x8391,0x8394,0x8395,0x8396,/* 0x68-0x6F */
        0x8397,0x8399,0x839A,0x839D,0x839F,0x83A1,0x83A2,0x83A3,/* 0x70-0x77 */
        0x83A4,0x83A5,0x83A6,0x83A7,0x83AC,0x83AD,0x83AE,0x0000,/* 0x78-0x7F */

        0x83AF,0x83B5,0x83BB,0x83BE,0x83BF,0x83C2,0x83C3,0x83C4,/* 0x80-0x87 */
        0x83C6,0x83C8,0x83C9,0x83CB,0x83CD,0x83CE,0x83D0,0x83D1,/* 0x88-0x8F */
        0x83D2,0x83D3,0x83D5,0x83D7,0x83D9,0x83DA,0x83DB,0x83DE,/* 0x90-0x97 */
        0x83E2,0x83E3,0x83E4,0x83E6,0x83E7,0x83E8,0x83EB,0x83EC,/* 0x98-0x9F */
        0x83ED,0x6070,0x6D3D,0x7275,0x6266,0x948E,0x94C5,0x5343,/* 0xA0-0xA7 */
        0x8FC1,0x7B7E,0x4EDF,0x8C26,0x4E7E,0x9ED4,0x94B1,0x94B3,/* 0xA8-0xAF */
        0x524D,0x6F5C,0x9063,0x6D45,0x8C34,0x5811,0x5D4C,0x6B20,/* 0xB0-0xB7 */
        0x6B49,0x67AA,0x545B,0x8154,0x7F8C,0x5899,0x8537,0x5F3A,/* 0xB8-0xBF */
        0x62A2,0x6A47,0x9539,0x6572,0x6084,0x6865,0x77A7,0x4E54,/* 0xC0-0xC7 */
        0x4FA8,0x5DE7,0x9798,0x64AC,0x7FD8,0x5CED,0x4FCF,0x7A8D,/* 0xC8-0xCF */
        0x5207,0x8304,0x4E14,0x602F,0x7A83,0x94A6,0x4FB5,0x4EB2,/* 0xD0-0xD7 */
        0x79E6,0x7434,0x52E4,0x82B9,0x64D2,0x79BD,0x5BDD,0x6C81,/* 0xD8-0xDF */
        0x9752,0x8F7B,0x6C22,0x503E,0x537F,0x6E05,0x64CE,0x6674,/* 0xE0-0xE7 */
        0x6C30,0x60C5,0x9877,0x8BF7,0x5E86,0x743C,0x7A77,0x79CB,/* 0xE8-0xEF */
        0x4E18,0x90B1,0x7403,0x6C42,0x56DA,0x914B,0x6CC5,0x8D8B,/* 0xF0-0xF7 */
        0x533A,0x86C6,0x66F2,0x8EAF,0x5C48,0x9A71,0x6E20,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_C8[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x83EE,0x83EF,0x83F3,0x83F4,0x83F5,0x83F6,0x83F7,0x83FA,/* 0x40-0x47 */
        0x83FB,0x83FC,0x83FE,0x83FF,0x8400,0x8402,0x8405,0x8407,/* 0x48-0x4F */
        0x8408,0x8409,0x840A,0x8410,0x8412,0x8413,0x8414,0x8415,/* 0x50-0x57 */
        0x8416,0x8417,0x8419,0x841A,0x841B,0x841E,0x841F,0x8420,/* 0x58-0x5F */
        0x8421,0x8422,0x8423,0x8429,0x842A,0x842B,0x842C,0x842D,/* 0x60-0x67 */
        0x842E,0x842F,0x8430,0x8432,0x8433,0x8434,0x8435,0x8436,/* 0x68-0x6F */
        0x8437,0x8439,0x843A,0x843B,0x843E,0x843F,0x8440,0x8441,/* 0x70-0x77 */
        0x8442,0x8443,0x8444,0x8445,0x8447,0x8448,0x8449,0x0000,/* 0x78-0x7F */

        0x844A,0x844B,0x844C,0x844D,0x844E,0x844F,0x8450,0x8452,/* 0x80-0x87 */
        0x8453,0x8454,0x8455,0x8456,0x8458,0x845D,0x845E,0x845F,/* 0x88-0x8F */
        0x8460,0x8462,0x8464,0x8465,0x8466,0x8467,0x8468,0x846A,/* 0x90-0x97 */
        0x846E,0x846F,0x8470,0x8472,0x8474,0x8477,0x8479,0x847B,/* 0x98-0x9F */
        0x847C,0x53D6,0x5A36,0x9F8B,0x8DA3,0x53BB,0x5708,0x98A7,/* 0xA0-0xA7 */
        0x6743,0x919B,0x6CC9,0x5168,0x75CA,0x62F3,0x72AC,0x5238,/* 0xA8-0xAF */
        0x529D,0x7F3A,0x7094,0x7638,0x5374,0x9E4A,0x69B7,0x786E,/* 0xB0-0xB7 */
        0x96C0,0x88D9,0x7FA4,0x7136,0x71C3,0x5189,0x67D3,0x74E4,/* 0xB8-0xBF */
        0x58E4,0x6518,0x56B7,0x8BA9,0x9976,0x6270,0x7ED5,0x60F9,/* 0xC0-0xC7 */
        0x70ED,0x58EC,0x4EC1,0x4EBA,0x5FCD,0x97E7,0x4EFB,0x8BA4,/* 0xC8-0xCF */
        0x5203,0x598A,0x7EAB,0x6254,0x4ECD,0x65E5,0x620E,0x8338,/* 0xD0-0xD7 */
        0x84C9,0x8363,0x878D,0x7194,0x6EB6,0x5BB9,0x7ED2,0x5197,/* 0xD8-0xDF */
        0x63C9,0x67D4,0x8089,0x8339,0x8815,0x5112,0x5B7A,0x5982,/* 0xE0-0xE7 */
        0x8FB1,0x4E73,0x6C5D,0x5165,0x8925,0x8F6F,0x962E,0x854A,/* 0xE8-0xEF */
        0x745E,0x9510,0x95F0,0x6DA6,0x82E5,0x5F31,0x6492,0x6D12,/* 0xF0-0xF7 */
        0x8428,0x816E,0x9CC3,0x585E,0x8D5B,0x4E09,0x53C1,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_C9[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x847D,0x847E,0x847F,0x8480,0x8481,0x8483,0x8484,0x8485,/* 0x40-0x47 */
        0x8486,0x848A,0x848D,0x848F,0x8490,0x8491,0x8492,0x8493,/* 0x48-0x4F */
        0x8494,0x8495,0x8496,0x8498,0x849A,0x849B,0x849D,0x849E,/* 0x50-0x57 */
        0x849F,0x84A0,0x84A2,0x84A3,0x84A4,0x84A5,0x84A6,0x84A7,/* 0x58-0x5F */
        0x84A8,0x84A9,0x84AA,0x84AB,0x84AC,0x84AD,0x84AE,0x84B0,/* 0x60-0x67 */
        0x84B1,0x84B3,0x84B5,0x84B6,0x84B7,0x84BB,0x84BC,0x84BE,/* 0x68-0x6F */
        0x84C0,0x84C2,0x84C3,0x84C5,0x84C6,0x84C7,0x84C8,0x84CB,/* 0x70-0x77 */
        0x84CC,0x84CE,0x84CF,0x84D2,0x84D4,0x84D5,0x84D7,0x0000,/* 0x78-0x7F */

        0x84D8,0x84D9,0x84DA,0x84DB,0x84DC,0x84DE,0x84E1,0x84E2,/* 0x80-0x87 */
        0x84E4,0x84E7,0x84E8,0x84E9,0x84EA,0x84EB,0x84ED,0x84EE,/* 0x88-0x8F */
        0x84EF,0x84F1,0x84F2,0x84F3,0x84F4,0x84F5,0x84F6,0x84F7,/* 0x90-0x97 */
        0x84F8,0x84F9,0x84FA,0x84FB,0x84FD,0x84FE,0x8500,0x8501,/* 0x98-0x9F */
        0x8502,0x4F1E,0x6563,0x6851,0x55D3,0x4E27,0x6414,0x9A9A,/* 0xA0-0xA7 */
        0x626B,0x5AC2,0x745F,0x8272,0x6DA9,0x68EE,0x50E7,0x838E,/* 0xA8-0xAF */
        0x7802,0x6740,0x5239,0x6C99,0x7EB1,0x50BB,0x5565,0x715E,/* 0xB0-0xB7 */
        0x7B5B,0x6652,0x73CA,0x82EB,0x6749,0x5C71,0x5220,0x717D,/* 0xB8-0xBF */
        0x886B,0x95EA,0x9655,0x64C5,0x8D61,0x81B3,0x5584,0x6C55,/* 0xC0-0xC7 */
        0x6247,0x7F2E,0x5892,0x4F24,0x5546,0x8D4F,0x664C,0x4E0A,/* 0xC8-0xCF */
        0x5C1A,0x88F3,0x68A2,0x634E,0x7A0D,0x70E7,0x828D,0x52FA,/* 0xD0-0xD7 */
        0x97F6,0x5C11,0x54E8,0x90B5,0x7ECD,0x5962,0x8D4A,0x86C7,/* 0xD8-0xDF */
        0x820C,0x820D,0x8D66,0x6444,0x5C04,0x6151,0x6D89,0x793E,/* 0xE0-0xE7 */
        0x8BBE,0x7837,0x7533,0x547B,0x4F38,0x8EAB,0x6DF1,0x5A20,/* 0xE8-0xEF */
        0x7EC5,0x795E,0x6C88,0x5BA1,0x5A76,0x751A,0x80BE,0x614E,/* 0xF0-0xF7 */
        0x6E17,0x58F0,0x751F,0x7525,0x7272,0x5347,0x7EF3,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_CA[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8503,0x8504,0x8505,0x8506,0x8507,0x8508,0x8509,0x850A,/* 0x40-0x47 */
        0x850B,0x850D,0x850E,0x850F,0x8510,0x8512,0x8514,0x8515,/* 0x48-0x4F */
        0x8516,0x8518,0x8519,0x851B,0x851C,0x851D,0x851E,0x8520,/* 0x50-0x57 */
        0x8522,0x8523,0x8524,0x8525,0x8526,0x8527,0x8528,0x8529,/* 0x58-0x5F */
        0x852A,0x852D,0x852E,0x852F,0x8530,0x8531,0x8532,0x8533,/* 0x60-0x67 */
        0x8534,0x8535,0x8536,0x853E,0x853F,0x8540,0x8541,0x8542,/* 0x68-0x6F */
        0x8544,0x8545,0x8546,0x8547,0x854B,0x854C,0x854D,0x854E,/* 0x70-0x77 */
        0x854F,0x8550,0x8551,0x8552,0x8553,0x8554,0x8555,0x0000,/* 0x78-0x7F */

        0x8557,0x8558,0x855A,0x855B,0x855C,0x855D,0x855F,0x8560,/* 0x80-0x87 */
        0x8561,0x8562,0x8563,0x8565,0x8566,0x8567,0x8569,0x856A,/* 0x88-0x8F */
        0x856B,0x856C,0x856D,0x856E,0x856F,0x8570,0x8571,0x8573,/* 0x90-0x97 */
        0x8575,0x8576,0x8577,0x8578,0x857C,0x857D,0x857F,0x8580,/* 0x98-0x9F */
        0x8581,0x7701,0x76DB,0x5269,0x80DC,0x5723,0x5E08,0x5931,/* 0xA0-0xA7 */
        0x72EE,0x65BD,0x6E7F,0x8BD7,0x5C38,0x8671,0x5341,0x77F3,/* 0xA8-0xAF */
        0x62FE,0x65F6,0x4EC0,0x98DF,0x8680,0x5B9E,0x8BC6,0x53F2,/* 0xB0-0xB7 */
        0x77E2,0x4F7F,0x5C4E,0x9A76,0x59CB,0x5F0F,0x793A,0x58EB,/* 0xB8-0xBF */
        0x4E16,0x67FF,0x4E8B,0x62ED,0x8A93,0x901D,0x52BF,0x662F,/* 0xC0-0xC7 */
        0x55DC,0x566C,0x9002,0x4ED5,0x4F8D,0x91CA,0x9970,0x6C0F,/* 0xC8-0xCF */
        0x5E02,0x6043,0x5BA4,0x89C6,0x8BD5,0x6536,0x624B,0x9996,/* 0xD0-0xD7 */
        0x5B88,0x5BFF,0x6388,0x552E,0x53D7,0x7626,0x517D,0x852C,/* 0xD8-0xDF */
        0x67A2,0x68B3,0x6B8A,0x6292,0x8F93,0x53D4,0x8212,0x6DD1,/* 0xE0-0xE7 */
        0x758F,0x4E66,0x8D4E,0x5B70,0x719F,0x85AF,0x6691,0x66D9,/* 0xE8-0xEF */
        0x7F72,0x8700,0x9ECD,0x9F20,0x5C5E,0x672F,0x8FF0,0x6811,/* 0xF0-0xF7 */
        0x675F,0x620D,0x7AD6,0x5885,0x5EB6,0x6570,0x6F31,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_CB[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8582,0x8583,0x8586,0x8588,0x8589,0x858A,0x858B,0x858C,/* 0x40-0x47 */
        0x858D,0x858E,0x8590,0x8591,0x8592,0x8593,0x8594,0x8595,/* 0x48-0x4F */
        0x8596,0x8597,0x8598,0x8599,0x859A,0x859D,0x859E,0x859F,/* 0x50-0x57 */
        0x85A0,0x85A1,0x85A2,0x85A3,0x85A5,0x85A6,0x85A7,0x85A9,/* 0x58-0x5F */
        0x85AB,0x85AC,0x85AD,0x85B1,0x85B2,0x85B3,0x85B4,0x85B5,/* 0x60-0x67 */
        0x85B6,0x85B8,0x85BA,0x85BB,0x85BC,0x85BD,0x85BE,0x85BF,/* 0x68-0x6F */
        0x85C0,0x85C2,0x85C3,0x85C4,0x85C5,0x85C6,0x85C7,0x85C8,/* 0x70-0x77 */
        0x85CA,0x85CB,0x85CC,0x85CD,0x85CE,0x85D1,0x85D2,0x0000,/* 0x78-0x7F */

        0x85D4,0x85D6,0x85D7,0x85D8,0x85D9,0x85DA,0x85DB,0x85DD,/* 0x80-0x87 */
        0x85DE,0x85DF,0x85E0,0x85E1,0x85E2,0x85E3,0x85E5,0x85E6,/* 0x88-0x8F */
        0x85E7,0x85E8,0x85EA,0x85EB,0x85EC,0x85ED,0x85EE,0x85EF,/* 0x90-0x97 */
        0x85F0,0x85F1,0x85F2,0x85F3,0x85F4,0x85F5,0x85F6,0x85F7,/* 0x98-0x9F */
        0x85F8,0x6055,0x5237,0x800D,0x6454,0x8870,0x7529,0x5E05,/* 0xA0-0xA7 */
        0x6813,0x62F4,0x971C,0x53CC,0x723D,0x8C01,0x6C34,0x7761,/* 0xA8-0xAF */
        0x7A0E,0x542E,0x77AC,0x987A,0x821C,0x8BF4,0x7855,0x6714,/* 0xB0-0xB7 */
        0x70C1,0x65AF,0x6495,0x5636,0x601D,0x79C1,0x53F8,0x4E1D,/* 0xB8-0xBF */
        0x6B7B,0x8086,0x5BFA,0x55E3,0x56DB,0x4F3A,0x4F3C,0x9972,/* 0xC0-0xC7 */
        0x5DF3,0x677E,0x8038,0x6002,0x9882,0x9001,0x5B8B,0x8BBC,/* 0xC8-0xCF */
        0x8BF5,0x641C,0x8258,0x64DE,0x55FD,0x82CF,0x9165,0x4FD7,/* 0xD0-0xD7 */
        0x7D20,0x901F,0x7C9F,0x50F3,0x5851,0x6EAF,0x5BBF,0x8BC9,/* 0xD8-0xDF */
        0x8083,0x9178,0x849C,0x7B97,0x867D,0x968B,0x968F,0x7EE5,/* 0xE0-0xE7 */
        0x9AD3,0x788E,0x5C81,0x7A57,0x9042,0x96A7,0x795F,0x5B59,/* 0xE8-0xEF */
        0x635F,0x7B0B,0x84D1,0x68AD,0x5506,0x7F29,0x7410,0x7D22,/* 0xF0-0xF7 */
        0x9501,0x6240,0x584C,0x4ED6,0x5B83,0x5979,0x5854,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_CC[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x85F9,0x85FA,0x85FC,0x85FD,0x85FE,0x8600,0x8601,0x8602,/* 0x40-0x47 */
        0x8603,0x8604,0x8606,0x8607,0x8608,0x8609,0x860A,0x860B,/* 0x48-0x4F */
        0x860C,0x860D,0x860E,0x860F,0x8610,0x8612,0x8613,0x8614,/* 0x50-0x57 */
        0x8615,0x8617,0x8618,0x8619,0x861A,0x861B,0x861C,0x861D,/* 0x58-0x5F */
        0x861E,0x861F,0x8620,0x8621,0x8622,0x8623,0x8624,0x8625,/* 0x60-0x67 */
        0x8626,0x8628,0x862A,0x862B,0x862C,0x862D,0x862E,0x862F,/* 0x68-0x6F */
        0x8630,0x8631,0x8632,0x8633,0x8634,0x8635,0x8636,0x8637,/* 0x70-0x77 */
        0x8639,0x863A,0x863B,0x863D,0x863E,0x863F,0x8640,0x0000,/* 0x78-0x7F */

        0x8641,0x8642,0x8643,0x8644,0x8645,0x8646,0x8647,0x8648,/* 0x80-0x87 */
        0x8649,0x864A,0x864B,0x864C,0x8652,0x8653,0x8655,0x8656,/* 0x88-0x8F */
        0x8657,0x8658,0x8659,0x865B,0x865C,0x865D,0x865F,0x8660,/* 0x90-0x97 */
        0x8661,0x8663,0x8664,0x8665,0x8666,0x8667,0x8668,0x8669,/* 0x98-0x9F */
        0x866A,0x736D,0x631E,0x8E4B,0x8E0F,0x80CE,0x82D4,0x62AC,/* 0xA0-0xA7 */
        0x53F0,0x6CF0,0x915E,0x592A,0x6001,0x6C70,0x574D,0x644A,/* 0xA8-0xAF */
        0x8D2A,0x762B,0x6EE9,0x575B,0x6A80,0x75F0,0x6F6D,0x8C2D,/* 0xB0-0xB7 */
        0x8C08,0x5766,0x6BEF,0x8892,0x78B3,0x63A2,0x53F9,0x70AD,/* 0xB8-0xBF */
        0x6C64,0x5858,0x642A,0x5802,0x68E0,0x819B,0x5510,0x7CD6,/* 0xC0-0xC7 */
        0x5018,0x8EBA,0x6DCC,0x8D9F,0x70EB,0x638F,0x6D9B,0x6ED4,/* 0xC8-0xCF */
        0x7EE6,0x8404,0x6843,0x9003,0x6DD8,0x9676,0x8BA8,0x5957,/* 0xD0-0xD7 */
        0x7279,0x85E4,0x817E,0x75BC,0x8A8A,0x68AF,0x5254,0x8E22,/* 0xD8-0xDF */
        0x9511,0x63D0,0x9898,0x8E44,0x557C,0x4F53,0x66FF,0x568F,/* 0xE0-0xE7 */
        0x60D5,0x6D95,0x5243,0x5C49,0x5929,0x6DFB,0x586B,0x7530,/* 0xE8-0xEF */
        0x751C,0x606C,0x8214,0x8146,0x6311,0x6761,0x8FE2,0x773A,/* 0xF0-0xF7 */
        0x8DF3,0x8D34,0x94C1,0x5E16,0x5385,0x542C,0x70C3,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_CD[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x866D,0x866F,0x8670,0x8672,0x8673,0x8674,0x8675,0x8676,/* 0x40-0x47 */
        0x8677,0x8678,0x8683,0x8684,0x8685,0x8686,0x8687,0x8688,/* 0x48-0x4F */
        0x8689,0x868E,0x868F,0x8690,0x8691,0x8692,0x8694,0x8696,/* 0x50-0x57 */
        0x8697,0x8698,0x8699,0x869A,0x869B,0x869E,0x869F,0x86A0,/* 0x58-0x5F */
        0x86A1,0x86A2,0x86A5,0x86A6,0x86AB,0x86AD,0x86AE,0x86B2,/* 0x60-0x67 */
        0x86B3,0x86B7,0x86B8,0x86B9,0x86BB,0x86BC,0x86BD,0x86BE,/* 0x68-0x6F */
        0x86BF,0x86C1,0x86C2,0x86C3,0x86C5,0x86C8,0x86CC,0x86CD,/* 0x70-0x77 */
        0x86D2,0x86D3,0x86D5,0x86D6,0x86D7,0x86DA,0x86DC,0x0000,/* 0x78-0x7F */

        0x86DD,0x86E0,0x86E1,0x86E2,0x86E3,0x86E5,0x86E6,0x86E7,/* 0x80-0x87 */
        0x86E8,0x86EA,0x86EB,0x86EC,0x86EF,0x86F5,0x86F6,0x86F7,/* 0x88-0x8F */
        0x86FA,0x86FB,0x86FC,0x86FD,0x86FF,0x8701,0x8704,0x8705,/* 0x90-0x97 */
        0x8706,0x870B,0x870C,0x870E,0x870F,0x8710,0x8711,0x8714,/* 0x98-0x9F */
        0x8716,0x6C40,0x5EF7,0x505C,0x4EAD,0x5EAD,0x633A,0x8247,/* 0xA0-0xA7 */
        0x901A,0x6850,0x916E,0x77B3,0x540C,0x94DC,0x5F64,0x7AE5,/* 0xA8-0xAF */
        0x6876,0x6345,0x7B52,0x7EDF,0x75DB,0x5077,0x6295,0x5934,/* 0xB0-0xB7 */
        0x900F,0x51F8,0x79C3,0x7A81,0x56FE,0x5F92,0x9014,0x6D82,/* 0xB8-0xBF */
        0x5C60,0x571F,0x5410,0x5154,0x6E4D,0x56E2,0x63A8,0x9893,/* 0xC0-0xC7 */
        0x817F,0x8715,0x892A,0x9000,0x541E,0x5C6F,0x81C0,0x62D6,/* 0xC8-0xCF */
        0x6258,0x8131,0x9E35,0x9640,0x9A6E,0x9A7C,0x692D,0x59A5,/* 0xD0-0xD7 */
        0x62D3,0x553E,0x6316,0x54C7,0x86D9,0x6D3C,0x5A03,0x74E6,/* 0xD8-0xDF */
        0x889C,0x6B6A,0x5916,0x8C4C,0x5F2F,0x6E7E,0x73A9,0x987D,/* 0xE0-0xE7 */
        0x4E38,0x70F7,0x5B8C,0x7897,0x633D,0x665A,0x7696,0x60CB,/* 0xE8-0xEF */
        0x5B9B,0x5A49,0x4E07,0x8155,0x6C6A,0x738B,0x4EA1,0x6789,/* 0xF0-0xF7 */
        0x7F51,0x5F80,0x65FA,0x671B,0x5FD8,0x5984,0x5A01,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_CE[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8719,0x871B,0x871D,0x871F,0x8720,0x8724,0x8726,0x8727,/* 0x40-0x47 */
        0x8728,0x872A,0x872B,0x872C,0x872D,0x872F,0x8730,0x8732,/* 0x48-0x4F */
        0x8733,0x8735,0x8736,0x8738,0x8739,0x873A,0x873C,0x873D,/* 0x50-0x57 */
        0x8740,0x8741,0x8742,0x8743,0x8744,0x8745,0x8746,0x874A,/* 0x58-0x5F */
        0x874B,0x874D,0x874F,0x8750,0x8751,0x8752,0x8754,0x8755,/* 0x60-0x67 */
        0x8756,0x8758,0x875A,0x875B,0x875C,0x875D,0x875E,0x875F,/* 0x68-0x6F */
        0x8761,0x8762,0x8766,0x8767,0x8768,0x8769,0x876A,0x876B,/* 0x70-0x77 */
        0x876C,0x876D,0x876F,0x8771,0x8772,0x8773,0x8775,0x0000,/* 0x78-0x7F */

        0x8777,0x8778,0x8779,0x877A,0x877F,0x8780,0x8781,0x8784,/* 0x80-0x87 */
        0x8786,0x8787,0x8789,0x878A,0x878C,0x878E,0x878F,0x8790,/* 0x88-0x8F */
        0x8791,0x8792,0x8794,0x8795,0x8796,0x8798,0x8799,0x879A,/* 0x90-0x97 */
        0x879B,0x879C,0x879D,0x879E,0x87A0,0x87A1,0x87A2,0x87A3,/* 0x98-0x9F */
        0x87A4,0x5DCD,0x5FAE,0x5371,0x97E6,0x8FDD,0x6845,0x56F4,/* 0xA0-0xA7 */
        0x552F,0x60DF,0x4E3A,0x6F4D,0x7EF4,0x82C7,0x840E,0x59D4,/* 0xA8-0xAF */
        0x4F1F,0x4F2A,0x5C3E,0x7EAC,0x672A,0x851A,0x5473,0x754F,/* 0xB0-0xB7 */
        0x80C3,0x5582,0x9B4F,0x4F4D,0x6E2D,0x8C13,0x5C09,0x6170,/* 0xB8-0xBF */
        0x536B,0x761F,0x6E29,0x868A,0x6587,0x95FB,0x7EB9,0x543B,/* 0xC0-0xC7 */
        0x7A33,0x7D0A,0x95EE,0x55E1,0x7FC1,0x74EE,0x631D,0x8717,/* 0xC8-0xCF */
        0x6DA1,0x7A9D,0x6211,0x65A1,0x5367,0x63E1,0x6C83,0x5DEB,/* 0xD0-0xD7 */
        0x545C,0x94A8,0x4E4C,0x6C61,0x8BEC,0x5C4B,0x65E0,0x829C,/* 0xD8-0xDF */
        0x68A7,0x543E,0x5434,0x6BCB,0x6B66,0x4E94,0x6342,0x5348,/* 0xE0-0xE7 */
        0x821E,0x4F0D,0x4FAE,0x575E,0x620A,0x96FE,0x6664,0x7269,/* 0xE8-0xEF */
        0x52FF,0x52A1,0x609F,0x8BEF,0x6614,0x7199,0x6790,0x897F,/* 0xF0-0xF7 */
        0x7852,0x77FD,0x6670,0x563B,0x5438,0x9521,0x727A,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_CF[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x87A5,0x87A6,0x87A7,0x87A9,0x87AA,0x87AE,0x87B0,0x87B1,/* 0x40-0x47 */
        0x87B2,0x87B4,0x87B6,0x87B7,0x87B8,0x87B9,0x87BB,0x87BC,/* 0x48-0x4F */
        0x87BE,0x87BF,0x87C1,0x87C2,0x87C3,0x87C4,0x87C5,0x87C7,/* 0x50-0x57 */
        0x87C8,0x87C9,0x87CC,0x87CD,0x87CE,0x87CF,0x87D0,0x87D4,/* 0x58-0x5F */
        0x87D5,0x87D6,0x87D7,0x87D8,0x87D9,0x87DA,0x87DC,0x87DD,/* 0x60-0x67 */
        0x87DE,0x87DF,0x87E1,0x87E2,0x87E3,0x87E4,0x87E6,0x87E7,/* 0x68-0x6F */
        0x87E8,0x87E9,0x87EB,0x87EC,0x87ED,0x87EF,0x87F0,0x87F1,/* 0x70-0x77 */
        0x87F2,0x87F3,0x87F4,0x87F5,0x87F6,0x87F7,0x87F8,0x0000,/* 0x78-0x7F */

        0x87FA,0x87FB,0x87FC,0x87FD,0x87FF,0x8800,0x8801,0x8802,/* 0x80-0x87 */
        0x8804,0x8805,0x8806,0x8807,0x8808,0x8809,0x880B,0x880C,/* 0x88-0x8F */
        0x880D,0x880E,0x880F,0x8810,0x8811,0x8812,0x8814,0x8817,/* 0x90-0x97 */
        0x8818,0x8819,0x881A,0x881C,0x881D,0x881E,0x881F,0x8820,/* 0x98-0x9F */
        0x8823,0x7A00,0x606F,0x5E0C,0x6089,0x819D,0x5915,0x60DC,/* 0xA0-0xA7 */
        0x7184,0x70EF,0x6EAA,0x6C50,0x7280,0x6A84,0x88AD,0x5E2D,/* 0xA8-0xAF */
        0x4E60,0x5AB3,0x559C,0x94E3,0x6D17,0x7CFB,0x9699,0x620F,/* 0xB0-0xB7 */
        0x7EC6,0x778E,0x867E,0x5323,0x971E,0x8F96,0x6687,0x5CE1,/* 0xB8-0xBF */
        0x4FA0,0x72ED,0x4E0B,0x53A6,0x590F,0x5413,0x6380,0x9528,/* 0xC0-0xC7 */
        0x5148,0x4ED9,0x9C9C,0x7EA4,0x54B8,0x8D24,0x8854,0x8237,/* 0xC8-0xCF */
        0x95F2,0x6D8E,0x5F26,0x5ACC,0x663E,0x9669,0x73B0,0x732E,/* 0xD0-0xD7 */
        0x53BF,0x817A,0x9985,0x7FA1,0x5BAA,0x9677,0x9650,0x7EBF,/* 0xD8-0xDF */
        0x76F8,0x53A2,0x9576,0x9999,0x7BB1,0x8944,0x6E58,0x4E61,/* 0xE0-0xE7 */
        0x7FD4,0x7965,0x8BE6,0x60F3,0x54CD,0x4EAB,0x9879,0x5DF7,/* 0xE8-0xEF */
        0x6A61,0x50CF,0x5411,0x8C61,0x8427,0x785D,0x9704,0x524A,/* 0xF0-0xF7 */
        0x54EE,0x56A3,0x9500,0x6D88,0x5BB5,0x6DC6,0x6653,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_D0[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8824,0x8825,0x8826,0x8827,0x8828,0x8829,0x882A,0x882B,/* 0x40-0x47 */
        0x882C,0x882D,0x882E,0x882F,0x8830,0x8831,0x8833,0x8834,/* 0x48-0x4F */
        0x8835,0x8836,0x8837,0x8838,0x883A,0x883B,0x883D,0x883E,/* 0x50-0x57 */
        0x883F,0x8841,0x8842,0x8843,0x8846,0x8847,0x8848,0x8849,/* 0x58-0x5F */
        0x884A,0x884B,0x884E,0x884F,0x8850,0x8851,0x8852,0x8853,/* 0x60-0x67 */
        0x8855,0x8856,0x8858,0x885A,0x885B,0x885C,0x885D,0x885E,/* 0x68-0x6F */
        0x885F,0x8860,0x8866,0x8867,0x886A,0x886D,0x886F,0x8871,/* 0x70-0x77 */
        0x8873,0x8874,0x8875,0x8876,0x8878,0x8879,0x887A,0x0000,/* 0x78-0x7F */

        0x887B,0x887C,0x8880,0x8883,0x8886,0x8887,0x8889,0x888A,/* 0x80-0x87 */
        0x888C,0x888E,0x888F,0x8890,0x8891,0x8893,0x8894,0x8895,/* 0x88-0x8F */
        0x8897,0x8898,0x8899,0x889A,0x889B,0x889D,0x889E,0x889F,/* 0x90-0x97 */
        0x88A0,0x88A1,0x88A3,0x88A5,0x88A6,0x88A7,0x88A8,0x88A9,/* 0x98-0x9F */
        0x88AA,0x5C0F,0x5B5D,0x6821,0x8096,0x5578,0x7B11,0x6548,/* 0xA0-0xA7 */
        0x6954,0x4E9B,0x6B47,0x874E,0x978B,0x534F,0x631F,0x643A,/* 0xA8-0xAF */
        0x90AA,0x659C,0x80C1,0x8C10,0x5199,0x68B0,0x5378,0x87F9,/* 0xB0-0xB7 */
        0x61C8,0x6CC4,0x6CFB,0x8C22,0x5C51,0x85AA,0x82AF,0x950C,/* 0xB8-0xBF */
        0x6B23,0x8F9B,0x65B0,0x5FFB,0x5FC3,0x4FE1,0x8845,0x661F,/* 0xC0-0xC7 */
        0x8165,0x7329,0x60FA,0x5174,0x5211,0x578B,0x5F62,0x90A2,/* 0xC8-0xCF */
        0x884C,0x9192,0x5E78,0x674F,0x6027,0x59D3,0x5144,0x51F6,/* 0xD0-0xD7 */
        0x80F8,0x5308,0x6C79,0x96C4,0x718A,0x4F11,0x4FEE,0x7F9E,/* 0xD8-0xDF */
        0x673D,0x55C5,0x9508,0x79C0,0x8896,0x7EE3,0x589F,0x620C,/* 0xE0-0xE7 */
        0x9700,0x865A,0x5618,0x987B,0x5F90,0x8BB8,0x84C4,0x9157,/* 0xE8-0xEF */
        0x53D9,0x65ED,0x5E8F,0x755C,0x6064,0x7D6E,0x5A7F,0x7EEA,/* 0xF0-0xF7 */
        0x7EED,0x8F69,0x55A7,0x5BA3,0x60AC,0x65CB,0x7384,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_D1[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x88AC,0x88AE,0x88AF,0x88B0,0x88B2,0x88B3,0x88B4,0x88B5,/* 0x40-0x47 */
        0x88B6,0x88B8,0x88B9,0x88BA,0x88BB,0x88BD,0x88BE,0x88BF,/* 0x48-0x4F */
        0x88C0,0x88C3,0x88C4,0x88C7,0x88C8,0x88CA,0x88CB,0x88CC,/* 0x50-0x57 */
        0x88CD,0x88CF,0x88D0,0x88D1,0x88D3,0x88D6,0x88D7,0x88DA,/* 0x58-0x5F */
        0x88DB,0x88DC,0x88DD,0x88DE,0x88E0,0x88E1,0x88E6,0x88E7,/* 0x60-0x67 */
        0x88E9,0x88EA,0x88EB,0x88EC,0x88ED,0x88EE,0x88EF,0x88F2,/* 0x68-0x6F */
        0x88F5,0x88F6,0x88F7,0x88FA,0x88FB,0x88FD,0x88FF,0x8900,/* 0x70-0x77 */
        0x8901,0x8903,0x8904,0x8905,0x8906,0x8907,0x8908,0x0000,/* 0x78-0x7F */

        0x8909,0x890B,0x890C,0x890D,0x890E,0x890F,0x8911,0x8914,/* 0x80-0x87 */
        0x8915,0x8916,0x8917,0x8918,0x891C,0x891D,0x891E,0x891F,/* 0x88-0x8F */
        0x8920,0x8922,0x8923,0x8924,0x8926,0x8927,0x8928,0x8929,/* 0x90-0x97 */
        0x892C,0x892D,0x892E,0x892F,0x8931,0x8932,0x8933,0x8935,/* 0x98-0x9F */
        0x8937,0x9009,0x7663,0x7729,0x7EDA,0x9774,0x859B,0x5B66,/* 0xA0-0xA7 */
        0x7A74,0x96EA,0x8840,0x52CB,0x718F,0x5FAA,0x65EC,0x8BE2,/* 0xA8-0xAF */
        0x5BFB,0x9A6F,0x5DE1,0x6B89,0x6C5B,0x8BAD,0x8BAF,0x900A,/* 0xB0-0xB7 */
        0x8FC5,0x538B,0x62BC,0x9E26,0x9E2D,0x5440,0x4E2B,0x82BD,/* 0xB8-0xBF */
        0x7259,0x869C,0x5D16,0x8859,0x6DAF,0x96C5,0x54D1,0x4E9A,/* 0xC0-0xC7 */
        0x8BB6,0x7109,0x54BD,0x9609,0x70DF,0x6DF9,0x76D0,0x4E25,/* 0xC8-0xCF */
        0x7814,0x8712,0x5CA9,0x5EF6,0x8A00,0x989C,0x960E,0x708E,/* 0xD0-0xD7 */
        0x6CBF,0x5944,0x63A9,0x773C,0x884D,0x6F14,0x8273,0x5830,/* 0xD8-0xDF */
        0x71D5,0x538C,0x781A,0x96C1,0x5501,0x5F66,0x7130,0x5BB4,/* 0xE0-0xE7 */
        0x8C1A,0x9A8C,0x6B83,0x592E,0x9E2F,0x79E7,0x6768,0x626C,/* 0xE8-0xEF */
        0x4F6F,0x75A1,0x7F8A,0x6D0B,0x9633,0x6C27,0x4EF0,0x75D2,/* 0xF0-0xF7 */
        0x517B,0x6837,0x6F3E,0x9080,0x8170,0x5996,0x7476,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_D2[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8938,0x8939,0x893A,0x893B,0x893C,0x893D,0x893E,0x893F,/* 0x40-0x47 */
        0x8940,0x8942,0x8943,0x8945,0x8946,0x8947,0x8948,0x8949,/* 0x48-0x4F */
        0x894A,0x894B,0x894C,0x894D,0x894E,0x894F,0x8950,0x8951,/* 0x50-0x57 */
        0x8952,0x8953,0x8954,0x8955,0x8956,0x8957,0x8958,0x8959,/* 0x58-0x5F */
        0x895A,0x895B,0x895C,0x895D,0x8960,0x8961,0x8962,0x8963,/* 0x60-0x67 */
        0x8964,0x8965,0x8967,0x8968,0x8969,0x896A,0x896B,0x896C,/* 0x68-0x6F */
        0x896D,0x896E,0x896F,0x8970,0x8971,0x8972,0x8973,0x8974,/* 0x70-0x77 */
        0x8975,0x8976,0x8977,0x8978,0x8979,0x897A,0x897C,0x0000,/* 0x78-0x7F */

        0x897D,0x897E,0x8980,0x8982,0x8984,0x8985,0x8987,0x8988,/* 0x80-0x87 */
        0x8989,0x898A,0x898B,0x898C,0x898D,0x898E,0x898F,0x8990,/* 0x88-0x8F */
        0x8991,0x8992,0x8993,0x8994,0x8995,0x8996,0x8997,0x8998,/* 0x90-0x97 */
        0x8999,0x899A,0x899B,0x899C,0x899D,0x899E,0x899F,0x89A0,/* 0x98-0x9F */
        0x89A1,0x6447,0x5C27,0x9065,0x7A91,0x8C23,0x59DA,0x54AC,/* 0xA0-0xA7 */
        0x8200,0x836F,0x8981,0x8000,0x6930,0x564E,0x8036,0x7237,/* 0xA8-0xAF */
        0x91CE,0x51B6,0x4E5F,0x9875,0x6396,0x4E1A,0x53F6,0x66F3,/* 0xB0-0xB7 */
        0x814B,0x591C,0x6DB2,0x4E00,0x58F9,0x533B,0x63D6,0x94F1,/* 0xB8-0xBF */
        0x4F9D,0x4F0A,0x8863,0x9890,0x5937,0x9057,0x79FB,0x4EEA,/* 0xC0-0xC7 */
        0x80F0,0x7591,0x6C82,0x5B9C,0x59E8,0x5F5D,0x6905,0x8681,/* 0xC8-0xCF */
        0x501A,0x5DF2,0x4E59,0x77E3,0x4EE5,0x827A,0x6291,0x6613,/* 0xD0-0xD7 */
        0x9091,0x5C79,0x4EBF,0x5F79,0x81C6,0x9038,0x8084,0x75AB,/* 0xD8-0xDF */
        0x4EA6,0x88D4,0x610F,0x6BC5,0x5FC6,0x4E49,0x76CA,0x6EA2,/* 0xE0-0xE7 */
        0x8BE3,0x8BAE,0x8C0A,0x8BD1,0x5F02,0x7FFC,0x7FCC,0x7ECE,/* 0xE8-0xEF */
        0x8335,0x836B,0x56E0,0x6BB7,0x97F3,0x9634,0x59FB,0x541F,/* 0xF0-0xF7 */
        0x94F6,0x6DEB,0x5BC5,0x996E,0x5C39,0x5F15,0x9690,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_D3[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x89A2,0x89A3,0x89A4,0x89A5,0x89A6,0x89A7,0x89A8,0x89A9,/* 0x40-0x47 */
        0x89AA,0x89AB,0x89AC,0x89AD,0x89AE,0x89AF,0x89B0,0x89B1,/* 0x48-0x4F */
        0x89B2,0x89B3,0x89B4,0x89B5,0x89B6,0x89B7,0x89B8,0x89B9,/* 0x50-0x57 */
        0x89BA,0x89BB,0x89BC,0x89BD,0x89BE,0x89BF,0x89C0,0x89C3,/* 0x58-0x5F */
        0x89CD,0x89D3,0x89D4,0x89D5,0x89D7,0x89D8,0x89D9,0x89DB,/* 0x60-0x67 */
        0x89DD,0x89DF,0x89E0,0x89E1,0x89E2,0x89E4,0x89E7,0x89E8,/* 0x68-0x6F */
        0x89E9,0x89EA,0x89EC,0x89ED,0x89EE,0x89F0,0x89F1,0x89F2,/* 0x70-0x77 */
        0x89F4,0x89F5,0x89F6,0x89F7,0x89F8,0x89F9,0x89FA,0x0000,/* 0x78-0x7F */

        0x89FB,0x89FC,0x89FD,0x89FE,0x89FF,0x8A01,0x8A02,0x8A03,/* 0x80-0x87 */
        0x8A04,0x8A05,0x8A06,0x8A08,0x8A09,0x8A0A,0x8A0B,0x8A0C,/* 0x88-0x8F */
        0x8A0D,0x8A0E,0x8A0F,0x8A10,0x8A11,0x8A12,0x8A13,0x8A14,/* 0x90-0x97 */
        0x8A15,0x8A16,0x8A17,0x8A18,0x8A19,0x8A1A,0x8A1B,0x8A1C,/* 0x98-0x9F */
        0x8A1D,0x5370,0x82F1,0x6A31,0x5A74,0x9E70,0x5E94,0x7F28,/* 0xA0-0xA7 */
        0x83B9,0x8424,0x8425,0x8367,0x8747,0x8FCE,0x8D62,0x76C8,/* 0xA8-0xAF */
        0x5F71,0x9896,0x786C,0x6620,0x54DF,0x62E5,0x4F63,0x81C3,/* 0xB0-0xB7 */
        0x75C8,0x5EB8,0x96CD,0x8E0A,0x86F9,0x548F,0x6CF3,0x6D8C,/* 0xB8-0xBF */
        0x6C38,0x607F,0x52C7,0x7528,0x5E7D,0x4F18,0x60A0,0x5FE7,/* 0xC0-0xC7 */
        0x5C24,0x7531,0x90AE,0x94C0,0x72B9,0x6CB9,0x6E38,0x9149,/* 0xC8-0xCF */
        0x6709,0x53CB,0x53F3,0x4F51,0x91C9,0x8BF1,0x53C8,0x5E7C,/* 0xD0-0xD7 */
        0x8FC2,0x6DE4,0x4E8E,0x76C2,0x6986,0x865E,0x611A,0x8206,/* 0xD8-0xDF */
        0x4F59,0x4FDE,0x903E,0x9C7C,0x6109,0x6E1D,0x6E14,0x9685,/* 0xE0-0xE7 */
        0x4E88,0x5A31,0x96E8,0x4E0E,0x5C7F,0x79B9,0x5B87,0x8BED,/* 0xE8-0xEF */
        0x7FBD,0x7389,0x57DF,0x828B,0x90C1,0x5401,0x9047,0x55BB,/* 0xF0-0xF7 */
        0x5CEA,0x5FA1,0x6108,0x6B32,0x72F1,0x80B2,0x8A89,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_D4[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8A1E,0x8A1F,0x8A20,0x8A21,0x8A22,0x8A23,0x8A24,0x8A25,/* 0x40-0x47 */
        0x8A26,0x8A27,0x8A28,0x8A29,0x8A2A,0x8A2B,0x8A2C,0x8A2D,/* 0x48-0x4F */
        0x8A2E,0x8A2F,0x8A30,0x8A31,0x8A32,0x8A33,0x8A34,0x8A35,/* 0x50-0x57 */
        0x8A36,0x8A37,0x8A38,0x8A39,0x8A3A,0x8A3B,0x8A3C,0x8A3D,/* 0x58-0x5F */
        0x8A3F,0x8A40,0x8A41,0x8A42,0x8A43,0x8A44,0x8A45,0x8A46,/* 0x60-0x67 */
        0x8A47,0x8A49,0x8A4A,0x8A4B,0x8A4C,0x8A4D,0x8A4E,0x8A4F,/* 0x68-0x6F */
        0x8A50,0x8A51,0x8A52,0x8A53,0x8A54,0x8A55,0x8A56,0x8A57,/* 0x70-0x77 */
        0x8A58,0x8A59,0x8A5A,0x8A5B,0x8A5C,0x8A5D,0x8A5E,0x0000,/* 0x78-0x7F */

        0x8A5F,0x8A60,0x8A61,0x8A62,0x8A63,0x8A64,0x8A65,0x8A66,/* 0x80-0x87 */
        0x8A67,0x8A68,0x8A69,0x8A6A,0x8A6B,0x8A6C,0x8A6D,0x8A6E,/* 0x88-0x8F */
        0x8A6F,0x8A70,0x8A71,0x8A72,0x8A73,0x8A74,0x8A75,0x8A76,/* 0x90-0x97 */
        0x8A77,0x8A78,0x8A7A,0x8A7B,0x8A7C,0x8A7D,0x8A7E,0x8A7F,/* 0x98-0x9F */
        0x8A80,0x6D74,0x5BD3,0x88D5,0x9884,0x8C6B,0x9A6D,0x9E33,/* 0xA0-0xA7 */
        0x6E0A,0x51A4,0x5143,0x57A3,0x8881,0x539F,0x63F4,0x8F95,/* 0xA8-0xAF */
        0x56ED,0x5458,0x5706,0x733F,0x6E90,0x7F18,0x8FDC,0x82D1,/* 0xB0-0xB7 */
        0x613F,0x6028,0x9662,0x66F0,0x7EA6,0x8D8A,0x8DC3,0x94A5,/* 0xB8-0xBF */
        0x5CB3,0x7CA4,0x6708,0x60A6,0x9605,0x8018,0x4E91,0x90E7,/* 0xC0-0xC7 */
        0x5300,0x9668,0x5141,0x8FD0,0x8574,0x915D,0x6655,0x97F5,/* 0xC8-0xCF */
        0x5B55,0x531D,0x7838,0x6742,0x683D,0x54C9,0x707E,0x5BB0,/* 0xD0-0xD7 */
        0x8F7D,0x518D,0x5728,0x54B1,0x6512,0x6682,0x8D5E,0x8D43,/* 0xD8-0xDF */
        0x810F,0x846C,0x906D,0x7CDF,0x51FF,0x85FB,0x67A3,0x65E9,/* 0xE0-0xE7 */
        0x6FA1,0x86A4,0x8E81,0x566A,0x9020,0x7682,0x7076,0x71E5,/* 0xE8-0xEF */
        0x8D23,0x62E9,0x5219,0x6CFD,0x8D3C,0x600E,0x589E,0x618E,/* 0xF0-0xF7 */
        0x66FE,0x8D60,0x624E,0x55B3,0x6E23,0x672D,0x8F67,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_D5[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8A81,0x8A82,0x8A83,0x8A84,0x8A85,0x8A86,0x8A87,0x8A88,/* 0x40-0x47 */
        0x8A8B,0x8A8C,0x8A8D,0x8A8E,0x8A8F,0x8A90,0x8A91,0x8A92,/* 0x48-0x4F */
        0x8A94,0x8A95,0x8A96,0x8A97,0x8A98,0x8A99,0x8A9A,0x8A9B,/* 0x50-0x57 */
        0x8A9C,0x8A9D,0x8A9E,0x8A9F,0x8AA0,0x8AA1,0x8AA2,0x8AA3,/* 0x58-0x5F */
        0x8AA4,0x8AA5,0x8AA6,0x8AA7,0x8AA8,0x8AA9,0x8AAA,0x8AAB,/* 0x60-0x67 */
        0x8AAC,0x8AAD,0x8AAE,0x8AAF,0x8AB0,0x8AB1,0x8AB2,0x8AB3,/* 0x68-0x6F */
        0x8AB4,0x8AB5,0x8AB6,0x8AB7,0x8AB8,0x8AB9,0x8ABA,0x8ABB,/* 0x70-0x77 */
        0x8ABC,0x8ABD,0x8ABE,0x8ABF,0x8AC0,0x8AC1,0x8AC2,0x0000,/* 0x78-0x7F */

        0x8AC3,0x8AC4,0x8AC5,0x8AC6,0x8AC7,0x8AC8,0x8AC9,0x8ACA,/* 0x80-0x87 */
        0x8ACB,0x8ACC,0x8ACD,0x8ACE,0x8ACF,0x8AD0,0x8AD1,0x8AD2,/* 0x88-0x8F */
        0x8AD3,0x8AD4,0x8AD5,0x8AD6,0x8AD7,0x8AD8,0x8AD9,0x8ADA,/* 0x90-0x97 */
        0x8ADB,0x8ADC,0x8ADD,0x8ADE,0x8ADF,0x8AE0,0x8AE1,0x8AE2,/* 0x98-0x9F */
        0x8AE3,0x94E1,0x95F8,0x7728,0x6805,0x69A8,0x548B,0x4E4D,/* 0xA0-0xA7 */
        0x70B8,0x8BC8,0x6458,0x658B,0x5B85,0x7A84,0x503A,0x5BE8,/* 0xA8-0xAF */
        0x77BB,0x6BE1,0x8A79,0x7C98,0x6CBE,0x76CF,0x65A9,0x8F97,/* 0xB0-0xB7 */
        0x5D2D,0x5C55,0x8638,0x6808,0x5360,0x6218,0x7AD9,0x6E5B,/* 0xB8-0xBF */
        0x7EFD,0x6A1F,0x7AE0,0x5F70,0x6F33,0x5F20,0x638C,0x6DA8,/* 0xC0-0xC7 */
        0x6756,0x4E08,0x5E10,0x8D26,0x4ED7,0x80C0,0x7634,0x969C,/* 0xC8-0xCF */
        0x62DB,0x662D,0x627E,0x6CBC,0x8D75,0x7167,0x7F69,0x5146,/* 0xD0-0xD7 */
        0x8087,0x53EC,0x906E,0x6298,0x54F2,0x86F0,0x8F99,0x8005,/* 0xD8-0xDF */
        0x9517,0x8517,0x8FD9,0x6D59,0x73CD,0x659F,0x771F,0x7504,/* 0xE0-0xE7 */
        0x7827,0x81FB,0x8D1E,0x9488,0x4FA6,0x6795,0x75B9,0x8BCA,/* 0xE8-0xEF */
        0x9707,0x632F,0x9547,0x9635,0x84B8,0x6323,0x7741,0x5F81,/* 0xF0-0xF7 */
        0x72F0,0x4E89,0x6014,0x6574,0x62EF,0x6B63,0x653F,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_D6[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8AE4,0x8AE5,0x8AE6,0x8AE7,0x8AE8,0x8AE9,0x8AEA,0x8AEB,/* 0x40-0x47 */
        0x8AEC,0x8AED,0x8AEE,0x8AEF,0x8AF0,0x8AF1,0x8AF2,0x8AF3,/* 0x48-0x4F */
        0x8AF4,0x8AF5,0x8AF6,0x8AF7,0x8AF8,0x8AF9,0x8AFA,0x8AFB,/* 0x50-0x57 */
        0x8AFC,0x8AFD,0x8AFE,0x8AFF,0x8B00,0x8B01,0x8B02,0x8B03,/* 0x58-0x5F */
        0x8B04,0x8B05,0x8B06,0x8B08,0x8B09,0x8B0A,0x8B0B,0x8B0C,/* 0x60-0x67 */
        0x8B0D,0x8B0E,0x8B0F,0x8B10,0x8B11,0x8B12,0x8B13,0x8B14,/* 0x68-0x6F */
        0x8B15,0x8B16,0x8B17,0x8B18,0x8B19,0x8B1A,0x8B1B,0x8B1C,/* 0x70-0x77 */
        0x8B1D,0x8B1E,0x8B1F,0x8B20,0x8B21,0x8B22,0x8B23,0x0000,/* 0x78-0x7F */

        0x8B24,0x8B25,0x8B27,0x8B28,0x8B29,0x8B2A,0x8B2B,0x8B2C,/* 0x80-0x87 */
        0x8B2D,0x8B2E,0x8B2F,0x8B30,0x8B31,0x8B32,0x8B33,0x8B34,/* 0x88-0x8F */
        0x8B35,0x8B36,0x8B37,0x8B38,0x8B39,0x8B3A,0x8B3B,0x8B3C,/* 0x90-0x97 */
        0x8B3D,0x8B3E,0x8B3F,0x8B40,0x8B41,0x8B42,0x8B43,0x8B44,/* 0x98-0x9F */
        0x8B45,0x5E27,0x75C7,0x90D1,0x8BC1,0x829D,0x679D,0x652F,/* 0xA0-0xA7 */
        0x5431,0x8718,0x77E5,0x80A2,0x8102,0x6C41,0x4E4B,0x7EC7,/* 0xA8-0xAF */
        0x804C,0x76F4,0x690D,0x6B96,0x6267,0x503C,0x4F84,0x5740,/* 0xB0-0xB7 */
        0x6307,0x6B62,0x8DBE,0x53EA,0x65E8,0x7EB8,0x5FD7,0x631A,/* 0xB8-0xBF */
        0x63B7,0x81F3,0x81F4,0x7F6E,0x5E1C,0x5CD9,0x5236,0x667A,/* 0xC0-0xC7 */
        0x79E9,0x7A1A,0x8D28,0x7099,0x75D4,0x6EDE,0x6CBB,0x7A92,/* 0xC8-0xCF */
        0x4E2D,0x76C5,0x5FE0,0x949F,0x8877,0x7EC8,0x79CD,0x80BF,/* 0xD0-0xD7 */
        0x91CD,0x4EF2,0x4F17,0x821F,0x5468,0x5DDE,0x6D32,0x8BCC,/* 0xD8-0xDF */
        0x7CA5,0x8F74,0x8098,0x5E1A,0x5492,0x76B1,0x5B99,0x663C,/* 0xE0-0xE7 */
        0x9AA4,0x73E0,0x682A,0x86DB,0x6731,0x732A,0x8BF8,0x8BDB,/* 0xE8-0xEF */
        0x9010,0x7AF9,0x70DB,0x716E,0x62C4,0x77A9,0x5631,0x4E3B,/* 0xF0-0xF7 */
        0x8457,0x67F1,0x52A9,0x86C0,0x8D2E,0x94F8,0x7B51,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_D7[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8B46,0x8B47,0x8B48,0x8B49,0x8B4A,0x8B4B,0x8B4C,0x8B4D,/* 0x40-0x47 */
        0x8B4E,0x8B4F,0x8B50,0x8B51,0x8B52,0x8B53,0x8B54,0x8B55,/* 0x48-0x4F */
        0x8B56,0x8B57,0x8B58,0x8B59,0x8B5A,0x8B5B,0x8B5C,0x8B5D,/* 0x50-0x57 */
        0x8B5E,0x8B5F,0x8B60,0x8B61,0x8B62,0x8B63,0x8B64,0x8B65,/* 0x58-0x5F */
        0x8B67,0x8B68,0x8B69,0x8B6A,0x8B6B,0x8B6D,0x8B6E,0x8B6F,/* 0x60-0x67 */
        0x8B70,0x8B71,0x8B72,0x8B73,0x8B74,0x8B75,0x8B76,0x8B77,/* 0x68-0x6F */
        0x8B78,0x8B79,0x8B7A,0x8B7B,0x8B7C,0x8B7D,0x8B7E,0x8B7F,/* 0x70-0x77 */
        0x8B80,0x8B81,0x8B82,0x8B83,0x8B84,0x8B85,0x8B86,0x0000,/* 0x78-0x7F */

        0x8B87,0x8B88,0x8B89,0x8B8A,0x8B8B,0x8B8C,0x8B8D,0x8B8E,/* 0x80-0x87 */
        0x8B8F,0x8B90,0x8B91,0x8B92,0x8B93,0x8B94,0x8B95,0x8B96,/* 0x88-0x8F */
        0x8B97,0x8B98,0x8B99,0x8B9A,0x8B9B,0x8B9C,0x8B9D,0x8B9E,/* 0x90-0x97 */
        0x8B9F,0x8BAC,0x8BB1,0x8BBB,0x8BC7,0x8BD0,0x8BEA,0x8C09,/* 0x98-0x9F */
        0x8C1E,0x4F4F,0x6CE8,0x795D,0x9A7B,0x6293,0x722A,0x62FD,/* 0xA0-0xA7 */
        0x4E13,0x7816,0x8F6C,0x64B0,0x8D5A,0x7BC6,0x6869,0x5E84,/* 0xA8-0xAF */
        0x88C5,0x5986,0x649E,0x58EE,0x72B6,0x690E,0x9525,0x8FFD,/* 0xB0-0xB7 */
        0x8D58,0x5760,0x7F00,0x8C06,0x51C6,0x6349,0x62D9,0x5353,/* 0xB8-0xBF */
        0x684C,0x7422,0x8301,0x914C,0x5544,0x7740,0x707C,0x6D4A,/* 0xC0-0xC7 */
        0x5179,0x54A8,0x8D44,0x59FF,0x6ECB,0x6DC4,0x5B5C,0x7D2B,/* 0xC8-0xCF */
        0x4ED4,0x7C7D,0x6ED3,0x5B50,0x81EA,0x6E0D,0x5B57,0x9B03,/* 0xD0-0xD7 */
        0x68D5,0x8E2A,0x5B97,0x7EFC,0x603B,0x7EB5,0x90B9,0x8D70,/* 0xD8-0xDF */
        0x594F,0x63CD,0x79DF,0x8DB3,0x5352,0x65CF,0x7956,0x8BC5,/* 0xE0-0xE7 */
        0x963B,0x7EC4,0x94BB,0x7E82,0x5634,0x9189,0x6700,0x7F6A,/* 0xE8-0xEF */
        0x5C0A,0x9075,0x6628,0x5DE6,0x4F50,0x67DE,0x505A,0x4F5C,/* 0xF0-0xF7 */
        0x5750,0x5EA7,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_D8[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8C38,0x8C39,0x8C3A,0x8C3B,0x8C3C,0x8C3D,0x8C3E,0x8C3F,/* 0x40-0x47 */
        0x8C40,0x8C42,0x8C43,0x8C44,0x8C45,0x8C48,0x8C4A,0x8C4B,/* 0x48-0x4F */
        0x8C4D,0x8C4E,0x8C4F,0x8C50,0x8C51,0x8C52,0x8C53,0x8C54,/* 0x50-0x57 */
        0x8C56,0x8C57,0x8C58,0x8C59,0x8C5B,0x8C5C,0x8C5D,0x8C5E,/* 0x58-0x5F */
        0x8C5F,0x8C60,0x8C63,0x8C64,0x8C65,0x8C66,0x8C67,0x8C68,/* 0x60-0x67 */
        0x8C69,0x8C6C,0x8C6D,0x8C6E,0x8C6F,0x8C70,0x8C71,0x8C72,/* 0x68-0x6F */
        0x8C74,0x8C75,0x8C76,0x8C77,0x8C7B,0x8C7C,0x8C7D,0x8C7E,/* 0x70-0x77 */
        0x8C7F,0x8C80,0x8C81,0x8C83,0x8C84,0x8C86,0x8C87,0x0000,/* 0x78-0x7F */

        0x8C88,0x8C8B,0x8C8D,0x8C8E,0x8C8F,0x8C90,0x8C91,0x8C92,/* 0x80-0x87 */
        0x8C93,0x8C95,0x8C96,0x8C97,0x8C99,0x8C9A,0x8C9B,0x8C9C,/* 0x88-0x8F */
        0x8C9D,0x8C9E,0x8C9F,0x8CA0,0x8CA1,0x8CA2,0x8CA3,0x8CA4,/* 0x90-0x97 */
        0x8CA5,0x8CA6,0x8CA7,0x8CA8,0x8CA9,0x8CAA,0x8CAB,0x8CAC,/* 0x98-0x9F */
        0x8CAD,0x4E8D,0x4E0C,0x5140,0x4E10,0x5EFF,0x5345,0x4E15,/* 0xA0-0xA7 */
        0x4E98,0x4E1E,0x9B32,0x5B6C,0x5669,0x4E28,0x79BA,0x4E3F,/* 0xA8-0xAF */
        0x5315,0x4E47,0x592D,0x723B,0x536E,0x6C10,0x56DF,0x80E4,/* 0xB0-0xB7 */
        0x9997,0x6BD3,0x777E,0x9F17,0x4E36,0x4E9F,0x9F10,0x4E5C,/* 0xB8-0xBF */
        0x4E69,0x4E93,0x8288,0x5B5B,0x556C,0x560F,0x4EC4,0x538D,/* 0xC0-0xC7 */
        0x539D,0x53A3,0x53A5,0x53AE,0x9765,0x8D5D,0x531A,0x53F5,/* 0xC8-0xCF */
        0x5326,0x532E,0x533E,0x8D5C,0x5366,0x5363,0x5202,0x5208,/* 0xD0-0xD7 */
        0x520E,0x522D,0x5233,0x523F,0x5240,0x524C,0x525E,0x5261,/* 0xD8-0xDF */
        0x525C,0x84AF,0x527D,0x5282,0x5281,0x5290,0x5293,0x5182,/* 0xE0-0xE7 */
        0x7F54,0x4EBB,0x4EC3,0x4EC9,0x4EC2,0x4EE8,0x4EE1,0x4EEB,/* 0xE8-0xEF */
        0x4EDE,0x4F1B,0x4EF3,0x4F22,0x4F64,0x4EF5,0x4F25,0x4F27,/* 0xF0-0xF7 */
        0x4F09,0x4F2B,0x4F5E,0x4F67,0x6538,0x4F5A,0x4F5D,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_D9[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8CAE,0x8CAF,0x8CB0,0x8CB1,0x8CB2,0x8CB3,0x8CB4,0x8CB5,/* 0x40-0x47 */
        0x8CB6,0x8CB7,0x8CB8,0x8CB9,0x8CBA,0x8CBB,0x8CBC,0x8CBD,/* 0x48-0x4F */
        0x8CBE,0x8CBF,0x8CC0,0x8CC1,0x8CC2,0x8CC3,0x8CC4,0x8CC5,/* 0x50-0x57 */
        0x8CC6,0x8CC7,0x8CC8,0x8CC9,0x8CCA,0x8CCB,0x8CCC,0x8CCD,/* 0x58-0x5F */
        0x8CCE,0x8CCF,0x8CD0,0x8CD1,0x8CD2,0x8CD3,0x8CD4,0x8CD5,/* 0x60-0x67 */
        0x8CD6,0x8CD7,0x8CD8,0x8CD9,0x8CDA,0x8CDB,0x8CDC,0x8CDD,/* 0x68-0x6F */
        0x8CDE,0x8CDF,0x8CE0,0x8CE1,0x8CE2,0x8CE3,0x8CE4,0x8CE5,/* 0x70-0x77 */
        0x8CE6,0x8CE7,0x8CE8,0x8CE9,0x8CEA,0x8CEB,0x8CEC,0x0000,/* 0x78-0x7F */

        0x8CED,0x8CEE,0x8CEF,0x8CF0,0x8CF1,0x8CF2,0x8CF3,0x8CF4,/* 0x80-0x87 */
        0x8CF5,0x8CF6,0x8CF7,0x8CF8,0x8CF9,0x8CFA,0x8CFB,0x8CFC,/* 0x88-0x8F */
        0x8CFD,0x8CFE,0x8CFF,0x8D00,0x8D01,0x8D02,0x8D03,0x8D04,/* 0x90-0x97 */
        0x8D05,0x8D06,0x8D07,0x8D08,0x8D09,0x8D0A,0x8D0B,0x8D0C,/* 0x98-0x9F */
        0x8D0D,0x4F5F,0x4F57,0x4F32,0x4F3D,0x4F76,0x4F74,0x4F91,/* 0xA0-0xA7 */
        0x4F89,0x4F83,0x4F8F,0x4F7E,0x4F7B,0x4FAA,0x4F7C,0x4FAC,/* 0xA8-0xAF */
        0x4F94,0x4FE6,0x4FE8,0x4FEA,0x4FC5,0x4FDA,0x4FE3,0x4FDC,/* 0xB0-0xB7 */
        0x4FD1,0x4FDF,0x4FF8,0x5029,0x504C,0x4FF3,0x502C,0x500F,/* 0xB8-0xBF */
        0x502E,0x502D,0x4FFE,0x501C,0x500C,0x5025,0x5028,0x507E,/* 0xC0-0xC7 */
        0x5043,0x5055,0x5048,0x504E,0x506C,0x507B,0x50A5,0x50A7,/* 0xC8-0xCF */
        0x50A9,0x50BA,0x50D6,0x5106,0x50ED,0x50EC,0x50E6,0x50EE,/* 0xD0-0xD7 */
        0x5107,0x510B,0x4EDD,0x6C3D,0x4F58,0x4F65,0x4FCE,0x9FA0,/* 0xD8-0xDF */
        0x6C46,0x7C74,0x516E,0x5DFD,0x9EC9,0x9998,0x5181,0x5914,/* 0xE0-0xE7 */
        0x52F9,0x530D,0x8A07,0x5310,0x51EB,0x5919,0x5155,0x4EA0,/* 0xE8-0xEF */
        0x5156,0x4EB3,0x886E,0x88A4,0x4EB5,0x8114,0x88D2,0x7980,/* 0xF0-0xF7 */
        0x5B34,0x8803,0x7FB8,0x51AB,0x51B1,0x51BD,0x51BC,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_DA[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8D0E,0x8D0F,0x8D10,0x8D11,0x8D12,0x8D13,0x8D14,0x8D15,/* 0x40-0x47 */
        0x8D16,0x8D17,0x8D18,0x8D19,0x8D1A,0x8D1B,0x8D1C,0x8D20,/* 0x48-0x4F */
        0x8D51,0x8D52,0x8D57,0x8D5F,0x8D65,0x8D68,0x8D69,0x8D6A,/* 0x50-0x57 */
        0x8D6C,0x8D6E,0x8D6F,0x8D71,0x8D72,0x8D78,0x8D79,0x8D7A,/* 0x58-0x5F */
        0x8D7B,0x8D7C,0x8D7D,0x8D7E,0x8D7F,0x8D80,0x8D82,0x8D83,/* 0x60-0x67 */
        0x8D86,0x8D87,0x8D88,0x8D89,0x8D8C,0x8D8D,0x8D8E,0x8D8F,/* 0x68-0x6F */
        0x8D90,0x8D92,0x8D93,0x8D95,0x8D96,0x8D97,0x8D98,0x8D99,/* 0x70-0x77 */
        0x8D9A,0x8D9B,0x8D9C,0x8D9D,0x8D9E,0x8DA0,0x8DA1,0x0000,/* 0x78-0x7F */

        0x8DA2,0x8DA4,0x8DA5,0x8DA6,0x8DA7,0x8DA8,0x8DA9,0x8DAA,/* 0x80-0x87 */
        0x8DAB,0x8DAC,0x8DAD,0x8DAE,0x8DAF,0x8DB0,0x8DB2,0x8DB6,/* 0x88-0x8F */
        0x8DB7,0x8DB9,0x8DBB,0x8DBD,0x8DC0,0x8DC1,0x8DC2,0x8DC5,/* 0x90-0x97 */
        0x8DC7,0x8DC8,0x8DC9,0x8DCA,0x8DCD,0x8DD0,0x8DD2,0x8DD3,/* 0x98-0x9F */
        0x8DD4,0x51C7,0x5196,0x51A2,0x51A5,0x8BA0,0x8BA6,0x8BA7,/* 0xA0-0xA7 */
        0x8BAA,0x8BB4,0x8BB5,0x8BB7,0x8BC2,0x8BC3,0x8BCB,0x8BCF,/* 0xA8-0xAF */
        0x8BCE,0x8BD2,0x8BD3,0x8BD4,0x8BD6,0x8BD8,0x8BD9,0x8BDC,/* 0xB0-0xB7 */
        0x8BDF,0x8BE0,0x8BE4,0x8BE8,0x8BE9,0x8BEE,0x8BF0,0x8BF3,/* 0xB8-0xBF */
        0x8BF6,0x8BF9,0x8BFC,0x8BFF,0x8C00,0x8C02,0x8C04,0x8C07,/* 0xC0-0xC7 */
        0x8C0C,0x8C0F,0x8C11,0x8C12,0x8C14,0x8C15,0x8C16,0x8C19,/* 0xC8-0xCF */
        0x8C1B,0x8C18,0x8C1D,0x8C1F,0x8C20,0x8C21,0x8C25,0x8C27,/* 0xD0-0xD7 */
        0x8C2A,0x8C2B,0x8C2E,0x8C2F,0x8C32,0x8C33,0x8C35,0x8C36,/* 0xD8-0xDF */
        0x5369,0x537A,0x961D,0x9622,0x9621,0x9631,0x962A,0x963D,/* 0xE0-0xE7 */
        0x963C,0x9642,0x9649,0x9654,0x965F,0x9667,0x966C,0x9672,/* 0xE8-0xEF */
        0x9674,0x9688,0x968D,0x9697,0x96B0,0x9097,0x909B,0x909D,/* 0xF0-0xF7 */
        0x9099,0x90AC,0x90A1,0x90B4,0x90B3,0x90B6,0x90BA,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_DB[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8DD5,0x8DD8,0x8DD9,0x8DDC,0x8DE0,0x8DE1,0x8DE2,0x8DE5,/* 0x40-0x47 */
        0x8DE6,0x8DE7,0x8DE9,0x8DED,0x8DEE,0x8DF0,0x8DF1,0x8DF2,/* 0x48-0x4F */
        0x8DF4,0x8DF6,0x8DFC,0x8DFE,0x8DFF,0x8E00,0x8E01,0x8E02,/* 0x50-0x57 */
        0x8E03,0x8E04,0x8E06,0x8E07,0x8E08,0x8E0B,0x8E0D,0x8E0E,/* 0x58-0x5F */
        0x8E10,0x8E11,0x8E12,0x8E13,0x8E15,0x8E16,0x8E17,0x8E18,/* 0x60-0x67 */
        0x8E19,0x8E1A,0x8E1B,0x8E1C,0x8E20,0x8E21,0x8E24,0x8E25,/* 0x68-0x6F */
        0x8E26,0x8E27,0x8E28,0x8E2B,0x8E2D,0x8E30,0x8E32,0x8E33,/* 0x70-0x77 */
        0x8E34,0x8E36,0x8E37,0x8E38,0x8E3B,0x8E3C,0x8E3E,0x0000,/* 0x78-0x7F */

        0x8E3F,0x8E43,0x8E45,0x8E46,0x8E4C,0x8E4D,0x8E4E,0x8E4F,/* 0x80-0x87 */
        0x8E50,0x8E53,0x8E54,0x8E55,0x8E56,0x8E57,0x8E58,0x8E5A,/* 0x88-0x8F */
        0x8E5B,0x8E5C,0x8E5D,0x8E5E,0x8E5F,0x8E60,0x8E61,0x8E62,/* 0x90-0x97 */
        0x8E63,0x8E64,0x8E65,0x8E67,0x8E68,0x8E6A,0x8E6B,0x8E6E,/* 0x98-0x9F */
        0x8E71,0x90B8,0x90B0,0x90CF,0x90C5,0x90BE,0x90D0,0x90C4,/* 0xA0-0xA7 */
        0x90C7,0x90D3,0x90E6,0x90E2,0x90DC,0x90D7,0x90DB,0x90EB,/* 0xA8-0xAF */
        0x90EF,0x90FE,0x9104,0x9122,0x911E,0x9123,0x9131,0x912F,/* 0xB0-0xB7 */
        0x9139,0x9143,0x9146,0x520D,0x5942,0x52A2,0x52AC,0x52AD,/* 0xB8-0xBF */
        0x52BE,0x54FF,0x52D0,0x52D6,0x52F0,0x53DF,0x71EE,0x77CD,/* 0xC0-0xC7 */
        0x5EF4,0x51F5,0x51FC,0x9B2F,0x53B6,0x5F01,0x755A,0x5DEF,/* 0xC8-0xCF */
        0x574C,0x57A9,0x57A1,0x587E,0x58BC,0x58C5,0x58D1,0x5729,/* 0xD0-0xD7 */
        0x572C,0x572A,0x5733,0x5739,0x572E,0x572F,0x575C,0x573B,/* 0xD8-0xDF */
        0x5742,0x5769,0x5785,0x576B,0x5786,0x577C,0x577B,0x5768,/* 0xE0-0xE7 */
        0x576D,0x5776,0x5773,0x57AD,0x57A4,0x578C,0x57B2,0x57CF,/* 0xE8-0xEF */
        0x57A7,0x57B4,0x5793,0x57A0,0x57D5,0x57D8,0x57DA,0x57D9,/* 0xF0-0xF7 */
        0x57D2,0x57B8,0x57F4,0x57EF,0x57F8,0x57E4,0x57DD,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_DC[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8E73,0x8E75,0x8E77,0x8E78,0x8E79,0x8E7A,0x8E7B,0x8E7D,/* 0x40-0x47 */
        0x8E7E,0x8E80,0x8E82,0x8E83,0x8E84,0x8E86,0x8E88,0x8E89,/* 0x48-0x4F */
        0x8E8A,0x8E8B,0x8E8C,0x8E8D,0x8E8E,0x8E91,0x8E92,0x8E93,/* 0x50-0x57 */
        0x8E95,0x8E96,0x8E97,0x8E98,0x8E99,0x8E9A,0x8E9B,0x8E9D,/* 0x58-0x5F */
        0x8E9F,0x8EA0,0x8EA1,0x8EA2,0x8EA3,0x8EA4,0x8EA5,0x8EA6,/* 0x60-0x67 */
        0x8EA7,0x8EA8,0x8EA9,0x8EAA,0x8EAD,0x8EAE,0x8EB0,0x8EB1,/* 0x68-0x6F */
        0x8EB3,0x8EB4,0x8EB5,0x8EB6,0x8EB7,0x8EB8,0x8EB9,0x8EBB,/* 0x70-0x77 */
        0x8EBC,0x8EBD,0x8EBE,0x8EBF,0x8EC0,0x8EC1,0x8EC2,0x0000,/* 0x78-0x7F */

        0x8EC3,0x8EC4,0x8EC5,0x8EC6,0x8EC7,0x8EC8,0x8EC9,0x8ECA,/* 0x80-0x87 */
        0x8ECB,0x8ECC,0x8ECD,0x8ECF,0x8ED0,0x8ED1,0x8ED2,0x8ED3,/* 0x88-0x8F */
        0x8ED4,0x8ED5,0x8ED6,0x8ED7,0x8ED8,0x8ED9,0x8EDA,0x8EDB,/* 0x90-0x97 */
        0x8EDC,0x8EDD,0x8EDE,0x8EDF,0x8EE0,0x8EE1,0x8EE2,0x8EE3,/* 0x98-0x9F */
        0x8EE4,0x580B,0x580D,0x57FD,0x57ED,0x5800,0x581E,0x5819,/* 0xA0-0xA7 */
        0x5844,0x5820,0x5865,0x586C,0x5881,0x5889,0x589A,0x5880,/* 0xA8-0xAF */
        0x99A8,0x9F19,0x61FF,0x8279,0x827D,0x827F,0x828F,0x828A,/* 0xB0-0xB7 */
        0x82A8,0x8284,0x828E,0x8291,0x8297,0x8299,0x82AB,0x82B8,/* 0xB8-0xBF */
        0x82BE,0x82B0,0x82C8,0x82CA,0x82E3,0x8298,0x82B7,0x82AE,/* 0xC0-0xC7 */
        0x82CB,0x82CC,0x82C1,0x82A9,0x82B4,0x82A1,0x82AA,0x829F,/* 0xC8-0xCF */
        0x82C4,0x82CE,0x82A4,0x82E1,0x8309,0x82F7,0x82E4,0x830F,/* 0xD0-0xD7 */
        0x8307,0x82DC,0x82F4,0x82D2,0x82D8,0x830C,0x82FB,0x82D3,/* 0xD8-0xDF */
        0x8311,0x831A,0x8306,0x8314,0x8315,0x82E0,0x82D5,0x831C,/* 0xE0-0xE7 */
        0x8351,0x835B,0x835C,0x8308,0x8392,0x833C,0x8334,0x8331,/* 0xE8-0xEF */
        0x839B,0x835E,0x832F,0x834F,0x8347,0x8343,0x835F,0x8340,/* 0xF0-0xF7 */
        0x8317,0x8360,0x832D,0x833A,0x8333,0x8366,0x8365,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_DD[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8EE5,0x8EE6,0x8EE7,0x8EE8,0x8EE9,0x8EEA,0x8EEB,0x8EEC,/* 0x40-0x47 */
        0x8EED,0x8EEE,0x8EEF,0x8EF0,0x8EF1,0x8EF2,0x8EF3,0x8EF4,/* 0x48-0x4F */
        0x8EF5,0x8EF6,0x8EF7,0x8EF8,0x8EF9,0x8EFA,0x8EFB,0x8EFC,/* 0x50-0x57 */
        0x8EFD,0x8EFE,0x8EFF,0x8F00,0x8F01,0x8F02,0x8F03,0x8F04,/* 0x58-0x5F */
        0x8F05,0x8F06,0x8F07,0x8F08,0x8F09,0x8F0A,0x8F0B,0x8F0C,/* 0x60-0x67 */
        0x8F0D,0x8F0E,0x8F0F,0x8F10,0x8F11,0x8F12,0x8F13,0x8F14,/* 0x68-0x6F */
        0x8F15,0x8F16,0x8F17,0x8F18,0x8F19,0x8F1A,0x8F1B,0x8F1C,/* 0x70-0x77 */
        0x8F1D,0x8F1E,0x8F1F,0x8F20,0x8F21,0x8F22,0x8F23,0x0000,/* 0x78-0x7F */

        0x8F24,0x8F25,0x8F26,0x8F27,0x8F28,0x8F29,0x8F2A,0x8F2B,/* 0x80-0x87 */
        0x8F2C,0x8F2D,0x8F2E,0x8F2F,0x8F30,0x8F31,0x8F32,0x8F33,/* 0x88-0x8F */
        0x8F34,0x8F35,0x8F36,0x8F37,0x8F38,0x8F39,0x8F3A,0x8F3B,/* 0x90-0x97 */
        0x8F3C,0x8F3D,0x8F3E,0x8F3F,0x8F40,0x8F41,0x8F42,0x8F43,/* 0x98-0x9F */
        0x8F44,0x8368,0x831B,0x8369,0x836C,0x836A,0x836D,0x836E,/* 0xA0-0xA7 */
        0x83B0,0x8378,0x83B3,0x83B4,0x83A0,0x83AA,0x8393,0x839C,/* 0xA8-0xAF */
        0x8385,0x837C,0x83B6,0x83A9,0x837D,0x83B8,0x837B,0x8398,/* 0xB0-0xB7 */
        0x839E,0x83A8,0x83BA,0x83BC,0x83C1,0x8401,0x83E5,0x83D8,/* 0xB8-0xBF */
        0x5807,0x8418,0x840B,0x83DD,0x83FD,0x83D6,0x841C,0x8438,/* 0xC0-0xC7 */
        0x8411,0x8406,0x83D4,0x83DF,0x840F,0x8403,0x83F8,0x83F9,/* 0xC8-0xCF */
        0x83EA,0x83C5,0x83C0,0x8426,0x83F0,0x83E1,0x845C,0x8451,/* 0xD0-0xD7 */
        0x845A,0x8459,0x8473,0x8487,0x8488,0x847A,0x8489,0x8478,/* 0xD8-0xDF */
        0x843C,0x8446,0x8469,0x8476,0x848C,0x848E,0x8431,0x846D,/* 0xE0-0xE7 */
        0x84C1,0x84CD,0x84D0,0x84E6,0x84BD,0x84D3,0x84CA,0x84BF,/* 0xE8-0xEF */
        0x84BA,0x84E0,0x84A1,0x84B9,0x84B4,0x8497,0x84E5,0x84E3,/* 0xF0-0xF7 */
        0x850C,0x750D,0x8538,0x84F0,0x8539,0x851F,0x853A,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_DE[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x8F45,0x8F46,0x8F47,0x8F48,0x8F49,0x8F4A,0x8F4B,0x8F4C,/* 0x40-0x47 */
        0x8F4D,0x8F4E,0x8F4F,0x8F50,0x8F51,0x8F52,0x8F53,0x8F54,/* 0x48-0x4F */
        0x8F55,0x8F56,0x8F57,0x8F58,0x8F59,0x8F5A,0x8F5B,0x8F5C,/* 0x50-0x57 */
        0x8F5D,0x8F5E,0x8F5F,0x8F60,0x8F61,0x8F62,0x8F63,0x8F64,/* 0x58-0x5F */
        0x8F65,0x8F6A,0x8F80,0x8F8C,0x8F92,0x8F9D,0x8FA0,0x8FA1,/* 0x60-0x67 */
        0x8FA2,0x8FA4,0x8FA5,0x8FA6,0x8FA7,0x8FAA,0x8FAC,0x8FAD,/* 0x68-0x6F */
        0x8FAE,0x8FAF,0x8FB2,0x8FB3,0x8FB4,0x8FB5,0x8FB7,0x8FB8,/* 0x70-0x77 */
        0x8FBA,0x8FBB,0x8FBC,0x8FBF,0x8FC0,0x8FC3,0x8FC6,0x0000,/* 0x78-0x7F */

        0x8FC9,0x8FCA,0x8FCB,0x8FCC,0x8FCD,0x8FCF,0x8FD2,0x8FD6,/* 0x80-0x87 */
        0x8FD7,0x8FDA,0x8FE0,0x8FE1,0x8FE3,0x8FE7,0x8FEC,0x8FEF,/* 0x88-0x8F */
        0x8FF1,0x8FF2,0x8FF4,0x8FF5,0x8FF6,0x8FFA,0x8FFB,0x8FFC,/* 0x90-0x97 */
        0x8FFE,0x8FFF,0x9007,0x9008,0x900C,0x900E,0x9013,0x9015,/* 0x98-0x9F */
        0x9018,0x8556,0x853B,0x84FF,0x84FC,0x8559,0x8548,0x8568,/* 0xA0-0xA7 */
        0x8564,0x855E,0x857A,0x77A2,0x8543,0x8572,0x857B,0x85A4,/* 0xA8-0xAF */
        0x85A8,0x8587,0x858F,0x8579,0x85AE,0x859C,0x8585,0x85B9,/* 0xB0-0xB7 */
        0x85B7,0x85B0,0x85D3,0x85C1,0x85DC,0x85FF,0x8627,0x8605,/* 0xB8-0xBF */
        0x8629,0x8616,0x863C,0x5EFE,0x5F08,0x593C,0x5941,0x8037,/* 0xC0-0xC7 */
        0x5955,0x595A,0x5958,0x530F,0x5C22,0x5C25,0x5C2C,0x5C34,/* 0xC8-0xCF */
        0x624C,0x626A,0x629F,0x62BB,0x62CA,0x62DA,0x62D7,0x62EE,/* 0xD0-0xD7 */
        0x6322,0x62F6,0x6339,0x634B,0x6343,0x63AD,0x63F6,0x6371,/* 0xD8-0xDF */
        0x637A,0x638E,0x63B4,0x636D,0x63AC,0x638A,0x6369,0x63AE,/* 0xE0-0xE7 */
        0x63BC,0x63F2,0x63F8,0x63E0,0x63FF,0x63C4,0x63DE,0x63CE,/* 0xE8-0xEF */
        0x6452,0x63C6,0x63BE,0x6445,0x6441,0x640B,0x641B,0x6420,/* 0xF0-0xF7 */
        0x640C,0x6426,0x6421,0x645E,0x6484,0x646D,0x6496,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_DF[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9019,0x901C,0x9023,0x9024,0x9025,0x9027,0x9028,0x9029,/* 0x40-0x47 */
        0x902A,0x902B,0x902C,0x9030,0x9031,0x9032,0x9033,0x9034,/* 0x48-0x4F */
        0x9037,0x9039,0x903A,0x903D,0x903F,0x9040,0x9043,0x9045,/* 0x50-0x57 */
        0x9046,0x9048,0x9049,0x904A,0x904B,0x904C,0x904E,0x9054,/* 0x58-0x5F */
        0x9055,0x9056,0x9059,0x905A,0x905C,0x905D,0x905E,0x905F,/* 0x60-0x67 */
        0x9060,0x9061,0x9064,0x9066,0x9067,0x9069,0x906A,0x906B,/* 0x68-0x6F */
        0x906C,0x906F,0x9070,0x9071,0x9072,0x9073,0x9076,0x9077,/* 0x70-0x77 */
        0x9078,0x9079,0x907A,0x907B,0x907C,0x907E,0x9081,0x0000,/* 0x78-0x7F */

        0x9084,0x9085,0x9086,0x9087,0x9089,0x908A,0x908C,0x908D,/* 0x80-0x87 */
        0x908E,0x908F,0x9090,0x9092,0x9094,0x9096,0x9098,0x909A,/* 0x88-0x8F */
        0x909C,0x909E,0x909F,0x90A0,0x90A4,0x90A5,0x90A7,0x90A8,/* 0x90-0x97 */
        0x90A9,0x90AB,0x90AD,0x90B2,0x90B7,0x90BC,0x90BD,0x90BF,/* 0x98-0x9F */
        0x90C0,0x647A,0x64B7,0x64B8,0x6499,0x64BA,0x64C0,0x64D0,/* 0xA0-0xA7 */
        0x64D7,0x64E4,0x64E2,0x6509,0x6525,0x652E,0x5F0B,0x5FD2,/* 0xA8-0xAF */
        0x7519,0x5F11,0x535F,0x53F1,0x53FD,0x53E9,0x53E8,0x53FB,/* 0xB0-0xB7 */
        0x5412,0x5416,0x5406,0x544B,0x5452,0x5453,0x5454,0x5456,/* 0xB8-0xBF */
        0x5443,0x5421,0x5457,0x5459,0x5423,0x5432,0x5482,0x5494,/* 0xC0-0xC7 */
        0x5477,0x5471,0x5464,0x549A,0x549B,0x5484,0x5476,0x5466,/* 0xC8-0xCF */
        0x549D,0x54D0,0x54AD,0x54C2,0x54B4,0x54D2,0x54A7,0x54A6,/* 0xD0-0xD7 */
        0x54D3,0x54D4,0x5472,0x54A3,0x54D5,0x54BB,0x54BF,0x54CC,/* 0xD8-0xDF */
        0x54D9,0x54DA,0x54DC,0x54A9,0x54AA,0x54A4,0x54DD,0x54CF,/* 0xE0-0xE7 */
        0x54DE,0x551B,0x54E7,0x5520,0x54FD,0x5514,0x54F3,0x5522,/* 0xE8-0xEF */
        0x5523,0x550F,0x5511,0x5527,0x552A,0x5567,0x558F,0x55B5,/* 0xF0-0xF7 */
        0x5549,0x556D,0x5541,0x5555,0x553F,0x5550,0x553C,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_E0[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x90C2,0x90C3,0x90C6,0x90C8,0x90C9,0x90CB,0x90CC,0x90CD,/* 0x40-0x47 */
        0x90D2,0x90D4,0x90D5,0x90D6,0x90D8,0x90D9,0x90DA,0x90DE,/* 0x48-0x4F */
        0x90DF,0x90E0,0x90E3,0x90E4,0x90E5,0x90E9,0x90EA,0x90EC,/* 0x50-0x57 */
        0x90EE,0x90F0,0x90F1,0x90F2,0x90F3,0x90F5,0x90F6,0x90F7,/* 0x58-0x5F */
        0x90F9,0x90FA,0x90FB,0x90FC,0x90FF,0x9100,0x9101,0x9103,/* 0x60-0x67 */
        0x9105,0x9106,0x9107,0x9108,0x9109,0x910A,0x910B,0x910C,/* 0x68-0x6F */
        0x910D,0x910E,0x910F,0x9110,0x9111,0x9112,0x9113,0x9114,/* 0x70-0x77 */
        0x9115,0x9116,0x9117,0x9118,0x911A,0x911B,0x911C,0x0000,/* 0x78-0x7F */

        0x911D,0x911F,0x9120,0x9121,0x9124,0x9125,0x9126,0x9127,/* 0x80-0x87 */
        0x9128,0x9129,0x912A,0x912B,0x912C,0x912D,0x912E,0x9130,/* 0x88-0x8F */
        0x9132,0x9133,0x9134,0x9135,0x9136,0x9137,0x9138,0x913A,/* 0x90-0x97 */
        0x913B,0x913C,0x913D,0x913E,0x913F,0x9140,0x9141,0x9142,/* 0x98-0x9F */
        0x9144,0x5537,0x5556,0x5575,0x5576,0x5577,0x5533,0x5530,/* 0xA0-0xA7 */
        0x555C,0x558B,0x55D2,0x5583,0x55B1,0x55B9,0x5588,0x5581,/* 0xA8-0xAF */
        0x559F,0x557E,0x55D6,0x5591,0x557B,0x55DF,0x55BD,0x55BE,/* 0xB0-0xB7 */
        0x5594,0x5599,0x55EA,0x55F7,0x55C9,0x561F,0x55D1,0x55EB,/* 0xB8-0xBF */
        0x55EC,0x55D4,0x55E6,0x55DD,0x55C4,0x55EF,0x55E5,0x55F2,/* 0xC0-0xC7 */
        0x55F3,0x55CC,0x55CD,0x55E8,0x55F5,0x55E4,0x8F94,0x561E,/* 0xC8-0xCF */
        0x5608,0x560C,0x5601,0x5624,0x5623,0x55FE,0x5600,0x5627,/* 0xD0-0xD7 */
        0x562D,0x5658,0x5639,0x5657,0x562C,0x564D,0x5662,0x5659,/* 0xD8-0xDF */
        0x565C,0x564C,0x5654,0x5686,0x5664,0x5671,0x566B,0x567B,/* 0xE0-0xE7 */
        0x567C,0x5685,0x5693,0x56AF,0x56D4,0x56D7,0x56DD,0x56E1,/* 0xE8-0xEF */
        0x56F5,0x56EB,0x56F9,0x56FF,0x5704,0x570A,0x5709,0x571C,/* 0xF0-0xF7 */
        0x5E0F,0x5E19,0x5E14,0x5E11,0x5E31,0x5E3B,0x5E3C,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_E1[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9145,0x9147,0x9148,0x9151,0x9153,0x9154,0x9155,0x9156,/* 0x40-0x47 */
        0x9158,0x9159,0x915B,0x915C,0x915F,0x9160,0x9166,0x9167,/* 0x48-0x4F */
        0x9168,0x916B,0x916D,0x9173,0x917A,0x917B,0x917C,0x9180,/* 0x50-0x57 */
        0x9181,0x9182,0x9183,0x9184,0x9186,0x9188,0x918A,0x918E,/* 0x58-0x5F */
        0x918F,0x9193,0x9194,0x9195,0x9196,0x9197,0x9198,0x9199,/* 0x60-0x67 */
        0x919C,0x919D,0x919E,0x919F,0x91A0,0x91A1,0x91A4,0x91A5,/* 0x68-0x6F */
        0x91A6,0x91A7,0x91A8,0x91A9,0x91AB,0x91AC,0x91B0,0x91B1,/* 0x70-0x77 */
        0x91B2,0x91B3,0x91B6,0x91B7,0x91B8,0x91B9,0x91BB,0x0000,/* 0x78-0x7F */

        0x91BC,0x91BD,0x91BE,0x91BF,0x91C0,0x91C1,0x91C2,0x91C3,/* 0x80-0x87 */
        0x91C4,0x91C5,0x91C6,0x91C8,0x91CB,0x91D0,0x91D2,0x91D3,/* 0x88-0x8F */
        0x91D4,0x91D5,0x91D6,0x91D7,0x91D8,0x91D9,0x91DA,0x91DB,/* 0x90-0x97 */
        0x91DD,0x91DE,0x91DF,0x91E0,0x91E1,0x91E2,0x91E3,0x91E4,/* 0x98-0x9F */
        0x91E5,0x5E37,0x5E44,0x5E54,0x5E5B,0x5E5E,0x5E61,0x5C8C,/* 0xA0-0xA7 */
        0x5C7A,0x5C8D,0x5C90,0x5C96,0x5C88,0x5C98,0x5C99,0x5C91,/* 0xA8-0xAF */
        0x5C9A,0x5C9C,0x5CB5,0x5CA2,0x5CBD,0x5CAC,0x5CAB,0x5CB1,/* 0xB0-0xB7 */
        0x5CA3,0x5CC1,0x5CB7,0x5CC4,0x5CD2,0x5CE4,0x5CCB,0x5CE5,/* 0xB8-0xBF */
        0x5D02,0x5D03,0x5D27,0x5D26,0x5D2E,0x5D24,0x5D1E,0x5D06,/* 0xC0-0xC7 */
        0x5D1B,0x5D58,0x5D3E,0x5D34,0x5D3D,0x5D6C,0x5D5B,0x5D6F,/* 0xC8-0xCF */
        0x5D5D,0x5D6B,0x5D4B,0x5D4A,0x5D69,0x5D74,0x5D82,0x5D99,/* 0xD0-0xD7 */
        0x5D9D,0x8C73,0x5DB7,0x5DC5,0x5F73,0x5F77,0x5F82,0x5F87,/* 0xD8-0xDF */
        0x5F89,0x5F8C,0x5F95,0x5F99,0x5F9C,0x5FA8,0x5FAD,0x5FB5,/* 0xE0-0xE7 */
        0x5FBC,0x8862,0x5F61,0x72AD,0x72B0,0x72B4,0x72B7,0x72B8,/* 0xE8-0xEF */
        0x72C3,0x72C1,0x72CE,0x72CD,0x72D2,0x72E8,0x72EF,0x72E9,/* 0xF0-0xF7 */
        0x72F2,0x72F4,0x72F7,0x7301,0x72F3,0x7303,0x72FA,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_E2[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x91E6,0x91E7,0x91E8,0x91E9,0x91EA,0x91EB,0x91EC,0x91ED,/* 0x40-0x47 */
        0x91EE,0x91EF,0x91F0,0x91F1,0x91F2,0x91F3,0x91F4,0x91F5,/* 0x48-0x4F */
        0x91F6,0x91F7,0x91F8,0x91F9,0x91FA,0x91FB,0x91FC,0x91FD,/* 0x50-0x57 */
        0x91FE,0x91FF,0x9200,0x9201,0x9202,0x9203,0x9204,0x9205,/* 0x58-0x5F */
        0x9206,0x9207,0x9208,0x9209,0x920A,0x920B,0x920C,0x920D,/* 0x60-0x67 */
        0x920E,0x920F,0x9210,0x9211,0x9212,0x9213,0x9214,0x9215,/* 0x68-0x6F */
        0x9216,0x9217,0x9218,0x9219,0x921A,0x921B,0x921C,0x921D,/* 0x70-0x77 */
        0x921E,0x921F,0x9220,0x9221,0x9222,0x9223,0x9224,0x0000,/* 0x78-0x7F */

        0x9225,0x9226,0x9227,0x9228,0x9229,0x922A,0x922B,0x922C,/* 0x80-0x87 */
        0x922D,0x922E,0x922F,0x9230,0x9231,0x9232,0x9233,0x9234,/* 0x88-0x8F */
        0x9235,0x9236,0x9237,0x9238,0x9239,0x923A,0x923B,0x923C,/* 0x90-0x97 */
        0x923D,0x923E,0x923F,0x9240,0x9241,0x9242,0x9243,0x9244,/* 0x98-0x9F */
        0x9245,0x72FB,0x7317,0x7313,0x7321,0x730A,0x731E,0x731D,/* 0xA0-0xA7 */
        0x7315,0x7322,0x7339,0x7325,0x732C,0x7338,0x7331,0x7350,/* 0xA8-0xAF */
        0x734D,0x7357,0x7360,0x736C,0x736F,0x737E,0x821B,0x5925,/* 0xB0-0xB7 */
        0x98E7,0x5924,0x5902,0x9963,0x9967,0x9968,0x9969,0x996A,/* 0xB8-0xBF */
        0x996B,0x996C,0x9974,0x9977,0x997D,0x9980,0x9984,0x9987,/* 0xC0-0xC7 */
        0x998A,0x998D,0x9990,0x9991,0x9993,0x9994,0x9995,0x5E80,/* 0xC8-0xCF */
        0x5E91,0x5E8B,0x5E96,0x5EA5,0x5EA0,0x5EB9,0x5EB5,0x5EBE,/* 0xD0-0xD7 */
        0x5EB3,0x8D53,0x5ED2,0x5ED1,0x5EDB,0x5EE8,0x5EEA,0x81BA,/* 0xD8-0xDF */
        0x5FC4,0x5FC9,0x5FD6,0x5FCF,0x6003,0x5FEE,0x6004,0x5FE1,/* 0xE0-0xE7 */
        0x5FE4,0x5FFE,0x6005,0x6006,0x5FEA,0x5FED,0x5FF8,0x6019,/* 0xE8-0xEF */
        0x6035,0x6026,0x601B,0x600F,0x600D,0x6029,0x602B,0x600A,/* 0xF0-0xF7 */
        0x603F,0x6021,0x6078,0x6079,0x607B,0x607A,0x6042,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_E3[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9246,0x9247,0x9248,0x9249,0x924A,0x924B,0x924C,0x924D,/* 0x40-0x47 */
        0x924E,0x924F,0x9250,0x9251,0x9252,0x9253,0x9254,0x9255,/* 0x48-0x4F */
        0x9256,0x9257,0x9258,0x9259,0x925A,0x925B,0x925C,0x925D,/* 0x50-0x57 */
        0x925E,0x925F,0x9260,0x9261,0x9262,0x9263,0x9264,0x9265,/* 0x58-0x5F */
        0x9266,0x9267,0x9268,0x9269,0x926A,0x926B,0x926C,0x926D,/* 0x60-0x67 */
        0x926E,0x926F,0x9270,0x9271,0x9272,0x9273,0x9275,0x9276,/* 0x68-0x6F */
        0x9277,0x9278,0x9279,0x927A,0x927B,0x927C,0x927D,0x927E,/* 0x70-0x77 */
        0x927F,0x9280,0x9281,0x9282,0x9283,0x9284,0x9285,0x0000,/* 0x78-0x7F */

        0x9286,0x9287,0x9288,0x9289,0x928A,0x928B,0x928C,0x928D,/* 0x80-0x87 */
        0x928F,0x9290,0x9291,0x9292,0x9293,0x9294,0x9295,0x9296,/* 0x88-0x8F */
        0x9297,0x9298,0x9299,0x929A,0x929B,0x929C,0x929D,0x929E,/* 0x90-0x97 */
        0x929F,0x92A0,0x92A1,0x92A2,0x92A3,0x92A4,0x92A5,0x92A6,/* 0x98-0x9F */
        0x92A7,0x606A,0x607D,0x6096,0x609A,0x60AD,0x609D,0x6083,/* 0xA0-0xA7 */
        0x6092,0x608C,0x609B,0x60EC,0x60BB,0x60B1,0x60DD,0x60D8,/* 0xA8-0xAF */
        0x60C6,0x60DA,0x60B4,0x6120,0x6126,0x6115,0x6123,0x60F4,/* 0xB0-0xB7 */
        0x6100,0x610E,0x612B,0x614A,0x6175,0x61AC,0x6194,0x61A7,/* 0xB8-0xBF */
        0x61B7,0x61D4,0x61F5,0x5FDD,0x96B3,0x95E9,0x95EB,0x95F1,/* 0xC0-0xC7 */
        0x95F3,0x95F5,0x95F6,0x95FC,0x95FE,0x9603,0x9604,0x9606,/* 0xC8-0xCF */
        0x9608,0x960A,0x960B,0x960C,0x960D,0x960F,0x9612,0x9615,/* 0xD0-0xD7 */
        0x9616,0x9617,0x9619,0x961A,0x4E2C,0x723F,0x6215,0x6C35,/* 0xD8-0xDF */
        0x6C54,0x6C5C,0x6C4A,0x6CA3,0x6C85,0x6C90,0x6C94,0x6C8C,/* 0xE0-0xE7 */
        0x6C68,0x6C69,0x6C74,0x6C76,0x6C86,0x6CA9,0x6CD0,0x6CD4,/* 0xE8-0xEF */
        0x6CAD,0x6CF7,0x6CF8,0x6CF1,0x6CD7,0x6CB2,0x6CE0,0x6CD6,/* 0xF0-0xF7 */
        0x6CFA,0x6CEB,0x6CEE,0x6CB1,0x6CD3,0x6CEF,0x6CFE,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_E4[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x92A8,0x92A9,0x92AA,0x92AB,0x92AC,0x92AD,0x92AF,0x92B0,/* 0x40-0x47 */
        0x92B1,0x92B2,0x92B3,0x92B4,0x92B5,0x92B6,0x92B7,0x92B8,/* 0x48-0x4F */
        0x92B9,0x92BA,0x92BB,0x92BC,0x92BD,0x92BE,0x92BF,0x92C0,/* 0x50-0x57 */
        0x92C1,0x92C2,0x92C3,0x92C4,0x92C5,0x92C6,0x92C7,0x92C9,/* 0x58-0x5F */
        0x92CA,0x92CB,0x92CC,0x92CD,0x92CE,0x92CF,0x92D0,0x92D1,/* 0x60-0x67 */
        0x92D2,0x92D3,0x92D4,0x92D5,0x92D6,0x92D7,0x92D8,0x92D9,/* 0x68-0x6F */
        0x92DA,0x92DB,0x92DC,0x92DD,0x92DE,0x92DF,0x92E0,0x92E1,/* 0x70-0x77 */
        0x92E2,0x92E3,0x92E4,0x92E5,0x92E6,0x92E7,0x92E8,0x0000,/* 0x78-0x7F */

        0x92E9,0x92EA,0x92EB,0x92EC,0x92ED,0x92EE,0x92EF,0x92F0,/* 0x80-0x87 */
        0x92F1,0x92F2,0x92F3,0x92F4,0x92F5,0x92F6,0x92F7,0x92F8,/* 0x88-0x8F */
        0x92F9,0x92FA,0x92FB,0x92FC,0x92FD,0x92FE,0x92FF,0x9300,/* 0x90-0x97 */
        0x9301,0x9302,0x9303,0x9304,0x9305,0x9306,0x9307,0x9308,/* 0x98-0x9F */
        0x9309,0x6D39,0x6D27,0x6D0C,0x6D43,0x6D48,0x6D07,0x6D04,/* 0xA0-0xA7 */
        0x6D19,0x6D0E,0x6D2B,0x6D4D,0x6D2E,0x6D35,0x6D1A,0x6D4F,/* 0xA8-0xAF */
        0x6D52,0x6D54,0x6D33,0x6D91,0x6D6F,0x6D9E,0x6DA0,0x6D5E,/* 0xB0-0xB7 */
        0x6D93,0x6D94,0x6D5C,0x6D60,0x6D7C,0x6D63,0x6E1A,0x6DC7,/* 0xB8-0xBF */
        0x6DC5,0x6DDE,0x6E0E,0x6DBF,0x6DE0,0x6E11,0x6DE6,0x6DDD,/* 0xC0-0xC7 */
        0x6DD9,0x6E16,0x6DAB,0x6E0C,0x6DAE,0x6E2B,0x6E6E,0x6E4E,/* 0xC8-0xCF */
        0x6E6B,0x6EB2,0x6E5F,0x6E86,0x6E53,0x6E54,0x6E32,0x6E25,/* 0xD0-0xD7 */
        0x6E44,0x6EDF,0x6EB1,0x6E98,0x6EE0,0x6F2D,0x6EE2,0x6EA5,/* 0xD8-0xDF */
        0x6EA7,0x6EBD,0x6EBB,0x6EB7,0x6ED7,0x6EB4,0x6ECF,0x6E8F,/* 0xE0-0xE7 */
        0x6EC2,0x6E9F,0x6F62,0x6F46,0x6F47,0x6F24,0x6F15,0x6EF9,/* 0xE8-0xEF */
        0x6F2F,0x6F36,0x6F4B,0x6F74,0x6F2A,0x6F09,0x6F29,0x6F89,/* 0xF0-0xF7 */
        0x6F8D,0x6F8C,0x6F78,0x6F72,0x6F7C,0x6F7A,0x6FD1,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_E5[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x930A,0x930B,0x930C,0x930D,0x930E,0x930F,0x9310,0x9311,/* 0x40-0x47 */
        0x9312,0x9313,0x9314,0x9315,0x9316,0x9317,0x9318,0x9319,/* 0x48-0x4F */
        0x931A,0x931B,0x931C,0x931D,0x931E,0x931F,0x9320,0x9321,/* 0x50-0x57 */
        0x9322,0x9323,0x9324,0x9325,0x9326,0x9327,0x9328,0x9329,/* 0x58-0x5F */
        0x932A,0x932B,0x932C,0x932D,0x932E,0x932F,0x9330,0x9331,/* 0x60-0x67 */
        0x9332,0x9333,0x9334,0x9335,0x9336,0x9337,0x9338,0x9339,/* 0x68-0x6F */
        0x933A,0x933B,0x933C,0x933D,0x933F,0x9340,0x9341,0x9342,/* 0x70-0x77 */
        0x9343,0x9344,0x9345,0x9346,0x9347,0x9348,0x9349,0x0000,/* 0x78-0x7F */

        0x934A,0x934B,0x934C,0x934D,0x934E,0x934F,0x9350,0x9351,/* 0x80-0x87 */
        0x9352,0x9353,0x9354,0x9355,0x9356,0x9357,0x9358,0x9359,/* 0x88-0x8F */
        0x935A,0x935B,0x935C,0x935D,0x935E,0x935F,0x9360,0x9361,/* 0x90-0x97 */
        0x9362,0x9363,0x9364,0x9365,0x9366,0x9367,0x9368,0x9369,/* 0x98-0x9F */
        0x936B,0x6FC9,0x6FA7,0x6FB9,0x6FB6,0x6FC2,0x6FE1,0x6FEE,/* 0xA0-0xA7 */
        0x6FDE,0x6FE0,0x6FEF,0x701A,0x7023,0x701B,0x7039,0x7035,/* 0xA8-0xAF */
        0x704F,0x705E,0x5B80,0x5B84,0x5B95,0x5B93,0x5BA5,0x5BB8,/* 0xB0-0xB7 */
        0x752F,0x9A9E,0x6434,0x5BE4,0x5BEE,0x8930,0x5BF0,0x8E47,/* 0xB8-0xBF */
        0x8B07,0x8FB6,0x8FD3,0x8FD5,0x8FE5,0x8FEE,0x8FE4,0x8FE9,/* 0xC0-0xC7 */
        0x8FE6,0x8FF3,0x8FE8,0x9005,0x9004,0x900B,0x9026,0x9011,/* 0xC8-0xCF */
        0x900D,0x9016,0x9021,0x9035,0x9036,0x902D,0x902F,0x9044,/* 0xD0-0xD7 */
        0x9051,0x9052,0x9050,0x9068,0x9058,0x9062,0x905B,0x66B9,/* 0xD8-0xDF */
        0x9074,0x907D,0x9082,0x9088,0x9083,0x908B,0x5F50,0x5F57,/* 0xE0-0xE7 */
        0x5F56,0x5F58,0x5C3B,0x54AB,0x5C50,0x5C59,0x5B71,0x5C63,/* 0xE8-0xEF */
        0x5C66,0x7FBC,0x5F2A,0x5F29,0x5F2D,0x8274,0x5F3C,0x9B3B,/* 0xF0-0xF7 */
        0x5C6E,0x5981,0x5983,0x598D,0x59A9,0x59AA,0x59A3,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_E6[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x936C,0x936D,0x936E,0x936F,0x9370,0x9371,0x9372,0x9373,/* 0x40-0x47 */
        0x9374,0x9375,0x9376,0x9377,0x9378,0x9379,0x937A,0x937B,/* 0x48-0x4F */
        0x937C,0x937D,0x937E,0x937F,0x9380,0x9381,0x9382,0x9383,/* 0x50-0x57 */
        0x9384,0x9385,0x9386,0x9387,0x9388,0x9389,0x938A,0x938B,/* 0x58-0x5F */
        0x938C,0x938D,0x938E,0x9390,0x9391,0x9392,0x9393,0x9394,/* 0x60-0x67 */
        0x9395,0x9396,0x9397,0x9398,0x9399,0x939A,0x939B,0x939C,/* 0x68-0x6F */
        0x939D,0x939E,0x939F,0x93A0,0x93A1,0x93A2,0x93A3,0x93A4,/* 0x70-0x77 */
        0x93A5,0x93A6,0x93A7,0x93A8,0x93A9,0x93AA,0x93AB,0x0000,/* 0x78-0x7F */

        0x93AC,0x93AD,0x93AE,0x93AF,0x93B0,0x93B1,0x93B2,0x93B3,/* 0x80-0x87 */
        0x93B4,0x93B5,0x93B6,0x93B7,0x93B8,0x93B9,0x93BA,0x93BB,/* 0x88-0x8F */
        0x93BC,0x93BD,0x93BE,0x93BF,0x93C0,0x93C1,0x93C2,0x93C3,/* 0x90-0x97 */
        0x93C4,0x93C5,0x93C6,0x93C7,0x93C8,0x93C9,0x93CB,0x93CC,/* 0x98-0x9F */
        0x93CD,0x5997,0x59CA,0x59AB,0x599E,0x59A4,0x59D2,0x59B2,/* 0xA0-0xA7 */
        0x59AF,0x59D7,0x59BE,0x5A05,0x5A06,0x59DD,0x5A08,0x59E3,/* 0xA8-0xAF */
        0x59D8,0x59F9,0x5A0C,0x5A09,0x5A32,0x5A34,0x5A11,0x5A23,/* 0xB0-0xB7 */
        0x5A13,0x5A40,0x5A67,0x5A4A,0x5A55,0x5A3C,0x5A62,0x5A75,/* 0xB8-0xBF */
        0x80EC,0x5AAA,0x5A9B,0x5A77,0x5A7A,0x5ABE,0x5AEB,0x5AB2,/* 0xC0-0xC7 */
        0x5AD2,0x5AD4,0x5AB8,0x5AE0,0x5AE3,0x5AF1,0x5AD6,0x5AE6,/* 0xC8-0xCF */
        0x5AD8,0x5ADC,0x5B09,0x5B17,0x5B16,0x5B32,0x5B37,0x5B40,/* 0xD0-0xD7 */
        0x5C15,0x5C1C,0x5B5A,0x5B65,0x5B73,0x5B51,0x5B53,0x5B62,/* 0xD8-0xDF */
        0x9A75,0x9A77,0x9A78,0x9A7A,0x9A7F,0x9A7D,0x9A80,0x9A81,/* 0xE0-0xE7 */
        0x9A85,0x9A88,0x9A8A,0x9A90,0x9A92,0x9A93,0x9A96,0x9A98,/* 0xE8-0xEF */
        0x9A9B,0x9A9C,0x9A9D,0x9A9F,0x9AA0,0x9AA2,0x9AA3,0x9AA5,/* 0xF0-0xF7 */
        0x9AA7,0x7E9F,0x7EA1,0x7EA3,0x7EA5,0x7EA8,0x7EA9,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_E7[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x93CE,0x93CF,0x93D0,0x93D1,0x93D2,0x93D3,0x93D4,0x93D5,/* 0x40-0x47 */
        0x93D7,0x93D8,0x93D9,0x93DA,0x93DB,0x93DC,0x93DD,0x93DE,/* 0x48-0x4F */
        0x93DF,0x93E0,0x93E1,0x93E2,0x93E3,0x93E4,0x93E5,0x93E6,/* 0x50-0x57 */
        0x93E7,0x93E8,0x93E9,0x93EA,0x93EB,0x93EC,0x93ED,0x93EE,/* 0x58-0x5F */
        0x93EF,0x93F0,0x93F1,0x93F2,0x93F3,0x93F4,0x93F5,0x93F6,/* 0x60-0x67 */
        0x93F7,0x93F8,0x93F9,0x93FA,0x93FB,0x93FC,0x93FD,0x93FE,/* 0x68-0x6F */
        0x93FF,0x9400,0x9401,0x9402,0x9403,0x9404,0x9405,0x9406,/* 0x70-0x77 */
        0x9407,0x9408,0x9409,0x940A,0x940B,0x940C,0x940D,0x0000,/* 0x78-0x7F */

        0x940E,0x940F,0x9410,0x9411,0x9412,0x9413,0x9414,0x9415,/* 0x80-0x87 */
        0x9416,0x9417,0x9418,0x9419,0x941A,0x941B,0x941C,0x941D,/* 0x88-0x8F */
        0x941E,0x941F,0x9420,0x9421,0x9422,0x9423,0x9424,0x9425,/* 0x90-0x97 */
        0x9426,0x9427,0x9428,0x9429,0x942A,0x942B,0x942C,0x942D,/* 0x98-0x9F */
        0x942E,0x7EAD,0x7EB0,0x7EBE,0x7EC0,0x7EC1,0x7EC2,0x7EC9,/* 0xA0-0xA7 */
        0x7ECB,0x7ECC,0x7ED0,0x7ED4,0x7ED7,0x7EDB,0x7EE0,0x7EE1,/* 0xA8-0xAF */
        0x7EE8,0x7EEB,0x7EEE,0x7EEF,0x7EF1,0x7EF2,0x7F0D,0x7EF6,/* 0xB0-0xB7 */
        0x7EFA,0x7EFB,0x7EFE,0x7F01,0x7F02,0x7F03,0x7F07,0x7F08,/* 0xB8-0xBF */
        0x7F0B,0x7F0C,0x7F0F,0x7F11,0x7F12,0x7F17,0x7F19,0x7F1C,/* 0xC0-0xC7 */
        0x7F1B,0x7F1F,0x7F21,0x7F22,0x7F23,0x7F24,0x7F25,0x7F26,/* 0xC8-0xCF */
        0x7F27,0x7F2A,0x7F2B,0x7F2C,0x7F2D,0x7F2F,0x7F30,0x7F31,/* 0xD0-0xD7 */
        0x7F32,0x7F33,0x7F35,0x5E7A,0x757F,0x5DDB,0x753E,0x9095,/* 0xD8-0xDF */
        0x738E,0x7391,0x73AE,0x73A2,0x739F,0x73CF,0x73C2,0x73D1,/* 0xE0-0xE7 */
        0x73B7,0x73B3,0x73C0,0x73C9,0x73C8,0x73E5,0x73D9,0x987C,/* 0xE8-0xEF */
        0x740A,0x73E9,0x73E7,0x73DE,0x73BA,0x73F2,0x740F,0x742A,/* 0xF0-0xF7 */
        0x745B,0x7426,0x7425,0x7428,0x7430,0x742E,0x742C,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_E8[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x942F,0x9430,0x9431,0x9432,0x9433,0x9434,0x9435,0x9436,/* 0x40-0x47 */
        0x9437,0x9438,0x9439,0x943A,0x943B,0x943C,0x943D,0x943F,/* 0x48-0x4F */
        0x9440,0x9441,0x9442,0x9443,0x9444,0x9445,0x9446,0x9447,/* 0x50-0x57 */
        0x9448,0x9449,0x944A,0x944B,0x944C,0x944D,0x944E,0x944F,/* 0x58-0x5F */
        0x9450,0x9451,0x9452,0x9453,0x9454,0x9455,0x9456,0x9457,/* 0x60-0x67 */
        0x9458,0x9459,0x945A,0x945B,0x945C,0x945D,0x945E,0x945F,/* 0x68-0x6F */
        0x9460,0x9461,0x9462,0x9463,0x9464,0x9465,0x9466,0x9467,/* 0x70-0x77 */
        0x9468,0x9469,0x946A,0x946C,0x946D,0x946E,0x946F,0x0000,/* 0x78-0x7F */

        0x9470,0x9471,0x9472,0x9473,0x9474,0x9475,0x9476,0x9477,/* 0x80-0x87 */
        0x9478,0x9479,0x947A,0x947B,0x947C,0x947D,0x947E,0x947F,/* 0x88-0x8F */
        0x9480,0x9481,0x9482,0x9483,0x9484,0x9491,0x9496,0x9498,/* 0x90-0x97 */
        0x94C7,0x94CF,0x94D3,0x94D4,0x94DA,0x94E6,0x94FB,0x951C,/* 0x98-0x9F */
        0x9520,0x741B,0x741A,0x7441,0x745C,0x7457,0x7455,0x7459,/* 0xA0-0xA7 */
        0x7477,0x746D,0x747E,0x749C,0x748E,0x7480,0x7481,0x7487,/* 0xA8-0xAF */
        0x748B,0x749E,0x74A8,0x74A9,0x7490,0x74A7,0x74D2,0x74BA,/* 0xB0-0xB7 */
        0x97EA,0x97EB,0x97EC,0x674C,0x6753,0x675E,0x6748,0x6769,/* 0xB8-0xBF */
        0x67A5,0x6787,0x676A,0x6773,0x6798,0x67A7,0x6775,0x67A8,/* 0xC0-0xC7 */
        0x679E,0x67AD,0x678B,0x6777,0x677C,0x67F0,0x6809,0x67D8,/* 0xC8-0xCF */
        0x680A,0x67E9,0x67B0,0x680C,0x67D9,0x67B5,0x67DA,0x67B3,/* 0xD0-0xD7 */
        0x67DD,0x6800,0x67C3,0x67B8,0x67E2,0x680E,0x67C1,0x67FD,/* 0xD8-0xDF */
        0x6832,0x6833,0x6860,0x6861,0x684E,0x6862,0x6844,0x6864,/* 0xE0-0xE7 */
        0x6883,0x681D,0x6855,0x6866,0x6841,0x6867,0x6840,0x683E,/* 0xE8-0xEF */
        0x684A,0x6849,0x6829,0x68B5,0x688F,0x6874,0x6877,0x6893,/* 0xF0-0xF7 */
        0x686B,0x68C2,0x696E,0x68FC,0x691F,0x6920,0x68F9,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_E9[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9527,0x9533,0x953D,0x9543,0x9548,0x954B,0x9555,0x955A,/* 0x40-0x47 */
        0x9560,0x956E,0x9574,0x9575,0x9577,0x9578,0x9579,0x957A,/* 0x48-0x4F */
        0x957B,0x957C,0x957D,0x957E,0x9580,0x9581,0x9582,0x9583,/* 0x50-0x57 */
        0x9584,0x9585,0x9586,0x9587,0x9588,0x9589,0x958A,0x958B,/* 0x58-0x5F */
        0x958C,0x958D,0x958E,0x958F,0x9590,0x9591,0x9592,0x9593,/* 0x60-0x67 */
        0x9594,0x9595,0x9596,0x9597,0x9598,0x9599,0x959A,0x959B,/* 0x68-0x6F */
        0x959C,0x959D,0x959E,0x959F,0x95A0,0x95A1,0x95A2,0x95A3,/* 0x70-0x77 */
        0x95A4,0x95A5,0x95A6,0x95A7,0x95A8,0x95A9,0x95AA,0x0000,/* 0x78-0x7F */

        0x95AB,0x95AC,0x95AD,0x95AE,0x95AF,0x95B0,0x95B1,0x95B2,/* 0x80-0x87 */
        0x95B3,0x95B4,0x95B5,0x95B6,0x95B7,0x95B8,0x95B9,0x95BA,/* 0x88-0x8F */
        0x95BB,0x95BC,0x95BD,0x95BE,0x95BF,0x95C0,0x95C1,0x95C2,/* 0x90-0x97 */
        0x95C3,0x95C4,0x95C5,0x95C6,0x95C7,0x95C8,0x95C9,0x95CA,/* 0x98-0x9F */
        0x95CB,0x6924,0x68F0,0x690B,0x6901,0x6957,0x68E3,0x6910,/* 0xA0-0xA7 */
        0x6971,0x6939,0x6960,0x6942,0x695D,0x6984,0x696B,0x6980,/* 0xA8-0xAF */
        0x6998,0x6978,0x6934,0x69CC,0x6987,0x6988,0x69CE,0x6989,/* 0xB0-0xB7 */
        0x6966,0x6963,0x6979,0x699B,0x69A7,0x69BB,0x69AB,0x69AD,/* 0xB8-0xBF */
        0x69D4,0x69B1,0x69C1,0x69CA,0x69DF,0x6995,0x69E0,0x698D,/* 0xC0-0xC7 */
        0x69FF,0x6A2F,0x69ED,0x6A17,0x6A18,0x6A65,0x69F2,0x6A44,/* 0xC8-0xCF */
        0x6A3E,0x6AA0,0x6A50,0x6A5B,0x6A35,0x6A8E,0x6A79,0x6A3D,/* 0xD0-0xD7 */
        0x6A28,0x6A58,0x6A7C,0x6A91,0x6A90,0x6AA9,0x6A97,0x6AAB,/* 0xD8-0xDF */
        0x7337,0x7352,0x6B81,0x6B82,0x6B87,0x6B84,0x6B92,0x6B93,/* 0xE0-0xE7 */
        0x6B8D,0x6B9A,0x6B9B,0x6BA1,0x6BAA,0x8F6B,0x8F6D,0x8F71,/* 0xE8-0xEF */
        0x8F72,0x8F73,0x8F75,0x8F76,0x8F78,0x8F77,0x8F79,0x8F7A,/* 0xF0-0xF7 */
        0x8F7C,0x8F7E,0x8F81,0x8F82,0x8F84,0x8F87,0x8F8B,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_EA[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x95CC,0x95CD,0x95CE,0x95CF,0x95D0,0x95D1,0x95D2,0x95D3,/* 0x40-0x47 */
        0x95D4,0x95D5,0x95D6,0x95D7,0x95D8,0x95D9,0x95DA,0x95DB,/* 0x48-0x4F */
        0x95DC,0x95DD,0x95DE,0x95DF,0x95E0,0x95E1,0x95E2,0x95E3,/* 0x50-0x57 */
        0x95E4,0x95E5,0x95E6,0x95E7,0x95EC,0x95FF,0x9607,0x9613,/* 0x58-0x5F */
        0x9618,0x961B,0x961E,0x9620,0x9623,0x9624,0x9625,0x9626,/* 0x60-0x67 */
        0x9627,0x9628,0x9629,0x962B,0x962C,0x962D,0x962F,0x9630,/* 0x68-0x6F */
        0x9637,0x9638,0x9639,0x963A,0x963E,0x9641,0x9643,0x964A,/* 0x70-0x77 */
        0x964E,0x964F,0x9651,0x9652,0x9653,0x9656,0x9657,0x0000,/* 0x78-0x7F */

        0x9658,0x9659,0x965A,0x965C,0x965D,0x965E,0x9660,0x9663,/* 0x80-0x87 */
        0x9665,0x9666,0x966B,0x966D,0x966E,0x966F,0x9670,0x9671,/* 0x88-0x8F */
        0x9673,0x9678,0x9679,0x967A,0x967B,0x967C,0x967D,0x967E,/* 0x90-0x97 */
        0x967F,0x9680,0x9681,0x9682,0x9683,0x9684,0x9687,0x9689,/* 0x98-0x9F */
        0x968A,0x8F8D,0x8F8E,0x8F8F,0x8F98,0x8F9A,0x8ECE,0x620B,/* 0xA0-0xA7 */
        0x6217,0x621B,0x621F,0x6222,0x6221,0x6225,0x6224,0x622C,/* 0xA8-0xAF */
        0x81E7,0x74EF,0x74F4,0x74FF,0x750F,0x7511,0x7513,0x6534,/* 0xB0-0xB7 */
        0x65EE,0x65EF,0x65F0,0x660A,0x6619,0x6772,0x6603,0x6615,/* 0xB8-0xBF */
        0x6600,0x7085,0x66F7,0x661D,0x6634,0x6631,0x6636,0x6635,/* 0xC0-0xC7 */
        0x8006,0x665F,0x6654,0x6641,0x664F,0x6656,0x6661,0x6657,/* 0xC8-0xCF */
        0x6677,0x6684,0x668C,0x66A7,0x669D,0x66BE,0x66DB,0x66DC,/* 0xD0-0xD7 */
        0x66E6,0x66E9,0x8D32,0x8D33,0x8D36,0x8D3B,0x8D3D,0x8D40,/* 0xD8-0xDF */
        0x8D45,0x8D46,0x8D48,0x8D49,0x8D47,0x8D4D,0x8D55,0x8D59,/* 0xE0-0xE7 */
        0x89C7,0x89CA,0x89CB,0x89CC,0x89CE,0x89CF,0x89D0,0x89D1,/* 0xE8-0xEF */
        0x726E,0x729F,0x725D,0x7266,0x726F,0x727E,0x727F,0x7284,/* 0xF0-0xF7 */
        0x728B,0x728D,0x728F,0x7292,0x6308,0x6332,0x63B0,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_EB[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x968C,0x968E,0x9691,0x9692,0x9693,0x9695,0x9696,0x969A,/* 0x40-0x47 */
        0x969B,0x969D,0x969E,0x969F,0x96A0,0x96A1,0x96A2,0x96A3,/* 0x48-0x4F */
        0x96A4,0x96A5,0x96A6,0x96A8,0x96A9,0x96AA,0x96AB,0x96AC,/* 0x50-0x57 */
        0x96AD,0x96AE,0x96AF,0x96B1,0x96B2,0x96B4,0x96B5,0x96B7,/* 0x58-0x5F */
        0x96B8,0x96BA,0x96BB,0x96BF,0x96C2,0x96C3,0x96C8,0x96CA,/* 0x60-0x67 */
        0x96CB,0x96D0,0x96D1,0x96D3,0x96D4,0x96D6,0x96D7,0x96D8,/* 0x68-0x6F */
        0x96D9,0x96DA,0x96DB,0x96DC,0x96DD,0x96DE,0x96DF,0x96E1,/* 0x70-0x77 */
        0x96E2,0x96E3,0x96E4,0x96E5,0x96E6,0x96E7,0x96EB,0x0000,/* 0x78-0x7F */

        0x96EC,0x96ED,0x96EE,0x96F0,0x96F1,0x96F2,0x96F4,0x96F5,/* 0x80-0x87 */
        0x96F8,0x96FA,0x96FB,0x96FC,0x96FD,0x96FF,0x9702,0x9703,/* 0x88-0x8F */
        0x9705,0x970A,0x970B,0x970C,0x9710,0x9711,0x9712,0x9714,/* 0x90-0x97 */
        0x9715,0x9717,0x9718,0x9719,0x971A,0x971B,0x971D,0x971F,/* 0x98-0x9F */
        0x9720,0x643F,0x64D8,0x8004,0x6BEA,0x6BF3,0x6BFD,0x6BF5,/* 0xA0-0xA7 */
        0x6BF9,0x6C05,0x6C07,0x6C06,0x6C0D,0x6C15,0x6C18,0x6C19,/* 0xA8-0xAF */
        0x6C1A,0x6C21,0x6C29,0x6C24,0x6C2A,0x6C32,0x6535,0x6555,/* 0xB0-0xB7 */
        0x656B,0x724D,0x7252,0x7256,0x7230,0x8662,0x5216,0x809F,/* 0xB8-0xBF */
        0x809C,0x8093,0x80BC,0x670A,0x80BD,0x80B1,0x80AB,0x80AD,/* 0xC0-0xC7 */
        0x80B4,0x80B7,0x80E7,0x80E8,0x80E9,0x80EA,0x80DB,0x80C2,/* 0xC8-0xCF */
        0x80C4,0x80D9,0x80CD,0x80D7,0x6710,0x80DD,0x80EB,0x80F1,/* 0xD0-0xD7 */
        0x80F4,0x80ED,0x810D,0x810E,0x80F2,0x80FC,0x6715,0x8112,/* 0xD8-0xDF */
        0x8C5A,0x8136,0x811E,0x812C,0x8118,0x8132,0x8148,0x814C,/* 0xE0-0xE7 */
        0x8153,0x8174,0x8159,0x815A,0x8171,0x8160,0x8169,0x817C,/* 0xE8-0xEF */
        0x817D,0x816D,0x8167,0x584D,0x5AB5,0x8188,0x8182,0x8191,/* 0xF0-0xF7 */
        0x6ED5,0x81A3,0x81AA,0x81CC,0x6726,0x81CA,0x81BB,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_EC[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9721,0x9722,0x9723,0x9724,0x9725,0x9726,0x9727,0x9728,/* 0x40-0x47 */
        0x9729,0x972B,0x972C,0x972E,0x972F,0x9731,0x9733,0x9734,/* 0x48-0x4F */
        0x9735,0x9736,0x9737,0x973A,0x973B,0x973C,0x973D,0x973F,/* 0x50-0x57 */
        0x9740,0x9741,0x9742,0x9743,0x9744,0x9745,0x9746,0x9747,/* 0x58-0x5F */
        0x9748,0x9749,0x974A,0x974B,0x974C,0x974D,0x974E,0x974F,/* 0x60-0x67 */
        0x9750,0x9751,0x9754,0x9755,0x9757,0x9758,0x975A,0x975C,/* 0x68-0x6F */
        0x975D,0x975F,0x9763,0x9764,0x9766,0x9767,0x9768,0x976A,/* 0x70-0x77 */
        0x976B,0x976C,0x976D,0x976E,0x976F,0x9770,0x9771,0x0000,/* 0x78-0x7F */

        0x9772,0x9775,0x9777,0x9778,0x9779,0x977A,0x977B,0x977D,/* 0x80-0x87 */
        0x977E,0x977F,0x9780,0x9781,0x9782,0x9783,0x9784,0x9786,/* 0x88-0x8F */
        0x9787,0x9788,0x9789,0x978A,0x978C,0x978E,0x978F,0x9790,/* 0x90-0x97 */
        0x9793,0x9795,0x9796,0x9797,0x9799,0x979A,0x979B,0x979C,/* 0x98-0x9F */
        0x979D,0x81C1,0x81A6,0x6B24,0x6B37,0x6B39,0x6B43,0x6B46,/* 0xA0-0xA7 */
        0x6B59,0x98D1,0x98D2,0x98D3,0x98D5,0x98D9,0x98DA,0x6BB3,/* 0xA8-0xAF */
        0x5F40,0x6BC2,0x89F3,0x6590,0x9F51,0x6593,0x65BC,0x65C6,/* 0xB0-0xB7 */
        0x65C4,0x65C3,0x65CC,0x65CE,0x65D2,0x65D6,0x7080,0x709C,/* 0xB8-0xBF */
        0x7096,0x709D,0x70BB,0x70C0,0x70B7,0x70AB,0x70B1,0x70E8,/* 0xC0-0xC7 */
        0x70CA,0x7110,0x7113,0x7116,0x712F,0x7131,0x7173,0x715C,/* 0xC8-0xCF */
        0x7168,0x7145,0x7172,0x714A,0x7178,0x717A,0x7198,0x71B3,/* 0xD0-0xD7 */
        0x71B5,0x71A8,0x71A0,0x71E0,0x71D4,0x71E7,0x71F9,0x721D,/* 0xD8-0xDF */
        0x7228,0x706C,0x7118,0x7166,0x71B9,0x623E,0x623D,0x6243,/* 0xE0-0xE7 */
        0x6248,0x6249,0x793B,0x7940,0x7946,0x7949,0x795B,0x795C,/* 0xE8-0xEF */
        0x7953,0x795A,0x7962,0x7957,0x7960,0x796F,0x7967,0x797A,/* 0xF0-0xF7 */
        0x7985,0x798A,0x799A,0x79A7,0x79B3,0x5FD1,0x5FD0,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_ED[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x979E,0x979F,0x97A1,0x97A2,0x97A4,0x97A5,0x97A6,0x97A7,/* 0x40-0x47 */
        0x97A8,0x97A9,0x97AA,0x97AC,0x97AE,0x97B0,0x97B1,0x97B3,/* 0x48-0x4F */
        0x97B5,0x97B6,0x97B7,0x97B8,0x97B9,0x97BA,0x97BB,0x97BC,/* 0x50-0x57 */
        0x97BD,0x97BE,0x97BF,0x97C0,0x97C1,0x97C2,0x97C3,0x97C4,/* 0x58-0x5F */
        0x97C5,0x97C6,0x97C7,0x97C8,0x97C9,0x97CA,0x97CB,0x97CC,/* 0x60-0x67 */
        0x97CD,0x97CE,0x97CF,0x97D0,0x97D1,0x97D2,0x97D3,0x97D4,/* 0x68-0x6F */
        0x97D5,0x97D6,0x97D7,0x97D8,0x97D9,0x97DA,0x97DB,0x97DC,/* 0x70-0x77 */
        0x97DD,0x97DE,0x97DF,0x97E0,0x97E1,0x97E2,0x97E3,0x0000,/* 0x78-0x7F */

        0x97E4,0x97E5,0x97E8,0x97EE,0x97EF,0x97F0,0x97F1,0x97F2,/* 0x80-0x87 */
        0x97F4,0x97F7,0x97F8,0x97F9,0x97FA,0x97FB,0x97FC,0x97FD,/* 0x88-0x8F */
        0x97FE,0x97FF,0x9800,0x9801,0x9802,0x9803,0x9804,0x9805,/* 0x90-0x97 */
        0x9806,0x9807,0x9808,0x9809,0x980A,0x980B,0x980C,0x980D,/* 0x98-0x9F */
        0x980E,0x603C,0x605D,0x605A,0x6067,0x6041,0x6059,0x6063,/* 0xA0-0xA7 */
        0x60AB,0x6106,0x610D,0x615D,0x61A9,0x619D,0x61CB,0x61D1,/* 0xA8-0xAF */
        0x6206,0x8080,0x807F,0x6C93,0x6CF6,0x6DFC,0x77F6,0x77F8,/* 0xB0-0xB7 */
        0x7800,0x7809,0x7817,0x7818,0x7811,0x65AB,0x782D,0x781C,/* 0xB8-0xBF */
        0x781D,0x7839,0x783A,0x783B,0x781F,0x783C,0x7825,0x782C,/* 0xC0-0xC7 */
        0x7823,0x7829,0x784E,0x786D,0x7856,0x7857,0x7826,0x7850,/* 0xC8-0xCF */
        0x7847,0x784C,0x786A,0x789B,0x7893,0x789A,0x7887,0x789C,/* 0xD0-0xD7 */
        0x78A1,0x78A3,0x78B2,0x78B9,0x78A5,0x78D4,0x78D9,0x78C9,/* 0xD8-0xDF */
        0x78EC,0x78F2,0x7905,0x78F4,0x7913,0x7924,0x791E,0x7934,/* 0xE0-0xE7 */
        0x9F9B,0x9EF9,0x9EFB,0x9EFC,0x76F1,0x7704,0x770D,0x76F9,/* 0xE8-0xEF */
        0x7707,0x7708,0x771A,0x7722,0x7719,0x772D,0x7726,0x7735,/* 0xF0-0xF7 */
        0x7738,0x7750,0x7751,0x7747,0x7743,0x775A,0x7768,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_EE[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x980F,0x9810,0x9811,0x9812,0x9813,0x9814,0x9815,0x9816,/* 0x40-0x47 */
        0x9817,0x9818,0x9819,0x981A,0x981B,0x981C,0x981D,0x981E,/* 0x48-0x4F */
        0x981F,0x9820,0x9821,0x9822,0x9823,0x9824,0x9825,0x9826,/* 0x50-0x57 */
        0x9827,0x9828,0x9829,0x982A,0x982B,0x982C,0x982D,0x982E,/* 0x58-0x5F */
        0x982F,0x9830,0x9831,0x9832,0x9833,0x9834,0x9835,0x9836,/* 0x60-0x67 */
        0x9837,0x9838,0x9839,0x983A,0x983B,0x983C,0x983D,0x983E,/* 0x68-0x6F */
        0x983F,0x9840,0x9841,0x9842,0x9843,0x9844,0x9845,0x9846,/* 0x70-0x77 */
        0x9847,0x9848,0x9849,0x984A,0x984B,0x984C,0x984D,0x0000,/* 0x78-0x7F */

        0x984E,0x984F,0x9850,0x9851,0x9852,0x9853,0x9854,0x9855,/* 0x80-0x87 */
        0x9856,0x9857,0x9858,0x9859,0x985A,0x985B,0x985C,0x985D,/* 0x88-0x8F */
        0x985E,0x985F,0x9860,0x9861,0x9862,0x9863,0x9864,0x9865,/* 0x90-0x97 */
        0x9866,0x9867,0x9868,0x9869,0x986A,0x986B,0x986C,0x986D,/* 0x98-0x9F */
        0x986E,0x7762,0x7765,0x777F,0x778D,0x777D,0x7780,0x778C,/* 0xA0-0xA7 */
        0x7791,0x779F,0x77A0,0x77B0,0x77B5,0x77BD,0x753A,0x7540,/* 0xA8-0xAF */
        0x754E,0x754B,0x7548,0x755B,0x7572,0x7579,0x7583,0x7F58,/* 0xB0-0xB7 */
        0x7F61,0x7F5F,0x8A48,0x7F68,0x7F74,0x7F71,0x7F79,0x7F81,/* 0xB8-0xBF */
        0x7F7E,0x76CD,0x76E5,0x8832,0x9485,0x9486,0x9487,0x948B,/* 0xC0-0xC7 */
        0x948A,0x948C,0x948D,0x948F,0x9490,0x9494,0x9497,0x9495,/* 0xC8-0xCF */
        0x949A,0x949B,0x949C,0x94A3,0x94A4,0x94AB,0x94AA,0x94AD,/* 0xD0-0xD7 */
        0x94AC,0x94AF,0x94B0,0x94B2,0x94B4,0x94B6,0x94B7,0x94B8,/* 0xD8-0xDF */
        0x94B9,0x94BA,0x94BC,0x94BD,0x94BF,0x94C4,0x94C8,0x94C9,/* 0xE0-0xE7 */
        0x94CA,0x94CB,0x94CC,0x94CD,0x94CE,0x94D0,0x94D1,0x94D2,/* 0xE8-0xEF */
        0x94D5,0x94D6,0x94D7,0x94D9,0x94D8,0x94DB,0x94DE,0x94DF,/* 0xF0-0xF7 */
        0x94E0,0x94E2,0x94E4,0x94E5,0x94E7,0x94E8,0x94EA,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_EF[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x986F,0x9870,0x9871,0x9872,0x9873,0x9874,0x988B,0x988E,/* 0x40-0x47 */
        0x9892,0x9895,0x9899,0x98A3,0x98A8,0x98A9,0x98AA,0x98AB,/* 0x48-0x4F */
        0x98AC,0x98AD,0x98AE,0x98AF,0x98B0,0x98B1,0x98B2,0x98B3,/* 0x50-0x57 */
        0x98B4,0x98B5,0x98B6,0x98B7,0x98B8,0x98B9,0x98BA,0x98BB,/* 0x58-0x5F */
        0x98BC,0x98BD,0x98BE,0x98BF,0x98C0,0x98C1,0x98C2,0x98C3,/* 0x60-0x67 */
        0x98C4,0x98C5,0x98C6,0x98C7,0x98C8,0x98C9,0x98CA,0x98CB,/* 0x68-0x6F */
        0x98CC,0x98CD,0x98CF,0x98D0,0x98D4,0x98D6,0x98D7,0x98DB,/* 0x70-0x77 */
        0x98DC,0x98DD,0x98E0,0x98E1,0x98E2,0x98E3,0x98E4,0x0000,/* 0x78-0x7F */

        0x98E5,0x98E6,0x98E9,0x98EA,0x98EB,0x98EC,0x98ED,0x98EE,/* 0x80-0x87 */
        0x98EF,0x98F0,0x98F1,0x98F2,0x98F3,0x98F4,0x98F5,0x98F6,/* 0x88-0x8F */
        0x98F7,0x98F8,0x98F9,0x98FA,0x98FB,0x98FC,0x98FD,0x98FE,/* 0x90-0x97 */
        0x98FF,0x9900,0x9901,0x9902,0x9903,0x9904,0x9905,0x9906,/* 0x98-0x9F */
        0x9907,0x94E9,0x94EB,0x94EE,0x94EF,0x94F3,0x94F4,0x94F5,/* 0xA0-0xA7 */
        0x94F7,0x94F9,0x94FC,0x94FD,0x94FF,0x9503,0x9502,0x9506,/* 0xA8-0xAF */
        0x9507,0x9509,0x950A,0x950D,0x950E,0x950F,0x9512,0x9513,/* 0xB0-0xB7 */
        0x9514,0x9515,0x9516,0x9518,0x951B,0x951D,0x951E,0x951F,/* 0xB8-0xBF */
        0x9522,0x952A,0x952B,0x9529,0x952C,0x9531,0x9532,0x9534,/* 0xC0-0xC7 */
        0x9536,0x9537,0x9538,0x953C,0x953E,0x953F,0x9542,0x9535,/* 0xC8-0xCF */
        0x9544,0x9545,0x9546,0x9549,0x954C,0x954E,0x954F,0x9552,/* 0xD0-0xD7 */
        0x9553,0x9554,0x9556,0x9557,0x9558,0x9559,0x955B,0x955E,/* 0xD8-0xDF */
        0x955F,0x955D,0x9561,0x9562,0x9564,0x9565,0x9566,0x9567,/* 0xE0-0xE7 */
        0x9568,0x9569,0x956A,0x956B,0x956C,0x956F,0x9571,0x9572,/* 0xE8-0xEF */
        0x9573,0x953A,0x77E7,0x77EC,0x96C9,0x79D5,0x79ED,0x79E3,/* 0xF0-0xF7 */
        0x79EB,0x7A06,0x5D47,0x7A03,0x7A02,0x7A1E,0x7A14,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_F0[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9908,0x9909,0x990A,0x990B,0x990C,0x990E,0x990F,0x9911,/* 0x40-0x47 */
        0x9912,0x9913,0x9914,0x9915,0x9916,0x9917,0x9918,0x9919,/* 0x48-0x4F */
        0x991A,0x991B,0x991C,0x991D,0x991E,0x991F,0x9920,0x9921,/* 0x50-0x57 */
        0x9922,0x9923,0x9924,0x9925,0x9926,0x9927,0x9928,0x9929,/* 0x58-0x5F */
        0x992A,0x992B,0x992C,0x992D,0x992F,0x9930,0x9931,0x9932,/* 0x60-0x67 */
        0x9933,0x9934,0x9935,0x9936,0x9937,0x9938,0x9939,0x993A,/* 0x68-0x6F */
        0x993B,0x993C,0x993D,0x993E,0x993F,0x9940,0x9941,0x9942,/* 0x70-0x77 */
        0x9943,0x9944,0x9945,0x9946,0x9947,0x9948,0x9949,0x0000,/* 0x78-0x7F */

        0x994A,0x994B,0x994C,0x994D,0x994E,0x994F,0x9950,0x9951,/* 0x80-0x87 */
        0x9952,0x9953,0x9956,0x9957,0x9958,0x9959,0x995A,0x995B,/* 0x88-0x8F */
        0x995C,0x995D,0x995E,0x995F,0x9960,0x9961,0x9962,0x9964,/* 0x90-0x97 */
        0x9966,0x9973,0x9978,0x9979,0x997B,0x997E,0x9982,0x9983,/* 0x98-0x9F */
        0x9989,0x7A39,0x7A37,0x7A51,0x9ECF,0x99A5,0x7A70,0x7688,/* 0xA0-0xA7 */
        0x768E,0x7693,0x7699,0x76A4,0x74DE,0x74E0,0x752C,0x9E20,/* 0xA8-0xAF */
        0x9E22,0x9E28,0x9E29,0x9E2A,0x9E2B,0x9E2C,0x9E32,0x9E31,/* 0xB0-0xB7 */
        0x9E36,0x9E38,0x9E37,0x9E39,0x9E3A,0x9E3E,0x9E41,0x9E42,/* 0xB8-0xBF */
        0x9E44,0x9E46,0x9E47,0x9E48,0x9E49,0x9E4B,0x9E4C,0x9E4E,/* 0xC0-0xC7 */
        0x9E51,0x9E55,0x9E57,0x9E5A,0x9E5B,0x9E5C,0x9E5E,0x9E63,/* 0xC8-0xCF */
        0x9E66,0x9E67,0x9E68,0x9E69,0x9E6A,0x9E6B,0x9E6C,0x9E71,/* 0xD0-0xD7 */
        0x9E6D,0x9E73,0x7592,0x7594,0x7596,0x75A0,0x759D,0x75AC,/* 0xD8-0xDF */
        0x75A3,0x75B3,0x75B4,0x75B8,0x75C4,0x75B1,0x75B0,0x75C3,/* 0xE0-0xE7 */
        0x75C2,0x75D6,0x75CD,0x75E3,0x75E8,0x75E6,0x75E4,0x75EB,/* 0xE8-0xEF */
        0x75E7,0x7603,0x75F1,0x75FC,0x75FF,0x7610,0x7600,0x7605,/* 0xF0-0xF7 */
        0x760C,0x7617,0x760A,0x7625,0x7618,0x7615,0x7619,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_F1[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x998C,0x998E,0x999A,0x999B,0x999C,0x999D,0x999E,0x999F,/* 0x40-0x47 */
        0x99A0,0x99A1,0x99A2,0x99A3,0x99A4,0x99A6,0x99A7,0x99A9,/* 0x48-0x4F */
        0x99AA,0x99AB,0x99AC,0x99AD,0x99AE,0x99AF,0x99B0,0x99B1,/* 0x50-0x57 */
        0x99B2,0x99B3,0x99B4,0x99B5,0x99B6,0x99B7,0x99B8,0x99B9,/* 0x58-0x5F */
        0x99BA,0x99BB,0x99BC,0x99BD,0x99BE,0x99BF,0x99C0,0x99C1,/* 0x60-0x67 */
        0x99C2,0x99C3,0x99C4,0x99C5,0x99C6,0x99C7,0x99C8,0x99C9,/* 0x68-0x6F */
        0x99CA,0x99CB,0x99CC,0x99CD,0x99CE,0x99CF,0x99D0,0x99D1,/* 0x70-0x77 */
        0x99D2,0x99D3,0x99D4,0x99D5,0x99D6,0x99D7,0x99D8,0x0000,/* 0x78-0x7F */

        0x99D9,0x99DA,0x99DB,0x99DC,0x99DD,0x99DE,0x99DF,0x99E0,/* 0x80-0x87 */
        0x99E1,0x99E2,0x99E3,0x99E4,0x99E5,0x99E6,0x99E7,0x99E8,/* 0x88-0x8F */
        0x99E9,0x99EA,0x99EB,0x99EC,0x99ED,0x99EE,0x99EF,0x99F0,/* 0x90-0x97 */
        0x99F1,0x99F2,0x99F3,0x99F4,0x99F5,0x99F6,0x99F7,0x99F8,/* 0x98-0x9F */
        0x99F9,0x761B,0x763C,0x7622,0x7620,0x7640,0x762D,0x7630,/* 0xA0-0xA7 */
        0x763F,0x7635,0x7643,0x763E,0x7633,0x764D,0x765E,0x7654,/* 0xA8-0xAF */
        0x765C,0x7656,0x766B,0x766F,0x7FCA,0x7AE6,0x7A78,0x7A79,/* 0xB0-0xB7 */
        0x7A80,0x7A86,0x7A88,0x7A95,0x7AA6,0x7AA0,0x7AAC,0x7AA8,/* 0xB8-0xBF */
        0x7AAD,0x7AB3,0x8864,0x8869,0x8872,0x887D,0x887F,0x8882,/* 0xC0-0xC7 */
        0x88A2,0x88C6,0x88B7,0x88BC,0x88C9,0x88E2,0x88CE,0x88E3,/* 0xC8-0xCF */
        0x88E5,0x88F1,0x891A,0x88FC,0x88E8,0x88FE,0x88F0,0x8921,/* 0xD0-0xD7 */
        0x8919,0x8913,0x891B,0x890A,0x8934,0x892B,0x8936,0x8941,/* 0xD8-0xDF */
        0x8966,0x897B,0x758B,0x80E5,0x76B2,0x76B4,0x77DC,0x8012,/* 0xE0-0xE7 */
        0x8014,0x8016,0x801C,0x8020,0x8022,0x8025,0x8026,0x8027,/* 0xE8-0xEF */
        0x8029,0x8028,0x8031,0x800B,0x8035,0x8043,0x8046,0x804D,/* 0xF0-0xF7 */
        0x8052,0x8069,0x8071,0x8983,0x9878,0x9880,0x9883,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_F2[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x99FA,0x99FB,0x99FC,0x99FD,0x99FE,0x99FF,0x9A00,0x9A01,/* 0x40-0x47 */
        0x9A02,0x9A03,0x9A04,0x9A05,0x9A06,0x9A07,0x9A08,0x9A09,/* 0x48-0x4F */
        0x9A0A,0x9A0B,0x9A0C,0x9A0D,0x9A0E,0x9A0F,0x9A10,0x9A11,/* 0x50-0x57 */
        0x9A12,0x9A13,0x9A14,0x9A15,0x9A16,0x9A17,0x9A18,0x9A19,/* 0x58-0x5F */
        0x9A1A,0x9A1B,0x9A1C,0x9A1D,0x9A1E,0x9A1F,0x9A20,0x9A21,/* 0x60-0x67 */
        0x9A22,0x9A23,0x9A24,0x9A25,0x9A26,0x9A27,0x9A28,0x9A29,/* 0x68-0x6F */
        0x9A2A,0x9A2B,0x9A2C,0x9A2D,0x9A2E,0x9A2F,0x9A30,0x9A31,/* 0x70-0x77 */
        0x9A32,0x9A33,0x9A34,0x9A35,0x9A36,0x9A37,0x9A38,0x0000,/* 0x78-0x7F */

        0x9A39,0x9A3A,0x9A3B,0x9A3C,0x9A3D,0x9A3E,0x9A3F,0x9A40,/* 0x80-0x87 */
        0x9A41,0x9A42,0x9A43,0x9A44,0x9A45,0x9A46,0x9A47,0x9A48,/* 0x88-0x8F */
        0x9A49,0x9A4A,0x9A4B,0x9A4C,0x9A4D,0x9A4E,0x9A4F,0x9A50,/* 0x90-0x97 */
        0x9A51,0x9A52,0x9A53,0x9A54,0x9A55,0x9A56,0x9A57,0x9A58,/* 0x98-0x9F */
        0x9A59,0x9889,0x988C,0x988D,0x988F,0x9894,0x989A,0x989B,/* 0xA0-0xA7 */
        0x989E,0x989F,0x98A1,0x98A2,0x98A5,0x98A6,0x864D,0x8654,/* 0xA8-0xAF */
        0x866C,0x866E,0x867F,0x867A,0x867C,0x867B,0x86A8,0x868D,/* 0xB0-0xB7 */
        0x868B,0x86AC,0x869D,0x86A7,0x86A3,0x86AA,0x8693,0x86A9,/* 0xB8-0xBF */
        0x86B6,0x86C4,0x86B5,0x86CE,0x86B0,0x86BA,0x86B1,0x86AF,/* 0xC0-0xC7 */
        0x86C9,0x86CF,0x86B4,0x86E9,0x86F1,0x86F2,0x86ED,0x86F3,/* 0xC8-0xCF */
        0x86D0,0x8713,0x86DE,0x86F4,0x86DF,0x86D8,0x86D1,0x8703,/* 0xD0-0xD7 */
        0x8707,0x86F8,0x8708,0x870A,0x870D,0x8709,0x8723,0x873B,/* 0xD8-0xDF */
        0x871E,0x8725,0x872E,0x871A,0x873E,0x8748,0x8734,0x8731,/* 0xE0-0xE7 */
        0x8729,0x8737,0x873F,0x8782,0x8722,0x877D,0x877E,0x877B,/* 0xE8-0xEF */
        0x8760,0x8770,0x874C,0x876E,0x878B,0x8753,0x8763,0x877C,/* 0xF0-0xF7 */
        0x8764,0x8759,0x8765,0x8793,0x87AF,0x87A8,0x87D2,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_F3[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9A5A,0x9A5B,0x9A5C,0x9A5D,0x9A5E,0x9A5F,0x9A60,0x9A61,/* 0x40-0x47 */
        0x9A62,0x9A63,0x9A64,0x9A65,0x9A66,0x9A67,0x9A68,0x9A69,/* 0x48-0x4F */
        0x9A6A,0x9A6B,0x9A72,0x9A83,0x9A89,0x9A8D,0x9A8E,0x9A94,/* 0x50-0x57 */
        0x9A95,0x9A99,0x9AA6,0x9AA9,0x9AAA,0x9AAB,0x9AAC,0x9AAD,/* 0x58-0x5F */
        0x9AAE,0x9AAF,0x9AB2,0x9AB3,0x9AB4,0x9AB5,0x9AB9,0x9ABB,/* 0x60-0x67 */
        0x9ABD,0x9ABE,0x9ABF,0x9AC3,0x9AC4,0x9AC6,0x9AC7,0x9AC8,/* 0x68-0x6F */
        0x9AC9,0x9ACA,0x9ACD,0x9ACE,0x9ACF,0x9AD0,0x9AD2,0x9AD4,/* 0x70-0x77 */
        0x9AD5,0x9AD6,0x9AD7,0x9AD9,0x9ADA,0x9ADB,0x9ADC,0x0000,/* 0x78-0x7F */

        0x9ADD,0x9ADE,0x9AE0,0x9AE2,0x9AE3,0x9AE4,0x9AE5,0x9AE7,/* 0x80-0x87 */
        0x9AE8,0x9AE9,0x9AEA,0x9AEC,0x9AEE,0x9AF0,0x9AF1,0x9AF2,/* 0x88-0x8F */
        0x9AF3,0x9AF4,0x9AF5,0x9AF6,0x9AF7,0x9AF8,0x9AFA,0x9AFC,/* 0x90-0x97 */
        0x9AFD,0x9AFE,0x9AFF,0x9B00,0x9B01,0x9B02,0x9B04,0x9B05,/* 0x98-0x9F */
        0x9B06,0x87C6,0x8788,0x8785,0x87AD,0x8797,0x8783,0x87AB,/* 0xA0-0xA7 */
        0x87E5,0x87AC,0x87B5,0x87B3,0x87CB,0x87D3,0x87BD,0x87D1,/* 0xA8-0xAF */
        0x87C0,0x87CA,0x87DB,0x87EA,0x87E0,0x87EE,0x8816,0x8813,/* 0xB0-0xB7 */
        0x87FE,0x880A,0x881B,0x8821,0x8839,0x883C,0x7F36,0x7F42,/* 0xB8-0xBF */
        0x7F44,0x7F45,0x8210,0x7AFA,0x7AFD,0x7B08,0x7B03,0x7B04,/* 0xC0-0xC7 */
        0x7B15,0x7B0A,0x7B2B,0x7B0F,0x7B47,0x7B38,0x7B2A,0x7B19,/* 0xC8-0xCF */
        0x7B2E,0x7B31,0x7B20,0x7B25,0x7B24,0x7B33,0x7B3E,0x7B1E,/* 0xD0-0xD7 */
        0x7B58,0x7B5A,0x7B45,0x7B75,0x7B4C,0x7B5D,0x7B60,0x7B6E,/* 0xD8-0xDF */
        0x7B7B,0x7B62,0x7B72,0x7B71,0x7B90,0x7BA6,0x7BA7,0x7BB8,/* 0xE0-0xE7 */
        0x7BAC,0x7B9D,0x7BA8,0x7B85,0x7BAA,0x7B9C,0x7BA2,0x7BAB,/* 0xE8-0xEF */
        0x7BB4,0x7BD1,0x7BC1,0x7BCC,0x7BDD,0x7BDA,0x7BE5,0x7BE6,/* 0xF0-0xF7 */
        0x7BEA,0x7C0C,0x7BFE,0x7BFC,0x7C0F,0x7C16,0x7C0B,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_F4[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9B07,0x9B09,0x9B0A,0x9B0B,0x9B0C,0x9B0D,0x9B0E,0x9B10,/* 0x40-0x47 */
        0x9B11,0x9B12,0x9B14,0x9B15,0x9B16,0x9B17,0x9B18,0x9B19,/* 0x48-0x4F */
        0x9B1A,0x9B1B,0x9B1C,0x9B1D,0x9B1E,0x9B20,0x9B21,0x9B22,/* 0x50-0x57 */
        0x9B24,0x9B25,0x9B26,0x9B27,0x9B28,0x9B29,0x9B2A,0x9B2B,/* 0x58-0x5F */
        0x9B2C,0x9B2D,0x9B2E,0x9B30,0x9B31,0x9B33,0x9B34,0x9B35,/* 0x60-0x67 */
        0x9B36,0x9B37,0x9B38,0x9B39,0x9B3A,0x9B3D,0x9B3E,0x9B3F,/* 0x68-0x6F */
        0x9B40,0x9B46,0x9B4A,0x9B4B,0x9B4C,0x9B4E,0x9B50,0x9B52,/* 0x70-0x77 */
        0x9B53,0x9B55,0x9B56,0x9B57,0x9B58,0x9B59,0x9B5A,0x0000,/* 0x78-0x7F */

        0x9B5B,0x9B5C,0x9B5D,0x9B5E,0x9B5F,0x9B60,0x9B61,0x9B62,/* 0x80-0x87 */
        0x9B63,0x9B64,0x9B65,0x9B66,0x9B67,0x9B68,0x9B69,0x9B6A,/* 0x88-0x8F */
        0x9B6B,0x9B6C,0x9B6D,0x9B6E,0x9B6F,0x9B70,0x9B71,0x9B72,/* 0x90-0x97 */
        0x9B73,0x9B74,0x9B75,0x9B76,0x9B77,0x9B78,0x9B79,0x9B7A,/* 0x98-0x9F */
        0x9B7B,0x7C1F,0x7C2A,0x7C26,0x7C38,0x7C41,0x7C40,0x81FE,/* 0xA0-0xA7 */
        0x8201,0x8202,0x8204,0x81EC,0x8844,0x8221,0x8222,0x8223,/* 0xA8-0xAF */
        0x822D,0x822F,0x8228,0x822B,0x8238,0x823B,0x8233,0x8234,/* 0xB0-0xB7 */
        0x823E,0x8244,0x8249,0x824B,0x824F,0x825A,0x825F,0x8268,/* 0xB8-0xBF */
        0x887E,0x8885,0x8888,0x88D8,0x88DF,0x895E,0x7F9D,0x7F9F,/* 0xC0-0xC7 */
        0x7FA7,0x7FAF,0x7FB0,0x7FB2,0x7C7C,0x6549,0x7C91,0x7C9D,/* 0xC8-0xCF */
        0x7C9C,0x7C9E,0x7CA2,0x7CB2,0x7CBC,0x7CBD,0x7CC1,0x7CC7,/* 0xD0-0xD7 */
        0x7CCC,0x7CCD,0x7CC8,0x7CC5,0x7CD7,0x7CE8,0x826E,0x66A8,/* 0xD8-0xDF */
        0x7FBF,0x7FCE,0x7FD5,0x7FE5,0x7FE1,0x7FE6,0x7FE9,0x7FEE,/* 0xE0-0xE7 */
        0x7FF3,0x7CF8,0x7D77,0x7DA6,0x7DAE,0x7E47,0x7E9B,0x9EB8,/* 0xE8-0xEF */
        0x9EB4,0x8D73,0x8D84,0x8D94,0x8D91,0x8DB1,0x8D67,0x8D6D,/* 0xF0-0xF7 */
        0x8C47,0x8C49,0x914A,0x9150,0x914E,0x914F,0x9164,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_F5[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9B7C,0x9B7D,0x9B7E,0x9B7F,0x9B80,0x9B81,0x9B82,0x9B83,/* 0x40-0x47 */
        0x9B84,0x9B85,0x9B86,0x9B87,0x9B88,0x9B89,0x9B8A,0x9B8B,/* 0x48-0x4F */
        0x9B8C,0x9B8D,0x9B8E,0x9B8F,0x9B90,0x9B91,0x9B92,0x9B93,/* 0x50-0x57 */
        0x9B94,0x9B95,0x9B96,0x9B97,0x9B98,0x9B99,0x9B9A,0x9B9B,/* 0x58-0x5F */
        0x9B9C,0x9B9D,0x9B9E,0x9B9F,0x9BA0,0x9BA1,0x9BA2,0x9BA3,/* 0x60-0x67 */
        0x9BA4,0x9BA5,0x9BA6,0x9BA7,0x9BA8,0x9BA9,0x9BAA,0x9BAB,/* 0x68-0x6F */
        0x9BAC,0x9BAD,0x9BAE,0x9BAF,0x9BB0,0x9BB1,0x9BB2,0x9BB3,/* 0x70-0x77 */
        0x9BB4,0x9BB5,0x9BB6,0x9BB7,0x9BB8,0x9BB9,0x9BBA,0x0000,/* 0x78-0x7F */

        0x9BBB,0x9BBC,0x9BBD,0x9BBE,0x9BBF,0x9BC0,0x9BC1,0x9BC2,/* 0x80-0x87 */
        0x9BC3,0x9BC4,0x9BC5,0x9BC6,0x9BC7,0x9BC8,0x9BC9,0x9BCA,/* 0x88-0x8F */
        0x9BCB,0x9BCC,0x9BCD,0x9BCE,0x9BCF,0x9BD0,0x9BD1,0x9BD2,/* 0x90-0x97 */
        0x9BD3,0x9BD4,0x9BD5,0x9BD6,0x9BD7,0x9BD8,0x9BD9,0x9BDA,/* 0x98-0x9F */
        0x9BDB,0x9162,0x9161,0x9170,0x9169,0x916F,0x917D,0x917E,/* 0xA0-0xA7 */
        0x9172,0x9174,0x9179,0x918C,0x9185,0x9190,0x918D,0x9191,/* 0xA8-0xAF */
        0x91A2,0x91A3,0x91AA,0x91AD,0x91AE,0x91AF,0x91B5,0x91B4,/* 0xB0-0xB7 */
        0x91BA,0x8C55,0x9E7E,0x8DB8,0x8DEB,0x8E05,0x8E59,0x8E69,/* 0xB8-0xBF */
        0x8DB5,0x8DBF,0x8DBC,0x8DBA,0x8DC4,0x8DD6,0x8DD7,0x8DDA,/* 0xC0-0xC7 */
        0x8DDE,0x8DCE,0x8DCF,0x8DDB,0x8DC6,0x8DEC,0x8DF7,0x8DF8,/* 0xC8-0xCF */
        0x8DE3,0x8DF9,0x8DFB,0x8DE4,0x8E09,0x8DFD,0x8E14,0x8E1D,/* 0xD0-0xD7 */
        0x8E1F,0x8E2C,0x8E2E,0x8E23,0x8E2F,0x8E3A,0x8E40,0x8E39,/* 0xD8-0xDF */
        0x8E35,0x8E3D,0x8E31,0x8E49,0x8E41,0x8E42,0x8E51,0x8E52,/* 0xE0-0xE7 */
        0x8E4A,0x8E70,0x8E76,0x8E7C,0x8E6F,0x8E74,0x8E85,0x8E8F,/* 0xE8-0xEF */
        0x8E94,0x8E90,0x8E9C,0x8E9E,0x8C78,0x8C82,0x8C8A,0x8C85,/* 0xF0-0xF7 */
        0x8C98,0x8C94,0x659B,0x89D6,0x89DE,0x89DA,0x89DC,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_F6[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9BDC,0x9BDD,0x9BDE,0x9BDF,0x9BE0,0x9BE1,0x9BE2,0x9BE3,/* 0x40-0x47 */
        0x9BE4,0x9BE5,0x9BE6,0x9BE7,0x9BE8,0x9BE9,0x9BEA,0x9BEB,/* 0x48-0x4F */
        0x9BEC,0x9BED,0x9BEE,0x9BEF,0x9BF0,0x9BF1,0x9BF2,0x9BF3,/* 0x50-0x57 */
        0x9BF4,0x9BF5,0x9BF6,0x9BF7,0x9BF8,0x9BF9,0x9BFA,0x9BFB,/* 0x58-0x5F */
        0x9BFC,0x9BFD,0x9BFE,0x9BFF,0x9C00,0x9C01,0x9C02,0x9C03,/* 0x60-0x67 */
        0x9C04,0x9C05,0x9C06,0x9C07,0x9C08,0x9C09,0x9C0A,0x9C0B,/* 0x68-0x6F */
        0x9C0C,0x9C0D,0x9C0E,0x9C0F,0x9C10,0x9C11,0x9C12,0x9C13,/* 0x70-0x77 */
        0x9C14,0x9C15,0x9C16,0x9C17,0x9C18,0x9C19,0x9C1A,0x0000,/* 0x78-0x7F */

        0x9C1B,0x9C1C,0x9C1D,0x9C1E,0x9C1F,0x9C20,0x9C21,0x9C22,/* 0x80-0x87 */
        0x9C23,0x9C24,0x9C25,0x9C26,0x9C27,0x9C28,0x9C29,0x9C2A,/* 0x88-0x8F */
        0x9C2B,0x9C2C,0x9C2D,0x9C2E,0x9C2F,0x9C30,0x9C31,0x9C32,/* 0x90-0x97 */
        0x9C33,0x9C34,0x9C35,0x9C36,0x9C37,0x9C38,0x9C39,0x9C3A,/* 0x98-0x9F */
        0x9C3B,0x89E5,0x89EB,0x89EF,0x8A3E,0x8B26,0x9753,0x96E9,/* 0xA0-0xA7 */
        0x96F3,0x96EF,0x9706,0x9701,0x9708,0x970F,0x970E,0x972A,/* 0xA8-0xAF */
        0x972D,0x9730,0x973E,0x9F80,0x9F83,0x9F85,0x9F86,0x9F87,/* 0xB0-0xB7 */
        0x9F88,0x9F89,0x9F8A,0x9F8C,0x9EFE,0x9F0B,0x9F0D,0x96B9,/* 0xB8-0xBF */
        0x96BC,0x96BD,0x96CE,0x96D2,0x77BF,0x96E0,0x928E,0x92AE,/* 0xC0-0xC7 */
        0x92C8,0x933E,0x936A,0x93CA,0x938F,0x943E,0x946B,0x9C7F,/* 0xC8-0xCF */
        0x9C82,0x9C85,0x9C86,0x9C87,0x9C88,0x7A23,0x9C8B,0x9C8E,/* 0xD0-0xD7 */
        0x9C90,0x9C91,0x9C92,0x9C94,0x9C95,0x9C9A,0x9C9B,0x9C9E,/* 0xD8-0xDF */
        0x9C9F,0x9CA0,0x9CA1,0x9CA2,0x9CA3,0x9CA5,0x9CA6,0x9CA7,/* 0xE0-0xE7 */
        0x9CA8,0x9CA9,0x9CAB,0x9CAD,0x9CAE,0x9CB0,0x9CB1,0x9CB2,/* 0xE8-0xEF */
        0x9CB3,0x9CB4,0x9CB5,0x9CB6,0x9CB7,0x9CBA,0x9CBB,0x9CBC,/* 0xF0-0xF7 */
        0x9CBD,0x9CC4,0x9CC5,0x9CC6,0x9CC7,0x9CCA,0x9CCB,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_F7[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9C3C,0x9C3D,0x9C3E,0x9C3F,0x9C40,0x9C41,0x9C42,0x9C43,/* 0x40-0x47 */
        0x9C44,0x9C45,0x9C46,0x9C47,0x9C48,0x9C49,0x9C4A,0x9C4B,/* 0x48-0x4F */
        0x9C4C,0x9C4D,0x9C4E,0x9C4F,0x9C50,0x9C51,0x9C52,0x9C53,/* 0x50-0x57 */
        0x9C54,0x9C55,0x9C56,0x9C57,0x9C58,0x9C59,0x9C5A,0x9C5B,/* 0x58-0x5F */
        0x9C5C,0x9C5D,0x9C5E,0x9C5F,0x9C60,0x9C61,0x9C62,0x9C63,/* 0x60-0x67 */
        0x9C64,0x9C65,0x9C66,0x9C67,0x9C68,0x9C69,0x9C6A,0x9C6B,/* 0x68-0x6F */
        0x9C6C,0x9C6D,0x9C6E,0x9C6F,0x9C70,0x9C71,0x9C72,0x9C73,/* 0x70-0x77 */
        0x9C74,0x9C75,0x9C76,0x9C77,0x9C78,0x9C79,0x9C7A,0x0000,/* 0x78-0x7F */

        0x9C7B,0x9C7D,0x9C7E,0x9C80,0x9C83,0x9C84,0x9C89,0x9C8A,/* 0x80-0x87 */
        0x9C8C,0x9C8F,0x9C93,0x9C96,0x9C97,0x9C98,0x9C99,0x9C9D,/* 0x88-0x8F */
        0x9CAA,0x9CAC,0x9CAF,0x9CB9,0x9CBE,0x9CBF,0x9CC0,0x9CC1,/* 0x90-0x97 */
        0x9CC2,0x9CC8,0x9CC9,0x9CD1,0x9CD2,0x9CDA,0x9CDB,0x9CE0,/* 0x98-0x9F */
        0x9CE1,0x9CCC,0x9CCD,0x9CCE,0x9CCF,0x9CD0,0x9CD3,0x9CD4,/* 0xA0-0xA7 */
        0x9CD5,0x9CD7,0x9CD8,0x9CD9,0x9CDC,0x9CDD,0x9CDF,0x9CE2,/* 0xA8-0xAF */
        0x977C,0x9785,0x9791,0x9792,0x9794,0x97AF,0x97AB,0x97A3,/* 0xB0-0xB7 */
        0x97B2,0x97B4,0x9AB1,0x9AB0,0x9AB7,0x9E58,0x9AB6,0x9ABA,/* 0xB8-0xBF */
        0x9ABC,0x9AC1,0x9AC0,0x9AC5,0x9AC2,0x9ACB,0x9ACC,0x9AD1,/* 0xC0-0xC7 */
        0x9B45,0x9B43,0x9B47,0x9B49,0x9B48,0x9B4D,0x9B51,0x98E8,/* 0xC8-0xCF */
        0x990D,0x992E,0x9955,0x9954,0x9ADF,0x9AE1,0x9AE6,0x9AEF,/* 0xD0-0xD7 */
        0x9AEB,0x9AFB,0x9AED,0x9AF9,0x9B08,0x9B0F,0x9B13,0x9B1F,/* 0xD8-0xDF */
        0x9B23,0x9EBD,0x9EBE,0x7E3B,0x9E82,0x9E87,0x9E88,0x9E8B,/* 0xE0-0xE7 */
        0x9E92,0x93D6,0x9E9D,0x9E9F,0x9EDB,0x9EDC,0x9EDD,0x9EE0,/* 0xE8-0xEF */
        0x9EDF,0x9EE2,0x9EE9,0x9EE7,0x9EE5,0x9EEA,0x9EEF,0x9F22,/* 0xF0-0xF7 */
        0x9F2C,0x9F2F,0x9F39,0x9F37,0x9F3D,0x9F3E,0x9F44,0x0000,/* 0xF8-0xFF */
};

static const wchar_t c2u_F8[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9CE3,0x9CE4,0x9CE5,0x9CE6,0x9CE7,0x9CE8,0x9CE9,0x9CEA,/* 0x40-0x47 */
        0x9CEB,0x9CEC,0x9CED,0x9CEE,0x9CEF,0x9CF0,0x9CF1,0x9CF2,/* 0x48-0x4F */
        0x9CF3,0x9CF4,0x9CF5,0x9CF6,0x9CF7,0x9CF8,0x9CF9,0x9CFA,/* 0x50-0x57 */
        0x9CFB,0x9CFC,0x9CFD,0x9CFE,0x9CFF,0x9D00,0x9D01,0x9D02,/* 0x58-0x5F */
        0x9D03,0x9D04,0x9D05,0x9D06,0x9D07,0x9D08,0x9D09,0x9D0A,/* 0x60-0x67 */
        0x9D0B,0x9D0C,0x9D0D,0x9D0E,0x9D0F,0x9D10,0x9D11,0x9D12,/* 0x68-0x6F */
        0x9D13,0x9D14,0x9D15,0x9D16,0x9D17,0x9D18,0x9D19,0x9D1A,/* 0x70-0x77 */
        0x9D1B,0x9D1C,0x9D1D,0x9D1E,0x9D1F,0x9D20,0x9D21,0x0000,/* 0x78-0x7F */

        0x9D22,0x9D23,0x9D24,0x9D25,0x9D26,0x9D27,0x9D28,0x9D29,/* 0x80-0x87 */
        0x9D2A,0x9D2B,0x9D2C,0x9D2D,0x9D2E,0x9D2F,0x9D30,0x9D31,/* 0x88-0x8F */
        0x9D32,0x9D33,0x9D34,0x9D35,0x9D36,0x9D37,0x9D38,0x9D39,/* 0x90-0x97 */
        0x9D3A,0x9D3B,0x9D3C,0x9D3D,0x9D3E,0x9D3F,0x9D40,0x9D41,/* 0x98-0x9F */
        0x9D42,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
};

static const wchar_t c2u_F9[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9D43,0x9D44,0x9D45,0x9D46,0x9D47,0x9D48,0x9D49,0x9D4A,/* 0x40-0x47 */
        0x9D4B,0x9D4C,0x9D4D,0x9D4E,0x9D4F,0x9D50,0x9D51,0x9D52,/* 0x48-0x4F */
        0x9D53,0x9D54,0x9D55,0x9D56,0x9D57,0x9D58,0x9D59,0x9D5A,/* 0x50-0x57 */
        0x9D5B,0x9D5C,0x9D5D,0x9D5E,0x9D5F,0x9D60,0x9D61,0x9D62,/* 0x58-0x5F */
        0x9D63,0x9D64,0x9D65,0x9D66,0x9D67,0x9D68,0x9D69,0x9D6A,/* 0x60-0x67 */
        0x9D6B,0x9D6C,0x9D6D,0x9D6E,0x9D6F,0x9D70,0x9D71,0x9D72,/* 0x68-0x6F */
        0x9D73,0x9D74,0x9D75,0x9D76,0x9D77,0x9D78,0x9D79,0x9D7A,/* 0x70-0x77 */
        0x9D7B,0x9D7C,0x9D7D,0x9D7E,0x9D7F,0x9D80,0x9D81,0x0000,/* 0x78-0x7F */

        0x9D82,0x9D83,0x9D84,0x9D85,0x9D86,0x9D87,0x9D88,0x9D89,/* 0x80-0x87 */
        0x9D8A,0x9D8B,0x9D8C,0x9D8D,0x9D8E,0x9D8F,0x9D90,0x9D91,/* 0x88-0x8F */
        0x9D92,0x9D93,0x9D94,0x9D95,0x9D96,0x9D97,0x9D98,0x9D99,/* 0x90-0x97 */
        0x9D9A,0x9D9B,0x9D9C,0x9D9D,0x9D9E,0x9D9F,0x9DA0,0x9DA1,/* 0x98-0x9F */
        0x9DA2,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
};

static const wchar_t c2u_FA[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9DA3,0x9DA4,0x9DA5,0x9DA6,0x9DA7,0x9DA8,0x9DA9,0x9DAA,/* 0x40-0x47 */
        0x9DAB,0x9DAC,0x9DAD,0x9DAE,0x9DAF,0x9DB0,0x9DB1,0x9DB2,/* 0x48-0x4F */
        0x9DB3,0x9DB4,0x9DB5,0x9DB6,0x9DB7,0x9DB8,0x9DB9,0x9DBA,/* 0x50-0x57 */
        0x9DBB,0x9DBC,0x9DBD,0x9DBE,0x9DBF,0x9DC0,0x9DC1,0x9DC2,/* 0x58-0x5F */
        0x9DC3,0x9DC4,0x9DC5,0x9DC6,0x9DC7,0x9DC8,0x9DC9,0x9DCA,/* 0x60-0x67 */
        0x9DCB,0x9DCC,0x9DCD,0x9DCE,0x9DCF,0x9DD0,0x9DD1,0x9DD2,/* 0x68-0x6F */
        0x9DD3,0x9DD4,0x9DD5,0x9DD6,0x9DD7,0x9DD8,0x9DD9,0x9DDA,/* 0x70-0x77 */
        0x9DDB,0x9DDC,0x9DDD,0x9DDE,0x9DDF,0x9DE0,0x9DE1,0x0000,/* 0x78-0x7F */

        0x9DE2,0x9DE3,0x9DE4,0x9DE5,0x9DE6,0x9DE7,0x9DE8,0x9DE9,/* 0x80-0x87 */
        0x9DEA,0x9DEB,0x9DEC,0x9DED,0x9DEE,0x9DEF,0x9DF0,0x9DF1,/* 0x88-0x8F */
        0x9DF2,0x9DF3,0x9DF4,0x9DF5,0x9DF6,0x9DF7,0x9DF8,0x9DF9,/* 0x90-0x97 */
        0x9DFA,0x9DFB,0x9DFC,0x9DFD,0x9DFE,0x9DFF,0x9E00,0x9E01,/* 0x98-0x9F */
        0x9E02,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
};

static const wchar_t c2u_FB[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9E03,0x9E04,0x9E05,0x9E06,0x9E07,0x9E08,0x9E09,0x9E0A,/* 0x40-0x47 */
        0x9E0B,0x9E0C,0x9E0D,0x9E0E,0x9E0F,0x9E10,0x9E11,0x9E12,/* 0x48-0x4F */
        0x9E13,0x9E14,0x9E15,0x9E16,0x9E17,0x9E18,0x9E19,0x9E1A,/* 0x50-0x57 */
        0x9E1B,0x9E1C,0x9E1D,0x9E1E,0x9E24,0x9E27,0x9E2E,0x9E30,/* 0x58-0x5F */
        0x9E34,0x9E3B,0x9E3C,0x9E40,0x9E4D,0x9E50,0x9E52,0x9E53,/* 0x60-0x67 */
        0x9E54,0x9E56,0x9E59,0x9E5D,0x9E5F,0x9E60,0x9E61,0x9E62,/* 0x68-0x6F */
        0x9E65,0x9E6E,0x9E6F,0x9E72,0x9E74,0x9E75,0x9E76,0x9E77,/* 0x70-0x77 */
        0x9E78,0x9E79,0x9E7A,0x9E7B,0x9E7C,0x9E7D,0x9E80,0x0000,/* 0x78-0x7F */

        0x9E81,0x9E83,0x9E84,0x9E85,0x9E86,0x9E89,0x9E8A,0x9E8C,/* 0x80-0x87 */
        0x9E8D,0x9E8E,0x9E8F,0x9E90,0x9E91,0x9E94,0x9E95,0x9E96,/* 0x88-0x8F */
        0x9E97,0x9E98,0x9E99,0x9E9A,0x9E9B,0x9E9C,0x9E9E,0x9EA0,/* 0x90-0x97 */
        0x9EA1,0x9EA2,0x9EA3,0x9EA4,0x9EA5,0x9EA7,0x9EA8,0x9EA9,/* 0x98-0x9F */
        0x9EAA,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
};

static const wchar_t c2u_FC[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9EAB,0x9EAC,0x9EAD,0x9EAE,0x9EAF,0x9EB0,0x9EB1,0x9EB2,/* 0x40-0x47 */
        0x9EB3,0x9EB5,0x9EB6,0x9EB7,0x9EB9,0x9EBA,0x9EBC,0x9EBF,/* 0x48-0x4F */
        0x9EC0,0x9EC1,0x9EC2,0x9EC3,0x9EC5,0x9EC6,0x9EC7,0x9EC8,/* 0x50-0x57 */
        0x9ECA,0x9ECB,0x9ECC,0x9ED0,0x9ED2,0x9ED3,0x9ED5,0x9ED6,/* 0x58-0x5F */
        0x9ED7,0x9ED9,0x9EDA,0x9EDE,0x9EE1,0x9EE3,0x9EE4,0x9EE6,/* 0x60-0x67 */
        0x9EE8,0x9EEB,0x9EEC,0x9EED,0x9EEE,0x9EF0,0x9EF1,0x9EF2,/* 0x68-0x6F */
        0x9EF3,0x9EF4,0x9EF5,0x9EF6,0x9EF7,0x9EF8,0x9EFA,0x9EFD,/* 0x70-0x77 */
        0x9EFF,0x9F00,0x9F01,0x9F02,0x9F03,0x9F04,0x9F05,0x0000,/* 0x78-0x7F */

        0x9F06,0x9F07,0x9F08,0x9F09,0x9F0A,0x9F0C,0x9F0F,0x9F11,/* 0x80-0x87 */
        0x9F12,0x9F14,0x9F15,0x9F16,0x9F18,0x9F1A,0x9F1B,0x9F1C,/* 0x88-0x8F */
        0x9F1D,0x9F1E,0x9F1F,0x9F21,0x9F23,0x9F24,0x9F25,0x9F26,/* 0x90-0x97 */
        0x9F27,0x9F28,0x9F29,0x9F2A,0x9F2B,0x9F2D,0x9F2E,0x9F30,/* 0x98-0x9F */
        0x9F31,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
};

static const wchar_t c2u_FD[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0x9F32,0x9F33,0x9F34,0x9F35,0x9F36,0x9F38,0x9F3A,0x9F3C,/* 0x40-0x47 */
        0x9F3F,0x9F40,0x9F41,0x9F42,0x9F43,0x9F45,0x9F46,0x9F47,/* 0x48-0x4F */
        0x9F48,0x9F49,0x9F4A,0x9F4B,0x9F4C,0x9F4D,0x9F4E,0x9F4F,/* 0x50-0x57 */
        0x9F52,0x9F53,0x9F54,0x9F55,0x9F56,0x9F57,0x9F58,0x9F59,/* 0x58-0x5F */
        0x9F5A,0x9F5B,0x9F5C,0x9F5D,0x9F5E,0x9F5F,0x9F60,0x9F61,/* 0x60-0x67 */
        0x9F62,0x9F63,0x9F64,0x9F65,0x9F66,0x9F67,0x9F68,0x9F69,/* 0x68-0x6F */
        0x9F6A,0x9F6B,0x9F6C,0x9F6D,0x9F6E,0x9F6F,0x9F70,0x9F71,/* 0x70-0x77 */
        0x9F72,0x9F73,0x9F74,0x9F75,0x9F76,0x9F77,0x9F78,0x0000,/* 0x78-0x7F */

        0x9F79,0x9F7A,0x9F7B,0x9F7C,0x9F7D,0x9F7E,0x9F81,0x9F82,/* 0x80-0x87 */
        0x9F8D,0x9F8E,0x9F8F,0x9F90,0x9F91,0x9F92,0x9F93,0x9F94,/* 0x88-0x8F */
        0x9F95,0x9F96,0x9F97,0x9F98,0x9F9C,0x9F9D,0x9F9E,0x9FA1,/* 0x90-0x97 */
        0x9FA2,0x9FA3,0x9FA4,0x9FA5,0xF92C,0xF979,0xF995,0xF9E7,/* 0x98-0x9F */
        0xF9F1,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0xA0-0xA7 */
};

static const wchar_t c2u_FE[256] = {
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x00-0x07 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x08-0x0F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x10-0x17 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x18-0x1F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x20-0x27 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x28-0x2F */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x30-0x37 */
        0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,/* 0x38-0x3F */
        0xFA0C,0xFA0D,0xFA0E,0xFA0F,0xFA11,0xFA13,0xFA14,0xFA18,/* 0x40-0x47 */
        0xFA1F,0xFA20,0xFA21,0xFA23,0xFA24,0xFA27,0xFA28,0xFA29,/* 0x48-0x4F */
};

static const wchar_t *page_charset2uni[256] = {
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   c2u_81, c2u_82, c2u_83, c2u_84, c2u_85, c2u_86, c2u_87, 
        c2u_88, c2u_89, c2u_8A, c2u_8B, c2u_8C, c2u_8D, c2u_8E, c2u_8F, 
        c2u_90, c2u_91, c2u_92, c2u_93, c2u_94, c2u_95, c2u_96, c2u_97, 
        c2u_98, c2u_99, c2u_9A, c2u_9B, c2u_9C, c2u_9D, c2u_9E, c2u_9F, 
        c2u_A0, c2u_A1, c2u_A2, c2u_A3, c2u_A4, c2u_A5, c2u_A6, c2u_A7, 
        c2u_A8, c2u_A9, c2u_AA, c2u_AB, c2u_AC, c2u_AD, c2u_AE, c2u_AF, 
        c2u_B0, c2u_B1, c2u_B2, c2u_B3, c2u_B4, c2u_B5, c2u_B6, c2u_B7, 
        c2u_B8, c2u_B9, c2u_BA, c2u_BB, c2u_BC, c2u_BD, c2u_BE, c2u_BF, 
        c2u_C0, c2u_C1, c2u_C2, c2u_C3, c2u_C4, c2u_C5, c2u_C6, c2u_C7, 
        c2u_C8, c2u_C9, c2u_CA, c2u_CB, c2u_CC, c2u_CD, c2u_CE, c2u_CF, 
        c2u_D0, c2u_D1, c2u_D2, c2u_D3, c2u_D4, c2u_D5, c2u_D6, c2u_D7, 
        c2u_D8, c2u_D9, c2u_DA, c2u_DB, c2u_DC, c2u_DD, c2u_DE, c2u_DF, 
        c2u_E0, c2u_E1, c2u_E2, c2u_E3, c2u_E4, c2u_E5, c2u_E6, c2u_E7, 
        c2u_E8, c2u_E9, c2u_EA, c2u_EB, c2u_EC, c2u_ED, c2u_EE, c2u_EF, 
        c2u_F0, c2u_F1, c2u_F2, c2u_F3, c2u_F4, c2u_F5, c2u_F6, c2u_F7, 
        c2u_F8, c2u_F9, c2u_FA, c2u_FB, c2u_FC, c2u_FD, c2u_FE, NULL,   
};

/*
 * 512-byte lookup table holding 256 two-byte entries: entry i occupies
 * bytes [2*i] and [2*i + 1], and the comment at the end of each row gives
 * the range of entry indices that row covers.  All 256 entries are spelled
 * out explicitly here.
 *
 * NOTE(review): judging by the name this is presumably the
 * Unicode-to-charset mapping for code points U+00xx, with an all-zero pair
 * meaning "no mapping" -- confirm against the code that indexes it.
 */
static const unsigned char u2c_00[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
        0xA1, 0xE8, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xEC, /* 0xA4-0xA7 */
        0xA1, 0xA7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
        0xA1, 0xE3, 0xA1, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xA4, /* 0xB4-0xB7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC1, /* 0xD4-0xD7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
        0xA8, 0xA4, 0xA8, 0xA2, 0x00, 0x00, 0x00, 0x00, /* 0xE0-0xE3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
        0xA8, 0xA8, 0xA8, 0xA6, 0xA8, 0xBA, 0x00, 0x00, /* 0xE8-0xEB */
        0xA8, 0xAC, 0xA8, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
        0x00, 0x00, 0x00, 0x00, 0xA8, 0xB0, 0xA8, 0xAE, /* 0xF0-0xF3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC2, /* 0xF4-0xF7 */
        0x00, 0x00, 0xA8, 0xB4, 0xA8, 0xB2, 0x00, 0x00, /* 0xF8-0xFB */
        0xA8, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
};

/*
 * 512-byte lookup table holding 256 two-byte entries: entry i occupies
 * bytes [2*i] and [2*i + 1], and the comment at the end of each row gives
 * the range of entry indices that row covers.  The initializer stops after
 * the 0xDC-0xDF row, so entries 0xE0-0xFF are implicitly zero.
 *
 * NOTE(review): judging by the name this is presumably the
 * Unicode-to-charset mapping for code points U+01xx, with an all-zero pair
 * meaning "no mapping" -- confirm against the code that indexes it.
 */
static const unsigned char u2c_01[512] = {
        0xA8, 0xA1, 0xA8, 0xA1, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0x00, 0x00, 0x00, 0x00, 0xA8, 0xA5, 0xA8, 0xA5, /* 0x10-0x13 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
        0x00, 0x00, 0x00, 0x00, 0xA8, 0xA7, 0xA8, 0xA7, /* 0x18-0x1B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
        0x00, 0x00, 0x00, 0x00, 0xA8, 0xA9, 0xA8, 0xA9, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
        0xA8, 0xBD, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
        0xA8, 0xBE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
        0xA8, 0xAD, 0xA8, 0xAD, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
        0x00, 0x00, 0x00, 0x00, 0xA8, 0xB1, 0xA8, 0xB1, /* 0x68-0x6B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
        
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
        0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
        0x00, 0x00, 0xA8, 0xA3, 0xA8, 0xA3, 0xA8, 0xAB, /* 0xCC-0xCF */
        0xA8, 0xAB, 0xA8, 0xAF, 0xA8, 0xAF, 0xA8, 0xB3, /* 0xD0-0xD3 */
        0xA8, 0xB3, 0xA8, 0xB5, 0xA8, 0xB5, 0xA8, 0xB6, /* 0xD4-0xD7 */
        0xA8, 0xB6, 0xA8, 0xB7, 0xA8, 0xB7, 0xA8, 0xB8, /* 0xD8-0xDB */
        0xA8, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
};

/*
 * 512-byte lookup table holding 256 two-byte entries: entry i occupies
 * bytes [2*i] and [2*i + 1], and the comment at the end of each row gives
 * the range of entry indices that row covers.  The initializer stops after
 * the 0xD8-0xDB row, so entries 0xDC-0xFF are implicitly zero.
 *
 * NOTE(review): judging by the name this is presumably the
 * Unicode-to-charset mapping for code points U+02xx, with an all-zero pair
 * meaning "no mapping" -- confirm against the code that indexes it.
 */
static const unsigned char u2c_02[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
        0x00, 0x00, 0xA8, 0xBB, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
        0x00, 0x00, 0xA8, 0xC0, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
        
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xA6, /* 0xC4-0xC7 */
        0x00, 0x00, 0xA1, 0xA5, 0xA8, 0x40, 0xA8, 0x41, /* 0xC8-0xCB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
        0x00, 0x00, 0xA8, 0x42, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
};

/*
 * 512-byte lookup table holding 256 two-byte entries: entry i occupies
 * bytes [2*i] and [2*i + 1], and the comment at the end of each row gives
 * the range of entry indices that row covers.  The initializer stops after
 * the 0xC8-0xCB row, so entries 0xCC-0xFF are implicitly zero.
 *
 * NOTE(review): judging by the name this is presumably the
 * Unicode-to-charset mapping for code points U+03xx, with an all-zero pair
 * meaning "no mapping" -- confirm against the code that indexes it.
 */
static const unsigned char u2c_03[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
        
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
        0x00, 0x00, 0xA6, 0xA1, 0xA6, 0xA2, 0xA6, 0xA3, /* 0x90-0x93 */
        0xA6, 0xA4, 0xA6, 0xA5, 0xA6, 0xA6, 0xA6, 0xA7, /* 0x94-0x97 */
        0xA6, 0xA8, 0xA6, 0xA9, 0xA6, 0xAA, 0xA6, 0xAB, /* 0x98-0x9B */
        0xA6, 0xAC, 0xA6, 0xAD, 0xA6, 0xAE, 0xA6, 0xAF, /* 0x9C-0x9F */
        0xA6, 0xB0, 0xA6, 0xB1, 0x00, 0x00, 0xA6, 0xB2, /* 0xA0-0xA3 */
        0xA6, 0xB3, 0xA6, 0xB4, 0xA6, 0xB5, 0xA6, 0xB6, /* 0xA4-0xA7 */
        0xA6, 0xB7, 0xA6, 0xB8, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
        0x00, 0x00, 0xA6, 0xC1, 0xA6, 0xC2, 0xA6, 0xC3, /* 0xB0-0xB3 */
        0xA6, 0xC4, 0xA6, 0xC5, 0xA6, 0xC6, 0xA6, 0xC7, /* 0xB4-0xB7 */
        0xA6, 0xC8, 0xA6, 0xC9, 0xA6, 0xCA, 0xA6, 0xCB, /* 0xB8-0xBB */
        0xA6, 0xCC, 0xA6, 0xCD, 0xA6, 0xCE, 0xA6, 0xCF, /* 0xBC-0xBF */
        0xA6, 0xD0, 0xA6, 0xD1, 0x00, 0x00, 0xA6, 0xD2, /* 0xC0-0xC3 */
        0xA6, 0xD3, 0xA6, 0xD4, 0xA6, 0xD5, 0xA6, 0xD6, /* 0xC4-0xC7 */
        0xA6, 0xD7, 0xA6, 0xD8, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
};

/*
 * 512-byte lookup table holding 256 two-byte entries: entry i occupies
 * bytes [2*i] and [2*i + 1], and the comment at the end of each row gives
 * the range of entry indices that row covers.  The initializer stops after
 * the 0x50-0x53 row, so entries 0x54-0xFF are implicitly zero.
 *
 * NOTE(review): judging by the name this is presumably the
 * Unicode-to-charset mapping for code points U+04xx, with an all-zero pair
 * meaning "no mapping" -- confirm against the code that indexes it.
 */
static const unsigned char u2c_04[512] = {
        0x00, 0x00, 0xA7, 0xA7, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0xA7, 0xA1, 0xA7, 0xA2, 0xA7, 0xA3, 0xA7, 0xA4, /* 0x10-0x13 */
        0xA7, 0xA5, 0xA7, 0xA6, 0xA7, 0xA8, 0xA7, 0xA9, /* 0x14-0x17 */
        0xA7, 0xAA, 0xA7, 0xAB, 0xA7, 0xAC, 0xA7, 0xAD, /* 0x18-0x1B */
        0xA7, 0xAE, 0xA7, 0xAF, 0xA7, 0xB0, 0xA7, 0xB1, /* 0x1C-0x1F */
        0xA7, 0xB2, 0xA7, 0xB3, 0xA7, 0xB4, 0xA7, 0xB5, /* 0x20-0x23 */
        0xA7, 0xB6, 0xA7, 0xB7, 0xA7, 0xB8, 0xA7, 0xB9, /* 0x24-0x27 */
        0xA7, 0xBA, 0xA7, 0xBB, 0xA7, 0xBC, 0xA7, 0xBD, /* 0x28-0x2B */
        0xA7, 0xBE, 0xA7, 0xBF, 0xA7, 0xC0, 0xA7, 0xC1, /* 0x2C-0x2F */
        0xA7, 0xD1, 0xA7, 0xD2, 0xA7, 0xD3, 0xA7, 0xD4, /* 0x30-0x33 */
        0xA7, 0xD5, 0xA7, 0xD6, 0xA7, 0xD8, 0xA7, 0xD9, /* 0x34-0x37 */
        0xA7, 0xDA, 0xA7, 0xDB, 0xA7, 0xDC, 0xA7, 0xDD, /* 0x38-0x3B */
        0xA7, 0xDE, 0xA7, 0xDF, 0xA7, 0xE0, 0xA7, 0xE1, /* 0x3C-0x3F */
        0xA7, 0xE2, 0xA7, 0xE3, 0xA7, 0xE4, 0xA7, 0xE5, /* 0x40-0x43 */
        0xA7, 0xE6, 0xA7, 0xE7, 0xA7, 0xE8, 0xA7, 0xE9, /* 0x44-0x47 */
        0xA7, 0xEA, 0xA7, 0xEB, 0xA7, 0xEC, 0xA7, 0xED, /* 0x48-0x4B */
        0xA7, 0xEE, 0xA7, 0xEF, 0xA7, 0xF0, 0xA7, 0xF1, /* 0x4C-0x4F */
        0x00, 0x00, 0xA7, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
};

/*
 * 512-byte lookup table holding 256 two-byte entries: entry i occupies
 * bytes [2*i] and [2*i + 1], and the comment at the end of each row gives
 * the range of entry indices that row covers.  The initializer stops after
 * the 0x3C-0x3F row, so entries 0x40-0xFF are implicitly zero.
 *
 * NOTE(review): judging by the name this is presumably the
 * Unicode-to-charset mapping for code points U+20xx, with an all-zero pair
 * meaning "no mapping" -- confirm against the code that indexes it.
 */
static const unsigned char u2c_20[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0xA9, 0x5C, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x43, /* 0x10-0x13 */
        0xA1, 0xAA, 0xA8, 0x44, 0xA1, 0xAC, 0x00, 0x00, /* 0x14-0x17 */
        0xA1, 0xAE, 0xA1, 0xAF, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
        0xA1, 0xB0, 0xA1, 0xB1, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
        0x00, 0x00, 0xA8, 0x45, 0xA1, 0xAD, 0x00, 0x00, /* 0x24-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
        0xA1, 0xEB, 0x00, 0x00, 0xA1, 0xE4, 0xA1, 0xE5, /* 0x30-0x33 */
        0x00, 0x00, 0xA8, 0x46, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF9, /* 0x38-0x3B */
        0x00, 0x00, 0x00, 0x00, 0xA3, 0xFE, 0x00, 0x00, /* 0x3C-0x3F */
};

/*
 * 512-byte lookup table holding 256 two-byte entries: entry i occupies
 * bytes [2*i] and [2*i + 1], and the comment at the end of each row gives
 * the range of entry indices that row covers.  The initializer stops after
 * the 0x98-0x9B row, so entries 0x9C-0xFF are implicitly zero.
 *
 * NOTE(review): judging by the name this is presumably the
 * Unicode-to-charset mapping for code points U+21xx, with an all-zero pair
 * meaning "no mapping" -- confirm against the code that indexes it.
 */
static const unsigned char u2c_21[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xE6, /* 0x00-0x03 */
        0x00, 0x00, 0xA8, 0x47, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0xA8, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
        0x00, 0x00, 0x00, 0x00, 0xA1, 0xED, 0x00, 0x00, /* 0x14-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
        0x00, 0x00, 0xA9, 0x59, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
        0xA2, 0xF1, 0xA2, 0xF2, 0xA2, 0xF3, 0xA2, 0xF4, /* 0x60-0x63 */
        0xA2, 0xF5, 0xA2, 0xF6, 0xA2, 0xF7, 0xA2, 0xF8, /* 0x64-0x67 */
        0xA2, 0xF9, 0xA2, 0xFA, 0xA2, 0xFB, 0xA2, 0xFC, /* 0x68-0x6B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
        0xA2, 0xA1, 0xA2, 0xA2, 0xA2, 0xA3, 0xA2, 0xA4, /* 0x70-0x73 */
        0xA2, 0xA5, 0xA2, 0xA6, 0xA2, 0xA7, 0xA2, 0xA8, /* 0x74-0x77 */
        0xA2, 0xA9, 0xA2, 0xAA, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
        
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
        0xA1, 0xFB, 0xA1, 0xFC, 0xA1, 0xFA, 0xA1, 0xFD, /* 0x90-0x93 */
        0x00, 0x00, 0x00, 0x00, 0xA8, 0x49, 0xA8, 0x4A, /* 0x94-0x97 */
        0xA8, 0x4B, 0xA8, 0x4C, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
};

/*
 * 512-byte lookup table holding 256 two-byte entries: entry i occupies
 * bytes [2*i] and [2*i + 1], and the comment at the end of each row gives
 * the range of entry indices that row covers.  The initializer stops after
 * the 0xBC-0xBF row, so entries 0xC0-0xFF are implicitly zero.
 *
 * NOTE(review): judging by the name this is presumably the
 * Unicode-to-charset mapping for code points U+22xx, with an all-zero pair
 * meaning "no mapping" -- confirm against the code that indexes it.
 */
static const unsigned char u2c_22[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0xA1, 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xC7, /* 0x0C-0x0F */
        0x00, 0x00, 0xA1, 0xC6, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
        0x00, 0x00, 0xA8, 0x4D, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
        0xA1, 0xE3, 0x00, 0x00, 0xA1, 0xCC, 0x00, 0x00, /* 0x18-0x1B */
        0x00, 0x00, 0xA1, 0xD8, 0xA1, 0xDE, 0xA8, 0x4E, /* 0x1C-0x1F */
        0xA1, 0xCF, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x4F, /* 0x20-0x23 */
        0x00, 0x00, 0xA1, 0xCE, 0x00, 0x00, 0xA1, 0xC4, /* 0x24-0x27 */
        0xA1, 0xC5, 0xA1, 0xC9, 0xA1, 0xC8, 0xA1, 0xD2, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0xA1, 0xD3, 0x00, 0x00, /* 0x2C-0x2F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
        0xA1, 0xE0, 0xA1, 0xDF, 0xA1, 0xC3, 0xA1, 0xCB, /* 0x34-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
        0xA1, 0xAB, 0xA1, 0xD7, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
        0xA1, 0xD6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
        0xA1, 0xD5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
        0x00, 0x00, 0x00, 0x00, 0xA8, 0x50, 0x00, 0x00, /* 0x50-0x53 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
        0xA1, 0xD9, 0xA1, 0xD4, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
        0xA1, 0xDC, 0xA1, 0xDD, 0xA8, 0x51, 0xA8, 0x52, /* 0x64-0x67 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
        0x00, 0x00, 0x00, 0x00, 0xA1, 0xDA, 0xA1, 0xDB, /* 0x6C-0x6F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
        
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
        0x00, 0x00, 0xA8, 0x92, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
        0x00, 0x00, 0xA1, 0xD1, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
        0x00, 0x00, 0xA1, 0xCD, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x53, /* 0xBC-0xBF */
};

/*
 * 512-byte lookup table holding 256 two-byte entries: entry i occupies
 * bytes [2*i] and [2*i + 1], and the comment at the end of each row gives
 * the range of entry indices that row covers.  The initializer stops after
 * the 0x10-0x13 row, so entries 0x14-0xFF are implicitly zero; the only
 * non-zero pair is the one at entry 0x12.
 *
 * NOTE(review): judging by the name this is presumably the
 * Unicode-to-charset mapping for code points U+23xx, with an all-zero pair
 * meaning "no mapping" -- confirm against the code that indexes it.
 */
static const unsigned char u2c_23[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0x00, 0x00, 0x00, 0x00, 0xA1, 0xD0, 0x00, 0x00, /* 0x10-0x13 */
};

/*
 * 512-byte lookup table holding 256 two-byte entries: entry i occupies
 * bytes [2*i] and [2*i + 1], and the comment at the end of each row gives
 * the range of entry indices that row covers.  The initializer stops after
 * the 0x98-0x9B row, so entries 0x9C-0xFF are implicitly zero.
 *
 * NOTE(review): judging by the name this is presumably the
 * Unicode-to-charset mapping for code points U+24xx, with an all-zero pair
 * meaning "no mapping" -- confirm against the code that indexes it.
 */
static const unsigned char u2c_24[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
        0xA2, 0xD9, 0xA2, 0xDA, 0xA2, 0xDB, 0xA2, 0xDC, /* 0x60-0x63 */
        0xA2, 0xDD, 0xA2, 0xDE, 0xA2, 0xDF, 0xA2, 0xE0, /* 0x64-0x67 */
        0xA2, 0xE1, 0xA2, 0xE2, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
        0xA2, 0xC5, 0xA2, 0xC6, 0xA2, 0xC7, 0xA2, 0xC8, /* 0x74-0x77 */
        0xA2, 0xC9, 0xA2, 0xCA, 0xA2, 0xCB, 0xA2, 0xCC, /* 0x78-0x7B */
        0xA2, 0xCD, 0xA2, 0xCE, 0xA2, 0xCF, 0xA2, 0xD0, /* 0x7C-0x7F */
        
        0xA2, 0xD1, 0xA2, 0xD2, 0xA2, 0xD3, 0xA2, 0xD4, /* 0x80-0x83 */
        0xA2, 0xD5, 0xA2, 0xD6, 0xA2, 0xD7, 0xA2, 0xD8, /* 0x84-0x87 */
        0xA2, 0xB1, 0xA2, 0xB2, 0xA2, 0xB3, 0xA2, 0xB4, /* 0x88-0x8B */
        0xA2, 0xB5, 0xA2, 0xB6, 0xA2, 0xB7, 0xA2, 0xB8, /* 0x8C-0x8F */
        0xA2, 0xB9, 0xA2, 0xBA, 0xA2, 0xBB, 0xA2, 0xBC, /* 0x90-0x93 */
        0xA2, 0xBD, 0xA2, 0xBE, 0xA2, 0xBF, 0xA2, 0xC0, /* 0x94-0x97 */
        0xA2, 0xC1, 0xA2, 0xC2, 0xA2, 0xC3, 0xA2, 0xC4, /* 0x98-0x9B */
};

/*
 * 512-byte lookup table holding 256 two-byte entries: entry i occupies
 * bytes [2*i] and [2*i + 1], and the comment at the end of each row gives
 * the range of entry indices that row covers.  All 256 entries are spelled
 * out explicitly here.
 *
 * NOTE(review): judging by the name this is presumably the
 * Unicode-to-charset mapping for code points U+25xx, with an all-zero pair
 * meaning "no mapping" -- confirm against the code that indexes it.
 */
static const unsigned char u2c_25[512] = {
        0xA9, 0xA4, 0xA9, 0xA5, 0xA9, 0xA6, 0xA9, 0xA7, /* 0x00-0x03 */
        0xA9, 0xA8, 0xA9, 0xA9, 0xA9, 0xAA, 0xA9, 0xAB, /* 0x04-0x07 */
        0xA9, 0xAC, 0xA9, 0xAD, 0xA9, 0xAE, 0xA9, 0xAF, /* 0x08-0x0B */
        0xA9, 0xB0, 0xA9, 0xB1, 0xA9, 0xB2, 0xA9, 0xB3, /* 0x0C-0x0F */
        0xA9, 0xB4, 0xA9, 0xB5, 0xA9, 0xB6, 0xA9, 0xB7, /* 0x10-0x13 */
        0xA9, 0xB8, 0xA9, 0xB9, 0xA9, 0xBA, 0xA9, 0xBB, /* 0x14-0x17 */
        0xA9, 0xBC, 0xA9, 0xBD, 0xA9, 0xBE, 0xA9, 0xBF, /* 0x18-0x1B */
        0xA9, 0xC0, 0xA9, 0xC1, 0xA9, 0xC2, 0xA9, 0xC3, /* 0x1C-0x1F */
        0xA9, 0xC4, 0xA9, 0xC5, 0xA9, 0xC6, 0xA9, 0xC7, /* 0x20-0x23 */
        0xA9, 0xC8, 0xA9, 0xC9, 0xA9, 0xCA, 0xA9, 0xCB, /* 0x24-0x27 */
        0xA9, 0xCC, 0xA9, 0xCD, 0xA9, 0xCE, 0xA9, 0xCF, /* 0x28-0x2B */
        0xA9, 0xD0, 0xA9, 0xD1, 0xA9, 0xD2, 0xA9, 0xD3, /* 0x2C-0x2F */
        0xA9, 0xD4, 0xA9, 0xD5, 0xA9, 0xD6, 0xA9, 0xD7, /* 0x30-0x33 */
        0xA9, 0xD8, 0xA9, 0xD9, 0xA9, 0xDA, 0xA9, 0xDB, /* 0x34-0x37 */
        0xA9, 0xDC, 0xA9, 0xDD, 0xA9, 0xDE, 0xA9, 0xDF, /* 0x38-0x3B */
        0xA9, 0xE0, 0xA9, 0xE1, 0xA9, 0xE2, 0xA9, 0xE3, /* 0x3C-0x3F */
        0xA9, 0xE4, 0xA9, 0xE5, 0xA9, 0xE6, 0xA9, 0xE7, /* 0x40-0x43 */
        0xA9, 0xE8, 0xA9, 0xE9, 0xA9, 0xEA, 0xA9, 0xEB, /* 0x44-0x47 */
        0xA9, 0xEC, 0xA9, 0xED, 0xA9, 0xEE, 0xA9, 0xEF, /* 0x48-0x4B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
        0xA8, 0x54, 0xA8, 0x55, 0xA8, 0x56, 0xA8, 0x57, /* 0x50-0x53 */
        0xA8, 0x58, 0xA8, 0x59, 0xA8, 0x5A, 0xA8, 0x5B, /* 0x54-0x57 */
        0xA8, 0x5C, 0xA8, 0x5D, 0xA8, 0x5E, 0xA8, 0x5F, /* 0x58-0x5B */
        0xA8, 0x60, 0xA8, 0x61, 0xA8, 0x62, 0xA8, 0x63, /* 0x5C-0x5F */
        0xA8, 0x64, 0xA8, 0x65, 0xA8, 0x66, 0xA8, 0x67, /* 0x60-0x63 */
        0xA8, 0x68, 0xA8, 0x69, 0xA8, 0x6A, 0xA8, 0x6B, /* 0x64-0x67 */
        0xA8, 0x6C, 0xA8, 0x6D, 0xA8, 0x6E, 0xA8, 0x6F, /* 0x68-0x6B */
        0xA8, 0x70, 0xA8, 0x71, 0xA8, 0x72, 0xA8, 0x73, /* 0x6C-0x6F */
        0xA8, 0x74, 0xA8, 0x75, 0xA8, 0x76, 0xA8, 0x77, /* 0x70-0x73 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
        
        0x00, 0x00, 0xA8, 0x78, 0xA8, 0x79, 0xA8, 0x7A, /* 0x80-0x83 */
        0xA8, 0x7B, 0xA8, 0x7C, 0xA8, 0x7D, 0xA8, 0x7E, /* 0x84-0x87 */
        0xA8, 0x80, 0xA8, 0x81, 0xA8, 0x82, 0xA8, 0x83, /* 0x88-0x8B */
        0xA8, 0x84, 0xA8, 0x85, 0xA8, 0x86, 0xA8, 0x87, /* 0x8C-0x8F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA8, 0x88, /* 0x90-0x93 */
        0xA8, 0x89, 0xA8, 0x8A, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
        0xA1, 0xF6, 0xA1, 0xF5, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
        0x00, 0x00, 0x00, 0x00, 0xA1, 0xF8, 0xA1, 0xF7, /* 0xB0-0xB3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
        0xA8, 0x8B, 0xA8, 0x8C, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
        0x00, 0x00, 0x00, 0x00, 0xA1, 0xF4, 0xA1, 0xF3, /* 0xC4-0xC7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA1, 0xF0, /* 0xC8-0xCB */
        0x00, 0x00, 0x00, 0x00, 0xA1, 0xF2, 0xA1, 0xF1, /* 0xCC-0xCF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
        0x00, 0x00, 0x00, 0x00, 0xA8, 0x8D, 0xA8, 0x8E, /* 0xE0-0xE3 */
        0xA8, 0x8F, 0xA8, 0x90, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xE8-0xEB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xEC-0xEF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF0-0xF3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF4-0xF7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xFC-0xFF */
};

static const unsigned char u2c_26[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0xA1, 0xEF, 0xA1, 0xEE, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0xA8, 0x91, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
        0xA1, 0xE2, 0x00, 0x00, 0xA1, 0xE1, 0x00, 0x00, /* 0x40-0x43 */
};

static const unsigned char u2c_30[512] = {
        0xA1, 0xA1, 0xA1, 0xA2, 0xA1, 0xA3, 0xA1, 0xA8, /* 0x00-0x03 */
        0x00, 0x00, 0xA1, 0xA9, 0xA9, 0x65, 0xA9, 0x96, /* 0x04-0x07 */
        0xA1, 0xB4, 0xA1, 0xB5, 0xA1, 0xB6, 0xA1, 0xB7, /* 0x08-0x0B */
        0xA1, 0xB8, 0xA1, 0xB9, 0xA1, 0xBA, 0xA1, 0xBB, /* 0x0C-0x0F */
        0xA1, 0xBE, 0xA1, 0xBF, 0xA8, 0x93, 0xA1, 0xFE, /* 0x10-0x13 */
        0xA1, 0xB2, 0xA1, 0xB3, 0xA1, 0xBC, 0xA1, 0xBD, /* 0x14-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
        0x00, 0x00, 0xA8, 0x94, 0xA8, 0x95, 0x00, 0x00, /* 0x1C-0x1F */
        0x00, 0x00, 0xA9, 0x40, 0xA9, 0x41, 0xA9, 0x42, /* 0x20-0x23 */
        0xA9, 0x43, 0xA9, 0x44, 0xA9, 0x45, 0xA9, 0x46, /* 0x24-0x27 */
        0xA9, 0x47, 0xA9, 0x48, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
        0x00, 0x00, 0xA4, 0xA1, 0xA4, 0xA2, 0xA4, 0xA3, /* 0x40-0x43 */
        0xA4, 0xA4, 0xA4, 0xA5, 0xA4, 0xA6, 0xA4, 0xA7, /* 0x44-0x47 */
        0xA4, 0xA8, 0xA4, 0xA9, 0xA4, 0xAA, 0xA4, 0xAB, /* 0x48-0x4B */
        0xA4, 0xAC, 0xA4, 0xAD, 0xA4, 0xAE, 0xA4, 0xAF, /* 0x4C-0x4F */
        0xA4, 0xB0, 0xA4, 0xB1, 0xA4, 0xB2, 0xA4, 0xB3, /* 0x50-0x53 */
        0xA4, 0xB4, 0xA4, 0xB5, 0xA4, 0xB6, 0xA4, 0xB7, /* 0x54-0x57 */
        0xA4, 0xB8, 0xA4, 0xB9, 0xA4, 0xBA, 0xA4, 0xBB, /* 0x58-0x5B */
        0xA4, 0xBC, 0xA4, 0xBD, 0xA4, 0xBE, 0xA4, 0xBF, /* 0x5C-0x5F */
        0xA4, 0xC0, 0xA4, 0xC1, 0xA4, 0xC2, 0xA4, 0xC3, /* 0x60-0x63 */
        0xA4, 0xC4, 0xA4, 0xC5, 0xA4, 0xC6, 0xA4, 0xC7, /* 0x64-0x67 */
        0xA4, 0xC8, 0xA4, 0xC9, 0xA4, 0xCA, 0xA4, 0xCB, /* 0x68-0x6B */
        0xA4, 0xCC, 0xA4, 0xCD, 0xA4, 0xCE, 0xA4, 0xCF, /* 0x6C-0x6F */
        0xA4, 0xD0, 0xA4, 0xD1, 0xA4, 0xD2, 0xA4, 0xD3, /* 0x70-0x73 */
        0xA4, 0xD4, 0xA4, 0xD5, 0xA4, 0xD6, 0xA4, 0xD7, /* 0x74-0x77 */
        0xA4, 0xD8, 0xA4, 0xD9, 0xA4, 0xDA, 0xA4, 0xDB, /* 0x78-0x7B */
        0xA4, 0xDC, 0xA4, 0xDD, 0xA4, 0xDE, 0xA4, 0xDF, /* 0x7C-0x7F */
        
        0xA4, 0xE0, 0xA4, 0xE1, 0xA4, 0xE2, 0xA4, 0xE3, /* 0x80-0x83 */
        0xA4, 0xE4, 0xA4, 0xE5, 0xA4, 0xE6, 0xA4, 0xE7, /* 0x84-0x87 */
        0xA4, 0xE8, 0xA4, 0xE9, 0xA4, 0xEA, 0xA4, 0xEB, /* 0x88-0x8B */
        0xA4, 0xEC, 0xA4, 0xED, 0xA4, 0xEE, 0xA4, 0xEF, /* 0x8C-0x8F */
        0xA4, 0xF0, 0xA4, 0xF1, 0xA4, 0xF2, 0xA4, 0xF3, /* 0x90-0x93 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xA9, 0x61, /* 0x98-0x9B */
        0xA9, 0x62, 0xA9, 0x66, 0xA9, 0x67, 0x00, 0x00, /* 0x9C-0x9F */
        0x00, 0x00, 0xA5, 0xA1, 0xA5, 0xA2, 0xA5, 0xA3, /* 0xA0-0xA3 */
        0xA5, 0xA4, 0xA5, 0xA5, 0xA5, 0xA6, 0xA5, 0xA7, /* 0xA4-0xA7 */
        0xA5, 0xA8, 0xA5, 0xA9, 0xA5, 0xAA, 0xA5, 0xAB, /* 0xA8-0xAB */
        0xA5, 0xAC, 0xA5, 0xAD, 0xA5, 0xAE, 0xA5, 0xAF, /* 0xAC-0xAF */
        0xA5, 0xB0, 0xA5, 0xB1, 0xA5, 0xB2, 0xA5, 0xB3, /* 0xB0-0xB3 */
        0xA5, 0xB4, 0xA5, 0xB5, 0xA5, 0xB6, 0xA5, 0xB7, /* 0xB4-0xB7 */
        0xA5, 0xB8, 0xA5, 0xB9, 0xA5, 0xBA, 0xA5, 0xBB, /* 0xB8-0xBB */
        0xA5, 0xBC, 0xA5, 0xBD, 0xA5, 0xBE, 0xA5, 0xBF, /* 0xBC-0xBF */
        0xA5, 0xC0, 0xA5, 0xC1, 0xA5, 0xC2, 0xA5, 0xC3, /* 0xC0-0xC3 */
        0xA5, 0xC4, 0xA5, 0xC5, 0xA5, 0xC6, 0xA5, 0xC7, /* 0xC4-0xC7 */
        0xA5, 0xC8, 0xA5, 0xC9, 0xA5, 0xCA, 0xA5, 0xCB, /* 0xC8-0xCB */
        0xA5, 0xCC, 0xA5, 0xCD, 0xA5, 0xCE, 0xA5, 0xCF, /* 0xCC-0xCF */
        0xA5, 0xD0, 0xA5, 0xD1, 0xA5, 0xD2, 0xA5, 0xD3, /* 0xD0-0xD3 */
        0xA5, 0xD4, 0xA5, 0xD5, 0xA5, 0xD6, 0xA5, 0xD7, /* 0xD4-0xD7 */
        0xA5, 0xD8, 0xA5, 0xD9, 0xA5, 0xDA, 0xA5, 0xDB, /* 0xD8-0xDB */
        0xA5, 0xDC, 0xA5, 0xDD, 0xA5, 0xDE, 0xA5, 0xDF, /* 0xDC-0xDF */
        0xA5, 0xE0, 0xA5, 0xE1, 0xA5, 0xE2, 0xA5, 0xE3, /* 0xE0-0xE3 */
        0xA5, 0xE4, 0xA5, 0xE5, 0xA5, 0xE6, 0xA5, 0xE7, /* 0xE4-0xE7 */
        0xA5, 0xE8, 0xA5, 0xE9, 0xA5, 0xEA, 0xA5, 0xEB, /* 0xE8-0xEB */
        0xA5, 0xEC, 0xA5, 0xED, 0xA5, 0xEE, 0xA5, 0xEF, /* 0xEC-0xEF */
        0xA5, 0xF0, 0xA5, 0xF1, 0xA5, 0xF2, 0xA5, 0xF3, /* 0xF0-0xF3 */
        0xA5, 0xF4, 0xA5, 0xF5, 0xA5, 0xF6, 0x00, 0x00, /* 0xF4-0xF7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xF8-0xFB */
        0xA9, 0x60, 0xA9, 0x63, 0xA9, 0x64, 0x00, 0x00, /* 0xFC-0xFF */
};

static const unsigned char u2c_31[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0xA8, 0xC5, 0xA8, 0xC6, 0xA8, 0xC7, /* 0x04-0x07 */
        0xA8, 0xC8, 0xA8, 0xC9, 0xA8, 0xCA, 0xA8, 0xCB, /* 0x08-0x0B */
        0xA8, 0xCC, 0xA8, 0xCD, 0xA8, 0xCE, 0xA8, 0xCF, /* 0x0C-0x0F */
        0xA8, 0xD0, 0xA8, 0xD1, 0xA8, 0xD2, 0xA8, 0xD3, /* 0x10-0x13 */
        0xA8, 0xD4, 0xA8, 0xD5, 0xA8, 0xD6, 0xA8, 0xD7, /* 0x14-0x17 */
        0xA8, 0xD8, 0xA8, 0xD9, 0xA8, 0xDA, 0xA8, 0xDB, /* 0x18-0x1B */
        0xA8, 0xDC, 0xA8, 0xDD, 0xA8, 0xDE, 0xA8, 0xDF, /* 0x1C-0x1F */
        0xA8, 0xE0, 0xA8, 0xE1, 0xA8, 0xE2, 0xA8, 0xE3, /* 0x20-0x23 */
        0xA8, 0xE4, 0xA8, 0xE5, 0xA8, 0xE6, 0xA8, 0xE7, /* 0x24-0x27 */
        0xA8, 0xE8, 0xA8, 0xE9, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
        
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
        0x00, 0x00, 0x00, 0x00, 0xD2, 0xBB, 0xB6, 0xFE, /* 0x90-0x93 */
        0xC8, 0xFD, 0xCB, 0xC4, 0xC9, 0xCF, 0xD6, 0xD0, /* 0x94-0x97 */
        0xCF, 0xC2, 0xBC, 0xD7, 0xD2, 0xD2, 0xB1, 0xFB, /* 0x98-0x9B */
        0xB6, 0xA1, 0xCC, 0xEC, 0xB5, 0xD8, 0xC8, 0xCB, /* 0x9C-0x9F */
};

static const unsigned char u2c_32[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
        0xA2, 0xE5, 0xA2, 0xE6, 0xA2, 0xE7, 0xA2, 0xE8, /* 0x20-0x23 */
        0xA2, 0xE9, 0xA2, 0xEA, 0xA2, 0xEB, 0xA2, 0xEC, /* 0x24-0x27 */
        0xA2, 0xED, 0xA2, 0xEE, 0xD4, 0xC2, 0xBB, 0xF0, /* 0x28-0x2B */
        0xCB, 0xAE, 0xC4, 0xBE, 0xBD, 0xF0, 0xCD, 0xC1, /* 0x2C-0x2F */
        0xC8, 0xD5, 0xA9, 0x5A, 0xD3, 0xD0, 0xC9, 0xE7, /* 0x30-0x33 */
        0xC3, 0xFB, 0xCC, 0xD8, 0xB2, 0xC6, 0xD7, 0xA3, /* 0x34-0x37 */
        0xC0, 0xCD, 0xB4, 0xFA, 0xBA, 0xF4, 0xD1, 0xA7, /* 0x38-0x3B */
        0xBC, 0xE0, 0xC6, 0xF3, 0xD7, 0xCA, 0xD0, 0xAD, /* 0x3C-0x3F */
        0xBC, 0xC0, 0xD0, 0xDD, 0xD7, 0xD4, 0xD6, 0xC1, /* 0x40-0x43 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
        
        0xD2, 0xBB, 0xB6, 0xFE, 0xC8, 0xFD, 0xCB, 0xC4, /* 0x80-0x83 */
        0xCE, 0xE5, 0xC1, 0xF9, 0xC6, 0xDF, 0xB0, 0xCB, /* 0x84-0x87 */
        0xBE, 0xC5, 0xCA, 0xAE, 0xD4, 0xC2, 0xBB, 0xF0, /* 0x88-0x8B */
        0xCB, 0xAE, 0xC4, 0xBE, 0xBD, 0xF0, 0xCD, 0xC1, /* 0x8C-0x8F */
        0xC8, 0xD5, 0xD6, 0xEA, 0xD3, 0xD0, 0xC9, 0xE7, /* 0x90-0x93 */
        0xC3, 0xFB, 0xCC, 0xD8, 0xB2, 0xC6, 0xD7, 0xA3, /* 0x94-0x97 */
        0xC0, 0xCD, 0xC3, 0xD8, 0xC4, 0xD0, 0xC5, 0xAE, /* 0x98-0x9B */
        0xCA, 0xCA, 0xD3, 0xC5, 0x00, 0x00, 0xD7, 0xA2, /* 0x9C-0x9F */
        0xCF, 0xEE, 0xD0, 0xDD, 0xD0, 0xB4, 0xA9, 0x49, /* 0xA0-0xA3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
        0x00, 0x00, 0xD2, 0xBD, 0xD7, 0xDA, 0xD1, 0xA7, /* 0xA8-0xAB */
        0xBC, 0xE0, 0xC6, 0xF3, 0xD7, 0xCA, 0xD0, 0xAD, /* 0xAC-0xAF */
        0xD2, 0xB9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
};

static const unsigned char u2c_33[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x33 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x34-0x37 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3C-0x3F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x43 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x4C-0x4F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x53 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x54-0x57 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x5C-0x5F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
        
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
        0x00, 0x00, 0x00, 0x00, 0xA9, 0x4A, 0xA9, 0x4B, /* 0x8C-0x8F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
        0xA9, 0x4C, 0xA9, 0x4D, 0xA9, 0x4E, 0x00, 0x00, /* 0x9C-0x9F */
        0x00, 0x00, 0xA9, 0x4F, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
        0xA9, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
        0x00, 0x00, 0x00, 0x00, 0xA9, 0x51, 0x00, 0x00, /* 0xCC-0xCF */
        0x00, 0x00, 0xA9, 0x52, 0xA9, 0x53, 0x00, 0x00, /* 0xD0-0xD3 */
        0x00, 0x00, 0xA9, 0x54, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
};

static const unsigned char u2c_4E[512] = {
        0xD2, 0xBB, 0xB6, 0xA1, 0x81, 0x40, 0xC6, 0xDF, /* 0x00-0x03 */
        0x81, 0x41, 0x81, 0x42, 0x81, 0x43, 0xCD, 0xF2, /* 0x04-0x07 */
        0xD5, 0xC9, 0xC8, 0xFD, 0xC9, 0xCF, 0xCF, 0xC2, /* 0x08-0x0B */
        0xD8, 0xA2, 0xB2, 0xBB, 0xD3, 0xEB, 0x81, 0x44, /* 0x0C-0x0F */
        0xD8, 0xA4, 0xB3, 0xF3, 0x81, 0x45, 0xD7, 0xA8, /* 0x10-0x13 */
        0xC7, 0xD2, 0xD8, 0xA7, 0xCA, 0xC0, 0x81, 0x46, /* 0x14-0x17 */
        0xC7, 0xF0, 0xB1, 0xFB, 0xD2, 0xB5, 0xB4, 0xD4, /* 0x18-0x1B */
        0xB6, 0xAB, 0xCB, 0xBF, 0xD8, 0xA9, 0x81, 0x47, /* 0x1C-0x1F */
        0x81, 0x48, 0x81, 0x49, 0xB6, 0xAA, 0x81, 0x4A, /* 0x20-0x23 */
        0xC1, 0xBD, 0xD1, 0xCF, 0x81, 0x4B, 0xC9, 0xA5, /* 0x24-0x27 */
        0xD8, 0xAD, 0x81, 0x4C, 0xB8, 0xF6, 0xD1, 0xBE, /* 0x28-0x2B */
        0xE3, 0xDC, 0xD6, 0xD0, 0x81, 0x4D, 0x81, 0x4E, /* 0x2C-0x2F */
        0xB7, 0xE1, 0x81, 0x4F, 0xB4, 0xAE, 0x81, 0x50, /* 0x30-0x33 */
        0xC1, 0xD9, 0x81, 0x51, 0xD8, 0xBC, 0x81, 0x52, /* 0x34-0x37 */
        0xCD, 0xE8, 0xB5, 0xA4, 0xCE, 0xAA, 0xD6, 0xF7, /* 0x38-0x3B */
        0x81, 0x53, 0xC0, 0xF6, 0xBE, 0xD9, 0xD8, 0xAF, /* 0x3C-0x3F */
        0x81, 0x54, 0x81, 0x55, 0x81, 0x56, 0xC4, 0xCB, /* 0x40-0x43 */
        0x81, 0x57, 0xBE, 0xC3, 0x81, 0x58, 0xD8, 0xB1, /* 0x44-0x47 */
        0xC3, 0xB4, 0xD2, 0xE5, 0x81, 0x59, 0xD6, 0xAE, /* 0x48-0x4B */
        0xCE, 0xDA, 0xD5, 0xA7, 0xBA, 0xF5, 0xB7, 0xA6, /* 0x4C-0x4F */
        0xC0, 0xD6, 0x81, 0x5A, 0xC6, 0xB9, 0xC5, 0xD2, /* 0x50-0x53 */
        0xC7, 0xC7, 0x81, 0x5B, 0xB9, 0xD4, 0x81, 0x5C, /* 0x54-0x57 */
        0xB3, 0xCB, 0xD2, 0xD2, 0x81, 0x5D, 0x81, 0x5E, /* 0x58-0x5B */
        0xD8, 0xBF, 0xBE, 0xC5, 0xC6, 0xF2, 0xD2, 0xB2, /* 0x5C-0x5F */
        0xCF, 0xB0, 0xCF, 0xE7, 0x81, 0x5F, 0x81, 0x60, /* 0x60-0x63 */
        0x81, 0x61, 0x81, 0x62, 0xCA, 0xE9, 0x81, 0x63, /* 0x64-0x67 */
        0x81, 0x64, 0xD8, 0xC0, 0x81, 0x65, 0x81, 0x66, /* 0x68-0x6B */
        0x81, 0x67, 0x81, 0x68, 0x81, 0x69, 0x81, 0x6A, /* 0x6C-0x6F */
        0xC2, 0xF2, 0xC2, 0xD2, 0x81, 0x6B, 0xC8, 0xE9, /* 0x70-0x73 */
        0x81, 0x6C, 0x81, 0x6D, 0x81, 0x6E, 0x81, 0x6F, /* 0x74-0x77 */
        0x81, 0x70, 0x81, 0x71, 0x81, 0x72, 0x81, 0x73, /* 0x78-0x7B */
        0x81, 0x74, 0x81, 0x75, 0xC7, 0xAC, 0x81, 0x76, /* 0x7C-0x7F */
        
        0x81, 0x77, 0x81, 0x78, 0x81, 0x79, 0x81, 0x7A, /* 0x80-0x83 */
        0x81, 0x7B, 0x81, 0x7C, 0xC1, 0xCB, 0x81, 0x7D, /* 0x84-0x87 */
        0xD3, 0xE8, 0xD5, 0xF9, 0x81, 0x7E, 0xCA, 0xC2, /* 0x88-0x8B */
        0xB6, 0xFE, 0xD8, 0xA1, 0xD3, 0xDA, 0xBF, 0xF7, /* 0x8C-0x8F */
        0x81, 0x80, 0xD4, 0xC6, 0xBB, 0xA5, 0xD8, 0xC1, /* 0x90-0x93 */
        0xCE, 0xE5, 0xBE, 0xAE, 0x81, 0x81, 0x81, 0x82, /* 0x94-0x97 */
        0xD8, 0xA8, 0x81, 0x83, 0xD1, 0xC7, 0xD0, 0xA9, /* 0x98-0x9B */
        0x81, 0x84, 0x81, 0x85, 0x81, 0x86, 0xD8, 0xBD, /* 0x9C-0x9F */
        0xD9, 0xEF, 0xCD, 0xF6, 0xBF, 0xBA, 0x81, 0x87, /* 0xA0-0xA3 */
        0xBD, 0xBB, 0xBA, 0xA5, 0xD2, 0xE0, 0xB2, 0xFA, /* 0xA4-0xA7 */
        0xBA, 0xE0, 0xC4, 0xB6, 0x81, 0x88, 0xCF, 0xED, /* 0xA8-0xAB */
        0xBE, 0xA9, 0xCD, 0xA4, 0xC1, 0xC1, 0x81, 0x89, /* 0xAC-0xAF */
        0x81, 0x8A, 0x81, 0x8B, 0xC7, 0xD7, 0xD9, 0xF1, /* 0xB0-0xB3 */
        0x81, 0x8C, 0xD9, 0xF4, 0x81, 0x8D, 0x81, 0x8E, /* 0xB4-0xB7 */
        0x81, 0x8F, 0x81, 0x90, 0xC8, 0xCB, 0xD8, 0xE9, /* 0xB8-0xBB */
        0x81, 0x91, 0x81, 0x92, 0x81, 0x93, 0xD2, 0xDA, /* 0xBC-0xBF */
        0xCA, 0xB2, 0xC8, 0xCA, 0xD8, 0xEC, 0xD8, 0xEA, /* 0xC0-0xC3 */
        0xD8, 0xC6, 0xBD, 0xF6, 0xC6, 0xCD, 0xB3, 0xF0, /* 0xC4-0xC7 */
        0x81, 0x94, 0xD8, 0xEB, 0xBD, 0xF1, 0xBD, 0xE9, /* 0xC8-0xCB */
        0x81, 0x95, 0xC8, 0xD4, 0xB4, 0xD3, 0x81, 0x96, /* 0xCC-0xCF */
        0x81, 0x97, 0xC2, 0xD8, 0x81, 0x98, 0xB2, 0xD6, /* 0xD0-0xD3 */
        0xD7, 0xD0, 0xCA, 0xCB, 0xCB, 0xFB, 0xD5, 0xCC, /* 0xD4-0xD7 */
        0xB8, 0xB6, 0xCF, 0xC9, 0x81, 0x99, 0x81, 0x9A, /* 0xD8-0xDB */
        0x81, 0x9B, 0xD9, 0xDA, 0xD8, 0xF0, 0xC7, 0xAA, /* 0xDC-0xDF */
        0x81, 0x9C, 0xD8, 0xEE, 0x81, 0x9D, 0xB4, 0xFA, /* 0xE0-0xE3 */
        0xC1, 0xEE, 0xD2, 0xD4, 0x81, 0x9E, 0x81, 0x9F, /* 0xE4-0xE7 */
        0xD8, 0xED, 0x81, 0xA0, 0xD2, 0xC7, 0xD8, 0xEF, /* 0xE8-0xEB */
        0xC3, 0xC7, 0x81, 0xA1, 0x81, 0xA2, 0x81, 0xA3, /* 0xEC-0xEF */
        0xD1, 0xF6, 0x81, 0xA4, 0xD6, 0xD9, 0xD8, 0xF2, /* 0xF0-0xF3 */
        0x81, 0xA5, 0xD8, 0xF5, 0xBC, 0xFE, 0xBC, 0xDB, /* 0xF4-0xF7 */
        0x81, 0xA6, 0x81, 0xA7, 0x81, 0xA8, 0xC8, 0xCE, /* 0xF8-0xFB */
        0x81, 0xA9, 0xB7, 0xDD, 0x81, 0xAA, 0xB7, 0xC2, /* 0xFC-0xFF */
};

static const unsigned char u2c_4F[512] = {
        0x81, 0xAB, 0xC6, 0xF3, 0x81, 0xAC, 0x81, 0xAD, /* 0x00-0x03 */
        0x81, 0xAE, 0x81, 0xAF, 0x81, 0xB0, 0x81, 0xB1, /* 0x04-0x07 */
        0x81, 0xB2, 0xD8, 0xF8, 0xD2, 0xC1, 0x81, 0xB3, /* 0x08-0x0B */
        0x81, 0xB4, 0xCE, 0xE9, 0xBC, 0xBF, 0xB7, 0xFC, /* 0x0C-0x0F */
        0xB7, 0xA5, 0xD0, 0xDD, 0x81, 0xB5, 0x81, 0xB6, /* 0x10-0x13 */
        0x81, 0xB7, 0x81, 0xB8, 0x81, 0xB9, 0xD6, 0xDA, /* 0x14-0x17 */
        0xD3, 0xC5, 0xBB, 0xEF, 0xBB, 0xE1, 0xD8, 0xF1, /* 0x18-0x1B */
        0x81, 0xBA, 0x81, 0xBB, 0xC9, 0xA1, 0xCE, 0xB0, /* 0x1C-0x1F */
        0xB4, 0xAB, 0x81, 0xBC, 0xD8, 0xF3, 0x81, 0xBD, /* 0x20-0x23 */
        0xC9, 0xCB, 0xD8, 0xF6, 0xC2, 0xD7, 0xD8, 0xF7, /* 0x24-0x27 */
        0x81, 0xBE, 0x81, 0xBF, 0xCE, 0xB1, 0xD8, 0xF9, /* 0x28-0x2B */
        0x81, 0xC0, 0x81, 0xC1, 0x81, 0xC2, 0xB2, 0xAE, /* 0x2C-0x2F */
        0xB9, 0xC0, 0x81, 0xC3, 0xD9, 0xA3, 0x81, 0xC4, /* 0x30-0x33 */
        0xB0, 0xE9, 0x81, 0xC5, 0xC1, 0xE6, 0x81, 0xC6, /* 0x34-0x37 */
        0xC9, 0xEC, 0x81, 0xC7, 0xCB, 0xC5, 0x81, 0xC8, /* 0x38-0x3B */
        0xCB, 0xC6, 0xD9, 0xA4, 0x81, 0xC9, 0x81, 0xCA, /* 0x3C-0x3F */
        0x81, 0xCB, 0x81, 0xCC, 0x81, 0xCD, 0xB5, 0xE8, /* 0x40-0x43 */
        0x81, 0xCE, 0x81, 0xCF, 0xB5, 0xAB, 0x81, 0xD0, /* 0x44-0x47 */
        0x81, 0xD1, 0x81, 0xD2, 0x81, 0xD3, 0x81, 0xD4, /* 0x48-0x4B */
        0x81, 0xD5, 0xCE, 0xBB, 0xB5, 0xCD, 0xD7, 0xA1, /* 0x4C-0x4F */
        0xD7, 0xF4, 0xD3, 0xD3, 0x81, 0xD6, 0xCC, 0xE5, /* 0x50-0x53 */
        0x81, 0xD7, 0xBA, 0xCE, 0x81, 0xD8, 0xD9, 0xA2, /* 0x54-0x57 */
        0xD9, 0xDC, 0xD3, 0xE0, 0xD8, 0xFD, 0xB7, 0xF0, /* 0x58-0x5B */
        0xD7, 0xF7, 0xD8, 0xFE, 0xD8, 0xFA, 0xD9, 0xA1, /* 0x5C-0x5F */
        0xC4, 0xE3, 0x81, 0xD9, 0x81, 0xDA, 0xD3, 0xB6, /* 0x60-0x63 */
        0xD8, 0xF4, 0xD9, 0xDD, 0x81, 0xDB, 0xD8, 0xFB, /* 0x64-0x67 */
        0x81, 0xDC, 0xC5, 0xE5, 0x81, 0xDD, 0x81, 0xDE, /* 0x68-0x6B */
        0xC0, 0xD0, 0x81, 0xDF, 0x81, 0xE0, 0xD1, 0xF0, /* 0x6C-0x6F */
        0xB0, 0xDB, 0x81, 0xE1, 0x81, 0xE2, 0xBC, 0xD1, /* 0x70-0x73 */
        0xD9, 0xA6, 0x81, 0xE3, 0xD9, 0xA5, 0x81, 0xE4, /* 0x74-0x77 */
        0x81, 0xE5, 0x81, 0xE6, 0x81, 0xE7, 0xD9, 0xAC, /* 0x78-0x7B */
        0xD9, 0xAE, 0x81, 0xE8, 0xD9, 0xAB, 0xCA, 0xB9, /* 0x7C-0x7F */
        
        0x81, 0xE9, 0x81, 0xEA, 0x81, 0xEB, 0xD9, 0xA9, /* 0x80-0x83 */
        0xD6, 0xB6, 0x81, 0xEC, 0x81, 0xED, 0x81, 0xEE, /* 0x84-0x87 */
        0xB3, 0xDE, 0xD9, 0xA8, 0x81, 0xEF, 0xC0, 0xFD, /* 0x88-0x8B */
        0x81, 0xF0, 0xCA, 0xCC, 0x81, 0xF1, 0xD9, 0xAA, /* 0x8C-0x8F */
        0x81, 0xF2, 0xD9, 0xA7, 0x81, 0xF3, 0x81, 0xF4, /* 0x90-0x93 */
        0xD9, 0xB0, 0x81, 0xF5, 0x81, 0xF6, 0xB6, 0xB1, /* 0x94-0x97 */
        0x81, 0xF7, 0x81, 0xF8, 0x81, 0xF9, 0xB9, 0xA9, /* 0x98-0x9B */
        0x81, 0xFA, 0xD2, 0xC0, 0x81, 0xFB, 0x81, 0xFC, /* 0x9C-0x9F */
        0xCF, 0xC0, 0x81, 0xFD, 0x81, 0xFE, 0xC2, 0xC2, /* 0xA0-0xA3 */
        0x82, 0x40, 0xBD, 0xC4, 0xD5, 0xEC, 0xB2, 0xE0, /* 0xA4-0xA7 */
        0xC7, 0xC8, 0xBF, 0xEB, 0xD9, 0xAD, 0x82, 0x41, /* 0xA8-0xAB */
        0xD9, 0xAF, 0x82, 0x42, 0xCE, 0xEA, 0xBA, 0xEE, /* 0xAC-0xAF */
        0x82, 0x43, 0x82, 0x44, 0x82, 0x45, 0x82, 0x46, /* 0xB0-0xB3 */
        0x82, 0x47, 0xC7, 0xD6, 0x82, 0x48, 0x82, 0x49, /* 0xB4-0xB7 */
        0x82, 0x4A, 0x82, 0x4B, 0x82, 0x4C, 0x82, 0x4D, /* 0xB8-0xBB */
        0x82, 0x4E, 0x82, 0x4F, 0x82, 0x50, 0xB1, 0xE3, /* 0xBC-0xBF */
        0x82, 0x51, 0x82, 0x52, 0x82, 0x53, 0xB4, 0xD9, /* 0xC0-0xC3 */
        0xB6, 0xED, 0xD9, 0xB4, 0x82, 0x54, 0x82, 0x55, /* 0xC4-0xC7 */
        0x82, 0x56, 0x82, 0x57, 0xBF, 0xA1, 0x82, 0x58, /* 0xC8-0xCB */
        0x82, 0x59, 0x82, 0x5A, 0xD9, 0xDE, 0xC7, 0xCE, /* 0xCC-0xCF */
        0xC0, 0xFE, 0xD9, 0xB8, 0x82, 0x5B, 0x82, 0x5C, /* 0xD0-0xD3 */
        0x82, 0x5D, 0x82, 0x5E, 0x82, 0x5F, 0xCB, 0xD7, /* 0xD4-0xD7 */
        0xB7, 0xFD, 0x82, 0x60, 0xD9, 0xB5, 0x82, 0x61, /* 0xD8-0xDB */
        0xD9, 0xB7, 0xB1, 0xA3, 0xD3, 0xE1, 0xD9, 0xB9, /* 0xDC-0xDF */
        0x82, 0x62, 0xD0, 0xC5, 0x82, 0x63, 0xD9, 0xB6, /* 0xE0-0xE3 */
        0x82, 0x64, 0x82, 0x65, 0xD9, 0xB1, 0x82, 0x66, /* 0xE4-0xE7 */
        0xD9, 0xB2, 0xC1, 0xA9, 0xD9, 0xB3, 0x82, 0x67, /* 0xE8-0xEB */
        0x82, 0x68, 0xBC, 0xF3, 0xD0, 0xDE, 0xB8, 0xA9, /* 0xEC-0xEF */
        0x82, 0x69, 0xBE, 0xE3, 0x82, 0x6A, 0xD9, 0xBD, /* 0xF0-0xF3 */
        0x82, 0x6B, 0x82, 0x6C, 0x82, 0x6D, 0x82, 0x6E, /* 0xF4-0xF7 */
        0xD9, 0xBA, 0x82, 0x6F, 0xB0, 0xB3, 0x82, 0x70, /* 0xF8-0xFB */
        0x82, 0x71, 0x82, 0x72, 0xD9, 0xC2, 0x82, 0x73, /* 0xFC-0xFF */
};

static const unsigned char u2c_50[512] = {
        0x82, 0x74, 0x82, 0x75, 0x82, 0x76, 0x82, 0x77, /* 0x00-0x03 */
        0x82, 0x78, 0x82, 0x79, 0x82, 0x7A, 0x82, 0x7B, /* 0x04-0x07 */
        0x82, 0x7C, 0x82, 0x7D, 0x82, 0x7E, 0x82, 0x80, /* 0x08-0x0B */
        0xD9, 0xC4, 0xB1, 0xB6, 0x82, 0x81, 0xD9, 0xBF, /* 0x0C-0x0F */
        0x82, 0x82, 0x82, 0x83, 0xB5, 0xB9, 0x82, 0x84, /* 0x10-0x13 */
        0xBE, 0xF3, 0x82, 0x85, 0x82, 0x86, 0x82, 0x87, /* 0x14-0x17 */
        0xCC, 0xC8, 0xBA, 0xF2, 0xD2, 0xD0, 0x82, 0x88, /* 0x18-0x1B */
        0xD9, 0xC3, 0x82, 0x89, 0x82, 0x8A, 0xBD, 0xE8, /* 0x1C-0x1F */
        0x82, 0x8B, 0xB3, 0xAB, 0x82, 0x8C, 0x82, 0x8D, /* 0x20-0x23 */
        0x82, 0x8E, 0xD9, 0xC5, 0xBE, 0xEB, 0x82, 0x8F, /* 0x24-0x27 */
        0xD9, 0xC6, 0xD9, 0xBB, 0xC4, 0xDF, 0x82, 0x90, /* 0x28-0x2B */
        0xD9, 0xBE, 0xD9, 0xC1, 0xD9, 0xC0, 0x82, 0x91, /* 0x2C-0x2F */
        0x82, 0x92, 0x82, 0x93, 0x82, 0x94, 0x82, 0x95, /* 0x30-0x33 */
        0x82, 0x96, 0x82, 0x97, 0x82, 0x98, 0x82, 0x99, /* 0x34-0x37 */
        0x82, 0x9A, 0x82, 0x9B, 0xD5, 0xAE, 0x82, 0x9C, /* 0x38-0x3B */
        0xD6, 0xB5, 0x82, 0x9D, 0xC7, 0xE3, 0x82, 0x9E, /* 0x3C-0x3F */
        0x82, 0x9F, 0x82, 0xA0, 0x82, 0xA1, 0xD9, 0xC8, /* 0x40-0x43 */
        0x82, 0xA2, 0x82, 0xA3, 0x82, 0xA4, 0xBC, 0xD9, /* 0x44-0x47 */
        0xD9, 0xCA, 0x82, 0xA5, 0x82, 0xA6, 0x82, 0xA7, /* 0x48-0x4B */
        0xD9, 0xBC, 0x82, 0xA8, 0xD9, 0xCB, 0xC6, 0xAB, /* 0x4C-0x4F */
        0x82, 0xA9, 0x82, 0xAA, 0x82, 0xAB, 0x82, 0xAC, /* 0x50-0x53 */
        0x82, 0xAD, 0xD9, 0xC9, 0x82, 0xAE, 0x82, 0xAF, /* 0x54-0x57 */
        0x82, 0xB0, 0x82, 0xB1, 0xD7, 0xF6, 0x82, 0xB2, /* 0x58-0x5B */
        0xCD, 0xA3, 0x82, 0xB3, 0x82, 0xB4, 0x82, 0xB5, /* 0x5C-0x5F */
        0x82, 0xB6, 0x82, 0xB7, 0x82, 0xB8, 0x82, 0xB9, /* 0x60-0x63 */
        0x82, 0xBA, 0xBD, 0xA1, 0x82, 0xBB, 0x82, 0xBC, /* 0x64-0x67 */
        0x82, 0xBD, 0x82, 0xBE, 0x82, 0xBF, 0x82, 0xC0, /* 0x68-0x6B */
        0xD9, 0xCC, 0x82, 0xC1, 0x82, 0xC2, 0x82, 0xC3, /* 0x6C-0x6F */
        0x82, 0xC4, 0x82, 0xC5, 0x82, 0xC6, 0x82, 0xC7, /* 0x70-0x73 */
        0x82, 0xC8, 0x82, 0xC9, 0xC5, 0xBC, 0xCD, 0xB5, /* 0x74-0x77 */
        0x82, 0xCA, 0x82, 0xCB, 0x82, 0xCC, 0xD9, 0xCD, /* 0x78-0x7B */
        0x82, 0xCD, 0x82, 0xCE, 0xD9, 0xC7, 0xB3, 0xA5, /* 0x7C-0x7F */
        
        0xBF, 0xFE, 0x82, 0xCF, 0x82, 0xD0, 0x82, 0xD1, /* 0x80-0x83 */
        0x82, 0xD2, 0xB8, 0xB5, 0x82, 0xD3, 0x82, 0xD4, /* 0x84-0x87 */
        0xC0, 0xFC, 0x82, 0xD5, 0x82, 0xD6, 0x82, 0xD7, /* 0x88-0x8B */
        0x82, 0xD8, 0xB0, 0xF8, 0x82, 0xD9, 0x82, 0xDA, /* 0x8C-0x8F */
        0x82, 0xDB, 0x82, 0xDC, 0x82, 0xDD, 0x82, 0xDE, /* 0x90-0x93 */
        0x82, 0xDF, 0x82, 0xE0, 0x82, 0xE1, 0x82, 0xE2, /* 0x94-0x97 */
        0x82, 0xE3, 0x82, 0xE4, 0x82, 0xE5, 0x82, 0xE6, /* 0x98-0x9B */
        0x82, 0xE7, 0x82, 0xE8, 0x82, 0xE9, 0x82, 0xEA, /* 0x9C-0x9F */
        0x82, 0xEB, 0x82, 0xEC, 0x82, 0xED, 0xB4, 0xF6, /* 0xA0-0xA3 */
        0x82, 0xEE, 0xD9, 0xCE, 0x82, 0xEF, 0xD9, 0xCF, /* 0xA4-0xA7 */
        0xB4, 0xA2, 0xD9, 0xD0, 0x82, 0xF0, 0x82, 0xF1, /* 0xA8-0xAB */
        0xB4, 0xDF, 0x82, 0xF2, 0x82, 0xF3, 0x82, 0xF4, /* 0xAC-0xAF */
        0x82, 0xF5, 0x82, 0xF6, 0xB0, 0xC1, 0x82, 0xF7, /* 0xB0-0xB3 */
        0x82, 0xF8, 0x82, 0xF9, 0x82, 0xFA, 0x82, 0xFB, /* 0xB4-0xB7 */
        0x82, 0xFC, 0x82, 0xFD, 0xD9, 0xD1, 0xC9, 0xB5, /* 0xB8-0xBB */
        0x82, 0xFE, 0x83, 0x40, 0x83, 0x41, 0x83, 0x42, /* 0xBC-0xBF */
        0x83, 0x43, 0x83, 0x44, 0x83, 0x45, 0x83, 0x46, /* 0xC0-0xC3 */
        0x83, 0x47, 0x83, 0x48, 0x83, 0x49, 0x83, 0x4A, /* 0xC4-0xC7 */
        0x83, 0x4B, 0x83, 0x4C, 0x83, 0x4D, 0x83, 0x4E, /* 0xC8-0xCB */
        0x83, 0x4F, 0x83, 0x50, 0x83, 0x51, 0xCF, 0xF1, /* 0xCC-0xCF */
        0x83, 0x52, 0x83, 0x53, 0x83, 0x54, 0x83, 0x55, /* 0xD0-0xD3 */
        0x83, 0x56, 0x83, 0x57, 0xD9, 0xD2, 0x83, 0x58, /* 0xD4-0xD7 */
        0x83, 0x59, 0x83, 0x5A, 0xC1, 0xC5, 0x83, 0x5B, /* 0xD8-0xDB */
        0x83, 0x5C, 0x83, 0x5D, 0x83, 0x5E, 0x83, 0x5F, /* 0xDC-0xDF */
        0x83, 0x60, 0x83, 0x61, 0x83, 0x62, 0x83, 0x63, /* 0xE0-0xE3 */
        0x83, 0x64, 0x83, 0x65, 0xD9, 0xD6, 0xC9, 0xAE, /* 0xE4-0xE7 */
        0x83, 0x66, 0x83, 0x67, 0x83, 0x68, 0x83, 0x69, /* 0xE8-0xEB */
        0xD9, 0xD5, 0xD9, 0xD4, 0xD9, 0xD7, 0x83, 0x6A, /* 0xEC-0xEF */
        0x83, 0x6B, 0x83, 0x6C, 0x83, 0x6D, 0xCB, 0xDB, /* 0xF0-0xF3 */
        0x83, 0x6E, 0xBD, 0xA9, 0x83, 0x6F, 0x83, 0x70, /* 0xF4-0xF7 */
        0x83, 0x71, 0x83, 0x72, 0x83, 0x73, 0xC6, 0xA7, /* 0xF8-0xFB */
        0x83, 0x74, 0x83, 0x75, 0x83, 0x76, 0x83, 0x77, /* 0xFC-0xFF */
};

static const unsigned char u2c_51[512] = {
        0x83, 0x78, 0x83, 0x79, 0x83, 0x7A, 0x83, 0x7B, /* 0x00-0x03 */
        0x83, 0x7C, 0x83, 0x7D, 0xD9, 0xD3, 0xD9, 0xD8, /* 0x04-0x07 */
        0x83, 0x7E, 0x83, 0x80, 0x83, 0x81, 0xD9, 0xD9, /* 0x08-0x0B */
        0x83, 0x82, 0x83, 0x83, 0x83, 0x84, 0x83, 0x85, /* 0x0C-0x0F */
        0x83, 0x86, 0x83, 0x87, 0xC8, 0xE5, 0x83, 0x88, /* 0x10-0x13 */
        0x83, 0x89, 0x83, 0x8A, 0x83, 0x8B, 0x83, 0x8C, /* 0x14-0x17 */
        0x83, 0x8D, 0x83, 0x8E, 0x83, 0x8F, 0x83, 0x90, /* 0x18-0x1B */
        0x83, 0x91, 0x83, 0x92, 0x83, 0x93, 0x83, 0x94, /* 0x1C-0x1F */
        0x83, 0x95, 0xC0, 0xDC, 0x83, 0x96, 0x83, 0x97, /* 0x20-0x23 */
        0x83, 0x98, 0x83, 0x99, 0x83, 0x9A, 0x83, 0x9B, /* 0x24-0x27 */
        0x83, 0x9C, 0x83, 0x9D, 0x83, 0x9E, 0x83, 0x9F, /* 0x28-0x2B */
        0x83, 0xA0, 0x83, 0xA1, 0x83, 0xA2, 0x83, 0xA3, /* 0x2C-0x2F */
        0x83, 0xA4, 0x83, 0xA5, 0x83, 0xA6, 0x83, 0xA7, /* 0x30-0x33 */
        0x83, 0xA8, 0x83, 0xA9, 0x83, 0xAA, 0x83, 0xAB, /* 0x34-0x37 */
        0x83, 0xAC, 0x83, 0xAD, 0x83, 0xAE, 0x83, 0xAF, /* 0x38-0x3B */
        0x83, 0xB0, 0x83, 0xB1, 0x83, 0xB2, 0xB6, 0xF9, /* 0x3C-0x3F */
        0xD8, 0xA3, 0xD4, 0xCA, 0x83, 0xB3, 0xD4, 0xAA, /* 0x40-0x43 */
        0xD0, 0xD6, 0xB3, 0xE4, 0xD5, 0xD7, 0x83, 0xB4, /* 0x44-0x47 */
        0xCF, 0xC8, 0xB9, 0xE2, 0x83, 0xB5, 0xBF, 0xCB, /* 0x48-0x4B */
        0x83, 0xB6, 0xC3, 0xE2, 0x83, 0xB7, 0x83, 0xB8, /* 0x4C-0x4F */
        0x83, 0xB9, 0xB6, 0xD2, 0x83, 0xBA, 0x83, 0xBB, /* 0x50-0x53 */
        0xCD, 0xC3, 0xD9, 0xEE, 0xD9, 0xF0, 0x83, 0xBC, /* 0x54-0x57 */
        0x83, 0xBD, 0x83, 0xBE, 0xB5, 0xB3, 0x83, 0xBF, /* 0x58-0x5B */
        0xB6, 0xB5, 0x83, 0xC0, 0x83, 0xC1, 0x83, 0xC2, /* 0x5C-0x5F */
        0x83, 0xC3, 0x83, 0xC4, 0xBE, 0xA4, 0x83, 0xC5, /* 0x60-0x63 */
        0x83, 0xC6, 0xC8, 0xEB, 0x83, 0xC7, 0x83, 0xC8, /* 0x64-0x67 */
        0xC8, 0xAB, 0x83, 0xC9, 0x83, 0xCA, 0xB0, 0xCB, /* 0x68-0x6B */
        0xB9, 0xAB, 0xC1, 0xF9, 0xD9, 0xE2, 0x83, 0xCB, /* 0x6C-0x6F */
        0xC0, 0xBC, 0xB9, 0xB2, 0x83, 0xCC, 0xB9, 0xD8, /* 0x70-0x73 */
        0xD0, 0xCB, 0xB1, 0xF8, 0xC6, 0xE4, 0xBE, 0xDF, /* 0x74-0x77 */
        0xB5, 0xE4, 0xD7, 0xC8, 0x83, 0xCD, 0xD1, 0xF8, /* 0x78-0x7B */
        0xBC, 0xE6, 0xCA, 0xDE, 0x83, 0xCE, 0x83, 0xCF, /* 0x7C-0x7F */
        
        0xBC, 0xBD, 0xD9, 0xE6, 0xD8, 0xE7, 0x83, 0xD0, /* 0x80-0x83 */
        0x83, 0xD1, 0xC4, 0xDA, 0x83, 0xD2, 0x83, 0xD3, /* 0x84-0x87 */
        0xB8, 0xD4, 0xC8, 0xBD, 0x83, 0xD4, 0x83, 0xD5, /* 0x88-0x8B */
        0xB2, 0xE1, 0xD4, 0xD9, 0x83, 0xD6, 0x83, 0xD7, /* 0x8C-0x8F */
        0x83, 0xD8, 0x83, 0xD9, 0xC3, 0xB0, 0x83, 0xDA, /* 0x90-0x93 */
        0x83, 0xDB, 0xC3, 0xE1, 0xDA, 0xA2, 0xC8, 0xDF, /* 0x94-0x97 */
        0x83, 0xDC, 0xD0, 0xB4, 0x83, 0xDD, 0xBE, 0xFC, /* 0x98-0x9B */
        0xC5, 0xA9, 0x83, 0xDE, 0x83, 0xDF, 0x83, 0xE0, /* 0x9C-0x9F */
        0xB9, 0xDA, 0x83, 0xE1, 0xDA, 0xA3, 0x83, 0xE2, /* 0xA0-0xA3 */
        0xD4, 0xA9, 0xDA, 0xA4, 0x83, 0xE3, 0x83, 0xE4, /* 0xA4-0xA7 */
        0x83, 0xE5, 0x83, 0xE6, 0x83, 0xE7, 0xD9, 0xFB, /* 0xA8-0xAB */
        0xB6, 0xAC, 0x83, 0xE8, 0x83, 0xE9, 0xB7, 0xEB, /* 0xAC-0xAF */
        0xB1, 0xF9, 0xD9, 0xFC, 0xB3, 0xE5, 0xBE, 0xF6, /* 0xB0-0xB3 */
        0x83, 0xEA, 0xBF, 0xF6, 0xD2, 0xB1, 0xC0, 0xE4, /* 0xB4-0xB7 */
        0x83, 0xEB, 0x83, 0xEC, 0x83, 0xED, 0xB6, 0xB3, /* 0xB8-0xBB */
        0xD9, 0xFE, 0xD9, 0xFD, 0x83, 0xEE, 0x83, 0xEF, /* 0xBC-0xBF */
        0xBE, 0xBB, 0x83, 0xF0, 0x83, 0xF1, 0x83, 0xF2, /* 0xC0-0xC3 */
        0xC6, 0xE0, 0x83, 0xF3, 0xD7, 0xBC, 0xDA, 0xA1, /* 0xC4-0xC7 */
        0x83, 0xF4, 0xC1, 0xB9, 0x83, 0xF5, 0xB5, 0xF2, /* 0xC8-0xCB */
        0xC1, 0xE8, 0x83, 0xF6, 0x83, 0xF7, 0xBC, 0xF5, /* 0xCC-0xCF */
        0x83, 0xF8, 0xB4, 0xD5, 0x83, 0xF9, 0x83, 0xFA, /* 0xD0-0xD3 */
        0x83, 0xFB, 0x83, 0xFC, 0x83, 0xFD, 0x83, 0xFE, /* 0xD4-0xD7 */
        0x84, 0x40, 0x84, 0x41, 0x84, 0x42, 0xC1, 0xDD, /* 0xD8-0xDB */
        0x84, 0x43, 0xC4, 0xFD, 0x84, 0x44, 0x84, 0x45, /* 0xDC-0xDF */
        0xBC, 0xB8, 0xB7, 0xB2, 0x84, 0x46, 0x84, 0x47, /* 0xE0-0xE3 */
        0xB7, 0xEF, 0x84, 0x48, 0x84, 0x49, 0x84, 0x4A, /* 0xE4-0xE7 */
        0x84, 0x4B, 0x84, 0x4C, 0x84, 0x4D, 0xD9, 0xEC, /* 0xE8-0xEB */
        0x84, 0x4E, 0xC6, 0xBE, 0x84, 0x4F, 0xBF, 0xAD, /* 0xEC-0xEF */
        0xBB, 0xCB, 0x84, 0x50, 0x84, 0x51, 0xB5, 0xCA, /* 0xF0-0xF3 */
        0x84, 0x52, 0xDB, 0xC9, 0xD0, 0xD7, 0x84, 0x53, /* 0xF4-0xF7 */
        0xCD, 0xB9, 0xB0, 0xBC, 0xB3, 0xF6, 0xBB, 0xF7, /* 0xF8-0xFB */
        0xDB, 0xCA, 0xBA, 0xAF, 0x84, 0x54, 0xD4, 0xE4, /* 0xFC-0xFF */
};

static const unsigned char u2c_52[512] = {
        0xB5, 0xB6, 0xB5, 0xF3, 0xD8, 0xD6, 0xC8, 0xD0, /* 0x00-0x03 */
        0x84, 0x55, 0x84, 0x56, 0xB7, 0xD6, 0xC7, 0xD0, /* 0x04-0x07 */
        0xD8, 0xD7, 0x84, 0x57, 0xBF, 0xAF, 0x84, 0x58, /* 0x08-0x0B */
        0x84, 0x59, 0xDB, 0xBB, 0xD8, 0xD8, 0x84, 0x5A, /* 0x0C-0x0F */
        0x84, 0x5B, 0xD0, 0xCC, 0xBB, 0xAE, 0x84, 0x5C, /* 0x10-0x13 */
        0x84, 0x5D, 0x84, 0x5E, 0xEB, 0xBE, 0xC1, 0xD0, /* 0x14-0x17 */
        0xC1, 0xF5, 0xD4, 0xF2, 0xB8, 0xD5, 0xB4, 0xB4, /* 0x18-0x1B */
        0x84, 0x5F, 0xB3, 0xF5, 0x84, 0x60, 0x84, 0x61, /* 0x1C-0x1F */
        0xC9, 0xBE, 0x84, 0x62, 0x84, 0x63, 0x84, 0x64, /* 0x20-0x23 */
        0xC5, 0xD0, 0x84, 0x65, 0x84, 0x66, 0x84, 0x67, /* 0x24-0x27 */
        0xC5, 0xD9, 0xC0, 0xFB, 0x84, 0x68, 0xB1, 0xF0, /* 0x28-0x2B */
        0x84, 0x69, 0xD8, 0xD9, 0xB9, 0xCE, 0x84, 0x6A, /* 0x2C-0x2F */
        0xB5, 0xBD, 0x84, 0x6B, 0x84, 0x6C, 0xD8, 0xDA, /* 0x30-0x33 */
        0x84, 0x6D, 0x84, 0x6E, 0xD6, 0xC6, 0xCB, 0xA2, /* 0x34-0x37 */
        0xC8, 0xAF, 0xC9, 0xB2, 0xB4, 0xCC, 0xBF, 0xCC, /* 0x38-0x3B */
        0x84, 0x6F, 0xB9, 0xF4, 0x84, 0x70, 0xD8, 0xDB, /* 0x3C-0x3F */
        0xD8, 0xDC, 0xB6, 0xE7, 0xBC, 0xC1, 0xCC, 0xEA, /* 0x40-0x43 */
        0x84, 0x71, 0x84, 0x72, 0x84, 0x73, 0x84, 0x74, /* 0x44-0x47 */
        0x84, 0x75, 0x84, 0x76, 0xCF, 0xF7, 0x84, 0x77, /* 0x48-0x4B */
        0xD8, 0xDD, 0xC7, 0xB0, 0x84, 0x78, 0x84, 0x79, /* 0x4C-0x4F */
        0xB9, 0xD0, 0xBD, 0xA3, 0x84, 0x7A, 0x84, 0x7B, /* 0x50-0x53 */
        0xCC, 0xDE, 0x84, 0x7C, 0xC6, 0xCA, 0x84, 0x7D, /* 0x54-0x57 */
        0x84, 0x7E, 0x84, 0x80, 0x84, 0x81, 0x84, 0x82, /* 0x58-0x5B */
        0xD8, 0xE0, 0x84, 0x83, 0xD8, 0xDE, 0x84, 0x84, /* 0x5C-0x5F */
        0x84, 0x85, 0xD8, 0xDF, 0x84, 0x86, 0x84, 0x87, /* 0x60-0x63 */
        0x84, 0x88, 0xB0, 0xFE, 0x84, 0x89, 0xBE, 0xE7, /* 0x64-0x67 */
        0x84, 0x8A, 0xCA, 0xA3, 0xBC, 0xF4, 0x84, 0x8B, /* 0x68-0x6B */
        0x84, 0x8C, 0x84, 0x8D, 0x84, 0x8E, 0xB8, 0xB1, /* 0x6C-0x6F */
        0x84, 0x8F, 0x84, 0x90, 0xB8, 0xEE, 0x84, 0x91, /* 0x70-0x73 */
        0x84, 0x92, 0x84, 0x93, 0x84, 0x94, 0x84, 0x95, /* 0x74-0x77 */
        0x84, 0x96, 0x84, 0x97, 0x84, 0x98, 0x84, 0x99, /* 0x78-0x7B */
        0x84, 0x9A, 0xD8, 0xE2, 0x84, 0x9B, 0xBD, 0xCB, /* 0x7C-0x7F */
        
        0x84, 0x9C, 0xD8, 0xE4, 0xD8, 0xE3, 0x84, 0x9D, /* 0x80-0x83 */
        0x84, 0x9E, 0x84, 0x9F, 0x84, 0xA0, 0x84, 0xA1, /* 0x84-0x87 */
        0xC5, 0xFC, 0x84, 0xA2, 0x84, 0xA3, 0x84, 0xA4, /* 0x88-0x8B */
        0x84, 0xA5, 0x84, 0xA6, 0x84, 0xA7, 0x84, 0xA8, /* 0x8C-0x8F */
        0xD8, 0xE5, 0x84, 0xA9, 0x84, 0xAA, 0xD8, 0xE6, /* 0x90-0x93 */
        0x84, 0xAB, 0x84, 0xAC, 0x84, 0xAD, 0x84, 0xAE, /* 0x94-0x97 */
        0x84, 0xAF, 0x84, 0xB0, 0x84, 0xB1, 0xC1, 0xA6, /* 0x98-0x9B */
        0x84, 0xB2, 0xC8, 0xB0, 0xB0, 0xEC, 0xB9, 0xA6, /* 0x9C-0x9F */
        0xBC, 0xD3, 0xCE, 0xF1, 0xDB, 0xBD, 0xC1, 0xD3, /* 0xA0-0xA3 */
        0x84, 0xB3, 0x84, 0xB4, 0x84, 0xB5, 0x84, 0xB6, /* 0xA4-0xA7 */
        0xB6, 0xAF, 0xD6, 0xFA, 0xC5, 0xAC, 0xBD, 0xD9, /* 0xA8-0xAB */
        0xDB, 0xBE, 0xDB, 0xBF, 0x84, 0xB7, 0x84, 0xB8, /* 0xAC-0xAF */
        0x84, 0xB9, 0xC0, 0xF8, 0xBE, 0xA2, 0xC0, 0xCD, /* 0xB0-0xB3 */
        0x84, 0xBA, 0x84, 0xBB, 0x84, 0xBC, 0x84, 0xBD, /* 0xB4-0xB7 */
        0x84, 0xBE, 0x84, 0xBF, 0x84, 0xC0, 0x84, 0xC1, /* 0xB8-0xBB */
        0x84, 0xC2, 0x84, 0xC3, 0xDB, 0xC0, 0xCA, 0xC6, /* 0xBC-0xBF */
        0x84, 0xC4, 0x84, 0xC5, 0x84, 0xC6, 0xB2, 0xAA, /* 0xC0-0xC3 */
        0x84, 0xC7, 0x84, 0xC8, 0x84, 0xC9, 0xD3, 0xC2, /* 0xC4-0xC7 */
        0x84, 0xCA, 0xC3, 0xE3, 0x84, 0xCB, 0xD1, 0xAB, /* 0xC8-0xCB */
        0x84, 0xCC, 0x84, 0xCD, 0x84, 0xCE, 0x84, 0xCF, /* 0xCC-0xCF */
        0xDB, 0xC2, 0x84, 0xD0, 0xC0, 0xD5, 0x84, 0xD1, /* 0xD0-0xD3 */
        0x84, 0xD2, 0x84, 0xD3, 0xDB, 0xC3, 0x84, 0xD4, /* 0xD4-0xD7 */
        0xBF, 0xB1, 0x84, 0xD5, 0x84, 0xD6, 0x84, 0xD7, /* 0xD8-0xDB */
        0x84, 0xD8, 0x84, 0xD9, 0x84, 0xDA, 0xC4, 0xBC, /* 0xDC-0xDF */
        0x84, 0xDB, 0x84, 0xDC, 0x84, 0xDD, 0x84, 0xDE, /* 0xE0-0xE3 */
        0xC7, 0xDA, 0x84, 0xDF, 0x84, 0xE0, 0x84, 0xE1, /* 0xE4-0xE7 */
        0x84, 0xE2, 0x84, 0xE3, 0x84, 0xE4, 0x84, 0xE5, /* 0xE8-0xEB */
        0x84, 0xE6, 0x84, 0xE7, 0x84, 0xE8, 0x84, 0xE9, /* 0xEC-0xEF */
        0xDB, 0xC4, 0x84, 0xEA, 0x84, 0xEB, 0x84, 0xEC, /* 0xF0-0xF3 */
        0x84, 0xED, 0x84, 0xEE, 0x84, 0xEF, 0x84, 0xF0, /* 0xF4-0xF7 */
        0x84, 0xF1, 0xD9, 0xE8, 0xC9, 0xD7, 0x84, 0xF2, /* 0xF8-0xFB */
        0x84, 0xF3, 0x84, 0xF4, 0xB9, 0xB4, 0xCE, 0xF0, /* 0xFC-0xFF */
};

static const unsigned char u2c_53[512] = {
        0xD4, 0xC8, 0x84, 0xF5, 0x84, 0xF6, 0x84, 0xF7, /* 0x00-0x03 */
        0x84, 0xF8, 0xB0, 0xFC, 0xB4, 0xD2, 0x84, 0xF9, /* 0x04-0x07 */
        0xD0, 0xD9, 0x84, 0xFA, 0x84, 0xFB, 0x84, 0xFC, /* 0x08-0x0B */
        0x84, 0xFD, 0xD9, 0xE9, 0x84, 0xFE, 0xDE, 0xCB, /* 0x0C-0x0F */
        0xD9, 0xEB, 0x85, 0x40, 0x85, 0x41, 0x85, 0x42, /* 0x10-0x13 */
        0x85, 0x43, 0xD8, 0xB0, 0xBB, 0xAF, 0xB1, 0xB1, /* 0x14-0x17 */
        0x85, 0x44, 0xB3, 0xD7, 0xD8, 0xCE, 0x85, 0x45, /* 0x18-0x1B */
        0x85, 0x46, 0xD4, 0xD1, 0x85, 0x47, 0x85, 0x48, /* 0x1C-0x1F */
        0xBD, 0xB3, 0xBF, 0xEF, 0x85, 0x49, 0xCF, 0xBB, /* 0x20-0x23 */
        0x85, 0x4A, 0x85, 0x4B, 0xD8, 0xD0, 0x85, 0x4C, /* 0x24-0x27 */
        0x85, 0x4D, 0x85, 0x4E, 0xB7, 0xCB, 0x85, 0x4F, /* 0x28-0x2B */
        0x85, 0x50, 0x85, 0x51, 0xD8, 0xD1, 0x85, 0x52, /* 0x2C-0x2F */
        0x85, 0x53, 0x85, 0x54, 0x85, 0x55, 0x85, 0x56, /* 0x30-0x33 */
        0x85, 0x57, 0x85, 0x58, 0x85, 0x59, 0x85, 0x5A, /* 0x34-0x37 */
        0x85, 0x5B, 0xC6, 0xA5, 0xC7, 0xF8, 0xD2, 0xBD, /* 0x38-0x3B */
        0x85, 0x5C, 0x85, 0x5D, 0xD8, 0xD2, 0xC4, 0xE4, /* 0x3C-0x3F */
        0x85, 0x5E, 0xCA, 0xAE, 0x85, 0x5F, 0xC7, 0xA7, /* 0x40-0x43 */
        0x85, 0x60, 0xD8, 0xA6, 0x85, 0x61, 0xC9, 0xFD, /* 0x44-0x47 */
        0xCE, 0xE7, 0xBB, 0xDC, 0xB0, 0xEB, 0x85, 0x62, /* 0x48-0x4B */
        0x85, 0x63, 0x85, 0x64, 0xBB, 0xAA, 0xD0, 0xAD, /* 0x4C-0x4F */
        0x85, 0x65, 0xB1, 0xB0, 0xD7, 0xE4, 0xD7, 0xBF, /* 0x50-0x53 */
        0x85, 0x66, 0xB5, 0xA5, 0xC2, 0xF4, 0xC4, 0xCF, /* 0x54-0x57 */
        0x85, 0x67, 0x85, 0x68, 0xB2, 0xA9, 0x85, 0x69, /* 0x58-0x5B */
        0xB2, 0xB7, 0x85, 0x6A, 0xB1, 0xE5, 0xDF, 0xB2, /* 0x5C-0x5F */
        0xD5, 0xBC, 0xBF, 0xA8, 0xC2, 0xAC, 0xD8, 0xD5, /* 0x60-0x63 */
        0xC2, 0xB1, 0x85, 0x6B, 0xD8, 0xD4, 0xCE, 0xD4, /* 0x64-0x67 */
        0x85, 0x6C, 0xDA, 0xE0, 0x85, 0x6D, 0xCE, 0xC0, /* 0x68-0x6B */
        0x85, 0x6E, 0x85, 0x6F, 0xD8, 0xB4, 0xC3, 0xAE, /* 0x6C-0x6F */
        0xD3, 0xA1, 0xCE, 0xA3, 0x85, 0x70, 0xBC, 0xB4, /* 0x70-0x73 */
        0xC8, 0xB4, 0xC2, 0xD1, 0x85, 0x71, 0xBE, 0xED, /* 0x74-0x77 */
        0xD0, 0xB6, 0x85, 0x72, 0xDA, 0xE1, 0x85, 0x73, /* 0x78-0x7B */
        0x85, 0x74, 0x85, 0x75, 0x85, 0x76, 0xC7, 0xE4, /* 0x7C-0x7F */
        
        0x85, 0x77, 0x85, 0x78, 0xB3, 0xA7, 0x85, 0x79, /* 0x80-0x83 */
        0xB6, 0xF2, 0xCC, 0xFC, 0xC0, 0xFA, 0x85, 0x7A, /* 0x84-0x87 */
        0x85, 0x7B, 0xC0, 0xF7, 0x85, 0x7C, 0xD1, 0xB9, /* 0x88-0x8B */
        0xD1, 0xE1, 0xD8, 0xC7, 0x85, 0x7D, 0x85, 0x7E, /* 0x8C-0x8F */
        0x85, 0x80, 0x85, 0x81, 0x85, 0x82, 0x85, 0x83, /* 0x90-0x93 */
        0x85, 0x84, 0xB2, 0xDE, 0x85, 0x85, 0x85, 0x86, /* 0x94-0x97 */
        0xC0, 0xE5, 0x85, 0x87, 0xBA, 0xF1, 0x85, 0x88, /* 0x98-0x9B */
        0x85, 0x89, 0xD8, 0xC8, 0x85, 0x8A, 0xD4, 0xAD, /* 0x9C-0x9F */
        0x85, 0x8B, 0x85, 0x8C, 0xCF, 0xE1, 0xD8, 0xC9, /* 0xA0-0xA3 */
        0x85, 0x8D, 0xD8, 0xCA, 0xCF, 0xC3, 0x85, 0x8E, /* 0xA4-0xA7 */
        0xB3, 0xF8, 0xBE, 0xC7, 0x85, 0x8F, 0x85, 0x90, /* 0xA8-0xAB */
        0x85, 0x91, 0x85, 0x92, 0xD8, 0xCB, 0x85, 0x93, /* 0xAC-0xAF */
        0x85, 0x94, 0x85, 0x95, 0x85, 0x96, 0x85, 0x97, /* 0xB0-0xB3 */
        0x85, 0x98, 0x85, 0x99, 0xDB, 0xCC, 0x85, 0x9A, /* 0xB4-0xB7 */
        0x85, 0x9B, 0x85, 0x9C, 0x85, 0x9D, 0xC8, 0xA5, /* 0xB8-0xBB */
        0x85, 0x9E, 0x85, 0x9F, 0x85, 0xA0, 0xCF, 0xD8, /* 0xBC-0xBF */
        0x85, 0xA1, 0xC8, 0xFE, 0xB2, 0xCE, 0x85, 0xA2, /* 0xC0-0xC3 */
        0x85, 0xA3, 0x85, 0xA4, 0x85, 0xA5, 0x85, 0xA6, /* 0xC4-0xC7 */
        0xD3, 0xD6, 0xB2, 0xE6, 0xBC, 0xB0, 0xD3, 0xD1, /* 0xC8-0xCB */
        0xCB, 0xAB, 0xB7, 0xB4, 0x85, 0xA7, 0x85, 0xA8, /* 0xCC-0xCF */
        0x85, 0xA9, 0xB7, 0xA2, 0x85, 0xAA, 0x85, 0xAB, /* 0xD0-0xD3 */
        0xCA, 0xE5, 0x85, 0xAC, 0xC8, 0xA1, 0xCA, 0xDC, /* 0xD4-0xD7 */
        0xB1, 0xE4, 0xD0, 0xF0, 0x85, 0xAD, 0xC5, 0xD1, /* 0xD8-0xDB */
        0x85, 0xAE, 0x85, 0xAF, 0x85, 0xB0, 0xDB, 0xC5, /* 0xDC-0xDF */
        0xB5, 0xFE, 0x85, 0xB1, 0x85, 0xB2, 0xBF, 0xDA, /* 0xE0-0xE3 */
        0xB9, 0xC5, 0xBE, 0xE4, 0xC1, 0xED, 0x85, 0xB3, /* 0xE4-0xE7 */
        0xDF, 0xB6, 0xDF, 0xB5, 0xD6, 0xBB, 0xBD, 0xD0, /* 0xE8-0xEB */
        0xD5, 0xD9, 0xB0, 0xC8, 0xB6, 0xA3, 0xBF, 0xC9, /* 0xEC-0xEF */
        0xCC, 0xA8, 0xDF, 0xB3, 0xCA, 0xB7, 0xD3, 0xD2, /* 0xF0-0xF3 */
        0x85, 0xB4, 0xD8, 0xCF, 0xD2, 0xB6, 0xBA, 0xC5, /* 0xF4-0xF7 */
        0xCB, 0xBE, 0xCC, 0xBE, 0x85, 0xB5, 0xDF, 0xB7, /* 0xF8-0xFB */
        0xB5, 0xF0, 0xDF, 0xB4, 0x85, 0xB6, 0x85, 0xB7, /* 0xFC-0xFF */
};

static const unsigned char u2c_54[512] = {
        0x85, 0xB8, 0xD3, 0xF5, 0x85, 0xB9, 0xB3, 0xD4, /* 0x00-0x03 */
        0xB8, 0xF7, 0x85, 0xBA, 0xDF, 0xBA, 0x85, 0xBB, /* 0x04-0x07 */
        0xBA, 0xCF, 0xBC, 0xAA, 0xB5, 0xF5, 0x85, 0xBC, /* 0x08-0x0B */
        0xCD, 0xAC, 0xC3, 0xFB, 0xBA, 0xF3, 0xC0, 0xF4, /* 0x0C-0x0F */
        0xCD, 0xC2, 0xCF, 0xF2, 0xDF, 0xB8, 0xCF, 0xC5, /* 0x10-0x13 */
        0x85, 0xBD, 0xC2, 0xC0, 0xDF, 0xB9, 0xC2, 0xF0, /* 0x14-0x17 */
        0x85, 0xBE, 0x85, 0xBF, 0x85, 0xC0, 0xBE, 0xFD, /* 0x18-0x1B */
        0x85, 0xC1, 0xC1, 0xDF, 0xCD, 0xCC, 0xD2, 0xF7, /* 0x1C-0x1F */
        0xB7, 0xCD, 0xDF, 0xC1, 0x85, 0xC2, 0xDF, 0xC4, /* 0x20-0x23 */
        0x85, 0xC3, 0x85, 0xC4, 0xB7, 0xF1, 0xB0, 0xC9, /* 0x24-0x27 */
        0xB6, 0xD6, 0xB7, 0xD4, 0x85, 0xC5, 0xBA, 0xAC, /* 0x28-0x2B */
        0xCC, 0xFD, 0xBF, 0xD4, 0xCB, 0xB1, 0xC6, 0xF4, /* 0x2C-0x2F */
        0x85, 0xC6, 0xD6, 0xA8, 0xDF, 0xC5, 0x85, 0xC7, /* 0x30-0x33 */
        0xCE, 0xE2, 0xB3, 0xB3, 0x85, 0xC8, 0x85, 0xC9, /* 0x34-0x37 */
        0xCE, 0xFC, 0xB4, 0xB5, 0x85, 0xCA, 0xCE, 0xC7, /* 0x38-0x3B */
        0xBA, 0xF0, 0x85, 0xCB, 0xCE, 0xE1, 0x85, 0xCC, /* 0x3C-0x3F */
        0xD1, 0xBD, 0x85, 0xCD, 0x85, 0xCE, 0xDF, 0xC0, /* 0x40-0x43 */
        0x85, 0xCF, 0x85, 0xD0, 0xB4, 0xF4, 0x85, 0xD1, /* 0x44-0x47 */
        0xB3, 0xCA, 0x85, 0xD2, 0xB8, 0xE6, 0xDF, 0xBB, /* 0x48-0x4B */
        0x85, 0xD3, 0x85, 0xD4, 0x85, 0xD5, 0x85, 0xD6, /* 0x4C-0x4F */
        0xC4, 0xC5, 0x85, 0xD7, 0xDF, 0xBC, 0xDF, 0xBD, /* 0x50-0x53 */
        0xDF, 0xBE, 0xC5, 0xBB, 0xDF, 0xBF, 0xDF, 0xC2, /* 0x54-0x57 */
        0xD4, 0xB1, 0xDF, 0xC3, 0x85, 0xD8, 0xC7, 0xBA, /* 0x58-0x5B */
        0xCE, 0xD8, 0x85, 0xD9, 0x85, 0xDA, 0x85, 0xDB, /* 0x5C-0x5F */
        0x85, 0xDC, 0x85, 0xDD, 0xC4, 0xD8, 0x85, 0xDE, /* 0x60-0x63 */
        0xDF, 0xCA, 0x85, 0xDF, 0xDF, 0xCF, 0x85, 0xE0, /* 0x64-0x67 */
        0xD6, 0xDC, 0x85, 0xE1, 0x85, 0xE2, 0x85, 0xE3, /* 0x68-0x6B */
        0x85, 0xE4, 0x85, 0xE5, 0x85, 0xE6, 0x85, 0xE7, /* 0x6C-0x6F */
        0x85, 0xE8, 0xDF, 0xC9, 0xDF, 0xDA, 0xCE, 0xB6, /* 0x70-0x73 */
        0x85, 0xE9, 0xBA, 0xC7, 0xDF, 0xCE, 0xDF, 0xC8, /* 0x74-0x77 */
        0xC5, 0xDE, 0x85, 0xEA, 0x85, 0xEB, 0xC9, 0xEB, /* 0x78-0x7B */
        0xBA, 0xF4, 0xC3, 0xFC, 0x85, 0xEC, 0x85, 0xED, /* 0x7C-0x7F */
        
        0xBE, 0xD7, 0x85, 0xEE, 0xDF, 0xC6, 0x85, 0xEF, /* 0x80-0x83 */
        0xDF, 0xCD, 0x85, 0xF0, 0xC5, 0xD8, 0x85, 0xF1, /* 0x84-0x87 */
        0x85, 0xF2, 0x85, 0xF3, 0x85, 0xF4, 0xD5, 0xA6, /* 0x88-0x8B */
        0xBA, 0xCD, 0x85, 0xF5, 0xBE, 0xCC, 0xD3, 0xBD, /* 0x8C-0x8F */
        0xB8, 0xC0, 0x85, 0xF6, 0xD6, 0xE4, 0x85, 0xF7, /* 0x90-0x93 */
        0xDF, 0xC7, 0xB9, 0xBE, 0xBF, 0xA7, 0x85, 0xF8, /* 0x94-0x97 */
        0x85, 0xF9, 0xC1, 0xFC, 0xDF, 0xCB, 0xDF, 0xCC, /* 0x98-0x9B */
        0x85, 0xFA, 0xDF, 0xD0, 0x85, 0xFB, 0x85, 0xFC, /* 0x9C-0x9F */
        0x85, 0xFD, 0x85, 0xFE, 0x86, 0x40, 0xDF, 0xDB, /* 0xA0-0xA3 */
        0xDF, 0xE5, 0x86, 0x41, 0xDF, 0xD7, 0xDF, 0xD6, /* 0xA4-0xA7 */
        0xD7, 0xC9, 0xDF, 0xE3, 0xDF, 0xE4, 0xE5, 0xEB, /* 0xA8-0xAB */
        0xD2, 0xA7, 0xDF, 0xD2, 0x86, 0x42, 0xBF, 0xA9, /* 0xAC-0xAF */
        0x86, 0x43, 0xD4, 0xDB, 0x86, 0x44, 0xBF, 0xC8, /* 0xB0-0xB3 */
        0xDF, 0xD4, 0x86, 0x45, 0x86, 0x46, 0x86, 0x47, /* 0xB4-0xB7 */
        0xCF, 0xCC, 0x86, 0x48, 0x86, 0x49, 0xDF, 0xDD, /* 0xB8-0xBB */
        0x86, 0x4A, 0xD1, 0xCA, 0x86, 0x4B, 0xDF, 0xDE, /* 0xBC-0xBF */
        0xB0, 0xA7, 0xC6, 0xB7, 0xDF, 0xD3, 0x86, 0x4C, /* 0xC0-0xC3 */
        0xBA, 0xE5, 0x86, 0x4D, 0xB6, 0xDF, 0xCD, 0xDB, /* 0xC4-0xC7 */
        0xB9, 0xFE, 0xD4, 0xD5, 0x86, 0x4E, 0x86, 0x4F, /* 0xC8-0xCB */
        0xDF, 0xDF, 0xCF, 0xEC, 0xB0, 0xA5, 0xDF, 0xE7, /* 0xCC-0xCF */
        0xDF, 0xD1, 0xD1, 0xC6, 0xDF, 0xD5, 0xDF, 0xD8, /* 0xD0-0xD3 */
        0xDF, 0xD9, 0xDF, 0xDC, 0x86, 0x50, 0xBB, 0xA9, /* 0xD4-0xD7 */
        0x86, 0x51, 0xDF, 0xE0, 0xDF, 0xE1, 0x86, 0x52, /* 0xD8-0xDB */
        0xDF, 0xE2, 0xDF, 0xE6, 0xDF, 0xE8, 0xD3, 0xB4, /* 0xDC-0xDF */
        0x86, 0x53, 0x86, 0x54, 0x86, 0x55, 0x86, 0x56, /* 0xE0-0xE3 */
        0x86, 0x57, 0xB8, 0xE7, 0xC5, 0xB6, 0xDF, 0xEA, /* 0xE4-0xE7 */
        0xC9, 0xDA, 0xC1, 0xA8, 0xC4, 0xC4, 0x86, 0x58, /* 0xE8-0xEB */
        0x86, 0x59, 0xBF, 0xDE, 0xCF, 0xF8, 0x86, 0x5A, /* 0xEC-0xEF */
        0x86, 0x5B, 0x86, 0x5C, 0xD5, 0xDC, 0xDF, 0xEE, /* 0xF0-0xF3 */
        0x86, 0x5D, 0x86, 0x5E, 0x86, 0x5F, 0x86, 0x60, /* 0xF4-0xF7 */
        0x86, 0x61, 0x86, 0x62, 0xB2, 0xB8, 0x86, 0x63, /* 0xF8-0xFB */
        0xBA, 0xDF, 0xDF, 0xEC, 0x86, 0x64, 0xDB, 0xC1, /* 0xFC-0xFF */
};

static const unsigned char u2c_55[512] = {
        0x86, 0x65, 0xD1, 0xE4, 0x86, 0x66, 0x86, 0x67, /* 0x00-0x03 */
        0x86, 0x68, 0x86, 0x69, 0xCB, 0xF4, 0xB4, 0xBD, /* 0x04-0x07 */
        0x86, 0x6A, 0xB0, 0xA6, 0x86, 0x6B, 0x86, 0x6C, /* 0x08-0x0B */
        0x86, 0x6D, 0x86, 0x6E, 0x86, 0x6F, 0xDF, 0xF1, /* 0x0C-0x0F */
        0xCC, 0xC6, 0xDF, 0xF2, 0x86, 0x70, 0x86, 0x71, /* 0x10-0x13 */
        0xDF, 0xED, 0x86, 0x72, 0x86, 0x73, 0x86, 0x74, /* 0x14-0x17 */
        0x86, 0x75, 0x86, 0x76, 0x86, 0x77, 0xDF, 0xE9, /* 0x18-0x1B */
        0x86, 0x78, 0x86, 0x79, 0x86, 0x7A, 0x86, 0x7B, /* 0x1C-0x1F */
        0xDF, 0xEB, 0x86, 0x7C, 0xDF, 0xEF, 0xDF, 0xF0, /* 0x20-0x23 */
        0xBB, 0xBD, 0x86, 0x7D, 0x86, 0x7E, 0xDF, 0xF3, /* 0x24-0x27 */
        0x86, 0x80, 0x86, 0x81, 0xDF, 0xF4, 0x86, 0x82, /* 0x28-0x2B */
        0xBB, 0xA3, 0x86, 0x83, 0xCA, 0xDB, 0xCE, 0xA8, /* 0x2C-0x2F */
        0xE0, 0xA7, 0xB3, 0xAA, 0x86, 0x84, 0xE0, 0xA6, /* 0x30-0x33 */
        0x86, 0x85, 0x86, 0x86, 0x86, 0x87, 0xE0, 0xA1, /* 0x34-0x37 */
        0x86, 0x88, 0x86, 0x89, 0x86, 0x8A, 0x86, 0x8B, /* 0x38-0x3B */
        0xDF, 0xFE, 0x86, 0x8C, 0xCD, 0xD9, 0xDF, 0xFC, /* 0x3C-0x3F */
        0x86, 0x8D, 0xDF, 0xFA, 0x86, 0x8E, 0xBF, 0xD0, /* 0x40-0x43 */
        0xD7, 0xC4, 0x86, 0x8F, 0xC9, 0xCC, 0x86, 0x90, /* 0x44-0x47 */
        0x86, 0x91, 0xDF, 0xF8, 0xB0, 0xA1, 0x86, 0x92, /* 0x48-0x4B */
        0x86, 0x93, 0x86, 0x94, 0x86, 0x95, 0x86, 0x96, /* 0x4C-0x4F */
        0xDF, 0xFD, 0x86, 0x97, 0x86, 0x98, 0x86, 0x99, /* 0x50-0x53 */
        0x86, 0x9A, 0xDF, 0xFB, 0xE0, 0xA2, 0x86, 0x9B, /* 0x54-0x57 */
        0x86, 0x9C, 0x86, 0x9D, 0x86, 0x9E, 0x86, 0x9F, /* 0x58-0x5B */
        0xE0, 0xA8, 0x86, 0xA0, 0x86, 0xA1, 0x86, 0xA2, /* 0x5C-0x5F */
        0x86, 0xA3, 0xB7, 0xC8, 0x86, 0xA4, 0x86, 0xA5, /* 0x60-0x63 */
        0xC6, 0xA1, 0xC9, 0xB6, 0xC0, 0xB2, 0xDF, 0xF5, /* 0x64-0x67 */
        0x86, 0xA6, 0x86, 0xA7, 0xC5, 0xBE, 0x86, 0xA8, /* 0x68-0x6B */
        0xD8, 0xC4, 0xDF, 0xF9, 0xC4, 0xF6, 0x86, 0xA9, /* 0x6C-0x6F */
        0x86, 0xAA, 0x86, 0xAB, 0x86, 0xAC, 0x86, 0xAD, /* 0x70-0x73 */
        0x86, 0xAE, 0xE0, 0xA3, 0xE0, 0xA4, 0xE0, 0xA5, /* 0x74-0x77 */
        0xD0, 0xA5, 0x86, 0xAF, 0x86, 0xB0, 0xE0, 0xB4, /* 0x78-0x7B */
        0xCC, 0xE4, 0x86, 0xB1, 0xE0, 0xB1, 0x86, 0xB2, /* 0x7C-0x7F */
        
        0xBF, 0xA6, 0xE0, 0xAF, 0xCE, 0xB9, 0xE0, 0xAB, /* 0x80-0x83 */
        0xC9, 0xC6, 0x86, 0xB3, 0x86, 0xB4, 0xC0, 0xAE, /* 0x84-0x87 */
        0xE0, 0xAE, 0xBA, 0xED, 0xBA, 0xB0, 0xE0, 0xA9, /* 0x88-0x8B */
        0x86, 0xB5, 0x86, 0xB6, 0x86, 0xB7, 0xDF, 0xF6, /* 0x8C-0x8F */
        0x86, 0xB8, 0xE0, 0xB3, 0x86, 0xB9, 0x86, 0xBA, /* 0x90-0x93 */
        0xE0, 0xB8, 0x86, 0xBB, 0x86, 0xBC, 0x86, 0xBD, /* 0x94-0x97 */
        0xB4, 0xAD, 0xE0, 0xB9, 0x86, 0xBE, 0x86, 0xBF, /* 0x98-0x9B */
        0xCF, 0xB2, 0xBA, 0xC8, 0x86, 0xC0, 0xE0, 0xB0, /* 0x9C-0x9F */
        0x86, 0xC1, 0x86, 0xC2, 0x86, 0xC3, 0x86, 0xC4, /* 0xA0-0xA3 */
        0x86, 0xC5, 0x86, 0xC6, 0x86, 0xC7, 0xD0, 0xFA, /* 0xA4-0xA7 */
        0x86, 0xC8, 0x86, 0xC9, 0x86, 0xCA, 0x86, 0xCB, /* 0xA8-0xAB */
        0x86, 0xCC, 0x86, 0xCD, 0x86, 0xCE, 0x86, 0xCF, /* 0xAC-0xAF */
        0x86, 0xD0, 0xE0, 0xAC, 0x86, 0xD1, 0xD4, 0xFB, /* 0xB0-0xB3 */
        0x86, 0xD2, 0xDF, 0xF7, 0x86, 0xD3, 0xC5, 0xE7, /* 0xB4-0xB7 */
        0x86, 0xD4, 0xE0, 0xAD, 0x86, 0xD5, 0xD3, 0xF7, /* 0xB8-0xBB */
        0x86, 0xD6, 0xE0, 0xB6, 0xE0, 0xB7, 0x86, 0xD7, /* 0xBC-0xBF */
        0x86, 0xD8, 0x86, 0xD9, 0x86, 0xDA, 0x86, 0xDB, /* 0xC0-0xC3 */
        0xE0, 0xC4, 0xD0, 0xE1, 0x86, 0xDC, 0x86, 0xDD, /* 0xC4-0xC7 */
        0x86, 0xDE, 0xE0, 0xBC, 0x86, 0xDF, 0x86, 0xE0, /* 0xC8-0xCB */
        0xE0, 0xC9, 0xE0, 0xCA, 0x86, 0xE1, 0x86, 0xE2, /* 0xCC-0xCF */
        0x86, 0xE3, 0xE0, 0xBE, 0xE0, 0xAA, 0xC9, 0xA4, /* 0xD0-0xD3 */
        0xE0, 0xC1, 0x86, 0xE4, 0xE0, 0xB2, 0x86, 0xE5, /* 0xD4-0xD7 */
        0x86, 0xE6, 0x86, 0xE7, 0x86, 0xE8, 0x86, 0xE9, /* 0xD8-0xDB */
        0xCA, 0xC8, 0xE0, 0xC3, 0x86, 0xEA, 0xE0, 0xB5, /* 0xDC-0xDF */
        0x86, 0xEB, 0xCE, 0xCB, 0x86, 0xEC, 0xCB, 0xC3, /* 0xE0-0xE3 */
        0xE0, 0xCD, 0xE0, 0xC6, 0xE0, 0xC2, 0x86, 0xED, /* 0xE4-0xE7 */
        0xE0, 0xCB, 0x86, 0xEE, 0xE0, 0xBA, 0xE0, 0xBF, /* 0xE8-0xEB */
        0xE0, 0xC0, 0x86, 0xEF, 0x86, 0xF0, 0xE0, 0xC5, /* 0xEC-0xEF */
        0x86, 0xF1, 0x86, 0xF2, 0xE0, 0xC7, 0xE0, 0xC8, /* 0xF0-0xF3 */
        0x86, 0xF3, 0xE0, 0xCC, 0x86, 0xF4, 0xE0, 0xBB, /* 0xF4-0xF7 */
        0x86, 0xF5, 0x86, 0xF6, 0x86, 0xF7, 0x86, 0xF8, /* 0xF8-0xFB */
        0x86, 0xF9, 0xCB, 0xD4, 0xE0, 0xD5, 0x86, 0xFA, /* 0xFC-0xFF */
};

static const unsigned char u2c_56[512] = {
        0xE0, 0xD6, 0xE0, 0xD2, 0x86, 0xFB, 0x86, 0xFC, /* 0x00-0x03 */
        0x86, 0xFD, 0x86, 0xFE, 0x87, 0x40, 0x87, 0x41, /* 0x04-0x07 */
        0xE0, 0xD0, 0xBC, 0xCE, 0x87, 0x42, 0x87, 0x43, /* 0x08-0x0B */
        0xE0, 0xD1, 0x87, 0x44, 0xB8, 0xC2, 0xD8, 0xC5, /* 0x0C-0x0F */
        0x87, 0x45, 0x87, 0x46, 0x87, 0x47, 0x87, 0x48, /* 0x10-0x13 */
        0x87, 0x49, 0x87, 0x4A, 0x87, 0x4B, 0x87, 0x4C, /* 0x14-0x17 */
        0xD0, 0xEA, 0x87, 0x4D, 0x87, 0x4E, 0xC2, 0xEF, /* 0x18-0x1B */
        0x87, 0x4F, 0x87, 0x50, 0xE0, 0xCF, 0xE0, 0xBD, /* 0x1C-0x1F */
        0x87, 0x51, 0x87, 0x52, 0x87, 0x53, 0xE0, 0xD4, /* 0x20-0x23 */
        0xE0, 0xD3, 0x87, 0x54, 0x87, 0x55, 0xE0, 0xD7, /* 0x24-0x27 */
        0x87, 0x56, 0x87, 0x57, 0x87, 0x58, 0x87, 0x59, /* 0x28-0x2B */
        0xE0, 0xDC, 0xE0, 0xD8, 0x87, 0x5A, 0x87, 0x5B, /* 0x2C-0x2F */
        0x87, 0x5C, 0xD6, 0xF6, 0xB3, 0xB0, 0x87, 0x5D, /* 0x30-0x33 */
        0xD7, 0xEC, 0x87, 0x5E, 0xCB, 0xBB, 0x87, 0x5F, /* 0x34-0x37 */
        0x87, 0x60, 0xE0, 0xDA, 0x87, 0x61, 0xCE, 0xFB, /* 0x38-0x3B */
        0x87, 0x62, 0x87, 0x63, 0x87, 0x64, 0xBA, 0xD9, /* 0x3C-0x3F */
        0x87, 0x65, 0x87, 0x66, 0x87, 0x67, 0x87, 0x68, /* 0x40-0x43 */
        0x87, 0x69, 0x87, 0x6A, 0x87, 0x6B, 0x87, 0x6C, /* 0x44-0x47 */
        0x87, 0x6D, 0x87, 0x6E, 0x87, 0x6F, 0x87, 0x70, /* 0x48-0x4B */
        0xE0, 0xE1, 0xE0, 0xDD, 0xD2, 0xAD, 0x87, 0x71, /* 0x4C-0x4F */
        0x87, 0x72, 0x87, 0x73, 0x87, 0x74, 0x87, 0x75, /* 0x50-0x53 */
        0xE0, 0xE2, 0x87, 0x76, 0x87, 0x77, 0xE0, 0xDB, /* 0x54-0x57 */
        0xE0, 0xD9, 0xE0, 0xDF, 0x87, 0x78, 0x87, 0x79, /* 0x58-0x5B */
        0xE0, 0xE0, 0x87, 0x7A, 0x87, 0x7B, 0x87, 0x7C, /* 0x5C-0x5F */
        0x87, 0x7D, 0x87, 0x7E, 0xE0, 0xDE, 0x87, 0x80, /* 0x60-0x63 */
        0xE0, 0xE4, 0x87, 0x81, 0x87, 0x82, 0x87, 0x83, /* 0x64-0x67 */
        0xC6, 0xF7, 0xD8, 0xAC, 0xD4, 0xEB, 0xE0, 0xE6, /* 0x68-0x6B */
        0xCA, 0xC9, 0x87, 0x84, 0x87, 0x85, 0x87, 0x86, /* 0x6C-0x6F */
        0x87, 0x87, 0xE0, 0xE5, 0x87, 0x88, 0x87, 0x89, /* 0x70-0x73 */
        0x87, 0x8A, 0x87, 0x8B, 0xB8, 0xC1, 0x87, 0x8C, /* 0x74-0x77 */
        0x87, 0x8D, 0x87, 0x8E, 0x87, 0x8F, 0xE0, 0xE7, /* 0x78-0x7B */
        0xE0, 0xE8, 0x87, 0x90, 0x87, 0x91, 0x87, 0x92, /* 0x7C-0x7F */
        
        0x87, 0x93, 0x87, 0x94, 0x87, 0x95, 0x87, 0x96, /* 0x80-0x83 */
        0x87, 0x97, 0xE0, 0xE9, 0xE0, 0xE3, 0x87, 0x98, /* 0x84-0x87 */
        0x87, 0x99, 0x87, 0x9A, 0x87, 0x9B, 0x87, 0x9C, /* 0x88-0x8B */
        0x87, 0x9D, 0x87, 0x9E, 0xBA, 0xBF, 0xCC, 0xE7, /* 0x8C-0x8F */
        0x87, 0x9F, 0x87, 0xA0, 0x87, 0xA1, 0xE0, 0xEA, /* 0x90-0x93 */
        0x87, 0xA2, 0x87, 0xA3, 0x87, 0xA4, 0x87, 0xA5, /* 0x94-0x97 */
        0x87, 0xA6, 0x87, 0xA7, 0x87, 0xA8, 0x87, 0xA9, /* 0x98-0x9B */
        0x87, 0xAA, 0x87, 0xAB, 0x87, 0xAC, 0x87, 0xAD, /* 0x9C-0x9F */
        0x87, 0xAE, 0x87, 0xAF, 0x87, 0xB0, 0xCF, 0xF9, /* 0xA0-0xA3 */
        0x87, 0xB1, 0x87, 0xB2, 0x87, 0xB3, 0x87, 0xB4, /* 0xA4-0xA7 */
        0x87, 0xB5, 0x87, 0xB6, 0x87, 0xB7, 0x87, 0xB8, /* 0xA8-0xAB */
        0x87, 0xB9, 0x87, 0xBA, 0x87, 0xBB, 0xE0, 0xEB, /* 0xAC-0xAF */
        0x87, 0xBC, 0x87, 0xBD, 0x87, 0xBE, 0x87, 0xBF, /* 0xB0-0xB3 */
        0x87, 0xC0, 0x87, 0xC1, 0x87, 0xC2, 0xC8, 0xC2, /* 0xB4-0xB7 */
        0x87, 0xC3, 0x87, 0xC4, 0x87, 0xC5, 0x87, 0xC6, /* 0xB8-0xBB */
        0xBD, 0xC0, 0x87, 0xC7, 0x87, 0xC8, 0x87, 0xC9, /* 0xBC-0xBF */
        0x87, 0xCA, 0x87, 0xCB, 0x87, 0xCC, 0x87, 0xCD, /* 0xC0-0xC3 */
        0x87, 0xCE, 0x87, 0xCF, 0x87, 0xD0, 0x87, 0xD1, /* 0xC4-0xC7 */
        0x87, 0xD2, 0x87, 0xD3, 0xC4, 0xD2, 0x87, 0xD4, /* 0xC8-0xCB */
        0x87, 0xD5, 0x87, 0xD6, 0x87, 0xD7, 0x87, 0xD8, /* 0xCC-0xCF */
        0x87, 0xD9, 0x87, 0xDA, 0x87, 0xDB, 0x87, 0xDC, /* 0xD0-0xD3 */
        0xE0, 0xEC, 0x87, 0xDD, 0x87, 0xDE, 0xE0, 0xED, /* 0xD4-0xD7 */
        0x87, 0xDF, 0x87, 0xE0, 0xC7, 0xF4, 0xCB, 0xC4, /* 0xD8-0xDB */
        0x87, 0xE1, 0xE0, 0xEE, 0xBB, 0xD8, 0xD8, 0xB6, /* 0xDC-0xDF */
        0xD2, 0xF2, 0xE0, 0xEF, 0xCD, 0xC5, 0x87, 0xE2, /* 0xE0-0xE3 */
        0xB6, 0xDA, 0x87, 0xE3, 0x87, 0xE4, 0x87, 0xE5, /* 0xE4-0xE7 */
        0x87, 0xE6, 0x87, 0xE7, 0x87, 0xE8, 0xE0, 0xF1, /* 0xE8-0xEB */
        0x87, 0xE9, 0xD4, 0xB0, 0x87, 0xEA, 0x87, 0xEB, /* 0xEC-0xEF */
        0xC0, 0xA7, 0xB4, 0xD1, 0x87, 0xEC, 0x87, 0xED, /* 0xF0-0xF3 */
        0xCE, 0xA7, 0xE0, 0xF0, 0x87, 0xEE, 0x87, 0xEF, /* 0xF4-0xF7 */
        0x87, 0xF0, 0xE0, 0xF2, 0xB9, 0xCC, 0x87, 0xF1, /* 0xF8-0xFB */
        0x87, 0xF2, 0xB9, 0xFA, 0xCD, 0xBC, 0xE0, 0xF3, /* 0xFC-0xFF */
};

static const unsigned char u2c_57[512] = {
        0x87, 0xF3, 0x87, 0xF4, 0x87, 0xF5, 0xC6, 0xD4, /* 0x00-0x03 */
        0xE0, 0xF4, 0x87, 0xF6, 0xD4, 0xB2, 0x87, 0xF7, /* 0x04-0x07 */
        0xC8, 0xA6, 0xE0, 0xF6, 0xE0, 0xF5, 0x87, 0xF8, /* 0x08-0x0B */
        0x87, 0xF9, 0x87, 0xFA, 0x87, 0xFB, 0x87, 0xFC, /* 0x0C-0x0F */
        0x87, 0xFD, 0x87, 0xFE, 0x88, 0x40, 0x88, 0x41, /* 0x10-0x13 */
        0x88, 0x42, 0x88, 0x43, 0x88, 0x44, 0x88, 0x45, /* 0x14-0x17 */
        0x88, 0x46, 0x88, 0x47, 0x88, 0x48, 0x88, 0x49, /* 0x18-0x1B */
        0xE0, 0xF7, 0x88, 0x4A, 0x88, 0x4B, 0xCD, 0xC1, /* 0x1C-0x1F */
        0x88, 0x4C, 0x88, 0x4D, 0x88, 0x4E, 0xCA, 0xA5, /* 0x20-0x23 */
        0x88, 0x4F, 0x88, 0x50, 0x88, 0x51, 0x88, 0x52, /* 0x24-0x27 */
        0xD4, 0xDA, 0xDB, 0xD7, 0xDB, 0xD9, 0x88, 0x53, /* 0x28-0x2B */
        0xDB, 0xD8, 0xB9, 0xE7, 0xDB, 0xDC, 0xDB, 0xDD, /* 0x2C-0x2F */
        0xB5, 0xD8, 0x88, 0x54, 0x88, 0x55, 0xDB, 0xDA, /* 0x30-0x33 */
        0x88, 0x56, 0x88, 0x57, 0x88, 0x58, 0x88, 0x59, /* 0x34-0x37 */
        0x88, 0x5A, 0xDB, 0xDB, 0xB3, 0xA1, 0xDB, 0xDF, /* 0x38-0x3B */
        0x88, 0x5B, 0x88, 0x5C, 0xBB, 0xF8, 0x88, 0x5D, /* 0x3C-0x3F */
        0xD6, 0xB7, 0x88, 0x5E, 0xDB, 0xE0, 0x88, 0x5F, /* 0x40-0x43 */
        0x88, 0x60, 0x88, 0x61, 0x88, 0x62, 0xBE, 0xF9, /* 0x44-0x47 */
        0x88, 0x63, 0x88, 0x64, 0xB7, 0xBB, 0x88, 0x65, /* 0x48-0x4B */
        0xDB, 0xD0, 0xCC, 0xAE, 0xBF, 0xB2, 0xBB, 0xB5, /* 0x4C-0x4F */
        0xD7, 0xF8, 0xBF, 0xD3, 0x88, 0x66, 0x88, 0x67, /* 0x50-0x53 */
        0x88, 0x68, 0x88, 0x69, 0x88, 0x6A, 0xBF, 0xE9, /* 0x54-0x57 */
        0x88, 0x6B, 0x88, 0x6C, 0xBC, 0xE1, 0xCC, 0xB3, /* 0x58-0x5B */
        0xDB, 0xDE, 0xB0, 0xD3, 0xCE, 0xEB, 0xB7, 0xD8, /* 0x5C-0x5F */
        0xD7, 0xB9, 0xC6, 0xC2, 0x88, 0x6D, 0x88, 0x6E, /* 0x60-0x63 */
        0xC0, 0xA4, 0x88, 0x6F, 0xCC, 0xB9, 0x88, 0x70, /* 0x64-0x67 */
        0xDB, 0xE7, 0xDB, 0xE1, 0xC6, 0xBA, 0xDB, 0xE3, /* 0x68-0x6B */
        0x88, 0x71, 0xDB, 0xE8, 0x88, 0x72, 0xC5, 0xF7, /* 0x6C-0x6F */
        0x88, 0x73, 0x88, 0x74, 0x88, 0x75, 0xDB, 0xEA, /* 0x70-0x73 */
        0x88, 0x76, 0x88, 0x77, 0xDB, 0xE9, 0xBF, 0xC0, /* 0x74-0x77 */
        0x88, 0x78, 0x88, 0x79, 0x88, 0x7A, 0xDB, 0xE6, /* 0x78-0x7B */
        0xDB, 0xE5, 0x88, 0x7B, 0x88, 0x7C, 0x88, 0x7D, /* 0x7C-0x7F */
        
        0x88, 0x7E, 0x88, 0x80, 0xB4, 0xB9, 0xC0, 0xAC, /* 0x80-0x83 */
        0xC2, 0xA2, 0xDB, 0xE2, 0xDB, 0xE4, 0x88, 0x81, /* 0x84-0x87 */
        0x88, 0x82, 0x88, 0x83, 0x88, 0x84, 0xD0, 0xCD, /* 0x88-0x8B */
        0xDB, 0xED, 0x88, 0x85, 0x88, 0x86, 0x88, 0x87, /* 0x8C-0x8F */
        0x88, 0x88, 0x88, 0x89, 0xC0, 0xDD, 0xDB, 0xF2, /* 0x90-0x93 */
        0x88, 0x8A, 0x88, 0x8B, 0x88, 0x8C, 0x88, 0x8D, /* 0x94-0x97 */
        0x88, 0x8E, 0x88, 0x8F, 0x88, 0x90, 0xB6, 0xE2, /* 0x98-0x9B */
        0x88, 0x91, 0x88, 0x92, 0x88, 0x93, 0x88, 0x94, /* 0x9C-0x9F */
        0xDB, 0xF3, 0xDB, 0xD2, 0xB9, 0xB8, 0xD4, 0xAB, /* 0xA0-0xA3 */
        0xDB, 0xEC, 0x88, 0x95, 0xBF, 0xD1, 0xDB, 0xF0, /* 0xA4-0xA7 */
        0x88, 0x96, 0xDB, 0xD1, 0x88, 0x97, 0xB5, 0xE6, /* 0xA8-0xAB */
        0x88, 0x98, 0xDB, 0xEB, 0xBF, 0xE5, 0x88, 0x99, /* 0xAC-0xAF */
        0x88, 0x9A, 0x88, 0x9B, 0xDB, 0xEE, 0x88, 0x9C, /* 0xB0-0xB3 */
        0xDB, 0xF1, 0x88, 0x9D, 0x88, 0x9E, 0x88, 0x9F, /* 0xB4-0xB7 */
        0xDB, 0xF9, 0x88, 0xA0, 0x88, 0xA1, 0x88, 0xA2, /* 0xB8-0xBB */
        0x88, 0xA3, 0x88, 0xA4, 0x88, 0xA5, 0x88, 0xA6, /* 0xBC-0xBF */
        0x88, 0xA7, 0x88, 0xA8, 0xB9, 0xA1, 0xB0, 0xA3, /* 0xC0-0xC3 */
        0x88, 0xA9, 0x88, 0xAA, 0x88, 0xAB, 0x88, 0xAC, /* 0xC4-0xC7 */
        0x88, 0xAD, 0x88, 0xAE, 0x88, 0xAF, 0xC2, 0xF1, /* 0xC8-0xCB */
        0x88, 0xB0, 0x88, 0xB1, 0xB3, 0xC7, 0xDB, 0xEF, /* 0xCC-0xCF */
        0x88, 0xB2, 0x88, 0xB3, 0xDB, 0xF8, 0x88, 0xB4, /* 0xD0-0xD3 */
        0xC6, 0xD2, 0xDB, 0xF4, 0x88, 0xB5, 0x88, 0xB6, /* 0xD4-0xD7 */
        0xDB, 0xF5, 0xDB, 0xF7, 0xDB, 0xF6, 0x88, 0xB7, /* 0xD8-0xDB */
        0x88, 0xB8, 0xDB, 0xFE, 0x88, 0xB9, 0xD3, 0xF2, /* 0xDC-0xDF */
        0xB2, 0xBA, 0x88, 0xBA, 0x88, 0xBB, 0x88, 0xBC, /* 0xE0-0xE3 */
        0xDB, 0xFD, 0x88, 0xBD, 0x88, 0xBE, 0x88, 0xBF, /* 0xE4-0xE7 */
        0x88, 0xC0, 0x88, 0xC1, 0x88, 0xC2, 0x88, 0xC3, /* 0xE8-0xEB */
        0x88, 0xC4, 0xDC, 0xA4, 0x88, 0xC5, 0xDB, 0xFB, /* 0xEC-0xEF */
        0x88, 0xC6, 0x88, 0xC7, 0x88, 0xC8, 0x88, 0xC9, /* 0xF0-0xF3 */
        0xDB, 0xFA, 0x88, 0xCA, 0x88, 0xCB, 0x88, 0xCC, /* 0xF4-0xF7 */
        0xDB, 0xFC, 0xC5, 0xE0, 0xBB, 0xF9, 0x88, 0xCD, /* 0xF8-0xFB */
        0x88, 0xCE, 0xDC, 0xA3, 0x88, 0xCF, 0x88, 0xD0, /* 0xFC-0xFF */
};

static const unsigned char u2c_58[512] = {
        0xDC, 0xA5, 0x88, 0xD1, 0xCC, 0xC3, 0x88, 0xD2, /* 0x00-0x03 */
        0x88, 0xD3, 0x88, 0xD4, 0xB6, 0xD1, 0xDD, 0xC0, /* 0x04-0x07 */
        0x88, 0xD5, 0x88, 0xD6, 0x88, 0xD7, 0xDC, 0xA1, /* 0x08-0x0B */
        0x88, 0xD8, 0xDC, 0xA2, 0x88, 0xD9, 0x88, 0xDA, /* 0x0C-0x0F */
        0x88, 0xDB, 0xC7, 0xB5, 0x88, 0xDC, 0x88, 0xDD, /* 0x10-0x13 */
        0x88, 0xDE, 0xB6, 0xE9, 0x88, 0xDF, 0x88, 0xE0, /* 0x14-0x17 */
        0x88, 0xE1, 0xDC, 0xA7, 0x88, 0xE2, 0x88, 0xE3, /* 0x18-0x1B */
        0x88, 0xE4, 0x88, 0xE5, 0xDC, 0xA6, 0x88, 0xE6, /* 0x1C-0x1F */
        0xDC, 0xA9, 0xB1, 0xA4, 0x88, 0xE7, 0x88, 0xE8, /* 0x20-0x23 */
        0xB5, 0xCC, 0x88, 0xE9, 0x88, 0xEA, 0x88, 0xEB, /* 0x24-0x27 */
        0x88, 0xEC, 0x88, 0xED, 0xBF, 0xB0, 0x88, 0xEE, /* 0x28-0x2B */
        0x88, 0xEF, 0x88, 0xF0, 0x88, 0xF1, 0x88, 0xF2, /* 0x2C-0x2F */
        0xD1, 0xDF, 0x88, 0xF3, 0x88, 0xF4, 0x88, 0xF5, /* 0x30-0x33 */
        0x88, 0xF6, 0xB6, 0xC2, 0x88, 0xF7, 0x88, 0xF8, /* 0x34-0x37 */
        0x88, 0xF9, 0x88, 0xFA, 0x88, 0xFB, 0x88, 0xFC, /* 0x38-0x3B */
        0x88, 0xFD, 0x88, 0xFE, 0x89, 0x40, 0x89, 0x41, /* 0x3C-0x3F */
        0x89, 0x42, 0x89, 0x43, 0x89, 0x44, 0x89, 0x45, /* 0x40-0x43 */
        0xDC, 0xA8, 0x89, 0x46, 0x89, 0x47, 0x89, 0x48, /* 0x44-0x47 */
        0x89, 0x49, 0x89, 0x4A, 0x89, 0x4B, 0x89, 0x4C, /* 0x48-0x4B */
        0xCB, 0xFA, 0xEB, 0xF3, 0x89, 0x4D, 0x89, 0x4E, /* 0x4C-0x4F */
        0x89, 0x4F, 0xCB, 0xDC, 0x89, 0x50, 0x89, 0x51, /* 0x50-0x53 */
        0xCB, 0xFE, 0x89, 0x52, 0x89, 0x53, 0x89, 0x54, /* 0x54-0x57 */
        0xCC, 0xC1, 0x89, 0x55, 0x89, 0x56, 0x89, 0x57, /* 0x58-0x5B */
        0x89, 0x58, 0x89, 0x59, 0xC8, 0xFB, 0x89, 0x5A, /* 0x5C-0x5F */
        0x89, 0x5B, 0x89, 0x5C, 0x89, 0x5D, 0x89, 0x5E, /* 0x60-0x63 */
        0x89, 0x5F, 0xDC, 0xAA, 0x89, 0x60, 0x89, 0x61, /* 0x64-0x67 */
        0x89, 0x62, 0x89, 0x63, 0x89, 0x64, 0xCC, 0xEE, /* 0x68-0x6B */
        0xDC, 0xAB, 0x89, 0x65, 0x89, 0x66, 0x89, 0x67, /* 0x6C-0x6F */
        0x89, 0x68, 0x89, 0x69, 0x89, 0x6A, 0x89, 0x6B, /* 0x70-0x73 */
        0x89, 0x6C, 0x89, 0x6D, 0x89, 0x6E, 0x89, 0x6F, /* 0x74-0x77 */
        0x89, 0x70, 0x89, 0x71, 0x89, 0x72, 0x89, 0x73, /* 0x78-0x7B */
        0x89, 0x74, 0x89, 0x75, 0xDB, 0xD3, 0x89, 0x76, /* 0x7C-0x7F */
        
        0xDC, 0xAF, 0xDC, 0xAC, 0x89, 0x77, 0xBE, 0xB3, /* 0x80-0x83 */
        0x89, 0x78, 0xCA, 0xFB, 0x89, 0x79, 0x89, 0x7A, /* 0x84-0x87 */
        0x89, 0x7B, 0xDC, 0xAD, 0x89, 0x7C, 0x89, 0x7D, /* 0x88-0x8B */
        0x89, 0x7E, 0x89, 0x80, 0x89, 0x81, 0x89, 0x82, /* 0x8C-0x8F */
        0x89, 0x83, 0x89, 0x84, 0xC9, 0xCA, 0xC4, 0xB9, /* 0x90-0x93 */
        0x89, 0x85, 0x89, 0x86, 0x89, 0x87, 0x89, 0x88, /* 0x94-0x97 */
        0x89, 0x89, 0xC7, 0xBD, 0xDC, 0xAE, 0x89, 0x8A, /* 0x98-0x9B */
        0x89, 0x8B, 0x89, 0x8C, 0xD4, 0xF6, 0xD0, 0xE6, /* 0x9C-0x9F */
        0x89, 0x8D, 0x89, 0x8E, 0x89, 0x8F, 0x89, 0x90, /* 0xA0-0xA3 */
        0x89, 0x91, 0x89, 0x92, 0x89, 0x93, 0x89, 0x94, /* 0xA4-0xA7 */
        0xC4, 0xAB, 0xB6, 0xD5, 0x89, 0x95, 0x89, 0x96, /* 0xA8-0xAB */
        0x89, 0x97, 0x89, 0x98, 0x89, 0x99, 0x89, 0x9A, /* 0xAC-0xAF */
        0x89, 0x9B, 0x89, 0x9C, 0x89, 0x9D, 0x89, 0x9E, /* 0xB0-0xB3 */
        0x89, 0x9F, 0x89, 0xA0, 0x89, 0xA1, 0x89, 0xA2, /* 0xB4-0xB7 */
        0x89, 0xA3, 0x89, 0xA4, 0x89, 0xA5, 0x89, 0xA6, /* 0xB8-0xBB */
        0xDB, 0xD4, 0x89, 0xA7, 0x89, 0xA8, 0x89, 0xA9, /* 0xBC-0xBF */
        0x89, 0xAA, 0xB1, 0xDA, 0x89, 0xAB, 0x89, 0xAC, /* 0xC0-0xC3 */
        0x89, 0xAD, 0xDB, 0xD5, 0x89, 0xAE, 0x89, 0xAF, /* 0xC4-0xC7 */
        0x89, 0xB0, 0x89, 0xB1, 0x89, 0xB2, 0x89, 0xB3, /* 0xC8-0xCB */
        0x89, 0xB4, 0x89, 0xB5, 0x89, 0xB6, 0x89, 0xB7, /* 0xCC-0xCF */
        0x89, 0xB8, 0xDB, 0xD6, 0x89, 0xB9, 0x89, 0xBA, /* 0xD0-0xD3 */
        0x89, 0xBB, 0xBA, 0xBE, 0x89, 0xBC, 0x89, 0xBD, /* 0xD4-0xD7 */
        0x89, 0xBE, 0x89, 0xBF, 0x89, 0xC0, 0x89, 0xC1, /* 0xD8-0xDB */
        0x89, 0xC2, 0x89, 0xC3, 0x89, 0xC4, 0x89, 0xC5, /* 0xDC-0xDF */
        0x89, 0xC6, 0x89, 0xC7, 0x89, 0xC8, 0x89, 0xC9, /* 0xE0-0xE3 */
        0xC8, 0xC0, 0x89, 0xCA, 0x89, 0xCB, 0x89, 0xCC, /* 0xE4-0xE7 */
        0x89, 0xCD, 0x89, 0xCE, 0x89, 0xCF, 0xCA, 0xBF, /* 0xE8-0xEB */
        0xC8, 0xC9, 0x89, 0xD0, 0xD7, 0xB3, 0x89, 0xD1, /* 0xEC-0xEF */
        0xC9, 0xF9, 0x89, 0xD2, 0x89, 0xD3, 0xBF, 0xC7, /* 0xF0-0xF3 */
        0x89, 0xD4, 0x89, 0xD5, 0xBA, 0xF8, 0x89, 0xD6, /* 0xF4-0xF7 */
        0x89, 0xD7, 0xD2, 0xBC, 0x89, 0xD8, 0x89, 0xD9, /* 0xF8-0xFB */
        0x89, 0xDA, 0x89, 0xDB, 0x89, 0xDC, 0x89, 0xDD, /* 0xFC-0xFF */
};

static const unsigned char u2c_59[512] = {
        0x89, 0xDE, 0x89, 0xDF, 0xE2, 0xBA, 0x89, 0xE0, /* 0x00-0x03 */
        0xB4, 0xA6, 0x89, 0xE1, 0x89, 0xE2, 0xB1, 0xB8, /* 0x04-0x07 */
        0x89, 0xE3, 0x89, 0xE4, 0x89, 0xE5, 0x89, 0xE6, /* 0x08-0x0B */
        0x89, 0xE7, 0xB8, 0xB4, 0x89, 0xE8, 0xCF, 0xC4, /* 0x0C-0x0F */
        0x89, 0xE9, 0x89, 0xEA, 0x89, 0xEB, 0x89, 0xEC, /* 0x10-0x13 */
        0xD9, 0xE7, 0xCF, 0xA6, 0xCD, 0xE2, 0x89, 0xED, /* 0x14-0x17 */
        0x89, 0xEE, 0xD9, 0xED, 0xB6, 0xE0, 0x89, 0xEF, /* 0x18-0x1B */
        0xD2, 0xB9, 0x89, 0xF0, 0x89, 0xF1, 0xB9, 0xBB, /* 0x1C-0x1F */
        0x89, 0xF2, 0x89, 0xF3, 0x89, 0xF4, 0x89, 0xF5, /* 0x20-0x23 */
        0xE2, 0xB9, 0xE2, 0xB7, 0x89, 0xF6, 0xB4, 0xF3, /* 0x24-0x27 */
        0x89, 0xF7, 0xCC, 0xEC, 0xCC, 0xAB, 0xB7, 0xF2, /* 0x28-0x2B */
        0x89, 0xF8, 0xD8, 0xB2, 0xD1, 0xEB, 0xBA, 0xBB, /* 0x2C-0x2F */
        0x89, 0xF9, 0xCA, 0xA7, 0x89, 0xFA, 0x89, 0xFB, /* 0x30-0x33 */
        0xCD, 0xB7, 0x89, 0xFC, 0x89, 0xFD, 0xD2, 0xC4, /* 0x34-0x37 */
        0xBF, 0xE4, 0xBC, 0xD0, 0xB6, 0xE1, 0x89, 0xFE, /* 0x38-0x3B */
        0xDE, 0xC5, 0x8A, 0x40, 0x8A, 0x41, 0x8A, 0x42, /* 0x3C-0x3F */
        0x8A, 0x43, 0xDE, 0xC6, 0xDB, 0xBC, 0x8A, 0x44, /* 0x40-0x43 */
        0xD1, 0xD9, 0x8A, 0x45, 0x8A, 0x46, 0xC6, 0xE6, /* 0x44-0x47 */
        0xC4, 0xCE, 0xB7, 0xEE, 0x8A, 0x47, 0xB7, 0xDC, /* 0x48-0x4B */
        0x8A, 0x48, 0x8A, 0x49, 0xBF, 0xFC, 0xD7, 0xE0, /* 0x4C-0x4F */
        0x8A, 0x4A, 0xC6, 0xF5, 0x8A, 0x4B, 0x8A, 0x4C, /* 0x50-0x53 */
        0xB1, 0xBC, 0xDE, 0xC8, 0xBD, 0xB1, 0xCC, 0xD7, /* 0x54-0x57 */
        0xDE, 0xCA, 0x8A, 0x4D, 0xDE, 0xC9, 0x8A, 0x4E, /* 0x58-0x5B */
        0x8A, 0x4F, 0x8A, 0x50, 0x8A, 0x51, 0x8A, 0x52, /* 0x5C-0x5F */
        0xB5, 0xEC, 0x8A, 0x53, 0xC9, 0xDD, 0x8A, 0x54, /* 0x60-0x63 */
        0x8A, 0x55, 0xB0, 0xC2, 0x8A, 0x56, 0x8A, 0x57, /* 0x64-0x67 */
        0x8A, 0x58, 0x8A, 0x59, 0x8A, 0x5A, 0x8A, 0x5B, /* 0x68-0x6B */
        0x8A, 0x5C, 0x8A, 0x5D, 0x8A, 0x5E, 0x8A, 0x5F, /* 0x6C-0x6F */
        0x8A, 0x60, 0x8A, 0x61, 0x8A, 0x62, 0xC5, 0xAE, /* 0x70-0x73 */
        0xC5, 0xAB, 0x8A, 0x63, 0xC4, 0xCC, 0x8A, 0x64, /* 0x74-0x77 */
        0xBC, 0xE9, 0xCB, 0xFD, 0x8A, 0x65, 0x8A, 0x66, /* 0x78-0x7B */
        0x8A, 0x67, 0xBA, 0xC3, 0x8A, 0x68, 0x8A, 0x69, /* 0x7C-0x7F */
        
        0x8A, 0x6A, 0xE5, 0xF9, 0xC8, 0xE7, 0xE5, 0xFA, /* 0x80-0x83 */
        0xCD, 0xFD, 0x8A, 0x6B, 0xD7, 0xB1, 0xB8, 0xBE, /* 0x84-0x87 */
        0xC2, 0xE8, 0x8A, 0x6C, 0xC8, 0xD1, 0x8A, 0x6D, /* 0x88-0x8B */
        0x8A, 0x6E, 0xE5, 0xFB, 0x8A, 0x6F, 0x8A, 0x70, /* 0x8C-0x8F */
        0x8A, 0x71, 0x8A, 0x72, 0xB6, 0xCA, 0xBC, 0xCB, /* 0x90-0x93 */
        0x8A, 0x73, 0x8A, 0x74, 0xD1, 0xFD, 0xE6, 0xA1, /* 0x94-0x97 */
        0x8A, 0x75, 0xC3, 0xEE, 0x8A, 0x76, 0x8A, 0x77, /* 0x98-0x9B */
        0x8A, 0x78, 0x8A, 0x79, 0xE6, 0xA4, 0x8A, 0x7A, /* 0x9C-0x9F */
        0x8A, 0x7B, 0x8A, 0x7C, 0x8A, 0x7D, 0xE5, 0xFE, /* 0xA0-0xA3 */
        0xE6, 0xA5, 0xCD, 0xD7, 0x8A, 0x7E, 0x8A, 0x80, /* 0xA4-0xA7 */
        0xB7, 0xC1, 0xE5, 0xFC, 0xE5, 0xFD, 0xE6, 0xA3, /* 0xA8-0xAB */
        0x8A, 0x81, 0x8A, 0x82, 0xC4, 0xDD, 0xE6, 0xA8, /* 0xAC-0xAF */
        0x8A, 0x83, 0x8A, 0x84, 0xE6, 0xA7, 0x8A, 0x85, /* 0xB0-0xB3 */
        0x8A, 0x86, 0x8A, 0x87, 0x8A, 0x88, 0x8A, 0x89, /* 0xB4-0xB7 */
        0x8A, 0x8A, 0xC3, 0xC3, 0x8A, 0x8B, 0xC6, 0xDE, /* 0xB8-0xBB */
        0x8A, 0x8C, 0x8A, 0x8D, 0xE6, 0xAA, 0x8A, 0x8E, /* 0xBC-0xBF */
        0x8A, 0x8F, 0x8A, 0x90, 0x8A, 0x91, 0x8A, 0x92, /* 0xC0-0xC3 */
        0x8A, 0x93, 0x8A, 0x94, 0xC4, 0xB7, 0x8A, 0x95, /* 0xC4-0xC7 */
        0x8A, 0x96, 0x8A, 0x97, 0xE6, 0xA2, 0xCA, 0xBC, /* 0xC8-0xCB */
        0x8A, 0x98, 0x8A, 0x99, 0x8A, 0x9A, 0x8A, 0x9B, /* 0xCC-0xCF */
        0xBD, 0xE3, 0xB9, 0xC3, 0xE6, 0xA6, 0xD0, 0xD5, /* 0xD0-0xD3 */
        0xCE, 0xAF, 0x8A, 0x9C, 0x8A, 0x9D, 0xE6, 0xA9, /* 0xD4-0xD7 */
        0xE6, 0xB0, 0x8A, 0x9E, 0xD2, 0xA6, 0x8A, 0x9F, /* 0xD8-0xDB */
        0xBD, 0xAA, 0xE6, 0xAD, 0x8A, 0xA0, 0x8A, 0xA1, /* 0xDC-0xDF */
        0x8A, 0xA2, 0x8A, 0xA3, 0x8A, 0xA4, 0xE6, 0xAF, /* 0xE0-0xE3 */
        0x8A, 0xA5, 0xC0, 0xD1, 0x8A, 0xA6, 0x8A, 0xA7, /* 0xE4-0xE7 */
        0xD2, 0xCC, 0x8A, 0xA8, 0x8A, 0xA9, 0x8A, 0xAA, /* 0xE8-0xEB */
        0xBC, 0xA7, 0x8A, 0xAB, 0x8A, 0xAC, 0x8A, 0xAD, /* 0xEC-0xEF */
        0x8A, 0xAE, 0x8A, 0xAF, 0x8A, 0xB0, 0x8A, 0xB1, /* 0xF0-0xF3 */
        0x8A, 0xB2, 0x8A, 0xB3, 0x8A, 0xB4, 0x8A, 0xB5, /* 0xF4-0xF7 */
        0x8A, 0xB6, 0xE6, 0xB1, 0x8A, 0xB7, 0xD2, 0xF6, /* 0xF8-0xFB */
        0x8A, 0xB8, 0x8A, 0xB9, 0x8A, 0xBA, 0xD7, 0xCB, /* 0xFC-0xFF */
};

static const unsigned char u2c_5A[512] = {
        0x8A, 0xBB, 0xCD, 0xFE, 0x8A, 0xBC, 0xCD, 0xDE, /* 0x00-0x03 */
        0xC2, 0xA6, 0xE6, 0xAB, 0xE6, 0xAC, 0xBD, 0xBF, /* 0x04-0x07 */
        0xE6, 0xAE, 0xE6, 0xB3, 0x8A, 0xBD, 0x8A, 0xBE, /* 0x08-0x0B */
        0xE6, 0xB2, 0x8A, 0xBF, 0x8A, 0xC0, 0x8A, 0xC1, /* 0x0C-0x0F */
        0x8A, 0xC2, 0xE6, 0xB6, 0x8A, 0xC3, 0xE6, 0xB8, /* 0x10-0x13 */
        0x8A, 0xC4, 0x8A, 0xC5, 0x8A, 0xC6, 0x8A, 0xC7, /* 0x14-0x17 */
        0xC4, 0xEF, 0x8A, 0xC8, 0x8A, 0xC9, 0x8A, 0xCA, /* 0x18-0x1B */
        0xC4, 0xC8, 0x8A, 0xCB, 0x8A, 0xCC, 0xBE, 0xEA, /* 0x1C-0x1F */
        0xC9, 0xEF, 0x8A, 0xCD, 0x8A, 0xCE, 0xE6, 0xB7, /* 0x20-0x23 */
        0x8A, 0xCF, 0xB6, 0xF0, 0x8A, 0xD0, 0x8A, 0xD1, /* 0x24-0x27 */
        0x8A, 0xD2, 0xC3, 0xE4, 0x8A, 0xD3, 0x8A, 0xD4, /* 0x28-0x2B */
        0x8A, 0xD5, 0x8A, 0xD6, 0x8A, 0xD7, 0x8A, 0xD8, /* 0x2C-0x2F */
        0x8A, 0xD9, 0xD3, 0xE9, 0xE6, 0xB4, 0x8A, 0xDA, /* 0x30-0x33 */
        0xE6, 0xB5, 0x8A, 0xDB, 0xC8, 0xA2, 0x8A, 0xDC, /* 0x34-0x37 */
        0x8A, 0xDD, 0x8A, 0xDE, 0x8A, 0xDF, 0x8A, 0xE0, /* 0x38-0x3B */
        0xE6, 0xBD, 0x8A, 0xE1, 0x8A, 0xE2, 0x8A, 0xE3, /* 0x3C-0x3F */
        0xE6, 0xB9, 0x8A, 0xE4, 0x8A, 0xE5, 0x8A, 0xE6, /* 0x40-0x43 */
        0x8A, 0xE7, 0x8A, 0xE8, 0xC6, 0xC5, 0x8A, 0xE9, /* 0x44-0x47 */
        0x8A, 0xEA, 0xCD, 0xF1, 0xE6, 0xBB, 0x8A, 0xEB, /* 0x48-0x4B */
        0x8A, 0xEC, 0x8A, 0xED, 0x8A, 0xEE, 0x8A, 0xEF, /* 0x4C-0x4F */
        0x8A, 0xF0, 0x8A, 0xF1, 0x8A, 0xF2, 0x8A, 0xF3, /* 0x50-0x53 */
        0x8A, 0xF4, 0xE6, 0xBC, 0x8A, 0xF5, 0x8A, 0xF6, /* 0x54-0x57 */
        0x8A, 0xF7, 0x8A, 0xF8, 0xBB, 0xE9, 0x8A, 0xF9, /* 0x58-0x5B */
        0x8A, 0xFA, 0x8A, 0xFB, 0x8A, 0xFC, 0x8A, 0xFD, /* 0x5C-0x5F */
        0x8A, 0xFE, 0x8B, 0x40, 0xE6, 0xBE, 0x8B, 0x41, /* 0x60-0x63 */
        0x8B, 0x42, 0x8B, 0x43, 0x8B, 0x44, 0xE6, 0xBA, /* 0x64-0x67 */
        0x8B, 0x45, 0x8B, 0x46, 0xC0, 0xB7, 0x8B, 0x47, /* 0x68-0x6B */
        0x8B, 0x48, 0x8B, 0x49, 0x8B, 0x4A, 0x8B, 0x4B, /* 0x6C-0x6F */
        0x8B, 0x4C, 0x8B, 0x4D, 0x8B, 0x4E, 0x8B, 0x4F, /* 0x70-0x73 */
        0xD3, 0xA4, 0xE6, 0xBF, 0xC9, 0xF4, 0xE6, 0xC3, /* 0x74-0x77 */
        0x8B, 0x50, 0x8B, 0x51, 0xE6, 0xC4, 0x8B, 0x52, /* 0x78-0x7B */
        0x8B, 0x53, 0x8B, 0x54, 0x8B, 0x55, 0xD0, 0xF6, /* 0x7C-0x7F */
        
        0x8B, 0x56, 0x8B, 0x57, 0x8B, 0x58, 0x8B, 0x59, /* 0x80-0x83 */
        0x8B, 0x5A, 0x8B, 0x5B, 0x8B, 0x5C, 0x8B, 0x5D, /* 0x84-0x87 */
        0x8B, 0x5E, 0x8B, 0x5F, 0x8B, 0x60, 0x8B, 0x61, /* 0x88-0x8B */
        0x8B, 0x62, 0x8B, 0x63, 0x8B, 0x64, 0x8B, 0x65, /* 0x8C-0x8F */
        0x8B, 0x66, 0x8B, 0x67, 0xC3, 0xBD, 0x8B, 0x68, /* 0x90-0x93 */
        0x8B, 0x69, 0x8B, 0x6A, 0x8B, 0x6B, 0x8B, 0x6C, /* 0x94-0x97 */
        0x8B, 0x6D, 0x8B, 0x6E, 0xC3, 0xC4, 0xE6, 0xC2, /* 0x98-0x9B */
        0x8B, 0x6F, 0x8B, 0x70, 0x8B, 0x71, 0x8B, 0x72, /* 0x9C-0x9F */
        0x8B, 0x73, 0x8B, 0x74, 0x8B, 0x75, 0x8B, 0x76, /* 0xA0-0xA3 */
        0x8B, 0x77, 0x8B, 0x78, 0x8B, 0x79, 0x8B, 0x7A, /* 0xA4-0xA7 */
        0x8B, 0x7B, 0x8B, 0x7C, 0xE6, 0xC1, 0x8B, 0x7D, /* 0xA8-0xAB */
        0x8B, 0x7E, 0x8B, 0x80, 0x8B, 0x81, 0x8B, 0x82, /* 0xAC-0xAF */
        0x8B, 0x83, 0x8B, 0x84, 0xE6, 0xC7, 0xCF, 0xB1, /* 0xB0-0xB3 */
        0x8B, 0x85, 0xEB, 0xF4, 0x8B, 0x86, 0x8B, 0x87, /* 0xB4-0xB7 */
        0xE6, 0xCA, 0x8B, 0x88, 0x8B, 0x89, 0x8B, 0x8A, /* 0xB8-0xBB */
        0x8B, 0x8B, 0x8B, 0x8C, 0xE6, 0xC5, 0x8B, 0x8D, /* 0xBC-0xBF */
        0x8B, 0x8E, 0xBC, 0xDE, 0xC9, 0xA9, 0x8B, 0x8F, /* 0xC0-0xC3 */
        0x8B, 0x90, 0x8B, 0x91, 0x8B, 0x92, 0x8B, 0x93, /* 0xC4-0xC7 */
        0x8B, 0x94, 0xBC, 0xB5, 0x8B, 0x95, 0x8B, 0x96, /* 0xC8-0xCB */
        0xCF, 0xD3, 0x8B, 0x97, 0x8B, 0x98, 0x8B, 0x99, /* 0xCC-0xCF */
        0x8B, 0x9A, 0x8B, 0x9B, 0xE6, 0xC8, 0x8B, 0x9C, /* 0xD0-0xD3 */
        0xE6, 0xC9, 0x8B, 0x9D, 0xE6, 0xCE, 0x8B, 0x9E, /* 0xD4-0xD7 */
        0xE6, 0xD0, 0x8B, 0x9F, 0x8B, 0xA0, 0x8B, 0xA1, /* 0xD8-0xDB */
        0xE6, 0xD1, 0x8B, 0xA2, 0x8B, 0xA3, 0x8B, 0xA4, /* 0xDC-0xDF */
        0xE6, 0xCB, 0xB5, 0xD5, 0x8B, 0xA5, 0xE6, 0xCC, /* 0xE0-0xE3 */
        0x8B, 0xA6, 0x8B, 0xA7, 0xE6, 0xCF, 0x8B, 0xA8, /* 0xE4-0xE7 */
        0x8B, 0xA9, 0xC4, 0xDB, 0x8B, 0xAA, 0xE6, 0xC6, /* 0xE8-0xEB */
        0x8B, 0xAB, 0x8B, 0xAC, 0x8B, 0xAD, 0x8B, 0xAE, /* 0xEC-0xEF */
        0x8B, 0xAF, 0xE6, 0xCD, 0x8B, 0xB0, 0x8B, 0xB1, /* 0xF0-0xF3 */
        0x8B, 0xB2, 0x8B, 0xB3, 0x8B, 0xB4, 0x8B, 0xB5, /* 0xF4-0xF7 */
        0x8B, 0xB6, 0x8B, 0xB7, 0x8B, 0xB8, 0x8B, 0xB9, /* 0xF8-0xFB */
        0x8B, 0xBA, 0x8B, 0xBB, 0x8B, 0xBC, 0x8B, 0xBD, /* 0xFC-0xFF */
};

static const unsigned char u2c_5B[512] = {
        0x8B, 0xBE, 0x8B, 0xBF, 0x8B, 0xC0, 0x8B, 0xC1, /* 0x00-0x03 */
        0x8B, 0xC2, 0x8B, 0xC3, 0x8B, 0xC4, 0x8B, 0xC5, /* 0x04-0x07 */
        0x8B, 0xC6, 0xE6, 0xD2, 0x8B, 0xC7, 0x8B, 0xC8, /* 0x08-0x0B */
        0x8B, 0xC9, 0x8B, 0xCA, 0x8B, 0xCB, 0x8B, 0xCC, /* 0x0C-0x0F */
        0x8B, 0xCD, 0x8B, 0xCE, 0x8B, 0xCF, 0x8B, 0xD0, /* 0x10-0x13 */
        0x8B, 0xD1, 0x8B, 0xD2, 0xE6, 0xD4, 0xE6, 0xD3, /* 0x14-0x17 */
        0x8B, 0xD3, 0x8B, 0xD4, 0x8B, 0xD5, 0x8B, 0xD6, /* 0x18-0x1B */
        0x8B, 0xD7, 0x8B, 0xD8, 0x8B, 0xD9, 0x8B, 0xDA, /* 0x1C-0x1F */
        0x8B, 0xDB, 0x8B, 0xDC, 0x8B, 0xDD, 0x8B, 0xDE, /* 0x20-0x23 */
        0x8B, 0xDF, 0x8B, 0xE0, 0x8B, 0xE1, 0x8B, 0xE2, /* 0x24-0x27 */
        0x8B, 0xE3, 0x8B, 0xE4, 0x8B, 0xE5, 0x8B, 0xE6, /* 0x28-0x2B */
        0x8B, 0xE7, 0x8B, 0xE8, 0x8B, 0xE9, 0x8B, 0xEA, /* 0x2C-0x2F */
        0x8B, 0xEB, 0x8B, 0xEC, 0xE6, 0xD5, 0x8B, 0xED, /* 0x30-0x33 */
        0xD9, 0xF8, 0x8B, 0xEE, 0x8B, 0xEF, 0xE6, 0xD6, /* 0x34-0x37 */
        0x8B, 0xF0, 0x8B, 0xF1, 0x8B, 0xF2, 0x8B, 0xF3, /* 0x38-0x3B */
        0x8B, 0xF4, 0x8B, 0xF5, 0x8B, 0xF6, 0x8B, 0xF7, /* 0x3C-0x3F */
        0xE6, 0xD7, 0x8B, 0xF8, 0x8B, 0xF9, 0x8B, 0xFA, /* 0x40-0x43 */
        0x8B, 0xFB, 0x8B, 0xFC, 0x8B, 0xFD, 0x8B, 0xFE, /* 0x44-0x47 */
        0x8C, 0x40, 0x8C, 0x41, 0x8C, 0x42, 0x8C, 0x43, /* 0x48-0x4B */
        0x8C, 0x44, 0x8C, 0x45, 0x8C, 0x46, 0x8C, 0x47, /* 0x4C-0x4F */
        0xD7, 0xD3, 0xE6, 0xDD, 0x8C, 0x48, 0xE6, 0xDE, /* 0x50-0x53 */
        0xBF, 0xD7, 0xD4, 0xD0, 0x8C, 0x49, 0xD7, 0xD6, /* 0x54-0x57 */
        0xB4, 0xE6, 0xCB, 0xEF, 0xE6, 0xDA, 0xD8, 0xC3, /* 0x58-0x5B */
        0xD7, 0xCE, 0xD0, 0xA2, 0x8C, 0x4A, 0xC3, 0xCF, /* 0x5C-0x5F */
        0x8C, 0x4B, 0x8C, 0x4C, 0xE6, 0xDF, 0xBC, 0xBE, /* 0x60-0x63 */
        0xB9, 0xC2, 0xE6, 0xDB, 0xD1, 0xA7, 0x8C, 0x4D, /* 0x64-0x67 */
        0x8C, 0x4E, 0xBA, 0xA2, 0xC2, 0xCF, 0x8C, 0x4F, /* 0x68-0x6B */
        0xD8, 0xAB, 0x8C, 0x50, 0x8C, 0x51, 0x8C, 0x52, /* 0x6C-0x6F */
        0xCA, 0xEB, 0xE5, 0xEE, 0x8C, 0x53, 0xE6, 0xDC, /* 0x70-0x73 */
        0x8C, 0x54, 0xB7, 0xF5, 0x8C, 0x55, 0x8C, 0x56, /* 0x74-0x77 */
        0x8C, 0x57, 0x8C, 0x58, 0xC8, 0xE6, 0x8C, 0x59, /* 0x78-0x7B */
        0x8C, 0x5A, 0xC4, 0xF5, 0x8C, 0x5B, 0x8C, 0x5C, /* 0x7C-0x7F */
        
        0xE5, 0xB2, 0xC4, 0xFE, 0x8C, 0x5D, 0xCB, 0xFC, /* 0x80-0x83 */
        0xE5, 0xB3, 0xD5, 0xAC, 0x8C, 0x5E, 0xD3, 0xEE, /* 0x84-0x87 */
        0xCA, 0xD8, 0xB0, 0xB2, 0x8C, 0x5F, 0xCB, 0xCE, /* 0x88-0x8B */
        0xCD, 0xEA, 0x8C, 0x60, 0x8C, 0x61, 0xBA, 0xEA, /* 0x8C-0x8F */
        0x8C, 0x62, 0x8C, 0x63, 0x8C, 0x64, 0xE5, 0xB5, /* 0x90-0x93 */
        0x8C, 0x65, 0xE5, 0xB4, 0x8C, 0x66, 0xD7, 0xDA, /* 0x94-0x97 */
        0xB9, 0xD9, 0xD6, 0xE6, 0xB6, 0xA8, 0xCD, 0xF0, /* 0x98-0x9B */
        0xD2, 0xCB, 0xB1, 0xA6, 0xCA, 0xB5, 0x8C, 0x67, /* 0x9C-0x9F */
        0xB3, 0xE8, 0xC9, 0xF3, 0xBF, 0xCD, 0xD0, 0xFB, /* 0xA0-0xA3 */
        0xCA, 0xD2, 0xE5, 0xB6, 0xBB, 0xC2, 0x8C, 0x68, /* 0xA4-0xA7 */
        0x8C, 0x69, 0x8C, 0x6A, 0xCF, 0xDC, 0xB9, 0xAC, /* 0xA8-0xAB */
        0x8C, 0x6B, 0x8C, 0x6C, 0x8C, 0x6D, 0x8C, 0x6E, /* 0xAC-0xAF */
        0xD4, 0xD7, 0x8C, 0x6F, 0x8C, 0x70, 0xBA, 0xA6, /* 0xB0-0xB3 */
        0xD1, 0xE7, 0xCF, 0xFC, 0xBC, 0xD2, 0x8C, 0x71, /* 0xB4-0xB7 */
        0xE5, 0xB7, 0xC8, 0xDD, 0x8C, 0x72, 0x8C, 0x73, /* 0xB8-0xBB */
        0x8C, 0x74, 0xBF, 0xED, 0xB1, 0xF6, 0xCB, 0xDE, /* 0xBC-0xBF */
        0x8C, 0x75, 0x8C, 0x76, 0xBC, 0xC5, 0x8C, 0x77, /* 0xC0-0xC3 */
        0xBC, 0xC4, 0xD2, 0xFA, 0xC3, 0xDC, 0xBF, 0xDC, /* 0xC4-0xC7 */
        0x8C, 0x78, 0x8C, 0x79, 0x8C, 0x7A, 0x8C, 0x7B, /* 0xC8-0xCB */
        0xB8, 0xBB, 0x8C, 0x7C, 0x8C, 0x7D, 0x8C, 0x7E, /* 0xCC-0xCF */
        0xC3, 0xC2, 0x8C, 0x80, 0xBA, 0xAE, 0xD4, 0xA2, /* 0xD0-0xD3 */
        0x8C, 0x81, 0x8C, 0x82, 0x8C, 0x83, 0x8C, 0x84, /* 0xD4-0xD7 */
        0x8C, 0x85, 0x8C, 0x86, 0x8C, 0x87, 0x8C, 0x88, /* 0xD8-0xDB */
        0x8C, 0x89, 0xC7, 0xDE, 0xC4, 0xAF, 0xB2, 0xEC, /* 0xDC-0xDF */
        0x8C, 0x8A, 0xB9, 0xD1, 0x8C, 0x8B, 0x8C, 0x8C, /* 0xE0-0xE3 */
        0xE5, 0xBB, 0xC1, 0xC8, 0x8C, 0x8D, 0x8C, 0x8E, /* 0xE4-0xE7 */
        0xD5, 0xAF, 0x8C, 0x8F, 0x8C, 0x90, 0x8C, 0x91, /* 0xE8-0xEB */
        0x8C, 0x92, 0x8C, 0x93, 0xE5, 0xBC, 0x8C, 0x94, /* 0xEC-0xEF */
        0xE5, 0xBE, 0x8C, 0x95, 0x8C, 0x96, 0x8C, 0x97, /* 0xF0-0xF3 */
        0x8C, 0x98, 0x8C, 0x99, 0x8C, 0x9A, 0x8C, 0x9B, /* 0xF4-0xF7 */
        0xB4, 0xE7, 0xB6, 0xD4, 0xCB, 0xC2, 0xD1, 0xB0, /* 0xF8-0xFB */
        0xB5, 0xBC, 0x8C, 0x9C, 0x8C, 0x9D, 0xCA, 0xD9, /* 0xFC-0xFF */
};

static const unsigned char u2c_5C[512] = {
        0x8C, 0x9E, 0xB7, 0xE2, 0x8C, 0x9F, 0x8C, 0xA0, /* 0x00-0x03 */
        0xC9, 0xE4, 0x8C, 0xA1, 0xBD, 0xAB, 0x8C, 0xA2, /* 0x04-0x07 */
        0x8C, 0xA3, 0xCE, 0xBE, 0xD7, 0xF0, 0x8C, 0xA4, /* 0x08-0x0B */
        0x8C, 0xA5, 0x8C, 0xA6, 0x8C, 0xA7, 0xD0, 0xA1, /* 0x0C-0x0F */
        0x8C, 0xA8, 0xC9, 0xD9, 0x8C, 0xA9, 0x8C, 0xAA, /* 0x10-0x13 */
        0xB6, 0xFB, 0xE6, 0xD8, 0xBC, 0xE2, 0x8C, 0xAB, /* 0x14-0x17 */
        0xB3, 0xBE, 0x8C, 0xAC, 0xC9, 0xD0, 0x8C, 0xAD, /* 0x18-0x1B */
        0xE6, 0xD9, 0xB3, 0xA2, 0x8C, 0xAE, 0x8C, 0xAF, /* 0x1C-0x1F */
        0x8C, 0xB0, 0x8C, 0xB1, 0xDE, 0xCC, 0x8C, 0xB2, /* 0x20-0x23 */
        0xD3, 0xC8, 0xDE, 0xCD, 0x8C, 0xB3, 0xD2, 0xA2, /* 0x24-0x27 */
        0x8C, 0xB4, 0x8C, 0xB5, 0x8C, 0xB6, 0x8C, 0xB7, /* 0x28-0x2B */
        0xDE, 0xCE, 0x8C, 0xB8, 0x8C, 0xB9, 0x8C, 0xBA, /* 0x2C-0x2F */
        0x8C, 0xBB, 0xBE, 0xCD, 0x8C, 0xBC, 0x8C, 0xBD, /* 0x30-0x33 */
        0xDE, 0xCF, 0x8C, 0xBE, 0x8C, 0xBF, 0x8C, 0xC0, /* 0x34-0x37 */
        0xCA, 0xAC, 0xD2, 0xFC, 0xB3, 0xDF, 0xE5, 0xEA, /* 0x38-0x3B */
        0xC4, 0xE1, 0xBE, 0xA1, 0xCE, 0xB2, 0xC4, 0xF2, /* 0x3C-0x3F */
        0xBE, 0xD6, 0xC6, 0xA8, 0xB2, 0xE3, 0x8C, 0xC1, /* 0x40-0x43 */
        0x8C, 0xC2, 0xBE, 0xD3, 0x8C, 0xC3, 0x8C, 0xC4, /* 0x44-0x47 */
        0xC7, 0xFC, 0xCC, 0xEB, 0xBD, 0xEC, 0xCE, 0xDD, /* 0x48-0x4B */
        0x8C, 0xC5, 0x8C, 0xC6, 0xCA, 0xBA, 0xC6, 0xC1, /* 0x4C-0x4F */
        0xE5, 0xEC, 0xD0, 0xBC, 0x8C, 0xC7, 0x8C, 0xC8, /* 0x50-0x53 */
        0x8C, 0xC9, 0xD5, 0xB9, 0x8C, 0xCA, 0x8C, 0xCB, /* 0x54-0x57 */
        0x8C, 0xCC, 0xE5, 0xED, 0x8C, 0xCD, 0x8C, 0xCE, /* 0x58-0x5B */
        0x8C, 0xCF, 0x8C, 0xD0, 0xCA, 0xF4, 0x8C, 0xD1, /* 0x5C-0x5F */
        0xCD, 0xC0, 0xC2, 0xC5, 0x8C, 0xD2, 0xE5, 0xEF, /* 0x60-0x63 */
        0x8C, 0xD3, 0xC2, 0xC4, 0xE5, 0xF0, 0x8C, 0xD4, /* 0x64-0x67 */
        0x8C, 0xD5, 0x8C, 0xD6, 0x8C, 0xD7, 0x8C, 0xD8, /* 0x68-0x6B */
        0x8C, 0xD9, 0x8C, 0xDA, 0xE5, 0xF8, 0xCD, 0xCD, /* 0x6C-0x6F */
        0x8C, 0xDB, 0xC9, 0xBD, 0x8C, 0xDC, 0x8C, 0xDD, /* 0x70-0x73 */
        0x8C, 0xDE, 0x8C, 0xDF, 0x8C, 0xE0, 0x8C, 0xE1, /* 0x74-0x77 */
        0x8C, 0xE2, 0xD2, 0xD9, 0xE1, 0xA8, 0x8C, 0xE3, /* 0x78-0x7B */
        0x8C, 0xE4, 0x8C, 0xE5, 0x8C, 0xE6, 0xD3, 0xEC, /* 0x7C-0x7F */
        
        0x8C, 0xE7, 0xCB, 0xEA, 0xC6, 0xF1, 0x8C, 0xE8, /* 0x80-0x83 */
        0x8C, 0xE9, 0x8C, 0xEA, 0x8C, 0xEB, 0x8C, 0xEC, /* 0x84-0x87 */
        0xE1, 0xAC, 0x8C, 0xED, 0x8C, 0xEE, 0x8C, 0xEF, /* 0x88-0x8B */
        0xE1, 0xA7, 0xE1, 0xA9, 0x8C, 0xF0, 0x8C, 0xF1, /* 0x8C-0x8F */
        0xE1, 0xAA, 0xE1, 0xAF, 0x8C, 0xF2, 0x8C, 0xF3, /* 0x90-0x93 */
        0xB2, 0xED, 0x8C, 0xF4, 0xE1, 0xAB, 0xB8, 0xDA, /* 0x94-0x97 */
        0xE1, 0xAD, 0xE1, 0xAE, 0xE1, 0xB0, 0xB5, 0xBA, /* 0x98-0x9B */
        0xE1, 0xB1, 0x8C, 0xF5, 0x8C, 0xF6, 0x8C, 0xF7, /* 0x9C-0x9F */
        0x8C, 0xF8, 0x8C, 0xF9, 0xE1, 0xB3, 0xE1, 0xB8, /* 0xA0-0xA3 */
        0x8C, 0xFA, 0x8C, 0xFB, 0x8C, 0xFC, 0x8C, 0xFD, /* 0xA4-0xA7 */
        0x8C, 0xFE, 0xD1, 0xD2, 0x8D, 0x40, 0xE1, 0xB6, /* 0xA8-0xAB */
        0xE1, 0xB5, 0xC1, 0xEB, 0x8D, 0x41, 0x8D, 0x42, /* 0xAC-0xAF */
        0x8D, 0x43, 0xE1, 0xB7, 0x8D, 0x44, 0xD4, 0xC0, /* 0xB0-0xB3 */
        0x8D, 0x45, 0xE1, 0xB2, 0x8D, 0x46, 0xE1, 0xBA, /* 0xB4-0xB7 */
        0xB0, 0xB6, 0x8D, 0x47, 0x8D, 0x48, 0x8D, 0x49, /* 0xB8-0xBB */
        0x8D, 0x4A, 0xE1, 0xB4, 0x8D, 0x4B, 0xBF, 0xF9, /* 0xBC-0xBF */
        0x8D, 0x4C, 0xE1, 0xB9, 0x8D, 0x4D, 0x8D, 0x4E, /* 0xC0-0xC3 */
        0xE1, 0xBB, 0x8D, 0x4F, 0x8D, 0x50, 0x8D, 0x51, /* 0xC4-0xC7 */
        0x8D, 0x52, 0x8D, 0x53, 0x8D, 0x54, 0xE1, 0xBE, /* 0xC8-0xCB */
        0x8D, 0x55, 0x8D, 0x56, 0x8D, 0x57, 0x8D, 0x58, /* 0xCC-0xCF */
        0x8D, 0x59, 0x8D, 0x5A, 0xE1, 0xBC, 0x8D, 0x5B, /* 0xD0-0xD3 */
        0x8D, 0x5C, 0x8D, 0x5D, 0x8D, 0x5E, 0x8D, 0x5F, /* 0xD4-0xD7 */
        0x8D, 0x60, 0xD6, 0xC5, 0x8D, 0x61, 0x8D, 0x62, /* 0xD8-0xDB */
        0x8D, 0x63, 0x8D, 0x64, 0x8D, 0x65, 0x8D, 0x66, /* 0xDC-0xDF */
        0x8D, 0x67, 0xCF, 0xBF, 0x8D, 0x68, 0x8D, 0x69, /* 0xE0-0xE3 */
        0xE1, 0xBD, 0xE1, 0xBF, 0xC2, 0xCD, 0x8D, 0x6A, /* 0xE4-0xE7 */
        0xB6, 0xEB, 0x8D, 0x6B, 0xD3, 0xF8, 0x8D, 0x6C, /* 0xE8-0xEB */
        0x8D, 0x6D, 0xC7, 0xCD, 0x8D, 0x6E, 0x8D, 0x6F, /* 0xEC-0xEF */
        0xB7, 0xE5, 0x8D, 0x70, 0x8D, 0x71, 0x8D, 0x72, /* 0xF0-0xF3 */
        0x8D, 0x73, 0x8D, 0x74, 0x8D, 0x75, 0x8D, 0x76, /* 0xF4-0xF7 */
        0x8D, 0x77, 0x8D, 0x78, 0x8D, 0x79, 0xBE, 0xFE, /* 0xF8-0xFB */
        0x8D, 0x7A, 0x8D, 0x7B, 0x8D, 0x7C, 0x8D, 0x7D, /* 0xFC-0xFF */
};

static const unsigned char u2c_5D[512] = {
        0x8D, 0x7E, 0x8D, 0x80, 0xE1, 0xC0, 0xE1, 0xC1, /* 0x00-0x03 */
        0x8D, 0x81, 0x8D, 0x82, 0xE1, 0xC7, 0xB3, 0xE7, /* 0x04-0x07 */
        0x8D, 0x83, 0x8D, 0x84, 0x8D, 0x85, 0x8D, 0x86, /* 0x08-0x0B */
        0x8D, 0x87, 0x8D, 0x88, 0xC6, 0xE9, 0x8D, 0x89, /* 0x0C-0x0F */
        0x8D, 0x8A, 0x8D, 0x8B, 0x8D, 0x8C, 0x8D, 0x8D, /* 0x10-0x13 */
        0xB4, 0xDE, 0x8D, 0x8E, 0xD1, 0xC2, 0x8D, 0x8F, /* 0x14-0x17 */
        0x8D, 0x90, 0x8D, 0x91, 0x8D, 0x92, 0xE1, 0xC8, /* 0x18-0x1B */
        0x8D, 0x93, 0x8D, 0x94, 0xE1, 0xC6, 0x8D, 0x95, /* 0x1C-0x1F */
        0x8D, 0x96, 0x8D, 0x97, 0x8D, 0x98, 0x8D, 0x99, /* 0x20-0x23 */
        0xE1, 0xC5, 0x8D, 0x9A, 0xE1, 0xC3, 0xE1, 0xC2, /* 0x24-0x27 */
        0x8D, 0x9B, 0xB1, 0xC0, 0x8D, 0x9C, 0x8D, 0x9D, /* 0x28-0x2B */
        0x8D, 0x9E, 0xD5, 0xB8, 0xE1, 0xC4, 0x8D, 0x9F, /* 0x2C-0x2F */
        0x8D, 0xA0, 0x8D, 0xA1, 0x8D, 0xA2, 0x8D, 0xA3, /* 0x30-0x33 */
        0xE1, 0xCB, 0x8D, 0xA4, 0x8D, 0xA5, 0x8D, 0xA6, /* 0x34-0x37 */
        0x8D, 0xA7, 0x8D, 0xA8, 0x8D, 0xA9, 0x8D, 0xAA, /* 0x38-0x3B */
        0x8D, 0xAB, 0xE1, 0xCC, 0xE1, 0xCA, 0x8D, 0xAC, /* 0x3C-0x3F */
        0x8D, 0xAD, 0x8D, 0xAE, 0x8D, 0xAF, 0x8D, 0xB0, /* 0x40-0x43 */
        0x8D, 0xB1, 0x8D, 0xB2, 0x8D, 0xB3, 0xEF, 0xFA, /* 0x44-0x47 */
        0x8D, 0xB4, 0x8D, 0xB5, 0xE1, 0xD3, 0xE1, 0xD2, /* 0x48-0x4B */
        0xC7, 0xB6, 0x8D, 0xB6, 0x8D, 0xB7, 0x8D, 0xB8, /* 0x4C-0x4F */
        0x8D, 0xB9, 0x8D, 0xBA, 0x8D, 0xBB, 0x8D, 0xBC, /* 0x50-0x53 */
        0x8D, 0xBD, 0x8D, 0xBE, 0x8D, 0xBF, 0x8D, 0xC0, /* 0x54-0x57 */
        0xE1, 0xC9, 0x8D, 0xC1, 0x8D, 0xC2, 0xE1, 0xCE, /* 0x58-0x5B */
        0x8D, 0xC3, 0xE1, 0xD0, 0x8D, 0xC4, 0x8D, 0xC5, /* 0x5C-0x5F */
        0x8D, 0xC6, 0x8D, 0xC7, 0x8D, 0xC8, 0x8D, 0xC9, /* 0x60-0x63 */
        0x8D, 0xCA, 0x8D, 0xCB, 0x8D, 0xCC, 0x8D, 0xCD, /* 0x64-0x67 */
        0x8D, 0xCE, 0xE1, 0xD4, 0x8D, 0xCF, 0xE1, 0xD1, /* 0x68-0x6B */
        0xE1, 0xCD, 0x8D, 0xD0, 0x8D, 0xD1, 0xE1, 0xCF, /* 0x6C-0x6F */
        0x8D, 0xD2, 0x8D, 0xD3, 0x8D, 0xD4, 0x8D, 0xD5, /* 0x70-0x73 */
        0xE1, 0xD5, 0x8D, 0xD6, 0x8D, 0xD7, 0x8D, 0xD8, /* 0x74-0x77 */
        0x8D, 0xD9, 0x8D, 0xDA, 0x8D, 0xDB, 0x8D, 0xDC, /* 0x78-0x7B */
        0x8D, 0xDD, 0x8D, 0xDE, 0x8D, 0xDF, 0x8D, 0xE0, /* 0x7C-0x7F */
        
        0x8D, 0xE1, 0x8D, 0xE2, 0xE1, 0xD6, 0x8D, 0xE3, /* 0x80-0x83 */
        0x8D, 0xE4, 0x8D, 0xE5, 0x8D, 0xE6, 0x8D, 0xE7, /* 0x84-0x87 */
        0x8D, 0xE8, 0x8D, 0xE9, 0x8D, 0xEA, 0x8D, 0xEB, /* 0x88-0x8B */
        0x8D, 0xEC, 0x8D, 0xED, 0x8D, 0xEE, 0x8D, 0xEF, /* 0x8C-0x8F */
        0x8D, 0xF0, 0x8D, 0xF1, 0x8D, 0xF2, 0x8D, 0xF3, /* 0x90-0x93 */
        0x8D, 0xF4, 0x8D, 0xF5, 0x8D, 0xF6, 0x8D, 0xF7, /* 0x94-0x97 */
        0x8D, 0xF8, 0xE1, 0xD7, 0x8D, 0xF9, 0x8D, 0xFA, /* 0x98-0x9B */
        0x8D, 0xFB, 0xE1, 0xD8, 0x8D, 0xFC, 0x8D, 0xFD, /* 0x9C-0x9F */
        0x8D, 0xFE, 0x8E, 0x40, 0x8E, 0x41, 0x8E, 0x42, /* 0xA0-0xA3 */
        0x8E, 0x43, 0x8E, 0x44, 0x8E, 0x45, 0x8E, 0x46, /* 0xA4-0xA7 */
        0x8E, 0x47, 0x8E, 0x48, 0x8E, 0x49, 0x8E, 0x4A, /* 0xA8-0xAB */
        0x8E, 0x4B, 0x8E, 0x4C, 0x8E, 0x4D, 0x8E, 0x4E, /* 0xAC-0xAF */
        0x8E, 0x4F, 0x8E, 0x50, 0x8E, 0x51, 0x8E, 0x52, /* 0xB0-0xB3 */
        0x8E, 0x53, 0x8E, 0x54, 0x8E, 0x55, 0xE1, 0xDA, /* 0xB4-0xB7 */
        0x8E, 0x56, 0x8E, 0x57, 0x8E, 0x58, 0x8E, 0x59, /* 0xB8-0xBB */
        0x8E, 0x5A, 0x8E, 0x5B, 0x8E, 0x5C, 0x8E, 0x5D, /* 0xBC-0xBF */
        0x8E, 0x5E, 0x8E, 0x5F, 0x8E, 0x60, 0x8E, 0x61, /* 0xC0-0xC3 */
        0x8E, 0x62, 0xE1, 0xDB, 0x8E, 0x63, 0x8E, 0x64, /* 0xC4-0xC7 */
        0x8E, 0x65, 0x8E, 0x66, 0x8E, 0x67, 0x8E, 0x68, /* 0xC8-0xCB */
        0x8E, 0x69, 0xCE, 0xA1, 0x8E, 0x6A, 0x8E, 0x6B, /* 0xCC-0xCF */
        0x8E, 0x6C, 0x8E, 0x6D, 0x8E, 0x6E, 0x8E, 0x6F, /* 0xD0-0xD3 */
        0x8E, 0x70, 0x8E, 0x71, 0x8E, 0x72, 0x8E, 0x73, /* 0xD4-0xD7 */
        0x8E, 0x74, 0x8E, 0x75, 0x8E, 0x76, 0xE7, 0xDD, /* 0xD8-0xDB */
        0x8E, 0x77, 0xB4, 0xA8, 0xD6, 0xDD, 0x8E, 0x78, /* 0xDC-0xDF */
        0x8E, 0x79, 0xD1, 0xB2, 0xB3, 0xB2, 0x8E, 0x7A, /* 0xE0-0xE3 */
        0x8E, 0x7B, 0xB9, 0xA4, 0xD7, 0xF3, 0xC7, 0xC9, /* 0xE4-0xE7 */
        0xBE, 0xDE, 0xB9, 0xAE, 0x8E, 0x7C, 0xCE, 0xD7, /* 0xE8-0xEB */
        0x8E, 0x7D, 0x8E, 0x7E, 0xB2, 0xEE, 0xDB, 0xCF, /* 0xEC-0xEF */
        0x8E, 0x80, 0xBC, 0xBA, 0xD2, 0xD1, 0xCB, 0xC8, /* 0xF0-0xF3 */
        0xB0, 0xCD, 0x8E, 0x81, 0x8E, 0x82, 0xCF, 0xEF, /* 0xF4-0xF7 */
        0x8E, 0x83, 0x8E, 0x84, 0x8E, 0x85, 0x8E, 0x86, /* 0xF8-0xFB */
        0x8E, 0x87, 0xD9, 0xE3, 0xBD, 0xED, 0x8E, 0x88, /* 0xFC-0xFF */
};

static const unsigned char u2c_5E[512] = {
        0x8E, 0x89, 0xB1, 0xD2, 0xCA, 0xD0, 0xB2, 0xBC, /* 0x00-0x03 */
        0x8E, 0x8A, 0xCB, 0xA7, 0xB7, 0xAB, 0x8E, 0x8B, /* 0x04-0x07 */
        0xCA, 0xA6, 0x8E, 0x8C, 0x8E, 0x8D, 0x8E, 0x8E, /* 0x08-0x0B */
        0xCF, 0xA3, 0x8E, 0x8F, 0x8E, 0x90, 0xE0, 0xF8, /* 0x0C-0x0F */
        0xD5, 0xCA, 0xE0, 0xFB, 0x8E, 0x91, 0x8E, 0x92, /* 0x10-0x13 */
        0xE0, 0xFA, 0xC5, 0xC1, 0xCC, 0xFB, 0x8E, 0x93, /* 0x14-0x17 */
        0xC1, 0xB1, 0xE0, 0xF9, 0xD6, 0xE3, 0xB2, 0xAF, /* 0x18-0x1B */
        0xD6, 0xC4, 0xB5, 0xDB, 0x8E, 0x94, 0x8E, 0x95, /* 0x1C-0x1F */
        0x8E, 0x96, 0x8E, 0x97, 0x8E, 0x98, 0x8E, 0x99, /* 0x20-0x23 */
        0x8E, 0x9A, 0x8E, 0x9B, 0xB4, 0xF8, 0xD6, 0xA1, /* 0x24-0x27 */
        0x8E, 0x9C, 0x8E, 0x9D, 0x8E, 0x9E, 0x8E, 0x9F, /* 0x28-0x2B */
        0x8E, 0xA0, 0xCF, 0xAF, 0xB0, 0xEF, 0x8E, 0xA1, /* 0x2C-0x2F */
        0x8E, 0xA2, 0xE0, 0xFC, 0x8E, 0xA3, 0x8E, 0xA4, /* 0x30-0x33 */
        0x8E, 0xA5, 0x8E, 0xA6, 0x8E, 0xA7, 0xE1, 0xA1, /* 0x34-0x37 */
        0xB3, 0xA3, 0x8E, 0xA8, 0x8E, 0xA9, 0xE0, 0xFD, /* 0x38-0x3B */
        0xE0, 0xFE, 0xC3, 0xB1, 0x8E, 0xAA, 0x8E, 0xAB, /* 0x3C-0x3F */
        0x8E, 0xAC, 0x8E, 0xAD, 0xC3, 0xDD, 0x8E, 0xAE, /* 0x40-0x43 */
        0xE1, 0xA2, 0xB7, 0xF9, 0x8E, 0xAF, 0x8E, 0xB0, /* 0x44-0x47 */
        0x8E, 0xB1, 0x8E, 0xB2, 0x8E, 0xB3, 0x8E, 0xB4, /* 0x48-0x4B */
        0xBB, 0xCF, 0x8E, 0xB5, 0x8E, 0xB6, 0x8E, 0xB7, /* 0x4C-0x4F */
        0x8E, 0xB8, 0x8E, 0xB9, 0x8E, 0xBA, 0x8E, 0xBB, /* 0x50-0x53 */
        0xE1, 0xA3, 0xC4, 0xBB, 0x8E, 0xBC, 0x8E, 0xBD, /* 0x54-0x57 */
        0x8E, 0xBE, 0x8E, 0xBF, 0x8E, 0xC0, 0xE1, 0xA4, /* 0x58-0x5B */
        0x8E, 0xC1, 0x8E, 0xC2, 0xE1, 0xA5, 0x8E, 0xC3, /* 0x5C-0x5F */
        0x8E, 0xC4, 0xE1, 0xA6, 0xB4, 0xB1, 0x8E, 0xC5, /* 0x60-0x63 */
        0x8E, 0xC6, 0x8E, 0xC7, 0x8E, 0xC8, 0x8E, 0xC9, /* 0x64-0x67 */
        0x8E, 0xCA, 0x8E, 0xCB, 0x8E, 0xCC, 0x8E, 0xCD, /* 0x68-0x6B */
        0x8E, 0xCE, 0x8E, 0xCF, 0x8E, 0xD0, 0x8E, 0xD1, /* 0x6C-0x6F */
        0x8E, 0xD2, 0x8E, 0xD3, 0xB8, 0xC9, 0xC6, 0xBD, /* 0x70-0x73 */
        0xC4, 0xEA, 0x8E, 0xD4, 0xB2, 0xA2, 0x8E, 0xD5, /* 0x74-0x77 */
        0xD0, 0xD2, 0x8E, 0xD6, 0xE7, 0xDB, 0xBB, 0xC3, /* 0x78-0x7B */
        0xD3, 0xD7, 0xD3, 0xC4, 0x8E, 0xD7, 0xB9, 0xE3, /* 0x7C-0x7F */
        
        0xE2, 0xCF, 0x8E, 0xD8, 0x8E, 0xD9, 0x8E, 0xDA, /* 0x80-0x83 */
        0xD7, 0xAF, 0x8E, 0xDB, 0xC7, 0xEC, 0xB1, 0xD3, /* 0x84-0x87 */
        0x8E, 0xDC, 0x8E, 0xDD, 0xB4, 0xB2, 0xE2, 0xD1, /* 0x88-0x8B */
        0x8E, 0xDE, 0x8E, 0xDF, 0x8E, 0xE0, 0xD0, 0xF2, /* 0x8C-0x8F */
        0xC2, 0xAE, 0xE2, 0xD0, 0x8E, 0xE1, 0xBF, 0xE2, /* 0x90-0x93 */
        0xD3, 0xA6, 0xB5, 0xD7, 0xE2, 0xD2, 0xB5, 0xEA, /* 0x94-0x97 */
        0x8E, 0xE2, 0xC3, 0xED, 0xB8, 0xFD, 0x8E, 0xE3, /* 0x98-0x9B */
        0xB8, 0xAE, 0x8E, 0xE4, 0xC5, 0xD3, 0xB7, 0xCF, /* 0x9C-0x9F */
        0xE2, 0xD4, 0x8E, 0xE5, 0x8E, 0xE6, 0x8E, 0xE7, /* 0xA0-0xA3 */
        0x8E, 0xE8, 0xE2, 0xD3, 0xB6, 0xC8, 0xD7, 0xF9, /* 0xA4-0xA7 */
        0x8E, 0xE9, 0x8E, 0xEA, 0x8E, 0xEB, 0x8E, 0xEC, /* 0xA8-0xAB */
        0x8E, 0xED, 0xCD, 0xA5, 0x8E, 0xEE, 0x8E, 0xEF, /* 0xAC-0xAF */
        0x8E, 0xF0, 0x8E, 0xF1, 0x8E, 0xF2, 0xE2, 0xD8, /* 0xB0-0xB3 */
        0x8E, 0xF3, 0xE2, 0xD6, 0xCA, 0xFC, 0xBF, 0xB5, /* 0xB4-0xB7 */
        0xD3, 0xB9, 0xE2, 0xD5, 0x8E, 0xF4, 0x8E, 0xF5, /* 0xB8-0xBB */
        0x8E, 0xF6, 0x8E, 0xF7, 0xE2, 0xD7, 0x8E, 0xF8, /* 0xBC-0xBF */
        0x8E, 0xF9, 0x8E, 0xFA, 0x8E, 0xFB, 0x8E, 0xFC, /* 0xC0-0xC3 */
        0x8E, 0xFD, 0x8E, 0xFE, 0x8F, 0x40, 0x8F, 0x41, /* 0xC4-0xC7 */
        0x8F, 0x42, 0xC1, 0xAE, 0xC0, 0xC8, 0x8F, 0x43, /* 0xC8-0xCB */
        0x8F, 0x44, 0x8F, 0x45, 0x8F, 0x46, 0x8F, 0x47, /* 0xCC-0xCF */
        0x8F, 0x48, 0xE2, 0xDB, 0xE2, 0xDA, 0xC0, 0xAA, /* 0xD0-0xD3 */
        0x8F, 0x49, 0x8F, 0x4A, 0xC1, 0xCE, 0x8F, 0x4B, /* 0xD4-0xD7 */
        0x8F, 0x4C, 0x8F, 0x4D, 0x8F, 0x4E, 0xE2, 0xDC, /* 0xD8-0xDB */
        0x8F, 0x4F, 0x8F, 0x50, 0x8F, 0x51, 0x8F, 0x52, /* 0xDC-0xDF */
        0x8F, 0x53, 0x8F, 0x54, 0x8F, 0x55, 0x8F, 0x56, /* 0xE0-0xE3 */
        0x8F, 0x57, 0x8F, 0x58, 0x8F, 0x59, 0x8F, 0x5A, /* 0xE4-0xE7 */
        0xE2, 0xDD, 0x8F, 0x5B, 0xE2, 0xDE, 0x8F, 0x5C, /* 0xE8-0xEB */
        0x8F, 0x5D, 0x8F, 0x5E, 0x8F, 0x5F, 0x8F, 0x60, /* 0xEC-0xEF */
        0x8F, 0x61, 0x8F, 0x62, 0x8F, 0x63, 0x8F, 0x64, /* 0xF0-0xF3 */
        0xDB, 0xC8, 0x8F, 0x65, 0xD1, 0xD3, 0xCD, 0xA2, /* 0xF4-0xF7 */
        0x8F, 0x66, 0x8F, 0x67, 0xBD, 0xA8, 0x8F, 0x68, /* 0xF8-0xFB */
        0x8F, 0x69, 0x8F, 0x6A, 0xDE, 0xC3, 0xD8, 0xA5, /* 0xFC-0xFF */
};

/*
 * u2c_5F: 512-byte constant table laid out as 256 two-byte entries; entry i
 * occupies bytes [2*i] and [2*i + 1]. The trailing comment on each row lists
 * the four entry indices that row covers (0x00 through 0xFF overall).
 * NOTE(review): presumably maps Unicode code points U+5F00..U+5FFF to a
 * double-byte charset code, lead byte first -- confirm against the lookup
 * code that indexes this table.
 */
static const unsigned char u2c_5F[512] = {
        0xBF, 0xAA, 0xDB, 0xCD, 0xD2, 0xEC, 0xC6, 0xFA, /* 0x00-0x03 */
        0xC5, 0xAA, 0x8F, 0x6B, 0x8F, 0x6C, 0x8F, 0x6D, /* 0x04-0x07 */
        0xDE, 0xC4, 0x8F, 0x6E, 0xB1, 0xD7, 0xDF, 0xAE, /* 0x08-0x0B */
        0x8F, 0x6F, 0x8F, 0x70, 0x8F, 0x71, 0xCA, 0xBD, /* 0x0C-0x0F */
        0x8F, 0x72, 0xDF, 0xB1, 0x8F, 0x73, 0xB9, 0xAD, /* 0x10-0x13 */
        0x8F, 0x74, 0xD2, 0xFD, 0x8F, 0x75, 0xB8, 0xA5, /* 0x14-0x17 */
        0xBA, 0xEB, 0x8F, 0x76, 0x8F, 0x77, 0xB3, 0xDA, /* 0x18-0x1B */
        0x8F, 0x78, 0x8F, 0x79, 0x8F, 0x7A, 0xB5, 0xDC, /* 0x1C-0x1F */
        0xD5, 0xC5, 0x8F, 0x7B, 0x8F, 0x7C, 0x8F, 0x7D, /* 0x20-0x23 */
        0x8F, 0x7E, 0xC3, 0xD6, 0xCF, 0xD2, 0xBB, 0xA1, /* 0x24-0x27 */
        0x8F, 0x80, 0xE5, 0xF3, 0xE5, 0xF2, 0x8F, 0x81, /* 0x28-0x2B */
        0x8F, 0x82, 0xE5, 0xF4, 0x8F, 0x83, 0xCD, 0xE4, /* 0x2C-0x2F */
        0x8F, 0x84, 0xC8, 0xF5, 0x8F, 0x85, 0x8F, 0x86, /* 0x30-0x33 */
        0x8F, 0x87, 0x8F, 0x88, 0x8F, 0x89, 0x8F, 0x8A, /* 0x34-0x37 */
        0x8F, 0x8B, 0xB5, 0xAF, 0xC7, 0xBF, 0x8F, 0x8C, /* 0x38-0x3B */
        0xE5, 0xF6, 0x8F, 0x8D, 0x8F, 0x8E, 0x8F, 0x8F, /* 0x3C-0x3F */
        0xEC, 0xB0, 0x8F, 0x90, 0x8F, 0x91, 0x8F, 0x92, /* 0x40-0x43 */
        0x8F, 0x93, 0x8F, 0x94, 0x8F, 0x95, 0x8F, 0x96, /* 0x44-0x47 */
        0x8F, 0x97, 0x8F, 0x98, 0x8F, 0x99, 0x8F, 0x9A, /* 0x48-0x4B */
        0x8F, 0x9B, 0x8F, 0x9C, 0x8F, 0x9D, 0x8F, 0x9E, /* 0x4C-0x4F */
        0xE5, 0xE6, 0x8F, 0x9F, 0xB9, 0xE9, 0xB5, 0xB1, /* 0x50-0x53 */
        0x8F, 0xA0, 0xC2, 0xBC, 0xE5, 0xE8, 0xE5, 0xE7, /* 0x54-0x57 */
        0xE5, 0xE9, 0x8F, 0xA1, 0x8F, 0xA2, 0x8F, 0xA3, /* 0x58-0x5B */
        0x8F, 0xA4, 0xD2, 0xCD, 0x8F, 0xA5, 0x8F, 0xA6, /* 0x5C-0x5F */
        0x8F, 0xA7, 0xE1, 0xEA, 0xD0, 0xCE, 0x8F, 0xA8, /* 0x60-0x63 */
        0xCD, 0xAE, 0x8F, 0xA9, 0xD1, 0xE5, 0x8F, 0xAA, /* 0x64-0x67 */
        0x8F, 0xAB, 0xB2, 0xCA, 0xB1, 0xEB, 0x8F, 0xAC, /* 0x68-0x6B */
        0xB1, 0xF2, 0xC5, 0xED, 0x8F, 0xAD, 0x8F, 0xAE, /* 0x6C-0x6F */
        0xD5, 0xC3, 0xD3, 0xB0, 0x8F, 0xAF, 0xE1, 0xDC, /* 0x70-0x73 */
        0x8F, 0xB0, 0x8F, 0xB1, 0x8F, 0xB2, 0xE1, 0xDD, /* 0x74-0x77 */
        0x8F, 0xB3, 0xD2, 0xDB, 0x8F, 0xB4, 0xB3, 0xB9, /* 0x78-0x7B */
        0xB1, 0xCB, 0x8F, 0xB5, 0x8F, 0xB6, 0x8F, 0xB7, /* 0x7C-0x7F */
        
        0xCD, 0xF9, 0xD5, 0xF7, 0xE1, 0xDE, 0x8F, 0xB8, /* 0x80-0x83 */
        0xBE, 0xB6, 0xB4, 0xFD, 0x8F, 0xB9, 0xE1, 0xDF, /* 0x84-0x87 */
        0xBA, 0xDC, 0xE1, 0xE0, 0xBB, 0xB2, 0xC2, 0xC9, /* 0x88-0x8B */
        0xE1, 0xE1, 0x8F, 0xBA, 0x8F, 0xBB, 0x8F, 0xBC, /* 0x8C-0x8F */
        0xD0, 0xEC, 0x8F, 0xBD, 0xCD, 0xBD, 0x8F, 0xBE, /* 0x90-0x93 */
        0x8F, 0xBF, 0xE1, 0xE2, 0x8F, 0xC0, 0xB5, 0xC3, /* 0x94-0x97 */
        0xC5, 0xC7, 0xE1, 0xE3, 0x8F, 0xC1, 0x8F, 0xC2, /* 0x98-0x9B */
        0xE1, 0xE4, 0x8F, 0xC3, 0x8F, 0xC4, 0x8F, 0xC5, /* 0x9C-0x9F */
        0x8F, 0xC6, 0xD3, 0xF9, 0x8F, 0xC7, 0x8F, 0xC8, /* 0xA0-0xA3 */
        0x8F, 0xC9, 0x8F, 0xCA, 0x8F, 0xCB, 0x8F, 0xCC, /* 0xA4-0xA7 */
        0xE1, 0xE5, 0x8F, 0xCD, 0xD1, 0xAD, 0x8F, 0xCE, /* 0xA8-0xAB */
        0x8F, 0xCF, 0xE1, 0xE6, 0xCE, 0xA2, 0x8F, 0xD0, /* 0xAC-0xAF */
        0x8F, 0xD1, 0x8F, 0xD2, 0x8F, 0xD3, 0x8F, 0xD4, /* 0xB0-0xB3 */
        0x8F, 0xD5, 0xE1, 0xE7, 0x8F, 0xD6, 0xB5, 0xC2, /* 0xB4-0xB7 */
        0x8F, 0xD7, 0x8F, 0xD8, 0x8F, 0xD9, 0x8F, 0xDA, /* 0xB8-0xBB */
        0xE1, 0xE8, 0xBB, 0xD5, 0x8F, 0xDB, 0x8F, 0xDC, /* 0xBC-0xBF */
        0x8F, 0xDD, 0x8F, 0xDE, 0x8F, 0xDF, 0xD0, 0xC4, /* 0xC0-0xC3 */
        0xE2, 0xE0, 0xB1, 0xD8, 0xD2, 0xE4, 0x8F, 0xE0, /* 0xC4-0xC7 */
        0x8F, 0xE1, 0xE2, 0xE1, 0x8F, 0xE2, 0x8F, 0xE3, /* 0xC8-0xCB */
        0xBC, 0xC9, 0xC8, 0xCC, 0x8F, 0xE4, 0xE2, 0xE3, /* 0xCC-0xCF */
        0xEC, 0xFE, 0xEC, 0xFD, 0xDF, 0xAF, 0x8F, 0xE5, /* 0xD0-0xD3 */
        0x8F, 0xE6, 0x8F, 0xE7, 0xE2, 0xE2, 0xD6, 0xBE, /* 0xD4-0xD7 */
        0xCD, 0xFC, 0xC3, 0xA6, 0x8F, 0xE8, 0x8F, 0xE9, /* 0xD8-0xDB */
        0x8F, 0xEA, 0xE3, 0xC3, 0x8F, 0xEB, 0x8F, 0xEC, /* 0xDC-0xDF */
        0xD6, 0xD2, 0xE2, 0xE7, 0x8F, 0xED, 0x8F, 0xEE, /* 0xE0-0xE3 */
        0xE2, 0xE8, 0x8F, 0xEF, 0x8F, 0xF0, 0xD3, 0xC7, /* 0xE4-0xE7 */
        0x8F, 0xF1, 0x8F, 0xF2, 0xE2, 0xEC, 0xBF, 0xEC, /* 0xE8-0xEB */
        0x8F, 0xF3, 0xE2, 0xED, 0xE2, 0xE5, 0x8F, 0xF4, /* 0xEC-0xEF */
        0x8F, 0xF5, 0xB3, 0xC0, 0x8F, 0xF6, 0x8F, 0xF7, /* 0xF0-0xF3 */
        0x8F, 0xF8, 0xC4, 0xEE, 0x8F, 0xF9, 0x8F, 0xFA, /* 0xF4-0xF7 */
        0xE2, 0xEE, 0x8F, 0xFB, 0x8F, 0xFC, 0xD0, 0xC3, /* 0xF8-0xFB */
        0x8F, 0xFD, 0xBA, 0xF6, 0xE2, 0xE9, 0xB7, 0xDE, /* 0xFC-0xFF */
};

/*
 * u2c_60: 512-byte constant table laid out as 256 two-byte entries; entry i
 * occupies bytes [2*i] and [2*i + 1]. The trailing comment on each row lists
 * the four entry indices that row covers (0x00 through 0xFF overall).
 * NOTE(review): presumably maps Unicode code points U+6000..U+60FF to a
 * double-byte charset code, lead byte first -- confirm against the lookup
 * code that indexes this table.
 */
static const unsigned char u2c_60[512] = {
        0xBB, 0xB3, 0xCC, 0xAC, 0xCB, 0xCB, 0xE2, 0xE4, /* 0x00-0x03 */
        0xE2, 0xE6, 0xE2, 0xEA, 0xE2, 0xEB, 0x8F, 0xFE, /* 0x04-0x07 */
        0x90, 0x40, 0x90, 0x41, 0xE2, 0xF7, 0x90, 0x42, /* 0x08-0x0B */
        0x90, 0x43, 0xE2, 0xF4, 0xD4, 0xF5, 0xE2, 0xF3, /* 0x0C-0x0F */
        0x90, 0x44, 0x90, 0x45, 0xC5, 0xAD, 0x90, 0x46, /* 0x10-0x13 */
        0xD5, 0xFA, 0xC5, 0xC2, 0xB2, 0xC0, 0x90, 0x47, /* 0x14-0x17 */
        0x90, 0x48, 0xE2, 0xEF, 0x90, 0x49, 0xE2, 0xF2, /* 0x18-0x1B */
        0xC1, 0xAF, 0xCB, 0xBC, 0x90, 0x4A, 0x90, 0x4B, /* 0x1C-0x1F */
        0xB5, 0xA1, 0xE2, 0xF9, 0x90, 0x4C, 0x90, 0x4D, /* 0x20-0x23 */
        0x90, 0x4E, 0xBC, 0xB1, 0xE2, 0xF1, 0xD0, 0xD4, /* 0x24-0x27 */
        0xD4, 0xB9, 0xE2, 0xF5, 0xB9, 0xD6, 0xE2, 0xF6, /* 0x28-0x2B */
        0x90, 0x4F, 0x90, 0x50, 0x90, 0x51, 0xC7, 0xD3, /* 0x2C-0x2F */
        0x90, 0x52, 0x90, 0x53, 0x90, 0x54, 0x90, 0x55, /* 0x30-0x33 */
        0x90, 0x56, 0xE2, 0xF0, 0x90, 0x57, 0x90, 0x58, /* 0x34-0x37 */
        0x90, 0x59, 0x90, 0x5A, 0x90, 0x5B, 0xD7, 0xDC, /* 0x38-0x3B */
        0xED, 0xA1, 0x90, 0x5C, 0x90, 0x5D, 0xE2, 0xF8, /* 0x3C-0x3F */
        0x90, 0x5E, 0xED, 0xA5, 0xE2, 0xFE, 0xCA, 0xD1, /* 0x40-0x43 */
        0x90, 0x5F, 0x90, 0x60, 0x90, 0x61, 0x90, 0x62, /* 0x44-0x47 */
        0x90, 0x63, 0x90, 0x64, 0x90, 0x65, 0xC1, 0xB5, /* 0x48-0x4B */
        0x90, 0x66, 0xBB, 0xD0, 0x90, 0x67, 0x90, 0x68, /* 0x4C-0x4F */
        0xBF, 0xD6, 0x90, 0x69, 0xBA, 0xE3, 0x90, 0x6A, /* 0x50-0x53 */
        0x90, 0x6B, 0xCB, 0xA1, 0x90, 0x6C, 0x90, 0x6D, /* 0x54-0x57 */
        0x90, 0x6E, 0xED, 0xA6, 0xED, 0xA3, 0x90, 0x6F, /* 0x58-0x5B */
        0x90, 0x70, 0xED, 0xA2, 0x90, 0x71, 0x90, 0x72, /* 0x5C-0x5F */
        0x90, 0x73, 0x90, 0x74, 0xBB, 0xD6, 0xED, 0xA7, /* 0x60-0x63 */
        0xD0, 0xF4, 0x90, 0x75, 0x90, 0x76, 0xED, 0xA4, /* 0x64-0x67 */
        0xBA, 0xDE, 0xB6, 0xF7, 0xE3, 0xA1, 0xB6, 0xB2, /* 0x68-0x6B */
        0xCC, 0xF1, 0xB9, 0xA7, 0x90, 0x77, 0xCF, 0xA2, /* 0x6C-0x6F */
        0xC7, 0xA1, 0x90, 0x78, 0x90, 0x79, 0xBF, 0xD2, /* 0x70-0x73 */
        0x90, 0x7A, 0x90, 0x7B, 0xB6, 0xF1, 0x90, 0x7C, /* 0x74-0x77 */
        0xE2, 0xFA, 0xE2, 0xFB, 0xE2, 0xFD, 0xE2, 0xFC, /* 0x78-0x7B */
        0xC4, 0xD5, 0xE3, 0xA2, 0x90, 0x7D, 0xD3, 0xC1, /* 0x7C-0x7F */
        
        0x90, 0x7E, 0x90, 0x80, 0x90, 0x81, 0xE3, 0xA7, /* 0x80-0x83 */
        0xC7, 0xC4, 0x90, 0x82, 0x90, 0x83, 0x90, 0x84, /* 0x84-0x87 */
        0x90, 0x85, 0xCF, 0xA4, 0x90, 0x86, 0x90, 0x87, /* 0x88-0x8B */
        0xE3, 0xA9, 0xBA, 0xB7, 0x90, 0x88, 0x90, 0x89, /* 0x8C-0x8F */
        0x90, 0x8A, 0x90, 0x8B, 0xE3, 0xA8, 0x90, 0x8C, /* 0x90-0x93 */
        0xBB, 0xDA, 0x90, 0x8D, 0xE3, 0xA3, 0x90, 0x8E, /* 0x94-0x97 */
        0x90, 0x8F, 0x90, 0x90, 0xE3, 0xA4, 0xE3, 0xAA, /* 0x98-0x9B */
        0x90, 0x91, 0xE3, 0xA6, 0x90, 0x92, 0xCE, 0xF2, /* 0x9C-0x9F */
        0xD3, 0xC6, 0x90, 0x93, 0x90, 0x94, 0xBB, 0xBC, /* 0xA0-0xA3 */
        0x90, 0x95, 0x90, 0x96, 0xD4, 0xC3, 0x90, 0x97, /* 0xA4-0xA7 */
        0xC4, 0xFA, 0x90, 0x98, 0x90, 0x99, 0xED, 0xA8, /* 0xA8-0xAB */
        0xD0, 0xFC, 0xE3, 0xA5, 0x90, 0x9A, 0xC3, 0xF5, /* 0xAC-0xAF */
        0x90, 0x9B, 0xE3, 0xAD, 0xB1, 0xAF, 0x90, 0x9C, /* 0xB0-0xB3 */
        0xE3, 0xB2, 0x90, 0x9D, 0x90, 0x9E, 0x90, 0x9F, /* 0xB4-0xB7 */
        0xBC, 0xC2, 0x90, 0xA0, 0x90, 0xA1, 0xE3, 0xAC, /* 0xB8-0xBB */
        0xB5, 0xBF, 0x90, 0xA2, 0x90, 0xA3, 0x90, 0xA4, /* 0xBC-0xBF */
        0x90, 0xA5, 0x90, 0xA6, 0x90, 0xA7, 0x90, 0xA8, /* 0xC0-0xC3 */
        0x90, 0xA9, 0xC7, 0xE9, 0xE3, 0xB0, 0x90, 0xAA, /* 0xC4-0xC7 */
        0x90, 0xAB, 0x90, 0xAC, 0xBE, 0xAA, 0xCD, 0xEF, /* 0xC8-0xCB */
        0x90, 0xAD, 0x90, 0xAE, 0x90, 0xAF, 0x90, 0xB0, /* 0xCC-0xCF */
        0x90, 0xB1, 0xBB, 0xF3, 0x90, 0xB2, 0x90, 0xB3, /* 0xD0-0xD3 */
        0x90, 0xB4, 0xCC, 0xE8, 0x90, 0xB5, 0x90, 0xB6, /* 0xD4-0xD7 */
        0xE3, 0xAF, 0x90, 0xB7, 0xE3, 0xB1, 0x90, 0xB8, /* 0xD8-0xDB */
        0xCF, 0xA7, 0xE3, 0xAE, 0x90, 0xB9, 0xCE, 0xA9, /* 0xDC-0xDF */
        0xBB, 0xDD, 0x90, 0xBA, 0x90, 0xBB, 0x90, 0xBC, /* 0xE0-0xE3 */
        0x90, 0xBD, 0x90, 0xBE, 0xB5, 0xEB, 0xBE, 0xE5, /* 0xE4-0xE7 */
        0xB2, 0xD2, 0xB3, 0xCD, 0x90, 0xBF, 0xB1, 0xB9, /* 0xE8-0xEB */
        0xE3, 0xAB, 0xB2, 0xD1, 0xB5, 0xAC, 0xB9, 0xDF, /* 0xEC-0xEF */
        0xB6, 0xE8, 0x90, 0xC0, 0x90, 0xC1, 0xCF, 0xEB, /* 0xF0-0xF3 */
        0xE3, 0xB7, 0x90, 0xC2, 0xBB, 0xCC, 0x90, 0xC3, /* 0xF4-0xF7 */
        0x90, 0xC4, 0xC8, 0xC7, 0xD0, 0xCA, 0x90, 0xC5, /* 0xF8-0xFB */
        0x90, 0xC6, 0x90, 0xC7, 0x90, 0xC8, 0x90, 0xC9, /* 0xFC-0xFF */
};

/*
 * u2c_61: 512-byte constant table laid out as 256 two-byte entries; entry i
 * occupies bytes [2*i] and [2*i + 1]. The trailing comment on each row lists
 * the four entry indices that row covers (0x00 through 0xFF overall).
 * NOTE(review): presumably maps Unicode code points U+6100..U+61FF to a
 * double-byte charset code, lead byte first -- confirm against the lookup
 * code that indexes this table.
 */
static const unsigned char u2c_61[512] = {
        0xE3, 0xB8, 0xB3, 0xEE, 0x90, 0xCA, 0x90, 0xCB, /* 0x00-0x03 */
        0x90, 0xCC, 0x90, 0xCD, 0xED, 0xA9, 0x90, 0xCE, /* 0x04-0x07 */
        0xD3, 0xFA, 0xD3, 0xE4, 0x90, 0xCF, 0x90, 0xD0, /* 0x08-0x0B */
        0x90, 0xD1, 0xED, 0xAA, 0xE3, 0xB9, 0xD2, 0xE2, /* 0x0C-0x0F */
        0x90, 0xD2, 0x90, 0xD3, 0x90, 0xD4, 0x90, 0xD5, /* 0x10-0x13 */
        0x90, 0xD6, 0xE3, 0xB5, 0x90, 0xD7, 0x90, 0xD8, /* 0x14-0x17 */
        0x90, 0xD9, 0x90, 0xDA, 0xD3, 0xDE, 0x90, 0xDB, /* 0x18-0x1B */
        0x90, 0xDC, 0x90, 0xDD, 0x90, 0xDE, 0xB8, 0xD0, /* 0x1C-0x1F */
        0xE3, 0xB3, 0x90, 0xDF, 0x90, 0xE0, 0xE3, 0xB6, /* 0x20-0x23 */
        0xB7, 0xDF, 0x90, 0xE1, 0xE3, 0xB4, 0xC0, 0xA2, /* 0x24-0x27 */
        0x90, 0xE2, 0x90, 0xE3, 0x90, 0xE4, 0xE3, 0xBA, /* 0x28-0x2B */
        0x90, 0xE5, 0x90, 0xE6, 0x90, 0xE7, 0x90, 0xE8, /* 0x2C-0x2F */
        0x90, 0xE9, 0x90, 0xEA, 0x90, 0xEB, 0x90, 0xEC, /* 0x30-0x33 */
        0x90, 0xED, 0x90, 0xEE, 0x90, 0xEF, 0x90, 0xF0, /* 0x34-0x37 */
        0x90, 0xF1, 0x90, 0xF2, 0x90, 0xF3, 0x90, 0xF4, /* 0x38-0x3B */
        0x90, 0xF5, 0x90, 0xF6, 0x90, 0xF7, 0xD4, 0xB8, /* 0x3C-0x3F */
        0x90, 0xF8, 0x90, 0xF9, 0x90, 0xFA, 0x90, 0xFB, /* 0x40-0x43 */
        0x90, 0xFC, 0x90, 0xFD, 0x90, 0xFE, 0x91, 0x40, /* 0x44-0x47 */
        0xB4, 0xC8, 0x91, 0x41, 0xE3, 0xBB, 0x91, 0x42, /* 0x48-0x4B */
        0xBB, 0xC5, 0x91, 0x43, 0xC9, 0xF7, 0x91, 0x44, /* 0x4C-0x4F */
        0x91, 0x45, 0xC9, 0xE5, 0x91, 0x46, 0x91, 0x47, /* 0x50-0x53 */
        0x91, 0x48, 0xC4, 0xBD, 0x91, 0x49, 0x91, 0x4A, /* 0x54-0x57 */
        0x91, 0x4B, 0x91, 0x4C, 0x91, 0x4D, 0x91, 0x4E, /* 0x58-0x5B */
        0x91, 0x4F, 0xED, 0xAB, 0x91, 0x50, 0x91, 0x51, /* 0x5C-0x5F */
        0x91, 0x52, 0x91, 0x53, 0xC2, 0xFD, 0x91, 0x54, /* 0x60-0x63 */
        0x91, 0x55, 0x91, 0x56, 0x91, 0x57, 0xBB, 0xDB, /* 0x64-0x67 */
        0xBF, 0xAE, 0x91, 0x58, 0x91, 0x59, 0x91, 0x5A, /* 0x68-0x6B */
        0x91, 0x5B, 0x91, 0x5C, 0x91, 0x5D, 0x91, 0x5E, /* 0x6C-0x6F */
        0xCE, 0xBF, 0x91, 0x5F, 0x91, 0x60, 0x91, 0x61, /* 0x70-0x73 */
        0x91, 0x62, 0xE3, 0xBC, 0x91, 0x63, 0xBF, 0xB6, /* 0x74-0x77 */
        0x91, 0x64, 0x91, 0x65, 0x91, 0x66, 0x91, 0x67, /* 0x78-0x7B */
        0x91, 0x68, 0x91, 0x69, 0x91, 0x6A, 0x91, 0x6B, /* 0x7C-0x7F */
        
        0x91, 0x6C, 0x91, 0x6D, 0x91, 0x6E, 0x91, 0x6F, /* 0x80-0x83 */
        0x91, 0x70, 0x91, 0x71, 0x91, 0x72, 0x91, 0x73, /* 0x84-0x87 */
        0x91, 0x74, 0x91, 0x75, 0x91, 0x76, 0xB1, 0xEF, /* 0x88-0x8B */
        0x91, 0x77, 0x91, 0x78, 0xD4, 0xF7, 0x91, 0x79, /* 0x8C-0x8F */
        0x91, 0x7A, 0x91, 0x7B, 0x91, 0x7C, 0x91, 0x7D, /* 0x90-0x93 */
        0xE3, 0xBE, 0x91, 0x7E, 0x91, 0x80, 0x91, 0x81, /* 0x94-0x97 */
        0x91, 0x82, 0x91, 0x83, 0x91, 0x84, 0x91, 0x85, /* 0x98-0x9B */
        0x91, 0x86, 0xED, 0xAD, 0x91, 0x87, 0x91, 0x88, /* 0x9C-0x9F */
        0x91, 0x89, 0x91, 0x8A, 0x91, 0x8B, 0x91, 0x8C, /* 0xA0-0xA3 */
        0x91, 0x8D, 0x91, 0x8E, 0x91, 0x8F, 0xE3, 0xBF, /* 0xA4-0xA7 */
        0xBA, 0xA9, 0xED, 0xAC, 0x91, 0x90, 0x91, 0x91, /* 0xA8-0xAB */
        0xE3, 0xBD, 0x91, 0x92, 0x91, 0x93, 0x91, 0x94, /* 0xAC-0xAF */
        0x91, 0x95, 0x91, 0x96, 0x91, 0x97, 0x91, 0x98, /* 0xB0-0xB3 */
        0x91, 0x99, 0x91, 0x9A, 0x91, 0x9B, 0xE3, 0xC0, /* 0xB4-0xB7 */
        0x91, 0x9C, 0x91, 0x9D, 0x91, 0x9E, 0x91, 0x9F, /* 0xB8-0xBB */
        0x91, 0xA0, 0x91, 0xA1, 0xBA, 0xB6, 0x91, 0xA2, /* 0xBC-0xBF */
        0x91, 0xA3, 0x91, 0xA4, 0xB6, 0xAE, 0x91, 0xA5, /* 0xC0-0xC3 */
        0x91, 0xA6, 0x91, 0xA7, 0x91, 0xA8, 0x91, 0xA9, /* 0xC4-0xC7 */
        0xD0, 0xB8, 0x91, 0xAA, 0xB0, 0xC3, 0xED, 0xAE, /* 0xC8-0xCB */
        0x91, 0xAB, 0x91, 0xAC, 0x91, 0xAD, 0x91, 0xAE, /* 0xCC-0xCF */
        0x91, 0xAF, 0xED, 0xAF, 0xC0, 0xC1, 0x91, 0xB0, /* 0xD0-0xD3 */
        0xE3, 0xC1, 0x91, 0xB1, 0x91, 0xB2, 0x91, 0xB3, /* 0xD4-0xD7 */
        0x91, 0xB4, 0x91, 0xB5, 0x91, 0xB6, 0x91, 0xB7, /* 0xD8-0xDB */
        0x91, 0xB8, 0x91, 0xB9, 0x91, 0xBA, 0x91, 0xBB, /* 0xDC-0xDF */
        0x91, 0xBC, 0x91, 0xBD, 0x91, 0xBE, 0x91, 0xBF, /* 0xE0-0xE3 */
        0x91, 0xC0, 0x91, 0xC1, 0xC5, 0xB3, 0x91, 0xC2, /* 0xE4-0xE7 */
        0x91, 0xC3, 0x91, 0xC4, 0x91, 0xC5, 0x91, 0xC6, /* 0xE8-0xEB */
        0x91, 0xC7, 0x91, 0xC8, 0x91, 0xC9, 0x91, 0xCA, /* 0xEC-0xEF */
        0x91, 0xCB, 0x91, 0xCC, 0x91, 0xCD, 0x91, 0xCE, /* 0xF0-0xF3 */
        0x91, 0xCF, 0xE3, 0xC2, 0x91, 0xD0, 0x91, 0xD1, /* 0xF4-0xF7 */
        0x91, 0xD2, 0x91, 0xD3, 0x91, 0xD4, 0x91, 0xD5, /* 0xF8-0xFB */
        0x91, 0xD6, 0x91, 0xD7, 0x91, 0xD8, 0xDC, 0xB2, /* 0xFC-0xFF */
};

/*
 * u2c_62: 512-byte constant table laid out as 256 two-byte entries; entry i
 * occupies bytes [2*i] and [2*i + 1]. The trailing comment on each row lists
 * the four entry indices that row covers (0x00 through 0xFF overall).
 * NOTE(review): presumably maps Unicode code points U+6200..U+62FF to a
 * double-byte charset code, lead byte first -- confirm against the lookup
 * code that indexes this table.
 */
static const unsigned char u2c_62[512] = {
        0x91, 0xD9, 0x91, 0xDA, 0x91, 0xDB, 0x91, 0xDC, /* 0x00-0x03 */
        0x91, 0xDD, 0x91, 0xDE, 0xED, 0xB0, 0x91, 0xDF, /* 0x04-0x07 */
        0xB8, 0xEA, 0x91, 0xE0, 0xCE, 0xEC, 0xEA, 0xA7, /* 0x08-0x0B */
        0xD0, 0xE7, 0xCA, 0xF9, 0xC8, 0xD6, 0xCF, 0xB7, /* 0x0C-0x0F */
        0xB3, 0xC9, 0xCE, 0xD2, 0xBD, 0xE4, 0x91, 0xE1, /* 0x10-0x13 */
        0x91, 0xE2, 0xE3, 0xDE, 0xBB, 0xF2, 0xEA, 0xA8, /* 0x14-0x17 */
        0xD5, 0xBD, 0x91, 0xE3, 0xC6, 0xDD, 0xEA, 0xA9, /* 0x18-0x1B */
        0x91, 0xE4, 0x91, 0xE5, 0x91, 0xE6, 0xEA, 0xAA, /* 0x1C-0x1F */
        0x91, 0xE7, 0xEA, 0xAC, 0xEA, 0xAB, 0x91, 0xE8, /* 0x20-0x23 */
        0xEA, 0xAE, 0xEA, 0xAD, 0x91, 0xE9, 0x91, 0xEA, /* 0x24-0x27 */
        0x91, 0xEB, 0x91, 0xEC, 0xBD, 0xD8, 0x91, 0xED, /* 0x28-0x2B */
        0xEA, 0xAF, 0x91, 0xEE, 0xC2, 0xBE, 0x91, 0xEF, /* 0x2C-0x2F */
        0x91, 0xF0, 0x91, 0xF1, 0x91, 0xF2, 0xB4, 0xC1, /* 0x30-0x33 */
        0xB4, 0xF7, 0x91, 0xF3, 0x91, 0xF4, 0xBB, 0xA7, /* 0x34-0x37 */
        0x91, 0xF5, 0x91, 0xF6, 0x91, 0xF7, 0x91, 0xF8, /* 0x38-0x3B */
        0x91, 0xF9, 0xEC, 0xE6, 0xEC, 0xE5, 0xB7, 0xBF, /* 0x3C-0x3F */
        0xCB, 0xF9, 0xB1, 0xE2, 0x91, 0xFA, 0xEC, 0xE7, /* 0x40-0x43 */
        0x91, 0xFB, 0x91, 0xFC, 0x91, 0xFD, 0xC9, 0xC8, /* 0x44-0x47 */
        0xEC, 0xE8, 0xEC, 0xE9, 0x91, 0xFE, 0xCA, 0xD6, /* 0x48-0x4B */
        0xDE, 0xD0, 0xB2, 0xC5, 0xD4, 0xFA, 0x92, 0x40, /* 0x4C-0x4F */
        0x92, 0x41, 0xC6, 0xCB, 0xB0, 0xC7, 0xB4, 0xF2, /* 0x50-0x53 */
        0xC8, 0xD3, 0x92, 0x42, 0x92, 0x43, 0x92, 0x44, /* 0x54-0x57 */
        0xCD, 0xD0, 0x92, 0x45, 0x92, 0x46, 0xBF, 0xB8, /* 0x58-0x5B */
        0x92, 0x47, 0x92, 0x48, 0x92, 0x49, 0x92, 0x4A, /* 0x5C-0x5F */
        0x92, 0x4B, 0x92, 0x4C, 0x92, 0x4D, 0xBF, 0xDB, /* 0x60-0x63 */
        0x92, 0x4E, 0x92, 0x4F, 0xC7, 0xA4, 0xD6, 0xB4, /* 0x64-0x67 */
        0x92, 0x50, 0xC0, 0xA9, 0xDE, 0xD1, 0xC9, 0xA8, /* 0x68-0x6B */
        0xD1, 0xEF, 0xC5, 0xA4, 0xB0, 0xE7, 0xB3, 0xB6, /* 0x6C-0x6F */
        0xC8, 0xC5, 0x92, 0x51, 0x92, 0x52, 0xB0, 0xE2, /* 0x70-0x73 */
        0x92, 0x53, 0x92, 0x54, 0xB7, 0xF6, 0x92, 0x55, /* 0x74-0x77 */
        0x92, 0x56, 0xC5, 0xFA, 0x92, 0x57, 0x92, 0x58, /* 0x78-0x7B */
        0xB6, 0xF3, 0x92, 0x59, 0xD5, 0xD2, 0xB3, 0xD0, /* 0x7C-0x7F */
        
        0xBC, 0xBC, 0x92, 0x5A, 0x92, 0x5B, 0x92, 0x5C, /* 0x80-0x83 */
        0xB3, 0xAD, 0x92, 0x5D, 0x92, 0x5E, 0x92, 0x5F, /* 0x84-0x87 */
        0x92, 0x60, 0xBE, 0xF1, 0xB0, 0xD1, 0x92, 0x61, /* 0x88-0x8B */
        0x92, 0x62, 0x92, 0x63, 0x92, 0x64, 0x92, 0x65, /* 0x8C-0x8F */
        0x92, 0x66, 0xD2, 0xD6, 0xCA, 0xE3, 0xD7, 0xA5, /* 0x90-0x93 */
        0x92, 0x67, 0xCD, 0xB6, 0xB6, 0xB6, 0xBF, 0xB9, /* 0x94-0x97 */
        0xD5, 0xDB, 0x92, 0x68, 0xB8, 0xA7, 0xC5, 0xD7, /* 0x98-0x9B */
        0x92, 0x69, 0x92, 0x6A, 0x92, 0x6B, 0xDE, 0xD2, /* 0x9C-0x9F */
        0xBF, 0xD9, 0xC2, 0xD5, 0xC7, 0xC0, 0x92, 0x6C, /* 0xA0-0xA3 */
        0xBB, 0xA4, 0xB1, 0xA8, 0x92, 0x6D, 0x92, 0x6E, /* 0xA4-0xA7 */
        0xC5, 0xEA, 0x92, 0x6F, 0x92, 0x70, 0xC5, 0xFB, /* 0xA8-0xAB */
        0xCC, 0xA7, 0x92, 0x71, 0x92, 0x72, 0x92, 0x73, /* 0xAC-0xAF */
        0x92, 0x74, 0xB1, 0xA7, 0x92, 0x75, 0x92, 0x76, /* 0xB0-0xB3 */
        0x92, 0x77, 0xB5, 0xD6, 0x92, 0x78, 0x92, 0x79, /* 0xB4-0xB7 */
        0x92, 0x7A, 0xC4, 0xA8, 0x92, 0x7B, 0xDE, 0xD3, /* 0xB8-0xBB */
        0xD1, 0xBA, 0xB3, 0xE9, 0x92, 0x7C, 0xC3, 0xF2, /* 0xBC-0xBF */
        0x92, 0x7D, 0x92, 0x7E, 0xB7, 0xF7, 0x92, 0x80, /* 0xC0-0xC3 */
        0xD6, 0xF4, 0xB5, 0xA3, 0xB2, 0xF0, 0xC4, 0xB4, /* 0xC4-0xC7 */
        0xC4, 0xE9, 0xC0, 0xAD, 0xDE, 0xD4, 0x92, 0x81, /* 0xC8-0xCB */
        0xB0, 0xE8, 0xC5, 0xC4, 0xC1, 0xE0, 0x92, 0x82, /* 0xCC-0xCF */
        0xB9, 0xD5, 0x92, 0x83, 0xBE, 0xDC, 0xCD, 0xD8, /* 0xD0-0xD3 */
        0xB0, 0xCE, 0x92, 0x84, 0xCD, 0xCF, 0xDE, 0xD6, /* 0xD4-0xD7 */
        0xBE, 0xD0, 0xD7, 0xBE, 0xDE, 0xD5, 0xD5, 0xD0, /* 0xD8-0xDB */
        0xB0, 0xDD, 0x92, 0x85, 0x92, 0x86, 0xC4, 0xE2, /* 0xDC-0xDF */
        0x92, 0x87, 0x92, 0x88, 0xC2, 0xA3, 0xBC, 0xF0, /* 0xE0-0xE3 */
        0x92, 0x89, 0xD3, 0xB5, 0xC0, 0xB9, 0xC5, 0xA1, /* 0xE4-0xE7 */
        0xB2, 0xA6, 0xD4, 0xF1, 0x92, 0x8A, 0x92, 0x8B, /* 0xE8-0xEB */
        0xC0, 0xA8, 0xCA, 0xC3, 0xDE, 0xD7, 0xD5, 0xFC, /* 0xEC-0xEF */
        0x92, 0x8C, 0xB9, 0xB0, 0x92, 0x8D, 0xC8, 0xAD, /* 0xF0-0xF3 */
        0xCB, 0xA9, 0x92, 0x8E, 0xDE, 0xD9, 0xBF, 0xBD, /* 0xF4-0xF7 */
        0x92, 0x8F, 0x92, 0x90, 0x92, 0x91, 0x92, 0x92, /* 0xF8-0xFB */
        0xC6, 0xB4, 0xD7, 0xA7, 0xCA, 0xB0, 0xC4, 0xC3, /* 0xFC-0xFF */
};

/*
 * u2c_63: 512-byte constant table laid out as 256 two-byte entries; entry i
 * occupies bytes [2*i] and [2*i + 1]. The trailing comment on each row lists
 * the four entry indices that row covers (0x00 through 0xFF overall).
 * NOTE(review): presumably maps Unicode code points U+6300..U+63FF to a
 * double-byte charset code, lead byte first -- confirm against the lookup
 * code that indexes this table.
 */
static const unsigned char u2c_63[512] = {
        0x92, 0x93, 0xB3, 0xD6, 0xB9, 0xD2, 0x92, 0x94, /* 0x00-0x03 */
        0x92, 0x95, 0x92, 0x96, 0x92, 0x97, 0xD6, 0xB8, /* 0x04-0x07 */
        0xEA, 0xFC, 0xB0, 0xB4, 0x92, 0x98, 0x92, 0x99, /* 0x08-0x0B */
        0x92, 0x9A, 0x92, 0x9B, 0xBF, 0xE6, 0x92, 0x9C, /* 0x0C-0x0F */
        0x92, 0x9D, 0xCC, 0xF4, 0x92, 0x9E, 0x92, 0x9F, /* 0x10-0x13 */
        0x92, 0xA0, 0x92, 0xA1, 0xCD, 0xDA, 0x92, 0xA2, /* 0x14-0x17 */
        0x92, 0xA3, 0x92, 0xA4, 0xD6, 0xBF, 0xC2, 0xCE, /* 0x18-0x1B */
        0x92, 0xA5, 0xCE, 0xCE, 0xCC, 0xA2, 0xD0, 0xAE, /* 0x1C-0x1F */
        0xC4, 0xD3, 0xB5, 0xB2, 0xDE, 0xD8, 0xD5, 0xF5, /* 0x20-0x23 */
        0xBC, 0xB7, 0xBB, 0xD3, 0x92, 0xA6, 0x92, 0xA7, /* 0x24-0x27 */
        0xB0, 0xA4, 0x92, 0xA8, 0xC5, 0xB2, 0xB4, 0xEC, /* 0x28-0x2B */
        0x92, 0xA9, 0x92, 0xAA, 0x92, 0xAB, 0xD5, 0xF1, /* 0x2C-0x2F */
        0x92, 0xAC, 0x92, 0xAD, 0xEA, 0xFD, 0x92, 0xAE, /* 0x30-0x33 */
        0x92, 0xAF, 0x92, 0xB0, 0x92, 0xB1, 0x92, 0xB2, /* 0x34-0x37 */
        0x92, 0xB3, 0xDE, 0xDA, 0xCD, 0xA6, 0x92, 0xB4, /* 0x38-0x3B */
        0x92, 0xB5, 0xCD, 0xEC, 0x92, 0xB6, 0x92, 0xB7, /* 0x3C-0x3F */
        0x92, 0xB8, 0x92, 0xB9, 0xCE, 0xE6, 0xDE, 0xDC, /* 0x40-0x43 */
        0x92, 0xBA, 0xCD, 0xB1, 0xC0, 0xA6, 0x92, 0xBB, /* 0x44-0x47 */
        0x92, 0xBC, 0xD7, 0xBD, 0x92, 0xBD, 0xDE, 0xDB, /* 0x48-0x4B */
        0xB0, 0xC6, 0xBA, 0xB4, 0xC9, 0xD3, 0xC4, 0xF3, /* 0x4C-0x4F */
        0xBE, 0xE8, 0x92, 0xBE, 0x92, 0xBF, 0x92, 0xC0, /* 0x50-0x53 */
        0x92, 0xC1, 0xB2, 0xB6, 0x92, 0xC2, 0x92, 0xC3, /* 0x54-0x57 */
        0x92, 0xC4, 0x92, 0xC5, 0x92, 0xC6, 0x92, 0xC7, /* 0x58-0x5B */
        0x92, 0xC8, 0x92, 0xC9, 0xC0, 0xCC, 0xCB, 0xF0, /* 0x5C-0x5F */
        0x92, 0xCA, 0xBC, 0xF1, 0xBB, 0xBB, 0xB5, 0xB7, /* 0x60-0x63 */
        0x92, 0xCB, 0x92, 0xCC, 0x92, 0xCD, 0xC5, 0xF5, /* 0x64-0x67 */
        0x92, 0xCE, 0xDE, 0xE6, 0x92, 0xCF, 0x92, 0xD0, /* 0x68-0x6B */
        0x92, 0xD1, 0xDE, 0xE3, 0xBE, 0xDD, 0x92, 0xD2, /* 0x6C-0x6F */
        0x92, 0xD3, 0xDE, 0xDF, 0x92, 0xD4, 0x92, 0xD5, /* 0x70-0x73 */
        0x92, 0xD6, 0x92, 0xD7, 0xB4, 0xB7, 0xBD, 0xDD, /* 0x74-0x77 */
        0x92, 0xD8, 0x92, 0xD9, 0xDE, 0xE0, 0xC4, 0xED, /* 0x78-0x7B */
        0x92, 0xDA, 0x92, 0xDB, 0x92, 0xDC, 0x92, 0xDD, /* 0x7C-0x7F */
        
        0xCF, 0xC6, 0x92, 0xDE, 0xB5, 0xE0, 0x92, 0xDF, /* 0x80-0x83 */
        0x92, 0xE0, 0x92, 0xE1, 0x92, 0xE2, 0xB6, 0xDE, /* 0x84-0x87 */
        0xCA, 0xDA, 0xB5, 0xF4, 0xDE, 0xE5, 0x92, 0xE3, /* 0x88-0x8B */
        0xD5, 0xC6, 0x92, 0xE4, 0xDE, 0xE1, 0xCC, 0xCD, /* 0x8C-0x8F */
        0xC6, 0xFE, 0x92, 0xE5, 0xC5, 0xC5, 0x92, 0xE6, /* 0x90-0x93 */
        0x92, 0xE7, 0x92, 0xE8, 0xD2, 0xB4, 0x92, 0xE9, /* 0x94-0x97 */
        0xBE, 0xF2, 0x92, 0xEA, 0x92, 0xEB, 0x92, 0xEC, /* 0x98-0x9B */
        0x92, 0xED, 0x92, 0xEE, 0x92, 0xEF, 0x92, 0xF0, /* 0x9C-0x9F */
        0xC2, 0xD3, 0x92, 0xF1, 0xCC, 0xBD, 0xB3, 0xB8, /* 0xA0-0xA3 */
        0x92, 0xF2, 0xBD, 0xD3, 0x92, 0xF3, 0xBF, 0xD8, /* 0xA4-0xA7 */
        0xCD, 0xC6, 0xD1, 0xDA, 0xB4, 0xEB, 0x92, 0xF4, /* 0xA8-0xAB */
        0xDE, 0xE4, 0xDE, 0xDD, 0xDE, 0xE7, 0x92, 0xF5, /* 0xAC-0xAF */
        0xEA, 0xFE, 0x92, 0xF6, 0x92, 0xF7, 0xC2, 0xB0, /* 0xB0-0xB3 */
        0xDE, 0xE2, 0x92, 0xF8, 0x92, 0xF9, 0xD6, 0xC0, /* 0xB4-0xB7 */
        0xB5, 0xA7, 0x92, 0xFA, 0xB2, 0xF4, 0x92, 0xFB, /* 0xB8-0xBB */
        0xDE, 0xE8, 0x92, 0xFC, 0xDE, 0xF2, 0x92, 0xFD, /* 0xBC-0xBF */
        0x92, 0xFE, 0x93, 0x40, 0x93, 0x41, 0x93, 0x42, /* 0xC0-0xC3 */
        0xDE, 0xED, 0x93, 0x43, 0xDE, 0xF1, 0x93, 0x44, /* 0xC4-0xC7 */
        0x93, 0x45, 0xC8, 0xE0, 0x93, 0x46, 0x93, 0x47, /* 0xC8-0xCB */
        0x93, 0x48, 0xD7, 0xE1, 0xDE, 0xEF, 0xC3, 0xE8, /* 0xCC-0xCF */
        0xCC, 0xE1, 0x93, 0x49, 0xB2, 0xE5, 0x93, 0x4A, /* 0xD0-0xD3 */
        0x93, 0x4B, 0x93, 0x4C, 0xD2, 0xBE, 0x93, 0x4D, /* 0xD4-0xD7 */
        0x93, 0x4E, 0x93, 0x4F, 0x93, 0x50, 0x93, 0x51, /* 0xD8-0xDB */
        0x93, 0x52, 0x93, 0x53, 0xDE, 0xEE, 0x93, 0x54, /* 0xDC-0xDF */
        0xDE, 0xEB, 0xCE, 0xD5, 0x93, 0x55, 0xB4, 0xA7, /* 0xE0-0xE3 */
        0x93, 0x56, 0x93, 0x57, 0x93, 0x58, 0x93, 0x59, /* 0xE4-0xE7 */
        0x93, 0x5A, 0xBF, 0xAB, 0xBE, 0xBE, 0x93, 0x5B, /* 0xE8-0xEB */
        0x93, 0x5C, 0xBD, 0xD2, 0x93, 0x5D, 0x93, 0x5E, /* 0xEC-0xEF */
        0x93, 0x5F, 0x93, 0x60, 0xDE, 0xE9, 0x93, 0x61, /* 0xF0-0xF3 */
        0xD4, 0xAE, 0x93, 0x62, 0xDE, 0xDE, 0x93, 0x63, /* 0xF4-0xF7 */
        0xDE, 0xEA, 0x93, 0x64, 0x93, 0x65, 0x93, 0x66, /* 0xF8-0xFB */
        0x93, 0x67, 0xC0, 0xBF, 0x93, 0x68, 0xDE, 0xEC, /* 0xFC-0xFF */
};

/*
 * u2c_64: 512-byte constant table laid out as 256 two-byte entries; entry i
 * occupies bytes [2*i] and [2*i + 1]. The trailing comment on each row lists
 * the four entry indices that row covers (0x00 through 0xFF overall).
 * NOTE(review): presumably maps Unicode code points U+6400..U+64FF to a
 * double-byte charset code, lead byte first -- confirm against the lookup
 * code that indexes this table.
 */
static const unsigned char u2c_64[512] = {
        0xB2, 0xF3, 0xB8, 0xE9, 0xC2, 0xA7, 0x93, 0x69, /* 0x00-0x03 */
        0x93, 0x6A, 0xBD, 0xC1, 0x93, 0x6B, 0x93, 0x6C, /* 0x04-0x07 */
        0x93, 0x6D, 0x93, 0x6E, 0x93, 0x6F, 0xDE, 0xF5, /* 0x08-0x0B */
        0xDE, 0xF8, 0x93, 0x70, 0x93, 0x71, 0xB2, 0xAB, /* 0x0C-0x0F */
        0xB4, 0xA4, 0x93, 0x72, 0x93, 0x73, 0xB4, 0xEA, /* 0x10-0x13 */
        0xC9, 0xA6, 0x93, 0x74, 0x93, 0x75, 0x93, 0x76, /* 0x14-0x17 */
        0x93, 0x77, 0x93, 0x78, 0x93, 0x79, 0xDE, 0xF6, /* 0x18-0x1B */
        0xCB, 0xD1, 0x93, 0x7A, 0xB8, 0xE3, 0x93, 0x7B, /* 0x1C-0x1F */
        0xDE, 0xF7, 0xDE, 0xFA, 0x93, 0x7C, 0x93, 0x7D, /* 0x20-0x23 */
        0x93, 0x7E, 0x93, 0x80, 0xDE, 0xF9, 0x93, 0x81, /* 0x24-0x27 */
        0x93, 0x82, 0x93, 0x83, 0xCC, 0xC2, 0x93, 0x84, /* 0x28-0x2B */
        0xB0, 0xE1, 0xB4, 0xEE, 0x93, 0x85, 0x93, 0x86, /* 0x2C-0x2F */
        0x93, 0x87, 0x93, 0x88, 0x93, 0x89, 0x93, 0x8A, /* 0x30-0x33 */
        0xE5, 0xBA, 0x93, 0x8B, 0x93, 0x8C, 0x93, 0x8D, /* 0x34-0x37 */
        0x93, 0x8E, 0x93, 0x8F, 0xD0, 0xAF, 0x93, 0x90, /* 0x38-0x3B */
        0x93, 0x91, 0xB2, 0xEB, 0x93, 0x92, 0xEB, 0xA1, /* 0x3C-0x3F */
        0x93, 0x93, 0xDE, 0xF4, 0x93, 0x94, 0x93, 0x95, /* 0x40-0x43 */
        0xC9, 0xE3, 0xDE, 0xF3, 0xB0, 0xDA, 0xD2, 0xA1, /* 0x44-0x47 */
        0xB1, 0xF7, 0x93, 0x96, 0xCC, 0xAF, 0x93, 0x97, /* 0x48-0x4B */
        0x93, 0x98, 0x93, 0x99, 0x93, 0x9A, 0x93, 0x9B, /* 0x4C-0x4F */
        0x93, 0x9C, 0x93, 0x9D, 0xDE, 0xF0, 0x93, 0x9E, /* 0x50-0x53 */
        0xCB, 0xA4, 0x93, 0x9F, 0x93, 0xA0, 0x93, 0xA1, /* 0x54-0x57 */
        0xD5, 0xAA, 0x93, 0xA2, 0x93, 0xA3, 0x93, 0xA4, /* 0x58-0x5B */
        0x93, 0xA5, 0x93, 0xA6, 0xDE, 0xFB, 0x93, 0xA7, /* 0x5C-0x5F */
        0x93, 0xA8, 0x93, 0xA9, 0x93, 0xAA, 0x93, 0xAB, /* 0x60-0x63 */
        0x93, 0xAC, 0x93, 0xAD, 0x93, 0xAE, 0xB4, 0xDD, /* 0x64-0x67 */
        0x93, 0xAF, 0xC4, 0xA6, 0x93, 0xB0, 0x93, 0xB1, /* 0x68-0x6B */
        0x93, 0xB2, 0xDE, 0xFD, 0x93, 0xB3, 0x93, 0xB4, /* 0x6C-0x6F */
        0x93, 0xB5, 0x93, 0xB6, 0x93, 0xB7, 0x93, 0xB8, /* 0x70-0x73 */
        0x93, 0xB9, 0x93, 0xBA, 0x93, 0xBB, 0x93, 0xBC, /* 0x74-0x77 */
        0xC3, 0xFE, 0xC4, 0xA1, 0xDF, 0xA1, 0x93, 0xBD, /* 0x78-0x7B */
        0x93, 0xBE, 0x93, 0xBF, 0x93, 0xC0, 0x93, 0xC1, /* 0x7C-0x7F */
        
        0x93, 0xC2, 0x93, 0xC3, 0xC1, 0xCC, 0x93, 0xC4, /* 0x80-0x83 */
        0xDE, 0xFC, 0xBE, 0xEF, 0x93, 0xC5, 0xC6, 0xB2, /* 0x84-0x87 */
        0x93, 0xC6, 0x93, 0xC7, 0x93, 0xC8, 0x93, 0xC9, /* 0x88-0x8B */
        0x93, 0xCA, 0x93, 0xCB, 0x93, 0xCC, 0x93, 0xCD, /* 0x8C-0x8F */
        0x93, 0xCE, 0xB3, 0xC5, 0xC8, 0xF6, 0x93, 0xCF, /* 0x90-0x93 */
        0x93, 0xD0, 0xCB, 0xBA, 0xDE, 0xFE, 0x93, 0xD1, /* 0x94-0x97 */
        0x93, 0xD2, 0xDF, 0xA4, 0x93, 0xD3, 0x93, 0xD4, /* 0x98-0x9B */
        0x93, 0xD5, 0x93, 0xD6, 0xD7, 0xB2, 0x93, 0xD7, /* 0x9C-0x9F */
        0x93, 0xD8, 0x93, 0xD9, 0x93, 0xDA, 0x93, 0xDB, /* 0xA0-0xA3 */
        0xB3, 0xB7, 0x93, 0xDC, 0x93, 0xDD, 0x93, 0xDE, /* 0xA4-0xA7 */
        0x93, 0xDF, 0xC1, 0xC3, 0x93, 0xE0, 0x93, 0xE1, /* 0xA8-0xAB */
        0xC7, 0xCB, 0xB2, 0xA5, 0xB4, 0xE9, 0x93, 0xE2, /* 0xAC-0xAF */
        0xD7, 0xAB, 0x93, 0xE3, 0x93, 0xE4, 0x93, 0xE5, /* 0xB0-0xB3 */
        0x93, 0xE6, 0xC4, 0xEC, 0x93, 0xE7, 0xDF, 0xA2, /* 0xB4-0xB7 */
        0xDF, 0xA3, 0x93, 0xE8, 0xDF, 0xA5, 0x93, 0xE9, /* 0xB8-0xBB */
        0xBA, 0xB3, 0x93, 0xEA, 0x93, 0xEB, 0x93, 0xEC, /* 0xBC-0xBF */
        0xDF, 0xA6, 0x93, 0xED, 0xC0, 0xDE, 0x93, 0xEE, /* 0xC0-0xC3 */
        0x93, 0xEF, 0xC9, 0xC3, 0x93, 0xF0, 0x93, 0xF1, /* 0xC4-0xC7 */
        0x93, 0xF2, 0x93, 0xF3, 0x93, 0xF4, 0x93, 0xF5, /* 0xC8-0xCB */
        0x93, 0xF6, 0xB2, 0xD9, 0xC7, 0xE6, 0x93, 0xF7, /* 0xCC-0xCF */
        0xDF, 0xA7, 0x93, 0xF8, 0xC7, 0xDC, 0x93, 0xF9, /* 0xD0-0xD3 */
        0x93, 0xFA, 0x93, 0xFB, 0x93, 0xFC, 0xDF, 0xA8, /* 0xD4-0xD7 */
        0xEB, 0xA2, 0x93, 0xFD, 0x93, 0xFE, 0x94, 0x40, /* 0xD8-0xDB */
        0x94, 0x41, 0x94, 0x42, 0xCB, 0xD3, 0x94, 0x43, /* 0xDC-0xDF */
        0x94, 0x44, 0x94, 0x45, 0xDF, 0xAA, 0x94, 0x46, /* 0xE0-0xE3 */
        0xDF, 0xA9, 0x94, 0x47, 0xB2, 0xC1, 0x94, 0x48, /* 0xE4-0xE7 */
        0x94, 0x49, 0x94, 0x4A, 0x94, 0x4B, 0x94, 0x4C, /* 0xE8-0xEB */
        0x94, 0x4D, 0x94, 0x4E, 0x94, 0x4F, 0x94, 0x50, /* 0xEC-0xEF */
        0x94, 0x51, 0x94, 0x52, 0x94, 0x53, 0x94, 0x54, /* 0xF0-0xF3 */
        0x94, 0x55, 0x94, 0x56, 0x94, 0x57, 0x94, 0x58, /* 0xF4-0xF7 */
        0x94, 0x59, 0x94, 0x5A, 0x94, 0x5B, 0x94, 0x5C, /* 0xF8-0xFB */
        0x94, 0x5D, 0x94, 0x5E, 0x94, 0x5F, 0x94, 0x60, /* 0xFC-0xFF */
};

/*
 * u2c_65: 512-byte constant table laid out as 256 two-byte entries; entry i
 * occupies bytes [2*i] and [2*i + 1]. The trailing comment on each row lists
 * the four entry indices that row covers (0x00 through 0xFF overall).
 * NOTE(review): presumably maps Unicode code points U+6500..U+65FF to a
 * double-byte charset code, lead byte first -- confirm against the lookup
 * code that indexes this table.
 */
static const unsigned char u2c_65[512] = {
        0xC5, 0xCA, 0x94, 0x61, 0x94, 0x62, 0x94, 0x63, /* 0x00-0x03 */
        0x94, 0x64, 0x94, 0x65, 0x94, 0x66, 0x94, 0x67, /* 0x04-0x07 */
        0x94, 0x68, 0xDF, 0xAB, 0x94, 0x69, 0x94, 0x6A, /* 0x08-0x0B */
        0x94, 0x6B, 0x94, 0x6C, 0x94, 0x6D, 0x94, 0x6E, /* 0x0C-0x0F */
        0x94, 0x6F, 0x94, 0x70, 0xD4, 0xDC, 0x94, 0x71, /* 0x10-0x13 */
        0x94, 0x72, 0x94, 0x73, 0x94, 0x74, 0x94, 0x75, /* 0x14-0x17 */
        0xC8, 0xC1, 0x94, 0x76, 0x94, 0x77, 0x94, 0x78, /* 0x18-0x1B */
        0x94, 0x79, 0x94, 0x7A, 0x94, 0x7B, 0x94, 0x7C, /* 0x1C-0x1F */
        0x94, 0x7D, 0x94, 0x7E, 0x94, 0x80, 0x94, 0x81, /* 0x20-0x23 */
        0x94, 0x82, 0xDF, 0xAC, 0x94, 0x83, 0x94, 0x84, /* 0x24-0x27 */
        0x94, 0x85, 0x94, 0x86, 0x94, 0x87, 0xBE, 0xF0, /* 0x28-0x2B */
        0x94, 0x88, 0x94, 0x89, 0xDF, 0xAD, 0xD6, 0xA7, /* 0x2C-0x2F */
        0x94, 0x8A, 0x94, 0x8B, 0x94, 0x8C, 0x94, 0x8D, /* 0x30-0x33 */
        0xEA, 0xB7, 0xEB, 0xB6, 0xCA, 0xD5, 0x94, 0x8E, /* 0x34-0x37 */
        0xD8, 0xFC, 0xB8, 0xC4, 0x94, 0x8F, 0xB9, 0xA5, /* 0x38-0x3B */
        0x94, 0x90, 0x94, 0x91, 0xB7, 0xC5, 0xD5, 0xFE, /* 0x3C-0x3F */
        0x94, 0x92, 0x94, 0x93, 0x94, 0x94, 0x94, 0x95, /* 0x40-0x43 */
        0x94, 0x96, 0xB9, 0xCA, 0x94, 0x97, 0x94, 0x98, /* 0x44-0x47 */
        0xD0, 0xA7, 0xF4, 0xCD, 0x94, 0x99, 0x94, 0x9A, /* 0x48-0x4B */
        0xB5, 0xD0, 0x94, 0x9B, 0x94, 0x9C, 0xC3, 0xF4, /* 0x4C-0x4F */
        0x94, 0x9D, 0xBE, 0xC8, 0x94, 0x9E, 0x94, 0x9F, /* 0x50-0x53 */
        0x94, 0xA0, 0xEB, 0xB7, 0xB0, 0xBD, 0x94, 0xA1, /* 0x54-0x57 */
        0x94, 0xA2, 0xBD, 0xCC, 0x94, 0xA3, 0xC1, 0xB2, /* 0x58-0x5B */
        0x94, 0xA4, 0xB1, 0xD6, 0xB3, 0xA8, 0x94, 0xA5, /* 0x5C-0x5F */
        0x94, 0xA6, 0x94, 0xA7, 0xB8, 0xD2, 0xC9, 0xA2, /* 0x60-0x63 */
        0x94, 0xA8, 0x94, 0xA9, 0xB6, 0xD8, 0x94, 0xAA, /* 0x64-0x67 */
        0x94, 0xAB, 0x94, 0xAC, 0x94, 0xAD, 0xEB, 0xB8, /* 0x68-0x6B */
        0xBE, 0xB4, 0x94, 0xAE, 0x94, 0xAF, 0x94, 0xB0, /* 0x6C-0x6F */
        0xCA, 0xFD, 0x94, 0xB1, 0xC7, 0xC3, 0x94, 0xB2, /* 0x70-0x73 */
        0xD5, 0xFB, 0x94, 0xB3, 0x94, 0xB4, 0xB7, 0xF3, /* 0x74-0x77 */
        0x94, 0xB5, 0x94, 0xB6, 0x94, 0xB7, 0x94, 0xB8, /* 0x78-0x7B */
        0x94, 0xB9, 0x94, 0xBA, 0x94, 0xBB, 0x94, 0xBC, /* 0x7C-0x7F */
        
        0x94, 0xBD, 0x94, 0xBE, 0x94, 0xBF, 0x94, 0xC0, /* 0x80-0x83 */
        0x94, 0xC1, 0x94, 0xC2, 0x94, 0xC3, 0xCE, 0xC4, /* 0x84-0x87 */
        0x94, 0xC4, 0x94, 0xC5, 0x94, 0xC6, 0xD5, 0xAB, /* 0x88-0x8B */
        0xB1, 0xF3, 0x94, 0xC7, 0x94, 0xC8, 0x94, 0xC9, /* 0x8C-0x8F */
        0xEC, 0xB3, 0xB0, 0xDF, 0x94, 0xCA, 0xEC, 0xB5, /* 0x90-0x93 */
        0x94, 0xCB, 0x94, 0xCC, 0x94, 0xCD, 0xB6, 0xB7, /* 0x94-0x97 */
        0x94, 0xCE, 0xC1, 0xCF, 0x94, 0xCF, 0xF5, 0xFA, /* 0x98-0x9B */
        0xD0, 0xB1, 0x94, 0xD0, 0x94, 0xD1, 0xD5, 0xE5, /* 0x9C-0x9F */
        0x94, 0xD2, 0xCE, 0xD3, 0x94, 0xD3, 0x94, 0xD4, /* 0xA0-0xA3 */
        0xBD, 0xEF, 0xB3, 0xE2, 0x94, 0xD5, 0xB8, 0xAB, /* 0xA4-0xA7 */
        0x94, 0xD6, 0xD5, 0xB6, 0x94, 0xD7, 0xED, 0xBD, /* 0xA8-0xAB */
        0x94, 0xD8, 0xB6, 0xCF, 0x94, 0xD9, 0xCB, 0xB9, /* 0xAC-0xAF */
        0xD0, 0xC2, 0x94, 0xDA, 0x94, 0xDB, 0x94, 0xDC, /* 0xB0-0xB3 */
        0x94, 0xDD, 0x94, 0xDE, 0x94, 0xDF, 0x94, 0xE0, /* 0xB4-0xB7 */
        0x94, 0xE1, 0xB7, 0xBD, 0x94, 0xE2, 0x94, 0xE3, /* 0xB8-0xBB */
        0xEC, 0xB6, 0xCA, 0xA9, 0x94, 0xE4, 0x94, 0xE5, /* 0xBC-0xBF */
        0x94, 0xE6, 0xC5, 0xD4, 0x94, 0xE7, 0xEC, 0xB9, /* 0xC0-0xC3 */
        0xEC, 0xB8, 0xC2, 0xC3, 0xEC, 0xB7, 0x94, 0xE8, /* 0xC4-0xC7 */
        0x94, 0xE9, 0x94, 0xEA, 0x94, 0xEB, 0xD0, 0xFD, /* 0xC8-0xCB */
        0xEC, 0xBA, 0x94, 0xEC, 0xEC, 0xBB, 0xD7, 0xE5, /* 0xCC-0xCF */
        0x94, 0xED, 0x94, 0xEE, 0xEC, 0xBC, 0x94, 0xEF, /* 0xD0-0xD3 */
        0x94, 0xF0, 0x94, 0xF1, 0xEC, 0xBD, 0xC6, 0xEC, /* 0xD4-0xD7 */
        0x94, 0xF2, 0x94, 0xF3, 0x94, 0xF4, 0x94, 0xF5, /* 0xD8-0xDB */
        0x94, 0xF6, 0x94, 0xF7, 0x94, 0xF8, 0x94, 0xF9, /* 0xDC-0xDF */
        0xCE, 0xDE, 0x94, 0xFA, 0xBC, 0xC8, 0x94, 0xFB, /* 0xE0-0xE3 */
        0x94, 0xFC, 0xC8, 0xD5, 0xB5, 0xA9, 0xBE, 0xC9, /* 0xE4-0xE7 */
        0xD6, 0xBC, 0xD4, 0xE7, 0x94, 0xFD, 0x94, 0xFE, /* 0xE8-0xEB */
        0xD1, 0xAE, 0xD0, 0xF1, 0xEA, 0xB8, 0xEA, 0xB9, /* 0xEC-0xEF */
        0xEA, 0xBA, 0xBA, 0xB5, 0x95, 0x40, 0x95, 0x41, /* 0xF0-0xF3 */
        0x95, 0x42, 0x95, 0x43, 0xCA, 0xB1, 0xBF, 0xF5, /* 0xF4-0xF7 */
        0x95, 0x44, 0x95, 0x45, 0xCD, 0xFA, 0x95, 0x46, /* 0xF8-0xFB */
        0x95, 0x47, 0x95, 0x48, 0x95, 0x49, 0x95, 0x4A, /* 0xFC-0xFF */
};

static const unsigned char u2c_66[512] = {
        0xEA, 0xC0, 0x95, 0x4B, 0xB0, 0xBA, 0xEA, 0xBE, /* 0x00-0x03 */
        0x95, 0x4C, 0x95, 0x4D, 0xC0, 0xA5, 0x95, 0x4E, /* 0x04-0x07 */
        0x95, 0x4F, 0x95, 0x50, 0xEA, 0xBB, 0x95, 0x51, /* 0x08-0x0B */
        0xB2, 0xFD, 0x95, 0x52, 0xC3, 0xF7, 0xBB, 0xE8, /* 0x0C-0x0F */
        0x95, 0x53, 0x95, 0x54, 0x95, 0x55, 0xD2, 0xD7, /* 0x10-0x13 */
        0xCE, 0xF4, 0xEA, 0xBF, 0x95, 0x56, 0x95, 0x57, /* 0x14-0x17 */
        0x95, 0x58, 0xEA, 0xBC, 0x95, 0x59, 0x95, 0x5A, /* 0x18-0x1B */
        0x95, 0x5B, 0xEA, 0xC3, 0x95, 0x5C, 0xD0, 0xC7, /* 0x1C-0x1F */
        0xD3, 0xB3, 0x95, 0x5D, 0x95, 0x5E, 0x95, 0x5F, /* 0x20-0x23 */
        0x95, 0x60, 0xB4, 0xBA, 0x95, 0x61, 0xC3, 0xC1, /* 0x24-0x27 */
        0xD7, 0xF2, 0x95, 0x62, 0x95, 0x63, 0x95, 0x64, /* 0x28-0x2B */
        0x95, 0x65, 0xD5, 0xD1, 0x95, 0x66, 0xCA, 0xC7, /* 0x2C-0x2F */
        0x95, 0x67, 0xEA, 0xC5, 0x95, 0x68, 0x95, 0x69, /* 0x30-0x33 */
        0xEA, 0xC4, 0xEA, 0xC7, 0xEA, 0xC6, 0x95, 0x6A, /* 0x34-0x37 */
        0x95, 0x6B, 0x95, 0x6C, 0x95, 0x6D, 0x95, 0x6E, /* 0x38-0x3B */
        0xD6, 0xE7, 0x95, 0x6F, 0xCF, 0xD4, 0x95, 0x70, /* 0x3C-0x3F */
        0x95, 0x71, 0xEA, 0xCB, 0x95, 0x72, 0xBB, 0xCE, /* 0x40-0x43 */
        0x95, 0x73, 0x95, 0x74, 0x95, 0x75, 0x95, 0x76, /* 0x44-0x47 */
        0x95, 0x77, 0x95, 0x78, 0x95, 0x79, 0xBD, 0xFA, /* 0x48-0x4B */
        0xC9, 0xCE, 0x95, 0x7A, 0x95, 0x7B, 0xEA, 0xCC, /* 0x4C-0x4F */
        0x95, 0x7C, 0x95, 0x7D, 0xC9, 0xB9, 0xCF, 0xFE, /* 0x50-0x53 */
        0xEA, 0xCA, 0xD4, 0xCE, 0xEA, 0xCD, 0xEA, 0xCF, /* 0x54-0x57 */
        0x95, 0x7E, 0x95, 0x80, 0xCD, 0xED, 0x95, 0x81, /* 0x58-0x5B */
        0x95, 0x82, 0x95, 0x83, 0x95, 0x84, 0xEA, 0xC9, /* 0x5C-0x5F */
        0x95, 0x85, 0xEA, 0xCE, 0x95, 0x86, 0x95, 0x87, /* 0x60-0x63 */
        0xCE, 0xEE, 0x95, 0x88, 0xBB, 0xDE, 0x95, 0x89, /* 0x64-0x67 */
        0xB3, 0xBF, 0x95, 0x8A, 0x95, 0x8B, 0x95, 0x8C, /* 0x68-0x6B */
        0x95, 0x8D, 0x95, 0x8E, 0xC6, 0xD5, 0xBE, 0xB0, /* 0x6C-0x6F */
        0xCE, 0xFA, 0x95, 0x8F, 0x95, 0x90, 0x95, 0x91, /* 0x70-0x73 */
        0xC7, 0xE7, 0x95, 0x92, 0xBE, 0xA7, 0xEA, 0xD0, /* 0x74-0x77 */
        0x95, 0x93, 0x95, 0x94, 0xD6, 0xC7, 0x95, 0x95, /* 0x78-0x7B */
        0x95, 0x96, 0x95, 0x97, 0xC1, 0xC0, 0x95, 0x98, /* 0x7C-0x7F */
        
        0x95, 0x99, 0x95, 0x9A, 0xD4, 0xDD, 0x95, 0x9B, /* 0x80-0x83 */
        0xEA, 0xD1, 0x95, 0x9C, 0x95, 0x9D, 0xCF, 0xBE, /* 0x84-0x87 */
        0x95, 0x9E, 0x95, 0x9F, 0x95, 0xA0, 0x95, 0xA1, /* 0x88-0x8B */
        0xEA, 0xD2, 0x95, 0xA2, 0x95, 0xA3, 0x95, 0xA4, /* 0x8C-0x8F */
        0x95, 0xA5, 0xCA, 0xEE, 0x95, 0xA6, 0x95, 0xA7, /* 0x90-0x93 */
        0x95, 0xA8, 0x95, 0xA9, 0xC5, 0xAF, 0xB0, 0xB5, /* 0x94-0x97 */
        0x95, 0xAA, 0x95, 0xAB, 0x95, 0xAC, 0x95, 0xAD, /* 0x98-0x9B */
        0x95, 0xAE, 0xEA, 0xD4, 0x95, 0xAF, 0x95, 0xB0, /* 0x9C-0x9F */
        0x95, 0xB1, 0x95, 0xB2, 0x95, 0xB3, 0x95, 0xB4, /* 0xA0-0xA3 */
        0x95, 0xB5, 0x95, 0xB6, 0x95, 0xB7, 0xEA, 0xD3, /* 0xA4-0xA7 */
        0xF4, 0xDF, 0x95, 0xB8, 0x95, 0xB9, 0x95, 0xBA, /* 0xA8-0xAB */
        0x95, 0xBB, 0x95, 0xBC, 0xC4, 0xBA, 0x95, 0xBD, /* 0xAC-0xAF */
        0x95, 0xBE, 0x95, 0xBF, 0x95, 0xC0, 0x95, 0xC1, /* 0xB0-0xB3 */
        0xB1, 0xA9, 0x95, 0xC2, 0x95, 0xC3, 0x95, 0xC4, /* 0xB4-0xB7 */
        0x95, 0xC5, 0xE5, 0xDF, 0x95, 0xC6, 0x95, 0xC7, /* 0xB8-0xBB */
        0x95, 0xC8, 0x95, 0xC9, 0xEA, 0xD5, 0x95, 0xCA, /* 0xBC-0xBF */
        0x95, 0xCB, 0x95, 0xCC, 0x95, 0xCD, 0x95, 0xCE, /* 0xC0-0xC3 */
        0x95, 0xCF, 0x95, 0xD0, 0x95, 0xD1, 0x95, 0xD2, /* 0xC4-0xC7 */
        0x95, 0xD3, 0x95, 0xD4, 0x95, 0xD5, 0x95, 0xD6, /* 0xC8-0xCB */
        0x95, 0xD7, 0x95, 0xD8, 0x95, 0xD9, 0x95, 0xDA, /* 0xCC-0xCF */
        0x95, 0xDB, 0x95, 0xDC, 0x95, 0xDD, 0x95, 0xDE, /* 0xD0-0xD3 */
        0x95, 0xDF, 0x95, 0xE0, 0x95, 0xE1, 0x95, 0xE2, /* 0xD4-0xD7 */
        0x95, 0xE3, 0xCA, 0xEF, 0x95, 0xE4, 0xEA, 0xD6, /* 0xD8-0xDB */
        0xEA, 0xD7, 0xC6, 0xD8, 0x95, 0xE5, 0x95, 0xE6, /* 0xDC-0xDF */
        0x95, 0xE7, 0x95, 0xE8, 0x95, 0xE9, 0x95, 0xEA, /* 0xE0-0xE3 */
        0x95, 0xEB, 0x95, 0xEC, 0xEA, 0xD8, 0x95, 0xED, /* 0xE4-0xE7 */
        0x95, 0xEE, 0xEA, 0xD9, 0x95, 0xEF, 0x95, 0xF0, /* 0xE8-0xEB */
        0x95, 0xF1, 0x95, 0xF2, 0x95, 0xF3, 0x95, 0xF4, /* 0xEC-0xEF */
        0xD4, 0xBB, 0x95, 0xF5, 0xC7, 0xFA, 0xD2, 0xB7, /* 0xF0-0xF3 */
        0xB8, 0xFC, 0x95, 0xF6, 0x95, 0xF7, 0xEA, 0xC2, /* 0xF4-0xF7 */
        0x95, 0xF8, 0xB2, 0xDC, 0x95, 0xF9, 0x95, 0xFA, /* 0xF8-0xFB */
        0xC2, 0xFC, 0x95, 0xFB, 0xD4, 0xF8, 0xCC, 0xE6, /* 0xFC-0xFF */
};

static const unsigned char u2c_67[512] = {
        0xD7, 0xEE, 0x95, 0xFC, 0x95, 0xFD, 0x95, 0xFE, /* 0x00-0x03 */
        0x96, 0x40, 0x96, 0x41, 0x96, 0x42, 0x96, 0x43, /* 0x04-0x07 */
        0xD4, 0xC2, 0xD3, 0xD0, 0xEB, 0xC3, 0xC5, 0xF3, /* 0x08-0x0B */
        0x96, 0x44, 0xB7, 0xFE, 0x96, 0x45, 0x96, 0x46, /* 0x0C-0x0F */
        0xEB, 0xD4, 0x96, 0x47, 0x96, 0x48, 0x96, 0x49, /* 0x10-0x13 */
        0xCB, 0xB7, 0xEB, 0xDE, 0x96, 0x4A, 0xC0, 0xCA, /* 0x14-0x17 */
        0x96, 0x4B, 0x96, 0x4C, 0x96, 0x4D, 0xCD, 0xFB, /* 0x18-0x1B */
        0x96, 0x4E, 0xB3, 0xAF, 0x96, 0x4F, 0xC6, 0xDA, /* 0x1C-0x1F */
        0x96, 0x50, 0x96, 0x51, 0x96, 0x52, 0x96, 0x53, /* 0x20-0x23 */
        0x96, 0x54, 0x96, 0x55, 0xEB, 0xFC, 0x96, 0x56, /* 0x24-0x27 */
        0xC4, 0xBE, 0x96, 0x57, 0xCE, 0xB4, 0xC4, 0xA9, /* 0x28-0x2B */
        0xB1, 0xBE, 0xD4, 0xFD, 0x96, 0x58, 0xCA, 0xF5, /* 0x2C-0x2F */
        0x96, 0x59, 0xD6, 0xEC, 0x96, 0x5A, 0x96, 0x5B, /* 0x30-0x33 */
        0xC6, 0xD3, 0xB6, 0xE4, 0x96, 0x5C, 0x96, 0x5D, /* 0x34-0x37 */
        0x96, 0x5E, 0x96, 0x5F, 0xBB, 0xFA, 0x96, 0x60, /* 0x38-0x3B */
        0x96, 0x61, 0xD0, 0xE0, 0x96, 0x62, 0x96, 0x63, /* 0x3C-0x3F */
        0xC9, 0xB1, 0x96, 0x64, 0xD4, 0xD3, 0xC8, 0xA8, /* 0x40-0x43 */
        0x96, 0x65, 0x96, 0x66, 0xB8, 0xCB, 0x96, 0x67, /* 0x44-0x47 */
        0xE8, 0xBE, 0xC9, 0xBC, 0x96, 0x68, 0x96, 0x69, /* 0x48-0x4B */
        0xE8, 0xBB, 0x96, 0x6A, 0xC0, 0xEE, 0xD0, 0xD3, /* 0x4C-0x4F */
        0xB2, 0xC4, 0xB4, 0xE5, 0x96, 0x6B, 0xE8, 0xBC, /* 0x50-0x53 */
        0x96, 0x6C, 0x96, 0x6D, 0xD5, 0xC8, 0x96, 0x6E, /* 0x54-0x57 */
        0x96, 0x6F, 0x96, 0x70, 0x96, 0x71, 0x96, 0x72, /* 0x58-0x5B */
        0xB6, 0xC5, 0x96, 0x73, 0xE8, 0xBD, 0xCA, 0xF8, /* 0x5C-0x5F */
        0xB8, 0xDC, 0xCC, 0xF5, 0x96, 0x74, 0x96, 0x75, /* 0x60-0x63 */
        0x96, 0x76, 0xC0, 0xB4, 0x96, 0x77, 0x96, 0x78, /* 0x64-0x67 */
        0xD1, 0xEE, 0xE8, 0xBF, 0xE8, 0xC2, 0x96, 0x79, /* 0x68-0x6B */
        0x96, 0x7A, 0xBA, 0xBC, 0x96, 0x7B, 0xB1, 0xAD, /* 0x6C-0x6F */
        0xBD, 0xDC, 0x96, 0x7C, 0xEA, 0xBD, 0xE8, 0xC3, /* 0x70-0x73 */
        0x96, 0x7D, 0xE8, 0xC6, 0x96, 0x7E, 0xE8, 0xCB, /* 0x74-0x77 */
        0x96, 0x80, 0x96, 0x81, 0x96, 0x82, 0x96, 0x83, /* 0x78-0x7B */
        0xE8, 0xCC, 0x96, 0x84, 0xCB, 0xC9, 0xB0, 0xE5, /* 0x7C-0x7F */
        
        0x96, 0x85, 0xBC, 0xAB, 0x96, 0x86, 0x96, 0x87, /* 0x80-0x83 */
        0xB9, 0xB9, 0x96, 0x88, 0x96, 0x89, 0xE8, 0xC1, /* 0x84-0x87 */
        0x96, 0x8A, 0xCD, 0xF7, 0x96, 0x8B, 0xE8, 0xCA, /* 0x88-0x8B */
        0x96, 0x8C, 0x96, 0x8D, 0x96, 0x8E, 0x96, 0x8F, /* 0x8C-0x8F */
        0xCE, 0xF6, 0x96, 0x90, 0x96, 0x91, 0x96, 0x92, /* 0x90-0x93 */
        0x96, 0x93, 0xD5, 0xED, 0x96, 0x94, 0xC1, 0xD6, /* 0x94-0x97 */
        0xE8, 0xC4, 0x96, 0x95, 0xC3, 0xB6, 0x96, 0x96, /* 0x98-0x9B */
        0xB9, 0xFB, 0xD6, 0xA6, 0xE8, 0xC8, 0x96, 0x97, /* 0x9C-0x9F */
        0x96, 0x98, 0x96, 0x99, 0xCA, 0xE0, 0xD4, 0xE6, /* 0xA0-0xA3 */
        0x96, 0x9A, 0xE8, 0xC0, 0x96, 0x9B, 0xE8, 0xC5, /* 0xA4-0xA7 */
        0xE8, 0xC7, 0x96, 0x9C, 0xC7, 0xB9, 0xB7, 0xE3, /* 0xA8-0xAB */
        0x96, 0x9D, 0xE8, 0xC9, 0x96, 0x9E, 0xBF, 0xDD, /* 0xAC-0xAF */
        0xE8, 0xD2, 0x96, 0x9F, 0x96, 0xA0, 0xE8, 0xD7, /* 0xB0-0xB3 */
        0x96, 0xA1, 0xE8, 0xD5, 0xBC, 0xDC, 0xBC, 0xCF, /* 0xB4-0xB7 */
        0xE8, 0xDB, 0x96, 0xA2, 0x96, 0xA3, 0x96, 0xA4, /* 0xB8-0xBB */
        0x96, 0xA5, 0x96, 0xA6, 0x96, 0xA7, 0x96, 0xA8, /* 0xBC-0xBF */
        0x96, 0xA9, 0xE8, 0xDE, 0x96, 0xAA, 0xE8, 0xDA, /* 0xC0-0xC3 */
        0xB1, 0xFA, 0x96, 0xAB, 0x96, 0xAC, 0x96, 0xAD, /* 0xC4-0xC7 */
        0x96, 0xAE, 0x96, 0xAF, 0x96, 0xB0, 0x96, 0xB1, /* 0xC8-0xCB */
        0x96, 0xB2, 0x96, 0xB3, 0x96, 0xB4, 0xB0, 0xD8, /* 0xCC-0xCF */
        0xC4, 0xB3, 0xB8, 0xCC, 0xC6, 0xE2, 0xC8, 0xBE, /* 0xD0-0xD3 */
        0xC8, 0xE1, 0x96, 0xB5, 0x96, 0xB6, 0x96, 0xB7, /* 0xD4-0xD7 */
        0xE8, 0xCF, 0xE8, 0xD4, 0xE8, 0xD6, 0x96, 0xB8, /* 0xD8-0xDB */
        0xB9, 0xF1, 0xE8, 0xD8, 0xD7, 0xF5, 0x96, 0xB9, /* 0xDC-0xDF */
        0xC4, 0xFB, 0x96, 0xBA, 0xE8, 0xDC, 0x96, 0xBB, /* 0xE0-0xE3 */
        0x96, 0xBC, 0xB2, 0xE9, 0x96, 0xBD, 0x96, 0xBE, /* 0xE4-0xE7 */
        0x96, 0xBF, 0xE8, 0xD1, 0x96, 0xC0, 0x96, 0xC1, /* 0xE8-0xEB */
        0xBC, 0xED, 0x96, 0xC2, 0x96, 0xC3, 0xBF, 0xC2, /* 0xEC-0xEF */
        0xE8, 0xCD, 0xD6, 0xF9, 0x96, 0xC4, 0xC1, 0xF8, /* 0xF0-0xF3 */
        0xB2, 0xF1, 0x96, 0xC5, 0x96, 0xC6, 0x96, 0xC7, /* 0xF4-0xF7 */
        0x96, 0xC8, 0x96, 0xC9, 0x96, 0xCA, 0x96, 0xCB, /* 0xF8-0xFB */
        0x96, 0xCC, 0xE8, 0xDF, 0x96, 0xCD, 0xCA, 0xC1, /* 0xFC-0xFF */
};

static const unsigned char u2c_68[512] = {
        0xE8, 0xD9, 0x96, 0xCE, 0x96, 0xCF, 0x96, 0xD0, /* 0x00-0x03 */
        0x96, 0xD1, 0xD5, 0xA4, 0x96, 0xD2, 0xB1, 0xEA, /* 0x04-0x07 */
        0xD5, 0xBB, 0xE8, 0xCE, 0xE8, 0xD0, 0xB6, 0xB0, /* 0x08-0x0B */
        0xE8, 0xD3, 0x96, 0xD3, 0xE8, 0xDD, 0xC0, 0xB8, /* 0x0C-0x0F */
        0x96, 0xD4, 0xCA, 0xF7, 0x96, 0xD5, 0xCB, 0xA8, /* 0x10-0x13 */
        0x96, 0xD6, 0x96, 0xD7, 0xC6, 0xDC, 0xC0, 0xF5, /* 0x14-0x17 */
        0x96, 0xD8, 0x96, 0xD9, 0x96, 0xDA, 0x96, 0xDB, /* 0x18-0x1B */
        0x96, 0xDC, 0xE8, 0xE9, 0x96, 0xDD, 0x96, 0xDE, /* 0x1C-0x1F */
        0x96, 0xDF, 0xD0, 0xA3, 0x96, 0xE0, 0x96, 0xE1, /* 0x20-0x23 */
        0x96, 0xE2, 0x96, 0xE3, 0x96, 0xE4, 0x96, 0xE5, /* 0x24-0x27 */
        0x96, 0xE6, 0xE8, 0xF2, 0xD6, 0xEA, 0x96, 0xE7, /* 0x28-0x2B */
        0x96, 0xE8, 0x96, 0xE9, 0x96, 0xEA, 0x96, 0xEB, /* 0x2C-0x2F */
        0x96, 0xEC, 0x96, 0xED, 0xE8, 0xE0, 0xE8, 0xE1, /* 0x30-0x33 */
        0x96, 0xEE, 0x96, 0xEF, 0x96, 0xF0, 0xD1, 0xF9, /* 0x34-0x37 */
        0xBA, 0xCB, 0xB8, 0xF9, 0x96, 0xF1, 0x96, 0xF2, /* 0x38-0x3B */
        0xB8, 0xF1, 0xD4, 0xD4, 0xE8, 0xEF, 0x96, 0xF3, /* 0x3C-0x3F */
        0xE8, 0xEE, 0xE8, 0xEC, 0xB9, 0xF0, 0xCC, 0xD2, /* 0x40-0x43 */
        0xE8, 0xE6, 0xCE, 0xA6, 0xBF, 0xF2, 0x96, 0xF4, /* 0x44-0x47 */
        0xB0, 0xB8, 0xE8, 0xF1, 0xE8, 0xF0, 0x96, 0xF5, /* 0x48-0x4B */
        0xD7, 0xC0, 0x96, 0xF6, 0xE8, 0xE4, 0x96, 0xF7, /* 0x4C-0x4F */
        0xCD, 0xA9, 0xC9, 0xA3, 0x96, 0xF8, 0xBB, 0xB8, /* 0x50-0x53 */
        0xBD, 0xDB, 0xE8, 0xEA, 0x96, 0xF9, 0x96, 0xFA, /* 0x54-0x57 */
        0x96, 0xFB, 0x96, 0xFC, 0x96, 0xFD, 0x96, 0xFE, /* 0x58-0x5B */
        0x97, 0x40, 0x97, 0x41, 0x97, 0x42, 0x97, 0x43, /* 0x5C-0x5F */
        0xE8, 0xE2, 0xE8, 0xE3, 0xE8, 0xE5, 0xB5, 0xB5, /* 0x60-0x63 */
        0xE8, 0xE7, 0xC7, 0xC5, 0xE8, 0xEB, 0xE8, 0xED, /* 0x64-0x67 */
        0xBD, 0xB0, 0xD7, 0xAE, 0x97, 0x44, 0xE8, 0xF8, /* 0x68-0x6B */
        0x97, 0x45, 0x97, 0x46, 0x97, 0x47, 0x97, 0x48, /* 0x6C-0x6F */
        0x97, 0x49, 0x97, 0x4A, 0x97, 0x4B, 0x97, 0x4C, /* 0x70-0x73 */
        0xE8, 0xF5, 0x97, 0x4D, 0xCD, 0xB0, 0xE8, 0xF6, /* 0x74-0x77 */
        0x97, 0x4E, 0x97, 0x4F, 0x97, 0x50, 0x97, 0x51, /* 0x78-0x7B */
        0x97, 0x52, 0x97, 0x53, 0x97, 0x54, 0x97, 0x55, /* 0x7C-0x7F */
        
        0x97, 0x56, 0xC1, 0xBA, 0x97, 0x57, 0xE8, 0xE8, /* 0x80-0x83 */
        0x97, 0x58, 0xC3, 0xB7, 0xB0, 0xF0, 0x97, 0x59, /* 0x84-0x87 */
        0x97, 0x5A, 0x97, 0x5B, 0x97, 0x5C, 0x97, 0x5D, /* 0x88-0x8B */
        0x97, 0x5E, 0x97, 0x5F, 0x97, 0x60, 0xE8, 0xF4, /* 0x8C-0x8F */
        0x97, 0x61, 0x97, 0x62, 0x97, 0x63, 0xE8, 0xF7, /* 0x90-0x93 */
        0x97, 0x64, 0x97, 0x65, 0x97, 0x66, 0xB9, 0xA3, /* 0x94-0x97 */
        0x97, 0x67, 0x97, 0x68, 0x97, 0x69, 0x97, 0x6A, /* 0x98-0x9B */
        0x97, 0x6B, 0x97, 0x6C, 0x97, 0x6D, 0x97, 0x6E, /* 0x9C-0x9F */
        0x97, 0x6F, 0x97, 0x70, 0xC9, 0xD2, 0x97, 0x71, /* 0xA0-0xA3 */
        0x97, 0x72, 0x97, 0x73, 0xC3, 0xCE, 0xCE, 0xE0, /* 0xA4-0xA7 */
        0xC0, 0xE6, 0x97, 0x74, 0x97, 0x75, 0x97, 0x76, /* 0xA8-0xAB */
        0x97, 0x77, 0xCB, 0xF3, 0x97, 0x78, 0xCC, 0xDD, /* 0xAC-0xAF */
        0xD0, 0xB5, 0x97, 0x79, 0x97, 0x7A, 0xCA, 0xE1, /* 0xB0-0xB3 */
        0x97, 0x7B, 0xE8, 0xF3, 0x97, 0x7C, 0x97, 0x7D, /* 0xB4-0xB7 */
        0x97, 0x7E, 0x97, 0x80, 0x97, 0x81, 0x97, 0x82, /* 0xB8-0xBB */
        0x97, 0x83, 0x97, 0x84, 0x97, 0x85, 0x97, 0x86, /* 0xBC-0xBF */
        0xBC, 0xEC, 0x97, 0x87, 0xE8, 0xF9, 0x97, 0x88, /* 0xC0-0xC3 */
        0x97, 0x89, 0x97, 0x8A, 0x97, 0x8B, 0x97, 0x8C, /* 0xC4-0xC7 */
        0x97, 0x8D, 0xC3, 0xDE, 0x97, 0x8E, 0xC6, 0xE5, /* 0xC8-0xCB */
        0x97, 0x8F, 0xB9, 0xF7, 0x97, 0x90, 0x97, 0x91, /* 0xCC-0xCF */
        0x97, 0x92, 0x97, 0x93, 0xB0, 0xF4, 0x97, 0x94, /* 0xD0-0xD3 */
        0x97, 0x95, 0xD7, 0xD8, 0x97, 0x96, 0x97, 0x97, /* 0xD4-0xD7 */
        0xBC, 0xAC, 0x97, 0x98, 0xC5, 0xEF, 0x97, 0x99, /* 0xD8-0xDB */
        0x97, 0x9A, 0x97, 0x9B, 0x97, 0x9C, 0x97, 0x9D, /* 0xDC-0xDF */
        0xCC, 0xC4, 0x97, 0x9E, 0x97, 0x9F, 0xE9, 0xA6, /* 0xE0-0xE3 */
        0x97, 0xA0, 0x97, 0xA1, 0x97, 0xA2, 0x97, 0xA3, /* 0xE4-0xE7 */
        0x97, 0xA4, 0x97, 0xA5, 0x97, 0xA6, 0x97, 0xA7, /* 0xE8-0xEB */
        0x97, 0xA8, 0x97, 0xA9, 0xC9, 0xAD, 0x97, 0xAA, /* 0xEC-0xEF */
        0xE9, 0xA2, 0xC0, 0xE2, 0x97, 0xAB, 0x97, 0xAC, /* 0xF0-0xF3 */
        0x97, 0xAD, 0xBF, 0xC3, 0x97, 0xAE, 0x97, 0xAF, /* 0xF4-0xF7 */
        0x97, 0xB0, 0xE8, 0xFE, 0xB9, 0xD7, 0x97, 0xB1, /* 0xF8-0xFB */
        0xE8, 0xFB, 0x97, 0xB2, 0x97, 0xB3, 0x97, 0xB4, /* 0xFC-0xFF */
};

static const unsigned char u2c_69[512] = {
        0x97, 0xB5, 0xE9, 0xA4, 0x97, 0xB6, 0x97, 0xB7, /* 0x00-0x03 */
        0x97, 0xB8, 0xD2, 0xCE, 0x97, 0xB9, 0x97, 0xBA, /* 0x04-0x07 */
        0x97, 0xBB, 0x97, 0xBC, 0x97, 0xBD, 0xE9, 0xA3, /* 0x08-0x0B */
        0x97, 0xBE, 0xD6, 0xB2, 0xD7, 0xB5, 0x97, 0xBF, /* 0x0C-0x0F */
        0xE9, 0xA7, 0x97, 0xC0, 0xBD, 0xB7, 0x97, 0xC1, /* 0x10-0x13 */
        0x97, 0xC2, 0x97, 0xC3, 0x97, 0xC4, 0x97, 0xC5, /* 0x14-0x17 */
        0x97, 0xC6, 0x97, 0xC7, 0x97, 0xC8, 0x97, 0xC9, /* 0x18-0x1B */
        0x97, 0xCA, 0x97, 0xCB, 0x97, 0xCC, 0xE8, 0xFC, /* 0x1C-0x1F */
        0xE8, 0xFD, 0x97, 0xCD, 0x97, 0xCE, 0x97, 0xCF, /* 0x20-0x23 */
        0xE9, 0xA1, 0x97, 0xD0, 0x97, 0xD1, 0x97, 0xD2, /* 0x24-0x27 */
        0x97, 0xD3, 0x97, 0xD4, 0x97, 0xD5, 0x97, 0xD6, /* 0x28-0x2B */
        0x97, 0xD7, 0xCD, 0xD6, 0x97, 0xD8, 0x97, 0xD9, /* 0x2C-0x2F */
        0xD2, 0xAC, 0x97, 0xDA, 0x97, 0xDB, 0x97, 0xDC, /* 0x30-0x33 */
        0xE9, 0xB2, 0x97, 0xDD, 0x97, 0xDE, 0x97, 0xDF, /* 0x34-0x37 */
        0x97, 0xE0, 0xE9, 0xA9, 0x97, 0xE1, 0x97, 0xE2, /* 0x38-0x3B */
        0x97, 0xE3, 0xB4, 0xAA, 0x97, 0xE4, 0xB4, 0xBB, /* 0x3C-0x3F */
        0x97, 0xE5, 0x97, 0xE6, 0xE9, 0xAB, 0x97, 0xE7, /* 0x40-0x43 */
        0x97, 0xE8, 0x97, 0xE9, 0x97, 0xEA, 0x97, 0xEB, /* 0x44-0x47 */
        0x97, 0xEC, 0x97, 0xED, 0x97, 0xEE, 0x97, 0xEF, /* 0x48-0x4B */
        0x97, 0xF0, 0x97, 0xF1, 0x97, 0xF2, 0x97, 0xF3, /* 0x4C-0x4F */
        0x97, 0xF4, 0x97, 0xF5, 0x97, 0xF6, 0x97, 0xF7, /* 0x50-0x53 */
        0xD0, 0xA8, 0x97, 0xF8, 0x97, 0xF9, 0xE9, 0xA5, /* 0x54-0x57 */
        0x97, 0xFA, 0x97, 0xFB, 0xB3, 0xFE, 0x97, 0xFC, /* 0x58-0x5B */
        0x97, 0xFD, 0xE9, 0xAC, 0xC0, 0xE3, 0x97, 0xFE, /* 0x5C-0x5F */
        0xE9, 0xAA, 0x98, 0x40, 0x98, 0x41, 0xE9, 0xB9, /* 0x60-0x63 */
        0x98, 0x42, 0x98, 0x43, 0xE9, 0xB8, 0x98, 0x44, /* 0x64-0x67 */
        0x98, 0x45, 0x98, 0x46, 0x98, 0x47, 0xE9, 0xAE, /* 0x68-0x6B */
        0x98, 0x48, 0x98, 0x49, 0xE8, 0xFA, 0x98, 0x4A, /* 0x6C-0x6F */
        0x98, 0x4B, 0xE9, 0xA8, 0x98, 0x4C, 0x98, 0x4D, /* 0x70-0x73 */
        0x98, 0x4E, 0x98, 0x4F, 0x98, 0x50, 0xBF, 0xAC, /* 0x74-0x77 */
        0xE9, 0xB1, 0xE9, 0xBA, 0x98, 0x51, 0x98, 0x52, /* 0x78-0x7B */
        0xC2, 0xA5, 0x98, 0x53, 0x98, 0x54, 0x98, 0x55, /* 0x7C-0x7F */
        
        0xE9, 0xAF, 0x98, 0x56, 0xB8, 0xC5, 0x98, 0x57, /* 0x80-0x83 */
        0xE9, 0xAD, 0x98, 0x58, 0xD3, 0xDC, 0xE9, 0xB4, /* 0x84-0x87 */
        0xE9, 0xB5, 0xE9, 0xB7, 0x98, 0x59, 0x98, 0x5A, /* 0x88-0x8B */
        0x98, 0x5B, 0xE9, 0xC7, 0x98, 0x5C, 0x98, 0x5D, /* 0x8C-0x8F */
        0x98, 0x5E, 0x98, 0x5F, 0x98, 0x60, 0x98, 0x61, /* 0x90-0x93 */
        0xC0, 0xC6, 0xE9, 0xC5, 0x98, 0x62, 0x98, 0x63, /* 0x94-0x97 */
        0xE9, 0xB0, 0x98, 0x64, 0x98, 0x65, 0xE9, 0xBB, /* 0x98-0x9B */
        0xB0, 0xF1, 0x98, 0x66, 0x98, 0x67, 0x98, 0x68, /* 0x9C-0x9F */
        0x98, 0x69, 0x98, 0x6A, 0x98, 0x6B, 0x98, 0x6C, /* 0xA0-0xA3 */
        0x98, 0x6D, 0x98, 0x6E, 0x98, 0x6F, 0xE9, 0xBC, /* 0xA4-0xA7 */
        0xD5, 0xA5, 0x98, 0x70, 0x98, 0x71, 0xE9, 0xBE, /* 0xA8-0xAB */
        0x98, 0x72, 0xE9, 0xBF, 0x98, 0x73, 0x98, 0x74, /* 0xAC-0xAF */
        0x98, 0x75, 0xE9, 0xC1, 0x98, 0x76, 0x98, 0x77, /* 0xB0-0xB3 */
        0xC1, 0xF1, 0x98, 0x78, 0x98, 0x79, 0xC8, 0xB6, /* 0xB4-0xB7 */
        0x98, 0x7A, 0x98, 0x7B, 0x98, 0x7C, 0xE9, 0xBD, /* 0xB8-0xBB */
        0x98, 0x7D, 0x98, 0x7E, 0x98, 0x80, 0x98, 0x81, /* 0xBC-0xBF */
        0x98, 0x82, 0xE9, 0xC2, 0x98, 0x83, 0x98, 0x84, /* 0xC0-0xC3 */
        0x98, 0x85, 0x98, 0x86, 0x98, 0x87, 0x98, 0x88, /* 0xC4-0xC7 */
        0x98, 0x89, 0x98, 0x8A, 0xE9, 0xC3, 0x98, 0x8B, /* 0xC8-0xCB */
        0xE9, 0xB3, 0x98, 0x8C, 0xE9, 0xB6, 0x98, 0x8D, /* 0xCC-0xCF */
        0xBB, 0xB1, 0x98, 0x8E, 0x98, 0x8F, 0x98, 0x90, /* 0xD0-0xD3 */
        0xE9, 0xC0, 0x98, 0x91, 0x98, 0x92, 0x98, 0x93, /* 0xD4-0xD7 */
        0x98, 0x94, 0x98, 0x95, 0x98, 0x96, 0xBC, 0xF7, /* 0xD8-0xDB */
        0x98, 0x97, 0x98, 0x98, 0x98, 0x99, 0xE9, 0xC4, /* 0xDC-0xDF */
        0xE9, 0xC6, 0x98, 0x9A, 0x98, 0x9B, 0x98, 0x9C, /* 0xE0-0xE3 */
        0x98, 0x9D, 0x98, 0x9E, 0x98, 0x9F, 0x98, 0xA0, /* 0xE4-0xE7 */
        0x98, 0xA1, 0x98, 0xA2, 0x98, 0xA3, 0x98, 0xA4, /* 0xE8-0xEB */
        0x98, 0xA5, 0xE9, 0xCA, 0x98, 0xA6, 0x98, 0xA7, /* 0xEC-0xEF */
        0x98, 0xA8, 0x98, 0xA9, 0xE9, 0xCE, 0x98, 0xAA, /* 0xF0-0xF3 */
        0x98, 0xAB, 0x98, 0xAC, 0x98, 0xAD, 0x98, 0xAE, /* 0xF4-0xF7 */
        0x98, 0xAF, 0x98, 0xB0, 0x98, 0xB1, 0x98, 0xB2, /* 0xF8-0xFB */
        0x98, 0xB3, 0xB2, 0xDB, 0x98, 0xB4, 0xE9, 0xC8, /* 0xFC-0xFF */
};

static const unsigned char u2c_6A[512] = {
        0x98, 0xB5, 0x98, 0xB6, 0x98, 0xB7, 0x98, 0xB8, /* 0x00-0x03 */
        0x98, 0xB9, 0x98, 0xBA, 0x98, 0xBB, 0x98, 0xBC, /* 0x04-0x07 */
        0x98, 0xBD, 0x98, 0xBE, 0xB7, 0xAE, 0x98, 0xBF, /* 0x08-0x0B */
        0x98, 0xC0, 0x98, 0xC1, 0x98, 0xC2, 0x98, 0xC3, /* 0x0C-0x0F */
        0x98, 0xC4, 0x98, 0xC5, 0x98, 0xC6, 0x98, 0xC7, /* 0x10-0x13 */
        0x98, 0xC8, 0x98, 0xC9, 0x98, 0xCA, 0xE9, 0xCB, /* 0x14-0x17 */
        0xE9, 0xCC, 0x98, 0xCB, 0x98, 0xCC, 0x98, 0xCD, /* 0x18-0x1B */
        0x98, 0xCE, 0x98, 0xCF, 0x98, 0xD0, 0xD5, 0xC1, /* 0x1C-0x1F */
        0x98, 0xD1, 0xC4, 0xA3, 0x98, 0xD2, 0x98, 0xD3, /* 0x20-0x23 */
        0x98, 0xD4, 0x98, 0xD5, 0x98, 0xD6, 0x98, 0xD7, /* 0x24-0x27 */
        0xE9, 0xD8, 0x98, 0xD8, 0xBA, 0xE1, 0x98, 0xD9, /* 0x28-0x2B */
        0x98, 0xDA, 0x98, 0xDB, 0x98, 0xDC, 0xE9, 0xC9, /* 0x2C-0x2F */
        0x98, 0xDD, 0xD3, 0xA3, 0x98, 0xDE, 0x98, 0xDF, /* 0x30-0x33 */
        0x98, 0xE0, 0xE9, 0xD4, 0x98, 0xE1, 0x98, 0xE2, /* 0x34-0x37 */
        0x98, 0xE3, 0x98, 0xE4, 0x98, 0xE5, 0x98, 0xE6, /* 0x38-0x3B */
        0x98, 0xE7, 0xE9, 0xD7, 0xE9, 0xD0, 0x98, 0xE8, /* 0x3C-0x3F */
        0x98, 0xE9, 0x98, 0xEA, 0x98, 0xEB, 0x98, 0xEC, /* 0x40-0x43 */
        0xE9, 0xCF, 0x98, 0xED, 0x98, 0xEE, 0xC7, 0xC1, /* 0x44-0x47 */
        0x98, 0xEF, 0x98, 0xF0, 0x98, 0xF1, 0x98, 0xF2, /* 0x48-0x4B */
        0x98, 0xF3, 0x98, 0xF4, 0x98, 0xF5, 0x98, 0xF6, /* 0x4C-0x4F */
        0xE9, 0xD2, 0x98, 0xF7, 0x98, 0xF8, 0x98, 0xF9, /* 0x50-0x53 */
        0x98, 0xFA, 0x98, 0xFB, 0x98, 0xFC, 0x98, 0xFD, /* 0x54-0x57 */
        0xE9, 0xD9, 0xB3, 0xC8, 0x98, 0xFE, 0xE9, 0xD3, /* 0x58-0x5B */
        0x99, 0x40, 0x99, 0x41, 0x99, 0x42, 0x99, 0x43, /* 0x5C-0x5F */
        0x99, 0x44, 0xCF, 0xF0, 0x99, 0x45, 0x99, 0x46, /* 0x60-0x63 */
        0x99, 0x47, 0xE9, 0xCD, 0x99, 0x48, 0x99, 0x49, /* 0x64-0x67 */
        0x99, 0x4A, 0x99, 0x4B, 0x99, 0x4C, 0x99, 0x4D, /* 0x68-0x6B */
        0x99, 0x4E, 0x99, 0x4F, 0x99, 0x50, 0x99, 0x51, /* 0x6C-0x6F */
        0x99, 0x52, 0xB3, 0xF7, 0x99, 0x53, 0x99, 0x54, /* 0x70-0x73 */
        0x99, 0x55, 0x99, 0x56, 0x99, 0x57, 0x99, 0x58, /* 0x74-0x77 */
        0x99, 0x59, 0xE9, 0xD6, 0x99, 0x5A, 0x99, 0x5B, /* 0x78-0x7B */
        0xE9, 0xDA, 0x99, 0x5C, 0x99, 0x5D, 0x99, 0x5E, /* 0x7C-0x7F */
        
        0xCC, 0xB4, 0x99, 0x5F, 0x99, 0x60, 0x99, 0x61, /* 0x80-0x83 */
        0xCF, 0xAD, 0x99, 0x62, 0x99, 0x63, 0x99, 0x64, /* 0x84-0x87 */
        0x99, 0x65, 0x99, 0x66, 0x99, 0x67, 0x99, 0x68, /* 0x88-0x8B */
        0x99, 0x69, 0x99, 0x6A, 0xE9, 0xD5, 0x99, 0x6B, /* 0x8C-0x8F */
        0xE9, 0xDC, 0xE9, 0xDB, 0x99, 0x6C, 0x99, 0x6D, /* 0x90-0x93 */
        0x99, 0x6E, 0x99, 0x6F, 0x99, 0x70, 0xE9, 0xDE, /* 0x94-0x97 */
        0x99, 0x71, 0x99, 0x72, 0x99, 0x73, 0x99, 0x74, /* 0x98-0x9B */
        0x99, 0x75, 0x99, 0x76, 0x99, 0x77, 0x99, 0x78, /* 0x9C-0x9F */
        0xE9, 0xD1, 0x99, 0x79, 0x99, 0x7A, 0x99, 0x7B, /* 0xA0-0xA3 */
        0x99, 0x7C, 0x99, 0x7D, 0x99, 0x7E, 0x99, 0x80, /* 0xA4-0xA7 */
        0x99, 0x81, 0xE9, 0xDD, 0x99, 0x82, 0xE9, 0xDF, /* 0xA8-0xAB */
        0xC3, 0xCA, 0x99, 0x83, 0x99, 0x84, 0x99, 0x85, /* 0xAC-0xAF */
        0x99, 0x86, 0x99, 0x87, 0x99, 0x88, 0x99, 0x89, /* 0xB0-0xB3 */
        0x99, 0x8A, 0x99, 0x8B, 0x99, 0x8C, 0x99, 0x8D, /* 0xB4-0xB7 */
        0x99, 0x8E, 0x99, 0x8F, 0x99, 0x90, 0x99, 0x91, /* 0xB8-0xBB */
        0x99, 0x92, 0x99, 0x93, 0x99, 0x94, 0x99, 0x95, /* 0xBC-0xBF */
        0x99, 0x96, 0x99, 0x97, 0x99, 0x98, 0x99, 0x99, /* 0xC0-0xC3 */
        0x99, 0x9A, 0x99, 0x9B, 0x99, 0x9C, 0x99, 0x9D, /* 0xC4-0xC7 */
        0x99, 0x9E, 0x99, 0x9F, 0x99, 0xA0, 0x99, 0xA1, /* 0xC8-0xCB */
        0x99, 0xA2, 0x99, 0xA3, 0x99, 0xA4, 0x99, 0xA5, /* 0xCC-0xCF */
        0x99, 0xA6, 0x99, 0xA7, 0x99, 0xA8, 0x99, 0xA9, /* 0xD0-0xD3 */
        0x99, 0xAA, 0x99, 0xAB, 0x99, 0xAC, 0x99, 0xAD, /* 0xD4-0xD7 */
        0x99, 0xAE, 0x99, 0xAF, 0x99, 0xB0, 0x99, 0xB1, /* 0xD8-0xDB */
        0x99, 0xB2, 0x99, 0xB3, 0x99, 0xB4, 0x99, 0xB5, /* 0xDC-0xDF */
        0x99, 0xB6, 0x99, 0xB7, 0x99, 0xB8, 0x99, 0xB9, /* 0xE0-0xE3 */
        0x99, 0xBA, 0x99, 0xBB, 0x99, 0xBC, 0x99, 0xBD, /* 0xE4-0xE7 */
        0x99, 0xBE, 0x99, 0xBF, 0x99, 0xC0, 0x99, 0xC1, /* 0xE8-0xEB */
        0x99, 0xC2, 0x99, 0xC3, 0x99, 0xC4, 0x99, 0xC5, /* 0xEC-0xEF */
        0x99, 0xC6, 0x99, 0xC7, 0x99, 0xC8, 0x99, 0xC9, /* 0xF0-0xF3 */
        0x99, 0xCA, 0x99, 0xCB, 0x99, 0xCC, 0x99, 0xCD, /* 0xF4-0xF7 */
        0x99, 0xCE, 0x99, 0xCF, 0x99, 0xD0, 0x99, 0xD1, /* 0xF8-0xFB */
        0x99, 0xD2, 0x99, 0xD3, 0x99, 0xD4, 0x99, 0xD5, /* 0xFC-0xFF */
};

static const unsigned char u2c_6B[512] = {
        0x99, 0xD6, 0x99, 0xD7, 0x99, 0xD8, 0x99, 0xD9, /* 0x00-0x03 */
        0x99, 0xDA, 0x99, 0xDB, 0x99, 0xDC, 0x99, 0xDD, /* 0x04-0x07 */
        0x99, 0xDE, 0x99, 0xDF, 0x99, 0xE0, 0x99, 0xE1, /* 0x08-0x0B */
        0x99, 0xE2, 0x99, 0xE3, 0x99, 0xE4, 0x99, 0xE5, /* 0x0C-0x0F */
        0x99, 0xE6, 0x99, 0xE7, 0x99, 0xE8, 0x99, 0xE9, /* 0x10-0x13 */
        0x99, 0xEA, 0x99, 0xEB, 0x99, 0xEC, 0x99, 0xED, /* 0x14-0x17 */
        0x99, 0xEE, 0x99, 0xEF, 0x99, 0xF0, 0x99, 0xF1, /* 0x18-0x1B */
        0x99, 0xF2, 0x99, 0xF3, 0x99, 0xF4, 0x99, 0xF5, /* 0x1C-0x1F */
        0xC7, 0xB7, 0xB4, 0xCE, 0xBB, 0xB6, 0xD0, 0xC0, /* 0x20-0x23 */
        0xEC, 0xA3, 0x99, 0xF6, 0x99, 0xF7, 0xC5, 0xB7, /* 0x24-0x27 */
        0x99, 0xF8, 0x99, 0xF9, 0x99, 0xFA, 0x99, 0xFB, /* 0x28-0x2B */
        0x99, 0xFC, 0x99, 0xFD, 0x99, 0xFE, 0x9A, 0x40, /* 0x2C-0x2F */
        0x9A, 0x41, 0x9A, 0x42, 0xD3, 0xFB, 0x9A, 0x43, /* 0x30-0x33 */
        0x9A, 0x44, 0x9A, 0x45, 0x9A, 0x46, 0xEC, 0xA4, /* 0x34-0x37 */
        0x9A, 0x47, 0xEC, 0xA5, 0xC6, 0xDB, 0x9A, 0x48, /* 0x38-0x3B */
        0x9A, 0x49, 0x9A, 0x4A, 0xBF, 0xEE, 0x9A, 0x4B, /* 0x3C-0x3F */
        0x9A, 0x4C, 0x9A, 0x4D, 0x9A, 0x4E, 0xEC, 0xA6, /* 0x40-0x43 */
        0x9A, 0x4F, 0x9A, 0x50, 0xEC, 0xA7, 0xD0, 0xAA, /* 0x44-0x47 */
        0x9A, 0x51, 0xC7, 0xB8, 0x9A, 0x52, 0x9A, 0x53, /* 0x48-0x4B */
        0xB8, 0xE8, 0x9A, 0x54, 0x9A, 0x55, 0x9A, 0x56, /* 0x4C-0x4F */
        0x9A, 0x57, 0x9A, 0x58, 0x9A, 0x59, 0x9A, 0x5A, /* 0x50-0x53 */
        0x9A, 0x5B, 0x9A, 0x5C, 0x9A, 0x5D, 0x9A, 0x5E, /* 0x54-0x57 */
        0x9A, 0x5F, 0xEC, 0xA8, 0x9A, 0x60, 0x9A, 0x61, /* 0x58-0x5B */
        0x9A, 0x62, 0x9A, 0x63, 0x9A, 0x64, 0x9A, 0x65, /* 0x5C-0x5F */
        0x9A, 0x66, 0x9A, 0x67, 0xD6, 0xB9, 0xD5, 0xFD, /* 0x60-0x63 */
        0xB4, 0xCB, 0xB2, 0xBD, 0xCE, 0xE4, 0xC6, 0xE7, /* 0x64-0x67 */
        0x9A, 0x68, 0x9A, 0x69, 0xCD, 0xE1, 0x9A, 0x6A, /* 0x68-0x6B */
        0x9A, 0x6B, 0x9A, 0x6C, 0x9A, 0x6D, 0x9A, 0x6E, /* 0x6C-0x6F */
        0x9A, 0x6F, 0x9A, 0x70, 0x9A, 0x71, 0x9A, 0x72, /* 0x70-0x73 */
        0x9A, 0x73, 0x9A, 0x74, 0x9A, 0x75, 0x9A, 0x76, /* 0x74-0x77 */
        0x9A, 0x77, 0xB4, 0xF5, 0x9A, 0x78, 0xCB, 0xC0, /* 0x78-0x7B */
        0xBC, 0xDF, 0x9A, 0x79, 0x9A, 0x7A, 0x9A, 0x7B, /* 0x7C-0x7F */
        
        0x9A, 0x7C, 0xE9, 0xE2, 0xE9, 0xE3, 0xD1, 0xEA, /* 0x80-0x83 */
        0xE9, 0xE5, 0x9A, 0x7D, 0xB4, 0xF9, 0xE9, 0xE4, /* 0x84-0x87 */
        0x9A, 0x7E, 0xD1, 0xB3, 0xCA, 0xE2, 0xB2, 0xD0, /* 0x88-0x8B */
        0x9A, 0x80, 0xE9, 0xE8, 0x9A, 0x81, 0x9A, 0x82, /* 0x8C-0x8F */
        0x9A, 0x83, 0x9A, 0x84, 0xE9, 0xE6, 0xE9, 0xE7, /* 0x90-0x93 */
        0x9A, 0x85, 0x9A, 0x86, 0xD6, 0xB3, 0x9A, 0x87, /* 0x94-0x97 */
        0x9A, 0x88, 0x9A, 0x89, 0xE9, 0xE9, 0xE9, 0xEA, /* 0x98-0x9B */
        0x9A, 0x8A, 0x9A, 0x8B, 0x9A, 0x8C, 0x9A, 0x8D, /* 0x9C-0x9F */
        0x9A, 0x8E, 0xE9, 0xEB, 0x9A, 0x8F, 0x9A, 0x90, /* 0xA0-0xA3 */
        0x9A, 0x91, 0x9A, 0x92, 0x9A, 0x93, 0x9A, 0x94, /* 0xA4-0xA7 */
        0x9A, 0x95, 0x9A, 0x96, 0xE9, 0xEC, 0x9A, 0x97, /* 0xA8-0xAB */
        0x9A, 0x98, 0x9A, 0x99, 0x9A, 0x9A, 0x9A, 0x9B, /* 0xAC-0xAF */
        0x9A, 0x9C, 0x9A, 0x9D, 0x9A, 0x9E, 0xEC, 0xAF, /* 0xB0-0xB3 */
        0xC5, 0xB9, 0xB6, 0xCE, 0x9A, 0x9F, 0xD2, 0xF3, /* 0xB4-0xB7 */
        0x9A, 0xA0, 0x9A, 0xA1, 0x9A, 0xA2, 0x9A, 0xA3, /* 0xB8-0xBB */
        0x9A, 0xA4, 0x9A, 0xA5, 0x9A, 0xA6, 0xB5, 0xEE, /* 0xBC-0xBF */
        0x9A, 0xA7, 0xBB, 0xD9, 0xEC, 0xB1, 0x9A, 0xA8, /* 0xC0-0xC3 */
        0x9A, 0xA9, 0xD2, 0xE3, 0x9A, 0xAA, 0x9A, 0xAB, /* 0xC4-0xC7 */
        0x9A, 0xAC, 0x9A, 0xAD, 0x9A, 0xAE, 0xCE, 0xE3, /* 0xC8-0xCB */
        0x9A, 0xAF, 0xC4, 0xB8, 0x9A, 0xB0, 0xC3, 0xBF, /* 0xCC-0xCF */
        0x9A, 0xB1, 0x9A, 0xB2, 0xB6, 0xBE, 0xD8, 0xB9, /* 0xD0-0xD3 */
        0xB1, 0xC8, 0xB1, 0xCF, 0xB1, 0xD1, 0xC5, 0xFE, /* 0xD4-0xD7 */
        0x9A, 0xB3, 0xB1, 0xD0, 0x9A, 0xB4, 0xC3, 0xAB, /* 0xD8-0xDB */
        0x9A, 0xB5, 0x9A, 0xB6, 0x9A, 0xB7, 0x9A, 0xB8, /* 0xDC-0xDF */
        0x9A, 0xB9, 0xD5, 0xB1, 0x9A, 0xBA, 0x9A, 0xBB, /* 0xE0-0xE3 */
        0x9A, 0xBC, 0x9A, 0xBD, 0x9A, 0xBE, 0x9A, 0xBF, /* 0xE4-0xE7 */
        0x9A, 0xC0, 0x9A, 0xC1, 0xEB, 0xA4, 0xBA, 0xC1, /* 0xE8-0xEB */
        0x9A, 0xC2, 0x9A, 0xC3, 0x9A, 0xC4, 0xCC, 0xBA, /* 0xEC-0xEF */
        0x9A, 0xC5, 0x9A, 0xC6, 0x9A, 0xC7, 0xEB, 0xA5, /* 0xF0-0xF3 */
        0x9A, 0xC8, 0xEB, 0xA7, 0x9A, 0xC9, 0x9A, 0xCA, /* 0xF4-0xF7 */
        0x9A, 0xCB, 0xEB, 0xA8, 0x9A, 0xCC, 0x9A, 0xCD, /* 0xF8-0xFB */
        0x9A, 0xCE, 0xEB, 0xA6, 0x9A, 0xCF, 0x9A, 0xD0, /* 0xFC-0xFF */
};

static const unsigned char u2c_6C[512] = {
        0x9A, 0xD1, 0x9A, 0xD2, 0x9A, 0xD3, 0x9A, 0xD4, /* 0x00-0x03 */
        0x9A, 0xD5, 0xEB, 0xA9, 0xEB, 0xAB, 0xEB, 0xAA, /* 0x04-0x07 */
        0x9A, 0xD6, 0x9A, 0xD7, 0x9A, 0xD8, 0x9A, 0xD9, /* 0x08-0x0B */
        0x9A, 0xDA, 0xEB, 0xAC, 0x9A, 0xDB, 0xCA, 0xCF, /* 0x0C-0x0F */
        0xD8, 0xB5, 0xC3, 0xF1, 0x9A, 0xDC, 0xC3, 0xA5, /* 0x10-0x13 */
        0xC6, 0xF8, 0xEB, 0xAD, 0xC4, 0xCA, 0x9A, 0xDD, /* 0x14-0x17 */
        0xEB, 0xAE, 0xEB, 0xAF, 0xEB, 0xB0, 0xB7, 0xD5, /* 0x18-0x1B */
        0x9A, 0xDE, 0x9A, 0xDF, 0x9A, 0xE0, 0xB7, 0xFA, /* 0x1C-0x1F */
        0x9A, 0xE1, 0xEB, 0xB1, 0xC7, 0xE2, 0x9A, 0xE2, /* 0x20-0x23 */
        0xEB, 0xB3, 0x9A, 0xE3, 0xBA, 0xA4, 0xD1, 0xF5, /* 0x24-0x27 */
        0xB0, 0xB1, 0xEB, 0xB2, 0xEB, 0xB4, 0x9A, 0xE4, /* 0x28-0x2B */
        0x9A, 0xE5, 0x9A, 0xE6, 0xB5, 0xAA, 0xC2, 0xC8, /* 0x2C-0x2F */
        0xC7, 0xE8, 0x9A, 0xE7, 0xEB, 0xB5, 0x9A, 0xE8, /* 0x30-0x33 */
        0xCB, 0xAE, 0xE3, 0xDF, 0x9A, 0xE9, 0x9A, 0xEA, /* 0x34-0x37 */
        0xD3, 0xC0, 0x9A, 0xEB, 0x9A, 0xEC, 0x9A, 0xED, /* 0x38-0x3B */
        0x9A, 0xEE, 0xD9, 0xDB, 0x9A, 0xEF, 0x9A, 0xF0, /* 0x3C-0x3F */
        0xCD, 0xA1, 0xD6, 0xAD, 0xC7, 0xF3, 0x9A, 0xF1, /* 0x40-0x43 */
        0x9A, 0xF2, 0x9A, 0xF3, 0xD9, 0xE0, 0xBB, 0xE3, /* 0x44-0x47 */
        0x9A, 0xF4, 0xBA, 0xBA, 0xE3, 0xE2, 0x9A, 0xF5, /* 0x48-0x4B */
        0x9A, 0xF6, 0x9A, 0xF7, 0x9A, 0xF8, 0x9A, 0xF9, /* 0x4C-0x4F */
        0xCF, 0xAB, 0x9A, 0xFA, 0x9A, 0xFB, 0x9A, 0xFC, /* 0x50-0x53 */
        0xE3, 0xE0, 0xC9, 0xC7, 0x9A, 0xFD, 0xBA, 0xB9, /* 0x54-0x57 */
        0x9A, 0xFE, 0x9B, 0x40, 0x9B, 0x41, 0xD1, 0xB4, /* 0x58-0x5B */
        0xE3, 0xE1, 0xC8, 0xEA, 0xB9, 0xAF, 0xBD, 0xAD, /* 0x5C-0x5F */
        0xB3, 0xD8, 0xCE, 0xDB, 0x9B, 0x42, 0x9B, 0x43, /* 0x60-0x63 */
        0xCC, 0xC0, 0x9B, 0x44, 0x9B, 0x45, 0x9B, 0x46, /* 0x64-0x67 */
        0xE3, 0xE8, 0xE3, 0xE9, 0xCD, 0xF4, 0x9B, 0x47, /* 0x68-0x6B */
        0x9B, 0x48, 0x9B, 0x49, 0x9B, 0x4A, 0x9B, 0x4B, /* 0x6C-0x6F */
        0xCC, 0xAD, 0x9B, 0x4C, 0xBC, 0xB3, 0x9B, 0x4D, /* 0x70-0x73 */
        0xE3, 0xEA, 0x9B, 0x4E, 0xE3, 0xEB, 0x9B, 0x4F, /* 0x74-0x77 */
        0x9B, 0x50, 0xD0, 0xDA, 0x9B, 0x51, 0x9B, 0x52, /* 0x78-0x7B */
        0x9B, 0x53, 0xC6, 0xFB, 0xB7, 0xDA, 0x9B, 0x54, /* 0x7C-0x7F */
        
        0x9B, 0x55, 0xC7, 0xDF, 0xD2, 0xCA, 0xCE, 0xD6, /* 0x80-0x83 */
        0x9B, 0x56, 0xE3, 0xE4, 0xE3, 0xEC, 0x9B, 0x57, /* 0x84-0x87 */
        0xC9, 0xF2, 0xB3, 0xC1, 0x9B, 0x58, 0x9B, 0x59, /* 0x88-0x8B */
        0xE3, 0xE7, 0x9B, 0x5A, 0x9B, 0x5B, 0xC6, 0xE3, /* 0x8C-0x8F */
        0xE3, 0xE5, 0x9B, 0x5C, 0x9B, 0x5D, 0xED, 0xB3, /* 0x90-0x93 */
        0xE3, 0xE6, 0x9B, 0x5E, 0x9B, 0x5F, 0x9B, 0x60, /* 0x94-0x97 */
        0x9B, 0x61, 0xC9, 0xB3, 0x9B, 0x62, 0xC5, 0xE6, /* 0x98-0x9B */
        0x9B, 0x63, 0x9B, 0x64, 0x9B, 0x65, 0xB9, 0xB5, /* 0x9C-0x9F */
        0x9B, 0x66, 0xC3, 0xBB, 0x9B, 0x67, 0xE3, 0xE3, /* 0xA0-0xA3 */
        0xC5, 0xBD, 0xC1, 0xA4, 0xC2, 0xD9, 0xB2, 0xD7, /* 0xA4-0xA7 */
        0x9B, 0x68, 0xE3, 0xED, 0xBB, 0xA6, 0xC4, 0xAD, /* 0xA8-0xAB */
        0x9B, 0x69, 0xE3, 0xF0, 0xBE, 0xDA, 0x9B, 0x6A, /* 0xAC-0xAF */
        0x9B, 0x6B, 0xE3, 0xFB, 0xE3, 0xF5, 0xBA, 0xD3, /* 0xB0-0xB3 */
        0x9B, 0x6C, 0x9B, 0x6D, 0x9B, 0x6E, 0x9B, 0x6F, /* 0xB4-0xB7 */
        0xB7, 0xD0, 0xD3, 0xCD, 0x9B, 0x70, 0xD6, 0xCE, /* 0xB8-0xBB */
        0xD5, 0xD3, 0xB9, 0xC1, 0xD5, 0xB4, 0xD1, 0xD8, /* 0xBC-0xBF */
        0x9B, 0x71, 0x9B, 0x72, 0x9B, 0x73, 0x9B, 0x74, /* 0xC0-0xC3 */
        0xD0, 0xB9, 0xC7, 0xF6, 0x9B, 0x75, 0x9B, 0x76, /* 0xC4-0xC7 */
        0x9B, 0x77, 0xC8, 0xAA, 0xB2, 0xB4, 0x9B, 0x78, /* 0xC8-0xCB */
        0xC3, 0xDA, 0x9B, 0x79, 0x9B, 0x7A, 0x9B, 0x7B, /* 0xCC-0xCF */
        0xE3, 0xEE, 0x9B, 0x7C, 0x9B, 0x7D, 0xE3, 0xFC, /* 0xD0-0xD3 */
        0xE3, 0xEF, 0xB7, 0xA8, 0xE3, 0xF7, 0xE3, 0xF4, /* 0xD4-0xD7 */
        0x9B, 0x7E, 0x9B, 0x80, 0x9B, 0x81, 0xB7, 0xBA, /* 0xD8-0xDB */
        0x9B, 0x82, 0x9B, 0x83, 0xC5, 0xA2, 0x9B, 0x84, /* 0xDC-0xDF */
        0xE3, 0xF6, 0xC5, 0xDD, 0xB2, 0xA8, 0xC6, 0xFC, /* 0xE0-0xE3 */
        0x9B, 0x85, 0xC4, 0xE0, 0x9B, 0x86, 0x9B, 0x87, /* 0xE4-0xE7 */
        0xD7, 0xA2, 0x9B, 0x88, 0xC0, 0xE1, 0xE3, 0xF9, /* 0xE8-0xEB */
        0x9B, 0x89, 0x9B, 0x8A, 0xE3, 0xFA, 0xE3, 0xFD, /* 0xEC-0xEF */
        0xCC, 0xA9, 0xE3, 0xF3, 0x9B, 0x8B, 0xD3, 0xBE, /* 0xF0-0xF3 */
        0x9B, 0x8C, 0xB1, 0xC3, 0xED, 0xB4, 0xE3, 0xF1, /* 0xF4-0xF7 */
        0xE3, 0xF2, 0x9B, 0x8D, 0xE3, 0xF8, 0xD0, 0xBA, /* 0xF8-0xFB */
        0xC6, 0xC3, 0xD4, 0xF3, 0xE3, 0xFE, 0x9B, 0x8E, /* 0xFC-0xFF */
};

static const unsigned char u2c_6D[512] = {
        0x9B, 0x8F, 0xBD, 0xE0, 0x9B, 0x90, 0x9B, 0x91, /* 0x00-0x03 */
        0xE4, 0xA7, 0x9B, 0x92, 0x9B, 0x93, 0xE4, 0xA6, /* 0x04-0x07 */
        0x9B, 0x94, 0x9B, 0x95, 0x9B, 0x96, 0xD1, 0xF3, /* 0x08-0x0B */
        0xE4, 0xA3, 0x9B, 0x97, 0xE4, 0xA9, 0x9B, 0x98, /* 0x0C-0x0F */
        0x9B, 0x99, 0x9B, 0x9A, 0xC8, 0xF7, 0x9B, 0x9B, /* 0x10-0x13 */
        0x9B, 0x9C, 0x9B, 0x9D, 0x9B, 0x9E, 0xCF, 0xB4, /* 0x14-0x17 */
        0x9B, 0x9F, 0xE4, 0xA8, 0xE4, 0xAE, 0xC2, 0xE5, /* 0x18-0x1B */
        0x9B, 0xA0, 0x9B, 0xA1, 0xB6, 0xB4, 0x9B, 0xA2, /* 0x1C-0x1F */
        0x9B, 0xA3, 0x9B, 0xA4, 0x9B, 0xA5, 0x9B, 0xA6, /* 0x20-0x23 */
        0x9B, 0xA7, 0xBD, 0xF2, 0x9B, 0xA8, 0xE4, 0xA2, /* 0x24-0x27 */
        0x9B, 0xA9, 0x9B, 0xAA, 0xBA, 0xE9, 0xE4, 0xAA, /* 0x28-0x2B */
        0x9B, 0xAB, 0x9B, 0xAC, 0xE4, 0xAC, 0x9B, 0xAD, /* 0x2C-0x2F */
        0x9B, 0xAE, 0xB6, 0xFD, 0xD6, 0xDE, 0xE4, 0xB2, /* 0x30-0x33 */
        0x9B, 0xAF, 0xE4, 0xAD, 0x9B, 0xB0, 0x9B, 0xB1, /* 0x34-0x37 */
        0x9B, 0xB2, 0xE4, 0xA1, 0x9B, 0xB3, 0xBB, 0xEE, /* 0x38-0x3B */
        0xCD, 0xDD, 0xC7, 0xA2, 0xC5, 0xC9, 0x9B, 0xB4, /* 0x3C-0x3F */
        0x9B, 0xB5, 0xC1, 0xF7, 0x9B, 0xB6, 0xE4, 0xA4, /* 0x40-0x43 */
        0x9B, 0xB7, 0xC7, 0xB3, 0xBD, 0xAC, 0xBD, 0xBD, /* 0x44-0x47 */
        0xE4, 0xA5, 0x9B, 0xB8, 0xD7, 0xC7, 0xB2, 0xE2, /* 0x48-0x4B */
        0x9B, 0xB9, 0xE4, 0xAB, 0xBC, 0xC3, 0xE4, 0xAF, /* 0x4C-0x4F */
        0x9B, 0xBA, 0xBB, 0xEB, 0xE4, 0xB0, 0xC5, 0xA8, /* 0x50-0x53 */
        0xE4, 0xB1, 0x9B, 0xBB, 0x9B, 0xBC, 0x9B, 0xBD, /* 0x54-0x57 */
        0x9B, 0xBE, 0xD5, 0xE3, 0xBF, 0xA3, 0x9B, 0xBF, /* 0x58-0x5B */
        0xE4, 0xBA, 0x9B, 0xC0, 0xE4, 0xB7, 0x9B, 0xC1, /* 0x5C-0x5F */
        0xE4, 0xBB, 0x9B, 0xC2, 0x9B, 0xC3, 0xE4, 0xBD, /* 0x60-0x63 */
        0x9B, 0xC4, 0x9B, 0xC5, 0xC6, 0xD6, 0x9B, 0xC6, /* 0x64-0x67 */
        0x9B, 0xC7, 0xBA, 0xC6, 0xC0, 0xCB, 0x9B, 0xC8, /* 0x68-0x6B */
        0x9B, 0xC9, 0x9B, 0xCA, 0xB8, 0xA1, 0xE4, 0xB4, /* 0x6C-0x6F */
        0x9B, 0xCB, 0x9B, 0xCC, 0x9B, 0xCD, 0x9B, 0xCE, /* 0x70-0x73 */
        0xD4, 0xA1, 0x9B, 0xCF, 0x9B, 0xD0, 0xBA, 0xA3, /* 0x74-0x77 */
        0xBD, 0xFE, 0x9B, 0xD1, 0x9B, 0xD2, 0x9B, 0xD3, /* 0x78-0x7B */
        0xE4, 0xBC, 0x9B, 0xD4, 0x9B, 0xD5, 0x9B, 0xD6, /* 0x7C-0x7F */
        
        0x9B, 0xD7, 0x9B, 0xD8, 0xCD, 0xBF, 0x9B, 0xD9, /* 0x80-0x83 */
        0x9B, 0xDA, 0xC4, 0xF9, 0x9B, 0xDB, 0x9B, 0xDC, /* 0x84-0x87 */
        0xCF, 0xFB, 0xC9, 0xE6, 0x9B, 0xDD, 0x9B, 0xDE, /* 0x88-0x8B */
        0xD3, 0xBF, 0x9B, 0xDF, 0xCF, 0xD1, 0x9B, 0xE0, /* 0x8C-0x8F */
        0x9B, 0xE1, 0xE4, 0xB3, 0x9B, 0xE2, 0xE4, 0xB8, /* 0x90-0x93 */
        0xE4, 0xB9, 0xCC, 0xE9, 0x9B, 0xE3, 0x9B, 0xE4, /* 0x94-0x97 */
        0x9B, 0xE5, 0x9B, 0xE6, 0x9B, 0xE7, 0xCC, 0xCE, /* 0x98-0x9B */
        0x9B, 0xE8, 0xC0, 0xD4, 0xE4, 0xB5, 0xC1, 0xB0, /* 0x9C-0x9F */
        0xE4, 0xB6, 0xCE, 0xD0, 0x9B, 0xE9, 0xBB, 0xC1, /* 0xA0-0xA3 */
        0xB5, 0xD3, 0x9B, 0xEA, 0xC8, 0xF3, 0xBD, 0xA7, /* 0xA4-0xA7 */
        0xD5, 0xC7, 0xC9, 0xAC, 0xB8, 0xA2, 0xE4, 0xCA, /* 0xA8-0xAB */
        0x9B, 0xEB, 0x9B, 0xEC, 0xE4, 0xCC, 0xD1, 0xC4, /* 0xAC-0xAF */
        0x9B, 0xED, 0x9B, 0xEE, 0xD2, 0xBA, 0x9B, 0xEF, /* 0xB0-0xB3 */
        0x9B, 0xF0, 0xBA, 0xAD, 0x9B, 0xF1, 0x9B, 0xF2, /* 0xB4-0xB7 */
        0xBA, 0xD4, 0x9B, 0xF3, 0x9B, 0xF4, 0x9B, 0xF5, /* 0xB8-0xBB */
        0x9B, 0xF6, 0x9B, 0xF7, 0x9B, 0xF8, 0xE4, 0xC3, /* 0xBC-0xBF */
        0xB5, 0xED, 0x9B, 0xF9, 0x9B, 0xFA, 0x9B, 0xFB, /* 0xC0-0xC3 */
        0xD7, 0xCD, 0xE4, 0xC0, 0xCF, 0xFD, 0xE4, 0xBF, /* 0xC4-0xC7 */
        0x9B, 0xFC, 0x9B, 0xFD, 0x9B, 0xFE, 0xC1, 0xDC, /* 0xC8-0xCB */
        0xCC, 0xCA, 0x9C, 0x40, 0x9C, 0x41, 0x9C, 0x42, /* 0xCC-0xCF */
        0x9C, 0x43, 0xCA, 0xE7, 0x9C, 0x44, 0x9C, 0x45, /* 0xD0-0xD3 */
        0x9C, 0x46, 0x9C, 0x47, 0xC4, 0xD7, 0x9C, 0x48, /* 0xD4-0xD7 */
        0xCC, 0xD4, 0xE4, 0xC8, 0x9C, 0x49, 0x9C, 0x4A, /* 0xD8-0xDB */
        0x9C, 0x4B, 0xE4, 0xC7, 0xE4, 0xC1, 0x9C, 0x4C, /* 0xDC-0xDF */
        0xE4, 0xC4, 0xB5, 0xAD, 0x9C, 0x4D, 0x9C, 0x4E, /* 0xE0-0xE3 */
        0xD3, 0xD9, 0x9C, 0x4F, 0xE4, 0xC6, 0x9C, 0x50, /* 0xE4-0xE7 */
        0x9C, 0x51, 0x9C, 0x52, 0x9C, 0x53, 0xD2, 0xF9, /* 0xE8-0xEB */
        0xB4, 0xE3, 0x9C, 0x54, 0xBB, 0xB4, 0x9C, 0x55, /* 0xEC-0xEF */
        0x9C, 0x56, 0xC9, 0xEE, 0x9C, 0x57, 0xB4, 0xBE, /* 0xF0-0xF3 */
        0x9C, 0x58, 0x9C, 0x59, 0x9C, 0x5A, 0xBB, 0xEC, /* 0xF4-0xF7 */
        0x9C, 0x5B, 0xD1, 0xCD, 0x9C, 0x5C, 0xCC, 0xED, /* 0xF8-0xFB */
        0xED, 0xB5, 0x9C, 0x5D, 0x9C, 0x5E, 0x9C, 0x5F, /* 0xFC-0xFF */
};

static const unsigned char u2c_6E[512] = {
        0x9C, 0x60, 0x9C, 0x61, 0x9C, 0x62, 0x9C, 0x63, /* 0x00-0x03 */
        0x9C, 0x64, 0xC7, 0xE5, 0x9C, 0x65, 0x9C, 0x66, /* 0x04-0x07 */
        0x9C, 0x67, 0x9C, 0x68, 0xD4, 0xA8, 0x9C, 0x69, /* 0x08-0x0B */
        0xE4, 0xCB, 0xD7, 0xD5, 0xE4, 0xC2, 0x9C, 0x6A, /* 0x0C-0x0F */
        0xBD, 0xA5, 0xE4, 0xC5, 0x9C, 0x6B, 0x9C, 0x6C, /* 0x10-0x13 */
        0xD3, 0xE6, 0x9C, 0x6D, 0xE4, 0xC9, 0xC9, 0xF8, /* 0x14-0x17 */
        0x9C, 0x6E, 0x9C, 0x6F, 0xE4, 0xBE, 0x9C, 0x70, /* 0x18-0x1B */
        0x9C, 0x71, 0xD3, 0xE5, 0x9C, 0x72, 0x9C, 0x73, /* 0x1C-0x1F */
        0xC7, 0xFE, 0xB6, 0xC9, 0x9C, 0x74, 0xD4, 0xFC, /* 0x20-0x23 */
        0xB2, 0xB3, 0xE4, 0xD7, 0x9C, 0x75, 0x9C, 0x76, /* 0x24-0x27 */
        0x9C, 0x77, 0xCE, 0xC2, 0x9C, 0x78, 0xE4, 0xCD, /* 0x28-0x2B */
        0x9C, 0x79, 0xCE, 0xBC, 0x9C, 0x7A, 0xB8, 0xDB, /* 0x2C-0x2F */
        0x9C, 0x7B, 0x9C, 0x7C, 0xE4, 0xD6, 0x9C, 0x7D, /* 0x30-0x33 */
        0xBF, 0xCA, 0x9C, 0x7E, 0x9C, 0x80, 0x9C, 0x81, /* 0x34-0x37 */
        0xD3, 0xCE, 0x9C, 0x82, 0xC3, 0xEC, 0x9C, 0x83, /* 0x38-0x3B */
        0x9C, 0x84, 0x9C, 0x85, 0x9C, 0x86, 0x9C, 0x87, /* 0x3C-0x3F */
        0x9C, 0x88, 0x9C, 0x89, 0x9C, 0x8A, 0xC5, 0xC8, /* 0x40-0x43 */
        0xE4, 0xD8, 0x9C, 0x8B, 0x9C, 0x8C, 0x9C, 0x8D, /* 0x44-0x47 */
        0x9C, 0x8E, 0x9C, 0x8F, 0x9C, 0x90, 0x9C, 0x91, /* 0x48-0x4B */
        0x9C, 0x92, 0xCD, 0xC4, 0xE4, 0xCF, 0x9C, 0x93, /* 0x4C-0x4F */
        0x9C, 0x94, 0x9C, 0x95, 0x9C, 0x96, 0xE4, 0xD4, /* 0x50-0x53 */
        0xE4, 0xD5, 0x9C, 0x97, 0xBA, 0xFE, 0x9C, 0x98, /* 0x54-0x57 */
        0xCF, 0xE6, 0x9C, 0x99, 0x9C, 0x9A, 0xD5, 0xBF, /* 0x58-0x5B */
        0x9C, 0x9B, 0x9C, 0x9C, 0x9C, 0x9D, 0xE4, 0xD2, /* 0x5C-0x5F */
        0x9C, 0x9E, 0x9C, 0x9F, 0x9C, 0xA0, 0x9C, 0xA1, /* 0x60-0x63 */
        0x9C, 0xA2, 0x9C, 0xA3, 0x9C, 0xA4, 0x9C, 0xA5, /* 0x64-0x67 */
        0x9C, 0xA6, 0x9C, 0xA7, 0x9C, 0xA8, 0xE4, 0xD0, /* 0x68-0x6B */
        0x9C, 0xA9, 0x9C, 0xAA, 0xE4, 0xCE, 0x9C, 0xAB, /* 0x6C-0x6F */
        0x9C, 0xAC, 0x9C, 0xAD, 0x9C, 0xAE, 0x9C, 0xAF, /* 0x70-0x73 */
        0x9C, 0xB0, 0x9C, 0xB1, 0x9C, 0xB2, 0x9C, 0xB3, /* 0x74-0x77 */
        0x9C, 0xB4, 0x9C, 0xB5, 0x9C, 0xB6, 0x9C, 0xB7, /* 0x78-0x7B */
        0x9C, 0xB8, 0x9C, 0xB9, 0xCD, 0xE5, 0xCA, 0xAA, /* 0x7C-0x7F */
        
        0x9C, 0xBA, 0x9C, 0xBB, 0x9C, 0xBC, 0xC0, 0xA3, /* 0x80-0x83 */
        0x9C, 0xBD, 0xBD, 0xA6, 0xE4, 0xD3, 0x9C, 0xBE, /* 0x84-0x87 */
        0x9C, 0xBF, 0xB8, 0xC8, 0x9C, 0xC0, 0x9C, 0xC1, /* 0x88-0x8B */
        0x9C, 0xC2, 0x9C, 0xC3, 0x9C, 0xC4, 0xE4, 0xE7, /* 0x8C-0x8F */
        0xD4, 0xB4, 0x9C, 0xC5, 0x9C, 0xC6, 0x9C, 0xC7, /* 0x90-0x93 */
        0x9C, 0xC8, 0x9C, 0xC9, 0x9C, 0xCA, 0x9C, 0xCB, /* 0x94-0x97 */
        0xE4, 0xDB, 0x9C, 0xCC, 0x9C, 0xCD, 0x9C, 0xCE, /* 0x98-0x9B */
        0xC1, 0xEF, 0x9C, 0xCF, 0x9C, 0xD0, 0xE4, 0xE9, /* 0x9C-0x9F */
        0x9C, 0xD1, 0x9C, 0xD2, 0xD2, 0xE7, 0x9C, 0xD3, /* 0xA0-0xA3 */
        0x9C, 0xD4, 0xE4, 0xDF, 0x9C, 0xD5, 0xE4, 0xE0, /* 0xA4-0xA7 */
        0x9C, 0xD6, 0x9C, 0xD7, 0xCF, 0xAA, 0x9C, 0xD8, /* 0xA8-0xAB */
        0x9C, 0xD9, 0x9C, 0xDA, 0x9C, 0xDB, 0xCB, 0xDD, /* 0xAC-0xAF */
        0x9C, 0xDC, 0xE4, 0xDA, 0xE4, 0xD1, 0x9C, 0xDD, /* 0xB0-0xB3 */
        0xE4, 0xE5, 0x9C, 0xDE, 0xC8, 0xDC, 0xE4, 0xE3, /* 0xB4-0xB7 */
        0x9C, 0xDF, 0x9C, 0xE0, 0xC4, 0xE7, 0xE4, 0xE2, /* 0xB8-0xBB */
        0x9C, 0xE1, 0xE4, 0xE1, 0x9C, 0xE2, 0x9C, 0xE3, /* 0xBC-0xBF */
        0x9C, 0xE4, 0xB3, 0xFC, 0xE4, 0xE8, 0x9C, 0xE5, /* 0xC0-0xC3 */
        0x9C, 0xE6, 0x9C, 0xE7, 0x9C, 0xE8, 0xB5, 0xE1, /* 0xC4-0xC7 */
        0x9C, 0xE9, 0x9C, 0xEA, 0x9C, 0xEB, 0xD7, 0xCC, /* 0xC8-0xCB */
        0x9C, 0xEC, 0x9C, 0xED, 0x9C, 0xEE, 0xE4, 0xE6, /* 0xCC-0xCF */
        0x9C, 0xEF, 0xBB, 0xAC, 0x9C, 0xF0, 0xD7, 0xD2, /* 0xD0-0xD3 */
        0xCC, 0xCF, 0xEB, 0xF8, 0x9C, 0xF1, 0xE4, 0xE4, /* 0xD4-0xD7 */
        0x9C, 0xF2, 0x9C, 0xF3, 0xB9, 0xF6, 0x9C, 0xF4, /* 0xD8-0xDB */
        0x9C, 0xF5, 0x9C, 0xF6, 0xD6, 0xCD, 0xE4, 0xD9, /* 0xDC-0xDF */
        0xE4, 0xDC, 0xC2, 0xFA, 0xE4, 0xDE, 0x9C, 0xF7, /* 0xE0-0xE3 */
        0xC2, 0xCB, 0xC0, 0xC4, 0xC2, 0xD0, 0x9C, 0xF8, /* 0xE4-0xE7 */
        0xB1, 0xF5, 0xCC, 0xB2, 0x9C, 0xF9, 0x9C, 0xFA, /* 0xE8-0xEB */
        0x9C, 0xFB, 0x9C, 0xFC, 0x9C, 0xFD, 0x9C, 0xFE, /* 0xEC-0xEF */
        0x9D, 0x40, 0x9D, 0x41, 0x9D, 0x42, 0x9D, 0x43, /* 0xF0-0xF3 */
        0xB5, 0xCE, 0x9D, 0x44, 0x9D, 0x45, 0x9D, 0x46, /* 0xF4-0xF7 */
        0x9D, 0x47, 0xE4, 0xEF, 0x9D, 0x48, 0x9D, 0x49, /* 0xF8-0xFB */
        0x9D, 0x4A, 0x9D, 0x4B, 0x9D, 0x4C, 0x9D, 0x4D, /* 0xFC-0xFF */
};

static const unsigned char u2c_6F[512] = {
        0x9D, 0x4E, 0x9D, 0x4F, 0xC6, 0xAF, 0x9D, 0x50, /* 0x00-0x03 */
        0x9D, 0x51, 0x9D, 0x52, 0xC6, 0xE1, 0x9D, 0x53, /* 0x04-0x07 */
        0x9D, 0x54, 0xE4, 0xF5, 0x9D, 0x55, 0x9D, 0x56, /* 0x08-0x0B */
        0x9D, 0x57, 0x9D, 0x58, 0x9D, 0x59, 0xC2, 0xA9, /* 0x0C-0x0F */
        0x9D, 0x5A, 0x9D, 0x5B, 0x9D, 0x5C, 0xC0, 0xEC, /* 0x10-0x13 */
        0xD1, 0xDD, 0xE4, 0xEE, 0x9D, 0x5D, 0x9D, 0x5E, /* 0x14-0x17 */
        0x9D, 0x5F, 0x9D, 0x60, 0x9D, 0x61, 0x9D, 0x62, /* 0x18-0x1B */
        0x9D, 0x63, 0x9D, 0x64, 0x9D, 0x65, 0x9D, 0x66, /* 0x1C-0x1F */
        0xC4, 0xAE, 0x9D, 0x67, 0x9D, 0x68, 0x9D, 0x69, /* 0x20-0x23 */
        0xE4, 0xED, 0x9D, 0x6A, 0x9D, 0x6B, 0x9D, 0x6C, /* 0x24-0x27 */
        0x9D, 0x6D, 0xE4, 0xF6, 0xE4, 0xF4, 0xC2, 0xFE, /* 0x28-0x2B */
        0x9D, 0x6E, 0xE4, 0xDD, 0x9D, 0x6F, 0xE4, 0xF0, /* 0x2C-0x2F */
        0x9D, 0x70, 0xCA, 0xFE, 0x9D, 0x71, 0xD5, 0xC4, /* 0x30-0x33 */
        0x9D, 0x72, 0x9D, 0x73, 0xE4, 0xF1, 0x9D, 0x74, /* 0x34-0x37 */
        0x9D, 0x75, 0x9D, 0x76, 0x9D, 0x77, 0x9D, 0x78, /* 0x38-0x3B */
        0x9D, 0x79, 0x9D, 0x7A, 0xD1, 0xFA, 0x9D, 0x7B, /* 0x3C-0x3F */
        0x9D, 0x7C, 0x9D, 0x7D, 0x9D, 0x7E, 0x9D, 0x80, /* 0x40-0x43 */
        0x9D, 0x81, 0x9D, 0x82, 0xE4, 0xEB, 0xE4, 0xEC, /* 0x44-0x47 */
        0x9D, 0x83, 0x9D, 0x84, 0x9D, 0x85, 0xE4, 0xF2, /* 0x48-0x4B */
        0x9D, 0x86, 0xCE, 0xAB, 0x9D, 0x87, 0x9D, 0x88, /* 0x4C-0x4F */
        0x9D, 0x89, 0x9D, 0x8A, 0x9D, 0x8B, 0x9D, 0x8C, /* 0x50-0x53 */
        0x9D, 0x8D, 0x9D, 0x8E, 0x9D, 0x8F, 0x9D, 0x90, /* 0x54-0x57 */
        0xC5, 0xCB, 0x9D, 0x91, 0x9D, 0x92, 0x9D, 0x93, /* 0x58-0x5B */
        0xC7, 0xB1, 0x9D, 0x94, 0xC2, 0xBA, 0x9D, 0x95, /* 0x5C-0x5F */
        0x9D, 0x96, 0x9D, 0x97, 0xE4, 0xEA, 0x9D, 0x98, /* 0x60-0x63 */
        0x9D, 0x99, 0x9D, 0x9A, 0xC1, 0xCA, 0x9D, 0x9B, /* 0x64-0x67 */
        0x9D, 0x9C, 0x9D, 0x9D, 0x9D, 0x9E, 0x9D, 0x9F, /* 0x68-0x6B */
        0x9D, 0xA0, 0xCC, 0xB6, 0xB3, 0xB1, 0x9D, 0xA1, /* 0x6C-0x6F */
        0x9D, 0xA2, 0x9D, 0xA3, 0xE4, 0xFB, 0x9D, 0xA4, /* 0x70-0x73 */
        0xE4, 0xF3, 0x9D, 0xA5, 0x9D, 0xA6, 0x9D, 0xA7, /* 0x74-0x77 */
        0xE4, 0xFA, 0x9D, 0xA8, 0xE4, 0xFD, 0x9D, 0xA9, /* 0x78-0x7B */
        0xE4, 0xFC, 0x9D, 0xAA, 0x9D, 0xAB, 0x9D, 0xAC, /* 0x7C-0x7F */
        
        0x9D, 0xAD, 0x9D, 0xAE, 0x9D, 0xAF, 0x9D, 0xB0, /* 0x80-0x83 */
        0xB3, 0xCE, 0x9D, 0xB1, 0x9D, 0xB2, 0x9D, 0xB3, /* 0x84-0x87 */
        0xB3, 0xBA, 0xE4, 0xF7, 0x9D, 0xB4, 0x9D, 0xB5, /* 0x88-0x8B */
        0xE4, 0xF9, 0xE4, 0xF8, 0xC5, 0xEC, 0x9D, 0xB6, /* 0x8C-0x8F */
        0x9D, 0xB7, 0x9D, 0xB8, 0x9D, 0xB9, 0x9D, 0xBA, /* 0x90-0x93 */
        0x9D, 0xBB, 0x9D, 0xBC, 0x9D, 0xBD, 0x9D, 0xBE, /* 0x94-0x97 */
        0x9D, 0xBF, 0x9D, 0xC0, 0x9D, 0xC1, 0x9D, 0xC2, /* 0x98-0x9B */
        0xC0, 0xBD, 0x9D, 0xC3, 0x9D, 0xC4, 0x9D, 0xC5, /* 0x9C-0x9F */
        0x9D, 0xC6, 0xD4, 0xE8, 0x9D, 0xC7, 0x9D, 0xC8, /* 0xA0-0xA3 */
        0x9D, 0xC9, 0x9D, 0xCA, 0x9D, 0xCB, 0xE5, 0xA2, /* 0xA4-0xA7 */
        0x9D, 0xCC, 0x9D, 0xCD, 0x9D, 0xCE, 0x9D, 0xCF, /* 0xA8-0xAB */
        0x9D, 0xD0, 0x9D, 0xD1, 0x9D, 0xD2, 0x9D, 0xD3, /* 0xAC-0xAF */
        0x9D, 0xD4, 0x9D, 0xD5, 0x9D, 0xD6, 0xB0, 0xC4, /* 0xB0-0xB3 */
        0x9D, 0xD7, 0x9D, 0xD8, 0xE5, 0xA4, 0x9D, 0xD9, /* 0xB4-0xB7 */
        0x9D, 0xDA, 0xE5, 0xA3, 0x9D, 0xDB, 0x9D, 0xDC, /* 0xB8-0xBB */
        0x9D, 0xDD, 0x9D, 0xDE, 0x9D, 0xDF, 0x9D, 0xE0, /* 0xBC-0xBF */
        0xBC, 0xA4, 0x9D, 0xE1, 0xE5, 0xA5, 0x9D, 0xE2, /* 0xC0-0xC3 */
        0x9D, 0xE3, 0x9D, 0xE4, 0x9D, 0xE5, 0x9D, 0xE6, /* 0xC4-0xC7 */
        0x9D, 0xE7, 0xE5, 0xA1, 0x9D, 0xE8, 0x9D, 0xE9, /* 0xC8-0xCB */
        0x9D, 0xEA, 0x9D, 0xEB, 0x9D, 0xEC, 0x9D, 0xED, /* 0xCC-0xCF */
        0x9D, 0xEE, 0xE4, 0xFE, 0xB1, 0xF4, 0x9D, 0xEF, /* 0xD0-0xD3 */
        0x9D, 0xF0, 0x9D, 0xF1, 0x9D, 0xF2, 0x9D, 0xF3, /* 0xD4-0xD7 */
        0x9D, 0xF4, 0x9D, 0xF5, 0x9D, 0xF6, 0x9D, 0xF7, /* 0xD8-0xDB */
        0x9D, 0xF8, 0x9D, 0xF9, 0xE5, 0xA8, 0x9D, 0xFA, /* 0xDC-0xDF */
        0xE5, 0xA9, 0xE5, 0xA6, 0x9D, 0xFB, 0x9D, 0xFC, /* 0xE0-0xE3 */
        0x9D, 0xFD, 0x9D, 0xFE, 0x9E, 0x40, 0x9E, 0x41, /* 0xE4-0xE7 */
        0x9E, 0x42, 0x9E, 0x43, 0x9E, 0x44, 0x9E, 0x45, /* 0xE8-0xEB */
        0x9E, 0x46, 0x9E, 0x47, 0xE5, 0xA7, 0xE5, 0xAA, /* 0xEC-0xEF */
        0x9E, 0x48, 0x9E, 0x49, 0x9E, 0x4A, 0x9E, 0x4B, /* 0xF0-0xF3 */
        0x9E, 0x4C, 0x9E, 0x4D, 0x9E, 0x4E, 0x9E, 0x4F, /* 0xF4-0xF7 */
        0x9E, 0x50, 0x9E, 0x51, 0x9E, 0x52, 0x9E, 0x53, /* 0xF8-0xFB */
        0x9E, 0x54, 0x9E, 0x55, 0x9E, 0x56, 0x9E, 0x57, /* 0xFC-0xFF */
};

static const unsigned char u2c_70[512] = {
        0x9E, 0x58, 0x9E, 0x59, 0x9E, 0x5A, 0x9E, 0x5B, /* 0x00-0x03 */
        0x9E, 0x5C, 0x9E, 0x5D, 0x9E, 0x5E, 0x9E, 0x5F, /* 0x04-0x07 */
        0x9E, 0x60, 0x9E, 0x61, 0x9E, 0x62, 0x9E, 0x63, /* 0x08-0x0B */
        0x9E, 0x64, 0x9E, 0x65, 0x9E, 0x66, 0x9E, 0x67, /* 0x0C-0x0F */
        0x9E, 0x68, 0xC6, 0xD9, 0x9E, 0x69, 0x9E, 0x6A, /* 0x10-0x13 */
        0x9E, 0x6B, 0x9E, 0x6C, 0x9E, 0x6D, 0x9E, 0x6E, /* 0x14-0x17 */
        0x9E, 0x6F, 0x9E, 0x70, 0xE5, 0xAB, 0xE5, 0xAD, /* 0x18-0x1B */
        0x9E, 0x71, 0x9E, 0x72, 0x9E, 0x73, 0x9E, 0x74, /* 0x1C-0x1F */
        0x9E, 0x75, 0x9E, 0x76, 0x9E, 0x77, 0xE5, 0xAC, /* 0x20-0x23 */
        0x9E, 0x78, 0x9E, 0x79, 0x9E, 0x7A, 0x9E, 0x7B, /* 0x24-0x27 */
        0x9E, 0x7C, 0x9E, 0x7D, 0x9E, 0x7E, 0x9E, 0x80, /* 0x28-0x2B */
        0x9E, 0x81, 0x9E, 0x82, 0x9E, 0x83, 0x9E, 0x84, /* 0x2C-0x2F */
        0x9E, 0x85, 0x9E, 0x86, 0x9E, 0x87, 0x9E, 0x88, /* 0x30-0x33 */
        0x9E, 0x89, 0xE5, 0xAF, 0x9E, 0x8A, 0x9E, 0x8B, /* 0x34-0x37 */
        0x9E, 0x8C, 0xE5, 0xAE, 0x9E, 0x8D, 0x9E, 0x8E, /* 0x38-0x3B */
        0x9E, 0x8F, 0x9E, 0x90, 0x9E, 0x91, 0x9E, 0x92, /* 0x3C-0x3F */
        0x9E, 0x93, 0x9E, 0x94, 0x9E, 0x95, 0x9E, 0x96, /* 0x40-0x43 */
        0x9E, 0x97, 0x9E, 0x98, 0x9E, 0x99, 0x9E, 0x9A, /* 0x44-0x47 */
        0x9E, 0x9B, 0x9E, 0x9C, 0x9E, 0x9D, 0x9E, 0x9E, /* 0x48-0x4B */
        0xB9, 0xE0, 0x9E, 0x9F, 0x9E, 0xA0, 0xE5, 0xB0, /* 0x4C-0x4F */
        0x9E, 0xA1, 0x9E, 0xA2, 0x9E, 0xA3, 0x9E, 0xA4, /* 0x50-0x53 */
        0x9E, 0xA5, 0x9E, 0xA6, 0x9E, 0xA7, 0x9E, 0xA8, /* 0x54-0x57 */
        0x9E, 0xA9, 0x9E, 0xAA, 0x9E, 0xAB, 0x9E, 0xAC, /* 0x58-0x5B */
        0x9E, 0xAD, 0x9E, 0xAE, 0xE5, 0xB1, 0x9E, 0xAF, /* 0x5C-0x5F */
        0x9E, 0xB0, 0x9E, 0xB1, 0x9E, 0xB2, 0x9E, 0xB3, /* 0x60-0x63 */
        0x9E, 0xB4, 0x9E, 0xB5, 0x9E, 0xB6, 0x9E, 0xB7, /* 0x64-0x67 */
        0x9E, 0xB8, 0x9E, 0xB9, 0x9E, 0xBA, 0xBB, 0xF0, /* 0x68-0x6B */
        0xEC, 0xE1, 0xC3, 0xF0, 0x9E, 0xBB, 0xB5, 0xC6, /* 0x6C-0x6F */
        0xBB, 0xD2, 0x9E, 0xBC, 0x9E, 0xBD, 0x9E, 0xBE, /* 0x70-0x73 */
        0x9E, 0xBF, 0xC1, 0xE9, 0xD4, 0xEE, 0x9E, 0xC0, /* 0x74-0x77 */
        0xBE, 0xC4, 0x9E, 0xC1, 0x9E, 0xC2, 0x9E, 0xC3, /* 0x78-0x7B */
        0xD7, 0xC6, 0x9E, 0xC4, 0xD4, 0xD6, 0xB2, 0xD3, /* 0x7C-0x7F */
        
        0xEC, 0xBE, 0x9E, 0xC5, 0x9E, 0xC6, 0x9E, 0xC7, /* 0x80-0x83 */
        0x9E, 0xC8, 0xEA, 0xC1, 0x9E, 0xC9, 0x9E, 0xCA, /* 0x84-0x87 */
        0x9E, 0xCB, 0xC2, 0xAF, 0xB4, 0xB6, 0x9E, 0xCC, /* 0x88-0x8B */
        0x9E, 0xCD, 0x9E, 0xCE, 0xD1, 0xD7, 0x9E, 0xCF, /* 0x8C-0x8F */
        0x9E, 0xD0, 0x9E, 0xD1, 0xB3, 0xB4, 0x9E, 0xD2, /* 0x90-0x93 */
        0xC8, 0xB2, 0xBF, 0xBB, 0xEC, 0xC0, 0x9E, 0xD3, /* 0x94-0x97 */
        0x9E, 0xD4, 0xD6, 0xCB, 0x9E, 0xD5, 0x9E, 0xD6, /* 0x98-0x9B */
        0xEC, 0xBF, 0xEC, 0xC1, 0x9E, 0xD7, 0x9E, 0xD8, /* 0x9C-0x9F */
        0x9E, 0xD9, 0x9E, 0xDA, 0x9E, 0xDB, 0x9E, 0xDC, /* 0xA0-0xA3 */
        0x9E, 0xDD, 0x9E, 0xDE, 0x9E, 0xDF, 0x9E, 0xE0, /* 0xA4-0xA7 */
        0x9E, 0xE1, 0x9E, 0xE2, 0x9E, 0xE3, 0xEC, 0xC5, /* 0xA8-0xAB */
        0xBE, 0xE6, 0xCC, 0xBF, 0xC5, 0xDA, 0xBE, 0xBC, /* 0xAC-0xAF */
        0x9E, 0xE4, 0xEC, 0xC6, 0x9E, 0xE5, 0xB1, 0xFE, /* 0xB0-0xB3 */
        0x9E, 0xE6, 0x9E, 0xE7, 0x9E, 0xE8, 0xEC, 0xC4, /* 0xB4-0xB7 */
        0xD5, 0xA8, 0xB5, 0xE3, 0x9E, 0xE9, 0xEC, 0xC2, /* 0xB8-0xBB */
        0xC1, 0xB6, 0xB3, 0xE3, 0x9E, 0xEA, 0x9E, 0xEB, /* 0xBC-0xBF */
        0xEC, 0xC3, 0xCB, 0xB8, 0xC0, 0xC3, 0xCC, 0xFE, /* 0xC0-0xC3 */
        0x9E, 0xEC, 0x9E, 0xED, 0x9E, 0xEE, 0x9E, 0xEF, /* 0xC4-0xC7 */
        0xC1, 0xD2, 0x9E, 0xF0, 0xEC, 0xC8, 0x9E, 0xF1, /* 0xC8-0xCB */
        0x9E, 0xF2, 0x9E, 0xF3, 0x9E, 0xF4, 0x9E, 0xF5, /* 0xCC-0xCF */
        0x9E, 0xF6, 0x9E, 0xF7, 0x9E, 0xF8, 0x9E, 0xF9, /* 0xD0-0xD3 */
        0x9E, 0xFA, 0x9E, 0xFB, 0x9E, 0xFC, 0x9E, 0xFD, /* 0xD4-0xD7 */
        0xBA, 0xE6, 0xC0, 0xD3, 0x9E, 0xFE, 0xD6, 0xF2, /* 0xD8-0xDB */
        0x9F, 0x40, 0x9F, 0x41, 0x9F, 0x42, 0xD1, 0xCC, /* 0xDC-0xDF */
        0x9F, 0x43, 0x9F, 0x44, 0x9F, 0x45, 0x9F, 0x46, /* 0xE0-0xE3 */
        0xBF, 0xBE, 0x9F, 0x47, 0xB7, 0xB3, 0xC9, 0xD5, /* 0xE4-0xE7 */
        0xEC, 0xC7, 0xBB, 0xE2, 0x9F, 0x48, 0xCC, 0xCC, /* 0xE8-0xEB */
        0xBD, 0xFD, 0xC8, 0xC8, 0x9F, 0x49, 0xCF, 0xA9, /* 0xEC-0xEF */
        0x9F, 0x4A, 0x9F, 0x4B, 0x9F, 0x4C, 0x9F, 0x4D, /* 0xF0-0xF3 */
        0x9F, 0x4E, 0x9F, 0x4F, 0x9F, 0x50, 0xCD, 0xE9, /* 0xF4-0xF7 */
        0x9F, 0x51, 0xC5, 0xEB, 0x9F, 0x52, 0x9F, 0x53, /* 0xF8-0xFB */
        0x9F, 0x54, 0xB7, 0xE9, 0x9F, 0x55, 0x9F, 0x56, /* 0xFC-0xFF */
};

static const unsigned char u2c_71[512] = {
        0x9F, 0x57, 0x9F, 0x58, 0x9F, 0x59, 0x9F, 0x5A, /* 0x00-0x03 */
        0x9F, 0x5B, 0x9F, 0x5C, 0x9F, 0x5D, 0x9F, 0x5E, /* 0x04-0x07 */
        0x9F, 0x5F, 0xD1, 0xC9, 0xBA, 0xB8, 0x9F, 0x60, /* 0x08-0x0B */
        0x9F, 0x61, 0x9F, 0x62, 0x9F, 0x63, 0x9F, 0x64, /* 0x0C-0x0F */
        0xEC, 0xC9, 0x9F, 0x65, 0x9F, 0x66, 0xEC, 0xCA, /* 0x10-0x13 */
        0x9F, 0x67, 0xBB, 0xC0, 0xEC, 0xCB, 0x9F, 0x68, /* 0x14-0x17 */
        0xEC, 0xE2, 0xB1, 0xBA, 0xB7, 0xD9, 0x9F, 0x69, /* 0x18-0x1B */
        0x9F, 0x6A, 0x9F, 0x6B, 0x9F, 0x6C, 0x9F, 0x6D, /* 0x1C-0x1F */
        0x9F, 0x6E, 0x9F, 0x6F, 0x9F, 0x70, 0x9F, 0x71, /* 0x20-0x23 */
        0x9F, 0x72, 0x9F, 0x73, 0xBD, 0xB9, 0x9F, 0x74, /* 0x24-0x27 */
        0x9F, 0x75, 0x9F, 0x76, 0x9F, 0x77, 0x9F, 0x78, /* 0x28-0x2B */
        0x9F, 0x79, 0x9F, 0x7A, 0x9F, 0x7B, 0xEC, 0xCC, /* 0x2C-0x2F */
        0xD1, 0xE6, 0xEC, 0xCD, 0x9F, 0x7C, 0x9F, 0x7D, /* 0x30-0x33 */
        0x9F, 0x7E, 0x9F, 0x80, 0xC8, 0xBB, 0x9F, 0x81, /* 0x34-0x37 */
        0x9F, 0x82, 0x9F, 0x83, 0x9F, 0x84, 0x9F, 0x85, /* 0x38-0x3B */
        0x9F, 0x86, 0x9F, 0x87, 0x9F, 0x88, 0x9F, 0x89, /* 0x3C-0x3F */
        0x9F, 0x8A, 0x9F, 0x8B, 0x9F, 0x8C, 0x9F, 0x8D, /* 0x40-0x43 */
        0x9F, 0x8E, 0xEC, 0xD1, 0x9F, 0x8F, 0x9F, 0x90, /* 0x44-0x47 */
        0x9F, 0x91, 0x9F, 0x92, 0xEC, 0xD3, 0x9F, 0x93, /* 0x48-0x4B */
        0xBB, 0xCD, 0x9F, 0x94, 0xBC, 0xE5, 0x9F, 0x95, /* 0x4C-0x4F */
        0x9F, 0x96, 0x9F, 0x97, 0x9F, 0x98, 0x9F, 0x99, /* 0x50-0x53 */
        0x9F, 0x9A, 0x9F, 0x9B, 0x9F, 0x9C, 0x9F, 0x9D, /* 0x54-0x57 */
        0x9F, 0x9E, 0x9F, 0x9F, 0x9F, 0xA0, 0x9F, 0xA1, /* 0x58-0x5B */
        0xEC, 0xCF, 0x9F, 0xA2, 0xC9, 0xB7, 0x9F, 0xA3, /* 0x5C-0x5F */
        0x9F, 0xA4, 0x9F, 0xA5, 0x9F, 0xA6, 0x9F, 0xA7, /* 0x60-0x63 */
        0xC3, 0xBA, 0x9F, 0xA8, 0xEC, 0xE3, 0xD5, 0xD5, /* 0x64-0x67 */
        0xEC, 0xD0, 0x9F, 0xA9, 0x9F, 0xAA, 0x9F, 0xAB, /* 0x68-0x6B */
        0x9F, 0xAC, 0x9F, 0xAD, 0xD6, 0xF3, 0x9F, 0xAE, /* 0x6C-0x6F */
        0x9F, 0xAF, 0x9F, 0xB0, 0xEC, 0xD2, 0xEC, 0xCE, /* 0x70-0x73 */
        0x9F, 0xB1, 0x9F, 0xB2, 0x9F, 0xB3, 0x9F, 0xB4, /* 0x74-0x77 */
        0xEC, 0xD4, 0x9F, 0xB5, 0xEC, 0xD5, 0x9F, 0xB6, /* 0x78-0x7B */
        0x9F, 0xB7, 0xC9, 0xBF, 0x9F, 0xB8, 0x9F, 0xB9, /* 0x7C-0x7F */
        
        0x9F, 0xBA, 0x9F, 0xBB, 0x9F, 0xBC, 0x9F, 0xBD, /* 0x80-0x83 */
        0xCF, 0xA8, 0x9F, 0xBE, 0x9F, 0xBF, 0x9F, 0xC0, /* 0x84-0x87 */
        0x9F, 0xC1, 0x9F, 0xC2, 0xD0, 0xDC, 0x9F, 0xC3, /* 0x88-0x8B */
        0x9F, 0xC4, 0x9F, 0xC5, 0x9F, 0xC6, 0xD1, 0xAC, /* 0x8C-0x8F */
        0x9F, 0xC7, 0x9F, 0xC8, 0x9F, 0xC9, 0x9F, 0xCA, /* 0x90-0x93 */
        0xC8, 0xDB, 0x9F, 0xCB, 0x9F, 0xCC, 0x9F, 0xCD, /* 0x94-0x97 */
        0xEC, 0xD6, 0xCE, 0xF5, 0x9F, 0xCE, 0x9F, 0xCF, /* 0x98-0x9B */
        0x9F, 0xD0, 0x9F, 0xD1, 0x9F, 0xD2, 0xCA, 0xEC, /* 0x9C-0x9F */
        0xEC, 0xDA, 0x9F, 0xD3, 0x9F, 0xD4, 0x9F, 0xD5, /* 0xA0-0xA3 */
        0x9F, 0xD6, 0x9F, 0xD7, 0x9F, 0xD8, 0x9F, 0xD9, /* 0xA4-0xA7 */
        0xEC, 0xD9, 0x9F, 0xDA, 0x9F, 0xDB, 0x9F, 0xDC, /* 0xA8-0xAB */
        0xB0, 0xBE, 0x9F, 0xDD, 0x9F, 0xDE, 0x9F, 0xDF, /* 0xAC-0xAF */
        0x9F, 0xE0, 0x9F, 0xE1, 0x9F, 0xE2, 0xEC, 0xD7, /* 0xB0-0xB3 */
        0x9F, 0xE3, 0xEC, 0xD8, 0x9F, 0xE4, 0x9F, 0xE5, /* 0xB4-0xB7 */
        0x9F, 0xE6, 0xEC, 0xE4, 0x9F, 0xE7, 0x9F, 0xE8, /* 0xB8-0xBB */
        0x9F, 0xE9, 0x9F, 0xEA, 0x9F, 0xEB, 0x9F, 0xEC, /* 0xBC-0xBF */
        0x9F, 0xED, 0x9F, 0xEE, 0x9F, 0xEF, 0xC8, 0xBC, /* 0xC0-0xC3 */
        0x9F, 0xF0, 0x9F, 0xF1, 0x9F, 0xF2, 0x9F, 0xF3, /* 0xC4-0xC7 */
        0x9F, 0xF4, 0x9F, 0xF5, 0x9F, 0xF6, 0x9F, 0xF7, /* 0xC8-0xCB */
        0x9F, 0xF8, 0x9F, 0xF9, 0xC1, 0xC7, 0x9F, 0xFA, /* 0xCC-0xCF */
        0x9F, 0xFB, 0x9F, 0xFC, 0x9F, 0xFD, 0x9F, 0xFE, /* 0xD0-0xD3 */
        0xEC, 0xDC, 0xD1, 0xE0, 0xA0, 0x40, 0xA0, 0x41, /* 0xD4-0xD7 */
        0xA0, 0x42, 0xA0, 0x43, 0xA0, 0x44, 0xA0, 0x45, /* 0xD8-0xDB */
        0xA0, 0x46, 0xA0, 0x47, 0xA0, 0x48, 0xA0, 0x49, /* 0xDC-0xDF */
        0xEC, 0xDB, 0xA0, 0x4A, 0xA0, 0x4B, 0xA0, 0x4C, /* 0xE0-0xE3 */
        0xA0, 0x4D, 0xD4, 0xEF, 0xA0, 0x4E, 0xEC, 0xDD, /* 0xE4-0xE7 */
        0xA0, 0x4F, 0xA0, 0x50, 0xA0, 0x51, 0xA0, 0x52, /* 0xE8-0xEB */
        0xA0, 0x53, 0xA0, 0x54, 0xDB, 0xC6, 0xA0, 0x55, /* 0xEC-0xEF */
        0xA0, 0x56, 0xA0, 0x57, 0xA0, 0x58, 0xA0, 0x59, /* 0xF0-0xF3 */
        0xA0, 0x5A, 0xA0, 0x5B, 0xA0, 0x5C, 0xA0, 0x5D, /* 0xF4-0xF7 */
        0xA0, 0x5E, 0xEC, 0xDE, 0xA0, 0x5F, 0xA0, 0x60, /* 0xF8-0xFB */
        0xA0, 0x61, 0xA0, 0x62, 0xA0, 0x63, 0xA0, 0x64, /* 0xFC-0xFF */
};

static const unsigned char u2c_72[512] = {
        0xA0, 0x65, 0xA0, 0x66, 0xA0, 0x67, 0xA0, 0x68, /* 0x00-0x03 */
        0xA0, 0x69, 0xA0, 0x6A, 0xB1, 0xAC, 0xA0, 0x6B, /* 0x04-0x07 */
        0xA0, 0x6C, 0xA0, 0x6D, 0xA0, 0x6E, 0xA0, 0x6F, /* 0x08-0x0B */
        0xA0, 0x70, 0xA0, 0x71, 0xA0, 0x72, 0xA0, 0x73, /* 0x0C-0x0F */
        0xA0, 0x74, 0xA0, 0x75, 0xA0, 0x76, 0xA0, 0x77, /* 0x10-0x13 */
        0xA0, 0x78, 0xA0, 0x79, 0xA0, 0x7A, 0xA0, 0x7B, /* 0x14-0x17 */
        0xA0, 0x7C, 0xA0, 0x7D, 0xA0, 0x7E, 0xA0, 0x80, /* 0x18-0x1B */
        0xA0, 0x81, 0xEC, 0xDF, 0xA0, 0x82, 0xA0, 0x83, /* 0x1C-0x1F */
        0xA0, 0x84, 0xA0, 0x85, 0xA0, 0x86, 0xA0, 0x87, /* 0x20-0x23 */
        0xA0, 0x88, 0xA0, 0x89, 0xA0, 0x8A, 0xA0, 0x8B, /* 0x24-0x27 */
        0xEC, 0xE0, 0xA0, 0x8C, 0xD7, 0xA6, 0xA0, 0x8D, /* 0x28-0x2B */
        0xC5, 0xC0, 0xA0, 0x8E, 0xA0, 0x8F, 0xA0, 0x90, /* 0x2C-0x2F */
        0xEB, 0xBC, 0xB0, 0xAE, 0xA0, 0x91, 0xA0, 0x92, /* 0x30-0x33 */
        0xA0, 0x93, 0xBE, 0xF4, 0xB8, 0xB8, 0xD2, 0xAF, /* 0x34-0x37 */
        0xB0, 0xD6, 0xB5, 0xF9, 0xA0, 0x94, 0xD8, 0xB3, /* 0x38-0x3B */
        0xA0, 0x95, 0xCB, 0xAC, 0xA0, 0x96, 0xE3, 0xDD, /* 0x3C-0x3F */
        0xA0, 0x97, 0xA0, 0x98, 0xA0, 0x99, 0xA0, 0x9A, /* 0x40-0x43 */
        0xA0, 0x9B, 0xA0, 0x9C, 0xA0, 0x9D, 0xC6, 0xAC, /* 0x44-0x47 */
        0xB0, 0xE6, 0xA0, 0x9E, 0xA0, 0x9F, 0xA0, 0xA0, /* 0x48-0x4B */
        0xC5, 0xC6, 0xEB, 0xB9, 0xA0, 0xA1, 0xA0, 0xA2, /* 0x4C-0x4F */
        0xA0, 0xA3, 0xA0, 0xA4, 0xEB, 0xBA, 0xA0, 0xA5, /* 0x50-0x53 */
        0xA0, 0xA6, 0xA0, 0xA7, 0xEB, 0xBB, 0xA0, 0xA8, /* 0x54-0x57 */
        0xA0, 0xA9, 0xD1, 0xC0, 0xA0, 0xAA, 0xC5, 0xA3, /* 0x58-0x5B */
        0xA0, 0xAB, 0xEA, 0xF2, 0xA0, 0xAC, 0xC4, 0xB2, /* 0x5C-0x5F */
        0xA0, 0xAD, 0xC4, 0xB5, 0xC0, 0xCE, 0xA0, 0xAE, /* 0x60-0x63 */
        0xA0, 0xAF, 0xA0, 0xB0, 0xEA, 0xF3, 0xC4, 0xC1, /* 0x64-0x67 */
        0xA0, 0xB1, 0xCE, 0xEF, 0xA0, 0xB2, 0xA0, 0xB3, /* 0x68-0x6B */
        0xA0, 0xB4, 0xA0, 0xB5, 0xEA, 0xF0, 0xEA, 0xF4, /* 0x6C-0x6F */
        0xA0, 0xB6, 0xA0, 0xB7, 0xC9, 0xFC, 0xA0, 0xB8, /* 0x70-0x73 */
        0xA0, 0xB9, 0xC7, 0xA3, 0xA0, 0xBA, 0xA0, 0xBB, /* 0x74-0x77 */
        0xA0, 0xBC, 0xCC, 0xD8, 0xCE, 0xFE, 0xA0, 0xBD, /* 0x78-0x7B */
        0xA0, 0xBE, 0xA0, 0xBF, 0xEA, 0xF5, 0xEA, 0xF6, /* 0x7C-0x7F */
        
        0xCF, 0xAC, 0xC0, 0xE7, 0xA0, 0xC0, 0xA0, 0xC1, /* 0x80-0x83 */
        0xEA, 0xF7, 0xA0, 0xC2, 0xA0, 0xC3, 0xA0, 0xC4, /* 0x84-0x87 */
        0xA0, 0xC5, 0xA0, 0xC6, 0xB6, 0xBF, 0xEA, 0xF8, /* 0x88-0x8B */
        0xA0, 0xC7, 0xEA, 0xF9, 0xA0, 0xC8, 0xEA, 0xFA, /* 0x8C-0x8F */
        0xA0, 0xC9, 0xA0, 0xCA, 0xEA, 0xFB, 0xA0, 0xCB, /* 0x90-0x93 */
        0xA0, 0xCC, 0xA0, 0xCD, 0xA0, 0xCE, 0xA0, 0xCF, /* 0x94-0x97 */
        0xA0, 0xD0, 0xA0, 0xD1, 0xA0, 0xD2, 0xA0, 0xD3, /* 0x98-0x9B */
        0xA0, 0xD4, 0xA0, 0xD5, 0xA0, 0xD6, 0xEA, 0xF1, /* 0x9C-0x9F */
        0xA0, 0xD7, 0xA0, 0xD8, 0xA0, 0xD9, 0xA0, 0xDA, /* 0xA0-0xA3 */
        0xA0, 0xDB, 0xA0, 0xDC, 0xA0, 0xDD, 0xA0, 0xDE, /* 0xA4-0xA7 */
        0xA0, 0xDF, 0xA0, 0xE0, 0xA0, 0xE1, 0xA0, 0xE2, /* 0xA8-0xAB */
        0xC8, 0xAE, 0xE1, 0xEB, 0xA0, 0xE3, 0xB7, 0xB8, /* 0xAC-0xAF */
        0xE1, 0xEC, 0xA0, 0xE4, 0xA0, 0xE5, 0xA0, 0xE6, /* 0xB0-0xB3 */
        0xE1, 0xED, 0xA0, 0xE7, 0xD7, 0xB4, 0xE1, 0xEE, /* 0xB4-0xB7 */
        0xE1, 0xEF, 0xD3, 0xCC, 0xA0, 0xE8, 0xA0, 0xE9, /* 0xB8-0xBB */
        0xA0, 0xEA, 0xA0, 0xEB, 0xA0, 0xEC, 0xA0, 0xED, /* 0xBC-0xBF */
        0xA0, 0xEE, 0xE1, 0xF1, 0xBF, 0xF1, 0xE1, 0xF0, /* 0xC0-0xC3 */
        0xB5, 0xD2, 0xA0, 0xEF, 0xA0, 0xF0, 0xA0, 0xF1, /* 0xC4-0xC7 */
        0xB1, 0xB7, 0xA0, 0xF2, 0xA0, 0xF3, 0xA0, 0xF4, /* 0xC8-0xCB */
        0xA0, 0xF5, 0xE1, 0xF3, 0xE1, 0xF2, 0xA0, 0xF6, /* 0xCC-0xCF */
        0xBA, 0xFC, 0xA0, 0xF7, 0xE1, 0xF4, 0xA0, 0xF8, /* 0xD0-0xD3 */
        0xA0, 0xF9, 0xA0, 0xFA, 0xA0, 0xFB, 0xB9, 0xB7, /* 0xD4-0xD7 */
        0xA0, 0xFC, 0xBE, 0xD1, 0xA0, 0xFD, 0xA0, 0xFE, /* 0xD8-0xDB */
        0xAA, 0x40, 0xAA, 0x41, 0xC4, 0xFC, 0xAA, 0x42, /* 0xDC-0xDF */
        0xBA, 0xDD, 0xBD, 0xC6, 0xAA, 0x43, 0xAA, 0x44, /* 0xE0-0xE3 */
        0xAA, 0x45, 0xAA, 0x46, 0xAA, 0x47, 0xAA, 0x48, /* 0xE4-0xE7 */
        0xE1, 0xF5, 0xE1, 0xF7, 0xAA, 0x49, 0xAA, 0x4A, /* 0xE8-0xEB */
        0xB6, 0xC0, 0xCF, 0xC1, 0xCA, 0xA8, 0xE1, 0xF6, /* 0xEC-0xEF */
        0xD5, 0xF8, 0xD3, 0xFC, 0xE1, 0xF8, 0xE1, 0xFC, /* 0xF0-0xF3 */
        0xE1, 0xF9, 0xAA, 0x4B, 0xAA, 0x4C, 0xE1, 0xFA, /* 0xF4-0xF7 */
        0xC0, 0xEA, 0xAA, 0x4D, 0xE1, 0xFE, 0xE2, 0xA1, /* 0xF8-0xFB */
        0xC0, 0xC7, 0xAA, 0x4E, 0xAA, 0x4F, 0xAA, 0x50, /* 0xFC-0xFF */
};

static const unsigned char u2c_73[512] = {
        0xAA, 0x51, 0xE1, 0xFB, 0xAA, 0x52, 0xE1, 0xFD, /* 0x00-0x03 */
        0xAA, 0x53, 0xAA, 0x54, 0xAA, 0x55, 0xAA, 0x56, /* 0x04-0x07 */
        0xAA, 0x57, 0xAA, 0x58, 0xE2, 0xA5, 0xAA, 0x59, /* 0x08-0x0B */
        0xAA, 0x5A, 0xAA, 0x5B, 0xC1, 0xD4, 0xAA, 0x5C, /* 0x0C-0x0F */
        0xAA, 0x5D, 0xAA, 0x5E, 0xAA, 0x5F, 0xE2, 0xA3, /* 0x10-0x13 */
        0xAA, 0x60, 0xE2, 0xA8, 0xB2, 0xFE, 0xE2, 0xA2, /* 0x14-0x17 */
        0xAA, 0x61, 0xAA, 0x62, 0xAA, 0x63, 0xC3, 0xCD, /* 0x18-0x1B */
        0xB2, 0xC2, 0xE2, 0xA7, 0xE2, 0xA6, 0xAA, 0x64, /* 0x1C-0x1F */
        0xAA, 0x65, 0xE2, 0xA4, 0xE2, 0xA9, 0xAA, 0x66, /* 0x20-0x23 */
        0xAA, 0x67, 0xE2, 0xAB, 0xAA, 0x68, 0xAA, 0x69, /* 0x24-0x27 */
        0xAA, 0x6A, 0xD0, 0xC9, 0xD6, 0xED, 0xC3, 0xA8, /* 0x28-0x2B */
        0xE2, 0xAC, 0xAA, 0x6B, 0xCF, 0xD7, 0xAA, 0x6C, /* 0x2C-0x2F */
        0xAA, 0x6D, 0xE2, 0xAE, 0xAA, 0x6E, 0xAA, 0x6F, /* 0x30-0x33 */
        0xBA, 0xEF, 0xAA, 0x70, 0xAA, 0x71, 0xE9, 0xE0, /* 0x34-0x37 */
        0xE2, 0xAD, 0xE2, 0xAA, 0xAA, 0x72, 0xAA, 0x73, /* 0x38-0x3B */
        0xAA, 0x74, 0xAA, 0x75, 0xBB, 0xAB, 0xD4, 0xB3, /* 0x3C-0x3F */
        0xAA, 0x76, 0xAA, 0x77, 0xAA, 0x78, 0xAA, 0x79, /* 0x40-0x43 */
        0xAA, 0x7A, 0xAA, 0x7B, 0xAA, 0x7C, 0xAA, 0x7D, /* 0x44-0x47 */
        0xAA, 0x7E, 0xAA, 0x80, 0xAA, 0x81, 0xAA, 0x82, /* 0x48-0x4B */
        0xAA, 0x83, 0xE2, 0xB0, 0xAA, 0x84, 0xAA, 0x85, /* 0x4C-0x4F */
        0xE2, 0xAF, 0xAA, 0x86, 0xE9, 0xE1, 0xAA, 0x87, /* 0x50-0x53 */
        0xAA, 0x88, 0xAA, 0x89, 0xAA, 0x8A, 0xE2, 0xB1, /* 0x54-0x57 */
        0xAA, 0x8B, 0xAA, 0x8C, 0xAA, 0x8D, 0xAA, 0x8E, /* 0x58-0x5B */
        0xAA, 0x8F, 0xAA, 0x90, 0xAA, 0x91, 0xAA, 0x92, /* 0x5C-0x5F */
        0xE2, 0xB2, 0xAA, 0x93, 0xAA, 0x94, 0xAA, 0x95, /* 0x60-0x63 */
        0xAA, 0x96, 0xAA, 0x97, 0xAA, 0x98, 0xAA, 0x99, /* 0x64-0x67 */
        0xAA, 0x9A, 0xAA, 0x9B, 0xAA, 0x9C, 0xAA, 0x9D, /* 0x68-0x6B */
        0xE2, 0xB3, 0xCC, 0xA1, 0xAA, 0x9E, 0xE2, 0xB4, /* 0x6C-0x6F */
        0xAA, 0x9F, 0xAA, 0xA0, 0xAB, 0x40, 0xAB, 0x41, /* 0x70-0x73 */
        0xAB, 0x42, 0xAB, 0x43, 0xAB, 0x44, 0xAB, 0x45, /* 0x74-0x77 */
        0xAB, 0x46, 0xAB, 0x47, 0xAB, 0x48, 0xAB, 0x49, /* 0x78-0x7B */
        0xAB, 0x4A, 0xAB, 0x4B, 0xE2, 0xB5, 0xAB, 0x4C, /* 0x7C-0x7F */
        
        0xAB, 0x4D, 0xAB, 0x4E, 0xAB, 0x4F, 0xAB, 0x50, /* 0x80-0x83 */
        0xD0, 0xFE, 0xAB, 0x51, 0xAB, 0x52, 0xC2, 0xCA, /* 0x84-0x87 */
        0xAB, 0x53, 0xD3, 0xF1, 0xAB, 0x54, 0xCD, 0xF5, /* 0x88-0x8B */
        0xAB, 0x55, 0xAB, 0x56, 0xE7, 0xE0, 0xAB, 0x57, /* 0x8C-0x8F */
        0xAB, 0x58, 0xE7, 0xE1, 0xAB, 0x59, 0xAB, 0x5A, /* 0x90-0x93 */
        0xAB, 0x5B, 0xAB, 0x5C, 0xBE, 0xC1, 0xAB, 0x5D, /* 0x94-0x97 */
        0xAB, 0x5E, 0xAB, 0x5F, 0xAB, 0x60, 0xC2, 0xEA, /* 0x98-0x9B */
        0xAB, 0x61, 0xAB, 0x62, 0xAB, 0x63, 0xE7, 0xE4, /* 0x9C-0x9F */
        0xAB, 0x64, 0xAB, 0x65, 0xE7, 0xE3, 0xAB, 0x66, /* 0xA0-0xA3 */
        0xAB, 0x67, 0xAB, 0x68, 0xAB, 0x69, 0xAB, 0x6A, /* 0xA4-0xA7 */
        0xAB, 0x6B, 0xCD, 0xE6, 0xAB, 0x6C, 0xC3, 0xB5, /* 0xA8-0xAB */
        0xAB, 0x6D, 0xAB, 0x6E, 0xE7, 0xE2, 0xBB, 0xB7, /* 0xAC-0xAF */
        0xCF, 0xD6, 0xAB, 0x6F, 0xC1, 0xE1, 0xE7, 0xE9, /* 0xB0-0xB3 */
        0xAB, 0x70, 0xAB, 0x71, 0xAB, 0x72, 0xE7, 0xE8, /* 0xB4-0xB7 */
        0xAB, 0x73, 0xAB, 0x74, 0xE7, 0xF4, 0xB2, 0xA3, /* 0xB8-0xBB */
        0xAB, 0x75, 0xAB, 0x76, 0xAB, 0x77, 0xAB, 0x78, /* 0xBC-0xBF */
        0xE7, 0xEA, 0xAB, 0x79, 0xE7, 0xE6, 0xAB, 0x7A, /* 0xC0-0xC3 */
        0xAB, 0x7B, 0xAB, 0x7C, 0xAB, 0x7D, 0xAB, 0x7E, /* 0xC4-0xC7 */
        0xE7, 0xEC, 0xE7, 0xEB, 0xC9, 0xBA, 0xAB, 0x80, /* 0xC8-0xCB */
        0xAB, 0x81, 0xD5, 0xE4, 0xAB, 0x82, 0xE7, 0xE5, /* 0xCC-0xCF */
        0xB7, 0xA9, 0xE7, 0xE7, 0xAB, 0x83, 0xAB, 0x84, /* 0xD0-0xD3 */
        0xAB, 0x85, 0xAB, 0x86, 0xAB, 0x87, 0xAB, 0x88, /* 0xD4-0xD7 */
        0xAB, 0x89, 0xE7, 0xEE, 0xAB, 0x8A, 0xAB, 0x8B, /* 0xD8-0xDB */
        0xAB, 0x8C, 0xAB, 0x8D, 0xE7, 0xF3, 0xAB, 0x8E, /* 0xDC-0xDF */
        0xD6, 0xE9, 0xAB, 0x8F, 0xAB, 0x90, 0xAB, 0x91, /* 0xE0-0xE3 */
        0xAB, 0x92, 0xE7, 0xED, 0xAB, 0x93, 0xE7, 0xF2, /* 0xE4-0xE7 */
        0xAB, 0x94, 0xE7, 0xF1, 0xAB, 0x95, 0xAB, 0x96, /* 0xE8-0xEB */
        0xAB, 0x97, 0xB0, 0xE0, 0xAB, 0x98, 0xAB, 0x99, /* 0xEC-0xEF */
        0xAB, 0x9A, 0xAB, 0x9B, 0xE7, 0xF5, 0xAB, 0x9C, /* 0xF0-0xF3 */
        0xAB, 0x9D, 0xAB, 0x9E, 0xAB, 0x9F, 0xAB, 0xA0, /* 0xF4-0xF7 */
        0xAC, 0x40, 0xAC, 0x41, 0xAC, 0x42, 0xAC, 0x43, /* 0xF8-0xFB */
        0xAC, 0x44, 0xAC, 0x45, 0xAC, 0x46, 0xAC, 0x47, /* 0xFC-0xFF */
};

static const unsigned char u2c_74[512] = {
        0xAC, 0x48, 0xAC, 0x49, 0xAC, 0x4A, 0xC7, 0xF2, /* 0x00-0x03 */
        0xAC, 0x4B, 0xC0, 0xC5, 0xC0, 0xED, 0xAC, 0x4C, /* 0x04-0x07 */
        0xAC, 0x4D, 0xC1, 0xF0, 0xE7, 0xF0, 0xAC, 0x4E, /* 0x08-0x0B */
        0xAC, 0x4F, 0xAC, 0x50, 0xAC, 0x51, 0xE7, 0xF6, /* 0x0C-0x0F */
        0xCB, 0xF6, 0xAC, 0x52, 0xAC, 0x53, 0xAC, 0x54, /* 0x10-0x13 */
        0xAC, 0x55, 0xAC, 0x56, 0xAC, 0x57, 0xAC, 0x58, /* 0x14-0x17 */
        0xAC, 0x59, 0xAC, 0x5A, 0xE8, 0xA2, 0xE8, 0xA1, /* 0x18-0x1B */
        0xAC, 0x5B, 0xAC, 0x5C, 0xAC, 0x5D, 0xAC, 0x5E, /* 0x1C-0x1F */
        0xAC, 0x5F, 0xAC, 0x60, 0xD7, 0xC1, 0xAC, 0x61, /* 0x20-0x23 */
        0xAC, 0x62, 0xE7, 0xFA, 0xE7, 0xF9, 0xAC, 0x63, /* 0x24-0x27 */
        0xE7, 0xFB, 0xAC, 0x64, 0xE7, 0xF7, 0xAC, 0x65, /* 0x28-0x2B */
        0xE7, 0xFE, 0xAC, 0x66, 0xE7, 0xFD, 0xAC, 0x67, /* 0x2C-0x2F */
        0xE7, 0xFC, 0xAC, 0x68, 0xAC, 0x69, 0xC1, 0xD5, /* 0x30-0x33 */
        0xC7, 0xD9, 0xC5, 0xFD, 0xC5, 0xC3, 0xAC, 0x6A, /* 0x34-0x37 */
        0xAC, 0x6B, 0xAC, 0x6C, 0xAC, 0x6D, 0xAC, 0x6E, /* 0x38-0x3B */
        0xC7, 0xED, 0xAC, 0x6F, 0xAC, 0x70, 0xAC, 0x71, /* 0x3C-0x3F */
        0xAC, 0x72, 0xE8, 0xA3, 0xAC, 0x73, 0xAC, 0x74, /* 0x40-0x43 */
        0xAC, 0x75, 0xAC, 0x76, 0xAC, 0x77, 0xAC, 0x78, /* 0x44-0x47 */
        0xAC, 0x79, 0xAC, 0x7A, 0xAC, 0x7B, 0xAC, 0x7C, /* 0x48-0x4B */
        0xAC, 0x7D, 0xAC, 0x7E, 0xAC, 0x80, 0xAC, 0x81, /* 0x4C-0x4F */
        0xAC, 0x82, 0xAC, 0x83, 0xAC, 0x84, 0xAC, 0x85, /* 0x50-0x53 */
        0xAC, 0x86, 0xE8, 0xA6, 0xAC, 0x87, 0xE8, 0xA5, /* 0x54-0x57 */
        0xAC, 0x88, 0xE8, 0xA7, 0xBA, 0xF7, 0xE7, 0xF8, /* 0x58-0x5B */
        0xE8, 0xA4, 0xAC, 0x89, 0xC8, 0xF0, 0xC9, 0xAA, /* 0x5C-0x5F */
        0xAC, 0x8A, 0xAC, 0x8B, 0xAC, 0x8C, 0xAC, 0x8D, /* 0x60-0x63 */
        0xAC, 0x8E, 0xAC, 0x8F, 0xAC, 0x90, 0xAC, 0x91, /* 0x64-0x67 */
        0xAC, 0x92, 0xAC, 0x93, 0xAC, 0x94, 0xAC, 0x95, /* 0x68-0x6B */
        0xAC, 0x96, 0xE8, 0xA9, 0xAC, 0x97, 0xAC, 0x98, /* 0x6C-0x6F */
        0xB9, 0xE5, 0xAC, 0x99, 0xAC, 0x9A, 0xAC, 0x9B, /* 0x70-0x73 */
        0xAC, 0x9C, 0xAC, 0x9D, 0xD1, 0xFE, 0xE8, 0xA8, /* 0x74-0x77 */
        0xAC, 0x9E, 0xAC, 0x9F, 0xAC, 0xA0, 0xAD, 0x40, /* 0x78-0x7B */
        0xAD, 0x41, 0xAD, 0x42, 0xE8, 0xAA, 0xAD, 0x43, /* 0x7C-0x7F */
        
        0xE8, 0xAD, 0xE8, 0xAE, 0xAD, 0x44, 0xC1, 0xA7, /* 0x80-0x83 */
        0xAD, 0x45, 0xAD, 0x46, 0xAD, 0x47, 0xE8, 0xAF, /* 0x84-0x87 */
        0xAD, 0x48, 0xAD, 0x49, 0xAD, 0x4A, 0xE8, 0xB0, /* 0x88-0x8B */
        0xAD, 0x4B, 0xAD, 0x4C, 0xE8, 0xAC, 0xAD, 0x4D, /* 0x8C-0x8F */
        0xE8, 0xB4, 0xAD, 0x4E, 0xAD, 0x4F, 0xAD, 0x50, /* 0x90-0x93 */
        0xAD, 0x51, 0xAD, 0x52, 0xAD, 0x53, 0xAD, 0x54, /* 0x94-0x97 */
        0xAD, 0x55, 0xAD, 0x56, 0xAD, 0x57, 0xAD, 0x58, /* 0x98-0x9B */
        0xE8, 0xAB, 0xAD, 0x59, 0xE8, 0xB1, 0xAD, 0x5A, /* 0x9C-0x9F */
        0xAD, 0x5B, 0xAD, 0x5C, 0xAD, 0x5D, 0xAD, 0x5E, /* 0xA0-0xA3 */
        0xAD, 0x5F, 0xAD, 0x60, 0xAD, 0x61, 0xE8, 0xB5, /* 0xA4-0xA7 */
        0xE8, 0xB2, 0xE8, 0xB3, 0xAD, 0x62, 0xAD, 0x63, /* 0xA8-0xAB */
        0xAD, 0x64, 0xAD, 0x65, 0xAD, 0x66, 0xAD, 0x67, /* 0xAC-0xAF */
        0xAD, 0x68, 0xAD, 0x69, 0xAD, 0x6A, 0xAD, 0x6B, /* 0xB0-0xB3 */
        0xAD, 0x6C, 0xAD, 0x6D, 0xAD, 0x6E, 0xAD, 0x6F, /* 0xB4-0xB7 */
        0xAD, 0x70, 0xAD, 0x71, 0xE8, 0xB7, 0xAD, 0x72, /* 0xB8-0xBB */
        0xAD, 0x73, 0xAD, 0x74, 0xAD, 0x75, 0xAD, 0x76, /* 0xBC-0xBF */
        0xAD, 0x77, 0xAD, 0x78, 0xAD, 0x79, 0xAD, 0x7A, /* 0xC0-0xC3 */
        0xAD, 0x7B, 0xAD, 0x7C, 0xAD, 0x7D, 0xAD, 0x7E, /* 0xC4-0xC7 */
        0xAD, 0x80, 0xAD, 0x81, 0xAD, 0x82, 0xAD, 0x83, /* 0xC8-0xCB */
        0xAD, 0x84, 0xAD, 0x85, 0xAD, 0x86, 0xAD, 0x87, /* 0xCC-0xCF */
        0xAD, 0x88, 0xAD, 0x89, 0xE8, 0xB6, 0xAD, 0x8A, /* 0xD0-0xD3 */
        0xAD, 0x8B, 0xAD, 0x8C, 0xAD, 0x8D, 0xAD, 0x8E, /* 0xD4-0xD7 */
        0xAD, 0x8F, 0xAD, 0x90, 0xAD, 0x91, 0xAD, 0x92, /* 0xD8-0xDB */
        0xB9, 0xCF, 0xAD, 0x93, 0xF0, 0xAC, 0xAD, 0x94, /* 0xDC-0xDF */
        0xF0, 0xAD, 0xAD, 0x95, 0xC6, 0xB0, 0xB0, 0xEA, /* 0xE0-0xE3 */
        0xC8, 0xBF, 0xAD, 0x96, 0xCD, 0xDF, 0xAD, 0x97, /* 0xE4-0xE7 */
        0xAD, 0x98, 0xAD, 0x99, 0xAD, 0x9A, 0xAD, 0x9B, /* 0xE8-0xEB */
        0xAD, 0x9C, 0xAD, 0x9D, 0xCE, 0xCD, 0xEA, 0xB1, /* 0xEC-0xEF */
        0xAD, 0x9E, 0xAD, 0x9F, 0xAD, 0xA0, 0xAE, 0x40, /* 0xF0-0xF3 */
        0xEA, 0xB2, 0xAE, 0x41, 0xC6, 0xBF, 0xB4, 0xC9, /* 0xF4-0xF7 */
        0xAE, 0x42, 0xAE, 0x43, 0xAE, 0x44, 0xAE, 0x45, /* 0xF8-0xFB */
        0xAE, 0x46, 0xAE, 0x47, 0xAE, 0x48, 0xEA, 0xB3, /* 0xFC-0xFF */
};

static const unsigned char u2c_75[512] = {
        0xAE, 0x49, 0xAE, 0x4A, 0xAE, 0x4B, 0xAE, 0x4C, /* 0x00-0x03 */
        0xD5, 0xE7, 0xAE, 0x4D, 0xAE, 0x4E, 0xAE, 0x4F, /* 0x04-0x07 */
        0xAE, 0x50, 0xAE, 0x51, 0xAE, 0x52, 0xAE, 0x53, /* 0x08-0x0B */
        0xAE, 0x54, 0xDD, 0xF9, 0xAE, 0x55, 0xEA, 0xB4, /* 0x0C-0x0F */
        0xAE, 0x56, 0xEA, 0xB5, 0xAE, 0x57, 0xEA, 0xB6, /* 0x10-0x13 */
        0xAE, 0x58, 0xAE, 0x59, 0xAE, 0x5A, 0xAE, 0x5B, /* 0x14-0x17 */
        0xB8, 0xCA, 0xDF, 0xB0, 0xC9, 0xF5, 0xAE, 0x5C, /* 0x18-0x1B */
        0xCC, 0xF0, 0xAE, 0x5D, 0xAE, 0x5E, 0xC9, 0xFA, /* 0x1C-0x1F */
        0xAE, 0x5F, 0xAE, 0x60, 0xAE, 0x61, 0xAE, 0x62, /* 0x20-0x23 */
        0xAE, 0x63, 0xC9, 0xFB, 0xAE, 0x64, 0xAE, 0x65, /* 0x24-0x27 */
        0xD3, 0xC3, 0xCB, 0xA6, 0xAE, 0x66, 0xB8, 0xA6, /* 0x28-0x2B */
        0xF0, 0xAE, 0xB1, 0xC2, 0xAE, 0x67, 0xE5, 0xB8, /* 0x2C-0x2F */
        0xCC, 0xEF, 0xD3, 0xC9, 0xBC, 0xD7, 0xC9, 0xEA, /* 0x30-0x33 */
        0xAE, 0x68, 0xB5, 0xE7, 0xAE, 0x69, 0xC4, 0xD0, /* 0x34-0x37 */
        0xB5, 0xE9, 0xAE, 0x6A, 0xEE, 0xAE, 0xBB, 0xAD, /* 0x38-0x3B */
        0xAE, 0x6B, 0xAE, 0x6C, 0xE7, 0xDE, 0xAE, 0x6D, /* 0x3C-0x3F */
        0xEE, 0xAF, 0xAE, 0x6E, 0xAE, 0x6F, 0xAE, 0x70, /* 0x40-0x43 */
        0xAE, 0x71, 0xB3, 0xA9, 0xAE, 0x72, 0xAE, 0x73, /* 0x44-0x47 */
        0xEE, 0xB2, 0xAE, 0x74, 0xAE, 0x75, 0xEE, 0xB1, /* 0x48-0x4B */
        0xBD, 0xE7, 0xAE, 0x76, 0xEE, 0xB0, 0xCE, 0xB7, /* 0x4C-0x4F */
        0xAE, 0x77, 0xAE, 0x78, 0xAE, 0x79, 0xAE, 0x7A, /* 0x50-0x53 */
        0xC5, 0xCF, 0xAE, 0x7B, 0xAE, 0x7C, 0xAE, 0x7D, /* 0x54-0x57 */
        0xAE, 0x7E, 0xC1, 0xF4, 0xDB, 0xCE, 0xEE, 0xB3, /* 0x58-0x5B */
        0xD0, 0xF3, 0xAE, 0x80, 0xAE, 0x81, 0xAE, 0x82, /* 0x5C-0x5F */
        0xAE, 0x83, 0xAE, 0x84, 0xAE, 0x85, 0xAE, 0x86, /* 0x60-0x63 */
        0xAE, 0x87, 0xC2, 0xD4, 0xC6, 0xE8, 0xAE, 0x88, /* 0x64-0x67 */
        0xAE, 0x89, 0xAE, 0x8A, 0xB7, 0xAC, 0xAE, 0x8B, /* 0x68-0x6B */
        0xAE, 0x8C, 0xAE, 0x8D, 0xAE, 0x8E, 0xAE, 0x8F, /* 0x6C-0x6F */
        0xAE, 0x90, 0xAE, 0x91, 0xEE, 0xB4, 0xAE, 0x92, /* 0x70-0x73 */
        0xB3, 0xEB, 0xAE, 0x93, 0xAE, 0x94, 0xAE, 0x95, /* 0x74-0x77 */
        0xBB, 0xFB, 0xEE, 0xB5, 0xAE, 0x96, 0xAE, 0x97, /* 0x78-0x7B */
        0xAE, 0x98, 0xAE, 0x99, 0xAE, 0x9A, 0xE7, 0xDC, /* 0x7C-0x7F */
        
        0xAE, 0x9B, 0xAE, 0x9C, 0xAE, 0x9D, 0xEE, 0xB6, /* 0x80-0x83 */
        0xAE, 0x9E, 0xAE, 0x9F, 0xBD, 0xAE, 0xAE, 0xA0, /* 0x84-0x87 */
        0xAF, 0x40, 0xAF, 0x41, 0xAF, 0x42, 0xF1, 0xE2, /* 0x88-0x8B */
        0xAF, 0x43, 0xAF, 0x44, 0xAF, 0x45, 0xCA, 0xE8, /* 0x8C-0x8F */
        0xAF, 0x46, 0xD2, 0xC9, 0xF0, 0xDA, 0xAF, 0x47, /* 0x90-0x93 */
        0xF0, 0xDB, 0xAF, 0x48, 0xF0, 0xDC, 0xC1, 0xC6, /* 0x94-0x97 */
        0xAF, 0x49, 0xB8, 0xED, 0xBE, 0xCE, 0xAF, 0x4A, /* 0x98-0x9B */
        0xAF, 0x4B, 0xF0, 0xDE, 0xAF, 0x4C, 0xC5, 0xB1, /* 0x9C-0x9F */
        0xF0, 0xDD, 0xD1, 0xF1, 0xAF, 0x4D, 0xF0, 0xE0, /* 0xA0-0xA3 */
        0xB0, 0xCC, 0xBD, 0xEA, 0xAF, 0x4E, 0xAF, 0x4F, /* 0xA4-0xA7 */
        0xAF, 0x50, 0xAF, 0x51, 0xAF, 0x52, 0xD2, 0xDF, /* 0xA8-0xAB */
        0xF0, 0xDF, 0xAF, 0x53, 0xB4, 0xAF, 0xB7, 0xE8, /* 0xAC-0xAF */
        0xF0, 0xE6, 0xF0, 0xE5, 0xC6, 0xA3, 0xF0, 0xE1, /* 0xB0-0xB3 */
        0xF0, 0xE2, 0xB4, 0xC3, 0xAF, 0x54, 0xAF, 0x55, /* 0xB4-0xB7 */
        0xF0, 0xE3, 0xD5, 0xEE, 0xAF, 0x56, 0xAF, 0x57, /* 0xB8-0xBB */
        0xCC, 0xDB, 0xBE, 0xD2, 0xBC, 0xB2, 0xAF, 0x58, /* 0xBC-0xBF */
        0xAF, 0x59, 0xAF, 0x5A, 0xF0, 0xE8, 0xF0, 0xE7, /* 0xC0-0xC3 */
        0xF0, 0xE4, 0xB2, 0xA1, 0xAF, 0x5B, 0xD6, 0xA2, /* 0xC4-0xC7 */
        0xD3, 0xB8, 0xBE, 0xB7, 0xC8, 0xAC, 0xAF, 0x5C, /* 0xC8-0xCB */
        0xAF, 0x5D, 0xF0, 0xEA, 0xAF, 0x5E, 0xAF, 0x5F, /* 0xCC-0xCF */
        0xAF, 0x60, 0xAF, 0x61, 0xD1, 0xF7, 0xAF, 0x62, /* 0xD0-0xD3 */
        0xD6, 0xCC, 0xBA, 0xDB, 0xF0, 0xE9, 0xAF, 0x63, /* 0xD4-0xD7 */
        0xB6, 0xBB, 0xAF, 0x64, 0xAF, 0x65, 0xCD, 0xB4, /* 0xD8-0xDB */
        0xAF, 0x66, 0xAF, 0x67, 0xC6, 0xA6, 0xAF, 0x68, /* 0xDC-0xDF */
        0xAF, 0x69, 0xAF, 0x6A, 0xC1, 0xA1, 0xF0, 0xEB, /* 0xE0-0xE3 */
        0xF0, 0xEE, 0xAF, 0x6B, 0xF0, 0xED, 0xF0, 0xF0, /* 0xE4-0xE7 */
        0xF0, 0xEC, 0xAF, 0x6C, 0xBB, 0xBE, 0xF0, 0xEF, /* 0xE8-0xEB */
        0xAF, 0x6D, 0xAF, 0x6E, 0xAF, 0x6F, 0xAF, 0x70, /* 0xEC-0xEF */
        0xCC, 0xB5, 0xF0, 0xF2, 0xAF, 0x71, 0xAF, 0x72, /* 0xF0-0xF3 */
        0xB3, 0xD5, 0xAF, 0x73, 0xAF, 0x74, 0xAF, 0x75, /* 0xF4-0xF7 */
        0xAF, 0x76, 0xB1, 0xD4, 0xAF, 0x77, 0xAF, 0x78, /* 0xF8-0xFB */
        0xF0, 0xF3, 0xAF, 0x79, 0xAF, 0x7A, 0xF0, 0xF4, /* 0xFC-0xFF */
};

static const unsigned char u2c_76[512] = {
        0xF0, 0xF6, 0xB4, 0xE1, 0xAF, 0x7B, 0xF0, 0xF1, /* 0x00-0x03 */
        0xAF, 0x7C, 0xF0, 0xF7, 0xAF, 0x7D, 0xAF, 0x7E, /* 0x04-0x07 */
        0xAF, 0x80, 0xAF, 0x81, 0xF0, 0xFA, 0xAF, 0x82, /* 0x08-0x0B */
        0xF0, 0xF8, 0xAF, 0x83, 0xAF, 0x84, 0xAF, 0x85, /* 0x0C-0x0F */
        0xF0, 0xF5, 0xAF, 0x86, 0xAF, 0x87, 0xAF, 0x88, /* 0x10-0x13 */
        0xAF, 0x89, 0xF0, 0xFD, 0xAF, 0x8A, 0xF0, 0xF9, /* 0x14-0x17 */
        0xF0, 0xFC, 0xF0, 0xFE, 0xAF, 0x8B, 0xF1, 0xA1, /* 0x18-0x1B */
        0xAF, 0x8C, 0xAF, 0x8D, 0xAF, 0x8E, 0xCE, 0xC1, /* 0x1C-0x1F */
        0xF1, 0xA4, 0xAF, 0x8F, 0xF1, 0xA3, 0xAF, 0x90, /* 0x20-0x23 */
        0xC1, 0xF6, 0xF0, 0xFB, 0xCA, 0xDD, 0xAF, 0x91, /* 0x24-0x27 */
        0xAF, 0x92, 0xB4, 0xF1, 0xB1, 0xF1, 0xCC, 0xB1, /* 0x28-0x2B */
        0xAF, 0x93, 0xF1, 0xA6, 0xAF, 0x94, 0xAF, 0x95, /* 0x2C-0x2F */
        0xF1, 0xA7, 0xAF, 0x96, 0xAF, 0x97, 0xF1, 0xAC, /* 0x30-0x33 */
        0xD5, 0xCE, 0xF1, 0xA9, 0xAF, 0x98, 0xAF, 0x99, /* 0x34-0x37 */
        0xC8, 0xB3, 0xAF, 0x9A, 0xAF, 0x9B, 0xAF, 0x9C, /* 0x38-0x3B */
        0xF1, 0xA2, 0xAF, 0x9D, 0xF1, 0xAB, 0xF1, 0xA8, /* 0x3C-0x3F */
        0xF1, 0xA5, 0xAF, 0x9E, 0xAF, 0x9F, 0xF1, 0xAA, /* 0x40-0x43 */
        0xAF, 0xA0, 0xB0, 0x40, 0xB0, 0x41, 0xB0, 0x42, /* 0x44-0x47 */
        0xB0, 0x43, 0xB0, 0x44, 0xB0, 0x45, 0xB0, 0x46, /* 0x48-0x4B */
        0xB0, 0xA9, 0xF1, 0xAD, 0xB0, 0x47, 0xB0, 0x48, /* 0x4C-0x4F */
        0xB0, 0x49, 0xB0, 0x4A, 0xB0, 0x4B, 0xB0, 0x4C, /* 0x50-0x53 */
        0xF1, 0xAF, 0xB0, 0x4D, 0xF1, 0xB1, 0xB0, 0x4E, /* 0x54-0x57 */
        0xB0, 0x4F, 0xB0, 0x50, 0xB0, 0x51, 0xB0, 0x52, /* 0x58-0x5B */
        0xF1, 0xB0, 0xB0, 0x53, 0xF1, 0xAE, 0xB0, 0x54, /* 0x5C-0x5F */
        0xB0, 0x55, 0xB0, 0x56, 0xB0, 0x57, 0xD1, 0xA2, /* 0x60-0x63 */
        0xB0, 0x58, 0xB0, 0x59, 0xB0, 0x5A, 0xB0, 0x5B, /* 0x64-0x67 */
        0xB0, 0x5C, 0xB0, 0x5D, 0xB0, 0x5E, 0xF1, 0xB2, /* 0x68-0x6B */
        0xB0, 0x5F, 0xB0, 0x60, 0xB0, 0x61, 0xF1, 0xB3, /* 0x6C-0x6F */
        0xB0, 0x62, 0xB0, 0x63, 0xB0, 0x64, 0xB0, 0x65, /* 0x70-0x73 */
        0xB0, 0x66, 0xB0, 0x67, 0xB0, 0x68, 0xB0, 0x69, /* 0x74-0x77 */
        0xB9, 0xEF, 0xB0, 0x6A, 0xB0, 0x6B, 0xB5, 0xC7, /* 0x78-0x7B */
        0xB0, 0x6C, 0xB0, 0xD7, 0xB0, 0xD9, 0xB0, 0x6D, /* 0x7C-0x7F */
        
        0xB0, 0x6E, 0xB0, 0x6F, 0xD4, 0xED, 0xB0, 0x70, /* 0x80-0x83 */
        0xB5, 0xC4, 0xB0, 0x71, 0xBD, 0xD4, 0xBB, 0xCA, /* 0x84-0x87 */
        0xF0, 0xA7, 0xB0, 0x72, 0xB0, 0x73, 0xB8, 0xDE, /* 0x88-0x8B */
        0xB0, 0x74, 0xB0, 0x75, 0xF0, 0xA8, 0xB0, 0x76, /* 0x8C-0x8F */
        0xB0, 0x77, 0xB0, 0xA8, 0xB0, 0x78, 0xF0, 0xA9, /* 0x90-0x93 */
        0xB0, 0x79, 0xB0, 0x7A, 0xCD, 0xEE, 0xB0, 0x7B, /* 0x94-0x97 */
        0xB0, 0x7C, 0xF0, 0xAA, 0xB0, 0x7D, 0xB0, 0x7E, /* 0x98-0x9B */
        0xB0, 0x80, 0xB0, 0x81, 0xB0, 0x82, 0xB0, 0x83, /* 0x9C-0x9F */
        0xB0, 0x84, 0xB0, 0x85, 0xB0, 0x86, 0xB0, 0x87, /* 0xA0-0xA3 */
        0xF0, 0xAB, 0xB0, 0x88, 0xB0, 0x89, 0xB0, 0x8A, /* 0xA4-0xA7 */
        0xB0, 0x8B, 0xB0, 0x8C, 0xB0, 0x8D, 0xB0, 0x8E, /* 0xA8-0xAB */
        0xB0, 0x8F, 0xB0, 0x90, 0xC6, 0xA4, 0xB0, 0x91, /* 0xAC-0xAF */
        0xB0, 0x92, 0xD6, 0xE5, 0xF1, 0xE4, 0xB0, 0x93, /* 0xB0-0xB3 */
        0xF1, 0xE5, 0xB0, 0x94, 0xB0, 0x95, 0xB0, 0x96, /* 0xB4-0xB7 */
        0xB0, 0x97, 0xB0, 0x98, 0xB0, 0x99, 0xB0, 0x9A, /* 0xB8-0xBB */
        0xB0, 0x9B, 0xB0, 0x9C, 0xB0, 0x9D, 0xC3, 0xF3, /* 0xBC-0xBF */
        0xB0, 0x9E, 0xB0, 0x9F, 0xD3, 0xDB, 0xB0, 0xA0, /* 0xC0-0xC3 */
        0xB1, 0x40, 0xD6, 0xD1, 0xC5, 0xE8, 0xB1, 0x41, /* 0xC4-0xC7 */
        0xD3, 0xAF, 0xB1, 0x42, 0xD2, 0xE6, 0xB1, 0x43, /* 0xC8-0xCB */
        0xB1, 0x44, 0xEE, 0xC1, 0xB0, 0xBB, 0xD5, 0xB5, /* 0xCC-0xCF */
        0xD1, 0xCE, 0xBC, 0xE0, 0xBA, 0xD0, 0xB1, 0x45, /* 0xD0-0xD3 */
        0xBF, 0xF8, 0xB1, 0x46, 0xB8, 0xC7, 0xB5, 0xC1, /* 0xD4-0xD7 */
        0xC5, 0xCC, 0xB1, 0x47, 0xB1, 0x48, 0xCA, 0xA2, /* 0xD8-0xDB */
        0xB1, 0x49, 0xB1, 0x4A, 0xB1, 0x4B, 0xC3, 0xCB, /* 0xDC-0xDF */
        0xB1, 0x4C, 0xB1, 0x4D, 0xB1, 0x4E, 0xB1, 0x4F, /* 0xE0-0xE3 */
        0xB1, 0x50, 0xEE, 0xC2, 0xB1, 0x51, 0xB1, 0x52, /* 0xE4-0xE7 */
        0xB1, 0x53, 0xB1, 0x54, 0xB1, 0x55, 0xB1, 0x56, /* 0xE8-0xEB */
        0xB1, 0x57, 0xB1, 0x58, 0xC4, 0xBF, 0xB6, 0xA2, /* 0xEC-0xEF */
        0xB1, 0x59, 0xED, 0xEC, 0xC3, 0xA4, 0xB1, 0x5A, /* 0xF0-0xF3 */
        0xD6, 0xB1, 0xB1, 0x5B, 0xB1, 0x5C, 0xB1, 0x5D, /* 0xF4-0xF7 */
        0xCF, 0xE0, 0xED, 0xEF, 0xB1, 0x5E, 0xB1, 0x5F, /* 0xF8-0xFB */
        0xC5, 0xCE, 0xB1, 0x60, 0xB6, 0xDC, 0xB1, 0x61, /* 0xFC-0xFF */
};

static const unsigned char u2c_77[512] = {
        0xB1, 0x62, 0xCA, 0xA1, 0xB1, 0x63, 0xB1, 0x64, /* 0x00-0x03 */
        0xED, 0xED, 0xB1, 0x65, 0xB1, 0x66, 0xED, 0xF0, /* 0x04-0x07 */
        0xED, 0xF1, 0xC3, 0xBC, 0xB1, 0x67, 0xBF, 0xB4, /* 0x08-0x0B */
        0xB1, 0x68, 0xED, 0xEE, 0xB1, 0x69, 0xB1, 0x6A, /* 0x0C-0x0F */
        0xB1, 0x6B, 0xB1, 0x6C, 0xB1, 0x6D, 0xB1, 0x6E, /* 0x10-0x13 */
        0xB1, 0x6F, 0xB1, 0x70, 0xB1, 0x71, 0xB1, 0x72, /* 0x14-0x17 */
        0xB1, 0x73, 0xED, 0xF4, 0xED, 0xF2, 0xB1, 0x74, /* 0x18-0x1B */
        0xB1, 0x75, 0xB1, 0x76, 0xB1, 0x77, 0xD5, 0xE6, /* 0x1C-0x1F */
        0xC3, 0xDF, 0xB1, 0x78, 0xED, 0xF3, 0xB1, 0x79, /* 0x20-0x23 */
        0xB1, 0x7A, 0xB1, 0x7B, 0xED, 0xF6, 0xB1, 0x7C, /* 0x24-0x27 */
        0xD5, 0xA3, 0xD1, 0xA3, 0xB1, 0x7D, 0xB1, 0x7E, /* 0x28-0x2B */
        0xB1, 0x80, 0xED, 0xF5, 0xB1, 0x81, 0xC3, 0xD0, /* 0x2C-0x2F */
        0xB1, 0x82, 0xB1, 0x83, 0xB1, 0x84, 0xB1, 0x85, /* 0x30-0x33 */
        0xB1, 0x86, 0xED, 0xF7, 0xBF, 0xF4, 0xBE, 0xEC, /* 0x34-0x37 */
        0xED, 0xF8, 0xB1, 0x87, 0xCC, 0xF7, 0xB1, 0x88, /* 0x38-0x3B */
        0xD1, 0xDB, 0xB1, 0x89, 0xB1, 0x8A, 0xB1, 0x8B, /* 0x3C-0x3F */
        0xD7, 0xC5, 0xD5, 0xF6, 0xB1, 0x8C, 0xED, 0xFC, /* 0x40-0x43 */
        0xB1, 0x8D, 0xB1, 0x8E, 0xB1, 0x8F, 0xED, 0xFB, /* 0x44-0x47 */
        0xB1, 0x90, 0xB1, 0x91, 0xB1, 0x92, 0xB1, 0x93, /* 0x48-0x4B */
        0xB1, 0x94, 0xB1, 0x95, 0xB1, 0x96, 0xB1, 0x97, /* 0x4C-0x4F */
        0xED, 0xF9, 0xED, 0xFA, 0xB1, 0x98, 0xB1, 0x99, /* 0x50-0x53 */
        0xB1, 0x9A, 0xB1, 0x9B, 0xB1, 0x9C, 0xB1, 0x9D, /* 0x54-0x57 */
        0xB1, 0x9E, 0xB1, 0x9F, 0xED, 0xFD, 0xBE, 0xA6, /* 0x58-0x5B */
        0xB1, 0xA0, 0xB2, 0x40, 0xB2, 0x41, 0xB2, 0x42, /* 0x5C-0x5F */
        0xB2, 0x43, 0xCB, 0xAF, 0xEE, 0xA1, 0xB6, 0xBD, /* 0x60-0x63 */
        0xB2, 0x44, 0xEE, 0xA2, 0xC4, 0xC0, 0xB2, 0x45, /* 0x64-0x67 */
        0xED, 0xFE, 0xB2, 0x46, 0xB2, 0x47, 0xBD, 0xDE, /* 0x68-0x6B */
        0xB2, 0xC7, 0xB2, 0x48, 0xB2, 0x49, 0xB2, 0x4A, /* 0x6C-0x6F */
        0xB2, 0x4B, 0xB2, 0x4C, 0xB2, 0x4D, 0xB2, 0x4E, /* 0x70-0x73 */
        0xB2, 0x4F, 0xB2, 0x50, 0xB2, 0x51, 0xB2, 0x52, /* 0x74-0x77 */
        0xB2, 0x53, 0xB6, 0xC3, 0xB2, 0x54, 0xB2, 0x55, /* 0x78-0x7B */
        0xB2, 0x56, 0xEE, 0xA5, 0xD8, 0xBA, 0xEE, 0xA3, /* 0x7C-0x7F */
        
        0xEE, 0xA6, 0xB2, 0x57, 0xB2, 0x58, 0xB2, 0x59, /* 0x80-0x83 */
        0xC3, 0xE9, 0xB3, 0xF2, 0xB2, 0x5A, 0xB2, 0x5B, /* 0x84-0x87 */
        0xB2, 0x5C, 0xB2, 0x5D, 0xB2, 0x5E, 0xB2, 0x5F, /* 0x88-0x8B */
        0xEE, 0xA7, 0xEE, 0xA4, 0xCF, 0xB9, 0xB2, 0x60, /* 0x8C-0x8F */
        0xB2, 0x61, 0xEE, 0xA8, 0xC2, 0xF7, 0xB2, 0x62, /* 0x90-0x93 */
        0xB2, 0x63, 0xB2, 0x64, 0xB2, 0x65, 0xB2, 0x66, /* 0x94-0x97 */
        0xB2, 0x67, 0xB2, 0x68, 0xB2, 0x69, 0xB2, 0x6A, /* 0x98-0x9B */
        0xB2, 0x6B, 0xB2, 0x6C, 0xB2, 0x6D, 0xEE, 0xA9, /* 0x9C-0x9F */
        0xEE, 0xAA, 0xB2, 0x6E, 0xDE, 0xAB, 0xB2, 0x6F, /* 0xA0-0xA3 */
        0xB2, 0x70, 0xC6, 0xB3, 0xB2, 0x71, 0xC7, 0xC6, /* 0xA4-0xA7 */
        0xB2, 0x72, 0xD6, 0xF5, 0xB5, 0xC9, 0xB2, 0x73, /* 0xA8-0xAB */
        0xCB, 0xB2, 0xB2, 0x74, 0xB2, 0x75, 0xB2, 0x76, /* 0xAC-0xAF */
        0xEE, 0xAB, 0xB2, 0x77, 0xB2, 0x78, 0xCD, 0xAB, /* 0xB0-0xB3 */
        0xB2, 0x79, 0xEE, 0xAC, 0xB2, 0x7A, 0xB2, 0x7B, /* 0xB4-0xB7 */
        0xB2, 0x7C, 0xB2, 0x7D, 0xB2, 0x7E, 0xD5, 0xB0, /* 0xB8-0xBB */
        0xB2, 0x80, 0xEE, 0xAD, 0xB2, 0x81, 0xF6, 0xC4, /* 0xBC-0xBF */
        0xB2, 0x82, 0xB2, 0x83, 0xB2, 0x84, 0xB2, 0x85, /* 0xC0-0xC3 */
        0xB2, 0x86, 0xB2, 0x87, 0xB2, 0x88, 0xB2, 0x89, /* 0xC4-0xC7 */
        0xB2, 0x8A, 0xB2, 0x8B, 0xB2, 0x8C, 0xB2, 0x8D, /* 0xC8-0xCB */
        0xB2, 0x8E, 0xDB, 0xC7, 0xB2, 0x8F, 0xB2, 0x90, /* 0xCC-0xCF */
        0xB2, 0x91, 0xB2, 0x92, 0xB2, 0x93, 0xB2, 0x94, /* 0xD0-0xD3 */
        0xB2, 0x95, 0xB2, 0x96, 0xB2, 0x97, 0xB4, 0xA3, /* 0xD4-0xD7 */
        0xB2, 0x98, 0xB2, 0x99, 0xB2, 0x9A, 0xC3, 0xAC, /* 0xD8-0xDB */
        0xF1, 0xE6, 0xB2, 0x9B, 0xB2, 0x9C, 0xB2, 0x9D, /* 0xDC-0xDF */
        0xB2, 0x9E, 0xB2, 0x9F, 0xCA, 0xB8, 0xD2, 0xD3, /* 0xE0-0xE3 */
        0xB2, 0xA0, 0xD6, 0xAA, 0xB3, 0x40, 0xEF, 0xF2, /* 0xE4-0xE7 */
        0xB3, 0x41, 0xBE, 0xD8, 0xB3, 0x42, 0xBD, 0xC3, /* 0xE8-0xEB */
        0xEF, 0xF3, 0xB6, 0xCC, 0xB0, 0xAB, 0xB3, 0x43, /* 0xEC-0xEF */
        0xB3, 0x44, 0xB3, 0x45, 0xB3, 0x46, 0xCA, 0xAF, /* 0xF0-0xF3 */
        0xB3, 0x47, 0xB3, 0x48, 0xED, 0xB6, 0xB3, 0x49, /* 0xF4-0xF7 */
        0xED, 0xB7, 0xB3, 0x4A, 0xB3, 0x4B, 0xB3, 0x4C, /* 0xF8-0xFB */
        0xB3, 0x4D, 0xCE, 0xF9, 0xB7, 0xAF, 0xBF, 0xF3, /* 0xFC-0xFF */
};

static const unsigned char u2c_78[512] = {
        0xED, 0xB8, 0xC2, 0xEB, 0xC9, 0xB0, 0xB3, 0x4E, /* 0x00-0x03 */
        0xB3, 0x4F, 0xB3, 0x50, 0xB3, 0x51, 0xB3, 0x52, /* 0x04-0x07 */
        0xB3, 0x53, 0xED, 0xB9, 0xB3, 0x54, 0xB3, 0x55, /* 0x08-0x0B */
        0xC6, 0xF6, 0xBF, 0xB3, 0xB3, 0x56, 0xB3, 0x57, /* 0x0C-0x0F */
        0xB3, 0x58, 0xED, 0xBC, 0xC5, 0xF8, 0xB3, 0x59, /* 0x10-0x13 */
        0xD1, 0xD0, 0xB3, 0x5A, 0xD7, 0xA9, 0xED, 0xBA, /* 0x14-0x17 */
        0xED, 0xBB, 0xB3, 0x5B, 0xD1, 0xE2, 0xB3, 0x5C, /* 0x18-0x1B */
        0xED, 0xBF, 0xED, 0xC0, 0xB3, 0x5D, 0xED, 0xC4, /* 0x1C-0x1F */
        0xB3, 0x5E, 0xB3, 0x5F, 0xB3, 0x60, 0xED, 0xC8, /* 0x20-0x23 */
        0xB3, 0x61, 0xED, 0xC6, 0xED, 0xCE, 0xD5, 0xE8, /* 0x24-0x27 */
        0xB3, 0x62, 0xED, 0xC9, 0xB3, 0x63, 0xB3, 0x64, /* 0x28-0x2B */
        0xED, 0xC7, 0xED, 0xBE, 0xB3, 0x65, 0xB3, 0x66, /* 0x2C-0x2F */
        0xC5, 0xE9, 0xB3, 0x67, 0xB3, 0x68, 0xB3, 0x69, /* 0x30-0x33 */
        0xC6, 0xC6, 0xB3, 0x6A, 0xB3, 0x6B, 0xC9, 0xE9, /* 0x34-0x37 */
        0xD4, 0xD2, 0xED, 0xC1, 0xED, 0xC2, 0xED, 0xC3, /* 0x38-0x3B */
        0xED, 0xC5, 0xB3, 0x6C, 0xC0, 0xF9, 0xB3, 0x6D, /* 0x3C-0x3F */
        0xB4, 0xA1, 0xB3, 0x6E, 0xB3, 0x6F, 0xB3, 0x70, /* 0x40-0x43 */
        0xB3, 0x71, 0xB9, 0xE8, 0xB3, 0x72, 0xED, 0xD0, /* 0x44-0x47 */
        0xB3, 0x73, 0xB3, 0x74, 0xB3, 0x75, 0xB3, 0x76, /* 0x48-0x4B */
        0xED, 0xD1, 0xB3, 0x77, 0xED, 0xCA, 0xB3, 0x78, /* 0x4C-0x4F */
        0xED, 0xCF, 0xB3, 0x79, 0xCE, 0xF8, 0xB3, 0x7A, /* 0x50-0x53 */
        0xB3, 0x7B, 0xCB, 0xB6, 0xED, 0xCC, 0xED, 0xCD, /* 0x54-0x57 */
        0xB3, 0x7C, 0xB3, 0x7D, 0xB3, 0x7E, 0xB3, 0x80, /* 0x58-0x5B */
        0xB3, 0x81, 0xCF, 0xF5, 0xB3, 0x82, 0xB3, 0x83, /* 0x5C-0x5F */
        0xB3, 0x84, 0xB3, 0x85, 0xB3, 0x86, 0xB3, 0x87, /* 0x60-0x63 */
        0xB3, 0x88, 0xB3, 0x89, 0xB3, 0x8A, 0xB3, 0x8B, /* 0x64-0x67 */
        0xB3, 0x8C, 0xB3, 0x8D, 0xED, 0xD2, 0xC1, 0xF2, /* 0x68-0x6B */
        0xD3, 0xB2, 0xED, 0xCB, 0xC8, 0xB7, 0xB3, 0x8E, /* 0x6C-0x6F */
        0xB3, 0x8F, 0xB3, 0x90, 0xB3, 0x91, 0xB3, 0x92, /* 0x70-0x73 */
        0xB3, 0x93, 0xB3, 0x94, 0xB3, 0x95, 0xBC, 0xEF, /* 0x74-0x77 */
        0xB3, 0x96, 0xB3, 0x97, 0xB3, 0x98, 0xB3, 0x99, /* 0x78-0x7B */
        0xC5, 0xF0, 0xB3, 0x9A, 0xB3, 0x9B, 0xB3, 0x9C, /* 0x7C-0x7F */
        
        0xB3, 0x9D, 0xB3, 0x9E, 0xB3, 0x9F, 0xB3, 0xA0, /* 0x80-0x83 */
        0xB4, 0x40, 0xB4, 0x41, 0xB4, 0x42, 0xED, 0xD6, /* 0x84-0x87 */
        0xB4, 0x43, 0xB5, 0xEF, 0xB4, 0x44, 0xB4, 0x45, /* 0x88-0x8B */
        0xC2, 0xB5, 0xB0, 0xAD, 0xCB, 0xE9, 0xB4, 0x46, /* 0x8C-0x8F */
        0xB4, 0x47, 0xB1, 0xAE, 0xB4, 0x48, 0xED, 0xD4, /* 0x90-0x93 */
        0xB4, 0x49, 0xB4, 0x4A, 0xB4, 0x4B, 0xCD, 0xEB, /* 0x94-0x97 */
        0xB5, 0xE2, 0xB4, 0x4C, 0xED, 0xD5, 0xED, 0xD3, /* 0x98-0x9B */
        0xED, 0xD7, 0xB4, 0x4D, 0xB4, 0x4E, 0xB5, 0xFA, /* 0x9C-0x9F */
        0xB4, 0x4F, 0xED, 0xD8, 0xB4, 0x50, 0xED, 0xD9, /* 0xA0-0xA3 */
        0xB4, 0x51, 0xED, 0xDC, 0xB4, 0x52, 0xB1, 0xCC, /* 0xA4-0xA7 */
        0xB4, 0x53, 0xB4, 0x54, 0xB4, 0x55, 0xB4, 0x56, /* 0xA8-0xAB */
        0xB4, 0x57, 0xB4, 0x58, 0xB4, 0x59, 0xB4, 0x5A, /* 0xAC-0xAF */
        0xC5, 0xF6, 0xBC, 0xEE, 0xED, 0xDA, 0xCC, 0xBC, /* 0xB0-0xB3 */
        0xB2, 0xEA, 0xB4, 0x5B, 0xB4, 0x5C, 0xB4, 0x5D, /* 0xB4-0xB7 */
        0xB4, 0x5E, 0xED, 0xDB, 0xB4, 0x5F, 0xB4, 0x60, /* 0xB8-0xBB */
        0xB4, 0x61, 0xB4, 0x62, 0xC4, 0xEB, 0xB4, 0x63, /* 0xBC-0xBF */
        0xB4, 0x64, 0xB4, 0xC5, 0xB4, 0x65, 0xB4, 0x66, /* 0xC0-0xC3 */
        0xB4, 0x67, 0xB0, 0xF5, 0xB4, 0x68, 0xB4, 0x69, /* 0xC4-0xC7 */
        0xB4, 0x6A, 0xED, 0xDF, 0xC0, 0xDA, 0xB4, 0xE8, /* 0xC8-0xCB */
        0xB4, 0x6B, 0xB4, 0x6C, 0xB4, 0x6D, 0xB4, 0x6E, /* 0xCC-0xCF */
        0xC5, 0xCD, 0xB4, 0x6F, 0xB4, 0x70, 0xB4, 0x71, /* 0xD0-0xD3 */
        0xED, 0xDD, 0xBF, 0xC4, 0xB4, 0x72, 0xB4, 0x73, /* 0xD4-0xD7 */
        0xB4, 0x74, 0xED, 0xDE, 0xB4, 0x75, 0xB4, 0x76, /* 0xD8-0xDB */
        0xB4, 0x77, 0xB4, 0x78, 0xB4, 0x79, 0xB4, 0x7A, /* 0xDC-0xDF */
        0xB4, 0x7B, 0xB4, 0x7C, 0xB4, 0x7D, 0xB4, 0x7E, /* 0xE0-0xE3 */
        0xB4, 0x80, 0xB4, 0x81, 0xB4, 0x82, 0xB4, 0x83, /* 0xE4-0xE7 */
        0xC4, 0xA5, 0xB4, 0x84, 0xB4, 0x85, 0xB4, 0x86, /* 0xE8-0xEB */
        0xED, 0xE0, 0xB4, 0x87, 0xB4, 0x88, 0xB4, 0x89, /* 0xEC-0xEF */
        0xB4, 0x8A, 0xB4, 0x8B, 0xED, 0xE1, 0xB4, 0x8C, /* 0xF0-0xF3 */
        0xED, 0xE3, 0xB4, 0x8D, 0xB4, 0x8E, 0xC1, 0xD7, /* 0xF4-0xF7 */
        0xB4, 0x8F, 0xB4, 0x90, 0xBB, 0xC7, 0xB4, 0x91, /* 0xF8-0xFB */
        0xB4, 0x92, 0xB4, 0x93, 0xB4, 0x94, 0xB4, 0x95, /* 0xFC-0xFF */
};

static const unsigned char u2c_79[512] = {
        0xB4, 0x96, 0xBD, 0xB8, 0xB4, 0x97, 0xB4, 0x98, /* 0x00-0x03 */
        0xB4, 0x99, 0xED, 0xE2, 0xB4, 0x9A, 0xB4, 0x9B, /* 0x04-0x07 */
        0xB4, 0x9C, 0xB4, 0x9D, 0xB4, 0x9E, 0xB4, 0x9F, /* 0x08-0x0B */
        0xB4, 0xA0, 0xB5, 0x40, 0xB5, 0x41, 0xB5, 0x42, /* 0x0C-0x0F */
        0xB5, 0x43, 0xB5, 0x44, 0xB5, 0x45, 0xED, 0xE4, /* 0x10-0x13 */
        0xB5, 0x46, 0xB5, 0x47, 0xB5, 0x48, 0xB5, 0x49, /* 0x14-0x17 */
        0xB5, 0x4A, 0xB5, 0x4B, 0xB5, 0x4C, 0xB5, 0x4D, /* 0x18-0x1B */
        0xB5, 0x4E, 0xB5, 0x4F, 0xED, 0xE6, 0xB5, 0x50, /* 0x1C-0x1F */
        0xB5, 0x51, 0xB5, 0x52, 0xB5, 0x53, 0xB5, 0x54, /* 0x20-0x23 */
        0xED, 0xE5, 0xB5, 0x55, 0xB5, 0x56, 0xB5, 0x57, /* 0x24-0x27 */
        0xB5, 0x58, 0xB5, 0x59, 0xB5, 0x5A, 0xB5, 0x5B, /* 0x28-0x2B */
        0xB5, 0x5C, 0xB5, 0x5D, 0xB5, 0x5E, 0xB5, 0x5F, /* 0x2C-0x2F */
        0xB5, 0x60, 0xB5, 0x61, 0xB5, 0x62, 0xB5, 0x63, /* 0x30-0x33 */
        0xED, 0xE7, 0xB5, 0x64, 0xB5, 0x65, 0xB5, 0x66, /* 0x34-0x37 */
        0xB5, 0x67, 0xB5, 0x68, 0xCA, 0xBE, 0xEC, 0xEA, /* 0x38-0x3B */
        0xC0, 0xF1, 0xB5, 0x69, 0xC9, 0xE7, 0xB5, 0x6A, /* 0x3C-0x3F */
        0xEC, 0xEB, 0xC6, 0xEE, 0xB5, 0x6B, 0xB5, 0x6C, /* 0x40-0x43 */
        0xB5, 0x6D, 0xB5, 0x6E, 0xEC, 0xEC, 0xB5, 0x6F, /* 0x44-0x47 */
        0xC6, 0xED, 0xEC, 0xED, 0xB5, 0x70, 0xB5, 0x71, /* 0x48-0x4B */
        0xB5, 0x72, 0xB5, 0x73, 0xB5, 0x74, 0xB5, 0x75, /* 0x4C-0x4F */
        0xB5, 0x76, 0xB5, 0x77, 0xB5, 0x78, 0xEC, 0xF0, /* 0x50-0x53 */
        0xB5, 0x79, 0xB5, 0x7A, 0xD7, 0xE6, 0xEC, 0xF3, /* 0x54-0x57 */
        0xB5, 0x7B, 0xB5, 0x7C, 0xEC, 0xF1, 0xEC, 0xEE, /* 0x58-0x5B */
        0xEC, 0xEF, 0xD7, 0xA3, 0xC9, 0xF1, 0xCB, 0xEE, /* 0x5C-0x5F */
        0xEC, 0xF4, 0xB5, 0x7D, 0xEC, 0xF2, 0xB5, 0x7E, /* 0x60-0x63 */
        0xB5, 0x80, 0xCF, 0xE9, 0xB5, 0x81, 0xEC, 0xF6, /* 0x64-0x67 */
        0xC6, 0xB1, 0xB5, 0x82, 0xB5, 0x83, 0xB5, 0x84, /* 0x68-0x6B */
        0xB5, 0x85, 0xBC, 0xC0, 0xB5, 0x86, 0xEC, 0xF5, /* 0x6C-0x6F */
        0xB5, 0x87, 0xB5, 0x88, 0xB5, 0x89, 0xB5, 0x8A, /* 0x70-0x73 */
        0xB5, 0x8B, 0xB5, 0x8C, 0xB5, 0x8D, 0xB5, 0xBB, /* 0x74-0x77 */
        0xBB, 0xF6, 0xB5, 0x8E, 0xEC, 0xF7, 0xB5, 0x8F, /* 0x78-0x7B */
        0xB5, 0x90, 0xB5, 0x91, 0xB5, 0x92, 0xB5, 0x93, /* 0x7C-0x7F */
        
        0xD9, 0xF7, 0xBD, 0xFB, 0xB5, 0x94, 0xB5, 0x95, /* 0x80-0x83 */
        0xC2, 0xBB, 0xEC, 0xF8, 0xB5, 0x96, 0xB5, 0x97, /* 0x84-0x87 */
        0xB5, 0x98, 0xB5, 0x99, 0xEC, 0xF9, 0xB5, 0x9A, /* 0x88-0x8B */
        0xB5, 0x9B, 0xB5, 0x9C, 0xB5, 0x9D, 0xB8, 0xA3, /* 0x8C-0x8F */
        0xB5, 0x9E, 0xB5, 0x9F, 0xB5, 0xA0, 0xB6, 0x40, /* 0x90-0x93 */
        0xB6, 0x41, 0xB6, 0x42, 0xB6, 0x43, 0xB6, 0x44, /* 0x94-0x97 */
        0xB6, 0x45, 0xB6, 0x46, 0xEC, 0xFA, 0xB6, 0x47, /* 0x98-0x9B */
        0xB6, 0x48, 0xB6, 0x49, 0xB6, 0x4A, 0xB6, 0x4B, /* 0x9C-0x9F */
        0xB6, 0x4C, 0xB6, 0x4D, 0xB6, 0x4E, 0xB6, 0x4F, /* 0xA0-0xA3 */
        0xB6, 0x50, 0xB6, 0x51, 0xB6, 0x52, 0xEC, 0xFB, /* 0xA4-0xA7 */
        0xB6, 0x53, 0xB6, 0x54, 0xB6, 0x55, 0xB6, 0x56, /* 0xA8-0xAB */
        0xB6, 0x57, 0xB6, 0x58, 0xB6, 0x59, 0xB6, 0x5A, /* 0xAC-0xAF */
        0xB6, 0x5B, 0xB6, 0x5C, 0xB6, 0x5D, 0xEC, 0xFC, /* 0xB0-0xB3 */
        0xB6, 0x5E, 0xB6, 0x5F, 0xB6, 0x60, 0xB6, 0x61, /* 0xB4-0xB7 */
        0xB6, 0x62, 0xD3, 0xED, 0xD8, 0xAE, 0xC0, 0xEB, /* 0xB8-0xBB */
        0xB6, 0x63, 0xC7, 0xDD, 0xBA, 0xCC, 0xB6, 0x64, /* 0xBC-0xBF */
        0xD0, 0xE3, 0xCB, 0xBD, 0xB6, 0x65, 0xCD, 0xBA, /* 0xC0-0xC3 */
        0xB6, 0x66, 0xB6, 0x67, 0xB8, 0xD1, 0xB6, 0x68, /* 0xC4-0xC7 */
        0xB6, 0x69, 0xB1, 0xFC, 0xB6, 0x6A, 0xC7, 0xEF, /* 0xC8-0xCB */
        0xB6, 0x6B, 0xD6, 0xD6, 0xB6, 0x6C, 0xB6, 0x6D, /* 0xCC-0xCF */
        0xB6, 0x6E, 0xBF, 0xC6, 0xC3, 0xEB, 0xB6, 0x6F, /* 0xD0-0xD3 */
        0xB6, 0x70, 0xEF, 0xF5, 0xB6, 0x71, 0xB6, 0x72, /* 0xD4-0xD7 */
        0xC3, 0xD8, 0xB6, 0x73, 0xB6, 0x74, 0xB6, 0x75, /* 0xD8-0xDB */
        0xB6, 0x76, 0xB6, 0x77, 0xB6, 0x78, 0xD7, 0xE2, /* 0xDC-0xDF */
        0xB6, 0x79, 0xB6, 0x7A, 0xB6, 0x7B, 0xEF, 0xF7, /* 0xE0-0xE3 */
        0xB3, 0xD3, 0xB6, 0x7C, 0xC7, 0xD8, 0xD1, 0xED, /* 0xE4-0xE7 */
        0xB6, 0x7D, 0xD6, 0xC8, 0xB6, 0x7E, 0xEF, 0xF8, /* 0xE8-0xEB */
        0xB6, 0x80, 0xEF, 0xF6, 0xB6, 0x81, 0xBB, 0xFD, /* 0xEC-0xEF */
        0xB3, 0xC6, 0xB6, 0x82, 0xB6, 0x83, 0xB6, 0x84, /* 0xF0-0xF3 */
        0xB6, 0x85, 0xB6, 0x86, 0xB6, 0x87, 0xB6, 0x88, /* 0xF4-0xF7 */
        0xBD, 0xD5, 0xB6, 0x89, 0xB6, 0x8A, 0xD2, 0xC6, /* 0xF8-0xFB */
        0xB6, 0x8B, 0xBB, 0xE0, 0xB6, 0x8C, 0xB6, 0x8D, /* 0xFC-0xFF */
};

static const unsigned char u2c_7A[512] = {
        0xCF, 0xA1, 0xB6, 0x8E, 0xEF, 0xFC, 0xEF, 0xFB, /* 0x00-0x03 */
        0xB6, 0x8F, 0xB6, 0x90, 0xEF, 0xF9, 0xB6, 0x91, /* 0x04-0x07 */
        0xB6, 0x92, 0xB6, 0x93, 0xB6, 0x94, 0xB3, 0xCC, /* 0x08-0x0B */
        0xB6, 0x95, 0xC9, 0xD4, 0xCB, 0xB0, 0xB6, 0x96, /* 0x0C-0x0F */
        0xB6, 0x97, 0xB6, 0x98, 0xB6, 0x99, 0xB6, 0x9A, /* 0x10-0x13 */
        0xEF, 0xFE, 0xB6, 0x9B, 0xB6, 0x9C, 0xB0, 0xDE, /* 0x14-0x17 */
        0xB6, 0x9D, 0xB6, 0x9E, 0xD6, 0xC9, 0xB6, 0x9F, /* 0x18-0x1B */
        0xB6, 0xA0, 0xB7, 0x40, 0xEF, 0xFD, 0xB7, 0x41, /* 0x1C-0x1F */
        0xB3, 0xED, 0xB7, 0x42, 0xB7, 0x43, 0xF6, 0xD5, /* 0x20-0x23 */
        0xB7, 0x44, 0xB7, 0x45, 0xB7, 0x46, 0xB7, 0x47, /* 0x24-0x27 */
        0xB7, 0x48, 0xB7, 0x49, 0xB7, 0x4A, 0xB7, 0x4B, /* 0x28-0x2B */
        0xB7, 0x4C, 0xB7, 0x4D, 0xB7, 0x4E, 0xB7, 0x4F, /* 0x2C-0x2F */
        0xB7, 0x50, 0xB7, 0x51, 0xB7, 0x52, 0xCE, 0xC8, /* 0x30-0x33 */
        0xB7, 0x53, 0xB7, 0x54, 0xB7, 0x55, 0xF0, 0xA2, /* 0x34-0x37 */
        0xB7, 0x56, 0xF0, 0xA1, 0xB7, 0x57, 0xB5, 0xBE, /* 0x38-0x3B */
        0xBC, 0xDA, 0xBB, 0xFC, 0xB7, 0x58, 0xB8, 0xE5, /* 0x3C-0x3F */
        0xB7, 0x59, 0xB7, 0x5A, 0xB7, 0x5B, 0xB7, 0x5C, /* 0x40-0x43 */
        0xB7, 0x5D, 0xB7, 0x5E, 0xC4, 0xC2, 0xB7, 0x5F, /* 0x44-0x47 */
        0xB7, 0x60, 0xB7, 0x61, 0xB7, 0x62, 0xB7, 0x63, /* 0x48-0x4B */
        0xB7, 0x64, 0xB7, 0x65, 0xB7, 0x66, 0xB7, 0x67, /* 0x4C-0x4F */
        0xB7, 0x68, 0xF0, 0xA3, 0xB7, 0x69, 0xB7, 0x6A, /* 0x50-0x53 */
        0xB7, 0x6B, 0xB7, 0x6C, 0xB7, 0x6D, 0xCB, 0xEB, /* 0x54-0x57 */
        0xB7, 0x6E, 0xB7, 0x6F, 0xB7, 0x70, 0xB7, 0x71, /* 0x58-0x5B */
        0xB7, 0x72, 0xB7, 0x73, 0xB7, 0x74, 0xB7, 0x75, /* 0x5C-0x5F */
        0xB7, 0x76, 0xB7, 0x77, 0xB7, 0x78, 0xB7, 0x79, /* 0x60-0x63 */
        0xB7, 0x7A, 0xB7, 0x7B, 0xB7, 0x7C, 0xB7, 0x7D, /* 0x64-0x67 */
        0xB7, 0x7E, 0xB7, 0x80, 0xB7, 0x81, 0xB7, 0x82, /* 0x68-0x6B */
        0xB7, 0x83, 0xB7, 0x84, 0xB7, 0x85, 0xB7, 0x86, /* 0x6C-0x6F */
        0xF0, 0xA6, 0xB7, 0x87, 0xB7, 0x88, 0xB7, 0x89, /* 0x70-0x73 */
        0xD1, 0xA8, 0xB7, 0x8A, 0xBE, 0xBF, 0xC7, 0xEE, /* 0x74-0x77 */
        0xF1, 0xB6, 0xF1, 0xB7, 0xBF, 0xD5, 0xB7, 0x8B, /* 0x78-0x7B */
        0xB7, 0x8C, 0xB7, 0x8D, 0xB7, 0x8E, 0xB4, 0xA9, /* 0x7C-0x7F */
        
        0xF1, 0xB8, 0xCD, 0xBB, 0xB7, 0x8F, 0xC7, 0xD4, /* 0x80-0x83 */
        0xD5, 0xAD, 0xB7, 0x90, 0xF1, 0xB9, 0xB7, 0x91, /* 0x84-0x87 */
        0xF1, 0xBA, 0xB7, 0x92, 0xB7, 0x93, 0xB7, 0x94, /* 0x88-0x8B */
        0xB7, 0x95, 0xC7, 0xCF, 0xB7, 0x96, 0xB7, 0x97, /* 0x8C-0x8F */
        0xB7, 0x98, 0xD2, 0xA4, 0xD6, 0xCF, 0xB7, 0x99, /* 0x90-0x93 */
        0xB7, 0x9A, 0xF1, 0xBB, 0xBD, 0xD1, 0xB4, 0xB0, /* 0x94-0x97 */
        0xBE, 0xBD, 0xB7, 0x9B, 0xB7, 0x9C, 0xB7, 0x9D, /* 0x98-0x9B */
        0xB4, 0xDC, 0xCE, 0xD1, 0xB7, 0x9E, 0xBF, 0xDF, /* 0x9C-0x9F */
        0xF1, 0xBD, 0xB7, 0x9F, 0xB7, 0xA0, 0xB8, 0x40, /* 0xA0-0xA3 */
        0xB8, 0x41, 0xBF, 0xFA, 0xF1, 0xBC, 0xB8, 0x42, /* 0xA4-0xA7 */
        0xF1, 0xBF, 0xB8, 0x43, 0xB8, 0x44, 0xB8, 0x45, /* 0xA8-0xAB */
        0xF1, 0xBE, 0xF1, 0xC0, 0xB8, 0x46, 0xB8, 0x47, /* 0xAC-0xAF */
        0xB8, 0x48, 0xB8, 0x49, 0xB8, 0x4A, 0xF1, 0xC1, /* 0xB0-0xB3 */
        0xB8, 0x4B, 0xB8, 0x4C, 0xB8, 0x4D, 0xB8, 0x4E, /* 0xB4-0xB7 */
        0xB8, 0x4F, 0xB8, 0x50, 0xB8, 0x51, 0xB8, 0x52, /* 0xB8-0xBB */
        0xB8, 0x53, 0xB8, 0x54, 0xB8, 0x55, 0xC1, 0xFE, /* 0xBC-0xBF */
        0xB8, 0x56, 0xB8, 0x57, 0xB8, 0x58, 0xB8, 0x59, /* 0xC0-0xC3 */
        0xB8, 0x5A, 0xB8, 0x5B, 0xB8, 0x5C, 0xB8, 0x5D, /* 0xC4-0xC7 */
        0xB8, 0x5E, 0xB8, 0x5F, 0xB8, 0x60, 0xC1, 0xA2, /* 0xC8-0xCB */
        0xB8, 0x61, 0xB8, 0x62, 0xB8, 0x63, 0xB8, 0x64, /* 0xCC-0xCF */
        0xB8, 0x65, 0xB8, 0x66, 0xB8, 0x67, 0xB8, 0x68, /* 0xD0-0xD3 */
        0xB8, 0x69, 0xB8, 0x6A, 0xCA, 0xFA, 0xB8, 0x6B, /* 0xD4-0xD7 */
        0xB8, 0x6C, 0xD5, 0xBE, 0xB8, 0x6D, 0xB8, 0x6E, /* 0xD8-0xDB */
        0xB8, 0x6F, 0xB8, 0x70, 0xBE, 0xBA, 0xBE, 0xB9, /* 0xDC-0xDF */
        0xD5, 0xC2, 0xB8, 0x71, 0xB8, 0x72, 0xBF, 0xA2, /* 0xE0-0xE3 */
        0xB8, 0x73, 0xCD, 0xAF, 0xF1, 0xB5, 0xB8, 0x74, /* 0xE4-0xE7 */
        0xB8, 0x75, 0xB8, 0x76, 0xB8, 0x77, 0xB8, 0x78, /* 0xE8-0xEB */
        0xB8, 0x79, 0xBD, 0xDF, 0xB8, 0x7A, 0xB6, 0xCB, /* 0xEC-0xEF */
        0xB8, 0x7B, 0xB8, 0x7C, 0xB8, 0x7D, 0xB8, 0x7E, /* 0xF0-0xF3 */
        0xB8, 0x80, 0xB8, 0x81, 0xB8, 0x82, 0xB8, 0x83, /* 0xF4-0xF7 */
        0xB8, 0x84, 0xD6, 0xF1, 0xF3, 0xC3, 0xB8, 0x85, /* 0xF8-0xFB */
        0xB8, 0x86, 0xF3, 0xC4, 0xB8, 0x87, 0xB8, 0xCD, /* 0xFC-0xFF */
};

static const unsigned char u2c_7B[512] = {
        0xB8, 0x88, 0xB8, 0x89, 0xB8, 0x8A, 0xF3, 0xC6, /* 0x00-0x03 */
        0xF3, 0xC7, 0xB8, 0x8B, 0xB0, 0xCA, 0xB8, 0x8C, /* 0x04-0x07 */
        0xF3, 0xC5, 0xB8, 0x8D, 0xF3, 0xC9, 0xCB, 0xF1, /* 0x08-0x0B */
        0xB8, 0x8E, 0xB8, 0x8F, 0xB8, 0x90, 0xF3, 0xCB, /* 0x0C-0x0F */
        0xB8, 0x91, 0xD0, 0xA6, 0xB8, 0x92, 0xB8, 0x93, /* 0x10-0x13 */
        0xB1, 0xCA, 0xF3, 0xC8, 0xB8, 0x94, 0xB8, 0x95, /* 0x14-0x17 */
        0xB8, 0x96, 0xF3, 0xCF, 0xB8, 0x97, 0xB5, 0xD1, /* 0x18-0x1B */
        0xB8, 0x98, 0xB8, 0x99, 0xF3, 0xD7, 0xB8, 0x9A, /* 0x1C-0x1F */
        0xF3, 0xD2, 0xB8, 0x9B, 0xB8, 0x9C, 0xB8, 0x9D, /* 0x20-0x23 */
        0xF3, 0xD4, 0xF3, 0xD3, 0xB7, 0xFB, 0xB8, 0x9E, /* 0x24-0x27 */
        0xB1, 0xBF, 0xB8, 0x9F, 0xF3, 0xCE, 0xF3, 0xCA, /* 0x28-0x2B */
        0xB5, 0xDA, 0xB8, 0xA0, 0xF3, 0xD0, 0xB9, 0x40, /* 0x2C-0x2F */
        0xB9, 0x41, 0xF3, 0xD1, 0xB9, 0x42, 0xF3, 0xD5, /* 0x30-0x33 */
        0xB9, 0x43, 0xB9, 0x44, 0xB9, 0x45, 0xB9, 0x46, /* 0x34-0x37 */
        0xF3, 0xCD, 0xB9, 0x47, 0xBC, 0xE3, 0xB9, 0x48, /* 0x38-0x3B */
        0xC1, 0xFD, 0xB9, 0x49, 0xF3, 0xD6, 0xB9, 0x4A, /* 0x3C-0x3F */
        0xB9, 0x4B, 0xB9, 0x4C, 0xB9, 0x4D, 0xB9, 0x4E, /* 0x40-0x43 */
        0xB9, 0x4F, 0xF3, 0xDA, 0xB9, 0x50, 0xF3, 0xCC, /* 0x44-0x47 */
        0xB9, 0x51, 0xB5, 0xC8, 0xB9, 0x52, 0xBD, 0xEE, /* 0x48-0x4B */
        0xF3, 0xDC, 0xB9, 0x53, 0xB9, 0x54, 0xB7, 0xA4, /* 0x4C-0x4F */
        0xBF, 0xF0, 0xD6, 0xFE, 0xCD, 0xB2, 0xB9, 0x55, /* 0x50-0x53 */
        0xB4, 0xF0, 0xB9, 0x56, 0xB2, 0xDF, 0xB9, 0x57, /* 0x54-0x57 */
        0xF3, 0xD8, 0xB9, 0x58, 0xF3, 0xD9, 0xC9, 0xB8, /* 0x58-0x5B */
        0xB9, 0x59, 0xF3, 0xDD, 0xB9, 0x5A, 0xB9, 0x5B, /* 0x5C-0x5F */
        0xF3, 0xDE, 0xB9, 0x5C, 0xF3, 0xE1, 0xB9, 0x5D, /* 0x60-0x63 */
        0xB9, 0x5E, 0xB9, 0x5F, 0xB9, 0x60, 0xB9, 0x61, /* 0x64-0x67 */
        0xB9, 0x62, 0xB9, 0x63, 0xB9, 0x64, 0xB9, 0x65, /* 0x68-0x6B */
        0xB9, 0x66, 0xB9, 0x67, 0xF3, 0xDF, 0xB9, 0x68, /* 0x6C-0x6F */
        0xB9, 0x69, 0xF3, 0xE3, 0xF3, 0xE2, 0xB9, 0x6A, /* 0x70-0x73 */
        0xB9, 0x6B, 0xF3, 0xDB, 0xB9, 0x6C, 0xBF, 0xEA, /* 0x74-0x77 */
        0xB9, 0x6D, 0xB3, 0xEF, 0xB9, 0x6E, 0xF3, 0xE0, /* 0x78-0x7B */
        0xB9, 0x6F, 0xB9, 0x70, 0xC7, 0xA9, 0xB9, 0x71, /* 0x7C-0x7F */
        
        0xBC, 0xF2, 0xB9, 0x72, 0xB9, 0x73, 0xB9, 0x74, /* 0x80-0x83 */
        0xB9, 0x75, 0xF3, 0xEB, 0xB9, 0x76, 0xB9, 0x77, /* 0x84-0x87 */
        0xB9, 0x78, 0xB9, 0x79, 0xB9, 0x7A, 0xB9, 0x7B, /* 0x88-0x8B */
        0xB9, 0x7C, 0xB9, 0xBF, 0xB9, 0x7D, 0xB9, 0x7E, /* 0x8C-0x8F */
        0xF3, 0xE4, 0xB9, 0x80, 0xB9, 0x81, 0xB9, 0x82, /* 0x90-0x93 */
        0xB2, 0xAD, 0xBB, 0xFE, 0xB9, 0x83, 0xCB, 0xE3, /* 0x94-0x97 */
        0xB9, 0x84, 0xB9, 0x85, 0xB9, 0x86, 0xB9, 0x87, /* 0x98-0x9B */
        0xF3, 0xED, 0xF3, 0xE9, 0xB9, 0x88, 0xB9, 0x89, /* 0x9C-0x9F */
        0xB9, 0x8A, 0xB9, 0xDC, 0xF3, 0xEE, 0xB9, 0x8B, /* 0xA0-0xA3 */
        0xB9, 0x8C, 0xB9, 0x8D, 0xF3, 0xE5, 0xF3, 0xE6, /* 0xA4-0xA7 */
        0xF3, 0xEA, 0xC2, 0xE1, 0xF3, 0xEC, 0xF3, 0xEF, /* 0xA8-0xAB */
        0xF3, 0xE8, 0xBC, 0xFD, 0xB9, 0x8E, 0xB9, 0x8F, /* 0xAC-0xAF */
        0xB9, 0x90, 0xCF, 0xE4, 0xB9, 0x91, 0xB9, 0x92, /* 0xB0-0xB3 */
        0xF3, 0xF0, 0xB9, 0x93, 0xB9, 0x94, 0xB9, 0x95, /* 0xB4-0xB7 */
        0xF3, 0xE7, 0xB9, 0x96, 0xB9, 0x97, 0xB9, 0x98, /* 0xB8-0xBB */
        0xB9, 0x99, 0xB9, 0x9A, 0xB9, 0x9B, 0xB9, 0x9C, /* 0xBC-0xBF */
        0xB9, 0x9D, 0xF3, 0xF2, 0xB9, 0x9E, 0xB9, 0x9F, /* 0xC0-0xC3 */
        0xB9, 0xA0, 0xBA, 0x40, 0xD7, 0xAD, 0xC6, 0xAA, /* 0xC4-0xC7 */
        0xBA, 0x41, 0xBA, 0x42, 0xBA, 0x43, 0xBA, 0x44, /* 0xC8-0xCB */
        0xF3, 0xF3, 0xBA, 0x45, 0xBA, 0x46, 0xBA, 0x47, /* 0xCC-0xCF */
        0xBA, 0x48, 0xF3, 0xF1, 0xBA, 0x49, 0xC2, 0xA8, /* 0xD0-0xD3 */
        0xBA, 0x4A, 0xBA, 0x4B, 0xBA, 0x4C, 0xBA, 0x4D, /* 0xD4-0xD7 */
        0xBA, 0x4E, 0xB8, 0xDD, 0xF3, 0xF5, 0xBA, 0x4F, /* 0xD8-0xDB */
        0xBA, 0x50, 0xF3, 0xF4, 0xBA, 0x51, 0xBA, 0x52, /* 0xDC-0xDF */
        0xBA, 0x53, 0xB4, 0xDB, 0xBA, 0x54, 0xBA, 0x55, /* 0xE0-0xE3 */
        0xBA, 0x56, 0xF3, 0xF6, 0xF3, 0xF7, 0xBA, 0x57, /* 0xE4-0xE7 */
        0xBA, 0x58, 0xBA, 0x59, 0xF3, 0xF8, 0xBA, 0x5A, /* 0xE8-0xEB */
        0xBA, 0x5B, 0xBA, 0x5C, 0xC0, 0xBA, 0xBA, 0x5D, /* 0xEC-0xEF */
        0xBA, 0x5E, 0xC0, 0xE9, 0xBA, 0x5F, 0xBA, 0x60, /* 0xF0-0xF3 */
        0xBA, 0x61, 0xBA, 0x62, 0xBA, 0x63, 0xC5, 0xF1, /* 0xF4-0xF7 */
        0xBA, 0x64, 0xBA, 0x65, 0xBA, 0x66, 0xBA, 0x67, /* 0xF8-0xFB */
        0xF3, 0xFB, 0xBA, 0x68, 0xF3, 0xFA, 0xBA, 0x69, /* 0xFC-0xFF */
};

static const unsigned char u2c_7C[512] = {
        0xBA, 0x6A, 0xBA, 0x6B, 0xBA, 0x6C, 0xBA, 0x6D, /* 0x00-0x03 */
        0xBA, 0x6E, 0xBA, 0x6F, 0xBA, 0x70, 0xB4, 0xD8, /* 0x04-0x07 */
        0xBA, 0x71, 0xBA, 0x72, 0xBA, 0x73, 0xF3, 0xFE, /* 0x08-0x0B */
        0xF3, 0xF9, 0xBA, 0x74, 0xBA, 0x75, 0xF3, 0xFC, /* 0x0C-0x0F */
        0xBA, 0x76, 0xBA, 0x77, 0xBA, 0x78, 0xBA, 0x79, /* 0x10-0x13 */
        0xBA, 0x7A, 0xBA, 0x7B, 0xF3, 0xFD, 0xBA, 0x7C, /* 0x14-0x17 */
        0xBA, 0x7D, 0xBA, 0x7E, 0xBA, 0x80, 0xBA, 0x81, /* 0x18-0x1B */
        0xBA, 0x82, 0xBA, 0x83, 0xBA, 0x84, 0xF4, 0xA1, /* 0x1C-0x1F */
        0xBA, 0x85, 0xBA, 0x86, 0xBA, 0x87, 0xBA, 0x88, /* 0x20-0x23 */
        0xBA, 0x89, 0xBA, 0x8A, 0xF4, 0xA3, 0xBB, 0xC9, /* 0x24-0x27 */
        0xBA, 0x8B, 0xBA, 0x8C, 0xF4, 0xA2, 0xBA, 0x8D, /* 0x28-0x2B */
        0xBA, 0x8E, 0xBA, 0x8F, 0xBA, 0x90, 0xBA, 0x91, /* 0x2C-0x2F */
        0xBA, 0x92, 0xBA, 0x93, 0xBA, 0x94, 0xBA, 0x95, /* 0x30-0x33 */
        0xBA, 0x96, 0xBA, 0x97, 0xBA, 0x98, 0xBA, 0x99, /* 0x34-0x37 */
        0xF4, 0xA4, 0xBA, 0x9A, 0xBA, 0x9B, 0xBA, 0x9C, /* 0x38-0x3B */
        0xBA, 0x9D, 0xBA, 0x9E, 0xBA, 0x9F, 0xB2, 0xBE, /* 0x3C-0x3F */
        0xF4, 0xA6, 0xF4, 0xA5, 0xBA, 0xA0, 0xBB, 0x40, /* 0x40-0x43 */
        0xBB, 0x41, 0xBB, 0x42, 0xBB, 0x43, 0xBB, 0x44, /* 0x44-0x47 */
        0xBB, 0x45, 0xBB, 0x46, 0xBB, 0x47, 0xBB, 0x48, /* 0x48-0x4B */
        0xBB, 0x49, 0xBC, 0xAE, 0xBB, 0x4A, 0xBB, 0x4B, /* 0x4C-0x4F */
        0xBB, 0x4C, 0xBB, 0x4D, 0xBB, 0x4E, 0xBB, 0x4F, /* 0x50-0x53 */
        0xBB, 0x50, 0xBB, 0x51, 0xBB, 0x52, 0xBB, 0x53, /* 0x54-0x57 */
        0xBB, 0x54, 0xBB, 0x55, 0xBB, 0x56, 0xBB, 0x57, /* 0x58-0x5B */
        0xBB, 0x58, 0xBB, 0x59, 0xBB, 0x5A, 0xBB, 0x5B, /* 0x5C-0x5F */
        0xBB, 0x5C, 0xBB, 0x5D, 0xBB, 0x5E, 0xBB, 0x5F, /* 0x60-0x63 */
        0xBB, 0x60, 0xBB, 0x61, 0xBB, 0x62, 0xBB, 0x63, /* 0x64-0x67 */
        0xBB, 0x64, 0xBB, 0x65, 0xBB, 0x66, 0xBB, 0x67, /* 0x68-0x6B */
        0xBB, 0x68, 0xBB, 0x69, 0xBB, 0x6A, 0xBB, 0x6B, /* 0x6C-0x6F */
        0xBB, 0x6C, 0xBB, 0x6D, 0xBB, 0x6E, 0xC3, 0xD7, /* 0x70-0x73 */
        0xD9, 0xE1, 0xBB, 0x6F, 0xBB, 0x70, 0xBB, 0x71, /* 0x74-0x77 */
        0xBB, 0x72, 0xBB, 0x73, 0xBB, 0x74, 0xC0, 0xE0, /* 0x78-0x7B */
        0xF4, 0xCC, 0xD7, 0xD1, 0xBB, 0x75, 0xBB, 0x76, /* 0x7C-0x7F */
        
        0xBB, 0x77, 0xBB, 0x78, 0xBB, 0x79, 0xBB, 0x7A, /* 0x80-0x83 */
        0xBB, 0x7B, 0xBB, 0x7C, 0xBB, 0x7D, 0xBB, 0x7E, /* 0x84-0x87 */
        0xBB, 0x80, 0xB7, 0xDB, 0xBB, 0x81, 0xBB, 0x82, /* 0x88-0x8B */
        0xBB, 0x83, 0xBB, 0x84, 0xBB, 0x85, 0xBB, 0x86, /* 0x8C-0x8F */
        0xBB, 0x87, 0xF4, 0xCE, 0xC1, 0xA3, 0xBB, 0x88, /* 0x90-0x93 */
        0xBB, 0x89, 0xC6, 0xC9, 0xBB, 0x8A, 0xB4, 0xD6, /* 0x94-0x97 */
        0xD5, 0xB3, 0xBB, 0x8B, 0xBB, 0x8C, 0xBB, 0x8D, /* 0x98-0x9B */
        0xF4, 0xD0, 0xF4, 0xCF, 0xF4, 0xD1, 0xCB, 0xDA, /* 0x9C-0x9F */
        0xBB, 0x8E, 0xBB, 0x8F, 0xF4, 0xD2, 0xBB, 0x90, /* 0xA0-0xA3 */
        0xD4, 0xC1, 0xD6, 0xE0, 0xBB, 0x91, 0xBB, 0x92, /* 0xA4-0xA7 */
        0xBB, 0x93, 0xBB, 0x94, 0xB7, 0xE0, 0xBB, 0x95, /* 0xA8-0xAB */
        0xBB, 0x96, 0xBB, 0x97, 0xC1, 0xB8, 0xBB, 0x98, /* 0xAC-0xAF */
        0xBB, 0x99, 0xC1, 0xBB, 0xF4, 0xD3, 0xBE, 0xAC, /* 0xB0-0xB3 */
        0xBB, 0x9A, 0xBB, 0x9B, 0xBB, 0x9C, 0xBB, 0x9D, /* 0xB4-0xB7 */
        0xBB, 0x9E, 0xB4, 0xE2, 0xBB, 0x9F, 0xBB, 0xA0, /* 0xB8-0xBB */
        0xF4, 0xD4, 0xF4, 0xD5, 0xBE, 0xAB, 0xBC, 0x40, /* 0xBC-0xBF */
        0xBC, 0x41, 0xF4, 0xD6, 0xBC, 0x42, 0xBC, 0x43, /* 0xC0-0xC3 */
        0xBC, 0x44, 0xF4, 0xDB, 0xBC, 0x45, 0xF4, 0xD7, /* 0xC4-0xC7 */
        0xF4, 0xDA, 0xBC, 0x46, 0xBA, 0xFD, 0xBC, 0x47, /* 0xC8-0xCB */
        0xF4, 0xD8, 0xF4, 0xD9, 0xBC, 0x48, 0xBC, 0x49, /* 0xCC-0xCF */
        0xBC, 0x4A, 0xBC, 0x4B, 0xBC, 0x4C, 0xBC, 0x4D, /* 0xD0-0xD3 */
        0xBC, 0x4E, 0xB8, 0xE2, 0xCC, 0xC7, 0xF4, 0xDC, /* 0xD4-0xD7 */
        0xBC, 0x4F, 0xB2, 0xDA, 0xBC, 0x50, 0xBC, 0x51, /* 0xD8-0xDB */
        0xC3, 0xD3, 0xBC, 0x52, 0xBC, 0x53, 0xD4, 0xE3, /* 0xDC-0xDF */
        0xBF, 0xB7, 0xBC, 0x54, 0xBC, 0x55, 0xBC, 0x56, /* 0xE0-0xE3 */
        0xBC, 0x57, 0xBC, 0x58, 0xBC, 0x59, 0xBC, 0x5A, /* 0xE4-0xE7 */
        0xF4, 0xDD, 0xBC, 0x5B, 0xBC, 0x5C, 0xBC, 0x5D, /* 0xE8-0xEB */
        0xBC, 0x5E, 0xBC, 0x5F, 0xBC, 0x60, 0xC5, 0xB4, /* 0xEC-0xEF */
        0xBC, 0x61, 0xBC, 0x62, 0xBC, 0x63, 0xBC, 0x64, /* 0xF0-0xF3 */
        0xBC, 0x65, 0xBC, 0x66, 0xBC, 0x67, 0xBC, 0x68, /* 0xF4-0xF7 */
        0xF4, 0xE9, 0xBC, 0x69, 0xBC, 0x6A, 0xCF, 0xB5, /* 0xF8-0xFB */
        0xBC, 0x6B, 0xBC, 0x6C, 0xBC, 0x6D, 0xBC, 0x6E, /* 0xFC-0xFF */
};

static const unsigned char u2c_7D[512] = {
        0xBC, 0x6F, 0xBC, 0x70, 0xBC, 0x71, 0xBC, 0x72, /* 0x00-0x03 */
        0xBC, 0x73, 0xBC, 0x74, 0xBC, 0x75, 0xBC, 0x76, /* 0x04-0x07 */
        0xBC, 0x77, 0xBC, 0x78, 0xCE, 0xC9, 0xBC, 0x79, /* 0x08-0x0B */
        0xBC, 0x7A, 0xBC, 0x7B, 0xBC, 0x7C, 0xBC, 0x7D, /* 0x0C-0x0F */
        0xBC, 0x7E, 0xBC, 0x80, 0xBC, 0x81, 0xBC, 0x82, /* 0x10-0x13 */
        0xBC, 0x83, 0xBC, 0x84, 0xBC, 0x85, 0xBC, 0x86, /* 0x14-0x17 */
        0xBC, 0x87, 0xBC, 0x88, 0xBC, 0x89, 0xBC, 0x8A, /* 0x18-0x1B */
        0xBC, 0x8B, 0xBC, 0x8C, 0xBC, 0x8D, 0xBC, 0x8E, /* 0x1C-0x1F */
        0xCB, 0xD8, 0xBC, 0x8F, 0xCB, 0xF7, 0xBC, 0x90, /* 0x20-0x23 */
        0xBC, 0x91, 0xBC, 0x92, 0xBC, 0x93, 0xBD, 0xF4, /* 0x24-0x27 */
        0xBC, 0x94, 0xBC, 0x95, 0xBC, 0x96, 0xD7, 0xCF, /* 0x28-0x2B */
        0xBC, 0x97, 0xBC, 0x98, 0xBC, 0x99, 0xC0, 0xDB, /* 0x2C-0x2F */
        0xBC, 0x9A, 0xBC, 0x9B, 0xBC, 0x9C, 0xBC, 0x9D, /* 0x30-0x33 */
        0xBC, 0x9E, 0xBC, 0x9F, 0xBC, 0xA0, 0xBD, 0x40, /* 0x34-0x37 */
        0xBD, 0x41, 0xBD, 0x42, 0xBD, 0x43, 0xBD, 0x44, /* 0x38-0x3B */
        0xBD, 0x45, 0xBD, 0x46, 0xBD, 0x47, 0xBD, 0x48, /* 0x3C-0x3F */
        0xBD, 0x49, 0xBD, 0x4A, 0xBD, 0x4B, 0xBD, 0x4C, /* 0x40-0x43 */
        0xBD, 0x4D, 0xBD, 0x4E, 0xBD, 0x4F, 0xBD, 0x50, /* 0x44-0x47 */
        0xBD, 0x51, 0xBD, 0x52, 0xBD, 0x53, 0xBD, 0x54, /* 0x48-0x4B */
        0xBD, 0x55, 0xBD, 0x56, 0xBD, 0x57, 0xBD, 0x58, /* 0x4C-0x4F */
        0xBD, 0x59, 0xBD, 0x5A, 0xBD, 0x5B, 0xBD, 0x5C, /* 0x50-0x53 */
        0xBD, 0x5D, 0xBD, 0x5E, 0xBD, 0x5F, 0xBD, 0x60, /* 0x54-0x57 */
        0xBD, 0x61, 0xBD, 0x62, 0xBD, 0x63, 0xBD, 0x64, /* 0x58-0x5B */
        0xBD, 0x65, 0xBD, 0x66, 0xBD, 0x67, 0xBD, 0x68, /* 0x5C-0x5F */
        0xBD, 0x69, 0xBD, 0x6A, 0xBD, 0x6B, 0xBD, 0x6C, /* 0x60-0x63 */
        0xBD, 0x6D, 0xBD, 0x6E, 0xBD, 0x6F, 0xBD, 0x70, /* 0x64-0x67 */
        0xBD, 0x71, 0xBD, 0x72, 0xBD, 0x73, 0xBD, 0x74, /* 0x68-0x6B */
        0xBD, 0x75, 0xBD, 0x76, 0xD0, 0xF5, 0xBD, 0x77, /* 0x6C-0x6F */
        0xBD, 0x78, 0xBD, 0x79, 0xBD, 0x7A, 0xBD, 0x7B, /* 0x70-0x73 */
        0xBD, 0x7C, 0xBD, 0x7D, 0xBD, 0x7E, 0xF4, 0xEA, /* 0x74-0x77 */
        0xBD, 0x80, 0xBD, 0x81, 0xBD, 0x82, 0xBD, 0x83, /* 0x78-0x7B */
        0xBD, 0x84, 0xBD, 0x85, 0xBD, 0x86, 0xBD, 0x87, /* 0x7C-0x7F */
        
        0xBD, 0x88, 0xBD, 0x89, 0xBD, 0x8A, 0xBD, 0x8B, /* 0x80-0x83 */
        0xBD, 0x8C, 0xBD, 0x8D, 0xBD, 0x8E, 0xBD, 0x8F, /* 0x84-0x87 */
        0xBD, 0x90, 0xBD, 0x91, 0xBD, 0x92, 0xBD, 0x93, /* 0x88-0x8B */
        0xBD, 0x94, 0xBD, 0x95, 0xBD, 0x96, 0xBD, 0x97, /* 0x8C-0x8F */
        0xBD, 0x98, 0xBD, 0x99, 0xBD, 0x9A, 0xBD, 0x9B, /* 0x90-0x93 */
        0xBD, 0x9C, 0xBD, 0x9D, 0xBD, 0x9E, 0xBD, 0x9F, /* 0x94-0x97 */
        0xBD, 0xA0, 0xBE, 0x40, 0xBE, 0x41, 0xBE, 0x42, /* 0x98-0x9B */
        0xBE, 0x43, 0xBE, 0x44, 0xBE, 0x45, 0xBE, 0x46, /* 0x9C-0x9F */
        0xBE, 0x47, 0xBE, 0x48, 0xBE, 0x49, 0xBE, 0x4A, /* 0xA0-0xA3 */
        0xBE, 0x4B, 0xBE, 0x4C, 0xF4, 0xEB, 0xBE, 0x4D, /* 0xA4-0xA7 */
        0xBE, 0x4E, 0xBE, 0x4F, 0xBE, 0x50, 0xBE, 0x51, /* 0xA8-0xAB */
        0xBE, 0x52, 0xBE, 0x53, 0xF4, 0xEC, 0xBE, 0x54, /* 0xAC-0xAF */
        0xBE, 0x55, 0xBE, 0x56, 0xBE, 0x57, 0xBE, 0x58, /* 0xB0-0xB3 */
        0xBE, 0x59, 0xBE, 0x5A, 0xBE, 0x5B, 0xBE, 0x5C, /* 0xB4-0xB7 */
        0xBE, 0x5D, 0xBE, 0x5E, 0xBE, 0x5F, 0xBE, 0x60, /* 0xB8-0xBB */
        0xBE, 0x61, 0xBE, 0x62, 0xBE, 0x63, 0xBE, 0x64, /* 0xBC-0xBF */
        0xBE, 0x65, 0xBE, 0x66, 0xBE, 0x67, 0xBE, 0x68, /* 0xC0-0xC3 */
        0xBE, 0x69, 0xBE, 0x6A, 0xBE, 0x6B, 0xBE, 0x6C, /* 0xC4-0xC7 */
        0xBE, 0x6D, 0xBE, 0x6E, 0xBE, 0x6F, 0xBE, 0x70, /* 0xC8-0xCB */
        0xBE, 0x71, 0xBE, 0x72, 0xBE, 0x73, 0xBE, 0x74, /* 0xCC-0xCF */
        0xBE, 0x75, 0xBE, 0x76, 0xBE, 0x77, 0xBE, 0x78, /* 0xD0-0xD3 */
        0xBE, 0x79, 0xBE, 0x7A, 0xBE, 0x7B, 0xBE, 0x7C, /* 0xD4-0xD7 */
        0xBE, 0x7D, 0xBE, 0x7E, 0xBE, 0x80, 0xBE, 0x81, /* 0xD8-0xDB */
        0xBE, 0x82, 0xBE, 0x83, 0xBE, 0x84, 0xBE, 0x85, /* 0xDC-0xDF */
        0xBE, 0x86, 0xBE, 0x87, 0xBE, 0x88, 0xBE, 0x89, /* 0xE0-0xE3 */
        0xBE, 0x8A, 0xBE, 0x8B, 0xBE, 0x8C, 0xBE, 0x8D, /* 0xE4-0xE7 */
        0xBE, 0x8E, 0xBE, 0x8F, 0xBE, 0x90, 0xBE, 0x91, /* 0xE8-0xEB */
        0xBE, 0x92, 0xBE, 0x93, 0xBE, 0x94, 0xBE, 0x95, /* 0xEC-0xEF */
        0xBE, 0x96, 0xBE, 0x97, 0xBE, 0x98, 0xBE, 0x99, /* 0xF0-0xF3 */
        0xBE, 0x9A, 0xBE, 0x9B, 0xBE, 0x9C, 0xBE, 0x9D, /* 0xF4-0xF7 */
        0xBE, 0x9E, 0xBE, 0x9F, 0xBE, 0xA0, 0xBF, 0x40, /* 0xF8-0xFB */
        0xBF, 0x41, 0xBF, 0x42, 0xBF, 0x43, 0xBF, 0x44, /* 0xFC-0xFF */
};

static const unsigned char u2c_7E[512] = {
        0xBF, 0x45, 0xBF, 0x46, 0xBF, 0x47, 0xBF, 0x48, /* 0x00-0x03 */
        0xBF, 0x49, 0xBF, 0x4A, 0xBF, 0x4B, 0xBF, 0x4C, /* 0x04-0x07 */
        0xBF, 0x4D, 0xBF, 0x4E, 0xBF, 0x4F, 0xBF, 0x50, /* 0x08-0x0B */
        0xBF, 0x51, 0xBF, 0x52, 0xBF, 0x53, 0xBF, 0x54, /* 0x0C-0x0F */
        0xBF, 0x55, 0xBF, 0x56, 0xBF, 0x57, 0xBF, 0x58, /* 0x10-0x13 */
        0xBF, 0x59, 0xBF, 0x5A, 0xBF, 0x5B, 0xBF, 0x5C, /* 0x14-0x17 */
        0xBF, 0x5D, 0xBF, 0x5E, 0xBF, 0x5F, 0xBF, 0x60, /* 0x18-0x1B */
        0xBF, 0x61, 0xBF, 0x62, 0xBF, 0x63, 0xBF, 0x64, /* 0x1C-0x1F */
        0xBF, 0x65, 0xBF, 0x66, 0xBF, 0x67, 0xBF, 0x68, /* 0x20-0x23 */
        0xBF, 0x69, 0xBF, 0x6A, 0xBF, 0x6B, 0xBF, 0x6C, /* 0x24-0x27 */
        0xBF, 0x6D, 0xBF, 0x6E, 0xBF, 0x6F, 0xBF, 0x70, /* 0x28-0x2B */
        0xBF, 0x71, 0xBF, 0x72, 0xBF, 0x73, 0xBF, 0x74, /* 0x2C-0x2F */
        0xBF, 0x75, 0xBF, 0x76, 0xBF, 0x77, 0xBF, 0x78, /* 0x30-0x33 */
        0xBF, 0x79, 0xBF, 0x7A, 0xBF, 0x7B, 0xBF, 0x7C, /* 0x34-0x37 */
        0xBF, 0x7D, 0xBF, 0x7E, 0xBF, 0x80, 0xF7, 0xE3, /* 0x38-0x3B */
        0xBF, 0x81, 0xBF, 0x82, 0xBF, 0x83, 0xBF, 0x84, /* 0x3C-0x3F */
        0xBF, 0x85, 0xB7, 0xB1, 0xBF, 0x86, 0xBF, 0x87, /* 0x40-0x43 */
        0xBF, 0x88, 0xBF, 0x89, 0xBF, 0x8A, 0xF4, 0xED, /* 0x44-0x47 */
        0xBF, 0x8B, 0xBF, 0x8C, 0xBF, 0x8D, 0xBF, 0x8E, /* 0x48-0x4B */
        0xBF, 0x8F, 0xBF, 0x90, 0xBF, 0x91, 0xBF, 0x92, /* 0x4C-0x4F */
        0xBF, 0x93, 0xBF, 0x94, 0xBF, 0x95, 0xBF, 0x96, /* 0x50-0x53 */
        0xBF, 0x97, 0xBF, 0x98, 0xBF, 0x99, 0xBF, 0x9A, /* 0x54-0x57 */
        0xBF, 0x9B, 0xBF, 0x9C, 0xBF, 0x9D, 0xBF, 0x9E, /* 0x58-0x5B */
        0xBF, 0x9F, 0xBF, 0xA0, 0xC0, 0x40, 0xC0, 0x41, /* 0x5C-0x5F */
        0xC0, 0x42, 0xC0, 0x43, 0xC0, 0x44, 0xC0, 0x45, /* 0x60-0x63 */
        0xC0, 0x46, 0xC0, 0x47, 0xC0, 0x48, 0xC0, 0x49, /* 0x64-0x67 */
        0xC0, 0x4A, 0xC0, 0x4B, 0xC0, 0x4C, 0xC0, 0x4D, /* 0x68-0x6B */
        0xC0, 0x4E, 0xC0, 0x4F, 0xC0, 0x50, 0xC0, 0x51, /* 0x6C-0x6F */
        0xC0, 0x52, 0xC0, 0x53, 0xC0, 0x54, 0xC0, 0x55, /* 0x70-0x73 */
        0xC0, 0x56, 0xC0, 0x57, 0xC0, 0x58, 0xC0, 0x59, /* 0x74-0x77 */
        0xC0, 0x5A, 0xC0, 0x5B, 0xC0, 0x5C, 0xC0, 0x5D, /* 0x78-0x7B */
        0xC0, 0x5E, 0xC0, 0x5F, 0xC0, 0x60, 0xC0, 0x61, /* 0x7C-0x7F */
        
        0xC0, 0x62, 0xC0, 0x63, 0xD7, 0xEB, 0xC0, 0x64, /* 0x80-0x83 */
        0xC0, 0x65, 0xC0, 0x66, 0xC0, 0x67, 0xC0, 0x68, /* 0x84-0x87 */
        0xC0, 0x69, 0xC0, 0x6A, 0xC0, 0x6B, 0xC0, 0x6C, /* 0x88-0x8B */
        0xC0, 0x6D, 0xC0, 0x6E, 0xC0, 0x6F, 0xC0, 0x70, /* 0x8C-0x8F */
        0xC0, 0x71, 0xC0, 0x72, 0xC0, 0x73, 0xC0, 0x74, /* 0x90-0x93 */
        0xC0, 0x75, 0xC0, 0x76, 0xC0, 0x77, 0xC0, 0x78, /* 0x94-0x97 */
        0xC0, 0x79, 0xC0, 0x7A, 0xC0, 0x7B, 0xF4, 0xEE, /* 0x98-0x9B */
        0xC0, 0x7C, 0xC0, 0x7D, 0xC0, 0x7E, 0xE6, 0xF9, /* 0x9C-0x9F */
        0xBE, 0xC0, 0xE6, 0xFA, 0xBA, 0xEC, 0xE6, 0xFB, /* 0xA0-0xA3 */
        0xCF, 0xCB, 0xE6, 0xFC, 0xD4, 0xBC, 0xBC, 0xB6, /* 0xA4-0xA7 */
        0xE6, 0xFD, 0xE6, 0xFE, 0xBC, 0xCD, 0xC8, 0xD2, /* 0xA8-0xAB */
        0xCE, 0xB3, 0xE7, 0xA1, 0xC0, 0x80, 0xB4, 0xBF, /* 0xAC-0xAF */
        0xE7, 0xA2, 0xC9, 0xB4, 0xB8, 0xD9, 0xC4, 0xC9, /* 0xB0-0xB3 */
        0xC0, 0x81, 0xD7, 0xDD, 0xC2, 0xDA, 0xB7, 0xD7, /* 0xB4-0xB7 */
        0xD6, 0xBD, 0xCE, 0xC6, 0xB7, 0xC4, 0xC0, 0x82, /* 0xB8-0xBB */
        0xC0, 0x83, 0xC5, 0xA6, 0xE7, 0xA3, 0xCF, 0xDF, /* 0xBC-0xBF */
        0xE7, 0xA4, 0xE7, 0xA5, 0xE7, 0xA6, 0xC1, 0xB7, /* 0xC0-0xC3 */
        0xD7, 0xE9, 0xC9, 0xF0, 0xCF, 0xB8, 0xD6, 0xAF, /* 0xC4-0xC7 */
        0xD6, 0xD5, 0xE7, 0xA7, 0xB0, 0xED, 0xE7, 0xA8, /* 0xC8-0xCB */
        0xE7, 0xA9, 0xC9, 0xDC, 0xD2, 0xEF, 0xBE, 0xAD, /* 0xCC-0xCF */
        0xE7, 0xAA, 0xB0, 0xF3, 0xC8, 0xDE, 0xBD, 0xE1, /* 0xD0-0xD3 */
        0xE7, 0xAB, 0xC8, 0xC6, 0xC0, 0x84, 0xE7, 0xAC, /* 0xD4-0xD7 */
        0xBB, 0xE6, 0xB8, 0xF8, 0xD1, 0xA4, 0xE7, 0xAD, /* 0xD8-0xDB */
        0xC2, 0xE7, 0xBE, 0xF8, 0xBD, 0xCA, 0xCD, 0xB3, /* 0xDC-0xDF */
        0xE7, 0xAE, 0xE7, 0xAF, 0xBE, 0xEE, 0xD0, 0xE5, /* 0xE0-0xE3 */
        0xC0, 0x85, 0xCB, 0xE7, 0xCC, 0xD0, 0xBC, 0xCC, /* 0xE4-0xE7 */
        0xE7, 0xB0, 0xBC, 0xA8, 0xD0, 0xF7, 0xE7, 0xB1, /* 0xE8-0xEB */
        0xC0, 0x86, 0xD0, 0xF8, 0xE7, 0xB2, 0xE7, 0xB3, /* 0xEC-0xEF */
        0xB4, 0xC2, 0xE7, 0xB4, 0xE7, 0xB5, 0xC9, 0xFE, /* 0xF0-0xF3 */
        0xCE, 0xAC, 0xC3, 0xE0, 0xE7, 0xB7, 0xB1, 0xC1, /* 0xF4-0xF7 */
        0xB3, 0xF1, 0xC0, 0x87, 0xE7, 0xB8, 0xE7, 0xB9, /* 0xF8-0xFB */
        0xD7, 0xDB, 0xD5, 0xC0, 0xE7, 0xBA, 0xC2, 0xCC, /* 0xFC-0xFF */
};

static const unsigned char u2c_7F[512] = {
        0xD7, 0xBA, 0xE7, 0xBB, 0xE7, 0xBC, 0xE7, 0xBD, /* 0x00-0x03 */
        0xBC, 0xEA, 0xC3, 0xE5, 0xC0, 0xC2, 0xE7, 0xBE, /* 0x04-0x07 */
        0xE7, 0xBF, 0xBC, 0xA9, 0xC0, 0x88, 0xE7, 0xC0, /* 0x08-0x0B */
        0xE7, 0xC1, 0xE7, 0xB6, 0xB6, 0xD0, 0xE7, 0xC2, /* 0x0C-0x0F */
        0xC0, 0x89, 0xE7, 0xC3, 0xE7, 0xC4, 0xBB, 0xBA, /* 0x10-0x13 */
        0xB5, 0xDE, 0xC2, 0xC6, 0xB1, 0xE0, 0xE7, 0xC5, /* 0x14-0x17 */
        0xD4, 0xB5, 0xE7, 0xC6, 0xB8, 0xBF, 0xE7, 0xC8, /* 0x18-0x1B */
        0xE7, 0xC7, 0xB7, 0xEC, 0xC0, 0x8A, 0xE7, 0xC9, /* 0x1C-0x1F */
        0xB2, 0xF8, 0xE7, 0xCA, 0xE7, 0xCB, 0xE7, 0xCC, /* 0x20-0x23 */
        0xE7, 0xCD, 0xE7, 0xCE, 0xE7, 0xCF, 0xE7, 0xD0, /* 0x24-0x27 */
        0xD3, 0xA7, 0xCB, 0xF5, 0xE7, 0xD1, 0xE7, 0xD2, /* 0x28-0x2B */
        0xE7, 0xD3, 0xE7, 0xD4, 0xC9, 0xC9, 0xE7, 0xD5, /* 0x2C-0x2F */
        0xE7, 0xD6, 0xE7, 0xD7, 0xE7, 0xD8, 0xE7, 0xD9, /* 0x30-0x33 */
        0xBD, 0xC9, 0xE7, 0xDA, 0xF3, 0xBE, 0xC0, 0x8B, /* 0x34-0x37 */
        0xB8, 0xD7, 0xC0, 0x8C, 0xC8, 0xB1, 0xC0, 0x8D, /* 0x38-0x3B */
        0xC0, 0x8E, 0xC0, 0x8F, 0xC0, 0x90, 0xC0, 0x91, /* 0x3C-0x3F */
        0xC0, 0x92, 0xC0, 0x93, 0xF3, 0xBF, 0xC0, 0x94, /* 0x40-0x43 */
        0xF3, 0xC0, 0xF3, 0xC1, 0xC0, 0x95, 0xC0, 0x96, /* 0x44-0x47 */
        0xC0, 0x97, 0xC0, 0x98, 0xC0, 0x99, 0xC0, 0x9A, /* 0x48-0x4B */
        0xC0, 0x9B, 0xC0, 0x9C, 0xC0, 0x9D, 0xC0, 0x9E, /* 0x4C-0x4F */
        0xB9, 0xDE, 0xCD, 0xF8, 0xC0, 0x9F, 0xC0, 0xA0, /* 0x50-0x53 */
        0xD8, 0xE8, 0xBA, 0xB1, 0xC1, 0x40, 0xC2, 0xDE, /* 0x54-0x57 */
        0xEE, 0xB7, 0xC1, 0x41, 0xB7, 0xA3, 0xC1, 0x42, /* 0x58-0x5B */
        0xC1, 0x43, 0xC1, 0x44, 0xC1, 0x45, 0xEE, 0xB9, /* 0x5C-0x5F */
        0xC1, 0x46, 0xEE, 0xB8, 0xB0, 0xD5, 0xC1, 0x47, /* 0x60-0x63 */
        0xC1, 0x48, 0xC1, 0x49, 0xC1, 0x4A, 0xC1, 0x4B, /* 0x64-0x67 */
        0xEE, 0xBB, 0xD5, 0xD6, 0xD7, 0xEF, 0xC1, 0x4C, /* 0x68-0x6B */
        0xC1, 0x4D, 0xC1, 0x4E, 0xD6, 0xC3, 0xC1, 0x4F, /* 0x6C-0x6F */
        0xC1, 0x50, 0xEE, 0xBD, 0xCA, 0xF0, 0xC1, 0x51, /* 0x70-0x73 */
        0xEE, 0xBC, 0xC1, 0x52, 0xC1, 0x53, 0xC1, 0x54, /* 0x74-0x77 */
        0xC1, 0x55, 0xEE, 0xBE, 0xC1, 0x56, 0xC1, 0x57, /* 0x78-0x7B */
        0xC1, 0x58, 0xC1, 0x59, 0xEE, 0xC0, 0xC1, 0x5A, /* 0x7C-0x7F */
        
        0xC1, 0x5B, 0xEE, 0xBF, 0xC1, 0x5C, 0xC1, 0x5D, /* 0x80-0x83 */
        0xC1, 0x5E, 0xC1, 0x5F, 0xC1, 0x60, 0xC1, 0x61, /* 0x84-0x87 */
        0xC1, 0x62, 0xC1, 0x63, 0xD1, 0xF2, 0xC1, 0x64, /* 0x88-0x8B */
        0xC7, 0xBC, 0xC1, 0x65, 0xC3, 0xC0, 0xC1, 0x66, /* 0x8C-0x8F */
        0xC1, 0x67, 0xC1, 0x68, 0xC1, 0x69, 0xC1, 0x6A, /* 0x90-0x93 */
        0xB8, 0xE1, 0xC1, 0x6B, 0xC1, 0x6C, 0xC1, 0x6D, /* 0x94-0x97 */
        0xC1, 0x6E, 0xC1, 0x6F, 0xC1, 0xE7, 0xC1, 0x70, /* 0x98-0x9B */
        0xC1, 0x71, 0xF4, 0xC6, 0xD0, 0xDF, 0xF4, 0xC7, /* 0x9C-0x9F */
        0xC1, 0x72, 0xCF, 0xDB, 0xC1, 0x73, 0xC1, 0x74, /* 0xA0-0xA3 */
        0xC8, 0xBA, 0xC1, 0x75, 0xC1, 0x76, 0xF4, 0xC8, /* 0xA4-0xA7 */
        0xC1, 0x77, 0xC1, 0x78, 0xC1, 0x79, 0xC1, 0x7A, /* 0xA8-0xAB */
        0xC1, 0x7B, 0xC1, 0x7C, 0xC1, 0x7D, 0xF4, 0xC9, /* 0xAC-0xAF */
        0xF4, 0xCA, 0xC1, 0x7E, 0xF4, 0xCB, 0xC1, 0x80, /* 0xB0-0xB3 */
        0xC1, 0x81, 0xC1, 0x82, 0xC1, 0x83, 0xC1, 0x84, /* 0xB4-0xB7 */
        0xD9, 0xFA, 0xB8, 0xFE, 0xC1, 0x85, 0xC1, 0x86, /* 0xB8-0xBB */
        0xE5, 0xF1, 0xD3, 0xF0, 0xC1, 0x87, 0xF4, 0xE0, /* 0xBC-0xBF */
        0xC1, 0x88, 0xCE, 0xCC, 0xC1, 0x89, 0xC1, 0x8A, /* 0xC0-0xC3 */
        0xC1, 0x8B, 0xB3, 0xE1, 0xC1, 0x8C, 0xC1, 0x8D, /* 0xC4-0xC7 */
        0xC1, 0x8E, 0xC1, 0x8F, 0xF1, 0xB4, 0xC1, 0x90, /* 0xC8-0xCB */
        0xD2, 0xEE, 0xC1, 0x91, 0xF4, 0xE1, 0xC1, 0x92, /* 0xCC-0xCF */
        0xC1, 0x93, 0xC1, 0x94, 0xC1, 0x95, 0xC1, 0x96, /* 0xD0-0xD3 */
        0xCF, 0xE8, 0xF4, 0xE2, 0xC1, 0x97, 0xC1, 0x98, /* 0xD4-0xD7 */
        0xC7, 0xCC, 0xC1, 0x99, 0xC1, 0x9A, 0xC1, 0x9B, /* 0xD8-0xDB */
        0xC1, 0x9C, 0xC1, 0x9D, 0xC1, 0x9E, 0xB5, 0xD4, /* 0xDC-0xDF */
        0xB4, 0xE4, 0xF4, 0xE4, 0xC1, 0x9F, 0xC1, 0xA0, /* 0xE0-0xE3 */
        0xC2, 0x40, 0xF4, 0xE3, 0xF4, 0xE5, 0xC2, 0x41, /* 0xE4-0xE7 */
        0xC2, 0x42, 0xF4, 0xE6, 0xC2, 0x43, 0xC2, 0x44, /* 0xE8-0xEB */
        0xC2, 0x45, 0xC2, 0x46, 0xF4, 0xE7, 0xC2, 0x47, /* 0xEC-0xEF */
        0xBA, 0xB2, 0xB0, 0xBF, 0xC2, 0x48, 0xF4, 0xE8, /* 0xF0-0xF3 */
        0xC2, 0x49, 0xC2, 0x4A, 0xC2, 0x4B, 0xC2, 0x4C, /* 0xF4-0xF7 */
        0xC2, 0x4D, 0xC2, 0x4E, 0xC2, 0x4F, 0xB7, 0xAD, /* 0xF8-0xFB */
        0xD2, 0xED, 0xC2, 0x50, 0xC2, 0x51, 0xC2, 0x52, /* 0xFC-0xFF */
};

static const unsigned char u2c_80[512] = {
        0xD2, 0xAB, 0xC0, 0xCF, 0xC2, 0x53, 0xBF, 0xBC, /* 0x00-0x03 */
        0xEB, 0xA3, 0xD5, 0xDF, 0xEA, 0xC8, 0xC2, 0x54, /* 0x04-0x07 */
        0xC2, 0x55, 0xC2, 0x56, 0xC2, 0x57, 0xF1, 0xF3, /* 0x08-0x0B */
        0xB6, 0xF8, 0xCB, 0xA3, 0xC2, 0x58, 0xC2, 0x59, /* 0x0C-0x0F */
        0xC4, 0xCD, 0xC2, 0x5A, 0xF1, 0xE7, 0xC2, 0x5B, /* 0x10-0x13 */
        0xF1, 0xE8, 0xB8, 0xFB, 0xF1, 0xE9, 0xBA, 0xC4, /* 0x14-0x17 */
        0xD4, 0xC5, 0xB0, 0xD2, 0xC2, 0x5C, 0xC2, 0x5D, /* 0x18-0x1B */
        0xF1, 0xEA, 0xC2, 0x5E, 0xC2, 0x5F, 0xC2, 0x60, /* 0x1C-0x1F */
        0xF1, 0xEB, 0xC2, 0x61, 0xF1, 0xEC, 0xC2, 0x62, /* 0x20-0x23 */
        0xC2, 0x63, 0xF1, 0xED, 0xF1, 0xEE, 0xF1, 0xEF, /* 0x24-0x27 */
        0xF1, 0xF1, 0xF1, 0xF0, 0xC5, 0xD5, 0xC2, 0x64, /* 0x28-0x2B */
        0xC2, 0x65, 0xC2, 0x66, 0xC2, 0x67, 0xC2, 0x68, /* 0x2C-0x2F */
        0xC2, 0x69, 0xF1, 0xF2, 0xC2, 0x6A, 0xB6, 0xFA, /* 0x30-0x33 */
        0xC2, 0x6B, 0xF1, 0xF4, 0xD2, 0xAE, 0xDE, 0xC7, /* 0x34-0x37 */
        0xCB, 0xCA, 0xC2, 0x6C, 0xC2, 0x6D, 0xB3, 0xDC, /* 0x38-0x3B */
        0xC2, 0x6E, 0xB5, 0xA2, 0xC2, 0x6F, 0xB9, 0xA2, /* 0x3C-0x3F */
        0xC2, 0x70, 0xC2, 0x71, 0xC4, 0xF4, 0xF1, 0xF5, /* 0x40-0x43 */
        0xC2, 0x72, 0xC2, 0x73, 0xF1, 0xF6, 0xC2, 0x74, /* 0x44-0x47 */
        0xC2, 0x75, 0xC2, 0x76, 0xC1, 0xC4, 0xC1, 0xFB, /* 0x48-0x4B */
        0xD6, 0xB0, 0xF1, 0xF7, 0xC2, 0x77, 0xC2, 0x78, /* 0x4C-0x4F */
        0xC2, 0x79, 0xC2, 0x7A, 0xF1, 0xF8, 0xC2, 0x7B, /* 0x50-0x53 */
        0xC1, 0xAA, 0xC2, 0x7C, 0xC2, 0x7D, 0xC2, 0x7E, /* 0x54-0x57 */
        0xC6, 0xB8, 0xC2, 0x80, 0xBE, 0xDB, 0xC2, 0x81, /* 0x58-0x5B */
        0xC2, 0x82, 0xC2, 0x83, 0xC2, 0x84, 0xC2, 0x85, /* 0x5C-0x5F */
        0xC2, 0x86, 0xC2, 0x87, 0xC2, 0x88, 0xC2, 0x89, /* 0x60-0x63 */
        0xC2, 0x8A, 0xC2, 0x8B, 0xC2, 0x8C, 0xC2, 0x8D, /* 0x64-0x67 */
        0xC2, 0x8E, 0xF1, 0xF9, 0xB4, 0xCF, 0xC2, 0x8F, /* 0x68-0x6B */
        0xC2, 0x90, 0xC2, 0x91, 0xC2, 0x92, 0xC2, 0x93, /* 0x6C-0x6F */
        0xC2, 0x94, 0xF1, 0xFA, 0xC2, 0x95, 0xC2, 0x96, /* 0x70-0x73 */
        0xC2, 0x97, 0xC2, 0x98, 0xC2, 0x99, 0xC2, 0x9A, /* 0x74-0x77 */
        0xC2, 0x9B, 0xC2, 0x9C, 0xC2, 0x9D, 0xC2, 0x9E, /* 0x78-0x7B */
        0xC2, 0x9F, 0xC2, 0xA0, 0xC3, 0x40, 0xED, 0xB2, /* 0x7C-0x7F */
        
        0xED, 0xB1, 0xC3, 0x41, 0xC3, 0x42, 0xCB, 0xE0, /* 0x80-0x83 */
        0xD2, 0xDE, 0xC3, 0x43, 0xCB, 0xC1, 0xD5, 0xD8, /* 0x84-0x87 */
        0xC3, 0x44, 0xC8, 0xE2, 0xC3, 0x45, 0xC0, 0xDF, /* 0x88-0x8B */
        0xBC, 0xA1, 0xC3, 0x46, 0xC3, 0x47, 0xC3, 0x48, /* 0x8C-0x8F */
        0xC3, 0x49, 0xC3, 0x4A, 0xC3, 0x4B, 0xEB, 0xC1, /* 0x90-0x93 */
        0xC3, 0x4C, 0xC3, 0x4D, 0xD0, 0xA4, 0xC3, 0x4E, /* 0x94-0x97 */
        0xD6, 0xE2, 0xC3, 0x4F, 0xB6, 0xC7, 0xB8, 0xD8, /* 0x98-0x9B */
        0xEB, 0xC0, 0xB8, 0xCE, 0xC3, 0x50, 0xEB, 0xBF, /* 0x9C-0x9F */
        0xB3, 0xA6, 0xB9, 0xC9, 0xD6, 0xAB, 0xC3, 0x51, /* 0xA0-0xA3 */
        0xB7, 0xF4, 0xB7, 0xCA, 0xC3, 0x52, 0xC3, 0x53, /* 0xA4-0xA7 */
        0xC3, 0x54, 0xBC, 0xE7, 0xB7, 0xBE, 0xEB, 0xC6, /* 0xA8-0xAB */
        0xC3, 0x55, 0xEB, 0xC7, 0xB0, 0xB9, 0xBF, 0xCF, /* 0xAC-0xAF */
        0xC3, 0x56, 0xEB, 0xC5, 0xD3, 0xFD, 0xC3, 0x57, /* 0xB0-0xB3 */
        0xEB, 0xC8, 0xC3, 0x58, 0xC3, 0x59, 0xEB, 0xC9, /* 0xB4-0xB7 */
        0xC3, 0x5A, 0xC3, 0x5B, 0xB7, 0xCE, 0xC3, 0x5C, /* 0xB8-0xBB */
        0xEB, 0xC2, 0xEB, 0xC4, 0xC9, 0xF6, 0xD6, 0xD7, /* 0xBC-0xBF */
        0xD5, 0xCD, 0xD0, 0xB2, 0xEB, 0xCF, 0xCE, 0xB8, /* 0xC0-0xC3 */
        0xEB, 0xD0, 0xC3, 0x5D, 0xB5, 0xA8, 0xC3, 0x5E, /* 0xC4-0xC7 */
        0xC3, 0x5F, 0xC3, 0x60, 0xC3, 0x61, 0xC3, 0x62, /* 0xC8-0xCB */
        0xB1, 0xB3, 0xEB, 0xD2, 0xCC, 0xA5, 0xC3, 0x63, /* 0xCC-0xCF */
        0xC3, 0x64, 0xC3, 0x65, 0xC3, 0x66, 0xC3, 0x67, /* 0xD0-0xD3 */
        0xC3, 0x68, 0xC3, 0x69, 0xC5, 0xD6, 0xEB, 0xD3, /* 0xD4-0xD7 */
        0xC3, 0x6A, 0xEB, 0xD1, 0xC5, 0xDF, 0xEB, 0xCE, /* 0xD8-0xDB */
        0xCA, 0xA4, 0xEB, 0xD5, 0xB0, 0xFB, 0xC3, 0x6B, /* 0xDC-0xDF */
        0xC3, 0x6C, 0xBA, 0xFA, 0xC3, 0x6D, 0xC3, 0x6E, /* 0xE0-0xE3 */
        0xD8, 0xB7, 0xF1, 0xE3, 0xC3, 0x6F, 0xEB, 0xCA, /* 0xE4-0xE7 */
        0xEB, 0xCB, 0xEB, 0xCC, 0xEB, 0xCD, 0xEB, 0xD6, /* 0xE8-0xEB */
        0xE6, 0xC0, 0xEB, 0xD9, 0xC3, 0x70, 0xBF, 0xE8, /* 0xEC-0xEF */
        0xD2, 0xC8, 0xEB, 0xD7, 0xEB, 0xDC, 0xB8, 0xEC, /* 0xF0-0xF3 */
        0xEB, 0xD8, 0xC3, 0x71, 0xBD, 0xBA, 0xC3, 0x72, /* 0xF4-0xF7 */
        0xD0, 0xD8, 0xC3, 0x73, 0xB0, 0xB7, 0xC3, 0x74, /* 0xF8-0xFB */
        0xEB, 0xDD, 0xC4, 0xDC, 0xC3, 0x75, 0xC3, 0x76, /* 0xFC-0xFF */
};

static const unsigned char u2c_81[512] = {
        0xC3, 0x77, 0xC3, 0x78, 0xD6, 0xAC, 0xC3, 0x79, /* 0x00-0x03 */
        0xC3, 0x7A, 0xC3, 0x7B, 0xB4, 0xE0, 0xC3, 0x7C, /* 0x04-0x07 */
        0xC3, 0x7D, 0xC2, 0xF6, 0xBC, 0xB9, 0xC3, 0x7E, /* 0x08-0x0B */
        0xC3, 0x80, 0xEB, 0xDA, 0xEB, 0xDB, 0xD4, 0xE0, /* 0x0C-0x0F */
        0xC6, 0xEA, 0xC4, 0xD4, 0xEB, 0xDF, 0xC5, 0xA7, /* 0x10-0x13 */
        0xD9, 0xF5, 0xC3, 0x81, 0xB2, 0xB1, 0xC3, 0x82, /* 0x14-0x17 */
        0xEB, 0xE4, 0xC3, 0x83, 0xBD, 0xC5, 0xC3, 0x84, /* 0x18-0x1B */
        0xC3, 0x85, 0xC3, 0x86, 0xEB, 0xE2, 0xC3, 0x87, /* 0x1C-0x1F */
        0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A, 0xC3, 0x8B, /* 0x20-0x23 */
        0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, 0xC3, 0x8F, /* 0x24-0x27 */
        0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92, 0xC3, 0x93, /* 0x28-0x2B */
        0xEB, 0xE3, 0xC3, 0x94, 0xC3, 0x95, 0xB8, 0xAC, /* 0x2C-0x2F */
        0xC3, 0x96, 0xCD, 0xD1, 0xEB, 0xE5, 0xC3, 0x97, /* 0x30-0x33 */
        0xC3, 0x98, 0xC3, 0x99, 0xEB, 0xE1, 0xC3, 0x9A, /* 0x34-0x37 */
        0xC1, 0xB3, 0xC3, 0x9B, 0xC3, 0x9C, 0xC3, 0x9D, /* 0x38-0x3B */
        0xC3, 0x9E, 0xC3, 0x9F, 0xC6, 0xA2, 0xC3, 0xA0, /* 0x3C-0x3F */
        0xC4, 0x40, 0xC4, 0x41, 0xC4, 0x42, 0xC4, 0x43, /* 0x40-0x43 */
        0xC4, 0x44, 0xC4, 0x45, 0xCC, 0xF3, 0xC4, 0x46, /* 0x44-0x47 */
        0xEB, 0xE6, 0xC4, 0x47, 0xC0, 0xB0, 0xD2, 0xB8, /* 0x48-0x4B */
        0xEB, 0xE7, 0xC4, 0x48, 0xC4, 0x49, 0xC4, 0x4A, /* 0x4C-0x4F */
        0xB8, 0xAF, 0xB8, 0xAD, 0xC4, 0x4B, 0xEB, 0xE8, /* 0x50-0x53 */
        0xC7, 0xBB, 0xCD, 0xF3, 0xC4, 0x4C, 0xC4, 0x4D, /* 0x54-0x57 */
        0xC4, 0x4E, 0xEB, 0xEA, 0xEB, 0xEB, 0xC4, 0x4F, /* 0x58-0x5B */
        0xC4, 0x50, 0xC4, 0x51, 0xC4, 0x52, 0xC4, 0x53, /* 0x5C-0x5F */
        0xEB, 0xED, 0xC4, 0x54, 0xC4, 0x55, 0xC4, 0x56, /* 0x60-0x63 */
        0xC4, 0x57, 0xD0, 0xC8, 0xC4, 0x58, 0xEB, 0xF2, /* 0x64-0x67 */
        0xC4, 0x59, 0xEB, 0xEE, 0xC4, 0x5A, 0xC4, 0x5B, /* 0x68-0x6B */
        0xC4, 0x5C, 0xEB, 0xF1, 0xC8, 0xF9, 0xC4, 0x5D, /* 0x6C-0x6F */
        0xD1, 0xFC, 0xEB, 0xEC, 0xC4, 0x5E, 0xC4, 0x5F, /* 0x70-0x73 */
        0xEB, 0xE9, 0xC4, 0x60, 0xC4, 0x61, 0xC4, 0x62, /* 0x74-0x77 */
        0xC4, 0x63, 0xB8, 0xB9, 0xCF, 0xD9, 0xC4, 0xE5, /* 0x78-0x7B */
        0xEB, 0xEF, 0xEB, 0xF0, 0xCC, 0xDA, 0xCD, 0xC8, /* 0x7C-0x7F */
        
        0xB0, 0xF2, 0xC4, 0x64, 0xEB, 0xF6, 0xC4, 0x65, /* 0x80-0x83 */
        0xC4, 0x66, 0xC4, 0x67, 0xC4, 0x68, 0xC4, 0x69, /* 0x84-0x87 */
        0xEB, 0xF5, 0xC4, 0x6A, 0xB2, 0xB2, 0xC4, 0x6B, /* 0x88-0x8B */
        0xC4, 0x6C, 0xC4, 0x6D, 0xC4, 0x6E, 0xB8, 0xE0, /* 0x8C-0x8F */
        0xC4, 0x6F, 0xEB, 0xF7, 0xC4, 0x70, 0xC4, 0x71, /* 0x90-0x93 */
        0xC4, 0x72, 0xC4, 0x73, 0xC4, 0x74, 0xC4, 0x75, /* 0x94-0x97 */
        0xB1, 0xEC, 0xC4, 0x76, 0xC4, 0x77, 0xCC, 0xC5, /* 0x98-0x9B */
        0xC4, 0xA4, 0xCF, 0xA5, 0xC4, 0x78, 0xC4, 0x79, /* 0x9C-0x9F */
        0xC4, 0x7A, 0xC4, 0x7B, 0xC4, 0x7C, 0xEB, 0xF9, /* 0xA0-0xA3 */
        0xC4, 0x7D, 0xC4, 0x7E, 0xEC, 0xA2, 0xC4, 0x80, /* 0xA4-0xA7 */
        0xC5, 0xF2, 0xC4, 0x81, 0xEB, 0xFA, 0xC4, 0x82, /* 0xA8-0xAB */
        0xC4, 0x83, 0xC4, 0x84, 0xC4, 0x85, 0xC4, 0x86, /* 0xAC-0xAF */
        0xC4, 0x87, 0xC4, 0x88, 0xC4, 0x89, 0xC9, 0xC5, /* 0xB0-0xB3 */
        0xC4, 0x8A, 0xC4, 0x8B, 0xC4, 0x8C, 0xC4, 0x8D, /* 0xB4-0xB7 */
        0xC4, 0x8E, 0xC4, 0x8F, 0xE2, 0xDF, 0xEB, 0xFE, /* 0xB8-0xBB */
        0xC4, 0x90, 0xC4, 0x91, 0xC4, 0x92, 0xC4, 0x93, /* 0xBC-0xBF */
        0xCD, 0xCE, 0xEC, 0xA1, 0xB1, 0xDB, 0xD3, 0xB7, /* 0xC0-0xC3 */
        0xC4, 0x94, 0xC4, 0x95, 0xD2, 0xDC, 0xC4, 0x96, /* 0xC4-0xC7 */
        0xC4, 0x97, 0xC4, 0x98, 0xEB, 0xFD, 0xC4, 0x99, /* 0xC8-0xCB */
        0xEB, 0xFB, 0xC4, 0x9A, 0xC4, 0x9B, 0xC4, 0x9C, /* 0xCC-0xCF */
        0xC4, 0x9D, 0xC4, 0x9E, 0xC4, 0x9F, 0xC4, 0xA0, /* 0xD0-0xD3 */
        0xC5, 0x40, 0xC5, 0x41, 0xC5, 0x42, 0xC5, 0x43, /* 0xD4-0xD7 */
        0xC5, 0x44, 0xC5, 0x45, 0xC5, 0x46, 0xC5, 0x47, /* 0xD8-0xDB */
        0xC5, 0x48, 0xC5, 0x49, 0xC5, 0x4A, 0xC5, 0x4B, /* 0xDC-0xDF */
        0xC5, 0x4C, 0xC5, 0x4D, 0xC5, 0x4E, 0xB3, 0xBC, /* 0xE0-0xE3 */
        0xC5, 0x4F, 0xC5, 0x50, 0xC5, 0x51, 0xEA, 0xB0, /* 0xE4-0xE7 */
        0xC5, 0x52, 0xC5, 0x53, 0xD7, 0xD4, 0xC5, 0x54, /* 0xE8-0xEB */
        0xF4, 0xAB, 0xB3, 0xF4, 0xC5, 0x55, 0xC5, 0x56, /* 0xEC-0xEF */
        0xC5, 0x57, 0xC5, 0x58, 0xC5, 0x59, 0xD6, 0xC1, /* 0xF0-0xF3 */
        0xD6, 0xC2, 0xC5, 0x5A, 0xC5, 0x5B, 0xC5, 0x5C, /* 0xF4-0xF7 */
        0xC5, 0x5D, 0xC5, 0x5E, 0xC5, 0x5F, 0xD5, 0xE9, /* 0xF8-0xFB */
        0xBE, 0xCA, 0xC5, 0x60, 0xF4, 0xA7, 0xC5, 0x61, /* 0xFC-0xFF */
};

static const unsigned char u2c_82[512] = {
        0xD2, 0xA8, 0xF4, 0xA8, 0xF4, 0xA9, 0xC5, 0x62, /* 0x00-0x03 */
        0xF4, 0xAA, 0xBE, 0xCB, 0xD3, 0xDF, 0xC5, 0x63, /* 0x04-0x07 */
        0xC5, 0x64, 0xC5, 0x65, 0xC5, 0x66, 0xC5, 0x67, /* 0x08-0x0B */
        0xC9, 0xE0, 0xC9, 0xE1, 0xC5, 0x68, 0xC5, 0x69, /* 0x0C-0x0F */
        0xF3, 0xC2, 0xC5, 0x6A, 0xCA, 0xE6, 0xC5, 0x6B, /* 0x10-0x13 */
        0xCC, 0xF2, 0xC5, 0x6C, 0xC5, 0x6D, 0xC5, 0x6E, /* 0x14-0x17 */
        0xC5, 0x6F, 0xC5, 0x70, 0xC5, 0x71, 0xE2, 0xB6, /* 0x18-0x1B */
        0xCB, 0xB4, 0xC5, 0x72, 0xCE, 0xE8, 0xD6, 0xDB, /* 0x1C-0x1F */
        0xC5, 0x73, 0xF4, 0xAD, 0xF4, 0xAE, 0xF4, 0xAF, /* 0x20-0x23 */
        0xC5, 0x74, 0xC5, 0x75, 0xC5, 0x76, 0xC5, 0x77, /* 0x24-0x27 */
        0xF4, 0xB2, 0xC5, 0x78, 0xBA, 0xBD, 0xF4, 0xB3, /* 0x28-0x2B */
        0xB0, 0xE3, 0xF4, 0xB0, 0xC5, 0x79, 0xF4, 0xB1, /* 0x2C-0x2F */
        0xBD, 0xA2, 0xB2, 0xD5, 0xC5, 0x7A, 0xF4, 0xB6, /* 0x30-0x33 */
        0xF4, 0xB7, 0xB6, 0xE6, 0xB2, 0xB0, 0xCF, 0xCF, /* 0x34-0x37 */
        0xF4, 0xB4, 0xB4, 0xAC, 0xC5, 0x7B, 0xF4, 0xB5, /* 0x38-0x3B */
        0xC5, 0x7C, 0xC5, 0x7D, 0xF4, 0xB8, 0xC5, 0x7E, /* 0x3C-0x3F */
        0xC5, 0x80, 0xC5, 0x81, 0xC5, 0x82, 0xC5, 0x83, /* 0x40-0x43 */
        0xF4, 0xB9, 0xC5, 0x84, 0xC5, 0x85, 0xCD, 0xA7, /* 0x44-0x47 */
        0xC5, 0x86, 0xF4, 0xBA, 0xC5, 0x87, 0xF4, 0xBB, /* 0x48-0x4B */
        0xC5, 0x88, 0xC5, 0x89, 0xC5, 0x8A, 0xF4, 0xBC, /* 0x4C-0x4F */
        0xC5, 0x8B, 0xC5, 0x8C, 0xC5, 0x8D, 0xC5, 0x8E, /* 0x50-0x53 */
        0xC5, 0x8F, 0xC5, 0x90, 0xC5, 0x91, 0xC5, 0x92, /* 0x54-0x57 */
        0xCB, 0xD2, 0xC5, 0x93, 0xF4, 0xBD, 0xC5, 0x94, /* 0x58-0x5B */
        0xC5, 0x95, 0xC5, 0x96, 0xC5, 0x97, 0xF4, 0xBE, /* 0x5C-0x5F */
        0xC5, 0x98, 0xC5, 0x99, 0xC5, 0x9A, 0xC5, 0x9B, /* 0x60-0x63 */
        0xC5, 0x9C, 0xC5, 0x9D, 0xC5, 0x9E, 0xC5, 0x9F, /* 0x64-0x67 */
        0xF4, 0xBF, 0xC5, 0xA0, 0xC6, 0x40, 0xC6, 0x41, /* 0x68-0x6B */
        0xC6, 0x42, 0xC6, 0x43, 0xF4, 0xDE, 0xC1, 0xBC, /* 0x6C-0x6F */
        0xBC, 0xE8, 0xC6, 0x44, 0xC9, 0xAB, 0xD1, 0xDE, /* 0x70-0x73 */
        0xE5, 0xF5, 0xC6, 0x45, 0xC6, 0x46, 0xC6, 0x47, /* 0x74-0x77 */
        0xC6, 0x48, 0xDC, 0xB3, 0xD2, 0xD5, 0xC6, 0x49, /* 0x78-0x7B */
        0xC6, 0x4A, 0xDC, 0xB4, 0xB0, 0xAC, 0xDC, 0xB5, /* 0x7C-0x7F */
        
        0xC6, 0x4B, 0xC6, 0x4C, 0xBD, 0xDA, 0xC6, 0x4D, /* 0x80-0x83 */
        0xDC, 0xB9, 0xC6, 0x4E, 0xC6, 0x4F, 0xC6, 0x50, /* 0x84-0x87 */
        0xD8, 0xC2, 0xC6, 0x51, 0xDC, 0xB7, 0xD3, 0xF3, /* 0x88-0x8B */
        0xC6, 0x52, 0xC9, 0xD6, 0xDC, 0xBA, 0xDC, 0xB6, /* 0x8C-0x8F */
        0xC6, 0x53, 0xDC, 0xBB, 0xC3, 0xA2, 0xC6, 0x54, /* 0x90-0x93 */
        0xC6, 0x55, 0xC6, 0x56, 0xC6, 0x57, 0xDC, 0xBC, /* 0x94-0x97 */
        0xDC, 0xC5, 0xDC, 0xBD, 0xC6, 0x58, 0xC6, 0x59, /* 0x98-0x9B */
        0xCE, 0xDF, 0xD6, 0xA5, 0xC6, 0x5A, 0xDC, 0xCF, /* 0x9C-0x9F */
        0xC6, 0x5B, 0xDC, 0xCD, 0xC6, 0x5C, 0xC6, 0x5D, /* 0xA0-0xA3 */
        0xDC, 0xD2, 0xBD, 0xE6, 0xC2, 0xAB, 0xC6, 0x5E, /* 0xA4-0xA7 */
        0xDC, 0xB8, 0xDC, 0xCB, 0xDC, 0xCE, 0xDC, 0xBE, /* 0xA8-0xAB */
        0xB7, 0xD2, 0xB0, 0xC5, 0xDC, 0xC7, 0xD0, 0xBE, /* 0xAC-0xAF */
        0xDC, 0xC1, 0xBB, 0xA8, 0xC6, 0x5F, 0xB7, 0xBC, /* 0xB0-0xB3 */
        0xDC, 0xCC, 0xC6, 0x60, 0xC6, 0x61, 0xDC, 0xC6, /* 0xB4-0xB7 */
        0xDC, 0xBF, 0xC7, 0xDB, 0xC6, 0x62, 0xC6, 0x63, /* 0xB8-0xBB */
        0xC6, 0x64, 0xD1, 0xBF, 0xDC, 0xC0, 0xC6, 0x65, /* 0xBC-0xBF */
        0xC6, 0x66, 0xDC, 0xCA, 0xC6, 0x67, 0xC6, 0x68, /* 0xC0-0xC3 */
        0xDC, 0xD0, 0xC6, 0x69, 0xC6, 0x6A, 0xCE, 0xAD, /* 0xC4-0xC7 */
        0xDC, 0xC2, 0xC6, 0x6B, 0xDC, 0xC3, 0xDC, 0xC8, /* 0xC8-0xCB */
        0xDC, 0xC9, 0xB2, 0xD4, 0xDC, 0xD1, 0xCB, 0xD5, /* 0xCC-0xCF */
        0xC6, 0x6C, 0xD4, 0xB7, 0xDC, 0xDB, 0xDC, 0xDF, /* 0xD0-0xD3 */
        0xCC, 0xA6, 0xDC, 0xE6, 0xC6, 0x6D, 0xC3, 0xE7, /* 0xD4-0xD7 */
        0xDC, 0xDC, 0xC6, 0x6E, 0xC6, 0x6F, 0xBF, 0xC1, /* 0xD8-0xDB */
        0xDC, 0xD9, 0xC6, 0x70, 0xB0, 0xFA, 0xB9, 0xB6, /* 0xDC-0xDF */
        0xDC, 0xE5, 0xDC, 0xD3, 0xC6, 0x71, 0xDC, 0xC4, /* 0xE0-0xE3 */
        0xDC, 0xD6, 0xC8, 0xF4, 0xBF, 0xE0, 0xC6, 0x72, /* 0xE4-0xE7 */
        0xC6, 0x73, 0xC6, 0x74, 0xC6, 0x75, 0xC9, 0xBB, /* 0xE8-0xEB */
        0xC6, 0x76, 0xC6, 0x77, 0xC6, 0x78, 0xB1, 0xBD, /* 0xEC-0xEF */
        0xC6, 0x79, 0xD3, 0xA2, 0xC6, 0x7A, 0xC6, 0x7B, /* 0xF0-0xF3 */
        0xDC, 0xDA, 0xC6, 0x7C, 0xC6, 0x7D, 0xDC, 0xD5, /* 0xF4-0xF7 */
        0xC6, 0x7E, 0xC6, 0xBB, 0xC6, 0x80, 0xDC, 0xDE, /* 0xF8-0xFB */
        0xC6, 0x81, 0xC6, 0x82, 0xC6, 0x83, 0xC6, 0x84, /* 0xFC-0xFF */
};

static const unsigned char u2c_83[512] = {
        0xC6, 0x85, 0xD7, 0xC2, 0xC3, 0xAF, 0xB7, 0xB6, /* 0x00-0x03 */
        0xC7, 0xD1, 0xC3, 0xA9, 0xDC, 0xE2, 0xDC, 0xD8, /* 0x04-0x07 */
        0xDC, 0xEB, 0xDC, 0xD4, 0xC6, 0x86, 0xC6, 0x87, /* 0x08-0x0B */
        0xDC, 0xDD, 0xC6, 0x88, 0xBE, 0xA5, 0xDC, 0xD7, /* 0x0C-0x0F */
        0xC6, 0x89, 0xDC, 0xE0, 0xC6, 0x8A, 0xC6, 0x8B, /* 0x10-0x13 */
        0xDC, 0xE3, 0xDC, 0xE4, 0xC6, 0x8C, 0xDC, 0xF8, /* 0x14-0x17 */
        0xC6, 0x8D, 0xC6, 0x8E, 0xDC, 0xE1, 0xDD, 0xA2, /* 0x18-0x1B */
        0xDC, 0xE7, 0xC6, 0x8F, 0xC6, 0x90, 0xC6, 0x91, /* 0x1C-0x1F */
        0xC6, 0x92, 0xC6, 0x93, 0xC6, 0x94, 0xC6, 0x95, /* 0x20-0x23 */
        0xC6, 0x96, 0xC6, 0x97, 0xC6, 0x98, 0xBC, 0xEB, /* 0x24-0x27 */
        0xB4, 0xC4, 0xC6, 0x99, 0xC6, 0x9A, 0xC3, 0xA3, /* 0x28-0x2B */
        0xB2, 0xE7, 0xDC, 0xFA, 0xC6, 0x9B, 0xDC, 0xF2, /* 0x2C-0x2F */
        0xC6, 0x9C, 0xDC, 0xEF, 0xC6, 0x9D, 0xDC, 0xFC, /* 0x30-0x33 */
        0xDC, 0xEE, 0xD2, 0xF0, 0xB2, 0xE8, 0xC6, 0x9E, /* 0x34-0x37 */
        0xC8, 0xD7, 0xC8, 0xE3, 0xDC, 0xFB, 0xC6, 0x9F, /* 0x38-0x3B */
        0xDC, 0xED, 0xC6, 0xA0, 0xC7, 0x40, 0xC7, 0x41, /* 0x3C-0x3F */
        0xDC, 0xF7, 0xC7, 0x42, 0xC7, 0x43, 0xDC, 0xF5, /* 0x40-0x43 */
        0xC7, 0x44, 0xC7, 0x45, 0xBE, 0xA3, 0xDC, 0xF4, /* 0x44-0x47 */
        0xC7, 0x46, 0xB2, 0xDD, 0xC7, 0x47, 0xC7, 0x48, /* 0x48-0x4B */
        0xC7, 0x49, 0xC7, 0x4A, 0xC7, 0x4B, 0xDC, 0xF3, /* 0x4C-0x4F */
        0xBC, 0xF6, 0xDC, 0xE8, 0xBB, 0xC4, 0xC7, 0x4C, /* 0x50-0x53 */
        0xC0, 0xF3, 0xC7, 0x4D, 0xC7, 0x4E, 0xC7, 0x4F, /* 0x54-0x57 */
        0xC7, 0x50, 0xC7, 0x51, 0xBC, 0xD4, 0xDC, 0xE9, /* 0x58-0x5B */
        0xDC, 0xEA, 0xC7, 0x52, 0xDC, 0xF1, 0xDC, 0xF6, /* 0x5C-0x5F */
        0xDC, 0xF9, 0xB5, 0xB4, 0xC7, 0x53, 0xC8, 0xD9, /* 0x60-0x63 */
        0xBB, 0xE7, 0xDC, 0xFE, 0xDC, 0xFD, 0xD3, 0xAB, /* 0x64-0x67 */
        0xDD, 0xA1, 0xDD, 0xA3, 0xDD, 0xA5, 0xD2, 0xF1, /* 0x68-0x6B */
        0xDD, 0xA4, 0xDD, 0xA6, 0xDD, 0xA7, 0xD2, 0xA9, /* 0x6C-0x6F */
        0xC7, 0x54, 0xC7, 0x55, 0xC7, 0x56, 0xC7, 0x57, /* 0x70-0x73 */
        0xC7, 0x58, 0xC7, 0x59, 0xC7, 0x5A, 0xBA, 0xC9, /* 0x74-0x77 */
        0xDD, 0xA9, 0xC7, 0x5B, 0xC7, 0x5C, 0xDD, 0xB6, /* 0x78-0x7B */
        0xDD, 0xB1, 0xDD, 0xB4, 0xC7, 0x5D, 0xC7, 0x5E, /* 0x7C-0x7F */
        
        0xC7, 0x5F, 0xC7, 0x60, 0xC7, 0x61, 0xC7, 0x62, /* 0x80-0x83 */
        0xC7, 0x63, 0xDD, 0xB0, 0xC6, 0xCE, 0xC7, 0x64, /* 0x84-0x87 */
        0xC7, 0x65, 0xC0, 0xF2, 0xC7, 0x66, 0xC7, 0x67, /* 0x88-0x8B */
        0xC7, 0x68, 0xC7, 0x69, 0xC9, 0xAF, 0xC7, 0x6A, /* 0x8C-0x8F */
        0xC7, 0x6B, 0xC7, 0x6C, 0xDC, 0xEC, 0xDD, 0xAE, /* 0x90-0x93 */
        0xC7, 0x6D, 0xC7, 0x6E, 0xC7, 0x6F, 0xC7, 0x70, /* 0x94-0x97 */
        0xDD, 0xB7, 0xC7, 0x71, 0xC7, 0x72, 0xDC, 0xF0, /* 0x98-0x9B */
        0xDD, 0xAF, 0xC7, 0x73, 0xDD, 0xB8, 0xC7, 0x74, /* 0x9C-0x9F */
        0xDD, 0xAC, 0xC7, 0x75, 0xC7, 0x76, 0xC7, 0x77, /* 0xA0-0xA3 */
        0xC7, 0x78, 0xC7, 0x79, 0xC7, 0x7A, 0xC7, 0x7B, /* 0xA4-0xA7 */
        0xDD, 0xB9, 0xDD, 0xB3, 0xDD, 0xAD, 0xC4, 0xAA, /* 0xA8-0xAB */
        0xC7, 0x7C, 0xC7, 0x7D, 0xC7, 0x7E, 0xC7, 0x80, /* 0xAC-0xAF */
        0xDD, 0xA8, 0xC0, 0xB3, 0xC1, 0xAB, 0xDD, 0xAA, /* 0xB0-0xB3 */
        0xDD, 0xAB, 0xC7, 0x81, 0xDD, 0xB2, 0xBB, 0xF1, /* 0xB4-0xB7 */
        0xDD, 0xB5, 0xD3, 0xA8, 0xDD, 0xBA, 0xC7, 0x82, /* 0xB8-0xBB */
        0xDD, 0xBB, 0xC3, 0xA7, 0xC7, 0x83, 0xC7, 0x84, /* 0xBC-0xBF */
        0xDD, 0xD2, 0xDD, 0xBC, 0xC7, 0x85, 0xC7, 0x86, /* 0xC0-0xC3 */
        0xC7, 0x87, 0xDD, 0xD1, 0xC7, 0x88, 0xB9, 0xBD, /* 0xC4-0xC7 */
        0xC7, 0x89, 0xC7, 0x8A, 0xBE, 0xD5, 0xC7, 0x8B, /* 0xC8-0xCB */
        0xBE, 0xFA, 0xC7, 0x8C, 0xC7, 0x8D, 0xBA, 0xCA, /* 0xCC-0xCF */
        0xC7, 0x8E, 0xC7, 0x8F, 0xC7, 0x90, 0xC7, 0x91, /* 0xD0-0xD3 */
        0xDD, 0xCA, 0xC7, 0x92, 0xDD, 0xC5, 0xC7, 0x93, /* 0xD4-0xD7 */
        0xDD, 0xBF, 0xC7, 0x94, 0xC7, 0x95, 0xC7, 0x96, /* 0xD8-0xDB */
        0xB2, 0xCB, 0xDD, 0xC3, 0xC7, 0x97, 0xDD, 0xCB, /* 0xDC-0xDF */
        0xB2, 0xA4, 0xDD, 0xD5, 0xC7, 0x98, 0xC7, 0x99, /* 0xE0-0xE3 */
        0xC7, 0x9A, 0xDD, 0xBE, 0xC7, 0x9B, 0xC7, 0x9C, /* 0xE4-0xE7 */
        0xC7, 0x9D, 0xC6, 0xD0, 0xDD, 0xD0, 0xC7, 0x9E, /* 0xE8-0xEB */
        0xC7, 0x9F, 0xC7, 0xA0, 0xC8, 0x40, 0xC8, 0x41, /* 0xEC-0xEF */
        0xDD, 0xD4, 0xC1, 0xE2, 0xB7, 0xC6, 0xC8, 0x42, /* 0xF0-0xF3 */
        0xC8, 0x43, 0xC8, 0x44, 0xC8, 0x45, 0xC8, 0x46, /* 0xF4-0xF7 */
        0xDD, 0xCE, 0xDD, 0xCF, 0xC8, 0x47, 0xC8, 0x48, /* 0xF8-0xFB */
        0xC8, 0x49, 0xDD, 0xC4, 0xC8, 0x4A, 0xC8, 0x4B, /* 0xFC-0xFF */
};

static const unsigned char u2c_84[512] = {
        0xC8, 0x4C, 0xDD, 0xBD, 0xC8, 0x4D, 0xDD, 0xCD, /* 0x00-0x03 */
        0xCC, 0xD1, 0xC8, 0x4E, 0xDD, 0xC9, 0xC8, 0x4F, /* 0x04-0x07 */
        0xC8, 0x50, 0xC8, 0x51, 0xC8, 0x52, 0xDD, 0xC2, /* 0x08-0x0B */
        0xC3, 0xC8, 0xC6, 0xBC, 0xCE, 0xAE, 0xDD, 0xCC, /* 0x0C-0x0F */
        0xC8, 0x53, 0xDD, 0xC8, 0xC8, 0x54, 0xC8, 0x55, /* 0x10-0x13 */
        0xC8, 0x56, 0xC8, 0x57, 0xC8, 0x58, 0xC8, 0x59, /* 0x14-0x17 */
        0xDD, 0xC1, 0xC8, 0x5A, 0xC8, 0x5B, 0xC8, 0x5C, /* 0x18-0x1B */
        0xDD, 0xC6, 0xC2, 0xDC, 0xC8, 0x5D, 0xC8, 0x5E, /* 0x1C-0x1F */
        0xC8, 0x5F, 0xC8, 0x60, 0xC8, 0x61, 0xC8, 0x62, /* 0x20-0x23 */
        0xD3, 0xA9, 0xD3, 0xAA, 0xDD, 0xD3, 0xCF, 0xF4, /* 0x24-0x27 */
        0xC8, 0xF8, 0xC8, 0x63, 0xC8, 0x64, 0xC8, 0x65, /* 0x28-0x2B */
        0xC8, 0x66, 0xC8, 0x67, 0xC8, 0x68, 0xC8, 0x69, /* 0x2C-0x2F */
        0xC8, 0x6A, 0xDD, 0xE6, 0xC8, 0x6B, 0xC8, 0x6C, /* 0x30-0x33 */
        0xC8, 0x6D, 0xC8, 0x6E, 0xC8, 0x6F, 0xC8, 0x70, /* 0x34-0x37 */
        0xDD, 0xC7, 0xC8, 0x71, 0xC8, 0x72, 0xC8, 0x73, /* 0x38-0x3B */
        0xDD, 0xE0, 0xC2, 0xE4, 0xC8, 0x74, 0xC8, 0x75, /* 0x3C-0x3F */
        0xC8, 0x76, 0xC8, 0x77, 0xC8, 0x78, 0xC8, 0x79, /* 0x40-0x43 */
        0xC8, 0x7A, 0xC8, 0x7B, 0xDD, 0xE1, 0xC8, 0x7C, /* 0x44-0x47 */
        0xC8, 0x7D, 0xC8, 0x7E, 0xC8, 0x80, 0xC8, 0x81, /* 0x48-0x4B */
        0xC8, 0x82, 0xC8, 0x83, 0xC8, 0x84, 0xC8, 0x85, /* 0x4C-0x4F */
        0xC8, 0x86, 0xDD, 0xD7, 0xC8, 0x87, 0xC8, 0x88, /* 0x50-0x53 */
        0xC8, 0x89, 0xC8, 0x8A, 0xC8, 0x8B, 0xD6, 0xF8, /* 0x54-0x57 */
        0xC8, 0x8C, 0xDD, 0xD9, 0xDD, 0xD8, 0xB8, 0xF0, /* 0x58-0x5B */
        0xDD, 0xD6, 0xC8, 0x8D, 0xC8, 0x8E, 0xC8, 0x8F, /* 0x5C-0x5F */
        0xC8, 0x90, 0xC6, 0xCF, 0xC8, 0x91, 0xB6, 0xAD, /* 0x60-0x63 */
        0xC8, 0x92, 0xC8, 0x93, 0xC8, 0x94, 0xC8, 0x95, /* 0x64-0x67 */
        0xC8, 0x96, 0xDD, 0xE2, 0xC8, 0x97, 0xBA, 0xF9, /* 0x68-0x6B */
        0xD4, 0xE1, 0xDD, 0xE7, 0xC8, 0x98, 0xC8, 0x99, /* 0x6C-0x6F */
        0xC8, 0x9A, 0xB4, 0xD0, 0xC8, 0x9B, 0xDD, 0xDA, /* 0x70-0x73 */
        0xC8, 0x9C, 0xBF, 0xFB, 0xDD, 0xE3, 0xC8, 0x9D, /* 0x74-0x77 */
        0xDD, 0xDF, 0xC8, 0x9E, 0xDD, 0xDD, 0xC8, 0x9F, /* 0x78-0x7B */
        0xC8, 0xA0, 0xC9, 0x40, 0xC9, 0x41, 0xC9, 0x42, /* 0x7C-0x7F */
        
        0xC9, 0x43, 0xC9, 0x44, 0xB5, 0xD9, 0xC9, 0x45, /* 0x80-0x83 */
        0xC9, 0x46, 0xC9, 0x47, 0xC9, 0x48, 0xDD, 0xDB, /* 0x84-0x87 */
        0xDD, 0xDC, 0xDD, 0xDE, 0xC9, 0x49, 0xBD, 0xAF, /* 0x88-0x8B */
        0xDD, 0xE4, 0xC9, 0x4A, 0xDD, 0xE5, 0xC9, 0x4B, /* 0x8C-0x8F */
        0xC9, 0x4C, 0xC9, 0x4D, 0xC9, 0x4E, 0xC9, 0x4F, /* 0x90-0x93 */
        0xC9, 0x50, 0xC9, 0x51, 0xC9, 0x52, 0xDD, 0xF5, /* 0x94-0x97 */
        0xC9, 0x53, 0xC3, 0xC9, 0xC9, 0x54, 0xC9, 0x55, /* 0x98-0x9B */
        0xCB, 0xE2, 0xC9, 0x56, 0xC9, 0x57, 0xC9, 0x58, /* 0x9C-0x9F */
        0xC9, 0x59, 0xDD, 0xF2, 0xC9, 0x5A, 0xC9, 0x5B, /* 0xA0-0xA3 */
        0xC9, 0x5C, 0xC9, 0x5D, 0xC9, 0x5E, 0xC9, 0x5F, /* 0xA4-0xA7 */
        0xC9, 0x60, 0xC9, 0x61, 0xC9, 0x62, 0xC9, 0x63, /* 0xA8-0xAB */
        0xC9, 0x64, 0xC9, 0x65, 0xC9, 0x66, 0xD8, 0xE1, /* 0xAC-0xAF */
        0xC9, 0x67, 0xC9, 0x68, 0xC6, 0xD1, 0xC9, 0x69, /* 0xB0-0xB3 */
        0xDD, 0xF4, 0xC9, 0x6A, 0xC9, 0x6B, 0xC9, 0x6C, /* 0xB4-0xB7 */
        0xD5, 0xF4, 0xDD, 0xF3, 0xDD, 0xF0, 0xC9, 0x6D, /* 0xB8-0xBB */
        0xC9, 0x6E, 0xDD, 0xEC, 0xC9, 0x6F, 0xDD, 0xEF, /* 0xBC-0xBF */
        0xC9, 0x70, 0xDD, 0xE8, 0xC9, 0x71, 0xC9, 0x72, /* 0xC0-0xC3 */
        0xD0, 0xEE, 0xC9, 0x73, 0xC9, 0x74, 0xC9, 0x75, /* 0xC4-0xC7 */
        0xC9, 0x76, 0xC8, 0xD8, 0xDD, 0xEE, 0xC9, 0x77, /* 0xC8-0xCB */
        0xC9, 0x78, 0xDD, 0xE9, 0xC9, 0x79, 0xC9, 0x7A, /* 0xCC-0xCF */
        0xDD, 0xEA, 0xCB, 0xF2, 0xC9, 0x7B, 0xDD, 0xED, /* 0xD0-0xD3 */
        0xC9, 0x7C, 0xC9, 0x7D, 0xB1, 0xCD, 0xC9, 0x7E, /* 0xD4-0xD7 */
        0xC9, 0x80, 0xC9, 0x81, 0xC9, 0x82, 0xC9, 0x83, /* 0xD8-0xDB */
        0xC9, 0x84, 0xC0, 0xB6, 0xC9, 0x85, 0xBC, 0xBB, /* 0xDC-0xDF */
        0xDD, 0xF1, 0xC9, 0x86, 0xC9, 0x87, 0xDD, 0xF7, /* 0xE0-0xE3 */
        0xC9, 0x88, 0xDD, 0xF6, 0xDD, 0xEB, 0xC9, 0x89, /* 0xE4-0xE7 */
        0xC9, 0x8A, 0xC9, 0x8B, 0xC9, 0x8C, 0xC9, 0x8D, /* 0xE8-0xEB */
        0xC5, 0xEE, 0xC9, 0x8E, 0xC9, 0x8F, 0xC9, 0x90, /* 0xEC-0xEF */
        0xDD, 0xFB, 0xC9, 0x91, 0xC9, 0x92, 0xC9, 0x93, /* 0xF0-0xF3 */
        0xC9, 0x94, 0xC9, 0x95, 0xC9, 0x96, 0xC9, 0x97, /* 0xF4-0xF7 */
        0xC9, 0x98, 0xC9, 0x99, 0xC9, 0x9A, 0xC9, 0x9B, /* 0xF8-0xFB */
        0xDE, 0xA4, 0xC9, 0x9C, 0xC9, 0x9D, 0xDE, 0xA3, /* 0xFC-0xFF */
};

static const unsigned char u2c_85[512] = {
        0xC9, 0x9E, 0xC9, 0x9F, 0xC9, 0xA0, 0xCA, 0x40, /* 0x00-0x03 */
        0xCA, 0x41, 0xCA, 0x42, 0xCA, 0x43, 0xCA, 0x44, /* 0x04-0x07 */
        0xCA, 0x45, 0xCA, 0x46, 0xCA, 0x47, 0xCA, 0x48, /* 0x08-0x0B */
        0xDD, 0xF8, 0xCA, 0x49, 0xCA, 0x4A, 0xCA, 0x4B, /* 0x0C-0x0F */
        0xCA, 0x4C, 0xC3, 0xEF, 0xCA, 0x4D, 0xC2, 0xFB, /* 0x10-0x13 */
        0xCA, 0x4E, 0xCA, 0x4F, 0xCA, 0x50, 0xD5, 0xE1, /* 0x14-0x17 */
        0xCA, 0x51, 0xCA, 0x52, 0xCE, 0xB5, 0xCA, 0x53, /* 0x18-0x1B */
        0xCA, 0x54, 0xCA, 0x55, 0xCA, 0x56, 0xDD, 0xFD, /* 0x1C-0x1F */
        0xCA, 0x57, 0xB2, 0xCC, 0xCA, 0x58, 0xCA, 0x59, /* 0x20-0x23 */
        0xCA, 0x5A, 0xCA, 0x5B, 0xCA, 0x5C, 0xCA, 0x5D, /* 0x24-0x27 */
        0xCA, 0x5E, 0xCA, 0x5F, 0xCA, 0x60, 0xC4, 0xE8, /* 0x28-0x2B */
        0xCA, 0xDF, 0xCA, 0x61, 0xCA, 0x62, 0xCA, 0x63, /* 0x2C-0x2F */
        0xCA, 0x64, 0xCA, 0x65, 0xCA, 0x66, 0xCA, 0x67, /* 0x30-0x33 */
        0xCA, 0x68, 0xCA, 0x69, 0xCA, 0x6A, 0xC7, 0xBE, /* 0x34-0x37 */
        0xDD, 0xFA, 0xDD, 0xFC, 0xDD, 0xFE, 0xDE, 0xA2, /* 0x38-0x3B */
        0xB0, 0xAA, 0xB1, 0xCE, 0xCA, 0x6B, 0xCA, 0x6C, /* 0x3C-0x3F */
        0xCA, 0x6D, 0xCA, 0x6E, 0xCA, 0x6F, 0xDE, 0xAC, /* 0x40-0x43 */
        0xCA, 0x70, 0xCA, 0x71, 0xCA, 0x72, 0xCA, 0x73, /* 0x44-0x47 */
        0xDE, 0xA6, 0xBD, 0xB6, 0xC8, 0xEF, 0xCA, 0x74, /* 0x48-0x4B */
        0xCA, 0x75, 0xCA, 0x76, 0xCA, 0x77, 0xCA, 0x78, /* 0x4C-0x4F */
        0xCA, 0x79, 0xCA, 0x7A, 0xCA, 0x7B, 0xCA, 0x7C, /* 0x50-0x53 */
        0xCA, 0x7D, 0xCA, 0x7E, 0xDE, 0xA1, 0xCA, 0x80, /* 0x54-0x57 */
        0xCA, 0x81, 0xDE, 0xA5, 0xCA, 0x82, 0xCA, 0x83, /* 0x58-0x5B */
        0xCA, 0x84, 0xCA, 0x85, 0xDE, 0xA9, 0xCA, 0x86, /* 0x5C-0x5F */
        0xCA, 0x87, 0xCA, 0x88, 0xCA, 0x89, 0xCA, 0x8A, /* 0x60-0x63 */
        0xDE, 0xA8, 0xCA, 0x8B, 0xCA, 0x8C, 0xCA, 0x8D, /* 0x64-0x67 */
        0xDE, 0xA7, 0xCA, 0x8E, 0xCA, 0x8F, 0xCA, 0x90, /* 0x68-0x6B */
        0xCA, 0x91, 0xCA, 0x92, 0xCA, 0x93, 0xCA, 0x94, /* 0x6C-0x6F */
        0xCA, 0x95, 0xCA, 0x96, 0xDE, 0xAD, 0xCA, 0x97, /* 0x70-0x73 */
        0xD4, 0xCC, 0xCA, 0x98, 0xCA, 0x99, 0xCA, 0x9A, /* 0x74-0x77 */
        0xCA, 0x9B, 0xDE, 0xB3, 0xDE, 0xAA, 0xDE, 0xAE, /* 0x78-0x7B */
        0xCA, 0x9C, 0xCA, 0x9D, 0xC0, 0xD9, 0xCA, 0x9E, /* 0x7C-0x7F */
        
        0xCA, 0x9F, 0xCA, 0xA0, 0xCB, 0x40, 0xCB, 0x41, /* 0x80-0x83 */
        0xB1, 0xA1, 0xDE, 0xB6, 0xCB, 0x42, 0xDE, 0xB1, /* 0x84-0x87 */
        0xCB, 0x43, 0xCB, 0x44, 0xCB, 0x45, 0xCB, 0x46, /* 0x88-0x8B */
        0xCB, 0x47, 0xCB, 0x48, 0xCB, 0x49, 0xDE, 0xB2, /* 0x8C-0x8F */
        0xCB, 0x4A, 0xCB, 0x4B, 0xCB, 0x4C, 0xCB, 0x4D, /* 0x90-0x93 */
        0xCB, 0x4E, 0xCB, 0x4F, 0xCB, 0x50, 0xCB, 0x51, /* 0x94-0x97 */
        0xCB, 0x52, 0xCB, 0x53, 0xCB, 0x54, 0xD1, 0xA6, /* 0x98-0x9B */
        0xDE, 0xB5, 0xCB, 0x55, 0xCB, 0x56, 0xCB, 0x57, /* 0x9C-0x9F */
        0xCB, 0x58, 0xCB, 0x59, 0xCB, 0x5A, 0xCB, 0x5B, /* 0xA0-0xA3 */
        0xDE, 0xAF, 0xCB, 0x5C, 0xCB, 0x5D, 0xCB, 0x5E, /* 0xA4-0xA7 */
        0xDE, 0xB0, 0xCB, 0x5F, 0xD0, 0xBD, 0xCB, 0x60, /* 0xA8-0xAB */
        0xCB, 0x61, 0xCB, 0x62, 0xDE, 0xB4, 0xCA, 0xED, /* 0xAC-0xAF */
        0xDE, 0xB9, 0xCB, 0x63, 0xCB, 0x64, 0xCB, 0x65, /* 0xB0-0xB3 */
        0xCB, 0x66, 0xCB, 0x67, 0xCB, 0x68, 0xDE, 0xB8, /* 0xB4-0xB7 */
        0xCB, 0x69, 0xDE, 0xB7, 0xCB, 0x6A, 0xCB, 0x6B, /* 0xB8-0xBB */
        0xCB, 0x6C, 0xCB, 0x6D, 0xCB, 0x6E, 0xCB, 0x6F, /* 0xBC-0xBF */
        0xCB, 0x70, 0xDE, 0xBB, 0xCB, 0x71, 0xCB, 0x72, /* 0xC0-0xC3 */
        0xCB, 0x73, 0xCB, 0x74, 0xCB, 0x75, 0xCB, 0x76, /* 0xC4-0xC7 */
        0xCB, 0x77, 0xBD, 0xE5, 0xCB, 0x78, 0xCB, 0x79, /* 0xC8-0xCB */
        0xCB, 0x7A, 0xCB, 0x7B, 0xCB, 0x7C, 0xB2, 0xD8, /* 0xCC-0xCF */
        0xC3, 0xEA, 0xCB, 0x7D, 0xCB, 0x7E, 0xDE, 0xBA, /* 0xD0-0xD3 */
        0xCB, 0x80, 0xC5, 0xBA, 0xCB, 0x81, 0xCB, 0x82, /* 0xD4-0xD7 */
        0xCB, 0x83, 0xCB, 0x84, 0xCB, 0x85, 0xCB, 0x86, /* 0xD8-0xDB */
        0xDE, 0xBC, 0xCB, 0x87, 0xCB, 0x88, 0xCB, 0x89, /* 0xDC-0xDF */
        0xCB, 0x8A, 0xCB, 0x8B, 0xCB, 0x8C, 0xCB, 0x8D, /* 0xE0-0xE3 */
        0xCC, 0xD9, 0xCB, 0x8E, 0xCB, 0x8F, 0xCB, 0x90, /* 0xE4-0xE7 */
        0xCB, 0x91, 0xB7, 0xAA, 0xCB, 0x92, 0xCB, 0x93, /* 0xE8-0xEB */
        0xCB, 0x94, 0xCB, 0x95, 0xCB, 0x96, 0xCB, 0x97, /* 0xEC-0xEF */
        0xCB, 0x98, 0xCB, 0x99, 0xCB, 0x9A, 0xCB, 0x9B, /* 0xF0-0xF3 */
        0xCB, 0x9C, 0xCB, 0x9D, 0xCB, 0x9E, 0xCB, 0x9F, /* 0xF4-0xF7 */
        0xCB, 0xA0, 0xCC, 0x40, 0xCC, 0x41, 0xD4, 0xE5, /* 0xF8-0xFB */
        0xCC, 0x42, 0xCC, 0x43, 0xCC, 0x44, 0xDE, 0xBD, /* 0xFC-0xFF */
};

static const unsigned char u2c_86[512] = {
        0xCC, 0x45, 0xCC, 0x46, 0xCC, 0x47, 0xCC, 0x48, /* 0x00-0x03 */
        0xCC, 0x49, 0xDE, 0xBF, 0xCC, 0x4A, 0xCC, 0x4B, /* 0x04-0x07 */
        0xCC, 0x4C, 0xCC, 0x4D, 0xCC, 0x4E, 0xCC, 0x4F, /* 0x08-0x0B */
        0xCC, 0x50, 0xCC, 0x51, 0xCC, 0x52, 0xCC, 0x53, /* 0x0C-0x0F */
        0xCC, 0x54, 0xC4, 0xA2, 0xCC, 0x55, 0xCC, 0x56, /* 0x10-0x13 */
        0xCC, 0x57, 0xCC, 0x58, 0xDE, 0xC1, 0xCC, 0x59, /* 0x14-0x17 */
        0xCC, 0x5A, 0xCC, 0x5B, 0xCC, 0x5C, 0xCC, 0x5D, /* 0x18-0x1B */
        0xCC, 0x5E, 0xCC, 0x5F, 0xCC, 0x60, 0xCC, 0x61, /* 0x1C-0x1F */
        0xCC, 0x62, 0xCC, 0x63, 0xCC, 0x64, 0xCC, 0x65, /* 0x20-0x23 */
        0xCC, 0x66, 0xCC, 0x67, 0xCC, 0x68, 0xDE, 0xBE, /* 0x24-0x27 */
        0xCC, 0x69, 0xDE, 0xC0, 0xCC, 0x6A, 0xCC, 0x6B, /* 0x28-0x2B */
        0xCC, 0x6C, 0xCC, 0x6D, 0xCC, 0x6E, 0xCC, 0x6F, /* 0x2C-0x2F */
        0xCC, 0x70, 0xCC, 0x71, 0xCC, 0x72, 0xCC, 0x73, /* 0x30-0x33 */
        0xCC, 0x74, 0xCC, 0x75, 0xCC, 0x76, 0xCC, 0x77, /* 0x34-0x37 */
        0xD5, 0xBA, 0xCC, 0x78, 0xCC, 0x79, 0xCC, 0x7A, /* 0x38-0x3B */
        0xDE, 0xC2, 0xCC, 0x7B, 0xCC, 0x7C, 0xCC, 0x7D, /* 0x3C-0x3F */
        0xCC, 0x7E, 0xCC, 0x80, 0xCC, 0x81, 0xCC, 0x82, /* 0x40-0x43 */
        0xCC, 0x83, 0xCC, 0x84, 0xCC, 0x85, 0xCC, 0x86, /* 0x44-0x47 */
        0xCC, 0x87, 0xCC, 0x88, 0xCC, 0x89, 0xCC, 0x8A, /* 0x48-0x4B */
        0xCC, 0x8B, 0xF2, 0xAE, 0xBB, 0xA2, 0xC2, 0xB2, /* 0x4C-0x4F */
        0xC5, 0xB0, 0xC2, 0xC7, 0xCC, 0x8C, 0xCC, 0x8D, /* 0x50-0x53 */
        0xF2, 0xAF, 0xCC, 0x8E, 0xCC, 0x8F, 0xCC, 0x90, /* 0x54-0x57 */
        0xCC, 0x91, 0xCC, 0x92, 0xD0, 0xE9, 0xCC, 0x93, /* 0x58-0x5B */
        0xCC, 0x94, 0xCC, 0x95, 0xD3, 0xDD, 0xCC, 0x96, /* 0x5C-0x5F */
        0xCC, 0x97, 0xCC, 0x98, 0xEB, 0xBD, 0xCC, 0x99, /* 0x60-0x63 */
        0xCC, 0x9A, 0xCC, 0x9B, 0xCC, 0x9C, 0xCC, 0x9D, /* 0x64-0x67 */
        0xCC, 0x9E, 0xCC, 0x9F, 0xCC, 0xA0, 0xB3, 0xE6, /* 0x68-0x6B */
        0xF2, 0xB0, 0xCD, 0x40, 0xF2, 0xB1, 0xCD, 0x41, /* 0x6C-0x6F */
        0xCD, 0x42, 0xCA, 0xAD, 0xCD, 0x43, 0xCD, 0x44, /* 0x70-0x73 */
        0xCD, 0x45, 0xCD, 0x46, 0xCD, 0x47, 0xCD, 0x48, /* 0x74-0x77 */
        0xCD, 0x49, 0xBA, 0xE7, 0xF2, 0xB3, 0xF2, 0xB5, /* 0x78-0x7B */
        0xF2, 0xB4, 0xCB, 0xE4, 0xCF, 0xBA, 0xF2, 0xB2, /* 0x7C-0x7F */
        
        0xCA, 0xB4, 0xD2, 0xCF, 0xC2, 0xEC, 0xCD, 0x4A, /* 0x80-0x83 */
        0xCD, 0x4B, 0xCD, 0x4C, 0xCD, 0x4D, 0xCD, 0x4E, /* 0x84-0x87 */
        0xCD, 0x4F, 0xCD, 0x50, 0xCE, 0xC3, 0xF2, 0xB8, /* 0x88-0x8B */
        0xB0, 0xF6, 0xF2, 0xB7, 0xCD, 0x51, 0xCD, 0x52, /* 0x8C-0x8F */
        0xCD, 0x53, 0xCD, 0x54, 0xCD, 0x55, 0xF2, 0xBE, /* 0x90-0x93 */
        0xCD, 0x56, 0xB2, 0xCF, 0xCD, 0x57, 0xCD, 0x58, /* 0x94-0x97 */
        0xCD, 0x59, 0xCD, 0x5A, 0xCD, 0x5B, 0xCD, 0x5C, /* 0x98-0x9B */
        0xD1, 0xC1, 0xF2, 0xBA, 0xCD, 0x5D, 0xCD, 0x5E, /* 0x9C-0x9F */
        0xCD, 0x5F, 0xCD, 0x60, 0xCD, 0x61, 0xF2, 0xBC, /* 0xA0-0xA3 */
        0xD4, 0xE9, 0xCD, 0x62, 0xCD, 0x63, 0xF2, 0xBB, /* 0xA4-0xA7 */
        0xF2, 0xB6, 0xF2, 0xBF, 0xF2, 0xBD, 0xCD, 0x64, /* 0xA8-0xAB */
        0xF2, 0xB9, 0xCD, 0x65, 0xCD, 0x66, 0xF2, 0xC7, /* 0xAC-0xAF */
        0xF2, 0xC4, 0xF2, 0xC6, 0xCD, 0x67, 0xCD, 0x68, /* 0xB0-0xB3 */
        0xF2, 0xCA, 0xF2, 0xC2, 0xF2, 0xC0, 0xCD, 0x69, /* 0xB4-0xB7 */
        0xCD, 0x6A, 0xCD, 0x6B, 0xF2, 0xC5, 0xCD, 0x6C, /* 0xB8-0xBB */
        0xCD, 0x6D, 0xCD, 0x6E, 0xCD, 0x6F, 0xCD, 0x70, /* 0xBC-0xBF */
        0xD6, 0xFB, 0xCD, 0x71, 0xCD, 0x72, 0xCD, 0x73, /* 0xC0-0xC3 */
        0xF2, 0xC1, 0xCD, 0x74, 0xC7, 0xF9, 0xC9, 0xDF, /* 0xC4-0xC7 */
        0xCD, 0x75, 0xF2, 0xC8, 0xB9, 0xC6, 0xB5, 0xB0, /* 0xC8-0xCB */
        0xCD, 0x76, 0xCD, 0x77, 0xF2, 0xC3, 0xF2, 0xC9, /* 0xCC-0xCF */
        0xF2, 0xD0, 0xF2, 0xD6, 0xCD, 0x78, 0xCD, 0x79, /* 0xD0-0xD3 */
        0xBB, 0xD7, 0xCD, 0x7A, 0xCD, 0x7B, 0xCD, 0x7C, /* 0xD4-0xD7 */
        0xF2, 0xD5, 0xCD, 0xDC, 0xCD, 0x7D, 0xD6, 0xEB, /* 0xD8-0xDB */
        0xCD, 0x7E, 0xCD, 0x80, 0xF2, 0xD2, 0xF2, 0xD4, /* 0xDC-0xDF */
        0xCD, 0x81, 0xCD, 0x82, 0xCD, 0x83, 0xCD, 0x84, /* 0xE0-0xE3 */
        0xB8, 0xF2, 0xCD, 0x85, 0xCD, 0x86, 0xCD, 0x87, /* 0xE4-0xE7 */
        0xCD, 0x88, 0xF2, 0xCB, 0xCD, 0x89, 0xCD, 0x8A, /* 0xE8-0xEB */
        0xCD, 0x8B, 0xF2, 0xCE, 0xC2, 0xF9, 0xCD, 0x8C, /* 0xEC-0xEF */
        0xD5, 0xDD, 0xF2, 0xCC, 0xF2, 0xCD, 0xF2, 0xCF, /* 0xF0-0xF3 */
        0xF2, 0xD3, 0xCD, 0x8D, 0xCD, 0x8E, 0xCD, 0x8F, /* 0xF4-0xF7 */
        0xF2, 0xD9, 0xD3, 0xBC, 0xCD, 0x90, 0xCD, 0x91, /* 0xF8-0xFB */
        0xCD, 0x92, 0xCD, 0x93, 0xB6, 0xEA, 0xCD, 0x94, /* 0xFC-0xFF */
};

static const unsigned char u2c_87[512] = {
        0xCA, 0xF1, 0xCD, 0x95, 0xB7, 0xE4, 0xF2, 0xD7, /* 0x00-0x03 */
        0xCD, 0x96, 0xCD, 0x97, 0xCD, 0x98, 0xF2, 0xD8, /* 0x04-0x07 */
        0xF2, 0xDA, 0xF2, 0xDD, 0xF2, 0xDB, 0xCD, 0x99, /* 0x08-0x0B */
        0xCD, 0x9A, 0xF2, 0xDC, 0xCD, 0x9B, 0xCD, 0x9C, /* 0x0C-0x0F */
        0xCD, 0x9D, 0xCD, 0x9E, 0xD1, 0xD1, 0xF2, 0xD1, /* 0x10-0x13 */
        0xCD, 0x9F, 0xCD, 0xC9, 0xCD, 0xA0, 0xCE, 0xCF, /* 0x14-0x17 */
        0xD6, 0xA9, 0xCE, 0x40, 0xF2, 0xE3, 0xCE, 0x41, /* 0x18-0x1B */
        0xC3, 0xDB, 0xCE, 0x42, 0xF2, 0xE0, 0xCE, 0x43, /* 0x1C-0x1F */
        0xCE, 0x44, 0xC0, 0xAF, 0xF2, 0xEC, 0xF2, 0xDE, /* 0x20-0x23 */
        0xCE, 0x45, 0xF2, 0xE1, 0xCE, 0x46, 0xCE, 0x47, /* 0x24-0x27 */
        0xCE, 0x48, 0xF2, 0xE8, 0xCE, 0x49, 0xCE, 0x4A, /* 0x28-0x2B */
        0xCE, 0x4B, 0xCE, 0x4C, 0xF2, 0xE2, 0xCE, 0x4D, /* 0x2C-0x2F */
        0xCE, 0x4E, 0xF2, 0xE7, 0xCE, 0x4F, 0xCE, 0x50, /* 0x30-0x33 */
        0xF2, 0xE6, 0xCE, 0x51, 0xCE, 0x52, 0xF2, 0xE9, /* 0x34-0x37 */
        0xCE, 0x53, 0xCE, 0x54, 0xCE, 0x55, 0xF2, 0xDF, /* 0x38-0x3B */
        0xCE, 0x56, 0xCE, 0x57, 0xF2, 0xE4, 0xF2, 0xEA, /* 0x3C-0x3F */
        0xCE, 0x58, 0xCE, 0x59, 0xCE, 0x5A, 0xCE, 0x5B, /* 0x40-0x43 */
        0xCE, 0x5C, 0xCE, 0x5D, 0xCE, 0x5E, 0xD3, 0xAC, /* 0x44-0x47 */
        0xF2, 0xE5, 0xB2, 0xF5, 0xCE, 0x5F, 0xCE, 0x60, /* 0x48-0x4B */
        0xF2, 0xF2, 0xCE, 0x61, 0xD0, 0xAB, 0xCE, 0x62, /* 0x4C-0x4F */
        0xCE, 0x63, 0xCE, 0x64, 0xCE, 0x65, 0xF2, 0xF5, /* 0x50-0x53 */
        0xCE, 0x66, 0xCE, 0x67, 0xCE, 0x68, 0xBB, 0xC8, /* 0x54-0x57 */
        0xCE, 0x69, 0xF2, 0xF9, 0xCE, 0x6A, 0xCE, 0x6B, /* 0x58-0x5B */
        0xCE, 0x6C, 0xCE, 0x6D, 0xCE, 0x6E, 0xCE, 0x6F, /* 0x5C-0x5F */
        0xF2, 0xF0, 0xCE, 0x70, 0xCE, 0x71, 0xF2, 0xF6, /* 0x60-0x63 */
        0xF2, 0xF8, 0xF2, 0xFA, 0xCE, 0x72, 0xCE, 0x73, /* 0x64-0x67 */
        0xCE, 0x74, 0xCE, 0x75, 0xCE, 0x76, 0xCE, 0x77, /* 0x68-0x6B */
        0xCE, 0x78, 0xCE, 0x79, 0xF2, 0xF3, 0xCE, 0x7A, /* 0x6C-0x6F */
        0xF2, 0xF1, 0xCE, 0x7B, 0xCE, 0x7C, 0xCE, 0x7D, /* 0x70-0x73 */
        0xBA, 0xFB, 0xCE, 0x7E, 0xB5, 0xFB, 0xCE, 0x80, /* 0x74-0x77 */
        0xCE, 0x81, 0xCE, 0x82, 0xCE, 0x83, 0xF2, 0xEF, /* 0x78-0x7B */
        0xF2, 0xF7, 0xF2, 0xED, 0xF2, 0xEE, 0xCE, 0x84, /* 0x7C-0x7F */
        
        0xCE, 0x85, 0xCE, 0x86, 0xF2, 0xEB, 0xF3, 0xA6, /* 0x80-0x83 */
        0xCE, 0x87, 0xF3, 0xA3, 0xCE, 0x88, 0xCE, 0x89, /* 0x84-0x87 */
        0xF3, 0xA2, 0xCE, 0x8A, 0xCE, 0x8B, 0xF2, 0xF4, /* 0x88-0x8B */
        0xCE, 0x8C, 0xC8, 0xDA, 0xCE, 0x8D, 0xCE, 0x8E, /* 0x8C-0x8F */
        0xCE, 0x8F, 0xCE, 0x90, 0xCE, 0x91, 0xF2, 0xFB, /* 0x90-0x93 */
        0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xF3, 0xA5, /* 0x94-0x97 */
        0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, /* 0x98-0x9B */
        0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xC3, 0xF8, /* 0x9C-0x9F */
        0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, /* 0xA0-0xA3 */
        0xCE, 0xA0, 0xCF, 0x40, 0xCF, 0x41, 0xCF, 0x42, /* 0xA4-0xA7 */
        0xF2, 0xFD, 0xCF, 0x43, 0xCF, 0x44, 0xF3, 0xA7, /* 0xA8-0xAB */
        0xF3, 0xA9, 0xF3, 0xA4, 0xCF, 0x45, 0xF2, 0xFC, /* 0xAC-0xAF */
        0xCF, 0x46, 0xCF, 0x47, 0xCF, 0x48, 0xF3, 0xAB, /* 0xB0-0xB3 */
        0xCF, 0x49, 0xF3, 0xAA, 0xCF, 0x4A, 0xCF, 0x4B, /* 0xB4-0xB7 */
        0xCF, 0x4C, 0xCF, 0x4D, 0xC2, 0xDD, 0xCF, 0x4E, /* 0xB8-0xBB */
        0xCF, 0x4F, 0xF3, 0xAE, 0xCF, 0x50, 0xCF, 0x51, /* 0xBC-0xBF */
        0xF3, 0xB0, 0xCF, 0x52, 0xCF, 0x53, 0xCF, 0x54, /* 0xC0-0xC3 */
        0xCF, 0x55, 0xCF, 0x56, 0xF3, 0xA1, 0xCF, 0x57, /* 0xC4-0xC7 */
        0xCF, 0x58, 0xCF, 0x59, 0xF3, 0xB1, 0xF3, 0xAC, /* 0xC8-0xCB */
        0xCF, 0x5A, 0xCF, 0x5B, 0xCF, 0x5C, 0xCF, 0x5D, /* 0xCC-0xCF */
        0xCF, 0x5E, 0xF3, 0xAF, 0xF2, 0xFE, 0xF3, 0xAD, /* 0xD0-0xD3 */
        0xCF, 0x5F, 0xCF, 0x60, 0xCF, 0x61, 0xCF, 0x62, /* 0xD4-0xD7 */
        0xCF, 0x63, 0xCF, 0x64, 0xCF, 0x65, 0xF3, 0xB2, /* 0xD8-0xDB */
        0xCF, 0x66, 0xCF, 0x67, 0xCF, 0x68, 0xCF, 0x69, /* 0xDC-0xDF */
        0xF3, 0xB4, 0xCF, 0x6A, 0xCF, 0x6B, 0xCF, 0x6C, /* 0xE0-0xE3 */
        0xCF, 0x6D, 0xF3, 0xA8, 0xCF, 0x6E, 0xCF, 0x6F, /* 0xE4-0xE7 */
        0xCF, 0x70, 0xCF, 0x71, 0xF3, 0xB3, 0xCF, 0x72, /* 0xE8-0xEB */
        0xCF, 0x73, 0xCF, 0x74, 0xF3, 0xB5, 0xCF, 0x75, /* 0xEC-0xEF */
        0xCF, 0x76, 0xCF, 0x77, 0xCF, 0x78, 0xCF, 0x79, /* 0xF0-0xF3 */
        0xCF, 0x7A, 0xCF, 0x7B, 0xCF, 0x7C, 0xCF, 0x7D, /* 0xF4-0xF7 */
        0xCF, 0x7E, 0xD0, 0xB7, 0xCF, 0x80, 0xCF, 0x81, /* 0xF8-0xFB */
        0xCF, 0x82, 0xCF, 0x83, 0xF3, 0xB8, 0xCF, 0x84, /* 0xFC-0xFF */
};

static const unsigned char u2c_88[512] = {
        0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xD9, 0xF9, /* 0x00-0x03 */
        0xCF, 0x88, 0xCF, 0x89, 0xCF, 0x8A, 0xCF, 0x8B, /* 0x04-0x07 */
        0xCF, 0x8C, 0xCF, 0x8D, 0xF3, 0xB9, 0xCF, 0x8E, /* 0x08-0x0B */
        0xCF, 0x8F, 0xCF, 0x90, 0xCF, 0x91, 0xCF, 0x92, /* 0x0C-0x0F */
        0xCF, 0x93, 0xCF, 0x94, 0xCF, 0x95, 0xF3, 0xB7, /* 0x10-0x13 */
        0xCF, 0x96, 0xC8, 0xE4, 0xF3, 0xB6, 0xCF, 0x97, /* 0x14-0x17 */
        0xCF, 0x98, 0xCF, 0x99, 0xCF, 0x9A, 0xF3, 0xBA, /* 0x18-0x1B */
        0xCF, 0x9B, 0xCF, 0x9C, 0xCF, 0x9D, 0xCF, 0x9E, /* 0x1C-0x1F */
        0xCF, 0x9F, 0xF3, 0xBB, 0xB4, 0xC0, 0xCF, 0xA0, /* 0x20-0x23 */
        0xD0, 0x40, 0xD0, 0x41, 0xD0, 0x42, 0xD0, 0x43, /* 0x24-0x27 */
        0xD0, 0x44, 0xD0, 0x45, 0xD0, 0x46, 0xD0, 0x47, /* 0x28-0x2B */
        0xD0, 0x48, 0xD0, 0x49, 0xD0, 0x4A, 0xD0, 0x4B, /* 0x2C-0x2F */
        0xD0, 0x4C, 0xD0, 0x4D, 0xEE, 0xC3, 0xD0, 0x4E, /* 0x30-0x33 */
        0xD0, 0x4F, 0xD0, 0x50, 0xD0, 0x51, 0xD0, 0x52, /* 0x34-0x37 */
        0xD0, 0x53, 0xF3, 0xBC, 0xD0, 0x54, 0xD0, 0x55, /* 0x38-0x3B */
        0xF3, 0xBD, 0xD0, 0x56, 0xD0, 0x57, 0xD0, 0x58, /* 0x3C-0x3F */
        0xD1, 0xAA, 0xD0, 0x59, 0xD0, 0x5A, 0xD0, 0x5B, /* 0x40-0x43 */
        0xF4, 0xAC, 0xD0, 0xC6, 0xD0, 0x5C, 0xD0, 0x5D, /* 0x44-0x47 */
        0xD0, 0x5E, 0xD0, 0x5F, 0xD0, 0x60, 0xD0, 0x61, /* 0x48-0x4B */
        0xD0, 0xD0, 0xD1, 0xDC, 0xD0, 0x62, 0xD0, 0x63, /* 0x4C-0x4F */
        0xD0, 0x64, 0xD0, 0x65, 0xD0, 0x66, 0xD0, 0x67, /* 0x50-0x53 */
        0xCF, 0xCE, 0xD0, 0x68, 0xD0, 0x69, 0xBD, 0xD6, /* 0x54-0x57 */
        0xD0, 0x6A, 0xD1, 0xC3, 0xD0, 0x6B, 0xD0, 0x6C, /* 0x58-0x5B */
        0xD0, 0x6D, 0xD0, 0x6E, 0xD0, 0x6F, 0xD0, 0x70, /* 0x5C-0x5F */
        0xD0, 0x71, 0xBA, 0xE2, 0xE1, 0xE9, 0xD2, 0xC2, /* 0x60-0x63 */
        0xF1, 0xC2, 0xB2, 0xB9, 0xD0, 0x72, 0xD0, 0x73, /* 0x64-0x67 */
        0xB1, 0xED, 0xF1, 0xC3, 0xD0, 0x74, 0xC9, 0xC0, /* 0x68-0x6B */
        0xB3, 0xC4, 0xD0, 0x75, 0xD9, 0xF2, 0xD0, 0x76, /* 0x6C-0x6F */
        0xCB, 0xA5, 0xD0, 0x77, 0xF1, 0xC4, 0xD0, 0x78, /* 0x70-0x73 */
        0xD0, 0x79, 0xD0, 0x7A, 0xD0, 0x7B, 0xD6, 0xD4, /* 0x74-0x77 */
        0xD0, 0x7C, 0xD0, 0x7D, 0xD0, 0x7E, 0xD0, 0x80, /* 0x78-0x7B */
        0xD0, 0x81, 0xF1, 0xC5, 0xF4, 0xC0, 0xF1, 0xC6, /* 0x7C-0x7F */
        
        0xD0, 0x82, 0xD4, 0xAC, 0xF1, 0xC7, 0xD0, 0x83, /* 0x80-0x83 */
        0xB0, 0xC0, 0xF4, 0xC1, 0xD0, 0x84, 0xD0, 0x85, /* 0x84-0x87 */
        0xF4, 0xC2, 0xD0, 0x86, 0xD0, 0x87, 0xB4, 0xFC, /* 0x88-0x8B */
        0xD0, 0x88, 0xC5, 0xDB, 0xD0, 0x89, 0xD0, 0x8A, /* 0x8C-0x8F */
        0xD0, 0x8B, 0xD0, 0x8C, 0xCC, 0xBB, 0xD0, 0x8D, /* 0x90-0x93 */
        0xD0, 0x8E, 0xD0, 0x8F, 0xD0, 0xE4, 0xD0, 0x90, /* 0x94-0x97 */
        0xD0, 0x91, 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, /* 0x98-0x9B */
        0xCD, 0xE0, 0xD0, 0x95, 0xD0, 0x96, 0xD0, 0x97, /* 0x9C-0x9F */
        0xD0, 0x98, 0xD0, 0x99, 0xF1, 0xC8, 0xD0, 0x9A, /* 0xA0-0xA3 */
        0xD9, 0xF3, 0xD0, 0x9B, 0xD0, 0x9C, 0xD0, 0x9D, /* 0xA4-0xA7 */
        0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, 0xB1, 0xBB, /* 0xA8-0xAB */
        0xD1, 0x40, 0xCF, 0xAE, 0xD1, 0x41, 0xD1, 0x42, /* 0xAC-0xAF */
        0xD1, 0x43, 0xB8, 0xA4, 0xD1, 0x44, 0xD1, 0x45, /* 0xB0-0xB3 */
        0xD1, 0x46, 0xD1, 0x47, 0xD1, 0x48, 0xF1, 0xCA, /* 0xB4-0xB7 */
        0xD1, 0x49, 0xD1, 0x4A, 0xD1, 0x4B, 0xD1, 0x4C, /* 0xB8-0xBB */
        0xF1, 0xCB, 0xD1, 0x4D, 0xD1, 0x4E, 0xD1, 0x4F, /* 0xBC-0xBF */
        0xD1, 0x50, 0xB2, 0xC3, 0xC1, 0xD1, 0xD1, 0x51, /* 0xC0-0xC3 */
        0xD1, 0x52, 0xD7, 0xB0, 0xF1, 0xC9, 0xD1, 0x53, /* 0xC4-0xC7 */
        0xD1, 0x54, 0xF1, 0xCC, 0xD1, 0x55, 0xD1, 0x56, /* 0xC8-0xCB */
        0xD1, 0x57, 0xD1, 0x58, 0xF1, 0xCE, 0xD1, 0x59, /* 0xCC-0xCF */
        0xD1, 0x5A, 0xD1, 0x5B, 0xD9, 0xF6, 0xD1, 0x5C, /* 0xD0-0xD3 */
        0xD2, 0xE1, 0xD4, 0xA3, 0xD1, 0x5D, 0xD1, 0x5E, /* 0xD4-0xD7 */
        0xF4, 0xC3, 0xC8, 0xB9, 0xD1, 0x5F, 0xD1, 0x60, /* 0xD8-0xDB */
        0xD1, 0x61, 0xD1, 0x62, 0xD1, 0x63, 0xF4, 0xC4, /* 0xDC-0xDF */
        0xD1, 0x64, 0xD1, 0x65, 0xF1, 0xCD, 0xF1, 0xCF, /* 0xE0-0xE3 */
        0xBF, 0xE3, 0xF1, 0xD0, 0xD1, 0x66, 0xD1, 0x67, /* 0xE4-0xE7 */
        0xF1, 0xD4, 0xD1, 0x68, 0xD1, 0x69, 0xD1, 0x6A, /* 0xE8-0xEB */
        0xD1, 0x6B, 0xD1, 0x6C, 0xD1, 0x6D, 0xD1, 0x6E, /* 0xEC-0xEF */
        0xF1, 0xD6, 0xF1, 0xD1, 0xD1, 0x6F, 0xC9, 0xD1, /* 0xF0-0xF3 */
        0xC5, 0xE1, 0xD1, 0x70, 0xD1, 0x71, 0xD1, 0x72, /* 0xF4-0xF7 */
        0xC2, 0xE3, 0xB9, 0xFC, 0xD1, 0x73, 0xD1, 0x74, /* 0xF8-0xFB */
        0xF1, 0xD3, 0xD1, 0x75, 0xF1, 0xD5, 0xD1, 0x76, /* 0xFC-0xFF */
};

static const unsigned char u2c_89[512] = {
        0xD1, 0x77, 0xD1, 0x78, 0xB9, 0xD3, 0xD1, 0x79, /* 0x00-0x03 */
        0xD1, 0x7A, 0xD1, 0x7B, 0xD1, 0x7C, 0xD1, 0x7D, /* 0x04-0x07 */
        0xD1, 0x7E, 0xD1, 0x80, 0xF1, 0xDB, 0xD1, 0x81, /* 0x08-0x0B */
        0xD1, 0x82, 0xD1, 0x83, 0xD1, 0x84, 0xD1, 0x85, /* 0x0C-0x0F */
        0xBA, 0xD6, 0xD1, 0x86, 0xB0, 0xFD, 0xF1, 0xD9, /* 0x10-0x13 */
        0xD1, 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, /* 0x14-0x17 */
        0xD1, 0x8B, 0xF1, 0xD8, 0xF1, 0xD2, 0xF1, 0xDA, /* 0x18-0x1B */
        0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1, 0x8F, /* 0x1C-0x1F */
        0xD1, 0x90, 0xF1, 0xD7, 0xD1, 0x91, 0xD1, 0x92, /* 0x20-0x23 */
        0xD1, 0x93, 0xC8, 0xEC, 0xD1, 0x94, 0xD1, 0x95, /* 0x24-0x27 */
        0xD1, 0x96, 0xD1, 0x97, 0xCD, 0xCA, 0xF1, 0xDD, /* 0x28-0x2B */
        0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1, 0x9B, /* 0x2C-0x2F */
        0xE5, 0xBD, 0xD1, 0x9C, 0xD1, 0x9D, 0xD1, 0x9E, /* 0x30-0x33 */
        0xF1, 0xDC, 0xD1, 0x9F, 0xF1, 0xDE, 0xD1, 0xA0, /* 0x34-0x37 */
        0xD2, 0x40, 0xD2, 0x41, 0xD2, 0x42, 0xD2, 0x43, /* 0x38-0x3B */
        0xD2, 0x44, 0xD2, 0x45, 0xD2, 0x46, 0xD2, 0x47, /* 0x3C-0x3F */
        0xD2, 0x48, 0xF1, 0xDF, 0xD2, 0x49, 0xD2, 0x4A, /* 0x40-0x43 */
        0xCF, 0xE5, 0xD2, 0x4B, 0xD2, 0x4C, 0xD2, 0x4D, /* 0x44-0x47 */
        0xD2, 0x4E, 0xD2, 0x4F, 0xD2, 0x50, 0xD2, 0x51, /* 0x48-0x4B */
        0xD2, 0x52, 0xD2, 0x53, 0xD2, 0x54, 0xD2, 0x55, /* 0x4C-0x4F */
        0xD2, 0x56, 0xD2, 0x57, 0xD2, 0x58, 0xD2, 0x59, /* 0x50-0x53 */
        0xD2, 0x5A, 0xD2, 0x5B, 0xD2, 0x5C, 0xD2, 0x5D, /* 0x54-0x57 */
        0xD2, 0x5E, 0xD2, 0x5F, 0xD2, 0x60, 0xD2, 0x61, /* 0x58-0x5B */
        0xD2, 0x62, 0xD2, 0x63, 0xF4, 0xC5, 0xBD, 0xF3, /* 0x5C-0x5F */
        0xD2, 0x64, 0xD2, 0x65, 0xD2, 0x66, 0xD2, 0x67, /* 0x60-0x63 */
        0xD2, 0x68, 0xD2, 0x69, 0xF1, 0xE0, 0xD2, 0x6A, /* 0x64-0x67 */
        0xD2, 0x6B, 0xD2, 0x6C, 0xD2, 0x6D, 0xD2, 0x6E, /* 0x68-0x6B */
        0xD2, 0x6F, 0xD2, 0x70, 0xD2, 0x71, 0xD2, 0x72, /* 0x6C-0x6F */
        0xD2, 0x73, 0xD2, 0x74, 0xD2, 0x75, 0xD2, 0x76, /* 0x70-0x73 */
        0xD2, 0x77, 0xD2, 0x78, 0xD2, 0x79, 0xD2, 0x7A, /* 0x74-0x77 */
        0xD2, 0x7B, 0xD2, 0x7C, 0xD2, 0x7D, 0xF1, 0xE1, /* 0x78-0x7B */
        0xD2, 0x7E, 0xD2, 0x80, 0xD2, 0x81, 0xCE, 0xF7, /* 0x7C-0x7F */
        
        0xD2, 0x82, 0xD2, 0xAA, 0xD2, 0x83, 0xF1, 0xFB, /* 0x80-0x83 */
        0xD2, 0x84, 0xD2, 0x85, 0xB8, 0xB2, 0xD2, 0x86, /* 0x84-0x87 */
        0xD2, 0x87, 0xD2, 0x88, 0xD2, 0x89, 0xD2, 0x8A, /* 0x88-0x8B */
        0xD2, 0x8B, 0xD2, 0x8C, 0xD2, 0x8D, 0xD2, 0x8E, /* 0x8C-0x8F */
        0xD2, 0x8F, 0xD2, 0x90, 0xD2, 0x91, 0xD2, 0x92, /* 0x90-0x93 */
        0xD2, 0x93, 0xD2, 0x94, 0xD2, 0x95, 0xD2, 0x96, /* 0x94-0x97 */
        0xD2, 0x97, 0xD2, 0x98, 0xD2, 0x99, 0xD2, 0x9A, /* 0x98-0x9B */
        0xD2, 0x9B, 0xD2, 0x9C, 0xD2, 0x9D, 0xD2, 0x9E, /* 0x9C-0x9F */
        0xD2, 0x9F, 0xD2, 0xA0, 0xD3, 0x40, 0xD3, 0x41, /* 0xA0-0xA3 */
        0xD3, 0x42, 0xD3, 0x43, 0xD3, 0x44, 0xD3, 0x45, /* 0xA4-0xA7 */
        0xD3, 0x46, 0xD3, 0x47, 0xD3, 0x48, 0xD3, 0x49, /* 0xA8-0xAB */
        0xD3, 0x4A, 0xD3, 0x4B, 0xD3, 0x4C, 0xD3, 0x4D, /* 0xAC-0xAF */
        0xD3, 0x4E, 0xD3, 0x4F, 0xD3, 0x50, 0xD3, 0x51, /* 0xB0-0xB3 */
        0xD3, 0x52, 0xD3, 0x53, 0xD3, 0x54, 0xD3, 0x55, /* 0xB4-0xB7 */
        0xD3, 0x56, 0xD3, 0x57, 0xD3, 0x58, 0xD3, 0x59, /* 0xB8-0xBB */
        0xD3, 0x5A, 0xD3, 0x5B, 0xD3, 0x5C, 0xD3, 0x5D, /* 0xBC-0xBF */
        0xD3, 0x5E, 0xBC, 0xFB, 0xB9, 0xDB, 0xD3, 0x5F, /* 0xC0-0xC3 */
        0xB9, 0xE6, 0xC3, 0xD9, 0xCA, 0xD3, 0xEA, 0xE8, /* 0xC4-0xC7 */
        0xC0, 0xC0, 0xBE, 0xF5, 0xEA, 0xE9, 0xEA, 0xEA, /* 0xC8-0xCB */
        0xEA, 0xEB, 0xD3, 0x60, 0xEA, 0xEC, 0xEA, 0xED, /* 0xCC-0xCF */
        0xEA, 0xEE, 0xEA, 0xEF, 0xBD, 0xC7, 0xD3, 0x61, /* 0xD0-0xD3 */
        0xD3, 0x62, 0xD3, 0x63, 0xF5, 0xFB, 0xD3, 0x64, /* 0xD4-0xD7 */
        0xD3, 0x65, 0xD3, 0x66, 0xF5, 0xFD, 0xD3, 0x67, /* 0xD8-0xDB */
        0xF5, 0xFE, 0xD3, 0x68, 0xF5, 0xFC, 0xD3, 0x69, /* 0xDC-0xDF */
        0xD3, 0x6A, 0xD3, 0x6B, 0xD3, 0x6C, 0xBD, 0xE2, /* 0xE0-0xE3 */
        0xD3, 0x6D, 0xF6, 0xA1, 0xB4, 0xA5, 0xD3, 0x6E, /* 0xE4-0xE7 */
        0xD3, 0x6F, 0xD3, 0x70, 0xD3, 0x71, 0xF6, 0xA2, /* 0xE8-0xEB */
        0xD3, 0x72, 0xD3, 0x73, 0xD3, 0x74, 0xF6, 0xA3, /* 0xEC-0xEF */
        0xD3, 0x75, 0xD3, 0x76, 0xD3, 0x77, 0xEC, 0xB2, /* 0xF0-0xF3 */
        0xD3, 0x78, 0xD3, 0x79, 0xD3, 0x7A, 0xD3, 0x7B, /* 0xF4-0xF7 */
        0xD3, 0x7C, 0xD3, 0x7D, 0xD3, 0x7E, 0xD3, 0x80, /* 0xF8-0xFB */
        0xD3, 0x81, 0xD3, 0x82, 0xD3, 0x83, 0xD3, 0x84, /* 0xFC-0xFF */
};

static const unsigned char u2c_8A[512] = {
        0xD1, 0xD4, 0xD3, 0x85, 0xD3, 0x86, 0xD3, 0x87, /* 0x00-0x03 */
        0xD3, 0x88, 0xD3, 0x89, 0xD3, 0x8A, 0xD9, 0xEA, /* 0x04-0x07 */
        0xD3, 0x8B, 0xD3, 0x8C, 0xD3, 0x8D, 0xD3, 0x8E, /* 0x08-0x0B */
        0xD3, 0x8F, 0xD3, 0x90, 0xD3, 0x91, 0xD3, 0x92, /* 0x0C-0x0F */
        0xD3, 0x93, 0xD3, 0x94, 0xD3, 0x95, 0xD3, 0x96, /* 0x10-0x13 */
        0xD3, 0x97, 0xD3, 0x98, 0xD3, 0x99, 0xD3, 0x9A, /* 0x14-0x17 */
        0xD3, 0x9B, 0xD3, 0x9C, 0xD3, 0x9D, 0xD3, 0x9E, /* 0x18-0x1B */
        0xD3, 0x9F, 0xD3, 0xA0, 0xD4, 0x40, 0xD4, 0x41, /* 0x1C-0x1F */
        0xD4, 0x42, 0xD4, 0x43, 0xD4, 0x44, 0xD4, 0x45, /* 0x20-0x23 */
        0xD4, 0x46, 0xD4, 0x47, 0xD4, 0x48, 0xD4, 0x49, /* 0x24-0x27 */
        0xD4, 0x4A, 0xD4, 0x4B, 0xD4, 0x4C, 0xD4, 0x4D, /* 0x28-0x2B */
        0xD4, 0x4E, 0xD4, 0x4F, 0xD4, 0x50, 0xD4, 0x51, /* 0x2C-0x2F */
        0xD4, 0x52, 0xD4, 0x53, 0xD4, 0x54, 0xD4, 0x55, /* 0x30-0x33 */
        0xD4, 0x56, 0xD4, 0x57, 0xD4, 0x58, 0xD4, 0x59, /* 0x34-0x37 */
        0xD4, 0x5A, 0xD4, 0x5B, 0xD4, 0x5C, 0xD4, 0x5D, /* 0x38-0x3B */
        0xD4, 0x5E, 0xD4, 0x5F, 0xF6, 0xA4, 0xD4, 0x60, /* 0x3C-0x3F */
        0xD4, 0x61, 0xD4, 0x62, 0xD4, 0x63, 0xD4, 0x64, /* 0x40-0x43 */
        0xD4, 0x65, 0xD4, 0x66, 0xD4, 0x67, 0xD4, 0x68, /* 0x44-0x47 */
        0xEE, 0xBA, 0xD4, 0x69, 0xD4, 0x6A, 0xD4, 0x6B, /* 0x48-0x4B */
        0xD4, 0x6C, 0xD4, 0x6D, 0xD4, 0x6E, 0xD4, 0x6F, /* 0x4C-0x4F */
        0xD4, 0x70, 0xD4, 0x71, 0xD4, 0x72, 0xD4, 0x73, /* 0x50-0x53 */
        0xD4, 0x74, 0xD4, 0x75, 0xD4, 0x76, 0xD4, 0x77, /* 0x54-0x57 */
        0xD4, 0x78, 0xD4, 0x79, 0xD4, 0x7A, 0xD4, 0x7B, /* 0x58-0x5B */
        0xD4, 0x7C, 0xD4, 0x7D, 0xD4, 0x7E, 0xD4, 0x80, /* 0x5C-0x5F */
        0xD4, 0x81, 0xD4, 0x82, 0xD4, 0x83, 0xD4, 0x84, /* 0x60-0x63 */
        0xD4, 0x85, 0xD4, 0x86, 0xD4, 0x87, 0xD4, 0x88, /* 0x64-0x67 */
        0xD4, 0x89, 0xD4, 0x8A, 0xD4, 0x8B, 0xD4, 0x8C, /* 0x68-0x6B */
        0xD4, 0x8D, 0xD4, 0x8E, 0xD4, 0x8F, 0xD4, 0x90, /* 0x6C-0x6F */
        0xD4, 0x91, 0xD4, 0x92, 0xD4, 0x93, 0xD4, 0x94, /* 0x70-0x73 */
        0xD4, 0x95, 0xD4, 0x96, 0xD4, 0x97, 0xD4, 0x98, /* 0x74-0x77 */
        0xD4, 0x99, 0xD5, 0xB2, 0xD4, 0x9A, 0xD4, 0x9B, /* 0x78-0x7B */
        0xD4, 0x9C, 0xD4, 0x9D, 0xD4, 0x9E, 0xD4, 0x9F, /* 0x7C-0x7F */
        
        0xD4, 0xA0, 0xD5, 0x40, 0xD5, 0x41, 0xD5, 0x42, /* 0x80-0x83 */
        0xD5, 0x43, 0xD5, 0x44, 0xD5, 0x45, 0xD5, 0x46, /* 0x84-0x87 */
        0xD5, 0x47, 0xD3, 0xFE, 0xCC, 0xDC, 0xD5, 0x48, /* 0x88-0x8B */
        0xD5, 0x49, 0xD5, 0x4A, 0xD5, 0x4B, 0xD5, 0x4C, /* 0x8C-0x8F */
        0xD5, 0x4D, 0xD5, 0x4E, 0xD5, 0x4F, 0xCA, 0xC4, /* 0x90-0x93 */
        0xD5, 0x50, 0xD5, 0x51, 0xD5, 0x52, 0xD5, 0x53, /* 0x94-0x97 */
        0xD5, 0x54, 0xD5, 0x55, 0xD5, 0x56, 0xD5, 0x57, /* 0x98-0x9B */
        0xD5, 0x58, 0xD5, 0x59, 0xD5, 0x5A, 0xD5, 0x5B, /* 0x9C-0x9F */
        0xD5, 0x5C, 0xD5, 0x5D, 0xD5, 0x5E, 0xD5, 0x5F, /* 0xA0-0xA3 */
        0xD5, 0x60, 0xD5, 0x61, 0xD5, 0x62, 0xD5, 0x63, /* 0xA4-0xA7 */
        0xD5, 0x64, 0xD5, 0x65, 0xD5, 0x66, 0xD5, 0x67, /* 0xA8-0xAB */
        0xD5, 0x68, 0xD5, 0x69, 0xD5, 0x6A, 0xD5, 0x6B, /* 0xAC-0xAF */
        0xD5, 0x6C, 0xD5, 0x6D, 0xD5, 0x6E, 0xD5, 0x6F, /* 0xB0-0xB3 */
        0xD5, 0x70, 0xD5, 0x71, 0xD5, 0x72, 0xD5, 0x73, /* 0xB4-0xB7 */
        0xD5, 0x74, 0xD5, 0x75, 0xD5, 0x76, 0xD5, 0x77, /* 0xB8-0xBB */
        0xD5, 0x78, 0xD5, 0x79, 0xD5, 0x7A, 0xD5, 0x7B, /* 0xBC-0xBF */
        0xD5, 0x7C, 0xD5, 0x7D, 0xD5, 0x7E, 0xD5, 0x80, /* 0xC0-0xC3 */
        0xD5, 0x81, 0xD5, 0x82, 0xD5, 0x83, 0xD5, 0x84, /* 0xC4-0xC7 */
        0xD5, 0x85, 0xD5, 0x86, 0xD5, 0x87, 0xD5, 0x88, /* 0xC8-0xCB */
        0xD5, 0x89, 0xD5, 0x8A, 0xD5, 0x8B, 0xD5, 0x8C, /* 0xCC-0xCF */
        0xD5, 0x8D, 0xD5, 0x8E, 0xD5, 0x8F, 0xD5, 0x90, /* 0xD0-0xD3 */
        0xD5, 0x91, 0xD5, 0x92, 0xD5, 0x93, 0xD5, 0x94, /* 0xD4-0xD7 */
        0xD5, 0x95, 0xD5, 0x96, 0xD5, 0x97, 0xD5, 0x98, /* 0xD8-0xDB */
        0xD5, 0x99, 0xD5, 0x9A, 0xD5, 0x9B, 0xD5, 0x9C, /* 0xDC-0xDF */
        0xD5, 0x9D, 0xD5, 0x9E, 0xD5, 0x9F, 0xD5, 0xA0, /* 0xE0-0xE3 */
        0xD6, 0x40, 0xD6, 0x41, 0xD6, 0x42, 0xD6, 0x43, /* 0xE4-0xE7 */
        0xD6, 0x44, 0xD6, 0x45, 0xD6, 0x46, 0xD6, 0x47, /* 0xE8-0xEB */
        0xD6, 0x48, 0xD6, 0x49, 0xD6, 0x4A, 0xD6, 0x4B, /* 0xEC-0xEF */
        0xD6, 0x4C, 0xD6, 0x4D, 0xD6, 0x4E, 0xD6, 0x4F, /* 0xF0-0xF3 */
        0xD6, 0x50, 0xD6, 0x51, 0xD6, 0x52, 0xD6, 0x53, /* 0xF4-0xF7 */
        0xD6, 0x54, 0xD6, 0x55, 0xD6, 0x56, 0xD6, 0x57, /* 0xF8-0xFB */
        0xD6, 0x58, 0xD6, 0x59, 0xD6, 0x5A, 0xD6, 0x5B, /* 0xFC-0xFF */
};

static const unsigned char u2c_8B[512] = {
        0xD6, 0x5C, 0xD6, 0x5D, 0xD6, 0x5E, 0xD6, 0x5F, /* 0x00-0x03 */
        0xD6, 0x60, 0xD6, 0x61, 0xD6, 0x62, 0xE5, 0xC0, /* 0x04-0x07 */
        0xD6, 0x63, 0xD6, 0x64, 0xD6, 0x65, 0xD6, 0x66, /* 0x08-0x0B */
        0xD6, 0x67, 0xD6, 0x68, 0xD6, 0x69, 0xD6, 0x6A, /* 0x0C-0x0F */
        0xD6, 0x6B, 0xD6, 0x6C, 0xD6, 0x6D, 0xD6, 0x6E, /* 0x10-0x13 */
        0xD6, 0x6F, 0xD6, 0x70, 0xD6, 0x71, 0xD6, 0x72, /* 0x14-0x17 */
        0xD6, 0x73, 0xD6, 0x74, 0xD6, 0x75, 0xD6, 0x76, /* 0x18-0x1B */
        0xD6, 0x77, 0xD6, 0x78, 0xD6, 0x79, 0xD6, 0x7A, /* 0x1C-0x1F */
        0xD6, 0x7B, 0xD6, 0x7C, 0xD6, 0x7D, 0xD6, 0x7E, /* 0x20-0x23 */
        0xD6, 0x80, 0xD6, 0x81, 0xF6, 0xA5, 0xD6, 0x82, /* 0x24-0x27 */
        0xD6, 0x83, 0xD6, 0x84, 0xD6, 0x85, 0xD6, 0x86, /* 0x28-0x2B */
        0xD6, 0x87, 0xD6, 0x88, 0xD6, 0x89, 0xD6, 0x8A, /* 0x2C-0x2F */
        0xD6, 0x8B, 0xD6, 0x8C, 0xD6, 0x8D, 0xD6, 0x8E, /* 0x30-0x33 */
        0xD6, 0x8F, 0xD6, 0x90, 0xD6, 0x91, 0xD6, 0x92, /* 0x34-0x37 */
        0xD6, 0x93, 0xD6, 0x94, 0xD6, 0x95, 0xD6, 0x96, /* 0x38-0x3B */
        0xD6, 0x97, 0xD6, 0x98, 0xD6, 0x99, 0xD6, 0x9A, /* 0x3C-0x3F */
        0xD6, 0x9B, 0xD6, 0x9C, 0xD6, 0x9D, 0xD6, 0x9E, /* 0x40-0x43 */
        0xD6, 0x9F, 0xD6, 0xA0, 0xD7, 0x40, 0xD7, 0x41, /* 0x44-0x47 */
        0xD7, 0x42, 0xD7, 0x43, 0xD7, 0x44, 0xD7, 0x45, /* 0x48-0x4B */
        0xD7, 0x46, 0xD7, 0x47, 0xD7, 0x48, 0xD7, 0x49, /* 0x4C-0x4F */
        0xD7, 0x4A, 0xD7, 0x4B, 0xD7, 0x4C, 0xD7, 0x4D, /* 0x50-0x53 */
        0xD7, 0x4E, 0xD7, 0x4F, 0xD7, 0x50, 0xD7, 0x51, /* 0x54-0x57 */
        0xD7, 0x52, 0xD7, 0x53, 0xD7, 0x54, 0xD7, 0x55, /* 0x58-0x5B */
        0xD7, 0x56, 0xD7, 0x57, 0xD7, 0x58, 0xD7, 0x59, /* 0x5C-0x5F */
        0xD7, 0x5A, 0xD7, 0x5B, 0xD7, 0x5C, 0xD7, 0x5D, /* 0x60-0x63 */
        0xD7, 0x5E, 0xD7, 0x5F, 0xBE, 0xAF, 0xD7, 0x60, /* 0x64-0x67 */
        0xD7, 0x61, 0xD7, 0x62, 0xD7, 0x63, 0xD7, 0x64, /* 0x68-0x6B */
        0xC6, 0xA9, 0xD7, 0x65, 0xD7, 0x66, 0xD7, 0x67, /* 0x6C-0x6F */
        0xD7, 0x68, 0xD7, 0x69, 0xD7, 0x6A, 0xD7, 0x6B, /* 0x70-0x73 */
        0xD7, 0x6C, 0xD7, 0x6D, 0xD7, 0x6E, 0xD7, 0x6F, /* 0x74-0x77 */
        0xD7, 0x70, 0xD7, 0x71, 0xD7, 0x72, 0xD7, 0x73, /* 0x78-0x7B */
        0xD7, 0x74, 0xD7, 0x75, 0xD7, 0x76, 0xD7, 0x77, /* 0x7C-0x7F */
        
        0xD7, 0x78, 0xD7, 0x79, 0xD7, 0x7A, 0xD7, 0x7B, /* 0x80-0x83 */
        0xD7, 0x7C, 0xD7, 0x7D, 0xD7, 0x7E, 0xD7, 0x80, /* 0x84-0x87 */
        0xD7, 0x81, 0xD7, 0x82, 0xD7, 0x83, 0xD7, 0x84, /* 0x88-0x8B */
        0xD7, 0x85, 0xD7, 0x86, 0xD7, 0x87, 0xD7, 0x88, /* 0x8C-0x8F */
        0xD7, 0x89, 0xD7, 0x8A, 0xD7, 0x8B, 0xD7, 0x8C, /* 0x90-0x93 */
        0xD7, 0x8D, 0xD7, 0x8E, 0xD7, 0x8F, 0xD7, 0x90, /* 0x94-0x97 */
        0xD7, 0x91, 0xD7, 0x92, 0xD7, 0x93, 0xD7, 0x94, /* 0x98-0x9B */
        0xD7, 0x95, 0xD7, 0x96, 0xD7, 0x97, 0xD7, 0x98, /* 0x9C-0x9F */
        0xDA, 0xA5, 0xBC, 0xC6, 0xB6, 0xA9, 0xB8, 0xBC, /* 0xA0-0xA3 */
        0xC8, 0xCF, 0xBC, 0xA5, 0xDA, 0xA6, 0xDA, 0xA7, /* 0xA4-0xA7 */
        0xCC, 0xD6, 0xC8, 0xC3, 0xDA, 0xA8, 0xC6, 0xFD, /* 0xA8-0xAB */
        0xD7, 0x99, 0xD1, 0xB5, 0xD2, 0xE9, 0xD1, 0xB6, /* 0xAC-0xAF */
        0xBC, 0xC7, 0xD7, 0x9A, 0xBD, 0xB2, 0xBB, 0xE4, /* 0xB0-0xB3 */
        0xDA, 0xA9, 0xDA, 0xAA, 0xD1, 0xC8, 0xDA, 0xAB, /* 0xB4-0xB7 */
        0xD0, 0xED, 0xB6, 0xEF, 0xC2, 0xDB, 0xD7, 0x9B, /* 0xB8-0xBB */
        0xCB, 0xCF, 0xB7, 0xED, 0xC9, 0xE8, 0xB7, 0xC3, /* 0xBC-0xBF */
        0xBE, 0xF7, 0xD6, 0xA4, 0xDA, 0xAC, 0xDA, 0xAD, /* 0xC0-0xC3 */
        0xC6, 0xC0, 0xD7, 0xE7, 0xCA, 0xB6, 0xD7, 0x9C, /* 0xC4-0xC7 */
        0xD5, 0xA9, 0xCB, 0xDF, 0xD5, 0xEF, 0xDA, 0xAE, /* 0xC8-0xCB */
        0xD6, 0xDF, 0xB4, 0xCA, 0xDA, 0xB0, 0xDA, 0xAF, /* 0xCC-0xCF */
        0xD7, 0x9D, 0xD2, 0xEB, 0xDA, 0xB1, 0xDA, 0xB2, /* 0xD0-0xD3 */
        0xDA, 0xB3, 0xCA, 0xD4, 0xDA, 0xB4, 0xCA, 0xAB, /* 0xD4-0xD7 */
        0xDA, 0xB5, 0xDA, 0xB6, 0xB3, 0xCF, 0xD6, 0xEF, /* 0xD8-0xDB */
        0xDA, 0xB7, 0xBB, 0xB0, 0xB5, 0xAE, 0xDA, 0xB8, /* 0xDC-0xDF */
        0xDA, 0xB9, 0xB9, 0xEE, 0xD1, 0xAF, 0xD2, 0xE8, /* 0xE0-0xE3 */
        0xDA, 0xBA, 0xB8, 0xC3, 0xCF, 0xEA, 0xB2, 0xEF, /* 0xE4-0xE7 */
        0xDA, 0xBB, 0xDA, 0xBC, 0xD7, 0x9E, 0xBD, 0xEB, /* 0xE8-0xEB */
        0xCE, 0xDC, 0xD3, 0xEF, 0xDA, 0xBD, 0xCE, 0xF3, /* 0xEC-0xEF */
        0xDA, 0xBE, 0xD3, 0xD5, 0xBB, 0xE5, 0xDA, 0xBF, /* 0xF0-0xF3 */
        0xCB, 0xB5, 0xCB, 0xD0, 0xDA, 0xC0, 0xC7, 0xEB, /* 0xF4-0xF7 */
        0xD6, 0xEE, 0xDA, 0xC1, 0xC5, 0xB5, 0xB6, 0xC1, /* 0xF8-0xFB */
        0xDA, 0xC2, 0xB7, 0xCC, 0xBF, 0xCE, 0xDA, 0xC3, /* 0xFC-0xFF */
};

static const unsigned char u2c_8C[512] = {
        0xDA, 0xC4, 0xCB, 0xAD, 0xDA, 0xC5, 0xB5, 0xF7, /* 0x00-0x03 */
        0xDA, 0xC6, 0xC1, 0xC2, 0xD7, 0xBB, 0xDA, 0xC7, /* 0x04-0x07 */
        0xCC, 0xB8, 0xD7, 0x9F, 0xD2, 0xEA, 0xC4, 0xB1, /* 0x08-0x0B */
        0xDA, 0xC8, 0xB5, 0xFD, 0xBB, 0xD1, 0xDA, 0xC9, /* 0x0C-0x0F */
        0xD0, 0xB3, 0xDA, 0xCA, 0xDA, 0xCB, 0xCE, 0xBD, /* 0x10-0x13 */
        0xDA, 0xCC, 0xDA, 0xCD, 0xDA, 0xCE, 0xB2, 0xF7, /* 0x14-0x17 */
        0xDA, 0xD1, 0xDA, 0xCF, 0xD1, 0xE8, 0xDA, 0xD0, /* 0x18-0x1B */
        0xC3, 0xD5, 0xDA, 0xD2, 0xD7, 0xA0, 0xDA, 0xD3, /* 0x1C-0x1F */
        0xDA, 0xD4, 0xDA, 0xD5, 0xD0, 0xBB, 0xD2, 0xA5, /* 0x20-0x23 */
        0xB0, 0xF9, 0xDA, 0xD6, 0xC7, 0xAB, 0xDA, 0xD7, /* 0x24-0x27 */
        0xBD, 0xF7, 0xC3, 0xA1, 0xDA, 0xD8, 0xDA, 0xD9, /* 0x28-0x2B */
        0xC3, 0xFD, 0xCC, 0xB7, 0xDA, 0xDA, 0xDA, 0xDB, /* 0x2C-0x2F */
        0xC0, 0xBE, 0xC6, 0xD7, 0xDA, 0xDC, 0xDA, 0xDD, /* 0x30-0x33 */
        0xC7, 0xB4, 0xDA, 0xDE, 0xDA, 0xDF, 0xB9, 0xC8, /* 0x34-0x37 */
        0xD8, 0x40, 0xD8, 0x41, 0xD8, 0x42, 0xD8, 0x43, /* 0x38-0x3B */
        0xD8, 0x44, 0xD8, 0x45, 0xD8, 0x46, 0xD8, 0x47, /* 0x3C-0x3F */
        0xD8, 0x48, 0xBB, 0xED, 0xD8, 0x49, 0xD8, 0x4A, /* 0x40-0x43 */
        0xD8, 0x4B, 0xD8, 0x4C, 0xB6, 0xB9, 0xF4, 0xF8, /* 0x44-0x47 */
        0xD8, 0x4D, 0xF4, 0xF9, 0xD8, 0x4E, 0xD8, 0x4F, /* 0x48-0x4B */
        0xCD, 0xE3, 0xD8, 0x50, 0xD8, 0x51, 0xD8, 0x52, /* 0x4C-0x4F */
        0xD8, 0x53, 0xD8, 0x54, 0xD8, 0x55, 0xD8, 0x56, /* 0x50-0x53 */
        0xD8, 0x57, 0xF5, 0xB9, 0xD8, 0x58, 0xD8, 0x59, /* 0x54-0x57 */
        0xD8, 0x5A, 0xD8, 0x5B, 0xEB, 0xE0, 0xD8, 0x5C, /* 0x58-0x5B */
        0xD8, 0x5D, 0xD8, 0x5E, 0xD8, 0x5F, 0xD8, 0x60, /* 0x5C-0x5F */
        0xD8, 0x61, 0xCF, 0xF3, 0xBB, 0xBF, 0xD8, 0x62, /* 0x60-0x63 */
        0xD8, 0x63, 0xD8, 0x64, 0xD8, 0x65, 0xD8, 0x66, /* 0x64-0x67 */
        0xD8, 0x67, 0xD8, 0x68, 0xBA, 0xC0, 0xD4, 0xA5, /* 0x68-0x6B */
        0xD8, 0x69, 0xD8, 0x6A, 0xD8, 0x6B, 0xD8, 0x6C, /* 0x6C-0x6F */
        0xD8, 0x6D, 0xD8, 0x6E, 0xD8, 0x6F, 0xE1, 0xD9, /* 0x70-0x73 */
        0xD8, 0x70, 0xD8, 0x71, 0xD8, 0x72, 0xD8, 0x73, /* 0x74-0x77 */
        0xF5, 0xF4, 0xB1, 0xAA, 0xB2, 0xF2, 0xD8, 0x74, /* 0x78-0x7B */
        0xD8, 0x75, 0xD8, 0x76, 0xD8, 0x77, 0xD8, 0x78, /* 0x7C-0x7F */
        
        0xD8, 0x79, 0xD8, 0x7A, 0xF5, 0xF5, 0xD8, 0x7B, /* 0x80-0x83 */
        0xD8, 0x7C, 0xF5, 0xF7, 0xD8, 0x7D, 0xD8, 0x7E, /* 0x84-0x87 */
        0xD8, 0x80, 0xBA, 0xD1, 0xF5, 0xF6, 0xD8, 0x81, /* 0x88-0x8B */
        0xC3, 0xB2, 0xD8, 0x82, 0xD8, 0x83, 0xD8, 0x84, /* 0x8C-0x8F */
        0xD8, 0x85, 0xD8, 0x86, 0xD8, 0x87, 0xD8, 0x88, /* 0x90-0x93 */
        0xF5, 0xF9, 0xD8, 0x89, 0xD8, 0x8A, 0xD8, 0x8B, /* 0x94-0x97 */
        0xF5, 0xF8, 0xD8, 0x8C, 0xD8, 0x8D, 0xD8, 0x8E, /* 0x98-0x9B */
        0xD8, 0x8F, 0xD8, 0x90, 0xD8, 0x91, 0xD8, 0x92, /* 0x9C-0x9F */
        0xD8, 0x93, 0xD8, 0x94, 0xD8, 0x95, 0xD8, 0x96, /* 0xA0-0xA3 */
        0xD8, 0x97, 0xD8, 0x98, 0xD8, 0x99, 0xD8, 0x9A, /* 0xA4-0xA7 */
        0xD8, 0x9B, 0xD8, 0x9C, 0xD8, 0x9D, 0xD8, 0x9E, /* 0xA8-0xAB */
        0xD8, 0x9F, 0xD8, 0xA0, 0xD9, 0x40, 0xD9, 0x41, /* 0xAC-0xAF */
        0xD9, 0x42, 0xD9, 0x43, 0xD9, 0x44, 0xD9, 0x45, /* 0xB0-0xB3 */
        0xD9, 0x46, 0xD9, 0x47, 0xD9, 0x48, 0xD9, 0x49, /* 0xB4-0xB7 */
        0xD9, 0x4A, 0xD9, 0x4B, 0xD9, 0x4C, 0xD9, 0x4D, /* 0xB8-0xBB */
        0xD9, 0x4E, 0xD9, 0x4F, 0xD9, 0x50, 0xD9, 0x51, /* 0xBC-0xBF */
        0xD9, 0x52, 0xD9, 0x53, 0xD9, 0x54, 0xD9, 0x55, /* 0xC0-0xC3 */
        0xD9, 0x56, 0xD9, 0x57, 0xD9, 0x58, 0xD9, 0x59, /* 0xC4-0xC7 */
        0xD9, 0x5A, 0xD9, 0x5B, 0xD9, 0x5C, 0xD9, 0x5D, /* 0xC8-0xCB */
        0xD9, 0x5E, 0xD9, 0x5F, 0xD9, 0x60, 0xD9, 0x61, /* 0xCC-0xCF */
        0xD9, 0x62, 0xD9, 0x63, 0xD9, 0x64, 0xD9, 0x65, /* 0xD0-0xD3 */
        0xD9, 0x66, 0xD9, 0x67, 0xD9, 0x68, 0xD9, 0x69, /* 0xD4-0xD7 */
        0xD9, 0x6A, 0xD9, 0x6B, 0xD9, 0x6C, 0xD9, 0x6D, /* 0xD8-0xDB */
        0xD9, 0x6E, 0xD9, 0x6F, 0xD9, 0x70, 0xD9, 0x71, /* 0xDC-0xDF */
        0xD9, 0x72, 0xD9, 0x73, 0xD9, 0x74, 0xD9, 0x75, /* 0xE0-0xE3 */
        0xD9, 0x76, 0xD9, 0x77, 0xD9, 0x78, 0xD9, 0x79, /* 0xE4-0xE7 */
        0xD9, 0x7A, 0xD9, 0x7B, 0xD9, 0x7C, 0xD9, 0x7D, /* 0xE8-0xEB */
        0xD9, 0x7E, 0xD9, 0x80, 0xD9, 0x81, 0xD9, 0x82, /* 0xEC-0xEF */
        0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x86, /* 0xF0-0xF3 */
        0xD9, 0x87, 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x8A, /* 0xF4-0xF7 */
        0xD9, 0x8B, 0xD9, 0x8C, 0xD9, 0x8D, 0xD9, 0x8E, /* 0xF8-0xFB */
        0xD9, 0x8F, 0xD9, 0x90, 0xD9, 0x91, 0xD9, 0x92, /* 0xFC-0xFF */
};

static const unsigned char u2c_8D[512] = {
        0xD9, 0x93, 0xD9, 0x94, 0xD9, 0x95, 0xD9, 0x96, /* 0x00-0x03 */
        0xD9, 0x97, 0xD9, 0x98, 0xD9, 0x99, 0xD9, 0x9A, /* 0x04-0x07 */
        0xD9, 0x9B, 0xD9, 0x9C, 0xD9, 0x9D, 0xD9, 0x9E, /* 0x08-0x0B */
        0xD9, 0x9F, 0xD9, 0xA0, 0xDA, 0x40, 0xDA, 0x41, /* 0x0C-0x0F */
        0xDA, 0x42, 0xDA, 0x43, 0xDA, 0x44, 0xDA, 0x45, /* 0x10-0x13 */
        0xDA, 0x46, 0xDA, 0x47, 0xDA, 0x48, 0xDA, 0x49, /* 0x14-0x17 */
        0xDA, 0x4A, 0xDA, 0x4B, 0xDA, 0x4C, 0xDA, 0x4D, /* 0x18-0x1B */
        0xDA, 0x4E, 0xB1, 0xB4, 0xD5, 0xEA, 0xB8, 0xBA, /* 0x1C-0x1F */
        0xDA, 0x4F, 0xB9, 0xB1, 0xB2, 0xC6, 0xD4, 0xF0, /* 0x20-0x23 */
        0xCF, 0xCD, 0xB0, 0xDC, 0xD5, 0xCB, 0xBB, 0xF5, /* 0x24-0x27 */
        0xD6, 0xCA, 0xB7, 0xB7, 0xCC, 0xB0, 0xC6, 0xB6, /* 0x28-0x2B */
        0xB1, 0xE1, 0xB9, 0xBA, 0xD6, 0xFC, 0xB9, 0xE1, /* 0x2C-0x2F */
        0xB7, 0xA1, 0xBC, 0xFA, 0xEA, 0xDA, 0xEA, 0xDB, /* 0x30-0x33 */
        0xCC, 0xF9, 0xB9, 0xF3, 0xEA, 0xDC, 0xB4, 0xFB, /* 0x34-0x37 */
        0xC3, 0xB3, 0xB7, 0xD1, 0xBA, 0xD8, 0xEA, 0xDD, /* 0x38-0x3B */
        0xD4, 0xF4, 0xEA, 0xDE, 0xBC, 0xD6, 0xBB, 0xDF, /* 0x3C-0x3F */
        0xEA, 0xDF, 0xC1, 0xDE, 0xC2, 0xB8, 0xD4, 0xDF, /* 0x40-0x43 */
        0xD7, 0xCA, 0xEA, 0xE0, 0xEA, 0xE1, 0xEA, 0xE4, /* 0x44-0x47 */
        0xEA, 0xE2, 0xEA, 0xE3, 0xC9, 0xDE, 0xB8, 0xB3, /* 0x48-0x4B */
        0xB6, 0xC4, 0xEA, 0xE5, 0xCA, 0xEA, 0xC9, 0xCD, /* 0x4C-0x4F */
        0xB4, 0xCD, 0xDA, 0x50, 0xDA, 0x51, 0xE2, 0xD9, /* 0x50-0x53 */
        0xC5, 0xE2, 0xEA, 0xE6, 0xC0, 0xB5, 0xDA, 0x52, /* 0x54-0x57 */
        0xD7, 0xB8, 0xEA, 0xE7, 0xD7, 0xAC, 0xC8, 0xFC, /* 0x58-0x5B */
        0xD8, 0xD3, 0xD8, 0xCD, 0xD4, 0xDE, 0xDA, 0x53, /* 0x5C-0x5F */
        0xD4, 0xF9, 0xC9, 0xC4, 0xD3, 0xAE, 0xB8, 0xD3, /* 0x60-0x63 */
        0xB3, 0xE0, 0xDA, 0x54, 0xC9, 0xE2, 0xF4, 0xF6, /* 0x64-0x67 */
        0xDA, 0x55, 0xDA, 0x56, 0xDA, 0x57, 0xBA, 0xD5, /* 0x68-0x6B */
        0xDA, 0x58, 0xF4, 0xF7, 0xDA, 0x59, 0xDA, 0x5A, /* 0x6C-0x6F */
        0xD7, 0xDF, 0xDA, 0x5B, 0xDA, 0x5C, 0xF4, 0xF1, /* 0x70-0x73 */
        0xB8, 0xB0, 0xD5, 0xD4, 0xB8, 0xCF, 0xC6, 0xF0, /* 0x74-0x77 */
        0xDA, 0x5D, 0xDA, 0x5E, 0xDA, 0x5F, 0xDA, 0x60, /* 0x78-0x7B */
        0xDA, 0x61, 0xDA, 0x62, 0xDA, 0x63, 0xDA, 0x64, /* 0x7C-0x7F */
        
        0xDA, 0x65, 0xB3, 0xC3, 0xDA, 0x66, 0xDA, 0x67, /* 0x80-0x83 */
        0xF4, 0xF2, 0xB3, 0xAC, 0xDA, 0x68, 0xDA, 0x69, /* 0x84-0x87 */
        0xDA, 0x6A, 0xDA, 0x6B, 0xD4, 0xBD, 0xC7, 0xF7, /* 0x88-0x8B */
        0xDA, 0x6C, 0xDA, 0x6D, 0xDA, 0x6E, 0xDA, 0x6F, /* 0x8C-0x8F */
        0xDA, 0x70, 0xF4, 0xF4, 0xDA, 0x71, 0xDA, 0x72, /* 0x90-0x93 */
        0xF4, 0xF3, 0xDA, 0x73, 0xDA, 0x74, 0xDA, 0x75, /* 0x94-0x97 */
        0xDA, 0x76, 0xDA, 0x77, 0xDA, 0x78, 0xDA, 0x79, /* 0x98-0x9B */
        0xDA, 0x7A, 0xDA, 0x7B, 0xDA, 0x7C, 0xCC, 0xCB, /* 0x9C-0x9F */
        0xDA, 0x7D, 0xDA, 0x7E, 0xDA, 0x80, 0xC8, 0xA4, /* 0xA0-0xA3 */
        0xDA, 0x81, 0xDA, 0x82, 0xDA, 0x83, 0xDA, 0x84, /* 0xA4-0xA7 */
        0xDA, 0x85, 0xDA, 0x86, 0xDA, 0x87, 0xDA, 0x88, /* 0xA8-0xAB */
        0xDA, 0x89, 0xDA, 0x8A, 0xDA, 0x8B, 0xDA, 0x8C, /* 0xAC-0xAF */
        0xDA, 0x8D, 0xF4, 0xF5, 0xDA, 0x8E, 0xD7, 0xE3, /* 0xB0-0xB3 */
        0xC5, 0xBF, 0xF5, 0xC0, 0xDA, 0x8F, 0xDA, 0x90, /* 0xB4-0xB7 */
        0xF5, 0xBB, 0xDA, 0x91, 0xF5, 0xC3, 0xDA, 0x92, /* 0xB8-0xBB */
        0xF5, 0xC2, 0xDA, 0x93, 0xD6, 0xBA, 0xF5, 0xC1, /* 0xBC-0xBF */
        0xDA, 0x94, 0xDA, 0x95, 0xDA, 0x96, 0xD4, 0xBE, /* 0xC0-0xC3 */
        0xF5, 0xC4, 0xDA, 0x97, 0xF5, 0xCC, 0xDA, 0x98, /* 0xC4-0xC7 */
        0xDA, 0x99, 0xDA, 0x9A, 0xDA, 0x9B, 0xB0, 0xCF, /* 0xC8-0xCB */
        0xB5, 0xF8, 0xDA, 0x9C, 0xF5, 0xC9, 0xF5, 0xCA, /* 0xCC-0xCF */
        0xDA, 0x9D, 0xC5, 0xDC, 0xDA, 0x9E, 0xDA, 0x9F, /* 0xD0-0xD3 */
        0xDA, 0xA0, 0xDB, 0x40, 0xF5, 0xC5, 0xF5, 0xC6, /* 0xD4-0xD7 */
        0xDB, 0x41, 0xDB, 0x42, 0xF5, 0xC7, 0xF5, 0xCB, /* 0xD8-0xDB */
        0xDB, 0x43, 0xBE, 0xE0, 0xF5, 0xC8, 0xB8, 0xFA, /* 0xDC-0xDF */
        0xDB, 0x44, 0xDB, 0x45, 0xDB, 0x46, 0xF5, 0xD0, /* 0xE0-0xE3 */
        0xF5, 0xD3, 0xDB, 0x47, 0xDB, 0x48, 0xDB, 0x49, /* 0xE4-0xE7 */
        0xBF, 0xE7, 0xDB, 0x4A, 0xB9, 0xF2, 0xF5, 0xBC, /* 0xE8-0xEB */
        0xF5, 0xCD, 0xDB, 0x4B, 0xDB, 0x4C, 0xC2, 0xB7, /* 0xEC-0xEF */
        0xDB, 0x4D, 0xDB, 0x4E, 0xDB, 0x4F, 0xCC, 0xF8, /* 0xF0-0xF3 */
        0xDB, 0x50, 0xBC, 0xF9, 0xDB, 0x51, 0xF5, 0xCE, /* 0xF4-0xF7 */
        0xF5, 0xCF, 0xF5, 0xD1, 0xB6, 0xE5, 0xF5, 0xD2, /* 0xF8-0xFB */
        0xDB, 0x52, 0xF5, 0xD5, 0xDB, 0x53, 0xDB, 0x54, /* 0xFC-0xFF */
};

static const unsigned char u2c_8E[512] = {
        0xDB, 0x55, 0xDB, 0x56, 0xDB, 0x57, 0xDB, 0x58, /* 0x00-0x03 */
        0xDB, 0x59, 0xF5, 0xBD, 0xDB, 0x5A, 0xDB, 0x5B, /* 0x04-0x07 */
        0xDB, 0x5C, 0xF5, 0xD4, 0xD3, 0xBB, 0xDB, 0x5D, /* 0x08-0x0B */
        0xB3, 0xEC, 0xDB, 0x5E, 0xDB, 0x5F, 0xCC, 0xA4, /* 0x0C-0x0F */
        0xDB, 0x60, 0xDB, 0x61, 0xDB, 0x62, 0xDB, 0x63, /* 0x10-0x13 */
        0xF5, 0xD6, 0xDB, 0x64, 0xDB, 0x65, 0xDB, 0x66, /* 0x14-0x17 */
        0xDB, 0x67, 0xDB, 0x68, 0xDB, 0x69, 0xDB, 0x6A, /* 0x18-0x1B */
        0xDB, 0x6B, 0xF5, 0xD7, 0xBE, 0xE1, 0xF5, 0xD8, /* 0x1C-0x1F */
        0xDB, 0x6C, 0xDB, 0x6D, 0xCC, 0xDF, 0xF5, 0xDB, /* 0x20-0x23 */
        0xDB, 0x6E, 0xDB, 0x6F, 0xDB, 0x70, 0xDB, 0x71, /* 0x24-0x27 */
        0xDB, 0x72, 0xB2, 0xC8, 0xD7, 0xD9, 0xDB, 0x73, /* 0x28-0x2B */
        0xF5, 0xD9, 0xDB, 0x74, 0xF5, 0xDA, 0xF5, 0xDC, /* 0x2C-0x2F */
        0xDB, 0x75, 0xF5, 0xE2, 0xDB, 0x76, 0xDB, 0x77, /* 0x30-0x33 */
        0xDB, 0x78, 0xF5, 0xE0, 0xDB, 0x79, 0xDB, 0x7A, /* 0x34-0x37 */
        0xDB, 0x7B, 0xF5, 0xDF, 0xF5, 0xDD, 0xDB, 0x7C, /* 0x38-0x3B */
        0xDB, 0x7D, 0xF5, 0xE1, 0xDB, 0x7E, 0xDB, 0x80, /* 0x3C-0x3F */
        0xF5, 0xDE, 0xF5, 0xE4, 0xF5, 0xE5, 0xDB, 0x81, /* 0x40-0x43 */
        0xCC, 0xE3, 0xDB, 0x82, 0xDB, 0x83, 0xE5, 0xBF, /* 0x44-0x47 */
        0xB5, 0xB8, 0xF5, 0xE3, 0xF5, 0xE8, 0xCC, 0xA3, /* 0x48-0x4B */
        0xDB, 0x84, 0xDB, 0x85, 0xDB, 0x86, 0xDB, 0x87, /* 0x4C-0x4F */
        0xDB, 0x88, 0xF5, 0xE6, 0xF5, 0xE7, 0xDB, 0x89, /* 0x50-0x53 */
        0xDB, 0x8A, 0xDB, 0x8B, 0xDB, 0x8C, 0xDB, 0x8D, /* 0x54-0x57 */
        0xDB, 0x8E, 0xF5, 0xBE, 0xDB, 0x8F, 0xDB, 0x90, /* 0x58-0x5B */
        0xDB, 0x91, 0xDB, 0x92, 0xDB, 0x93, 0xDB, 0x94, /* 0x5C-0x5F */
        0xDB, 0x95, 0xDB, 0x96, 0xDB, 0x97, 0xDB, 0x98, /* 0x60-0x63 */
        0xDB, 0x99, 0xDB, 0x9A, 0xB1, 0xC4, 0xDB, 0x9B, /* 0x64-0x67 */
        0xDB, 0x9C, 0xF5, 0xBF, 0xDB, 0x9D, 0xDB, 0x9E, /* 0x68-0x6B */
        0xB5, 0xC5, 0xB2, 0xE4, 0xDB, 0x9F, 0xF5, 0xEC, /* 0x6C-0x6F */
        0xF5, 0xE9, 0xDB, 0xA0, 0xB6, 0xD7, 0xDC, 0x40, /* 0x70-0x73 */
        0xF5, 0xED, 0xDC, 0x41, 0xF5, 0xEA, 0xDC, 0x42, /* 0x74-0x77 */
        0xDC, 0x43, 0xDC, 0x44, 0xDC, 0x45, 0xDC, 0x46, /* 0x78-0x7B */
        0xF5, 0xEB, 0xDC, 0x47, 0xDC, 0x48, 0xB4, 0xDA, /* 0x7C-0x7F */
        
        0xDC, 0x49, 0xD4, 0xEA, 0xDC, 0x4A, 0xDC, 0x4B, /* 0x80-0x83 */
        0xDC, 0x4C, 0xF5, 0xEE, 0xDC, 0x4D, 0xB3, 0xF9, /* 0x84-0x87 */
        0xDC, 0x4E, 0xDC, 0x4F, 0xDC, 0x50, 0xDC, 0x51, /* 0x88-0x8B */
        0xDC, 0x52, 0xDC, 0x53, 0xDC, 0x54, 0xF5, 0xEF, /* 0x8C-0x8F */
        0xF5, 0xF1, 0xDC, 0x55, 0xDC, 0x56, 0xDC, 0x57, /* 0x90-0x93 */
        0xF5, 0xF0, 0xDC, 0x58, 0xDC, 0x59, 0xDC, 0x5A, /* 0x94-0x97 */
        0xDC, 0x5B, 0xDC, 0x5C, 0xDC, 0x5D, 0xDC, 0x5E, /* 0x98-0x9B */
        0xF5, 0xF2, 0xDC, 0x5F, 0xF5, 0xF3, 0xDC, 0x60, /* 0x9C-0x9F */
        0xDC, 0x61, 0xDC, 0x62, 0xDC, 0x63, 0xDC, 0x64, /* 0xA0-0xA3 */
        0xDC, 0x65, 0xDC, 0x66, 0xDC, 0x67, 0xDC, 0x68, /* 0xA4-0xA7 */
        0xDC, 0x69, 0xDC, 0x6A, 0xDC, 0x6B, 0xC9, 0xED, /* 0xA8-0xAB */
        0xB9, 0xAA, 0xDC, 0x6C, 0xDC, 0x6D, 0xC7, 0xFB, /* 0xAC-0xAF */
        0xDC, 0x6E, 0xDC, 0x6F, 0xB6, 0xE3, 0xDC, 0x70, /* 0xB0-0xB3 */
        0xDC, 0x71, 0xDC, 0x72, 0xDC, 0x73, 0xDC, 0x74, /* 0xB4-0xB7 */
        0xDC, 0x75, 0xDC, 0x76, 0xCC, 0xC9, 0xDC, 0x77, /* 0xB8-0xBB */
        0xDC, 0x78, 0xDC, 0x79, 0xDC, 0x7A, 0xDC, 0x7B, /* 0xBC-0xBF */
        0xDC, 0x7C, 0xDC, 0x7D, 0xDC, 0x7E, 0xDC, 0x80, /* 0xC0-0xC3 */
        0xDC, 0x81, 0xDC, 0x82, 0xDC, 0x83, 0xDC, 0x84, /* 0xC4-0xC7 */
        0xDC, 0x85, 0xDC, 0x86, 0xDC, 0x87, 0xDC, 0x88, /* 0xC8-0xCB */
        0xDC, 0x89, 0xDC, 0x8A, 0xEA, 0xA6, 0xDC, 0x8B, /* 0xCC-0xCF */
        0xDC, 0x8C, 0xDC, 0x8D, 0xDC, 0x8E, 0xDC, 0x8F, /* 0xD0-0xD3 */
        0xDC, 0x90, 0xDC, 0x91, 0xDC, 0x92, 0xDC, 0x93, /* 0xD4-0xD7 */
        0xDC, 0x94, 0xDC, 0x95, 0xDC, 0x96, 0xDC, 0x97, /* 0xD8-0xDB */
        0xDC, 0x98, 0xDC, 0x99, 0xDC, 0x9A, 0xDC, 0x9B, /* 0xDC-0xDF */
        0xDC, 0x9C, 0xDC, 0x9D, 0xDC, 0x9E, 0xDC, 0x9F, /* 0xE0-0xE3 */
        0xDC, 0xA0, 0xDD, 0x40, 0xDD, 0x41, 0xDD, 0x42, /* 0xE4-0xE7 */
        0xDD, 0x43, 0xDD, 0x44, 0xDD, 0x45, 0xDD, 0x46, /* 0xE8-0xEB */
        0xDD, 0x47, 0xDD, 0x48, 0xDD, 0x49, 0xDD, 0x4A, /* 0xEC-0xEF */
        0xDD, 0x4B, 0xDD, 0x4C, 0xDD, 0x4D, 0xDD, 0x4E, /* 0xF0-0xF3 */
        0xDD, 0x4F, 0xDD, 0x50, 0xDD, 0x51, 0xDD, 0x52, /* 0xF4-0xF7 */
        0xDD, 0x53, 0xDD, 0x54, 0xDD, 0x55, 0xDD, 0x56, /* 0xF8-0xFB */
        0xDD, 0x57, 0xDD, 0x58, 0xDD, 0x59, 0xDD, 0x5A, /* 0xFC-0xFF */
};

static const unsigned char u2c_8F[512] = {
        0xDD, 0x5B, 0xDD, 0x5C, 0xDD, 0x5D, 0xDD, 0x5E, /* 0x00-0x03 */
        0xDD, 0x5F, 0xDD, 0x60, 0xDD, 0x61, 0xDD, 0x62, /* 0x04-0x07 */
        0xDD, 0x63, 0xDD, 0x64, 0xDD, 0x65, 0xDD, 0x66, /* 0x08-0x0B */
        0xDD, 0x67, 0xDD, 0x68, 0xDD, 0x69, 0xDD, 0x6A, /* 0x0C-0x0F */
        0xDD, 0x6B, 0xDD, 0x6C, 0xDD, 0x6D, 0xDD, 0x6E, /* 0x10-0x13 */
        0xDD, 0x6F, 0xDD, 0x70, 0xDD, 0x71, 0xDD, 0x72, /* 0x14-0x17 */
        0xDD, 0x73, 0xDD, 0x74, 0xDD, 0x75, 0xDD, 0x76, /* 0x18-0x1B */
        0xDD, 0x77, 0xDD, 0x78, 0xDD, 0x79, 0xDD, 0x7A, /* 0x1C-0x1F */
        0xDD, 0x7B, 0xDD, 0x7C, 0xDD, 0x7D, 0xDD, 0x7E, /* 0x20-0x23 */
        0xDD, 0x80, 0xDD, 0x81, 0xDD, 0x82, 0xDD, 0x83, /* 0x24-0x27 */
        0xDD, 0x84, 0xDD, 0x85, 0xDD, 0x86, 0xDD, 0x87, /* 0x28-0x2B */
        0xDD, 0x88, 0xDD, 0x89, 0xDD, 0x8A, 0xDD, 0x8B, /* 0x2C-0x2F */
        0xDD, 0x8C, 0xDD, 0x8D, 0xDD, 0x8E, 0xDD, 0x8F, /* 0x30-0x33 */
        0xDD, 0x90, 0xDD, 0x91, 0xDD, 0x92, 0xDD, 0x93, /* 0x34-0x37 */
        0xDD, 0x94, 0xDD, 0x95, 0xDD, 0x96, 0xDD, 0x97, /* 0x38-0x3B */
        0xDD, 0x98, 0xDD, 0x99, 0xDD, 0x9A, 0xDD, 0x9B, /* 0x3C-0x3F */
        0xDD, 0x9C, 0xDD, 0x9D, 0xDD, 0x9E, 0xDD, 0x9F, /* 0x40-0x43 */
        0xDD, 0xA0, 0xDE, 0x40, 0xDE, 0x41, 0xDE, 0x42, /* 0x44-0x47 */
        0xDE, 0x43, 0xDE, 0x44, 0xDE, 0x45, 0xDE, 0x46, /* 0x48-0x4B */
        0xDE, 0x47, 0xDE, 0x48, 0xDE, 0x49, 0xDE, 0x4A, /* 0x4C-0x4F */
        0xDE, 0x4B, 0xDE, 0x4C, 0xDE, 0x4D, 0xDE, 0x4E, /* 0x50-0x53 */
        0xDE, 0x4F, 0xDE, 0x50, 0xDE, 0x51, 0xDE, 0x52, /* 0x54-0x57 */
        0xDE, 0x53, 0xDE, 0x54, 0xDE, 0x55, 0xDE, 0x56, /* 0x58-0x5B */
        0xDE, 0x57, 0xDE, 0x58, 0xDE, 0x59, 0xDE, 0x5A, /* 0x5C-0x5F */
        0xDE, 0x5B, 0xDE, 0x5C, 0xDE, 0x5D, 0xDE, 0x5E, /* 0x60-0x63 */
        0xDE, 0x5F, 0xDE, 0x60, 0xB3, 0xB5, 0xD4, 0xFE, /* 0x64-0x67 */
        0xB9, 0xEC, 0xD0, 0xF9, 0xDE, 0x61, 0xE9, 0xED, /* 0x68-0x6B */
        0xD7, 0xAA, 0xE9, 0xEE, 0xC2, 0xD6, 0xC8, 0xED, /* 0x6C-0x6F */
        0xBA, 0xE4, 0xE9, 0xEF, 0xE9, 0xF0, 0xE9, 0xF1, /* 0x70-0x73 */
        0xD6, 0xE1, 0xE9, 0xF2, 0xE9, 0xF3, 0xE9, 0xF5, /* 0x74-0x77 */
        0xE9, 0xF4, 0xE9, 0xF6, 0xE9, 0xF7, 0xC7, 0xE1, /* 0x78-0x7B */
        0xE9, 0xF8, 0xD4, 0xD8, 0xE9, 0xF9, 0xBD, 0xCE, /* 0x7C-0x7F */
        
        0xDE, 0x62, 0xE9, 0xFA, 0xE9, 0xFB, 0xBD, 0xCF, /* 0x80-0x83 */
        0xE9, 0xFC, 0xB8, 0xA8, 0xC1, 0xBE, 0xE9, 0xFD, /* 0x84-0x87 */
        0xB1, 0xB2, 0xBB, 0xD4, 0xB9, 0xF5, 0xE9, 0xFE, /* 0x88-0x8B */
        0xDE, 0x63, 0xEA, 0xA1, 0xEA, 0xA2, 0xEA, 0xA3, /* 0x8C-0x8F */
        0xB7, 0xF8, 0xBC, 0xAD, 0xDE, 0x64, 0xCA, 0xE4, /* 0x90-0x93 */
        0xE0, 0xCE, 0xD4, 0xAF, 0xCF, 0xBD, 0xD5, 0xB7, /* 0x94-0x97 */
        0xEA, 0xA4, 0xD5, 0xDE, 0xEA, 0xA5, 0xD0, 0xC1, /* 0x98-0x9B */
        0xB9, 0xBC, 0xDE, 0x65, 0xB4, 0xC7, 0xB1, 0xD9, /* 0x9C-0x9F */
        0xDE, 0x66, 0xDE, 0x67, 0xDE, 0x68, 0xC0, 0xB1, /* 0xA0-0xA3 */
        0xDE, 0x69, 0xDE, 0x6A, 0xDE, 0x6B, 0xDE, 0x6C, /* 0xA4-0xA7 */
        0xB1, 0xE6, 0xB1, 0xE7, 0xDE, 0x6D, 0xB1, 0xE8, /* 0xA8-0xAB */
        0xDE, 0x6E, 0xDE, 0x6F, 0xDE, 0x70, 0xDE, 0x71, /* 0xAC-0xAF */
        0xB3, 0xBD, 0xC8, 0xE8, 0xDE, 0x72, 0xDE, 0x73, /* 0xB0-0xB3 */
        0xDE, 0x74, 0xDE, 0x75, 0xE5, 0xC1, 0xDE, 0x76, /* 0xB4-0xB7 */
        0xDE, 0x77, 0xB1, 0xDF, 0xDE, 0x78, 0xDE, 0x79, /* 0xB8-0xBB */
        0xDE, 0x7A, 0xC1, 0xC9, 0xB4, 0xEF, 0xDE, 0x7B, /* 0xBC-0xBF */
        0xDE, 0x7C, 0xC7, 0xA8, 0xD3, 0xD8, 0xDE, 0x7D, /* 0xC0-0xC3 */
        0xC6, 0xF9, 0xD1, 0xB8, 0xDE, 0x7E, 0xB9, 0xFD, /* 0xC4-0xC7 */
        0xC2, 0xF5, 0xDE, 0x80, 0xDE, 0x81, 0xDE, 0x82, /* 0xC8-0xCB */
        0xDE, 0x83, 0xDE, 0x84, 0xD3, 0xAD, 0xDE, 0x85, /* 0xCC-0xCF */
        0xD4, 0xCB, 0xBD, 0xFC, 0xDE, 0x86, 0xE5, 0xC2, /* 0xD0-0xD3 */
        0xB7, 0xB5, 0xE5, 0xC3, 0xDE, 0x87, 0xDE, 0x88, /* 0xD4-0xD7 */
        0xBB, 0xB9, 0xD5, 0xE2, 0xDE, 0x89, 0xBD, 0xF8, /* 0xD8-0xDB */
        0xD4, 0xB6, 0xCE, 0xA5, 0xC1, 0xAC, 0xB3, 0xD9, /* 0xDC-0xDF */
        0xDE, 0x8A, 0xDE, 0x8B, 0xCC, 0xF6, 0xDE, 0x8C, /* 0xE0-0xE3 */
        0xE5, 0xC6, 0xE5, 0xC4, 0xE5, 0xC8, 0xDE, 0x8D, /* 0xE4-0xE7 */
        0xE5, 0xCA, 0xE5, 0xC7, 0xB5, 0xCF, 0xC6, 0xC8, /* 0xE8-0xEB */
        0xDE, 0x8E, 0xB5, 0xFC, 0xE5, 0xC5, 0xDE, 0x8F, /* 0xEC-0xEF */
        0xCA, 0xF6, 0xDE, 0x90, 0xDE, 0x91, 0xE5, 0xC9, /* 0xF0-0xF3 */
        0xDE, 0x92, 0xDE, 0x93, 0xDE, 0x94, 0xC3, 0xD4, /* 0xF4-0xF7 */
        0xB1, 0xC5, 0xBC, 0xA3, 0xDE, 0x95, 0xDE, 0x96, /* 0xF8-0xFB */
        0xDE, 0x97, 0xD7, 0xB7, 0xDE, 0x98, 0xDE, 0x99, /* 0xFC-0xFF */
};

static const unsigned char u2c_90[512] = {
        0xCD, 0xCB, 0xCB, 0xCD, 0xCA, 0xCA, 0xCC, 0xD3, /* 0x00-0x03 */
        0xE5, 0xCC, 0xE5, 0xCB, 0xC4, 0xE6, 0xDE, 0x9A, /* 0x04-0x07 */
        0xDE, 0x9B, 0xD1, 0xA1, 0xD1, 0xB7, 0xE5, 0xCD, /* 0x08-0x0B */
        0xDE, 0x9C, 0xE5, 0xD0, 0xDE, 0x9D, 0xCD, 0xB8, /* 0x0C-0x0F */
        0xD6, 0xF0, 0xE5, 0xCF, 0xB5, 0xDD, 0xDE, 0x9E, /* 0x10-0x13 */
        0xCD, 0xBE, 0xDE, 0x9F, 0xE5, 0xD1, 0xB6, 0xBA, /* 0x14-0x17 */
        0xDE, 0xA0, 0xDF, 0x40, 0xCD, 0xA8, 0xB9, 0xE4, /* 0x18-0x1B */
        0xDF, 0x41, 0xCA, 0xC5, 0xB3, 0xD1, 0xCB, 0xD9, /* 0x1C-0x1F */
        0xD4, 0xEC, 0xE5, 0xD2, 0xB7, 0xEA, 0xDF, 0x42, /* 0x20-0x23 */
        0xDF, 0x43, 0xDF, 0x44, 0xE5, 0xCE, 0xDF, 0x45, /* 0x24-0x27 */
        0xDF, 0x46, 0xDF, 0x47, 0xDF, 0x48, 0xDF, 0x49, /* 0x28-0x2B */
        0xDF, 0x4A, 0xE5, 0xD5, 0xB4, 0xFE, 0xE5, 0xD6, /* 0x2C-0x2F */
        0xDF, 0x4B, 0xDF, 0x4C, 0xDF, 0x4D, 0xDF, 0x4E, /* 0x30-0x33 */
        0xDF, 0x4F, 0xE5, 0xD3, 0xE5, 0xD4, 0xDF, 0x50, /* 0x34-0x37 */
        0xD2, 0xDD, 0xDF, 0x51, 0xDF, 0x52, 0xC2, 0xDF, /* 0x38-0x3B */
        0xB1, 0xC6, 0xDF, 0x53, 0xD3, 0xE2, 0xDF, 0x54, /* 0x3C-0x3F */
        0xDF, 0x55, 0xB6, 0xDD, 0xCB, 0xEC, 0xDF, 0x56, /* 0x40-0x43 */
        0xE5, 0xD7, 0xDF, 0x57, 0xDF, 0x58, 0xD3, 0xF6, /* 0x44-0x47 */
        0xDF, 0x59, 0xDF, 0x5A, 0xDF, 0x5B, 0xDF, 0x5C, /* 0x48-0x4B */
        0xDF, 0x5D, 0xB1, 0xE9, 0xDF, 0x5E, 0xB6, 0xF4, /* 0x4C-0x4F */
        0xE5, 0xDA, 0xE5, 0xD8, 0xE5, 0xD9, 0xB5, 0xC0, /* 0x50-0x53 */
        0xDF, 0x5F, 0xDF, 0x60, 0xDF, 0x61, 0xD2, 0xC5, /* 0x54-0x57 */
        0xE5, 0xDC, 0xDF, 0x62, 0xDF, 0x63, 0xE5, 0xDE, /* 0x58-0x5B */
        0xDF, 0x64, 0xDF, 0x65, 0xDF, 0x66, 0xDF, 0x67, /* 0x5C-0x5F */
        0xDF, 0x68, 0xDF, 0x69, 0xE5, 0xDD, 0xC7, 0xB2, /* 0x60-0x63 */
        0xDF, 0x6A, 0xD2, 0xA3, 0xDF, 0x6B, 0xDF, 0x6C, /* 0x64-0x67 */
        0xE5, 0xDB, 0xDF, 0x6D, 0xDF, 0x6E, 0xDF, 0x6F, /* 0x68-0x6B */
        0xDF, 0x70, 0xD4, 0xE2, 0xD5, 0xDA, 0xDF, 0x71, /* 0x6C-0x6F */
        0xDF, 0x72, 0xDF, 0x73, 0xDF, 0x74, 0xDF, 0x75, /* 0x70-0x73 */
        0xE5, 0xE0, 0xD7, 0xF1, 0xDF, 0x76, 0xDF, 0x77, /* 0x74-0x77 */
        0xDF, 0x78, 0xDF, 0x79, 0xDF, 0x7A, 0xDF, 0x7B, /* 0x78-0x7B */
        0xDF, 0x7C, 0xE5, 0xE1, 0xDF, 0x7D, 0xB1, 0xDC, /* 0x7C-0x7F */
        
        0xD1, 0xFB, 0xDF, 0x7E, 0xE5, 0xE2, 0xE5, 0xE4, /* 0x80-0x83 */
        0xDF, 0x80, 0xDF, 0x81, 0xDF, 0x82, 0xDF, 0x83, /* 0x84-0x87 */
        0xE5, 0xE3, 0xDF, 0x84, 0xDF, 0x85, 0xE5, 0xE5, /* 0x88-0x8B */
        0xDF, 0x86, 0xDF, 0x87, 0xDF, 0x88, 0xDF, 0x89, /* 0x8C-0x8F */
        0xDF, 0x8A, 0xD2, 0xD8, 0xDF, 0x8B, 0xB5, 0xCB, /* 0x90-0x93 */
        0xDF, 0x8C, 0xE7, 0xDF, 0xDF, 0x8D, 0xDA, 0xF5, /* 0x94-0x97 */
        0xDF, 0x8E, 0xDA, 0xF8, 0xDF, 0x8F, 0xDA, 0xF6, /* 0x98-0x9B */
        0xDF, 0x90, 0xDA, 0xF7, 0xDF, 0x91, 0xDF, 0x92, /* 0x9C-0x9F */
        0xDF, 0x93, 0xDA, 0xFA, 0xD0, 0xCF, 0xC4, 0xC7, /* 0xA0-0xA3 */
        0xDF, 0x94, 0xDF, 0x95, 0xB0, 0xEE, 0xDF, 0x96, /* 0xA4-0xA7 */
        0xDF, 0x97, 0xDF, 0x98, 0xD0, 0xB0, 0xDF, 0x99, /* 0xA8-0xAB */
        0xDA, 0xF9, 0xDF, 0x9A, 0xD3, 0xCA, 0xBA, 0xAA, /* 0xAC-0xAF */
        0xDB, 0xA2, 0xC7, 0xF1, 0xDF, 0x9B, 0xDA, 0xFC, /* 0xB0-0xB3 */
        0xDA, 0xFB, 0xC9, 0xDB, 0xDA, 0xFD, 0xDF, 0x9C, /* 0xB4-0xB7 */
        0xDB, 0xA1, 0xD7, 0xDE, 0xDA, 0xFE, 0xC1, 0xDA, /* 0xB8-0xBB */
        0xDF, 0x9D, 0xDF, 0x9E, 0xDB, 0xA5, 0xDF, 0x9F, /* 0xBC-0xBF */
        0xDF, 0xA0, 0xD3, 0xF4, 0xE0, 0x40, 0xE0, 0x41, /* 0xC0-0xC3 */
        0xDB, 0xA7, 0xDB, 0xA4, 0xE0, 0x42, 0xDB, 0xA8, /* 0xC4-0xC7 */
        0xE0, 0x43, 0xE0, 0x44, 0xBD, 0xBC, 0xE0, 0x45, /* 0xC8-0xCB */
        0xE0, 0x46, 0xE0, 0x47, 0xC0, 0xC9, 0xDB, 0xA3, /* 0xCC-0xCF */
        0xDB, 0xA6, 0xD6, 0xA3, 0xE0, 0x48, 0xDB, 0xA9, /* 0xD0-0xD3 */
        0xE0, 0x49, 0xE0, 0x4A, 0xE0, 0x4B, 0xDB, 0xAD, /* 0xD4-0xD7 */
        0xE0, 0x4C, 0xE0, 0x4D, 0xE0, 0x4E, 0xDB, 0xAE, /* 0xD8-0xDB */
        0xDB, 0xAC, 0xBA, 0xC2, 0xE0, 0x4F, 0xE0, 0x50, /* 0xDC-0xDF */
        0xE0, 0x51, 0xBF, 0xA4, 0xDB, 0xAB, 0xE0, 0x52, /* 0xE0-0xE3 */
        0xE0, 0x53, 0xE0, 0x54, 0xDB, 0xAA, 0xD4, 0xC7, /* 0xE4-0xE7 */
        0xB2, 0xBF, 0xE0, 0x55, 0xE0, 0x56, 0xDB, 0xAF, /* 0xE8-0xEB */
        0xE0, 0x57, 0xB9, 0xF9, 0xE0, 0x58, 0xDB, 0xB0, /* 0xEC-0xEF */
        0xE0, 0x59, 0xE0, 0x5A, 0xE0, 0x5B, 0xE0, 0x5C, /* 0xF0-0xF3 */
        0xB3, 0xBB, 0xE0, 0x5D, 0xE0, 0x5E, 0xE0, 0x5F, /* 0xF4-0xF7 */
        0xB5, 0xA6, 0xE0, 0x60, 0xE0, 0x61, 0xE0, 0x62, /* 0xF8-0xFB */
        0xE0, 0x63, 0xB6, 0xBC, 0xDB, 0xB1, 0xE0, 0x64, /* 0xFC-0xFF */
};

static const unsigned char u2c_91[512] = {
        0xE0, 0x65, 0xE0, 0x66, 0xB6, 0xF5, 0xE0, 0x67, /* 0x00-0x03 */
        0xDB, 0xB2, 0xE0, 0x68, 0xE0, 0x69, 0xE0, 0x6A, /* 0x04-0x07 */
        0xE0, 0x6B, 0xE0, 0x6C, 0xE0, 0x6D, 0xE0, 0x6E, /* 0x08-0x0B */
        0xE0, 0x6F, 0xE0, 0x70, 0xE0, 0x71, 0xE0, 0x72, /* 0x0C-0x0F */
        0xE0, 0x73, 0xE0, 0x74, 0xE0, 0x75, 0xE0, 0x76, /* 0x10-0x13 */
        0xE0, 0x77, 0xE0, 0x78, 0xE0, 0x79, 0xE0, 0x7A, /* 0x14-0x17 */
        0xE0, 0x7B, 0xB1, 0xC9, 0xE0, 0x7C, 0xE0, 0x7D, /* 0x18-0x1B */
        0xE0, 0x7E, 0xE0, 0x80, 0xDB, 0xB4, 0xE0, 0x81, /* 0x1C-0x1F */
        0xE0, 0x82, 0xE0, 0x83, 0xDB, 0xB3, 0xDB, 0xB5, /* 0x20-0x23 */
        0xE0, 0x84, 0xE0, 0x85, 0xE0, 0x86, 0xE0, 0x87, /* 0x24-0x27 */
        0xE0, 0x88, 0xE0, 0x89, 0xE0, 0x8A, 0xE0, 0x8B, /* 0x28-0x2B */
        0xE0, 0x8C, 0xE0, 0x8D, 0xE0, 0x8E, 0xDB, 0xB7, /* 0x2C-0x2F */
        0xE0, 0x8F, 0xDB, 0xB6, 0xE0, 0x90, 0xE0, 0x91, /* 0x30-0x33 */
        0xE0, 0x92, 0xE0, 0x93, 0xE0, 0x94, 0xE0, 0x95, /* 0x34-0x37 */
        0xE0, 0x96, 0xDB, 0xB8, 0xE0, 0x97, 0xE0, 0x98, /* 0x38-0x3B */
        0xE0, 0x99, 0xE0, 0x9A, 0xE0, 0x9B, 0xE0, 0x9C, /* 0x3C-0x3F */
        0xE0, 0x9D, 0xE0, 0x9E, 0xE0, 0x9F, 0xDB, 0xB9, /* 0x40-0x43 */
        0xE0, 0xA0, 0xE1, 0x40, 0xDB, 0xBA, 0xE1, 0x41, /* 0x44-0x47 */
        0xE1, 0x42, 0xD3, 0xCF, 0xF4, 0xFA, 0xC7, 0xF5, /* 0x48-0x4B */
        0xD7, 0xC3, 0xC5, 0xE4, 0xF4, 0xFC, 0xF4, 0xFD, /* 0x4C-0x4F */
        0xF4, 0xFB, 0xE1, 0x43, 0xBE, 0xC6, 0xE1, 0x44, /* 0x50-0x53 */
        0xE1, 0x45, 0xE1, 0x46, 0xE1, 0x47, 0xD0, 0xEF, /* 0x54-0x57 */
        0xE1, 0x48, 0xE1, 0x49, 0xB7, 0xD3, 0xE1, 0x4A, /* 0x58-0x5B */
        0xE1, 0x4B, 0xD4, 0xCD, 0xCC, 0xAA, 0xE1, 0x4C, /* 0x5C-0x5F */
        0xE1, 0x4D, 0xF5, 0xA2, 0xF5, 0xA1, 0xBA, 0xA8, /* 0x60-0x63 */
        0xF4, 0xFE, 0xCB, 0xD6, 0xE1, 0x4E, 0xE1, 0x4F, /* 0x64-0x67 */
        0xE1, 0x50, 0xF5, 0xA4, 0xC0, 0xD2, 0xE1, 0x51, /* 0x68-0x6B */
        0xB3, 0xEA, 0xE1, 0x52, 0xCD, 0xAA, 0xF5, 0xA5, /* 0x6C-0x6F */
        0xF5, 0xA3, 0xBD, 0xB4, 0xF5, 0xA8, 0xE1, 0x53, /* 0x70-0x73 */
        0xF5, 0xA9, 0xBD, 0xCD, 0xC3, 0xB8, 0xBF, 0xE1, /* 0x74-0x77 */
        0xCB, 0xE1, 0xF5, 0xAA, 0xE1, 0x54, 0xE1, 0x55, /* 0x78-0x7B */
        0xE1, 0x56, 0xF5, 0xA6, 0xF5, 0xA7, 0xC4, 0xF0, /* 0x7C-0x7F */
        
        0xE1, 0x57, 0xE1, 0x58, 0xE1, 0x59, 0xE1, 0x5A, /* 0x80-0x83 */
        0xE1, 0x5B, 0xF5, 0xAC, 0xE1, 0x5C, 0xB4, 0xBC, /* 0x84-0x87 */
        0xE1, 0x5D, 0xD7, 0xED, 0xE1, 0x5E, 0xB4, 0xD7, /* 0x88-0x8B */
        0xF5, 0xAB, 0xF5, 0xAE, 0xE1, 0x5F, 0xE1, 0x60, /* 0x8C-0x8F */
        0xF5, 0xAD, 0xF5, 0xAF, 0xD0, 0xD1, 0xE1, 0x61, /* 0x90-0x93 */
        0xE1, 0x62, 0xE1, 0x63, 0xE1, 0x64, 0xE1, 0x65, /* 0x94-0x97 */
        0xE1, 0x66, 0xE1, 0x67, 0xC3, 0xD1, 0xC8, 0xA9, /* 0x98-0x9B */
        0xE1, 0x68, 0xE1, 0x69, 0xE1, 0x6A, 0xE1, 0x6B, /* 0x9C-0x9F */
        0xE1, 0x6C, 0xE1, 0x6D, 0xF5, 0xB0, 0xF5, 0xB1, /* 0xA0-0xA3 */
        0xE1, 0x6E, 0xE1, 0x6F, 0xE1, 0x70, 0xE1, 0x71, /* 0xA4-0xA7 */
        0xE1, 0x72, 0xE1, 0x73, 0xF5, 0xB2, 0xE1, 0x74, /* 0xA8-0xAB */
        0xE1, 0x75, 0xF5, 0xB3, 0xF5, 0xB4, 0xF5, 0xB5, /* 0xAC-0xAF */
        0xE1, 0x76, 0xE1, 0x77, 0xE1, 0x78, 0xE1, 0x79, /* 0xB0-0xB3 */
        0xF5, 0xB7, 0xF5, 0xB6, 0xE1, 0x7A, 0xE1, 0x7B, /* 0xB4-0xB7 */
        0xE1, 0x7C, 0xE1, 0x7D, 0xF5, 0xB8, 0xE1, 0x7E, /* 0xB8-0xBB */
        0xE1, 0x80, 0xE1, 0x81, 0xE1, 0x82, 0xE1, 0x83, /* 0xBC-0xBF */
        0xE1, 0x84, 0xE1, 0x85, 0xE1, 0x86, 0xE1, 0x87, /* 0xC0-0xC3 */
        0xE1, 0x88, 0xE1, 0x89, 0xE1, 0x8A, 0xB2, 0xC9, /* 0xC4-0xC7 */
        0xE1, 0x8B, 0xD3, 0xD4, 0xCA, 0xCD, 0xE1, 0x8C, /* 0xC8-0xCB */
        0xC0, 0xEF, 0xD6, 0xD8, 0xD2, 0xB0, 0xC1, 0xBF, /* 0xCC-0xCF */
        0xE1, 0x8D, 0xBD, 0xF0, 0xE1, 0x8E, 0xE1, 0x8F, /* 0xD0-0xD3 */
        0xE1, 0x90, 0xE1, 0x91, 0xE1, 0x92, 0xE1, 0x93, /* 0xD4-0xD7 */
        0xE1, 0x94, 0xE1, 0x95, 0xE1, 0x96, 0xE1, 0x97, /* 0xD8-0xDB */
        0xB8, 0xAA, 0xE1, 0x98, 0xE1, 0x99, 0xE1, 0x9A, /* 0xDC-0xDF */
        0xE1, 0x9B, 0xE1, 0x9C, 0xE1, 0x9D, 0xE1, 0x9E, /* 0xE0-0xE3 */
        0xE1, 0x9F, 0xE1, 0xA0, 0xE2, 0x40, 0xE2, 0x41, /* 0xE4-0xE7 */
        0xE2, 0x42, 0xE2, 0x43, 0xE2, 0x44, 0xE2, 0x45, /* 0xE8-0xEB */
        0xE2, 0x46, 0xE2, 0x47, 0xE2, 0x48, 0xE2, 0x49, /* 0xEC-0xEF */
        0xE2, 0x4A, 0xE2, 0x4B, 0xE2, 0x4C, 0xE2, 0x4D, /* 0xF0-0xF3 */
        0xE2, 0x4E, 0xE2, 0x4F, 0xE2, 0x50, 0xE2, 0x51, /* 0xF4-0xF7 */
        0xE2, 0x52, 0xE2, 0x53, 0xE2, 0x54, 0xE2, 0x55, /* 0xF8-0xFB */
        0xE2, 0x56, 0xE2, 0x57, 0xE2, 0x58, 0xE2, 0x59, /* 0xFC-0xFF */
};

static const unsigned char u2c_92[512] = {
        0xE2, 0x5A, 0xE2, 0x5B, 0xE2, 0x5C, 0xE2, 0x5D, /* 0x00-0x03 */
        0xE2, 0x5E, 0xE2, 0x5F, 0xE2, 0x60, 0xE2, 0x61, /* 0x04-0x07 */
        0xE2, 0x62, 0xE2, 0x63, 0xE2, 0x64, 0xE2, 0x65, /* 0x08-0x0B */
        0xE2, 0x66, 0xE2, 0x67, 0xE2, 0x68, 0xE2, 0x69, /* 0x0C-0x0F */
        0xE2, 0x6A, 0xE2, 0x6B, 0xE2, 0x6C, 0xE2, 0x6D, /* 0x10-0x13 */
        0xE2, 0x6E, 0xE2, 0x6F, 0xE2, 0x70, 0xE2, 0x71, /* 0x14-0x17 */
        0xE2, 0x72, 0xE2, 0x73, 0xE2, 0x74, 0xE2, 0x75, /* 0x18-0x1B */
        0xE2, 0x76, 0xE2, 0x77, 0xE2, 0x78, 0xE2, 0x79, /* 0x1C-0x1F */
        0xE2, 0x7A, 0xE2, 0x7B, 0xE2, 0x7C, 0xE2, 0x7D, /* 0x20-0x23 */
        0xE2, 0x7E, 0xE2, 0x80, 0xE2, 0x81, 0xE2, 0x82, /* 0x24-0x27 */
        0xE2, 0x83, 0xE2, 0x84, 0xE2, 0x85, 0xE2, 0x86, /* 0x28-0x2B */
        0xE2, 0x87, 0xE2, 0x88, 0xE2, 0x89, 0xE2, 0x8A, /* 0x2C-0x2F */
        0xE2, 0x8B, 0xE2, 0x8C, 0xE2, 0x8D, 0xE2, 0x8E, /* 0x30-0x33 */
        0xE2, 0x8F, 0xE2, 0x90, 0xE2, 0x91, 0xE2, 0x92, /* 0x34-0x37 */
        0xE2, 0x93, 0xE2, 0x94, 0xE2, 0x95, 0xE2, 0x96, /* 0x38-0x3B */
        0xE2, 0x97, 0xE2, 0x98, 0xE2, 0x99, 0xE2, 0x9A, /* 0x3C-0x3F */
        0xE2, 0x9B, 0xE2, 0x9C, 0xE2, 0x9D, 0xE2, 0x9E, /* 0x40-0x43 */
        0xE2, 0x9F, 0xE2, 0xA0, 0xE3, 0x40, 0xE3, 0x41, /* 0x44-0x47 */
        0xE3, 0x42, 0xE3, 0x43, 0xE3, 0x44, 0xE3, 0x45, /* 0x48-0x4B */
        0xE3, 0x46, 0xE3, 0x47, 0xE3, 0x48, 0xE3, 0x49, /* 0x4C-0x4F */
        0xE3, 0x4A, 0xE3, 0x4B, 0xE3, 0x4C, 0xE3, 0x4D, /* 0x50-0x53 */
        0xE3, 0x4E, 0xE3, 0x4F, 0xE3, 0x50, 0xE3, 0x51, /* 0x54-0x57 */
        0xE3, 0x52, 0xE3, 0x53, 0xE3, 0x54, 0xE3, 0x55, /* 0x58-0x5B */
        0xE3, 0x56, 0xE3, 0x57, 0xE3, 0x58, 0xE3, 0x59, /* 0x5C-0x5F */
        0xE3, 0x5A, 0xE3, 0x5B, 0xE3, 0x5C, 0xE3, 0x5D, /* 0x60-0x63 */
        0xE3, 0x5E, 0xE3, 0x5F, 0xE3, 0x60, 0xE3, 0x61, /* 0x64-0x67 */
        0xE3, 0x62, 0xE3, 0x63, 0xE3, 0x64, 0xE3, 0x65, /* 0x68-0x6B */
        0xE3, 0x66, 0xE3, 0x67, 0xE3, 0x68, 0xE3, 0x69, /* 0x6C-0x6F */
        0xE3, 0x6A, 0xE3, 0x6B, 0xE3, 0x6C, 0xE3, 0x6D, /* 0x70-0x73 */
        0xBC, 0xF8, 0xE3, 0x6E, 0xE3, 0x6F, 0xE3, 0x70, /* 0x74-0x77 */
        0xE3, 0x71, 0xE3, 0x72, 0xE3, 0x73, 0xE3, 0x74, /* 0x78-0x7B */
        0xE3, 0x75, 0xE3, 0x76, 0xE3, 0x77, 0xE3, 0x78, /* 0x7C-0x7F */
        
        0xE3, 0x79, 0xE3, 0x7A, 0xE3, 0x7B, 0xE3, 0x7C, /* 0x80-0x83 */
        0xE3, 0x7D, 0xE3, 0x7E, 0xE3, 0x80, 0xE3, 0x81, /* 0x84-0x87 */
        0xE3, 0x82, 0xE3, 0x83, 0xE3, 0x84, 0xE3, 0x85, /* 0x88-0x8B */
        0xE3, 0x86, 0xE3, 0x87, 0xF6, 0xC6, 0xE3, 0x88, /* 0x8C-0x8F */
        0xE3, 0x89, 0xE3, 0x8A, 0xE3, 0x8B, 0xE3, 0x8C, /* 0x90-0x93 */
        0xE3, 0x8D, 0xE3, 0x8E, 0xE3, 0x8F, 0xE3, 0x90, /* 0x94-0x97 */
        0xE3, 0x91, 0xE3, 0x92, 0xE3, 0x93, 0xE3, 0x94, /* 0x98-0x9B */
        0xE3, 0x95, 0xE3, 0x96, 0xE3, 0x97, 0xE3, 0x98, /* 0x9C-0x9F */
        0xE3, 0x99, 0xE3, 0x9A, 0xE3, 0x9B, 0xE3, 0x9C, /* 0xA0-0xA3 */
        0xE3, 0x9D, 0xE3, 0x9E, 0xE3, 0x9F, 0xE3, 0xA0, /* 0xA4-0xA7 */
        0xE4, 0x40, 0xE4, 0x41, 0xE4, 0x42, 0xE4, 0x43, /* 0xA8-0xAB */
        0xE4, 0x44, 0xE4, 0x45, 0xF6, 0xC7, 0xE4, 0x46, /* 0xAC-0xAF */
        0xE4, 0x47, 0xE4, 0x48, 0xE4, 0x49, 0xE4, 0x4A, /* 0xB0-0xB3 */
        0xE4, 0x4B, 0xE4, 0x4C, 0xE4, 0x4D, 0xE4, 0x4E, /* 0xB4-0xB7 */
        0xE4, 0x4F, 0xE4, 0x50, 0xE4, 0x51, 0xE4, 0x52, /* 0xB8-0xBB */
        0xE4, 0x53, 0xE4, 0x54, 0xE4, 0x55, 0xE4, 0x56, /* 0xBC-0xBF */
        0xE4, 0x57, 0xE4, 0x58, 0xE4, 0x59, 0xE4, 0x5A, /* 0xC0-0xC3 */
        0xE4, 0x5B, 0xE4, 0x5C, 0xE4, 0x5D, 0xE4, 0x5E, /* 0xC4-0xC7 */
        0xF6, 0xC8, 0xE4, 0x5F, 0xE4, 0x60, 0xE4, 0x61, /* 0xC8-0xCB */
        0xE4, 0x62, 0xE4, 0x63, 0xE4, 0x64, 0xE4, 0x65, /* 0xCC-0xCF */
        0xE4, 0x66, 0xE4, 0x67, 0xE4, 0x68, 0xE4, 0x69, /* 0xD0-0xD3 */
        0xE4, 0x6A, 0xE4, 0x6B, 0xE4, 0x6C, 0xE4, 0x6D, /* 0xD4-0xD7 */
        0xE4, 0x6E, 0xE4, 0x6F, 0xE4, 0x70, 0xE4, 0x71, /* 0xD8-0xDB */
        0xE4, 0x72, 0xE4, 0x73, 0xE4, 0x74, 0xE4, 0x75, /* 0xDC-0xDF */
        0xE4, 0x76, 0xE4, 0x77, 0xE4, 0x78, 0xE4, 0x79, /* 0xE0-0xE3 */
        0xE4, 0x7A, 0xE4, 0x7B, 0xE4, 0x7C, 0xE4, 0x7D, /* 0xE4-0xE7 */
        0xE4, 0x7E, 0xE4, 0x80, 0xE4, 0x81, 0xE4, 0x82, /* 0xE8-0xEB */
        0xE4, 0x83, 0xE4, 0x84, 0xE4, 0x85, 0xE4, 0x86, /* 0xEC-0xEF */
        0xE4, 0x87, 0xE4, 0x88, 0xE4, 0x89, 0xE4, 0x8A, /* 0xF0-0xF3 */
        0xE4, 0x8B, 0xE4, 0x8C, 0xE4, 0x8D, 0xE4, 0x8E, /* 0xF4-0xF7 */
        0xE4, 0x8F, 0xE4, 0x90, 0xE4, 0x91, 0xE4, 0x92, /* 0xF8-0xFB */
        0xE4, 0x93, 0xE4, 0x94, 0xE4, 0x95, 0xE4, 0x96, /* 0xFC-0xFF */
};

static const unsigned char u2c_93[512] = {
        0xE4, 0x97, 0xE4, 0x98, 0xE4, 0x99, 0xE4, 0x9A, /* 0x00-0x03 */
        0xE4, 0x9B, 0xE4, 0x9C, 0xE4, 0x9D, 0xE4, 0x9E, /* 0x04-0x07 */
        0xE4, 0x9F, 0xE4, 0xA0, 0xE5, 0x40, 0xE5, 0x41, /* 0x08-0x0B */
        0xE5, 0x42, 0xE5, 0x43, 0xE5, 0x44, 0xE5, 0x45, /* 0x0C-0x0F */
        0xE5, 0x46, 0xE5, 0x47, 0xE5, 0x48, 0xE5, 0x49, /* 0x10-0x13 */
        0xE5, 0x4A, 0xE5, 0x4B, 0xE5, 0x4C, 0xE5, 0x4D, /* 0x14-0x17 */
        0xE5, 0x4E, 0xE5, 0x4F, 0xE5, 0x50, 0xE5, 0x51, /* 0x18-0x1B */
        0xE5, 0x52, 0xE5, 0x53, 0xE5, 0x54, 0xE5, 0x55, /* 0x1C-0x1F */
        0xE5, 0x56, 0xE5, 0x57, 0xE5, 0x58, 0xE5, 0x59, /* 0x20-0x23 */
        0xE5, 0x5A, 0xE5, 0x5B, 0xE5, 0x5C, 0xE5, 0x5D, /* 0x24-0x27 */
        0xE5, 0x5E, 0xE5, 0x5F, 0xE5, 0x60, 0xE5, 0x61, /* 0x28-0x2B */
        0xE5, 0x62, 0xE5, 0x63, 0xE5, 0x64, 0xE5, 0x65, /* 0x2C-0x2F */
        0xE5, 0x66, 0xE5, 0x67, 0xE5, 0x68, 0xE5, 0x69, /* 0x30-0x33 */
        0xE5, 0x6A, 0xE5, 0x6B, 0xE5, 0x6C, 0xE5, 0x6D, /* 0x34-0x37 */
        0xE5, 0x6E, 0xE5, 0x6F, 0xE5, 0x70, 0xE5, 0x71, /* 0x38-0x3B */
        0xE5, 0x72, 0xE5, 0x73, 0xF6, 0xC9, 0xE5, 0x74, /* 0x3C-0x3F */
        0xE5, 0x75, 0xE5, 0x76, 0xE5, 0x77, 0xE5, 0x78, /* 0x40-0x43 */
        0xE5, 0x79, 0xE5, 0x7A, 0xE5, 0x7B, 0xE5, 0x7C, /* 0x44-0x47 */
        0xE5, 0x7D, 0xE5, 0x7E, 0xE5, 0x80, 0xE5, 0x81, /* 0x48-0x4B */
        0xE5, 0x82, 0xE5, 0x83, 0xE5, 0x84, 0xE5, 0x85, /* 0x4C-0x4F */
        0xE5, 0x86, 0xE5, 0x87, 0xE5, 0x88, 0xE5, 0x89, /* 0x50-0x53 */
        0xE5, 0x8A, 0xE5, 0x8B, 0xE5, 0x8C, 0xE5, 0x8D, /* 0x54-0x57 */
        0xE5, 0x8E, 0xE5, 0x8F, 0xE5, 0x90, 0xE5, 0x91, /* 0x58-0x5B */
        0xE5, 0x92, 0xE5, 0x93, 0xE5, 0x94, 0xE5, 0x95, /* 0x5C-0x5F */
        0xE5, 0x96, 0xE5, 0x97, 0xE5, 0x98, 0xE5, 0x99, /* 0x60-0x63 */
        0xE5, 0x9A, 0xE5, 0x9B, 0xE5, 0x9C, 0xE5, 0x9D, /* 0x64-0x67 */
        0xE5, 0x9E, 0xE5, 0x9F, 0xF6, 0xCA, 0xE5, 0xA0, /* 0x68-0x6B */
        0xE6, 0x40, 0xE6, 0x41, 0xE6, 0x42, 0xE6, 0x43, /* 0x6C-0x6F */
        0xE6, 0x44, 0xE6, 0x45, 0xE6, 0x46, 0xE6, 0x47, /* 0x70-0x73 */
        0xE6, 0x48, 0xE6, 0x49, 0xE6, 0x4A, 0xE6, 0x4B, /* 0x74-0x77 */
        0xE6, 0x4C, 0xE6, 0x4D, 0xE6, 0x4E, 0xE6, 0x4F, /* 0x78-0x7B */
        0xE6, 0x50, 0xE6, 0x51, 0xE6, 0x52, 0xE6, 0x53, /* 0x7C-0x7F */
        
        0xE6, 0x54, 0xE6, 0x55, 0xE6, 0x56, 0xE6, 0x57, /* 0x80-0x83 */
        0xE6, 0x58, 0xE6, 0x59, 0xE6, 0x5A, 0xE6, 0x5B, /* 0x84-0x87 */
        0xE6, 0x5C, 0xE6, 0x5D, 0xE6, 0x5E, 0xE6, 0x5F, /* 0x88-0x8B */
        0xE6, 0x60, 0xE6, 0x61, 0xE6, 0x62, 0xF6, 0xCC, /* 0x8C-0x8F */
        0xE6, 0x63, 0xE6, 0x64, 0xE6, 0x65, 0xE6, 0x66, /* 0x90-0x93 */
        0xE6, 0x67, 0xE6, 0x68, 0xE6, 0x69, 0xE6, 0x6A, /* 0x94-0x97 */
        0xE6, 0x6B, 0xE6, 0x6C, 0xE6, 0x6D, 0xE6, 0x6E, /* 0x98-0x9B */
        0xE6, 0x6F, 0xE6, 0x70, 0xE6, 0x71, 0xE6, 0x72, /* 0x9C-0x9F */
        0xE6, 0x73, 0xE6, 0x74, 0xE6, 0x75, 0xE6, 0x76, /* 0xA0-0xA3 */
        0xE6, 0x77, 0xE6, 0x78, 0xE6, 0x79, 0xE6, 0x7A, /* 0xA4-0xA7 */
        0xE6, 0x7B, 0xE6, 0x7C, 0xE6, 0x7D, 0xE6, 0x7E, /* 0xA8-0xAB */
        0xE6, 0x80, 0xE6, 0x81, 0xE6, 0x82, 0xE6, 0x83, /* 0xAC-0xAF */
        0xE6, 0x84, 0xE6, 0x85, 0xE6, 0x86, 0xE6, 0x87, /* 0xB0-0xB3 */
        0xE6, 0x88, 0xE6, 0x89, 0xE6, 0x8A, 0xE6, 0x8B, /* 0xB4-0xB7 */
        0xE6, 0x8C, 0xE6, 0x8D, 0xE6, 0x8E, 0xE6, 0x8F, /* 0xB8-0xBB */
        0xE6, 0x90, 0xE6, 0x91, 0xE6, 0x92, 0xE6, 0x93, /* 0xBC-0xBF */
        0xE6, 0x94, 0xE6, 0x95, 0xE6, 0x96, 0xE6, 0x97, /* 0xC0-0xC3 */
        0xE6, 0x98, 0xE6, 0x99, 0xE6, 0x9A, 0xE6, 0x9B, /* 0xC4-0xC7 */
        0xE6, 0x9C, 0xE6, 0x9D, 0xF6, 0xCB, 0xE6, 0x9E, /* 0xC8-0xCB */
        0xE6, 0x9F, 0xE6, 0xA0, 0xE7, 0x40, 0xE7, 0x41, /* 0xCC-0xCF */
        0xE7, 0x42, 0xE7, 0x43, 0xE7, 0x44, 0xE7, 0x45, /* 0xD0-0xD3 */
        0xE7, 0x46, 0xE7, 0x47, 0xF7, 0xE9, 0xE7, 0x48, /* 0xD4-0xD7 */
        0xE7, 0x49, 0xE7, 0x4A, 0xE7, 0x4B, 0xE7, 0x4C, /* 0xD8-0xDB */
        0xE7, 0x4D, 0xE7, 0x4E, 0xE7, 0x4F, 0xE7, 0x50, /* 0xDC-0xDF */
        0xE7, 0x51, 0xE7, 0x52, 0xE7, 0x53, 0xE7, 0x54, /* 0xE0-0xE3 */
        0xE7, 0x55, 0xE7, 0x56, 0xE7, 0x57, 0xE7, 0x58, /* 0xE4-0xE7 */
        0xE7, 0x59, 0xE7, 0x5A, 0xE7, 0x5B, 0xE7, 0x5C, /* 0xE8-0xEB */
        0xE7, 0x5D, 0xE7, 0x5E, 0xE7, 0x5F, 0xE7, 0x60, /* 0xEC-0xEF */
        0xE7, 0x61, 0xE7, 0x62, 0xE7, 0x63, 0xE7, 0x64, /* 0xF0-0xF3 */
        0xE7, 0x65, 0xE7, 0x66, 0xE7, 0x67, 0xE7, 0x68, /* 0xF4-0xF7 */
        0xE7, 0x69, 0xE7, 0x6A, 0xE7, 0x6B, 0xE7, 0x6C, /* 0xF8-0xFB */
        0xE7, 0x6D, 0xE7, 0x6E, 0xE7, 0x6F, 0xE7, 0x70, /* 0xFC-0xFF */
};

static const unsigned char u2c_94[512] = {
        0xE7, 0x71, 0xE7, 0x72, 0xE7, 0x73, 0xE7, 0x74, /* 0x00-0x03 */
        0xE7, 0x75, 0xE7, 0x76, 0xE7, 0x77, 0xE7, 0x78, /* 0x04-0x07 */
        0xE7, 0x79, 0xE7, 0x7A, 0xE7, 0x7B, 0xE7, 0x7C, /* 0x08-0x0B */
        0xE7, 0x7D, 0xE7, 0x7E, 0xE7, 0x80, 0xE7, 0x81, /* 0x0C-0x0F */
        0xE7, 0x82, 0xE7, 0x83, 0xE7, 0x84, 0xE7, 0x85, /* 0x10-0x13 */
        0xE7, 0x86, 0xE7, 0x87, 0xE7, 0x88, 0xE7, 0x89, /* 0x14-0x17 */
        0xE7, 0x8A, 0xE7, 0x8B, 0xE7, 0x8C, 0xE7, 0x8D, /* 0x18-0x1B */
        0xE7, 0x8E, 0xE7, 0x8F, 0xE7, 0x90, 0xE7, 0x91, /* 0x1C-0x1F */
        0xE7, 0x92, 0xE7, 0x93, 0xE7, 0x94, 0xE7, 0x95, /* 0x20-0x23 */
        0xE7, 0x96, 0xE7, 0x97, 0xE7, 0x98, 0xE7, 0x99, /* 0x24-0x27 */
        0xE7, 0x9A, 0xE7, 0x9B, 0xE7, 0x9C, 0xE7, 0x9D, /* 0x28-0x2B */
        0xE7, 0x9E, 0xE7, 0x9F, 0xE7, 0xA0, 0xE8, 0x40, /* 0x2C-0x2F */
        0xE8, 0x41, 0xE8, 0x42, 0xE8, 0x43, 0xE8, 0x44, /* 0x30-0x33 */
        0xE8, 0x45, 0xE8, 0x46, 0xE8, 0x47, 0xE8, 0x48, /* 0x34-0x37 */
        0xE8, 0x49, 0xE8, 0x4A, 0xE8, 0x4B, 0xE8, 0x4C, /* 0x38-0x3B */
        0xE8, 0x4D, 0xE8, 0x4E, 0xF6, 0xCD, 0xE8, 0x4F, /* 0x3C-0x3F */
        0xE8, 0x50, 0xE8, 0x51, 0xE8, 0x52, 0xE8, 0x53, /* 0x40-0x43 */
        0xE8, 0x54, 0xE8, 0x55, 0xE8, 0x56, 0xE8, 0x57, /* 0x44-0x47 */
        0xE8, 0x58, 0xE8, 0x59, 0xE8, 0x5A, 0xE8, 0x5B, /* 0x48-0x4B */
        0xE8, 0x5C, 0xE8, 0x5D, 0xE8, 0x5E, 0xE8, 0x5F, /* 0x4C-0x4F */
        0xE8, 0x60, 0xE8, 0x61, 0xE8, 0x62, 0xE8, 0x63, /* 0x50-0x53 */
        0xE8, 0x64, 0xE8, 0x65, 0xE8, 0x66, 0xE8, 0x67, /* 0x54-0x57 */
        0xE8, 0x68, 0xE8, 0x69, 0xE8, 0x6A, 0xE8, 0x6B, /* 0x58-0x5B */
        0xE8, 0x6C, 0xE8, 0x6D, 0xE8, 0x6E, 0xE8, 0x6F, /* 0x5C-0x5F */
        0xE8, 0x70, 0xE8, 0x71, 0xE8, 0x72, 0xE8, 0x73, /* 0x60-0x63 */
        0xE8, 0x74, 0xE8, 0x75, 0xE8, 0x76, 0xE8, 0x77, /* 0x64-0x67 */
        0xE8, 0x78, 0xE8, 0x79, 0xE8, 0x7A, 0xF6, 0xCE, /* 0x68-0x6B */
        0xE8, 0x7B, 0xE8, 0x7C, 0xE8, 0x7D, 0xE8, 0x7E, /* 0x6C-0x6F */
        0xE8, 0x80, 0xE8, 0x81, 0xE8, 0x82, 0xE8, 0x83, /* 0x70-0x73 */
        0xE8, 0x84, 0xE8, 0x85, 0xE8, 0x86, 0xE8, 0x87, /* 0x74-0x77 */
        0xE8, 0x88, 0xE8, 0x89, 0xE8, 0x8A, 0xE8, 0x8B, /* 0x78-0x7B */
        0xE8, 0x8C, 0xE8, 0x8D, 0xE8, 0x8E, 0xE8, 0x8F, /* 0x7C-0x7F */
        
        0xE8, 0x90, 0xE8, 0x91, 0xE8, 0x92, 0xE8, 0x93, /* 0x80-0x83 */
        0xE8, 0x94, 0xEE, 0xC4, 0xEE, 0xC5, 0xEE, 0xC6, /* 0x84-0x87 */
        0xD5, 0xEB, 0xB6, 0xA4, 0xEE, 0xC8, 0xEE, 0xC7, /* 0x88-0x8B */
        0xEE, 0xC9, 0xEE, 0xCA, 0xC7, 0xA5, 0xEE, 0xCB, /* 0x8C-0x8F */
        0xEE, 0xCC, 0xE8, 0x95, 0xB7, 0xB0, 0xB5, 0xF6, /* 0x90-0x93 */
        0xEE, 0xCD, 0xEE, 0xCF, 0xE8, 0x96, 0xEE, 0xCE, /* 0x94-0x97 */
        0xE8, 0x97, 0xB8, 0xC6, 0xEE, 0xD0, 0xEE, 0xD1, /* 0x98-0x9B */
        0xEE, 0xD2, 0xB6, 0xDB, 0xB3, 0xAE, 0xD6, 0xD3, /* 0x9C-0x9F */
        0xC4, 0xC6, 0xB1, 0xB5, 0xB8, 0xD6, 0xEE, 0xD3, /* 0xA0-0xA3 */
        0xEE, 0xD4, 0xD4, 0xBF, 0xC7, 0xD5, 0xBE, 0xFB, /* 0xA4-0xA7 */
        0xCE, 0xD9, 0xB9, 0xB3, 0xEE, 0xD6, 0xEE, 0xD5, /* 0xA8-0xAB */
        0xEE, 0xD8, 0xEE, 0xD7, 0xC5, 0xA5, 0xEE, 0xD9, /* 0xAC-0xAF */
        0xEE, 0xDA, 0xC7, 0xAE, 0xEE, 0xDB, 0xC7, 0xAF, /* 0xB0-0xB3 */
        0xEE, 0xDC, 0xB2, 0xA7, 0xEE, 0xDD, 0xEE, 0xDE, /* 0xB4-0xB7 */
        0xEE, 0xDF, 0xEE, 0xE0, 0xEE, 0xE1, 0xD7, 0xEA, /* 0xB8-0xBB */
        0xEE, 0xE2, 0xEE, 0xE3, 0xBC, 0xD8, 0xEE, 0xE4, /* 0xBC-0xBF */
        0xD3, 0xCB, 0xCC, 0xFA, 0xB2, 0xAC, 0xC1, 0xE5, /* 0xC0-0xC3 */
        0xEE, 0xE5, 0xC7, 0xA6, 0xC3, 0xAD, 0xE8, 0x98, /* 0xC4-0xC7 */
        0xEE, 0xE6, 0xEE, 0xE7, 0xEE, 0xE8, 0xEE, 0xE9, /* 0xC8-0xCB */
        0xEE, 0xEA, 0xEE, 0xEB, 0xEE, 0xEC, 0xE8, 0x99, /* 0xCC-0xCF */
        0xEE, 0xED, 0xEE, 0xEE, 0xEE, 0xEF, 0xE8, 0x9A, /* 0xD0-0xD3 */
        0xE8, 0x9B, 0xEE, 0xF0, 0xEE, 0xF1, 0xEE, 0xF2, /* 0xD4-0xD7 */
        0xEE, 0xF4, 0xEE, 0xF3, 0xE8, 0x9C, 0xEE, 0xF5, /* 0xD8-0xDB */
        0xCD, 0xAD, 0xC2, 0xC1, 0xEE, 0xF6, 0xEE, 0xF7, /* 0xDC-0xDF */
        0xEE, 0xF8, 0xD5, 0xA1, 0xEE, 0xF9, 0xCF, 0xB3, /* 0xE0-0xE3 */
        0xEE, 0xFA, 0xEE, 0xFB, 0xE8, 0x9D, 0xEE, 0xFC, /* 0xE4-0xE7 */
        0xEE, 0xFD, 0xEF, 0xA1, 0xEE, 0xFE, 0xEF, 0xA2, /* 0xE8-0xEB */
        0xB8, 0xF5, 0xC3, 0xFA, 0xEF, 0xA3, 0xEF, 0xA4, /* 0xEC-0xEF */
        0xBD, 0xC2, 0xD2, 0xBF, 0xB2, 0xF9, 0xEF, 0xA5, /* 0xF0-0xF3 */
        0xEF, 0xA6, 0xEF, 0xA7, 0xD2, 0xF8, 0xEF, 0xA8, /* 0xF4-0xF7 */
        0xD6, 0xFD, 0xEF, 0xA9, 0xC6, 0xCC, 0xE8, 0x9E, /* 0xF8-0xFB */
        0xEF, 0xAA, 0xEF, 0xAB, 0xC1, 0xB4, 0xEF, 0xAC, /* 0xFC-0xFF */
};

static const unsigned char u2c_95[512] = {
        0xCF, 0xFA, 0xCB, 0xF8, 0xEF, 0xAE, 0xEF, 0xAD, /* 0x00-0x03 */
        0xB3, 0xFA, 0xB9, 0xF8, 0xEF, 0xAF, 0xEF, 0xB0, /* 0x04-0x07 */
        0xD0, 0xE2, 0xEF, 0xB1, 0xEF, 0xB2, 0xB7, 0xE6, /* 0x08-0x0B */
        0xD0, 0xBF, 0xEF, 0xB3, 0xEF, 0xB4, 0xEF, 0xB5, /* 0x0C-0x0F */
        0xC8, 0xF1, 0xCC, 0xE0, 0xEF, 0xB6, 0xEF, 0xB7, /* 0x10-0x13 */
        0xEF, 0xB8, 0xEF, 0xB9, 0xEF, 0xBA, 0xD5, 0xE0, /* 0x14-0x17 */
        0xEF, 0xBB, 0xB4, 0xED, 0xC3, 0xAA, 0xEF, 0xBC, /* 0x18-0x1B */
        0xE8, 0x9F, 0xEF, 0xBD, 0xEF, 0xBE, 0xEF, 0xBF, /* 0x1C-0x1F */
        0xE8, 0xA0, 0xCE, 0xFD, 0xEF, 0xC0, 0xC2, 0xE0, /* 0x20-0x23 */
        0xB4, 0xB8, 0xD7, 0xB6, 0xBD, 0xF5, 0xE9, 0x40, /* 0x24-0x27 */
        0xCF, 0xC7, 0xEF, 0xC3, 0xEF, 0xC1, 0xEF, 0xC2, /* 0x28-0x2B */
        0xEF, 0xC4, 0xB6, 0xA7, 0xBC, 0xFC, 0xBE, 0xE2, /* 0x2C-0x2F */
        0xC3, 0xCC, 0xEF, 0xC5, 0xEF, 0xC6, 0xE9, 0x41, /* 0x30-0x33 */
        0xEF, 0xC7, 0xEF, 0xCF, 0xEF, 0xC8, 0xEF, 0xC9, /* 0x34-0x37 */
        0xEF, 0xCA, 0xC7, 0xC2, 0xEF, 0xF1, 0xB6, 0xCD, /* 0x38-0x3B */
        0xEF, 0xCB, 0xE9, 0x42, 0xEF, 0xCC, 0xEF, 0xCD, /* 0x3C-0x3F */
        0xB6, 0xC6, 0xC3, 0xBE, 0xEF, 0xCE, 0xE9, 0x43, /* 0x40-0x43 */
        0xEF, 0xD0, 0xEF, 0xD1, 0xEF, 0xD2, 0xD5, 0xF2, /* 0x44-0x47 */
        0xE9, 0x44, 0xEF, 0xD3, 0xC4, 0xF7, 0xE9, 0x45, /* 0x48-0x4B */
        0xEF, 0xD4, 0xC4, 0xF8, 0xEF, 0xD5, 0xEF, 0xD6, /* 0x4C-0x4F */
        0xB8, 0xE4, 0xB0, 0xF7, 0xEF, 0xD7, 0xEF, 0xD8, /* 0x50-0x53 */
        0xEF, 0xD9, 0xE9, 0x46, 0xEF, 0xDA, 0xEF, 0xDB, /* 0x54-0x57 */
        0xEF, 0xDC, 0xEF, 0xDD, 0xE9, 0x47, 0xEF, 0xDE, /* 0x58-0x5B */
        0xBE, 0xB5, 0xEF, 0xE1, 0xEF, 0xDF, 0xEF, 0xE0, /* 0x5C-0x5F */
        0xE9, 0x48, 0xEF, 0xE2, 0xEF, 0xE3, 0xC1, 0xCD, /* 0x60-0x63 */
        0xEF, 0xE4, 0xEF, 0xE5, 0xEF, 0xE6, 0xEF, 0xE7, /* 0x64-0x67 */
        0xEF, 0xE8, 0xEF, 0xE9, 0xEF, 0xEA, 0xEF, 0xEB, /* 0x68-0x6B */
        0xEF, 0xEC, 0xC0, 0xD8, 0xE9, 0x49, 0xEF, 0xED, /* 0x6C-0x6F */
        0xC1, 0xAD, 0xEF, 0xEE, 0xEF, 0xEF, 0xEF, 0xF0, /* 0x70-0x73 */
        0xE9, 0x4A, 0xE9, 0x4B, 0xCF, 0xE2, 0xE9, 0x4C, /* 0x74-0x77 */
        0xE9, 0x4D, 0xE9, 0x4E, 0xE9, 0x4F, 0xE9, 0x50, /* 0x78-0x7B */
        0xE9, 0x51, 0xE9, 0x52, 0xE9, 0x53, 0xB3, 0xA4, /* 0x7C-0x7F */
        
        0xE9, 0x54, 0xE9, 0x55, 0xE9, 0x56, 0xE9, 0x57, /* 0x80-0x83 */
        0xE9, 0x58, 0xE9, 0x59, 0xE9, 0x5A, 0xE9, 0x5B, /* 0x84-0x87 */
        0xE9, 0x5C, 0xE9, 0x5D, 0xE9, 0x5E, 0xE9, 0x5F, /* 0x88-0x8B */
        0xE9, 0x60, 0xE9, 0x61, 0xE9, 0x62, 0xE9, 0x63, /* 0x8C-0x8F */
        0xE9, 0x64, 0xE9, 0x65, 0xE9, 0x66, 0xE9, 0x67, /* 0x90-0x93 */
        0xE9, 0x68, 0xE9, 0x69, 0xE9, 0x6A, 0xE9, 0x6B, /* 0x94-0x97 */
        0xE9, 0x6C, 0xE9, 0x6D, 0xE9, 0x6E, 0xE9, 0x6F, /* 0x98-0x9B */
        0xE9, 0x70, 0xE9, 0x71, 0xE9, 0x72, 0xE9, 0x73, /* 0x9C-0x9F */
        0xE9, 0x74, 0xE9, 0x75, 0xE9, 0x76, 0xE9, 0x77, /* 0xA0-0xA3 */
        0xE9, 0x78, 0xE9, 0x79, 0xE9, 0x7A, 0xE9, 0x7B, /* 0xA4-0xA7 */
        0xE9, 0x7C, 0xE9, 0x7D, 0xE9, 0x7E, 0xE9, 0x80, /* 0xA8-0xAB */
        0xE9, 0x81, 0xE9, 0x82, 0xE9, 0x83, 0xE9, 0x84, /* 0xAC-0xAF */
        0xE9, 0x85, 0xE9, 0x86, 0xE9, 0x87, 0xE9, 0x88, /* 0xB0-0xB3 */
        0xE9, 0x89, 0xE9, 0x8A, 0xE9, 0x8B, 0xE9, 0x8C, /* 0xB4-0xB7 */
        0xE9, 0x8D, 0xE9, 0x8E, 0xE9, 0x8F, 0xE9, 0x90, /* 0xB8-0xBB */
        0xE9, 0x91, 0xE9, 0x92, 0xE9, 0x93, 0xE9, 0x94, /* 0xBC-0xBF */
        0xE9, 0x95, 0xE9, 0x96, 0xE9, 0x97, 0xE9, 0x98, /* 0xC0-0xC3 */
        0xE9, 0x99, 0xE9, 0x9A, 0xE9, 0x9B, 0xE9, 0x9C, /* 0xC4-0xC7 */
        0xE9, 0x9D, 0xE9, 0x9E, 0xE9, 0x9F, 0xE9, 0xA0, /* 0xC8-0xCB */
        0xEA, 0x40, 0xEA, 0x41, 0xEA, 0x42, 0xEA, 0x43, /* 0xCC-0xCF */
        0xEA, 0x44, 0xEA, 0x45, 0xEA, 0x46, 0xEA, 0x47, /* 0xD0-0xD3 */
        0xEA, 0x48, 0xEA, 0x49, 0xEA, 0x4A, 0xEA, 0x4B, /* 0xD4-0xD7 */
        0xEA, 0x4C, 0xEA, 0x4D, 0xEA, 0x4E, 0xEA, 0x4F, /* 0xD8-0xDB */
        0xEA, 0x50, 0xEA, 0x51, 0xEA, 0x52, 0xEA, 0x53, /* 0xDC-0xDF */
        0xEA, 0x54, 0xEA, 0x55, 0xEA, 0x56, 0xEA, 0x57, /* 0xE0-0xE3 */
        0xEA, 0x58, 0xEA, 0x59, 0xEA, 0x5A, 0xEA, 0x5B, /* 0xE4-0xE7 */
        0xC3, 0xC5, 0xE3, 0xC5, 0xC9, 0xC1, 0xE3, 0xC6, /* 0xE8-0xEB */
        0xEA, 0x5C, 0xB1, 0xD5, 0xCE, 0xCA, 0xB4, 0xB3, /* 0xEC-0xEF */
        0xC8, 0xF2, 0xE3, 0xC7, 0xCF, 0xD0, 0xE3, 0xC8, /* 0xF0-0xF3 */
        0xBC, 0xE4, 0xE3, 0xC9, 0xE3, 0xCA, 0xC3, 0xC6, /* 0xF4-0xF7 */
        0xD5, 0xA2, 0xC4, 0xD6, 0xB9, 0xEB, 0xCE, 0xC5, /* 0xF8-0xFB */
        0xE3, 0xCB, 0xC3, 0xF6, 0xE3, 0xCC, 0xEA, 0x5D, /* 0xFC-0xFF */
};

static const unsigned char u2c_96[512] = {
        0xB7, 0xA7, 0xB8, 0xF3, 0xBA, 0xD2, 0xE3, 0xCD, /* 0x00-0x03 */
        0xE3, 0xCE, 0xD4, 0xC4, 0xE3, 0xCF, 0xEA, 0x5E, /* 0x04-0x07 */
        0xE3, 0xD0, 0xD1, 0xCB, 0xE3, 0xD1, 0xE3, 0xD2, /* 0x08-0x0B */
        0xE3, 0xD3, 0xE3, 0xD4, 0xD1, 0xD6, 0xE3, 0xD5, /* 0x0C-0x0F */
        0xB2, 0xFB, 0xC0, 0xBB, 0xE3, 0xD6, 0xEA, 0x5F, /* 0x10-0x13 */
        0xC0, 0xAB, 0xE3, 0xD7, 0xE3, 0xD8, 0xE3, 0xD9, /* 0x14-0x17 */
        0xEA, 0x60, 0xE3, 0xDA, 0xE3, 0xDB, 0xEA, 0x61, /* 0x18-0x1B */
        0xB8, 0xB7, 0xDA, 0xE2, 0xEA, 0x62, 0xB6, 0xD3, /* 0x1C-0x1F */
        0xEA, 0x63, 0xDA, 0xE4, 0xDA, 0xE3, 0xEA, 0x64, /* 0x20-0x23 */
        0xEA, 0x65, 0xEA, 0x66, 0xEA, 0x67, 0xEA, 0x68, /* 0x24-0x27 */
        0xEA, 0x69, 0xEA, 0x6A, 0xDA, 0xE6, 0xEA, 0x6B, /* 0x28-0x2B */
        0xEA, 0x6C, 0xEA, 0x6D, 0xC8, 0xEE, 0xEA, 0x6E, /* 0x2C-0x2F */
        0xEA, 0x6F, 0xDA, 0xE5, 0xB7, 0xC0, 0xD1, 0xF4, /* 0x30-0x33 */
        0xD2, 0xF5, 0xD5, 0xF3, 0xBD, 0xD7, 0xEA, 0x70, /* 0x34-0x37 */
        0xEA, 0x71, 0xEA, 0x72, 0xEA, 0x73, 0xD7, 0xE8, /* 0x38-0x3B */
        0xDA, 0xE8, 0xDA, 0xE7, 0xEA, 0x74, 0xB0, 0xA2, /* 0x3C-0x3F */
        0xCD, 0xD3, 0xEA, 0x75, 0xDA, 0xE9, 0xEA, 0x76, /* 0x40-0x43 */
        0xB8, 0xBD, 0xBC, 0xCA, 0xC2, 0xBD, 0xC2, 0xA4, /* 0x44-0x47 */
        0xB3, 0xC2, 0xDA, 0xEA, 0xEA, 0x77, 0xC2, 0xAA, /* 0x48-0x4B */
        0xC4, 0xB0, 0xBD, 0xB5, 0xEA, 0x78, 0xEA, 0x79, /* 0x4C-0x4F */
        0xCF, 0xDE, 0xEA, 0x7A, 0xEA, 0x7B, 0xEA, 0x7C, /* 0x50-0x53 */
        0xDA, 0xEB, 0xC9, 0xC2, 0xEA, 0x7D, 0xEA, 0x7E, /* 0x54-0x57 */
        0xEA, 0x80, 0xEA, 0x81, 0xEA, 0x82, 0xB1, 0xDD, /* 0x58-0x5B */
        0xEA, 0x83, 0xEA, 0x84, 0xEA, 0x85, 0xDA, 0xEC, /* 0x5C-0x5F */
        0xEA, 0x86, 0xB6, 0xB8, 0xD4, 0xBA, 0xEA, 0x87, /* 0x60-0x63 */
        0xB3, 0xFD, 0xEA, 0x88, 0xEA, 0x89, 0xDA, 0xED, /* 0x64-0x67 */
        0xD4, 0xC9, 0xCF, 0xD5, 0xC5, 0xE3, 0xEA, 0x8A, /* 0x68-0x6B */
        0xDA, 0xEE, 0xEA, 0x8B, 0xEA, 0x8C, 0xEA, 0x8D, /* 0x6C-0x6F */
        0xEA, 0x8E, 0xEA, 0x8F, 0xDA, 0xEF, 0xEA, 0x90, /* 0x70-0x73 */
        0xDA, 0xF0, 0xC1, 0xEA, 0xCC, 0xD5, 0xCF, 0xDD, /* 0x74-0x77 */
        0xEA, 0x91, 0xEA, 0x92, 0xEA, 0x93, 0xEA, 0x94, /* 0x78-0x7B */
        0xEA, 0x95, 0xEA, 0x96, 0xEA, 0x97, 0xEA, 0x98, /* 0x7C-0x7F */
        
        0xEA, 0x99, 0xEA, 0x9A, 0xEA, 0x9B, 0xEA, 0x9C, /* 0x80-0x83 */
        0xEA, 0x9D, 0xD3, 0xE7, 0xC2, 0xA1, 0xEA, 0x9E, /* 0x84-0x87 */
        0xDA, 0xF1, 0xEA, 0x9F, 0xEA, 0xA0, 0xCB, 0xE5, /* 0x88-0x8B */
        0xEB, 0x40, 0xDA, 0xF2, 0xEB, 0x41, 0xCB, 0xE6, /* 0x8C-0x8F */
        0xD2, 0xFE, 0xEB, 0x42, 0xEB, 0x43, 0xEB, 0x44, /* 0x90-0x93 */
        0xB8, 0xF4, 0xEB, 0x45, 0xEB, 0x46, 0xDA, 0xF3, /* 0x94-0x97 */
        0xB0, 0xAF, 0xCF, 0xB6, 0xEB, 0x47, 0xEB, 0x48, /* 0x98-0x9B */
        0xD5, 0xCF, 0xEB, 0x49, 0xEB, 0x4A, 0xEB, 0x4B, /* 0x9C-0x9F */
        0xEB, 0x4C, 0xEB, 0x4D, 0xEB, 0x4E, 0xEB, 0x4F, /* 0xA0-0xA3 */
        0xEB, 0x50, 0xEB, 0x51, 0xEB, 0x52, 0xCB, 0xED, /* 0xA4-0xA7 */
        0xEB, 0x53, 0xEB, 0x54, 0xEB, 0x55, 0xEB, 0x56, /* 0xA8-0xAB */
        0xEB, 0x57, 0xEB, 0x58, 0xEB, 0x59, 0xEB, 0x5A, /* 0xAC-0xAF */
        0xDA, 0xF4, 0xEB, 0x5B, 0xEB, 0x5C, 0xE3, 0xC4, /* 0xB0-0xB3 */
        0xEB, 0x5D, 0xEB, 0x5E, 0xC1, 0xA5, 0xEB, 0x5F, /* 0xB4-0xB7 */
        0xEB, 0x60, 0xF6, 0xBF, 0xEB, 0x61, 0xEB, 0x62, /* 0xB8-0xBB */
        0xF6, 0xC0, 0xF6, 0xC1, 0xC4, 0xD1, 0xEB, 0x63, /* 0xBC-0xBF */
        0xC8, 0xB8, 0xD1, 0xE3, 0xEB, 0x64, 0xEB, 0x65, /* 0xC0-0xC3 */
        0xD0, 0xDB, 0xD1, 0xC5, 0xBC, 0xAF, 0xB9, 0xCD, /* 0xC4-0xC7 */
        0xEB, 0x66, 0xEF, 0xF4, 0xEB, 0x67, 0xEB, 0x68, /* 0xC8-0xCB */
        0xB4, 0xC6, 0xD3, 0xBA, 0xF6, 0xC2, 0xB3, 0xFB, /* 0xCC-0xCF */
        0xEB, 0x69, 0xEB, 0x6A, 0xF6, 0xC3, 0xEB, 0x6B, /* 0xD0-0xD3 */
        0xEB, 0x6C, 0xB5, 0xF1, 0xEB, 0x6D, 0xEB, 0x6E, /* 0xD4-0xD7 */
        0xEB, 0x6F, 0xEB, 0x70, 0xEB, 0x71, 0xEB, 0x72, /* 0xD8-0xDB */
        0xEB, 0x73, 0xEB, 0x74, 0xEB, 0x75, 0xEB, 0x76, /* 0xDC-0xDF */
        0xF6, 0xC5, 0xEB, 0x77, 0xEB, 0x78, 0xEB, 0x79, /* 0xE0-0xE3 */
        0xEB, 0x7A, 0xEB, 0x7B, 0xEB, 0x7C, 0xEB, 0x7D, /* 0xE4-0xE7 */
        0xD3, 0xEA, 0xF6, 0xA7, 0xD1, 0xA9, 0xEB, 0x7E, /* 0xE8-0xEB */
        0xEB, 0x80, 0xEB, 0x81, 0xEB, 0x82, 0xF6, 0xA9, /* 0xEC-0xEF */
        0xEB, 0x83, 0xEB, 0x84, 0xEB, 0x85, 0xF6, 0xA8, /* 0xF0-0xF3 */
        0xEB, 0x86, 0xEB, 0x87, 0xC1, 0xE3, 0xC0, 0xD7, /* 0xF4-0xF7 */
        0xEB, 0x88, 0xB1, 0xA2, 0xEB, 0x89, 0xEB, 0x8A, /* 0xF8-0xFB */
        0xEB, 0x8B, 0xEB, 0x8C, 0xCE, 0xED, 0xEB, 0x8D, /* 0xFC-0xFF */
};

static const unsigned char u2c_97[512] = {
        0xD0, 0xE8, 0xF6, 0xAB, 0xEB, 0x8E, 0xEB, 0x8F, /* 0x00-0x03 */
        0xCF, 0xF6, 0xEB, 0x90, 0xF6, 0xAA, 0xD5, 0xF0, /* 0x04-0x07 */
        0xF6, 0xAC, 0xC3, 0xB9, 0xEB, 0x91, 0xEB, 0x92, /* 0x08-0x0B */
        0xEB, 0x93, 0xBB, 0xF4, 0xF6, 0xAE, 0xF6, 0xAD, /* 0x0C-0x0F */
        0xEB, 0x94, 0xEB, 0x95, 0xEB, 0x96, 0xC4, 0xDE, /* 0x10-0x13 */
        0xEB, 0x97, 0xEB, 0x98, 0xC1, 0xD8, 0xEB, 0x99, /* 0x14-0x17 */
        0xEB, 0x9A, 0xEB, 0x9B, 0xEB, 0x9C, 0xEB, 0x9D, /* 0x18-0x1B */
        0xCB, 0xAA, 0xEB, 0x9E, 0xCF, 0xBC, 0xEB, 0x9F, /* 0x1C-0x1F */
        0xEB, 0xA0, 0xEC, 0x40, 0xEC, 0x41, 0xEC, 0x42, /* 0x20-0x23 */
        0xEC, 0x43, 0xEC, 0x44, 0xEC, 0x45, 0xEC, 0x46, /* 0x24-0x27 */
        0xEC, 0x47, 0xEC, 0x48, 0xF6, 0xAF, 0xEC, 0x49, /* 0x28-0x2B */
        0xEC, 0x4A, 0xF6, 0xB0, 0xEC, 0x4B, 0xEC, 0x4C, /* 0x2C-0x2F */
        0xF6, 0xB1, 0xEC, 0x4D, 0xC2, 0xB6, 0xEC, 0x4E, /* 0x30-0x33 */
        0xEC, 0x4F, 0xEC, 0x50, 0xEC, 0x51, 0xEC, 0x52, /* 0x34-0x37 */
        0xB0, 0xD4, 0xC5, 0xF9, 0xEC, 0x53, 0xEC, 0x54, /* 0x38-0x3B */
        0xEC, 0x55, 0xEC, 0x56, 0xF6, 0xB2, 0xEC, 0x57, /* 0x3C-0x3F */
        0xEC, 0x58, 0xEC, 0x59, 0xEC, 0x5A, 0xEC, 0x5B, /* 0x40-0x43 */
        0xEC, 0x5C, 0xEC, 0x5D, 0xEC, 0x5E, 0xEC, 0x5F, /* 0x44-0x47 */
        0xEC, 0x60, 0xEC, 0x61, 0xEC, 0x62, 0xEC, 0x63, /* 0x48-0x4B */
        0xEC, 0x64, 0xEC, 0x65, 0xEC, 0x66, 0xEC, 0x67, /* 0x4C-0x4F */
        0xEC, 0x68, 0xEC, 0x69, 0xC7, 0xE0, 0xF6, 0xA6, /* 0x50-0x53 */
        0xEC, 0x6A, 0xEC, 0x6B, 0xBE, 0xB8, 0xEC, 0x6C, /* 0x54-0x57 */
        0xEC, 0x6D, 0xBE, 0xB2, 0xEC, 0x6E, 0xB5, 0xE5, /* 0x58-0x5B */
        0xEC, 0x6F, 0xEC, 0x70, 0xB7, 0xC7, 0xEC, 0x71, /* 0x5C-0x5F */
        0xBF, 0xBF, 0xC3, 0xD2, 0xC3, 0xE6, 0xEC, 0x72, /* 0x60-0x63 */
        0xEC, 0x73, 0xD8, 0xCC, 0xEC, 0x74, 0xEC, 0x75, /* 0x64-0x67 */
        0xEC, 0x76, 0xB8, 0xEF, 0xEC, 0x77, 0xEC, 0x78, /* 0x68-0x6B */
        0xEC, 0x79, 0xEC, 0x7A, 0xEC, 0x7B, 0xEC, 0x7C, /* 0x6C-0x6F */
        0xEC, 0x7D, 0xEC, 0x7E, 0xEC, 0x80, 0xBD, 0xF9, /* 0x70-0x73 */
        0xD1, 0xA5, 0xEC, 0x81, 0xB0, 0xD0, 0xEC, 0x82, /* 0x74-0x77 */
        0xEC, 0x83, 0xEC, 0x84, 0xEC, 0x85, 0xEC, 0x86, /* 0x78-0x7B */
        0xF7, 0xB0, 0xEC, 0x87, 0xEC, 0x88, 0xEC, 0x89, /* 0x7C-0x7F */
        
        0xEC, 0x8A, 0xEC, 0x8B, 0xEC, 0x8C, 0xEC, 0x8D, /* 0x80-0x83 */
        0xEC, 0x8E, 0xF7, 0xB1, 0xEC, 0x8F, 0xEC, 0x90, /* 0x84-0x87 */
        0xEC, 0x91, 0xEC, 0x92, 0xEC, 0x93, 0xD0, 0xAC, /* 0x88-0x8B */
        0xEC, 0x94, 0xB0, 0xB0, 0xEC, 0x95, 0xEC, 0x96, /* 0x8C-0x8F */
        0xEC, 0x97, 0xF7, 0xB2, 0xF7, 0xB3, 0xEC, 0x98, /* 0x90-0x93 */
        0xF7, 0xB4, 0xEC, 0x99, 0xEC, 0x9A, 0xEC, 0x9B, /* 0x94-0x97 */
        0xC7, 0xCA, 0xEC, 0x9C, 0xEC, 0x9D, 0xEC, 0x9E, /* 0x98-0x9B */
        0xEC, 0x9F, 0xEC, 0xA0, 0xED, 0x40, 0xED, 0x41, /* 0x9C-0x9F */
        0xBE, 0xCF, 0xED, 0x42, 0xED, 0x43, 0xF7, 0xB7, /* 0xA0-0xA3 */
        0xED, 0x44, 0xED, 0x45, 0xED, 0x46, 0xED, 0x47, /* 0xA4-0xA7 */
        0xED, 0x48, 0xED, 0x49, 0xED, 0x4A, 0xF7, 0xB6, /* 0xA8-0xAB */
        0xED, 0x4B, 0xB1, 0xDE, 0xED, 0x4C, 0xF7, 0xB5, /* 0xAC-0xAF */
        0xED, 0x4D, 0xED, 0x4E, 0xF7, 0xB8, 0xED, 0x4F, /* 0xB0-0xB3 */
        0xF7, 0xB9, 0xED, 0x50, 0xED, 0x51, 0xED, 0x52, /* 0xB4-0xB7 */
        0xED, 0x53, 0xED, 0x54, 0xED, 0x55, 0xED, 0x56, /* 0xB8-0xBB */
        0xED, 0x57, 0xED, 0x58, 0xED, 0x59, 0xED, 0x5A, /* 0xBC-0xBF */
        0xED, 0x5B, 0xED, 0x5C, 0xED, 0x5D, 0xED, 0x5E, /* 0xC0-0xC3 */
        0xED, 0x5F, 0xED, 0x60, 0xED, 0x61, 0xED, 0x62, /* 0xC4-0xC7 */
        0xED, 0x63, 0xED, 0x64, 0xED, 0x65, 0xED, 0x66, /* 0xC8-0xCB */
        0xED, 0x67, 0xED, 0x68, 0xED, 0x69, 0xED, 0x6A, /* 0xCC-0xCF */
        0xED, 0x6B, 0xED, 0x6C, 0xED, 0x6D, 0xED, 0x6E, /* 0xD0-0xD3 */
        0xED, 0x6F, 0xED, 0x70, 0xED, 0x71, 0xED, 0x72, /* 0xD4-0xD7 */
        0xED, 0x73, 0xED, 0x74, 0xED, 0x75, 0xED, 0x76, /* 0xD8-0xDB */
        0xED, 0x77, 0xED, 0x78, 0xED, 0x79, 0xED, 0x7A, /* 0xDC-0xDF */
        0xED, 0x7B, 0xED, 0x7C, 0xED, 0x7D, 0xED, 0x7E, /* 0xE0-0xE3 */
        0xED, 0x80, 0xED, 0x81, 0xCE, 0xA4, 0xC8, 0xCD, /* 0xE4-0xE7 */
        0xED, 0x82, 0xBA, 0xAB, 0xE8, 0xB8, 0xE8, 0xB9, /* 0xE8-0xEB */
        0xE8, 0xBA, 0xBE, 0xC2, 0xED, 0x83, 0xED, 0x84, /* 0xEC-0xEF */
        0xED, 0x85, 0xED, 0x86, 0xED, 0x87, 0xD2, 0xF4, /* 0xF0-0xF3 */
        0xED, 0x88, 0xD4, 0xCF, 0xC9, 0xD8, 0xED, 0x89, /* 0xF4-0xF7 */
        0xED, 0x8A, 0xED, 0x8B, 0xED, 0x8C, 0xED, 0x8D, /* 0xF8-0xFB */
        0xED, 0x8E, 0xED, 0x8F, 0xED, 0x90, 0xED, 0x91, /* 0xFC-0xFF */
};

static const unsigned char u2c_98[512] = {
        0xED, 0x92, 0xED, 0x93, 0xED, 0x94, 0xED, 0x95, /* 0x00-0x03 */
        0xED, 0x96, 0xED, 0x97, 0xED, 0x98, 0xED, 0x99, /* 0x04-0x07 */
        0xED, 0x9A, 0xED, 0x9B, 0xED, 0x9C, 0xED, 0x9D, /* 0x08-0x0B */
        0xED, 0x9E, 0xED, 0x9F, 0xED, 0xA0, 0xEE, 0x40, /* 0x0C-0x0F */
        0xEE, 0x41, 0xEE, 0x42, 0xEE, 0x43, 0xEE, 0x44, /* 0x10-0x13 */
        0xEE, 0x45, 0xEE, 0x46, 0xEE, 0x47, 0xEE, 0x48, /* 0x14-0x17 */
        0xEE, 0x49, 0xEE, 0x4A, 0xEE, 0x4B, 0xEE, 0x4C, /* 0x18-0x1B */
        0xEE, 0x4D, 0xEE, 0x4E, 0xEE, 0x4F, 0xEE, 0x50, /* 0x1C-0x1F */
        0xEE, 0x51, 0xEE, 0x52, 0xEE, 0x53, 0xEE, 0x54, /* 0x20-0x23 */
        0xEE, 0x55, 0xEE, 0x56, 0xEE, 0x57, 0xEE, 0x58, /* 0x24-0x27 */
        0xEE, 0x59, 0xEE, 0x5A, 0xEE, 0x5B, 0xEE, 0x5C, /* 0x28-0x2B */
        0xEE, 0x5D, 0xEE, 0x5E, 0xEE, 0x5F, 0xEE, 0x60, /* 0x2C-0x2F */
        0xEE, 0x61, 0xEE, 0x62, 0xEE, 0x63, 0xEE, 0x64, /* 0x30-0x33 */
        0xEE, 0x65, 0xEE, 0x66, 0xEE, 0x67, 0xEE, 0x68, /* 0x34-0x37 */
        0xEE, 0x69, 0xEE, 0x6A, 0xEE, 0x6B, 0xEE, 0x6C, /* 0x38-0x3B */
        0xEE, 0x6D, 0xEE, 0x6E, 0xEE, 0x6F, 0xEE, 0x70, /* 0x3C-0x3F */
        0xEE, 0x71, 0xEE, 0x72, 0xEE, 0x73, 0xEE, 0x74, /* 0x40-0x43 */
        0xEE, 0x75, 0xEE, 0x76, 0xEE, 0x77, 0xEE, 0x78, /* 0x44-0x47 */
        0xEE, 0x79, 0xEE, 0x7A, 0xEE, 0x7B, 0xEE, 0x7C, /* 0x48-0x4B */
        0xEE, 0x7D, 0xEE, 0x7E, 0xEE, 0x80, 0xEE, 0x81, /* 0x4C-0x4F */
        0xEE, 0x82, 0xEE, 0x83, 0xEE, 0x84, 0xEE, 0x85, /* 0x50-0x53 */
        0xEE, 0x86, 0xEE, 0x87, 0xEE, 0x88, 0xEE, 0x89, /* 0x54-0x57 */
        0xEE, 0x8A, 0xEE, 0x8B, 0xEE, 0x8C, 0xEE, 0x8D, /* 0x58-0x5B */
        0xEE, 0x8E, 0xEE, 0x8F, 0xEE, 0x90, 0xEE, 0x91, /* 0x5C-0x5F */
        0xEE, 0x92, 0xEE, 0x93, 0xEE, 0x94, 0xEE, 0x95, /* 0x60-0x63 */
        0xEE, 0x96, 0xEE, 0x97, 0xEE, 0x98, 0xEE, 0x99, /* 0x64-0x67 */
        0xEE, 0x9A, 0xEE, 0x9B, 0xEE, 0x9C, 0xEE, 0x9D, /* 0x68-0x6B */
        0xEE, 0x9E, 0xEE, 0x9F, 0xEE, 0xA0, 0xEF, 0x40, /* 0x6C-0x6F */
        0xEF, 0x41, 0xEF, 0x42, 0xEF, 0x43, 0xEF, 0x44, /* 0x70-0x73 */
        0xEF, 0x45, 0xD2, 0xB3, 0xB6, 0xA5, 0xC7, 0xEA, /* 0x74-0x77 */
        0xF1, 0xFC, 0xCF, 0xEE, 0xCB, 0xB3, 0xD0, 0xEB, /* 0x78-0x7B */
        0xE7, 0xEF, 0xCD, 0xE7, 0xB9, 0xCB, 0xB6, 0xD9, /* 0x7C-0x7F */
        
        0xF1, 0xFD, 0xB0, 0xE4, 0xCB, 0xCC, 0xF1, 0xFE, /* 0x80-0x83 */
        0xD4, 0xA4, 0xC2, 0xAD, 0xC1, 0xEC, 0xC6, 0xC4, /* 0x84-0x87 */
        0xBE, 0xB1, 0xF2, 0xA1, 0xBC, 0xD5, 0xEF, 0x46, /* 0x88-0x8B */
        0xF2, 0xA2, 0xF2, 0xA3, 0xEF, 0x47, 0xF2, 0xA4, /* 0x8C-0x8F */
        0xD2, 0xC3, 0xC6, 0xB5, 0xEF, 0x48, 0xCD, 0xC7, /* 0x90-0x93 */
        0xF2, 0xA5, 0xEF, 0x49, 0xD3, 0xB1, 0xBF, 0xC5, /* 0x94-0x97 */
        0xCC, 0xE2, 0xEF, 0x4A, 0xF2, 0xA6, 0xF2, 0xA7, /* 0x98-0x9B */
        0xD1, 0xD5, 0xB6, 0xEE, 0xF2, 0xA8, 0xF2, 0xA9, /* 0x9C-0x9F */
        0xB5, 0xDF, 0xF2, 0xAA, 0xF2, 0xAB, 0xEF, 0x4B, /* 0xA0-0xA3 */
        0xB2, 0xFC, 0xF2, 0xAC, 0xF2, 0xAD, 0xC8, 0xA7, /* 0xA4-0xA7 */
        0xEF, 0x4C, 0xEF, 0x4D, 0xEF, 0x4E, 0xEF, 0x4F, /* 0xA8-0xAB */
        0xEF, 0x50, 0xEF, 0x51, 0xEF, 0x52, 0xEF, 0x53, /* 0xAC-0xAF */
        0xEF, 0x54, 0xEF, 0x55, 0xEF, 0x56, 0xEF, 0x57, /* 0xB0-0xB3 */
        0xEF, 0x58, 0xEF, 0x59, 0xEF, 0x5A, 0xEF, 0x5B, /* 0xB4-0xB7 */
        0xEF, 0x5C, 0xEF, 0x5D, 0xEF, 0x5E, 0xEF, 0x5F, /* 0xB8-0xBB */
        0xEF, 0x60, 0xEF, 0x61, 0xEF, 0x62, 0xEF, 0x63, /* 0xBC-0xBF */
        0xEF, 0x64, 0xEF, 0x65, 0xEF, 0x66, 0xEF, 0x67, /* 0xC0-0xC3 */
        0xEF, 0x68, 0xEF, 0x69, 0xEF, 0x6A, 0xEF, 0x6B, /* 0xC4-0xC7 */
        0xEF, 0x6C, 0xEF, 0x6D, 0xEF, 0x6E, 0xEF, 0x6F, /* 0xC8-0xCB */
        0xEF, 0x70, 0xEF, 0x71, 0xB7, 0xE7, 0xEF, 0x72, /* 0xCC-0xCF */
        0xEF, 0x73, 0xEC, 0xA9, 0xEC, 0xAA, 0xEC, 0xAB, /* 0xD0-0xD3 */
        0xEF, 0x74, 0xEC, 0xAC, 0xEF, 0x75, 0xEF, 0x76, /* 0xD4-0xD7 */
        0xC6, 0xAE, 0xEC, 0xAD, 0xEC, 0xAE, 0xEF, 0x77, /* 0xD8-0xDB */
        0xEF, 0x78, 0xEF, 0x79, 0xB7, 0xC9, 0xCA, 0xB3, /* 0xDC-0xDF */
        0xEF, 0x7A, 0xEF, 0x7B, 0xEF, 0x7C, 0xEF, 0x7D, /* 0xE0-0xE3 */
        0xEF, 0x7E, 0xEF, 0x80, 0xEF, 0x81, 0xE2, 0xB8, /* 0xE4-0xE7 */
        0xF7, 0xCF, 0xEF, 0x82, 0xEF, 0x83, 0xEF, 0x84, /* 0xE8-0xEB */
        0xEF, 0x85, 0xEF, 0x86, 0xEF, 0x87, 0xEF, 0x88, /* 0xEC-0xEF */
        0xEF, 0x89, 0xEF, 0x8A, 0xEF, 0x8B, 0xEF, 0x8C, /* 0xF0-0xF3 */
        0xEF, 0x8D, 0xEF, 0x8E, 0xEF, 0x8F, 0xEF, 0x90, /* 0xF4-0xF7 */
        0xEF, 0x91, 0xEF, 0x92, 0xEF, 0x93, 0xEF, 0x94, /* 0xF8-0xFB */
        0xEF, 0x95, 0xEF, 0x96, 0xEF, 0x97, 0xEF, 0x98, /* 0xFC-0xFF */
};

static const unsigned char u2c_99[512] = {
        0xEF, 0x99, 0xEF, 0x9A, 0xEF, 0x9B, 0xEF, 0x9C, /* 0x00-0x03 */
        0xEF, 0x9D, 0xEF, 0x9E, 0xEF, 0x9F, 0xEF, 0xA0, /* 0x04-0x07 */
        0xF0, 0x40, 0xF0, 0x41, 0xF0, 0x42, 0xF0, 0x43, /* 0x08-0x0B */
        0xF0, 0x44, 0xF7, 0xD0, 0xF0, 0x45, 0xF0, 0x46, /* 0x0C-0x0F */
        0xB2, 0xCD, 0xF0, 0x47, 0xF0, 0x48, 0xF0, 0x49, /* 0x10-0x13 */
        0xF0, 0x4A, 0xF0, 0x4B, 0xF0, 0x4C, 0xF0, 0x4D, /* 0x14-0x17 */
        0xF0, 0x4E, 0xF0, 0x4F, 0xF0, 0x50, 0xF0, 0x51, /* 0x18-0x1B */
        0xF0, 0x52, 0xF0, 0x53, 0xF0, 0x54, 0xF0, 0x55, /* 0x1C-0x1F */
        0xF0, 0x56, 0xF0, 0x57, 0xF0, 0x58, 0xF0, 0x59, /* 0x20-0x23 */
        0xF0, 0x5A, 0xF0, 0x5B, 0xF0, 0x5C, 0xF0, 0x5D, /* 0x24-0x27 */
        0xF0, 0x5E, 0xF0, 0x5F, 0xF0, 0x60, 0xF0, 0x61, /* 0x28-0x2B */
        0xF0, 0x62, 0xF0, 0x63, 0xF7, 0xD1, 0xF0, 0x64, /* 0x2C-0x2F */
        0xF0, 0x65, 0xF0, 0x66, 0xF0, 0x67, 0xF0, 0x68, /* 0x30-0x33 */
        0xF0, 0x69, 0xF0, 0x6A, 0xF0, 0x6B, 0xF0, 0x6C, /* 0x34-0x37 */
        0xF0, 0x6D, 0xF0, 0x6E, 0xF0, 0x6F, 0xF0, 0x70, /* 0x38-0x3B */
        0xF0, 0x71, 0xF0, 0x72, 0xF0, 0x73, 0xF0, 0x74, /* 0x3C-0x3F */
        0xF0, 0x75, 0xF0, 0x76, 0xF0, 0x77, 0xF0, 0x78, /* 0x40-0x43 */
        0xF0, 0x79, 0xF0, 0x7A, 0xF0, 0x7B, 0xF0, 0x7C, /* 0x44-0x47 */
        0xF0, 0x7D, 0xF0, 0x7E, 0xF0, 0x80, 0xF0, 0x81, /* 0x48-0x4B */
        0xF0, 0x82, 0xF0, 0x83, 0xF0, 0x84, 0xF0, 0x85, /* 0x4C-0x4F */
        0xF0, 0x86, 0xF0, 0x87, 0xF0, 0x88, 0xF0, 0x89, /* 0x50-0x53 */
        0xF7, 0xD3, 0xF7, 0xD2, 0xF0, 0x8A, 0xF0, 0x8B, /* 0x54-0x57 */
        0xF0, 0x8C, 0xF0, 0x8D, 0xF0, 0x8E, 0xF0, 0x8F, /* 0x58-0x5B */
        0xF0, 0x90, 0xF0, 0x91, 0xF0, 0x92, 0xF0, 0x93, /* 0x5C-0x5F */
        0xF0, 0x94, 0xF0, 0x95, 0xF0, 0x96, 0xE2, 0xBB, /* 0x60-0x63 */
        0xF0, 0x97, 0xBC, 0xA2, 0xF0, 0x98, 0xE2, 0xBC, /* 0x64-0x67 */
        0xE2, 0xBD, 0xE2, 0xBE, 0xE2, 0xBF, 0xE2, 0xC0, /* 0x68-0x6B */
        0xE2, 0xC1, 0xB7, 0xB9, 0xD2, 0xFB, 0xBD, 0xA4, /* 0x6C-0x6F */
        0xCA, 0xCE, 0xB1, 0xA5, 0xCB, 0xC7, 0xF0, 0x99, /* 0x70-0x73 */
        0xE2, 0xC2, 0xB6, 0xFC, 0xC8, 0xC4, 0xE2, 0xC3, /* 0x74-0x77 */
        0xF0, 0x9A, 0xF0, 0x9B, 0xBD, 0xC8, 0xF0, 0x9C, /* 0x78-0x7B */
        0xB1, 0xFD, 0xE2, 0xC4, 0xF0, 0x9D, 0xB6, 0xF6, /* 0x7C-0x7F */
        
        0xE2, 0xC5, 0xC4, 0xD9, 0xF0, 0x9E, 0xF0, 0x9F, /* 0x80-0x83 */
        0xE2, 0xC6, 0xCF, 0xDA, 0xB9, 0xDD, 0xE2, 0xC7, /* 0x84-0x87 */
        0xC0, 0xA1, 0xF0, 0xA0, 0xE2, 0xC8, 0xB2, 0xF6, /* 0x88-0x8B */
        0xF1, 0x40, 0xE2, 0xC9, 0xF1, 0x41, 0xC1, 0xF3, /* 0x8C-0x8F */
        0xE2, 0xCA, 0xE2, 0xCB, 0xC2, 0xF8, 0xE2, 0xCC, /* 0x90-0x93 */
        0xE2, 0xCD, 0xE2, 0xCE, 0xCA, 0xD7, 0xD8, 0xB8, /* 0x94-0x97 */
        0xD9, 0xE5, 0xCF, 0xE3, 0xF1, 0x42, 0xF1, 0x43, /* 0x98-0x9B */
        0xF1, 0x44, 0xF1, 0x45, 0xF1, 0x46, 0xF1, 0x47, /* 0x9C-0x9F */
        0xF1, 0x48, 0xF1, 0x49, 0xF1, 0x4A, 0xF1, 0x4B, /* 0xA0-0xA3 */
        0xF1, 0x4C, 0xF0, 0xA5, 0xF1, 0x4D, 0xF1, 0x4E, /* 0xA4-0xA7 */
        0xDC, 0xB0, 0xF1, 0x4F, 0xF1, 0x50, 0xF1, 0x51, /* 0xA8-0xAB */
        0xF1, 0x52, 0xF1, 0x53, 0xF1, 0x54, 0xF1, 0x55, /* 0xAC-0xAF */
        0xF1, 0x56, 0xF1, 0x57, 0xF1, 0x58, 0xF1, 0x59, /* 0xB0-0xB3 */
        0xF1, 0x5A, 0xF1, 0x5B, 0xF1, 0x5C, 0xF1, 0x5D, /* 0xB4-0xB7 */
        0xF1, 0x5E, 0xF1, 0x5F, 0xF1, 0x60, 0xF1, 0x61, /* 0xB8-0xBB */
        0xF1, 0x62, 0xF1, 0x63, 0xF1, 0x64, 0xF1, 0x65, /* 0xBC-0xBF */
        0xF1, 0x66, 0xF1, 0x67, 0xF1, 0x68, 0xF1, 0x69, /* 0xC0-0xC3 */
        0xF1, 0x6A, 0xF1, 0x6B, 0xF1, 0x6C, 0xF1, 0x6D, /* 0xC4-0xC7 */
        0xF1, 0x6E, 0xF1, 0x6F, 0xF1, 0x70, 0xF1, 0x71, /* 0xC8-0xCB */
        0xF1, 0x72, 0xF1, 0x73, 0xF1, 0x74, 0xF1, 0x75, /* 0xCC-0xCF */
        0xF1, 0x76, 0xF1, 0x77, 0xF1, 0x78, 0xF1, 0x79, /* 0xD0-0xD3 */
        0xF1, 0x7A, 0xF1, 0x7B, 0xF1, 0x7C, 0xF1, 0x7D, /* 0xD4-0xD7 */
        0xF1, 0x7E, 0xF1, 0x80, 0xF1, 0x81, 0xF1, 0x82, /* 0xD8-0xDB */
        0xF1, 0x83, 0xF1, 0x84, 0xF1, 0x85, 0xF1, 0x86, /* 0xDC-0xDF */
        0xF1, 0x87, 0xF1, 0x88, 0xF1, 0x89, 0xF1, 0x8A, /* 0xE0-0xE3 */
        0xF1, 0x8B, 0xF1, 0x8C, 0xF1, 0x8D, 0xF1, 0x8E, /* 0xE4-0xE7 */
        0xF1, 0x8F, 0xF1, 0x90, 0xF1, 0x91, 0xF1, 0x92, /* 0xE8-0xEB */
        0xF1, 0x93, 0xF1, 0x94, 0xF1, 0x95, 0xF1, 0x96, /* 0xEC-0xEF */
        0xF1, 0x97, 0xF1, 0x98, 0xF1, 0x99, 0xF1, 0x9A, /* 0xF0-0xF3 */
        0xF1, 0x9B, 0xF1, 0x9C, 0xF1, 0x9D, 0xF1, 0x9E, /* 0xF4-0xF7 */
        0xF1, 0x9F, 0xF1, 0xA0, 0xF2, 0x40, 0xF2, 0x41, /* 0xF8-0xFB */
        0xF2, 0x42, 0xF2, 0x43, 0xF2, 0x44, 0xF2, 0x45, /* 0xFC-0xFF */
};

static const unsigned char u2c_9A[512] = {
        0xF2, 0x46, 0xF2, 0x47, 0xF2, 0x48, 0xF2, 0x49, /* 0x00-0x03 */
        0xF2, 0x4A, 0xF2, 0x4B, 0xF2, 0x4C, 0xF2, 0x4D, /* 0x04-0x07 */
        0xF2, 0x4E, 0xF2, 0x4F, 0xF2, 0x50, 0xF2, 0x51, /* 0x08-0x0B */
        0xF2, 0x52, 0xF2, 0x53, 0xF2, 0x54, 0xF2, 0x55, /* 0x0C-0x0F */
        0xF2, 0x56, 0xF2, 0x57, 0xF2, 0x58, 0xF2, 0x59, /* 0x10-0x13 */
        0xF2, 0x5A, 0xF2, 0x5B, 0xF2, 0x5C, 0xF2, 0x5D, /* 0x14-0x17 */
        0xF2, 0x5E, 0xF2, 0x5F, 0xF2, 0x60, 0xF2, 0x61, /* 0x18-0x1B */
        0xF2, 0x62, 0xF2, 0x63, 0xF2, 0x64, 0xF2, 0x65, /* 0x1C-0x1F */
        0xF2, 0x66, 0xF2, 0x67, 0xF2, 0x68, 0xF2, 0x69, /* 0x20-0x23 */
        0xF2, 0x6A, 0xF2, 0x6B, 0xF2, 0x6C, 0xF2, 0x6D, /* 0x24-0x27 */
        0xF2, 0x6E, 0xF2, 0x6F, 0xF2, 0x70, 0xF2, 0x71, /* 0x28-0x2B */
        0xF2, 0x72, 0xF2, 0x73, 0xF2, 0x74, 0xF2, 0x75, /* 0x2C-0x2F */
        0xF2, 0x76, 0xF2, 0x77, 0xF2, 0x78, 0xF2, 0x79, /* 0x30-0x33 */
        0xF2, 0x7A, 0xF2, 0x7B, 0xF2, 0x7C, 0xF2, 0x7D, /* 0x34-0x37 */
        0xF2, 0x7E, 0xF2, 0x80, 0xF2, 0x81, 0xF2, 0x82, /* 0x38-0x3B */
        0xF2, 0x83, 0xF2, 0x84, 0xF2, 0x85, 0xF2, 0x86, /* 0x3C-0x3F */
        0xF2, 0x87, 0xF2, 0x88, 0xF2, 0x89, 0xF2, 0x8A, /* 0x40-0x43 */
        0xF2, 0x8B, 0xF2, 0x8C, 0xF2, 0x8D, 0xF2, 0x8E, /* 0x44-0x47 */
        0xF2, 0x8F, 0xF2, 0x90, 0xF2, 0x91, 0xF2, 0x92, /* 0x48-0x4B */
        0xF2, 0x93, 0xF2, 0x94, 0xF2, 0x95, 0xF2, 0x96, /* 0x4C-0x4F */
        0xF2, 0x97, 0xF2, 0x98, 0xF2, 0x99, 0xF2, 0x9A, /* 0x50-0x53 */
        0xF2, 0x9B, 0xF2, 0x9C, 0xF2, 0x9D, 0xF2, 0x9E, /* 0x54-0x57 */
        0xF2, 0x9F, 0xF2, 0xA0, 0xF3, 0x40, 0xF3, 0x41, /* 0x58-0x5B */
        0xF3, 0x42, 0xF3, 0x43, 0xF3, 0x44, 0xF3, 0x45, /* 0x5C-0x5F */
        0xF3, 0x46, 0xF3, 0x47, 0xF3, 0x48, 0xF3, 0x49, /* 0x60-0x63 */
        0xF3, 0x4A, 0xF3, 0x4B, 0xF3, 0x4C, 0xF3, 0x4D, /* 0x64-0x67 */
        0xF3, 0x4E, 0xF3, 0x4F, 0xF3, 0x50, 0xF3, 0x51, /* 0x68-0x6B */
        0xC2, 0xED, 0xD4, 0xA6, 0xCD, 0xD4, 0xD1, 0xB1, /* 0x6C-0x6F */
        0xB3, 0xDB, 0xC7, 0xFD, 0xF3, 0x52, 0xB2, 0xB5, /* 0x70-0x73 */
        0xC2, 0xBF, 0xE6, 0xE0, 0xCA, 0xBB, 0xE6, 0xE1, /* 0x74-0x77 */
        0xE6, 0xE2, 0xBE, 0xD4, 0xE6, 0xE3, 0xD7, 0xA4, /* 0x78-0x7B */
        0xCD, 0xD5, 0xE6, 0xE5, 0xBC, 0xDD, 0xE6, 0xE4, /* 0x7C-0x7F */
        
        0xE6, 0xE6, 0xE6, 0xE7, 0xC2, 0xEE, 0xF3, 0x53, /* 0x80-0x83 */
        0xBD, 0xBE, 0xE6, 0xE8, 0xC2, 0xE6, 0xBA, 0xA7, /* 0x84-0x87 */
        0xE6, 0xE9, 0xF3, 0x54, 0xE6, 0xEA, 0xB3, 0xD2, /* 0x88-0x8B */
        0xD1, 0xE9, 0xF3, 0x55, 0xF3, 0x56, 0xBF, 0xA5, /* 0x8C-0x8F */
        0xE6, 0xEB, 0xC6, 0xEF, 0xE6, 0xEC, 0xE6, 0xED, /* 0x90-0x93 */
        0xF3, 0x57, 0xF3, 0x58, 0xE6, 0xEE, 0xC6, 0xAD, /* 0x94-0x97 */
        0xE6, 0xEF, 0xF3, 0x59, 0xC9, 0xA7, 0xE6, 0xF0, /* 0x98-0x9B */
        0xE6, 0xF1, 0xE6, 0xF2, 0xE5, 0xB9, 0xE6, 0xF3, /* 0x9C-0x9F */
        0xE6, 0xF4, 0xC2, 0xE2, 0xE6, 0xF5, 0xE6, 0xF6, /* 0xA0-0xA3 */
        0xD6, 0xE8, 0xE6, 0xF7, 0xF3, 0x5A, 0xE6, 0xF8, /* 0xA4-0xA7 */
        0xB9, 0xC7, 0xF3, 0x5B, 0xF3, 0x5C, 0xF3, 0x5D, /* 0xA8-0xAB */
        0xF3, 0x5E, 0xF3, 0x5F, 0xF3, 0x60, 0xF3, 0x61, /* 0xAC-0xAF */
        0xF7, 0xBB, 0xF7, 0xBA, 0xF3, 0x62, 0xF3, 0x63, /* 0xB0-0xB3 */
        0xF3, 0x64, 0xF3, 0x65, 0xF7, 0xBE, 0xF7, 0xBC, /* 0xB4-0xB7 */
        0xBA, 0xA1, 0xF3, 0x66, 0xF7, 0xBF, 0xF3, 0x67, /* 0xB8-0xBB */
        0xF7, 0xC0, 0xF3, 0x68, 0xF3, 0x69, 0xF3, 0x6A, /* 0xBC-0xBF */
        0xF7, 0xC2, 0xF7, 0xC1, 0xF7, 0xC4, 0xF3, 0x6B, /* 0xC0-0xC3 */
        0xF3, 0x6C, 0xF7, 0xC3, 0xF3, 0x6D, 0xF3, 0x6E, /* 0xC4-0xC7 */
        0xF3, 0x6F, 0xF3, 0x70, 0xF3, 0x71, 0xF7, 0xC5, /* 0xC8-0xCB */
        0xF7, 0xC6, 0xF3, 0x72, 0xF3, 0x73, 0xF3, 0x74, /* 0xCC-0xCF */
        0xF3, 0x75, 0xF7, 0xC7, 0xF3, 0x76, 0xCB, 0xE8, /* 0xD0-0xD3 */
        0xF3, 0x77, 0xF3, 0x78, 0xF3, 0x79, 0xF3, 0x7A, /* 0xD4-0xD7 */
        0xB8, 0xDF, 0xF3, 0x7B, 0xF3, 0x7C, 0xF3, 0x7D, /* 0xD8-0xDB */
        0xF3, 0x7E, 0xF3, 0x80, 0xF3, 0x81, 0xF7, 0xD4, /* 0xDC-0xDF */
        0xF3, 0x82, 0xF7, 0xD5, 0xF3, 0x83, 0xF3, 0x84, /* 0xE0-0xE3 */
        0xF3, 0x85, 0xF3, 0x86, 0xF7, 0xD6, 0xF3, 0x87, /* 0xE4-0xE7 */
        0xF3, 0x88, 0xF3, 0x89, 0xF3, 0x8A, 0xF7, 0xD8, /* 0xE8-0xEB */
        0xF3, 0x8B, 0xF7, 0xDA, 0xF3, 0x8C, 0xF7, 0xD7, /* 0xEC-0xEF */
        0xF3, 0x8D, 0xF3, 0x8E, 0xF3, 0x8F, 0xF3, 0x90, /* 0xF0-0xF3 */
        0xF3, 0x91, 0xF3, 0x92, 0xF3, 0x93, 0xF3, 0x94, /* 0xF4-0xF7 */
        0xF3, 0x95, 0xF7, 0xDB, 0xF3, 0x96, 0xF7, 0xD9, /* 0xF8-0xFB */
        0xF3, 0x97, 0xF3, 0x98, 0xF3, 0x99, 0xF3, 0x9A, /* 0xFC-0xFF */
};

static const unsigned char u2c_9B[512] = {
        0xF3, 0x9B, 0xF3, 0x9C, 0xF3, 0x9D, 0xD7, 0xD7, /* 0x00-0x03 */
        0xF3, 0x9E, 0xF3, 0x9F, 0xF3, 0xA0, 0xF4, 0x40, /* 0x04-0x07 */
        0xF7, 0xDC, 0xF4, 0x41, 0xF4, 0x42, 0xF4, 0x43, /* 0x08-0x0B */
        0xF4, 0x44, 0xF4, 0x45, 0xF4, 0x46, 0xF7, 0xDD, /* 0x0C-0x0F */
        0xF4, 0x47, 0xF4, 0x48, 0xF4, 0x49, 0xF7, 0xDE, /* 0x10-0x13 */
        0xF4, 0x4A, 0xF4, 0x4B, 0xF4, 0x4C, 0xF4, 0x4D, /* 0x14-0x17 */
        0xF4, 0x4E, 0xF4, 0x4F, 0xF4, 0x50, 0xF4, 0x51, /* 0x18-0x1B */
        0xF4, 0x52, 0xF4, 0x53, 0xF4, 0x54, 0xF7, 0xDF, /* 0x1C-0x1F */
        0xF4, 0x55, 0xF4, 0x56, 0xF4, 0x57, 0xF7, 0xE0, /* 0x20-0x23 */
        0xF4, 0x58, 0xF4, 0x59, 0xF4, 0x5A, 0xF4, 0x5B, /* 0x24-0x27 */
        0xF4, 0x5C, 0xF4, 0x5D, 0xF4, 0x5E, 0xF4, 0x5F, /* 0x28-0x2B */
        0xF4, 0x60, 0xF4, 0x61, 0xF4, 0x62, 0xDB, 0xCB, /* 0x2C-0x2F */
        0xF4, 0x63, 0xF4, 0x64, 0xD8, 0xAA, 0xF4, 0x65, /* 0x30-0x33 */
        0xF4, 0x66, 0xF4, 0x67, 0xF4, 0x68, 0xF4, 0x69, /* 0x34-0x37 */
        0xF4, 0x6A, 0xF4, 0x6B, 0xF4, 0x6C, 0xE5, 0xF7, /* 0x38-0x3B */
        0xB9, 0xED, 0xF4, 0x6D, 0xF4, 0x6E, 0xF4, 0x6F, /* 0x3C-0x3F */
        0xF4, 0x70, 0xBF, 0xFD, 0xBB, 0xEA, 0xF7, 0xC9, /* 0x40-0x43 */
        0xC6, 0xC7, 0xF7, 0xC8, 0xF4, 0x71, 0xF7, 0xCA, /* 0x44-0x47 */
        0xF7, 0xCC, 0xF7, 0xCB, 0xF4, 0x72, 0xF4, 0x73, /* 0x48-0x4B */
        0xF4, 0x74, 0xF7, 0xCD, 0xF4, 0x75, 0xCE, 0xBA, /* 0x4C-0x4F */
        0xF4, 0x76, 0xF7, 0xCE, 0xF4, 0x77, 0xF4, 0x78, /* 0x50-0x53 */
        0xC4, 0xA7, 0xF4, 0x79, 0xF4, 0x7A, 0xF4, 0x7B, /* 0x54-0x57 */
        0xF4, 0x7C, 0xF4, 0x7D, 0xF4, 0x7E, 0xF4, 0x80, /* 0x58-0x5B */
        0xF4, 0x81, 0xF4, 0x82, 0xF4, 0x83, 0xF4, 0x84, /* 0x5C-0x5F */
        0xF4, 0x85, 0xF4, 0x86, 0xF4, 0x87, 0xF4, 0x88, /* 0x60-0x63 */
        0xF4, 0x89, 0xF4, 0x8A, 0xF4, 0x8B, 0xF4, 0x8C, /* 0x64-0x67 */
        0xF4, 0x8D, 0xF4, 0x8E, 0xF4, 0x8F, 0xF4, 0x90, /* 0x68-0x6B */
        0xF4, 0x91, 0xF4, 0x92, 0xF4, 0x93, 0xF4, 0x94, /* 0x6C-0x6F */
        0xF4, 0x95, 0xF4, 0x96, 0xF4, 0x97, 0xF4, 0x98, /* 0x70-0x73 */
        0xF4, 0x99, 0xF4, 0x9A, 0xF4, 0x9B, 0xF4, 0x9C, /* 0x74-0x77 */
        0xF4, 0x9D, 0xF4, 0x9E, 0xF4, 0x9F, 0xF4, 0xA0, /* 0x78-0x7B */
        0xF5, 0x40, 0xF5, 0x41, 0xF5, 0x42, 0xF5, 0x43, /* 0x7C-0x7F */
        
        0xF5, 0x44, 0xF5, 0x45, 0xF5, 0x46, 0xF5, 0x47, /* 0x80-0x83 */
        0xF5, 0x48, 0xF5, 0x49, 0xF5, 0x4A, 0xF5, 0x4B, /* 0x84-0x87 */
        0xF5, 0x4C, 0xF5, 0x4D, 0xF5, 0x4E, 0xF5, 0x4F, /* 0x88-0x8B */
        0xF5, 0x50, 0xF5, 0x51, 0xF5, 0x52, 0xF5, 0x53, /* 0x8C-0x8F */
        0xF5, 0x54, 0xF5, 0x55, 0xF5, 0x56, 0xF5, 0x57, /* 0x90-0x93 */
        0xF5, 0x58, 0xF5, 0x59, 0xF5, 0x5A, 0xF5, 0x5B, /* 0x94-0x97 */
        0xF5, 0x5C, 0xF5, 0x5D, 0xF5, 0x5E, 0xF5, 0x5F, /* 0x98-0x9B */
        0xF5, 0x60, 0xF5, 0x61, 0xF5, 0x62, 0xF5, 0x63, /* 0x9C-0x9F */
        0xF5, 0x64, 0xF5, 0x65, 0xF5, 0x66, 0xF5, 0x67, /* 0xA0-0xA3 */
        0xF5, 0x68, 0xF5, 0x69, 0xF5, 0x6A, 0xF5, 0x6B, /* 0xA4-0xA7 */
        0xF5, 0x6C, 0xF5, 0x6D, 0xF5, 0x6E, 0xF5, 0x6F, /* 0xA8-0xAB */
        0xF5, 0x70, 0xF5, 0x71, 0xF5, 0x72, 0xF5, 0x73, /* 0xAC-0xAF */
        0xF5, 0x74, 0xF5, 0x75, 0xF5, 0x76, 0xF5, 0x77, /* 0xB0-0xB3 */
        0xF5, 0x78, 0xF5, 0x79, 0xF5, 0x7A, 0xF5, 0x7B, /* 0xB4-0xB7 */
        0xF5, 0x7C, 0xF5, 0x7D, 0xF5, 0x7E, 0xF5, 0x80, /* 0xB8-0xBB */
        0xF5, 0x81, 0xF5, 0x82, 0xF5, 0x83, 0xF5, 0x84, /* 0xBC-0xBF */
        0xF5, 0x85, 0xF5, 0x86, 0xF5, 0x87, 0xF5, 0x88, /* 0xC0-0xC3 */
        0xF5, 0x89, 0xF5, 0x8A, 0xF5, 0x8B, 0xF5, 0x8C, /* 0xC4-0xC7 */
        0xF5, 0x8D, 0xF5, 0x8E, 0xF5, 0x8F, 0xF5, 0x90, /* 0xC8-0xCB */
        0xF5, 0x91, 0xF5, 0x92, 0xF5, 0x93, 0xF5, 0x94, /* 0xCC-0xCF */
        0xF5, 0x95, 0xF5, 0x96, 0xF5, 0x97, 0xF5, 0x98, /* 0xD0-0xD3 */
        0xF5, 0x99, 0xF5, 0x9A, 0xF5, 0x9B, 0xF5, 0x9C, /* 0xD4-0xD7 */
        0xF5, 0x9D, 0xF5, 0x9E, 0xF5, 0x9F, 0xF5, 0xA0, /* 0xD8-0xDB */
        0xF6, 0x40, 0xF6, 0x41, 0xF6, 0x42, 0xF6, 0x43, /* 0xDC-0xDF */
        0xF6, 0x44, 0xF6, 0x45, 0xF6, 0x46, 0xF6, 0x47, /* 0xE0-0xE3 */
        0xF6, 0x48, 0xF6, 0x49, 0xF6, 0x4A, 0xF6, 0x4B, /* 0xE4-0xE7 */
        0xF6, 0x4C, 0xF6, 0x4D, 0xF6, 0x4E, 0xF6, 0x4F, /* 0xE8-0xEB */
        0xF6, 0x50, 0xF6, 0x51, 0xF6, 0x52, 0xF6, 0x53, /* 0xEC-0xEF */
        0xF6, 0x54, 0xF6, 0x55, 0xF6, 0x56, 0xF6, 0x57, /* 0xF0-0xF3 */
        0xF6, 0x58, 0xF6, 0x59, 0xF6, 0x5A, 0xF6, 0x5B, /* 0xF4-0xF7 */
        0xF6, 0x5C, 0xF6, 0x5D, 0xF6, 0x5E, 0xF6, 0x5F, /* 0xF8-0xFB */
        0xF6, 0x60, 0xF6, 0x61, 0xF6, 0x62, 0xF6, 0x63, /* 0xFC-0xFF */
};

static const unsigned char u2c_9C[512] = {
        0xF6, 0x64, 0xF6, 0x65, 0xF6, 0x66, 0xF6, 0x67, /* 0x00-0x03 */
        0xF6, 0x68, 0xF6, 0x69, 0xF6, 0x6A, 0xF6, 0x6B, /* 0x04-0x07 */
        0xF6, 0x6C, 0xF6, 0x6D, 0xF6, 0x6E, 0xF6, 0x6F, /* 0x08-0x0B */
        0xF6, 0x70, 0xF6, 0x71, 0xF6, 0x72, 0xF6, 0x73, /* 0x0C-0x0F */
        0xF6, 0x74, 0xF6, 0x75, 0xF6, 0x76, 0xF6, 0x77, /* 0x10-0x13 */
        0xF6, 0x78, 0xF6, 0x79, 0xF6, 0x7A, 0xF6, 0x7B, /* 0x14-0x17 */
        0xF6, 0x7C, 0xF6, 0x7D, 0xF6, 0x7E, 0xF6, 0x80, /* 0x18-0x1B */
        0xF6, 0x81, 0xF6, 0x82, 0xF6, 0x83, 0xF6, 0x84, /* 0x1C-0x1F */
        0xF6, 0x85, 0xF6, 0x86, 0xF6, 0x87, 0xF6, 0x88, /* 0x20-0x23 */
        0xF6, 0x89, 0xF6, 0x8A, 0xF6, 0x8B, 0xF6, 0x8C, /* 0x24-0x27 */
        0xF6, 0x8D, 0xF6, 0x8E, 0xF6, 0x8F, 0xF6, 0x90, /* 0x28-0x2B */
        0xF6, 0x91, 0xF6, 0x92, 0xF6, 0x93, 0xF6, 0x94, /* 0x2C-0x2F */
        0xF6, 0x95, 0xF6, 0x96, 0xF6, 0x97, 0xF6, 0x98, /* 0x30-0x33 */
        0xF6, 0x99, 0xF6, 0x9A, 0xF6, 0x9B, 0xF6, 0x9C, /* 0x34-0x37 */
        0xF6, 0x9D, 0xF6, 0x9E, 0xF6, 0x9F, 0xF6, 0xA0, /* 0x38-0x3B */
        0xF7, 0x40, 0xF7, 0x41, 0xF7, 0x42, 0xF7, 0x43, /* 0x3C-0x3F */
        0xF7, 0x44, 0xF7, 0x45, 0xF7, 0x46, 0xF7, 0x47, /* 0x40-0x43 */
        0xF7, 0x48, 0xF7, 0x49, 0xF7, 0x4A, 0xF7, 0x4B, /* 0x44-0x47 */
        0xF7, 0x4C, 0xF7, 0x4D, 0xF7, 0x4E, 0xF7, 0x4F, /* 0x48-0x4B */
        0xF7, 0x50, 0xF7, 0x51, 0xF7, 0x52, 0xF7, 0x53, /* 0x4C-0x4F */
        0xF7, 0x54, 0xF7, 0x55, 0xF7, 0x56, 0xF7, 0x57, /* 0x50-0x53 */
        0xF7, 0x58, 0xF7, 0x59, 0xF7, 0x5A, 0xF7, 0x5B, /* 0x54-0x57 */
        0xF7, 0x5C, 0xF7, 0x5D, 0xF7, 0x5E, 0xF7, 0x5F, /* 0x58-0x5B */
        0xF7, 0x60, 0xF7, 0x61, 0xF7, 0x62, 0xF7, 0x63, /* 0x5C-0x5F */
        0xF7, 0x64, 0xF7, 0x65, 0xF7, 0x66, 0xF7, 0x67, /* 0x60-0x63 */
        0xF7, 0x68, 0xF7, 0x69, 0xF7, 0x6A, 0xF7, 0x6B, /* 0x64-0x67 */
        0xF7, 0x6C, 0xF7, 0x6D, 0xF7, 0x6E, 0xF7, 0x6F, /* 0x68-0x6B */
        0xF7, 0x70, 0xF7, 0x71, 0xF7, 0x72, 0xF7, 0x73, /* 0x6C-0x6F */
        0xF7, 0x74, 0xF7, 0x75, 0xF7, 0x76, 0xF7, 0x77, /* 0x70-0x73 */
        0xF7, 0x78, 0xF7, 0x79, 0xF7, 0x7A, 0xF7, 0x7B, /* 0x74-0x77 */
        0xF7, 0x7C, 0xF7, 0x7D, 0xF7, 0x7E, 0xF7, 0x80, /* 0x78-0x7B */
        0xD3, 0xE3, 0xF7, 0x81, 0xF7, 0x82, 0xF6, 0xCF, /* 0x7C-0x7F */
        
        0xF7, 0x83, 0xC2, 0xB3, 0xF6, 0xD0, 0xF7, 0x84, /* 0x80-0x83 */
        0xF7, 0x85, 0xF6, 0xD1, 0xF6, 0xD2, 0xF6, 0xD3, /* 0x84-0x87 */
        0xF6, 0xD4, 0xF7, 0x86, 0xF7, 0x87, 0xF6, 0xD6, /* 0x88-0x8B */
        0xF7, 0x88, 0xB1, 0xAB, 0xF6, 0xD7, 0xF7, 0x89, /* 0x8C-0x8F */
        0xF6, 0xD8, 0xF6, 0xD9, 0xF6, 0xDA, 0xF7, 0x8A, /* 0x90-0x93 */
        0xF6, 0xDB, 0xF6, 0xDC, 0xF7, 0x8B, 0xF7, 0x8C, /* 0x94-0x97 */
        0xF7, 0x8D, 0xF7, 0x8E, 0xF6, 0xDD, 0xF6, 0xDE, /* 0x98-0x9B */
        0xCF, 0xCA, 0xF7, 0x8F, 0xF6, 0xDF, 0xF6, 0xE0, /* 0x9C-0x9F */
        0xF6, 0xE1, 0xF6, 0xE2, 0xF6, 0xE3, 0xF6, 0xE4, /* 0xA0-0xA3 */
        0xC0, 0xF0, 0xF6, 0xE5, 0xF6, 0xE6, 0xF6, 0xE7, /* 0xA4-0xA7 */
        0xF6, 0xE8, 0xF6, 0xE9, 0xF7, 0x90, 0xF6, 0xEA, /* 0xA8-0xAB */
        0xF7, 0x91, 0xF6, 0xEB, 0xF6, 0xEC, 0xF7, 0x92, /* 0xAC-0xAF */
        0xF6, 0xED, 0xF6, 0xEE, 0xF6, 0xEF, 0xF6, 0xF0, /* 0xB0-0xB3 */
        0xF6, 0xF1, 0xF6, 0xF2, 0xF6, 0xF3, 0xF6, 0xF4, /* 0xB4-0xB7 */
        0xBE, 0xA8, 0xF7, 0x93, 0xF6, 0xF5, 0xF6, 0xF6, /* 0xB8-0xBB */
        0xF6, 0xF7, 0xF6, 0xF8, 0xF7, 0x94, 0xF7, 0x95, /* 0xBC-0xBF */
        0xF7, 0x96, 0xF7, 0x97, 0xF7, 0x98, 0xC8, 0xFA, /* 0xC0-0xC3 */
        0xF6, 0xF9, 0xF6, 0xFA, 0xF6, 0xFB, 0xF6, 0xFC, /* 0xC4-0xC7 */
        0xF7, 0x99, 0xF7, 0x9A, 0xF6, 0xFD, 0xF6, 0xFE, /* 0xC8-0xCB */
        0xF7, 0xA1, 0xF7, 0xA2, 0xF7, 0xA3, 0xF7, 0xA4, /* 0xCC-0xCF */
        0xF7, 0xA5, 0xF7, 0x9B, 0xF7, 0x9C, 0xF7, 0xA6, /* 0xD0-0xD3 */
        0xF7, 0xA7, 0xF7, 0xA8, 0xB1, 0xEE, 0xF7, 0xA9, /* 0xD4-0xD7 */
        0xF7, 0xAA, 0xF7, 0xAB, 0xF7, 0x9D, 0xF7, 0x9E, /* 0xD8-0xDB */
        0xF7, 0xAC, 0xF7, 0xAD, 0xC1, 0xDB, 0xF7, 0xAE, /* 0xDC-0xDF */
        0xF7, 0x9F, 0xF7, 0xA0, 0xF7, 0xAF, 0xF8, 0x40, /* 0xE0-0xE3 */
        0xF8, 0x41, 0xF8, 0x42, 0xF8, 0x43, 0xF8, 0x44, /* 0xE4-0xE7 */
        0xF8, 0x45, 0xF8, 0x46, 0xF8, 0x47, 0xF8, 0x48, /* 0xE8-0xEB */
        0xF8, 0x49, 0xF8, 0x4A, 0xF8, 0x4B, 0xF8, 0x4C, /* 0xEC-0xEF */
        0xF8, 0x4D, 0xF8, 0x4E, 0xF8, 0x4F, 0xF8, 0x50, /* 0xF0-0xF3 */
        0xF8, 0x51, 0xF8, 0x52, 0xF8, 0x53, 0xF8, 0x54, /* 0xF4-0xF7 */
        0xF8, 0x55, 0xF8, 0x56, 0xF8, 0x57, 0xF8, 0x58, /* 0xF8-0xFB */
        0xF8, 0x59, 0xF8, 0x5A, 0xF8, 0x5B, 0xF8, 0x5C, /* 0xFC-0xFF */
};

static const unsigned char u2c_9D[512] = {
        0xF8, 0x5D, 0xF8, 0x5E, 0xF8, 0x5F, 0xF8, 0x60, /* 0x00-0x03 */
        0xF8, 0x61, 0xF8, 0x62, 0xF8, 0x63, 0xF8, 0x64, /* 0x04-0x07 */
        0xF8, 0x65, 0xF8, 0x66, 0xF8, 0x67, 0xF8, 0x68, /* 0x08-0x0B */
        0xF8, 0x69, 0xF8, 0x6A, 0xF8, 0x6B, 0xF8, 0x6C, /* 0x0C-0x0F */
        0xF8, 0x6D, 0xF8, 0x6E, 0xF8, 0x6F, 0xF8, 0x70, /* 0x10-0x13 */
        0xF8, 0x71, 0xF8, 0x72, 0xF8, 0x73, 0xF8, 0x74, /* 0x14-0x17 */
        0xF8, 0x75, 0xF8, 0x76, 0xF8, 0x77, 0xF8, 0x78, /* 0x18-0x1B */
        0xF8, 0x79, 0xF8, 0x7A, 0xF8, 0x7B, 0xF8, 0x7C, /* 0x1C-0x1F */
        0xF8, 0x7D, 0xF8, 0x7E, 0xF8, 0x80, 0xF8, 0x81, /* 0x20-0x23 */
        0xF8, 0x82, 0xF8, 0x83, 0xF8, 0x84, 0xF8, 0x85, /* 0x24-0x27 */
        0xF8, 0x86, 0xF8, 0x87, 0xF8, 0x88, 0xF8, 0x89, /* 0x28-0x2B */
        0xF8, 0x8A, 0xF8, 0x8B, 0xF8, 0x8C, 0xF8, 0x8D, /* 0x2C-0x2F */
        0xF8, 0x8E, 0xF8, 0x8F, 0xF8, 0x90, 0xF8, 0x91, /* 0x30-0x33 */
        0xF8, 0x92, 0xF8, 0x93, 0xF8, 0x94, 0xF8, 0x95, /* 0x34-0x37 */
        0xF8, 0x96, 0xF8, 0x97, 0xF8, 0x98, 0xF8, 0x99, /* 0x38-0x3B */
        0xF8, 0x9A, 0xF8, 0x9B, 0xF8, 0x9C, 0xF8, 0x9D, /* 0x3C-0x3F */
        0xF8, 0x9E, 0xF8, 0x9F, 0xF8, 0xA0, 0xF9, 0x40, /* 0x40-0x43 */
        0xF9, 0x41, 0xF9, 0x42, 0xF9, 0x43, 0xF9, 0x44, /* 0x44-0x47 */
        0xF9, 0x45, 0xF9, 0x46, 0xF9, 0x47, 0xF9, 0x48, /* 0x48-0x4B */
        0xF9, 0x49, 0xF9, 0x4A, 0xF9, 0x4B, 0xF9, 0x4C, /* 0x4C-0x4F */
        0xF9, 0x4D, 0xF9, 0x4E, 0xF9, 0x4F, 0xF9, 0x50, /* 0x50-0x53 */
        0xF9, 0x51, 0xF9, 0x52, 0xF9, 0x53, 0xF9, 0x54, /* 0x54-0x57 */
        0xF9, 0x55, 0xF9, 0x56, 0xF9, 0x57, 0xF9, 0x58, /* 0x58-0x5B */
        0xF9, 0x59, 0xF9, 0x5A, 0xF9, 0x5B, 0xF9, 0x5C, /* 0x5C-0x5F */
        0xF9, 0x5D, 0xF9, 0x5E, 0xF9, 0x5F, 0xF9, 0x60, /* 0x60-0x63 */
        0xF9, 0x61, 0xF9, 0x62, 0xF9, 0x63, 0xF9, 0x64, /* 0x64-0x67 */
        0xF9, 0x65, 0xF9, 0x66, 0xF9, 0x67, 0xF9, 0x68, /* 0x68-0x6B */
        0xF9, 0x69, 0xF9, 0x6A, 0xF9, 0x6B, 0xF9, 0x6C, /* 0x6C-0x6F */
        0xF9, 0x6D, 0xF9, 0x6E, 0xF9, 0x6F, 0xF9, 0x70, /* 0x70-0x73 */
        0xF9, 0x71, 0xF9, 0x72, 0xF9, 0x73, 0xF9, 0x74, /* 0x74-0x77 */
        0xF9, 0x75, 0xF9, 0x76, 0xF9, 0x77, 0xF9, 0x78, /* 0x78-0x7B */
        0xF9, 0x79, 0xF9, 0x7A, 0xF9, 0x7B, 0xF9, 0x7C, /* 0x7C-0x7F */
        
        0xF9, 0x7D, 0xF9, 0x7E, 0xF9, 0x80, 0xF9, 0x81, /* 0x80-0x83 */
        0xF9, 0x82, 0xF9, 0x83, 0xF9, 0x84, 0xF9, 0x85, /* 0x84-0x87 */
        0xF9, 0x86, 0xF9, 0x87, 0xF9, 0x88, 0xF9, 0x89, /* 0x88-0x8B */
        0xF9, 0x8A, 0xF9, 0x8B, 0xF9, 0x8C, 0xF9, 0x8D, /* 0x8C-0x8F */
        0xF9, 0x8E, 0xF9, 0x8F, 0xF9, 0x90, 0xF9, 0x91, /* 0x90-0x93 */
        0xF9, 0x92, 0xF9, 0x93, 0xF9, 0x94, 0xF9, 0x95, /* 0x94-0x97 */
        0xF9, 0x96, 0xF9, 0x97, 0xF9, 0x98, 0xF9, 0x99, /* 0x98-0x9B */
        0xF9, 0x9A, 0xF9, 0x9B, 0xF9, 0x9C, 0xF9, 0x9D, /* 0x9C-0x9F */
        0xF9, 0x9E, 0xF9, 0x9F, 0xF9, 0xA0, 0xFA, 0x40, /* 0xA0-0xA3 */
        0xFA, 0x41, 0xFA, 0x42, 0xFA, 0x43, 0xFA, 0x44, /* 0xA4-0xA7 */
        0xFA, 0x45, 0xFA, 0x46, 0xFA, 0x47, 0xFA, 0x48, /* 0xA8-0xAB */
        0xFA, 0x49, 0xFA, 0x4A, 0xFA, 0x4B, 0xFA, 0x4C, /* 0xAC-0xAF */
        0xFA, 0x4D, 0xFA, 0x4E, 0xFA, 0x4F, 0xFA, 0x50, /* 0xB0-0xB3 */
        0xFA, 0x51, 0xFA, 0x52, 0xFA, 0x53, 0xFA, 0x54, /* 0xB4-0xB7 */
        0xFA, 0x55, 0xFA, 0x56, 0xFA, 0x57, 0xFA, 0x58, /* 0xB8-0xBB */
        0xFA, 0x59, 0xFA, 0x5A, 0xFA, 0x5B, 0xFA, 0x5C, /* 0xBC-0xBF */
        0xFA, 0x5D, 0xFA, 0x5E, 0xFA, 0x5F, 0xFA, 0x60, /* 0xC0-0xC3 */
        0xFA, 0x61, 0xFA, 0x62, 0xFA, 0x63, 0xFA, 0x64, /* 0xC4-0xC7 */
        0xFA, 0x65, 0xFA, 0x66, 0xFA, 0x67, 0xFA, 0x68, /* 0xC8-0xCB */
        0xFA, 0x69, 0xFA, 0x6A, 0xFA, 0x6B, 0xFA, 0x6C, /* 0xCC-0xCF */
        0xFA, 0x6D, 0xFA, 0x6E, 0xFA, 0x6F, 0xFA, 0x70, /* 0xD0-0xD3 */
        0xFA, 0x71, 0xFA, 0x72, 0xFA, 0x73, 0xFA, 0x74, /* 0xD4-0xD7 */
        0xFA, 0x75, 0xFA, 0x76, 0xFA, 0x77, 0xFA, 0x78, /* 0xD8-0xDB */
        0xFA, 0x79, 0xFA, 0x7A, 0xFA, 0x7B, 0xFA, 0x7C, /* 0xDC-0xDF */
        0xFA, 0x7D, 0xFA, 0x7E, 0xFA, 0x80, 0xFA, 0x81, /* 0xE0-0xE3 */
        0xFA, 0x82, 0xFA, 0x83, 0xFA, 0x84, 0xFA, 0x85, /* 0xE4-0xE7 */
        0xFA, 0x86, 0xFA, 0x87, 0xFA, 0x88, 0xFA, 0x89, /* 0xE8-0xEB */
        0xFA, 0x8A, 0xFA, 0x8B, 0xFA, 0x8C, 0xFA, 0x8D, /* 0xEC-0xEF */
        0xFA, 0x8E, 0xFA, 0x8F, 0xFA, 0x90, 0xFA, 0x91, /* 0xF0-0xF3 */
        0xFA, 0x92, 0xFA, 0x93, 0xFA, 0x94, 0xFA, 0x95, /* 0xF4-0xF7 */
        0xFA, 0x96, 0xFA, 0x97, 0xFA, 0x98, 0xFA, 0x99, /* 0xF8-0xFB */
        0xFA, 0x9A, 0xFA, 0x9B, 0xFA, 0x9C, 0xFA, 0x9D, /* 0xFC-0xFF */
};

/*
 * Unicode -> CP936 mapping for code points U+9E00..U+9EFF.
 *
 * Selected through page_uni2charset[0x9E] and read by uni2char() as
 * table[cl*2] (first output byte) and table[cl*2+1] (second output byte),
 * where cl is the low byte of the Unicode value.  Each source line holds
 * four such byte pairs; the trailing comment gives the range of cl covered
 * by that line.
 */
static const unsigned char u2c_9E[512] = {
        0xFA, 0x9E, 0xFA, 0x9F, 0xFA, 0xA0, 0xFB, 0x40, /* 0x00-0x03 */
        0xFB, 0x41, 0xFB, 0x42, 0xFB, 0x43, 0xFB, 0x44, /* 0x04-0x07 */
        0xFB, 0x45, 0xFB, 0x46, 0xFB, 0x47, 0xFB, 0x48, /* 0x08-0x0B */
        0xFB, 0x49, 0xFB, 0x4A, 0xFB, 0x4B, 0xFB, 0x4C, /* 0x0C-0x0F */
        0xFB, 0x4D, 0xFB, 0x4E, 0xFB, 0x4F, 0xFB, 0x50, /* 0x10-0x13 */
        0xFB, 0x51, 0xFB, 0x52, 0xFB, 0x53, 0xFB, 0x54, /* 0x14-0x17 */
        0xFB, 0x55, 0xFB, 0x56, 0xFB, 0x57, 0xFB, 0x58, /* 0x18-0x1B */
        0xFB, 0x59, 0xFB, 0x5A, 0xFB, 0x5B, 0xC4, 0xF1, /* 0x1C-0x1F */
        0xF0, 0xAF, 0xBC, 0xA6, 0xF0, 0xB0, 0xC3, 0xF9, /* 0x20-0x23 */
        0xFB, 0x5C, 0xC5, 0xB8, 0xD1, 0xBB, 0xFB, 0x5D, /* 0x24-0x27 */
        0xF0, 0xB1, 0xF0, 0xB2, 0xF0, 0xB3, 0xF0, 0xB4, /* 0x28-0x2B */
        0xF0, 0xB5, 0xD1, 0xBC, 0xFB, 0x5E, 0xD1, 0xEC, /* 0x2C-0x2F */
        0xFB, 0x5F, 0xF0, 0xB7, 0xF0, 0xB6, 0xD4, 0xA7, /* 0x30-0x33 */
        0xFB, 0x60, 0xCD, 0xD2, 0xF0, 0xB8, 0xF0, 0xBA, /* 0x34-0x37 */
        0xF0, 0xB9, 0xF0, 0xBB, 0xF0, 0xBC, 0xFB, 0x61, /* 0x38-0x3B */
        0xFB, 0x62, 0xB8, 0xEB, 0xF0, 0xBD, 0xBA, 0xE8, /* 0x3C-0x3F */
        0xFB, 0x63, 0xF0, 0xBE, 0xF0, 0xBF, 0xBE, 0xE9, /* 0x40-0x43 */
        0xF0, 0xC0, 0xB6, 0xEC, 0xF0, 0xC1, 0xF0, 0xC2, /* 0x44-0x47 */
        0xF0, 0xC3, 0xF0, 0xC4, 0xC8, 0xB5, 0xF0, 0xC5, /* 0x48-0x4B */
        0xF0, 0xC6, 0xFB, 0x64, 0xF0, 0xC7, 0xC5, 0xF4, /* 0x4C-0x4F */
        0xFB, 0x65, 0xF0, 0xC8, 0xFB, 0x66, 0xFB, 0x67, /* 0x50-0x53 */
        0xFB, 0x68, 0xF0, 0xC9, 0xFB, 0x69, 0xF0, 0xCA, /* 0x54-0x57 */
        0xF7, 0xBD, 0xFB, 0x6A, 0xF0, 0xCB, 0xF0, 0xCC, /* 0x58-0x5B */
        0xF0, 0xCD, 0xFB, 0x6B, 0xF0, 0xCE, 0xFB, 0x6C, /* 0x5C-0x5F */
        0xFB, 0x6D, 0xFB, 0x6E, 0xFB, 0x6F, 0xF0, 0xCF, /* 0x60-0x63 */
        0xBA, 0xD7, 0xFB, 0x70, 0xF0, 0xD0, 0xF0, 0xD1, /* 0x64-0x67 */
        0xF0, 0xD2, 0xF0, 0xD3, 0xF0, 0xD4, 0xF0, 0xD5, /* 0x68-0x6B */
        0xF0, 0xD6, 0xF0, 0xD8, 0xFB, 0x71, 0xFB, 0x72, /* 0x6C-0x6F */
        0xD3, 0xA5, 0xF0, 0xD7, 0xFB, 0x73, 0xF0, 0xD9, /* 0x70-0x73 */
        0xFB, 0x74, 0xFB, 0x75, 0xFB, 0x76, 0xFB, 0x77, /* 0x74-0x77 */
        0xFB, 0x78, 0xFB, 0x79, 0xFB, 0x7A, 0xFB, 0x7B, /* 0x78-0x7B */
        0xFB, 0x7C, 0xFB, 0x7D, 0xF5, 0xBA, 0xC2, 0xB9, /* 0x7C-0x7F */
        
        0xFB, 0x7E, 0xFB, 0x80, 0xF7, 0xE4, 0xFB, 0x81, /* 0x80-0x83 */
        0xFB, 0x82, 0xFB, 0x83, 0xFB, 0x84, 0xF7, 0xE5, /* 0x84-0x87 */
        0xF7, 0xE6, 0xFB, 0x85, 0xFB, 0x86, 0xF7, 0xE7, /* 0x88-0x8B */
        0xFB, 0x87, 0xFB, 0x88, 0xFB, 0x89, 0xFB, 0x8A, /* 0x8C-0x8F */
        0xFB, 0x8B, 0xFB, 0x8C, 0xF7, 0xE8, 0xC2, 0xB4, /* 0x90-0x93 */
        0xFB, 0x8D, 0xFB, 0x8E, 0xFB, 0x8F, 0xFB, 0x90, /* 0x94-0x97 */
        0xFB, 0x91, 0xFB, 0x92, 0xFB, 0x93, 0xFB, 0x94, /* 0x98-0x9B */
        0xFB, 0x95, 0xF7, 0xEA, 0xFB, 0x96, 0xF7, 0xEB, /* 0x9C-0x9F */
        0xFB, 0x97, 0xFB, 0x98, 0xFB, 0x99, 0xFB, 0x9A, /* 0xA0-0xA3 */
        0xFB, 0x9B, 0xFB, 0x9C, 0xC2, 0xF3, 0xFB, 0x9D, /* 0xA4-0xA7 */
        0xFB, 0x9E, 0xFB, 0x9F, 0xFB, 0xA0, 0xFC, 0x40, /* 0xA8-0xAB */
        0xFC, 0x41, 0xFC, 0x42, 0xFC, 0x43, 0xFC, 0x44, /* 0xAC-0xAF */
        0xFC, 0x45, 0xFC, 0x46, 0xFC, 0x47, 0xFC, 0x48, /* 0xB0-0xB3 */
        0xF4, 0xF0, 0xFC, 0x49, 0xFC, 0x4A, 0xFC, 0x4B, /* 0xB4-0xB7 */
        0xF4, 0xEF, 0xFC, 0x4C, 0xFC, 0x4D, 0xC2, 0xE9, /* 0xB8-0xBB */
        0xFC, 0x4E, 0xF7, 0xE1, 0xF7, 0xE2, 0xFC, 0x4F, /* 0xBC-0xBF */
        0xFC, 0x50, 0xFC, 0x51, 0xFC, 0x52, 0xFC, 0x53, /* 0xC0-0xC3 */
        0xBB, 0xC6, 0xFC, 0x54, 0xFC, 0x55, 0xFC, 0x56, /* 0xC4-0xC7 */
        0xFC, 0x57, 0xD9, 0xE4, 0xFC, 0x58, 0xFC, 0x59, /* 0xC8-0xCB */
        0xFC, 0x5A, 0xCA, 0xF2, 0xC0, 0xE8, 0xF0, 0xA4, /* 0xCC-0xCF */
        0xFC, 0x5B, 0xBA, 0xDA, 0xFC, 0x5C, 0xFC, 0x5D, /* 0xD0-0xD3 */
        0xC7, 0xAD, 0xFC, 0x5E, 0xFC, 0x5F, 0xFC, 0x60, /* 0xD4-0xD7 */
        0xC4, 0xAC, 0xFC, 0x61, 0xFC, 0x62, 0xF7, 0xEC, /* 0xD8-0xDB */
        0xF7, 0xED, 0xF7, 0xEE, 0xFC, 0x63, 0xF7, 0xF0, /* 0xDC-0xDF */
        0xF7, 0xEF, 0xFC, 0x64, 0xF7, 0xF1, 0xFC, 0x65, /* 0xE0-0xE3 */
        0xFC, 0x66, 0xF7, 0xF4, 0xFC, 0x67, 0xF7, 0xF3, /* 0xE4-0xE7 */
        0xFC, 0x68, 0xF7, 0xF2, 0xF7, 0xF5, 0xFC, 0x69, /* 0xE8-0xEB */
        0xFC, 0x6A, 0xFC, 0x6B, 0xFC, 0x6C, 0xF7, 0xF6, /* 0xEC-0xEF */
        0xFC, 0x6D, 0xFC, 0x6E, 0xFC, 0x6F, 0xFC, 0x70, /* 0xF0-0xF3 */
        0xFC, 0x71, 0xFC, 0x72, 0xFC, 0x73, 0xFC, 0x74, /* 0xF4-0xF7 */
        0xFC, 0x75, 0xED, 0xE9, 0xFC, 0x76, 0xED, 0xEA, /* 0xF8-0xFB */
        0xED, 0xEB, 0xFC, 0x77, 0xF6, 0xBC, 0xFC, 0x78, /* 0xFC-0xFF */
};

/*
 * Unicode -> CP936 mapping for code points U+9F00..U+9FFF.
 *
 * Same layout as the other u2c_XX pages: two output bytes per code point,
 * indexed by the low byte of the Unicode value.  The initializer stops
 * after low byte 0xA7; the remaining elements of the 512-byte array are
 * implicitly zero, and uni2char() treats a 0x00,0x00 pair as "no mapping"
 * (-EINVAL).
 */
static const unsigned char u2c_9F[512] = {
        0xFC, 0x79, 0xFC, 0x7A, 0xFC, 0x7B, 0xFC, 0x7C, /* 0x00-0x03 */
        0xFC, 0x7D, 0xFC, 0x7E, 0xFC, 0x80, 0xFC, 0x81, /* 0x04-0x07 */
        0xFC, 0x82, 0xFC, 0x83, 0xFC, 0x84, 0xF6, 0xBD, /* 0x08-0x0B */
        0xFC, 0x85, 0xF6, 0xBE, 0xB6, 0xA6, 0xFC, 0x86, /* 0x0C-0x0F */
        0xD8, 0xBE, 0xFC, 0x87, 0xFC, 0x88, 0xB9, 0xC4, /* 0x10-0x13 */
        0xFC, 0x89, 0xFC, 0x8A, 0xFC, 0x8B, 0xD8, 0xBB, /* 0x14-0x17 */
        0xFC, 0x8C, 0xDC, 0xB1, 0xFC, 0x8D, 0xFC, 0x8E, /* 0x18-0x1B */
        0xFC, 0x8F, 0xFC, 0x90, 0xFC, 0x91, 0xFC, 0x92, /* 0x1C-0x1F */
        0xCA, 0xF3, 0xFC, 0x93, 0xF7, 0xF7, 0xFC, 0x94, /* 0x20-0x23 */
        0xFC, 0x95, 0xFC, 0x96, 0xFC, 0x97, 0xFC, 0x98, /* 0x24-0x27 */
        0xFC, 0x99, 0xFC, 0x9A, 0xFC, 0x9B, 0xFC, 0x9C, /* 0x28-0x2B */
        0xF7, 0xF8, 0xFC, 0x9D, 0xFC, 0x9E, 0xF7, 0xF9, /* 0x2C-0x2F */
        0xFC, 0x9F, 0xFC, 0xA0, 0xFD, 0x40, 0xFD, 0x41, /* 0x30-0x33 */
        0xFD, 0x42, 0xFD, 0x43, 0xFD, 0x44, 0xF7, 0xFB, /* 0x34-0x37 */
        0xFD, 0x45, 0xF7, 0xFA, 0xFD, 0x46, 0xB1, 0xC7, /* 0x38-0x3B */
        0xFD, 0x47, 0xF7, 0xFC, 0xF7, 0xFD, 0xFD, 0x48, /* 0x3C-0x3F */
        0xFD, 0x49, 0xFD, 0x4A, 0xFD, 0x4B, 0xFD, 0x4C, /* 0x40-0x43 */
        0xF7, 0xFE, 0xFD, 0x4D, 0xFD, 0x4E, 0xFD, 0x4F, /* 0x44-0x47 */
        0xFD, 0x50, 0xFD, 0x51, 0xFD, 0x52, 0xFD, 0x53, /* 0x48-0x4B */
        0xFD, 0x54, 0xFD, 0x55, 0xFD, 0x56, 0xFD, 0x57, /* 0x4C-0x4F */
        0xC6, 0xEB, 0xEC, 0xB4, 0xFD, 0x58, 0xFD, 0x59, /* 0x50-0x53 */
        0xFD, 0x5A, 0xFD, 0x5B, 0xFD, 0x5C, 0xFD, 0x5D, /* 0x54-0x57 */
        0xFD, 0x5E, 0xFD, 0x5F, 0xFD, 0x60, 0xFD, 0x61, /* 0x58-0x5B */
        0xFD, 0x62, 0xFD, 0x63, 0xFD, 0x64, 0xFD, 0x65, /* 0x5C-0x5F */
        0xFD, 0x66, 0xFD, 0x67, 0xFD, 0x68, 0xFD, 0x69, /* 0x60-0x63 */
        0xFD, 0x6A, 0xFD, 0x6B, 0xFD, 0x6C, 0xFD, 0x6D, /* 0x64-0x67 */
        0xFD, 0x6E, 0xFD, 0x6F, 0xFD, 0x70, 0xFD, 0x71, /* 0x68-0x6B */
        0xFD, 0x72, 0xFD, 0x73, 0xFD, 0x74, 0xFD, 0x75, /* 0x6C-0x6F */
        0xFD, 0x76, 0xFD, 0x77, 0xFD, 0x78, 0xFD, 0x79, /* 0x70-0x73 */
        0xFD, 0x7A, 0xFD, 0x7B, 0xFD, 0x7C, 0xFD, 0x7D, /* 0x74-0x77 */
        0xFD, 0x7E, 0xFD, 0x80, 0xFD, 0x81, 0xFD, 0x82, /* 0x78-0x7B */
        0xFD, 0x83, 0xFD, 0x84, 0xFD, 0x85, 0xB3, 0xDD, /* 0x7C-0x7F */
        
        0xF6, 0xB3, 0xFD, 0x86, 0xFD, 0x87, 0xF6, 0xB4, /* 0x80-0x83 */
        0xC1, 0xE4, 0xF6, 0xB5, 0xF6, 0xB6, 0xF6, 0xB7, /* 0x84-0x87 */
        0xF6, 0xB8, 0xF6, 0xB9, 0xF6, 0xBA, 0xC8, 0xA3, /* 0x88-0x8B */
        0xF6, 0xBB, 0xFD, 0x88, 0xFD, 0x89, 0xFD, 0x8A, /* 0x8C-0x8F */
        0xFD, 0x8B, 0xFD, 0x8C, 0xFD, 0x8D, 0xFD, 0x8E, /* 0x90-0x93 */
        0xFD, 0x8F, 0xFD, 0x90, 0xFD, 0x91, 0xFD, 0x92, /* 0x94-0x97 */
        0xFD, 0x93, 0xC1, 0xFA, 0xB9, 0xA8, 0xED, 0xE8, /* 0x98-0x9B */
        0xFD, 0x94, 0xFD, 0x95, 0xFD, 0x96, 0xB9, 0xEA, /* 0x9C-0x9F */
        0xD9, 0xDF, 0xFD, 0x97, 0xFD, 0x98, 0xFD, 0x99, /* 0xA0-0xA3 */
        0xFD, 0x9A, 0xFD, 0x9B, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
};

/*
 * Unicode -> CP936 page for U+DC00..U+DCFF.
 *
 * Only the first four entries are spelled out and they are all zero; the
 * rest of the array is implicitly zero as well.  Every lookup in this page
 * therefore yields the 0x00,0x00 "no mapping" pair that uni2char() rejects
 * with -EINVAL.
 */
static const unsigned char u2c_DC[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
};

/*
 * Unicode -> CP936 mapping for code points U+F900..U+F9FF.
 *
 * Selected through page_uni2charset[0xF9].  Two output bytes per code
 * point, indexed by the low byte of the Unicode value (four code points
 * per source line; the trailing comment gives the low-byte range).
 */
static const unsigned char u2c_F9[512] = {
        0xD8, 0x4D, 0xB8, 0xFC, 0xDC, 0x87, 0xD9, 0x5A, /* 0x00-0x03 */
        0xBB, 0xAC, 0xB4, 0xAE, 0xBE, 0xE4, 0xFD, 0x94, /* 0x04-0x07 */
        0xFD, 0x94, 0xC6, 0xF5, 0xBD, 0xF0, 0xC0, 0xAE, /* 0x08-0x0B */
        0xC4, 0xCE, 0x91, 0xD0, 0xB0, 0x5D, 0xC1, 0x5F, /* 0x0C-0x0F */
        0xCC, 0x7D, 0xC2, 0xDD, 0xC2, 0xE3, 0xDF, 0x89, /* 0x10-0x13 */
        0x98, 0xB7, 0xC2, 0xE5, 0xC0, 0xD3, 0xE7, 0xF3, /* 0x14-0x17 */
        0xC2, 0xE4, 0xC0, 0xD2, 0xF1, 0x98, 0x81, 0x79, /* 0x18-0x1B */
        0xC2, 0xD1, 0x99, 0xDA, 0xA0, 0x80, 0xCC, 0x6D, /* 0x1C-0x1F */
        0xFB, 0x5B, 0x8D, 0xB9, 0x9E, 0x45, 0xCB, 0x7B, /* 0x20-0x23 */
        0xD2, 0x68, 0xC0, 0xAD, 0xC5, 0x44, 0xCF, 0x9E, /* 0x24-0x27 */
        0xC0, 0xC8, 0xC0, 0xCA, 0xC0, 0xCB, 0xC0, 0xC7, /* 0x28-0x2B */
        0xFD, 0x9C, 0x81, 0xED, 0xC0, 0xE4, 0x84, 0xDA, /* 0x2C-0x2F */
        0x93, 0xEF, 0x99, 0xA9, 0xA0, 0x74, 0xB1, 0x52, /* 0x30-0x33 */
        0xC0, 0xCF, 0xCC, 0x4A, 0xCC, 0x94, 0xC2, 0xB7, /* 0x34-0x37 */
        0xC2, 0xB6, 0xF4, 0x94, 0xFA, 0x98, 0xC2, 0xB5, /* 0x38-0x3B */
        0xB5, 0x93, 0xBE, 0x47, 0xC7, 0x8A, 0xE4, 0x9B, /* 0x3C-0x3F */
        0xC2, 0xB9, 0xD5, 0x93, 0x89, 0xC5, 0xC5, 0xAA, /* 0x40-0x43 */
        0xBB, 0x5C, 0xC3, 0x40, 0xC0, 0xCE, 0xC0, 0xDA, /* 0x44-0x47 */
        0xD9, 0x54, 0xC0, 0xD7, 0x89, 0xBE, 0x8C, 0xD2, /* 0x48-0x4B */
        0x98, 0xC7, 0x9C, 0x49, 0xC2, 0xA9, 0xC0, 0xDB, /* 0x4C-0x4F */
        0xBF, 0x7C, 0xC2, 0xAA, 0xC0, 0xD5, 0xC0, 0xDF, /* 0x50-0x53 */
        0x84, 0x43, 0xC1, 0xE8, 0xB6, 0xA0, 0xBE, 0x63, /* 0x54-0x57 */
        0xC1, 0xE2, 0xC1, 0xEA, 0xD7, 0x78, 0x92, 0x82, /* 0x58-0x5B */
        0x98, 0xB7, 0xD6, 0x5A, 0xB5, 0xA4, 0x8C, 0x8E, /* 0x5C-0x5F */
        0xC5, 0xAD, 0xC2, 0xCA, 0xAE, 0x90, 0xB1, 0xB1, /* 0x60-0x63 */
        0xB4, 0x91, 0xB1, 0xE3, 0x8F, 0xCD, 0xB2, 0xBB, /* 0x64-0x67 */
        0xC3, 0xDA, 0x94, 0xB5, 0xCB, 0xF7, 0x85, 0xA2, /* 0x68-0x6B */
        0xC8, 0xFB, 0xCA, 0xA1, 0xC8, 0x7E, 0xD5, 0x66, /* 0x6C-0x6F */
        0x9A, 0xA2, 0xB3, 0xBD, 0xC9, 0xF2, 0xCA, 0xB0, /* 0x70-0x73 */
        0xC8, 0xF4, 0xC2, 0xD3, 0xC2, 0xD4, 0xC1, 0xC1, /* 0x74-0x77 */
        0x83, 0xC9, 0xFD, 0x9D, 0xC1, 0xBA, 0xBC, 0x5A, /* 0x78-0x7B */
        0xC1, 0xBC, 0xD5, 0x8F, 0xC1, 0xBF, 0x84, 0xEE, /* 0x7C-0x7F */
        
        0x85, 0xCE, 0xC5, 0xAE, 0x8F, 0x5D, 0xC2, 0xC3, /* 0x80-0x83 */
        0x9E, 0x56, 0xB5, 0x5A, 0xE9, 0x82, 0xF3, 0x50, /* 0x84-0x87 */
        0xFB, 0x90, 0xC0, 0xE8, 0xC1, 0xA6, 0x95, 0xD1, /* 0x88-0x8B */
        0x9A, 0x76, 0xDE, 0x5D, 0xC4, 0xEA, 0x91, 0x7A, /* 0x8C-0x8F */
        0x91, 0xD9, 0x93, 0xD3, 0x9D, 0x69, 0x9F, 0x92, /* 0x90-0x93 */
        0xAD, 0x49, 0xFD, 0x9E, 0xBE, 0x9A, 0xC2, 0x93, /* 0x94-0x97 */
        0xDD, 0x82, 0xC9, 0x8F, 0xDF, 0x42, 0xE5, 0x80, /* 0x98-0x9B */
        0xC1, 0xD0, 0xC1, 0xD3, 0xD1, 0xCA, 0xC1, 0xD2, /* 0x9C-0x9F */
        0xC1, 0xD1, 0xD5, 0x66, 0xC1, 0xAE, 0xC4, 0xEE, /* 0xA0-0xA3 */
        0xC4, 0xED, 0x9A, 0x9A, 0xBA, 0x9F, 0xAB, 0x43, /* 0xA4-0xA7 */
        0xC1, 0xEE, 0xE0, 0xF2, 0x8C, 0x8E, 0x8E, 0x58, /* 0xA8-0xAB */
        0xC1, 0xAF, 0xC1, 0xE1, 0xAC, 0x93, 0xC1, 0xE7, /* 0xAC-0xAF */
        0xF1, 0xF6, 0xE2, 0x8F, 0xC1, 0xE3, 0xEC, 0x60, /* 0xB0-0xB3 */
        0xEE, 0x49, 0xC0, 0xFD, 0xB6, 0x59, 0xF5, 0xB7, /* 0xB4-0xB7 */
        0xEB, 0x60, 0x90, 0xBA, 0xC1, 0xCB, 0xC1, 0xC5, /* 0xB8-0xBB */
        0xE5, 0xBC, 0xC4, 0xF2, 0xC1, 0xCF, 0x98, 0xB7, /* 0xBC-0xBF */
        0xC1, 0xC7, 0xAF, 0x9F, 0xDE, 0xA4, 0xDF, 0x7C, /* 0xC0-0xC3 */
        0xFD, 0x88, 0x95, 0x9E, 0xC8, 0xEE, 0x84, 0xA2, /* 0xC4-0xC7 */
        0x96, 0x83, 0xC1, 0xF8, 0xC1, 0xF7, 0xC1, 0xEF, /* 0xC8-0xCB */
        0xC1, 0xF0, 0xC1, 0xF4, 0xC1, 0xF2, 0xBC, 0x7E, /* 0xCC-0xCF */
        0xEE, 0x90, 0xC1, 0xF9, 0xC2, 0xBE, 0xEA, 0x91, /* 0xD0-0xD3 */
        0x82, 0x90, 0x8D, 0x91, 0x9C, 0x53, 0xDD, 0x86, /* 0xD4-0xD7 */
        0xC2, 0xC9, 0x90, 0xFC, 0xC0, 0xF5, 0xC2, 0xCA, /* 0xD8-0xDB */
        0xC2, 0xA1, 0xC0, 0xFB, 0xC0, 0xF4, 0xC2, 0xC4, /* 0xDC-0xDF */
        0xD2, 0xD7, 0xC0, 0xEE, 0xC0, 0xE6, 0xC4, 0xE0, /* 0xE0-0xE3 */
        0xC0, 0xED, 0xC1, 0xA1, 0xEE, 0xBE, 0xFD, 0x9F, /* 0xE4-0xE7 */
        0xD1, 0x65, 0xC0, 0xEF, 0xEB, 0x78, 0xC4, 0xE4, /* 0xE8-0xEB */
        0xC4, 0xE7, 0xC1, 0xDF, 0x9F, 0xFB, 0xAD, 0x55, /* 0xEC-0xEF */
        0xCC, 0x41, 0xFD, 0xA0, 0xF7, 0x5B, 0xF7, 0xEB, /* 0xF0-0xF3 */
        0xC1, 0xD6, 0xC1, 0xDC, 0xC5, 0x52, 0xC1, 0xA2, /* 0xF4-0xF7 */
        0xF3, 0xD2, 0xC1, 0xA3, 0xA0, 0xEE, 0xD6, 0xCB, /* 0xF8-0xFB */
        0xD7, 0x52, 0xCA, 0xB2, 0xB2, 0xE8, 0xB4, 0xCC, /* 0xFC-0xFF */
};

/*
 * Unicode -> CP936 mapping for code points U+FA00..U+FAFF.
 *
 * Two output bytes per code point, indexed by the low byte of the Unicode
 * value.  The initializer covers low bytes 0x00..0x2F only; the rest of
 * the 512-byte array is implicitly zero, which uni2char() reports as
 * "no mapping" (-EINVAL).
 */
static const unsigned char u2c_FA[512] = {
        0xC7, 0xD0, 0xB6, 0xC8, 0xCD, 0xD8, 0xCC, 0xC7, /* 0x00-0x03 */
        0xD5, 0xAC, 0xB6, 0xB4, 0xB1, 0xA9, 0xDD, 0x97, /* 0x04-0x07 */
        0xD0, 0xD0, 0xBD, 0xB5, 0xD2, 0x8A, 0xC0, 0xAA, /* 0x08-0x0B */
        0xFE, 0x40, 0xFE, 0x41, 0xFE, 0x42, 0xFE, 0x43, /* 0x0C-0x0F */
        0x89, 0x56, 0xFE, 0x44, 0xC7, 0xE7, 0xFE, 0x45, /* 0x10-0x13 */
        0xFE, 0x46, 0x84, 0x44, 0xD8, 0x69, 0xD2, 0xE6, /* 0x14-0x17 */
        0xFE, 0x47, 0xC9, 0xF1, 0xCF, 0xE9, 0xB8, 0xA3, /* 0x18-0x1B */
        0xBE, 0xB8, 0xBE, 0xAB, 0xD3, 0xF0, 0xFE, 0x48, /* 0x1C-0x1F */
        0xFE, 0x49, 0xFE, 0x4A, 0xD6, 0x54, 0xFE, 0x4B, /* 0x20-0x23 */
        0xFE, 0x4C, 0xD2, 0xDD, 0xB6, 0xBC, 0xFE, 0x4D, /* 0x24-0x27 */
        0xFE, 0x4E, 0xFE, 0x4F, 0xEF, 0x88, 0xEF, 0x95, /* 0x28-0x2B */
        0xF0, 0x5E, 0xFA, 0x51, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
};

/*
 * Unicode -> CP936 mapping for code points U+FE00..U+FEFF.
 *
 * Two output bytes per code point, indexed by the low byte of the Unicode
 * value.  Low bytes 0x00..0x2F are explicitly unmapped (0x00,0x00), the
 * initializer ends after 0x6B, and the remainder of the array is
 * implicitly zero.  uni2char() returns -EINVAL for every zero pair.
 */
static const unsigned char u2c_FE[512] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x03 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04-0x07 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0C-0x0F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x13 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x14-0x17 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1C-0x1F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x23 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x24-0x27 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2C-0x2F */
        0xA9, 0x55, 0xA6, 0xF2, 0x00, 0x00, 0xA6, 0xF4, /* 0x30-0x33 */
        0xA6, 0xF5, 0xA6, 0xE0, 0xA6, 0xE1, 0xA6, 0xF0, /* 0x34-0x37 */
        0xA6, 0xF1, 0xA6, 0xE2, 0xA6, 0xE3, 0xA6, 0xEE, /* 0x38-0x3B */
        0xA6, 0xEF, 0xA6, 0xE6, 0xA6, 0xE7, 0xA6, 0xE4, /* 0x3C-0x3F */
        0xA6, 0xE5, 0xA6, 0xE8, 0xA6, 0xE9, 0xA6, 0xEA, /* 0x40-0x43 */
        0xA6, 0xEB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x44-0x47 */
        0x00, 0x00, 0xA9, 0x68, 0xA9, 0x69, 0xA9, 0x6A, /* 0x48-0x4B */
        0xA9, 0x6B, 0xA9, 0x6C, 0xA9, 0x6D, 0xA9, 0x6E, /* 0x4C-0x4F */
        0xA9, 0x6F, 0xA9, 0x70, 0xA9, 0x71, 0x00, 0x00, /* 0x50-0x53 */
        0xA9, 0x72, 0xA9, 0x73, 0xA9, 0x74, 0xA9, 0x75, /* 0x54-0x57 */
        0x00, 0x00, 0xA9, 0x76, 0xA9, 0x77, 0xA9, 0x78, /* 0x58-0x5B */
        0xA9, 0x79, 0xA9, 0x7A, 0xA9, 0x7B, 0xA9, 0x7C, /* 0x5C-0x5F */
        0xA9, 0x7D, 0xA9, 0x7E, 0xA9, 0x80, 0xA9, 0x81, /* 0x60-0x63 */
        0xA9, 0x82, 0xA9, 0x83, 0xA9, 0x84, 0x00, 0x00, /* 0x64-0x67 */
        0xA9, 0x85, 0xA9, 0x86, 0xA9, 0x87, 0xA9, 0x88, /* 0x68-0x6B */
};

/*
 * Unicode -> CP936 mapping for code points U+FF00..U+FFFF.
 *
 * Two output bytes per code point, indexed by the low byte of the Unicode
 * value.  Mapped entries are at low bytes 0x01..0x5E and 0xE0..0xE5; the
 * range in between is explicitly zero, and everything after the last
 * initializer line (0xE8 onward) is implicitly zero.  uni2char() returns
 * -EINVAL for every zero pair.
 */
static const unsigned char u2c_FF[512] = {
        0x00, 0x00, 0xA3, 0xA1, 0xA3, 0xA2, 0xA3, 0xA3, /* 0x00-0x03 */
        0xA1, 0xE7, 0xA3, 0xA5, 0xA3, 0xA6, 0xA3, 0xA7, /* 0x04-0x07 */
        0xA3, 0xA8, 0xA3, 0xA9, 0xA3, 0xAA, 0xA3, 0xAB, /* 0x08-0x0B */
        0xA3, 0xAC, 0xA3, 0xAD, 0xA3, 0xAE, 0xA3, 0xAF, /* 0x0C-0x0F */
        0xA3, 0xB0, 0xA3, 0xB1, 0xA3, 0xB2, 0xA3, 0xB3, /* 0x10-0x13 */
        0xA3, 0xB4, 0xA3, 0xB5, 0xA3, 0xB6, 0xA3, 0xB7, /* 0x14-0x17 */
        0xA3, 0xB8, 0xA3, 0xB9, 0xA3, 0xBA, 0xA3, 0xBB, /* 0x18-0x1B */
        0xA3, 0xBC, 0xA3, 0xBD, 0xA3, 0xBE, 0xA3, 0xBF, /* 0x1C-0x1F */
        0xA3, 0xC0, 0xA3, 0xC1, 0xA3, 0xC2, 0xA3, 0xC3, /* 0x20-0x23 */
        0xA3, 0xC4, 0xA3, 0xC5, 0xA3, 0xC6, 0xA3, 0xC7, /* 0x24-0x27 */
        0xA3, 0xC8, 0xA3, 0xC9, 0xA3, 0xCA, 0xA3, 0xCB, /* 0x28-0x2B */
        0xA3, 0xCC, 0xA3, 0xCD, 0xA3, 0xCE, 0xA3, 0xCF, /* 0x2C-0x2F */
        0xA3, 0xD0, 0xA3, 0xD1, 0xA3, 0xD2, 0xA3, 0xD3, /* 0x30-0x33 */
        0xA3, 0xD4, 0xA3, 0xD5, 0xA3, 0xD6, 0xA3, 0xD7, /* 0x34-0x37 */
        0xA3, 0xD8, 0xA3, 0xD9, 0xA3, 0xDA, 0xA3, 0xDB, /* 0x38-0x3B */
        0xA3, 0xDC, 0xA3, 0xDD, 0xA3, 0xDE, 0xA3, 0xDF, /* 0x3C-0x3F */
        0xA3, 0xE0, 0xA3, 0xE1, 0xA3, 0xE2, 0xA3, 0xE3, /* 0x40-0x43 */
        0xA3, 0xE4, 0xA3, 0xE5, 0xA3, 0xE6, 0xA3, 0xE7, /* 0x44-0x47 */
        0xA3, 0xE8, 0xA3, 0xE9, 0xA3, 0xEA, 0xA3, 0xEB, /* 0x48-0x4B */
        0xA3, 0xEC, 0xA3, 0xED, 0xA3, 0xEE, 0xA3, 0xEF, /* 0x4C-0x4F */
        0xA3, 0xF0, 0xA3, 0xF1, 0xA3, 0xF2, 0xA3, 0xF3, /* 0x50-0x53 */
        0xA3, 0xF4, 0xA3, 0xF5, 0xA3, 0xF6, 0xA3, 0xF7, /* 0x54-0x57 */
        0xA3, 0xF8, 0xA3, 0xF9, 0xA3, 0xFA, 0xA3, 0xFB, /* 0x58-0x5B */
        0xA3, 0xFC, 0xA3, 0xFD, 0xA1, 0xAB, 0x00, 0x00, /* 0x5C-0x5F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x63 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x64-0x67 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x6C-0x6F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x73 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x74-0x77 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x7C-0x7F */
        
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x83 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x84-0x87 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x8C-0x8F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x93 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x94-0x97 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9B */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x9C-0x9F */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA0-0xA3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA4-0xA7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xA8-0xAB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xAC-0xAF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB0-0xB3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB4-0xB7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xB8-0xBB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xBC-0xBF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC0-0xC3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC4-0xC7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xC8-0xCB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xCC-0xCF */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD0-0xD3 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD4-0xD7 */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xD8-0xDB */
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xDC-0xDF */
        0xA1, 0xE9, 0xA1, 0xEA, 0xA9, 0x56, 0xA3, 0xFE, /* 0xE0-0xE3 */
        0xA9, 0x57, 0xA3, 0xA4, 0x00, 0x00, 0x00, 0x00, /* 0xE4-0xE7 */
};

/*
 * First-level index for Unicode -> CP936 conversion.
 *
 * uni2char() indexes this array with the high byte of the Unicode value
 * and then looks up the low byte in the selected u2c_XX page.  A NULL
 * entry means the whole 256-code-point page has no mapping, and
 * uni2char() returns -EINVAL for it.  Entries are laid out eight per
 * source line, so line k (counting from 0) covers high bytes
 * 8*k .. 8*k+7.
 */
static const unsigned char *const page_uni2charset[256] = {
        u2c_00, u2c_01, u2c_02, u2c_03, u2c_04, NULL,   NULL,   NULL,
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        u2c_20, u2c_21, u2c_22, u2c_23, u2c_24, u2c_25, u2c_26, NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        u2c_30, u2c_31, u2c_32, u2c_33, NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   u2c_4E, u2c_4F, 
        u2c_50, u2c_51, u2c_52, u2c_53, u2c_54, u2c_55, u2c_56, u2c_57, 
        u2c_58, u2c_59, u2c_5A, u2c_5B, u2c_5C, u2c_5D, u2c_5E, u2c_5F, 
        u2c_60, u2c_61, u2c_62, u2c_63, u2c_64, u2c_65, u2c_66, u2c_67, 
        u2c_68, u2c_69, u2c_6A, u2c_6B, u2c_6C, u2c_6D, u2c_6E, u2c_6F, 
        u2c_70, u2c_71, u2c_72, u2c_73, u2c_74, u2c_75, u2c_76, u2c_77, 
        u2c_78, u2c_79, u2c_7A, u2c_7B, u2c_7C, u2c_7D, u2c_7E, u2c_7F, 
        u2c_80, u2c_81, u2c_82, u2c_83, u2c_84, u2c_85, u2c_86, u2c_87, 
        u2c_88, u2c_89, u2c_8A, u2c_8B, u2c_8C, u2c_8D, u2c_8E, u2c_8F, 
        u2c_90, u2c_91, u2c_92, u2c_93, u2c_94, u2c_95, u2c_96, u2c_97, 
        u2c_98, u2c_99, u2c_9A, u2c_9B, u2c_9C, u2c_9D, u2c_9E, u2c_9F, 
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   u2c_DC, NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   NULL,   
        NULL,   u2c_F9, u2c_FA, NULL,   NULL,   NULL,   u2c_FE, u2c_FF, };

/*
 * Per-byte lower-casing table, installed as .charset2lower in the
 * nls_table below.
 *
 * Bytes 0x41..0x5A map to 0x61..0x7A; every other byte, including all
 * bytes 0x80..0xFF, maps to itself.
 */
static const unsigned char charset2lower[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */
        0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
        0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
        0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
        0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
        0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
};

/*
 * Per-byte upper-casing table, installed as .charset2upper in the
 * nls_table below.
 *
 * Bytes 0x61..0x7A map to 0x41..0x5A; every other byte, including all
 * bytes 0x80..0xFF, maps to itself.
 */
static const unsigned char charset2upper[256] = {
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
        0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
        0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
        0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
        0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
        0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
        0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
        0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
        0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */
        0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */
        0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */
        0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */

        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */
        0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */
        0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */
        0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */
        0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */
        0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */
        0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */
        0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */
        0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */
        0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */
        0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */
        0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */
        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */
        0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */
        0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */
        0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */
};

static int uni2char(const wchar_t uni,
                        unsigned char *out, int boundlen)
{
        const unsigned char *uni2charset;
        unsigned char cl = uni&0xFF;
        unsigned char ch = (uni>>8)&0xFF;
        unsigned char out0,out1;

        if (boundlen <= 0)
                return -ENAMETOOLONG;

        if (uni == 0x20ac) {/* Euro symbol.The only exception with a non-ascii unicode */
                out[0] = 0x80;
                return 1;
        }

        if (ch == 0) { /* handle the U00 plane*/
                /* if (cl == 0) return -EINVAL;*/ /*U0000 is legal in cp936*/
                out0 = u2c_00[cl*2];
                out1 = u2c_00[cl*2+1];
                if (out0 == 0x00 && out1 == 0x00) {
                        if (cl<0x80) {
                                out[0] = cl;
                                return 1;
                        }
                        return -EINVAL;
                } else {
                        if (boundlen <= 1)
                                return -ENAMETOOLONG;
                        out[0] = out0;
                        out[1] = out1;
                        return 2;
                }
        }

        uni2charset = page_uni2charset[ch];
        if (uni2charset) {
                if (boundlen <= 1)
                        return -ENAMETOOLONG;
                out[0] = uni2charset[cl*2];
                out[1] = uni2charset[cl*2+1];
                if (out[0] == 0x00 && out[1] == 0x00)
                        return -EINVAL;
                return 2;
        }
        else
                return -EINVAL;
}

static int char2uni(const unsigned char *rawstring, int boundlen,
                        wchar_t *uni)
{
        unsigned char ch, cl;
        const wchar_t *charset2uni;
        int n;

        if (boundlen <= 0)
                return -ENAMETOOLONG;

        if (boundlen == 1) {
                if (rawstring[0]==0x80) { /* Euro symbol.The only exception with a non-ascii unicode */
                        *uni = 0x20ac;
                } else {
                        *uni = rawstring[0];
                }
                return 1;
        }

        ch = rawstring[0];
        cl = rawstring[1];

        charset2uni = page_charset2uni[ch];
        if (charset2uni && cl) {
                *uni = charset2uni[cl];
                if (*uni == 0x0000)
                        return -EINVAL;
                n = 2;
        } else{
                if (ch==0x80) {/* Euro symbol.The only exception with a non-ascii unicode */
                        *uni = 0x20ac;
                } else {
                        *uni = ch;
                }
                n = 1;
        }
        return n;
}

/*
 * NLS descriptor for this codepage: its name and alias, the two
 * conversion callbacks, and the per-byte case-folding tables.
 */
static struct nls_table table = {
        /* identification */
        .charset = "cp936",
        .alias = "gb2312",

        /* Unicode <-> charset conversion callbacks */
        .uni2char = uni2char,
        .char2uni = char2uni,

        /* single-byte case mapping */
        .charset2lower = charset2lower,
        .charset2upper = charset2upper,
};

/*
 * Module init hook: hands the cp936 table to register_nls() and passes
 * that call's result back to the caller unchanged.
 */
static int __init init_nls_cp936(void)
{
        int status;

        status = register_nls(&table);
        return status;
}

/*
 * Module exit hook: passes the cp936 table to unregister_nls().  Any
 * value that call returns is deliberately not inspected.
 */
static void __exit exit_nls_cp936(void)
{
        (void)unregister_nls(&table);
}

/* Hook the init/exit functions above into the kernel module machinery. */
module_init(init_nls_cp936)
module_exit(exit_nls_cp936)

/* Module metadata: licence string and an additional NLS alias. */
/* NOTE(review): the alias presumably lets a request for "gb2312" load this module - confirm against MODULE_ALIAS_NLS. */
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NLS(gb2312);


















































































































































































































































































































































































































































































































































































    2 


    2 

    1 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    8 




































    7 


























    4 












   16 

























































    6 


































































































































































































































































































    4 
    7 






















































































































































































































































































































































































































































    5 
   18 


    2 
    7 



    6 
    9 









   18 




    9 































   10 




   11 
























    7 




    8 





















    2 



    2 








    1 
    1 


    2 
    2 















































































    6 






    9 

















































































































   19 



































    2 






















































































































































    1 



















    1 
    2 




























   18 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
// SPDX-License-Identifier: GPL-2.0
/*
 *  ext4.h
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/include/linux/minix_fs.h
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#ifndef _EXT4_H
#define _EXT4_H

#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
#include <linux/jbd2.h>
#include <linux/quota.h>
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#include <linux/ratelimit.h>
#include <crypto/hash.h>
#include <linux/falloc.h>
#include <linux/percpu-rwsem.h>
#include <linux/fiemap.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
#include <uapi/linux/ext4.h>

#include <linux/fscrypt.h>
#include <linux/fsverity.h>

#include <linux/compiler.h>

/*
 * The fourth extended filesystem constants/structures
 */

/*
 * with AGGRESSIVE_CHECK allocator runs consistency checks over
 * structures. these checks slow things down a lot
 *
 * NOTE(review): the macro defined below is AGGRESSIVE_CHECK__ (with trailing
 * underscores), not AGGRESSIVE_CHECK. Presumably the suffix keeps the checks
 * disabled until it is removed -- confirm against the #ifdef sites.
 */
#define AGGRESSIVE_CHECK__

/*
 * with DOUBLE_CHECK defined mballoc creates persistent in-core
 * bitmaps, maintains and uses them to check for double allocations
 *
 * NOTE(review): same pattern as above; the macro defined below is
 * DOUBLE_CHECK__, not DOUBLE_CHECK -- confirm against the #ifdef sites.
 */
#define DOUBLE_CHECK__

/*
 * Define EXT4FS_DEBUG to produce debug messages.
 * It is explicitly undefined here, so the ext4_debug() variant selected by
 * "#ifdef EXT4FS_DEBUG" in this header is the no_printk() one.
 */
#undef EXT4FS_DEBUG

/*
 * Debug code
 *
 * ext4_debug(fmt, args...) - printf-style debug message.
 *
 * With EXT4FS_DEBUG defined, two KERN_DEBUG printk() calls are issued: first
 * a prefix made of __FILE__, __LINE__ and __func__, then the caller's format
 * string with its arguments.  The format must be a string literal because it
 * is pasted directly after KERN_DEBUG.
 *
 * Without EXT4FS_DEBUG the macro expands to no_printk() with the same
 * arguments.
 */
#ifdef EXT4FS_DEBUG
#define ext4_debug(f, a...)                                                \
        do {                                                                \
                printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",        \
                        __FILE__, __LINE__, __func__);                        \
                printk(KERN_DEBUG f, ## a);                                \
        } while (0)
#else
#define ext4_debug(fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

 /*
  * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c
  *
  * NOTE(review): the macro defined below is EXT_DEBUG__ (trailing
  * underscores), not EXT_DEBUG -- presumably this leaves the feature off;
  * confirm against the users in extents.c.
  */
#define EXT_DEBUG__

/*
 * Dynamic printk for controlled extents debugging.
 *
 * ext_debug(ino, fmt, args...) - per-inode debug message.
 *
 * With CONFIG_EXT4_DEBUG the message is sent through pr_debug() and is
 * prefixed with the current task's comm and pid, the superblock id
 * (ino->i_sb->s_id), the inode number, and the file/line/function of the
 * call site.  @fmt must be a string literal since it is concatenated onto
 * the prefix.  Otherwise the macro expands to no_printk() and @ino is not
 * referenced at all.
 *
 * NOTE(review): @ino is used unparenthesized in the expansion, so callers
 * should pass a plain pointer expression (e.g. a variable), not something
 * like "&x->vfs_inode".
 */
#ifdef CONFIG_EXT4_DEBUG
#define ext_debug(ino, fmt, ...)                                        \
        pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt,        \
                 current->comm, task_pid_nr(current),                        \
                 ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__,        \
                 __func__, ##__VA_ARGS__)
#else
#define ext_debug(ino, fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * ASSERT(expr) - fatal assertion.
 *
 * If @expr evaluates to false, log the enclosing function, file, line and
 * the stringified expression at KERN_EMERG, then call BUG().  The check is
 * always compiled in; nothing in this header makes it conditional.
 */
#define ASSERT(assert)                                                \
do {                                                                        \
        if (unlikely(!(assert))) {                                        \
                printk(KERN_EMERG                                        \
                       "Assertion failure in %s() at %s:%d: '%s'\n",        \
                       __func__, __FILE__, __LINE__, #assert);                \
                BUG();                                                        \
        }                                                                \
} while (0)

/* data type for block offset of block group (a signed int) */
typedef int ext4_grpblk_t;

/* data type for filesystem-wide blocks number (unsigned long long) */
typedef unsigned long long ext4_fsblk_t;

/* data type for file logical block number (32-bit unsigned) */
typedef __u32 ext4_lblk_t;

/* data type for block group number (unsigned int) */
typedef unsigned int ext4_group_t;

/*
 * Direction selector with two values: SHIFT_LEFT (0) and SHIFT_RIGHT (1).
 * NOTE(review): presumably consumed by the extent-shifting code; the users
 * are not visible in this part of the header -- confirm at the call sites.
 */
enum SHIFT_DIRECTION {
        SHIFT_LEFT = 0,
        SHIFT_RIGHT,
};

/*
 * For each criterion, mballoc has a slightly different way of finding
 * the required blocks and usually, the higher the criterion, the slower
 * the allocation.  We start at the lower criteria and keep falling back
 * to higher ones if we are not able to find any blocks.  Lower (earlier)
 * criteria are faster.
 */
enum criteria {
        /*
         * Used when number of blocks needed is a power of 2. This
         * doesn't trigger any disk IO except prefetch and is the
         * fastest criteria.
         */
        CR_POWER2_ALIGNED,

        /*
         * Tries to lookup in-memory data structures to find the most
         * suitable group that satisfies goal request. No disk IO
         * except block prefetch.
         */
        CR_GOAL_LEN_FAST,

        /*
         * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal
         * length to the best available length for faster allocation.
         */
        CR_BEST_AVAIL_LEN,

        /*
         * Reads each block group sequentially, performing disk IO if
         * necessary, to find a suitable block group. Tries to
         * allocate goal length but might trim the request if nothing
         * is found after enough tries.
         */
        CR_GOAL_LEN_SLOW,

        /*
         * Finds the first free set of blocks and allocates
         * those. This is only used in rare cases when
         * CR_GOAL_LEN_SLOW also fails to allocate anything.
         */
        CR_ANY_FREE,

        /*
         * Number of criteria defined. Must stay last so that it equals
         * the count of the enumerators above.
         */
        EXT4_MB_NUM_CRS
};

/*
 * Flags used in mballoc's allocation_context flags field.
 *
 * Also used to show what's going on for debugging purposes when the
 * flag field is exported via the traceport interface
 *
 * Each flag below occupies its own bit, from 0x0001 up to 0x00020000,
 * so they can be OR-ed together freely.
 */

/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE                0x0001
/* blocks already reserved */
#define EXT4_MB_HINT_RESERVED                0x0002
/* metadata is being allocated */
#define EXT4_MB_HINT_METADATA                0x0004
/* first blocks in the file */
#define EXT4_MB_HINT_FIRST                0x0008
/* search for the best chunk */
#define EXT4_MB_HINT_BEST                0x0010
/* data is being allocated */
#define EXT4_MB_HINT_DATA                0x0020
/* don't preallocate (for tails) */
#define EXT4_MB_HINT_NOPREALLOC                0x0040
/* allocate for locality group */
#define EXT4_MB_HINT_GROUP_ALLOC        0x0080
/* allocate goal blocks or none */
#define EXT4_MB_HINT_GOAL_ONLY                0x0100
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL                0x0200
/* blocks already pre-reserved by delayed allocation */
#define EXT4_MB_DELALLOC_RESERVED        0x0400
/* We are doing stream allocation */
#define EXT4_MB_STREAM_ALLOC                0x0800
/* Use reserved root blocks if needed */
#define EXT4_MB_USE_ROOT_BLOCKS                0x1000
/* Use blocks from reserved pool */
#define EXT4_MB_USE_RESERVED                0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK                0x4000
/*
 * Large fragment size list lookup succeeded at least once for
 * CR_POWER2_ALIGNED
 */
#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED                0x8000
/*
 * Avg fragment size rb tree lookup succeeded at least once for
 * CR_GOAL_LEN_FAST
 */
#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED                0x00010000
/*
 * Avg fragment size rb tree lookup succeeded at least once for
 * CR_BEST_AVAIL_LEN
 */
#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED                0x00020000

/*
 * Describes one block allocation request: the target inode, the wanted
 * length and logical position, the nearest already-allocated neighbours
 * (logical and physical) on either side, a physical goal hint, and the
 * EXT4_MB_* flags qualifying the request.
 */
struct ext4_allocation_request {
        /* target inode for block we're allocating */
        struct inode *inode;
        /* how many blocks we want to allocate */
        unsigned int len;
        /* logical block in target inode */
        ext4_lblk_t logical;
        /* the closest logical allocated block to the left */
        ext4_lblk_t lleft;
        /* the closest logical allocated block to the right */
        ext4_lblk_t lright;
        /* phys. target (a hint) */
        ext4_fsblk_t goal;
        /* phys. block for the closest logical allocated block to the left */
        ext4_fsblk_t pleft;
        /* phys. block for the closest logical allocated block to the right */
        ext4_fsblk_t pright;
        /* flags. see above EXT4_MB_HINT_* */
        unsigned int flags;
};

/*
 * Logical to physical block mapping, used by ext4_map_blocks()
 *
 * This structure is used to pass requests into ext4_map_blocks() as
 * well as to store the information returned by ext4_map_blocks().  It
 * takes less room on the stack than a struct buffer_head.
 *
 * The EXT4_MAP_* flags are defined in terms of the buffer_head state bits
 * (BH_New, BH_Mapped, ...), so each one has the same bit position as the
 * corresponding BH_* bit.  EXT4_MAP_FLAGS is the mask of all of them.
 */
#define EXT4_MAP_NEW                BIT(BH_New)
#define EXT4_MAP_MAPPED                BIT(BH_Mapped)
#define EXT4_MAP_UNWRITTEN        BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY        BIT(BH_Boundary)
#define EXT4_MAP_DELAYED        BIT(BH_Delay)
#define EXT4_MAP_FLAGS                (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
                                 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
                                 EXT4_MAP_DELAYED)

/*
 * NOTE(review): the field meanings below are inferred from the names and
 * types (ext4_fsblk_t is a filesystem-wide block number, ext4_lblk_t a file
 * logical block number); confirm against ext4_map_blocks().
 */
struct ext4_map_blocks {
        ext4_fsblk_t m_pblk;        /* presumably the physical block */
        ext4_lblk_t m_lblk;        /* presumably the logical block in the file */
        unsigned int m_len;        /* presumably the length of the mapping */
        unsigned int m_flags;        /* presumably EXT4_MAP_* bits */
};

/*
 * Block validity checking, system zone rbtree.
 *
 * Holds the root of the rbtree together with an rcu_head.
 * NOTE(review): the rcu_head suggests the structure is freed via RCU --
 * confirm in the block validity code.
 */
struct ext4_system_blocks {
        struct rb_root root;
        struct rcu_head rcu;
};

/*
 * Flags for ext4_io_end->flag
 */
#define        EXT4_IO_END_UNWRITTEN        0x0001

/*
 * One extent of a file, given by byte offset and size, kept on a list of
 * such entries.
 */
struct ext4_io_end_vec {
        struct list_head list;                /* list of io_end_vec */
        loff_t offset;                        /* offset in the file */
        ssize_t size;                        /* size of the extent */
};

/*
 * For converting unwritten extents on a work queue. 'handle' is used for
 * buffered writeback.
 *
 * The structure is reference counted through 'count' and carries its
 * extents as a list of struct ext4_io_end_vec hanging off 'list_vec'.
 */
typedef struct ext4_io_end {
        struct list_head        list;                /* per-file finished IO list */
        handle_t                *handle;        /* handle reserved for extent
                                                 * conversion */
        struct inode                *inode;                /* file being written to */
        struct bio                *bio;                /* Linked list of completed
                                                 * bios covering the extent */
        unsigned int                flag;                /* unwritten or not */
        refcount_t                count;                /* reference counter */
        struct list_head        list_vec;        /* list of ext4_io_end_vec */
} ext4_io_end_t;

/*
 * Groups a writeback_control, a bio, an ext4_io_end_t and a sector number.
 * NOTE(review): presumably the state carried while building and submitting
 * bios during writeback; the users are not visible here -- confirm.
 */
struct ext4_io_submit {
        struct writeback_control *io_wbc;
        struct bio                *io_bio;
        ext4_io_end_t                *io_end;
        sector_t                io_next_block;
};

/*
 * Special inode numbers
 */
#define        EXT4_BAD_INO                 1        /* Bad blocks inode */
#define EXT4_ROOT_INO                 2        /* Root inode */
#define EXT4_USR_QUOTA_INO         3        /* User quota inode */
#define EXT4_GRP_QUOTA_INO         4        /* Group quota inode */
#define EXT4_BOOT_LOADER_INO         5        /* Boot loader inode */
#define EXT4_UNDEL_DIR_INO         6        /* Undelete directory inode */
#define EXT4_RESIZE_INO                 7        /* Reserved group descriptors inode */
#define EXT4_JOURNAL_INO         8        /* Journal inode */

/* First non-reserved inode for old ext4 filesystems */
#define EXT4_GOOD_OLD_FIRST_INO        11

/*
 * Maximum number of links to a file
 */
#define EXT4_LINK_MAX                65000

/*
 * Macro-instructions used to manage several block sizes
 */
#define EXT4_MIN_BLOCK_SIZE                1024
#define        EXT4_MAX_BLOCK_SIZE                65536
#define EXT4_MIN_BLOCK_LOG_SIZE                10
#define EXT4_MAX_BLOCK_LOG_SIZE                16
#define EXT4_MAX_CLUSTER_LOG_SIZE        30
#ifdef __KERNEL__
# define EXT4_BLOCK_SIZE(s)                ((s)->s_blocksize)
#else
# define EXT4_BLOCK_SIZE(s)                (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
#endif
#define        EXT4_ADDR_PER_BLOCK(s)                (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
#define EXT4_CLUSTER_SIZE(s)                (EXT4_BLOCK_SIZE(s) << \
                                         EXT4_SB(s)->s_cluster_bits)
#ifdef __KERNEL__
# define EXT4_BLOCK_SIZE_BITS(s)        ((s)->s_blocksize_bits)
# define EXT4_CLUSTER_BITS(s)                (EXT4_SB(s)->s_cluster_bits)
#else
# define EXT4_BLOCK_SIZE_BITS(s)        ((s)->s_log_block_size + 10)
#endif
#ifdef __KERNEL__
#define        EXT4_ADDR_PER_BLOCK_BITS(s)        (EXT4_SB(s)->s_addr_per_block_bits)
#define EXT4_INODE_SIZE(s)                (EXT4_SB(s)->s_inode_size)
#define EXT4_FIRST_INO(s)                (EXT4_SB(s)->s_first_ino)
#else
#define EXT4_INODE_SIZE(s)        (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
                                 EXT4_GOOD_OLD_INODE_SIZE : \
                                 (s)->s_inode_size)
#define EXT4_FIRST_INO(s)        (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
                                 EXT4_GOOD_OLD_FIRST_INO : \
                                 (s)->s_first_ino)
#endif
/* Round @size up to a multiple of the block size (1 << @blkbits). */
#define EXT4_BLOCK_ALIGN(size, blkbits)                ALIGN((size), (1 << (blkbits)))
/*
 * Number of blocks touched by the byte range [@offset, @offset + @size):
 * the block-aligned end, in blocks, minus the index of the block that
 * contains @offset.
 *
 * Every argument is parenthesized so that expressions such as `a | b` or
 * `x + y` expand with the intended precedence.  @offset and @blkbits are
 * still evaluated more than once, so they must be free of side effects.
 */
#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
        ((EXT4_BLOCK_ALIGN((size) + (offset), (blkbits)) >> (blkbits)) - \
         ((offset) >> (blkbits)))

/*
 * Block <-> cluster helpers.  The first argument of each macro is a pointer
 * whose s_cluster_bits / s_cluster_ratio members are read (the ext4
 * superblock info, as returned by EXT4_SB()).
 */

/* Cluster number that contains block @b */
#define EXT4_B2C(info, b)                ((b) >> (info)->s_cluster_bits)
/* First block number of cluster @c */
#define EXT4_C2B(info, c)                ((c) << (info)->s_cluster_bits)
/* Count of clusters needed to hold @nblks blocks, rounded up */
#define EXT4_NUM_B2C(info, nblks) \
        (((nblks) + (info)->s_cluster_ratio - 1) >> (info)->s_cluster_bits)

/* Clear the in-cluster bits: first block of the cluster holding the block */
#define EXT4_PBLK_CMASK(info, pb) \
        ((pb) & ~((ext4_fsblk_t) (info)->s_cluster_ratio - 1))
#define EXT4_LBLK_CMASK(info, lb) \
        ((lb) & ~((ext4_lblk_t) (info)->s_cluster_ratio - 1))

/* Set the in-cluster bits: last block of the cluster holding the block */
#define EXT4_LBLK_CFILL(info, lb) \
        ((lb) | ((ext4_lblk_t) (info)->s_cluster_ratio - 1))

/* Keep only the in-cluster bits: offset of the block inside its cluster */
#define EXT4_PBLK_COFF(info, pb) \
        ((pb) & ((ext4_fsblk_t) (info)->s_cluster_ratio - 1))
#define EXT4_LBLK_COFF(info, lb) \
        ((lb) & ((ext4_lblk_t) (info)->s_cluster_ratio - 1))

/*
 * Structure of a block group descriptor
 *
 * By the field sizes below, the members up to and including bg_checksum
 * occupy 32 bytes (EXT4_MIN_DESC_SIZE) and the whole structure occupies
 * 64 bytes (EXT4_MIN_DESC_SIZE_64BIT).  The *_hi members carry the upper
 * halves of the corresponding *_lo members.
 */
struct ext4_group_desc
{
        __le32        bg_block_bitmap_lo;        /* Blocks bitmap block */
        __le32        bg_inode_bitmap_lo;        /* Inodes bitmap block */
        __le32        bg_inode_table_lo;        /* Inodes table block */
        __le16        bg_free_blocks_count_lo;/* Free blocks count */
        __le16        bg_free_inodes_count_lo;/* Free inodes count */
        __le16        bg_used_dirs_count_lo;        /* Directories count */
        __le16        bg_flags;                /* EXT4_BG_flags (INODE_UNINIT, etc) */
        __le32  bg_exclude_bitmap_lo;   /* Exclude bitmap for snapshots */
        __le16  bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */
        __le16  bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */
        __le16  bg_itable_unused_lo;        /* Unused inodes count */
        __le16  bg_checksum;                /* crc16(sb_uuid+group+desc) */
        /* ---- end of the first 32 bytes ---- */
        __le32        bg_block_bitmap_hi;        /* Blocks bitmap block MSB */
        __le32        bg_inode_bitmap_hi;        /* Inodes bitmap block MSB */
        __le32        bg_inode_table_hi;        /* Inodes table block MSB */
        __le16        bg_free_blocks_count_hi;/* Free blocks count MSB */
        __le16        bg_free_inodes_count_hi;/* Free inodes count MSB */
        __le16        bg_used_dirs_count_hi;        /* Directories count MSB */
        __le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
        __le32  bg_exclude_bitmap_hi;   /* Exclude bitmap block MSB */
        __le16  bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */
        __le16  bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */
        __u32   bg_reserved;
};

/*
 * Byte offset, within struct ext4_group_desc, of the first byte after the
 * bg_inode_bitmap_csum_hi / bg_block_bitmap_csum_hi member.
 * NOTE(review): presumably compared against the descriptor size to decide
 * whether the high checksum half is present -- confirm at the callers.
 */
#define EXT4_BG_INODE_BITMAP_CSUM_HI_END        \
        (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \
         sizeof(__le16))
#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END        \
        (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \
         sizeof(__le16))

/*
 * Structure of a flex block group info: three atomic counters (free
 * clusters as a 64-bit value, free inodes and used directories as
 * atomic_t).
 */

struct flex_groups {
        atomic64_t        free_clusters;
        atomic_t        free_inodes;
        atomic_t        used_dirs;
};

/* Bit values for ext4_group_desc.bg_flags */
#define EXT4_BG_INODE_UNINIT        0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT        0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED        0x0004 /* On-disk itable initialized to zero */

/*
 * Macro-instructions used to manage group descriptors
 *
 * The __KERNEL__ variants read values through EXT4_SB(s).  The non-kernel
 * branch reads/derives them from `s` itself and does not provide
 * EXT4_CLUSTERS_PER_GROUP() or EXT4_DESC_PER_BLOCK_BITS().
 */
#define EXT4_MIN_DESC_SIZE                32
#define EXT4_MIN_DESC_SIZE_64BIT        64
#define        EXT4_MAX_DESC_SIZE                EXT4_MIN_BLOCK_SIZE
#define EXT4_DESC_SIZE(s)                (EXT4_SB(s)->s_desc_size)
#ifdef __KERNEL__
# define EXT4_BLOCKS_PER_GROUP(s)        (EXT4_SB(s)->s_blocks_per_group)
# define EXT4_CLUSTERS_PER_GROUP(s)        (EXT4_SB(s)->s_clusters_per_group)
# define EXT4_DESC_PER_BLOCK(s)                (EXT4_SB(s)->s_desc_per_block)
# define EXT4_INODES_PER_GROUP(s)        (EXT4_SB(s)->s_inodes_per_group)
# define EXT4_DESC_PER_BLOCK_BITS(s)        (EXT4_SB(s)->s_desc_per_block_bits)
#else
# define EXT4_BLOCKS_PER_GROUP(s)        ((s)->s_blocks_per_group)
# define EXT4_DESC_PER_BLOCK(s)                (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s))
# define EXT4_INODES_PER_GROUP(s)        ((s)->s_inodes_per_group)
#endif

/*
 * Constants relative to the data blocks
 *
 * These evaluate to 12, 12, 13, 14 and 15 respectively; EXT4_N_BLOCKS is
 * the length of the i_block[] array in struct ext4_inode.
 */
#define        EXT4_NDIR_BLOCKS                12
#define        EXT4_IND_BLOCK                        EXT4_NDIR_BLOCKS
#define        EXT4_DIND_BLOCK                        (EXT4_IND_BLOCK + 1)
#define        EXT4_TIND_BLOCK                        (EXT4_DIND_BLOCK + 1)
#define        EXT4_N_BLOCKS                        (EXT4_TIND_BLOCK + 1)

/*
 * Inode flags
 *
 * Each value is a single bit; the matching bit numbers are the
 * EXT4_INODE_* enumerators, and ext4_check_flag_values() verifies the
 * correspondence at build time.
 */
#define        EXT4_SECRM_FL                        0x00000001 /* Secure deletion */
#define        EXT4_UNRM_FL                        0x00000002 /* Undelete */
#define        EXT4_COMPR_FL                        0x00000004 /* Compress file */
#define EXT4_SYNC_FL                        0x00000008 /* Synchronous updates */
#define EXT4_IMMUTABLE_FL                0x00000010 /* Immutable file */
#define EXT4_APPEND_FL                        0x00000020 /* writes to file may only append */
#define EXT4_NODUMP_FL                        0x00000040 /* do not dump file */
#define EXT4_NOATIME_FL                        0x00000080 /* do not update atime */
/* Reserved for compression usage... */
#define EXT4_DIRTY_FL                        0x00000100
#define EXT4_COMPRBLK_FL                0x00000200 /* One or more compressed clusters */
#define EXT4_NOCOMPR_FL                        0x00000400 /* Don't compress */
        /* nb: was previously EXT2_ECOMPR_FL */
#define EXT4_ENCRYPT_FL                        0x00000800 /* encrypted file */
/* End compression flags --- maybe not all used */
#define EXT4_INDEX_FL                        0x00001000 /* hash-indexed directory */
#define EXT4_IMAGIC_FL                        0x00002000 /* AFS directory */
#define EXT4_JOURNAL_DATA_FL                0x00004000 /* file data should be journaled */
#define EXT4_NOTAIL_FL                        0x00008000 /* file tail should not be merged */
#define EXT4_DIRSYNC_FL                        0x00010000 /* dirsync behaviour (directories only) */
#define EXT4_TOPDIR_FL                        0x00020000 /* Top of directory hierarchies */
#define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL                        0x00080000 /* Inode uses extents */
#define EXT4_VERITY_FL                        0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL                0x00200000 /* Inode used for large EA */
/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */

#define EXT4_DAX_FL                        0x02000000 /* Inode is DAX */

#define EXT4_INLINE_DATA_FL                0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL                0x20000000 /* Create with parent's projid */
#define EXT4_CASEFOLD_FL                0x40000000 /* Casefolded directory */
#define EXT4_RESERVED_FL                0x80000000 /* reserved for ext4 lib */

/*
 * User modifiable flags.  The literal 0x00400000 keeps the bit of the
 * former EXT4_EOFBLOCKS_FL in the mask even though the named flag is gone.
 */
#define EXT4_FL_USER_MODIFIABLE                (EXT4_SECRM_FL | \
                                         EXT4_UNRM_FL | \
                                         EXT4_COMPR_FL | \
                                         EXT4_SYNC_FL | \
                                         EXT4_IMMUTABLE_FL | \
                                         EXT4_APPEND_FL | \
                                         EXT4_NODUMP_FL | \
                                         EXT4_NOATIME_FL | \
                                         EXT4_JOURNAL_DATA_FL | \
                                         EXT4_NOTAIL_FL | \
                                         EXT4_DIRSYNC_FL | \
                                         EXT4_TOPDIR_FL | \
                                         EXT4_EXTENTS_FL | \
                                         0x00400000 /* EXT4_EOFBLOCKS_FL */ | \
                                         EXT4_DAX_FL | \
                                         EXT4_PROJINHERIT_FL | \
                                         EXT4_CASEFOLD_FL)

/* User visible flags: everything user-modifiable plus the flags below */
#define EXT4_FL_USER_VISIBLE                (EXT4_FL_USER_MODIFIABLE | \
                                         EXT4_DIRTY_FL | \
                                         EXT4_COMPRBLK_FL | \
                                         EXT4_NOCOMPR_FL | \
                                         EXT4_ENCRYPT_FL | \
                                         EXT4_INDEX_FL | \
                                         EXT4_VERITY_FL | \
                                         EXT4_INLINE_DATA_FL)

/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
                           EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
                           EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
                           EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
                           EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\
                           EXT4_DAX_FL)

/*
 * Flags that are appropriate for regular files (all but dir-specific ones).
 * Note this is a complement mask: it has every bit set except the four
 * listed flags.
 */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
                           EXT4_PROJINHERIT_FL))

/* Flags that are appropriate for non-directories/regular files. */
#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)

/* The only flags that should be swapped */
#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)

/* Flags which are mutually exclusive to DAX */
#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
                           EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL)

/*
 * Restrict @flags to those that make sense for an inode of type @mode:
 * directories keep every flag, regular files keep the bits in
 * EXT4_REG_FLMASK, and any other inode type keeps only the bits in
 * EXT4_OTHER_FLMASK.
 */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
        __u32 allowed = EXT4_OTHER_FLMASK;

        if (S_ISDIR(mode))
                return flags;

        if (S_ISREG(mode))
                allowed = EXT4_REG_FLMASK;

        return flags & allowed;
}

/*
 * Inode flags used for atomic set/get
 *
 * These are bit numbers; the matching EXT4_*_FL value is
 * (1U << EXT4_INODE_*).
 */
enum {
        EXT4_INODE_SECRM        = 0,        /* Secure deletion */
        EXT4_INODE_UNRM                = 1,        /* Undelete */
        EXT4_INODE_COMPR        = 2,        /* Compress file */
        EXT4_INODE_SYNC                = 3,        /* Synchronous updates */
        EXT4_INODE_IMMUTABLE        = 4,        /* Immutable file */
        EXT4_INODE_APPEND        = 5,        /* writes to file may only append */
        EXT4_INODE_NODUMP        = 6,        /* do not dump file */
        EXT4_INODE_NOATIME        = 7,        /* do not update atime */
/* Reserved for compression usage... */
        EXT4_INODE_DIRTY        = 8,
        EXT4_INODE_COMPRBLK        = 9,        /* One or more compressed clusters */
        EXT4_INODE_NOCOMPR        = 10,        /* Don't compress */
        EXT4_INODE_ENCRYPT        = 11,        /* Encrypted file */
/* End compression flags --- maybe not all used */
        EXT4_INODE_INDEX        = 12,        /* hash-indexed directory */
        EXT4_INODE_IMAGIC        = 13,        /* AFS directory */
        EXT4_INODE_JOURNAL_DATA        = 14,        /* file data should be journaled */
        EXT4_INODE_NOTAIL        = 15,        /* file tail should not be merged */
        EXT4_INODE_DIRSYNC        = 16,        /* dirsync behaviour (directories only) */
        EXT4_INODE_TOPDIR        = 17,        /* Top of directory hierarchies */
        EXT4_INODE_HUGE_FILE        = 18,        /* Set to each huge file */
        EXT4_INODE_EXTENTS        = 19,        /* Inode uses extents */
        EXT4_INODE_VERITY        = 20,        /* Verity protected inode */
        EXT4_INODE_EA_INODE        = 21,        /* Inode used for large EA */
/* 22 was formerly EXT4_INODE_EOFBLOCKS */
        EXT4_INODE_DAX                = 25,        /* Inode is DAX */
        EXT4_INODE_INLINE_DATA        = 28,        /* Data in inode. */
        EXT4_INODE_PROJINHERIT        = 29,        /* Create with parent's projid */
        EXT4_INODE_CASEFOLD        = 30,        /* Casefolded directory */
        EXT4_INODE_RESERVED        = 31,        /* reserved for ext4 lib */
};

/*
 * Since it's pretty easy to mix up bit numbers and hex values, we use a
 * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
 * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
 * any extra space in the compiled kernel image, otherwise, the build will fail.
 * It's important that these values are the same, since we are using
 * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
 * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
 * values found in ext2, ext3 and ext4 filesystems, and of course the values
 * defined in e2fsprogs.
 *
 * It's not paranoia if Murphy's Law really *is* out to get you.  :-)
 */
/* True when EXT4_<FLAG>_FL equals the single bit numbered EXT4_INODE_<FLAG>. */
#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG))
/* Break the build (via BUILD_BUG_ON) when the two definitions disagree. */
#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))

static inline void ext4_check_flag_values(void)
{
        CHECK_FLAG_VALUE(SECRM);
        CHECK_FLAG_VALUE(UNRM);
        CHECK_FLAG_VALUE(COMPR);
        CHECK_FLAG_VALUE(SYNC);
        CHECK_FLAG_VALUE(IMMUTABLE);
        CHECK_FLAG_VALUE(APPEND);
        CHECK_FLAG_VALUE(NODUMP);
        CHECK_FLAG_VALUE(NOATIME);
        CHECK_FLAG_VALUE(DIRTY);
        CHECK_FLAG_VALUE(COMPRBLK);
        CHECK_FLAG_VALUE(NOCOMPR);
        CHECK_FLAG_VALUE(ENCRYPT);
        CHECK_FLAG_VALUE(INDEX);
        CHECK_FLAG_VALUE(IMAGIC);
        CHECK_FLAG_VALUE(JOURNAL_DATA);
        CHECK_FLAG_VALUE(NOTAIL);
        CHECK_FLAG_VALUE(DIRSYNC);
        CHECK_FLAG_VALUE(TOPDIR);
        CHECK_FLAG_VALUE(HUGE_FILE);
        CHECK_FLAG_VALUE(EXTENTS);
        CHECK_FLAG_VALUE(VERITY);
        CHECK_FLAG_VALUE(EA_INODE);
        CHECK_FLAG_VALUE(INLINE_DATA);
        CHECK_FLAG_VALUE(PROJINHERIT);
        CHECK_FLAG_VALUE(CASEFOLD);
        CHECK_FLAG_VALUE(RESERVED);
}

#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
 * Compat layout of the "new group" input; it is the argument type of
 * EXT4_IOC32_GROUP_ADD.  The 64-bit members use compat_u64.
 */
struct compat_ext4_new_group_input {
        u32 group;
        compat_u64 block_bitmap;
        compat_u64 inode_bitmap;
        compat_u64 inode_table;
        u32 blocks_count;
        u16 reserved_blocks;
        u16 unused;
};
#endif

/*
 * The struct ext4_new_group_input in kernel space, with mdata_blocks and
 * free_clusters_count added.
 */
struct ext4_new_group_data {
        __u32 group;
        __u64 block_bitmap;
        __u64 inode_bitmap;
        __u64 inode_table;
        __u32 blocks_count;
        __u16 reserved_blocks;
        __u16 mdata_blocks;
        __u32 free_clusters_count;
};

/*
 * Index of each per-group metadata table in ext4_new_group_data;
 * GROUP_TABLE_COUNT is the number of such tables.
 */
enum {
        BLOCK_BITMAP      = 0,        /* block bitmap */
        INODE_BITMAP      = 1,        /* inode bitmap */
        INODE_TABLE       = 2,        /* inode tables */
        GROUP_TABLE_COUNT = 3,
};

/*
 * Flags used by ext4_map_blocks()
 */
        /* Allocate any needed blocks and/or convert an unwritten
           extent to be an initialized extent */
#define EXT4_GET_BLOCKS_CREATE                        0x0001
        /* Request the creation of an unwritten extent */
#define EXT4_GET_BLOCKS_UNWRIT_EXT                0x0002
#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT        (EXT4_GET_BLOCKS_UNWRIT_EXT|\
                                                 EXT4_GET_BLOCKS_CREATE)
        /* Caller is from the delayed allocation writeout path
         * finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE        0x0004
        /* Caller is from the direct IO path: request creation of
         * unwritten extents if not allocated, or split the unwritten
         * extent if blocks have been preallocated already */
#define EXT4_GET_BLOCKS_PRE_IO                        0x0008
#define EXT4_GET_BLOCKS_CONVERT                        0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT                (EXT4_GET_BLOCKS_PRE_IO|\
                                         EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
        /* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT                (EXT4_GET_BLOCKS_CONVERT|\
                                         EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
        /* Eventual metadata allocation (due to growing extent tree)
         * should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL                0x0020
        /* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE                0x0040
        /* Convert written extents to unwritten */
#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN        0x0100
        /* Write zeros to newly created written extents */
#define EXT4_GET_BLOCKS_ZERO                        0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO                (EXT4_GET_BLOCKS_CREATE |\
                                        EXT4_GET_BLOCKS_ZERO)
        /* Caller will submit data before dropping transaction handle. This
         * allows jbd2 to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT                0x0400
        /* Caller is in atomic context; find the extent only if it has
         * been cached */
#define EXT4_GET_BLOCKS_CACHED_NOWAIT                0x0800

/*
 * The bit position of these flags must not overlap with any of the
 * EXT4_GET_BLOCKS_*.  They are used by ext4_find_extent(),
 * read_extent_tree_block(), ext4_split_extent_at(),
 * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
 * EXT4_EX_NOCACHE is used to indicate that we shouldn't be
 * caching the extents when reading from the extent tree while a
 * truncate or punch hole operation is in progress.
 */
#define EXT4_EX_NOCACHE                                0x40000000
#define EXT4_EX_FORCE_CACHE                        0x20000000
#define EXT4_EX_NOFAIL                                0x10000000

/*
 * Flags used by ext4_free_blocks
 *
 * Each value is a distinct single bit, so the flags can be OR-ed together.
 */
#define EXT4_FREE_BLOCKS_METADATA                0x0001
#define EXT4_FREE_BLOCKS_FORGET                        0x0002
#define EXT4_FREE_BLOCKS_VALIDATED                0x0004
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE                0x0008
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER        0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER        0x0020
#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER      0x0040

#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
 * ioctl commands in 32 bit emulation
 *
 * All use ioctl type 'f'; the argument is an int or unsigned int, except
 * GROUP_ADD which takes struct compat_ext4_new_group_input.  The *_OLD
 * names alias the generic FS_IOC32_{GET,SET}VERSION commands.
 */
#define EXT4_IOC32_GETVERSION                _IOR('f', 3, int)
#define EXT4_IOC32_SETVERSION                _IOW('f', 4, int)
#define EXT4_IOC32_GETRSVSZ                _IOR('f', 5, int)
#define EXT4_IOC32_SETRSVSZ                _IOW('f', 6, int)
#define EXT4_IOC32_GROUP_EXTEND                _IOW('f', 7, unsigned int)
#define EXT4_IOC32_GROUP_ADD                _IOW('f', 8, struct compat_ext4_new_group_input)
#define EXT4_IOC32_GETVERSION_OLD        FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD        FS_IOC32_SETVERSION
#endif

/* Max physical block we can address w/o extents (largest 32-bit value) */
#define EXT4_MAX_BLOCK_FILE_PHYS        0xFFFFFFFF

/* Max logical block we can support (one below the largest 32-bit value) */
#define EXT4_MAX_LOGICAL_BLOCK                0xFFFFFFFE

/*
 * Structure of an inode on the disk
 *
 * By the field sizes below, everything up to and including the osd2 union
 * occupies 128 bytes, so i_extra_isize sits at byte offset 128 (presumably
 * EXT4_GOOD_OLD_INODE_SIZE -- that constant is defined elsewhere).  Use
 * EXT4_FITS_IN_INODE() before touching any member after i_extra_isize.
 */
struct ext4_inode {
        __le16        i_mode;                /* File mode */
        __le16        i_uid;                /* Low 16 bits of Owner Uid */
        __le32        i_size_lo;        /* Size in bytes */
        __le32        i_atime;        /* Access time */
        __le32        i_ctime;        /* Inode Change time */
        __le32        i_mtime;        /* Modification time */
        __le32        i_dtime;        /* Deletion Time */
        __le16        i_gid;                /* Low 16 bits of Group Id */
        __le16        i_links_count;        /* Links count */
        __le32        i_blocks_lo;        /* Blocks count */
        __le32        i_flags;        /* File flags */
        union {
                struct {
                        __le32  l_i_version;
                } linux1;
                struct {
                        __u32  h_i_translator;
                } hurd1;
                struct {
                        __u32  m_i_reserved1;
                } masix1;
        } osd1;                                /* OS dependent 1 */
        __le32        i_block[EXT4_N_BLOCKS];/* Pointers to blocks */
        __le32        i_generation;        /* File version (for NFS) */
        __le32        i_file_acl_lo;        /* File ACL */
        __le32        i_size_high;
        __le32        i_obso_faddr;        /* Obsoleted fragment address */
        union {
                struct {
                        __le16        l_i_blocks_high; /* were l_i_reserved1 */
                        __le16        l_i_file_acl_high;
                        __le16        l_i_uid_high;        /* these 2 fields */
                        __le16        l_i_gid_high;        /* were reserved2[0] */
                        __le16        l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */
                        __le16        l_i_reserved;
                } linux2;
                struct {
                        __le16        h_i_reserved1;        /* Obsoleted fragment number/size which are removed in ext4 */
                        __u16        h_i_mode_high;
                        __u16        h_i_uid_high;
                        __u16        h_i_gid_high;
                        __u32        h_i_author;
                } hurd2;
                struct {
                        /* NOTE(review): h_ prefix inside the masix variant looks like a copy of the hurd2 name */
                        __le16        h_i_reserved1;        /* Obsoleted fragment number/size which are removed in ext4 */
                        __le16        m_i_file_acl_high;
                        __u32        m_i_reserved2[2];
                } masix2;
        } osd2;                                /* OS dependent 2 */
        __le16        i_extra_isize;
        __le16        i_checksum_hi;        /* crc32c(uuid+inum+inode) BE */
        __le32  i_ctime_extra;  /* extra Change time      (nsec << 2 | epoch) */
        __le32  i_mtime_extra;  /* extra Modification time(nsec << 2 | epoch) */
        __le32  i_atime_extra;  /* extra Access time      (nsec << 2 | epoch) */
        __le32  i_crtime;       /* File Creation time */
        __le32  i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
        __le32  i_version_hi;        /* high 32 bits for 64-bit version */
        __le32        i_projid;        /* Project ID */
};

/*
 * Layout of the *_extra timestamp words: the low EXT4_EPOCH_BITS bits hold
 * the epoch (mask 0x3), the remaining upper bits hold the nanoseconds.
 */
#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
#define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)

/*
 * Extended fields will fit into an inode if the filesystem was formatted
 * with large inodes (-I 256 or larger) and there are not currently any EAs
 * consuming all of the available space. For new inodes we always reserve
 * enough space for the kernel's known extended fields, but for inodes
 * created with an old kernel this might not have been the case. None of
 * the extended inode fields is critical for correct filesystem operation.
 * This macro checks if a certain field fits in the inode. Note that
 * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize
 *
 * @ext4_inode: pointer to the raw on-disk inode (only its type is used)
 * @einode:     pointer to an object with an i_extra_isize member
 * @field:      member of *@ext4_inode to test
 *
 * Evaluates to non-zero when the end of @field lies within
 * EXT4_GOOD_OLD_INODE_SIZE + @einode->i_extra_isize bytes.  The definition
 * ends without a line continuation so the following source line can never
 * be spliced into the macro.
 */
#define EXT4_FITS_IN_INODE(ext4_inode, einode, field)        \
        ((offsetof(typeof(*(ext4_inode)), field) +        \
          sizeof((ext4_inode)->field))                        \
        <= (EXT4_GOOD_OLD_INODE_SIZE +                        \
            (einode)->i_extra_isize))

/*
 * We use an encoding that preserves the times for extra epoch "00":
 *
 * extra  msb of                         adjust for signed
 * epoch  32-bit                         32-bit tv_sec to
 * bits   time    decoded 64-bit tv_sec  64-bit tv_sec      valid time range
 * 0 0    1    -0x80000000..-0x00000001  0x000000000 1901-12-13..1969-12-31
 * 0 0    0    0x000000000..0x07fffffff  0x000000000 1970-01-01..2038-01-19
 * 0 1    1    0x080000000..0x0ffffffff  0x100000000 2038-01-19..2106-02-07
 * 0 1    0    0x100000000..0x17fffffff  0x100000000 2106-02-07..2174-02-25
 * 1 0    1    0x180000000..0x1ffffffff  0x200000000 2174-02-25..2242-03-16
 * 1 0    0    0x200000000..0x27fffffff  0x200000000 2242-03-16..2310-04-04
 * 1 1    1    0x280000000..0x2ffffffff  0x300000000 2310-04-04..2378-04-22
 * 1 1    0    0x300000000..0x37fffffff  0x300000000 2378-04-22..2446-05-10
 *
 * Note that previous versions of the kernel on 64-bit systems would
 * incorrectly use extra epoch bits 1,1 for dates between 1901 and
 * 1970.  e2fsck will correct this, assuming that it is run on the
 * affected filesystem before 2242.
 */

static inline __le32 ext4_encode_extra_time(struct timespec64 ts)
{
        u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK;
        return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS));
}

/*
 * Inverse of ext4_encode_extra_time(): combine the on-disk 32-bit seconds
 * word @base with the on-disk @extra word (nsec << 2 | epoch) into a
 * timespec64.  @base is interpreted as signed; non-zero epoch bits are
 * added in above bit 31.
 */
static inline struct timespec64 ext4_decode_extra_time(__le32 base,
                                                       __le32 extra)
{
        const u32 extra_cpu = le32_to_cpu(extra);
        struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) };

        ts.tv_nsec = (extra_cpu & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
        if (unlikely(extra_cpu & EXT4_EPOCH_MASK))
                ts.tv_sec += (u64)(extra_cpu & EXT4_EPOCH_MASK) << 32;

        return ts;
}

/*
 * Store timestamp @ts into field @xtime of the on-disk inode @raw_inode.
 *
 * When the matching <xtime>_extra field fits in the inode, the seconds are
 * stored in @xtime and the epoch/nanosecond word in <xtime>_extra.
 * Otherwise only @xtime is written, with the seconds clamped to the signed
 * 32-bit range.
 *
 * @ts is copied into a local first so that it is evaluated exactly once;
 * the ATIME/MTIME/CTIME wrappers pass a function call here.
 */
#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts)                        \
do {                                                                                \
        struct timespec64 __ext4_ts = (ts);                                        \
                                                                                \
        if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {        \
                (raw_inode)->xtime = cpu_to_le32(__ext4_ts.tv_sec);                \
                (raw_inode)->xtime ## _extra =                                        \
                        ext4_encode_extra_time(__ext4_ts);                        \
        } else {                                                                \
                (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t,                \
                                __ext4_ts.tv_sec, S32_MIN, S32_MAX));                \
        }                                                                        \
} while (0)

/*
 * Copy the atime / mtime / ctime obtained from @inode through
 * inode_get_{a,m,c}time() into the matching field of @raw_inode.
 */
#define EXT4_INODE_SET_ATIME(inode, raw_inode)                                                \
        EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode))

#define EXT4_INODE_SET_MTIME(inode, raw_inode)                                                \
        EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode))

#define EXT4_INODE_SET_CTIME(inode, raw_inode)                                                \
        EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))

/*
 * Store (@einode)->@xtime into @raw_inode, but only when field @xtime itself
 * fits in the on-disk inode; otherwise nothing is written.
 *
 * Wrapped in do { } while (0) so the macro is a single statement: used as
 * the body of an un-braced if, a following else pairs with the caller's if
 * rather than with the if inside the macro.
 */
#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode)                                \
do {                                                                                \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))                        \
                EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode),                \
                                         raw_inode, (einode)->xtime);                \
} while (0)

/*
 * Expression yielding the timespec64 stored in field @xtime of @raw_inode.
 * When <xtime>_extra fits in the inode both words are decoded; otherwise
 * only the signed 32-bit seconds are used and tv_nsec is left zero by the
 * compound literal.
 */
#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode)                        \
        (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ?        \
                ext4_decode_extra_time((raw_inode)->xtime,                                \
                                       (raw_inode)->xtime ## _extra) :                \
                (struct timespec64) {                                                \
                        .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime)        \
                })

/*
 * Load the atime / mtime / ctime stored in @raw_inode and hand it to
 * inode_set_{a,m,c}time_to_ts() for @inode.
 */
#define EXT4_INODE_GET_ATIME(inode, raw_inode)                                        \
do {                                                                                \
        inode_set_atime_to_ts(inode,                                                \
                EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode));                \
} while (0)

#define EXT4_INODE_GET_MTIME(inode, raw_inode)                                        \
do {                                                                                \
        inode_set_mtime_to_ts(inode,                                                \
                EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode));                \
} while (0)

#define EXT4_INODE_GET_CTIME(inode, raw_inode)                                        \
do {                                                                                \
        inode_set_ctime_to_ts(inode,                                                \
                EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode));                \
} while (0)

/*
 * Load field @xtime of @raw_inode into (@einode)->@xtime.  When the field
 * does not fit in the on-disk inode, (@einode)->@xtime is set to {0, 0}.
 *
 * @einode is parenthesized at every use so that a pointer expression such
 * as `base + 1` expands correctly.
 */
#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)                                \
do {                                                                                \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))                         \
                (einode)->xtime =                                                \
                        EXT4_INODE_GET_XTIME_VAL(xtime, &((einode)->vfs_inode),        \
                                                 raw_inode);                        \
        else                                                                        \
                (einode)->xtime = (struct timespec64){0, 0};                        \
} while (0)

/*
 * Short names for members of the OS-dependent unions (osd1/osd2) of
 * struct ext4_inode, selected per platform.
 */
#define i_disk_version osd1.linux1.l_i_version

#if defined(__KERNEL__) || defined(__linux__)
/*
 * NOTE(review): struct ext4_inode's linux1 variant only declares
 * l_i_version, so i_reserved1 names a member that does not exist and
 * cannot be used as-is; it looks like a leftover.
 */
#define i_reserved1        osd1.linux1.l_i_reserved1
#define i_file_acl_high        osd2.linux2.l_i_file_acl_high
#define i_blocks_high        osd2.linux2.l_i_blocks_high
#define i_uid_low        i_uid
#define i_gid_low        i_gid
#define i_uid_high        osd2.linux2.l_i_uid_high
#define i_gid_high        osd2.linux2.l_i_gid_high
#define i_checksum_lo        osd2.linux2.l_i_checksum_lo

#elif defined(__GNU__)

#define i_translator        osd1.hurd1.h_i_translator
#define i_uid_high        osd2.hurd2.h_i_uid_high
#define i_gid_high        osd2.hurd2.h_i_gid_high
#define i_author        osd2.hurd2.h_i_author

#elif defined(__masix__)

#define i_reserved1        osd1.masix1.m_i_reserved1
#define i_file_acl_high        osd2.masix2.m_i_file_acl_high
#define i_reserved2        osd2.masix2.m_i_reserved2

#endif /* defined(__KERNEL__) || defined(__linux__) */

#include "extents_status.h"
#include "fast_commit.h"

/*
 * Lock subclasses for i_data_sem in the ext4_inode_info structure.
 *
 * These are needed to avoid lockdep false positives when we need to
 * allocate blocks to the quota inode during ext4_map_blocks(), while
 * holding i_data_sem for a normal (non-quota) inode.  Since we don't
 * do quota tracking for the quota inode, this avoids deadlock (as
 * well as infinite recursion, since it isn't turtles all the way
 * down...)
 *
 *  I_DATA_SEM_NORMAL - Used for most inodes
 *  I_DATA_SEM_OTHER  - Used by move_extent.c for the second normal inode
 *                          where the second inode has larger inode number
 *                          than the first
 *  I_DATA_SEM_QUOTA  - Used for quota inodes only
 *  I_DATA_SEM_EA     - Used for ea_inodes only
 */
enum {
        I_DATA_SEM_NORMAL = 0,        /* most inodes */
        I_DATA_SEM_OTHER  = 1,        /* second normal inode of a pair */
        I_DATA_SEM_QUOTA  = 2,        /* quota inodes only */
        I_DATA_SEM_EA     = 3,        /* ea_inodes only */
};


/*
 * fourth extended file system inode data in memory
 *
 * ext4-private per-inode state.  The generic inode is embedded as the
 * vfs_inode member rather than referenced through a pointer.
 */
struct ext4_inode_info {
        __le32        i_data[15];        /* unconverted */
        __u32        i_dtime;
        ext4_fsblk_t        i_file_acl;

        /*
         * i_block_group is the number of the block group which contains
         * this file's inode.  Constant across the lifetime of the inode,
         * it is used for making block allocation decisions - we try to
         * place a file's data blocks near its inode block, and new inodes
         * near to their parent directory's inode.
         */
        ext4_group_t        i_block_group;
        ext4_lblk_t        i_dir_start_lookup;
        /* i_state_flags only exists as a separate word when BITS_PER_LONG < 64 */
#if (BITS_PER_LONG < 64)
        unsigned long        i_state_flags;                /* Dynamic state flags */
#endif
        unsigned long        i_flags;

        /*
         * Extended attributes can be read independently of the main file
         * data. Taking i_rwsem even when reading would cause contention
         * between readers of EAs and writers of regular file data, so
         * instead we synchronize on xattr_sem when reading or changing
         * EAs.
         */
        struct rw_semaphore xattr_sem;

        /*
         * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise
         * i_orphan is used.
         */
        union {
                struct list_head i_orphan;        /* unlinked but open inodes */
                unsigned int i_orphan_idx;        /* Index in orphan file */
        };

        /* Fast commit related info */

        /* For tracking dentry create updates */
        struct list_head i_fc_dilist;
        struct list_head i_fc_list;        /*
                                         * inodes that need fast commit
                                         * protected by sbi->s_fc_lock.
                                         */

        /* Start of lblk range that needs to be committed in this fast commit */
        ext4_lblk_t i_fc_lblk_start;

        /*
         * NOTE(review): previously described as the end of the lblk range
         * that needs to be committed in this fast commit, but the _len
         * suffix suggests a length counted from i_fc_lblk_start - confirm
         * against the users of this field.
         */
        ext4_lblk_t i_fc_lblk_len;

        /* Number of ongoing updates on this inode */
        atomic_t  i_fc_updates;

        /* Fast commit wait queue for this inode */
        wait_queue_head_t i_fc_wait;

        /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
        struct mutex i_fc_lock;

        /*
         * i_disksize keeps track of what the inode size is ON DISK, not
         * in memory.  During truncate, i_size is set to the new size by
         * the VFS prior to calling ext4_truncate(), but the filesystem won't
         * set i_disksize to 0 until the truncate is actually under way.
         *
         * The intent is that i_disksize always represents the blocks which
         * are used by this file.  This allows recovery to restart truncate
         * on orphans if we crash during truncate.  We actually write i_disksize
         * into the on-disk inode when writing inodes out, instead of i_size.
         *
         * The only time when i_disksize and i_size may be different is when
         * a truncate is in progress.  The only things which change i_disksize
         * are ext4_get_block (growth) and ext4_truncate (shrinkth).
         */
        loff_t        i_disksize;

        /*
         * i_data_sem is for serialising ext4_truncate() against
         * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
         * data tree are chopped off during truncate. We can't do that in
         * ext4 because whenever we perform intermediate commits during
         * truncate, the inode and all the metadata blocks *must* be in a
         * consistent state which allows truncation of the orphans to restart
         * during recovery.  Hence we must fix the get_block-vs-truncate race
         * by other means, so we have i_data_sem.
         */
        struct rw_semaphore i_data_sem;
        /* The generic inode, embedded by value */
        struct inode vfs_inode;
        struct jbd2_inode *jinode;

        spinlock_t i_raw_lock;        /* protects updates to the raw inode */

        /*
         * File creation time. Its function is same as that of
         * struct timespec64 i_{a,c,m}time in the generic inode.
         */
        struct timespec64 i_crtime;

        /* mballoc */
        atomic_t i_prealloc_active;
        struct rb_root i_prealloc_node;
        rwlock_t i_prealloc_lock;

        /* extents status tree */
        struct ext4_es_tree i_es_tree;
        rwlock_t i_es_lock;
        struct list_head i_es_list;
        unsigned int i_es_all_nr;        /* protected by i_es_lock */
        unsigned int i_es_shk_nr;        /* protected by i_es_lock */
        ext4_lblk_t i_es_shrink_lblk;        /* Offset where we start searching for
                                           extents to shrink. Protected by
                                           i_es_lock  */

        /* ialloc */
        ext4_group_t        i_last_alloc_group;

        /* allocation reservation info for delalloc */
        /* In case of bigalloc, this refers to clusters rather than blocks */
        unsigned int i_reserved_data_blocks;

        /* pending cluster reservations for bigalloc file systems */
        struct ext4_pending_tree i_pending_tree;

        /* on-disk additional length */
        __u16 i_extra_isize;

        /* Indicate the inline data space. */
        u16 i_inline_off;
        u16 i_inline_size;

#ifdef CONFIG_QUOTA
        /* quota space reservation, managed internally by quota code */
        qsize_t i_reserved_quota;
#endif

        /* Lock protecting lists below */
        spinlock_t i_completed_io_lock;
        /*
         * Completed IOs that need unwritten extents handling and have
         * transaction reserved
         */
        struct list_head i_rsv_conversion_list;
        struct work_struct i_rsv_conversion_work;
        atomic_t i_unwritten; /* Nr. of inflight conversions pending */

        spinlock_t i_block_reservation_lock;

        /*
         * Transactions that contain inode's metadata needed to complete
         * fsync and fdatasync, respectively.
         */
        tid_t i_sync_tid;
        tid_t i_datasync_tid;

#ifdef CONFIG_QUOTA
        /* One RCU-annotated dquot pointer per quota type (MAXQUOTAS entries) */
        struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif

        /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
        __u32 i_csum_seed;

        kprojid_t i_projid;
};

/*
 * File system state bits
 */
#define EXT4_VALID_FS        0x0001        /* was unmounted cleanly */
#define EXT4_ERROR_FS        0x0002        /* errors have been detected */
#define EXT4_ORPHAN_FS        0x0004        /* orphans are being recovered */
#define EXT4_FC_REPLAY        0x0020        /* fast commit replay is ongoing */

/*
 * Miscellaneous filesystem flag bits
 */
#define EXT2_FLAGS_SIGNED_HASH                0x0001        /* signed dirhash in use */
#define EXT2_FLAGS_UNSIGNED_HASH        0x0002        /* unsigned dirhash in use */
#define EXT2_FLAGS_TEST_FILESYS                0x0004        /* for testing development code */

/*
 * Mount flag bits, set from mount options or from defaults.  The
 * set_opt()/clear_opt()/test_opt() helpers paste the part of the name after
 * EXT4_MOUNT_ onto that prefix.
 */
#define EXT4_MOUNT_NO_MBCACHE                0x00001        /* do not use mbcache */
#define EXT4_MOUNT_GRPID                0x00004        /* new files get the directory's group */
#define EXT4_MOUNT_DEBUG                0x00008        /* emit some debugging messages */

/* Error policy: one bit per behaviour, plus a mask covering all three */
#define EXT4_MOUNT_ERRORS_CONT                0x00010        /* continue on errors */
#define EXT4_MOUNT_ERRORS_RO                0x00020        /* remount fs read-only on errors */
#define EXT4_MOUNT_ERRORS_PANIC                0x00040        /* panic on errors */
#define EXT4_MOUNT_ERRORS_MASK                0x00070

#define EXT4_MOUNT_MINIX_DF                0x00080        /* mimic the Minix statfs */
#define EXT4_MOUNT_NOLOAD                0x00100        /* do not use the existing journal */

/* Direct Access; the flag collapses to 0 when CONFIG_FS_DAX is not set */
#ifdef CONFIG_FS_DAX
#define EXT4_MOUNT_DAX_ALWAYS                0x00200
#else
#define EXT4_MOUNT_DAX_ALWAYS                0
#endif

/* Data write mode: a two-bit field selected by EXT4_MOUNT_DATA_FLAGS */
#define EXT4_MOUNT_DATA_FLAGS                0x00C00
#define EXT4_MOUNT_JOURNAL_DATA                0x00400        /* write data to the journal */
#define EXT4_MOUNT_ORDERED_DATA                0x00800        /* flush data before commit */
#define EXT4_MOUNT_WRITEBACK_DATA        0x00C00        /* no data ordering */

#define EXT4_MOUNT_UPDATE_JOURNAL        0x01000        /* update the journal format */
#define EXT4_MOUNT_NO_UID32                0x02000        /* disable 32-bit UIDs */
#define EXT4_MOUNT_XATTR_USER                0x04000        /* extended user attributes */
#define EXT4_MOUNT_POSIX_ACL                0x08000        /* POSIX Access Control Lists */
#define EXT4_MOUNT_NO_AUTO_DA_ALLOC        0x10000        /* no auto delalloc mapping */
#define EXT4_MOUNT_BARRIER                0x20000        /* use block barriers */
#define EXT4_MOUNT_QUOTA                0x40000        /* some quota option is set */

/* "old" user quota; enables enforcement for hidden quota files */
#define EXT4_MOUNT_USRQUOTA                0x80000
/* "old" group quota; enables enforcement for hidden quota files */
#define EXT4_MOUNT_GRPQUOTA                0x100000
/* enable project quota enforcement */
#define EXT4_MOUNT_PRJQUOTA                0x200000

#define EXT4_MOUNT_DIOREAD_NOLOCK        0x400000        /* dio read nolocking support */
#define EXT4_MOUNT_JOURNAL_CHECKSUM        0x800000        /* journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT        0x1000000        /* journal async commit */
#define EXT4_MOUNT_WARN_ON_ERROR        0x2000000        /* trigger WARN_ON on error */
/* NOTE(review): undocumented; name suggests block bitmap prefetch is off */
#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC                0x8000000        /* delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT        0x10000000        /* abort on file data write error */
#define EXT4_MOUNT_BLOCK_VALIDITY        0x20000000        /* block validity checking */
#define EXT4_MOUNT_DISCARD                0x40000000        /* issue DISCARD requests */
#define EXT4_MOUNT_INIT_INODE_TABLE        0x80000000        /* initialize uninitialized itables */

/*
 * Second word of mount flags.  These are set automatically (they cannot be
 * chosen with a mount option) from a per file system feature or property, or
 * in special cases such as telling an explicitly given mount option apart
 * from a default.
 */
/* user explicitly specified delalloc */
#define EXT4_MOUNT2_EXPLICIT_DELALLOC        0x00000001
/* group size is the standard blocksize * 8 blocks */
#define EXT4_MOUNT2_STD_GROUP_SIZE        0x00000002
/* HURD compatibility */
#define EXT4_MOUNT2_HURD_COMPAT                0x00000004
/* user explicitly specified journal checksum */
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM        0x00000008

/* journal fast commit */
#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT        0x00000010
/* do not allow Direct Access */
#define EXT4_MOUNT2_DAX_NEVER                0x00000020
/* used for printing options only */
#define EXT4_MOUNT2_DAX_INODE                0x00000040
/* optimize group scanning in mballoc */
#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN        0x00000080
/* abort the filesystem */
#define EXT4_MOUNT2_ABORT                0x00000100

/*
 * Mount option helpers.
 *
 * @sb:  argument handed to EXT4_SB(); the result must point at an object
 *       with s_mount_opt / s_mount_opt2 members.
 * @opt: flag name without its prefix; it is pasted onto EXT4_MOUNT_ (or
 *       EXT4_MOUNT2_ for the *_opt2 variants).
 *
 * clear_*() and set_*() update the option word in place and evaluate to its
 * new value; test_*() evaluates to the masked bits (non-zero when set).
 *
 * Every expansion is fully parenthesized so the helpers behave as a single
 * operand when used inside a larger expression.
 */
#define clear_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt &= \
                                                ~EXT4_MOUNT_##opt)
#define set_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt |= \
                                                EXT4_MOUNT_##opt)
#define test_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt & \
                                         EXT4_MOUNT_##opt)

#define clear_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 &= \
                                                ~EXT4_MOUNT2_##opt)
#define set_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 |= \
                                                EXT4_MOUNT2_##opt)
#define test_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 & \
                                         EXT4_MOUNT2_##opt)

/*
 * ext4 names for bit operations: each is a plain alias for the little-endian
 * (_le) helper on the right.  The set/clear aliases map to the
 * double-underscore variants; the test/find aliases do not.
 */
#define ext4_test_and_set_bit                __test_and_set_bit_le
#define ext4_set_bit                        __set_bit_le
#define ext4_test_and_clear_bit                __test_and_clear_bit_le
#define ext4_clear_bit                        __clear_bit_le
#define ext4_test_bit                        test_bit_le
#define ext4_find_next_zero_bit                find_next_zero_bit_le
#define ext4_find_next_bit                find_next_bit_le

/*
 * Prototype only; the definition lives elsewhere.
 * NOTE(review): the parameter names suggest that @len bits starting at bit
 * @cur are set in bitmap @bm - confirm against the definition.
 */
void mb_set_bits(void *bm, int cur, int len);

/*
 * Defaults controlling how often a filesystem check is due
 */
#define EXT4_DFL_MAX_MNT_COUNT        20        /* allow 20 mounts between checks */
#define EXT4_DFL_CHECKINTERVAL        0        /* interval check is not used */

/*
 * What to do when an error is detected
 */
#define EXT4_ERRORS_CONTINUE        1        /* keep executing */
#define EXT4_ERRORS_RO                2        /* remount the fs read-only */
#define EXT4_ERRORS_PANIC        3        /* panic */
#define EXT4_ERRORS_DEFAULT        EXT4_ERRORS_CONTINUE

/* Code identifying the crc32c metadata checksum algorithm */
#define EXT4_CRC32C_CHKSUM        1

/* Size in bytes of the s_volume_name array in struct ext4_super_block */
#define EXT4_LABEL_MAX                16

/*
 * Structure of the super block
 *
 * Multi-byte integer fields use the little-endian __le16/__le32/__le64
 * types.  The hexadecimal markers in the left margin give the byte offset of
 * the field that follows them.
 */
struct ext4_super_block {
/*00*/        __le32        s_inodes_count;                /* Inodes count */
        __le32        s_blocks_count_lo;        /* Blocks count */
        __le32        s_r_blocks_count_lo;        /* Reserved blocks count */
        __le32        s_free_blocks_count_lo;        /* Free blocks count */
/*10*/        __le32        s_free_inodes_count;        /* Free inodes count */
        __le32        s_first_data_block;        /* First Data Block */
        __le32        s_log_block_size;        /* Block size */
        __le32        s_log_cluster_size;        /* Allocation cluster size */
/*20*/        __le32        s_blocks_per_group;        /* # Blocks per group */
        __le32        s_clusters_per_group;        /* # Clusters per group */
        __le32        s_inodes_per_group;        /* # Inodes per group */
        __le32        s_mtime;                /* Mount time */
/*30*/        __le32        s_wtime;                /* Write time */
        __le16        s_mnt_count;                /* Mount count */
        __le16        s_max_mnt_count;        /* Maximal mount count */
        __le16        s_magic;                /* Magic signature */
        __le16        s_state;                /* File system state */
        __le16        s_errors;                /* Behaviour when detecting errors */
        __le16        s_minor_rev_level;        /* minor revision level */
/*40*/        __le32        s_lastcheck;                /* time of last check */
        __le32        s_checkinterval;        /* max. time between checks */
        __le32        s_creator_os;                /* OS */
        __le32        s_rev_level;                /* Revision level */
/*50*/        __le16        s_def_resuid;                /* Default uid for reserved blocks */
        __le16        s_def_resgid;                /* Default gid for reserved blocks */
        /*
         * These fields are for EXT4_DYNAMIC_REV superblocks only.
         *
         * Note: the difference between the compatible feature set and
         * the incompatible feature set is that if there is a bit set
         * in the incompatible feature set that the kernel doesn't
         * know about, it should refuse to mount the filesystem.
         *
         * e2fsck's requirements are more strict; if it doesn't know
         * about a feature in either the compatible or incompatible
         * feature set, it must abort and not try to meddle with
         * things it doesn't understand...
         */
        __le32        s_first_ino;                /* First non-reserved inode */
        __le16  s_inode_size;                /* size of inode structure */
        __le16        s_block_group_nr;        /* block group # of this superblock */
        __le32        s_feature_compat;        /* compatible feature set */
/*60*/        __le32        s_feature_incompat;        /* incompatible feature set */
        __le32        s_feature_ro_compat;        /* readonly-compatible feature set */
/*68*/        __u8        s_uuid[16];                /* 128-bit uuid for volume */
/*78*/        char        s_volume_name[EXT4_LABEL_MAX];        /* volume name */
/*88*/        char        s_last_mounted[64] __nonstring;        /* directory where last mounted */
/*C8*/        __le32        s_algorithm_usage_bitmap; /* For compression */
        /*
         * Performance hints.  Directory preallocation should only
         * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
         */
        __u8        s_prealloc_blocks;        /* Nr of blocks to try to preallocate*/
        __u8        s_prealloc_dir_blocks;        /* Nr to preallocate for dirs */
        __le16        s_reserved_gdt_blocks;        /* Per group desc for online growth */
        /*
         * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set.
         */
/*D0*/        __u8        s_journal_uuid[16];        /* uuid of journal superblock */
/*E0*/        __le32        s_journal_inum;                /* inode number of journal file */
        __le32        s_journal_dev;                /* device number of journal file */
        __le32        s_last_orphan;                /* start of list of inodes to delete */
        __le32        s_hash_seed[4];                /* HTREE hash seed */
        __u8        s_def_hash_version;        /* Default hash version to use */
        __u8        s_jnl_backup_type;
        __le16  s_desc_size;                /* size of group descriptor */
/*100*/        __le32        s_default_mount_opts;
        __le32        s_first_meta_bg;        /* First metablock block group */
        __le32        s_mkfs_time;                /* When the filesystem was created */
        __le32        s_jnl_blocks[17];        /* Backup of the journal inode */
        /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */
/*150*/        __le32        s_blocks_count_hi;        /* Blocks count */
        __le32        s_r_blocks_count_hi;        /* Reserved blocks count */
        __le32        s_free_blocks_count_hi;        /* Free blocks count */
        __le16        s_min_extra_isize;        /* All inodes have at least # bytes */
        __le16        s_want_extra_isize;         /* New inodes should reserve # bytes */
        __le32        s_flags;                /* Miscellaneous flags */
        __le16  s_raid_stride;                /* RAID stride */
        __le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
        __le64  s_mmp_block;            /* Block for multi-mount protection */
        __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
        __u8        s_log_groups_per_flex;  /* FLEX_BG group size */
        __u8        s_checksum_type;        /* metadata checksum algorithm used */
        __u8        s_encryption_level;        /* versioning level for encryption */
        __u8        s_reserved_pad;                /* Padding to next 32bits */
        __le64        s_kbytes_written;        /* nr of lifetime kilobytes written */
        __le32        s_snapshot_inum;        /* Inode number of active snapshot */
        __le32        s_snapshot_id;                /* sequential ID of active snapshot */
        __le64        s_snapshot_r_blocks_count; /* reserved blocks for active
                                              snapshot's future use */
        __le32        s_snapshot_list;        /* inode number of the head of the
                                           on-disk snapshot list */
        /* Byte offset of the first error-information field (s_error_count) */
#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
        __le32        s_error_count;                /* number of fs errors */
        __le32        s_first_error_time;        /* first time an error happened */
        __le32        s_first_error_ino;        /* inode involved in first error */
        __le64        s_first_error_block;        /* block involved of first error */
        __u8        s_first_error_func[32] __nonstring;        /* function where the error happened */
        __le32        s_first_error_line;        /* line number where error happened */
        __le32        s_last_error_time;        /* most recent time of an error */
        __le32        s_last_error_ino;        /* inode involved in last error */
        __le32        s_last_error_line;        /* line number where error happened */
        __le64        s_last_error_block;        /* block involved of last error */
        __u8        s_last_error_func[32] __nonstring;        /* function where the error happened */
        /* Byte offset just past the error-information fields (s_mount_opts) */
#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
        __u8        s_mount_opts[64];
        __le32        s_usr_quota_inum;        /* inode for tracking user quota */
        __le32        s_grp_quota_inum;        /* inode for tracking group quota */
        __le32        s_overhead_clusters;        /* overhead blocks/clusters in fs */
        __le32        s_backup_bgs[2];        /* groups with sparse_super2 SBs */
        __u8        s_encrypt_algos[4];        /* Encryption algorithms in use  */
        __u8        s_encrypt_pw_salt[16];        /* Salt used for string2key algorithm */
        __le32        s_lpf_ino;                /* Location of the lost+found inode */
        __le32        s_prj_quota_inum;        /* inode for tracking project quota */
        __le32        s_checksum_seed;        /* crc32c(uuid) if csum_seed set */
        /*
         * NOTE(review): the *_hi bytes below presumably hold the upper bits
         * of the matching 32-bit time fields declared earlier - confirm.
         */
        __u8        s_wtime_hi;
        __u8        s_mtime_hi;
        __u8        s_mkfs_time_hi;
        __u8        s_lastcheck_hi;
        __u8        s_first_error_time_hi;
        __u8        s_last_error_time_hi;
        __u8        s_first_error_errcode;
        __u8    s_last_error_errcode;
        __le16  s_encoding;                /* Filename charset encoding */
        __le16  s_encoding_flags;        /* Filename charset encoding flags */
        __le32  s_orphan_file_inum;        /* Inode for tracking orphan inodes */
        __le32        s_reserved[94];                /* Padding to the end of the block */
        __le32        s_checksum;                /* crc32c(superblock) */
};

/* Number of bytes between EXT4_S_ERR_START and EXT4_S_ERR_END */
#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)

#ifdef __KERNEL__

/* How many quota types are supported */
#define EXT4_MAXQUOTAS                3

/* Encoding identifier for UTF-8 12.1 (as the name indicates) */
#define EXT4_ENC_UTF8_12_1        1

/* Kinds of ext4 journal triggers */
enum ext4_journal_trigger_type {
        EXT4_JTR_ORPHAN_FILE = 0,
        /* Must stay the last entry so that indexing works. */
        EXT4_JTR_NONE
};

/* Trigger count, taken from the trailing EXT4_JTR_NONE entry */
#define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE

/*
 * A jbd2 buffer trigger type bundled with a super block pointer.
 * EXT4_TRIGGER() recovers this structure from a pointer to its tr_triggers
 * member.
 */
struct ext4_journal_trigger {
        struct jbd2_buffer_trigger_type tr_triggers;        /* embedded trigger type */
        struct super_block *sb;
};

/*
 * Given a pointer to the tr_triggers member, return the
 * struct ext4_journal_trigger that contains it.
 */
static inline struct ext4_journal_trigger *
EXT4_TRIGGER(struct jbd2_buffer_trigger_type *trigger)
{
        struct ext4_journal_trigger *owner;

        owner = container_of(trigger, struct ext4_journal_trigger,
                             tr_triggers);
        return owner;
}

/*
 * Magic number for orphan blocks.
 * NOTE(review): presumably the value expected in ob_magic below - confirm.
 */
#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04

/*
 * Structure at the tail of orphan block: two little-endian 32-bit words,
 * a magic value followed by a checksum.
 */
struct ext4_orphan_block_tail {
        __le32 ob_magic;
        __le32 ob_checksum;
};

/*
 * Number of u32-sized slots that fit in one block of @sb once the space for
 * struct ext4_orphan_block_tail has been subtracted from sb->s_blocksize.
 */
static inline int ext4_inodes_per_orphan_block(struct super_block *sb)
{
        int nr_slots;

        nr_slots = (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) /
                   sizeof(u32);
        return nr_slots;
}

/*
 * Bookkeeping for a single orphan block: how many of its entries are free
 * and the buffer that holds the block.
 */
struct ext4_orphan_block {
        atomic_t ob_free_entries;        /* Number of free orphan entries in block */
        struct buffer_head *ob_bh;        /* Buffer for orphan block */
};

/*
 * Info about orphan file: its block count, its checksum seed and the
 * per-block info array.
 */
struct ext4_orphan_info {
        int of_blocks;                        /* Number of orphan blocks in a file */
        __u32 of_csum_seed;                /* Checksum seed for orphan file */
        struct ext4_orphan_block *of_binfo;        /* Array with info about orphan
                                                 * file blocks */
};

/*
 * fourth extended-fs super-block data in memory
 */
struct ext4_sb_info {
        unsigned long s_desc_size;        /* Size of a group descriptor in bytes */
        unsigned long s_inodes_per_block;/* Number of inodes per block */
        unsigned long s_blocks_per_group;/* Number of blocks in a group */
        unsigned long s_clusters_per_group; /* Number of clusters in a group */
        unsigned long s_inodes_per_group;/* Number of inodes in a group */
        unsigned long s_itb_per_group;        /* Number of inode table blocks per group */
        unsigned long s_gdb_count;        /* Number of group descriptor blocks */
        unsigned long s_desc_per_block;        /* Number of group descriptors per block */
        ext4_group_t s_groups_count;        /* Number of groups in the fs */
        ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
        unsigned long s_overhead;  /* # of fs overhead clusters */
        unsigned int s_cluster_ratio;        /* Number of blocks per cluster */
        unsigned int s_cluster_bits;        /* log2 of s_cluster_ratio */
        loff_t s_bitmap_maxbytes;        /* max bytes for bitmap files */
        struct buffer_head * s_sbh;        /* Buffer containing the super block */
        struct ext4_super_block *s_es;        /* Pointer to the super block in the buffer */
        /* Array of bh's for the block group descriptors */
        struct buffer_head * __rcu *s_group_desc;
        unsigned int s_mount_opt;
        unsigned int s_mount_opt2;
        unsigned long s_mount_flags;
        unsigned int s_def_mount_opt;
        unsigned int s_def_mount_opt2;
        ext4_fsblk_t s_sb_block;
        atomic64_t s_resv_clusters;
        kuid_t s_resuid;
        kgid_t s_resgid;
        unsigned short s_mount_state;
        unsigned short s_pad;
        int s_addr_per_block_bits;
        int s_desc_per_block_bits;
        int s_inode_size;
        int s_first_ino;
        unsigned int s_inode_readahead_blks;
        unsigned int s_inode_goal;
        u32 s_hash_seed[4];
        int s_def_hash_version;
        int s_hash_unsigned;        /* 3 if hash should be unsigned, 0 if not */
        struct percpu_counter s_freeclusters_counter;
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
        struct percpu_counter s_dirtyclusters_counter;
        struct percpu_counter s_sra_exceeded_retry_limit;
        struct blockgroup_lock *s_blockgroup_lock;
        struct proc_dir_entry *s_proc;
        struct kobject s_kobj;
        struct completion s_kobj_unregister;
        struct super_block *s_sb;
        struct buffer_head *s_mmp_bh;

        /* Journaling */
        struct journal_s *s_journal;
        unsigned long s_ext4_flags;                /* Ext4 superblock flags */
        struct mutex s_orphan_lock;        /* Protects on disk list changes */
        struct list_head s_orphan;        /* List of orphaned inodes in on disk
                                           list */
        struct ext4_orphan_info s_orphan_info;
        unsigned long s_commit_interval;
        u32 s_max_batch_time;
        u32 s_min_batch_time;
        struct file *s_journal_bdev_file;
#ifdef CONFIG_QUOTA
        /* Names of quota files with journalled quota */
        char __rcu *s_qf_names[EXT4_MAXQUOTAS];
        int s_jquota_fmt;                        /* Format of quota to use */
#endif
        unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
        struct ext4_system_blocks __rcu *s_system_blks;

#ifdef EXTENTS_STATS
        /* ext4 extents stats */
        unsigned long s_ext_min;
        unsigned long s_ext_max;
        unsigned long s_depth_max;
        spinlock_t s_ext_stats_lock;
        unsigned long s_ext_blocks;
        unsigned long s_ext_extents;
#endif

        /* for buddy allocator */
        struct ext4_group_info ** __rcu *s_group_info;
        struct inode *s_buddy_cache;
        spinlock_t s_md_lock;
        unsigned short *s_mb_offsets;
        unsigned int *s_mb_maxs;
        unsigned int s_group_info_size;
        unsigned int s_mb_free_pending;
        struct list_head s_freed_data_list[2];        /* List of blocks to be freed
                                                   after commit completed */
        struct list_head s_discard_list;
        struct work_struct s_discard_work;
        atomic_t s_retry_alloc_pending;
        struct list_head *s_mb_avg_fragment_size;
        rwlock_t *s_mb_avg_fragment_size_locks;
        struct list_head *s_mb_largest_free_orders;
        rwlock_t *s_mb_largest_free_orders_locks;

        /* tunables */
        unsigned long s_stripe;
        unsigned int s_mb_max_linear_groups;
        unsigned int s_mb_stream_request;
        unsigned int s_mb_max_to_scan;
        unsigned int s_mb_min_to_scan;
        unsigned int s_mb_stats;
        unsigned int s_mb_order2_reqs;
        unsigned int s_mb_group_prealloc;
        unsigned int s_max_dir_size_kb;
        /* where last allocation was done - for stream allocation */
        unsigned long s_mb_last_group;
        unsigned long s_mb_last_start;
        unsigned int s_mb_prefetch;
        unsigned int s_mb_prefetch_limit;
        unsigned int s_mb_best_avail_max_trim_order;

        /* stats for buddy allocator */
        atomic_t s_bal_reqs;        /* number of reqs with len > 1 */
        atomic_t s_bal_success;        /* we found long enough chunks */
        atomic_t s_bal_allocated;        /* in blocks */
        atomic_t s_bal_ex_scanned;        /* total extents scanned */
        atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS];        /* total extents scanned */
        atomic_t s_bal_groups_scanned;        /* number of groups scanned */
        atomic_t s_bal_goals;        /* goal hits */
        atomic_t s_bal_len_goals;        /* len goal hits */
        atomic_t s_bal_breaks;        /* too long searches */
        atomic_t s_bal_2orders;        /* 2^order hits */
        atomic_t s_bal_p2_aligned_bad_suggestions;
        atomic_t s_bal_goal_fast_bad_suggestions;
        atomic_t s_bal_best_avail_bad_suggestions;
        atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
        atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
        atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS];                /* cX loop didn't find blocks */
        atomic_t s_mb_buddies_generated;        /* number of buddies generated */
        atomic64_t s_mb_generation_time;
        atomic_t s_mb_lost_chunks;
        atomic_t s_mb_preallocated;
        atomic_t s_mb_discarded;
        atomic_t s_lock_busy;

        /* locality groups */
        struct ext4_locality_group __percpu *s_locality_groups;

        /* for write statistics */
        unsigned long s_sectors_written_start;
        u64 s_kbytes_written;

        /* the size of zero-out chunk */
        unsigned int s_extent_max_zeroout_kb;

        unsigned int s_log_groups_per_flex;
        struct flex_groups * __rcu *s_flex_groups;
        ext4_group_t s_flex_groups_allocated;

        /* workqueue for reserved extent conversions (buffered io) */
        struct workqueue_struct *rsv_conversion_wq;

        /* timer for periodic error stats printing */
        struct timer_list s_err_report;

        /* Lazy inode table initialization info */
        struct ext4_li_request *s_li_request;
        /* Wait multiplier for lazy initialization thread */
        unsigned int s_li_wait_mult;

        /* Kernel thread for multiple mount protection */
        struct task_struct *s_mmp_tsk;

        /* record the last minlen when FITRIM is called. */
        unsigned long s_last_trim_minblks;

        /* Reference to checksum algorithm driver via cryptoapi */
        struct crypto_shash *s_chksum_driver;

        /* Precomputed FS UUID checksum for seeding other checksums */
        __u32 s_csum_seed;

        /* Reclaim extents from extent status tree */
        struct shrinker *s_es_shrinker;
        struct list_head s_es_list;        /* List of inodes with reclaimable extents */
        long s_es_nr_inode;
        struct ext4_es_stats s_es_stats;
        struct mb_cache *s_ea_block_cache;
        struct mb_cache *s_ea_inode_cache;
        spinlock_t s_es_lock ____cacheline_aligned_in_smp;

        /* Journal triggers for checksum computation */
        struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT];

        /* Ratelimit ext4 messages. */
        struct ratelimit_state s_err_ratelimit_state;
        struct ratelimit_state s_warning_ratelimit_state;
        struct ratelimit_state s_msg_ratelimit_state;
        atomic_t s_warning_count;
        atomic_t s_msg_count;

        /* Encryption policy for '-o test_dummy_encryption' */
        struct fscrypt_dummy_policy s_dummy_enc_policy;

        /*
         * Barrier between writepages ops and changing any inode's JOURNAL_DATA
         * or EXTENTS flag or between writepages ops and changing DELALLOC or
         * DIOREAD_NOLOCK mount options on remount.
         */
        struct percpu_rw_semaphore s_writepages_rwsem;
        struct dax_device *s_daxdev;
        u64 s_dax_part_off;
#ifdef CONFIG_EXT4_DEBUG
        unsigned long s_simulate_fail;
#endif
        /* Record the errseq of the backing block device */
        errseq_t s_bdev_wb_err;
        spinlock_t s_bdev_wb_lock;

        /* Information about errors that happened during this mount */
        spinlock_t s_error_lock;
        int s_add_error_count;
        int s_first_error_code;
        __u32 s_first_error_line;
        __u32 s_first_error_ino;
        __u64 s_first_error_block;
        const char *s_first_error_func;
        time64_t s_first_error_time;
        int s_last_error_code;
        __u32 s_last_error_line;
        __u32 s_last_error_ino;
        __u64 s_last_error_block;
        const char *s_last_error_func;
        time64_t s_last_error_time;
        /*
         * If we are in a context where we cannot update the on-disk
         * superblock, we queue the work here.  This is used to update
         * the error information in the superblock, and for periodic
         * updates of the superblock called from the commit callback
         * function.
         */
        struct work_struct s_sb_upd_work;

        /* Ext4 fast commit sub transaction ID */
        atomic_t s_fc_subtid;

        /*
         * After commit starts, the main queue gets locked, and the further
         * updates get added in the staging queue.
         */
#define FC_Q_MAIN        0
#define FC_Q_STAGING        1
        struct list_head s_fc_q[2];        /* Inodes staged for fast commit
                                         * that have data changes in them.
                                         */
        struct list_head s_fc_dentry_q[2];        /* directory entry updates */
        unsigned int s_fc_bytes;
        /*
         * Main fast commit lock. This lock protects accesses to the
         * following fields:
         * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
         */
        spinlock_t s_fc_lock;
        struct buffer_head *s_fc_bh;
        struct ext4_fc_stats s_fc_stats;
        tid_t s_fc_ineligible_tid;
#ifdef CONFIG_EXT4_DEBUG
        int s_fc_debug_max_replay;
#endif
        struct ext4_fc_replay_state s_fc_replay_state;
};

/* Return the ext4 private info hung off the VFS super_block (s_fs_info). */
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
{
        struct ext4_sb_info *sbi = sb->s_fs_info;

        return sbi;
}
/*
 * Map a VFS inode to the ext4_inode_info that embeds it as its
 * vfs_inode member.
 */
static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
{
        struct ext4_inode_info *ei;

        ei = container_of(inode, struct ext4_inode_info, vfs_inode);
        return ei;
}

static inline int ext4_writepages_down_read(struct super_block *sb)
{
        percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem);
        return memalloc_nofs_save();
}

static inline void ext4_writepages_up_read(struct super_block *sb, int ctx)
{
        memalloc_nofs_restore(ctx);
        percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem);
}

static inline int ext4_writepages_down_write(struct super_block *sb)
{
        percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem);
        return memalloc_nofs_save();
}

static inline void ext4_writepages_up_write(struct super_block *sb, int ctx)
{
        memalloc_nofs_restore(ctx);
        percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem);
}

/*
 * Check whether @ino is an acceptable inode number for @sb.
 *
 * Returns non-zero when @ino is EXT4_ROOT_INO, or when it lies in the
 * inclusive range [EXT4_FIRST_INO(sb), s_es->s_inodes_count]; 0 otherwise.
 */
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
        if (ino == EXT4_ROOT_INO)
                return 1;

        if (ino < EXT4_FIRST_INO(sb))
                return 0;

        /* s_inodes_count is little-endian in the superblock image. */
        return ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
}

/*
 * Returns: sbi->field[index]
 * Used to access an array element from the following sbi fields which require
 * rcu protection to avoid dereferencing an invalid pointer due to reassignment
 * - s_group_desc
 * - s_group_info
 * - s_flex_groups
 *
 * The element is copied into a local while rcu_read_lock() is held, and the
 * read-side section is left before the copy is handed back.  Only the fetch
 * of the array pointer and the indexing are covered by the RCU section; the
 * returned element value is used by the caller outside of it.
 */
#define sbi_array_rcu_deref(sbi, field, index)                                   \
({                                                                           \
        typeof(*((sbi)->field)) _v;                                           \
        rcu_read_lock();                                                   \
        _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index];           \
        rcu_read_unlock();                                                   \
        _v;                                                                   \
})

/*
 * run-time mount flags
 *
 * These are bit numbers (0, 1, ...), not masks; the
 * ext4_{set,clear,test}_mount_flag() helpers apply them to
 * sbi->s_mount_flags with the atomic bit operations.
 */
enum {
        EXT4_MF_MNTDIR_SAMPLED,
        EXT4_MF_FC_INELIGIBLE        /* Fast commit ineligible */
};

static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
{
        set_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}

static inline void ext4_clear_mount_flag(struct super_block *sb, int bit)
{
        clear_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}

static inline int ext4_test_mount_flag(struct super_block *sb, int bit)
{
        return test_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}


/*
 * Simulate_fail codes
 *
 * ext4_simulate_fail() compares its @code argument against
 * sbi->s_simulate_fail; these are the values used for that comparison.
 * Numbering starts at 1, and ext4_simulate_fail() resets the field to 0
 * after a hit.
 */
#define EXT4_SIM_BBITMAP_EIO        1
#define EXT4_SIM_BBITMAP_CRC        2
#define EXT4_SIM_IBITMAP_EIO        3
#define EXT4_SIM_IBITMAP_CRC        4
#define EXT4_SIM_INODE_EIO        5
#define EXT4_SIM_INODE_CRC        6
#define EXT4_SIM_DIRBLOCK_EIO        7
#define EXT4_SIM_DIRBLOCK_CRC        8

/*
 * One-shot failure injection check.
 *
 * With CONFIG_EXT4_DEBUG: returns true when sbi->s_simulate_fail equals
 * @code, and resets s_simulate_fail to 0 so the next call does not match
 * again.  Without CONFIG_EXT4_DEBUG: always returns false.
 */
static inline bool ext4_simulate_fail(struct super_block *sb,
                                     unsigned long code)
{
#ifdef CONFIG_EXT4_DEBUG
        struct ext4_sb_info *info = EXT4_SB(sb);

        if (!unlikely(info->s_simulate_fail == code))
                return false;

        /* Consume the request: it fires exactly once. */
        info->s_simulate_fail = 0;
        return true;
#else
        return false;
#endif
}

/*
 * If @bh is not an error pointer and ext4_simulate_fail(@sb, @code)
 * fires, clear the buffer's uptodate flag.  An IS_ERR() @bh is left
 * alone and ext4_simulate_fail() is not consulted for it.
 */
static inline void ext4_simulate_fail_bh(struct super_block *sb,
                                         struct buffer_head *bh,
                                         unsigned long code)
{
        if (IS_ERR(bh))
                return;

        if (ext4_simulate_fail(sb, code))
                clear_buffer_uptodate(bh);
}

/*
 * Error number codes for s_{first,last}_error_errno
 *
 * Linux errno numbers are architecture specific, so we need to translate
 * them into something which is architecture independent.   We don't define
 * codes for all errno's; just the ones which are most likely to be the cause
 * of an ext4_error() call.
 *
 * The codes are numbered consecutively from 1; the value 0 is not assigned
 * to any name here.
 */
#define EXT4_ERR_UNKNOWN         1
#define EXT4_ERR_EIO                 2
#define EXT4_ERR_ENOMEM                 3
#define EXT4_ERR_EFSBADCRC         4
#define EXT4_ERR_EFSCORRUPTED         5
#define EXT4_ERR_ENOSPC                 6
#define EXT4_ERR_ENOKEY                 7
#define EXT4_ERR_EROFS                 8
#define EXT4_ERR_EFBIG                 9
#define EXT4_ERR_EEXIST                10
#define EXT4_ERR_ERANGE                11
#define EXT4_ERR_EOVERFLOW        12
#define EXT4_ERR_EBUSY                13
#define EXT4_ERR_ENOTDIR        14
#define EXT4_ERR_ENOTEMPTY        15
#define EXT4_ERR_ESHUTDOWN        16
#define EXT4_ERR_EFAULT                17

/*
 * Inode dynamic state flags
 *
 * These are bit numbers for ext4_{test,set,clear}_inode_state().  Those
 * helpers are generated by EXT4_INODE_BIT_FNS: when BITS_PER_LONG < 64 the
 * bits live in i_state_flags at offset 0, otherwise they live in i_flags at
 * offset 32.
 */
enum {
        EXT4_STATE_NEW,                        /* inode is newly created */
        EXT4_STATE_XATTR,                /* has in-inode xattrs */
        EXT4_STATE_NO_EXPAND,                /* No space for expansion */
        EXT4_STATE_DA_ALLOC_CLOSE,        /* Alloc DA blks on close */
        EXT4_STATE_EXT_MIGRATE,                /* Inode is migrating */
        EXT4_STATE_NEWENTRY,                /* File just added to dir */
        EXT4_STATE_MAY_INLINE_DATA,        /* may have in-inode data */
        EXT4_STATE_EXT_PRECACHED,        /* extents have been precached */
        EXT4_STATE_LUSTRE_EA_INODE,        /* Lustre-style ea_inode */
        EXT4_STATE_VERITY_IN_PROGRESS,        /* building fs-verity Merkle tree */
        EXT4_STATE_FC_COMMITTING,        /* Fast commit ongoing */
        EXT4_STATE_ORPHAN_FILE,                /* Inode orphaned in orphan file */
};

/*
 * Generate ext4_test_inode_<name>(), ext4_set_inode_<name>() and
 * ext4_clear_inode_<name>().
 *
 * @name:   suffix pasted into the generated function names
 * @field:  ext4_inode_info member, without its "i_" prefix, holding the bits
 * @offset: constant added to the caller's bit number before the bit
 *          operation is applied
 *
 * Each generated helper applies test_bit()/set_bit()/clear_bit() to
 * EXT4_I(inode)->i_<field> at position (bit + offset).
 */
#define EXT4_INODE_BIT_FNS(name, field, offset)                                \
static inline int ext4_test_inode_##name(struct inode *inode, int bit)        \
{                                                                        \
        return test_bit(bit + (offset), &EXT4_I(inode)->i_##field);        \
}                                                                        \
static inline void ext4_set_inode_##name(struct inode *inode, int bit)        \
{                                                                        \
        set_bit(bit + (offset), &EXT4_I(inode)->i_##field);                \
}                                                                        \
static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
{                                                                        \
        clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);                \
}

/* Add these declarations here only so that these functions can be
 * found by name.  Otherwise, they are very hard to locate. */
static inline int ext4_test_inode_flag(struct inode *inode, int bit);
static inline void ext4_set_inode_flag(struct inode *inode, int bit);
static inline void ext4_clear_inode_flag(struct inode *inode, int bit);
/* The definitions: bit operations on EXT4_I(inode)->i_flags, offset 0. */
EXT4_INODE_BIT_FNS(flag, flags, 0)

/* Add these declarations here only so that these functions can be
 * found by name.  Otherwise, they are very hard to locate. */
static inline int ext4_test_inode_state(struct inode *inode, int bit);
static inline void ext4_set_inode_state(struct inode *inode, int bit);
static inline void ext4_clear_inode_state(struct inode *inode, int bit);
#if (BITS_PER_LONG < 64)
/* Narrow longs: state bits get their own word, i_state_flags, offset 0. */
EXT4_INODE_BIT_FNS(state, state_flags, 0)

/* Reset every dynamic state bit of @ei by zeroing i_state_flags. */
static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
{
        (ei)->i_state_flags = 0;
}
#else
/* 64-bit longs: state bits share i_flags, starting at bit 32. */
EXT4_INODE_BIT_FNS(state, flags, 32)

/* Intentionally a no-op in this configuration; see the comment inside. */
static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
{
        /* We depend on the fact that callers will set i_flags */
}
#endif
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
 * a kernel struct super_block.  This will allow us to call the feature-test
 * macros from user land. */
#define EXT4_SB(sb)        (sb)
#endif

static inline bool ext4_verity_in_progress(struct inode *inode)
{
        return IS_ENABLED(CONFIG_FS_VERITY) &&
               ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
}

/*
 * Expands to the i_dtime member of the inode's ext4_inode_info.  The
 * expansion is a plain member access, so it can be read or assigned to.
 * NOTE(review): the name suggests i_dtime doubles as the link to the next
 * orphan inode - confirm against the orphan list code.
 */
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime

/*
 * Codes for operating systems
 */
#define EXT4_OS_LINUX                0
#define EXT4_OS_HURD                1
#define EXT4_OS_MASIX                2
#define EXT4_OS_FREEBSD                3
#define EXT4_OS_LITES                4

/*
 * Revision levels
 */
#define EXT4_GOOD_OLD_REV        0        /* The good old (original) format */
#define EXT4_DYNAMIC_REV        1        /* V2 format w/ dynamic inode sizes */

/* Highest revision level listed as supported here. */
#define EXT4_MAX_SUPP_REV        EXT4_DYNAMIC_REV

/* Inode size, in bytes, associated with EXT4_GOOD_OLD_REV. */
#define EXT4_GOOD_OLD_INODE_SIZE 128

/*
 * Timestamp limits.  The "extra" maximum is S32_MIN + 2^34 - 1, i.e. a
 * 34-bit wide range that starts at S32_MIN; the non-extra range is the
 * plain signed 32-bit range.
 */
#define EXT4_EXTRA_TIMESTAMP_MAX        (((s64)1 << 34) - 1  + S32_MIN)
#define EXT4_NON_EXTRA_TIMESTAMP_MAX        S32_MAX
#define EXT4_TIMESTAMP_MIN                S32_MIN

/*
 * Feature set definitions
 *
 * Three independent bit spaces follow.  The accessor macros further down
 * test COMPAT bits against s_es->s_feature_compat, RO_COMPAT bits against
 * s_es->s_feature_ro_compat and INCOMPAT bits against
 * s_es->s_feature_incompat, so the same numeric value may appear once in
 * each group.
 */

/* Bits of s_feature_compat */
#define EXT4_FEATURE_COMPAT_DIR_PREALLOC        0x0001
#define EXT4_FEATURE_COMPAT_IMAGIC_INODES        0x0002
#define EXT4_FEATURE_COMPAT_HAS_JOURNAL                0x0004
#define EXT4_FEATURE_COMPAT_EXT_ATTR                0x0008
#define EXT4_FEATURE_COMPAT_RESIZE_INODE        0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX                0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2        0x0200
/*
 * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes
 * incompatible only if fast commit blocks are present in the FS. Since we
 * clear the journal (and thus the fast commit blocks), we don't mark FS as
 * incompatible. We also have a JBD2 incompat feature, which gets set when
 * there are fast commit blocks present in the journal.
 */
#define EXT4_FEATURE_COMPAT_FAST_COMMIT                0x0400
#define EXT4_FEATURE_COMPAT_STABLE_INODES        0x0800
#define EXT4_FEATURE_COMPAT_ORPHAN_FILE                0x1000        /* Orphan file exists */

/* Bits of s_feature_ro_compat */
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER        0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE        0x0002
#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR        0x0004
#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE        0x0008
#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM                0x0010
#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK        0x0020
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE        0x0040
#define EXT4_FEATURE_RO_COMPAT_QUOTA                0x0100
#define EXT4_FEATURE_RO_COMPAT_BIGALLOC                0x0200
/*
 * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM).  When
 * METADATA_CSUM is set, group descriptor checksums use the same algorithm as
 * all other data structures' checksums.  However, the METADATA_CSUM and
 * GDT_CSUM bits are mutually exclusive.
 */
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM        0x0400
#define EXT4_FEATURE_RO_COMPAT_READONLY                0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT                0x2000
#define EXT4_FEATURE_RO_COMPAT_VERITY                0x8000
#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT        0x10000 /* Orphan file may be
                                                           non-empty */

/* Bits of s_feature_incompat */
#define EXT4_FEATURE_INCOMPAT_COMPRESSION        0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE                0x0002
#define EXT4_FEATURE_INCOMPAT_RECOVER                0x0004 /* Needs recovery */
#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV        0x0008 /* Journal device */
#define EXT4_FEATURE_INCOMPAT_META_BG                0x0010
#define EXT4_FEATURE_INCOMPAT_EXTENTS                0x0040 /* extents support */
#define EXT4_FEATURE_INCOMPAT_64BIT                0x0080
#define EXT4_FEATURE_INCOMPAT_MMP               0x0100
#define EXT4_FEATURE_INCOMPAT_FLEX_BG                0x0200
#define EXT4_FEATURE_INCOMPAT_EA_INODE                0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA                0x1000 /* data in dirent */
#define EXT4_FEATURE_INCOMPAT_CSUM_SEED                0x2000
#define EXT4_FEATURE_INCOMPAT_LARGEDIR                0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA        0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT                0x10000
#define EXT4_FEATURE_INCOMPAT_CASEFOLD                0x20000

extern void ext4_update_dynamic_rev(struct super_block *sb);

/*
 * Generate the three accessors for one compat feature bit:
 *   ext4_has_feature_<name>()   - true if EXT4_FEATURE_COMPAT_<flagname> is
 *                                 set in s_es->s_feature_compat
 *   ext4_set_feature_<name>()   - calls ext4_update_dynamic_rev(sb), then
 *                                 sets the bit
 *   ext4_clear_feature_<name>() - clears the bit
 * The mask is converted with cpu_to_le32() so it is applied directly to the
 * little-endian superblock field.  The set/clear helpers only modify the
 * s_es copy; nothing here writes the superblock out.
 */
#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_compat & \
                cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_compat |= \
                cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_compat &= \
                ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
}

/*
 * Generate the three accessors for one ro_compat feature bit:
 *   ext4_has_feature_<name>()   - true if EXT4_FEATURE_RO_COMPAT_<flagname>
 *                                 is set in s_es->s_feature_ro_compat
 *   ext4_set_feature_<name>()   - calls ext4_update_dynamic_rev(sb), then
 *                                 sets the bit
 *   ext4_clear_feature_<name>() - clears the bit
 * The mask is converted with cpu_to_le32() so it is applied directly to the
 * little-endian superblock field.
 */
#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
                cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
                cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_ro_compat &= \
                ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
}

/*
 * Generate the three accessors for one incompat feature bit:
 *   ext4_has_feature_<name>()   - true if EXT4_FEATURE_INCOMPAT_<flagname>
 *                                 is set in s_es->s_feature_incompat
 *   ext4_set_feature_<name>()   - calls ext4_update_dynamic_rev(sb), then
 *                                 sets the bit
 *   ext4_clear_feature_<name>() - clears the bit
 * The mask is converted with cpu_to_le32() so it is applied directly to the
 * little-endian superblock field.
 */
#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
                cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_incompat |= \
                cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_incompat &= \
                ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
}

/*
 * Each line below defines ext4_has_feature_<first arg>(),
 * ext4_set_feature_<first arg>() and ext4_clear_feature_<first arg>() for
 * the feature bit named by the second argument.  Note that the function
 * suffix does not always match the flag name (e.g. journal/HAS_JOURNAL,
 * xattr/EXT_ATTR, journal_needs_recovery/RECOVER).
 */

/* s_feature_compat accessors */
EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc,                DIR_PREALLOC)
EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes,        IMAGIC_INODES)
EXT4_FEATURE_COMPAT_FUNCS(journal,                HAS_JOURNAL)
EXT4_FEATURE_COMPAT_FUNCS(xattr,                EXT_ATTR)
EXT4_FEATURE_COMPAT_FUNCS(resize_inode,                RESIZE_INODE)
EXT4_FEATURE_COMPAT_FUNCS(dir_index,                DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,        SPARSE_SUPER2)
EXT4_FEATURE_COMPAT_FUNCS(fast_commit,                FAST_COMMIT)
EXT4_FEATURE_COMPAT_FUNCS(stable_inodes,        STABLE_INODES)
EXT4_FEATURE_COMPAT_FUNCS(orphan_file,                ORPHAN_FILE)

/* s_feature_ro_compat accessors */
EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,        SPARSE_SUPER)
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,        LARGE_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir,                BTREE_DIR)
EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file,                HUGE_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum,                GDT_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink,                DIR_NLINK)
EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize,        EXTRA_ISIZE)
EXT4_FEATURE_RO_COMPAT_FUNCS(quota,                QUOTA)
EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc,                BIGALLOC)
EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum,        METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly,                READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project,                PROJECT)
EXT4_FEATURE_RO_COMPAT_FUNCS(verity,                VERITY)
EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present,        ORPHAN_PRESENT)

/* s_feature_incompat accessors */
EXT4_FEATURE_INCOMPAT_FUNCS(compression,        COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype,                FILETYPE)
EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery,        RECOVER)
EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev,        JOURNAL_DEV)
EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg,                META_BG)
EXT4_FEATURE_INCOMPAT_FUNCS(extents,                EXTENTS)
EXT4_FEATURE_INCOMPAT_FUNCS(64bit,                64BIT)
EXT4_FEATURE_INCOMPAT_FUNCS(mmp,                MMP)
EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg,                FLEX_BG)
EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode,                EA_INODE)
EXT4_FEATURE_INCOMPAT_FUNCS(dirdata,                DIRDATA)
EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed,                CSUM_SEED)
EXT4_FEATURE_INCOMPAT_FUNCS(largedir,                LARGEDIR)
EXT4_FEATURE_INCOMPAT_FUNCS(inline_data,        INLINE_DATA)
EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,                ENCRYPT)
EXT4_FEATURE_INCOMPAT_FUNCS(casefold,                CASEFOLD)

/*
 * Per-flavour masks of feature bits that are treated as known.  The
 * ext4_has_unknown_ext{2,3,4}_*_features() helpers generated by
 * EXTN_FEATURE_FUNCS report any bit set outside the corresponding mask.
 */

/* ext2 flavour */
#define EXT2_FEATURE_COMPAT_SUPP        EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_META_BG)
#define EXT2_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR)

/* ext3 flavour: the ext2 masks plus INCOMPAT_RECOVER */
#define EXT3_FEATURE_COMPAT_SUPP        EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT3_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_RECOVER| \
                                         EXT4_FEATURE_INCOMPAT_META_BG)
#define EXT3_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR)

/*
 * ext4 flavour.  Not every bit defined above is listed: for example
 * INCOMPAT_COMPRESSION, INCOMPAT_JOURNAL_DEV, INCOMPAT_DIRDATA and
 * RO_COMPAT_READONLY are absent from these masks.
 */
#define EXT4_FEATURE_COMPAT_SUPP        (EXT4_FEATURE_COMPAT_EXT_ATTR| \
                                         EXT4_FEATURE_COMPAT_ORPHAN_FILE)
#define EXT4_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_RECOVER| \
                                         EXT4_FEATURE_INCOMPAT_META_BG| \
                                         EXT4_FEATURE_INCOMPAT_EXTENTS| \
                                         EXT4_FEATURE_INCOMPAT_64BIT| \
                                         EXT4_FEATURE_INCOMPAT_FLEX_BG| \
                                         EXT4_FEATURE_INCOMPAT_EA_INODE| \
                                         EXT4_FEATURE_INCOMPAT_MMP | \
                                         EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
                                         EXT4_FEATURE_INCOMPAT_ENCRYPT | \
                                         EXT4_FEATURE_INCOMPAT_CASEFOLD | \
                                         EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
                                         EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
                                         EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
                                         EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
                                         EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
                                         EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
                                         EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
                                         EXT4_FEATURE_RO_COMPAT_QUOTA |\
                                         EXT4_FEATURE_RO_COMPAT_PROJECT |\
                                         EXT4_FEATURE_RO_COMPAT_VERITY |\
                                         EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT)

/*
 * Generate, for flavour @ver (2, 3 or 4), the helpers
 *   ext4_has_unknown_ext<ver>_compat_features()
 *   ext4_has_unknown_ext<ver>_ro_compat_features()
 *   ext4_has_unknown_ext<ver>_incompat_features()
 * Each returns true when the matching s_es->s_feature_* word has at least
 * one bit set that is not in EXT<ver>_FEATURE_*_SUPP.  The complement of
 * the mask is taken in CPU order and then converted with cpu_to_le32()
 * before being applied to the little-endian field.
 */
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_compat & \
                cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \
} \
static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
                cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \
} \
static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
                cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \
}

/* Instantiate the helpers for the ext2, ext3 and ext4 masks. */
EXTN_FEATURE_FUNCS(2)
EXTN_FEATURE_FUNCS(3)
EXTN_FEATURE_FUNCS(4)

static inline bool ext4_has_compat_features(struct super_block *sb)
{
        return (EXT4_SB(sb)->s_es->s_feature_compat != 0);
}
static inline bool ext4_has_ro_compat_features(struct super_block *sb)
{
        return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0);
}
static inline bool ext4_has_incompat_features(struct super_block *sb)
{
        return (EXT4_SB(sb)->s_es->s_feature_incompat != 0);
}

extern int ext4_feature_set_ok(struct super_block *sb, int readonly);

/*
 * Superblock flags
 *
 * Bit numbers (not masks).  ext4_forced_shutdown() tests
 * EXT4_FLAGS_SHUTDOWN in sbi->s_ext4_flags with test_bit().
 */
#define EXT4_FLAGS_RESIZING        0
#define EXT4_FLAGS_SHUTDOWN        1
#define EXT4_FLAGS_BDEV_IS_DAX        2

static inline int ext4_forced_shutdown(struct super_block *sb)
{
        return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
}

/*
 * Default values for user and/or group using reserved blocks
 */
#define        EXT4_DEF_RESUID                0
#define        EXT4_DEF_RESGID                0

/*
 * Default project ID
 */
#define        EXT4_DEF_PROJID                0

#define EXT4_DEF_INODE_READAHEAD_BLKS        32

/*
 * Default mount options
 *
 * These are bit masks.  EXT4_DEFM_JMODE (0x0060) is the two-bit field that
 * covers the three journal-mode values below it: DATA (0x0020), ORDERED
 * (0x0040) and WBACK (0x0060, i.e. both bits).  A journal mode therefore
 * has to be compared after masking with EXT4_DEFM_JMODE rather than tested
 * as a single bit.
 */
#define EXT4_DEFM_DEBUG                0x0001
#define EXT4_DEFM_BSDGROUPS        0x0002
#define EXT4_DEFM_XATTR_USER        0x0004
#define EXT4_DEFM_ACL                0x0008
#define EXT4_DEFM_UID16                0x0010
#define EXT4_DEFM_JMODE                0x0060
#define EXT4_DEFM_JMODE_DATA        0x0020
#define EXT4_DEFM_JMODE_ORDERED        0x0040
#define EXT4_DEFM_JMODE_WBACK        0x0060
#define EXT4_DEFM_NOBARRIER        0x0100
#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
#define EXT4_DEFM_DISCARD        0x0400
#define EXT4_DEFM_NODELALLOC        0x0800

/*
 * Default journal batch times
 */
#define EXT4_DEF_MIN_BATCH_TIME        0
#define EXT4_DEF_MAX_BATCH_TIME        15000 /* 15ms */

/*
 * Minimum number of groups in a flexgroup before we separate out
 * directories into the first block group of a flexgroup
 */
#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME        4

/*
 * Structure of a directory entry
 */
#define EXT4_NAME_LEN 255
/*
 * Base length of the ext4 directory entry excluding the name length
 *
 * NOTE(review): this is computed from sizeof(struct ext4_dir_entry_2), so it
 * includes whatever trailing padding the compiler adds to that struct, while
 * EXT4_DIRENT_HASHES() and ext4_dir_rec_len() use a literal 8 for the fixed
 * header - confirm the two are meant to differ.
 */
#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)

/*
 * Directory entry layout with a 16-bit name_len.  Compare struct
 * ext4_dir_entry_2, which splits those 16 bits into an 8-bit name_len and
 * an 8-bit file_type.  The fixed part before name[] is 8 bytes.
 */
struct ext4_dir_entry {
        __le32        inode;                        /* Inode number */
        __le16        rec_len;                /* Directory entry length */
        __le16        name_len;                /* Name length */
        char        name[EXT4_NAME_LEN];        /* File name */
};


/*
 * Encrypted Casefolded entries require saving the hash on disk. This structure
 * follows ext4_dir_entry_2's name[name_len] at the next 4 byte aligned
 * boundary.
 *
 * Both fields are little-endian on disk; EXT4_DIRENT_HASH() and
 * EXT4_DIRENT_MINOR_HASH() read them through le32_to_cpu().
 */
struct ext4_dir_entry_hash {
        __le32 hash;
        __le32 minor_hash;
};

/*
 * The new version of the directory entry.  Since EXT4 structures are
 * stored in intel byte order, and the name_len field could never be
 * bigger than 255 chars, it's safe to reclaim the extra byte for the
 * file_type field.
 *
 * The fixed part before name[] is 8 bytes (4 + 2 + 1 + 1), which is the
 * literal 8 used by EXT4_DIRENT_HASHES() and ext4_dir_rec_len().  name[] is
 * declared at its maximum size here; name_len gives the bytes in use.
 */
struct ext4_dir_entry_2 {
        __le32        inode;                        /* Inode number */
        __le16        rec_len;                /* Directory entry length */
        __u8        name_len;                /* Name length */
        __u8        file_type;                /* See file type macros EXT4_FT_* below */
        char        name[EXT4_NAME_LEN];        /* File name */
};

/*
 * Access the hashes at the end of ext4_dir_entry_2
 *
 * EXT4_DIRENT_HASHES(entry) yields a struct ext4_dir_entry_hash pointer
 * located after the 8-byte fixed header plus (entry)->name_len bytes of
 * name, rounded up to the next EXT4_DIR_PAD boundary.
 *
 * EXT4_DIRENT_HASH(entry) and EXT4_DIRENT_MINOR_HASH(entry) return the
 * stored hash / minor_hash converted from little-endian to CPU order.
 *
 * All three macros operate on their @entry argument (the accessors used to
 * expand to a hard-coded identifier "de" instead, which only worked when the
 * caller's variable happened to have that name).  @entry is evaluated more
 * than once, so pass an expression without side effects.
 */
#define EXT4_DIRENT_HASHES(entry) \
        ((struct ext4_dir_entry_hash *) \
                (((void *)(entry)) + \
                ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND)))
#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash)
#define EXT4_DIRENT_MINOR_HASH(entry) \
                le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash)

static inline bool ext4_hash_in_dirent(const struct inode *inode)
{
        return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode);
}

/*
 * This is a bogus directory entry at the end of each leaf block that
 * records checksums.
 *
 * The first four members occupy the same 8 bytes as the fixed header of
 * struct ext4_dir_entry_2 (inode, rec_len, name_len, file_type); the
 * checksum follows, for 12 bytes in total.  EXT4_DIRENT_TAIL() locates this
 * structure at the very end of a block.
 */
struct ext4_dir_entry_tail {
        __le32        det_reserved_zero1;        /* Pretend to be unused */
        __le16        det_rec_len;                /* 12 */
        __u8        det_reserved_zero2;        /* Zero name length */
        __u8        det_reserved_ft;        /* 0xDE, fake file type */
        __le32        det_checksum;                /* crc32c(uuid+inum+dirblock) */
};

/*
 * Locate the struct ext4_dir_entry_tail occupying the last bytes of a
 * directory block that starts at @block and is @blocksize bytes long.
 */
#define EXT4_DIRENT_TAIL(block, blocksize) \
        ((struct ext4_dir_entry_tail *) \
         ((char *)(block) + \
          ((blocksize) - sizeof(struct ext4_dir_entry_tail))))

/*
 * Ext4 directory file types.  Only the low 3 bits are used.  The
 * other bits are reserved for now.
 */
#define EXT4_FT_UNKNOWN                0
#define EXT4_FT_REG_FILE        1
#define EXT4_FT_DIR                2
#define EXT4_FT_CHRDEV                3
#define EXT4_FT_BLKDEV                4
#define EXT4_FT_FIFO                5
#define EXT4_FT_SOCK                6
#define EXT4_FT_SYMLINK                7

/* One past the last valid type; get_dtype() maps anything >= this to DT_UNKNOWN */
#define EXT4_FT_MAX                8

/* Fake file type stored in ext4_dir_entry_tail.det_reserved_ft */
#define EXT4_FT_DIR_CSUM        0xDE

/*
 * EXT4_DIR_PAD defines the directory entries boundaries
 *
 * NOTE: It must be a multiple of 4
 *
 * EXT4_DIR_ROUND is the matching mask: adding it and then clearing those
 * bits rounds a length up to the next EXT4_DIR_PAD boundary (see
 * ext4_dir_rec_len()).  EXT4_MAX_REC_LEN is 65535, the largest value a
 * 16-bit rec_len field can hold.
 */
#define EXT4_DIR_PAD                        4
#define EXT4_DIR_ROUND                        (EXT4_DIR_PAD - 1)
#define EXT4_MAX_REC_LEN                ((1<<16)-1)

/*
 * Compute the record length of a directory entry whose name is @name_len
 * bytes long: 8 bytes of fixed header plus the name, rounded up to a
 * multiple of EXT4_DIR_PAD.  When @dir keeps hashes inside its entries (see
 * ext4_hash_in_dirent()), room for a struct ext4_dir_entry_hash is added
 * before rounding.  Pass NULL for @dir for the '.' and '..' entries, as
 * those do not use the extra fields.
 */
static inline unsigned int ext4_dir_rec_len(__u8 name_len,
                                                const struct inode *dir)
{
        int len = 8 + name_len + EXT4_DIR_ROUND;

        if (dir != NULL && ext4_hash_in_dirent(dir))
                len += sizeof(struct ext4_dir_entry_hash);

        return len & ~EXT4_DIR_ROUND;
}

/*
 * Decode an on-disk rec_len into a byte count.
 *
 * When PAGE_SIZE is below 64KiB the 16-bit value is the length itself.
 * With larger pages, 0 and EXT4_MAX_REC_LEN both stand for "the whole
 * block" (@blocksize), and otherwise the two low bits of the stored value
 * supply bits 16 and 17 of the length.
 *
 * If we ever get support for fs block sizes > page_size, the #if here and
 * in ext4_rec_len_to_disk() will have to be removed.
 */
static inline unsigned int
ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
#if (PAGE_SIZE >= 65536)
        unsigned raw = le16_to_cpu(dlen);

        if (raw == 0 || raw == EXT4_MAX_REC_LEN)
                return blocksize;
        return ((raw & 3) << 16) | (raw & 65532);
#else
        return le16_to_cpu(dlen);
#endif
}

/*
 * Encode a record length of @len bytes into the 16-bit on-disk rec_len.
 *
 * BUGs if @len exceeds @blocksize, if @blocksize exceeds 256KiB, or if @len
 * is not a multiple of 4.  Lengths below 64KiB are stored as they are.
 * With PAGE_SIZE >= 64KiB, a length equal to the block size is stored as
 * EXT4_MAX_REC_LEN for 64KiB blocks and as 0 for larger blocks; any other
 * length keeps bits 16 and 17 in the two (otherwise zero) low bits.
 */
static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
        BUG_ON(len > blocksize || blocksize > (1 << 18) || (len & 3));
#if (PAGE_SIZE >= 65536)
        if (len >= 65536) {
                if (len != blocksize)
                        return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
                if (blocksize != 65536)
                        return cpu_to_le16(0);
                return cpu_to_le16(EXT4_MAX_REC_LEN);
        }
#endif
        return cpu_to_le16(len);
}

/*
 * Hash Tree Directory indexing
 * (c) Daniel Phillips, 2001
 */

/*
 * is_dx(): the filesystem has the dir_index feature and @dir carries the
 * EXT4_INODE_INDEX flag.
 * EXT4_DIR_LINK_MAX(): @dir's link count has reached EXT4_LINK_MAX, unless
 * the filesystem has the dir_nlink feature and @dir is an indexed
 * directory.  The result is wrapped in unlikely().
 * EXT4_DIR_LINK_EMPTY(): @dir's link count is 1 or 2.
 *
 * All three evaluate @dir more than once.
 */
#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \
                    ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \
                    !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir)))
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)

/* Legal values for the dx_root hash_version field: */

#define DX_HASH_LEGACY                        0
#define DX_HASH_HALF_MD4                1
#define DX_HASH_TEA                        2
#define DX_HASH_LEGACY_UNSIGNED                3
#define DX_HASH_HALF_MD4_UNSIGNED        4
#define DX_HASH_TEA_UNSIGNED                5
#define DX_HASH_SIPHASH                        6

/*
 * Run @length bytes starting at @address through the checksum driver in
 * sbi->s_chksum_driver, starting from the state @crc, and return the
 * resulting 32-bit state.
 *
 * The descriptor lives on the stack with exactly 4 bytes of context, so
 * this BUGs if the driver's descriptor size is anything else, and also if
 * the update call reports an error.
 */
static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
                              const void *address, unsigned int length)
{
        struct {
                struct shash_desc shash;
                char ctx[4];
        } desc;
        u32 *state = (u32 *)desc.ctx;
        int err;

        BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx));

        desc.shash.tfm = sbi->s_chksum_driver;
        *state = crc;

        err = crypto_shash_update(&desc.shash, address, length);
        BUG_ON(err);

        return *state;
}

#ifdef __KERNEL__

/*
 * hash info structure used by the directory hash
 *
 * A pointer to this structure is the last argument of ext4fs_dirhash().
 * NOTE(review): hash_version presumably holds one of the DX_HASH_* values
 * and seed presumably points at the hash seed words - confirm against
 * hash.c, which is not part of this header.
 */
struct dx_hash_info
{
        u32                hash;
        u32                minor_hash;
        int                hash_version;
        u32                *seed;
};


/*
 * 32 and 64 bit signed EOF for dx directories: the largest positive value
 * of a signed 32-bit (0x7fffffff) and signed 64-bit (0x7fffffffffffffff)
 * integer respectively.
 */
#define EXT4_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
#define EXT4_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)


/*
 * Control parameters used by ext4_htree_next_block
 */
#define HASH_NB_ALWAYS                1

/*
 * A file name as handled by the directory code.
 *
 * usr_fname  - the name the caller passed in (a struct qstr).
 * disk_name  - name and length used by fname_name()/fname_len(); in builds
 *              without CONFIG_FS_ENCRYPTION, ext4_fname_setup_filename()
 *              points it at the bytes of usr_fname.
 * hinfo      - directory hash information for the name.
 * crypto_buf - only present with CONFIG_FS_ENCRYPTION.
 * cf_name    - only present with CONFIG_UNICODE; in builds without
 *              CONFIG_FS_ENCRYPTION, ext4_fname_free_filename() kfree()s
 *              its buffer.
 */
struct ext4_filename {
        const struct qstr *usr_fname;
        struct fscrypt_str disk_name;
        struct dx_hash_info hinfo;
#ifdef CONFIG_FS_ENCRYPTION
        struct fscrypt_str crypto_buf;
#endif
#if IS_ENABLED(CONFIG_UNICODE)
        struct fscrypt_str cf_name;
#endif
};

/* Accessors taking a pointer to struct ext4_filename */
#define fname_name(p) ((p)->disk_name.name)
#define fname_usr_name(p) ((p)->usr_fname->name)
#define fname_len(p)  ((p)->disk_name.len)

/*
 * Describe an inode's exact location on disk and in memory
 */
struct ext4_iloc
{
        struct buffer_head *bh;                /* buffer whose b_data holds the inode */
        unsigned long offset;                /* byte offset of the inode in bh->b_data */
        ext4_group_t block_group;
};

/*
 * Return a pointer to the raw inode described by @iloc, i.e. the bytes at
 * iloc->offset inside the data of iloc->bh.
 */
static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
{
        char *base = iloc->bh->b_data;

        return (struct ext4_inode *)(base + iloc->offset);
}

static inline bool ext4_is_quota_file(struct inode *inode)
{
        return IS_NOQUOTA(inode) &&
               !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL);
}

/*
 * This structure is stuffed into the struct file's private_data field
 * for directories.  It is where we put information so that we can do
 * readdir operations in hash tree order.
 *
 * It is released with ext4_htree_free_dir_info().
 * NOTE(review): the roles of the individual fields are defined by dir.c,
 * which is not visible from this header - confirm there before relying on
 * them.
 */
struct dir_private_info {
        struct rb_root        root;
        struct rb_node        *curr_node;
        struct fname        *extra_fname;
        loff_t                last_pos;
        __u32                curr_hash;
        __u32                curr_minor_hash;
        __u32                next_hash;
};

/*
 * Return the number of the first block of group @group_no: the
 * superblock's s_first_data_block plus @group_no full groups.  The
 * arithmetic is done in ext4_fsblk_t.
 */
static inline ext4_fsblk_t
ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
{
        ext4_fsblk_t blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
        ext4_fsblk_t first_data_block =
                le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);

        return group_no * blocks_per_group + first_data_block;
}

/*
 * Special error return code only used by dx_probe() and its callers.
 */
#define ERR_BAD_DX_DIR        (-(MAX_ERRNO - 1))

/*
 * htree levels for ext4: ext4_dir_htree_level() returns EXT4_HTREE_LEVEL
 * when the largedir feature is set and EXT4_HTREE_LEVEL_COMPAT otherwise.
 */
#define        EXT4_HTREE_LEVEL_COMPAT        2
#define        EXT4_HTREE_LEVEL        3

/*
 * Number of htree levels for @sb: EXT4_HTREE_LEVEL with the largedir
 * feature, EXT4_HTREE_LEVEL_COMPAT without it.
 */
static inline int ext4_dir_htree_level(struct super_block *sb)
{
        if (ext4_has_feature_largedir(sb))
                return EXT4_HTREE_LEVEL;

        return EXT4_HTREE_LEVEL_COMPAT;
}

/*
 * Timeout and state flag for lazy initialization inode thread.
 *
 * EXT4_LAZYINIT_QUIT and EXT4_LAZYINIT_RUNNING are distinct bits.
 * NOTE(review): they are presumably kept in ext4_lazy_init.li_state -
 * confirm against super.c.
 */
#define EXT4_DEF_LI_WAIT_MULT                        10
#define EXT4_DEF_LI_MAX_START_DELAY                5
#define EXT4_LAZYINIT_QUIT                        0x0001
#define EXT4_LAZYINIT_RUNNING                        0x0002

/*
 * Lazy inode table initialization info
 */
struct ext4_lazy_init {
        unsigned long                li_state;
        struct list_head        li_request_list;
        struct mutex                li_list_mtx;
};

/* Kind of work a lazy-init request performs (stored in lr_mode) */
enum ext4_li_mode {
        EXT4_LI_MODE_PREFETCH_BBITMAP,
        EXT4_LI_MODE_ITABLE,
};

/*
 * One lazy-init request, tied to a single super block.
 * NOTE(review): lr_request is presumably the link into
 * ext4_lazy_init.li_request_list, and lr_next_sched / lr_timeout are
 * presumably in jiffies - confirm against super.c.
 */
struct ext4_li_request {
        struct super_block        *lr_super;
        enum ext4_li_mode        lr_mode;
        ext4_group_t                lr_first_not_zeroed;
        ext4_group_t                lr_next_group;
        struct list_head        lr_request;
        unsigned long                lr_next_sched;
        unsigned long                lr_timeout;
};

/*
 * A kobject paired with a completion.
 * NOTE(review): presumably f_kobj_unregister is completed from the
 * kobject's release path so teardown can wait for it - confirm in sysfs.c.
 */
struct ext4_features {
        struct kobject f_kobj;
        struct completion f_kobj_unregister;
};

/*
 * This structure will be used for multiple mount protection. It will be
 * written into the block number saved in the s_mmp_block field in the
 * superblock. Programs that check MMP should assume that if
 * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
 * to use the filesystem, regardless of how old the timestamp is.
 *
 * EXT4_MMP_SEQ_MAX is EXT4_MMP_SEQ_FSCK - 1.  The fields of the structure
 * add up to 1024 bytes.
 */
#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */

struct mmp_struct {
        __le32        mmp_magic;                /* Magic number for MMP */
        __le32        mmp_seq;                /* Sequence no. updated periodically */

        /*
         * mmp_time, mmp_nodename & mmp_bdevname are only used for information
         * purposes and do not affect the correctness of the algorithm
         */
        __le64        mmp_time;                /* Time last updated */
        char        mmp_nodename[64];        /* Node which last updated MMP block */
        char        mmp_bdevname[32];        /* Bdev which last updated MMP block */

        /*
         * mmp_check_interval is used to verify if the MMP block has been
         * updated on the block device. The value is updated based on the
         * maximum time to write the MMP block during an update cycle.
         */
        __le16        mmp_check_interval;

        __le16        mmp_pad1;                /* padding */
        __le32        mmp_pad2[226];                /* padding up to the checksum */
        __le32        mmp_checksum;                /* crc32c(uuid+mmp_block) */
};

/*
 * Arguments passed to the mmp thread: the buffer head obtained by the
 * initial read of the MMP block and the super block it belongs to.
 */
struct mmpd_data {
        struct buffer_head *bh; /* bh from initial read_mmp_block() */
        struct super_block *sb;  /* super block of the fs */
};

/*
 * Check interval multiplier
 * The MMP block is written every update interval and initially checked every
 * update interval multiplied by this value (the check interval is then
 * adapted based on the write latency). The reason is that writes can be
 * delayed under load and we don't want readers to incorrectly assume that
 * the filesystem is no longer in use.
 */
#define EXT4_MMP_CHECK_MULT                2UL

/*
 * Minimum interval for MMP checking in seconds.
 */
#define EXT4_MMP_MIN_CHECK_INTERVAL        5UL

/*
 * Maximum interval for MMP checking in seconds.
 */
#define EXT4_MMP_MAX_CHECK_INTERVAL        300UL

/*
 * Function prototypes
 */

/*
 * Ok, these declarations are also in <linux/kernel.h> but none of the
 * ext4 source programs needs to include it so they are duplicated here.
 *
 * NORET_TYPE expands to nothing, ATTRIB_NORET to the compiler's noreturn
 * attribute, and NORET_AND to the bare token sequence "noreturn,".
 * NOTE(review): no user of these macros is visible in this part of the
 * header - check for remaining users before touching them.
 */
# define NORET_TYPE        /**/
# define ATTRIB_NORET        __attribute__((noreturn))
# define NORET_AND        noreturn,

/* bitmap.c */
extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
void ext4_inode_bitmap_csum_set(struct super_block *sb,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh, int sz);
int ext4_inode_bitmap_csum_verify(struct super_block *sb,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh, int sz);
void ext4_block_bitmap_csum_set(struct super_block *sb,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh);
int ext4_block_bitmap_csum_verify(struct super_block *sb,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh);

/* balloc.c */
extern void ext4_get_group_no_and_offset(struct super_block *sb,
                                         ext4_fsblk_t blocknr,
                                         ext4_group_t *blockgrpp,
                                         ext4_grpblk_t *offsetp);
extern ext4_group_t ext4_get_group_number(struct super_block *sb,
                                          ext4_fsblk_t block);

extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
                        ext4_group_t group);
extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                                         ext4_fsblk_t goal,
                                         unsigned int flags,
                                         unsigned long *count,
                                         int *errp);
extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
                                    s64 nclusters, unsigned int flags);
extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
                                                    ext4_group_t block_group,
                                                    struct buffer_head ** bh);
extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
                                                   ext4_group_t group);
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);

extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
                                                ext4_group_t block_group,
                                                bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
                                  ext4_group_t block_group,
                                  struct buffer_head *bh);
extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
                                                  ext4_group_t block_group);
extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
                                              ext4_group_t block_group,
                                              struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);

#if IS_ENABLED(CONFIG_UNICODE)
extern int ext4_fname_setup_ci_filename(struct inode *dir,
                                         const struct qstr *iname,
                                         struct ext4_filename *fname);
#endif

/* ext4 encryption related stuff goes here crypto.c */
#ifdef CONFIG_FS_ENCRYPTION
extern const struct fscrypt_operations ext4_cryptops;

int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
                              int lookup, struct ext4_filename *fname);

int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
                              struct ext4_filename *fname);

void ext4_fname_free_filename(struct ext4_filename *fname);

int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg);

#else /* !CONFIG_FS_ENCRYPTION */
static inline int ext4_fname_setup_filename(struct inode *dir,
                                            const struct qstr *iname,
                                            int lookup,
                                            struct ext4_filename *fname)
{
        int err = 0;
        fname->usr_fname = iname;
        fname->disk_name.name = (unsigned char *) iname->name;
        fname->disk_name.len = iname->len;

#if IS_ENABLED(CONFIG_UNICODE)
        err = ext4_fname_setup_ci_filename(dir, iname, fname);
#endif

        return err;
}

/*
 * Variant used when CONFIG_FS_ENCRYPTION is off: set up @fname from the
 * name of @dentry, passing lookup = 1 to ext4_fname_setup_filename().
 */
static inline int ext4_fname_prepare_lookup(struct inode *dir,
                                            struct dentry *dentry,
                                            struct ext4_filename *fname)
{
        const struct qstr *iname = &dentry->d_name;

        return ext4_fname_setup_filename(dir, iname, 1, fname);
}

/*
 * Variant used when CONFIG_FS_ENCRYPTION is off: the only buffer that may
 * need freeing is the casefolded name, which exists only with
 * CONFIG_UNICODE.  The pointer is cleared after the kfree().
 */
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
#if IS_ENABLED(CONFIG_UNICODE)
        struct fscrypt_str *cf = &fname->cf_name;

        kfree(cf->name);
        cf->name = NULL;
#endif
}

/*
 * Variant used when CONFIG_FS_ENCRYPTION is off: the request is always
 * refused with -EOPNOTSUPP; neither argument is looked at.
 */
static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp,
                                                   void __user *arg)
{
        return -EOPNOTSUPP;
}
#endif /* !CONFIG_FS_ENCRYPTION */

/* dir.c */
extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
                                  struct file *,
                                  struct ext4_dir_entry_2 *,
                                  struct buffer_head *, char *, int,
                                  unsigned int);
/*
 * Call __ext4_check_dir_entry() with the caller's function name and line
 * number filled in.  The result is wrapped in unlikely(), i.e. a non-zero
 * return is treated as the rare case.
 */
#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
        unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
                                (de), (bh), (buf), (size), (offset)))
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                __u32 minor_hash,
                                struct ext4_dir_entry_2 *dirent,
                                struct fscrypt_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
                             struct buffer_head *bh,
                             void *buf, int buf_size,
                             struct ext4_filename *fname,
                             struct ext4_dir_entry_2 **dest_de);
void ext4_insert_dentry(struct inode *dir, struct inode *inode,
                        struct ext4_dir_entry_2 *de,
                        int buf_size,
                        struct ext4_filename *fname);
/*
 * Drop the EXT4_INODE_INDEX flag from @inode when the filesystem does not
 * have the dir_index feature.  Warns once if that combination shows up on
 * a filesystem with metadata checksums.
 */
static inline void ext4_update_dx_flag(struct inode *inode)
{
        if (ext4_has_feature_dir_index(inode->i_sb))
                return;
        if (!ext4_test_inode_flag(inode, EXT4_INODE_INDEX))
                return;

        /* ext4_iget() should have caught this... */
        WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
        ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
/*
 * DT_* value for each on-disk file type, indexed by the EXT4_FT_* codes
 * 0..7 (EXT4_FT_UNKNOWN through EXT4_FT_SYMLINK); used by get_dtype().
 */
static const unsigned char ext4_filetype_table[] = {
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};

/*
 * Translate an on-disk directory entry file type into a DT_* value.
 *
 * Returns DT_UNKNOWN when the filesystem lacks the filetype feature or
 * when @filetype is not a valid index into ext4_filetype_table.
 */
static inline  unsigned char get_dtype(struct super_block *sb, int filetype)
{
        if (!ext4_has_feature_filetype(sb))
                return DT_UNKNOWN;

        /* @filetype is signed: reject negative values as well as large ones */
        if (filetype < 0 || filetype >= EXT4_FT_MAX)
                return DT_UNKNOWN;

        return ext4_filetype_table[filetype];
}
extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
                             void *buf, int buf_size);

/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);

/* hash.c */
extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
                          struct dx_hash_info *hinfo);

/* ialloc.c */
extern int ext4_mark_inode_used(struct super_block *sb, int ino);
extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *,
                                      struct inode *, umode_t,
                                      const struct qstr *qstr, __u32 goal,
                                      uid_t *owner, __u32 i_flags,
                                      int handle_type, unsigned int line_no,
                                      int nblocks);

/*
 * Convenience wrappers around __ext4_new_inode().
 *
 * ext4_new_inode() is for callers that already hold a handle: it passes
 * &nop_mnt_idmap and zero for handle_type, line_no and nblocks.
 *
 * ext4_new_inode_start_handle() passes a NULL handle and zero i_flags, and
 * forwards @type and @nblocks together with the caller's __LINE__.
 */
#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags)          \
        __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr),      \
                         (goal), (owner), i_flags, 0, 0, 0)
#define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \
                                    type, nblocks)                    \
        __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \
                         0, (type), __LINE__, (nblocks))


extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
extern unsigned long ext4_count_dirs(struct super_block *);
extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
extern int ext4_init_inode_table(struct super_block *sb,
                                 ext4_group_t group, int barrier);
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);

/* fast_commit.c */
int ext4_fc_info_show(struct seq_file *seq, void *v);
void ext4_fc_init(struct super_block *sb, journal_t *journal);
void ext4_fc_init_inode(struct inode *inode);
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
                         ext4_lblk_t end);
void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode,
        struct dentry *dentry);
void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
        struct dentry *dentry);
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
                            struct dentry *dentry);
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
void ext4_fc_start_update(struct inode *inode);
void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
int __init ext4_fc_init_dentry_cache(void);
void ext4_fc_destroy_dentry_cache(void);
int ext4_fc_record_regions(struct super_block *sb, int ino,
                           ext4_lblk_t lblk, ext4_fsblk_t pblk,
                           int len, int replay);

/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
extern void ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
                                struct ext4_allocation_request *, int *);
extern void ext4_discard_preallocations(struct inode *);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
                                     ext4_group_t group,
                                     unsigned int nr, int *cnt);
extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
                                  unsigned int nr);

extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                             struct buffer_head *bh, ext4_fsblk_t block,
                             unsigned long count, int flags);
extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
                                   ext4_group_t ngroups);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
                                ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
                            int len, bool state);
static inline bool ext4_mb_cr_expensive(enum criteria cr)
{
        return cr >= CR_GOAL_LEN_SLOW;
}

/* inode.c */
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
                         struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
                     bool wait, struct buffer_head **bhs);
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
                           struct inode *inode,
                           struct buffer_head *head,
                           unsigned from,
                           unsigned to,
                           int *partial,
                           int (*fn)(handle_t *handle, struct inode *inode,
                                     struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
                                struct buffer_head *bh);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA         2

/*
 * Flags for __ext4_iget() / ext4_iget().  Apart from EXT4_IGET_NORMAL (0)
 * every value is a distinct bit.
 */
typedef enum {
        EXT4_IGET_NORMAL =        0,
        EXT4_IGET_SPECIAL =        0x0001, /* OK to iget a system inode */
        EXT4_IGET_HANDLE =         0x0002,        /* Inode # is from a handle */
        EXT4_IGET_BAD =                0x0004, /* Allow to iget a bad inode */
        EXT4_IGET_EA_INODE =        0x0008        /* Inode should contain an EA value */
} ext4_iget_flags;

extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
                                 ext4_iget_flags flags, const char *function,
                                 unsigned int line);

/* Call __ext4_iget() with the caller's function name and line number */
#define ext4_iget(sb, ino, flags) \
        __ext4_iget((sb), (ino), (flags), __func__, __LINE__)

extern int  ext4_write_inode(struct inode *, struct writeback_control *);
extern int  ext4_setattr(struct mnt_idmap *, struct dentry *,
                         struct iattr *);
extern u32  ext4_dio_alignment(struct inode *inode);
extern int  ext4_getattr(struct mnt_idmap *, const struct path *,
                         struct kstat *, u32, unsigned int);
extern void ext4_evict_inode(struct inode *);
extern void ext4_clear_inode(struct inode *);
extern int  ext4_file_getattr(struct mnt_idmap *, const struct path *,
                              struct kstat *, u32, unsigned int);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
                          struct ext4_iloc *iloc);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_release_space(struct inode *inode, int to_free);
extern void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
                              ext4_fsblk_t pblk, ext4_lblk_t len);

/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_map_blocks *map, int flags);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
                                 ext4_lblk_t start, ext4_lblk_t end);

/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
int ext4_fileattr_set(struct mnt_idmap *idmap,
                      struct dentry *dentry, struct fileattr *fa);
int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
int ext4_update_overhead(struct super_block *sb, bool force);
int ext4_force_shutdown(struct super_block *sb, u32 flags);

/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
extern int ext4_ind_migrate(struct inode *inode);

/* namei.c */
extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
                             struct inode *inode);
extern int ext4_dirblock_csum_verify(struct inode *inode,
                                     struct buffer_head *bh);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
                                __u32 start_minor_hash, __u32 *next_hash);
extern int ext4_search_dir(struct buffer_head *bh,
                           char *search_buf,
                           int buf_size,
                           struct inode *dir,
                           struct ext4_filename *fname,
                           unsigned int offset,
                           struct ext4_dir_entry_2 **res_dir);
extern int ext4_generic_delete_entry(struct inode *dir,
                                     struct ext4_dir_entry_2 *de_del,
                                     struct buffer_head *bh,
                                     void *entry_buf,
                                     int buf_size,
                                     int csum_size);
extern bool ext4_empty_dir(struct inode *inode);

/* resize.c */
extern void ext4_kvfree_array_rcu(void *to_free);
extern int ext4_group_add(struct super_block *sb,
                                struct ext4_new_group_data *input);
extern int ext4_group_extend(struct super_block *sb,
                                struct ext4_super_block *es,
                                ext4_fsblk_t n_blocks_count);
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
extern unsigned int ext4_list_backups(struct super_block *sb,
                                      unsigned int *three, unsigned int *five,
                                      unsigned int *seven);

/* super.c */
extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
                                         sector_t block, blk_opf_t op_flags);
extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
                                                   sector_t block);
extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
                                bh_end_io_t *end_io);
extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
                        bh_end_io_t *end_io);
extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern __le32 ext4_superblock_csum(struct super_block *sb,
                                   struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
                                    ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
                                     char nbuf[16]);
extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
                                             ext4_group_t block_group,
                                             unsigned int flags);
extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
                                              ext4_group_t block_group);

/*
 * Error/warning/message reporting implementations.  __printf(m, n) marks
 * parameter m as the printf-style format string and parameter n as the start
 * of the variadic arguments, so the compiler can type-check call sites.
 * Most callers reach these through the wrapper macros defined further down,
 * which supply __func__ and __LINE__ for the "function" and "line" slots.
 */
extern __printf(7, 8)
void __ext4_error(struct super_block *, const char *, unsigned int, bool,
                  int, __u64, const char *, ...);
extern __printf(6, 7)
void __ext4_error_inode(struct inode *, const char *, unsigned int,
                        ext4_fsblk_t, int, const char *, ...);
extern __printf(5, 6)
void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
                     const char *, ...);
/* Not variadic: takes only the function name, line and error number. */
extern void __ext4_std_error(struct super_block *, const char *,
                             unsigned int, int);
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
                    const char *, ...);
extern __printf(4, 5)
void __ext4_warning_inode(const struct inode *inode, const char *function,
                          unsigned int line, const char *fmt, ...);
extern __printf(3, 4)
void __ext4_msg(struct super_block *, const char *, const char *, ...);
/* Not variadic: the final parameter is a plain message string. */
extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
                           const char *, unsigned int, const char *);
/* Unlike the others, function name and line come first, before the sb. */
extern __printf(7, 8)
void __ext4_grp_locked_error(const char *, unsigned int,
                             struct super_block *, ext4_group_t,
                             unsigned long, ext4_fsblk_t,
                             const char *, ...);

/*
 * Convenience wrappers that fill in the caller's __func__ and __LINE__.
 *
 * EXT4_ERROR_INODE       - inode error, block 0, no error code
 * EXT4_ERROR_INODE_ERR   - inode error, block 0, explicit error code
 * ext4_error_inode_block - inode error with explicit block and error code
 * EXT4_ERROR_FILE        - file error with explicit block
 * ext4_abort             - __ext4_error() with its bool argument set to
 *                          true and block 0
 */
#define EXT4_ERROR_INODE(inode, fmt, ...)                                \
        ext4_error_inode((inode), __func__, __LINE__, 0, (fmt),                \
                         ##__VA_ARGS__)

#define EXT4_ERROR_INODE_ERR(inode, err, fmt, ...)                        \
        __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt),\
                           ##__VA_ARGS__)

#define ext4_error_inode_block(inode, block, err, fmt, ...)                \
        __ext4_error_inode((inode), __func__, __LINE__, (block), (err),        \
                           (fmt), ##__VA_ARGS__)

#define EXT4_ERROR_FILE(file, block, fmt, ...)                                \
        ext4_error_file((file), __func__, __LINE__, (block), (fmt),        \
                        ##__VA_ARGS__)

#define ext4_abort(sb, err, fmt, ...)                                        \
        __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt),        \
                     ##__VA_ARGS__)

/*
 * Logging front ends, provided in two flavours.
 *
 * With CONFIG_PRINTK each macro forwards its format string and arguments
 * (plus __func__/__LINE__ where the macro supplies them) to the matching
 * __ext4_*() implementation.
 *
 * Without CONFIG_PRINTK the format and arguments are handed only to
 * no_printk(), and the implementation is still called, but with an empty
 * function name, line 0 and a " " format.
 * NOTE(review): no_printk() presumably only type-checks its arguments --
 * confirm against its definition.
 */
#ifdef CONFIG_PRINTK

#define ext4_error_inode(inode, func, line, block, fmt, ...)                \
        __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__)
#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...)        \
        __ext4_error_inode((inode), (func), (line), (block),                 \
                           (err), (fmt), ##__VA_ARGS__)
#define ext4_error_file(file, func, line, block, fmt, ...)                \
        __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
#define ext4_error(sb, fmt, ...)                                        \
        __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt),        \
                ##__VA_ARGS__)
#define ext4_error_err(sb, err, fmt, ...)                                \
        __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt),        \
                ##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...)                                        \
        __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_warning_inode(inode, fmt, ...)                                \
        __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_msg(sb, level, fmt, ...)                                \
        __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
#define dump_mmp_msg(sb, mmp, msg)                                        \
        __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)                \
        __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
                                fmt, ##__VA_ARGS__)

#else

/*
 * !CONFIG_PRINTK variants: the func/line/level/msg arguments supplied by the
 * caller are intentionally not forwarded.
 */
#define ext4_error_inode(inode, func, line, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_inode(inode, "", 0, block, 0, " ");                \
} while (0)
#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...)        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_inode(inode, "", 0, block, err, " ");                \
} while (0)
#define ext4_error_file(file, func, line, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_file(file, "", 0, block, " ");                        \
} while (0)
#define ext4_error(sb, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error(sb, "", 0, false, 0, 0, " ");                        \
} while (0)
#define ext4_error_err(sb, err, fmt, ...)                                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error(sb, "", 0, false, err, 0, " ");                        \
} while (0)
#define ext4_warning(sb, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_warning(sb, "", 0, " ");                                        \
} while (0)
#define ext4_warning_inode(inode, fmt, ...)                                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_warning_inode(inode, "", 0, " ");                        \
} while (0)
#define ext4_msg(sb, level, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_msg(sb, "", " ");                                        \
} while (0)
/* The msg argument is dropped here; an empty string is passed instead. */
#define dump_mmp_msg(sb, mmp, msg)                                        \
        __dump_mmp_msg(sb, mmp, "", 0, "")
#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                \
        __ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");        \
} while (0)

#endif /* CONFIG_PRINTK */

/*
 * Group descriptor accessors.  Each getter below has a matching *_set()
 * counterpart taking the same (sb, bg) pair plus the new value.
 */

/* Getters returning a filesystem block number. */
extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                                     struct ext4_group_desc *bg);
/* Getters returning a 32-bit count. */
extern __u32 ext4_free_group_clusters(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern __u32 ext4_free_inodes_count(struct super_block *sb,
                                 struct ext4_group_desc *bg);
extern __u32 ext4_used_dirs_count(struct super_block *sb,
                                struct ext4_group_desc *bg);
extern __u32 ext4_itable_unused_count(struct super_block *sb,
                                   struct ext4_group_desc *bg);
/* Setters for the block-number fields. */
extern void ext4_block_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_table_set(struct super_block *sb,
                                 struct ext4_group_desc *bg, ext4_fsblk_t blk);
/* Setters for the count fields. */
extern void ext4_free_group_clusters_set(struct super_block *sb,
                                         struct ext4_group_desc *bg,
                                         __u32 count);
extern void ext4_free_inodes_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
extern void ext4_used_dirs_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
extern void ext4_itable_unused_set(struct super_block *sb,
                                   struct ext4_group_desc *bg, __u32 count);
/* Group descriptor checksum: verify and (re)compute for the given group. */
extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
                                       struct ext4_group_desc *gdp);
extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
                                     struct ext4_group_desc *gdp);
extern int ext4_register_li_request(struct super_block *sb,
                                    ext4_group_t first_not_zeroed);

/*
 * Return 1 when the metadata_csum feature is set and a checksum driver is
 * attached to the superblock info, 0 otherwise.  Having the feature without
 * a driver triggers a one-time warning.
 */
static inline int ext4_has_metadata_csum(struct super_block *sb)
{
        WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
                     EXT4_SB(sb)->s_chksum_driver == NULL);

        if (!ext4_has_feature_metadata_csum(sb))
                return 0;

        return EXT4_SB(sb)->s_chksum_driver != NULL;
}

/*
 * Return 1 when group descriptors carry a checksum, i.e. when either the
 * gdt_csum feature is set or metadata checksumming is active; 0 otherwise.
 */
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
        if (ext4_has_feature_gdt_csum(sb))
                return 1;

        return ext4_has_metadata_csum(sb) != 0;
}

/*
 * Read a 64-bit superblock value stored as a pair of little-endian 32-bit
 * fields named <name>_lo and <name>_hi.  The high half is only used when
 * the superblock has EXT4_FEATURE_INCOMPAT_64BIT set; otherwise it is
 * treated as zero.
 *
 * @es is parenthesised at every use so that any pointer expression (for
 * example "base + 1" or "&array[i]") may be passed.  Note that @es is
 * evaluated more than once, so it must not have side effects.
 */
#define ext4_read_incompat_64bit_val(es, name) \
        (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \
                ? (ext4_fsblk_t)le32_to_cpu((es)->name##_hi) << 32 : 0) | \
                le32_to_cpu((es)->name##_lo))

/*
 * Return the superblock's s_blocks_count value, combining the _lo half with
 * the _hi half when the 64bit feature is enabled.
 */
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t nblocks;

        nblocks = ext4_read_incompat_64bit_val(es, s_blocks_count);
        return nblocks;
}

/*
 * Return the superblock's s_r_blocks_count value, combining the _lo half
 * with the _hi half when the 64bit feature is enabled.
 */
static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t nreserved;

        nreserved = ext4_read_incompat_64bit_val(es, s_r_blocks_count);
        return nreserved;
}

/*
 * Return the superblock's s_free_blocks_count value, combining the _lo half
 * with the _hi half when the 64bit feature is enabled.
 */
static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t nfree;

        nfree = ext4_read_incompat_64bit_val(es, s_free_blocks_count);
        return nfree;
}

/*
 * Store @blk into the superblock's s_blocks_count_lo/_hi pair as two
 * little-endian 32-bit halves.  Both halves are always written.
 */
static inline void ext4_blocks_count_set(struct ext4_super_block *es,
                                         ext4_fsblk_t blk)
{
        u32 low = (u32)blk;
        u32 high = blk >> 32;

        es->s_blocks_count_lo = cpu_to_le32(low);
        es->s_blocks_count_hi = cpu_to_le32(high);
}

/*
 * Store @blk into the superblock's s_free_blocks_count_lo/_hi pair as two
 * little-endian 32-bit halves.  Both halves are always written.
 */
static inline void ext4_free_blocks_count_set(struct ext4_super_block *es,
                                              ext4_fsblk_t blk)
{
        u32 low = (u32)blk;
        u32 high = blk >> 32;

        es->s_free_blocks_count_lo = cpu_to_le32(low);
        es->s_free_blocks_count_hi = cpu_to_le32(high);
}

/*
 * Store @blk into the superblock's s_r_blocks_count_lo/_hi pair as two
 * little-endian 32-bit halves.  Both halves are always written.
 */
static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
                                           ext4_fsblk_t blk)
{
        u32 low = (u32)blk;
        u32 high = blk >> 32;

        es->s_r_blocks_count_lo = cpu_to_le32(low);
        es->s_r_blocks_count_hi = cpu_to_le32(high);
}

static inline loff_t ext4_isize(struct super_block *sb,
                                struct ext4_inode *raw_inode)
{
        if (ext4_has_feature_largedir(sb) ||
            S_ISREG(le16_to_cpu(raw_inode->i_mode)))
                return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
                        le32_to_cpu(raw_inode->i_size_lo);

        return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
}

/*
 * Encode @i_size into the on-disk inode: the low 32 bits go to i_size_lo
 * and the next 32 bits to i_size_high, both little-endian.
 */
static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
{
        raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
        raw_inode->i_size_lo = cpu_to_le32(i_size);
}

/*
 * Return the current number of block groups.
 *
 * A read of s_groups_count must be followed by smp_rmb(); this helper
 * packages the two together.  The locking protocol is documented in the
 * comments of ext4_group_add() in resize.c.
 */
static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
{
        ext4_group_t count;

        count = EXT4_SB(sb)->s_groups_count;
        /* Barrier must come after the load above and before the return. */
        smp_rmb();

        return count;
}

/*
 * Map a block group number to its flex group index by shifting it right by
 * s_log_groups_per_flex.
 */
static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
                                             ext4_group_t block_group)
{
        ext4_group_t flex_index = block_group >> sbi->s_log_groups_per_flex;

        return flex_index;
}

/*
 * Return the number of block groups per flex group, i.e.
 * 2^s_log_groups_per_flex.
 *
 * The shift is done on an unsigned constant so the result is well defined
 * for every exponent up to 31; shifting a signed 1 left by 31 would
 * overflow int.
 * NOTE(review): assumes s_log_groups_per_flex < 32 -- confirm it is
 * validated where the superblock is read.
 */
static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
{
        return 1U << sbi->s_log_groups_per_flex;
}

/*
 * Report @errno through __ext4_std_error() when it is non-zero, tagging the
 * report with the caller's __func__ and __LINE__.  Does nothing for 0.
 *
 * @errno is evaluated exactly once and held in an int (the type
 * __ext4_std_error() takes), so an argument with side effects behaves as
 * the caller expects.
 */
#define ext4_std_error(sb, errno)                                \
do {                                                                \
        int __ext4_std_err = (errno);                                \
                                                                \
        if (__ext4_std_err)                                        \
                __ext4_std_error((sb), __func__, __LINE__,        \
                                 __ext4_std_err);                \
} while (0)

#ifdef CONFIG_SMP
/*
 * Each CPU can accumulate percpu_counter_batch clusters in its local
 * counter, so we need to make sure we have more free clusters than
 * percpu_counter_batch * nr_cpu_ids.  The watermark is four times that
 * amount to leave an extra safety window.
 */
#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
#else
/* Without CONFIG_SMP no watermark is used. */
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif

/*
 * Raise i_disksize to @newsize if @newsize is larger; never shrinks it.
 * The caller must hold i_rwsem to avoid races with truncate (a regular
 * file whose inode is not locked triggers a one-time warning).  The update
 * itself runs under i_data_sem held for write.
 */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
        WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode));

        down_write(&EXT4_I(inode)->i_data_sem);

        if (EXT4_I(inode)->i_disksize < newsize)
                WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);

        up_write(&EXT4_I(inode)->i_data_sem);
}

/*
 * Grow i_size and/or i_disksize to @newsize when @newsize is larger than
 * the current value.  Requires i_rwsem to avoid races with truncate.
 *
 * Returns a bitmask: bit 0 (value 1) is set when i_size was updated and
 * bit 1 (value 2) when i_disksize was updated; 0 means nothing changed.
 */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
        int changed = 0;

        if (inode->i_size < newsize) {
                i_size_write(inode, newsize);
                changed |= 1;
        }

        if (EXT4_I(inode)->i_disksize < newsize) {
                ext4_update_i_disksize(inode, newsize);
                changed |= 2;
        }

        return changed;
}

/*
 * Takes the byte range [offset, offset + len) of @inode.
 * NOTE(review): presumably brings i_disksize up to date before a hole
 * punch -- the definition is not visible here; confirm.
 */
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
                                      loff_t len);

/*
 * Per-block-group in-memory bookkeeping.
 * NOTE(review): presumably owned by the multiblock allocator, judging by the
 * EXT4_MB_GRP_* accessors -- confirm.
 *
 * bb_counters[] is a flexible array member and must stay last.
 */
struct ext4_group_info {
        unsigned long   bb_state;        /* bit flags, see EXT4_GROUP_INFO_*_BIT */
#ifdef AGGRESSIVE_CHECK
        unsigned long        bb_check_counter;
#endif
        struct rb_root  bb_free_root;
        ext4_grpblk_t        bb_first_free;        /* first free block */
        ext4_grpblk_t        bb_free;        /* total free blocks */
        ext4_grpblk_t        bb_fragments;        /* nr of freespace fragments */
        int                bb_avg_fragment_size_order;        /* order of average
                                                           fragment in BG */
        ext4_grpblk_t        bb_largest_free_order;/* order of largest frag in BG */
        ext4_group_t        bb_group;        /* Group number */
        struct          list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
        void            *bb_bitmap;
#endif
        struct rw_semaphore alloc_sem;
        /* List linkage; presumably keyed by the two *_order fields above. */
        struct list_head bb_avg_fragment_size_node;
        struct list_head bb_largest_free_order_node;
        ext4_grpblk_t        bb_counters[];        /* Nr of free power-of-two-block
                                         * regions, index is order.
                                         * bb_counters[3] = 5 means
                                         * 5 free 8-block regions. */
};

/*
 * Bit numbers within ext4_group_info::bb_state (the *_BIT names), plus
 * ready-made masks for the two "bitmap corrupt" bits.
 */
#define EXT4_GROUP_INFO_NEED_INIT_BIT                0
#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT                1
#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT        2
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT        3
/* Mask forms of bits 2 and 3. */
#define EXT4_GROUP_INFO_BBITMAP_CORRUPT                \
        (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT                \
        (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_BBITMAP_READ_BIT        4

/*
 * Accessors for individual bits of grp->bb_state.  @grp is a pointer to a
 * struct ext4_group_info.
 */

/* Read-only tests. */
#define EXT4_MB_GRP_NEED_INIT(grp)                                        \
        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state))
#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp)                                \
        (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &(grp)->bb_state))
#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)                                \
        (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &(grp)->bb_state))

/* "Was trimmed" bit: test, set and clear. */
#define EXT4_MB_GRP_WAS_TRIMMED(grp)                                        \
        (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &(grp)->bb_state))
#define EXT4_MB_GRP_SET_TRIMMED(grp)                                        \
        (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &(grp)->bb_state))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)                                        \
        (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &(grp)->bb_state))

/* Set the "block bitmap read" bit and yield its previous value. */
#define EXT4_MB_GRP_TEST_AND_SET_READ(grp)                                \
        (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &(grp)->bb_state))

/*
 * Bounds for the s_lock_busy contention counter: ext4_lock_group() never
 * raises it above EXT4_MAX_CONTENTION, and ext4_fs_is_busy() reports "busy"
 * once it exceeds EXT4_CONTENTION_THRESHOLD.
 */
#define EXT4_MAX_CONTENTION                8
#define EXT4_CONTENTION_THRESHOLD        2

/*
 * Look up the spinlock covering block group @group in the superblock's
 * s_blockgroup_lock table.
 */
static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
                                              ext4_group_t group)
{
        spinlock_t *lock = bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);

        return lock;
}

/*
 * Returns true if the filesystem is busy enough that attempts to
 * access the block group locks have run into contention, i.e. the
 * s_lock_busy counter is above EXT4_CONTENTION_THRESHOLD.
 */
static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
{
        int contention = atomic_read(&sbi->s_lock_busy);

        return contention > EXT4_CONTENTION_THRESHOLD;
}

/*
 * Take the spinlock for block group @group, maintaining the s_lock_busy
 * contention counter along the way: it is decremented (but not below 0)
 * when the lock is free, and incremented (but not above
 * EXT4_MAX_CONTENTION) when we have to wait for it.
 */
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
        spinlock_t *lock = ext4_group_lock_ptr(sb, group);

        if (!spin_trylock(lock)) {
                /*
                 * The lock is busy, so bump the contention counter,
                 * and then wait on the spin lock.
                 */
                atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
                                  EXT4_MAX_CONTENTION);
                spin_lock(lock);
                return;
        }

        /*
         * We were able to grab the lock right away, so drop the
         * lock contention counter.
         */
        atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
}

/* Release the spinlock for block group @group taken by ext4_lock_group(). */
static inline void ext4_unlock_group(struct super_block *sb,
                                        ext4_group_t group)
{
        spinlock_t *lock = ext4_group_lock_ptr(sb, group);

        spin_unlock(lock);
}

#ifdef CONFIG_QUOTA
/*
 * True when the QUOTA mount option is set or the filesystem has the quota
 * feature.
 */
static inline bool ext4_quota_capable(struct super_block *sb)
{
        if (test_opt(sb, QUOTA))
                return true;

        return ext4_has_feature_quota(sb);
}

/*
 * True when the filesystem has the quota feature, or when a user or group
 * quota file name is recorded in s_qf_names[].
 */
static inline bool ext4_is_quota_journalled(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        if (ext4_has_feature_quota(sb))
                return true;

        return sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA];
}

int ext4_enable_quotas(struct super_block *sb);
#endif /* CONFIG_QUOTA */

/*
 * Block validity checking
 *
 * Both macros call ext4_check_blockref() with the caller's __func__ and
 * __LINE__, a pointer to an array of block references and the number of
 * entries in that array.
 */

/* Check the references in bh->b_data; EXT4_ADDR_PER_BLOCK(sb) entries. */
#define ext4_check_indirect_blockref(inode, bh)                                \
        ext4_check_blockref(__func__, __LINE__, inode,                        \
                            (__le32 *)(bh)->b_data,                        \
                            EXT4_ADDR_PER_BLOCK((inode)->i_sb))

/* Check the first EXT4_NDIR_BLOCKS references in the inode's i_data[]. */
#define ext4_ind_check_inode(inode)                                        \
        ext4_check_blockref(__func__, __LINE__, inode,                        \
                            EXT4_I(inode)->i_data,                        \
                            EXT4_NDIR_BLOCKS)

/*
 * Inode and file operations
 */

/* dir.c */
extern const struct file_operations ext4_dir_operations;

/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);

/*
 * inline.c
 *
 * Prototypes only.  Several of these report through an int *has_inline_data
 * (or *has_inline) out-parameter.
 * NOTE(review): presumably set to non-zero when the inode turned out to use
 * inline data -- confirm in inline.c.
 */
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);

int ext4_readpage_inline(struct inode *inode, struct folio *folio);
/* The write-begin style helpers hand a page back through *pagep. */
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
                                         struct inode *inode,
                                         loff_t pos, unsigned len,
                                         struct page **pagep);
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
                               unsigned copied, struct folio *folio);
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
                                           struct inode *inode,
                                           loff_t pos, unsigned len,
                                           struct page **pagep,
                                           void **fsdata);
/* Directory operations on inline directories. */
extern int ext4_try_add_inline_entry(handle_t *handle,
                                     struct ext4_filename *fname,
                                     struct inode *dir, struct inode *inode);
extern int ext4_try_create_inline_dir(handle_t *handle,
                                      struct inode *parent,
                                      struct inode *inode);
extern int ext4_read_inline_dir(struct file *filp,
                                struct dir_context *ctx,
                                int *has_inline_data);
extern int ext4_inlinedir_to_tree(struct file *dir_file,
                                  struct inode *dir, ext4_lblk_t block,
                                  struct dx_hash_info *hinfo,
                                  __u32 start_hash, __u32 start_minor_hash,
                                  int *has_inline_data);
extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
                                        struct ext4_filename *fname,
                                        struct ext4_dir_entry_2 **res_dir,
                                        int *has_inline_data);
extern int ext4_delete_inline_entry(handle_t *handle,
                                    struct inode *dir,
                                    struct ext4_dir_entry_2 *de_del,
                                    struct buffer_head *bh,
                                    int *has_inline_data);
extern bool empty_inline_dir(struct inode *dir, int *has_inline_data);
extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
                                        struct ext4_dir_entry_2 **parent_de,
                                        int *retval);
extern void *ext4_read_inline_link(struct inode *inode);

/* Forward declaration: only a pointer to struct iomap is needed below. */
struct iomap;
extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);

extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline);

extern int ext4_convert_inline_data(struct inode *inode);

/*
 * Return 1 when the inode has the EXT4_INODE_INLINE_DATA flag set and a
 * non-zero i_inline_off, 0 otherwise.
 */
static inline int ext4_has_inline_data(struct inode *inode)
{
        if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                return 0;

        return EXT4_I(inode)->i_inline_off != 0;
}

/*
 * namei.c
 *
 * Prototypes only; the definitions are not part of this header.
 */
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
/* NOTE(review): presumably writes the "." and ".." entries at @de -- confirm. */
extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
                                 struct ext4_dir_entry_2 *de,
                                 int blocksize, int csum_size,
                                 unsigned int parent_ino, int dotdot_real_len);
extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
                                        unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
                                      struct buffer_head *bh);
/* Double-underscore variants of unlink/link. */
extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
                         struct inode *inode, struct dentry *dentry);
extern int __ext4_link(struct inode *dir, struct inode *inode,
                       struct dentry *dentry);

/*
 * Table mapping the file-type bits of a mode, (mode & S_IFMT) >> S_SHIFT,
 * to the EXT4_FT_* value stored in a directory entry.  Slots that are not
 * listed below are zero-initialised.  Being "static const" in a header,
 * every translation unit that includes it gets its own copy.
 */
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
        [S_IFREG >> S_SHIFT]        = EXT4_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]        = EXT4_FT_DIR,
        [S_IFCHR >> S_SHIFT]        = EXT4_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]        = EXT4_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]        = EXT4_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]        = EXT4_FT_SOCK,
        [S_IFLNK >> S_SHIFT]        = EXT4_FT_SYMLINK,
};

/*
 * Record the file type derived from @mode in directory entry @de.  Nothing
 * is written unless the filesystem has the filetype feature.
 */
static inline void ext4_set_de_type(struct super_block *sb,
                                    struct ext4_dir_entry_2 *de,
                                    umode_t mode)
{
        if (!ext4_has_feature_filetype(sb))
                return;

        de->file_type = ext4_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}

/* readpages.c */
/* Takes both a readahead control and a single folio. */
extern int ext4_mpage_readpages(struct inode *inode,
                struct readahead_control *rac, struct folio *folio);
/* Init/exit pair; the init function is marked __init. */
extern int __init ext4_init_post_read_processing(void);
extern void ext4_exit_post_read_processing(void);

/* symlink.c: the three inode_operations tables declared for symlinks */
extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;

/* sysfs.c */
extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi);
/* Per-superblock register/unregister pair. */
extern int ext4_register_sysfs(struct super_block *sb);
extern void ext4_unregister_sysfs(struct super_block *sb);
/* Global init/exit pair; the init function is marked __init. */
extern int __init ext4_init_sysfs(void);
extern void ext4_exit_sysfs(void);

/* block_validity */
/* Per-superblock setup/release of the system zone. */
extern void ext4_release_system_zone(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
/* Global init/exit pair; the init function is marked __init. */
extern int __init ext4_init_system_zone(void);
extern void ext4_exit_system_zone(void);
/* Validate a run of @count blocks starting at @start_blk. */
extern int ext4_inode_block_valid(struct inode *inode,
                                  ext4_fsblk_t start_blk,
                                  unsigned int count);
/*
 * Arguments, as passed by the ext4_check_indirect_blockref() and
 * ext4_ind_check_inode() macros: function name, line, inode, array of
 * block references, number of entries.
 */
extern int ext4_check_blockref(const char *, unsigned int,
                               struct inode *, __le32 *, unsigned int);
extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
                                ext4_fsblk_t start_blk, unsigned int count);


/* extents.c */
/* Forward declarations: only pointers to these types are used below. */
struct ext4_ext_path;
struct ext4_extent;

/*
 * Maximum number of logical blocks in a file; ext4_extent's ee_block is
 * __le32.
 */
#define EXT_MAX_BLOCKS        0xffffffff

extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                               struct ext4_map_blocks *map, int flags);
extern int ext4_ext_truncate(handle_t *, struct inode *);
extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
                                 ext4_lblk_t end);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
                          loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                                          loff_t offset, ssize_t len);
extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
                                             ext4_io_end_t *io_end);
/* Same signature as ext4_ext_map_blocks() above. */
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                   int num,
                                                   struct ext4_ext_path *path);
/* The path is passed by reference (struct ext4_ext_path **). */
extern int ext4_ext_insert_extent(handle_t *, struct inode *,
                                  struct ext4_ext_path **,
                                  struct ext4_extent *, int);
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
                                              struct ext4_ext_path **,
                                              int flags);
extern void ext4_free_ext_path(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
extern int ext4_get_es_cache(struct inode *inode,
                             struct fiemap_extent_info *fieinfo,
                             __u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
/* An error code is additionally reported through *err. */
extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
                                struct inode *inode2, ext4_lblk_t lblk1,
                             ext4_lblk_t lblk2,  ext4_lblk_t count,
                             int mark_unwritten,int *err);
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
                                       int check_cred, int restart_cred,
                                       int revoke_cred);
/* The ext4_ext_replay_*() group. NOTE(review): presumably replay-only -- confirm. */
extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end);
extern int ext4_ext_replay_set_iblocks(struct inode *inode);
extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
                int len, int unwritten, ext4_fsblk_t pblk);
extern int ext4_ext_clear_bb(struct inode *inode);


/* move_extent.c */
/* Down/up pair operating on two inodes at once. */
extern void ext4_double_down_write_data_sem(struct inode *first,
                                            struct inode *second);
extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
                                          struct inode *donor_inode);
/* @moved_len is an out-parameter. */
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
                             __u64 len, __u64 *moved_len);

/* page-io.c */
/* Global init/exit pair; the init function is marked __init. */
extern int __init ext4_init_pageio(void);
extern void ext4_exit_pageio(void);
/* io_end allocation and get/put helpers. */
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
extern int ext4_put_io_end(ext4_io_end_t *io_end);
extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
extern void ext4_io_submit_init(struct ext4_io_submit *io,
                                struct writeback_control *wbc);
extern void ext4_end_io_rsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
/* Note: the folio parameter is still named "page" in this prototype. */
int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
                size_t len);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);

/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
extern void ext4_stop_mmpd(struct ext4_sb_info *sbi);

/* verity.c: the fsverity operations table declared for ext4 */
extern const struct fsverity_operations ext4_verityops;

/* orphan.c */
/* Add/remove pair; both take a journal handle. */
extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern void ext4_orphan_cleanup(struct super_block *sb,
                                struct ext4_super_block *es);
/* Per-superblock init/release of orphan info. */
extern void ext4_release_orphan_info(struct super_block *sb);
extern int ext4_init_orphan_info(struct super_block *sb);
extern int ext4_orphan_file_empty(struct super_block *sb);
/* Callback with the jbd2 buffer-trigger signature (triggers, bh, data, size). */
extern void ext4_orphan_file_block_trigger(
                                struct jbd2_buffer_trigger_type *triggers,
                                struct buffer_head *bh,
                                void *data, size_t size);

/*
 * Extra buffer_head state bit used to test whether block and inode bitmaps
 * are properly initialized.  With uninit_bg, reading the block from disk is
 * not enough to mark the bitmap up to date; the bitmap also needs to be
 * zeroed out.
 */
#define BH_BITMAP_UPTODATE BH_JBDPrivateStart

/*
 * Return non-zero only when the buffer is up to date and also has the
 * BH_BITMAP_UPTODATE bit set in b_state.
 */
static inline int bitmap_uptodate(struct buffer_head *bh)
{
        if (!buffer_uptodate(bh))
                return 0;

        return test_bit(BH_BITMAP_UPTODATE, &bh->b_state) != 0;
}
/* Set the BH_BITMAP_UPTODATE bit in the buffer's b_state. */
static inline void set_bitmap_uptodate(struct buffer_head *bh)
{
        set_bit(BH_BITMAP_UPTODATE, &bh->b_state);
}

/*
 * For ioend & aio unwritten conversion wait queues.
 *
 * ext4_ioend_wq(v) picks one of the EXT4_WQ_HASH_SZ shared wait queue heads
 * by taking v, cast to unsigned long, modulo the table size.  Different
 * values of v can therefore map to the same queue.
 */
#define EXT4_WQ_HASH_SZ                37
#define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
                                            EXT4_WQ_HASH_SZ])
extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];

extern int ext4_resize_begin(struct super_block *sb);
extern int ext4_resize_end(struct super_block *sb, bool update_backups);

/*
 * Flag @io_end as EXT4_IO_END_UNWRITTEN and bump @inode's i_unwritten
 * counter.  Does nothing if the flag is already set, so the counter is
 * incremented at most once per io_end.
 */
static inline void ext4_set_io_unwritten_flag(struct inode *inode,
                                              struct ext4_io_end *io_end)
{
        if (io_end->flag & EXT4_IO_END_UNWRITTEN)
                return;

        io_end->flag |= EXT4_IO_END_UNWRITTEN;
        atomic_inc(&EXT4_I(inode)->i_unwritten);
}

/*
 * Drop the EXT4_IO_END_UNWRITTEN flag from @io_end (if it is set) and
 * decrement the owning inode's i_unwritten counter.  When the counter
 * reaches zero, everyone sleeping on the inode's ioend wait queue is woken.
 */
static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
        struct inode *inode = io_end->inode;

        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN))
                return;

        io_end->flag &= ~EXT4_IO_END_UNWRITTEN;

        /* Counter hit zero: wake up waiters for unwritten extent conversion */
        if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
                wake_up_all(ext4_ioend_wq(inode));
}

extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;

/*
 * Return whether @bh is uptodate, first marking it uptodate if it carries
 * the write I/O error flag.
 */
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
        int write_failed = buffer_write_io_error(bh);

        /*
         * A write error flag means we failed to write out the data in this
         * block.  In that case there is no need to read the block again,
         * since a read might simply succeed in returning the old data.
         */
        if (write_failed)
                set_buffer_uptodate(bh);

        return buffer_uptodate(bh);
}

#endif        /* __KERNEL__ */

#define EFSBADCRC        EBADMSG                /* Bad CRC detected */
#define EFSCORRUPTED        EUCLEAN                /* Filesystem is corrupted */

#endif        /* _EXT4_H */








































































































































































































































































    1 


    1 















    1 










    1 












    1 






































    1 
















    1 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
/*
 * memfd_create system call and file sealing support
 *
 * Code was originally included in shmem.c, and broken out to facilitate
 * use by hugetlbfs as well as tmpfs.
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/khugepaged.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>

/*
 * We need a tag: a new tag would expand every xa_node by 8 bytes,
 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
 * or hugetlbfs because they are memory only filesystems.
 */
#define MEMFD_TAG_PINNED        PAGECACHE_TAG_TOWRITE
#define LAST_SCAN               4       /* about 150ms max */

static bool memfd_folio_has_extra_refs(struct folio *folio)
{
        return folio_ref_count(folio) - folio_mapcount(folio) !=
               folio_nr_pages(folio);
}

/*
 * Walk every entry reachable through @xas and set MEMFD_TAG_PINNED on each
 * non-value folio for which memfd_folio_has_extra_refs() is true.
 */
static void memfd_tag_pins(struct xa_state *xas)
{
        struct folio *folio;
        int batch = 0;

        lru_add_drain();

        xas_lock_irq(xas);
        xas_for_each(xas, folio, ULONG_MAX) {
                if (!xa_is_value(folio)) {
                        if (memfd_folio_has_extra_refs(folio))
                                xas_set_mark(xas, MEMFD_TAG_PINNED);
                }

                batch++;
                if (batch >= XA_CHECK_SCHED) {
                        /* Drop the lock every XA_CHECK_SCHED entries. */
                        batch = 0;

                        xas_pause(xas);
                        xas_unlock_irq(xas);
                        cond_resched();
                        xas_lock_irq(xas);
                }
        }
        xas_unlock_irq(xas);
}

/*
 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
 * via get_user_pages(), drivers might have some pending I/O without any active
 * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios
 * and see whether it has an elevated ref-count. If so, we tag them and wait for
 * them to be dropped.
 * The caller must guarantee that no new user will acquire writable references
 * to those folios to avoid races.
 *
 * Returns 0 when no tagged folio still has extra references, or -EBUSY when
 * at least one still does on the final scan.
 */
static int memfd_wait_for_pins(struct address_space *mapping)
{
        XA_STATE(xas, &mapping->i_pages, 0);
        struct folio *folio;
        int error, scan;

        /* First pass: tag every folio that currently has extra references. */
        memfd_tag_pins(&xas);

        error = 0;
        /* Up to LAST_SCAN + 1 passes over the tagged folios. */
        for (scan = 0; scan <= LAST_SCAN; scan++) {
                int latency = 0;

                /* Stop as soon as no MEMFD_TAG_PINNED mark is left. */
                if (!xas_marked(&xas, MEMFD_TAG_PINNED))
                        break;

                /*
                 * Scan 0 only drains the LRU caches; later scans sleep for
                 * (HZ << scan) / 200 jiffies first, i.e. 10, 20, 40 and 80ms
                 * for scans 1..4 (150ms in total).  A non-zero return from
                 * schedule_timeout_killable() makes this the final scan.
                 */
                if (!scan)
                        lru_add_drain_all();
                else if (schedule_timeout_killable((HZ << scan) / 200))
                        scan = LAST_SCAN;

                /* Restart the walk from index 0 for this scan. */
                xas_set(&xas, 0);
                xas_lock_irq(&xas);
                xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
                        bool clear = true;

                        if (!xa_is_value(folio) &&
                            memfd_folio_has_extra_refs(folio)) {
                                /*
                                 * On the last scan, we clean up all those tags
                                 * we inserted; but make a note that we still
                                 * found folios pinned.
                                 */
                                if (scan == LAST_SCAN)
                                        error = -EBUSY;
                                else
                                        clear = false;
                        }
                        if (clear)
                                xas_clear_mark(&xas, MEMFD_TAG_PINNED);

                        /* Every XA_CHECK_SCHED entries, drop the lock and reschedule. */
                        if (++latency < XA_CHECK_SCHED)
                                continue;
                        latency = 0;

                        xas_pause(&xas);
                        xas_unlock_irq(&xas);
                        cond_resched();
                        xas_lock_irq(&xas);
                }
                xas_unlock_irq(&xas);
        }

        return error;
}

/*
 * Return a pointer to the seals word stored in @file's inode, or NULL when
 * the file is neither a shmem file nor (with CONFIG_HUGETLBFS) a hugetlbfs
 * file.
 */
static unsigned int *memfd_file_seals_ptr(struct file *file)
{
        struct inode *inode = file_inode(file);

        if (shmem_file(file))
                return &SHMEM_I(inode)->seals;

#ifdef CONFIG_HUGETLBFS
        if (is_file_hugepages(file))
                return &HUGETLBFS_I(inode)->seals;
#endif

        return NULL;
}

#define F_ALL_SEALS (F_SEAL_SEAL | \
                     F_SEAL_EXEC | \
                     F_SEAL_SHRINK | \
                     F_SEAL_GROW | \
                     F_SEAL_WRITE | \
                     F_SEAL_FUTURE_WRITE)

/*
 * Add @seals to the seals already set on @file's inode.
 *
 * Returns 0 on success, -EPERM if @file was not opened for writing or
 * F_SEAL_SEAL is already set, -EINVAL if @seals contains bits outside
 * F_ALL_SEALS or the file type carries no seals, or the error returned by
 * mapping_deny_writable() / memfd_wait_for_pins() when adding F_SEAL_WRITE.
 * The seals word is read and updated under the inode lock.
 */
static int memfd_add_seals(struct file *file, unsigned int seals)
{
        struct inode *inode = file_inode(file);
        unsigned int *file_seals;
        int error;

        /*
         * SEALING
         * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
         * but restrict access to a specific subset of file operations. Seals
         * can only be added, but never removed. This way, mutually untrusted
         * parties can share common memory regions with a well-defined policy.
         * A malicious peer can thus never perform unwanted operations on a
         * shared object.
         *
         * Seals are only supported on special tmpfs or hugetlbfs files and
         * always affect the whole underlying inode. Once a seal is set, it
         * may prevent some kinds of access to the file. Currently, the
         * following seals are defined:
         *   SEAL_SEAL: Prevent further seals from being set on this file
         *   SEAL_SHRINK: Prevent the file from shrinking
         *   SEAL_GROW: Prevent the file from growing
         *   SEAL_WRITE: Prevent write access to the file
         *   SEAL_FUTURE_WRITE: Prevent future write access to the file while
         *                      leaving existing writable mappings in place
         *                      (see fcntl(2))
         *   SEAL_EXEC: Prevent modification of the exec bits in the file mode
         *
         * As we don't require any trust relationship between two parties, we
         * must prevent seals from being removed. Therefore, sealing a file
         * only adds a given set of seals to the file, it never touches
         * existing seals. Furthermore, the "setting seals"-operation can be
         * sealed itself, which basically prevents any further seal from being
         * added.
         *
         * Semantics of sealing are only defined on volatile files. Only
         * anonymous tmpfs and hugetlbfs files support sealing. More
         * importantly, seals are never written to disk. Therefore, there's
         * no plan to support it on other file types.
         */

        /* Only a writable file description may add seals. */
        if (!(file->f_mode & FMODE_WRITE))
                return -EPERM;
        /* Reject any bit that is not a known seal. */
        if (seals & ~(unsigned int)F_ALL_SEALS)
                return -EINVAL;

        inode_lock(inode);

        file_seals = memfd_file_seals_ptr(file);
        if (!file_seals) {
                error = -EINVAL;
                goto unlock;
        }

        /* F_SEAL_SEAL forbids adding any further seal. */
        if (*file_seals & F_SEAL_SEAL) {
                error = -EPERM;
                goto unlock;
        }

        /*
         * Newly adding F_SEAL_WRITE: deny writable mappings, then wait for
         * pinned folios to be released.  If that wait fails, undo the deny
         * so the mapping is left as it was.
         */
        if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
                error = mapping_deny_writable(file->f_mapping);
                if (error)
                        goto unlock;

                error = memfd_wait_for_pins(file->f_mapping);
                if (error) {
                        mapping_allow_writable(file->f_mapping);
                        goto unlock;
                }
        }

        /*
         * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
         *
         * This only applies when the inode has at least one exec bit set
         * (mode & 0111).
         *
         * NOTE(review): the implied F_SEAL_WRITE is added after the
         * F_SEAL_WRITE handling above, so mapping_deny_writable() and
         * memfd_wait_for_pins() are not run for it unless the caller also
         * passed F_SEAL_WRITE explicitly - confirm this is intended.
         */
        if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
                seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;

        *file_seals |= seals;
        error = 0;

unlock:
        inode_unlock(inode);
        return error;
}

/*
 * Return the current seals of @file, or -EINVAL when the file has no seals
 * word (see memfd_file_seals_ptr()).
 */
static int memfd_get_seals(struct file *file)
{
        unsigned int *seals = memfd_file_seals_ptr(file);

        if (!seals)
                return -EINVAL;

        return *seals;
}

/*
 * Dispatch the sealing fcntl commands: F_ADD_SEALS adds @arg to the file's
 * seals, F_GET_SEALS returns the current seals.  Any other @cmd yields
 * -EINVAL.
 */
long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
        switch (cmd) {
        case F_ADD_SEALS:
                return memfd_add_seals(file, arg);
        case F_GET_SEALS:
                return memfd_get_seals(file);
        default:
                return -EINVAL;
        }
}

#define MFD_NAME_PREFIX "memfd:"
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)

/*
 * Apply the memfd_noexec scope of the caller's active pid namespace to
 * @flags (only when CONFIG_SYSCTL is enabled).
 *
 * If the caller passed neither MFD_EXEC nor MFD_NOEXEC_SEAL, one of the two
 * is filled in according to the scope.  If the scope is at least
 * MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED and MFD_NOEXEC_SEAL is still not set,
 * the request is refused.
 *
 * Returns 0 on success, -EACCES when the request is refused.
 */
static int check_sysctl_memfd_noexec(unsigned int *flags)
{
#ifdef CONFIG_SYSCTL
        int sysctl = pidns_memfd_noexec_scope(task_active_pid_ns(current));

        /* No explicit choice by the caller: pick the default for this scope. */
        if ((*flags & (MFD_EXEC | MFD_NOEXEC_SEAL)) == 0) {
                if (sysctl < MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)
                        *flags |= MFD_EXEC;
                else
                        *flags |= MFD_NOEXEC_SEAL;
        }

        if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED &&
            (*flags & MFD_NOEXEC_SEAL) == 0) {
                pr_err_ratelimited(
                        "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
                        current->comm, task_pid_nr(current), sysctl);
                return -EACCES;
        }
#endif
        return 0;
}

/*
 * memfd_create(uname, flags): create an anonymous file backed by shmem, or
 * by hugetlbfs when MFD_HUGETLB is set, and return a new file descriptor
 * for it.  The file is named MFD_NAME_PREFIX followed by the user-supplied
 * @uname.
 *
 * Returns the new fd on success or a negative errno: -EINVAL for bad flags
 * or an over-long name, -EFAULT when the name cannot be read, -ENOMEM,
 * -EACCES from check_sysctl_memfd_noexec(), or the error reported by fd
 * allocation / file setup.
 */
SYSCALL_DEFINE2(memfd_create,
                const char __user *, uname,
                unsigned int, flags)
{
        unsigned int *file_seals;
        struct file *file;
        int fd, error;
        char *name;
        long len;

        if (!(flags & MFD_HUGETLB)) {
                if (flags & ~(unsigned int)MFD_ALL_FLAGS)
                        return -EINVAL;
        } else {
                /* Allow huge page size encoding in flags. */
                if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
                                (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
                        return -EINVAL;
        }

        /* Invalid if both EXEC and NOEXEC_SEAL are set.*/
        if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
                return -EINVAL;

        /* May add MFD_EXEC or MFD_NOEXEC_SEAL to flags, or refuse the call. */
        error = check_sysctl_memfd_noexec(&flags);
        if (error < 0)
                return error;

        /* length includes terminating zero */
        len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
        if (len <= 0)
                return -EFAULT;
        if (len > MFD_NAME_MAX_LEN + 1)
                return -EINVAL;

        /* Room for the prefix plus the user name with its terminating zero. */
        name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
        if (!name)
                return -ENOMEM;

        strcpy(name, MFD_NAME_PREFIX);
        if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
                error = -EFAULT;
                goto err_name;
        }

        /* terminating-zero may have changed after strnlen_user() returned */
        if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
                error = -EFAULT;
                goto err_name;
        }

        /* Reserve the fd now; it is installed only after the file is set up. */
        fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
        if (fd < 0) {
                error = fd;
                goto err_name;
        }

        if (flags & MFD_HUGETLB) {
                file = hugetlb_file_setup(name, 0, VM_NORESERVE,
                                        HUGETLB_ANONHUGE_INODE,
                                        (flags >> MFD_HUGE_SHIFT) &
                                        MFD_HUGE_MASK);
        } else
                file = shmem_file_setup(name, 0, VM_NORESERVE);
        if (IS_ERR(file)) {
                error = PTR_ERR(file);
                goto err_fd;
        }
        file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
        file->f_flags |= O_LARGEFILE;

        /*
         * Set up the initial seals.  MFD_NOEXEC_SEAL clears the exec bits of
         * the inode mode, clears F_SEAL_SEAL and sets F_SEAL_EXEC, whether
         * or not MFD_ALLOW_SEALING was passed.  Otherwise MFD_ALLOW_SEALING
         * just clears F_SEAL_SEAL.
         * NOTE(review): presumably the setup helpers above create the file
         * with F_SEAL_SEAL already set - confirm against shmem/hugetlbfs.
         */
        if (flags & MFD_NOEXEC_SEAL) {
                struct inode *inode = file_inode(file);

                inode->i_mode &= ~0111;
                file_seals = memfd_file_seals_ptr(file);
                if (file_seals) {
                        *file_seals &= ~F_SEAL_SEAL;
                        *file_seals |= F_SEAL_EXEC;
                }
        } else if (flags & MFD_ALLOW_SEALING) {
                /* MFD_EXEC and MFD_ALLOW_SEALING are set */
                file_seals = memfd_file_seals_ptr(file);
                if (file_seals)
                        *file_seals &= ~F_SEAL_SEAL;
        }

        fd_install(fd, file);
        kfree(name);
        return fd;

        /* Error unwinding: release the reserved fd, then the name buffer. */
err_fd:
        put_unused_fd(fd);
err_name:
        kfree(name);
        return error;
}
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    5 
























    2 














    2 


    2 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) Qu Wenruo 2017.  All rights reserved.
 */

/*
 * The module is used to catch unexpected/corrupted tree block data.
 * Such behavior can be caused either by a fuzzed image or bugs.
 *
 * The objective is to do leaf/node validation checks when a tree block is
 * read from disk, and to check *every* possible member, so other code won't
 * need to check them again.
 *
 * Due to the potential for unwanted damage, every checker needs to be
 * carefully reviewed so that it does not prevent mounting of valid images.
 */

#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "tree-checker.h"
#include "compression.h"
#include "volumes.h"
#include "misc.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
#include "inode-item.h"
#include "dir-item.h"
#include "extent-tree.h"

/*
 * Error message should follow the following format:
 * corrupt <type>: <identifier>, <reason>[, <bad_value>]
 *
 * @type:        leaf or node
 * @identifier:        the necessary info to locate the leaf/node.
 *                 It's recommended to decode key.objectid/offset if it's
 *                 meaningful.
 * @reason:        describe the error
 * @bad_value:        optional, it's recommended to output bad value and its
 *                expected value (range).
 *
 * Since comma is used to separate the components, only space is allowed
 * inside each component.
 */

/*
 * Report a corrupted tree block.  The caller's @fmt message is prefixed with
 * the generic "corrupt leaf/node: root=%llu block=%llu slot=%d, " header, so
 * callers only have to supply the specific reason.  The first page of the
 * extent buffer is dumped before the message is printed.
 */
__printf(3, 4)
__cold
static void generic_err(const struct extent_buffer *eb, int slot,
                        const char *fmt, ...)
{
        const struct btrfs_fs_info *fs_info = eb->fs_info;
        const char *kind = btrfs_header_level(eb) == 0 ? "leaf" : "node";
        va_list ap;
        struct va_format msg = { .fmt = fmt, .va = &ap };

        va_start(ap, fmt);

        dump_page(folio_page(eb->folios[0], 0), "eb page dump");
        btrfs_crit(fs_info,
                "corrupt %s: root=%llu block=%llu slot=%d, %pV",
                kind, btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
                &msg);

        va_end(ap);
}

/*
 * Reporter specialised for extent data items: in addition to the generic
 * root/block/slot header it prints the item's key objectid as "ino" and its
 * key offset as "file_offset".
 */
__printf(3, 4)
__cold
static void file_extent_err(const struct extent_buffer *eb, int slot,
                            const char *fmt, ...)
{
        const struct btrfs_fs_info *fs_info = eb->fs_info;
        const char *kind;
        struct btrfs_key item_key;
        va_list ap;
        struct va_format msg = { .fmt = fmt, .va = &ap };

        btrfs_item_key_to_cpu(eb, &item_key, slot);
        va_start(ap, fmt);

        dump_page(folio_page(eb->folios[0], 0), "eb page dump");
        kind = btrfs_header_level(eb) == 0 ? "leaf" : "node";
        btrfs_crit(fs_info,
        "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV",
                kind, btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
                item_key.objectid, item_key.offset, &msg);

        va_end(ap);
}

/*
 * Check that btrfs_file_extent_##name of @fi is aligned to @alignment.
 *
 * Evaluates to 0 if the value is aligned.  Otherwise an error is reported
 * through file_extent_err() for @leaf/@slot and the macro evaluates to 1.
 *
 * Note that the arguments are expanded more than once, and the accessor is
 * called again for the error message and for the final result.
 */
#define CHECK_FE_ALIGNED(leaf, slot, fi, name, alignment)                      \
({                                                                              \
        if (unlikely(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)),      \
                                 (alignment))))                                      \
                file_extent_err((leaf), (slot),                                      \
        "invalid %s for file extent, have %llu, should be aligned to %u",     \
                        (#name), btrfs_file_extent_##name((leaf), (fi)),      \
                        (alignment));                                              \
        (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment)));   \
})

/*
 * Return the file offset at which the extent described by @extent ends.
 *
 * For inline extents this is key->offset + ram_bytes, aligned to the sector
 * size.  For every other extent type it is key->offset + num_bytes.
 */
static u64 file_extent_end(struct extent_buffer *leaf,
                           struct btrfs_key *key,
                           struct btrfs_file_extent_item *extent)
{
        if (btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE)
                return key->offset + btrfs_file_extent_num_bytes(leaf, extent);

        return ALIGN(key->offset + btrfs_file_extent_ram_bytes(leaf, extent),
                     leaf->fs_info->sectorsize);
}

/*
 * Error reporter for dir items.
 *
 * Compared with the generic location info, the only addition is the item
 * key's objectid, printed as the inode number.  The first page of the extent
 * buffer is dumped before the message is emitted.
 */
__printf(3, 4)
__cold
static void dir_item_err(const struct extent_buffer *eb, int slot,
                         const char *fmt, ...)
{
        const struct btrfs_fs_info *fs_info = eb->fs_info;
        struct btrfs_key item_key;
        struct va_format vaf;
        va_list ap;

        /* Wrap the caller's format and arguments so %pV can expand them */
        va_start(ap, fmt);
        vaf.fmt = fmt;
        vaf.va = &ap;

        btrfs_item_key_to_cpu(eb, &item_key, slot);

        dump_page(folio_page(eb->folios[0], 0), "eb page dump");
        btrfs_crit(fs_info,
                "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
                btrfs_header_level(eb) == 0 ? "leaf" : "node",
                btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
                item_key.objectid, &vaf);

        va_end(ap);
}

/*
 * Verify that @key and @prev_key carry the same objectid (inode number).
 *
 * A mismatch means the INODE_ITEM that should precede this item is missing
 * from the subvolume tree.
 *
 * Return true if the keys agree or the check does not apply (first slot of
 * the leaf, or the leaf is not owned by an fs tree).
 * Return false, after reporting the error, on a mismatch.
 */
static bool check_prev_ino(struct extent_buffer *leaf,
                           struct btrfs_key *key, int slot,
                           struct btrfs_key *prev_key)
{
        /* The first item of a leaf has no previous key to compare with */
        if (slot == 0)
                return true;

        /* Callers must only pass these key types */
        ASSERT(key->type == BTRFS_XATTR_ITEM_KEY ||
               key->type == BTRFS_INODE_REF_KEY ||
               key->type == BTRFS_DIR_INDEX_KEY ||
               key->type == BTRFS_DIR_ITEM_KEY ||
               key->type == BTRFS_EXTENT_DATA_KEY);

        /*
         * The ino requirement only holds for subvolume trees and their reloc
         * trees; other trees, such as the log tree, do not follow it.
         */
        if (is_fstree(btrfs_header_owner(leaf)) &&
            key->objectid != prev_key->objectid) {
                dir_item_err(leaf, slot,
                        "invalid previous key objectid, have %llu expect %llu",
                        prev_key->objectid, key->objectid);
                return false;
        }

        return true;
}
static int check_extent_data_item(struct extent_buffer *leaf,
                                  struct btrfs_key *key, int slot,
                                  struct btrfs_key *prev_key)
{
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        struct btrfs_file_extent_item *fi;
        u32 sectorsize = fs_info->sectorsize;
        u32 item_size = btrfs_item_size(leaf, slot);
        u64 extent_end;

        if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
                file_extent_err(leaf, slot,
"unaligned file_offset for file extent, have %llu should be aligned to %u",
                        key->offset, sectorsize);
                return -EUCLEAN;
        }

        /*
         * Previous key must have the same key->objectid (ino).
         * It can be XATTR_ITEM, INODE_ITEM or just another EXTENT_DATA.
         * But if objectids mismatch, it means we have a missing
         * INODE_ITEM.
         */
        if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
                return -EUCLEAN;

        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);

        /*
         * Make sure the item contains at least inline header, so the file
         * extent type is not some garbage.
         */
        if (unlikely(item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START)) {
                file_extent_err(leaf, slot,
                                "invalid item size, have %u expect [%zu, %u)",
                                item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START,
                                SZ_4K);
                return -EUCLEAN;
        }
        if (unlikely(btrfs_file_extent_type(leaf, fi) >=
                     BTRFS_NR_FILE_EXTENT_TYPES)) {
                file_extent_err(leaf, slot,
                "invalid type for file extent, have %u expect range [0, %u]",
                        btrfs_file_extent_type(leaf, fi),
                        BTRFS_NR_FILE_EXTENT_TYPES - 1);
                return -EUCLEAN;
        }

        /*
         * Support for new compression/encryption must introduce incompat flag,
         * and must be caught in open_ctree().
         */
        if (unlikely(btrfs_file_extent_compression(leaf, fi) >=
                     BTRFS_NR_COMPRESS_TYPES)) {
                file_extent_err(leaf, slot,
        "invalid compression for file extent, have %u expect range [0, %u]",
                        btrfs_file_extent_compression(leaf, fi),
                        BTRFS_NR_COMPRESS_TYPES - 1);
                return -EUCLEAN;
        }
        if (unlikely(btrfs_file_extent_encryption(leaf, fi))) {
                file_extent_err(leaf, slot,
                        "invalid encryption for file extent, have %u expect 0",
                        btrfs_file_extent_encryption(leaf, fi));
                return -EUCLEAN;
        }
        if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
                /* Inline extent must have 0 as key offset */
                if (unlikely(key->offset)) {
                        file_extent_err(leaf, slot,
                "invalid file_offset for inline file extent, have %llu expect 0",
                                key->offset);
                        return -EUCLEAN;
                }

                /* Compressed inline extent has no on-disk size, skip it */
                if (btrfs_file_extent_compression(leaf, fi) !=
                    BTRFS_COMPRESS_NONE)
                        return 0;

                /* Uncompressed inline extent size must match item size */
                if (unlikely(item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
                                          btrfs_file_extent_ram_bytes(leaf, fi))) {
                        file_extent_err(leaf, slot,
        "invalid ram_bytes for uncompressed inline extent, have %u expect %llu",
                                item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START +
                                btrfs_file_extent_ram_bytes(leaf, fi));
                        return -EUCLEAN;
                }
                return 0;
        }

        /* Regular or preallocated extent has fixed item size */
        if (unlikely(item_size != sizeof(*fi))) {
                file_extent_err(leaf, slot,
        "invalid item size for reg/prealloc file extent, have %u expect %zu",
                        item_size, sizeof(*fi));
                return -EUCLEAN;
        }
        if (unlikely(CHECK_FE_ALIGNED(leaf, slot, fi, ram_bytes, sectorsize) ||
                     CHECK_FE_ALIGNED(leaf, slot, fi, disk_bytenr, sectorsize) ||
                     CHECK_FE_ALIGNED(leaf, slot, fi, disk_num_bytes, sectorsize) ||
                     CHECK_FE_ALIGNED(leaf, slot, fi, offset, sectorsize) ||
                     CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)))
                return -EUCLEAN;

        /* Catch extent end overflow */
        if (unlikely(check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi),
                                        key->offset, &extent_end))) {
                file_extent_err(leaf, slot,
        "extent end overflow, have file offset %llu extent num bytes %llu",
                                key->offset,
                                btrfs_file_extent_num_bytes(leaf, fi));
                return -EUCLEAN;
        }

        /*
         * Check that no two consecutive file extent items, in the same leaf,
         * present ranges that overlap each other.
         */
        if (slot > 0 &&
            prev_key->objectid == key->objectid &&
            prev_key->type == BTRFS_EXTENT_DATA_KEY) {
                struct btrfs_file_extent_item *prev_fi;
                u64 prev_end;

                prev_fi = btrfs_item_ptr(leaf, slot - 1,
                                         struct btrfs_file_extent_item);
                prev_end = file_extent_end(leaf, prev_key, prev_fi);
                if (unlikely(prev_end > key->offset)) {
                        file_extent_err(leaf, slot - 1,
"file extent end range (%llu) goes beyond start offset (%llu) of the next file extent",
                                        prev_end, key->offset);
                        return -EUCLEAN;
                }
        }

        return 0;
}

/*
 * Validate one EXTENT_CSUM item.
 *
 * The key objectid must be BTRFS_EXTENT_CSUM_OBJECTID, the key offset must be
 * sector aligned and the item size a multiple of the checksum size.  If the
 * previous item in the leaf is a csum item too, the range it covers (one
 * sector per checksum) must not reach past this item's key offset.
 *
 * Return 0 if the item looks sane, -EUCLEAN (after reporting) otherwise.
 */
static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
                           int slot, struct btrfs_key *prev_key)
{
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        const u32 sectorsize = fs_info->sectorsize;
        const u32 csumsize = fs_info->csum_size;
        const u32 item_size = btrfs_item_size(leaf, slot);
        u32 prev_item_size;
        u64 prev_csum_end;

        if (unlikely(key->objectid != BTRFS_EXTENT_CSUM_OBJECTID)) {
                generic_err(leaf, slot,
                "invalid key objectid for csum item, have %llu expect %llu",
                        key->objectid, BTRFS_EXTENT_CSUM_OBJECTID);
                return -EUCLEAN;
        }
        if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) {
                generic_err(leaf, slot,
        "unaligned key offset for csum item, have %llu should be aligned to %u",
                        key->offset, sectorsize);
                return -EUCLEAN;
        }
        if (unlikely(!IS_ALIGNED(item_size, csumsize))) {
                generic_err(leaf, slot,
        "unaligned item size for csum item, have %u should be aligned to %u",
                        item_size, csumsize);
                return -EUCLEAN;
        }

        /* Nothing to compare against unless the previous item is a csum */
        if (slot <= 0 || prev_key->type != BTRFS_EXTENT_CSUM_KEY)
                return 0;

        prev_item_size = btrfs_item_size(leaf, slot - 1);
        prev_csum_end = prev_key->offset +
                        (prev_item_size / csumsize) * sectorsize;
        if (unlikely(prev_csum_end > key->offset)) {
                generic_err(leaf, slot - 1,
"csum end range (%llu) goes beyond the start range (%llu) of the next csum item",
                            prev_csum_end, key->offset);
                return -EUCLEAN;
        }

        return 0;
}

/* Inode item errors are printed in the dir_item_err() format, so forward */
#define inode_item_err(eb, slot, fmt, args...)                    \
        dir_item_err(eb, slot, fmt, args)

/*
 * Validate a key that refers to an inode.
 *
 * @key is either the key of an INODE_ITEM itself or a location key embedded
 * in another item; which one is decided by the type of the item at @slot.
 *
 * - If the item at @slot is an XATTR_ITEM, @key must be all zeros.
 * - Otherwise the objectid must lie in
 *   [BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID] or be one of
 *   BTRFS_ROOT_TREE_DIR_OBJECTID / BTRFS_FREE_INO_OBJECTID, and the offset
 *   must be 0.
 *
 * Return 0 if the key looks sane, -EUCLEAN (after reporting) otherwise.
 */
static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key,
                           int slot)
{
        struct btrfs_key item_key;
        bool is_inode_item;

        btrfs_item_key_to_cpu(leaf, &item_key, slot);
        is_inode_item = (item_key.type == BTRFS_INODE_ITEM_KEY);

        /* For XATTR_ITEM, location key should be all 0 */
        if (item_key.type == BTRFS_XATTR_ITEM_KEY) {
                if (unlikely(key->objectid != 0 || key->type != 0 ||
                             key->offset != 0)) {
                        /*
                         * Report the offending key like every other failure
                         * path does, instead of failing silently.
                         */
                        dir_item_err(leaf, slot,
        "invalid location key for XATTR_ITEM, have (%llu %u %llu) expect (0 0 0)",
                                key->objectid, key->type, key->offset);
                        return -EUCLEAN;
                }
                return 0;
        }

        if (unlikely((key->objectid < BTRFS_FIRST_FREE_OBJECTID ||
                      key->objectid > BTRFS_LAST_FREE_OBJECTID) &&
                     key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID &&
                     key->objectid != BTRFS_FREE_INO_OBJECTID)) {
                if (is_inode_item) {
                        generic_err(leaf, slot,
        "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
                                key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
                                BTRFS_FIRST_FREE_OBJECTID,
                                BTRFS_LAST_FREE_OBJECTID,
                                BTRFS_FREE_INO_OBJECTID);
                } else {
                        dir_item_err(leaf, slot,
"invalid location key objectid: has %llu expect %llu or [%llu, %llu] or %llu",
                                key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID,
                                BTRFS_FIRST_FREE_OBJECTID,
                                BTRFS_LAST_FREE_OBJECTID,
                                BTRFS_FREE_INO_OBJECTID);
                }
                return -EUCLEAN;
        }
        if (unlikely(key->offset != 0)) {
                if (is_inode_item)
                        inode_item_err(leaf, slot,
                                       "invalid key offset: has %llu expect 0",
                                       key->offset);
                else
                        dir_item_err(leaf, slot,
                                "invalid location key offset:has %llu expect 0",
                                key->offset);
                return -EUCLEAN;
        }
        return 0;
}

static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
                          int slot)
{
        struct btrfs_key item_key;
        bool is_root_item;

        btrfs_item_key_to_cpu(leaf, &item_key, slot);
        is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY);

        /*
         * Bad rootid for reloc trees.
         *
         * Reloc trees are only for subvolume trees, other trees only need
         * to be COWed to be relocated.
         */
        if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
                     !is_fstree(key->offset))) {
                generic_err(leaf, slot,
                "invalid reloc tree for root %lld, root id is not a subvolume tree",
                            key->offset);
                return -EUCLEAN;
        }

        /* No such tree id */
        if (unlikely(key->objectid == 0)) {
                if (is_root_item)
                        generic_err(leaf, slot, "invalid root id 0");
                else
                        dir_item_err(leaf, slot,
                                     "invalid location key root id 0");
                return -EUCLEAN;
        }

        /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */
        if (unlikely(!is_fstree(key->objectid) && !is_root_item)) {
                dir_item_err(leaf, slot,
                "invalid location key objectid, have %llu expect [%llu, %llu]",
                                key->objectid, BTRFS_FIRST_FREE_OBJECTID,
                                BTRFS_LAST_FREE_OBJECTID);
                return -EUCLEAN;
        }

        /*
         * ROOT_ITEM with non-zero offset means this is a snapshot, created at
         * @offset transid.
         * Furthermore, for location key in DIR_ITEM, its offset is always -1.
         *
         * So here we only check offset for reloc tree whose key->offset must
         * be a valid tree.
         */
        if (unlikely(key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
                     key->offset == 0)) {
                generic_err(leaf, slot, "invalid root id 0 for reloc tree");
                return -EUCLEAN;
        }
        return 0;
}

static int check_dir_item(struct extent_buffer *leaf,
                          struct btrfs_key *key, struct btrfs_key *prev_key,
                          int slot)
{
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        struct btrfs_dir_item *di;
        u32 item_size = btrfs_item_size(leaf, slot);
        u32 cur = 0;

        if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
                return -EUCLEAN;

        di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
        while (cur < item_size) {
                struct btrfs_key location_key;
                u32 name_len;
                u32 data_len;
                u32 max_name_len;
                u32 total_size;
                u32 name_hash;
                u8 dir_type;
                int ret;

                /* header itself should not cross item boundary */
                if (unlikely(cur + sizeof(*di) > item_size)) {
                        dir_item_err(leaf, slot,
                "dir item header crosses item boundary, have %zu boundary %u",
                                cur + sizeof(*di), item_size);
                        return -EUCLEAN;
                }

                /* Location key check */
                btrfs_dir_item_key_to_cpu(leaf, di, &location_key);
                if (location_key.type == BTRFS_ROOT_ITEM_KEY) {
                        ret = check_root_key(leaf, &location_key, slot);
                        if (unlikely(ret < 0))
                                return ret;
                } else if (location_key.type == BTRFS_INODE_ITEM_KEY ||
                           location_key.type == 0) {
                        ret = check_inode_key(leaf, &location_key, slot);
                        if (unlikely(ret < 0))
                                return ret;
                } else {
                        dir_item_err(leaf, slot,
                        "invalid location key type, have %u, expect %u or %u",
                                     location_key.type, BTRFS_ROOT_ITEM_KEY,
                                     BTRFS_INODE_ITEM_KEY);
                        return -EUCLEAN;
                }

                /* dir type check */
                dir_type = btrfs_dir_ftype(leaf, di);
                if (unlikely(dir_type >= BTRFS_FT_MAX)) {
                        dir_item_err(leaf, slot,
                        "invalid dir item type, have %u expect [0, %u)",
                                dir_type, BTRFS_FT_MAX);
                        return -EUCLEAN;
                }

                if (unlikely(key->type == BTRFS_XATTR_ITEM_KEY &&
                             dir_type != BTRFS_FT_XATTR)) {
                        dir_item_err(leaf, slot,
                "invalid dir item type for XATTR key, have %u expect %u",
                                dir_type, BTRFS_FT_XATTR);
                        return -EUCLEAN;
                }
                if (unlikely(dir_type == BTRFS_FT_XATTR &&
                             key->type != BTRFS_XATTR_ITEM_KEY)) {
                        dir_item_err(leaf, slot,
                        "xattr dir type found for non-XATTR key");
                        return -EUCLEAN;
                }
                if (dir_type == BTRFS_FT_XATTR)
                        max_name_len = XATTR_NAME_MAX;
                else
                        max_name_len = BTRFS_NAME_LEN;

                /* Name/data length check */
                name_len = btrfs_dir_name_len(leaf, di);
                data_len = btrfs_dir_data_len(leaf, di);
                if (unlikely(name_len > max_name_len)) {
                        dir_item_err(leaf, slot,
                        "dir item name len too long, have %u max %u",
                                name_len, max_name_len);
                        return -EUCLEAN;
                }
                if (unlikely(name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info))) {
                        dir_item_err(leaf, slot,
                        "dir item name and data len too long, have %u max %u",
                                name_len + data_len,
                                BTRFS_MAX_XATTR_SIZE(fs_info));
                        return -EUCLEAN;
                }

                if (unlikely(data_len && dir_type != BTRFS_FT_XATTR)) {
                        dir_item_err(leaf, slot,
                        "dir item with invalid data len, have %u expect 0",
                                data_len);
                        return -EUCLEAN;
                }

                total_size = sizeof(*di) + name_len + data_len;

                /* header and name/data should not cross item boundary */
                if (unlikely(cur + total_size > item_size)) {
                        dir_item_err(leaf, slot,
                "dir item data crosses item boundary, have %u boundary %u",
                                cur + total_size, item_size);
                        return -EUCLEAN;
                }

                /*
                 * Special check for XATTR/DIR_ITEM, as key->offset is name
                 * hash, should match its name
                 */
                if (key->type == BTRFS_DIR_ITEM_KEY ||
                    key->type == BTRFS_XATTR_ITEM_KEY) {
                        char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)];

                        read_extent_buffer(leaf, namebuf,
                                        (unsigned long)(di + 1), name_len);
                        name_hash = btrfs_name_hash(namebuf, name_len);
                        if (unlikely(key->offset != name_hash)) {
                                dir_item_err(leaf, slot,
                "name hash mismatch with key, have 0x%016x expect 0x%016llx",
                                        name_hash, key->offset);
                                return -EUCLEAN;
                        }
                }
                cur += total_size;
                di = (struct btrfs_dir_item *)((void *)di + total_size);
        }
        return 0;
}

/*
 * Error reporter for block group items.
 *
 * On top of the usual root/block/slot location, the item key's objectid is
 * printed as the block group start and its offset as the block group length.
 * The first page of the extent buffer is dumped before the message is
 * emitted.
 */
__printf(3, 4)
__cold
static void block_group_err(const struct extent_buffer *eb, int slot,
                            const char *fmt, ...)
{
        const struct btrfs_fs_info *fs_info = eb->fs_info;
        struct btrfs_key item_key;
        struct va_format vaf;
        va_list ap;

        /* Wrap the caller's format and arguments so %pV can expand them */
        va_start(ap, fmt);
        vaf.fmt = fmt;
        vaf.va = &ap;

        btrfs_item_key_to_cpu(eb, &item_key, slot);

        dump_page(folio_page(eb->folios[0], 0), "eb page dump");
        btrfs_crit(fs_info,
        "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV",
                btrfs_header_level(eb) == 0 ? "leaf" : "node",
                btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
                item_key.objectid, item_key.offset, &vaf);

        va_end(ap);
}

/*
 * Validate one BLOCK_GROUP_ITEM.
 *
 * key->offset is the block group length.  The checks cover: non-zero length,
 * exact item size, the chunk objectid (or, with EXTENT_TREE_V2, the global
 * root id), used bytes not exceeding the length, at most one profile bit,
 * and a valid type (DATA, METADATA, SYSTEM or METADATA|DATA).
 *
 * Return 0 if the item looks sane, -EUCLEAN (after reporting) otherwise.
 */
static int check_block_group_item(struct extent_buffer *leaf,
                                  struct btrfs_key *key, int slot)
{
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        struct btrfs_block_group_item bgi;
        u32 item_size = btrfs_item_size(leaf, slot);
        u64 chunk_objectid;
        u64 flags;
        u64 type;

        /*
         * Here we don't really care about alignment since extent allocator can
         * handle it.  We care more about the size.
         */
        if (unlikely(key->offset == 0)) {
                block_group_err(leaf, slot,
                                "invalid block group size 0");
                return -EUCLEAN;
        }

        if (unlikely(item_size != sizeof(bgi))) {
                block_group_err(leaf, slot,
                        "invalid item size, have %u expect %zu",
                                item_size, sizeof(bgi));
                return -EUCLEAN;
        }

        /* Work on a stack copy of the item from here on */
        read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
                           sizeof(bgi));
        chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi);
        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                /*
                 * We don't init the nr_global_roots until we load the global
                 * roots, so this could be 0 at mount time.  If it's 0 we'll
                 * just assume we're fine, and later we'll check against our
                 * actual value.
                 */
                if (unlikely(fs_info->nr_global_roots &&
                             chunk_objectid >= fs_info->nr_global_roots)) {
                        /* Valid ids are strictly below nr_global_roots */
                        block_group_err(leaf, slot,
        "invalid block group global root id, have %llu, needs to be < %llu",
                                        chunk_objectid,
                                        fs_info->nr_global_roots);
                        return -EUCLEAN;
                }
        } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) {
                block_group_err(leaf, slot,
                "invalid block group chunk objectid, have %llu expect %llu",
                                chunk_objectid,
                                BTRFS_FIRST_CHUNK_TREE_OBJECTID);
                return -EUCLEAN;
        }

        /* A fully used block group (used == length) is accepted */
        if (unlikely(btrfs_stack_block_group_used(&bgi) > key->offset)) {
                block_group_err(leaf, slot,
                        "invalid block group used, have %llu expect [0, %llu]",
                                btrfs_stack_block_group_used(&bgi), key->offset);
                return -EUCLEAN;
        }

        flags = btrfs_stack_block_group_flags(&bgi);
        if (unlikely(hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1)) {
                block_group_err(leaf, slot,
"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
                        flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
                        hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK));
                return -EUCLEAN;
        }

        type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        if (unlikely(type != BTRFS_BLOCK_GROUP_DATA &&
                     type != BTRFS_BLOCK_GROUP_METADATA &&
                     type != BTRFS_BLOCK_GROUP_SYSTEM &&
                     type != (BTRFS_BLOCK_GROUP_METADATA |
                              BTRFS_BLOCK_GROUP_DATA))) {
                block_group_err(leaf, slot,
"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
                        type, hweight64(type),
                        BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
                        BTRFS_BLOCK_GROUP_SYSTEM,
                        BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA);
                return -EUCLEAN;
        }
        return 0;
}

/*
 * Error reporter for chunk items.
 *
 * @chunk may live either in a chunk tree leaf or in the superblock sys chunk
 * array; the two cases are told apart by the extent buffer start and get
 * different message prefixes.  For the leaf case the slot is recovered by
 * matching @chunk against each item's pointer offset, and is printed as -1
 * if no item matches.
 */
__printf(4, 5)
__cold
static void chunk_err(const struct extent_buffer *leaf,
                      const struct btrfs_chunk *chunk, u64 logical,
                      const char *fmt, ...)
{
        const struct btrfs_fs_info *fs_info = leaf->fs_info;
        /* Only superblock eb is able to have such small offset */
        const bool is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET);
        struct va_format vaf;
        va_list ap;
        int slot = -1;
        int i;

        /*
         * Look up the slot number by walking all items; it makes the report
         * easier to read.
         */
        for (i = 0; !is_sb && i < btrfs_header_nritems(leaf); i++) {
                if (btrfs_item_ptr_offset(leaf, i) == (unsigned long)chunk) {
                        slot = i;
                        break;
                }
        }

        va_start(ap, fmt);
        vaf.fmt = fmt;
        vaf.va = &ap;

        if (is_sb) {
                btrfs_crit(fs_info,
                "corrupt superblock syschunk array: chunk_start=%llu, %pV",
                           logical, &vaf);
        } else {
                btrfs_crit(fs_info,
        "corrupt leaf: root=%llu block=%llu slot=%d chunk_start=%llu, %pV",
                           BTRFS_CHUNK_TREE_OBJECTID, leaf->start, slot,
                           logical, &vaf);
        }

        va_end(ap);
}

/*
 * Common chunk sanity check, usable both for chunk tree leaves and for the
 * super block sys chunk array.
 *
 * @leaf:    extent buffer holding @chunk
 * @chunk:   chunk item to validate
 * @logical: logical start address of the chunk
 *
 * Return -EUCLEAN if anything is corrupted.
 * Return 0 if everything is OK.
 */
int btrfs_check_chunk_valid(struct extent_buffer *leaf,
                            struct btrfs_chunk *chunk, u64 logical)
{
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        const u32 sectorsize = fs_info->sectorsize;
        const u64 length = btrfs_chunk_length(leaf, chunk);
        const u64 stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
        const u16 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
        const u16 sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
        const u64 type = btrfs_chunk_type(leaf, chunk);
        const u64 profile = type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
        const u64 known_flags = BTRFS_BLOCK_GROUP_TYPE_MASK |
                                BTRFS_BLOCK_GROUP_PROFILE_MASK;
        const int raid_index = btrfs_bg_flags_to_raid_index(type);
        const int ncopies = btrfs_raid_array[raid_index].ncopies;
        const int nparity = btrfs_raid_array[raid_index].nparity;
        u64 chunk_end;

        /* Stripe count against the profile's copy and parity counts */
        if (unlikely(!num_stripes)) {
                chunk_err(leaf, chunk, logical,
                          "invalid chunk num_stripes, have %u", num_stripes);
                return -EUCLEAN;
        }
        if (unlikely(num_stripes < ncopies)) {
                chunk_err(leaf, chunk, logical,
                          "invalid chunk num_stripes < ncopies, have %u < %d",
                          num_stripes, ncopies);
                return -EUCLEAN;
        }
        if (unlikely(nparity && num_stripes == nparity)) {
                chunk_err(leaf, chunk, logical,
                          "invalid chunk num_stripes == nparity, have %u == %d",
                          num_stripes, nparity);
                return -EUCLEAN;
        }

        /* Start, sector size and length */
        if (unlikely(!IS_ALIGNED(logical, sectorsize))) {
                chunk_err(leaf, chunk, logical,
                "invalid chunk logical, have %llu should aligned to %u",
                          logical, sectorsize);
                return -EUCLEAN;
        }
        if (unlikely(btrfs_chunk_sector_size(leaf, chunk) != sectorsize)) {
                chunk_err(leaf, chunk, logical,
                          "invalid chunk sectorsize, have %u expect %u",
                          btrfs_chunk_sector_size(leaf, chunk),
                          sectorsize);
                return -EUCLEAN;
        }
        if (unlikely(!length || !IS_ALIGNED(length, sectorsize))) {
                chunk_err(leaf, chunk, logical,
                          "invalid chunk length, have %llu", length);
                return -EUCLEAN;
        }
        if (unlikely(check_add_overflow(logical, length, &chunk_end))) {
                chunk_err(leaf, chunk, logical,
"invalid chunk logical start and length, have logical start %llu length %llu",
                          logical, length);
                return -EUCLEAN;
        }
        if (unlikely(!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN)) {
                chunk_err(leaf, chunk, logical,
                          "invalid chunk stripe length: %llu",
                          stripe_len);
                return -EUCLEAN;
        }
        /*
         * The chunk size is artificially capped so that the number of stripes
         * inside a chunk fits into a U32.  The current limit (256G) is far
         * beyond real world usage and also much larger than the existing
         * limit (10G), so it mostly serves to catch obvious bitflips.
         */
        if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) {
                chunk_err(leaf, chunk, logical,
                          "chunk length too large: have %llu limit %llu",
                          length, btrfs_stripe_nr_to_offset(U32_MAX));
                return -EUCLEAN;
        }

        /* Type and profile flags */
        if (unlikely(type & ~known_flags)) {
                chunk_err(leaf, chunk, logical,
                          "unrecognized chunk type: 0x%llx",
                          type & ~known_flags);
                return -EUCLEAN;
        }
        if (unlikely(profile != 0 && !has_single_bit_set(profile))) {
                chunk_err(leaf, chunk, logical,
                "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",
                          profile);
                return -EUCLEAN;
        }
        if (unlikely((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0)) {
                chunk_err(leaf, chunk, logical,
        "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx",
                          type, BTRFS_BLOCK_GROUP_TYPE_MASK);
                return -EUCLEAN;
        }
        if (unlikely((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
                     (type & (BTRFS_BLOCK_GROUP_METADATA |
                              BTRFS_BLOCK_GROUP_DATA)))) {
                chunk_err(leaf, chunk, logical,
                          "system chunk with data or metadata type: 0x%llx",
                          type);
                return -EUCLEAN;
        }

        /* METADATA|DATA is only legal when the MIXED_GROUPS flag is set */
        if (!(btrfs_super_incompat_flags(fs_info->super_copy) &
              BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
            unlikely((type & BTRFS_BLOCK_GROUP_METADATA) &&
                     (type & BTRFS_BLOCK_GROUP_DATA))) {
                chunk_err(leaf, chunk, logical,
                "mixed chunk type in non-mixed mode: 0x%llx", type);
                return -EUCLEAN;
        }

        /* Stripe counts must fit what btrfs_raid_array says for the profile */
        if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 &&
                      sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) ||
                     (type & BTRFS_BLOCK_GROUP_RAID1 &&
                      num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) ||
                     (type & BTRFS_BLOCK_GROUP_RAID1C3 &&
                      num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) ||
                     (type & BTRFS_BLOCK_GROUP_RAID1C4 &&
                      num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) ||
                     (type & BTRFS_BLOCK_GROUP_RAID5 &&
                      num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) ||
                     (type & BTRFS_BLOCK_GROUP_RAID6 &&
                      num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) ||
                     (type & BTRFS_BLOCK_GROUP_DUP &&
                      num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
                     (profile == 0 &&
                      num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
                chunk_err(leaf, chunk, logical,
                        "invalid num_stripes:sub_stripes %u:%u for profile %llu",
                        num_stripes, sub_stripes, profile);
                return -EUCLEAN;
        }

        return 0;
}

/*
 * Enhanced version of chunk item checker.
 *
 * The common btrfs_check_chunk_valid() doesn't check item size since it needs
 * to work on super block sys_chunk_array which doesn't have full item ptr.
 */
static int check_leaf_chunk_item(struct extent_buffer *leaf,
                                 struct btrfs_chunk *chunk,
                                 struct btrfs_key *key, int slot)
{
        int num_stripes;

        if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) {
                chunk_err(leaf, chunk, key->offset,
                        "invalid chunk item size: have %u expect [%zu, %u)",
                        btrfs_item_size(leaf, slot),
                        sizeof(struct btrfs_chunk),
                        BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
                return -EUCLEAN;
        }

        num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
        /* Let btrfs_check_chunk_valid() handle this error type */
        if (num_stripes == 0)
                goto out;

        if (unlikely(btrfs_chunk_item_size(num_stripes) !=
                     btrfs_item_size(leaf, slot))) {
                chunk_err(leaf, chunk, key->offset,
                        "invalid chunk item size: have %u expect %lu",
                        btrfs_item_size(leaf, slot),
                        btrfs_chunk_item_size(num_stripes));
                return -EUCLEAN;
        }
out:
        return btrfs_check_chunk_valid(leaf, chunk, key->offset);
}

/*
 * Report a corrupted device item.
 *
 * Dumps the first page of @eb, then logs a critical message containing the
 * block owner, block bytenr, @slot, the devid (taken from the objectid of the
 * key at @slot) and the caller supplied printf-style @fmt with its arguments.
 */
__printf(3, 4)
__cold
static void dev_item_err(const struct extent_buffer *eb, int slot,
                         const char *fmt, ...)
{
        const char *blk_kind;
        struct btrfs_key item_key;
        struct va_format vaf;
        va_list ap;

        btrfs_item_key_to_cpu(eb, &item_key, slot);

        if (btrfs_header_level(eb) == 0)
                blk_kind = "leaf";
        else
                blk_kind = "node";

        va_start(ap, fmt);
        vaf.fmt = fmt;
        vaf.va = &ap;

        dump_page(folio_page(eb->folios[0], 0), "eb page dump");
        btrfs_crit(eb->fs_info,
        "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV",
                blk_kind,
                btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
                item_key.objectid, &vaf);

        va_end(ap);
}

static int check_dev_item(struct extent_buffer *leaf,
                          struct btrfs_key *key, int slot)
{
        struct btrfs_dev_item *ditem;
        const u32 item_size = btrfs_item_size(leaf, slot);

        if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) {
                dev_item_err(leaf, slot,
                             "invalid objectid: has=%llu expect=%llu",
                             key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
                return -EUCLEAN;
        }

        if (unlikely(item_size != sizeof(*ditem))) {
                dev_item_err(leaf, slot, "invalid item size: has %u expect %zu",
                             item_size, sizeof(*ditem));
                return -EUCLEAN;
        }

        ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
        if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) {
                dev_item_err(leaf, slot,
                             "devid mismatch: key has=%llu item has=%llu",
                             key->offset, btrfs_device_id(leaf, ditem));
                return -EUCLEAN;
        }

        /*
         * For device total_bytes, we don't have reliable way to check it, as
         * it can be 0 for device removal. Device size check can only be done
         * by dev extents check.
         */
        if (unlikely(btrfs_device_bytes_used(leaf, ditem) >
                     btrfs_device_total_bytes(leaf, ditem))) {
                dev_item_err(leaf, slot,
                             "invalid bytes used: have %llu expect [0, %llu]",
                             btrfs_device_bytes_used(leaf, ditem),
                             btrfs_device_total_bytes(leaf, ditem));
                return -EUCLEAN;
        }
        /*
         * Remaining members like io_align/type/gen/dev_group aren't really
         * utilized.  Skip them to make later usage of them easier.
         */
        return 0;
}

/*
 * Validate an inode item.
 *
 * After the key and item size checks, verifies that generation and transid
 * are not newer than super block generation + 1, that the mode only contains
 * known bits and a valid file type, that a directory has at most one link,
 * and that no unknown inode flags are set.
 *
 * Return 0 if all checks pass, the error from check_inode_key() if the key is
 * bad, -EUCLEAN (after logging) for any other failure.
 */
static int check_inode_item(struct extent_buffer *leaf,
                            struct btrfs_key *key, int slot)
{
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        /* Super block generation + 1 is used as the limit to handle log tree */
        const u64 gen_limit = btrfs_super_generation(fs_info->super_copy) + 1;
        const u32 known_mode_bits = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
        const u32 item_size = btrfs_item_size(leaf, slot);
        struct btrfs_inode_item *ii;
        u64 generation;
        u64 transid;
        u32 mode;
        u32 unknown_bits;
        u32 flags;
        u32 ro_flags;
        int ret;

        ret = check_inode_key(leaf, key, slot);
        if (unlikely(ret < 0))
                return ret;

        if (unlikely(item_size != sizeof(*ii))) {
                generic_err(leaf, slot, "invalid item size: has %u expect %zu",
                            item_size, sizeof(*ii));
                return -EUCLEAN;
        }

        ii = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);

        generation = btrfs_inode_generation(leaf, ii);
        if (unlikely(generation > gen_limit)) {
                inode_item_err(leaf, slot,
                        "invalid inode generation: has %llu expect (0, %llu]",
                               generation, gen_limit);
                return -EUCLEAN;
        }

        /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
        transid = btrfs_inode_transid(leaf, ii);
        if (unlikely(transid > gen_limit)) {
                inode_item_err(leaf, slot,
                        "invalid inode transid: has %llu expect [0, %llu]",
                               transid, gen_limit);
                return -EUCLEAN;
        }

        /*
         * Size and nbytes are deliberately not checked: for a dir item they
         * can easily get wrong without affecting anything in the fs.
         */
        mode = btrfs_inode_mode(leaf, ii);
        unknown_bits = mode & ~known_mode_bits;
        if (unlikely(unknown_bits)) {
                inode_item_err(leaf, slot,
                               "unknown mode bit detected: 0x%x",
                               unknown_bits);
                return -EUCLEAN;
        }

        /*
         * S_IFMT is not bit mapped, so has_single_bit_set() cannot be fully
         * relied on, but it covers FIFO/CHR/DIR/REG.  Only BLK, LNK and SOCK
         * need to be checked explicitly when more than one bit is set.
         */
        if (!has_single_bit_set(mode & S_IFMT) &&
            unlikely(!(S_ISLNK(mode) || S_ISBLK(mode) || S_ISSOCK(mode)))) {
                inode_item_err(leaf, slot,
                "invalid mode: has 0%o expect valid S_IF* bit(s)",
                               mode & S_IFMT);
                return -EUCLEAN;
        }

        if (S_ISDIR(mode)) {
                const u32 nlink = btrfs_inode_nlink(leaf, ii);

                if (unlikely(nlink > 1)) {
                        inode_item_err(leaf, slot,
                       "invalid nlink: has %u expect no more than 1 for dir",
                                nlink);
                        return -EUCLEAN;
                }
        }

        btrfs_inode_split_flags(btrfs_inode_flags(leaf, ii), &flags, &ro_flags);
        if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) {
                inode_item_err(leaf, slot,
                               "unknown incompat flags detected: 0x%x", flags);
                return -EUCLEAN;
        }

        /* Unknown ro-compat flags are only rejected on a writeable mount */
        if (unlikely(!sb_rdonly(fs_info->sb) &&
                     (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) {
                inode_item_err(leaf, slot,
                        "unknown ro-compat flags detected on writeable mount: 0x%x",
                        ro_flags);
                return -EUCLEAN;
        }

        return 0;
}

static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
                           int slot)
{
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        struct btrfs_root_item ri = { 0 };
        const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
                                     BTRFS_ROOT_SUBVOL_DEAD;
        int ret;

        ret = check_root_key(leaf, key, slot);
        if (unlikely(ret < 0))
                return ret;

        if (unlikely(btrfs_item_size(leaf, slot) != sizeof(ri) &&
                     btrfs_item_size(leaf, slot) !=
                     btrfs_legacy_root_item_size())) {
                generic_err(leaf, slot,
                            "invalid root item size, have %u expect %zu or %u",
                            btrfs_item_size(leaf, slot), sizeof(ri),
                            btrfs_legacy_root_item_size());
                return -EUCLEAN;
        }

        /*
         * For legacy root item, the members starting at generation_v2 will be
         * all filled with 0.
         * And since we allow geneartion_v2 as 0, it will still pass the check.
         */
        read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
                           btrfs_item_size(leaf, slot));

        /* Generation related */
        if (unlikely(btrfs_root_generation(&ri) >
                     btrfs_super_generation(fs_info->super_copy) + 1)) {
                generic_err(leaf, slot,
                        "invalid root generation, have %llu expect (0, %llu]",
                            btrfs_root_generation(&ri),
                            btrfs_super_generation(fs_info->super_copy) + 1);
                return -EUCLEAN;
        }
        if (unlikely(btrfs_root_generation_v2(&ri) >
                     btrfs_super_generation(fs_info->super_copy) + 1)) {
                generic_err(leaf, slot,
                "invalid root v2 generation, have %llu expect (0, %llu]",
                            btrfs_root_generation_v2(&ri),
                            btrfs_super_generation(fs_info->super_copy) + 1);
                return -EUCLEAN;
        }
        if (unlikely(btrfs_root_last_snapshot(&ri) >
                     btrfs_super_generation(fs_info->super_copy) + 1)) {
                generic_err(leaf, slot,
                "invalid root last_snapshot, have %llu expect (0, %llu]",
                            btrfs_root_last_snapshot(&ri),
                            btrfs_super_generation(fs_info->super_copy) + 1);
                return -EUCLEAN;
        }

        /* Alignment and level check */
        if (unlikely(!IS_ALIGNED(btrfs_root_bytenr(&ri), fs_info->sectorsize))) {
                generic_err(leaf, slot,
                "invalid root bytenr, have %llu expect to be aligned to %u",
                            btrfs_root_bytenr(&ri), fs_info->sectorsize);
                return -EUCLEAN;
        }
        if (unlikely(btrfs_root_level(&ri) >= BTRFS_MAX_LEVEL)) {
                generic_err(leaf, slot,
                            "invalid root level, have %u expect [0, %u]",
                            btrfs_root_level(&ri), BTRFS_MAX_LEVEL - 1);
                return -EUCLEAN;
        }
        if (unlikely(btrfs_root_drop_level(&ri) >= BTRFS_MAX_LEVEL)) {
                generic_err(leaf, slot,
                            "invalid root level, have %u expect [0, %u]",
                            btrfs_root_drop_level(&ri), BTRFS_MAX_LEVEL - 1);
                return -EUCLEAN;
        }

        /* Flags check */
        if (unlikely(btrfs_root_flags(&ri) & ~valid_root_flags)) {
                generic_err(leaf, slot,
                            "invalid root flags, have 0x%llx expect mask 0x%llx",
                            btrfs_root_flags(&ri), valid_root_flags);
                return -EUCLEAN;
        }
        return 0;
}

/*
 * Report a corrupted extent tree item.
 *
 * Dumps the first page of @eb, then logs a critical message with the block
 * start, @slot, the extent bytenr (key objectid) and a length, followed by the
 * caller supplied printf-style @fmt with its arguments.
 *
 * The length is the nodesize for METADATA_ITEM, TREE_BLOCK_REF and
 * SHARED_BLOCK_REF keys, and the key offset for every other key type.
 */
__printf(3,4)
__cold
static void extent_err(const struct extent_buffer *eb, int slot,
                       const char *fmt, ...)
{
        struct btrfs_key item_key;
        struct va_format vaf;
        va_list ap;
        u64 extent_start;
        u64 extent_len;

        btrfs_item_key_to_cpu(eb, &item_key, slot);
        extent_start = item_key.objectid;

        switch (item_key.type) {
        case BTRFS_METADATA_ITEM_KEY:
        case BTRFS_TREE_BLOCK_REF_KEY:
        case BTRFS_SHARED_BLOCK_REF_KEY:
                extent_len = eb->fs_info->nodesize;
                break;
        default:
                extent_len = item_key.offset;
                break;
        }

        va_start(ap, fmt);
        vaf.fmt = fmt;
        vaf.va = &ap;

        dump_page(folio_page(eb->folios[0], 0), "eb page dump");
        btrfs_crit(eb->fs_info,
        "corrupt %s: block=%llu slot=%d extent bytenr=%llu len=%llu %pV",
                btrfs_header_level(eb) == 0 ? "leaf" : "node",
                eb->start, slot, extent_start, extent_len, &vaf);

        va_end(ap);
}

/*
 * Validate an EXTENT_ITEM or METADATA_ITEM.
 *
 * Checks the key, the fixed btrfs_extent_item header, the optional
 * btrfs_tree_block_info, every inline backref (size, type specific fields and
 * ordering), the inline ref count against the total ref count, and finally
 * that the extent does not overlap the extent described by @prev_key.
 *
 * Return 0 if all checks pass, -EUCLEAN (after logging the reason) otherwise.
 */
static int check_extent_item(struct extent_buffer *leaf,
                             struct btrfs_key *key, int slot,
                             struct btrfs_key *prev_key)
{
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        struct btrfs_extent_item *ei;
        bool is_tree_block = false;
        unsigned long ptr;        /* Current pointer inside inline refs */
        unsigned long end;        /* Extent item end */
        const u32 item_size = btrfs_item_size(leaf, slot);
        u8 last_type = 0;
        u64 last_seq = U64_MAX;
        u64 flags;
        u64 generation;
        u64 total_refs;                /* Total refs in btrfs_extent_item */
        u64 inline_refs = 0;        /* found total inline refs */

        if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY &&
                     !btrfs_fs_incompat(fs_info, SKINNY_METADATA))) {
                generic_err(leaf, slot,
"invalid key type, METADATA_ITEM type invalid when SKINNY_METADATA feature disabled");
                return -EUCLEAN;
        }
        /* key->objectid is the bytenr for both key types */
        if (unlikely(!IS_ALIGNED(key->objectid, fs_info->sectorsize))) {
                generic_err(leaf, slot,
                "invalid key objectid, have %llu expect to be aligned to %u",
                           key->objectid, fs_info->sectorsize);
                return -EUCLEAN;
        }

        /* key->offset is tree level for METADATA_ITEM_KEY */
        if (unlikely(key->type == BTRFS_METADATA_ITEM_KEY &&
                     key->offset >= BTRFS_MAX_LEVEL)) {
                extent_err(leaf, slot,
                           "invalid tree level, have %llu expect [0, %u]",
                           key->offset, BTRFS_MAX_LEVEL - 1);
                return -EUCLEAN;
        }

        /*
         * EXTENT/METADATA_ITEM consists of:
         * 1) One btrfs_extent_item
         *    Records the total refs, type and generation of the extent.
         *
         * 2) One btrfs_tree_block_info (for EXTENT_ITEM and tree backref only)
         *    Records the first key and level of the tree block.
         *
         * 3) Zero or more btrfs_extent_inline_ref(s)
         *    Each inline ref has one btrfs_extent_inline_ref shows:
         *    3.1) The ref type, one of:
         *         TREE_BLOCK_REF        Tree block only
         *         SHARED_BLOCK_REF        Tree block only
         *         EXTENT_DATA_REF        Data only
         *         SHARED_DATA_REF        Data only
         *         EXTENT_OWNER_REF        Accepted below, warns when the
         *                                SIMPLE_QUOTA incompat bit is unset,
         *                                does not count as a ref
         *    3.2) Ref type specific data
         *         Either using btrfs_extent_inline_ref::offset, or specific
         *         data structure.
         *
         *    All above inline items should follow the order:
         *
         *    - All btrfs_extent_inline_ref::type should be in an ascending
         *      order
         *
         *    - Within the same type, the items should follow a descending
         *      order by their sequence number. The sequence number is
         *      determined by:
         *      * btrfs_extent_inline_ref::offset for all types  other than
         *        EXTENT_DATA_REF
         *      * hash_extent_data_ref() for EXTENT_DATA_REF
         */
        if (unlikely(item_size < sizeof(*ei))) {
                extent_err(leaf, slot,
                           "invalid item size, have %u expect [%zu, %u)",
                           item_size, sizeof(*ei),
                           BTRFS_LEAF_DATA_SIZE(fs_info));
                return -EUCLEAN;
        }
        end = item_size + btrfs_item_ptr_offset(leaf, slot);

        /* Checks against extent_item */
        ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
        flags = btrfs_extent_flags(leaf, ei);
        total_refs = btrfs_extent_refs(leaf, ei);
        generation = btrfs_extent_generation(leaf, ei);
        if (unlikely(generation >
                     btrfs_super_generation(fs_info->super_copy) + 1)) {
                extent_err(leaf, slot,
                           "invalid generation, have %llu expect (0, %llu]",
                           generation,
                           btrfs_super_generation(fs_info->super_copy) + 1);
                return -EUCLEAN;
        }
        /* Exactly one of DATA and TREE_BLOCK must be set */
        if (unlikely(!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA |
                                                  BTRFS_EXTENT_FLAG_TREE_BLOCK)))) {
                extent_err(leaf, slot,
                "invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx",
                        flags, BTRFS_EXTENT_FLAG_DATA |
                        BTRFS_EXTENT_FLAG_TREE_BLOCK);
                return -EUCLEAN;
        }
        is_tree_block = !!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK);
        if (is_tree_block) {
                /* For EXTENT_ITEM the key offset is the extent length */
                if (unlikely(key->type == BTRFS_EXTENT_ITEM_KEY &&
                             key->offset != fs_info->nodesize)) {
                        extent_err(leaf, slot,
                                   "invalid extent length, have %llu expect %u",
                                   key->offset, fs_info->nodesize);
                        return -EUCLEAN;
                }
        } else {
                if (unlikely(key->type != BTRFS_EXTENT_ITEM_KEY)) {
                        extent_err(leaf, slot,
                        "invalid key type, have %u expect %u for data backref",
                                   key->type, BTRFS_EXTENT_ITEM_KEY);
                        return -EUCLEAN;
                }
                if (unlikely(!IS_ALIGNED(key->offset, fs_info->sectorsize))) {
                        extent_err(leaf, slot,
                        "invalid extent length, have %llu expect aligned to %u",
                                   key->offset, fs_info->sectorsize);
                        return -EUCLEAN;
                }
                if (unlikely(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
                        extent_err(leaf, slot,
                        "invalid extent flag, data has full backref set");
                        return -EUCLEAN;
                }
        }
        ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1);

        /* Check the special case of btrfs_tree_block_info */
        if (is_tree_block && key->type != BTRFS_METADATA_ITEM_KEY) {
                struct btrfs_tree_block_info *info;

                /*
                 * The tree block info must fit inside the item before its
                 * level is read.  Such a short item would be rejected by the
                 * "ptr != end" check further down anyway, but only after the
                 * level had already been read from beyond the item.
                 */
                if (unlikely(ptr + sizeof(*info) > end)) {
                        extent_err(leaf, slot,
        "invalid item size for tree block info, have %u expect at least %zu",
                                   item_size, sizeof(*ei) + sizeof(*info));
                        return -EUCLEAN;
                }

                info = (struct btrfs_tree_block_info *)ptr;
                if (unlikely(btrfs_tree_block_level(leaf, info) >= BTRFS_MAX_LEVEL)) {
                        extent_err(leaf, slot,
                        "invalid tree block info level, have %u expect [0, %u]",
                                   btrfs_tree_block_level(leaf, info),
                                   BTRFS_MAX_LEVEL - 1);
                        return -EUCLEAN;
                }
                ptr = (unsigned long)(struct btrfs_tree_block_info *)(info + 1);
        }

        /* Check inline refs */
        while (ptr < end) {
                struct btrfs_extent_inline_ref *iref;
                struct btrfs_extent_data_ref *dref;
                struct btrfs_shared_data_ref *sref;
                u64 seq;
                u64 dref_offset;
                u64 inline_offset;
                u8 inline_type;

                /* The generic inline ref header must fit first ... */
                if (unlikely(ptr + sizeof(*iref) > end)) {
                        extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %zu end %lu",
                                   ptr, sizeof(*iref), end);
                        return -EUCLEAN;
                }
                iref = (struct btrfs_extent_inline_ref *)ptr;
                inline_type = btrfs_extent_inline_ref_type(leaf, iref);
                inline_offset = btrfs_extent_inline_ref_offset(leaf, iref);
                seq = inline_offset;
                /* ... then the full, type dependent, size of this ref */
                if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
                        extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
                                   ptr, btrfs_extent_inline_ref_size(inline_type), end);
                        return -EUCLEAN;
                }

                switch (inline_type) {
                /* inline_offset is subvolid of the owner, no need to check */
                case BTRFS_TREE_BLOCK_REF_KEY:
                        inline_refs++;
                        break;
                /* Contains parent bytenr */
                case BTRFS_SHARED_BLOCK_REF_KEY:
                        if (unlikely(!IS_ALIGNED(inline_offset,
                                                 fs_info->sectorsize))) {
                                extent_err(leaf, slot,
                "invalid tree parent bytenr, have %llu expect aligned to %u",
                                           inline_offset, fs_info->sectorsize);
                                return -EUCLEAN;
                        }
                        inline_refs++;
                        break;
                /*
                 * Contains owner subvolid, owner key objectid, adjusted offset.
                 * The only obvious corruption can happen in that offset.
                 */
                case BTRFS_EXTENT_DATA_REF_KEY:
                        /* The data ref starts at the offset member */
                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                        dref_offset = btrfs_extent_data_ref_offset(leaf, dref);
                        seq = hash_extent_data_ref(
                                        btrfs_extent_data_ref_root(leaf, dref),
                                        btrfs_extent_data_ref_objectid(leaf, dref),
                                        btrfs_extent_data_ref_offset(leaf, dref));
                        if (unlikely(!IS_ALIGNED(dref_offset,
                                                 fs_info->sectorsize))) {
                                extent_err(leaf, slot,
                "invalid data ref offset, have %llu expect aligned to %u",
                                           dref_offset, fs_info->sectorsize);
                                return -EUCLEAN;
                        }
                        inline_refs += btrfs_extent_data_ref_count(leaf, dref);
                        break;
                /* Contains parent bytenr and ref count */
                case BTRFS_SHARED_DATA_REF_KEY:
                        /* The shared data ref follows the inline ref header */
                        sref = (struct btrfs_shared_data_ref *)(iref + 1);
                        if (unlikely(!IS_ALIGNED(inline_offset,
                                                 fs_info->sectorsize))) {
                                extent_err(leaf, slot,
                "invalid data parent bytenr, have %llu expect aligned to %u",
                                           inline_offset, fs_info->sectorsize);
                                return -EUCLEAN;
                        }
                        inline_refs += btrfs_shared_data_ref_count(leaf, sref);
                        break;
                /* Only warned about, never counted in inline_refs */
                case BTRFS_EXTENT_OWNER_REF_KEY:
                        WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA));
                        break;
                default:
                        extent_err(leaf, slot, "unknown inline ref type: %u",
                                   inline_type);
                        return -EUCLEAN;
                }
                if (inline_type < last_type) {
                        extent_err(leaf, slot,
                                   "inline ref out-of-order: has type %u, prev type %u",
                                   inline_type, last_type);
                        return -EUCLEAN;
                }
                /* Type changed, allow the sequence starts from U64_MAX again. */
                if (inline_type > last_type)
                        last_seq = U64_MAX;
                if (seq > last_seq) {
                        extent_err(leaf, slot,
"inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx",
                                   inline_type, inline_offset, seq,
                                   last_type, last_seq);
                        return -EUCLEAN;
                }
                last_type = inline_type;
                last_seq = seq;
                ptr += btrfs_extent_inline_ref_size(inline_type);
        }
        /* No padding is allowed */
        if (unlikely(ptr != end)) {
                extent_err(leaf, slot,
                           "invalid extent item size, padding bytes found");
                return -EUCLEAN;
        }

        /* Finally, check the inline refs against total refs */
        if (unlikely(inline_refs > total_refs)) {
                extent_err(leaf, slot,
                        "invalid extent refs, have %llu expect >= inline %llu",
                           total_refs, inline_refs);
                return -EUCLEAN;
        }

        /*
         * If the previous key is also an extent, its range must end at or
         * before the start of the current one.
         */
        if ((prev_key->type == BTRFS_EXTENT_ITEM_KEY) ||
            (prev_key->type == BTRFS_METADATA_ITEM_KEY)) {
                u64 prev_end = prev_key->objectid;

                /* METADATA_ITEM stores the level, not the length, in offset */
                if (prev_key->type == BTRFS_METADATA_ITEM_KEY)
                        prev_end += fs_info->nodesize;
                else
                        prev_end += prev_key->offset;

                if (unlikely(prev_end > key->objectid)) {
                        extent_err(leaf, slot,
        "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]",
                                   prev_key->objectid, prev_key->type,
                                   prev_key->offset, key->objectid, key->type,
                                   key->offset);
                        return -EUCLEAN;
                }
        }

        return 0;
}

/*
 * Validate a keyed backref item that carries little or no payload.
 *
 * The item must be empty, except for SHARED_DATA_REF which must hold exactly
 * one btrfs_shared_data_ref.  The key objectid must be sector aligned, and so
 * must the key offset for every key type other than TREE_BLOCK_REF.
 *
 * Return 0 if all checks pass, -EUCLEAN (after logging) otherwise.
 */
static int check_simple_keyed_refs(struct extent_buffer *leaf,
                                   struct btrfs_key *key, int slot)
{
        const u32 item_size = btrfs_item_size(leaf, slot);
        const u32 want_size = (key->type == BTRFS_SHARED_DATA_REF_KEY) ?
                              sizeof(struct btrfs_shared_data_ref) : 0;

        if (unlikely(item_size != want_size)) {
                generic_err(leaf, slot,
                "invalid item size, have %u expect %u for key type %u",
                            item_size, want_size, key->type);
                return -EUCLEAN;
        }

        if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
                generic_err(leaf, slot,
"invalid key objectid for shared block ref, have %llu expect aligned to %u",
                            key->objectid, leaf->fs_info->sectorsize);
                return -EUCLEAN;
        }

        /* The offset of a TREE_BLOCK_REF key is exempt from the alignment check */
        if (key->type == BTRFS_TREE_BLOCK_REF_KEY)
                return 0;

        if (unlikely(!IS_ALIGNED(key->offset, leaf->fs_info->sectorsize))) {
                extent_err(leaf, slot,
                "invalid tree parent bytenr, have %llu expect aligned to %u",
                           key->offset, leaf->fs_info->sectorsize);
                return -EUCLEAN;
        }

        return 0;
}

/*
 * Validate a keyed extent data ref item.
 *
 * The item size must be a multiple of sizeof(struct btrfs_extent_data_ref),
 * the key objectid must be sector aligned, and the offset stored in every
 * btrfs_extent_data_ref packed in the item must be sector aligned as well.
 *
 * Return 0 if all checks pass, -EUCLEAN (after logging) otherwise.
 */
static int check_extent_data_ref(struct extent_buffer *leaf,
                                 struct btrfs_key *key, int slot)
{
        struct btrfs_extent_data_ref *dref;
        unsigned long ptr = btrfs_item_ptr_offset(leaf, slot);
        const unsigned long end = ptr + btrfs_item_size(leaf, slot);

        if (unlikely(btrfs_item_size(leaf, slot) % sizeof(*dref) != 0)) {
                generic_err(leaf, slot,
        "invalid item size, have %u expect aligned to %zu for key type %u",
                            btrfs_item_size(leaf, slot),
                            sizeof(*dref), key->type);
                return -EUCLEAN;
        }
        if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
                generic_err(leaf, slot,
"invalid key objectid for extent data ref, have %llu expect aligned to %u",
                            key->objectid, leaf->fs_info->sectorsize);
                return -EUCLEAN;
        }
        /* The size check above guarantees only whole entries are walked */
        for (; ptr < end; ptr += sizeof(*dref)) {
                u64 offset;

                /*
                 * We cannot check the extent_data_ref hash due to possible
                 * overflow from the leaf due to hash collisions.
                 */
                dref = (struct btrfs_extent_data_ref *)ptr;
                offset = btrfs_extent_data_ref_offset(leaf, dref);
                if (unlikely(!IS_ALIGNED(offset, leaf->fs_info->sectorsize))) {
                        extent_err(leaf, slot,
        "invalid extent data backref offset, have %llu expect aligned to %u",
                                   offset, leaf->fs_info->sectorsize);
                        return -EUCLEAN;
                }
        }
        return 0;
}

#define inode_ref_err(eb, slot, fmt, args...)                        \
        inode_item_err(eb, slot, fmt, ##args)
/*
 * Validate an inode ref item.
 *
 * The item is a sequence of btrfs_inode_ref headers, each immediately
 * followed by name_len bytes of name.  Every header and every name must lie
 * completely inside the item.
 *
 * Return 0 if all checks pass, -EUCLEAN (after logging) otherwise.
 */
static int check_inode_ref(struct extent_buffer *leaf,
                           struct btrfs_key *key, struct btrfs_key *prev_key,
                           int slot)
{
        struct btrfs_inode_ref *iref;
        unsigned long ptr;
        unsigned long end;

        if (unlikely(!check_prev_ino(leaf, key, slot, prev_key)))
                return -EUCLEAN;
        /* namelen can't be 0, so item_size == sizeof() is also invalid */
        if (unlikely(btrfs_item_size(leaf, slot) <= sizeof(*iref))) {
                inode_ref_err(leaf, slot,
                        "invalid item size, have %u expect (%zu, %u)",
                        btrfs_item_size(leaf, slot),
                        sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
                return -EUCLEAN;
        }

        ptr = btrfs_item_ptr_offset(leaf, slot);
        end = ptr + btrfs_item_size(leaf, slot);
        while (ptr < end) {
                u16 namelen;

                /*
                 * The whole btrfs_inode_ref header must fit before name_len
                 * is read.  Use the size of the structure, not the size of
                 * the pointer to it, or a truncated trailing header would
                 * pass and name_len would be read from beyond the item.
                 */
                if (unlikely(ptr + sizeof(*iref) > end)) {
                        inode_ref_err(leaf, slot,
                        "inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
                                ptr, end, sizeof(*iref));
                        return -EUCLEAN;
                }

                iref = (struct btrfs_inode_ref *)ptr;
                namelen = btrfs_inode_ref_name_len(leaf, iref);
                if (unlikely(ptr + sizeof(*iref) + namelen > end)) {
                        inode_ref_err(leaf, slot,
                                "inode ref overflow, ptr %lu end %lu namelen %u",
                                ptr, end, namelen);
                        return -EUCLEAN;
                }

                /*
                 * NOTE: In theory we should record all found index numbers
                 * to find any duplicated indexes, but that will be too time
                 * consuming for inodes with too many hard links.
                 */
                ptr += sizeof(*iref) + namelen;
        }
        return 0;
}

static int check_raid_stripe_extent(const struct extent_buffer *leaf,
                                    const struct btrfs_key *key, int slot)
{
        struct btrfs_stripe_extent *stripe_extent =
                btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);

        if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) {
                generic_err(leaf, slot,
"invalid key objectid for raid stripe extent, have %llu expect aligned to %u",
                            key->objectid, leaf->fs_info->sectorsize);
                return -EUCLEAN;
        }

        if (unlikely(!btrfs_fs_incompat(leaf->fs_info, RAID_STRIPE_TREE))) {
                generic_err(leaf, slot,
        "RAID_STRIPE_EXTENT present but RAID_STRIPE_TREE incompat bit unset");
                return -EUCLEAN;
        }

        switch (btrfs_stripe_extent_encoding(leaf, stripe_extent)) {
        case BTRFS_STRIPE_RAID0:
        case BTRFS_STRIPE_RAID1:
        case BTRFS_STRIPE_DUP:
        case BTRFS_STRIPE_RAID10:
        case BTRFS_STRIPE_RAID5:
        case BTRFS_STRIPE_RAID6:
        case BTRFS_STRIPE_RAID1C3:
        case BTRFS_STRIPE_RAID1C4:
                break;
        default:
                generic_err(leaf, slot, "invalid raid stripe encoding %u",
                            btrfs_stripe_extent_encoding(leaf, stripe_extent));
                return -EUCLEAN;
        }

        return 0;
}

/*
 * Dispatch one leaf item to the validator that matches its key type.
 *
 * Key types without a dedicated validator are accepted as they are.  Any
 * non-zero result from a validator is reported as
 * BTRFS_TREE_BLOCK_INVALID_ITEM, otherwise BTRFS_TREE_BLOCK_CLEAN is returned.
 */
static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
                                                    struct btrfs_key *key,
                                                    int slot,
                                                    struct btrfs_key *prev_key)
{
        int err;

        switch (key->type) {
        case BTRFS_EXTENT_DATA_KEY:
                err = check_extent_data_item(leaf, key, slot, prev_key);
                break;
        case BTRFS_EXTENT_CSUM_KEY:
                err = check_csum_item(leaf, key, slot, prev_key);
                break;
        case BTRFS_DIR_ITEM_KEY:
        case BTRFS_DIR_INDEX_KEY:
        case BTRFS_XATTR_ITEM_KEY:
                err = check_dir_item(leaf, key, prev_key, slot);
                break;
        case BTRFS_INODE_REF_KEY:
                err = check_inode_ref(leaf, key, prev_key, slot);
                break;
        case BTRFS_BLOCK_GROUP_ITEM_KEY:
                err = check_block_group_item(leaf, key, slot);
                break;
        case BTRFS_CHUNK_ITEM_KEY:
                /* The chunk validator wants a pointer to the item payload */
                err = check_leaf_chunk_item(leaf,
                                btrfs_item_ptr(leaf, slot, struct btrfs_chunk),
                                key, slot);
                break;
        case BTRFS_DEV_ITEM_KEY:
                err = check_dev_item(leaf, key, slot);
                break;
        case BTRFS_INODE_ITEM_KEY:
                err = check_inode_item(leaf, key, slot);
                break;
        case BTRFS_ROOT_ITEM_KEY:
                err = check_root_item(leaf, key, slot);
                break;
        case BTRFS_EXTENT_ITEM_KEY:
        case BTRFS_METADATA_ITEM_KEY:
                err = check_extent_item(leaf, key, slot, prev_key);
                break;
        case BTRFS_TREE_BLOCK_REF_KEY:
        case BTRFS_SHARED_DATA_REF_KEY:
        case BTRFS_SHARED_BLOCK_REF_KEY:
                err = check_simple_keyed_refs(leaf, key, slot);
                break;
        case BTRFS_EXTENT_DATA_REF_KEY:
                err = check_extent_data_ref(leaf, key, slot);
                break;
        case BTRFS_RAID_STRIPE_KEY:
                err = check_raid_stripe_extent(leaf, key, slot);
                break;
        default:
                /* No item specific check for this key type */
                err = 0;
                break;
        }

        return err ? BTRFS_TREE_BLOCK_INVALID_ITEM : BTRFS_TREE_BLOCK_CLEAN;
}

/*
 * Validate a leaf and report which category of check failed.
 *
 * The header is checked first (level must be 0, WRITTEN must be set), then
 * the rules for empty leaves are applied, and finally every item is checked
 * for key order, offset/size consistency and item specific content.
 *
 * Return BTRFS_TREE_BLOCK_CLEAN when nothing is wrong, otherwise the
 * btrfs_tree_block_status value of the first check that failed.
 */
enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf)
{
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        /* No valid key type is 0, so every real key sorts after this one */
        struct btrfs_key prev_key = {0, 0, 0};
        struct btrfs_key key;
        u32 nritems = btrfs_header_nritems(leaf);
        int slot;

        if (unlikely(btrfs_header_level(leaf) != 0)) {
                generic_err(leaf, 0,
                        "invalid level for leaf, have %d expect 0",
                        btrfs_header_level(leaf));
                return BTRFS_TREE_BLOCK_INVALID_LEVEL;
        }

        if (unlikely(!btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN))) {
                generic_err(leaf, 0, "invalid flag for leaf, WRITTEN not set");
                return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET;
        }

        if (nritems == 0) {
                u64 owner;

                /*
                 * Extent buffers from a relocation tree have a owner field
                 * that corresponds to the subvolume tree they are based on.
                 * So just from an extent buffer alone we can not find out
                 * what is the id of the corresponding subvolume tree, so we
                 * can not figure out if the extent buffer corresponds to the
                 * root of the relocation tree or not.  So skip the empty
                 * leaf checks for relocation trees.
                 */
                if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC))
                        return BTRFS_TREE_BLOCK_CLEAN;

                owner = btrfs_header_owner(leaf);
                switch (owner) {
                case BTRFS_ROOT_TREE_OBJECTID:
                case BTRFS_CHUNK_TREE_OBJECTID:
                case BTRFS_DEV_TREE_OBJECTID:
                case BTRFS_FS_TREE_OBJECTID:
                case BTRFS_DATA_RELOC_TREE_OBJECTID:
                        /* These trees must never be empty */
                        generic_err(leaf, 0,
                        "invalid root, root %llu must never be empty",
                                    owner);
                        return BTRFS_TREE_BLOCK_INVALID_NRITEMS;
                case 0:
                        /* Unknown tree */
                        generic_err(leaf, 0,
                                "invalid owner, root 0 is not defined");
                        return BTRFS_TREE_BLOCK_INVALID_OWNER;
                default:
                        break;
                }

                /* EXTENT_TREE_V2 can have empty extent trees. */
                if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
                        return BTRFS_TREE_BLOCK_CLEAN;

                if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) {
                        generic_err(leaf, 0,
                        "invalid root, root %llu must never be empty",
                                    owner);
                        return BTRFS_TREE_BLOCK_INVALID_NRITEMS;
                }

                return BTRFS_TREE_BLOCK_CLEAN;
        }

        /*
         * Check the following things to make sure this is a good leaf, and
         * leaf users won't need to bother with similar sanity checks:
         *
         * 1) key ordering
         * 2) item offset and size
         *    No overlap, no hole, all inside the leaf.
         * 3) item content
         *    If possible, do comprehensive sanity check.
         *    NOTE: All checks must only rely on the item data itself.
         */
        for (slot = 0; slot < nritems; slot++) {
                enum btrfs_tree_block_status status;
                unsigned long item_hdr_end;
                unsigned long data_start;
                u64 data_end;
                u32 expected_end;

                btrfs_item_key_to_cpu(leaf, &key, slot);

                /* Keys must be strictly increasing */
                if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) {
                        generic_err(leaf, slot,
        "bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
                                prev_key.objectid, prev_key.type,
                                prev_key.offset, key.objectid, key.type,
                                key.offset);
                        return BTRFS_TREE_BLOCK_BAD_KEY_ORDER;
                }

                data_end = (u64)btrfs_item_offset(leaf, slot) +
                           btrfs_item_size(leaf, slot);
                /*
                 * Item data starts at the end of the leaf and grows towards
                 * the front, so the data of slot 0 ends at the leaf data
                 * size and every other item ends where its predecessor
                 * starts.
                 */
                expected_end = slot ? btrfs_item_offset(leaf, slot - 1) :
                                      BTRFS_LEAF_DATA_SIZE(fs_info);
                if (unlikely(data_end != expected_end)) {
                        generic_err(leaf, slot,
                                "unexpected item end, have %llu expect %u",
                                data_end, expected_end);
                        return BTRFS_TREE_BLOCK_INVALID_OFFSETS;
                }

                /*
                 * Check to make sure that we don't point outside of the leaf,
                 * just in case all the items are consistent to each other, but
                 * all point outside of the leaf.
                 */
                if (unlikely(data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) {
                        generic_err(leaf, slot,
                        "slot end outside of leaf, have %llu expect range [0, %u]",
                                data_end, BTRFS_LEAF_DATA_SIZE(fs_info));
                        return BTRFS_TREE_BLOCK_INVALID_OFFSETS;
                }

                /* The item data must not overlap with its own btrfs_item */
                data_start = btrfs_item_ptr_offset(leaf, slot);
                item_hdr_end = btrfs_item_nr_offset(leaf, slot) +
                               sizeof(struct btrfs_item);
                if (unlikely(data_start < item_hdr_end)) {
                        generic_err(leaf, slot,
                "slot overlaps with its data, item end %lu data start %lu",
                                item_hdr_end, data_start);
                        return BTRFS_TREE_BLOCK_INVALID_OFFSETS;
                }

                /* Check if the item size and content meet other criteria. */
                status = check_leaf_item(leaf, &key, slot, &prev_key);
                if (unlikely(status != BTRFS_TREE_BLOCK_CLEAN))
                        return status;

                /* Remember this key for the ordering check of the next slot */
                prev_key = key;
        }

        return BTRFS_TREE_BLOCK_CLEAN;
}

/*
 * Errno flavoured wrapper around __btrfs_check_leaf().
 *
 * Return 0 when the leaf is clean, -EUCLEAN for any other status.
 */
int btrfs_check_leaf(struct extent_buffer *leaf)
{
        if (unlikely(__btrfs_check_leaf(leaf) != BTRFS_TREE_BLOCK_CLEAN))
                return -EUCLEAN;

        return 0;
}
ALLOW_ERROR_INJECTION(btrfs_check_leaf, ERRNO);

enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node)
{
        struct btrfs_fs_info *fs_info = node->fs_info;
        unsigned long nr = btrfs_header_nritems(node);
        struct btrfs_key key, next_key;
        int slot;
        int level = btrfs_header_level(node);
        u64 bytenr;

        if (unlikely(!btrfs_header_flag(node, BTRFS_HEADER_FLAG_WRITTEN))) {
                generic_err(node, 0, "invalid flag for node, WRITTEN not set");
                return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET;
        }

        if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) {
                generic_err(node, 0,
                        "invalid level for node, have %d expect [1, %d]",
                        level, BTRFS_MAX_LEVEL - 1);
                return BTRFS_TREE_BLOCK_INVALID_LEVEL;
        }
        if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) {
                btrfs_crit(fs_info,
"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
                           btrfs_header_owner(node), node->start,
                           nr == 0 ? "small" : "large", nr,
                           BTRFS_NODEPTRS_PER_BLOCK(fs_info));
                return BTRFS_TREE_BLOCK_INVALID_NRITEMS;
        }

        for (slot = 0; slot < nr - 1; slot++) {
                bytenr = btrfs_node_blockptr(node, slot);
                btrfs_node_key_to_cpu(node, &key, slot);
                btrfs_node_key_to_cpu(node, &next_key, slot + 1);

                if (unlikely(!bytenr)) {
                        generic_err(node, slot,
                                "invalid NULL node pointer");
                        return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR;
                }
                if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) {
                        generic_err(node, slot,
                        "unaligned pointer, have %llu should be aligned to %u",
                                bytenr, fs_info->sectorsize);
                        return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR;
                }

                if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) {
                        generic_err(node, slot,
        "bad key order, current (%llu %u %llu) next (%llu %u %llu)",
                                key.objectid, key.type, key.offset,
                                next_key.objectid, next_key.type,
                                next_key.offset);
                        return BTRFS_TREE_BLOCK_BAD_KEY_ORDER;
                }
        }
        return BTRFS_TREE_BLOCK_CLEAN;
}

/*
 * Errno flavoured wrapper around __btrfs_check_node().
 *
 * Return 0 when the node is clean, -EUCLEAN for any other status.
 */
int btrfs_check_node(struct extent_buffer *node)
{
        if (unlikely(__btrfs_check_node(node) != BTRFS_TREE_BLOCK_CLEAN))
                return -EUCLEAN;

        return 0;
}
ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);

/*
 * Check that the owner recorded in the header of @eb is plausible for a
 * block reached through the root @root_owner.
 *
 * Return 0 when the owner is acceptable or the check has to be skipped,
 * -EUCLEAN on a mismatch.
 */
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
{
        const bool is_subvol = is_fstree(root_owner);
        const u64 eb_owner = btrfs_header_owner(eb);

        /*
         * Skip dummy fs, as selftests don't create unique ebs for each dummy
         * root.
         */
        if (btrfs_is_testing(eb->fs_info))
                return 0;

        switch (root_owner) {
        case 0:
                /*
                 * There are several call sites (backref walking, qgroup, and
                 * data reloc) passing 0 as @root_owner, as they are not
                 * holding the tree root.  In that case, we can not do a
                 * reliable ownership check, so just exit.
                 */
                return 0;
        case BTRFS_TREE_LOG_OBJECTID:
        case BTRFS_TREE_RELOC_OBJECTID:
                /*
                 * These trees use key.offset as their owner, our callers
                 * don't have the extra capacity to pass key.offset here.  So
                 * we just skip them.
                 */
                return 0;
        default:
                break;
        }

        if (is_subvol) {
                /*
                 * For subvolume trees, owners can mismatch, but they should
                 * all belong to subvolume trees.
                 */
                if (unlikely(!is_fstree(eb_owner))) {
                        btrfs_crit(eb->fs_info,
"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]",
                                btrfs_header_level(eb) == 0 ? "leaf" : "node",
                                root_owner, btrfs_header_bytenr(eb), eb_owner,
                                BTRFS_FIRST_FREE_OBJECTID,
                                BTRFS_LAST_FREE_OBJECTID);
                        return -EUCLEAN;
                }
                return 0;
        }

        /* For non-subvolume trees, the eb owner should match root owner */
        if (unlikely(root_owner != eb_owner)) {
                btrfs_crit(eb->fs_info,
"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu",
                        btrfs_header_level(eb) == 0 ? "leaf" : "node",
                        root_owner, btrfs_header_bytenr(eb), eb_owner,
                        root_owner);
                return -EUCLEAN;
        }

        return 0;
}

/*
 * Verify that @eb has the expected level and, when @first_key is given, that
 * its first key matches @first_key.
 *
 * @parent_transid is only used in the first key mismatch message.
 *
 * Return 0 when everything matches or the key check is skipped, -EIO on a
 * level mismatch, -EUCLEAN when a first key is expected but the block has no
 * items, and otherwise the non-zero result of btrfs_comp_cpu_keys().
 */
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
                           struct btrfs_key *first_key, u64 parent_transid)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct btrfs_key found_key;
        int found_level = btrfs_header_level(eb);
        int ret;

        if (found_level != level) {
                WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
                     KERN_ERR "BTRFS: tree level check failed\n");
                btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
                          eb->start, level, found_level);
                return -EIO;
        }

        /* Nothing more to verify without an expected first key */
        if (!first_key)
                return 0;

        /*
         * For live tree block (new tree blocks in current transaction),
         * we need proper lock context to avoid race, which is impossible here.
         * So we only checks tree blocks which is read from disk, whose
         * generation <= fs_info->last_trans_committed.
         */
        if (btrfs_header_generation(eb) > btrfs_get_last_trans_committed(fs_info))
                return 0;

        /* We have @first_key, so this @eb must have at least one item */
        if (btrfs_header_nritems(eb) == 0) {
                btrfs_err(fs_info,
                "invalid tree nritems, bytenr=%llu nritems=0 expect >0",
                          eb->start);
                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
                return -EUCLEAN;
        }

        /* Leaves and nodes need different accessors for their keys */
        if (found_level == 0)
                btrfs_item_key_to_cpu(eb, &found_key, 0);
        else
                btrfs_node_key_to_cpu(eb, &found_key, 0);

        ret = btrfs_comp_cpu_keys(first_key, &found_key);
        if (!ret)
                return 0;

        WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
             KERN_ERR "BTRFS: tree first key check failed\n");
        btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
                  eb->start, parent_transid, first_key->objectid,
                  first_key->type, first_key->offset,
                  found_key.objectid, found_key.type,
                  found_key.offset);
        return ret;
}




























































































































































    3 


































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM scsi

#if !defined(_TRACE_SCSI_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SCSI_H

#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_host.h>
#include <linux/tracepoint.h>
#include <linux/trace_seq.h>

/*
 * scsi_opcode_name() builds one { value, "NAME" } table entry from an opcode
 * constant; the name is the stringified macro argument.
 *
 * show_opcode_name() hands @val together with the table of all opcodes
 * listed below to __print_symbolic().
 */
#define scsi_opcode_name(opcode)        { opcode, #opcode }
#define show_opcode_name(val)                                        \
        __print_symbolic(val,                                        \
                scsi_opcode_name(TEST_UNIT_READY),                \
                scsi_opcode_name(REZERO_UNIT),                        \
                scsi_opcode_name(REQUEST_SENSE),                \
                scsi_opcode_name(FORMAT_UNIT),                        \
                scsi_opcode_name(READ_BLOCK_LIMITS),                \
                scsi_opcode_name(REASSIGN_BLOCKS),                \
                scsi_opcode_name(INITIALIZE_ELEMENT_STATUS),        \
                scsi_opcode_name(READ_6),                        \
                scsi_opcode_name(WRITE_6),                        \
                scsi_opcode_name(SEEK_6),                        \
                scsi_opcode_name(READ_REVERSE),                        \
                scsi_opcode_name(WRITE_FILEMARKS),                \
                scsi_opcode_name(SPACE),                        \
                scsi_opcode_name(INQUIRY),                        \
                scsi_opcode_name(RECOVER_BUFFERED_DATA),        \
                scsi_opcode_name(MODE_SELECT),                        \
                scsi_opcode_name(RESERVE),                        \
                scsi_opcode_name(RELEASE),                        \
                scsi_opcode_name(COPY),                                \
                scsi_opcode_name(ERASE),                        \
                scsi_opcode_name(MODE_SENSE),                        \
                scsi_opcode_name(START_STOP),                        \
                scsi_opcode_name(RECEIVE_DIAGNOSTIC),                \
                scsi_opcode_name(SEND_DIAGNOSTIC),                \
                scsi_opcode_name(ALLOW_MEDIUM_REMOVAL),                \
                scsi_opcode_name(SET_WINDOW),                        \
                scsi_opcode_name(READ_CAPACITY),                \
                scsi_opcode_name(READ_10),                        \
                scsi_opcode_name(WRITE_10),                        \
                scsi_opcode_name(SEEK_10),                        \
                scsi_opcode_name(POSITION_TO_ELEMENT),                \
                scsi_opcode_name(WRITE_VERIFY),                        \
                scsi_opcode_name(VERIFY),                        \
                scsi_opcode_name(SEARCH_HIGH),                        \
                scsi_opcode_name(SEARCH_EQUAL),                        \
                scsi_opcode_name(SEARCH_LOW),                        \
                scsi_opcode_name(SET_LIMITS),                        \
                scsi_opcode_name(PRE_FETCH),                        \
                scsi_opcode_name(READ_POSITION),                \
                scsi_opcode_name(SYNCHRONIZE_CACHE),                \
                scsi_opcode_name(LOCK_UNLOCK_CACHE),                \
                scsi_opcode_name(READ_DEFECT_DATA),                \
                scsi_opcode_name(MEDIUM_SCAN),                        \
                scsi_opcode_name(COMPARE),                        \
                scsi_opcode_name(COPY_VERIFY),                        \
                scsi_opcode_name(WRITE_BUFFER),                        \
                scsi_opcode_name(READ_BUFFER),                        \
                scsi_opcode_name(UPDATE_BLOCK),                        \
                scsi_opcode_name(READ_LONG),                        \
                scsi_opcode_name(WRITE_LONG),                        \
                scsi_opcode_name(CHANGE_DEFINITION),                \
                scsi_opcode_name(WRITE_SAME),                        \
                scsi_opcode_name(UNMAP),                        \
                scsi_opcode_name(READ_TOC),                        \
                scsi_opcode_name(LOG_SELECT),                        \
                scsi_opcode_name(LOG_SENSE),                        \
                scsi_opcode_name(XDWRITEREAD_10),                \
                scsi_opcode_name(MODE_SELECT_10),                \
                scsi_opcode_name(RESERVE_10),                        \
                scsi_opcode_name(RELEASE_10),                        \
                scsi_opcode_name(MODE_SENSE_10),                \
                scsi_opcode_name(PERSISTENT_RESERVE_IN),        \
                scsi_opcode_name(PERSISTENT_RESERVE_OUT),        \
                scsi_opcode_name(VARIABLE_LENGTH_CMD),                \
                scsi_opcode_name(REPORT_LUNS),                        \
                scsi_opcode_name(MAINTENANCE_IN),                \
                scsi_opcode_name(MAINTENANCE_OUT),                \
                scsi_opcode_name(MOVE_MEDIUM),                        \
                scsi_opcode_name(EXCHANGE_MEDIUM),                \
                scsi_opcode_name(READ_12),                        \
                scsi_opcode_name(WRITE_12),                        \
                scsi_opcode_name(WRITE_VERIFY_12),                \
                scsi_opcode_name(SEARCH_HIGH_12),                \
                scsi_opcode_name(SEARCH_EQUAL_12),                \
                scsi_opcode_name(SEARCH_LOW_12),                \
                scsi_opcode_name(READ_ELEMENT_STATUS),                \
                scsi_opcode_name(SEND_VOLUME_TAG),                \
                scsi_opcode_name(WRITE_LONG_2),                        \
                scsi_opcode_name(READ_16),                        \
                scsi_opcode_name(WRITE_16),                        \
                scsi_opcode_name(VERIFY_16),                        \
                scsi_opcode_name(WRITE_SAME_16),                \
                scsi_opcode_name(ZBC_OUT),                        \
                scsi_opcode_name(ZBC_IN),                        \
                scsi_opcode_name(SERVICE_ACTION_IN_16),                \
                scsi_opcode_name(READ_32),                        \
                scsi_opcode_name(WRITE_32),                        \
                scsi_opcode_name(WRITE_SAME_32),                \
                scsi_opcode_name(ATA_16),                        \
                scsi_opcode_name(ATA_12))

/*
 * scsi_hostbyte_name() builds one { value, "NAME" } table entry from a DID_*
 * constant; show_hostbyte_name() hands @val together with the table of the
 * DID_* codes listed below to __print_symbolic().
 */
#define scsi_hostbyte_name(result)        { result, #result }
#define show_hostbyte_name(val)                                        \
        __print_symbolic(val,                                        \
                scsi_hostbyte_name(DID_OK),                        \
                scsi_hostbyte_name(DID_NO_CONNECT),                \
                scsi_hostbyte_name(DID_BUS_BUSY),                \
                scsi_hostbyte_name(DID_TIME_OUT),                \
                scsi_hostbyte_name(DID_BAD_TARGET),                \
                scsi_hostbyte_name(DID_ABORT),                        \
                scsi_hostbyte_name(DID_PARITY),                        \
                scsi_hostbyte_name(DID_ERROR),                        \
                scsi_hostbyte_name(DID_RESET),                        \
                scsi_hostbyte_name(DID_BAD_INTR),                \
                scsi_hostbyte_name(DID_PASSTHROUGH),                \
                scsi_hostbyte_name(DID_SOFT_ERROR),                \
                scsi_hostbyte_name(DID_IMM_RETRY),                \
                scsi_hostbyte_name(DID_REQUEUE),                \
                scsi_hostbyte_name(DID_TRANSPORT_DISRUPTED),        \
                scsi_hostbyte_name(DID_TRANSPORT_FAILFAST))

/*
 * scsi_statusbyte_name() builds one { value, "NAME" } table entry from a
 * SAM_STAT_* constant; show_statusbyte_name() hands @val together with the
 * table of the SAM_STAT_* codes listed below to __print_symbolic().
 */
#define scsi_statusbyte_name(result)        { result, #result }
#define show_statusbyte_name(val)                                \
        __print_symbolic(val,                                        \
                scsi_statusbyte_name(SAM_STAT_GOOD),                \
                scsi_statusbyte_name(SAM_STAT_CHECK_CONDITION),        \
                scsi_statusbyte_name(SAM_STAT_CONDITION_MET),        \
                scsi_statusbyte_name(SAM_STAT_BUSY),                \
                scsi_statusbyte_name(SAM_STAT_INTERMEDIATE),        \
                scsi_statusbyte_name(SAM_STAT_INTERMEDIATE_CONDITION_MET), \
                scsi_statusbyte_name(SAM_STAT_RESERVATION_CONFLICT),        \
                scsi_statusbyte_name(SAM_STAT_COMMAND_TERMINATED),        \
                scsi_statusbyte_name(SAM_STAT_TASK_SET_FULL),        \
                scsi_statusbyte_name(SAM_STAT_ACA_ACTIVE),        \
                scsi_statusbyte_name(SAM_STAT_TASK_ABORTED))

/*
 * scsi_prot_op_name() builds one { value, "NAME" } table entry from a
 * SCSI_PROT_* constant; show_prot_op_name() hands @val together with the
 * table of the SCSI_PROT_* operations listed below to __print_symbolic().
 */
#define scsi_prot_op_name(result)        { result, #result }
#define show_prot_op_name(val)                                        \
        __print_symbolic(val,                                        \
                scsi_prot_op_name(SCSI_PROT_NORMAL),                \
                scsi_prot_op_name(SCSI_PROT_READ_INSERT),        \
                scsi_prot_op_name(SCSI_PROT_WRITE_STRIP),        \
                scsi_prot_op_name(SCSI_PROT_READ_STRIP),        \
                scsi_prot_op_name(SCSI_PROT_WRITE_INSERT),        \
                scsi_prot_op_name(SCSI_PROT_READ_PASS),                \
                scsi_prot_op_name(SCSI_PROT_WRITE_PASS))

/*
 * scsi_trace_parse_cdb() is only declared here; it takes a trace_seq, a CDB
 * buffer and its length and returns a string.
 *
 * __parse_cdb() forwards @cdb and @len to it and passes a variable named
 * "p" as the trace_seq, so "p" must be in scope wherever the macro is
 * expanded.
 */
const char *scsi_trace_parse_cdb(struct trace_seq*, unsigned char*, int);
#define __parse_cdb(cdb, len) scsi_trace_parse_cdb(p, cdb, len)

/*
 * scsi_dispatch_cmd_start - trace event taking a single struct scsi_cmnd.
 *
 * Records the command's addressing (host_no/channel/id/lun), its first CDB
 * byte as the opcode, the CDB length, the tags of the request backing the
 * command, the data and protection scatterlist counts, the protection
 * operation, and a full copy of the CDB bytes.
 * NOTE(review): presumably emitted when a command is handed to the low-level
 * driver; the call sites are not in this file.
 */
TRACE_EVENT(scsi_dispatch_cmd_start,

        TP_PROTO(struct scsi_cmnd *cmd),

        TP_ARGS(cmd),

        TP_STRUCT__entry(
                __field( unsigned int,        host_no        )
                __field( unsigned int,        channel        )
                __field( unsigned int,        id        )
                __field( unsigned int,        lun        )
                __field( unsigned int,        opcode        )
                __field( unsigned int,        cmd_len )
                __field( int,        driver_tag)
                __field( int,        scheduler_tag)
                __field( unsigned int,        data_sglen )
                __field( unsigned int,        prot_sglen )
                __field( unsigned char,        prot_op )
                /* Variable-length copy of the CDB, sized by cmd->cmd_len. */
                __dynamic_array(unsigned char,        cmnd, cmd->cmd_len)
        ),

        TP_fast_assign(
                __entry->host_no        = cmd->device->host->host_no;
                __entry->channel        = cmd->device->channel;
                __entry->id                = cmd->device->id;
                __entry->lun                = cmd->device->lun;
                /* The opcode is the first byte of the CDB. */
                __entry->opcode                = cmd->cmnd[0];
                __entry->cmd_len        = cmd->cmd_len;
                /* Both tags are read from the request behind the command. */
                __entry->driver_tag        = scsi_cmd_to_rq(cmd)->tag;
                __entry->scheduler_tag        = scsi_cmd_to_rq(cmd)->internal_tag;
                __entry->data_sglen        = scsi_sg_count(cmd);
                __entry->prot_sglen        = scsi_prot_sg_count(cmd);
                __entry->prot_op        = scsi_get_prot_op(cmd);
                memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len);
        ),

        /* cmnd=(<opcode name> <parsed CDB> raw=<hex dump of the CDB>) */
        TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \
                  " prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s)",
                  __entry->host_no, __entry->channel, __entry->id,
                  __entry->lun, __entry->data_sglen, __entry->prot_sglen,
                  show_prot_op_name(__entry->prot_op), __entry->driver_tag,
                  __entry->scheduler_tag, show_opcode_name(__entry->opcode),
                  __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len),
                  __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len))
);

/*
 * scsi_dispatch_cmd_error - trace event taking a struct scsi_cmnd and an
 * integer rtn.
 *
 * Records the command's addressing (host_no/channel/id/lun), its first CDB
 * byte as the opcode, the CDB length, the tags of the request backing the
 * command, the data and protection scatterlist counts, the protection
 * operation, a full copy of the CDB bytes, and the caller-supplied rtn
 * value, which is printed last as "rtn=%d".
 * NOTE(review): rtn is presumably the error code returned by the failed
 * dispatch; the call sites are not in this file.
 */
TRACE_EVENT(scsi_dispatch_cmd_error,

        TP_PROTO(struct scsi_cmnd *cmd, int rtn),

        TP_ARGS(cmd, rtn),

        TP_STRUCT__entry(
                __field( unsigned int,        host_no        )
                __field( unsigned int,        channel        )
                __field( unsigned int,        id        )
                __field( unsigned int,        lun        )
                __field( int,                rtn        )
                __field( unsigned int,        opcode        )
                __field( unsigned int,        cmd_len )
                __field( int,        driver_tag)
                __field( int,        scheduler_tag)
                __field( unsigned int,        data_sglen )
                __field( unsigned int,        prot_sglen )
                __field( unsigned char,        prot_op )
                /* Variable-length copy of the CDB, sized by cmd->cmd_len. */
                __dynamic_array(unsigned char,        cmnd, cmd->cmd_len)
        ),

        TP_fast_assign(
                __entry->host_no        = cmd->device->host->host_no;
                __entry->channel        = cmd->device->channel;
                __entry->id                = cmd->device->id;
                __entry->lun                = cmd->device->lun;
                __entry->rtn                = rtn;
                /* The opcode is the first byte of the CDB. */
                __entry->opcode                = cmd->cmnd[0];
                __entry->cmd_len        = cmd->cmd_len;
                /* Both tags are read from the request behind the command. */
                __entry->driver_tag        = scsi_cmd_to_rq(cmd)->tag;
                __entry->scheduler_tag        = scsi_cmd_to_rq(cmd)->internal_tag;
                __entry->data_sglen        = scsi_sg_count(cmd);
                __entry->prot_sglen        = scsi_prot_sg_count(cmd);
                __entry->prot_op        = scsi_get_prot_op(cmd);
                memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len);
        ),

        /* cmnd=(<opcode name> <parsed CDB> raw=<hex dump of the CDB>) */
        TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \
                  " prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s)" \
                  " rtn=%d",
                  __entry->host_no, __entry->channel, __entry->id,
                  __entry->lun, __entry->data_sglen, __entry->prot_sglen,
                  show_prot_op_name(__entry->prot_op), __entry->driver_tag,
                  __entry->scheduler_tag, show_opcode_name(__entry->opcode),
                  __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len),
                  __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len),
                  __entry->rtn)
);

/*
 * scsi_cmd_done_timeout_template - event class shared by the
 * scsi_dispatch_cmd_done and scsi_dispatch_cmd_timeout events.
 *
 * In addition to the command's addressing, opcode, CDB copy, request tags,
 * scatterlist counts and protection operation, it records cmd->result and
 * the sense key / ASC / ASCQ triple.  The triple is taken from the
 * normalized sense header when sense data is available and is zero
 * otherwise.
 */
DECLARE_EVENT_CLASS(scsi_cmd_done_timeout_template,

        TP_PROTO(struct scsi_cmnd *cmd),

        TP_ARGS(cmd),

        TP_STRUCT__entry(
                __field( unsigned int,        host_no        )
                __field( unsigned int,        channel        )
                __field( unsigned int,        id        )
                __field( unsigned int,        lun        )
                __field( int,                result        )
                __field( unsigned int,        opcode        )
                __field( unsigned int,        cmd_len )
                __field( int,        driver_tag)
                __field( int,        scheduler_tag)
                __field( unsigned int,        data_sglen )
                __field( unsigned int,        prot_sglen )
                __field( unsigned char,        prot_op )
                /* Variable-length copy of the CDB, sized by cmd->cmd_len. */
                __dynamic_array(unsigned char,        cmnd, cmd->cmd_len)
                __field( u8, sense_key )
                __field( u8, asc )
                __field( u8, ascq )
        ),

        TP_fast_assign(
                struct scsi_sense_hdr sshdr;

                __entry->host_no        = cmd->device->host->host_no;
                __entry->channel        = cmd->device->channel;
                __entry->id                = cmd->device->id;
                __entry->lun                = cmd->device->lun;
                __entry->result                = cmd->result;
                /* The opcode is the first byte of the CDB. */
                __entry->opcode                = cmd->cmnd[0];
                __entry->cmd_len        = cmd->cmd_len;
                /* Both tags are read from the request behind the command. */
                __entry->driver_tag        = scsi_cmd_to_rq(cmd)->tag;
                __entry->scheduler_tag        = scsi_cmd_to_rq(cmd)->internal_tag;
                __entry->data_sglen        = scsi_sg_count(cmd);
                __entry->prot_sglen        = scsi_prot_sg_count(cmd);
                __entry->prot_op        = scsi_get_prot_op(cmd);
                memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len);
                /*
                 * Only record sense information when a sense buffer exists,
                 * SCSI_SENSE_VALID() accepts it and normalization into sshdr
                 * succeeds; otherwise store zeros so the fields are always
                 * initialized.
                 */
                if (cmd->sense_buffer && SCSI_SENSE_VALID(cmd) &&
                    scsi_command_normalize_sense(cmd, &sshdr)) {
                        __entry->sense_key = sshdr.sense_key;
                        __entry->asc = sshdr.asc;
                        __entry->ascq = sshdr.ascq;
                } else {
                        __entry->sense_key = 0;
                        __entry->asc = 0;
                        __entry->ascq = 0;
                }
        ),

        /*
         * In the result=(...) group, driver= and message= are always the
         * fixed strings "DRIVER_OK" and "COMMAND_COMPLETE"; host= decodes
         * bits 16-23 of result and status= decodes bits 0-7.
         */
        TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u " \
                  "prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s) " \
                  "result=(driver=%s host=%s message=%s status=%s) "
                  "sense=(key=%#x asc=%#x ascq=%#x)",
                  __entry->host_no, __entry->channel, __entry->id,
                  __entry->lun, __entry->data_sglen, __entry->prot_sglen,
                  show_prot_op_name(__entry->prot_op), __entry->driver_tag,
                  __entry->scheduler_tag, show_opcode_name(__entry->opcode),
                  __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len),
                  __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len),
                  "DRIVER_OK",
                  show_hostbyte_name(((__entry->result) >> 16) & 0xff),
                  "COMMAND_COMPLETE",
                  show_statusbyte_name(__entry->result & 0xff),
                  __entry->sense_key, __entry->asc, __entry->ascq)
);

/*
 * scsi_dispatch_cmd_done - event defined from the
 * scsi_cmd_done_timeout_template class; takes the struct scsi_cmnd to record.
 */
DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_done,
             TP_PROTO(struct scsi_cmnd *cmd),
             TP_ARGS(cmd));

/*
 * scsi_dispatch_cmd_timeout - event defined from the
 * scsi_cmd_done_timeout_template class; takes the struct scsi_cmnd to record.
 */
DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_timeout,
             TP_PROTO(struct scsi_cmnd *cmd),
             TP_ARGS(cmd));

/*
 * scsi_eh_wakeup - trace event taking a struct Scsi_Host; the only value
 * recorded and printed is the host number.
 * NOTE(review): presumably emitted when the host's error handler is woken;
 * the call sites are not in this file.
 */
TRACE_EVENT(scsi_eh_wakeup,

        TP_PROTO(struct Scsi_Host *shost),

        TP_ARGS(shost),

        TP_STRUCT__entry(
                __field( unsigned int,        host_no        )
        ),

        TP_fast_assign(
                __entry->host_no        = shost->host_no;
        ),

        TP_printk("host_no=%u", __entry->host_no)
);

#endif /*  _TRACE_SCSI_H */

/* This part must be outside protection */
#include <trace/define_trace.h>























    4 















































    2 





    1 




    1 




    2 
    2 


















    5 
    5 
    4 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/interval_tree.c - interval tree for mapping->i_mmap
 *
 * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
 */

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rmap.h>
#include <linux/interval_tree_generic.h>

/* Start key of a VMA in the interval tree: its vm_pgoff. */
static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
{
        return v->vm_pgoff;
}

/*
 * Last key of a VMA in the interval tree: vm_pgoff plus the VMA's page
 * count, minus one, so the interval [start, last] is inclusive at both ends.
 */
static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
{
        unsigned long first = v->vm_pgoff;
        unsigned long npages = vma_pages(v);

        return first + npages - 1;
}

/*
 * Instantiate the generic interval tree for struct vm_area_struct, using
 * shared.rb / shared.rb_subtree_last as the embedded tree fields and
 * vma_start_pgoff() / vma_last_pgoff() as the interval accessors.  The
 * generated symbols carry the vma_interval_tree prefix (for example the
 * vma_interval_tree_augment callbacks used further down) and no storage
 * class qualifier is passed.
 * NOTE(review): argument meanings follow <linux/interval_tree_generic.h>,
 * which is not shown here.
 */
INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
                     unsigned long, shared.rb_subtree_last,
                     vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree)

/*
 * Insert node immediately after prev in the interval tree.
 *
 * node and prev must have the same start pgoff (enforced with
 * VM_BUG_ON_VMA).  The insertion slot is prev's right child if that is
 * empty, otherwise the left child of the leftmost node in prev's right
 * subtree.  Every node stepped through on the way down has its
 * rb_subtree_last raised to node's last pgoff when it is smaller, and the
 * augmented rbtree insert then takes care of rebalancing.
 */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
                                    struct vm_area_struct *prev,
                                    struct rb_root_cached *root)
{
        const unsigned long node_last = vma_last_pgoff(node);
        struct vm_area_struct *parent = prev;
        struct rb_node **slot = &prev->shared.rb.rb_right;

        VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);

        /*
         * Step into prev's right child (if any) and then keep going left.
         * Each visited node becomes the candidate parent and gets its
         * subtree-last value bumped before we look at its left link.
         */
        while (*slot) {
                parent = rb_entry(*slot, struct vm_area_struct, shared.rb);
                if (parent->shared.rb_subtree_last < node_last)
                        parent->shared.rb_subtree_last = node_last;
                slot = &parent->shared.rb.rb_left;
        }

        node->shared.rb_subtree_last = node_last;
        rb_link_node(&node->shared.rb, &parent->shared.rb, slot);
        rb_insert_augmented(&node->shared.rb, &root->rb_root,
                            &vma_interval_tree_augment);
}

/* Start key of an anon_vma_chain: the start pgoff of the VMA it points to. */
static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
{
        return vma_start_pgoff(avc->vma);
}

/* Last key of an anon_vma_chain: the last pgoff of the VMA it points to. */
static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
{
        return vma_last_pgoff(avc->vma);
}

/*
 * Instantiate the generic interval tree for struct anon_vma_chain, using
 * its rb / rb_subtree_last fields and avc_start_pgoff() / avc_last_pgoff()
 * as the interval accessors.  The generated functions are static inline and
 * carry the __anon_vma_interval_tree prefix; the public anon_vma_interval_tree_*
 * functions below wrap them.
 * NOTE(review): argument meanings follow <linux/interval_tree_generic.h>,
 * which is not shown here.
 */
INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
                     avc_start_pgoff, avc_last_pgoff,
                     static inline, __anon_vma_interval_tree)

/*
 * Insert node into the anon_vma interval tree rooted at root.
 *
 * With CONFIG_DEBUG_VM_RB the node's current start/last pgoff are cached in
 * the node first, so that anon_vma_interval_tree_verify() can later detect
 * whether the keys changed while the node was in the tree.
 */
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                   struct rb_root_cached *root)
{
#ifdef CONFIG_DEBUG_VM_RB
        node->cached_vma_start = avc_start_pgoff(node);
        node->cached_vma_last = avc_last_pgoff(node);
#endif
        __anon_vma_interval_tree_insert(node, root);
}

/*
 * Remove node from the anon_vma interval tree rooted at root; forwards to
 * the generated __anon_vma_interval_tree_remove().
 */
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
                                   struct rb_root_cached *root)
{
        __anon_vma_interval_tree_remove(node, root);
}

/*
 * Start an iteration over the tree rooted at root for the pgoff range given
 * by first and last; returns whatever the generated
 * __anon_vma_interval_tree_iter_first() returns.
 * NOTE(review): presumably the first overlapping node or NULL - confirm
 * against <linux/interval_tree_generic.h>.
 */
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
                                  unsigned long first, unsigned long last)
{
        return __anon_vma_interval_tree_iter_first(root, first, last);
}

/*
 * Continue an iteration from node for the pgoff range given by first and
 * last; returns whatever the generated __anon_vma_interval_tree_iter_next()
 * returns.
 * NOTE(review): presumably the next overlapping node or NULL - confirm
 * against <linux/interval_tree_generic.h>.
 */
struct anon_vma_chain *
anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
                                 unsigned long first, unsigned long last)
{
        return __anon_vma_interval_tree_iter_next(node, first, last);
}

#ifdef CONFIG_DEBUG_VM_RB
/*
 * Debug check (CONFIG_DEBUG_VM_RB only): warn once if node's current
 * start/last pgoff no longer match the values cached by
 * anon_vma_interval_tree_insert().
 */
void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
{
        WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
        WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
}
#endif




























   15 
































    6 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM pagemap

#if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGEMAP_H

#include <linux/tracepoint.h>
#include <linux/mm.h>

/*
 * Single-bit flags describing a folio.  trace_pagemap_flags() combines them
 * into one mask and the mm_lru_insertion event decodes that mask into one
 * character per flag when printing.
 */
#define        PAGEMAP_MAPPED                0x0001u
#define PAGEMAP_ANONYMOUS        0x0002u
#define PAGEMAP_FILE                0x0004u
#define PAGEMAP_SWAPCACHE        0x0008u
#define PAGEMAP_SWAPBACKED        0x0010u
#define PAGEMAP_MAPPEDDISK        0x0020u
#define PAGEMAP_BUFFERS                0x0040u

/*
 * Build the PAGEMAP_* mask for a folio.  Exactly one of PAGEMAP_ANONYMOUS
 * and PAGEMAP_FILE is always set, chosen by folio_test_anon(); each of the
 * remaining bits is set only when the corresponding folio test is true.
 * PAGEMAP_BUFFERS is driven by folio_test_private().
 */
#define trace_pagemap_flags(folio) ( \
        (folio_test_anon(folio)                ? PAGEMAP_ANONYMOUS  : PAGEMAP_FILE) | \
        (folio_mapped(folio)                ? PAGEMAP_MAPPED     : 0) | \
        (folio_test_swapcache(folio)        ? PAGEMAP_SWAPCACHE  : 0) | \
        (folio_test_swapbacked(folio)        ? PAGEMAP_SWAPBACKED : 0) | \
        (folio_test_mappedtodisk(folio)        ? PAGEMAP_MAPPEDDISK : 0) | \
        (folio_test_private(folio)        ? PAGEMAP_BUFFERS    : 0) \
        )

/*
 * mm_lru_insertion - trace event taking a folio.
 *
 * Records the folio pointer, its pfn, the LRU list reported by
 * folio_lru_list() and the PAGEMAP_* mask from trace_pagemap_flags().
 * NOTE(review): presumably emitted when a folio is added to an LRU list;
 * the call sites are not in this file.
 */
TRACE_EVENT(mm_lru_insertion,

        TP_PROTO(struct folio *folio),

        TP_ARGS(folio),

        TP_STRUCT__entry(
                __field(struct folio *,        folio        )
                __field(unsigned long,        pfn        )
                __field(enum lru_list,        lru        )
                __field(unsigned long,        flags        )
        ),

        TP_fast_assign(
                __entry->folio        = folio;
                __entry->pfn        = folio_pfn(folio);
                __entry->lru        = folio_lru_list(folio);
                __entry->flags        = trace_pagemap_flags(folio);
        ),

        /*
         * Flag format is based on page-types.c formatting for pagemap.
         * Each flag prints as one character: the letter when the bit is set
         * and a space otherwise, except the anonymous/file position, which
         * prints "a" or "f".
         */
        TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s",
                        __entry->folio,
                        __entry->pfn,
                        __entry->lru,
                        __entry->flags & PAGEMAP_MAPPED                ? "M" : " ",
                        __entry->flags & PAGEMAP_ANONYMOUS        ? "a" : "f",
                        __entry->flags & PAGEMAP_SWAPCACHE        ? "s" : " ",
                        __entry->flags & PAGEMAP_SWAPBACKED        ? "b" : " ",
                        __entry->flags & PAGEMAP_MAPPEDDISK        ? "d" : " ",
                        __entry->flags & PAGEMAP_BUFFERS        ? "B" : " ")
);

/*
 * mm_lru_activate - trace event taking a folio; records and prints the
 * folio pointer and its pfn.
 * NOTE(review): presumably emitted when a folio is activated on the LRU;
 * the call sites are not in this file.
 */
TRACE_EVENT(mm_lru_activate,

        TP_PROTO(struct folio *folio),

        TP_ARGS(folio),

        TP_STRUCT__entry(
                __field(struct folio *,        folio        )
                __field(unsigned long,        pfn        )
        ),

        TP_fast_assign(
                __entry->folio        = folio;
                __entry->pfn        = folio_pfn(folio);
        ),

        TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn)
);

#endif /* _TRACE_PAGEMAP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>


































































































































   31 




   32 


























   30 





























   30 
    1 

    1 







   30 













    7 





    4 








    4 



















    3 






   33 













   34 































































































































   32 





   31 





    1 
    1 












    2 








    2 
    1 





















   31 

    4 


    8 
















   32 



   14 



















   33 


   32 















   14 



   33 



   32 
    2 













   11 

   13 










   22 











































   32 







   32 

   28 
    1 









   32 

   33 





























    2 








   32 

    1 





   30 


   30 



   32 
   32 



   33 

   28 
    5 



    1 






























   29 






   32 













   27 




























   31 








   30 


   32 









































































   23 
















   32 



   32 












   33 




















   29 


    7 













    5 














    4 

    3 
    3 

    2 






    4 


    4 





























































    1 
    1 















































































    2 

















































































    2 







    2 














































    3 
    3 


























































    1 





    1 








    1 


    1 

    1 

    1 




    1 






    1 










































    8 




    7 


    8 








    1 

    7 
    1 

    8 







    1 












    8 

    8 













































































































   25 















    8 

    8 

    4 














   11 
    5 




















    9 
   10 















   10 








   23 

































    5 

   21 

    4 












    8 
























    7 
    6 
    2 









    6 











   29 
















    2 
























    2 


    2 












    2 









    2 

















    2 











    2 






    2 
















   24 


   27 

    4 


   22 
    4 









   23 
    8 












    2 
















































    2 

    1 








    1 




    2 












   30 
   31 



    2 

    2 





    2 
    2 





    2 

















   22 







    3 

   27 




   21 

    7 


   20 















































































































   25 



   30 








   26 

















































   30 


































































   31 






    4 

    3 

    3 











   32 
   32 


   30 


   32 












   29 




    5 









   32 







    1 




   32 









   30 



    1 





   30 











   33 










   28 

    3 




   33 
   32 





    2 







    2 









    3 











   22 


   22 
   20 

    1 







    1 


    5 









    7 







    5 




    7 



   27 













   24 















   23 



   22 





    3 
   24 

   24 






   21 
    3 












   22 




   25 

   18 

   24 


    3 
   20 

   25 









    1 


   10 












   10 





   10 
   10 
    1 

    1 

    1 


    9 

   10 





















































    2 






















































    1 


    1 



    1 











    1 















































































































    1 








    1 







































































































   17 













    1 





























    7 














    7 







    6 






    3 

    4 



    7 















    8 
    8 







    7 









































    4 





















































    4 


































    5 





























    2 









    2 





    2 


























    2 

    2 




    8 







    2 



    1 

    4 




    2 









    2 











    8 







    7 














































    3 

























































































    1 

    3 


    2 











    1 













    3 





    3 

    3 
















    3 
    3 












    3 







    3 


    3 






















    2 





    3 
    1 




    3 






    3 



    3 

    3 




    2 







    3 


    3 
    1 
    3 

    3 




    3 




    1 
    3 





    4 
    1 

    3 
    2 















    3 

    4 


    3 
    5 

    5 
    3 



    3 




    7 






    3 

    4 






    1 

    7 

    7 

    8 



    8 

































































































































    7 













    3 
    8 


    8 


    4 
    6 
















    6 




    5 

    6 

    6 

    5 





    2 





    2 






    2 

    2 

    1 

    2 






    6 











    1 




































    5 

































































































    2 
















    2 






    2 









    2 




    2 




    2 












    2 








    1 





    1 





















    2 










    2 




    2 




    2 





    2 












    2 

    2 



    2 








    1 




    1 




















    2 








    2 



    2 














    2 





    2 





    2 
































    1 






    2 







    2 








    1 































    1 












    1 




    1 





    1 










    1 














    1 


































    1 






    1 

    1 







    1 


















    1 





    1 



    1 





















    2 















    2 





    2 











    1 





    2 


    2 









    1 





    1 

































    2 





































    2 









    2 






















    2 











































    2 



    2 






    2 













    1 






    1 



















































    4 






















    3 




    1 





    1 









    3 




















    3 

























    3 



    3 



    3 


    3 









    3 


    3 









    3 









    3 




    3 

    3 





    3 












    4 


















































    4 




















    4 

    4 











    1 



    3 

















    4 













    4 

    4 











    4 










    1 






    1 






    2 






    2 







    2 
















    1 





    1 

    1 










    1 





    1 



















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/wordpart.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <linux/hash.h>
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>

#include "internal.h"
#include "mount.h"

/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOs, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *        inside the path - always follow.
 *        in the last component in creation/removal/renaming - never follow.
 *        if LOOKUP_FOLLOW passed - follow.
 *        if the pathname has trailing slashes - follow.
 *        otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */

/*
 * Number of pathname bytes (including the terminating NUL) that fit after the
 * struct filename header, i.e. from the iname member up to PATH_MAX bytes
 * from the start of the structure.  Used below as the limit for names stored
 * in the same allocation as the struct filename itself.
 */
#define EMBEDDED_NAME_MAX        (PATH_MAX - offsetof(struct filename, iname))

/**
 * getname_flags - copy a pathname from userspace into a struct filename
 * @filename:        userspace pointer to the pathname
 * @flags:        lookup flags; only %LOOKUP_EMPTY is examined here
 * @empty:        if non-NULL, set to 1 when the copied pathname is empty;
 *                left untouched otherwise
 *
 * If audit_reusename() returns a non-NULL filename for @filename it is
 * returned directly and nothing below is executed.
 *
 * Otherwise the name is copied with strncpy_from_user().  Names shorter than
 * EMBEDDED_NAME_MAX are stored in the iname member of the allocation obtained
 * from __getname(); longer ones get a separately kzalloc()ed struct filename
 * whose ->name points at that allocation.
 *
 * Return: a struct filename with refcnt set to 1, or an ERR_PTR():
 * -ENOMEM if an allocation fails, the negative value returned by
 * strncpy_from_user() if the copy fails, -ENAMETOOLONG if the copy fills all
 * PATH_MAX bytes, or -ENOENT for an empty name when %LOOKUP_EMPTY is not set
 * in @flags.
 */
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
        struct filename *result;
        char *kname;
        int len;

        result = audit_reusename(filename);
        if (result)
                return result;

        result = __getname();
        if (unlikely(!result))
                return ERR_PTR(-ENOMEM);

        /*
         * First, try to embed the struct filename inside the names_cache
         * allocation
         */
        kname = (char *)result->iname;
        result->name = kname;

        len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
        if (unlikely(len < 0)) {
                __putname(result);
                return ERR_PTR(len);
        }

        /*
         * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
         * separate struct filename so we can dedicate the entire
         * names_cache allocation for the pathname, and re-do the copy from
         * userland.
         */
        if (unlikely(len == EMBEDDED_NAME_MAX)) {
                const size_t size = offsetof(struct filename, iname[1]);
                /* From here on kname is the whole __getname() buffer. */
                kname = (char *)result;

                /*
                 * size is chosen this way to guarantee that
                 * result->iname[0] is within the same object and that
                 * kname can't be equal to result->iname, no matter what.
                 */
                result = kzalloc(size, GFP_KERNEL);
                if (unlikely(!result)) {
                        __putname(kname);
                        return ERR_PTR(-ENOMEM);
                }
                result->name = kname;
                len = strncpy_from_user(kname, filename, PATH_MAX);
                if (unlikely(len < 0)) {
                        __putname(kname);
                        kfree(result);
                        return ERR_PTR(len);
                }
                /* The copy filled the buffer: the name is too long. */
                if (unlikely(len == PATH_MAX)) {
                        __putname(kname);
                        kfree(result);
                        return ERR_PTR(-ENAMETOOLONG);
                }
        }

        /*
         * The reference count is set before the empty-name check so that
         * the putname() call below drops it to zero and frees the name.
         */
        atomic_set(&result->refcnt, 1);
        /* The empty path is special. */
        if (unlikely(!len)) {
                if (empty)
                        *empty = 1;
                if (!(flags & LOOKUP_EMPTY)) {
                        putname(result);
                        return ERR_PTR(-ENOENT);
                }
        }

        result->uptr = filename;
        result->aname = NULL;
        audit_getname(result);
        return result;
}

/*
 * Like getname_flags(), but driven by AT_* style flags: an empty pathname is
 * accepted (LOOKUP_EMPTY) only when AT_EMPTY_PATH is set in @uflags.  The
 * "empty" out-parameter of getname_flags() is not used.
 */
struct filename *
getname_uflags(const char __user *filename, int uflags)
{
        int lookup_flags = 0;

        if (uflags & AT_EMPTY_PATH)
                lookup_flags = LOOKUP_EMPTY;

        return getname_flags(filename, lookup_flags, NULL);
}

/*
 * Copy a pathname from userspace with no lookup flags and no "empty"
 * out-parameter: a thin wrapper around getname_flags(filename, 0, NULL).
 * Since LOOKUP_EMPTY is not passed, an empty pathname yields
 * ERR_PTR(-ENOENT).
 */
struct filename *
getname(const char __user * filename)
{
        return getname_flags(filename, 0, NULL);
}

/*
 * Build a struct filename from a NUL-terminated pathname that already lives
 * in kernel memory.
 *
 * Names whose length (including the NUL) fits in EMBEDDED_NAME_MAX are stored
 * in the iname member of the __getname() allocation.  Longer names, up to
 * PATH_MAX, use the whole __getname() allocation for the string and a
 * separately kmalloc()ed struct filename header pointing at it.
 *
 * Returns the new filename with refcnt set to 1, ERR_PTR(-ENOMEM) if an
 * allocation fails, or ERR_PTR(-ENAMETOOLONG) if the name does not fit in
 * PATH_MAX bytes.
 */
struct filename *
getname_kernel(const char *filename)
{
        int len = strlen(filename) + 1;
        struct filename *result = __getname();

        if (unlikely(!result))
                return ERR_PTR(-ENOMEM);

        if (len <= EMBEDDED_NAME_MAX) {
                /* Short name: header and string share one allocation. */
                result->name = (char *)result->iname;
        } else {
                struct filename *hdr;

                if (len > PATH_MAX) {
                        __putname(result);
                        return ERR_PTR(-ENAMETOOLONG);
                }

                /* Long name: give the string the whole buffer. */
                hdr = kmalloc(offsetof(struct filename, iname[1]), GFP_KERNEL);
                if (unlikely(!hdr)) {
                        __putname(result);
                        return ERR_PTR(-ENOMEM);
                }
                hdr->name = (char *)result;
                result = hdr;
        }

        memcpy((char *)result->name, filename, len);
        result->uptr = NULL;
        result->aname = NULL;
        atomic_set(&result->refcnt, 1);
        audit_getname(result);

        return result;
}
EXPORT_SYMBOL(getname_kernel);

/*
 * Drop one reference on @name and free it when the last reference goes away.
 *
 * ERR_PTR() values are ignored.  A filename whose refcnt is already zero
 * triggers a one-time warning and is left alone.
 */
void putname(struct filename *name)
{
        if (IS_ERR(name))
                return;

        if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
                return;

        if (!atomic_dec_and_test(&name->refcnt))
                return;

        /* Embedded name: header and string are a single allocation. */
        if (name->name == name->iname) {
                __putname(name);
                return;
        }

        /* Separate header: release the string buffer, then the header. */
        __putname(name->name);
        kfree(name);
}
EXPORT_SYMBOL(putname);

/**
 * check_acl - perform ACL permission checking
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        inode to check permissions on
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * Looks up the access ACL of @inode and, if there is one, lets
 * posix_acl_permission() decide.  The lookup differs depending on whether
 * %MAY_NOT_BLOCK is set in @mask: with it set only the cached ACL is
 * consulted, without it get_inode_acl() is used and the reference it returns
 * is released afterwards.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Return: the result of posix_acl_permission() when an ACL was found,
 * -EAGAIN when there is no ACL to check (or POSIX ACL support is compiled
 * out), -ECHILD when %MAY_NOT_BLOCK is set and the ACL is not cached, or the
 * error encoded in the pointer returned by get_inode_acl().
 */
static int check_acl(struct mnt_idmap *idmap,
                     struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl *acl;
        int error;

        if (!(mask & MAY_NOT_BLOCK)) {
                acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
                if (IS_ERR(acl))
                        return PTR_ERR(acl);
                if (!acl)
                        return -EAGAIN;

                error = posix_acl_permission(idmap, inode, acl, mask);
                posix_acl_release(acl);
                return error;
        }

        acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
        if (!acl)
                return -EAGAIN;
        /* no ->get_inode_acl() calls in RCU mode... */
        if (is_uncached_acl(acl))
                return -ECHILD;
        return posix_acl_permission(idmap, inode, acl, mask);
#else
        return -EAGAIN;
#endif
}

/**
 * acl_permission_check - perform basic UNIX permission checking
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        inode to check permissions on
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * Checks the requested rwx bits against the owner, group or other bits of
 * the inode mode.  For non-owners, check_acl() is consulted first when the
 * inode has POSIX ACLs enabled and any group mode bit is set; because that
 * may retrieve ACLs, the %MAY_NOT_BLOCK bit in @mask matters.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 if access is granted, -EACCES if a required bit is missing, or
 * whatever check_acl() returned if that was anything other than -EAGAIN.
 */
static int acl_permission_check(struct mnt_idmap *idmap,
                                struct inode *inode, int mask)
{
        unsigned int mode = inode->i_mode;
        /* Only RWX matters when comparing against the mode bits */
        unsigned int wanted = mask & 7;
        vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);

        /* The owner is judged by the owner bits alone; ACLs don't matter */
        if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid())))
                return (wanted & ~(mode >> 6)) ? -EACCES : 0;

        /* ACLs get the full, unmasked request */
        if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
                int acl_err = check_acl(idmap, inode, mask);

                if (acl_err != -EAGAIN)
                        return acl_err;
        }

        /*
         * Group membership only needs to be looked at when the group and
         * other bits disagree on something that was actually requested.
         */
        if ((wanted & (mode ^ (mode >> 3))) &&
            vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
                mode >>= 3;

        /* Any requested bit missing from the selected mode bits? */
        if (wanted & ~mode)
                return -EACCES;
        return 0;
}

/**
 * generic_permission -  check for access rights on a Posix-like filesystem
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        inode to check access rights for
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *                %MAY_NOT_BLOCK ...)
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int generic_permission(struct mnt_idmap *idmap, struct inode *inode,
                       int mask)
{
        int err = acl_permission_check(idmap, inode, mask);

        /*
         * Only a plain DAC denial (-EACCES) can be overridden by a
         * capability; success and every other error are final.
         */
        if (err != -EACCES)
                return err;

        if (S_ISDIR(inode->i_mode)) {
                /*
                 * Directories: anything that does not ask for write may be
                 * granted by CAP_DAC_READ_SEARCH; CAP_DAC_OVERRIDE is then
                 * consulted for every kind of request.
                 */
                if (!(mask & MAY_WRITE) &&
                    capable_wrt_inode_uidgid(idmap, inode,
                                             CAP_DAC_READ_SEARCH))
                        return 0;

                return capable_wrt_inode_uidgid(idmap, inode,
                                                CAP_DAC_OVERRIDE) ? 0 : -EACCES;
        }

        /* Non-directories: only the plain R/W/X bits matter from here on. */
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;

        /* A pure read request may be granted by CAP_DAC_READ_SEARCH. */
        if (mask == MAY_READ &&
            capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH))
                return 0;

        /*
         * CAP_DAC_OVERRIDE always covers read/write.  It covers execute
         * only when the file has at least one exec bit set.
         */
        if ((!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) &&
            capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE))
                return 0;

        return -EACCES;
}
EXPORT_SYMBOL(generic_permission);

/**
 * do_inode_permission - UNIX permission checking
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        inode to check permissions on
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has no special
 * permission function, use the fast case".
 */
static inline int do_inode_permission(struct mnt_idmap *idmap,
                                      struct inode *inode, int mask)
{
        /* Fast case: already known to have no ->permission method. */
        if (likely(inode->i_opflags & IOP_FASTPERM))
                return generic_permission(idmap, inode, mask);

        /* The filesystem supplies its own check: defer to it. */
        if (likely(inode->i_op->permission))
                return inode->i_op->permission(idmap, inode, mask);

        /*
         * No ->permission method: record that in i_opflags (under i_lock)
         * so that later calls take the fast case above.  This gets set
         * once for the inode lifetime.
         */
        spin_lock(&inode->i_lock);
        inode->i_opflags |= IOP_FASTPERM;
        spin_unlock(&inode->i_lock);

        return generic_permission(idmap, inode, mask);
}

/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
        if (unlikely(mask & MAY_WRITE) && sb_rdonly(sb)) {
                const umode_t type = inode->i_mode;

                /*
                 * Nobody gets write access to a read-only fs, but only
                 * regular files, directories and symlinks are refused here;
                 * other inode types fall through to the return below.
                 */
                if (S_ISREG(type) || S_ISDIR(type) || S_ISLNK(type))
                        return -EROFS;
        }

        return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        Inode to check permission on
 * @mask:        Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct mnt_idmap *idmap,
                     struct inode *inode, int mask)
{
        int err = sb_permission(inode->i_sb, inode, mask);

        if (err)
                return err;

        if (unlikely(mask & MAY_WRITE)) {
                /* Nobody gets write access to an immutable file. */
                if (IS_IMMUTABLE(inode))
                        return -EPERM;

                /*
                 * If the true i_uid/i_gid are unknown to the vfs, updating
                 * mtime would likely write them back improperly, so refuse
                 * the write up front.
                 */
                if (HAS_UNMAPPED_ID(idmap, inode))
                        return -EACCES;
        }

        /*
         * Remaining checks run in a fixed order: the inode's own permission
         * check, then the device cgroup, then the security hook.  The first
         * non-zero result wins and later stages are not called.
         */
        err = do_inode_permission(idmap, inode, mask);
        if (!err)
                err = devcgroup_inode_permission(inode, mask);
        if (!err)
                err = security_inode_permission(inode, mask);

        return err;
}
EXPORT_SYMBOL(inode_permission);

/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
void path_get(const struct path *path)
{
        struct vfsmount *mnt = path->mnt;
        struct dentry *dentry = path->dentry;

        /* Pin both halves of the path: the mount first, then the dentry. */
        mntget(mnt);
        dget(dentry);
}
EXPORT_SYMBOL(path_get);

/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
void path_put(const struct path *path)
{
        struct dentry *dentry = path->dentry;
        struct vfsmount *mnt = path->mnt;

        /* Release in the reverse order of path_get(): dentry, then mount. */
        dput(dentry);
        mntput(mnt);
}
EXPORT_SYMBOL(path_put);

/* Number of 'struct saved' entries embedded directly in struct nameidata. */
#define EMBEDDED_LEVELS 2
/*
 * Per-lookup state of a path walk.  The instance for the walk in progress
 * is reachable through current->nameidata; nested walks are chained through
 * ->saved (see __set_nameidata() / restore_nameidata()).
 */
struct nameidata {
        /* current position of the walk */
        struct path        path;
        /* NOTE(review): presumably the last component seen; not used in this chunk */
        struct qstr        last;
        /* root used by nd_jump_root(); filled by set_root() or preset by the caller */
        struct path        root;
        struct inode        *inode; /* path.dentry.d_inode */
        /* flags: LOOKUP_* bits; state: ND_* bits defined below */
        unsigned int        flags, state;
        /*
         * Sequence samples used while in rcu-walk:
         *   seq      - d_seq sample checked when legitimizing ->path
         *   next_seq - d_seq sample for the next dentry (try_to_unlazy_next())
         *   m_seq    - sample handed to __legitimize_mnt()
         *   r_seq    - not referenced in this chunk
         */
        unsigned        seq, next_seq, m_seq, r_seq;
        int                last_type;
        /* number of entries of ->stack currently in use */
        unsigned        depth;
        /* inherited from, and propagated back to, the enclosing nameidata */
        int                total_link_count;
        /* one entry per link currently being traversed */
        struct saved {
                /* the link itself; legitimized using ->seq below */
                struct path link;
                /* delayed call run when the entry is dropped (drop_links(), put_link()) */
                struct delayed_call done;
                const char *name;
                /* d_seq sample for link.dentry */
                unsigned seq;
        /*
         * ->stack points at ->internal until nd_alloc_stack() replaces it
         * with a kmalloc'ed array of MAXSYMLINKS entries.
         */
        } *stack, internal[EMBEDDED_LEVELS];
        /* pathname being looked up */
        struct filename        *name;
        /* previous value of current->nameidata, restored by restore_nameidata() */
        struct nameidata *saved;
        /* d_seq sample for root.dentry, taken in set_root() */
        unsigned        root_seq;
        int                dfd;
        /*
         * Owner and mode of the parent directory, consulted by
         * may_follow_link() and may_create_in_sticky().
         */
        vfsuid_t        dir_vfsuid;
        umode_t                dir_mode;
} __randomize_layout;

/* Bits for nameidata.state */
/* ->root was supplied by the caller of set_nameidata() and is not ours to grab or drop */
#define ND_ROOT_PRESET 1
/* a reference is (or may be) held on ->root; terminate_walk() drops it */
#define ND_ROOT_GRABBED 2
/* set by nd_jump_root() / nd_jump_link(); makes complete_walk() consider ->d_weak_revalidate */
#define ND_JUMPED 4

/*
 * Initialise @p for a new path walk and install it as current->nameidata.
 * The previously installed nameidata (if any) is remembered in p->saved and
 * its symlink budget (total_link_count) is inherited.
 */
static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
        struct nameidata *outer = current->nameidata;

        p->name = name;
        p->dfd = dfd;

        /* Start on the embedded link stack with nothing on it. */
        p->depth = 0;
        p->stack = p->internal;

        /* No position yet. */
        p->path.dentry = NULL;
        p->path.mnt = NULL;

        if (outer)
                p->total_link_count = outer->total_link_count;
        else
                p->total_link_count = 0;

        p->saved = outer;
        current->nameidata = p;
}

/*
 * Like __set_nameidata(), but also resets p->state.  When the caller
 * provides @root it is copied into p->root and flagged ND_ROOT_PRESET.
 */
static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
                          const struct path *root)
{
        __set_nameidata(p, dfd, name);

        if (unlikely(root)) {
                p->root = *root;
                p->state = ND_ROOT_PRESET;
        } else {
                p->state = 0;
        }
}

/*
 * Undo __set_nameidata(): reinstall the enclosing nameidata, hand the
 * accumulated link count back to it and free the link stack if it had been
 * moved off the embedded array.
 */
static void restore_nameidata(void)
{
        struct nameidata *cur = current->nameidata;
        struct nameidata *outer = cur->saved;

        current->nameidata = outer;

        /* Links followed by the inner walk count against the outer one too. */
        if (outer)
                outer->total_link_count = cur->total_link_count;

        /* Only a stack obtained from nd_alloc_stack() needs freeing. */
        if (cur->stack != cur->internal)
                kfree(cur->stack);
}

/*
 * Replace the embedded link stack with a heap array of MAXSYMLINKS entries,
 * carrying over the contents of nd->internal.  The allocation is atomic
 * while in rcu-walk (LOOKUP_RCU set).  Returns false if it fails, in which
 * case nd->stack is left untouched.
 */
static bool nd_alloc_stack(struct nameidata *nd)
{
        struct saved *stack = kmalloc_array(MAXSYMLINKS, sizeof(*stack),
                        nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);

        if (unlikely(!stack))
                return false;

        memcpy(stack, nd->internal, sizeof(nd->internal));
        nd->stack = stack;
        return true;
}

/**
 * path_connected - Verify that a dentry is below mnt.mnt_root
 * @mnt: The mountpoint to check.
 * @dentry: The dentry to check.
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 */
static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
{
        struct dentry *mnt_root = mnt->mnt_root;

        /*
         * Only bind mounts can have disconnected paths: when the mount
         * exposes the root of its superblock there is nothing to verify.
         */
        if (mnt_root == mnt->mnt_sb->s_root)
                return true;

        return is_subdir(dentry, mnt_root);
}

/*
 * Run and then clear the delayed call of every entry on the link stack,
 * from the most recently pushed entry down to the first.  nd->depth itself
 * is not modified.
 */
static void drop_links(struct nameidata *nd)
{
        int i;

        for (i = nd->depth; i > 0; i--) {
                struct saved *entry = nd->stack + (i - 1);

                do_delayed_call(&entry->done);
                clear_delayed_call(&entry->done);
        }
}

/*
 * Leave rcu-walk mode: forget the dentry sequence samples, clear LOOKUP_RCU
 * and drop the RCU read lock.
 */
static void leave_rcu(struct nameidata *nd)
{
        nd->next_seq = 0;
        nd->seq = 0;
        nd->flags &= ~LOOKUP_RCU;
        rcu_read_unlock();
}

static void terminate_walk(struct nameidata *nd)
{
        drop_links(nd);
        if (!(nd->flags & LOOKUP_RCU)) {
                int i;
                path_put(&nd->path);
                for (i = 0; i < nd->depth; i++)
                        path_put(&nd->stack[i].link);
                if (nd->state & ND_ROOT_GRABBED) {
                        path_put(&nd->root);
                        nd->state &= ~ND_ROOT_GRABBED;
                }
        } else {
                leave_rcu(nd);
        }
        nd->depth = 0;
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
}

/*
 * Try to turn an rcu-walk @path into a properly referenced one.
 * @seq is the d_seq sample for path->dentry, @mseq the sample passed on to
 * __legitimize_mnt().  Returns true only if both references were obtained
 * and the dentry's d_seq still matches @seq.
 *
 * path_put is needed afterwards regardless of success or failure: on every
 * failure path the members for which no reference is held have been set to
 * NULL, so that the caller's path_put() only sees what was really acquired.
 */
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
{
        int res = __legitimize_mnt(path->mnt, mseq);
        if (unlikely(res)) {
                /*
                 * Mount could not be legitimized.  A positive result clears
                 * ->mnt as well; a negative one leaves it in place for the
                 * caller's path_put().
                 * NOTE(review): presumably res < 0 means a mount reference
                 * was taken that still has to be dropped - confirm against
                 * __legitimize_mnt().
                 */
                if (res > 0)
                        path->mnt = NULL;
                path->dentry = NULL;
                return false;
        }
        if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
                /* No dentry reference was obtained; the mount one stays. */
                path->dentry = NULL;
                return false;
        }
        /* Both references held; the result now depends only on d_seq. */
        return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

/*
 * __legitimize_path() using the mount sequence sample recorded in @nd.
 */
static inline bool legitimize_path(struct nameidata *nd,
                            struct path *path, unsigned seq)
{
        const unsigned mseq = nd->m_seq;

        return __legitimize_path(path, seq, mseq);
}

/*
 * Legitimize every link on nd->stack, in order.  Returns true if all of
 * them are now properly referenced.  On failure the delayed calls of all
 * entries have been run (drop_links()) and nd->depth is trimmed to cover
 * exactly the entries that legitimize_path() was attempted on.
 */
static bool legitimize_links(struct nameidata *nd)
{
        int i;
        if (unlikely(nd->flags & LOOKUP_CACHED)) {
                /*
                 * With LOOKUP_CACHED no attempt is made at all: drop the
                 * links and report failure with an empty stack.
                 */
                drop_links(nd);
                nd->depth = 0;
                return false;
        }
        for (i = 0; i < nd->depth; i++) {
                struct saved *last = nd->stack + i;
                if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
                        drop_links(nd);
                        /*
                         * Entries 0..i have been through legitimize_path()
                         * and need path_put() whether or not it succeeded;
                         * the ones above i were never touched.  Setting
                         * depth to i + 1 makes terminate_walk() put exactly
                         * the former.
                         */
                        nd->depth = i + 1;
                        return false;
                }
        }
        return true;
}

/*
 * Legitimize nd->root for ref-walk, if there is anything to legitimize.
 * Returns false if the root could not be validated against nd->root_seq.
 */
static bool legitimize_root(struct nameidata *nd)
{
        /* Nothing to do if nd->root is zero or is managed by the VFS user. */
        if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET))
                return true;
        /*
         * The flag is set before the attempt: legitimize_path() requires a
         * path_put() afterwards even on failure, and terminate_walk() only
         * puts nd->root when ND_ROOT_GRABBED is set.
         */
        nd->state |= ND_ROOT_GRABBED;
        return legitimize_path(nd, &nd->root, nd->root_seq);
}

/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
 * try_to_unlazy - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * Returns: true on success, false on failure
 *
 * try_to_unlazy attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy() failure and
 * terminate_walk().
 */
static bool try_to_unlazy(struct nameidata *nd)
{
        /* Sampled up front for the consistency check after leaving RCU. */
        struct dentry *parent = nd->path.dentry;

        BUG_ON(!(nd->flags & LOOKUP_RCU));

        /* Order matters: links first, then the current path, then the root. */
        if (unlikely(!legitimize_links(nd)))
                goto out1;
        /*
         * From here on a failure goes to "out" with nd->path left as
         * legitimize_path() set it up, so that the later path_put() in
         * terminate_walk() drops exactly what was acquired.
         */
        if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
                goto out;
        if (unlikely(!legitimize_root(nd)))
                goto out;
        leave_rcu(nd);
        BUG_ON(nd->inode != parent->d_inode);
        return true;

out1:
        /*
         * Links failed before nd->path was touched: no references are held
         * on it, so clear it before switching out of rcu-walk.
         */
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out:
        leave_rcu(nd);
        return false;
}

/**
 * try_to_unlazy_next - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: next dentry to step into
 * Returns: true on success, false on failure
 *
 * Similar to try_to_unlazy(), but here we have the next dentry already
 * picked by rcu-walk and want to legitimize that in addition to the current
 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
 * terminate_walk().
 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{
        int res;
        BUG_ON(!(nd->flags & LOOKUP_RCU));

        if (unlikely(!legitimize_links(nd)))
                goto out2;
        /*
         * Open-coded variant of __legitimize_path() for nd->path: same
         * handling of the __legitimize_mnt() result (positive: forget the
         * mount too; negative: forget only the dentry), but without the
         * d_seq check on nd->path.dentry - see the comment below.
         */
        res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
        if (unlikely(res)) {
                if (res > 0)
                        goto out2;
                goto out1;
        }
        if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
                goto out1;

        /*
         * We need to move both the parent and the dentry from the RCU domain
         * to be properly refcounted. And the sequence number in the dentry
         * validates *both* dentry counters, since we checked the sequence
         * number of the parent after we got the child sequence number. So we
         * know the parent must still be valid if the child sequence number is
         * still valid.
         */
        if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
                goto out;
        /* From here on a reference on @dentry is held: failures must dput() it. */
        if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
                goto out_dput;
        /*
         * Sequence counts matched. Now make sure that the root is
         * still valid and get it if required.
         */
        if (unlikely(!legitimize_root(nd)))
                goto out_dput;
        leave_rcu(nd);
        return true;

        /*
         * Failure exits.  Each label clears the part of nd->path for which
         * no reference is held and falls through to the next one.
         */
out2:
        nd->path.mnt = NULL;
out1:
        nd->path.dentry = NULL;
out:
        leave_rcu(nd);
        return false;
out_dput:
        /* Leave RCU before dropping the reference taken on @dentry. */
        leave_rcu(nd);
        dput(dentry);
        return false;
}

/*
 * Call the dentry's ->d_revalidate() method when DCACHE_OP_REVALIDATE is set
 * in its d_flags; otherwise report 1 without calling anything.
 */
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
{
        return unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) ?
                dentry->d_op->d_revalidate(dentry, flags) : 1;
}

/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer to nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
 */
static int complete_walk(struct nameidata *nd)
{
        struct dentry *dentry = nd->path.dentry;
        int status;

        if (nd->flags & LOOKUP_RCU) {
                /*
                 * nd->root is forgotten here so that it does not get
                 * legitimized, except when it is externally managed
                 * (ND_ROOT_PRESET) or still needed for the scoped-lookup
                 * check below.
                 */
                if (!(nd->state & ND_ROOT_PRESET) &&
                    !(nd->flags & LOOKUP_IS_SCOPED))
                        nd->root.mnt = NULL;

                nd->flags &= ~LOOKUP_CACHED;
                if (!try_to_unlazy(nd))
                        return -ECHILD;
        }

        /*
         * LOOKUP_IS_SCOPED promises (roughly) that the lookup never steps
         * outside the root, and the rest of namei should already guarantee
         * that.  This is a final sanity check so that a namei BUG, even a
         * complete bypass of the scoping, cannot silently hand userspace
         * something outside the requested root.
         *
         * Userspace may still move the path out of the root after this
         * check; that is not a concern, since the resolved file was inside
         * the root at some point.
         */
        if (unlikely(nd->flags & LOOKUP_IS_SCOPED) &&
            !path_is_under(&nd->path, &nd->root))
                return -EXDEV;

        /*
         * Weak revalidation is only wanted after a jump (ND_JUMPED) and only
         * if the dentry asks for it.
         */
        if (likely(!(nd->state & ND_JUMPED)) ||
            likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
                return 0;

        status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
        if (status > 0)
                return 0;

        /* Zero from ->d_weak_revalidate() is reported as -ESTALE. */
        return status ? status : -ESTALE;
}

/*
 * Fill nd->root from the root of the current task's fs_struct.
 * Returns 0 on success, or -ENOTRECOVERABLE (with a warning) if called
 * during a scoped lookup.
 */
static int set_root(struct nameidata *nd)
{
        struct fs_struct *fs = current->fs;

        /*
         * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
         * still have to ensure it doesn't happen because it will cause a breakout
         * from the dirfd.
         */
        if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
                return -ENOTRECOVERABLE;

        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;

                /*
                 * rcu-walk: take no references.  Copy fs->root and sample
                 * the root dentry's d_seq, retrying until fs->seq shows
                 * that both were read without a concurrent update.
                 */
                do {
                        seq = read_seqcount_begin(&fs->seq);
                        nd->root = fs->root;
                        nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
                } while (read_seqcount_retry(&fs->seq, seq));
        } else {
                /*
                 * ref-walk: get the root via get_fs_root() and remember, via
                 * ND_ROOT_GRABBED, that terminate_walk() has to put it.
                 */
                get_fs_root(fs, &nd->root);
                nd->state |= ND_ROOT_GRABBED;
        }
        return 0;
}

/*
 * Move the walk to nd->root, setting the root up first if necessary.
 * Returns 0 on success, -EXDEV if the lookup flags forbid the jump, -ECHILD
 * if rcu-walk can no longer be trusted, or the error from set_root().
 */
static int nd_jump_root(struct nameidata *nd)
{
        /* With LOOKUP_BENEATH a jump to the root is always refused. */
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return -EXDEV;
        if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
                /* Absolute path arguments to path_init() are allowed. */
                if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
                        return -EXDEV;
        }
        if (!nd->root.mnt) {
                int error = set_root(nd);
                if (error)
                        return error;
        }
        if (nd->flags & LOOKUP_RCU) {
                struct dentry *d;
                /*
                 * rcu-walk: no references are taken; adopt the root together
                 * with its recorded d_seq sample and verify that the sample
                 * is still current.
                 */
                nd->path = nd->root;
                d = nd->path.dentry;
                nd->inode = d->d_inode;
                nd->seq = nd->root_seq;
                if (read_seqcount_retry(&d->d_seq, nd->seq))
                        return -ECHILD;
        } else {
                /*
                 * ref-walk: drop the references on the old position before
                 * overwriting it, then take new ones for nd->path (nd->root
                 * keeps its own).
                 */
                path_put(&nd->path);
                nd->path = nd->root;
                path_get(&nd->path);
                nd->inode = nd->path.dentry->d_inode;
        }
        /* Record the jump; complete_walk() checks this bit. */
        nd->state |= ND_JUMPED;
        return 0;
}

/*
 * Helper to directly jump to a known parsed path from ->get_link,
 * caller must have taken a reference to path beforehand.
 */
int nd_jump_link(const struct path *path)
{
        int error = -ELOOP;
        struct nameidata *nd = current->nameidata;

        if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
                goto err;

        error = -EXDEV;
        if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
                if (nd->path.mnt != path->mnt)
                        goto err;
        }
        /* Not currently safe for scoped-lookups. */
        if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
                goto err;

        path_put(&nd->path);
        nd->path = *path;
        nd->inode = nd->path.dentry->d_inode;
        nd->state |= ND_JUMPED;
        return 0;

err:
        path_put(path);
        return error;
}

/*
 * Pop the top entry off the link stack: run its delayed call and, in
 * ref-walk mode, drop the reference held on the link's path.
 */
static inline void put_link(struct nameidata *nd)
{
        struct saved *top;

        nd->depth--;
        top = nd->stack + nd->depth;

        do_delayed_call(&top->done);
        if (!(nd->flags & LOOKUP_RCU))
                path_put(&top->link);
}

/*
 * Link/creation hardening knobs.  They are read by may_follow_link(),
 * may_linkat() and may_create_in_sticky(); zero disables the respective
 * check.  They are defined unconditionally; only the sysctl table below
 * depends on CONFIG_SYSCTL.
 */
static int sysctl_protected_symlinks __read_mostly;
static int sysctl_protected_hardlinks __read_mostly;
static int sysctl_protected_fifos __read_mostly;
static int sysctl_protected_regular __read_mostly;

#ifdef CONFIG_SYSCTL
/*
 * sysctl entries for the knobs above, registered under "fs".  All are
 * integers with mode 0644, bounded by extra1/extra2: symlinks and hardlinks
 * take 0..1, fifos and regular take 0..2.
 */
static struct ctl_table namei_sysctls[] = {
        {
                .procname        = "protected_symlinks",
                .data                = &sysctl_protected_symlinks,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        {
                .procname        = "protected_hardlinks",
                .data                = &sysctl_protected_hardlinks,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        {
                /* 2 additionally covers group-writable sticky directories */
                .procname        = "protected_fifos",
                .data                = &sysctl_protected_fifos,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
        {
                /* 2 additionally covers group-writable sticky directories */
                .procname        = "protected_regular",
                .data                = &sysctl_protected_regular,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
};

/* Register the table above under "fs" at fs_initcall time. */
static int __init init_fs_namei_sysctls(void)
{
        register_sysctl_init("fs", namei_sysctls);
        return 0;
}
fs_initcall(init_fs_namei_sysctls);

#endif /* CONFIG_SYSCTL */

/**
 * may_follow_link - Check symlink following for unsafe situations
 * @nd: nameidata pathwalk data
 * @inode: Used for idmapping.
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
{
        struct mnt_idmap *idmap;
        vfsuid_t vfsuid;

        if (!sysctl_protected_symlinks)
                return 0;

        idmap = mnt_idmap(nd->path.mnt);
        vfsuid = i_uid_into_vfsuid(idmap, inode);
        /* Allowed if owner and follower match. */
        if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
                return 0;

        /* Allowed if parent directory not sticky and world-writable. */
        if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
                return 0;

        /* Allowed if parent directory and link owner match. */
        if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid))
                return 0;

        if (nd->flags & LOOKUP_RCU)
                return -ECHILD;

        audit_inode(nd->name, nd->stack[0].link.dentry, 0);
        audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
        return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @idmap: idmap of the mount the inode was found from
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions holds:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct mnt_idmap *idmap,
                                 struct inode *inode)
{
        const umode_t mode = inode->i_mode;
        const umode_t sgid_exec = S_ISGID | S_IXGRP;

        /*
         * A source is safe only if all of the following hold, checked in
         * this order (the permission check runs only if the mode tests
         * pass):
         *  - it is a regular file (special files should not get pinned),
         *  - it is not setuid,
         *  - it is not both setgid and group-executable,
         *  - the caller may both read and write it.
         */
        return S_ISREG(mode) &&
               !(mode & S_ISUID) &&
               (mode & sgid_exec) != sgid_exec &&
               !inode_permission(idmap, inode, MAY_READ | MAY_WRITE);
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @idmap: idmap of the mount the inode was found from
 * @link:  the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns 0 if successful, -ve on error.
 */
int may_linkat(struct mnt_idmap *idmap, const struct path *link)
{
        struct inode *inode = link->dentry->d_inode;

        /* Inode writeback is not safe when the uid or gid are invalid. */
        if (!(vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) &&
              vfsgid_valid(i_gid_into_vfsgid(idmap, inode))))
                return -EOVERFLOW;

        /*
         * With protection enabled, the link is refused unless the source is
         * a safe one or the caller is the inode's owner (or has
         * CAP_FOWNER).
         */
        if (sysctl_protected_hardlinks &&
            !safe_hardlink_source(idmap, inode) &&
            !inode_owner_or_capable(idmap, inode)) {
                audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
                return -EPERM;
        }

        return 0;
}

/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *                          should be allowed, or not, on files that already
 *                          exist.
 * @idmap: idmap of the mount the inode was found from
 * @nd: nameidata pathwalk data
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns 0 if the open is allowed, -ve on error.
 */
static int may_create_in_sticky(struct mnt_idmap *idmap,
                                struct nameidata *nd, struct inode *const inode)
{
        const umode_t dir_mode = nd->dir_mode;
        const umode_t file_mode = inode->i_mode;
        const int is_fifo = S_ISFIFO(file_mode);
        const int is_reg = S_ISREG(file_mode);
        vfsuid_t owner;

        /* The protection for this kind of file is switched off. */
        if ((is_fifo && !sysctl_protected_fifos) ||
            (is_reg && !sysctl_protected_regular))
                return 0;

        /* Only sticky directories are of interest. */
        if (likely(!(dir_mode & S_ISVTX)))
                return 0;

        /* Files owned by the directory owner or by the opener are fine. */
        owner = i_uid_into_vfsuid(idmap, inode);
        if (vfsuid_eq(owner, nd->dir_vfsuid) ||
            vfsuid_eq_kuid(owner, current_fsuid()))
                return 0;

        /*
         * Deny when the directory is world-writable, or - at protection
         * level 2 for this kind of file - when it is group-writable.
         */
        if (likely(dir_mode & 0002) ||
            ((dir_mode & 0020) &&
             ((is_fifo && sysctl_protected_fifos >= 2) ||
              (is_reg && sysctl_protected_regular >= 2)))) {
                audit_log_path_denied(AUDIT_ANOM_CREAT,
                                      is_fifo ? "sticky_create_fifo"
                                              : "sticky_create_regular");
                return -EACCES;
        }

        return 0;
}

/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
int follow_up(struct path *path)
{
        struct mount *mnt = real_mount(path->mnt);
        struct mount *parent;
        struct dentry *mountpoint;

        /*
         * mnt_parent and mnt_mountpoint are read, and references on them
         * taken, with mount_lock held exclusively.
         */
        read_seqlock_excl(&mount_lock);
        parent = mnt->mnt_parent;
        if (parent == mnt) {
                /* A mount that is its own parent has nothing above it. */
                read_sequnlock_excl(&mount_lock);
                return 0;
        }
        mntget(&parent->mnt);
        mountpoint = dget(mnt->mnt_mountpoint);
        read_sequnlock_excl(&mount_lock);
        /*
         * The new references are already held; now drop the old dentry and
         * mount and switch @path over to the mountpoint in the parent.
         */
        dput(path->dentry);
        path->dentry = mountpoint;
        mntput(path->mnt);
        path->mnt = &parent->mnt;
        return 1;
}
EXPORT_SYMBOL(follow_up);

/*
 * Walk up the mount tree from @m looking for the place to continue above
 * it.  On success, @path is set to the mountpoint dentry in the parent
 * mount, *@seqp to a fresh sample of that dentry's d_seq, and true is
 * returned.  False is returned if the top of the mount tree or @root is
 * reached first.  No references are taken here; the caller validates or
 * legitimizes the result.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
                                  struct path *path, unsigned *seqp)
{
        while (mnt_has_parent(m)) {
                struct dentry *mountpoint = m->mnt_mountpoint;

                m = m->mnt_parent;
                /* Never go above @root. */
                if (unlikely(root->dentry == mountpoint &&
                             root->mnt == &m->mnt))
                        break;
                /*
                 * A mountpoint that is the root of the parent mount has
                 * nothing above it within that mount: keep climbing.
                 */
                if (mountpoint != m->mnt.mnt_root) {
                        path->mnt = &m->mnt;
                        path->dentry = mountpoint;
                        *seqp = read_seqcount_begin(&mountpoint->d_seq);
                        return true;
                }
        }
        return false;
}

/*
 * Ref-walk counterpart of choose_mountpoint_rcu(): on success @path holds
 * proper references.  Returns false if there is nowhere to go up to.
 * The lookup is repeated until it has either found nothing under a stable
 * mount_lock, or found a path that could be legitimized.
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
                              struct path *path)
{
        bool found;

        rcu_read_lock();
        while (1) {
                unsigned seq, mseq = read_seqbegin(&mount_lock);

                found = choose_mountpoint_rcu(m, root, path, &seq);
                if (unlikely(!found)) {
                        /* "Nothing found" only counts if mount_lock did not change. */
                        if (!read_seqretry(&mount_lock, mseq))
                                break;
                } else {
                        if (likely(__legitimize_path(path, seq, mseq)))
                                break;
                        /*
                         * Legitimization failed.  A path_put() is required
                         * even then, and it is done outside the RCU read
                         * section before retrying.
                         */
                        rcu_read_unlock();
                        path_put(path);
                        rcu_read_lock();
                }
        }
        rcu_read_unlock();
        return found;
}

/*
 * Trigger the automount attached to path->dentry.
 *
 * Returns -EISDIR to tell the caller to stop and keep the path it passed
 * in, -ELOOP once the optional counter @count has reached MAXSYMLINKS,
 * and otherwise the result of finish_automount().
 */
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
{
        /* Lookup intents for which the mount really has to happen. */
        const unsigned int wants_mount = LOOKUP_PARENT | LOOKUP_DIRECTORY |
                                         LOOKUP_OPEN | LOOKUP_CREATE |
                                         LOOKUP_AUTOMOUNT;
        struct dentry *dentry = path->dentry;

        /*
         * Somebody who is merely stat'ing the object should not cause a
         * mount - unless it is a directory and the name had a trailing
         * '/'.
         *
         * Opening or creating a file of any type below the mountpoint,
         * walking through the mountpoint, or opening the mounted
         * directory all do require the mount.  autofs may also flag
         * negative dentries as automount points; those need the daemon
         * to instantiate them before they can be used, which is why a
         * negative dentry never takes the shortcut below.
         */
        if (!(lookup_flags & wants_mount) && dentry->d_inode)
                return -EISDIR;

        if (count) {
                int seen = (*count)++;

                if (seen >= MAXSYMLINKS)
                        return -ELOOP;
        }

        return finish_automount(dentry->d_op->d_automount(path), path);
}

/*
 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
 * dentries are pinned but not locked here, so negative dentry can go
 * positive right under us.  Use of smp_load_acquire() provides a barrier
 * sufficient for ->d_inode and ->d_flags consistency.
 *
 * @path:         in/out; updated in place each time a mount is crossed
 * @flags:        ->d_flags of path->dentry as sampled by the caller
 * @jumped:       out; true if at least one mount was crossed
 * @count:        optional automount counter handed to follow_automount()
 * @lookup_flags: LOOKUP_* flags handed to follow_automount()
 *
 * Returns 0 on success, -ENOENT if the dentry we end up on is negative,
 * or a negative error from ->d_manage() / follow_automount().  An -EISDIR
 * result from either is converted to 0.
 */
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
                             int *count, unsigned lookup_flags)
{
        /* remember the mount we started on for the final check below */
        struct vfsmount *mnt = path->mnt;
        /* set once path->mnt is a reference obtained here via lookup_mnt() */
        bool need_mntput = false;
        int ret = 0;

        while (flags & DCACHE_MANAGED_DENTRY) {
                /* Allow the filesystem to manage the transit without i_mutex
                 * being held. */
                if (flags & DCACHE_MANAGE_TRANSIT) {
                        ret = path->dentry->d_op->d_manage(path, false);
                        /* reload: ->d_manage() may have changed the flags */
                        flags = smp_load_acquire(&path->dentry->d_flags);
                        if (ret < 0)
                                break;
                }

                if (flags & DCACHE_MOUNTED) {        // something's mounted on it..
                        struct vfsmount *mounted = lookup_mnt(path);
                        if (mounted) {                // ... in our namespace
                                dput(path->dentry);
                                /* the starting mount is never dropped here */
                                if (need_mntput)
                                        mntput(path->mnt);
                                path->mnt = mounted;
                                path->dentry = dget(mounted->mnt_root);
                                // here we know it's positive
                                flags = path->dentry->d_flags;
                                need_mntput = true;
                                continue;
                        }
                }

                if (!(flags & DCACHE_NEED_AUTOMOUNT))
                        break;

                // uncovered automount point
                ret = follow_automount(path, count, lookup_flags);
                flags = smp_load_acquire(&path->dentry->d_flags);
                if (ret < 0)
                        break;
        }

        /* -EISDIR means "stop here and use this path", not a failure */
        if (ret == -EISDIR)
                ret = 0;
        // possible if you race with several mount --move
        if (need_mntput && path->mnt == mnt)
                mntput(path->mnt);
        if (!ret && unlikely(d_flags_negative(flags)))
                ret = -ENOENT;
        *jumped = need_mntput;
        return ret;
}

/*
 * Inline front end for mount traversal.  Samples ->d_flags of
 * path->dentry once; unmanaged dentries are answered right here (0, or
 * -ENOENT for a negative dentry, with *jumped cleared), everything else
 * is handed to __traverse_mounts().
 */
static inline int traverse_mounts(struct path *path, bool *jumped,
                                  int *count, unsigned lookup_flags)
{
        unsigned flags = smp_load_acquire(&path->dentry->d_flags);

        if (unlikely(flags & DCACHE_MANAGED_DENTRY))
                return __traverse_mounts(path, flags, jumped, count,
                                         lookup_flags);

        /* fastpath: nothing mounted, managed or automounted here */
        *jumped = false;
        return unlikely(d_flags_negative(flags)) ? -ENOENT : 0;
}

/*
 * Step down exactly one level onto whatever lookup_mnt() reports as
 * mounted on *path.
 *
 * Returns 1 if *path was moved to the root of that mount (the old dentry
 * and mount references are dropped, new ones are held), 0 if nothing is
 * mounted there and *path is left untouched.
 */
int follow_down_one(struct path *path)
{
        struct vfsmount *child = lookup_mnt(path);

        if (!child)
                return 0;

        dput(path->dentry);
        mntput(path->mnt);
        path->mnt = child;
        path->dentry = dget(child->mnt_root);
        return 1;
}
EXPORT_SYMBOL(follow_down_one);

/*
 * Descend from *path to the covering mount currently visible to
 * userspace.  At every step the filesystem owning the dentry may be asked
 * whether the caller is allowed to proceed.
 *
 * If the walk ends on a different mount than it started on, the reference
 * to the starting mount is dropped.  Returns the result of
 * traverse_mounts().
 */
int follow_down(struct path *path, unsigned int flags)
{
        struct vfsmount *start_mnt = path->mnt;
        bool jumped;
        int ret;

        ret = traverse_mounts(path, &jumped, NULL, flags);
        if (start_mnt != path->mnt)
                mntput(start_mnt);
        return ret;
}
EXPORT_SYMBOL(follow_down);

/*
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
 *
 * Returns true if *path (and nd->next_seq) describe where the walk should
 * continue in RCU mode.  Returns false if the caller has to redo the step
 * outside RCU mode; in that case *path and nd->next_seq may already have
 * been modified.
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path)
{
        struct dentry *dentry = path->dentry;
        unsigned int flags = dentry->d_flags;

        /* common case: nothing mounted, managed or automounted here */
        if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
                return true;

        /* with LOOKUP_NO_XDEV any managed dentry is punted to the caller */
        if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                return false;

        for (;;) {
                /*
                 * Don't forget we might have a non-mountpoint managed dentry
                 * that wants to block transit.
                 */
                if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
                        int res = dentry->d_op->d_manage(path, true);
                        /* -EISDIR: stop here and use this path as it is */
                        if (res)
                                return res == -EISDIR;
                        flags = dentry->d_flags;
                }

                if (flags & DCACHE_MOUNTED) {
                        struct mount *mounted = __lookup_mnt(path->mnt, dentry);
                        if (mounted) {
                                path->mnt = &mounted->mnt;
                                dentry = path->dentry = mounted->mnt.mnt_root;
                                nd->state |= ND_JUMPED;
                                nd->next_seq = read_seqcount_begin(&dentry->d_seq);
                                flags = dentry->d_flags;
                                // makes sure that non-RCU pathwalk could reach
                                // this state.
                                if (read_seqretry(&mount_lock, nd->m_seq))
                                        return false;
                                continue;
                        }
                        /* not found: only believable if mounts did not change */
                        if (read_seqretry(&mount_lock, nd->m_seq))
                                return false;
                }
                /* an automount point that still needs triggering ends RCU mode */
                return !(flags & DCACHE_NEED_AUTOMOUNT);
        }
}

/*
 * Cross whatever is mounted on @dentry (looked up on nd->path.mnt) and
 * leave the resulting location in *path.
 *
 * In RCU mode the lockless __follow_mount_rcu() is tried first; if that
 * gives up, the walk leaves RCU mode via try_to_unlazy_next() and the
 * step is redone with traverse_mounts().
 *
 * Returns 0 on success, -ECHILD if RCU mode could not be left, -EXDEV if
 * a mount was crossed while LOOKUP_NO_XDEV is set, or the error from
 * traverse_mounts().  When the traverse_mounts() part ends with a
 * non-zero result, the dentry reference in *path is dropped, and so is
 * the mount reference if it differs from nd->path.mnt.
 */
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
                          struct path *path)
{
        bool jumped;
        int ret;

        path->mnt = nd->path.mnt;
        path->dentry = dentry;
        if (nd->flags & LOOKUP_RCU) {
                /* saved so it can be restored if the RCU attempt fails */
                unsigned int seq = nd->next_seq;
                if (likely(__follow_mount_rcu(nd, path)))
                        return 0;
                // *path and nd->next_seq might've been clobbered
                path->mnt = nd->path.mnt;
                path->dentry = dentry;
                nd->next_seq = seq;
                if (!try_to_unlazy_next(nd, dentry))
                        return -ECHILD;
        }
        ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
        if (jumped) {
                /* crossing a mount is an error under LOOKUP_NO_XDEV */
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        ret = -EXDEV;
                else
                        nd->state |= ND_JUMPED;
        }
        if (unlikely(ret)) {
                dput(path->dentry);
                /* nd->path.mnt stays with nd; only drop a mount we moved onto */
                if (path->mnt != nd->path.mnt)
                        mntput(path->mnt);
        }
        return ret;
}

/*
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
 */
static struct dentry *lookup_dcache(const struct qstr *name,
                                    struct dentry *dir,
                                    unsigned int flags)
{
        struct dentry *dentry = d_lookup(dir, name);
        if (dentry) {
                int error = d_revalidate(dentry, flags);
                if (unlikely(error <= 0)) {
                        if (!error)
                                d_invalidate(dentry);
                        dput(dentry);
                        return ERR_PTR(error);
                }
        }
        return dentry;
}

/*
 * Parent directory has inode locked exclusive.  This is the one and
 * only case where ->lookup() gets called on dentries that are not
 * in-lookup - as a matter of fact, this only gets called when the
 * directory is guaranteed to have no in-lookup children at all.
 *
 * Returns whatever lookup_dcache() produced if that is non-NULL (a
 * dentry or an ERR_PTR), ERR_PTR(-ENOENT) for a dead directory,
 * ERR_PTR(-ENOMEM) if no dentry could be allocated, and otherwise the
 * outcome of the filesystem's ->lookup().
 */
struct dentry *lookup_one_qstr_excl(const struct qstr *name,
                                    struct dentry *base,
                                    unsigned int flags)
{
        struct dentry *found, *fresh;
        struct inode *dir;

        found = lookup_dcache(name, base, flags);
        dir = base->d_inode;
        if (found)
                return found;

        /* Don't create child dentry for a dead directory. */
        if (unlikely(IS_DEADDIR(dir)))
                return ERR_PTR(-ENOENT);

        fresh = d_alloc(base, name);
        if (unlikely(!fresh))
                return ERR_PTR(-ENOMEM);

        /* a non-NULL result from ->lookup() replaces the dentry we made */
        found = dir->i_op->lookup(dir, fresh, flags);
        if (likely(!found))
                return fresh;

        dput(fresh);
        return found;
}
EXPORT_SYMBOL(lookup_one_qstr_excl);

/*
 * Try to find nd->last under nd->path.dentry in the dcache, in either
 * RCU or ref-walk mode.
 *
 * Returns:
 *   a dentry          - found, and d_revalidate() returned > 0
 *   NULL              - not in the dcache; in RCU mode try_to_unlazy()
 *                       has succeeded by the time this is returned
 *   ERR_PTR(-ECHILD)  - RCU mode could neither be continued nor left
 *   ERR_PTR(status)   - d_revalidate() returned status <= 0; the dentry
 *                       reference is dropped, and for status == 0 the
 *                       dentry is invalidated first
 */
static struct dentry *lookup_fast(struct nameidata *nd)
{
        struct dentry *dentry, *parent = nd->path.dentry;
        int status = 1;

        /*
         * Rename seqlock is not required here because in the off chance
         * of a false negative due to a concurrent rename, the caller is
         * going to fall back to non-racy lookup.
         */
        if (nd->flags & LOOKUP_RCU) {
                dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq);
                if (unlikely(!dentry)) {
                        if (!try_to_unlazy(nd))
                                return ERR_PTR(-ECHILD);
                        return NULL;
                }

                /*
                 * This sequence count validates that the parent had no
                 * changes while we did the lookup of the dentry above.
                 */
                if (read_seqcount_retry(&parent->d_seq, nd->seq))
                        return ERR_PTR(-ECHILD);

                status = d_revalidate(dentry, nd->flags);
                if (likely(status > 0))
                        return dentry;
                /* revalidation did not pass in RCU mode: leave it, keeping dentry */
                if (!try_to_unlazy_next(nd, dentry))
                        return ERR_PTR(-ECHILD);
                if (status == -ECHILD)
                        /* we'd been told to redo it in non-rcu mode */
                        status = d_revalidate(dentry, nd->flags);
        } else {
                dentry = __d_lookup(parent, &nd->last);
                if (unlikely(!dentry))
                        return NULL;
                status = d_revalidate(dentry, nd->flags);
        }
        if (unlikely(status <= 0)) {
                /* 0 means "stale": invalidate before dropping our reference */
                if (!status)
                        d_invalidate(dentry);
                dput(dentry);
                return ERR_PTR(status);
        }
        return dentry;
}

/* Fast lookup failed, do it the slow way */
static struct dentry *__lookup_slow(const struct qstr *name,
                                    struct dentry *dir,
                                    unsigned int flags)
{
        struct dentry *dentry, *old;
        struct inode *inode = dir->d_inode;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

        /* Don't go there if it's already dead */
        if (unlikely(IS_DEADDIR(inode)))
                return ERR_PTR(-ENOENT);
again:
        dentry = d_alloc_parallel(dir, name, &wq);
        if (IS_ERR(dentry))
                return dentry;
        if (unlikely(!d_in_lookup(dentry))) {
                int error = d_revalidate(dentry, flags);
                if (unlikely(error <= 0)) {
                        if (!error) {
                                d_invalidate(dentry);
                                dput(dentry);
                                goto again;
                        }
                        dput(dentry);
                        dentry = ERR_PTR(error);
                }
        } else {
                old = inode->i_op->lookup(inode, dentry, flags);
                d_lookup_done(dentry);
                if (unlikely(old)) {
                        dput(dentry);
                        dentry = old;
                }
        }
        return dentry;
}

/*
 * Run __lookup_slow() with the inode of @dir locked shared for the
 * duration of the call.
 */
static struct dentry *lookup_slow(const struct qstr *name,
                                  struct dentry *dir,
                                  unsigned int flags)
{
        struct inode *dir_inode = dir->d_inode;
        struct dentry *dentry;

        inode_lock_shared(dir_inode);
        dentry = __lookup_slow(name, dir, flags);
        inode_unlock_shared(dir_inode);

        return dentry;
}

/*
 * Check MAY_EXEC permission on nd->inode.
 *
 * In RCU mode a non-blocking check (MAY_NOT_BLOCK) is tried first.  If it
 * does not succeed, the walk leaves RCU mode; an error other than -ECHILD
 * is then returned as is, while -ECHILD leads to a second, blocking
 * check.  Returns 0 or a negative error; -ECHILD if RCU mode could not
 * be left.
 */
static inline int may_lookup(struct mnt_idmap *idmap,
                             struct nameidata *nd)
{
        int err;

        if (!(nd->flags & LOOKUP_RCU))
                return inode_permission(idmap, nd->inode, MAY_EXEC);

        err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
        if (!err)
                return 0;                // success, keep going
        if (!try_to_unlazy(nd))
                return -ECHILD;                // redo it all non-lazy
        if (err != -ECHILD)
                return err;                // hard error

        return inode_permission(idmap, nd->inode, MAY_EXEC);
}

/*
 * Count one more symlink against MAXSYMLINKS and make sure nd->stack can
 * take another entry.
 *
 * Returns 0 when there is room, -ELOOP when the link count limit has
 * been reached, -ECHILD when RCU mode had to be left and that (or
 * grabbing @link) failed, and -ENOMEM when the stack could not be
 * allocated.
 */
static int reserve_stack(struct nameidata *nd, struct path *link)
{
        bool grabbed_link;

        if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
                return -ELOOP;

        /*
         * Growing is needed only when depth has reached EMBEDDED_LEVELS
         * while nd->stack still points at nd->internal.
         */
        if (likely(nd->depth != EMBEDDED_LEVELS || nd->stack != nd->internal))
                return 0;
        if (likely(nd_alloc_stack(nd)))
                return 0;

        if (!(nd->flags & LOOKUP_RCU))
                return -ENOMEM;

        // we need to grab link before we do unlazy.  And we can't skip
        // unlazy even if we fail to grab the link - cleanup needs it
        grabbed_link = legitimize_path(nd, link, nd->next_seq);
        if (!(try_to_unlazy(nd) && grabbed_link))
                return -ECHILD;

        /* out of RCU mode now: try the allocation once more */
        return nd_alloc_stack(nd) ? 0 : -ENOMEM;
}

/*
 * Flag bits for the 'flags' argument of walk_component(), step_into()
 * and pick_link():
 *
 *   WALK_TRAILING - a symlink found here is only followed if
 *                   LOOKUP_FOLLOW is set, and pick_link() additionally
 *                   runs may_follow_link() on it.
 *   WALK_MORE     - walk_component() does not call put_link() on the
 *                   current link-stack entry.
 *   WALK_NOFOLLOW - step_into() never follows a symlink.
 */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

/*
 * Push the symlink at @link (with inode @inode) onto nd->stack and fetch
 * its body.
 *
 * Returns:
 *   a string   - the part of the link body that still has to be walked;
 *                for an absolute link the walk has already been moved to
 *                the root by nd_jump_root() and leading slashes skipped
 *   NULL       - nothing left to walk ("pure jump"); the stack entry has
 *                been released again with put_link()
 *   ERR_PTR()  - failure.  If reserve_stack() fails outside RCU mode, the
 *                references in @link are dropped here.
 */
static const char *pick_link(struct nameidata *nd, struct path *link,
                     struct inode *inode, int flags)
{
        struct saved *last;
        const char *res;
        int error = reserve_stack(nd, link);

        if (unlikely(error)) {
                /* only in ref-walk mode is there a reference to drop */
                if (!(nd->flags & LOOKUP_RCU))
                        path_put(link);
                return ERR_PTR(error);
        }
        /* claim the next stack slot and record the link in it */
        last = nd->stack + nd->depth++;
        last->link = *link;
        clear_delayed_call(&last->done);
        last->seq = nd->next_seq;

        if (flags & WALK_TRAILING) {
                error = may_follow_link(nd, inode);
                if (unlikely(error))
                        return ERR_PTR(error);
        }

        /* symlink following disabled by the caller or by the mount */
        if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
                        unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
                return ERR_PTR(-ELOOP);

        if (!(nd->flags & LOOKUP_RCU)) {
                touch_atime(&last->link);
                cond_resched();
        } else if (atime_needs_update(&last->link, inode)) {
                /* the atime update is done only after leaving RCU mode */
                if (!try_to_unlazy(nd))
                        return ERR_PTR(-ECHILD);
                touch_atime(&last->link);
        }

        error = security_inode_follow_link(link->dentry, inode,
                                           nd->flags & LOOKUP_RCU);
        if (unlikely(error))
                return ERR_PTR(error);

        /* use the body stored in the inode if there is one */
        res = READ_ONCE(inode->i_link);
        if (!res) {
                const char * (*get)(struct dentry *, struct inode *,
                                struct delayed_call *);
                get = inode->i_op->get_link;
                if (nd->flags & LOOKUP_RCU) {
                        /*
                         * RCU mode: call ->get_link() with a NULL dentry
                         * first; on -ECHILD leave RCU mode and call it
                         * again with the real dentry.
                         */
                        res = get(NULL, inode, &last->done);
                        if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
                                res = get(link->dentry, inode, &last->done);
                } else {
                        res = get(link->dentry, inode, &last->done);
                }
                if (!res)
                        goto all_done;
                if (IS_ERR(res))
                        return res;
        }
        if (*res == '/') {
                /* absolute link: restart from the root, skip extra slashes */
                error = nd_jump_root(nd);
                if (unlikely(error))
                        return ERR_PTR(error);
                while (unlikely(*++res == '/'))
                        ;
        }
        if (*res)
                return res;
all_done: // pure jump
        put_link(nd);
        return NULL;
}

/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 *
 * NOTE: dentry must be what nd->next_seq had been sampled from.
 *
 * Crosses mounts on @dentry via handle_mounts() and then either moves nd
 * onto the result or hands a symlink to pick_link().
 *
 * Returns NULL once nd->path, nd->inode and nd->seq have been advanced,
 * an ERR_PTR on failure, and otherwise whatever pick_link() returns.
 */
static const char *step_into(struct nameidata *nd, int flags,
                     struct dentry *dentry)
{
        struct path path;
        struct inode *inode;
        int err = handle_mounts(nd, dentry, &path);

        if (err < 0)
                return ERR_PTR(err);
        inode = path.dentry->d_inode;
        if (likely(!d_is_symlink(path.dentry)) ||
           ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
           (flags & WALK_NOFOLLOW)) {
                /* not a symlink or should not follow */
                if (nd->flags & LOOKUP_RCU) {
                        /* make sure inode read above matches the dentry */
                        if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
                                return ERR_PTR(-ECHILD);
                        if (unlikely(!inode))
                                return ERR_PTR(-ENOENT);
                } else {
                        /* ref-walk: drop what nd held before replacing it */
                        dput(nd->path.dentry);
                        if (nd->path.mnt != path.mnt)
                                mntput(nd->path.mnt);
                }
                nd->path = path;
                nd->inode = inode;
                nd->seq = nd->next_seq;
                return NULL;
        }
        if (nd->flags & LOOKUP_RCU) {
                /* make sure that d_is_symlink above matches inode */
                if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
                        return ERR_PTR(-ECHILD);
        } else {
                /*
                 * path shares its mount with nd->path: take one more
                 * reference - presumably so that the link passed to
                 * pick_link() holds a mount reference of its own.
                 */
                if (path.mnt == nd->path.mnt)
                        mntget(path.mnt);
        }
        return pick_link(nd, &path, inode, flags);
}

/*
 * RCU-mode handling of "..".
 *
 * Returns the dentry to step into next, with nd->next_seq set for it:
 * normally the parent of nd->path.dentry, or nd->path.dentry itself when
 * the walk is already at nd->root (or no mountpoint to climb to exists).
 * If the walk sits on the root of a mount, nd->path is first moved to
 * the mountpoint found by choose_mountpoint_rcu().
 *
 * Every failure is reported as ERR_PTR(-ECHILD): failed sequence checks,
 * a disconnected path, and also the LOOKUP_NO_XDEV / LOOKUP_BENEATH
 * cases.
 */
static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
{
        struct dentry *parent, *old;

        if (path_equal(&nd->path, &nd->root))
                goto in_root;
        if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                /* at the root of a mount: climb to its mountpoint first */
                struct path path;
                unsigned seq;
                if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
                                           &nd->root, &path, &seq))
                        goto in_root;
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        return ERR_PTR(-ECHILD);
                nd->path = path;
                nd->inode = path.dentry->d_inode;
                nd->seq = seq;
                // makes sure that non-RCU pathwalk could reach this state
                if (read_seqretry(&mount_lock, nd->m_seq))
                        return ERR_PTR(-ECHILD);
                /* we know that mountpoint was pinned */
        }
        old = nd->path.dentry;
        parent = old->d_parent;
        nd->next_seq = read_seqcount_begin(&parent->d_seq);
        // makes sure that non-RCU pathwalk could reach this state
        if (read_seqcount_retry(&old->d_seq, nd->seq))
                return ERR_PTR(-ECHILD);
        if (unlikely(!path_connected(nd->path.mnt, parent)))
                return ERR_PTR(-ECHILD);
        return parent;
in_root:
        /* staying put is only valid if the mount tree did not change */
        if (read_seqretry(&mount_lock, nd->m_seq))
                return ERR_PTR(-ECHILD);
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return ERR_PTR(-ECHILD);
        nd->next_seq = nd->seq;
        return nd->path.dentry;
}

static struct dentry *follow_dotdot(struct nameidata *nd)
{
        struct dentry *parent;

        if (path_equal(&nd->path, &nd->root))
                goto in_root;
        if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                struct path path;

                if (!choose_mountpoint(real_mount(nd->path.mnt),
                                       &nd->root, &path))
                        goto in_root;
                path_put(&nd->path);
                nd->path = path;
                nd->inode = path.dentry->d_inode;
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        return ERR_PTR(-EXDEV);
        }
        /* rare case of legitimate dget_parent()... */
        parent = dget_parent(nd->path.dentry);
        if (unlikely(!path_connected(nd->path.mnt, parent))) {
                dput(parent);
                return ERR_PTR(-ENOENT);
        }
        return parent;

in_root:
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return ERR_PTR(-EXDEV);
        return dget(nd->path.dentry);
}

/*
 * Handle a "." or ".." component.  Anything other than LAST_DOTDOT needs
 * no work and yields NULL.
 *
 * For "..": make sure nd->root is set, pick the parent with
 * follow_dotdot_rcu() or follow_dotdot() depending on the walk mode, and
 * step into it without following symlinks.  Returns NULL on success or
 * an ERR_PTR; for LOOKUP_IS_SCOPED lookups, ERR_PTR(-EAGAIN) is returned
 * if mount_lock or rename_lock changed since nd->m_seq / nd->r_seq.
 */
static const char *handle_dots(struct nameidata *nd, int type)
{
        const char *error;
        struct dentry *parent;

        if (type != LAST_DOTDOT)
                return NULL;

        if (!nd->root.mnt) {
                error = ERR_PTR(set_root(nd));
                if (error)
                        return error;
        }

        parent = (nd->flags & LOOKUP_RCU) ? follow_dotdot_rcu(nd)
                                          : follow_dotdot(nd);
        if (IS_ERR(parent))
                return ERR_CAST(parent);

        error = step_into(nd, WALK_NOFOLLOW, parent);
        if (unlikely(error))
                return error;

        if (likely(!(nd->flags & LOOKUP_IS_SCOPED)))
                return NULL;

        /*
         * If there was a racing rename or mount along our path, then we
         * can't be sure that ".." hasn't jumped above nd->root (and so
         * userspace should retry or use some fallback).
         */
        smp_rmb();
        if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq) ||
            __read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
                return ERR_PTR(-EAGAIN);

        return NULL;
}

static const char *walk_component(struct nameidata *nd, int flags)
{
        struct dentry *dentry;
        /*
         * "." and ".." are special - ".." especially so because it has
         * to be able to know about the current root directory and
         * parent relationships.
         */
        if (unlikely(nd->last_type != LAST_NORM)) {
                if (!(flags & WALK_MORE) && nd->depth)
                        put_link(nd);
                return handle_dots(nd, nd->last_type);
        }
        dentry = lookup_fast(nd);
        if (IS_ERR(dentry))
                return ERR_CAST(dentry);
        if (unlikely(!dentry)) {
                dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
                if (IS_ERR(dentry))
                        return ERR_CAST(dentry);
        }
        if (!(flags & WALK_MORE) && nd->depth)
                put_link(nd);
        return step_into(nd, flags, dentry);
}

/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>

#ifdef HASH_MIX

/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */

#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 *
 * Usage: HASH_MIX(x, y, a) folds the input word a into the two state
 * words x and y, updating both in place.  x and y are expanded several
 * times and assigned to, so they must be plain lvalues without side
 * effects; a is expanded once.
 */
#define HASH_MIX(x, y, a)        \
        (        x ^= (a),        \
        y ^= x,        x = rol64(x,12),\
        x += y,        y = rol64(y,45),\
        y *= 9                        )

/*
 * Collapse the two state words into a single 32-bit hash value: the
 * upper 32 bits of the second GOLDEN_RATIO_64 multiplication.  This has
 * to be fast, but latency matters a little less than in HASH_MIX, since
 * a fair amount of other work happens before the hash value is used.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
        unsigned long folded = (y ^ (x * GOLDEN_RATIO_64)) * GOLDEN_RATIO_64;

        return folded >> 32;
}

#else        /* 32-bit case */

/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 *
 * Usage: HASH_MIX(x, y, a) folds the input word a into the two state
 * words x and y, updating both in place.  x and y are expanded several
 * times and assigned to, so they must be plain lvalues without side
 * effects; a is expanded once.
 */
#define HASH_MIX(x, y, a)        \
        (        x ^= (a),        \
        y ^= x,        x = rol32(x, 7),\
        x += y,        y = rol32(y,20),\
        y *= 9                        )

/*
 * 32-bit variant: collapse the two state words into one hash value with
 * two chained __hash_32() steps.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
        /* Use arch-optimized multiply if one exists */
        unsigned int hashed_x = __hash_32(x);

        return __hash_32(y ^ hashed_x);
}

#endif

/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        unsigned long a, x = 0, y = (unsigned long)salt;

        while (len) {
                a = load_unaligned_zeropad(name);
                if (len < sizeof(unsigned long)) {
                        /* final partial word: keep only its payload bytes */
                        x ^= a & bytemask_from_count(len);
                        break;
                }
                HASH_MIX(x, y, a);
                name += sizeof(unsigned long);
                len -= sizeof(unsigned long);
        }
        return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);

/*
 * Return the "hash_len" (hash and length) of a null-terminated string.
 * Whole words are mixed in until one containing a zero byte shows up;
 * that last word contributes only the bytes in front of the terminator.
 */
u64 hashlen_string(const void *salt, const char *name)
{
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
        unsigned long x = 0, y = (unsigned long)salt;
        unsigned long a, adata, mask;
        unsigned long len = 0;

        for (;;) {
                a = load_unaligned_zeropad(name+len);
                if (has_zero(a, &adata, &constants))
                        break;
                HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
        }

        adata = prep_zero_mask(a, adata, &constants);
        mask = create_zero_mask(adata);
        x ^= a & zero_bytemask(mask);

        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);

/*
 * Calculate the length and hash of the path component starting at @name
 * and return them combined as a "hash_len".  The component ends at the
 * first NUL or '/' byte.
 */
static inline u64 hash_name(const void *salt, const char *name)
{
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
        unsigned long x = 0, y = (unsigned long)salt;
        unsigned long a, b, adata, bdata, mask;
        unsigned long len = 0;

        for (;;) {
                a = load_unaligned_zeropad(name+len);
                /* a '/' byte in a becomes a zero byte in b */
                b = a ^ REPEAT_BYTE('/');
                /*
                 * Bitwise '|' on purpose: both has_zero() calls must run
                 * so that adata and bdata are both filled in for the
                 * code after the loop.
                 */
                if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))
                        break;
                HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
        }

        adata = prep_zero_mask(a, adata, &constants);
        bdata = prep_zero_mask(b, bdata, &constants);
        mask = create_zero_mask(adata | bdata);
        x ^= a & zero_bytemask(mask);

        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}

#else        /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */

/*
 * Return the hash of a string of known length, feeding it to
 * partial_name_hash() one byte at a time.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        unsigned long hash = init_name_hash(salt);
        unsigned int i;

        for (i = 0; i < len; i++)
                hash = partial_name_hash((unsigned char)name[i], hash);
        return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);

/*
 * Return the "hash_len" (hash and length) of a null-terminated string,
 * hashing one byte at a time.  An empty string yields length 0.
 */
u64 hashlen_string(const void *salt, const char *name)
{
        unsigned long hash = init_name_hash(salt);
        unsigned long len, c;

        for (len = 0; (c = (unsigned char)name[len]) != 0; len++)
                hash = partial_name_hash(c, hash);

        return hashlen_create(end_name_hash(hash), len);
}
EXPORT_SYMBOL(hashlen_string);

/*
 * We know there's a real path component here of at least one
 * character, so the first byte is hashed unconditionally; after that we
 * stop at the first NUL or '/'.
 */
static inline u64 hash_name(const void *salt, const char *name)
{
        unsigned long hash = init_name_hash(salt);
        unsigned long len = 0;
        unsigned long c = (unsigned char)name[0];

        for (;;) {
                hash = partial_name_hash(c, hash);
                c = (unsigned char)name[++len];
                if (!c || c == '/')
                        break;
        }
        return hashlen_create(end_name_hash(hash), len);
}

#endif

/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
        int depth = 0; // depth <= nd->depth
        int err;

        /* Until a component has been parsed, report LAST_ROOT with LOOKUP_PARENT set. */
        nd->last_type = LAST_ROOT;
        nd->flags |= LOOKUP_PARENT;
        /* @name may be an ERR_PTR() value; pass that error straight through. */
        if (IS_ERR(name))
                return PTR_ERR(name);
        /* Skip any leading slashes. */
        while (*name=='/')
                name++;
        /* Nothing left after the slashes: there is no component to walk. */
        if (!*name) {
                nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
                return 0;
        }

        /* At this point we know we have a real path component. */
        for(;;) {
                struct mnt_idmap *idmap;
                const char *link;
                u64 hash_len;
                int type;

                /* Check that lookup is permitted in the current directory. */
                idmap = mnt_idmap(nd->path.mnt);
                err = may_lookup(idmap, nd);
                if (err)
                        return err;

                /* Hash the component (salted with the parent dentry) and get its length. */
                hash_len = hash_name(nd->path.dentry, name);

                /* Classify the component: ".", ".." or a normal name. */
                type = LAST_NORM;
                if (name[0] == '.') switch (hashlen_len(hash_len)) {
                        case 2:
                                if (name[1] == '.') {
                                        type = LAST_DOTDOT;
                                        nd->state |= ND_JUMPED;
                                }
                                break;
                        case 1:
                                type = LAST_DOT;
                }
                if (likely(type == LAST_NORM)) {
                        struct dentry *parent = nd->path.dentry;
                        nd->state &= ~ND_JUMPED;
                        /*
                         * The parent provides its own ->d_hash(); call it and
                         * take over whatever hash_len/name it leaves in 'this'.
                         */
                        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
                                struct qstr this = { { .hash_len = hash_len }, .name = name };
                                err = parent->d_op->d_hash(parent, &this);
                                if (err < 0)
                                        return err;
                                hash_len = this.hash_len;
                                name = this.name;
                        }
                }

                /* Record the component just parsed as the "last" one seen so far. */
                nd->last.hash_len = hash_len;
                nd->last.name = name;
                nd->last_type = type;

                /* Step past the component; a NUL here means it was the final one. */
                name += hashlen_len(hash_len);
                if (!*name)
                        goto OK;
                /*
                 * If it wasn't NUL, we know it was '/'. Skip that
                 * slash, and continue until no more slashes.
                 */
                do {
                        name++;
                } while (unlikely(*name == '/'));
                if (unlikely(!*name)) {
OK:
                        /* pathname or trailing symlink, done */
                        if (!depth) {
                                /*
                                 * Outermost name exhausted: remember the owner
                                 * and mode of the current inode, drop
                                 * LOOKUP_PARENT and leave the last component
                                 * in nd->last for the caller.
                                 */
                                nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode);
                                nd->dir_mode = nd->inode->i_mode;
                                nd->flags &= ~LOOKUP_PARENT;
                                return 0;
                        }
                        /* last component of nested symlink */
                        /* Pop the name that was saved when this symlink was entered. */
                        name = nd->stack[--depth].name;
                        link = walk_component(nd, 0);
                } else {
                        /* not the last component */
                        link = walk_component(nd, WALK_MORE);
                }
                if (unlikely(link)) {
                        if (IS_ERR(link))
                                return PTR_ERR(link);
                        /* a symlink to follow */
                        /* Save the rest of the current name and continue with the link body. */
                        nd->stack[depth++].name = name;
                        name = link;
                        continue;
                }
                /*
                 * More components follow, so the current dentry must support
                 * lookup. With LOOKUP_RCU set, a failed try_to_unlazy() yields
                 * -ECHILD instead of -ENOTDIR.
                 */
                if (unlikely(!d_can_lookup(nd->path.dentry))) {
                        if (nd->flags & LOOKUP_RCU) {
                                if (!try_to_unlazy(nd))
                                        return -ECHILD;
                        }
                        return -ENOTDIR;
                }
        }
}

/* must be paired with terminate_walk() */
/*
 * Set up @nd for a walk of nd->name according to @flags: choose the starting
 * path (preset root, filesystem root, current working directory or the
 * directory referred to by nd->dfd) and return the pathname string to walk,
 * or an ERR_PTR() on failure.
 */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
        int error;
        const char *s = nd->name->name;

        /* LOOKUP_CACHED requires RCU, ask caller to retry */
        if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
                return ERR_PTR(-EAGAIN);

        /* An empty pathname is never walked with LOOKUP_RCU. */
        if (!*s)
                flags &= ~LOOKUP_RCU;
        if (flags & LOOKUP_RCU)
                rcu_read_lock();
        else
                nd->seq = nd->next_seq = 0;

        nd->flags = flags;
        nd->state |= ND_JUMPED;

        /* Sample the mount_lock and rename_lock sequence counts before picking a start. */
        nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
        nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
        smp_rmb();

        /* The caller supplied nd->root up front: start the walk from it. */
        if (nd->state & ND_ROOT_PRESET) {
                struct dentry *root = nd->root.dentry;
                struct inode *inode = root->d_inode;
                /* A non-empty path can only be walked from something that supports lookup. */
                if (*s && unlikely(!d_can_lookup(root)))
                        return ERR_PTR(-ENOTDIR);
                nd->path = nd->root;
                nd->inode = inode;
                if (flags & LOOKUP_RCU) {
                        /* No reference is taken here; only the dentry sequence count is recorded. */
                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                        nd->root_seq = nd->seq;
                } else {
                        path_get(&nd->path);
                }
                return s;
        }

        /* No root chosen yet. */
        nd->root.mnt = NULL;

        /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
        if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
                error = nd_jump_root(nd);
                if (unlikely(error))
                        return ERR_PTR(error);
                return s;
        }

        /* Relative pathname -- get the starting-point it is relative to. */
        if (nd->dfd == AT_FDCWD) {
                if (flags & LOOKUP_RCU) {
                        struct fs_struct *fs = current->fs;
                        unsigned seq;

                        /* Copy fs->pwd under fs->seq, retrying if it changed meanwhile. */
                        do {
                                seq = read_seqcount_begin(&fs->seq);
                                nd->path = fs->pwd;
                                nd->inode = nd->path.dentry->d_inode;
                                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                        } while (read_seqcount_retry(&fs->seq, seq));
                } else {
                        get_fs_pwd(current->fs, &nd->path);
                        nd->inode = nd->path.dentry->d_inode;
                }
        } else {
                /* Caller must check execute permissions on the starting path component */
                struct fd f = fdget_raw(nd->dfd);
                struct dentry *dentry;

                if (!f.file)
                        return ERR_PTR(-EBADF);

                /*
                 * With LOOKUP_LINKAT_EMPTY, a file opened with different
                 * credentials is only accepted if the caller has
                 * CAP_DAC_READ_SEARCH in the opener's user namespace.
                 */
                if (flags & LOOKUP_LINKAT_EMPTY) {
                        if (f.file->f_cred != current_cred() &&
                            !ns_capable(f.file->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
                                fdput(f);
                                return ERR_PTR(-ENOENT);
                        }
                }

                dentry = f.file->f_path.dentry;

                /* A non-empty relative path needs a starting point that supports lookup. */
                if (*s && unlikely(!d_can_lookup(dentry))) {
                        fdput(f);
                        return ERR_PTR(-ENOTDIR);
                }

                nd->path = f.file->f_path;
                if (flags & LOOKUP_RCU) {
                        nd->inode = nd->path.dentry->d_inode;
                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                } else {
                        path_get(&nd->path);
                        nd->inode = nd->path.dentry->d_inode;
                }
                /* nd->path has been set up; the fd reference is no longer needed. */
                fdput(f);
        }

        /* For scoped-lookups we need to set the root to the dirfd as well. */
        if (flags & LOOKUP_IS_SCOPED) {
                nd->root = nd->path;
                if (flags & LOOKUP_RCU) {
                        nd->root_seq = nd->seq;
                } else {
                        /* Take a separate reference for the root and flag it as grabbed. */
                        path_get(&nd->root);
                        nd->state |= ND_ROOT_GRABBED;
                }
        }
        return s;
}

static inline const char *lookup_last(struct nameidata *nd)
{
        if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
                nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

        return walk_component(nd, WALK_TRAILING);
}

/*
 * Run step_into() on the dentry nd already points at, with WALK_NOFOLLOW.
 * Outside of LOOKUP_RCU mode an extra dentry reference is taken first.
 * Returns PTR_ERR() of the step_into() result.
 */
static int handle_lookup_down(struct nameidata *nd)
{
        struct dentry *dentry = nd->path.dentry;
        const char *res;

        if (!(nd->flags & LOOKUP_RCU))
                dget(dentry);

        nd->next_seq = nd->seq;
        res = step_into(nd, WALK_NOFOLLOW, dentry);
        return PTR_ERR(res);
}

/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
        const char *s = path_init(nd, flags);
        int err;

        /* LOOKUP_DOWN: step into the starting point before walking the name. */
        if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
                err = handle_lookup_down(nd);
                if (unlikely(err < 0))
                        s = ERR_PTR(err);
        }

        /*
         * Walk the name, then its last component; lookup_last() hands back
         * a further string to walk, or NULL once we are done.
         */
        for (;;) {
                err = link_path_walk(s, nd);
                if (err)
                        break;
                s = lookup_last(nd);
                if (!s)
                        break;
        }

        if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
                err = handle_lookup_down(nd);
                nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
        }
        if (!err)
                err = complete_walk(nd);

        /* LOOKUP_DIRECTORY: the result must be something we can look up in. */
        if (!err && (nd->flags & LOOKUP_DIRECTORY) &&
            !d_can_lookup(nd->path.dentry))
                err = -ENOTDIR;

        /* On success move the path out of nd before terminate_walk() runs. */
        if (!err) {
                *path = nd->path;
                nd->path.mnt = NULL;
                nd->path.dentry = NULL;
        }
        terminate_walk(nd);
        return err;
}

/*
 * Look up @name relative to @dfd (or @root, if given) and store the result
 * in @path.  Returns 0 on success or a negative errno; an ERR_PTR() @name
 * is turned into its error code.
 */
int filename_lookup(int dfd, struct filename *name, unsigned flags,
                    struct path *path, struct path *root)
{
        struct nameidata nd;
        int retval;

        if (IS_ERR(name))
                return PTR_ERR(name);

        set_nameidata(&nd, dfd, name, root);

        /*
         * First attempt with LOOKUP_RCU; retry without it on -ECHILD and
         * with LOOKUP_REVAL on -ESTALE.
         */
        retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
        if (unlikely(retval == -ECHILD))
                retval = path_lookupat(&nd, flags, path);
        if (unlikely(retval == -ESTALE))
                retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);

        if (likely(retval == 0)) {
                unsigned int aflags = 0;

                if (flags & LOOKUP_MOUNTPOINT)
                        aflags = AUDIT_INODE_NOEVAL;
                audit_inode(name, path->dentry, aflags);
        }

        restore_nameidata();
        return retval;
}

/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_parentat(struct nameidata *nd, unsigned flags,
                                struct path *parent)
{
        const char *s = path_init(nd, flags);
        int err;

        err = link_path_walk(s, nd);
        if (err)
                goto out;

        err = complete_walk(nd);
        if (err)
                goto out;

        /* Move the path into *parent and clear nd->path before terminate_walk(). */
        *parent = nd->path;
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out:
        terminate_walk(nd);
        return err;
}

/* Note: this does not consume "name" */
static int __filename_parentat(int dfd, struct filename *name,
                               unsigned int flags, struct path *parent,
                               struct qstr *last, int *type,
                               const struct path *root)
{
        struct nameidata nd;
        int retval;

        if (IS_ERR(name))
                return PTR_ERR(name);

        set_nameidata(&nd, dfd, name, root);

        /*
         * First attempt with LOOKUP_RCU; retry without it on -ECHILD and
         * with LOOKUP_REVAL on -ESTALE.
         */
        retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
        if (unlikely(retval == -ECHILD))
                retval = path_parentat(&nd, flags, parent);
        if (unlikely(retval == -ESTALE))
                retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);

        /* On success report the final component and its type to the caller. */
        if (likely(retval == 0)) {
                *last = nd.last;
                *type = nd.last_type;
                audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
        }

        restore_nameidata();
        return retval;
}

/* __filename_parentat() without a caller-supplied root. */
static int filename_parentat(int dfd, struct filename *name,
                             unsigned int flags, struct path *parent,
                             struct qstr *last, int *type)
{
        const struct path *no_root = NULL;

        return __filename_parentat(dfd, name, flags, parent, last, type,
                                   no_root);
}

/* does lookup, returns the object with parent locked */
static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)
{
        struct qstr last;
        struct dentry *child;
        int type;
        int error;

        error = filename_parentat(dfd, name, 0, path, &last, &type);
        if (error)
                return ERR_PTR(error);

        /* Only a normal last component is accepted here. */
        if (unlikely(type != LAST_NORM)) {
                path_put(path);
                return ERR_PTR(-EINVAL);
        }

        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
        child = lookup_one_qstr_excl(&last, path->dentry, 0);
        if (!IS_ERR(child))
                return child;   /* parent stays locked, @path stays referenced */

        /* Lookup failed: undo the lock and drop the parent path. */
        inode_unlock(path->dentry->d_inode);
        path_put(path);
        return child;
}

/*
 * __kern_path_locked() for a kernel-supplied pathname string, resolved
 * relative to AT_FDCWD.  The temporary struct filename is released with
 * putname() before returning.
 */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
        struct filename *filename;
        struct dentry *res;

        filename = getname_kernel(name);
        res = __kern_path_locked(AT_FDCWD, filename, path);
        putname(filename);

        return res;
}

/*
 * __kern_path_locked() for a user-supplied pathname, resolved relative
 * to @dfd.  The temporary struct filename is released with putname()
 * before returning.
 */
struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path)
{
        struct filename *filename;
        struct dentry *res;

        filename = getname(name);
        res = __kern_path_locked(dfd, filename, path);
        putname(filename);

        return res;
}
EXPORT_SYMBOL(user_path_locked_at);

/*
 * filename_lookup() for a kernel-supplied pathname string, resolved
 * relative to AT_FDCWD with no preset root.
 */
int kern_path(const char *name, unsigned int flags, struct path *path)
{
        struct filename *filename;
        int ret;

        filename = getname_kernel(name);
        ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL);
        putname(filename);

        return ret;
}
EXPORT_SYMBOL(kern_path);

/**
 * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
 * @filename: filename structure
 * @flags: lookup flags
 * @parent: pointer to struct path to fill
 * @last: last component
 * @type: type of the last component
 * @root: pointer to struct path of the base directory
 */
int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
                           struct path *parent, struct qstr *last, int *type,
                           const struct path *root)
{
        const int dfd = AT_FDCWD;

        return __filename_parentat(dfd, filename, flags, parent, last, type,
                                   root);
}
EXPORT_SYMBOL(vfs_path_parent_lookup);

/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
 * @path: pointer to struct path to fill
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
                    const char *name, unsigned int flags,
                    struct path *path)
{
        struct path root = {.mnt = mnt, .dentry = dentry};
        struct filename *filename = getname_kernel(name);
        int ret;

        /* the first argument of filename_lookup() is ignored with root */
        ret = filename_lookup(AT_FDCWD, filename, flags, path, &root);

        putname(filename);
        return ret;
}
EXPORT_SYMBOL(vfs_path_lookup);

/*
 * Shared front end of the lookup_one*() helpers: fill in @this from
 * @name/@len, reject unacceptable names with -EACCES, let the filesystem
 * rehash if it has ->d_hash(), and finish with a MAY_EXEC permission
 * check on @base.
 */
static int lookup_one_common(struct mnt_idmap *idmap,
                             const char *name, struct dentry *base, int len,
                             struct qstr *this)
{
        const unsigned char *p;

        this->name = name;
        this->len = len;
        this->hash = full_name_hash(base, name, len);

        /* Empty names and names rejected by is_dot_dotdot() are refused. */
        if (!len || is_dot_dotdot(name, len))
                return -EACCES;

        /* A single component may contain neither '/' nor an embedded NUL. */
        for (p = (const unsigned char *)name; len; len--, p++) {
                if (*p == '/' || *p == '\0')
                        return -EACCES;
        }

        /*
         * See if the low-level filesystem might want
         * to use its own hash..
         */
        if (base->d_flags & DCACHE_OP_HASH) {
                int err = base->d_op->d_hash(base, this);

                if (err < 0)
                        return err;
        }

        return inode_permission(idmap, base->d_inode, MAY_EXEC);
}

/**
 * try_lookup_one_len - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Look up a dentry by name in the dcache, returning NULL if it does not
 * currently exist.  The function does not try to create a dentry.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
        struct qstr this;
        int err;

        /* Complain once if the caller did not lock the directory inode. */
        WARN_ON_ONCE(!inode_is_locked(base->d_inode));

        err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
        if (!err)
                return lookup_dcache(&this, base, 0);

        return ERR_PTR(err);
}
EXPORT_SYMBOL(try_lookup_one_len);

/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
        struct qstr this;
        struct dentry *dentry;
        int err;

        /* Complain once if the caller did not lock the directory inode. */
        WARN_ON_ONCE(!inode_is_locked(base->d_inode));

        err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        /* Try the dcache first; only a NULL result goes to the slow path. */
        dentry = lookup_dcache(&this, base, 0);
        if (dentry)
                return dentry;

        return __lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one_len);

/**
 * lookup_one - filesystem helper to lookup single pathname component
 * @idmap:        idmap of the mount the lookup is performed from
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name,
                          struct dentry *base, int len)
{
        struct qstr this;
        struct dentry *dentry;
        int err;

        /* Complain once if the caller did not lock the directory inode. */
        WARN_ON_ONCE(!inode_is_locked(base->d_inode));

        err = lookup_one_common(idmap, name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        /* Try the dcache first; only a NULL result goes to the slow path. */
        dentry = lookup_dcache(&this, base, 0);
        if (dentry)
                return dentry;

        return __lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one);

/**
 * lookup_one_unlocked - filesystem helper to lookup single pathname component
 * @idmap:        idmap of the mount the lookup is performed from
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
                                   const char *name, struct dentry *base,
                                   int len)
{
        struct dentry *ret;
        struct qstr this;
        int err;

        err = lookup_one_common(idmap, name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        /* Try the dcache first; only a NULL result goes to lookup_slow(). */
        ret = lookup_dcache(&this, base, 0);
        return ret ? ret : lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one_unlocked);

/**
 * lookup_one_positive_unlocked - filesystem helper to lookup single
 *                                  pathname component
 * @idmap:        idmap of the mount the lookup is performed from
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
 * known positive or ERR_PTR(). This is what most of the users want.
 *
 * Note that pinned negative with unlocked parent _can_ become positive at any
 * time, so callers of lookup_one_unlocked() need to be very careful; pinned
 * positives have ->d_inode stable, so this one avoids such problems.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The helper should be called without i_mutex held.
 */
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
                                            const char *name,
                                            struct dentry *base, int len)
{
        struct dentry *ret = lookup_one_unlocked(idmap, name, base, len);

        /* Errors are passed through untouched. */
        if (IS_ERR(ret))
                return ret;

        /* Anything not flagged negative is handed back as is. */
        if (!d_flags_negative(smp_load_acquire(&ret->d_flags)))
                return ret;

        /* Negative dentry: drop it and report -ENOENT instead. */
        dput(ret);
        return ERR_PTR(-ENOENT);
}
EXPORT_SYMBOL(lookup_one_positive_unlocked);

/**
 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_len_unlocked(const char *name,
                                       struct dentry *base, int len)
{
        return lookup_one_unlocked(&nop_mnt_idmap, name, base, len);
}
EXPORT_SYMBOL(lookup_one_len_unlocked);

/*
 * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
 * on negatives.  Returns known positive or ERR_PTR(); that's what
 * most of the users want.  Note that pinned negative with unlocked parent
 * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
 * need to be very careful; pinned positives have ->d_inode stable, so
 * this one avoids such problems.
 */
struct dentry *lookup_positive_unlocked(const char *name,
                                       struct dentry *base, int len)
{
        return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len);
}
EXPORT_SYMBOL(lookup_positive_unlocked);

#ifdef CONFIG_UNIX98_PTYS
/*
 * Find something mounted on "pts" in the same directory as the input
 * path.  Returns 0 with @path updated, or -ENOENT.
 */
int path_pts(struct path *path)
{
        struct qstr this = QSTR_INIT("pts", 3);
        struct dentry *parent;
        struct dentry *child;

        parent = dget_parent(path->dentry);
        if (unlikely(!path_connected(path->mnt, parent))) {
                dput(parent);
                return -ENOENT;
        }

        /* Replace the caller's dentry with its parent. */
        dput(path->dentry);
        path->dentry = parent;

        /* On failure @path is left pointing at the parent. */
        child = d_hash_and_lookup(parent, &this);
        if (IS_ERR_OR_NULL(child))
                return -ENOENT;

        /* Move on to the "pts" child and follow whatever is mounted there. */
        path->dentry = child;
        dput(parent);
        follow_down(path, 0);
        return 0;
}
#endif

/*
 * filename_lookup() for a user-supplied pathname: @name is fetched with
 * getname_flags() (which also receives @empty), looked up relative to
 * @dfd, and the temporary struct filename is released before returning.
 */
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
                 struct path *path, int *empty)
{
        struct filename *filename;
        int ret;

        filename = getname_flags(name, flags, empty);
        ret = filename_lookup(dfd, filename, flags, path, NULL);
        putname(filename);

        return ret;
}
EXPORT_SYMBOL(user_path_at_empty);

/*
 * Returns 0 if the current fsuid owns @inode or @dir (as mapped through
 * @idmap); otherwise returns non-zero unless the caller has CAP_FOWNER
 * with respect to @inode.
 */
int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
                   struct inode *inode)
{
        kuid_t fsuid = current_fsuid();

        if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid) ||
            vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid))
                return 0;

        return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER);
}
EXPORT_SYMBOL(__check_sticky);

/*
 *        Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *        a. be owner of dir, or
 *        b. be owner of victim, or
 *        c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
 *     links pointing to it.
 *  7. If the victim has an unknown uid or gid we can't change the inode.
 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  9. If we were asked to remove a non-directory and victim is a directory - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */
static int may_delete(struct mnt_idmap *idmap, struct inode *dir,
                      struct dentry *victim, bool isdir)
{
        struct inode *inode = d_backing_inode(victim);
        int error;

        /* A negative dentry has nothing to remove. */
        if (d_is_negative(victim))
                return -ENOENT;
        BUG_ON(!inode);

        /* The victim must be a child of the directory we were given. */
        BUG_ON(victim->d_parent->d_inode != dir);

        /* Inode writeback is not safe when the uid or gid are invalid. */
        if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
            !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
                return -EOVERFLOW;

        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

        /* Write and exec permission on the directory are required. */
        error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
        /* Nothing may be removed from an append-only directory. */
        if (IS_APPEND(dir))
                return -EPERM;

        /*
         * Sticky-directory restrictions, append-only/immutable/swapfile
         * victims and victims with unmapped ids are all refused with -EPERM.
         */
        if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) ||
            IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
            HAS_UNMAPPED_ID(idmap, inode))
                return -EPERM;
        /* The victim's type has to match what the caller asked to remove. */
        if (isdir) {
                if (!d_is_dir(victim))
                        return -ENOTDIR;
                if (IS_ROOT(victim))
                        return -EBUSY;
        } else if (d_is_dir(victim))
                return -EISDIR;
        if (IS_DEADDIR(dir))
                return -ENOENT;
        /* Victims flagged DCACHE_NFSFS_RENAMED are reported as busy. */
        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
                return -EBUSY;
        return 0;
}

/*        Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
 */
static inline int may_create(struct mnt_idmap *idmap,
                             struct inode *dir, struct dentry *child)
{
        audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
        if (child->d_inode)
                return -EEXIST;
        if (IS_DEADDIR(dir))
                return -ENOENT;
        if (!fsuidgid_has_mapping(dir->i_sb, idmap))
                return -EOVERFLOW;

        return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
}

// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
/*
 * Lock the inodes of both directories, choosing the order from their
 * ancestry.  Returns the dentry found when one directory is an ancestor of
 * the other (see the comments at each return), NULL when neither is, or
 * ERR_PTR(-EXDEV) - after dropping ->s_vfs_rename_mutex - when no common
 * ancestor was found.
 */
static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
{
        struct dentry *p = p1, *q = p2, *r;

        /* Walk up from p1 until we either hit p2 or reach a dentry that is its own parent. */
        while ((r = p->d_parent) != p2 && r != p)
                p = r;
        if (r == p2) {
                // p is a child of p2 and an ancestor of p1 or p1 itself
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
                return p;
        }
        // p is the root of connected component that contains p1
        // p2 does not occur on the path from p to p1
        /* Walk up from p2 until we hit p1, hit p, or reach a dentry that is its own parent. */
        while ((r = q->d_parent) != p1 && r != p && r != q)
                q = r;
        if (r == p1) {
                // q is a child of p1 and an ancestor of p2 or p2 itself
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
                return q;
        } else if (likely(r == p)) {
                // both p2 and p1 are descendants of p
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
                return NULL;
        } else { // no common ancestor at the time we'd been called
                /* Fail without any inode locked and with ->s_vfs_rename_mutex released. */
                mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
                return ERR_PTR(-EXDEV);
        }
}

/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
        /* Two different directories: take the rename mutex, then lock both. */
        if (p1 != p2) {
                mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
                return lock_two_directories(p1, p2);
        }

        /* Same directory on both sides: a single inode lock is all we take. */
        inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
        return NULL;
}
EXPORT_SYMBOL(lock_rename);

/*
 * c1 and p2 should be on the same fs.
 */
/*
 * Take the locks for a rename involving @c1's parent and @p2.  Returns NULL
 * with only p2's inode locked when c1's parent is p2; otherwise returns
 * whatever lock_two_directories() returns for (c1->d_parent, p2).
 */
struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2)
{
        /* Unlocked peek at the parent; it is re-checked once p2 is locked. */
        if (READ_ONCE(c1->d_parent) == p2) {
                /*
                 * hopefully won't need to touch ->s_vfs_rename_mutex at all.
                 */
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
                /*
                 * now that p2 is locked, nobody can move in or out of it,
                 * so the test below is safe.
                 */
                if (likely(c1->d_parent == p2))
                        return NULL;

                /*
                 * c1 got moved out of p2 while we'd been taking locks;
                 * unlock and fall back to slow case.
                 */
                inode_unlock(p2->d_inode);
        }

        mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
        /*
         * nobody can move out of any directories on this fs.
         */
        if (likely(c1->d_parent != p2))
                return lock_two_directories(c1->d_parent, p2);

        /*
         * c1 got moved into p2 while we were taking locks;
         * we need p2 locked and ->s_vfs_rename_mutex unlocked,
         * for consistency with lock_rename().
         */
        inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
        mutex_unlock(&c1->d_sb->s_vfs_rename_mutex);
        return NULL;
}
EXPORT_SYMBOL(lock_rename_child);

/*
 * Release the inode lock(s) on @p1 and @p2 and, when the two differ,
 * ->s_vfs_rename_mutex as well.
 */
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
        inode_unlock(p1->d_inode);

        /* Same directory: only one inode lock to drop, no rename mutex. */
        if (p1 == p2)
                return;

        inode_unlock(p2->d_inode);
        mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
}
EXPORT_SYMBOL(unlock_rename);

/**
 * vfs_prepare_mode - prepare the mode to be used for a new inode
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        parent directory of the new inode
 * @mode:        mode of the new inode
 * @mask_perms:        allowed permission by the vfs
 * @type:        type of file to be created
 *
 * This helper consolidates and enforces vfs restrictions on the @mode of a new
 * object to be created.
 *
 * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
 * the kernel documentation for mode_strip_umask()). Moving umask stripping
 * after setgid stripping allows the same ordering for both non-POSIX ACL and
 * POSIX ACL supporting filesystems.
 *
 * Note that it's currently valid for @type to be 0 if a directory is created.
 * Filesystems raise that flag individually and we need to check whether each
 * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
 * non-zero type.
 *
 * Returns: mode to be passed to the filesystem
 */
static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
                                       const struct inode *dir, umode_t mode,
                                       umode_t mask_perms, umode_t type)
{
        umode_t stripped;

        /* Setgid stripping first, umask stripping second. */
        stripped = mode_strip_umask(dir, mode_strip_sgid(idmap, dir, mode));

        /*
         * Keep only the permission bits the vfs allows (never any file type
         * bits from the caller's mode), then add the file type bits of @type.
         */
        return (stripped & mask_perms & ~S_IFMT) | (type & S_IFMT);
}

/**
 * vfs_create - create new file
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the new file to be created in @dir
 * @mode:        mode of the new file
 * @want_excl:        whether the file must not yet exist
 *
 * Create a new file.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
               struct dentry *dentry, umode_t mode, bool want_excl)
{
        int err = may_create(idmap, dir, dentry);

        if (err)
                return err;

        /* The directory has to provide a ->create() method. */
        if (!dir->i_op->create)
                return -EACCES;        /* shouldn't it be ENOSYS? */

        /* Regular file: allow all of S_IALLUGO, force the type to S_IFREG. */
        mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);

        err = security_inode_create(dir, dentry, mode);
        if (err)
                return err;

        err = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
        if (err)
                return err;

        /* Only notify once the filesystem reported success. */
        fsnotify_create(dir, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_create);

/*
 * Create an object for @dentry in its parent directory by calling the
 * caller-supplied constructor @f with @dentry, the prepared mode and @arg.
 *
 * The mode is limited to S_IALLUGO and always gets S_IFREG as file type.
 * Permission checks use nop_mnt_idmap.  Returns 0 on success or the first
 * error reported by may_create(), security_inode_create() or @f.
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
                int (*f)(struct dentry *, umode_t, void *),
                void *arg)
{
        struct inode *parent = dentry->d_parent->d_inode;
        int err;

        err = may_create(&nop_mnt_idmap, parent, dentry);
        if (err)
                return err;

        mode = (mode & S_IALLUGO) | S_IFREG;

        err = security_inode_create(parent, dentry, mode);
        if (err)
                return err;

        err = f(dentry, mode, arg);
        if (err)
                return err;

        fsnotify_create(parent, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_mkobj);

bool may_open_dev(const struct path *path)
{
        return !(path->mnt->mnt_flags & MNT_NODEV) &&
                !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}

static int may_open(struct mnt_idmap *idmap, const struct path *path,
                    int acc_mode, int flag)
{
        struct dentry *dentry = path->dentry;
        struct inode *inode = dentry->d_inode;
        int error;

        if (!inode)
                return -ENOENT;

        switch (inode->i_mode & S_IFMT) {
        case S_IFLNK:
                return -ELOOP;
        case S_IFDIR:
                if (acc_mode & MAY_WRITE)
                        return -EISDIR;
                if (acc_mode & MAY_EXEC)
                        return -EACCES;
                break;
        case S_IFBLK:
        case S_IFCHR:
                if (!may_open_dev(path))
                        return -EACCES;
                fallthrough;
        case S_IFIFO:
        case S_IFSOCK:
                if (acc_mode & MAY_EXEC)
                        return -EACCES;
                flag &= ~O_TRUNC;
                break;
        case S_IFREG:
                if ((acc_mode & MAY_EXEC) && path_noexec(path))
                        return -EACCES;
                break;
        }

        error = inode_permission(idmap, inode, MAY_OPEN | acc_mode);
        if (error)
                return error;

        /*
         * An append-only file must be opened in append mode for writing.
         */
        if (IS_APPEND(inode)) {
                if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
                        return -EPERM;
                if (flag & O_TRUNC)
                        return -EPERM;
        }

        /* O_NOATIME can only be set by the owner or superuser */
        if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode))
                return -EPERM;

        return 0;
}

static int handle_truncate(struct mnt_idmap *idmap, struct file *filp)
{
        const struct path *path = &filp->f_path;
        struct inode *inode = path->dentry->d_inode;
        int error = get_write_access(inode);
        if (error)
                return error;

        error = security_file_truncate(filp);
        if (!error) {
                error = do_truncate(idmap, path->dentry, 0,
                                    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
                                    filp);
        }
        put_write_access(inode);
        return error;
}

/*
 * Convert open flags for use by the lookup code: an access mode of 3
 * (both low access-mode bits set) is turned into 2 by subtracting one;
 * every other value is passed through unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
        return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}

/*
 * Check whether an O_CREAT open may create @dentry with @mode in the
 * directory @dir.
 *
 * Runs, in order: the path-based mknod security hook, the fsuid/fsgid
 * mapping check (-EOVERFLOW on failure), a write+search permission check on
 * the directory and finally the inode-based create security hook.
 */
static int may_o_create(struct mnt_idmap *idmap,
                        const struct path *dir, struct dentry *dentry,
                        umode_t mode)
{
        struct dentry *parent = dir->dentry;
        int err;

        err = security_path_mknod(dir, dentry, mode, 0);
        if (err)
                return err;

        if (!fsuidgid_has_mapping(parent->d_sb, idmap))
                return -EOVERFLOW;

        err = inode_permission(idmap, parent->d_inode, MAY_WRITE | MAY_EXEC);
        if (err)
                return err;

        return security_inode_create(parent->d_inode, dentry, mode);
}

/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns the resulting dentry if successful.  If the file was opened, it
 * will have been attached to @file by the filesystem calling finish_open()
 * and FMODE_OPENED will be set in @file->f_mode.
 *
 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
 * be set.  The caller will need to perform the open themselves, using the
 * returned dentry.  A negative dentry is never returned in that case; it is
 * reported as -ENOENT instead.
 *
 * Returns an ERR_PTR()-encoded error code otherwise.
 */
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
                                  struct file *file,
                                  int open_flag, umode_t mode)
{
        /* Sentinel: lets us tell whether ->atomic_open() touched f_path.dentry. */
        struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
        struct inode *dir =  nd->path.dentry->d_inode;
        int error;

        /* A lookup that requires a directory is passed on as O_DIRECTORY. */
        if (nd->flags & LOOKUP_DIRECTORY)
                open_flag |= O_DIRECTORY;

        file->f_path.dentry = DENTRY_NOT_SET;
        file->f_path.mnt = nd->path.mnt;
        error = dir->i_op->atomic_open(dir, dentry, file,
                                       open_to_namei_flags(open_flag), mode);
        d_lookup_done(dentry);
        if (!error) {
                if (file->f_mode & FMODE_OPENED) {
                        /*
                         * The file got opened; if it ended up on a different
                         * dentry than the one we passed in, switch to that one
                         * (taking our own reference).
                         */
                        if (unlikely(dentry != file->f_path.dentry)) {
                                dput(dentry);
                                dentry = dget(file->f_path.dentry);
                        }
                } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
                        /* Not opened and the sentinel is still in place. */
                        error = -EIO;
                } else {
                        /*
                         * Not opened: a non-NULL f_path.dentry replaces ours
                         * and is used as-is, without an extra dget().
                         */
                        if (file->f_path.dentry) {
                                dput(dentry);
                                dentry = file->f_path.dentry;
                        }
                        if (unlikely(d_is_negative(dentry)))
                                error = -ENOENT;
                }
        }
        /* On any failure drop the dentry and return the error instead. */
        if (error) {
                dput(dentry);
                dentry = ERR_PTR(error);
        }
        return dentry;
}

/*
 * Look up and maybe create and open the last component.
 *
 * Must be called with parent locked (exclusive in O_CREAT case).
 *
 * Returns the dentry of the last component on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These cases are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case the returned dentry might be negative if O_CREAT
 * hadn't been specified.
 *
 * An ERR_PTR()-encoded error code is returned on failure.
 */
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
                                  const struct open_flags *op,
                                  bool got_write)
{
        struct mnt_idmap *idmap;
        struct dentry *dir = nd->path.dentry;
        struct inode *dir_inode = dir->d_inode;
        int open_flag = op->open_flag;
        struct dentry *dentry;
        int error, create_error = 0;
        umode_t mode = op->mode;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

        /* Parent is flagged IS_DEADDIR(): report the name as missing. */
        if (unlikely(IS_DEADDIR(dir_inode)))
                return ERR_PTR(-ENOENT);

        file->f_mode &= ~FMODE_CREATED;
        dentry = d_lookup(dir, &nd->last);
        /*
         * Loop until we hold a dentry that is either still in lookup or has
         * passed revalidation.  A dentry that fails revalidation with 0 is
         * invalidated and dropped, and a fresh one is allocated next round;
         * a non-zero, non-positive result is returned as an error.
         */
        for (;;) {
                if (!dentry) {
                        dentry = d_alloc_parallel(dir, &nd->last, &wq);
                        if (IS_ERR(dentry))
                                return dentry;
                }
                if (d_in_lookup(dentry))
                        break;

                error = d_revalidate(dentry, nd->flags);
                if (likely(error > 0))
                        break;
                if (error)
                        goto out_dput;
                d_invalidate(dentry);
                dput(dentry);
                dentry = NULL;
        }
        if (dentry->d_inode) {
                /* Cached positive dentry: will open in f_op->open */
                return dentry;
        }

        /*
         * Checking write permission is tricky, because we don't know if we are
         * going to actually need it: O_CREAT opens should work as long as the
         * file exists.  But checking existence breaks atomicity.  The trick is
         * to check access and if not granted clear O_CREAT from the flags.
         *
         * Another problem is returning the "right" error value (e.g. for an
         * O_EXCL open we want to return EEXIST not EROFS).
         */
        if (unlikely(!got_write))
                open_flag &= ~O_TRUNC;
        idmap = mnt_idmap(nd->path.mnt);
        if (open_flag & O_CREAT) {
                if (open_flag & O_EXCL)
                        open_flag &= ~O_TRUNC;
                mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
                if (likely(got_write))
                        create_error = may_o_create(idmap, &nd->path,
                                                    dentry, mode);
                else
                        create_error = -EROFS;
        }
        /* Creation not permitted: continue as a plain lookup/open. */
        if (create_error)
                open_flag &= ~O_CREAT;
        if (dir_inode->i_op->atomic_open) {
                dentry = atomic_open(nd, dentry, file, open_flag, mode);
                /* The file is missing and we could not create it: say why. */
                if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
                        dentry = ERR_PTR(create_error);
                return dentry;
        }

        /* No ->atomic_open(): finish the lookup via ->lookup() if needed. */
        if (d_in_lookup(dentry)) {
                struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
                                                             nd->flags);
                d_lookup_done(dentry);
                if (unlikely(res)) {
                        if (IS_ERR(res)) {
                                error = PTR_ERR(res);
                                goto out_dput;
                        }
                        /* ->lookup() handed back a different dentry: use it. */
                        dput(dentry);
                        dentry = res;
                }
        }

        /* Negative dentry, just create the file */
        if (!dentry->d_inode && (open_flag & O_CREAT)) {
                /* FMODE_CREATED is set before ->create() is attempted. */
                file->f_mode |= FMODE_CREATED;
                audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
                if (!dir_inode->i_op->create) {
                        error = -EACCES;
                        goto out_dput;
                }

                error = dir_inode->i_op->create(idmap, dir_inode, dentry,
                                                mode, open_flag & O_EXCL);
                if (error)
                        goto out_dput;
        }
        /* Still negative and creation had been refused: report that error. */
        if (unlikely(create_error) && !dentry->d_inode) {
                error = create_error;
                goto out_dput;
        }
        return dentry;

out_dput:
        dput(dentry);
        return ERR_PTR(error);
}

/*
 * Handle the last component of the pathname during an open.
 *
 * Returns NULL when the walk is finished, an ERR_PTR() on failure, or
 * whatever handle_dots()/step_into() return otherwise.  path_openat() feeds
 * any non-NULL return value back into link_path_walk().
 */
static const char *open_last_lookups(struct nameidata *nd,
                   struct file *file, const struct open_flags *op)
{
        struct dentry *dir = nd->path.dentry;
        int open_flag = op->open_flag;
        bool got_write = false;
        struct dentry *dentry;
        const char *res;

        nd->flags |= op->intent;

        /* Last component is not a normal name: let handle_dots() deal with it. */
        if (nd->last_type != LAST_NORM) {
                if (nd->depth)
                        put_link(nd);
                return handle_dots(nd, nd->last_type);
        }

        if (!(open_flag & O_CREAT)) {
                /* A character follows the last component: require a directory. */
                if (nd->last.name[nd->last.len])
                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
                /* we _can_ be in RCU mode here */
                dentry = lookup_fast(nd);
                if (IS_ERR(dentry))
                        return ERR_CAST(dentry);
                if (likely(dentry))
                        goto finish_lookup;

                /* lookup_fast() found nothing; we must not still be in RCU mode. */
                if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
                        return ERR_PTR(-ECHILD);
        } else {
                /* create side of things */
                if (nd->flags & LOOKUP_RCU) {
                        if (!try_to_unlazy(nd))
                                return ERR_PTR(-ECHILD);
                }
                audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
                /* trailing slashes? */
                if (unlikely(nd->last.name[nd->last.len]))
                        return ERR_PTR(-EISDIR);
        }

        if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
                got_write = !mnt_want_write(nd->path.mnt);
                /*
                 * do _not_ fail yet - we might not need that or fail with
                 * a different error; let lookup_open() decide; we'll be
                 * dropping this one anyway.
                 */
        }
        /* Exclusive parent lock for O_CREAT, shared lock otherwise. */
        if (open_flag & O_CREAT)
                inode_lock(dir->d_inode);
        else
                inode_lock_shared(dir->d_inode);
        dentry = lookup_open(nd, file, op, got_write);
        /* Report the creation while the parent is still locked. */
        if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
                fsnotify_create(dir->d_inode, dentry);
        if (open_flag & O_CREAT)
                inode_unlock(dir->d_inode);
        else
                inode_unlock_shared(dir->d_inode);

        if (got_write)
                mnt_drop_write(nd->path.mnt);

        if (IS_ERR(dentry))
                return ERR_CAST(dentry);

        /* Already opened or just created: install the dentry, walk is done. */
        if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
                dput(nd->path.dentry);
                nd->path.dentry = dentry;
                return NULL;
        }

finish_lookup:
        if (nd->depth)
                put_link(nd);
        res = step_into(nd, WALK_TRAILING, dentry);
        /* Non-NULL result: clear the open/create intent flags before returning it. */
        if (unlikely(res))
                nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
        return res;
}

/*
 * Handle the last step of open()
 */
static int do_open(struct nameidata *nd,
                   struct file *file, const struct open_flags *op)
{
        struct mnt_idmap *idmap;
        int open_flag = op->open_flag;
        bool do_truncate;
        int acc_mode;
        int error;

        /* complete_walk() is skipped when the file was already opened or created. */
        if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
                error = complete_walk(nd);
                if (error)
                        return error;
        }
        if (!(file->f_mode & FMODE_CREATED))
                audit_inode(nd->name, nd->path.dentry, 0);
        idmap = mnt_idmap(nd->path.mnt);
        if (open_flag & O_CREAT) {
                /* O_EXCL requires that this very open created the file. */
                if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
                        return -EEXIST;
                if (d_is_dir(nd->path.dentry))
                        return -EISDIR;
                error = may_create_in_sticky(idmap, nd,
                                             d_backing_inode(nd->path.dentry));
                if (unlikely(error))
                        return error;
        }
        if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
                return -ENOTDIR;

        do_truncate = false;
        acc_mode = op->acc_mode;
        if (file->f_mode & FMODE_CREATED) {
                /* Don't check for write permission, don't truncate */
                open_flag &= ~O_TRUNC;
                acc_mode = 0;
        } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
                /* Truncating a regular file: hold mount write access until done. */
                error = mnt_want_write(nd->path.mnt);
                if (error)
                        return error;
                do_truncate = true;
        }
        /* Each later step runs only if every earlier one succeeded. */
        error = may_open(idmap, &nd->path, acc_mode, open_flag);
        if (!error && !(file->f_mode & FMODE_OPENED))
                error = vfs_open(&nd->path, file);
        if (!error)
                error = security_file_post_open(file, op->acc_mode);
        if (!error && do_truncate)
                error = handle_truncate(idmap, file);
        /* A positive value is not a valid result here: warn and use -EINVAL. */
        if (unlikely(error > 0)) {
                WARN_ON(1);
                error = -EINVAL;
        }
        /* Pairs with the mnt_want_write() taken for the truncate above. */
        if (do_truncate)
                mnt_drop_write(nd->path.mnt);
        return error;
}

/**
 * vfs_tmpfile - create tmpfile
 * @idmap:        idmap of the mount the inode was found from
 * @parentpath:        pointer to the path of the base directory
 * @file:        file descriptor of the new tmpfile
 * @mode:        mode of the new tmpfile
 *
 * Create a temporary file.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int vfs_tmpfile(struct mnt_idmap *idmap,
                const struct path *parentpath,
                struct file *file, umode_t mode)
{
        struct dentry *child;
        struct inode *dir = d_inode(parentpath->dentry);
        struct inode *inode;
        int error;
        /*
         * Snapshot of the flags taken before ->tmpfile() runs; the O_EXCL
         * test below uses this copy rather than file->f_flags.
         * NOTE(review): presumably because the open may modify f_flags - confirm.
         */
        int open_flag = file->f_flags;

        /* we want directory to be writable */
        error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
        if (!dir->i_op->tmpfile)
                return -EOPNOTSUPP;
        /* Allocate a dentry under the parent, using slash_name as its name. */
        child = d_alloc(parentpath->dentry, &slash_name);
        if (unlikely(!child))
                return -ENOMEM;
        file->f_path.mnt = parentpath->mnt;
        file->f_path.dentry = child;
        mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
        error = dir->i_op->tmpfile(idmap, dir, file, mode);
        /* Drop the local reference to the child whether or not ->tmpfile() worked. */
        dput(child);
        if (error)
                return error;
        /* Don't check for other permissions, the inode was just created */
        error = may_open(idmap, &file->f_path, 0, file->f_flags);
        if (error)
                return error;
        inode = file_inode(file);
        /* Without O_EXCL the inode is flagged I_LINKABLE, under i_lock. */
        if (!(open_flag & O_EXCL)) {
                spin_lock(&inode->i_lock);
                inode->i_state |= I_LINKABLE;
                spin_unlock(&inode->i_lock);
        }
        security_inode_post_create_tmpfile(idmap, inode);
        return 0;
}

/**
 * kernel_tmpfile_open - open a tmpfile for kernel internal use
 * @idmap:        idmap of the mount the inode was found from
 * @parentpath:        path of the base directory
 * @mode:        mode of the new tmpfile
 * @open_flag:        flags
 * @cred:        credentials for open
 *
 * Create and open a temporary file.  The file is not accounted in nr_files,
 * hence this is only for kernel internal use, and must not be installed into
 * file tables or such.
 */
struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
                                 const struct path *parentpath,
                                 umode_t mode, int open_flag,
                                 const struct cred *cred)
{
        struct file *tmp;
        int err;

        tmp = alloc_empty_file_noaccount(open_flag, cred);
        if (IS_ERR(tmp))
                return tmp;

        err = vfs_tmpfile(idmap, parentpath, tmp, mode);
        if (!err)
                return tmp;

        /* Creation failed: release the file and hand back the error. */
        fput(tmp);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(kernel_tmpfile_open);

static int do_tmpfile(struct nameidata *nd, unsigned flags,
                const struct open_flags *op,
                struct file *file)
{
        struct path path;
        int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);

        if (unlikely(error))
                return error;
        error = mnt_want_write(path.mnt);
        if (unlikely(error))
                goto out;
        error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode);
        if (error)
                goto out2;
        audit_inode(nd->name, file->f_path.dentry, 0);
out2:
        mnt_drop_write(path.mnt);
out:
        path_put(&path);
        return error;
}

static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
        struct path path;
        int error = path_lookupat(nd, flags, &path);
        if (!error) {
                audit_inode(nd->name, path.dentry, 0);
                error = vfs_open(&path, file);
                path_put(&path);
        }
        return error;
}

/*
 * Allocate a struct file and open it according to @op, walking the path
 * described by @nd with lookup flags @flags.
 *
 * Returns the opened file or an ERR_PTR().  -EOPENSTALE is translated to
 * -ECHILD when LOOKUP_RCU is set in @flags and to -ESTALE otherwise.
 */
static struct file *path_openat(struct nameidata *nd,
                        const struct open_flags *op, unsigned flags)
{
        struct file *file = alloc_empty_file(op->open_flag, current_cred());
        int err;

        if (IS_ERR(file))
                return file;

        if (unlikely(file->f_flags & __O_TMPFILE)) {
                err = do_tmpfile(nd, flags, op, file);
        } else if (unlikely(file->f_flags & O_PATH)) {
                err = do_o_path(nd, flags, file);
        } else {
                const char *s = path_init(nd, flags);

                /*
                 * Walk the path; as long as the last-component handling
                 * hands back something non-NULL, keep walking with it.
                 */
                for (;;) {
                        err = link_path_walk(s, nd);
                        if (err)
                                break;
                        s = open_last_lookups(nd, file, op);
                        if (!s)
                                break;
                }
                if (!err)
                        err = do_open(nd, file, op);
                terminate_walk(nd);
        }

        if (likely(!err)) {
                if (likely(file->f_mode & FMODE_OPENED))
                        return file;
                /* Success reported but the file is not opened. */
                WARN_ON(1);
                err = -EINVAL;
        }

        fput(file);
        if (err == -EOPENSTALE)
                err = (flags & LOOKUP_RCU) ? -ECHILD : -ESTALE;
        return ERR_PTR(err);
}

/*
 * Open @pathname relative to @dfd as described by @op.
 *
 * The open is first attempted with LOOKUP_RCU; -ECHILD triggers a retry
 * without it, and -ESTALE a further retry with LOOKUP_REVAL.  Returns the
 * opened file or an ERR_PTR().
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
                const struct open_flags *op)
{
        const int lookup_flags = op->lookup_flags;
        struct nameidata nd;
        struct file *file;

        set_nameidata(&nd, dfd, pathname, NULL);

        file = path_openat(&nd, op, lookup_flags | LOOKUP_RCU);
        if (unlikely(file == ERR_PTR(-ECHILD)))
                file = path_openat(&nd, op, lookup_flags);
        if (unlikely(file == ERR_PTR(-ESTALE)))
                file = path_openat(&nd, op, lookup_flags | LOOKUP_REVAL);

        restore_nameidata();
        return file;
}

/*
 * Open the kernel-supplied pathname @name, using @root as the root passed
 * to set_nameidata().
 *
 * Fails with -ELOOP if @root is a symlink and @op carries LOOKUP_OPEN
 * intent.  Uses the same RCU / non-RCU / LOOKUP_REVAL retry sequence as
 * do_filp_open().  Returns the opened file or an ERR_PTR().
 */
struct file *do_file_open_root(const struct path *root,
                const char *name, const struct open_flags *op)
{
        const int lookup_flags = op->lookup_flags;
        struct filename *fname;
        struct nameidata nd;
        struct file *result;

        if (d_is_symlink(root->dentry) && (op->intent & LOOKUP_OPEN))
                return ERR_PTR(-ELOOP);

        fname = getname_kernel(name);
        if (IS_ERR(fname))
                return ERR_CAST(fname);

        set_nameidata(&nd, -1, fname, root);

        result = path_openat(&nd, op, lookup_flags | LOOKUP_RCU);
        if (unlikely(result == ERR_PTR(-ECHILD)))
                result = path_openat(&nd, op, lookup_flags);
        if (unlikely(result == ERR_PTR(-ESTALE)))
                result = path_openat(&nd, op, lookup_flags | LOOKUP_REVAL);

        restore_nameidata();
        putname(fname);
        return result;
}

/*
 * Look up the parent directory of @name and return a negative dentry for
 * its last component, ready for creation.
 *
 * On success @path holds the parent, the parent's inode is locked
 * (I_MUTEX_PARENT) and write access to the mount is held; the caller undoes
 * all of that with done_path_create().  On failure an ERR_PTR() is returned
 * and everything acquired here has been released again.
 *
 * Only LOOKUP_DIRECTORY and LOOKUP_REVAL are looked at in @lookup_flags.
 */
static struct dentry *filename_create(int dfd, struct filename *name,
                                      struct path *path, unsigned int lookup_flags)
{
        /* Default result for the early "goto out" below. */
        struct dentry *dentry = ERR_PTR(-EEXIST);
        struct qstr last;
        bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
        unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
        unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
        int type;
        int err2;
        int error;

        error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
        if (error)
                return ERR_PTR(error);

        /*
         * Yucky last component or no last component at all?
         * (foo/., foo/.., /////)
         */
        if (unlikely(type != LAST_NORM))
                goto out;

        /* don't fail immediately if it's r/o, at least try to report other errors */
        err2 = mnt_want_write(path->mnt);
        /*
         * Do the final lookup.  Suppress 'create' if there is a trailing
         * '/', and a directory wasn't requested.
         */
        if (last.name[last.len] && !want_dir)
                create_flags = 0;
        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
        dentry = lookup_one_qstr_excl(&last, path->dentry,
                                      reval_flag | create_flags);
        if (IS_ERR(dentry))
                goto unlock;

        /* The name must not exist yet. */
        error = -EEXIST;
        if (d_is_positive(dentry))
                goto fail;

        /*
         * Special case - lookup gave negative, but... we had foo/bar/
         * From the vfs_mknod() POV we just have a negative dentry -
         * all is fine. Let's be bastards - you had / on the end, you've
         * been asking for (non-existent) directory. -ENOENT for you.
         */
        if (unlikely(!create_flags)) {
                error = -ENOENT;
                goto fail;
        }
        /* Only now report a failed mnt_want_write(), after all other checks. */
        if (unlikely(err2)) {
                error = err2;
                goto fail;
        }
        /* Success: parent stays locked and mount write access stays held. */
        return dentry;
fail:
        dput(dentry);
        dentry = ERR_PTR(error);
unlock:
        inode_unlock(path->dentry->d_inode);
        /* Drop write access only if mnt_want_write() had succeeded. */
        if (!err2)
                mnt_drop_write(path->mnt);
out:
        path_put(path);
        return dentry;
}

/*
 * filename_create() for a pathname held in kernel memory: wraps @pathname
 * with getname_kernel(), performs the lookup and releases the filename
 * again.  Returns whatever filename_create() returned.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct filename *name = getname_kernel(pathname);
        struct dentry *dentry;

        dentry = filename_create(dfd, name, path, lookup_flags);
        putname(name);

        return dentry;
}
EXPORT_SYMBOL(kern_path_create);

/*
 * Release what a successful filename_create() left held: the reference on
 * @dentry, the lock on the parent directory's inode, write access to the
 * mount and the reference on @path.
 */
void done_path_create(struct path *path, struct dentry *dentry)
{
        dput(dentry);
        inode_unlock(path->dentry->d_inode);
        mnt_drop_write(path->mnt);
        path_put(path);
}
EXPORT_SYMBOL(done_path_create);

/*
 * filename_create() for a pathname supplied by userspace: copies @pathname
 * in with getname(), performs the lookup and releases the filename again.
 * Returns whatever filename_create() returned.
 */
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct filename *name = getname(pathname);
        struct dentry *dentry;

        dentry = filename_create(dfd, name, path, lookup_flags);
        putname(name);

        return dentry;
}
EXPORT_SYMBOL(user_path_create);

/**
 * vfs_mknod - create device node or file
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the new device node or file to be created in @dir
 * @mode:        mode of the new device node or file
 * @dev:        device number of device to create
 *
 * Create a device node or file.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
              struct dentry *dentry, umode_t mode, dev_t dev)
{
        const bool is_dev = S_ISCHR(mode) || S_ISBLK(mode);
        const bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
        int err;

        err = may_create(idmap, dir, dentry);
        if (err)
                return err;

        /* Device nodes other than whiteouts require CAP_MKNOD. */
        if (is_dev && !is_whiteout && !capable(CAP_MKNOD))
                return -EPERM;

        if (!dir->i_op->mknod)
                return -EPERM;

        mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);

        err = devcgroup_inode_mknod(mode, dev);
        if (err)
                return err;

        err = security_inode_mknod(dir, dentry, mode, dev);
        if (err)
                return err;

        err = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
        if (err)
                return err;

        fsnotify_create(dir, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_mknod);

/*
 * Validate the file type bits of @mode for mknod.
 *
 * Returns 0 for regular files, character and block devices, FIFOs, sockets
 * and for no type at all, -EPERM for directories and -EINVAL for any other
 * type.
 */
static int may_mknod(umode_t mode)
{
        const umode_t fmt = mode & S_IFMT;

        if (fmt == S_IFDIR)
                return -EPERM;

        /* zero mode translates to S_IFREG */
        if (fmt == 0 || fmt == S_IFREG ||
            fmt == S_IFCHR || fmt == S_IFBLK ||
            fmt == S_IFIFO || fmt == S_IFSOCK)
                return 0;

        return -EINVAL;
}

/*
 * Common implementation of the mknod() and mknodat() system calls.
 *
 * Validates @mode, looks up and locks the parent of @name, then creates the
 * node through vfs_create() (no type or S_IFREG) or vfs_mknod() (devices,
 * FIFOs, sockets).  The whole lookup is redone with LOOKUP_REVAL when
 * retry_estale() asks for it.  @name is released with putname() on every
 * path.  Returns 0 or a negative errno.
 */
static int do_mknodat(int dfd, struct filename *name, umode_t mode,
                unsigned int dev)
{
        struct mnt_idmap *idmap;
        struct dentry *dentry;
        struct path path;
        int error;
        unsigned int lookup_flags = 0;

        error = may_mknod(mode);
        if (error)
                goto out1;
retry:
        dentry = filename_create(dfd, name, &path, lookup_flags);
        /* Only meaningful (and only used) when dentry is an ERR_PTR(). */
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto out1;

        /* The path-based security hook is given the umask-stripped mode. */
        error = security_path_mknod(&path, dentry,
                        mode_strip_umask(path.dentry->d_inode, mode), dev);
        if (error)
                goto out2;

        idmap = mnt_idmap(path.mnt);
        /* may_mknod() above has already rejected every other file type. */
        switch (mode & S_IFMT) {
                case 0: case S_IFREG:
                        error = vfs_create(idmap, path.dentry->d_inode,
                                           dentry, mode, true);
                        if (!error)
                                security_path_post_mknod(idmap, dentry);
                        break;
                case S_IFCHR: case S_IFBLK:
                        error = vfs_mknod(idmap, path.dentry->d_inode,
                                          dentry, mode, new_decode_dev(dev));
                        break;
                case S_IFIFO: case S_IFSOCK:
                        /* The device number is ignored for these: 0 is passed. */
                        error = vfs_mknod(idmap, path.dentry->d_inode,
                                          dentry, mode, 0);
                        break;
        }
out2:
        done_path_create(&path, dentry);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out1:
        putname(name);
        return error;
}

/*
 * mknodat() system call: copy the user-supplied pathname in with getname()
 * and hand it to do_mknodat(), which also releases it.
 */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
                unsigned int, dev)
{
        return do_mknodat(dfd, getname(filename), mode, dev);
}

/*
 * mknod() system call: same as mknodat() with AT_FDCWD as the directory
 * file descriptor.
 */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
        return do_mknodat(AT_FDCWD, getname(filename), mode, dev);
}

/**
 * vfs_mkdir - create directory
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the new directory to be created in @dir
 * @mode:        mode of the new directory
 *
 * Create a directory.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
              struct dentry *dentry, umode_t mode)
{
        int error;
        unsigned max_links = dir->i_sb->s_max_links;

        error = may_create(idmap, dir, dentry);
        if (error)
                return error;

        if (!dir->i_op->mkdir)
                return -EPERM;

        mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0);
        error = security_inode_mkdir(dir, dentry, mode);
        if (error)
                return error;

        if (max_links && dir->i_nlink >= max_links)
                return -EMLINK;

        error = dir->i_op->mkdir(idmap, dir, dentry, mode);
        if (!error)
                fsnotify_mkdir(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_mkdir);

/*
 * Create the directory named by @name (resolved relative to @dfd) with
 * permissions @mode.
 *
 * The whole create sequence is repeated once with LOOKUP_REVAL added when
 * retry_estale() asks for it.  @name is released with putname() before
 * returning, whatever the outcome.
 *
 * Returns 0 on success or a negative errno.
 */
int do_mkdirat(int dfd, struct filename *name, umode_t mode)
{
        unsigned int flags = LOOKUP_DIRECTORY;
        struct dentry *child;
        struct path parent;
        int err;

        for (;;) {
                child = filename_create(dfd, name, &parent, flags);
                if (IS_ERR(child)) {
                        err = PTR_ERR(child);
                        break;
                }

                /* The LSM path hook is shown the mode with the umask applied. */
                err = security_path_mkdir(&parent, child,
                                mode_strip_umask(parent.dentry->d_inode, mode));
                if (!err)
                        err = vfs_mkdir(mnt_idmap(parent.mnt),
                                        parent.dentry->d_inode, child, mode);

                done_path_create(&parent, child);

                if (!retry_estale(err, flags))
                        break;
                flags |= LOOKUP_REVAL;
        }

        putname(name);
        return err;
}

/*
 * mkdirat(2) entry point: wrap the user-supplied path with getname() and
 * let do_mkdirat() do the work relative to @dfd.
 */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
        struct filename *name = getname(pathname);

        return do_mkdirat(dfd, name, mode);
}

/*
 * mkdir(2) entry point: same as mkdirat(2) with the lookup anchored at
 * AT_FDCWD.
 */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
        struct filename *name = getname(pathname);

        return do_mkdirat(AT_FDCWD, name, mode);
}

/**
 * vfs_rmdir - remove directory
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the directory to be removed
 *
 * Remove a directory.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 on success; -EPERM if @dir has no ->rmdir method; -EBUSY if
 * @dentry is a local mountpoint or its inode carries S_KERNEL_FILE;
 * otherwise the error reported by may_delete(), security_inode_rmdir() or
 * the ->rmdir method.
 */
int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
                     struct dentry *dentry)
{
        int error = may_delete(idmap, dir, dentry, 1);

        if (error)
                return error;

        if (!dir->i_op->rmdir)
                return -EPERM;

        /* Hold our own reference to the victim until the dput() at "out". */
        dget(dentry);
        inode_lock(dentry->d_inode);

        error = -EBUSY;
        if (is_local_mountpoint(dentry) ||
            (dentry->d_inode->i_flags & S_KERNEL_FILE))
                goto out;

        error = security_inode_rmdir(dir, dentry);
        if (error)
                goto out;

        error = dir->i_op->rmdir(dir, dentry);
        if (error)
                goto out;

        /*
         * The filesystem removed the directory: prune its cached children,
         * mark the inode dead and get rid of anything mounted on it.
         */
        shrink_dcache_parent(dentry);
        dentry->d_inode->i_flags |= S_DEAD;
        dont_mount(dentry);
        detach_mounts(dentry);

out:
        inode_unlock(dentry->d_inode);
        dput(dentry);
        /* Notification is sent only on success, after the victim is unlocked. */
        if (!error)
                d_delete_notify(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_rmdir);

/*
 * Remove the directory named by @name, resolved relative to @dfd.
 *
 * The parent is looked up first, then the last component is looked up and
 * removed with the parent's inode locked and write access to the mount held.
 * If retry_estale() asks for it, the whole sequence is repeated with
 * LOOKUP_REVAL set.  @name is released with putname() on every exit path.
 *
 * Returns 0 on success or a negative errno.
 */
int do_rmdir(int dfd, struct filename *name)
{
        int error;
        struct dentry *dentry;
        struct path path;
        struct qstr last;
        int type;
        unsigned int lookup_flags = 0;
retry:
        error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
        if (error)
                goto exit1;

        /* Special last components are rejected, each with its own errno. */
        switch (type) {
        case LAST_DOTDOT:
                error = -ENOTEMPTY;
                goto exit2;
        case LAST_DOT:
                error = -EINVAL;
                goto exit2;
        case LAST_ROOT:
                error = -EBUSY;
                goto exit2;
        }

        /* Paired with mnt_drop_write() below exit3. */
        error = mnt_want_write(path.mnt);
        if (error)
                goto exit2;

        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto exit3;
        /* A dentry without an inode means there is nothing to remove. */
        if (!dentry->d_inode) {
                error = -ENOENT;
                goto exit4;
        }
        error = security_path_rmdir(&path, dentry);
        if (error)
                goto exit4;
        error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry);
        /* The labels below unwind in reverse order of acquisition. */
exit4:
        dput(dentry);
exit3:
        inode_unlock(path.dentry->d_inode);
        mnt_drop_write(path.mnt);
exit2:
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
exit1:
        putname(name);
        return error;
}

/*
 * rmdir(2) entry point: wrap the user-supplied path with getname() and let
 * do_rmdir() remove it, with the lookup anchored at AT_FDCWD.
 */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
        struct filename *name = getname(pathname);

        return do_rmdir(AT_FDCWD, name);
}

/**
 * vfs_unlink - unlink a filesystem object
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        parent directory
 * @dentry:        victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 on success; -EPERM if @dir has no ->unlink method or the victim
 * is a swap file; -EBUSY if @dentry is a local mountpoint; otherwise the
 * error reported by may_delete(), security_inode_unlink(),
 * try_break_deleg() or the ->unlink method.
 */
int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
               struct dentry *dentry, struct inode **delegated_inode)
{
        struct inode *target = dentry->d_inode;
        int error = may_delete(idmap, dir, dentry, 0);

        if (error)
                return error;

        if (!dir->i_op->unlink)
                return -EPERM;

        /* All remaining checks and the unlink itself run with the victim locked. */
        inode_lock(target);
        if (IS_SWAPFILE(target))
                error = -EPERM;
        else if (is_local_mountpoint(dentry))
                error = -EBUSY;
        else {
                error = security_inode_unlink(dir, dentry);
                if (!error) {
                        /* A pending delegation aborts the unlink; see above. */
                        error = try_break_deleg(target, delegated_inode);
                        if (error)
                                goto out;
                        error = dir->i_op->unlink(dir, dentry);
                        if (!error) {
                                dont_mount(dentry);
                                detach_mounts(dentry);
                        }
                }
        }
out:
        inode_unlock(target);

        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
        if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
                fsnotify_unlink(dir, dentry);
        } else if (!error) {
                fsnotify_link_count(target);
                d_delete_notify(dir, dentry);
        }

        return error;
}
EXPORT_SYMBOL(vfs_unlink);

/*
 * Unlink the non-directory object named by @name, resolved relative to @dfd.
 *
 * Make sure that the actual truncation of the file will occur outside its
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 *
 * @name is released with putname() on every exit path.  Returns 0 on success
 * or a negative errno.
 */
int do_unlinkat(int dfd, struct filename *name)
{
        int error;
        struct dentry *dentry;
        struct path path;
        struct qstr last;
        int type;
        struct inode *inode = NULL;
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0;
retry:
        error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
        if (error)
                goto exit1;

        /* Any last component other than a normal name is refused as -EISDIR. */
        error = -EISDIR;
        if (type != LAST_NORM)
                goto exit2;

        error = mnt_want_write(path.mnt);
        if (error)
                goto exit2;
retry_deleg:
        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {

                /* Why not before? Because we want correct error value */
                if (last.name[last.len] || d_is_negative(dentry))
                        goto slashes;
                inode = dentry->d_inode;
                /*
                 * Take an extra inode reference so that the matching iput()
                 * below runs only after the parent has been unlocked.
                 */
                ihold(inode);
                error = security_path_unlink(&path, dentry);
                if (error)
                        goto exit3;
                error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
                                   dentry, &delegated_inode);
exit3:
                dput(dentry);
        }
        inode_unlock(path.dentry->d_inode);
        if (inode)
                iput(inode);        /* truncate the inode here */
        inode = NULL;
        /*
         * vfs_unlink() reported a delegation: wait for it to be broken with
         * no locks held, then redo the lookup and unlink.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        mnt_drop_write(path.mnt);
exit2:
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                inode = NULL;
                goto retry;
        }
exit1:
        putname(name);
        return error;

        /*
         * Reached only by goto: pick the errno for a missing victim or for a
         * name with characters after its last component, then rejoin the
         * common unwind at exit3.
         */
slashes:
        if (d_is_negative(dentry))
                error = -ENOENT;
        else if (d_is_dir(dentry))
                error = -EISDIR;
        else
                error = -ENOTDIR;
        goto exit3;
}

/*
 * unlinkat(2) entry point.  AT_REMOVEDIR is the only flag accepted; when it
 * is set the request is handled by do_rmdir(), otherwise by do_unlinkat().
 * Any other flag bit yields -EINVAL before the path is looked at.
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
        struct filename *name;

        if (flag & ~AT_REMOVEDIR)
                return -EINVAL;

        name = getname(pathname);

        return (flag & AT_REMOVEDIR) ? do_rmdir(dfd, name)
                                     : do_unlinkat(dfd, name);
}

/*
 * unlink(2) entry point: wrap the user-supplied path with getname() and let
 * do_unlinkat() remove it, with the lookup anchored at AT_FDCWD.
 */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
        struct filename *name = getname(pathname);

        return do_unlinkat(AT_FDCWD, name);
}

/**
 * vfs_symlink - create symlink
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the symlink to be created
 * @oldname:        name of the file to link to
 *
 * Create a symlink named by @dentry inside @dir whose body is @oldname.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 on success; -EPERM if @dir has no ->symlink method; otherwise
 * the error reported by may_create(), security_inode_symlink() or the
 * ->symlink method.
 */
int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
                struct dentry *dentry, const char *oldname)
{
        int err = may_create(idmap, dir, dentry);

        if (err)
                return err;

        if (!dir->i_op->symlink)
                return -EPERM;

        err = security_inode_symlink(dir, dentry, oldname);
        if (err)
                return err;

        err = dir->i_op->symlink(idmap, dir, dentry, oldname);
        if (err)
                return err;

        /* Creation is reported only once the filesystem has succeeded. */
        fsnotify_create(dir, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_symlink);

int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
{
        int error;
        struct dentry *dentry;
        struct path path;
        unsigned int lookup_flags = 0;

        if (IS_ERR(from)) {
                error = PTR_ERR(from);
                goto out_putnames;
        }
retry:
        dentry = filename_create(newdfd, to, &path, lookup_flags);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto out_putnames;

        error = security_path_symlink(&path, dentry, from->name);
        if (!error)
                error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
                                    dentry, from->name);
        done_path_create(&path, dentry);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out_putnames:
        putname(to);
        putname(from);
        return error;
}

/*
 * symlinkat(2) entry point: @oldname is the link body, @newname the path of
 * the symlink to create relative to @newdfd.  Both user strings are wrapped
 * with getname() and passed to do_symlinkat().
 */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
                int, newdfd, const char __user *, newname)
{
        return do_symlinkat(getname(oldname), newdfd, getname(newname));
}

/*
 * symlink(2) entry point: same as symlinkat(2) with the new name resolved
 * relative to AT_FDCWD.
 */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
        return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname));
}

/**
 * vfs_link - create a new link
 * @old_dentry:        object to be linked
 * @idmap:        idmap of the mount
 * @dir:        new parent
 * @new_dentry:        where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 on success; -ENOENT if @old_dentry has no inode or the inode is
 * already unlinked and not I_LINKABLE; -EXDEV if @dir lives on a different
 * superblock; -EPERM for the refusals listed in the body; -EMLINK if the
 * inode already has at least s_max_links links; otherwise the error reported
 * by may_create(), security_inode_link(), try_break_deleg() or the ->link
 * method.
 */
int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
             struct inode *dir, struct dentry *new_dentry,
             struct inode **delegated_inode)
{
        struct inode *inode = old_dentry->d_inode;
        unsigned int link_limit = dir->i_sb->s_max_links;
        int err;

        if (!inode)
                return -ENOENT;

        err = may_create(idmap, dir, new_dentry);
        if (err)
                return err;

        if (dir->i_sb != inode->i_sb)
                return -EXDEV;

        /*
         * All of the following are refused with -EPERM:
         *  - a link to an append-only or immutable file;
         *  - an inode whose i_uid/i_gid true value is unknown to the vfs,
         *    since updating the link count will likely cause them to be
         *    written back improperly;
         *  - a parent without a ->link method;
         *  - a directory as the link source.
         */
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
            HAS_UNMAPPED_ID(idmap, inode) ||
            !dir->i_op->link ||
            S_ISDIR(inode->i_mode))
                return -EPERM;

        err = security_inode_link(old_dentry, dir, new_dentry);
        if (err)
                return err;

        inode_lock(inode);
        if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) {
                /* Make sure we don't allow creating hardlink to an unlinked file */
                err = -ENOENT;
        } else if (link_limit && inode->i_nlink >= link_limit) {
                err = -EMLINK;
        } else {
                err = try_break_deleg(inode, delegated_inode);
                if (!err)
                        err = dir->i_op->link(old_dentry, dir, new_dentry);
        }

        /* Once a link has been made, I_LINKABLE is dropped under i_lock. */
        if (!err && (inode->i_state & I_LINKABLE)) {
                spin_lock(&inode->i_lock);
                inode->i_state &= ~I_LINKABLE;
                spin_unlock(&inode->i_lock);
        }
        inode_unlock(inode);

        if (err)
                return err;

        fsnotify_link(dir, inode, new_dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_link);

/*
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 *
 * Create a hard link at @new (relative to @newdfd) to the object named by
 * @old (relative to @olddfd).  Only AT_SYMLINK_FOLLOW and AT_EMPTY_PATH are
 * accepted in @flags; anything else yields -EINVAL.  Both names are released
 * with putname() on every exit path.  Returns 0 or a negative errno.
 */
int do_linkat(int olddfd, struct filename *old, int newdfd,
              struct filename *new, int flags)
{
        struct mnt_idmap *idmap;
        struct dentry *new_dentry;
        struct path old_path, new_path;
        struct inode *delegated_inode = NULL;
        int how = 0;
        int error;

        if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) {
                error = -EINVAL;
                goto out_putnames;
        }
        /*
         * To use null names we require CAP_DAC_READ_SEARCH or
         * that the open-time creds of the dfd matches current.
         * This ensures that not everyone will be able to create
         * a hardlink using the passed file descriptor.
         */
        if (flags & AT_EMPTY_PATH)
                how |= LOOKUP_LINKAT_EMPTY;

        if (flags & AT_SYMLINK_FOLLOW)
                how |= LOOKUP_FOLLOW;
retry:
        error = filename_lookup(olddfd, old, how, &old_path, NULL);
        if (error)
                goto out_putnames;

        /* Only the LOOKUP_REVAL bit of @how is forwarded to the create lookup. */
        new_dentry = filename_create(newdfd, new, &new_path,
                                        (how & LOOKUP_REVAL));
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto out_putpath;

        /* Source and destination must be on the same mount. */
        error = -EXDEV;
        if (old_path.mnt != new_path.mnt)
                goto out_dput;
        idmap = mnt_idmap(new_path.mnt);
        error = may_linkat(idmap, &old_path);
        if (unlikely(error))
                goto out_dput;
        error = security_path_link(old_path.dentry, &new_path, new_dentry);
        if (error)
                goto out_dput;
        error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
                         new_dentry, &delegated_inode);
out_dput:
        done_path_create(&new_path, new_dentry);
        /*
         * vfs_link() reported a delegation: wait for it to be broken, then
         * drop the old path and start over from the lookup.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error) {
                        path_put(&old_path);
                        goto retry;
                }
        }
        if (retry_estale(error, how)) {
                path_put(&old_path);
                how |= LOOKUP_REVAL;
                goto retry;
        }
out_putpath:
        path_put(&old_path);
out_putnames:
        putname(old);
        putname(new);

        return error;
}

/*
 * linkat(2) entry point.  The old name is wrapped with getname_uflags() so
 * that @flags can influence how it is fetched; the new name uses plain
 * getname().  Flag validation is left to do_linkat().
 */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname, int, flags)
{
        return do_linkat(olddfd, getname_uflags(oldname, flags),
                newdfd, getname(newname), flags);
}

/*
 * link(2) entry point: do_linkat() with both names resolved relative to
 * AT_FDCWD and no flags.
 */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
        return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0);
}

/**
 * vfs_rename - rename a filesystem object
 * @rd:                pointer to &struct renamedata info
 *
 * The caller must hold multiple mutexes--see lock_rename().
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
 *
 *        a) we can get into loop creation.
 *        b) race potential - two innocent renames can create a loop together.
 *           That's where 4.4BSD screws up. Current fix: serialization on
 *           sb->s_vfs_rename_mutex. We might be more accurate, but that's another
 *           story.
 *        c) we may have to lock up to _four_ objects - parents and victim (if it exists),
 *           and source (if it's a non-directory or a subdirectory that moves to
 *           different parent).
 *           And that - after we got ->i_mutex on parents (until then we don't know
 *           whether the target exists).  Solution: try to be smart with locking
 *           order for inodes.  We rely on the fact that tree topology may change
 *           only under ->s_vfs_rename_mutex _and_ that parent of the object we
 *           move will be locked.  Thus we can rank directories by the tree
 *           (ancestors first) and rank all non-directories after them.
 *           That works since everybody except rename does "lock parent, lookup,
 *           lock child" and rename is under ->s_vfs_rename_mutex.
 *           HOWEVER, it relies on the assumption that any object with ->lookup()
 *           has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *           we'd better make sure that there's no link(2) for them.
 *        d) conversion from fhandle to dentry may come in the wrong moment - when
 *           we are removing the target. Solution: we will have to grab ->i_mutex
 *           in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
 *           ->i_mutex on parents, which works but leads to some truly excessive
 *           locking].
 *
 * Return: 0 on success (including when source and target are the same
 * inode, in which case nothing is done) or a negative errno.
 */
int vfs_rename(struct renamedata *rd)
{
        int error;
        struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
        struct dentry *old_dentry = rd->old_dentry;
        struct dentry *new_dentry = rd->new_dentry;
        struct inode **delegated_inode = rd->delegated_inode;
        unsigned int flags = rd->flags;
        bool is_dir = d_is_dir(old_dentry);
        struct inode *source = old_dentry->d_inode;
        struct inode *target = new_dentry->d_inode;
        bool new_is_dir = false;
        unsigned max_links = new_dir->i_sb->s_max_links;
        struct name_snapshot old_name;
        bool lock_old_subdir, lock_new_subdir;

        /* Both names already refer to the same inode: nothing to do. */
        if (source == target)
                return 0;

        error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir);
        if (error)
                return error;

        /*
         * No target: the new name is being created.  Otherwise the target is
         * checked as a victim; for a plain rename it must match the source's
         * directory-ness, for an exchange its own directory-ness is used.
         */
        if (!target) {
                error = may_create(rd->new_mnt_idmap, new_dir, new_dentry);
        } else {
                new_is_dir = d_is_dir(new_dentry);

                if (!(flags & RENAME_EXCHANGE))
                        error = may_delete(rd->new_mnt_idmap, new_dir,
                                           new_dentry, is_dir);
                else
                        error = may_delete(rd->new_mnt_idmap, new_dir,
                                           new_dentry, new_is_dir);
        }
        if (error)
                return error;

        if (!old_dir->i_op->rename)
                return -EPERM;

        /*
         * If we are going to change the parent - check write permissions,
         * we'll need to flip '..'.
         */
        if (new_dir != old_dir) {
                if (is_dir) {
                        error = inode_permission(rd->old_mnt_idmap, source,
                                                 MAY_WRITE);
                        if (error)
                                return error;
                }
                if ((flags & RENAME_EXCHANGE) && new_is_dir) {
                        error = inode_permission(rd->new_mnt_idmap, target,
                                                 MAY_WRITE);
                        if (error)
                                return error;
                }
        }

        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
                                      flags);
        if (error)
                return error;

        /*
         * Snapshot the old name for the fsnotify_move() call at the end, and
         * hold a reference on the new dentry until the dput() after "out".
         */
        take_dentry_name_snapshot(&old_name, old_dentry);
        dget(new_dentry);
        /*
         * Lock children.
         * The source subdirectory needs to be locked on cross-directory
         * rename or cross-directory exchange since its parent changes.
         * The target subdirectory needs to be locked on cross-directory
         * exchange due to parent change and on any rename due to becoming
         * a victim.
         * Non-directories need locking in all cases (for NFS reasons);
         * they get locked after any subdirectories (in inode address order).
         *
         * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
         * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
         */
        lock_old_subdir = new_dir != old_dir;
        lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE);
        if (is_dir) {
                if (lock_old_subdir)
                        inode_lock_nested(source, I_MUTEX_CHILD);
                if (target && (!new_is_dir || lock_new_subdir))
                        inode_lock(target);
        } else if (new_is_dir) {
                if (lock_new_subdir)
                        inode_lock_nested(target, I_MUTEX_CHILD);
                inode_lock(source);
        } else {
                lock_two_nondirectories(source, target);
        }

        error = -EPERM;
        if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
                goto out;

        error = -EBUSY;
        if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
                goto out;

        /*
         * Link-count limit: only checked for cross-directory moves that put
         * a directory under a parent where no directory is being replaced.
         */
        if (max_links && new_dir != old_dir) {
                error = -EMLINK;
                if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
                        goto out;
                if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
                    old_dir->i_nlink >= max_links)
                        goto out;
        }
        /* Delegations are broken on non-directories only. */
        if (!is_dir) {
                error = try_break_deleg(source, delegated_inode);
                if (error)
                        goto out;
        }
        if (target && !new_is_dir) {
                error = try_break_deleg(target, delegated_inode);
                if (error)
                        goto out;
        }
        error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry,
                                      new_dir, new_dentry, flags);
        if (error)
                goto out;

        /* A replaced target is dead now: no more mounts on it. */
        if (!(flags & RENAME_EXCHANGE) && target) {
                if (is_dir) {
                        shrink_dcache_parent(new_dentry);
                        target->i_flags |= S_DEAD;
                }
                dont_mount(new_dentry);
                detach_mounts(new_dentry);
        }
        /* Update the dcache unless the filesystem says it did so itself. */
        if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
                if (!(flags & RENAME_EXCHANGE))
                        d_move(old_dentry, new_dentry);
                else
                        d_exchange(old_dentry, new_dentry);
        }
out:
        /* Release exactly the child locks taken in the locking step above. */
        if (!is_dir || lock_old_subdir)
                inode_unlock(source);
        if (target && (!new_is_dir || lock_new_subdir))
                inode_unlock(target);
        dput(new_dentry);
        if (!error) {
                fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
                              !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
                if (flags & RENAME_EXCHANGE) {
                        fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
                                      new_is_dir, NULL, new_dentry);
                }
        }
        release_dentry_name_snapshot(&old_name);

        return error;
}
EXPORT_SYMBOL(vfs_rename);

/*
 * Rename @from (relative to @olddfd) to @to (relative to @newdfd).
 *
 * @flags may contain RENAME_NOREPLACE, RENAME_EXCHANGE and RENAME_WHITEOUT;
 * any other bit, or RENAME_EXCHANGE combined with either of the other two,
 * yields -EINVAL.  Both parents are resolved, locked with lock_rename(), the
 * two last components are looked up and vfs_rename() does the work.  The
 * sequence is restarted after a successful delegation break, or from the
 * parent lookups with LOOKUP_REVAL when retry_estale() asks for it.
 *
 * Both names are released with putname() on every exit path.  Returns 0 on
 * success or a negative errno.
 */
int do_renameat2(int olddfd, struct filename *from, int newdfd,
                 struct filename *to, unsigned int flags)
{
        struct renamedata rd;
        struct dentry *old_dentry, *new_dentry;
        struct dentry *trap;
        struct path old_path, new_path;
        struct qstr old_last, new_last;
        int old_type, new_type;
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
        bool should_retry = false;
        int error = -EINVAL;

        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                goto put_names;

        if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
            (flags & RENAME_EXCHANGE))
                goto put_names;

        /* An exchange looks up the second name without LOOKUP_RENAME_TARGET. */
        if (flags & RENAME_EXCHANGE)
                target_flags = 0;

retry:
        error = filename_parentat(olddfd, from, lookup_flags, &old_path,
                                  &old_last, &old_type);
        if (error)
                goto put_names;

        error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
                                  &new_type);
        if (error)
                goto exit1;

        /* Both names must live on the same mount. */
        error = -EXDEV;
        if (old_path.mnt != new_path.mnt)
                goto exit2;

        /*
         * Neither last component may be anything but a normal name: -EBUSY,
         * except that a bad new name under RENAME_NOREPLACE gives -EEXIST.
         */
        error = -EBUSY;
        if (old_type != LAST_NORM)
                goto exit2;

        if (flags & RENAME_NOREPLACE)
                error = -EEXIST;
        if (new_type != LAST_NORM)
                goto exit2;

        error = mnt_want_write(old_path.mnt);
        if (error)
                goto exit2;

retry_deleg:
        trap = lock_rename(new_path.dentry, old_path.dentry);
        if (IS_ERR(trap)) {
                error = PTR_ERR(trap);
                goto exit_lock_rename;
        }

        old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
                                          lookup_flags);
        error = PTR_ERR(old_dentry);
        if (IS_ERR(old_dentry))
                goto exit3;
        /* source must exist */
        error = -ENOENT;
        if (d_is_negative(old_dentry))
                goto exit4;
        new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
                                          lookup_flags | target_flags);
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto exit4;
        error = -EEXIST;
        if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
                goto exit5;
        /* An exchange needs an existing target as well. */
        if (flags & RENAME_EXCHANGE) {
                error = -ENOENT;
                if (d_is_negative(new_dentry))
                        goto exit5;

                if (!d_is_dir(new_dentry)) {
                        error = -ENOTDIR;
                        if (new_last.name[new_last.len])
                                goto exit5;
                }
        }
        /* unless the source is a directory trailing slashes give -ENOTDIR */
        if (!d_is_dir(old_dentry)) {
                error = -ENOTDIR;
                if (old_last.name[old_last.len])
                        goto exit5;
                if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
                        goto exit5;
        }
        /* source should not be ancestor of target */
        error = -EINVAL;
        if (old_dentry == trap)
                goto exit5;
        /* target should not be an ancestor of source */
        if (!(flags & RENAME_EXCHANGE))
                error = -ENOTEMPTY;
        if (new_dentry == trap)
                goto exit5;

        error = security_path_rename(&old_path, old_dentry,
                                     &new_path, new_dentry, flags);
        if (error)
                goto exit5;

        rd.old_dir           = old_path.dentry->d_inode;
        rd.old_dentry           = old_dentry;
        rd.old_mnt_idmap   = mnt_idmap(old_path.mnt);
        rd.new_dir           = new_path.dentry->d_inode;
        rd.new_dentry           = new_dentry;
        rd.new_mnt_idmap   = mnt_idmap(new_path.mnt);
        rd.delegated_inode = &delegated_inode;
        rd.flags           = flags;
        error = vfs_rename(&rd);
        /* The labels below unwind in reverse order of acquisition. */
exit5:
        dput(new_dentry);
exit4:
        dput(old_dentry);
exit3:
        unlock_rename(new_path.dentry, old_path.dentry);
exit_lock_rename:
        /*
         * vfs_rename() reported a delegation: wait for it to be broken with
         * the rename locks dropped, then retake them and try again.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        mnt_drop_write(old_path.mnt);
exit2:
        /* Remember the retry decision; both paths are put before acting on it. */
        if (retry_estale(error, lookup_flags))
                should_retry = true;
        path_put(&new_path);
exit1:
        path_put(&old_path);
        if (should_retry) {
                should_retry = false;
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
put_names:
        putname(from);
        putname(to);
        return error;
}

/*
 * renameat2(2) entry point: wrap both user-supplied paths with getname() and
 * pass them, with @flags unchanged, to do_renameat2().
 */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname, unsigned int, flags)
{
        return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
                                flags);
}

/*
 * renameat(2) entry point: do_renameat2() with no flags.
 */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname)
{
        return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
                                0);
}

/*
 * rename(2) entry point: do_renameat2() with both names resolved relative to
 * AT_FDCWD and no flags.
 */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
        return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
                                getname(newname), 0);
}

/*
 * Copy a symlink body to a user buffer.
 *
 * @buffer: destination in user memory
 * @buflen: capacity of @buffer; the copy is truncated to this many bytes
 * @link:   NUL-terminated link body, or an ERR_PTR() value
 *
 * Returns the number of bytes copied (no terminating NUL is written),
 * the error encoded in @link if it is an error pointer, or -EFAULT when
 * copy_to_user() reports a failure.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link)
{
        int nbytes;

        /* Propagate an error handed to us in place of a string. */
        if (IS_ERR(link))
                return PTR_ERR(link);

        nbytes = strlen(link);
        if (nbytes > (unsigned) buflen)
                nbytes = buflen;

        if (copy_to_user(buffer, link, nbytes))
                return -EFAULT;

        return nbytes;
}

/**
 * vfs_readlink - copy symlink body into userspace buffer
 * @dentry: dentry on which to get symbolic link
 * @buffer: user memory pointer
 * @buflen: size of buffer
 *
 * Does not touch atime.  That's up to the caller if necessary
 *
 * Does not call security hook.
 *
 * Return: the result of the inode's ->readlink() method when it has one;
 * -EINVAL when the dentry is not a symlink; the error returned by
 * ->get_link(); otherwise whatever readlink_copy() returns.
 */
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        struct inode *inode = d_inode(dentry);
        DEFINE_DELAYED_CALL(done);
        const char *link;
        int res;

        /*
         * Slow path, taken until the inode has been marked as using the
         * default readlink: defer to ->readlink() if the filesystem has
         * one, reject non-symlinks, then record IOP_DEFAULT_READLINK under
         * i_lock so later calls skip these checks.
         */
        if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
                if (unlikely(inode->i_op->readlink))
                        return inode->i_op->readlink(dentry, buffer, buflen);

                if (!d_is_symlink(dentry))
                        return -EINVAL;

                spin_lock(&inode->i_lock);
                inode->i_opflags |= IOP_DEFAULT_READLINK;
                spin_unlock(&inode->i_lock);
        }

        /* Use the body cached in i_link when present, else ask ->get_link(). */
        link = READ_ONCE(inode->i_link);
        if (!link) {
                link = inode->i_op->get_link(dentry, inode, &done);
                if (IS_ERR(link))
                        return PTR_ERR(link);
        }
        res = readlink_copy(buffer, buflen, link);
        /* Run whatever cleanup ->get_link() registered in @done. */
        do_delayed_call(&done);
        return res;
}
EXPORT_SYMBOL(vfs_readlink);

/**
 * vfs_get_link - get symlink body
 * @dentry: dentry on which to get symbolic link
 * @done: caller needs to free returned data with this
 *
 * Runs the security_inode_readlink() hook and, if it allows the access,
 * i_op->get_link() on the inode behind @dentry.
 *
 * It does not touch atime.  That's up to the caller if necessary.
 *
 * Does not work on "special" symlinks like /proc/$$/fd/N
 *
 * Return: the link body from ->get_link(), ERR_PTR(-EINVAL) when @dentry
 * is not a symlink, or the security hook's error wrapped in ERR_PTR().
 */
const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
{
        struct inode *inode = d_inode(dentry);
        const char *denied;

        if (!d_is_symlink(dentry))
                return ERR_PTR(-EINVAL);

        /* A zero return from the hook becomes a NULL pointer here. */
        denied = ERR_PTR(security_inode_readlink(dentry));
        if (denied)
                return denied;

        return inode->i_op->get_link(dentry, inode, done);
}
EXPORT_SYMBOL(vfs_get_link);

/*
 * Get the link contents through the page cache.
 *
 * With a dentry, page 0 of the inode's mapping is read in via
 * read_mapping_page().  Without one, only a page that is already present
 * and uptodate is accepted; otherwise -ECHILD is returned.
 *
 * On success the page reference is handed to @callback (page_put_link)
 * and the NUL-terminated page contents are returned.
 */
const char *page_get_link(struct dentry *dentry, struct inode *inode,
                          struct delayed_call *callback)
{
        struct address_space *mapping = inode->i_mapping;
        struct page *page;
        char *body;

        if (dentry) {
                page = read_mapping_page(mapping, 0, NULL);
                if (IS_ERR(page))
                        return (char*)page;
        } else {
                page = find_get_page(mapping, 0);
                if (!page)
                        return ERR_PTR(-ECHILD);
                if (!PageUptodate(page)) {
                        /* Drop the reference find_get_page() took. */
                        put_page(page);
                        return ERR_PTR(-ECHILD);
                }
        }

        set_delayed_call(callback, page_put_link, page);
        /* page_address() below is used without a kmap. */
        BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
        body = page_address(page);
        nd_terminate_link(body, inode->i_size, PAGE_SIZE - 1);
        return body;
}

EXPORT_SYMBOL(page_get_link);

/*
 * Delayed-call callback installed by page_get_link(): @arg is the page
 * whose reference is to be dropped.
 */
void page_put_link(void *arg)
{
        struct page *page = arg;

        put_page(page);
}
EXPORT_SYMBOL(page_put_link);

/*
 * readlink for page-cache backed symlinks: fetch the body with
 * page_get_link(), copy it to the user buffer, then run the delayed call
 * that page_get_link() may have armed.
 *
 * Returns whatever readlink_copy() returns (it also handles an error
 * pointer coming back from page_get_link()).
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        DEFINE_DELAYED_CALL(done);
        const char *body;
        int copied;

        body = page_get_link(dentry, d_inode(dentry), &done);
        copied = readlink_copy(buffer, buflen, body);
        do_delayed_call(&done);
        return copied;
}
EXPORT_SYMBOL(page_readlink);

/*
 * Store a symlink body in page 0 of @inode's mapping.
 *
 * @inode:   inode that will hold the link
 * @symname: link body
 * @len:     length passed by the caller; len - 1 bytes are written
 *
 * The write goes through the mapping's ->write_begin()/->write_end()
 * pair and is repeated for as long as ->write_end() reports a short
 * write.  If the mapping's gfp mask lacks __GFP_FS, ->write_begin() is
 * called inside a memalloc_nofs section.
 *
 * Returns 0 on success or the negative error from either callback.
 */
int page_symlink(struct inode *inode, const char *symname, int len)
{
        struct address_space *mapping = inode->i_mapping;
        const struct address_space_operations *aops = mapping->a_ops;
        bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
        const int nbytes = len - 1;
        struct page *page;
        void *fsdata = NULL;
        unsigned int flags;
        int err;

        for (;;) {
                if (nofs)
                        flags = memalloc_nofs_save();
                err = aops->write_begin(NULL, mapping, 0, nbytes, &page,
                                        &fsdata);
                if (nofs)
                        memalloc_nofs_restore(flags);
                if (err)
                        return err;

                memcpy(page_address(page), symname, nbytes);

                err = aops->write_end(NULL, mapping, 0, nbytes, nbytes,
                                      page, fsdata);
                if (err < 0)
                        return err;
                /* Short write: start over. */
                if (err >= nbytes)
                        break;
        }

        mark_inode_dirty(inode);
        return 0;
}
EXPORT_SYMBOL(page_symlink);

/*
 * Ready-made inode operations for symlinks whose body is read through
 * page_get_link(); only ->get_link is set.
 */
const struct inode_operations page_symlink_inode_operations = {
        .get_link        = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);










   35 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_KERNEL_FPU_INTERNAL_H
#define __X86_KERNEL_FPU_INTERNAL_H

extern struct fpstate init_fpstate;

/* CPU feature check wrappers */
/* True when the X86_FEATURE_XSAVE CPU feature is enabled. */
static __always_inline __pure bool use_xsave(void)
{
        return cpu_feature_enabled(X86_FEATURE_XSAVE);
}

/* True when the X86_FEATURE_FXSR CPU feature is enabled. */
static __always_inline __pure bool use_fxsr(void)
{
        return cpu_feature_enabled(X86_FEATURE_FXSR);
}

/*
 * FPU debug assertion.  With CONFIG_X86_DEBUG_FPU it is WARN_ON_ONCE();
 * otherwise the argument is still evaluated but the expression always
 * yields 0.
 */
#ifdef CONFIG_X86_DEBUG_FPU
# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
#else
# define WARN_ON_FPU(x) ({ (void)(x); 0; })
#endif

/* Used in init.c */
extern void fpstate_init_user(struct fpstate *fpstate);
extern void fpstate_reset(struct fpu *fpu);

#endif



































































































































































































   14 







    5 






























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
/* SPDX-License-Identifier: GPL-2.0 */
/* rwsem.h: R/W semaphores, public interface
 *
 * Written by David Howells (dhowells@redhat.com).
 * Derived from asm-i386/semaphore.h
 */

#ifndef _LINUX_RWSEM_H
#define _LINUX_RWSEM_H

#include <linux/linkage.h>

#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/err.h>
#include <linux/cleanup.h>

/*
 * Initializer fragment for the lockdep map embedded in an rwsem: names the
 * map after the lock and sets its inner wait type to LD_WAIT_SLEEP.
 * Expands to nothing without CONFIG_DEBUG_LOCK_ALLOC.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define __RWSEM_DEP_MAP_INIT(lockname)                        \
        .dep_map = {                                        \
                .name = #lockname,                        \
                .wait_type_inner = LD_WAIT_SLEEP,        \
        },
#else
# define __RWSEM_DEP_MAP_INIT(lockname)
#endif

#ifndef CONFIG_PREEMPT_RT

#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#include <linux/osq_lock.h>
#endif

/*
 * For an uncontended rwsem, count and owner are the only fields a task
 * needs to touch when acquiring the rwsem. So they are put next to each
 * other to increase the chance that they will share the same cacheline.
 *
 * In a contended rwsem, the owner is likely the most frequently accessed
 * field in the structure as the optimistic waiter that holds the osq lock
 * will spin on owner. For an embedded rwsem, other hot fields in the
 * containing structure should be moved further away from the rwsem to
 * reduce the chance that they will share the same cacheline causing
 * cacheline bouncing problem.
 */
struct rw_semaphore {
        /* Lock state word; equals RWSEM_UNLOCKED_VALUE when nobody holds it. */
        atomic_long_t count;
        /*
         * Write owner or one of the read owners as well as flags regarding
         * the current state of the rwsem. Can be used as a speculative
         * check to see if the write owner is running on the cpu.
         */
        atomic_long_t owner;
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
        struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
        raw_spinlock_t wait_lock;
        /* Non-empty when rwsem_is_contended() reports contention. */
        struct list_head wait_list;
#ifdef CONFIG_DEBUG_RWSEMS
        /* Set to the address of the rwsem itself by __RWSEM_DEBUG_INIT(). */
        void *magic;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
#endif
};

/* Value of ->count when the rwsem is not held. */
#define RWSEM_UNLOCKED_VALUE                0UL
/* Bit 0 of ->count; checked by rwsem_assert_held_write_nolockdep(). */
#define RWSEM_WRITER_LOCKED                (1UL << 0)
/* Initializer fragment that starts ->count out as unlocked. */
#define __RWSEM_COUNT_INIT(name)        .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE)

/* Non-zero when the count word of @sem differs from RWSEM_UNLOCKED_VALUE. */
static inline int rwsem_is_locked(struct rw_semaphore *sem)
{
        long count = atomic_long_read(&sem->count);

        return count != RWSEM_UNLOCKED_VALUE;
}

/* Warn if @sem's count word says the rwsem is not held at all. */
static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem)
{
        long count = atomic_long_read(&sem->count);

        WARN_ON(count == RWSEM_UNLOCKED_VALUE);
}

/* Warn if the RWSEM_WRITER_LOCKED bit is clear in @sem's count word. */
static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem)
{
        long count = atomic_long_read(&sem->count);

        WARN_ON(!(count & RWSEM_WRITER_LOCKED));
}

/* Common initializer macros and functions */

/* With CONFIG_DEBUG_RWSEMS, point ->magic at the lock itself; else nothing. */
#ifdef CONFIG_DEBUG_RWSEMS
# define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname,
#else
# define __RWSEM_DEBUG_INIT(lockname)
#endif

/* With CONFIG_RWSEM_SPIN_ON_OWNER, start ->osq unlocked; else nothing. */
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED,
#else
#define __RWSEM_OPT_INIT(lockname)
#endif

/*
 * Static initializer for a struct rw_semaphore called @name: unlocked
 * count, zero owner, unlocked wait_lock, empty wait_list, plus the
 * optional osq/magic/dep_map fragments selected by the config.
 */
#define __RWSEM_INITIALIZER(name)                                \
        { __RWSEM_COUNT_INIT(name),                                \
          .owner = ATOMIC_LONG_INIT(0),                                \
          __RWSEM_OPT_INIT(name)                                \
          .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\
          .wait_list = LIST_HEAD_INIT((name).wait_list),        \
          __RWSEM_DEBUG_INIT(name)                                \
          __RWSEM_DEP_MAP_INIT(name) }

/* Define a struct rw_semaphore variable @name, statically initialized. */
#define DECLARE_RWSEM(name) \
        struct rw_semaphore name = __RWSEM_INITIALIZER(name)

extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
                         struct lock_class_key *key);

/*
 * Run-time initialization of @sem.  Each expansion declares its own static
 * lock_class_key and passes the stringified @sem expression as the name
 * to __init_rwsem().
 */
#define init_rwsem(sem)                                                \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __init_rwsem((sem), #sem, &__key);                        \
} while (0)

/*
 * This is the same regardless of which rwsem implementation is being used.
 * It is just a heuristic meant to be called by somebody already holding the
 * rwsem to see if somebody of an incompatible type is waiting for access to
 * the lock.
 *
 * Returns non-zero when the wait list of @sem is not empty.
 */
static inline int rwsem_is_contended(struct rw_semaphore *sem)
{
        return !list_empty(&sem->wait_list);
}

#else /* !CONFIG_PREEMPT_RT */

#include <linux/rwbase_rt.h>

/* PREEMPT_RT variant: the rwsem is a wrapper around struct rwbase_rt. */
struct rw_semaphore {
        struct rwbase_rt        rwbase;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
#endif
};

/*
 * Static initializer for the PREEMPT_RT rwsem: initializes the embedded
 * rwbase and, when configured, the lockdep map.
 */
#define __RWSEM_INITIALIZER(name)                                \
        {                                                        \
                .rwbase = __RWBASE_INITIALIZER(name),                \
                __RWSEM_DEP_MAP_INIT(name)                        \
        }

/* Define a struct rw_semaphore variable @lockname, statically initialized. */
#define DECLARE_RWSEM(lockname) \
        struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)

extern void  __init_rwsem(struct rw_semaphore *rwsem, const char *name,
                          struct lock_class_key *key);

/*
 * Run-time initialization of @sem (PREEMPT_RT build).  Each expansion
 * declares its own static lock_class_key and passes the stringified @sem
 * expression as the name to __init_rwsem().
 */
#define init_rwsem(sem)                                                \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __init_rwsem((sem), #sem, &__key);                        \
} while (0)

/* PREEMPT_RT: report the locked state of the underlying rwbase. */
static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem)
{
        return rw_base_is_locked(&sem->rwbase);
}

/* PREEMPT_RT: warn if rwsem_is_locked() says @sem is not held. */
static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem)
{
        WARN_ON(!rwsem_is_locked(sem));
}

/* PREEMPT_RT: warn if the underlying rwbase is not write-locked. */
static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem)
{
        WARN_ON(!rw_base_is_write_locked(&sem->rwbase));
}

/* PREEMPT_RT: contention is whatever the underlying rwbase reports. */
static __always_inline int rwsem_is_contended(struct rw_semaphore *sem)
{
        return rw_base_is_contended(&sem->rwbase);
}

#endif /* CONFIG_PREEMPT_RT */

/*
 * The functions below are the same for all rwsem implementations including
 * the RT specific variant.
 */

/*
 * Assert that @sem is held.  Uses lockdep when CONFIG_LOCKDEP is enabled,
 * otherwise falls back to the implementation's own state check.
 */
static inline void rwsem_assert_held(const struct rw_semaphore *sem)
{
        if (!IS_ENABLED(CONFIG_LOCKDEP)) {
                rwsem_assert_held_nolockdep(sem);
                return;
        }

        lockdep_assert_held(sem);
}

/*
 * Assert that @sem is held for writing.  Uses lockdep when CONFIG_LOCKDEP
 * is enabled, otherwise falls back to the implementation's own state check.
 */
static inline void rwsem_assert_held_write(const struct rw_semaphore *sem)
{
        if (!IS_ENABLED(CONFIG_LOCKDEP)) {
                rwsem_assert_held_write_nolockdep(sem);
                return;
        }

        lockdep_assert_held_write(sem);
}

/*
 * lock for reading
 */
extern void down_read(struct rw_semaphore *sem);
extern int __must_check down_read_interruptible(struct rw_semaphore *sem);
extern int __must_check down_read_killable(struct rw_semaphore *sem);

/*
 * trylock for reading -- returns 1 if successful, 0 if contention
 */
extern int down_read_trylock(struct rw_semaphore *sem);

/*
 * lock for writing
 */
extern void down_write(struct rw_semaphore *sem);
extern int __must_check down_write_killable(struct rw_semaphore *sem);

/*
 * trylock for writing -- returns 1 if successful, 0 if contention
 */
extern int down_write_trylock(struct rw_semaphore *sem);

/*
 * release a read lock
 */
extern void up_read(struct rw_semaphore *sem);

/*
 * release a write lock
 */
extern void up_write(struct rw_semaphore *sem);

/*
 * Guard definitions for use with <linux/cleanup.h>.
 *
 * rwsem_read:  acquire with down_read(), release with up_read().
 *   _try  variant acquires with down_read_trylock().
 *   _intr variant acquires with down_read_interruptible(); acquisition
 *         counts as successful when that returns 0.
 */
DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T))
DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T))
DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T) == 0)

/*
 * rwsem_write: acquire with down_write(), release with up_write().
 *   _try variant acquires with down_write_trylock().
 */
DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))
DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))

/*
 * downgrade write lock to read lock
 */
extern void downgrade_write(struct rw_semaphore *sem);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * nested locking. NOTE: rwsems are not allowed to recurse
 * (which occurs if the same task tries to acquire the same
 * lock instance multiple times), but multiple locks of the
 * same lock class might be taken, if the order of the locks
 * is always the same. This ordering rule can be expressed
 * to lockdep via the _nested() APIs, by enumerating the
 * subclasses that are used. (If the nesting relationship is
 * static then another method for expressing nested locking is
 * the explicit definition of lock class keys and the use of
 * lockdep_set_class() at lock initialization time.
 * See Documentation/locking/lockdep-design.rst for more details.)
 */
extern void down_read_nested(struct rw_semaphore *sem, int subclass);
extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass);
extern void down_write_nested(struct rw_semaphore *sem, int subclass);
extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass);
extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock);

/*
 * Write-lock @sem, passing the dep_map of @nest_lock to
 * _down_write_nest_lock().  The typecheck() makes the build complain
 * if @nest_lock has no ->dep_map of type struct lockdep_map.
 */
# define down_write_nest_lock(sem, nest_lock)                        \
do {                                                                \
        typecheck(struct lockdep_map *, &(nest_lock)->dep_map);        \
        _down_write_nest_lock(sem, &(nest_lock)->dep_map);        \
} while (0)

/*
 * Take/release a lock when not the owner will release it.
 *
 * [ This API should be avoided as much as possible - the
 *   proper abstraction for this case is completions. ]
 */
extern void down_read_non_owner(struct rw_semaphore *sem);
extern void up_read_non_owner(struct rw_semaphore *sem);
#else
/*
 * Without CONFIG_DEBUG_LOCK_ALLOC the nested and non-owner variants map
 * straight onto the plain operations; the subclass and nest_lock
 * arguments are dropped.
 */
# define down_read_nested(sem, subclass)                down_read(sem)
# define down_read_killable_nested(sem, subclass)        down_read_killable(sem)
# define down_write_nest_lock(sem, nest_lock)        down_write(sem)
# define down_write_nested(sem, subclass)        down_write(sem)
# define down_write_killable_nested(sem, subclass)        down_write_killable(sem)
# define down_read_non_owner(sem)                down_read(sem)
# define up_read_non_owner(sem)                        up_read(sem)
#endif

#endif /* _LINUX_RWSEM_H */
































































































    2 


    1 
    2 






















































































































































































   11 



   11 




    2 
   12 


   11 





   11 







   11 





   11 

















































   11 










   12 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
// SPDX-License-Identifier: GPL-2.0
/*
 *  mm/pgtable-generic.c
 *
 *  Generic pgtable methods declared in linux/pgtable.h
 *
 *  Copyright (C) 2010  Linus Torvalds
 */

#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mm_inline.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

/* Report the bad pgd entry with pgd_ERROR(), then clear it. */
void pgd_clear_bad(pgd_t *pgd)
{
        pgd_ERROR(*pgd);
        pgd_clear(pgd);
}

#ifndef __PAGETABLE_P4D_FOLDED
/* Report the bad p4d entry with p4d_ERROR(), then clear it. */
void p4d_clear_bad(p4d_t *p4d)
{
        p4d_ERROR(*p4d);
        p4d_clear(p4d);
}
#endif

#ifndef __PAGETABLE_PUD_FOLDED
/* Report the bad pud entry with pud_ERROR(), then clear it. */
void pud_clear_bad(pud_t *pud)
{
        pud_ERROR(*pud);
        pud_clear(pud);
}
#endif

/*
 * Note that the pmd variant below can't be stub'ed out just as for p4d/pud
 * above. pmd folding is special and typically pmd_* macros refer to upper
 * level even when folded
 */
/* Report the bad pmd entry with pmd_ERROR(), then clear it. */
void pmd_clear_bad(pmd_t *pmd)
{
        pmd_ERROR(*pmd);
        pmd_clear(pmd);
}

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Only sets the access flags (dirty, accessed), as well as write
 * permission. Furthermore, we know it always gets set to a "more
 * permissive" setting, which allows most architectures to optimize
 * this. We return whether the PTE actually changed, which in turn
 * instructs the caller to do things like update_mmu_cache.  This
 * used to be done in the caller, but sparc needs minor faults to
 * force that call on sun4c so we changed this macro slightly.
 *
 * Returns 1 when the PTE was rewritten, 0 when it already matched @entry.
 * @dirty is not used by this generic version.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        /* Nothing to do if the PTE already holds the requested value. */
        if (pte_same(ptep_get(ptep), entry))
                return 0;

        set_pte_at(vma->vm_mm, address, ptep, entry);
        flush_tlb_fix_spurious_fault(vma, address, ptep);
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
/*
 * Test-and-clear the young bit of the PTE at @address and, if it was set,
 * flush the TLB entry for that page.  Returns the previous young state.
 */
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        int was_young = ptep_test_and_clear_young(vma, address, ptep);

        if (!was_young)
                return 0;

        flush_tlb_page(vma, address);
        return was_young;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
/*
 * Clear the PTE at @address and return its old value.  The TLB entry is
 * flushed only when pte_accessible() says the old PTE could be cached.
 */
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
                       pte_t *ptep)
{
        struct mm_struct *mm = vma->vm_mm;
        pte_t old = ptep_get_and_clear(mm, address, ptep);

        if (pte_accessible(mm, old))
                flush_tlb_page(vma, address);

        return old;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
/*
 * Huge-pmd counterpart of ptep_set_access_flags(): install @entry if it
 * differs from the current pmd and flush the huge-page TLB range.
 * @address must be HPAGE_PMD aligned.  Returns 1 if the pmd was rewritten,
 * 0 otherwise.  @dirty is not used by this generic version.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int differs = !pmd_same(*pmdp, entry);

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        if (!differs)
                return 0;

        set_pmd_at(vma->vm_mm, address, pmdp, entry);
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
/*
 * Test-and-clear the young bit of the huge pmd at @address (which must be
 * HPAGE_PMD aligned) and flush the huge-page TLB range if it was set.
 * Returns the previous young state.
 */
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int was_young;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        was_young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (!was_young)
                return 0;

        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return was_young;
}
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
/*
 * Clear the huge pmd at @address, flush the huge-page TLB range and
 * return the old pmd value.  @address must be HPAGE_PMD aligned, and a
 * present pmd must be either trans-huge or devmap.
 */
pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp)
{
        pmd_t old;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
                  !pmd_devmap(*pmdp));

        old = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return old;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Clear the huge pud at @address, flush the huge-page TLB range and
 * return the old pud value.  @address must be HPAGE_PUD aligned and the
 * pud must be either trans-huge or devmap.
 */
pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                            pud_t *pudp)
{
        pud_t old;

        VM_BUG_ON(address & ~HPAGE_PUD_MASK);
        VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));

        old = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
        flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);

        return old;
}
#endif
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
/*
 * Deposit @pgtable with the huge pmd at @pmdp.  The pmd lock must be
 * held.  The new table becomes the head recorded in pmd_huge_pte(); any
 * previously deposited table is linked behind it through ->lru.
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
{
        pgtable_t prev;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        /* FIFO */
        prev = pmd_huge_pte(mm, pmdp);
        if (prev)
                list_add(&pgtable->lru, &prev->lru);
        else
                INIT_LIST_HEAD(&pgtable->lru);

        pmd_huge_pte(mm, pmdp) = pgtable;
}
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
/*
 * Take back the page table most recently recorded in pmd_huge_pte() for
 * @pmdp.  The pmd lock must be held.  The next deposited table, if any,
 * becomes the new head and the withdrawn one is unlinked from the list.
 *
 * There is no "address" argument, so this destroys the page coloring of
 * some architectures.
 */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pgtable_t withdrawn;
        pgtable_t next;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        /* FIFO */
        withdrawn = pmd_huge_pte(mm, pmdp);
        next = list_first_entry_or_null(&withdrawn->lru, struct page, lru);
        pmd_huge_pte(mm, pmdp) = next;
        if (next)
                list_del(&withdrawn->lru);

        return withdrawn;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
/*
 * Replace the pmd at @pmdp with its pmd_mkinvalid() form, flush the
 * huge-page TLB range and return the value that was there before.
 * Warns once if the pmd is not present on entry.
 */
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
{
        VM_WARN_ON_ONCE(!pmd_present(*pmdp));
        pmd_t invalid = pmd_mkinvalid(*pmdp);
        pmd_t previous = pmdp_establish(vma, address, pmdp, invalid);

        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return previous;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
/*
 * Generic fallback: warn once if the pmd is not present, then do exactly
 * what pmdp_invalidate() does and return its result.
 */
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                         pmd_t *pmdp)
{
        VM_WARN_ON_ONCE(!pmd_present(*pmdp));
        return pmdp_invalidate(vma, address, pmdp);
}
#endif

#ifndef pmdp_collapse_flush
/*
 * Clear the (non-huge) pmd at @address during collapse and flush the
 * TLB for the PTE range it covered.  @address must be HPAGE_PMD aligned
 * and the pmd must not be trans-huge.  Returns the old pmd value.
 */
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
{
        pmd_t old;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_trans_huge(*pmdp));

        /*
         * pmd and hugepage pte format are same. So we could
         * use the same function.
         */
        old = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);

        /* collapse entails shooting down ptes not pmd */
        flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return old;
}
#endif

/* arch define pte_free_defer in asm/pgalloc.h for its own implementation */
#ifndef pte_free_defer
static void pte_free_now(struct rcu_head *head)
{
        struct page *page;

        page = container_of(head, struct page, rcu_head);
        pte_free(NULL /* mm not passed and not used */, (pgtable_t)page);
}

/*
 * Queue @pgtable for freeing via call_rcu(); pte_free_now() does the
 * actual free.  @mm is not used by this generic version.
 */
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
        struct page *table_page = pgtable;

        call_rcu(&table_page->rcu_head, pte_free_now);
}
#endif /* pte_free_defer */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \
        (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU))
/*
 * See the comment above ptep_get_lockless() in include/linux/pgtable.h:
 * the barriers in pmdp_get_lockless() cannot guarantee that the value in
 * pmd_high actually belongs with the value in pmd_low; but holding interrupts
 * off blocks the TLB flush between present updates, which guarantees that a
 * successful __pte_offset_map() points to a page from matched halves.
 */
/* Disable local interrupts and return the saved flags for the _end() call. */
static unsigned long pmdp_get_lockless_start(void)
{
        unsigned long irqflags;

        local_irq_save(irqflags);
        return irqflags;
}
/* Restore the interrupt state saved by pmdp_get_lockless_start(). */
static void pmdp_get_lockless_end(unsigned long irqflags)
{
        local_irq_restore(irqflags);
}
#else
/* Other configurations: no interrupt masking, both helpers are no-ops. */
static unsigned long pmdp_get_lockless_start(void) { return 0; }
static void pmdp_get_lockless_end(unsigned long irqflags) { }
#endif

/*
 * Map the pte for @addr under @pmd without taking the page table lock.
 *
 * @pmd:     pointer to the pmd entry to read.
 * @addr:    address handed to __pte_map() to locate the pte.
 * @pmdvalp: if non-NULL, receives the pmd value that was read, whether or
 *           not the mapping succeeds.
 *
 * Returns the pte pointer from __pte_map() with rcu_read_lock() still held,
 * or NULL (after rcu_read_unlock()) when the pmd value read is none, a
 * migration entry, a transparent huge or devmap pmd, or bad.
 */
pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
{
        unsigned long irqflags;
        pmd_t pmdval;

        /* Taken before *pmd is read; dropped here only on the nomap path. */
        rcu_read_lock();
        /* The start/end helpers bracket just the lockless read of *pmd. */
        irqflags = pmdp_get_lockless_start();
        pmdval = pmdp_get_lockless(pmd);
        pmdp_get_lockless_end(irqflags);

        /* Report the snapshot to the caller even if we fail below. */
        if (pmdvalp)
                *pmdvalp = pmdval;
        /* Nothing mapped here, or a migration entry: no page table. */
        if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
                goto nomap;
        /* A huge or devmap pmd is not a page table either. */
        if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
                goto nomap;
        if (unlikely(pmd_bad(pmdval))) {
                /* pmd_clear_bad() is given the live entry, not the snapshot. */
                pmd_clear_bad(pmd);
                goto nomap;
        }
        /* Map from the snapshot rather than re-reading *pmd. */
        return __pte_map(&pmdval, addr);
nomap:
        rcu_read_unlock();
        return NULL;
}

/*
 * Map the pte for @addr via __pte_offset_map() and, on success, store in
 * *@ptlp the lock pointer that pte_lockptr() derives from the same pmd
 * value that was used for the mapping.  The lock itself is not taken.
 *
 * Returns the pte pointer, or NULL with *@ptlp left untouched.
 */
pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
                             unsigned long addr, spinlock_t **ptlp)
{
        pmd_t pmdval;
        pte_t *pte = __pte_offset_map(pmd, addr, &pmdval);

        if (unlikely(!pte))
                return NULL;

        *ptlp = pte_lockptr(mm, &pmdval);
        return pte;
}

/*
 * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
 * __pte_offset_map_lock() below, is usually called with the pmd pointer for
 * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while
 * holding mmap_lock or vma lock for read or for write; or in truncate or rmap
 * context, while holding file's i_mmap_lock or anon_vma lock for read (or for
 * write). In a few cases, it may be used with pmd pointing to a pmd_t already
 * copied to or constructed on the stack.
 *
 * When successful, it returns the pte pointer for addr, with its page table
 * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent
 * modification by software, with a pointer to that spinlock in ptlp (in some
 * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's
 * struct page).  pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards.
 *
 * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no
 * page table at *pmd: if, for example, the page table has just been removed,
 * or replaced by the huge pmd of a THP.  (When successful, *pmd is rechecked
 * after acquiring the ptlock, and retried internally if it changed: so that a
 * page table can be safely removed or replaced by THP while holding its lock.)
 *
 * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above,
 * just returns the pte pointer for addr, its page table kmapped if necessary;
 * or NULL if there is no page table at *pmd.  It does not attempt to lock the
 * page table, so cannot normally be used when the page table is to be updated,
 * or when entries read must be stable.  But it does take rcu_read_lock(): so
 * that even when page table is racily removed, it remains a valid though empty
 * and disconnected table.  Until pte_unmap(pte) unmaps and rcu_read_unlock()s
 * afterwards.
 *
 * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
 * but when successful, it also outputs a pointer to the spinlock in ptlp - as
 * pte_offset_map_lock() does, but in this case without locking it.  This helps
 * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
 * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
 * pointer for the page table that it returns.  In principle, the caller should
 * recheck *pmd once the lock is taken; in practice, no callsite needs that -
 * either the mmap_lock for write, or pte_same() check on contents, is enough.
 *
 * Note that free_pgtables(), used after unmapping detached vmas, or when
 * exiting the whole mm, does not take page table lock before freeing a page
 * table, and may not use RCU at all: "outsiders" like khugepaged should avoid
 * pte_offset_map() and co once the vma is detached from mm or mm_users is zero.
 */
pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                             unsigned long addr, spinlock_t **ptlp)
{
        for (;;) {
                spinlock_t *ptl;
                pmd_t pmdval;
                pte_t *pte;

                pte = __pte_offset_map(pmd, addr, &pmdval);
                if (unlikely(!pte))
                        return NULL;

                ptl = pte_lockptr(mm, &pmdval);
                spin_lock(ptl);

                /* Locked: succeed only if *pmd still matches the snapshot. */
                if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
                        *ptlp = ptl;
                        return pte;
                }

                /* *pmd changed meanwhile: undo lock and mapping, then retry. */
                pte_unmap_unlock(pte, ptl);
        }
}

















































































































































































































































































































































































    1 















    2 

    1 











    2 
    2 


    2 




































































































































































































































































































































































































































































































    1 
    1 



























































































































































































































































































































































































    1 















    1 













































    1 


























    1 

    1 

































    1 

    1 




































































    1 


















































    1 



    1 




















































































































































































































































    1 












































    1 

























    1 
























































































































































    1 















    1 





































    1 











    1 





















































































































































































    1 






































    1 













    1 










    1 

















    1 























































































































    1 



    1 








    1 





    1 











    1 


    1 

    1 






    1 































    1 













    1 




    1 

    1 










    1 












    1 









    1 
















    1 
    1 





    1 

























































































































































































































































































    1 





    1 





































































































































































































































































































































































































































































































































    1 



    1 
































































































































































































































































































































    1 






    1 




























    1 
















































    1 

















    1 













































    1 


















































    1 


















    1 









    1 


    1 








    1 










































































































    3 


















    3 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 











    2 










    2 


































































































































































































































































































    1 
























































































    1 












    1 






    1 






































    1 






    1 

    1 






































    1 









    1 











































































    1 











    1 









    1 






































    1 



    1 







    1 







    1 





































































































































    1 




    1 











    2 


















    2 

















    2 




    2 










    2 
































    2 
























    1 







    1 











    3 













    3 










    3 









    2 







    2 






























































































































































































































































































































































































































































































































































































































































































































    1 





















    1 






























































































































































    1 

    1 



























































































































































































































































































































    1 
































    1 


























    1 



    1 











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 
































    1 


















    1 
















































































































































    2 










    2 

    2 

    2 

    2 

    2 
    1 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 



    3 







































































    1 








    1 




    1 
    1 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "bio.h"
#include "compression.h"
#include "locking.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "dir-item.h"
#include "file-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
#include "file.h"
#include "acl.h"
#include "relocation.h"
#include "verity.h"
#include "super.h"
#include "orphan.h"
#include "backref.h"
#include "raid-stripe-tree.h"

/*
 * Pairs an inode number with the root it belongs to.
 *
 * NOTE(review): presumably the lookup key handed to the inode cache lookup
 * helpers; the users are outside this part of the file -- confirm there.
 */
struct btrfs_iget_args {
        u64 ino;
        struct btrfs_root *root;
};

/*
 * Per-operation state for direct I/O.
 *
 * NOTE(review): the users of this structure are not visible in this part of
 * the file; the per-field notes below are read off the field names and types
 * only and should be confirmed against the direct I/O code.
 */
struct btrfs_dio_data {
        /* Presumably the number of bytes submitted so far -- TODO confirm. */
        ssize_t submitted;
        /* Presumably the changeset tracking reserved data space -- TODO confirm. */
        struct extent_changeset *data_reserved;
        /* Presumably the ordered extent backing the current write -- TODO confirm. */
        struct btrfs_ordered_extent *ordered;
        bool data_space_reserved;
        bool nocow_done;
};

/*
 * Private data wrapped around a btrfs_bio for direct I/O.
 *
 * NOTE(review): presumably allocated from btrfs_dio_bioset (declared right
 * below), which would explain why the embedded btrfs_bio has to be the final
 * member -- confirm against the bioset setup code.
 */
struct btrfs_dio_private {
        /* Range of I/O */
        u64 file_offset;
        u32 bytes;

        /* This must be last */
        struct btrfs_bio bbio;
};

static struct bio_set btrfs_dio_bioset;

/*
 * Context passed along a rename operation.
 *
 * NOTE(review): the producer and consumer of @index are outside this part of
 * the file.
 */
struct btrfs_rename_ctx {
        /* Output field. Stores the index number of the old directory entry. */
        u64 index;
};

/*
 * Used by data_reloc_print_warning_inode() to pass needed info for filename
 * resolution and output of error message.
 */
struct data_reloc_warn {
        /*
         * Scratch path: the callback hands it to btrfs_search_slot() and
         * init_ipath(), releasing it in between.
         */
        struct btrfs_path path;
        /* Filesystem the messages are reported against and roots are looked up in. */
        struct btrfs_fs_info *fs_info;
        /* Filled from the extent item key offset by print_data_reloc_error(). */
        u64 extent_item_size;
        /* Logical address of the failing data, printed in every message. */
        u64 logical;
        /* Mirror number that failed verification, printed in every message. */
        int mirror_num;
};

/*
 * For the file_extent_tree, we want to hold the inode lock when we lookup and
 * update the disk_i_size, but lockdep will complain because our io_tree we hold
 * the tree lock and get the inode lock when setting delalloc. These two things
 * are unrelated, so make a class for the file_extent_tree so we don't get the
 * two locking patterns mixed up.
 */
static struct lock_class_key file_extent_tree_class;

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;

static struct kmem_cache *btrfs_inode_cachep;

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);

static noinline int run_delalloc_cow(struct btrfs_inode *inode,
                                     struct page *locked_page, u64 start,
                                     u64 end, struct writeback_control *wbc,
                                     bool pages_dirty);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
                                       u64 len, u64 orig_start, u64 block_start,
                                       u64 block_len, u64 orig_block_len,
                                       u64 ram_bytes, int compress_type,
                                       int type);

/*
 * Print one checksum-error warning per path of an inode that references the
 * failing extent.  Passed as the per-inode callback to iterate_extent_inodes()
 * by print_data_reloc_error().
 *
 * @inum:      inode number referencing the extent
 * @offset:    offset reported in the message
 * @num_bytes: unused here
 * @root:      id of the root the inode lives in
 * @warn_ctx:  struct data_reloc_warn carrying fs_info, logical address,
 *             mirror number and a scratch path
 *
 * Returns 0 when all resolved paths were printed.  Any failure (root lookup,
 * inode item lookup, ipath allocation, path resolution) is reported with a
 * generic message that lacks the file name, and the failing call's return
 * value is passed back to the caller.
 *
 * The reference on the root obtained from btrfs_get_fs_root() is dropped on
 * every exit path that follows a successful lookup.
 */
static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
                                          u64 root, void *warn_ctx)
{
        struct data_reloc_warn *warn = warn_ctx;
        struct btrfs_fs_info *fs_info = warn->fs_info;
        struct extent_buffer *eb;
        struct btrfs_inode_item *inode_item;
        struct inode_fs_paths *ipath = NULL;
        struct btrfs_root *local_root;
        struct btrfs_key key;
        unsigned int nofs_flag;
        u32 nlink;
        int ret;

        local_root = btrfs_get_fs_root(fs_info, root, true);
        if (IS_ERR(local_root)) {
                /* No reference was taken, so there is nothing to put. */
                ret = PTR_ERR(local_root);
                goto err;
        }

        /* This makes the path point to (inum INODE_ITEM ioff). */
        key.objectid = inum;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
        if (ret) {
                btrfs_put_root(local_root);
                btrfs_release_path(&warn->path);
                goto err;
        }

        /* Only the link count is needed from the inode item. */
        eb = warn->path.nodes[0];
        inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
        nlink = btrfs_inode_nlink(eb, inode_item);
        btrfs_release_path(&warn->path);

        /* The ipath allocation is done under a NOFS scope. */
        nofs_flag = memalloc_nofs_save();
        ipath = init_ipath(4096, local_root, &warn->path);
        memalloc_nofs_restore(nofs_flag);
        if (IS_ERR(ipath)) {
                btrfs_put_root(local_root);
                ret = PTR_ERR(ipath);
                ipath = NULL;
                /*
                 * -ENOMEM, not a critical error, just output an generic error
                 * without filename.
                 */
                btrfs_warn(fs_info,
"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
                           warn->logical, warn->mirror_num, root, inum, offset);
                return ret;
        }
        ret = paths_from_inode(inum, ipath);
        if (ret < 0) {
                /*
                 * The err label only frees ipath, so the root reference has to
                 * be dropped here or it would leak.
                 */
                btrfs_put_root(local_root);
                goto err;
        }

        /*
         * We deliberately ignore the bit ipath might have been too small to
         * hold all of the paths here
         */
        for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
                btrfs_warn(fs_info,
"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
                           warn->logical, warn->mirror_num, root, inum, offset,
                           fs_info->sectorsize, nlink,
                           (char *)(unsigned long)ipath->fspath->val[i]);
        }

        btrfs_put_root(local_root);
        free_ipath(ipath);
        return 0;

err:
        /* Reached with ipath either NULL or valid, and no root reference held. */
        btrfs_warn(fs_info,
"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
                   warn->logical, warn->mirror_num, root, inum, offset, ret);

        free_ipath(ipath);
        return ret;
}

/*
 * Do extra user-friendly error output (e.g. lookup all the affected files).
 *
 * Nothing is returned.  A "csum failed" message is always printed; the extent
 * and backref lookups that follow only add detail (the owning tree for
 * metadata, the referencing inodes for data), and a failed lookup is reported
 * with its own message.
 *
 * @inode:         inode the failing read was issued against
 * @file_off:      file offset of the failing data within @inode
 * @csum:          checksum that was computed
 * @csum_expected: checksum that was expected
 * @mirror_num:    mirror that returned the bad data
 */
static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
                                   const u8 *csum, const u8 *csum_expected,
                                   int mirror_num)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_path path = { 0 };
        struct btrfs_key found_key = { 0 };
        struct extent_buffer *eb;
        struct btrfs_extent_item *ei;
        const u32 csum_size = fs_info->csum_size;
        u64 logical;
        u64 flags;
        u32 item_size;
        int ret;

        /* reloc_mutex is held only while reading the relocation bytenr. */
        mutex_lock(&fs_info->reloc_mutex);
        logical = btrfs_get_reloc_bg_bytenr(fs_info);
        mutex_unlock(&fs_info->reloc_mutex);

        /* U64_MAX is treated as "no relocation running": plain message only. */
        if (logical == U64_MAX) {
                btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
                btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
                        btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
                        CSUM_FMT_VALUE(csum_size, csum),
                        CSUM_FMT_VALUE(csum_size, csum_expected),
                        mirror_num);
                return;
        }

        /*
         * The logical address is derived as relocation bytenr + file offset.
         * NOTE(review): presumably file offsets in the data reloc inode are
         * relative to the start of the block group being relocated -- confirm.
         */
        logical += file_off;
        btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
                        btrfs_root_id(inode->root),
                        btrfs_ino(inode), file_off, logical,
                        CSUM_FMT_VALUE(csum_size, csum),
                        CSUM_FMT_VALUE(csum_size, csum_expected),
                        mirror_num);

        /* Locate the extent item covering the logical address. */
        ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
        if (ret < 0) {
                btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
                             logical, ret);
                return;
        }
        eb = path.nodes[0];
        ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
        item_size = btrfs_item_size(eb, path.slots[0]);
        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                /* Tree block: report every tree backref found in the item. */
                unsigned long ptr = 0;
                u64 ref_root;
                u8 ref_level;

                while (true) {
                        ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
                                                      item_size, &ref_root,
                                                      &ref_level);
                        if (ret < 0) {
                                btrfs_warn_rl(fs_info,
                                "failed to resolve tree backref for logical %llu: %d",
                                              logical, ret);
                                break;
                        }
                        /* A positive return ends the walk without an error message. */
                        if (ret > 0)
                                break;

                        btrfs_warn_rl(fs_info,
"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
                                logical, mirror_num,
                                (ref_level ? "node" : "leaf"),
                                ref_level, ref_root);
                }
                /* The leaf is read inside the loop, so release only afterwards. */
                btrfs_release_path(&path);
        } else {
                /* Data extent: walk the referencing inodes and print per-file warnings. */
                struct btrfs_backref_walk_ctx ctx = { 0 };
                struct data_reloc_warn reloc_warn = { 0 };

                /* Only found_key is needed from here on. */
                btrfs_release_path(&path);

                ctx.bytenr = found_key.objectid;
                ctx.extent_item_pos = logical - found_key.objectid;
                ctx.fs_info = fs_info;

                reloc_warn.logical = logical;
                reloc_warn.extent_item_size = found_key.offset;
                reloc_warn.mirror_num = mirror_num;
                reloc_warn.fs_info = fs_info;

                /* The return value is not checked; the callback prints its own errors. */
                iterate_extent_inodes(&ctx, true,
                                      data_reloc_print_warning_inode, &reloc_warn);
        }
}

/*
 * Emit a rate-limited warning for a data checksum mismatch.
 *
 * @inode:         inode whose data failed verification
 * @logical_start: offset printed as "off" in the message
 * @csum:          checksum that was computed
 * @csum_expected: checksum that was expected
 * @mirror_num:    mirror the data was read from
 *
 * Inodes of the data relocation tree are handed to print_data_reloc_error(),
 * which does a backref lookup instead.  For every other root a single message
 * is printed; only the conversion used for the root and inode ids differs.
 */
static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
                u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        const u32 csum_size = fs_info->csum_size;

        /* For data reloc tree, it's better to do a backref lookup instead. */
        if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID) {
                print_data_reloc_error(inode, logical_start, csum,
                                       csum_expected, mirror_num);
                return;
        }

        /* Root ids below BTRFS_LAST_FREE_OBJECTID are printed unsigned. */
        if (btrfs_root_id(root) < BTRFS_LAST_FREE_OBJECTID) {
                btrfs_warn_rl(fs_info,
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
                        btrfs_root_id(root), btrfs_ino(inode),
                        logical_start,
                        CSUM_FMT_VALUE(csum_size, csum),
                        CSUM_FMT_VALUE(csum_size, csum_expected),
                        mirror_num);
                return;
        }

        /*
         * Remaining root ids are printed with signed conversions.
         * NOTE(review): presumably so the special trees show up as small
         * negative numbers rather than huge unsigned ones -- confirm.
         */
        btrfs_warn_rl(fs_info,
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
                btrfs_root_id(root), btrfs_ino(inode),
                logical_start,
                CSUM_FMT_VALUE(csum_size, csum),
                CSUM_FMT_VALUE(csum_size, csum_expected),
                mirror_num);
}

/*
 * Lock inode i_rwsem based on arguments passed.
 *
 * ilock_flags can have the following bit set:
 *
 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
 *                     return -EAGAIN
 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
 */
int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
{
        if (ilock_flags & BTRFS_ILOCK_SHARED) {
                if (ilock_flags & BTRFS_ILOCK_TRY) {
                        if (!inode_trylock_shared(&inode->vfs_inode))
                                return -EAGAIN;
                        else
                                return 0;
                }
                inode_lock_shared(&inode->vfs_inode);
        } else {
                if (ilock_flags & BTRFS_ILOCK_TRY) {
                        if (!inode_trylock(&inode->vfs_inode))
                                return -EAGAIN;
                        else
                                return 0;
                }
                inode_lock(&inode->vfs_inode);
        }
        if (ilock_flags & BTRFS_ILOCK_MMAP)
                down_write(&inode->i_mmap_lock);
        return 0;
}

/*
 * Unlock inode i_rwsem.
 *
 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
 * to decide whether the lock acquired is shared or exclusive.  When
 * BTRFS_ILOCK_MMAP is set, i_mmap_lock is released first, i.e. in the reverse
 * order of acquisition.
 */
void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
{
        struct inode *vfs_inode = &inode->vfs_inode;

        if (ilock_flags & BTRFS_ILOCK_MMAP)
                up_write(&inode->i_mmap_lock);

        if (ilock_flags & BTRFS_ILOCK_SHARED) {
                inode_unlock_shared(vfs_inode);
                return;
        }

        inode_unlock(vfs_inode);
}

/*
 * Cleanup all submitted ordered extents in specified range to handle errors
 * from the btrfs_run_delalloc_range() callback.
 *
 * @inode:       inode owning the range
 * @locked_page: page the caller still holds locked, or NULL
 * @offset:      start of the range in bytes
 * @bytes:       length of the range in bytes
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
                                                 struct page *locked_page,
                                                 u64 offset, u64 bytes)
{
        struct address_space *mapping = inode->vfs_inode.i_mapping;
        const unsigned long last_index = (offset + bytes - 1) >> PAGE_SHIFT;
        unsigned long cur;
        u64 locked_start = 0;
        u64 locked_end = 0;

        if (locked_page) {
                locked_start = page_offset(locked_page);
                locked_end = locked_start + PAGE_SIZE - 1;
        }

        for (cur = offset >> PAGE_SHIFT; cur <= last_index; cur++) {
                struct page *page;

                /*
                 * The locked page is left alone here: the error handling in
                 * run_delalloc_range() calls btrfs_mark_ordered_io_finished()
                 * on it, which clears page Ordered and runs the ordered
                 * extent accounting.
                 *
                 * Clearing the Ordered bit now would make
                 * btrfs_mark_ordered_io_finished() skip the accounting for
                 * that page range, and the ordered extent would never finish.
                 */
                if (locked_page && cur == (locked_start >> PAGE_SHIFT))
                        continue;

                page = find_get_page(mapping, cur);
                if (!page)
                        continue;

                /*
                 * Clear the Ordered bits of every other page in the range;
                 * the ordered extent accounting itself is done by the
                 * btrfs_mark_ordered_io_finished() call at the end.
                 */
                btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
                                                page_folio(page), offset, bytes);
                put_page(page);
        }

        if (locked_page) {
                /* The locked page covers the full range, nothing needs to be done */
                if (bytes + offset <= locked_start + PAGE_SIZE)
                        return;

                /*
                 * When the locked page lies entirely inside the range, start
                 * right after it: that page is cleaned up by the caller of
                 * run_delalloc_range.
                 */
                if (locked_start >= offset && locked_end <= (offset + bytes - 1)) {
                        bytes = offset + bytes - locked_start - PAGE_SIZE;
                        offset = locked_start + PAGE_SIZE;
                }
        }

        btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
}

static int btrfs_dirty_inode(struct btrfs_inode *inode);

/*
 * Apply ACLs and security xattrs to a newly created inode.
 *
 * @trans: transaction handle the changes are made in
 * @args:  new inode arguments; reads ->inode, ->dir, ->dentry, ->default_acl
 *         and ->acl
 *
 * The default ACL is set first, then the access ACL.  If neither is present
 * the inode is marked as having no ACL via cache_no_acl().  Finally the
 * security xattrs are initialized.
 *
 * Returns 0 on success or the first error returned by __btrfs_set_acl() or
 * btrfs_xattr_security_init().
 */
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct btrfs_new_inode_args *args)
{
        int ret;

        if (args->default_acl) {
                ret = __btrfs_set_acl(trans, args->inode, args->default_acl,
                                      ACL_TYPE_DEFAULT);
                if (ret)
                        return ret;
        }

        if (args->acl) {
                ret = __btrfs_set_acl(trans, args->inode, args->acl,
                                      ACL_TYPE_ACCESS);
                if (ret)
                        return ret;
        }

        if (!(args->default_acl || args->acl))
                cache_no_acl(args->inode);

        return btrfs_xattr_security_init(trans, args->inode, args->dir,
                                         &args->dentry->d_name);
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 *
 * @trans:            transaction handle
 * @path:             path used for the item; released before returning on the
 *                    paths that reach btrfs_release_path() below
 * @inode:            inode receiving the inline extent
 * @extent_inserted:  when false, an empty EXTENT_DATA item at offset 0 is
 *                    inserted here; when true @path is used as it is
 *                    (presumably already pointing at the item -- confirm
 *                    against the caller)
 * @size:             uncompressed data length, stored as ram_bytes
 * @compressed_size:  length of the compressed data, 0 when uncompressed
 * @compress_type:    BTRFS_COMPRESS_* value stored in the item
 * @compressed_folio: folio holding the compressed data, NULL when uncompressed
 * @update_i_size:    raise i_size to @size if it is currently smaller
 *
 * Returns 0 on success, or the error from btrfs_insert_empty_item() or
 * btrfs_inode_set_file_extent_range().
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path,
                                struct btrfs_inode *inode, bool extent_inserted,
                                size_t size, size_t compressed_size,
                                int compress_type,
                                struct folio *compressed_folio,
                                bool update_i_size)
{
        struct btrfs_root *root = inode->root;
        struct extent_buffer *leaf;
        struct page *page = NULL;
        const u32 sectorsize = trans->fs_info->sectorsize;
        char *kaddr;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        int ret;
        size_t cur_size = size;
        u64 i_size;

        /*
         * The decompressed size must still be no larger than a sector.  Under
         * heavy race, we can have size == 0 passed in, but that shouldn't be a
         * big deal and we can continue the insertion.
         */
        ASSERT(size <= sectorsize);

        /*
         * The compressed size also needs to be no larger than a sector.
         * That's also why we only need one page as the parameter.
         */
        if (compressed_folio)
                ASSERT(compressed_size <= sectorsize);
        else
                ASSERT(compressed_size == 0);

        /* The item payload is the compressed data when there is any. */
        if (compressed_size && compressed_folio)
                cur_size = compressed_size;

        if (!extent_inserted) {
                struct btrfs_key key;
                size_t datasize;

                key.objectid = btrfs_ino(inode);
                key.offset = 0;
                key.type = BTRFS_EXTENT_DATA_KEY;

                datasize = btrfs_file_extent_calc_inline_size(cur_size);
                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              datasize);
                if (ret)
                        goto fail;
        }
        /* Fill in the file extent item header in the leaf. */
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                /* Copy the compressed bytes from the start of the folio. */
                kaddr = kmap_local_folio(compressed_folio, 0);
                write_extent_buffer(leaf, kaddr, ptr, compressed_size);
                kunmap_local(kaddr);

                btrfs_set_file_extent_compression(leaf, ei,
                                                  compress_type);
        } else {
                /*
                 * Copy the data straight from page index 0 of the mapping.
                 * NOTE(review): the lookup result is not checked for NULL;
                 * presumably the caller holds that page locked (see the
                 * i_size comment below) -- confirm.
                 */
                page = find_get_page(inode->vfs_inode.i_mapping, 0);
                btrfs_set_file_extent_compression(leaf, ei, 0);
                kaddr = kmap_local_page(page);
                write_extent_buffer(leaf, kaddr, ptr, size);
                kunmap_local(kaddr);
                put_page(page);
        }
        btrfs_mark_buffer_dirty(trans, leaf);
        btrfs_release_path(path);

        /*
         * We align size to sectorsize for inline extents just for simplicity
         * sake.
         */
        ret = btrfs_inode_set_file_extent_range(inode, 0,
                                        ALIGN(size, root->fs_info->sectorsize));
        if (ret)
                goto fail;

        /*
         * We're an inline extent, so nobody can extend the file past i_size
         * without locking a page we already have locked.
         *
         * We must do any i_size and inode updates before we unlock the pages.
         * Otherwise we could end up racing with unlink.
         */
        i_size = i_size_read(&inode->vfs_inode);
        if (update_i_size && size > i_size) {
                i_size_write(&inode->vfs_inode, size);
                i_size = size;
        }
        /* disk_i_size follows the (possibly just raised) in-memory i_size. */
        inode->disk_i_size = i_size;

fail:
        return ret;
}

/*
 * Decide whether a range may be stored as an inline extent.
 *
 * @inode:           inode the range belongs to
 * @offset:          start of the range
 * @size:            uncompressed length of the range
 * @compressed_size: compressed length, or 0 when the data is not compressed
 *
 * Returns true only if every condition checked below holds.
 */
static bool can_cow_file_range_inline(struct btrfs_inode *inode,
                                      u64 offset, u64 size,
                                      size_t compressed_size)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        /* Bytes that would actually be stored in the item. */
        const u64 payload = compressed_size ? compressed_size : size;

        /*
         * Layout constraints:
         *
         * - Inline extents must start at offset 0.
         *
         * - Subpage (sectorsize != PAGE_SIZE) never gets inline extents.  Due
         *   to the page size limit, writeback there can only be triggered for
         *   the dirty sectors of a page, so it may write back more than was
         *   asked for.  Call sites like fallocate only increase i_size after
         *   everything is done, so an inline extent could be created even
         *   when it was not wanted.
         *
         * - Inline extents are limited to sectorsize.
         */
        if (offset != 0 ||
            fs_info->sectorsize != PAGE_SIZE ||
            size > fs_info->sectorsize)
                return false;

        /*
         * Size limits on the stored bytes: the maximum inline data size, and
         * the user specified max_inline.
         */
        if (payload > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
            payload > fs_info->max_inline)
                return false;

        /* Inline extents must be the entirety of the file. */
        return size >= i_size_read(&inode->vfs_inode);
}

/*
 * Insert an inline extent at the start of @inode inside a joined transaction.
 *
 * No eligibility checks are done here: when used directly, the caller must
 * already have obtained true from can_cow_file_range_inline() for the range.
 *
 * @inode:            inode receiving the inline extent
 * @offset:           start of the range; not used in the body
 *                    (can_cow_file_range_inline() only accepts 0)
 * @size:             uncompressed length of the data
 * @compressed_size:  compressed length, or 0 if the data is not compressed
 * @compress_type:    compression type passed to insert_inline_extent()
 * @compressed_folio: folio passed to insert_inline_extent() for the
 *                    compressed case
 * @update_i_size:    passed through to insert_inline_extent()
 *
 * Return 0 on success, 1 if the inline extent could not be inserted because
 * of -ENOSPC (the caller is expected to fall back to a regular extent), or a
 * negative errno on any other failure.
 */
static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
                                            u64 size, size_t compressed_size,
                                            int compress_type,
                                            struct folio *compressed_folio,
                                            bool update_i_size)
{
        struct btrfs_drop_extents_args drop_args = { 0 };
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 data_len = (compressed_size ?: size);
        int ret;
        struct btrfs_path *path;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        trans->block_rsv = &inode->block_rsv;

        /*
         * Drop whatever currently covers the first sector and ask for room
         * for the new inline item to be made at the same time.
         */
        drop_args.path = path;
        drop_args.start = 0;
        drop_args.end = fs_info->sectorsize;
        drop_args.drop_cache = true;
        drop_args.replace_extent = true;
        drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
        ret = btrfs_drop_extents(trans, root, inode, &drop_args);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
                                   size, compressed_size, compress_type,
                                   compressed_folio, update_i_size);
        if (ret == -ENOSPC) {
                /* Not fatal: tell the caller to use a regular extent. */
                ret = 1;
                goto out;
        }
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
        ret = btrfs_update_inode(trans, inode);
        if (ret == -ENOSPC) {
                /* Not fatal: tell the caller to use a regular extent. */
                ret = 1;
                goto out;
        }
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        btrfs_set_inode_full_sync(inode);
out:
        /*
         * Don't forget to free the reserved space, as for inlined extent
         * it won't count as data extent, free them directly here.
         * And at reserve time, it's always aligned to page size, so
         * just free one page here.
         *
         * If we fall back to a non-inline extent (ret == 1) because of
         * -ENOSPC, the data reservation is still needed by the regular COW
         * path, so it must be kept in that case.
         */
        if (ret <= 0)
                btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
        btrfs_free_path(path);
        btrfs_end_transaction(trans);
        return ret;
}

/*
 * Try to store the range [@offset, @end] of @inode as an inline extent.
 *
 * The inline size is the smaller of i_size and @end + 1.  If the range is not
 * eligible according to can_cow_file_range_inline(), nothing is touched and 1
 * is returned.  Otherwise the extent range is locked and
 * __cow_file_range_inline() does the insertion.
 *
 * Return value:
 *   > 0  no inline extent was created; the extent range has been unlocked
 *        and the caller has to fall back to a regular extent.
 *   <= 0 result of __cow_file_range_inline(); the delalloc bits have been
 *        cleared, the extent range unlocked and the pages unlocked with
 *        writeback started and ended.
 */
static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
                                          u64 end,
                                          size_t compressed_size,
                                          int compress_type,
                                          struct folio *compressed_folio,
                                          bool update_i_size)
{
        unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
        struct extent_state *cached_state = NULL;
        u64 size;
        int ret;

        size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
        if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
                return 1;

        lock_extent(&inode->io_tree, offset, end, &cached_state);
        ret = __cow_file_range_inline(inode, offset, size, compressed_size,
                                      compress_type, compressed_folio,
                                      update_i_size);
        if (ret <= 0) {
                /* Inline extent created, or a hard error: finish the range. */
                extent_clear_unlock_delalloc(inode, offset, end, NULL,
                                             &cached_state, clear_flags,
                                             PAGE_UNLOCK | PAGE_START_WRITEBACK |
                                             PAGE_END_WRITEBACK);
                return ret;
        }

        /* Fallback requested: only drop the extent lock taken above. */
        unlock_extent(&inode->io_tree, offset, end, &cached_state);
        return ret;
}

/*
 * One piece of work produced by compress_file_range() and consumed by
 * submit_one_async_extent().  Entries are allocated by add_async_extent().
 */
struct async_extent {
        /* File offset where this extent starts. */
        u64 start;
        /* Length of the range in the file (uncompressed bytes). */
        u64 ram_size;
        /* Compressed length; 0 is passed for uncompressed entries. */
        u64 compressed_size;
        /* Array of folios holding the compressed data, NULL if uncompressed. */
        struct folio **folios;
        /* Number of entries in @folios. */
        unsigned long nr_folios;
        /* Compression type, BTRFS_COMPRESS_NONE for uncompressed entries. */
        int compress_type;
        /* Link in async_chunk::extents. */
        struct list_head list;
};

/*
 * A file range handed to the compression work queue; compress_file_range()
 * recovers it from the embedded @work member.
 */
struct async_chunk {
        /* Inode the range belongs to. */
        struct btrfs_inode *inode;
        /*
         * Page passed on to submit_uncompressed_range() when it overlaps the
         * async extent being submitted; may be NULL.
         */
        struct page *locked_page;
        /* First byte of the range. */
        u64 start;
        /* Last byte of the range (inclusive). */
        u64 end;
        /* Flags passed to btrfs_submit_compressed_write(). */
        blk_opf_t write_flags;
        /* List of struct async_extent, filled by add_async_extent(). */
        struct list_head extents;
        /* If set, associated with the submitting kthread during submission. */
        struct cgroup_subsys_state *blkcg_css;
        /* Work item embedded for the work queue. */
        struct btrfs_work work;
        /* Presumably the async_cow whose chunks[] holds this chunk - verify. */
        struct async_cow *async_cow;
};

/*
 * Container for a variable number of async chunks.
 * NOTE(review): the users of this struct are not visible here; presumably
 * @num_chunks counts the chunks still in flight - confirm against the caller.
 */
struct async_cow {
        atomic_t num_chunks;
        /* Flexible array of chunks, sized at allocation time. */
        struct async_chunk chunks[];
};

/*
 * Allocate a struct async_extent describing the given range and append it to
 * the extent list of @cow.
 *
 * @folios and @nr_folios are stored as is; no copy of the array is made.
 *
 * Return 0 on success or -ENOMEM if the allocation failed.
 */
static noinline int add_async_extent(struct async_chunk *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct folio **folios,
                                     unsigned long nr_folios,
                                     int compress_type)
{
        struct async_extent *entry = kmalloc(sizeof(*entry), GFP_NOFS);

        if (!entry)
                return -ENOMEM;

        /* Range description. */
        entry->start = start;
        entry->ram_size = ram_size;
        entry->compressed_size = compressed_size;
        entry->compress_type = compress_type;

        /* Compressed data, if any. */
        entry->folios = folios;
        entry->nr_folios = nr_folios;

        list_add_tail(&entry->list, &cow->extents);
        return 0;
}

/*
 * Decide whether the range [@start, @end] of @inode should be sent to
 * compression, based on mount options, defragmentation, inode properties or
 * the compression heuristic.
 *
 * Return non-zero if the range should be compressed, 0 otherwise.
 */
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
                                      u64 end)
{
        struct btrfs_fs_info *info = inode->root->fs_info;

        if (!btrfs_inode_can_compress(inode)) {
                WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
                        KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
                        btrfs_ino(inode));
                return 0;
        }

        /*
         * Special check for subpage.
         *
         * The full page is locked and then each delalloc range in the page
         * is run, so the following layout hits a subpage specific corner
         * case:
         *
         * 0                32K                64K
         * |        |///////|        |///////|
         *                \- A                \- B
         *
         * Both range A and range B will try to unlock the full page
         * [0, 64K).  Whichever finishes later finds the page already
         * unlocked, triggering various page lock requirement BUG_ON()s.
         *
         * So add an artificial limit: subpage compression can only happen
         * if the range is fully page aligned.
         *
         * In theory only the first page needs to be fully covered, but the
         * trailing partial page would stay locked until the full
         * compression finishes, delaying the write of other ranges.
         *
         * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges
         * first to prevent any submitted async extent from unlocking the
         * full page.  That way only the last async_cow would unlock the
         * full page in the subpage case.
         */
        if (info->sectorsize < PAGE_SIZE &&
            !(PAGE_ALIGNED(start) && PAGE_ALIGNED(end + 1)))
                return 0;

        /* Forced compression and the defrag ioctl always compress. */
        if (btrfs_test_opt(info, FORCE_COMPRESS) || inode->defrag_compress)
                return 1;

        /* Bad compression ratios were seen on this inode before. */
        if (inode->flags & BTRFS_INODE_NOCOMPRESS)
                return 0;

        /* Nothing asks for compression on this inode. */
        if (!btrfs_test_opt(info, COMPRESS) &&
            !(inode->flags & BTRFS_INODE_COMPRESS) &&
            !inode->prop_compress)
                return 0;

        return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
}

/*
 * Queue @inode for defragmentation when [@start, @end] is a small write
 * inside EOF: @num_bytes is below @small_write and the range neither starts
 * at offset 0 nor reaches disk_i_size.
 */
static inline void inode_should_defrag(struct btrfs_inode *inode,
                u64 start, u64 end, u64 num_bytes, u32 small_write)
{
        /* Not a small write. */
        if (num_bytes >= small_write)
                return;

        /* Starts at 0 and reaches the on-disk size: not inside EOF. */
        if (start == 0 && end + 1 >= inode->disk_i_size)
                return;

        btrfs_add_inode_defrag(NULL, inode, small_write);
}

/*
 * Work queue callback that starts compression on a file range and its pages.
 *
 * This is done inside an ordered work queue, and the compression is spread
 * across many cpus.  The actual IO submission is step two, and the ordered work
 * queue takes care of making sure that happens in the same order things were
 * put onto the queue by writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an entry onto the
 * work queue to write the uncompressed bytes.  This makes sure that both
 * compressed inodes and uncompressed inodes are written in the same order that
 * the flusher thread sent them down.
 *
 * @work is embedded in a struct async_chunk, which supplies the inode and the
 * [start, end] range.  The results are queued on that chunk with
 * add_async_extent(): compressed pieces carry their folio array, and the
 * uncompressed fallback carries none.  If an inline extent is created, or
 * creating one fails with an error, nothing is queued and the folios are
 * freed here.
 */
static void compress_file_range(struct btrfs_work *work)
{
        struct async_chunk *async_chunk =
                container_of(work, struct async_chunk, work);
        struct btrfs_inode *inode = async_chunk->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct address_space *mapping = inode->vfs_inode.i_mapping;
        u64 blocksize = fs_info->sectorsize;
        u64 start = async_chunk->start;
        u64 end = async_chunk->end;
        u64 actual_end;
        u64 i_size;
        int ret = 0;
        struct folio **folios;
        unsigned long nr_folios;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        unsigned int poff;
        int i;
        int compress_type = fs_info->compress_type;

        inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);

        /*
         * We need to call clear_page_dirty_for_io on each page in the range.
         * Otherwise applications with the file mmap'd can wander in and change
         * the page contents while we are compressing them.
         */
        extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);

        /*
         * We need to save i_size before now because it could change in between
         * us evaluating the size and assigning it.  This is because we lock and
         * unlock the page in truncate and fallocate, and then modify the i_size
         * later on.
         *
         * The barriers are to emulate READ_ONCE, remove that once i_size_read
         * does that for us.
         */
        barrier();
        i_size = i_size_read(&inode->vfs_inode);
        barrier();
        actual_end = min_t(u64, i_size, end + 1);
again:
        /*
         * Each pass handles the range beginning at the current @start; the
         * folio array of a previous pass now belongs to its async extent.
         */
        folios = NULL;
        nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
        nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);

        /*
         * We don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /*
         * Skip compression for a small file range (<= blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < inode->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        /*
         * For subpage case, we require full page alignment for the sector
         * aligned range.
         * Thus we must also check against @actual_end, not just @end.
         */
        if (blocksize < PAGE_SIZE) {
                if (!PAGE_ALIGNED(start) ||
                    !PAGE_ALIGNED(round_up(actual_end, blocksize)))
                        goto cleanup_and_bail_uncompressed;
        }

        /* Never feed more than BTRFS_MAX_UNCOMPRESSED bytes in one pass. */
        total_compressed = min_t(unsigned long, total_compressed,
                        BTRFS_MAX_UNCOMPRESSED);
        total_in = 0;
        ret = 0;

        /*
         * We do compression for mount -o compress and when the inode has not
         * been flagged as NOCOMPRESS.  This flag can change at any time if we
         * discover bad compression ratios.
         */
        if (!inode_need_compress(inode, start, end))
                goto cleanup_and_bail_uncompressed;

        folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
        if (!folios) {
                /*
                 * Memory allocation failure is not a fatal error, we can fall
                 * back to uncompressed code.
                 */
                goto cleanup_and_bail_uncompressed;
        }

        /* A defrag request takes precedence over the inode property. */
        if (inode->defrag_compress)
                compress_type = inode->defrag_compress;
        else if (inode->prop_compress)
                compress_type = inode->prop_compress;

        /* Compression level is applied here. */
        ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4),
                                    mapping, start, folios, &nr_folios, &total_in,
                                    &total_compressed);
        if (ret)
                goto mark_incompressible;

        /*
         * Zero the tail end of the last page, as we might be sending it down
         * to disk.
         */
        poff = offset_in_page(total_compressed);
        if (poff)
                folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);

        /*
         * Try to create an inline extent.
         *
         * If we didn't compress the entire range, try to create an uncompressed
         * inline extent, else a compressed one.
         *
         * Check cow_file_range() for why we don't even try to create inline
         * extent for the subpage case.
         */
        if (total_in < actual_end)
                ret = cow_file_range_inline(inode, start, end, 0,
                                            BTRFS_COMPRESS_NONE, NULL, false);
        else
                ret = cow_file_range_inline(inode, start, end, total_compressed,
                                            compress_type, folios[0], false);
        if (ret <= 0) {
                /*
                 * Either the inline extent was created (0) or creating it
                 * failed (< 0).  In both cases there is nothing to queue, so
                 * just release the compressed folios.
                 */
                if (ret < 0)
                        mapping_set_error(mapping, -EIO);
                goto free_pages;
        }

        /*
         * We aren't doing an inline extent. Round the compressed size up to a
         * block size boundary so the allocator does sane things.
         */
        total_compressed = ALIGN(total_compressed, blocksize);

        /*
         * One last check to make sure the compression is really a win, compare
         * the page count read with the blocks on disk, compression must free at
         * least one sector.
         */
        total_in = round_up(total_in, fs_info->sectorsize);
        if (total_compressed + blocksize > total_in)
                goto mark_incompressible;

        /*
         * The async work queues will take care of doing actual allocation on
         * disk for these compressed pages, and will submit the bios.
         */
        ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
                               nr_folios, compress_type);
        BUG_ON(ret);
        if (start + total_in < end) {
                /* More data left in the range: compress the next piece. */
                start += total_in;
                cond_resched();
                goto again;
        }
        return;

mark_incompressible:
        /*
         * Remember that this inode compresses badly, unless compression is
         * forced by mount option or requested through the inode property.
         */
        if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
                inode->flags |= BTRFS_INODE_NOCOMPRESS;
cleanup_and_bail_uncompressed:
        /* Queue the rest of the range, from @start to @end, uncompressed. */
        ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
                               BTRFS_COMPRESS_NONE);
        BUG_ON(ret);
free_pages:
        /* Release any compressed folios that were not handed to an extent. */
        if (folios) {
                for (i = 0; i < nr_folios; i++) {
                        WARN_ON(folios[i]->mapping);
                        btrfs_free_compr_folio(folios[i]);
                }
                kfree(folios);
        }
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
        int i;

        if (!async_extent->folios)
                return;

        for (i = 0; i < async_extent->nr_folios; i++) {
                WARN_ON(async_extent->folios[i]->mapping);
                btrfs_free_compr_folio(async_extent->folios[i]);
        }
        kfree(async_extent->folios);
        async_extent->nr_folios = 0;
        async_extent->folios = NULL;
}

/*
 * Write the range described by @async_extent without compression by running
 * run_delalloc_cow() on it with a WB_SYNC_ALL writeback control.
 *
 * On failure the ordered extents of the range are cleaned up.  If
 * @locked_page is given, its writeback is started and ended, its ordered IO
 * is marked finished as not uptodate, the error is recorded on its mapping
 * and the page is unlocked.
 */
static void submit_uncompressed_range(struct btrfs_inode *inode,
                                      struct async_extent *async_extent,
                                      struct page *locked_page)
{
        const u64 start = async_extent->start;
        const u64 end = start + async_extent->ram_size - 1;
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .range_start = start,
                .range_end = end,
                .no_cgroup_owner = 1,
        };
        u64 page_start;
        int ret;

        wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
        ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
        wbc_detach_inode(&wbc);
        if (ret >= 0)
                return;

        btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
        if (!locked_page)
                return;

        page_start = page_offset(locked_page);
        set_page_writeback(locked_page);
        end_page_writeback(locked_page);
        /* ret is negative on this path, so the IO is never uptodate. */
        btrfs_mark_ordered_io_finished(inode, locked_page, page_start,
                                       PAGE_SIZE, false);
        mapping_set_error(locked_page->mapping, ret);
        unlock_page(locked_page);
}

/*
 * Submit one async extent queued by compress_file_range().
 *
 * @async_chunk:  chunk the extent belongs to; supplies the inode, the
 *                optional locked page, the write flags and the blkcg css
 * @async_extent: extent to submit; always freed before returning
 * @alloc_hint:   in/out allocation hint, updated to the end of the reserved
 *                extent after a compressed write has been submitted
 *
 * Uncompressed extents are passed to submit_uncompressed_range().  For a
 * compressed extent, disk space is reserved for the compressed size, then an
 * extent map and an ordered extent are created and the compressed folios are
 * passed to btrfs_submit_compressed_write().  If the reservation fails, the
 * range is written uncompressed instead and the compressed folios are
 * released.  Failures after the reservation record -EIO on the mapping and
 * release the reserved extent and the folios.
 */
static void submit_one_async_extent(struct async_chunk *async_chunk,
                                    struct async_extent *async_extent,
                                    u64 *alloc_hint)
{
        struct btrfs_inode *inode = async_chunk->inode;
        struct extent_io_tree *io_tree = &inode->io_tree;
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_ordered_extent *ordered;
        struct btrfs_key ins;
        struct page *locked_page = NULL;
        struct extent_state *cached = NULL;
        struct extent_map *em;
        int ret = 0;
        u64 start = async_extent->start;
        u64 end = async_extent->start + async_extent->ram_size - 1;

        if (async_chunk->blkcg_css)
                kthread_associate_blkcg(async_chunk->blkcg_css);

        /*
         * If async_chunk->locked_page is in the async_extent range, we need to
         * handle it.
         */
        if (async_chunk->locked_page) {
                u64 locked_page_start = page_offset(async_chunk->locked_page);
                u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;

                if (!(start >= locked_page_end || end <= locked_page_start))
                        locked_page = async_chunk->locked_page;
        }

        if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
                submit_uncompressed_range(inode, async_extent, locked_page);
                goto done;
        }

        ret = btrfs_reserve_extent(root, async_extent->ram_size,
                                   async_extent->compressed_size,
                                   async_extent->compressed_size,
                                   0, *alloc_hint, &ins, 1, 1);
        if (ret) {
                /*
                 * We can't reserve contiguous space for the compressed size.
                 * Unlikely, but it's possible that we could have enough
                 * non-contiguous space for the uncompressed size instead.  So
                 * fall back to uncompressed.
                 *
                 * The compressed folios will never be submitted in that case,
                 * so release them here; the done label only frees the
                 * async_extent itself.
                 */
                free_async_extent_pages(async_extent);
                submit_uncompressed_range(inode, async_extent, locked_page);
                goto done;
        }

        lock_extent(io_tree, start, end, &cached);

        /* Here we're doing allocation and writeback of the compressed pages */
        em = create_io_em(inode, start,
                          async_extent->ram_size,        /* len */
                          start,                        /* orig_start */
                          ins.objectid,                        /* block_start */
                          ins.offset,                        /* block_len */
                          ins.offset,                        /* orig_block_len */
                          async_extent->ram_size,        /* ram_bytes */
                          async_extent->compress_type,
                          BTRFS_ORDERED_COMPRESSED);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto out_free_reserve;
        }
        free_extent_map(em);

        ordered = btrfs_alloc_ordered_extent(inode, start,        /* file_offset */
                                       async_extent->ram_size,        /* num_bytes */
                                       async_extent->ram_size,        /* ram_bytes */
                                       ins.objectid,                /* disk_bytenr */
                                       ins.offset,                /* disk_num_bytes */
                                       0,                        /* offset */
                                       1 << BTRFS_ORDERED_COMPRESSED,
                                       async_extent->compress_type);
        if (IS_ERR(ordered)) {
                /* Undo the extent map created above. */
                btrfs_drop_extent_map_range(inode, start, end, false);
                ret = PTR_ERR(ordered);
                goto out_free_reserve;
        }
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);

        /* Clear dirty, set writeback and unlock the pages. */
        extent_clear_unlock_delalloc(inode, start, end,
                        NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
                        PAGE_UNLOCK | PAGE_START_WRITEBACK);
        /* The compressed folios are handed over to the write from here on. */
        btrfs_submit_compressed_write(ordered,
                            async_extent->folios,        /* compressed_folios */
                            async_extent->nr_folios,
                            async_chunk->write_flags, true);
        *alloc_hint = ins.objectid + ins.offset;
done:
        if (async_chunk->blkcg_css)
                kthread_associate_blkcg(NULL);
        kfree(async_extent);
        return;

out_free_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
        mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
        extent_clear_unlock_delalloc(inode, start, end,
                                     NULL, &cached,
                                     EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
                                     PAGE_UNLOCK | PAGE_START_WRITEBACK |
                                     PAGE_END_WRITEBACK);
        free_async_extent_pages(async_extent);
        if (async_chunk->blkcg_css)
                kthread_associate_blkcg(NULL);
        btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
                    btrfs_root_id(root), btrfs_ino(inode), start,
                    async_extent->ram_size, ret);
        kfree(async_extent);
}

/*
 * Pick an allocation hint for the range [@start, @start + @num_bytes) of
 * @inode from its cached extent maps.
 *
 * Return the block start of the extent map covering the range.  If that is
 * not an actual block number, return the block start of the first extent map
 * of the inode instead.  Return 0 if neither gives a usable value.
 */
static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
                                      u64 num_bytes)
{
        struct extent_map_tree *tree = &inode->extent_tree;
        struct extent_map *map;
        u64 hint = 0;

        read_lock(&tree->lock);
        map = search_extent_mapping(tree, start, num_bytes);
        if (map && map->block_start >= EXTENT_MAP_LAST_BYTE) {
                /*
                 * The block start isn't an actual block number, so look at
                 * the first block of this inode instead.  If that one is
                 * bogus too, just don't worry about it.
                 */
                free_extent_map(map);
                map = search_extent_mapping(tree, 0, 0);
        }
        if (map) {
                if (map->block_start < EXTENT_MAP_LAST_BYTE)
                        hint = map->block_start;
                free_extent_map(map);
        }
        read_unlock(&tree->lock);

        return hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * When this function fails, it unlocks all pages except @locked_page.
 *
 * When this function successfully creates an inline extent, it returns 1 and
 * unlocks all pages including locked_page and starts I/O on them.
 * (In reality inline extents are limited to a single page, so locked_page is
 * the only page handled anyway).
 *
 * When this function succeeds and creates a normal extent, the page locking
 * status depends on the passed in flags:
 *
 * - If @keep_locked is set, all pages are kept locked.
 * - Else all pages except for @locked_page are unlocked.
 *
 * When a failure happens in the second or later iteration of the
 * while-loop, the ordered extents created in previous iterations are kept
 * intact. So, the caller must clean them up by calling
 * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
 * example.
 */
static noinline int cow_file_range(struct btrfs_inode *inode,
                                   struct page *locked_page, u64 start, u64 end,
                                   u64 *done_offset,
                                   bool keep_locked, bool no_inline)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_state *cached = NULL;
        u64 alloc_hint = 0;
        u64 orig_start = start;
        u64 num_bytes;
        unsigned long ram_size;
        u64 cur_alloc_size = 0;
        u64 min_alloc_size;
        u64 blocksize = fs_info->sectorsize;
        struct btrfs_key ins;
        struct extent_map *em;
        unsigned clear_bits;
        unsigned long page_ops;
        bool extent_reserved = false;
        int ret = 0;

        if (btrfs_is_free_space_inode(inode)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize,  num_bytes);
        ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

        inode_should_defrag(inode, start, end, num_bytes, SZ_64K);

        if (!no_inline) {
                /* lets try to make an inline extent */
                ret = cow_file_range_inline(inode, start, end, 0,
                                            BTRFS_COMPRESS_NONE, NULL, false);
                if (ret <= 0) {
                        /*
                         * We succeeded, return 1 so the caller knows we're done
                         * with this page and already handled the IO.
                         *
                         * If there was an error then cow_file_range_inline() has
                         * already done the cleanup.
                         */
                        if (ret == 0)
                                ret = 1;
                        goto done;
                }
        }

        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);

        /*
         * Relocation relies on the relocated extents to have exactly the same
         * size as the original extents. Normally writeback for relocation data
         * extents follows a NOCOW path because relocation preallocates the
         * extents. However, due to an operation such as scrub turning a block
         * group to RO mode, it may fallback to COW mode, so we must make sure
         * an extent allocated during COW has exactly the requested size and can
         * not be split into smaller extents, otherwise relocation breaks and
         * fails during the stage where it updates the bytenr of file extent
         * items.
         */
        if (btrfs_is_data_reloc_root(root))
                min_alloc_size = num_bytes;
        else
                min_alloc_size = fs_info->sectorsize;

        while (num_bytes > 0) {
                struct btrfs_ordered_extent *ordered;

                cur_alloc_size = num_bytes;
                ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
                                           min_alloc_size, 0, alloc_hint,
                                           &ins, 1, 1);
                if (ret == -EAGAIN) {
                        /*
                         * btrfs_reserve_extent only returns -EAGAIN for zoned
                         * file systems, which is an indication that there are
                         * no active zones to allocate from at the moment.
                         *
                         * If this is the first loop iteration, wait for at
                         * least one zone to finish before retrying the
                         * allocation.  Otherwise ask the caller to write out
                         * the already allocated blocks before coming back to
                         * us, or return -ENOSPC if it can't handle retries.
                         */
                        ASSERT(btrfs_is_zoned(fs_info));
                        if (start == orig_start) {
                                wait_on_bit_io(&inode->root->fs_info->flags,
                                               BTRFS_FS_NEED_ZONE_FINISH,
                                               TASK_UNINTERRUPTIBLE);
                                continue;
                        }
                        if (done_offset) {
                                *done_offset = start - 1;
                                return 0;
                        }
                        ret = -ENOSPC;
                }
                if (ret < 0)
                        goto out_unlock;
                cur_alloc_size = ins.offset;
                extent_reserved = true;

                ram_size = ins.offset;

                lock_extent(&inode->io_tree, start, start + ram_size - 1,
                            &cached);

                em = create_io_em(inode, start, ins.offset, /* len */
                                  start, /* orig_start */
                                  ins.objectid, /* block_start */
                                  ins.offset, /* block_len */
                                  ins.offset, /* orig_block_len */
                                  ram_size, /* ram_bytes */
                                  BTRFS_COMPRESS_NONE, /* compress_type */
                                  BTRFS_ORDERED_REGULAR /* type */);
                if (IS_ERR(em)) {
                        unlock_extent(&inode->io_tree, start,
                                      start + ram_size - 1, &cached);
                        ret = PTR_ERR(em);
                        goto out_reserve;
                }
                free_extent_map(em);

                ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
                                        ram_size, ins.objectid, cur_alloc_size,
                                        0, 1 << BTRFS_ORDERED_REGULAR,
                                        BTRFS_COMPRESS_NONE);
                if (IS_ERR(ordered)) {
                        unlock_extent(&inode->io_tree, start,
                                      start + ram_size - 1, &cached);
                        ret = PTR_ERR(ordered);
                        goto out_drop_extent_cache;
                }

                if (btrfs_is_data_reloc_root(root)) {
                        ret = btrfs_reloc_clone_csums(ordered);

                        /*
                         * Only drop cache here, and process as normal.
                         *
                         * We must not allow extent_clear_unlock_delalloc()
                         * at out_unlock label to free meta of this ordered
                         * extent, as its meta should be freed by
                         * btrfs_finish_ordered_io().
                         *
                         * So we must continue until @start is increased to
                         * skip current ordered extent.
                         */
                        if (ret)
                                btrfs_drop_extent_map_range(inode, start,
                                                            start + ram_size - 1,
                                                            false);
                }
                btrfs_put_ordered_extent(ordered);

                btrfs_dec_block_group_reservations(fs_info, ins.objectid);

                /*
                 * We're not doing compressed IO, don't unlock the first page
                 * (which the caller expects to stay locked), don't clear any
                 * dirty bits and don't set any writeback bits
                 *
                 * Do set the Ordered (Private2) bit so we know this page was
                 * properly setup for writepage.
                 */
                page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
                page_ops |= PAGE_SET_ORDERED;

                extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
                                             locked_page, &cached,
                                             EXTENT_LOCKED | EXTENT_DELALLOC,
                                             page_ops);
                if (num_bytes < cur_alloc_size)
                        num_bytes = 0;
                else
                        num_bytes -= cur_alloc_size;
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
                extent_reserved = false;

                /*
                 * btrfs_reloc_clone_csums() error, since start is increased
                 * extent_clear_unlock_delalloc() at out_unlock label won't
                 * free metadata of current ordered extent, we're OK to exit.
                 */
                if (ret)
                        goto out_unlock;
        }
done:
        if (done_offset)
                *done_offset = end;
        return ret;

out_drop_extent_cache:
        btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
        /*
         * Now, we have three regions to clean up:
         *
         * |-------(1)----|---(2)---|-------------(3)----------|
         * `- orig_start  `- start  `- start + cur_alloc_size  `- end
         *
         * We process each region below.
         */

        clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
        page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;

        /*
         * For the range (1). We have already instantiated the ordered extents
         * for this region. They are cleaned up by
         * btrfs_cleanup_ordered_extents() in e.g,
         * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
         * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
         * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
         * function.
         *
         * However, in case of @keep_locked, we still need to unlock the pages
         * (except @locked_page) to ensure all the pages are unlocked.
         */
        if (keep_locked && orig_start < start) {
                if (!locked_page)
                        mapping_set_error(inode->vfs_inode.i_mapping, ret);
                extent_clear_unlock_delalloc(inode, orig_start, start - 1,
                                             locked_page, NULL, 0, page_ops);
        }

        /*
         * At this point we're unlocked, we want to make sure we're only
         * clearing these flags under the extent lock, so lock the rest of the
         * range and clear everything up.
         */
        lock_extent(&inode->io_tree, start, end, NULL);

        /*
         * For the range (2). If we reserved an extent for our delalloc range
         * (or a subrange) and failed to create the respective ordered extent,
         * then it means that when we reserved the extent we decremented the
         * extent's size from the data space_info's bytes_may_use counter and
         * incremented the space_info's bytes_reserved counter by the same
         * amount. We must make sure extent_clear_unlock_delalloc() does not try
         * to decrement again the data space_info's bytes_may_use counter,
         * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
         */
        if (extent_reserved) {
                extent_clear_unlock_delalloc(inode, start,
                                             start + cur_alloc_size - 1,
                                             locked_page, &cached,
                                             clear_bits,
                                             page_ops);
                start += cur_alloc_size;
        }

        /*
         * For the range (3). We never touched the region. In addition to the
         * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
         * space_info's bytes_may_use counter, reserved in
         * btrfs_check_data_free_space().
         */
        if (start < end) {
                clear_bits |= EXTENT_CLEAR_DATA_RESV;
                extent_clear_unlock_delalloc(inode, start, end, locked_page,
                                             &cached, clear_bits, page_ops);
        }
        return ret;
}

/*
 * Phase two of compressed writeback.  This is the ordered portion of the code,
 * which only gets called in the order the work was queued.  We walk all the
 * async extents created by compress_file_range and send them down to the disk.
 *
 * If called with @do_free == true nothing is submitted.  Instead the
 * references held by the chunk (the inode and, if set, the blkcg css) are
 * dropped and the async_cow that contains the chunk is freed once its last
 * chunk is gone.
 */
static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
{
        struct async_chunk *async_chunk = container_of(work, struct async_chunk,
                                                     work);
        struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
        struct async_extent *async_extent;
        unsigned long nr_pages;
        u64 alloc_hint = 0;

        if (do_free) {
                struct async_cow *async_cow = async_chunk->async_cow;

                btrfs_add_delayed_iput(async_chunk->inode);
                if (async_chunk->blkcg_css)
                        css_put(async_chunk->blkcg_css);

                /*
                 * The chunk lives inside async_cow, so it must not be
                 * touched anymore once the last chunk reference is dropped.
                 */
                if (atomic_dec_and_test(&async_cow->num_chunks))
                        kvfree(async_cow);
                return;
        }

        /* Number of pages spanned by the inclusive range [start, end]. */
        nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
                PAGE_SHIFT;

        /* Drain the list, submitting each async extent in queue order. */
        while (!list_empty(&async_chunk->extents)) {
                async_extent = list_entry(async_chunk->extents.next,
                                          struct async_extent, list);
                list_del(&async_extent->list);
                submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
        }

        /* atomic_sub_return implies a barrier */
        if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
            5 * SZ_1M)
                cond_wake_up_nomb(&fs_info->async_submit_wait);
}

/*
 * Split the delalloc range [@start, @end] into chunks of at most 512K and
 * queue one work item per chunk on the delalloc workers, with
 * compress_file_range and submit_compressed_extents as the work callbacks.
 *
 * Returns false if the context holding the chunks could not be allocated (in
 * which case nothing has been queued), true once every chunk was queued.
 */
static bool run_delalloc_compressed(struct btrfs_inode *inode,
                                    struct page *locked_page, u64 start,
                                    u64 end, struct writeback_control *wbc)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
        const blk_opf_t write_flags = wbc_to_write_flags(wbc);
        u64 nr_chunks = DIV_ROUND_UP(end - start, SZ_512K);
        struct async_cow *ctx;
        unsigned long nr_pages;
        unsigned nofs_flag;
        int i;

        /* The allocation must not recurse into the filesystem. */
        nofs_flag = memalloc_nofs_save();
        ctx = kvmalloc(struct_size(ctx, chunks, nr_chunks), GFP_KERNEL);
        memalloc_nofs_restore(nofs_flag);
        if (!ctx)
                return false;

        set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);

        /* One count per chunk; the context is freed when it drops to zero. */
        atomic_set(&ctx->num_chunks, nr_chunks);

        for (i = 0; i < nr_chunks; i++) {
                struct async_chunk *chunk = &ctx->chunks[i];
                const u64 chunk_end = min(end, start + SZ_512K - 1);

                /*
                 * igrab is called higher up in the call chain, so only the
                 * lightweight reference is taken for the callback lifetime.
                 */
                ihold(&inode->vfs_inode);
                chunk->async_cow = ctx;
                chunk->inode = inode;
                chunk->start = start;
                chunk->end = chunk_end;
                chunk->write_flags = write_flags;
                INIT_LIST_HEAD(&chunk->extents);

                /*
                 * The locked_page comes all the way from writepage and is the
                 * original page we were actually given.  The delalloc region
                 * is spread over several chunks, and only the first one gets
                 * the pointer, so there is never a question about who is
                 * supposed to unlock the page.
                 */
                chunk->locked_page = locked_page;
                if (locked_page) {
                        /*
                         * Depending on the compressibility, the pages might or
                         * might not go through async.  All of them should be
                         * accounted against wbc once, so do it here before the
                         * paths diverge.  wbc accounting is used only for
                         * foreign writeback detection and doesn't need full
                         * accuracy, so the whole thing is accounted against
                         * the first page.
                         */
                        wbc_account_cgroup_owner(wbc, locked_page,
                                                 chunk_end - start);
                        locked_page = NULL;
                }

                if (blkcg_css == blkcg_root_css) {
                        chunk->blkcg_css = NULL;
                } else {
                        css_get(blkcg_css);
                        chunk->blkcg_css = blkcg_css;
                        chunk->write_flags |= REQ_BTRFS_CGROUP_PUNT;
                }

                btrfs_init_work(&chunk->work, compress_file_range,
                                submit_compressed_extents);

                nr_pages = DIV_ROUND_UP(chunk_end - start, PAGE_SIZE);
                atomic_add(nr_pages, &fs_info->async_delalloc_pages);

                btrfs_queue_work(fs_info->delalloc_workers, &chunk->work);

                /* The next chunk begins right after this one. */
                start = chunk_end + 1;
        }
        return true;
}

/*
 * Run the delalloc range from start to end, and write back any dirty pages
 * covered by the range.
 *
 * cow_file_range() may stop before @end and report how far it got through its
 * done_offset argument, so it is called repeatedly until the whole range has
 * been handled.  Each piece is written out before the next one is allocated.
 *
 * Returns the first non-zero result of cow_file_range() (a negative errno, or
 * 1 when an inline extent was created), and 1 once the whole range has been
 * written out.
 */
static noinline int run_delalloc_cow(struct btrfs_inode *inode,
                                     struct page *locked_page, u64 start,
                                     u64 end, struct writeback_control *wbc,
                                     bool pages_dirty)
{
        u64 done_offset = end;
        u64 cur;

        for (cur = start; cur <= end; cur = done_offset + 1) {
                const int ret = cow_file_range(inode, locked_page, cur, end,
                                               &done_offset, true, false);

                if (ret != 0)
                        return ret;

                extent_write_locked_range(&inode->vfs_inode, locked_page, cur,
                                          done_offset, wbc, pages_dirty);
        }

        return 1;
}

/*
 * COW the range [@start, @end] through cow_file_range(), with inline extents
 * disabled, after fixing up the data space_info's bytes_may_use counter for
 * the parts of the range that had no data space reserved.
 *
 * Returns what cow_file_range() returns, which is asserted not to be 1.
 */
static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
                           const u64 start, const u64 end)
{
        const u64 len = end + 1 - start;
        const bool space_cache_inode = btrfs_is_free_space_inode(inode);
        const bool data_reloc_inode = btrfs_is_data_reloc_root(inode->root);
        const bool reserve_whole_range = space_cache_inode || data_reloc_inode;
        struct extent_io_tree *tree = &inode->io_tree;
        struct extent_state *cached = NULL;
        u64 search_start = start;
        u64 noreserve_bytes;
        int ret;

        /*
         * EXTENT_NORESERVE on a range means that, at buffered write time,
         * there was not enough data space available and none was reserved,
         * because the range was expected to be written in NOCOW mode (there
         * is a prealloc extent or the inode has the NOCOW bit set).
         *
         * Falling back to COW (for example because scrub or relocation turned
         * the block group of the extent to RO mode) therefore needs two fixes:
         *
         * 1) bytes_may_use of the data space info is incremented.  A
         *    successful COW allocates a new data extent, which decrements
         *    bytes_may_use and increments bytes_reserved by the same amount
         *    (at btrfs_add_reserved_bytes()).  The increment here stands in
         *    for the one that normally happens when space is reserved at
         *    buffered write time;
         *
         * 2) EXTENT_NORESERVE is cleared from the range, so that a failing
         *    COW path decrements bytes_may_use again (through
         *    extent_clear_unlock_delalloc()) and undoes step 1.
         *
         * Free space cache inodes and inodes of the data relocation tree need
         * the same bytes_may_use increment, for the whole range.  They always
         * get a prealloc extent, but scrub or balance may have set the block
         * group containing it to RO mode, forcing COW at writeback time.
         */
        lock_extent(tree, start, end, &cached);
        noreserve_bytes = count_range_bits(tree, &search_start, end, len,
                                           EXTENT_NORESERVE, 0, NULL);
        if (reserve_whole_range || noreserve_bytes > 0) {
                struct btrfs_fs_info *fs_info = inode->root->fs_info;
                struct btrfs_space_info *sinfo = fs_info->data_sinfo;
                const u64 bytes = reserve_whole_range ? len : noreserve_bytes;

                spin_lock(&sinfo->lock);
                btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
                spin_unlock(&sinfo->lock);

                if (noreserve_bytes > 0)
                        clear_extent_bit(tree, start, end, EXTENT_NORESERVE,
                                         NULL);
        }
        unlock_extent(tree, start, end, &cached);

        /*
         * No inline extents here: mixing an inline extent, which is written
         * out and unlocked directly, with a normal NOCOW extent doesn't work.
         */
        ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
        ASSERT(ret != 1);
        return ret;
}

/*
 * Arguments of can_nocow_file_extent(): the caller fills in the input fields,
 * the function fills in the output fields.
 */
struct can_nocow_file_extent_args {
        /* Input fields. */

        /* Start file offset of the range we want to NOCOW. */
        u64 start;
        /* End file offset (inclusive) of the range we want to NOCOW. */
        u64 end;
        /*
         * When set, a non-zero root->snapshot_force_cow makes the check fail
         * (COW is required), except for free space inodes.
         */
        bool writeback_path;
        /*
         * When set, the comparison of the extent's generation against the
         * root's last snapshot is skipped.  The flag is also passed on to
         * btrfs_cross_ref_exist().
         */
        bool strict;
        /*
         * Free the path passed to can_nocow_file_extent() once it's not needed
         * anymore.
         */
        bool free_path;

        /*
         * Output fields. Only meaningful when can_nocow_file_extent() returns
         * 1.  disk_bytenr, disk_num_bytes and extent_offset may be overwritten
         * on other return paths as well.
         */

        /*
         * On a return of 1: the extent's disk bytenr advanced by extent_offset
         * and by the distance from the extent item's key offset to @start.
         */
        u64 disk_bytenr;
        /* disk_num_bytes of the file extent item. */
        u64 disk_num_bytes;
        /* Offset field of the file extent item. */
        u64 extent_offset;
        /* Number of bytes that can be written to in NOCOW mode. */
        u64 num_bytes;
};

/*
 * Decide whether the file extent item that @path points to can be written in
 * place (NOCOW) for the range described by @args.
 *
 * The path may have been released by the time this returns, so a caller that
 * wants to keep using it has to check whether path->nodes[0] is NULL first.
 * When @args->free_path is set the path is always freed before returning.
 *
 * Returns: < 0 on error
 *            0 if we can not NOCOW
 *            1 if we can NOCOW
 */
static int can_nocow_file_extent(struct btrfs_path *path,
                                 struct btrfs_key *key,
                                 struct btrfs_inode *inode,
                                 struct can_nocow_file_extent_args *args)
{
        struct btrfs_root *root = inode->root;
        struct extent_buffer *eb = path->nodes[0];
        const bool space_cache_inode = btrfs_is_free_space_inode(inode);
        /* Read up front, the path may be freed before the value is needed. */
        const bool nowait = path->nowait;
        struct btrfs_file_extent_item *item;
        struct btrfs_root *csum_root;
        u64 extent_end;
        u8 type;
        int nocow = 0;
        int ret = 0;

        item = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item);
        type = btrfs_file_extent_type(eb, item);
        if (type == BTRFS_FILE_EXTENT_INLINE)
                goto out;

        /* These fields can only be read once we know it's not an inline extent. */
        args->disk_bytenr = btrfs_file_extent_disk_bytenr(eb, item);
        args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(eb, item);
        args->extent_offset = btrfs_file_extent_offset(eb, item);

        /* A regular extent is only a candidate if the inode is NODATACOW. */
        if (type == BTRFS_FILE_EXTENT_REG &&
            !(inode->flags & BTRFS_INODE_NODATACOW))
                goto out;

        /*
         * An extent created before the generation in which the last snapshot
         * of its subvolume was created is implied to be shared, hence it must
         * be COWed.  This shortcut is not taken in strict mode.
         */
        if (!args->strict) {
                const u64 last_snapshot =
                        btrfs_root_last_snapshot(&root->root_item);

                if (btrfs_file_extent_generation(eb, item) <= last_snapshot)
                        goto out;
        }

        /*
         * Explicit holes and compressed/encrypted/encoded extents must be
         * COWed.
         */
        if (args->disk_bytenr == 0 ||
            btrfs_file_extent_compression(eb, item) ||
            btrfs_file_extent_encryption(eb, item) ||
            btrfs_file_extent_other_encoding(eb, item))
                goto out;

        extent_end = btrfs_file_extent_end(path);

        /*
         * The remaining checks can be expensive: they take other locks and do
         * btree or rbtree searches.  Release the path first so that other
         * tasks are not blocked for too long.
         */
        btrfs_release_path(path);

        ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
                                    key->offset - args->extent_offset,
                                    args->disk_bytenr, args->strict, path);
        WARN_ON_ONCE(ret > 0 && space_cache_inode);
        if (ret)
                goto out;

        if (args->free_path) {
                /*
                 * The path is not needed anymore, and the
                 * btrfs_lookup_csums_list() call below ends up allocating
                 * another one.  Free it now to avoid unnecessary extra memory
                 * usage.
                 */
                btrfs_free_path(path);
                path = NULL;
        }

        /* If there are pending snapshots for this root, we must COW. */
        if (args->writeback_path && !space_cache_inode &&
            atomic_read(&root->snapshot_force_cow))
                goto out;

        /* Translate the start of the requested range into a disk bytenr. */
        args->disk_bytenr += args->extent_offset + args->start - key->offset;
        args->num_bytes = min(args->end + 1, extent_end) - args->start;

        /*
         * Force COW if csums exist in the range. This ensures that csums for a
         * given extent are either valid or do not exist.
         */
        csum_root = btrfs_csum_root(root->fs_info, args->disk_bytenr);
        ret = btrfs_lookup_csums_list(csum_root, args->disk_bytenr,
                                      args->disk_bytenr + args->num_bytes - 1,
                                      NULL, nowait);
        WARN_ON_ONCE(ret > 0 && space_cache_inode);
        if (ret)
                goto out;

        nocow = 1;
out:
        if (args->free_path && path)
                btrfs_free_path(path);

        if (ret < 0)
                return ret;
        return nocow;
}

/*
 * Writeback callback for the NOCOW case.  This checks for snapshots or COW
 * copies of the extents that exist in the file, and COWs the file as required.
 *
 * If no COW copies or snapshots exist, we write directly to the existing
 * blocks on disk.
 */
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
                                       struct page *locked_page,
                                       const u64 start, const u64 end)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_root *root = inode->root;
        struct btrfs_path *path;
        u64 cow_start = (u64)-1;
        u64 cur_offset = start;
        int ret;
        bool check_prev = true;
        u64 ino = btrfs_ino(inode);
        struct can_nocow_file_extent_args nocow_args = { 0 };

        /*
         * Normally on a zoned device we're only doing COW writes, but in case
         * of relocation on a zoned filesystem serializes I/O so that we're only
         * writing sequentially and can end up here as well.
         */
        ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto error;
        }

        nocow_args.end = end;
        nocow_args.writeback_path = true;

        while (cur_offset <= end) {
                struct btrfs_block_group *nocow_bg = NULL;
                struct btrfs_ordered_extent *ordered;
                struct btrfs_key found_key;
                struct btrfs_file_extent_item *fi;
                struct extent_buffer *leaf;
                struct extent_state *cached_state = NULL;
                u64 extent_end;
                u64 ram_bytes;
                u64 nocow_end;
                int extent_type;
                bool is_prealloc;

                ret = btrfs_lookup_file_extent(NULL, root, path, ino,
                                               cur_offset, 0);
                if (ret < 0)
                        goto error;

                /*
                 * If there is no extent for our range when doing the initial
                 * search, then go back to the previous slot as it will be the
                 * one containing the search offset
                 */
                if (ret > 0 && path->slots[0] > 0 && check_prev) {
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0] - 1);
                        if (found_key.objectid == ino &&
                            found_key.type == BTRFS_EXTENT_DATA_KEY)
                                path->slots[0]--;
                }
                check_prev = false;
next_slot:
                /* Go to next leaf if we have exhausted the current one */
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                goto error;
                        if (ret > 0)
                                break;
                        leaf = path->nodes[0];
                }

                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

                /* Didn't find anything for our INO */
                if (found_key.objectid > ino)
                        break;
                /*
                 * Keep searching until we find an EXTENT_ITEM or there are no
                 * more extents for this inode
                 */
                if (WARN_ON_ONCE(found_key.objectid < ino) ||
                    found_key.type < BTRFS_EXTENT_DATA_KEY) {
                        path->slots[0]++;
                        goto next_slot;
                }

                /* Found key is not EXTENT_DATA_KEY or starts after req range */
                if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
                    found_key.offset > end)
                        break;

                /*
                 * If the found extent starts after requested offset, then
                 * adjust extent_end to be right before this extent begins
                 */
                if (found_key.offset > cur_offset) {
                        extent_end = found_key.offset;
                        extent_type = 0;
                        goto must_cow;
                }

                /*
                 * Found extent which begins before our range and potentially
                 * intersect it
                 */
                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                extent_type = btrfs_file_extent_type(leaf, fi);
                /* If this is triggered then we have a memory corruption. */
                ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
                if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
                        ret = -EUCLEAN;
                        goto error;
                }
                ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
                extent_end = btrfs_file_extent_end(path);

                /*
                 * If the extent we got ends before our current offset, skip to
                 * the next extent.
                 */
                if (extent_end <= cur_offset) {
                        path->slots[0]++;
                        goto next_slot;
                }

                nocow_args.start = cur_offset;
                ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
                if (ret < 0)
                        goto error;
                if (ret == 0)
                        goto must_cow;

                ret = 0;
                nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
                if (!nocow_bg) {
must_cow:
                        /*
                         * If we can't perform NOCOW writeback for the range,
                         * then record the beginning of the range that needs to
                         * be COWed.  It will be written out before the next
                         * NOCOW range if we find one, or when exiting this
                         * loop.
                         */
                        if (cow_start == (u64)-1)
                                cow_start = cur_offset;
                        cur_offset = extent_end;
                        if (cur_offset > end)
                                break;
                        if (!path->nodes[0])
                                continue;
                        path->slots[0]++;
                        goto next_slot;
                }

                /*
                 * COW range from cow_start to found_key.offset - 1. As the key
                 * will contain the beginning of the first extent that can be
                 * NOCOW, following one which needs to be COW'ed
                 */
                if (cow_start != (u64)-1) {
                        ret = fallback_to_cow(inode, locked_page,
                                              cow_start, found_key.offset - 1);
                        cow_start = (u64)-1;
                        if (ret) {
                                btrfs_dec_nocow_writers(nocow_bg);
                                goto error;
                        }
                }

                nocow_end = cur_offset + nocow_args.num_bytes - 1;
                lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);

                is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
                if (is_prealloc) {
                        u64 orig_start = found_key.offset - nocow_args.extent_offset;
                        struct extent_map *em;

                        em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
                                          orig_start,
                                          nocow_args.disk_bytenr, /* block_start */
                                          nocow_args.num_bytes, /* block_len */
                                          nocow_args.disk_num_bytes, /* orig_block_len */
                                          ram_bytes, BTRFS_COMPRESS_NONE,
                                          BTRFS_ORDERED_PREALLOC);
                        if (IS_ERR(em)) {
                                unlock_extent(&inode->io_tree, cur_offset,
                                              nocow_end, &cached_state);
                                btrfs_dec_nocow_writers(nocow_bg);
                                ret = PTR_ERR(em);
                                goto error;
                        }
                        free_extent_map(em);
                }

                ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
                                nocow_args.num_bytes, nocow_args.num_bytes,
                                nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
                                is_prealloc
                                ? (1 << BTRFS_ORDERED_PREALLOC)
                                : (1 << BTRFS_ORDERED_NOCOW),
                                BTRFS_COMPRESS_NONE);
                btrfs_dec_nocow_writers(nocow_bg);
                if (IS_ERR(ordered)) {
                        if (is_prealloc) {
                                btrfs_drop_extent_map_range(inode, cur_offset,
                                                            nocow_end, false);
                        }
                        unlock_extent(&inode->io_tree, cur_offset,
                                      nocow_end, &cached_state);
                        ret = PTR_ERR(ordered);
                        goto error;
                }

                if (btrfs_is_data_reloc_root(root))
                        /*
                         * Error handled later, as we must prevent
                         * extent_clear_unlock_delalloc() in error handler
                         * from freeing metadata of created ordered extent.
                         */
                        ret = btrfs_reloc_clone_csums(ordered);
                btrfs_put_ordered_extent(ordered);

                extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
                                             locked_page, &cached_state,
                                             EXTENT_LOCKED | EXTENT_DELALLOC |
                                             EXTENT_CLEAR_DATA_RESV,
                                             PAGE_UNLOCK | PAGE_SET_ORDERED);

                cur_offset = extent_end;

                /*
                 * btrfs_reloc_clone_csums() error, now we're OK to call error
                 * handler, as metadata for created ordered extent will only
                 * be freed by btrfs_finish_ordered_io().
                 */
                if (ret)
                        goto error;
        }
        btrfs_release_path(path);

        if (cur_offset <= end && cow_start == (u64)-1)
                cow_start = cur_offset;

        if (cow_start != (u64)-1) {
                cur_offset = end;
                ret = fallback_to_cow(inode, locked_page, cow_start, end);
                cow_start = (u64)-1;
                if (ret)
                        goto error;
        }

        btrfs_free_path(path);
        return 0;

error:
        /*
         * If an error happened while a COW region is outstanding, cur_offset
         * needs to be reset to cow_start to ensure the COW region is unlocked
         * as well.
         */
        if (cow_start != (u64)-1)
                cur_offset = cow_start;

        /*
         * We need to lock the extent here because we're clearing DELALLOC and
         * we're not locked at this point.
         */
        if (cur_offset < end) {
                struct extent_state *cached = NULL;

                lock_extent(&inode->io_tree, cur_offset, end, &cached);
                extent_clear_unlock_delalloc(inode, cur_offset, end,
                                             locked_page, &cached,
                                             EXTENT_LOCKED | EXTENT_DELALLOC |
                                             EXTENT_DEFRAG |
                                             EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
                                             PAGE_START_WRITEBACK |
                                             PAGE_END_WRITEBACK);
        }
        btrfs_free_path(path);
        return ret;
}

/*
 * Tell whether writeback of [start, end] should take the NOCOW path.
 *
 * Only inodes flagged NODATACOW or PREALLOC qualify, and even then not when
 * the inode has defrag bytes and part of the range carries EXTENT_DEFRAG.
 */
static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
        if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
                return false;

        return !(inode->defrag_bytes &&
                 test_range_bit_exists(&inode->io_tree, start, end,
                                       EXTENT_DEFRAG));
}

/*
 * Process the delalloc range [start, end] of @inode, picking one of the
 * NOCOW, compressed, zoned COW or regular COW writeback paths.
 *
 * Returns 1 when run_delalloc_compressed() took over the range, otherwise the
 * result of the chosen path.  On a negative result the ordered extents of the
 * range are cleaned up before returning.
 */
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
                             u64 start, u64 end, struct writeback_control *wbc)
{
        const bool zoned = btrfs_is_zoned(inode->root->fs_info);
        int ret;

        /*
         * The range must cover part of the @locked_page, or a return of 1
         * can confuse the caller.
         */
        ASSERT(!(end <= page_offset(locked_page) ||
                 start >= page_offset(locked_page) + PAGE_SIZE));

        if (should_nocow(inode, start, end)) {
                ret = run_delalloc_nocow(inode, locked_page, start, end);
        } else {
                /* Compression is tried first; it reports by returning nonzero. */
                if (btrfs_inode_can_compress(inode) &&
                    inode_need_compress(inode, start, end) &&
                    run_delalloc_compressed(inode, locked_page, start, end, wbc))
                        return 1;

                if (zoned)
                        ret = run_delalloc_cow(inode, locked_page, start, end,
                                               wbc, true);
                else
                        ret = cow_file_range(inode, locked_page, start, end,
                                             NULL, false, false);
        }

        if (ret < 0)
                btrfs_cleanup_ordered_extents(inode, locked_page, start,
                                              end - start + 1);
        return ret;
}

/*
 * Account for the delalloc extent state @orig being split at offset @split.
 *
 * One more outstanding extent is recorded for the inode, unless @orig is
 * larger than the maximum extent size and the two halves together need no
 * more max-sized extents than the whole did.
 *
 * Must be called with the inode's io_tree lock held.
 */
void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
                                 struct extent_state *orig, u64 split)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        u64 whole_len;

        lockdep_assert_held(&inode->io_tree.lock);

        /* Nothing to account for when the state is not delalloc. */
        if (!(orig->state & EXTENT_DELALLOC))
                return;

        whole_len = orig->end - orig->start + 1;
        if (whole_len > fs_info->max_extent_size) {
                const u64 tail_len = orig->end - split + 1;
                const u64 head_len = split - orig->start;
                u32 halves_extents;

                /*
                 * Same reasoning as in btrfs_merge_delalloc_extent, applied
                 * in the opposite direction.
                 */
                halves_extents = count_max_extents(fs_info, tail_len);
                halves_extents += count_max_extents(fs_info, head_len);
                if (count_max_extents(fs_info, whole_len) >= halves_extents)
                        return;
        }

        spin_lock(&inode->lock);
        btrfs_mod_outstanding_extents(inode, 1);
        spin_unlock(&inode->lock);
}

/*
 * Account for two adjacent delalloc extent states, @new and @other, being
 * merged into one (e.g. sequential writes extending an existing extent), so
 * that the number of outstanding extents tracked for the inode stays right.
 *
 * Must be called with the inode's io_tree lock held.
 */
void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
                                 struct extent_state *other)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        u64 merged_len;

        lockdep_assert_held(&inode->io_tree.lock);

        /* Nothing to account for when the neighbour is not delalloc. */
        if (!(other->state & EXTENT_DELALLOC))
                return;

        merged_len = (new->start > other->start) ?
                     new->end - other->start + 1 :
                     other->end - new->start + 1;

        if (merged_len > fs_info->max_extent_size) {
                u32 separate_extents;

                /*
                 * Count how many extents the two pieces were accounted as
                 * before the merge.  If the merged range needs at least that
                 * many, nothing changes; otherwise one is dropped.  E.g.
                 *
                 * [ 4k][MAX_SIZE]
                 *
                 * Each side counted 1 and the merged range needs 2, so we are
                 * done.  But with
                 *
                 * [MAX_SIZE+4k][MAX_SIZE+4k]
                 *
                 * each side counted 2 while the merged range only needs 3, so
                 * one outstanding extent has to go.
                 */
                separate_extents = count_max_extents(fs_info,
                                        other->end - other->start + 1);
                separate_extents += count_max_extents(fs_info,
                                        new->end - new->start + 1);
                if (count_max_extents(fs_info, merged_len) >= separate_extents)
                        return;
        }

        /*
         * Either the merged range fits in a single max-sized extent, or the
         * merge saved one: drop one outstanding extent.
         */
        spin_lock(&inode->lock);
        btrfs_mod_outstanding_extents(inode, -1);
        spin_unlock(&inode->lock);
}

/*
 * Put @inode on its root's list of delalloc inodes.  When it is the first
 * such inode of the root, the root is also put on the filesystem's list of
 * delalloc roots.
 */
static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;

        spin_lock(&root->delalloc_lock);

        ASSERT(list_empty(&inode->delalloc_inodes));
        list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);

        if (++root->nr_delalloc_inodes == 1) {
                /* First delalloc inode of this root: publish the root too. */
                spin_lock(&fs_info->delalloc_root_lock);
                ASSERT(list_empty(&root->delalloc_root));
                list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
                spin_unlock(&fs_info->delalloc_root_lock);
        }

        spin_unlock(&root->delalloc_lock);
}

/*
 * Take @inode off its root's list of delalloc inodes, and take the root off
 * the filesystem's list of delalloc roots once it has no such inode left.
 *
 * The caller must hold the root's delalloc_lock.
 */
void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;

        lockdep_assert_held(&root->delalloc_lock);

        /*
         * The inode may already be off the list: the transaction abort path
         * btrfs_destroy_delalloc_inodes() removes it, and we can get here
         * later through btrfs_clear_delalloc_extent() while the inode still
         * has ->delalloc_bytes > 0.
         */
        if (list_empty(&inode->delalloc_inodes))
                return;

        list_del_init(&inode->delalloc_inodes);
        root->nr_delalloc_inodes--;
        if (root->nr_delalloc_inodes)
                return;

        /* That was the root's last delalloc inode. */
        ASSERT(list_empty(&root->delalloc_inodes));
        spin_lock(&fs_info->delalloc_root_lock);
        ASSERT(!list_empty(&root->delalloc_root));
        list_del_init(&root->delalloc_root);
        spin_unlock(&fs_info->delalloc_root_lock);
}

/*
 * Update delalloc accounting when @bits are about to be set on @state: keep
 * the inode's delalloc byte counters and outstanding extents current and
 * maintain the list of inodes that have pending delalloc work.
 *
 * Must be called with the inode's io_tree lock held.
 */
void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
                               u32 bits)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        const u64 range_len = state->end + 1 - state->start;

        lockdep_assert_held(&inode->io_tree.lock);

        /* DEFRAG without DELALLOC is not expected. */
        WARN_ON((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC));

        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
         * but in this case, we are only testing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
        if ((bits & EXTENT_DELALLOC) && !(state->state & EXTENT_DELALLOC)) {
                const u32 nr_extents = count_max_extents(fs_info, range_len);
                bool was_empty;

                spin_lock(&inode->lock);
                btrfs_mod_outstanding_extents(inode, nr_extents);
                spin_unlock(&inode->lock);

                /* For sanity tests */
                if (btrfs_is_testing(fs_info))
                        return;

                percpu_counter_add_batch(&fs_info->delalloc_bytes, range_len,
                                         fs_info->delalloc_batch);

                spin_lock(&inode->lock);
                was_empty = (inode->delalloc_bytes == 0);
                inode->delalloc_bytes += range_len;
                if (bits & EXTENT_DEFRAG)
                        inode->defrag_bytes += range_len;
                spin_unlock(&inode->lock);

                /*
                 * The inode's lock is not needed for the list update: we run
                 * with the inode's io_tree lock held, which serializes us
                 * against concurrent calls of this function and of
                 * btrfs_clear_delalloc_extent().
                 */
                if (was_empty && !btrfs_is_free_space_inode(inode))
                        btrfs_add_delalloc_inode(inode);
        }

        if ((bits & EXTENT_DELALLOC_NEW) &&
            !(state->state & EXTENT_DELALLOC_NEW)) {
                spin_lock(&inode->lock);
                inode->new_delalloc_bytes += range_len;
                spin_unlock(&inode->lock);
        }
}

/*
 * Update delalloc accounting when @bits are about to be cleared from @state,
 * i.e. when a range stops being delalloc.
 *
 * Must be called with the inode's io_tree lock held.
 */
void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
                                 struct extent_state *state, u32 bits)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        const u64 len = state->end + 1 - state->start;
        const u32 num_extents = count_max_extents(fs_info, len);

        lockdep_assert_held(&inode->io_tree.lock);

        if ((bits & EXTENT_DEFRAG) && (state->state & EXTENT_DEFRAG)) {
                spin_lock(&inode->lock);
                inode->defrag_bytes -= len;
                spin_unlock(&inode->lock);
        }

        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
         * but in this case, we are only testing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
        if ((bits & EXTENT_DELALLOC) && (state->state & EXTENT_DELALLOC)) {
                bool now_empty;

                spin_lock(&inode->lock);
                btrfs_mod_outstanding_extents(inode, -num_extents);
                spin_unlock(&inode->lock);

                /*
                 * Metadata space is not reserved for space cache inodes, so
                 * there is nothing to release for the tree root on error.
                 */
                if ((bits & EXTENT_CLEAR_META_RESV) &&
                    root != fs_info->tree_root)
                        btrfs_delalloc_release_metadata(inode, len, true);

                /* For sanity tests. */
                if (btrfs_is_testing(fs_info))
                        return;

                if (!btrfs_is_data_reloc_root(root) &&
                    !btrfs_is_free_space_inode(inode) &&
                    !(state->state & EXTENT_NORESERVE) &&
                    (bits & EXTENT_CLEAR_DATA_RESV))
                        btrfs_free_reserved_data_space_noquota(fs_info, len);

                percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
                                         fs_info->delalloc_batch);

                spin_lock(&inode->lock);
                inode->delalloc_bytes -= len;
                now_empty = (inode->delalloc_bytes == 0);
                spin_unlock(&inode->lock);

                /*
                 * The inode's lock is not needed for the list update: we run
                 * with the inode's io_tree lock held, which serializes us
                 * against concurrent calls of this function and of
                 * btrfs_set_delalloc_extent().
                 */
                if (now_empty && !btrfs_is_free_space_inode(inode)) {
                        spin_lock(&root->delalloc_lock);
                        btrfs_del_delalloc_inode(inode);
                        spin_unlock(&root->delalloc_lock);
                }
        }

        if ((bits & EXTENT_DELALLOC_NEW) &&
            (state->state & EXTENT_DELALLOC_NEW)) {
                spin_lock(&inode->lock);
                ASSERT(inode->new_delalloc_bytes >= len);
                inode->new_delalloc_bytes -= len;
                if (bits & EXTENT_ADD_INODE_BYTES)
                        inode_add_bytes(&inode->vfs_inode, len);
                spin_unlock(&inode->lock);
        }
}

static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
                                        struct btrfs_ordered_extent *ordered)
{
        u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
        u64 len = bbio->bio.bi_iter.bi_size;
        struct btrfs_ordered_extent *new;
        int ret;

        /* Must always be called for the beginning of an ordered extent. */
        if (WARN_ON_ONCE(start != ordered->disk_bytenr))
                return -EINVAL;

        /* No need to split if the ordered extent covers the entire bio. */
        if (ordered->disk_num_bytes == len) {
                refcount_inc(&ordered->refs);
                bbio->ordered = ordered;
                return 0;
        }

        /*
         * Don't split the extent_map for NOCOW extents, as we're writing into
         * a pre-existing one.
         */
        if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
                ret = split_extent_map(bbio->inode, bbio->file_offset,
                                       ordered->num_bytes, len,
                                       ordered->disk_bytenr);
                if (ret)
                        return ret;
        }

        new = btrfs_split_ordered_extent(ordered, len);
        if (IS_ERR(new))
                return PTR_ERR(new);
        bbio->ordered = new;
        return 0;
}

/*
 * Insert every ordered sum on @list into the csum root.  This happens at IO
 * completion time, based on sums calculated at bio submission time.
 *
 * The csum root is looked up once, from the logical address of the first
 * entry.  Returns 0 on success or the first error from
 * btrfs_csum_file_blocks().
 */
static int add_pending_csums(struct btrfs_trans_handle *trans,
                             struct list_head *list)
{
        struct btrfs_root *csum_root = NULL;
        struct btrfs_ordered_sum *entry;

        list_for_each_entry(entry, list, list) {
                int err;

                /* Flag the transaction for the duration of the insertion. */
                trans->adding_csums = true;
                if (csum_root == NULL)
                        csum_root = btrfs_csum_root(trans->fs_info,
                                                    entry->logical);
                err = btrfs_csum_file_blocks(trans, csum_root, entry);
                trans->adding_csums = false;

                if (err)
                        return err;
        }

        return 0;
}

/*
 * Walk the extent maps covering [start, start + len) and set
 * EXTENT_DELALLOC_NEW in the inode's io tree on the parts that are holes.
 *
 * Returns 0 on success, or the error from btrfs_get_extent() or
 * set_extent_bit().
 */
static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
                                         const u64 start,
                                         const u64 len,
                                         struct extent_state **cached_state)
{
        const u64 end = start + len - 1;
        u64 cur = start;

        while (cur < end) {
                const u64 remaining = end - cur + 1;
                struct extent_map *em;
                int ret = 0;

                em = btrfs_get_extent(inode, NULL, cur, remaining);
                if (IS_ERR(em))
                        return PTR_ERR(em);

                if (em->block_start == EXTENT_MAP_HOLE) {
                        u64 hole_len = em->len;

                        /* Clamp the hole to the part inside [cur, end]. */
                        if (em->start < cur)
                                hole_len -= cur - em->start;
                        if (hole_len > remaining)
                                hole_len = remaining;

                        ret = set_extent_bit(&inode->io_tree, cur,
                                             cur + hole_len - 1,
                                             EXTENT_DELALLOC_NEW, cached_state);
                }

                /* Advance past this extent map before dropping it. */
                cur = extent_map_end(em);
                free_extent_map(em);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * Mark [start, end] of @inode as delalloc in its io tree, together with
 * @extra_bits, and flag the parts that are new with EXTENT_DELALLOC_NEW.
 *
 * Returns the result of set_extent_bit(), or the error from
 * btrfs_find_new_delalloc_bytes().
 */
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                              unsigned int extra_bits,
                              struct extent_state **cached_state)
{
        WARN_ON(PAGE_ALIGNED(end));

        if (start < i_size_read(&inode->vfs_inode) ||
            (inode->flags & BTRFS_INODE_PREALLOC)) {
                /* Existing extents may overlap the range: look for holes. */
                int ret;

                ret = btrfs_find_new_delalloc_bytes(inode, start,
                                                    end + 1 - start,
                                                    cached_state);
                if (ret)
                        return ret;
        } else {
                /*
                 * There can't be any extents following eof in this case so
                 * the whole range gets the delalloc new bit directly.
                 */
                extra_bits |= EXTENT_DELALLOC_NEW;
        }

        return set_extent_bit(&inode->io_tree, start, end,
                              EXTENT_DELALLOC | extra_bits, cached_state);
}

/*
 * Context handed to btrfs_writepage_fixup_worker().
 * See btrfs_writepage_start_hook for details on why this is required.
 */
struct btrfs_writepage_fixup {
        /* Page to fix up; the worker locks it and checks its dirty/checked state. */
        struct page *page;
        /* Inode the worker operates on. */
        struct btrfs_inode *inode;
        /* Work item; the worker recovers this struct from it via container_of(). */
        struct btrfs_work work;
};

/*
 * Work handler for a fixup queued by btrfs_writepage_cow_fixup().
 *
 * Reserves delalloc space for the page, waits out any ordered extent that
 * overlaps it and then marks the page range as delalloc.  If that fails, the
 * error is recorded on the mapping and the page is cleaned.
 *
 * On every path the Checked flag is cleared, the page is unlocked, the page
 * reference is dropped, the inode reference is dropped through a delayed iput
 * and the fixup itself is freed.
 */
static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
        struct btrfs_writepage_fixup *fixup =
                container_of(work, struct btrfs_writepage_fixup, work);
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        struct extent_changeset *data_reserved = NULL;
        struct page *page = fixup->page;
        struct btrfs_inode *inode = fixup->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        u64 page_start = page_offset(page);
        u64 page_end = page_offset(page) + PAGE_SIZE - 1;
        int ret = 0;
        bool free_delalloc_space = true;

        /*
         * This is similar to page_mkwrite, we need to reserve the space before
         * we take the page lock.
         */
        ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
                                           PAGE_SIZE);
again:
        lock_page(page);

        /*
         * Before we queued this fixup, we took a reference on the page.
         * page->mapping may go NULL, but it shouldn't be moved to a different
         * address space.
         */
        if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
                /*
                 * Unfortunately this is a little tricky, either
                 *
                 * 1) We got here and our page had already been dealt with and
                 *    we reserved our space, thus ret == 0, so we need to just
                 *    drop our space reservation and bail.  This can happen the
                 *    first time we come into the fixup worker, or could happen
                 *    while waiting for the ordered extent.
                 * 2) Our page was already dealt with, but we happened to get an
                 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
                 *    this case we obviously don't have anything to release, but
                 *    because the page was already dealt with we don't want to
                 *    mark the page with an error, so make sure we're resetting
                 *    ret to 0.  This is why we have this check _before_ the ret
                 *    check, because we do not want to have a surprise ENOSPC
                 *    when the page was already properly dealt with.
                 */
                if (!ret) {
                        btrfs_delalloc_release_extents(inode, PAGE_SIZE);
                        btrfs_delalloc_release_space(inode, data_reserved,
                                                     page_start, PAGE_SIZE,
                                                     true);
                }
                ret = 0;
                goto out_page;
        }

        /*
         * We can't mess with the page state unless it is locked, so now that
         * it is locked bail if we failed to make our space reservation.
         */
        if (ret)
                goto out_page;

        lock_extent(&inode->io_tree, page_start, page_end, &cached_state);

        /* already ordered? We're done */
        if (PageOrdered(page))
                goto out_reserved;

        ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
        if (ordered) {
                /*
                 * Drop both locks before dealing with the ordered extent, then
                 * retry from the page lock with the reservation still held.
                 */
                unlock_extent(&inode->io_tree, page_start, page_end,
                              &cached_state);
                unlock_page(page);
                btrfs_start_ordered_extent(ordered);
                btrfs_put_ordered_extent(ordered);
                goto again;
        }

        ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
                                        &cached_state);
        if (ret)
                goto out_reserved;

        /*
         * Everything went as planned, we're now the owner of a dirty page with
         * delayed allocation bits set and space reserved for our COW
         * destination.
         *
         * The page was dirty when we started, nothing should have cleaned it.
         */
        BUG_ON(!PageDirty(page));
        free_delalloc_space = false;
out_reserved:
        btrfs_delalloc_release_extents(inode, PAGE_SIZE);
        if (free_delalloc_space)
                btrfs_delalloc_release_space(inode, data_reserved, page_start,
                                             PAGE_SIZE, true);
        unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
out_page:
        if (ret) {
                /*
                 * We hit ENOSPC or other errors.  Update the mapping and page
                 * to reflect the errors and clean the page.
                 *
                 * ret is non-zero in this branch, so the last argument passed
                 * to btrfs_mark_ordered_io_finished() is always false.
                 */
                mapping_set_error(page->mapping, ret);
                btrfs_mark_ordered_io_finished(inode, page, page_start,
                                               PAGE_SIZE, !ret);
                clear_page_dirty_for_io(page);
        }
        btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
        unlock_page(page);
        put_page(page);
        kfree(fixup);
        extent_changeset_free(data_reserved);
        /*
         * As a precaution, do a delayed iput in case it would be the last iput
         * that could need flushing space. Recursing back to fixup worker would
         * deadlock.
         */
        btrfs_add_delayed_iput(inode);
}

/*
 * A few paths in the higher layers of the kernel set the page dirty bit
 * directly, without asking the filesystem whether that is a good idea.  That
 * is a problem for us because COW has to happen properly and the data=ordered
 * rules have to be followed.
 *
 * Any range that doesn't have the ORDERED bit set hasn't been properly set up
 * for IO.  For such a page we kick off an async fixup: the worker waits for
 * ordered extents, sets the delalloc bit and makes the page safe to write.
 *
 * Return 0 if the page already has an ordered extent covering it, -EAGAIN in
 * every other case (a fixup is pending, was just queued, or could not be
 * allocated).
 */
int btrfs_writepage_cow_fixup(struct page *page)
{
        struct inode *inode = page->mapping->host;
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_writepage_fixup *fixup;

        /* An ordered extent already covers this page, nothing to fix up. */
        if (PageOrdered(page))
                return 0;

        /*
         * PageChecked is set further down when a fixup worker is created for
         * the page, so seeing it here means one is already pending and we must
         * not create another.
         *
         * The extent_io writepage code will redirty the page if we send back
         * EAGAIN.
         */
        if (PageChecked(page))
                return -EAGAIN;

        fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
        if (!fixup)
                return -EAGAIN;

        /* Describe the work before anything can see it. */
        fixup->page = page;
        fixup->inode = BTRFS_I(inode);
        btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);

        /*
         * write_cache_pages already holds a reference on this inode.  We take
         * our own because the space reservation happens outside of the page
         * lock, and page->mapping can't be trusted outside of the page lock.
         */
        ihold(inode);
        get_page(page);
        btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);

        btrfs_queue_work(fs_info->fixup_workers, &fixup->work);

        return -EAGAIN;
}

/*
 * Insert the file extent item described by @stack_fi at @file_pos of @inode,
 * dropping whatever file extents currently cover the range, and then pass the
 * reserved extent on to btrfs_alloc_reserved_file_extent().
 *
 * @trans:              transaction handle; its transid is stored as the
 *                      generation of @stack_fi
 * @inode:              inode the extent belongs to
 * @file_pos:           file offset used as the key offset of the new item
 * @stack_fi:           in-memory file extent item to write; its generation
 *                      field is modified here
 * @update_inode_bytes: if true, call btrfs_update_inode_bytes() with the
 *                      extent's num_bytes and the number of bytes dropped
 * @qgroup_reserved:    passed through to btrfs_alloc_reserved_file_extent()
 *
 * Return 0 on success, -ENOMEM if the path can't be allocated, or the error
 * returned by the failing helper.
 */
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
                                       struct btrfs_inode *inode, u64 file_pos,
                                       struct btrfs_file_extent_item *stack_fi,
                                       const bool update_inode_bytes,
                                       u64 qgroup_reserved)
{
        struct btrfs_root *root = inode->root;
        const u64 sectorsize = root->fs_info->sectorsize;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key ins;
        u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
        u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
        u64 offset = btrfs_stack_file_extent_offset(stack_fi);
        u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
        u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
        struct btrfs_drop_extents_args drop_args = { 0 };
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * we may be replacing one extent in the tree with another.
         * The new extent is pinned in the extent map, and we don't want
         * to drop it from the cache until it is completely in the btree.
         *
         * So, tell btrfs_drop_extents to leave this extent in the cache.
         * the caller is expected to unpin it and allow it to be merged
         * with the others.
         */
        drop_args.path = path;
        drop_args.start = file_pos;
        drop_args.end = file_pos + num_bytes;
        drop_args.replace_extent = true;
        drop_args.extent_item_size = sizeof(*stack_fi);
        ret = btrfs_drop_extents(trans, root, inode, &drop_args);
        if (ret)
                goto out;

        /* Insert an empty item ourselves if btrfs_drop_extents() did not. */
        if (!drop_args.extent_inserted) {
                ins.objectid = btrfs_ino(inode);
                ins.offset = file_pos;
                ins.type = BTRFS_EXTENT_DATA_KEY;

                ret = btrfs_insert_empty_item(trans, root, path, &ins,
                                              sizeof(*stack_fi));
                if (ret)
                        goto out;
        }
        /* Copy the stack item into the leaf slot the path points at. */
        leaf = path->nodes[0];
        btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
        write_extent_buffer(leaf, stack_fi,
                        btrfs_item_ptr_offset(leaf, path->slots[0]),
                        sizeof(struct btrfs_file_extent_item));

        btrfs_mark_buffer_dirty(trans, leaf);
        btrfs_release_path(path);

        /*
         * If we dropped an inline extent here, we know the range where it is
         * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
         * number of bytes only for that range containing the inline extent.
         * The remainder of the range will be processed when clearing the
         * EXTENT_DELALLOC_NEW bit through the ordered extent completion.
         */
        if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
                u64 inline_size = round_down(drop_args.bytes_found, sectorsize);

                inline_size = drop_args.bytes_found - inline_size;
                btrfs_update_inode_bytes(inode, sectorsize, inline_size);
                drop_args.bytes_found -= inline_size;
                num_bytes -= sectorsize;
        }

        if (update_inode_bytes)
                btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);

        /* From here on 'ins' is reused as the key of the on-disk extent. */
        ins.objectid = disk_bytenr;
        ins.offset = disk_num_bytes;
        ins.type = BTRFS_EXTENT_ITEM_KEY;

        ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
        if (ret)
                goto out;

        ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
                                               file_pos - offset,
                                               qgroup_reserved, &ins);
out:
        btrfs_free_path(path);

        return ret;
}

/*
 * Subtract @len from the delalloc_bytes counter of the block group found by
 * looking up @start.  The counter is updated under the block group's lock and
 * the reference obtained by the lookup is dropped before returning.
 */
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
                                         u64 start, u64 len)
{
        struct btrfs_block_group *bg = btrfs_lookup_block_group(fs_info, start);

        ASSERT(bg);

        spin_lock(&bg->lock);
        bg->delalloc_bytes -= len;
        spin_unlock(&bg->lock);

        btrfs_put_block_group(bg);
}

static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
                                             struct btrfs_ordered_extent *oe)
{
        struct btrfs_file_extent_item stack_fi;
        bool update_inode_bytes;
        u64 num_bytes = oe->num_bytes;
        u64 ram_bytes = oe->ram_bytes;

        memset(&stack_fi, 0, sizeof(stack_fi));
        btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
        btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
        btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
                                                   oe->disk_num_bytes);
        btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
        if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
                num_bytes = oe->truncated_len;
                ram_bytes = num_bytes;
        }
        btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
        btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
        btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
        /* Encryption and other encoding is reserved and all 0 */

        /*
         * For delalloc, when completing an ordered extent we update the inode's
         * bytes when clearing the range in the inode's io tree, so pass false
         * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
         * except if the ordered extent was truncated.
         */
        update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
                             test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
                             test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);

        return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
                                           oe->file_offset, &stack_fi,
                                           update_inode_bytes, oe->qgroup_rsv);
}

/*
 * As ordered data IO finishes, this gets called so we can finish
 * an ordered extent if the range of bytes in the file it covers are
 * fully written.
 *
 * On every path, including errors, the ordered extent is removed with
 * btrfs_remove_ordered_extent() and two references to it are dropped before
 * returning.
 *
 * Return 0 on success or a negative errno; -EIO is returned when the ordered
 * extent already has BTRFS_ORDERED_IOERR set.
 */
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
{
        struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans = NULL;
        struct extent_io_tree *io_tree = &inode->io_tree;
        struct extent_state *cached_state = NULL;
        u64 start, end;
        int compress_type = 0;
        int ret = 0;
        u64 logical_len = ordered_extent->num_bytes;
        bool freespace_inode;
        bool truncated = false;
        bool clear_reserved_extent = true;
        unsigned int clear_bits = EXTENT_DEFRAG;

        start = ordered_extent->file_offset;
        end = start + ordered_extent->num_bytes - 1;

        /*
         * EXTENT_DELALLOC_NEW is only cleared for ordered extents that are
         * none of NOCOW, PREALLOC, DIRECT or ENCODED.
         */
        if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
            !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
            !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
            !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
                clear_bits |= EXTENT_DELALLOC_NEW;

        freespace_inode = btrfs_is_free_space_inode(inode);
        if (!freespace_inode)
                btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);

        if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
                ret = -EIO;
                goto out;
        }

        if (btrfs_is_zoned(fs_info))
                btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
                                        ordered_extent->disk_num_bytes);

        if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
                truncated = true;
                logical_len = ordered_extent->truncated_len;
                /* Truncated the entire extent, don't bother adding */
                if (!logical_len)
                        goto out;
        }

        /*
         * NOCOW: no file extent item is inserted here, only the inode item is
         * updated, and the extent range is never locked.
         */
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */

                btrfs_inode_safe_disk_i_size_write(inode, 0);
                if (freespace_inode)
                        trans = btrfs_join_transaction_spacecache(root);
                else
                        trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        trans = NULL;
                        goto out;
                }
                trans->block_rsv = &inode->block_rsv;
                ret = btrfs_update_inode_fallback(trans, inode);
                if (ret) /* -ENOMEM or corruption */
                        btrfs_abort_transaction(trans, ret);
                goto out;
        }

        /*
         * From here on the range is locked, so the final clear_extent_bit()
         * at 'out' must drop EXTENT_LOCKED as well.
         */
        clear_bits |= EXTENT_LOCKED;
        lock_extent(io_tree, start, end, &cached_state);

        if (freespace_inode)
                trans = btrfs_join_transaction_spacecache(root);
        else
                trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
                goto out;
        }

        trans->block_rsv = &inode->block_rsv;

        /*
         * NOTE(review): unlike the failure paths below, this one does not call
         * btrfs_abort_transaction() - confirm that this is intended.
         */
        ret = btrfs_insert_raid_extent(trans, ordered_extent);
        if (ret)
                goto out;

        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
                compress_type = ordered_extent->compress_type;
        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
                BUG_ON(compress_type);
                ret = btrfs_mark_extent_written(trans, inode,
                                                ordered_extent->file_offset,
                                                ordered_extent->file_offset +
                                                logical_len);
                btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
                                                  ordered_extent->disk_num_bytes);
        } else {
                BUG_ON(root == fs_info->tree_root);
                ret = insert_ordered_extent_file_extent(trans, ordered_extent);
                if (!ret) {
                        /*
                         * The file extent item is in, so the reserved extent
                         * must not be freed by the error handling below.
                         */
                        clear_reserved_extent = false;
                        btrfs_release_delalloc_bytes(fs_info,
                                                ordered_extent->disk_bytenr,
                                                ordered_extent->disk_num_bytes);
                }
        }
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        ret = unpin_extent_cache(inode, ordered_extent->file_offset,
                                 ordered_extent->num_bytes, trans->transid);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        ret = add_pending_csums(trans, &ordered_extent->list);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        /*
         * If this is a new delalloc range, clear its new delalloc flag to
         * update the inode's number of bytes. This needs to be done first
         * before updating the inode item.
         */
        if ((clear_bits & EXTENT_DELALLOC_NEW) &&
            !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
                clear_extent_bit(&inode->io_tree, start, end,
                                 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
                                 &cached_state);

        btrfs_inode_safe_disk_i_size_write(inode, 0);
        ret = btrfs_update_inode_fallback(trans, inode);
        if (ret) { /* -ENOMEM or corruption */
                btrfs_abort_transaction(trans, ret);
                goto out;
        }
out:
        /* Common exit: reached on success as well as on every error above. */
        clear_extent_bit(&inode->io_tree, start, end, clear_bits,
                         &cached_state);

        if (trans)
                btrfs_end_transaction(trans);

        if (ret || truncated) {
                u64 unwritten_start = start;

                /*
                 * If we failed to finish this ordered extent for any reason we
                 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
                 * extent, and mark the inode with the error if it wasn't
                 * already set.  Any error during writeback would have already
                 * set the mapping error, so we need to set it if we're the ones
                 * marking this ordered extent as failed.
                 */
                if (ret)
                        btrfs_mark_ordered_extent_error(ordered_extent);

                /* For a truncated extent only the tail past logical_len is unwritten. */
                if (truncated)
                        unwritten_start += logical_len;
                clear_extent_uptodate(io_tree, unwritten_start, end, NULL);

                /*
                 * Drop extent maps for the part of the extent we didn't write.
                 *
                 * We have an exception here for the free_space_inode, this is
                 * because when we do btrfs_get_extent() on the free space inode
                 * we will search the commit root.  If this is a new block group
                 * we won't find anything, and we will trip over the assert in
                 * writepage where we do ASSERT(em->block_start !=
                 * EXTENT_MAP_HOLE).
                 *
                 * Theoretically we could also skip this for any NOCOW extent as
                 * we don't mess with the extent map tree in the NOCOW case, but
                 * for now simply skip this if we are the free space inode.
                 */
                if (!btrfs_is_free_space_inode(inode))
                        btrfs_drop_extent_map_range(inode, unwritten_start,
                                                    end, false);

                /*
                 * If the ordered extent had an IOERR or something else went
                 * wrong we need to return the space for this ordered extent
                 * back to the allocator.  We only free the extent in the
                 * truncated case if we didn't write out the extent at all.
                 *
                 * If we made it past insert_reserved_file_extent before we
                 * errored out then we don't need to do this as the accounting
                 * has already been done.
                 */
                if ((ret || !logical_len) &&
                    clear_reserved_extent &&
                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
                    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
                        /*
                         * Discard the range before returning it back to the
                         * free space pool
                         */
                        if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
                                btrfs_discard_extent(fs_info,
                                                ordered_extent->disk_bytenr,
                                                ordered_extent->disk_num_bytes,
                                                NULL);
                        btrfs_free_reserved_extent(fs_info,
                                        ordered_extent->disk_bytenr,
                                        ordered_extent->disk_num_bytes, 1);
                        /*
                         * Actually free the qgroup rsv which was released when
                         * the ordered extent was created.
                         */
                        btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
                                                  ordered_extent->qgroup_rsv,
                                                  BTRFS_QGROUP_RSV_DATA);
                }
        }

        /*
         * This needs to be done to make sure anybody waiting knows we are done
         * updating everything for this ordered extent.
         */
        btrfs_remove_ordered_extent(inode, ordered_extent);

        /* once for us */
        btrfs_put_ordered_extent(ordered_extent);
        /* once for the tree */
        btrfs_put_ordered_extent(ordered_extent);

        return ret;
}

/*
 * Finish an ordered extent.
 *
 * On a zoned filesystem, an ordered extent without BTRFS_ORDERED_IOERR and
 * with an empty bioc_list is first passed to btrfs_finish_ordered_zoned().
 * In every case the extent is then completed by btrfs_finish_one_ordered().
 *
 * Return the result of btrfs_finish_one_ordered().
 */
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
        const bool zoned = btrfs_is_zoned(inode_to_fs_info(ordered->inode));

        if (zoned &&
            !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
            list_empty(&ordered->bioc_list))
                btrfs_finish_ordered_zoned(ordered);

        return btrfs_finish_one_ordered(ordered);
}

/*
 * Compute the checksum of a single sector and compare it with the expected
 * one, without any extra action that depends on the type of I/O.
 *
 * @fs_info:       provides the hash transform, sector size and csum size
 * @page:          page containing the sector
 * @pgoff:         byte offset of the sector inside @page; the sector must not
 *                 cross the end of the page
 * @csum:          buffer that receives the computed checksum
 * @csum_expected: checksum to compare against
 *
 * Return 0 if the first fs_info->csum_size bytes of both checksums are equal,
 * -EIO otherwise.
 */
int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
                            u32 pgoff, u8 *csum, const u8 * const csum_expected)
{
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
        char *kaddr;

        ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);

        shash->tfm = fs_info->csum_shash;

        /* Map the page, hash the one sector at @pgoff, drop the mapping. */
        kaddr = kmap_local_page(page);
        crypto_shash_digest(shash, kaddr + pgoff, fs_info->sectorsize, csum);
        kunmap_local(kaddr);

        return memcmp(csum, csum_expected, fs_info->csum_size) ? -EIO : 0;
}

/*
 * Verify the checksum of a single data sector.
 *
 * @bbio:        btrfs_bio which contains the csum
 * @dev:         device the sector is on, may be NULL
 * @bio_offset:  offset to the beginning of the bio (in bytes)
 * @bv:          bio_vec to check, exactly one sector long
 *
 * Check if the checksum on a data block is valid.  When a checksum mismatch is
 * detected, report the error, bump the corruption counter of @dev (if given)
 * and fill the corrupted range with zero.
 *
 * Return %true if the sector is ok or had no checksum to start with, else %false.
 */
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
                        u32 bio_offset, struct bio_vec *bv)
{
        struct btrfs_inode *inode = bbio->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        const u64 first_byte = bbio->file_offset + bio_offset;
        const u64 last_byte = first_byte + bv->bv_len - 1;
        u8 computed[BTRFS_CSUM_SIZE];
        u8 *expected;

        ASSERT(bv->bv_len == fs_info->sectorsize);

        /* Nothing to verify against. */
        if (!bbio->csum)
                return true;

        /* Skip the range without csum for data reloc inode */
        if (btrfs_is_data_reloc_root(inode->root) &&
            test_range_bit(&inode->io_tree, first_byte, last_byte,
                           EXTENT_NODATASUM, NULL)) {
                clear_extent_bits(&inode->io_tree, first_byte, last_byte,
                                  EXTENT_NODATASUM);
                return true;
        }

        /* One csum_size entry per sector, indexed by sector within the bio. */
        expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
                                fs_info->csum_size;
        if (!btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset,
                                     computed, expected))
                return true;

        /* Mismatch: report it, account it and zero the data. */
        btrfs_print_data_csum_error(inode, first_byte, computed, expected,
                                    bbio->mirror_num);
        if (dev)
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
        memzero_bvec(bv);
        return false;
}

/*
 * Perform a delayed iput on @inode.
 *
 * @inode: The inode we want to perform iput on
 *
 * The generic vfs_inode::i_count decides what happens: unless the count is
 * exactly 1 it is simply decremented.  If this would be the last iput, the
 * inode is linked into the delayed iput machinery instead.  Delayed iputs are
 * processed at transaction commit time/superblock commit/cleaner kthread.
 */
void btrfs_add_delayed_iput(struct btrfs_inode *inode)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        unsigned long irq_flags;

        /* Not the last reference, the decrement already happened. */
        if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
                return;

        atomic_inc(&fs_info->nr_delayed_iputs);

        /*
         * The lock has to be taken irq safe: callers can be in irq context
         * (see bio.c and btrfs_put_ordered_extent()) as well as in non-irq
         * context.
         */
        spin_lock_irqsave(&fs_info->delayed_iput_lock, irq_flags);
        ASSERT(list_empty(&inode->delayed_iput));
        list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
        spin_unlock_irqrestore(&fs_info->delayed_iput_lock, irq_flags);

        /* Wake the cleaner kthread unless it is flagged as running. */
        if (test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
                return;

        wake_up_process(fs_info->cleaner_kthread);
}

/*
 * Run the delayed iput of @inode.
 *
 * Must be called with fs_info->delayed_iput_lock held.  The lock is dropped
 * around the iput() call and taken again before returning, so the caller
 * still holds it afterwards but must not assume the delayed iput list is
 * unchanged.
 */
static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
                                    struct btrfs_inode *inode)
{
        /* Unlink first, while still holding the lock. */
        list_del_init(&inode->delayed_iput);
        spin_unlock_irq(&fs_info->delayed_iput_lock);
        iput(&inode->vfs_inode);
        /* Wake waiters once the last pending delayed iput is done. */
        if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
                wake_up(&fs_info->delayed_iputs_wait);
        spin_lock_irq(&fs_info->delayed_iput_lock);
}

/*
 * Run the delayed iput of @inode right away, if it has one pending.
 *
 * The list membership is checked once without the lock as a cheap early exit
 * and again under fs_info->delayed_iput_lock before acting on it.
 */
static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
                                   struct btrfs_inode *inode)
{
        if (list_empty(&inode->delayed_iput))
                return;

        spin_lock_irq(&fs_info->delayed_iput_lock);
        if (!list_empty(&inode->delayed_iput))
                run_delayed_iput_locked(fs_info, inode);
        spin_unlock_irq(&fs_info->delayed_iput_lock);
}

/*
 * Run every delayed iput queued on fs_info->delayed_iputs until the list is
 * empty, giving up the CPU between entries when a reschedule is needed.
 */
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
{
        struct btrfs_inode *inode;

        /*
         * Interrupts have to be disabled while holding
         * fs_info->delayed_iput_lock: btrfs_put_ordered_extent() can run in
         * irq context (see bio.c) and calls btrfs_add_delayed_iput(), which
         * takes the same lock.  Leaving irqs enabled here could deadlock.
         */
        spin_lock_irq(&fs_info->delayed_iput_lock);
        while (!list_empty(&fs_info->delayed_iputs)) {
                inode = list_first_entry(&fs_info->delayed_iputs,
                                struct btrfs_inode, delayed_iput);
                run_delayed_iput_locked(fs_info, inode);

                if (!need_resched())
                        continue;

                /* Never reschedule with the spinlock held. */
                spin_unlock_irq(&fs_info->delayed_iput_lock);
                cond_resched();
                spin_lock_irq(&fs_info->delayed_iput_lock);
        }
        spin_unlock_irq(&fs_info->delayed_iput_lock);
}

/*
 * Wait for flushing all delayed iputs
 *
 * @fs_info:  the filesystem
 *
 * Sleeps in a killable wait on fs_info->delayed_iputs_wait until the number
 * of pending delayed iputs reaches zero.  This helps in user operations like
 * fallocate etc that might get blocked on the iputs.
 *
 * Return -EINTR if the wait was interrupted by a fatal signal, 0 once nothing
 * is pending.
 */
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
        if (wait_event_killable(fs_info->delayed_iputs_wait,
                        atomic_read(&fs_info->nr_delayed_iputs) == 0))
                return -EINTR;

        return 0;
}

/*
 * Create an orphan entry for the given inode, in case something goes wrong in
 * the middle of an unlink.
 *
 * An already existing orphan item (-EEXIST) is treated as success.  Any other
 * failure aborts the transaction.
 *
 * Return 0 on success, or the error from btrfs_insert_orphan_item().
 */
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
                     struct btrfs_inode *inode)
{
        int ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));

        if (!ret || ret == -EEXIST)
                return 0;

        btrfs_abort_transaction(trans, ret);
        return ret;
}

/*
 * We have done the delete so we can go ahead and remove the orphan item for
 * this particular inode.
 */
static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
                            struct btrfs_inode *inode)
{
        return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
}

/*
 * Clean up any orphans that may be left in @root from its last use.
 *
 * The orphan items of @root are walked from the highest inode number down.
 * For each one the inode is looked up and then either:
 *   - skipped, when it is a tree-root orphan belonging to a deleted root,
 *   - has its orphan item deleted, when the inode is missing or still has
 *     links (verity items are dropped first for an existing inode), or
 *   - released with iput() when it has no links left.
 *
 * Only the first caller for a given root does the work; the
 * BTRFS_ROOT_ORPHAN_CLEANUP state bit makes later calls return 0 immediately.
 *
 * Return 0 on success, or the negative error that stopped the cleanup (which
 * is also reported through btrfs_err()).
 */
int btrfs_orphan_cleanup(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key, found_key;
        struct btrfs_trans_handle *trans;
        struct inode *inode;
        u64 last_objectid = 0;
        int ret = 0, nr_unlink = 0;

        /* Someone already ran (or is running) the cleanup for this root. */
        if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
                return 0;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }
        path->reada = READA_BACK;

        /*
         * Start from the largest possible orphan key; each search then lands
         * just past the last remaining orphan item and we step back one slot.
         */
        key.objectid = BTRFS_ORPHAN_OBJECTID;
        key.type = BTRFS_ORPHAN_ITEM_KEY;
        key.offset = (u64)-1;

        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
                        goto out;

                /*
                 * if ret == 0 means we found what we were searching for, which
                 * is weird, but possible, so only screw with path if we didn't
                 * find the key and see if we have stuff that matches
                 */
                if (ret > 0) {
                        ret = 0;
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }

                /* pull out the item */
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

                /* make sure the item matches what we want */
                if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
                        break;
                if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
                        break;

                /* release the path since we're done with it */
                btrfs_release_path(path);

                /*
                 * this is where we are basically btrfs_lookup, without the
                 * crossing root thing.  we store the inode number in the
                 * offset of the orphan item.
                 */

                if (found_key.offset == last_objectid) {
                        /*
                         * We found the same inode as before. This means we were
                         * not able to remove its items via eviction triggered
                         * by an iput(). A transaction abort may have happened,
                         * due to -ENOSPC for example, so try to grab the error
                         * that lead to a transaction abort, if any.
                         */
                        btrfs_err(fs_info,
                                  "Error removing orphan entry, stopping orphan cleanup");
                        ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
                        goto out;
                }

                last_objectid = found_key.offset;

                /*
                 * Turn found_key into the inode item key of the orphaned
                 * inode; from here on found_key.objectid is the inode number.
                 */
                found_key.objectid = found_key.offset;
                found_key.type = BTRFS_INODE_ITEM_KEY;
                found_key.offset = 0;
                inode = btrfs_iget(fs_info->sb, last_objectid, root);
                if (IS_ERR(inode)) {
                        ret = PTR_ERR(inode);
                        inode = NULL;
                        /*
                         * A missing inode (-ENOENT) is handled below. ret keeps
                         * that value for now but is reassigned before the loop
                         * can exit or continue normally.
                         */
                        if (ret != -ENOENT)
                                goto out;
                }

                if (!inode && root == fs_info->tree_root) {
                        struct btrfs_root *dead_root;
                        int is_dead_root = 0;

                        /*
                         * This is an orphan in the tree root. Currently these
                         * could come from 2 sources:
                         *  a) a root (snapshot/subvolume) deletion in progress
                         *  b) a free space cache inode
                         * We need to distinguish those two, as the orphan item
                         * for a root must not get deleted before the deletion
                         * of the snapshot/subvolume's tree completes.
                         *
                         * btrfs_find_orphan_roots() ran before us, which has
                         * found all deleted roots and loaded them into
                         * fs_info->fs_roots_radix. So here we can find if an
                         * orphan item corresponds to a deleted root by looking
                         * up the root from that radix tree.
                         */

                        spin_lock(&fs_info->fs_roots_radix_lock);
                        dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
                                                         (unsigned long)found_key.objectid);
                        if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
                                is_dead_root = 1;
                        spin_unlock(&fs_info->fs_roots_radix_lock);

                        if (is_dead_root) {
                                /* prevent this orphan from being found again */
                                key.offset = found_key.objectid - 1;
                                continue;
                        }

                }

                /*
                 * If we have an inode with links, there are a couple of
                 * possibilities:
                 *
                 * 1. We were halfway through creating fsverity metadata for the
                 * file. In that case, the orphan item represents incomplete
                 * fsverity metadata which must be cleaned up with
                 * btrfs_drop_verity_items and deleting the orphan item.
                 *
                 * 2. Old kernels (before v3.12) used to create an
                 * orphan item for truncate indicating that there were possibly
                 * extent items past i_size that needed to be deleted. In v3.12,
                 * truncate was changed to update i_size in sync with the extent
                 * items, but the (useless) orphan item was still created. Since
                 * v4.18, we don't create the orphan item for truncate at all.
                 *
                 * So, this item could mean that we need to do a truncate, but
                 * only if this filesystem was last used on a pre-v3.12 kernel
                 * and was not cleanly unmounted. The odds of that are quite
                 * slim, and it's a pain to do the truncate now, so just delete
                 * the orphan item.
                 *
                 * It's also possible that this orphan item was supposed to be
                 * deleted but wasn't. The inode number may have been reused,
                 * but either way, we can delete the orphan item.
                 */
                if (!inode || inode->i_nlink) {
                        if (inode) {
                                ret = btrfs_drop_verity_items(BTRFS_I(inode));
                                /* Drop our reference before acting on ret. */
                                iput(inode);
                                inode = NULL;
                                if (ret)
                                        goto out;
                        }
                        trans = btrfs_start_transaction(root, 1);
                        if (IS_ERR(trans)) {
                                ret = PTR_ERR(trans);
                                goto out;
                        }
                        btrfs_debug(fs_info, "auto deleting %Lu",
                                    found_key.objectid);
                        ret = btrfs_del_orphan_item(trans, root,
                                                    found_key.objectid);
                        /* End the transaction whether or not the delete worked. */
                        btrfs_end_transaction(trans);
                        if (ret)
                                goto out;
                        continue;
                }

                nr_unlink++;

                /* this will do delete_inode and everything for us */
                iput(inode);
        }
        /* release the path since we're done with it */
        btrfs_release_path(path);

        /*
         * If an orphan item was inserted for this root, join and immediately
         * end a transaction; a failure to join is deliberately ignored.
         */
        if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
                trans = btrfs_join_transaction(root);
                if (!IS_ERR(trans))
                        btrfs_end_transaction(trans);
        }

        if (nr_unlink)
                btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);

out:
        if (ret)
                btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
        /* path may be NULL here if the allocation above failed. */
        btrfs_free_path(path);
        return ret;
}

/*
 * Cheap peek at the items following an inode item in the same leaf, looking
 * for xattrs.  If the inode has no xattrs it cannot have ACLs either.
 *
 * @leaf:             leaf holding the inode item
 * @slot:             slot of the inode item
 * @objectid:         objectid of the inode
 * @first_xattr_slot: set to the slot of the first xattr item seen; when none
 *                    was seen it is -1 if 0 is returned, or the slot where
 *                    scanning stopped if 1 is returned
 *
 * Return 0 when the inode certainly has no ACLs, 1 when it has or may have.
 */
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
                                          int slot, u64 objectid,
                                          int *first_xattr_slot)
{
        static u64 xattr_access = 0;
        static u64 xattr_default = 0;
        const u32 nritems = btrfs_header_nritems(leaf);
        struct btrfs_key key;
        int checked;

        /* Compute the name hashes of the two POSIX ACL xattrs only once. */
        if (!xattr_access) {
                xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
                                        strlen(XATTR_NAME_POSIX_ACL_ACCESS));
                xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
                                        strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
        }

        *first_xattr_slot = -1;

        /*
         * Items are ordered inode, inode backrefs, xattrs, extents.  An inode
         * with a ton of hard links can have a lot of backrefs, so look at no
         * more than 8 items; this is only an optimization.
         */
        for (slot++, checked = 0; slot < nritems && checked < 8;
             slot++, checked++) {
                btrfs_item_key_to_cpu(leaf, &key, slot);

                /* Items of another object started: there are no ACLs. */
                if (key.objectid != objectid)
                        return 0;

                if (key.type == BTRFS_XATTR_ITEM_KEY) {
                        /* Remember where the xattrs of this inode begin. */
                        if (*first_xattr_slot == -1)
                                *first_xattr_slot = slot;
                        if (key.offset == xattr_access ||
                            key.offset == xattr_default)
                                return 1;
                } else if (key.type > BTRFS_XATTR_ITEM_KEY) {
                        /* Already past the xattr keys, no ACL can follow. */
                        return 0;
                }
        }

        /*
         * Either the leaf ended or the scan budget ran out before we saw an
         * ACL xattr or a key larger than an xattr key, so we have to assume
         * the inode has ACLs.
         */
        if (*first_xattr_slot == -1)
                *first_xattr_slot = slot;
        return 1;
}

/*
 * Read an inode from the btree into the in-memory inode.
 *
 * @inode:   in-memory inode to populate; BTRFS_I(inode)->root and ->location
 *           must already be set, as both are read here
 * @in_path: optional path to use for the lookup; when NULL a path is
 *           allocated here and freed before returning
 *
 * Besides the inode item fields this also caches the dir index (for inodes
 * with a single link), loads inode properties from xattrs, pre-caches "no
 * ACL" when possible and installs the inode/file/address-space operations
 * that match the file type.
 *
 * Return 0 on success, -ENOMEM if a path could not be allocated, or the
 * non-zero result of btrfs_lookup_inode().  A failure to load the inode
 * properties is only logged, not returned.
 */
static int btrfs_read_locked_inode(struct inode *inode,
                                   struct btrfs_path *in_path)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_path *path = in_path;
        struct extent_buffer *leaf;
        struct btrfs_inode_item *inode_item;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key location;
        unsigned long ptr;
        int maybe_acls;
        u32 rdev;
        int ret;
        bool filled = false;
        int first_xattr_slot;

        /*
         * When btrfs_fill_inode() succeeds the inode fields (and rdev) are
         * taken from there and the copy from the on-disk item below is
         * skipped.  NOTE(review): presumably this fills from a pending
         * delayed inode - confirm against btrfs_fill_inode().
         */
        ret = btrfs_fill_inode(inode, &rdev);
        if (!ret)
                filled = true;

        if (!path) {
                path = btrfs_alloc_path();
                if (!path)
                        return -ENOMEM;
        }

        /* Work on a copy; location is reused as a scratch key further down. */
        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));

        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
        if (ret) {
                /* Only free the path if we allocated it ourselves. */
                if (path != in_path)
                        btrfs_free_path(path);
                return ret;
        }

        leaf = path->nodes[0];

        if (filled)
                goto cache_index;

        /* Copy every field of the on-disk inode item into the VFS inode. */
        inode_item = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_inode_item);
        inode->i_mode = btrfs_inode_mode(leaf, inode_item);
        set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
        i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
        i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
        btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
        btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
                        round_up(i_size_read(inode), fs_info->sectorsize));

        inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
                        btrfs_timespec_nsec(leaf, &inode_item->atime));

        inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
                        btrfs_timespec_nsec(leaf, &inode_item->mtime));

        inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
                        btrfs_timespec_nsec(leaf, &inode_item->ctime));

        BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
        BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);

        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
        BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);

        inode_set_iversion_queried(inode,
                                   btrfs_inode_sequence(leaf, inode_item));
        inode->i_generation = BTRFS_I(inode)->generation;
        /*
         * i_rdev is cleared here; the on-disk value is kept in the local rdev
         * and only handed to init_special_inode() for special files below.
         */
        inode->i_rdev = 0;
        rdev = btrfs_inode_rdev(leaf, inode_item);

        BTRFS_I(inode)->index_cnt = (u64)-1;
        btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
                                &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);

cache_index:
        /*
         * If we were modified in the current generation and evicted from memory
         * and then re-read we need to do a full sync since we don't have any
         * idea about which extents were modified before we were evicted from
         * cache.
         *
         * This is required for both inode re-read from disk and delayed inode
         * in the delayed_nodes xarray.
         */
        if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                        &BTRFS_I(inode)->runtime_flags);

        /*
         * We don't persist the id of the transaction where an unlink operation
         * against the inode was last made. So here we assume the inode might
         * have been evicted, and therefore the exact value of last_unlink_trans
         * lost, and set it to last_trans to avoid metadata inconsistencies
         * between the inode and its parent if the inode is fsync'ed and the log
         * replayed. For example, in the scenario:
         *
         * touch mydir/foo
         * ln mydir/foo mydir/bar
         * sync
         * unlink mydir/bar
         * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
         * xfs_io -c fsync mydir/foo
         * <power failure>
         * mount fs, triggers fsync log replay
         *
         * We must make sure that when we fsync our inode foo we also log its
         * parent inode, otherwise after log replay the parent still has the
         * dentry with the "bar" name but our inode foo has a link count of 1
         * and doesn't have an inode ref with the name "bar" anymore.
         *
         * Setting last_unlink_trans to last_trans is a pessimistic approach,
         * but it guarantees correctness at the expense of occasional full
         * transaction commits on fsync if our inode is a directory, or if our
         * inode is not a directory, logging its parent unnecessarily.
         */
        BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;

        /*
         * Same logic as for last_unlink_trans. We don't persist the generation
         * of the last transaction where this inode was used for a reflink
         * operation, so after eviction and reloading the inode we must be
         * pessimistic and assume the last transaction that modified the inode.
         */
        BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;

        /*
         * Look at the item right after the inode item.  The dir index is only
         * cached for inodes with exactly one link, and only if that next item
         * is in this leaf and belongs to this inode.
         */
        path->slots[0]++;
        if (inode->i_nlink != 1 ||
            path->slots[0] >= btrfs_header_nritems(leaf))
                goto cache_acl;

        btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
        if (location.objectid != btrfs_ino(BTRFS_I(inode)))
                goto cache_acl;

        ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
        if (location.type == BTRFS_INODE_REF_KEY) {
                struct btrfs_inode_ref *ref;

                ref = (struct btrfs_inode_ref *)ptr;
                BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
        } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
                struct btrfs_inode_extref *extref;

                extref = (struct btrfs_inode_extref *)ptr;
                BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
                                                                     extref);
        }
cache_acl:
        /*
         * try to precache a NULL acl entry for files that don't have
         * any xattrs or acls
         */
        maybe_acls = acls_after_inode_item(leaf, path->slots[0],
                        btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
        if (first_xattr_slot != -1) {
                /* Point the path at the slot reported by the scan above. */
                path->slots[0] = first_xattr_slot;
                ret = btrfs_load_inode_props(inode, path);
                /* Failure here is logged only; we still return 0 below. */
                if (ret)
                        btrfs_err(fs_info,
                                  "error loading props for ino %llu (root %llu): %d",
                                  btrfs_ino(BTRFS_I(inode)),
                                  btrfs_root_id(root), ret);
        }
        if (path != in_path)
                btrfs_free_path(path);

        if (!maybe_acls)
                cache_no_acl(inode);

        /* Install the operations that match the file type. */
        switch (inode->i_mode & S_IFMT) {
        case S_IFREG:
                inode->i_mapping->a_ops = &btrfs_aops;
                inode->i_fop = &btrfs_file_operations;
                inode->i_op = &btrfs_file_inode_operations;
                break;
        case S_IFDIR:
                inode->i_fop = &btrfs_dir_file_operations;
                inode->i_op = &btrfs_dir_inode_operations;
                break;
        case S_IFLNK:
                inode->i_op = &btrfs_symlink_inode_operations;
                inode_nohighmem(inode);
                inode->i_mapping->a_ops = &btrfs_aops;
                break;
        default:
                /* Everything else is set up as a special inode using rdev. */
                inode->i_op = &btrfs_special_inode_operations;
                init_special_inode(inode, inode->i_mode, rdev);
                break;
        }

        btrfs_sync_inode_flags_to_i_flags(inode);
        return 0;
}

/*
 * given a leaf and an inode, copy the inode fields into the leaf
 */
static void fill_inode_item(struct btrfs_trans_handle *trans,
                            struct extent_buffer *leaf,
                            struct btrfs_inode_item *item,
                            struct inode *inode)
{
        struct btrfs_map_token token;
        u64 flags;

        btrfs_init_map_token(&token, leaf);

        btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
        btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
        btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
        btrfs_set_token_inode_mode(&token, item, inode->i_mode);
        btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);

        btrfs_set_token_timespec_sec(&token, &item->atime,
                                     inode_get_atime_sec(inode));
        btrfs_set_token_timespec_nsec(&token, &item->atime,
                                      inode_get_atime_nsec(inode));

        btrfs_set_token_timespec_sec(&token, &item->mtime,
                                     inode_get_mtime_sec(inode));
        btrfs_set_token_timespec_nsec(&token, &item->mtime,
                                      inode_get_mtime_nsec(inode));

        btrfs_set_token_timespec_sec(&token, &item->ctime,
                                     inode_get_ctime_sec(inode));
        btrfs_set_token_timespec_nsec(&token, &item->ctime,
                                      inode_get_ctime_nsec(inode));

        btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
        btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);

        btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
        btrfs_set_token_inode_generation(&token, item,
                                         BTRFS_I(inode)->generation);
        btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
        btrfs_set_token_inode_transid(&token, item, trans->transid);
        btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
        flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
                                          BTRFS_I(inode)->ro_flags);
        btrfs_set_token_inode_flags(&token, item, flags);
        btrfs_set_token_inode_block_group(&token, item, 0);
}

/*
 * Write the in-memory inode straight into its inode item in the btree.
 *
 * Return 0 on success, -ENOMEM if no path could be allocated, -ENOENT if the
 * inode item does not exist, or the negative error from the lookup.
 */
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
                                            struct btrfs_inode *inode)
{
        struct btrfs_path *path = btrfs_alloc_path();
        int rc;

        if (!path)
                return -ENOMEM;

        rc = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1);
        if (rc == 0) {
                struct extent_buffer *eb = path->nodes[0];
                struct btrfs_inode_item *item;

                item = btrfs_item_ptr(eb, path->slots[0],
                                      struct btrfs_inode_item);
                fill_inode_item(trans, eb, item, &inode->vfs_inode);
                btrfs_mark_buffer_dirty(trans, eb);
                btrfs_set_inode_last_trans(trans, inode);
        } else if (rc > 0) {
                /* The lookup found no inode item for this location. */
                rc = -ENOENT;
        }

        btrfs_free_path(path);
        return rc;
}

/*
 * Persist the in-memory inode, normally through the delayed inode code and
 * otherwise by updating the inode item directly.
 */
int btrfs_update_inode(struct btrfs_trans_handle *trans,
                       struct btrfs_inode *inode)
{
        struct btrfs_root *root = inode->root;
        int err;

        /*
         * Free space inodes can deadlock during commit if they go through the
         * delayed code, and the data relocation inode should also be updated
         * without delay.  The direct path is also taken while the
         * BTRFS_FS_LOG_RECOVERING flag is set.
         */
        if (btrfs_is_free_space_inode(inode) ||
            btrfs_is_data_reloc_root(root) ||
            test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
                return btrfs_update_inode_item(trans, inode);

        btrfs_update_root_times(trans, root);

        err = btrfs_delayed_update_inode(trans, inode);
        if (err)
                return err;

        btrfs_set_inode_last_trans(trans, inode);
        return 0;
}

/*
 * Like btrfs_update_inode(), but if that fails with -ENOSPC retry by updating
 * the inode item directly.
 */
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
                                struct btrfs_inode *inode)
{
        int err = btrfs_update_inode(trans, inode);

        if (err != -ENOSPC)
                return err;

        return btrfs_update_inode_item(trans, inode);
}

/*
 * unlink helper that gets used here in inode.c and in the tree logging
 * recovery code.  It remove a link in a directory with a given name, and
 * also drops the back refs in the inode to the directory
 */
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
                                struct btrfs_inode *dir,
                                struct btrfs_inode *inode,
                                const struct fscrypt_str *name,
                                struct btrfs_rename_ctx *rename_ctx)
{
        struct btrfs_root *root = dir->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        int ret = 0;
        struct btrfs_dir_item *di;
        u64 index;
        u64 ino = btrfs_ino(inode);
        u64 dir_ino = btrfs_ino(dir);

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
        if (IS_ERR_OR_NULL(di)) {
                ret = di ? PTR_ERR(di) : -ENOENT;
                goto err;
        }
        ret = btrfs_delete_one_dir_name(trans, root, path, di);
        if (ret)
                goto err;
        btrfs_release_path(path);

        /*
         * If we don't have dir index, we have to get it by looking up
         * the inode ref, since we get the inode ref, remove it directly,
         * it is unnecessary to do delayed deletion.
         *
         * But if we have dir index, needn't search inode ref to get it.
         * Since the inode ref is close to the inode item, it is better
         * that we delay to delete it, and just do this deletion when
         * we update the inode item.
         */
        if (inode->dir_index) {
                ret = btrfs_delayed_delete_inode_ref(inode);
                if (!ret) {
                        index = inode->dir_index;
                        goto skip_backref;
                }
        }

        ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
        if (ret) {
                btrfs_info(fs_info,
                        "failed to delete reference to %.*s, inode %llu parent %llu",
                        name->len, name->name, ino, dir_ino);
                btrfs_abort_transaction(trans, ret);
                goto err;
        }
skip_backref:
        if (rename_ctx)
                rename_ctx->index = index;

        ret = btrfs_delete_delayed_dir_index(trans, dir, index);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto err;
        }

        /*
         * If we are in a rename context, we don't need to update anything in the
         * log. That will be done later during the rename by btrfs_log_new_name().
         * Besides that, doing it here would only cause extra unnecessary btree
         * operations on the log tree, increasing latency for applications.
         */
        if (!rename_ctx) {
                btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
                btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
        }

        /*
         * If we have a pending delayed iput we could end up with the final iput
         * being run in btrfs-cleaner context.  If we have enough of these built
         * up we can end up burning a lot of time in btrfs-cleaner without any
         * way to throttle the unlinks.  Since we're currently holding a ref on
         * the inode we can run the delayed iput here without any issues as the
         * final iput won't be done until after we drop the ref we're currently
         * holding.
         */
        btrfs_run_delayed_iput(fs_info, inode);
err:
        btrfs_free_path(path);
        if (ret)
                goto out;

        btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
        inode_inc_iversion(&inode->vfs_inode);
        inode_inc_iversion(&dir->vfs_inode);
         inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
        ret = btrfs_update_inode(trans, dir);
out:
        return ret;
}

/*
 * Remove the link @name to @inode from @dir, then drop the link count of
 * @inode and write the inode back.
 *
 * Return 0 on success or the error from the unlink helper / inode update.
 */
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
                       struct btrfs_inode *dir, struct btrfs_inode *inode,
                       const struct fscrypt_str *name)
{
        int err = __btrfs_unlink_inode(trans, dir, inode, name, NULL);

        if (err)
                return err;

        /* Only touch the link count once the directory entry is gone. */
        drop_nlink(&inode->vfs_inode);
        return btrfs_update_inode(trans, inode);
}

/*
 * Start the transaction used by unlink and rmdir.
 *
 * unlink and rmdir are special in btrfs because they do not always free
 * space.  If the reservation cannot be made the normal way, try and see if
 * there is plenty of slack room in the global reserve to migrate; otherwise
 * the unlink cannot be allowed to occur.
 */
static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
{
        return btrfs_start_transaction_fallback_global_rsv(dir->root,
                                                   BTRFS_UNLINK_METADATA_UNITS);
}

static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
        struct btrfs_trans_handle *trans;
        struct inode *inode = d_inode(dentry);
        int ret;
        struct fscrypt_name fname;

        ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
        if (ret)
                return ret;

        /* This needs to handle no-key deletions later on */

        trans = __unlink_start_trans(BTRFS_I(dir));
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto fscrypt_free;
        }

        btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
                                false);

        ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
                                 &fname.disk_name);
        if (ret)
                goto end_trans;

        if (inode->i_nlink == 0) {
                ret = btrfs_orphan_add(trans, BTRFS_I(inode));
                if (ret)
                        goto end_trans;
        }

end_trans:
        btrfs_end_transaction(trans);
        btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
fscrypt_free:
        fscrypt_free_filename(&fname);
        return ret;
}

/*
 * Remove the directory entry that links a subvolume (or a placeholder for a
 * subvolume) into @dir.
 *
 * @trans:  running transaction; it is aborted on most failures that happen
 *          once the dir item has been looked up (see the individual call
 *          sites below)
 * @dir:    directory that contains the entry
 * @dentry: dentry of the entry; its inode must have inode number
 *          BTRFS_FIRST_FREE_OBJECTID or BTRFS_EMPTY_SUBVOL_DIR_OBJECTID,
 *          anything else triggers a WARN and -EINVAL
 *
 * On success the dir item and the delayed dir index are deleted and @dir's
 * size, iversion, mtime and ctime are updated.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
                               struct btrfs_inode *dir, struct dentry *dentry)
{
        struct btrfs_root *root = dir->root;
        struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_dir_item *di;
        struct btrfs_key key;
        u64 index;
        int ret;
        u64 objectid;
        u64 dir_ino = btrfs_ino(dir);
        struct fscrypt_name fname;

        ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
        if (ret)
                return ret;

        /* This needs to handle no-key deletions later on */

        /*
         * Work out which root id the dir item is expected to point at: the
         * id of the inode's own root, or the one stored in the inode's
         * location key for the placeholder directory.
         */
        if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
                objectid = btrfs_root_id(inode->root);
        } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
                objectid = inode->location.objectid;
        } else {
                WARN_ON(1);
                /* No path allocated yet, so only the filename needs freeing. */
                fscrypt_free_filename(&fname);
                return -EINVAL;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        /* A missing or failed lookup is returned without aborting @trans. */
        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
                                   &fname.disk_name, -1);
        if (IS_ERR_OR_NULL(di)) {
                ret = di ? PTR_ERR(di) : -ENOENT;
                goto out;
        }

        leaf = path->nodes[0];
        btrfs_dir_item_key_to_cpu(leaf, di, &key);
        /* The entry should be a root item key for the id computed above. */
        WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
        ret = btrfs_delete_one_dir_name(trans, root, path, di);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }
        btrfs_release_path(path);

        /*
         * This is a placeholder inode for a subvolume we didn't have a
         * reference to at the time of the snapshot creation.  In the meantime
         * we could have renamed the real subvol link into our snapshot, so
         * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
         * Instead simply lookup the dir_index_item for this entry so we can
         * remove it.  Otherwise we know we have a ref to the root and we can
         * call btrfs_del_root_ref, and it _shouldn't_ fail.
         */
        if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
                di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
                if (IS_ERR_OR_NULL(di)) {
                        if (!di)
                                ret = -ENOENT;
                        else
                                ret = PTR_ERR(di);
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }

                /* The dir index number is the offset of the item's key. */
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                index = key.offset;
                btrfs_release_path(path);
        } else {
                /* btrfs_del_root_ref() hands the dir index back via &index. */
                ret = btrfs_del_root_ref(trans, objectid,
                                         btrfs_root_id(root), dir_ino,
                                         &index, &fname.disk_name);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
        }

        ret = btrfs_delete_delayed_dir_index(trans, dir, index);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        /* The directory size shrinks by twice the on-disk name length. */
        btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
        inode_inc_iversion(&dir->vfs_inode);
        inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
        ret = btrfs_update_inode_fallback(trans, dir);
        if (ret)
                btrfs_abort_transaction(trans, ret);
out:
        /* path is NULL here when the allocation above failed. */
        btrfs_free_path(path);
        fscrypt_free_filename(&fname);
        return ret;
}

/*
 * Check whether the subvolume @root may be destroyed.
 *
 * Returns 0 when deletion is allowed, -EPERM when @root is the default
 * subvolume, -ENOTEMPTY when a BTRFS_ROOT_REF_KEY item with @root's id exists
 * in the tree root (i.e. it references other subvolumes), -EUCLEAN when the
 * search hits an impossible key, -ENOMEM when no path could be allocated, or
 * another negative errno from the tree search.
 */
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct fscrypt_str name = FSTR_INIT("default", 7);
        struct btrfs_path *path;
        struct btrfs_dir_item *di;
        struct btrfs_key key;
        u64 dir_id;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Make sure this root isn't set as the default subvol */
        dir_id = btrfs_super_root_dir(fs_info->super_copy);
        di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
                                   dir_id, &name, 0);
        if (!IS_ERR_OR_NULL(di)) {
                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
                if (key.objectid == btrfs_root_id(root)) {
                        ret = -EPERM;
                        btrfs_err(fs_info,
                                  "deleting default subvolume %llu is not allowed",
                                  key.objectid);
                        goto out;
                }
                btrfs_release_path(path);
        }

        /* Look for the last possible root ref item owned by this root. */
        key.objectid = btrfs_root_id(root);
        key.type = BTRFS_ROOT_REF_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (!ret) {
                /*
                 * Key with offset -1 found, there would have to exist a root
                 * with such id, but this is out of valid range.
                 */
                ret = -EUCLEAN;
                goto out;
        }

        /* No exact match: inspect the item just before the insertion slot. */
        ret = 0;
        if (path->slots[0] == 0)
                goto out;

        path->slots[0]--;
        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
        if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
                ret = -ENOTEMPTY;
out:
        btrfs_free_path(path);
        return ret;
}

/* Delete all dentries for inodes belonging to the root */
static void btrfs_prune_dentries(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_inode *inode;
        u64 min_ino = 0;

        if (!BTRFS_FS_ERROR(fs_info))
                WARN_ON(btrfs_root_refs(&root->root_item) != 0);

        inode = btrfs_find_first_inode(root, min_ino);
        while (inode) {
                if (atomic_read(&inode->vfs_inode.i_count) > 1)
                        d_prune_aliases(&inode->vfs_inode);

                min_ino = btrfs_ino(inode) + 1;
                /*
                 * btrfs_drop_inode() will have it removed from the inode
                 * cache when its usage count hits zero.
                 */
                iput(&inode->vfs_inode);
                cond_resched();
                inode = btrfs_find_first_inode(root, min_ino);
        }
}

/*
 * Delete the subvolume that @dentry refers to from directory @dir.
 *
 * @dir:    directory containing the subvolume entry
 * @dentry: dentry of the subvolume; the root of its inode is the subvolume
 *          being deleted ("dest" below), while "root" is the root @dir
 *          belongs to
 *
 * The subvolume root is flagged BTRFS_ROOT_SUBVOL_DEAD before any work is
 * done and the flag is cleared again if the function ends with an error.
 * Deletion is refused with -EPERM while a send is in progress on the
 * subvolume or while it has active swapfiles, and with whatever
 * may_destroy_subvol() reports.
 *
 * On success the dentry is invalidated and the dentries of the deleted
 * root are pruned.
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
{
        struct btrfs_root *root = dir->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct inode *inode = d_inode(dentry);
        struct btrfs_root *dest = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        struct btrfs_block_rsv block_rsv;
        u64 root_flags;
        u64 qgroup_reserved = 0;
        int ret;

        down_write(&fs_info->subvol_sem);

        /*
         * Don't allow to delete a subvolume with send in progress. This is
         * inside the inode lock so the error handling that has to drop the bit
         * again is not run concurrently.
         */
        spin_lock(&dest->root_item_lock);
        if (dest->send_in_progress) {
                spin_unlock(&dest->root_item_lock);
                btrfs_warn(fs_info,
                           "attempt to delete subvolume %llu during send",
                           btrfs_root_id(dest));
                ret = -EPERM;
                goto out_up_write;
        }
        if (atomic_read(&dest->nr_swapfiles)) {
                spin_unlock(&dest->root_item_lock);
                /*
                 * Report the subvolume being deleted (dest), which is the one
                 * holding the swapfile, not the root of the parent directory.
                 */
                btrfs_warn(fs_info,
                           "attempt to delete subvolume %llu with active swapfile",
                           btrfs_root_id(dest));
                ret = -EPERM;
                goto out_up_write;
        }
        root_flags = btrfs_root_flags(&dest->root_item);
        btrfs_set_root_flags(&dest->root_item,
                             root_flags | BTRFS_ROOT_SUBVOL_DEAD);
        spin_unlock(&dest->root_item_lock);

        ret = may_destroy_subvol(dest);
        if (ret)
                goto out_undead;

        btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
        /*
         * One for dir inode,
         * two for dir entries,
         * two for root ref/backref.
         */
        ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
        if (ret)
                goto out_undead;
        qgroup_reserved = block_rsv.qgroup_rsv_reserved;

        /* Space comes from block_rsv, so the transaction reserves nothing. */
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out_release;
        }
        ret = btrfs_record_root_in_trans(trans, root);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_end_trans;
        }
        /*
         * The qgroup reservation is converted here; zero the local copy so
         * the out_release path does not free it a second time.
         */
        btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
        qgroup_reserved = 0;
        trans->block_rsv = &block_rsv;
        trans->bytes_reserved = block_rsv.size;

        btrfs_record_snapshot_destroy(trans, dir);

        ret = btrfs_unlink_subvol(trans, dir, dentry);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_end_trans;
        }

        ret = btrfs_record_root_in_trans(trans, dest);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_end_trans;
        }

        /* Reset the drop state and drop the last reference to the root. */
        memset(&dest->root_item.drop_progress, 0,
                sizeof(dest->root_item.drop_progress));
        btrfs_set_root_drop_level(&dest->root_item, 0);
        btrfs_set_root_refs(&dest->root_item, 0);

        /* Insert the orphan item only once per root. */
        if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
                ret = btrfs_insert_orphan_item(trans,
                                        fs_info->tree_root,
                                        btrfs_root_id(dest));
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_end_trans;
                }
        }

        /* A missing uuid tree entry (-ENOENT) is tolerated. */
        ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
                                     BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
        if (ret && ret != -ENOENT) {
                btrfs_abort_transaction(trans, ret);
                goto out_end_trans;
        }
        if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
                ret = btrfs_uuid_tree_remove(trans,
                                          dest->root_item.received_uuid,
                                          BTRFS_UUID_KEY_RECEIVED_SUBVOL,
                                          btrfs_root_id(dest));
                if (ret && ret != -ENOENT) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_end_trans;
                }
        }

        free_anon_bdev(dest->anon_dev);
        dest->anon_dev = 0;
out_end_trans:
        /* Detach the on-stack reservation before the handle goes away. */
        trans->block_rsv = NULL;
        trans->bytes_reserved = 0;
        /*
         * NOTE(review): the result of btrfs_end_transaction() replaces any
         * earlier value of ret, and S_DEAD is set on this path regardless of
         * ret - confirm both are intended.
         */
        ret = btrfs_end_transaction(trans);
        inode->i_flags |= S_DEAD;
out_release:
        btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
        if (qgroup_reserved)
                btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
out_undead:
        /* On failure the subvolume is alive again: clear the DEAD flag. */
        if (ret) {
                spin_lock(&dest->root_item_lock);
                root_flags = btrfs_root_flags(&dest->root_item);
                btrfs_set_root_flags(&dest->root_item,
                                root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
                spin_unlock(&dest->root_item_lock);
        }
out_up_write:
        up_write(&fs_info->subvol_sem);
        if (!ret) {
                d_invalidate(dentry);
                btrfs_prune_dentries(dest);
                ASSERT(dest->send_in_progress == 0);
        }

        return ret;
}

static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        int ret = 0;
        struct btrfs_trans_handle *trans;
        u64 last_unlink_trans;
        struct fscrypt_name fname;

        if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;
        if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
                if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
                        btrfs_err(fs_info,
                        "extent tree v2 doesn't support snapshot deletion yet");
                        return -EOPNOTSUPP;
                }
                return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
        }

        ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
        if (ret)
                return ret;

        /* This needs to handle no-key deletions later on */

        trans = __unlink_start_trans(BTRFS_I(dir));
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out_notrans;
        }

        if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
                ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
                goto out;
        }

        ret = btrfs_orphan_add(trans, BTRFS_I(inode));
        if (ret)
                goto out;

        last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;

        /* now the directory is empty */
        ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
                                 &fname.disk_name);
        if (!ret) {
                btrfs_i_size_write(BTRFS_I(inode), 0);
                /*
                 * Propagate the last_unlink_trans value of the deleted dir to
                 * its parent directory. This is to prevent an unrecoverable
                 * log tree in the case we do something like this:
                 * 1) create dir foo
                 * 2) create snapshot under dir foo
                 * 3) delete the snapshot
                 * 4) rmdir foo
                 * 5) mkdir foo
                 * 6) fsync foo or some file inside foo
                 */
                if (last_unlink_trans >= trans->transid)
                        BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
        }
out:
        btrfs_end_transaction(trans);
out_notrans:
        btrfs_btree_balance_dirty(fs_info);
        fscrypt_free_filename(&fname);

        return ret;
}

/*
 * Read, zero a chunk and write a block.
 *
 * @inode - inode that we're zeroing
 * @from - the offset to start zeroing
 * @len - the length to zero, 0 to zero the entire range respective to the
 *        offset
 * @front - zero up to the offset instead of from the offset on
 *
 * This will find the block for the "from" offset and cow the block and zero the
 * part we want to zero.  This is used with truncate and hole punching.
 *
 * Nothing is done (and 0 is returned) when the in-block offset of @from is
 * block aligned and @len is either 0 or block aligned.
 *
 * Returns 0 on success or a negative errno: -ENOMEM when the folio cannot be
 * obtained, -EIO when the folio is still not uptodate after reading it, or
 * the error of a failed reservation / helper call.
 */
int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
                         int front)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct address_space *mapping = inode->vfs_inode.i_mapping;
        struct extent_io_tree *io_tree = &inode->io_tree;
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        struct extent_changeset *data_reserved = NULL;
        bool only_release_metadata = false;
        u32 blocksize = fs_info->sectorsize;
        pgoff_t index = from >> PAGE_SHIFT;
        /* Offset of @from inside its block. */
        unsigned offset = from & (blocksize - 1);
        struct folio *folio;
        gfp_t mask = btrfs_alloc_write_mask(mapping);
        size_t write_bytes = blocksize;
        int ret = 0;
        u64 block_start;
        u64 block_end;

        if (IS_ALIGNED(offset, blocksize) &&
            (!len || IS_ALIGNED(len, blocksize)))
                goto out;

        /* Inclusive byte range of the single block being modified. */
        block_start = round_down(from, blocksize);
        block_end = block_start + blocksize - 1;

        /*
         * Reserve data space for the block.  If that fails, fall back to the
         * nocow path when btrfs_check_nocow_lock() allows it; in that case
         * only metadata is reserved and released below.
         */
        ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
                                          blocksize, false);
        if (ret < 0) {
                if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
                        /* For nocow case, no need to reserve data space */
                        only_release_metadata = true;
                } else {
                        goto out;
                }
        }
        ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
        if (ret < 0) {
                if (!only_release_metadata)
                        btrfs_free_reserved_data_space(inode, data_reserved,
                                                       block_start, blocksize);
                goto out;
        }
again:
        /* Get (creating if needed) and lock the folio at the page index. */
        folio = __filemap_get_folio(mapping, index,
                                    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
        if (IS_ERR(folio)) {
                btrfs_delalloc_release_space(inode, data_reserved, block_start,
                                             blocksize, true);
                btrfs_delalloc_release_extents(inode, blocksize);
                ret = -ENOMEM;
                goto out;
        }

        if (!folio_test_uptodate(folio)) {
                ret = btrfs_read_folio(NULL, folio);
                folio_lock(folio);
                /* The folio left this mapping while unlocked: start over. */
                if (folio->mapping != mapping) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto again;
                }
                if (!folio_test_uptodate(folio)) {
                        ret = -EIO;
                        goto out_unlock;
                }
        }

        /*
         * We unlock the page after the io is completed and then re-lock it
         * above.  release_folio() could have come in between that and cleared
         * folio private, but left the page in the mapping.  Set the page mapped
         * here to make sure it's properly set for the subpage stuff.
         */
        ret = set_folio_extent_mapped(folio);
        if (ret < 0)
                goto out_unlock;

        folio_wait_writeback(folio);

        lock_extent(io_tree, block_start, block_end, &cached_state);

        /*
         * An ordered extent covers the block start: drop the extent lock and
         * the folio, start the ordered extent and retry from the folio lookup.
         */
        ordered = btrfs_lookup_ordered_extent(inode, block_start);
        if (ordered) {
                unlock_extent(io_tree, block_start, block_end, &cached_state);
                folio_unlock(folio);
                folio_put(folio);
                btrfs_start_ordered_extent(ordered);
                btrfs_put_ordered_extent(ordered);
                goto again;
        }

        /* Clear stale delalloc state before marking the block delalloc again. */
        clear_extent_bit(&inode->io_tree, block_start, block_end,
                         EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
                         &cached_state);

        ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
                                        &cached_state);
        if (ret) {
                unlock_extent(io_tree, block_start, block_end, &cached_state);
                goto out_unlock;
        }

        /*
         * NOTE(review): offset is masked with (blocksize - 1) above, so it is
         * always smaller than blocksize and this condition is always true.
         */
        if (offset != blocksize) {
                if (!len)
                        len = blocksize - offset;
                /*
                 * @front: zero from the start of the block up to the offset,
                 * otherwise zero @len bytes starting at the offset.
                 */
                if (front)
                        folio_zero_range(folio, block_start - folio_pos(folio),
                                         offset);
                else
                        folio_zero_range(folio,
                                         (block_start - folio_pos(folio)) + offset,
                                         len);
        }
        btrfs_folio_clear_checked(fs_info, folio, block_start,
                                  block_end + 1 - block_start);
        btrfs_folio_set_dirty(fs_info, folio, block_start,
                              block_end + 1 - block_start);
        unlock_extent(io_tree, block_start, block_end, &cached_state);

        /* No data space was reserved for this block in the nocow case. */
        if (only_release_metadata)
                set_extent_bit(&inode->io_tree, block_start, block_end,
                               EXTENT_NORESERVE, NULL);

out_unlock:
        /* On error give back whatever was reserved for the block. */
        if (ret) {
                if (only_release_metadata)
                        btrfs_delalloc_release_metadata(inode, blocksize, true);
                else
                        btrfs_delalloc_release_space(inode, data_reserved,
                                        block_start, blocksize, true);
        }
        btrfs_delalloc_release_extents(inode, blocksize);
        folio_unlock(folio);
        folio_put(folio);
out:
        /* Pairs with the successful btrfs_check_nocow_lock() above. */
        if (only_release_metadata)
                btrfs_check_nocow_unlock(inode);
        extent_changeset_free(data_reserved);
        return ret;
}

/*
 * Insert an explicit hole extent for [@offset, @offset + @len) of @inode
 * unless the filesystem has the NO_HOLES feature.
 *
 * The existing extents in the range are dropped first, then the hole extent
 * is inserted and the inode's byte accounting and item are updated, all in
 * one transaction.  A failure of the drop or of the insertion aborts the
 * transaction.
 *
 * Returns 0 on success (or when nothing had to be done) or a negative errno.
 */
static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_drop_extents_args drop_args = {
                .start = offset,
                .end = offset + len,
                .drop_cache = true,
        };
        struct btrfs_trans_handle *trans;
        int ret;

        /*
         * If NO_HOLES is enabled, we don't need to do anything.
         * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
         * or btrfs_update_inode() will be called, which guarantee that the next
         * fsync will know this inode was changed and needs to be logged.
         */
        if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
                return 0;

        /*
         * 1 - for the one we're dropping
         * 1 - for the one we're adding
         * 1 - for updating the inode.
         */
        trans = btrfs_start_transaction(root, 3);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        ret = btrfs_drop_extents(trans, root, inode, &drop_args);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto end_trans;
        }

        ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto end_trans;
        }

        /* The result of the inode update is not propagated to the caller. */
        btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
        btrfs_update_inode(trans, inode);
end_trans:
        btrfs_end_transaction(trans);
        return ret;
}

/*
 * This function puts in dummy file extents for the area we're creating a hole
 * for.  So if we are truncating this file to a larger size we need to insert
 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
 * the range between oldsize and size.
 *
 * @inode:   inode being expanded
 * @oldsize: current size of the file
 * @size:    new, larger size of the file
 *
 * Returns 0 on success (including when @size does not reach past the
 * sector-aligned end of @oldsize) or a negative errno.
 */
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_io_tree *io_tree = &inode->io_tree;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
        /* Both ends of the range are rounded up to the sector size. */
        u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
        u64 block_end = ALIGN(size, fs_info->sectorsize);
        u64 last_byte;
        u64 cur_offset;
        u64 hole_size;
        int ret = 0;

        /*
         * If our size started in the middle of a block we need to zero out the
         * rest of the block before we expand the i_size, otherwise we could
         * expose stale data.
         */
        ret = btrfs_truncate_block(inode, oldsize, 0, 0);
        if (ret)
                return ret;

        /* The new size still lies within the block holding the old EOF. */
        if (size <= hole_start)
                return 0;

        btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
                                           &cached_state);
        cur_offset = hole_start;
        /* Walk the extent maps covering [hole_start, block_end). */
        while (1) {
                em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        /* Keep the free_extent_map() after the loop safe. */
                        em = NULL;
                        break;
                }
                /* End of this step: end of the extent map, capped at block_end. */
                last_byte = min(extent_map_end(em), block_end);
                last_byte = ALIGN(last_byte, fs_info->sectorsize);
                hole_size = last_byte - cur_offset;

                if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
                        struct extent_map *hole_em;

                        ret = maybe_insert_hole(inode, cur_offset, hole_size);
                        if (ret)
                                break;

                        ret = btrfs_inode_set_file_extent_range(inode,
                                                        cur_offset, hole_size);
                        if (ret)
                                break;

                        /*
                         * Without memory for a hole extent map, drop the
                         * cached maps for the range, flag the inode for a
                         * full sync and move on to the next range.
                         */
                        hole_em = alloc_extent_map();
                        if (!hole_em) {
                                btrfs_drop_extent_map_range(inode, cur_offset,
                                                    cur_offset + hole_size - 1,
                                                    false);
                                btrfs_set_inode_full_sync(inode);
                                goto next;
                        }
                        hole_em->start = cur_offset;
                        hole_em->len = hole_size;
                        hole_em->orig_start = cur_offset;

                        hole_em->block_start = EXTENT_MAP_HOLE;
                        hole_em->block_len = 0;
                        hole_em->orig_block_len = 0;
                        hole_em->ram_bytes = hole_size;
                        hole_em->generation = btrfs_get_fs_generation(fs_info);

                        /*
                         * NOTE(review): this result is not checked here; it is
                         * only returned if this turns out to be the last
                         * iteration, otherwise the next iteration overwrites
                         * ret - confirm this is intended.
                         */
                        ret = btrfs_replace_extent_map_range(inode, hole_em, true);
                        free_extent_map(hole_em);
                } else {
                        /* Prealloc extent: only record the file extent range. */
                        ret = btrfs_inode_set_file_extent_range(inode,
                                                        cur_offset, hole_size);
                        if (ret)
                                break;
                }
next:
                free_extent_map(em);
                em = NULL;
                cur_offset = last_byte;
                if (cur_offset >= block_end)
                        break;
        }
        /* em is non-NULL here only when the loop was left via an error break. */
        free_extent_map(em);
        unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
        return ret;
}

static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        loff_t oldsize = i_size_read(inode);
        loff_t newsize = attr->ia_size;
        int mask = attr->ia_valid;
        int ret;

        /*
         * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
         * special case where we need to update the times despite not having
         * these flags set.  For all other operations the VFS set these flags
         * explicitly if it wants a timestamp update.
         */
        if (newsize != oldsize) {
                inode_inc_iversion(inode);
                if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
                        inode_set_mtime_to_ts(inode,
                                              inode_set_ctime_current(inode));
                }
        }

        if (newsize > oldsize) {
                /*
                 * Don't do an expanding truncate while snapshotting is ongoing.
                 * This is to ensure the snapshot captures a fully consistent
                 * state of this file - if the snapshot captures this expanding
                 * truncation, it must capture all writes that happened before
                 * this truncation.
                 */
                btrfs_drew_write_lock(&root->snapshot_lock);
                ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
                if (ret) {
                        btrfs_drew_write_unlock(&root->snapshot_lock);
                        return ret;
                }

                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans)) {
                        btrfs_drew_write_unlock(&root->snapshot_lock);
                        return PTR_ERR(trans);
                }

                i_size_write(inode, newsize);
                btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
                pagecache_isize_extended(inode, oldsize, newsize);
                ret = btrfs_update_inode(trans, BTRFS_I(inode));
                btrfs_drew_write_unlock(&root->snapshot_lock);
                btrfs_end_transaction(trans);
        } else {
                struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);

                if (btrfs_is_zoned(fs_info)) {
                        ret = btrfs_wait_ordered_range(inode,
                                        ALIGN(newsize, fs_info->sectorsize),
                                        (u64)-1);
                        if (ret)
                                return ret;
                }

                /*
                 * We're truncating a file that used to have good data down to
                 * zero. Make sure any new writes to the file get on disk
                 * on close.
                 */
                if (newsize == 0)
                        set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
                                &BTRFS_I(inode)->runtime_flags);

                truncate_setsize(inode, newsize);

                inode_dio_wait(inode);

                ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
                if (ret && inode->i_nlink) {
                        int err;

                        /*
                         * Truncate failed, so fix up the in-memory size. We
                         * adjusted disk_i_size down as we removed extents, so
                         * wait for disk_i_size to be stable and then update the
                         * in-memory size to match.
                         */
                        err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
                        if (err)
                                return err;
                        i_size_write(inode, BTRFS_I(inode)->disk_i_size);
                }
        }

        return ret;
}

/*
 * Apply the attribute changes in @attr to the inode of @dentry.
 *
 * Fails with -EROFS on a read-only root.  Size changes on regular files go
 * through btrfs_setsize() first; the remaining attributes are then copied
 * into the inode, which is dirtied, and the ACLs are adjusted when the mode
 * changed.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                         struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        int ret;

        if (btrfs_root_readonly(BTRFS_I(inode)->root))
                return -EROFS;

        ret = setattr_prepare(idmap, dentry, attr);
        if (ret)
                return ret;

        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
                ret = btrfs_setsize(inode, attr);
                if (ret)
                        return ret;
        }

        /* Nothing to copy: every step so far succeeded. */
        if (!attr->ia_valid)
                return 0;

        setattr_copy(idmap, inode, attr);
        inode_inc_iversion(inode);
        ret = btrfs_dirty_inode(BTRFS_I(inode));
        if (ret)
                return ret;

        if (attr->ia_valid & ATTR_MODE)
                ret = posix_acl_chmod(idmap, dentry, inode->i_mode);

        return ret;
}

/*
 * While truncating the inode pages during eviction, the VFS calls
 * btrfs_invalidate_folio() against each folio of the inode. This is slow
 * because those calls result in a huge number of calls to lock_extent() and
 * clear_extent_bit(), which keep merging and splitting extent_state
 * structures over and over, wasting lots of time.
 *
 * Therefore, if the inode is being evicted, let btrfs_invalidate_folio()
 * skip all those expensive operations on a per folio basis and do only
 * the ordered io finishing, while we release the extent_map and
 * extent_state structures here, without the excessive merging and splitting.
 *
 * The inode must already be marked I_FREEING (asserted below).
 */
static void evict_inode_truncate_pages(struct inode *inode)
{
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct rb_node *node;

        ASSERT(inode->i_state & I_FREEING);
        truncate_inode_pages_final(&inode->i_data);

        /* Drop every extent map of the inode, over the whole file range. */
        btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);

        /*
         * Keep looping until we have no more ranges in the io tree.
         * We can have ongoing bios started by readahead that have
         * their endio callback (extent_io.c:end_bio_extent_readpage)
         * still in progress (they unlocked the pages in the bio but have not
         * yet unlocked the ranges in the io tree). This means some ranges can
         * still be locked when eviction starts, because before submitting
         * those bios, which are executed by a separate task (work queue
         * kthread), inode references (inode->i_count) were not taken
         * (which would be dropped in the end io callback of each bio).
         * Therefore here we effectively end up waiting for those bios and
         * anyone else holding locked ranges without having bumped the inode's
         * reference count - if we don't do it, when they access the inode's
         * io_tree to unlock a range it may be too late, leading to a
         * use-after-free issue.
         */
        spin_lock(&io_tree->lock);
        while (!RB_EMPTY_ROOT(&io_tree->state)) {
                struct extent_state *state;
                struct extent_state *cached_state = NULL;
                u64 start;
                u64 end;
                unsigned state_flags;

                /*
                 * Copy the range and flags of the first state while the tree
                 * lock is held; the state itself is not touched again once
                 * the lock is dropped.
                 */
                node = rb_first(&io_tree->state);
                state = rb_entry(node, struct extent_state, rb_node);
                start = state->start;
                end = state->end;
                state_flags = state->state;
                spin_unlock(&io_tree->lock);

                lock_extent(io_tree, start, end, &cached_state);

                /*
                 * If it still has the DELALLOC flag, the extent didn't reach
                 * disk, and its reserved space won't be freed by delayed_ref.
                 * So we need to free its reserved space here.
                 * (Refer to comment in btrfs_invalidate_folio, case 2)
                 *
                 * Note, end is the bytenr of the last byte, so we need + 1
                 * here.
                 */
                if (state_flags & EXTENT_DELALLOC)
                        btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
                                               end - start + 1, NULL);

                clear_extent_bit(io_tree, start, end,
                                 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
                                 &cached_state);

                /* Retake the tree lock for the next emptiness check. */
                cond_resched();
                spin_lock(&io_tree->lock);
        }
        spin_unlock(&io_tree->lock);
}

/*
 * Top up @rsv for one round of eviction work and join the running
 * transaction of @root.
 *
 * Returns the transaction handle, ERR_PTR(-ENOSPC) when the reservation could
 * not be refilled at all, or the error pointer returned by
 * btrfs_join_transaction().
 */
static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
                                                        struct btrfs_block_rsv *rsv)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);

        /*
         * Thanks to delayed iputs, eviction should be running somewhere safe.
         * The regular flushing code runs delayed iputs though, so FLUSH_ALL
         * would deadlock here and must not be used.
         *
         * For the same reason btrfs_start_transaction(root, 0) is off limits,
         * which is why the delayed refs space is reserved by hand: truncating
         * generates a lot of delayed refs activity.
         *
         * BTRFS_RESERVE_FLUSH_EVICT steals from the global_rsv when it can.
         * If the full reservation fails, retry without the delayed refs part
         * so that at least some forward progress is possible.
         */
        if (btrfs_block_rsv_refill(fs_info, rsv,
                                   rsv->size + delayed_refs_extra,
                                   BTRFS_RESERVE_FLUSH_EVICT)) {
                if (btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
                                           BTRFS_RESERVE_FLUSH_EVICT)) {
                        btrfs_warn(fs_info,
                                   "could not allocate space for delete; will truncate on mount");
                        return ERR_PTR(-ENOSPC);
                }
                /* Only the base amount was reserved, nothing to migrate. */
                delayed_refs_extra = 0;
        }

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans) || !delayed_refs_extra)
                return trans;

        /* Hand the delayed refs share over to the transaction's reservation. */
        trans->block_rsv = &fs_info->trans_block_rsv;
        trans->bytes_reserved = delayed_refs_extra;
        btrfs_block_rsv_migrate(rsv, trans->block_rsv, delayed_refs_extra, true);

        return trans;
}

/*
 * Evict @inode from memory and, when it has no links left, delete its items
 * from the tree.
 *
 * The in-memory pages, extent maps and extent states are always released.
 * The on-disk deletion is skipped when the inode still has links, is a bad
 * inode, or log recovery is in progress.  Failures during the deletion are
 * not reported to the caller: the function always ends by clearing the inode.
 */
void btrfs_evict_inode(struct inode *inode)
{
        struct btrfs_fs_info *fs_info;
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_block_rsv *rsv = NULL;
        int ret;

        trace_btrfs_inode_evict(inode);

        /* No root attached: only the generic cleanup is done. */
        if (!root) {
                fsverity_cleanup_inode(inode);
                clear_inode(inode);
                return;
        }

        fs_info = inode_to_fs_info(inode);
        evict_inode_truncate_pages(inode);

        /*
         * The inode still has links and either lives in a root that is still
         * referenced (and is not the root tree) or is a free space inode:
         * there is nothing to delete on disk.
         */
        if (inode->i_nlink &&
            ((btrfs_root_refs(&root->root_item) != 0 &&
              btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
             btrfs_is_free_space_inode(BTRFS_I(inode))))
                goto out;

        if (is_bad_inode(inode))
                goto out;

        if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
                goto out;

        /*
         * Any linked inode that reaches this point was not caught by the
         * check above, so its root must be unreferenced or be the root tree.
         */
        if (inode->i_nlink > 0) {
                BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
                       btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
                goto out;
        }

        /*
         * This makes sure the inode item in tree is uptodate and the space for
         * the inode update is released.
         */
        ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
        if (ret)
                goto out;

        /*
         * This drops any pending insert or delete operations we have for this
         * inode.  We could have a delayed dir index deletion queued up, but
         * we're removing the inode completely so that'll be taken care of in
         * the truncate.
         */
        btrfs_kill_delayed_inode_items(BTRFS_I(inode));

        /* Temporary reservation, sized for one item, used by the loop below. */
        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
        if (!rsv)
                goto out;
        rsv->size = btrfs_calc_metadata_size(fs_info, 1);
        rsv->failfast = true;

        btrfs_i_size_write(BTRFS_I(inode), 0);

        /*
         * Truncate all items of the inode.  Each pass refills the reservation
         * and joins a transaction; -ENOSPC and -EAGAIN from the truncation
         * mean "try again with a new transaction", any other error gives up,
         * and 0 means the inode's items are gone.
         */
        while (1) {
                struct btrfs_truncate_control control = {
                        .inode = BTRFS_I(inode),
                        .ino = btrfs_ino(BTRFS_I(inode)),
                        .new_size = 0,
                        .min_type = 0,
                };

                trans = evict_refill_and_join(root, rsv);
                if (IS_ERR(trans))
                        goto out;

                /* Charge the truncation to our temporary reservation. */
                trans->block_rsv = rsv;

                ret = btrfs_truncate_inode_items(trans, root, &control);
                trans->block_rsv = &fs_info->trans_block_rsv;
                btrfs_end_transaction(trans);
                /*
                 * We have not added new delayed items for our inode after we
                 * have flushed its delayed items, so no need to throttle on
                 * delayed items. However we have modified extent buffers.
                 */
                btrfs_btree_balance_dirty_nodelay(fs_info);
                if (ret && ret != -ENOSPC && ret != -EAGAIN)
                        goto out;
                else if (!ret)
                        break;
        }

        /*
         * Errors here aren't a big deal, it just means we leave orphan items in
         * the tree. They will be cleaned up on the next mount. If the inode
         * number gets reused, cleanup deletes the orphan item without doing
         * anything, and unlink reuses the existing orphan item.
         *
         * If it turns out that we are dropping too many of these, we might want
         * to add a mechanism for retrying these after a commit.
         */
        trans = evict_refill_and_join(root, rsv);
        if (!IS_ERR(trans)) {
                trans->block_rsv = rsv;
                btrfs_orphan_del(trans, BTRFS_I(inode));
                trans->block_rsv = &fs_info->trans_block_rsv;
                btrfs_end_transaction(trans);
        }

out:
        /* rsv is still NULL when we bailed out before allocating it. */
        btrfs_free_block_rsv(fs_info, rsv);
        /*
         * If we didn't successfully delete, the orphan item will still be in
         * the tree and we'll retry on the next mount. Again, we might also want
         * to retry these periodically in the future.
         */
        btrfs_remove_delayed_node(BTRFS_I(inode));
        fsverity_cleanup_inode(inode);
        clear_inode(inode);
}

/*
 * Look up the name of @dentry in directory @dir.
 *
 * On success the key stored in the matching dir item is copied to @location,
 * @type is set to the item's BTRFS_FT_* value and 0 is returned.
 *
 * Returns -ENOENT when no dir item matches, -EUCLEAN when the dir item points
 * at something that is neither an inode item nor a root item, -ENOMEM when no
 * path could be allocated, or the error from setting up the filename or from
 * the lookup itself.
 */
static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
                               struct btrfs_key *location, u8 *type)
{
        struct btrfs_root *root = dir->root;
        struct btrfs_path *path;
        struct btrfs_dir_item *di;
        struct fscrypt_name fname;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
        if (ret < 0)
                goto out;
        /*
         * A positive return value is not expected from
         * fscrypt_setup_filename(), but gcc on sparc/parisc believes it is
         * possible, hence the assertion.
         */
        ASSERT(ret == 0);

        /* This needs to handle no-key deletions later on */

        di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
                                   &fname.disk_name, 0);
        if (IS_ERR_OR_NULL(di)) {
                if (di)
                        ret = PTR_ERR(di);
                else
                        ret = -ENOENT;
                goto out;
        }

        btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
        if (location->type != BTRFS_INODE_ITEM_KEY &&
            location->type != BTRFS_ROOT_ITEM_KEY) {
                ret = -EUCLEAN;
                btrfs_warn(root->fs_info,
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
                           __func__, fname.disk_name.name, btrfs_ino(dir),
                           location->objectid, location->type, location->offset);
                goto out;
        }

        if (!ret)
                *type = btrfs_dir_ftype(path->nodes[0], di);
out:
        fscrypt_free_filename(&fname);
        btrfs_free_path(path);
        return ret;
}

/*
 * Called when a directory entry points at a tree root: the btrfs part of the
 * lookup has to switch over to the root directory of that tree, which is
 * somewhat like crossing a mount point.
 *
 * The root ref for (@dir's root, @location->objectid) is looked up in the tree
 * root and must match both @dir's inode number and the name of @dentry.  On
 * success *@sub_root is set to the referenced root and @location is rewritten
 * to the inode item key of that root's top directory.
 *
 * Returns 0 on success, -ENOENT when there is no matching root ref, or another
 * negative errno.
 */
static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
                                    struct btrfs_inode *dir,
                                    struct dentry *dentry,
                                    struct btrfs_key *location,
                                    struct btrfs_root **sub_root)
{
        struct btrfs_path *path;
        struct btrfs_root *new_root;
        struct btrfs_root_ref *ref;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct fscrypt_name fname;
        int ret;

        ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
        if (ret)
                return ret;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        key.objectid = btrfs_root_id(dir->root);
        key.type = BTRFS_ROOT_REF_KEY;
        key.offset = location->objectid;

        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                /* No exact match for the root ref key. */
                ret = -ENOENT;
                goto out;
        }

        leaf = path->nodes[0];
        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);

        /* From here on every mismatch means "no such entry". */
        ret = -ENOENT;
        if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir))
                goto out;
        if (btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
                goto out;
        /* The name is stored right behind the root ref item. */
        if (memcmp_extent_buffer(leaf, fname.disk_name.name,
                                 (unsigned long)(ref + 1), fname.disk_name.len))
                goto out;

        btrfs_release_path(path);

        new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
        if (IS_ERR(new_root)) {
                ret = PTR_ERR(new_root);
                goto out;
        }

        *sub_root = new_root;
        location->objectid = btrfs_root_dirid(&new_root->root_item);
        location->type = BTRFS_INODE_ITEM_KEY;
        location->offset = 0;
        ret = 0;
out:
        btrfs_free_path(path);
        fscrypt_free_filename(&fname);
        return ret;
}

/*
 * Insert @inode into the per-root rb-tree of in-memory inodes, keyed by inode
 * number.  Unhashed inodes are not tracked.
 *
 * If an entry with the same inode number is already present it is replaced;
 * that entry is expected to be on its way out (I_WILL_FREE or I_FREEING),
 * otherwise a warning is emitted.
 */
static void inode_tree_add(struct btrfs_inode *inode)
{
        struct btrfs_root *root = inode->root;
        struct rb_node *new = &inode->rb_node;
        struct rb_node *parent = NULL;
        struct rb_node **link;
        u64 ino = btrfs_ino(inode);

        if (inode_unhashed(&inode->vfs_inode))
                return;

        spin_lock(&root->inode_lock);
        link = &root->inode_tree.rb_node;
        while (*link) {
                struct btrfs_inode *entry;

                parent = *link;
                entry = rb_entry(parent, struct btrfs_inode, rb_node);

                if (ino < btrfs_ino(entry)) {
                        link = &parent->rb_left;
                        continue;
                }
                if (ino > btrfs_ino(entry)) {
                        link = &parent->rb_right;
                        continue;
                }

                /* Same inode number: take over the slot of the old entry. */
                WARN_ON(!(entry->vfs_inode.i_state &
                          (I_WILL_FREE | I_FREEING)));
                rb_replace_node(parent, new, &root->inode_tree);
                RB_CLEAR_NODE(parent);
                goto unlock;
        }

        rb_link_node(new, parent, link);
        rb_insert_color(new, &root->inode_tree);
unlock:
        spin_unlock(&root->inode_lock);
}

/*
 * Remove @inode from the per-root rb-tree of in-memory inodes, if it is
 * linked there.
 *
 * When this removal emptied the tree and the root has no references left, the
 * emptiness is checked once more under the lock and, if it still holds, the
 * root is queued as a dead root.
 */
static void inode_tree_del(struct btrfs_inode *inode)
{
        struct btrfs_root *root = inode->root;
        int empty = 0;

        spin_lock(&root->inode_lock);
        if (!RB_EMPTY_NODE(&inode->rb_node)) {
                rb_erase(&inode->rb_node, &root->inode_tree);
                RB_CLEAR_NODE(&inode->rb_node);
                empty = RB_EMPTY_ROOT(&root->inode_tree);
        }
        spin_unlock(&root->inode_lock);

        if (!empty || btrfs_root_refs(&root->root_item) != 0)
                return;

        /* The lock was dropped in between, so look again before acting. */
        spin_lock(&root->inode_lock);
        empty = RB_EMPTY_ROOT(&root->inode_tree);
        spin_unlock(&root->inode_lock);

        if (empty)
                btrfs_add_dead_root(root);
}


/*
 * Initialization callback handed to iget5_locked(): fill in a freshly
 * allocated inode from the struct btrfs_iget_args passed via @p.
 *
 * Sets the inode number, the location key (inode item key for args->ino) and
 * the root (a reference is grabbed).  Inodes of the tree root other than the
 * btree inode are flagged as free space inodes.  Always returns 0.
 */
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
        struct btrfs_iget_args *args = p;

        inode->i_ino = args->ino;

        BTRFS_I(inode)->root = btrfs_grab_root(args->root);
        BTRFS_I(inode)->location.objectid = args->ino;
        BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
        BTRFS_I(inode)->location.offset = 0;

        if (!args->root)
                return 0;
        if (args->root != args->root->fs_info->tree_root)
                return 0;
        if (args->ino == BTRFS_BTREE_INODE_OBJECTID)
                return 0;

        set_bit(BTRFS_INODE_FREE_SPACE_INODE, &BTRFS_I(inode)->runtime_flags);
        return 0;
}

/*
 * Match callback handed to iget5_locked(): returns nonzero when @inode has
 * both the inode number and the root requested in the struct btrfs_iget_args
 * passed via @opaque, 0 otherwise.
 */
static int btrfs_find_actor(struct inode *inode, void *opaque)
{
        struct btrfs_iget_args *args = opaque;

        if (args->ino != BTRFS_I(inode)->location.objectid)
                return 0;

        return args->root == BTRFS_I(inode)->root;
}

/*
 * Find or allocate the in-memory inode for (@root, @ino) on super block @s.
 * This only wraps iget5_locked() with the btrfs hash, match and init
 * callbacks; the result of iget5_locked() is returned unchanged.
 */
static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
                                       struct btrfs_root *root)
{
        struct btrfs_iget_args args = {
                .ino = ino,
                .root = root,
        };

        return iget5_locked(s, btrfs_inode_hash(ino, root), btrfs_find_actor,
                            btrfs_init_locked_inode, (void *)&args);
}

/*
 * Return the inode object for inode number @ino in @root.
 *
 * @path may be preallocated by the caller to keep the allocator from
 * recursing back into iget.  NULL is accepted too, but may cost an extra
 * allocation later.
 *
 * Returns the inode, ERR_PTR(-ENOMEM) when no inode could be obtained,
 * ERR_PTR(-ENOENT) when the inode item does not exist, or another error
 * pointer from reading the inode.
 */
struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
                              struct btrfs_root *root, struct btrfs_path *path)
{
        struct inode *inode = btrfs_iget_locked(s, ino, root);
        int ret;

        if (!inode)
                return ERR_PTR(-ENOMEM);

        /* Not a new inode, so there is nothing to read. */
        if (!(inode->i_state & I_NEW))
                return inode;

        ret = btrfs_read_locked_inode(inode, path);
        if (ret) {
                iget_failed(inode);
                /*
                 * A positive value comes from btrfs_search_slot(), called by
                 * btrfs_read_locked_inode(), and means that the inode item
                 * was not found.
                 */
                if (ret > 0)
                        ret = -ENOENT;
                return ERR_PTR(ret);
        }

        inode_tree_add(BTRFS_I(inode));
        unlock_new_inode(inode);

        return inode;
}

/*
 * Same as btrfs_iget_path(), for callers that have no preallocated path to
 * pass along.
 */
struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
{
        struct btrfs_path *no_path = NULL;

        return btrfs_iget_path(s, ino, root, no_path);
}

static struct inode *new_simple_dir(struct inode *dir,
                                    struct btrfs_key *key,
                                    struct btrfs_root *root)
{
        struct timespec64 ts;
        struct inode *inode = new_inode(dir->i_sb);

        if (!inode)
                return ERR_PTR(-ENOMEM);

        BTRFS_I(inode)->root = btrfs_grab_root(root);
        memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
        set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);

        inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
        /*
         * We only need lookup, the rest is read-only and there's no inode
         * associated with the dentry
         */
        inode->i_op = &simple_dir_inode_operations;
        inode->i_opflags &= ~IOP_XATTR;
        inode->i_fop = &simple_dir_operations;
        inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;

        ts = inode_set_ctime_current(inode);
        inode_set_mtime_to_ts(inode, ts);
        inode_set_atime_to_ts(inode, inode_get_atime(dir));
        BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
        BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;

        inode->i_uid = dir->i_uid;
        inode->i_gid = dir->i_gid;

        return inode;
}

/*
 * The BTRFS_FT_* file type values must be numerically identical to the
 * generic FT_* values; the build fails otherwise.  This is presumably what
 * allows the result of fs_umode_to_ftype() (see btrfs_inode_type()) to be
 * compared directly with the type read from a dir item.
 */
static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
static_assert(BTRFS_FT_DIR == FT_DIR);
static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
static_assert(BTRFS_FT_FIFO == FT_FIFO);
static_assert(BTRFS_FT_SOCK == FT_SOCK);
static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);

/* Map the mode bits of @inode to a file type via fs_umode_to_ftype(). */
static inline u8 btrfs_inode_type(struct inode *inode)
{
        u8 ftype = fs_umode_to_ftype(inode->i_mode);

        return ftype;
}

/*
 * Resolve the name of @dentry inside directory @dir to an inode.
 *
 * A dir item pointing at an inode item is resolved in @dir's own root, and the
 * file type recorded in the dir item must agree with the inode's mode
 * (-EUCLEAN otherwise).  Any other dir item is treated as a reference to a
 * tree root: the lookup continues in that root's top directory, or yields a
 * dummy directory from new_simple_dir() when no matching root ref exists.
 *
 * Returns the inode or an error pointer.
 */
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_root *sub_root = root;
        struct btrfs_key location;
        struct inode *inode;
        u8 di_type = 0;
        int ret = 0;

        if (dentry->d_name.len > BTRFS_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);

        ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
        if (ret < 0)
                return ERR_PTR(ret);

        if (location.type == BTRFS_INODE_ITEM_KEY) {
                inode = btrfs_iget(dir->i_sb, location.objectid, root);
                if (IS_ERR(inode))
                        return inode;

                /* Cross-check the inode mode against the dir item type. */
                if (btrfs_inode_type(inode) == di_type)
                        return inode;

                btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
                           inode->i_mode, btrfs_inode_type(inode),
                           di_type);
                iput(inode);
                return ERR_PTR(-EUCLEAN);
        }

        ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
                                       &location, &sub_root);
        if (ret == -ENOENT)
                return new_simple_dir(dir, &location, root);
        if (ret < 0)
                return ERR_PTR(ret);

        inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
        btrfs_put_root(sub_root);
        if (IS_ERR(inode))
                return inode;

        /* On a writable mount, run orphan cleanup for the root just entered. */
        down_read(&fs_info->cleanup_work_sem);
        if (!sb_rdonly(inode->i_sb))
                ret = btrfs_orphan_cleanup(sub_root);
        up_read(&fs_info->cleanup_work_sem);

        if (ret) {
                iput(inode);
                inode = ERR_PTR(ret);
        }

        return inode;
}

/*
 * Returns 1 when the inode of @dentry (or, for a negative non-root dentry,
 * the inode of its parent) belongs to a root without references or is the
 * dummy empty-subvolume directory, and 0 otherwise.
 */
static int btrfs_dentry_delete(const struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        struct btrfs_root *root;

        /* Negative dentry: judge by the parent directory instead. */
        if (!inode && !IS_ROOT(dentry))
                inode = d_inode(dentry->d_parent);

        if (!inode)
                return 0;

        root = BTRFS_I(inode)->root;
        if (btrfs_root_refs(&root->root_item) == 0 ||
            btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
                return 1;

        return 0;
}

/*
 * Look up @dentry in @dir and splice the result into the dcache.  A lookup
 * that fails with -ENOENT is turned into a NULL inode before splicing; any
 * other result of btrfs_lookup_dentry() is passed on as is.  @flags is not
 * used.
 */
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
                                   unsigned int flags)
{
        struct inode *inode = btrfs_lookup_dentry(dir, dentry);

        if (inode == ERR_PTR(-ENOENT))
                return d_splice_alias(NULL, dentry);

        return d_splice_alias(inode, dentry);
}

/*
 * Find the highest existing sequence number in a directory and then set the
 * in-memory index_cnt variable to the first free sequence number.
 */
static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_key key, found_key;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        int ret;

        key.objectid = btrfs_ino(inode);
        key.type = BTRFS_DIR_INDEX_KEY;
        key.offset = (u64)-1;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        /* FIXME: we should be able to handle this */
        if (ret == 0)
                goto out;
        ret = 0;

        if (path->slots[0] == 0) {
                inode->index_cnt = BTRFS_DIR_START_INDEX;
                goto out;
        }

        path->slots[0]--;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

        if (found_key.objectid != btrfs_ino(inode) ||
            found_key.type != BTRFS_DIR_INDEX_KEY) {
                inode->index_cnt = BTRFS_DIR_START_INDEX;
                goto out;
        }

        inode->index_cnt = found_key.offset + 1;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Store the index of the last entry of directory @dir in *@index.
 *
 * The directory's index_cnt is computed first if it is still unset
 * ((u64)-1); the whole operation runs under the inode lock.  On failure
 * *@index is left untouched.
 *
 * Returns 0 on success or the error from btrfs_set_inode_index_count().
 */
static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
{
        int ret = 0;

        btrfs_inode_lock(dir, 0);

        if (dir->index_cnt == (u64)-1) {
                /* Try the delayed items first, fall back to the tree. */
                ret = btrfs_inode_delayed_dir_index_count(dir);
                if (ret)
                        ret = btrfs_set_inode_index_count(dir);
        }

        /* index_cnt is the index of the next new entry, hence the - 1. */
        if (!ret)
                *index = dir->index_cnt - 1;

        btrfs_inode_unlock(dir, 0);

        return ret;
}

/*
 * The per-file buffer set up here exists because dir_emit can fault while
 * readdir is holding the tree lock.  For now readdir copies its information
 * into this buffer and then calls dir_emit from the buffer.  NFS does
 * something similar, except that we do not keep the buffer around in the
 * pagecache, to avoid getting that wrong.  Long term filldir should use
 * copy_to_user_inatomic so that page faults under the tree lock are no longer
 * a concern.
 *
 * Besides the one page buffer, the private data records the last index of the
 * directory at open time.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error from
 * btrfs_get_dir_last_index().
 */
static int btrfs_opendir(struct inode *inode, struct file *file)
{
        struct btrfs_file_private *private;
        u64 last_index;
        int ret;

        ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
        if (ret)
                return ret;

        private = kzalloc(sizeof(*private), GFP_KERNEL);
        if (!private)
                return -ENOMEM;

        private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
        if (!private->filldir_buf) {
                kfree(private);
                return -ENOMEM;
        }

        private->last_index = last_index;
        file->private_data = private;

        return 0;
}

/*
 * llseek for directories: refresh the last index recorded in the file's
 * private data, then defer to generic_file_llseek().  Returns the error from
 * btrfs_get_dir_last_index() if the refresh fails.
 */
static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
{
        struct btrfs_file_private *private = file->private_data;
        int ret;

        ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
                                       &private->last_index);
        if (!ret)
                return generic_file_llseek(file, offset, whence);

        return ret;
}

/*
 * Record used to stage directory entries in the readdir buffer: written by
 * btrfs_real_readdir() and consumed by btrfs_filldir().  The name bytes are
 * stored immediately after each record.  Fields are accessed with
 * get_unaligned()/put_unaligned(), so records are not assumed to be aligned.
 */
struct dir_entry {
        /* objectid of the location key stored in the dir item */
        u64 ino;
        /* offset of the dir index key; becomes ctx->pos when emitted */
        u64 offset;
        /* result of fs_ftype_to_dtype(), handed to dir_emit() */
        unsigned type;
        /* number of name bytes that follow this record */
        int name_len;
};

static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
{
        while (entries--) {
                struct dir_entry *entry = addr;
                char *name = (char *)(entry + 1);

                ctx->pos = get_unaligned(&entry->offset);
                if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
                                         get_unaligned(&entry->ino),
                                         get_unaligned(&entry->type)))
                        return 1;
                addr += sizeof(struct dir_entry) +
                        get_unaligned(&entry->name_len);
                ctx->pos++;
        }
        return 0;
}

/*
 * Emit the entries of the directory behind @file through @ctx, starting at
 * ctx->pos.
 *
 * Dir index items are copied from the tree into the per-file buffer and only
 * emitted via btrfs_filldir() after the path has been released, because
 * dir_emit can fault and must not do so under the tree lock.  Entries with an
 * index above the last index recorded in the file's private data are not
 * returned, and delayed insertions/deletions are taken into account.
 *
 * Returns 0 on success (including when the caller's buffer filled up),
 * -ENOMEM when no path could be allocated, or a negative errno from the tree
 * iteration.
 */
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
        struct inode *inode = file_inode(file);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_file_private *private = file->private_data;
        struct btrfs_dir_item *di;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;
        void *addr;
        LIST_HEAD(ins_list);
        LIST_HEAD(del_list);
        int ret;
        char *name_ptr;
        int name_len;
        int entries = 0;
        int total_len = 0;
        bool put = false;
        struct btrfs_key location;

        if (!dir_emit_dots(file, ctx))
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* addr is the write cursor into the staging buffer. */
        addr = private->filldir_buf;
        path->reada = READA_FORWARD;

        put = btrfs_readdir_get_delayed_items(inode, private->last_index,
                                              &ins_list, &del_list);

again:
        /*
         * (Re)start the scan at the current position; after a buffer flush
         * ctx->pos has been advanced by btrfs_filldir().
         */
        key.type = BTRFS_DIR_INDEX_KEY;
        key.offset = ctx->pos;
        key.objectid = btrfs_ino(BTRFS_I(inode));

        btrfs_for_each_slot(root, &key, &found_key, path, ret) {
                struct dir_entry *entry;
                struct extent_buffer *leaf = path->nodes[0];
                u8 ftype;

                /* Stop once we leave the dir index items of this directory. */
                if (found_key.objectid != key.objectid)
                        break;
                if (found_key.type != BTRFS_DIR_INDEX_KEY)
                        break;
                if (found_key.offset < ctx->pos)
                        continue;
                /* Entries beyond the recorded last index are not returned. */
                if (found_key.offset > private->last_index)
                        break;
                if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
                        continue;
                di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
                name_len = btrfs_dir_name_len(leaf, di);
                /*
                 * The staging buffer cannot take this entry: release the
                 * path, emit what has been collected so far and rescan from
                 * the updated position.
                 */
                if ((total_len + sizeof(struct dir_entry) + name_len) >=
                    PAGE_SIZE) {
                        btrfs_release_path(path);
                        ret = btrfs_filldir(private->filldir_buf, entries, ctx);
                        if (ret)
                                goto nopos;
                        addr = private->filldir_buf;
                        entries = 0;
                        total_len = 0;
                        goto again;
                }

                /* Stage the record; the name is stored right behind it. */
                ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
                entry = addr;
                name_ptr = (char *)(entry + 1);
                read_extent_buffer(leaf, name_ptr,
                                   (unsigned long)(di + 1), name_len);
                put_unaligned(name_len, &entry->name_len);
                put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
                btrfs_dir_item_key_to_cpu(leaf, di, &location);
                put_unaligned(location.objectid, &entry->ino);
                put_unaligned(found_key.offset, &entry->offset);
                entries++;
                addr += sizeof(struct dir_entry) + name_len;
                total_len += sizeof(struct dir_entry) + name_len;
        }
        /* Catch error encountered during iteration */
        if (ret < 0)
                goto err;

        btrfs_release_path(path);

        /* Emit whatever is still staged, then the delayed insertions. */
        ret = btrfs_filldir(private->filldir_buf, entries, ctx);
        if (ret)
                goto nopos;

        ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
        if (ret)
                goto nopos;

        /*
         * Stop new entries from being returned after we return the last
         * entry.
         *
         * New directory entries are assigned a strictly increasing
         * offset.  This means that new entries created during readdir
         * are *guaranteed* to be seen in the future by that readdir.
         * This has broken buggy programs which operate on names as
         * they're returned by readdir.  Until we re-use freed offsets
         * we have this hack to stop new entries from being returned
         * under the assumption that they'll never reach this huge
         * offset.
         *
         * This is being careful not to overflow 32bit loff_t unless the
         * last entry requires it because doing so has broken 32bit apps
         * in the past.
         */
        if (ctx->pos >= INT_MAX)
                ctx->pos = LLONG_MAX;
        else
                ctx->pos = INT_MAX;
nopos:
        /*
         * Emission stopped early: leave ctx->pos where it is and report
         * success.
         */
        ret = 0;
err:
        if (put)
                btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
        btrfs_free_path(path);
        return ret;
}

/*
 * Write the in-memory state of @inode back to the tree.
 *
 * Updating the tree on every inode change is somewhat expensive, although the
 * inode is most likely found in cache.
 * FIXME, needs more benchmarking...there are no reasons other than performance
 * to keep or drop this code.
 *
 * Dummy inodes are skipped.  The update is first attempted in a joined
 * transaction; on -ENOSPC or -EDQUOT it is retried once in a newly started
 * transaction with one item reserved.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_dirty_inode(struct btrfs_inode *inode)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
        int ret;

        if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
                return 0;

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        ret = btrfs_update_inode(trans, inode);
        switch (ret) {
        case -ENOSPC:
        case -EDQUOT:
                /* whoops, lets try again with the full transaction */
                btrfs_end_transaction(trans);
                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans))
                        return PTR_ERR(trans);

                ret = btrfs_update_inode(trans, inode);
                break;
        default:
                break;
        }

        btrfs_end_transaction(trans);

        if (inode->delayed_node)
                btrfs_balance_delayed_items(fs_info);

        return ret;
}

/*
 * This is a copy of file_update_time.  We need this so we can return error on
 * ENOSPC for updating the inode in the case of file write and mmap writes.
 */
static int btrfs_update_time(struct inode *inode, int flags)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        bool dirty;

        if (btrfs_root_readonly(root))
                return -EROFS;

        dirty = inode_update_timestamps(inode, flags);
        return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
}

/*
 * Hand out the next free sequence number (dir index) of directory @dir.
 *
 * If @dir->index_cnt has not been initialized yet ((u64)-1), it is set up
 * first: btrfs_inode_delayed_dir_index_count() is tried, and if that returns
 * non-zero btrfs_set_inode_index_count() is used instead.  The current code
 * is very simple, later versions will do smarter things in the btree.
 *
 * On success the index is stored in *@index and @dir->index_cnt is advanced.
 *
 * Return: 0 on success, or the error from btrfs_set_inode_index_count().
 */
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
{
        if (dir->index_cnt == (u64)-1 &&
            btrfs_inode_delayed_dir_index_count(dir) != 0) {
                int err = btrfs_set_inode_index_count(dir);

                if (err)
                        return err;
        }

        *index = dir->index_cnt++;

        return 0;
}

static int btrfs_insert_inode_locked(struct inode *inode)
{
        struct btrfs_iget_args args;

        args.ino = BTRFS_I(inode)->location.objectid;
        args.root = BTRFS_I(inode)->root;

        return insert_inode_locked4(inode,
                   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
                   btrfs_find_actor, &args);
}

/*
 * Do the preparation for creating a new inode that must happen before a
 * transaction is started, and compute how many items the transaction needs.
 *
 * @args:            new inode arguments; on success ->fname (unless
 *                   ->orphan), ->default_acl and ->acl are filled in
 * @trans_num_items: set, on success only, to the number of items to reserve
 *
 * Return: 0 on success, or the error from fscrypt_setup_filename() or
 * posix_acl_create().  On success the caller releases what was set up here
 * with btrfs_new_inode_args_destroy().
 */
int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
                            unsigned int *trans_num_items)
{
        struct inode *dir = args->dir;
        struct inode *inode = args->inode;
        unsigned int nr_items;
        int ret;

        if (!args->orphan) {
                ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
                                             &args->fname);
                if (ret)
                        return ret;
        }

        ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl,
                               &args->acl);
        if (ret) {
                fscrypt_free_filename(&args->fname);
                return ret;
        }

        /* The inode item itself. */
        nr_items = 1;

        /* Compression property. */
        if (BTRFS_I(dir)->prop_compress)
                nr_items++;

        /* Default ACL xattr. */
        if (args->default_acl)
                nr_items++;

        /* Access ACL xattr. */
        if (args->acl)
                nr_items++;

#ifdef CONFIG_SECURITY
        /* LSM xattr. */
        if (dir->i_security)
                nr_items++;
#endif

        /*
         * An orphan only needs its orphan item.  Anything else needs a dir
         * item, a dir index and an update of the parent inode item.  The
         * inode ref needs no unit of its own because it is inserted in a
         * batch together with the inode item at btrfs_create_new_inode().
         */
        nr_items += args->orphan ? 1 : 3;

        *trans_num_items = nr_items;
        return 0;
}

/*
 * Release the resources that btrfs_new_inode_prepare() attached to @args:
 * the (possibly encrypted) filename and both POSIX ACLs.
 */
void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
{
        fscrypt_free_filename(&args->fname);
        posix_acl_release(args->default_acl);
        posix_acl_release(args->acl);
}

/*
 * Inherit flags from the parent inode.
 *
 * Currently only the compression flags and the cow flags are inherited.
 */
static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
{
        unsigned int flags;

        flags = dir->flags;

        if (flags & BTRFS_INODE_NOCOMPRESS) {
                inode->flags &= ~BTRFS_INODE_COMPRESS;
                inode->flags |= BTRFS_INODE_NOCOMPRESS;
        } else if (flags & BTRFS_INODE_COMPRESS) {
                inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
                inode->flags |= BTRFS_INODE_COMPRESS;
        }

        if (flags & BTRFS_INODE_NODATACOW) {
                inode->flags |= BTRFS_INODE_NODATACOW;
                if (S_ISREG(inode->vfs_inode.i_mode))
                        inode->flags |= BTRFS_INODE_NODATASUM;
        }

        btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
}

/*
 * Create the tree items for a new inode and hook it up to its parent.
 *
 * @trans: transaction handle used for all tree modifications
 * @args:  description of the new inode (see btrfs_new_inode_prepare())
 *
 * In order, this:
 *   - allocates an objectid and, unless @args->orphan, a dir index;
 *   - initializes the in-memory btrfs inode and inserts it into the inode hash;
 *   - inserts the inode item and (unless @args->orphan) the inode ref item in
 *     a single batch and fills them in;
 *   - inherits properties and, for non-subvolumes, initializes security;
 *   - adds either an orphan item (@args->orphan) or the link in @args->dir.
 *
 * Failures after the inode was hashed go through the "discard" label, which
 * calls discard_new_inode() while preserving the caller's inode reference.
 *
 * Return: 0 on success, negative errno on failure.
 */
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
                           struct btrfs_new_inode_args *args)
{
        struct timespec64 ts;
        struct inode *dir = args->dir;
        struct inode *inode = args->inode;
        const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
        struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
        struct btrfs_root *root;
        struct btrfs_inode_item *inode_item;
        struct btrfs_key *location;
        struct btrfs_path *path;
        u64 objectid;
        struct btrfs_inode_ref *ref;
        struct btrfs_key key[2];
        u32 sizes[2];
        struct btrfs_item_batch batch;
        unsigned long ptr;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * A regular new inode lives in the parent directory's root.  For a
         * subvolume the inode's root is used as is - presumably the caller
         * has already set it (TODO confirm against the subvolume creation
         * path).
         */
        if (!args->subvol)
                BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
        root = BTRFS_I(inode)->root;

        ret = btrfs_get_free_objectid(root, &objectid);
        if (ret)
                goto out;
        inode->i_ino = objectid;

        if (args->orphan) {
                /*
                 * O_TMPFILE, set link count to 0, so that after this point, we
                 * fill in an inode item with the correct link count.
                 */
                set_nlink(inode, 0);
        } else {
                trace_btrfs_inode_request(dir);

                ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
                if (ret)
                        goto out;
        }
        /* index_cnt is ignored for everything but a dir. */
        BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
        BTRFS_I(inode)->generation = trans->transid;
        inode->i_generation = BTRFS_I(inode)->generation;

        /*
         * We don't have any capability xattrs set here yet, shortcut any
         * queries for the xattrs here.  If we add them later via the inode
         * security init path or any other path this flag will be cleared.
         */
        set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);

        /*
         * Subvolumes don't inherit flags from their parent directory.
         * Originally this was probably by accident, but we probably can't
         * change it now without compatibility issues.
         */
        if (!args->subvol)
                btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));

        /* Mount options can force nodatasum/nodatacow on regular files. */
        if (S_ISREG(inode->i_mode)) {
                if (btrfs_test_opt(fs_info, NODATASUM))
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
                if (btrfs_test_opt(fs_info, NODATACOW))
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
                                BTRFS_INODE_NODATASUM;
        }

        location = &BTRFS_I(inode)->location;
        location->objectid = objectid;
        location->offset = 0;
        location->type = BTRFS_INODE_ITEM_KEY;

        ret = btrfs_insert_inode_locked(inode);
        if (ret < 0) {
                /* Give back the dir index taken by btrfs_set_inode_index(). */
                if (!args->orphan)
                        BTRFS_I(dir)->index_cnt--;
                goto out;
        }

        /*
         * We could have gotten an inode number from somebody who was fsynced
         * and then removed in this same transaction, so let's just set full
         * sync since it will be a full sync anyway and this will blow away the
         * old info in the log.
         */
        btrfs_set_inode_full_sync(BTRFS_I(inode));

        key[0].objectid = objectid;
        key[0].type = BTRFS_INODE_ITEM_KEY;
        key[0].offset = 0;

        sizes[0] = sizeof(struct btrfs_inode_item);

        if (!args->orphan) {
                /*
                 * Start new inodes with an inode_ref. This is slightly more
                 * efficient for small numbers of hard links since they will
                 * be packed into one item. Extended refs will kick in if we
                 * add more hard links than can fit in the ref item.
                 */
                key[1].objectid = objectid;
                key[1].type = BTRFS_INODE_REF_KEY;
                if (args->subvol) {
                        /* Self-referencing ".." ref (2 name bytes), see below. */
                        key[1].offset = objectid;
                        sizes[1] = 2 + sizeof(*ref);
                } else {
                        key[1].offset = btrfs_ino(BTRFS_I(dir));
                        sizes[1] = name->len + sizeof(*ref);
                }
        }

        /* key[1]/sizes[1] are only set up and only used when !orphan. */
        batch.keys = &key[0];
        batch.data_sizes = &sizes[0];
        batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
        batch.nr = args->orphan ? 1 : 2;
        ret = btrfs_insert_empty_items(trans, root, path, &batch);
        if (ret != 0) {
                btrfs_abort_transaction(trans, ret);
                goto discard;
        }

        ts = simple_inode_init_ts(inode);
        BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
        BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;

        /*
         * We're going to fill the inode item now, so at this point the inode
         * must be fully initialized.
         */

        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                  struct btrfs_inode_item);
        memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
                             sizeof(*inode_item));
        fill_inode_item(trans, path->nodes[0], inode_item, inode);

        if (!args->orphan) {
                /* The inode ref is the item right after the inode item. */
                ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
                                     struct btrfs_inode_ref);
                /* The name is stored immediately after the ref header. */
                ptr = (unsigned long)(ref + 1);
                if (args->subvol) {
                        btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
                        btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
                        write_extent_buffer(path->nodes[0], "..", ptr, 2);
                } else {
                        btrfs_set_inode_ref_name_len(path->nodes[0], ref,
                                                     name->len);
                        btrfs_set_inode_ref_index(path->nodes[0], ref,
                                                  BTRFS_I(inode)->dir_index);
                        write_extent_buffer(path->nodes[0], name->name, ptr,
                                            name->len);
                }
        }

        btrfs_mark_buffer_dirty(trans, path->nodes[0]);
        /*
         * We don't need the path anymore, plus inheriting properties, adding
         * ACLs, security xattrs, orphan item or adding the link, will result in
         * allocating yet another path. So just free our path.
         */
        btrfs_free_path(path);
        /* Cleared so the btrfs_free_path() at "out" gets NULL, not a stale pointer. */
        path = NULL;

        if (args->subvol) {
                struct inode *parent;

                /*
                 * Subvolumes inherit properties from their parent subvolume,
                 * not the directory they were created in.
                 */
                parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
                                    BTRFS_I(dir)->root);
                if (IS_ERR(parent)) {
                        ret = PTR_ERR(parent);
                } else {
                        ret = btrfs_inode_inherit_props(trans, inode, parent);
                        iput(parent);
                }
        } else {
                ret = btrfs_inode_inherit_props(trans, inode, dir);
        }
        /*
         * A failure to inherit properties is only logged: ret is always
         * overwritten below before it is checked again.
         */
        if (ret) {
                btrfs_err(fs_info,
                          "error inheriting props for ino %llu (root %llu): %d",
                          btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
        }

        /*
         * Subvolumes don't inherit ACLs or get passed to the LSM. This is
         * probably a bug.
         */
        if (!args->subvol) {
                ret = btrfs_init_inode_security(trans, args);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto discard;
                }
        }

        inode_tree_add(BTRFS_I(inode));

        trace_btrfs_inode_new(inode);
        btrfs_set_inode_last_trans(trans, BTRFS_I(inode));

        btrfs_update_root_times(trans, root);

        if (args->orphan) {
                ret = btrfs_orphan_add(trans, BTRFS_I(inode));
        } else {
                /*
                 * add_backref is 0: the inode ref was already inserted in the
                 * batch above.
                 */
                ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
                                     0, BTRFS_I(inode)->dir_index);
        }
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto discard;
        }

        return 0;

discard:
        /*
         * discard_new_inode() calls iput(), but the caller owns the reference
         * to the inode.
         */
        ihold(inode);
        discard_new_inode(inode);
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Utility function to add 'inode' into 'parent_inode' with a given name and
 * a given sequence number (dir index).
 *
 * If 'inode' is a subvolume root (its number is BTRFS_FIRST_FREE_OBJECTID) a
 * root ref is added and the dir item points at the root key.  Otherwise the
 * dir item points at the inode item and, if 'add_backref' is true, a backref
 * (inode ref) from the inode to the parent directory is inserted as well.
 *
 * If inserting the dir item fails with -EEXIST or -EOVERFLOW, the ref added
 * above is removed again and the original error is returned without aborting
 * the transaction (unless the removal itself fails).
 *
 * Return: 0 on success, negative errno on failure.
 */
int btrfs_add_link(struct btrfs_trans_handle *trans,
                   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
                   const struct fscrypt_str *name, int add_backref, u64 index)
{
        struct btrfs_root *root = parent_inode->root;
        const u64 ino = btrfs_ino(inode);
        const u64 parent_ino = btrfs_ino(parent_inode);
        const bool is_subvol_root = (ino == BTRFS_FIRST_FREE_OBJECTID);
        struct btrfs_key key;
        u64 removed_index;
        int undo_err;
        int ret = 0;

        if (unlikely(is_subvol_root)) {
                key = inode->root->root_key;
                ret = btrfs_add_root_ref(trans, key.objectid,
                                         btrfs_root_id(root), parent_ino,
                                         index, name);
        } else {
                key.objectid = ino;
                key.type = BTRFS_INODE_ITEM_KEY;
                key.offset = 0;
                if (add_backref)
                        ret = btrfs_insert_inode_ref(trans, root, name,
                                                     ino, parent_ino, index);
        }

        /* Nothing to clean up yet */
        if (ret)
                return ret;

        ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
                                    btrfs_inode_type(&inode->vfs_inode), index);
        if (ret == -EEXIST || ret == -EOVERFLOW)
                goto undo_ref;
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
                           name->len * 2);
        inode_inc_iversion(&parent_inode->vfs_inode);
        /*
         * If we are replaying a log tree, we do not want to update the mtime
         * and ctime of the parent directory with the current time, since the
         * log replay procedure is responsible for setting them to their correct
         * values (the ones it had when the fsync was done).
         */
        if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
                inode_set_mtime_to_ts(&parent_inode->vfs_inode,
                                      inode_set_ctime_current(&parent_inode->vfs_inode));

        ret = btrfs_update_inode(trans, parent_inode);
        if (ret)
                btrfs_abort_transaction(trans, ret);
        return ret;

undo_ref:
        /* Remove whichever ref was added before the dir item insertion. */
        if (unlikely(is_subvol_root)) {
                undo_err = btrfs_del_root_ref(trans, key.objectid,
                                              btrfs_root_id(root), parent_ino,
                                              &removed_index, name);
                if (undo_err)
                        btrfs_abort_transaction(trans, undo_err);
        } else if (add_backref) {
                undo_err = btrfs_del_inode_ref(trans, root, name, ino,
                                               parent_ino, &removed_index);
                if (undo_err)
                        btrfs_abort_transaction(trans, undo_err);
        }

        /* Return the original error code */
        return ret;
}

static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
                               struct inode *inode)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_new_inode_args new_inode_args = {
                .dir = dir,
                .dentry = dentry,
                .inode = inode,
        };
        unsigned int trans_num_items;
        struct btrfs_trans_handle *trans;
        int err;

        err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
        if (err)
                goto out_inode;

        trans = btrfs_start_transaction(root, trans_num_items);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto out_new_inode_args;
        }

        err = btrfs_create_new_inode(trans, &new_inode_args);
        if (!err)
                d_instantiate_new(dentry, inode);

        btrfs_end_transaction(trans);
        btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
        btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
        if (err)
                iput(inode);
        return err;
}

/*
 * Create a special file (device node, fifo, socket) named by @dentry in @dir.
 *
 * Return: -ENOMEM if no inode could be allocated, otherwise the result of
 * btrfs_create_common().
 */
static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
                       struct dentry *dentry, umode_t mode, dev_t rdev)
{
        struct inode *node = new_inode(dir->i_sb);

        if (!node)
                return -ENOMEM;

        inode_init_owner(idmap, node, dir, mode);
        init_special_inode(node, node->i_mode, rdev);
        node->i_op = &btrfs_special_inode_operations;

        return btrfs_create_common(dir, dentry, node);
}

/*
 * Create a regular file named by @dentry in @dir.  @excl is not used here.
 *
 * Return: -ENOMEM if no inode could be allocated, otherwise the result of
 * btrfs_create_common().
 */
static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
                        struct dentry *dentry, umode_t mode, bool excl)
{
        struct inode *file = new_inode(dir->i_sb);

        if (!file)
                return -ENOMEM;

        inode_init_owner(idmap, file, dir, mode);

        /* Hook up the regular file operation tables. */
        file->i_op = &btrfs_file_inode_operations;
        file->i_fop = &btrfs_file_operations;
        file->i_mapping->a_ops = &btrfs_aops;

        return btrfs_create_common(dir, dentry, file);
}

/*
 * Create a new hard link @dentry in directory @dir to the inode behind
 * @old_dentry.
 *
 * Links between different subvolumes are refused with -EXDEV, and the link
 * count is capped at BTRFS_LINK_MAX (-EMLINK).
 *
 * An extra inode reference is taken with ihold() on behalf of
 * d_instantiate().  Every failure after that point must undo both the link
 * count bump and that reference, which is what 'drop_inode' requests at the
 * 'fail' label.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                      struct dentry *dentry)
{
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct inode *inode = d_inode(old_dentry);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct dentry *parent = dentry->d_parent;
        struct fscrypt_name fname;
        u64 index;
        int err;
        int drop_inode = 0;

        /* do not allow sys_link's with other subvols of the same device */
        if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
                return -EXDEV;

        if (inode->i_nlink >= BTRFS_LINK_MAX)
                return -EMLINK;

        err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
        if (err)
                goto fail;

        err = btrfs_set_inode_index(BTRFS_I(dir), &index);
        if (err)
                goto fail;

        /*
         * 2 items for inode and inode ref
         * 2 items for dir items
         * 1 item for parent inode
         * 1 item for orphan item deletion if O_TMPFILE
         */
        trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                trans = NULL;
                goto fail;
        }

        /* There are several dir indexes for this inode, clear the cache. */
        BTRFS_I(inode)->dir_index = 0ULL;
        inc_nlink(inode);
        inode_inc_iversion(inode);
        inode_set_ctime_current(inode);
        ihold(inode);
        set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);

        err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
                             &fname.disk_name, 1, index);
        if (err) {
                drop_inode = 1;
                goto fail;
        }

        err = btrfs_update_inode(trans, BTRFS_I(inode));
        if (err) {
                /*
                 * d_instantiate() will not run, so the reference taken by
                 * ihold() above and the link count bump must be dropped here,
                 * otherwise the inode is leaked.
                 */
                drop_inode = 1;
                goto fail;
        }

        if (inode->i_nlink == 1) {
                /*
                 * If new hard link count is 1, it's a file created
                 * with open(2) O_TMPFILE flag.
                 */
                err = btrfs_orphan_del(trans, BTRFS_I(inode));
                if (err) {
                        /* Same as above: undo ihold() and inc_nlink(). */
                        drop_inode = 1;
                        goto fail;
                }
        }

        /* Consumes the reference taken by ihold() above. */
        d_instantiate(dentry, inode);
        btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);

fail:
        fscrypt_free_filename(&fname);
        if (trans)
                btrfs_end_transaction(trans);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
        }
        btrfs_btree_balance_dirty(fs_info);
        return err;
}

/*
 * Create a directory named by @dentry in @dir.
 *
 * Return: -ENOMEM if no inode could be allocated, otherwise the result of
 * btrfs_create_common().
 */
static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                       struct dentry *dentry, umode_t mode)
{
        struct inode *subdir = new_inode(dir->i_sb);

        if (!subdir)
                return -ENOMEM;

        inode_init_owner(idmap, subdir, dir, S_IFDIR | mode);

        /* Hook up the directory operation tables. */
        subdir->i_fop = &btrfs_dir_file_operations;
        subdir->i_op = &btrfs_dir_inode_operations;

        return btrfs_create_common(dir, dentry, subdir);
}

/*
 * Decompress the inline extent that @path points at into @page.
 *
 * The compressed bytes are first copied out of the leaf into a temporary
 * buffer, then decompressed into the start of @page.  At most
 * min(PAGE_SIZE, ram_bytes) bytes are produced; the rest of the page is
 * zeroed.
 *
 * Return: -ENOMEM if the temporary buffer cannot be allocated, otherwise the
 * result of btrfs_decompress().
 */
static noinline int uncompress_inline(struct btrfs_path *path,
                                      struct page *page,
                                      struct btrfs_file_extent_item *item)
{
        struct extent_buffer *leaf = path->nodes[0];
        const int compress_type = btrfs_file_extent_compression(leaf, item);
        size_t out_len = btrfs_file_extent_ram_bytes(leaf, item);
        const unsigned long inline_size =
                btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
        char *compressed;
        int ret;

        compressed = kmalloc(inline_size, GFP_NOFS);
        if (!compressed)
                return -ENOMEM;

        read_extent_buffer(leaf, compressed,
                           btrfs_file_extent_inline_start(item), inline_size);

        out_len = min_t(unsigned long, PAGE_SIZE, out_len);
        ret = btrfs_decompress(compress_type, compressed, page, 0, inline_size,
                               out_len);

        /*
         * The decompression code memsets any space between the end of the
         * uncompressed data and out_len, in case the data ends up shorter
         * than ram_bytes.  That doesn't cover the hole between the end of an
         * inline extent and the beginning of the next block, so zero that
         * region here.
         */
        if (out_len < PAGE_SIZE)
                memzero_page(page, out_len, PAGE_SIZE - out_len);

        kfree(compressed);
        return ret;
}

/*
 * Fill @page with the data of the inline extent that @path points at.
 *
 * Does nothing if @page is NULL or already up to date.  Compressed extents
 * are handed to uncompress_inline().  Uncompressed data is copied straight
 * from the leaf, at most PAGE_SIZE bytes, and the remainder of the page is
 * zeroed.
 *
 * Return: 0, or the result of uncompress_inline() for compressed extents.
 */
static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
                              struct page *page)
{
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
        size_t nbytes;
        void *dst;

        if (!page)
                return 0;
        if (PageUptodate(page))
                return 0;

        ASSERT(page_offset(page) == 0);

        leaf = path->nodes[0];
        fi = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE)
                return uncompress_inline(path, page, fi);

        nbytes = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(leaf, fi));

        dst = kmap_local_page(page);
        read_extent_buffer(leaf, dst, btrfs_file_extent_inline_start(fi),
                           nbytes);
        kunmap_local(dst);

        /* Zero whatever the inline data did not cover. */
        if (nbytes < PAGE_SIZE)
                memzero_page(page, nbytes, PAGE_SIZE - nbytes);

        return 0;
}

/*
 * Lookup the first extent overlapping a range in a file.
 *
 * @inode:        file to search in
 * @page:        page to read extent data into if the extent is inline
 * @start:        file offset
 * @len:        length of range starting at @start
 *
 * Return the first &struct extent_map which overlaps the given range, reading
 * it from the B-tree and caching it if necessary. Note that there may be more
 * extents which overlap the given range after the returned extent_map.
 *
 * If @page is not NULL and the extent is inline, this also reads the extent
 * data directly into the page and marks the extent up to date in the io_tree.
 *
 * If no file extent item covers @start, the returned map describes a hole
 * (block_start == EXTENT_MAP_HOLE) starting at @start.
 *
 * Return: ERR_PTR on error, non-NULL extent_map on success.
 */
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
                                    struct page *page, u64 start, u64 len)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        int ret = 0;
        u64 extent_start = 0;
        u64 extent_end = 0;
        u64 objectid = btrfs_ino(inode);
        int extent_type = -1;
        struct btrfs_path *path = NULL;
        struct btrfs_root *root = inode->root;
        struct btrfs_file_extent_item *item;
        struct extent_buffer *leaf;
        struct btrfs_key found_key;
        struct extent_map *em = NULL;
        struct extent_map_tree *em_tree = &inode->extent_tree;

        /* Fast path: try the cached extent maps first. */
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
        read_unlock(&em_tree->lock);

        if (em) {
                /*
                 * Only use the cached map if it actually contains @start.  A
                 * cached inline map is also dropped when a page was passed in,
                 * so that the inline data gets read into the page below.
                 */
                if (em->start > start || em->start + em->len <= start)
                        free_extent_map(em);
                else if (em->block_start == EXTENT_MAP_INLINE && page)
                        free_extent_map(em);
                else
                        goto out;
        }
        em = alloc_extent_map();
        if (!em) {
                ret = -ENOMEM;
                goto out;
        }
        /* Placeholder values, overwritten before the map is inserted. */
        em->start = EXTENT_MAP_HOLE;
        em->orig_start = EXTENT_MAP_HOLE;
        em->len = (u64)-1;
        em->block_len = (u64)-1;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        /* Chances are we'll be called again, so go ahead and do readahead */
        path->reada = READA_FORWARD;

        /*
         * The same explanation in load_free_space_cache applies here as well,
         * we only read when we're loading the free space cache, and at that
         * point the commit_root has everything we need.
         */
        if (btrfs_is_free_space_inode(inode)) {
                path->search_commit_root = 1;
                path->skip_locking = 1;
        }

        ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
        if (ret < 0) {
                goto out;
        } else if (ret > 0) {
                /*
                 * No exact match: look at the preceding item, if the leaf has
                 * one, since it may be an extent that covers @start.
                 */
                if (path->slots[0] == 0)
                        goto not_found;
                path->slots[0]--;
                ret = 0;
        }

        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0],
                              struct btrfs_file_extent_item);
        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
        if (found_key.objectid != objectid ||
            found_key.type != BTRFS_EXTENT_DATA_KEY) {
                /*
                 * If we backup past the first extent we want to move forward
                 * and see if there is an extent in front of us, otherwise we'll
                 * say there is a hole for our whole search range which can
                 * cause problems.
                 */
                extent_end = start;
                goto next;
        }

        extent_type = btrfs_file_extent_type(leaf, item);
        extent_start = found_key.offset;
        extent_end = btrfs_file_extent_end(path);
        if (extent_type == BTRFS_FILE_EXTENT_REG ||
            extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                /* Only regular file could have regular/prealloc extent */
                if (!S_ISREG(inode->vfs_inode.i_mode)) {
                        ret = -EUCLEAN;
                        btrfs_crit(fs_info,
                "regular/prealloc extent found for non-regular inode %llu",
                                   btrfs_ino(inode));
                        goto out;
                }
                trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
                                                       extent_start);
        } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
                                                      path->slots[0],
                                                      extent_start);
        }
next:
        /*
         * The item we are looking at ends at or before @start, so it does not
         * cover it.  Walk forward to find where the following extent (if any)
         * begins; everything from @start up to that point is a hole.
         */
        if (start >= extent_end) {
                path->slots[0]++;
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                goto out;
                        else if (ret > 0)
                                goto not_found;

                        leaf = path->nodes[0];
                }
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                if (found_key.objectid != objectid ||
                    found_key.type != BTRFS_EXTENT_DATA_KEY)
                        goto not_found;
                /* The next extent starts beyond the requested range. */
                if (start + len <= found_key.offset)
                        goto not_found;
                /* Still before @start: keep walking (extent_end is unchanged). */
                if (start > found_key.offset)
                        goto next;

                /* New extent overlaps with existing one */
                em->start = start;
                em->orig_start = start;
                em->len = found_key.offset - start;
                em->block_start = EXTENT_MAP_HOLE;
                goto insert;
        }

        btrfs_extent_item_to_extent_map(inode, path, item, em);

        if (extent_type == BTRFS_FILE_EXTENT_REG ||
            extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                goto insert;
        } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                /*
                 * Inline extent can only exist at file offset 0. This is
                 * ensured by tree-checker and inline extent creation path.
                 * Thus all members representing file offsets should be zero.
                 */
                ASSERT(extent_start == 0);
                ASSERT(em->start == 0);

                /*
                 * btrfs_extent_item_to_extent_map() should have properly
                 * initialized em members already.
                 *
                 * Other members are not utilized for inline extents.
                 */
                ASSERT(em->block_start == EXTENT_MAP_INLINE);
                ASSERT(em->len == fs_info->sectorsize);

                ret = read_inline_extent(inode, path, page);
                if (ret < 0)
                        goto out;
                goto insert;
        }
        /* Any other extent type falls through and is reported as a hole. */
not_found:
        /* Nothing covers the range: describe all of it as a hole. */
        em->start = start;
        em->orig_start = start;
        em->len = len;
        em->block_start = EXTENT_MAP_HOLE;
insert:
        ret = 0;
        btrfs_release_path(path);
        /* Sanity check: the map we built must contain @start. */
        if (em->start > start || extent_map_end(em) <= start) {
                btrfs_err(fs_info,
                          "bad extent! em: [%llu %llu] passed [%llu %llu]",
                          em->start, em->len, start, len);
                ret = -EIO;
                goto out;
        }

        /* Cache the map; em is passed by reference and may be updated. */
        write_lock(&em_tree->lock);
        ret = btrfs_add_extent_mapping(inode, &em, start, len);
        write_unlock(&em_tree->lock);
out:
        btrfs_free_path(path);

        trace_btrfs_get_extent(root, inode, em);

        if (ret) {
                free_extent_map(em);
                return ERR_PTR(ret);
        }
        return em;
}

/*
 * Create the in-memory bookkeeping for a direct IO write covering
 * [@start, @start + @len): an io extent map (skipped for pure NOCOW writes)
 * and an ordered extent flagged with BTRFS_ORDERED_DIRECT.
 *
 * On success the ordered extent is stored in @dio_data->ordered and the extent
 * map created here is returned, which is NULL when @type is
 * BTRFS_ORDERED_NOCOW since no map is created in that case.
 * On failure an ERR_PTR() is returned and @dio_data->ordered is left untouched.
 */
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
                                                  struct btrfs_dio_data *dio_data,
                                                  const u64 start,
                                                  const u64 len,
                                                  const u64 orig_start,
                                                  const u64 block_start,
                                                  const u64 block_len,
                                                  const u64 orig_block_len,
                                                  const u64 ram_bytes,
                                                  const int type)
{
        struct btrfs_ordered_extent *ordered;
        struct extent_map *em = NULL;

        /* Every type other than pure NOCOW gets a new io extent map. */
        if (type != BTRFS_ORDERED_NOCOW) {
                em = create_io_em(inode, start, len, orig_start, block_start,
                                  block_len, orig_block_len, ram_bytes,
                                  BTRFS_COMPRESS_NONE, /* compress_type */
                                  type);
                if (IS_ERR(em))
                        return em;
        }

        ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
                                             block_start, block_len, 0,
                                             (1 << type) |
                                             (1 << BTRFS_ORDERED_DIRECT),
                                             BTRFS_COMPRESS_NONE);
        if (!IS_ERR(ordered)) {
                ASSERT(!dio_data->ordered);
                dio_data->ordered = ordered;
                return em;
        }

        /* Undo the extent map created above before reporting the error. */
        if (em) {
                free_extent_map(em);
                btrfs_drop_extent_map_range(inode, start,
                                            start + len - 1, false);
        }

        return ERR_CAST(ordered);
}

/*
 * Reserve a new data extent for a COW direct IO write starting at @start and
 * create the matching extent map and ordered extent for it.
 *
 * Returns the extent map from btrfs_create_dio_extent(), or an ERR_PTR() if
 * either the reservation or the extent creation failed. In the latter case the
 * reserved extent is given back.
 */
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
                                                  struct btrfs_dio_data *dio_data,
                                                  u64 start, u64 len)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_map *new_em;
        struct btrfs_key ins;
        u64 alloc_hint;
        int ret;

        alloc_hint = get_extent_allocation_hint(inode, start, len);

        /* Retry the reservation for as long as it reports -EAGAIN. */
        for (;;) {
                ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
                                           0, alloc_hint, &ins, 1, 1);
                if (ret != -EAGAIN)
                        break;

                ASSERT(btrfs_is_zoned(fs_info));
                wait_on_bit_io(&fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
                               TASK_UNINTERRUPTIBLE);
        }
        if (ret)
                return ERR_PTR(ret);

        /* ins.objectid is the reserved start, ins.offset its length. */
        new_em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset,
                                         start, ins.objectid, ins.offset,
                                         ins.offset, ins.offset,
                                         BTRFS_ORDERED_REGULAR);
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        if (IS_ERR(new_em))
                btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
                                           1);

        return new_em;
}

/*
 * Tell whether @bytenr must be treated as read-only: true when no block group
 * is found for it or when the block group found has a non-zero ->ro.
 */
static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
        struct btrfs_block_group *bg;
        bool ro;

        bg = btrfs_lookup_block_group(fs_info, bytenr);
        if (!bg)
                return true;

        ro = bg->ro ? true : false;
        /* Drop the reference taken by the lookup. */
        btrfs_put_block_group(bg);

        return ro;
}

/*
 * Check if we can do nocow write into the range [@offset, @offset + @len)
 *
 * @inode:          The inode to check
 * @offset:         File offset
 * @len:            The length to write, will be updated to the nocow writeable
 *                  range (only when a positive value is returned)
 * @orig_start:     (optional) Return the original file offset of the file extent
 * @orig_block_len: (optional) Return the original on-disk length of the file
 *                  extent
 * @ram_bytes:      (optional) Return the ram_bytes of the file extent. It is
 *                  filled as soon as the file extent item is found, so it may
 *                  be written even when the final result is not > 0.
 * @nowait:         Copied into the ->nowait field of the search path
 * @strict:         if true, omit optimizations that might force us into
 *                  unnecessary cow. e.g., don't trust generation number.
 *
 * Return:
 * >0        and update @len if we can do nocow write
 *  0        if we can't do nocow write
 * <0        if error happened (-EAGAIN when a preallocated range still has
 *           delalloc bits set in the io tree)
 *
 * NOTE: This only checks the file extents, caller is responsible to wait for
 *         any ordered extents.
 */
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
                              u64 *orig_start, u64 *orig_block_len,
                              u64 *ram_bytes, bool nowait, bool strict)
{
        struct btrfs_inode *binode = BTRFS_I(inode);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct btrfs_root *root = binode->root;
        struct extent_io_tree *io_tree = &binode->io_tree;
        struct can_nocow_file_extent_args nocow_args = { 0 };
        struct btrfs_file_extent_item *fi;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        struct btrfs_key key;
        bool use_prev_item;
        int found_type;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->nowait = nowait;

        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(binode),
                                       offset, 0);
        if (ret < 0)
                goto out;

        /*
         * From this point on, every early exit means "must cow", which is
         * reported as 0.
         */
        use_prev_item = (ret == 1);
        ret = 0;

        if (use_prev_item) {
                /* can't find the item, must cow */
                if (path->slots[0] == 0)
                        goto out;
                path->slots[0]--;
        }

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        /* not our file or wrong item type, must cow */
        if (key.type != BTRFS_EXTENT_DATA_KEY ||
            key.objectid != btrfs_ino(binode))
                goto out;

        /* The item must cover @offset, otherwise we must cow. */
        if (key.offset > offset || btrfs_file_extent_end(path) <= offset)
                goto out;

        fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
        found_type = btrfs_file_extent_type(leaf, fi);
        if (ram_bytes)
                *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);

        nocow_args.start = offset;
        nocow_args.end = offset + *len - 1;
        nocow_args.strict = strict;
        nocow_args.free_path = true;

        ret = can_nocow_file_extent(path, &key, binode, &nocow_args);
        /* can_nocow_file_extent() has freed the path. */
        path = NULL;

        /* Treat errors as not being able to NOCOW. */
        if (ret != 1) {
                ret = 0;
                goto out;
        }

        ret = 0;
        if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
                goto out;

        if (found_type == BTRFS_FILE_EXTENT_PREALLOC &&
            !(binode->flags & BTRFS_INODE_NODATACOW)) {
                const u64 range_end = round_up(offset + nocow_args.num_bytes,
                                               root->fs_info->sectorsize) - 1;

                if (test_range_bit_exists(io_tree, offset, range_end,
                                          EXTENT_DELALLOC)) {
                        ret = -EAGAIN;
                        goto out;
                }
        }

        if (orig_start)
                *orig_start = key.offset - nocow_args.extent_offset;
        if (orig_block_len)
                *orig_block_len = nocow_args.disk_num_bytes;
        *len = nocow_args.num_bytes;

        ret = 1;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Lock [@lockstart, @lockend] in the inode's io tree for a direct IO request,
 * retrying until no ordered extent overlaps the range and, for writes, no page
 * cache page is present in it.
 *
 * Returns 0 with the range locked. On failure the range is left unlocked and
 * a negative errno is returned:
 *   -EAGAIN   IOMAP_NOWAIT is set and we would have to block or retry;
 *   -ENOTBLK  we can not safely wait (see the comments below).
 */
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                              struct extent_state **cached_state,
                              unsigned int iomap_flags)
{
        const bool nowait = (iomap_flags & IOMAP_NOWAIT);
        const bool writing = (iomap_flags & IOMAP_WRITE);
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_ordered_extent *ordered;

        for (;;) {
                if (!nowait)
                        lock_extent(io_tree, lockstart, lockend, cached_state);
                else if (!try_lock_extent(io_tree, lockstart, lockend,
                                          cached_state))
                        return -EAGAIN;

                /*
                 * We're concerned with the entire range that we're going to be
                 * doing DIO to, so we need to make sure there's no ordered
                 * extents in this range.
                 */
                ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
                                                     lockend - lockstart + 1);

                /*
                 * We need to make sure there are no buffered pages in this
                 * range either, we could have raced between the invalidate in
                 * generic_file_direct_write and locking the extent.  The
                 * invalidate needs to happen so that reads after a write do not
                 * get stale data.
                 */
                if (!ordered) {
                        if (!writing)
                                return 0;
                        if (!filemap_range_has_page(inode->i_mapping,
                                                    lockstart, lockend))
                                return 0;
                }

                unlock_extent(io_tree, lockstart, lockend, cached_state);

                if (!ordered) {
                        /*
                         * We could trigger writeback for this range (and wait
                         * for it to complete) and then invalidate the pages for
                         * this range (through invalidate_inode_pages2_range()),
                         * but that can lead us to a deadlock with a concurrent
                         * call to readahead (a buffered read or a defrag call
                         * triggered a readahead) on a page lock due to an
                         * ordered dio extent we created before but did not have
                         * yet a corresponding bio submitted (whence it can not
                         * complete), which makes readahead wait for that
                         * ordered extent to complete while holding a lock on
                         * that page.
                         */
                        return nowait ? -EAGAIN : -ENOTBLK;
                }

                if (nowait) {
                        btrfs_put_ordered_extent(ordered);
                        return -EAGAIN;
                }

                /*
                 * If we are doing a DIO read and the ordered extent we
                 * found is for a buffered write, we can not wait for it
                 * to complete and retry, because if we do so we can
                 * deadlock with concurrent buffered writes on page
                 * locks. This happens only if our DIO read covers more
                 * than one extent map, if at this point has already
                 * created an ordered extent for a previous extent map
                 * and locked its range in the inode's io tree, and a
                 * concurrent write against that previous extent map's
                 * range and this range started (we unlock the ranges
                 * in the io tree only when the bios complete and
                 * buffered writes always lock pages before attempting
                 * to lock range in the io tree).
                 */
                if (!writing &&
                    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
                        btrfs_put_ordered_extent(ordered);
                        return -ENOTBLK;
                }

                btrfs_start_ordered_extent(ordered);
                btrfs_put_ordered_extent(ordered);

                cond_resched();
        }
}

/*
 * Allocate a pinned extent map describing IO on [@start, @start + @len) and
 * pass it to btrfs_replace_extent_map_range() for the inode.
 *
 * The callers of this must take lock_extent().
 *
 * Returns the new extent map, or ERR_PTR(-ENOMEM) if the allocation failed, or
 * an ERR_PTR() carrying the error from btrfs_replace_extent_map_range().
 */
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
                                       u64 len, u64 orig_start, u64 block_start,
                                       u64 block_len, u64 orig_block_len,
                                       u64 ram_bytes, int compress_type,
                                       int type)
{
        struct extent_map *io_em;
        int err;

        /*
         * Note the missing NOCOW type.
         *
         * For pure NOCOW writes, we should not create an io extent map, but
         * just reusing the existing one.
         * Only PREALLOC writes (NOCOW write into preallocated range) can
         * create an io extent map.
         */
        ASSERT(type == BTRFS_ORDERED_PREALLOC ||
               type == BTRFS_ORDERED_COMPRESSED ||
               type == BTRFS_ORDERED_REGULAR);

        /* Per-type sanity checks on the geometry we were handed. */
        if (type == BTRFS_ORDERED_PREALLOC) {
                /* Uncompressed extents. */
                ASSERT(block_len == len);

                /* We're only referring part of a larger preallocated extent. */
                ASSERT(block_len <= ram_bytes);
        } else if (type == BTRFS_ORDERED_REGULAR) {
                /* Uncompressed extents. */
                ASSERT(block_len == len);

                /* COW results a new extent matching our file extent size. */
                ASSERT(orig_block_len == len);
                ASSERT(ram_bytes == len);

                /* Since it's a new extent, we should not have any offset. */
                ASSERT(orig_start == start);
        } else if (type == BTRFS_ORDERED_COMPRESSED) {
                /* Must be compressed. */
                ASSERT(compress_type != BTRFS_COMPRESS_NONE);

                /*
                 * Encoded write can make us to refer to part of the
                 * uncompressed extent.
                 */
                ASSERT(len <= ram_bytes);
        }

        io_em = alloc_extent_map();
        if (!io_em)
                return ERR_PTR(-ENOMEM);

        /* File range. */
        io_em->start = start;
        io_em->len = len;
        io_em->orig_start = orig_start;

        /* On-disk location and sizes. */
        io_em->block_start = block_start;
        io_em->block_len = block_len;
        io_em->orig_block_len = orig_block_len;
        io_em->ram_bytes = ram_bytes;

        io_em->generation = -1;
        io_em->flags |= EXTENT_FLAG_PINNED;
        if (type == BTRFS_ORDERED_COMPRESSED)
                extent_map_set_compression(io_em, compress_type);

        err = btrfs_replace_extent_map_range(inode, io_em, true);
        if (err) {
                free_extent_map(io_em);
                return ERR_PTR(err);
        }

        /* em got 2 refs now, callers needs to do free_extent_map once. */
        return io_em;
}


/*
 * Prepare the extent map and ordered extent for a direct IO write at @start.
 *
 * @map:         in: the extent map currently covering @start; out: the extent
 *               map describing the write on success. On failure the input
 *               extent map has been freed and *@map is set to NULL, the caller
 *               must not free it again.
 * @inode:       the inode being written
 * @dio_data:    per-request direct IO state; ->ordered is set through
 *               btrfs_create_dio_extent(), ->nocow_done is set here when the
 *               write goes the NOCOW way, ->data_space_reserved is read
 * @start:       file offset of the write
 * @lenp:        in: requested length; out: the length actually covered (it is
 *               written back on every return path)
 * @iomap_flags: iomap flags of the request, only IOMAP_NOWAIT is checked here
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_get_blocks_direct_write(struct extent_map **map,
                                         struct inode *inode,
                                         struct btrfs_dio_data *dio_data,
                                         u64 start, u64 *lenp,
                                         unsigned int iomap_flags)
{
        const bool nowait = (iomap_flags & IOMAP_NOWAIT);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct extent_map *em = *map;
        int type;
        u64 block_start, orig_start, orig_block_len, ram_bytes;
        struct btrfs_block_group *bg;
        bool can_nocow = false;
        bool space_reserved = false;
        u64 len = *lenp;
        u64 prev_len;
        int ret = 0;

        /*
         * We don't allocate a new extent in the following cases
         *
         * 1) The inode is marked as NODATACOW. In this case we'll just use the
         * existing extent.
         * 2) The extent is marked as PREALLOC. We're good to go here and can
         * just use the extent.
         *
         */
        if ((em->flags & EXTENT_FLAG_PREALLOC) ||
            ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
             em->block_start != EXTENT_MAP_HOLE)) {
                if (em->flags & EXTENT_FLAG_PREALLOC)
                        type = BTRFS_ORDERED_PREALLOC;
                else
                        type = BTRFS_ORDERED_NOCOW;
                len = min(len, em->len - (start - em->start));
                block_start = em->block_start + (start - em->start);

                if (can_nocow_extent(inode, start, &len, &orig_start,
                                     &orig_block_len, &ram_bytes, false, false) == 1) {
                        bg = btrfs_inc_nocow_writers(fs_info, block_start);
                        if (bg)
                                can_nocow = true;
                }
        }

        prev_len = len;
        if (can_nocow) {
                struct extent_map *em2;

                /* We can NOCOW, so only need to reserve metadata space. */
                ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
                                                      nowait);
                if (ret < 0) {
                        /* Our caller expects us to free the input extent map. */
                        free_extent_map(em);
                        *map = NULL;
                        btrfs_dec_nocow_writers(bg);
                        if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
                                ret = -EAGAIN;
                        goto out;
                }
                space_reserved = true;

                em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
                                              orig_start, block_start,
                                              len, orig_block_len,
                                              ram_bytes, type);
                btrfs_dec_nocow_writers(bg);
                if (type == BTRFS_ORDERED_PREALLOC) {
                        free_extent_map(em);
                        *map = em2;
                        em = em2;
                }

                if (IS_ERR(em2)) {
                        /*
                         * Our caller expects us to free the input extent map.
                         * For PREALLOC it was already freed above, for a pure
                         * NOCOW write it is still referenced, so drop it here
                         * or it would be leaked. Either way do not hand an
                         * extent map (or an error pointer) back to the caller.
                         */
                        if (type != BTRFS_ORDERED_PREALLOC)
                                free_extent_map(em);
                        *map = NULL;
                        ret = PTR_ERR(em2);
                        goto out;
                }

                dio_data->nocow_done = true;
        } else {
                /* Our caller expects us to free the input extent map. */
                free_extent_map(em);
                *map = NULL;

                if (nowait) {
                        ret = -EAGAIN;
                        goto out;
                }

                /*
                 * If we could not allocate data space before locking the file
                 * range and we can't do a NOCOW write, then we have to fail.
                 */
                if (!dio_data->data_space_reserved) {
                        ret = -ENOSPC;
                        goto out;
                }

                /*
                 * We have to COW and we have already reserved data space before,
                 * so now we reserve only metadata.
                 */
                ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
                                                      false);
                if (ret < 0)
                        goto out;
                space_reserved = true;

                em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        goto out;
                }
                *map = em;
                len = min(len, em->len - (start - em->start));
                /* The new extent is shorter, return the excess metadata. */
                if (len < prev_len)
                        btrfs_delalloc_release_metadata(BTRFS_I(inode),
                                                        prev_len - len, true);
        }

        /*
         * We have created our ordered extent, so we can now release our reservation
         * for an outstanding extent.
         */
        btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

        /*
         * Need to update the i_size under the extent lock so buffered
         * readers will get the updated i_size when we unlock.
         */
        if (start + len > i_size_read(inode))
                i_size_write(inode, start + len);
out:
        /* On failure undo the metadata reservation made in this function. */
        if (ret && space_reserved) {
                btrfs_delalloc_release_extents(BTRFS_I(inode), len);
                btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
        }
        *lenp = len;
        return ret;
}

/*
 * Map the file range [@start, @start + @length) for a direct IO request and
 * describe it in @iomap.
 *
 * In order, this: optionally flushes the range, resets the per-request
 * btrfs_dio_data found through the enclosing iomap_iter, tries to reserve data
 * space for blocking writes, locks the range in the inode's io tree, looks up
 * the extent map and, for writes, sets up the extent map and ordered extent
 * through btrfs_get_blocks_direct_write().
 *
 * For writes the whole locked range is unlocked before returning. For reads
 * only the tail beyond the mapped length is unlocked here.
 *
 * @srcmap is not used.
 *
 * Returns 0 on success with @iomap filled, or a negative errno. -ENOTBLK and
 * -EAGAIN are used to make the caller fall back (see the comments below). On
 * failure the range is unlocked again if it had been locked and any data space
 * reserved here is released.
 */
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
                loff_t length, unsigned int flags, struct iomap *iomap,
                struct iomap *srcmap)
{
        struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct extent_map *em;
        struct extent_state *cached_state = NULL;
        struct btrfs_dio_data *dio_data = iter->private;
        u64 lockstart, lockend;
        const bool write = !!(flags & IOMAP_WRITE);
        int ret = 0;
        u64 len = length;
        /* Data space is reserved for the full requested length, not for len. */
        const u64 data_alloc_len = length;
        bool unlock_extents = false;

        /*
         * We could potentially fault if we have a buffer > PAGE_SIZE, and if
         * we're NOWAIT we may submit a bio for a partial range and return
         * EIOCBQUEUED, which would result in an errant short read.
         *
         * The best way to handle this would be to allow for partial completions
         * of iocb's, so we could submit the partial bio, return and fault in
         * the rest of the pages, and then submit the io for the rest of the
         * range.  However we don't have that currently, so simply return
         * -EAGAIN at this point so that the normal path is used.
         */
        if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
                return -EAGAIN;

        /*
         * Cap the size of reads to that usually seen in buffered I/O as we need
         * to allocate a contiguous array for the checksums.
         */
        if (!write)
                len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);

        lockstart = start;
        lockend = start + len - 1;

        /*
         * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
         * enough if we've written compressed pages to this area, so we need to
         * flush the dirty pages again to make absolutely sure that any
         * outstanding dirty pages are on disk - the first flush only starts
         * compression on the data, while keeping the pages locked, so by the
         * time the second flush returns we know bios for the compressed pages
         * were submitted and finished, and the pages no longer under writeback.
         *
         * If we have a NOWAIT request and we have any pages in the range that
         * are locked, likely due to compression still in progress, we don't want
         * to block on page locks. We also don't want to block on pages marked as
         * dirty or under writeback (same as for the non-compression case).
         * iomap_dio_rw() did the same check, but after that and before we got
         * here, mmap'ed writes may have happened or buffered reads started
         * (readpage() and readahead(), which lock pages), as we haven't locked
         * the file range yet.
         */
        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                     &BTRFS_I(inode)->runtime_flags)) {
                if (flags & IOMAP_NOWAIT) {
                        if (filemap_range_needs_writeback(inode->i_mapping,
                                                          lockstart, lockend))
                                return -EAGAIN;
                } else {
                        ret = filemap_fdatawrite_range(inode->i_mapping, start,
                                                       start + length - 1);
                        if (ret)
                                return ret;
                }
        }

        /*
         * Reset the per-request state. Nothing before this point may jump to
         * the error labels below, as they read dio_data.
         */
        memset(dio_data, 0, sizeof(*dio_data));

        /*
         * We always try to allocate data space and must do it before locking
         * the file range, to avoid deadlocks with concurrent writes to the same
         * range if the range has several extents and the writes don't expand the
         * current i_size (the inode lock is taken in shared mode). If we fail to
         * allocate data space here we continue and later, after locking the
         * file range, we fail with ENOSPC only if we figure out we can not do a
         * NOCOW write.
         */
        if (write && !(flags & IOMAP_NOWAIT)) {
                ret = btrfs_check_data_free_space(BTRFS_I(inode),
                                                  &dio_data->data_reserved,
                                                  start, data_alloc_len, false);
                if (!ret)
                        dio_data->data_space_reserved = true;
                else if (ret && !(BTRFS_I(inode)->flags &
                                  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
                        goto err;
        }

        /*
         * If this errors out it's because we couldn't invalidate pagecache for
         * this range and we need to fallback to buffered IO, or we are doing a
         * NOWAIT read/write and we need to block.
         */
        ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
        if (ret < 0)
                goto err;

        em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto unlock_err;
        }

        /*
         * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
         * io.  INLINE is special, and we could probably kludge it in here, but
         * it's still buffered so for safety lets just fall back to the generic
         * buffered path.
         *
         * For COMPRESSED we _have_ to read the entire extent in so we can
         * decompress it, so there will be buffering required no matter what we
         * do, so go ahead and fallback to buffered.
         *
         * We return -ENOTBLK because that's what makes DIO go ahead and go back
         * to buffered IO.  Don't blame me, this is the price we pay for using
         * the generic code.
         */
        if (extent_map_is_compressed(em) ||
            em->block_start == EXTENT_MAP_INLINE) {
                free_extent_map(em);
                /*
                 * If we are in a NOWAIT context, return -EAGAIN in order to
                 * fallback to buffered IO. This is not only because we can
                 * block with buffered IO (no support for NOWAIT semantics at
                 * the moment) but also to avoid returning short reads to user
                 * space - this happens if we were able to read some data from
                 * previous non-compressed extents and then when we fallback to
                 * buffered IO, at btrfs_file_read_iter() by calling
                 * filemap_read(), we fail to fault in pages for the read buffer,
                 * in which case filemap_read() returns a short read (the number
                 * of bytes previously read is > 0, so it does not return -EFAULT).
                 */
                ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
                goto unlock_err;
        }

        /* Do not map past the end of the extent map we found. */
        len = min(len, em->len - (start - em->start));

        /*
         * If we have a NOWAIT request and the range contains multiple extents
         * (or a mix of extents and holes), then we return -EAGAIN to make the
         * caller fallback to a context where it can do a blocking (without
         * NOWAIT) request. This way we avoid doing partial IO and returning
         * success to the caller, which is not optimal for writes and for reads
         * it can result in unexpected behaviour for an application.
         *
         * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
         * iomap_dio_rw(), we can end up returning less data then what the caller
         * asked for, resulting in an unexpected, and incorrect, short read.
         * That is, the caller asked to read N bytes and we return less than that,
         * which is wrong unless we are crossing EOF. This happens if we get a
         * page fault error when trying to fault in pages for the buffer that is
         * associated to the struct iov_iter passed to iomap_dio_rw(), and we
         * have previously submitted bios for other extents in the range, in
         * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
         * those bios have completed by the time we get the page fault error,
         * which we return back to our caller - we should only return EIOCBQUEUED
         * after we have submitted bios for all the extents in the range.
         */
        if ((flags & IOMAP_NOWAIT) && len < length) {
                free_extent_map(em);
                ret = -EAGAIN;
                goto unlock_err;
        }

        if (write) {
                /*
                 * On failure the helper has already freed the extent map, so
                 * the error path below must not (and does not) touch em.
                 */
                ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
                                                    start, &len, flags);
                if (ret < 0)
                        goto unlock_err;
                /* Writes unlock the whole locked range before returning. */
                unlock_extents = true;
                /* Recalc len in case the new em is smaller than requested */
                len = min(len, em->len - (start - em->start));
                if (dio_data->data_space_reserved) {
                        u64 release_offset;
                        u64 release_len = 0;

                        /*
                         * A NOCOW write needs none of the data space reserved
                         * up front, so give all of it back. Otherwise give back
                         * only the part beyond the length we mapped.
                         */
                        if (dio_data->nocow_done) {
                                release_offset = start;
                                release_len = data_alloc_len;
                        } else if (len < data_alloc_len) {
                                release_offset = start + len;
                                release_len = data_alloc_len - len;
                        }

                        if (release_len > 0)
                                btrfs_free_reserved_data_space(BTRFS_I(inode),
                                                               dio_data->data_reserved,
                                                               release_offset,
                                                               release_len);
                }
        } else {
                /*
                 * We need to unlock only the end area that we aren't using.
                 * The rest is going to be unlocked by the endio routine.
                 */
                lockstart = start + len;
                if (lockstart < lockend)
                        unlock_extents = true;
        }

        if (unlock_extents)
                unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                              &cached_state);
        else
                free_extent_state(cached_state);

        /*
         * Translate extent map information to iomap.
         * We trim the extents (and move the addr) even though iomap code does
         * that, since we have locked only the parts we are performing I/O in.
         */
        if ((em->block_start == EXTENT_MAP_HOLE) ||
            ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
                iomap->addr = IOMAP_NULL_ADDR;
                iomap->type = IOMAP_HOLE;
        } else {
                iomap->addr = em->block_start + (start - em->start);
                iomap->type = IOMAP_MAPPED;
        }
        iomap->offset = start;
        iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
        iomap->length = len;
        free_extent_map(em);

        return 0;

unlock_err:
        unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                      &cached_state);
err:
        /* Give back the data space reserved before the range was locked. */
        if (dio_data->data_space_reserved) {
                btrfs_free_reserved_data_space(BTRFS_I(inode),
                                               dio_data->data_reserved,
                                               start, data_alloc_len);
                extent_changeset_free(dio_data->data_reserved);
        }

        return ret;
}

/*
 * iomap_end callback for btrfs direct I/O.
 *
 * Deals with the part of [pos, pos + length) for which no bio was submitted
 * (as tracked by dio_data->submitted): for reads that remainder is unlocked
 * in the inode's io_tree, for writes it is handed to
 * btrfs_finish_ordered_extent() with 'false' as the last argument.  For
 * writes the ordered extent reference and the reserved data changeset kept in
 * dio_data are dropped as well.
 *
 * Returns 0 when everything was submitted (or when reading from a hole) and
 * -ENOTBLK when only part of the range was submitted.
 */
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
                ssize_t written, unsigned int flags, struct iomap *iomap)
{
        struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
        struct btrfs_dio_data *dio_data = iter->private;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        const size_t submitted = dio_data->submitted;
        const bool is_write = (flags & IOMAP_WRITE) != 0;
        int ret = 0;

        /* Nothing is ever submitted when reading a hole, unlock all of it. */
        if (!is_write && iomap->type == IOMAP_HOLE) {
                unlock_extent(io_tree, pos, pos + length - 1, NULL);
                return 0;
        }

        if (submitted < length) {
                /* The tail of the range that never made it into a bio. */
                const loff_t rest_start = pos + submitted;
                const loff_t rest_len = length - submitted;

                if (is_write)
                        btrfs_finish_ordered_extent(dio_data->ordered, NULL,
                                                    rest_start, rest_len,
                                                    false);
                else
                        unlock_extent(io_tree, rest_start,
                                      rest_start + rest_len - 1, NULL);
                ret = -ENOTBLK;
        }

        if (is_write) {
                btrfs_put_ordered_extent(dio_data->ordered);
                dio_data->ordered = NULL;
                extent_changeset_free(dio_data->data_reserved);
        }

        return ret;
}

/*
 * End I/O handler installed by btrfs_dio_submit_io() for direct I/O bios.
 *
 * Emits a warning if the bio carries an error status, then either unlocks the
 * file range in the inode's io_tree (reads) or finishes the matching range of
 * the ordered extent (writes).  Finally the bio's bi_private is restored from
 * bbio->private and the bio is handed to iomap_dio_bio_end_io().
 */
static void btrfs_dio_end_io(struct btrfs_bio *bbio)
{
        struct bio *bio = &bbio->bio;
        struct btrfs_inode *inode = bbio->inode;
        struct btrfs_dio_private *dip =
                container_of(bbio, struct btrfs_dio_private, bbio);

        if (bio->bi_status) {
                btrfs_warn(inode->root->fs_info,
                "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
                           btrfs_ino(inode), bio->bi_opf,
                           dip->file_offset, dip->bytes, bio->bi_status);
        }

        if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
                /* Read: the range was left locked for us to release. */
                unlock_extent(&inode->io_tree, dip->file_offset,
                              dip->file_offset + dip->bytes - 1, NULL);
        } else {
                /* Write: report success or failure for this range. */
                btrfs_finish_ordered_extent(bbio->ordered, NULL,
                                            dip->file_offset, dip->bytes,
                                            !bio->bi_status);
        }

        bio->bi_private = bbio->private;
        iomap_dio_bio_end_io(bio);
}

/*
 * submit_io hook for iomap direct I/O.
 *
 * Initializes the btrfs_bio wrapping @bio, records the file offset and size
 * in the per-bio btrfs_dio_private, accounts the bio size in
 * dio_data->submitted and submits the bio.
 *
 * For writes the ordered extent is first matched to the bio with
 * btrfs_extract_ordered_extent().  If we are doing a partial write, the
 * ordered extent must be split to fit the submitted bio; the remaining,
 * unfinishable ordered extent stays in dio_data so that iomap_end can cancel
 * it.  That avoids a deadlock wherein faulting the remaining pages is blocked
 * on the outstanding ordered extent.
 */
static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
                                loff_t file_offset)
{
        struct btrfs_bio *bbio = btrfs_bio(bio);
        struct btrfs_dio_private *dip =
                container_of(bbio, struct btrfs_dio_private, bbio);
        struct btrfs_dio_data *dio_data = iter->private;
        struct btrfs_inode *inode = BTRFS_I(iter->inode);
        int ret = 0;

        btrfs_bio_init(bbio, inode->root->fs_info, btrfs_dio_end_io,
                       bio->bi_private);
        bbio->inode = inode;
        bbio->file_offset = file_offset;

        dip->file_offset = file_offset;
        dip->bytes = bio->bi_iter.bi_size;

        dio_data->submitted += bio->bi_iter.bi_size;

        if (iter->flags & IOMAP_WRITE)
                ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);

        if (!ret) {
                btrfs_submit_bio(bbio, 0);
                return;
        }

        /*
         * Extracting the ordered extent failed: fail this range of the
         * ordered extent and complete the bio with the error, without ever
         * submitting it.
         */
        btrfs_finish_ordered_extent(dio_data->ordered, NULL,
                                    file_offset, dip->bytes, false);
        bio->bi_status = errno_to_blk_status(ret);
        iomap_dio_bio_end_io(bio);
}

/* iomap begin/end callbacks passed to the iomap DIO helpers for reads and writes. */
static const struct iomap_ops btrfs_dio_iomap_ops = {
        .iomap_begin            = btrfs_dio_iomap_begin,
        .iomap_end              = btrfs_dio_iomap_end,
};

/*
 * Direct I/O hooks: bios are allocated from btrfs_dio_bioset and submitted
 * through btrfs_dio_submit_io().
 */
static const struct iomap_dio_ops btrfs_dio_ops = {
        .submit_io                = btrfs_dio_submit_io,
        .bio_set                = &btrfs_dio_bioset,
};

/*
 * Direct I/O read entry point: runs iomap_dio_rw() with the btrfs iomap and
 * DIO ops, IOMAP_DIO_PARTIAL and a zero-initialized btrfs_dio_data as the
 * private context.  Returns whatever iomap_dio_rw() returns.
 */
ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
{
        struct btrfs_dio_data dio_data = { 0 };
        ssize_t ret;

        ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
                           IOMAP_DIO_PARTIAL, &dio_data, done_before);
        return ret;
}

/*
 * Direct I/O write entry point: runs __iomap_dio_rw() with the btrfs iomap and
 * DIO ops, IOMAP_DIO_PARTIAL and a zero-initialized btrfs_dio_data as the
 * private context.  The result of __iomap_dio_rw() is returned unchanged to
 * the caller.
 */
struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
                                  size_t done_before)
{
        struct btrfs_dio_data dio_data = { 0 };
        struct iomap_dio *dio;

        dio = __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
                             IOMAP_DIO_PARTIAL, &dio_data, done_before);
        return dio;
}

/*
 * fiemap handler: validates the request with fiemap_prep(), optionally waits
 * for ordered extents (FIEMAP_FLAG_SYNC) and then lists the extents with
 * extent_fiemap() while holding the inode lock in shared mode.
 *
 * Returns 0 on success or the error returned by one of the helpers.
 */
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        u64 start, u64 len)
{
        struct btrfs_inode *binode = BTRFS_I(inode);
        bool wait_ordered;
        int ret;

        ret = fiemap_prep(inode, fieinfo, start, &len, 0);
        if (ret)
                return ret;

        wait_ordered = (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) != 0;

        /*
         * fiemap_prep() already did filemap_write_and_wait() on the whole
         * possible file range (0 to LLONG_MAX).  With compression enabled that
         * is not sufficient: the first filemap_fdatawrite_range() merely kicks
         * off compression in an async thread and returns before compression
         * finishes and writeback starts, so a second
         * filemap_fdatawrite_range() is required to wait for the compression
         * to complete and writeback to start.  Ordered extents must be waited
         * for too, since our fiemap implementation mostly relies on file
         * extent items to list extents and consults extent maps only for
         * hole or prealloc ranges, to detect delalloc there.
         */
        if (wait_ordered) {
                ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
                if (ret)
                        return ret;
        }

        btrfs_inode_lock(binode, BTRFS_ILOCK_SHARED);

        /*
         * The flush above was done without the inode lock so that we do not
         * hold it while triggering writeback and waiting for IO and ordered
         * extents.  Repeat it now that the lock is held, as a new write may
         * have slipped in between the two steps.
         */
        if (wait_ordered) {
                ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
                if (ret)
                        goto unlock;
        }

        ret = extent_fiemap(binode, fieinfo, start, len);
unlock:
        btrfs_inode_unlock(binode, BTRFS_ILOCK_SHARED);
        return ret;
}

/*
 * Wait until nobody holds the subpage spinlock of the folio backing @page.
 *
 * release_folio() and invalidate_folio() have a race window in which
 * folio_end_writeback() has been called but the subpage spinlock has not been
 * dropped yet.  Releasing/invalidating the page inside that window could lead
 * to a use-after-free of the subpage spinlock, so spin here until the lock is
 * free.  Does nothing when the mapping is not a subpage one.
 */
static void wait_subpage_spinlock(struct page *page)
{
        struct btrfs_fs_info *fs_info = page_to_fs_info(page);
        struct folio *folio = page_folio(page);
        struct btrfs_subpage *subpage;

        if (btrfs_is_subpage(fs_info, page->mapping)) {
                ASSERT(folio_test_private(folio) && folio_get_private(folio));
                subpage = folio_get_private(folio);

                /*
                 * Taking the lock only to drop it right away looks odd, but
                 * all we need is the guarantee that no one still holds it.
                 * The page is neither dirty nor under writeback and we hold
                 * the page lock, so the only possible holder is the endio
                 * function clearing page writeback.
                 *
                 * Once we got the lock every such holder has left, and it is
                 * safe to release/invalidate the page.
                 */
                spin_lock_irq(&subpage->lock);
                spin_unlock_irq(&subpage->lock);
        }
}

/*
 * Try to drop the extent mapping state of @folio.
 *
 * Returns false if try_release_extent_mapping() refuses.  Otherwise waits for
 * the subpage spinlock to be free, clears the page's extent-mapped state and
 * returns true.
 */
static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
        struct page *page = &folio->page;

        if (!try_release_extent_mapping(page, gfp_flags))
                return false;

        wait_subpage_spinlock(page);
        clear_page_extent_mapped(page);
        return true;
}

/*
 * Release @folio unless it is dirty or under writeback, in which case false
 * is returned without touching it.
 */
static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
        if (!folio_test_writeback(folio) && !folio_test_dirty(folio))
                return __btrfs_release_folio(folio, gfp_flags);

        return false;
}

#ifdef CONFIG_MIGRATION
/*
 * Migrate @src to @dst with filemap_migrate_folio() and, on success, carry
 * the Ordered flag over from the source folio to the destination folio.
 *
 * Returns the result of filemap_migrate_folio().
 */
static int btrfs_migrate_folio(struct address_space *mapping,
                             struct folio *dst, struct folio *src,
                             enum migrate_mode mode)
{
        int ret;

        ret = filemap_migrate_folio(mapping, dst, src, mode);
        if (ret == MIGRATEPAGE_SUCCESS && folio_test_ordered(src)) {
                folio_clear_ordered(src);
                folio_set_ordered(dst);
        }

        return ret;
}
#else
#define btrfs_migrate_folio NULL
#endif

/*
 * Invalidate the byte range [@offset, @offset + @length) of @folio.
 *
 * After waiting for writeback (and the subpage spinlock), a range that does
 * not cover the whole folio only gets a best-effort btrfs_release_folio().
 * For a full-folio invalidation the function walks every ordered extent
 * overlapping the folio, clears the folio's Ordered state for it, marks the
 * ordered extent as truncated, does the pending ordered accounting, frees the
 * qgroup reserved data and clears the extent state bits of the range.  The
 * io_tree is only locked/modified when the inode is not being evicted
 * (I_FREEING not set).  Finally the folio's extent mapping is dropped.
 */
static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
                                 size_t length)
{
        struct btrfs_inode *inode = folio_to_inode(folio);
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct extent_io_tree *tree = &inode->io_tree;
        struct extent_state *cached_state = NULL;
        u64 page_start = folio_pos(folio);
        u64 page_end = page_start + folio_size(folio) - 1;
        u64 cur;
        int inode_evicting = inode->vfs_inode.i_state & I_FREEING;

        /*
         * We have folio locked so no new ordered extent can be created on this
         * page, nor bio can be submitted for this folio.
         *
         * But already submitted bio can still be finished on this folio.
         * Furthermore, endio function won't skip folio which has Ordered
         * (Private2) already cleared, so it's possible for endio and
         * invalidate_folio to do the same ordered extent accounting twice
         * on one folio.
         *
         * So here we wait for any submitted bios to finish, so that we won't
         * do double ordered extent accounting on the same folio.
         */
        folio_wait_writeback(folio);
        wait_subpage_spinlock(&folio->page);

        /*
         * For subpage case, we have call sites like
         * btrfs_punch_hole_lock_range() which passes range not aligned to
         * sectorsize.
         * If the range doesn't cover the full folio, we don't need to and
         * shouldn't clear page extent mapped, as folio->private can still
         * record subpage dirty bits for other part of the range.
         *
         * For cases that invalidate the full folio even the range doesn't
         * cover the full folio, like invalidating the last folio, we're
         * still safe to wait for ordered extent to finish.
         */
        if (!(offset == 0 && length == folio_size(folio))) {
                btrfs_release_folio(folio, GFP_NOFS);
                return;
        }

        if (!inode_evicting)
                lock_extent(tree, page_start, page_end, &cached_state);

        /*
         * Walk the folio range piece by piece: each iteration handles
         * [cur, range_end], which is either a gap without ordered extent or
         * the part of one ordered extent that lies inside the folio.
         */
        cur = page_start;
        while (cur < page_end) {
                struct btrfs_ordered_extent *ordered;
                u64 range_end;
                u32 range_len;
                u32 extra_flags = 0;

                ordered = btrfs_lookup_first_ordered_range(inode, cur,
                                                           page_end + 1 - cur);
                if (!ordered) {
                        range_end = page_end;
                        /*
                         * No ordered extent covering this range, we are safe
                         * to delete all extent states in the range.
                         */
                        extra_flags = EXTENT_CLEAR_ALL_BITS;
                        goto next;
                }
                if (ordered->file_offset > cur) {
                        /*
                         * There is a range between [cur, oe->file_offset) not
                         * covered by any ordered extent.
                         * We are safe to delete all extent states, and handle
                         * the ordered extent in the next iteration.
                         */
                        range_end = ordered->file_offset - 1;
                        extra_flags = EXTENT_CLEAR_ALL_BITS;
                        goto next;
                }

                /* Clamp the ordered extent's end to the end of the folio. */
                range_end = min(ordered->file_offset + ordered->num_bytes - 1,
                                page_end);
                ASSERT(range_end + 1 - cur < U32_MAX);
                range_len = range_end + 1 - cur;
                if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
                        /*
                         * If Ordered (Private2) is cleared, it means endio has
                         * already been executed for the range.
                         * We can't delete the extent states as
                         * btrfs_finish_ordered_io() may still use some of them.
                         */
                        goto next;
                }
                btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);

                /*
                 * IO on this page will never be started, so we need to account
                 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
                 * here, must leave that up for the ordered extent completion.
                 *
                 * This will also unlock the range for incoming
                 * btrfs_finish_ordered_io().
                 */
                if (!inode_evicting)
                        clear_extent_bit(tree, cur, range_end,
                                         EXTENT_DELALLOC |
                                         EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
                                         EXTENT_DEFRAG, &cached_state);

                /*
                 * Flag the ordered extent as truncated and shrink its
                 * truncated length to where this range starts within it.
                 */
                spin_lock_irq(&inode->ordered_tree_lock);
                set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
                ordered->truncated_len = min(ordered->truncated_len,
                                             cur - ordered->file_offset);
                spin_unlock_irq(&inode->ordered_tree_lock);

                /*
                 * If the ordered extent has finished, we're safe to delete all
                 * the extent states of the range, otherwise
                 * btrfs_finish_ordered_io() will get executed by endio for
                 * other pages, so we can't delete extent states.
                 */
                if (btrfs_dec_test_ordered_pending(inode, &ordered,
                                                   cur, range_end + 1 - cur)) {
                        btrfs_finish_ordered_io(ordered);
                        /*
                         * The ordered extent has finished, now we're again
                         * safe to delete all extent states of the range.
                         */
                        extra_flags = EXTENT_CLEAR_ALL_BITS;
                }
next:
                /* Drop the reference taken by the lookup above, if any. */
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
                /*
                 * Qgroup reserved space handler
                 * Sector(s) here will be either:
                 *
                 * 1) Already written to disk or bio already finished
                 *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
                 *    Qgroup will be handled by its qgroup_record then.
                 *    btrfs_qgroup_free_data() call will do nothing here.
                 *
                 * 2) Not written to disk yet
                 *    Then btrfs_qgroup_free_data() call will clear the
                 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
                 *    reserved data space.
                 *    Since the IO will never happen for this page.
                 */
                btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
                if (!inode_evicting) {
                        clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
                                 EXTENT_DELALLOC | EXTENT_UPTODATE |
                                 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
                                 extra_flags, &cached_state);
                }
                cur = range_end + 1;
        }
        /*
         * We have iterated through all ordered extents of the page, the page
         * should not have Ordered (Private2) anymore, or the above iteration
         * did something wrong.
         */
        ASSERT(!folio_test_ordered(folio));
        btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
        if (!inode_evicting)
                __btrfs_release_folio(folio, GFP_NOFS);
        clear_page_extent_mapped(&folio->page);
}

/*
 * Truncate the items of @inode down to its current i_size.
 *
 * @inode:          the inode to truncate; the target size is read from
 *                  inode->vfs_inode.i_size
 * @skip_writeback: when false, first wait for ordered extents from the
 *                  sector-aligned i_size to the end of the file
 *
 * The work is done in a loop of transactions: each pass truncates as many
 * items as the temporary block reservation allows and, on -ENOSPC/-EAGAIN,
 * updates the inode, ends the transaction and starts a new one.  If the item
 * truncation asks for it (BTRFS_NEED_TRUNCATE_BLOCK), the last partial block
 * is handled by btrfs_truncate_block() outside of any transaction.
 *
 * Returns 0 on success or the first error encountered.
 */
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
{
        struct btrfs_truncate_control control = {
                .inode = inode,
                .ino = btrfs_ino(inode),
                .min_type = BTRFS_EXTENT_DATA_KEY,
                .clear_extent_range = true,
        };
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *rsv;
        int ret;
        struct btrfs_trans_handle *trans;
        u64 mask = fs_info->sectorsize - 1;
        const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);

        if (!skip_writeback) {
                ret = btrfs_wait_ordered_range(&inode->vfs_inode,
                                               inode->vfs_inode.i_size & (~mask),
                                               (u64)-1);
                if (ret)
                        return ret;
        }

        /*
         * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
         * things going on here:
         *
         * 1) We need to reserve space to update our inode.
         *
         * 2) We need to have something to cache all the space that is going to
         * be free'd up by the truncate operation, but also have some slack
         * space reserved in case it uses space during the truncate (thank you
         * very much snapshotting).
         *
         * And we need these to be separate.  The fact is we can use a lot of
         * space doing the truncate, and we have no earthly idea how much space
         * we will use, so we need the truncate reservation to be separate so it
         * doesn't end up using space reserved for updating the inode.  We also
         * need to be able to stop the transaction and start a new one, which
         * means we need to be able to update the inode several times, and we
         * have no way of knowing how many times that will be, so we can't just
         * reserve 1 item for the entirety of the operation, so that has to be
         * done separately as well.
         *
         * So that leaves us with
         *
         * 1) rsv - for the truncate reservation, which we will steal from the
         * transaction reservation.
         * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for
         * updating the inode.
         */
        rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
        if (!rsv)
                return -ENOMEM;
        rsv->size = min_size;
        rsv->failfast = true;

        /*
         * 1 for the truncate slack space
         * 1 for updating the inode.
         */
        trans = btrfs_start_transaction(root, 2);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out;
        }

        /* Migrate the slack space for the truncate to our reserve */
        ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
                                      min_size, false);
        /*
         * We have reserved 2 metadata units when we started the transaction and
         * min_size matches 1 unit, so this should never fail, but if it does,
         * it's not critical we just fail truncation.
         */
        if (WARN_ON(ret)) {
                btrfs_end_transaction(trans);
                goto out;
        }

        trans->block_rsv = rsv;

        while (1) {
                struct extent_state *cached_state = NULL;
                const u64 new_size = inode->vfs_inode.i_size;
                const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);

                control.new_size = new_size;
                lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
                /*
                 * We want to drop from the next block forward in case this new
                 * size is not block aligned since we will be keeping the last
                 * block of the extent just the way it is.
                 */
                btrfs_drop_extent_map_range(inode,
                                            ALIGN(new_size, fs_info->sectorsize),
                                            (u64)-1, false);

                ret = btrfs_truncate_inode_items(trans, root, &control);

                /*
                 * Apply what this pass reported, regardless of its return
                 * value: subtract the dropped bytes and record the size
                 * reached so far.
                 */
                inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
                btrfs_inode_safe_disk_i_size_write(inode, control.last_size);

                unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);

                /* Inode updates must use the transaction's own reservation. */
                trans->block_rsv = &fs_info->trans_block_rsv;
                /* Only -ENOSPC and -EAGAIN mean "start over in a new transaction". */
                if (ret != -ENOSPC && ret != -EAGAIN)
                        break;

                ret = btrfs_update_inode(trans, inode);
                if (ret)
                        break;

                btrfs_end_transaction(trans);
                btrfs_btree_balance_dirty(fs_info);

                trans = btrfs_start_transaction(root, 2);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        /* Tell the cleanup below that there is no handle to end. */
                        trans = NULL;
                        break;
                }

                btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
                ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
                                              rsv, min_size, false);
                /*
                 * We have reserved 2 metadata units when we started the
                 * transaction and min_size matches 1 unit, so this should never
                 * fail, but if it does, it's not critical we just fail truncation.
                 */
                if (WARN_ON(ret))
                        break;

                trans->block_rsv = rsv;
        }

        /*
         * We can't call btrfs_truncate_block inside a trans handle as we could
         * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
         * know we've truncated everything except the last little bit, and can
         * do btrfs_truncate_block and then update the disk_i_size.
         */
        if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
                btrfs_end_transaction(trans);
                btrfs_btree_balance_dirty(fs_info);

                ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
                if (ret)
                        goto out;
                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        goto out;
                }
                btrfs_inode_safe_disk_i_size_write(inode, 0);
        }

        if (trans) {
                int ret2;

                /* Keep the first error; later failures must not overwrite it. */
                trans->block_rsv = &fs_info->trans_block_rsv;
                ret2 = btrfs_update_inode(trans, inode);
                if (ret2 && !ret)
                        ret = ret2;

                ret2 = btrfs_end_transaction(trans);
                if (ret2 && !ret)
                        ret = ret2;
                btrfs_btree_balance_dirty(fs_info);
        }
out:
        btrfs_free_block_rsv(fs_info, rsv);
        /*
         * So if we truncate and then write and fsync we normally would just
         * write the extents that changed, which is a problem if we need to
         * first truncate that entire inode.  So set this flag so we write out
         * all of the extents in the inode to the sync log so we're completely
         * safe.
         *
         * If no extents were dropped or trimmed we don't need to force the next
         * fsync to truncate all the inode's items from the log and re-log them
         * all. This means the truncate operation did not change the file size,
         * or changed it to a smaller size but there was only an implicit hole
         * between the old i_size and the new i_size, and there were no prealloc
         * extents beyond i_size to drop.
         */
        if (control.extents_found > 0)
                btrfs_set_inode_full_sync(inode);

        return ret;
}

/*
 * Allocate a new VFS inode on @dir's superblock, set up as a directory for a
 * subvolume: ownership is initialized through @idmap and the directory inode
 * and file operations are installed.
 *
 * Returns the new inode, or NULL if new_inode() fails.
 */
struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
                                     struct inode *dir)
{
        struct inode *inode = new_inode(dir->i_sb);

        if (!inode)
                return NULL;

        /*
         * Subvolumes don't inherit the sgid bit or the parent's gid if
         * the parent's sgid bit is set. This is probably a bug.
         */
        inode_init_owner(idmap, inode, NULL,
                         S_IFDIR | (~current_umask() & S_IRWXUGO));
        inode->i_op = &btrfs_dir_inode_operations;
        inode->i_fop = &btrfs_dir_file_operations;

        return inode;
}

/*
 * Allocate a btrfs inode from btrfs_inode_cachep and reset all btrfs specific
 * fields to their initial state.
 *
 * A separate file extent io tree is allocated only when an fs_info is present
 * and the NO_HOLES incompat feature is not enabled; otherwise
 * ei->file_extent_tree stays NULL.
 *
 * Returns the embedded VFS inode, or NULL if an allocation fails.
 */
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_inode *ei;
        struct inode *inode;
        struct extent_io_tree *file_extent_tree = NULL;

        /* Self tests may pass a NULL fs_info. */
        if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
                file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
                if (!file_extent_tree)
                        return NULL;
        }

        ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
        if (!ei) {
                /* kfree(NULL) is fine if no tree was allocated above. */
                kfree(file_extent_tree);
                return NULL;
        }

        ei->root = NULL;
        ei->generation = 0;
        ei->last_trans = 0;
        ei->last_sub_trans = 0;
        ei->logged_trans = 0;
        ei->delalloc_bytes = 0;
        ei->new_delalloc_bytes = 0;
        ei->defrag_bytes = 0;
        ei->disk_i_size = 0;
        ei->flags = 0;
        ei->ro_flags = 0;
        ei->csum_bytes = 0;
        ei->index_cnt = (u64)-1;
        ei->dir_index = 0;
        ei->last_unlink_trans = 0;
        ei->last_reflink_trans = 0;
        ei->last_log_commit = 0;

        spin_lock_init(&ei->lock);
        ei->outstanding_extents = 0;
        /* The block reservation is not set up for self-test superblocks. */
        if (sb->s_magic != BTRFS_TEST_MAGIC)
                btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
                                              BTRFS_BLOCK_RSV_DELALLOC);
        ei->runtime_flags = 0;
        ei->prop_compress = BTRFS_COMPRESS_NONE;
        ei->defrag_compress = BTRFS_COMPRESS_NONE;

        ei->delayed_node = NULL;

        ei->i_otime_sec = 0;
        ei->i_otime_nsec = 0;

        inode = &ei->vfs_inode;
        extent_map_tree_init(&ei->extent_tree);

        /* This io tree sets the valid inode. */
        extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
        ei->io_tree.inode = ei;

        ei->file_extent_tree = file_extent_tree;
        if (file_extent_tree) {
                extent_io_tree_init(fs_info, ei->file_extent_tree,
                                    IO_TREE_INODE_FILE_EXTENT);
                /* Lockdep class is set only for the file extent tree. */
                lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
        }
        mutex_init(&ei->log_mutex);
        spin_lock_init(&ei->ordered_tree_lock);
        ei->ordered_tree = RB_ROOT;
        ei->ordered_tree_last = NULL;
        INIT_LIST_HEAD(&ei->delalloc_inodes);
        INIT_LIST_HEAD(&ei->delayed_iput);
        RB_CLEAR_NODE(&ei->rb_node);
        init_rwsem(&ei->i_mmap_lock);

        return inode;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity-test counterpart of inode destruction: drops all extent maps of the
 * inode, then frees its file extent tree and the btrfs inode itself.
 */
void btrfs_test_destroy_inode(struct inode *inode)
{
        struct btrfs_inode *binode = BTRFS_I(inode);

        btrfs_drop_extent_map_range(binode, 0, (u64)-1, false);
        kfree(binode->file_extent_tree);
        kmem_cache_free(btrfs_inode_cachep, binode);
}
#endif

void btrfs_free_inode(struct inode *inode)
{
        kfree(BTRFS_I(inode)->file_extent_tree);
        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}

void btrfs_destroy_inode(struct inode *vfs_inode)
{
        struct btrfs_ordered_extent *ordered;
        struct btrfs_inode *inode = BTRFS_I(vfs_inode);
        struct btrfs_root *root = inode->root;
        bool freespace_inode;

        WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
        WARN_ON(vfs_inode->i_data.nrpages);
        WARN_ON(inode->block_rsv.reserved);
        WARN_ON(inode->block_rsv.size);
        WARN_ON(inode->outstanding_extents);
        if (!S_ISDIR(vfs_inode->i_mode)) {
                WARN_ON(inode->delalloc_bytes);
                WARN_ON(inode->new_delalloc_bytes);
        }
        WARN_ON(inode->csum_bytes);
        WARN_ON(inode->defrag_bytes);

        /*
         * This can happen where we create an inode, but somebody else also
         * created the same inode and we need to destroy the one we already
         * created.
         */
        if (!root)
                return;

        /*
         * If this is a free space inode do not take the ordered extents lockdep
         * map.
         */
        freespace_inode = btrfs_is_free_space_inode(inode);

        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
                if (!ordered)
                        break;
                else {
                        btrfs_err(root->fs_info,
                                  "found ordered extent %llu %llu on inode cleanup",
                                  ordered->file_offset, ordered->num_bytes);

                        if (!freespace_inode)
                                btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);

                        btrfs_remove_ordered_extent(inode, ordered);
                        btrfs_put_ordered_extent(ordered);
                        btrfs_put_ordered_extent(ordered);
                }
        }
        btrfs_qgroup_check_reserved_leak(inode);
        inode_tree_del(inode);
        btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
        btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
        btrfs_put_root(inode->root);
}

/*
 * Decide whether @inode should be dropped.
 *
 * Returns 1 when the inode has no root or when its root has no references
 * left (the snapshot/subvolume tree is being deleted); otherwise defers to
 * generic_drop_inode().
 */
int btrfs_drop_inode(struct inode *inode)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;

        if (!root || btrfs_root_refs(&root->root_item) == 0)
                return 1;

        return generic_drop_inode(inode);
}

/*
 * Slab constructor for btrfs_inode_cachep: performs the one-time
 * initialization of the embedded VFS inode.
 */
static void init_once(void *foo)
{
        struct btrfs_inode *binode = foo;

        inode_init_once(&binode->vfs_inode);
}

/*
 * Release what btrfs_init_cachep() set up: btrfs_dio_bioset and the
 * btrfs_inode slab cache.  Also used as the failure path of
 * btrfs_init_cachep(), so it may run when only part of the setup succeeded.
 */
void __cold btrfs_destroy_cachep(void)
{
        /*
         * Make sure all delayed rcu free inodes are flushed before we
         * destroy cache.
         */
        rcu_barrier();
        bioset_exit(&btrfs_dio_bioset);
        kmem_cache_destroy(btrfs_inode_cachep);
}

/*
 * Create the "btrfs_inode" slab cache (constructed with init_once()) and
 * initialise btrfs_dio_bioset.
 *
 * Returns 0 when both steps succeed.  If either one fails, everything is torn
 * down again through btrfs_destroy_cachep() and -ENOMEM is returned.
 */
int __init btrfs_init_cachep(void)
{
        btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
                                               sizeof(struct btrfs_inode), 0,
                                               SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
                                               init_once);
        if (btrfs_inode_cachep) {
                /* The bio sits inside struct btrfs_dio_private at bbio.bio. */
                int bioset_ret = bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
                                             offsetof(struct btrfs_dio_private, bbio.bio),
                                             BIOSET_NEED_BVECS);

                if (!bioset_ret)
                        return 0;
        }

        /* Shared failure path for both allocations. */
        btrfs_destroy_cachep();
        return -ENOMEM;
}

/*
 * Fill @stat for the inode behind @path.
 *
 * On top of what generic_fillattr() provides, this reports:
 *  - the creation time (STATX_BTIME) from the btrfs inode's i_otime fields,
 *  - the append/compressed/immutable/nodump/verity attribute bits derived
 *    from the btrfs inode flags,
 *  - the root's anon_dev as the device and the root's objectid as the
 *    subvolume id (STATX_SUBVOL),
 *  - a block count that also covers the inode's new_delalloc_bytes.
 *
 * Every attribute bit that may be set in stat->attributes is also advertised
 * in stat->attributes_mask, including STATX_ATTR_VERITY, so callers can tell
 * "attribute clear" apart from "attribute not reported".
 *
 * @flags is not used here.  Always returns 0.
 */
static int btrfs_getattr(struct mnt_idmap *idmap,
                         const struct path *path, struct kstat *stat,
                         u32 request_mask, unsigned int flags)
{
        u64 delalloc_bytes;
        u64 inode_bytes;
        struct inode *inode = d_inode(path->dentry);
        u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
        u32 bi_flags = BTRFS_I(inode)->flags;
        u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;

        stat->result_mask |= STATX_BTIME;
        stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
        stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
        if (bi_flags & BTRFS_INODE_APPEND)
                stat->attributes |= STATX_ATTR_APPEND;
        if (bi_flags & BTRFS_INODE_COMPRESS)
                stat->attributes |= STATX_ATTR_COMPRESSED;
        if (bi_flags & BTRFS_INODE_IMMUTABLE)
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        if (bi_flags & BTRFS_INODE_NODUMP)
                stat->attributes |= STATX_ATTR_NODUMP;
        if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
                stat->attributes |= STATX_ATTR_VERITY;

        /* Keep this in sync with the attribute bits set above. */
        stat->attributes_mask |= (STATX_ATTR_APPEND |
                                  STATX_ATTR_COMPRESSED |
                                  STATX_ATTR_IMMUTABLE |
                                  STATX_ATTR_NODUMP |
                                  STATX_ATTR_VERITY);

        generic_fillattr(idmap, request_mask, inode, stat);
        /* Override the device set by generic_fillattr() with the root's. */
        stat->dev = BTRFS_I(inode)->root->anon_dev;

        stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
        stat->result_mask |= STATX_SUBVOL;

        /* Sample both byte counters under the inode lock for a consistent pair. */
        spin_lock(&BTRFS_I(inode)->lock);
        delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
        inode_bytes = inode_get_bytes(inode);
        spin_unlock(&BTRFS_I(inode)->lock);
        /* Round each counter up to the sector size, then convert to sectors. */
        stat->blocks = (ALIGN(inode_bytes, blocksize) +
                        ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
        return 0;
}

/*
 * Exchange two existing directory entries (the RENAME_EXCHANGE case of
 * btrfs_rename2()): after success old_dentry's inode is linked under
 * new_dentry's name in new_dir and vice versa.  All tree changes are made
 * inside one transaction started on old_dir's root.
 *
 * An inode whose number is BTRFS_FIRST_FREE_OBJECTID is treated as a
 * subvolume: it is unlinked with btrfs_unlink_subvol(), forces a full log
 * commit, and fs_info->subvol_sem is held for reading while it is moved.
 * Any other inode gets a new inode ref, is unlinked with
 * __btrfs_unlink_inode() and has its new name logged.
 *
 * Returns 0 on success or a negative errno.  -EXDEV is returned when the two
 * directories live in different roots unless both entries are subvolumes.
 */
static int btrfs_rename_exchange(struct inode *old_dir,
                              struct dentry *old_dentry,
                              struct inode *new_dir,
                              struct dentry *new_dentry)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
        struct btrfs_trans_handle *trans;
        unsigned int trans_num_items;
        struct btrfs_root *root = BTRFS_I(old_dir)->root;
        struct btrfs_root *dest = BTRFS_I(new_dir)->root;
        struct inode *new_inode = new_dentry->d_inode;
        struct inode *old_inode = old_dentry->d_inode;
        struct btrfs_rename_ctx old_rename_ctx;
        struct btrfs_rename_ctx new_rename_ctx;
        u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
        u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
        u64 old_idx = 0;
        u64 new_idx = 0;
        int ret;
        int ret2;
        bool need_abort = false;
        struct fscrypt_name old_fname, new_fname;
        struct fscrypt_str *old_name, *new_name;

        /*
         * For non-subvolumes allow exchange only within one subvolume, in the
         * same inode namespace. Two subvolumes (represented as directory) can
         * be exchanged as they're a logical link and have a fixed inode number.
         */
        if (root != dest &&
            (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
             new_ino != BTRFS_FIRST_FREE_OBJECTID))
                return -EXDEV;

        ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
        if (ret)
                return ret;

        ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
        if (ret) {
                fscrypt_free_filename(&old_fname);
                return ret;
        }

        old_name = &old_fname.disk_name;
        new_name = &new_fname.disk_name;

        /* close the race window with snapshot create/destroy ioctl */
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
            new_ino == BTRFS_FIRST_FREE_OBJECTID)
                down_read(&fs_info->subvol_sem);

        /*
         * For each inode:
         * 1 to remove old dir item
         * 1 to remove old dir index
         * 1 to add new dir item
         * 1 to add new dir index
         * 1 to update parent inode
         *
         * If the parents are the same, we only need to account for one
         */
        trans_num_items = (old_dir == new_dir ? 9 : 10);
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
                /*
                 * 1 to remove old root ref
                 * 1 to remove old root backref
                 * 1 to add new root ref
                 * 1 to add new root backref
                 */
                trans_num_items += 4;
        } else {
                /*
                 * 1 to update inode item
                 * 1 to remove old inode ref
                 * 1 to add new inode ref
                 */
                trans_num_items += 3;
        }
        /* Same accounting as above, this time for the destination inode. */
        if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
                trans_num_items += 4;
        else
                trans_num_items += 3;
        trans = btrfs_start_transaction(root, trans_num_items);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out_notrans;
        }

        /* The transaction was started on @root; record @dest in it as well. */
        if (dest != root) {
                ret = btrfs_record_root_in_trans(trans, dest);
                if (ret)
                        goto out_fail;
        }

        /*
         * We need to find a free sequence number both in the source and
         * in the destination directory for the exchange.
         */
        ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
        if (ret)
                goto out_fail;
        ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
        if (ret)
                goto out_fail;

        BTRFS_I(old_inode)->dir_index = 0ULL;
        BTRFS_I(new_inode)->dir_index = 0ULL;

        /* Reference for the source. */
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(trans);
        } else {
                ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
                                             btrfs_ino(BTRFS_I(new_dir)),
                                             old_idx);
                if (ret)
                        goto out_fail;
                /*
                 * An inode ref has been inserted, so a failure while inserting
                 * the second one below aborts the transaction.
                 */
                need_abort = true;
        }

        /* And now for the dest. */
        if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(trans);
        } else {
                ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
                                             btrfs_ino(BTRFS_I(old_dir)),
                                             new_idx);
                if (ret) {
                        if (need_abort)
                                btrfs_abort_transaction(trans, ret);
                        goto out_fail;
                }
        }

        /* Update inode version and ctime/mtime. */
        inode_inc_iversion(old_dir);
        inode_inc_iversion(new_dir);
        inode_inc_iversion(old_inode);
        inode_inc_iversion(new_inode);
        simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);

        if (old_dentry->d_parent != new_dentry->d_parent) {
                btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
                                        BTRFS_I(old_inode), true);
                btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
                                        BTRFS_I(new_inode), true);
        }

        /* src is a subvolume */
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
                ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
        } else { /* src is an inode */
                ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
                                           BTRFS_I(old_dentry->d_inode),
                                           old_name, &old_rename_ctx);
                if (!ret)
                        ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
        }
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_fail;
        }

        /* dest is a subvolume */
        if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
                ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
        } else { /* dest is an inode */
                ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
                                           BTRFS_I(new_dentry->d_inode),
                                           new_name, &new_rename_ctx);
                if (!ret)
                        ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
        }
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_fail;
        }

        /* Link each inode under the other entry's name and directory. */
        ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
                             new_name, 0, old_idx);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_fail;
        }

        ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
                             old_name, 0, new_idx);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_fail;
        }

        /* Only an inode with a single link gets its new dir index recorded. */
        if (old_inode->i_nlink == 1)
                BTRFS_I(old_inode)->dir_index = old_idx;
        if (new_inode->i_nlink == 1)
                BTRFS_I(new_inode)->dir_index = new_idx;

        /*
         * Now pin the logs of the roots. We do it to ensure that no other task
         * can sync the logs while we are in progress with the rename, because
         * that could result in an inconsistency in case any of the inodes that
         * are part of this rename operation were logged before.
         */
        if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
                btrfs_pin_log_trans(root);
        if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
                btrfs_pin_log_trans(dest);

        /* Do the log updates for all inodes. */
        if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
                btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
                                   old_rename_ctx.index, new_dentry->d_parent);
        if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
                btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
                                   new_rename_ctx.index, old_dentry->d_parent);

        /* Now unpin the logs. */
        if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
                btrfs_end_log_trans(root);
        if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
                btrfs_end_log_trans(dest);
out_fail:
        /* Report the first error; otherwise the result of ending the transaction. */
        ret2 = btrfs_end_transaction(trans);
        ret = ret ? ret : ret2;
out_notrans:
        /* Mirrors the conditional down_read() taken before the transaction. */
        if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
            old_ino == BTRFS_FIRST_FREE_OBJECTID)
                up_read(&fs_info->subvol_sem);

        fscrypt_free_filename(&new_fname);
        fscrypt_free_filename(&old_fname);
        return ret;
}

static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
                                        struct inode *dir)
{
        struct inode *inode;

        inode = new_inode(dir->i_sb);
        if (inode) {
                inode_init_owner(idmap, inode, dir,
                                 S_IFCHR | WHITEOUT_MODE);
                inode->i_op = &btrfs_special_inode_operations;
                init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
        }
        return inode;
}

/*
 * Move old_dentry's inode from old_dir to new_dentry's name in new_dir,
 * unlinking the inode currently at new_dentry if there is one.  With
 * RENAME_WHITEOUT in @flags a whiteout inode is additionally created using
 * old_dir/old_dentry as its directory and dentry.  All tree changes are made
 * inside one transaction started on old_dir's root.
 *
 * An old inode numbered BTRFS_FIRST_FREE_OBJECTID is treated as a subvolume:
 * fs_info->subvol_sem is held for reading, a full log commit is forced and
 * btrfs_unlink_subvol() is used instead of __btrfs_unlink_inode().
 *
 * Returns 0 on success or a negative errno, among them:
 *  -EPERM     new_dir is an empty-subvolume placeholder directory
 *  -EXDEV     the directories are in different roots and the old inode is
 *             not a subvolume
 *  -ENOTEMPTY the old inode is an empty-subvolume placeholder, the target is
 *             a subvolume, or a directory is renamed over a non-empty one
 *  -ENOMEM    the whiteout inode could not be allocated
 */
static int btrfs_rename(struct mnt_idmap *idmap,
                        struct inode *old_dir, struct dentry *old_dentry,
                        struct inode *new_dir, struct dentry *new_dentry,
                        unsigned int flags)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
        struct btrfs_new_inode_args whiteout_args = {
                .dir = old_dir,
                .dentry = old_dentry,
        };
        struct btrfs_trans_handle *trans;
        unsigned int trans_num_items;
        struct btrfs_root *root = BTRFS_I(old_dir)->root;
        struct btrfs_root *dest = BTRFS_I(new_dir)->root;
        struct inode *new_inode = d_inode(new_dentry);
        struct inode *old_inode = d_inode(old_dentry);
        struct btrfs_rename_ctx rename_ctx;
        u64 index = 0;
        int ret;
        int ret2;
        u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
        struct fscrypt_name old_fname, new_fname;

        if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
                return -EPERM;

        /* we only allow rename subvolume link between subvolumes */
        if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
                return -EXDEV;

        if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
            (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
                return -ENOTEMPTY;

        /* A directory may only replace a directory that is empty. */
        if (S_ISDIR(old_inode->i_mode) && new_inode &&
            new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;

        ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
        if (ret)
                return ret;

        ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
        if (ret) {
                fscrypt_free_filename(&old_fname);
                return ret;
        }

        /* check for collisions, even if the  name isn't there */
        ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
        if (ret) {
                if (ret == -EEXIST) {
                        /* we shouldn't get
                         * eexist without a new_inode */
                        if (WARN_ON(!new_inode)) {
                                goto out_fscrypt_names;
                        }
                } else {
                        /* maybe -EOVERFLOW */
                        goto out_fscrypt_names;
                }
        }
        /* -EEXIST with an existing target inode is expected; clear it. */
        ret = 0;

        /*
         * we're using rename to replace one file with another.  Start IO on it
         * now so  we don't add too much work to the end of the transaction
         */
        if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
                filemap_flush(old_inode->i_mapping);

        if (flags & RENAME_WHITEOUT) {
                whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
                if (!whiteout_args.inode) {
                        ret = -ENOMEM;
                        goto out_fscrypt_names;
                }
                /* Also seeds trans_num_items with the items the new inode needs. */
                ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
                if (ret)
                        goto out_whiteout_inode;
        } else {
                /* 1 to update the old parent inode. */
                trans_num_items = 1;
        }

        if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
                /* Close the race window with snapshot create/destroy ioctl */
                down_read(&fs_info->subvol_sem);
                /*
                 * 1 to remove old root ref
                 * 1 to remove old root backref
                 * 1 to add new root ref
                 * 1 to add new root backref
                 */
                trans_num_items += 4;
        } else {
                /*
                 * 1 to update inode
                 * 1 to remove old inode ref
                 * 1 to add new inode ref
                 */
                trans_num_items += 3;
        }
        /*
         * 1 to remove old dir item
         * 1 to remove old dir index
         * 1 to add new dir item
         * 1 to add new dir index
         */
        trans_num_items += 4;
        /* 1 to update new parent inode if it's not the same as the old parent */
        if (new_dir != old_dir)
                trans_num_items++;
        if (new_inode) {
                /*
                 * 1 to update inode
                 * 1 to remove inode ref
                 * 1 to remove dir item
                 * 1 to remove dir index
                 * 1 to possibly add orphan item
                 */
                trans_num_items += 5;
        }
        trans = btrfs_start_transaction(root, trans_num_items);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out_notrans;
        }

        /* The transaction was started on @root; record @dest in it as well. */
        if (dest != root) {
                ret = btrfs_record_root_in_trans(trans, dest);
                if (ret)
                        goto out_fail;
        }

        ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
        if (ret)
                goto out_fail;

        BTRFS_I(old_inode)->dir_index = 0ULL;
        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
                /* force full log commit if subvolume involved. */
                btrfs_set_log_full_commit(trans);
        } else {
                ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
                                             old_ino, btrfs_ino(BTRFS_I(new_dir)),
                                             index);
                if (ret)
                        goto out_fail;
        }

        inode_inc_iversion(old_dir);
        inode_inc_iversion(new_dir);
        inode_inc_iversion(old_inode);
        simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);

        if (old_dentry->d_parent != new_dentry->d_parent)
                btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
                                        BTRFS_I(old_inode), true);

        /* Remove the old name from old_dir. */
        if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
                ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
        } else {
                ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
                                           BTRFS_I(d_inode(old_dentry)),
                                           &old_fname.disk_name, &rename_ctx);
                if (!ret)
                        ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
        }
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_fail;
        }

        /* Unlink the inode being replaced, if any. */
        if (new_inode) {
                inode_inc_iversion(new_inode);
                if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
                             BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
                        ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
                        BUG_ON(new_inode->i_nlink == 0);
                } else {
                        ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
                                                 BTRFS_I(d_inode(new_dentry)),
                                                 &new_fname.disk_name);
                }
                /* The replaced inode lost its last link: add an orphan item. */
                if (!ret && new_inode->i_nlink == 0)
                        ret = btrfs_orphan_add(trans,
                                        BTRFS_I(d_inode(new_dentry)));
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_fail;
                }
        }

        ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
                             &new_fname.disk_name, 0, index);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_fail;
        }

        /* Only an inode with a single link gets its new dir index recorded. */
        if (old_inode->i_nlink == 1)
                BTRFS_I(old_inode)->dir_index = index;

        if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
                btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
                                   rename_ctx.index, new_dentry->d_parent);

        if (flags & RENAME_WHITEOUT) {
                ret = btrfs_create_new_inode(trans, &whiteout_args);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_fail;
                } else {
                        unlock_new_inode(whiteout_args.inode);
                        iput(whiteout_args.inode);
                        /* Cleared so the iput() at out_whiteout_inode gets NULL. */
                        whiteout_args.inode = NULL;
                }
        }
out_fail:
        /* Report the first error; otherwise the result of ending the transaction. */
        ret2 = btrfs_end_transaction(trans);
        ret = ret ? ret : ret2;
out_notrans:
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
                up_read(&fs_info->subvol_sem);
        if (flags & RENAME_WHITEOUT)
                btrfs_new_inode_args_destroy(&whiteout_args);
out_whiteout_inode:
        if (flags & RENAME_WHITEOUT)
                iput(whiteout_args.inode);
out_fscrypt_names:
        fscrypt_free_filename(&old_fname);
        fscrypt_free_filename(&new_fname);
        return ret;
}

/*
 * Rename entry point: validates @flags and dispatches to
 * btrfs_rename_exchange() for RENAME_EXCHANGE or to btrfs_rename() for
 * everything else, then calls btrfs_btree_balance_dirty() on new_dir's
 * filesystem regardless of the outcome.
 *
 * Returns -EINVAL for any flag other than RENAME_NOREPLACE, RENAME_EXCHANGE
 * and RENAME_WHITEOUT, otherwise the result of the chosen helper.
 */
static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
                         struct dentry *old_dentry, struct inode *new_dir,
                         struct dentry *new_dentry, unsigned int flags)
{
        const unsigned int supported = RENAME_NOREPLACE | RENAME_EXCHANGE |
                                       RENAME_WHITEOUT;
        int result;

        if (flags & ~supported)
                return -EINVAL;

        if (flags & RENAME_EXCHANGE) {
                result = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
                                               new_dentry);
        } else {
                result = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
                                      new_dentry, flags);
        }

        btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);

        return result;
}

/*
 * One queued delalloc flush request, allocated by btrfs_alloc_delalloc_work()
 * and executed by btrfs_run_delalloc_work().
 */
struct btrfs_delalloc_work {
        /* Inode to flush; the worker calls iput() on it when done. */
        struct inode *inode;
        /* Completed by the worker; start_delalloc_inodes() waits on it. */
        struct completion completion;
        /* Links the request into the submitter's local list of works. */
        struct list_head list;
        /* Work item initialised with btrfs_run_delalloc_work() as callback. */
        struct btrfs_work work;
};

/*
 * Work callback for a struct btrfs_delalloc_work: flush the inode's mapping,
 * flush it a second time if BTRFS_INODE_HAS_ASYNC_EXTENT is set in the
 * inode's runtime flags, drop the inode reference and signal completion.
 */
static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
        struct btrfs_delalloc_work *dwork =
                container_of(work, struct btrfs_delalloc_work, work);
        struct inode *target = dwork->inode;

        filemap_flush(target->i_mapping);

        /* NOTE(review): presumably async extents need a second pass - confirm. */
        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                     &BTRFS_I(target)->runtime_flags))
                filemap_flush(target->i_mapping);

        iput(target);
        complete(&dwork->completion);
}

/*
 * Allocate (GFP_NOFS) and initialise a delalloc flush request for @inode,
 * with btrfs_run_delalloc_work() as its work callback.
 *
 * Returns the new request, or NULL if the allocation fails.
 */
static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
{
        struct btrfs_delalloc_work *dwork = kmalloc(sizeof(*dwork), GFP_NOFS);

        if (dwork) {
                dwork->inode = inode;
                INIT_LIST_HEAD(&dwork->list);
                init_completion(&dwork->completion);
                btrfs_init_work(&dwork->work, btrfs_run_delalloc_work, NULL);
        }

        return dwork;
}

/*
 * Some fairly slow code that needs optimization. This walks the list
 * of all the inodes of @root with pending delalloc and forces them to disk.
 *
 * @wbc:                writeback control; nr_to_write == LONG_MAX selects a
 *                      full flush, where one work item per inode is queued on
 *                      fs_info->flush_workers and all of them are waited for.
 *                      Otherwise filemap_fdatawrite_wbc() is called directly
 *                      until it fails or nr_to_write drops to zero or below.
 * @snapshot:           set BTRFS_INODE_SNAPSHOT_FLUSH on every visited inode.
 * @in_reclaim_context: skip inodes flagged BTRFS_INODE_NO_DELALLOC_FLUSH.
 *
 * Returns 0, -ENOMEM if a work item cannot be allocated, or the non-zero
 * result of filemap_fdatawrite_wbc().
 */
static int start_delalloc_inodes(struct btrfs_root *root,
                                 struct writeback_control *wbc, bool snapshot,
                                 bool in_reclaim_context)
{
        struct btrfs_inode *binode;
        struct inode *inode;
        struct btrfs_delalloc_work *work, *next;
        LIST_HEAD(works);
        LIST_HEAD(splice);
        int ret = 0;
        bool full_flush = wbc->nr_to_write == LONG_MAX;

        mutex_lock(&root->delalloc_mutex);
        spin_lock(&root->delalloc_lock);
        /* Take the whole list private; entries are handed back one by one. */
        list_splice_init(&root->delalloc_inodes, &splice);
        while (!list_empty(&splice)) {
                binode = list_entry(splice.next, struct btrfs_inode,
                                    delalloc_inodes);

                /* Return the inode to the root's list before working on it. */
                list_move_tail(&binode->delalloc_inodes,
                               &root->delalloc_inodes);

                if (in_reclaim_context &&
                    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
                        continue;

                inode = igrab(&binode->vfs_inode);
                if (!inode) {
                        /* No reference obtained: skip it, but allow a resched. */
                        cond_resched_lock(&root->delalloc_lock);
                        continue;
                }
                /* The flush below runs without the spinlock held. */
                spin_unlock(&root->delalloc_lock);

                if (snapshot)
                        set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
                                &binode->runtime_flags);
                if (full_flush) {
                        work = btrfs_alloc_delalloc_work(inode);
                        if (!work) {
                                iput(inode);
                                ret = -ENOMEM;
                                goto out;
                        }
                        /* The worker drops the inode reference taken above. */
                        list_add_tail(&work->list, &works);
                        btrfs_queue_work(root->fs_info->flush_workers,
                                         &work->work);
                } else {
                        ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
                        btrfs_add_delayed_iput(BTRFS_I(inode));
                        if (ret || wbc->nr_to_write <= 0)
                                goto out;
                }
                cond_resched();
                spin_lock(&root->delalloc_lock);
        }
        spin_unlock(&root->delalloc_lock);

out:
        /* Reached with the spinlock released; wait for every queued work. */
        list_for_each_entry_safe(work, next, &works, list) {
                list_del_init(&work->list);
                wait_for_completion(&work->completion);
                kfree(work);
        }

        /* On early exit, give the unvisited inodes back to the root. */
        if (!list_empty(&splice)) {
                spin_lock(&root->delalloc_lock);
                list_splice_tail(&splice, &root->delalloc_inodes);
                spin_unlock(&root->delalloc_lock);
        }
        mutex_unlock(&root->delalloc_mutex);
        return ret;
}

/*
 * Run a full delalloc flush (nr_to_write == LONG_MAX, WB_SYNC_NONE, whole
 * file range) over @root's inodes with the snapshot flag set.
 *
 * Returns -EROFS if the filesystem is in an error state, otherwise the result
 * of start_delalloc_inodes().
 */
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{
        struct writeback_control flush_all = {
                .sync_mode = WB_SYNC_NONE,
                .nr_to_write = LONG_MAX,
                .range_start = 0,
                .range_end = LLONG_MAX,
        };

        if (BTRFS_FS_ERROR(root->fs_info))
                return -EROFS;

        return start_delalloc_inodes(root, &flush_all, true, in_reclaim_context);
}

/*
 * Start delalloc writeback on every root on fs_info->delalloc_roots by
 * calling start_delalloc_inodes() for each of them.
 *
 * @nr:                 writeback budget stored in wbc.nr_to_write and shared
 *                      across roots; LONG_MAX requests a full flush of every
 *                      root.
 * @in_reclaim_context: passed through to start_delalloc_inodes().
 *
 * Returns -EROFS if the filesystem is in an error state, 0 once every root
 * has been processed, or the result of the start_delalloc_inodes() call after
 * which the walk stopped (on error or when the budget was used up).
 */
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
                               bool in_reclaim_context)
{
        struct writeback_control wbc = {
                .nr_to_write = nr,
                .sync_mode = WB_SYNC_NONE,
                .range_start = 0,
                .range_end = LLONG_MAX,
        };
        struct btrfs_root *root;
        LIST_HEAD(splice);
        int ret;

        if (BTRFS_FS_ERROR(fs_info))
                return -EROFS;

        mutex_lock(&fs_info->delalloc_root_mutex);
        spin_lock(&fs_info->delalloc_root_lock);
        /* Take the whole list private; roots are handed back one by one. */
        list_splice_init(&fs_info->delalloc_roots, &splice);
        while (!list_empty(&splice)) {
                /*
                 * Reset nr_to_write here so we know that we're doing a full
                 * flush.
                 */
                if (nr == LONG_MAX)
                        wbc.nr_to_write = LONG_MAX;

                root = list_first_entry(&splice, struct btrfs_root,
                                        delalloc_root);
                root = btrfs_grab_root(root);
                BUG_ON(!root);
                /* Return the root to the global list before flushing it. */
                list_move_tail(&root->delalloc_root,
                               &fs_info->delalloc_roots);
                spin_unlock(&fs_info->delalloc_root_lock);

                ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
                btrfs_put_root(root);
                if (ret < 0 || wbc.nr_to_write <= 0)
                        goto out;
                spin_lock(&fs_info->delalloc_root_lock);
        }
        spin_unlock(&fs_info->delalloc_root_lock);

        ret = 0;
out:
        /* On early exit, give the unvisited roots back to the global list. */
        if (!list_empty(&splice)) {
                spin_lock(&fs_info->delalloc_root_lock);
                list_splice_tail(&splice, &fs_info->delalloc_roots);
                spin_unlock(&fs_info->delalloc_root_lock);
        }
        mutex_unlock(&fs_info->delalloc_root_mutex);
        return ret;
}

/*
 * Create a symbolic link named by @dentry in @dir that points at @symname.
 * The target string is stored as an inline file extent of the new inode
 * (uncompressed, unencrypted, without a trailing NUL).
 *
 * Returns 0 on success or a negative errno:
 *  -ENAMETOOLONG  @symname is longer than BTRFS_MAX_INLINE_DATA_SIZE()
 *  -ENOMEM        the inode or the path could not be allocated
 *  or whatever the inode-creation / item-insertion helpers report.
 */
static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
                         struct dentry *dentry, const char *symname)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct inode *inode;
        struct btrfs_new_inode_args new_inode_args = {
                .dir = dir,
                .dentry = dentry,
        };
        unsigned int trans_num_items;
        int err;
        int name_len;
        int datasize;
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        struct extent_buffer *leaf;

        /* The target must fit into a single inline extent. */
        name_len = strlen(symname);
        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
                return -ENAMETOOLONG;

        inode = new_inode(dir->i_sb);
        if (!inode)
                return -ENOMEM;
        inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
        inode->i_op = &btrfs_symlink_inode_operations;
        inode_nohighmem(inode);
        inode->i_mapping->a_ops = &btrfs_aops;
        /* Both the size and the byte count equal the target's length. */
        btrfs_i_size_write(BTRFS_I(inode), name_len);
        inode_set_bytes(inode, name_len);

        new_inode_args.inode = inode;
        err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
        if (err)
                goto out_inode;
        /* 1 additional item for the inline extent */
        trans_num_items++;

        trans = btrfs_start_transaction(root, trans_num_items);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto out_new_inode_args;
        }

        err = btrfs_create_new_inode(trans, &new_inode_args);
        if (err)
                goto out;

        path = btrfs_alloc_path();
        if (!path) {
                err = -ENOMEM;
                btrfs_abort_transaction(trans, err);
                discard_new_inode(inode);
                /* Already discarded: make the iput() at out_inode a no-op. */
                inode = NULL;
                goto out;
        }
        /* Inline extent item at file offset 0 of the new inode. */
        key.objectid = btrfs_ino(BTRFS_I(inode));
        key.offset = 0;
        key.type = BTRFS_EXTENT_DATA_KEY;
        datasize = btrfs_file_extent_calc_inline_size(name_len);
        err = btrfs_insert_empty_item(trans, root, path, &key,
                                      datasize);
        if (err) {
                btrfs_abort_transaction(trans, err);
                btrfs_free_path(path);
                discard_new_inode(inode);
                /* Already discarded: make the iput() at out_inode a no-op. */
                inode = NULL;
                goto out;
        }
        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, ei, trans->transid);
        btrfs_set_file_extent_type(leaf, ei,
                                   BTRFS_FILE_EXTENT_INLINE);
        btrfs_set_file_extent_encryption(leaf, ei, 0);
        btrfs_set_file_extent_compression(leaf, ei, 0);
        btrfs_set_file_extent_other_encoding(leaf, ei, 0);
        btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

        /* Copy the link target into the inline data area of the item. */
        ptr = btrfs_file_extent_inline_start(ei);
        write_extent_buffer(leaf, symname, ptr, name_len);
        btrfs_mark_buffer_dirty(trans, leaf);
        btrfs_free_path(path);

        d_instantiate_new(dentry, inode);
        err = 0;
out:
        btrfs_end_transaction(trans);
        btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
        btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
        /* On success the inode now belongs to the dentry; only drop it on error. */
        if (err)
                iput(inode);
        return err;
}

/*
 * Insert a PREALLOC file extent item for the reserved extent described by
 * @ins (objectid = disk bytenr, offset = length) at @file_offset of @inode.
 *
 * When @trans_in is non-NULL the item is inserted under that transaction and
 * the same handle is returned.  When it is NULL the insertion goes through
 * btrfs_replace_file_extents(), which hands back a transaction handle through
 * its last argument; that handle is returned to the caller.
 *
 * Returns a transaction handle on success or an ERR_PTR() on failure.
 */
static struct btrfs_trans_handle *insert_prealloc_file_extent(
                                       struct btrfs_trans_handle *trans_in,
                                       struct btrfs_inode *inode,
                                       struct btrfs_key *ins,
                                       u64 file_offset)
{
        const u64 disk_bytenr = ins->objectid;
        const u64 extent_len = ins->offset;
        struct btrfs_trans_handle *trans = trans_in;
        struct btrfs_file_extent_item stack_fi;
        u64 qgroup_released = 0;
        int ret;

        /* Encryption and other encoding is reserved and all 0 */
        memset(&stack_fi, 0, sizeof(stack_fi));
        btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
        btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, disk_bytenr);
        btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, extent_len);
        btrfs_set_stack_file_extent_num_bytes(&stack_fi, extent_len);
        btrfs_set_stack_file_extent_ram_bytes(&stack_fi, extent_len);
        btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);

        ret = btrfs_qgroup_release_data(inode, file_offset, extent_len,
                                        &qgroup_released);
        if (ret < 0)
                return ERR_PTR(ret);

        if (trans) {
                /* Caller supplied the transaction: insert the item directly. */
                ret = insert_reserved_file_extent(trans, inode,
                                                  file_offset, &stack_fi,
                                                  true, qgroup_released);
        } else {
                struct btrfs_replace_extent_info extent_info;
                struct btrfs_path *path = btrfs_alloc_path();

                if (path) {
                        extent_info.disk_offset = disk_bytenr;
                        extent_info.disk_len = extent_len;
                        extent_info.data_offset = 0;
                        extent_info.data_len = extent_len;
                        extent_info.file_offset = file_offset;
                        extent_info.extent_buf = (char *)&stack_fi;
                        extent_info.is_new_extent = true;
                        extent_info.update_times = true;
                        extent_info.qgroup_reserved = qgroup_released;
                        extent_info.insertions = 0;

                        ret = btrfs_replace_file_extents(inode, path,
                                             file_offset,
                                             file_offset + extent_len - 1,
                                             &extent_info, &trans);
                        btrfs_free_path(path);
                } else {
                        ret = -ENOMEM;
                }
        }

        if (!ret)
                return trans;

        /*
         * The qgroup data range was released at the top of this function and
         * the released bytes are normally freed at transaction commit.  On an
         * early error nothing will do that for us, so free them here or the
         * qgroup data reservation leaks.
         */
        btrfs_qgroup_free_refroot(inode->root->fs_info,
                        btrfs_root_id(inode->root), qgroup_released,
                        BTRFS_QGROUP_RSV_DATA);
        return ERR_PTR(ret);
}

/*
 * Preallocate extents covering @num_bytes bytes of @inode starting at file
 * offset @start, one reserved extent per loop iteration.
 *
 * @mode:       fallocate mode; only FALLOC_FL_KEEP_SIZE is examined here.
 * @min_size:   lower bound for each extent reservation request.
 * @actual_len: upper bound for the i_size written when the file is extended.
 * @alloc_hint: in/out allocator hint, advanced past each allocated extent.
 * @trans:      optional transaction.  When NULL, each iteration uses the handle
 *              returned by insert_prealloc_file_extent() and ends it before
 *              the next iteration.
 *
 * Before returning, any part of [@start, @start + @num_bytes) for which no
 * extent was reserved has its data space reservation freed.
 *
 * Returns the last value stored in ret: 0 if every step succeeded, otherwise
 * the error from the step that stopped the loop.
 */
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                       u64 start, u64 num_bytes, u64 min_size,
                                       loff_t actual_len, u64 *alloc_hint,
                                       struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 cur_offset = start;
        u64 clear_offset = start;
        u64 i_size;
        u64 cur_bytes;
        u64 last_alloc = (u64)-1;
        int ret = 0;
        bool own_trans = true;
        u64 end = start + num_bytes - 1;

        /* A caller-provided transaction is never ended by this function. */
        if (trans)
                own_trans = false;
        while (num_bytes > 0) {
                /* Ask for at most 256M per extent, but never below min_size. */
                cur_bytes = min_t(u64, num_bytes, SZ_256M);
                cur_bytes = max(cur_bytes, min_size);
                /*
                 * If we are severely fragmented we could end up with really
                 * small allocations, so if the allocator is returning small
                 * chunks lets make its job easier by only searching for those
                 * sized chunks.
                 */
                cur_bytes = min(cur_bytes, last_alloc);
                ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
                                min_size, 0, *alloc_hint, &ins, 1, 0);
                if (ret)
                        break;

                /*
                 * We've reserved this space, and thus converted it from
                 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
                 * from here on out we will only need to clear our reservation
                 * for the remaining unreserved area, so advance our
                 * clear_offset by our extent size.
                 */
                clear_offset += ins.offset;

                last_alloc = ins.offset;
                trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
                                                    &ins, cur_offset);
                /*
                 * Now that we inserted the prealloc extent we can finally
                 * decrement the number of reservations in the block group.
                 * If we did it before, we could race with relocation and have
                 * relocation miss the reserved extent, making it fail later.
                 */
                btrfs_dec_block_group_reservations(fs_info, ins.objectid);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        btrfs_free_reserved_extent(fs_info, ins.objectid,
                                                   ins.offset, 0);
                        break;
                }

                em = alloc_extent_map();
                if (!em) {
                        /*
                         * No memory for a cached extent map: drop whatever is
                         * cached for this range and flag the inode for a full
                         * sync instead of failing the preallocation.
                         */
                        btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
                                            cur_offset + ins.offset - 1, false);
                        btrfs_set_inode_full_sync(BTRFS_I(inode));
                        goto next;
                }

                em->start = cur_offset;
                em->orig_start = cur_offset;
                em->len = ins.offset;
                em->block_start = ins.objectid;
                em->block_len = ins.offset;
                em->orig_block_len = ins.offset;
                em->ram_bytes = ins.offset;
                em->flags |= EXTENT_FLAG_PREALLOC;
                em->generation = trans->transid;

                /*
                 * NOTE(review): this return value is overwritten by the
                 * btrfs_update_inode() call below without being checked.
                 */
                ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
                free_extent_map(em);
next:
                num_bytes -= ins.offset;
                cur_offset += ins.offset;
                *alloc_hint = ins.objectid + ins.offset;

                inode_inc_iversion(inode);
                inode_set_ctime_current(inode);
                BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                /*
                 * Unless the caller asked to keep the size, grow i_size to the
                 * end of what has been allocated so far, capped at actual_len.
                 */
                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
                    (actual_len > inode->i_size) &&
                    (cur_offset > inode->i_size)) {
                        if (cur_offset > actual_len)
                                i_size = actual_len;
                        else
                                i_size = cur_offset;
                        i_size_write(inode, i_size);
                        btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
                }

                ret = btrfs_update_inode(trans, BTRFS_I(inode));

                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        if (own_trans)
                                btrfs_end_transaction(trans);
                        break;
                }

                if (own_trans) {
                        /* Reset so the next iteration gets a fresh handle. */
                        btrfs_end_transaction(trans);
                        trans = NULL;
                }
        }
        /* Give back the data space reservation for the part never allocated. */
        if (clear_offset < end)
                btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
                        end - clear_offset + 1);
        return ret;
}

/*
 * Preallocate a file range without a caller-supplied transaction.
 *
 * Thin wrapper around __btrfs_prealloc_file_range() that passes a NULL
 * transaction handle; all other arguments are forwarded unchanged.
 */
int btrfs_prealloc_file_range(struct inode *inode, int mode,
                              u64 start, u64 num_bytes, u64 min_size,
                              loff_t actual_len, u64 *alloc_hint)
{
        int ret;

        ret = __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
                                          min_size, actual_len, alloc_hint,
                                          NULL);
        return ret;
}

/*
 * Preallocate a file range under the caller's transaction @trans.
 *
 * Thin wrapper around __btrfs_prealloc_file_range(); note that @trans sits in
 * a different position in this signature than in the helper's.
 */
int btrfs_prealloc_file_range_trans(struct inode *inode,
                                    struct btrfs_trans_handle *trans, int mode,
                                    u64 start, u64 num_bytes, u64 min_size,
                                    loff_t actual_len, u64 *alloc_hint)
{
        int ret;

        ret = __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
                                          min_size, actual_len, alloc_hint,
                                          trans);
        return ret;
}

static int btrfs_permission(struct mnt_idmap *idmap,
                            struct inode *inode, int mask)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        umode_t mode = inode->i_mode;

        if (mask & MAY_WRITE &&
            (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
                if (btrfs_root_readonly(root))
                        return -EROFS;
                if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
                        return -EACCES;
        }
        return generic_permission(idmap, inode, mask);
}

/*
 * Create an unnamed temporary file in directory @dir and attach it to @file.
 *
 * A new VFS inode is allocated and set up with the btrfs regular-file
 * operations, then created inside a transaction with the "orphan" flag set
 * in the new-inode arguments.  On success the inode is bound to @file via
 * d_tmpfile().
 *
 * Returns the result of finish_open_simple(file, ret), where ret is 0 or the
 * negative errno of the step that failed.  An allocation failure for the
 * inode returns -ENOMEM directly.
 */
static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
                         struct file *file, umode_t mode)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct inode *inode;
        struct btrfs_new_inode_args new_inode_args = {
                .dir = dir,
                .dentry = file->f_path.dentry,
                .orphan = true,
        };
        unsigned int trans_num_items;
        int ret;

        inode = new_inode(dir->i_sb);
        if (!inode)
                return -ENOMEM;
        inode_init_owner(idmap, inode, dir, mode);
        inode->i_fop = &btrfs_file_operations;
        inode->i_op = &btrfs_file_inode_operations;
        inode->i_mapping->a_ops = &btrfs_aops;

        new_inode_args.inode = inode;
        /* Also computes how many items the transaction must reserve. */
        ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
        if (ret)
                goto out_inode;

        trans = btrfs_start_transaction(root, trans_num_items);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out_new_inode_args;
        }

        ret = btrfs_create_new_inode(trans, &new_inode_args);

        /*
         * We set number of links to 0 in btrfs_create_new_inode(), and here we
         * set it to 1 because d_tmpfile() will issue a warning if the count is
         * 0, through:
         *
         *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
         */
        /* Note: this runs whether or not btrfs_create_new_inode() succeeded. */
        set_nlink(inode, 1);

        if (!ret) {
                d_tmpfile(file, inode);
                unlock_new_inode(inode);
                mark_inode_dirty(inode);
        }

        btrfs_end_transaction(trans);
        btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
        btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
        /* On failure, drop the reference taken by new_inode(). */
        if (ret)
                iput(inode);
        return finish_open_simple(file, ret);
}

/*
 * Set the writeback state on every page cache page of @inode that covers the
 * byte range [@start, @end] (inclusive).
 *
 * The range length must fit in a u32.  Each page is expected to already be
 * present in the page cache and to be an order-0 folio.
 */
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        const unsigned long last_index = end >> PAGE_SHIFT;
        unsigned long index;
        u32 len;

        ASSERT(end + 1 - start <= U32_MAX);
        len = end + 1 - start;

        for (index = start >> PAGE_SHIFT; index <= last_index; index++) {
                struct folio *folio;
                struct page *page;

                page = find_get_page(inode->vfs_inode.i_mapping, index);
                ASSERT(page); /* Pages should be in the extent_io_tree */

                /* This is for data, which doesn't yet support larger folio. */
                folio = page_folio(page);
                ASSERT(folio_order(folio) == 0);
                btrfs_folio_set_writeback(fs_info, folio, start, len);
                put_page(page);
        }
}

/*
 * Translate an extent compression type (BTRFS_COMPRESS_*) into the matching
 * encoded I/O compression value (BTRFS_ENCODED_IO_COMPRESSION_*).
 *
 * For LZO the result depends on the filesystem sector size.
 *
 * Returns the encoded I/O compression value, -EINVAL for LZO with a sector
 * size outside [4K, 64K], or -EUCLEAN for an unknown compression type.
 */
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
                                             int compress_type)
{
        if (compress_type == BTRFS_COMPRESS_NONE)
                return BTRFS_ENCODED_IO_COMPRESSION_NONE;
        if (compress_type == BTRFS_COMPRESS_ZLIB)
                return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
        if (compress_type == BTRFS_COMPRESS_ZSTD)
                return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
        if (compress_type != BTRFS_COMPRESS_LZO)
                return -EUCLEAN;

        /*
         * The LZO format depends on the sector size. 64K is the maximum
         * sector size that we support.
         */
        if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
                return -EINVAL;

        /* The LZO_* values are consecutive, starting at 4K (2^12). */
        return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
               (fs_info->sectorsize_bits - 12);
}

/*
 * Encoded read of an inline extent.
 *
 * Looks up the file extent item at @extent_start, fills in the length,
 * compression and unencoded fields of @encoded, and copies the inline data
 * to @iter.  For a compressed inline extent the whole raw item payload is
 * returned; for an uncompressed one the copy starts at iocb->ki_pos and is
 * limited to @count bytes.
 *
 * The data is first copied into a temporary buffer so that the tree path, the
 * extent range [@start, @lockend] and the inode lock can all be released
 * before copying to @iter; *@unlocked is set to true once that has
 * happened.
 *
 * Returns the number of bytes copied, or a negative errno: -ENOMEM on
 * allocation failure, -EIO if the extent item is not found, -ENOBUFS if a
 * compressed inline extent does not fit in @count bytes, -EFAULT if the copy
 * to @iter is short, or an error from the lookup or compression translation.
 */
static ssize_t btrfs_encoded_read_inline(
                                struct kiocb *iocb,
                                struct iov_iter *iter, u64 start,
                                u64 lockend,
                                struct extent_state **cached_state,
                                u64 extent_start, size_t count,
                                struct btrfs_ioctl_encoded_io_args *encoded,
                                bool *unlocked)
{
        struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_io_tree *io_tree = &inode->io_tree;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *item;
        u64 ram_bytes;
        unsigned long ptr;
        void *tmp;
        ssize_t ret;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }
        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
                                       extent_start, 0);
        if (ret) {
                if (ret > 0) {
                        /* The extent item disappeared? */
                        ret = -EIO;
                }
                goto out;
        }
        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);

        ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
        ptr = btrfs_file_extent_inline_start(item);

        /* Bytes from the read position to the extent end, capped at i_size. */
        encoded->len = min_t(u64, extent_start + ram_bytes,
                             inode->vfs_inode.i_size) - iocb->ki_pos;
        ret = btrfs_encoded_io_compression_from_extent(fs_info,
                                 btrfs_file_extent_compression(leaf, item));
        if (ret < 0)
                goto out;
        encoded->compression = ret;
        if (encoded->compression) {
                size_t inline_size;

                /* Compressed: the caller must take the whole raw payload. */
                inline_size = btrfs_file_extent_inline_item_len(leaf,
                                                                path->slots[0]);
                if (inline_size > count) {
                        ret = -ENOBUFS;
                        goto out;
                }
                count = inline_size;
                encoded->unencoded_len = ram_bytes;
                encoded->unencoded_offset = iocb->ki_pos - extent_start;
        } else {
                /* Uncompressed: start the copy at the requested position. */
                count = min_t(u64, count, encoded->len);
                encoded->len = count;
                encoded->unencoded_len = count;
                ptr += iocb->ki_pos - extent_start;
        }

        tmp = kmalloc(count, GFP_NOFS);
        if (!tmp) {
                ret = -ENOMEM;
                goto out;
        }
        read_extent_buffer(leaf, tmp, ptr, count);
        /* Drop the path and both locks before copying to the iterator. */
        btrfs_release_path(path);
        unlock_extent(io_tree, start, lockend, cached_state);
        btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
        *unlocked = true;

        ret = copy_to_iter(tmp, count, iter);
        if (ret != count)
                ret = -EFAULT;
        kfree(tmp);
out:
        btrfs_free_path(path);
        return ret;
}

struct btrfs_encoded_read_private {
        wait_queue_head_t wait;
        atomic_t pending;
        blk_status_t status;
};

static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
{
        struct btrfs_encoded_read_private *priv = bbio->private;

        if (bbio->bio.bi_status) {
                /*
                 * The memory barrier implied by the atomic_dec_return() here
                 * pairs with the memory barrier implied by the
                 * atomic_dec_return() or io_wait_event() in
                 * btrfs_encoded_read_regular_fill_pages() to ensure that this
                 * write is observed before the load of status in
                 * btrfs_encoded_read_regular_fill_pages().
                 */
                WRITE_ONCE(priv->status, bbio->bio.bi_status);
        }
        if (!atomic_dec_return(&priv->pending))
                wake_up(&priv->wait);
        bio_put(&bbio->bio);
}

int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
                                          u64 file_offset, u64 disk_bytenr,
                                          u64 disk_io_size, struct page **pages)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_encoded_read_private priv = {
                .pending = ATOMIC_INIT(1),
        };
        unsigned long i = 0;
        struct btrfs_bio *bbio;

        init_waitqueue_head(&priv.wait);

        bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
                               btrfs_encoded_read_endio, &priv);
        bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
        bbio->inode = inode;

        do {
                size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);

                if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
                        atomic_inc(&priv.pending);
                        btrfs_submit_bio(bbio, 0);

                        bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
                                               btrfs_encoded_read_endio, &priv);
                        bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
                        bbio->inode = inode;
                        continue;
                }

                i++;
                disk_bytenr += bytes;
                disk_io_size -= bytes;
        } while (disk_io_size);

        atomic_inc(&priv.pending);
        btrfs_submit_bio(bbio, 0);

        if (atomic_dec_return(&priv.pending))
                io_wait_event(priv.wait, !atomic_read(&priv.pending));
        /* See btrfs_encoded_read_endio() for ordering. */
        return blk_status_to_errno(READ_ONCE(priv.status));
}

/*
 * Encoded read of a regular (non-inline) extent.
 *
 * Allocates enough pages to hold @disk_io_size bytes, reads the on-disk data
 * starting at @disk_bytenr into them, then releases the extent range
 * [@start, @lockend] and the inode lock (setting *@unlocked) before copying
 * @count bytes to @iter.  For compressed data the copy starts at the first
 * byte read; otherwise it starts at the offset of iocb->ki_pos from @start.
 *
 * Returns @count on success, -ENOMEM on allocation failure, -EFAULT if the
 * copy to @iter is short, or the error from the page-filling read.
 */
static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
                                          struct iov_iter *iter,
                                          u64 start, u64 lockend,
                                          struct extent_state **cached_state,
                                          u64 disk_bytenr, u64 disk_io_size,
                                          size_t count, bool compressed,
                                          bool *unlocked)
{
        struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
        const unsigned long nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
        struct page **pages;
        unsigned long idx;
        size_t pg_off;
        u64 copied;
        ssize_t ret;

        pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
        if (!pages)
                return -ENOMEM;

        if (btrfs_alloc_page_array(nr_pages, pages, 0)) {
                ret = -ENOMEM;
                goto out;
        }

        ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
                                                    disk_io_size, pages);
        if (ret)
                goto out;

        /* The data is in our private pages now; drop both locks. */
        unlock_extent(&inode->io_tree, start, lockend, cached_state);
        btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
        *unlocked = true;

        if (compressed) {
                idx = 0;
                pg_off = 0;
        } else {
                /* Skip the bytes between the aligned start and ki_pos. */
                idx = (iocb->ki_pos - start) >> PAGE_SHIFT;
                pg_off = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
        }

        ret = count;
        for (copied = 0; copied < count; idx++, pg_off = 0) {
                const size_t bytes = min_t(size_t, count - copied,
                                           PAGE_SIZE - pg_off);

                if (copy_page_to_iter(pages[idx], pg_off, bytes,
                                      iter) != bytes) {
                        ret = -EFAULT;
                        break;
                }
                copied += bytes;
        }
out:
        for (idx = 0; idx < nr_pages; idx++) {
                if (pages[idx])
                        __free_page(pages[idx]);
        }
        kfree(pages);
        return ret;
}

/*
 * Read the extent containing iocb->ki_pos in its on-disk (possibly
 * compressed) form into @iter and describe it in @encoded.
 *
 * The inode is locked shared and the range starting at the sector-aligned
 * read position is locked once no ordered extents remain in it.  Depending on
 * the extent map found, the read is served as:
 *   - an inline extent, via btrfs_encoded_read_inline();
 *   - a hole or preallocated extent, by zero-filling @iter;
 *   - a compressed or plain regular extent, via btrfs_encoded_read_regular().
 *
 * The helpers may drop the extent and inode locks themselves; in that case
 * "unlocked" is set and the unlock steps at the end are skipped.
 *
 * Returns the number of bytes placed in @iter (0 at or past EOF) or a
 * negative errno.  On success iocb->ki_pos is advanced by encoded->len.
 *
 * NOTE(review): encoded->compression is only assigned here for compressed
 * regular extents (and by the inline helper), yet it is read when calling
 * btrfs_encoded_read_regular(); presumably the caller zero-initializes
 * @encoded - confirm against the caller.
 */
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
                           struct btrfs_ioctl_encoded_io_args *encoded)
{
        struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct extent_io_tree *io_tree = &inode->io_tree;
        ssize_t ret;
        size_t count = iov_iter_count(iter);
        u64 start, lockend, disk_bytenr, disk_io_size;
        struct extent_state *cached_state = NULL;
        struct extent_map *em;
        bool unlocked = false;

        file_accessed(iocb->ki_filp);

        btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);

        /* Reading at or past EOF returns nothing. */
        if (iocb->ki_pos >= inode->vfs_inode.i_size) {
                btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
                return 0;
        }
        start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
        /*
         * We don't know how long the extent containing iocb->ki_pos is, but if
         * it's compressed we know that it won't be longer than this.
         */
        lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;

        /*
         * Wait for ordered extents, lock the range, and retry if another
         * ordered extent appeared between the wait and the lock.
         */
        for (;;) {
                struct btrfs_ordered_extent *ordered;

                ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
                                               lockend - start + 1);
                if (ret)
                        goto out_unlock_inode;
                lock_extent(io_tree, start, lockend, &cached_state);
                ordered = btrfs_lookup_ordered_range(inode, start,
                                                     lockend - start + 1);
                if (!ordered)
                        break;
                btrfs_put_ordered_extent(ordered);
                unlock_extent(io_tree, start, lockend, &cached_state);
                cond_resched();
        }

        em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto out_unlock_extent;
        }

        if (em->block_start == EXTENT_MAP_INLINE) {
                u64 extent_start = em->start;

                /*
                 * For inline extents we get everything we need out of the
                 * extent item.
                 */
                free_extent_map(em);
                em = NULL;
                ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
                                                &cached_state, extent_start,
                                                count, encoded, &unlocked);
                goto out;
        }

        /*
         * We only want to return up to EOF even if the extent extends beyond
         * that.
         */
        encoded->len = min_t(u64, extent_map_end(em),
                             inode->vfs_inode.i_size) - iocb->ki_pos;
        if (em->block_start == EXTENT_MAP_HOLE ||
            (em->flags & EXTENT_FLAG_PREALLOC)) {
                /* Holes and prealloc extents read back as zeroes. */
                disk_bytenr = EXTENT_MAP_HOLE;
                count = min_t(u64, count, encoded->len);
                encoded->len = count;
                encoded->unencoded_len = count;
        } else if (extent_map_is_compressed(em)) {
                disk_bytenr = em->block_start;
                /*
                 * Bail if the buffer isn't large enough to return the whole
                 * compressed extent.
                 */
                if (em->block_len > count) {
                        ret = -ENOBUFS;
                        goto out_em;
                }
                disk_io_size = em->block_len;
                count = em->block_len;
                encoded->unencoded_len = em->ram_bytes;
                encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
                ret = btrfs_encoded_io_compression_from_extent(fs_info,
                                                               extent_map_compression(em));
                if (ret < 0)
                        goto out_em;
                encoded->compression = ret;
        } else {
                /* Plain extent: read from the sector-aligned start. */
                disk_bytenr = em->block_start + (start - em->start);
                if (encoded->len > count)
                        encoded->len = count;
                /*
                 * Don't read beyond what we locked. This also limits the page
                 * allocations that we'll do.
                 */
                disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
                count = start + disk_io_size - iocb->ki_pos;
                encoded->len = count;
                encoded->unencoded_len = count;
                disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
        }
        free_extent_map(em);
        em = NULL;

        if (disk_bytenr == EXTENT_MAP_HOLE) {
                /* No disk I/O needed: drop the locks and zero-fill. */
                unlock_extent(io_tree, start, lockend, &cached_state);
                btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
                unlocked = true;
                ret = iov_iter_zero(count, iter);
                if (ret != count)
                        ret = -EFAULT;
        } else {
                ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
                                                 &cached_state, disk_bytenr,
                                                 disk_io_size, count,
                                                 encoded->compression,
                                                 &unlocked);
        }

out:
        if (ret >= 0)
                iocb->ki_pos += encoded->len;
out_em:
        /* em is NULL here on every path that already released it. */
        free_extent_map(em);
out_unlock_extent:
        if (!unlocked)
                unlock_extent(io_tree, start, lockend, &cached_state);
out_unlock_inode:
        if (!unlocked)
                btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
        return ret;
}

ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
                               const struct btrfs_ioctl_encoded_io_args *encoded)
{
        struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_io_tree *io_tree = &inode->io_tree;
        struct extent_changeset *data_reserved = NULL;
        struct extent_state *cached_state = NULL;
        struct btrfs_ordered_extent *ordered;
        int compression;
        size_t orig_count;
        u64 start, end;
        u64 num_bytes, ram_bytes, disk_num_bytes;
        unsigned long nr_folios, i;
        struct folio **folios;
        struct btrfs_key ins;
        bool extent_reserved = false;
        struct extent_map *em;
        ssize_t ret;

        switch (encoded->compression) {
        case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
                compression = BTRFS_COMPRESS_ZLIB;
                break;
        case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
                compression = BTRFS_COMPRESS_ZSTD;
                break;
        case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
        case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
        case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
        case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
        case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
                /* The sector size must match for LZO. */
                if (encoded->compression -
                    BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
                    fs_info->sectorsize_bits)
                        return -EINVAL;
                compression = BTRFS_COMPRESS_LZO;
                break;
        default:
                return -EINVAL;
        }
        if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
                return -EINVAL;

        /*
         * Compressed extents should always have checksums, so error out if we
         * have a NOCOW file or inode was created while mounted with NODATASUM.
         */
        if (inode->flags & BTRFS_INODE_NODATASUM)
                return -EINVAL;

        orig_count = iov_iter_count(from);

        /* The extent size must be sane. */
        if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
            orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
                return -EINVAL;

        /*
         * The compressed data must be smaller than the decompressed data.
         *
         * It's of course possible for data to compress to larger or the same
         * size, but the buffered I/O path falls back to no compression for such
         * data, and we don't want to break any assumptions by creating these
         * extents.
         *
         * Note that this is less strict than the current check we have that the
         * compressed data must be at least one sector smaller than the
         * decompressed data. We only want to enforce the weaker requirement
         * from old kernels that it is at least one byte smaller.
         */
        if (orig_count >= encoded->unencoded_len)
                return -EINVAL;

        /* The extent must start on a sector boundary. */
        start = iocb->ki_pos;
        if (!IS_ALIGNED(start, fs_info->sectorsize))
                return -EINVAL;

        /*
         * The extent must end on a sector boundary. However, we allow a write
         * which ends at or extends i_size to have an unaligned length; we round
         * up the extent size and set i_size to the unaligned end.
         */
        if (start + encoded->len < inode->vfs_inode.i_size &&
            !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
                return -EINVAL;

        /* Finally, the offset in the unencoded data must be sector-aligned. */
        if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
                return -EINVAL;

        num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
        ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
        end = start + num_bytes - 1;

        /*
         * If the extent cannot be inline, the compressed data on disk must be
         * sector-aligned. For convenience, we extend it with zeroes if it
         * isn't.
         */
        disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
        nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
        folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
        if (!folios)
                return -ENOMEM;
        for (i = 0; i < nr_folios; i++) {
                size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
                char *kaddr;

                folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
                if (!folios[i]) {
                        ret = -ENOMEM;
                        goto out_folios;
                }
                kaddr = kmap_local_folio(folios[i], 0);
                if (copy_from_iter(kaddr, bytes, from) != bytes) {
                        kunmap_local(kaddr);
                        ret = -EFAULT;
                        goto out_folios;
                }
                if (bytes < PAGE_SIZE)
                        memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
                kunmap_local(kaddr);
        }

        for (;;) {
                struct btrfs_ordered_extent *ordered;

                ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
                if (ret)
                        goto out_folios;
                ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
                                                    start >> PAGE_SHIFT,
                                                    end >> PAGE_SHIFT);
                if (ret)
                        goto out_folios;
                lock_extent(io_tree, start, end, &cached_state);
                ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
                if (!ordered &&
                    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
                        break;
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
                unlock_extent(io_tree, start, end, &cached_state);
                cond_resched();
        }

        /*
         * We don't use the higher-level delalloc space functions because our
         * num_bytes and disk_num_bytes are different.
         */
        ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
        if (ret)
                goto out_unlock;
        ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
        if (ret)
                goto out_free_data_space;
        ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
                                              false);
        if (ret)
                goto out_qgroup_free_data;

        /* Try an inline extent first. */
        if (encoded->unencoded_len == encoded->len &&
            encoded->unencoded_offset == 0 &&
            can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
                ret = __cow_file_range_inline(inode, start, encoded->len,
                                              orig_count, compression, folios[0],
                                              true);
                if (ret <= 0) {
                        if (ret == 0)
                                ret = orig_count;
                        goto out_delalloc_release;
                }
        }

        ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
                                   disk_num_bytes, 0, 0, &ins, 1, 1);
        if (ret)
                goto out_delalloc_release;
        extent_reserved = true;

        em = create_io_em(inode, start, num_bytes,
                          start - encoded->unencoded_offset, ins.objectid,
                          ins.offset, ins.offset, ram_bytes, compression,
                          BTRFS_ORDERED_COMPRESSED);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto out_free_reserved;
        }
        free_extent_map(em);

        ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
                                       ins.objectid, ins.offset,
                                       encoded->unencoded_offset,
                                       (1 << BTRFS_ORDERED_ENCODED) |
                                       (1 << BTRFS_ORDERED_COMPRESSED),
                                       compression);
        if (IS_ERR(ordered)) {
                btrfs_drop_extent_map_range(inode, start, end, false);
                ret = PTR_ERR(ordered);
                goto out_free_reserved;
        }
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);

        if (start + encoded->len > inode->vfs_inode.i_size)
                i_size_write(&inode->vfs_inode, start + encoded->len);

        unlock_extent(io_tree, start, end, &cached_state);

        btrfs_delalloc_release_extents(inode, num_bytes);

        btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
        ret = orig_count;
        goto out;

out_free_reserved:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_delalloc_release:
        btrfs_delalloc_release_extents(inode, num_bytes);
        btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
out_qgroup_free_data:
        if (ret < 0)
                btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
out_free_data_space:
        /*
         * If btrfs_reserve_extent() succeeded, then we already decremented
         * bytes_may_use.
         */
        if (!extent_reserved)
                btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
out_unlock:
        unlock_extent(io_tree, start, end, &cached_state);
out_folios:
        for (i = 0; i < nr_folios; i++) {
                if (folios[i])
                        __folio_put(folios[i]);
        }
        kvfree(folios);
out:
        if (ret >= 0)
                iocb->ki_pos += encoded->len;
        return ret;
}

#ifdef CONFIG_SWAP
/*
 * Record that the swapfile @inode pins @ptr, which is a block group when
 * @is_block_group is true and a device otherwise.
 *
 * Pins are kept in the fs_info->swapfile_pins rbtree, ordered first by @ptr
 * and then by @inode, and protected by fs_info->swapfile_pins_lock.
 *
 * Returns 0 when a new entry was inserted, 1 when an entry for the same
 * (ptr, inode) pair already existed (for a block group its extent count is
 * incremented instead), or -ENOMEM when the entry could not be allocated.
 */
static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
                                  bool is_block_group)
{
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        struct btrfs_swapfile_pin *new_pin;
        struct rb_node **link;
        struct rb_node *parent = NULL;

        /* Allocate up front so nothing has to be allocated under the lock. */
        new_pin = kmalloc(sizeof(*new_pin), GFP_NOFS);
        if (!new_pin)
                return -ENOMEM;

        new_pin->ptr = ptr;
        new_pin->inode = inode;
        new_pin->is_block_group = is_block_group;
        new_pin->bg_extent_count = 1;

        spin_lock(&fs_info->swapfile_pins_lock);
        link = &fs_info->swapfile_pins.rb_node;
        while (*link) {
                struct btrfs_swapfile_pin *cur;
                bool same_ptr;

                parent = *link;
                cur = rb_entry(parent, struct btrfs_swapfile_pin, node);
                same_ptr = (new_pin->ptr == cur->ptr);

                if (new_pin->ptr < cur->ptr ||
                    (same_ptr && new_pin->inode < cur->inode)) {
                        link = &parent->rb_left;
                        continue;
                }
                if (new_pin->ptr > cur->ptr ||
                    (same_ptr && new_pin->inode > cur->inode)) {
                        link = &parent->rb_right;
                        continue;
                }

                /* Same pointer and same inode: reuse the existing entry. */
                if (is_block_group)
                        cur->bg_extent_count++;
                spin_unlock(&fs_info->swapfile_pins_lock);
                kfree(new_pin);
                return 1;
        }

        rb_link_node(&new_pin->node, parent, link);
        rb_insert_color(&new_pin->node, &fs_info->swapfile_pins);
        spin_unlock(&fs_info->swapfile_pins_lock);
        return 0;
}

/*
 * Remove and free every entry in fs_info->swapfile_pins that belongs to
 * @inode. For block group entries the swap extent count recorded in the entry
 * is subtracted from the block group and one block group reference is dropped.
 */
static void btrfs_free_swapfile_pins(struct inode *inode)
{
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        struct rb_node *cur;
        struct rb_node *following;

        spin_lock(&fs_info->swapfile_pins_lock);
        for (cur = rb_first(&fs_info->swapfile_pins); cur; cur = following) {
                struct btrfs_swapfile_pin *pin;

                /* Look up the successor first, @cur may be erased below. */
                following = rb_next(cur);
                pin = rb_entry(cur, struct btrfs_swapfile_pin, node);
                if (pin->inode != inode)
                        continue;

                rb_erase(&pin->node, &fs_info->swapfile_pins);
                if (pin->is_block_group) {
                        btrfs_dec_block_group_swap_extents(pin->ptr,
                                                   pin->bg_extent_count);
                        btrfs_put_block_group(pin->ptr);
                }
                kfree(pin);
        }
        spin_unlock(&fs_info->swapfile_pins_lock);
}

/*
 * Accumulator used while walking a swapfile's extents during activation. It
 * holds the run of physically contiguous blocks currently being merged plus
 * the running totals reported to the swap code.
 */
struct btrfs_swap_info {
        /* File offset at which the pending run started. */
        u64 start;
        /* Physical byte address of the start of the pending run. */
        u64 block_start;
        /* Length in bytes of the pending run; 0 means no run is pending. */
        u64 block_len;
        /* Lowest physical page number reported so far. */
        u64 lowest_ppage;
        /* Highest physical page number reported so far. */
        u64 highest_ppage;
        /* Total number of pages handed to add_swap_extent() so far. */
        unsigned long nr_pages;
        /* Sum of the values returned by add_swap_extent() so far. */
        int nr_extents;
};

/*
 * Hand the pending physical run described by @bsi to the swap code.
 *
 * Only whole pages inside [block_start, block_start + block_len) are used:
 * the start is rounded up and the end rounded down to a page boundary. The
 * number of pages is further capped so that the total never exceeds sis->max.
 *
 * Returns 0 on success (including when nothing had to be added) or the
 * negative value returned by add_swap_extent().
 */
static int btrfs_add_swap_extent(struct swap_info_struct *sis,
                                 struct btrfs_swap_info *bsi)
{
        unsigned long budget;
        unsigned long count;
        u64 first, reported, end_excl;
        int added;

        /*
         * The file may have grown after the swap header was written; never go
         * past the size recorded in the header (sis->max).
         */
        if (bsi->nr_pages >= sis->max)
                return 0;
        budget = sis->max - bsi->nr_pages;

        first = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
        end_excl = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;

        /* The run does not contain a single complete page. */
        if (first >= end_excl)
                return 0;

        count = end_excl - first;
        if (count > budget)
                count = budget;

        /*
         * For the run that starts at file offset 0 the first page is left out
         * of the reported span (presumably the swap header page - confirm).
         */
        reported = (bsi->start == 0) ? first + 1 : first;
        if (reported < bsi->lowest_ppage)
                bsi->lowest_ppage = reported;
        if ((end_excl - 1) > bsi->highest_ppage)
                bsi->highest_ppage = end_excl - 1;

        added = add_swap_extent(sis, bsi->nr_pages, count, first);
        if (added < 0)
                return added;

        bsi->nr_extents += added;
        bsi->nr_pages += count;
        return 0;
}

static void btrfs_swap_deactivate(struct file *file)
{
        struct inode *inode = file_inode(file);

        btrfs_free_swapfile_pins(inode);
        atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
}

/*
 * Activate @file as a swapfile and describe its on-disk layout to the swap
 * code.
 *
 * After waiting for ordered extents, the inode flags are checked (must not be
 * compressed, must be NODATACOW and NODATASUM), the swap-activate exclusive
 * operation and the root's snapshot write lock are taken, and the root's
 * nr_swapfiles counter is incremented. Every extent up to the sector-aligned
 * i_size is then verified (no holes, not inline, not compressed, writable
 * without COW, single data profile, one device), the device and block groups
 * are pinned through btrfs_add_swapfile_pin(), and physically contiguous
 * ranges are merged and handed to btrfs_add_swap_extent().
 *
 * @sis:  Swap descriptor; bdev, max, pages and highest_bit are set on success.
 * @file: The file to activate.
 * @span: Set on success to the number of physical pages between the lowest
 *        and the highest reported page, inclusive.
 *
 * Returns the number of swap extents added on success or a negative errno.
 * Every exit taken after the exclusive operation and the snapshot lock were
 * acquired releases both; failures after nr_swapfiles was incremented are
 * undone with btrfs_swap_deactivate().
 */
static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
                               sector_t *span)
{
        struct inode *inode = file_inode(file);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_state *cached_state = NULL;
        struct extent_map *em = NULL;
        struct btrfs_chunk_map *map = NULL;
        struct btrfs_device *device = NULL;
        struct btrfs_swap_info bsi = {
                .lowest_ppage = (sector_t)-1ULL,
        };
        int ret = 0;
        u64 isize;
        u64 start;

        /*
         * If the swap file was just created, make sure delalloc is done. If the
         * file changes again after this, the user is doing something stupid and
         * we don't really care.
         */
        ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
        if (ret)
                return ret;

        /*
         * The inode is locked, so these flags won't change after we check them.
         */
        if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
                btrfs_warn(fs_info, "swapfile must not be compressed");
                return -EINVAL;
        }
        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
                btrfs_warn(fs_info, "swapfile must not be copy-on-write");
                return -EINVAL;
        }
        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
                btrfs_warn(fs_info, "swapfile must not be checksummed");
                return -EINVAL;
        }

        /*
         * Balance or device remove/replace/resize can move stuff around from
         * under us. The exclop protection makes sure they aren't running/won't
         * run concurrently while we are mapping the swap extents, and
         * fs_info->swapfile_pins prevents them from running while the swap
         * file is active and moving the extents. Note that this also prevents
         * a concurrent device add which isn't actually necessary, but it's not
         * really worth the trouble to allow it.
         */
        if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
                btrfs_warn(fs_info,
           "cannot activate swapfile while exclusive operation is running");
                return -EBUSY;
        }

        /*
         * Prevent snapshot creation while we are activating the swap file.
         * We do not want to race with snapshot creation. If snapshot creation
         * already started before we bumped nr_swapfiles from 0 to 1 and
         * completes before the first write into the swap file after it is
         * activated, then that write would fallback to COW.
         */
        if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
                btrfs_exclop_finish(fs_info);
                btrfs_warn(fs_info,
           "cannot activate swapfile because snapshot creation is in progress");
                return -EINVAL;
        }
        /*
         * Snapshots can create extents which require COW even if NODATACOW is
         * set. We use this counter to prevent snapshots. We must increment it
         * before walking the extents because we don't want a concurrent
         * snapshot to run after we've already checked the extents.
         *
         * It is possible that subvolume is marked for deletion but still not
         * removed yet. To prevent this race, we check the root status before
         * activating the swapfile.
         */
        spin_lock(&root->root_item_lock);
        if (btrfs_root_dead(root)) {
                spin_unlock(&root->root_item_lock);

                /*
                 * This return bypasses the common exit path, so the snapshot
                 * write lock taken above has to be released here as well.
                 */
                btrfs_drew_write_unlock(&root->snapshot_lock);
                btrfs_exclop_finish(fs_info);
                btrfs_warn(fs_info,
                "cannot activate swapfile because subvolume %llu is being deleted",
                        btrfs_root_id(root));
                return -EPERM;
        }
        atomic_inc(&root->nr_swapfiles);
        spin_unlock(&root->root_item_lock);

        isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);

        lock_extent(io_tree, 0, isize - 1, &cached_state);
        start = 0;
        while (start < isize) {
                u64 logical_block_start, physical_block_start;
                struct btrfs_block_group *bg;
                u64 len = isize - start;

                em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
                if (IS_ERR(em)) {
                        ret = PTR_ERR(em);
                        goto out;
                }

                if (em->block_start == EXTENT_MAP_HOLE) {
                        btrfs_warn(fs_info, "swapfile must not have holes");
                        ret = -EINVAL;
                        goto out;
                }
                if (em->block_start == EXTENT_MAP_INLINE) {
                        /*
                         * It's unlikely we'll ever actually find ourselves
                         * here, as a file small enough to fit inline won't be
                         * big enough to store more than the swap header, but in
                         * case something changes in the future, let's catch it
                         * here rather than later.
                         */
                        btrfs_warn(fs_info, "swapfile must not be inline");
                        ret = -EINVAL;
                        goto out;
                }
                if (extent_map_is_compressed(em)) {
                        btrfs_warn(fs_info, "swapfile must not be compressed");
                        ret = -EINVAL;
                        goto out;
                }

                /* Clamp the range to what this extent map covers. */
                logical_block_start = em->block_start + (start - em->start);
                len = min(len, em->len - (start - em->start));
                free_extent_map(em);
                em = NULL;

                ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
                if (ret < 0) {
                        goto out;
                } else if (ret) {
                        ret = 0;
                } else {
                        btrfs_warn(fs_info,
                                   "swapfile must not be copy-on-write");
                        ret = -EINVAL;
                        goto out;
                }

                map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
                if (IS_ERR(map)) {
                        ret = PTR_ERR(map);
                        goto out;
                }

                if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
                        btrfs_warn(fs_info,
                                   "swapfile must have single data profile");
                        ret = -EINVAL;
                        goto out;
                }

                /* Pin the device on first use; later extents must match it. */
                if (device == NULL) {
                        device = map->stripes[0].dev;
                        ret = btrfs_add_swapfile_pin(inode, device, false);
                        if (ret == 1)
                                ret = 0;
                        else if (ret)
                                goto out;
                } else if (device != map->stripes[0].dev) {
                        btrfs_warn(fs_info, "swapfile must be on one device");
                        ret = -EINVAL;
                        goto out;
                }

                /* Translate to a physical address and clamp to the chunk. */
                physical_block_start = (map->stripes[0].physical +
                                        (logical_block_start - map->start));
                len = min(len, map->chunk_len - (logical_block_start - map->start));
                btrfs_free_chunk_map(map);
                map = NULL;

                bg = btrfs_lookup_block_group(fs_info, logical_block_start);
                if (!bg) {
                        btrfs_warn(fs_info,
                           "could not find block group containing swapfile");
                        ret = -EINVAL;
                        goto out;
                }

                if (!btrfs_inc_block_group_swap_extents(bg)) {
                        btrfs_warn(fs_info,
                           "block group for swapfile at %llu is read-only%s",
                           bg->start,
                           atomic_read(&fs_info->scrubs_running) ?
                                       " (scrub running)" : "");
                        btrfs_put_block_group(bg);
                        ret = -EINVAL;
                        goto out;
                }

                /*
                 * On a fresh insert (ret == 0) the pin keeps the block group
                 * reference obtained by the lookup above; otherwise the
                 * reference is dropped here.
                 */
                ret = btrfs_add_swapfile_pin(inode, bg, true);
                if (ret) {
                        btrfs_put_block_group(bg);
                        if (ret == 1)
                                ret = 0;
                        else
                                goto out;
                }

                /* Merge with the pending run when physically contiguous. */
                if (bsi.block_len &&
                    bsi.block_start + bsi.block_len == physical_block_start) {
                        bsi.block_len += len;
                } else {
                        if (bsi.block_len) {
                                ret = btrfs_add_swap_extent(sis, &bsi);
                                if (ret)
                                        goto out;
                        }
                        bsi.start = start;
                        bsi.block_start = physical_block_start;
                        bsi.block_len = len;
                }

                start += len;
        }

        /* Flush the last pending run. */
        if (bsi.block_len)
                ret = btrfs_add_swap_extent(sis, &bsi);

out:
        if (!IS_ERR_OR_NULL(em))
                free_extent_map(em);
        if (!IS_ERR_OR_NULL(map))
                btrfs_free_chunk_map(map);

        unlock_extent(io_tree, 0, isize - 1, &cached_state);

        if (ret)
                btrfs_swap_deactivate(file);

        btrfs_drew_write_unlock(&root->snapshot_lock);

        btrfs_exclop_finish(fs_info);

        if (ret)
                return ret;

        if (device)
                sis->bdev = device->bdev;
        *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
        sis->max = bsi.nr_pages;
        sis->pages = bsi.nr_pages - 1;
        sis->highest_bit = bsi.nr_pages - 1;
        return bsi.nr_extents;
}
#else
/* Stub used when CONFIG_SWAP is not set: does nothing. */
static void btrfs_swap_deactivate(struct file *file)
{
}

/*
 * Stub used when CONFIG_SWAP is not set: swapfile activation always fails
 * with -EOPNOTSUPP and none of the arguments are touched.
 */
static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
                               sector_t *span)
{
        return -EOPNOTSUPP;
}
#endif

/*
 * Adjust the byte count kept in the VFS inode.
 *
 * When extents in a range are replaced (clone, dedupe, fallocate's zero
 * range) the subtraction of the old bytes and the addition of the new bytes
 * must appear as one atomic step, so that a concurrent stat(2) never sees an
 * intermediate value. Both updates are therefore done under inode->lock.
 *
 * @inode:     The inode to update.
 * @add_bytes: Number of bytes to add.
 * @del_bytes: Number of bytes to subtract.
 */
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
                              const u64 add_bytes,
                              const u64 del_bytes)
{
        struct inode *vfs_inode;

        /* Nothing changes overall, so skip taking the lock. */
        if (add_bytes == del_bytes)
                return;

        vfs_inode = &inode->vfs_inode;

        spin_lock(&inode->lock);
        if (del_bytes != 0)
                inode_sub_bytes(vfs_inode, del_bytes);
        if (add_bytes != 0)
                inode_add_bytes(vfs_inode, add_bytes);
        spin_unlock(&inode->lock);
}

/*
 * Debug check that a file range has no ordered extents.
 *
 * @inode:   The target inode.
 * @start:   Start offset of the file range, should be sector size aligned.
 * @end:     End offset (inclusive) of the file range, its value +1 should be
 *           sector size aligned.
 *
 * Intended for callers that hold the inode's VFS lock and i_mmap_lock in
 * exclusive mode, have flushed delalloc and waited for all ordered extents in
 * the range, and have locked the range in the inode's io_tree.
 *
 * Does nothing unless CONFIG_BTRFS_ASSERT is enabled. If an ordered extent is
 * found it is reported with btrfs_err() and the assertion below fails.
 */
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
{
        struct btrfs_ordered_extent *ordered;
        const u64 range_len = end + 1 - start;

        if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
                return;

        ordered = btrfs_lookup_first_ordered_range(inode, start, range_len);
        if (!ordered)
                return;

        btrfs_err(inode->root->fs_info,
"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
                  start, end, btrfs_ino(inode), btrfs_root_id(inode->root),
                  ordered->file_offset,
                  ordered->file_offset + ordered->num_bytes - 1);
        btrfs_put_ordered_extent(ordered);

        /* Only the pointer value is tested; it is non-NULL on this path. */
        ASSERT(ordered == NULL);
}

/*
 * Find the first inode with a minimum number.
 *
 * @root:        The root to search for.
 * @min_ino:        The minimum inode number.
 *
 * Find the first inode in the @root with a number >= @min_ino and return it
 * with a reference obtained through igrab(). Inodes for which igrab() fails
 * are skipped and the next one in inode number order is tried.
 *
 * Returns NULL if no such inode found.
 */
struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
{
        struct rb_node *node;
        struct rb_node *prev;
        struct btrfs_inode *inode;

        spin_lock(&root->inode_lock);
again:
        /* Descend the tree looking for an exact match on @min_ino. */
        node = root->inode_tree.rb_node;
        prev = NULL;
        while (node) {
                prev = node;
                inode = rb_entry(node, struct btrfs_inode, rb_node);
                if (min_ino < btrfs_ino(inode))
                        node = node->rb_left;
                else if (min_ino > btrfs_ino(inode))
                        node = node->rb_right;
                else
                        break;
        }

        /*
         * No exact match: starting from the last node visited, move forward
         * until an inode number >= @min_ino is found.
         */
        if (!node) {
                while (prev) {
                        inode = rb_entry(prev, struct btrfs_inode, rb_node);
                        if (min_ino <= btrfs_ino(inode)) {
                                node = prev;
                                break;
                        }
                        prev = rb_next(prev);
                }
        }

        while (node) {
                /*
                 * Use the node currently being visited. Reading from 'prev'
                 * here would retry the same inode on every iteration after a
                 * failed igrab() and never reach the following ones.
                 */
                inode = rb_entry(node, struct btrfs_inode, rb_node);
                if (igrab(&inode->vfs_inode)) {
                        spin_unlock(&root->inode_lock);
                        return inode;
                }

                /*
                 * If the lock was dropped to reschedule the tree may have
                 * changed, so search again from the next inode number.
                 */
                min_ino = btrfs_ino(inode) + 1;
                if (cond_resched_lock(&root->inode_lock))
                        goto again;

                node = rb_next(node);
        }
        spin_unlock(&root->inode_lock);

        return NULL;
}

/*
 * Inode operation callbacks for directories (going by the table's name; the
 * code that installs it is outside this part of the file). Besides the
 * attribute, xattr, ACL and permission hooks, it provides the name-space
 * operations: lookup, create, link, unlink, mkdir, rmdir, rename, symlink,
 * mknod and tmpfile.
 */
static const struct inode_operations btrfs_dir_inode_operations = {
        .getattr        = btrfs_getattr,
        .lookup                = btrfs_lookup,
        .create                = btrfs_create,
        .unlink                = btrfs_unlink,
        .link                = btrfs_link,
        .mkdir                = btrfs_mkdir,
        .rmdir                = btrfs_rmdir,
        .rename                = btrfs_rename2,
        .symlink        = btrfs_symlink,
        .setattr        = btrfs_setattr,
        .mknod                = btrfs_mknod,
        .listxattr        = btrfs_listxattr,
        .permission        = btrfs_permission,
        .get_inode_acl        = btrfs_get_acl,
        .set_acl        = btrfs_set_acl,
        .update_time        = btrfs_update_time,
        .tmpfile        = btrfs_tmpfile,
        .fileattr_get        = btrfs_fileattr_get,
        .fileattr_set        = btrfs_fileattr_set,
};

/*
 * File operation callbacks for open directories (going by the table's name).
 * read() is routed to generic_read_dir and directory listing goes through
 * btrfs_real_readdir. The compat ioctl entry only exists when CONFIG_COMPAT
 * is set.
 */
static const struct file_operations btrfs_dir_file_operations = {
        .llseek                = btrfs_dir_llseek,
        .read                = generic_read_dir,
        .iterate_shared        = btrfs_real_readdir,
        .open                = btrfs_opendir,
        .unlocked_ioctl        = btrfs_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl        = btrfs_compat_ioctl,
#endif
        .release        = btrfs_release_file,
        .fsync                = btrfs_sync_file,
};

/*
 * Address space operations for btrfs.
 *
 * Btrfs doesn't support the bmap operation because swapfiles use bmap to make
 * a mapping of the extents in the file. They assume these extents won't
 * change over the life of the file and they use the bmap result to do IO
 * directly to the drive.
 *
 * The btrfs bmap call would return logical addresses that aren't suitable for
 * IO, and they would also change frequently as COW operations happen. So,
 * swapfile + bmap on btrfs == corruption.
 *
 * We avoid this by not providing bmap. This table has no .bmap entry;
 * swapfiles go through the .swap_activate and .swap_deactivate hooks below
 * instead.
 */
static const struct address_space_operations btrfs_aops = {
        .read_folio        = btrfs_read_folio,
        .writepages        = btrfs_writepages,
        .readahead        = btrfs_readahead,
        .invalidate_folio = btrfs_invalidate_folio,
        .release_folio        = btrfs_release_folio,
        .migrate_folio        = btrfs_migrate_folio,
        .dirty_folio        = filemap_dirty_folio,
        .error_remove_folio = generic_error_remove_folio,
        .swap_activate        = btrfs_swap_activate,
        .swap_deactivate = btrfs_swap_deactivate,
};

/*
 * Inode operation callbacks for regular files (going by the table's name):
 * attribute, xattr listing, permission, ACL, timestamp and fileattr hooks,
 * plus fiemap.
 */
static const struct inode_operations btrfs_file_inode_operations = {
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .listxattr      = btrfs_listxattr,
        .permission        = btrfs_permission,
        .fiemap                = btrfs_fiemap,
        .get_inode_acl        = btrfs_get_acl,
        .set_acl        = btrfs_set_acl,
        .update_time        = btrfs_update_time,
        .fileattr_get        = btrfs_fileattr_get,
        .fileattr_set        = btrfs_fileattr_set,
};
/*
 * Inode operation callbacks for special inodes (going by the table's name;
 * which inode types use it is decided outside this part of the file). Only
 * attribute, permission, xattr listing, ACL and timestamp hooks are provided.
 */
static const struct inode_operations btrfs_special_inode_operations = {
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .permission        = btrfs_permission,
        .listxattr        = btrfs_listxattr,
        .get_inode_acl        = btrfs_get_acl,
        .set_acl        = btrfs_set_acl,
        .update_time        = btrfs_update_time,
};
/*
 * Inode operation callbacks for symbolic links (going by the table's name).
 * Link resolution is delegated to page_get_link; no ACL hooks are set in this
 * table.
 */
static const struct inode_operations btrfs_symlink_inode_operations = {
        .get_link        = page_get_link,
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .permission        = btrfs_permission,
        .listxattr        = btrfs_listxattr,
        .update_time        = btrfs_update_time,
};

/*
 * Dentry operation callbacks; only d_delete is provided. The table is not
 * static, so it can be referenced from other source files.
 */
const struct dentry_operations btrfs_dentry_operations = {
        .d_delete        = btrfs_dentry_delete,
};




































































































































































   29 






   29 

   28 

   25 



    2 



   30 

























































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/domain.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"

#include <linux/binfmts.h>
#include <linux/slab.h>
#include <linux/rculist.h>

/* Variables definitions.*/

/*
 * The initial domain.
 * tomoyo_find_next_domain() special-cases it: while tomoyo_policy_loaded is
 * false, an execve() that would take the default (child) transition stays in
 * this domain instead.
 */
struct tomoyo_domain_info tomoyo_kernel_domain;

/**
 * tomoyo_update_policy - Add, revive or delete an exception policy entry.
 *
 * @new_entry:       Pointer to "struct tomoyo_acl_head" to look up or add.
 * @size:            Size of @new_entry in bytes.
 * @param:           Pointer to "struct tomoyo_acl_param".
 * @check_duplicate: Callback function to find duplicated entry.
 *
 * If @param->list already holds an entry for which @check_duplicate() returns
 * true, only its is_deleted flag is set to @param->is_delete. Otherwise, when
 * this is not a deletion, the entry returned by tomoyo_commit_ok() is appended
 * to @param->list.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size,
                         struct tomoyo_acl_param *param,
                         bool (*check_duplicate)(const struct tomoyo_acl_head
                                                 *,
                                                 const struct tomoyo_acl_head
                                                 *))
{
        const bool is_delete = param->is_delete;
        struct list_head * const policy_list = param->list;
        struct tomoyo_acl_head *cur;
        bool found = false;
        int ret;

        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                return -ENOMEM;
        list_for_each_entry_rcu(cur, policy_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                /* Entries being garbage collected must not be touched. */
                if (cur->is_deleted != TOMOYO_GC_IN_PROGRESS &&
                    check_duplicate(cur, new_entry)) {
                        cur->is_deleted = is_delete;
                        found = true;
                        break;
                }
        }
        if (found) {
                ret = 0;
        } else if (is_delete) {
                /* Nothing to delete. */
                ret = -ENOENT;
        } else {
                cur = tomoyo_commit_ok(new_entry, size);
                if (cur) {
                        list_add_tail_rcu(&cur->list, policy_list);
                        ret = 0;
                } else {
                        ret = -ENOMEM;
                }
        }
        mutex_unlock(&tomoyo_policy_lock);
        return ret;
}

/**
 * tomoyo_same_acl_head - Check for duplicated "struct tomoyo_acl_info" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_acl_head(const struct tomoyo_acl_info *a,
                                        const struct tomoyo_acl_info *b)
{
        return a->type == b->type && a->cond == b->cond;
}

/**
 * tomoyo_update_domain - Add, merge or delete a domain policy entry.
 *
 * @new_entry:       Pointer to "struct tomoyo_acl_info".
 * @size:            Size of @new_entry in bytes.
 * @param:           Pointer to "struct tomoyo_acl_param".
 * @check_duplicate: Callback function to find duplicated entry.
 * @merge_duplicate: Callback function to merge duplicated entry. May be NULL,
 *                   in which case a duplicate simply takes @param->is_delete
 *                   as its is_deleted flag.
 *
 * If @param->data is not empty, a condition is obtained for @new_entry via
 * tomoyo_get_condition() first; the reference is dropped again before
 * returning.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size,
                         struct tomoyo_acl_param *param,
                         bool (*check_duplicate)(const struct tomoyo_acl_info
                                                 *,
                                                 const struct tomoyo_acl_info
                                                 *),
                         bool (*merge_duplicate)(struct tomoyo_acl_info *,
                                                 struct tomoyo_acl_info *,
                                                 const bool))
{
        const bool is_delete = param->is_delete;
        struct list_head * const acl_list = param->list;
        struct tomoyo_acl_info *cur;
        int error = is_delete ? -ENOENT : -ENOMEM;

        if (param->data[0]) {
                new_entry->cond = tomoyo_get_condition(param);
                if (!new_entry->cond)
                        return -EINVAL;
                /*
                 * Domain transition preference is allowed for only
                 * "file execute" entries.
                 */
                if (new_entry->cond->transit) {
                        const struct tomoyo_path_acl *path_acl;

                        if (new_entry->type != TOMOYO_TYPE_PATH_ACL)
                                goto out;
                        path_acl = container_of(new_entry,
                                                struct tomoyo_path_acl, head);
                        if (path_acl->perm != 1 << TOMOYO_TYPE_EXECUTE)
                                goto out;
                }
        }
        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                goto out;
        list_for_each_entry_rcu(cur, acl_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                /* Entries being garbage collected must not be touched. */
                if (cur->is_deleted == TOMOYO_GC_IN_PROGRESS)
                        continue;
                if (tomoyo_same_acl_head(cur, new_entry) &&
                    check_duplicate(cur, new_entry)) {
                        cur->is_deleted = merge_duplicate ?
                                merge_duplicate(cur, new_entry, is_delete) :
                                is_delete;
                        error = 0;
                        break;
                }
        }
        /* No duplicate found: append unless this was a deletion request. */
        if (error && !is_delete) {
                cur = tomoyo_commit_ok(new_entry, size);
                if (cur) {
                        list_add_tail_rcu(&cur->list, acl_list);
                        error = 0;
                }
        }
        mutex_unlock(&tomoyo_policy_lock);
out:
        tomoyo_put_condition(new_entry->cond);
        return error;
}

/**
 * tomoyo_check_acl - Do permission check.
 *
 * @r:           Pointer to "struct tomoyo_request_info".
 * @check_entry: Callback function to check type specific parameters.
 *
 * Scans the ACL list of @r->domain first and then, in ascending index order,
 * every acl_group of the domain's namespace whose bit is set in the domain's
 * group bitmap. On the first matching entry @r->matched_acl is set and
 * @r->granted becomes true; if nothing matches @r->granted becomes false.
 *
 * Returns nothing.
 *
 * Caller holds tomoyo_read_lock().
 */
void tomoyo_check_acl(struct tomoyo_request_info *r,
                      bool (*check_entry)(struct tomoyo_request_info *,
                                          const struct tomoyo_acl_info *))
{
        const struct tomoyo_domain_info *domain = r->domain;
        const struct list_head *list = &domain->acl_info_list;
        struct tomoyo_acl_info *ptr;
        u16 next_group = 0;

        for (;;) {
                list_for_each_entry_rcu(ptr, list, list,
                                        srcu_read_lock_held(&tomoyo_ss)) {
                        if (ptr->is_deleted || ptr->type != r->param_type)
                                continue;
                        if (check_entry(r, ptr) &&
                            tomoyo_condition(r, ptr->cond)) {
                                r->matched_acl = ptr;
                                r->granted = true;
                                return;
                        }
                }
                /* Advance to the next acl_group this domain uses, if any. */
                while (next_group < TOMOYO_MAX_ACL_GROUPS &&
                       !test_bit(next_group, domain->group))
                        next_group++;
                if (next_group >= TOMOYO_MAX_ACL_GROUPS)
                        break;
                list = &domain->ns->acl_group[next_group++];
        }
        r->granted = false;
}

/* The list for "struct tomoyo_domain_info". */
LIST_HEAD(tomoyo_domain_list);

/**
 * tomoyo_last_word - Get last component of a domainname.
 *
 * @name: Domainname to check.
 *
 * Returns a pointer into @name just past its last ' ' character, or @name
 * itself if it contains no ' '.
 */
static const char *tomoyo_last_word(const char *name)
{
        const char *last_space = strrchr(name, ' ');

        return last_space ? last_space + 1 : name;
}

/**
 * tomoyo_same_transition_control - Check for duplicated "struct tomoyo_transition_control" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_head".
 * @b: Pointer to "struct tomoyo_acl_head".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_transition_control(const struct tomoyo_acl_head *a,
                                           const struct tomoyo_acl_head *b)
{
        const struct tomoyo_transition_control *p1 = container_of(a,
                                                                  typeof(*p1),
                                                                  head);
        const struct tomoyo_transition_control *p2 = container_of(b,
                                                                  typeof(*p2),
                                                                  head);

        return p1->type == p2->type && p1->is_last_name == p2->is_last_name
                && p1->domainname == p2->domainname
                && p1->program == p2->program;
}

/**
 * tomoyo_write_transition_control - Write "struct tomoyo_transition_control" list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @type:  Type of this entry.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_write_transition_control(struct tomoyo_acl_param *param,
                                    const u8 type)
{
        struct tomoyo_transition_control e = { .type = type };
        int error = param->is_delete ? -ENOENT : -ENOMEM;
        char *program = param->data;
        char *domainname = strstr(program, " from ");

        if (domainname) {
                *domainname = '\0';
                domainname += 6;
        } else if (type == TOMOYO_TRANSITION_CONTROL_NO_KEEP ||
                   type == TOMOYO_TRANSITION_CONTROL_KEEP) {
                domainname = program;
                program = NULL;
        }
        if (program && strcmp(program, "any")) {
                if (!tomoyo_correct_path(program))
                        return -EINVAL;
                e.program = tomoyo_get_name(program);
                if (!e.program)
                        goto out;
        }
        if (domainname && strcmp(domainname, "any")) {
                if (!tomoyo_correct_domain(domainname)) {
                        if (!tomoyo_correct_path(domainname))
                                goto out;
                        e.is_last_name = true;
                }
                e.domainname = tomoyo_get_name(domainname);
                if (!e.domainname)
                        goto out;
        }
        param->list = &param->ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL];
        error = tomoyo_update_policy(&e.head, sizeof(e), param,
                                     tomoyo_same_transition_control);
out:
        tomoyo_put_name(e.domainname);
        tomoyo_put_name(e.program);
        return error;
}

/**
 * tomoyo_scan_transition - Try to find specific domain transition type.
 *
 * @list:       Pointer to "struct list_head".
 * @domainname: The name of current domain.
 * @program:    The name of requested program.
 * @last_name:  The last component of @domainname.
 * @type:       One of values in "enum tomoyo_transition_type".
 *
 * An entry without domainname matches any domain and an entry without
 * program matches any program.
 *
 * Returns true if found one, false otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static inline bool tomoyo_scan_transition
(const struct list_head *list, const struct tomoyo_path_info *domainname,
 const struct tomoyo_path_info *program, const char *last_name,
 const enum tomoyo_transition_type type)
{
        const struct tomoyo_transition_control *ptr;

        list_for_each_entry_rcu(ptr, list, head.list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                if (ptr->head.is_deleted || ptr->type != type)
                        continue;
                if (ptr->domainname) {
                        /*
                         * Full domainnames are compared by pointer. For the
                         * last-name form a direct strcmp() is used since this
                         * is unlikely used.
                         */
                        const bool same_domain = ptr->is_last_name ?
                                !strcmp(ptr->domainname->name, last_name) :
                                ptr->domainname == domainname;

                        if (!same_domain)
                                continue;
                }
                if (ptr->program && tomoyo_pathcmp(ptr->program, program))
                        continue;
                return true;
        }
        return false;
}

/**
 * tomoyo_transition_type - Get domain transition type.
 *
 * @ns:         Pointer to "struct tomoyo_policy_namespace".
 * @domainname: The name of current domain.
 * @program:    The name of requested program.
 *
 * Transition types are probed in ascending enum order starting at
 * TOMOYO_TRANSITION_CONTROL_NO_RESET. A match on a "no_" type
 * (TOMOYO_TRANSITION_CONTROL_NO_RESET or
 * TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE) skips the type that follows it.
 *
 * Returns the first other type which matched (for example
 * TOMOYO_TRANSITION_CONTROL_RESET, TOMOYO_TRANSITION_CONTROL_INITIALIZE or
 * TOMOYO_TRANSITION_CONTROL_KEEP), or a value not less than
 * TOMOYO_MAX_TRANSITION_TYPE if none matched.
 *
 * Caller holds tomoyo_read_lock().
 */
static enum tomoyo_transition_type tomoyo_transition_type
(const struct tomoyo_policy_namespace *ns,
 const struct tomoyo_path_info *domainname,
 const struct tomoyo_path_info *program)
{
        const struct list_head * const list =
                &ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL];
        const char *last_name = tomoyo_last_word(domainname->name);
        enum tomoyo_transition_type type;

        for (type = TOMOYO_TRANSITION_CONTROL_NO_RESET;
             type < TOMOYO_MAX_TRANSITION_TYPE; ) {
                if (!tomoyo_scan_transition(list, domainname, program,
                                            last_name, type)) {
                        type++;
                        continue;
                }
                if (type == TOMOYO_TRANSITION_CONTROL_NO_RESET ||
                    type == TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE) {
                        /*
                         * Do not check for reset_domain if no_reset_domain
                         * matched. Do not check for initialize_domain if
                         * no_initialize_domain matched.
                         */
                        type += 2;
                        continue;
                }
                break;
        }
        return type;
}

/**
 * tomoyo_same_aggregator - Check for duplicated "struct tomoyo_aggregator" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_head".
 * @b: Pointer to "struct tomoyo_acl_head".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_aggregator(const struct tomoyo_acl_head *a,
                                   const struct tomoyo_acl_head *b)
{
        const struct tomoyo_aggregator *p1 = container_of(a, typeof(*p1),
                                                          head);
        const struct tomoyo_aggregator *p2 = container_of(b, typeof(*p2),
                                                          head);

        return p1->original_name == p2->original_name &&
                p1->aggregated_name == p2->aggregated_name;
}

/**
 * tomoyo_write_aggregator - Write "struct tomoyo_aggregator" list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Reads two tokens from @param: the original name, then the aggregated
 * name. The aggregated name must not contain patterns.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_write_aggregator(struct tomoyo_acl_param *param)
{
        struct tomoyo_aggregator e = { };
        int error = param->is_delete ? -ENOENT : -ENOMEM;
        const char *original_name = tomoyo_read_token(param);
        const char *aggregated_name = tomoyo_read_token(param);

        if (!tomoyo_correct_word(original_name) ||
            !tomoyo_correct_path(aggregated_name))
                return -EINVAL;
        e.original_name = tomoyo_get_name(original_name);
        e.aggregated_name = tomoyo_get_name(aggregated_name);
        /* Both names are needed, and no patterns allowed in the result. */
        if (e.original_name && e.aggregated_name &&
            !e.aggregated_name->is_patterned) {
                param->list = &param->ns->policy_list[TOMOYO_ID_AGGREGATOR];
                error = tomoyo_update_policy(&e.head, sizeof(e), param,
                                             tomoyo_same_aggregator);
        }
        tomoyo_put_name(e.original_name);
        tomoyo_put_name(e.aggregated_name);
        return error;
}

/**
 * tomoyo_find_namespace - Find specified namespace.
 *
 * @name: Name of namespace to find. It may be followed by ' ' and the rest
 *        of a domainname.
 * @len:  Length of the namespace part of @name.
 *
 * A namespace matches only if its name consists of exactly the first @len
 * bytes of @name. A registered namespace whose name merely begins with those
 * bytes (e.g. "<a>b>" while looking up "<a>") is not a match.
 *
 * Returns pointer to "struct tomoyo_policy_namespace" if found,
 * NULL otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static struct tomoyo_policy_namespace *tomoyo_find_namespace
(const char *name, const unsigned int len)
{
        struct tomoyo_policy_namespace *ns;

        list_for_each_entry(ns, &tomoyo_namespace_list, namespace_list) {
                /* @name must end, or continue with ' ', right after @len. */
                if (strncmp(name, ns->name, len) ||
                    (name[len] && name[len] != ' '))
                        continue;
                /*
                 * strncmp() only proves a common prefix of @len bytes.
                 * The namespace's own name has to end there as well.
                 */
                if (strlen(ns->name) != len)
                        continue;
                return ns;
        }
        return NULL;
}

/**
 * tomoyo_assign_namespace - Create a new namespace.
 *
 * @domainname: Name of namespace to create. Only the part up to the first
 *              ' ' (or the end of string) is used as the namespace name.
 *
 * An already existing namespace of that name is returned as is.
 *
 * Returns pointer to "struct tomoyo_policy_namespace" on success,
 * NULL otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
struct tomoyo_policy_namespace *tomoyo_assign_namespace(const char *domainname)
{
        struct tomoyo_policy_namespace *ns;
        struct tomoyo_policy_namespace *new_ns;
        unsigned int len;

        /* Length of the leading namespace component. */
        for (len = 0; domainname[len] && domainname[len] != ' '; len++)
                ;
        ns = tomoyo_find_namespace(domainname, len);
        if (ns)
                return ns;
        if (len >= TOMOYO_EXEC_TMPSIZE - 10 || !tomoyo_domain_def(domainname))
                return NULL;
        /*
         * The name is stored right behind the structure.
         * NOTE(review): the allocation result is not checked here; presumably
         * tomoyo_memory_ok() rejects a NULL pointer - confirm.
         */
        new_ns = kzalloc(sizeof(*new_ns) + len + 1, GFP_NOFS | __GFP_NOWARN);
        if (!mutex_lock_interruptible(&tomoyo_policy_lock)) {
                /* Look again now that the policy lock is held. */
                ns = tomoyo_find_namespace(domainname, len);
                if (!ns && tomoyo_memory_ok(new_ns)) {
                        char *name = (char *) (new_ns + 1);

                        memmove(name, domainname, len);
                        name[len] = '\0';
                        new_ns->name = name;
                        tomoyo_init_policy_namespace(new_ns);
                        ns = new_ns;
                        /* Ownership moved; keep kfree() below off it. */
                        new_ns = NULL;
                }
                mutex_unlock(&tomoyo_policy_lock);
        }
        kfree(new_ns);
        return ns;
}

/**
 * tomoyo_namespace_jump - Check for namespace jump.
 *
 * @domainname: Name of domain.
 *
 * Returns true if namespace differs, false otherwise.
 */
static bool tomoyo_namespace_jump(const char *domainname)
{
        const char *namespace = tomoyo_current_namespace()->name;
        const int len = strlen(namespace);

        return strncmp(domainname, namespace, len) ||
                (domainname[len] && domainname[len] != ' ');
}

/**
 * tomoyo_assign_domain - Create a domain or a namespace.
 *
 * @domainname: The name of domain.
 * @transit:    True if transit to domain found or created.
 *
 * Returns pointer to "struct tomoyo_domain_info" on success, NULL otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
                                                const bool transit)
{
        struct tomoyo_domain_info e = { };
        struct tomoyo_domain_info *entry = tomoyo_find_domain(domainname);
        bool created = false;

        if (entry) {
                if (transit) {
                        /*
                         * Since namespace is created at runtime, profiles may
                         * not be created by the moment the process transits to
                         * that domain. Do not perform domain transition if
                         * profile for that domain is not yet created.
                         */
                        if (tomoyo_policy_loaded &&
                            !entry->ns->profile_ptr[entry->profile])
                                return NULL;
                }
                return entry;
        }
        /* Requested domain does not exist. */
        /* Don't create requested domain if domainname is invalid. */
        if (strlen(domainname) >= TOMOYO_EXEC_TMPSIZE - 10 ||
            !tomoyo_correct_domain(domainname))
                return NULL;
        /*
         * Since definition of profiles and acl_groups may differ across
         * namespaces, do not inherit "use_profile" and "use_group" settings
         * by automatically creating requested domain upon domain transition.
         */
        if (transit && tomoyo_namespace_jump(domainname))
                return NULL;
        e.ns = tomoyo_assign_namespace(domainname);
        if (!e.ns)
                return NULL;
        /*
         * "use_profile" and "use_group" settings for automatically created
         * domains are inherited from current domain. These are 0 for manually
         * created domains.
         */
        if (transit) {
                const struct tomoyo_domain_info *domain = tomoyo_domain();

                e.profile = domain->profile;
                memcpy(e.group, domain->group, sizeof(e.group));
        }
        e.domainname = tomoyo_get_name(domainname);
        if (!e.domainname)
                return NULL;
        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                goto out;
        entry = tomoyo_find_domain(domainname);
        if (!entry) {
                entry = tomoyo_commit_ok(&e, sizeof(e));
                if (entry) {
                        INIT_LIST_HEAD(&entry->acl_info_list);
                        list_add_tail_rcu(&entry->list, &tomoyo_domain_list);
                        created = true;
                }
        }
        mutex_unlock(&tomoyo_policy_lock);
out:
        tomoyo_put_name(e.domainname);
        if (entry && transit) {
                if (created) {
                        struct tomoyo_request_info r;
                        int i;

                        tomoyo_init_request_info(&r, entry,
                                                 TOMOYO_MAC_FILE_EXECUTE);
                        r.granted = false;
                        tomoyo_write_log(&r, "use_profile %u\n",
                                         entry->profile);
                        for (i = 0; i < TOMOYO_MAX_ACL_GROUPS; i++)
                                if (test_bit(i, entry->group))
                                        tomoyo_write_log(&r, "use_group %u\n",
                                                         i);
                        tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES);
                }
        }
        return entry;
}

/**
 * tomoyo_environ - Check permission for environment variable names.
 *
 * @ee: Pointer to "struct tomoyo_execve".
 *
 * Walks the pages of @ee->bprm starting at bprm->p, skips the bprm->argc
 * argument strings and then hands each of the bprm->envc environment strings
 * to tomoyo_env_perm(). Because '=' is stored as '\0', the buffer passed to
 * tomoyo_env_perm() reads as the variable name only when used as a C string.
 *
 * Returns 0 on success, negative value otherwise. Any error is turned into 0
 * unless the mode is TOMOYO_CONFIG_ENFORCING.
 */
static int tomoyo_environ(struct tomoyo_execve *ee)
{
        struct tomoyo_request_info *r = &ee->r;
        struct linux_binprm *bprm = ee->bprm;
        /* env_page.data is allocated by tomoyo_dump_page(). */
        struct tomoyo_page_dump env_page = { };
        char *arg_ptr; /* Size is TOMOYO_EXEC_TMPSIZE bytes */
        int arg_len = 0;
        unsigned long pos = bprm->p;
        int offset = pos % PAGE_SIZE;
        int argv_count = bprm->argc;
        int envp_count = bprm->envc;
        /* -ENOMEM doubles as the "not finished yet" marker of the loop below. */
        int error = -ENOMEM;

        ee->r.type = TOMOYO_MAC_ENVIRON;
        ee->r.profile = r->domain->profile;
        ee->r.mode = tomoyo_get_mode(r->domain->ns, ee->r.profile,
                                     TOMOYO_MAC_ENVIRON);
        /* Nothing to do if the mode is 0 or there are no variables. */
        if (!r->mode || !envp_count)
                return 0;
        arg_ptr = kzalloc(TOMOYO_EXEC_TMPSIZE, GFP_NOFS);
        if (!arg_ptr)
                goto out;
        /* One iteration per page; a string may span several pages. */
        while (error == -ENOMEM) {
                if (!tomoyo_dump_page(bprm, pos, &env_page))
                        goto out;
                /* Advance to the start of the next page. */
                pos += PAGE_SIZE - offset;
                /* Read. */
                /* Skip the argv[] strings by counting their terminators. */
                while (argv_count && offset < PAGE_SIZE) {
                        if (!env_page.data[offset++])
                                argv_count--;
                }
                if (argv_count) {
                        offset = 0;
                        continue;
                }
                while (offset < PAGE_SIZE) {
                        const unsigned char c = env_page.data[offset++];

                        /*
                         * At most 4 bytes are stored per input byte, so the
                         * "- 10" margin keeps every store inside arg_ptr.
                         * Input beyond that limit is dropped.
                         */
                        if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) {
                                if (c == '=') {
                                        arg_ptr[arg_len++] = '\0';
                                } else if (c == '\\') {
                                        arg_ptr[arg_len++] = '\\';
                                        arg_ptr[arg_len++] = '\\';
                                } else if (c > ' ' && c < 127) {
                                        arg_ptr[arg_len++] = c;
                                } else {
                                        /* Everything else becomes \ooo. */
                                        arg_ptr[arg_len++] = '\\';
                                        arg_ptr[arg_len++] = (c >> 6) + '0';
                                        arg_ptr[arg_len++]
                                                = ((c >> 3) & 7) + '0';
                                        arg_ptr[arg_len++] = (c & 7) + '0';
                                }
                        } else {
                                arg_ptr[arg_len] = '\0';
                        }
                        /* Keep collecting until the string's terminator. */
                        if (c)
                                continue;
                        if (tomoyo_env_perm(r, arg_ptr)) {
                                error = -EPERM;
                                break;
                        }
                        /* All environment strings were checked. */
                        if (!--envp_count) {
                                error = 0;
                                break;
                        }
                        arg_len = 0;
                }
                offset = 0;
        }
out:
        /* Failures are fatal only in enforcing mode. */
        if (r->mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        kfree(env_page.data);
        kfree(arg_ptr);
        return error;
}

/**
 * tomoyo_find_next_domain - Find a domain.
 *
 * @bprm: Pointer to "struct linux_binprm".
 *
 * Checks execute permission for the program of @bprm, decides which domain
 * the current task runs in afterwards, records that domain in the task's
 * "struct tomoyo_task" and finally checks environment variable names via
 * tomoyo_environ().
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_find_next_domain(struct linux_binprm *bprm)
{
        struct tomoyo_domain_info *old_domain = tomoyo_domain();
        /* Stays NULL until a destination domain has been determined. */
        struct tomoyo_domain_info *domain = NULL;
        const char *original_name = bprm->filename;
        int retval = -ENOMEM;
        bool reject_on_transition_failure = false;
        const struct tomoyo_path_info *candidate;
        struct tomoyo_path_info exename;
        struct tomoyo_execve *ee = kzalloc(sizeof(*ee), GFP_NOFS);

        if (!ee)
                return -ENOMEM;
        /* Scratch buffer in which the destination domainname is built. */
        ee->tmp = kzalloc(TOMOYO_EXEC_TMPSIZE, GFP_NOFS);
        if (!ee->tmp) {
                kfree(ee);
                return -ENOMEM;
        }
        /* ee->dump->data is allocated by tomoyo_dump_page(). */
        tomoyo_init_request_info(&ee->r, NULL, TOMOYO_MAC_FILE_EXECUTE);
        ee->r.ee = ee;
        ee->bprm = bprm;
        ee->r.obj = &ee->obj;
        ee->obj.path1 = bprm->file->f_path;
        /* Get symlink's pathname of program. */
        retval = -ENOENT;
        exename.name = tomoyo_realpath_nofollow(original_name);
        if (!exename.name)
                goto out;
        tomoyo_fill_path_info(&exename);
retry:
        /* Check 'aggregator' directive. */
        {
                struct tomoyo_aggregator *ptr;
                struct list_head *list =
                        &old_domain->ns->policy_list[TOMOYO_ID_AGGREGATOR];

                /* The first matching aggregator replaces the pathname. */
                candidate = &exename;
                list_for_each_entry_rcu(ptr, list, head.list,
                                        srcu_read_lock_held(&tomoyo_ss)) {
                        if (ptr->head.is_deleted ||
                            !tomoyo_path_matches_pattern(&exename,
                                                         ptr->original_name))
                                continue;
                        candidate = ptr->aggregated_name;
                        break;
                }
        }

        /* Check execute permission. */
        retval = tomoyo_execute_permission(&ee->r, candidate);
        /* Start over from the aggregator lookup when asked to retry. */
        if (retval == TOMOYO_RETRY_REQUEST)
                goto retry;
        if (retval < 0)
                goto out;
        /*
         * To be able to specify domainnames with wildcards, use the
         * pathname specified in the policy (which may contain
         * wildcard) rather than the pathname passed to execve()
         * (which never contains wildcard).
         */
        if (ee->r.param.path.matched_path)
                candidate = ee->r.param.path.matched_path;

        /*
         * Check for domain transition preference if "file execute" matched.
         * If preference is given, make execve() fail if domain transition
         * has failed, for domain transition preference should be used with
         * destination domain defined.
         */
        if (ee->transition) {
                const char *domainname = ee->transition->name;

                reject_on_transition_failure = true;
                /* The keywords below jump into the switch statement. */
                if (!strcmp(domainname, "keep"))
                        goto force_keep_domain;
                if (!strcmp(domainname, "child"))
                        goto force_child_domain;
                if (!strcmp(domainname, "reset"))
                        goto force_reset_domain;
                if (!strcmp(domainname, "initialize"))
                        goto force_initialize_domain;
                if (!strcmp(domainname, "parent")) {
                        char *cp;

                        /* Current domainname with its last word removed. */
                        strscpy(ee->tmp, old_domain->domainname->name, TOMOYO_EXEC_TMPSIZE);
                        cp = strrchr(ee->tmp, ' ');
                        if (cp)
                                *cp = '\0';
                } else if (*domainname == '<')
                        /* A name starting with '<' is used as is. */
                        strscpy(ee->tmp, domainname, TOMOYO_EXEC_TMPSIZE);
                else
                        /* Anything else is appended to the current name. */
                        snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s",
                                 old_domain->domainname->name, domainname);
                goto force_jump_domain;
        }
        /*
         * No domain transition preference specified.
         * Calculate domain to transit to.
         */
        switch (tomoyo_transition_type(old_domain->ns, old_domain->domainname,
                                       candidate)) {
        case TOMOYO_TRANSITION_CONTROL_RESET:
force_reset_domain:
                /* Transit to the root of specified namespace. */
                snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "<%s>",
                         candidate->name);
                /*
                 * Make execve() fail if domain transition across namespaces
                 * has failed.
                 */
                reject_on_transition_failure = true;
                break;
        case TOMOYO_TRANSITION_CONTROL_INITIALIZE:
force_initialize_domain:
                /* Transit to the child of current namespace's root. */
                snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s",
                         old_domain->ns->name, candidate->name);
                break;
        case TOMOYO_TRANSITION_CONTROL_KEEP:
force_keep_domain:
                /* Keep current domain. */
                domain = old_domain;
                break;
        default:
                if (old_domain == &tomoyo_kernel_domain &&
                    !tomoyo_policy_loaded) {
                        /*
                         * Needn't to transit from kernel domain before
                         * starting /sbin/init. But transit from kernel domain
                         * if executing initializers because they might start
                         * before /sbin/init.
                         */
                        domain = old_domain;
                        break;
                }
force_child_domain:
                /* Normal domain transition. */
                snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s",
                         old_domain->domainname->name, candidate->name);
                break;
        }
force_jump_domain:
        /* Find or create the domain named in ee->tmp unless already chosen. */
        if (!domain)
                domain = tomoyo_assign_domain(ee->tmp, true);
        if (domain)
                retval = 0;
        else if (reject_on_transition_failure) {
                pr_warn("ERROR: Domain '%s' not ready.\n", ee->tmp);
                retval = -ENOMEM;
        } else if (ee->r.mode == TOMOYO_CONFIG_ENFORCING)
                retval = -ENOMEM;
        else {
                /* Not enforcing: let execve() proceed, warn only once. */
                retval = 0;
                if (!old_domain->flags[TOMOYO_DIF_TRANSITION_FAILED]) {
                        old_domain->flags[TOMOYO_DIF_TRANSITION_FAILED] = true;
                        ee->r.granted = false;
                        tomoyo_write_log(&ee->r, "%s", tomoyo_dif
                                         [TOMOYO_DIF_TRANSITION_FAILED]);
                        pr_warn("ERROR: Domain '%s' not defined.\n", ee->tmp);
                }
        }
 out:
        /* Without a destination the task remains in its current domain. */
        if (!domain)
                domain = old_domain;
        /* Update reference count on "struct tomoyo_domain_info". */
        {
                struct tomoyo_task *s = tomoyo_task(current);

                s->old_domain_info = s->domain_info;
                s->domain_info = domain;
                atomic_inc(&domain->users);
        }
        kfree(exename.name);
        /* Environment variables are checked against the new domain. */
        if (!retval) {
                ee->r.domain = domain;
                retval = tomoyo_environ(ee);
        }
        kfree(ee->tmp);
        kfree(ee->dump.data);
        kfree(ee);
        return retval;
}

/**
 * tomoyo_dump_page - Dump a page to buffer.
 *
 * @bprm: Pointer to "struct linux_binprm".
 * @pos:  Location to dump.
 * @dump: Pointer to "struct tomoyo_page_dump".
 *
 * @dump->data is allocated (PAGE_SIZE bytes) on first use. Only the part of
 * the page from @pos % PAGE_SIZE to the end of the page is copied, and the
 * copy is skipped when the page is the one already recorded in @dump->page.
 *
 * Returns true on success, false otherwise.
 */
bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
                      struct tomoyo_page_dump *dump)
{
        struct page *page;
#ifdef CONFIG_MMU
        int ret;
#endif

        /* dump->data is released by tomoyo_find_next_domain(). */
        if (!dump->data) {
                dump->data = kzalloc(PAGE_SIZE, GFP_NOFS);
                if (!dump->data)
                        return false;
        }
        /* Same with get_arg_page(bprm, pos, 0) in fs/exec.c */
#ifdef CONFIG_MMU
        /*
         * This is called at execve() time in order to dig around
         * in the argv/environment of the new process
         * (represented by bprm).
         */
        mmap_read_lock(bprm->mm);
        ret = get_user_pages_remote(bprm->mm, pos, 1,
                                    FOLL_FORCE, &page, NULL);
        mmap_read_unlock(bprm->mm);
        if (ret <= 0)
                return false;
#else
        /* Without an MMU the page is looked up in bprm->page[] directly. */
        page = bprm->page[pos / PAGE_SIZE];
#endif
        if (page != dump->page) {
                const unsigned int offset = pos % PAGE_SIZE;
                /*
                 * Maybe kmap()/kunmap() should be used here.
                 * But remove_arg_zero() uses kmap_atomic()/kunmap_atomic().
                 * So do I.
                 */
                char *kaddr = kmap_atomic(page);

                dump->page = page;
                memcpy(dump->data + offset, kaddr + offset,
                       PAGE_SIZE - offset);
                kunmap_atomic(kaddr);
        }
        /* Same with put_arg_page(page) in fs/exec.c */
#ifdef CONFIG_MMU
        put_page(page);
#endif
        return true;
}






















































   14 












    1 





   14 



   14 


































   18 




    1 

   14 




    1 
    1 








    1 












    1 

    1 


    1 


    1 









    1 











    1 

    1 











    1 










    1 









    1 




    1 





    1 














    1 









    1 










    1 











































    1 


    1 
















    3 










    1 




    1 
    1 
















    1 














    1 








   11 



   10 































































































   12 
















   13 







    4 

   11 









    6 











   13 




















    3 

    1 

    4 


    1 









    1 










    2 
   11 





   12 








    2 





































    1 



    5 


    5 














    5 














    5 

    2 




    4 

    2 









    5 








    5 


    4 









    1 







    5 






























    2 




    1 



    1 



    1 




    1 
    1 


    1 




















    1 












    1 
    1 












    9 






    8 

    3 
















    9 











    3 







    2 

    3 




    3 











    8 






    8 













    7 











    8 

    8 


    7 






























































































    3 












    3 





    1 
    2 

    3 

    3 



    1 
    1 



    3 

    1 
    3 

    1 

    1 
    3 





    1 
    2 





    1 
    1 






    1 





    3 
    1 




    1 
    3 


























    1 











    1 









    1 
























    1 
    1 






















    1 











    1 


















    1 












    1 


    1 


    1 































    1 
    1 







    1 

    1 













    1 

    1 

    1 







    1 
    1 

    1 










    1 



































































































































    4 









    4 










    4 






    3 



    3 




    1 
    1 




















    1 






























    2 







    2 





    3 

    1 
    1 











    3 



    1 
    3 



    3 

    3 
    3 
    2 
    1 
    1 


    1 



    3 























































    3 






    1 




    1 

































    4 








    2 




















    1 















    2 


    2 






    2 





















    4 




    4 
    4 
    4 

















    4 














    4 
    1 


    1 
    1 








    1 
    1 





    2 
    2 




    2 





















    1 

    1 


    2 

































    1 
    2 









    3 

    2 












    1 

























    3 


    3 








    3 

    1 


    3 




    3 

    2 



    1 

    1 
    2 




































    1 













    1 










    1 
    1 



    2 
    1 

    2 

































    2 





    2 



    1 
    2 

    2 














    2 






    1 










    3 



















    3 








    1 

    1 


    2 









































    3 

    3 



    1 

































    2 


    1 







    1 








    1 


    1 



















    3 

    3 































































    3 











    3 

    3 






    2 






    1 


    1 


    1 
    2 









    3 

    1 



    3 


    2 














    2 
    3 







    3 

















    1 
    3 
    3 

























    3 




    3 




    1 



    2 



    3 
    3 





    3 











































    2 








    1 















    3 












    3 
    3 


    3 









    3 





    3 



























    2 







    3 








    3 
















    3 


    3 











    3 




















    3 


    3 















    3 







    3 














    3 



















    3 
































































    5 
















    3 










    3 





    2 

    2 




































    2 
















    2 

    2 


    2 









































    2 






    2 

    1 
    1 

    2 
















    3 


    3 


    1 


    2 


    2 




    2 


    2 











































































































    1 



    1 







    2 
    2 


    2 





























    3 



    4 


    4 


    2 

    2 










    1 













    1 








    1 



    1 
    1 




    1 
    1 


    1 














    1 
    1 



    1 
    1 





















    1 

    1 
















    1 








    1 

    1 













    1 








    1 




    1 


    1 





    1 










    1 




    1 




    1 
































    1 







































































































    1 
    2 
    2 


































































    2 
    2 












    1 

    1 


































































































































































































    3 















































    2 








    2 












































































































































































    1 



    1 
















































    3 











    1 
    2 
    3 

    3 


    1 
    2 

    2 








    3 






    3 









    3 

























    3 


    2 
    1 









    1 
    2 



    3 




    3 
























































   18 







   17 













   16 
    1 








    1 
    1 





   16 
   16 
   15 
   16 





   18 















   18 








    1 


   15 





   15 



    1 

   14 

   18 












   17 


























   17 









   18 
























































































































    1 






    1 



















    3 



    2 





    2 








    5 



    5 















    5 























































































    1 


    1 







    1 



















    1 














    1 










































































































































































































































































































    1 









































    1 
    1 
    1 

    1 


    1 


























   17 


   17 

   14 












   16 
    1 







   16 














   16 

   17 

   15 






































    1 




































    1 
    1 




    1 











































































    4 





















    4 



    5 



    5 

    2 



    5 





    1 




















    1 
    1 







    4 





    1 

















    1 
























    1 












    1 






















    1 









    1 















    1 







    1 









    4 


    5 
    2 










    1 




    1 

    1 









    1 





    2 











    2 
















    2 

    2 

    2 

    2 

    2 

    2 




















































    5 

























    5 


















    5 















    3 
    3 


    3 

















    1 
    3 








   14 












   18 












   16 










   17 




































































































































































   17 





   18 




   15 





   14 




















   15 





   14 



































































































    2 

    2 




















    1 













    2 



    2 















































































    2 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *        (jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

#include <trace/events/ext4.h>

static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
                              struct ext4_inode_info *ei)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        __u32 csum;
        __u16 dummy_csum = 0;
        int offset = offsetof(struct ext4_inode, i_checksum_lo);
        unsigned int csum_size = sizeof(dummy_csum);

        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
        csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
        offset += csum_size;
        csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
                           EXT4_GOOD_OLD_INODE_SIZE - offset);

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                offset = offsetof(struct ext4_inode, i_checksum_hi);
                csum = ext4_chksum(sbi, csum, (__u8 *)raw +
                                   EXT4_GOOD_OLD_INODE_SIZE,
                                   offset - EXT4_GOOD_OLD_INODE_SIZE);
                if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
                        csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
                                           csum_size);
                        offset += csum_size;
                }
                csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
                                   EXT4_INODE_SIZE(inode->i_sb) - offset);
        }

        return csum;
}

static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
                                  struct ext4_inode_info *ei)
{
        __u32 provided, calculated;

        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_LINUX) ||
            !ext4_has_metadata_csum(inode->i_sb))
                return 1;

        provided = le16_to_cpu(raw->i_checksum_lo);
        calculated = ext4_inode_csum(inode, raw, ei);
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
                provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
        else
                calculated &= 0xFFFF;

        return provided == calculated;
}

void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
                         struct ext4_inode_info *ei)
{
        __u32 csum;

        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_LINUX) ||
            !ext4_has_metadata_csum(inode->i_sb))
                return;

        csum = ext4_inode_csum(inode, raw, ei);
        raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
                raw->i_checksum_hi = cpu_to_le16(csum >> 16);
}

/*
 * Emit the trace event and, if the inode has a jinode attached, hand the
 * truncate to jbd2_journal_begin_ordered_truncate().
 *
 * Returns 0 when there is no jinode, otherwise whatever the jbd2 call
 * returns.
 */
static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        trace_ext4_begin_ordered_truncate(inode, new_size);

        if (ei->jinode)
                return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
                                                           ei->jinode,
                                                           new_size);

        /*
         * No jinode means the file was never opened for writing, so
         * there are no outstanding writes that would need flushing.
         */
        return 0;
}

/* Forward declaration of a static helper defined elsewhere in this file. */
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
                                  int pextents);

/*
 * Test whether an inode is a fast symlink.
 * A fast symlink has its symlink data stored in ext4_inode_info->i_data.
 */
int ext4_inode_is_fast_symlink(struct inode *inode)
{
        if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
                int ea_blocks = EXT4_I(inode)->i_file_acl ?
                                EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;

                if (ext4_has_inline_data(inode))
                        return 0;

                return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
        }
        return S_ISLNK(inode->i_mode) && inode->i_size &&
               (inode->i_size < EXT4_N_BLOCKS * 4);
}

/*
 * Evict an inode from memory and, when it has no remaining links, delete
 * it from the filesystem.
 *
 * If i_nlink is non-zero the page cache is dropped and the in-core inode
 * is cleared; a bad inode is simply cleared.  Otherwise the inode is
 * truncated to size zero, its xattr references are removed, its orphan
 * record is deleted, i_dtime is stamped and the inode is freed, using a
 * journal handle obtained from ext4_journal_start().  If any step fails,
 * the function unwinds and falls back to only clearing the in-core inode
 * (the no_delete path).
 */
void ext4_evict_inode(struct inode *inode)
{
        handle_t *handle;
        int err;
        /*
         * Credits for final inode cleanup and freeing:
         * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
         * (xattr block freeing), bitmap, group descriptor (inode freeing)
         */
        int extra_credits = 6;
        struct ext4_xattr_inode_array *ea_inode_array = NULL;
        /* Set when we took sb_start_intwrite() and must release it. */
        bool freeze_protected = false;

        trace_ext4_evict_inode(inode);

        /* EA inodes get an extra eviction step before anything else. */
        if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
                ext4_evict_ea_inode(inode);
        /* Still linked: drop cached pages, there is nothing to delete. */
        if (inode->i_nlink) {
                truncate_inode_pages_final(&inode->i_data);

                goto no_delete;
        }

        if (is_bad_inode(inode))
                goto no_delete;
        dquot_initialize(inode);

        /* The return value of the ordered truncate is not checked here. */
        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages_final(&inode->i_data);

        /*
         * For inodes with journalled data, transaction commit could have
         * dirtied the inode. And for inodes with dioread_nolock, unwritten
         * extents converting worker could merge extents and also have dirtied
         * the inode. Flush worker is ignoring it because of I_FREEING flag but
         * we still need to remove the inode from the writeback lists.
         */
        if (!list_empty_careful(&inode->i_io_list))
                inode_io_list_del(inode);

        /*
         * Protect us against freezing - iput() caller didn't have to have any
         * protection against it. When we are in a running transaction though,
         * we are already protected against freezing and we cannot grab further
         * protection due to lock ordering constraints.
         */
        if (!ext4_journal_current_handle()) {
                sb_start_intwrite(inode->i_sb);
                freeze_protected = true;
        }

        if (!IS_NOQUOTA(inode))
                extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);

        /*
         * Block bitmap, group descriptor, and inode are accounted in both
         * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
         */
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
                         ext4_blocks_for_truncate(inode) + extra_credits - 3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
                 * If we're going to skip the normal cleanup, we still need to
                 * make sure that the in-core orphan linked list is properly
                 * cleaned up.
                 */
                ext4_orphan_del(NULL, inode);
                if (freeze_protected)
                        sb_end_intwrite(inode->i_sb);
                goto no_delete;
        }

        if (IS_SYNC(inode))
                ext4_handle_sync(handle);

        /*
         * Set inode->i_size to 0 before calling ext4_truncate(). We need
         * special handling of symlinks here because i_size is used to
         * determine whether ext4_inode_info->i_data contains symlink data or
         * block mappings. Setting i_size to 0 will remove its fast symlink
         * status. Erase i_data so that it becomes a valid empty block map.
         */
        if (ext4_inode_is_fast_symlink(inode))
                memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
        inode->i_size = 0;
        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
                ext4_warning(inode->i_sb,
                             "couldn't mark inode dirty (err %d)", err);
                goto stop_handle;
        }
        /* Only truncate when the inode actually owns blocks. */
        if (inode->i_blocks) {
                err = ext4_truncate(inode);
                if (err) {
                        ext4_error_err(inode->i_sb, -err,
                                       "couldn't truncate inode %lu (err %d)",
                                       inode->i_ino, err);
                        goto stop_handle;
                }
        }

        /* Remove xattr references. */
        err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
                                      extra_credits);
        if (err) {
                ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
                /*
                 * Shared error exit: reached by falling through from the
                 * xattr failure above and by goto from the earlier failures.
                 * Stops the handle, drops the in-core orphan entry, releases
                 * freeze protection and frees the EA inode array.
                 */
stop_handle:
                ext4_journal_stop(handle);
                ext4_orphan_del(NULL, inode);
                if (freeze_protected)
                        sb_end_intwrite(inode->i_sb);
                ext4_xattr_inode_array_free(ea_inode_array);
                goto no_delete;
        }

        /*
         * Kill off the orphan record which ext4_truncate created.
         * AKPM: I think this can be inside the above `if'.
         * Note that ext4_orphan_del() has to be able to cope with the
         * deletion of a non-existent orphan - this is because we don't
         * know if ext4_truncate() actually created an orphan record.
         * (Well, we could do this if we need to, but heck - it works)
         */
        ext4_orphan_del(handle, inode);
        /* Record the deletion time, truncated to 32 bits. */
        EXT4_I(inode)->i_dtime        = (__u32)ktime_get_real_seconds();

        /*
         * One subtle ordering requirement: if anything has gone wrong
         * (transaction abort, IO errors, whatever), then we can still
         * do these next steps (the fs will already have been marked as
         * having errors), but we can't free the inode if the mark_dirty
         * fails.
         */
        if (ext4_mark_inode_dirty(handle, inode))
                /* If that failed, just do the required in-core inode clear. */
                ext4_clear_inode(inode);
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
        if (freeze_protected)
                sb_end_intwrite(inode->i_sb);
        ext4_xattr_inode_array_free(ea_inode_array);
        /* Successful deletion path ends here; skip the no_delete fallback. */
        return;
no_delete:
        /*
         * Catch the case where something else accidentally dirtied the
         * inode being evicted, which could cause inode use-after-free
         * issues later.
         */
        WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));

        /* Inode is still on a fast-commit list: mark fast commit ineligible. */
        if (!list_empty(&EXT4_I(inode)->i_fc_list))
                ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
        ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
}

#ifdef CONFIG_QUOTA
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
        return &EXT4_I(inode)->i_reserved_quota;
}
#endif

/*
 * Consume @used reserved data blocks of @inode after they have been
 * allocated, and settle the matching quota reservation.
 *
 * @used is clamped (with a warning) to the number of blocks actually
 * reserved.  With @quota_claim set the quota reservation is claimed,
 * otherwise it is released.
 *
 * Called with i_data_sem down, which is important since we can call
 * ext4_discard_preallocations() from here.
 */
void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        spin_lock(&ei->i_block_reservation_lock);
        trace_ext4_da_update_reserve_space(inode, used, quota_claim);

        /* Never hand back more than was reserved. */
        if (unlikely(used > ei->i_reserved_data_blocks)) {
                ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
                         "with only %d reserved data blocks",
                         __func__, inode->i_ino, used,
                         ei->i_reserved_data_blocks);
                WARN_ON(1);
                used = ei->i_reserved_data_blocks;
        }

        /* Per-inode reservation and the filesystem-wide dirty counter. */
        ei->i_reserved_data_blocks -= used;
        percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);

        spin_unlock(&ei->i_block_reservation_lock);

        /* Quota accounting for the data blocks. */
        if (!quota_claim) {
                /*
                 * The range was fallocated on top of an existing delayed
                 * allocation, so writeback must not claim quota for those
                 * blocks a second time; just drop the reservation.
                 */
                dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
        } else {
                dquot_claim_block(inode, EXT4_C2B(sbi, used));
        }

        /*
         * Preallocations can be discarded once no reserved blocks remain
         * and nobody has the inode open for writing.
         */
        if (ei->i_reserved_data_blocks != 0)
                return;
        if (inode_is_open_for_write(inode))
                return;
        ext4_discard_preallocations(inode);
}

static int __check_block_validity(struct inode *inode, const char *func,
                                unsigned int line,
                                struct ext4_map_blocks *map)
{
        if (ext4_has_feature_journal(inode->i_sb) &&
            (inode->i_ino ==
             le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
                return 0;
        if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
                ext4_error_inode(inode, func, line, map->m_pblk,
                                 "lblock %lu mapped to illegal pblock %llu "
                                 "(length %d)", (unsigned long) map->m_lblk,
                                 map->m_pblk, map->m_len);
                return -EFSCORRUPTED;
        }
        return 0;
}

/*
 * Zero out @len blocks starting at physical block @pblk.
 *
 * Encrypted regular files go through fscrypt_zeroout_range() (which also
 * receives @lblk); everything else uses sb_issue_zeroout().  A positive
 * return from sb_issue_zeroout() is reported as 0.
 */
int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
                       ext4_lblk_t len)
{
        int err;

        if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
                return fscrypt_zeroout_range(inode, lblk, pblk, len);

        err = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);

        return err > 0 ? 0 : err;
}

/*
 * Convenience wrapper around __check_block_validity() that supplies the
 * calling function's name and line number for error reporting.
 */
#define check_block_validity(inode, map)        \
        __check_block_validity((inode), __func__, __LINE__, (map))

#ifdef ES_AGGRESSIVE_TEST
/*
 * Debug cross-check: look the mapping up again into @map while holding
 * i_data_sem for reading, and print a message if it disagrees with
 * @es_map in logical block, flags or physical block.
 */
static void ext4_map_blocks_es_recheck(handle_t *handle,
                                       struct inode *inode,
                                       struct ext4_map_blocks *es_map,
                                       struct ext4_map_blocks *map,
                                       int flags)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        int retval;

        map->m_flags = 0;

        /*
         * The two results may legitimately differ (e.g. xfstests #223 with
         * dioread_nolock): the extent status tree is consulted without
         * i_data_sem, so an unwritten extent could have been converted in
         * the meantime.
         */
        down_read(&ei->i_data_sem);
        retval = ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ?
                        ext4_ext_map_blocks(handle, inode, map, 0) :
                        ext4_ind_map_blocks(handle, inode, map, 0);
        up_read(&ei->i_data_sem);

        /*
         * m_len is deliberately not compared: extents get collapsed in the
         * status tree, so the lengths need not be equal.
         */
        if (es_map->m_lblk == map->m_lblk &&
            es_map->m_flags == map->m_flags &&
            es_map->m_pblk == map->m_pblk)
                return;

        printk("ES cache assertion failed for inode: %lu "
               "es_cached ex [%d/%d/%llu/%x] != "
               "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
               inode->i_ino, es_map->m_lblk, es_map->m_len,
               es_map->m_pblk, es_map->m_flags, map->m_lblk,
               map->m_len, map->m_pblk, map->m_flags,
               retval, flags);
}
#endif /* ES_AGGRESSIVE_TEST */

/*
 * The ext4_map_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
 * and store the allocated blocks in the result buffer head and mark it
 * mapped.
 *
 * If the file is extent-based, it calls ext4_ext_map_blocks(); otherwise
 * it calls ext4_ind_map_blocks() to handle files that use indirect block
 * mapping.
 *
 * On success, it returns the number of blocks being mapped or allocated.
 * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are
 * pre-allocated and unwritten, the resulting @map is marked as unwritten.
 * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped.
 *
 * It returns 0 if plain look up failed (blocks have not been allocated), in
 * that case, @map is returned as unmapped but we still do fill map->m_len to
 * indicate the length of a hole starting at map->m_lblk.
 *
 * It returns the error in case of allocation failure.
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
                    struct ext4_map_blocks *map, int flags)
{
        struct extent_status es;
        int retval;
        int ret = 0;
#ifdef ES_AGGRESSIVE_TEST
        struct ext4_map_blocks orig_map;

        memcpy(&orig_map, map, sizeof(*map));
#endif

        map->m_flags = 0;
        ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
                  flags, map->m_len, (unsigned long) map->m_lblk);

        /*
         * ext4_map_blocks returns an int, and m_len is an unsigned int
         */
        if (unlikely(map->m_len > INT_MAX))
                map->m_len = INT_MAX;

        /* We can handle the block number less than EXT_MAX_BLOCKS */
        if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
                return -EFSCORRUPTED;

        /* Lookup extent status tree firstly */
        if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
            ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
                        map->m_pblk = ext4_es_pblock(&es) +
                                        map->m_lblk - es.es_lblk;
                        map->m_flags |= ext4_es_is_written(&es) ?
                                        EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
                        retval = es.es_len - (map->m_lblk - es.es_lblk);
                        if (retval > map->m_len)
                                retval = map->m_len;
                        map->m_len = retval;
                } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
                        map->m_pblk = 0;
                        map->m_flags |= ext4_es_is_delayed(&es) ?
                                        EXT4_MAP_DELAYED : 0;
                        retval = es.es_len - (map->m_lblk - es.es_lblk);
                        if (retval > map->m_len)
                                retval = map->m_len;
                        map->m_len = retval;
                        retval = 0;
                } else {
                        BUG();
                }

                if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
                        return retval;
#ifdef ES_AGGRESSIVE_TEST
                ext4_map_blocks_es_recheck(handle, inode, map,
                                           &orig_map, flags);
#endif
                goto found;
        }
        /*
         * In the query cache no-wait mode, nothing we can do more if we
         * cannot find extent in the cache.
         */
        if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
                return 0;

        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, 0);
        } else {
                retval = ext4_ind_map_blocks(handle, inode, map, 0);
        }
        if (retval > 0) {
                unsigned int status;

                if (unlikely(retval != map->m_len)) {
                        ext4_warning(inode->i_sb,
                                     "ES len assertion failed for inode "
                                     "%lu: retval %d != map->m_len %d",
                                     inode->i_ino, retval, map->m_len);
                        WARN_ON(1);
                }

                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
                    !(status & EXTENT_STATUS_WRITTEN) &&
                    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
                                       map->m_lblk + map->m_len - 1))
                        status |= EXTENT_STATUS_DELAYED;
                ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                                      map->m_pblk, status);
        }
        up_read((&EXT4_I(inode)->i_data_sem));

found:
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }

        /* If it is only a block(s) look up */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
                return retval;

        /*
         * Returns if the blocks have already allocated
         *
         * Note that if blocks have been preallocated
         * ext4_ext_map_blocks() returns with buffer head unmapped
         */
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
                /*
                 * If we need to convert extent to unwritten
                 * we continue and do the actual work in
                 * ext4_ext_map_blocks()
                 */
                if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
                        return retval;

        /*
         * Here we clear m_flags because after allocating an new extent,
         * it will be set again.
         */
        map->m_flags &= ~EXT4_MAP_FLAGS;

        /*
         * New blocks allocate and/or writing to unwritten extent
         * will possibly result in updating i_data, so we take
         * the write lock of i_data_sem, and call get_block()
         * with create == 1 flag.
         */
        down_write(&EXT4_I(inode)->i_data_sem);

        /*
         * We need to check for EXT4 here because migrate
         * could have changed the inode type in between
         */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                retval = ext4_ext_map_blocks(handle, inode, map, flags);
        } else {
                retval = ext4_ind_map_blocks(handle, inode, map, flags);

                if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
                        /*
                         * We allocated new blocks which will result in
                         * i_data's format changing.  Force the migrate
                         * to fail by clearing migrate flags
                         */
                        ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
                }
        }

        if (retval > 0) {
                unsigned int status;

                if (unlikely(retval != map->m_len)) {
                        ext4_warning(inode->i_sb,
                                     "ES len assertion failed for inode "
                                     "%lu: retval %d != map->m_len %d",
                                     inode->i_ino, retval, map->m_len);
                        WARN_ON(1);
                }

                /*
                 * We have to zeroout blocks before inserting them into extent
                 * status tree. Otherwise someone could look them up there and
                 * use them before they are really zeroed. We also have to
                 * unmap metadata before zeroing as otherwise writeback can
                 * overwrite zeros with stale data from block device.
                 */
                if (flags & EXT4_GET_BLOCKS_ZERO &&
                    map->m_flags & EXT4_MAP_MAPPED &&
                    map->m_flags & EXT4_MAP_NEW) {
                        ret = ext4_issue_zeroout(inode, map->m_lblk,
                                                 map->m_pblk, map->m_len);
                        if (ret) {
                                retval = ret;
                                goto out_sem;
                        }
                }

                /*
                 * If the extent has been zeroed out, we don't need to update
                 * extent status tree.
                 */
                if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
                    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                        if (ext4_es_is_written(&es))
                                goto out_sem;
                }
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
                    !(status & EXTENT_STATUS_WRITTEN) &&
                    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
                                       map->m_lblk + map->m_len - 1))
                        status |= EXTENT_STATUS_DELAYED;
                ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                                      map->m_pblk, status);
        }

out_sem:
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;

                /*
                 * Inodes with freshly allocated blocks where contents will be
                 * visible after transaction commit must be on transaction's
                 * ordered data list.
                 */
                if (map->m_flags & EXT4_MAP_NEW &&
                    !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
                    !(flags & EXT4_GET_BLOCKS_ZERO) &&
                    !ext4_is_quota_file(inode) &&
                    ext4_should_order_data(inode)) {
                        loff_t start_byte =
                                (loff_t)map->m_lblk << inode->i_blkbits;
                        loff_t length = (loff_t)map->m_len << inode->i_blkbits;

                        if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
                                ret = ext4_jbd2_inode_add_wait(handle, inode,
                                                start_byte, length);
                        else
                                ret = ext4_jbd2_inode_add_write(handle, inode,
                                                start_byte, length);
                        if (ret)
                                return ret;
                }
        }
        if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
                                map->m_flags & EXT4_MAP_MAPPED))
                ext4_fc_track_range(handle, inode, map->m_lblk,
                                        map->m_lblk + map->m_len - 1);
        if (retval < 0)
                ext_debug(inode, "failed with err %d\n", retval);
        return retval;
}

/*
 * Replace the EXT4_MAP_FLAGS bits of bh->b_state with the corresponding
 * bits taken from @flags; every other bit of b_state is left untouched.
 *
 * A buffer head without a backing page is treated as a private dummy and
 * is updated with a plain store. Otherwise b_state may be changed
 * concurrently by somebody else, so the new value is installed with a
 * compare-and-exchange that is retried until it succeeds.
 */
static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
{
        const unsigned long map_bits = flags & EXT4_MAP_FLAGS;
        unsigned long cur;
        unsigned long want;

        /* Dummy buffer_head: nobody else can see it, no atomics needed. */
        if (!bh->b_page) {
                bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map_bits;
                return;
        }

        /*
         * try_cmpxchg() refreshes 'cur' with the current value on failure,
         * so only the desired value has to be recomputed before retrying.
         */
        cur = READ_ONCE(bh->b_state);
        want = (cur & ~EXT4_MAP_FLAGS) | map_bits;
        while (unlikely(!try_cmpxchg(&bh->b_state, &cur, want)))
                want = (cur & ~EXT4_MAP_FLAGS) | map_bits;
}

/*
 * Common get_block implementation: map the blocks starting at @iblock
 * into @bh using ext4_map_blocks() with the given EXT4_GET_BLOCKS_* @flags.
 *
 * On entry bh->b_size gives the size of the request in bytes. On return
 * (mapped range or hole) it is rewritten to the byte length described by
 * the resulting map.
 *
 * Returns 0 when blocks were mapped or a hole was found, -ERANGE for
 * inodes with inline data, or the negative error from ext4_map_blocks().
 */
static int _ext4_get_block(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int flags)
{
        struct ext4_map_blocks map;
        int nr_mapped;

        if (ext4_has_inline_data(inode))
                return -ERANGE;

        map.m_lblk = iblock;
        map.m_len = bh->b_size >> inode->i_blkbits;

        nr_mapped = ext4_map_blocks(ext4_journal_current_handle(), inode,
                                    &map, flags);
        if (nr_mapped < 0)
                return nr_mapped;

        if (nr_mapped > 0) {
                map_bh(bh, inode->i_sb, map.m_pblk);
                ext4_update_bh_state(bh, map.m_flags);
        }

        /* Both the mapped case and the hole case report the length. */
        bh->b_size = inode->i_sb->s_blocksize * map.m_len;
        return 0;
}

/*
 * get_block callback: a non-zero @create asks for block allocation
 * (EXT4_GET_BLOCKS_CREATE), zero performs a pure lookup.
 */
int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh, int create)
{
        int flags = 0;

        if (create)
                flags = EXT4_GET_BLOCKS_CREATE;

        return _ext4_get_block(inode, iblock, bh, flags);
}

/*
 * get_block callback for buffered writes that must create an unwritten
 * extent when the blocks are not allocated yet. The extent is converted
 * to written once the IO has completed.
 *
 * @create is only reported in the debug message; the mapping is always
 * requested with EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT.
 */
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create)
{
        int err;

        ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
                   inode->i_ino, create);

        err = _ext4_get_block(inode, iblock, bh_result,
                              EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
        if (err)
                return err;

        /*
         * An unwritten buffer is also flagged as new so that a partial
         * write zeroes the rest of it; otherwise stale data could be
         * exposed.
         */
        if (buffer_unwritten(bh_result))
                set_buffer_new(bh_result);

        return 0;
}

/*
 * Maximum number of blocks we map for direct IO at once.
 *
 * NOTE(review): no user of this macro is visible in the surrounding code;
 * confirm it is still referenced elsewhere in the file.
 */
#define DIO_MAX_BLOCKS 4096

/*
 * Map logical block @block of @inode and return a buffer head for the
 * underlying physical block.
 *
 * `handle' can be NULL if create is zero (or during fast-commit replay).
 * EXT4_GET_BLOCKS_CREATE and EXT4_GET_BLOCKS_CACHED_NOWAIT must not be
 * combined.
 *
 * Returns:
 *   - NULL if nothing is mapped and creation was not requested,
 *   - ERR_PTR(-ENOSPC) if nothing is mapped although creation was requested,
 *   - ERR_PTR(err) on mapping, allocation or journalling failure,
 *   - in no-wait mode, whatever sb_find_get_block() yields for the block,
 *   - otherwise the buffer head; a freshly allocated block is zero-filled
 *     (unless already uptodate) and dirtied as metadata under @handle.
 */
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
                                ext4_lblk_t block, int map_flags)
{
        struct ext4_map_blocks map;
        struct buffer_head *bh;
        int create = map_flags & EXT4_GET_BLOCKS_CREATE;
        bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT;
        int ret;

        ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                    || handle != NULL || create == 0);
        ASSERT(create == 0 || !nowait);

        map.m_lblk = block;
        map.m_len = 1;
        ret = ext4_map_blocks(handle, inode, &map, map_flags);

        if (ret == 0) {
                if (create)
                        return ERR_PTR(-ENOSPC);
                return NULL;
        }
        if (ret < 0)
                return ERR_PTR(ret);

        /* No-wait callers only get a buffer that can be found right away. */
        if (nowait)
                return sb_find_get_block(inode->i_sb, map.m_pblk);

        bh = sb_getblk(inode->i_sb, map.m_pblk);
        if (unlikely(!bh))
                return ERR_PTR(-ENOMEM);

        if (!(map.m_flags & EXT4_MAP_NEW)) {
                BUFFER_TRACE(bh, "not a new buffer");
                return bh;
        }

        ASSERT(create != 0);
        ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                    || (handle != NULL));

        /*
         * Data is not always journalled any more, so it is an open
         * question whether the new buffer should always be journalled as
         * metadata. Regular file writes go through ext4_get_block
         * instead, so this is not a problem for now.
         */
        lock_buffer(bh);
        BUFFER_TRACE(bh, "call get_create_access");
        ret = ext4_journal_get_create_access(handle, inode->i_sb, bh,
                                             EXT4_JTR_NONE);
        if (unlikely(ret)) {
                unlock_buffer(bh);
                goto errout;
        }
        if (!buffer_uptodate(bh)) {
                memset(bh->b_data, 0, inode->i_sb->s_blocksize);
                set_buffer_uptodate(bh);
        }
        unlock_buffer(bh);

        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        ret = ext4_handle_dirty_metadata(handle, inode, bh);
        if (unlikely(ret))
                goto errout;

        return bh;

errout:
        brelse(bh);
        return ERR_PTR(ret);
}

/*
 * Like ext4_getblk(), but additionally make sure the returned buffer has
 * been read in.
 *
 * An error pointer, a NULL result or an already uptodate buffer from
 * ext4_getblk() is passed straight back. Otherwise the block is read with
 * REQ_META | REQ_PRIO; if that fails the buffer reference is dropped and
 * the error is returned as an ERR_PTR.
 */
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
                               ext4_lblk_t block, int map_flags)
{
        struct buffer_head *bh = ext4_getblk(handle, inode, block, map_flags);
        int err;

        if (IS_ERR(bh) || !bh || ext4_buffer_uptodate(bh))
                return bh;

        err = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
        if (!err)
                return bh;

        put_bh(bh);
        return ERR_PTR(err);
}

/*
 * Read a contiguous batch of @bh_count blocks starting at logical block
 * @block into @bhs.
 *
 * Buffer heads are looked up first (a NULL entry denotes a hole), then
 * reads are issued for every buffer that is not uptodate. With @wait
 * false the function returns 0 right after issuing the reads. With @wait
 * true it waits for all buffers and returns -EIO if any of them did not
 * become uptodate.
 *
 * On failure every buffer obtained so far is released and its slot in
 * @bhs is set to NULL.
 */
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
                     bool wait, struct buffer_head **bhs)
{
        int idx, ret;

        /* Pass 1: look up all buffer heads. */
        for (idx = 0; idx < bh_count; idx++) {
                bhs[idx] = ext4_getblk(NULL, inode, block + idx,
                                       0 /* map_flags */);
                if (!IS_ERR(bhs[idx]))
                        continue;

                ret = PTR_ERR(bhs[idx]);
                /* Only the entries before the failing one need releasing. */
                bh_count = idx;
                goto out_brelse;
        }

        /* Pass 2: issue reads; NULL entries are valid because of holes. */
        for (idx = 0; idx < bh_count; idx++) {
                if (!bhs[idx])
                        continue;
                if (ext4_buffer_uptodate(bhs[idx]))
                        continue;
                ext4_read_bh_lock(bhs[idx], REQ_META | REQ_PRIO, false);
        }

        if (!wait)
                return 0;

        /* Pass 3: wait for everything before inspecting any result. */
        for (idx = 0; idx < bh_count; idx++) {
                if (bhs[idx])
                        wait_on_buffer(bhs[idx]);
        }

        /* Pass 4: a buffer that is still not uptodate means the read failed. */
        for (idx = 0; idx < bh_count; idx++) {
                if (!bhs[idx] || buffer_uptodate(bhs[idx]))
                        continue;

                ret = -EIO;
                goto out_brelse;
        }
        return 0;

out_brelse:
        for (idx = 0; idx < bh_count; idx++) {
                brelse(bhs[idx]);
                bhs[idx] = NULL;
        }
        return ret;
}

/*
 * Walk the circular buffer list starting at @head and call @fn on every
 * buffer that overlaps the byte range [@from, @to) of the page.
 *
 * For buffers outside the range, *@partial (if @partial is non-NULL) is
 * set to 1 when such a buffer is not uptodate. The walk stops at the
 * first non-zero value returned by @fn, which is also the return value;
 * 0 is returned when every call succeeded.
 */
int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
                           struct buffer_head *head,
                           unsigned from,
                           unsigned to,
                           int *partial,
                           int (*fn)(handle_t *handle, struct inode *inode,
                                     struct buffer_head *bh))
{
        struct buffer_head *bh = head;
        unsigned blocksize = head->b_size;
        unsigned block_start = 0;
        int ret = 0;

        do {
                /* Fetch the successor before @fn gets to touch the buffer. */
                struct buffer_head *next = bh->b_this_page;
                unsigned block_end = block_start + blocksize;

                if (block_end > from && block_start < to)
                        ret = (*fn)(handle, inode, bh);
                else if (partial && !buffer_uptodate(bh))
                        *partial = 1;

                block_start = block_end;
                bh = next;
        } while (ret == 0 && (bh != head || !block_start));

        return ret;
}

/*
 * Dirty a journalled data buffer. The folio holding the buffer is marked
 * dirty as well, so that writeback knows this page (and inode) carries
 * dirty data; ext4_writepages() then commits the appropriate transaction
 * to make the data stable.
 *
 * Returns the result of ext4_handle_dirty_metadata().
 */
static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
        int err;

        folio_mark_dirty(bh->b_folio);
        err = ext4_handle_dirty_metadata(handle, NULL, bh);

        return err;
}

/*
 * Buffer walker callback: obtain journal write access to @bh.
 *
 * Unmapped or freed buffers are skipped (returns 0). If the buffer was
 * dirty on entry, its dirty bit is cleared before asking for write access
 * and, on success, the buffer is re-dirtied as journalled data.
 *
 * Returns 0 on success or the error of the failing journal call.
 */
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
                                struct buffer_head *bh)
{
        int was_dirty = buffer_dirty(bh);
        int err;

        if (!buffer_mapped(bh) || buffer_freed(bh))
                return 0;

        /*
         * __block_write_begin() may have dirtied the buffer, and
         * jbd2_journal_get_write_access() could otherwise complain about
         * fs integrity issues. Dropping the bit is harmless: it is cleared
         * before the page lock is released, so writeback can never write
         * the buffer in between.
         */
        if (was_dirty)
                clear_buffer_dirty(bh);

        BUFFER_TRACE(bh, "get write access");
        err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
                                            EXT4_JTR_NONE);
        if (err || !was_dirty)
                return err;

        return ext4_dirty_journalled_data(handle, bh);
}

#ifdef CONFIG_FS_ENCRYPTION
/*
 * Prepare the buffers of a locked @folio for a write of @len bytes at file
 * position @pos, mapping unmapped buffers through @get_block (called with
 * create == 1).
 *
 * Buffers that are only partially covered by the write and are not
 * uptodate, delayed or unwritten are read in first; when the inode uses
 * fs-layer encryption the blocks that were read are decrypted afterwards.
 *
 * Returns 0 on success, the error from @get_block, -EIO if a read did not
 * leave its buffer uptodate, or the error from decryption. On a mapping or
 * read error, folio_zero_new_buffers() is called for the write range.
 */
static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
                                  get_block_t *get_block)
{
        /* Byte range [from, to) of the write, relative to the page start. */
        unsigned from = pos & (PAGE_SIZE - 1);
        unsigned to = from + len;
        struct inode *inode = folio->mapping->host;
        unsigned block_start, block_end;
        sector_t block;
        int err = 0;
        unsigned blocksize = inode->i_sb->s_blocksize;
        unsigned bbits;
        /*
         * Buffers with a read in flight. Only a block straddling 'from' or
         * 'to' can satisfy the partial-coverage test below, so two slots
         * are enough.
         */
        struct buffer_head *bh, *head, *wait[2];
        int nr_wait = 0;
        int i;

        BUG_ON(!folio_test_locked(folio));
        BUG_ON(from > PAGE_SIZE);
        BUG_ON(to > PAGE_SIZE);
        BUG_ON(from > to);

        head = folio_buffers(folio);
        if (!head)
                head = create_empty_buffers(folio, blocksize, 0);
        bbits = ilog2(blocksize);
        /* Logical block number of the first buffer in the folio. */
        block = (sector_t)folio->index << (PAGE_SHIFT - bbits);

        /* One pass over the circular buffer list, tracking byte offsets. */
        for (bh = head, block_start = 0; bh != head || !block_start;
            block++, block_start = block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                /* Buffer entirely outside the write range. */
                if (block_end <= from || block_start >= to) {
                        if (folio_test_uptodate(folio)) {
                                set_buffer_uptodate(bh);
                        }
                        continue;
                }
                if (buffer_new(bh))
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                break;
                        if (buffer_new(bh)) {
                                /*
                                 * Newly mapped block. With an uptodate
                                 * folio the buffer is simply marked
                                 * uptodate and dirty.
                                 */
                                if (folio_test_uptodate(folio)) {
                                        clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
                                        mark_buffer_dirty(bh);
                                        continue;
                                }
                                /*
                                 * Otherwise zero the parts of the block
                                 * that the write will not cover:
                                 * [to, block_end) and [block_start, from).
                                 */
                                if (block_end > to || block_start < from)
                                        folio_zero_segments(folio, to,
                                                            block_end,
                                                            block_start, from);
                                continue;
                        }
                }
                if (folio_test_uptodate(folio)) {
                        set_buffer_uptodate(bh);
                        continue;
                }
                /*
                 * Partially overwritten block whose current contents are
                 * needed: start a read and remember the buffer. The wait
                 * happens after the loop.
                 */
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                    (block_start < from || block_end > to)) {
                        ext4_read_bh_lock(bh, 0, false);
                        wait[nr_wait++] = bh;
                }
        }
        /*
         * If we issued read requests, let them complete.
         */
        for (i = 0; i < nr_wait; i++) {
                wait_on_buffer(wait[i]);
                if (!buffer_uptodate(wait[i]))
                        err = -EIO;
        }
        if (unlikely(err)) {
                folio_zero_new_buffers(folio, from, to);
        } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
                /* Decrypt exactly the blocks that were read above. */
                for (i = 0; i < nr_wait; i++) {
                        int err2;

                        err2 = fscrypt_decrypt_pagecache_blocks(folio,
                                                blocksize, bh_offset(wait[i]));
                        if (err2) {
                                /* Contents are unusable; drop uptodate. */
                                clear_buffer_uptodate(wait[i]);
                                err = err2;
                        }
                }
        }

        return err;
}
#endif

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the ext4_write_end().  So doing the jbd2_journal_start at the start of
 * ext4_write_begin() is the right place.
 *
 * Prepares a write of @len bytes at @pos in @mapping. On success 0 is
 * returned and, on the non-inline path, *@pagep points to the locked page
 * of the folio with a journal handle still running. On failure a negative
 * error is returned, the handle is stopped and the folio reference is
 * dropped.
 */
static int ext4_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len,
                            struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;
        int ret, needed_blocks;
        handle_t *handle;
        int retries = 0;
        struct folio *folio;
        pgoff_t index;
        unsigned from, to;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        trace_ext4_write_begin(inode, pos, len);
        /*
         * Reserve one block more for addition to orphan list in case
         * we allocate blocks but write fails for some reason
         */
        needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
        index = pos >> PAGE_SHIFT;
        /* Byte range [from, to) of the write within the page. */
        from = pos & (PAGE_SIZE - 1);
        to = from + len;

        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
                                                    pagep);
                if (ret < 0)
                        return ret;
                /*
                 * A return of 1 ends the preparation here with success.
                 * NOTE(review): presumably the write was set up in the
                 * inline data area; confirm against
                 * ext4_try_to_write_inline_data().
                 */
                if (ret == 1)
                        return 0;
        }

        /*
         * __filemap_get_folio() can take a long time if the
         * system is thrashing due to memory pressure, or if the folio
         * is being written back.  So grab it first before we start
         * the transaction handle.  This also allows us to allocate
         * the folio (if needed) without using GFP_NOFS.
         */
retry_grab:
        folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
                                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);
        /*
         * The same as page allocation, we prealloc buffer heads before
         * starting the handle.
         */
        if (!folio_buffers(folio))
                create_empty_buffers(folio, inode->i_sb->s_blocksize, 0);

        /* Drop the folio lock (the reference is kept) while starting the handle. */
        folio_unlock(folio);

retry_journal:
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
        if (IS_ERR(handle)) {
                folio_put(folio);
                return PTR_ERR(handle);
        }

        folio_lock(folio);
        if (folio->mapping != mapping) {
                /* The folio got truncated from under us */
                folio_unlock(folio);
                folio_put(folio);
                ext4_journal_stop(handle);
                goto retry_grab;
        }
        /* In case writeback began while the folio was unlocked */
        folio_wait_stable(folio);

        /*
         * With CONFIG_FS_ENCRYPTION the ext4-private write_begin helper is
         * used; otherwise the generic __block_write_begin(). In both cases
         * dioread_nolock inodes map blocks via ext4_get_block_unwritten.
         */
#ifdef CONFIG_FS_ENCRYPTION
        if (ext4_should_dioread_nolock(inode))
                ret = ext4_block_write_begin(folio, pos, len,
                                             ext4_get_block_unwritten);
        else
                ret = ext4_block_write_begin(folio, pos, len, ext4_get_block);
#else
        if (ext4_should_dioread_nolock(inode))
                ret = __block_write_begin(&folio->page, pos, len,
                                          ext4_get_block_unwritten);
        else
                ret = __block_write_begin(&folio->page, pos, len, ext4_get_block);
#endif
        /* data=journal: get journal write access to every buffer in range. */
        if (!ret && ext4_should_journal_data(inode)) {
                ret = ext4_walk_page_buffers(handle, inode,
                                             folio_buffers(folio), from, to,
                                             NULL, do_journal_get_write_access);
        }

        if (ret) {
                /* Would this write have extended the file? */
                bool extended = (pos + len > inode->i_size) &&
                                !ext4_verity_in_progress(inode);

                folio_unlock(folio);
                /*
                 * __block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 * i_size_read because we hold i_rwsem.
                 *
                 * Add inode to orphan list in case we crash before
                 * truncate finishes
                 */
                if (extended && ext4_can_truncate(inode))
                        ext4_orphan_add(handle, inode);

                ext4_journal_stop(handle);
                if (extended) {
                        ext4_truncate_failed_write(inode);
                        /*
                         * If truncate failed early the inode might
                         * still be on the orphan list; we need to
                         * make sure the inode is removed from the
                         * orphan list in that case.
                         */
                        if (inode->i_nlink)
                                ext4_orphan_del(NULL, inode);
                }

                /*
                 * On ENOSPC retry with a new handle if the allocator says
                 * so; the folio reference is still held at this point.
                 */
                if (ret == -ENOSPC &&
                    ext4_should_retry_alloc(inode->i_sb, &retries))
                        goto retry_journal;
                folio_put(folio);
                return ret;
        }
        *pagep = &folio->page;
        return ret;
}

/*
 * Buffer walker callback for write_end() in data=journal mode.
 *
 * A mapped, non-freed buffer is marked uptodate and dirtied as journalled
 * data, after which its meta and prio bits are cleared. Any other buffer
 * is left alone. Returns the result of the dirtying, or 0 when the buffer
 * was skipped.
 */
static int write_end_fn(handle_t *handle, struct inode *inode,
                        struct buffer_head *bh)
{
        int err = 0;

        if (buffer_mapped(bh) && !buffer_freed(bh)) {
                set_buffer_uptodate(bh);
                err = ext4_dirty_journalled_data(handle, bh);
                clear_buffer_meta(bh);
                clear_buffer_prio(bh);
        }

        return err;
}

/*
 * We need to pick up the new inode size which generic_commit_write gave us
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->i_private_list.  metadata
 * buffers are managed internally.
 *
 * Completes a write of @len bytes at @pos of which @copied bytes were
 * actually copied into @page. Unlocks and releases the folio, updates
 * i_size if needed, stops the current journal handle and trims blocks
 * that ended up beyond i_size.
 *
 * Returns the number of bytes accepted by block_write_end() on success,
 * or a negative error from marking the inode dirty or stopping the handle.
 * Inline-data inodes are handed to ext4_write_inline_data_end() instead.
 */
static int ext4_write_end(struct file *file,
                          struct address_space *mapping,
                          loff_t pos, unsigned len, unsigned copied,
                          struct page *page, void *fsdata)
{
        struct folio *folio = page_folio(page);
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        /* Size before this write, needed for pagecache_isize_extended(). */
        loff_t old_size = inode->i_size;
        int ret = 0, ret2;
        int i_size_changed = 0;
        bool verity = ext4_verity_in_progress(inode);

        trace_ext4_write_end(inode, pos, len, copied);

        if (ext4_has_inline_data(inode) &&
            ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
                return ext4_write_inline_data_end(inode, pos, len, copied,
                                                  folio);

        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
        /*
         * it's important to update i_size while still holding folio lock:
         * page writeout could otherwise come in and zero beyond i_size.
         *
         * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
         * blocks are being written past EOF, so skip the i_size update.
         */
        if (!verity)
                i_size_changed = ext4_update_inode_size(inode, pos + copied);
        folio_unlock(folio);
        folio_put(folio);

        /* The write started beyond the old end of file. */
        if (old_size < pos && !verity)
                pagecache_isize_extended(inode, old_size, pos);
        /*
         * Don't mark the inode dirty under folio lock. First, it unnecessarily
         * makes the holding time of folio lock longer. Second, it forces lock
         * ordering of folio lock and transaction start for journaling
         * filesystems.
         */
        if (i_size_changed)
                ret = ext4_mark_inode_dirty(handle, inode);

        if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
                 */
                ext4_orphan_add(handle, inode);

        /* Keep the first error: ret2 is only reported if ret is still 0. */
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;

        if (pos + len > inode->i_size && !verity) {
                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        return ret ? ret : copied;
}

/*
 * Private variant of folio_zero_new_buffers() for data=journalled mode:
 * buffers are not marked dirty directly but go through write_end_fn(),
 * which dirties them via ext4_dirty_journalled_data().
 *
 * Every buffer of @folio that is flagged new and overlaps the byte range
 * [@from, @to) has its new flag cleared. If the folio is not uptodate,
 * the overlapping part is zeroed first and the buffer is passed to
 * write_end_fn(), whose return value is ignored.
 */
static void ext4_journalled_zero_new_buffers(handle_t *handle,
                                            struct inode *inode,
                                            struct folio *folio,
                                            unsigned from, unsigned to)
{
        struct buffer_head *head = folio_buffers(folio);
        struct buffer_head *bh = head;
        unsigned int blk_from = 0;

        do {
                unsigned int blk_to = blk_from + bh->b_size;

                if (buffer_new(bh) && blk_to > from && blk_from < to) {
                        if (!folio_test_uptodate(folio)) {
                                /* Clamp the zeroed span to this buffer. */
                                unsigned zero_from = max(from, blk_from);
                                unsigned zero_len = min(to, blk_to) - zero_from;

                                folio_zero_range(folio, zero_from, zero_len);
                                write_end_fn(handle, inode, bh);
                        }
                        clear_buffer_new(bh);
                }

                blk_from = blk_to;
                bh = bh->b_this_page;
        } while (bh != head);
}

/*
 * ext4_journalled_write_end - complete a buffered write through the journal
 *
 * @file:    not used by this function
 * @mapping: address space being written; mapping->host is the inode
 * @pos:     file offset at which the write started
 * @len:     number of bytes the caller intended to write
 * @copied:  number of bytes that were actually copied into the page
 * @page:    page that received the data; handled via its folio
 * @fsdata:  not used by this function
 *
 * Runs under the handle returned by ext4_journal_current_handle() and stops
 * that handle before returning.  Inodes with inline data are handed off to
 * ext4_write_inline_data_end() and its return value is passed through.
 *
 * Otherwise returns the first error encountered, or else the number of
 * bytes accepted (which is forced to 0 when a short copy hit a folio that
 * is not uptodate).
 */
static int ext4_journalled_write_end(struct file *file,
                                     struct address_space *mapping,
                                     loff_t pos, unsigned len, unsigned copied,
                                     struct page *page, void *fsdata)
{
        struct folio *folio = page_folio(page);
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        loff_t old_size = inode->i_size;
        int ret = 0, ret2;
        int partial = 0;
        unsigned from, to;
        int size_changed = 0;
        bool verity = ext4_verity_in_progress(inode);

        trace_ext4_journalled_write_end(inode, pos, len, copied);
        /* Byte range of the intended write, relative to the start of the page. */
        from = pos & (PAGE_SIZE - 1);
        to = from + len;

        BUG_ON(!ext4_handle_valid(handle));

        if (ext4_has_inline_data(inode))
                return ext4_write_inline_data_end(inode, pos, len, copied,
                                                  folio);

        if (unlikely(copied < len) && !folio_test_uptodate(folio)) {
                /*
                 * Short copy into a folio that is not uptodate: accept
                 * nothing and zero the new buffers in the whole range.
                 */
                copied = 0;
                ext4_journalled_zero_new_buffers(handle, inode, folio,
                                                 from, to);
        } else {
                /* Short copy: zero new buffers past the copied bytes only. */
                if (unlikely(copied < len))
                        ext4_journalled_zero_new_buffers(handle, inode, folio,
                                                         from + copied, to);
                /* Run write_end_fn over the buffers that received data. */
                ret = ext4_walk_page_buffers(handle, inode,
                                             folio_buffers(folio),
                                             from, from + copied, &partial,
                                             write_end_fn);
                if (!partial)
                        folio_mark_uptodate(folio);
        }
        /* i_size is left alone while a verity build is in progress. */
        if (!verity)
                size_changed = ext4_update_inode_size(inode, pos + copied);
        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
        folio_unlock(folio);
        folio_put(folio);

        /* The write started beyond the old EOF. */
        if (old_size < pos && !verity)
                pagecache_isize_extended(inode, old_size, pos);

        if (size_changed) {
                ret2 = ext4_mark_inode_dirty(handle, inode);
                if (!ret)
                        ret = ret2;
        }

        if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                /* if we have allocated more blocks and copied
                 * less. We will have blocks allocated outside
                 * inode->i_size. So truncate them
                 */
                ext4_orphan_add(handle, inode);

        /* The handle is stopped before the truncate below is attempted. */
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
        if (pos + len > inode->i_size && !verity) {
                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        return ret ? ret : copied;
}

/*
 * Reserve space for a single cluster.
 *
 * Reserves quota for the blocks of one cluster, claims one free cluster
 * and increments the inode's reserved data block count.
 *
 * Returns 0 on success, the dquot_reserve_block() error if the quota
 * reservation fails, or -ENOSPC if no free cluster could be claimed (the
 * quota reservation is released again in that case).
 */
static int ext4_da_reserve_space(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int err;

        /*
         * Only data is reserved here.  Metadata quota is charged at
         * writeout time, which avoids over-estimating metadata at the
         * cost of possibly going over by a small amount in the end.
         */
        err = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
        if (err)
                return err;

        spin_lock(&ei->i_block_reservation_lock);
        err = ext4_claim_free_clusters(sbi, 1, 0);
        if (!err) {
                ei->i_reserved_data_blocks++;
                trace_ext4_da_reserve_space(inode);
        }
        spin_unlock(&ei->i_block_reservation_lock);

        if (err) {
                /* Hand back the quota reserved above. */
                dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
                return -ENOSPC;
        }

        return 0;
}

/*
 * Release @to_free reserved data blocks of @inode.
 *
 * Lowers the inode's reserved data block count and the filesystem-wide
 * dirty clusters counter under i_block_reservation_lock, then releases the
 * matching quota reservation.  A request larger than what is reserved is
 * warned about and clamped to the reserved amount.
 */
void ext4_da_release_space(struct inode *inode, int to_free)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        /* Nothing to release. */
        if (to_free == 0)
                return;

        spin_lock(&ei->i_block_reservation_lock);
        trace_ext4_da_release_space(inode, to_free);

        if (unlikely(to_free > ei->i_reserved_data_blocks)) {
                /*
                 * Fewer blocks are reserved than we were asked to free,
                 * so the counter got out of sync somewhere.  This is
                 * called from invalidate page, where it is harmless to
                 * free only what is actually reserved.
                 */
                ext4_warning(inode->i_sb, "ext4_da_release_space: "
                             "ino %lu, to_free %d with only %d reserved "
                             "data blocks", inode->i_ino, to_free,
                             ei->i_reserved_data_blocks);
                WARN_ON(1);
                to_free = ei->i_reserved_data_blocks;
        }
        ei->i_reserved_data_blocks -= to_free;

        /* Keep the fs-wide dirty clusters counter in step. */
        percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
        spin_unlock(&ei->i_block_reservation_lock);

        dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}

/*
 * Delayed allocation stuff
 */

/*
 * State carried through one writepages pass: the input parameters plus the
 * window of pages being examined and the extent of blocks collected for
 * mapping.
 */
struct mpage_da_data {
        /* These are input fields for ext4_do_writepages() */
        struct inode *inode;
        struct writeback_control *wbc;
        unsigned int can_map:1;        /* Can writepages call map blocks? */

        /* These are internal state of ext4_do_writepages() */
        pgoff_t first_page;        /* The first page to write */
        pgoff_t next_page;        /* Current page to examine */
        pgoff_t last_page;        /* Last page to examine */
        /*
         * Extent to map - this can be after first_page because that can be
         * fully mapped. We somewhat abuse m_flags to store whether the extent
         * is delalloc or unwritten.
         */
        struct ext4_map_blocks map;
        struct ext4_io_submit io_submit;        /* IO submission data */
        /*
         * When clear, mpage_add_bh_to_extent() refuses to start a new
         * extent ("we cannot map unless handle is started").
         */
        unsigned int do_map:1;
        /*
         * Set by mpage_process_page_bufs() once the scan reaches the block
         * count derived from i_size; cleared again by
         * mpage_release_unused_pages().
         */
        unsigned int scanned_until_end:1;
        /* NOTE(review): not referenced in this part of the file - confirm use. */
        unsigned int journalled_more_data:1;
};

/*
 * Unlock the folios lying entirely within [mpd->first_page,
 * mpd->next_page - 1].
 *
 * With @invalidate set, the corresponding block range is first removed
 * from the extent status tree, and each folio is additionally invalidated
 * and has its uptodate flag cleared before being unlocked.
 */
static void mpage_release_unused_pages(struct mpage_da_data *mpd,
                                       bool invalidate)
{
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
        struct folio_batch fbatch;
        pgoff_t index, end;
        unsigned nr, i;

        /* Needed when next_page == 0: "next_page - 1" below would wrap. */
        if (mpd->first_page >= mpd->next_page)
                return;

        mpd->scanned_until_end = 0;
        index = mpd->first_page;
        end = mpd->next_page - 1;

        if (invalidate) {
                int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
                ext4_lblk_t first_lblk = index << bpp_bits;
                /*
                 * NOTE(review): this is the first block of page @end, so
                 * the removed range stops there - confirm that is intended
                 * when the block size is smaller than the page size.
                 */
                ext4_lblk_t last_lblk = end << bpp_bits;

                /*
                 * Take i_data_sem so we do not race with the extent status
                 * tree scans done by ext4_insert_delayed_block().
                 */
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_es_remove_extent(inode, first_lblk,
                                      last_lblk - first_lblk + 1);
                up_write(&EXT4_I(inode)->i_data_sem);
        }

        folio_batch_init(&fbatch);
        while (index <= end) {
                nr = filemap_get_folios(mapping, &index, end, &fbatch);
                if (!nr)
                        break;

                for (i = 0; i < nr; i++) {
                        struct folio *folio = fbatch.folios[i];

                        /* Leave alone folios that stick out of the range. */
                        if (folio->index < mpd->first_page ||
                            folio_next_index(folio) - 1 > end)
                                continue;

                        BUG_ON(!folio_test_locked(folio));
                        BUG_ON(folio_test_writeback(folio));

                        if (invalidate) {
                                if (folio_mapped(folio))
                                        folio_clear_dirty_for_io(folio);
                                block_invalidate_folio(folio, 0,
                                                       folio_size(folio));
                                folio_clear_uptodate(folio);
                        }
                        folio_unlock(folio);
                }
                folio_batch_release(&fbatch);
        }
}

/*
 * Log, at KERN_CRIT level, the filesystem's free and dirty block counters
 * together with the number of data blocks @inode currently has reserved.
 */
static void ext4_print_free_blocks(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
        long long nr_blocks;

        ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
                 EXT4_C2B(sbi, ext4_count_free_clusters(sb)));

        ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");

        /* Both counters are kept in clusters; report them in blocks. */
        nr_blocks = (long long) EXT4_C2B(sbi,
                        percpu_counter_sum(&sbi->s_freeclusters_counter));
        ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", nr_blocks);

        nr_blocks = (long long) EXT4_C2B(sbi,
                        percpu_counter_sum(&sbi->s_dirtyclusters_counter));
        ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", nr_blocks);

        ext4_msg(sb, KERN_CRIT, "Block reservation details");
        ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
                 ei->i_reserved_data_blocks);
}

/*
 * ext4_insert_delayed_block - add a delayed block to the extents status
 *                             tree, reserving space for it or recording
 *                             that its cluster is already allocated
 *
 * @inode - file containing the newly added block
 * @lblk - logical block to be added
 *
 * Returns 0 on success, negative error code on failure.
 */
static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        bool allocated = false;
        int ret;

        /*
         * On a bigalloc file system, a cluster that already holds a
         * delayed, written, or unwritten extent has been accounted for
         * and needs no new reservation.  If the cluster is shared with a
         * written or unwritten extent, a pending reservation must be
         * made for it unless one exists already.  Because written and
         * unwritten extents can be purged from the extents status tree
         * under memory pressure, the extent tree itself has to be
         * consulted when the status tree search finds nothing.
         */
        if (sbi->s_cluster_ratio == 1) {
                ret = ext4_da_reserve_space(inode);
                if (ret)        /* ENOSPC */
                        return ret;
        } else if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
                /* bigalloc, and no delayed-only block in this cluster */
                if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) {
                        allocated = true;
                } else {
                        ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
                        if (ret < 0)
                                return ret;
                        if (ret > 0) {
                                allocated = true;
                        } else {
                                ret = ext4_da_reserve_space(inode);
                                if (ret)        /* ENOSPC */
                                        return ret;
                        }
                }
        }

        ext4_es_insert_delayed_block(inode, lblk, allocated);
        return 0;
}

/*
 * This function grabs code from the very beginning of
 * ext4_map_blocks, but assumes that the caller is from delayed write
 * time. This function looks up the requested blocks and sets the
 * buffer delay bit under the protection of i_data_sem.
 *
 * @inode:  file being written
 * @iblock: logical block used for the extent status tree lookup
 * @map:    in: m_lblk/m_len of the request; out: m_pblk, m_len and m_flags
 *          when an existing mapping is found
 * @bh:     buffer head that is marked mapped/new/delay when the block ends
 *          up (or already is) delayed
 *
 * Returns the number of blocks mapped (> 0) when an existing written or
 * unwritten mapping was found, 0 when @bh was set up as a delayed buffer,
 * and a negative error code on failure.
 */
static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
                              struct ext4_map_blocks *map,
                              struct buffer_head *bh)
{
        struct extent_status es;
        int retval;
        /* Placeholder block number stored in delayed buffer heads. */
        sector_t invalid_block = ~((sector_t) 0xffff);
#ifdef ES_AGGRESSIVE_TEST
        struct ext4_map_blocks orig_map;

        memcpy(&orig_map, map, sizeof(*map));
#endif

        /* The default placeholder lies below the fs block count: use ~0. */
        if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
                invalid_block = ~0;

        map->m_flags = 0;
        ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
                  (unsigned long) map->m_lblk);

        /* Lookup extent status tree firstly */
        if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
                if (ext4_es_is_hole(&es))
                        goto add_delayed;

                /*
                 * Delayed extent could be allocated by fallocate.
                 * So we need to check it.
                 */
                if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
                        /* Already delayed: only the buffer needs setting up. */
                        map_bh(bh, inode->i_sb, invalid_block);
                        set_buffer_new(bh);
                        set_buffer_delay(bh);
                        return 0;
                }

                /* Translate the cached extent into a mapping for @iblock. */
                map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
                retval = es.es_len - (iblock - es.es_lblk);
                if (retval > map->m_len)
                        retval = map->m_len;
                map->m_len = retval;
                if (ext4_es_is_written(&es))
                        map->m_flags |= EXT4_MAP_MAPPED;
                else if (ext4_es_is_unwritten(&es))
                        map->m_flags |= EXT4_MAP_UNWRITTEN;
                else
                        BUG();

#ifdef ES_AGGRESSIVE_TEST
                ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
#endif
                return retval;
        }

        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_has_inline_data(inode))
                retval = 0;
        else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                retval = ext4_ext_map_blocks(NULL, inode, map, 0);
        else
                retval = ext4_ind_map_blocks(NULL, inode, map, 0);
        if (retval < 0) {
                up_read(&EXT4_I(inode)->i_data_sem);
                return retval;
        }
        if (retval > 0) {
                unsigned int status;

                if (unlikely(retval != map->m_len)) {
                        ext4_warning(inode->i_sb,
                                     "ES len assertion failed for inode "
                                     "%lu: retval %d != map->m_len %d",
                                     inode->i_ino, retval, map->m_len);
                        WARN_ON(1);
                }

                /* Cache the mapping just found in the extent status tree. */
                status = map->m_flags & EXT4_MAP_UNWRITTEN ?
                                EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
                ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                                      map->m_pblk, status);
                up_read(&EXT4_I(inode)->i_data_sem);
                return retval;
        }
        up_read(&EXT4_I(inode)->i_data_sem);

        /*
         * Nothing is mapped here (retval == 0, which includes the inline
         * data case above): record the block as delayed, this time holding
         * i_data_sem for writing.
         */
add_delayed:
        down_write(&EXT4_I(inode)->i_data_sem);
        retval = ext4_insert_delayed_block(inode, map->m_lblk);
        up_write(&EXT4_I(inode)->i_data_sem);
        if (retval)
                return retval;

        map_bh(bh, inode->i_sb, invalid_block);
        set_buffer_new(bh);
        set_buffer_delay(bh);
        return retval;
}

/*
 * Special get_block_t callback used by ext4_da_write_begin().  It either
 * returns an already mapped block or reserves space for a single block.
 *
 * A delayed buffer_head ends up with BH_Mapped, BH_New and BH_Delay set,
 * an invalid block number in b_blocknr, and b_bdev initialized properly.
 *
 * An unwritten buffer_head ends up with BH_Mapped, BH_New and BH_Unwritten
 * set, b_blocknr holding the physical block of the unwritten extent, and
 * b_bdev initialized properly.
 *
 * Returns 0 on success or the negative error from ext4_da_map_blocks().
 */
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int create)
{
        struct ext4_map_blocks map;
        int nr_mapped;

        BUG_ON(create == 0);
        BUG_ON(bh->b_size != inode->i_sb->s_blocksize);

        /* Exactly one block is looked up per call. */
        map.m_len = 1;
        map.m_lblk = iblock;

        /*
         * First find out whether the block is allocated already.
         * Preallocated blocks are unmapped but should be treated the
         * same as allocated blocks.
         */
        nr_mapped = ext4_da_map_blocks(inode, iblock, &map, bh);
        if (nr_mapped <= 0)
                return nr_mapped;

        map_bh(bh, inode->i_sb, map.m_pblk);
        ext4_update_bh_state(bh, map.m_flags);

        if (!buffer_unwritten(bh))
                return 0;

        /*
         * A delayed write to an unwritten bh is marked new and mapped:
         * mapped so that get_block is not repeated when the same offset
         * is written again, new so that a partial write gets properly
         * zeroed out.
         */
        set_buffer_new(bh);
        set_buffer_mapped(bh);
        return 0;
}

static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
{
        mpd->first_page += folio_nr_pages(folio);
        folio_unlock(folio);
}

/*
 * Clear the dirty state of @folio and pass it to ext4_bio_write_folio().
 * @folio must be the one at mpd->first_page.  On success the remaining
 * wbc->nr_to_write budget is decremented.
 *
 * Returns 0 on success or the error from ext4_bio_write_folio().
 */
static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
{
        struct inode *inode = mpd->inode;
        loff_t isize;
        size_t nbytes;
        int ret;

        BUG_ON(folio->index != mpd->first_page);
        folio_clear_dirty_for_io(folio);
        /*
         * Be very careful with ordering here!  The writeback path is not
         * protected against i_size changes, and the page may be mapped
         * writeably into page tables, so an application can grow i_size
         * and write data through mmap while writeback is running.
         * folio_clear_dirty_for_io() write-protects the page in page
         * tables, and it cannot be written again until the folio lock is
         * released.  Hence i_size may only be sampled after that call,
         * for ext4_bio_write_folio() to zero out the tail of the written
         * page.  The barrier provided by folio_test_clear_dirty() inside
         * folio_clear_dirty_for_io() guarantees that i_size is really
         * sampled only after the page tables have been updated.
         */
        isize = i_size_read(inode);
        nbytes = folio_size(folio);
        /* Folio straddles EOF: write only the part below i_size. */
        if (folio_pos(folio) + nbytes > isize &&
            !ext4_verity_in_progress(inode))
                nbytes = isize & (nbytes - 1);

        ret = ext4_bio_write_folio(&mpd->io_submit, folio, nbytes);
        if (ret)
                return ret;

        mpd->wbc->nr_to_write--;
        return 0;
}

/*
 * Buffer state bits that mpage_add_bh_to_extent() copies into map->m_flags
 * and compares when deciding whether a buffer can be merged into the
 * extent being collected.
 */
#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))

/*
 * mballoc gives us at most this number of blocks...
 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
 * The rest of mballoc seems to handle chunks up to full group size.
 */
#define MAX_WRITEPAGES_EXTENT_LEN 2048

/*
 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
 *
 * @mpd - extent of blocks
 * @lblk - logical number of the block in the file
 * @bh - buffer head we want to add to the extent
 *
 * Collects contiguous blocks that are in the same state.  A buffer that
 * needs no mapping for writeback yields 'true' as long as no extent has
 * been started yet (the caller can write it right away) and 'false'
 * otherwise.  For a buffer that does need mapping, the function returns
 * true if the block was added to the extent and false if it could not be.
 */
static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
                                   struct buffer_head *bh)
{
        struct ext4_map_blocks *map = &mpd->map;
        bool needs_mapping;

        /* Only dirty, mapped, delayed-or-unwritten buffers need mapping. */
        needs_mapping = buffer_dirty(bh) && buffer_mapped(bh) &&
                        (buffer_delay(bh) || buffer_unwritten(bh));
        if (!needs_mapping) {
                /* No extent to map so far => the buffer is written right away */
                return map->m_len == 0;
        }

        if (map->m_len == 0) {
                /* First block of the extent; we cannot map without a handle. */
                if (!mpd->do_map)
                        return false;
                map->m_lblk = lblk;
                map->m_len = 1;
                map->m_flags = bh->b_state & BH_FLAGS;
                return true;
        }

        /* Don't go larger than mballoc is willing to allocate */
        if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
                return false;

        /* Merging requires a contiguous block in the same state. */
        if (lblk != map->m_lblk + map->m_len ||
            (bh->b_state & BH_FLAGS) != map->m_flags)
                return false;

        map->m_len++;
        return true;
}

/*
 * mpage_process_page_bufs - submit page buffers for IO or add them to extent
 *
 * @mpd - extent of blocks for mapping
 * @head - the first buffer in the page
 * @bh - buffer we should start processing from
 * @lblk - logical number of the block in the file corresponding to @bh
 *
 * Walk through page buffers from @bh upto @head (exclusive) and either submit
 * the page for IO if all buffers in this page were mapped and there's no
 * accumulated extent of buffers to map or add buffers in the page to the
 * extent of buffers to map. The function returns 1 if the caller can continue
 * by processing the next page, 0 if it should stop adding buffers to the
 * extent to map because we cannot extend it anymore. It can also return value
 * < 0 in case of error during IO submission.
 */
static int mpage_process_page_bufs(struct mpage_da_data *mpd,
                                   struct buffer_head *head,
                                   struct buffer_head *bh,
                                   ext4_lblk_t lblk)
{
        struct inode *inode = mpd->inode;
        int err;
        /* Number of blocks needed to cover i_size, rounded up. */
        ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
                                                        >> inode->i_blkbits;

        /* While verity is in progress the walk is not limited by i_size. */
        if (ext4_verity_in_progress(inode))
                blocks = EXT_MAX_BLOCKS;

        do {
                BUG_ON(buffer_locked(bh));

                if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
                        /* Found extent to map? */
                        if (mpd->map.m_len)
                                return 0;
                        /* Buffer needs mapping and handle is not started? */
                        if (!mpd->do_map)
                                return 0;
                        /* Everything mapped so far and we hit EOF */
                        break;
                }
                /* lblk is advanced only when the walk moves on to the next bh. */
        } while (lblk++, (bh = bh->b_this_page) != head);
        /* So far everything mapped? Submit the page for IO. */
        if (mpd->map.m_len == 0) {
                err = mpage_submit_folio(mpd, head->b_folio);
                if (err < 0)
                        return err;
                mpage_folio_done(mpd, head->b_folio);
        }
        /* The walk reached the block limit: note it and stop the caller. */
        if (lblk >= blocks) {
                mpd->scanned_until_end = 1;
                return 0;
        }
        return 1;
}

/*
 * mpage_process_folio - update folio buffers corresponding to changed extent
 *                         and may submit fully mapped page for IO
 * @mpd: description of extent to map, on return next extent to map
 * @folio: Contains these buffers.
 * @m_lblk: logical block mapping.
 * @m_pblk: corresponding physical mapping.
 * @map_bh: determines on return whether this page requires any further
 *                  mapping or not.
 *
 * Scan given folio buffers corresponding to changed extent and update buffer
 * state according to new extent state.
 * We map delalloc buffers to their physical location, clear unwritten bits.
 * If the given folio is not fully mapped, we update @mpd to the next extent in
 * the given folio that needs mapping & return @map_bh as true.
 *
 * @m_lblk and @m_pblk are updated on return to where the scan stopped.
 *
 * Return: 0 on success, or a negative error from mpage_process_page_bufs()
 * or from allocating a new io_end vector.
 */
static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
                              ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
                              bool *map_bh)
{
        struct buffer_head *head, *bh;
        ext4_io_end_t *io_end = mpd->io_submit.io_end;
        ext4_lblk_t lblk = *m_lblk;
        ext4_fsblk_t pblock = *m_pblk;
        int err = 0;
        int blkbits = mpd->inode->i_blkbits;
        /* Bytes of this folio covered by the extent, added to io_end_vec. */
        ssize_t io_end_size = 0;
        struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);

        bh = head = folio_buffers(folio);
        do {
                /*
                 * Buffer before the extent: skip it.  'continue' in a
                 * do/while jumps to the loop condition, so lblk and bh
                 * still advance.
                 */
                if (lblk < mpd->map.m_lblk)
                        continue;
                if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
                        /*
                         * Buffer after end of mapped extent.
                         * Find next buffer in the folio to map.
                         */
                        mpd->map.m_len = 0;
                        mpd->map.m_flags = 0;
                        io_end_vec->size += io_end_size;

                        err = mpage_process_page_bufs(mpd, head, bh, lblk);
                        /* Only errors matter here; positive means "go on". */
                        if (err > 0)
                                err = 0;
                        /*
                         * The next extent to map starts after this buffer:
                         * give it its own io_end vector.
                         */
                        if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
                                io_end_vec = ext4_alloc_io_end_vec(io_end);
                                if (IS_ERR(io_end_vec)) {
                                        err = PTR_ERR(io_end_vec);
                                        goto out;
                                }
                                io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
                        }
                        *map_bh = true;
                        goto out;
                }
                /* Delayed buffer: point it at the next physical block. */
                if (buffer_delay(bh)) {
                        clear_buffer_delay(bh);
                        bh->b_blocknr = pblock++;
                }
                clear_buffer_unwritten(bh);
                io_end_size += (1 << blkbits);
        } while (lblk++, (bh = bh->b_this_page) != head);

        /* Every buffer in the folio was handled. */
        io_end_vec->size += io_end_size;
        *map_bh = false;
out:
        *m_lblk = lblk;
        *m_pblk = pblock;
        return err;
}

/*
 * mpage_map_and_submit_buffers - update buffers corresponding to changed
 *                                extent and submit fully mapped pages for IO
 *
 * @mpd - description of extent to map, on return next extent to map
 *
 * Scan buffers corresponding to changed extent (we expect corresponding pages
 * to be already locked) and update buffer state according to new extent state.
 * We map delalloc buffers to their physical location, clear unwritten bits,
 * and mark buffers as uninit when we perform writes to unwritten extents
 * and do extent conversion after IO is finished. If the last page is not fully
 * mapped, we update @map to the next extent in the last page that needs
 * mapping. Otherwise we submit the page for IO.
 *
 * Returns 0 or a negative error code.
 */
static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
{
        struct inode *inode = mpd->inode;
        int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
        /* Page indices spanned by the extent that has just been mapped. */
        pgoff_t index = mpd->map.m_lblk >> bpp_bits;
        pgoff_t last = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
        /* Logical block of the first buffer in the first page. */
        ext4_lblk_t lblk = index << bpp_bits;
        ext4_fsblk_t pblock = mpd->map.m_pblk;
        struct folio_batch fbatch;
        bool map_bh = false;
        unsigned nr, i;
        int err;

        folio_batch_init(&fbatch);
        while (index <= last) {
                nr = filemap_get_folios(inode->i_mapping, &index, last,
                                        &fbatch);
                if (!nr)
                        break;

                for (i = 0; i < nr; i++) {
                        struct folio *folio = fbatch.folios[i];

                        err = mpage_process_folio(mpd, folio, &lblk, &pblock,
                                                  &map_bh);
                        /*
                         * map_bh set means the page may need further bh
                         * mapping, or may already have been submitted for
                         * IO; go back so that the caller can map the next
                         * extent.
                         */
                        if (err < 0 || map_bh)
                                goto out;

                        /* Page fully mapped - let IO run! */
                        err = mpage_submit_folio(mpd, folio);
                        if (err < 0)
                                goto out;
                        mpage_folio_done(mpd, folio);
                }
                folio_batch_release(&fbatch);
        }

        /* Extent fully mapped and matches with page boundary. We are done. */
        mpd->map.m_len = 0;
        mpd->map.m_flags = 0;
        return 0;

out:
        folio_batch_release(&fbatch);
        return err;
}

/*
 * Map the extent described by mpd->map through ext4_map_blocks().
 *
 * Returns 0 on success or the negative error from ext4_map_blocks().
 */
static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
{
        struct inode *inode = mpd->inode;
        struct ext4_map_blocks *map = &mpd->map;
        int dioread_nolock;
        int flags;
        int err;

        trace_ext4_da_write_pages_extent(inode, map);

        /*
         * ext4_map_blocks() is called either to allocate delayed
         * allocation blocks or to convert an unwritten extent to
         * initialized (when one or more preallocated blocks have been
         * written into).  More metadata blocks than previously reserved
         * may be needed, yet failing is not an option: this is writeback,
         * nothing can be done about it, and it might result in data loss.
         * So metadata is allocated from reserved blocks if possible.
         *
         * The magic EXT4_GET_BLOCKS_DELALLOC_RESERVE flag is passed when
         * the blocks in question are delalloc blocks; it indicates that
         * blocks and quotas were already checked when the data was copied
         * into the page cache.
         */
        flags = EXT4_GET_BLOCKS_CREATE |
                EXT4_GET_BLOCKS_METADATA_NOFAIL |
                EXT4_GET_BLOCKS_IO_SUBMIT;

        dioread_nolock = ext4_should_dioread_nolock(inode);
        if (dioread_nolock)
                flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
        if (map->m_flags & BIT(BH_Delay))
                flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;

        err = ext4_map_blocks(handle, inode, map, flags);
        if (err < 0)
                return err;

        if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
                ext4_io_end_t *io_end = mpd->io_submit.io_end;

                /* Move the reserved handle over to the io_end, once. */
                if (!io_end->handle && ext4_handle_valid(handle)) {
                        io_end->handle = handle->h_rsv_handle;
                        handle->h_rsv_handle = NULL;
                }
                ext4_set_io_unwritten_flag(inode, io_end);
        }

        BUG_ON(map->m_len == 0);
        return 0;
}

/*
 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
 *                                 mpd->len and submit pages underlying it for IO
 *
 * @handle - handle for journal operations
 * @mpd - extent to map
 * @give_up_on_write - we set this to true iff there is a fatal error and there
 *                     is no hope of writing the data. The caller should discard
 *                     dirty pages to avoid infinite loops.
 *
 * The function maps extent starting at mpd->lblk of length mpd->len. If it is
 * delayed, blocks are allocated, if it is unwritten, we may need to convert
 * them to initialized or split the described range from larger unwritten
 * extent. Note that we need not map all the described range since allocation
 * can return less blocks or the range is covered by more unwritten extents. We
 * cannot map more because we are limited by reserved transaction credits. On
 * the other hand we always make sure that the last touched page is fully
 * mapped so that it can be written out (and thus forward progress is
 * guaranteed). After mapping we submit all mapped pages for IO.
 */
static int mpage_map_and_submit_extent(handle_t *handle,
                                       struct mpage_da_data *mpd,
                                       bool *give_up_on_write)
{
        struct inode *inode = mpd->inode;
        struct ext4_map_blocks *map = &mpd->map;
        int err;
        loff_t disksize;
        /* Becomes 1 once at least one extent has been mapped by this call. */
        int progress = 0;
        ext4_io_end_t *io_end = mpd->io_submit.io_end;
        struct ext4_io_end_vec *io_end_vec;

        io_end_vec = ext4_alloc_io_end_vec(io_end);
        if (IS_ERR(io_end_vec))
                return PTR_ERR(io_end_vec);
        /* Byte offset of the first logical block of the extent to map. */
        io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
        do {
                err = mpage_map_one_extent(handle, mpd);
                if (err < 0) {
                        struct super_block *sb = inode->i_sb;

                        /* After a forced shutdown, give up without logging. */
                        if (ext4_forced_shutdown(sb))
                                goto invalidate_dirty_pages;
                        /*
                         * Let the upper layers retry transient errors.
                         * In the case of ENOSPC, if ext4_count_free_clusters()
                         * is non-zero, a commit should free up blocks.
                         *
                         * If something was already mapped in this call, still
                         * update the on-disk size before returning the error.
                         */
                        if ((err == -ENOMEM) ||
                            (err == -ENOSPC && ext4_count_free_clusters(sb))) {
                                if (progress)
                                        goto update_disksize;
                                return err;
                        }
                        /* Any other error is treated as fatal for the data. */
                        ext4_msg(sb, KERN_CRIT,
                                 "Delayed block allocation failed for "
                                 "inode %lu at logical offset %llu with"
                                 " max blocks %u with error %d",
                                 inode->i_ino,
                                 (unsigned long long)map->m_lblk,
                                 (unsigned)map->m_len, -err);
                        ext4_msg(sb, KERN_CRIT,
                                 "This should not happen!! Data will "
                                 "be lost\n");
                        if (err == -ENOSPC)
                                ext4_print_free_blocks(inode);
                invalidate_dirty_pages:
                        /* Tell the caller there is no hope of writing. */
                        *give_up_on_write = true;
                        return err;
                }
                progress = 1;
                /*
                 * Update buffer state, submit mapped pages, and get us new
                 * extent to map
                 */
                err = mpage_map_and_submit_buffers(mpd);
                if (err < 0)
                        goto update_disksize;
        } while (map->m_len);

update_disksize:
        /*
         * Update on-disk size after IO is submitted.  Races with
         * truncate are avoided by checking i_size under i_data_sem.
         *
         * The candidate size is the byte offset of mpd->first_page; the
         * unlocked READ_ONCE() comparison is only a cheap pre-check, the
         * authoritative comparison is repeated under i_data_sem.
         */
        disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
        if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
                int err2;
                loff_t i_size;

                down_write(&EXT4_I(inode)->i_data_sem);
                i_size = i_size_read(inode);
                /* Never push i_disksize beyond the current i_size. */
                if (disksize > i_size)
                        disksize = i_size;
                if (disksize > EXT4_I(inode)->i_disksize)
                        EXT4_I(inode)->i_disksize = disksize;
                up_write(&EXT4_I(inode)->i_data_sem);
                err2 = ext4_mark_inode_dirty(handle, inode);
                if (err2) {
                        ext4_error_err(inode->i_sb, -err2,
                                       "Failed to mark inode %lu dirty",
                                       inode->i_ino);
                }
                /* An earlier mapping/submission error takes precedence. */
                if (!err)
                        err = err2;
        }
        return err;
}

/*
 * Calculate the total number of credits to reserve for one writepages
 * iteration. This is called from ext4_writepages(). We map an extent of
 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
 * bpp - 1 blocks in bpp different extents.
 */
static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
        const int blocks_per_page = ext4_journal_blocks_per_page(inode);
        /* One full extent plus the remainder of the last partial page. */
        const int max_blocks = MAX_WRITEPAGES_EXTENT_LEN + blocks_per_page - 1;

        /* max_blocks blocks spread over at most blocks_per_page extents. */
        return ext4_meta_trans_blocks(inode, max_blocks, blocks_per_page);
}

/*
 * Journal the first @len bytes worth of buffers attached to @folio.
 *
 * Walks the folio's buffers twice (do_journal_get_write_access, then
 * write_end_fn), registers the byte range with the inode's jbd2 write list
 * and records the handle's transaction id in i_datasync_tid.
 *
 * All three steps are always attempted; the return value is the first
 * non-zero result among them, or 0 if all of them returned 0.
 */
static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
                                     size_t len)
{
        struct buffer_head *bufs = folio_buffers(folio);
        struct inode *inode = folio->mapping->host;
        int first_err, rc;

        first_err = ext4_walk_page_buffers(handle, inode, bufs, 0, len,
                                           NULL, do_journal_get_write_access);

        rc = ext4_walk_page_buffers(handle, inode, bufs, 0, len,
                                    NULL, write_end_fn);
        if (!first_err)
                first_err = rc;

        rc = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len);
        if (!first_err)
                first_err = rc;

        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;

        return first_err;
}

/*
 * Journal the buffers of a folio whose dirtying was deferred (Checked flag).
 *
 * Clears the folio's Checked flag, charges one unit against
 * mpd->wbc->nr_to_write and journals the folio's buffers. When the folio
 * extends past i_size and no verity build is in progress, only the part of
 * the folio up to i_size is journalled.
 *
 * Returns the result of ext4_journal_folio_buffers().
 */
static int mpage_journal_page_buffers(handle_t *handle,
                                      struct mpage_da_data *mpd,
                                      struct folio *folio)
{
        struct inode *inode = mpd->inode;
        loff_t isize = i_size_read(inode);
        size_t nbytes = folio_size(folio);

        folio_clear_checked(folio);
        mpd->wbc->nr_to_write--;

        /* Folio entirely within i_size, or verity in progress: whole folio. */
        if (folio_pos(folio) + nbytes <= isize ||
            ext4_verity_in_progress(inode))
                return ext4_journal_folio_buffers(handle, folio, nbytes);

        /* Folio straddles EOF: journal only the bytes below i_size. */
        return ext4_journal_folio_buffers(handle, folio,
                                          isize & (nbytes - 1));
}

/*
 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
 *                                  needing mapping, submit mapped pages
 *
 * @mpd - where to look for pages
 *
 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
 * IO immediately. If we cannot map blocks, we submit just already mapped
 * buffers in the page for IO and keep page dirty. When we can map blocks and
 * we find a page which isn't mapped we start accumulating extent of buffers
 * underlying these pages that needs mapping (formed by either delayed or
 * unwritten buffers). We also lock the pages containing these buffers. The
 * extent found is returned in @mpd structure (starting at mpd->lblk with
 * length mpd->len blocks).
 *
 * Note that this function can attach bios to one io_end structure which are
 * neither logically nor physically contiguous. Although it may seem as an
 * unnecessary complication, it is actually inevitable in blocksize < pagesize
 * case as we need to track IO to all buffers underlying a page in one io_end.
 */
static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
{
        struct address_space *mapping = mpd->inode->i_mapping;
        struct folio_batch fbatch;
        unsigned int nr_folios;
        pgoff_t index = mpd->first_page;
        pgoff_t end = mpd->last_page;
        xa_mark_t tag;
        int i, err = 0;
        int blkbits = mpd->inode->i_blkbits;
        ext4_lblk_t lblk;
        struct buffer_head *head;
        /* Only used in data=journal mode, see below. */
        handle_t *handle = NULL;
        int bpp = ext4_journal_blocks_per_page(mpd->inode);

        /*
         * Integrity and tagged writeback walk the pages tagged TOWRITE;
         * everything else walks the pages tagged DIRTY.
         */
        if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
                tag = PAGECACHE_TAG_TOWRITE;
        else
                tag = PAGECACHE_TAG_DIRTY;

        /* Start with an empty extent beginning at the first page to scan. */
        mpd->map.m_len = 0;
        mpd->next_page = index;
        /*
         * In data=journal mode start a handle with credits for one page
         * worth of blocks; it is handed to mpage_journal_page_buffers().
         */
        if (ext4_should_journal_data(mpd->inode)) {
                handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
                                            bpp);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);
        }
        folio_batch_init(&fbatch);
        while (index <= end) {
                nr_folios = filemap_get_folios_tag(mapping, &index, end,
                                tag, &fbatch);
                if (nr_folios == 0)
                        break;

                for (i = 0; i < nr_folios; i++) {
                        struct folio *folio = fbatch.folios[i];

                        /*
                         * Accumulated enough dirty pages? This doesn't apply
                         * to WB_SYNC_ALL mode. For integrity sync we have to
                         * keep going because someone may be concurrently
                         * dirtying pages, and we might have synced a lot of
                         * newly appeared dirty pages, but have not synced all
                         * of the old dirty pages.
                         *
                         * m_len counts blocks; the shift converts it to pages
                         * for comparison with nr_to_write.
                         */
                        if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
                            mpd->wbc->nr_to_write <=
                            mpd->map.m_len >> (PAGE_SHIFT - blkbits))
                                goto out;

                        /* If we can't merge this page, we are done. */
                        if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
                                goto out;

                        /* Make sure the handle can cover one more page. */
                        if (handle) {
                                err = ext4_journal_ensure_credits(handle, bpp,
                                                                  0);
                                if (err < 0)
                                        goto out;
                        }

                        folio_lock(folio);
                        /*
                         * If the page is no longer dirty, or its mapping no
                         * longer corresponds to inode we are writing (which
                         * means it has been truncated or invalidated), or the
                         * page is already under writeback and we are not doing
                         * a data integrity writeback, skip the page
                         */
                        if (!folio_test_dirty(folio) ||
                            (folio_test_writeback(folio) &&
                             (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
                            unlikely(folio->mapping != mapping)) {
                                folio_unlock(folio);
                                continue;
                        }

                        folio_wait_writeback(folio);
                        BUG_ON(folio_test_writeback(folio));

                        /*
                         * Should never happen but for buggy code in
                         * other subsystems that call
                         * set_page_dirty() without properly warning
                         * the file system first.  See [1] for more
                         * information.
                         *
                         * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
                         */
                        if (!folio_buffers(folio)) {
                                ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index);
                                folio_clear_dirty(folio);
                                folio_unlock(folio);
                                continue;
                        }

                        /* The first accepted folio starts the extent. */
                        if (mpd->map.m_len == 0)
                                mpd->first_page = folio->index;
                        mpd->next_page = folio_next_index(folio);
                        /*
                         * Writeout when we cannot modify metadata is simple.
                         * Just submit the page. For data=journal mode we
                         * first handle writeout of the page for checkpoint and
                         * only after that handle delayed page dirtying. This
                         * makes sure current data is checkpointed to the final
                         * location before possibly journalling it again which
                         * is desirable when the page is frequently dirtied
                         * through a pin.
                         */
                        if (!mpd->can_map) {
                                err = mpage_submit_folio(mpd, folio);
                                if (err < 0)
                                        goto out;
                                /* Pending dirtying of journalled data? */
                                if (folio_test_checked(folio)) {
                                        err = mpage_journal_page_buffers(handle,
                                                mpd, folio);
                                        if (err < 0)
                                                goto out;
                                        /* Lets the caller know about it. */
                                        mpd->journalled_more_data = 1;
                                }
                                mpage_folio_done(mpd, folio);
                        } else {
                                /* Add all dirty buffers to mpd */
                                lblk = ((ext4_lblk_t)folio->index) <<
                                        (PAGE_SHIFT - blkbits);
                                head = folio_buffers(folio);
                                err = mpage_process_page_bufs(mpd, head, head,
                                                lblk);
                                /*
                                 * A non-positive result ends the scan and is
                                 * returned as is; a positive one means keep
                                 * going, so it is reset to 0.
                                 */
                                if (err <= 0)
                                        goto out;
                                err = 0;
                        }
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }
        /* The loop ran to completion: nothing left in the scanned range. */
        mpd->scanned_until_end = 1;
        if (handle)
                ext4_journal_stop(handle);
        return 0;
out:
        /* Early exit: scanned_until_end is deliberately left untouched. */
        folio_batch_release(&fbatch);
        if (handle)
                ext4_journal_stop(handle);
        return err;
}

/*
 * ext4_do_writepages - write back dirty pages of mpd->inode
 *
 * @mpd - writeback state; mpd->inode, mpd->wbc and mpd->can_map are expected
 *        to be set up by the caller
 *
 * First submits pages that need no block mapping, then repeatedly starts a
 * transaction, collects an extent of pages needing mapping, maps it and
 * submits it, until the range has been scanned or wbc->nr_to_write is used
 * up. Returns 0 on success or a negative error code.
 */
static int ext4_do_writepages(struct mpage_da_data *mpd)
{
        struct writeback_control *wbc = mpd->wbc;
        pgoff_t        writeback_index = 0;
        /* Remembered only to report the number of pages written in the trace. */
        long nr_to_write = wbc->nr_to_write;
        int range_whole = 0;
        /*
         * 0 while a range_cyclic pass that started in the middle of the file
         * still has to wrap around to the beginning.
         */
        int cycled = 1;
        handle_t *handle = NULL;
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
        int needed_blocks, rsv_blocks = 0, ret = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
        struct blk_plug plug;
        bool give_up_on_write = false;

        trace_ext4_writepages(inode, wbc);

        /*
         * No pages to write? This is mainly a kludge to avoid starting
         * a transaction for special inodes like journal inode on last iput()
         * because that could violate lock ordering on umount
         */
        if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                goto out_writepages;

        /*
         * If the filesystem has aborted, it is read-only, so return
         * right away instead of dumping stack traces later on that
         * will obscure the real source of the problem.  We test
         * fs shutdown state instead of sb->s_flag's SB_RDONLY because
         * the latter could be true if the filesystem is mounted
         * read-only, and in that case, ext4_writepages should
         * *never* be called, so if that ever happens, we would want
         * the stack trace.
         */
        if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) {
                ret = -EROFS;
                goto out_writepages;
        }

        /*
         * If we have inline data and arrive here, it means that
         * we will soon create the block for the 1st page, so
         * we'd better clear the inline data here.
         */
        if (ext4_has_inline_data(inode)) {
                /* Just inode will be modified... */
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out_writepages;
                }
                BUG_ON(ext4_test_inode_state(inode,
                                EXT4_STATE_MAY_INLINE_DATA));
                ext4_destroy_inline_data(handle, inode);
                ext4_journal_stop(handle);
        }

        /*
         * data=journal mode does not do delalloc so we just need to writeout /
         * journal already mapped buffers. On the other hand we need to commit
         * transaction to make data stable. We expect all the data to be
         * already in the journal (the only exception are DMA pinned pages
         * dirtied behind our back) so we commit transaction here and run the
         * writeback loop to checkpoint them. The checkpointing is not actually
         * necessary to make data persistent *but* quite a few places (extent
         * shifting operations, fsverity, ...) depend on being able to drop
         * pagecache pages after calling filemap_write_and_wait() and for that
         * checkpointing needs to happen.
         */
        if (ext4_should_journal_data(inode)) {
                mpd->can_map = 0;
                if (wbc->sync_mode == WB_SYNC_ALL)
                        ext4_fc_commit(sbi->s_journal,
                                       EXT4_I(inode)->i_datasync_tid);
        }
        mpd->journalled_more_data = 0;

        if (ext4_should_dioread_nolock(inode)) {
                /*
                 * We may need to convert up to one extent per block in
                 * the page and we may dirty the inode.
                 */
                rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
                                                PAGE_SIZE >> inode->i_blkbits);
        }

        /* The request covers the whole file. */
        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                range_whole = 1;

        /*
         * Cyclic writeback resumes at mapping->writeback_index and runs to
         * the end of the file; otherwise use the page range given in wbc.
         */
        if (wbc->range_cyclic) {
                writeback_index = mapping->writeback_index;
                if (writeback_index)
                        cycled = 0;
                mpd->first_page = writeback_index;
                mpd->last_page = -1;
        } else {
                mpd->first_page = wbc->range_start >> PAGE_SHIFT;
                mpd->last_page = wbc->range_end >> PAGE_SHIFT;
        }

        ext4_io_submit_init(&mpd->io_submit, wbc);
retry:
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, mpd->first_page,
                                        mpd->last_page);
        blk_start_plug(&plug);

        /*
         * First writeback pages that don't need mapping - we can avoid
         * starting a transaction unnecessarily and also avoid being blocked
         * in the block layer on device congestion while having transaction
         * started.
         */
        mpd->do_map = 0;
        mpd->scanned_until_end = 0;
        mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
        if (!mpd->io_submit.io_end) {
                ret = -ENOMEM;
                goto unplug;
        }
        ret = mpage_prepare_extent_to_map(mpd);
        /* Unlock pages we didn't use */
        mpage_release_unused_pages(mpd, false);
        /* Submit prepared bio */
        ext4_io_submit(&mpd->io_submit);
        ext4_put_io_end_defer(mpd->io_submit.io_end);
        mpd->io_submit.io_end = NULL;
        if (ret < 0)
                goto unplug;

        /* Second phase: map and submit one extent per iteration. */
        while (!mpd->scanned_until_end && wbc->nr_to_write > 0) {
                /* For each extent of pages we use new io_end */
                mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
                if (!mpd->io_submit.io_end) {
                        ret = -ENOMEM;
                        break;
                }

                WARN_ON_ONCE(!mpd->can_map);
                /*
                 * We have two constraints: We find one extent to map and we
                 * must always write out whole page (makes a difference when
                 * blocksize < pagesize) so that we don't block on IO when we
                 * try to write out the rest of the page. Journalled mode is
                 * not supported by delalloc.
                 */
                BUG_ON(ext4_should_journal_data(inode));
                needed_blocks = ext4_da_writepages_trans_blocks(inode);

                /* start a new transaction */
                handle = ext4_journal_start_with_reserve(inode,
                                EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
                               "%ld pages, ino %lu; err %d", __func__,
                                wbc->nr_to_write, inode->i_ino, ret);
                        /* Release allocated io_end */
                        ext4_put_io_end(mpd->io_submit.io_end);
                        mpd->io_submit.io_end = NULL;
                        break;
                }
                mpd->do_map = 1;

                trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
                ret = mpage_prepare_extent_to_map(mpd);
                /* Only map when an extent was actually collected. */
                if (!ret && mpd->map.m_len)
                        ret = mpage_map_and_submit_extent(handle, mpd,
                                        &give_up_on_write);
                /*
                 * Caution: If the handle is synchronous,
                 * ext4_journal_stop() can wait for transaction commit
                 * to finish which may depend on writeback of pages to
                 * complete or on page lock to be released.  In that
                 * case, we have to wait until after we have
                 * submitted all the IO, released page locks we hold,
                 * and dropped io_end reference (for extent conversion
                 * to be able to complete) before stopping the handle.
                 */
                if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
                        ext4_journal_stop(handle);
                        handle = NULL;
                        mpd->do_map = 0;
                }
                /* Unlock pages we didn't use */
                mpage_release_unused_pages(mpd, give_up_on_write);
                /* Submit prepared bio */
                ext4_io_submit(&mpd->io_submit);

                /*
                 * Drop our io_end reference we got from init. We have
                 * to be careful and use deferred io_end finishing if
                 * we are still holding the transaction as we can
                 * release the last reference to io_end which may end
                 * up doing unwritten extent conversion.
                 */
                if (handle) {
                        ext4_put_io_end_defer(mpd->io_submit.io_end);
                        ext4_journal_stop(handle);
                } else
                        ext4_put_io_end(mpd->io_submit.io_end);
                mpd->io_submit.io_end = NULL;

                if (ret == -ENOSPC && sbi->s_journal) {
                        /*
                         * Commit the transaction which would
                         * free blocks released in the transaction
                         * and try again
                         */
                        jbd2_journal_force_commit_nested(sbi->s_journal);
                        ret = 0;
                        continue;
                }
                /* Fatal error - ENOMEM, EIO... */
                if (ret)
                        break;
        }
unplug:
        blk_finish_plug(&plug);
        /*
         * A cyclic pass that started mid-file and still has budget left
         * wraps around once to cover pages [0, writeback_index - 1].
         */
        if (!ret && !cycled && wbc->nr_to_write > 0) {
                cycled = 1;
                mpd->last_page = writeback_index - 1;
                mpd->first_page = 0;
                goto retry;
        }

        /* Update index */
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
                 * Set the writeback_index so that range_cyclic
                 * mode will write it back later
                 */
                mapping->writeback_index = mpd->first_page;

out_writepages:
        trace_ext4_writepages_result(inode, wbc, ret,
                                     nr_to_write - wbc->nr_to_write);
        return ret;
}

static int ext4_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
{
        struct super_block *sb = mapping->host->i_sb;
        struct mpage_da_data mpd = {
                .inode = mapping->host,
                .wbc = wbc,
                .can_map = 1,
        };
        int ret;
        int alloc_ctx;

        if (unlikely(ext4_forced_shutdown(sb)))
                return -EIO;

        alloc_ctx = ext4_writepages_down_read(sb);
        ret = ext4_do_writepages(&mpd);
        /*
         * For data=journal writeback we could have come across pages marked
         * for delayed dirtying (PageChecked) which were just added to the
         * running transaction. Try once more to get them to stable storage.
         */
        if (!ret && mpd.journalled_more_data)
                ret = ext4_do_writepages(&mpd);
        ext4_writepages_up_read(sb, alloc_ctx);

        return ret;
}

/*
 * Submit the dirty data range recorded in @jinode for writeback.
 *
 * Runs ext4_do_writepages() in WB_SYNC_ALL mode over
 * [i_dirty_start, i_dirty_end] with no page limit and with block mapping
 * disabled (can_map = 0). Returns the result of ext4_do_writepages().
 */
int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
        struct writeback_control wbc = {
                .range_start = jinode->i_dirty_start,
                .range_end = jinode->i_dirty_end,
                .nr_to_write = LONG_MAX,
                .sync_mode = WB_SYNC_ALL,
        };
        struct mpage_da_data mpd = {
                .wbc = &wbc,
                .inode = jinode->i_vfs_inode,
                .can_map = 0,
        };

        return ext4_do_writepages(&mpd);
}

/*
 * Writeback entry point for DAX mappings.
 *
 * Delegates to dax_writeback_mapping_range() on the superblock's DAX device,
 * bracketed by ext4_writepages_down_read()/ext4_writepages_up_read() and the
 * writepages tracepoints. Returns -EIO if the filesystem has been forcibly
 * shut down, otherwise the result of dax_writeback_mapping_range().
 */
static int ext4_dax_writepages(struct address_space *mapping,
                               struct writeback_control *wbc)
{
        struct inode *inode = mapping->host;
        /* Snapshot so the result trace can report how many pages were written. */
        long nr_before = wbc->nr_to_write;
        int ctx;
        int err;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        ctx = ext4_writepages_down_read(inode->i_sb);
        trace_ext4_writepages(inode, wbc);

        err = dax_writeback_mapping_range(mapping,
                                          EXT4_SB(inode->i_sb)->s_daxdev,
                                          wbc);

        trace_ext4_writepages_result(inode, wbc, err,
                                     nr_before - wbc->nr_to_write);
        ext4_writepages_up_read(inode->i_sb, ctx);

        return err;
}

/*
 * Decide whether writes should fall back to non-delalloc mode.
 *
 * Returns 1 when free space is low relative to dirty (delalloc-reserved)
 * clusters, 0 otherwise. May also kick off writeback of the superblock's
 * inodes as a side effect.
 */
static int ext4_nonda_switch(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        s64 nr_free, nr_dirty;

        /*
         * The percpu counters are only approximate: each CPU may hold up to
         * percpu_counter_batch of not-yet-folded updates. Delalloc needs
         * accurate free block accounting, so once we get close to that error
         * range we switch to non-delalloc.
         */
        nr_free = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
        nr_dirty = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);

        /* Start pushing delalloc when 1/2 of free blocks are dirty. */
        if (nr_dirty && (nr_free < 2 * nr_dirty))
                try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);

        /* Free block count is less than 150% of dirty blocks. */
        if (2 * nr_free < 3 * nr_dirty)
                return 1;

        /* Free blocks (beyond the dirty ones) are below the watermark. */
        if (nr_free < (nr_dirty + EXT4_FREECLUSTERS_WATERMARK))
                return 1;

        return 0;
}

/*
 * write_begin for delayed allocation.
 *
 * @file, @mapping, @pos, @len - the write being prepared
 * @pagep  - on success receives the locked page to copy data into
 * @fsdata - set to FALL_BACK_TO_NONDELALLOC when the write is handed over to
 *           ext4_write_begin(), otherwise to 0
 *
 * Falls back to ext4_write_begin() when free space is low or a verity build
 * is in progress. Inodes that may hold inline data are tried through
 * ext4_da_write_inline_data_begin() first. Otherwise a folio is obtained and
 * prepared with ext4_da_get_block_prep; on -ENOSPC the whole attempt is
 * repeated for as long as ext4_should_retry_alloc() allows.
 *
 * Returns -EIO after a forced shutdown, a negative error on failure, or the
 * result of the helper that prepared the write.
 */
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                               loff_t pos, unsigned len,
                               struct page **pagep, void **fsdata)
{
        struct inode *inode = mapping->host;
        struct folio *folio;
        pgoff_t index;
        int retries = 0;
        int ret;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
                *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
                return ext4_write_begin(file, mapping, pos,
                                        len, pagep, fsdata);
        }

        *fsdata = (void *)0;
        trace_ext4_da_write_begin(inode, pos, len);

        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
                                                      pagep, fsdata);
                if (ret < 0)
                        return ret;
                /* A return of 1 means nothing more needs to be done here. */
                if (ret == 1)
                        return 0;
        }

        index = pos >> PAGE_SHIFT;

        for (;;) {
                folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
                                mapping_gfp_mask(mapping));
                if (IS_ERR(folio))
                        return PTR_ERR(folio);

#ifdef CONFIG_FS_ENCRYPTION
                ret = ext4_block_write_begin(folio, pos, len,
                                             ext4_da_get_block_prep);
#else
                ret = __block_write_begin(&folio->page, pos, len,
                                          ext4_da_get_block_prep);
#endif
                if (ret >= 0)
                        break;

                folio_unlock(folio);
                folio_put(folio);
                /*
                 * block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 * i_size_read because we hold inode lock.
                 */
                if (pos + len > inode->i_size)
                        ext4_truncate_failed_write(inode);

                /* Only -ENOSPC is retried, and only while allowed. */
                if (ret != -ENOSPC ||
                    !ext4_should_retry_alloc(inode->i_sb, &retries))
                        return ret;
        }

        *pagep = &folio->page;
        return ret;
}

/*
 * Check whether i_disksize should be updated when a write extends to the
 * end of the file but does not require block allocation.
 */
static int ext4_da_should_update_i_disksize(struct folio *folio,
                                            unsigned long offset)
{
        struct inode *inode = folio->mapping->host;
        struct buffer_head *bh = folio_buffers(folio);
        /* Number of buffers to step over to reach the one covering @offset. */
        unsigned int skip = offset >> inode->i_blkbits;

        while (skip--)
                bh = bh->b_this_page;

        /*
         * Return 1 only when the buffer is mapped and is neither delayed
         * nor unwritten; otherwise return 0.
         */
        return buffer_mapped(bh) && !buffer_delay(bh) &&
               !buffer_unwritten(bh);
}

/*
 * Finish a delayed-allocation buffered write into @folio.
 *
 * @mapping: address space of the file being written
 * @pos:     file offset at which the write started
 * @len:     number of bytes the caller intended to write
 * @copied:  number of bytes actually copied into the folio
 * @folio:   locked folio with a reference held; this function unlocks it
 *           and drops the reference on every path
 *
 * Returns the byte count reported by block_write_end(), or a negative
 * errno: -EIO when the folio has no buffer heads attached, or the error
 * from starting the journal handle used to record a new i_disksize.
 */
static int ext4_da_do_write_end(struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct folio *folio)
{
        struct inode *inode = mapping->host;
        loff_t old_size = inode->i_size;
        bool disksize_changed = false;
        loff_t new_i_size;

        /*
         * ext4_da_should_update_i_disksize() below dereferences
         * folio_buffers() unconditionally.  If the folio carries no buffer
         * heads (reportedly possible after a forced filesystem shutdown --
         * confirm), fail the write with -EIO instead of walking a NULL
         * buffer ring.  The caller's lock and reference are still released.
         */
        if (unlikely(!folio_buffers(folio))) {
                folio_unlock(folio);
                folio_put(folio);
                return -EIO;
        }

        /*
         * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
         * flag, which is all that's needed to trigger page writeback.
         */
        copied = block_write_end(NULL, mapping, pos, len, copied,
                        &folio->page, NULL);
        new_i_size = pos + copied;

        /*
         * It's important to update i_size while still holding folio lock,
         * because folio writeout could otherwise come in and zero beyond
         * i_size.
         *
         * Since we are holding inode lock, we are sure i_disksize <=
         * i_size. We also know that if i_disksize < i_size, there are
         * delalloc writes pending in the range up to i_size. If the end of
         * the current write is <= i_size, there's no need to touch
         * i_disksize since writeback will push i_disksize up to i_size
         * eventually. If the end of the current write is > i_size and
         * inside an allocated block which ext4_da_should_update_i_disksize()
         * checked, we need to update i_disksize here as certain
         * ext4_writepages() paths not allocating blocks and update i_disksize.
         */
        if (new_i_size > inode->i_size) {
                unsigned long end;

                i_size_write(inode, new_i_size);
                /* Offset within the page of the last byte written. */
                end = (new_i_size - 1) & (PAGE_SIZE - 1);
                if (copied && ext4_da_should_update_i_disksize(folio, end)) {
                        ext4_update_i_disksize(inode, new_i_size);
                        disksize_changed = true;
                }
        }

        folio_unlock(folio);
        folio_put(folio);

        /* The write started beyond the old EOF. */
        if (old_size < pos)
                pagecache_isize_extended(inode, old_size, pos);

        /*
         * i_disksize was changed in memory above; mark the inode dirty
         * under a small transaction.
         */
        if (disksize_changed) {
                handle_t *handle;

                handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);
                ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
        }

        return copied;
}

/*
 * ->write_end for the delayed-allocation address space operations.
 *
 * @fsdata carries the write mode chosen at write_begin time.  Depending on
 * it and on the inode's inline-data state the call is routed to
 * ext4_write_end(), ext4_write_inline_data_end() or ext4_da_do_write_end().
 * Returns whatever the selected helper returns.
 */
static int ext4_da_write_end(struct file *file,
                             struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned copied,
                             struct page *page, void *fsdata)
{
        struct folio *folio = page_folio(page);
        struct inode *inode = mapping->host;
        int write_mode = (int)(unsigned long)fsdata;
        bool finish_inline;

        /* write_begin fell back to the non-delalloc path; so must we. */
        if (write_mode == FALL_BACK_TO_NONDELALLOC)
                return ext4_write_end(file, mapping, pos,
                                      len, copied, &folio->page, fsdata);

        trace_ext4_da_write_end(inode, pos, len, copied);

        finish_inline = write_mode != CONVERT_INLINE_DATA &&
                        ext4_test_inode_state(inode,
                                              EXT4_STATE_MAY_INLINE_DATA) &&
                        ext4_has_inline_data(inode);
        if (finish_inline)
                return ext4_write_inline_data_end(inode, pos, len, copied,
                                                  folio);

        /* A short copy into a folio that is not uptodate counts as nothing. */
        if (unlikely(copied < len)) {
                if (!folio_test_uptodate(folio))
                        copied = 0;
        }

        return ext4_da_do_write_end(mapping, pos, len, copied, folio);
}

/*
 * Force all delayed allocation blocks to be allocated for a given inode.
 *
 * Returns 0 when the inode has no reserved data blocks, otherwise the
 * result of filemap_flush() on the inode's mapping.
 */
int ext4_alloc_da_blocks(struct inode *inode)
{
        int ret = 0;

        trace_ext4_alloc_da_blocks(inode);

        /*
         * We do something simple for now.  filemap_flush() maps the blocks
         * and starts the I/O without waiting for it to complete.  Starting
         * a write of the data blocks is not strictly speaking necessary
         * (and for users of laptop_mode, not even desirable), but doing
         * otherwise would require replicating code paths in:
         *
         * ext4_writepages() ->
         *    write_cache_pages() ---> (via passed in callback function)
         *        __mpage_da_writepage() -->
         *           mpage_add_bh_to_extent()
         *           mpage_da_map_blocks()
         *
         * The problem is that write_cache_pages(), located in
         * mm/page-writeback.c, marks pages clean in preparation for doing
         * I/O, which is not desirable if we're not planning on doing I/O
         * at all.
         *
         * We could call write_cache_pages() and then redirty all of the
         * pages with redirty_page_for_writepage(), but that would be ugly
         * in the extreme.  Instead we would need to replicate parts of the
         * functions above, simplified because we would not intend to write
         * out the pages: only collect contiguous logical block extents,
         * call the multi-block allocator, and then update the buffer heads
         * with the block allocations.
         *
         * For now, though, we cheat by calling filemap_flush().
         */
        if (EXT4_I(inode)->i_reserved_data_blocks)
                ret = filemap_flush(inode->i_mapping);

        return ret;
}

/*
 * bmap() is special.  It gets used by applications such as lilo and by
 * the swapper to find the on-disk block of a specific piece of data.
 *
 * Naturally, this is dangerous if the block concerned is still in the
 * journal.  If somebody makes a swapfile on an ext4 data-journaling
 * filesystem and enables swap, then they may get a nasty shock when the
 * data getting swapped to that swapfile suddenly gets overwritten by
 * the original zeros written out previously to the journal and
 * awaiting writeback in the kernel's buffer cache.
 *
 * So, if we see any bmap calls here on a modified, data-journaled file,
 * take extra steps to flush any blocks which might be in the cache.
 *
 * Returns the value of iomap_bmap(), or 0 for an inode with inline data.
 * The inode lock is held shared for the duration of the call.
 */
static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
{
        struct inode *inode = mapping->host;
        sector_t result = 0;

        inode_lock_shared(inode);

        /* We can get here for an inline file via the FIBMAP ioctl. */
        if (!ext4_has_inline_data(inode)) {
                /*
                 * With delalloc or journalled data we want to sync the file
                 * so that we can make sure we allocate blocks for file and
                 * data is in place for the user to see it.
                 */
                if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
                    (test_opt(inode->i_sb, DELALLOC) ||
                     ext4_should_journal_data(inode)))
                        filemap_write_and_wait(mapping);

                result = iomap_bmap(mapping, block, &ext4_iomap_ops);
        }

        inode_unlock_shared(inode);
        return result;
}

/*
 * ->read_folio: fill @folio from inline data when the inode has it,
 * otherwise (or when the inline reader answers -EAGAIN) through
 * ext4_mpage_readpages().  @file is not used.
 */
static int ext4_read_folio(struct file *file, struct folio *folio)
{
        struct inode *inode = folio->mapping->host;
        int err;

        trace_ext4_read_folio(inode, folio);

        if (ext4_has_inline_data(inode)) {
                err = ext4_readpage_inline(inode, folio);
                /* Anything but -EAGAIN is the final answer. */
                if (err != -EAGAIN)
                        return err;
        }

        return ext4_mpage_readpages(inode, NULL, folio);
}

/*
 * ->readahead: pass the readahead request to ext4_mpage_readpages().
 * Files with inline data need no readahead and are skipped.
 */
static void ext4_readahead(struct readahead_control *rac)
{
        struct inode *inode = rac->mapping->host;

        if (!ext4_has_inline_data(inode))
                ext4_mpage_readpages(inode, rac, NULL);
}

/*
 * ->invalidate_folio for the non-data-journalling address space
 * operations: trace the call, warn if the first buffer is a jbd buffer,
 * then let block_invalidate_folio() do the work.
 */
static void ext4_invalidate_folio(struct folio *folio, size_t offset,
                                size_t length)
{
        struct buffer_head *head;

        trace_ext4_invalidate_folio(folio, offset, length);

        /* No journalling happens on data buffers when this function is used */
        head = folio_buffers(folio);
        WARN_ON(head && buffer_jbd(head));

        block_invalidate_folio(folio, offset, length);
}

/*
 * Invalidate part or all of a data-journalled folio through jbd2.
 * Returns the result of jbd2_journal_invalidate_folio().
 */
static int __ext4_journalled_invalidate_folio(struct folio *folio,
                                            size_t offset, size_t length)
{
        journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
        bool whole_folio;

        trace_ext4_journalled_invalidate_folio(folio, offset, length);

        /*
         * If it's a full truncate we just forget about the pending dirtying
         */
        whole_folio = !offset && length == folio_size(folio);
        if (whole_folio)
                folio_clear_checked(folio);

        return jbd2_journal_invalidate_folio(journal, folio, offset, length);
}

/*
 * Wrapper for aops: ->invalidate_folio returns void, so a failure of
 * __ext4_journalled_invalidate_folio() can only be reported as a warning.
 */
static void ext4_journalled_invalidate_folio(struct folio *folio,
                                           size_t offset,
                                           size_t length)
{
        int err = __ext4_journalled_invalidate_folio(folio, offset, length);

        WARN_ON(err < 0);
}

/*
 * ->release_folio: try to free the buffers attached to @folio.
 *
 * Returns false for a folio flagged "checked"; otherwise the result of
 * jbd2_journal_try_to_free_buffers() when the inode has a journal, or of
 * try_to_free_buffers() when it has none.  @wait is not used.
 */
static bool ext4_release_folio(struct folio *folio, gfp_t wait)
{
        struct inode *inode = folio->mapping->host;
        journal_t *journal = EXT4_JOURNAL(inode);

        trace_ext4_release_folio(inode, folio);

        /* Page has dirty journalled data -> cannot release */
        if (folio_test_checked(folio))
                return false;

        if (!journal)
                return try_to_free_buffers(folio);

        return jbd2_journal_try_to_free_buffers(journal, folio);
}

/*
 * Report whether @inode still has changes pending that matter for a
 * datasync.  Used by ext4_set_iomap() to decide on IOMAP_F_DIRTY.
 */
static bool ext4_inode_datasync_dirty(struct inode *inode)
{
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;

        if (!journal) {
                /* Any metadata buffers to write? */
                if (!list_empty(&inode->i_mapping->i_private_list))
                        return true;
                return inode->i_state & I_DIRTY_DATASYNC;
        }

        /* Clean once the inode's datasync transaction has committed. */
        if (jbd2_transaction_committed(journal,
                                       EXT4_I(inode)->i_datasync_tid))
                return false;

        /* With fast commit, dirtiness follows membership of i_fc_list. */
        if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
                return !list_empty(&EXT4_I(inode)->i_fc_list);

        return true;
}

/*
 * Describe the result of a block lookup in @iomap.
 *
 * @inode:  inode the mapping belongs to
 * @iomap:  structure to fill in (flags, device, offset, length, type, addr)
 * @map:    mapping result: m_lblk, m_len, m_pblk and m_flags are read
 * @offset: byte offset of the original request
 * @length: byte length of the original request
 * @flags:  IOMAP_* request flags; only IOMAP_DAX is examined here
 */
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
                           struct ext4_map_blocks *map, loff_t offset,
                           loff_t length, unsigned int flags)
{
        const u8 blkbits = inode->i_blkbits;

        iomap->flags = 0;

        /*
         * Writes that span EOF might trigger an I/O size update on completion,
         * so consider them to be dirty for the purpose of O_DSYNC, even if
         * there is no other metadata changes being made or are pending.
         */
        if (ext4_inode_datasync_dirty(inode) ||
            offset + length > i_size_read(inode))
                iomap->flags |= IOMAP_F_DIRTY;

        if (map->m_flags & EXT4_MAP_NEW)
                iomap->flags |= IOMAP_F_NEW;

        /* Mapped blocks of a non-extent inode are reported as merged. */
        if ((map->m_flags & EXT4_MAP_MAPPED) &&
            !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                iomap->flags |= IOMAP_F_MERGED;

        iomap->offset = (u64) map->m_lblk << blkbits;
        iomap->length = (u64) map->m_len << blkbits;

        if (flags & IOMAP_DAX)
                iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
        else
                iomap->bdev = inode->i_sb->s_bdev;

        /*
         * Flags passed to ext4_map_blocks() for direct I/O writes can result
         * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
         * set. In order for any allocated unwritten extents to be converted
         * into written extents correctly within the ->end_io() handler, we
         * need to ensure that the iomap->type is set appropriately. Hence
         * EXT4_MAP_UNWRITTEN takes precedence over EXT4_MAP_MAPPED below.
         */
        if (map->m_flags & (EXT4_MAP_UNWRITTEN | EXT4_MAP_MAPPED)) {
                if (map->m_flags & EXT4_MAP_UNWRITTEN)
                        iomap->type = IOMAP_UNWRITTEN;
                else
                        iomap->type = IOMAP_MAPPED;

                iomap->addr = (u64) map->m_pblk << blkbits;
                if (flags & IOMAP_DAX)
                        iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
        } else {
                /* No physical block: either delayed allocation or a hole. */
                if (map->m_flags & EXT4_MAP_DELAYED)
                        iomap->type = IOMAP_DELALLOC;
                else
                        iomap->type = IOMAP_HOLE;
                iomap->addr = IOMAP_NULL_ADDR;
        }
}

/*
 * Map blocks for an iomap write request under a freshly started journal
 * handle, allocating them where the selected flags allow it.
 *
 * @inode: inode being written
 * @map:   in/out mapping request; m_len is clamped to DIO_MAX_BLOCKS here
 * @flags: IOMAP_* flags of the request
 *
 * Returns the value of ext4_map_blocks(), a negative errno when the handle
 * cannot be started, or -ENOTBLK when no create flag was selected and
 * ext4_map_blocks() returned 0.  An -ENOSPC result is retried for as long
 * as ext4_should_retry_alloc() allows.
 */
static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
                            unsigned int flags)
{
        handle_t *handle;
        u8 blkbits = inode->i_blkbits;
        int ret, dio_credits, m_flags = 0, retries = 0;

        /*
         * Trim the mapping request to the maximum value that we can map at
         * once for direct I/O.
         */
        if (map->m_len > DIO_MAX_BLOCKS)
                map->m_len = DIO_MAX_BLOCKS;
        dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);

retry:
        /*
         * Either we allocate blocks and then don't get an unwritten extent, so
         * in that case we have reserved enough credits. Or, the blocks are
         * already allocated and unwritten. In that case, the extent conversion
         * fits into the credits as well.
         */
        handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        /*
         * DAX and direct I/O are the only two operations that are currently
         * supported with IOMAP_WRITE.
         */
        WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT)));
        if (flags & IOMAP_DAX)
                m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
        /*
         * We use i_size instead of i_disksize here because delalloc writeback
         * can complete at any point during the I/O and subsequently push the
         * i_disksize out to i_size. This could be beyond where direct I/O is
         * happening and thus expose allocated blocks to direct I/O reads.
         */
        else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
                m_flags = EXT4_GET_BLOCKS_CREATE;
        else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
        /* Otherwise m_flags stays 0: a plain lookup without allocation. */

        ret = ext4_map_blocks(handle, inode, map, m_flags);

        /*
         * We cannot fill holes in indirect tree based inodes as that could
         * expose stale data in the case of a crash. Use the magic error code
         * to fallback to buffered I/O.
         */
        if (!m_flags && !ret)
                ret = -ENOTBLK;

        /* The handle is stopped before a retry so each attempt starts fresh. */
        ext4_journal_stop(handle);
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;

        return ret;
}


/*
 * ->iomap_begin handler of ext4_iomap_ops.
 *
 * Converts the byte range starting at @offset into a logical block range,
 * maps it and describes the result in @iomap through ext4_set_iomap().
 * IOMAP_WRITE requests go through ext4_iomap_alloc() unless the range lies
 * within i_size and a plain lookup already finds it mapped.
 *
 * Returns 0 on success, -EINVAL when the starting block is beyond
 * EXT4_MAX_LOGICAL_BLOCK, -ERANGE for an inode with inline data, or the
 * negative error of the mapping step.  @srcmap is not used.
 */
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
                unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
        int ret;
        struct ext4_map_blocks map;
        u8 blkbits = inode->i_blkbits;

        if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
                return -EINVAL;

        /* Inline-data inodes are not expected here; warn once and refuse. */
        if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
                return -ERANGE;

        /*
         * Calculate the first and last logical blocks respectively.
         */
        map.m_lblk = offset >> blkbits;
        map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
                          EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;

        if (flags & IOMAP_WRITE) {
                /*
                 * We check here if the blocks are already allocated, then we
                 * don't need to start a journal txn and we can directly return
                 * the mapping information. This could boost performance
                 * especially in multi-threaded overwrite requests.
                 */
                if (offset + length <= i_size_read(inode)) {
                        ret = ext4_map_blocks(NULL, inode, &map, 0);
                        /* ret > 0 here, so skipping the error check is safe. */
                        if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
                                goto out;
                }
                ret = ext4_iomap_alloc(inode, &map, flags);
        } else {
                ret = ext4_map_blocks(NULL, inode, &map, 0);
        }

        if (ret < 0)
                return ret;
out:
        /*
         * When inline encryption is enabled, sometimes I/O to an encrypted file
         * has to be broken up to guarantee DUN contiguity.  Handle this by
         * limiting the length of the mapping returned.
         */
        map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);

        ext4_set_iomap(inode, iomap, &map, offset, length, flags);

        return 0;
}

/*
 * ->iomap_begin handler of ext4_iomap_overwrite_ops.
 *
 * Even for writes we don't need to allocate blocks, so IOMAP_WRITE is
 * stripped and the request is handled as a read, which saves the overhead
 * of starting a transaction.  A successful lookup that is not IOMAP_MAPPED
 * triggers a one-time warning.  Returns the result of ext4_iomap_begin().
 */
static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
                loff_t length, unsigned flags, struct iomap *iomap,
                struct iomap *srcmap)
{
        int err = ext4_iomap_begin(inode, offset, length,
                                   flags & ~IOMAP_WRITE, iomap, srcmap);

        WARN_ON_ONCE(err == 0 && iomap->type != IOMAP_MAPPED);
        return err;
}

/*
 * ->iomap_end handler shared by ext4_iomap_ops and ext4_iomap_overwrite_ops.
 *
 * Returns -ENOTBLK when nothing was written for a request carrying
 * IOMAP_WRITE or IOMAP_DIRECT, and 0 otherwise.  @inode, @offset, @length
 * and @iomap are not used.
 */
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
                          ssize_t written, unsigned flags, struct iomap *iomap)
{
        bool write_or_direct = flags & (IOMAP_WRITE | IOMAP_DIRECT);

        /*
         * Check to see whether an error occurred while writing out the data to
         * the allocated blocks. If so, return the magic error code so that we
         * fallback to buffered I/O and attempt to complete the remainder of
         * the I/O. Any blocks that may have been allocated in preparation for
         * the direct I/O will be reused during buffered I/O.
         *
         * NOTE(review): the test fires when either flag is set, not only
         * when both are -- confirm that this is the intended condition.
         */
        if (write_or_direct && written == 0)
                return -ENOTBLK;

        return 0;
}

/*
 * Default ext4 iomap operations: mapping through ext4_iomap_begin() (which
 * may allocate for IOMAP_WRITE) and completion through ext4_iomap_end().
 */
const struct iomap_ops ext4_iomap_ops = {
        .iomap_begin                = ext4_iomap_begin,
        .iomap_end                = ext4_iomap_end,
};

/*
 * iomap operations for overwrites: ext4_iomap_overwrite_begin() drops
 * IOMAP_WRITE so the lookup never enters the allocation path.
 */
const struct iomap_ops ext4_iomap_overwrite_ops = {
        .iomap_begin                = ext4_iomap_overwrite_begin,
        .iomap_end                = ext4_iomap_end,
};

/*
 * ->iomap_begin handler of ext4_iomap_report_ops: a lookup-only mapping
 * query that passes neither a journal handle nor create flags to
 * ext4_map_blocks().
 *
 * Inline-data inodes are answered by ext4_inline_data_iomap() unless it
 * returns -EAGAIN, in which case the regular block lookup below is used.
 * A successful inline answer whose length does not reach @offset is turned
 * into -ENOENT.
 *
 * Returns 0 on success, -EINVAL when the starting block is beyond
 * EXT4_MAX_LOGICAL_BLOCK, or a negative error from the helpers.  @srcmap
 * is not used.
 */
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
                                   loff_t length, unsigned int flags,
                                   struct iomap *iomap, struct iomap *srcmap)
{
        int ret;
        struct ext4_map_blocks map;
        u8 blkbits = inode->i_blkbits;

        if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
                return -EINVAL;

        if (ext4_has_inline_data(inode)) {
                ret = ext4_inline_data_iomap(inode, iomap);
                if (ret != -EAGAIN) {
                        /* Request starts past the end of the inline extent. */
                        if (ret == 0 && offset >= iomap->length)
                                ret = -ENOENT;
                        return ret;
                }
        }

        /*
         * Calculate the first and last logical block respectively.
         */
        map.m_lblk = offset >> blkbits;
        map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
                          EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;

        /*
         * Fiemap callers may call for offset beyond s_bitmap_maxbytes.
         * So handle it here itself instead of querying ext4_map_blocks().
         * Since ext4_map_blocks() will warn about it and will return
         * -EIO error.
         */
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

                if (offset >= sbi->s_bitmap_maxbytes) {
                        /* No flags set: ext4_set_iomap() reports a hole. */
                        map.m_flags = 0;
                        goto set_iomap;
                }
        }

        ret = ext4_map_blocks(NULL, inode, &map, 0);
        if (ret < 0)
                return ret;
set_iomap:
        ext4_set_iomap(inode, iomap, &map, offset, length, flags);

        return 0;
}

/*
 * Lookup-only iomap operations; no ->iomap_end is provided.  Within this
 * file they are used by ext4_iomap_swap_activate().
 */
const struct iomap_ops ext4_iomap_report_ops = {
        .iomap_begin = ext4_iomap_begin_report,
};

/*
 * For data=journal mode, folio should be marked dirty only when it was
 * writeably mapped. When that happens, it was already attached to the
 * transaction and marked as jbddirty (we take care of this in
 * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings
 * so we should have nothing to do here, except for the case when someone
 * had the page pinned and dirtied the page through this pin (e.g. by doing
 * direct IO to it). In that case we'd need to attach buffers here to the
 * transaction but we cannot due to lock ordering.  We cannot just dirty the
 * folio and leave attached buffers clean, because the buffers' dirty state is
 * "definitive".  We cannot just set the buffers dirty or jbddirty because all
 * the journalling code will explode.  So what we do is to mark the folio
 * "pending dirty" and next time ext4_writepages() is called, attach buffers
 * to the transaction appropriately.
 */
static bool ext4_journalled_dirty_folio(struct address_space *mapping,
                struct folio *folio)
{
        /* A folio dirtied in data=journal mode is expected to have buffers. */
        WARN_ON_ONCE(!folio_buffers(folio));
        /*
         * A folio that may be DMA-pinned is flagged "checked", i.e. marked
         * "pending dirty", instead of touching its buffers here.
         */
        if (folio_maybe_dma_pinned(folio))
                folio_set_checked(folio);
        /* Returns whatever filemap_dirty_folio() reports. */
        return filemap_dirty_folio(mapping, folio);
}

/*
 * ->dirty_folio for the non-data-journalling address space operations.
 * Warns once if the folio is neither locked nor already dirty, or if it
 * has no buffers attached, then defers to block_dirty_folio() and returns
 * its result.
 */
static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio)
{
        WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio));
        WARN_ON_ONCE(!folio_buffers(folio));
        return block_dirty_folio(mapping, folio);
}

/*
 * ->swap_activate: hand swapfile activation to iomap_swapfile_activate(),
 * which queries block mappings through ext4_iomap_report_ops.  Returns its
 * result unchanged.
 */
static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
                                    struct file *file, sector_t *span)
{
        return iomap_swapfile_activate(sis, file, span,
                                       &ext4_iomap_report_ops);
}

/*
 * Address space operations installed by ext4_set_aops() for inodes that
 * do not journal data, are not DAX, and live on a filesystem mounted
 * without the DELALLOC option.
 */
static const struct address_space_operations ext4_aops = {
        .read_folio                = ext4_read_folio,
        .readahead                = ext4_readahead,
        .writepages                = ext4_writepages,
        .write_begin                = ext4_write_begin,
        .write_end                = ext4_write_end,
        .dirty_folio                = ext4_dirty_folio,
        .bmap                        = ext4_bmap,
        .invalidate_folio        = ext4_invalidate_folio,
        .release_folio                = ext4_release_folio,
        .migrate_folio                = buffer_migrate_folio,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_folio        = generic_error_remove_folio,
        .swap_activate                = ext4_iomap_swap_activate,
};

/*
 * Address space operations installed by ext4_set_aops() for inodes in
 * EXT4_INODE_JOURNAL_DATA_MODE.  Compared with ext4_aops the write_end,
 * dirty_folio, invalidate_folio and migrate_folio hooks differ.
 */
static const struct address_space_operations ext4_journalled_aops = {
        .read_folio                = ext4_read_folio,
        .readahead                = ext4_readahead,
        .writepages                = ext4_writepages,
        .write_begin                = ext4_write_begin,
        .write_end                = ext4_journalled_write_end,
        .dirty_folio                = ext4_journalled_dirty_folio,
        .bmap                        = ext4_bmap,
        .invalidate_folio        = ext4_journalled_invalidate_folio,
        .release_folio                = ext4_release_folio,
        .migrate_folio                = buffer_migrate_folio_norefs,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_folio        = generic_error_remove_folio,
        .swap_activate                = ext4_iomap_swap_activate,
};

/*
 * Address space operations installed by ext4_set_aops() for non-DAX,
 * non-data-journalling inodes when the DELALLOC mount option is set.
 * Compared with ext4_aops only write_begin and write_end differ.
 */
static const struct address_space_operations ext4_da_aops = {
        .read_folio                = ext4_read_folio,
        .readahead                = ext4_readahead,
        .writepages                = ext4_writepages,
        .write_begin                = ext4_da_write_begin,
        .write_end                = ext4_da_write_end,
        .dirty_folio                = ext4_dirty_folio,
        .bmap                        = ext4_bmap,
        .invalidate_folio        = ext4_invalidate_folio,
        .release_folio                = ext4_release_folio,
        .migrate_folio                = buffer_migrate_folio,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_folio        = generic_error_remove_folio,
        .swap_activate                = ext4_iomap_swap_activate,
};

/*
 * Address space operations installed by ext4_set_aops() for DAX inodes
 * that do not journal data.  Only writepages, dirty_folio, bmap and
 * swap_activate are provided.
 */
static const struct address_space_operations ext4_dax_aops = {
        .writepages                = ext4_dax_writepages,
        .dirty_folio                = noop_dirty_folio,
        .bmap                        = ext4_bmap,
        .swap_activate                = ext4_iomap_swap_activate,
};

void ext4_set_aops(struct inode *inode)
{
        switch (ext4_inode_journal_mode(inode)) {
        case EXT4_INODE_ORDERED_DATA_MODE:
        case EXT4_INODE_WRITEBACK_DATA_MODE:
                break;
        case EXT4_INODE_JOURNAL_DATA_MODE:
                inode->i_mapping->a_ops = &ext4_journalled_aops;
                return;
        default:
                BUG();
        }
        if (IS_DAX(inode))
                inode->i_mapping->a_ops = &ext4_dax_aops;
        else if (test_opt(inode->i_sb, DELALLOC))
                inode->i_mapping->a_ops = &ext4_da_aops;
        else
                inode->i_mapping->a_ops = &ext4_aops;
}

/*
 * Zero @length bytes starting at file offset @from inside a single block of
 * the page cache and dirty the affected buffer.
 *
 * Here we can't skip an unwritten buffer even though it usually reads zero
 * because it might have data in pagecache (eg, if called from ext4_zero_range,
 * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a
 * racing writeback can come later and flush the stale pagecache to disk.
 *
 * @handle:  journal handle passed to the journalling helpers
 * @mapping: address space of the file
 * @from:    file offset of the first byte to zero
 * @length:  number of bytes to zero
 *
 * Returns 0 on success or when there is nothing to do (the buffer was freed
 * or the block is a hole); otherwise the error from the failing step.
 */
static int __ext4_block_zero_page_range(handle_t *handle,
                struct address_space *mapping, loff_t from, loff_t length)
{
        ext4_fsblk_t index = from >> PAGE_SHIFT;
        unsigned offset = from & (PAGE_SIZE-1);
        unsigned blocksize, pos;
        ext4_lblk_t iblock;
        struct inode *inode = mapping->host;
        struct buffer_head *bh;
        struct folio *folio;
        int err = 0;

        /* Find or create the folio for this page index; it comes back locked. */
        folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT,
                                    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                    mapping_gfp_constraint(mapping, ~__GFP_FS));
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        blocksize = inode->i_sb->s_blocksize;

        /* Logical block number of the first block in this page. */
        iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);

        bh = folio_buffers(folio);
        if (!bh)
                bh = create_empty_buffers(folio, blocksize, 0);

        /* Find the buffer that contains "offset" */
        pos = blocksize;
        while (offset >= pos) {
                bh = bh->b_this_page;
                iblock++;
                pos += blocksize;
        }
        if (buffer_freed(bh)) {
                BUFFER_TRACE(bh, "freed: skip");
                goto unlock;
        }
        if (!buffer_mapped(bh)) {
                BUFFER_TRACE(bh, "unmapped");
                /* Lookup only (create == 0); the return value is not checked. */
                ext4_get_block(inode, iblock, bh, 0);
                /* unmapped? It's a hole - nothing to do */
                if (!buffer_mapped(bh)) {
                        BUFFER_TRACE(bh, "still unmapped");
                        goto unlock;
                }
        }

        /* Ok, it's mapped. Make sure it's up-to-date */
        if (folio_test_uptodate(folio))
                set_buffer_uptodate(bh);

        if (!buffer_uptodate(bh)) {
                /* Read the block so the bytes outside the range are valid. */
                err = ext4_read_bh_lock(bh, 0, true);
                if (err)
                        goto unlock;
                if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
                        /* We expect the key to be set. */
                        BUG_ON(!fscrypt_has_encryption_key(inode));
                        err = fscrypt_decrypt_pagecache_blocks(folio,
                                                               blocksize,
                                                               bh_offset(bh));
                        if (err) {
                                /* Contents are not usable plaintext. */
                                clear_buffer_uptodate(bh);
                                goto unlock;
                        }
                }
        }
        if (ext4_should_journal_data(inode)) {
                BUFFER_TRACE(bh, "get write access");
                err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
                                                    EXT4_JTR_NONE);
                if (err)
                        goto unlock;
        }
        folio_zero_range(folio, offset, length);
        BUFFER_TRACE(bh, "zeroed end of block");

        if (ext4_should_journal_data(inode)) {
                err = ext4_dirty_journalled_data(handle, bh);
        } else {
                err = 0;
                mark_buffer_dirty(bh);
                /* Ordered mode: register the range with the jbd2 inode. */
                if (ext4_should_order_data(inode))
                        err = ext4_jbd2_inode_add_write(handle, inode, from,
                                        length);
        }

unlock:
        /* Common exit: drop the folio lock and the reference taken above. */
        folio_unlock(folio);
        folio_put(folio);
        return err;
}

/*
 * ext4_block_zero_page_range() zeroes out a mapping of length 'length'
 * starting from file offset 'from'.  The range to be zeroed must be
 * contained within one block.  If the specified range exceeds the end of
 * the block (or 'length' is negative) it is shortened to the end of the
 * block that corresponds to 'from'.
 *
 * DAX inodes are handled by dax_zero_range(); all others by
 * __ext4_block_zero_page_range().  Returns the helper's result.
 */
static int ext4_block_zero_page_range(handle_t *handle,
                struct address_space *mapping, loff_t from, loff_t length)
{
        struct inode *inode = mapping->host;
        unsigned blocksize = inode->i_sb->s_blocksize;
        unsigned in_page = from & (PAGE_SIZE-1);
        /* Bytes left from 'from' to the end of its block. */
        unsigned room = blocksize - (in_page & (blocksize - 1));

        /*
         * correct length if it does not fall between
         * 'from' and the end of the block
         */
        if (length < 0 || length > room)
                length = room;

        if (!IS_DAX(inode))
                return __ext4_block_zero_page_range(handle, mapping, from,
                                                    length);

        return dax_zero_range(inode, from, length, NULL, &ext4_iomap_ops);
}

/*
 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
 * up to the end of the block which corresponds to `from'.
 * This is required during truncate. We need to physically zero the tail end
 * of that block so it doesn't yield old data if the file is later grown.
 *
 * Returns 0 without doing anything for an encrypted inode whose key is not
 * available; otherwise the result of ext4_block_zero_page_range().
 */
static int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from)
{
        struct inode *inode = mapping->host;
        unsigned in_page = from & (PAGE_SIZE-1);
        unsigned blocksize;
        unsigned tail;

        /* If we are processing an encrypted inode during orphan list handling */
        if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
                return 0;

        blocksize = inode->i_sb->s_blocksize;
        /* Distance from `from' to the end of its block. */
        tail = blocksize - (in_page & (blocksize - 1));

        return ext4_block_zero_page_range(handle, mapping, from, tail);
}

/*
 * ext4_zero_partial_blocks() zeroes the partially covered blocks at the edges
 * of the byte range [lstart, lstart + length - 1].
 *
 * If the whole range sits inside one block and does not cover it completely,
 * only that piece is zeroed.  Otherwise the unaligned head (if any) and the
 * unaligned tail (if any) are zeroed separately; fully covered blocks are not
 * touched here.
 *
 * Returns 0 on success or the first error reported by
 * ext4_block_zero_page_range().
 */
int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t length)
{
        struct super_block *sb = inode->i_sb;
        struct address_space *mapping = inode->i_mapping;
        unsigned long blk_mask = sb->s_blocksize - 1;
        loff_t last_byte = lstart + length - 1;
        /* offsets of the first and last byte within their blocks */
        unsigned head_off = lstart & blk_mask;
        unsigned tail_off = last_byte & blk_mask;
        ext4_fsblk_t first_blk = lstart >> sb->s_blocksize_bits;
        ext4_fsblk_t last_blk = last_byte >> sb->s_blocksize_bits;
        /* non-zero when the range stops short of the end of its last block */
        int tail_partial = tail_off != blk_mask;
        int err = 0;

        /* Range confined to a single, partially covered block */
        if (first_blk == last_blk && (head_off || tail_partial))
                return ext4_block_zero_page_range(handle, mapping,
                                                  lstart, length);

        /*
         * Unaligned head: the callee shortens the block-sized length to the
         * end of the block containing lstart.
         */
        if (head_off) {
                err = ext4_block_zero_page_range(handle, mapping,
                                                 lstart, sb->s_blocksize);
                if (err)
                        return err;
        }

        /* Tail ends exactly on a block boundary: nothing more to zero */
        if (!tail_partial)
                return err;

        /* Unaligned tail: zero from the start of the last block to last_byte */
        return ext4_block_zero_page_range(handle, mapping,
                                          last_byte - tail_off,
                                          tail_off + 1);
}

/*
 * ext4_can_truncate() reports whether the inode's type allows truncation.
 *
 * Returns 1 for regular files and directories, 1 for symlinks that are not
 * fast symlinks, and 0 for everything else.
 */
int ext4_can_truncate(struct inode *inode)
{
        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
                return 1;

        if (!S_ISLNK(inode->i_mode))
                return 0;

        /* Symlinks qualify only when they are not fast symlinks. */
        return !ext4_inode_is_fast_symlink(inode);
}

/*
 * i_disksize must be brought up to date before the page cache is truncated
 * for hole punching or zero range.  The update may have been postponed to
 * the submission of writeback, and that writeback will never happen once the
 * page cache has been truncated, so the update would otherwise be lost.
 *
 * Nothing is done unless the range [offset, offset + len] reaches i_size and
 * i_disksize is still smaller than i_size.
 *
 * Returns 0 when no update is needed, a negative errno if the journal handle
 * cannot be started, or the result of ext4_mark_inode_dirty().
 */
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
                                      loff_t len)
{
        loff_t isize = i_size_read(inode);
        handle_t *handle;
        int err;

        WARN_ON(!inode_is_locked(inode));

        if (offset > isize || offset + len < isize ||
            EXT4_I(inode)->i_disksize >= isize)
                return 0;

        handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        ext4_update_i_disksize(inode, isize);
        err = ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);

        return err;
}

/*
 * Wait step passed to ___wait_var_event() by ext4_break_layouts(): release
 * the mapping's invalidate lock, call schedule(), and take the invalidate
 * lock again before returning, so the caller holds it on return just as it
 * did on entry.
 */
static void ext4_wait_dax_page(struct inode *inode)
{
        filemap_invalidate_unlock(inode->i_mapping);
        schedule();
        filemap_invalidate_lock(inode->i_mapping);
}

/*
 * ext4_break_layouts() loops until dax_layout_busy_page() no longer reports
 * a busy page for the inode's mapping.  For every busy page found it waits,
 * in TASK_INTERRUPTIBLE state, for the page's _refcount to read 1, using
 * ext4_wait_dax_page() as the wait step.
 *
 * The mapping's invalidate_lock must be held by the caller; if it is not,
 * a one-time warning is issued and -EINVAL is returned.
 *
 * Returns 0 once no busy page remains, or the non-zero result of the wait.
 */
int ext4_break_layouts(struct inode *inode)
{
        struct page *page;
        int error;

        if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
                return -EINVAL;

        for (;;) {
                page = dax_layout_busy_page(inode->i_mapping);
                if (!page)
                        return 0;

                error = ___wait_var_event(&page->_refcount,
                                atomic_read(&page->_refcount) == 1,
                                TASK_INTERRUPTIBLE, 0, 0,
                                ext4_wait_dax_page(inode));
                if (error != 0)
                        return error;
        }
}

/*
 * ext4_punch_hole: punches a hole in a file by releasing the blocks
 * associated with the given offset and length
 *
 * @file:   File whose inode the hole is punched in
 * @offset: The offset where the hole will begin
 * @length: The length of the hole
 *
 * Partially covered blocks at either edge of the hole are zeroed rather than
 * released; only blocks that lie entirely inside the hole are removed.
 *
 * Locking, in order of acquisition: the inode lock, the mapping's invalidate
 * lock, a journal handle, and (only when blocks are removed) i_data_sem.
 * The out_* labels release them in the reverse order.
 *
 * Returns: 0 on success or negative on failure
 */

int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        ext4_lblk_t first_block, stop_block;
        struct address_space *mapping = inode->i_mapping;
        loff_t first_block_offset, last_block_offset, max_length;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        handle_t *handle;
        unsigned int credits;
        int ret = 0, ret2 = 0;

        trace_ext4_punch_hole(inode, offset, length, 0);

        /*
         * Write out all dirty pages to avoid race conditions
         * Then release them.
         */
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                ret = filemap_write_and_wait_range(mapping, offset,
                                                   offset + length - 1);
                if (ret)
                        return ret;
        }

        inode_lock(inode);

        /* No need to punch hole beyond i_size (ret is still 0 here) */
        if (offset >= inode->i_size)
                goto out_mutex;

        /*
         * If the hole extends beyond i_size, set the hole
         * to end after the page that contains i_size
         */
        if (offset + length > inode->i_size) {
                length = inode->i_size +
                   PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
                   offset;
        }

        /*
         * For punch hole the length + offset needs to be within one block
         * before last range. Adjust the length if it goes beyond that limit.
         */
        max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
        if (offset + length > max_length)
                length = max_length - offset;

        /* Either end of the hole is not block aligned */
        if (offset & (sb->s_blocksize - 1) ||
            (offset + length) & (sb->s_blocksize - 1)) {
                /*
                 * Attach jinode to inode for jbd2 if we do any zeroing of
                 * partial block
                 */
                ret = ext4_inode_attach_jinode(inode);
                if (ret < 0)
                        goto out_mutex;

        }

        /* Wait all existing dio workers, newcomers will block on i_rwsem */
        inode_dio_wait(inode);

        ret = file_modified(file);
        if (ret)
                goto out_mutex;

        /*
         * Prevent page faults from reinstantiating pages we have released from
         * page cache.
         */
        filemap_invalidate_lock(mapping);

        ret = ext4_break_layouts(inode);
        if (ret)
                goto out_dio;

        /*
         * Byte offsets of the first and last byte that belong to blocks lying
         * entirely inside the hole.
         */
        first_block_offset = round_up(offset, sb->s_blocksize);
        last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;

        /* Now release the pages and zero block aligned part of pages*/
        if (last_block_offset > first_block_offset) {
                ret = ext4_update_disksize_before_punch(inode, offset, length);
                if (ret)
                        goto out_dio;
                truncate_pagecache_range(inode, first_block_offset,
                                         last_block_offset);
        }

        /* Journal credits are estimated differently per mapping format */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                credits = ext4_writepage_trans_blocks(inode);
        else
                credits = ext4_blocks_for_truncate(inode);
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                ext4_std_error(sb, ret);
                goto out_dio;
        }

        /* Zero the partially covered blocks at the edges of the hole */
        ret = ext4_zero_partial_blocks(handle, inode, offset,
                                       length);
        if (ret)
                goto out_stop;

        /*
         * first_block: first logical block starting at or after offset.
         * stop_block:  logical block containing offset + length, i.e. one
         *              past the last block fully inside the hole.
         */
        first_block = (offset + sb->s_blocksize - 1) >>
                EXT4_BLOCK_SIZE_BITS(sb);
        stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);

        /* If there are blocks to remove, do it */
        if (stop_block > first_block) {
                ext4_lblk_t hole_len = stop_block - first_block;

                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode);

                ext4_es_remove_extent(inode, first_block, hole_len);

                /*
                 * Note the differing end arguments: the extent path is given
                 * stop_block - 1, the indirect path is given stop_block.
                 */
                if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                        ret = ext4_ext_remove_space(inode, first_block,
                                                    stop_block - 1);
                else
                        ret = ext4_ind_remove_space(handle, inode, first_block,
                                                    stop_block);

                /*
                 * The range is recorded as a hole in the extent status tree
                 * regardless of the value of ret.
                 */
                ext4_es_insert_extent(inode, first_block, hole_len, ~0,
                                      EXTENT_STATUS_HOLE);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
        ext4_fc_track_range(handle, inode, first_block, stop_block);
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);

        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        /* A failure to dirty the inode overrides any earlier value of ret */
        ret2 = ext4_mark_inode_dirty(handle, inode);
        if (unlikely(ret2))
                ret = ret2;
        if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
        ext4_journal_stop(handle);
out_dio:
        filemap_invalidate_unlock(mapping);
out_mutex:
        inode_unlock(inode);
        return ret;
}

/*
 * ext4_inode_attach_jinode() makes sure the inode has a jbd2 inode attached.
 *
 * Nothing is done when one is already attached or when the filesystem has no
 * journal.  Otherwise a jbd2 inode is allocated before taking i_lock, and
 * ei->jinode is re-checked under the lock.  If it has been set in the
 * meantime, the local allocation is freed after the lock is dropped.
 *
 * Returns 0 on success, or -ENOMEM if a jbd2 inode was still needed under
 * the lock but the allocation had failed.
 */
int ext4_inode_attach_jinode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct jbd2_inode *new_jinode;
        int ret = 0;

        if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
                return 0;

        new_jinode = jbd2_alloc_inode(GFP_KERNEL);

        spin_lock(&inode->i_lock);
        if (!ei->jinode) {
                if (new_jinode) {
                        /* Install ours; clearing the local marks it consumed */
                        ei->jinode = new_jinode;
                        jbd2_journal_init_jbd_inode(ei->jinode, inode);
                        new_jinode = NULL;
                } else {
                        ret = -ENOMEM;
                }
        }
        spin_unlock(&inode->i_lock);

        /* Still non-NULL only if ei->jinode was already set under the lock */
        if (unlikely(new_jinode != NULL))
                jbd2_free_inode(new_jinode);
        return ret;
}

/*
 * ext4_truncate()
 *
 * We block out ext4_get_block() block instantiations across the entire
 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
 * simultaneously on behalf of the same inode.
 *
 * As we work through the truncate and commit bits of it to the journal there
 * is one core, guiding principle: the file's tree must always be consistent on
 * disk.  We must be able to restart the truncate after a crash.
 *
 * The file's tree may be transiently inconsistent in memory (although it
 * probably isn't), but whenever we close off and commit a journal transaction,
 * the contents of (the filesystem + the journal) must be consistent and
 * restartable.  It's pretty simple, really: bottom up, right to left (although
 * left-to-right works OK too).
 *
 * Note that at recovery time, journal replay occurs *before* the restart of
 * truncate against the orphan inode list.
 *
 * The committed inode has the new, desired i_size (which is the same as
 * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
 * that this inode's truncate did not complete and it will again call
 * ext4_truncate() to have another go.  So there will be instantiated blocks
 * to the right of the truncation point in a crashed ext4 filesystem.  But
 * that's fine - as long as they are linked from the inode, the post-crash
 * ext4_truncate() run will find them and release them.
 *
 * Returns 0 on success (including when the inode type cannot be truncated or
 * the inline-data path handled everything), or a negative error code.  Once a
 * handle has been started, the first error seen is returned; an error from
 * the final ext4_mark_inode_dirty() is reported only if none occurred earlier.
 */
int ext4_truncate(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        unsigned int credits;
        int err = 0, err2;
        handle_t *handle;
        struct address_space *mapping = inode->i_mapping;

        /*
         * There is a possibility that we're either freeing the inode
         * or it's a completely new inode. In those cases we might not
         * have i_rwsem locked because it's not necessary.
         */
        if (!(inode->i_state & (I_NEW|I_FREEING)))
                WARN_ON(!inode_is_locked(inode));
        trace_ext4_truncate_enter(inode);

        if (!ext4_can_truncate(inode))
                goto out_trace;

        if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
                ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

        if (ext4_has_inline_data(inode)) {
                int has_inline = 1;

                /* Done if it failed or the inode still has inline data */
                err = ext4_inline_data_truncate(inode, &has_inline);
                if (err || has_inline)
                        goto out_trace;
        }

        /* If we zero-out tail of the page, we have to create jinode for jbd2 */
        if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
                err = ext4_inode_attach_jinode(inode);
                if (err)
                        goto out_trace;
        }

        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                credits = ext4_writepage_trans_blocks(inode);
        else
                credits = ext4_blocks_for_truncate(inode);

        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                err = PTR_ERR(handle);
                goto out_trace;
        }

        /*
         * Zero the tail of the last block when i_size is not block aligned.
         * NOTE: the return value of ext4_block_truncate_page() is not
         * checked here.
         */
        if (inode->i_size & (inode->i_sb->s_blocksize - 1))
                ext4_block_truncate_page(handle, mapping, inode->i_size);

        /*
         * We add the inode to the orphan list, so that if this
         * truncate spans multiple transactions, and we crash, we will
         * resume the truncate when the filesystem recovers.  It also
         * marks the inode dirty, to catch the new size.
         *
         * Implication: the file must always be in a sane, consistent
         * truncatable state while each transaction commits.
         */
        err = ext4_orphan_add(handle, inode);
        if (err)
                goto out_stop;

        down_write(&EXT4_I(inode)->i_data_sem);

        ext4_discard_preallocations(inode);

        /* Only the extent path reports an error; the indirect path does not */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                err = ext4_ext_truncate(handle, inode);
        else
                ext4_ind_truncate(handle, inode);

        /* ei is EXT4_I(inode), so this releases the semaphore taken above */
        up_write(&ei->i_data_sem);
        if (err)
                goto out_stop;

        if (IS_SYNC(inode))
                ext4_handle_sync(handle);

out_stop:
        /*
         * If this was a simple ftruncate() and the file will remain alive,
         * then we need to clear up the orphan record which we created above.
         * However, if this was a real unlink then we were called by
         * ext4_evict_inode(), and we allow that function to clean up the
         * orphan info for us.
         */
        if (inode->i_nlink)
                ext4_orphan_del(handle, inode);

        /* Timestamps and the dirty mark are applied on error paths as well */
        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        err2 = ext4_mark_inode_dirty(handle, inode);
        if (unlikely(err2 && !err))
                err = err2;
        ext4_journal_stop(handle);

out_trace:
        trace_ext4_truncate_exit(inode);
        return err;
}

/*
 * Read the inode's i_version: the raw value for inodes carrying
 * EXT4_EA_INODE_FL, the regular inode_peek_iversion() value otherwise.
 */
static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
{
        if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
                return inode_peek_iversion_raw(inode);

        return inode_peek_iversion(inode);
}

/*
 * ext4_inode_blocks_set() stores the in-memory i_blocks count into the
 * on-disk i_blocks_lo / i_blocks_high fields of @raw_inode.
 *
 *  - A count that fits in 32 bits is stored as is (multiple of 512 bytes)
 *    and EXT4_INODE_HUGE_FILE is cleared.
 *  - A larger count requires the huge_file feature; without it
 *    -EFSCORRUPTED is returned and @raw_inode is left untouched.
 *  - A count that fits in 48 bits is stored as is and EXT4_INODE_HUGE_FILE
 *    is cleared.
 *  - Anything bigger sets EXT4_INODE_HUGE_FILE and stores the count
 *    converted to file system block units.
 *
 * Returns 0 on success or -EFSCORRUPTED as described above.
 */
static int ext4_inode_blocks_set(struct ext4_inode *raw_inode,
                                 struct ext4_inode_info *ei)
{
        struct inode *inode = &(ei->vfs_inode);
        struct super_block *sb = inode->i_sb;
        u64 nr_sectors = READ_ONCE(inode->i_blocks);

        if (nr_sectors > ~0U) {
                /*
                 * This should never happen since sb->s_maxbytes should not
                 * have allowed this, sb->s_maxbytes was set according to the
                 * huge_file feature in ext4_fill_super().
                 */
                if (!ext4_has_feature_huge_file(sb))
                        return -EFSCORRUPTED;

                if (nr_sectors > 0xffffffffffffULL) {
                        ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
                        /* i_block is stored in file system block size */
                        nr_sectors >>= inode->i_blkbits - 9;
                        raw_inode->i_blocks_lo   = cpu_to_le32(nr_sectors);
                        raw_inode->i_blocks_high = cpu_to_le16(nr_sectors >> 32);
                        return 0;
                }
        }

        /*
         * The count fits in 48 bits (the upper 16 are zero when it fits in
         * 32) and is stored as a multiple of 512 bytes.
         */
        raw_inode->i_blocks_lo   = cpu_to_le32(nr_sectors);
        raw_inode->i_blocks_high = cpu_to_le16(nr_sectors >> 32);
        ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
        return 0;
}

/*
 * ext4_fill_raw_inode() copies the in-memory state of @inode into the
 * on-disk inode image @raw_inode and finishes by setting the inode checksum.
 *
 * Errors do not stop the fill: a failure from ext4_inode_blocks_set(), or a
 * non-default project id on a filesystem without the project feature
 * (-EFSCORRUPTED), is remembered in err while the remaining fields and the
 * checksum are still written.  The first error recorded is the one returned.
 *
 * Returns 0 on success or a negative error code as described above.
 */
static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        uid_t i_uid;
        gid_t i_gid;
        projid_t i_projid;
        int block;
        int err;

        /* Deliberately no early return on error; see the header comment */
        err = ext4_inode_blocks_set(raw_inode, ei);

        raw_inode->i_mode = cpu_to_le16(inode->i_mode);
        i_uid = i_uid_read(inode);
        i_gid = i_gid_read(inode);
        i_projid = from_kprojid(&init_user_ns, ei->i_projid);
        if (!(test_opt(inode->i_sb, NO_UID32))) {
                /* uid/gid are split into a low and a high 16-bit half */
                raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
                raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
                /*
                 * Fix up interoperability with old kernels. Otherwise,
                 * old inodes get re-used with the upper 16 bits of the
                 * uid/gid intact.
                 */
                if (ei->i_dtime && list_empty(&ei->i_orphan)) {
                        raw_inode->i_uid_high = 0;
                        raw_inode->i_gid_high = 0;
                } else {
                        raw_inode->i_uid_high =
                                cpu_to_le16(high_16_bits(i_uid));
                        raw_inode->i_gid_high =
                                cpu_to_le16(high_16_bits(i_gid));
                }
        } else {
                /* NO_UID32: only the low halves are used, high halves zeroed */
                raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
                raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
                raw_inode->i_uid_high = 0;
                raw_inode->i_gid_high = 0;
        }
        raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);

        EXT4_INODE_SET_CTIME(inode, raw_inode);
        EXT4_INODE_SET_MTIME(inode, raw_inode);
        EXT4_INODE_SET_ATIME(inode, raw_inode);
        EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);

        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
        /* The high 16 bits of i_file_acl are not written under HURD_COMPAT */
        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
                raw_inode->i_file_acl_high =
                        cpu_to_le16(ei->i_file_acl >> 32);
        raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
        /* The on-disk size comes from i_disksize */
        ext4_isize_set(raw_inode, ei->i_disksize);

        raw_inode->i_generation = cpu_to_le32(inode->i_generation);
        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
                /*
                 * Device nodes keep the encoded device number in i_block[]:
                 * the old encoding goes in slot 0, the new encoding in slot 1.
                 */
                if (old_valid_dev(inode->i_rdev)) {
                        raw_inode->i_block[0] =
                                cpu_to_le32(old_encode_dev(inode->i_rdev));
                        raw_inode->i_block[1] = 0;
                } else {
                        raw_inode->i_block[0] = 0;
                        raw_inode->i_block[1] =
                                cpu_to_le32(new_encode_dev(inode->i_rdev));
                        raw_inode->i_block[2] = 0;
                }
        } else if (!ext4_has_inline_data(inode)) {
                /* i_data[] is copied verbatim, with no endianness conversion */
                for (block = 0; block < EXT4_N_BLOCKS; block++)
                        raw_inode->i_block[block] = ei->i_data[block];
        }

        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
                u64 ivers = ext4_inode_peek_iversion(inode);

                /* Low 32 bits always; the high 32 bits only if the field fits */
                raw_inode->i_disk_version = cpu_to_le32(ivers);
                if (ei->i_extra_isize) {
                        if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
                                raw_inode->i_version_hi =
                                        cpu_to_le32(ivers >> 32);
                        raw_inode->i_extra_isize =
                                cpu_to_le16(ei->i_extra_isize);
                }
        }

        /* Keep an earlier error if there was one, else flag the corruption */
        if (i_projid != EXT4_DEF_PROJID &&
            !ext4_has_feature_project(inode->i_sb))
                err = err ?: -EFSCORRUPTED;

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
                raw_inode->i_projid = cpu_to_le32(i_projid);

        /* Must come last: the checksum is set after all fields are written */
        ext4_inode_csum_set(inode, raw_inode, ei);
        return err;
}

/*
 * ext4_get_inode_loc returns with an extra refcount against the inode's
 * underlying buffer_head on success. If we pass 'inode' and it does not
 * have in-inode xattr, we have all inode data in memory that is needed
 * to recreate the on-disk version of this inode.
 *
 * @sb:        superblock of the filesystem
 * @ino:       inode number to locate
 * @inode:     in-memory inode, or NULL; when given (and it has no in-inode
 *             xattr) the inode table read may be skipped as described in
 *             the body
 * @iloc:      filled in with block_group, offset and, on success, bh
 * @ret_block: if non-NULL, receives the block number when reading the
 *             inode table block fails with -EIO
 *
 * Returns 0 on success with iloc->bh set.  On failure iloc->bh is NULL and
 * the result is -EFSCORRUPTED (inode number out of range or invalid inode
 * table block), -EIO (no group descriptor, or the block could not be read)
 * or -ENOMEM (sb_getblk() failed).
 */
static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
                                struct inode *inode, struct ext4_iloc *iloc,
                                ext4_fsblk_t *ret_block)
{
        struct ext4_group_desc        *gdp;
        struct buffer_head        *bh;
        ext4_fsblk_t                block;
        struct blk_plug                plug;
        int                        inodes_per_block, inode_offset;

        iloc->bh = NULL;
        if (ino < EXT4_ROOT_INO ||
            ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
                return -EFSCORRUPTED;

        iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
        if (!gdp)
                return -EIO;

        /*
         * Figure out the offset within the block group inode table
         */
        inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
        inode_offset = ((ino - 1) %
                        EXT4_INODES_PER_GROUP(sb));
        /* Byte offset of this inode within its inode table block */
        iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);

        /* Sanity check the start of the group's inode table */
        block = ext4_inode_table(sb, gdp);
        if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
            (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
                ext4_error(sb, "Invalid inode table block %llu in "
                           "block_group %u", block, iloc->block_group);
                return -EFSCORRUPTED;
        }
        /* Advance to the table block that holds this inode */
        block += (inode_offset / inodes_per_block);

        bh = sb_getblk(sb, block);
        if (unlikely(!bh))
                return -ENOMEM;
        if (ext4_buffer_uptodate(bh))
                goto has_buffer;

        lock_buffer(bh);
        if (ext4_buffer_uptodate(bh)) {
                /* Someone brought it uptodate while we waited */
                unlock_buffer(bh);
                goto has_buffer;
        }

        /*
         * If we have all information of the inode in memory and this
         * is the only valid inode in the block, we need not read the
         * block.
         */
        if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
                struct buffer_head *bitmap_bh;
                int i, start;

                /*
                 * Index of the first inode sharing this table block; the
                 * mask relies on inodes_per_block being a power of two.
                 */
                start = inode_offset & ~(inodes_per_block - 1);

                /* Is the inode bitmap in cache? */
                bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
                if (unlikely(!bitmap_bh))
                        goto make_io;

                /*
                 * If the inode bitmap isn't in cache then the
                 * optimisation may end up performing two reads instead
                 * of one, so skip it.
                 */
                if (!buffer_uptodate(bitmap_bh)) {
                        brelse(bitmap_bh);
                        goto make_io;
                }
                /* Stop at the first other inode in this block that is in use */
                for (i = start; i < start + inodes_per_block; i++) {
                        if (i == inode_offset)
                                continue;
                        if (ext4_test_bit(i, bitmap_bh->b_data))
                                break;
                }
                brelse(bitmap_bh);
                if (i == start + inodes_per_block) {
                        struct ext4_inode *raw_inode =
                                (struct ext4_inode *) (bh->b_data + iloc->offset);

                        /* all other inodes are free, so skip I/O */
                        memset(bh->b_data, 0, bh->b_size);
                        /*
                         * Rebuild the on-disk image from memory unless the
                         * inode is new; the return value of
                         * ext4_fill_raw_inode() is not checked here.
                         */
                        if (!ext4_test_inode_state(inode, EXT4_STATE_NEW))
                                ext4_fill_raw_inode(inode, raw_inode);
                        set_buffer_uptodate(bh);
                        unlock_buffer(bh);
                        goto has_buffer;
                }
        }

make_io:
        /*
         * If we need to do any I/O, try to pre-readahead extra
         * blocks from the inode table.
         *
         * bh is still locked when we get here.
         */
        blk_start_plug(&plug);
        if (EXT4_SB(sb)->s_inode_readahead_blks) {
                ext4_fsblk_t b, end, table;
                unsigned num;
                __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;

                table = ext4_inode_table(sb, gdp);
                /* s_inode_readahead_blks is always a power of 2 */
                b = block & ~((ext4_fsblk_t) ra_blks - 1);
                /* Do not start before the beginning of the inode table */
                if (table > b)
                        b = table;
                end = b + ra_blks;
                /* Limit readahead to the part of the table holding used inodes */
                num = EXT4_INODES_PER_GROUP(sb);
                if (ext4_has_group_desc_csum(sb))
                        num -= ext4_itable_unused_count(sb, gdp);
                table += num / inodes_per_block;
                if (end > table)
                        end = table;
                while (b <= end)
                        ext4_sb_breadahead_unmovable(sb, b++);
        }

        /*
         * There are other valid inodes in the buffer, this inode
         * has in-inode xattrs, or we don't have this inode in memory.
         * Read the block from disk.
         */
        trace_ext4_load_inode(sb, ino);
        ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
        blk_finish_plug(&plug);
        wait_on_buffer(bh);
        ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO);
        if (!buffer_uptodate(bh)) {
                /* Report the failing block to the caller, then drop our ref */
                if (ret_block)
                        *ret_block = block;
                brelse(bh);
                return -EIO;
        }
has_buffer:
        iloc->bh = bh;
        return 0;
}

/*
 * Locate @inode's on-disk location while passing NULL as the in-memory
 * inode to __ext4_get_inode_loc().  A result of -EIO is additionally
 * reported through ext4_error_inode_block() with the block number handed
 * back by the lookup.
 *
 * Returns the result of __ext4_get_inode_loc() unchanged.
 */
static int __ext4_get_inode_loc_noinmem(struct inode *inode,
                                        struct ext4_iloc *iloc)
{
        ext4_fsblk_t bad_block = 0;
        int err = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
                                       &bad_block);

        if (err != -EIO)
                return err;

        ext4_error_inode_block(inode, bad_block, EIO,
                               "unable to read itable block");
        return err;
}

/*
 * Locate @inode's on-disk location, handing the in-memory inode itself to
 * __ext4_get_inode_loc().  A result of -EIO is additionally reported through
 * ext4_error_inode_block() with the block number handed back by the lookup.
 *
 * Returns the result of __ext4_get_inode_loc() unchanged.
 */
int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
        ext4_fsblk_t bad_block = 0;
        int err = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
                                       &bad_block);

        if (err != -EIO)
                return err;

        ext4_error_inode_block(inode, bad_block, EIO,
                               "unable to read itable block");
        return err;
}


/*
 * Locate inode number @ino on @sb without an in-memory inode: NULL is passed
 * to __ext4_get_inode_loc() both for the inode and for the failing-block
 * out-parameter, and its result is returned unchanged.
 */
int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
                          struct ext4_iloc *iloc)
{
        int ret;

        ret = __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL);
        return ret;
}

/*
 * Decide whether DAX should be enabled for @inode.
 *
 * DAX is refused when the DAX_NEVER mount option is set, for anything but a
 * regular file, for inodes that journal data, hold inline data, or carry the
 * ENCRYPT or VERITY inode flag, and when EXT4_FLAGS_BDEV_IS_DAX is not set
 * in the superblock info.  Past those checks, DAX is enabled if the
 * DAX_ALWAYS mount option is set or the inode has EXT4_INODE_DAX.
 */
static bool ext4_should_enable_dax(struct inode *inode)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        if (test_opt2(inode->i_sb, DAX_NEVER) || !S_ISREG(inode->i_mode))
                return false;

        /* Per-inode properties for which DAX is not enabled */
        if (ext4_should_journal_data(inode) ||
            ext4_has_inline_data(inode) ||
            ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT) ||
            ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
                return false;

        if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
                return false;

        /* The mount option wins; otherwise the per-inode flag decides */
        return test_opt(inode->i_sb, DAX_ALWAYS) ||
               ext4_test_inode_flag(inode, EXT4_INODE_DAX);
}

/*
 * ext4_set_inode_flags() translates the ext4 per-inode flags in
 * EXT4_I(inode)->i_flags into the matching VFS S_* flags and applies them
 * with inode_set_flags().
 *
 * @init: when true, S_DAX is also set if ext4_should_enable_dax() allows it.
 *        Calling with @init on an inode that already has DAX triggers a
 *        one-time warning.
 */
void ext4_set_inode_flags(struct inode *inode, bool init)
{
        unsigned int ext4_fl = EXT4_I(inode)->i_flags;
        unsigned int vfs_fl;

        WARN_ON_ONCE(IS_DAX(inode) && init);

        /* Because of the way inode_set_flags() works we must preserve S_DAX
         * here if already set. */
        vfs_fl = inode->i_flags & S_DAX;
        if (init && ext4_should_enable_dax(inode))
                vfs_fl |= S_DAX;

        vfs_fl |= (ext4_fl & EXT4_SYNC_FL) ? S_SYNC : 0;
        vfs_fl |= (ext4_fl & EXT4_APPEND_FL) ? S_APPEND : 0;
        vfs_fl |= (ext4_fl & EXT4_IMMUTABLE_FL) ? S_IMMUTABLE : 0;
        vfs_fl |= (ext4_fl & EXT4_NOATIME_FL) ? S_NOATIME : 0;
        vfs_fl |= (ext4_fl & EXT4_DIRSYNC_FL) ? S_DIRSYNC : 0;
        vfs_fl |= (ext4_fl & EXT4_ENCRYPT_FL) ? S_ENCRYPTED : 0;
        vfs_fl |= (ext4_fl & EXT4_CASEFOLD_FL) ? S_CASEFOLD : 0;
        vfs_fl |= (ext4_fl & EXT4_VERITY_FL) ? S_VERITY : 0;

        inode_set_flags(inode, vfs_fl,
                        S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
                        S_ENCRYPTED|S_CASEFOLD|S_VERITY);
}

/*
 * ext4_inode_blocks() decodes the block count stored in @raw_inode.
 *
 * Without the huge_file feature only i_blocks_lo is used.  With it, the
 * combined 48-bit i_blocks_high:i_blocks_lo value is used, and when the
 * inode has EXT4_INODE_HUGE_FILE set that value is in file system block
 * units and is scaled by (i_blkbits - 9) before being returned.
 */
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
                                  struct ext4_inode_info *ei)
{
        struct inode *inode = &(ei->vfs_inode);
        struct super_block *sb = inode->i_sb;
        blkcnt_t nblocks;

        if (!ext4_has_feature_huge_file(sb))
                return le32_to_cpu(raw_inode->i_blocks_lo);

        /* we are using combined 48 bit field */
        nblocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
                                le32_to_cpu(raw_inode->i_blocks_lo);

        /* i_blocks represent file system block size */
        if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE))
                nblocks <<= inode->i_blkbits - 9;

        return nblocks;
}

/*
 * ext4_iget_extra_inode() checks for the xattr magic directly after the
 * extra inode fields (EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize bytes into
 * @raw_inode).
 *
 * If the inode has xattr space and the magic matches, EXT4_STATE_XATTR is
 * set and inline data is looked up; EXT4_STATE_MAY_INLINE_DATA is set when
 * that lookup succeeds and the inode has inline data.  Otherwise
 * i_inline_off is reset to 0.
 *
 * Returns 0, or the error from ext4_find_inline_data_nolock().
 */
static inline int ext4_iget_extra_inode(struct inode *inode,
                                         struct ext4_inode *raw_inode,
                                         struct ext4_inode_info *ei)
{
        __le32 *magic = (void *)raw_inode +
                        EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
        int err;

        /* The magic is only read once the xattr space check has passed */
        if (!EXT4_INODE_HAS_XATTR_SPACE(inode) ||
            *magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
                EXT4_I(inode)->i_inline_off = 0;
                return 0;
        }

        ext4_set_inode_state(inode, EXT4_STATE_XATTR);
        err = ext4_find_inline_data_nolock(inode);
        if (!err && ext4_has_inline_data(inode))
                ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
        return err;
}

/*
 * ext4_get_projid() stores the inode's project id in *@projid.
 *
 * Returns 0 on success, or -EOPNOTSUPP (leaving *@projid untouched) when the
 * filesystem does not have the project feature.
 */
int ext4_get_projid(struct inode *inode, kprojid_t *projid)
{
        if (ext4_has_feature_project(inode->i_sb)) {
                *projid = EXT4_I(inode)->i_projid;
                return 0;
        }

        return -EOPNOTSUPP;
}

/*
 * ext4 manages i_version itself for ea inodes: the lower 32 bits of the
 * refcount are kept there.  So for an inode with EXT4_EA_INODE_FL set the
 * value is stored raw; every other inode goes through
 * inode_set_iversion_queried().
 */
static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
{
        if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
                inode_set_iversion_raw(inode, val);
                return;
        }

        inode_set_iversion_queried(inode, val);
}

/*
 * Check that an inode is consistent with the way the caller asked for it.
 *
 * Returns NULL when everything matches, otherwise a constant string naming
 * the first mismatch found:
 *  - EXT4_IGET_EA_INODE requested but the inode lacks EXT4_EA_INODE_FL, or
 *    it has in-inode xattrs (EXT4_STATE_XATTR) or an xattr block;
 *  - EXT4_IGET_EA_INODE not requested but EXT4_EA_INODE_FL is set;
 *  - the inode is bad and EXT4_IGET_BAD was not passed.
 */
static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        const bool want_ea_inode = flags & EXT4_IGET_EA_INODE;
        const bool has_ea_flag = ei->i_flags & EXT4_EA_INODE_FL;

        if (want_ea_inode && !has_ea_flag)
                return "missing EA_INODE flag";

        if (want_ea_inode &&
            (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
             ei->i_file_acl))
                return "ea_inode with extended attributes";

        if (!want_ea_inode && has_ea_flag)
                return "unexpected EA_INODE flag";

        if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
                return "unexpected bad inode w/o EXT4_IGET_BAD";

        return NULL;
}

/*
 * __ext4_iget - get the in-memory inode for inode number @ino on @sb.
 *
 * @sb:       superblock of the filesystem
 * @ino:      inode number to look up
 * @flags:    EXT4_IGET_* flags; EXT4_IGET_SPECIAL, EXT4_IGET_HANDLE,
 *            EXT4_IGET_EA_INODE and EXT4_IGET_BAD are examined here (the
 *            latter two via check_igot_inode())
 * @function: caller's function name, forwarded to the error reporting helpers
 * @line:     caller's line number, forwarded to the error reporting helpers
 *
 * The inode is obtained with iget_locked().  If it is not new (I_NEW clear)
 * it is only checked against @flags and returned.  Otherwise the raw on-disk
 * inode is located, validated and copied field by field into the in-memory
 * inode, after which the inode is unlocked and returned.
 *
 * Returns the inode on success.  On failure returns an ERR_PTR():
 * -ENOMEM if iget_locked() fails, -ESTALE, -EFSCORRUPTED or -EFSBADCRC for
 * the conditions checked below, or whatever error a helper returned.
 */
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
                          ext4_iget_flags flags, const char *function,
                          unsigned int line)
{
        struct ext4_iloc iloc;
        struct ext4_inode *raw_inode;
        struct ext4_inode_info *ei;
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        struct inode *inode;
        const char *err_str;
        journal_t *journal = EXT4_SB(sb)->s_journal;
        long ret;
        loff_t size;
        int block;
        uid_t i_uid;
        gid_t i_gid;
        projid_t i_projid;

        /*
         * Reject inode numbers outside [EXT4_ROOT_INO, s_inodes_count].
         * Unless EXT4_IGET_SPECIAL is set, also reject numbers below
         * EXT4_FIRST_INO (other than the root inode) and the quota and
         * orphan file inode numbers recorded in the superblock.
         */
        if ((!(flags & EXT4_IGET_SPECIAL) &&
             ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
              ino == le32_to_cpu(es->s_usr_quota_inum) ||
              ino == le32_to_cpu(es->s_grp_quota_inum) ||
              ino == le32_to_cpu(es->s_prj_quota_inum) ||
              ino == le32_to_cpu(es->s_orphan_file_inum))) ||
            (ino < EXT4_ROOT_INO) ||
            (ino > le32_to_cpu(es->s_inodes_count))) {
                /* With EXT4_IGET_HANDLE: -ESTALE, no filesystem error logged */
                if (flags & EXT4_IGET_HANDLE)
                        return ERR_PTR(-ESTALE);
                __ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
                             "inode #%lu: comm %s: iget: illegal inode #",
                             ino, current->comm);
                return ERR_PTR(-EFSCORRUPTED);
        }

        inode = iget_locked(sb, ino);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        /*
         * Not a new inode: nothing to read from disk, only verify that it
         * matches what the caller asked for.
         */
        if (!(inode->i_state & I_NEW)) {
                if ((err_str = check_igot_inode(inode, flags)) != NULL) {
                        ext4_error_inode(inode, function, line, 0, err_str);
                        iput(inode);
                        return ERR_PTR(-EFSCORRUPTED);
                }
                return inode;
        }

        ei = EXT4_I(inode);
        /* bad_inode calls brelse(iloc.bh) on every failure path below */
        iloc.bh = NULL;

        ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
        if (ret < 0)
                goto bad_inode;
        raw_inode = ext4_raw_inode(&iloc);

        /*
         * For EXT4_IGET_HANDLE lookups an on-disk inode with zero link count
         * and zero mode is reported as stale.
         */
        if ((flags & EXT4_IGET_HANDLE) &&
            (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
                ret = -ESTALE;
                goto bad_inode;
        }

        /*
         * With inodes larger than EXT4_GOOD_OLD_INODE_SIZE, i_extra_isize
         * must fit inside the on-disk inode and be a multiple of 4.
         */
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
                if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
                        EXT4_INODE_SIZE(inode->i_sb) ||
                    (ei->i_extra_isize & 3)) {
                        ext4_error_inode(inode, function, line, 0,
                                         "iget: bad extra_isize %u "
                                         "(inode size %u)",
                                         ei->i_extra_isize,
                                         EXT4_INODE_SIZE(inode->i_sb));
                        ret = -EFSCORRUPTED;
                        goto bad_inode;
                }
        } else
                ei->i_extra_isize = 0;

        /* Precompute checksum seed for inode metadata */
        if (ext4_has_metadata_csum(sb)) {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
                __u32 csum;
                __le32 inum = cpu_to_le32(inode->i_ino);
                __le32 gen = raw_inode->i_generation;
                /* seed = chksum(chksum(s_csum_seed, inode number), generation) */
                csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
                                   sizeof(inum));
                ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
                                              sizeof(gen));
        }

        /*
         * A failed (or simulated-failed) checksum verification is fatal
         * unless EXT4_FC_REPLAY is set in the mount state.
         */
        if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
            ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
             (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
                ext4_error_inode_err(inode, function, line, 0,
                                EFSBADCRC, "iget: checksum invalid");
                ret = -EFSBADCRC;
                goto bad_inode;
        }

        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
        i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
        i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
        /*
         * The project ID is read from disk only if the project feature is
         * enabled and the field fits in this inode; otherwise the default
         * project ID is used.
         */
        if (ext4_has_feature_project(sb) &&
            EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
                i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
        else
                i_projid = EXT4_DEF_PROJID;

        /* Merge in the high 16 bits of uid/gid unless NO_UID32 is set */
        if (!(test_opt(inode->i_sb, NO_UID32))) {
                i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
                i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
        }
        i_uid_write(inode, i_uid);
        i_gid_write(inode, i_gid);
        ei->i_projid = make_kprojid(&init_user_ns, i_projid);
        set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));

        ext4_clear_state_flags(ei);        /* Only relevant on 32-bit archs */
        ei->i_inline_off = 0;
        ei->i_dir_start_lookup = 0;
        ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
        /* We now have enough fields to check if the inode was active or not.
         * This is needed because nfsd might try to access dead inodes
         * the test is the same one that e2fsck uses
         * NeilBrown 1999oct15
         */
        if (inode->i_nlink == 0) {
                if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL ||
                     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
                    ino != EXT4_BOOT_LOADER_INO) {
                        /* this inode is deleted or unallocated */
                        if (flags & EXT4_IGET_SPECIAL) {
                                ext4_error_inode(inode, function, line, 0,
                                                 "iget: special inode unallocated");
                                ret = -EFSCORRUPTED;
                        } else
                                ret = -ESTALE;
                        goto bad_inode;
                }
                /* The only unlinked inodes we let through here have
                 * valid i_mode and are being read by the orphan
                 * recovery code: that's fine, we're about to complete
                 * the process of deleting those.
                 * OR it is the EXT4_BOOT_LOADER_INO which is
                 * not initialized on a new filesystem. */
        }
        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
        ext4_set_inode_flags(inode, true);
        inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
        /* The xattr block number has a high part only with the 64bit feature */
        ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
        if (ext4_has_feature_64bit(sb))
                ei->i_file_acl |=
                        ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
        inode->i_size = ext4_isize(sb, raw_inode);
        /* A negative size is treated as on-disk corruption */
        if ((size = i_size_read(inode)) < 0) {
                ext4_error_inode(inode, function, line, 0,
                                 "iget: bad i_size value: %lld", size);
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        /*
         * If dir_index is not enabled but there's dir with INDEX flag set,
         * we'd normally treat htree data as empty space. But with metadata
         * checksumming that corrupts checksums so forbid that.
         */
        if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
            ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
                ext4_error_inode(inode, function, line, 0,
                         "iget: Dir with htree data on filesystem without dir_index feature.");
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        ei->i_disksize = inode->i_size;
#ifdef CONFIG_QUOTA
        ei->i_reserved_quota = 0;
#endif
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
        ei->i_block_group = iloc.block_group;
        ei->i_last_alloc_group = ~0;
        /*
         * NOTE! The in-memory inode i_data array is in little-endian order
         * even on big-endian machines: we do NOT byteswap the block numbers!
         */
        for (block = 0; block < EXT4_N_BLOCKS; block++)
                ei->i_data[block] = raw_inode->i_block[block];
        INIT_LIST_HEAD(&ei->i_orphan);
        ext4_fc_init_inode(&ei->vfs_inode);

        /*
         * Set transaction id's of transactions that have to be committed
         * to finish f[data]sync. We set them to currently running transaction
         * as we cannot be sure that the inode or some of its metadata isn't
         * part of the transaction - the inode could have been reclaimed and
         * now it is reread from disk.
         */
        if (journal) {
                transaction_t *transaction;
                tid_t tid;

                /*
                 * Prefer the running transaction, then the committing one;
                 * with neither, fall back to j_commit_sequence.
                 */
                read_lock(&journal->j_state_lock);
                if (journal->j_running_transaction)
                        transaction = journal->j_running_transaction;
                else
                        transaction = journal->j_committing_transaction;
                if (transaction)
                        tid = transaction->t_tid;
                else
                        tid = journal->j_commit_sequence;
                read_unlock(&journal->j_state_lock);
                ei->i_sync_tid = tid;
                ei->i_datasync_tid = tid;
        }

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                if (ei->i_extra_isize == 0) {
                        /* The extra space is currently unused. Use it. */
                        BUILD_BUG_ON(sizeof(struct ext4_inode) & 3);
                        ei->i_extra_isize = sizeof(struct ext4_inode) -
                                            EXT4_GOOD_OLD_INODE_SIZE;
                } else {
                        /* Check for in-inode xattrs and inline data */
                        ret = ext4_iget_extra_inode(inode, raw_inode, ei);
                        if (ret)
                                goto bad_inode;
                }
        }

        EXT4_INODE_GET_CTIME(inode, raw_inode);
        EXT4_INODE_GET_ATIME(inode, raw_inode);
        EXT4_INODE_GET_MTIME(inode, raw_inode);
        EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);

        /*
         * Unless mounted HURD_COMPAT, load i_version from the on-disk
         * i_disk_version, extended with i_version_hi when that field fits
         * in the inode.
         */
        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
                u64 ivers = le32_to_cpu(raw_inode->i_disk_version);

                if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                        if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
                                ivers |=
                    (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
                }
                ext4_inode_set_iversion_queried(inode, ivers);
        }

        ret = 0;
        /* A non-zero xattr block number must be a valid block for this inode */
        if (ei->i_file_acl &&
            !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) {
                ext4_error_inode(inode, function, line, 0,
                                 "iget: bad extended attribute block %llu",
                                 ei->i_file_acl);
                ret = -EFSCORRUPTED;
                goto bad_inode;
        } else if (!ext4_has_inline_data(inode)) {
                /* validate the block references in the inode */
                /*
                 * Done only outside fast-commit replay, and only for regular
                 * files, directories and symlinks that are not fast symlinks.
                 */
                if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
                        (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                        (S_ISLNK(inode->i_mode) &&
                        !ext4_inode_is_fast_symlink(inode)))) {
                        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                                ret = ext4_ext_check_inode(inode);
                        else
                                ret = ext4_ind_check_inode(inode);
                }
        }
        if (ret)
                goto bad_inode;

        /* Install the inode/file operations that match the file type */
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
                inode->i_fop = &ext4_file_operations;
                ext4_set_aops(inode);
        } else if (S_ISDIR(inode->i_mode)) {
                inode->i_op = &ext4_dir_inode_operations;
                inode->i_fop = &ext4_dir_operations;
        } else if (S_ISLNK(inode->i_mode)) {
                /* VFS does not allow setting these so must be corruption */
                if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
                        ext4_error_inode(inode, function, line, 0,
                                         "iget: immutable or append flags "
                                         "not allowed on symlinks");
                        ret = -EFSCORRUPTED;
                        goto bad_inode;
                }
                if (IS_ENCRYPTED(inode)) {
                        inode->i_op = &ext4_encrypted_symlink_inode_operations;
                } else if (ext4_inode_is_fast_symlink(inode)) {
                        /* Fast symlink: the target is stored in i_data itself */
                        inode->i_link = (char *)ei->i_data;
                        inode->i_op = &ext4_fast_symlink_inode_operations;
                        nd_terminate_link(ei->i_data, inode->i_size,
                                sizeof(ei->i_data) - 1);
                } else {
                        inode->i_op = &ext4_symlink_inode_operations;
                }
        } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
              S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
                inode->i_op = &ext4_special_inode_operations;
                /*
                 * Device number: old encoding in i_block[0] when that is
                 * non-zero, otherwise new encoding in i_block[1].
                 */
                if (raw_inode->i_block[0])
                        init_special_inode(inode, inode->i_mode,
                           old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
                else
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
        } else if (ino == EXT4_BOOT_LOADER_INO) {
                /* Boot loader inode with an unrecognised mode: mark it bad */
                make_bad_inode(inode);
        } else {
                ret = -EFSCORRUPTED;
                ext4_error_inode(inode, function, line, 0,
                                 "iget: bogus i_mode (%o)", inode->i_mode);
                goto bad_inode;
        }
        if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) {
                ext4_error_inode(inode, function, line, 0,
                                 "casefold flag without casefold feature");
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        /* Same consistency check against @flags as for an already cached inode */
        if ((err_str = check_igot_inode(inode, flags)) != NULL) {
                ext4_error_inode(inode, function, line, 0, err_str);
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }

        /* Success: drop the inode table buffer and unlock the new inode */
        brelse(iloc.bh);
        unlock_new_inode(inode);
        return inode;

bad_inode:
        /* Common failure exit: ret holds the negative errno to return */
        brelse(iloc.bh);
        iget_failed(inode);
        return ERR_PTR(ret);
}

/*
 * If inode number @ino is present in the inode cache and is dirty only
 * because of its timestamps, clear I_DIRTY_TIME and copy its c/m/atime into
 * @raw_inode, refreshing the raw inode checksum.
 *
 * @orig_ino is only passed on to the tracepoint.  The lookup uses
 * find_inode_by_ino_rcu().
 */
static void __ext4_update_other_inode_time(struct super_block *sb,
                                           unsigned long orig_ino,
                                           unsigned long ino,
                                           struct ext4_inode *raw_inode)
{
        struct inode *inode = find_inode_by_ino_rcu(sb, ino);
        struct ext4_inode_info *ei;

        /* Unlocked test first; it is repeated under i_lock below. */
        if (!inode || !inode_is_dirtytime_only(inode))
                return;

        spin_lock(&inode->i_lock);
        if (!inode_is_dirtytime_only(inode)) {
                /* State changed before the lock was taken. */
                spin_unlock(&inode->i_lock);
                return;
        }
        inode->i_state &= ~I_DIRTY_TIME;
        spin_unlock(&inode->i_lock);

        ei = EXT4_I(inode);
        spin_lock(&ei->i_raw_lock);
        EXT4_INODE_SET_CTIME(inode, raw_inode);
        EXT4_INODE_SET_MTIME(inode, raw_inode);
        EXT4_INODE_SET_ATIME(inode, raw_inode);
        ext4_inode_csum_set(inode, raw_inode, ei);
        spin_unlock(&ei->i_raw_lock);

        trace_ext4_other_inode_update_time(inode, orig_ino);
}

/*
 * Opportunistically write back the timestamps of the other inodes that live
 * in the same inode table block as @orig_ino.  @buf points at the start of
 * that block's data.
 */
static void ext4_update_other_inodes_time(struct super_block *sb,
                                          unsigned long orig_ino, char *buf)
{
        int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
        int inode_size = EXT4_INODE_SIZE(sb);
        unsigned long ino;
        int slot;

        /*
         * Inode numbers are one-based, so the first inode of an inode table
         * block is (n*16 + 1) when using 4k blocks and 256 byte inodes.
         * Round @orig_ino down to that first inode.
         */
        ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;

        rcu_read_lock();
        for (slot = 0; slot < inodes_per_block; slot++) {
                /* The caller's own inode is skipped. */
                if (ino != orig_ino)
                        __ext4_update_other_inode_time(sb, orig_ino, ino,
                                                (struct ext4_inode *)buf);
                ino++;
                buf += inode_size;
        }
        rcu_read_unlock();
}

/*
 * Copy the in-memory inode into its on-disk slot in the buffer cache and
 * mark that buffer dirty through @handle.
 *
 * The caller's reference to iloc->bh is consumed: the buffer is released
 * with brelse() on every return path.  The caller must already have write
 * access to iloc->bh.
 *
 * Returns 0 on success or a negative error code.
 */
static int ext4_do_update_inode(handle_t *handle,
                                struct inode *inode,
                                struct ext4_iloc *iloc)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
        struct buffer_head *bh = iloc->bh;
        int need_datasync, set_large_file = 0;
        int err;

        spin_lock(&ei->i_raw_lock);

        /*
         * A new inode starts from an all-zero raw inode so that fields not
         * tracked in the in-memory inode are initialised to zero.
         */
        if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
                memset(raw_inode, 0, EXT4_SB(sb)->s_inode_size);

        /*
         * Note whether the size stored on disk differs from i_disksize;
         * this is handed to ext4_update_inode_fsync_trans() below.
         */
        need_datasync = READ_ONCE(ei->i_disksize) != ext4_isize(sb, raw_inode);

        /*
         * A size above 0x7fffffff requires the large_file feature; note
         * when the superblock still has to be updated for it.
         */
        if (ei->i_disksize > 0x7fffffffULL &&
            (!ext4_has_feature_large_file(sb) ||
             EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)))
                set_large_file = 1;

        err = ext4_fill_raw_inode(inode, raw_inode);
        spin_unlock(&ei->i_raw_lock);
        if (err) {
                EXT4_ERROR_INODE(inode, "corrupted inode contents");
                goto out_brelse;
        }

        if (sb->s_flags & SB_LAZYTIME)
                ext4_update_other_inodes_time(sb, inode->i_ino, bh->b_data);

        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, bh);
        if (err)
                goto out_error;
        ext4_clear_inode_state(inode, EXT4_STATE_NEW);

        if (set_large_file) {
                struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;

                BUFFER_TRACE(sbh, "get write access");
                err = ext4_journal_get_write_access(handle, sb, sbh,
                                                    EXT4_JTR_NONE);
                if (err)
                        goto out_error;

                /* Feature bit and checksum change under the buffer lock */
                lock_buffer(sbh);
                ext4_set_feature_large_file(sb);
                ext4_superblock_csum_set(sb);
                unlock_buffer(sbh);

                ext4_handle_sync(handle);
                err = ext4_handle_dirty_metadata(handle, NULL, sbh);
        }
        ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_error:
        ext4_std_error(sb, err);
out_brelse:
        brelse(bh);
        return err;
}

/*
 * ext4_write_inode()
 *
 * We are called from a few places:
 *
 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
 *   Here, there will be no transaction running. We wait for any running
 *   transaction to commit.
 *
 * - Within flush work (sys_sync(), kupdate and such).
 *   We wait on commit, if told to.
 *
 * - Within iput_final() -> write_inode_now()
 *   We wait on commit, if told to.
 *
 * In all cases it is actually safe for us to return without doing anything,
 * because the inode has been copied into a raw inode buffer in
 * ext4_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
 * writeback.
 *
 * Note that we are absolutely dependent upon all inode dirtiers doing the
 * right thing: they *must* call mark_inode_dirty() after dirtying info in
 * which we are interested.
 *
 * It would be a bug for them to not do this.  The code:
 *
 *        mark_inode_dirty(inode)
 *        stuff();
 *        inode->i_size = expr;
 *
 * is in error because write_inode() could occur while `stuff()' is running,
 * and the new i_size will be lost.  Plus the inode will no longer be on the
 * superblock's dirty inode list.
 */
int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
        int err;

        if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
                return 0;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        if (EXT4_SB(inode->i_sb)->s_journal) {
                if (ext4_journal_current_handle()) {
                        ext4_debug("called recursively, non-PF_MEMALLOC!\n");
                        dump_stack();
                        return -EIO;
                }

                /*
                 * No need to force transaction in WB_SYNC_NONE mode. Also
                 * ext4_sync_fs() will force the commit after everything is
                 * written.
                 */
                if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
                        return 0;

                err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
                                                EXT4_I(inode)->i_sync_tid);
        } else {
                struct ext4_iloc iloc;

                err = __ext4_get_inode_loc_noinmem(inode, &iloc);
                if (err)
                        return err;
                /*
                 * sync(2) will flush the whole buffer cache. No need to do
                 * it here separately for each inode.
                 */
                if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
                        sync_dirty_buffer(iloc.bh);
                if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
                        ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
                                               "IO error syncing inode");
                        err = -EIO;
                }
                brelse(iloc.bh);
        }
        return err;
}

/*
 * In data=journal mode ext4_journalled_invalidate_folio() may fail to
 * invalidate buffers attached to a folio that straddles i_size while they
 * are undergoing commit.  When that happens, wait for the commit to finish
 * and try the invalidation again.
 */
static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
        unsigned offset = inode->i_size & (PAGE_SIZE - 1);

        /*
         * Nothing to wait for when the folio is fully truncated.  Waiting
         * would even be wrong there: __ext4_journalled_invalidate_folio()
         * may strip all buffers from the folio but keep it dirty, which can
         * confuse e.g. a concurrent ext4_writepages() that sees a dirty
         * folio without buffers.  Likewise no wait is needed when all
         * buffers in the folio remain valid, which mostly helps the common
         * blocksize == PAGESIZE case.
         */
        if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
                return;

        for (;;) {
                struct folio *folio;
                tid_t commit_tid = 0;
                int ret;

                folio = filemap_lock_folio(inode->i_mapping,
                                           inode->i_size >> PAGE_SHIFT);
                if (IS_ERR(folio))
                        return;

                ret = __ext4_journalled_invalidate_folio(folio, offset,
                                                folio_size(folio) - offset);
                folio_unlock(folio);
                folio_put(folio);
                if (ret != -EBUSY)
                        return;

                /* Busy: wait for the committing transaction, if any, and retry */
                read_lock(&journal->j_state_lock);
                if (journal->j_committing_transaction)
                        commit_tid = journal->j_committing_transaction->t_tid;
                read_unlock(&journal->j_state_lock);

                if (commit_tid)
                        jbd2_log_wait_commit(journal, commit_tid);
        }
}

/*
 * ext4_setattr()
 *
 * Called from notify_change.
 *
 * We want to trap VFS attempts to truncate the file as soon as
 * possible.  In particular, we want to make sure that when the VFS
 * shrinks i_size, we put the inode on the orphan list and modify
 * i_disksize immediately, so that during the subsequent flushing of
 * dirty pages and freeing of disk blocks, we can guarantee that any
 * commit will leave the blocks being flushed in an unused state on
 * disk.  (On recovery, the inode will get truncated and the blocks will
 * be freed, so we have a strong guarantee that no future commit will
 * leave these blocks visible to the user.)
 *
 * Another thing we have to assure is that if we are in ordered mode
 * and inode is still attached to the committing transaction, we must
 * start writeout of all the dirty pages which are being truncated.
 * This way we are sure that all the data written in the previous
 * transaction are already on disk (truncate waits for pages under
 * writeback).
 *
 * Called with inode->i_rwsem down.
 */
int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                 struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        int error, rc = 0;
        int orphan = 0;
        const unsigned int ia_valid = attr->ia_valid;
        bool inc_ivers = true;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        if (unlikely(IS_IMMUTABLE(inode)))
                return -EPERM;

        if (unlikely(IS_APPEND(inode) &&
                     (ia_valid & (ATTR_MODE | ATTR_UID |
                                  ATTR_GID | ATTR_TIMES_SET))))
                return -EPERM;

        error = setattr_prepare(idmap, dentry, attr);
        if (error)
                return error;

        error = fscrypt_prepare_setattr(dentry, attr);
        if (error)
                return error;

        error = fsverity_prepare_setattr(dentry, attr);
        if (error)
                return error;

        if (is_quota_modification(idmap, inode, attr)) {
                error = dquot_initialize(inode);
                if (error)
                        return error;
        }

        if (i_uid_needs_update(idmap, attr, inode) ||
            i_gid_needs_update(idmap, attr, inode)) {
                handle_t *handle;

                /* (user+group)*(old+new) structure, inode write (sb,
                 * inode block, ? - but truncate inode update has it) */
                handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
                        (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
                         EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
                }

                /* dquot_transfer() calls back ext4_get_inode_usage() which
                 * counts xattr inode references.
                 */
                down_read(&EXT4_I(inode)->xattr_sem);
                error = dquot_transfer(idmap, inode, attr);
                up_read(&EXT4_I(inode)->xattr_sem);

                if (error) {
                        ext4_journal_stop(handle);
                        return error;
                }
                /* Update corresponding info in inode so that everything is in
                 * one transaction */
                i_uid_update(idmap, attr, inode);
                i_gid_update(idmap, attr, inode);
                error = ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
                if (unlikely(error)) {
                        return error;
                }
        }

        if (attr->ia_valid & ATTR_SIZE) {
                handle_t *handle;
                loff_t oldsize = inode->i_size;
                loff_t old_disksize;
                int shrink = (attr->ia_size < inode->i_size);

                if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

                        if (attr->ia_size > sbi->s_bitmap_maxbytes) {
                                return -EFBIG;
                        }
                }
                if (!S_ISREG(inode->i_mode)) {
                        return -EINVAL;
                }

                if (attr->ia_size == inode->i_size)
                        inc_ivers = false;

                if (shrink) {
                        if (ext4_should_order_data(inode)) {
                                error = ext4_begin_ordered_truncate(inode,
                                                            attr->ia_size);
                                if (error)
                                        goto err_out;
                        }
                        /*
                         * Blocks are going to be removed from the inode. Wait
                         * for dio in flight.
                         */
                        inode_dio_wait(inode);
                }

                filemap_invalidate_lock(inode->i_mapping);

                rc = ext4_break_layouts(inode);
                if (rc) {
                        filemap_invalidate_unlock(inode->i_mapping);
                        goto err_out;
                }

                if (attr->ia_size != inode->i_size) {
                        handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
                        if (IS_ERR(handle)) {
                                error = PTR_ERR(handle);
                                goto out_mmap_sem;
                        }
                        if (ext4_handle_valid(handle) && shrink) {
                                error = ext4_orphan_add(handle, inode);
                                orphan = 1;
                        }
                        /*
                         * Update c/mtime on truncate up, ext4_truncate() will
                         * update c/mtime in shrink case below
                         */
                        if (!shrink)
                                inode_set_mtime_to_ts(inode,
                                                      inode_set_ctime_current(inode));

                        if (shrink)
                                ext4_fc_track_range(handle, inode,
                                        (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
                                        inode->i_sb->s_blocksize_bits,
                                        EXT_MAX_BLOCKS - 1);
                        else
                                ext4_fc_track_range(
                                        handle, inode,
                                        (oldsize > 0 ? oldsize - 1 : oldsize) >>
                                        inode->i_sb->s_blocksize_bits,
                                        (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
                                        inode->i_sb->s_blocksize_bits);

                        down_write(&EXT4_I(inode)->i_data_sem);
                        old_disksize = EXT4_I(inode)->i_disksize;
                        EXT4_I(inode)->i_disksize = attr->ia_size;
                        rc = ext4_mark_inode_dirty(handle, inode);
                        if (!error)
                                error = rc;
                        /*
                         * We have to update i_size under i_data_sem together
                         * with i_disksize to avoid races with writeback code
                         * running ext4_wb_update_i_disksize().
                         */
                        if (!error)
                                i_size_write(inode, attr->ia_size);
                        else
                                EXT4_I(inode)->i_disksize = old_disksize;
                        up_write(&EXT4_I(inode)->i_data_sem);
                        ext4_journal_stop(handle);
                        if (error)
                                goto out_mmap_sem;
                        if (!shrink) {
                                pagecache_isize_extended(inode, oldsize,
                                                         inode->i_size);
                        } else if (ext4_should_journal_data(inode)) {
                                ext4_wait_for_tail_page_commit(inode);
                        }
                }

                /*
                 * Truncate pagecache after we've waited for commit
                 * in data=journal mode to make pages freeable.
                 */
                truncate_pagecache(inode, inode->i_size);
                /*
                 * Call ext4_truncate() even if i_size didn't change to
                 * truncate possible preallocated blocks.
                 */
                if (attr->ia_size <= oldsize) {
                        rc = ext4_truncate(inode);
                        if (rc)
                                error = rc;
                }
out_mmap_sem:
                filemap_invalidate_unlock(inode->i_mapping);
        }

        if (!error) {
                if (inc_ivers)
                        inode_inc_iversion(inode);
                setattr_copy(idmap, inode, attr);
                mark_inode_dirty(inode);
        }

        /*
         * If the call to ext4_truncate failed to get a transaction handle at
         * all, we need to clean up the in-core orphan list manually.
         */
        if (orphan && inode->i_nlink)
                ext4_orphan_del(NULL, inode);

        if (!error && (ia_valid & ATTR_MODE))
                rc = posix_acl_chmod(idmap, dentry, inode->i_mode);

err_out:
        if  (error)
                ext4_std_error(inode->i_sb, error);
        if (!error)
                error = rc;
        return error;
}

/*
 * Report the direct I/O alignment class for @inode.
 *
 * Returns 0 for fs-verity, data-journalled and inline-data files, and for
 * encrypted files on which fscrypt cannot do direct I/O.  Returns the inode
 * block size for encrypted files that can, and 1 when the caller should
 * fall back to the iomap defaults.
 */
u32 ext4_dio_alignment(struct inode *inode)
{
        if (fsverity_active(inode) ||
            ext4_should_journal_data(inode) ||
            ext4_has_inline_data(inode))
                return 0;

        /* Plain files: use the iomap defaults. */
        if (!IS_ENCRYPTED(inode))
                return 1;

        return fscrypt_dio_supported(inode) ? i_blocksize(inode) : 0;
}

/*
 * Fill @stat for the inode behind @path: ext4-specific pieces first
 * (creation time, direct I/O alignment, inode attribute flags), then the
 * generic fields via generic_fillattr().  Always returns 0.
 */
int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
                 struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_inode *raw_inode;
        unsigned int visible;
        u64 attrs = 0;

        /* Creation time is only reported when i_crtime fits in the inode. */
        if (request_mask & STATX_BTIME) {
                if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
                        stat->result_mask |= STATX_BTIME;
                        stat->btime.tv_sec = ei->i_crtime.tv_sec;
                        stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
                }
        }

        /*
         * DIO alignment is computed only on request: for encrypted files it
         * might take a fair bit of work if the file wasn't opened recently.
         */
        if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
                u32 align = ext4_dio_alignment(inode);

                stat->result_mask |= STATX_DIOALIGN;
                if (align != 1) {
                        stat->dio_mem_align = align;
                        stat->dio_offset_align = align;
                } else {
                        /* iomap defaults, taken from the backing device */
                        struct block_device *bdev = inode->i_sb->s_bdev;

                        stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
                        stat->dio_offset_align = bdev_logical_block_size(bdev);
                }
        }

        /* Translate the user-visible ext4 inode flags to statx attributes. */
        visible = ei->i_flags & EXT4_FL_USER_VISIBLE;
        if (visible & EXT4_APPEND_FL)
                attrs |= STATX_ATTR_APPEND;
        if (visible & EXT4_COMPR_FL)
                attrs |= STATX_ATTR_COMPRESSED;
        if (visible & EXT4_ENCRYPT_FL)
                attrs |= STATX_ATTR_ENCRYPTED;
        if (visible & EXT4_IMMUTABLE_FL)
                attrs |= STATX_ATTR_IMMUTABLE;
        if (visible & EXT4_NODUMP_FL)
                attrs |= STATX_ATTR_NODUMP;
        if (visible & EXT4_VERITY_FL)
                attrs |= STATX_ATTR_VERITY;
        stat->attributes |= attrs;

        /* Advertise every attribute bit this function is able to report. */
        stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED |
                                 STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE |
                                 STATX_ATTR_NODUMP | STATX_ATTR_VERITY;

        generic_fillattr(idmap, request_mask, inode, stat);
        return 0;
}

/*
 * getattr for files: ext4_getattr() plus adjustments to stat->blocks for
 * inline data and for blocks that are reserved but not yet allocated.
 * Always returns 0.
 */
int ext4_file_getattr(struct mnt_idmap *idmap,
                      const struct path *path, struct kstat *stat,
                      u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct super_block *sb = inode->i_sb;
        u64 reserved;

        ext4_getattr(idmap, path, stat, request_mask, query_flags);

        /*
         * An inode holding inline data normally has no data blocks of its
         * own (it may have an external xattr block).  Add the size rounded
         * up to 512-byte sectors so tools like tar and rsync don't
         * incorrectly think the file is completely sparse.
         */
        if (unlikely(ext4_has_inline_data(inode)))
                stat->blocks += (stat->size + 511) >> 9;

        /*
         * i_blocks is only updated together with the real allocation, so
         * that a crash before delayed allocation completes cannot leave it
         * inconsistent with the on-disk blocks.  To avoid confusing users,
         * stat reports the delayed-allocation reservation as well.
         */
        reserved = EXT4_C2B(EXT4_SB(sb), EXT4_I(inode)->i_reserved_data_blocks);
        stat->blocks += reserved << (sb->s_blocksize_bits - 9);
        return 0;
}

/*
 * Index-block estimate for a mapping change: extent-mapped inodes are
 * sized by @pextents, all other inodes by @lblocks.
 */
static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
                                   int pextents)
{
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return ext4_ext_index_trans_blocks(inode, pextents);

        return ext4_ind_trans_blocks(inode, lblocks);
}

/*
 * Account for index blocks, block groups bitmaps and block group
 * descriptor blocks if modify datablocks and index blocks
 * worse case, the indexs blocks spread over different block groups
 *
 * If datablocks are discontiguous, they are possible to spread over
 * different block groups too. If they are contiguous, with flexbg,
 * they could still across block group boundary.
 *
 * Also account for superblock, inode, quota and xattr blocks
 */
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
                                  int pextents)
{
        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
        int gdpblocks;
        int idxblocks;
        int ret;

        /*
         * How many index blocks need to touch to map @lblocks logical blocks
         * to @pextents physical extents?
         */
        idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);

        ret = idxblocks;

        /*
         * Now let's see how many group bitmaps and group descriptors need
         * to account
         */
        groups = idxblocks + pextents;
        gdpblocks = groups;
        if (groups > ngroups)
                groups = ngroups;
        if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
                gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;

        /* bitmaps and block group descriptor blocks */
        ret += groups + gdpblocks;

        /* Blocks for super block, inode, quota and xattr blocks */
        ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);

        return ret;
}

/*
 * Number of journal credits to reserve so that modifying a single page
 * fits into one transaction, even if that takes several separate block
 * allocations.
 *
 * This could be called via ext4_write_begin().
 *
 * The estimate is for the worst case: one new block per extent.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
        int bpp = ext4_journal_blocks_per_page(inode);
        int credits = ext4_meta_trans_blocks(inode, bpp, bpp);

        /* In journalled-data mode the data blocks need credits too. */
        return ext4_should_journal_data(inode) ? credits + bpp : credits;
}

/*
 * Journal credits needed to modify one chunk of data.
 *
 * Used by DIO, fallocate and any other caller of ext4_map_blocks() that
 * maps or allocates a chunk of contiguous disk blocks.
 *
 * Journal buffers for the data blocks themselves are not included, as
 * DIO and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
        /* The chunk is estimated as a single physical extent. */
        const int pextents = 1;

        return ext4_meta_trans_blocks(inode, nrblocks, pextents);
}

/*
 * The caller must have previously called ext4_reserve_inode_write().
 * Given this, we know that the caller already has write access to iloc->bh.
 *
 * One reference on iloc->bh is dropped on every path, including the
 * forced-shutdown case, which returns -EIO without touching the inode.
 */
int ext4_mark_iloc_dirty(handle_t *handle,
                         struct inode *inode, struct ext4_iloc *iloc)
{
        int err = -EIO;

        if (likely(!ext4_forced_shutdown(inode->i_sb))) {
                ext4_fc_track_inode(handle, inode);

                /* ext4_do_update_inode() consumes one bh->b_count */
                get_bh(iloc->bh);

                /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
                err = ext4_do_update_inode(handle, inode, iloc);
        }

        put_bh(iloc->bh);
        return err;
}

/*
 * Look up the inode's on-disk location and get journal write access to
 * the buffer holding it.
 *
 * On success we end up with an outstanding reference count against
 * iloc->bh, which _must_ be cleaned up later.  If write access cannot be
 * obtained the buffer is released and iloc->bh is reset to NULL.
 * Returns -EIO straight away after a forced shutdown.
 */
int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
                             struct ext4_iloc *iloc)
{
        int err;

        if (unlikely(ext4_forced_shutdown(inode->i_sb)))
                return -EIO;

        err = ext4_get_inode_loc(inode, iloc);
        if (err)
                goto out;

        BUFFER_TRACE(iloc->bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, inode->i_sb,
                                            iloc->bh, EXT4_JTR_NONE);
        if (err) {
                brelse(iloc->bh);
                iloc->bh = NULL;
        }
out:
        ext4_std_error(inode->i_sb, err);
        return err;
}

/*
 * Grow the inode's extra-fields area (i_extra_isize) to @new_extra_isize.
 *
 * @iloc locates the raw on-disk inode to modify; @handle is passed on to
 * ext4_expand_extra_isize_ea() when in-inode extended attributes are present.
 * *@no_expand is set to 1 when that EA-aware expansion fails.
 *
 * Returns 0 on success, -EFSCORRUPTED if the current i_extra_isize is
 * inconsistent, -EINVAL for an unacceptable @new_extra_isize, -EAGAIN if
 * quotas still need initializing, or the error from
 * ext4_expand_extra_isize_ea().
 */
static int __ext4_expand_extra_isize(struct inode *inode,
                                     unsigned int new_extra_isize,
                                     struct ext4_iloc *iloc,
                                     handle_t *handle, int *no_expand)
{
        struct ext4_inode *raw_inode;
        struct ext4_xattr_ibody_header *header;
        unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
        int error;

        /* this was checked at iget time, but double check for good measure */
        if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
            (ei->i_extra_isize & 3)) {
                EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
                                 ei->i_extra_isize,
                                 EXT4_INODE_SIZE(inode->i_sb));
                return -EFSCORRUPTED;
        }
        /* Never shrink, never go below 4 bytes, never exceed the inode. */
        if ((new_extra_isize < ei->i_extra_isize) ||
            (new_extra_isize < 4) ||
            (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
                return -EINVAL;        /* Should never happen */

        raw_inode = ext4_raw_inode(iloc);

        header = IHDR(inode, raw_inode);

        /*
         * No extended attributes present: zero the bytes between the old
         * and the new end of the extra area and record the new size.
         */
        if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
            header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
                memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
                       EXT4_I(inode)->i_extra_isize, 0,
                       new_extra_isize - EXT4_I(inode)->i_extra_isize);
                EXT4_I(inode)->i_extra_isize = new_extra_isize;
                return 0;
        }

        /*
         * We may need to allocate external xattr block so we need quotas
         * initialized. Here we can be called with various locks held so we
         * cannot afford to initialize quotas ourselves. So just bail.
         */
        if (dquot_initialize_needed(inode))
                return -EAGAIN;

        /* try to expand with EAs present */
        error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
                                           raw_inode, handle);
        if (error) {
                /*
                 * Inode size expansion failed; don't try again
                 */
                *no_expand = 1;
        }

        return error;
}

/*
 * Opportunistically expand an inode's extra area to new_extra_isize bytes
 * using the caller's running @handle.
 *
 * Returns 0 on success or a negative error number on failure: -EOVERFLOW
 * if the inode is marked EXT4_STATE_NO_EXPAND, -ENOSPC if the handle
 * cannot be extended, -EBUSY if the xattr lock cannot be taken without
 * waiting, or the error from __ext4_expand_extra_isize().
 */
static int ext4_try_to_expand_extra_isize(struct inode *inode,
                                          unsigned int new_extra_isize,
                                          struct ext4_iloc iloc,
                                          handle_t *handle)
{
        int xattr_no_expand;
        int ret;

        if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND))
                return -EOVERFLOW;

        /*
         * In nojournal mode, we can immediately attempt to expand
         * the inode.  When journaled, we first need to obtain extra
         * buffer credits since we may write into the EA block
         * with this same handle.  If journal_extend fails, then it will
         * only result in a minor loss of functionality for that inode.
         * If this is felt to be critical, then e2fsck should be run to
         * force a large enough s_min_extra_isize.
         */
        if (ext4_journal_extend(handle, EXT4_DATA_TRANS_BLOCKS(inode->i_sb),
                                0))
                return -ENOSPC;

        /* Only a trylock: give up rather than wait for the xattr lock. */
        if (!ext4_write_trylock_xattr(inode, &xattr_no_expand))
                return -EBUSY;

        ret = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc,
                                        handle, &xattr_no_expand);
        ext4_write_unlock_xattr(inode, &xattr_no_expand);

        return ret;
}

/*
 * Expand the inode's extra area to @new_extra_isize in a transaction of
 * its own.
 *
 * Starts a handle with EXT4_DATA_TRANS_BLOCKS credits, takes the xattr
 * write lock, gets journal write access to iloc->bh, performs the
 * expansion and marks the inode location dirty.
 *
 * iloc->bh is released with brelse() on each early-failure path; otherwise
 * it is handed to ext4_mark_iloc_dirty().
 *
 * Returns 0 on success or a negative error: -EOVERFLOW if the inode is
 * marked EXT4_STATE_NO_EXPAND, otherwise the first error reported by the
 * steps above.
 */
int ext4_expand_extra_isize(struct inode *inode,
                            unsigned int new_extra_isize,
                            struct ext4_iloc *iloc)
{
        handle_t *handle;
        int no_expand;
        int error, rc;

        if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
                brelse(iloc->bh);
                return -EOVERFLOW;
        }

        handle = ext4_journal_start(inode, EXT4_HT_INODE,
                                    EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
        if (IS_ERR(handle)) {
                error = PTR_ERR(handle);
                brelse(iloc->bh);
                return error;
        }

        ext4_write_lock_xattr(inode, &no_expand);

        BUFFER_TRACE(iloc->bh, "get_write_access");
        error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh,
                                              EXT4_JTR_NONE);
        if (error) {
                brelse(iloc->bh);
                goto out_unlock;
        }

        error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc,
                                          handle, &no_expand);

        /*
         * The inode location is marked dirty even if the expansion failed;
         * the expansion error takes precedence in the return value.
         */
        rc = ext4_mark_iloc_dirty(handle, inode, iloc);
        if (!error)
                error = rc;

out_unlock:
        ext4_write_unlock_xattr(inode, &no_expand);
        ext4_journal_stop(handle);
        return error;
}

/*
 * What we do here is to mark the in-core inode as clean with respect to inode
 * dirtiness (it may still be data-dirty).
 * This means that the in-core inode may be reaped by prune_icache
 * without having to perform any I/O.  This is a very good thing,
 * because *any* task may call prune_icache - even ones which
 * have a transaction open against a different journal.
 *
 * Is this cheating?  Not really.  Sure, we haven't written the
 * inode out, but prune_icache isn't a user-visible syncing function.
 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
 * we start and wait on commits.
 */
int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
                            const char *func, unsigned int line)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_iloc iloc;
        int err;

        might_sleep();
        trace_ext4_mark_inode_dirty(inode, _RET_IP_);

        err = ext4_reserve_inode_write(handle, inode, &iloc);
        if (!err) {
                /*
                 * Best effort: try to grow the extra area to the size the
                 * filesystem wants; the result is deliberately ignored.
                 */
                if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize)
                        ext4_try_to_expand_extra_isize(inode,
                                                       sbi->s_want_extra_isize,
                                                       iloc, handle);

                err = ext4_mark_iloc_dirty(handle, inode, &iloc);
        }

        /* Report any failure against the caller's @func and @line. */
        if (unlikely(err))
                ext4_error_inode_err(inode, func, line, 0, err,
                                     "mark_inode_dirty error");
        return err;
}

/*
 * ext4_dirty_inode() is called from __mark_inode_dirty()
 *
 * We're really interested in the case where a file is being extended.
 * i_size has been changed by generic_commit_write() and we thus need
 * to include the updated inode in the current transaction.
 *
 * Also, dquot_alloc_block() will always dirty the inode when blocks
 * are allocated to the file.
 *
 * If the inode is marked synchronous, we don't honour that here - doing
 * so would cause a commit on atime updates, which we don't bother doing.
 * We handle synchronous inodes at the highest possible level.
 */
void ext4_dirty_inode(struct inode *inode, int flags)
{
        /* Two credits are requested for the inode update. */
        handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

        /* Without a handle there is nothing to do; the error is dropped. */
        if (!IS_ERR(handle)) {
                ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
        }
}

/*
 * Switch data journalling on (@val != 0) or off (@val == 0) for @inode.
 *
 * Returns 0 immediately when the filesystem has no journal and -EROFS when
 * the journal is aborted.  Otherwise the in-core EXT4_INODE_JOURNAL_DATA
 * flag and the address-space operations are changed while journal updates
 * are locked out, and the inode is then marked dirty in a synchronous
 * handle.  Returns 0 on success or a negative error.
 */
int ext4_change_inode_journal_flag(struct inode *inode, int val)
{
        journal_t *journal;
        handle_t *handle;
        int err;
        int alloc_ctx;

        /*
         * We have to be very careful here: changing a data block's
         * journaling status dynamically is dangerous.  If we write a
         * data block to the journal, change the status and then delete
         * that block, we risk forgetting to revoke the old log record
         * from the journal and so a subsequent replay can corrupt data.
         * So, first we make sure that the journal is empty and that
         * nobody is changing anything.
         */

        journal = EXT4_JOURNAL(inode);
        if (!journal)
                return 0;
        if (is_journal_aborted(journal))
                return -EROFS;

        /* Wait for all existing dio workers */
        inode_dio_wait(inode);

        /*
         * Before flushing the journal and switching inode's aops, we have
         * to flush all dirty data the inode has. There can be outstanding
         * delayed allocations, there can be unwritten extents created by
         * fallocate or buffered writes in dioread_nolock mode covered by
         * dirty data which can be converted only after flushing the dirty
         * data (and journalled aops don't know how to handle these cases).
         */
        if (val) {
                /*
                 * Taken only when enabling; held until after the aops
                 * switch below.
                 */
                filemap_invalidate_lock(inode->i_mapping);
                err = filemap_write_and_wait(inode->i_mapping);
                if (err < 0) {
                        filemap_invalidate_unlock(inode->i_mapping);
                        return err;
                }
        }

        alloc_ctx = ext4_writepages_down_write(inode->i_sb);
        jbd2_journal_lock_updates(journal);

        /*
         * OK, there are no updates running now, and all cached data is
         * synced to disk.  We are now in a completely consistent state
         * which doesn't have anything in the journal, and we know that
         * no filesystem updates are running, so it is safe to modify
         * the inode's in-core data-journaling state flag now.
         */

        if (val)
                ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
        else {
                /* Disabling: flush the journal before clearing the flag. */
                err = jbd2_journal_flush(journal, 0);
                if (err < 0) {
                        jbd2_journal_unlock_updates(journal);
                        ext4_writepages_up_write(inode->i_sb, alloc_ctx);
                        return err;
                }
                ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
        }
        ext4_set_aops(inode);

        jbd2_journal_unlock_updates(journal);
        ext4_writepages_up_write(inode->i_sb, alloc_ctx);

        if (val)
                filemap_invalidate_unlock(inode->i_mapping);

        /* Finally we can mark the inode as dirty. */

        handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        ext4_fc_mark_ineligible(inode->i_sb,
                EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
        err = ext4_mark_inode_dirty(handle, inode);
        ext4_handle_sync(handle);
        ext4_journal_stop(handle);
        ext4_std_error(inode->i_sb, err);

        return err;
}

/*
 * Buffer-walk callback: returns 1 for a buffer that is not mapped and 0
 * for one that is.  @handle and @inode are unused.
 */
static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
                            struct buffer_head *bh)
{
        if (buffer_mapped(bh))
                return 0;

        return 1;
}

/*
 * Fault handler for a write access to a page of a file mapping.
 *
 * Three paths are visible below:
 *  - delayed allocation: block_page_mkwrite() with ext4_da_get_block_prep,
 *    retried on -ENOSPC while ext4_should_retry_alloc() allows it;
 *  - non-journalled data without delalloc: if every buffer of the folio up
 *    to i_size is already mapped, return with the folio locked and no
 *    journal handle; otherwise allocate under a handle via
 *    block_page_mkwrite();
 *  - data journalling: map the blocks with __block_write_begin() and hand
 *    the folio's buffers to the journal under a handle.
 *
 * Immutable inodes get VM_FAULT_SIGBUS up front.  Otherwise the result is
 * either set explicitly (VM_FAULT_NOPAGE, VM_FAULT_LOCKED,
 * VM_FAULT_SIGBUS) or derived from the errno via vmf_fs_error().
 */
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = page_folio(vmf->page);
        loff_t size;
        unsigned long len;
        int err;
        vm_fault_t ret;
        struct file *file = vma->vm_file;
        struct inode *inode = file_inode(file);
        struct address_space *mapping = inode->i_mapping;
        handle_t *handle;
        get_block_t *get_block;
        int retries = 0;

        if (unlikely(IS_IMMUTABLE(inode)))
                return VM_FAULT_SIGBUS;

        sb_start_pagefault(inode->i_sb);
        file_update_time(vma->vm_file);

        /* Held (shared) until the common exit at "out". */
        filemap_invalidate_lock_shared(mapping);

        err = ext4_convert_inline_data(inode);
        if (err)
                goto out_ret;

        /*
         * On data journalling we skip straight to the transaction handle:
         * there's no delalloc; page truncated will be checked later; the
         * early return w/ all buffers mapped (calculates size/len) can't
         * be used; and there's no dioread_nolock, so only ext4_get_block.
         */
        if (ext4_should_journal_data(inode))
                goto retry_alloc;

        /* Delalloc case is easy... */
        if (test_opt(inode->i_sb, DELALLOC) &&
            !ext4_nonda_switch(inode->i_sb)) {
                do {
                        err = block_page_mkwrite(vma, vmf,
                                                   ext4_da_get_block_prep);
                } while (err == -ENOSPC &&
                       ext4_should_retry_alloc(inode->i_sb, &retries));
                goto out_ret;
        }

        folio_lock(folio);
        size = i_size_read(inode);
        /* Page got truncated from under us? */
        if (folio->mapping != mapping || folio_pos(folio) > size) {
                folio_unlock(folio);
                ret = VM_FAULT_NOPAGE;
                goto out;
        }

        /* Only the part of the folio below i_size is of interest. */
        len = folio_size(folio);
        if (folio_pos(folio) + len > size)
                len = size - folio_pos(folio);
        /*
         * Return if we have all the buffers mapped. This avoids the need to do
         * journal_start/journal_stop which can block and take a long time
         *
         * This cannot be done for data journalling, as we have to add the
         * inode to the transaction's list to writeprotect pages on commit.
         */
        if (folio_buffers(folio)) {
                if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio),
                                            0, len, NULL,
                                            ext4_bh_unmapped)) {
                        /* Wait so that we don't change page under IO */
                        folio_wait_stable(folio);
                        ret = VM_FAULT_LOCKED;
                        goto out;
                }
        }
        folio_unlock(folio);
        /* OK, we need to fill the hole... */
        if (ext4_should_dioread_nolock(inode))
                get_block = ext4_get_block_unwritten;
        else
                get_block = ext4_get_block;
retry_alloc:
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
                                    ext4_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = VM_FAULT_SIGBUS;
                goto out;
        }
        /*
         * Data journalling can't use block_page_mkwrite() because it
         * will set_buffer_dirty() before do_journal_get_write_access()
         * thus might hit warning messages for dirty metadata buffers.
         */
        if (!ext4_should_journal_data(inode)) {
                err = block_page_mkwrite(vma, vmf, get_block);
        } else {
                folio_lock(folio);
                size = i_size_read(inode);
                /* Page got truncated from under us? */
                if (folio->mapping != mapping || folio_pos(folio) > size) {
                        ret = VM_FAULT_NOPAGE;
                        goto out_error;
                }

                len = folio_size(folio);
                if (folio_pos(folio) + len > size)
                        len = size - folio_pos(folio);

                err = __block_write_begin(&folio->page, 0, len, ext4_get_block);
                if (!err) {
                        /* SIGBUS is the result only if journalling fails. */
                        ret = VM_FAULT_SIGBUS;
                        if (ext4_journal_folio_buffers(handle, folio, len))
                                goto out_error;
                } else {
                        folio_unlock(folio);
                }
        }
        ext4_journal_stop(handle);
        if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry_alloc;
out_ret:
        /* Derive the fault result from the errno collected in err. */
        ret = vmf_fs_error(err);
out:
        filemap_invalidate_unlock_shared(mapping);
        sb_end_pagefault(inode->i_sb);
        return ret;
out_error:
        /* Failure with the folio locked and a handle running: undo both. */
        folio_unlock(folio);
        ext4_journal_stop(handle);
        goto out;
}











    1 
















    1 












    1 





















    1 




    1 
    1 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
// SPDX-License-Identifier: GPL-2.0
#include <linux/buffer_head.h>
#include "minix.h"

/*
 * DIRECT: value 7; DEPTH: the size of the offsets[] path array taken by
 * block_to_path().
 */
enum {DIRECT = 7, DEPTH = 4};        /* Have triple indirect */

/* Block number type used by this file. */
typedef u32 block_t;        /* 32 bit, host order */

/* Widen a stored block number to unsigned long; the value is unchanged. */
static inline unsigned long block_to_cpu(block_t n)
{
        return n;
}

/*
 * Convert an unsigned long block number to the stored block_t form.
 * The conversion is implicit, so bits that do not fit in block_t are lost.
 */
static inline block_t cpu_to_block(unsigned long n)
{
        return n;
}

/* Return the inode's i2_data array viewed as an array of block_t. */
static inline block_t *i_data(struct inode *inode)
{
        return (block_t *)minix_i(inode)->u.i2_data;
}

/*
 * DIRCOUNT: number of blocks addressed directly (offsets 0..6); the
 * indirect slots follow at DIRCOUNT, DIRCOUNT + 1 and DIRCOUNT + 2.
 */
#define DIRCOUNT 7
/* INDIRCOUNT(sb): block size divided by 4, i.e. 1 << (s_blocksize_bits - 2). */
#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))

/*
 * Translate a file-relative block number into a path through the block
 * tree.
 *
 * On success offsets[] holds one index per level and the number of levels
 * (1 to 4) is returned: 1 for a direct block, 2/3/4 for a block reached
 * through the single/double/triple indirect slot.  Returns 0 when @block
 * is negative (a message is printed) or lies at or beyond s_maxbytes.
 */
static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
{
        struct super_block *sb = inode->i_sb;
        int ptrs;

        if (block < 0) {
                printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
                        block, sb->s_bdev);
                return 0;
        }
        if ((u64)block * (u64)sb->s_blocksize >= sb->s_maxbytes)
                return 0;

        /* Direct blocks. */
        if (block < DIRCOUNT) {
                offsets[0] = block;
                return 1;
        }
        block -= DIRCOUNT;
        ptrs = INDIRCOUNT(sb);

        /* Single indirect. */
        if (block < ptrs) {
                offsets[0] = DIRCOUNT;
                offsets[1] = block;
                return 2;
        }
        block -= ptrs;

        /* Double indirect. */
        if (block < ptrs * ptrs) {
                offsets[0] = DIRCOUNT + 1;
                offsets[1] = block / ptrs;
                offsets[2] = block % ptrs;
                return 3;
        }
        block -= ptrs * ptrs;

        /* Triple indirect. */
        offsets[0] = DIRCOUNT + 2;
        offsets[1] = (block / ptrs) / ptrs;
        offsets[2] = (block / ptrs) % ptrs;
        offsets[3] = block % ptrs;
        return 4;
}

#include "itree_common.c"

/*
 * V2 entry point for block mapping: forwards all arguments to get_block()
 * and returns its result.  get_block() is not defined in this file's own
 * text; presumably it comes from the included itree_common.c - confirm.
 */
int V2_minix_get_block(struct inode * inode, long block,
                        struct buffer_head *bh_result, int create)
{
        return get_block(inode, block, bh_result, create);
}

/*
 * V2 entry point for truncation: forwards to truncate(), which is not
 * defined in this file's own text (presumably provided by the included
 * itree_common.c - confirm).
 */
void V2_minix_truncate(struct inode * inode)
{
        truncate(inode);
}

/*
 * V2 entry point for the block count of a file of @size bytes on @sb:
 * returns whatever nblocks() computes.  nblocks() is not defined in this
 * file's own text (presumably provided by the included itree_common.c -
 * confirm).
 */
unsigned V2_minix_blocks(loff_t size, struct super_block *sb)
{
        return nblocks(size, sb);
}










































































































































































































































































   22 





    7 





































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NODEMASK_H
#define __LINUX_NODEMASK_H

/*
 * Nodemasks provide a bitmap suitable for representing the
 * set of Node's in a system, one bit position per Node number.
 *
 * See detailed comments in the file linux/bitmap.h describing the
 * data type on which these nodemasks are based.
 *
 * For details of nodemask_parse_user(), see bitmap_parse_user() in
 * lib/bitmap.c.  For details of nodelist_parse(), see bitmap_parselist(),
 * also in bitmap.c.  For details of node_remap(), see bitmap_bitremap in
 * lib/bitmap.c.  For details of nodes_remap(), see bitmap_remap in
 * lib/bitmap.c.  For details of nodes_onto(), see bitmap_onto in
 * lib/bitmap.c.  For details of nodes_fold(), see bitmap_fold in
 * lib/bitmap.c.
 *
 * The available nodemask operations are:
 *
 * void node_set(node, mask)                turn on bit 'node' in mask
 * void node_clear(node, mask)                turn off bit 'node' in mask
 * void nodes_setall(mask)                set all bits
 * void nodes_clear(mask)                clear all bits
 * int node_isset(node, mask)                true iff bit 'node' set in mask
 * int node_test_and_set(node, mask)        test and set bit 'node' in mask
 *
 * void nodes_and(dst, src1, src2)        dst = src1 & src2  [intersection]
 * void nodes_or(dst, src1, src2)        dst = src1 | src2  [union]
 * void nodes_xor(dst, src1, src2)        dst = src1 ^ src2
 * void nodes_andnot(dst, src1, src2)        dst = src1 & ~src2
 * void nodes_complement(dst, src)        dst = ~src
 *
 * int nodes_equal(mask1, mask2)        Does mask1 == mask2?
 * int nodes_intersects(mask1, mask2)        Do mask1 and mask2 intersect?
 * int nodes_subset(mask1, mask2)        Is mask1 a subset of mask2?
 * int nodes_empty(mask)                Is mask empty (no bits set)?
 * int nodes_full(mask)                        Is mask full (all bits set)?
 * int nodes_weight(mask)                Hamming weight - number of set bits
 *
 * void nodes_shift_right(dst, src, n)        Shift right
 * void nodes_shift_left(dst, src, n)        Shift left
 *
 * unsigned int first_node(mask)        Number lowest set bit, or MAX_NUMNODES
 * unsigned int next_node(node, mask)        Next node past 'node', or MAX_NUMNODES
 * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first,
 *                                        or MAX_NUMNODES
 * unsigned int first_unset_node(mask)        First node not set in mask, or
 *                                        MAX_NUMNODES
 *
 * nodemask_t nodemask_of_node(node)        Return nodemask with bit 'node' set
 * NODE_MASK_ALL                        Initializer - all bits set
 * NODE_MASK_NONE                        Initializer - no bits set
 * unsigned long *nodes_addr(mask)        Array of unsigned long's in mask
 *
 * int nodemask_parse_user(ubuf, ulen, mask)        Parse ascii string as nodemask
 * int nodelist_parse(buf, map)                Parse ascii string as nodelist
 * int node_remap(oldbit, old, new)        newbit = map(old, new)(oldbit)
 * void nodes_remap(dst, src, old, new)        *dst = map(old, new)(src)
 * void nodes_onto(dst, orig, relmap)        *dst = orig relative to relmap
 * void nodes_fold(dst, orig, sz)        dst bits = orig bits mod sz
 *
 * for_each_node_mask(node, mask)        for-loop node over mask
 *
 * int num_online_nodes()                Number of online Nodes
 * int num_possible_nodes()                Number of all possible Nodes
 *
 * int node_random(mask)                Random node with set bit in mask
 *
 * int node_online(node)                Is some node online?
 * int node_possible(node)                Is some node possible?
 *
 * node_set_online(node)                set bit 'node' in node_online_map
 * node_set_offline(node)                clear bit 'node' in node_online_map
 *
 * for_each_node(node)                        for-loop node over node_possible_map
 * for_each_online_node(node)                for-loop node over node_online_map
 *
 * Subtlety:
 * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway)
 *    to generate slightly worse code.  So use a simple one-line #define
 *    for node_isset(), instead of wrapping an inline inside a macro, the
 *    way we do the other calls.
 *
 * NODEMASK_SCRATCH
 * When doing above logical AND, OR, XOR, Remap operations the callers tend to
 * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large,
 * nodemask_t's consume too much stack space.  NODEMASK_SCRATCH is a helper
 * for such situations. See below and CPUMASK_ALLOC also.
 */

#include <linux/threads.h>
#include <linux/bitmap.h>
#include <linux/minmax.h>
#include <linux/nodemask_types.h>
#include <linux/numa.h>
#include <linux/random.h>

extern nodemask_t _unused_nodemask_arg_;

/**
 * nodemask_pr_args - printf args to output a nodemask
 * @maskp: nodemask to be printed
 *
 * Can be used to provide arguments for '%*pb[l]' when printing a nodemask.
 */
#define nodemask_pr_args(maskp)        __nodemask_pr_numnodes(maskp), \
                                __nodemask_pr_bits(maskp)
/* Width argument for the print format: 0 for a NULL mask, else MAX_NUMNODES. */
static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m)
{
        if (!m)
                return 0;
        return MAX_NUMNODES;
}
/* Bitmap argument for the print format: NULL for a NULL mask, else its bits. */
static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
{
        if (!m)
                return NULL;
        return m->bits;
}

/*
 * The inline keyword gives the compiler room to decide to inline, or
 * not inline a function as it sees best.  However, as these functions
 * are called in both __init and non-__init functions, if they are not
 * inlined we will end up with a section mismatch error (of the type of
 * freeable items not being freed).  So we must use __always_inline here
 * to fix the problem.  If other functions in the future also end up in
 * this situation they will also need to be annotated as __always_inline
 */
/* node_set(node, dst): turn on bit 'node' in mask 'dst' via set_bit(). */
#define node_set(node, dst) __node_set((node), &(dst))
static __always_inline void __node_set(int node, volatile nodemask_t *dstp)
{
        set_bit(node, dstp->bits);
}

#define node_clear(node, dst) __node_clear((node), &(dst))
static inline void __node_clear(int node, volatile nodemask_t *dstp)
{
        clear_bit(node, dstp->bits);
}

/* nodes_setall(dst): set the first MAX_NUMNODES bits of 'dst' via bitmap_fill(). */
#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits)
{
        bitmap_fill(dstp->bits, nbits);
}

/* nodes_clear(dst): clear the first MAX_NUMNODES bits of 'dst' via bitmap_zero(). */
#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits)
{
        bitmap_zero(dstp->bits, nbits);
}

/*
 * node_isset(node, nodemask): true iff bit 'node' is set in 'nodemask'.
 * No static inline type checking - see Subtlety (1) above.
 */
#define node_isset(node, nodemask) test_bit((node), (nodemask).bits)

/*
 * node_test_and_set(node, nodemask): set bit 'node' in 'nodemask' and return
 * the result of test_and_set_bit() for that bit.
 */
#define node_test_and_set(node, nodemask) \
                        __node_test_and_set((node), &(nodemask))
static inline bool __node_test_and_set(int node, nodemask_t *addr)
{
        return test_and_set_bit(node, addr->bits);
}

/* nodes_and(dst, src1, src2): dst = src1 & src2 (intersection), over MAX_NUMNODES bits. */
#define nodes_and(dst, src1, src2) \
                        __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
}

/* nodes_or(dst, src1, src2): dst = src1 | src2 (union), over MAX_NUMNODES bits. */
#define nodes_or(dst, src1, src2) \
                        __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
}

/* nodes_xor(dst, src1, src2): dst = src1 ^ src2, over MAX_NUMNODES bits. */
#define nodes_xor(dst, src1, src2) \
                        __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
}

/* nodes_andnot(dst, src1, src2): dst = src1 & ~src2, over MAX_NUMNODES bits. */
#define nodes_andnot(dst, src1, src2) \
                        __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
}

/* nodes_complement(dst, src): dst = ~src, over MAX_NUMNODES bits. */
#define nodes_complement(dst, src) \
                        __nodes_complement(&(dst), &(src), MAX_NUMNODES)
static inline void __nodes_complement(nodemask_t *dstp,
                                        const nodemask_t *srcp, unsigned int nbits)
{
        bitmap_complement(dstp->bits, srcp->bits, nbits);
}

/* nodes_equal(src1, src2): does src1 == src2, compared over MAX_NUMNODES bits? */
#define nodes_equal(src1, src2) \
                        __nodes_equal(&(src1), &(src2), MAX_NUMNODES)
static inline bool __nodes_equal(const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        return bitmap_equal(src1p->bits, src2p->bits, nbits);
}

/* nodes_intersects(src1, src2): do src1 and src2 intersect? */
#define nodes_intersects(src1, src2) \
                        __nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
static inline bool __nodes_intersects(const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        return bitmap_intersects(src1p->bits, src2p->bits, nbits);
}

/* nodes_subset(src1, src2): is src1 a subset of src2? */
#define nodes_subset(src1, src2) \
                        __nodes_subset(&(src1), &(src2), MAX_NUMNODES)
static inline bool __nodes_subset(const nodemask_t *src1p,
                                        const nodemask_t *src2p, unsigned int nbits)
{
        return bitmap_subset(src1p->bits, src2p->bits, nbits);
}

/* nodes_empty(src): is the mask empty (no bits set in the first MAX_NUMNODES)? */
#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits)
{
        return bitmap_empty(srcp->bits, nbits);
}

/* nodes_full(nodemask): is the mask full (all of the first MAX_NUMNODES bits set)? */
#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits)
{
        return bitmap_full(srcp->bits, nbits);
}

/* nodes_weight(nodemask): Hamming weight, i.e. the number of set bits. */
#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits)
{
        return bitmap_weight(srcp->bits, nbits);
}

/* nodes_shift_right(dst, src, n): dst = src shifted right by n, via bitmap_shift_right(). */
#define nodes_shift_right(dst, src, n) \
                        __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
static inline void __nodes_shift_right(nodemask_t *dstp,
                                        const nodemask_t *srcp, int n, int nbits)
{
        bitmap_shift_right(dstp->bits, srcp->bits, n, nbits);
}

/* nodes_shift_left(dst, src, n): dst = src shifted left by n, via bitmap_shift_left(). */
#define nodes_shift_left(dst, src, n) \
                        __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES)
static inline void __nodes_shift_left(nodemask_t *dstp,
                                        const nodemask_t *srcp, int n, int nbits)
{
        bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
}

/* FIXME: better would be to fix all architectures to never return
          > MAX_NUMNODES, then the silly min_ts could be dropped. */

/*
 * first_node(src): number of the lowest set bit in 'src', or MAX_NUMNODES.
 * The find_first_bit() result is clamped to MAX_NUMNODES with min_t().
 */
#define first_node(src) __first_node(&(src))
static inline unsigned int __first_node(const nodemask_t *srcp)
{
        return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}

/*
 * next_node(n, src): next set bit in 'src' strictly after 'n' (the search
 * starts at n+1), or MAX_NUMNODES.  The find_next_bit() result is clamped
 * to MAX_NUMNODES with min_t().
 */
#define next_node(n, src) __next_node((n), &(src))
static inline unsigned int __next_node(int n, const nodemask_t *srcp)
{
        return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}

/*
 * next_node_in(n, src): circular successor lookup.  Returns the next node set
 * in src after node n; when nothing is set past n, falls back to the first
 * node set in src.  An empty src therefore yields MAX_NUMNODES.
 */
#define next_node_in(n, src) __next_node_in((n), &(src))
static inline unsigned int __next_node_in(int node, const nodemask_t *srcp)
{
        const unsigned int following = __next_node(node, srcp);

        /* Nothing set past 'node': wrap around to the lowest set bit. */
        return following == MAX_NUMNODES ? __first_node(srcp) : following;
}

/* Initialize *mask so that 'node' is its only set bit: clear all, then set one. */
static inline void init_nodemask_of_node(nodemask_t *mask, int node)
{
        nodes_clear(*mask);
        node_set(node, *mask);
}

/*
 * nodemask_of_node(node): evaluates to a nodemask_t with only bit 'node' set.
 * When the mask is exactly one unsigned long wide, the single word is
 * assigned directly; otherwise init_nodemask_of_node() builds it.
 */
#define nodemask_of_node(node)                                                \
({                                                                        \
        typeof(_unused_nodemask_arg_) m;                                \
        if (sizeof(m) == sizeof(unsigned long)) {                        \
                m.bits[0] = 1UL << (node);                                \
        } else {                                                        \
                init_nodemask_of_node(&m, (node));                        \
        }                                                                \
        m;                                                                \
})

/*
 * first_unset_node(mask): first node not set in 'mask', or MAX_NUMNODES.
 * The find_first_zero_bit() result is clamped to MAX_NUMNODES with min_t().
 */
#define first_unset_node(mask) __first_unset_node(&(mask))
static inline unsigned int __first_unset_node(const nodemask_t *maskp)
{
        return min_t(unsigned int, MAX_NUMNODES,
                        find_first_zero_bit(maskp->bits, MAX_NUMNODES));
}

/* Value stored in the last word of a mask that has all MAX_NUMNODES bits set. */
#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)

#if MAX_NUMNODES <= BITS_PER_LONG

/* Initializer - all bits set.  Here only the last word is initialized. */
#define NODE_MASK_ALL                                                        \
((nodemask_t) { {                                                        \
        [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD                \
} })

#else

/*
 * Initializer - all bits set.  Multi-word case: every word before the last
 * is ~0UL and the last word is NODE_MASK_LAST_WORD.
 */
#define NODE_MASK_ALL                                                        \
((nodemask_t) { {                                                        \
        [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL,                        \
        [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD                \
} })

#endif

/* Initializer - no bits set: every word is 0UL. */
#define NODE_MASK_NONE                                                        \
((nodemask_t) { {                                                        \
        [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] =  0UL                        \
} })

/* nodes_addr(src): the array of unsigned longs backing mask 'src'. */
#define nodes_addr(src) ((src).bits)

/*
 * nodemask_parse_user(ubuf, ulen, dst): parse the ASCII string in user buffer
 * 'ubuf' (length 'ulen') as a nodemask into 'dst'.  Returns whatever
 * bitmap_parse_user() returns.
 */
#define nodemask_parse_user(ubuf, ulen, dst) \
                __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES)
static inline int __nodemask_parse_user(const char __user *buf, int len,
                                        nodemask_t *dstp, int nbits)
{
        return bitmap_parse_user(buf, len, dstp->bits, nbits);
}

/*
 * nodelist_parse(buf, dst): parse the ASCII string 'buf' as a node list into
 * 'dst'.  Returns whatever bitmap_parselist() returns.
 */
#define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES)
static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits)
{
        return bitmap_parselist(buf, dstp->bits, nbits);
}

/*
 * node_remap(oldbit, old, new): newbit = map(old, new)(oldbit).
 * Thin wrapper around bitmap_bitremap(); returns its result.
 */
#define node_remap(oldbit, old, new) \
                __node_remap((oldbit), &(old), &(new), MAX_NUMNODES)
static inline int __node_remap(int oldbit,
                const nodemask_t *oldp, const nodemask_t *newp, int nbits)
{
        return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits);
}

/*
 * nodes_remap(dst, src, old, new): dst = map(old, new)(src).
 * Thin wrapper around bitmap_remap().
 */
#define nodes_remap(dst, src, old, new) \
                __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES)
static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp,
                const nodemask_t *oldp, const nodemask_t *newp, int nbits)
{
        bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits);
}

/*
 * nodes_onto(dst, orig, relmap): dst = orig relative to relmap.
 * Thin wrapper around bitmap_onto().
 */
#define nodes_onto(dst, orig, relmap) \
                __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES)
static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp,
                const nodemask_t *relmapp, int nbits)
{
        bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits);
}

/*
 * nodes_fold(dst, orig, sz): dst bits = orig bits mod sz.
 * Thin wrapper around bitmap_fold().  The 'sz' macro argument is
 * parenthesized, like the arguments of the sibling macros, so that any
 * expression passed for it is forwarded as a single argument.
 */
#define nodes_fold(dst, orig, sz) \
                __nodes_fold(&(dst), &(orig), (sz), MAX_NUMNODES)
static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
                int sz, int nbits)
{
        bitmap_fold(dstp->bits, origp->bits, sz, nbits);
}

/*
 * for_each_node_mask(node, mask): for-loop 'node' over the set bits of 'mask'.
 * With more than one possible node it steps through first_node()/next_node()
 * until MAX_NUMNODES is reached.  With MAX_NUMNODES == 1 the body runs at
 * most once, with node == 0, and only if 'mask' is not empty.
 */
#if MAX_NUMNODES > 1
#define for_each_node_mask(node, mask)                                    \
        for ((node) = first_node(mask);                                    \
             (node) < MAX_NUMNODES;                                    \
             (node) = next_node((node), (mask)))
#else /* MAX_NUMNODES == 1 */
#define for_each_node_mask(node, mask)                                  \
        for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++)
#endif /* MAX_NUMNODES */

/*
 * Bitmasks that are kept for all the nodes.
 * Each enumerator below is an index into the node_states[] array.
 */
enum node_states {
        N_POSSIBLE,                /* The node could become online at some point */
        N_ONLINE,                /* The node is online */
        N_NORMAL_MEMORY,        /* The node has regular memory */
#ifdef CONFIG_HIGHMEM
        N_HIGH_MEMORY,                /* The node has regular or high memory */
#else
        N_HIGH_MEMORY = N_NORMAL_MEMORY,        /* Alias when CONFIG_HIGHMEM is off */
#endif
        N_MEMORY,                /* The node has memory(regular, high, movable) */
        N_CPU,                /* The node has one or more cpus */
        N_GENERIC_INITIATOR,        /* The node has one or more Generic Initiators */
        NR_NODE_STATES                /* Number of states; dimension of node_states[] */
};

/*
 * The following particular system nodemasks and operations
 * on them manage all possible and online nodes.
 */

extern nodemask_t node_states[NR_NODE_STATES];

#if MAX_NUMNODES > 1
/* True iff bit 'node' is set in node_states[state]. */
static inline int node_state(int node, enum node_states state)
{
        return node_isset(node, node_states[state]);
}

/* Set bit 'node' in node_states[state]. */
static inline void node_set_state(int node, enum node_states state)
{
        __node_set(node, &node_states[state]);
}

/* Clear bit 'node' in node_states[state]. */
static inline void node_clear_state(int node, enum node_states state)
{
        __node_clear(node, &node_states[state]);
}

/* Number of nodes whose bit is set in node_states[state]. */
static inline int num_node_state(enum node_states state)
{
        return nodes_weight(node_states[state]);
}

/* for-loop '__node' over the bits set in node_states[__state]. */
#define for_each_node_state(__node, __state) \
        for_each_node_mask((__node), node_states[__state])

/* Lowest node set in the N_ONLINE / N_MEMORY mask, or MAX_NUMNODES. */
#define first_online_node        first_node(node_states[N_ONLINE])
#define first_memory_node        first_node(node_states[N_MEMORY])
/* Next node after 'nid' in the N_ONLINE mask, or MAX_NUMNODES. */
static inline unsigned int next_online_node(int nid)
{
        return next_node(nid, node_states[N_ONLINE]);
}
/* Next node after 'nid' in the N_MEMORY mask, or MAX_NUMNODES. */
static inline unsigned int next_memory_node(int nid)
{
        return next_node(nid, node_states[N_MEMORY]);
}

extern unsigned int nr_node_ids;
extern unsigned int nr_online_nodes;

/* Set 'nid' in the N_ONLINE mask, then recompute nr_online_nodes from it. */
static inline void node_set_online(int nid)
{
        node_set_state(nid, N_ONLINE);
        nr_online_nodes = num_node_state(N_ONLINE);
}

/* Clear 'nid' in the N_ONLINE mask, then recompute nr_online_nodes from it. */
static inline void node_set_offline(int nid)
{
        node_clear_state(nid, N_ONLINE);
        nr_online_nodes = num_node_state(N_ONLINE);
}

#else

/*
 * MAX_NUMNODES == 1: the helpers below do not consult node_states[] at all.
 * Only node 0 reports any state, state changes are no-ops, and the counts
 * are the constant 1.
 */
static inline int node_state(int node, enum node_states state)
{
        return node == 0;
}

static inline void node_set_state(int node, enum node_states state)
{
}

static inline void node_clear_state(int node, enum node_states state)
{
}

static inline int num_node_state(enum node_states state)
{
        return 1;
}

/* Runs the loop body exactly once, with node == 0. */
#define for_each_node_state(node, __state) \
        for ( (node) = 0; (node) == 0; (node) = 1)

#define first_online_node        0
#define first_memory_node        0
#define next_online_node(nid)        (MAX_NUMNODES)
#define next_memory_node(nid)        (MAX_NUMNODES)
#define nr_node_ids                1U
#define nr_online_nodes                1U

#define node_set_online(node)           node_set_state((node), N_ONLINE)
#define node_set_offline(node)           node_clear_state((node), N_ONLINE)

#endif

/*
 * node_random(maskp): pick a random node whose bit is set in *maskp.
 *
 * With CONFIG_NUMA and more than one possible node: returns NUMA_NO_NODE for
 * an empty mask, the only set node when exactly one bit is set, and otherwise
 * the set bit selected by find_nth_bit() with a random index below the mask
 * weight.  In every other configuration the result is always 0.
 */
static inline int node_random(const nodemask_t *maskp)
{
#if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
        const int nr_set = nodes_weight(*maskp);

        if (nr_set == 0)
                return NUMA_NO_NODE;

        /* A single candidate needs no random number. */
        if (nr_set == 1)
                return first_node(*maskp);

        return find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_u32_below(nr_set));
#else
        return 0;
#endif
}

/* Legacy-style names for the N_ONLINE and N_POSSIBLE masks in node_states[]. */
#define node_online_map         node_states[N_ONLINE]
#define node_possible_map         node_states[N_POSSIBLE]

/* Counts and membership tests for the online / possible states. */
#define num_online_nodes()        num_node_state(N_ONLINE)
#define num_possible_nodes()        num_node_state(N_POSSIBLE)
#define node_online(node)        node_state((node), N_ONLINE)
#define node_possible(node)        node_state((node), N_POSSIBLE)

/* for-loop 'node' over the possible / online nodes. */
#define for_each_node(node)           for_each_node_state(node, N_POSSIBLE)
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)

/*
 * For nodemask scratch area.
 * NODEMASK_ALLOC(type, name, gfp_flags) declares a pointer 'name' to an
 * object of the specified type.  When NODES_SHIFT > 8 the object comes from
 * kmalloc(sizeof(*name), gfp_flags) and NODEMASK_FREE() releases it with
 * kfree(); otherwise the object is a local variable '_name', gfp_flags is
 * unused and NODEMASK_FREE() does nothing.
 * NOTE(review): in the kmalloc variant 'name' can presumably be NULL on
 * allocation failure - callers should check before use.
 */
#if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */
#define NODEMASK_ALLOC(type, name, gfp_flags)        \
                        type *name = kmalloc(sizeof(*name), gfp_flags)
#define NODEMASK_FREE(m)                        kfree(m)
#else
#define NODEMASK_ALLOC(type, name, gfp_flags)        type _##name, *name = &_##name
#define NODEMASK_FREE(m)                        do {} while (0)
#endif

/*
 * Example structure for using NODEMASK_ALLOC, used in mempolicy.
 * Bundles two nodemasks so both can be obtained with a single
 * NODEMASK_SCRATCH() declaration.
 */
struct nodemask_scratch {
        nodemask_t        mask1;
        nodemask_t        mask2;
};

/*
 * NODEMASK_SCRATCH(x) declares 'x' as a pointer to a struct nodemask_scratch
 * through NODEMASK_ALLOC() with GFP_KERNEL | __GFP_NORETRY;
 * NODEMASK_SCRATCH_FREE(x) releases it through NODEMASK_FREE().
 */
#define NODEMASK_SCRATCH(x)                                                \
                        NODEMASK_ALLOC(struct nodemask_scratch, x,        \
                                        GFP_KERNEL | __GFP_NORETRY)
#define NODEMASK_SCRATCH_FREE(x)        NODEMASK_FREE(x)


#endif /* __LINUX_NODEMASK_H */















































    1 



    1 



    1 







































































































    1 




    1 
















    1 









    1 

    1 








    1 







    1 



    1 











    1 




    1 


    1 



























    1 


    1 

    1 


    1 














    1 
    1 





    1 



    1 




    1 


    1 






    1 




    1 





















































    1 






    1 






    1 


























































    1 



    1 






    1 





    1 





    1 





























    1 









    1 







    1 




    1 







































































    1 






    1 





    1 

    1 
    1 









    1 





    1 

    1 




    1 


































    1 








    1 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/mpage.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains functions related to preparing and submitting BIOs which contain
 * multiple pagecache pages.
 *
 * 15May2002        Andrew Morton
 *                Initial version
 * 27Jun2002        axboe@suse.de
 *                use bio_add_page() to build bio's just the right size
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/kdev_t.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include "internal.h"

/*
 * I/O completion handler for multipage BIOs.
 *
 * The mpage code never puts partial pages into a BIO (except for end-of-file).
 * If a page does not map to a contiguous run of blocks then it simply falls
 * back to block_read_full_folio().
 *
 * Why is this?  If a page's completion depends on a number of different BIOs
 * which can complete in any order (or at the same time) then determining the
 * status of that page is hard.  See end_buffer_async_read() for the details.
 * There is no point in duplicating all that complexity.
 */
static void mpage_read_end_io(struct bio *bio)
{
        struct folio_iter fi;
        int err = blk_status_to_errno(bio->bi_status);

        bio_for_each_folio_all(fi, bio) {
                if (err)
                        folio_set_error(fi.folio);
                else
                        folio_mark_uptodate(fi.folio);
                folio_unlock(fi.folio);
        }

        bio_put(bio);
}

static void mpage_write_end_io(struct bio *bio)
{
        struct folio_iter fi;
        int err = blk_status_to_errno(bio->bi_status);

        bio_for_each_folio_all(fi, bio) {
                if (err) {
                        folio_set_error(fi.folio);
                        mapping_set_error(fi.folio->mapping, err);
                }
                folio_end_writeback(fi.folio);
        }

        bio_put(bio);
}

/*
 * Submit a read bio: install mpage_read_end_io() as its completion handler,
 * pass it through guard_bio_eod() and hand it to submit_bio().
 *
 * Always returns NULL, so a caller can assign the result back to its bio
 * pointer.
 */
static struct bio *mpage_bio_submit_read(struct bio *bio)
{
        bio->bi_end_io = mpage_read_end_io;
        guard_bio_eod(bio);
        submit_bio(bio);
        return NULL;
}

/*
 * Submit a write bio: install mpage_write_end_io() as its completion handler,
 * pass it through guard_bio_eod() and hand it to submit_bio().
 *
 * Always returns NULL, so a caller can assign the result back to its bio
 * pointer.
 */
static struct bio *mpage_bio_submit_write(struct bio *bio)
{
        bio->bi_end_io = mpage_write_end_io;
        guard_bio_eod(bio);
        submit_bio(bio);
        return NULL;
}

/*
 * support function for mpage_readahead.  The fs supplied get_block might
 * return an up to date buffer.  This is used to map that buffer into
 * the page, which allows read_folio to avoid triggering a duplicate call
 * to get_block.
 *
 * The idea is to avoid adding buffers to pages that don't already have
 * them.  So when the buffer is up to date and the page size == block size,
 * this marks the page up to date instead of adding new buffers.
 */
static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
                int page_block)
{
        struct inode *inode = folio->mapping->host;
        struct buffer_head *page_bh, *head;
        int block = 0;

        head = folio_buffers(folio);
        if (!head) {
                /*
                 * don't make any buffers if there is only one buffer on
                 * the folio and the folio just needs to be set up to date
                 */
                if (inode->i_blkbits == PAGE_SHIFT &&
                    buffer_uptodate(bh)) {
                        folio_mark_uptodate(folio);
                        return;
                }
                head = create_empty_buffers(folio, i_blocksize(inode), 0);
        }

        page_bh = head;
        do {
                if (block == page_block) {
                        page_bh->b_state = bh->b_state;
                        page_bh->b_bdev = bh->b_bdev;
                        page_bh->b_blocknr = bh->b_blocknr;
                        break;
                }
                page_bh = page_bh->b_this_page;
                block++;
        } while (page_bh != head);
}

/*
 * State passed to do_mpage_readpage() for each folio it processes.
 */
struct mpage_readpage_args {
        struct bio *bio;                /* presumably the bio being built - confirm in do_mpage_readpage() */
        struct folio *folio;                /* folio that do_mpage_readpage() works on */
        unsigned int nr_pages;                /* page count used to bound the last block considered */
        bool is_readahead;                /* adds REQ_RAHEAD and __GFP_NORETRY | __GFP_NOWARN */
        sector_t last_block_in_bio;        /* presumably the last block added to @bio - TODO confirm */
        struct buffer_head map_bh;        /* mapping result carried over between calls */
        unsigned long first_logical_block;        /* file block at which map_bh's mapping starts */
        get_block_t *get_block;                /* presumably the filesystem's block-mapping callback - confirm */
};

/*
 * This is the worker routine which does all the work of mapping the disk
 * blocks and constructs largest possible bios, submits them for IO if the
 * blocks are not contiguous on the disk.
 *
 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
 * represent the validity of its disk mapping and to decide when to do the next
 * get_block() call.
 *
 * @args carries the folio to read plus the state shared between calls (see
 * struct mpage_readpage_args).  The return value is args->bio: the bio still
 * being accumulated, or whatever mpage_bio_submit_read() returned if it was
 * submitted here.
 *
 * Any layout this routine does not handle jumps to the "confused" label,
 * which submits the pending bio and then either reads the folio with
 * block_read_full_folio() or, if the folio is already up to date, just
 * unlocks it.
 */
static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
{
        struct folio *folio = args->folio;
        struct inode *inode = folio->mapping->host;
        const unsigned blkbits = inode->i_blkbits;
        const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
        const unsigned blocksize = 1 << blkbits;
        struct buffer_head *map_bh = &args->map_bh;
        sector_t block_in_file;
        sector_t last_block;
        sector_t last_block_in_file;
        sector_t first_block;
        unsigned page_block;
        /* blocks_per_page here means "no hole seen yet" */
        unsigned first_hole = blocks_per_page;
        struct block_device *bdev = NULL;
        int length;
        int fully_mapped = 1;
        blk_opf_t opf = REQ_OP_READ;
        unsigned nblocks;
        unsigned relative_block;
        gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);

        /* MAX_BUF_PER_PAGE, for example */
        VM_BUG_ON_FOLIO(folio_test_large(folio), folio);

        if (args->is_readahead) {
                opf |= REQ_RAHEAD;
                gfp |= __GFP_NORETRY | __GFP_NOWARN;
        }

        /* A folio that already has buffers is left to the fallback path. */
        if (folio_buffers(folio))
                goto confused;

        /*
         * Work out the file block range of interest: from this folio's first
         * block up to nr_pages worth of blocks, clamped to the block that
         * contains i_size.
         */
        block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
        last_block = block_in_file + args->nr_pages * blocks_per_page;
        last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
        if (last_block > last_block_in_file)
                last_block = last_block_in_file;
        page_block = 0;

        /*
         * Map blocks using the result from the previous get_blocks call first.
         */
        nblocks = map_bh->b_size >> blkbits;
        if (buffer_mapped(map_bh) &&
                        block_in_file > args->first_logical_block &&
                        block_in_file < (args->first_logical_block + nblocks)) {
                unsigned map_offset = block_in_file - args->first_logical_block;
                unsigned last = nblocks - map_offset;

                first_block = map_bh->b_blocknr + map_offset;
                for (relative_block = 0; ; relative_block++) {
                        if (relative_block == last) {
                                /* cached mapping used up: force a new get_block() */
                                clear_buffer_mapped(map_bh);
                                break;
                        }
                        if (page_block == blocks_per_page)
                                break;
                        page_block++;
                        block_in_file++;
                }
                bdev = map_bh->b_bdev;
        }

        /*
         * Then do more get_blocks calls until we are done with this folio.
         */
        map_bh->b_folio = folio;
        while (page_block < blocks_per_page) {
                map_bh->b_state = 0;
                map_bh->b_size = 0;

                /* Blocks at or past last_block are not mapped; they count as holes. */
                if (block_in_file < last_block) {
                        map_bh->b_size = (last_block-block_in_file) << blkbits;
                        if (args->get_block(inode, block_in_file, map_bh, 0))
                                goto confused;
                        args->first_logical_block = block_in_file;
                }

                if (!buffer_mapped(map_bh)) {
                        /* Unmapped block: remember where the first hole starts. */
                        fully_mapped = 0;
                        if (first_hole == blocks_per_page)
                                first_hole = page_block;
                        page_block++;
                        block_in_file++;
                        continue;
                }

                /* some filesystems will copy data into the page during
                 * the get_block call, in which case we don't want to
                 * read it again.  map_buffer_to_folio copies the data
                 * we just collected from get_block into the folio's buffers
                 * so read_folio doesn't have to repeat the get_block call
                 */
                if (buffer_uptodate(map_bh)) {
                        map_buffer_to_folio(folio, map_bh, page_block);
                        goto confused;
                }
        
                if (first_hole != blocks_per_page)
                        goto confused;                /* hole -> non-hole */

                /* Contiguous blocks? */
                if (!page_block)
                        first_block = map_bh->b_blocknr;
                else if (first_block + page_block != map_bh->b_blocknr)
                        goto confused;
                nblocks = map_bh->b_size >> blkbits;
                /* Consume the extent until it or the folio runs out. */
                for (relative_block = 0; ; relative_block++) {
                        if (relative_block == nblocks) {
                                clear_buffer_mapped(map_bh);
                                break;
                        } else if (page_block == blocks_per_page)
                                break;
                        page_block++;
                        block_in_file++;
                }
                bdev = map_bh->b_bdev;
        }

        if (first_hole != blocks_per_page) {
                /* Zero from the first hole to the end of the page. */
                folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
                if (first_hole == 0) {
                        /* Nothing is mapped at all: no I/O needed. */
                        folio_mark_uptodate(folio);
                        folio_unlock(folio);
                        goto out;
                }
        } else if (fully_mapped) {
                folio_set_mappedtodisk(folio);
        }

        /*
         * This folio will go to BIO.  Do we need to send this BIO off first?
         */
        if (args->bio && (args->last_block_in_bio != first_block - 1))
                args->bio = mpage_bio_submit_read(args->bio);

alloc_new:
        if (args->bio == NULL) {
                args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf,
                                      gfp);
                if (args->bio == NULL)
                        goto confused;
                /* convert the filesystem block number to 512-byte sectors */
                args->bio->bi_iter.bi_sector = first_block << (blkbits - 9);
        }

        /* Only the mapped part of the folio (up to the first hole) is read. */
        length = first_hole << blkbits;
        if (!bio_add_folio(args->bio, folio, length, 0)) {
                /* bio is full: submit it and retry with a fresh one */
                args->bio = mpage_bio_submit_read(args->bio);
                goto alloc_new;
        }

        /*
         * Submit now if the mapping ended exactly on a BH_Boundary block or
         * the folio ends in a hole; otherwise remember the last block so the
         * next folio can be checked for contiguity.
         */
        relative_block = block_in_file - args->first_logical_block;
        nblocks = map_bh->b_size >> blkbits;
        if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
            (first_hole != blocks_per_page))
                args->bio = mpage_bio_submit_read(args->bio);
        else
                args->last_block_in_bio = first_block + blocks_per_page - 1;
out:
        return args->bio;

confused:
        /* Fallback: flush what we have and let the buffer_head path read. */
        if (args->bio)
                args->bio = mpage_bio_submit_read(args->bio);
        if (!folio_test_uptodate(folio))
                block_read_full_folio(folio, args->get_block);
        else
                folio_unlock(folio);
        goto out;
}

/**
 * mpage_readahead - start reads against pages
 * @rac: Describes which pages to read.
 * @get_block: The filesystem's block mapper function.
 *
 * This function walks the pages and the blocks within each page, building and
 * emitting large BIOs.
 *
 * If anything unusual happens, such as:
 *
 * - encountering a page which has buffers
 * - encountering a page which has a non-hole after a hole
 * - encountering a page with non-contiguous blocks
 *
 * then this code just gives up and calls the buffer_head-based read function.
 * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file on blocksize < PAGE_SIZE setups.
 *
 * BH_Boundary explanation:
 *
 * There is a problem.  The mpage read code assembles several pages, gets all
 * their disk mappings, and then submits them all.  That's fine, but obtaining
 * the disk mappings may require I/O.  Reads of indirect blocks, for example.
 *
 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
 * submitted in the following order:
 *
 *         12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
 *
 * because the indirect block has to be read to get the mappings of blocks
 * 13,14,15,16.  Obviously, this impacts performance.
 *
 * So what we do is to allow the filesystem's get_block() function to set
 * BH_Boundary when it maps block 11.  BH_Boundary says: mapping of the block
 * after this one will require I/O against a block which is probably close to
 * this one.  So you should push what I/O you have currently accumulated.
 *
 * This all causes the disk requests to be issued in the correct order.
 */
void mpage_readahead(struct readahead_control *rac, get_block_t get_block)
{
        struct mpage_readpage_args args = {
                .get_block = get_block,
                .is_readahead = true,
        };
        struct folio *folio;

        /*
         * Feed each folio of the readahead batch to the worker; the bio it
         * returns is carried into the next iteration through args.bio.
         */
        for (folio = readahead_folio(rac); folio != NULL;
             folio = readahead_folio(rac)) {
                prefetchw(&folio->flags);
                args.folio = folio;
                args.nr_pages = readahead_count(rac);
                args.bio = do_mpage_readpage(&args);
        }

        /* Submit whatever is still pending after the last folio. */
        if (args.bio == NULL)
                return;
        mpage_bio_submit_read(args.bio);
}
EXPORT_SYMBOL(mpage_readahead);

/*
 * This isn't called much at all
 */
/*
 * Read a single folio through do_mpage_readpage() and submit any bio it
 * leaves pending.  Always returns 0.
 */
int mpage_read_folio(struct folio *folio, get_block_t get_block)
{
        struct mpage_readpage_args args = {
                .folio = folio,
                .nr_pages = 1,
                .get_block = get_block,
        };
        struct bio *pending;

        pending = do_mpage_readpage(&args);
        args.bio = pending;
        if (pending != NULL)
                mpage_bio_submit_read(pending);

        return 0;
}
EXPORT_SYMBOL(mpage_read_folio);

/*
 * Writing is not so simple.
 *
 * If the page has buffers then they will be used for obtaining the disk
 * mapping.  We only support pages which are fully mapped-and-dirty, with a
 * special case for pages which are unmapped at the end: end-of-file.
 *
 * If the page has no buffers (preferred) then the page is mapped here.
 *
 * If all blocks are found to be contiguous then the page can go into the
 * BIO.  Otherwise fall back to block_write_full_folio().
 *
 * FIXME: This code wants an estimate of how many pages are still to be
 * written, so it can intelligently allocate a suitably-sized BIO.  For now,
 * just allocate full-size (BIO_MAX_VECS) BIOs.
 */

/*
 * State shared by successive __mpage_writepage() calls within one
 * mpage_writepages() pass.
 */
struct mpage_data {
        struct bio *bio;                /* bio being accumulated, or NULL */
        sector_t last_block_in_bio;     /* last disk block of the last full page added */
        get_block_t *get_block;         /* filesystem's block mapper */
};

/*
 * We have our BIO, so we can now mark the buffers clean.  Make
 * sure to only clean buffers which we know we'll be writing.
 */
static void clean_buffers(struct folio *folio, unsigned first_unmapped)
{
        unsigned buffer_counter = 0;
        struct buffer_head *bh, *head = folio_buffers(folio);

        if (!head)
                return;
        bh = head;

        do {
                if (buffer_counter++ == first_unmapped)
                        break;
                clear_buffer_dirty(bh);
                bh = bh->b_this_page;
        } while (bh != head);

        /*
         * we cannot drop the bh if the page is not uptodate or a concurrent
         * read_folio would fail to serialize with the bh and it would read from
         * disk before we reach the platter.
         */
        if (buffer_heads_over_limit && folio_test_uptodate(folio))
                try_to_free_buffers(folio);
}

/*
 * Write one folio; passed as the per-folio callback to write_cache_pages().
 *
 * @folio: folio to write.
 * @wbc:   writeback control, used for the write flags and cgroup accounting.
 * @data:  the struct mpage_data for this writeback pass.
 *
 * If the folio's blocks are mapped and contiguous on disk it is added to the
 * bio carried in @data (submitting and reallocating the bio as needed).
 * Every other case goes to "confused", which submits the pending bio and
 * writes the folio with block_write_full_folio().
 *
 * Returns 0 on the bio path, or block_write_full_folio()'s result on the
 * fallback path.  The (possibly NULL) bio is stored back in mpd->bio.
 */
static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
                      void *data)
{
        struct mpage_data *mpd = data;
        struct bio *bio = mpd->bio;
        struct address_space *mapping = folio->mapping;
        struct inode *inode = mapping->host;
        const unsigned blkbits = inode->i_blkbits;
        const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
        sector_t last_block;
        sector_t block_in_file;
        sector_t first_block;
        unsigned page_block;
        /* blocks_per_page here means "no unmapped block seen" */
        unsigned first_unmapped = blocks_per_page;
        struct block_device *bdev = NULL;
        int boundary = 0;
        sector_t boundary_block = 0;
        struct block_device *boundary_bdev = NULL;
        size_t length;
        struct buffer_head map_bh;
        loff_t i_size = i_size_read(inode);
        int ret = 0;
        struct buffer_head *head = folio_buffers(folio);

        if (head) {
                struct buffer_head *bh = head;

                /* If they're all mapped and dirty, do it */
                page_block = 0;
                do {
                        BUG_ON(buffer_locked(bh));
                        if (!buffer_mapped(bh)) {
                                /*
                                 * unmapped dirty buffers are created by
                                 * block_dirty_folio -> mmapped data
                                 */
                                if (buffer_dirty(bh))
                                        goto confused;
                                if (first_unmapped == blocks_per_page)
                                        first_unmapped = page_block;
                                continue;
                        }

                        if (first_unmapped != blocks_per_page)
                                goto confused;        /* hole -> non-hole */

                        if (!buffer_dirty(bh) || !buffer_uptodate(bh))
                                goto confused;
                        /* every block must directly follow the first on disk */
                        if (page_block) {
                                if (bh->b_blocknr != first_block + page_block)
                                        goto confused;
                        } else {
                                first_block = bh->b_blocknr;
                        }
                        page_block++;
                        boundary = buffer_boundary(bh);
                        if (boundary) {
                                boundary_block = bh->b_blocknr;
                                boundary_bdev = bh->b_bdev;
                        }
                        bdev = bh->b_bdev;
                } while ((bh = bh->b_this_page) != head);

                /* non-zero: at least the first buffer was mapped */
                if (first_unmapped)
                        goto page_is_mapped;

                /*
                 * Page has buffers, but they are all unmapped. The page was
                 * created by pagein or read over a hole which was handled by
                 * block_read_full_folio().  If this address_space is also
                 * using mpage_readahead then this can rarely happen.
                 */
                goto confused;
        }

        /*
         * The page has no buffers: map it to disk
         */
        BUG_ON(!folio_test_uptodate(folio));
        block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
        /*
         * Whole page beyond EOF? Skip allocating blocks to avoid leaking
         * space.
         */
        if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits)
                goto page_is_mapped;
        last_block = (i_size - 1) >> blkbits;
        map_bh.b_folio = folio;
        /* Map one block per get_block() call (create == 1), up to EOF. */
        for (page_block = 0; page_block < blocks_per_page; ) {

                map_bh.b_state = 0;
                map_bh.b_size = 1 << blkbits;
                if (mpd->get_block(inode, block_in_file, &map_bh, 1))
                        goto confused;
                if (!buffer_mapped(&map_bh))
                        goto confused;
                if (buffer_new(&map_bh))
                        clean_bdev_bh_alias(&map_bh);
                if (buffer_boundary(&map_bh)) {
                        boundary_block = map_bh.b_blocknr;
                        boundary_bdev = map_bh.b_bdev;
                }
                if (page_block) {
                        if (map_bh.b_blocknr != first_block + page_block)
                                goto confused;
                } else {
                        first_block = map_bh.b_blocknr;
                }
                page_block++;
                boundary = buffer_boundary(&map_bh);
                bdev = map_bh.b_bdev;
                if (block_in_file == last_block)
                        break;
                block_in_file++;
        }
        BUG_ON(page_block == 0);

        first_unmapped = page_block;

page_is_mapped:
        /* Don't bother writing beyond EOF, truncate will discard the folio */
        if (folio_pos(folio) >= i_size)
                goto confused;
        length = folio_size(folio);
        if (folio_pos(folio) + length > i_size) {
                /*
                 * The page straddles i_size.  It must be zeroed out on each
                 * and every writepage invocation because it may be mmapped.
                 * "A file is mapped in multiples of the page size.  For a file
                 * that is not a multiple of the page size, the remaining memory
                 * is zeroed when mapped, and writes to that region are not
                 * written out to the file."
                 */
                length = i_size - folio_pos(folio);
                folio_zero_segment(folio, length, folio_size(folio));
        }

        /*
         * This page will go to BIO.  Do we need to send this BIO off first?
         */
        if (bio && mpd->last_block_in_bio != first_block - 1)
                bio = mpage_bio_submit_write(bio);

alloc_new:
        if (bio == NULL) {
                bio = bio_alloc(bdev, BIO_MAX_VECS,
                                REQ_OP_WRITE | wbc_to_write_flags(wbc),
                                GFP_NOFS);
                /* convert the filesystem block number to 512-byte sectors */
                bio->bi_iter.bi_sector = first_block << (blkbits - 9);
                wbc_init_bio(wbc, bio);
                bio->bi_write_hint = inode->i_write_hint;
        }

        /*
         * Must try to add the page before marking the buffer clean or
         * the confused fail path above (OOM) will be very confused when
         * it finds all bh marked clean (i.e. it will not write anything)
         */
        wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
        /* only the mapped leading part of the folio is written */
        length = first_unmapped << blkbits;
        if (!bio_add_folio(bio, folio, length, 0)) {
                /* bio is full: submit it and retry with a fresh one */
                bio = mpage_bio_submit_write(bio);
                goto alloc_new;
        }

        clean_buffers(folio, first_unmapped);

        BUG_ON(folio_test_writeback(folio));
        folio_start_writeback(folio);
        folio_unlock(folio);
        /*
         * Submit right away after a boundary block or a partially mapped
         * folio; otherwise keep the bio open and record its last block so the
         * next folio can be checked for contiguity.
         */
        if (boundary || (first_unmapped != blocks_per_page)) {
                bio = mpage_bio_submit_write(bio);
                if (boundary_block) {
                        write_boundary_block(boundary_bdev,
                                        boundary_block, 1 << blkbits);
                }
        } else {
                mpd->last_block_in_bio = first_block + blocks_per_page - 1;
        }
        goto out;

confused:
        /* Fallback: flush the pending bio, then write via buffer_heads. */
        if (bio)
                bio = mpage_bio_submit_write(bio);

        /*
         * The caller has a ref on the inode, so *mapping is stable
         */
        ret = block_write_full_folio(folio, wbc, mpd->get_block);
        mapping_set_error(mapping, ret);
out:
        mpd->bio = bio;
        return ret;
}

/**
 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @get_block: the filesystem's block mapper function.
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 */
int
mpage_writepages(struct address_space *mapping,
                struct writeback_control *wbc, get_block_t get_block)
{
        struct mpage_data mpd = { .get_block = get_block };
        struct blk_plug io_plug;
        int err;

        /* Plug the queue for the whole walk over the dirty folios. */
        blk_start_plug(&io_plug);
        err = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);

        /* The last folio may have left a bio that was never submitted. */
        if (mpd.bio != NULL)
                mpage_bio_submit_write(mpd.bio);
        blk_finish_plug(&io_plug);

        return err;
}
EXPORT_SYMBOL(mpage_writepages);















































































































































































































































    4 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HIGHMEM_INTERNAL_H
#define _LINUX_HIGHMEM_INTERNAL_H

/*
 * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft.
 */
#ifdef CONFIG_KMAP_LOCAL
/* Out-of-line local-mapping primitives; implemented elsewhere. */
void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
void kunmap_local_indexed(const void *vaddr);
void kmap_local_fork(struct task_struct *tsk);
void __kmap_local_sched_out(void);
void __kmap_local_sched_in(void);
/*
 * Debug check: passes current->kmap_ctrl.idx to DEBUG_LOCKS_WARN_ON(), so a
 * non-zero index is flagged.
 * NOTE(review): presumably idx counts the task's live local mappings - confirm.
 */
static inline void kmap_assert_nomap(void)
{
        DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx);
}
#else
/* Without CONFIG_KMAP_LOCAL both hooks are empty stubs. */
static inline void kmap_local_fork(struct task_struct *tsk) { }
static inline void kmap_assert_nomap(void) { }
#endif

#ifdef CONFIG_HIGHMEM
#include <asm/highmem.h>

/* No-op fallback unless the architecture defines ARCH_HAS_KMAP_FLUSH_TLB. */
#ifndef ARCH_HAS_KMAP_FLUSH_TLB
static inline void kmap_flush_tlb(unsigned long addr) { }
#endif

/* Default page protection for kmap mappings unless already defined. */
#ifndef kmap_prot
#define kmap_prot PAGE_KERNEL
#endif

/* Out-of-line helpers used by the inline wrappers below. */
void *kmap_high(struct page *page);
void kunmap_high(struct page *page);
void __kmap_flush_unused(void);
struct page *__kmap_to_page(void *addr);

/*
 * Map @page and return its kernel virtual address.  May sleep.  Highmem
 * pages go through kmap_high(); all others use their existing
 * page_address().  kmap_flush_tlb() is called on the result either way.
 */
static inline void *kmap(struct page *page)
{
        void *vaddr;

        might_sleep();
        vaddr = PageHighMem(page) ? kmap_high(page) : page_address(page);
        kmap_flush_tlb((unsigned long)vaddr);

        return vaddr;
}

/*
 * Undo kmap().  May sleep.  Only highmem pages have anything to release;
 * for every other page this is a no-op.
 */
static inline void kunmap(struct page *page)
{
        might_sleep();
        if (PageHighMem(page))
                kunmap_high(page);
}

/* Return the struct page for @addr; delegates to __kmap_to_page(). */
static inline struct page *kmap_to_page(void *addr)
{
        struct page *page = __kmap_to_page(addr);

        return page;
}

/* Inline entry point that simply forwards to __kmap_flush_unused(). */
static inline void kmap_flush_unused(void)
{
        __kmap_flush_unused();
}

/* Create a local mapping of @page using the default kmap_prot protection. */
static inline void *kmap_local_page(struct page *page)
{
        void *vaddr = __kmap_local_page_prot(page, kmap_prot);

        return vaddr;
}

/*
 * Locally map the page of @folio that contains byte @offset and return the
 * address of that byte: the page is selected by offset / PAGE_SIZE and the
 * remainder is added to the mapped address.
 */
static inline void *kmap_local_folio(struct folio *folio, size_t offset)
{
        const size_t page_idx = offset / PAGE_SIZE;
        const size_t in_page = offset % PAGE_SIZE;
        void *base = __kmap_local_page_prot(folio_page(folio, page_idx),
                                            kmap_prot);

        return base + in_page;
}

/* Create a local mapping of @page with the caller-supplied protection @prot. */
static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
{
        void *vaddr = __kmap_local_page_prot(page, prot);

        return vaddr;
}

/* Create a local mapping of page frame @pfn using kmap_prot. */
static inline void *kmap_local_pfn(unsigned long pfn)
{
        void *vaddr = __kmap_local_pfn_prot(pfn, kmap_prot);

        return vaddr;
}

/*
 * Backend of the kunmap_local() macro: forwards @vaddr to
 * kunmap_local_indexed().
 */
static inline void __kunmap_local(const void *vaddr)
{
        kunmap_local_indexed(vaddr);
}

/*
 * Map @page with protection @prot.  Before mapping, migration (on
 * PREEMPT_RT) or preemption (otherwise) is disabled, followed by
 * pagefaults.  __kunmap_atomic() reverses these steps.
 */
static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                migrate_disable();
        pagefault_disable();

        return __kmap_local_page_prot(page, prot);
}

/* kmap_atomic_prot() with the default kmap_prot protection. */
static inline void *kmap_atomic(struct page *page)
{
        void *vaddr = kmap_atomic_prot(page, kmap_prot);

        return vaddr;
}

/*
 * Same as kmap_atomic() but takes a page frame number: disables migration
 * (PREEMPT_RT) or preemption, disables pagefaults, then maps @pfn with
 * kmap_prot.
 */
static inline void *kmap_atomic_pfn(unsigned long pfn)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                migrate_disable();
        pagefault_disable();

        return __kmap_local_pfn_prot(pfn, kmap_prot);
}

/*
 * Backend of the kunmap_atomic() macro: drops the mapping at @addr, then
 * re-enables pagefaults and finally migration (PREEMPT_RT) or preemption,
 * i.e. the reverse of kmap_atomic_prot().
 */
static inline void __kunmap_atomic(const void *addr)
{
        kunmap_local_indexed(addr);
        pagefault_enable();
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
        else
                migrate_enable();
}

/* Out-of-line counter backend and the global behind totalhigh_pages(). */
unsigned int __nr_free_highpages(void);
extern atomic_long_t _totalhigh_pages;

/* Return the value computed by __nr_free_highpages(). */
static inline unsigned int nr_free_highpages(void)
{
        unsigned int nr_free = __nr_free_highpages();

        return nr_free;
}

/* Read the _totalhigh_pages counter and return it as an unsigned long. */
static inline unsigned long totalhigh_pages(void)
{
        long nr_high = atomic_long_read(&_totalhigh_pages);

        return (unsigned long)nr_high;
}

/* Atomically add @count (which may be negative) to _totalhigh_pages. */
static inline void totalhigh_pages_add(long count)
{
        atomic_long_add(count, &_totalhigh_pages);
}

/*
 * Return true if @x lies in either kmap address window:
 * [PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)) or
 * [__fix_to_virt(FIX_KMAP_END), __fix_to_virt(FIX_KMAP_BEGIN)).
 */
static inline bool is_kmap_addr(const void *x)
{
        const unsigned long va = (unsigned long)x;

        if (va >= PKMAP_ADDR(0) && va < PKMAP_ADDR(LAST_PKMAP))
                return true;

        return va >= __fix_to_virt(FIX_KMAP_END) &&
               va < __fix_to_virt(FIX_KMAP_BEGIN);
}
#else /* CONFIG_HIGHMEM */

/* !CONFIG_HIGHMEM: translate @addr with virt_to_page(). */
static inline struct page *kmap_to_page(void *addr)
{
        struct page *page = virt_to_page(addr);

        return page;
}

/*
 * !CONFIG_HIGHMEM: return page_address(@page) directly.  might_sleep() is
 * still called, as in the highmem variant.
 */
static inline void *kmap(struct page *page)
{
        void *vaddr;

        might_sleep();
        vaddr = page_address(page);

        return vaddr;
}

/* !CONFIG_HIGHMEM: nothing to unmap. */
static inline void kunmap_high(struct page *page)
{
}
/* !CONFIG_HIGHMEM: nothing to flush. */
static inline void kmap_flush_unused(void)
{
}

/*
 * !CONFIG_HIGHMEM: no mapping to drop.  Architectures that define
 * ARCH_HAS_FLUSH_ON_KUNMAP get kunmap_flush_on_unmap() on the page address.
 */
static inline void kunmap(struct page *page)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        void *vaddr = page_address(page);

        kunmap_flush_on_unmap(vaddr);
#endif
}

/* !CONFIG_HIGHMEM: the "mapping" is simply page_address(@page). */
static inline void *kmap_local_page(struct page *page)
{
        void *vaddr = page_address(page);

        return vaddr;
}

/*
 * !CONFIG_HIGHMEM: return the address @offset bytes past the address of the
 * folio's first page.
 */
static inline void *kmap_local_folio(struct folio *folio, size_t offset)
{
        void *base = page_address(&folio->page);

        return base + offset;
}

/* !CONFIG_HIGHMEM: @prot is ignored; same result as kmap_local_page(). */
static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
{
        void *vaddr = kmap_local_page(page);

        return vaddr;
}

/* !CONFIG_HIGHMEM: convert @pfn to its page and use kmap_local_page(). */
static inline void *kmap_local_pfn(unsigned long pfn)
{
        struct page *page = pfn_to_page(pfn);

        return kmap_local_page(page);
}

/*
 * !CONFIG_HIGHMEM backend of kunmap_local(): nothing to unmap.  With
 * ARCH_HAS_FLUSH_ON_KUNMAP, kunmap_flush_on_unmap() is called on @addr
 * rounded down to its page boundary.
 */
static inline void __kunmap_local(const void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        const void *page_start = PTR_ALIGN_DOWN(addr, PAGE_SIZE);

        kunmap_flush_on_unmap(page_start);
#endif
}

/*
 * !CONFIG_HIGHMEM: disable migration (PREEMPT_RT) or preemption, disable
 * pagefaults, and return page_address(@page).
 */
static inline void *kmap_atomic(struct page *page)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                migrate_disable();
        pagefault_disable();

        return page_address(page);
}

/* !CONFIG_HIGHMEM: @prot is ignored; same result as kmap_atomic(). */
static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
        void *vaddr = kmap_atomic(page);

        return vaddr;
}

/* !CONFIG_HIGHMEM: convert @pfn to its page and use kmap_atomic(). */
static inline void *kmap_atomic_pfn(unsigned long pfn)
{
        struct page *page = pfn_to_page(pfn);

        return kmap_atomic(page);
}

/*
 * !CONFIG_HIGHMEM backend of kunmap_atomic(): optionally flushes the page
 * containing @addr (ARCH_HAS_FLUSH_ON_KUNMAP), then re-enables pagefaults
 * and migration (PREEMPT_RT) or preemption.
 */
static inline void __kunmap_atomic(const void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        const void *page_start = PTR_ALIGN_DOWN(addr, PAGE_SIZE);

        kunmap_flush_on_unmap(page_start);
#endif
        pagefault_enable();
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
        else
                migrate_enable();
}

/* !CONFIG_HIGHMEM: always reports zero free highmem pages. */
static inline unsigned int nr_free_highpages(void)
{
        return 0;
}
/* !CONFIG_HIGHMEM: always reports zero highmem pages. */
static inline unsigned long totalhigh_pages(void)
{
        return 0UL;
}

/* !CONFIG_HIGHMEM: no address is ever reported as a kmap address. */
static inline bool is_kmap_addr(const void *x)
{
        const bool in_kmap_range = false;

        return in_kmap_range;
}

#endif /* CONFIG_HIGHMEM */

/**
 * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated!
 * @__addr:       Virtual address to be unmapped
 *
 * Unmaps an address previously mapped by kmap_atomic() and re-enables
 * pagefaults. Depending on the PREEMPT_RT configuration, it also re-enables
 * either migration or preemption. Users should not count on these side
 * effects.
 *
 * Mappings should be unmapped in the reverse order that they were mapped.
 * See kmap_local_page() for details on nesting.
 *
 * @__addr can be any address within the mapped page, so there is no need
 * to subtract any offset that has been added. In contrast to kunmap(),
 * this function takes the address returned from kmap_atomic(), not the
 * page passed to it. The build fails (BUILD_BUG_ON) if you pass the page.
 */
#define kunmap_atomic(__addr)                                        \
do {                                                                \
        BUILD_BUG_ON(__same_type((__addr), struct page *));        \
        __kunmap_atomic(__addr);                                \
} while (0)

/**
 * kunmap_local - Unmap a page mapped via kmap_local_page().
 * @__addr: An address within the page mapped
 *
 * @__addr can be any address within the mapped page.  Commonly it is the
 * address returned by kmap_local_page(), but it can also include offsets.
 * Passing a struct page pointer instead of an address fails the build
 * (BUILD_BUG_ON).
 *
 * Unmapping should be done in the reverse order of the mapping.  See
 * kmap_local_page() for details.
 */
#define kunmap_local(__addr)                                        \
do {                                                                \
        BUILD_BUG_ON(__same_type((__addr), struct page *));        \
        __kunmap_local(__addr);                                        \
} while (0)

#endif






































































































































































































































































































































































































































































































































































































    1 
    1 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *
 * Copyright (C) 2007 Alan Stern
 * Copyright (C) 2009 IBM Corporation
 * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
 *
 * Authors: Alan Stern <stern@rowland.harvard.edu>
 *          K.Prasad <prasad@linux.vnet.ibm.com>
 *          Frederic Weisbecker <fweisbec@gmail.com>
 */

/*
 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
 * using the CPU's debug registers.
 */

#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/irqflags.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/smp.h>

#include <asm/hw_breakpoint.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/user.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>

/* Per cpu debug control register value */
DEFINE_PER_CPU(unsigned long, cpu_dr7);
EXPORT_PER_CPU_SYMBOL(cpu_dr7);

/* Per cpu debug address registers values */
static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);

/*
 * Stores the breakpoint currently installed in each breakpoint address
 * register, separately for each CPU
 */
static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);


/*
 * Build the DR7 bits for debug register @drnum: the 4-bit len/type control
 * field shifted into that register's control slot, combined with the
 * DR_GLOBAL_ENABLE bit shifted into that register's enable slot.
 * DR_GLOBAL_SLOWDOWN is not part of the result.
 */
static inline unsigned long
__encode_dr7(int drnum, unsigned int len, unsigned int type)
{
        unsigned long ctrl = (len | type) & 0xf;
        unsigned long enable = DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE);

        ctrl <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);

        return ctrl | enable;
}

/*
 * Encode the length, type and enable bits of breakpoint register @drnum in
 * the layout used by debug register 7, and additionally set
 * DR_GLOBAL_SLOWDOWN.
 */
unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
{
        unsigned long dr7 = __encode_dr7(drnum, len, type);

        dr7 |= DR_GLOBAL_SLOWDOWN;

        return dr7;
}

/*
 * Extract the length and type of breakpoint @bpnum from a debug register 7
 * value.  *len receives the two length bits tagged with 0x40, *type the two
 * type bits tagged with 0x80.  The return value is the 2-bit "enabled"
 * field of that breakpoint.
 */
int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
{
        int ctrl = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
        unsigned long enable = dr7 >> (bpnum * DR_ENABLE_SIZE);

        *len = 0x40 | (ctrl & 0xc);
        *type = 0x80 | (ctrl & 0x3);

        return enable & 0x3;
}

/*
 * Install a perf counter breakpoint.
 *
 * We look for a free debug address register and use it for this
 * breakpoint. Finally we enable it in the debug control register.
 *
 * Atomic: we hold the counter->ctx->lock and we only handle variables
 * and registers local to this cpu.
 *
 * Returns 0 on success, or -EBUSY (after a one-time warning) when none of
 * the HBP_NUM per-CPU slots is free.
 */
int arch_install_hw_breakpoint(struct perf_event *bp)
{
        struct arch_hw_breakpoint *info = counter_arch_bp(bp);
        unsigned long *dr7;
        int i;

        lockdep_assert_irqs_disabled();

        /* Claim the first empty per-CPU slot; its index is the register number. */
        for (i = 0; i < HBP_NUM; i++) {
                struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);

                if (!*slot) {
                        *slot = bp;
                        break;
                }
        }

        if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
                return -EBUSY;

        /* Program the address register and keep the per-CPU shadow copy in sync. */
        set_debugreg(info->address, i);
        __this_cpu_write(cpu_debugreg[i], info->address);

        dr7 = this_cpu_ptr(&cpu_dr7);
        *dr7 |= encode_dr7(i, info->len, info->type);

        /*
         * Ensure we first write cpu_dr7 before we set the DR7 register.
         * This ensures an NMI never sees cpu_dr7 as 0 when DR7 is not.
         */
        barrier();

        set_debugreg(*dr7, 7);
        /* A non-zero mask is also programmed as the address mask for register i. */
        if (info->mask)
                amd_set_dr_addr_mask(info->mask, i);

        return 0;
}

/*
 * Uninstall the breakpoint contained in the given counter.
 *
 * First we search for the debug address register it uses and then we
 * disable it.
 *
 * Atomic: we hold the counter->ctx->lock and we only handle variables
 * and registers local to this cpu.
 *
 * If @bp is not found in any per-CPU slot, a one-time warning is emitted
 * and nothing else is changed.
 */
void arch_uninstall_hw_breakpoint(struct perf_event *bp)
{
        struct arch_hw_breakpoint *info = counter_arch_bp(bp);
        unsigned long dr7;
        int i;

        lockdep_assert_irqs_disabled();

        /* Find the slot holding @bp and release it. */
        for (i = 0; i < HBP_NUM; i++) {
                struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);

                if (*slot == bp) {
                        *slot = NULL;
                        break;
                }
        }

        if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
                return;

        /*
         * Clear only this register's control and enable bits: __encode_dr7()
         * does not include DR_GLOBAL_SLOWDOWN, so that bit is left untouched.
         */
        dr7 = this_cpu_read(cpu_dr7);
        dr7 &= ~__encode_dr7(i, info->len, info->type);

        set_debugreg(dr7, 7);
        /* Reset the address mask that was programmed at install time. */
        if (info->mask)
                amd_set_dr_addr_mask(0, i);

        /*
         * Ensure the write to cpu_dr7 is after we've set the DR7 register.
         * This ensures an NMI never sees cpu_dr7 as 0 when DR7 is not.
         */
        barrier();

        this_cpu_write(cpu_dr7, dr7);
}

/*
 * Translate an x86 breakpoint length encoding into the generic
 * HW_BREAKPOINT_LEN_* value.  Returns -EINVAL for an unknown encoding; the
 * 8-byte length is only recognised on CONFIG_X86_64.
 */
static int arch_bp_generic_len(int x86_len)
{
        if (x86_len == X86_BREAKPOINT_LEN_1)
                return HW_BREAKPOINT_LEN_1;
        if (x86_len == X86_BREAKPOINT_LEN_2)
                return HW_BREAKPOINT_LEN_2;
        if (x86_len == X86_BREAKPOINT_LEN_4)
                return HW_BREAKPOINT_LEN_4;
#ifdef CONFIG_X86_64
        if (x86_len == X86_BREAKPOINT_LEN_8)
                return HW_BREAKPOINT_LEN_8;
#endif
        return -EINVAL;
}

/*
 * Convert an x86 (len, type) pair into the generic breakpoint length and
 * type.  Returns 0 on success and -EINVAL for an unknown type, an unknown
 * length, or an execute breakpoint whose length is not
 * X86_BREAKPOINT_LEN_X.
 *
 * Note that for write and read/write breakpoints *gen_type is stored before
 * the length is validated.
 */
int arch_bp_generic_fields(int x86_len, int x86_type,
                           int *gen_len, int *gen_type)
{
        int generic_len;

        /* Execute breakpoints carry a fixed generic length. */
        if (x86_type == X86_BREAKPOINT_EXECUTE) {
                if (x86_len != X86_BREAKPOINT_LEN_X)
                        return -EINVAL;

                *gen_type = HW_BREAKPOINT_X;
                *gen_len = sizeof(long);
                return 0;
        }

        /* Data breakpoints: translate the type first ... */
        if (x86_type == X86_BREAKPOINT_WRITE)
                *gen_type = HW_BREAKPOINT_W;
        else if (x86_type == X86_BREAKPOINT_RW)
                *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
        else
                return -EINVAL;

        /* ... then the length. */
        generic_len = arch_bp_generic_len(x86_len);
        if (generic_len < 0)
                return -EINVAL;

        *gen_len = generic_len;
        return 0;
}

/*
 * Check whether the breakpoint's virtual address range reaches kernel
 * space, i.e. whether its first or its last byte is at or above
 * TASK_SIZE_MAX.  Returns 1 if so and 0 otherwise.
 */
int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
{
        unsigned long first = hw->address;
        int len = arch_bp_generic_len(hw->len);
        unsigned long last;

        WARN_ON_ONCE(len < 0);

        /*
         * first + len - 1 cannot overflow: the address is already required
         * to be aligned to a multiple of len.
         */
        last = first + len - 1;

        if (first >= TASK_SIZE_MAX)
                return 1;

        return last >= TASK_SIZE_MAX;
}

/*
 * Returns true when the inclusive range [addr, end] overlaps the half-open
 * area [base, base + size).
 */
static inline bool within_area(unsigned long addr, unsigned long end,
                               unsigned long base, unsigned long size)
{
        /* The range ends before the area begins. */
        if (end < base)
                return false;

        /* Otherwise they overlap iff the range starts before the area ends. */
        return addr < (base + size);
}

/*
 * Returns true when the inclusive range [addr, end] overlaps the
 * fixed-mapped CPU entry area or any of the other ranges that are used
 * during CPU entry.
 */
static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
{
        bool hit;
        int cpu;

        /* The CPU entry area is always used for CPU entry */
        hit = within_area(addr, end, CPU_ENTRY_AREA_BASE,
                          CPU_ENTRY_AREA_MAP_SIZE);
        if (hit)
                return true;

        /*
         * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU
         * GSBASE value via __per_cpu_offset or pcpu_unit_offsets.
         */
#ifdef CONFIG_SMP
        hit = within_area(addr, end, (unsigned long)__per_cpu_offset,
                          sizeof(unsigned long) * nr_cpu_ids);
#else
        hit = within_area(addr, end, (unsigned long)&pcpu_unit_offsets,
                          sizeof(pcpu_unit_offsets));
#endif
        if (hit)
                return true;

        for_each_possible_cpu(cpu) {
                /*
                 * Per-CPU objects checked, in this order:
                 *  - the original rw GDT, which is in use after
                 *    load_direct_gdt();
                 *  - cpu_tss_rw, which is not directly referenced by
                 *    hardware but is used in CPU entry code;
                 *  - cpu_tlbstate: its user_pcid_flush_mask is used for CPU
                 *    entry and a data breakpoint on it would cause an
                 *    unwanted #DB, so the whole structure is protected;
                 *  - cpu_dr7: when running as a guest
                 *    (X86_FEATURE_HYPERVISOR), local_db_save() reads it
                 *    before clearing the dr7 register.
                 */
                if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
                                GDT_SIZE) ||
                    within_area(addr, end,
                                (unsigned long)&per_cpu(cpu_tss_rw, cpu),
                                sizeof(struct tss_struct)) ||
                    within_area(addr, end,
                                (unsigned long)&per_cpu(cpu_tlbstate, cpu),
                                sizeof(struct tlb_state)) ||
                    within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu),
                                sizeof(cpu_dr7)))
                        return true;
        }

        return false;
}

/*
 * Fill in @hw (address, mask, type, len) from the generic breakpoint
 * attributes in @attr.
 *
 * @bp is not referenced by this function.
 *
 * Returns 0 on success, -EINVAL when the request is rejected (address range
 * wraps, overlaps CPU entry data, unsupported type/length, misaligned range,
 * or a kprobe-blacklisted kernel execute address) and -EOPNOTSUPP when a
 * range breakpoint is requested but X86_FEATURE_BPEXT is not available.
 */
static int arch_build_bp_info(struct perf_event *bp,
                              const struct perf_event_attr *attr,
                              struct arch_hw_breakpoint *hw)
{
        unsigned long bp_end;

        /* Last byte covered by the breakpoint; reject ranges that wrap around. */
        bp_end = attr->bp_addr + attr->bp_len - 1;
        if (bp_end < attr->bp_addr)
                return -EINVAL;

        /*
         * Prevent any breakpoint of any type that overlaps the CPU
         * entry area and data.  This protects the IST stacks and also
         * reduces the chance that we ever find out what happens if
         * there's a data breakpoint on the GDT, IDT, or TSS.
         */
        if (within_cpu_entry(attr->bp_addr, bp_end))
                return -EINVAL;

        hw->address = attr->bp_addr;
        hw->mask = 0;

        /* Type */
        switch (attr->bp_type) {
        case HW_BREAKPOINT_W:
                hw->type = X86_BREAKPOINT_WRITE;
                break;
        case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
                hw->type = X86_BREAKPOINT_RW;
                break;
        case HW_BREAKPOINT_X:
                /*
                 * We don't allow kernel breakpoints in places that are not
                 * acceptable for kprobes.  On non-kprobes kernels, we don't
                 * allow kernel breakpoints at all.
                 */
                if (attr->bp_addr >= TASK_SIZE_MAX) {
                        if (within_kprobe_blacklist(attr->bp_addr))
                                return -EINVAL;
                }

                hw->type = X86_BREAKPOINT_EXECUTE;
                /*
                 * x86 inst breakpoints need to have a specific undefined len.
                 * But we still need to check userspace is not trying to setup
                 * an unsupported length, to get a range breakpoint for example.
                 */
                if (attr->bp_len == sizeof(long)) {
                        hw->len = X86_BREAKPOINT_LEN_X;
                        return 0;
                }
                /* Any other length for an execute breakpoint is invalid. */
                fallthrough;
        default:
                return -EINVAL;
        }

        /* Len (only reached for write and read/write breakpoints) */
        switch (attr->bp_len) {
        case HW_BREAKPOINT_LEN_1:
                hw->len = X86_BREAKPOINT_LEN_1;
                break;
        case HW_BREAKPOINT_LEN_2:
                hw->len = X86_BREAKPOINT_LEN_2;
                break;
        case HW_BREAKPOINT_LEN_4:
                hw->len = X86_BREAKPOINT_LEN_4;
                break;
#ifdef CONFIG_X86_64
        case HW_BREAKPOINT_LEN_8:
                hw->len = X86_BREAKPOINT_LEN_8;
                break;
#endif
        default:
                /* AMD range breakpoint */
                if (!is_power_of_2(attr->bp_len))
                        return -EINVAL;
                if (attr->bp_addr & (attr->bp_len - 1))
                        return -EINVAL;

                if (!boot_cpu_has(X86_FEATURE_BPEXT))
                        return -EOPNOTSUPP;

                /*
                 * It's impossible to use a range breakpoint to fake out
                 * user vs kernel detection because bp_len - 1 can't
                 * have the high bit set.  If we ever allow range instruction
                 * breakpoints, then we'll have to check for kprobe-blacklisted
                 * addresses anywhere in the range.
                 */
                hw->mask = attr->bp_len - 1;
                hw->len = X86_BREAKPOINT_LEN_1;
        }

        return 0;
}

/*
 * Validate the arch-specific HW breakpoint register settings.
 *
 * Builds @hw from @attr and then checks that the breakpoint address is
 * aligned as required by the resulting length (or by the range mask).
 * Returns 0 on success, the error from arch_build_bp_info(), or -EINVAL
 * for an unexpected length or a misaligned address.
 */
int hw_breakpoint_arch_parse(struct perf_event *bp,
                             const struct perf_event_attr *attr,
                             struct arch_hw_breakpoint *hw)
{
        unsigned int low_bits;
        int err;

        err = arch_build_bp_info(bp, attr, hw);
        if (err)
                return err;

        /* Work out which low-order address bits must be clear. */
        switch (hw->len) {
        case X86_BREAKPOINT_LEN_1:
                /* Zero unless a range mask was set up. */
                low_bits = hw->mask;
                break;
        case X86_BREAKPOINT_LEN_2:
                low_bits = 1;
                break;
        case X86_BREAKPOINT_LEN_4:
                low_bits = 3;
                break;
#ifdef CONFIG_X86_64
        case X86_BREAKPOINT_LEN_8:
                low_bits = 7;
                break;
#endif
        default:
                WARN_ON_ONCE(1);
                return -EINVAL;
        }

        /* The address must be aligned to the length implied by len. */
        return (hw->address & low_bits) ? -EINVAL : 0;
}

/*
 * Release the user breakpoints used by ptrace: unregister every entry of
 * the task's ptrace_bps[] array, clear the entries, and reset the task's
 * virtual_dr6 and ptrace_dr7 values to zero.
 */
void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
{
        struct thread_struct *thread = &tsk->thread;
        int slot = 0;

        while (slot < HBP_NUM) {
                unregister_hw_breakpoint(thread->ptrace_bps[slot]);
                thread->ptrace_bps[slot] = NULL;
                slot++;
        }

        thread->virtual_dr6 = 0;
        thread->ptrace_dr7 = 0;
}

/*
 * Reload this CPU's debug registers from the per-CPU shadow copies:
 * address registers 0-3 from cpu_debugreg[], DR6 with DR6_RESERVED, and
 * finally DR7 from cpu_dr7.
 */
void hw_breakpoint_restore(void)
{
        set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0);
        set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1);
        set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2);
        set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3);
        set_debugreg(DR6_RESERVED, 6);
        /* The control register is written last, after all addresses are in place. */
        set_debugreg(__this_cpu_read(cpu_dr7), 7);
}
EXPORT_SYMBOL_GPL(hw_breakpoint_restore);

/*
 * Handle debug exception notifications.
 *
 * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
 *
 * NOTIFY_DONE returned if one of the following conditions is true.
 * i) When the causative address is from user-space and the exception
 * is a valid one, i.e. not triggered as a result of lazy debug register
 * switching
 * ii) When there are more bits than trap<n> set in DR6 register (such
 * as BD, BS or BT) indicating that more than one debug condition is
 * met and requires some more action in do_debug().
 *
 * NOTIFY_STOP returned for all other cases
 *
 */
static int hw_breakpoint_handler(struct die_args *args)
{
        int i, rc = NOTIFY_STOP;
        struct perf_event *bp;
        unsigned long *dr6_p;   /* caller's DR6 value, updated in place */
        unsigned long dr6;      /* snapshot of *dr6_p taken on entry */
        bool bpx;               /* current bp is an execute breakpoint */

        /* The DR6 value is pointed by args->err */
        dr6_p = (unsigned long *)ERR_PTR(args->err);
        dr6 = *dr6_p;

        /* Do an early return if no trap bits are set in DR6 */
        if ((dr6 & DR_TRAP_BITS) == 0)
                return NOTIFY_DONE;

        /* Handle all the breakpoints that were triggered */
        for (i = 0; i < HBP_NUM; ++i) {
                if (likely(!(dr6 & (DR_TRAP0 << i))))
                        continue;

                /* Skip trap bits for which no perf event is installed here. */
                bp = this_cpu_read(bp_per_reg[i]);
                if (!bp)
                        continue;

                bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE;

                /*
                 * TF and data breakpoints are traps and can be merged, however
                 * instruction breakpoints are faults and will be raised
                 * separately.
                 *
                 * However DR6 can indicate both TF and instruction
                 * breakpoints. In that case take TF as that has precedence and
                 * delay the instruction breakpoint for the next exception.
                 */
                if (bpx && (dr6 & DR_STEP))
                        continue;

                /*
                 * Reset the 'i'th TRAP bit in dr6 to denote completion of
                 * exception handling
                 */
                (*dr6_p) &= ~(DR_TRAP0 << i);

                perf_bp_event(bp, args->regs);

                /*
                 * Set up resume flag to avoid breakpoint recursion when
                 * returning back to origin.
                 */
                if (bpx)
                        args->regs->flags |= X86_EFLAGS_RF;
        }

        /*
         * Further processing in do_debug() is needed for a) user-space
         * breakpoints (to generate signals) and b) when the system has
         * taken exception due to multiple causes
         */
        if ((current->thread.virtual_dr6 & DR_TRAP_BITS) ||
            (dr6 & (~DR_TRAP_BITS)))
                rc = NOTIFY_DONE;

        return rc;
}

/*
 * Notifier callback for debug exceptions: events other than DIE_DEBUG are
 * ignored with NOTIFY_DONE, DIE_DEBUG is passed on to
 * hw_breakpoint_handler() with @data as its argument.
 */
int hw_breakpoint_exceptions_notify(
                struct notifier_block *unused, unsigned long val, void *data)
{
        return (val == DIE_DEBUG) ? hw_breakpoint_handler(data) : NOTIFY_DONE;
}

/* Read hook for breakpoint events; currently an empty placeholder. */
void hw_breakpoint_pmu_read(struct perf_event *bp)
{
        /* TODO */
}












































































































































































   13 

























   13 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#ifndef BLK_THROTTLE_H
#define BLK_THROTTLE_H

#include "blk-cgroup-rwstat.h"

/*
 * To implement hierarchical throttling, throtl_grps form a tree and bios
 * are dispatched upwards level by level until they reach the top and get
 * issued.  When dispatching bios from the children and local group at each
 * level, if the bios are dispatched into a single bio_list, there's a risk
 * of a local or child group which can queue many bios at once filling up
 * the list starving others.
 *
 * To avoid such starvation, dispatched bios are queued separately
 * according to where they came from.  When they are again dispatched to
 * the parent, they're popped in round-robin order so that no single source
 * hogs the dispatch window.
 *
 * throtl_qnode is used to keep the queued bios separated by their sources.
 * Bios are queued to throtl_qnode which in turn is queued to
 * throtl_service_queue and then dispatched in round-robin order.
 *
 * It's also used to track the reference counts on blkg's.  A qnode always
 * belongs to a throtl_grp and gets queued on itself or the parent, so
 * incrementing the reference of the associated throtl_grp when a qnode is
 * queued and decrementing when dequeued is enough to keep the whole blkg
 * tree pinned while bios are in flight.
 */
/* Holds the bios queued from one source (one throtl_grp) on a service queue. */
struct throtl_qnode {
        struct list_head        node;                /* service_queue->queued[] */
        struct bio_list                bios;                /* queued bios */
        struct throtl_grp        *tg;                /* tg this qnode belongs to */
};

/*
 * Queueing state of one level of the throttling hierarchy: the qnodes with
 * queued bios and the tree of pending child groups.  The two-element
 * arrays are indexed by direction (READ/WRITE).
 */
struct throtl_service_queue {
        struct throtl_service_queue *parent_sq;        /* the parent service_queue */

        /*
         * Bios queued directly to this service_queue or dispatched from
         * children throtl_grp's.
         */
        struct list_head        queued[2];        /* throtl_qnode [READ/WRITE] */
        unsigned int                nr_queued[2];        /* number of queued bios */

        /*
         * RB tree of active children throtl_grp's, which are sorted by
         * their ->disptime.
         */
        struct rb_root_cached        pending_tree;        /* RB tree of active tgs */
        unsigned int                nr_pending;        /* # queued in the tree */
        unsigned long                first_pending_disptime;        /* disptime of the first tg */
        struct timer_list        pending_timer;        /* fires on first_pending_disptime */
};

/*
 * State bits of a throttle group.
 * NOTE(review): presumably stored in throtl_grp->flags - confirm in the users.
 */
enum tg_state_flags {
        THROTL_TG_PENDING        = 1 << 0,        /* on parent's pending tree */
        THROTL_TG_WAS_EMPTY        = 1 << 1,        /* bio_lists[] became non-empty */
        THROTL_TG_CANCELING        = 1 << 2,        /* starts to cancel bio */
};

/*
 * Per-blkg throttling state.  Two-element arrays are indexed by I/O
 * direction (the value of bio_data_dir(), i.e. READ/WRITE).
 */
struct throtl_grp {
        /* must be the first member */
        struct blkg_policy_data pd;

        /* active throtl group service_queue member */
        struct rb_node rb_node;

        /* throtl_data this group belongs to */
        struct throtl_data *td;

        /* this group's service queue */
        struct throtl_service_queue service_queue;

        /*
         * qnode_on_self is used when bios are directly queued to this
         * throtl_grp so that local bios compete fairly with bios
         * dispatched from children.  qnode_on_parent is used when bios are
         * dispatched from this throtl_grp into its parent and will compete
         * with the sibling qnode_on_parents and the parent's
         * qnode_on_self.
         */
        struct throtl_qnode qnode_on_self[2];
        struct throtl_qnode qnode_on_parent[2];

        /*
         * Dispatch time in jiffies. This is the estimated time when group
         * will unthrottle and is ready to dispatch more bio. It is used as
         * key to sort active groups in service tree.
         */
        unsigned long disptime;

        /* NOTE(review): presumably a mask of enum tg_state_flags bits - confirm */
        unsigned int flags;

        /* are there any throtl rules between this group and td? */
        bool has_rules_bps[2];
        bool has_rules_iops[2];

        /* bytes per second rate limits */
        uint64_t bps[2];

        /* IOPS limits */
        unsigned int iops[2];

        /* Number of bytes dispatched in current slice */
        uint64_t bytes_disp[2];
        /* Number of bio's dispatched in current slice */
        unsigned int io_disp[2];

        unsigned long last_low_overflow_time[2];

        uint64_t last_bytes_disp[2];
        unsigned int last_io_disp[2];

        /*
         * The following two fields are updated when new configuration is
         * submitted while some bios are still throttled, they record how many
         * bytes/ios are waited already in previous configuration, and they will
         * be used to calculate wait time under new configuration.
         */
        long long carryover_bytes[2];
        int carryover_ios[2];

        unsigned long last_check_time;

        /* When did we start a new slice */
        unsigned long slice_start[2];
        unsigned long slice_end[2];

        /*
         * Byte and I/O counters; blk_should_throtl() adds to them when the
         * io controller is not on the default cgroup hierarchy.
         */
        struct blkg_rwstat stat_bytes;
        struct blkg_rwstat stat_ios;
};

extern struct blkcg_policy blkcg_policy_throtl;

/* Map a policy-data pointer to its enclosing throtl_grp; NULL maps to NULL. */
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
        if (!pd)
                return NULL;

        return container_of(pd, struct throtl_grp, pd);
}

static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
        return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}

/*
 * Internal throttling interface
 *
 * Without CONFIG_BLK_DEV_THROTTLING the entry points are empty inline stubs
 * and blk_throtl_bio() always returns false.  With it, only prototypes are
 * given here for blk_throtl_exit(), __blk_throtl_bio() and
 * blk_throtl_cancel_bios().
 */
#ifndef CONFIG_BLK_DEV_THROTTLING
static inline void blk_throtl_exit(struct gendisk *disk) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
static inline void blk_throtl_cancel_bios(struct gendisk *disk) { }
#else /* CONFIG_BLK_DEV_THROTTLING */
void blk_throtl_exit(struct gendisk *disk);
bool __blk_throtl_bio(struct bio *bio);
void blk_throtl_cancel_bios(struct gendisk *disk);

/* Throttling counts as activated for @q once q->td has been set. */
static inline bool blk_throtl_activated(struct request_queue *q)
{
        return q->td ? true : false;
}

/*
 * Decide whether @bio has to go through the throttling path.
 *
 * Returns false when throttling is not activated on the bio's queue.
 * Otherwise, when the io controller is not on the default hierarchy, the
 * group's byte statistics are updated (once per bio, tracked with
 * BIO_CGROUP_ACCT) and its I/O count is updated on every call.  The result
 * is true if the group has an iops rule for the bio's direction, or a bps
 * rule and the bio is not flagged BIO_BPS_THROTTLED.
 */
static inline bool blk_should_throtl(struct bio *bio)
{
        int dir = bio_data_dir(bio);
        struct throtl_grp *tg;

        /*
         * This is called under bio_queue_enter(), and it's synchronized with
         * the activation of blk-throtl, which is protected by
         * blk_mq_freeze_queue().
         */
        if (!blk_throtl_activated(bio->bi_bdev->bd_queue))
                return false;

        tg = blkg_to_tg(bio->bi_blkg);

        if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
                /* Account the bytes only the first time this bio is seen. */
                if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
                        bio_set_flag(bio, BIO_CGROUP_ACCT);
                        blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
                                        bio->bi_iter.bi_size);
                }
                blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
        }

        /* iops limit is always counted */
        if (tg->has_rules_iops[dir])
                return true;

        /* A bps rule only applies to a bio not yet flagged BIO_BPS_THROTTLED. */
        return tg->has_rules_bps[dir] && !bio_flagged(bio, BIO_BPS_THROTTLED);
}

/*
 * Throttling entry point: __blk_throtl_bio() is only called when
 * blk_should_throtl() says the bio is subject to throttling; otherwise the
 * result is false.
 */
static inline bool blk_throtl_bio(struct bio *bio)
{
        return blk_should_throtl(bio) && __blk_throtl_bio(bio);
}
#endif /* CONFIG_BLK_DEV_THROTTLING */

#endif
































































































































































































































































































































































































































































    4 




















































































































































































































    4 
















    2 



































































































    4 






























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk).
 *
 * (C) SGI 2006, Christoph Lameter
 *         Cleaned up and restructured to ease the addition of alternative
 *         implementations of SLAB allocators.
 * (C) Linux Foundation 2008-2013
 *      Unified interface for all slab allocators
 */

#ifndef _LINUX_SLAB_H
#define        _LINUX_SLAB_H

#include <linux/cache.h>
#include <linux/gfp.h>
#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/percpu-refcount.h>
#include <linux/cleanup.h>
#include <linux/hash.h>

/*
 * Bit numbers of the slab cache flags.  Entries inside #ifdef/#ifndef only
 * exist when the corresponding config option allows it, so the numeric
 * value of later entries depends on the kernel configuration.
 * _SLAB_FLAGS_LAST_BIT is the terminating entry, one past the last flag bit.
 */
enum _slab_flag_bits {
        _SLAB_CONSISTENCY_CHECKS,
        _SLAB_RED_ZONE,
        _SLAB_POISON,
        _SLAB_KMALLOC,
        _SLAB_HWCACHE_ALIGN,
        _SLAB_CACHE_DMA,
        _SLAB_CACHE_DMA32,
        _SLAB_STORE_USER,
        _SLAB_PANIC,
        _SLAB_TYPESAFE_BY_RCU,
        _SLAB_TRACE,
#ifdef CONFIG_DEBUG_OBJECTS
        _SLAB_DEBUG_OBJECTS,
#endif
        _SLAB_NOLEAKTRACE,
        _SLAB_NO_MERGE,
#ifdef CONFIG_FAILSLAB
        _SLAB_FAILSLAB,
#endif
#ifdef CONFIG_MEMCG_KMEM
        _SLAB_ACCOUNT,
#endif
#ifdef CONFIG_KASAN_GENERIC
        _SLAB_KASAN,
#endif
        _SLAB_NO_USER_FLAGS,
#ifdef CONFIG_KFENCE
        _SLAB_SKIP_KFENCE,
#endif
#ifndef CONFIG_SLUB_TINY
        _SLAB_RECLAIM_ACCOUNT,
#endif
        _SLAB_OBJECT_POISON,
        _SLAB_CMPXCHG_DOUBLE,
#ifdef CONFIG_SLAB_OBJ_EXT
        _SLAB_NO_OBJ_EXT,
#endif
        _SLAB_FLAGS_LAST_BIT
};

/* slab_flags_t value with only bit number @nr set */
#define __SLAB_FLAG_BIT(nr)        ((slab_flags_t __force)(1U << (nr)))
/* slab_flags_t value with no bit set */
#define __SLAB_FLAG_UNUSED        ((slab_flags_t __force)(0U))

/*
 * Flags to pass to kmem_cache_create().
 * The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise they are
 * no-ops.
 */
/* DEBUG: Perform (expensive) checks on alloc/free */
#define SLAB_CONSISTENCY_CHECKS        __SLAB_FLAG_BIT(_SLAB_CONSISTENCY_CHECKS)
/* DEBUG: Red zone objects in a cache */
#define SLAB_RED_ZONE                __SLAB_FLAG_BIT(_SLAB_RED_ZONE)
/* DEBUG: Poison objects */
#define SLAB_POISON                __SLAB_FLAG_BIT(_SLAB_POISON)
/* Indicate a kmalloc slab */
#define SLAB_KMALLOC                __SLAB_FLAG_BIT(_SLAB_KMALLOC)
/* Align objects on cache lines */
#define SLAB_HWCACHE_ALIGN        __SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN)
/* Use GFP_DMA memory */
#define SLAB_CACHE_DMA                __SLAB_FLAG_BIT(_SLAB_CACHE_DMA)
/* Use GFP_DMA32 memory */
#define SLAB_CACHE_DMA32        __SLAB_FLAG_BIT(_SLAB_CACHE_DMA32)
/* DEBUG: Store the last owner for bug hunting */
#define SLAB_STORE_USER                __SLAB_FLAG_BIT(_SLAB_STORE_USER)
/* Panic if kmem_cache_create() fails */
#define SLAB_PANIC                __SLAB_FLAG_BIT(_SLAB_PANIC)
/*
 * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
 *
 * This delays freeing the SLAB page by a grace period, it does _NOT_
 * delay object freeing. This means that if you do kmem_cache_free()
 * that memory location is free to be reused at any time. Thus it may
 * be possible to see another object there in the same RCU grace period.
 *
 * This feature only ensures the memory location backing the object
 * stays valid, the trick to using this is relying on an independent
 * object validation pass. Something like:
 *
 * begin:
 *  rcu_read_lock();
 *  obj = lockless_lookup(key);
 *  if (obj) {
 *    if (!try_get_ref(obj)) { // might fail for free objects
 *      rcu_read_unlock();
 *      goto begin;
 *    }
 *
 *    if (obj->key != key) { // not the object we expected
 *      put_ref(obj);
 *      rcu_read_unlock();
 *      goto begin;
 *    }
 *  }
 *  rcu_read_unlock();
 *
 * This is useful if we need to approach a kernel structure obliquely,
 * from its address obtained without the usual locking. We can lock
 * the structure to stabilize it and check it's still at the given address,
 * only if we can be sure that the memory has not been meanwhile reused
 * for some other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * Note that it is not possible to acquire a lock within a structure
 * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference
 * as described above.  The reason is that SLAB_TYPESAFE_BY_RCU pages
 * are not zeroed before being given to the slab, which means that any
 * locks must be initialized after each and every kmem_cache_alloc().
 * Alternatively, make the ctor passed to kmem_cache_create() initialize
 * the locks at page-allocation time, as is done in __i915_request_ctor(),
 * sighand_ctor(), and anon_vma_ctor().  Such a ctor permits readers
 * to safely acquire those ctor-initialized locks under rcu_read_lock()
 * protection.
 *
 * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
 */
/* Defer freeing slabs to RCU */
#define SLAB_TYPESAFE_BY_RCU        __SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU)
/* Trace allocations and frees */
#define SLAB_TRACE                __SLAB_FLAG_BIT(_SLAB_TRACE)

/*
 * Flag to prevent checks on free.
 * Without CONFIG_DEBUG_OBJECTS the bit does not exist and the flag expands
 * to the empty mask, so OR-ing it in has no effect.
 */
#ifdef CONFIG_DEBUG_OBJECTS
# define SLAB_DEBUG_OBJECTS        __SLAB_FLAG_BIT(_SLAB_DEBUG_OBJECTS)
#else
# define SLAB_DEBUG_OBJECTS        __SLAB_FLAG_UNUSED
#endif

/* Avoid kmemleak tracing */
#define SLAB_NOLEAKTRACE        __SLAB_FLAG_BIT(_SLAB_NOLEAKTRACE)

/*
 * Prevent merging with compatible kmem caches. This flag should be used
 * cautiously. Valid use cases:
 *
 * - caches created for self-tests (e.g. kunit)
 * - general caches created and used by a subsystem, only when a
 *   (subsystem-specific) debug option is enabled
 * - performance critical caches, should be very rare and consulted with slab
 *   maintainers, and not used together with CONFIG_SLUB_TINY
 */
#define SLAB_NO_MERGE                __SLAB_FLAG_BIT(_SLAB_NO_MERGE)

/*
 * The config-dependent flags below all follow one pattern: a real bit when
 * the option is enabled, __SLAB_FLAG_UNUSED (the empty mask) otherwise, so
 * callers may pass them unconditionally.
 */

/* Fault injection mark */
#ifdef CONFIG_FAILSLAB
# define SLAB_FAILSLAB                __SLAB_FLAG_BIT(_SLAB_FAILSLAB)
#else
# define SLAB_FAILSLAB                __SLAB_FLAG_UNUSED
#endif
/* Account to memcg */
#ifdef CONFIG_MEMCG_KMEM
# define SLAB_ACCOUNT                __SLAB_FLAG_BIT(_SLAB_ACCOUNT)
#else
# define SLAB_ACCOUNT                __SLAB_FLAG_UNUSED
#endif

/* Only a real bit with CONFIG_KASAN_GENERIC; empty mask otherwise. */
#ifdef CONFIG_KASAN_GENERIC
#define SLAB_KASAN                __SLAB_FLAG_BIT(_SLAB_KASAN)
#else
#define SLAB_KASAN                __SLAB_FLAG_UNUSED
#endif

/*
 * Ignore user specified debugging flags.
 * Intended for caches created for self-tests so they have only flags
 * specified in the code and other flags are ignored.
 */
#define SLAB_NO_USER_FLAGS        __SLAB_FLAG_BIT(_SLAB_NO_USER_FLAGS)

/* Only a real bit with CONFIG_KFENCE; empty mask otherwise. */
#ifdef CONFIG_KFENCE
#define SLAB_SKIP_KFENCE        __SLAB_FLAG_BIT(_SLAB_SKIP_KFENCE)
#else
#define SLAB_SKIP_KFENCE        __SLAB_FLAG_UNUSED
#endif

/* The following flags affect the page allocator grouping pages by mobility */
/* Objects are reclaimable */
/* Note the inverted test: the bit is dropped when CONFIG_SLUB_TINY is set. */
#ifndef CONFIG_SLUB_TINY
#define SLAB_RECLAIM_ACCOUNT        __SLAB_FLAG_BIT(_SLAB_RECLAIM_ACCOUNT)
#else
#define SLAB_RECLAIM_ACCOUNT        __SLAB_FLAG_UNUSED
#endif
#define SLAB_TEMPORARY                SLAB_RECLAIM_ACCOUNT        /* Objects are short-lived */

/* Slab created using create_boot_cache */
#ifdef CONFIG_SLAB_OBJ_EXT
#define SLAB_NO_OBJ_EXT                __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT)
#else
#define SLAB_NO_OBJ_EXT                __SLAB_FLAG_UNUSED
#endif

/*
 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
 *
 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
 *
 * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can.
 * Both make kfree a no-op.
 */
#define ZERO_SIZE_PTR ((void *)16)

/*
 * True when @x is NULL or ZERO_SIZE_PTR. This is a single unsigned
 * comparison, so every address value from 0 up to and including 16 matches.
 */
#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
                                (unsigned long)ZERO_SIZE_PTR)

#include <linux/kasan.h>

/*
 * Forward declarations: this header only refers to these types by name
 * (struct list_lru is passed by pointer further down).
 */
struct list_lru;
struct mem_cgroup;
/*
 * struct kmem_cache related prototypes
 */
bool slab_is_available(void);

/*
 * Create a cache called @name for objects of @size bytes with alignment
 * @align; @flags takes the SLAB_* flags defined above and @ctor is an
 * optional per-object constructor (the KMEM_CACHE() helper passes NULL).
 */
struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
                        unsigned int align, slab_flags_t flags,
                        void (*ctor)(void *));
/*
 * Same as kmem_cache_create() plus a usercopy window described by
 * @useroffset/@usersize; KMEM_CACHE_USERCOPY() fills these with the offset
 * and size of a single struct field.
 */
struct kmem_cache *kmem_cache_create_usercopy(const char *name,
                        unsigned int size, unsigned int align,
                        slab_flags_t flags,
                        unsigned int useroffset, unsigned int usersize,
                        void (*ctor)(void *));
void kmem_cache_destroy(struct kmem_cache *s);
int kmem_cache_shrink(struct kmem_cache *s);

/*
 * Please use this macro to create slab caches. Simply specify the
 * name of the structure and maybe some flags that are listed above.
 *
 * The alignment of the struct determines object alignment. If you
 * f.e. add ____cacheline_aligned_in_smp to the struct declaration
 * then the objects will be properly aligned in SMP configurations.
 *
 * The cache name is the stringified struct tag and no constructor is
 * installed (ctor is NULL).
 */
#define KMEM_CACHE(__struct, __flags)                                        \
                kmem_cache_create(#__struct, sizeof(struct __struct),        \
                        __alignof__(struct __struct), (__flags), NULL)

/*
 * To whitelist a single field for copying to/from usercopy, use this
 * macro instead of KMEM_CACHE() above.
 *
 * The usercopy window is exactly @__field: its offset within the struct and
 * its size are passed as useroffset/usersize.
 */
#define KMEM_CACHE_USERCOPY(__struct, __flags, __field)                        \
                kmem_cache_create_usercopy(#__struct,                        \
                        sizeof(struct __struct),                        \
                        __alignof__(struct __struct), (__flags),        \
                        offsetof(struct __struct, __field),                \
                        sizeof_field(struct __struct, __field), NULL)

/*
 * Common kmalloc functions provided by all allocators
 */
/*
 * The result must be checked (__must_check); argument 2 (@new_size) is
 * declared as the size of the returned object via __realloc_size(2).
 */
void * __must_check krealloc_noprof(const void *objp, size_t new_size,
                                    gfp_t flags) __realloc_size(2);
/* Public name: the _noprof implementation wrapped in alloc_hooks(). */
#define krealloc(...)                                alloc_hooks(krealloc_noprof(__VA_ARGS__))

void kfree(const void *objp);
void kfree_sensitive(const void *objp);
size_t __ksize(const void *objp);

/*
 * Cleanup action named "kfree" for the helpers in <linux/cleanup.h>:
 * frees the pointer with kfree() unless it is NULL or an error pointer.
 */
DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))

/**
 * ksize - Report actual allocation size of associated object
 *
 * @objp: Pointer returned from a prior kmalloc()-family allocation.
 *
 * This should not be used for writing beyond the originally requested
 * allocation size. Either use krealloc() or round up the allocation size
 * with kmalloc_size_roundup() prior to allocation. If this is used to
 * access beyond the originally requested allocation size, UBSAN_BOUNDS
 * and/or FORTIFY_SOURCE may trip, since they only know about the
 * originally allocated size via the __alloc_size attribute.
 */
size_t ksize(const void *objp);

/*
 * kmem_dump_obj() is only implemented when CONFIG_PRINTK is set; otherwise
 * the inline stub below does nothing and always returns false.
 */
#ifdef CONFIG_PRINTK
bool kmem_dump_obj(void *object);
#else
static inline bool kmem_dump_obj(void *object) { return false; }
#endif

/*
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than the alignment of a 64-bit integer.
 * Setting ARCH_DMA_MINALIGN in arch headers allows that.
 */
/*
 * An architecture that sets ARCH_HAS_DMA_MINALIGN with ARCH_DMA_MINALIGN > 8
 * gets that value as ARCH_KMALLOC_MINALIGN, unless it already defined
 * ARCH_KMALLOC_MINALIGN itself.
 */
#ifdef ARCH_HAS_DMA_MINALIGN
#if ARCH_DMA_MINALIGN > 8 && !defined(ARCH_KMALLOC_MINALIGN)
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
#endif
#endif

/*
 * Default: alignment of unsigned long long. When an explicit value larger
 * than 8 is in effect, it also becomes the smallest kmalloc size and
 * KMALLOC_SHIFT_LOW is derived from it. Explicit values <= 8 leave
 * KMALLOC_MIN_SIZE and KMALLOC_SHIFT_LOW to their later defaults.
 */
#ifndef ARCH_KMALLOC_MINALIGN
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#elif ARCH_KMALLOC_MINALIGN > 8
#define KMALLOC_MIN_SIZE ARCH_KMALLOC_MINALIGN
#define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE)
#endif

/*
 * Minimum alignment of slab objects. Architectures that take misalignment
 * faults even on buffers aligned for a 64 bit integer can override the
 * default by defining ARCH_SLAB_MINALIGN in their headers.
 */
#if !defined(ARCH_SLAB_MINALIGN)
# define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif

/*
 * Runtime hook for the minimum slab alignment. An architecture may supply
 * its own arch_slab_minalign; whatever it returns must be a power of two
 * and must not be smaller than ARCH_SLAB_MINALIGN. The generic version
 * just reports the compile-time constant.
 */
#if !defined(arch_slab_minalign)
static inline unsigned int arch_slab_minalign(void)
{
        const unsigned int min_align = ARCH_SLAB_MINALIGN;

        return min_align;
}
#endif

/*
 * kmem_cache_alloc and friends return pointers aligned to ARCH_SLAB_MINALIGN.
 * kmalloc and friends return pointers aligned to both ARCH_KMALLOC_MINALIGN
 * and ARCH_SLAB_MINALIGN, but here we only assume the former alignment.
 */
#define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN)
#define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN)
#define __assume_page_alignment __assume_aligned(PAGE_SIZE)

/*
 * Kmalloc array related definitions
 */

/*
 * SLUB directly allocates requests fitting in to an order-1 page
 * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
 */
#define KMALLOC_SHIFT_HIGH        (PAGE_SHIFT + 1)
#define KMALLOC_SHIFT_MAX        (MAX_PAGE_ORDER + PAGE_SHIFT)
/* Default smallest bucket is 2^3 = 8 bytes unless set earlier. */
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW        3
#endif

/* Maximum allocatable size */
#define KMALLOC_MAX_SIZE        (1UL << KMALLOC_SHIFT_MAX)
/* Maximum size for which we actually use a slab cache */
#define KMALLOC_MAX_CACHE_SIZE        (1UL << KMALLOC_SHIFT_HIGH)
/* Maximum order allocatable via the slab allocator */
#define KMALLOC_MAX_ORDER        (KMALLOC_SHIFT_MAX - PAGE_SHIFT)

/*
 * Kmalloc subsystem.
 */
/* Smallest kmalloc object size, derived from KMALLOC_SHIFT_LOW by default. */
#ifndef KMALLOC_MIN_SIZE
#define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
#endif

/*
 * This restriction comes from byte sized index implementation.
 * Page size is normally 2^12 bytes and, in this case, if we want to use
 * byte sized index which can represent 2^8 entries, the size of the object
 * should be equal or greater to 2^12 / 2^8 = 2^4 = 16.
 * If minimum size of kmalloc is less than 16, we use it as minimum object
 * size and give up to use byte sized index.
 */
/* In other words: min(KMALLOC_MIN_SIZE, 16). */
#define SLAB_OBJ_MIN_SIZE      (KMALLOC_MIN_SIZE < 16 ? \
                               (KMALLOC_MIN_SIZE) : 16)

/*
 * Number of additional kmalloc cache copies used for randomization;
 * zero when CONFIG_RANDOM_KMALLOC_CACHES is disabled.
 */
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
#define RANDOM_KMALLOC_CACHES_NR        15 // # of cache copies
#else
#define RANDOM_KMALLOC_CACHES_NR        0
#endif

/*
 * Whenever changing this, take care that kmalloc_type() and
 * create_kmalloc_caches() still work as intended.
 *
 * KMALLOC_NORMAL can contain only unaccounted objects whereas KMALLOC_CGROUP
 * is for accounted but unreclaimable and non-dma objects. All the other
 * kmem caches can have both accounted and unaccounted objects.
 *
 * Types whose config option is disabled are declared up front as aliases of
 * KMALLOC_NORMAL, so they never consume an index of their own; enabled types
 * are appended after the random range and get distinct values.
 */
enum kmalloc_cache_type {
        KMALLOC_NORMAL = 0,
#ifndef CONFIG_ZONE_DMA
        /* No DMA zone: KMALLOC_DMA is just another name for KMALLOC_NORMAL. */
        KMALLOC_DMA = KMALLOC_NORMAL,
#endif
#ifndef CONFIG_MEMCG_KMEM
        /* No kmem accounting: KMALLOC_CGROUP aliases KMALLOC_NORMAL. */
        KMALLOC_CGROUP = KMALLOC_NORMAL,
#endif
        /*
         * The randomized copies span START..END inclusive and START is
         * KMALLOC_NORMAL itself. With RANDOM_KMALLOC_CACHES_NR == 0 both
         * ends collapse onto KMALLOC_NORMAL.
         */
        KMALLOC_RANDOM_START = KMALLOC_NORMAL,
        KMALLOC_RANDOM_END = KMALLOC_RANDOM_START + RANDOM_KMALLOC_CACHES_NR,
#ifdef CONFIG_SLUB_TINY
        KMALLOC_RECLAIM = KMALLOC_NORMAL,
#else
        /* Implicit value: KMALLOC_RANDOM_END + 1. */
        KMALLOC_RECLAIM,
#endif
#ifdef CONFIG_ZONE_DMA
        KMALLOC_DMA,
#endif
#ifdef CONFIG_MEMCG_KMEM
        KMALLOC_CGROUP,
#endif
        /* Must stay last: first dimension of kmalloc_caches[]. */
        NR_KMALLOC_TYPES
};

/*
 * Table of kmalloc caches, indexed first by enum kmalloc_cache_type and then
 * by the size index computed by kmalloc_index() (0..KMALLOC_SHIFT_HIGH).
 */
extern struct kmem_cache *
kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1];

/*
 * Define gfp bits that should not be set for KMALLOC_NORMAL.
 * __GFP_DMA and __GFP_ACCOUNT only take part when the matching config option
 * is enabled; otherwise they contribute 0 to the mask.
 */
#define KMALLOC_NOT_NORMAL_BITS                                        \
        (__GFP_RECLAIMABLE |                                        \
        (IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |        \
        (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0))

/* Mixed into the caller address by kmalloc_type() before hashing. */
extern unsigned long random_kmalloc_seed;

/*
 * Pick the kmalloc cache family for an allocation.
 *
 * @flags:  GFP flags of the request.
 * @caller: address of the allocation site; only consulted when
 *          CONFIG_RANDOM_KMALLOC_CACHES is enabled.
 *
 * Returns the enum kmalloc_cache_type to use as the first index into
 * kmalloc_caches[].
 */
static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigned long caller)
{
        /*
         * Overwhelmingly common case: none of the special bits is set.
         * One mask test covers every relevant flag at once.
         */
        if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0)) {
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
                /*
                 * Spread callers over the KMALLOC_NORMAL cache and its
                 * RANDOM_KMALLOC_CACHES_NR (=15) copies.
                 */
                return KMALLOC_RANDOM_START + hash_64(caller ^ random_kmalloc_seed,
                                                      ilog2(RANDOM_KMALLOC_CACHES_NR + 1));
#else
                return KMALLOC_NORMAL;
#endif
        }

        /*
         * From here on at least one special bit is set. They are honoured
         * in this order of precedence:
         *  1) __GFP_DMA
         *  2) __GFP_RECLAIMABLE
         *  3) __GFP_ACCOUNT
         */
        if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA))
                return KMALLOC_DMA;

        /* Accounted, non-reclaimable request with memcg kmem available. */
        if (IS_ENABLED(CONFIG_MEMCG_KMEM) && !(flags & __GFP_RECLAIMABLE))
                return KMALLOC_CGROUP;

        return KMALLOC_RECLAIM;
}

/*
 * Figure out which kmalloc slab an allocation of a certain size
 * belongs to.
 * 0 = zero alloc
 * 1 =  65 .. 96 bytes
 * 2 = 129 .. 192 bytes
 * n = 2^(n-1)+1 .. 2^n
 *
 * Note: __kmalloc_index() is compile-time optimized, and not runtime optimized;
 * typical usage is via kmalloc_index() and therefore evaluated at compile-time.
 * Callers where !size_is_constant should only be test modules, where runtime
 * overheads of __kmalloc_index() can be tolerated.  Also see kmalloc_slab().
 *
 * @size:             requested allocation size in bytes.
 * @size_is_constant: true when @size is a compile-time constant; selects a
 *                    build failure instead of a runtime BUG() for sizes
 *                    above the largest bucket (2 MiB).
 */
static __always_inline unsigned int __kmalloc_index(size_t size,
                                                    bool size_is_constant)
{
        if (!size)
                return 0;

        /* Everything up to the minimum object size shares the first bucket. */
        if (size <= KMALLOC_MIN_SIZE)
                return KMALLOC_SHIFT_LOW;

        /*
         * The two non-power-of-two buckets (96 and 192 bytes) are only
         * considered when KMALLOC_MIN_SIZE is small enough.
         */
        if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
                return 1;
        if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
                return 2;
        /* Power-of-two buckets: index n serves sizes up to 2^n. */
        if (size <=          8) return 3;
        if (size <=         16) return 4;
        if (size <=         32) return 5;
        if (size <=         64) return 6;
        if (size <=        128) return 7;
        if (size <=        256) return 8;
        if (size <=        512) return 9;
        if (size <=       1024) return 10;
        if (size <=   2 * 1024) return 11;
        if (size <=   4 * 1024) return 12;
        if (size <=   8 * 1024) return 13;
        if (size <=  16 * 1024) return 14;
        if (size <=  32 * 1024) return 15;
        if (size <=  64 * 1024) return 16;
        if (size <= 128 * 1024) return 17;
        if (size <= 256 * 1024) return 18;
        if (size <= 512 * 1024) return 19;
        if (size <= 1024 * 1024) return 20;
        if (size <=  2 * 1024 * 1024) return 21;

        /*
         * Size is beyond the largest bucket. For constant sizes this is a
         * build error, except under CONFIG_PROFILE_ALL_BRANCHES where the
         * runtime BUG() is used instead.
         */
        if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
                BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
        else
                BUG();

        /* Will never be reached. Needed because the compiler may complain */
        return -1;
}
/* The ladder above stops at 2^21, so PAGE_SHIFT + 1 must not exceed that. */
static_assert(PAGE_SHIFT <= 20);
/* Compile-time flavour: the size is promised to be a constant. */
#define kmalloc_index(s) __kmalloc_index(s, true)

#include <linux/alloc_tag.h>

/*
 * Naming convention used from here on: foo_noprof() is the implementation
 * and foo(...) is the same call wrapped in alloc_hooks().
 * NOTE(review): alloc_hooks() presumably comes from <linux/alloc_tag.h>,
 * included just above - confirm.
 */

/* Out-of-line kmalloc; kmalloc_noprof() uses it for non-constant sizes. */
void *__kmalloc_noprof(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
#define __kmalloc(...)                                alloc_hooks(__kmalloc_noprof(__VA_ARGS__))

/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.
 * See kmem_cache_zalloc() for a shortcut of adding __GFP_ZERO to flags.
 *
 * Return: pointer to the new object or %NULL in case of error
 */
void *kmem_cache_alloc_noprof(struct kmem_cache *cachep,
                              gfp_t flags) __assume_slab_alignment __malloc;
#define kmem_cache_alloc(...)                        alloc_hooks(kmem_cache_alloc_noprof(__VA_ARGS__))

/* Variant of kmem_cache_alloc() that additionally takes a list_lru. */
void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
                            gfp_t gfpflags) __assume_slab_alignment __malloc;
#define kmem_cache_alloc_lru(...)        alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__))

void kmem_cache_free(struct kmem_cache *s, void *objp);

/*
 * Bulk allocation and freeing operations. These are accelerated in an
 * allocator specific way to avoid taking locks repeatedly or building
 * metadata structures unnecessarily.
 *
 * Note that interrupts must be enabled when calling these functions.
 */
void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);

int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
#define kmem_cache_alloc_bulk(...)        alloc_hooks(kmem_cache_alloc_bulk_noprof(__VA_ARGS__))

/*
 * Bulk counterpart of kfree(): forwards @size and @p unchanged to
 * kmem_cache_free_bulk() with a NULL cache.
 * NOTE(review): presumably @size is the number of pointers in @p and the
 * callee works out each object's cache itself - confirm against the
 * kmem_cache_free_bulk() implementation.
 */
static __always_inline void kfree_bulk(size_t size, void **p)
{
        kmem_cache_free_bulk(NULL, size, p);
}

/* Out-of-line node-aware kmalloc; see kmalloc_node_noprof() below. */
void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment
                                                         __alloc_size(1);
#define __kmalloc_node(...)                        alloc_hooks(__kmalloc_node_noprof(__VA_ARGS__))

void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags,
                                   int node) __assume_slab_alignment __malloc;
#define kmem_cache_alloc_node(...)        alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__))

/*
 * Allocate from an already selected kmalloc cache @s. The inline kmalloc
 * fast paths call these once they have resolved the cache at compile time;
 * @size is the originally requested size (see __alloc_size).
 */
void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t flags, size_t size)
                    __assume_kmalloc_alignment __alloc_size(3);

void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags,
                int node, size_t size) __assume_kmalloc_alignment
                                                __alloc_size(4);
#define kmalloc_trace(...)                        alloc_hooks(kmalloc_trace_noprof(__VA_ARGS__))

#define kmalloc_node_trace(...)                        alloc_hooks(kmalloc_node_trace_noprof(__VA_ARGS__))

/*
 * Used by the inline fast paths for constant sizes above
 * KMALLOC_MAX_CACHE_SIZE; results are assumed to be page aligned.
 */
void *kmalloc_large_noprof(size_t size, gfp_t flags) __assume_page_alignment
                                              __alloc_size(1);
#define kmalloc_large(...)                        alloc_hooks(kmalloc_large_noprof(__VA_ARGS__))

void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) __assume_page_alignment
                                                             __alloc_size(1);
#define kmalloc_large_node(...)                        alloc_hooks(kmalloc_large_node_noprof(__VA_ARGS__))

/**
 * kmalloc - allocate kernel memory
 * @size: how many bytes of memory are required.
 * @flags: describe the allocation context
 *
 * kmalloc is the normal method of allocating memory
 * for objects smaller than page size in the kernel.
 *
 * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
 * bytes. When @size is a power of two, the alignment is also guaranteed
 * to be at least @size.
 *
 * The @flags argument may be one of the GFP flags defined at
 * include/linux/gfp_types.h and described at
 * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
 *
 * The recommended usage of the @flags is described at
 * :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>`
 *
 * Below is a brief outline of the most useful GFP flags
 *
 * %GFP_KERNEL
 *        Allocate normal kernel ram. May sleep.
 *
 * %GFP_NOWAIT
 *        Allocation will not sleep.
 *
 * %GFP_ATOMIC
 *        Allocation will not sleep.  May use emergency pools.
 *
 * Also it is possible to set different flags by OR'ing
 * in one or more of the following additional @flags:
 *
 * %__GFP_ZERO
 *        Zero the allocated memory before returning. Also see kzalloc().
 *
 * %__GFP_HIGH
 *        This allocation has high priority and may use emergency pools.
 *
 * %__GFP_NOFAIL
 *        Indicate that this allocation is in no way allowed to fail
 *        (think twice before using).
 *
 * %__GFP_NORETRY
 *        If memory is not immediately available,
 *        then give up at once.
 *
 * %__GFP_NOWARN
 *        If allocation fails, don't issue any warnings.
 *
 * %__GFP_RETRY_MAYFAIL
 *        Try really hard to succeed the allocation but fail
 *        eventually.
 */
static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t flags)
{
        struct kmem_cache *cache;

        /*
         * Anything that is not a non-zero compile-time constant goes
         * through the out-of-line allocator.
         */
        if (!(__builtin_constant_p(size) && size))
                return __kmalloc_noprof(size, flags);

        /* Constant sizes too big for a kmalloc cache bypass the caches. */
        if (size > KMALLOC_MAX_CACHE_SIZE)
                return kmalloc_large_noprof(size, flags);

        /* Cache type and size index both fold to constants where possible. */
        cache = kmalloc_caches[kmalloc_type(flags, _RET_IP_)][kmalloc_index(size)];

        return kmalloc_trace_noprof(cache, flags, size);
}
#define kmalloc(...)                                alloc_hooks(kmalloc_noprof(__VA_ARGS__))

/*
 * Node-aware counterpart of kmalloc_noprof(): same size dispatch, with
 * @node handed through to whichever allocator is chosen.
 */
static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node)
{
        struct kmem_cache *cache;

        /* Non-constant or zero size: defer to the out-of-line allocator. */
        if (!(__builtin_constant_p(size) && size))
                return __kmalloc_node_noprof(size, flags, node);

        /* Constant size beyond the largest kmalloc cache. */
        if (size > KMALLOC_MAX_CACHE_SIZE)
                return kmalloc_large_node_noprof(size, flags, node);

        cache = kmalloc_caches[kmalloc_type(flags, _RET_IP_)][kmalloc_index(size)];

        return kmalloc_node_trace_noprof(cache, flags, node, size);
}
#define kmalloc_node(...)                        alloc_hooks(kmalloc_node_noprof(__VA_ARGS__))

/**
 * kmalloc_array - allocate memory for an array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Return: pointer to the allocated memory, or %NULL when @n * @size
 * overflows size_t or the underlying allocation fails.
 */
static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
        size_t bytes;

        /* Refuse element counts whose total byte size would wrap around. */
        if (unlikely(check_mul_overflow(n, size, &bytes)))
                return NULL;

        /*
         * kmalloc_noprof() does its own __builtin_constant_p() dispatch on
         * the byte count, so no constant/non-constant split is needed here.
         */
        return kmalloc_noprof(bytes, flags);
}
#define kmalloc_array(...)                        alloc_hooks(kmalloc_array_noprof(__VA_ARGS__))

/**
 * krealloc_array - reallocate memory for an array.
 * @p: pointer to the memory chunk to reallocate
 * @new_n: new number of elements to alloc
 * @new_size: new size of a single member of the array
 * @flags: the type of memory to allocate (see kmalloc)
 *
 * Return: result of krealloc_noprof() for @new_n * @new_size bytes, or
 * %NULL without calling it when that product overflows size_t.
 */
static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p,
                                                                       size_t new_n,
                                                                       size_t new_size,
                                                                       gfp_t flags)
{
        size_t total;
        const bool wrapped = check_mul_overflow(new_n, new_size, &total);

        /* A wrapped product must never reach the allocator. */
        if (unlikely(wrapped))
                return NULL;

        return krealloc_noprof(p, total, flags);
}
#define krealloc_array(...)                        alloc_hooks(krealloc_array_noprof(__VA_ARGS__))

/**
 * kcalloc - allocate memory for an array. The memory is set to zero.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Implemented as kmalloc_array() with __GFP_ZERO added to @flags.
 */
#define kcalloc(n, size, flags)                kmalloc_array(n, size, (flags) | __GFP_ZERO)

/*
 * Takes the address to attribute the allocation to as an explicit @caller
 * argument. The kmalloc_node_track_caller() macro below fills it in with
 * _RET_IP_ at the point of use.
 */
void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, int node,
                                  unsigned long caller) __alloc_size(1);
#define kmalloc_node_track_caller(...)                \
        alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_))

/*
 * kmalloc_track_caller is a special version of kmalloc that records the
 * calling function of the routine calling it for slab leak tracking instead
 * of just the calling function (confusing, eh?).
 * It's useful when the call to kmalloc comes from a widely-used standard
 * allocator where we care about the real place the memory allocation
 * request comes from.
 */
#define kmalloc_track_caller(...)                kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE)

/* Same as kmalloc_track_caller() but without the alloc_hooks() wrapper. */
#define kmalloc_track_caller_noprof(...)        \
                kmalloc_node_track_caller_noprof(__VA_ARGS__, NUMA_NO_NODE, _RET_IP_)

/*
 * Node-aware array allocation: @n elements of @size bytes each on @node.
 * Returns NULL if the total size overflows size_t or the allocation fails.
 */
static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags,
                                                          int node)
{
        size_t total;

        if (unlikely(check_mul_overflow(n, size, &total)))
                return NULL;

        /*
         * Only when both factors are compile-time constants can the inline
         * fast path resolve the cache; otherwise go straight out of line.
         */
        if (!(__builtin_constant_p(n) && __builtin_constant_p(size)))
                return __kmalloc_node_noprof(total, flags, node);

        return kmalloc_node_noprof(total, flags, node);
}
#define kmalloc_array_node(...)                        alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__))

/* Zeroing, node-aware array allocation: kmalloc_array_node() + __GFP_ZERO. */
#define kcalloc_node(_n, _size, _flags, _node)        \
        kmalloc_array_node(_n, _size, (_flags) | __GFP_ZERO, _node)

/*
 * Shortcuts
 */
/* kmem_cache_alloc() with __GFP_ZERO added to the flags. */
#define kmem_cache_zalloc(_k, _flags)                kmem_cache_alloc(_k, (_flags)|__GFP_ZERO)

/**
 * kzalloc - allocate memory. The memory is set to zero.
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Thin wrapper: kmalloc with %__GFP_ZERO OR-ed into @flags.
 */
static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags)
{
        const gfp_t zeroing_flags = flags | __GFP_ZERO;

        return kmalloc_noprof(size, zeroing_flags);
}
#define kzalloc(...)                                alloc_hooks(kzalloc_noprof(__VA_ARGS__))
#define kzalloc_node(_size, _flags, _node)        kmalloc_node(_size, (_flags)|__GFP_ZERO, _node)

/*
 * kvmalloc family. Everything below is expressed in terms of
 * kvmalloc_node_noprof(); the wrappers only fill in NUMA_NO_NODE and/or
 * add __GFP_ZERO.
 */
extern void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) __alloc_size(1);
#define kvmalloc_node(...)                        alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__))

#define kvmalloc(_size, _flags)                        kvmalloc_node(_size, _flags, NUMA_NO_NODE)
#define kvmalloc_noprof(_size, _flags)                kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE)
#define kvzalloc(_size, _flags)                        kvmalloc(_size, (_flags)|__GFP_ZERO)

#define kvzalloc_node(_size, _flags, _node)        kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node)

/*
 * Array flavour of kvmalloc_node_noprof(): @n elements of @size bytes on
 * @node. Returns NULL when @n * @size does not fit in size_t, otherwise
 * whatever kvmalloc_node_noprof() returns for the total.
 */
static inline __alloc_size(1, 2) void *
kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
{
        size_t total;
        const bool wrapped = check_mul_overflow(n, size, &total);

        /* Never pass a wrapped-around byte count to the allocator. */
        if (unlikely(wrapped))
                return NULL;

        return kvmalloc_node_noprof(total, flags, node);
}

/*
 * _noprof building blocks: each one appends NUMA_NO_NODE and/or adds
 * __GFP_ZERO before calling kvmalloc_array_node_noprof().
 */
#define kvmalloc_array_noprof(...)                kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE)
#define kvcalloc_node_noprof(_n,_s,_f,_node)        kvmalloc_array_node_noprof(_n,_s,(_f)|__GFP_ZERO,_node)
#define kvcalloc_noprof(...)                        kvcalloc_node_noprof(__VA_ARGS__, NUMA_NO_NODE)

/* Public names: the _noprof variants wrapped in alloc_hooks(). */
#define kvmalloc_array(...)                        alloc_hooks(kvmalloc_array_noprof(__VA_ARGS__))
#define kvcalloc_node(...)                        alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__))
#define kvcalloc(...)                                alloc_hooks(kvcalloc_noprof(__VA_ARGS__))

/*
 * Takes both the old and the new size; the size of the returned object is
 * argument 3 (@newsize), per __realloc_size(3).
 */
extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
                      __realloc_size(3);
#define kvrealloc(...)                                alloc_hooks(kvrealloc_noprof(__VA_ARGS__))

extern void kvfree(const void *addr);
/*
 * Cleanup action named "kvfree" for the helpers in <linux/cleanup.h>:
 * calls kvfree() unless the pointer is NULL or an error pointer.
 */
DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T))

/* Unlike kvfree(), the caller must also supply the length @len. */
extern void kvfree_sensitive(const void *addr, size_t len);

unsigned int kmem_cache_size(struct kmem_cache *s);

/**
 * kmalloc_size_roundup - Report allocation bucket size for the given size
 *
 * @size: Number of bytes to round up from.
 *
 * This returns the number of bytes that would be available in a kmalloc()
 * allocation of @size bytes. For example, a 126 byte request would be
 * rounded up to the next sized kmalloc bucket, 128 bytes. (This is strictly
 * for the general-purpose kmalloc()-based allocations, and is not for the
 * pre-sized kmem_cache_alloc()-based allocations.)
 *
 * Use this to kmalloc() the full bucket size ahead of time instead of using
 * ksize() to query the size after an allocation.
 */
size_t kmalloc_size_roundup(size_t size);

/* Marked __init: only meant to be called during kernel initialisation. */
void __init kmem_cache_init_late(void);

#endif        /* _LINUX_SLAB_H */


































































































































































































































































































































































































































































































































































































































































































    9 



    9 














































































































    6 






    6 






    1 













































































   11 



    7 


    1 


    6 


    9 




























































































































































































































    3 























































































































    9 



    7 

    1 


























    9 
    9 






























































































































































































































































































    3 



    7 

































































    9 
    8 












    1 
    1 


























































    2 

    1 




































    9 


    8 


    9 






















































    3 





















    1 










































































































































    2 

    1 












    1 




















    1 




















    1 



    2 




    2 
















    1 




































    4 















    4 


    1 


































    2 

    2 






    1 






    1 


























    4 



    4 





















    9 






    6 
    9 
    9 




























    7 



















    9 











    7 






    7 


    8 



    3 
    7 




   10 













    3 




    1 






    9 









   10 











    9 




    9 








    6 
    9 











    8 















    8 

















    7 



    8 

    5 
    6 




    9 
































































































    3 



    4 
    4 
    4 
    4 








    3 















    1 


















    1 





    1 

    1 
    1 




    1 
























    4 





    4 


    4 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 




    2 



    2 





    2 

    2 







    2 







    2 







    2 





    2 



























    2 










    2 














    2 





    2 






    1 











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 







    2 

    2 





    2 

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 

    3 








































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002                Ingo Molnar
 *
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010                SUSE Linux Products GmbH
 * Copyright (C) 2010                Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/core-api/workqueue.rst for details.
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>
#include <linux/delay.h>
#include <linux/irq_work.h>

#include "workqueue_internal.h"

/* Bit flags for a worker_pool; each constant below is a distinct single bit. */
enum worker_pool_flags {
        /*
         * worker_pool flags
         *
         * A bound pool is either associated or disassociated with its CPU.
         * While associated (!DISASSOCIATED), all workers are bound to the
         * CPU, none has %WORKER_UNBOUND set, and concurrency management
         * is in effect.
         *
         * While DISASSOCIATED, the cpu may be offline and all workers have
         * %WORKER_UNBOUND set and concurrency management disabled, and may
         * be executing on any CPU.  The pool behaves as an unbound one.
         *
         * Note that DISASSOCIATED should be flipped only while holding
         * wq_pool_attach_mutex to avoid changing binding state while
         * worker_attach_to_pool() is in progress.
         *
         * As there can only be one concurrent BH execution context per CPU, a
         * BH pool is per-CPU and always DISASSOCIATED.
         */
        POOL_BH                        = 1 << 0,        /* is a BH pool */
        POOL_MANAGER_ACTIVE        = 1 << 1,        /* being managed */
        POOL_DISASSOCIATED        = 1 << 2,        /* cpu can't serve workers */
        POOL_BH_DRAINING        = 1 << 3,        /* draining after CPU offline */
};

enum worker_flags {
        /*
         * worker flags
         *
         * Note that the bit positions are not contiguous: bits 0, 4 and 5
         * are not assigned by this enum.
         */
        WORKER_DIE                = 1 << 1,        /* die die die */
        WORKER_IDLE                = 1 << 2,        /* is idle */
        WORKER_PREP                = 1 << 3,        /* preparing to run works */
        WORKER_CPU_INTENSIVE        = 1 << 6,        /* cpu intensive */
        WORKER_UNBOUND                = 1 << 7,        /* worker is unbound */
        WORKER_REBOUND                = 1 << 8,        /* worker was rebound */

        /* mask of PREP, CPU_INTENSIVE, UNBOUND and REBOUND combined */
        WORKER_NOT_RUNNING        = WORKER_PREP | WORKER_CPU_INTENSIVE |
                                  WORKER_UNBOUND | WORKER_REBOUND,
};

/* Option bits qualifying a work cancel operation; each is a single bit. */
enum work_cancel_flags {
        WORK_CANCEL_DELAYED        = 1 << 0,        /* canceling a delayed_work */
        WORK_CANCEL_DISABLE        = 1 << 1,        /* canceling to disable */
};

/*
 * Internal constants.  Values expressed in terms of HZ are durations in
 * jiffies.
 */
enum wq_internal_consts {
        NR_STD_WORKER_POOLS        = 2,                /* # standard pools per cpu */

        UNBOUND_POOL_HASH_ORDER        = 6,                /* hashed by pool->attrs */
        BUSY_WORKER_HASH_ORDER        = 6,                /* 64 pointers */

        MAX_IDLE_WORKERS_RATIO        = 4,                /* 1/4 of busy can be idle */
        IDLE_WORKER_TIMEOUT        = 300 * HZ,        /* keep idle ones for 5 mins */

        MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
                                                /* call for help after 10ms
                                                   (min two ticks) */
        MAYDAY_INTERVAL                = HZ / 10,        /* and then every 100ms */
        CREATE_COOLDOWN                = HZ,                /* time to breathe after fail (1s) */

        /*
         * Rescue workers are used only on emergencies and shared by
         * all cpus.  Give MIN_NICE.
         */
        RESCUER_NICE_LEVEL        = MIN_NICE,
        /* high priority level uses the same MIN_NICE value */
        HIGHPRI_NICE_LEVEL        = MIN_NICE,

        /* size of workqueue_struct->name[] */
        WQ_NAME_LEN                = 32,
        WORKER_ID_LEN                = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */
};

/*
 * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and
 * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because
 * msecs_to_jiffies() can't be an initializer.
 *
 * BH_WORKER_JIFFIES expands to a msecs_to_jiffies(2) call at each use, i.e.
 * 2ms expressed in jiffies; BH_WORKER_RESTARTS is a plain count of 10.
 * NOTE(review): presumably these bound one BH worker run by elapsed time and
 * by number of restarts respectively - confirm against the BH worker loop.
 */
#define BH_WORKER_JIFFIES        msecs_to_jiffies(2)
#define BH_WORKER_RESTARTS        10

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
 *     reads.
 *
 * K: Only modified by worker while holding pool->lock. Can be safely read by
 *    self, while holding pool->lock or from IRQ context if %current is the
 *    kworker.
 *
 * S: Only modified by worker self.
 *
 * A: wq_pool_attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  RCU protected for reads.
 *
 * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
 *     with READ_ONCE() without locking.
 *
 * MD: wq_mayday_lock protected.
 *
 * WD: Used internally by the watchdog.
 */

/* struct worker is defined in workqueue_internal.h */

/*
 * A pool of workers.  The "I:", "L:", "A:", "PL:" and "WD:" prefixes in the
 * field comments refer to the exclusion rules listed in the comment above.
 */
struct worker_pool {
        raw_spinlock_t                lock;                /* the pool lock */
        int                        cpu;                /* I: the associated cpu */
        int                        node;                /* I: the associated node ID */
        int                        id;                /* I: pool ID */
        unsigned int                flags;                /* L: flags */

        unsigned long                watchdog_ts;        /* L: watchdog timestamp */
        bool                        cpu_stall;        /* WD: stalled cpu bound pool */

        /*
         * The counter is incremented in a process context on the associated CPU
         * w/ preemption disabled, and decremented or reset in the same context
         * but w/ pool->lock held. The readers grab pool->lock and are
         * guaranteed to see if the counter reached zero.
         */
        int                        nr_running;

        struct list_head        worklist;        /* L: list of pending works */

        int                        nr_workers;        /* L: total number of workers */
        int                        nr_idle;        /* L: currently idle workers */

        struct list_head        idle_list;        /* L: list of idle workers */
        struct timer_list        idle_timer;        /* L: worker idle timeout */
        struct work_struct      idle_cull_work; /* L: worker idle cleanup */

        struct timer_list        mayday_timer;          /* L: SOS timer for workers */

        /* a worker is either on busy_hash or idle_list, or the manager */
        DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
                                                /* L: hash of busy workers */

        struct worker                *manager;        /* L: purely informational */
        struct list_head        workers;        /* A: attached workers */
        struct list_head        dying_workers;  /* A: workers about to die */
        struct completion        *detach_completion; /* all workers detached */

        struct ida                worker_ida;        /* worker IDs for task name */

        struct workqueue_attrs        *attrs;                /* I: worker attributes */
        struct hlist_node        hash_node;        /* PL: unbound_pool_hash node */
        int                        refcnt;                /* PL: refcnt for unbound pools */

        /*
         * Destruction of pool is RCU protected to allow dereferences
         * from get_work_pool().
         */
        struct rcu_head                rcu;
};

/*
 * Per-pool_workqueue statistics. These can be monitored using
 * tools/workqueue/wq_monitor.py.
 */
enum pool_workqueue_stats {
        PWQ_STAT_STARTED,        /* work items started execution */
        PWQ_STAT_COMPLETED,        /* work items completed execution */
        PWQ_STAT_CPU_TIME,        /* total CPU time consumed */
        PWQ_STAT_CPU_INTENSIVE,        /* wq_cpu_intensive_thresh_us violations */
        PWQ_STAT_CM_WAKEUP,        /* concurrency-management worker wakeups */
        PWQ_STAT_REPATRIATED,        /* unbound workers brought back into scope */
        PWQ_STAT_MAYDAY,        /* maydays to rescuer */
        PWQ_STAT_RESCUED,        /* linked work items executed by rescuer */

        /* number of stats above; sizes pool_workqueue->stats[], keep last */
        PWQ_NR_STATS,
};

/*
 * The per-pool workqueue.  While queued, bits below WORK_STRUCT_PWQ_SHIFT
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned to two to the power of
 * the number of flag bits.
 */
struct pool_workqueue {
        struct worker_pool        *pool;                /* I: the associated pool */
        struct workqueue_struct *wq;                /* I: the owning workqueue */
        int                        work_color;        /* L: current color */
        int                        flush_color;        /* L: flushing color */
        int                        refcnt;                /* L: reference count */
        int                        nr_in_flight[WORK_NR_COLORS];
                                                /* L: nr of in_flight works, per color */
        bool                        plugged;        /* L: execution suspended */

        /*
         * nr_active management and WORK_STRUCT_INACTIVE:
         *
         * When pwq->nr_active >= max_active, new work item is queued to
         * pwq->inactive_works instead of pool->worklist and marked with
         * WORK_STRUCT_INACTIVE.
         *
         * All work items marked with WORK_STRUCT_INACTIVE do not participate in
         * nr_active and all work items in pwq->inactive_works are marked with
         * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
         * in pwq->inactive_works. Some of them are ready to run in
         * pool->worklist or worker->scheduled. Those work items are only struct
         * wq_barrier which is used for flush_work() and should not participate
         * in nr_active. For non-barrier work item, it is marked with
         * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
         */
        int                        nr_active;        /* L: nr of active works */
        struct list_head        inactive_works;        /* L: inactive works */
        struct list_head        pending_node;        /* LN: node on wq_node_nr_active->pending_pwqs */
        struct list_head        pwqs_node;        /* WR: node on wq->pwqs */
        struct list_head        mayday_node;        /* MD: node on wq->maydays */

        /* one counter per enum pool_workqueue_stats entry */
        u64                        stats[PWQ_NR_STATS];

        /*
         * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
         * and pwq_release_workfn() for details. pool_workqueue itself is also
         * RCU protected so that the first pwq can be determined without
         * grabbing wq->mutex.
         */
        struct kthread_work        release_work;
        struct rcu_head                rcu;
} __aligned(1 << WORK_STRUCT_PWQ_SHIFT);

/*
 * Structure used to wait for workqueue flush.
 *
 * Fields marked "WQ:" are protected by wq->mutex (see the exclusion rules
 * above).  struct workqueue_struct refers to flushers through its
 * first_flusher pointer and its flusher_queue / flusher_overflow lists.
 */
struct wq_flusher {
        struct list_head        list;                /* WQ: list of flushers */
        int                        flush_color;        /* WQ: flush color waiting for */
        struct completion        done;                /* flush completion */
};

struct wq_device;

/*
 * Unlike in a per-cpu workqueue where max_active limits its concurrency level
 * on each CPU, in an unbound workqueue, max_active applies to the whole system.
 * As sharing a single nr_active across multiple sockets can be very expensive,
 * the counting and enforcement is per NUMA node.
 *
 * The following struct is used to enforce per-node max_active. When a pwq wants
 * to start executing a work item, it should increment ->nr using
 * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
 * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
 * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
 * round-robin order.
 */
struct wq_node_nr_active {
        int                        max;                /* per-node max_active */
        atomic_t                nr;                /* per-node nr_active */
        raw_spinlock_t                lock;                /* nests inside pool locks */
        /* entries are linked through pool_workqueue->pending_node */
        struct list_head        pending_pwqs;        /* LN: pwqs with inactive works */
};

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
        struct list_head        pwqs;                /* WR: all pwqs of this wq */
        struct list_head        list;                /* PR: list of all workqueues */

        struct mutex                mutex;                /* protects this wq */
        int                        work_color;        /* WQ: current work color */
        int                        flush_color;        /* WQ: current flush color */
        atomic_t                nr_pwqs_to_flush; /* flush in progress */
        struct wq_flusher        *first_flusher;        /* WQ: first flusher */
        struct list_head        flusher_queue;        /* WQ: flush waiters */
        struct list_head        flusher_overflow; /* WQ: flush overflow list */

        struct list_head        maydays;        /* MD: pwqs requesting rescue */
        struct worker                *rescuer;        /* MD: rescue worker */

        int                        nr_drainers;        /* WQ: drain in progress */

        /* See alloc_workqueue() function comment for info on min/max_active */
        int                        max_active;        /* WO: max active works */
        int                        min_active;        /* WO: min active works */
        int                        saved_max_active; /* WQ: saved max_active */
        int                        saved_min_active; /* WQ: saved min_active */

        struct workqueue_attrs        *unbound_attrs;        /* PW: only for unbound wqs */
        struct pool_workqueue __rcu *dfl_pwq;   /* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
        struct wq_device        *wq_dev;        /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
        /* lockdep state; these members exist only with CONFIG_LOCKDEP */
        char                        *lock_name;
        struct lock_class_key        key;
        struct lockdep_map        lockdep_map;
#endif
        char                        name[WQ_NAME_LEN]; /* I: workqueue name */

        /*
         * Destruction of workqueue_struct is RCU protected to allow walking
         * the workqueues list without grabbing wq_pool_mutex.
         * This is used to dump all workqueues from sysrq.
         */
        struct rcu_head                rcu;

        /* hot fields used during command issue, aligned to cacheline */
        unsigned int                flags ____cacheline_aligned; /* WQ: WQ_* flags */
        struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
        /* flexible array member: must remain the last field of the struct */
        struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
};

/*
 * Each pod type describes how CPUs should be grouped for unbound workqueues.
 * See the comment above workqueue_attrs->affn_scope.
 */
struct wq_pod_type {
        /* one instance exists per affinity scope, see wq_pod_types[] below */
        int                        nr_pods;        /* number of pods */
        cpumask_var_t                *pod_cpus;        /* pod -> cpus */
        int                        *pod_node;        /* pod -> node */
        int                        *cpu_pod;        /* cpu -> pod */
};

/*
 * Three u32 values grouped together.
 * NOTE(review): presumably the unpacked form of work_struct->data while the
 * work item is off-queue (pool ID, disable count, flag bits) - confirm
 * against the code that fills and consumes this struct.
 */
struct work_offq_data {
        u32                        pool_id;
        u32                        disable;
        u32                        flags;
};

/* Name string for each affinity scope, indexed by the WQ_AFFN_* constant. */
static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
        [WQ_AFFN_DFL]                = "default",
        [WQ_AFFN_CPU]                = "cpu",
        [WQ_AFFN_SMT]                = "smt",
        [WQ_AFFN_CACHE]                = "cache",
        [WQ_AFFN_NUMA]                = "numa",
        [WQ_AFFN_SYSTEM]        = "system",
};

/*
 * Per-cpu work items which run for longer than the following threshold are
 * automatically considered CPU intensive and excluded from concurrency
 * management to prevent them from noticeably delaying other per-cpu work items.
 * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
 * The actual value is initialized in wq_cpu_intensive_thresh_init().
 */
static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
static unsigned int wq_cpu_intensive_warning_thresh = 4;
module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644);
#endif

/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static bool wq_online;                        /* can kworkers be created yet? */
/* NOTE(review): presumably set once CPU topology setup has completed - confirm */
static bool wq_topo_initialized __read_mostly = false;

/* NOTE(review): looks like the slab cache for pool_workqueue objects - confirm */
static struct kmem_cache *pwq_cache;

/* one pod type per affinity scope; presumably indexed by enum wq_affn_scope */
static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
/* affinity scope used as the default, initially WQ_AFFN_CACHE */
static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;

/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *wq_update_pod_attrs_buf;

static DEFINE_MUTEX(wq_pool_mutex);        /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_RAW_SPINLOCK(wq_mayday_lock);        /* protects wq->maydays list */
/* wait for manager to go away */
static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);

static LIST_HEAD(workqueues);                /* PR: list of all workqueues */
static bool workqueue_freezing;                /* PL: have wqs started freezing? */

/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* PL: user requested unbound cpumask via sysfs */
static cpumask_var_t wq_requested_unbound_cpumask;

/* PL: isolated cpumask to be excluded from unbound cpumask */
static cpumask_var_t wq_isolated_cpumask;

/* used to further constrain wq_unbound_cpumask by cmdline parameter */
static struct cpumask wq_cmdline_cpumask __initdata;

/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed.  The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
 */
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);

/*
 * to raise softirq for the BH worker pools on other CPUs; bh_pool_irq_work()
 * uses slot 1 for pools whose nice level is HIGHPRI_NICE_LEVEL and slot 0
 * for all others
 */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS],
                                     bh_pool_irq_works);

/* the BH worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
                                     bh_worker_pools);

/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
                                     cpu_worker_pools);

/* PR: idr of all pools; worker_pool_assign_id() stores the id in pool->id */
static DEFINE_IDR(worker_pool_idr);        /* PR: idr of all pools */

/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

/*
 * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
 * process context while holding a pool lock. Bounce to a dedicated kthread
 * worker to avoid A-A deadlocks.
 */
static struct kthread_worker *pwq_release_worker __ro_after_init;

/*
 * Workqueues exported for use outside this file.
 * NOTE(review): system_bh_wq and system_bh_highpri_wq are the only ones not
 * marked __ro_after_init - confirm whether that is intentional.
 */
struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
struct workqueue_struct *system_bh_wq;
EXPORT_SYMBOL_GPL(system_bh_wq);
struct workqueue_struct *system_bh_highpri_wq;
EXPORT_SYMBOL_GPL(system_bh_highpri_wq);

/* forward declarations */
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);
static void show_one_worker_pool(struct worker_pool *pool);

#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

/*
 * Lockdep check: warn unless the caller is inside an RCU read-side critical
 * section or holds wq_pool_mutex.
 */
#define assert_rcu_or_pool_mutex()                                        \
        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&                        \
                         !lockdep_is_held(&wq_pool_mutex),                \
                         "RCU or wq_pool_mutex should be held")

/*
 * Lockdep check: warn unless the caller is inside an RCU read-side critical
 * section or holds either @wq->mutex or wq_pool_mutex.  @wq is parenthesized
 * so that any pointer expression can be passed safely.
 */
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                        \
        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&                        \
                         !lockdep_is_held(&(wq)->mutex) &&                \
                         !lockdep_is_held(&wq_pool_mutex),                \
                         "RCU, wq->mutex or wq_pool_mutex should be held")

/* Iterate @pool over the NR_STD_WORKER_POOLS BH worker pools of @cpu. */
#define for_each_bh_worker_pool(pool, cpu)                                \
        for ((pool) = &per_cpu(bh_worker_pools, cpu)[0];                \
             (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
             (pool)++)

/* Iterate @pool over the NR_STD_WORKER_POOLS per-cpu worker pools of @cpu. */
#define for_each_cpu_worker_pool(pool, cpu)                                \
        for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];                \
             (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
             (pool)++)

/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration, holds the worker_pool_idr ID of @pool
 *
 * This must be called either with wq_pool_mutex held or RCU read
 * locked.  If the pool needs to be used beyond the locking in effect, the
 * caller is responsible for guaranteeing that the pool stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.  Its condition is always false, so the loop body always runs as
 * the else branch.
 */
#define for_each_pool(pool, pi)                                                \
        idr_for_each_entry(&worker_pool_idr, pool, pi)                        \
                if (({ assert_rcu_or_pool_mutex(); false; })) { }        \
                else

/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * This must be called with wq_pool_attach_mutex held.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.  Its condition is always false, so the loop body always runs as
 * the else branch.
 */
#define for_each_pool_worker(worker, pool)                                \
        list_for_each_entry((worker), &(pool)->workers, node)                \
                if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
                else

/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * This must be called either with wq->mutex held or RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The locking requirement is checked by list_for_each_entry_rcu() itself
 * through the lockdep condition passed as its last argument.  @wq is
 * parenthesized so that any pointer expression can be passed safely.
 */
#define for_each_pwq(pwq, wq)                                                \
        list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,                \
                                 lockdep_is_held(&(wq)->mutex))

#ifdef CONFIG_DEBUG_OBJECTS_WORK

/* forward declaration, the fixup callbacks below need to reference it */
static const struct debug_obj_descr work_debug_descr;

/* debug_hint callback: report the work function of the work item at @addr */
static void *work_debug_hint(void *addr)
{
        struct work_struct *work = addr;

        return work->func;
}

/*
 * is_static_object callback: the work item at @addr counts as static when
 * WORK_STRUCT_STATIC_BIT is set in its data word.
 */
static bool work_is_static_object(void *addr)
{
        struct work_struct *work = addr;
        unsigned long *bits = work_data_bits(work);

        return test_bit(WORK_STRUCT_STATIC_BIT, bits);
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 *
 * The still-active work item is cancelled synchronously and then
 * re-initialized.  Returns %true if the fixup was applied, %false for any
 * other @state.
 */
static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
        struct work_struct *work = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        cancel_work_sync(work);
        debug_object_init(work, &work_debug_descr);
        return true;
}

/*
 * fixup_free is called when:
 * - an active object is freed
 *
 * The still-active work item is cancelled synchronously and then released
 * from debugobjects tracking.  Returns %true if the fixup was applied, %false
 * for any other @state.
 */
static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
        struct work_struct *work = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        cancel_work_sync(work);
        debug_object_free(work, &work_debug_descr);
        return true;
}

/* debugobjects descriptor wiring up the work_struct callbacks defined above */
static const struct debug_obj_descr work_debug_descr = {
        .name                = "work_struct",
        .debug_hint        = work_debug_hint,
        .is_static_object = work_is_static_object,
        .fixup_init        = work_fixup_init,
        .fixup_free        = work_fixup_free,
};

/* Tell debugobjects that @work is being activated. */
static inline void debug_work_activate(struct work_struct *work)
{
        debug_object_activate(work, &work_debug_descr);
}

/* Tell debugobjects that @work is being deactivated. */
static inline void debug_work_deactivate(struct work_struct *work)
{
        debug_object_deactivate(work, &work_debug_descr);
}

/*
 * Register @work with debugobjects.  A non-zero @onstack selects the
 * on-stack flavour of the initialization.
 */
void __init_work(struct work_struct *work, int onstack)
{
        if (!onstack) {
                debug_object_init(work, &work_debug_descr);
                return;
        }

        debug_object_init_on_stack(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

/* Drop the debugobjects tracking of @work. */
void destroy_work_on_stack(struct work_struct *work)
{
        debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

/*
 * Counterpart of destroy_work_on_stack() for delayed work: tears down the
 * embedded timer first, then drops the debugobjects tracking of the work item.
 */
void destroy_delayed_work_on_stack(struct delayed_work *work)
{
        destroy_timer_on_stack(&work->timer);
        debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);

#else
/* !CONFIG_DEBUG_OBJECTS_WORK: the debug hooks compile down to nothing */
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif

/**
 * worker_pool_assign_id - allocate ID and assign it to @pool
 * @pool: the pool pointer of interest
 *
 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
 * successfully, -errno on failure.
 */
static int worker_pool_assign_id(struct worker_pool *pool)
{
        int id;

        lockdep_assert_held(&wq_pool_mutex);

        id = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
                       GFP_KERNEL);
        /* a negative value is the -errno from idr_alloc(), pass it through */
        if (id < 0)
                return id;

        pool->id = id;
        return 0;
}

/*
 * Return the address of the pwq pointer that @wq uses for @cpu.  A negative
 * @cpu selects the default pwq slot (@wq->dfl_pwq).
 */
static struct pool_workqueue __rcu **
unbound_pwq_slot(struct workqueue_struct *wq, int cpu)
{
        if (cpu < 0)
                return &wq->dfl_pwq;

        return per_cpu_ptr(wq->cpu_pwq, cpu);
}

/* @cpu < 0 for dfl_pwq */
static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu)
{
        return rcu_dereference_check(*unbound_pwq_slot(wq, cpu),
                                     lockdep_is_held(&wq_pool_mutex) ||
                                     lockdep_is_held(&wq->mutex));
}

/**
 * unbound_effective_cpumask - effective cpumask of an unbound workqueue
 * @wq: workqueue of interest
 *
 * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
 * is masked with wq_unbound_cpumask to determine the effective cpumask. The
 * default pwq is always mapped to the pool with the current effective cpumask.
 */
static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)
{
        return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;
}

/* Place @color into the color field of the work data flags. */
static unsigned int work_color_to_flags(int color)
{
        return color << WORK_STRUCT_COLOR_SHIFT;
}

/* Extract the WORK_STRUCT_COLOR_BITS wide color field from @work_data. */
static int get_work_color(unsigned long work_data)
{
        const unsigned long color_mask = (1 << WORK_STRUCT_COLOR_BITS) - 1;

        return (work_data >> WORK_STRUCT_COLOR_SHIFT) & color_mask;
}

/* Return the color following @color, wrapping around at WORK_NR_COLORS. */
static int work_next_color(int color)
{
        return (color + 1) % WORK_NR_COLORS;
}

/* Off-queue flags implied by @pool: WORK_OFFQ_BH for BH pools, otherwise 0. */
static unsigned long pool_offq_flags(struct worker_pool *pool)
{
        if (pool->flags & POOL_BH)
                return WORK_OFFQ_BH;

        return 0;
}

/*
 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
 * contain the pointer to the queued pwq.  Once execution starts, the flag
 * is cleared and the high bits contain OFFQ flags and pool ID.
 *
 * set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling()
 * can be used to set the pwq, pool or clear work->data. These functions should
 * only be called while the work is owned - ie. while the PENDING bit is set.
 *
 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
 * corresponding to a work.  Pool is available once the work has been
 * queued anywhere after initialization until it is sync canceled.  pwq is
 * available only while the work item is queued.
 */
/*
 * Store @data into @work->data.  The result of work_static() is OR'd in so
 * that it survives the update.  Warns once if @work is not pending.
 */
static inline void set_work_data(struct work_struct *work, unsigned long data)
{
        unsigned long new_data;

        WARN_ON_ONCE(!work_pending(work));

        new_data = data | work_static(work);
        atomic_long_set(&work->data, new_data);
}

/*
 * Point @work at @pwq: the pwq address is combined with PENDING, the PWQ
 * marker and the caller supplied @flags.
 */
static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
                         unsigned long flags)
{
        unsigned long data = (unsigned long)pwq;

        data |= WORK_STRUCT_PENDING;
        data |= WORK_STRUCT_PWQ;
        data |= flags;

        set_work_data(work, data);
}

/*
 * Record @pool_id and @flags as the off-queue data of @work while leaving
 * WORK_STRUCT_PENDING set.
 */
static void set_work_pool_and_keep_pending(struct work_struct *work,
                                           int pool_id, unsigned long flags)
{
        unsigned long data = (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT;

        data |= WORK_STRUCT_PENDING;
        data |= flags;

        set_work_data(work, data);
}

/*
 * Record @pool_id and @flags as the off-queue data of @work.  Unlike
 * set_work_pool_and_keep_pending(), WORK_STRUCT_PENDING is not OR'd into the
 * new value, so the store also clears the PENDING bit (unless @flags carries
 * it).  The store is bracketed by the two barriers explained below.
 */
static void set_work_pool_and_clear_pending(struct work_struct *work,
                                            int pool_id, unsigned long flags)
{
        /*
         * The following wmb is paired with the implied mb in
         * test_and_set_bit(PENDING) and ensures all updates to @work made
         * here are visible to and precede any updates by the next PENDING
         * owner.
         */
        smp_wmb();
        set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) |
                      flags);
        /*
         * The following mb guarantees that previous clear of a PENDING bit
         * will not be reordered with any speculative LOADS or STORES from
         * work->current_func, which is executed afterwards.  This possible
         * reordering can lead to a missed execution on attempt to queue
         * the same @work.  E.g. consider this case:
         *
         *   CPU#0                         CPU#1
         *   ----------------------------  --------------------------------
         *
         * 1  STORE event_indicated
         * 2  queue_work_on() {
         * 3    test_and_set_bit(PENDING)
         * 4 }                             set_..._and_clear_pending() {
         * 5                                 set_work_data() # clear bit
         * 6                                 smp_mb()
         * 7                               work->current_func() {
         * 8                                      LOAD event_indicated
         *                                   }
         *
         * Without an explicit full barrier speculative LOAD on line 8 can
         * be executed before CPU#0 does STORE on line 1.  If that happens,
         * CPU#0 observes the PENDING bit is still set and new execution of
         * a @work is not queued in a hope, that CPU#1 will eventually
         * finish the queued @work.  Meanwhile CPU#1 does not see
         * event_indicated is set, because speculative LOAD was executed
         * before actual STORE.
         */
        smp_mb();
}

/*
 * Extract the pool_workqueue pointer from a work data word by masking it
 * with WORK_STRUCT_PWQ_MASK.  Callers in this file only use the result after
 * checking that WORK_STRUCT_PWQ is set in @data.
 */
static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
{
        return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK);
}

/*
 * Return the pwq recorded in @work's data word, or %NULL when the
 * WORK_STRUCT_PWQ flag is not set.
 */
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
        unsigned long data = atomic_long_read(&work->data);

        if (!(data & WORK_STRUCT_PWQ))
                return NULL;

        return work_struct_pwq(data);
}

/**
 * get_work_pool - return the worker_pool a given work was associated with
 * @work: the work item of interest
 *
 * Pools are created and destroyed under wq_pool_mutex, and allows read
 * access under RCU read lock.  As such, this function should be
 * called under wq_pool_mutex or inside of a rcu_read_lock() region.
 *
 * All fields of the returned pool are accessible as long as the above
 * mentioned locking is in effect.  If the returned pool needs to be used
 * beyond the critical section, the caller is responsible for ensuring the
 * returned pool is and stays online.
 *
 * Return: The worker_pool @work was last associated with.  %NULL if none.
 */
static struct worker_pool *get_work_pool(struct work_struct *work)
{
        unsigned long data = atomic_long_read(&work->data);
        int pool_id;

        assert_rcu_or_pool_mutex();

        if (data & WORK_STRUCT_PWQ)
                return work_struct_pwq(data)->pool;

        pool_id = data >> WORK_OFFQ_POOL_SHIFT;
        if (pool_id == WORK_OFFQ_POOL_NONE)
                return NULL;

        return idr_find(&worker_pool_idr, pool_id);
}

/*
 * Extract the @bits wide field which starts at bit @shift of @v.
 *
 * The mask is built from an unsigned long constant.  With a plain int
 * constant, a 31 bit wide field would compute INT_MIN - 1, which is signed
 * overflow and thus undefined behavior.  @bits must be smaller than the width
 * of unsigned long.
 */
static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits)
{
        return (v >> shift) & ((1UL << bits) - 1);
}

static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data)
{
        WARN_ON_ONCE(data & WORK_STRUCT_PWQ);

        offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT,
                                        WORK_OFFQ_POOL_BITS);
        offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT,
                                        WORK_OFFQ_DISABLE_BITS);
        offqd->flags = data & WORK_OFFQ_FLAG_MASK;
}

/*
 * Pack the disable and flags fields of @offqd back into work data layout.
 * The pool ID is not included.
 */
static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd)
{
        unsigned long disable = offqd->disable;
        unsigned long flags = offqd->flags;

        return (disable << WORK_OFFQ_DISABLE_SHIFT) | flags;
}

/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with pool->lock held.
 */

/*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
 * function will always return %true for unbound pools as long as the
 * worklist isn't empty.
 */
static bool need_more_worker(struct worker_pool *pool)
{
        return !list_empty(&pool->worklist) && !pool->nr_running;
}

/*
 * Can I start working?  Called from busy but !running workers.  The answer
 * is yes as long as the pool has at least one idle worker.
 */
static bool may_start_working(struct worker_pool *pool)
{
        return pool->nr_idle != 0;
}

/* Do I need to keep working?  Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
        return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
}

/* Do we need a new worker?  Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
        return need_more_worker(pool) && !may_start_working(pool);
}

/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
        bool managing = pool->flags & POOL_MANAGER_ACTIVE;
        int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
        int nr_busy = pool->nr_workers - nr_idle;

        return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}

/**
 * worker_set_flags - set worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to set
 *
 * Set @flags in @worker->flags and adjust nr_running accordingly.
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
        struct worker_pool *pool = worker->pool;

        lockdep_assert_held(&pool->lock);

        /* If transitioning into NOT_RUNNING, adjust nr_running. */
        if ((flags & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING)) {
                pool->nr_running--;
        }

        worker->flags |= flags;
}

/**
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to clear
 *
 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
        struct worker_pool *pool = worker->pool;
        unsigned int prev_flags = worker->flags;

        lockdep_assert_held(&pool->lock);

        worker->flags &= ~flags;

        /* Nothing to adjust unless a NOT_RUNNING flag was asked to be cleared */
        if (!(flags & WORKER_NOT_RUNNING))
                return;

        /* ... and the worker was actually NOT_RUNNING before. */
        if (!(prev_flags & WORKER_NOT_RUNNING))
                return;

        /*
         * NOT_RUNNING is a mask of multiple flags, not a single flag, so
         * the worker only transitions out of NOT_RUNNING once none of them
         * remains set.  Only then increment nr_running.
         */
        if (worker->flags & WORKER_NOT_RUNNING)
                return;

        pool->nr_running++;
}

/*
 * Return the first idle worker, or %NULL if the idle list is empty.  Called
 * with pool->lock held.
 */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
        struct list_head *idle_list = &pool->idle_list;

        if (unlikely(list_empty(idle_list)))
                return NULL;

        return list_first_entry(idle_list, struct worker, entry);
}

/**
 * worker_enter_idle - enter idle state
 * @worker: worker which is entering idle state
 *
 * @worker is entering idle state.  Update stats and idle timer if
 * necessary.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock).
 */
static void worker_enter_idle(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        /* already idle, nothing to do */
        if (WARN_ON_ONCE(worker->flags & WORKER_IDLE))
                return;

        /* bail out if still linked through both ->entry and ->hentry */
        if (WARN_ON_ONCE(!list_empty(&worker->entry) &&
                         (worker->hentry.next || worker->hentry.pprev)))
                return;

        /* can't use worker_set_flags(), also called from create_worker() */
        worker->flags |= WORKER_IDLE;
        pool->nr_idle++;
        worker->last_active = jiffies;

        /* idle_list is LIFO */
        list_add(&worker->entry, &pool->idle_list);

        /* arm the idle timer if there are surplus workers and it isn't armed */
        if (too_many_workers(pool)) {
                if (!timer_pending(&pool->idle_timer))
                        mod_timer(&pool->idle_timer,
                                  jiffies + IDLE_WORKER_TIMEOUT);
        }

        /* Sanity check nr_running. */
        WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
}

/**
 * worker_leave_idle - leave idle state
 * @worker: worker which is leaving idle state
 *
 * @worker is leaving idle state.  Update stats.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock).
 */
static void worker_leave_idle(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        /* leaving idle without being idle is a bug, warn once and bail out */
        if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
                return;
        /* clearing the flag through the helper also adjusts nr_running */
        worker_clr_flags(worker, WORKER_IDLE);
        pool->nr_idle--;
        /* unlink from pool->idle_list, where worker_enter_idle() put it */
        list_del_init(&worker->entry);
}

/**
 * find_worker_executing_work - find worker which is executing a work
 * @pool: pool of interest
 * @work: work to find worker for
 *
 * Find a worker which is executing @work on @pool by searching
 * @pool->busy_hash which is keyed by the address of @work.  For a worker
 * to match, its current execution should match the address of @work and
 * its work function.  This is to avoid unwanted dependency between
 * unrelated work executions through a work item being recycled while still
 * being executed.
 *
 * This is a bit tricky.  A work item may be freed once its execution
 * starts and nothing prevents the freed area from being recycled for
 * another work item.  If the same work item address ends up being reused
 * before the original execution finishes, workqueue will identify the
 * recycled work item as currently executing and make it wait until the
 * current execution finishes, introducing an unwanted dependency.
 *
 * This function checks the work item address and work function to avoid
 * false positives.  Note that this isn't complete as one may construct a
 * work function which can introduce dependency onto itself through a
 * recycled work item.  Well, if somebody wants to shoot oneself in the
 * foot that badly, there's only so much we can do, and if such deadlock
 * actually occurs, it should be easy to locate the culprit work function.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 *
 * Return:
 * Pointer to worker which is executing @work if found, %NULL
 * otherwise.
 */
static struct worker *find_worker_executing_work(struct worker_pool *pool,
                                                 struct work_struct *work)
{
        struct worker *worker;

        hash_for_each_possible(pool->busy_hash, worker, hentry,
                               (unsigned long)work)
                if (worker->current_work == work &&
                    worker->current_func == work->func)
                        return worker;

        return NULL;
}

/**
 * move_linked_works - move linked works to a list
 * @work: start of series of works to be scheduled
 * @head: target list to append @work to
 * @nextp: out parameter for nested worklist walking
 *
 * Schedule linked works starting from @work to @head. Work series to be
 * scheduled starts at @work and includes any consecutive work with
 * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on
 * @nextp.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void move_linked_works(struct work_struct *work, struct list_head *head,
                              struct work_struct **nextp)
{
        struct work_struct *n;

        /*
         * Linked worklist will always end before the end of the list,
         * use NULL for list head.
         */
        list_for_each_entry_safe_from(work, n, NULL, entry) {
                list_move_tail(&work->entry, head);
                /* a work without LINKED set is the last one of the series */
                if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
                        break;
        }

        /*
         * If we're already inside safe list traversal and have moved
         * multiple works to the scheduled queue, the next position
         * needs to be updated.  @n was sampled by the safe iterator before
         * the last work was moved, so it is the entry which followed that
         * work on the source list.
         */
        if (nextp)
                *nextp = n;
}

/**
 * assign_work - assign a work item and its linked work items to a worker
 * @work: work to assign
 * @worker: worker to assign to
 * @nextp: out parameter for nested worklist walking
 *
 * Assign @work and its linked work items to @worker. If @work is already being
 * executed by another worker in the same pool, it'll be punted there.
 *
 * If @nextp is not NULL, it's updated to point to the next work of the last
 * scheduled work. This allows assign_work() to be nested inside
 * list_for_each_entry_safe().
 *
 * Returns %true if @work was successfully assigned to @worker. %false if @work
 * was punted to another worker already executing it.
 */
static bool assign_work(struct work_struct *work, struct worker *worker,
                        struct work_struct **nextp)
{
        struct worker_pool *pool = worker->pool;
        struct worker *executing;

        lockdep_assert_held(&pool->lock);

        /*
         * A single work shouldn't be executed concurrently by multiple workers.
         * __queue_work() ensures that @work doesn't jump to a different pool
         * while still running in the previous pool. Here, we should ensure that
         * @work is not executed concurrently by multiple workers from the same
         * pool. Check whether anyone is already processing the work. If so,
         * defer the work to the currently executing one.
         */
        executing = find_worker_executing_work(pool, work);
        if (likely(!executing)) {
                move_linked_works(work, &worker->scheduled, nextp);
                return true;
        }

        /* punt @work and its linked works to the worker already running it */
        move_linked_works(work, &executing->scheduled, nextp);
        return false;
}

/*
 * Return the irq_work of @pool->cpu which belongs to the BH pool @pool.
 * Slot 1 is used for pools at HIGHPRI_NICE_LEVEL, slot 0 for all others.
 */
static struct irq_work *bh_pool_irq_work(struct worker_pool *pool)
{
        int slot = 0;

        if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
                slot = 1;

        return &per_cpu(bh_pool_irq_works, pool->cpu)[slot];
}

/*
 * Get the BH pool @pool processed.  On SMP, a pool of another CPU which is
 * not being drained is kicked through its irq_work.  Otherwise the matching
 * softirq is raised directly: HI_SOFTIRQ for pools at HIGHPRI_NICE_LEVEL,
 * TASKLET_SOFTIRQ for the rest.
 */
static void kick_bh_pool(struct worker_pool *pool)
{
        unsigned int softirq_nr;

#ifdef CONFIG_SMP
        /* see drain_dead_softirq_workfn() for BH_DRAINING */
        if (unlikely(pool->cpu != smp_processor_id() &&
                     !(pool->flags & POOL_BH_DRAINING))) {
                irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu);
                return;
        }
#endif
        if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
                softirq_nr = HI_SOFTIRQ;
        else
                softirq_nr = TASKLET_SOFTIRQ;

        raise_softirq_irqoff(softirq_nr);
}

/**
 * kick_pool - wake up an idle worker if necessary
 * @pool: pool to kick
 *
 * @pool may have pending work items. Wake up worker if necessary. Returns
 * whether a worker was woken up.
 */
static bool kick_pool(struct worker_pool *pool)
{
        struct worker *idle = first_idle_worker(pool);
        struct task_struct *task;

        lockdep_assert_held(&pool->lock);

        /* nothing to do unless a worker is needed and one is idle */
        if (!need_more_worker(pool))
                return false;
        if (!idle)
                return false;

        /* BH pools are kicked through softirq instead of a task wakeup */
        if (pool->flags & POOL_BH) {
                kick_bh_pool(pool);
                return true;
        }

        task = idle->task;

#ifdef CONFIG_SMP
        /*
         * Idle @idle is about to execute the first queued work and waking up
         * provides an opportunity to migrate it at a lower cost by setting
         * the task's wake_cpu field. Let's see if we want to move it to
         * improve execution locality.
         *
         * We're waking the worker that went idle the latest and there's some
         * chance that it is marked idle but hasn't gone off CPU yet. If so,
         * setting the wake_cpu won't do anything. As this is a best-effort
         * optimization and the race window is narrow, let's leave as-is for
         * now. If this becomes pronounced, we can skip over workers which are
         * still on cpu when picking an idle worker.
         *
         * If @pool has non-strict affinity, the worker might have ended up
         * outside its affinity scope. Repatriate.
         */
        if (!pool->attrs->affn_strict &&
            !cpumask_test_cpu(task->wake_cpu, pool->attrs->__pod_cpumask)) {
                struct work_struct *first = list_first_entry(&pool->worklist,
                                                struct work_struct, entry);
                int target = cpumask_any_and_distribute(pool->attrs->__pod_cpumask,
                                                        cpu_online_mask);

                if (target < nr_cpu_ids) {
                        task->wake_cpu = target;
                        get_work_pwq(first)->stats[PWQ_STAT_REPATRIATED]++;
                }
        }
#endif
        wake_up_process(task);
        return true;
}

#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT

/*
 * Concurrency-managed per-cpu work items that hog CPU for longer than
 * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
 * which prevents them from stalling other concurrency-managed work items. If a
 * work function keeps triggering this mechanism, it's likely that the work item
 * should be using an unbound workqueue instead.
 *
 * wq_cpu_intensive_report() tracks work functions which trigger such conditions
 * and report them so that they can be examined and converted to use unbound
 * workqueues as appropriate. To avoid flooding the console, each violating work
 * function is tracked and reported with exponential backoff.
 */
/* capacity of wci_ents[]; wci_hash is sized from it through ilog2() */
#define WCI_MAX_ENTS 128

/* one tracked work function, see wq_cpu_intensive_report() */
struct wci_ent {
        work_func_t                func;                /* the violating work function, hash key */
        atomic64_t                cnt;                /* number of violations recorded */
        struct hlist_node        hash_node;        /* link in wci_hash */
};

static struct wci_ent wci_ents[WCI_MAX_ENTS];
/* number of wci_ents[] slots handed out, incremented under wci_lock */
static int wci_nr_ents;
/* serializes the allocation of new entries and their insertion into wci_hash */
static DEFINE_RAW_SPINLOCK(wci_lock);
static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));

/*
 * Look up the tracking entry of @func in wci_hash.  Returns the entry, or
 * %NULL if @func hasn't been recorded yet.
 */
static struct wci_ent *wci_find_ent(work_func_t func)
{
        struct wci_ent *cur;

        hash_for_each_possible_rcu(wci_hash, cur, hash_node,
                                   (unsigned long)func) {
                /* the bucket may hold other functions, compare to be sure */
                if (cur->func != func)
                        continue;

                return cur;
        }

        return NULL;
}

/*
 * Record that @func tripped the CPU-intensive auto-detection and, when its
 * hit count lands on the reporting schedule, emit a deferred warning.
 *
 * Lookups are done without wci_lock; the lock is only taken to claim a new
 * slot in wci_ents[] and publish it in wci_hash. Once all WCI_MAX_ENTS slots
 * are used, previously unseen functions are ignored.
 */
static void wq_cpu_intensive_report(work_func_t func)
{
        struct wci_ent *ent;

restart:
        ent = wci_find_ent(func);
        if (ent) {
                u64 cnt;

                /*
                 * Start reporting from the warning_thresh and back off
                 * exponentially.
                 *
                 * Print the very count that passed the test below instead of
                 * re-reading the atomic: a concurrent increment could
                 * otherwise make the reported number disagree with the
                 * backoff schedule.
                 */
                cnt = atomic64_inc_return_relaxed(&ent->cnt);
                if (wq_cpu_intensive_warning_thresh &&
                    cnt >= wq_cpu_intensive_warning_thresh &&
                    is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh))
                        printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
                                        ent->func, wq_cpu_intensive_thresh_us,
                                        cnt);
                return;
        }

        /*
         * @func is a new violation. Allocate a new entry for it. If wci_ents[]
         * is exhausted, something went really wrong and we probably made enough
         * noise already.
         */
        if (wci_nr_ents >= WCI_MAX_ENTS)
                return;

        raw_spin_lock(&wci_lock);

        /* Recheck under the lock; another CPU may have taken the last slot. */
        if (wci_nr_ents >= WCI_MAX_ENTS) {
                raw_spin_unlock(&wci_lock);
                return;
        }

        /* Someone else inserted @func while we were acquiring the lock. */
        if (wci_find_ent(func)) {
                raw_spin_unlock(&wci_lock);
                goto restart;
        }

        ent = &wci_ents[wci_nr_ents++];
        ent->func = func;
        atomic64_set(&ent->cnt, 0);
        hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);

        raw_spin_unlock(&wci_lock);

        /* Go around again so the fresh entry gets its first hit counted. */
        goto restart;
}

#else        /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
/* Reporting is compiled out (!CONFIG_WQ_CPU_INTENSIVE_REPORT): no-op stub. */
static void wq_cpu_intensive_report(work_func_t func) {}
#endif        /* CONFIG_WQ_CPU_INTENSIVE_REPORT */

/**
 * wq_worker_running - a worker is running again
 * @task: task waking up
 *
 * This function is called when a worker returns from schedule().
 *
 * Does nothing unless @worker->sleeping is set (wq_worker_sleeping() is the
 * one setting it). Otherwise the worker is counted in @pool->nr_running
 * again unless it is flagged WORKER_NOT_RUNNING, the CPU-hog measurement
 * start (@worker->current_at) is reset, and @worker->sleeping is cleared.
 */
void wq_worker_running(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);

        /* Not marked as sleeping: there is nothing to undo. */
        if (!READ_ONCE(worker->sleeping))
                return;

        /*
         * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
         * and the nr_running increment below, we may ruin the nr_running reset
         * and leave with an unexpected pool->nr_running == 1 on the newly unbound
         * pool. Protect against such race.
         */
        preempt_disable();
        if (!(worker->flags & WORKER_NOT_RUNNING))
                worker->pool->nr_running++;
        preempt_enable();

        /*
         * CPU intensive auto-detection cares about how long a work item hogged
         * CPU without sleeping. Reset the starting timestamp on wakeup.
         */
        worker->current_at = worker->task->se.sum_exec_runtime;

        /* Cleared last, after the accounting above has been restored. */
        WRITE_ONCE(worker->sleeping, 0);
}

/**
 * wq_worker_sleeping - a worker is going to sleep
 * @task: task going to sleep
 *
 * This function is called from schedule() when a busy worker is
 * going to sleep.
 *
 * Marks the worker as sleeping, drops it from @pool->nr_running under
 * @pool->lock and calls kick_pool(); PWQ_STAT_CM_WAKEUP is bumped when
 * kick_pool() returns true. Workers flagged WORKER_NOT_RUNNING and workers
 * already marked sleeping are left untouched.
 */
void wq_worker_sleeping(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);
        struct worker_pool *pool;

        /*
         * Rescuers, which may not have all the fields set up like normal
         * workers, also reach here, let's not access anything before
         * checking NOT_RUNNING.
         */
        if (worker->flags & WORKER_NOT_RUNNING)
                return;

        pool = worker->pool;

        /* Return if preempted before wq_worker_running() was reached */
        if (READ_ONCE(worker->sleeping))
                return;

        /* Set before taking the lock; wq_worker_tick() tests this flag. */
        WRITE_ONCE(worker->sleeping, 1);
        raw_spin_lock_irq(&pool->lock);

        /*
         * Recheck in case unbind_workers() preempted us. We don't
         * want to decrement nr_running after the worker is unbound
         * and nr_running has been reset.
         */
        if (worker->flags & WORKER_NOT_RUNNING) {
                raw_spin_unlock_irq(&pool->lock);
                return;
        }

        pool->nr_running--;
        /* Count the wakeup only when kick_pool() reports success. */
        if (kick_pool(pool))
                worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;

        raw_spin_unlock_irq(&pool->lock);
}

/**
 * wq_worker_tick - a scheduler tick occurred while a kworker is running
 * @task: task currently running
 *
 * Called from sched_tick(). We're in the IRQ context and the current
 * worker's fields which follow the 'K' locking rule can be accessed safely.
 *
 * Charges TICK_USEC to the current pwq's PWQ_STAT_CPU_TIME and, if the
 * worker has been running for at least wq_cpu_intensive_thresh_us since
 * @worker->current_at, flags it WORKER_CPU_INTENSIVE, reports the work
 * function and calls kick_pool().
 */
void wq_worker_tick(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);
        struct pool_workqueue *pwq = worker->current_pwq;
        struct worker_pool *pool = worker->pool;

        /* No current pwq, so there is nothing to charge this tick to. */
        if (!pwq)
                return;

        pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;

        /* A zero threshold skips the auto-detection below entirely. */
        if (!wq_cpu_intensive_thresh_us)
                return;

        /*
         * If the current worker is concurrency managed and hogged the CPU for
         * longer than wq_cpu_intensive_thresh_us, it's automatically marked
         * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
         *
         * Set @worker->sleeping means that @worker is in the process of
         * switching out voluntarily and won't be contributing to
         * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
         * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
         * double decrements. The task is releasing the CPU anyway. Let's skip.
         * We probably want to make this prettier in the future.
         */
        if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
            worker->task->se.sum_exec_runtime - worker->current_at <
            wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
                return;

        /* Plain lock/unlock: no IRQ state is saved or restored here. */
        raw_spin_lock(&pool->lock);

        worker_set_flags(worker, WORKER_CPU_INTENSIVE);
        wq_cpu_intensive_report(worker->current_func);
        pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;

        /* Count the wakeup only when kick_pool() reports success. */
        if (kick_pool(pool))
                pwq->stats[PWQ_STAT_CM_WAKEUP]++;

        raw_spin_unlock(&pool->lock);
}

/**
 * wq_worker_last_func - retrieve worker's last work function
 * @task: Task to retrieve last work function of.
 *
 * Determine the last function a worker executed. This is called from
 * the scheduler to get a worker's last known identity.
 *
 * CONTEXT:
 * raw_spin_lock_irq(rq->lock)
 *
 * This function is called during schedule() when a kworker is going
 * to sleep. It's used by psi to identify aggregation workers during
 * dequeuing, to allow periodic aggregation to shut-off when that
 * worker is the last task in the system or cgroup to go to sleep.
 *
 * As this function doesn't involve any workqueue-related locking, it
 * only returns stable values when called from inside the scheduler's
 * queuing and dequeuing paths, when @task, which must be a kworker,
 * is guaranteed to not be processing any works.
 *
 * Return:
 * The last work function %current executed as a worker, NULL if it
 * hasn't executed any work yet.
 */
work_func_t wq_worker_last_func(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);

        return worker->last_func;
}

/**
 * wq_node_nr_active - Pick the shared nr_active counter for a node
 * @wq: workqueue of interest
 * @node: NUMA node, can be %NUMA_NO_NODE
 *
 * Select the wq_node_nr_active that @wq uses on @node:
 *
 * - %NULL when @wq is not %WQ_UNBOUND; such workqueues don't use a shared
 *   nr_active.
 *
 * - node_nr_active[nr_node_ids] when @node is %NUMA_NO_NODE.
 *
 * - node_nr_active[@node] in every other case.
 */
static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
                                                   int node)
{
        if (wq->flags & WQ_UNBOUND) {
                int idx = node;

                /* The slot past the last real node serves %NUMA_NO_NODE. */
                if (idx == NUMA_NO_NODE)
                        idx = nr_node_ids;

                return wq->node_nr_active[idx];
        }

        return NULL;
}

/**
 * wq_update_node_max_active - Recompute the per-node max_active limits
 * @wq: workqueue to update
 * @off_cpu: CPU that's going down, -1 if a CPU is not going down
 *
 * Refresh @wq->node_nr_active[]->max. @wq must be unbound. Each node gets a
 * share of max_active proportional to its number of CPUs in the effective
 * cpumask, clamped to the range [@wq->min_active, max_active]. If no CPU of
 * the wq remains, every node falls back to min_active. The %NUMA_NO_NODE
 * slot always receives max_active.
 */
static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
{
        struct cpumask *effective = unbound_effective_cpumask(wq);
        int min_active = READ_ONCE(wq->min_active);
        int max_active = READ_ONCE(wq->max_active);
        int total_cpus, node;

        lockdep_assert_held(&wq->mutex);

        if (!wq_topo_initialized)
                return;

        /* A departing CPU outside the effective mask doesn't affect the math. */
        if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective))
                off_cpu = -1;

        total_cpus = cpumask_weight_and(effective, cpu_online_mask);
        if (off_cpu >= 0)
                total_cpus--;

        for_each_node(node) {
                struct wq_node_nr_active *nna = wq_node_nr_active(wq, node);
                int node_cpus, share;

                /* If all CPUs of the wq get offline, use the default values */
                if (unlikely(!total_cpus)) {
                        nna->max = min_active;
                        continue;
                }

                node_cpus = cpumask_weight_and(effective, cpumask_of_node(node));
                if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
                        node_cpus--;

                share = DIV_ROUND_UP(max_active * node_cpus, total_cpus);
                nna->max = clamp(share, min_active, max_active);
        }

        wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
}

/**
 * get_pwq - take one more reference on a pool_workqueue
 * @pwq: pool_workqueue to get
 *
 * Bump @pwq's reference count by one. The caller must already hold the
 * matching pool->lock and must guarantee that @pwq's refcnt is positive;
 * a non-positive refcnt triggers a one-time warning.
 */
static void get_pwq(struct pool_workqueue *pwq)
{
        lockdep_assert_held(&pwq->pool->lock);

        WARN_ON_ONCE(pwq->refcnt <= 0);
        pwq->refcnt += 1;
}

/**
 * put_pwq - release one pool_workqueue reference
 * @pwq: pool_workqueue to put
 *
 * Drop one reference of @pwq. When the count hits zero the release work is
 * queued on pwq_release_worker. The caller must hold the matching
 * pool->lock.
 */
static void put_pwq(struct pool_workqueue *pwq)
{
        lockdep_assert_held(&pwq->pool->lock);

        pwq->refcnt--;
        if (unlikely(!pwq->refcnt)) {
                /*
                 * @pwq can't be released under pool->lock, bounce to a
                 * dedicated kthread_worker to avoid A-A deadlocks.
                 */
                kthread_queue_work(pwq_release_worker, &pwq->release_work);
        }
}

/**
 * put_pwq_unlocked - put_pwq() for callers not holding the pool lock
 * @pwq: pool_workqueue to put (can be %NULL)
 *
 * Takes @pwq->pool->lock, calls put_pwq() and releases the lock again.
 * A %NULL @pwq is accepted and ignored.
 */
static void put_pwq_unlocked(struct pool_workqueue *pwq)
{
        if (!pwq)
                return;

        /*
         * As both pwqs and pools are RCU protected, the following lock
         * operations are safe.
         */
        raw_spin_lock_irq(&pwq->pool->lock);
        put_pwq(pwq);
        raw_spin_unlock_irq(&pwq->pool->lock);
}

static bool pwq_is_empty(struct pool_workqueue *pwq)
{
        return !pwq->nr_active && list_empty(&pwq->inactive_works);
}

/*
 * Move inactive @work onto @pwq->pool->worklist via move_linked_works() and
 * clear its WORK_STRUCT_INACTIVE bit. Warns once if @work wasn't inactive.
 *
 * This helper does not touch any nr_active count; the callers visible in
 * this file increment @pwq->nr_active themselves before calling in.
 */
static void __pwq_activate_work(struct pool_workqueue *pwq,
                                struct work_struct *work)
{
        unsigned long *wdb = work_data_bits(work);

        WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
        trace_workqueue_activate_work(work);
        /* The worklist is about to become non-empty: restart the watchdog stamp. */
        if (list_empty(&pwq->pool->worklist))
                pwq->pool->watchdog_ts = jiffies;
        move_linked_works(work, &pwq->pool->worklist, NULL);
        __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
}

/**
 * pwq_activate_work - Make a work item active unless it already is
 * @pwq: pool_workqueue @work belongs to
 * @work: work item to activate
 *
 * The nr_active counts are incremented unconditionally, i.e. without
 * checking them against any max_active limit. Must be called with
 * @pwq->pool->lock held.
 *
 * Returns %true if activated. %false if already active.
 */
static bool pwq_activate_work(struct pool_workqueue *pwq,
                              struct work_struct *work)
{
        struct worker_pool *pool = pwq->pool;
        struct wq_node_nr_active *node_active;
        bool inactive;

        lockdep_assert_held(&pool->lock);

        inactive = *work_data_bits(work) & WORK_STRUCT_INACTIVE;
        if (!inactive)
                return false;

        /* Where a shared per-node counter exists, account the item there too. */
        node_active = wq_node_nr_active(pwq->wq, pool->node);
        if (node_active)
                atomic_inc(&node_active->nr);

        pwq->nr_active++;
        __pwq_activate_work(pwq, work);

        return true;
}

/*
 * Try to take one count from @nna->nr without exceeding @nna->max. The limit
 * is sampled once up front; the increment is retried until the cmpxchg wins
 * or the counter is found at or above the limit.
 *
 * Returns %true if a count was obtained, %false otherwise.
 */
static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
{
        int limit = READ_ONCE(nna->max);
        int cur;

        do {
                cur = atomic_read(&nna->nr);
                if (cur >= limit)
                        return false;
        } while (atomic_cmpxchg_relaxed(&nna->nr, cur, cur + 1) != cur);

        return true;
}

/**
 * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
 * @pwq: pool_workqueue of interest
 * @fill: max_active may have increased, try to increase concurrency level
 *
 * Try to increment nr_active for @pwq. Returns %true if an nr_active count is
 * successfully obtained. %false otherwise.
 *
 * When no shared per-node counter applies, only @pwq->nr_active is compared
 * against @wq->max_active. Otherwise a count must be obtained from the
 * per-node counter first, and @pwq is queued on its pending list when none
 * is available. A plugged @pwq never obtains a count.
 *
 * Must be called with @pwq->pool->lock held.
 */
static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
{
        struct workqueue_struct *wq = pwq->wq;
        struct worker_pool *pool = pwq->pool;
        struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
        bool obtained = false;

        lockdep_assert_held(&pool->lock);

        if (!nna) {
                /* BH or per-cpu workqueue, pwq->nr_active is sufficient */
                obtained = pwq->nr_active < READ_ONCE(wq->max_active);
                goto out;
        }

        /* A plugged pwq must not start executing; see unplug_oldest_pwq(). */
        if (unlikely(pwq->plugged))
                return false;

        /*
         * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
         * already waiting on $nna, pwq_dec_nr_active() will maintain the
         * concurrency level. Don't jump the line.
         *
         * We need to ignore the pending test after max_active has increased as
         * pwq_dec_nr_active() can only maintain the concurrency level but not
         * increase it. This is indicated by @fill.
         */
        if (!list_empty(&pwq->pending_node) && likely(!fill))
                goto out;

        /* Fast path: lockless attempt on the shared counter. */
        obtained = tryinc_node_nr_active(nna);
        if (obtained)
                goto out;

        /*
         * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
         * and try again. The smp_mb() is paired with the implied memory barrier
         * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
         * we see the decremented $nna->nr or they see non-empty
         * $nna->pending_pwqs.
         */
        raw_spin_lock(&nna->lock);

        if (list_empty(&pwq->pending_node))
                list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
        else if (likely(!fill))
                goto out_unlock;

        smp_mb();

        obtained = tryinc_node_nr_active(nna);

        /*
         * If @fill, @pwq might have already been pending. Being spuriously
         * pending in cold paths doesn't affect anything. Let's leave it be.
         */
        if (obtained && likely(!fill))
                list_del_init(&pwq->pending_node);

out_unlock:
        raw_spin_unlock(&nna->lock);
out:
        /* The per-pwq count is bumped on every successful path. */
        if (obtained)
                pwq->nr_active++;
        return obtained;
}

/**
 * pwq_activate_first_inactive - Activate the head of a pwq's inactive list
 * @pwq: pool_workqueue of interest
 * @fill: max_active may have increased, try to increase concurrency level
 *
 * If @pwq has an inactive work item and an nr_active count can be obtained
 * for it, activate the first such item.
 *
 * Returns %true if an inactive work item has been activated. %false if no
 * inactive work item is found or max_active limit is reached.
 */
static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
{
        struct work_struct *first;

        first = list_first_entry_or_null(&pwq->inactive_works,
                                         struct work_struct, entry);
        if (!first)
                return false;

        /* Only try for a count once we know there is something to activate. */
        if (!pwq_tryinc_nr_active(pwq, fill))
                return false;

        __pwq_activate_work(pwq, first);
        return true;
}

/**
 * unplug_oldest_pwq - unplug the oldest pool_workqueue
 * @wq: workqueue_struct where its oldest pwq is to be unplugged
 *
 * This function should only be called for ordered workqueues where only the
 * oldest pwq is unplugged, the others are plugged to suspend execution to
 * ensure proper work item ordering::
 *
 *    dfl_pwq --------------+     [P] - plugged
 *                          |
 *                          v
 *    pwqs -> A -> B [P] -> C [P] (newest)
 *            |    |        |
 *            1    3        5
 *            |    |        |
 *            2    4        6
 *
 * When the oldest pwq is drained and removed, this function should be called
 * to unplug the next oldest one to start its work item execution. Note that
 * pwq's are linked into wq->pwqs with the oldest first, so the first one in
 * the list is the oldest.
 *
 * Must be called with @wq->mutex held. Takes the oldest pwq's pool->lock
 * with IRQs disabled while unplugging.
 */
static void unplug_oldest_pwq(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;

        lockdep_assert_held(&wq->mutex);

        /*
         * Caller should make sure that pwqs isn't empty before calling. Warn
         * and bail out if that contract is broken rather than dereferencing
         * the %NULL that list_first_entry_or_null() hands back.
         */
        pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue,
                                       pwqs_node);
        if (WARN_ON_ONCE(!pwq))
                return;

        raw_spin_lock_irq(&pwq->pool->lock);
        if (pwq->plugged) {
                pwq->plugged = false;
                /* Start the first queued item and make sure a worker picks it up. */
                if (pwq_activate_first_inactive(pwq, true))
                        kick_pool(pwq->pool);
        }
        raw_spin_unlock_irq(&pwq->pool->lock);
}

/**
 * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
 * @nna: wq_node_nr_active to activate a pending pwq for
 * @caller_pool: worker_pool the caller is locking
 *
 * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
 * @caller_pool may be unlocked and relocked to lock other worker_pools.
 *
 * At most one work item is activated per call. On return @caller_pool->lock
 * is held again and @nna->lock has been released.
 */
static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
                                      struct worker_pool *caller_pool)
{
        /* Tracks which pool lock is currently held; starts as the caller's. */
        struct worker_pool *locked_pool = caller_pool;
        struct pool_workqueue *pwq;
        struct work_struct *work;

        lockdep_assert_held(&caller_pool->lock);

        raw_spin_lock(&nna->lock);
retry:
        pwq = list_first_entry_or_null(&nna->pending_pwqs,
                                       struct pool_workqueue, pending_node);
        if (!pwq)
                goto out_unlock;

        /*
         * If @pwq is for a different pool than @locked_pool, we need to lock
         * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
         * / lock dance. For that, we also need to release @nna->lock as it's
         * nested inside pool locks.
         */
        if (pwq->pool != locked_pool) {
                raw_spin_unlock(&locked_pool->lock);
                locked_pool = pwq->pool;
                if (!raw_spin_trylock(&locked_pool->lock)) {
                        raw_spin_unlock(&nna->lock);
                        raw_spin_lock(&locked_pool->lock);
                        raw_spin_lock(&nna->lock);
                        /* @nna->lock was dropped, so re-read the list head. */
                        goto retry;
                }
        }

        /*
         * $pwq may not have any inactive work items due to e.g. cancellations.
         * Drop it from pending_pwqs and see if there's another one.
         */
        work = list_first_entry_or_null(&pwq->inactive_works,
                                        struct work_struct, entry);
        if (!work) {
                list_del_init(&pwq->pending_node);
                goto retry;
        }

        /*
         * Acquire an nr_active count and activate the inactive work item. If
         * $pwq still has inactive work items, rotate it to the end of the
         * pending_pwqs so that we round-robin through them. This means that
         * inactive work items are not activated in queueing order which is fine
         * given that there has never been any ordering across different pwqs.
         */
        if (likely(tryinc_node_nr_active(nna))) {
                pwq->nr_active++;
                __pwq_activate_work(pwq, work);

                if (list_empty(&pwq->inactive_works))
                        list_del_init(&pwq->pending_node);
                else
                        list_move_tail(&pwq->pending_node, &nna->pending_pwqs);

                /* if activating a foreign pool, make sure it's running */
                if (pwq->pool != caller_pool)
                        kick_pool(pwq->pool);
        }

out_unlock:
        raw_spin_unlock(&nna->lock);
        /* Hand back the lock state the caller expects. */
        if (locked_pool != caller_pool) {
                raw_spin_unlock(&locked_pool->lock);
                raw_spin_lock(&caller_pool->lock);
        }
}

/**
 * pwq_dec_nr_active - Retire an active count
 * @pwq: pool_workqueue of interest
 *
 * Decrement @pwq's nr_active and try to activate the first inactive work item.
 * For unbound workqueues, this function may temporarily drop @pwq->pool->lock.
 *
 * Must be called with @pwq->pool->lock held.
 */
static void pwq_dec_nr_active(struct pool_workqueue *pwq)
{
        struct worker_pool *pool = pwq->pool;
        struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);

        lockdep_assert_held(&pool->lock);

        /*
         * @pwq->nr_active should be decremented for both percpu and unbound
         * workqueues.
         */
        pwq->nr_active--;

        /*
         * For a percpu workqueue, it's simple. Just need to kick the first
         * inactive work item on @pwq itself.
         */
        if (!nna) {
                pwq_activate_first_inactive(pwq, false);
                return;
        }

        /*
         * If @pwq is for an unbound workqueue, it's more complicated because
         * multiple pwqs and pools may be sharing the nr_active count. When a
         * pwq needs to wait for an nr_active count, it puts itself on
         * $nna->pending_pwqs. The following atomic_dec_return()'s implied
         * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to
         * guarantee that either we see non-empty pending_pwqs or they see
         * decremented $nna->nr.
         *
         * $nna->max may change as CPUs come online/offline and @pwq->wq's
         * max_active gets updated. However, it is guaranteed to be equal to or
         * larger than @pwq->wq->min_active which is above zero unless freezing.
         * This maintains the forward progress guarantee.
         */
        if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max))
                return;

        /* There is room under the limit now; hand it to a waiting pwq, if any. */
        if (!list_empty(&nna->pending_pwqs))
                node_activate_pending_pwq(nna, pool);
}

/**
 * pwq_dec_nr_in_flight - retire one in-flight work item from a pwq
 * @pwq: pwq of interest
 * @work_data: work_data of work which left the queue
 *
 * Called when a work item has either completed or been removed from the
 * pending queue. Drops the item from @pwq's nr_in_flight accounting, lets
 * flushing make progress and releases the pwq reference the item held.
 *
 * NOTE:
 * For unbound workqueues, this function may temporarily drop @pwq->pool->lock
 * and thus should be called after all other state updates for the in-flight
 * work item is complete.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
{
        const int color = get_work_color(work_data);

        /* Only items that were active hold an nr_active count. */
        if (!(work_data & WORK_STRUCT_INACTIVE))
                pwq_dec_nr_active(pwq);

        pwq->nr_in_flight[color]--;

        /*
         * A flush is in progress, we're at the flushing tip and no work of
         * this color is left in flight: this pwq is done with the flush.
         */
        if (unlikely(pwq->flush_color == color) && !pwq->nr_in_flight[color]) {
                pwq->flush_color = -1;

                /*
                 * If this was the last pwq, wake up the first flusher.  It
                 * will handle the rest.
                 */
                if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
                        complete(&pwq->wq->first_flusher->done);
        }

        put_pwq(pwq);
}

/**
 * try_to_grab_pending - steal work item from worklist and disable irq
 * @work: work item to steal
 * @cflags: %WORK_CANCEL_ flags
 * @irq_flags: place to store irq state
 *
 * Try to grab PENDING bit of @work.  This function can handle @work in any
 * stable state - idle, on timer or on worklist.
 *
 * Return:
 *
 *  ========        ================================================================
 *  1                if @work was pending and we successfully stole PENDING
 *  0                if @work was idle and we claimed PENDING
 *  -EAGAIN        if PENDING couldn't be grabbed at the moment, safe to busy-retry
 *  ========        ================================================================
 *
 * Note:
 * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
 * interrupted while holding PENDING and @work off queue, irq must be
 * disabled on entry.  This, combined with delayed_work->timer being
 * irqsafe, ensures that we return -EAGAIN for finite short period of time.
 *
 * On successful return, >= 0, irq is disabled and the caller is
 * responsible for releasing it using local_irq_restore(*@irq_flags).
 * On -EAGAIN the saved irq state has already been restored.
 *
 * This function is safe to call from any context including IRQ handler.
 */
static int try_to_grab_pending(struct work_struct *work, u32 cflags,
                               unsigned long *irq_flags)
{
        struct worker_pool *pool;
        struct pool_workqueue *pwq;

        local_irq_save(*irq_flags);

        /* try to steal the timer if it exists */
        if (cflags & WORK_CANCEL_DELAYED) {
                struct delayed_work *dwork = to_delayed_work(work);

                /*
                 * dwork->timer is irqsafe.  If del_timer() fails, it's
                 * guaranteed that the timer is not queued anywhere and not
                 * running on the local CPU.
                 */
                if (likely(del_timer(&dwork->timer)))
                        return 1;
        }

        /* try to claim PENDING the normal way */
        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return 0;

        rcu_read_lock();
        /*
         * The queueing is in progress, or it is already queued. Try to
         * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
         */
        pool = get_work_pool(work);
        if (!pool)
                goto fail;

        raw_spin_lock(&pool->lock);
        /*
         * work->data is guaranteed to point to pwq only while the work
         * item is queued on pwq->wq, and both updating work->data to point
         * to pwq on queueing and to pool on dequeueing are done under
         * pwq->pool->lock.  This in turn guarantees that, if work->data
         * points to pwq which is associated with a locked pool, the work
         * item is currently queued on that pool.
         */
        pwq = get_work_pwq(work);
        if (pwq && pwq->pool == pool) {
                unsigned long work_data;

                debug_work_deactivate(work);

                /*
                 * A cancelable inactive work item must be in the
                 * pwq->inactive_works since a queued barrier can't be
                 * canceled (see the comments in insert_wq_barrier()).
                 *
                 * An inactive work item cannot be grabbed directly because
                 * it might have linked barrier work items which, if left
                 * on the inactive_works list, will confuse pwq->nr_active
                 * management later on and cause stall.  Make sure the work
                 * item is activated before grabbing.
                 */
                pwq_activate_work(pwq, work);

                list_del_init(&work->entry);

                /*
                 * work->data points to pwq iff queued. Let's point to pool. As
                 * this destroys work->data needed by the next step, stash it.
                 */
                work_data = *work_data_bits(work);
                set_work_pool_and_keep_pending(work, pool->id,
                                               pool_offq_flags(pool));

                /* must be the last step, see the function comment */
                pwq_dec_nr_in_flight(pwq, work_data);

                /* Success: only the pool lock is dropped, irqs stay disabled. */
                raw_spin_unlock(&pool->lock);
                rcu_read_unlock();
                return 1;
        }
        raw_spin_unlock(&pool->lock);
fail:
        rcu_read_unlock();
        local_irq_restore(*irq_flags);
        return -EAGAIN;
}

/**
 * work_grab_pending - steal work item from worklist and disable irq
 * @work: work item to steal
 * @cflags: %WORK_CANCEL_ flags
 * @irq_flags: place to store IRQ state
 *
 * Busy-retry try_to_grab_pending() until the PENDING bit of @work has been
 * grabbed. @work can be in any stable state - idle, on timer or on worklist.
 *
 * Can be called from any context. IRQ is disabled on return with IRQ state
 * stored in *@irq_flags. The caller is responsible for re-enabling it using
 * local_irq_restore().
 *
 * Returns %true if @work was pending. %false if idle.
 */
static bool work_grab_pending(struct work_struct *work, u32 cflags,
                              unsigned long *irq_flags)
{
        int ret = try_to_grab_pending(work, cflags, irq_flags);

        /* A negative result means the grab didn't succeed yet; spin politely. */
        while (ret < 0) {
                cpu_relax();
                ret = try_to_grab_pending(work, cflags, irq_flags);
        }

        return ret;
}

/**
 * insert_work - insert a work into a pool
 * @pwq: pwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Insert @work which belongs to @pwq at the tail of the list headed by
 * @head, i.e. right before @head.  @extra_flags is or'd to work_struct flags.
 * An extra reference on @pwq is taken for the inserted work item.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
                        struct list_head *head, unsigned int extra_flags)
{
        debug_work_activate(work);

        /* record the work call stack in order to print it in KASAN reports */
        kasan_record_aux_stack_noalloc(work);

        /* we own @work, set data and link */
        set_work_pwq(work, pwq, extra_flags);
        list_add_tail(&work->entry, head);
        /* NOTE(review): presumably paired with put_pwq() in pwq_dec_nr_in_flight() - confirm */
        get_pwq(pwq);
}

/*
 * Test whether @work is being queued from another work executing on the
 * same workqueue.
 */
static bool is_chained_work(struct workqueue_struct *wq)
{
        struct worker *worker;

        worker = current_wq_worker();
        /*
         * Return %true iff I'm a worker executing a work item on @wq.  If
         * I'm @worker, it's safe to dereference it without locking.
         */
        return worker && worker->current_pwq->wq == wq;
}

/*
 * Pick the CPU to queue an unbound work item on. @cpu is used as is when
 * wq_unbound_cpumask allows it and round-robin isn't being forced. Otherwise,
 * rotate through the online CPUs in wq_unbound_cpumask, remembering the last
 * pick per CPU, to avoid perturbing sensitive tasks. If no online CPU is
 * allowed, fall back to @cpu.
 */
static int wq_select_unbound_cpu(int cpu)
{
        int next;

        if (unlikely(wq_debug_force_rr_cpu))
                pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
        else if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
                return cpu;

        /* Continue from where this CPU's previous round-robin pick left off. */
        next = cpumask_next_and(__this_cpu_read(wq_rr_cpu_last),
                                wq_unbound_cpumask, cpu_online_mask);
        if (unlikely(next >= nr_cpu_ids)) {
                /* Ran off the end of the mask; wrap around to the first one. */
                next = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
                if (unlikely(next >= nr_cpu_ids))
                        return cpu;
        }

        __this_cpu_write(wq_rr_cpu_last, next);
        return next;
}

/*
 * Queue @work on @wq, targeting @cpu.
 *
 * @cpu may be WORK_CPU_UNBOUND, in which case the CPU is picked here:
 * wq_select_unbound_cpu() for %WQ_UNBOUND workqueues, the local CPU
 * otherwise.  If @work may still be running on the pool it was last
 * associated with, it is queued on that pool instead so that it does not
 * run concurrently with itself.
 *
 * Nothing is queued (and a one-time warning is emitted) when @wq is
 * draining or being destroyed and the caller is not a work item of @wq.
 *
 * CONTEXT:
 * IRQs disabled (asserted).  Takes and releases a pool->lock and an RCU
 * read-side section internally.
 */
static void __queue_work(int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
{
        struct pool_workqueue *pwq;
        struct worker_pool *last_pool, *pool;
        unsigned int work_flags;
        /* the caller's original request; @cpu may be overwritten on retry */
        unsigned int req_cpu = cpu;

        /*
         * While a work item is PENDING && off queue, a task trying to
         * steal the PENDING will busy-loop waiting for it to either get
         * queued or lose PENDING.  Grabbing PENDING and queueing should
         * happen with IRQ disabled.
         */
        lockdep_assert_irqs_disabled();

        /*
         * For a draining wq, only works from the same workqueue are
         * allowed. The __WQ_DESTROYING helps to spot the issue that
         * queues a new work item to a wq after destroy_workqueue(wq).
         */
        if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
                     WARN_ON_ONCE(!is_chained_work(wq))))
                return;
        rcu_read_lock();
retry:
        /* pwq which will be used unless @work is executing elsewhere */
        if (req_cpu == WORK_CPU_UNBOUND) {
                if (wq->flags & WQ_UNBOUND)
                        cpu = wq_select_unbound_cpu(raw_smp_processor_id());
                else
                        cpu = raw_smp_processor_id();
        }

        pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));
        pool = pwq->pool;

        /*
         * If @work was previously on a different pool, it might still be
         * running there, in which case the work needs to be queued on that
         * pool to guarantee non-reentrancy.
         */
        last_pool = get_work_pool(work);
        if (last_pool && last_pool != pool) {
                struct worker *worker;

                raw_spin_lock(&last_pool->lock);

                worker = find_worker_executing_work(last_pool, work);

                if (worker && worker->current_pwq->wq == wq) {
                        /* still running there: stay on last_pool, lock kept */
                        pwq = worker->current_pwq;
                        pool = pwq->pool;
                        WARN_ON_ONCE(pool != last_pool);
                } else {
                        /* meh... not running there, queue here */
                        raw_spin_unlock(&last_pool->lock);
                        raw_spin_lock(&pool->lock);
                }
        } else {
                raw_spin_lock(&pool->lock);
        }

        /*
         * pwq is determined and locked. For unbound pools, we could have raced
         * with pwq release and it could already be dead. If its refcnt is zero,
         * repeat pwq selection. Note that unbound pwqs never die without
         * another pwq replacing it in cpu_pwq or while work items are executing
         * on it, so the retrying is guaranteed to make forward-progress.
         */
        if (unlikely(!pwq->refcnt)) {
                if (wq->flags & WQ_UNBOUND) {
                        raw_spin_unlock(&pool->lock);
                        cpu_relax();
                        goto retry;
                }
                /* oops */
                WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt\n",
                          wq->name, cpu);
        }

        /* pwq determined, queue */
        trace_workqueue_queue_work(req_cpu, pwq, work);

        /* @work is unexpectedly already linked somewhere: warn and bail out */
        if (WARN_ON(!list_empty(&work->entry)))
                goto out;

        /* account @work under the pwq's current color */
        pwq->nr_in_flight[pwq->work_color]++;
        work_flags = work_color_to_flags(pwq->work_color);

        /*
         * Limit the number of concurrently active work items to max_active.
         * @work must also queue behind existing inactive work items to maintain
         * ordering when max_active changes. See wq_adjust_max_active().
         */
        if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
                /* first item on an empty worklist restarts the watchdog stamp */
                if (list_empty(&pool->worklist))
                        pool->watchdog_ts = jiffies;

                trace_workqueue_activate_work(work);
                insert_work(pwq, work, &pool->worklist, work_flags);
                kick_pool(pool);
        } else {
                /* over the limit (or behind inactive items): park as inactive */
                work_flags |= WORK_STRUCT_INACTIVE;
                insert_work(pwq, work, &pwq->inactive_works, work_flags);
        }

out:
        raw_spin_unlock(&pool->lock);
        rcu_read_unlock();
}

static bool clear_pending_if_disabled(struct work_struct *work)
{
        unsigned long data = *work_data_bits(work);
        struct work_offq_data offqd;

        if (likely((data & WORK_STRUCT_PWQ) ||
                   !(data & WORK_OFFQ_DISABLE_MASK)))
                return false;

        work_offqd_unpack(&offqd, data);
        set_work_pool_and_clear_pending(work, offqd.pool_id,
                                        work_offqd_pack_flags(&offqd));
        return true;
}

/**
 * queue_work_on - queue work on specific cpu
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @work: work to queue
 *
 * We queue the work to a specific CPU, the caller must ensure it
 * can't go away.  Callers that fail to ensure that the specified
 * CPU cannot go away will execute on a randomly chosen CPU.
 * But note well that callers specifying a CPU that never has been
 * online will get a splat.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
bool queue_work_on(int cpu, struct workqueue_struct *wq,
                   struct work_struct *work)
{
        unsigned long irq_flags;
        bool queued = false;

        /* PENDING must be grabbed and the work queued with IRQs off */
        local_irq_save(irq_flags);

        /* somebody else already owns PENDING: nothing to do */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                goto out;

        /* PENDING was just given back because @work is disabled */
        if (clear_pending_if_disabled(work))
                goto out;

        __queue_work(cpu, wq, work);
        queued = true;
out:
        local_irq_restore(irq_flags);
        return queued;
}
EXPORT_SYMBOL(queue_work_on);

/**
 * select_numa_node_cpu - Select a CPU based on NUMA node
 * @node: NUMA node ID that we want to select a CPU from
 *
 * This function will attempt to find a "random" cpu available on a given
 * node. If there are no CPUs available on the given node it will return
 * WORK_CPU_UNBOUND indicating that we should just schedule to any
 * available CPU if we need to schedule this work.
 */
static int select_numa_node_cpu(int node)
{
        int this_cpu, picked;

        /* an invalid or offline node cannot be targeted: leave CPU unbound */
        if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
                return WORK_CPU_UNBOUND;

        /* already running on the requested node: stay where we are */
        this_cpu = raw_smp_processor_id();
        if (cpu_to_node(this_cpu) == node)
                return this_cpu;

        /* otherwise take any online CPU that belongs to the node */
        picked = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);

        /* the node has no online CPU after all: defer the decision */
        if (picked >= nr_cpu_ids)
                return WORK_CPU_UNBOUND;

        return picked;
}

/**
 * queue_work_node - queue work on a "random" cpu for a given NUMA node
 * @node: NUMA node that we are targeting the work for
 * @wq: workqueue to use
 * @work: work to queue
 *
 * We queue the work to a "random" CPU within a given NUMA node. The basic
 * idea here is to provide a way to somehow associate work with a given
 * NUMA node.
 *
 * This function will only make a best effort attempt at getting this onto
 * the right NUMA node. If no node is requested or the requested node is
 * offline then we just fall back to standard queue_work behavior.
 *
 * Currently the "random" CPU ends up being the first available CPU in the
 * intersection of cpu_online_mask and the cpumask of the node, unless we
 * are running on the node. In that case we just use the current CPU.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
bool queue_work_node(int node, struct workqueue_struct *wq,
                     struct work_struct *work)
{
        unsigned long irq_flags;
        bool queued = false;

        /*
         * Only unbound workqueues are supported for now: the CPU picker
         * hands back a single CPU per node rather than rotating through
         * the node's CPUs.  Supporting per-cpu workqueues would require
         * select_numa_node_cpu() to grow some round-robin logic.
         */
        WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));

        local_irq_save(irq_flags);

        /* somebody else already owns PENDING: nothing to do */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                goto out;

        /* PENDING was just given back because @work is disabled */
        if (clear_pending_if_disabled(work))
                goto out;

        /* the CPU is chosen only once we know the work will be queued */
        __queue_work(select_numa_node_cpu(node), wq, work);
        queued = true;
out:
        local_irq_restore(irq_flags);
        return queued;
}
EXPORT_SYMBOL_GPL(queue_work_node);

/*
 * Timer callback of a delayed_work: hand the embedded work item to
 * __queue_work() using the cpu and workqueue stored in the delayed_work.
 */
void delayed_work_timer_fn(struct timer_list *t)
{
        struct delayed_work *dw = from_timer(dw, t, timer);

        /*
         * __queue_work() insists on IRQs being off; this callback is
         * expected to run from an irqsafe timer where that already holds.
         */
        __queue_work(dw->cpu, dw->wq, &dw->work);
}
EXPORT_SYMBOL(delayed_work_timer_fn);

/*
 * Arm @dwork's timer so that its work item gets queued on @wq / @cpu after
 * @delay jiffies, or queue the work item right away when @delay is zero.
 */
static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
                                struct delayed_work *dwork, unsigned long delay)
{
        struct work_struct *work = &dwork->work;
        struct timer_list *timer = &dwork->timer;

        /* sanity checks on the state the caller hands us */
        WARN_ON_ONCE(!wq);
        WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
        WARN_ON_ONCE(timer_pending(timer));
        WARN_ON_ONCE(!list_empty(&work->entry));

        /*
         * A zero @delay bypasses the timer entirely.  Besides being cheaper,
         * this matters for correctness: a timer cannot fire before the next
         * tick, and delayed_work users rely on zero meaning "no delay".
         */
        if (!delay) {
                __queue_work(cpu, wq, work);
                return;
        }

        /* stash what the timer callback needs to queue the work later */
        dwork->wq = wq;
        dwork->cpu = cpu;
        timer->expires = jiffies + delay;

        if (housekeeping_enabled(HK_TYPE_TIMER)) {
                /* prefer the current CPU when it is a housekeeping one */
                cpu = smp_processor_id();
                if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER))
                        cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
                add_timer_on(timer, cpu);
        } else if (likely(cpu == WORK_CPU_UNBOUND)) {
                add_timer_global(timer);
        } else {
                add_timer_on(timer, cpu);
        }
}

/**
 * queue_delayed_work_on - queue work on specific CPU after delay
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Return: %false if @dwork was already on a queue, %true otherwise.  If
 * @delay is zero and @dwork is idle, it will be scheduled for immediate
 * execution.
 */
bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
                           struct delayed_work *dwork, unsigned long delay)
{
        struct work_struct *work = &dwork->work;
        unsigned long irq_flags;
        bool queued = false;

        /* IRQs stay off while PENDING is owned; see __queue_work() */
        local_irq_save(irq_flags);

        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
                /* we own PENDING now, unless @work turns out to be disabled */
                if (!clear_pending_if_disabled(work)) {
                        __queue_delayed_work(cpu, wq, dwork, delay);
                        queued = true;
                }
        }

        local_irq_restore(irq_flags);
        return queued;
}
EXPORT_SYMBOL(queue_delayed_work_on);

/**
 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
 * modify @dwork's timer so that it expires after @delay.  If @delay is
 * zero, @dwork is guaranteed to be scheduled immediately regardless of its
 * current state.
 *
 * Return: %false if @dwork was idle and queued, %true if @dwork was
 * pending and its timer was modified.
 *
 * This function is safe to call from any context including IRQ handler.
 * See try_to_grab_pending() for details.
 */
bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
                         struct delayed_work *dwork, unsigned long delay)
{
        struct work_struct *work = &dwork->work;
        unsigned long irq_flags;
        bool was_pending;

        /* on return PENDING is ours and @irq_flags holds the saved IRQ state */
        was_pending = work_grab_pending(work, WORK_CANCEL_DELAYED, &irq_flags);

        /* a disabled work gives PENDING back and is not (re)queued */
        if (!clear_pending_if_disabled(work))
                __queue_delayed_work(cpu, wq, dwork, delay);

        local_irq_restore(irq_flags);
        return was_pending;
}
EXPORT_SYMBOL_GPL(mod_delayed_work_on);

/*
 * RCU callback installed by queue_rcu_work(): queue the work item embedded
 * in the rcu_work on the workqueue recorded in it, with no CPU preference.
 */
static void rcu_work_rcufn(struct rcu_head *rcu)
{
        struct rcu_work *rw = container_of(rcu, struct rcu_work, rcu);

        /* __queue_work() must be entered with IRQs disabled */
        local_irq_disable();
        __queue_work(WORK_CPU_UNBOUND, rw->wq, &rw->work);
        local_irq_enable();
}

/**
 * queue_rcu_work - queue work after a RCU grace period
 * @wq: workqueue to use
 * @rwork: work to queue
 *
 * Return: %false if @rwork was already pending, %true otherwise.  Note
 * that a full RCU grace period is guaranteed only after a %true return.
 * While @rwork is guaranteed to be executed after a %false return, the
 * execution may happen before a full RCU grace period has passed.
 */
bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
{
        struct work_struct *work = &rwork->work;

        /* already pending: the earlier queueing will take care of it */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return false;

        /*
         * An rcu_work is not supposed to be canceled or disabled.  If the
         * inner work was disabled anyway (someone reached inside @rwork),
         * complain once and refuse to queue.
         */
        if (WARN_ON_ONCE(clear_pending_if_disabled(work)))
                return false;

        /* remember the target wq for the RCU callback, then wait for the GP */
        rwork->wq = wq;
        call_rcu_hurry(&rwork->rcu, rcu_work_rcufn);
        return true;
}
EXPORT_SYMBOL(queue_rcu_work);

static struct worker *alloc_worker(int node)
{
        struct worker *worker;

        worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
        if (worker) {
                INIT_LIST_HEAD(&worker->entry);
                INIT_LIST_HEAD(&worker->scheduled);
                INIT_LIST_HEAD(&worker->node);
                /* on creation a worker is in !idle && prep state */
                worker->flags = WORKER_PREP;
        }
        return worker;
}

/*
 * Return the cpumask workers of @pool are allowed to run on: the pod
 * cpumask for a pool without a CPU (pool->cpu < 0) whose attrs request
 * strict affinity, the plain attrs cpumask in every other case.
 */
static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
{
        if (pool->cpu >= 0 || !pool->attrs->affn_strict)
                return pool->attrs->cpumask;

        return pool->attrs->__pod_cpumask;
}

/**
 * worker_attach_to_pool() - attach a worker to a pool
 * @worker: worker to be attached
 * @pool: the target pool
 *
 * Attach @worker to @pool.  Once attached, the %WORKER_UNBOUND flag and
 * cpu-binding of @worker are kept coordinated with the pool across
 * cpu-[un]hotplugs.
 */
static void worker_attach_to_pool(struct worker *worker,
                                  struct worker_pool *pool)
{
        /* held across the whole attach, until @worker is on pool->workers */
        mutex_lock(&wq_pool_attach_mutex);

        /*
         * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains stable
         * across this function. See the comments above the flag definition for
         * details. BH workers are, while per-CPU, always DISASSOCIATED.
         */
        if (pool->flags & POOL_DISASSOCIATED) {
                worker->flags |= WORKER_UNBOUND;
        } else {
                /* an associated pool is not expected to be a BH pool */
                WARN_ON_ONCE(pool->flags & POOL_BH);
                kthread_set_per_cpu(worker->task, pool->cpu);
        }

        /*
         * Affinity is only adjusted here for workers that carry a rescue_wq;
         * create_worker() binds the workers it creates before attaching them.
         */
        if (worker->rescue_wq)
                set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));

        /* link into the pool's worker list and record the owning pool */
        list_add_tail(&worker->node, &pool->workers);
        worker->pool = pool;

        mutex_unlock(&wq_pool_attach_mutex);
}

/**
 * worker_detach_from_pool() - detach a worker from its pool
 * @worker: worker which is attached to its pool
 *
 * Undo the attaching which had been done in worker_attach_to_pool().  The
 * caller worker shouldn't access to the pool after detached except it has
 * other reference to the pool.
 */
static void worker_detach_from_pool(struct worker *worker)
{
        /* cache the pool: worker->pool is cleared below */
        struct worker_pool *pool = worker->pool;
        /* stays NULL unless this was the last worker attached to @pool */
        struct completion *detach_completion = NULL;

        /* there is one permanent BH worker per CPU which should never detach */
        WARN_ON_ONCE(pool->flags & POOL_BH);

        mutex_lock(&wq_pool_attach_mutex);

        /* drop the per-CPU kthread marking and unlink from the pool */
        kthread_set_per_cpu(worker->task, -1);
        list_del(&worker->node);
        worker->pool = NULL;

        /*
         * Both the live and the dying worker lists are empty now, so pick up
         * whatever completion the pool has registered (it may be NULL).
         */
        if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
                detach_completion = pool->detach_completion;
        mutex_unlock(&wq_pool_attach_mutex);

        /* clear leftover flags without pool->lock after it is detached */
        worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);

        /* signalled only after the mutex has been released */
        if (detach_completion)
                complete(detach_completion);
}

/*
 * Format the kworker name of @worker into @buf (at most @size bytes) and
 * return scnprintf()'s result.  The name is one of:
 *
 *   kworker/R-<wq name>      @worker has a rescue_wq
 *   kworker/<cpu>:<id>[H]    @pool has a CPU; "H" is appended for nice < 0
 *   kworker/u<pool id>:<id>  @pool has no CPU
 *   kworker/dying            @pool is NULL
 */
static int format_worker_id(char *buf, size_t size, struct worker *worker,
                            struct worker_pool *pool)
{
        const char *suffix;

        if (worker->rescue_wq)
                return scnprintf(buf, size, "kworker/R-%s",
                                 worker->rescue_wq->name);

        if (!pool)
                return scnprintf(buf, size, "kworker/dying");

        if (pool->cpu < 0)
                return scnprintf(buf, size, "kworker/u%d:%d",
                                 pool->id, worker->id);

        suffix = pool->attrs->nice < 0 ? "H" : "";
        return scnprintf(buf, size, "kworker/%d:%d%s",
                         pool->cpu, worker->id, suffix);
}

/**
 * create_worker - create a new workqueue worker
 * @pool: pool the new worker will belong to
 *
 * Create and start a new worker which is attached to @pool.  For pools
 * without %POOL_BH a kthread is created, niced and bound to the pool's
 * allowed CPUs; for %POOL_BH pools no kthread is created here.
 *
 * CONTEXT:
 * Might sleep.  Does GFP_KERNEL allocations.
 *
 * Return:
 * Pointer to the newly created worker, or %NULL if the worker ID, the
 * worker structure or the kthread could not be allocated.
 */
static struct worker *create_worker(struct worker_pool *pool)
{
        struct worker *worker;
        int id;

        /* ID is needed to determine kthread name */
        id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
        if (id < 0) {
                pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",
                            ERR_PTR(id));
                return NULL;
        }

        worker = alloc_worker(pool->node);
        if (!worker) {
                pr_err_once("workqueue: Failed to allocate a worker\n");
                goto fail;
        }

        worker->id = id;

        if (!(pool->flags & POOL_BH)) {
                char id_buf[WORKER_ID_LEN];

                format_worker_id(id_buf, sizeof(id_buf), worker, pool);
                worker->task = kthread_create_on_node(worker_thread, worker,
                                                      pool->node, "%s", id_buf);
                if (IS_ERR(worker->task)) {
                        /* -EINTR is reported every time, other errors once */
                        if (PTR_ERR(worker->task) == -EINTR) {
                                pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n",
                                       id_buf);
                        } else {
                                pr_err_once("workqueue: Failed to create a worker thread: %pe\n",
                                            worker->task);
                        }
                        goto fail;
                }

                set_user_nice(worker->task, pool->attrs->nice);
                kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
        }

        /* successful, attach the worker to the pool */
        worker_attach_to_pool(worker, pool);

        /* start the newly created worker */
        raw_spin_lock_irq(&pool->lock);

        worker->pool->nr_workers++;
        worker_enter_idle(worker);

        /*
         * @worker is waiting on a completion in kthread() and will trigger hung
         * check if not woken up soon. As kick_pool() is noop if @pool is empty,
         * wake it up explicitly.
         */
        if (worker->task)
                wake_up_process(worker->task);

        raw_spin_unlock_irq(&pool->lock);

        return worker;

fail:
        /* @worker may be NULL here; kfree() accepts that */
        ida_free(&pool->worker_ida, id);
        kfree(worker);
        return NULL;
}

/*
 * Drop @worker's per-CPU kthread marking and widen the CPUs its task may
 * run on.  The caller must hold wq_pool_attach_mutex (asserted).
 */
static void unbind_worker(struct worker *worker)
{
        lockdep_assert_held(&wq_pool_attach_mutex);

        kthread_set_per_cpu(worker->task, -1);
        /*
         * Prefer wq_unbound_cpumask, but only if it intersects
         * cpu_active_mask; otherwise allow every possible CPU.  A failure
         * to apply the new mask is only warned about, once per call site.
         */
        if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
                WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
        else
                WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
}

/*
 * Empty @cull_list: every worker on it is unlinked, unbound via
 * unbind_worker() and then woken up.
 */
static void wake_dying_workers(struct list_head *cull_list)
{
        struct worker *dying, *next;

        list_for_each_entry_safe(dying, next, cull_list, entry) {
                list_del_init(&dying->entry);
                unbind_worker(dying);

                /*
                 * A worker that is somehow already running must have been
                 * on pool->idle_list when set_worker_dying() tagged it,
                 * otherwise it could not be on this list.  It has therefore
                 * either seen WORKER_DIE already or has set its state to
                 * TASK_IDLE; in both cases it will observe this wakeup, so
                 * issuing it without pool->lock is safe.
                 */
                wake_up_process(dying->task);
        }
}

/**
 * set_worker_dying - Tag a worker for destruction
 * @worker: worker to be destroyed
 * @list: transfer worker away from its pool->idle_list and into list
 *
 * Tag @worker for destruction and adjust @pool stats accordingly.  The worker
 * should be idle.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void set_worker_dying(struct worker *worker, struct list_head *list)
{
        struct worker_pool *pool = worker->pool;

        lockdep_assert_held(&pool->lock);
        lockdep_assert_held(&wq_pool_attach_mutex);

        /*
         * Paranoia: warn and leave the worker alone if it is executing
         * something, has work scheduled, or is not flagged idle.
         */
        if (WARN_ON(worker->current_work))
                return;
        if (WARN_ON(!list_empty(&worker->scheduled)))
                return;
        if (WARN_ON(!(worker->flags & WORKER_IDLE)))
                return;

        /* the worker no longer counts towards the pool's totals */
        pool->nr_workers--;
        pool->nr_idle--;

        worker->flags |= WORKER_DIE;

        /* hand it to the caller's list and park it among the dying workers */
        list_move(&worker->entry, list);
        list_move(&worker->node, &pool->dying_workers);
}

/**
 * idle_worker_timeout - check if some idle workers can now be deleted.
 * @t: The pool's idle_timer that just expired
 *
 * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
 * worker_leave_idle(), as a worker flicking between idle and active while its
 * pool is at the too_many_workers() tipping point would cause too much timer
 * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
 * it expire and re-evaluate things from there.
 */
static void idle_worker_timeout(struct timer_list *t)
{
        struct worker_pool *pool = from_timer(pool, t, idle_timer);
        bool do_cull = false;

        if (work_pending(&pool->idle_cull_work))
                return;

        raw_spin_lock_irq(&pool->lock);

        if (too_many_workers(pool)) {
                struct worker *worker;
                unsigned long expires;

                /* idle_list is kept in LIFO order, check the last one */
                worker = list_last_entry(&pool->idle_list, struct worker, entry);
                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
                do_cull = !time_before(jiffies, expires);

                if (!do_cull)
                        mod_timer(&pool->idle_timer, expires);
        }
        raw_spin_unlock_irq(&pool->lock);

        if (do_cull)
                queue_work(system_unbound_wq, &pool->idle_cull_work);
}

/**
 * idle_cull_fn - cull workers that have been idle for too long.
 * @work: the pool's work for handling these idle workers
 *
 * This goes through a pool's idle workers and gets rid of those that have been
 * idle for at least IDLE_WORKER_TIMEOUT seconds.
 *
 * We don't want to disturb isolated CPUs because of a pcpu kworker being
 * culled, so this also resets worker affinity. This requires a sleepable
 * context, hence the split between timer callback and work item.
 */
static void idle_cull_fn(struct work_struct *work)
{
        struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
        /* workers tagged below are collected here and woken after unlock */
        LIST_HEAD(cull_list);

        /*
         * Grabbing wq_pool_attach_mutex here ensures an already-running worker
         * cannot proceed beyond worker_detach_from_pool() in its self-destruct
         * path. This is required as a previously-preempted worker could run after
         * set_worker_dying() has happened but before wake_dying_workers() did.
         */
        mutex_lock(&wq_pool_attach_mutex);
        raw_spin_lock_irq(&pool->lock);

        while (too_many_workers(pool)) {
                struct worker *worker;
                unsigned long expires;

                /* examine the last entry of the idle list on each pass */
                worker = list_last_entry(&pool->idle_list, struct worker, entry);
                expires = worker->last_active + IDLE_WORKER_TIMEOUT;

                /* not expired yet: rearm the timer for it and stop culling */
                if (time_before(jiffies, expires)) {
                        mod_timer(&pool->idle_timer, expires);
                        break;
                }

                set_worker_dying(worker, &cull_list);
        }

        /* pool->lock is dropped first; the wakeups run under the mutex only */
        raw_spin_unlock_irq(&pool->lock);
        wake_dying_workers(&cull_list);
        mutex_unlock(&wq_pool_attach_mutex);
}

/*
 * Ask the rescuer of @work's workqueue for help with the pwq that @work is
 * queued on.  Does nothing if the workqueue has no rescuer or if the pwq is
 * already on a mayday list.  The caller must hold wq_mayday_lock (asserted).
 */
static void send_mayday(struct work_struct *work)
{
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct workqueue_struct *wq = pwq->wq;

        lockdep_assert_held(&wq_mayday_lock);

        /* no rescuer to call, or the distress call is already out */
        if (!wq->rescuer || !list_empty(&pwq->mayday_node))
                return;

        /*
         * The base reference of an unbound wq's pwq can go away at any time
         * because of an attribute change, so hold our own reference until
         * the rescuer has finished with @pwq.
         */
        get_pwq(pwq);
        list_add_tail(&pwq->mayday_node, &wq->maydays);
        wake_up_process(wq->rescuer->task);
        pwq->stats[PWQ_STAT_MAYDAY]++;
}

/*
 * Callback of pool->mayday_timer.  While the pool still needs a new worker,
 * send a mayday for every work item on pool->worklist; in all cases rearm
 * the timer to fire again MAYDAY_INTERVAL from now.
 */
static void pool_mayday_timeout(struct timer_list *t)
{
        struct worker_pool *pool = from_timer(pool, t, mayday_timer);
        struct work_struct *work;

        /* lock order: pool->lock outside, wq_mayday_lock nested inside */
        raw_spin_lock_irq(&pool->lock);
        raw_spin_lock(&wq_mayday_lock);                /* for wq->maydays */

        if (need_to_create_worker(pool)) {
                /*
                 * We've been trying to create a new worker but
                 * haven't been successful.  We might be hitting an
                 * allocation deadlock.  Send distress signals to
                 * rescuers.
                 */
                list_for_each_entry(work, &pool->worklist, entry)
                        send_mayday(work);
        }

        raw_spin_unlock(&wq_mayday_lock);
        raw_spin_unlock_irq(&pool->lock);

        /* rearmed unconditionally; maybe_create_worker() deletes the timer */
        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}

/**
 * maybe_create_worker - create a new worker if necessary
 * @pool: pool to create a new worker for
 *
 * Create a new worker for @pool if necessary.  @pool is guaranteed to
 * have at least one idle worker on return from this function.  If
 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
 * sent to all rescuers with works scheduled on @pool to resolve
 * possible allocation deadlock.
 *
 * On return, need_to_create_worker() is guaranteed to be %false and
 * may_start_working() %true.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.  Called only from
 * manager.
 */
static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
restart:
        /* create_worker() may sleep, so pool->lock is dropped for the attempt */
        raw_spin_unlock_irq(&pool->lock);

        /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);

        while (true) {
                /* done once a worker was created or none is needed any more */
                if (create_worker(pool) || !need_to_create_worker(pool))
                        break;

                /* creation failed: back off for CREATE_COOLDOWN before retrying */
                schedule_timeout_interruptible(CREATE_COOLDOWN);

                if (!need_to_create_worker(pool))
                        break;
        }

        /* stop the mayday timer before retaking pool->lock */
        del_timer_sync(&pool->mayday_timer);
        raw_spin_lock_irq(&pool->lock);
        /*
         * This is necessary even after a new worker was just successfully
         * created as @pool->lock was dropped and the new worker might have
         * already become busy.
         */
        if (need_to_create_worker(pool))
                goto restart;
}

/**
 * manage_workers - manage worker pool
 * @worker: self
 *
 * Assume the manager role and manage the worker pool @worker belongs
 * to.  At any given time, there can be only zero or one manager per
 * pool.  The exclusion is handled automatically by this function.
 *
 * The caller can safely start processing works on false return.  On
 * true return, it's guaranteed that need_to_create_worker() is false
 * and may_start_working() is true.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.
 *
 * Return:
 * %false if the pool doesn't need management and the caller can safely
 * start processing works, %true if management function was performed and
 * the conditions that the caller verified before calling the function may
 * no longer be true.
 */
static bool manage_workers(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        /* someone else is already managing this pool: nothing for us to do */
        if (pool->flags & POOL_MANAGER_ACTIVE)
                return false;

        /* claim the manager role; the flag keeps other workers out */
        pool->flags |= POOL_MANAGER_ACTIVE;
        pool->manager = worker;

        /* may drop and retake pool->lock while creating workers */
        maybe_create_worker(pool);

        /* give up the manager role and wake whoever waits on manager_wait */
        pool->manager = NULL;
        pool->flags &= ~POOL_MANAGER_ACTIVE;
        rcuwait_wake_up(&manager_wait);
        return true;
}

/**
 * process_one_work - process single work
 * @worker: self
 * @work: work to process
 *
 * Process @work.  This function contains all the logic necessary to
 * process a single work item, including synchronization against and
 * interaction with other workers on the same cpu, queueing and
 * flushing.  As long as the context requirement is met, any worker can
 * call this function to process a work item.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which is released and regrabbed.
 */
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct worker_pool *pool = worker->pool;
        unsigned long work_data;
        int lockdep_start_depth, rcu_start_depth;
        bool bh_draining = pool->flags & POOL_BH_DRAINING;
#ifdef CONFIG_LOCKDEP
        /*
         * It is permissible to free the struct work_struct from
         * inside the function that is called from it; this needs to be
         * taken into account for lockdep too.  To avoid bogus "held
         * lock freed" warnings as well as problems when looking into
         * work->lockdep_map, make a copy and use that here.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
        /* ensure we're on the correct CPU */
        WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
                     raw_smp_processor_id() != pool->cpu);

        /*
         * Claim and dequeue: add @worker to the pool's busy hash keyed by
         * @work and record @work as the item it is currently executing.
         */
        debug_work_deactivate(work);
        hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
        worker->current_work = work;
        worker->current_func = work->func;
        worker->current_pwq = pwq;
        /* NOTE(review): presumably only task-backed (non-BH) workers have ->task set - confirm */
        if (worker->task)
                worker->current_at = worker->task->se.sum_exec_runtime;
        /*
         * Snapshot the data word now.  Its color is recorded below and the
         * whole word is handed to pwq_dec_nr_in_flight() at the very end,
         * at which point @work itself must no longer be touched.
         */
        work_data = *work_data_bits(work);
        worker->current_color = get_work_color(work_data);

        /*
         * Record wq name for cmdline and debug reporting, may get
         * overridden through set_worker_desc().
         */
        strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);

        list_del_init(&work->entry);

        /*
         * CPU intensive works don't participate in concurrency management.
         * They're the scheduler's responsibility.  This takes @worker out
         * of concurrency management and the next code block will chain
         * execution of the pending work items.
         */
        if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
                worker_set_flags(worker, WORKER_CPU_INTENSIVE);

        /*
         * Kick @pool if necessary. It's always noop for per-cpu worker pools
         * since nr_running would always be >= 1 at this point. This is used to
         * chain execution of the pending work items for WORKER_NOT_RUNNING
         * workers such as the UNBOUND and CPU_INTENSIVE ones.
         */
        kick_pool(pool);

        /*
         * Record the last pool and clear PENDING which should be the last
         * update to @work.  Also, do this inside @pool->lock so that
         * PENDING and queued state changes happen together while IRQ is
         * disabled.
         */
        set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool));

        pwq->stats[PWQ_STAT_STARTED]++;
        raw_spin_unlock_irq(&pool->lock);

        /*
         * Remember the RCU and lockdep nesting depths so that anything the
         * work function leaks can be detected after it returns.
         */
        rcu_start_depth = rcu_preempt_depth();
        lockdep_start_depth = lockdep_depth(current);
        /* see drain_dead_softirq_workfn() */
        if (!bh_draining)
                lock_map_acquire(&pwq->wq->lockdep_map);
        lock_map_acquire(&lockdep_map);
        /*
         * Strictly speaking we should mark the invariant state without holding
         * any locks, that is, before these two lock_map_acquire()'s.
         *
         * However, that would result in:
         *
         *   A(W1)
         *   WFC(C)
         *                A(W1)
         *                C(C)
         *
         * Which would create W1->C->W1 dependencies, even though there is no
         * actual deadlock possible. There are two solutions, using a
         * read-recursive acquire on the work(queue) 'locks', but this will then
         * hit the lockdep limitation on recursive locks, or simply discard
         * these locks.
         *
         * AFAICT there is no possible deadlock scenario between the
         * flush_work() and complete() primitives (except for single-threaded
         * workqueues), so hiding them isn't a problem.
         */
        lockdep_invariant_state(true);
        trace_workqueue_execute_start(work);
        worker->current_func(work);
        /*
         * While we must be careful to not use "work" after this, the trace
         * point will only record its address.
         */
        trace_workqueue_execute_end(work, worker->current_func);
        pwq->stats[PWQ_STAT_COMPLETED]++;
        lock_map_release(&lockdep_map);
        if (!bh_draining)
                lock_map_release(&pwq->wq->lockdep_map);

        /*
         * Complain loudly if the work function returned in atomic context
         * or with a different lock or RCU nesting depth than it started with.
         */
        if (unlikely((worker->task && in_atomic()) ||
                     lockdep_depth(current) != lockdep_start_depth ||
                     rcu_preempt_depth() != rcu_start_depth)) {
                pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n"
                       "     preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n",
                       current->comm, task_pid_nr(current), preempt_count(),
                       lockdep_start_depth, lockdep_depth(current),
                       rcu_start_depth, rcu_preempt_depth(),
                       worker->current_func);
                debug_show_held_locks(current);
                dump_stack();
        }

        /*
         * The following prevents a kworker from hogging CPU on !PREEMPTION
         * kernels, where a requeueing work item waiting for something to
         * happen could deadlock with stop_machine as such work item could
         * indefinitely requeue itself while all other CPUs are trapped in
         * stop_machine. At the same time, report a quiescent RCU state so
         * the same condition doesn't freeze RCU.
         */
        if (worker->task)
                cond_resched();

        raw_spin_lock_irq(&pool->lock);

        /*
         * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
         * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
         * wq_cpu_intensive_thresh_us. Clear it.
         */
        worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

        /* tag the worker for identification in schedule() */
        worker->last_func = worker->current_func;

        /* we're done with it, release */
        hash_del(&worker->hentry);
        worker->current_work = NULL;
        worker->current_func = NULL;
        worker->current_pwq = NULL;
        worker->current_color = INT_MAX;

        /* must be the last step, see the function comment */
        pwq_dec_nr_in_flight(pwq, work_data);
}

/**
 * process_scheduled_works - run everything on a worker's scheduled list
 * @worker: self
 *
 * Keep taking the first entry of @worker->scheduled and executing it until
 * the list is empty.  The list may change while a work item is running,
 * which is why the head is re-fetched on every iteration instead of
 * walking the list.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.
 */
static void process_scheduled_works(struct worker *worker)
{
        struct work_struct *head;
        bool stamped = false;

        for (;;) {
                head = list_first_entry_or_null(&worker->scheduled,
                                                struct work_struct, entry);
                if (!head)
                        break;

                /* refresh the pool's watchdog timestamp once per batch */
                if (!stamped) {
                        worker->pool->watchdog_ts = jiffies;
                        stamped = true;
                }

                process_one_work(worker, head);
        }
}

/*
 * Set (@val == true) or clear (@val == false) PF_WQ_WORKER on %current.
 * The flag update is done while holding wq_pool_attach_mutex.
 */
static void set_pf_worker(bool val)
{
        mutex_lock(&wq_pool_attach_mutex);
        if (!val)
                current->flags &= ~PF_WQ_WORKER;
        else
                current->flags |= PF_WQ_WORKER;
        mutex_unlock(&wq_pool_attach_mutex);
}

/**
 * worker_thread - the worker thread function
 * @__worker: self
 *
 * The worker thread function.  All workers belong to a worker_pool -
 * either a per-cpu one or a dynamic unbound one.  These workers process all
 * work items regardless of their specific target workqueue.  The only
 * exception is work items which belong to workqueues with a rescuer, which
 * is explained in rescuer_thread().
 *
 * The function loops forever between the woke_up, recheck and sleep labels
 * and only returns once %WORKER_DIE is observed on @__worker.
 *
 * Return: 0
 */
static int worker_thread(void *__worker)
{
        struct worker *worker = __worker;
        struct worker_pool *pool = worker->pool;

        /* tell the scheduler that this is a workqueue worker */
        set_pf_worker(true);
woke_up:
        raw_spin_lock_irq(&pool->lock);

        /*
         * Am I supposed to die?  If so, drop the lock, stop being a
         * workqueue worker, release the worker ID, detach from the pool
         * and free @worker before exiting the thread.
         */
        if (unlikely(worker->flags & WORKER_DIE)) {
                raw_spin_unlock_irq(&pool->lock);
                set_pf_worker(false);

                ida_free(&pool->worker_ida, worker->id);
                worker_detach_from_pool(worker);
                WARN_ON_ONCE(!list_empty(&worker->entry));
                kfree(worker);
                return 0;
        }

        worker_leave_idle(worker);
recheck:
        /* no more worker necessary? */
        if (!need_more_worker(pool))
                goto sleep;

        /*
         * Do we need to manage?  manage_workers() returning %true means
         * the conditions checked above may no longer hold, so start over.
         */
        if (unlikely(!may_start_working(pool)) && manage_workers(worker))
                goto recheck;

        /*
         * ->scheduled list can only be filled while a worker is
         * preparing to process a work or actually processing it.
         * Make sure nobody diddled with it while I was sleeping.
         */
        WARN_ON_ONCE(!list_empty(&worker->scheduled));

        /*
         * Finish PREP stage.  We're guaranteed to have at least one idle
         * worker or that someone else has already assumed the manager
         * role.  This is where @worker starts participating in concurrency
         * management if applicable and concurrency management is restored
         * after being rebound.  See rebind_workers() for details.
         */
        worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

        /*
         * Main processing loop: take the first item on the pool's worklist,
         * try to assign it to @worker and, on success, run everything that
         * ended up on @worker->scheduled.  Repeat while keep_working() says so.
         */
        do {
                struct work_struct *work =
                        list_first_entry(&pool->worklist,
                                         struct work_struct, entry);

                if (assign_work(work, worker, NULL))
                        process_scheduled_works(worker);
        } while (keep_working(pool));

        /* back to PREP state before going idle */
        worker_set_flags(worker, WORKER_PREP);
sleep:
        /*
         * pool->lock is held and there's no work to process and no need to
         * manage, sleep.  Workers are woken up only while holding
         * pool->lock or from local cpu, so setting the current state
         * before releasing pool->lock is enough to prevent losing any
         * event.
         */
        worker_enter_idle(worker);
        __set_current_state(TASK_IDLE);
        raw_spin_unlock_irq(&pool->lock);
        schedule();
        goto woke_up;
}

/**
 * rescuer_thread - the rescuer thread function
 * @__rescuer: self
 *
 * Workqueue rescuer thread function.  There's one rescuer for each
 * workqueue which has WQ_MEM_RECLAIM set.
 *
 * Regular work processing on a pool may block trying to create a new
 * worker, which uses a GFP_KERNEL allocation.  That has a slight chance of
 * developing into a deadlock if some works currently on the same queue
 * need to be processed to satisfy the GFP_KERNEL allocation.  This is
 * the problem the rescuer solves.
 *
 * When such a condition is possible, the pool summons the rescuers of all
 * workqueues which have works queued on the pool and lets them process
 * those works so that forward progress can be guaranteed.
 *
 * This should happen rarely.
 *
 * Return: 0
 */
static int rescuer_thread(void *__rescuer)
{
        struct worker *rescuer = __rescuer;
        struct workqueue_struct *wq = rescuer->rescue_wq;
        bool should_stop;

        set_user_nice(current, RESCUER_NICE_LEVEL);

        /*
         * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
         * doesn't participate in concurrency management.
         */
        set_pf_worker(true);
repeat:
        set_current_state(TASK_IDLE);

        /*
         * By the time the rescuer is requested to stop, the workqueue
         * shouldn't have any work pending, but @wq->maydays may still have
         * pwq(s) queued.  This can happen by non-rescuer workers consuming
         * all the work items before the rescuer got to them.  Go through
         * @wq->maydays processing before acting on should_stop so that the
         * list is always empty on exit.
         */
        should_stop = kthread_should_stop();

        /* see whether any pwq is asking for help */
        raw_spin_lock_irq(&wq_mayday_lock);

        while (!list_empty(&wq->maydays)) {
                struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
                                        struct pool_workqueue, mayday_node);
                struct worker_pool *pool = pwq->pool;
                struct work_struct *work, *n;

                /* there is work to do: become runnable and dequeue the request */
                __set_current_state(TASK_RUNNING);
                list_del_init(&pwq->mayday_node);

                raw_spin_unlock_irq(&wq_mayday_lock);

                worker_attach_to_pool(rescuer, pool);

                raw_spin_lock_irq(&pool->lock);

                /*
                 * Slurp in all works issued via this workqueue and
                 * process them.
                 */
                WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
                list_for_each_entry_safe(work, n, &pool->worklist, entry) {
                        if (get_work_pwq(work) == pwq &&
                            assign_work(work, rescuer, &n))
                                pwq->stats[PWQ_STAT_RESCUED]++;
                }

                if (!list_empty(&rescuer->scheduled)) {
                        process_scheduled_works(rescuer);

                        /*
                         * The above execution of rescued work items could
                         * have created more to rescue through
                         * pwq_activate_first_inactive() or chained
                         * queueing.  Let's put @pwq back on mayday list so
                         * that such back-to-back work items, which may be
                         * being used to relieve memory pressure, don't
                         * incur MAYDAY_INTERVAL delay in between.
                         */
                        if (pwq->nr_active && need_to_create_worker(pool)) {
                                raw_spin_lock(&wq_mayday_lock);
                                /*
                                 * Queue iff we aren't racing destruction
                                 * and somebody else hasn't queued it already.
                                 */
                                if (wq->rescuer && list_empty(&pwq->mayday_node)) {
                                        get_pwq(pwq);
                                        list_add_tail(&pwq->mayday_node, &wq->maydays);
                                }
                                raw_spin_unlock(&wq_mayday_lock);
                        }
                }

                /*
                 * Put the reference grabbed by send_mayday().  @pool won't
                 * go away while we're still attached to it.
                 */
                put_pwq(pwq);

                /*
                 * Leave this pool. Notify regular workers; otherwise, we end up
                 * with 0 concurrency and stalling the execution.
                 */
                kick_pool(pool);

                raw_spin_unlock_irq(&pool->lock);

                worker_detach_from_pool(rescuer);

                /* retake the mayday lock for the next loop condition check */
                raw_spin_lock_irq(&wq_mayday_lock);
        }

        raw_spin_unlock_irq(&wq_mayday_lock);

        /* stop was requested before the mayday list was drained above */
        if (should_stop) {
                __set_current_state(TASK_RUNNING);
                set_pf_worker(false);
                return 0;
        }

        /* rescuers should never participate in concurrency management */
        WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
        schedule();
        goto repeat;
}

/*
 * Run pending work items of a BH pool on behalf of @worker.
 *
 * The flow mirrors worker_thread(); see there for the reasoning behind each
 * step.  Unlike worker_thread(), execution is bounded: the loop stops after
 * BH_WORKER_RESTARTS rounds or once BH_WORKER_JIFFIES have elapsed, even if
 * work is still pending.
 */
static void bh_worker(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;
        int restarts_left = BH_WORKER_RESTARTS;
        unsigned long deadline = jiffies + BH_WORKER_JIFFIES;

        raw_spin_lock_irq(&pool->lock);
        worker_leave_idle(worker);

        if (need_more_worker(pool)) {
                WARN_ON_ONCE(!list_empty(&worker->scheduled));
                worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

                do {
                        struct work_struct *head =
                                list_first_entry(&pool->worklist,
                                                 struct work_struct, entry);

                        if (assign_work(head, worker, NULL))
                                process_scheduled_works(worker);
                } while (keep_working(pool) &&
                         --restarts_left && time_before(jiffies, deadline));

                worker_set_flags(worker, WORKER_PREP);
        }

        /* go back to idle and kick the pool before dropping the lock */
        worker_enter_idle(worker);
        kick_pool(pool);
        raw_spin_unlock_irq(&pool->lock);
}

/*
 * TODO: Convert all tasklet users to workqueue and use softirq directly.
 *
 * This is currently called from tasklet[_hi]action() and thus is also called
 * whenever there are tasklets to run. Let's do an early exit if there's nothing
 * queued. Once conversion from tasklet is complete, the need_more_worker() test
 * can be dropped.
 *
 * After full conversion, we'll add worker->softirq_action, directly use the
 * softirq action and obtain the worker pointer from the softirq_action pointer.
 */
void workqueue_softirq_action(bool highpri)
{
        int cpu = smp_processor_id();
        struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[highpri];
        struct worker *bh;

        /* early exit when this CPU's BH pool has nothing to run */
        if (!need_more_worker(pool))
                return;

        /* run the pool's first worker */
        bh = list_first_entry(&pool->workers, struct worker, node);
        bh_worker(bh);
}

/*
 * On-stack context used by workqueue_softirq_dead() to drain the BH pool of
 * a dead CPU through drain_dead_softirq_workfn().
 */
struct wq_drain_dead_softirq_work {
        struct work_struct        work;        /* work item running drain_dead_softirq_workfn() */
        struct worker_pool        *pool;        /* BH pool being drained */
        struct completion        done;        /* completed once @pool needs no more draining */
};

/*
 * Work function used to drain a dead CPU's BH pool from another CPU.
 * @work is embedded in a struct wq_drain_dead_softirq_work which names the
 * pool to drain and carries the completion to signal when done.
 */
static void drain_dead_softirq_workfn(struct work_struct *work)
{
        struct wq_drain_dead_softirq_work *ctx =
                container_of(work, struct wq_drain_dead_softirq_work, work);
        struct worker_pool *pool = ctx->pool;
        struct workqueue_struct *requeue_wq;
        bool more;

        /*
         * The CPU @pool belongs to is dead and its still pending work items
         * are going to be executed from this BH work item, which runs on a
         * different CPU.  A dead CPU's @pool can't be kicked and, because
         * the work execution path ends up nested, a lockdep annotation has
         * to be suppressed.  %POOL_BH_DRAINING marks @pool for that special
         * treatment.
         */
        raw_spin_lock_irq(&pool->lock);
        pool->flags |= POOL_BH_DRAINING;
        raw_spin_unlock_irq(&pool->lock);

        bh_worker(list_first_entry(&pool->workers, struct worker, node));

        raw_spin_lock_irq(&pool->lock);
        pool->flags &= ~POOL_BH_DRAINING;
        more = need_more_worker(pool);
        raw_spin_unlock_irq(&pool->lock);

        /* fully drained, let the waiter go */
        if (!more) {
                complete(&ctx->done);
                return;
        }

        /*
         * bh_worker() may have bailed on its consecutive execution limit
         * with work items still pending.  Requeue ourselves and return
         * rather than hogging this CPU's BH.
         */
        requeue_wq = pool->attrs->nice == HIGHPRI_NICE_LEVEL ?
                        system_bh_highpri_wq : system_bh_wq;
        queue_work(requeue_wq, work);
}

/*
 * @cpu is dead. Drain the remaining BH work items on the current CPU. It's
 * possible to allocate dead_work per CPU and avoid flushing. However, then we
 * have to worry about draining overlapping with CPU coming back online or
 * nesting (one CPU's dead_work queued on another CPU which is also dead and so
 * on). Let's keep it simple and drain them synchronously. These are BH work
 * items which shouldn't be requeued on the same pool. Shouldn't take long.
 */
void workqueue_softirq_dead(unsigned int cpu)
{
        int idx;

        for (idx = 0; idx < NR_STD_WORKER_POOLS; idx++) {
                struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[idx];
                struct wq_drain_dead_softirq_work dead_work;
                struct workqueue_struct *bh_wq;

                /* nothing was left behind on this pool */
                if (!need_more_worker(pool))
                        continue;

                /* pick the BH workqueue matching the pool's priority */
                bh_wq = pool->attrs->nice == HIGHPRI_NICE_LEVEL ?
                                system_bh_highpri_wq : system_bh_wq;

                INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn);
                dead_work.pool = pool;
                init_completion(&dead_work.done);

                /* queue the drain work and wait for it synchronously */
                queue_work(bh_wq, &dead_work.work);
                wait_for_completion(&dead_work.done);
                destroy_work_on_stack(&dead_work.work);
        }
}

/**
 * check_flush_dependency - check for flush dependency sanity
 * @target_wq: workqueue being flushed
 * @target_work: work item being flushed (NULL for workqueue flushes)
 *
 * %current is about to flush either all of @target_wq or @target_work on
 * it.  Nothing is checked when @target_wq has %WQ_MEM_RECLAIM.  Otherwise,
 * warn once if %current has %PF_MEMALLOC set, and warn once if %current is
 * a worker executing a work item of a workqueue which has %WQ_MEM_RECLAIM
 * but not %__WQ_LEGACY, as waiting on a !%WQ_MEM_RECLAIM workqueue from
 * there can break the forward-progress guarantee and lead to a deadlock.
 */
static void check_flush_dependency(struct workqueue_struct *target_wq,
                                   struct work_struct *target_work)
{
        work_func_t target_func = NULL;
        struct worker *worker;

        /* only used in the warning messages below */
        if (target_work)
                target_func = target_work->func;

        if (target_wq->flags & WQ_MEM_RECLAIM)
                return;

        worker = current_wq_worker();

        WARN_ONCE(current->flags & PF_MEMALLOC,
                  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
                  current->pid, current->comm, target_wq->name, target_func);
        WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
                              (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
                  "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
                  worker->current_pwq->wq->name, worker->current_func,
                  target_wq->name, target_func);
}

/*
 * Barrier work item.  It is inserted by insert_wq_barrier() and its work
 * function, wq_barrier_func(), completes @done when it runs.
 */
struct wq_barrier {
        struct work_struct        work;        /* runs wq_barrier_func() */
        struct completion        done;        /* completed when @work executes */
        struct task_struct        *task;        /* purely informational */
};

/* Work function of a barrier: signal the completion of the enclosing wq_barrier. */
static void wq_barrier_func(struct work_struct *work)
{
        complete(&container_of(work, struct wq_barrier, work)->done);
}

/**
 * insert_wq_barrier - insert a barrier work
 * @pwq: pwq to insert barrier into
 * @barr: wq_barrier to insert
 * @target: target work to attach @barr to
 * @worker: worker currently executing @target, NULL if @target is not executing
 *
 * @barr is linked to @target such that @barr is completed only after
 * @target finishes execution.  Please note that the ordering
 * guarantee is observed only with respect to @target and on the local
 * cpu.
 *
 * Currently, a queued barrier can't be canceled.  This is because
 * try_to_grab_pending() can't determine whether the work to be
 * grabbed is at the head of the queue and thus can't clear the LINKED
 * flag of the previous work, while there must be a valid next work
 * after a work with the LINKED flag set.
 *
 * Note that when @worker is non-NULL, @target may be modified
 * underneath us, so we can't reliably determine pwq from @target.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_wq_barrier(struct pool_workqueue *pwq,
                              struct wq_barrier *barr,
                              struct work_struct *target, struct worker *worker)
{
        static __maybe_unused struct lock_class_key bh_key, thr_key;
        unsigned int work_flags = 0;
        unsigned int work_color;
        struct list_head *head;

        /*
         * debugobject calls are safe here even with pool->lock locked
         * as we know for sure that this will not trigger any of the
         * checks and call back into the fixup functions where we
         * might deadlock.
         *
         * BH and threaded workqueues need separate lockdep keys to avoid
         * spuriously triggering "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W}
         * usage".
         */
        INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func,
                              (pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key);
        /* set PENDING by hand; the barrier is inserted with insert_work() below */
        __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));

        init_completion_map(&barr->done, &target->lockdep_map);

        barr->task = current;

        /* The barrier work item does not participate in nr_active. */
        work_flags |= WORK_STRUCT_INACTIVE;

        /*
         * If @target is currently being executed, schedule the
         * barrier to the worker; otherwise, put it after @target.
         */
        if (worker) {
                head = worker->scheduled.next;
                work_color = worker->current_color;
        } else {
                unsigned long *bits = work_data_bits(target);

                head = target->entry.next;
                /* there can already be other linked works, inherit and set */
                work_flags |= *bits & WORK_STRUCT_LINKED;
                work_color = get_work_color(*bits);
                __set_bit(WORK_STRUCT_LINKED_BIT, bits);
        }

        /* account the barrier as in flight for the color it takes */
        pwq->nr_in_flight[work_color]++;
        work_flags |= work_color_to_flags(work_color);

        insert_work(pwq, &barr->work, head, work_flags);
}

/**
 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
 * @wq: workqueue being flushed
 * @flush_color: new flush color, < 0 for no-op
 * @work_color: new work color, < 0 for no-op
 *
 * Walk all pwqs of @wq and set them up for a flush.
 *
 * With a non-negative @flush_color, every pwq is expected to have
 * flush_color == -1 on entry.  Each pwq with in-flight work items of that
 * color gets its flush_color set to @flush_color and is counted in
 * @wq->nr_pwqs_to_flush.  The counter starts from a base count of 1 which
 * is dropped after the walk; if that brings it to zero,
 * @wq->first_flusher->done is completed right away.  The caller must
 * therefore have set @wq->first_flusher before calling with a non-negative
 * @flush_color.
 *
 * With a non-negative @work_color, every pwq is expected to currently be at
 * the color preceding @work_color and is advanced to @work_color.
 *
 * CONTEXT:
 * mutex_lock(wq->mutex).
 *
 * Return:
 * %true if @flush_color >= 0 and at least one pwq has something to flush,
 * %false otherwise.
 */
static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
                                      int flush_color, int work_color)
{
        const bool arm_flush = flush_color >= 0;
        const bool advance_color = work_color >= 0;
        struct pool_workqueue *pwq;
        bool wait = false;

        if (arm_flush) {
                WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
                atomic_set(&wq->nr_pwqs_to_flush, 1);
        }

        for_each_pwq(pwq, wq) {
                raw_spin_lock_irq(&pwq->pool->lock);

                if (arm_flush) {
                        WARN_ON_ONCE(pwq->flush_color != -1);

                        /* only pwqs with in-flight items of this color are armed */
                        if (pwq->nr_in_flight[flush_color]) {
                                pwq->flush_color = flush_color;
                                atomic_inc(&wq->nr_pwqs_to_flush);
                                wait = true;
                        }
                }

                if (advance_color) {
                        WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
                        pwq->work_color = work_color;
                }

                raw_spin_unlock_irq(&pwq->pool->lock);
        }

        /* drop the base count taken above */
        if (arm_flush && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
                complete(&wq->first_flusher->done);

        return wait;
}

/*
 * Acquire and immediately release @wq's lockdep map, with BH disabled
 * around it for %WQ_BH workqueues.  Compiles to nothing without
 * CONFIG_LOCKDEP.
 */
static void touch_wq_lockdep_map(struct workqueue_struct *wq)
{
#ifdef CONFIG_LOCKDEP
        const bool is_bh = wq->flags & WQ_BH;

        if (is_bh)
                local_bh_disable();

        lock_map_acquire(&wq->lockdep_map);
        lock_map_release(&wq->lockdep_map);

        if (is_bh)
                local_bh_enable();
#endif
}

/*
 * Acquire and immediately release @work's lockdep map, with BH disabled
 * around it when @wq is a %WQ_BH workqueue.  Compiles to nothing without
 * CONFIG_LOCKDEP.
 */
static void touch_work_lockdep_map(struct work_struct *work,
                                   struct workqueue_struct *wq)
{
#ifdef CONFIG_LOCKDEP
        const bool is_bh = wq->flags & WQ_BH;

        if (is_bh)
                local_bh_disable();

        lock_map_acquire(&work->lockdep_map);
        lock_map_release(&work->lockdep_map);

        if (is_bh)
                local_bh_enable();
#endif
}

/**
 * __flush_workqueue - ensure that any scheduled work has run to completion.
 * @wq: workqueue to flush
 *
 * This function sleeps until all work items which were queued on entry
 * have finished execution, but it is not livelocked by new incoming ones.
 *
 * Returns immediately, with a warning, if workqueues are not online yet.
 */
void __flush_workqueue(struct workqueue_struct *wq)
{
        /* on-stack flusher record; flush_color -1 means no color assigned yet */
        struct wq_flusher this_flusher = {
                .list = LIST_HEAD_INIT(this_flusher.list),
                .flush_color = -1,
                .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
        };
        int next_color;

        if (WARN_ON(!wq_online))
                return;

        touch_wq_lockdep_map(wq);

        mutex_lock(&wq->mutex);

        /*
         * Start-to-wait phase
         */
        next_color = work_next_color(wq->work_color);

        if (next_color != wq->flush_color) {
                /*
                 * Color space is not full.  The current work_color
                 * becomes our flush_color and work_color is advanced
                 * by one.
                 */
                WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
                this_flusher.flush_color = wq->work_color;
                wq->work_color = next_color;

                if (!wq->first_flusher) {
                        /* no flush in progress, become the first flusher */
                        WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);

                        wq->first_flusher = &this_flusher;

                        if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
                                                       wq->work_color)) {
                                /* nothing to flush, done */
                                wq->flush_color = next_color;
                                wq->first_flusher = NULL;
                                goto out_unlock;
                        }
                } else {
                        /* wait in queue */
                        WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
                        list_add_tail(&this_flusher.list, &wq->flusher_queue);
                        /* only advance the pwqs' work color, no flush color update */
                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                }
        } else {
                /*
                 * Oops, color space is full, wait on overflow queue.
                 * The next flush completion will assign us
                 * flush_color and transfer to flusher_queue.
                 */
                list_add_tail(&this_flusher.list, &wq->flusher_overflow);
        }

        check_flush_dependency(wq, NULL);

        mutex_unlock(&wq->mutex);

        wait_for_completion(&this_flusher.done);

        /*
         * Wake-up-and-cascade phase
         *
         * First flushers are responsible for cascading flushes and
         * handling overflow.  Non-first flushers can simply return.
         */
        if (READ_ONCE(wq->first_flusher) != &this_flusher)
                return;

        mutex_lock(&wq->mutex);

        /* we might have raced, check again with mutex held */
        if (wq->first_flusher != &this_flusher)
                goto out_unlock;

        WRITE_ONCE(wq->first_flusher, NULL);

        WARN_ON_ONCE(!list_empty(&this_flusher.list));
        WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);

        while (true) {
                struct wq_flusher *next, *tmp;

                /* complete all the flushers sharing the current flush color */
                list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
                        if (next->flush_color != wq->flush_color)
                                break;
                        list_del_init(&next->list);
                        complete(&next->done);
                }

                WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
                             wq->flush_color != work_next_color(wq->work_color));

                /* this flush_color is finished, advance by one */
                wq->flush_color = work_next_color(wq->flush_color);

                /* one color has been freed, handle overflow queue */
                if (!list_empty(&wq->flusher_overflow)) {
                        /*
                         * Assign the same color to all overflowed
                         * flushers, advance work_color and append to
                         * flusher_queue.  This is the start-to-wait
                         * phase for these overflowed flushers.
                         */
                        list_for_each_entry(tmp, &wq->flusher_overflow, list)
                                tmp->flush_color = wq->work_color;

                        wq->work_color = work_next_color(wq->work_color);

                        list_splice_tail_init(&wq->flusher_overflow,
                                              &wq->flusher_queue);
                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                }

                /* no flusher left waiting: flushing is fully caught up */
                if (list_empty(&wq->flusher_queue)) {
                        WARN_ON_ONCE(wq->flush_color != wq->work_color);
                        break;
                }

                /*
                 * Need to flush more colors.  Make the next flusher
                 * the new first flusher and arm pwqs.
                 */
                WARN_ON_ONCE(wq->flush_color == wq->work_color);
                WARN_ON_ONCE(wq->flush_color != next->flush_color);

                list_del_init(&next->list);
                wq->first_flusher = next;

                /* pwqs armed: the new first flusher takes over from here */
                if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
                        break;

                /*
                 * Meh... this color is already done, clear first
                 * flusher and repeat cascading.
                 */
                wq->first_flusher = NULL;
        }

out_unlock:
        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL(__flush_workqueue);

/**
 * drain_workqueue - drain a workqueue
 * @wq: workqueue to drain
 *
 * Wait until the workqueue becomes empty.  While draining is in progress,
 * only chain queueing is allowed.  IOW, only currently pending or running
 * work items on @wq can queue further work items on it.  @wq is flushed
 * repeatedly until it becomes empty.  The number of flushing is determined
 * by the depth of chaining and should be relatively short.  Whine if it
 * takes too long.
 */
void drain_workqueue(struct workqueue_struct *wq)
{
        unsigned int flush_cnt = 0;
        struct pool_workqueue *pwq;

        /*
         * __queue_work() needs to test whether there are drainers, is much
         * hotter than drain_workqueue() and already looks at @wq->flags.
         * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
         */
        mutex_lock(&wq->mutex);
        /* only the first of possibly several concurrent drainers sets the flag */
        if (!wq->nr_drainers++)
                wq->flags |= __WQ_DRAINING;
        mutex_unlock(&wq->mutex);
reflush:
        __flush_workqueue(wq);

        mutex_lock(&wq->mutex);

        /* after the flush, look for any pwq which still isn't empty */
        for_each_pwq(pwq, wq) {
                bool drained;

                /* the emptiness test is performed under the pool lock */
                raw_spin_lock_irq(&pwq->pool->lock);
                drained = pwq_is_empty(pwq);
                raw_spin_unlock_irq(&pwq->pool->lock);

                if (drained)
                        continue;

                /*
                 * Not empty yet.  Warn on the 10th retry and then on every
                 * 100th retry up to and including the 1000th.
                 */
                if (++flush_cnt == 10 ||
                    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
                        pr_warn("workqueue %s: %s() isn't complete after %u tries\n",
                                wq->name, __func__, flush_cnt);

                /* drop the mutex and flush again from the top */
                mutex_unlock(&wq->mutex);
                goto reflush;
        }

        /* all pwqs are empty; the last drainer to leave clears the flag */
        if (!--wq->nr_drainers)
                wq->flags &= ~__WQ_DRAINING;
        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(drain_workqueue);

/*
 * Set up @barr for flushing @work via insert_wq_barrier().
 *
 * Looks up the pool associated with @work and, under that pool's lock, picks
 * the pwq to attach the barrier to: the pwq recorded on @work or, if there is
 * none, the current pwq of the worker found executing @work.
 *
 * Returns %true if the barrier was inserted. Returns %false if @work has no
 * pool, its pwq belongs to a different pool, or it is neither on a pwq nor
 * being executed by a worker of the pool.
 */
static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
                             bool from_cancel)
{
        struct worker *worker = NULL;
        struct worker_pool *pool;
        struct pool_workqueue *pwq;
        struct workqueue_struct *wq;

        rcu_read_lock();
        pool = get_work_pool(work);
        if (!pool) {
                rcu_read_unlock();
                return false;
        }

        raw_spin_lock_irq(&pool->lock);
        /* see the comment in try_to_grab_pending() with the same code */
        pwq = get_work_pwq(work);
        if (pwq) {
                if (unlikely(pwq->pool != pool))
                        goto already_gone;
        } else {
                /* no pwq on @work; it may still be running on this pool */
                worker = find_worker_executing_work(pool, work);
                if (!worker)
                        goto already_gone;
                pwq = worker->current_pwq;
        }

        wq = pwq->wq;
        check_flush_dependency(wq, work);

        /* @worker is NULL unless @work was found executing above */
        insert_wq_barrier(pwq, barr, work, worker);
        raw_spin_unlock_irq(&pool->lock);

        touch_work_lockdep_map(work, wq);

        /*
         * Force a lock recursion deadlock when using flush_work() inside a
         * single-threaded or rescuer equipped workqueue.
         *
         * For single threaded workqueues the deadlock happens when the work
         * is after the work issuing the flush_work(). For rescuer equipped
         * workqueues the deadlock happens when the rescuer stalls, blocking
         * forward progress.
         */
        if (!from_cancel && (wq->saved_max_active == 1 || wq->rescuer))
                touch_wq_lockdep_map(wq);

        rcu_read_unlock();
        return true;
already_gone:
        /* nothing to wait for; undo the pool lock and the RCU read section */
        raw_spin_unlock_irq(&pool->lock);
        rcu_read_unlock();
        return false;
}

/*
 * Wait for @work to finish using an on-stack barrier set up by
 * start_flush_work().
 *
 * Returns %false without waiting if workqueues aren't online yet (with a
 * warning), if @work has no function set (with a warning), or if
 * start_flush_work() returned %false. Otherwise waits for the barrier's
 * completion and returns %true.
 *
 * @from_cancel is passed through to start_flush_work() and additionally
 * selects busy-waiting instead of sleeping when @work's data has
 * %WORK_OFFQ_BH set.
 */
static bool __flush_work(struct work_struct *work, bool from_cancel)
{
        struct wq_barrier barr;
        unsigned long data;

        if (WARN_ON(!wq_online))
                return false;

        if (WARN_ON(!work->func))
                return false;

        if (!start_flush_work(work, &barr, from_cancel))
                return false;

        /*
         * start_flush_work() returned %true. If @from_cancel is set, we know
         * that @work must have been executing during start_flush_work() and
         * can't currently be queued. Its data must contain OFFQ bits. If @work
         * was queued on a BH workqueue, we also know that it was running in the
         * BH context and thus can be busy-waited.
         */
        data = *work_data_bits(work);
        if (from_cancel &&
            !WARN_ON_ONCE(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_BH)) {
                /*
                 * On RT, prevent a live lock when %current preempted soft
                 * interrupt processing or prevents ksoftirqd from running by
                 * keeping flipping BH. If the BH work item runs on a different
                 * CPU then this has no effect other than doing the BH
                 * disable/enable dance for nothing. This is copied from
                 * kernel/softirq.c::tasklet_unlock_spin_wait().
                 */
                while (!try_wait_for_completion(&barr.done)) {
                        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                                local_bh_disable();
                                local_bh_enable();
                        } else {
                                cpu_relax();
                        }
                }
        } else {
                /* regular case: sleep until the barrier completes */
                wait_for_completion(&barr.done);
        }

        /* the barrier work lives on this stack frame; tear it down */
        destroy_work_on_stack(&barr.work);
        return true;
}

/**
 * flush_work - wait for a work to finish executing the last queueing instance
 * @work: the work to flush
 *
 * Wait until @work has finished execution.  @work is guaranteed to be idle
 * on return if it hasn't been requeued since flush started.
 *
 * Return:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_work(struct work_struct *work)
{
        bool waited;

        /* annotate this as a function which may sleep */
        might_sleep();

        /* plain flush, i.e. not on behalf of a cancel operation */
        waited = __flush_work(work, false);

        return waited;
}
EXPORT_SYMBOL_GPL(flush_work);

/**
 * flush_delayed_work - wait for a dwork to finish executing the last queueing
 * @dwork: the delayed work to flush
 *
 * Delayed timer is cancelled and the pending work is queued for
 * immediate execution.  Like flush_work(), this function only
 * considers the last queueing instance of @dwork.
 *
 * Return:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_delayed_work(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        /*
         * If the timer could be cancelled, queue the work item right away
         * so that the flush below waits for it.
         */
        local_irq_disable();
        if (del_timer_sync(&dwork->timer))
                __queue_work(dwork->cpu, dwork->wq, work);
        local_irq_enable();

        return flush_work(work);
}
EXPORT_SYMBOL(flush_delayed_work);

/**
 * flush_rcu_work - wait for a rwork to finish executing the last queueing
 * @rwork: the rcu work to flush
 *
 * Return:
 * %true if flush_rcu_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_rcu_work(struct rcu_work *rwork)
{
        struct work_struct *work = &rwork->work;

        /* not pending: this is an ordinary flush */
        if (!test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return flush_work(work);

        /*
         * PENDING is set: wait for outstanding RCU callbacks first
         * (presumably so that the work has actually been queued), then
         * flush it.  This path always reports %true.
         */
        rcu_barrier();
        flush_work(work);
        return true;
}
EXPORT_SYMBOL(flush_rcu_work);

/* Increment @offqd's disable count, warning once instead of overflowing. */
static void work_offqd_disable(struct work_offq_data *offqd)
{
        /* largest value representable in WORK_OFFQ_DISABLE_BITS bits */
        const unsigned long limit = (1lu << WORK_OFFQ_DISABLE_BITS) - 1;

        if (unlikely(offqd->disable >= limit)) {
                WARN_ONCE(true, "workqueue: work disable count overflowed\n");
                return;
        }

        offqd->disable++;
}

/* Decrement @offqd's disable count, warning once instead of underflowing. */
static void work_offqd_enable(struct work_offq_data *offqd)
{
        if (unlikely(offqd->disable <= 0)) {
                WARN_ONCE(true, "workqueue: work disable count underflowed\n");
                return;
        }

        offqd->disable--;
}

/*
 * Common implementation of the non-waiting cancel/disable operations.
 * Grabs PENDING on @work, optionally bumps its disable count when
 * %WORK_CANCEL_DISABLE is set in @cflags, and writes the off-queue data back
 * while clearing PENDING. Returns the result of work_grab_pending().
 */
static bool __cancel_work(struct work_struct *work, u32 cflags)
{
        struct work_offq_data offq_state;
        unsigned long flags;
        bool grabbed;

        grabbed = work_grab_pending(work, cflags, &flags);

        /* decode the off-queue data and update the disable count if asked */
        work_offqd_unpack(&offq_state, *work_data_bits(work));
        if (cflags & WORK_CANCEL_DISABLE)
                work_offqd_disable(&offq_state);

        /* store the (possibly updated) state back and release PENDING */
        set_work_pool_and_clear_pending(work, offq_state.pool_id,
                                        work_offqd_pack_flags(&offq_state));
        local_irq_restore(flags);

        return grabbed;
}

/*
 * Common implementation of the waiting cancel/disable operations. @work is
 * always disabled while waiting for it; the disable is undone afterwards
 * unless the caller asked for %WORK_CANCEL_DISABLE itself. Returns what
 * __cancel_work() returned.
 */
static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
{
        const bool stay_disabled = cflags & WORK_CANCEL_DISABLE;
        bool was_pending;

        was_pending = __cancel_work(work, cflags | WORK_CANCEL_DISABLE);

        /* BH work items only rule out hardirq context; others may sleep */
        if (*work_data_bits(work) & WORK_OFFQ_BH)
                WARN_ON_ONCE(in_hardirq());
        else
                might_sleep();

        /*
         * During early boot @work is known not to be executing, so
         * __flush_work() is skipped. This allows canceling during early boot.
         */
        if (wq_online)
                __flush_work(work, true);

        if (!stay_disabled)
                enable_work(work);

        return was_pending;
}

/*
 * See cancel_delayed_work()
 */
bool cancel_work(struct work_struct *work)
{
        return __cancel_work(work, 0);
}
EXPORT_SYMBOL(cancel_work);

/**
 * cancel_work_sync - cancel a work and wait for it to finish
 * @work: the work to cancel
 *
 * Cancel @work and wait for its execution to finish. This function can be used
 * even if the work re-queues itself or migrates to another workqueue. On return
 * from this function, @work is guaranteed to be not pending or executing on any
 * CPU as long as there aren't racing enqueues.
 *
 * cancel_work_sync(&delayed_work->work) must not be used for delayed_work's.
 * Use cancel_delayed_work_sync() instead.
 *
 * Must be called from a sleepable context if @work was last queued on a non-BH
 * workqueue. Can also be called from non-hardirq atomic contexts including BH
 * if @work was last queued on a BH workqueue.
 *
 * Returns %true if @work was pending, %false otherwise.
 */
bool cancel_work_sync(struct work_struct *work)
{
        return __cancel_work_sync(work, 0);
}
EXPORT_SYMBOL_GPL(cancel_work_sync);

/**
 * cancel_delayed_work - cancel a delayed work
 * @dwork: delayed_work to cancel
 *
 * Kill off a pending delayed_work.
 *
 * Return: %true if @dwork was pending and canceled; %false if it wasn't
 * pending.
 *
 * Note:
 * The work callback function may still be running on return, unless
 * it returns %true and the work doesn't re-arm itself.  Explicitly flush or
 * use cancel_delayed_work_sync() to wait on it.
 *
 * This function is safe to call from any context including IRQ handler.
 */
bool cancel_delayed_work(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        return __cancel_work(work, WORK_CANCEL_DELAYED);
}
EXPORT_SYMBOL(cancel_delayed_work);

/**
 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
 * @dwork: the delayed work to cancel
 *
 * This is cancel_work_sync() for delayed works.
 *
 * Return:
 * %true if @dwork was pending, %false otherwise.
 */
bool cancel_delayed_work_sync(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        return __cancel_work_sync(work, WORK_CANCEL_DELAYED);
}
EXPORT_SYMBOL(cancel_delayed_work_sync);

/**
 * disable_work - Disable and cancel a work item
 * @work: work item to disable
 *
 * Disable @work by incrementing its disable count and cancel it if currently
 * pending. As long as the disable count is non-zero, any attempt to queue @work
 * will fail and return %false. The maximum supported disable depth is 2 to the
 * power of %WORK_OFFQ_DISABLE_BITS minus 1, currently 65535.
 *
 * Can be called from any context. Returns %true if @work was pending, %false
 * otherwise.
 */
bool disable_work(struct work_struct *work)
{
        return __cancel_work(work, WORK_CANCEL_DISABLE);
}
EXPORT_SYMBOL_GPL(disable_work);

/**
 * disable_work_sync - Disable, cancel and drain a work item
 * @work: work item to disable
 *
 * Similar to disable_work() but also wait for @work to finish if currently
 * executing.
 *
 * Must be called from a sleepable context if @work was last queued on a non-BH
 * workqueue. Can also be called from non-hardirq atomic contexts including BH
 * if @work was last queued on a BH workqueue.
 *
 * Returns %true if @work was pending, %false otherwise.
 */
bool disable_work_sync(struct work_struct *work)
{
        return __cancel_work_sync(work, WORK_CANCEL_DISABLE);
}
EXPORT_SYMBOL_GPL(disable_work_sync);

/**
 * enable_work - Enable a work item
 * @work: work item to enable
 *
 * Undo disable_work[_sync]() by decrementing @work's disable count. @work can
 * only be queued if its disable count is 0.
 *
 * Can be called from any context. Returns %true if the disable count reached 0.
 * Otherwise, %false.
 */
bool enable_work(struct work_struct *work)
{
        struct work_offq_data offqd;
        unsigned long irq_flags;

        work_grab_pending(work, 0, &irq_flags);

        work_offqd_unpack(&offqd, *work_data_bits(work));
        work_offqd_enable(&offqd);
        set_work_pool_and_clear_pending(work, offqd.pool_id,
                                        work_offqd_pack_flags(&offqd));
        local_irq_restore(irq_flags);

        return !offqd.disable;
}
EXPORT_SYMBOL_GPL(enable_work);

/**
 * disable_delayed_work - Disable and cancel a delayed work item
 * @dwork: delayed work item to disable
 *
 * disable_work() for delayed work items.
 */
bool disable_delayed_work(struct delayed_work *dwork)
{
        return __cancel_work(&dwork->work,
                             WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE);
}
EXPORT_SYMBOL_GPL(disable_delayed_work);

/**
 * disable_delayed_work_sync - Disable, cancel and drain a delayed work item
 * @dwork: delayed work item to disable
 *
 * disable_work_sync() for delayed work items.
 */
bool disable_delayed_work_sync(struct delayed_work *dwork)
{
        return __cancel_work_sync(&dwork->work,
                                  WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE);
}
EXPORT_SYMBOL_GPL(disable_delayed_work_sync);

/**
 * enable_delayed_work - Enable a delayed work item
 * @dwork: delayed work item to enable
 *
 * enable_work() for delayed work items.
 */
bool enable_delayed_work(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        return enable_work(work);
}
EXPORT_SYMBOL_GPL(enable_delayed_work);

/**
 * schedule_on_each_cpu - execute a function synchronously on each online CPU
 * @func: the function to call
 *
 * schedule_on_each_cpu() executes @func on each online CPU using the
 * system workqueue and blocks until all CPUs have completed.
 * schedule_on_each_cpu() is very slow.
 *
 * Return:
 * 0 on success, -errno on failure.
 */
int schedule_on_each_cpu(work_func_t func)
{
        struct work_struct __percpu *pcpu_works;
        struct work_struct *work;
        int cpu;

        pcpu_works = alloc_percpu(struct work_struct);
        if (!pcpu_works)
                return -ENOMEM;

        /* the whole queue-then-flush sequence runs under cpus_read_lock() */
        cpus_read_lock();

        /* first pass: initialize and queue one work item per online CPU */
        for_each_online_cpu(cpu) {
                work = per_cpu_ptr(pcpu_works, cpu);

                INIT_WORK(work, func);
                schedule_work_on(cpu, work);
        }

        /* second pass: wait for each of them */
        for_each_online_cpu(cpu) {
                work = per_cpu_ptr(pcpu_works, cpu);
                flush_work(work);
        }

        cpus_read_unlock();

        free_percpu(pcpu_works);
        return 0;
}

/**
 * execute_in_process_context - reliably execute the routine with user context
 * @fn:                the function to execute
 * @ew:                guaranteed storage for the execute work structure (must
 *                be available when the work executes)
 *
 * Executes the function immediately if process context is available,
 * otherwise schedules the function for delayed execution.
 *
 * Return:        0 - function was executed
 *                1 - function was scheduled for execution
 */
int execute_in_process_context(work_func_t fn, struct execute_work *ew)
{
        struct work_struct *work = &ew->work;

        if (in_interrupt()) {
                /* can't run it here; hand it over to a workqueue */
                INIT_WORK(work, fn);
                schedule_work(work);
                return 1;
        }

        /* not in interrupt context: call it directly */
        fn(work);
        return 0;
}
EXPORT_SYMBOL_GPL(execute_in_process_context);

/**
 * free_workqueue_attrs - free a workqueue_attrs
 * @attrs: workqueue_attrs to free
 *
 * Undo alloc_workqueue_attrs().
 */
void free_workqueue_attrs(struct workqueue_attrs *attrs)
{
        /* NULL is accepted and ignored */
        if (!attrs)
                return;

        free_cpumask_var(attrs->cpumask);
        free_cpumask_var(attrs->__pod_cpumask);
        kfree(attrs);
}

/**
 * alloc_workqueue_attrs - allocate a workqueue_attrs
 *
 * Allocate a new workqueue_attrs, initialize with default settings and
 * return it.
 *
 * Return: The newly allocated workqueue_attrs on success. %NULL on failure.
 */
struct workqueue_attrs *alloc_workqueue_attrs(void)
{
        struct workqueue_attrs *new_attrs;

        new_attrs = kzalloc(sizeof(*new_attrs), GFP_KERNEL);

        /*
         * The three allocations are attempted in order; on the first failure
         * whatever has been allocated so far is released.
         * free_workqueue_attrs() accepts NULL.
         */
        if (!new_attrs ||
            !alloc_cpumask_var(&new_attrs->cpumask, GFP_KERNEL) ||
            !alloc_cpumask_var(&new_attrs->__pod_cpumask, GFP_KERNEL)) {
                free_workqueue_attrs(new_attrs);
                return NULL;
        }

        /* defaults: all possible CPUs, default affinity scope */
        cpumask_copy(new_attrs->cpumask, cpu_possible_mask);
        new_attrs->affn_scope = WQ_AFFN_DFL;

        return new_attrs;
}

/* Copy every field of @from into @to, including the wq-only ones. */
static void copy_workqueue_attrs(struct workqueue_attrs *to,
                                 const struct workqueue_attrs *from)
{
        /* the two CPU masks */
        cpumask_copy(to->cpumask, from->cpumask);
        cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);

        /* scalar attributes */
        to->nice = from->nice;
        to->affn_strict = from->affn_strict;

        /*
         * wq-only fields.  Hashing and equality tests ignore these, but
         * copying must not, because it serves both pool and wq attrs;
         * get_unbound_pool() clears them explicitly instead.
         */
        to->ordered = from->ordered;
        to->affn_scope = from->affn_scope;
}

/*
 * Reset the fields of @attrs which only apply to workqueues so that @attrs
 * can be used for a worker_pool. See the comments in the definition of
 * 'struct workqueue_attrs'.
 */
static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs)
{
        /*
         * With strict affinity, ->cpumask is skipped by wqattrs_hash() and
         * wqattrs_equal(); reset it to all possible CPUs.
         */
        if (attrs->affn_strict)
                cpumask_copy(attrs->cpumask, cpu_possible_mask);

        attrs->ordered = false;
        attrs->affn_scope = WQ_AFFN_NR_TYPES;
}

/* hash value of the content of @attr */
static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
{
        u32 hash = 0;

        hash = jhash_1word(attrs->nice, hash);
        hash = jhash_1word(attrs->affn_strict, hash);
        hash = jhash(cpumask_bits(attrs->__pod_cpumask),
                     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
        if (!attrs->affn_strict)
                hash = jhash(cpumask_bits(attrs->cpumask),
                             BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
        return hash;
}

/*
 * Test whether @a and @b have equal content. Compares nice, affn_strict and
 * __pod_cpumask; cpumask is compared only when affinity is not strict.
 */
static bool wqattrs_equal(const struct workqueue_attrs *a,
                          const struct workqueue_attrs *b)
{
        if (a->nice != b->nice || a->affn_strict != b->affn_strict)
                return false;

        if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask))
                return false;

        /* at this point both have the same affn_strict setting */
        return a->affn_strict || cpumask_equal(a->cpumask, b->cpumask);
}

/*
 * Restrict @attrs->cpumask to the CPUs in @unbound_cpumask. Should the two
 * not overlap at all, @unbound_cpumask is used as is.
 */
static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
                                      const cpumask_t *unbound_cpumask)
{
        /* effective mask = requested mask AND unbound mask */
        cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask);
        if (likely(!cpumask_empty(attrs->cpumask)))
                return;

        /* no overlap: fall back to @unbound_cpumask */
        cpumask_copy(attrs->cpumask, unbound_cpumask);
}

/* find wq_pod_type to use for @attrs */
static const struct wq_pod_type *
wqattrs_pod_type(const struct workqueue_attrs *attrs)
{
        enum wq_affn_scope scope;
        struct wq_pod_type *pt;

        /* to synchronize access to wq_affn_dfl */
        lockdep_assert_held(&wq_pool_mutex);

        if (attrs->affn_scope == WQ_AFFN_DFL)
                scope = wq_affn_dfl;
        else
                scope = attrs->affn_scope;

        pt = &wq_pod_types[scope];

        if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) &&
            likely(pt->nr_pods))
                return pt;

        /*
         * Before workqueue_init_topology(), only SYSTEM is available which is
         * initialized in workqueue_init_early().
         */
        pt = &wq_pod_types[WQ_AFFN_SYSTEM];
        BUG_ON(!pt->nr_pods);
        return pt;
}

/**
 * init_worker_pool - initialize a newly zalloc'd worker_pool
 * @pool: worker_pool to initialize
 *
 * Initialize a newly zalloc'd @pool.  It also allocates @pool->attrs.
 *
 * Return: 0 on success, -errno on failure.  Even on failure, all fields
 * inside @pool proper are initialized and put_unbound_pool() can be called
 * on @pool safely to release it.
 */
static int init_worker_pool(struct worker_pool *pool)
{
        raw_spin_lock_init(&pool->lock);
        /* no id, CPU or node assigned yet */
        pool->id = -1;
        pool->cpu = -1;
        pool->node = NUMA_NO_NODE;
        pool->flags |= POOL_DISASSOCIATED;
        pool->watchdog_ts = jiffies;
        INIT_LIST_HEAD(&pool->worklist);
        INIT_LIST_HEAD(&pool->idle_list);
        hash_init(pool->busy_hash);

        /* idle handling: a deferrable timer plus the cull work item */
        timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
        INIT_WORK(&pool->idle_cull_work, idle_cull_fn);

        timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);

        INIT_LIST_HEAD(&pool->workers);
        INIT_LIST_HEAD(&pool->dying_workers);

        ida_init(&pool->worker_ida);
        INIT_HLIST_NODE(&pool->hash_node);
        /* the initial reference belongs to the caller */
        pool->refcnt = 1;

        /* shouldn't fail above this point */
        pool->attrs = alloc_workqueue_attrs();
        if (!pool->attrs)
                return -ENOMEM;

        /* pool attrs don't carry the workqueue-only fields */
        wqattrs_clear_for_pool(pool->attrs);

        return 0;
}

#ifdef CONFIG_LOCKDEP
/*
 * Register @wq's lockdep key and initialize its lockdep map. The map is named
 * "(wq_completion)<wq name>"; if that string can't be allocated, @wq->name
 * itself is used.
 */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
        char *name;

        lockdep_register_key(&wq->key);

        name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
        wq->lock_name = name ? name : wq->name;

        lockdep_init_map(&wq->lockdep_map, wq->lock_name, &wq->key, 0);
}

/* Unregister the lockdep key registered by wq_init_lockdep(). */
static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
        lockdep_unregister_key(&wq->key);
}

/* Free the lock name unless it merely aliases @wq->name. */
static void wq_free_lockdep(struct workqueue_struct *wq)
{
        if (wq->lock_name == wq->name)
                return;

        kfree(wq->lock_name);
}
#else
/* Without CONFIG_LOCKDEP these are all no-ops. */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
}

static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
}

static void wq_free_lockdep(struct workqueue_struct *wq)
{
}
#endif

/*
 * Free every entry of @nna_ar, the per-node ones as well as the one at
 * index nr_node_ids, and reset the slots to NULL.
 */
static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
{
        struct wq_node_nr_active **slot;
        int node;

        for_each_node(node) {
                slot = &nna_ar[node];
                kfree(*slot);
                *slot = NULL;
        }

        /* the extra entry past the per-node ones */
        slot = &nna_ar[nr_node_ids];
        kfree(*slot);
        *slot = NULL;
}

/* Initialize @nna: zero count, %WQ_DFL_MIN_ACTIVE limit, empty pending list. */
static void init_node_nr_active(struct wq_node_nr_active *nna)
{
        raw_spin_lock_init(&nna->lock);
        INIT_LIST_HEAD(&nna->pending_pwqs);

        atomic_set(&nna->nr, 0);
        nna->max = WQ_DFL_MIN_ACTIVE;
}

/*
 * Allocate and initialize one wq_node_nr_active per node plus a fallback
 * entry at index nr_node_ids. Each node's counter is expected to be accessed
 * mostly from that node, so it is allocated there.
 *
 * Returns 0 on success. On allocation failure, everything in @nna_ar is
 * freed and -ENOMEM is returned.
 */
static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
{
        struct wq_node_nr_active *counter;
        int nid;

        for_each_node(nid) {
                counter = kzalloc_node(sizeof(*counter), GFP_KERNEL, nid);
                if (!counter)
                        goto err_free;
                init_node_nr_active(counter);
                nna_ar[nid] = counter;
        }

        /* [nr_node_ids] is used as the fallback */
        counter = kzalloc_node(sizeof(*counter), GFP_KERNEL, NUMA_NO_NODE);
        if (counter) {
                init_node_nr_active(counter);
                nna_ar[nr_node_ids] = counter;
                return 0;
        }

err_free:
        free_node_nr_active(nna_ar);
        return -ENOMEM;
}

/* RCU callback which releases a workqueue_struct and what hangs off it. */
static void rcu_free_wq(struct rcu_head *rcu)
{
        struct workqueue_struct *wq;

        wq = container_of(rcu, struct workqueue_struct, rcu);

        /* node_nr_active entries are only freed for unbound workqueues */
        if (wq->flags & WQ_UNBOUND)
                free_node_nr_active(wq->node_nr_active);

        wq_free_lockdep(wq);
        free_percpu(wq->cpu_pwq);
        free_workqueue_attrs(wq->unbound_attrs);

        kfree(wq);
}

/* RCU callback which releases a worker_pool, its worker IDA and its attrs. */
static void rcu_free_pool(struct rcu_head *rcu)
{
        struct worker_pool *pool;

        pool = container_of(rcu, struct worker_pool, rcu);

        ida_destroy(&pool->worker_ida);
        free_workqueue_attrs(pool->attrs);

        kfree(pool);
}

/**
 * put_unbound_pool - put a worker_pool
 * @pool: worker_pool to put
 *
 * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
 * safe manner.  get_unbound_pool() calls this function on its failure path
 * and this function should be able to release pools which went through,
 * successfully or not, init_worker_pool().
 *
 * Should be called with wq_pool_mutex held.
 */
static void put_unbound_pool(struct worker_pool *pool)
{
        DECLARE_COMPLETION_ONSTACK(detach_completion);
        struct worker *worker;
        LIST_HEAD(cull_list);

        lockdep_assert_held(&wq_pool_mutex);

        /* nothing more to do unless this was the last reference */
        if (--pool->refcnt)
                return;

        /* sanity checks */
        if (WARN_ON(!(pool->cpu < 0)) ||
            WARN_ON(!list_empty(&pool->worklist)))
                return;

        /* release id and unhash */
        if (pool->id >= 0)
                idr_remove(&worker_pool_idr, pool->id);
        hash_del(&pool->hash_node);

        /*
         * Become the manager and destroy all workers.  This prevents
         * @pool's workers from blocking on attach_mutex.  We're the last
         * manager and @pool gets freed with the flag set.
         *
         * Having a concurrent manager is quite unlikely to happen as we can
         * only get here with
         *   pwq->refcnt == pool->refcnt == 0
         * which implies no work queued to the pool, which implies no worker can
         * become the manager. However a worker could have taken the role of
         * manager before the refcnts dropped to 0, since maybe_create_worker()
         * drops pool->lock
         */
        while (true) {
                rcuwait_wait_event(&manager_wait,
                                   !(pool->flags & POOL_MANAGER_ACTIVE),
                                   TASK_UNINTERRUPTIBLE);

                /* recheck under both locks; the loop exits with them held */
                mutex_lock(&wq_pool_attach_mutex);
                raw_spin_lock_irq(&pool->lock);
                if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
                        pool->flags |= POOL_MANAGER_ACTIVE;
                        break;
                }
                raw_spin_unlock_irq(&pool->lock);
                mutex_unlock(&wq_pool_attach_mutex);
        }

        /* mark every idle worker as dying, collecting them on cull_list */
        while ((worker = first_idle_worker(pool)))
                set_worker_dying(worker, &cull_list);
        WARN_ON(pool->nr_workers || pool->nr_idle);
        raw_spin_unlock_irq(&pool->lock);

        wake_dying_workers(&cull_list);

        /* if any worker is still on @pool's lists, wait on the completion */
        if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
                pool->detach_completion = &detach_completion;
        mutex_unlock(&wq_pool_attach_mutex);

        if (pool->detach_completion)
                wait_for_completion(pool->detach_completion);

        /* shut down the timers */
        del_timer_sync(&pool->idle_timer);
        cancel_work_sync(&pool->idle_cull_work);
        del_timer_sync(&pool->mayday_timer);

        /* RCU protected to allow dereferences from get_work_pool() */
        call_rcu(&pool->rcu, rcu_free_pool);
}

/**
 * get_unbound_pool - get a worker_pool with the specified attributes
 * @attrs: the attributes of the worker_pool to get
 *
 * Obtain a worker_pool which has the same attributes as @attrs, bump the
 * reference count and return it.  If there already is a matching
 * worker_pool, it will be used; otherwise, this function attempts to
 * create a new one.
 *
 * Should be called with wq_pool_mutex held.
 *
 * Return: On success, a worker_pool with the same attributes as @attrs.
 * On failure, %NULL.
 */
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
        struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA];
        u32 hash = wqattrs_hash(attrs);
        struct worker_pool *pool;
        /* node stays NUMA_NO_NODE unless a NUMA pod contains the mask below */
        int pod, node = NUMA_NO_NODE;

        lockdep_assert_held(&wq_pool_mutex);

        /* do we already have a matching pool? */
        hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
                if (wqattrs_equal(pool->attrs, attrs)) {
                        pool->refcnt++;
                        return pool;
                }
        }

        /* If __pod_cpumask is contained inside a NUMA pod, that's our node */
        for (pod = 0; pod < pt->nr_pods; pod++) {
                if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) {
                        node = pt->pod_node[pod];
                        break;
                }
        }

        /* nope, create a new one */
        pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node);
        if (!pool || init_worker_pool(pool) < 0)
                goto fail;

        pool->node = node;
        /* take over @attrs, then drop the workqueue-only fields again */
        copy_workqueue_attrs(pool->attrs, attrs);
        wqattrs_clear_for_pool(pool->attrs);

        if (worker_pool_assign_id(pool) < 0)
                goto fail;

        /* create and start the initial worker */
        if (wq_online && !create_worker(pool))
                goto fail;

        /* install */
        hash_add(unbound_pool_hash, &pool->hash_node, hash);

        return pool;
fail:
        /* the initial reference is dropped, which releases the pool */
        if (pool)
                put_unbound_pool(pool);
        return NULL;
}

/* RCU callback which returns a pool_workqueue to pwq_cache. */
static void rcu_free_pwq(struct rcu_head *rcu)
{
        struct pool_workqueue *pwq;

        pwq = container_of(rcu, struct pool_workqueue, rcu);
        kmem_cache_free(pwq_cache, pwq);
}

/*
 * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
 * refcnt and needs to be destroyed.
 */
static void pwq_release_workfn(struct kthread_work *work)
{
        struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
                                                  release_work);
        struct workqueue_struct *wq = pwq->wq;
        struct worker_pool *pool = pwq->pool;
        /* set when unlinking @pwq leaves @wq without any pwq */
        bool is_last = false;

        /*
         * When @pwq is not linked, it doesn't hold any reference to the
         * @wq, and @wq is invalid to access.
         */
        if (!list_empty(&pwq->pwqs_node)) {
                mutex_lock(&wq->mutex);
                list_del_rcu(&pwq->pwqs_node);
                is_last = list_empty(&wq->pwqs);

                /*
                 * For ordered workqueue with a plugged dfl_pwq, restart it now.
                 */
                if (!is_last && (wq->flags & __WQ_ORDERED))
                        unplug_oldest_pwq(wq);

                mutex_unlock(&wq->mutex);
        }

        /* for unbound workqueues, drop this pwq's reference on its pool */
        if (wq->flags & WQ_UNBOUND) {
                mutex_lock(&wq_pool_mutex);
                put_unbound_pool(pool);
                mutex_unlock(&wq_pool_mutex);
        }

        /* if still on a node's pending list, take it off under nna->lock */
        if (!list_empty(&pwq->pending_node)) {
                struct wq_node_nr_active *nna =
                        wq_node_nr_active(pwq->wq, pwq->pool->node);

                raw_spin_lock_irq(&nna->lock);
                list_del_init(&pwq->pending_node);
                raw_spin_unlock_irq(&nna->lock);
        }

        /* @pwq itself is freed through RCU */
        call_rcu(&pwq->rcu, rcu_free_pwq);

        /*
         * If we're the last pwq going away, @wq is already dead and no one
         * is gonna access it anymore.  Schedule RCU free.
         */
        if (is_last) {
                wq_unregister_lockdep(wq);
                call_rcu(&wq->rcu, rcu_free_wq);
        }
}

/*
 * Initialize the freshly allocated @pwq and associate it with @wq and @pool.
 * @pwq starts out zeroed, with one reference, no flush color and all of its
 * list nodes empty.
 */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
                     struct worker_pool *pool)
{
        /* @pwq's address must not have bits outside WORK_STRUCT_PWQ_MASK */
        BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK);

        memset(pwq, 0, sizeof(*pwq));

        /* list nodes and the release work item */
        INIT_LIST_HEAD(&pwq->inactive_works);
        INIT_LIST_HEAD(&pwq->pending_node);
        INIT_LIST_HEAD(&pwq->pwqs_node);
        INIT_LIST_HEAD(&pwq->mayday_node);
        kthread_init_work(&pwq->release_work, pwq_release_workfn);

        /* associations and initial state */
        pwq->wq = wq;
        pwq->pool = pool;
        pwq->refcnt = 1;
        pwq->flush_color = -1;
}

/*
 * Link @pwq into its workqueue's pwq list after syncing its work_color with
 * the workqueue's. Must be called with the workqueue's mutex held. Calling it
 * on an already linked @pwq is a no-op.
 */
static void link_pwq(struct pool_workqueue *pwq)
{
        struct workqueue_struct *wq = pwq->wq;

        lockdep_assert_held(&wq->mutex);

        /* may be called multiple times; only act if not linked yet */
        if (list_empty(&pwq->pwqs_node)) {
                /* set the matching work_color */
                pwq->work_color = wq->work_color;

                /* link in @pwq */
                list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs);
        }
}

/*
 * Get an unbound pool matching @attrs and allocate a pwq, on the pool's
 * node, which links that pool to @wq.  Requires wq_pool_mutex.
 *
 * Returns the initialized pwq, or NULL if either the pool or the pwq could
 * not be obtained.  On pwq allocation failure the pool is put again.
 */
static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
                                        const struct workqueue_attrs *attrs)
{
        struct pool_workqueue *pwq = NULL;
        struct worker_pool *pool;

        lockdep_assert_held(&wq_pool_mutex);

        pool = get_unbound_pool(attrs);
        if (pool) {
                pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
                if (pwq)
                        init_pwq(pwq, wq, pool);
                else
                        put_unbound_pool(pool);
        }

        return pwq;
}

/**
 * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
 * @attrs: the wq_attrs of the default pwq of the target workqueue
 * @cpu: the target CPU
 * @cpu_going_down: if >= 0, the CPU to consider as offline
 *
 * Work out the cpumask a workqueue with @attrs should use on the pod that
 * @cpu belongs to and store it in @attrs->__pod_cpumask.  If
 * @cpu_going_down is >= 0, that CPU is treated as offline for the
 * calculation.
 *
 * If the pod has no online CPU that @attrs->cpumask asks for, the result is
 * @attrs->cpumask itself.  Otherwise the result is the intersection of the
 * possible CPUs of the pod and @attrs->cpumask.
 *
 * The caller is responsible for ensuring that the cpumask of the pod stays
 * stable.
 */
static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
                                int cpu_going_down)
{
        const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
        int pod = pt->cpu_pod[cpu];

        /* first pass: the online CPUs of the pod which @attrs wants */
        cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);
        cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask);
        if (cpu_going_down >= 0)
                cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask);

        if (!cpumask_empty(attrs->__pod_cpumask)) {
                /* there are some, use the pod's possible CPUs @attrs wants */
                cpumask_and(attrs->__pod_cpumask, attrs->cpumask,
                            pt->pod_cpus[pod]);

                if (cpumask_empty(attrs->__pod_cpumask))
                        pr_warn_once("WARNING: workqueue cpumask: online intersect > "
                                        "possible intersect\n");
        } else {
                /* none, fall back to the whole requested cpumask */
                cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
        }
}

/* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */
static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
                                        int cpu, struct pool_workqueue *pwq)
{
        struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu);
        struct pool_workqueue *old_pwq;

        lockdep_assert_held(&wq_pool_mutex);
        lockdep_assert_held(&wq->mutex);

        /* link_pwq() can handle duplicate calls */
        link_pwq(pwq);

        old_pwq = rcu_access_pointer(*slot);
        rcu_assign_pointer(*slot, pwq);
        return old_pwq;
}

/*
 * Context to store the prepared attrs & pwqs before applying.
 *
 * Built by apply_wqattrs_prepare(), consumed by apply_wqattrs_commit() and
 * released by apply_wqattrs_cleanup().  After apply_wqattrs_commit(), @dfl_pwq
 * and @pwq_tbl[] hold the previously installed pwqs rather than the new ones.
 */
struct apply_wqattrs_ctx {
        struct workqueue_struct        *wq;                /* target workqueue */
        struct workqueue_attrs        *attrs;                /* attrs to apply */
        struct list_head        list;                /* queued for batching commit */
        struct pool_workqueue        *dfl_pwq;        /* default pwq, installed with cpu == -1 */
        struct pool_workqueue        *pwq_tbl[];        /* one pwq per CPU, indexed by CPU number */
};

/*
 * Release everything @ctx still holds, after either a successful commit or
 * an aborted prepare: put each per-CPU pwq and the default pwq, free the
 * attrs and finally @ctx itself.  A NULL @ctx is accepted and ignored.
 */
static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
{
        int cpu;

        if (!ctx)
                return;

        for_each_possible_cpu(cpu)
                put_pwq_unlocked(ctx->pwq_tbl[cpu]);
        put_pwq_unlocked(ctx->dfl_pwq);

        free_workqueue_attrs(ctx->attrs);
        kfree(ctx);
}

/*
 * Allocate the attrs and pwqs for later installation.
 *
 * Builds an apply_wqattrs_ctx for applying @attrs to @wq: a default pwq plus
 * one pwq_tbl[] entry for every possible CPU.  Nothing is installed into @wq
 * here; that is the job of apply_wqattrs_commit().
 *
 * @unbound_cpumask is handed to wqattrs_actualize_cpumask() to derive the
 * cpumask used for creating the pwqs.  The attrs stored in the returned
 * context are instead a copy of @attrs with the cpumask restricted to
 * cpu_possible_mask.
 *
 * Must be called with wq_pool_mutex held.
 *
 * Returns the prepared context on success, ERR_PTR(-EINVAL) if
 * @attrs->affn_scope is out of range and ERR_PTR(-ENOMEM) if any allocation
 * fails.
 */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
                      const struct workqueue_attrs *attrs,
                      const cpumask_var_t unbound_cpumask)
{
        struct apply_wqattrs_ctx *ctx;
        struct workqueue_attrs *new_attrs;
        int cpu;

        lockdep_assert_held(&wq_pool_mutex);

        if (WARN_ON(attrs->affn_scope < 0 ||
                    attrs->affn_scope >= WQ_AFFN_NR_TYPES))
                return ERR_PTR(-EINVAL);

        /* zeroed, so ->attrs, ->dfl_pwq and every pwq_tbl[] slot start as NULL */
        ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);

        /* both allocations are checked together below */
        new_attrs = alloc_workqueue_attrs();
        if (!ctx || !new_attrs)
                goto out_free;

        /*
         * If something goes wrong during CPU up/down, we'll fall back to
         * the default pwq covering whole @attrs->cpumask.  Always create
         * it even if we don't use it immediately.
         */
        copy_workqueue_attrs(new_attrs, attrs);
        wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);
        cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
        ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
        if (!ctx->dfl_pwq)
                goto out_free;

        for_each_possible_cpu(cpu) {
                if (new_attrs->ordered) {
                        /* ordered: every CPU shares dfl_pwq, one extra ref per slot */
                        ctx->dfl_pwq->refcnt++;
                        ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
                } else {
                        /* compute @cpu's pod cpumask into new_attrs and get a pwq for it */
                        wq_calc_pod_cpumask(new_attrs, cpu, -1);
                        ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
                        if (!ctx->pwq_tbl[cpu])
                                goto out_free;
                }
        }

        /* save the user configured attrs and sanitize it. */
        copy_workqueue_attrs(new_attrs, attrs);
        cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
        cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
        /* from here on @new_attrs is owned by @ctx */
        ctx->attrs = new_attrs;

        /*
         * For initialized ordered workqueues, there should only be one pwq
         * (dfl_pwq). Set the plugged flag of ctx->dfl_pwq to suspend execution
         * of newly queued work items until execution of older work items in
         * the old pwq's have completed.
         */
        if ((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))
                ctx->dfl_pwq->plugged = true;

        ctx->wq = wq;
        return ctx;

out_free:
        /*
         * ctx->attrs is only assigned on the success path, so @new_attrs has
         * to be freed separately here.  apply_wqattrs_cleanup() ignores a
         * NULL @ctx.
         * NOTE(review): this path can pass a NULL @new_attrs to
         * free_workqueue_attrs() and NULL pwq_tbl[] entries to
         * put_pwq_unlocked() - presumably both tolerate NULL; confirm.
         */
        free_workqueue_attrs(new_attrs);
        apply_wqattrs_cleanup(ctx);
        return ERR_PTR(-ENOMEM);
}

/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
{
        int cpu;

        /* all pwqs have been created successfully, let's install'em */
        mutex_lock(&ctx->wq->mutex);

        copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);

        /* save the previous pwqs and install the new ones */
        for_each_possible_cpu(cpu)
                ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
                                                        ctx->pwq_tbl[cpu]);
        ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);

        /* update node_nr_active->max */
        wq_update_node_max_active(ctx->wq, -1);

        /* rescuer needs to respect wq cpumask changes */
        if (ctx->wq->rescuer)
                set_cpus_allowed_ptr(ctx->wq->rescuer->task,
                                     unbound_effective_cpumask(ctx->wq));

        mutex_unlock(&ctx->wq->mutex);
}

/*
 * Prepare, commit and clean up an attrs change of @wq in one go.
 *
 * Returns 0 on success, -EINVAL (with a warning) if @wq is not unbound, or
 * the error reported by apply_wqattrs_prepare().
 */
static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
                                        const struct workqueue_attrs *attrs)
{
        struct apply_wqattrs_ctx *ctx;

        /* attributes can be changed on unbound workqueues only */
        if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
                return -EINVAL;

        ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
        if (!IS_ERR(ctx)) {
                /* preparation succeeded, install it and drop the old pwqs */
                apply_wqattrs_commit(ctx);
                apply_wqattrs_cleanup(ctx);
                return 0;
        }

        return PTR_ERR(ctx);
}

/**
 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
 * @wq: the target workqueue
 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
 *
 * Apply @attrs to the unbound workqueue @wq. Unless disabled, a separate pwq
 * is mapped to each CPU pod which has possible CPUs in @attrs->cpumask, so
 * that a work item is affine to the pod it was issued on. Older pwqs are
 * released as their in-flight work items finish. Note that a work item which
 * repeatedly requeues itself back-to-back will stay on its current pwq.
 *
 * Performs GFP_KERNEL allocations.
 *
 * The caller must hold CPU hotplug read exclusion, i.e. cpus_read_lock().
 * wq_pool_mutex is taken internally.
 *
 * Return: 0 on success and -errno on failure.
 */
int apply_workqueue_attrs(struct workqueue_struct *wq,
                          const struct workqueue_attrs *attrs)
{
        int err;

        lockdep_assert_cpus_held();

        mutex_lock(&wq_pool_mutex);
        err = apply_workqueue_attrs_locked(wq, attrs);
        mutex_unlock(&wq_pool_mutex);

        return err;
}

/**
 * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug
 * @wq: the target workqueue
 * @cpu: the CPU to update pool association for
 * @hotplug_cpu: the CPU coming up or going down
 * @online: %true if @hotplug_cpu is coming up, %false if it is going down
 *
 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
 * %CPU_DOWN_FAILED.  @hotplug_cpu is being hot[un]plugged, update the pod
 * affinity of @wq for @cpu accordingly.  Workqueues which are not unbound
 * and ordered workqueues are left untouched.
 *
 * If pod affinity can't be adjusted due to memory allocation failure, it falls
 * back to @wq->dfl_pwq which may not be optimal but is always correct.
 *
 * Note that when the last allowed CPU of a pod goes offline for a workqueue
 * with a cpumask spanning multiple pods, the workers which were already
 * executing the work items for the workqueue will lose their CPU affinity and
 * may execute on any CPU. This is similar to how per-cpu workqueues behave on
 * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
 * responsibility to flush the work item from CPU_DOWN_PREPARE.
 *
 * Must be called with wq_pool_mutex held.
 */
static void wq_update_pod(struct workqueue_struct *wq, int cpu,
                          int hotplug_cpu, bool online)
{
        /* only a CPU which is going down is excluded from the calculation */
        int off_cpu = online ? -1 : hotplug_cpu;
        struct pool_workqueue *old_pwq = NULL, *pwq;
        struct workqueue_attrs *target_attrs;

        lockdep_assert_held(&wq_pool_mutex);

        if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered)
                return;

        /*
         * We don't wanna alloc/free wq_attrs for each wq for each CPU.
         * Let's use a preallocated one.  The following buf is protected by
         * CPU hotplug exclusion.
         */
        target_attrs = wq_update_pod_attrs_buf;

        copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
        wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);

        /* nothing to do if the target cpumask matches the current pwq */
        wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
        if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))
                return;

        /* create a new pwq */
        pwq = alloc_unbound_pwq(wq, target_attrs);
        if (!pwq) {
                pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",
                        wq->name);
                goto use_dfl_pwq;
        }

        /* Install the new pwq. */
        mutex_lock(&wq->mutex);
        old_pwq = install_unbound_pwq(wq, cpu, pwq);
        goto out_unlock;

use_dfl_pwq:
        /*
         * Fallback: install the default pwq for @cpu.  An extra reference is
         * taken on it, under its pool's lock, before it goes into the slot.
         */
        mutex_lock(&wq->mutex);
        pwq = unbound_pwq(wq, -1);
        raw_spin_lock_irq(&pwq->pool->lock);
        get_pwq(pwq);
        raw_spin_unlock_irq(&pwq->pool->lock);
        old_pwq = install_unbound_pwq(wq, cpu, pwq);
out_unlock:
        mutex_unlock(&wq->mutex);
        /* drop a reference on the pwq which was installed for @cpu before */
        put_pwq_unlocked(old_pwq);
}

/*
 * Allocate @wq->cpu_pwq and create the initial pwqs of @wq.
 *
 * For a workqueue without WQ_UNBOUND, one pwq is allocated for each possible
 * CPU, attached to that CPU's BH or regular worker pool (indexed by whether
 * WQ_HIGHPRI is set) and linked into @wq.  For an unbound workqueue the pwqs
 * are created by applying ordered_wq_attrs[] or unbound_std_wq_attrs[] through
 * apply_workqueue_attrs() under cpus_read_lock().
 *
 * Returns 0 on success, -ENOMEM if allocating cpu_pwq or a per-CPU pwq fails,
 * or the error returned by apply_workqueue_attrs().
 */
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
        bool highpri = wq->flags & WQ_HIGHPRI;
        int cpu, ret;

        wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
        if (!wq->cpu_pwq)
                goto enomem;

        if (!(wq->flags & WQ_UNBOUND)) {
                for_each_possible_cpu(cpu) {
                        struct pool_workqueue **pwq_p;
                        struct worker_pool __percpu *pools;
                        struct worker_pool *pool;

                        if (wq->flags & WQ_BH)
                                pools = bh_worker_pools;
                        else
                                pools = cpu_worker_pools;

                        /* @highpri (0 or 1) selects the pool within the CPU's array */
                        pool = &(per_cpu_ptr(pools, cpu)[highpri]);
                        pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);

                        *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
                                                       pool->node);
                        if (!*pwq_p)
                                goto enomem;

                        init_pwq(*pwq_p, wq, pool);

                        mutex_lock(&wq->mutex);
                        link_pwq(*pwq_p);
                        mutex_unlock(&wq->mutex);
                }
                return 0;
        }

        cpus_read_lock();
        if (wq->flags & __WQ_ORDERED) {
                struct pool_workqueue *dfl_pwq;

                ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
                /* there should only be single pwq for ordering guarantee */
                dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
                /* on success, dfl_pwq must be the one and only entry of wq->pwqs */
                WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
                              wq->pwqs.prev != &dfl_pwq->pwqs_node),
                     "ordering guarantee broken for workqueue %s\n", wq->name);
        } else {
                ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
        }
        cpus_read_unlock();

        /*
         * For unbound pwqs, flushing the pwq_release_worker on failure
         * ensures that pwq_release_workfn() completes before the caller
         * calls kfree(wq).
         */
        if (ret)
                kthread_flush_worker(pwq_release_worker);

        return ret;

enomem:
        /*
         * Only reached from the cpu_pwq allocation and the per-CPU
         * (!WQ_UNBOUND) loop above.  Free every pwq allocated so far
         * directly and then the percpu pointer array itself.
         */
        if (wq->cpu_pwq) {
                for_each_possible_cpu(cpu) {
                        struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);

                        if (pwq)
                                kmem_cache_free(pwq_cache, pwq);
                }
                free_percpu(wq->cpu_pwq);
                wq->cpu_pwq = NULL;
        }
        return -ENOMEM;
}

static int wq_clamp_max_active(int max_active, unsigned int flags,
                               const char *name)
{
        if (max_active < 1 || max_active > WQ_MAX_ACTIVE)
                pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
                        max_active, name, 1, WQ_MAX_ACTIVE);

        return clamp_val(max_active, 1, WQ_MAX_ACTIVE);
}

/*
 * Workqueues which may be used during memory reclaim should have a rescuer
 * to guarantee forward progress.
 *
 * Create and start the rescuer of @wq if it has WQ_MEM_RECLAIM set; do
 * nothing otherwise.  The rescuer kthread is bound to wq_unbound_cpumask for
 * unbound workqueues and to cpu_possible_mask for all others before it is
 * woken up.
 *
 * Returns 0 on success (or when no rescuer is needed), -ENOMEM if the worker
 * can't be allocated, or the error from kthread_create().
 */
static int init_rescuer(struct workqueue_struct *wq)
{
        struct worker *rescuer;
        char id_buf[WORKER_ID_LEN];
        int ret;

        if (!(wq->flags & WQ_MEM_RECLAIM))
                return 0;

        rescuer = alloc_worker(NUMA_NO_NODE);
        if (!rescuer) {
                pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",
                       wq->name);
                return -ENOMEM;
        }

        rescuer->rescue_wq = wq;
        format_worker_id(id_buf, sizeof(id_buf), rescuer, NULL);

        rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", id_buf);
        if (IS_ERR(rescuer->task)) {
                ret = PTR_ERR(rescuer->task);
                /* terminate the message with a newline like the one above */
                pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe\n",
                       wq->name, ERR_PTR(ret));
                kfree(rescuer);
                return ret;
        }

        /* publish the rescuer, then restrict its CPUs before it first runs */
        wq->rescuer = rescuer;
        if (wq->flags & WQ_UNBOUND)
                kthread_bind_mask(rescuer->task, wq_unbound_cpumask);
        else
                kthread_bind_mask(rescuer->task, cpu_possible_mask);
        wake_up_process(rescuer->task);

        return 0;
}

/**
 * wq_adjust_max_active - update a wq's max_active to the current setting
 * @wq: target workqueue
 *
 * If @wq isn't freezing, set @wq->max_active and @wq->min_active to the
 * saved_max_active and saved_min_active and activate inactive work items
 * accordingly. If @wq is freezable and workqueues are freezing, clear both
 * @wq->max_active and @wq->min_active to zero.
 *
 * Nothing is done if neither value changes.
 *
 * Must be called with @wq->mutex held.
 */
static void wq_adjust_max_active(struct workqueue_struct *wq)
{
        bool activated;
        int new_max, new_min;

        lockdep_assert_held(&wq->mutex);

        if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {
                new_max = 0;
                new_min = 0;
        } else {
                new_max = wq->saved_max_active;
                new_min = wq->saved_min_active;
        }

        if (wq->max_active == new_max && wq->min_active == new_min)
                return;

        /*
         * Update @wq->max/min_active and then kick inactive work items if more
         * active work items are allowed. This doesn't break work item ordering
         * because new work items are always queued behind existing inactive
         * work items if there are any.
         */
        WRITE_ONCE(wq->max_active, new_max);
        WRITE_ONCE(wq->min_active, new_min);

        if (wq->flags & WQ_UNBOUND)
                wq_update_node_max_active(wq, -1);

        /* with a limit of zero there is nothing that could be activated */
        if (new_max == 0)
                return;

        /*
         * Round-robin through pwq's activating the first inactive work item
         * until max_active is filled.
         */
        do {
                struct pool_workqueue *pwq;

                /* keep making passes until one of them activates nothing */
                activated = false;
                for_each_pwq(pwq, wq) {
                        unsigned long irq_flags;

                        /* can be called during early boot w/ irq disabled */
                        raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
                        if (pwq_activate_first_inactive(pwq, true)) {
                                activated = true;
                                kick_pool(pwq->pool);
                        }
                        raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
                }
        } while (activated);
}

/*
 * Allocate and set up a workqueue.
 *
 * @fmt and the arguments following @max_active form the printf-style name of
 * the workqueue; a name which doesn't fit into wq->name is truncated with a
 * one-time warning.  @flags are the WQ_* flags; WQ_POWER_EFFICIENT is turned
 * into WQ_UNBOUND when wq_power_efficient is set.
 *
 * For WQ_BH workqueues, only the flags in __WQ_BH_ALLOWS are accepted and
 * @max_active must be 0; the limit is then set to INT_MAX.  For all other
 * workqueues, a @max_active of 0 selects WQ_DFL_ACTIVE and the value is
 * clamped by wq_clamp_max_active().
 *
 * On success the new workqueue has been added to the workqueues list.
 *
 * Returns the new workqueue, or NULL on invalid BH arguments or on any
 * allocation/initialization failure.
 */
__printf(1, 4)
struct workqueue_struct *alloc_workqueue(const char *fmt,
                                         unsigned int flags,
                                         int max_active, ...)
{
        va_list args;
        struct workqueue_struct *wq;
        size_t wq_size;
        int name_len;

        if (flags & WQ_BH) {
                if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS))
                        return NULL;
                if (WARN_ON_ONCE(max_active))
                        return NULL;
        }

        /* see the comment above the definition of WQ_POWER_EFFICIENT */
        if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
                flags |= WQ_UNBOUND;

        /* allocate wq and format name */
        if (flags & WQ_UNBOUND)
                /* unbound wqs carry a trailing node_nr_active[] of nr_node_ids + 1 entries */
                wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
        else
                wq_size = sizeof(*wq);

        wq = kzalloc(wq_size, GFP_KERNEL);
        if (!wq)
                return NULL;

        if (flags & WQ_UNBOUND) {
                wq->unbound_attrs = alloc_workqueue_attrs();
                if (!wq->unbound_attrs)
                        goto err_free_wq;
        }

        va_start(args, max_active);
        name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
        va_end(args);

        if (name_len >= WQ_NAME_LEN)
                pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
                             wq->name);

        if (flags & WQ_BH) {
                /*
                 * BH workqueues always share a single execution context per CPU
                 * and don't impose any max_active limit.
                 */
                max_active = INT_MAX;
        } else {
                /* 0 means "use the default" */
                max_active = max_active ?: WQ_DFL_ACTIVE;
                max_active = wq_clamp_max_active(max_active, flags, wq->name);
        }

        /* init wq */
        wq->flags = flags;
        wq->max_active = max_active;
        wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
        wq->saved_max_active = wq->max_active;
        wq->saved_min_active = wq->min_active;
        mutex_init(&wq->mutex);
        atomic_set(&wq->nr_pwqs_to_flush, 0);
        INIT_LIST_HEAD(&wq->pwqs);
        INIT_LIST_HEAD(&wq->flusher_queue);
        INIT_LIST_HEAD(&wq->flusher_overflow);
        INIT_LIST_HEAD(&wq->maydays);

        wq_init_lockdep(wq);
        INIT_LIST_HEAD(&wq->list);

        if (flags & WQ_UNBOUND) {
                if (alloc_node_nr_active(wq->node_nr_active) < 0)
                        goto err_unreg_lockdep;
        }

        if (alloc_and_link_pwqs(wq) < 0)
                goto err_free_node_nr_active;

        /* a rescuer is only set up here once wq_online is set */
        if (wq_online && init_rescuer(wq) < 0)
                goto err_destroy;

        if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
                goto err_destroy;

        /*
         * wq_pool_mutex protects global freeze state and workqueues list.
         * Grab it, adjust max_active and add the new @wq to workqueues
         * list.
         */
        mutex_lock(&wq_pool_mutex);

        mutex_lock(&wq->mutex);
        wq_adjust_max_active(wq);
        mutex_unlock(&wq->mutex);

        list_add_tail_rcu(&wq->list, &workqueues);

        mutex_unlock(&wq_pool_mutex);

        return wq;

        /* unwind in reverse order of setup; each label falls through to the next */
err_free_node_nr_active:
        if (wq->flags & WQ_UNBOUND)
                free_node_nr_active(wq->node_nr_active);
err_unreg_lockdep:
        wq_unregister_lockdep(wq);
        wq_free_lockdep(wq);
err_free_wq:
        free_workqueue_attrs(wq->unbound_attrs);
        kfree(wq);
        return NULL;
        /* once the pwqs exist, teardown is left to destroy_workqueue() */
err_destroy:
        destroy_workqueue(wq);
        return NULL;
}
EXPORT_SYMBOL_GPL(alloc_workqueue);

/*
 * Tell whether @pwq is still in use: it has work items in flight in any
 * color, it is not the default pwq of its workqueue yet holds more than one
 * reference, or pwq_is_empty() says it isn't empty.
 */
static bool pwq_busy(struct pool_workqueue *pwq)
{
        int color;

        for (color = 0; color < WORK_NR_COLORS; color++) {
                if (pwq->nr_in_flight[color])
                        return true;
        }

        /* the reference count check doesn't apply to the default pwq */
        if (pwq->refcnt > 1 && pwq != rcu_access_pointer(pwq->wq->dfl_pwq))
                return true;

        return !pwq_is_empty(pwq);
}

/**
 * destroy_workqueue - safely terminate a workqueue
 * @wq: target workqueue
 *
 * Safely destroy a workqueue. All work currently pending will be done first.
 *
 * If a pwq is still found busy after draining, a warning is printed along
 * with the state of the workqueue and the function returns without removing
 * @wq from the workqueues list or putting its pwqs. By then @wq has already
 * been unregistered from sysfs, marked as being destroyed and stripped of
 * its rescuer.
 */
void destroy_workqueue(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        int cpu;

        /*
         * Remove it from sysfs first so that sanity check failure doesn't
         * lead to sysfs name conflicts.
         */
        workqueue_sysfs_unregister(wq);

        /* mark the workqueue destruction is in progress */
        mutex_lock(&wq->mutex);
        wq->flags |= __WQ_DESTROYING;
        mutex_unlock(&wq->mutex);

        /* drain it before proceeding with destruction */
        drain_workqueue(wq);

        /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
        if (wq->rescuer) {
                struct worker *rescuer = wq->rescuer;

                /* this prevents new queueing */
                raw_spin_lock_irq(&wq_mayday_lock);
                wq->rescuer = NULL;
                raw_spin_unlock_irq(&wq_mayday_lock);

                /* rescuer will empty maydays list before exiting */
                kthread_stop(rescuer->task);
                kfree(rescuer);
        }

        /*
         * Sanity checks - grab all the locks so that we wait for all
         * in-flight operations which may do put_pwq().
         */
        mutex_lock(&wq_pool_mutex);
        mutex_lock(&wq->mutex);
        for_each_pwq(pwq, wq) {
                raw_spin_lock_irq(&pwq->pool->lock);
                if (WARN_ON(pwq_busy(pwq))) {
                        pr_warn("%s: %s has the following busy pwq\n",
                                __func__, wq->name);
                        show_pwq(pwq);
                        /* release every lock before dumping the wq and bailing out */
                        raw_spin_unlock_irq(&pwq->pool->lock);
                        mutex_unlock(&wq->mutex);
                        mutex_unlock(&wq_pool_mutex);
                        show_one_workqueue(wq);
                        return;
                }
                raw_spin_unlock_irq(&pwq->pool->lock);
        }
        mutex_unlock(&wq->mutex);

        /*
         * wq list is used to freeze wq, remove from list after
         * flushing is complete in case freeze races us.
         */
        list_del_rcu(&wq->list);
        mutex_unlock(&wq_pool_mutex);

        /*
         * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq
         * to put the base refs. @wq will be auto-destroyed from the last
         * pwq_put. RCU read lock prevents @wq from going away from under us.
         */
        rcu_read_lock();

        /* put the pwq of each per-CPU slot and clear the slot */
        for_each_possible_cpu(cpu) {
                put_pwq_unlocked(unbound_pwq(wq, cpu));
                RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL);
        }

        /* same for the default pwq slot (cpu == -1) */
        put_pwq_unlocked(unbound_pwq(wq, -1));
        RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL);

        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(destroy_workqueue);

/**
 * workqueue_set_max_active - adjust max_active of a workqueue
 * @wq: target workqueue
 * @max_active: new max_active value.
 *
 * Set max_active of @wq to @max_active, clamped to the valid range. See the
 * alloc_workqueue() function comment. For an unbound workqueue, the saved
 * min_active is lowered to the new max_active if it would otherwise exceed
 * it. BH and ordered workqueues are rejected with a warning.
 *
 * CONTEXT:
 * Don't call from IRQ context.
 */
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
        /* BH workqueues have no use for max_active */
        if (WARN_ON(wq->flags & WQ_BH))
                return;
        /* the max_active of an ordered workqueue must not be changed */
        if (WARN_ON(wq->flags & __WQ_ORDERED))
                return;

        max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);

        mutex_lock(&wq->mutex);

        /* keep the saved min_active of an unbound wq at or below max_active */
        if ((wq->flags & WQ_UNBOUND) && wq->saved_min_active > max_active)
                wq->saved_min_active = max_active;
        wq->saved_max_active = max_active;

        wq_adjust_max_active(wq);

        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(workqueue_set_max_active);

/**
 * workqueue_set_min_active - adjust min_active of an unbound workqueue
 * @wq: target unbound workqueue
 * @min_active: new min_active value
 *
 * Set min_active of an unbound workqueue. Unlike other types of workqueues, an
 * unbound workqueue is not guaranteed to be able to process max_active
 * interdependent work items. What it does guarantee is the ability to process
 * min_active interdependent work items, which is %WQ_DFL_MIN_ACTIVE by
 * default.
 *
 * Use this function to adjust the min_active value between 0 and the current
 * max_active; values outside of that range are clamped into it.
 */
void workqueue_set_min_active(struct workqueue_struct *wq, int min_active)
{
        const unsigned int type_mask = WQ_BH | WQ_UNBOUND | __WQ_ORDERED;

        /* only a non-ordered, non-BH unbound workqueue has a use for min_active */
        if (WARN_ON((wq->flags & type_mask) != WQ_UNBOUND))
                return;

        mutex_lock(&wq->mutex);

        wq->saved_min_active = clamp(min_active, 0, wq->saved_max_active);
        wq_adjust_max_active(wq);

        mutex_unlock(&wq->mutex);
}

/**
 * current_work - retrieve %current task's work struct
 *
 * Find out whether %current is a workqueue worker and, if so, which work
 * item it is working on. Handy for determining the context %current is
 * running in.
 *
 * Return: work struct if %current task is a workqueue worker, %NULL otherwise.
 */
struct work_struct *current_work(void)
{
        struct worker *worker = current_wq_worker();

        if (!worker)
                return NULL;

        return worker->current_work;
}
EXPORT_SYMBOL(current_work);

/**
 * current_is_workqueue_rescuer - is %current workqueue rescuer?
 *
 * Determine whether %current is a workqueue rescuer.  Can be used from
 * work functions to determine whether it's being run off the rescuer task.
 *
 * Return: %true if %current is a workqueue rescuer. %false otherwise.
 */
bool current_is_workqueue_rescuer(void)
{
        struct worker *worker = current_wq_worker();

        return worker && worker->rescue_wq;
}

/**
 * workqueue_congested - test whether a workqueue is congested
 * @cpu: CPU in question
 * @wq: target workqueue
 *
 * Test whether @wq's cpu workqueue for @cpu is congested, i.e. whether its
 * list of inactive work items is non-empty. No synchronization is done
 * around this function, so the result is unreliable and only good as an
 * advisory hint or for debugging.
 *
 * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
 *
 * With the exception of ordered workqueues, all workqueues have per-cpu
 * pool_workqueues, each with its own congested state. A workqueue being
 * congested on one CPU doesn't mean that the workqueue is congested on any
 * other CPUs.
 *
 * Return:
 * %true if congested, %false otherwise.
 */
bool workqueue_congested(int cpu, struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        bool congested;

        rcu_read_lock();
        preempt_disable();

        /* resolve WORK_CPU_UNBOUND to the CPU we're running on */
        if (cpu == WORK_CPU_UNBOUND)
                cpu = smp_processor_id();

        pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
        congested = !list_empty(&pwq->inactive_works);

        preempt_enable();
        rcu_read_unlock();

        return congested;
}
EXPORT_SYMBOL_GPL(workqueue_congested);

/**
 * work_busy - test whether a work is currently pending or running
 * @work: the work to be tested
 *
 * Test whether @work is currently pending or running. No synchronization is
 * done around this function, so the result is unreliable and only good as an
 * advisory hint or for debugging.
 *
 * Return:
 * OR'd bitmask of WORK_BUSY_* bits.
 */
unsigned int work_busy(struct work_struct *work)
{
        unsigned int busy = work_pending(work) ? WORK_BUSY_PENDING : 0;
        struct worker_pool *pool;
        unsigned long irq_flags;

        rcu_read_lock();

        pool = get_work_pool(work);
        if (pool) {
                /* look for a worker of that pool which is executing @work */
                raw_spin_lock_irqsave(&pool->lock, irq_flags);
                if (find_worker_executing_work(pool, work))
                        busy |= WORK_BUSY_RUNNING;
                raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
        }

        rcu_read_unlock();

        return busy;
}
EXPORT_SYMBOL_GPL(work_busy);

/**
 * set_worker_desc - set description for the current work item
 * @fmt: printf-style format string
 * @...: arguments for the format string
 *
 * A running work function can call this to describe what the work item is
 * about. Should the worker task get dumped, the description is printed out
 * along with it to help debugging. The description can be at most
 * WORKER_DESC_LEN including the trailing '\0'.
 *
 * Nothing happens when the caller is not a workqueue worker.
 */
void set_worker_desc(const char *fmt, ...)
{
        struct worker *worker = current_wq_worker();
        va_list args;

        if (!worker)
                return;

        va_start(args, fmt);
        vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
        va_end(args);
}
EXPORT_SYMBOL_GPL(set_worker_desc);

/**
 * print_worker_info - print out worker information and description
 * @log_lvl: the log level to use when printing
 * @task: target task
 *
 * If @task is a worker and currently executing a work item, print out the
 * name of the workqueue being serviced and worker description set with
 * set_worker_desc() by the currently executing work item.
 *
 * This function can be safely called on any task as long as the
 * task_struct itself is accessible.  While safe, this function isn't
 * synchronized and may print out mixups or garbages of limited length.
 *
 * Nothing is printed if @task doesn't have PF_WQ_WORKER set or if the work
 * function, workqueue name and description all come back empty.
 */
void print_worker_info(const char *log_lvl, struct task_struct *task)
{
        /* everything starts out zeroed so that a failed copy leaves it empty */
        work_func_t *fn = NULL;
        char name[WQ_NAME_LEN] = { };
        char desc[WORKER_DESC_LEN] = { };
        struct pool_workqueue *pwq = NULL;
        struct workqueue_struct *wq = NULL;
        struct worker *worker;

        if (!(task->flags & PF_WQ_WORKER))
                return;

        /*
         * This function is called without any synchronization and @task
         * could be in any state.  Be careful with dereferences.
         */
        worker = kthread_probe_data(task);

        /*
         * Carefully copy the associated workqueue's workfn, name and desc.
         * Keep the original last '\0' in case the original is garbage.
         *
         * The copies follow the pointer chain worker -> pwq -> wq, so each
         * one uses the pointer fetched by the one before it.
         */
        copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn));
        copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq));
        copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq));
        copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1);
        copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1);

        if (fn || name[0] || desc[0]) {
                printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
                /* the description is only appended if it differs from the name */
                if (strcmp(name, desc))
                        pr_cont(" (%s)", desc);
                pr_cont("\n");
        }
}

/*
 * Continue the current log line with a summary of @pool: its cpumask, its
 * node (unless NUMA_NO_NODE), its flags and either its nice value or, for
 * BH pools, a "bh"/"bh-hi" tag.
 */
static void pr_cont_pool_info(struct worker_pool *pool)
{
        pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);

        if (pool->node != NUMA_NO_NODE)
                pr_cont(" node=%d", pool->node);

        pr_cont(" flags=0x%x", pool->flags);

        if (!(pool->flags & POOL_BH)) {
                pr_cont(" nice=%d", pool->attrs->nice);
                return;
        }

        pr_cont(" bh%s",
                pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
}

static void pr_cont_worker_id(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        if (pool->flags & WQ_BH)
                pr_cont("bh%s",
                        pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
        else
                pr_cont("%d%s", task_pid_nr(worker->task),
                        worker->rescue_wq ? "(RESCUER)" : "");
}

/*
 * Run-length state used by pr_cont_work_flush() so that consecutive work
 * items sharing the same function are printed once as "N*func".
 */
struct pr_cont_work_struct {
        bool comma;             /* separator flag recorded when the run started */
        work_func_t func;       /* work function of the run being accumulated */
        long ctr;               /* items counted in the run; 0 means no run pending */
};

/*
 * Feed one work function into the run-length printer kept in @pcwsp.
 *
 * If a run is pending and @func matches it, the run's counter is bumped
 * and nothing is printed.  Otherwise the pending run (if any) is emitted
 * as " func" or " N*func", prefixed with "," when the run's recorded
 * comma flag is set, and a new run is started for @func.
 *
 * Passing (work_func_t)-1 as @func only flushes: the pending run is
 * printed and no new run is recorded.
 */
static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
{
        if (pcwsp->ctr) {
                /* same function as the pending run: just count it */
                if (func == pcwsp->func) {
                        pcwsp->ctr++;
                        return;
                }

                /* different function: emit the pending run first */
                if (pcwsp->ctr == 1)
                        pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);
                else
                        pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);
                pcwsp->ctr = 0;
        }

        /* -1 is the flush-only sentinel, there is nothing to record */
        if ((long)func == -1L)
                return;

        pcwsp->comma = comma;
        pcwsp->func = func;
        pcwsp->ctr = 1;
}

/*
 * Print one work item as a continuation of the current log line.
 *
 * Ordinary items go through the run-length printer in @pcwsp; when @comma
 * is false the pending run is flushed first.  Barrier items (those whose
 * function is wq_barrier_func) flush the pending run and are printed
 * immediately as "BAR(pid)" using the PID of the barrier's task.
 */
static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)
{
        struct wq_barrier *barr;

        if (work->func != wq_barrier_func) {
                if (!comma)
                        pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
                pr_cont_work_flush(comma, work->func, pcwsp);
                return;
        }

        /* barrier: emit whatever run is pending, then the barrier itself */
        pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);

        barr = container_of(work, struct wq_barrier, work);
        pr_cont("%s BAR(%d)", comma ? "," : "",
                task_pid_nr(barr->task));
}

/*
 * Dump the state of @pwq: a header line with pool info, active count,
 * refcnt and MAYDAY status, followed by optional "in-flight:", "pending:"
 * and "inactive:" lines.  Each optional line is only emitted when there is
 * at least one matching entry.
 *
 * NOTE(review): the only caller visible here, show_one_workqueue(), holds
 * pwq->pool->lock around this call - presumably required for walking
 * busy_hash and the work lists; confirm for any new caller.
 */
static void show_pwq(struct pool_workqueue *pwq)
{
        /* run-length state shared by all pr_cont_work*() calls below */
        struct pr_cont_work_struct pcws = { .ctr = 0, };
        struct worker_pool *pool = pwq->pool;
        struct work_struct *work;
        struct worker *worker;
        bool has_in_flight = false, has_pending = false;
        int bkt;

        pr_info("  pwq %d:", pool->id);
        pr_cont_pool_info(pool);

        pr_cont(" active=%d refcnt=%d%s\n",
                pwq->nr_active, pwq->refcnt,
                !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");

        /* first pass: is any busy worker of the pool serving this pwq? */
        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                if (worker->current_pwq == pwq) {
                        has_in_flight = true;
                        break;
                }
        }
        if (has_in_flight) {
                bool comma = false;

                pr_info("    in-flight:");
                /* second pass: print each such worker and its scheduled works */
                hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                        if (worker->current_pwq != pwq)
                                continue;

                        pr_cont(" %s", comma ? "," : "");
                        pr_cont_worker_id(worker);
                        pr_cont(":%ps", worker->current_func);
                        list_for_each_entry(work, &worker->scheduled, entry)
                                pr_cont_work(false, work, &pcws);
                        /* flush the run accumulated for this worker */
                        pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
                        comma = true;
                }
                pr_cont("\n");
        }

        /* the pool worklist is shared, so filter it by owning pwq */
        list_for_each_entry(work, &pool->worklist, entry) {
                if (get_work_pwq(work) == pwq) {
                        has_pending = true;
                        break;
                }
        }
        if (has_pending) {
                bool comma = false;

                pr_info("    pending:");
                list_for_each_entry(work, &pool->worklist, entry) {
                        if (get_work_pwq(work) != pwq)
                                continue;

                        pr_cont_work(comma, work, &pcws);
                        /* no comma after an item flagged WORK_STRUCT_LINKED */
                        comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
                }
                pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
                pr_cont("\n");
        }

        /* inactive works are kept on the pwq itself, no filtering needed */
        if (!list_empty(&pwq->inactive_works)) {
                bool comma = false;

                pr_info("    inactive:");
                list_for_each_entry(work, &pwq->inactive_works, entry) {
                        pr_cont_work(comma, work, &pcws);
                        comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
                }
                pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
                pr_cont("\n");
        }
}

/**
 * show_one_workqueue - dump state of specified workqueue
 * @wq: workqueue whose state will be printed
 */
void show_one_workqueue(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        bool idle = true;
        unsigned long irq_flags;

        for_each_pwq(pwq, wq) {
                if (!pwq_is_empty(pwq)) {
                        idle = false;
                        break;
                }
        }
        if (idle) /* Nothing to print for idle workqueue */
                return;

        pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);

        for_each_pwq(pwq, wq) {
                raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
                if (!pwq_is_empty(pwq)) {
                        /*
                         * Defer printing to avoid deadlocks in console
                         * drivers that queue work while holding locks
                         * also taken in their write paths.
                         */
                        printk_deferred_enter();
                        show_pwq(pwq);
                        printk_deferred_exit();
                }
                raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
                /*
                 * We could be printing a lot from atomic context, e.g.
                 * sysrq-t -> show_all_workqueues(). Avoid triggering
                 * hard lockup.
                 */
                touch_nmi_watchdog();
        }

}

/**
 * show_one_worker_pool - dump state of specified worker pool
 * @pool: worker pool whose state will be printed
 *
 * Nothing is printed when all workers of @pool are idle
 * (nr_workers == nr_idle).  Otherwise a single line is printed with the
 * pool info, how long the worklist has been waiting, the worker count,
 * the manager's PID (if any) and the list of idle workers.
 */
static void show_one_worker_pool(struct worker_pool *pool)
{
        unsigned long stalled_secs = 0;
        unsigned long flags;
        struct worker *idler;
        bool first_idle = true;

        raw_spin_lock_irqsave(&pool->lock, flags);

        if (pool->nr_workers == pool->nr_idle)
                goto out_unlock;

        /* seconds the first pending work has been waiting for a worker */
        if (!list_empty(&pool->worklist))
                stalled_secs = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;

        /*
         * Printing is deferred to avoid deadlocks in console drivers that
         * queue work while holding locks which are also taken in their
         * write paths.
         */
        printk_deferred_enter();

        pr_info("pool %d:", pool->id);
        pr_cont_pool_info(pool);
        pr_cont(" hung=%lus workers=%d", stalled_secs, pool->nr_workers);

        if (pool->manager)
                pr_cont(" manager: %d",
                        task_pid_nr(pool->manager->task));

        /* the "idle: " label is only attached to the first idle worker */
        list_for_each_entry(idler, &pool->idle_list, entry) {
                pr_cont(" %s", first_idle ? "idle: " : "");
                pr_cont_worker_id(idler);
                first_idle = false;
        }
        pr_cont("\n");

        printk_deferred_exit();

out_unlock:
        raw_spin_unlock_irqrestore(&pool->lock, flags);

        /*
         * A lot may be printed from atomic context, e.g.
         * sysrq-t -> show_all_workqueues().  Keep the hard lockup
         * detector from firing.
         */
        touch_nmi_watchdog();
}

/**
 * show_all_workqueues - dump workqueue state
 *
 * Called from a sysrq handler and prints out all busy workqueues and pools.
 */
void show_all_workqueues(void)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int pi;

        rcu_read_lock();

        pr_info("Showing busy workqueues and worker pools:\n");

        list_for_each_entry_rcu(wq, &workqueues, list)
                show_one_workqueue(wq);

        for_each_pool(pool, pi)
                show_one_worker_pool(pool);

        rcu_read_unlock();
}

/**
 * show_freezable_workqueues - dump freezable workqueue state
 *
 * Called from try_to_freeze_tasks().  Walks the global workqueue list
 * under the RCU read lock and dumps every workqueue flagged WQ_FREEZABLE
 * that is still busy.
 */
void show_freezable_workqueues(void)
{
        struct workqueue_struct *wq;

        rcu_read_lock();

        pr_info("Showing freezable workqueues that are still busy:\n");

        list_for_each_entry_rcu(wq, &workqueues, list) {
                if (wq->flags & WQ_FREEZABLE)
                        show_one_workqueue(wq);
        }

        rcu_read_unlock();
}

/*
 * Used to show worker information through /proc/PID/{comm,stat,status}.
 *
 * For a workqueue worker, @buf receives the worker id produced by
 * format_worker_id(), followed by '+' or '-' and the worker's ->desc when
 * a description is set.  For any other task, @buf receives task->comm.
 */
void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
{
        struct worker_pool *pool;
        struct worker *worker;
        int off;

        /* stabilize PF_WQ_WORKER and worker pool association */
        mutex_lock(&wq_pool_attach_mutex);

        if (!(task->flags & PF_WQ_WORKER)) {
                strscpy(buf, task->comm, size);
                goto out_unlock;
        }

        worker = kthread_data(task);
        pool = worker->pool;

        off = format_worker_id(buf, size, worker, pool);
        if (!pool)
                goto out_unlock;

        raw_spin_lock_irq(&pool->lock);
        /*
         * ->desc tracks information (wq name or set_worker_desc()) for
         * the latest execution.  '+' marks a description belonging to a
         * work item that is currently running, '-' one that is not.
         */
        if (worker->desc[0] != '\0') {
                if (worker->current_work)
                        scnprintf(buf + off, size - off, "+%s",
                                  worker->desc);
                else
                        scnprintf(buf + off, size - off, "-%s",
                                  worker->desc);
        }
        raw_spin_unlock_irq(&pool->lock);

out_unlock:
        mutex_unlock(&wq_pool_attach_mutex);
}

#ifdef CONFIG_SMP

/*
 * CPU hotplug.
 *
 * There are two challenges in supporting CPU hotplug.  Firstly, there
 * are a lot of assumptions on strong associations among work, pwq and
 * pool which make migrating pending and scheduled works very
 * difficult to implement without impacting hot paths.  Secondly,
 * worker pools serve a mix of short, long and very long running works,
 * making blocked draining impractical.
 *
 * This is solved by allowing a pool to be disassociated from its CPU,
 * running as an unbound one, and allowing it to be reattached later if
 * the CPU comes back online.
 */

/*
 * Disassociate every worker pool of @cpu from that CPU: mark all of each
 * pool's workers WORKER_UNBOUND, set POOL_DISASSOCIATED on the pool, zero
 * nr_running, kick the pool and finally call unbind_worker() on each
 * worker.  Each pool is processed with wq_pool_attach_mutex held; the
 * flag updates are additionally done under pool->lock.
 */
static void unbind_workers(int cpu)
{
        struct worker_pool *pool;
        struct worker *worker;

        for_each_cpu_worker_pool(pool, cpu) {
                mutex_lock(&wq_pool_attach_mutex);
                raw_spin_lock_irq(&pool->lock);

                /*
                 * We've blocked all attach/detach operations.  Make all
                 * workers unbound and set DISASSOCIATED.  Before this, all
                 * workers must be on the cpu.  After this, they may be
                 * scattered across other CPUs.  The preemption-disabled
                 * sections in their sched callbacks are guaranteed to see
                 * WORKER_UNBOUND since the code here runs on the same cpu.
                 */
                for_each_pool_worker(worker, pool)
                        worker->flags |= WORKER_UNBOUND;

                pool->flags |= POOL_DISASSOCIATED;

                /*
                 * The handling of nr_running in sched callbacks is disabled
                 * now.  Zap nr_running.  After this, nr_running stays zero
                 * and need_more_worker() and keep_working() are always true
                 * as long as the worklist is not empty.  This pool now
                 * behaves as an unbound (in terms of concurrency
                 * management) pool which is served by workers tied to the
                 * pool.
                 */
                pool->nr_running = 0;

                /*
                 * With concurrency management just turned off, a busy
                 * worker blocking could lead to lengthy stalls.  Kick off
                 * unbound chain execution of currently pending work items.
                 */
                kick_pool(pool);

                raw_spin_unlock_irq(&pool->lock);

                /* done after dropping pool->lock, still under the attach mutex */
                for_each_pool_worker(worker, pool)
                        unbind_worker(worker);

                mutex_unlock(&wq_pool_attach_mutex);
        }
}

/**
 * rebind_workers - rebind all workers of a pool to the associated CPU
 * @pool: pool of interest
 *
 * @pool->cpu is coming online.  Rebind all workers to the CPU, clear
 * POOL_DISASSOCIATED and swap each worker's WORKER_UNBOUND flag for
 * WORKER_REBOUND.
 *
 * Must be called with wq_pool_attach_mutex held.
 */
static void rebind_workers(struct worker_pool *pool)
{
        struct worker *worker;

        lockdep_assert_held(&wq_pool_attach_mutex);

        /*
         * Affinity first, flags second: all idle workers should be on the
         * run-queue of the associated CPU before any local wake-ups for
         * concurrency management happen.  As we're called from
         * CPU_ONLINE, setting the affinity shouldn't fail.
         */
        for_each_pool_worker(worker, pool) {
                int err;

                kthread_set_per_cpu(worker->task, pool->cpu);
                err = set_cpus_allowed_ptr(worker->task,
                                           pool_allowed_cpus(pool));
                WARN_ON_ONCE(err < 0);
        }

        raw_spin_lock_irq(&pool->lock);

        pool->flags &= ~POOL_DISASSOCIATED;

        for_each_pool_worker(worker, pool) {
                unsigned int old_flags = worker->flags;
                unsigned int new_flags;

                WARN_ON_ONCE(!(old_flags & WORKER_UNBOUND));

                /*
                 * UNBOUND has to go, but worker_clr_flags() can't be
                 * called and nr_running can't be adjusted from here.
                 * Replace UNBOUND with REBOUND, another NOT_RUNNING flag,
                 * in a single store.  @worker clears REBOUND through
                 * worker_clr_flags() when it starts its next execution
                 * cycle, which restores concurrency management.  When or
                 * whether that happens doesn't affect correctness.
                 */
                new_flags = old_flags | WORKER_REBOUND;
                new_flags &= ~WORKER_UNBOUND;

                /*
                 * WRITE_ONCE() is needed because @worker->flags may be
                 * tested without any lock in wq_worker_running().  Without
                 * it, the NOT_RUNNING test may fail incorrectly, leading
                 * to premature concurrency management operations.
                 */
                WRITE_ONCE(worker->flags, new_flags);
        }

        raw_spin_unlock_irq(&pool->lock);
}

/**
 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
 * @pool: unbound pool of interest
 * @cpu: the CPU which is coming up
 *
 * An unbound pool may end up with a cpumask which doesn't have any online
 * CPUs.  When a worker of such pool get scheduled, the scheduler resets
 * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any
 * online CPU before, cpus_allowed of all its workers should be restored.
 */
static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
{
        static cpumask_t cpumask;
        struct worker *worker;

        lockdep_assert_held(&wq_pool_attach_mutex);

        /* is @cpu allowed for @pool? */
        if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
                return;

        cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);

        /* as we're called from CPU_ONLINE, the following shouldn't fail */
        for_each_pool_worker(worker, pool)
                WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
}

/*
 * Make sure every worker pool of @cpu has at least one worker.  Pools that
 * already have workers are left alone.
 *
 * Return: 0 on success, -ENOMEM if a worker could not be created.
 */
int workqueue_prepare_cpu(unsigned int cpu)
{
        struct worker_pool *pool;

        for_each_cpu_worker_pool(pool, cpu) {
                if (!pool->nr_workers && !create_worker(pool))
                        return -ENOMEM;
        }

        return 0;
}

/*
 * Handle @cpu coming online.  With wq_pool_mutex held, rebind the workers
 * of @cpu's own pools, restore the cpumask of workers of unbound pools and
 * then refresh pod affinity and node max_active of every workqueue that
 * has unbound attrs.
 *
 * Return: always 0.
 */
int workqueue_online_cpu(unsigned int cpu)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int pool_id;

        mutex_lock(&wq_pool_mutex);

        for_each_pool(pool, pool_id) {
                /* BH pools aren't affected by hotplug */
                if (pool->flags & POOL_BH)
                        continue;

                mutex_lock(&wq_pool_attach_mutex);
                if (pool->cpu == cpu)
                        rebind_workers(pool);
                else if (pool->cpu < 0)
                        restore_unbound_workers_cpumask(pool, cpu);
                mutex_unlock(&wq_pool_attach_mutex);
        }

        /* update pod affinity of unbound workqueues */
        list_for_each_entry(wq, &workqueues, list) {
                struct workqueue_attrs *unbound = wq->unbound_attrs;
                const struct wq_pod_type *pt;
                int tcpu;

                if (!unbound)
                        continue;

                /* every CPU sharing a pod with @cpu needs an update */
                pt = wqattrs_pod_type(unbound);
                for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
                        wq_update_pod(wq, tcpu, cpu, true);

                mutex_lock(&wq->mutex);
                wq_update_node_max_active(wq, -1);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);
        return 0;
}

/*
 * Handle @cpu going offline.  Unbind the workers of @cpu's pools and then,
 * with wq_pool_mutex held, refresh pod affinity and node max_active of
 * every workqueue that has unbound attrs.
 *
 * Return: 0 on success, -1 (after a warning) when not called on @cpu.
 */
int workqueue_offline_cpu(unsigned int cpu)
{
        struct workqueue_struct *wq;

        /* unbinding per-cpu workers should happen on the local CPU */
        if (WARN_ON(cpu != smp_processor_id()))
                return -1;

        unbind_workers(cpu);

        mutex_lock(&wq_pool_mutex);

        /* update pod affinity of unbound workqueues */
        list_for_each_entry(wq, &workqueues, list) {
                struct workqueue_attrs *unbound = wq->unbound_attrs;
                const struct wq_pod_type *pt;
                int tcpu;

                if (!unbound)
                        continue;

                /* every CPU sharing a pod with @cpu needs an update */
                pt = wqattrs_pod_type(unbound);
                for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
                        wq_update_pod(wq, tcpu, cpu, false);

                mutex_lock(&wq->mutex);
                wq_update_node_max_active(wq, cpu);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);

        return 0;
}

/*
 * Request used by work_on_cpu_key(): a work item bundled with the function
 * to call, its argument and a slot for the function's return value.
 */
struct work_for_cpu {
        struct work_struct work;        /* work item run by work_for_cpu_fn() */
        long (*fn)(void *);             /* function to invoke */
        void *arg;                      /* argument passed to @fn */
        long ret;                       /* value returned by @fn */
};

/*
 * Work function for struct work_for_cpu: call the stored function with the
 * stored argument and record its return value in ->ret.
 */
static void work_for_cpu_fn(struct work_struct *work)
{
        struct work_for_cpu *req;

        req = container_of(work, struct work_for_cpu, work);
        req->ret = req->fn(req->arg);
}

/**
 * work_on_cpu_key - run a function in thread context on a particular cpu
 * @cpu: the cpu to run on
 * @fn: the function to run
 * @arg: the function arg
 * @key: The lock class key for lock debugging purposes
 *
 * It is up to the caller to ensure that the cpu doesn't go offline.
 * The caller must not hold any locks which would prevent @fn from completing.
 *
 * Return: The value @fn returns.
 */
long work_on_cpu_key(int cpu, long (*fn)(void *),
                     void *arg, struct lock_class_key *key)
{
        struct work_for_cpu wfc = { .fn = fn, .arg = arg };

        INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
        schedule_work_on(cpu, &wfc.work);
        flush_work(&wfc.work);
        destroy_work_on_stack(&wfc.work);
        return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_key);

/**
 * work_on_cpu_safe_key - run a function in thread context on a particular cpu
 * @cpu: the cpu to run on
 * @fn:  the function to run
 * @arg: the function argument
 * @key: The lock class key for lock debugging purposes
 *
 * Takes the CPU hotplug read lock and, if @cpu is online, calls
 * work_on_cpu_key().  The caller must not hold any locks which would
 * prevent @fn from completing.
 *
 * Return: The value @fn returns, or -ENODEV if @cpu is not online.
 */
long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
                          void *arg, struct lock_class_key *key)
{
        long ret;

        cpus_read_lock();
        ret = cpu_online(cpu) ? work_on_cpu_key(cpu, fn, arg, key) : -ENODEV;
        cpus_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
#endif /* CONFIG_SMP */

#ifdef CONFIG_FREEZER

/**
 * freeze_workqueues_begin - begin freezing workqueues
 *
 * Start freezing workqueues.  After this function returns, all freezable
 * workqueues will queue new works to their inactive_works list instead of
 * pool->worklist.
 *
 * Warns once if freezing is already in progress.
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
 */
void freeze_workqueues_begin(void)
{
        struct workqueue_struct *cur;

        mutex_lock(&wq_pool_mutex);

        WARN_ON_ONCE(workqueue_freezing);
        workqueue_freezing = true;

        /* let every workqueue re-evaluate max_active for the new state */
        list_for_each_entry(cur, &workqueues, list) {
                mutex_lock(&cur->mutex);
                wq_adjust_max_active(cur);
                mutex_unlock(&cur->mutex);
        }

        mutex_unlock(&wq_pool_mutex);
}

/**
 * freeze_workqueues_busy - are freezable workqueues still busy?
 *
 * Check whether freezing is complete.  This function must be called
 * between freeze_workqueues_begin() and thaw_workqueues().
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex.
 *
 * Return:
 * %true if some freezable workqueues are still busy.  %false if freezing
 * is complete.
 */
bool freeze_workqueues_busy(void)
{
        struct workqueue_struct *wq;
        struct pool_workqueue *pwq;
        bool busy = false;

        mutex_lock(&wq_pool_mutex);

        WARN_ON_ONCE(!workqueue_freezing);

        list_for_each_entry(wq, &workqueues, list) {
                if (!(wq->flags & WQ_FREEZABLE))
                        continue;

                /*
                 * nr_active is monotonically decreasing.  It's safe
                 * to peek without lock.
                 */
                rcu_read_lock();
                for_each_pwq(pwq, wq) {
                        WARN_ON_ONCE(pwq->nr_active < 0);
                        if (pwq->nr_active) {
                                busy = true;
                                break;
                        }
                }
                rcu_read_unlock();

                /* one active pwq is enough, stop scanning */
                if (busy)
                        break;
        }

        mutex_unlock(&wq_pool_mutex);
        return busy;
}

/**
 * thaw_workqueues - thaw workqueues
 *
 * Thaw workqueues.  Normal queueing is restored and all collected
 * frozen works are transferred to their respective pool worklists.
 * Does nothing when workqueues are not currently freezing.
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
 */
void thaw_workqueues(void)
{
        struct workqueue_struct *cur;

        mutex_lock(&wq_pool_mutex);

        if (workqueue_freezing) {
                workqueue_freezing = false;

                /* restore max_active and repopulate worklist */
                list_for_each_entry(cur, &workqueues, list) {
                        mutex_lock(&cur->mutex);
                        wq_adjust_max_active(cur);
                        mutex_unlock(&cur->mutex);
                }
        }

        mutex_unlock(&wq_pool_mutex);
}
#endif /* CONFIG_FREEZER */

static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
{
        LIST_HEAD(ctxs);
        int ret = 0;
        struct workqueue_struct *wq;
        struct apply_wqattrs_ctx *ctx, *n;

        lockdep_assert_held(&wq_pool_mutex);

        list_for_each_entry(wq, &workqueues, list) {
                if (!(wq->flags & WQ_UNBOUND) || (wq->flags & __WQ_DESTROYING))
                        continue;

                ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
                if (IS_ERR(ctx)) {
                        ret = PTR_ERR(ctx);
                        break;
                }

                list_add_tail(&ctx->list, &ctxs);
        }

        list_for_each_entry_safe(ctx, n, &ctxs, list) {
                if (!ret)
                        apply_wqattrs_commit(ctx);
                apply_wqattrs_cleanup(ctx);
        }

        if (!ret) {
                mutex_lock(&wq_pool_attach_mutex);
                cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
                mutex_unlock(&wq_pool_attach_mutex);
        }
        return ret;
}

/**
 * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
 * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
 *
 * This function can be called from cpuset code to provide a set of isolated
 * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
 * either cpus_read_lock or cpus_write_lock.
 *
 * Return: 0 on success, -ENOMEM if the temporary cpumask can't be
 * allocated, or the error from applying the new cpumask.
 */
int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
{
        cpumask_var_t effective;
        int ret = 0;

        if (!zalloc_cpumask_var(&effective, GFP_KERNEL))
                return -ENOMEM;

        lockdep_assert_cpus_held();
        mutex_lock(&wq_pool_mutex);

        /* Save the current isolated cpumask & export it via sysfs */
        cpumask_copy(wq_isolated_cpumask, exclude_cpumask);

        /*
         * If the andnot operation fails, fall back to
         * wq_requested_unbound_cpumask, which is initially set to the
         * (HK_TYPE_WQ & HK_TYPE_DOMAIN) housekeeping mask and is
         * rewritten by any subsequent write to the workqueue/cpumask
         * sysfs file.
         */
        if (!cpumask_andnot(effective, wq_requested_unbound_cpumask, exclude_cpumask))
                cpumask_copy(effective, wq_requested_unbound_cpumask);

        /* only go through the apply path when the mask really changes */
        if (!cpumask_equal(effective, wq_unbound_cpumask))
                ret = workqueue_apply_unbound_cpumask(effective);

        mutex_unlock(&wq_pool_mutex);
        free_cpumask_var(effective);

        return ret;
}

/*
 * Map @val to an index into wq_affn_names[].  The comparison ignores case
 * and only covers the length of each candidate name, so @val matches when
 * it starts with that name.  The first matching entry wins.
 *
 * Return: the matching index, or -EINVAL if no name matches.
 */
static int parse_affn_scope(const char *val)
{
        int scope;

        for (scope = 0; scope < ARRAY_SIZE(wq_affn_names); scope++) {
                const char *candidate = wq_affn_names[scope];

                if (strncasecmp(val, candidate, strlen(candidate)) == 0)
                        return scope;
        }

        return -EINVAL;
}

/*
 * kernel_param "set" handler for the default affinity scope.  Parses @val,
 * rejects WQ_AFFN_DFL itself, stores the new default in wq_affn_dfl and
 * updates the pods of every workqueue for each online CPU.
 *
 * Return: 0 on success, -EINVAL for an unknown or disallowed scope.
 */
static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
{
        int scope = parse_affn_scope(val);
        struct workqueue_struct *wq;
        int cpu;

        if (scope < 0)
                return scope;
        /* the default cannot be "use the default" */
        if (scope == WQ_AFFN_DFL)
                return -EINVAL;

        cpus_read_lock();
        mutex_lock(&wq_pool_mutex);

        wq_affn_dfl = scope;

        list_for_each_entry(wq, &workqueues, list) {
                for_each_online_cpu(cpu)
                        wq_update_pod(wq, cpu, cpu, true);
        }

        mutex_unlock(&wq_pool_mutex);
        cpus_read_unlock();

        return 0;
}

/*
 * kernel_param "get" handler: write the name of the current default
 * affinity scope, followed by a newline, into @buffer.
 *
 * Return: the value returned by scnprintf().
 */
static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp)
{
        const char *scope_name = wq_affn_names[wq_affn_dfl];

        return scnprintf(buffer, PAGE_SIZE, "%s\n", scope_name);
}

/* Parameter ops pairing the default affinity scope setter and getter. */
static const struct kernel_param_ops wq_affn_dfl_ops = {
        .set        = wq_affn_dfl_set,
        .get        = wq_affn_dfl_get,
};

/* Register the "default_affinity_scope" parameter with mode 0644. */
module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);

#ifdef CONFIG_SYSFS
/*
 * Workqueues with the WQ_SYSFS flag set are visible to userland via
 * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
 * following attributes.
 *
 *  per_cpu                RO bool        : whether the workqueue is per-cpu or unbound
 *  max_active                RW int        : maximum number of in-flight work items
 *
 * Unbound workqueues have the following extra attributes.
 *
 *  nice                RW int        : nice value of the workers
 *  cpumask                RW mask        : bitmask of allowed CPUs for the workers
 *  affinity_scope        RW str  : worker CPU affinity scope (cache, numa, none)
 *  affinity_strict        RW bool : worker CPU affinity is strict
 */
/* Pairs a device with the workqueue it represents; see dev_to_wq(). */
struct wq_device {
        struct workqueue_struct                *wq;     /* workqueue behind @dev */
        struct device                        dev;       /* embedded device */
};

/* Return the workqueue of the wq_device that embeds @dev. */
static struct workqueue_struct *dev_to_wq(struct device *dev)
{
        return container_of(dev, struct wq_device, dev)->wq;
}

static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);

        return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
}
static DEVICE_ATTR_RO(per_cpu);

static ssize_t max_active_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);

        return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
}

/*
 * sysfs store: parse a decimal integer from @buf and apply it with
 * workqueue_set_max_active().
 *
 * Return: @count on success, -EINVAL if no integer could be parsed or the
 * value is not positive.
 */
static ssize_t max_active_store(struct device *dev,
                                struct device_attribute *attr, const char *buf,
                                size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int requested;

        if (sscanf(buf, "%d", &requested) != 1)
                return -EINVAL;
        if (requested <= 0)
                return -EINVAL;

        workqueue_set_max_active(wq, requested);

        return count;
}
static DEVICE_ATTR_RW(max_active);

/* NULL-terminated attribute table: per_cpu and max_active. */
static struct attribute *wq_sysfs_attrs[] = {
        &dev_attr_per_cpu.attr,
        &dev_attr_max_active.attr,
        NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs);

/*
 * Take the locks used around applying workqueue attrs: the CPU hotplug
 * read lock first, then wq_pool_mutex.  Released by apply_wqattrs_unlock().
 */
static void apply_wqattrs_lock(void)
{
        /* CPUs should stay stable across pwq creations and installations */
        cpus_read_lock();
        mutex_lock(&wq_pool_mutex);
}

/*
 * Release the locks taken by apply_wqattrs_lock(), in reverse order:
 * wq_pool_mutex first, then the CPU hotplug read lock.
 */
static void apply_wqattrs_unlock(void)
{
        mutex_unlock(&wq_pool_mutex);
        cpus_read_unlock();
}

/* sysfs show: print the nice value from the workqueue's unbound attrs. */
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int len;

        /* unbound_attrs is read with wq->mutex held */
        mutex_lock(&wq->mutex);
        len = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
        mutex_unlock(&wq->mutex);

        return len;
}

/*
 * Prepare workqueue_attrs for sysfs store operations: allocate a fresh
 * attrs object and fill it with a copy of @wq's current unbound attrs.
 *
 * Must be called with wq_pool_mutex held.
 *
 * Return: the new attrs, or NULL if the allocation failed.
 */
static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
{
        struct workqueue_attrs *copy;

        lockdep_assert_held(&wq_pool_mutex);

        copy = alloc_workqueue_attrs();
        if (copy)
                copy_workqueue_attrs(copy, wq->unbound_attrs);

        return copy;
}

/*
 * sysfs "nice" write side: parse a nice value in [MIN_NICE, MAX_NICE] into a
 * copy of the current attrs and apply it.
 *
 * Returns @count on success, -ENOMEM if the attrs copy cannot be allocated,
 * -EINVAL on unparsable or out-of-range input, or the error returned by
 * apply_workqueue_attrs_locked().
 */
static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *new_attrs;
        int err;

        apply_wqattrs_lock();

        new_attrs = wq_sysfs_prep_attrs(wq);
        if (!new_attrs) {
                err = -ENOMEM;
        } else if (sscanf(buf, "%d", &new_attrs->nice) != 1 ||
                   new_attrs->nice < MIN_NICE || new_attrs->nice > MAX_NICE) {
                err = -EINVAL;
        } else {
                err = apply_workqueue_attrs_locked(wq, new_attrs);
        }

        apply_wqattrs_unlock();
        /* new_attrs may be NULL here when the allocation failed */
        free_workqueue_attrs(new_attrs);

        if (err)
                return err;
        return count;
}

/* sysfs "cpumask" read side: prints unbound_attrs->cpumask under wq->mutex. */
static ssize_t wq_cpumask_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int len;

        mutex_lock(&wq->mutex);
        len = scnprintf(buf, PAGE_SIZE, "%*pb\n",
                        cpumask_pr_args(wq->unbound_attrs->cpumask));
        mutex_unlock(&wq->mutex);

        return len;
}

/*
 * sysfs "cpumask" write side: parse @buf with cpumask_parse() into a copy of
 * the current attrs and apply it.
 *
 * Returns @count on success, -ENOMEM if the attrs copy cannot be allocated,
 * or the error from cpumask_parse() / apply_workqueue_attrs_locked().
 */
static ssize_t wq_cpumask_store(struct device *dev,
                                struct device_attribute *attr,
                                const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *new_attrs;
        int err = -ENOMEM;

        apply_wqattrs_lock();

        new_attrs = wq_sysfs_prep_attrs(wq);
        if (new_attrs) {
                err = cpumask_parse(buf, new_attrs->cpumask);
                if (err == 0)
                        err = apply_workqueue_attrs_locked(wq, new_attrs);
        }

        apply_wqattrs_unlock();
        free_workqueue_attrs(new_attrs);

        if (err)
                return err;
        return count;
}

static ssize_t wq_affn_scope_show(struct device *dev,
                                  struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int written;

        mutex_lock(&wq->mutex);
        if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL)
                written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
                                    wq_affn_names[WQ_AFFN_DFL],
                                    wq_affn_names[wq_affn_dfl]);
        else
                written = scnprintf(buf, PAGE_SIZE, "%s\n",
                                    wq_affn_names[wq->unbound_attrs->affn_scope]);
        mutex_unlock(&wq->mutex);

        return written;
}

/*
 * sysfs "affinity_scope" write side: translate @buf with parse_affn_scope()
 * and apply the resulting scope to the workqueue.
 *
 * Returns @count on success, the negative value from parse_affn_scope() on
 * bad input, -ENOMEM if the attrs copy cannot be allocated, or the error from
 * apply_workqueue_attrs_locked().
 */
static ssize_t wq_affn_scope_store(struct device *dev,
                                   struct device_attribute *attr,
                                   const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *new_attrs;
        int err = -ENOMEM;
        int scope;

        scope = parse_affn_scope(buf);
        if (scope < 0)
                return scope;

        apply_wqattrs_lock();

        new_attrs = wq_sysfs_prep_attrs(wq);
        if (new_attrs) {
                new_attrs->affn_scope = scope;
                err = apply_workqueue_attrs_locked(wq, new_attrs);
        }

        apply_wqattrs_unlock();
        free_workqueue_attrs(new_attrs);

        if (err)
                return err;
        return count;
}

static ssize_t wq_affinity_strict_show(struct device *dev,
                                       struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);

        return scnprintf(buf, PAGE_SIZE, "%d\n",
                         wq->unbound_attrs->affn_strict);
}

/*
 * sysfs "affinity_strict" write side: parse a decimal integer and store it as
 * a boolean (any non-zero value means true) in the workqueue's attrs.
 *
 * Returns @count on success, -EINVAL on unparsable input, -ENOMEM if the
 * attrs copy cannot be allocated, or the error from
 * apply_workqueue_attrs_locked().
 */
static ssize_t wq_affinity_strict_store(struct device *dev,
                                        struct device_attribute *attr,
                                        const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *new_attrs;
        int err = -ENOMEM;
        int value;

        if (sscanf(buf, "%d", &value) != 1)
                return -EINVAL;

        apply_wqattrs_lock();

        new_attrs = wq_sysfs_prep_attrs(wq);
        if (new_attrs) {
                new_attrs->affn_strict = value != 0;
                err = apply_workqueue_attrs_locked(wq, new_attrs);
        }

        apply_wqattrs_unlock();
        free_workqueue_attrs(new_attrs);

        if (err)
                return err;
        return count;
}

/*
 * Attributes created only for WQ_UNBOUND workqueues.  The array is walked by
 * workqueue_sysfs_register() until an entry with a NULL attr.name is found.
 */
static struct device_attribute wq_sysfs_unbound_attrs[] = {
        __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
        __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
        __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),
        __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store),
        __ATTR_NULL,
};

/*
 * The "workqueue" bus.  Each registered workqueue device sets its dev.bus to
 * this; wq_sysfs_groups supplies the attributes in wq_sysfs_attrs[].
 */
static const struct bus_type wq_subsys = {
        .name                                = "workqueue",
        .dev_groups                        = wq_sysfs_groups,
};

/**
 *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
 *  @cpumask: the cpumask to set
 *
 *  The low-level workqueue cpumask is a global cpumask that limits the
 *  affinity of all unbound workqueues.  This function checks @cpumask,
 *  applies it to all unbound workqueues and updates all of their pwqs.
 *  Note that @cpumask is modified in place: it is first restricted to
 *  cpu_possible_mask.
 *
 *  Return:        0        - Success
 *                -EINVAL        - Invalid @cpumask
 *                -ENOMEM        - Failed to allocate memory for attrs or pwqs.
 */
static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
{
        int ret;

        /*
         * Isolated CPUs are deliberately not filtered out here; if the
         * user wishes to include them, we allow that.
         */
        cpumask_and(cpumask, cpumask, cpu_possible_mask);
        if (cpumask_empty(cpumask))
                return -EINVAL;

        apply_wqattrs_lock();

        /* the request is recorded even when nothing needs to change */
        cpumask_copy(wq_requested_unbound_cpumask, cpumask);
        if (cpumask_equal(cpumask, wq_unbound_cpumask))
                ret = 0;
        else
                ret = workqueue_apply_unbound_cpumask(cpumask);

        apply_wqattrs_unlock();

        return ret;
}

/*
 * Common helper for the bus-level cpumask attributes: print @mask into @buf
 * as a bitmap while holding wq_pool_mutex.  @dev and @attr are unused.
 */
static ssize_t __wq_cpumask_show(struct device *dev,
                struct device_attribute *attr, char *buf, cpumask_var_t mask)
{
        int len;

        mutex_lock(&wq_pool_mutex);
        len = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
        mutex_unlock(&wq_pool_mutex);

        return len;
}

/* sysfs "cpumask_requested": prints wq_requested_unbound_cpumask. */
static ssize_t cpumask_requested_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
}
static DEVICE_ATTR_RO(cpumask_requested);

/* sysfs "cpumask_isolated": prints wq_isolated_cpumask. */
static ssize_t cpumask_isolated_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
}
static DEVICE_ATTR_RO(cpumask_isolated);

/* sysfs "cpumask" (bus level) read side: prints wq_unbound_cpumask. */
static ssize_t cpumask_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
}

/*
 * sysfs "cpumask" (bus level) write side: parse @buf into a temporary cpumask
 * and pass it to workqueue_set_unbound_cpumask().
 *
 * Returns @count on success, -ENOMEM if the temporary mask cannot be
 * allocated, or the error from parsing / applying the mask.
 */
static ssize_t cpumask_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        cpumask_var_t new_mask;
        int err;

        if (!zalloc_cpumask_var(&new_mask, GFP_KERNEL))
                return -ENOMEM;

        err = cpumask_parse(buf, new_mask);
        if (err == 0)
                err = workqueue_set_unbound_cpumask(new_mask);

        free_cpumask_var(new_mask);

        if (err)
                return err;
        return count;
}
static DEVICE_ATTR_RW(cpumask);

/*
 * NULL-terminated list of the bus-level cpumask attributes.  The groups
 * generated from it are passed to subsys_virtual_register() in
 * wq_sysfs_init().
 */
static struct attribute *wq_sysfs_cpumask_attrs[] = {
        &dev_attr_cpumask.attr,
        &dev_attr_cpumask_requested.attr,
        &dev_attr_cpumask_isolated.attr,
        NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs_cpumask);

/*
 * Register the "workqueue" subsystem together with the bus-level cpumask
 * attribute groups.  Run as a core_initcall.
 */
static int __init wq_sysfs_init(void)
{
        return subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups);
}
core_initcall(wq_sysfs_init);

/* Device release callback: free the wq_device that embeds @dev. */
static void wq_device_release(struct device *dev)
{
        kfree(container_of(dev, struct wq_device, dev));
}

/**
 * workqueue_sysfs_register - make a workqueue visible in sysfs
 * @wq: the workqueue to register
 *
 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
 * which is the preferred method.
 *
 * Workqueue user should use this function directly iff it wants to apply
 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
 * apply_workqueue_attrs() may race against userland updating the
 * attributes.
 *
 * Return: 0 on success, -errno on failure.
 */
int workqueue_sysfs_register(struct workqueue_struct *wq)
{
        struct wq_device *wq_dev;
        int ret;

        /*
         * Adjusting max_active breaks ordering guarantee.  Disallow exposing
         * ordered workqueues.
         */
        if (WARN_ON(wq->flags & __WQ_ORDERED))
                return -EINVAL;

        wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
        if (!wq_dev)
                return -ENOMEM;

        wq_dev->wq = wq;
        wq_dev->dev.bus = &wq_subsys;
        wq_dev->dev.release = wq_device_release;
        dev_set_name(&wq_dev->dev, "%s", wq->name);

        /*
         * unbound_attrs are created separately.  Suppress uevent until
         * everything is ready.
         */
        dev_set_uevent_suppress(&wq_dev->dev, true);

        ret = device_register(&wq_dev->dev);
        if (ret) {
                put_device(&wq_dev->dev);
                wq->wq_dev = NULL;
                return ret;
        }

        if (wq->flags & WQ_UNBOUND) {
                struct device_attribute *attr;

                for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
                        ret = device_create_file(&wq_dev->dev, attr);
                        if (ret) {
                                device_unregister(&wq_dev->dev);
                                wq->wq_dev = NULL;
                                return ret;
                        }
                }
        }

        dev_set_uevent_suppress(&wq_dev->dev, false);
        kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
        return 0;
}

/**
 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
 * @wq: the workqueue to unregister
 *
 * If @wq has a sysfs device (@wq->wq_dev is set), clear the pointer and
 * unregister the device.  Otherwise do nothing.
 */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
{
        struct wq_device *wq_dev = wq->wq_dev;

        if (wq_dev) {
                wq->wq_dev = NULL;
                device_unregister(&wq_dev->dev);
        }
}
#else        /* CONFIG_SYSFS */
/* Empty stub used when CONFIG_SYSFS is not enabled. */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)        { }
#endif        /* CONFIG_SYSFS */

/*
 * Workqueue watchdog.
 *
 * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
 * flush dependency, a concurrency managed work item which stays RUNNING
 * indefinitely.  Workqueue stalls can be very difficult to debug as the
 * usual warning mechanisms don't trigger and internal workqueue state is
 * largely opaque.
 *
 * Workqueue watchdog monitors all worker pools periodically and dumps
 * state if some pools failed to make forward progress for a while where
 * forward progress is defined as the first item on ->worklist changing.
 *
 * This mechanism is controlled through the kernel parameter
 * "workqueue.watchdog_thresh" which can be updated at runtime through the
 * corresponding sysfs parameter file.
 */
#ifdef CONFIG_WQ_WATCHDOG

/*
 * Stall threshold.  It is multiplied by HZ wherever it is turned into a
 * jiffies interval; a value of 0 makes the timer function return without
 * re-arming.
 */
static unsigned long wq_watchdog_thresh = 30;
/* timer whose callback is wq_watchdog_timer_fn() */
static struct timer_list wq_watchdog_timer;

/*
 * jiffies stamps written by wq_watchdog_touch() and
 * wq_watchdog_reset_touched(): one global, one per CPU.
 */
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;

/*
 * Print the workers of @pool that may be blocking pending work items.
 * Only workers whose task is currently running are candidates; in every
 * other situation another idle worker should pick up the pending items.
 *
 * Walks @pool->busy_hash with @pool->lock held and interrupts disabled.
 */
static void show_cpu_pool_hog(struct worker_pool *pool)
{
        struct worker *worker;
        unsigned long irq_flags;
        int bkt;

        raw_spin_lock_irqsave(&pool->lock, irq_flags);

        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                if (!task_is_running(worker->task))
                        continue;

                /*
                 * Defer printing to avoid deadlocks in console
                 * drivers that queue work while holding locks
                 * also taken in their write paths.
                 */
                printk_deferred_enter();

                pr_info("pool %d:\n", pool->id);
                sched_show_task(worker->task);

                printk_deferred_exit();
        }

        raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
}

/*
 * Under the RCU read lock, call show_cpu_pool_hog() on every pool whose
 * cpu_stall flag is set.
 */
static void show_cpu_pools_hogs(void)
{
        struct worker_pool *pool;
        int pi;

        pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");

        rcu_read_lock();

        for_each_pool(pool, pi) {
                if (!pool->cpu_stall)
                        continue;

                show_cpu_pool_hog(pool);
        }

        rcu_read_unlock();
}

static void wq_watchdog_reset_touched(void)
{
        int cpu;

        wq_watchdog_touched = jiffies;
        for_each_possible_cpu(cpu)
                per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
}

/*
 * Watchdog timer callback.  For every pool with a non-empty worklist, compare
 * the later of the pool's own timestamp and the relevant "touched" stamp
 * against the threshold and report a lockup when it has been exceeded.
 * Afterwards the touched stamps are reset and the timer is re-armed.
 *
 * Returns without re-arming when the threshold is 0.
 */
static void wq_watchdog_timer_fn(struct timer_list *unused)
{
        unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
        unsigned long now = jiffies;
        bool lockup_detected = false;
        bool cpu_pool_stall = false;
        struct worker_pool *pool;
        int pi;

        if (!thresh)
                return;

        rcu_read_lock();

        for_each_pool(pool, pi) {
                unsigned long pool_ts, touched, latest;

                pool->cpu_stall = false;
                if (list_empty(&pool->worklist))
                        continue;

                /*
                 * If a virtual machine is stopped by the host it can look to
                 * the watchdog like a stall.
                 */
                kvm_check_and_clear_guest_paused();

                /* per-cpu pools use their CPU's stamp, others the global one */
                touched = pool->cpu >= 0 ?
                        READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)) :
                        READ_ONCE(wq_watchdog_touched);
                pool_ts = READ_ONCE(pool->watchdog_ts);

                /* whichever of the two stamps is more recent */
                latest = time_after(pool_ts, touched) ? pool_ts : touched;

                /* no stall on this pool */
                if (!time_after(now, latest + thresh))
                        continue;

                lockup_detected = true;
                if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
                        pool->cpu_stall = true;
                        cpu_pool_stall = true;
                }
                pr_emerg("BUG: workqueue lockup - pool");
                pr_cont_pool_info(pool);
                pr_cont(" stuck for %us!\n",
                        jiffies_to_msecs(now - pool_ts) / 1000);
        }

        rcu_read_unlock();

        if (lockup_detected)
                show_all_workqueues();

        if (cpu_pool_stall)
                show_cpu_pools_hogs();

        wq_watchdog_reset_touched();
        mod_timer(&wq_watchdog_timer, jiffies + thresh);
}

/*
 * Record the current jiffies as the latest "touched" time.  The global stamp
 * is always updated; the per-CPU stamp of @cpu is updated as well when @cpu
 * is non-negative.
 */
notrace void wq_watchdog_touch(int cpu)
{
        if (cpu >= 0)
                per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;

        wq_watchdog_touched = jiffies;
}

/*
 * Install a new watchdog threshold.  The threshold is zeroed and the timer
 * stopped first; a non-zero @thresh then stores the new value, resets the
 * touched stamps and re-arms the timer @thresh * HZ jiffies from now.
 * A @thresh of 0 leaves the watchdog disabled.
 */
static void wq_watchdog_set_thresh(unsigned long thresh)
{
        /* a zero threshold makes a concurrently running timer fn bail out */
        wq_watchdog_thresh = 0;
        del_timer_sync(&wq_watchdog_timer);

        if (!thresh)
                return;

        wq_watchdog_thresh = thresh;
        wq_watchdog_reset_touched();
        mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
}

static int wq_watchdog_param_set_thresh(const char *val,
                                        const struct kernel_param *kp)
{
        unsigned long thresh;
        int ret;

        ret = kstrtoul(val, 0, &thresh);
        if (ret)
                return ret;

        if (system_wq)
                wq_watchdog_set_thresh(thresh);
        else
                wq_watchdog_thresh = thresh;

        return 0;
}

/*
 * Parameter ops for "watchdog_thresh": writes go through
 * wq_watchdog_param_set_thresh(), reads use the generic unsigned long getter.
 */
static const struct kernel_param_ops wq_watchdog_thresh_ops = {
        .set        = wq_watchdog_param_set_thresh,
        .get        = param_get_ulong,
};

/* expose wq_watchdog_thresh as "watchdog_thresh" with mode 0644 */
module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
                0644);

/*
 * Set up the deferrable watchdog timer and apply whatever threshold is
 * currently stored in wq_watchdog_thresh.
 */
static void wq_watchdog_init(void)
{
        timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
        wq_watchdog_set_thresh(wq_watchdog_thresh);
}

#else        /* CONFIG_WQ_WATCHDOG */

/* Empty stub used when CONFIG_WQ_WATCHDOG is not enabled. */
static inline void wq_watchdog_init(void) { }

#endif        /* CONFIG_WQ_WATCHDOG */

/*
 * irq_work callback installed for the first (index 0) BH pool of each CPU in
 * workqueue_init_early(): raises TASKLET_SOFTIRQ.  @irq_work is unused.
 */
static void bh_pool_kick_normal(struct irq_work *irq_work)
{
        raise_softirq_irqoff(TASKLET_SOFTIRQ);
}

/*
 * irq_work callback installed for the second (index 1) BH pool of each CPU in
 * workqueue_init_early(): raises HI_SOFTIRQ.  @irq_work is unused.
 */
static void bh_pool_kick_highpri(struct irq_work *irq_work)
{
        raise_softirq_irqoff(HI_SOFTIRQ);
}

/*
 * Narrow wq_unbound_cpumask to its intersection with @mask.  If the two do
 * not intersect at all, warn (identifying the source as @name) and leave
 * wq_unbound_cpumask untouched.
 */
static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
{
        if (cpumask_intersects(wq_unbound_cpumask, mask)) {
                cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
                return;
        }

        pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
                cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
}

/*
 * Initialize @pool as a pool tied to @cpu with nice level @nice: both
 * cpumasks in its attrs are set to just @cpu, affinity is marked strict, and
 * a pool ID is assigned under wq_pool_mutex.  Any failure is fatal (BUG).
 */
static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int nice)
{
        struct workqueue_attrs *attrs;

        BUG_ON(init_worker_pool(pool));

        attrs = pool->attrs;
        pool->cpu = cpu;
        cpumask_copy(attrs->cpumask, cpumask_of(cpu));
        cpumask_copy(attrs->__pod_cpumask, cpumask_of(cpu));
        attrs->nice = nice;
        attrs->affn_strict = true;
        pool->node = cpu_to_node(cpu);

        /* alloc pool ID */
        mutex_lock(&wq_pool_mutex);
        BUG_ON(worker_pool_assign_id(pool));
        mutex_unlock(&wq_pool_mutex);
}

/**
 * workqueue_init_early - early init for workqueue subsystem
 *
 * This is the first step of three-staged workqueue subsystem initialization and
 * invoked as soon as the bare basics - memory allocation, cpumasks and idr are
 * up. It sets up all the data structures and system workqueues and allows early
 * boot code to create workqueues and queue/cancel work items. Actual work item
 * execution starts only after kthreads can be created and scheduled right
 * before early initcalls.
 *
 * Every allocation failure in this function is fatal (BUG).
 */
void __init workqueue_init_early(void)
{
        struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
        int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
        void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal,
                                                       bh_pool_kick_highpri };
        int i, cpu;

        BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

        BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
        BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));

        /*
         * Start from all possible CPUs and narrow the unbound cpumask by the
         * housekeeping masks and, if one was given, the command line mask.
         */
        cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
        restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
        restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
        if (!cpumask_empty(&wq_cmdline_cpumask))
                restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);

        /* the boot-time result doubles as the initial "requested" mask */
        cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);

        pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

        wq_update_pod_attrs_buf = alloc_workqueue_attrs();
        BUG_ON(!wq_update_pod_attrs_buf);

        /*
         * If nohz_full is enabled, set power efficient workqueue as unbound.
         * This allows workqueue items to be moved to HK CPUs.
         */
        if (housekeeping_enabled(HK_TYPE_TICK))
                wq_power_efficient = true;

        /* initialize WQ_AFFN_SYSTEM pods */
        pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
        pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
        pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
        BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);

        BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));

        /* a single pod covering every possible CPU, not tied to any node */
        pt->nr_pods = 1;
        cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
        pt->pod_node[0] = NUMA_NO_NODE;
        pt->cpu_pod[0] = 0;

        /* initialize BH and CPU pools */
        for_each_possible_cpu(cpu) {
                struct worker_pool *pool;

                /* i indexes std_nice[] and irq_work_fns[] in step with the pools */
                i = 0;
                for_each_bh_worker_pool(pool, cpu) {
                        init_cpu_worker_pool(pool, cpu, std_nice[i]);
                        pool->flags |= POOL_BH;
                        init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]);
                        i++;
                }

                i = 0;
                for_each_cpu_worker_pool(pool, cpu)
                        init_cpu_worker_pool(pool, cpu, std_nice[i++]);
        }

        /* create default unbound and ordered wq attrs */
        for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
                struct workqueue_attrs *attrs;

                BUG_ON(!(attrs = alloc_workqueue_attrs()));
                attrs->nice = std_nice[i];
                unbound_std_wq_attrs[i] = attrs;

                /*
                 * An ordered wq should have only one pwq as ordering is
                 * guaranteed by max_active which is enforced by pwqs.
                 */
                BUG_ON(!(attrs = alloc_workqueue_attrs()));
                attrs->nice = std_nice[i];
                attrs->ordered = true;
                ordered_wq_attrs[i] = attrs;
        }

        /* allocate the system workqueues; all of them must succeed */
        system_wq = alloc_workqueue("events", 0, 0);
        system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
        system_long_wq = alloc_workqueue("events_long", 0, 0);
        system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                                            WQ_MAX_ACTIVE);
        system_freezable_wq = alloc_workqueue("events_freezable",
                                              WQ_FREEZABLE, 0);
        system_power_efficient_wq = alloc_workqueue("events_power_efficient",
                                              WQ_POWER_EFFICIENT, 0);
        system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
                                              WQ_FREEZABLE | WQ_POWER_EFFICIENT,
                                              0);
        system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
        system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
                                               WQ_BH | WQ_HIGHPRI, 0);
        BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
               !system_unbound_wq || !system_freezable_wq ||
               !system_power_efficient_wq ||
               !system_freezable_power_efficient_wq ||
               !system_bh_wq || !system_bh_highpri_wq);
}

/*
 * Create the pwq release kthread worker and, unless the user already chose a
 * value, pick the default for wq_cpu_intensive_thresh_us.
 */
static void __init wq_cpu_intensive_thresh_init(void)
{
        unsigned long thresh = 10 * USEC_PER_MSEC;
        unsigned long bogo;

        pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
        BUG_ON(IS_ERR(pwq_release_worker));

        /* ULONG_MAX means "not set"; any other value was chosen by the user */
        if (wq_cpu_intensive_thresh_us != ULONG_MAX)
                return;

        /*
         * The 10ms default comes from the observation that most modern (as
         * of 2023) processors can do a lot in 10ms and that it is just below
         * what most people consider human-perceivable. The kernel also runs
         * on far slower CPUs, including microcontrollers, for which that
         * threshold is much too low.
         *
         * So scale the threshold up, to at most 1 second, when BogoMIPS is
         * below 4000. This is by no means accurate but it doesn't have to
         * be. The mechanism remains useful even at the fully scaled-up
         * threshold, and since reports usually apply to everyone, a few
         * machines running with longer thresholds don't significantly
         * diminish their usefulness.
         */

        /* see init/calibrate.c for lpj -> BogoMIPS calculation */
        bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1);
        if (bogo < 4000)
                thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC);

        pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
                 loops_per_jiffy, bogo, thresh);

        wq_cpu_intensive_thresh_us = thresh;
}

/**
 * workqueue_init - bring workqueue subsystem fully online
 *
 * This is the second step of three-staged workqueue subsystem initialization
 * and invoked as soon as kthreads can be created and scheduled. Workqueues have
 * been created and work items queued on them, but there are no kworkers
 * executing the work items yet. Populate the worker pools with the initial
 * workers and enable future kworker creations.
 */
void __init workqueue_init(void)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int cpu, bkt;

        wq_cpu_intensive_thresh_init();

        mutex_lock(&wq_pool_mutex);

        /*
         * Per-cpu pools created earlier could be missing node hint. Fix them
         * up.
         */
        for_each_possible_cpu(cpu) {
                for_each_bh_worker_pool(pool, cpu) {
                        pool->node = cpu_to_node(cpu);
                }
                for_each_cpu_worker_pool(pool, cpu) {
                        pool->node = cpu_to_node(cpu);
                }
        }

        /* Also, create a rescuer for workqueues that requested it. */
        list_for_each_entry(wq, &workqueues, list) {
                WARN(init_rescuer(wq),
                     "workqueue: failed to create early rescuer for %s",
                     wq->name);
        }

        mutex_unlock(&wq_pool_mutex);

        /*
         * Create the initial workers. A BH pool has one pseudo worker that
         * represents the shared BH execution context and thus doesn't get
         * affected by hotplug events. Create the BH pseudo workers for all
         * possible CPUs here.
         */
        for_each_possible_cpu(cpu) {
                for_each_bh_worker_pool(pool, cpu) {
                        BUG_ON(!create_worker(pool));
                }
        }

        /* regular per-cpu pools get a worker only on CPUs that are online */
        for_each_online_cpu(cpu) {
                for_each_cpu_worker_pool(pool, cpu) {
                        pool->flags &= ~POOL_DISASSOCIATED;
                        BUG_ON(!create_worker(pool));
                }
        }

        /* and one worker for every pool in unbound_pool_hash */
        hash_for_each(unbound_pool_hash, bkt, pool, hash_node) {
                BUG_ON(!create_worker(pool));
        }

        wq_online = true;
        wq_watchdog_init();
}

/*
 * Initialize @pt. First fill @pt->cpu_pod[] with pod IDs as decided by
 * @cpus_share_pod(): each set of CPUs that share a pod gets a unique,
 * consecutively numbered pod ID. Then build @pt->pod_cpus[] and
 * @pt->pod_node[] to match. Allocation failures are fatal (BUG).
 */
static void __init init_pod_type(struct wq_pod_type *pt,
                                 bool (*cpus_share_pod)(int, int))
{
        int cur, pre, cpu, pod;

        pt->nr_pods = 0;

        /* init @pt->cpu_pod[] according to @cpus_share_pod() */
        pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
        BUG_ON(!pt->cpu_pod);

        for_each_possible_cpu(cur) {
                for_each_possible_cpu(pre) {
                        /* an earlier CPU that doesn't share a pod: keep looking */
                        if (pre < cur && !cpus_share_pod(cur, pre))
                                continue;

                        /*
                         * Either no earlier CPU shares with @cur, which then
                         * opens a new pod, or @pre does and @cur joins it.
                         */
                        if (pre >= cur)
                                pt->cpu_pod[cur] = pt->nr_pods++;
                        else
                                pt->cpu_pod[cur] = pt->cpu_pod[pre];
                        break;
                }
        }

        /* init the rest to match @pt->cpu_pod[] */
        pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
        pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL);
        BUG_ON(!pt->pod_cpus || !pt->pod_node);

        for (pod = 0; pod < pt->nr_pods; pod++) {
                BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL));
        }

        for_each_possible_cpu(cpu) {
                pod = pt->cpu_pod[cpu];
                cpumask_set_cpu(cpu, pt->pod_cpus[pod]);
                pt->pod_node[pod] = cpu_to_node(cpu);
        }
}

/*
 * Pod predicate used for WQ_AFFN_CPU: no two CPUs ever share a pod, so each
 * CPU ends up in a pod of its own.
 */
static bool __init cpus_dont_share(int cpu0, int cpu1)
{
        return false;
}

/*
 * Pod predicate used for WQ_AFFN_SMT: true when @cpu0 is in the SMT mask of
 * @cpu1.  Always false when CONFIG_SCHED_SMT is not enabled.
 */
static bool __init cpus_share_smt(int cpu0, int cpu1)
{
#ifdef CONFIG_SCHED_SMT
        return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
#else
        return false;
#endif
}

/* Pod predicate used for WQ_AFFN_NUMA: true when both CPUs map to the same node. */
static bool __init cpus_share_numa(int cpu0, int cpu1)
{
        const int node0 = cpu_to_node(cpu0);
        const int node1 = cpu_to_node(cpu1);

        return node0 == node1;
}

/**
 * workqueue_init_topology - initialize CPU pods for unbound workqueues
 *
 * This is the third step of three-staged workqueue subsystem initialization and
 * invoked after SMP and topology information are fully initialized. It
 * initializes the unbound CPU pods accordingly.
 */
void __init workqueue_init_topology(void)
{
        struct workqueue_struct *wq;
        int cpu;

        init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
        init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
        init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
        init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);

        wq_topo_initialized = true;

        mutex_lock(&wq_pool_mutex);

        /*
         * Workqueues allocated before this point have all CPUs sharing the
         * default worker pool. Call wq_update_pod() explicitly for every
         * workqueue and online CPU combination to apply per-pod sharing.
         */
        list_for_each_entry(wq, &workqueues, list) {
                for_each_online_cpu(cpu) {
                        wq_update_pod(wq, cpu, cpu, true);
                }

                if (!(wq->flags & WQ_UNBOUND))
                        continue;

                mutex_lock(&wq->mutex);
                wq_update_node_max_active(wq, -1);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);
}

/*
 * Print a warning that flushing system-wide workqueues is going to be
 * prohibited, followed by a stack dump that identifies the caller.
 */
void __warn_flushing_systemwide_wq(void)
{
        pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n");
        dump_stack();
}
EXPORT_SYMBOL(__warn_flushing_systemwide_wq);

/*
 * Handler for the "workqueue.unbound_cpus=" boot parameter. Parses @str as a
 * CPU list into wq_cmdline_cpumask; if parsing fails the mask is cleared and
 * a warning is logged. Returns 1 in every case.
 */
static int __init workqueue_unbound_cpus_setup(char *str)
{
        const int parse_rc = cpulist_parse(str, &wq_cmdline_cpumask);

        if (parse_rc >= 0)
                return 1;

        /* Bad CPU list: drop whatever was parsed and keep the default. */
        cpumask_clear(&wq_cmdline_cpumask);
        pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n");
        return 1;
}
__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup);


















































































   11 






    4 





































   24 






   22 






   22 































































































































































































    3 






    3 
















































    1 










































































































    3 































































































































































































































































































    3 






    3 













































   10 










    4 













1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM writeback

#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WRITEBACK_H

#include <linux/tracepoint.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>

/*
 * Render an inode state/flags bitmask as text through __print_flags(), using
 * "|" as the delimiter argument and the I_* flag/name pairs listed below.
 */
#define show_inode_state(state)                                        \
        __print_flags(state, "|",                                \
                {I_DIRTY_SYNC,                "I_DIRTY_SYNC"},        \
                {I_DIRTY_DATASYNC,        "I_DIRTY_DATASYNC"},        \
                {I_DIRTY_PAGES,                "I_DIRTY_PAGES"},        \
                {I_NEW,                        "I_NEW"},                \
                {I_WILL_FREE,                "I_WILL_FREE"},                \
                {I_FREEING,                "I_FREEING"},                \
                {I_CLEAR,                "I_CLEAR"},                \
                {I_SYNC,                "I_SYNC"},                \
                {I_DIRTY_TIME,                "I_DIRTY_TIME"},        \
                {I_REFERENCED,                "I_REFERENCED"}                \
        )

/*
 * Enums need to be exported to user space. On this first pass EM() and EMe()
 * both expand to TRACE_DEFINE_ENUM() and ignore their string argument, so the
 * bare WB_WORK_REASON expansion below emits one TRACE_DEFINE_ENUM() per
 * writeback reason.
 */
#undef EM
#undef EMe
#define EM(a,b)         TRACE_DEFINE_ENUM(a);
#define EMe(a,b)        TRACE_DEFINE_ENUM(a);

/* List of WB_REASON_* values with their printable names; EMe() is the last entry. */
#define WB_WORK_REASON                                                        \
        EM( WB_REASON_BACKGROUND,                "background")                \
        EM( WB_REASON_VMSCAN,                        "vmscan")                \
        EM( WB_REASON_SYNC,                        "sync")                        \
        EM( WB_REASON_PERIODIC,                        "periodic")                \
        EM( WB_REASON_LAPTOP_TIMER,                "laptop_timer")                \
        EM( WB_REASON_FS_FREE_SPACE,                "fs_free_space")        \
        EM( WB_REASON_FORKER_THREAD,                "forker_thread")        \
        EMe(WB_REASON_FOREIGN_FLUSH,                "foreign_flush")

WB_WORK_REASON

/*
 * Now redefine the EM() and EMe() macros to map the enums to the strings
 * that will be printed in the output.
 *
 * EM() produces "{ value, name }," and EMe() the same without the trailing
 * comma, so WB_WORK_REASON expands to a comma-separated list of pairs that
 * is passed to __print_symbolic() by the events further down.
 */
#undef EM
#undef EMe
#define EM(a,b)                { a, b },
#define EMe(a,b)        { a, b }

/* Forward declaration so the event prototypes below can take a pointer to it. */
struct wb_writeback_work;

/*
 * Event class for per-folio events. Records a 32-byte padded copy of the bdi
 * device name, the host inode number and folio->index.
 *
 * When @mapping is NULL, NULL is handed to bdi_dev_name() for the name. The
 * inode number is recorded as 0 when @mapping or mapping->host is NULL.
 */
DECLARE_EVENT_CLASS(writeback_folio_template,

        TP_PROTO(struct folio *folio, struct address_space *mapping),

        TP_ARGS(folio, mapping),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(pgoff_t, index)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(mapping ? inode_to_bdi(mapping->host) :
                                         NULL), 32);
                __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0;
                __entry->index = folio->index;
        ),

        TP_printk("bdi %s: ino=%lu index=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                __entry->index
        )
);

/* writeback_dirty_folio: instance of the writeback_folio_template class. */
DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio,

        TP_PROTO(struct folio *folio, struct address_space *mapping),

        TP_ARGS(folio, mapping)
);

/* folio_wait_writeback: instance of the writeback_folio_template class. */
DEFINE_EVENT(writeback_folio_template, folio_wait_writeback,

        TP_PROTO(struct folio *folio, struct address_space *mapping),

        TP_ARGS(folio, mapping)
);

/*
 * Event class for inode-dirtying events. Records the bdi device name, the
 * inode number, inode->i_state and the @flags argument. Both state and flags
 * are decoded with show_inode_state() when the event is printed.
 */
DECLARE_EVENT_CLASS(writeback_dirty_inode_template,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, flags)
        ),

        TP_fast_assign(
                struct backing_dev_info *bdi = inode_to_bdi(inode);

                /* may be called for files on pseudo FSes w/ unregistered bdi */
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->flags                = flags;
        ),

        TP_printk("bdi %s: ino=%lu state=%s flags=%s",
                __entry->name,
                (unsigned long)__entry->ino,
                show_inode_state(__entry->state),
                show_inode_state(__entry->flags)
        )
);

/* writeback_mark_inode_dirty: instance of writeback_dirty_inode_template. */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

/* writeback_dirty_inode_start: instance of writeback_dirty_inode_template. */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

/* writeback_dirty_inode: instance of writeback_dirty_inode_template. */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

#ifdef CREATE_TRACE_POINTS
#ifdef CONFIG_CGROUP_WRITEBACK

/* Return the inode number of the cgroup attached to @wb's memcg css. */
static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{
        struct cgroup *cgrp = wb->memcg_css->cgroup;

        return cgroup_ino(cgrp);
}

/*
 * Return the cgroup inode number for the wb attached to @wbc, or 1 when
 * @wbc has no wb set.
 */
static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{
        if (!wbc->wb)
                return 1;

        return __trace_wb_assign_cgroup(wbc->wb);
}
#else        /* CONFIG_CGROUP_WRITEBACK */

/* Variant built without CONFIG_CGROUP_WRITEBACK: @wb is ignored, always 1. */
static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{
        return 1;
}

/* Variant built without CONFIG_CGROUP_WRITEBACK: @wbc is ignored, always 1. */
static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{
        return 1;
}

#endif        /* CONFIG_CGROUP_WRITEBACK */
#endif        /* CREATE_TRACE_POINTS */

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * inode_foreign_history: records the bdi device name of @inode, its inode
 * number, the cgroup inode number derived from @wbc and the @history value,
 * which is printed in hexadecimal.
 */
TRACE_EVENT(inode_foreign_history,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc,
                 unsigned int history),

        TP_ARGS(inode, wbc, history),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                ino)
                __field(ino_t,                cgroup_ino)
                __field(unsigned int,        history)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
                __entry->history        = history;
        ),

        TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x",
                __entry->name,
                (unsigned long)__entry->ino,
                (unsigned long)__entry->cgroup_ino,
                __entry->history
        )
);

/*
 * inode_switch_wbs: records the inode number together with the cgroup inode
 * numbers of @old_wb and @new_wb. The bdi device name is taken from
 * old_wb->bdi.
 */
TRACE_EVENT(inode_switch_wbs,

        TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
                 struct bdi_writeback *new_wb),

        TP_ARGS(inode, old_wb, new_wb),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                ino)
                __field(ino_t,                old_cgroup_ino)
                __field(ino_t,                new_cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
                __entry->ino                = inode->i_ino;
                __entry->old_cgroup_ino        = __trace_wb_assign_cgroup(old_wb);
                __entry->new_cgroup_ino        = __trace_wb_assign_cgroup(new_wb);
        ),

        TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                (unsigned long)__entry->old_cgroup_ino,
                (unsigned long)__entry->new_cgroup_ino
        )
);

/*
 * track_foreign_dirty: records, for @folio and @wb, the bdi device name and
 * id, the host inode number (0 when the folio has no mapping or the mapping
 * has no host), the id of wb->memcg_css, the cgroup inode number of @wb and
 * the cgroup inode number obtained through folio_memcg(folio).
 */
TRACE_EVENT(track_foreign_dirty,

        TP_PROTO(struct folio *folio, struct bdi_writeback *wb),

        TP_ARGS(folio, wb),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(u64,                bdi_id)
                __field(ino_t,                ino)
                __field(unsigned int,        memcg_id)
                __field(ino_t,                cgroup_ino)
                __field(ino_t,                page_cgroup_ino)
        ),

        TP_fast_assign(
                struct address_space *mapping = folio_mapping(folio);
                struct inode *inode = mapping ? mapping->host : NULL;

                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->bdi_id                = wb->bdi->id;
                __entry->ino                = inode ? inode->i_ino : 0;
                __entry->memcg_id        = wb->memcg_css->id;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
                /* NOTE(review): folio_memcg() result is dereferenced unchecked - confirm it cannot be NULL here */
                __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
        ),

        TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",
                __entry->name,
                __entry->bdi_id,
                (unsigned long)__entry->ino,
                __entry->memcg_id,
                (unsigned long)__entry->cgroup_ino,
                (unsigned long)__entry->page_cgroup_ino
        )
);

/*
 * flush_foreign: records the bdi device name and cgroup inode number of @wb
 * along with the caller-supplied @frn_bdi_id and @frn_memcg_id.
 */
TRACE_EVENT(flush_foreign,

        TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id,
                 unsigned int frn_memcg_id),

        TP_ARGS(wb, frn_bdi_id, frn_memcg_id),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                cgroup_ino)
                __field(unsigned int,        frn_bdi_id)
                __field(unsigned int,        frn_memcg_id)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
                __entry->frn_bdi_id        = frn_bdi_id;
                __entry->frn_memcg_id        = frn_memcg_id;
        ),

        TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u",
                __entry->name,
                (unsigned long)__entry->cgroup_ino,
                __entry->frn_bdi_id,
                __entry->frn_memcg_id
        )
);
#endif

/*
 * Event class for inode write events. Records the bdi device name, the inode
 * number, wbc->sync_mode and the cgroup inode number derived from @wbc.
 */
DECLARE_EVENT_CLASS(writeback_write_inode_template,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(int, sync_mode)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                __entry->sync_mode,
                (unsigned long)__entry->cgroup_ino
        )
);

/* writeback_write_inode_start: instance of writeback_write_inode_template. */
DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc)
);

/* writeback_write_inode: instance of writeback_write_inode_template. */
DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc)
);

/*
 * Event class for events about a writeback work item. Copies nr_pages,
 * sync_mode, for_kupdate, range_cyclic, for_background and reason from @work,
 * plus the bdi device name and cgroup inode number of @wb. sb_dev is recorded
 * as 0 when work->sb is NULL. The reason is printed by name through the
 * WB_WORK_REASON table.
 */
DECLARE_EVENT_CLASS(writeback_work_class,
        TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
        TP_ARGS(wb, work),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_pages)
                __field(dev_t, sb_dev)
                __field(int, sync_mode)
                __field(int, for_kupdate)
                __field(int, range_cyclic)
                __field(int, for_background)
                __field(int, reason)
                __field(ino_t, cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->nr_pages = work->nr_pages;
                __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
                __entry->sync_mode = work->sync_mode;
                __entry->for_kupdate = work->for_kupdate;
                __entry->range_cyclic = work->range_cyclic;
                __entry->for_background        = work->for_background;
                __entry->reason = work->reason;
                __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
                  "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu",
                  __entry->name,
                  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
                  __entry->nr_pages,
                  __entry->sync_mode,
                  __entry->for_kupdate,
                  __entry->range_cyclic,
                  __entry->for_background,
                  __print_symbolic(__entry->reason, WB_WORK_REASON),
                  (unsigned long)__entry->cgroup_ino
        )
);
/*
 * Shorthand that defines event @name as an instance of writeback_work_class
 * with the class's (wb, work) prototype. The five events below use it.
 */
#define DEFINE_WRITEBACK_WORK_EVENT(name) \
DEFINE_EVENT(writeback_work_class, name, \
        TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
        TP_ARGS(wb, work))
DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);

/* writeback_pages_written: records and prints the single @pages_written value. */
TRACE_EVENT(writeback_pages_written,
        TP_PROTO(long pages_written),
        TP_ARGS(pages_written),
        TP_STRUCT__entry(
                __field(long,                pages)
        ),
        TP_fast_assign(
                __entry->pages                = pages_written;
        ),
        TP_printk("%ld", __entry->pages)
);

/*
 * Event class for events that only identify a bdi_writeback: records the bdi
 * device name and the cgroup inode number of @wb.
 */
DECLARE_EVENT_CLASS(writeback_class,
        TP_PROTO(struct bdi_writeback *wb),
        TP_ARGS(wb),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->cgroup_ino
        )
);
/*
 * Shorthand that defines event @name as an instance of writeback_class with
 * the class's single-argument (wb) prototype.
 */
#define DEFINE_WRITEBACK_EVENT(name) \
DEFINE_EVENT(writeback_class, name, \
        TP_PROTO(struct bdi_writeback *wb), \
        TP_ARGS(wb))

DEFINE_WRITEBACK_EVENT(writeback_wake_background);

/* writeback_bdi_register: records only the device name of @bdi. */
TRACE_EVENT(writeback_bdi_register,
        TP_PROTO(struct backing_dev_info *bdi),
        TP_ARGS(bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
        ),
        TP_printk("bdi %s",
                __entry->name
        )
);

/*
 * Event class that snapshots a writeback_control. Records the device name of
 * @bdi, the cgroup inode number derived from @wbc, and these @wbc fields:
 * nr_to_write, pages_skipped, sync_mode, for_kupdate, for_background,
 * for_reclaim, range_cyclic, range_start and range_end. The two range values
 * are cast to long when stored and printed in hexadecimal.
 */
DECLARE_EVENT_CLASS(wbc_class,
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
        TP_ARGS(wbc, bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_to_write)
                __field(long, pages_skipped)
                __field(int, sync_mode)
                __field(int, for_kupdate)
                __field(int, for_background)
                __field(int, for_reclaim)
                __field(int, range_cyclic)
                __field(long, range_start)
                __field(long, range_end)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->for_kupdate        = wbc->for_kupdate;
                __entry->for_background        = wbc->for_background;
                __entry->for_reclaim        = wbc->for_reclaim;
                __entry->range_cyclic        = wbc->range_cyclic;
                __entry->range_start        = (long)wbc->range_start;
                __entry->range_end        = (long)wbc->range_end;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
                "bgrd=%d reclm=%d cyclic=%d "
                "start=0x%lx end=0x%lx cgroup_ino=%lu",
                __entry->name,
                __entry->nr_to_write,
                __entry->pages_skipped,
                __entry->sync_mode,
                __entry->for_kupdate,
                __entry->for_background,
                __entry->for_reclaim,
                __entry->range_cyclic,
                __entry->range_start,
                __entry->range_end,
                (unsigned long)__entry->cgroup_ino
        )
)

/*
 * Shorthand that defines event @name as an instance of wbc_class with the
 * class's (wbc, bdi) prototype.
 */
#define DEFINE_WBC_EVENT(name) \
DEFINE_EVENT(wbc_class, name, \
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \
        TP_ARGS(wbc, bdi))
DEFINE_WBC_EVENT(wbc_writepage);

/*
 * writeback_queue_io: records the @dirtied_before cutoff both as given
 * ("older") and as an age, computed at trace time as
 * (jiffies - dirtied_before) * 1000 / HZ. Also records @moved (printed as
 * "enqueue"), work->reason (printed by name through WB_WORK_REASON), and the
 * bdi device name and cgroup inode number of @wb.
 */
TRACE_EVENT(writeback_queue_io,
        TP_PROTO(struct bdi_writeback *wb,
                 struct wb_writeback_work *work,
                 unsigned long dirtied_before,
                 int moved),
        TP_ARGS(wb, work, dirtied_before, moved),
        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(unsigned long,        older)
                __field(long,                age)
                __field(int,                moved)
                __field(int,                reason)
                __field(ino_t,                cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->older        = dirtied_before;
                __entry->age        = (jiffies - dirtied_before) * 1000 / HZ;
                __entry->moved        = moved;
                __entry->reason        = work->reason;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu",
                __entry->name,
                __entry->older,        /* dirtied_before in jiffies */
                __entry->age,        /* dirtied_before in relative milliseconds */
                __entry->moved,
                __print_symbolic(__entry->reason, WB_WORK_REASON),
                (unsigned long)__entry->cgroup_ino
        )
);

/*
 * global_dirty_state: records the two caller-supplied thresholds together
 * with values sampled when the event fires: the NR_FILE_DIRTY, NR_WRITEBACK,
 * NR_DIRTIED and NR_WRITTEN counters read through global_node_page_state(),
 * and global_wb_domain.dirty_limit.
 */
TRACE_EVENT(global_dirty_state,

        TP_PROTO(unsigned long background_thresh,
                 unsigned long dirty_thresh
        ),

        TP_ARGS(background_thresh,
                dirty_thresh
        ),

        TP_STRUCT__entry(
                __field(unsigned long,        nr_dirty)
                __field(unsigned long,        nr_writeback)
                __field(unsigned long,        background_thresh)
                __field(unsigned long,        dirty_thresh)
                __field(unsigned long,        dirty_limit)
                __field(unsigned long,        nr_dirtied)
                __field(unsigned long,        nr_written)
        ),

        TP_fast_assign(
                __entry->nr_dirty        = global_node_page_state(NR_FILE_DIRTY);
                __entry->nr_writeback        = global_node_page_state(NR_WRITEBACK);
                __entry->nr_dirtied        = global_node_page_state(NR_DIRTIED);
                __entry->nr_written        = global_node_page_state(NR_WRITTEN);
                __entry->background_thresh = background_thresh;
                __entry->dirty_thresh        = dirty_thresh;
                __entry->dirty_limit        = global_wb_domain.dirty_limit;
        ),

        TP_printk("dirty=%lu writeback=%lu "
                  "bg_thresh=%lu thresh=%lu limit=%lu "
                  "dirtied=%lu written=%lu",
                  __entry->nr_dirty,
                  __entry->nr_writeback,
                  __entry->background_thresh,
                  __entry->dirty_thresh,
                  __entry->dirty_limit,
                  __entry->nr_dirtied,
                  __entry->nr_written
        )
);

/*
 * Shift @x left by (PAGE_SHIFT - 10), i.e. multiply by PAGE_SIZE / 1024.
 * NOTE(review): presumably converts a rate in pages/s to KB/s - confirm
 * against the callers' units.
 */
#define KBps(x)                        ((x) << (PAGE_SHIFT - 10))

/*
 * bdi_dirty_ratelimit: records the bandwidth and ratelimit figures for @wb.
 * write_bandwidth, avg_write_bandwidth, dirty_ratelimit and
 * balanced_dirty_ratelimit are read from @wb, while @dirty_rate and
 * @task_ratelimit come from the caller. Every value is passed through KBps()
 * before being stored. Also records the bdi device name and the cgroup inode
 * number of @wb.
 */
TRACE_EVENT(bdi_dirty_ratelimit,

        TP_PROTO(struct bdi_writeback *wb,
                 unsigned long dirty_rate,
                 unsigned long task_ratelimit),

        TP_ARGS(wb, dirty_rate, task_ratelimit),

        TP_STRUCT__entry(
                __array(char,                bdi, 32)
                __field(unsigned long,        write_bw)
                __field(unsigned long,        avg_write_bw)
                __field(unsigned long,        dirty_rate)
                __field(unsigned long,        dirty_ratelimit)
                __field(unsigned long,        task_ratelimit)
                __field(unsigned long,        balanced_dirty_ratelimit)
                __field(ino_t,                cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);
                __entry->write_bw        = KBps(wb->write_bandwidth);
                __entry->avg_write_bw        = KBps(wb->avg_write_bandwidth);
                __entry->dirty_rate        = KBps(dirty_rate);
                __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
                __entry->task_ratelimit        = KBps(task_ratelimit);
                __entry->balanced_dirty_ratelimit =
                                        KBps(wb->balanced_dirty_ratelimit);
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),

        TP_printk("bdi %s: "
                  "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "balanced_dirty_ratelimit=%lu cgroup_ino=%lu",
                  __entry->bdi,
                  __entry->write_bw,                /* write bandwidth */
                  __entry->avg_write_bw,        /* avg write bandwidth */
                  __entry->dirty_rate,                /* bdi dirty rate */
                  __entry->dirty_ratelimit,        /* base ratelimit */
                  __entry->task_ratelimit, /* ratelimit with position control */
                  __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * balance_dirty_pages: records the throttling inputs passed by the caller
 * plus several values derived at trace time:
 *
 *   limit        = global_wb_domain.dirty_limit
 *   setpoint     = (limit + (thresh + bg_thresh) / 2) / 2
 *   bdi_setpoint = setpoint * bdi_thresh / (thresh + 1)
 *   think        = 0 if current->dirty_paused_when is 0, otherwise
 *                  (jiffies - current->dirty_paused_when) * 1000 / HZ
 *   paused       = (jiffies - start_time) * 1000 / HZ
 *
 * @period and @pause are scaled by 1000 / HZ, and the two ratelimits are
 * passed through KBps(). dirtied_pause is read from current->nr_dirtied_pause.
 */
TRACE_EVENT(balance_dirty_pages,

        TP_PROTO(struct bdi_writeback *wb,
                 unsigned long thresh,
                 unsigned long bg_thresh,
                 unsigned long dirty,
                 unsigned long bdi_thresh,
                 unsigned long bdi_dirty,
                 unsigned long dirty_ratelimit,
                 unsigned long task_ratelimit,
                 unsigned long dirtied,
                 unsigned long period,
                 long pause,
                 unsigned long start_time),

        TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
                dirty_ratelimit, task_ratelimit,
                dirtied, period, pause, start_time),

        TP_STRUCT__entry(
                __array(         char,        bdi, 32)
                __field(unsigned long,        limit)
                __field(unsigned long,        setpoint)
                __field(unsigned long,        dirty)
                __field(unsigned long,        bdi_setpoint)
                __field(unsigned long,        bdi_dirty)
                __field(unsigned long,        dirty_ratelimit)
                __field(unsigned long,        task_ratelimit)
                __field(unsigned int,        dirtied)
                __field(unsigned int,        dirtied_pause)
                __field(unsigned long,        paused)
                __field(         long,        pause)
                __field(unsigned long,        period)
                __field(         long,        think)
                __field(ino_t,                cgroup_ino)
        ),

        TP_fast_assign(
                unsigned long freerun = (thresh + bg_thresh) / 2;
                strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);

                __entry->limit                = global_wb_domain.dirty_limit;
                __entry->setpoint        = (global_wb_domain.dirty_limit +
                                                freerun) / 2;
                __entry->dirty                = dirty;
                /* thresh + 1 keeps the divisor non-zero when thresh is 0 */
                __entry->bdi_setpoint        = __entry->setpoint *
                                                bdi_thresh / (thresh + 1);
                __entry->bdi_dirty        = bdi_dirty;
                __entry->dirty_ratelimit = KBps(dirty_ratelimit);
                __entry->task_ratelimit        = KBps(task_ratelimit);
                /* unsigned long argument stored into an unsigned int field */
                __entry->dirtied        = dirtied;
                __entry->dirtied_pause        = current->nr_dirtied_pause;
                __entry->think                = current->dirty_paused_when == 0 ? 0 :
                         (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
                __entry->period                = period * 1000 / HZ;
                __entry->pause                = pause * 1000 / HZ;
                __entry->paused                = (jiffies - start_time) * 1000 / HZ;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),


        TP_printk("bdi %s: "
                  "limit=%lu setpoint=%lu dirty=%lu "
                  "bdi_setpoint=%lu bdi_dirty=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "dirtied=%u dirtied_pause=%u "
                  "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu",
                  __entry->bdi,
                  __entry->limit,
                  __entry->setpoint,
                  __entry->dirty,
                  __entry->bdi_setpoint,
                  __entry->bdi_dirty,
                  __entry->dirty_ratelimit,
                  __entry->task_ratelimit,
                  __entry->dirtied,
                  __entry->dirtied_pause,
                  __entry->paused,        /* ms */
                  __entry->pause,        /* ms */
                  __entry->period,        /* ms */
                  __entry->think,        /* ms */
                  (unsigned long)__entry->cgroup_ino
          )
);

/*
 * writeback_sb_inodes_requeue: records the bdi device name, inode number,
 * i_state and dirtied_when of @inode, plus the cgroup inode number of
 * inode_to_wb(inode).
 *
 * The printed "age" is not stored in the entry: it is computed inside
 * TP_printk() as (jiffies - dirtied_when) / HZ.
 */
TRACE_EVENT(writeback_sb_inodes_requeue,

        TP_PROTO(struct inode *inode),
        TP_ARGS(inode),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->dirtied_when        = inode->dirtied_when;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(inode_to_wb(inode));
        ),

        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
                  (jiffies - __entry->dirtied_when) / HZ,
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * writeback_single_inode_template - event class for tracepoints that take
 * an inode, its writeback_control and an nr_to_write value.
 *
 * Besides the backing device name, inode number, i_state and dirtied_when,
 * the entry records the mapping's writeback_index, the nr_to_write argument
 * as passed in, and "wrote", which is that argument minus the value found in
 * wbc->nr_to_write when the event fires.  cgroup_ino comes from
 * __trace_wbc_assign_cgroup(wbc).
 *
 * "age" is not stored: the print expression derives it from jiffies and the
 * recorded dirtied_when.
 */
DECLARE_EVENT_CLASS(writeback_single_inode_template,

        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write
        ),

        TP_ARGS(inode, wbc, nr_to_write),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
                __field(unsigned long, writeback_index)
                __field(long, nr_to_write)
                __field(unsigned long, wrote)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->dirtied_when        = inode->dirtied_when;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->nr_to_write        = nr_to_write;
                /* difference between the passed-in budget and what wbc holds now */
                __entry->wrote                = nr_to_write - wbc->nr_to_write;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
                  "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
                  (jiffies - __entry->dirtied_when) / HZ,        /* jiffies / HZ */
                  __entry->writeback_index,
                  __entry->nr_to_write,
                  __entry->wrote,
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * writeback_single_inode_start: event instantiated from
 * writeback_single_inode_template with the class's (inode, wbc, nr_to_write)
 * arguments.
 */
DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start,
        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write),
        TP_ARGS(inode, wbc, nr_to_write)
);

/*
 * writeback_single_inode: event instantiated from
 * writeback_single_inode_template with the class's (inode, wbc, nr_to_write)
 * arguments.
 */
DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write),
        TP_ARGS(inode, wbc, nr_to_write)
);

/*
 * writeback_inode_template - event class for tracepoints that take only an
 * inode.
 *
 * The entry records the superblock's device number, the inode number,
 * i_state, i_mode and dirtied_when.  The device is printed as major,minor
 * and the mode in octal.
 */
DECLARE_EVENT_CLASS(writeback_inode_template,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(unsigned long,        state                        )
                __field(        __u16, mode                        )
                __field(unsigned long, dirtied_when                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->state        = inode->i_state;
                __entry->mode        = inode->i_mode;
                __entry->dirtied_when = inode->dirtied_when;
        ),

        TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long)__entry->ino, __entry->dirtied_when,
                  show_inode_state(__entry->state), __entry->mode)
);

/* writeback_lazytime: event instantiated from writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, writeback_lazytime,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/* writeback_lazytime_iput: event instantiated from writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * writeback_dirty_inode_enqueue: event instantiated from
 * writeback_inode_template.
 */
DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * Inode writeback list tracking.
 */

/* sb_mark_inode_writeback: event instantiated from writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

/* sb_clear_inode_writeback: event instantiated from writeback_inode_template. */
DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

#endif /* _TRACE_WRITEBACK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>














































































    4 



    3 




















    4 



























    3 













    4 


































































    4 










    4 













    4 


























    4 


    4 











    2 



























    4 




    4 




































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
 *
 *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson
 *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
 *  2000-2002   x86-64 support by Andi Kleen
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/kstrtox.h>
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/syscalls.h>
#include <linux/rseq.h>

#include <asm/processor.h>
#include <asm/ucontext.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/xstate.h>
#include <asm/vdso.h>
#include <asm/mce.h>
#include <asm/sighandling.h>
#include <asm/vm86.h>

#include <asm/syscall.h>
#include <asm/sigframe.h>
#include <asm/signal.h>
#include <asm/shstk.h>

/*
 * Return 1 when IA32 emulation is built in and the handler's sigaction flags
 * carry SA_IA32_ABI, otherwise 0.
 */
static inline int is_ia32_compat_frame(struct ksignal *ksig)
{
        if (!IS_ENABLED(CONFIG_IA32_EMULATION))
                return 0;

        return (ksig->ka.sa.sa_flags & SA_IA32_ABI) != 0;
}

/*
 * Return 1 when the signal needs an ia32 frame: always on a CONFIG_X86_32
 * build, otherwise whenever is_ia32_compat_frame() says so.
 */
static inline int is_ia32_frame(struct ksignal *ksig)
{
        if (IS_ENABLED(CONFIG_X86_32))
                return 1;

        return is_ia32_compat_frame(ksig) != 0;
}

/*
 * Return 1 when the x32 ABI is built in and the handler's sigaction flags
 * carry SA_X32_ABI, otherwise 0.
 */
static inline int is_x32_frame(struct ksignal *ksig)
{
        if (!IS_ENABLED(CONFIG_X86_X32_ABI))
                return 0;

        return (ksig->ka.sa.sa_flags & SA_X32_ABI) != 0;
}

/*
 * Set up a signal frame.
 */

/* x86 ABI requires 16-byte alignment */
#define FRAME_ALIGNMENT        16UL

/* Largest number of bytes that aligning to FRAME_ALIGNMENT can consume. */
#define MAX_FRAME_PADDING        (FRAME_ALIGNMENT - 1)

/*
 * Determine which stack to use and where on it the signal frame goes.
 *
 * @ksig:       signal being delivered; its sigaction flags select the stack
 * @regs:       user register state; regs->sp is the starting stack pointer
 * @frame_size: size of the frame to place below the FPU state
 * @fpstate:    out parameter, set to the user address chosen for the FPU
 *              state
 *
 * Starting from regs->sp (minus 128 bytes for non-ia32 frames), the stack
 * pointer is optionally moved to the alternate or legacy signal stack, the
 * FPU state area is carved out via fpu__alloc_mathframe(), @frame_size bytes
 * are reserved and the result is aligned.
 *
 * Returns the user address of the frame, or (void __user *)-1L if the frame
 * would fall outside the alternate signal stack or if
 * copy_fpstate_to_sigframe() reports failure.
 */
void __user *
get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
             void __user **fpstate)
{
        struct k_sigaction *ka = &ksig->ka;
        int ia32_frame = is_ia32_frame(ksig);
        /* Default to using normal stack */
        bool nested_altstack = on_sig_stack(regs->sp);
        bool entering_altstack = false;
        unsigned long math_size = 0;
        unsigned long sp = regs->sp;
        unsigned long buf_fx = 0;

        /* redzone */
        if (!ia32_frame)
                sp -= 128;

        /* This is the X/Open sanctioned signal stack switching.  */
        if (ka->sa.sa_flags & SA_ONSTACK) {
                /*
                 * This checks nested_altstack via sas_ss_flags(). Sensible
                 * programs use SS_AUTODISARM, which disables that check, and
                 * programs that don't use SS_AUTODISARM get compatible
                 * behavior.
                 */
                if (sas_ss_flags(sp) == 0) {
                        /* start at the end (highest address) of the altstack */
                        sp = current->sas_ss_sp + current->sas_ss_size;
                        entering_altstack = true;
                }
        } else if (ia32_frame &&
                   !nested_altstack &&
                   regs->ss != __USER_DS &&
                   !(ka->sa.sa_flags & SA_RESTORER) &&
                   ka->sa.sa_restorer) {
                /* This is the legacy signal stack switching. */
                sp = (unsigned long) ka->sa.sa_restorer;
                entering_altstack = true;
        }

        /*
         * Returns the new stack pointer and fills in buf_fx and math_size,
         * which are handed to copy_fpstate_to_sigframe() below.
         */
        sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size);
        *fpstate = (void __user *)sp;

        sp -= frame_size;

        if (ia32_frame)
                /*
                 * Align the stack pointer according to the i386 ABI,
                 * i.e. so that on function entry ((sp + 4) & 15) == 0.
                 */
                sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
        else
                /* leaves (sp + 8) aligned to FRAME_ALIGNMENT */
                sp = round_down(sp, FRAME_ALIGNMENT) - 8;

        /*
         * If we are on the alternate signal stack and would overflow it, don't.
         * Return an always-bogus address instead so we will die with SIGSEGV.
         */
        if (unlikely((nested_altstack || entering_altstack) &&
                     !__on_sig_stack(sp))) {

                if (show_unhandled_signals && printk_ratelimit())
                        pr_info("%s[%d] overflowed sigaltstack\n",
                                current->comm, task_pid_nr(current));

                return (void __user *)-1L;
        }

        /* save i387 and extended state */
        if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size))
                return (void __user *)-1L;

        return (void __user *)sp;
}

/*
 * There are four different struct types for signal frame: sigframe_ia32,
 * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
 * -- the largest size. It means the size for 64-bit apps is a bit more
 * than needed, but this keeps the code simple.
 */
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
# define MAX_FRAME_SIGINFO_UCTXT_SIZE        sizeof(struct sigframe_ia32)
#else
# define MAX_FRAME_SIGINFO_UCTXT_SIZE        sizeof(struct rt_sigframe)
#endif

/*
 * The FP state frame contains an XSAVE buffer which must be 64-byte aligned.
 * If a signal frame starts at an unaligned address, extra space is required.
 * This is the max alignment padding, conservatively.
 */
#define MAX_XSAVE_PADDING        63UL

/*
 * The frame data is composed of the following areas and laid out as:
 *
 * -------------------------
 * | alignment padding     |
 * -------------------------
 * | (f)xsave frame        |
 * -------------------------
 * | fsave header          |
 * -------------------------
 * | alignment padding     |
 * -------------------------
 * | siginfo + ucontext    |
 * -------------------------
 */

/* max_frame_size tells userspace the worst case signal stack size. */
static unsigned long __ro_after_init max_frame_size;
/*
 * Value returned by fpu__get_fpstate_size() in init_sigframe_size(); it is
 * the FPU state component that is added into max_frame_size.
 */
static unsigned int __ro_after_init fpu_default_state_size;

/*
 * Compute max_frame_size once at early init: the largest siginfo/ucontext
 * frame plus the default FPU state size, each with its worst-case alignment
 * padding, rounded up to FRAME_ALIGNMENT.  Always returns 0.
 */
static int __init init_sigframe_size(void)
{
        unsigned long size;

        fpu_default_state_size = fpu__get_fpstate_size();

        /* siginfo + ucontext part, then the FP state part */
        size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING;
        size += fpu_default_state_size + MAX_XSAVE_PADDING;

        /* Userspace expects an aligned size. */
        max_frame_size = round_up(size, FRAME_ALIGNMENT);

        pr_info("max sigframe size: %lu\n", max_frame_size);

        return 0;
}
early_initcall(init_sigframe_size);

/* Return the worst-case signal frame size computed by init_sigframe_size(). */
unsigned long get_sigframe_size(void)
{
        const unsigned long size = max_frame_size;

        return size;
}

/*
 * Run the rseq fixup for the pre-signal frame, then build the user signal
 * frame with the ia32, x32 or x64 helper matching the handler's ABI.
 * Returns whatever the chosen helper returns.
 */
static int
setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
        /* Perform fixup for the pre-signal frame. */
        rseq_signal_deliver(ksig, regs);

        /* Non-ia32 handlers: x32 if flagged, native 64-bit otherwise. */
        if (!is_ia32_frame(ksig)) {
                if (is_x32_frame(ksig))
                        return x32_setup_rt_frame(ksig, regs);

                return x64_setup_rt_frame(ksig, regs);
        }

        /* ia32 handlers get an rt frame only when they asked for siginfo. */
        if (ksig->ka.sa.sa_flags & SA_SIGINFO)
                return ia32_setup_rt_frame(ksig, regs);

        return ia32_setup_frame(ksig, regs);
}

/*
 * Deliver one signal to a user handler: fix up the registers of an
 * interrupted system call, set up the signal frame and report the outcome
 * through signal_setup_done().
 */
static void
handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
        bool single_step, setup_failed;

        if (v8086_mode(regs))
                save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);

        /* Are we from a system call? */
        if (syscall_get_nr(current, regs) != -1) {
                /* If so, check system call restarting.. */
                const long err = syscall_get_error(current, regs);

                if (err == -ERESTARTNOINTR ||
                    (err == -ERESTARTSYS &&
                     (ksig->ka.sa.sa_flags & SA_RESTART))) {
                        /* Re-issue the call: restore ax and back ip up by 2. */
                        regs->ax = regs->orig_ax;
                        regs->ip -= 2;
                } else if (err == -ERESTARTSYS ||
                           err == -ERESTARTNOHAND ||
                           err == -ERESTART_RESTARTBLOCK) {
                        /* A handler runs, so the call reports -EINTR. */
                        regs->ax = -EINTR;
                }
        }

        /*
         * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now
         * so that register information in the sigcontext is correct and
         * then notify the tracer before entering the signal handler.
         */
        single_step = test_thread_flag(TIF_SINGLESTEP);
        if (single_step)
                user_disable_single_step(current);

        setup_failed = setup_rt_frame(ksig, regs) < 0;
        if (!setup_failed) {
                /*
                 * Clear the direction flag as per the ABI for function entry.
                 *
                 * Clear RF when entering the signal handler, because
                 * it might disable possible debug exception from the
                 * signal handler.
                 *
                 * Clear TF for the case when it wasn't set by debugger to
                 * avoid the recursive send_sigtrap() in SIGTRAP handler.
                 */
                regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);

                /* Ensure the signal handler starts with the new fpu state. */
                fpu__clear_user_states(&current->thread.fpu);
        }

        signal_setup_done(setup_failed, ksig, single_step);
}

/*
 * Pick the restart_syscall number for the current task: the ia32 number when
 * the restart block is marked TS_COMPAT (IA32 emulation builds only),
 * otherwise the native number, carrying over __X32_SYSCALL_BIT from orig_ax
 * on x32-enabled builds.
 */
static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
{
        unsigned long nr = __NR_restart_syscall;

#ifdef CONFIG_IA32_EMULATION
        if (current->restart_block.arch_data & TS_COMPAT)
                return __NR_ia32_restart_syscall;
#endif
#ifdef CONFIG_X86_X32_ABI
        nr |= regs->orig_ax & __X32_SYSCALL_BIT;
#endif
        return nr;
}

/*
 * Note that 'init' is a special process: it doesn't get signals it doesn't
 * want to handle. Thus you cannot kill init even with a SIGKILL even by
 * mistake.
 */
void arch_do_signal_or_restart(struct pt_regs *regs)
{
        struct ksignal ksig;

        /* A signal with a handler is pending: deliver it and we are done. */
        if (get_signal(&ksig)) {
                handle_signal(&ksig, regs);
                return;
        }

        /* No handler ran. If we interrupted a system call, arrange a restart. */
        if (syscall_get_nr(current, regs) != -1) {
                const long err = syscall_get_error(current, regs);

                if (err == -ERESTART_RESTARTBLOCK) {
                        /* Restart through the restart_syscall entry instead. */
                        regs->ax = get_nr_restart_syscall(regs);
                        regs->ip -= 2;
                } else if (err == -ERESTARTNOHAND ||
                           err == -ERESTARTSYS ||
                           err == -ERESTARTNOINTR) {
                        /* Re-issue the original call. */
                        regs->ax = regs->orig_ax;
                        regs->ip -= 2;
                }
        }

        /*
         * Nothing was delivered, so simply put the saved sigmask back.
         */
        restore_saved_sigmask();
}

/*
 * Report a bad signal frame and send SIGSEGV to the current task.
 *
 * @regs:  register state whose ip, sp and orig_ax are logged
 * @frame: user address of the offending frame
 * @where: short description of the caller, included in the message
 *
 * The message is emitted only when show_unhandled_signals is set and the
 * printk rate limit allows it; the signal is forced unconditionally.
 */
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
{
        struct task_struct *me = current;
        const bool report = show_unhandled_signals && printk_ratelimit();

        if (report) {
                /* pid 1 gets the loudest level, everything else is informational */
                const char *level = task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG;

                printk("%s%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
                       level, me->comm, me->pid, where, frame,
                       regs->ip, regs->sp, regs->orig_ax);
                print_vma_addr(KERN_CONT " in ", regs->ip);
                pr_cont("\n");
        }

        force_sig(SIGSEGV);
}

#ifdef CONFIG_DYNAMIC_SIGFRAME
/*
 * Build-time default for strict sigaltstack size checking; may be overridden
 * from the kernel command line by strict_sas_size().
 */
static bool strict_sigaltstack_size __ro_after_init =
        IS_ENABLED(CONFIG_STRICT_SIGALTSTACK_SIZE);

/*
 * Parse the "strict_sas_size=<bool>" kernel command line option into
 * strict_sigaltstack_size.
 *
 * @arg: text following the option string registered below
 *
 * Returns 1 when @arg is a valid boolean (option handled), 0 otherwise.
 *
 * The registered option string must end in '=': the __setup() machinery hands
 * the handler everything after the matched string, so without the '=' the
 * argument would begin with "=" and kstrtobool() would reject every value.
 */
static int __init strict_sas_size(char *arg)
{
        return kstrtobool(arg, &strict_sigaltstack_size) == 0;
}
__setup("strict_sas_size=", strict_sas_size);

/*
 * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512
 * exceeds that size already. As such programs might never use the
 * sigaltstack they just continued to work. While always checking against
 * the real size would be correct, this might be considered a regression.
 *
 * Therefore avoid the sanity check, unless enforced by kernel
 * configuration or command line option.
 *
 * When dynamic FPU features are supported, the check is also enforced when
 * the task has permissions to use dynamic features. Tasks which have no
 * permission are checked against the size of the non-dynamic feature set
 * if strict checking is enabled. This avoids forcing all tasks on the
 * system to allocate large sigaltstacks even if they are never going
 * to use a dynamic feature. As this is serialized via sighand::siglock
 * any permission request for a dynamic feature either happened already
 * or will see the newly installed sigaltstack size in the permission checks.
 */
bool sigaltstack_size_valid(size_t ss_size)
{
        /* Frame overhead without any FPU state; the task's share is added below. */
        unsigned long needed = max_frame_size - fpu_default_state_size;

        lockdep_assert_held(&current->sighand->siglock);

        /* No dynamic FPU state sizes and no strict mode: accept anything. */
        if (!fpu_state_size_dynamic() && !strict_sigaltstack_size)
                return true;

        needed += current->group_leader->thread.fpu.perm.__user_state_size;

        /* Common case: the stack is strictly larger than required. */
        if (likely(ss_size > needed))
                return true;

        /* Too small from here on: strict mode always rejects it. */
        if (strict_sigaltstack_size)
                return false;

        /* Otherwise reject only tasks permitted to use dynamic features. */
        return !(current->group_leader->thread.fpu.perm.__state_perm &
                 XFEATURE_MASK_USER_DYNAMIC);
}
#endif /* CONFIG_DYNAMIC_SIGFRAME */




























































    5 




















































































































































































































































































































































































































































































































































































































































































    5 









    5 



































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
// SPDX-License-Identifier: GPL-2.0
/*
 * Encryption policy functions for per-file encryption support.
 *
 * Copyright (C) 2015, Google, Inc.
 * Copyright (C) 2015, Motorola Mobility.
 *
 * Originally written by Michael Halcrow, 2015.
 * Modified by Jaegeuk Kim, 2015.
 * Modified by Eric Biggers, 2019 for v2 policy support.
 */

#include <linux/fs_context.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/mount.h>
#include "fscrypt_private.h"

/**
 * fscrypt_policies_equal() - compare two encryption policies for equality
 * @policy1: one policy to compare
 * @policy2: the other policy to compare
 *
 * Two policies are equal when their versions match and the first
 * fscrypt_policy_size(@policy1) bytes of both are identical.
 *
 * Return: %true if the policies are equal, otherwise %false
 */
bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
                            const union fscrypt_policy *policy2)
{
        const bool same_version = policy1->version == policy2->version;

        return same_version &&
               memcmp(policy1, policy2, fscrypt_policy_size(policy1)) == 0;
}

/*
 * Fill @key_spec with the master key reference held by @policy: the key
 * descriptor for a v1 policy, the key identifier for a v2 policy.
 *
 * Returns 0 on success.  Any other policy version triggers a one-time warning
 * and returns -EINVAL.
 */
int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy,
                               struct fscrypt_key_specifier *key_spec)
{
        if (policy->version == FSCRYPT_POLICY_V1) {
                key_spec->type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
                memcpy(key_spec->u.descriptor,
                       policy->v1.master_key_descriptor,
                       FSCRYPT_KEY_DESCRIPTOR_SIZE);
                return 0;
        }

        if (policy->version == FSCRYPT_POLICY_V2) {
                key_spec->type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
                memcpy(key_spec->u.identifier,
                       policy->v2.master_key_identifier,
                       FSCRYPT_KEY_IDENTIFIER_SIZE);
                return 0;
        }

        WARN_ON_ONCE(1);
        return -EINVAL;
}

/*
 * Return the filesystem's dummy policy as reported by its get_dummy_policy
 * operation, or NULL when the filesystem does not provide that operation.
 */
const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb)
{
        if (sb->s_cop->get_dummy_policy)
                return sb->s_cop->get_dummy_policy(sb);

        return NULL;
}

/*
 * Return %true if the given combination of encryption modes is supported for v1
 * (and later) encryption policies.
 *
 * Do *not* add anything new here, since v1 encryption policies are deprecated.
 * New combinations of modes should go in fscrypt_valid_enc_modes_v2() only.
 */
static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode)
{
        /* Each accepted contents mode pairs with exactly one filenames mode. */
        switch (contents_mode) {
        case FSCRYPT_MODE_AES_256_XTS:
                return filenames_mode == FSCRYPT_MODE_AES_256_CTS;
        case FSCRYPT_MODE_AES_128_CBC:
                return filenames_mode == FSCRYPT_MODE_AES_128_CTS;
        case FSCRYPT_MODE_ADIANTUM:
                return filenames_mode == FSCRYPT_MODE_ADIANTUM;
        default:
                return false;
        }
}

/*
 * Return %true if the given combination of encryption modes is supported for
 * v2 encryption policies: the two pairs accepted only here, plus everything
 * fscrypt_valid_enc_modes_v1() accepts.
 */
static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode)
{
        const bool v2_only_pair =
                (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
                 filenames_mode == FSCRYPT_MODE_AES_256_HCTR2) ||
                (contents_mode == FSCRYPT_MODE_SM4_XTS &&
                 filenames_mode == FSCRYPT_MODE_SM4_CTS);

        return v2_only_pair ||
               fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode);
}

/*
 * Check whether the DIRECT_KEY flag may be combined with the given modes.
 *
 * Contents and filenames must use the same mode, and that mode's IV size
 * must be at least offsetofend(union fscrypt_iv, nonce).  A warning is logged
 * against @inode for each rejection.
 */
static bool supported_direct_key_modes(const struct inode *inode,
                                       u32 contents_mode, u32 filenames_mode)
{
        if (contents_mode == filenames_mode) {
                const struct fscrypt_mode *m = &fscrypt_modes[contents_mode];

                if (m->ivsize >= offsetofend(union fscrypt_iv, nonce))
                        return true;

                fscrypt_warn(inode, "Direct key flag not allowed with %s",
                             m->friendly_name);
                return false;
        }

        fscrypt_warn(inode,
                     "Direct key flag not allowed with different contents and filenames modes");
        return false;
}

/*
 * Check whether a v2 policy carrying an IV_INO_LBLK_* flag can be used on
 * @inode's filesystem.  Each failed requirement logs a warning naming the
 * flag and returns %false; the checks run in a fixed order and stop at the
 * first failure.
 */
static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy,
                                         const struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        const char *type;

        /* Flag name used in the warning messages below. */
        if (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64)
                type = "IV_INO_LBLK_64";
        else
                type = "IV_INO_LBLK_32";

        /*
         * The IV_INO_LBLK_* flags only exist to work around hardware
         * limitations, and the single known use case (which is also the only
         * one tested) uses AES-256-XTS.  So for now nothing but AES-256-XTS
         * is allowed here; this may be relaxed if a use case with another
         * encryption mode appears.
         */
        if (policy->contents_encryption_mode != FSCRYPT_MODE_AES_256_XTS) {
                fscrypt_warn(inode,
                             "Can't use %s policy with contents mode other than AES-256-XTS",
                             type);
                return false;
        }

        /*
         * Putting inode numbers into the IVs is unsafe when the filesystem
         * may renumber inodes, e.g. when it is shrunk.
         */
        if (!(sb->s_cop->has_stable_inodes &&
              sb->s_cop->has_stable_inodes(sb))) {
                fscrypt_warn(inode,
                             "Can't use %s policy on filesystem '%s' because it doesn't have stable inode numbers",
                             type, sb->s_id);
                return false;
        }

        /*
         * Both IV_INO_LBLK_64 and IV_INO_LBLK_32 need inode numbers that fit
         * in 32 bits.  IV_INO_LBLK_32 hashes the inode number and so could in
         * principle cope with longer ones, but the number is taken from
         * inode::i_ino, an 'unsigned long', so the implementation limit is 32
         * bits for now.
         */
        if (!sb->s_cop->has_32bit_inodes) {
                fscrypt_warn(inode,
                             "Can't use %s policy on filesystem '%s' because its inode numbers are too long",
                             type, sb->s_id);
                return false;
        }

        /*
         * Both flags also need file data unit indices that fit in 32 bits.
         */
        if (fscrypt_max_file_dun_bits(sb,
                        fscrypt_policy_v2_du_bits(policy, inode)) > 32) {
                fscrypt_warn(inode,
                             "Can't use %s policy on filesystem '%s' because its maximum file size is too large",
                             type, sb->s_id);
                return false;
        }

        return true;
}

/*
 * Decide whether a v1 encryption policy may be used on @inode.
 *
 * The checks run in order: the contents/filenames mode pair, unknown flag
 * bits, the DIRECT_KEY mode requirements (delegated to
 * supported_direct_key_modes()), and finally casefolding.  Returns true only
 * if every check passes.
 */
static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy,
                                        const struct inode *inode)
{
        const int known_flags = FSCRYPT_POLICY_FLAGS_PAD_MASK |
                                FSCRYPT_POLICY_FLAG_DIRECT_KEY;
        const int contents_mode = policy->contents_encryption_mode;
        const int filenames_mode = policy->filenames_encryption_mode;
        const bool direct_key =
                policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY;

        if (!fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode)) {
                fscrypt_warn(inode,
                             "Unsupported encryption modes (contents %d, filenames %d)",
                             contents_mode, filenames_mode);
                return false;
        }

        if (policy->flags & ~known_flags) {
                fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)",
                             policy->flags);
                return false;
        }

        if (direct_key &&
            !supported_direct_key_modes(inode, contents_mode, filenames_mode))
                return false;

        if (!IS_CASEFOLDED(inode))
                return true;

        /* A v1 policy offers no way to derive the dirhash key. */
        fscrypt_warn(inode,
                     "v1 policies can't be used on casefolded directories");
        return false;
}

/*
 * Decide whether a v2 encryption policy may be used on @inode.
 *
 * Checked in order: the mode pair, unknown flag bits, mutually exclusive
 * flags, the optional log2_data_unit_size, the DIRECT_KEY and IV_INO_LBLK_*
 * requirements (delegated to their helpers), and the reserved bytes.
 * Returns true only if every check passes.
 */
static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
                                        const struct inode *inode)
{
        const int known_flags = FSCRYPT_POLICY_FLAGS_PAD_MASK |
                                FSCRYPT_POLICY_FLAG_DIRECT_KEY |
                                FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 |
                                FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32;
        const bool direct_key =
                policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY;
        const bool ino_lblk_64 =
                policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64;
        const bool ino_lblk_32 =
                policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32;

        if (!fscrypt_valid_enc_modes_v2(policy->contents_encryption_mode,
                                        policy->filenames_encryption_mode)) {
                fscrypt_warn(inode,
                             "Unsupported encryption modes (contents %d, filenames %d)",
                             policy->contents_encryption_mode,
                             policy->filenames_encryption_mode);
                return false;
        }

        if (policy->flags & ~known_flags) {
                fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)",
                             policy->flags);
                return false;
        }

        /* At most one of the three key/IV scheme flags may be set. */
        if (direct_key + ino_lblk_64 + ino_lblk_32 > 1) {
                fscrypt_warn(inode, "Mutually exclusive encryption flags (0x%02x)",
                             policy->flags);
                return false;
        }

        /* A zero log2_data_unit_size skips the data unit size checks. */
        if (policy->log2_data_unit_size != 0) {
                if (!inode->i_sb->s_cop->supports_subblock_data_units) {
                        fscrypt_warn(inode,
                                     "Filesystem does not support configuring crypto data unit size");
                        return false;
                }
                if (policy->log2_data_unit_size < SECTOR_SHIFT /* 9 */ ||
                    policy->log2_data_unit_size > inode->i_blkbits) {
                        fscrypt_warn(inode,
                                     "Unsupported log2_data_unit_size in encryption policy: %d",
                                     policy->log2_data_unit_size);
                        return false;
                }
                if (ino_lblk_32 &&
                    policy->log2_data_unit_size != inode->i_blkbits) {
                        /*
                         * Not safe to enable yet: DUN wraparound must only
                         * be able to occur on a FS block boundary.
                         */
                        fscrypt_warn(inode,
                                     "Sub-block data units not yet supported with IV_INO_LBLK_32");
                        return false;
                }
        }

        if (direct_key &&
            !supported_direct_key_modes(inode, policy->contents_encryption_mode,
                                        policy->filenames_encryption_mode))
                return false;

        if ((ino_lblk_64 || ino_lblk_32) &&
            !supported_iv_ino_lblk_policy(policy, inode))
                return false;

        if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
                fscrypt_warn(inode, "Reserved bits set in encryption policy");
                return false;
        }

        return true;
}

/**
 * fscrypt_supported_policy() - check whether an encryption policy is supported
 * @policy_u: the encryption policy
 * @inode: the inode on which the policy will be used
 *
 * Given an encryption policy, check whether all its encryption modes and other
 * settings are supported by this kernel on the given inode.  (We don't
 * currently check for crypto API support here, so attempting to use an
 * algorithm not configured into the crypto API will still fail later.)
 *
 * The check is dispatched on the policy version; a version that is neither
 * v1 nor v2 is reported as unsupported.
 *
 * Return: %true if supported, else %false
 */
bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
                              const struct inode *inode)
{
        if (policy_u->version == FSCRYPT_POLICY_V1)
                return fscrypt_supported_v1_policy(&policy_u->v1, inode);

        if (policy_u->version == FSCRYPT_POLICY_V2)
                return fscrypt_supported_v2_policy(&policy_u->v2, inode);

        return false;
}

/**
 * fscrypt_new_context() - create a new fscrypt_context
 * @ctx_u: output context
 * @policy_u: input policy
 * @nonce: nonce to use
 *
 * Build the fscrypt_context for an inode that is being assigned the given
 * encryption policy.  @ctx_u is zeroed first, then the fields of the context
 * version matching the policy version are filled in from @policy_u and
 * @nonce.  @nonce must be a new random nonce.  A policy version other than
 * v1 or v2 hits BUG().
 *
 * Return: the size of the new context in bytes.
 */
static int fscrypt_new_context(union fscrypt_context *ctx_u,
                               const union fscrypt_policy *policy_u,
                               const u8 nonce[FSCRYPT_FILE_NONCE_SIZE])
{
        memset(ctx_u, 0, sizeof(*ctx_u));

        if (policy_u->version == FSCRYPT_POLICY_V1) {
                const struct fscrypt_policy_v1 *src = &policy_u->v1;
                struct fscrypt_context_v1 *dst = &ctx_u->v1;

                dst->version = FSCRYPT_CONTEXT_V1;
                dst->flags = src->flags;
                dst->contents_encryption_mode = src->contents_encryption_mode;
                dst->filenames_encryption_mode =
                        src->filenames_encryption_mode;
                memcpy(dst->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
                memcpy(dst->master_key_descriptor,
                       src->master_key_descriptor,
                       sizeof(dst->master_key_descriptor));
                return sizeof(*dst);
        }

        if (policy_u->version == FSCRYPT_POLICY_V2) {
                const struct fscrypt_policy_v2 *src = &policy_u->v2;
                struct fscrypt_context_v2 *dst = &ctx_u->v2;

                dst->version = FSCRYPT_CONTEXT_V2;
                dst->flags = src->flags;
                dst->log2_data_unit_size = src->log2_data_unit_size;
                dst->contents_encryption_mode = src->contents_encryption_mode;
                dst->filenames_encryption_mode =
                        src->filenames_encryption_mode;
                memcpy(dst->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
                memcpy(dst->master_key_identifier,
                       src->master_key_identifier,
                       sizeof(dst->master_key_identifier));
                return sizeof(*dst);
        }

        /* Callers are expected to pass only v1 or v2 policies. */
        BUG();
}

/**
 * fscrypt_policy_from_context() - convert an fscrypt_context to
 *                                   an fscrypt_policy
 * @policy_u: output policy
 * @ctx_u: input context
 * @ctx_size: size of input context in bytes
 *
 * Given an fscrypt_context, build the corresponding fscrypt_policy.
 * @policy_u is zeroed before anything else, so it is zeroed even when the
 * context is rejected.
 *
 * Return: 0 on success, or -EINVAL if the fscrypt_context has an unrecognized
 * version number or size.
 *
 * This does *not* validate the settings within the policy itself, e.g. the
 * modes, flags, and reserved bits.  Use fscrypt_supported_policy() for that.
 */
int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
                                const union fscrypt_context *ctx_u,
                                int ctx_size)
{
        memset(policy_u, 0, sizeof(*policy_u));

        if (!fscrypt_context_is_valid(ctx_u, ctx_size))
                return -EINVAL;

        if (ctx_u->version == FSCRYPT_CONTEXT_V1) {
                const struct fscrypt_context_v1 *src = &ctx_u->v1;
                struct fscrypt_policy_v1 *dst = &policy_u->v1;

                dst->version = FSCRYPT_POLICY_V1;
                dst->flags = src->flags;
                dst->contents_encryption_mode = src->contents_encryption_mode;
                dst->filenames_encryption_mode =
                        src->filenames_encryption_mode;
                memcpy(dst->master_key_descriptor,
                       src->master_key_descriptor,
                       sizeof(dst->master_key_descriptor));
                return 0;
        }

        if (ctx_u->version == FSCRYPT_CONTEXT_V2) {
                const struct fscrypt_context_v2 *src = &ctx_u->v2;
                struct fscrypt_policy_v2 *dst = &policy_u->v2;

                dst->version = FSCRYPT_POLICY_V2;
                dst->flags = src->flags;
                dst->log2_data_unit_size = src->log2_data_unit_size;
                dst->contents_encryption_mode = src->contents_encryption_mode;
                dst->filenames_encryption_mode =
                        src->filenames_encryption_mode;
                memcpy(dst->__reserved, src->__reserved,
                       sizeof(dst->__reserved));
                memcpy(dst->master_key_identifier,
                       src->master_key_identifier,
                       sizeof(dst->master_key_identifier));
                return 0;
        }

        /* unreachable */
        return -EINVAL;
}

/* Retrieve an inode's encryption policy */
static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
{
        const struct fscrypt_inode_info *ci;
        union fscrypt_context ctx;
        int ret;

        ci = fscrypt_get_inode_info(inode);
        if (ci) {
                /* key available, use the cached policy */
                *policy = ci->ci_policy;
                return 0;
        }

        if (!IS_ENCRYPTED(inode))
                return -ENODATA;

        ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
        if (ret < 0)
                return (ret == -ERANGE) ? -EINVAL : ret;

        return fscrypt_policy_from_context(policy, &ctx, ret);
}

static int set_encryption_policy(struct inode *inode,
                                 const union fscrypt_policy *policy)
{
        u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
        union fscrypt_context ctx;
        int ctxsize;
        int err;

        if (!fscrypt_supported_policy(policy, inode))
                return -EINVAL;

        switch (policy->version) {
        case FSCRYPT_POLICY_V1:
                /*
                 * The original encryption policy version provided no way of
                 * verifying that the correct master key was supplied, which was
                 * insecure in scenarios where multiple users have access to the
                 * same encrypted files (even just read-only access).  The new
                 * encryption policy version fixes this and also implies use of
                 * an improved key derivation function and allows non-root users
                 * to securely remove keys.  So as long as compatibility with
                 * old kernels isn't required, it is recommended to use the new
                 * policy version for all new encrypted directories.
                 */
                pr_warn_once("%s (pid %d) is setting deprecated v1 encryption policy; recommend upgrading to v2.\n",
                             current->comm, current->pid);
                break;
        case FSCRYPT_POLICY_V2:
                err = fscrypt_verify_key_added(inode->i_sb,
                                               policy->v2.master_key_identifier);
                if (err)
                        return err;
                if (policy->v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)
                        pr_warn_once("%s (pid %d) is setting an IV_INO_LBLK_32 encryption policy.  This should only be used if there are certain hardware limitations.\n",
                                     current->comm, current->pid);
                break;
        default:
                WARN_ON_ONCE(1);
                return -EINVAL;
        }

        get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE);
        ctxsize = fscrypt_new_context(&ctx, policy, nonce);

        return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL);
}

/*
 * Set an encryption policy on the inode behind @filp from the user-supplied
 * policy at @arg.
 *
 * The first byte of @arg is read as the policy version, which determines how
 * many bytes are copied in.  Returns -EFAULT if a user copy fails, -EINVAL if
 * fscrypt_policy_size() reports a non-positive size, -EACCES if the caller
 * fails inode_owner_or_capable(), or the error from mnt_want_write_file().
 *
 * With the inode locked: if the inode has no policy yet (-ENODATA), the policy
 * is set, provided the inode is a live, empty directory (-ENOTDIR, -ENOENT,
 * -ENOTEMPTY otherwise).  If an existing policy cannot be parsed (-EINVAL) or
 * differs from the requested one, -EEXIST is returned.  Any other result of
 * fscrypt_get_policy() is returned unchanged, including 0 when the existing
 * policy equals the requested one.
 */
int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
{
        union fscrypt_policy policy;
        union fscrypt_policy existing_policy;
        struct inode *inode = file_inode(filp);
        u8 version;
        int size;
        int ret;

        /* Read only the version byte first; it decides the policy size. */
        if (get_user(policy.version, (const u8 __user *)arg))
                return -EFAULT;

        size = fscrypt_policy_size(&policy);
        if (size <= 0)
                return -EINVAL;

        /*
         * We should just copy the remaining 'size - 1' bytes here, but a
         * bizarre bug in gcc 7 and earlier (fixed by gcc r255731) causes gcc to
         * think that size can be 0 here (despite the check above!) *and* that
         * it's a compile-time constant.  Thus it would think copy_from_user()
         * is passed compile-time constant ULONG_MAX, causing the compile-time
         * buffer overflow check to fail, breaking the build. This only occurred
         * when building an i386 kernel with -Os and branch profiling enabled.
         *
         * Work around it by just copying the first byte again...
         */
        version = policy.version;
        if (copy_from_user(&policy, arg, size))
                return -EFAULT;
        /* Restore the version already used to compute 'size'. */
        policy.version = version;

        if (!inode_owner_or_capable(&nop_mnt_idmap, inode))
                return -EACCES;

        ret = mnt_want_write_file(filp);
        if (ret)
                return ret;

        inode_lock(inode);

        ret = fscrypt_get_policy(inode, &existing_policy);
        if (ret == -ENODATA) {
                /* No policy yet: only a live, empty directory may get one. */
                if (!S_ISDIR(inode->i_mode))
                        ret = -ENOTDIR;
                else if (IS_DEADDIR(inode))
                        ret = -ENOENT;
                else if (!inode->i_sb->s_cop->empty_dir(inode))
                        ret = -ENOTEMPTY;
                else
                        ret = set_encryption_policy(inode, &policy);
        } else if (ret == -EINVAL ||
                   (ret == 0 && !fscrypt_policies_equal(&policy,
                                                        &existing_policy))) {
                /* The file already uses a different encryption policy. */
                ret = -EEXIST;
        }

        inode_unlock(inode);

        mnt_drop_write_file(filp);
        return ret;
}
EXPORT_SYMBOL(fscrypt_ioctl_set_policy);

/*
 * Original ioctl version; can only get the original policy version.
 *
 * Returns the error from fscrypt_get_policy(), -EINVAL if the inode's policy
 * is not v1, -EFAULT if the copy to @arg fails, and 0 on success.
 */
int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
{
        union fscrypt_policy cur;
        int ret = fscrypt_get_policy(file_inode(filp), &cur);

        if (ret)
                return ret;

        if (cur.version != FSCRYPT_POLICY_V1)
                return -EINVAL;

        /* Only the v1 part of the union is handed back to userspace. */
        return copy_to_user(arg, &cur, sizeof(cur.v1)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(fscrypt_ioctl_get_policy);

/*
 * Extended ioctl version; can get policies of any version.
 *
 * @uarg points to a struct fscrypt_get_policy_ex_arg.  The caller's
 * policy_size is read from it; if the inode's policy is larger than that,
 * -EOVERFLOW is returned.  Otherwise the actual policy size followed by the
 * policy itself is written back.  Returns the error from fscrypt_get_policy()
 * if the policy cannot be retrieved, or -EFAULT if a user copy fails.
 */
int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *uarg)
{
        struct fscrypt_get_policy_ex_arg arg;
        /* The policy is built directly inside arg.policy. */
        union fscrypt_policy *policy = (union fscrypt_policy *)&arg.policy;
        size_t policy_size;
        int err;

        /* arg is policy_size, then policy */
        BUILD_BUG_ON(offsetof(typeof(arg), policy_size) != 0);
        BUILD_BUG_ON(offsetofend(typeof(arg), policy_size) !=
                     offsetof(typeof(arg), policy));
        BUILD_BUG_ON(sizeof(arg.policy) != sizeof(*policy));

        err = fscrypt_get_policy(file_inode(filp), policy);
        if (err)
                return err;
        policy_size = fscrypt_policy_size(policy);

        /* Copy in only the policy_size field; arg.policy is left untouched. */
        if (copy_from_user(&arg, uarg, sizeof(arg.policy_size)))
                return -EFAULT;

        if (policy_size > arg.policy_size)
                return -EOVERFLOW;
        arg.policy_size = policy_size;

        /* Copy out the size field plus only the bytes the policy occupies. */
        if (copy_to_user(uarg, &arg, sizeof(arg.policy_size) + policy_size))
                return -EFAULT;
        return 0;
}
EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_policy_ex);

/*
 * FS_IOC_GET_ENCRYPTION_NONCE: retrieve file's encryption nonce for testing.
 *
 * Returns the error from ->get_context(), -EINVAL if the context fails
 * fscrypt_context_is_valid(), -EFAULT if the copy to @arg fails, else 0.
 */
int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg)
{
        struct inode *inode = file_inode(filp);
        union fscrypt_context raw_ctx;
        int len;

        len = inode->i_sb->s_cop->get_context(inode, &raw_ctx,
                                              sizeof(raw_ctx));
        if (len < 0)
                return len;

        if (!fscrypt_context_is_valid(&raw_ctx, len))
                return -EINVAL;

        return copy_to_user(arg, fscrypt_context_nonce(&raw_ctx),
                            FSCRYPT_FILE_NONCE_SIZE) ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_nonce);

/**
 * fscrypt_has_permitted_context() - is a file's encryption policy permitted
 *                                     within its directory?
 *
 * @parent: inode for parent directory
 * @child: inode for file being looked up, opened, or linked into @parent
 *
 * Filesystems must call this before permitting access to an inode in a
 * situation where the parent directory is encrypted (either before allowing
 * ->lookup() to succeed, or for a regular file before allowing it to be opened)
 * and before any operation that involves linking an inode into an encrypted
 * directory, including link, rename, and cross rename.  It enforces the
 * constraint that within a given encrypted directory tree, all files use the
 * same encryption policy.  The pre-access check is needed to detect potentially
 * malicious offline violations of this constraint, while the link and rename
 * checks are needed to prevent online violations of this constraint.
 *
 * Return: 1 if permitted, 0 if forbidden.
 */
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
        union fscrypt_policy parent_policy, child_policy;
        int parent_err, child_err;

        /* File types which are never encrypted are unrestricted. */
        if (!(S_ISREG(child->i_mode) || S_ISDIR(child->i_mode) ||
              S_ISLNK(child->i_mode)))
                return 1;

        /* An unencrypted parent directory imposes no restrictions. */
        if (!IS_ENCRYPTED(parent))
                return 1;

        /* An encrypted directory must not contain unencrypted files. */
        if (!IS_ENCRYPTED(child))
                return 0;

        /*
         * Parent and child are both encrypted, so they must use the same
         * encryption policy.  The cached policies are compared if the keys
         * are available; otherwise the fscrypt_contexts are retrieved and
         * compared.
         *
         * The fscrypt_context retrieval will be needed frequently when an
         * encrypted directory tree is accessed without the key.  That is not
         * a big deal performance-wise, because file access without the key
         * isn't really optimized for anyway (to the extent that such access
         * is even possible): any attempted access already causes a
         * fscrypt_context retrieval and keyring search.
         *
         * Any unexpected error results in "forbidden".
         */
        if (fscrypt_get_encryption_info(parent, true) ||
            fscrypt_get_encryption_info(child, true))
                return 0;

        parent_err = fscrypt_get_policy(parent, &parent_policy);
        child_err = fscrypt_get_policy(child, &child_policy);

        /*
         * Permit the case where both parent and child have an unrecognized
         * encryption policy, so that such files can still be deleted.
         */
        if (parent_err == -EINVAL && child_err == -EINVAL)
                return 1;

        if (parent_err || child_err)
                return 0;

        return fscrypt_policies_equal(&parent_policy, &child_policy);
}
EXPORT_SYMBOL(fscrypt_has_permitted_context);

/*
 * Return the encryption policy that new files in @dir will inherit.
 *
 * For an unencrypted directory this is whatever fscrypt_get_dummy_policy()
 * gives for the superblock (NULL if none).  For an encrypted directory the
 * key is required first, so that the new filename can be encrypted; failure
 * to get it is returned as an ERR_PTR().
 */
const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
{
        int err;

        if (!IS_ENCRYPTED(dir))
                return fscrypt_get_dummy_policy(dir->i_sb);

        err = fscrypt_require_key(dir);
        if (err)
                return ERR_PTR(err);

        return &dir->i_crypt_info->ci_policy;
}

/**
 * fscrypt_context_for_new_inode() - create an encryption context for a new inode
 * @ctx: where context should be written
 * @inode: inode from which to fetch policy and nonce
 *
 * Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode,
 * build a new context from the inode's cached policy and nonce and write it
 * to @ctx.  @ctx _must_ be at least FSCRYPT_SET_CONTEXT_MAX_SIZE bytes.
 *
 * Return: size of the resulting context or a negative error code.
 */
int fscrypt_context_for_new_inode(void *ctx, struct inode *inode)
{
        struct fscrypt_inode_info *ci = inode->i_crypt_info;

        BUILD_BUG_ON(sizeof(union fscrypt_context) !=
                     FSCRYPT_SET_CONTEXT_MAX_SIZE);

        /* fscrypt_prepare_new_inode() should have set up the key already. */
        if (!WARN_ON_ONCE(!ci))
                return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce);

        return -ENOKEY;
}
EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode);

/**
 * fscrypt_set_context() - Set the fscrypt context of a new inode
 * @inode: a new inode
 * @fs_data: private data given by FS and passed to ->set_context()
 *
 * This should be called after fscrypt_prepare_new_inode(), generally during a
 * filesystem transaction.  Everything here must be %GFP_NOFS-safe.
 *
 * Return: 0 on success, -errno on failure
 */
int fscrypt_set_context(struct inode *inode, void *fs_data)
{
        struct fscrypt_inode_info *ci = inode->i_crypt_info;
        union fscrypt_context ctx;
        int ctxsize;

        ctxsize = fscrypt_context_for_new_inode(&ctx, inode);
        if (ctxsize < 0)
                return ctxsize;

        /*
         * This may be the first time the inode number is available, so do any
         * delayed key setup that requires the inode number.
         */
        if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
            (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
                fscrypt_hash_inode_number(ci, ci->ci_master_key);

        return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data);
}
EXPORT_SYMBOL_GPL(fscrypt_set_context);

/**
 * fscrypt_parse_test_dummy_encryption() - parse the test_dummy_encryption mount option
 * @param: the mount option
 * @dummy_policy: (input/output) the place to write the dummy policy that will
 *        result from parsing the option.  Zero-initialize this.  If a policy is
 *        already set here (due to test_dummy_encryption being given multiple
 *        times), then this function will verify that the policies are the same.
 *
 * An absent or empty string value is treated as "v2".
 *
 * Return: 0 on success; -EINVAL if the argument is invalid; -EEXIST if the
 *           argument conflicts with one already specified; or -ENOMEM.
 */
int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
                                struct fscrypt_dummy_policy *dummy_policy)
{
        const char *arg = "v2";
        union fscrypt_policy *policy;
        int err = 0;

        if (param->type == fs_value_is_string && *param->string)
                arg = param->string;

        policy = kzalloc(sizeof(*policy), GFP_KERNEL);
        if (!policy)
                return -ENOMEM;

        if (strcmp(arg, "v1") == 0) {
                policy->version = FSCRYPT_POLICY_V1;
                policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
                policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
                memset(policy->v1.master_key_descriptor, 0x42,
                       FSCRYPT_KEY_DESCRIPTOR_SIZE);
        } else if (strcmp(arg, "v2") == 0) {
                policy->version = FSCRYPT_POLICY_V2;
                policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
                policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
                err = fscrypt_get_test_dummy_key_identifier(
                                policy->v2.master_key_identifier);
        } else {
                err = -EINVAL;
        }
        if (err)
                goto out;

        if (!dummy_policy->policy) {
                /* First occurrence: @dummy_policy takes over the allocation. */
                dummy_policy->policy = policy;
                policy = NULL;
        } else if (!fscrypt_policies_equal(policy, dummy_policy->policy)) {
                /* Option repeated with a different policy. */
                err = -EEXIST;
        }
out:
        /* No-op when the policy was handed over above. */
        kfree(policy);
        return err;
}
EXPORT_SYMBOL_GPL(fscrypt_parse_test_dummy_encryption);

/**
 * fscrypt_dummy_policies_equal() - check whether two dummy policies are equal
 * @p1: the first test dummy policy (may be unset)
 * @p2: the second test dummy policy (may be unset)
 *
 * Return: %true if the dummy policies are both set and equal, or both unset.
 */
bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1,
                                  const struct fscrypt_dummy_policy *p2)
{
        const union fscrypt_policy *first = p1->policy;
        const union fscrypt_policy *second = p2->policy;

        /* With at least one unset, they match only if both are unset. */
        if (!first || !second)
                return !first && !second;

        return fscrypt_policies_equal(first, second);
}
EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal);

/**
 * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption'
 * @seq: the seq_file to print the option to
 * @sep: the separator character to use
 * @sb: the filesystem whose options are being shown
 *
 * Show the test_dummy_encryption mount option, if it was specified.
 * Nothing is printed when the filesystem has no dummy policy.
 * This is mainly used for /proc/mounts.
 */
void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep,
                                        struct super_block *sb)
{
        const union fscrypt_policy *policy = fscrypt_get_dummy_policy(sb);

        if (policy) {
                /* Handle numbering quirk: a v1 policy is shown as "v1". */
                int vers = policy->version == FSCRYPT_POLICY_V1 ?
                           1 : policy->version;

                seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, vers);
        }
}
EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption);



























































































    1 









    1 





    1 






    1 













    1 























    2 





    2 













    1 















    1 
    1 




    1 









    1 




    1 

    1 



    2 












    2 






    1 


















































    2 









































































































    1 









    1 



    1 























































































































































    2 




    1 





    2 






























































































    1 



    1 



























































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/seq_file.c
 *
 * helper functions for making synthetic files from sequences of records.
 * initial implementation -- AV, Oct 2001.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cache.h>
#include <linux/fs.h>
#include <linux/export.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/string_helpers.h>
#include <linux/uio.h>

#include <linux/uaccess.h>
#include <asm/page.h>

/*
 * Slab cache from which seq_open() allocates every struct seq_file.
 * Assigned once by seq_file_init().
 */
static struct kmem_cache *seq_file_cache __ro_after_init;

/*
 * Mark @m's buffer as completely full by setting its byte count to the
 * buffer size.  The output helpers in this file call this when the data
 * they were asked to append does not fit.
 */
static void seq_set_overflow(struct seq_file *m)
{
        m->count = m->size;
}

/*
 * Allocate an output buffer of @size bytes with kvmalloc().
 * A request larger than MAX_RW_COUNT is refused and yields NULL.
 */
static void *seq_buf_alloc(unsigned long size)
{
        return unlikely(size > MAX_RW_COUNT) ?
                NULL : kvmalloc(size, GFP_KERNEL_ACCOUNT);
}

/**
 * seq_open - set up a struct file for sequential-record output
 * @file: file being opened
 * @op: iterator method table describing the sequence
 *
 * Allocates a zeroed struct seq_file, binds it to @op and @file, and stores
 * it in @file->private_data.  That pointer should not be modified by the
 * caller afterwards.
 *
 * The methods in @op work as follows: ->start() sets the iterator up and
 * returns the first element, ->next() returns the following element,
 * ->stop() shuts the iteration down and ->show() prints one element into
 * the buffer.  ->start() and ->next() return ERR_PTR(error) on failure and
 * %NULL at the end of the sequence.  ->show() returns 0 on success, a
 * negative number on error, or SEQ_SKIP meaning "discard this element and
 * move on".
 *
 * Return: 0 on success, -ENOMEM if the seq_file cannot be allocated.
 */
int seq_open(struct file *file, const struct seq_operations *op)
{
        struct seq_file *seq;

        WARN_ON(file->private_data);

        seq = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL);
        if (!seq)
                return -ENOMEM;

        mutex_init(&seq->lock);
        seq->op = op;
        /*
         * The back-pointer is not refcounted: the seq_file never outlives
         * the struct file it hangs off.
         */
        seq->file = file;
        file->private_data = seq;

        /*
         * lseek() and pread() work on seq_files.  write() is not implemented
         * here at all, yet FMODE_PWRITE is still cleared for historical
         * reasons.
         *
         * A seq_file user that implements file.write() and wants pwrite()
         * support must supply its own file.open() which calls seq_open()
         * and then sets FMODE_PWRITE again.
         */
        file->f_mode &= ~FMODE_PWRITE;

        return 0;
}
EXPORT_SYMBOL(seq_open);

/*
 * traverse - replay the sequence from its start up to byte @offset
 * @m: seq_file being repositioned
 * @offset: target byte position in the generated output
 *
 * Resets the iterator and buffer state, then walks the sequence with
 * ->start()/->show()/->next(), adding up the bytes each record produces
 * until @offset is reached.  On success the buffer holds the unread tail of
 * the record containing @offset (if any) in m->from / m->count.
 *
 * Returns 0 on success (including when the sequence ends before @offset),
 * the negative error reported by ->start(), ->next() or ->show(), -ENOMEM
 * if no buffer could be allocated, or -EAGAIN after a record overflowed and
 * the buffer was doubled; callers in this file retry while -EAGAIN is
 * returned.
 */
static int traverse(struct seq_file *m, loff_t offset)
{
        loff_t pos = 0;
        int error = 0;
        void *p;

        m->index = 0;
        m->count = m->from = 0;
        /* offset 0 needs no walking: the reset state above is the answer */
        if (!offset)
                return 0;

        if (!m->buf) {
                m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
                if (!m->buf)
                        return -ENOMEM;
        }
        p = m->op->start(m, &m->index);
        while (p) {
                error = PTR_ERR(p);
                if (IS_ERR(p))
                        break;
                error = m->op->show(m, p);
                if (error < 0)
                        break;
                /* positive return: drop whatever ->show() emitted */
                if (unlikely(error)) {
                        error = 0;
                        m->count = 0;
                }
                if (seq_has_overflowed(m))
                        goto Eoverflow;
                p = m->op->next(m, p, &m->index);
                /* @offset lies inside this record: keep only its tail */
                if (pos + m->count > offset) {
                        m->from = offset - pos;
                        m->count -= m->from;
                        break;
                }
                pos += m->count;
                m->count = 0;
                /* @offset is exactly at a record boundary */
                if (pos == offset)
                        break;
        }
        m->op->stop(m, p);
        return error;

Eoverflow:
        /* record did not fit: double the buffer and ask the caller to retry */
        m->op->stop(m, p);
        kvfree(m->buf);
        m->count = 0;
        m->buf = seq_buf_alloc(m->size <<= 1);
        return !m->buf ? -ENOMEM : -EAGAIN;
}

/**
 * seq_read - ready-made ->f_op->read() for sequential files
 * @file: the file to read from
 * @buf: the user buffer to read into
 * @size: the maximum number of bytes to read
 * @ppos: the current position in the file, updated on return
 *
 * Wraps @buf in a single-segment iov_iter and a synchronous kiocb, then
 * hands the request to seq_read_iter().
 *
 * Return: whatever seq_read_iter() returns.
 */
ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
        struct iovec vec = {
                .iov_base = buf,
                .iov_len = size,
        };
        struct iov_iter to;
        struct kiocb io;
        ssize_t nread;

        init_sync_kiocb(&io, file);
        io.ki_pos = *ppos;
        iov_iter_init(&to, ITER_DEST, &vec, 1, size);

        nread = seq_read_iter(&io, &to);
        *ppos = io.ki_pos;

        return nread;
}
EXPORT_SYMBOL(seq_read);

/**
 * seq_read_iter - ready-made ->f_op->read_iter() for sequential files
 * @iocb: kiocb whose ki_filp holds the seq_file in ->private_data and whose
 *        ki_pos is the requested read position
 * @iter: destination iterator
 *
 * Under m->lock: repositions with traverse() when ki_pos differs from the
 * position recorded by the previous read, copies out whatever is still
 * buffered, and otherwise produces one non-empty record (doubling the
 * buffer until it fits) followed by as many further records as the buffer
 * and the request allow.
 *
 * Return: the number of bytes copied.  When nothing was copied: 0 at the end
 * of the sequence, -EFAULT if buffered data could not be copied out,
 * -ENOMEM if the buffer could not be allocated, or the error reported by
 * traverse() or the iterator methods.  An empty @iter returns 0 at once.
 */
ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct seq_file *m = iocb->ki_filp->private_data;
        size_t copied = 0;
        size_t n;
        void *p;
        int err = 0;

        if (!iov_iter_count(iter))
                return 0;

        mutex_lock(&m->lock);

        /*
         * if request is to read from zero offset, reset iterator to first
         * record as it might have been already advanced by previous requests
         */
        if (iocb->ki_pos == 0) {
                m->index = 0;
                m->count = 0;
        }

        /* Don't assume ki_pos is where we left it */
        if (unlikely(iocb->ki_pos != m->read_pos)) {
                /* traverse() returns -EAGAIN after enlarging the buffer */
                while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN)
                        ;
                if (err) {
                        /* With prejudice... */
                        m->read_pos = 0;
                        m->index = 0;
                        m->count = 0;
                        goto Done;
                } else {
                        m->read_pos = iocb->ki_pos;
                }
        }

        /* grab buffer if we didn't have one */
        if (!m->buf) {
                m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
                if (!m->buf)
                        goto Enomem;
        }
        // something left in the buffer - copy it out first
        if (m->count) {
                n = copy_to_iter(m->buf + m->from, m->count, iter);
                m->count -= n;
                m->from += n;
                copied += n;
                if (m->count)        // hadn't managed to copy everything
                        goto Done;
        }
        // get a non-empty record in the buffer
        m->from = 0;
        p = m->op->start(m, &m->index);
        while (1) {
                err = PTR_ERR(p);
                if (!p || IS_ERR(p))        // EOF or an error
                        break;
                err = m->op->show(m, p);
                if (err < 0)                // hard error
                        break;
                if (unlikely(err))        // ->show() says "skip it"
                        m->count = 0;
                if (unlikely(!m->count)) { // empty record
                        p = m->op->next(m, p, &m->index);
                        continue;
                }
                if (!seq_has_overflowed(m)) // got it
                        goto Fill;
                // need a bigger buffer
                m->op->stop(m, p);
                kvfree(m->buf);
                m->count = 0;
                m->buf = seq_buf_alloc(m->size <<= 1);
                if (!m->buf)
                        goto Enomem;
                // restart at the same index with the doubled buffer
                p = m->op->start(m, &m->index);
        }
        // EOF or an error
        m->op->stop(m, p);
        m->count = 0;
        goto Done;
Fill:
        // one non-empty record is in the buffer; if they want more,
        // try to fit more in, but in any case we need to advance
        // the iterator once for every record shown.
        while (1) {
                size_t offs = m->count;        // buffer fill before the next record
                loff_t pos = m->index;

                p = m->op->next(m, p, &m->index);
                if (pos == m->index) {
                        pr_info_ratelimited("buggy .next function %ps did not update position index\n",
                                            m->op->next);
                        m->index++;
                }
                if (!p || IS_ERR(p))        // no next record for us
                        break;
                if (m->count >= iov_iter_count(iter))
                        break;
                err = m->op->show(m, p);
                if (err > 0) {                // ->show() says "skip it"
                        m->count = offs;
                } else if (err || seq_has_overflowed(m)) {
                        // error or no room: roll back to the last complete record
                        m->count = offs;
                        break;
                }
        }
        m->op->stop(m, p);
        n = copy_to_iter(m->buf, m->count, iter);
        copied += n;
        m->count -= n;
        m->from = n;
Done:
        if (unlikely(!copied)) {
                // nothing copied: leftover data means the copy failed,
                // otherwise report err (0 at end of sequence)
                copied = m->count ? -EFAULT : err;
        } else {
                iocb->ki_pos += copied;
                m->read_pos += copied;
        }
        mutex_unlock(&m->lock);
        return copied;
Enomem:
        err = -ENOMEM;
        goto Done;
}
EXPORT_SYMBOL(seq_read_iter);

/**
 * seq_lseek - ready-made ->f_op->llseek() for sequential files
 * @file: the file in question
 * @offset: new position
 * @whence: 0 for absolute, 1 for relative position
 *
 * Only SEEK_SET and SEEK_CUR are handled.  When the target differs from the
 * position of the last read, the sequence is replayed with traverse() to
 * reach it.
 *
 * Return: the new position on success; -EINVAL for a negative target or any
 * other @whence; otherwise the error from traverse(), in which case the file
 * position and iterator state are reset to the start.
 */
loff_t seq_lseek(struct file *file, loff_t offset, int whence)
{
        struct seq_file *m = file->private_data;
        loff_t retval = -EINVAL;

        mutex_lock(&m->lock);
        switch (whence) {
        case SEEK_CUR:
                /* convert the relative request into an absolute one */
                offset += file->f_pos;
                fallthrough;
        case SEEK_SET:
                if (offset < 0)
                        break;
                retval = offset;
                if (offset != m->read_pos) {
                        /* traverse() returns -EAGAIN after enlarging the buffer */
                        while ((retval = traverse(m, offset)) == -EAGAIN)
                                ;
                        if (retval) {
                                /* with extreme prejudice... */
                                file->f_pos = 0;
                                m->read_pos = 0;
                                m->index = 0;
                                m->count = 0;
                        } else {
                                m->read_pos = offset;
                                retval = file->f_pos = offset;
                        }
                } else {
                        file->f_pos = offset;
                }
        }
        mutex_unlock(&m->lock);
        return retval;
}
EXPORT_SYMBOL(seq_lseek);

/**
 * seq_release - free the structures associated with a sequential file
 * @inode: its inode (not used here)
 * @file: file in question
 *
 * Frees the output buffer and the struct seq_file stored in
 * @file->private_data.  Usable directly as ->f_op->release() when there is
 * no private data to destroy.
 *
 * Return: always 0.
 */
int seq_release(struct inode *inode, struct file *file)
{
        struct seq_file *seq = file->private_data;

        kvfree(seq->buf);
        kmem_cache_free(seq_file_cache, seq);

        return 0;
}
EXPORT_SYMBOL(seq_release);

/**
 * seq_escape_mem - print data into buffer, escaping some characters
 * @m: target buffer
 * @src: source buffer
 * @len: size of source buffer
 * @flags: flags to pass to string_escape_mem()
 * @esc: set of characters that need escaping
 *
 * Escapes @src straight into the free space of @m using
 * string_escape_mem().  The result is committed only when
 * string_escape_mem() reports fewer bytes than the space that was
 * available; otherwise -1 is committed.
 *
 * Use seq_has_overflowed() to check for errors.
 */
void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
                    unsigned int flags, const char *esc)
{
        char *dst;
        size_t room;
        int written;

        room = seq_get_buf(m, &dst);
        written = string_escape_mem(src, len, dst, room, flags, esc);

        if (written < room)
                seq_commit(m, written);
        else
                seq_commit(m, -1);
}
EXPORT_SYMBOL(seq_escape_mem);

/**
 * seq_vprintf - append formatted text to a seq_file buffer
 * @m: seq_file identifying the buffer to which data should be written
 * @f: printf-style format string
 * @args: arguments for @f
 *
 * The text is kept only if it leaves at least one byte of the buffer
 * unused; otherwise the buffer is marked as overflowed.
 */
void seq_vprintf(struct seq_file *m, const char *f, va_list args)
{
        int len;

        if (m->count >= m->size)
                goto overflow;

        len = vsnprintf(m->buf + m->count, m->size - m->count, f, args);
        if (m->count + len >= m->size)
                goto overflow;

        m->count += len;
        return;

overflow:
        seq_set_overflow(m);
}
EXPORT_SYMBOL(seq_vprintf);

/**
 * seq_printf - append formatted text to a seq_file buffer
 * @m: seq_file identifying the buffer to which data should be written
 * @f: printf-style format string
 * @...: arguments for @f
 *
 * Variadic front end that forwards to seq_vprintf().
 */
void seq_printf(struct seq_file *m, const char *f, ...)
{
        va_list args;

        va_start(args, f);
        seq_vprintf(m, f, args);
        va_end(args);
}
EXPORT_SYMBOL(seq_printf);

#ifdef CONFIG_BINARY_PRINTF
/**
 * seq_bprintf - append text formatted from binary arguments
 * @m: seq_file identifying the buffer to which data should be written
 * @f: printf-style format string
 * @binary: argument data handed to bstr_printf()
 *
 * The text is kept only if it leaves at least one byte of the buffer
 * unused; otherwise the buffer is marked as overflowed.
 */
void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary)
{
        int len;

        if (m->count >= m->size)
                goto overflow;

        len = bstr_printf(m->buf + m->count, m->size - m->count, f,
                          binary);
        if (m->count + len >= m->size)
                goto overflow;

        m->count += len;
        return;

overflow:
        seq_set_overflow(m);
}
EXPORT_SYMBOL(seq_bprintf);
#endif /* CONFIG_BINARY_PRINTF */

/**
 * mangle_path - mangle and copy path to buffer beginning
 * @s: buffer start
 * @p: beginning of path in above buffer
 * @esc: set of characters that need escaping
 *
 * Copies the NUL-terminated path at @p down to @s.  Every character found
 * in @esc is replaced by a backslash followed by its three-digit octal
 * code.  Copying stops with failure as soon as the write position would
 * pass the read position.
 *
 * Return: pointer past the last character written at @s, or NULL on
 * failure.  No terminating NUL is written.
 */
char *mangle_path(char *s, const char *p, const char *esc)
{
        char *out = s;
        const char *in = p;

        while (out <= in) {
                char ch = *in++;

                if (ch == '\0')
                        return out;

                if (strchr(esc, ch) == NULL) {
                        *out++ = ch;
                        continue;
                }

                /* the four-byte escape must not run past the read position */
                if (out + 4 > in)
                        return NULL;

                out[0] = '\\';
                out[1] = '0' + ((ch & 0300) >> 6);
                out[2] = '0' + ((ch & 070) >> 3);
                out[3] = '0' + (ch & 07);
                out += 4;
        }
        return NULL;
}
EXPORT_SYMBOL(mangle_path);

/**
 * seq_path - seq_file interface to print a pathname
 * @m: the seq_file handle
 * @path: the struct path to print
 * @esc: set of characters to escape in the output
 *
 * Builds the path for the dentry / mnt pair in @path with d_path() in the
 * free space of @m, then moves it to the front of that space with
 * mangle_path(), escaping the characters in @esc.
 *
 * Return: the number of bytes committed to @m, or -1 if there was no free
 * space, d_path() failed, or mangle_path() failed.  The same value is passed
 * to seq_commit().
 */
int seq_path(struct seq_file *m, const struct path *path, const char *esc)
{
        char *start, *name, *end;
        size_t room;
        int res = -1;

        room = seq_get_buf(m, &start);
        if (!room)
                goto commit;

        name = d_path(path, start, room);
        if (IS_ERR(name))
                goto commit;

        end = mangle_path(start, name, esc);
        if (end)
                res = end - start;

commit:
        seq_commit(m, res);
        return res;
}
EXPORT_SYMBOL(seq_path);

/**
 * seq_file_path - seq_file interface to print a pathname of a file
 * @m: the seq_file handle
 * @file: the struct file to print
 * @esc: set of characters to escape in the output
 *
 * Prints the path held in @file->f_path by forwarding to seq_path().
 *
 * Return: whatever seq_path() returns.
 */
int seq_file_path(struct seq_file *m, struct file *file, const char *esc)
{
        return seq_path(m, &file->f_path, esc);
}
EXPORT_SYMBOL(seq_file_path);

/*
 * Same as seq_path, but relative to supplied root.
 *
 * Returns SEQ_SKIP (without committing anything) when __d_path() yields
 * NULL.  A -ENAMETOOLONG outcome is committed to @m but reported to the
 * caller as 0, as is success; any other negative error is returned as is.
 */
int seq_path_root(struct seq_file *m, const struct path *path,
                  const struct path *root, const char *esc)
{
        char *start, *name, *end;
        size_t room;
        int res = -ENAMETOOLONG;

        room = seq_get_buf(m, &start);
        if (!room)
                goto commit;

        name = __d_path(path, root, start, room);
        if (!name)
                return SEQ_SKIP;

        if (IS_ERR(name)) {
                res = PTR_ERR(name);
                goto commit;
        }

        end = mangle_path(start, name, esc);
        if (end)
                res = end - start;
        else
                res = -ENAMETOOLONG;

commit:
        seq_commit(m, res);

        if (res >= 0 || res == -ENAMETOOLONG)
                return 0;
        return res;
}

/*
 * Prints the path of 'dentry' from the root of its filesystem, escaping
 * the characters listed in 'esc'.
 *
 * Returns the number of bytes committed to 'm', or -1 if there was no free
 * space, dentry_path() failed, or mangle_path() failed.
 */
int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
{
        char *start, *name, *end;
        size_t room;
        int res = -1;

        room = seq_get_buf(m, &start);
        if (!room)
                goto commit;

        name = dentry_path(dentry, start, room);
        if (IS_ERR(name))
                goto commit;

        end = mangle_path(start, name, esc);
        if (end)
                res = end - start;

commit:
        seq_commit(m, res);
        return res;
}
EXPORT_SYMBOL(seq_dentry);

/*
 * ->start() installed by single_open(): the sequence has one element,
 * SEQ_START_TOKEN, which is only available at position 0.
 */
void *single_start(struct seq_file *p, loff_t *pos)
{
        if (*pos)
                return NULL;

        return SEQ_START_TOKEN;
}

/*
 * ->next() installed by single_open(): advance the position and report
 * that there is no further element.
 */
static void *single_next(struct seq_file *p, void *v, loff_t *pos)
{
        (*pos)++;

        return NULL;
}

/* ->stop() installed by single_open(): there is nothing to undo. */
static void single_stop(struct seq_file *p, void *v)
{
}

int single_open(struct file *file, int (*show)(struct seq_file *, void *),
                void *data)
{
        struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT);
        int res = -ENOMEM;

        if (op) {
                op->start = single_start;
                op->next = single_next;
                op->stop = single_stop;
                op->show = show;
                res = seq_open(file, op);
                if (!res)
                        ((struct seq_file *)file->private_data)->private = data;
                else
                        kfree(op);
        }
        return res;
}
EXPORT_SYMBOL(single_open);

/**
 * single_open_size - single_open() with a preallocated output buffer
 * @file: file being opened
 * @show: callback that prints the whole content
 * @data: stored in the seq_file's ->private field
 * @size: size in bytes of the buffer to preallocate
 *
 * Return: 0 on success, -ENOMEM if the buffer cannot be allocated, or the
 * error from single_open() (the buffer is freed in that case).
 */
int single_open_size(struct file *file, int (*show)(struct seq_file *, void *),
                void *data, size_t size)
{
        struct seq_file *seq;
        char *buf;
        int ret;

        buf = seq_buf_alloc(size);
        if (!buf)
                return -ENOMEM;

        ret = single_open(file, show, data);
        if (ret) {
                kvfree(buf);
                return ret;
        }

        seq = file->private_data;
        seq->buf = buf;
        seq->size = size;
        return 0;
}
EXPORT_SYMBOL(single_open_size);

int single_release(struct inode *inode, struct file *file)
{
        const struct seq_operations *op = ((struct seq_file *)file->private_data)->op;
        int res = seq_release(inode, file);
        kfree(op);
        return res;
}
EXPORT_SYMBOL(single_release);

/**
 * seq_release_private - release a seq_file together with its private data
 * @inode: its inode
 * @file: file in question
 *
 * kfree()s the seq_file's ->private pointer, clears it, and then releases
 * the seq_file itself.
 *
 * Return: the result of seq_release().
 */
int seq_release_private(struct inode *inode, struct file *file)
{
        struct seq_file *m = file->private_data;
        void *priv = m->private;

        m->private = NULL;
        kfree(priv);

        return seq_release(inode, file);
}
EXPORT_SYMBOL(seq_release_private);

/**
 * __seq_open_private - open a seq_file with zeroed private data attached
 * @f: file being opened
 * @ops: iterator method table describing the sequence
 * @psize: size in bytes of the private area to allocate
 *
 * Return: the private area, which is also stored in the seq_file's
 * ->private field, or NULL if either the allocation or seq_open() failed.
 */
void *__seq_open_private(struct file *f, const struct seq_operations *ops,
                int psize)
{
        struct seq_file *seq;
        void *private;

        private = kzalloc(psize, GFP_KERNEL_ACCOUNT);
        if (private == NULL)
                return NULL;

        if (seq_open(f, ops) < 0) {
                kfree(private);
                return NULL;
        }

        seq = f->private_data;
        seq->private = private;

        return private;
}
EXPORT_SYMBOL(__seq_open_private);

/**
 * seq_open_private - open a seq_file with zeroed private data attached
 * @filp: file being opened
 * @ops: iterator method table describing the sequence
 * @psize: size in bytes of the private area to allocate
 *
 * Return: 0 on success, -ENOMEM whenever __seq_open_private() fails.
 */
int seq_open_private(struct file *filp, const struct seq_operations *ops,
                int psize)
{
        if (__seq_open_private(filp, ops, psize))
                return 0;

        return -ENOMEM;
}
EXPORT_SYMBOL(seq_open_private);

/**
 * seq_putc - append a single byte to a seq_file buffer
 * @m: seq_file identifying the buffer to which data should be written
 * @c: the byte to append
 *
 * Does nothing when the buffer is already full.
 */
void seq_putc(struct seq_file *m, char c)
{
        if (m->count < m->size)
                m->buf[m->count++] = c;
}
EXPORT_SYMBOL(seq_putc);

/*
 * Append the NUL-terminated string @s to @m through seq_write().  The
 * length passed is strlen(s), so the terminator itself is not copied.
 */
void __seq_puts(struct seq_file *m, const char *s)
{
        seq_write(m, s, strlen(s));
}
EXPORT_SYMBOL(__seq_puts);

/**
 * seq_put_decimal_ull_width - A helper routine for putting decimal numbers
 *                             without rich format of printf().
 * only 'unsigned long long' is supported.
 * @m: seq_file identifying the buffer to which data should be written
 * @delimiter: a string which is printed before the number
 * @num: the number
 * @width: a minimum field width
 *
 * This routine will put @delimiter followed by the number into the
 * seq_file.  A @width of 0 is treated as 1.  The buffer is marked as
 * overflowed when the output does not fit.
 * This routine is very quick when you show lots of numbers.
 * In usual cases, it will be better to use seq_printf(). It's easier to read.
 */
void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter,
                         unsigned long long num, unsigned int width)
{
        int digits;

        /* we'll write 2 bytes at least */
        if (m->count + 2 >= m->size) {
                seq_set_overflow(m);
                return;
        }

        if (delimiter && delimiter[0] != '\0') {
                if (delimiter[1] != '\0')
                        seq_puts(m, delimiter);
                else
                        seq_putc(m, delimiter[0]);
        }

        if (width == 0)
                width = 1;

        if (m->count + width >= m->size) {
                seq_set_overflow(m);
                return;
        }

        digits = num_to_str(m->buf + m->count, m->size - m->count, num, width);
        if (digits == 0) {
                /* a zero return from num_to_str() is treated as "did not fit" */
                seq_set_overflow(m);
                return;
        }

        m->count += digits;
}

/**
 * seq_put_decimal_ull - put an unsigned decimal number with no minimum width
 * @m: seq_file identifying the buffer to which data should be written
 * @delimiter: a string which is printed before the number
 * @num: the number
 *
 * Forwards to seq_put_decimal_ull_width() with a width of 0.
 */
void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
                         unsigned long long num)
{
        seq_put_decimal_ull_width(m, delimiter, num, 0);
}
EXPORT_SYMBOL(seq_put_decimal_ull);

/**
 * seq_put_hex_ll - put a number in hexadecimal notation
 * @m: seq_file identifying the buffer to which data should be written
 * @delimiter: a string which is printed before the number
 * @v: the number
 * @width: a minimum field width
 *
 * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v)
 *
 * The buffer is marked as overflowed when the digits do not fit.
 * This routine is very quick when you show lots of numbers.
 * In usual cases, it will be better to use seq_printf(). It's easier to read.
 */
void seq_put_hex_ll(struct seq_file *m, const char *delimiter,
                                unsigned long long v, unsigned int width)
{
        unsigned int ndigits = 1;
        int idx;

        if (delimiter && delimiter[0] != '\0') {
                if (delimiter[1] != '\0')
                        seq_puts(m, delimiter);
                else
                        seq_putc(m, delimiter[0]);
        }

        /* __builtin_clzll(0) is undefined, so zero keeps the default of 1 */
        if (v != 0)
                ndigits = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4;

        if (width > ndigits)
                ndigits = width;

        if (m->count + ndigits > m->size) {
                seq_set_overflow(m);
                return;
        }

        /* fill in from the least significant nibble backwards */
        for (idx = ndigits - 1; idx >= 0; idx--) {
                m->buf[m->count + idx] = hex_asc[0xf & v];
                v >>= 4;
        }
        m->count += ndigits;
}

/**
 * seq_put_decimal_ll - put a signed decimal number without printf() overhead
 * @m: seq_file identifying the buffer to which data should be written
 * @delimiter: a string which is printed before the number (may be NULL)
 * @num: the number
 *
 * Writes @delimiter, an optional '-' and the decimal digits of @num.  The
 * buffer is marked as overflowed when the output does not fit.
 */
void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num)
{
        unsigned long long magnitude;
        int len;

        /* give up early unless more than three bytes are free */
        if (m->count + 3 >= m->size)
                goto overflow;

        if (delimiter && delimiter[0]) {
                if (delimiter[1] == 0)
                        seq_putc(m, delimiter[0]);
                else
                        seq_puts(m, delimiter);
        }

        /* the sign and the first digit below are stored without checks */
        if (m->count + 2 >= m->size)
                goto overflow;

        if (num < 0) {
                m->buf[m->count++] = '-';
                /*
                 * Negate in unsigned arithmetic.  -LLONG_MIN does not fit in
                 * a long long, so "num = -num" would leave the value
                 * negative and send it down the single-digit path below.
                 */
                magnitude = -(unsigned long long)num;
        } else {
                magnitude = num;
        }

        if (magnitude < 10) {
                m->buf[m->count++] = magnitude + '0';
                return;
        }

        len = num_to_str(m->buf + m->count, m->size - m->count, magnitude, 0);
        if (!len)        /* zero length is treated as "did not fit" */
                goto overflow;

        m->count += len;
        return;

overflow:
        seq_set_overflow(m);
}
EXPORT_SYMBOL(seq_put_decimal_ll);

/**
 * seq_write - write arbitrary data to buffer
 * @seq: seq_file identifying the buffer to which data should be written
 * @data: data address
 * @len: number of bytes
 *
 * The data is copied only if it leaves at least one byte of the buffer
 * unused; otherwise the buffer is marked as overflowed.
 *
 * Return 0 on success, -1 otherwise.
 */
int seq_write(struct seq_file *seq, const void *data, size_t len)
{
        if (seq->count + len >= seq->size) {
                seq_set_overflow(seq);
                return -1;
        }

        memcpy(seq->buf + seq->count, data, len);
        seq->count += len;
        return 0;
}
EXPORT_SYMBOL(seq_write);

/**
 * seq_pad - write padding spaces to buffer
 * @m: seq_file identifying the buffer to which data should be written
 * @c: the byte to append after padding if non-zero
 *
 * Fills the buffer with spaces up to @m->pad_until.  If the padding does
 * not fit, the buffer is marked as overflowed and @c is not appended.
 */
void seq_pad(struct seq_file *m, char c)
{
        int gap = m->pad_until - m->count;

        if (gap > 0 && gap + m->count > m->size) {
                seq_set_overflow(m);
                return;
        }

        if (gap > 0) {
                memset(m->buf + m->count, ' ', gap);
                m->count += gap;
        }

        if (c != '\0')
                seq_putc(m, c);
}
EXPORT_SYMBOL(seq_pad);

/*
 * A complete analogue of print_hex_dump(), writing into a seq_file.
 *
 * @rowsize is forced to 16 unless it is 16 or 32.  Each row gets the
 * prefix selected by @prefix_type, the bytes formatted by
 * hex_dump_to_buffer() and a trailing newline.  Dumping stops early once
 * the seq_file has overflowed.
 */
void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
                  int rowsize, int groupsize, const void *buf, size_t len,
                  bool ascii)
{
        const u8 *bytes = buf;
        int off, linelen, remaining = len;
        char *dst;
        size_t room;
        int ret;

        if (rowsize != 16 && rowsize != 32)
                rowsize = 16;

        for (off = 0; off < len && !seq_has_overflowed(m); off += rowsize) {
                linelen = min(remaining, rowsize);
                remaining -= rowsize;

                if (prefix_type == DUMP_PREFIX_ADDRESS)
                        seq_printf(m, "%s%p: ", prefix_str, bytes + off);
                else if (prefix_type == DUMP_PREFIX_OFFSET)
                        seq_printf(m, "%s%.8x: ", prefix_str, off);
                else
                        seq_printf(m, "%s", prefix_str);

                /* format the row straight into the seq_file's free space */
                room = seq_get_buf(m, &dst);
                ret = hex_dump_to_buffer(bytes + off, linelen, rowsize,
                                         groupsize, dst, room, ascii);
                if (ret < room)
                        seq_commit(m, ret);
                else
                        seq_commit(m, -1);

                seq_putc(m, '\n');
        }
}
EXPORT_SYMBOL(seq_hex_dump);

/**
 * seq_list_start - find the entry at a given position of a list
 * @head: the head of the list
 * @pos:  zero-based position of the wanted entry
 *
 * Return: the entry at @pos, or NULL if the list has no entry at @pos.
 */
struct list_head *seq_list_start(struct list_head *head, loff_t pos)
{
        struct list_head *entry;

        list_for_each(entry, head) {
                if (pos == 0)
                        return entry;
                pos--;
        }

        return NULL;
}
EXPORT_SYMBOL(seq_list_start);

/**
 * seq_list_start_head - like seq_list_start(), with the head as position 0
 * @head: the head of the list
 * @pos:  the start position of the sequence
 *
 * Return: @head itself for position 0, otherwise the entry at @pos - 1 as
 * found by seq_list_start().
 */
struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
{
        return pos ? seq_list_start(head, pos - 1) : head;
}
EXPORT_SYMBOL(seq_list_start_head);

/**
 * seq_list_next - move to the next position of a list
 * @v:    the current entry
 * @head: the head of the list
 * @ppos: the current position, incremented by one
 *
 * Return: the entry after @v, or NULL when that would be @head.
 */
struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
{
        struct list_head *cur = v;
        struct list_head *following = cur->next;

        (*ppos)++;
        if (following == head)
                return NULL;

        return following;
}
EXPORT_SYMBOL(seq_list_next);

/**
 * seq_list_start_rcu - find the entry at a given position of a list,
 *                      walking it with list_for_each_rcu()
 * @head: the head of the list
 * @pos:  zero-based position of the wanted entry
 *
 * Return: the entry at @pos, or NULL if the list has no entry at @pos.
 */
struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos)
{
        struct list_head *entry;

        list_for_each_rcu(entry, head) {
                if (pos == 0)
                        return entry;
                pos--;
        }

        return NULL;
}
EXPORT_SYMBOL(seq_list_start_rcu);

/**
 * seq_list_start_head_rcu - like seq_list_start_rcu(), with the head as
 *                           position 0
 * @head: the head of the list
 * @pos:  the start position of the sequence
 *
 * Return: @head itself for position 0, otherwise the entry at @pos - 1 as
 * found by seq_list_start_rcu().
 */
struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos)
{
        return pos ? seq_list_start_rcu(head, pos - 1) : head;
}
EXPORT_SYMBOL(seq_list_start_head_rcu);

/**
 * seq_list_next_rcu - move to the next position of a list, reading the
 *                     link with list_next_rcu()
 * @v:    the current entry
 * @head: the head of the list
 * @ppos: the current position, incremented by one
 *
 * Return: the entry after @v, or NULL when that would be @head.
 */
struct list_head *seq_list_next_rcu(void *v, struct list_head *head,
                                    loff_t *ppos)
{
        struct list_head *following;

        following = list_next_rcu((struct list_head *)v);
        (*ppos)++;
        if (following == head)
                return NULL;

        return following;
}
EXPORT_SYMBOL(seq_list_next_rcu);

/**
 * seq_hlist_start - start an iteration of a hlist
 * @head: the head of the hlist
 * @pos:  the start position of the sequence
 *
 * Called at seq_file->op->start().
 *
 * Return: the node at zero-based position @pos, or NULL if the hlist has no
 * node there.
 */
struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
{
        struct hlist_node *cur;

        hlist_for_each(cur, head) {
                if (pos == 0)
                        return cur;
                pos--;
        }
        return NULL;
}
EXPORT_SYMBOL(seq_hlist_start);

/**
 * seq_hlist_start_head - start an iteration of a hlist
 * @head: the head of the hlist
 * @pos:  the start position of the sequence
 *
 * Called at seq_file->op->start(). Call this function if you want to
 * print a header at the top of the output.
 *
 * Return: SEQ_START_TOKEN for position 0, otherwise the node at @pos - 1
 * as found by seq_hlist_start().
 */
struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
{
        return pos ? seq_hlist_start(head, pos - 1) : SEQ_START_TOKEN;
}
EXPORT_SYMBOL(seq_hlist_start_head);

/**
 * seq_hlist_next - move to the next position of the hlist
 * @v:    the current iterator
 * @head: the head of the hlist
 * @ppos: the current position, incremented by one
 *
 * Called at seq_file->op->next().
 *
 * Return: the first node of @head when @v is SEQ_START_TOKEN, otherwise the
 * node following @v.
 */
struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
                                  loff_t *ppos)
{
        struct hlist_node *cur = v;

        (*ppos)++;
        if (v != SEQ_START_TOKEN)
                return cur->next;

        return head->first;
}
EXPORT_SYMBOL(seq_hlist_next);

/**
 * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
 * @head: the head of the hlist
 * @pos:  the start position of the sequence
 *
 * Called at seq_file->op->start().
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * Return: the node at zero-based position @pos, or NULL if the hlist has no
 * node there.
 */
struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
                                       loff_t pos)
{
        struct hlist_node *cur;

        __hlist_for_each_rcu(cur, head) {
                if (pos == 0)
                        return cur;
                pos--;
        }
        return NULL;
}
EXPORT_SYMBOL(seq_hlist_start_rcu);

/**
 * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
 * @head: the head of the hlist
 * @pos:  the start position of the sequence
 *
 * Called at seq_file->op->start(). Call this function if you want to
 * print a header at the top of the output.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * Return: SEQ_START_TOKEN for position 0, otherwise the node at @pos - 1
 * as found by seq_hlist_start_rcu().
 */
struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
                                            loff_t pos)
{
        return pos ? seq_hlist_start_rcu(head, pos - 1) : SEQ_START_TOKEN;
}
EXPORT_SYMBOL(seq_hlist_start_head_rcu);

/**
 * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
 * @v:    the current iterator
 * @head: the head of the hlist
 * @ppos: the current position, incremented by one
 *
 * Called at seq_file->op->next().
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * Return: the first node of @head when @v is SEQ_START_TOKEN, otherwise the
 * node following @v.
 */
struct hlist_node *seq_hlist_next_rcu(void *v,
                                      struct hlist_head *head,
                                      loff_t *ppos)
{
        struct hlist_node *cur = v;

        (*ppos)++;
        if (v != SEQ_START_TOKEN)
                return rcu_dereference(cur->next);

        return rcu_dereference(head->first);
}
EXPORT_SYMBOL(seq_hlist_next_rcu);

/**
 * seq_hlist_start_percpu - start an iteration of a percpu hlist array
 * @head: pointer to percpu array of struct hlist_heads
 * @cpu:  pointer to cpu "cursor"
 * @pos:  start position of sequence
 *
 * Called at seq_file->op->start().
 *
 * Positions are counted across the hlists of all possible CPUs in
 * iteration order.
 *
 * Return: the node at @pos with @cpu left at the CPU whose hlist holds it,
 * or NULL if there is no such node.
 */
struct hlist_node *
seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos)
{
        struct hlist_node *cur;

        for_each_possible_cpu(*cpu) {
                hlist_for_each(cur, per_cpu_ptr(head, *cpu)) {
                        if (pos == 0)
                                return cur;
                        pos--;
                }
        }
        return NULL;
}
EXPORT_SYMBOL(seq_hlist_start_percpu);

/**
 * seq_hlist_next_percpu - move to the next position of the percpu hlist array
 * @v:    pointer to current hlist_node
 * @head: pointer to percpu array of struct hlist_heads
 * @cpu:  pointer to cpu "cursor"; advanced when the current list is exhausted
 * @pos:  pointer to the current position, incremented by one on every call
 *
 * Called at seq_file->op->next(). Returns the successor of @v on the same
 * list if there is one, otherwise the first node of the next possible cpu
 * whose list is not empty, or NULL when no such cpu remains.
 */
struct hlist_node *
seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
                        int *cpu, loff_t *pos)
{
        struct hlist_node *cur = v;
        struct hlist_head *bucket;

        *pos += 1;

        if (cur->next)
                return cur->next;

        /* Current list is done: scan the remaining possible cpus. */
        *cpu = cpumask_next(*cpu, cpu_possible_mask);
        while (*cpu < nr_cpu_ids) {
                bucket = per_cpu_ptr(head, *cpu);
                if (!hlist_empty(bucket))
                        return bucket->first;

                *cpu = cpumask_next(*cpu, cpu_possible_mask);
        }
        return NULL;
}
EXPORT_SYMBOL(seq_hlist_next_percpu);

/* Init-time setup: create the cache for struct seq_file and store it in seq_file_cache. */
void __init seq_file_init(void)
{
        seq_file_cache = KMEM_CACHE(seq_file, SLAB_PANIC | SLAB_ACCOUNT);
}






















































    1 











































































































































    1 








    1 














    1 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 * TODO: try to use extents tree (instead of array)
 */

#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/log2.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"

/* runs_tree is a contiguous memory block. Try to avoid big sizes. */
#define NTFS3_RUN_MAX_BYTES 0x10000

/*
 * One element of the runs_tree array: @len clusters starting at virtual
 * cluster @vcn. @lcn is either the first logical cluster of the run or
 * SPARSE_LCN, which the code in this file treats as "sparse run".
 */
struct ntfs_run {
        CLST vcn; /* Virtual cluster number. */
        CLST len; /* Length in clusters. */
        CLST lcn; /* Logical cluster number. */
};

/*
 * run_lookup - Binary-search the run array for the entry containing @vcn.
 *
 * Return: true if some entry covers @vcn; @index then holds its position.
 * Otherwise false, and @index holds the position at which an entry for
 * @vcn would have to be inserted.
 */
static bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *index)
{
        size_t lo, hi, mid;
        struct ntfs_run *first, *last, *cur;

        if (!run->count) {
                *index = 0;
                return false;
        }

        /*
         * The first and the last entries are probed up front,
         * since they answer the most frequent requests.
         */
        first = run->runs;
        if (vcn < first->vcn) {
                *index = 0;
                return false;
        }

        if (vcn < first->vcn + first->len) {
                *index = 0;
                return true;
        }

        hi = run->count - 1;
        last = first + hi;
        if (vcn >= last->vcn + last->len) {
                *index = run->count;
                return false;
        }

        if (vcn >= last->vcn) {
                *index = hi;
                return true;
        }

        lo = 0;
        do {
                mid = lo + ((hi - lo) >> 1);
                cur = run->runs + mid;

                if (vcn < cur->vcn) {
                        /* May wrap when mid is 0; 'hi + 1' below undoes it. */
                        hi = mid - 1;
                        if (!mid)
                                break;
                } else if (vcn >= cur->vcn + cur->len) {
                        lo = mid + 1;
                } else {
                        *index = mid;
                        return true;
                }
        } while (lo <= hi);

        *index = hi + 1;
        return false;
}

/*
 * run_consolidate - Consolidate runs starting from a given one.
 *
 * Starting at @index, repeatedly looks at the current run and its
 * successor: an overlapping successor is trimmed (or dropped if it is
 * fully covered), and a successor that continues the current run both
 * in vcn and in lcn (or both being sparse) is merged into it.
 * Stops at the first gap in vcn or at the first lcn discontinuity.
 */
static void run_consolidate(struct runs_tree *run, size_t index)
{
        size_t i;
        struct ntfs_run *r = run->runs + index;

        while (index + 1 < run->count) {
                /*
                 * The current run is merged with the next one
                 * if the start of the next run lies inside the current one
                 * (or immediately follows it).
                 */
                struct ntfs_run *n = r + 1;
                CLST end = r->vcn + r->len;
                CLST dl;

                /* Stop if there is a gap between the two runs. */
                if (n->vcn > end)
                        break;

                /* Number of clusters of the next run covered by the current one. */
                dl = end - n->vcn;

                /*
                 * If the range at index overlaps with the next one,
                 * then either adjust the start position of the next one
                 * or (if it is completely covered) just remove it from the list.
                 */
                if (dl > 0) {
                        if (n->len <= dl)
                                goto remove_next_range;

                        n->len -= dl;
                        n->vcn += dl;
                        if (n->lcn != SPARSE_LCN)
                                n->lcn += dl;
                        dl = 0;
                }

                /*
                 * Sparse and non-sparse runs cannot be joined:
                 * move on and continue consolidating from the next run.
                 */
                if ((n->lcn == SPARSE_LCN) != (r->lcn == SPARSE_LCN)) {
                        index += 1;
                        r = n;
                        continue;
                }

                /*
                 * Stop if the first volume block of the next run
                 * does not directly follow the last volume block
                 * of the current run.
                 */
                if (n->lcn != SPARSE_LCN && n->lcn != r->lcn + r->len)
                        break;

                /*
                 * Next and current are siblings.
                 * Eat/join. (dl is always 0 at this point.)
                 */
                r->len += n->len - dl;

remove_next_range:
                /* i = number of entries from 'n' to the end of the array. */
                i = run->count - (index + 1);
                if (i > 1)
                        memmove(n, n + 1, sizeof(*n) * (i - 1));

                run->count -= 1;
        }
}

/*
 * run_is_mapped_full
 *
 * Starting from the entry that contains @svcn, follows entries that are
 * adjacent in vcn until one of them ends beyond @evcn.
 *
 * Return: True if range [svcn - evcn] is mapped.
 */
bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn)
{
        size_t idx;
        const struct ntfs_run *cur, *limit;
        CLST expect;

        if (!run_lookup(run, svcn, &idx))
                return false;

        limit = run->runs + run->count;
        cur = run->runs + idx;
        expect = cur->vcn + cur->len;

        while (expect <= evcn) {
                /* The following entry must exist and start exactly at 'expect'. */
                cur++;
                if (cur >= limit || cur->vcn != expect)
                        return false;

                expect = cur->vcn + cur->len;
        }

        return true;
}

/*
 * run_lookup_entry - Translate @vcn through the run array.
 *
 * On success stores in @lcn the logical cluster matching @vcn (SPARSE_LCN
 * for a sparse run), in optional @len the number of clusters left in the
 * run starting from @vcn, and in optional @index the position of the run.
 *
 * Return: true if @vcn is covered by an entry.
 */
bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn,
                      CLST *len, size_t *index)
{
        size_t pos;
        CLST off;
        const struct ntfs_run *hit;

        /* Nothing to look at if the array was never allocated. */
        if (!run->runs || !run_lookup(run, vcn, &pos))
                return false;

        hit = run->runs + pos;
        if (vcn >= hit->vcn + hit->len)
                return false;

        /* Offset of vcn inside the run. */
        off = vcn - hit->vcn;
        if (off >= hit->len)
                return false;

        if (hit->lcn == SPARSE_LCN)
                *lcn = SPARSE_LCN;
        else
                *lcn = hit->lcn + off;

        if (len)
                *len = hit->len - off;
        if (index)
                *index = pos;

        return true;
}

/*
 * run_truncate_head - Decommit the range before vcn.
 *
 * The run containing @vcn (if any) is trimmed so that it starts at @vcn,
 * all entries in front of it are dropped, and the array is freed
 * once no entries remain.
 */
void run_truncate_head(struct runs_tree *run, CLST vcn)
{
        size_t drop;
        struct ntfs_run *base;

        if (run_lookup(run, vcn, &drop)) {
                struct ntfs_run *hit = run->runs + drop;

                /* Cut off the part of the run that lies before vcn. */
                if (vcn > hit->vcn) {
                        CLST cut = vcn - hit->vcn;

                        hit->len -= cut;
                        hit->vcn = vcn;
                        if (hit->lcn != SPARSE_LCN)
                                hit->lcn += cut;
                }

                /* The first entry has no predecessors to drop. */
                if (drop == 0)
                        return;
        }

        /* Shift the surviving entries to the start of the array. */
        base = run->runs;
        memmove(base, base + drop, (run->count - drop) * sizeof(*base));
        run->count -= drop;

        if (run->count)
                return;

        kvfree(run->runs);
        run->runs = NULL;
        run->allocated = 0;
}

/*
 * run_truncate - Decommit the range after vcn.
 *
 * The run containing @vcn (if any) is shortened to end at @vcn and every
 * entry behind it is discarded. The array is never shrunk; it is only
 * freed when no entries are left.
 */
void run_truncate(struct runs_tree *run, CLST vcn)
{
        size_t keep;

        if (run_lookup(run, vcn, &keep)) {
                struct ntfs_run *hit = run->runs + keep;

                /*
                 * Shorten the run that was hit. It stays in the array
                 * only if something is left of it.
                 */
                hit->len = vcn - hit->vcn;
                if (hit->len)
                        keep++;
        }

        /* 'keep' is now the number of leading entries that survive. */
        run->count = keep;
        if (keep)
                return;

        kvfree(run->runs);
        run->runs = NULL;
        run->allocated = 0;
}

/*
 * run_truncate_around - Trim head and tail if necessary.
 *
 * Everything before @vcn is always dropped. If the array still holds at
 * least half of the entries that fit into NTFS3_RUN_MAX_BYTES, its second
 * half is dropped as well.
 */
void run_truncate_around(struct runs_tree *run, CLST vcn)
{
        const size_t limit = NTFS3_RUN_MAX_BYTES / sizeof(struct ntfs_run) / 2;

        run_truncate_head(run, vcn);

        if (run->count < limit)
                return;

        run_truncate(run, run->runs[run->count >> 1].vcn);
}

/*
 * run_add_entry - Record that [vcn, vcn + len) maps to lcn.
 *
 * Sets location to known state.
 * Run to be added may overlap with existing location.
 *
 * @run:    Array of runs to update.
 * @vcn:    First virtual cluster of the new run.
 * @lcn:    First logical cluster of the new run, or SPARSE_LCN.
 * @len:    Length of the new run in clusters.
 * @is_mft: When true, the warning about the array growing beyond
 *          NTFS3_RUN_MAX_BYTES is suppressed.
 *
 * Return: false if out of memory (including a failure while re-adding
 * the tail of a split run), true otherwise.
 */
bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len,
                   bool is_mft)
{
        size_t used, index;
        struct ntfs_run *r;
        bool inrange;
        CLST tail_vcn = 0, tail_len = 0, tail_lcn = 0;
        bool should_add_tail = false;

        /*
         * Lookup the insertion point.
         *
         * Execute bsearch for the entry containing
         * the start position in question.
         */
        inrange = run_lookup(run, vcn, &index);

        /*
         * Shortcut: the range was not found, but the one being added
         * continues the previous run (adjacent in vcn and in lcn,
         * or both sparse).
         * In this case the previous run can be used directly
         * as the start point.
         */
        if (!inrange && index > 0) {
                struct ntfs_run *t = run->runs + index - 1;

                if (t->vcn + t->len == vcn &&
                    (t->lcn == SPARSE_LCN) == (lcn == SPARSE_LCN) &&
                    (lcn == SPARSE_LCN || lcn == t->lcn + t->len)) {
                        inrange = true;
                        index -= 1;
                }
        }

        /*
         * At this point 'index' either points to the range
         * containing start position or to the insertion position
         * for a new range.
         * So first let's check if the range being probed is here already.
         */
        if (!inrange) {
requires_new_range:
                /*
                 * Range was not found (or an existing range was split
                 * by the code below, which jumps here).
                 * Insert at position 'index'.
                 */
                used = run->count * sizeof(struct ntfs_run);

                /*
                 * Check allocated space.
                 * If it is not enough to hold one more entry
                 * then the array is reallocated.
                 */
                if (run->allocated < used + sizeof(struct ntfs_run)) {
                        size_t bytes;
                        struct ntfs_run *new_ptr;

                        /* Use power of 2 for 'bytes'. */
                        if (!used) {
                                bytes = 64;
                        } else if (used <= 16 * PAGE_SIZE) {
                                if (is_power_of_2(run->allocated))
                                        bytes = run->allocated << 1;
                                else
                                        bytes = (size_t)1
                                                << (2 + blksize_bits(used));
                        } else {
                                /* Large arrays grow linearly, 16 pages at a time. */
                                bytes = run->allocated + (16 * PAGE_SIZE);
                        }

                        WARN_ON(!is_mft && bytes > NTFS3_RUN_MAX_BYTES);

                        new_ptr = kvmalloc(bytes, GFP_KERNEL);

                        if (!new_ptr)
                                return false;

                        /*
                         * Copy the old entries around the slot at 'index',
                         * which is left free for the new entry.
                         */
                        r = new_ptr + index;
                        memcpy(new_ptr, run->runs,
                               index * sizeof(struct ntfs_run));
                        memcpy(r + 1, run->runs + index,
                               sizeof(struct ntfs_run) * (run->count - index));

                        kvfree(run->runs);
                        run->runs = new_ptr;
                        run->allocated = bytes;

                } else {
                        size_t i = run->count - index;

                        r = run->runs + index;

                        /* memmove appears to be a bottleneck here... */
                        if (i > 0)
                                memmove(r + 1, r, sizeof(struct ntfs_run) * i);
                }

                r->vcn = vcn;
                r->lcn = lcn;
                r->len = len;
                run->count += 1;
        } else {
                r = run->runs + index;

                /*
                 * If the sparse state or the lcn of the new range does not
                 * match the range that was found, then the found range
                 * has to be split and the new one inserted.
                 * In the common case this requires the tail to be
                 * reinserted by a recursive call.
                 */
                if (((lcn == SPARSE_LCN) != (r->lcn == SPARSE_LCN)) ||
                    (lcn != SPARSE_LCN && lcn != r->lcn + (vcn - r->vcn))) {
                        /* Clusters of 'r' that stay in front of the new range. */
                        CLST to_eat = vcn - r->vcn;
                        /* Offset inside 'r' at which the new range ends. */
                        CLST Tovcn = to_eat + len;

                        should_add_tail = Tovcn < r->len;

                        if (should_add_tail) {
                                tail_lcn = r->lcn == SPARSE_LCN ?
                                                   SPARSE_LCN :
                                                   (r->lcn + Tovcn);
                                tail_vcn = r->vcn + Tovcn;
                                tail_len = r->len - Tovcn;
                        }

                        if (to_eat > 0) {
                                /* Keep the head of 'r' and insert the new range after it. */
                                r->len = to_eat;
                                inrange = false;
                                index += 1;
                                goto requires_new_range;
                        }

                        /* lcn should match the one we are going to add. */
                        r->lcn = lcn;
                }

                /*
                 * If the existing range fits then we are done.
                 * Otherwise extend the found one and fall back
                 * to the range-join code below.
                 */
                if (r->vcn + r->len < vcn + len)
                        r->len += len - ((r->vcn + r->len) - vcn);
        }

        /*
         * And normalize it starting from the insertion point.
         * It's possible that no insertion was needed, in case the
         * start point lies within the range of the entry
         * that 'index' points to.
         */
        if (inrange && index > 0)
                index -= 1;
        run_consolidate(run, index);
        run_consolidate(run, index + 1);

        /*
         * A special case.
         * The tail of a split range has to be added as an extra range.
         */
        if (should_add_tail &&
            !run_add_entry(run, tail_vcn, tail_lcn, tail_len, is_mft))
                return false;

        return true;
}

/* run_collapse_range
 *
 * Helper for attr_collapse_range(),
 * which is helper for fallocate(collapse_range).
 *
 * Removes [vcn, vcn + len) from the array and shifts the vcn of every
 * later run down by @len.
 *
 * Return: false only if run_add_entry() fails while splitting a run;
 * true otherwise, including the (unexpected) case that @vcn is not mapped.
 */
bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len)
{
        size_t index, eat;
        struct ntfs_run *r, *e, *eat_start, *eat_end;
        CLST end;

        if (WARN_ON(!run_lookup(run, vcn, &index)))
                return true; /* Should never be here. */

        e = run->runs + run->count;
        r = run->runs + index;
        end = vcn + len;

        if (vcn > r->vcn) {
                if (r->vcn + r->len <= end) {
                        /* Collapse tail of run. */
                        r->len = vcn - r->vcn;
                } else if (r->lcn == SPARSE_LCN) {
                        /* Collapse a middle part of sparsed run. */
                        r->len -= len;
                } else {
                        /*
                         * Collapse a middle part of normal run, split:
                         * mark the range as sparse first, then retry.
                         */
                        if (!run_add_entry(run, vcn, SPARSE_LCN, len, false))
                                return false;
                        return run_collapse_range(run, vcn, len);
                }

                r += 1;
        }

        /* [eat_start, eat_end) collects the runs that disappear completely. */
        eat_start = r;
        eat_end = r;

        for (; r < e; r++) {
                CLST d;

                if (r->vcn >= end) {
                        /* Run lies entirely behind the collapsed range: shift it. */
                        r->vcn -= len;
                        continue;
                }

                if (r->vcn + r->len <= end) {
                        /* Eat this run. */
                        eat_end = r + 1;
                        continue;
                }

                /* Run straddles 'end': drop its first 'd' clusters and shift the rest. */
                d = end - r->vcn;
                if (r->lcn != SPARSE_LCN)
                        r->lcn += d;
                r->len -= d;
                r->vcn -= len - d;
        }

        /* Close the hole left by the eaten runs. */
        eat = eat_end - eat_start;
        memmove(eat_start, eat_end, (e - eat_end) * sizeof(*r));
        run->count -= eat;

        return true;
}

/* run_insert_range
 *
 * Helper for attr_insert_range(),
 * which is helper for fallocate(insert_range).
 *
 * Shifts every run that starts at or after @vcn up by @len, splits the
 * run containing @vcn if @vcn falls inside it, and records a sparse run
 * of @len clusters at @vcn.
 *
 * Return: false if @vcn is not mapped or run_add_entry() fails.
 */
bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len)
{
        size_t pos, i;
        struct ntfs_run *hit;

        if (WARN_ON(!run_lookup(run, vcn, &pos)))
                return false; /* Should never be here. */

        hit = run->runs + pos;

        /* A run that begins before vcn keeps its own start. */
        i = pos;
        if (vcn > hit->vcn)
                i++;

        for (; i < run->count; i++)
                run->runs[i].vcn += len;

        if (vcn > hit->vcn) {
                /* Split fragment: the head stays in place, the tail moves behind the hole. */
                CLST head_len = vcn - hit->vcn;
                CLST tail_len = hit->len - head_len;
                CLST tail_lcn = hit->lcn == SPARSE_LCN ? SPARSE_LCN :
                                                         (hit->lcn + head_len);

                hit->len = head_len;

                if (!run_add_entry(run, vcn + len, tail_lcn, tail_len, false))
                        return false;
        }

        return run_add_entry(run, vcn, SPARSE_LCN, len, false);
}

/*
 * run_get_entry - Return index-th mapped region.
 *
 * Each of @vcn, @lcn and @len may be NULL if the caller is not interested
 * in that value.
 *
 * Return: false if @index is out of range or the entry has zero length.
 */
bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn,
                   CLST *lcn, CLST *len)
{
        const struct ntfs_run *ent;

        if (index >= run->count || !run->runs[index].len)
                return false;

        ent = &run->runs[index];

        if (vcn)
                *vcn = ent->vcn;
        if (lcn)
                *lcn = ent->lcn;
        if (len)
                *len = ent->len;

        return true;
}

/*
 * run_packed_size - Calculate the size of packed int64.
 */
#ifdef __BIG_ENDIAN
/*
 * Big-endian variant: the least significant byte of @n is the last one in
 * memory. Returns the number of low-order bytes (1..8) needed to keep @n
 * as a signed value.
 */
static inline int run_packed_size(const s64 n)
{
        const u8 *lsb = (const u8 *)&n + sizeof(n) - 1;
        /* Value of a byte that carries nothing but sign extension. */
        const u8 fill = n < 0 ? 0xff : 0x00;
        int top = 7;

        /* Find the most significant byte that is not pure sign extension. */
        while (top > 0 && lsb[-top] == fill)
                top--;

        /* One more byte is needed if its top bit contradicts the sign. */
        if ((lsb[-top] ^ fill) & 0x80)
                top++;

        return top + 1;
}

/*
 * Big-endian variant: store the @size low-order bytes of @v into @run_buf,
 * least significant byte first.
 * Fully trusted: 'size' is not validated; values outside 1..8 store nothing.
 */
static inline void run_pack_s64(u8 *run_buf, u8 size, s64 v)
{
        const u8 *src = (u8 *)&v;
        unsigned int i;

        if (size > 8)
                return;

        /* src[7] is the least significant byte on big-endian. */
        for (i = 0; i < size; i++)
                run_buf[i] = src[7 - i];
}

/*
 * Big-endian variant: overwrite the @size low-order bytes of @v with the
 * bytes of @run_buf (least significant byte first) and return the result.
 * The remaining bytes keep the value passed in @v.
 * Fully trusted: 'size' is not validated; values outside 1..8 change nothing.
 */
static inline s64 run_unpack_s64(const u8 *run_buf, u8 size, s64 v)
{
        u8 *dst = (u8 *)&v;
        unsigned int i;

        if (size <= 8) {
                /* dst[7] is the least significant byte on big-endian. */
                for (i = 0; i < size; i++)
                        dst[7 - i] = run_buf[i];
        }

        return v;
}

#else

/*
 * Little-endian variant: the least significant byte of @n is the first one
 * in memory. Returns the number of low-order bytes (1..8) needed to keep
 * @n as a signed value.
 */
static inline int run_packed_size(const s64 n)
{
        const u8 *bytes = (const u8 *)&n;
        /* Value of a byte that carries nothing but sign extension. */
        const u8 fill = n < 0 ? 0xff : 0x00;
        int top = 7;

        /* Find the most significant byte that is not pure sign extension. */
        while (top > 0 && bytes[top] == fill)
                top--;

        /* One more byte is needed if its top bit contradicts the sign. */
        if ((bytes[top] ^ fill) & 0x80)
                top++;

        return top + 1;
}

/*
 * Little-endian variant: store the @size low-order bytes of @v into
 * @run_buf, least significant byte first.
 * Fully trusted: 'size' is not validated; values outside 1..8 store nothing.
 */
static inline void run_pack_s64(u8 *run_buf, u8 size, s64 v)
{
        const u8 *src = (u8 *)&v;
        unsigned int i;

        if (size > 8)
                return;

        /* Memory order already is least significant byte first. */
        for (i = 0; i < size; i++)
                run_buf[i] = src[i];
}

/*
 * Little-endian variant: overwrite the @size low-order bytes of @v with
 * the bytes of @run_buf (least significant byte first) and return the
 * result. The remaining bytes keep the value passed in @v.
 * Fully trusted: 'size' is not validated; values outside 1..8 change nothing.
 */
static inline s64 run_unpack_s64(const u8 *run_buf, u8 size, s64 v)
{
        u8 *dst = (u8 *)&v;
        unsigned int i;

        if (size <= 8) {
                /* Memory order already is least significant byte first. */
                for (i = 0; i < size; i++)
                        dst[i] = run_buf[i];
        }

        return v;
}
#endif

/*
 * run_pack - Pack runs into buffer.
 *
 * @run:          Source run tree.
 * @svcn:         First VCN to pack.
 * @len:          Number of VCNs to pack, i.e. the range is [svcn, svcn + len).
 * @run_buf:      Destination buffer. May be NULL, in which case nothing is
 *                written and only the sizes are computed.
 * @run_buf_size: Capacity used for the "does it still fit" checks.
 * @packed_vcns:  Out: total number of VCNs (sum of the packed run lengths)
 *                that were actually packed. May be less than @len when the
 *                buffer is too small.
 *
 * Each packed run is a header byte (low nibble: byte count of the length,
 * high nibble: byte count of the LCN delta), followed by the length and
 * then the delta from the previously packed LCN. Sparse runs carry no
 * delta. The sequence is terminated by a zero byte.
 *
 * Return: -ENOENT if the range [svcn, svcn + len) is not fully covered by
 * contiguous entries of @run, otherwise the number of bytes used including
 * the terminating zero.
 */
int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf,
             u32 run_buf_size, CLST *packed_vcns)
{
        CLST next_vcn, vcn, lcn;
        CLST prev_lcn = 0;
        CLST evcn1 = svcn + len;
        const struct ntfs_run *r, *r_end;
        int packed_size = 0;
        size_t i;
        s64 dlcn;
        int offset_size, size_size, tmp;

        *packed_vcns = 0;

        /* Nothing to pack: emit just the terminator. */
        if (!len)
                goto out;

        /* Check all required entries [svcn, evcn1) available. */
        if (!run_lookup(run, svcn, &i))
                return -ENOENT;

        r_end = run->runs + run->count;
        r = run->runs + i;

        /* Walk forward: every following entry must start where the previous ended. */
        for (next_vcn = r->vcn + r->len; next_vcn < evcn1;
             next_vcn = r->vcn + r->len) {
                if (++r >= r_end || r->vcn != next_vcn)
                        return -ENOENT;
        }

        /* Repeat cycle above and pack runs. Assume no errors. */
        r = run->runs + i;
        /* 'len' is reused: first as the offset of svcn inside the first entry ... */
        len = svcn - r->vcn;
        vcn = svcn;
        lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len);
        /* ... then as the remaining length of that first entry. */
        len = r->len - len;

        for (;;) {
                next_vcn = vcn + len;
                /* Trim the last run so it does not go past evcn1. */
                if (next_vcn > evcn1)
                        len = evcn1 - vcn;

                /* How much bytes required to pack len. */
                size_size = run_packed_size(len);

                /* offset_size - How much bytes is packed dlcn. */
                if (lcn == SPARSE_LCN) {
                        offset_size = 0;
                        dlcn = 0;
                } else {
                        /* NOTE: lcn can be less than prev_lcn! */
                        dlcn = (s64)lcn - prev_lcn;
                        offset_size = run_packed_size(dlcn);
                        prev_lcn = lcn;
                }

                /*
                 * Space left for the length field.
                 * NOTE(review): the '2' presumably reserves the header byte
                 * and the terminating zero - confirm.
                 */
                tmp = run_buf_size - packed_size - 2 - offset_size;
                if (tmp <= 0)
                        goto out;

                /* Can we store this entire run. */
                if (tmp < size_size)
                        goto out;

                if (run_buf) {
                        /* Pack run header. */
                        run_buf[0] = ((u8)(size_size | (offset_size << 4)));
                        run_buf += 1;

                        /* Pack the length of run. */
                        run_pack_s64(run_buf, size_size, len);

                        run_buf += size_size;
                        /* Pack the offset from previous LCN. */
                        run_pack_s64(run_buf, offset_size, dlcn);
                        run_buf += offset_size;
                }

                packed_size += 1 + offset_size + size_size;
                *packed_vcns += len;

                /* Stop when the buffer is exhausted or the whole range is packed. */
                if (packed_size + 1 >= run_buf_size || next_vcn >= evcn1)
                        goto out;

                /* Safe: the contiguity walk above proved this entry exists. */
                r += 1;
                vcn = r->vcn;
                lcn = r->lcn;
                len = r->len;
        }

out:
        /* Store last zero. */
        if (run_buf)
                run_buf[0] = 0;

        /* +1 accounts for the terminating zero byte. */
        return packed_size + 1;
}

/*
 * run_unpack - Unpack packed runs from @run_buf.
 *
 * @run:          Destination run tree. NULL only validates the packed data;
 *                RUN_DEALLOCATE frees the described clusters instead of
 *                storing them.
 * @sbi:          Filesystem info; its cluster bitmap size bounds valid LCNs.
 * @ino:          Inode number; only used to tell run_add_entry() whether
 *                this is MFT_REC_MFT.
 * @svcn:         First VCN described by the packed data.
 * @evcn:         Last VCN described by the packed data (inclusive).
 * @vcn:          Only the part of the mapping at or above @vcn is stored
 *                in @run.
 * @run_buf:      Packed runs (untrusted, comes from disk).
 * @run_buf_size: Number of valid bytes in @run_buf.
 *
 * Return: Error if negative, or real used bytes.
 *   -EINVAL      malformed data: a field crosses the end of the buffer,
 *                a zero length or zero delta, arithmetic overflow, a run
 *                outside the volume or the wrong total length.
 *   -EOPNOTSUPP  64-bit run on a build without CONFIG_NTFS3_64BIT_CLUSTER.
 *   -ENOMEM      run_add_entry() failed.
 */
int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
               CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
               int run_buf_size)
{
        u64 prev_lcn, vcn64, lcn, next_vcn;
        const u8 *run_last, *run_0;
        bool is_mft = ino == MFT_REC_MFT;

        if (run_buf_size < 0)
                return -EINVAL;

        /* Check for empty. */
        if (evcn + 1 == svcn)
                return 0;

        if (evcn < svcn)
                return -EINVAL;

        run_0 = run_buf;
        run_last = run_buf + run_buf_size;
        prev_lcn = 0;
        vcn64 = svcn;

        /* Read all runs the chain. */
        while (run_buf < run_last) {
                /* size_size - How much bytes is packed len. */
                u8 size_size = *run_buf & 0xF;
                /* offset_size - How much bytes is packed dlcn. */
                u8 offset_size = *run_buf++ >> 4;
                u64 len;

                /* A zero header byte terminates the chain. */
                if (!size_size)
                        break;

                /*
                 * Unpack runs.
                 * NOTE: Runs are stored little endian order
                 * "len" is unsigned value, "dlcn" is signed.
                 * Large positive number requires to store 5 bytes
                 * e.g.: 05 FF 7E FF FF 00 00 00
                 */
                if (size_size > 8)
                        return -EINVAL;

                /* The length field must lie entirely inside the buffer. */
                if (size_size > run_last - run_buf)
                        return -EINVAL;

                len = run_unpack_s64(run_buf, size_size, 0);
                /* Skip size_size. */
                run_buf += size_size;

                if (!len)
                        return -EINVAL;

                if (!offset_size) {
                        lcn = SPARSE_LCN64;
                } else if (offset_size <= 8) {
                        s64 dlcn;

                        /* The delta field must lie inside the buffer too. */
                        if (offset_size > run_last - run_buf)
                                return -EINVAL;

                        /* Initial value of dlcn is -1 or 0. */
                        dlcn = (run_buf[offset_size - 1] & 0x80) ? (s64)-1 : 0;
                        dlcn = run_unpack_s64(run_buf, offset_size, dlcn);
                        /* Skip offset_size. */
                        run_buf += offset_size;

                        if (!dlcn)
                                return -EINVAL;
                        lcn = prev_lcn + dlcn;
                        prev_lcn = lcn;
                } else {
                        return -EINVAL;
                }

                next_vcn = vcn64 + len;
                /* Check boundary; a wrapped sum is never a valid VCN. */
                if (next_vcn < vcn64 || next_vcn > evcn + 1)
                        return -EINVAL;

#ifndef CONFIG_NTFS3_64BIT_CLUSTER
                if (next_vcn > 0x100000000ull || (lcn + len) > 0x100000000ull) {
                        ntfs_err(
                                sbi->sb,
                                "This driver is compiled without CONFIG_NTFS3_64BIT_CLUSTER (like windows driver).\n"
                                "Volume contains 64 bits run: vcn %llx, lcn %llx, len %llx.\n"
                                "Activate CONFIG_NTFS3_64BIT_CLUSTER to process this case",
                                vcn64, lcn, len);
                        return -EOPNOTSUPP;
                }
#endif
                /*
                 * LCN range must be inside the volume. 'lcn + len' may wrap
                 * around (e.g. after a negative delta larger than prev_lcn),
                 * so reject the wrapped case explicitly instead of letting a
                 * small wrapped sum pass the bitmap-size comparison.
                 */
                if (lcn != SPARSE_LCN64 &&
                    (lcn + len < lcn || lcn + len > sbi->used.bitmap.nbits))
                        return -EINVAL;

                if (!run)
                        ; /* Called from check_attr(fslog.c) to check run. */
                else if (run == RUN_DEALLOCATE) {
                        /*
                         * Called from ni_delete_all to free clusters
                         * without storing in run.
                         */
                        if (lcn != SPARSE_LCN64)
                                mark_as_free_ex(sbi, lcn, len, true);
                } else if (vcn64 >= vcn) {
                        /* Run lies fully at or above the requested start. */
                        if (!run_add_entry(run, vcn64, lcn, len, is_mft))
                                return -ENOMEM;
                } else if (next_vcn > vcn) {
                        /* Run straddles @vcn: store only the tail part. */
                        u64 dlen = vcn - vcn64;

                        if (!run_add_entry(run, vcn, lcn + dlen, len - dlen,
                                           is_mft))
                                return -ENOMEM;
                }

                vcn64 = next_vcn;
        }

        if (vcn64 != evcn + 1) {
                /* Not expected length of unpacked runs. */
                return -EINVAL;
        }

        return run_buf - run_0;
}

#ifdef NTFS3_CHECK_FREE_CLST
/*
 * run_unpack_ex - Unpack packed runs from "run_buf" and cross-check them.
 *
 * Works like run_unpack(), then walks the unpacked range [svcn, evcn] and
 * verifies that every non-sparse run is marked as used in the cluster
 * bitmap. A run that is not fully used flags the volume as dirty and, if
 * the bitmap lock can be taken without waiting, gets marked as used.
 *
 * The check is skipped when the bitmap is not set up, when @run is NULL or
 * RUN_DEALLOCATE, and for MFT_REC_BADCLUST.
 *
 * Return: Error if negative, or real used bytes.
 */
int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
                  CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
                  int run_buf_size)
{
        struct wnd_bitmap *wnd = &sbi->used.bitmap;
        CLST expect_vcn, lcn, len;
        size_t idx;
        bool found;
        int unpacked;

        unpacked = run_unpack(run, sbi, ino, svcn, evcn, vcn, run_buf,
                              run_buf_size);
        if (unpacked <= 0)
                return unpacked;

        if (!wnd->sb || !run || run == RUN_DEALLOCATE)
                return unpacked;

        if (ino == MFT_REC_BADCLUST)
                return unpacked;

        /* From here on @vcn is reused as the cursor of the walk. */
        vcn = svcn;
        expect_vcn = svcn;
        found = run_lookup_entry(run, vcn, &lcn, &len, &idx);

        while (expect_vcn <= evcn) {
                /* Entries must exist and follow each other without gaps. */
                if (!found || expect_vcn != vcn)
                        return -EINVAL;

                expect_vcn = vcn + len;

                if (lcn != SPARSE_LCN &&
                    !(sbi->flags & NTFS_FLAGS_NEED_REPLAY)) {
                        bool used;

                        down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
                        /* Check for free blocks. */
                        used = wnd_is_used(wnd, lcn, len);
                        up_read(&wnd->rw_lock);

                        if (!used) {
                                /* Looks like volume is corrupted. */
                                ntfs_set_state(sbi, NTFS_DIRTY_ERROR);

                                if (down_write_trylock(&wnd->rw_lock)) {
                                        /* Mark all zero bits as used in range [lcn, lcn+len). */
                                        size_t done;
                                        int err = wnd_set_used_safe(wnd, lcn,
                                                                    len, &done);

                                        up_write(&wnd->rw_lock);
                                        if (err)
                                                return err;
                                }
                        }
                }

                found = run_get_entry(run, ++idx, &vcn, &lcn, &len);
        }

        return unpacked;
}
#endif

/*
 * run_get_highest_vcn
 *
 * Walk a zero-terminated mapping pairs array starting at @vcn, adding up
 * the run lengths, and store the last covered vcn in @highest_vcn.
 * It is used while replaying the log file.
 *
 * Return: 0 on success, -EINVAL on a malformed array.
 */
int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn)
{
        u64 next_vcn = vcn;

        for (;;) {
                const u8 header = *run_buf;
                const u8 size_size = header & 0xF;
                const u8 offset_size = header >> 4;
                u64 len;

                /* Zero length-size marks the end of the array. */
                if (!size_size)
                        break;

                run_buf++;

                if (size_size > 8 || offset_size > 8)
                        return -EINVAL;

                len = run_unpack_s64(run_buf, size_size, 0);
                if (!len)
                        return -EINVAL;

                /* Step over the length and the (unused here) lcn delta. */
                run_buf += size_size + offset_size;
                next_vcn += len;

#ifndef CONFIG_NTFS3_64BIT_CLUSTER
                if (next_vcn > 0x100000000ull)
                        return -EINVAL;
#endif
        }

        *highest_vcn = next_vcn - 1;
        return 0;
}

/*
 * run_clone
 *
 * Copy all entries of @run into @new_run. The destination array is
 * replaced by a bigger one only when its current allocation is too small.
 *
 * Return: 0 on success, -ENOMEM if the new array cannot be allocated
 * (@new_run is left untouched in that case).
 */
int run_clone(const struct runs_tree *run, struct runs_tree *new_run)
{
        const size_t need = run->count * sizeof(struct ntfs_run);

        if (new_run->allocated < need) {
                struct ntfs_run *fresh = kvmalloc(need, GFP_KERNEL);

                if (!fresh)
                        return -ENOMEM;

                /* Drop the old array only once the new one is secured. */
                kvfree(new_run->runs);
                new_run->allocated = need;
                new_run->runs = fresh;
        }

        new_run->count = run->count;
        memcpy(new_run->runs, run->runs, need);

        return 0;
}




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _LINUX_FILEATTR_H
#define _LINUX_FILEATTR_H

/*
 * Flags shared between flags/xflags.
 *
 * FS_COMMON_FL is the FS_*_FL spelling of the shared set.
 */
#define FS_COMMON_FL \
        (FS_SYNC_FL | FS_IMMUTABLE_FL | FS_APPEND_FL | \
         FS_NODUMP_FL |        FS_NOATIME_FL | FS_DAX_FL | \
         FS_PROJINHERIT_FL)

/* FS_XFLAG_COMMON is the FS_XFLAG_* spelling of the same shared set. */
#define FS_XFLAG_COMMON \
        (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND | \
         FS_XFLAG_NODUMP | FS_XFLAG_NOATIME | FS_XFLAG_DAX | \
         FS_XFLAG_PROJINHERIT)

/*
 * Merged interface for miscellaneous file attributes.  'flags' originates from
 * ext* and 'fsx_flags' from xfs.  There's some overlap between the two, which
 * is handled by the VFS helpers, so filesystems are free to implement just one
 * or both of these sub-interfaces.
 *
 * The two single-bit selectors at the end tell which of the two
 * sub-interfaces the structure currently carries.
 */
struct fileattr {
        u32        flags;                /* flags value (FS_IOC_GETFLAGS/FS_IOC_SETFLAGS) */
        /* Fields mirroring struct fsxattr: */
        u32        fsx_xflags;        /* xflags field value (get/set) */
        u32        fsx_extsize;        /* extsize field value (get/set) */
        u32        fsx_nextents;        /* nextents field value (get only) */
        u32        fsx_projid;        /* project identifier (get/set) */
        u32        fsx_cowextsize;        /* CoW extsize field value (get/set) */
        /* Selectors: */
        bool        flags_valid:1;
        bool        fsx_valid:1;
};

int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa);

void fileattr_fill_xflags(struct fileattr *fa, u32 xflags);
void fileattr_fill_flags(struct fileattr *fa, u32 flags);

/**
 * fileattr_has_fsx - check for extended flags/attributes
 * @fa:                fileattr pointer
 *
 * Return: true if any attributes are present that are not represented in
 * ->flags.
 */
static inline bool fileattr_has_fsx(const struct fileattr *fa)
{
        return fa->fsx_valid &&
                ((fa->fsx_xflags & ~FS_XFLAG_COMMON) || fa->fsx_extsize != 0 ||
                 fa->fsx_projid != 0 ||        fa->fsx_cowextsize != 0);
}

int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
                     struct fileattr *fa);

#endif /* _LINUX_FILEATTR_H */












































































    1 



    1 


    1 




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 





    1 





    1 




































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 *  Regular file handling primitives for NTFS-based filesystems.
 *
 */

#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/compat.h>
#include <linux/falloc.h>
#include <linux/fiemap.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"

static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
{
        struct fstrim_range __user *user_range;
        struct fstrim_range range;
        struct block_device *dev;
        int err;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        dev = sbi->sb->s_bdev;
        if (!bdev_max_discard_sectors(dev))
                return -EOPNOTSUPP;

        user_range = (struct fstrim_range __user *)arg;
        if (copy_from_user(&range, user_range, sizeof(range)))
                return -EFAULT;

        range.minlen = max_t(u32, range.minlen, bdev_discard_granularity(dev));

        err = ntfs_trim_fs(sbi, &range);
        if (err < 0)
                return err;

        if (copy_to_user(user_range, &range, sizeof(range)))
                return -EFAULT;

        return 0;
}

/*
 * ntfs_ioctl - ioctl entry point.
 *
 * FITRIM is the only command handled; everything else yields -ENOTTY.
 */
long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
{
        struct ntfs_sb_info *sbi = file_inode(filp)->i_sb->s_fs_info;

        if (cmd == FITRIM)
                return ntfs_ioctl_fitrim(sbi, arg);

        /* Inappropriate ioctl for device. */
        return -ENOTTY;
}

#ifdef CONFIG_COMPAT
/* Compat ioctl entry: convert @arg with compat_ptr() and reuse ntfs_ioctl(). */
long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg)
{
        unsigned long native_arg = (unsigned long)compat_ptr(arg);

        return ntfs_ioctl(filp, cmd, native_arg);
}
#endif

/*
 * ntfs_getattr - inode_operations::getattr
 *
 * Fills @stat through generic_fillattr() and adds the NTFS specifics:
 * the compressed/encrypted attribute bits, the creation time as btime
 * and the cluster size as blksize. Always returns 0.
 */
int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path,
                 struct kstat *stat, u32 request_mask, u32 flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct ntfs_inode *ni = ntfs_i(inode);
        u64 ntfs_attrs = 0;

        if (is_compressed(ni))
                ntfs_attrs |= STATX_ATTR_COMPRESSED;
        if (is_encrypted(ni))
                ntfs_attrs |= STATX_ATTR_ENCRYPTED;

        stat->attributes |= ntfs_attrs;
        /* Both bits are always reported as supported. */
        stat->attributes_mask |= STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED;

        generic_fillattr(idmap, request_mask, inode, stat);

        /* Override/extend what the generic helper filled in. */
        stat->blksize = ni->mi.sbi->cluster_size; /* 512, 1K, ..., 2M */
        stat->btime = ni->i_crtime;
        stat->result_mask |= STATX_BTIME;

        return 0;
}

static int ntfs_extend_initialized_size(struct file *file,
                                        struct ntfs_inode *ni,
                                        const loff_t valid,
                                        const loff_t new_valid)
{
        struct inode *inode = &ni->vfs_inode;
        struct address_space *mapping = inode->i_mapping;
        struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
        loff_t pos = valid;
        int err;

        if (is_resident(ni)) {
                ni->i_valid = new_valid;
                return 0;
        }

        WARN_ON(is_compressed(ni));
        WARN_ON(valid >= new_valid);

        for (;;) {
                u32 zerofrom, len;
                struct page *page;
                u8 bits;
                CLST vcn, lcn, clen;

                if (is_sparsed(ni)) {
                        bits = sbi->cluster_bits;
                        vcn = pos >> bits;

                        err = attr_data_get_block(ni, vcn, 1, &lcn, &clen, NULL,
                                                  false);
                        if (err)
                                goto out;

                        if (lcn == SPARSE_LCN) {
                                pos = ((loff_t)clen + vcn) << bits;
                                ni->i_valid = pos;
                                goto next;
                        }
                }

                zerofrom = pos & (PAGE_SIZE - 1);
                len = PAGE_SIZE - zerofrom;

                if (pos + len > new_valid)
                        len = new_valid - pos;

                err = ntfs_write_begin(file, mapping, pos, len, &page, NULL);
                if (err)
                        goto out;

                zero_user_segment(page, zerofrom, PAGE_SIZE);

                /* This function in any case puts page. */
                err = ntfs_write_end(file, mapping, pos, len, len, page, NULL);
                if (err < 0)
                        goto out;
                pos += len;

next:
                if (pos >= new_valid)
                        break;

                balance_dirty_pages_ratelimited(mapping);
                cond_resched();
        }

        return 0;

out:
        ni->i_valid = valid;
        ntfs_inode_warn(inode, "failed to extend initialized size to %llx.",
                        new_valid);
        return err;
}

/*
 * ntfs_zero_range - Helper function for punch_hole.
 *
 * It zeroes a range [vbo, vbo_to) in the page cache of @inode.
 *
 * For every page touched by the range the folio is looked up (or created)
 * locked, its buffer heads are created if missing, and each buffer that
 * overlaps the range is mapped, brought up to date and marked dirty. The
 * overlapping part of the folio is then zeroed.
 *
 * Return: 0 on success, the error from the folio lookup, or -EIO if a
 * buffer could not be read.
 */
static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
{
        int err = 0;
        struct address_space *mapping = inode->i_mapping;
        u32 blocksize = i_blocksize(inode);
        pgoff_t idx = vbo >> PAGE_SHIFT;
        /* Start offset inside the first page; reset to 0 for the following pages. */
        u32 from = vbo & (PAGE_SIZE - 1);
        /* One past the last page index touched by the range. */
        pgoff_t idx_end = (vbo_to + PAGE_SIZE - 1) >> PAGE_SHIFT;
        loff_t page_off;
        struct buffer_head *head, *bh;
        u32 bh_next, bh_off, to;
        sector_t iblock;
        struct folio *folio;
        bool dirty = false;

        for (; idx < idx_end; idx += 1, from = 0) {
                page_off = (loff_t)idx << PAGE_SHIFT;
                /* End offset inside this page: clipped on the last page. */
                to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off) :
                                                       PAGE_SIZE;
                iblock = page_off >> inode->i_blkbits;

                folio = __filemap_get_folio(mapping, idx,
                                FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                mapping_gfp_constraint(mapping, ~__GFP_FS));
                /*
                 * NOTE(review): this early return skips the mark_inode_dirty()
                 * at 'out' even if earlier pages were already zeroed - confirm
                 * that this is intended.
                 */
                if (IS_ERR(folio))
                        return PTR_ERR(folio);

                head = folio_buffers(folio);
                if (!head)
                        head = create_empty_buffers(folio, blocksize, 0);

                bh = head;
                bh_off = 0;
                /*
                 * Walk the circular buffer list of the folio. 'continue'
                 * jumps to the loop condition, which is also what advances
                 * bh_off, iblock and bh.
                 */
                do {
                        bh_next = bh_off + blocksize;

                        /* Buffer lies completely outside [from, to). */
                        if (bh_next <= from || bh_off >= to)
                                continue;

                        if (!buffer_mapped(bh)) {
                                ntfs_get_block(inode, iblock, bh, 0);
                                /* Unmapped? It's a hole - nothing to do. */
                                if (!buffer_mapped(bh))
                                        continue;
                        }

                        /* Ok, it's mapped. Make sure it's up-to-date. */
                        if (folio_test_uptodate(folio))
                                set_buffer_uptodate(bh);
                        else if (bh_read(bh, 0) < 0) {
                                err = -EIO;
                                folio_unlock(folio);
                                folio_put(folio);
                                goto out;
                        }

                        mark_buffer_dirty(bh);
                } while (bh_off = bh_next, iblock += 1,
                         head != (bh = bh->b_this_page));

                folio_zero_segment(folio, from, to);
                dirty = true;

                folio_unlock(folio);
                folio_put(folio);
                cond_resched();
        }
out:
        /* At least one folio was zeroed. */
        if (dirty)
                mark_inode_dirty(inode);
        return err;
}

/*
 * ntfs_file_mmap - file_operations::mmap
 */
static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        struct ntfs_inode *ni = ntfs_i(inode);
        u64 from = ((u64)vma->vm_pgoff << PAGE_SHIFT);
        bool rw = vma->vm_flags & VM_WRITE;
        int err;

        if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
                return -EIO;

        if (is_encrypted(ni)) {
                ntfs_inode_warn(inode, "mmap encrypted not supported");
                return -EOPNOTSUPP;
        }

        if (is_dedup(ni)) {
                ntfs_inode_warn(inode, "mmap deduplicated not supported");
                return -EOPNOTSUPP;
        }

        if (is_compressed(ni) && rw) {
                ntfs_inode_warn(inode, "mmap(write) compressed not supported");
                return -EOPNOTSUPP;
        }

        if (rw) {
                u64 to = min_t(loff_t, i_size_read(inode),
                               from + vma->vm_end - vma->vm_start);

                if (is_sparsed(ni)) {
                        /* Allocate clusters for rw map. */
                        struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
                        CLST lcn, len;
                        CLST vcn = from >> sbi->cluster_bits;
                        CLST end = bytes_to_cluster(sbi, to);
                        bool new;

                        for (; vcn < end; vcn += len) {
                                err = attr_data_get_block(ni, vcn, 1, &lcn,
                                                          &len, &new, true);
                                if (err)
                                        goto out;
                        }
                }

                if (ni->i_valid < to) {
                        if (!inode_trylock(inode)) {
                                err = -EAGAIN;
                                goto out;
                        }
                        err = ntfs_extend_initialized_size(file, ni,
                                                           ni->i_valid, to);
                        inode_unlock(inode);
                        if (err)
                                goto out;
                }
        }

        err = generic_file_mmap(file, vma);
out:
        return err;
}

/*
 * ntfs_extend - Prepare the file for a write of @count bytes at @pos.
 *
 * Grows the file if the write ends beyond the current size and, when a
 * @file is supplied and @pos lies beyond the valid size of a
 * non-compressed file, zeroes the gap up to @pos. For IS_SYNC inodes the
 * affected range and the inode are written out before returning.
 *
 * Return: 0 if nothing had to be done or on success, otherwise the first
 * error encountered.
 */
static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
                       struct file *file)
{
        struct ntfs_inode *ni = ntfs_i(inode);
        struct address_space *mapping = inode->i_mapping;
        const loff_t end = pos + count;
        const bool extend_init = file && pos > ni->i_valid;
        int err, err2;

        if (!extend_init && end <= inode->i_size)
                return 0;

        /* Mark rw ntfs as dirty. It will be cleared at umount. */
        ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_DIRTY);

        if (end > inode->i_size) {
                err = ntfs_set_size(inode, end);
                if (err)
                        return err;
        }

        if (extend_init && !is_compressed(ni)) {
                err = ntfs_extend_initialized_size(file, ni, ni->i_valid, pos);
                if (err)
                        return err;
        }

        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        mark_inode_dirty(inode);

        if (!IS_SYNC(inode))
                return 0;

        /* Run every sync step, but report the first failure. */
        err = filemap_fdatawrite_range(mapping, pos, end - 1);

        err2 = sync_mapping_buffers(mapping);
        if (!err)
                err = err2;

        err2 = write_inode_now(inode, 1);
        if (!err)
                err = err2;

        if (!err)
                err = filemap_fdatawait_range(mapping, pos, end - 1);

        return err;
}

/*
 * ntfs_truncate - Cut the data of a regular file down to @new_size.
 *
 * @inode:    inode to truncate; anything but a regular file is a no-op.
 * @new_size: requested size in bytes.
 *
 * The page cache and i_size are updated first (truncate_setsize()), then
 * the data attribute is resized under ni_lock and the run lock. The valid
 * size is never left above the value produced by attr_set_size().
 *
 * Return: 0 on success, negative errno otherwise. A failure of
 * attr_set_size() is reported to the caller instead of being dropped.
 */
static int ntfs_truncate(struct inode *inode, loff_t new_size)
{
        struct super_block *sb = inode->i_sb;
        struct ntfs_inode *ni = ntfs_i(inode);
        u64 new_valid;
        int err;

        if (!S_ISREG(inode->i_mode))
                return 0;

        if (is_compressed(ni)) {
                if (ni->i_valid > new_size)
                        ni->i_valid = new_size;
        } else {
                err = block_truncate_page(inode->i_mapping, new_size,
                                          ntfs_get_block);
                if (err)
                        return err;
        }

        new_valid = ntfs_up_block(sb, min_t(u64, ni->i_valid, new_size));

        truncate_setsize(inode, new_size);

        ni_lock(ni);

        down_write(&ni->file.run_lock);
        err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size,
                            &new_valid, ni->mi.sbi->options->prealloc, NULL);
        up_write(&ni->file.run_lock);

        if (new_valid < ni->i_valid)
                ni->i_valid = new_valid;

        ni_unlock(ni);

        /* Do not pretend the truncate succeeded if the attribute resize failed. */
        if (err)
                return err;

        ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));

        /* DIRSYNC inodes are written out right away, others just dirtied. */
        if (IS_DIRSYNC(inode))
                return ntfs_sync_inode(inode);

        mark_inode_dirty(inode);
        return 0;
}

/*
 * ntfs_fallocate - file_operations::fallocate
 *
 * @file: file to operate on (regular files only).
 * @mode: FALLOC_FL_* flags; KEEP_SIZE, PUNCH_HOLE, COLLAPSE_RANGE and
 *        INSERT_RANGE are the only ones accepted.
 * @vbo:  byte offset of the range.
 * @len:  length of the range in bytes.
 *
 * Preallocate space for a file, or punch/collapse/insert a range.
 * Hole punching and range insertion need a sparse or compressed file.
 * In the plain preallocation case clusters are allocated up to
 * max(vbo + len, i_size); with FALLOC_FL_KEEP_SIZE the data attribute is
 * then set back to the old i_size while keeping the preallocation,
 * otherwise i_size is raised to the new size.
 *
 * Runs under inode_lock; the range-changing modes additionally wait for
 * direct I/O and hold the mapping invalidate lock.
 *
 * Return: 0 on success, negative errno otherwise.
 */
static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
{
        struct inode *inode = file->f_mapping->host;
        struct address_space *mapping = inode->i_mapping;
        struct super_block *sb = inode->i_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        struct ntfs_inode *ni = ntfs_i(inode);
        loff_t end = vbo + len;
        /* Start of the range rounded down to max(cluster, page) size. */
        loff_t vbo_down = round_down(vbo, max_t(unsigned long,
                                                sbi->cluster_size, PAGE_SIZE));
        bool is_supported_holes = is_sparsed(ni) || is_compressed(ni);
        loff_t i_size, new_size;
        bool map_locked;
        int err;

        /* No support for dir. */
        if (!S_ISREG(inode->i_mode))
                return -EOPNOTSUPP;

        /*
         * vfs_fallocate checks all possible combinations of mode.
         * Do additional checks here before ntfs_set_state(dirty).
         */
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                if (!is_supported_holes)
                        return -EOPNOTSUPP;
        } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
                /* Nothing extra to check for collapse. */
        } else if (mode & FALLOC_FL_INSERT_RANGE) {
                if (!is_supported_holes)
                        return -EOPNOTSUPP;
        } else if (mode &
                   ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
                     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)) {
                ntfs_inode_warn(inode, "fallocate(0x%x) is not supported",
                                mode);
                return -EOPNOTSUPP;
        }

        ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);

        inode_lock(inode);
        i_size = inode->i_size;
        new_size = max(end, i_size);
        map_locked = false;

        if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
                /* Should never be here, see ntfs_file_open. */
                err = -EOPNOTSUPP;
                goto out;
        }

        if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |
                    FALLOC_FL_INSERT_RANGE)) {
                inode_dio_wait(inode);
                filemap_invalidate_lock(mapping);
                map_locked = true;
        }

        if (mode & FALLOC_FL_PUNCH_HOLE) {
                u32 frame_size;
                loff_t mask, vbo_a, end_a, tmp;

                err = filemap_write_and_wait_range(mapping, vbo_down,
                                                   LLONG_MAX);
                if (err)
                        goto out;

                truncate_pagecache(inode, vbo_down);

                ni_lock(ni);
                err = attr_punch_hole(ni, vbo, len, &frame_size);
                ni_unlock(ni);
                if (!err)
                        goto ok;

                if (err != E_NTFS_NOTALIGNED)
                        goto out;

                /*
                 * Process not aligned punch: zero the unaligned head and
                 * tail, then punch the frame-aligned middle part.
                 */
                err = 0;
                mask = frame_size - 1;
                vbo_a = (vbo + mask) & ~mask;
                end_a = end & ~mask;

                tmp = min(vbo_a, end);
                if (tmp > vbo) {
                        err = ntfs_zero_range(inode, vbo, tmp);
                        if (err)
                                goto out;
                }

                if (vbo < end_a && end_a < end) {
                        err = ntfs_zero_range(inode, end_a, end);
                        if (err)
                                goto out;
                }

                /* Aligned punch_hole */
                if (end_a > vbo_a) {
                        ni_lock(ni);
                        err = attr_punch_hole(ni, vbo_a, end_a - vbo_a, NULL);
                        ni_unlock(ni);
                        if (err)
                                goto out;
                }
        } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
                /*
                 * Write tail of the last page before removed range since
                 * it will get removed from the page cache below.
                 */
                err = filemap_write_and_wait_range(mapping, vbo_down, vbo);
                if (err)
                        goto out;

                /*
                 * Write data that will be shifted to preserve them
                 * when discarding page cache below.
                 */
                err = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
                if (err)
                        goto out;

                truncate_pagecache(inode, vbo_down);

                ni_lock(ni);
                err = attr_collapse_range(ni, vbo, len);
                ni_unlock(ni);
                /*
                 * Without this check the failure would be overwritten by
                 * file_modified() below and reported as success.
                 */
                if (err)
                        goto out;
        } else if (mode & FALLOC_FL_INSERT_RANGE) {
                /* Check new size. */
                err = inode_newsize_ok(inode, new_size);
                if (err)
                        goto out;

                /* Write out all dirty pages. */
                err = filemap_write_and_wait_range(mapping, vbo_down,
                                                   LLONG_MAX);
                if (err)
                        goto out;
                truncate_pagecache(inode, vbo_down);

                ni_lock(ni);
                err = attr_insert_range(ni, vbo, len);
                ni_unlock(ni);
                if (err)
                        goto out;
        } else {
                /* Plain preallocation (optionally with KEEP_SIZE). */
                u8 cluster_bits = sbi->cluster_bits;

                /* Be sure file is non resident. */
                if (is_resident(ni)) {
                        ni_lock(ni);
                        err = attr_force_nonresident(ni);
                        ni_unlock(ni);
                        if (err)
                                goto out;
                }

                /* generic/213: expected -ENOSPC instead of -EFBIG. */
                if (!is_supported_holes) {
                        loff_t to_alloc = new_size - inode_get_bytes(inode);

                        if (to_alloc > 0 &&
                            (to_alloc >> cluster_bits) >
                                    wnd_zeroes(&sbi->used.bitmap)) {
                                err = -ENOSPC;
                                goto out;
                        }
                }

                /* Check new size. */
                err = inode_newsize_ok(inode, new_size);
                if (err)
                        goto out;

                if (new_size > i_size) {
                        /*
                         * Allocate clusters, do not change 'valid' size.
                         */
                        err = ntfs_set_size(inode, new_size);
                        if (err)
                                goto out;
                }

                if (is_supported_holes) {
                        CLST vcn = vbo >> cluster_bits;
                        CLST cend = bytes_to_cluster(sbi, end);
                        CLST cend_v = bytes_to_cluster(sbi, ni->i_valid);
                        CLST lcn, clen;
                        bool new;

                        if (cend_v > cend)
                                cend_v = cend;

                        /*
                         * Allocate and zero new clusters.
                         * Zeroing these clusters may be too long.
                         */
                        for (; vcn < cend_v; vcn += clen) {
                                err = attr_data_get_block(ni, vcn, cend_v - vcn,
                                                          &lcn, &clen, &new,
                                                          true);
                                if (err)
                                        goto out;
                        }
                        /*
                         * Allocate but not zero new clusters.
                         */
                        for (; vcn < cend; vcn += clen) {
                                err = attr_data_get_block(ni, vcn, cend - vcn,
                                                          &lcn, &clen, &new,
                                                          false);
                                if (err)
                                        goto out;
                        }
                }

                if (mode & FALLOC_FL_KEEP_SIZE) {
                        ni_lock(ni);
                        /* True - Keep preallocated. */
                        err = attr_set_size(ni, ATTR_DATA, NULL, 0,
                                            &ni->file.run, i_size, &ni->i_valid,
                                            true, NULL);
                        ni_unlock(ni);
                        if (err)
                                goto out;
                } else if (new_size > i_size) {
                        i_size_write(inode, new_size);
                }
        }

ok:
        err = file_modified(file);

out:
        if (map_locked)
                filemap_invalidate_unlock(mapping);

        /* Timestamps are only touched when the whole operation succeeded. */
        if (!err) {
                inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
                mark_inode_dirty(inode);
        }

        inode_unlock(inode);
        return err;
}

/*
 * ntfs3_setattr - inode_operations::setattr
 *
 * Validates the request with setattr_prepare(), applies a size change
 * (truncate or extend) when ATTR_SIZE is set, copies the remaining
 * attributes into the inode and mirrors a mode change into the NTFS
 * read-only attribute.
 *
 * Return: 0 on success, negative errno otherwise.
 */
int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                  struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        struct ntfs_inode *ni = ntfs_i(inode);
        u32 requested = attr->ia_valid;
        umode_t old_mode = inode->i_mode;
        int err;

        if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
                return -EIO;

        err = setattr_prepare(idmap, dentry, attr);
        if (err)
                return err;

        if (requested & ATTR_SIZE) {
                loff_t cur_size, want_size;

                /* Should never be here, see ntfs_file_open(). */
                if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK))
                        return -EOPNOTSUPP;

                inode_dio_wait(inode);
                cur_size = i_size_read(inode);
                want_size = attr->ia_size;

                /* NULL file: grow the size only, leave the valid size alone. */
                if (want_size > cur_size)
                        err = ntfs_extend(inode, want_size, 0, NULL);
                else
                        err = ntfs_truncate(inode, want_size);
                if (err)
                        return err;

                ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
                i_size_write(inode, want_size);
        }

        setattr_copy(idmap, inode, attr);

        if (inode->i_mode != old_mode) {
                err = ntfs_acl_chmod(idmap, dentry);
                if (err)
                        return err;

                /* Linux 'w' -> Windows 'ro'. */
                if (inode->i_mode & 0222)
                        ni->std_fa &= ~FILE_ATTRIBUTE_READONLY;
                else
                        ni->std_fa |= FILE_ATTRIBUTE_READONLY;
        }

        if (requested & (ATTR_UID | ATTR_GID | ATTR_MODE))
                ntfs_save_wsl_perm(inode, NULL);

        mark_inode_dirty(inode);
        return 0;
}

/*
 * ntfs_file_read_iter - file_operations::read_iter
 *
 * Rejects reads the driver cannot serve (shut down filesystem, encrypted
 * or deduplicated files, direct I/O on compressed files, externally
 * compressed files without CONFIG_NTFS3_LZX_XPRESS) and hands everything
 * else to generic_file_read_iter().
 */
static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        struct ntfs_inode *ni = ntfs_i(inode);

        if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
                return -EIO;

        if (is_encrypted(ni)) {
                ntfs_inode_warn(inode, "encrypted i/o not supported");
                return -EOPNOTSUPP;
        }

        if ((iocb->ki_flags & IOCB_DIRECT) && is_compressed(ni)) {
                ntfs_inode_warn(inode, "direct i/o + compressed not supported");
                return -EOPNOTSUPP;
        }

#ifndef CONFIG_NTFS3_LZX_XPRESS
        if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) {
                ntfs_inode_warn(
                        inode,
                        "activate CONFIG_NTFS3_LZX_XPRESS to read external compressed files");
                return -EOPNOTSUPP;
        }
#endif

        if (is_dedup(ni)) {
                ntfs_inode_warn(inode, "read deduplicated not supported");
                return -EOPNOTSUPP;
        }

        return generic_file_read_iter(iocb, iter);
}

/*
 * ntfs_file_splice_read - file_operations::splice_read
 *
 * Applies the same refusals as the read path (forced shutdown, encrypted,
 * externally compressed without CONFIG_NTFS3_LZX_XPRESS, deduplicated)
 * before delegating to filemap_splice_read().
 */
static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos,
                                     struct pipe_inode_info *pipe, size_t len,
                                     unsigned int flags)
{
        struct inode *inode = in->f_mapping->host;
        struct ntfs_inode *ni;

        if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
                return -EIO;

        ni = ntfs_i(inode);

        if (is_encrypted(ni)) {
                ntfs_inode_warn(inode, "encrypted i/o not supported");
                return -EOPNOTSUPP;
        }

#ifndef CONFIG_NTFS3_LZX_XPRESS
        if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) {
                ntfs_inode_warn(
                        inode,
                        "activate CONFIG_NTFS3_LZX_XPRESS to read external compressed files");
                return -EOPNOTSUPP;
        }
#endif

        if (is_dedup(ni)) {
                ntfs_inode_warn(inode, "read deduplicated not supported");
                return -EOPNOTSUPP;
        }

        return filemap_splice_read(in, ppos, pipe, len, flags);
}

/*
 * ntfs_get_frame_pages - Grab the page cache pages that back one frame.
 *
 * @mapping:         address space to take the pages from.
 * @index:           page index of the first page of the frame.
 * @pages:           output array with room for @pages_per_frame entries.
 * @pages_per_frame: number of pages to find or create.
 * @frame_uptodate:  set to false if at least one page is not uptodate.
 *
 * Pages come from find_or_create_page() and are handed back to the caller
 * in @pages; the caller is expected to unlock and put each of them.
 *
 * Return: 0 on success. -ENOMEM if a page could not be obtained, in which
 * case every page collected so far has been unlocked and released.
 */
static int ntfs_get_frame_pages(struct address_space *mapping, pgoff_t index,
                                struct page **pages, u32 pages_per_frame,
                                bool *frame_uptodate)
{
        gfp_t gfp = mapping_gfp_mask(mapping);
        struct page *page;
        u32 got = 0;

        *frame_uptodate = true;

        while (got < pages_per_frame) {
                page = find_or_create_page(mapping, index + got, gfp);
                if (!page)
                        goto undo;

                if (!PageUptodate(page))
                        *frame_uptodate = false;

                pages[got++] = page;
        }

        return 0;

undo:
        /* Drop what was collected, last page first. */
        while (got) {
                page = pages[--got];
                unlock_page(page);
                put_page(page);
        }

        return -ENOMEM;
}

/*
 * ntfs_compress_write - Helper for ntfs_file_write_iter() (compressed files).
 *
 * Works on whole compression frames (frame_size bytes, pages_per_frame
 * pages each). First the gap between the current valid size and the write
 * position is zeroed frame by frame, then user data is copied into the
 * frame pages and every touched frame is written with ni_write_frame().
 *
 * On success iocb->ki_pos is advanced and ni->i_valid / i_size are raised
 * to the new position if it lies beyond them.
 *
 * Return: number of bytes written, or a negative errno. If an error occurs
 * after some frames were already written, the error is returned and the
 * partial byte count is not reported.
 */
static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
{
        int err;
        struct file *file = iocb->ki_filp;
        size_t count = iov_iter_count(from);
        loff_t pos = iocb->ki_pos;
        struct inode *inode = file_inode(file);
        loff_t i_size = i_size_read(inode);
        struct address_space *mapping = inode->i_mapping;
        struct ntfs_inode *ni = ntfs_i(inode);
        u64 valid = ni->i_valid;
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        struct page *page, **pages = NULL;
        size_t written = 0;
        /* A frame spans 2^NTFS_LZNT_CUNIT clusters. */
        u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits;
        u32 frame_size = 1u << frame_bits;
        u32 pages_per_frame = frame_size >> PAGE_SHIFT;
        u32 ip, off;
        CLST frame;
        u64 frame_vbo;
        pgoff_t index;
        bool frame_uptodate;

        if (frame_size < PAGE_SIZE) {
                /*
                 * frame_size == 8K if cluster 512
                 * frame_size == 64K if cluster 4096
                 */
                ntfs_inode_warn(inode, "page size is bigger than frame size");
                return -EOPNOTSUPP;
        }

        /* Scratch array holding the pages of the frame being processed. */
        pages = kmalloc_array(pages_per_frame, sizeof(struct page *), GFP_NOFS);
        if (!pages)
                return -ENOMEM;

        err = file_remove_privs(file);
        if (err)
                goto out;

        err = file_update_time(file);
        if (err)
                goto out;

        /* Zero range [valid : pos). */
        while (valid < pos) {
                CLST lcn, clen;

                frame = valid >> frame_bits;
                frame_vbo = valid & ~(frame_size - 1);
                off = valid & (frame_size - 1);

                /* Look up the first cluster of this frame. */
                err = attr_data_get_block(ni, frame << NTFS_LZNT_CUNIT, 1, &lcn,
                                          &clen, NULL, false);
                if (err)
                        goto out;

                /* Sparse run: nothing to zero, skip the whole run. */
                if (lcn == SPARSE_LCN) {
                        ni->i_valid = valid =
                                frame_vbo + ((u64)clen << sbi->cluster_bits);
                        continue;
                }

                /* Load full frame. */
                err = ntfs_get_frame_pages(mapping, frame_vbo >> PAGE_SHIFT,
                                           pages, pages_per_frame,
                                           &frame_uptodate);
                if (err)
                        goto out;

                /*
                 * Data before 'off' must be kept, so read the frame in
                 * unless the cached pages are already uptodate.
                 */
                if (!frame_uptodate && off) {
                        err = ni_read_frame(ni, frame_vbo, pages,
                                            pages_per_frame);
                        if (err) {
                                /* Release the frame pages before bailing out. */
                                for (ip = 0; ip < pages_per_frame; ip++) {
                                        page = pages[ip];
                                        unlock_page(page);
                                        put_page(page);
                                }
                                goto out;
                        }
                }

                /* Zero from 'valid' up to the end of the frame. */
                ip = off >> PAGE_SHIFT;
                off = offset_in_page(valid);
                for (; ip < pages_per_frame; ip++, off = 0) {
                        page = pages[ip];
                        zero_user_segment(page, off, PAGE_SIZE);
                        flush_dcache_page(page);
                        SetPageUptodate(page);
                }

                ni_lock(ni);
                err = ni_write_frame(ni, pages, pages_per_frame);
                ni_unlock(ni);

                /* Pages are released whether or not the write succeeded. */
                for (ip = 0; ip < pages_per_frame; ip++) {
                        page = pages[ip];
                        SetPageUptodate(page);
                        unlock_page(page);
                        put_page(page);
                }

                if (err)
                        goto out;

                ni->i_valid = valid = frame_vbo + frame_size;
        }

        /* Copy user data [pos : pos + count). */
        while (count) {
                size_t copied, bytes;

                /* Clamp this round to the end of the current frame. */
                off = pos & (frame_size - 1);
                bytes = frame_size - off;
                if (bytes > count)
                        bytes = count;

                frame_vbo = pos & ~(frame_size - 1);
                index = frame_vbo >> PAGE_SHIFT;

                /* Pre-fault the user buffer before the frame pages are taken. */
                if (unlikely(fault_in_iov_iter_readable(from, bytes))) {
                        err = -EFAULT;
                        goto out;
                }

                /* Load full frame. */
                err = ntfs_get_frame_pages(mapping, index, pages,
                                           pages_per_frame, &frame_uptodate);
                if (err)
                        goto out;

                if (!frame_uptodate) {
                        loff_t to = pos + bytes;

                        /*
                         * Partial frame update: existing data before 'off'
                         * or after 'to' (within i_size) has to be read first.
                         */
                        if (off || (to < i_size && (to & (frame_size - 1)))) {
                                err = ni_read_frame(ni, frame_vbo, pages,
                                                    pages_per_frame);
                                if (err) {
                                        /* Release the frame pages before bailing out. */
                                        for (ip = 0; ip < pages_per_frame;
                                             ip++) {
                                                page = pages[ip];
                                                unlock_page(page);
                                                put_page(page);
                                        }
                                        goto out;
                                }
                        }
                }

                WARN_ON(!bytes);
                copied = 0;
                ip = off >> PAGE_SHIFT;
                off = offset_in_page(pos);

                /* Copy user data to pages. */
                for (;;) {
                        size_t cp, tail = PAGE_SIZE - off;

                        page = pages[ip];
                        cp = copy_page_from_iter_atomic(page, off,
                                                        min(tail, bytes), from);
                        flush_dcache_page(page);

                        copied += cp;
                        bytes -= cp;
                        /* Done with this frame, or the copy made no progress. */
                        if (!bytes || !cp)
                                break;

                        if (cp < tail) {
                                off += cp;
                        } else {
                                ip++;
                                off = 0;
                        }
                }

                ni_lock(ni);
                err = ni_write_frame(ni, pages, pages_per_frame);
                ni_unlock(ni);

                /* Pages are released whether or not the write succeeded. */
                for (ip = 0; ip < pages_per_frame; ip++) {
                        page = pages[ip];
                        ClearPageDirty(page);
                        SetPageUptodate(page);
                        unlock_page(page);
                        put_page(page);
                }

                if (err)
                        goto out;

                /*
                 * We can loop for a long time in here. Be nice and allow
                 * us to schedule out to avoid softlocking if preempt
                 * is disabled.
                 */
                cond_resched();

                pos += copied;
                written += copied;

                count = iov_iter_count(from);
        }

out:
        kfree(pages);

        if (err < 0)
                return err;

        /* Publish the new position, valid size and file size. */
        iocb->ki_pos += written;
        if (iocb->ki_pos > ni->i_valid)
                ni->i_valid = iocb->ki_pos;
        if (iocb->ki_pos > i_size)
                i_size_write(inode, iocb->ki_pos);

        return written;
}

/*
 * ntfs_file_write_iter - file_operations::write_iter
 *
 * Refuses writes the driver cannot handle, takes the inode lock (failing
 * with -EAGAIN instead of sleeping for IOCB_NOWAIT), extends the file for
 * the incoming range and then writes through either the compressed-file
 * helper or __generic_file_write_iter(). A positive result is passed
 * through generic_write_sync() after the lock is dropped.
 */
static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        struct ntfs_inode *ni = ntfs_i(inode);
        ssize_t ret, nbytes;

        if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
                return -EIO;

        if (is_encrypted(ni)) {
                ntfs_inode_warn(inode, "encrypted i/o not supported");
                return -EOPNOTSUPP;
        }

        if ((iocb->ki_flags & IOCB_DIRECT) && is_compressed(ni)) {
                ntfs_inode_warn(inode, "direct i/o + compressed not supported");
                return -EOPNOTSUPP;
        }

        if (is_dedup(ni)) {
                ntfs_inode_warn(inode, "write into deduplicated not supported");
                return -EOPNOTSUPP;
        }

        /* Only block on the inode lock when the caller allows waiting. */
        if (!inode_trylock(inode)) {
                if (iocb->ki_flags & IOCB_NOWAIT)
                        return -EAGAIN;
                inode_lock(inode);
        }

        ret = generic_write_checks(iocb, from);
        if (ret <= 0)
                goto unlock;
        nbytes = ret;

        ret = file_modified(file);
        if (ret)
                goto unlock;

        if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
                /* Should never be here, see ntfs_file_open(). */
                ret = -EOPNOTSUPP;
                goto unlock;
        }

        ret = ntfs_extend(inode, iocb->ki_pos, nbytes, file);
        if (ret)
                goto unlock;

        if (is_compressed(ni))
                ret = ntfs_compress_write(iocb, from);
        else
                ret = __generic_file_write_iter(iocb, from);

unlock:
        inode_unlock(inode);

        if (ret > 0)
                ret = generic_write_sync(iocb, ret);

        return ret;
}

/*
 * ntfs_file_open - file_operations::open
 */
int ntfs_file_open(struct inode *inode, struct file *file)
{
        struct ntfs_inode *ni = ntfs_i(inode);

        if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
                return -EIO;

        if (unlikely((is_compressed(ni) || is_encrypted(ni)) &&
                     (file->f_flags & O_DIRECT))) {
                return -EOPNOTSUPP;
        }

        /* Decompress "external compressed" file if opened for rw. */
        if ((ni->ni_flags & NI_FLAG_COMPRESSED_MASK) &&
            (file->f_flags & (O_WRONLY | O_RDWR | O_TRUNC))) {
#ifdef CONFIG_NTFS3_LZX_XPRESS
                int err = ni_decompress_file(ni);

                if (err)
                        return err;
#else
                ntfs_inode_warn(
                        inode,
                        "activate CONFIG_NTFS3_LZX_XPRESS to write external compressed files");
                return -EOPNOTSUPP;
#endif
        }

        return generic_file_open(inode, file);
}

/*
 * ntfs_file_release - file_operations::release
 *
 * When the 'prealloc' option is on and this file was open for writing
 * with an i_writecount of exactly 1, the data attribute is resized to
 * i_size without keeping the preallocation.
 *
 * Return: 0 if nothing had to be done, otherwise the result of
 * attr_set_size().
 */
static int ntfs_file_release(struct inode *inode, struct file *file)
{
        struct ntfs_inode *ni = ntfs_i(inode);
        struct ntfs_sb_info *sbi = ni->mi.sbi;
        int err;

        if (!sbi->options->prealloc)
                return 0;

        if (!(file->f_mode & FMODE_WRITE))
                return 0;

        /* If we are last writer on the inode, drop the block reservation. */
        if (atomic_read(&inode->i_writecount) != 1)
                return 0;

        ni_lock(ni);
        down_write(&ni->file.run_lock);

        err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run,
                            i_size_read(inode), &ni->i_valid, false, NULL);

        up_write(&ni->file.run_lock);
        ni_unlock(ni);

        return err;
}

/*
 * ntfs_fiemap - inode_operations::fiemap
 *
 * Validates the request with fiemap_prep() and reports the extents via
 * ni_fiemap() while holding ni_lock.
 *
 * Return: 0 on success, negative errno otherwise.
 */
int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len)
{
        struct ntfs_inode *ni = ntfs_i(inode);
        int ret = fiemap_prep(inode, fieinfo, start, &len, ~FIEMAP_FLAG_XATTR);

        if (ret)
                return ret;

        ni_lock(ni);
        ret = ni_fiemap(ni, fieinfo, start, len);
        ni_unlock(ni);

        return ret;
}

// clang-format off
/* Inode operations installed on ntfs3 regular files. */
const struct inode_operations ntfs_file_inode_operations = {
        /* Attributes. */
        .getattr        = ntfs_getattr,
        .setattr        = ntfs3_setattr,

        /* Extent map. */
        .fiemap         = ntfs_fiemap,

        /* Extended attributes and ACL hooks. */
        .listxattr      = ntfs_listxattr,
        .get_acl        = ntfs_get_acl,
        .set_acl        = ntfs_set_acl,
};

/* File operations for ntfs3 regular files (read and write). */
const struct file_operations ntfs_file_operations = {
        /* Lifetime. */
        .open           = ntfs_file_open,
        .release        = ntfs_file_release,

        /* Data path. */
        .llseek         = generic_file_llseek,
        .read_iter      = ntfs_file_read_iter,
        .write_iter     = ntfs_file_write_iter,
        .splice_read    = ntfs_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .mmap           = ntfs_file_mmap,
        .fsync          = generic_file_fsync,
        .fallocate      = ntfs_fallocate,

        /* Control. */
        .unlocked_ioctl = ntfs_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = ntfs_compat_ioctl,
#endif
};

/* Reduced file operations: no write, mmap, fsync, fallocate or ioctl hooks. */
const struct file_operations ntfs_legacy_file_operations = {
        .open           = ntfs_file_open,
        .release        = ntfs_file_release,
        .llseek         = generic_file_llseek,
        .read_iter      = ntfs_file_read_iter,
        .splice_read    = ntfs_file_splice_read,
};
// clang-format on













































    3 




















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

/*
 * Basic idea behind the notification queue: An fsnotify group (like inotify)
 * sends the userspace notification about events asynchronously some time after
 * the event happened.  When inotify gets an event it will need to add that
 * event to the group notify queue.  Since a single event might need to be on
 * multiple groups' notification queues we can't add the event directly to each
 * queue and instead add a small "event_holder" to each queue.  This event_holder
 * has a pointer back to the original event.  Since the majority of events are
 * going to end up on one, and only one, notification queue we embed one
 * event_holder into each event.  This means we have a single allocation instead
 * of always needing two.  If the embedded event_holder is already in use by
 * another group a new event_holder (from fsnotify_event_holder_cachep) will be
 * allocated and used.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/path.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

#include <linux/atomic.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);

/**
 * fsnotify_get_cookie - hand out the next cookie used to synchronize events.
 *
 * Atomically increments the file-wide cookie counter and returns the new
 * value.  Called from fsnotify_move, which is inlined into filesystem
 * modules, hence the export.
 */
u32 fsnotify_get_cookie(void)
{
        u32 cookie = atomic_inc_return(&fsnotify_sync_cookie);

        return cookie;
}
EXPORT_SYMBOL_GPL(fsnotify_get_cookie);

/*
 * Release @event through the group's ->free_event() operation.
 *
 * A NULL @event and the group's own overflow event are silently ignored.
 * Warns if the event still sits on a list when it is about to be freed.
 */
void fsnotify_destroy_event(struct fsnotify_group *group,
                            struct fsnotify_event *event)
{
        if (!event)
                return;
        /* The overflow event belongs to the group and is never freed here */
        if (event == group->overflow_event)
                return;

        /*
         * An event that is still queued at this point indicates a problem.
         * The first test is lockless and therefore unreliable, but it keeps
         * the common case free of locking.  Re-checking under the lock may be
         * needed for permission events that were taken off the list by a CPU
         * other than the one freeing the event.
         */
        if (!list_empty(&event->list)) {
                spin_lock(&group->notification_lock);
                WARN_ON(!list_empty(&event->list));
                spin_unlock(&group->notification_lock);
        }

        group->ops->free_event(group, event);
}

/*
 * Queue @event on @group's notification list, or fold it into an event that
 * is already queued.  The group can later pull the event off the queue.
 *
 * @merge, when supplied and the list is not empty, gets a chance to merge
 * @event with a queued event; a non-zero result from it is returned as-is
 * and @event is not queued.
 * @insert, when supplied, is called for the event that has just been queued
 * (the group can use it to insert the event into a hash table).
 *
 * When the queue is full, or @event is the group's overflow event, the
 * overflow event is queued instead unless it is already on the list.
 *
 * The function returns:
 * 0 if the event was added to the queue
 * 1 if the event was merged with some other queued event
 * 2 if the event was not queued - either the queue of events has overflowed
 *   or the group is shutting down.
 */
int fsnotify_insert_event(struct fsnotify_group *group,
                          struct fsnotify_event *event,
                          int (*merge)(struct fsnotify_group *,
                                       struct fsnotify_event *),
                          void (*insert)(struct fsnotify_group *,
                                         struct fsnotify_event *))
{
        struct list_head *queue = &group->notification_list;
        int ret = 0;

        pr_debug("%s: group=%p event=%p\n", __func__, group, event);

        spin_lock(&group->notification_lock);

        if (group->shutdown) {
                ret = 2;
                goto out_unlock;
        }

        if (event == group->overflow_event ||
            group->q_len >= group->max_events) {
                ret = 2;
                /* The overflow event is queued at most once */
                if (!list_empty(&group->overflow_event->list))
                        goto out_unlock;
                event = group->overflow_event;
        } else if (merge && !list_empty(queue)) {
                ret = merge(group, event);
                if (ret)
                        goto out_unlock;
        }

        group->q_len++;
        list_add_tail(&event->list, queue);
        if (insert)
                insert(group, event);
        spin_unlock(&group->notification_lock);

        /* Notify readers only after the lock has been dropped */
        wake_up(&group->notification_waitq);
        kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);
        return ret;

out_unlock:
        spin_unlock(&group->notification_lock);
        return ret;
}

/*
 * Unlink @event from @group's notification list and drop it from q_len.
 * The caller must hold group->notification_lock.
 */
void fsnotify_remove_queued_event(struct fsnotify_group *group,
                                  struct fsnotify_event *event)
{
        assert_spin_locked(&group->notification_lock);

        group->q_len--;
        /*
         * Re-initialise the list head instead of merely unlinking it: the
         * overflow event is reused, and the insert path relies on
         * list_empty() of its list head to tell whether it is queued.
         */
        list_del_init(&event->list);
}

/*
 * Look at the event at the head of the notification list, leaving it queued.
 * Yields NULL when nothing is queued.  The caller must hold
 * group->notification_lock.
 */
struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
{
        struct fsnotify_event *first = NULL;

        assert_spin_locked(&group->notification_lock);

        if (!fsnotify_notify_queue_is_empty(group))
                first = list_first_entry(&group->notification_list,
                                         struct fsnotify_event, list);

        return first;
}

/*
 * Dequeue the event at the head of the notification list and hand it to the
 * caller, who becomes responsible for destroying it.  Yields NULL when the
 * list is empty.
 */
struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
{
        struct fsnotify_event *first = fsnotify_peek_first_event(group);

        if (first) {
                pr_debug("%s: group=%p event=%p\n", __func__, group, first);
                fsnotify_remove_queued_event(group, first);
        }

        return first;
}

/*
 * Drain and destroy every event still queued on @group.  Used while a group
 * is being torn down.
 */
void fsnotify_flush_notify(struct fsnotify_group *group)
{
        struct fsnotify_event *victim;

        spin_lock(&group->notification_lock);
        for (;;) {
                if (fsnotify_notify_queue_is_empty(group))
                        break;

                victim = fsnotify_remove_first_event(group);
                /*
                 * fsnotify_destroy_event() may take notification_lock
                 * itself, so release it around the call.
                 */
                spin_unlock(&group->notification_lock);
                fsnotify_destroy_event(group, victim);
                spin_lock(&group->notification_lock);
        }
        spin_unlock(&group->notification_lock);
}
















































































































































































































    4 
    1 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H

#include <trace/events/btrfs.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/kobject.h>
#include <linux/lockdep.h>
#include <linux/wait.h>
#include <linux/rwsem.h>
#include "volumes.h"

struct btrfs_fs_info;
struct btrfs_block_group;

/*
 * Flush levels used when doing space reservations.
 *
 * The higher the level, the more methods we try in order to reclaim space.
 */
enum btrfs_reserve_flush_enum {
        /* If we are in the transaction, we can't flush anything. */
        BTRFS_RESERVE_NO_FLUSH,

        /*
         * Flush space by:
         * - Running delayed inode items
         * - Allocating a new chunk
         */
        BTRFS_RESERVE_FLUSH_LIMIT,

        /*
         * Flush space by:
         * - Running delayed inode items
         * - Running delayed refs
         * - Running delalloc and waiting for ordered extents
         * - Allocating a new chunk
         * - Committing transaction
         */
        BTRFS_RESERVE_FLUSH_EVICT,

        /*
         * Flush space by above mentioned methods and by:
         * - Running delayed iputs
         * - Committing transaction
         *
         * Can be interrupted by a fatal signal.
         */
        BTRFS_RESERVE_FLUSH_DATA,
        BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
        BTRFS_RESERVE_FLUSH_ALL,

        /*
         * Pretty much the same as FLUSH_ALL, but can also steal space from
         * global rsv.
         *
         * Can be interrupted by a fatal signal.
         */
        BTRFS_RESERVE_FLUSH_ALL_STEAL,

        /*
         * This is for btrfs_use_block_rsv only.  We have exhausted our block
         * rsv and our global block rsv.  This can happen for things like
         * delalloc where we are overwriting a lot of extents with a single
         * extent and didn't reserve enough space.  Alternatively it can happen
         * with delalloc where we reserve one extent's worth for a large extent
         * but fragmentation leads to multiple extents being created.  This
         * will give us the reservation in the case of
         *
         * if (num_bytes < (space_info->total_bytes -
         *                    btrfs_space_info_used(space_info, false)))
         *
         * Which ignores bytes_may_use.  This is potentially dangerous, but our
         * reservation system is generally pessimistic so is able to absorb this
         * style of mistake.
         */
        BTRFS_RESERVE_FLUSH_EMERGENCY,
};

/*
 * Individual flushing actions, numbered consecutively starting at 1.
 *
 * NOTE(review): presumably these are the states the space reclaim code steps
 * through, in ascending order - confirm against the users in space-info.c.
 */
enum btrfs_flush_state {
        FLUSH_DELAYED_ITEMS_NR        = 1,
        FLUSH_DELAYED_ITEMS        = 2,
        FLUSH_DELAYED_REFS_NR        = 3,
        FLUSH_DELAYED_REFS        = 4,
        FLUSH_DELALLOC                = 5,
        FLUSH_DELALLOC_WAIT        = 6,
        FLUSH_DELALLOC_FULL        = 7,
        ALLOC_CHUNK                = 8,
        ALLOC_CHUNK_FORCE        = 9,
        RUN_DELAYED_IPUTS        = 10,
        COMMIT_TRANS                = 11,
};

/*
 * Space accounting and reservation state for one kind of block group.
 *
 * @flags holds BTRFS_BLOCK_GROUP_* bits; btrfs_mixed_space_info() tests it
 * for a combined data + metadata space.
 */
struct btrfs_space_info {
        /*
         * Must be held when updating bytes_may_use and bytes_pinned through
         * the btrfs_space_info_update_*() helpers (they assert it via
         * lockdep), and protects the lists marked below.
         */
        spinlock_t lock;

        u64 total_bytes;        /* total bytes in the space,
                                   this doesn't take mirrors into account */
        u64 bytes_used;                /* total bytes used,
                                   this doesn't take mirrors into account */
        u64 bytes_pinned;        /* total bytes pinned, will be freed when the
                                   transaction finishes */
        u64 bytes_reserved;        /* total bytes the allocator has reserved for
                                   current allocations */
        u64 bytes_may_use;        /* number of bytes that may be used for
                                   delalloc/allocations */
        u64 bytes_readonly;        /* total bytes that are read only */
        u64 bytes_zone_unusable;        /* total bytes that are unusable until
                                           resetting the device zone */

        u64 max_extent_size;        /* This will hold the maximum extent size of
                                   the space info if we had an ENOSPC in the
                                   allocator. */
        /* Chunk size in bytes */
        u64 chunk_size;

        /*
         * Once a block group drops below this threshold (percents) we'll
         * schedule it for reclaim.
         */
        int bg_reclaim_threshold;

        int clamp;                /* Used to scale our threshold for preemptive
                                   flushing. The value is >> clamp, so turns
                                   out to be a 2^clamp divisor. */

        unsigned int full:1;        /* indicates that we cannot allocate any more
                                   chunks for this space */
        unsigned int chunk_alloc:1;        /* set if we are allocating a chunk */

        unsigned int flush:1;                /* set if we are trying to make space */

        /* Note: a full unsigned int, not a one-bit flag like the fields above */
        unsigned int force_alloc;        /* set if we need to force a chunk
                                           alloc for this space */

        u64 disk_used;                /* total bytes used on disk */
        u64 disk_total;                /* total bytes on disk, takes mirrors into
                                   account */

        /* BTRFS_BLOCK_GROUP_* bits describing this space */
        u64 flags;

        struct list_head list;
        /* Protected by the spinlock 'lock'. */
        struct list_head ro_bgs;
        struct list_head priority_tickets;
        struct list_head tickets;

        /*
         * Size of space that needs to be reclaimed in order to satisfy pending
         * tickets
         */
        u64 reclaim_size;

        /*
         * tickets_id just indicates the next ticket will be handled, so note
         * it's not stored per ticket.
         */
        u64 tickets_id;

        struct rw_semaphore groups_sem;
        /* for block groups in our same type */
        struct list_head block_groups[BTRFS_NR_RAID_TYPES];

        struct kobject kobj;
        /* One kobject pointer per RAID type, indexed like block_groups[] */
        struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
};

/*
 * A pending space reservation request.
 *
 * NOTE(review): presumably linked through @list onto
 * btrfs_space_info::tickets or ::priority_tickets, with the requester
 * sleeping on @wait until the ticket is granted or fails with @error -
 * confirm against space-info.c.
 */
struct reserve_ticket {
        u64 bytes;
        int error;
        bool steal;
        struct list_head list;
        wait_queue_head_t wait;
};

static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
{
        return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
                (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
}

/*
 * Generate btrfs_space_info_update_<name>(), a helper that applies the signed
 * delta @bytes to the counter sinfo-><name> while detecting underflow.
 *
 * The generated helper:
 * - requires sinfo->lock to be held (checked with lockdep_assert_held()),
 * - emits the trace_update_<name>() and trace_btrfs_space_reservation()
 *   tracepoints, the latter tagged with @trace_name, the absolute value of
 *   @bytes and whether the delta is positive,
 * - if a negative delta exceeds the current counter value, triggers
 *   WARN_ON(1) and sets the counter to 0 instead of letting it wrap,
 * - otherwise adds @bytes to the counter.
 */
#define DECLARE_SPACE_INFO_UPDATE(name, trace_name)                        \
static inline void                                                        \
btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info,                \
                               struct btrfs_space_info *sinfo,                \
                               s64 bytes)                                \
{                                                                        \
        const u64 abs_bytes = (bytes < 0) ? -bytes : bytes;                \
        lockdep_assert_held(&sinfo->lock);                                \
        trace_update_##name(fs_info, sinfo, sinfo->name, bytes);        \
        trace_btrfs_space_reservation(fs_info, trace_name,                \
                                      sinfo->flags, abs_bytes,                \
                                      bytes > 0);                        \
        if (bytes < 0 && sinfo->name < -bytes) {                        \
                WARN_ON(1);                                                \
                sinfo->name = 0;                                        \
                return;                                                        \
        }                                                                \
        sinfo->name += bytes;                                                \
}

/*
 * Instantiates btrfs_space_info_update_bytes_may_use() and
 * btrfs_space_info_update_bytes_pinned().
 */
DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");

int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
                                struct btrfs_block_group *block_group);
void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
                                        u64 chunk_size);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
                                               u64 flags);
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
                          bool may_use_included);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
                           struct btrfs_space_info *info, u64 bytes,
                           int dump_block_groups);
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
                                 struct btrfs_space_info *space_info,
                                 u64 orig_bytes,
                                 enum btrfs_reserve_flush_enum flush);
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
                                struct btrfs_space_info *space_info);
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
                         struct btrfs_space_info *space_info, u64 bytes,
                         enum btrfs_reserve_flush_enum flush);

/*
 * Subtract @num_bytes from @space_info's bytes_may_use and then call
 * btrfs_try_granting_tickets(), both under space_info->lock.
 */
static inline void btrfs_space_info_free_bytes_may_use(
                                struct btrfs_fs_info *fs_info,
                                struct btrfs_space_info *space_info,
                                u64 num_bytes)
{
        /* The update helper takes a signed delta; negate to subtract */
        const s64 delta = -num_bytes;

        spin_lock(&space_info->lock);
        btrfs_space_info_update_bytes_may_use(fs_info, space_info, delta);
        btrfs_try_granting_tickets(fs_info, space_info);
        spin_unlock(&space_info->lock);
}
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
                             enum btrfs_reserve_flush_enum flush);
void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);

#endif /* BTRFS_SPACE_INFO_H */
















































































    1 








    1 





    1 

    1 



























































   16 




























































   15 










































   13 





















    4 

    4 




    3 




    4 

    4 












    2 

    2 


















    3 
    3 

    2 
    2 












    2 

    2 














    5 




    7 
















    5 
    5 






























    6 























    1 







    8 
    9 





    3 

    2 


















    5 
    5 















   14 








    4 










   14 












    1 
    1 



    1 















    3 
























































    4 





















    4 

















    4 





    1 
    3 

    4 










    4 




    4 


    4 






    4 






















































































































































































































    3 



    3 













    2 












    3 



    2 













    3 

























    3 
    3 



    2 









    3 


    3 















   11 
   11 






















   10 




   10 






    7 
    7 



    5 





















    5 
    4 

    5 






























    3 

    3 
    3 



    3 


    3 












































    2 
    3 





    3 















    3 










    3 







    2 






























    3 


    2 
    2 




    3 





















    3 
    3 








    1 
    1 






    2 










    2 

    2 





































    1 


    1 





















    1 









    1 


    1 


    1 

    1 






    1 


    1 

































    3 





























    4 


    3 
    3 
    2 
    2 

















    2 
    2 









    2 
    2 























































































































    1 
    1 


    1 


    1 
    1 









    5 


    5 




    5 
    1 













    5 








































    1 





















    4 


    2 

    2 

    2 





















    4 
















   11 
    8 



    6 
    4 






    4 








































    3 






    7 





    2 































    8 








    5 


    5 


    6 


    2 


    6 


    7 






















    5 



    6 














    3 
   10 














    9 














    9 




    9 




    9 










   10 



    7 


    7 













    7 


    7 










    2 




    1 






    1 


















    5 
    4 




    2 






    3 







    4 














    2 















    7 



    7 


    6 











    1 
    6 





















    4 



    4 
    2 


    4 



















    3 














    3 
    3 




    2 














    3 






















    2 
    2 






























































































































    7 

    7 



























    5 




    5 




































    3 
    3 






















    5 
    5 
    5 




    1 



















   30 



   23 

    4 


    1 

















   31 



   31 











   17 





















    3 
    3 






















    5 
    5 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1997 Linus Torvalds
 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
 */
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/memblock.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <linux/rw_hint.h>
#include <trace/events/writeback.h>
#include "internal.h"

/*
 * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget(), inode->i_io_list
 * Inode LRU list locks protect:
 *   inode->i_sb->s_inode_lru, inode->i_lru
 * inode->i_sb->s_inode_list_lock protects:
 *   inode->i_sb->s_inodes, inode->i_sb_list
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
 *
 * Lock ordering:
 *
 * inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *     Inode LRU list locks
 *
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
 *   inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *
 * iunique_lock
 *   inode_hash_lock
 */

/*
 * Inode hash table state.  hash() derives a bucket index using
 * i_hash_shift and i_hash_mask; inode_hash_lock is taken around insertion
 * into and removal from the hash chains.
 */
static unsigned int i_hash_mask __ro_after_init;
static unsigned int i_hash_shift __ro_after_init;
static struct hlist_head *inode_hashtable __ro_after_init;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);

/*
 * Empty aops. Can be used for the cases where the user does not
 * define any of the address_space operations.  inode_init_always()
 * installs it as the initial a_ops of every inode's embedded mapping.
 */
const struct address_space_operations empty_aops = {
};
EXPORT_SYMBOL(empty_aops);

/*
 * Per-CPU statistics: nr_inodes is bumped in inode_init_always() and
 * dropped in __destroy_inode(); nr_unused follows additions to and
 * removals from the superblock inode LRU lists.
 */
static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);

/* Slab cache used by alloc_inode() when the filesystem has no ->alloc_inode */
static struct kmem_cache *inode_cachep __ro_after_init;

static long get_nr_inodes(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_inodes, i);
        return sum < 0 ? 0 : sum;
}

static inline long get_nr_inodes_unused(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_unused, i);
        return sum < 0 ? 0 : sum;
}

/*
 * Estimate the number of dirty inodes as "all inodes minus unused inodes".
 * This is only a wild approximation, not a real count of dirty inodes, and
 * it never goes below zero.
 */
long get_nr_dirty_inodes(void)
{
        long in_use = get_nr_inodes() - get_nr_inodes_unused();

        if (in_use <= 0)
                return 0;
        return in_use;
}

/*
 * Handle nr_inode sysctl
 */
#ifdef CONFIG_SYSCTL
/*
 * Statistics gathering: backing store for the "inode-nr" and "inode-state"
 * sysctl files.  proc_nr_inodes() refreshes nr_inodes and nr_unused on
 * every access.
 */
static struct inodes_stat_t inodes_stat;

/*
 * sysctl handler shared by "inode-nr" and "inode-state": refresh the
 * counters in inodes_stat from the per-CPU totals, then let
 * proc_doulongvec_minmax() do the actual transfer.
 */
static int proc_nr_inodes(struct ctl_table *table, int write, void *buffer,
                          size_t *lenp, loff_t *ppos)
{
        struct inodes_stat_t *stat = &inodes_stat;

        stat->nr_inodes = get_nr_inodes();
        stat->nr_unused = get_nr_inodes_unused();

        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/*
 * Both entries are read-only (0444), point at inodes_stat and share
 * proc_nr_inodes() as handler; they differ only in how many longs of the
 * structure are exposed (2 for "inode-nr", 7 for "inode-state").
 */
static struct ctl_table inodes_sysctls[] = {
        {
                .procname        = "inode-nr",
                .data                = &inodes_stat,
                .maxlen                = 2*sizeof(long),
                .mode                = 0444,
                .proc_handler        = proc_nr_inodes,
        },
        {
                .procname        = "inode-state",
                .data                = &inodes_stat,
                .maxlen                = 7*sizeof(long),
                .mode                = 0444,
                .proc_handler        = proc_nr_inodes,
        },
};

/* Register the inode statistics table under "fs"; run as an early initcall. */
static int __init init_fs_inode_sysctls(void)
{
        register_sysctl_init("fs", inodes_sysctls);
        return 0;
}
early_initcall(init_fs_inode_sysctls);
#endif

/*
 * ->open() of the default file_operations installed by inode_init_always():
 * always fails with -ENXIO.
 */
static int no_open(struct inode *inode, struct file *file)
{
        return -ENXIO;
}

/**
 * inode_init_always - perform inode structure initialisation
 * @sb: superblock inode belongs to
 * @inode: inode to initialise
 *
 * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
 *
 * The inode comes out with a reference count of one, a link count of one,
 * empty inode operations, file operations whose ->open() fails with
 * -ENXIO, and its embedded i_data set up as its mapping.
 *
 * Return: 0 on success, -ENOMEM if security_inode_alloc() fails.  The
 * per-CPU nr_inodes counter is only incremented on success.
 */
int inode_init_always(struct super_block *sb, struct inode *inode)
{
        static const struct inode_operations empty_iops;
        static const struct file_operations no_open_fops = {.open = no_open};
        struct address_space *const mapping = &inode->i_data;

        inode->i_sb = sb;
        inode->i_blkbits = sb->s_blocksize_bits;
        inode->i_flags = 0;
        atomic64_set(&inode->i_sequence, 0);
        /* the caller receives the initial reference */
        atomic_set(&inode->i_count, 1);
        inode->i_op = &empty_iops;
        inode->i_fop = &no_open_fops;
        inode->i_ino = 0;
        inode->__i_nlink = 1;
        inode->i_opflags = 0;
        if (sb->s_xattr)
                inode->i_opflags |= IOP_XATTR;
        i_uid_write(inode, 0);
        i_gid_write(inode, 0);
        atomic_set(&inode->i_writecount, 0);
        inode->i_size = 0;
        inode->i_write_hint = WRITE_LIFE_NOT_SET;
        inode->i_blocks = 0;
        inode->i_bytes = 0;
        inode->i_generation = 0;
        inode->i_pipe = NULL;
        inode->i_cdev = NULL;
        inode->i_link = NULL;
        inode->i_dir_seq = 0;
        inode->i_rdev = 0;
        inode->dirtied_when = 0;

#ifdef CONFIG_CGROUP_WRITEBACK
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
#endif

        /* locks get per-filesystem-type lockdep classes */
        spin_lock_init(&inode->i_lock);
        lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);

        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);

        atomic_set(&inode->i_dio_count, 0);

        /* per-allocation state of the embedded address_space */
        mapping->a_ops = &empty_aops;
        mapping->host = inode;
        mapping->flags = 0;
        mapping->wb_err = 0;
        atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        atomic_set(&mapping->nr_thps, 0);
#endif
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
        mapping->i_private_data = NULL;
        mapping->writeback_index = 0;
        init_rwsem(&mapping->invalidate_lock);
        lockdep_set_class_and_name(&mapping->invalidate_lock,
                                   &sb->s_type->invalidate_lock_key,
                                   "mapping.invalidate_lock");
        if (sb->s_iflags & SB_I_STABLE_WRITES)
                mapping_set_stable_writes(mapping);
        inode->i_private = NULL;
        inode->i_mapping = mapping;
        INIT_HLIST_HEAD(&inode->i_dentry);        /* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
        inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif

#ifdef CONFIG_FSNOTIFY
        inode->i_fsnotify_mask = 0;
#endif
        inode->i_flctx = NULL;

        /* the only step that can fail; keep the accounting after it */
        if (unlikely(security_inode_alloc(inode)))
                return -ENOMEM;
        this_cpu_inc(nr_inodes);

        return 0;
}
EXPORT_SYMBOL(inode_init_always);

/*
 * Hand @inode straight back to inode_cachep.  Also used by i_callback()
 * as the fallback when the inode has no ->free_inode method.
 */
void free_inode_nonrcu(struct inode *inode)
{
        kmem_cache_free(inode_cachep, inode);
}
EXPORT_SYMBOL(free_inode_nonrcu);

/*
 * Final free of an inode, given its embedded rcu_head.  destroy_inode()
 * queues this via call_rcu(); alloc_inode() calls it directly on its error
 * path.  Uses the inode's ->free_inode method if one was recorded, and
 * falls back to free_inode_nonrcu() otherwise.
 */
static void i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);

        if (!inode->free_inode) {
                free_inode_nonrcu(inode);
                return;
        }
        inode->free_inode(inode);
}

/*
 * Allocate and initialise an inode for @sb, using the filesystem's
 * ->alloc_inode method when it has one and inode_cachep otherwise.
 *
 * Returns the new inode, or NULL if allocation or inode_init_always()
 * failed.  On an initialisation failure the inode is released again through
 * the filesystem's ->destroy_inode / ->free_inode methods.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
        const struct super_operations *ops = sb->s_op;
        struct inode *inode;

        if (!ops->alloc_inode)
                inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);
        else
                inode = ops->alloc_inode(sb);

        if (!inode)
                return NULL;

        if (likely(!inode_init_always(sb, inode)))
                return inode;

        /* Initialisation failed: give the inode back to the filesystem. */
        if (ops->destroy_inode) {
                ops->destroy_inode(inode);
                /* without ->free_inode, ->destroy_inode did all the work */
                if (!ops->free_inode)
                        return NULL;
        }
        inode->free_inode = ops->free_inode;
        i_callback(&inode->i_rcu);
        return NULL;
}

/*
 * Undo the generic (VFS-level) state attached to @inode before its memory
 * is released: writeback association, security blob, fsnotify marks, file
 * lock context and cached POSIX ACLs.  Also drops the superblock's
 * s_remove_count for an inode whose link count is zero and decrements the
 * per-CPU nr_inodes counter.  The inode must not have buffers attached.
 */
void __destroy_inode(struct inode *inode)
{
        BUG_ON(inode_has_buffers(inode));
        inode_detach_wb(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
        locks_free_lock_context(inode);
        /* balances the increment done when i_nlink dropped to zero */
        if (!inode->i_nlink) {
                WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
                atomic_long_dec(&inode->i_sb->s_remove_count);
        }

#ifdef CONFIG_FS_POSIX_ACL
        if (inode->i_acl && !is_uncached_acl(inode->i_acl))
                posix_acl_release(inode->i_acl);
        if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
                posix_acl_release(inode->i_default_acl);
#endif
        this_cpu_dec(nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);

/*
 * Tear down and free @inode.  The inode must already be off the LRU.
 *
 * After the generic teardown, the filesystem's ->destroy_inode (if any) is
 * called.  If the filesystem has no ->free_inode, that call is expected to
 * have disposed of the inode and we are done; otherwise the final free is
 * deferred to i_callback() via call_rcu().
 */
static void destroy_inode(struct inode *inode)
{
        const struct super_operations *ops = inode->i_sb->s_op;
        void (*free_fn)(struct inode *) = ops->free_inode;

        BUG_ON(!list_empty(&inode->i_lru));
        __destroy_inode(inode);

        if (ops->destroy_inode) {
                ops->destroy_inode(inode);
                /* do not touch the inode again unless we still own it */
                if (!free_fn)
                        return;
        }

        inode->free_inode = free_fn;
        call_rcu(&inode->i_rcu, i_callback);
}

/**
 * drop_nlink - directly drop an inode's link count
 * @inode: inode
 *
 * Low-level helper that filesystems use instead of modifying i_nlink
 * themselves.  Where writes to the filesystem are being tracked, a
 * decrement to zero signals an imminent write: the file is about to be
 * truncated and actually unlinked on the filesystem.  That transition is
 * recorded in the superblock's s_remove_count.
 *
 * Warns if the link count is already zero.
 */
void drop_nlink(struct inode *inode)
{
        WARN_ON(!inode->i_nlink);

        inode->__i_nlink -= 1;
        if (inode->i_nlink)
                return;

        atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(drop_nlink);

/**
 * clear_nlink - directly zero an inode's link count
 * @inode: inode
 *
 * Low-level helper that filesystems use instead of modifying i_nlink
 * themselves.  See drop_nlink() for why i_nlink reaching zero matters.
 * Does nothing if the link count is already zero.
 */
void clear_nlink(struct inode *inode)
{
        if (!inode->i_nlink)
                return;

        inode->__i_nlink = 0;
        atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(clear_nlink);

/**
 * set_nlink - directly set an inode's link count
 * @inode: inode
 * @nlink: new nlink (should be non-zero)
 *
 * Low-level helper that filesystems use instead of modifying i_nlink
 * themselves.  A zero @nlink is handled by clear_nlink(); a transition
 * from zero to non-zero drops the superblock's s_remove_count again.
 */
void set_nlink(struct inode *inode, unsigned int nlink)
{
        if (!nlink) {
                clear_nlink(inode);
                return;
        }

        /* Yes, some filesystems do change nlink from zero to one */
        if (!inode->i_nlink)
                atomic_long_dec(&inode->i_sb->s_remove_count);

        inode->__i_nlink = nlink;
}
EXPORT_SYMBOL(set_nlink);

/**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode
 *
 * Low-level helper that filesystems use instead of modifying i_nlink
 * themselves; the counterpart of drop_nlink().  Raising the count from
 * zero drops the superblock's s_remove_count and warns unless the inode is
 * marked I_LINKABLE.
 */
void inc_nlink(struct inode *inode)
{
        if (unlikely(!inode->i_nlink)) {
                WARN_ON(!(inode->i_state & I_LINKABLE));
                atomic_long_dec(&inode->i_sb->s_remove_count);
        }

        inode->__i_nlink += 1;
}
EXPORT_SYMBOL(inc_nlink);

/*
 * Set up the parts of an address_space that are initialised once per slab
 * object: the page xarray, the i_mmap tree and its rwsem, and the private
 * list with its lock.  Does not zero the structure; callers do that.
 */
static void __address_space_init_once(struct address_space *mapping)
{
        xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
        init_rwsem(&mapping->i_mmap_rwsem);
        INIT_LIST_HEAD(&mapping->i_private_list);
        spin_lock_init(&mapping->i_private_lock);
        mapping->i_mmap = RB_ROOT_CACHED;
}

/*
 * One-time initialisation of a stand-alone address_space: zero the whole
 * structure, then set up its locks, lists and trees.
 */
void address_space_init_once(struct address_space *mapping)
{
        memset(mapping, 0, sizeof(*mapping));
        __address_space_init_once(mapping);
}
EXPORT_SYMBOL(address_space_init_once);

/*
 * These are initializations that only need to be done
 * once, because the fields are idempotent across use
 * of the inode, so let the slab allocator know about that.
 *
 * The inode is zeroed first, then its hash node, list heads and embedded
 * address_space are set up.
 */
void inode_init_once(struct inode *inode)
{
        memset(inode, 0, sizeof(*inode));
        INIT_HLIST_NODE(&inode->i_hash);
        INIT_LIST_HEAD(&inode->i_devices);
        INIT_LIST_HEAD(&inode->i_io_list);
        INIT_LIST_HEAD(&inode->i_wb_list);
        INIT_LIST_HEAD(&inode->i_lru);
        INIT_LIST_HEAD(&inode->i_sb_list);
        __address_space_init_once(&inode->i_data);
        i_size_ordered_init(inode);
}
EXPORT_SYMBOL(inode_init_once);

/*
 * void * adapter around inode_init_once(); presumably registered as the
 * constructor of inode_cachep (the registration is not in this part of the
 * file).
 */
static void init_once(void *foo)
{
        inode_init_once(foo);
}

/*
 * Take an additional reference on @inode by bumping i_count.
 *
 * inode->i_lock must be held
 */
void __iget(struct inode *inode)
{
        atomic_inc(&inode->i_count);
}

/*
 * get additional reference to inode; caller must already hold one.
 *
 * Warns if the count after the increment is below two, i.e. if the caller
 * did not in fact hold a reference.
 */
void ihold(struct inode *inode)
{
        WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}
EXPORT_SYMBOL(ihold);

/*
 * Put @inode on its superblock's LRU if it qualifies: not dirty, not under
 * sync, not being freed, no references, superblock still active and the
 * mapping shrinkable.
 *
 * If list_lru_add_obj() adds the inode, the per-CPU nr_unused counter is
 * bumped.  If it does not (presumably because the inode is already on the
 * list) and @rotate is true, the inode is marked I_REFERENCED instead.
 */
static void __inode_add_lru(struct inode *inode, bool rotate)
{
        if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
                return;
        if (atomic_read(&inode->i_count))
                return;
        if (!(inode->i_sb->s_flags & SB_ACTIVE))
                return;
        if (!mapping_shrinkable(&inode->i_data))
                return;

        if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
                this_cpu_inc(nr_unused);
        else if (rotate)
                inode->i_state |= I_REFERENCED;
}

/*
 * Add inode to LRU if needed (inode is unused and clean).
 *
 * Never sets I_REFERENCED: __inode_add_lru() is called with rotate=false.
 *
 * Needs inode->i_lock held.
 */
void inode_add_lru(struct inode *inode)
{
        __inode_add_lru(inode, false);
}

/*
 * Take @inode off its superblock's LRU.  The per-CPU nr_unused counter is
 * only decremented if list_lru_del_obj() reports that it removed the inode.
 */
static void inode_lru_list_del(struct inode *inode)
{
        if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
                this_cpu_dec(nr_unused);
}

/**
 * inode_sb_list_add - add inode to the superblock list of inodes
 * @inode: inode to add
 */
void inode_sb_list_add(struct inode *inode)
{
        spin_lock(&inode->i_sb->s_inode_list_lock);
        list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
        spin_unlock(&inode->i_sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);

static inline void inode_sb_list_del(struct inode *inode)
{
        if (!list_empty(&inode->i_sb_list)) {
                spin_lock(&inode->i_sb->s_inode_list_lock);
                list_del_init(&inode->i_sb_list);
                spin_unlock(&inode->i_sb->s_inode_list_lock);
        }
}

/*
 * Map a (superblock, hash value) pair to a bucket index in
 * inode_hashtable.  The superblock's address is mixed in, and the result
 * is masked with i_hash_mask.
 */
static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
        unsigned long tmp;

        /*
         * NOTE: '/' binds tighter than '^', so only the
         * (GOLDEN_RATIO_PRIME + hashval) term is divided by L1_CACHE_BYTES.
         */
        tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
                        L1_CACHE_BYTES;
        tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
        return tmp & i_hash_mask;
}

/**
 * __insert_inode_hash - hash an inode
 * @inode: unhashed inode
 * @hashval: unsigned long value used to locate this object in the
 *           inode_hashtable.
 *
 * Add an inode to the inode hash for this superblock.  The bucket is
 * chosen from the inode's superblock and @hashval; the insertion is done
 * under inode_hash_lock and inode->i_lock.
 */
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
        struct hlist_head *bucket;

        bucket = &inode_hashtable[hash(inode->i_sb, hashval)];

        spin_lock(&inode_hash_lock);
        spin_lock(&inode->i_lock);
        hlist_add_head_rcu(&inode->i_hash, bucket);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);

/**
 *        __remove_inode_hash - remove an inode from the hash
 *        @inode: inode to unhash
 *
 *        Remove an inode from the inode hash table.  The removal is done
 *        under inode_hash_lock and inode->i_lock, and leaves i_hash
 *        reinitialised.
 */
void __remove_inode_hash(struct inode *inode)
{
        spin_lock(&inode_hash_lock);
        spin_lock(&inode->i_lock);
        hlist_del_init_rcu(&inode->i_hash);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__remove_inode_hash);

/**
 * dump_mapping - print diagnostic information about an address_space
 * @mapping: mapping to describe; may be an invalid or corrupted pointer
 *
 * Debugging aid: logs the mapping's a_ops, the host inode number and the
 * name of the first dentry aliasing the host inode, as far as each of
 * those can be read.  Every pointer derived from @mapping is read with a
 * fault-tolerant accessor, so that a bogus mapping produces a warning
 * line instead of a crash.  Names longer than the local buffer are
 * truncated.
 */
void dump_mapping(const struct address_space *mapping)
{
        struct inode *host;
        const struct address_space_operations *a_ops;
        struct hlist_node *dentry_first;
        struct dentry *dentry_ptr;
        struct dentry dentry;
        char fname[64] = { 0 };
        unsigned long ino;

        /*
         * If mapping is an invalid pointer, we don't want to crash
         * accessing it, so probe everything depending on it carefully.
         */
        if (get_kernel_nofault(host, &mapping->host) ||
            get_kernel_nofault(a_ops, &mapping->a_ops)) {
                pr_warn("invalid mapping:%px\n", mapping);
                return;
        }

        if (!host) {
                pr_warn("aops:%ps\n", a_ops);
                return;
        }

        if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
            get_kernel_nofault(ino, &host->i_ino)) {
                pr_warn("aops:%ps invalid inode:%px\n", a_ops, host);
                return;
        }

        if (!dentry_first) {
                pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
                return;
        }

        dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
        if (get_kernel_nofault(dentry, dentry_ptr) ||
            !dentry.d_parent || !dentry.d_name.name) {
                pr_warn("aops:%ps ino:%lx invalid dentry:%px\n",
                                a_ops, ino, dentry_ptr);
                return;
        }

        /*
         * dentry.d_name.name was copied out of memory we do not trust, so
         * it must not be dereferenced directly (as the %pd handler would
         * do).  Copy the name with a fault-tolerant accessor instead.
         */
        if (strncpy_from_kernel_nofault(fname, dentry.d_name.name,
                                        sizeof(fname) - 1) < 0)
                strscpy(fname, "<invalid>", sizeof(fname));

        /*
         * Even if the copy succeeded, the name may be unreliable when the
         * dentry is corrupted, but printing it can no longer fault.
         */
        pr_warn("aops:%ps ino:%lx dentry name:\"%s\"\n", a_ops, ino, fname);
}

/*
 * Mark an inode that is being freed as fully cleared: check that no pages,
 * private list entries or writeback list membership remain, then set
 * i_state to I_FREEING | I_CLEAR.  The inode must already be I_FREEING and
 * must not be I_CLEAR yet.
 */
void clear_inode(struct inode *inode)
{
        /*
         * We have to cycle the i_pages lock here because reclaim can be in the
         * process of removing the last page (in __filemap_remove_folio())
         * and we must not free the mapping under it.
         */
        xa_lock_irq(&inode->i_data.i_pages);
        BUG_ON(inode->i_data.nrpages);
        /*
         * Almost always, mapping_empty(&inode->i_data) here; but there are
         * two known and long-standing ways in which nodes may get left behind
         * (when deep radix-tree node allocation failed partway; or when THP
         * collapse_file() failed). Until those two known cases are cleaned up,
         * or a cleanup function is called here, do not BUG_ON(!mapping_empty),
         * nor even WARN_ON(!mapping_empty).
         */
        xa_unlock_irq(&inode->i_data.i_pages);
        BUG_ON(!list_empty(&inode->i_data.i_private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(inode->i_state & I_CLEAR);
        BUG_ON(!list_empty(&inode->i_wb_list));
        /* don't need i_lock here, no concurrent mods to i_state */
        inode->i_state = I_FREEING | I_CLEAR;
}
EXPORT_SYMBOL(clear_inode);

/*
 * Free the inode passed in, removing it from the lists it is still connected
 * to. We remove any pages still attached to the inode and wait for any IO that
 * is still in progress before finally destroying the inode.
 *
 * An inode must already be marked I_FREEING so that we avoid the inode being
 * moved back onto lists if we race with other code that manipulates the lists
 * (e.g. writeback_single_inode). The caller is responsible for setting this.
 *
 * An inode must already be removed from the LRU list before being evicted from
 * the cache. This should occur atomically with setting the I_FREEING state
 * flag, so no inodes here should ever be on the LRU when being evicted.
 */
static void evict(struct inode *inode)
{
        const struct super_operations *op = inode->i_sb->s_op;

        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(!list_empty(&inode->i_lru));

        if (!list_empty(&inode->i_io_list))
                inode_io_list_del(inode);

        inode_sb_list_del(inode);

        /*
         * Wait for flusher thread to be done with the inode so that filesystem
         * does not start destroying it while writeback is still running. Since
         * the inode has I_FREEING set, flusher thread won't start new work on
         * the inode.  We just have to wait for running writeback to finish.
         */
        inode_wait_for_writeback(inode);

        /*
         * Without a filesystem ->evict_inode, do the generic teardown:
         * drop the page cache and mark the inode clear.
         */
        if (op->evict_inode) {
                op->evict_inode(inode);
        } else {
                truncate_inode_pages_final(&inode->i_data);
                clear_inode(inode);
        }
        if (S_ISCHR(inode->i_mode) && inode->i_cdev)
                cd_forget(inode);

        remove_inode_hash(inode);

        /*
         * Wake up waiters on the __I_NEW bit of i_state; by now the inode
         * must be exactly I_FREEING | I_CLEAR.
         */
        spin_lock(&inode->i_lock);
        wake_up_bit(&inode->i_state, __I_NEW);
        BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
        spin_unlock(&inode->i_lock);

        destroy_inode(inode);
}

/*
 * dispose_list - dispose of the contents of a local list
 * @head: the head of the list to free
 *
 * The list is private to the caller and its inodes are linked through
 * i_lru, so no list locking is needed here.  Each inode is unlinked and
 * evicted in turn, with a reschedule point after every eviction.
 */
static void dispose_list(struct list_head *head)
{
        struct inode *victim;

        for (;;) {
                if (list_empty(head))
                        break;

                victim = list_first_entry(head, struct inode, i_lru);
                list_del_init(&victim->i_lru);

                evict(victim);
                cond_resched();
        }
}

/**
 * evict_inodes        - evict all evictable inodes for a superblock
 * @sb:                superblock to operate on
 *
 * Make sure that no inodes with zero refcount are retained.  This is
 * called by superblock shutdown after having SB_ACTIVE flag removed,
 * so any inode reaching zero refcount during or after that call will
 * be immediately evicted.
 *
 * Inodes that are referenced, or that are I_NEW, I_FREEING or I_WILL_FREE,
 * are skipped.  The others are marked I_FREEING, taken off the LRU and
 * collected on a local list that is handed to dispose_list().
 */
void evict_inodes(struct super_block *sb)
{
        struct inode *inode, *next;
        LIST_HEAD(dispose);

again:
        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                /* unlocked check; referenced inodes are left alone */
                if (atomic_read(&inode->i_count))
                        continue;

                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                inode->i_state |= I_FREEING;
                inode_lru_list_del(inode);
                spin_unlock(&inode->i_lock);
                /* i_lru is reused to link the inode on the dispose list */
                list_add(&inode->i_lru, &dispose);

                /*
                 * We can have a ton of inodes to evict at unmount time given
                 * enough memory, check to see if we need to go to sleep for a
                 * bit so we don't livelock.
                 */
                if (need_resched()) {
                        spin_unlock(&sb->s_inode_list_lock);
                        cond_resched();
                        dispose_list(&dispose);
                        /* the list lock was dropped, so rescan from the start */
                        goto again;
                }
        }
        spin_unlock(&sb->s_inode_list_lock);

        dispose_list(&dispose);
}
EXPORT_SYMBOL_GPL(evict_inodes);

/**
 * invalidate_inodes        - attempt to free all inodes on a superblock
 * @sb:                superblock to operate on
 *
 * Attempts to free all inodes (including dirty inodes) for a given superblock.
 *
 * Inodes that are I_NEW, I_FREEING or I_WILL_FREE, or that still have
 * references, are skipped.  The others are marked I_FREEING, taken off the
 * LRU and collected on a local list that is handed to dispose_list().
 */
void invalidate_inodes(struct super_block *sb)
{
        struct inode *inode, *next;
        LIST_HEAD(dispose);

again:
        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                /* unlike evict_inodes(), i_count is checked under i_lock */
                if (atomic_read(&inode->i_count)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                inode->i_state |= I_FREEING;
                inode_lru_list_del(inode);
                spin_unlock(&inode->i_lock);
                /* i_lru is reused to link the inode on the dispose list */
                list_add(&inode->i_lru, &dispose);
                if (need_resched()) {
                        spin_unlock(&sb->s_inode_list_lock);
                        cond_resched();
                        dispose_list(&dispose);
                        /* the list lock was dropped, so rescan from the start */
                        goto again;
                }
        }
        spin_unlock(&sb->s_inode_list_lock);

        dispose_list(&dispose);
}

/*
 * Isolate the inode from the LRU in preparation for freeing it.
 *
 * If the inode has the I_REFERENCED flag set, then it means that it has been
 * used recently - the flag is set in iput_final(). When we encounter such an
 * inode, clear the flag and move it to the back of the LRU so it gets another
 * pass through the LRU before it gets reclaimed. This is necessary because of
 * the fact we are doing lazy LRU updates to minimise lock contention so the
 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
 * with this flag set because they are the inodes that are out of order.
 *
 * @arg is the list_head onto which freeable inodes are moved.
 *
 * Returns LRU_SKIP if i_lock could not be taken, LRU_REMOVED if the inode
 * was taken off the LRU (either because it is no longer reclaimable or
 * because it was moved to the freeable list), LRU_ROTATE for a recently
 * referenced inode, and LRU_RETRY after the LRU lock was dropped to strip
 * buffers and page cache from the inode.
 */
static enum lru_status inode_lru_isolate(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        struct list_head *freeable = arg;
        struct inode        *inode = container_of(item, struct inode, i_lru);

        /*
         * We are inverting the lru lock/inode->i_lock here, so use a
         * trylock. If we fail to get the lock, just skip it.
         */
        if (!spin_trylock(&inode->i_lock))
                return LRU_SKIP;

        /*
         * Inodes can get referenced, redirtied, or repopulated while
         * they're already on the LRU, and this can make them
         * unreclaimable for a while. Remove them lazily here; iput,
         * sync, or the last page cache deletion will requeue them.
         */
        if (atomic_read(&inode->i_count) ||
            (inode->i_state & ~I_REFERENCED) ||
            !mapping_shrinkable(&inode->i_data)) {
                list_lru_isolate(lru, &inode->i_lru);
                spin_unlock(&inode->i_lock);
                this_cpu_dec(nr_unused);
                return LRU_REMOVED;
        }

        /* Recently referenced inodes get one more pass */
        if (inode->i_state & I_REFERENCED) {
                inode->i_state &= ~I_REFERENCED;
                spin_unlock(&inode->i_lock);
                return LRU_ROTATE;
        }

        /*
         * On highmem systems, mapping_shrinkable() permits dropping
         * page cache in order to free up struct inodes: lowmem might
         * be under pressure before the cache inside the highmem zone.
         */
        if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
                /* pin the inode while both locks are dropped */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(lru_lock);
                if (remove_inode_buffers(inode)) {
                        unsigned long reap;
                        reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
                        if (current_is_kswapd())
                                __count_vm_events(KSWAPD_INODESTEAL, reap);
                        else
                                __count_vm_events(PGINODESTEAL, reap);
                        mm_account_reclaimed_pages(reap);
                }
                iput(inode);
                /* the walker expects lru_lock to be held again on return */
                spin_lock(lru_lock);
                return LRU_RETRY;
        }

        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
        list_lru_isolate_move(lru, &inode->i_lru, freeable);
        spin_unlock(&inode->i_lock);

        this_cpu_dec(nr_unused);
        return LRU_REMOVED;
}

/*
 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 * This is called from the superblock shrinker function with a number of inodes
 * to trim from the LRU. Inodes to be freed are moved to a temporary list by
 * inode_lru_isolate() and are then evicted by dispose_list() once the LRU
 * walk has finished.
 *
 * Returns the value reported by list_lru_shrink_walk().
 */
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
        LIST_HEAD(freeable);
        long nr_freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
                                             inode_lru_isolate, &freeable);

        dispose_list(&freeable);
        return nr_freed;
}

static void __wait_on_freeing_inode(struct inode *inode);
/*
 * Look up an inode of @sb on hash chain @head for which @test(inode, @data)
 * returns non-zero.
 *
 * Called with the inode lock held.
 * NOTE(review): "inode lock" presumably means inode_hash_lock, since the
 * chain is walked without RCU protection - confirm against the callers.
 *
 * Returns the inode with an extra reference taken, NULL if no inode on the
 * chain matches, or ERR_PTR(-ESTALE) if the matching inode is I_CREATING.
 * A matching inode that is being freed is waited for and the scan is then
 * restarted.
 */
static struct inode *find_inode(struct super_block *sb,
                                struct hlist_head *head,
                                int (*test)(struct inode *, void *),
                                void *data)
{
        struct inode *inode = NULL;

repeat:
        hlist_for_each_entry(inode, head, i_hash) {
                if (inode->i_sb != sb)
                        continue;
                if (!test(inode, data))
                        continue;
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
                        /*
                         * Entered with i_lock held and not unlocked here;
                         * presumably the helper drops it - TODO confirm.
                         */
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
                if (unlikely(inode->i_state & I_CREATING)) {
                        spin_unlock(&inode->i_lock);
                        return ERR_PTR(-ESTALE);
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
                return inode;
        }
        return NULL;
}

/*
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 *
 * Instead of calling a test callback, it matches on the inode number @ino
 * and the superblock @sb.  The return values are the same as for
 * find_inode(): the inode with an extra reference taken, NULL if nothing
 * matches, or ERR_PTR(-ESTALE) if the matching inode is I_CREATING.
 */
static struct inode *find_inode_fast(struct super_block *sb,
                                struct hlist_head *head, unsigned long ino)
{
        struct inode *inode = NULL;

repeat:
        hlist_for_each_entry(inode, head, i_hash) {
                if (inode->i_ino != ino)
                        continue;
                if (inode->i_sb != sb)
                        continue;
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
                        /* wait for the inode being freed, then rescan the chain */
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
                if (unlikely(inode->i_state & I_CREATING)) {
                        spin_unlock(&inode->i_lock);
                        return ERR_PTR(-ESTALE);
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
                return inode;
        }
        return NULL;
}

/*
 * Each cpu owns a range of LAST_INO_BATCH numbers.
 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
 * to renew the exhausted range.
 *
 * This does not significantly increase overflow rate because every CPU can
 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
 * overflow rate by 2x, which does not seem too significant.
 *
 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
 * error if st_ino won't fit in target struct field. Use 32bit counter
 * here to attempt to avoid that.
 */
#define LAST_INO_BATCH 1024
static DEFINE_PER_CPU(unsigned int, last_ino);

/*
 * Hand out the next inode number from this CPU's batch, fetching a new
 * batch from the shared counter on SMP when the current one is used up.
 * Never returns 0.
 */
unsigned int get_next_ino(void)
{
        unsigned int *slot = &get_cpu_var(last_ino);
        unsigned int ino = *slot;

#ifdef CONFIG_SMP
        /* a multiple of the batch size means this CPU's range is exhausted */
        if (unlikely(!(ino & (LAST_INO_BATCH - 1)))) {
                static atomic_t shared_last_ino;
                int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);

                ino = next - LAST_INO_BATCH;
        }
#endif

        /* get_next_ino should not provide a 0 inode number */
        if (unlikely(++ino == 0))
                ino++;

        *slot = ino;
        put_cpu_var(last_ino);
        return ino;
}
EXPORT_SYMBOL(get_next_ino);

/**
 *        new_inode_pseudo         - obtain an inode
 *        @sb: superblock
 *
 *        Allocates a new inode for given superblock.
 *        Inode wont be chained in superblock s_inodes list
 *        This means :
 *        - fs can't be unmount
 *        - quotas, fsnotify, writeback can't work
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
        struct inode *inode = alloc_inode(sb);

        if (inode) {
                spin_lock(&inode->i_lock);
                inode->i_state = 0;
                spin_unlock(&inode->i_lock);
        }
        return inode;
}

/**
 *        new_inode         - obtain an inode
 *        @sb: superblock
 *
 *        Allocates a new inode for given superblock. The default gfp_mask
 *        for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
 *        If HIGHMEM pages are unsuitable or it is known that pages allocated
 *        for the page cache are not reclaimable or migratable,
 *        mapping_set_gfp_mask() must be called with suitable flags on the
 *        newly created inode's mapping
 *
 */
struct inode *new_inode(struct super_block *sb)
{
        struct inode *inode;

        inode = new_inode_pseudo(sb);
        if (inode)
                inode_sb_list_add(inode);
        return inode;
}
EXPORT_SYMBOL(new_inode);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Move a directory inode's i_rwsem to the filesystem type's directory lock
 * class, unless the filesystem has already assigned a class of its own.
 */
void lockdep_annotate_inode_mutex_key(struct inode *inode)
{
        struct file_system_type *type;

        /* Only directories are re-annotated. */
        if (!S_ISDIR(inode->i_mode))
                return;

        type = inode->i_sb->s_type;

        /* Set new key only if filesystem hasn't already changed it */
        if (!lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key))
                return;

        /* Re-initialise the rwsem, then attach the directory key. */
        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key);
}
EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
#endif

/**
 * unlock_new_inode - clear the I_NEW state and wake up any waiters
 * @inode:        new inode to unlock
 *
 * Called when the inode is fully initialised to clear the new state of the
 * inode and wake up anyone waiting for the inode to finish initialisation.
 */
void unlock_new_inode(struct inode *inode)
{
        lockdep_annotate_inode_mutex_key(inode);

        spin_lock(&inode->i_lock);
        WARN_ON(!(inode->i_state & I_NEW));

        /* Clear both I_NEW and I_CREATING in a single update. */
        inode->i_state &= ~(I_NEW | I_CREATING);

        /* Full barrier between the i_state update and the wakeup. */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);

        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(unlock_new_inode);

/**
 * discard_new_inode - clear I_NEW, wake up any waiters and put the inode
 * @inode:        new inode to discard
 *
 * Clears I_NEW under i_lock and wakes anyone waiting on that bit, then drops
 * a reference with iput().  Unlike unlock_new_inode(), I_CREATING is left
 * untouched.
 */
void discard_new_inode(struct inode *inode)
{
        lockdep_annotate_inode_mutex_key(inode);
        spin_lock(&inode->i_lock);
        WARN_ON(!(inode->i_state & I_NEW));
        inode->i_state &= ~I_NEW;
        /* Full barrier between the i_state update and the wakeup. */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
        /* Done outside i_lock. */
        iput(inode);
}
EXPORT_SYMBOL(discard_new_inode);

/**
 * lock_two_nondirectories - take two i_mutexes on non-directory objects
 *
 * Lock any non-NULL argument. Passed objects must not be directories.
 * Zero, one or two objects may be locked by this function.
 *
 * @inode1: first inode to lock
 * @inode2: second inode to lock
 */
void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
        /* Neither argument may be a directory; NULL arguments are skipped. */
        WARN_ON_ONCE(inode1 && S_ISDIR(inode1->i_mode));
        WARN_ON_ONCE(inode2 && S_ISDIR(inode2->i_mode));

        /* Always take the lower-addressed inode first. */
        if (inode1 > inode2)
                swap(inode1, inode2);

        if (inode1)
                inode_lock(inode1);

        /* Same inode passed twice: lock it only once. */
        if (inode2 && inode2 != inode1)
                inode_lock_nested(inode2, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);

/**
 * unlock_two_nondirectories - release locks from lock_two_nondirectories()
 * @inode1: first inode to unlock
 * @inode2: second inode to unlock
 */
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
        if (inode1) {
                WARN_ON_ONCE(S_ISDIR(inode1->i_mode));
                inode_unlock(inode1);
        }
        if (inode2 && inode2 != inode1) {
                WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
                inode_unlock(inode2);
        }
}
EXPORT_SYMBOL(unlock_two_nondirectories);

/**
 * inode_insert5 - obtain an inode from a mounted file system
 * @inode:        pre-allocated inode to use for insert to cache
 * @hashval:        hash value (usually inode number) to get
 * @test:        callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:        opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count. This is
 * a variant of iget5_locked() for callers that don't want to fail on memory
 * allocation of inode.
 *
 * If the inode is not in cache, insert the pre-allocated inode to cache and
 * return it locked, hashed, and with the I_NEW flag set. The file system gets
 * to fill it in before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
                            int (*test)(struct inode *, void *),
                            int (*set)(struct inode *, void *), void *data)
{
        struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
        struct inode *old;

again:
        spin_lock(&inode_hash_lock);
        old = find_inode(inode->i_sb, head, test, data);
        if (unlikely(old)) {
                /*
                 * Uhhuh, somebody else created the same inode under us.
                 * Use the old inode instead of the preallocated one.
                 */
                spin_unlock(&inode_hash_lock);
                /* The lookup produced an error pointer: report failure as NULL. */
                if (IS_ERR(old))
                        return NULL;
                wait_on_inode(old);
                /* Unhashed while we waited: drop it and redo the lookup. */
                if (unlikely(inode_unhashed(old))) {
                        iput(old);
                        goto again;
                }
                return old;
        }

        /* @set is optional; a non-zero return aborts the insert (NULL result). */
        if (set && unlikely(set(inode, data))) {
                inode = NULL;
                goto unlock;
        }

        /*
         * Return the locked inode with I_NEW set, the
         * caller is responsible for filling in the contents
         */
        spin_lock(&inode->i_lock);
        inode->i_state |= I_NEW;
        hlist_add_head_rcu(&inode->i_hash, head);
        spin_unlock(&inode->i_lock);

        /*
         * Add inode to the sb list if it's not already. It has I_NEW at this
         * point, so it should be safe to test i_sb_list locklessly.
         */
        if (list_empty(&inode->i_sb_list))
                inode_sb_list_add(inode);
unlock:
        /* Shared exit for the insert and the failed-@set paths. */
        spin_unlock(&inode_hash_lock);

        return inode;
}
EXPORT_SYMBOL(inode_insert5);

/**
 * iget5_locked - obtain an inode from a mounted file system
 * @sb:                super block of file system
 * @hashval:        hash value (usually inode number) to get
 * @test:        callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:        opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count. This is
 * a generalized version of iget_locked() for file systems where the inode
 * number is not sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set. The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *), void *data)
{
        struct inode *inode = ilookup5(sb, hashval, test, data);

        if (!inode) {
                struct inode *new = alloc_inode(sb);

                if (new) {
                        new->i_state = 0;
                        inode = inode_insert5(new, hashval, test, set, data);
                        if (unlikely(inode != new))
                                destroy_inode(new);
                }
        }
        return inode;
}
EXPORT_SYMBOL(iget5_locked);

/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb:                super block of file system
 * @ino:        inode number to get
 *
 * Search for the inode specified by @ino in the inode cache and if present
 * return it with an increased reference count. This is for file systems
 * where the inode number is sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set.  The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
        struct inode *inode;
again:
        /* First pass: plain cache lookup under the hash lock. */
        spin_lock(&inode_hash_lock);
        inode = find_inode_fast(sb, head, ino);
        spin_unlock(&inode_hash_lock);
        if (inode) {
                /* The lookup produced an error pointer: report it as NULL. */
                if (IS_ERR(inode))
                        return NULL;
                wait_on_inode(inode);
                /* Unhashed while we waited: drop it and look again. */
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
                return inode;
        }

        /* Not cached: allocate with the hash lock dropped. */
        inode = alloc_inode(sb);
        if (inode) {
                struct inode *old;

                spin_lock(&inode_hash_lock);
                /* We released the lock, so.. */
                old = find_inode_fast(sb, head, ino);
                if (!old) {
                        /* Still absent: hash our new inode with I_NEW set. */
                        inode->i_ino = ino;
                        spin_lock(&inode->i_lock);
                        inode->i_state = I_NEW;
                        hlist_add_head_rcu(&inode->i_hash, head);
                        spin_unlock(&inode->i_lock);
                        inode_sb_list_add(inode);
                        spin_unlock(&inode_hash_lock);

                        /* Return the locked inode with I_NEW set, the
                         * caller is responsible for filling in the contents
                         */
                        return inode;
                }

                /*
                 * Uhhuh, somebody else created the same inode under
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
                spin_unlock(&inode_hash_lock);
                destroy_inode(inode);
                if (IS_ERR(old))
                        return NULL;
                inode = old;
                wait_on_inode(inode);
                /* Same unhashed-while-waiting retry as in the first pass. */
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
        }
        /* NULL here means alloc_inode() failed. */
        return inode;
}
EXPORT_SYMBOL(iget_locked);

/*
 * search the inode cache for a matching inode number.
 * If we find one, then the inode number we are trying to
 * allocate is not unique and so we should not use it.
 *
 * Returns 1 if the inode number is unique, 0 if it is not.
 */
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
        struct inode *cur;

        hlist_for_each_entry_rcu(cur, bucket, i_hash) {
                if (cur->i_ino != ino)
                        continue;
                if (cur->i_sb != sb)
                        continue;
                /* Same number on the same superblock: already in use. */
                return 0;
        }

        return 1;
}

/**
 *        iunique - get a unique inode number
 *        @sb: superblock
 *        @max_reserved: highest reserved inode number
 *
 *        Obtain an inode number that is unique on the system for a given
 *        superblock. This is used by file systems that have no natural
 *        permanent inode numbering system. An inode number is returned that
 *        is higher than the reserved limit but unique.
 *
 *        BUGS:
 *        With a large number of inodes live on the file system this function
 *        currently becomes quite slow.
 */
ino_t iunique(struct super_block *sb, ino_t max_reserved)
{
        /*
         * The counter is deliberately 32 bits wide: on a 32bit, non LFS
         * stat() call, glibc will generate an EOVERFLOW error if st_ino
         * won't fit in the target struct field.
         */
        static DEFINE_SPINLOCK(iunique_lock);
        static unsigned int counter;
        ino_t candidate;

        rcu_read_lock();
        spin_lock(&iunique_lock);
        for (;;) {
                /* Stay above the reserved range, including after a wrap. */
                if (counter <= max_reserved)
                        counter = max_reserved + 1;
                candidate = counter++;
                if (test_inode_iunique(sb, candidate))
                        break;
        }
        spin_unlock(&iunique_lock);
        rcu_read_unlock();

        return candidate;
}
EXPORT_SYMBOL(iunique);

struct inode *igrab(struct inode *inode)
{
        spin_lock(&inode->i_lock);
        if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
                __iget(inode);
                spin_unlock(&inode->i_lock);
        } else {
                spin_unlock(&inode->i_lock);
                /*
                 * Handle the case where s_op->clear_inode is not been
                 * called yet, and somebody is calling igrab
                 * while the inode is getting freed.
                 */
                inode = NULL;
        }
        return inode;
}
EXPORT_SYMBOL(igrab);

/**
 * ilookup5_nowait - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @test:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache.
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Note: I_NEW is not waited upon so you have to be very careful what you do
 * with the returned inode.  You probably should be using ilookup5() instead.
 *
 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, hashval);
        struct inode *found;

        spin_lock(&inode_hash_lock);
        found = find_inode(sb, bucket, test, data);
        spin_unlock(&inode_hash_lock);

        /* Callers only ever see an inode or NULL, never an error pointer. */
        if (IS_ERR(found))
                return NULL;
        return found;
}
EXPORT_SYMBOL(ilookup5_nowait);

/**
 * ilookup5 - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @test:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if the inode is in the cache, return the inode with an incremented
 * reference count.  Waits on I_NEW before returning the inode.
 *
 * This is a generalized version of ilookup() for file systems where the
 * inode number is not sufficient for unique identification of an inode.
 *
 * Note: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct inode *inode;
again:
        inode = ilookup5_nowait(sb, hashval, test, data);
        if (inode) {
                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
        }
        return inode;
}
EXPORT_SYMBOL(ilookup5);

/**
 * ilookup - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @ino:        inode number to search for
 *
 * Search for the inode @ino in the inode cache, and if the inode is in the
 * cache, the inode is returned with an incremented reference count.
 */
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
        struct inode *inode;

        for (;;) {
                spin_lock(&inode_hash_lock);
                inode = find_inode_fast(sb, bucket, ino);
                spin_unlock(&inode_hash_lock);

                /* Nothing cached, or the lookup returned an error pointer. */
                if (!inode)
                        return NULL;
                if (IS_ERR(inode))
                        return NULL;

                wait_on_inode(inode);
                if (likely(!inode_unhashed(inode)))
                        return inode;

                /* Unhashed while we waited: drop it and look again. */
                iput(inode);
        }
}
EXPORT_SYMBOL(ilookup);

/**
 * find_inode_nowait - find an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @match:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @match
 *
 * Search for the inode specified by @hashval and @data in the inode
 * cache, where the helper function @match will return 0 if the inode
 * does not match, 1 if the inode does match, and -1 if the search
 * should be stopped.  The @match function must be responsible for
 * taking the i_lock spin_lock and checking i_state for an inode being
 * freed or being initialized, and incrementing the reference count
 * before returning 1.  It also must not sleep, since it is called with
 * the inode_hash_lock spinlock held.
 *
 * This is an even more generalized version of ilookup5() when the
 * function must never block --- find_inode() can block in
 * __wait_on_freeing_inode() --- or when the caller can not increment
 * the reference count because the resulting iput() might cause an
 * inode eviction.  The tradeoff is that the @match function must be
 * very carefully implemented.
 */
struct inode *find_inode_nowait(struct super_block *sb,
                                unsigned long hashval,
                                int (*match)(struct inode *, unsigned long,
                                             void *),
                                void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, hashval);
        struct inode *cur;
        struct inode *result = NULL;
        int verdict;

        spin_lock(&inode_hash_lock);
        hlist_for_each_entry(cur, bucket, i_hash) {
                if (cur->i_sb != sb)
                        continue;

                verdict = match(cur, hashval, data);
                if (verdict == 0)
                        continue;

                /* 1 is a hit; any other non-zero value just ends the search. */
                if (verdict == 1)
                        result = cur;
                break;
        }
        spin_unlock(&inode_hash_lock);

        return result;
}
EXPORT_SYMBOL(find_inode_nowait);

/**
 * find_inode_rcu - find an inode in the inode cache
 * @sb:                Super block of file system to search
 * @hashval:        Key to hash
 * @test:        Function to test match on an inode
 * @data:        Data for test function
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * where the helper function @test will return 0 if the inode does not match
 * and 1 if it does.  The @test function must be responsible for taking the
 * i_lock spin_lock and checking i_state for an inode being freed or being
 * initialized.
 *
 * If successful, this will return the inode for which the @test function
 * returned 1 and NULL otherwise.
 *
 * The @test function is not permitted to take a ref on any inode presented.
 * It is also not permitted to sleep.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
                             int (*test)(struct inode *, void *), void *data)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, hashval);
        struct inode *cur;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "suspicious find_inode_rcu() usage");

        hlist_for_each_entry_rcu(cur, bucket, i_hash) {
                if (cur->i_sb != sb)
                        continue;
                /* Skip inodes that are on their way out. */
                if (READ_ONCE(cur->i_state) & (I_FREEING | I_WILL_FREE))
                        continue;
                if (test(cur, data))
                        return cur;
        }

        return NULL;
}
EXPORT_SYMBOL(find_inode_rcu);

/**
 * find_inode_by_ino_rcu - Find an inode in the inode cache
 * @sb:                Super block of file system to search
 * @ino:        The inode number to match
 *
 * Search the inode cache for the inode with inode number @ino on superblock
 * @sb.  Inodes whose i_state has I_FREEING or I_WILL_FREE set are skipped.
 *
 * If successful, this will return the matching inode and NULL otherwise.
 *
 * No reference is taken on the returned inode.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_by_ino_rcu(struct super_block *sb,
                                    unsigned long ino)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
        struct inode *cur;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "suspicious find_inode_by_ino_rcu() usage");

        hlist_for_each_entry_rcu(cur, bucket, i_hash) {
                if (cur->i_ino != ino)
                        continue;
                if (cur->i_sb != sb)
                        continue;
                /* Skip inodes that are on their way out. */
                if (READ_ONCE(cur->i_state) & (I_FREEING | I_WILL_FREE))
                        continue;
                return cur;
        }

        return NULL;
}
EXPORT_SYMBOL(find_inode_by_ino_rcu);

/**
 * insert_inode_locked - hash a new inode by its inode number
 * @inode:        inode to insert; i_sb and i_ino are used as the key
 *
 * Returns 0 after hashing @inode with I_NEW and I_CREATING set.  Returns
 * -EBUSY if a live inode with the same number on the same superblock is
 * itself marked I_CREATING, or is still hashed after waiting on it.
 */
int insert_inode_locked(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        ino_t ino = inode->i_ino;
        struct hlist_head *head = inode_hashtable + hash(sb, ino);

        while (1) {
                struct inode *old = NULL;
                spin_lock(&inode_hash_lock);
                hlist_for_each_entry(old, head, i_hash) {
                        if (old->i_ino != ino)
                                continue;
                        if (old->i_sb != sb)
                                continue;
                        spin_lock(&old->i_lock);
                        /* Inodes already being freed do not count as a clash. */
                        if (old->i_state & (I_FREEING|I_WILL_FREE)) {
                                spin_unlock(&old->i_lock);
                                continue;
                        }
                        /* Live match: leave the walk with old->i_lock held. */
                        break;
                }
                /* The code treats a NULL old as "no live match found". */
                if (likely(!old)) {
                        spin_lock(&inode->i_lock);
                        inode->i_state |= I_NEW | I_CREATING;
                        hlist_add_head_rcu(&inode->i_hash, head);
                        spin_unlock(&inode->i_lock);
                        spin_unlock(&inode_hash_lock);
                        return 0;
                }
                /* The existing inode is itself marked I_CREATING: back off. */
                if (unlikely(old->i_state & I_CREATING)) {
                        spin_unlock(&old->i_lock);
                        spin_unlock(&inode_hash_lock);
                        return -EBUSY;
                }
                /* Pin the old inode so it can be waited on without locks. */
                __iget(old);
                spin_unlock(&old->i_lock);
                spin_unlock(&inode_hash_lock);
                wait_on_inode(old);
                /* Still hashed after the wait: the number is genuinely taken. */
                if (unlikely(!inode_unhashed(old))) {
                        iput(old);
                        return -EBUSY;
                }
                /* It was unhashed meanwhile: drop it and rescan the chain. */
                iput(old);
        }
}
EXPORT_SYMBOL(insert_inode_locked);

/*
 * Variant of insert_inode_locked() that matches through a @test callback:
 * marks @inode I_CREATING and inserts it with inode_insert5().  Returns 0
 * if @inode itself went into the cache, -EBUSY otherwise.
 */
int insert_inode_locked4(struct inode *inode, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct inode *winner;

        inode->i_state |= I_CREATING;
        winner = inode_insert5(inode, hashval, test, NULL, data);
        if (winner == inode)
                return 0;

        /* Something other than our inode came back: release it. */
        iput(winner);
        return -EBUSY;
}
EXPORT_SYMBOL(insert_inode_locked4);


/*
 * Unconditionally returns 1; @inode is not examined.
 * NOTE(review): presumably meant as a ->drop_inode() implementation that
 * always asks for eviction - confirm against the filesystems using it.
 */
int generic_delete_inode(struct inode *inode)
{
        return 1;
}
EXPORT_SYMBOL(generic_delete_inode);

/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour.  If it tells
 * us to evict inode, do so.  Otherwise, retain inode
 * in cache if fs is alive, sync and evict if fs is
 * shutting down.
 */
static void iput_final(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        const struct super_operations *op = inode->i_sb->s_op;
        unsigned long state;
        int drop;

        /*
         * Entered with inode->i_lock held: every path below releases it
         * without having taken it first.
         */
        WARN_ON(inode->i_state & I_NEW);

        /* Let the filesystem decide, falling back to the generic policy. */
        if (op->drop_inode)
                drop = op->drop_inode(inode);
        else
                drop = generic_drop_inode(inode);

        /*
         * Keep the inode cached: not to be dropped, not I_DONTCACHE, and
         * the superblock is still active.
         */
        if (!drop &&
            !(inode->i_state & I_DONTCACHE) &&
            (sb->s_flags & SB_ACTIVE)) {
                __inode_add_lru(inode, true);
                spin_unlock(&inode->i_lock);
                return;
        }

        state = inode->i_state;
        if (!drop) {
                /*
                 * Flag I_WILL_FREE while the inode is written out with
                 * i_lock released, then re-read i_state under the lock and
                 * strip the flag from the local copy.
                 */
                WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
                spin_unlock(&inode->i_lock);

                write_inode_now(inode, 1);

                spin_lock(&inode->i_lock);
                state = inode->i_state;
                WARN_ON(state & I_NEW);
                state &= ~I_WILL_FREE;
        }

        /* Publish I_FREEING, take the inode off the LRU if queued, evict. */
        WRITE_ONCE(inode->i_state, state | I_FREEING);
        if (!list_empty(&inode->i_lru))
                inode_lru_list_del(inode);
        spin_unlock(&inode->i_lock);

        evict(inode);
}

/**
 *        iput        - put an inode
 *        @inode: inode to put
 *
 *        Puts an inode, dropping its usage count. If the inode use count hits
 *        zero, the inode is then freed and may also be destroyed.
 *
 *        Consequently, iput() can sleep.
 */
void iput(struct inode *inode)
{
        if (!inode)
                return;
        BUG_ON(inode->i_state & I_CLEAR);
retry:
        if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
                if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
                        atomic_inc(&inode->i_count);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_lazytime_iput(inode);
                        mark_inode_dirty_sync(inode);
                        goto retry;
                }
                iput_final(inode);
        }
}
EXPORT_SYMBOL(iput);

#ifdef CONFIG_BLOCK
/**
 *        bmap        - find a block number in a file
 *        @inode:  inode owning the block number being requested
 *        @block: pointer containing the block to find
 *
 *        Replaces the value in ``*block`` with the block number on the device holding
 *        corresponding to the requested block number in the file.
 *        That is, asked for block 4 of inode 1 the function will replace the
 *        4 in ``*block``, with disk block relative to the disk start that holds that
 *        block of the file.
 *
 *        Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a
 *        hole, returns 0 and ``*block`` is also set to 0.
 */
int bmap(struct inode *inode, sector_t *block)
{
        struct address_space *mapping = inode->i_mapping;

        /* No ->bmap method on this mapping: nothing can be translated. */
        if (!mapping->a_ops->bmap)
                return -EINVAL;

        *block = mapping->a_ops->bmap(mapping, *block);
        return 0;
}
EXPORT_SYMBOL(bmap);
#endif

/*
 * With relative atime, only update atime if the previous atime is
 * earlier than or equal to either the ctime or mtime,
 * or if at least a day has passed since the last atime update.
 */
static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode,
                             struct timespec64 now)
{
        struct timespec64 atime, mtime, ctime;

        /* Without relatime there is nothing to filter here. */
        if (!(mnt->mnt_flags & MNT_RELATIME))
                return true;

        atime = inode_get_atime(inode);

        /* atime is not newer than the last data modification: update. */
        mtime = inode_get_mtime(inode);
        if (timespec64_compare(&mtime, &atime) >= 0)
                return true;

        /* atime is not newer than the last inode change: update. */
        ctime = inode_get_ctime(inode);
        if (timespec64_compare(&ctime, &atime) >= 0)
                return true;

        /* Otherwise update only once the old atime is at least a day old. */
        return (long)(now.tv_sec - atime.tv_sec) >= 24*60*60;
}

/**
 * inode_update_timestamps - update the timestamps on the inode
 * @inode: inode to be updated
 * @flags: S_* flags that needed to be updated
 *
 * The update_time function is called when an inode's timestamps need to be
 * updated for a read or write operation. This function handles updating the
 * actual timestamps. It's up to the caller to ensure that the inode is marked
 * dirty appropriately.
 *
 * In the case where any of S_MTIME, S_CTIME, or S_VERSION need to be updated,
 * attempt to update all three of them. S_ATIME updates can be handled
 * independently of the rest.
 *
 * Returns a set of S_* flags indicating which values changed.
 */
int inode_update_timestamps(struct inode *inode, int flags)
{
        int updated = 0;
        struct timespec64 now;

        if (flags & (S_MTIME|S_CTIME|S_VERSION)) {
                /* Sample the old values before ctime is overwritten below. */
                struct timespec64 ctime = inode_get_ctime(inode);
                struct timespec64 mtime = inode_get_mtime(inode);

                /* Sets ctime; the value it returns is reused for mtime/atime. */
                now = inode_set_ctime_current(inode);
                if (!timespec64_equal(&now, &ctime))
                        updated |= S_CTIME;
                if (!timespec64_equal(&now, &mtime)) {
                        inode_set_mtime_to_ts(inode, now);
                        updated |= S_MTIME;
                }
                /*
                 * NOTE(review): 'updated' is passed as the second argument,
                 * presumably to force the bump when a timestamp changed -
                 * confirm against inode_maybe_inc_iversion().
                 */
                if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated))
                        updated |= S_VERSION;
        } else {
                /* atime-only request: just fetch the current time. */
                now = current_time(inode);
        }

        if (flags & S_ATIME) {
                struct timespec64 atime = inode_get_atime(inode);

                /* Write atime only when it would actually change. */
                if (!timespec64_equal(&now, &atime)) {
                        inode_set_atime_to_ts(inode, now);
                        updated |= S_ATIME;
                }
        }
        return updated;
}
EXPORT_SYMBOL(inode_update_timestamps);

/**
 * generic_update_time - update the timestamps on the inode
 * @inode: inode to be updated
 * @flags: S_* flags that needed to be updated
 *
 * The update_time function is called when an inode's timestamps need to be
 * updated for a read or write operation. In the case where any of S_MTIME, S_CTIME,
 * or S_VERSION need to be updated we attempt to update all three of them. S_ATIME
 * updates can be handled independently of the rest.
 *
 * Returns a S_* mask indicating which fields were updated.
 */
int generic_update_time(struct inode *inode, int flags)
{
        int changed = inode_update_timestamps(inode, flags);
        int dirty_flags = 0;

        /* A timestamp change is lazy-dirtied on SB_LAZYTIME superblocks. */
        if (changed & (S_ATIME|S_MTIME|S_CTIME)) {
                if (inode->i_sb->s_flags & SB_LAZYTIME)
                        dirty_flags = I_DIRTY_TIME;
                else
                        dirty_flags = I_DIRTY_SYNC;
        }

        /* An i_version change always adds I_DIRTY_SYNC. */
        if (changed & S_VERSION)
                dirty_flags |= I_DIRTY_SYNC;

        __mark_inode_dirty(inode, dirty_flags);
        return changed;
}
EXPORT_SYMBOL(generic_update_time);

/*
 * This does the actual work of updating an inode's time or version.  The
 * caller must have called mnt_want_write() before calling this.
 */
int inode_update_time(struct inode *inode, int flags)
{
        if (inode->i_op->update_time)
                return inode->i_op->update_time(inode, flags);
        generic_update_time(inode, flags);
        return 0;
}
EXPORT_SYMBOL(inode_update_time);

/**
 * atime_needs_update - decide whether an inode's access time should be updated
 * @path: the &struct path through which the inode is being accessed
 * @inode: inode whose atime would be updated
 *
 * This function only makes the decision; it does not modify the inode.
 * It answers "no" when the inode carries S_NOATIME, when the inode has an
 * id that is not mapped in the mount's idmap, when IS_NOATIME() holds, for
 * directories when SB_NODIRATIME or MNT_NODIRATIME is set, when the mount has
 * MNT_NOATIME, when relatime_need_update() reports that no update is
 * required, and when the stored atime already equals the current time.
 *
 * Return: true if the access time should be updated, false otherwise.
 */
bool atime_needs_update(const struct path *path, struct inode *inode)
{
        struct vfsmount *mnt = path->mnt;
        struct timespec64 now, atime;

        if (inode->i_flags & S_NOATIME)
                return false;

        /*
         * Atime updates will likely cause i_uid and i_gid to be written
         * back improperly if their true value is unknown to the vfs.
         */
        if (HAS_UNMAPPED_ID(mnt_idmap(mnt), inode))
                return false;

        /* IS_NOATIME() and the superblock's "no directory atime" flag. */
        if (IS_NOATIME(inode) ||
            ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
                return false;

        /* The same two restrictions expressed as per-mount flags. */
        if ((mnt->mnt_flags & MNT_NOATIME) ||
            ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
                return false;

        now = current_time(inode);
        if (!relatime_need_update(mnt, inode, now))
                return false;

        /* Nothing to do if the stored atime is already "now". */
        atime = inode_get_atime(inode);
        return !timespec64_equal(&atime, &now);
}

/*
 * Update the access time of the inode behind @path if
 * atime_needs_update() says so.  The update is best effort: it is
 * silently skipped when sb_start_write_trylock() fails or when write
 * access to the mount cannot be obtained.
 */
void touch_atime(const struct path *path)
{
        struct inode *inode = d_inode(path->dentry);
        struct vfsmount *mnt = path->mnt;

        if (!atime_needs_update(path, inode) ||
            !sb_start_write_trylock(inode->i_sb))
                return;

        if (mnt_get_write_access(mnt) == 0) {
                /*
                 * File systems can error out when updating inodes if they
                 * need to allocate new space to modify an inode (such is
                 * the case for Btrfs), but since we touch atime while
                 * walking down the path we really don't care if we failed
                 * to update the atime of the file, so just ignore the
                 * return value.
                 * We may also fail on filesystems that have the ability to
                 * make parts of the fs read only, e.g. subvolumes in Btrfs.
                 */
                inode_update_time(inode, S_ATIME);
                mnt_put_write_access(mnt);
        }

        sb_end_write(inode->i_sb);
}
EXPORT_SYMBOL(touch_atime);

/*
 * Compute the mask of changes that notify_change() has to apply in
 * response to a write or truncate.
 *
 * Returns 0 if nothing has to be changed (including when the inode is
 * IS_NOSEC()), a positive ATTR_* mask otherwise, or a negative value on
 * error, in which case the change should be denied.
 */
int dentry_needs_remove_privs(struct mnt_idmap *idmap,
                              struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        int kill_mask;
        int need_killpriv;

        if (IS_NOSEC(inode))
                return 0;

        kill_mask = setattr_should_drop_suidgid(idmap, inode);

        need_killpriv = security_inode_need_killpriv(dentry);
        if (need_killpriv < 0)
                return need_killpriv;

        return need_killpriv ? (kill_mask | ATTR_KILL_PRIV) : kill_mask;
}

static int __remove_privs(struct mnt_idmap *idmap,
                          struct dentry *dentry, int kill)
{
        struct iattr newattrs;

        newattrs.ia_valid = ATTR_FORCE | kill;
        /*
         * Note we call this on write, so notify_change will not
         * encounter any conflicting delegations:
         */
        return notify_change(idmap, dentry, &newattrs, NULL);
}

/*
 * Remove special file privileges from @file before it is modified.
 *
 * Nothing is done for inodes that are IS_NOSEC() or not regular files.
 * If privileges have to be removed but @flags contains IOCB_NOWAIT,
 * -EAGAIN is returned without changing anything.  On success
 * inode_has_no_xattr() is called on the inode.
 *
 * Returns 0 on success or a negative errno.
 */
int file_remove_privs_flags(struct file *file, unsigned int flags)
{
        struct dentry *dentry = file_dentry(file);
        struct inode *inode = file_inode(file);
        int kill;

        if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
                return 0;

        kill = dentry_needs_remove_privs(file_mnt_idmap(file), dentry);
        if (kill < 0)
                return kill;

        if (kill != 0) {
                int error;

                if (flags & IOCB_NOWAIT)
                        return -EAGAIN;

                error = __remove_privs(file_mnt_idmap(file), dentry, kill);
                if (error)
                        return error;
        }

        inode_has_no_xattr(inode);
        return 0;
}
EXPORT_SYMBOL_GPL(file_remove_privs_flags);

/**
 * file_remove_privs - remove special file privileges (suid, capabilities)
 * @file: file to remove privileges from
 *
 * When a file is modified by a write or truncation, ensure that special
 * file privileges are removed.  This is file_remove_privs_flags() with no
 * flags set.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_remove_privs(struct file *file)
{
        const unsigned int no_flags = 0;

        return file_remove_privs_flags(file, no_flags);
}
EXPORT_SYMBOL(file_remove_privs);

static int inode_needs_update_time(struct inode *inode)
{
        int sync_it = 0;
        struct timespec64 now = current_time(inode);
        struct timespec64 ts;

        /* First try to exhaust all avenues to not sync */
        if (IS_NOCMTIME(inode))
                return 0;

        ts = inode_get_mtime(inode);
        if (!timespec64_equal(&ts, &now))
                sync_it = S_MTIME;

        ts = inode_get_ctime(inode);
        if (!timespec64_equal(&ts, &now))
                sync_it |= S_CTIME;

        if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
                sync_it |= S_VERSION;

        return sync_it;
}

/*
 * Try to update the time settings selected by @sync_mode on the inode of
 * @file.  If write access to the mount cannot be obtained the update is
 * skipped and 0 is returned; otherwise the result of inode_update_time()
 * is returned.
 */
static int __file_update_time(struct file *file, int sync_mode)
{
        struct inode *inode = file_inode(file);
        int err;

        if (mnt_get_write_access_file(file))
                return 0;

        err = inode_update_time(inode, sync_mode);
        mnt_put_write_access_file(file);

        return err;
}

/**
 * file_update_time - update mtime and ctime
 * @file: file accessed
 *
 * Update the mtime and ctime members of an inode and mark the inode for
 * writeback. Note that this function is meant exclusively for usage in
 * the file write path of filesystems, and filesystems may choose to
 * explicitly ignore updates via this function with the _NOCMTIME inode
 * flag, e.g. for network filesystems where these timestamps are handled
 * by the server. This can return an error for file systems that need to
 * allocate space in order to update an inode.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_update_time(struct file *file)
{
        const int sync_mode = inode_needs_update_time(file_inode(file));

        /* Zero (nothing to do) and negative values are passed straight back. */
        if (sync_mode > 0)
                return __file_update_time(file, sync_mode);

        return sync_mode;
}
EXPORT_SYMBOL(file_update_time);

/**
 * file_modified_flags - handle mandated vfs changes when modifying a file
 * @file: file that was modified
 * @flags: kiocb flags
 *
 * When a file has been modified, ensure that special file privileges are
 * removed and time settings are updated.  Files opened with FMODE_NOCMTIME
 * skip the time update.
 *
 * If IOCB_NOWAIT is set and either privileges would have to be removed or
 * time settings would have to be updated, nothing is changed and -EAGAIN is
 * returned.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int file_modified_flags(struct file *file, int flags)
{
        struct inode *inode = file_inode(file);
        int sync_mode;
        int err;

        /*
         * Clear the security bits if the process is not being run by root.
         * This keeps people from modifying setuid and setgid binaries.
         */
        err = file_remove_privs_flags(file, flags);
        if (err)
                return err;

        if (unlikely(file->f_mode & FMODE_NOCMTIME))
                return 0;

        sync_mode = inode_needs_update_time(inode);
        if (sync_mode <= 0)
                return sync_mode;

        return (flags & IOCB_NOWAIT) ? -EAGAIN
                                     : __file_update_time(file, sync_mode);
}

/**
 * file_modified - handle mandated vfs changes when modifying a file
 * @file: file that was modified
 *
 * When a file has been modified, ensure that special file privileges are
 * removed and time settings are updated.  This is file_modified_flags()
 * with no kiocb flags set.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_modified(struct file *file)
{
        const int no_flags = 0;

        return file_modified_flags(file, no_flags);
}
EXPORT_SYMBOL(file_modified);

/**
 * kiocb_modified - handle mandated vfs changes when modifying a file
 * @iocb: iocb whose file was modified
 *
 * When a file has been modified, ensure that special file privileges are
 * removed and time settings are updated.  The file and the flags are taken
 * from @iocb (ki_filp and ki_flags).
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
int kiocb_modified(struct kiocb *iocb)
{
        struct file *file = iocb->ki_filp;

        return file_modified_flags(file, iocb->ki_flags);
}
EXPORT_SYMBOL_GPL(kiocb_modified);

/*
 * Returns 1 if @inode is IS_SYNC(), or if it is a directory that is
 * IS_DIRSYNC(); returns 0 otherwise.
 */
int inode_needs_sync(struct inode *inode)
{
        return IS_SYNC(inode) ||
               (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode));
}
EXPORT_SYMBOL(inode_needs_sync);

/*
 * If we try to find an inode in the inode hash while it is being
 * deleted, we have to wait until the filesystem completes its
 * deletion before reporting that it isn't found.  This function waits
 * until the deletion _might_ have completed.  Callers are responsible
 * to recheck inode state.
 *
 * It doesn't matter if I_NEW is not set initially, a call to
 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
 * will DTRT.
 *
 * Locking: this function drops inode->i_lock and inode_hash_lock before
 * sleeping, so it is presumably entered with both held (confirm against
 * the callers).  Only inode_hash_lock is re-taken before returning;
 * inode->i_lock stays released.
 */
static void __wait_on_freeing_inode(struct inode *inode)
{
        wait_queue_head_t *wq;
        DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
        wq = bit_waitqueue(&inode->i_state, __I_NEW);
        /* Queue ourselves on the __I_NEW bit waitqueue before dropping the locks. */
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
        /* Sleep once; there is no loop here, the caller rechecks the state. */
        schedule();
        finish_wait(wq, &wait.wq_entry);
        spin_lock(&inode_hash_lock);
}

/* Value of the "ihash_entries=" boot parameter, later passed to the inode hash allocation. */
static __initdata unsigned long ihash_entries;

/*
 * Parse the "ihash_entries=" boot parameter.  Returns 1 when a value was
 * parsed and 0 when no string was supplied.
 */
static int __init set_ihash_entries(char *str)
{
        if (str) {
                ihash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }

        return 0;
}
__setup("ihash_entries=", set_ihash_entries);

/*
 * Early initialization: allocate the inode hash table, unless the
 * allocation has to be deferred (see below).
 */
void __init inode_init_early(void)
{
        /*
         * If hashes are distributed across NUMA nodes, defer hash
         * allocation until vmalloc space is available; inode_init()
         * performs it in that case.
         */
        if (!hashdist)
                inode_hashtable =
                        alloc_large_system_hash("Inode-cache",
                                                sizeof(struct hlist_head),
                                                ihash_entries,
                                                14,
                                                HASH_EARLY | HASH_ZERO,
                                                &i_hash_shift,
                                                &i_hash_mask,
                                                0,
                                                0);
}

/*
 * Create the inode slab cache and, when the hash table allocation was
 * deferred by inode_init_early() (hashdist set), allocate the inode hash
 * table as well.
 */
void __init inode_init(void)
{
        /* inode slab cache */
        inode_cachep = kmem_cache_create("inode_cache",
                                         sizeof(struct inode),
                                         0,
                                         (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
                                         SLAB_ACCOUNT),
                                         init_once);

        /* Hash may have been set up in inode_init_early */
        if (hashdist)
                inode_hashtable =
                        alloc_large_system_hash("Inode-cache",
                                                sizeof(struct hlist_head),
                                                ihash_entries,
                                                14,
                                                HASH_ZERO,
                                                &i_hash_shift,
                                                &i_hash_mask,
                                                0,
                                                0);
}

/*
 * Set up a special (non-regular) inode according to @mode.
 *
 * i_mode is always set to @mode.  Character and block devices additionally
 * get @rdev stored in i_rdev and their default file operations (for block
 * devices only when CONFIG_BLOCK is enabled); FIFOs get pipefifo_fops;
 * sockets keep the file operations they already have.  Any other mode is
 * reported with a debug message.
 */
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
        inode->i_mode = mode;

        if (S_ISCHR(mode)) {
                inode->i_fop = &def_chr_fops;
                inode->i_rdev = rdev;
                return;
        }

        if (S_ISBLK(mode)) {
                if (IS_ENABLED(CONFIG_BLOCK))
                        inode->i_fop = &def_blk_fops;
                inode->i_rdev = rdev;
                return;
        }

        if (S_ISFIFO(mode)) {
                inode->i_fop = &pipefifo_fops;
                return;
        }

        if (S_ISSOCK(mode))
                return;        /* leave it no_open_fops */

        printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
                          " inode %s:%lu\n", mode, inode->i_sb->s_id,
                          inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);

/**
 * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
 * @idmap: idmap of the mount the inode was created from
 * @inode: New inode
 * @dir: Directory inode
 * @mode: mode of the new inode
 *
 * If the inode has been created through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions
 * and initializing i_uid and i_gid. On non-idmapped mounts or if permission
 * checking is to be performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * When @dir is given and has S_ISGID set, the new inode takes its group from
 * @dir, and a new directory additionally gets S_ISGID added to its mode.
 */
void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode,
                      const struct inode *dir, umode_t mode)
{
        inode_fsuid_set(inode, idmap);

        if (!dir || !(dir->i_mode & S_ISGID)) {
                inode_fsgid_set(inode, idmap);
        } else {
                inode->i_gid = dir->i_gid;

                /* Directories are special, and always inherit S_ISGID */
                if (S_ISDIR(mode))
                        mode |= S_ISGID;
        }

        inode->i_mode = mode;
}
EXPORT_SYMBOL(inode_init_owner);

/**
 * inode_owner_or_capable - check current task permissions to inode
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode being checked
 *
 * Return true if current either has CAP_FOWNER in a namespace with the
 * inode owner uid mapped, or owns the file.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
bool inode_owner_or_capable(struct mnt_idmap *idmap,
                            const struct inode *inode)
{
        struct user_namespace *user_ns;
        vfsuid_t owner = i_uid_into_vfsuid(idmap, inode);

        /* The owner of the file always qualifies. */
        if (vfsuid_eq_kuid(owner, current_fsuid()))
                return true;

        /* Otherwise CAP_FOWNER is needed in a namespace that maps the owner. */
        user_ns = current_user_ns();
        return vfsuid_has_mapping(user_ns, owner) &&
               ns_capable(user_ns, CAP_FOWNER);
}
EXPORT_SYMBOL(inode_owner_or_capable);

/*
 * Direct i/o helper functions
 */

/*
 * Sleep, uninterruptibly, on the __I_DIO_WAKEUP bit waitqueue of
 * inode->i_state until inode->i_dio_count reads as zero.
 */
static void __inode_dio_wait(struct inode *inode)
{
        wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
        DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

        do {
                /*
                 * Queue ourselves first and only then re-read the counter,
                 * so schedule() is entered only while the count is nonzero.
                 */
                prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
                if (atomic_read(&inode->i_dio_count))
                        schedule();
        } while (atomic_read(&inode->i_dio_count));
        finish_wait(wq, &q.wq_entry);
}

/**
 * inode_dio_wait - wait for outstanding DIO requests to finish
 * @inode: inode to wait for
 *
 * Waits for all pending direct I/O requests to finish so that we can
 * proceed with a truncate or equivalent operation.  Returns immediately
 * when i_dio_count is already zero.
 *
 * Must be called under a lock that serializes taking new references
 * to i_dio_count, usually by inode->i_mutex.
 */
void inode_dio_wait(struct inode *inode)
{
        if (!atomic_read(&inode->i_dio_count))
                return;

        __inode_dio_wait(inode);
}
EXPORT_SYMBOL(inode_dio_wait);

/*
 * inode_set_flags - atomically set some inode flags
 * @inode: inode whose i_flags are modified
 * @flags: new values for the bits selected by @mask
 * @mask: the i_flags bits that may be changed
 *
 * Warns (once) if @flags contains a bit that is not covered by @mask.
 * The update itself is done by set_mask_bits().
 *
 * Note: the caller should be holding i_mutex, or else be sure that
 * they have exclusive access to the inode structure (i.e., while the
 * inode is being instantiated).  The reason for the cmpxchg() loop
 * --- which wouldn't be necessary if all code paths which modify
 * i_flags actually followed this rule, is that there is at least one
 * code path which doesn't today so we use cmpxchg() out of an abundance
 * of caution.
 * NOTE(review): the cmpxchg() loop is not visible in this function; it is
 * presumably inside set_mask_bits() -- confirm.
 *
 * In the long run, i_mutex is overkill, and we should probably look
 * at using the i_lock spinlock to protect i_flags, and then make sure
 * it is so documented in include/linux/fs.h and that all code follows
 * the locking convention!!
 */
void inode_set_flags(struct inode *inode, unsigned int flags,
                     unsigned int mask)
{
        WARN_ON_ONCE(flags & ~mask);
        set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);

/*
 * Set the gfp mask of @inode's mapping (inode->i_mapping) to GFP_USER.
 * Judging by the name, the intent is to keep this mapping's allocations
 * out of highmem -- confirm against the GFP_USER definition.
 */
void inode_nohighmem(struct inode *inode)
{
        mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
}
EXPORT_SYMBOL(inode_nohighmem);

/**
 * timestamp_truncate - Truncate timespec to a granularity
 * @t: Timespec
 * @inode: inode being updated
 *
 * Truncate a timespec to the granularity supported by the fs
 * containing the inode. Always rounds down. gran must
 * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns);
 * any other value triggers a warning and leaves the nanoseconds untouched.
 *
 * The seconds are also clamped to the superblock's [s_time_min, s_time_max]
 * range, and the nanoseconds are cleared when the result sits on either
 * limit.
 *
 * Return: the adjusted timespec.
 */
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned int gran = sb->s_time_gran;

        t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max);
        if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
                t.tv_nsec = 0;

        /* Avoid division in the common cases 1 ns and 1 s. */
        if (gran == 1)
                return t;

        if (gran == NSEC_PER_SEC) {
                t.tv_nsec = 0;
                return t;
        }

        if (gran > 1 && gran < NSEC_PER_SEC) {
                t.tv_nsec -= t.tv_nsec % gran;
                return t;
        }

        WARN(1, "invalid file time granularity: %u", gran);
        return t;
}
EXPORT_SYMBOL(timestamp_truncate);

/**
 * current_time - Return FS time
 * @inode: inode.
 *
 * Return the current time truncated to the time granularity supported by
 * the fs.
 *
 * Note that inode and inode->sb cannot be NULL.
 * Otherwise, the function warns and returns time without truncation.
 */
struct timespec64 current_time(struct inode *inode)
{
        struct timespec64 now;

        ktime_get_coarse_real_ts64(&now);
        return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);

/**
 * inode_set_ctime_current - set the ctime to current_time
 * @inode: inode
 *
 * Set the inode->i_ctime to the current value for the inode. Returns
 * the current value that was assigned to i_ctime.
 */
struct timespec64 inode_set_ctime_current(struct inode *inode)
{
        struct timespec64 now = current_time(inode);

        inode_set_ctime_to_ts(inode, now);
        return now;
}
EXPORT_SYMBOL(inode_set_ctime_current);

/**
 * in_group_or_capable - check whether caller is CAP_FSETID privileged
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        inode to check
 * @vfsgid:        the new/current vfsgid of @inode
 *
 * Check whether @vfsgid is in the caller's group list or if the caller is
 * privileged with CAP_FSETID over @inode. This can be used to determine
 * whether the setgid bit can be kept or must be dropped.
 *
 * Return: true if the caller is sufficiently privileged, false if not.
 */
bool in_group_or_capable(struct mnt_idmap *idmap,
                         const struct inode *inode, vfsgid_t vfsgid)
{
        /* Group membership is tested first; the capability is the fallback. */
        return vfsgid_in_group_p(vfsgid) ||
               capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID);
}

/**
 * mode_strip_sgid - handle the sgid bit for non-directories
 * @idmap: idmap of the mount the inode was created from
 * @dir: parent directory inode
 * @mode: mode of the file to be created in @dir
 *
 * If the @mode of the new file has both the S_ISGID and S_IXGRP bit
 * raised and @dir has the S_ISGID bit raised ensure that the caller is
 * either in the group of the parent directory or they have CAP_FSETID
 * in their user namespace and are privileged over the parent directory.
 * If the caller is neither, strip the S_ISGID bit from @mode.  In every
 * other case (including a NULL @dir or a directory @mode) @mode is
 * returned unchanged.
 *
 * Return: the new mode to use for the file
 */
umode_t mode_strip_sgid(struct mnt_idmap *idmap,
                        const struct inode *dir, umode_t mode)
{
        const umode_t sgid_exec = S_ISGID | S_IXGRP;

        /* Only modes with both S_ISGID and S_IXGRP set are candidates. */
        if ((mode & sgid_exec) != sgid_exec)
                return mode;

        if (!S_ISDIR(mode) && dir && (dir->i_mode & S_ISGID) &&
            !in_group_or_capable(idmap, dir, i_gid_into_vfsgid(idmap, dir)))
                return mode & ~S_ISGID;

        return mode;
}
EXPORT_SYMBOL(mode_strip_sgid);








































































    1 













    1 

    1 
































































    1 









    1 
    1 

    1 





    1 

    1 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
// SPDX-License-Identifier: GPL-2.0
#include <linux/proc_fs.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include "internal.h"


/*
 * Table of the namespace operations that back the entries of this
 * directory.  Which entries exist depends on the kernel configuration;
 * only the mount namespace entry is unconditional.  The array order is
 * also the order in which proc_ns_dir_readdir() emits the entries (it
 * indexes this table with ctx->pos - 2).
 */
static const struct proc_ns_operations *ns_entries[] = {
#ifdef CONFIG_NET_NS
        &netns_operations,
#endif
#ifdef CONFIG_UTS_NS
        &utsns_operations,
#endif
#ifdef CONFIG_IPC_NS
        &ipcns_operations,
#endif
#ifdef CONFIG_PID_NS
        &pidns_operations,
        &pidns_for_children_operations,
#endif
#ifdef CONFIG_USER_NS
        &userns_operations,
#endif
        &mntns_operations,
#ifdef CONFIG_CGROUPS
        &cgroupns_operations,
#endif
#ifdef CONFIG_TIME_NS
        &timens_operations,
        &timens_for_children_operations,
#endif
};

/*
 * ->get_link() for the namespace symlinks: jump to the path of the
 * namespace that the inode's task is in.
 *
 * Returns ERR_PTR(-ECHILD) when @dentry is NULL and ERR_PTR(-EACCES) when
 * the task is gone or ptrace_may_access() denies read access.  Otherwise
 * the status of ns_get_path() / nd_jump_link() is returned through
 * ERR_PTR(), which is ERR_PTR(0) when the jump succeeded.
 */
static const char *proc_ns_get_link(struct dentry *dentry,
                                    struct inode *inode,
                                    struct delayed_call *done)
{
        const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
        struct task_struct *task;
        struct path ns_path;
        int error = -EACCES;

        if (!dentry)
                return ERR_PTR(-ECHILD);

        task = get_proc_task(inode);
        if (!task)
                return ERR_PTR(-EACCES);

        if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
                error = ns_get_path(&ns_path, task, ns_ops);
                if (!error)
                        error = nd_jump_link(&ns_path);
        }

        put_task_struct(task);
        return ERR_PTR(error);
}

/*
 * ->readlink() for the namespace symlinks: build the name of the task's
 * namespace with ns_get_name() and copy it to the user buffer.
 *
 * Returns -EACCES when the task is gone or ptrace_may_access() denies read
 * access, a negative value from ns_get_name() on failure, and otherwise
 * the result of readlink_copy().
 */
static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        struct inode *inode = d_inode(dentry);
        const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
        struct task_struct *task;
        char name[50];
        int res = -EACCES;

        task = get_proc_task(inode);
        if (!task)
                return res;

        if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
                goto out;

        res = ns_get_name(name, sizeof(name), task, ns_ops);
        if (res < 0)
                goto out;

        res = readlink_copy(buffer, buflen, name);
out:
        put_task_struct(task);
        return res;
}

/*
 * Inode operations installed on every namespace symlink by
 * proc_ns_instantiate().
 */
static const struct inode_operations proc_ns_link_inode_operations = {
        .readlink        = proc_ns_readlink,
        .get_link        = proc_ns_get_link,
        .setattr        = proc_setattr,
};

static struct dentry *proc_ns_instantiate(struct dentry *dentry,
        struct task_struct *task, const void *ptr)
{
        const struct proc_ns_operations *ns_ops = ptr;
        struct inode *inode;
        struct proc_inode *ei;

        inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO);
        if (!inode)
                return ERR_PTR(-ENOENT);

        ei = PROC_I(inode);
        inode->i_op = &proc_ns_link_inode_operations;
        ei->ns_ops = ns_ops;
        pid_update_inode(task, inode);

        d_set_d_op(dentry, &pid_dentry_operations);
        return d_splice_alias(inode, dentry);
}

/*
 * Directory iteration: emit "." and "..", then one entry per element of
 * ns_entries[], resuming from ctx->pos (positions 0 and 1 are the dots, so
 * table index i lives at position i + 2).
 *
 * Returns -ENOENT when the task is gone, 0 otherwise.
 */
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        const struct proc_ns_operations **entry, **end;

        if (!task)
                return -ENOENT;

        if (dir_emit_dots(file, ctx) &&
            ctx->pos < 2 + ARRAY_SIZE(ns_entries)) {
                end = ns_entries + ARRAY_SIZE(ns_entries);

                for (entry = ns_entries + (ctx->pos - 2); entry < end;
                     entry++, ctx->pos++) {
                        const struct proc_ns_operations *ops = *entry;

                        /* Stop without advancing ctx->pos if the entry was not emitted. */
                        if (!proc_fill_cache(file, ctx, ops->name,
                                             strlen(ops->name),
                                             proc_ns_instantiate, task, ops))
                                break;
                }
        }

        put_task_struct(task);
        return 0;
}

/*
 * File operations for the namespace directory; iteration is done by
 * proc_ns_dir_readdir().
 */
const struct file_operations proc_ns_dir_operations = {
        .read                = generic_read_dir,
        .iterate_shared        = proc_ns_dir_readdir,
        .llseek                = generic_file_llseek,
};

static struct dentry *proc_ns_dir_lookup(struct inode *dir,
                                struct dentry *dentry, unsigned int flags)
{
        struct task_struct *task = get_proc_task(dir);
        const struct proc_ns_operations **entry, **last;
        unsigned int len = dentry->d_name.len;
        struct dentry *res = ERR_PTR(-ENOENT);

        if (!task)
                goto out_no_task;

        last = &ns_entries[ARRAY_SIZE(ns_entries)];
        for (entry = ns_entries; entry < last; entry++) {
                if (strlen((*entry)->name) != len)
                        continue;
                if (!memcmp(dentry->d_name.name, (*entry)->name, len))
                        break;
        }
        if (entry == last)
                goto out;

        res = proc_ns_instantiate(dentry, task, *entry);
out:
        put_task_struct(task);
out_no_task:
        return res;
}

/*
 * Inode operations for the namespace directory; name lookup is done by
 * proc_ns_dir_lookup().
 */
const struct inode_operations proc_ns_dir_inode_operations = {
        .lookup                = proc_ns_dir_lookup,
        .getattr        = pid_getattr,
        .setattr        = proc_setattr,
};



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_ACCESSORS_H
#define BTRFS_ACCESSORS_H

#include <asm/unaligned.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/align.h>
#include <linux/build_bug.h>
#include <linux/compiler.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <uapi/linux/btrfs_tree.h>

struct extent_buffer;

/*
 * State handed to the btrfs_get_token_*() / btrfs_set_token_*() helpers
 * declared further down in this header.
 */
struct btrfs_map_token {
        /* Extent buffer the token operates on (read back as token->eb). */
        struct extent_buffer *eb;
        /* NOTE(review): presumably the address of a mapped part of eb - confirm. */
        char *kaddr;
        /* NOTE(review): presumably the buffer offset kaddr corresponds to - confirm. */
        unsigned long offset;
};

/*
 * Only declared in this header.
 * NOTE(review): presumably prepares @token for accesses to @eb - confirm
 * against the definition.
 */
void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb);

/*
 * Some macros to generate set/get functions for the struct fields.  This
 * assumes there is a lefoo_to_cpu for every type, so let's make a simple one
 * for u8:
 */
/* A single byte has no byte order: both conversions return the value as is. */
#define le8_to_cpu(v) (v)
#define cpu_to_le8(v) (v)
/* 8-bit counterpart of the __le16/__le32/__le64 type names. */
#define __le8 u8

/*
 * Read the single byte stored at @p.
 *
 * 8-bit member of the get_unaligned_le##bits() family, so that the accessor
 * macros in this header also resolve for bits == 8.  One byte needs neither
 * alignment nor byte-order handling.
 */
static inline u8 get_unaligned_le8(const void *p)
{
        /* Keep the const qualifier of @p: the pointee is only read. */
        return *(const u8 *)p;
}

/*
 * Store the single byte @val at @p.
 *
 * 8-bit member of the put_unaligned_le##bits() family used by the accessor
 * macros in this header.
 */
static inline void put_unaligned_le8(u8 val, void *p)
{
        u8 *dst = p;

        *dst = val;
}

/*
 * Read one member of a structure out of an extent buffer.
 *
 * @eb:     extent buffer passed through to read_extent_buffer()
 * @ptr:    location of the structure, converted to unsigned long
 * @type:   structure type that contains @member
 * @member: field to transfer
 * @result: buffer passed as the second argument (cast to char *)
 *
 * Expands to a read_extent_buffer() call whose start is @ptr plus the offset
 * of @member inside @type and whose length is sizeof_field(type, member).
 */
#define read_eb_member(eb, ptr, type, member, result) (\
        read_extent_buffer(eb, (char *)(result),                        \
                           ((unsigned long)(ptr)) +                        \
                            offsetof(type, member),                        \
                            sizeof_field(type, member)))

/*
 * Write one member of a structure into an extent buffer.
 *
 * @eb:     extent buffer passed through to write_extent_buffer()
 * @ptr:    location of the structure, converted to unsigned long
 * @type:   structure type that contains @member
 * @member: field to transfer
 * @result: buffer passed as the second argument (cast to char *)
 *
 * Expands to a write_extent_buffer() call whose start is @ptr plus the offset
 * of @member inside @type and whose length is sizeof_field(type, member).
 */
#define write_eb_member(eb, ptr, type, member, result) (\
        write_extent_buffer(eb, (char *)(result),                        \
                           ((unsigned long)(ptr)) +                        \
                            offsetof(type, member),                        \
                            sizeof_field(type, member)))

/*
 * Declare the four primitives for one integer width @bits:
 *   btrfs_get_token_<bits>() / btrfs_set_token_<bits>() - via a map token
 *   btrfs_get_<bits>()       / btrfs_set_<bits>()       - via an extent buffer
 *
 * Only prototypes are emitted here.  Each primitive takes a base @ptr and an
 * additional @off.
 * NOTE(review): presumably the accessed location is ptr + off inside the
 * buffer - confirm against the definitions.
 */
#define DECLARE_BTRFS_SETGET_BITS(bits)                                        \
u##bits btrfs_get_token_##bits(struct btrfs_map_token *token,                \
                               const void *ptr, unsigned long off);        \
void btrfs_set_token_##bits(struct btrfs_map_token *token,                \
                            const void *ptr, unsigned long off,                \
                            u##bits val);                                \
u##bits btrfs_get_##bits(const struct extent_buffer *eb,                \
                         const void *ptr, unsigned long off);                \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr,        \
                      unsigned long off, u##bits val);

/* Instantiate the prototypes for every width used by the accessors below. */
DECLARE_BTRFS_SETGET_BITS(8)
DECLARE_BTRFS_SETGET_BITS(16)
DECLARE_BTRFS_SETGET_BITS(32)
DECLARE_BTRFS_SETGET_BITS(64)

/*
 * Generate four static inline accessors for field @member of structure @type:
 *
 *   btrfs_<name>(eb, s)                  - get via btrfs_get_<bits>()
 *   btrfs_set_<name>(eb, s, val)         - set via btrfs_set_<bits>()
 *   btrfs_token_<name>(token, s)         - get via btrfs_get_token_<bits>()
 *   btrfs_set_token_<name>(token, s, val) - set via btrfs_set_token_<bits>()
 *
 * Each one forwards @s together with offsetof(type, member) to the
 * width-specific primitive, and statically asserts that the field is exactly
 * as wide as u<bits>.
 */
#define BTRFS_SETGET_FUNCS(name, type, member, bits)                        \
static inline u##bits btrfs_##name(const struct extent_buffer *eb,        \
                                   const type *s)                        \
{                                                                        \
        static_assert(sizeof(u##bits) == sizeof_field(type, member));        \
        return btrfs_get_##bits(eb, s, offsetof(type, member));                \
}                                                                        \
static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
                                    u##bits val)                        \
{                                                                        \
        static_assert(sizeof(u##bits) == sizeof_field(type, member));        \
        btrfs_set_##bits(eb, s, offsetof(type, member), val);                \
}                                                                        \
static inline u##bits btrfs_token_##name(struct btrfs_map_token *token,        \
                                         const type *s)                        \
{                                                                        \
        static_assert(sizeof(u##bits) == sizeof_field(type, member));        \
        return btrfs_get_token_##bits(token, s, offsetof(type, member));\
}                                                                        \
static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
                                          type *s, u##bits val)                \
{                                                                        \
        static_assert(sizeof(u##bits) == sizeof_field(type, member));        \
        btrfs_set_token_##bits(token, s, offsetof(type, member), val);        \
}

/*
 * Generate btrfs_<name>(eb) and btrfs_set_<name>(eb, val) for field @member
 * of a @type structure that is addressed directly in the first folio of the
 * extent buffer: folio_address(eb->folios[0]) + offset_in_page(eb->start).
 *
 * The field itself is accessed with get_unaligned_le<bits>() and
 * put_unaligned_le<bits>(), i.e. without going through btrfs_get_<bits>().
 */
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)                \
static inline u##bits btrfs_##name(const struct extent_buffer *eb)        \
{                                                                        \
        const type *p = folio_address(eb->folios[0]) +                        \
                        offset_in_page(eb->start);                        \
        return get_unaligned_le##bits(&p->member);                        \
}                                                                        \
static inline void btrfs_set_##name(const struct extent_buffer *eb,        \
                                    u##bits val)                        \
{                                                                        \
        type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \
        put_unaligned_le##bits(val, &p->member);                        \
}

/*
 * Generate btrfs_<name>(s) and btrfs_set_<name>(s, val) for field @member of
 * a @type structure that the caller can dereference directly (no extent
 * buffer is involved).  The field is read and written with
 * get_unaligned_le<bits>() / put_unaligned_le<bits>().
 */
#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits)                \
static inline u##bits btrfs_##name(const type *s)                        \
{                                                                        \
        return get_unaligned_le##bits(&s->member);                        \
}                                                                        \
static inline void btrfs_set_##name(type *s, u##bits val)                \
{                                                                        \
        put_unaligned_le##bits(val, &s->member);                        \
}

/*
 * Read the total_bytes field of the device item @s through @eb.
 *
 * Hand-written equivalent of the getter BTRFS_SETGET_FUNCS() would generate.
 * @s is const-qualified exactly like in the generated getters: it is only
 * forwarded to btrfs_get_64(), which takes a const pointer.
 *
 * Returns the value produced by btrfs_get_64().
 */
static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
                                           const struct btrfs_dev_item *s)
{
        static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes));
        return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes));
}
/*
 * Store @val into the total_bytes field of the device item @s through @eb.
 *
 * Unlike the macro-generated setters this one warns when @val is not a
 * multiple of eb->fs_info->sectorsize; the value is written regardless.
 */
static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
                                                struct btrfs_dev_item *s,
                                                u64 val)
{
        static_assert(sizeof_field(struct btrfs_dev_item, total_bytes) == sizeof(u64));
        const unsigned long field_off = offsetof(struct btrfs_dev_item, total_bytes);

        WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
        btrfs_set_64(eb, s, field_off, val);
}

/*
 * Extent-buffer accessors for struct btrfs_dev_item, grouped by field width.
 * Every invocation emits btrfs_<name>(), btrfs_set_<name>() and the two
 * map-token variants.
 */
/* 64-bit fields */
BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64);
BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
/* 32-bit fields */
BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
/* 8-bit fields */
BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);

/*
 * Direct-memory ("stack") accessors for struct btrfs_dev_item, grouped by
 * field width.  Every invocation emits btrfs_<name>() and btrfs_set_<name>().
 */
/* 64-bit fields */
BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes,
                         struct btrfs_dev_item, total_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used,
                         struct btrfs_dev_item, bytes_used, 64);
BTRFS_SETGET_STACK_FUNCS(stack_device_generation,
                         struct btrfs_dev_item, generation, 64);
/* 32-bit fields */
BTRFS_SETGET_STACK_FUNCS(stack_device_io_align,
                         struct btrfs_dev_item, io_align, 32);
BTRFS_SETGET_STACK_FUNCS(stack_device_io_width,
                         struct btrfs_dev_item, io_width, 32);
BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size,
                         struct btrfs_dev_item, sector_size, 32);
BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32);
/* 8-bit fields */
BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed,
                         struct btrfs_dev_item, seek_speed, 8);
BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth,
                         struct btrfs_dev_item, bandwidth, 8);

/*
 * Return the location of the uuid member of device item @d, expressed as an
 * unsigned long (the value of @d plus the member's offset).
 */
static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d)
{
        const unsigned long base = (unsigned long)d;

        return base + offsetof(struct btrfs_dev_item, uuid);
}

/*
 * Return the location of the fsid member of device item @d, expressed as an
 * unsigned long (the value of @d plus the member's offset).
 */
static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d)
{
        const unsigned long base = (unsigned long)d;

        return base + offsetof(struct btrfs_dev_item, fsid);
}

/*
 * Extent-buffer accessors for struct btrfs_chunk (grouped by field width)
 * followed by those for struct btrfs_stripe.
 */
/* struct btrfs_chunk, 64-bit fields */
BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
/* struct btrfs_chunk, 32-bit fields */
BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
/* struct btrfs_chunk, 16-bit fields */
BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
/* struct btrfs_stripe */
BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);

/*
 * Return a char pointer to the dev_uuid member of stripe @s, computed as @s
 * plus the member's offset (the stripe itself is not dereferenced).
 */
static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
{
        char *base = (char *)s;

        return base + offsetof(struct btrfs_stripe, dev_uuid);
}

/*
 * Direct-memory ("stack") accessors for struct btrfs_chunk (grouped by field
 * width) followed by those for struct btrfs_stripe.
 */
/* struct btrfs_chunk, 64-bit fields */
BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len,
                         struct btrfs_chunk, stripe_len, 64);
BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
/* struct btrfs_chunk, 32-bit fields */
BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32);
BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, io_width, 32);
BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size,
                         struct btrfs_chunk, sector_size, 32);
/* struct btrfs_chunk, 16-bit fields */
BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes,
                         struct btrfs_chunk, num_stripes, 16);
BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes,
                         struct btrfs_chunk, sub_stripes, 16);
/* struct btrfs_stripe */
BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);

/*
 * Return the location of stripe number @nr inside chunk @c.
 *
 * The result is @c plus the offset of the stripe array plus @nr stripe sizes;
 * only address arithmetic is done, nothing is dereferenced.
 */
static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr)
{
        const unsigned long first = (unsigned long)c +
                                    offsetof(struct btrfs_chunk, stripe);

        return (struct btrfs_stripe *)(first + nr * sizeof(struct btrfs_stripe));
}

/* Return the dev_uuid location of stripe number @nr inside chunk @c. */
static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
{
        struct btrfs_stripe *stripe = btrfs_stripe_nr(c, nr);

        return btrfs_stripe_dev_uuid(stripe);
}

/* Read the offset field of stripe number @nr of chunk @c through @eb. */
static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb,
                                         struct btrfs_chunk *c, int nr)
{
        struct btrfs_stripe *stripe = btrfs_stripe_nr(c, nr);

        return btrfs_stripe_offset(eb, stripe);
}

/* Store @val in the offset field of stripe number @nr of chunk @c through @eb. */
static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
                                              struct btrfs_chunk *c, int nr,
                                              u64 val)
{
        struct btrfs_stripe *stripe = btrfs_stripe_nr(c, nr);

        btrfs_set_stripe_offset(eb, stripe, val);
}

/* Read the devid field of stripe number @nr of chunk @c through @eb. */
static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb,
                                        struct btrfs_chunk *c, int nr)
{
        struct btrfs_stripe *stripe = btrfs_stripe_nr(c, nr);

        return btrfs_stripe_devid(eb, stripe);
}

/* Store @val in the devid field of stripe number @nr of chunk @c through @eb. */
static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
                                             struct btrfs_chunk *c, int nr,
                                             u64 val)
{
        struct btrfs_stripe *stripe = btrfs_stripe_nr(c, nr);

        btrfs_set_stripe_devid(eb, stripe, val);
}

/*
 * struct btrfs_block_group_item: extent-buffer accessors first, then the
 * direct-memory ("stack") variants of the same three fields.
 */
BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64);
BTRFS_SETGET_FUNCS(block_group_chunk_objectid, struct btrfs_block_group_item,
                   chunk_objectid, 64);
BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64);

BTRFS_SETGET_STACK_FUNCS(stack_block_group_used,
                         struct btrfs_block_group_item, used, 64);
BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid,
                         struct btrfs_block_group_item, chunk_objectid, 64);
BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
                         struct btrfs_block_group_item, flags, 64);

/* struct btrfs_free_space_info: extent-buffer accessors for both 32-bit fields */
BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
BTRFS_SETGET_FUNCS(free_space_extent_count,
                   struct btrfs_free_space_info, extent_count, 32);

/*
 * struct btrfs_inode_ref: each field gets its extent-buffer accessors
 * followed by the direct-memory ("stack") pair.
 */
BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_index, struct btrfs_inode_ref, index, 64);
BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_name_len,
                         struct btrfs_inode_ref, name_len, 16);

/* struct btrfs_inode_extref: extent-buffer accessors */
BTRFS_SETGET_FUNCS(inode_extref_parent,
                   struct btrfs_inode_extref, parent_objectid, 64);
BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
BTRFS_SETGET_FUNCS(inode_extref_name_len,
                   struct btrfs_inode_extref, name_len, 16);

/*
 * struct btrfs_inode_item: extent-buffer accessors, 64-bit fields first and
 * 32-bit fields second.
 */
BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);

/* struct btrfs_inode_item: direct-memory ("stack") accessors, same order */
BTRFS_SETGET_STACK_FUNCS(stack_inode_generation,
                         struct btrfs_inode_item, generation, 64);
BTRFS_SETGET_STACK_FUNCS(stack_inode_transid,
                         struct btrfs_inode_item, transid, 64);
BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence,
                         struct btrfs_inode_item, sequence, 64);
BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group,
                         struct btrfs_inode_item, block_group, 64);
BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
/*
 * struct btrfs_timespec: each field gets its extent-buffer accessors followed
 * by the direct-memory ("stack") pair.
 */
BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);

/* struct btrfs_stripe_extent: extent-buffer and direct-memory accessors */
BTRFS_SETGET_FUNCS(stripe_extent_encoding,
                   struct btrfs_stripe_extent, encoding, 8);
BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding,
                         struct btrfs_stripe_extent, encoding, 8);

/* struct btrfs_raid_stride: extent-buffer and direct-memory accessors */
BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64);
BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid,
                         struct btrfs_raid_stride, devid, 64);
BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64);
BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical,
                         struct btrfs_raid_stride, physical, 64);

/*
 * struct btrfs_dev_extent: every 64-bit field gets its extent-buffer
 * accessors followed by the direct-memory ("stack") pair.
 */
BTRFS_SETGET_FUNCS(dev_extent_chunk_tree,
                   struct btrfs_dev_extent, chunk_tree, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_tree,
                         struct btrfs_dev_extent, chunk_tree, 64);

BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid,
                   struct btrfs_dev_extent, chunk_objectid, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_objectid,
                         struct btrfs_dev_extent, chunk_objectid, 64);

BTRFS_SETGET_FUNCS(dev_extent_chunk_offset,
                   struct btrfs_dev_extent, chunk_offset, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_offset,
                         struct btrfs_dev_extent, chunk_offset, 64);

BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_length,
                         struct btrfs_dev_extent, length, 64);

/* struct btrfs_extent_item: extent-buffer accessors */
BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64);
BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);

/* struct btrfs_tree_block_info: extent-buffer accessors for the level byte */
BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);

/*
 * Read the key member of the tree block info @item through @eb into @key,
 * using read_eb_member() (the whole member is transferred in one call).
 */
static inline void btrfs_tree_block_key(const struct extent_buffer *eb,
                                        struct btrfs_tree_block_info *item,
                                        struct btrfs_disk_key *key)
{
        read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
}

/*
 * Write @key into the key member of the tree block info @item through @eb,
 * using write_eb_member() (the whole member is transferred in one call).
 */
static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb,
                                            struct btrfs_tree_block_info *item,
                                            struct btrfs_disk_key *key)
{
        write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
}

/* struct btrfs_extent_data_ref: extent-buffer accessors */
BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64);
BTRFS_SETGET_FUNCS(extent_data_ref_objectid,
                   struct btrfs_extent_data_ref, objectid, 64);
BTRFS_SETGET_FUNCS(extent_data_ref_offset,
                   struct btrfs_extent_data_ref, offset, 64);
BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32);

/* struct btrfs_shared_data_ref */
BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32);

/* struct btrfs_extent_owner_ref */
BTRFS_SETGET_FUNCS(extent_owner_ref_root_id,
                   struct btrfs_extent_owner_ref, root_id, 64);

/* struct btrfs_extent_inline_ref */
BTRFS_SETGET_FUNCS(extent_inline_ref_offset,
                   struct btrfs_extent_inline_ref, offset, 64);
BTRFS_SETGET_FUNCS(extent_inline_ref_type,
                   struct btrfs_extent_inline_ref, type, 8);

/*
 * Return the number of bytes an inline extent reference of key type @type
 * occupies, or 0 for a type this helper does not know.
 *
 *  - tree block, shared block and owner refs: the inline ref structure itself
 *  - shared data refs: the inline ref structure plus a shared data ref
 *  - extent data refs: the inline ref up to (not including) its offset field
 *    plus an extent data ref
 */
static inline u32 btrfs_extent_inline_ref_size(int type)
{
        switch (type) {
        case BTRFS_TREE_BLOCK_REF_KEY:
        case BTRFS_SHARED_BLOCK_REF_KEY:
        case BTRFS_EXTENT_OWNER_REF_KEY:
                return sizeof(struct btrfs_extent_inline_ref);
        case BTRFS_SHARED_DATA_REF_KEY:
                return sizeof(struct btrfs_shared_data_ref) +
                       sizeof(struct btrfs_extent_inline_ref);
        case BTRFS_EXTENT_DATA_REF_KEY:
                return sizeof(struct btrfs_extent_data_ref) +
                       offsetof(struct btrfs_extent_inline_ref, offset);
        default:
                return 0;
        }
}

/*
 * struct btrfs_key_ptr (the element type of btrfs_node's ptrs array): each
 * field gets its extent-buffer accessors followed by the direct-memory pair.
 */
BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64);
BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
BTRFS_SETGET_STACK_FUNCS(stack_key_generation,
                         struct btrfs_key_ptr, generation, 64);

/*
 * Read the blockptr field of key pointer number @nr of the node in @eb.
 *
 * The key pointer is addressed by its offset from the start of struct
 * btrfs_node, carried in a pointer-typed value.
 */
static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr)
{
        const unsigned long slot_off = sizeof(struct btrfs_key_ptr) * nr +
                                       offsetof(struct btrfs_node, ptrs);

        return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)slot_off);
}

/*
 * Store @val in the blockptr field of key pointer number @nr of the node in
 * @eb.  The key pointer is addressed by its offset from the start of struct
 * btrfs_node, carried in a pointer-typed value.
 */
static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb,
                                           int nr, u64 val)
{
        const unsigned long slot_off = sizeof(struct btrfs_key_ptr) * nr +
                                       offsetof(struct btrfs_node, ptrs);

        btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)slot_off, val);
}

/*
 * Read the generation field of key pointer number @nr of the node in @eb.
 * The key pointer is addressed by its offset from the start of struct
 * btrfs_node, carried in a pointer-typed value.
 */
static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr)
{
        const unsigned long slot_off = sizeof(struct btrfs_key_ptr) * nr +
                                       offsetof(struct btrfs_node, ptrs);

        return btrfs_key_generation(eb, (struct btrfs_key_ptr *)slot_off);
}

/*
 * Store @val in the generation field of key pointer number @nr of the node in
 * @eb.  The key pointer is addressed by its offset from the start of struct
 * btrfs_node, carried in a pointer-typed value.
 */
static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb,
                                                 int nr, u64 val)
{
        const unsigned long slot_off = sizeof(struct btrfs_key_ptr) * nr +
                                       offsetof(struct btrfs_node, ptrs);

        btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)slot_off, val);
}

/*
 * Return the offset of key pointer number @nr from the start of struct
 * btrfs_node.  @eb does not take part in the computation.
 */
static inline unsigned long btrfs_node_key_ptr_offset(const struct extent_buffer *eb, int nr)
{
        return sizeof(struct btrfs_key_ptr) * nr +
               offsetof(struct btrfs_node, ptrs);
}

/*
 * Only declared in this header.
 * NOTE(review): presumably copies the key of node slot @nr of @eb into
 * @disk_key (the read counterpart of btrfs_set_node_key()) - confirm against
 * the definition.
 */
void btrfs_node_key(const struct extent_buffer *eb,
                    struct btrfs_disk_key *disk_key, int nr);

/*
 * Write @disk_key into the key member of key pointer number @nr of the node
 * in @eb, using write_eb_member() on the slot's offset.
 */
static inline void btrfs_set_node_key(const struct extent_buffer *eb,
                                      struct btrfs_disk_key *disk_key, int nr)
{
        write_eb_member(eb,
                        (struct btrfs_key_ptr *)btrfs_node_key_ptr_offset(eb, nr),
                        struct btrfs_key_ptr, key, disk_key);
}

/*
 * struct btrfs_item: the "raw" extent-buffer accessors take an item pointer;
 * each is followed by its direct-memory ("stack") counterpart.
 */
BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32);
BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32);
BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);

/*
 * Return the offset of item number @nr from the start of struct btrfs_leaf.
 * @eb does not take part in the computation.
 */
static inline unsigned long btrfs_item_nr_offset(const struct extent_buffer *eb, int nr)
{
        return sizeof(struct btrfs_item) * nr +
               offsetof(struct btrfs_leaf, items);
}

/*
 * Return the offset of item number @nr (see btrfs_item_nr_offset()) carried
 * in a struct btrfs_item pointer, the form the item accessors expect.
 */
static inline struct btrfs_item *btrfs_item_nr(const struct extent_buffer *eb, int nr)
{
        const unsigned long slot_off = btrfs_item_nr_offset(eb, nr);

        return (struct btrfs_item *)slot_off;
}

/*
 * Generate slot-indexed accessors for the u32 field @member of struct
 * btrfs_item:
 *
 *   btrfs_item_<member>(eb, slot)
 *   btrfs_set_item_<member>(eb, slot, val)
 *   btrfs_token_item_<member>(token, slot)
 *   btrfs_set_token_item_<member>(token, slot, val)
 *
 * Each one resolves @slot with btrfs_item_nr() (the token variants use
 * token->eb) and forwards to the matching raw_item accessor.
 */
#define BTRFS_ITEM_SETGET_FUNCS(member)                                                \
static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot)        \
{                                                                                \
        return btrfs_raw_item_##member(eb, btrfs_item_nr(eb, slot));                \
}                                                                                \
static inline void btrfs_set_item_##member(const struct extent_buffer *eb,        \
                                           int slot, u32 val)                        \
{                                                                                \
        btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val);                \
}                                                                                \
static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token,        \
                                            int slot)                                \
{                                                                                \
        struct btrfs_item *item = btrfs_item_nr(token->eb, slot);                \
        return btrfs_token_raw_item_##member(token, item);                        \
}                                                                                \
static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token,        \
                                                 int slot, u32 val)                \
{                                                                                \
        struct btrfs_item *item = btrfs_item_nr(token->eb, slot);                \
        btrfs_set_token_raw_item_##member(token, item, val);                        \
}

/* Instantiate the slot-indexed accessors for the offset and size fields. */
BTRFS_ITEM_SETGET_FUNCS(offset)
BTRFS_ITEM_SETGET_FUNCS(size);

/*
 * Return the sum of the offset and size fields of item number @nr in @eb,
 * i.e. the position just past that item's data.
 */
static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr)
{
        const u32 start = btrfs_item_offset(eb, nr);
        const u32 len = btrfs_item_size(eb, nr);

        return start + len;
}

/*
 * Read the key member of item number @nr of @eb into @disk_key, using
 * read_eb_member() on the slot resolved by btrfs_item_nr().
 */
static inline void btrfs_item_key(const struct extent_buffer *eb,
                                  struct btrfs_disk_key *disk_key, int nr)
{
        read_eb_member(eb, btrfs_item_nr(eb, nr), struct btrfs_item, key,
                       disk_key);
}

/*
 * Write @disk_key into the key member of item number @nr of @eb, using
 * write_eb_member() on the slot resolved by btrfs_item_nr().
 */
static inline void btrfs_set_item_key(struct extent_buffer *eb,
                                      struct btrfs_disk_key *disk_key, int nr)
{
        write_eb_member(eb, btrfs_item_nr(eb, nr), struct btrfs_item, key,
                        disk_key);
}

/* struct btrfs_dir_log_item: extent-buffer accessors for the 64-bit end field */
BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);

/*
 * struct btrfs_root_ref: each field gets its extent-buffer accessors followed
 * by the direct-memory ("stack") pair.
 */
BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
BTRFS_SETGET_STACK_FUNCS(stack_root_ref_dirid, struct btrfs_root_ref, dirid, 64);
BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
BTRFS_SETGET_STACK_FUNCS(stack_root_ref_sequence,
                         struct btrfs_root_ref, sequence, 64);
BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
BTRFS_SETGET_STACK_FUNCS(stack_root_ref_name_len,
                         struct btrfs_root_ref, name_len, 16);

/*
 * struct btrfs_dir_item: each field gets its extent-buffer accessors followed
 * by the direct-memory ("stack") pair.  The dir_flags accessors operate on
 * the structure's type member.
 */
BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64);
BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16);
BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16);
BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, type, 8);
BTRFS_SETGET_STACK_FUNCS(stack_dir_flags, struct btrfs_dir_item, type, 8);

/*
 * Read the flags byte of dir item @item through @eb and convert it with
 * btrfs_dir_flags_to_ftype().
 */
static inline u8 btrfs_dir_ftype(const struct extent_buffer *eb,
                                 const struct btrfs_dir_item *item)
{
        const u8 flags = btrfs_dir_flags(eb, item);

        return btrfs_dir_flags_to_ftype(flags);
}

/*
 * Read the flags byte of the directly addressable dir item @item and convert
 * it with btrfs_dir_flags_to_ftype().
 */
static inline u8 btrfs_stack_dir_ftype(const struct btrfs_dir_item *item)
{
        const u8 flags = btrfs_stack_dir_flags(item);

        return btrfs_dir_flags_to_ftype(flags);
}

/*
 * Read the location member of dir item @item through @eb into @key, using
 * read_eb_member().
 */
static inline void btrfs_dir_item_key(const struct extent_buffer *eb,
                                      const struct btrfs_dir_item *item,
                                      struct btrfs_disk_key *key)
{
        read_eb_member(eb, item, struct btrfs_dir_item, location, key);
}

/*
 * Write @key into the location member of dir item @item through @eb, using
 * write_eb_member().
 */
static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
                                          struct btrfs_dir_item *item,
                                          const struct btrfs_disk_key *key)
{
        write_eb_member(eb, item, struct btrfs_dir_item, location, key);
}

/* struct btrfs_free_space_header: extent-buffer accessors for the 64-bit fields */
BTRFS_SETGET_FUNCS(free_space_generation,
                   struct btrfs_free_space_header, generation, 64);
BTRFS_SETGET_FUNCS(free_space_entries,
                   struct btrfs_free_space_header, num_entries, 64);
BTRFS_SETGET_FUNCS(free_space_bitmaps,
                   struct btrfs_free_space_header, num_bitmaps, 64);

/*
 * Read the location member of free space header @h through @eb into @key,
 * using read_eb_member().
 */
static inline void btrfs_free_space_key(const struct extent_buffer *eb,
                                        const struct btrfs_free_space_header *h,
                                        struct btrfs_disk_key *key)
{
        read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
}

/*
 * Write @key into the location member of free space header @h through @eb,
 * using write_eb_member().
 */
static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
                                            struct btrfs_free_space_header *h,
                                            const struct btrfs_disk_key *key)
{
        write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
}

/*
 * struct btrfs_disk_key: direct-memory accessors only (generated with the
 * STACK variant, so none of them takes an extent buffer).
 */
BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64);
BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);

#ifdef __LITTLE_ENDIAN

/*
 * Optimized helpers for little-endian architectures where CPU and on-disk
 * structures have the same endianness and we can skip conversions.
 */

/*
 * Little-endian build: convert @disk_key to @cpu_key with a plain byte copy
 * of sizeof(struct btrfs_key) bytes, no per-field conversion.
 */
static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key,
                                         const struct btrfs_disk_key *disk_key)
{
        memcpy(cpu_key, disk_key, sizeof(*cpu_key));
}

static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key,
                                         const struct btrfs_key *cpu_key)
{
        memcpy(disk_key, cpu_key, sizeof(struct btrfs_key));
}

/*
 * Little-endian build: fetch the key of node slot @nr of @eb straight into
 * @cpu_key by handing it to btrfs_node_key() as a disk key (no temporary, no
 * conversion).
 */
static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
                                         struct btrfs_key *cpu_key, int nr)
{
        btrfs_node_key(eb, (struct btrfs_disk_key *)cpu_key, nr);
}

/*
 * Little-endian build: fetch the key of item slot @nr of @eb straight into
 * @cpu_key by handing it to btrfs_item_key() as a disk key (no temporary, no
 * conversion).
 */
static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
                                         struct btrfs_key *cpu_key, int nr)
{
        btrfs_item_key(eb, (struct btrfs_disk_key *)cpu_key, nr);
}

/*
 * Little-endian build: fetch the location key of dir item @item straight into
 * @cpu_key by handing it to btrfs_dir_item_key() as a disk key (no temporary,
 * no conversion).
 */
static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
                                             const struct btrfs_dir_item *item,
                                             struct btrfs_key *cpu_key)
{
        btrfs_dir_item_key(eb, item, (struct btrfs_disk_key *)cpu_key);
}

#else

/*
 * Generic build: convert @disk to @cpu field by field, passing the two
 * 64-bit members through le64_to_cpu() and copying the type byte unchanged.
 */
static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
                                         const struct btrfs_disk_key *disk)
{
        cpu->objectid = le64_to_cpu(disk->objectid);
        cpu->type = disk->type;
        cpu->offset = le64_to_cpu(disk->offset);
}

/*
 * Generic build: convert @cpu to @disk field by field, passing the two
 * 64-bit members through cpu_to_le64() and copying the type byte unchanged.
 */
static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
                                         const struct btrfs_key *cpu)
{
        disk->objectid = cpu_to_le64(cpu->objectid);
        disk->type = cpu->type;
        disk->offset = cpu_to_le64(cpu->offset);
}

static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb,
                                         struct btrfs_key *key, int nr)
{
        struct btrfs_disk_key disk_key;

        btrfs_node_key(eb, &disk_key, nr);
        btrfs_disk_key_to_cpu(key, &disk_key);
}

static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb,
                                         struct btrfs_key *key, int nr)
{
        struct btrfs_disk_key disk_key;

        btrfs_item_key(eb, &disk_key, nr);
        btrfs_disk_key_to_cpu(key, &disk_key);
}

static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb,
                                             const struct btrfs_dir_item *item,
                                             struct btrfs_key *key)
{
        struct btrfs_disk_key disk_key;

        btrfs_dir_item_key(eb, item, &disk_key);
        btrfs_disk_key_to_cpu(key, &disk_key);
}

#endif

/*
 * struct btrfs_header
 *
 * Accessor declarations for the header fields.  The helpers further down call
 * btrfs_header_flags(eb), btrfs_set_header_flags(eb, val) and
 * btrfs_header_level(eb), which match the names passed to the HEADER variant
 * here.
 * NOTE(review): the macro definitions are outside this chunk; presumably the
 * last argument is the field width in bits and the STACK variant works on an
 * in-memory copy of the structure - confirm.
 */
BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64);
BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header,
                         generation, 64);
BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64);
BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32);
BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);

/*
 * Test header flag bits.
 *
 * Returns 1 when every bit set in @flag is also set in the value read by
 * btrfs_header_flags(@eb), and 0 otherwise.
 */
static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag)
{
        /* Bits requested by the caller that the header does not have. */
        const u64 missing = flag & ~btrfs_header_flags(eb);

        return missing == 0;
}

/*
 * Set the bits of @flag in the header flags of @eb.
 *
 * Plain read-modify-write: the current flags are read, OR-ed with @flag and
 * written back.
 */
static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
{
        const u64 updated = btrfs_header_flags(eb) | flag;

        btrfs_set_header_flags(eb, updated);
}

/*
 * Clear the bits of @flag in the header flags of @eb.
 *
 * Plain read-modify-write: the current flags are read, masked with ~@flag and
 * written back.
 */
static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
{
        const u64 updated = btrfs_header_flags(eb) & ~flag;

        btrfs_set_header_flags(eb, updated);
}

/*
 * Return the backref revision kept in the header flags of @eb, i.e. the flags
 * value shifted right by BTRFS_BACKREF_REV_SHIFT.
 */
static inline int btrfs_header_backref_rev(const struct extent_buffer *eb)
{
        const u64 hdr_flags = btrfs_header_flags(eb);
        const u64 rev = hdr_flags >> BTRFS_BACKREF_REV_SHIFT;

        return rev;
}

static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev)
{
        u64 flags = btrfs_header_flags(eb);

        flags &= ~BTRFS_BACKREF_REV_MASK;
        flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
        btrfs_set_header_flags(eb, flags);
}

/* Return 1 if the header level of @eb is 0, and 0 for any other level. */
static inline int btrfs_is_leaf(const struct extent_buffer *eb)
{
        return !btrfs_header_level(eb);
}

/*
 * struct btrfs_root_item
 *
 * One accessor-generating macro invocation per field.  generation, refs,
 * bytenr and level are exposed twice: under a "disk_root_*" name through
 * BTRFS_SETGET_FUNCS and under a "root_*" name through
 * BTRFS_SETGET_STACK_FUNCS.
 * NOTE(review): the macro definitions are outside this chunk; presumably the
 * last argument is the field width in bits - confirm.
 */
BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64);
BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);

BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64);
BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8);
BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
                         last_snapshot, 64);
BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
                         generation_v2, 64);
BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64);
BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64);
BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64);
BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64);

/*
 * struct btrfs_root_backup
 *
 * Accessors, all through BTRFS_SETGET_STACK_FUNCS, for the <name>_root,
 * <name>_root_gen and <name>_root_level fields of the tree, chunk, extent,
 * fs, dev and csum roots, followed by total_bytes, bytes_used and
 * num_devices.
 * NOTE(review): the macro definition is outside this chunk; presumably the
 * last argument is the field width in bits - confirm.
 */
BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
                   tree_root, 64);
BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
                   tree_root_gen, 64);
BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
                   tree_root_level, 8);

BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
                   chunk_root, 64);
BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
                   chunk_root_gen, 64);
BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
                   chunk_root_level, 8);

BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
                   extent_root, 64);
BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
                   extent_root_gen, 64);
BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
                   extent_root_level, 8);

BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
                   fs_root, 64);
BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
                   fs_root_gen, 64);
BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
                   fs_root_level, 8);

BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
                   dev_root, 64);
BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
                   dev_root_gen, 64);
BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
                   dev_root_level, 8);

BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
                   csum_root, 64);
BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
                   csum_root_gen, 64);
BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
                   csum_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
                   total_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
                   bytes_used, 64);
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
                   num_devices, 64);

/*
 * struct btrfs_balance_item
 *
 * Only the flags field gets macro-generated accessors; the data, meta and sys
 * members are handled by the explicit helpers that follow.
 */
BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);

/*
 * Read the "data" member of the balance item @bi in @eb into @ba, using
 * read_eb_member().
 */
static inline void btrfs_balance_data(const struct extent_buffer *eb,
                                      const struct btrfs_balance_item *bi,
                                      struct btrfs_disk_balance_args *ba)
{
        read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
}

/*
 * Write @ba to the "data" member of the balance item @bi in @eb, using
 * write_eb_member().
 */
static inline void btrfs_set_balance_data(struct extent_buffer *eb,
                                          struct btrfs_balance_item *bi,
                                          const struct btrfs_disk_balance_args *ba)
{
        write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
}

/*
 * Read the "meta" member of the balance item @bi in @eb into @ba, using
 * read_eb_member().
 */
static inline void btrfs_balance_meta(const struct extent_buffer *eb,
                                      const struct btrfs_balance_item *bi,
                                      struct btrfs_disk_balance_args *ba)
{
        read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
}

/*
 * Write @ba to the "meta" member of the balance item @bi in @eb, using
 * write_eb_member().
 */
static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
                                          struct btrfs_balance_item *bi,
                                          const struct btrfs_disk_balance_args *ba)
{
        write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
}

/*
 * Read the "sys" member of the balance item @bi in @eb into @ba, using
 * read_eb_member().
 */
static inline void btrfs_balance_sys(const struct extent_buffer *eb,
                                     const struct btrfs_balance_item *bi,
                                     struct btrfs_disk_balance_args *ba)
{
        read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
}

/*
 * Write @ba to the "sys" member of the balance item @bi in @eb, using
 * write_eb_member().
 */
static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
                                         struct btrfs_balance_item *bi,
                                         const struct btrfs_disk_balance_args *ba)
{
        write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
}

/*
 * struct btrfs_super_block
 *
 * Every super block field in this section is exposed through
 * BTRFS_SETGET_STACK_FUNCS only.  Note that the accessor name does not always
 * match the field name (super_sys_array_size -> sys_chunk_array_size,
 * super_root_dir -> root_dir_objectid).
 * NOTE(review): the macro definition is outside this chunk; presumably the
 * last argument is the field width in bits - confirm.
 */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
                         generation, 64);
BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
                         struct btrfs_super_block, sys_chunk_array_size, 32);
BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
                         struct btrfs_super_block, chunk_root_generation, 64);
BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
                         root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
                         chunk_root, 64);
BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
                         chunk_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64);
BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
                         log_root_level, 8);
BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
                         total_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
                         bytes_used, 64);
BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
                         sectorsize, 32);
BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
                         nodesize, 32);
BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
                         stripesize, 32);
BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
                         root_dir_objectid, 64);
BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
                         num_devices, 64);
BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
                         compat_flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
                         compat_ro_flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
                         incompat_flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
                         csum_type, 16);
BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
                         cache_generation, 64);
BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
                         uuid_tree_generation, 64);
BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block,
                         nr_global_roots, 64);

/*
 * struct btrfs_file_extent_item
 *
 * The "stack_file_extent_*" group comes first, then the "file_extent_*"
 * group.  encryption and other_encoding are only declared in the second
 * group.
 * NOTE(review): the macro definitions are outside this chunk; presumably the
 * last argument is the field width in bits - confirm.
 */
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
                         type, 8);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
                         struct btrfs_file_extent_item, disk_bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
                         struct btrfs_file_extent_item, offset, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
                         struct btrfs_file_extent_item, generation, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
                         struct btrfs_file_extent_item, num_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes,
                         struct btrfs_file_extent_item, ram_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
                         struct btrfs_file_extent_item, disk_num_bytes, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
                         struct btrfs_file_extent_item, compression, 8);


BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
                   disk_bytenr, 64);
BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
                   generation, 64);
BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
                   disk_num_bytes, 64);
BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
                  offset, 64);
BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
                   num_bytes, 64);
BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
                   ram_bytes, 64);
BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
                   compression, 8);
BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
                   encryption, 8);
BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
                   other_encoding, 16);

/*
 * struct btrfs_qgroup_status_item
 *
 * Accessors for generation, version, flags, rescan and enable_gen, declared
 * through BTRFS_SETGET_FUNCS only (no stack_* counterparts in this section).
 */
BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
                   generation, 64);
BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
                   version, 64);
BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
                   flags, 64);
BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
                   rescan, 64);
BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item,
                   enable_gen, 64);

/*
 * struct btrfs_qgroup_info_item
 *
 * generation, rfer, rfer_cmpr, excl and excl_cmpr each get two accessor sets:
 * "qgroup_info_*" via BTRFS_SETGET_FUNCS and "stack_qgroup_info_*" via
 * BTRFS_SETGET_STACK_FUNCS.
 */
BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
                   generation, 64);
BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
                   rfer_cmpr, 64);
BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
                   excl_cmpr, 64);

BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
                         struct btrfs_qgroup_info_item, generation, 64);
BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
                         rfer, 64);
BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
                         struct btrfs_qgroup_info_item, rfer_cmpr, 64);
BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
                         excl, 64);
BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
                         struct btrfs_qgroup_info_item, excl_cmpr, 64);

/*
 * struct btrfs_qgroup_limit_item
 *
 * flags, max_rfer, max_excl, rsv_rfer and rsv_excl each get two accessor
 * sets: "qgroup_limit_*" via BTRFS_SETGET_FUNCS and "stack_qgroup_limit_*"
 * via BTRFS_SETGET_STACK_FUNCS.
 */
BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64);
BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
                   max_rfer, 64);
BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
                   max_excl, 64);
BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
                   rsv_rfer, 64);
BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
                   rsv_excl, 64);
BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_flags,
                         struct btrfs_qgroup_limit_item, flags, 64);
BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_rfer,
                         struct btrfs_qgroup_limit_item, max_rfer, 64);
BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_excl,
                         struct btrfs_qgroup_limit_item, max_excl, 64);
BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_rfer,
                         struct btrfs_qgroup_limit_item, rsv_rfer, 64);
BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_excl,
                         struct btrfs_qgroup_limit_item, rsv_excl, 64);

/*
 * struct btrfs_dev_replace_item
 *
 * All nine fields get two accessor sets: "dev_replace_*" via
 * BTRFS_SETGET_FUNCS and "stack_dev_replace_*" via BTRFS_SETGET_STACK_FUNCS.
 */
BTRFS_SETGET_FUNCS(dev_replace_src_devid,
                   struct btrfs_dev_replace_item, src_devid, 64);
BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
                   struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
                   64);
BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
                   replace_state, 64);
BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
                   time_started, 64);
BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
                   time_stopped, 64);
BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
                   num_write_errors, 64);
BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
                   struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
                   64);
BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
                   cursor_left, 64);
BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
                   cursor_right, 64);

BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
                         struct btrfs_dev_replace_item, src_devid, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
                         struct btrfs_dev_replace_item,
                         cont_reading_from_srcdev_mode, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
                         struct btrfs_dev_replace_item, replace_state, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
                         struct btrfs_dev_replace_item, time_started, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
                         struct btrfs_dev_replace_item, time_stopped, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
                         struct btrfs_dev_replace_item, num_write_errors, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
                         struct btrfs_dev_replace_item,
                         num_uncorrectable_read_errors, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
                         struct btrfs_dev_replace_item, cursor_left, 64);
BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
                         struct btrfs_dev_replace_item, cursor_right, 64);

/*
 * struct btrfs_verity_descriptor_item
 *
 * encryption and size each get a "verity_descriptor_*" accessor set via
 * BTRFS_SETGET_FUNCS and a "stack_verity_descriptor_*" set via
 * BTRFS_SETGET_STACK_FUNCS.
 */
BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
                   encryption, 8);
BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item,
                   size, 64);
BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
                         struct btrfs_verity_descriptor_item, encryption, 8);
BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
                         struct btrfs_verity_descriptor_item, size, 64);

/*
 * Cast into the data area of the leaf.
 *
 * btrfs_item_ptr() adds btrfs_item_nr_offset(leaf, 0) to
 * btrfs_item_offset(leaf, slot) and casts the sum to a pointer to @type.
 * @leaf is expanded twice, so do not pass an expression with side effects.
 */
#define btrfs_item_ptr(leaf, slot, type)                                \
        ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot)))

/* Same sum as btrfs_item_ptr(), but yielded as an unsigned long. */
#define btrfs_item_ptr_offset(leaf, slot)                                \
        ((unsigned long)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot)))

#endif


























































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_GFP_H
#define __LINUX_GFP_H

#include <linux/gfp_types.h>

#include <linux/mmzone.h>
#include <linux/topology.h>
#include <linux/alloc_tag.h>
#include <linux/sched.h>

struct vm_area_struct;
struct mempolicy;

/* Convert GFP flags to their corresponding migrate type */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
#define GFP_MOVABLE_SHIFT 3

static inline int gfp_migratetype(const gfp_t gfp_flags)
{
        VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
        BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
        BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
        BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE);
        BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >>
                      GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC);

        if (unlikely(page_group_by_mobility_disabled))
                return MIGRATE_UNMOVABLE;

        /* Group based on mobility */
        return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
}
#undef GFP_MOVABLE_MASK
#undef GFP_MOVABLE_SHIFT

/* Return true if @gfp_flags has __GFP_DIRECT_RECLAIM set, false otherwise. */
static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
        return (gfp_flags & __GFP_DIRECT_RECLAIM) != 0;
}

/*
 * OPT_ZONE_* name the zone to use for each optional zone type: the real zone
 * when the matching CONFIG option is enabled, ZONE_NORMAL otherwise.
 */
#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
#define OPT_ZONE_HIGHMEM ZONE_NORMAL
#endif

#ifdef CONFIG_ZONE_DMA
#define OPT_ZONE_DMA ZONE_DMA
#else
#define OPT_ZONE_DMA ZONE_NORMAL
#endif

#ifdef CONFIG_ZONE_DMA32
#define OPT_ZONE_DMA32 ZONE_DMA32
#else
#define OPT_ZONE_DMA32 ZONE_NORMAL
#endif

/*
 * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
 * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT
 * bits long and there are 16 of them to cover all possible combinations of
 * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
 *
 * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
 * But GFP_MOVABLE is not only a zone specifier but also an allocation
 * policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
 * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1".
 *
 *       bit       result
 *       =================
 *       0x0    => NORMAL
 *       0x1    => DMA or NORMAL
 *       0x2    => HIGHMEM or NORMAL
 *       0x3    => BAD (DMA+HIGHMEM)
 *       0x4    => DMA32 or NORMAL
 *       0x5    => BAD (DMA+DMA32)
 *       0x6    => BAD (HIGHMEM+DMA32)
 *       0x7    => BAD (HIGHMEM+DMA32+DMA)
 *       0x8    => NORMAL (MOVABLE+0)
 *       0x9    => DMA or NORMAL (MOVABLE+DMA)
 *       0xa    => MOVABLE (Movable is valid only if HIGHMEM is set too)
 *       0xb    => BAD (MOVABLE+HIGHMEM+DMA)
 *       0xc    => DMA32 or NORMAL (MOVABLE+DMA32)
 *       0xd    => BAD (MOVABLE+DMA32+DMA)
 *       0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
 *       0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
 *
 * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
 */

/*
 * GFP_ZONES_SHIFT is the width in bits of one GFP_ZONE_TABLE entry.  It is
 * ZONES_SHIFT, except that with ZONE_DEVICE configured and
 * MAX_NR_ZONES - 1 <= 4 it is fixed at 2.
 */
#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
/* ZONE_DEVICE is not a valid GFP zone specifier */
#define GFP_ZONES_SHIFT 2
#else
#define GFP_ZONES_SHIFT ZONES_SHIFT
#endif

/* All 16 table entries have to fit into BITS_PER_LONG bits. */
#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
#endif

/*
 * One GFP_ZONES_SHIFT-bit entry per valid zone-flag combination; the entry
 * for combination C sits at bit offset C * GFP_ZONES_SHIFT.
 */
#define GFP_ZONE_TABLE ( \
        (ZONE_NORMAL      << (0 * GFP_ZONES_SHIFT)) | \
        (OPT_ZONE_DMA     << (___GFP_DMA * GFP_ZONES_SHIFT)) | \
        (OPT_ZONE_HIGHMEM << (___GFP_HIGHMEM * GFP_ZONES_SHIFT)) | \
        (OPT_ZONE_DMA32   << (___GFP_DMA32 * GFP_ZONES_SHIFT)) | \
        (ZONE_NORMAL      << (___GFP_MOVABLE * GFP_ZONES_SHIFT)) | \
        (OPT_ZONE_DMA     << ((___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT)) | \
        (ZONE_MOVABLE     << ((___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)) | \
        (OPT_ZONE_DMA32   << ((___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)) \
)

/*
 * GFP_ZONE_BAD is a bitmap over all combinations of __GFP_DMA, __GFP_DMA32,
 * __GFP_HIGHMEM and __GFP_MOVABLE.  There is one flag per combination,
 * starting with bit 0, and a set bit marks a combination that is not
 * permitted.
 */
#define GFP_ZONE_BAD ( \
        (1 << (___GFP_DMA | ___GFP_HIGHMEM)) | \
        (1 << (___GFP_DMA | ___GFP_DMA32)) | \
        (1 << (___GFP_DMA32 | ___GFP_HIGHMEM)) | \
        (1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM)) | \
        (1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA)) | \
        (1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA)) | \
        (1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM)) | \
        (1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM)) \
)

/*
 * Look up the zone for the zone bits of @flags.
 *
 * The bits selected by GFP_ZONEMASK index both GFP_ZONE_BAD (one bit per
 * combination, checked with VM_BUG_ON) and GFP_ZONE_TABLE (GFP_ZONES_SHIFT
 * bits per combination, returned as the zone).
 */
static inline enum zone_type gfp_zone(gfp_t flags)
{
        const int idx = (__force int)(flags & GFP_ZONEMASK);
        const int entry_mask = (1 << GFP_ZONES_SHIFT) - 1;

        VM_BUG_ON((GFP_ZONE_BAD >> idx) & 1);

        return (GFP_ZONE_TABLE >> (idx * GFP_ZONES_SHIFT)) & entry_mask;
}

/*
 * There is only one page-allocator function, and two main namespaces to
 * it. The alloc_page*() variants return 'struct page *' and as such
 * can allocate highmem pages, the *get*page*() variants return
 * virtual kernel addresses to the allocated page(s).
 */

/*
 * Pick the zonelist index for @flags: ZONELIST_NOFALLBACK when CONFIG_NUMA is
 * enabled and __GFP_THISNODE is set, ZONELIST_FALLBACK in every other case.
 */
static inline int gfp_zonelist(gfp_t flags)
{
        int which = ZONELIST_FALLBACK;

#ifdef CONFIG_NUMA
        if (unlikely(flags & __GFP_THISNODE))
                which = ZONELIST_NOFALLBACK;
#endif
        return which;
}

/*
 * gfp flag masking for nested internal allocations.
 *
 * For code that needs to do allocations inside the public allocation API (e.g.
 * memory allocation tracking code) the allocations need to obey the caller's
 * allocation context constraints to prevent allocation context mismatches (e.g.
 * GFP_KERNEL allocations in GFP_NOFS contexts) from potential deadlock
 * situations.
 *
 * It is also assumed that these nested allocations are for internal kernel
 * object storage purposes only and are not going to be used for DMA, etc. Hence
 * we strip out all the zone information and leave just the context information
 * intact.
 *
 * Further, internal allocations must fail before the higher level allocation
 * can fail, so we must make them fail faster and fail silently. We also don't
 * want them to deplete emergency reserves.  Hence nested allocations must be
 * prepared for these allocations to fail.
 */
static inline gfp_t gfp_nested_mask(gfp_t flags)
{
        /* Context bits taken over from the caller's mask. */
        const gfp_t inherited = flags & (GFP_KERNEL | GFP_ATOMIC | __GFP_NOLOCKDEP);
        /* Bits that are always added for the nested allocation. */
        const gfp_t forced = __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN;

        return inherited | forced;
}

/*
 * We get the zone list from the current node and the gfp_mask.
 * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones.
 * There are two zonelists per node, one for all zones with memory and
 * one containing just zones from the node the zonelist belongs to.
 *
 * For the case of non-NUMA systems the NODE_DATA() gets optimized to
 * &contig_page_data at compile-time.
 */
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
        /* gfp_zonelist() selects which of the node's zonelists to hand out. */
        return &NODE_DATA(nid)->node_zonelists[gfp_zonelist(flags)];
}

/*
 * Empty default hooks, used unless the architecture defines
 * HAVE_ARCH_FREE_PAGE / HAVE_ARCH_ALLOC_PAGE.
 */
#ifndef HAVE_ARCH_FREE_PAGE
static inline void arch_free_page(struct page *page, int order) { }
#endif
#ifndef HAVE_ARCH_ALLOC_PAGE
static inline void arch_alloc_page(struct page *page, int order) { }
#endif

/*
 * Out-of-line allocator entry points.  Each *_noprof function is declared
 * here and paired with a macro that wraps the call in alloc_hooks().
 */
struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
                nodemask_t *nodemask);
#define __alloc_pages(...)                        alloc_hooks(__alloc_pages_noprof(__VA_ARGS__))

struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
                nodemask_t *nodemask);
#define __folio_alloc(...)                        alloc_hooks(__folio_alloc_noprof(__VA_ARGS__))

/* Note: the wrapper is __alloc_pages_bulk(), the function has no "__" prefix. */
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
                                nodemask_t *nodemask, int nr_pages,
                                struct list_head *page_list,
                                struct page **page_array);
#define __alloc_pages_bulk(...)                        alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__))

unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
                                unsigned long nr_pages,
                                struct page **page_array);
#define  alloc_pages_bulk_array_mempolicy(...)                                \
        alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__))

/*
 * Bulk allocate order-0 pages.
 *
 * Both forms call __alloc_pages_bulk() with numa_mem_id() as the node and a
 * NULL nodemask; the _list form passes only a list, the _array form only a
 * page array.
 */
#define alloc_pages_bulk_list(_gfp, _nr_pages, _list)                        \
        __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL)

#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array)                \
        __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array)

/*
 * Bulk allocate into @page_array with @nid as the node passed on to
 * alloc_pages_bulk_noprof().  NUMA_NO_NODE is replaced by numa_mem_id(); no
 * nodemask and no page list are passed.
 */
static inline unsigned long
alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages,
                                   struct page **page_array)
{
        const int target = (nid == NUMA_NO_NODE) ? numa_mem_id() : nid;

        return alloc_pages_bulk_noprof(gfp, target, NULL, nr_pages, NULL, page_array);
}

#define alloc_pages_bulk_array_node(...) \
        alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__))

/*
 * Complain about an allocation aimed at a node that is not online.
 *
 * Nothing is reported unless @gfp_mask carries both __GFP_THISNODE and
 * __GFP_NOWARN and node_online(@this_node) is false; in that case the mask
 * and the node id are logged and the stack is dumped.
 * NOTE(review): requiring __GFP_NOWARN in order to warn looks inverted -
 * confirm the intended condition before relying on it.
 */
static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask)
{
        const gfp_t required = __GFP_THISNODE | __GFP_NOWARN;

        if ((gfp_mask & required) != required || node_online(this_node))
                return;

        pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node);
        dump_stack();
}

/*
 * Allocate pages, preferring the node given as @nid.  The node must be valid
 * and online: an id outside [0, MAX_NUMNODES) trips VM_BUG_ON(), and
 * warn_if_node_offline() is consulted before the request is passed on with a
 * NULL nodemask.  For a more general interface, see alloc_pages_node().
 */
static inline struct page *__alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
                                                     unsigned int order)
{
        VM_BUG_ON(!(nid >= 0 && nid < MAX_NUMNODES));
        warn_if_node_offline(nid, gfp_mask);

        return __alloc_pages_noprof(gfp_mask, order, nid, NULL);
}

#define __alloc_pages_node(...) alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__))

/*
 * Folio counterpart of __alloc_pages_node_noprof(): @nid outside
 * [0, MAX_NUMNODES) trips VM_BUG_ON(), warn_if_node_offline() is consulted,
 * and the request goes to __folio_alloc_noprof() with a NULL nodemask.
 */
static inline struct folio *
__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid)
{
        VM_BUG_ON(!(nid >= 0 && nid < MAX_NUMNODES));
        warn_if_node_offline(nid, gfp);

        return __folio_alloc_noprof(gfp, order, nid, NULL);
}

#define __folio_alloc_node(...) alloc_hooks(__folio_alloc_node_noprof(__VA_ARGS__))

/*
 * Page allocation with a preferred node, accepting NUMA_NO_NODE.
 *
 * NUMA_NO_NODE is translated to numa_mem_id(); every other @nid is forwarded
 * untouched to __alloc_pages_node_noprof() and therefore has to be a valid,
 * online node.
 */
static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
                                                   unsigned int order)
{
        return __alloc_pages_node_noprof(nid == NUMA_NO_NODE ? numa_mem_id() : nid,
                                         gfp_mask, order);
}

/* alloc_hooks() wrapper around alloc_pages_node_noprof(). */
#define  alloc_pages_node(...)                        alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__))

#ifdef CONFIG_NUMA
/*
 * CONFIG_NUMA build: these allocators are only declared here; their
 * definitions live outside this header.  The !CONFIG_NUMA branch below
 * supplies inline or macro replacements with the same names.
 */
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order);
struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *mpol, pgoff_t ilx, int nid);
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order);
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr, bool hugepage);
#else
/* !CONFIG_NUMA: allocate through alloc_pages_node_noprof() on numa_node_id(). */
static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order)
{
        int nid = numa_node_id();

        return alloc_pages_node_noprof(nid, gfp_mask, order);
}
/*
 * !CONFIG_NUMA: @mpol, @ilx and @nid are accepted for interface parity but
 * ignored; the request is served by alloc_pages_noprof().
 */
static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *mpol, pgoff_t ilx, int nid)
{
        struct page *page = alloc_pages_noprof(gfp, order);

        return page;
}
/*
 * !CONFIG_NUMA: allocate a folio on numa_node_id().
 *
 * This is a _noprof function, so it calls __folio_alloc_node_noprof()
 * directly rather than the __folio_alloc_node() macro: the macro would wrap
 * the call in alloc_hooks() a second time, on top of the alloc_hooks()
 * already applied by the folio_alloc() wrapper.
 */
static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
        return __folio_alloc_node_noprof(gfp, order, numa_node_id());
}
/*
 * !CONFIG_NUMA: the vma, addr and hugepage arguments are dropped and the
 * request becomes a plain folio_alloc_noprof(gfp, order).
 */
#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage)                \
        folio_alloc_noprof(gfp, order)
#endif

/* alloc_hooks() wrappers around the corresponding *_noprof allocators. */
#define alloc_pages(...)                        alloc_hooks(alloc_pages_noprof(__VA_ARGS__))
#define alloc_pages_mpol(...)                        alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__))
#define folio_alloc(...)                        alloc_hooks(folio_alloc_noprof(__VA_ARGS__))
#define vma_alloc_folio(...)                        alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__))

/* Convenience form: alloc_pages() with an order of 0. */
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)

/*
 * Order-0 allocation for @vma at @addr, returned as a struct page.
 *
 * Requests an order-0, non-hugepage folio from vma_alloc_folio_noprof() and
 * returns the address of its ->page member.  The folio pointer is not checked
 * for NULL here.
 */
static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
                struct vm_area_struct *vma, unsigned long addr)
{
        struct folio *order0_folio;

        order0_folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false);
        return &order0_folio->page;
}
/* alloc_hooks() wrapper around alloc_page_vma_noprof(). */
#define alloc_page_vma(...)                        alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))

/*
 * Allocators whose result is an unsigned long rather than a struct page
 * pointer, each paired with its alloc_hooks() wrapper macro.
 */
extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
#define __get_free_pages(...)                        alloc_hooks(get_free_pages_noprof(__VA_ARGS__))

extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask);
#define get_zeroed_page(...)                        alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__))

/* Size-based allocators; __alloc_size() names the parameter holding the size. */
void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1);
#define alloc_pages_exact(...)                        alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__))

void free_pages_exact(void *virt, size_t size);

__meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2);
#define alloc_pages_exact_nid(...)                                        \
        alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__))

/* __get_free_pages() with an order of 0. */
#define __get_free_page(gfp_mask)                                        \
        __get_free_pages((gfp_mask), 0)

/* __get_free_pages() with GFP_DMA OR-ed into the caller's mask. */
#define __get_dma_pages(gfp_mask, order)                                \
        __get_free_pages((gfp_mask) | GFP_DMA, (order))

/* Free routines: one takes a struct page, the other an unsigned long address. */
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);

/*
 * Page-fragment allocator interface.  struct page_frag_cache is only
 * forward-declared here; this header handles it purely by pointer.
 */
struct page_frag_cache;
void page_frag_cache_drain(struct page_frag_cache *nc);
extern void __page_frag_cache_drain(struct page *page, unsigned int count);
/* Takes an alignment *mask*; see the inline helpers below for how it is built. */
void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz,
                              gfp_t gfp_mask, unsigned int align_mask);

/*
 * Allocate a fragment of @fragsz bytes from @nc with the given alignment.
 *
 * @align is expected to be a power of two; anything else triggers a one-time
 * warning but the call still proceeds.  The alignment is converted to a mask
 * by unsigned negation (for a power of two, -align keeps the alignment bit and
 * every bit above it) and passed to __page_frag_alloc_align().
 */
static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
                                          unsigned int fragsz, gfp_t gfp_mask,
                                          unsigned int align)
{
        unsigned int align_mask = -align;

        WARN_ON_ONCE(!is_power_of_2(align));
        return __page_frag_alloc_align(nc, fragsz, gfp_mask, align_mask);
}

/*
 * Allocate a fragment of @fragsz bytes from @nc without an alignment request:
 * __page_frag_alloc_align() is called with an all-ones alignment mask.
 */
static inline void *page_frag_alloc(struct page_frag_cache *nc,
                             unsigned int fragsz, gfp_t gfp_mask)
{
        const unsigned int all_ones_mask = ~0u;

        return __page_frag_alloc_align(nc, fragsz, gfp_mask, all_ones_mask);
}

/* Takes the fragment's address; no cache pointer is required. */
extern void page_frag_free(void *addr);

/* Single-page forms of __free_pages() / free_pages(): order is fixed at 0. */
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)

/* Per-cpu page list management and page allocator init hooks (declarations only). */
void page_alloc_init_cpuhp(void);
int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);

void page_alloc_init_late(void);
void setup_pcp_cacheinfo(unsigned int cpu);

/*
 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
 * GFP flags are used before interrupts are enabled. Once interrupts are
 * enabled, it is set to __GFP_BITS_MASK while the system is running. During
 * hibernation, it is used by PM to avoid I/O during memory allocation while
 * devices are suspended.
 */
extern gfp_t gfp_allowed_mask;

/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);

/* True only when @gfp carries both __GFP_IO and __GFP_FS. */
static inline bool gfp_has_io_fs(gfp_t gfp)
{
        const gfp_t io_and_fs = __GFP_IO | __GFP_FS;

        return (gfp & io_and_fs) == io_and_fs;
}

/*
 * Decide whether the gfp flags permit compaction.  GFP_NOIO is a tricky
 * context because page migration might need to perform IO, so compaction is
 * allowed only when __GFP_IO is set (and CONFIG_COMPACTION is enabled).
 */
static inline bool gfp_compaction_allowed(gfp_t gfp_mask)
{
        if (!IS_ENABLED(CONFIG_COMPACTION))
                return false;

        return (gfp_mask & __GFP_IO) != 0;
}

/* Returns a gfp mask derived from @vma; defined outside this header. */
extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);

#ifdef CONFIG_CONTIG_ALLOC
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range_noprof(unsigned long start, unsigned long end,
                              unsigned migratetype, gfp_t gfp_mask);
#define alloc_contig_range(...)                        alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__))

extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
                                              int nid, nodemask_t *nodemask);
#define alloc_contig_pages(...)                        alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__))

#endif
/* Declared regardless of CONFIG_CONTIG_ALLOC. */
void free_contig_range(unsigned long pfn, unsigned long nr_pages);

#endif /* __LINUX_GFP_H */














































   14 

































    7 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM maple_tree

#if !defined(_TRACE_MM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MM_H


#include <linux/tracepoint.h>

struct ma_state;

/*
 * ma_op tracepoint: records the caller-supplied name @fn together with the
 * min, max, index, last and node fields of the maple state @mas.
 *
 * Only the @fn pointer is stored (not a copy of the string) and it is later
 * printed with %s, so the string has to stay valid for as long as the record
 * can be read - presumably callers pass __func__ or a literal; verify.
 */
TRACE_EVENT(ma_op,

        TP_PROTO(const char *fn, struct ma_state *mas),

        TP_ARGS(fn, mas),

        TP_STRUCT__entry(
                        __field(const char *, fn)
                        __field(unsigned long, min)
                        __field(unsigned long, max)
                        __field(unsigned long, index)
                        __field(unsigned long, last)
                        __field(void *, node)
        ),

        TP_fast_assign(
                        __entry->fn                = fn;
                        __entry->min                = mas->min;
                        __entry->max                = mas->max;
                        __entry->index                = mas->index;
                        __entry->last                = mas->last;
                        __entry->node                = mas->node;
        ),

        TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu",
                  __entry->fn,
                  (void *) __entry->node,
                  (unsigned long) __entry->min,
                  (unsigned long) __entry->max,
                  (unsigned long) __entry->index,
                  (unsigned long) __entry->last
        )
)
/*
 * ma_read tracepoint: same record layout and output format as ma_op - the
 * caller-supplied name @fn plus the min, max, index, last and node fields of
 * @mas.  @fn is stored by pointer, so the string must outlive the record.
 */
TRACE_EVENT(ma_read,

        TP_PROTO(const char *fn, struct ma_state *mas),

        TP_ARGS(fn, mas),

        TP_STRUCT__entry(
                        __field(const char *, fn)
                        __field(unsigned long, min)
                        __field(unsigned long, max)
                        __field(unsigned long, index)
                        __field(unsigned long, last)
                        __field(void *, node)
        ),

        TP_fast_assign(
                        __entry->fn                = fn;
                        __entry->min                = mas->min;
                        __entry->max                = mas->max;
                        __entry->index                = mas->index;
                        __entry->last                = mas->last;
                        __entry->node                = mas->node;
        ),

        TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu",
                  __entry->fn,
                  (void *) __entry->node,
                  (unsigned long) __entry->min,
                  (unsigned long) __entry->max,
                  (unsigned long) __entry->index,
                  (unsigned long) __entry->last
        )
)

/*
 * ma_write tracepoint: records the caller-supplied name @fn, the min, max,
 * index, last and node fields of @mas, and additionally the @piv and @val
 * arguments.  @fn is stored by pointer, so the string must outlive the record.
 *
 * The output format differs slightly from ma_op/ma_read ("Node %p" and
 * "range:%lu-%lu" without the colon/space used there).
 */
TRACE_EVENT(ma_write,

        TP_PROTO(const char *fn, struct ma_state *mas, unsigned long piv,
                 void *val),

        TP_ARGS(fn, mas, piv, val),

        TP_STRUCT__entry(
                        __field(const char *, fn)
                        __field(unsigned long, min)
                        __field(unsigned long, max)
                        __field(unsigned long, index)
                        __field(unsigned long, last)
                        __field(unsigned long, piv)
                        __field(void *, val)
                        __field(void *, node)
        ),

        TP_fast_assign(
                        __entry->fn                = fn;
                        __entry->min                = mas->min;
                        __entry->max                = mas->max;
                        __entry->index                = mas->index;
                        __entry->last                = mas->last;
                        __entry->piv                = piv;
                        __entry->val                = val;
                        __entry->node                = mas->node;
        ),

        TP_printk("%s\tNode %p (%lu %lu) range:%lu-%lu piv (%lu) val %p",
                  __entry->fn,
                  (void *) __entry->node,
                  (unsigned long) __entry->min,
                  (unsigned long) __entry->max,
                  (unsigned long) __entry->index,
                  (unsigned long) __entry->last,
                  (unsigned long) __entry->piv,
                  (void *) __entry->val
        )
)
#endif /* _TRACE_MM_H */

/* This part must be outside protection */
#include <trace/define_trace.h>










































    4 


















































    4 



















    4 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/bitops.h>

#include <asm/word-at-a-time.h>

/*
 * Do a strnlen, return length of string *with* final '\0'.
 * 'count' is the user-supplied count, while 'max' is the
 * address space maximum.
 *
 * Return 0 for exceptions (which includes hitting the address
 * space maximum), or 'count+1' if hitting the user-supplied
 * maximum count.
 *
 * NOTE! We can sometimes overshoot the user-supplied maximum
 * if it fits in an aligned 'long'. The caller needs to check
 * the return value against "> max".
 *
 * The string is scanned one 'unsigned long' at a time. 'res' counts
 * the bytes consumed from the aligned start address, and 'align' is
 * subtracted again before any length is returned.
 */
static __always_inline long do_strnlen_user(const char __user *src, unsigned long count, unsigned long max)
{
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
        unsigned long align, res = 0;
        unsigned long c;

        /*
         * Do everything aligned. But that means that we
         * need to also expand the maximum..
         */
        /* 'align' is the byte offset of src within its 'unsigned long' word. */
        align = (sizeof(unsigned long) - 1) & (unsigned long)src;
        src -= align;
        max += align;

        /* First word; a faulting read jumps to the 'efault' label. */
        unsafe_get_user(c, (unsigned long __user *)src, efault);
        /*
         * Presumably forces the 'align' bytes that precede the string to be
         * non-zero so they cannot be mistaken for the terminator - confirm
         * against aligned_byte_mask().
         */
        c |= aligned_byte_mask(align);

        for (;;) {
                unsigned long data;
                if (has_zero(c, &data, &constants)) {
                        /* Found the terminator: convert its position into a length. */
                        data = prep_zero_mask(c, data, &constants);
                        data = create_zero_mask(data);
                        /* +1 counts the NUL itself, -align drops the leading pad bytes. */
                        return res + find_zero(data) + 1 - align;
                }
                res += sizeof(unsigned long);
                /* We already handled 'unsigned long' bytes. Did we do it all ? */
                if (unlikely(max <= sizeof(unsigned long)))
                        break;
                max -= sizeof(unsigned long);
                unsafe_get_user(c, (unsigned long __user *)(src+res), efault);
        }
        /* No terminator within 'max': express 'res' relative to the original src. */
        res -= align;

        /*
         * Uhhuh. We hit 'max'. But was that the user-specified maximum
         * too? If so, return the marker for "too long".
         */
        if (res >= count)
                return count+1;

        /*
         * Nope: we hit the address space limit, and we still had more
         * characters the caller would have wanted. That's 0.
         */
efault:
        return 0;
}

/**
 * strnlen_user - Get the size of a user string INCLUDING final NUL.
 * @str: The string to measure.
 * @count: Maximum count (including NUL character)
 *
 * Context: User context only. This function may sleep if pagefaults are
 *          enabled.
 *
 * Measures a NUL-terminated string that lives in user space.
 *
 * Return: the size of the string INCLUDING the terminating NUL. If the
 * string is too long, a number larger than @count is returned, so callers
 * have to compare the result against "> count". Returns 0 on exception,
 * when @count is not positive, or when @str is not below TASK_SIZE_MAX.
 *
 * NOTE! You should basically never use this function. There is
 * almost never any valid case for using the length of a user space
 * string, since the string can be changed at any time by other
 * threads. Use "strncpy_from_user()" instead to get a stable copy
 * of the string.
 */
long strnlen_user(const char __user *str, long count)
{
        unsigned long max_addr, src_addr, max;
        long retval;

        if (unlikely(count <= 0))
                return 0;

        max_addr = TASK_SIZE_MAX;
        src_addr = (unsigned long)untagged_addr(str);
        if (unlikely(src_addr >= max_addr))
                return 0;

        /*
         * Clamp the distance to the end of the address space to the
         * caller's limit so the scan loop has a single bound to check.
         */
        max = max_addr - src_addr;
        if (max > count)
                max = count;

        if (!user_read_access_begin(str, max))
                return 0;

        retval = do_strnlen_user(str, count, max);
        user_read_access_end();
        return retval;
}
EXPORT_SYMBOL(strnlen_user);








































































































































































































    2 








   20 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    6 




















    6 


    6 

    6 
    6 






























































































   30 







   28 





















































































































   24 





   22 









































































































































































   24 










   23 
































   22 






























    6 







    6 



















    8 







   15 







   13 
   11 
















































































































































































































    2 
    2 














































































































































    2 



    2 


































































































































































































































































    3 





    3 



















































    1 



    1 









































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support six policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails.  Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                  in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)        /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)        /* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)        /* Write-lock walked vmas */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
 * system-default value should be used. A NULL iw_table also denotes that
 * system-default values should be used. Until the system-default table
 * is implemented, the system-default is always 1.
 *
 * iw_table is RCU protected
 */
static u8 __rcu *iw_table;
static DEFINE_MUTEX(iw_table_lock);

static u8 get_il_weight(int node)
{
        u8 *table;
        u8 weight;

        rcu_read_lock();
        table = rcu_dereference(iw_table);
        /* if no iw_table, use system default */
        weight = table ? table[node] : 1;
        /* if value in iw_table is 0, use system default */
        weight = weight ? weight : 1;
        rcu_read_unlock();
        return weight;
}

/**
 * numa_nearest_node - Find nearest node by state
 * @node: Node id to start the search
 * @state: State to filter the search
 *
 * Lookup the closest node by distance if @node is not in state.
 *
 * Return: this @node if it is in state, otherwise the closest node by distance
 */
int numa_nearest_node(int node, unsigned int state)
{
        int best_node = node;
        int best_dist = INT_MAX;
        int candidate;

        /* Reject states that do not index node_states[]. */
        if (state >= NR_NODE_STATES)
                return -EINVAL;

        /* Nothing to search for: no node given, or it already qualifies. */
        if (node == NUMA_NO_NODE || node_state(node, state))
                return node;

        for_each_node_state(candidate, state) {
                int d = node_distance(node, candidate);

                /* Strict comparison keeps the first node seen at the minimum. */
                if (d >= best_dist)
                        continue;

                best_dist = d;
                best_node = candidate;
        }

        /* Still @node if no node in @state improved on INT_MAX. */
        return best_node;
}
EXPORT_SYMBOL_GPL(numa_nearest_node);

/*
 * Pick the memory policy for task @p.
 *
 * Preference order: the task's own policy if it has one, then the
 * preferred_node_policy[] entry for the node returned by numa_node_id()
 * (provided that entry has a non-zero mode), and finally default_policy.
 */
struct mempolicy *get_task_policy(struct task_struct *p)
{
        struct mempolicy *node_pol;
        int nid;

        if (p->mempolicy)
                return p->mempolicy;

        nid = numa_node_id();
        if (nid == NUMA_NO_NODE)
                return &default_policy;

        node_pol = &preferred_node_policy[nid];

        /* preferred_node_policy is not initialised early in boot */
        return node_pol->mode ? node_pol : &default_policy;
}

/*
 * Per-mode callbacks, looked up as mpol_ops[pol->mode].  This is only the
 * forward declaration; the table itself is filled in further down, once the
 * callback functions have been defined.
 */
static const struct mempolicy_operations {
        /* Install @nodes into @pol; returns 0 or a negative errno. */
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        /* Update @pol after the allowed node set has changed to @nodes. */
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/*
 * Non-zero when @pol has any of the MPOL_MODE_FLAGS bits set.  Callers in
 * this file use that to decide whether pol->w holds the user-supplied
 * nodemask rather than the cpuset's allowed nodes.
 */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
        int mode_flags = pol->flags & MPOL_MODE_FLAGS;

        return mode_flags;
}

/*
 * Compute in @ret the nodemask @orig expressed relative to @rel:
 * @orig is first folded with nodes_fold() to the number of nodes set in
 * @rel, and the folded mask is then mapped onto @rel with nodes_onto().
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
                                   const nodemask_t *rel)
{
        nodemask_t folded;

        nodes_fold(folded, *orig, nodes_weight(*rel));
        nodes_onto(*ret, folded, *rel);
}

/*
 * ->create hook for modes that keep the whole nodemask: copy @nodes into
 * @pol.  Returns 0 on success, or -EINVAL (leaving @pol untouched) when
 * @nodes is empty.
 */
static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
        if (!nodes_empty(*nodes)) {
                pol->nodes = *nodes;
                return 0;
        }

        return -EINVAL;
}

/*
 * ->create hook for MPOL_PREFERRED: reduce @nodes to its first node and
 * store that single-node mask in @pol.  Returns 0 on success, or -EINVAL
 * (leaving @pol untouched) when @nodes is empty.
 */
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
        if (!nodes_empty(*nodes)) {
                nodes_clear(pol->nodes);
                /* only the lowest-numbered requested node is kept */
                node_set(first_node(*nodes), pol->nodes);
                return 0;
        }

        return -EINVAL;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
        nodemask_t *allowed = &nsc->mask1;
        nodemask_t *effective = &nsc->mask2;

        /*
         * A NULL policy (default) and MPOL_LOCAL carry no nodemask: there
         * is nothing to remap and no ->create hook to run.
         */
        if (!pol || pol->mode == MPOL_LOCAL)
                return 0;

        /* Nodes usable right now: allowed by the cpuset AND in N_MEMORY. */
        nodes_and(*allowed,
                  cpuset_current_mems_allowed, node_states[N_MEMORY]);

        VM_BUG_ON(!nodes);

        /* Translate the caller's nodes into the usable set. */
        if (pol->flags & MPOL_F_RELATIVE_NODES)
                mpol_relative_nodemask(effective, nodes, allowed);
        else
                nodes_and(*effective, *nodes, *allowed);

        /* Record what a later rebind will need as its starting point. */
        if (mpol_store_user_nodemask(pol))
                pol->w.user_nodemask = *nodes;
        else
                pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

        return mpol_ops[pol->mode].create(pol, effective);
}

/*
 * This function just creates a new policy, does some check and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
                                  nodemask_t *nodes)
{
        struct mempolicy *policy;
        int no_nodes, has_node_flags;

        /* MPOL_DEFAULT is represented by a NULL policy and takes no nodes. */
        if (mode == MPOL_DEFAULT) {
                if (nodes && !nodes_empty(*nodes))
                        return ERR_PTR(-EINVAL);
                return NULL;
        }
        VM_BUG_ON(!nodes);

        no_nodes = nodes_empty(*nodes);
        has_node_flags = !!(flags & (MPOL_F_STATIC_NODES |
                                     MPOL_F_RELATIVE_NODES));

        /*
         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
         * All other modes require a valid pointer to a non-empty nodemask.
         */
        switch (mode) {
        case MPOL_PREFERRED:
                if (no_nodes) {
                        if (has_node_flags)
                                return ERR_PTR(-EINVAL);
                        /* "preferred" with no nodes means local allocation */
                        mode = MPOL_LOCAL;
                }
                break;
        case MPOL_LOCAL:
                /* MPOL_LOCAL accepts neither nodes nor nodemask flags */
                if (!no_nodes || has_node_flags)
                        return ERR_PTR(-EINVAL);
                break;
        default:
                if (no_nodes)
                        return ERR_PTR(-EINVAL);
                break;
        }

        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);

        /* The caller owns the single initial reference. */
        atomic_set(&policy->refcnt, 1);
        policy->mode = mode;
        policy->flags = flags;
        policy->home_node = NUMA_NO_NODE;

        return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *pol)
{
        /* Whoever drops the last reference returns @pol to policy_cache. */
        if (atomic_dec_and_test(&pol->refcnt))
                kmem_cache_free(policy_cache, pol);
}

/*
 * No-op ->rebind hook; it is the one wired up for the MPOL_DEFAULT and
 * MPOL_LOCAL entries of mpol_ops[].
 */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

/*
 * ->rebind hook for nodemask-based modes: recompute pol->nodes for the new
 * allowed set @nodes.
 *
 *  - MPOL_F_STATIC_NODES:   intersect the saved user nodemask with @nodes.
 *  - MPOL_F_RELATIVE_NODES: re-derive from the saved user nodemask relative
 *                           to @nodes.
 *  - neither:               remap the current nodes from the previously
 *                           recorded cpuset mask to @nodes, and record
 *                           @nodes as the new cpuset mask.
 *
 * If the result would be empty, @nodes itself is used instead.
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
        nodemask_t rebound;

        if (pol->flags & MPOL_F_STATIC_NODES) {
                nodes_and(rebound, pol->w.user_nodemask, *nodes);
        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
                mpol_relative_nodemask(&rebound, &pol->w.user_nodemask, nodes);
        } else {
                nodes_remap(rebound, pol->nodes, pol->w.cpuset_mems_allowed,
                            *nodes);
                pol->w.cpuset_mems_allowed = *nodes;
        }

        if (nodes_empty(rebound))
                pol->nodes = *nodes;
        else
                pol->nodes = rebound;
}

/*
 * ->rebind hook for the preferred modes: pol->nodes is left as it is, only
 * the recorded cpuset mask is replaced by @nodes.
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
                                  const nodemask_t *nodes)
{
        nodemask_t *recorded = &pol->w.cpuset_mems_allowed;

        *recorded = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
        int unchanged;

        /* No policy, or a local one: there is no nodemask to rebind. */
        if (!pol || pol->mode == MPOL_LOCAL)
                return;

        /*
         * When the policy tracks the cpuset mask (no user nodemask stored)
         * and that mask already equals @newmask, the rebind would not
         * change anything.
         */
        unchanged = !mpol_store_user_nodemask(pol) &&
                    nodes_equal(pol->w.cpuset_mems_allowed, *newmask);
        if (!unchanged)
                mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
        struct mempolicy *task_pol = tsk->mempolicy;

        /* mpol_rebind_policy() copes with a task that has no policy (NULL). */
        mpol_rebind_policy(task_pol, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
        VMA_ITERATOR(iter, mm, 0);
        struct vm_area_struct *cur;

        mmap_write_lock(mm);
        for_each_vma(iter, cur) {
                /* write-lock each vma before touching its policy */
                vma_start_write(cur);
                mpol_rebind_policy(cur->vm_policy, new);
        }
        mmap_write_unlock(mm);
}

/*
 * Callback table indexed by policy mode.
 *
 * MPOL_DEFAULT and MPOL_LOCAL have no ->create hook: mpol_set_nodemask()
 * returns before dereferencing ->create for a NULL policy or an MPOL_LOCAL
 * one, and mpol_new() hands back NULL for MPOL_DEFAULT.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
        [MPOL_DEFAULT] = {
                .rebind = mpol_rebind_default,
        },
        [MPOL_INTERLEAVE] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
        /* keeps only the first node of the requested mask */
        [MPOL_PREFERRED] = {
                .create = mpol_new_preferred,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_BIND] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_LOCAL] = {
                .rebind = mpol_rebind_default,
        },
        /* keeps the full mask, but rebinds like MPOL_PREFERRED */
        [MPOL_PREFERRED_MANY] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_WEIGHTED_INTERLEAVE] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
};

/*
 * Forward declarations for helpers defined later in this file but already
 * needed by the page-table walkers below.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags);
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
                                pgoff_t ilx, int *nid);

/*
 * True when @flags requests MPOL_MF_STRICT and neither MPOL_MF_MOVE nor
 * MPOL_MF_MOVE_ALL.
 */
static bool strictly_unmovable(unsigned long flags)
{
        const unsigned long strict_or_move =
                MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL;

        /*
         * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
         * if any misplaced page is found.
         */
        return (flags & strict_or_move) == MPOL_MF_STRICT;
}

struct migration_mpol {                /* for alloc_migration_target_by_mpol() */
        /* policy to allocate the migration target under */
        struct mempolicy *pol;
        /* NOTE(review): presumably the interleave index handed on to policy_nodemask() - confirm */
        pgoff_t ilx;
};

/*
 * Private state of one queue_pages_range() walk, reachable from the walker
 * callbacks through walk->private.
 */
struct queue_pages {
        /* list that isolated folios are queued on for migration */
        struct list_head *pagelist;
        /* MPOL_MF_* flags of the request, including the internal ones */
        unsigned long flags;
        /* nodemask each folio's node is tested against */
        nodemask_t *nmask;
        /* requested address range, used for the hole checks */
        unsigned long start;
        unsigned long end;
        /* first vma seen by the walk; NULL until then */
        struct vm_area_struct *first;
        struct folio *large;                /* note last large folio encountered */
        long nr_failed;                        /* could not be isolated at this time */
};

/*
 * Check if the folio's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the invert of qp->nmask.
 */
static inline bool queue_folio_required(struct folio *folio,
                                        struct queue_pages *qp)
{
        bool in_nmask = node_isset(folio_nid(folio), *qp->nmask);
        bool inverted = qp->flags & MPOL_MF_INVERT;

        /*
         * Without MPOL_MF_INVERT the folio is required when its node is in
         * the mask; with it, when its node is outside the mask.
         */
        return in_nmask != inverted;
}

/*
 * Handle one huge-pmd mapping on behalf of queue_folios_pte_range(), which
 * calls this with the pmd lock held.
 *
 * A pmd migration entry is counted as a failure.  The huge zero folio is
 * skipped.  A folio that the nodemask test selects is queued for migration
 * when moving was requested, the vma is migratable and isolation succeeds;
 * otherwise it is counted in qp->nr_failed.
 */
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
        struct queue_pages *qp = walk->private;
        struct folio *huge;
        int may_move;

        if (unlikely(is_pmd_migration_entry(*pmd))) {
                qp->nr_failed++;
                return;
        }

        huge = pmd_folio(*pmd);
        if (is_huge_zero_folio(huge)) {
                walk->action = ACTION_CONTINUE;
                return;
        }

        if (!queue_folio_required(huge, qp))
                return;

        may_move = (qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
                   vma_migratable(walk->vma);

        /* Isolation is only attempted when moving is both asked for and possible. */
        if (!may_move || !migrate_folio_add(huge, qp->pagelist, qp->flags))
                qp->nr_failed++;
}

/*
 * Scan through folios, checking if they satisfy the required conditions,
 * moving them from LRU to local pagelist for migration if they do (or not).
 *
 * queue_folios_pte_range() has two possible return values:
 * 0 - continue walking to scan for more, even if an existing folio on the
 *     wrong node could not be isolated and queued for migration.
 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
 *        and an existing folio was on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
                        unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct folio *folio;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        pte_t *pte, *mapped_pte;
        pte_t ptent;
        spinlock_t *ptl;

        /* A huge pmd is handled as a single unit, under the pmd lock. */
        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                queue_folios_pmd(pmd, walk);
                spin_unlock(ptl);
                goto out;
        }

        /* Could not map the pte table: ask the walker to retry this pmd. */
        mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        if (!pte) {
                walk->action = ACTION_AGAIN;
                return 0;
        }
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                ptent = ptep_get(pte);
                if (pte_none(ptent))
                        continue;
                if (!pte_present(ptent)) {
                        /* Of the non-present entries, only migration entries count as failures. */
                        if (is_migration_entry(pte_to_swp_entry(ptent)))
                                qp->nr_failed++;
                        continue;
                }
                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;
                /*
                 * vm_normal_folio() filters out zero pages, but there might
                 * still be reserved folios to skip, perhaps in a VDSO.
                 */
                if (folio_test_reserved(folio))
                        continue;
                if (!queue_folio_required(folio, qp))
                        continue;
                if (folio_test_large(folio)) {
                        /*
                         * A large folio can only be isolated from LRU once,
                         * but may be mapped by many PTEs (and Copy-On-Write may
                         * intersperse PTEs of other, order 0, folios).  This is
                         * a common case, so don't mistake it for failure (but
                         * there can be other cases of multi-mapped pages which
                         * this quick check does not help to filter out - and a
                         * search of the pagelist might grow to be prohibitive).
                         *
                         * migrate_pages(&pagelist) returns nr_failed folios, so
                         * check "large" now so that queue_pages_range() returns
                         * a comparable nr_failed folios.  This does imply that
                         * if folio could not be isolated for some racy reason
                         * at its first PTE, later PTEs will not give it another
                         * chance of isolation; but keeps the accounting simple.
                         */
                        if (folio == qp->large)
                                continue;
                        qp->large = folio;
                }
                /*
                 * Misplaced folio: count a failure unless moving was requested,
                 * the vma is migratable and the folio could be isolated.
                 */
                if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
                    !vma_migratable(vma) ||
                    !migrate_folio_add(folio, qp->pagelist, flags)) {
                        qp->nr_failed++;
                        /* STRICT without MOVE: one failure is enough, stop scanning. */
                        if (strictly_unmovable(flags))
                                break;
                }
        }
        pte_unmap_unlock(mapped_pte, ptl);
        cond_resched();
out:
        /* Reached from both the huge-pmd path and the pte loop. */
        if (qp->nr_failed && strictly_unmovable(flags))
                return -EIO;
        return 0;
}

/*
 * hugetlb_entry callback of the queue_pages walkers: examine one hugetlb
 * mapping and, if its folio is misplaced, try to isolate it onto
 * qp->pagelist.
 *
 * Returns -EIO when a failure has been recorded and only MPOL_MF_STRICT
 * (no MOVE flag) was requested, otherwise 0.  Without CONFIG_HUGETLB_PAGE
 * the body is compiled out and the function always returns 0.
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
                               unsigned long addr, unsigned long end,
                               struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        struct folio *folio;
        spinlock_t *ptl;
        pte_t entry;

        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
        entry = huge_ptep_get(pte);
        if (!pte_present(entry)) {
                /* Of the non-present entries, only migration entries count as failures. */
                if (unlikely(is_hugetlb_entry_migration(entry)))
                        qp->nr_failed++;
                goto unlock;
        }
        folio = pfn_folio(pte_pfn(entry));
        if (!queue_folio_required(folio, qp))
                goto unlock;
        /* Misplaced, but moving was not requested or the vma cannot migrate. */
        if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
            !vma_migratable(walk->vma)) {
                qp->nr_failed++;
                goto unlock;
        }
        /*
         * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
         * Choosing not to migrate a shared folio is not counted as a failure.
         *
         * See folio_likely_mapped_shared() on possible imprecision when we
         * cannot easily detect if a folio is shared.
         */
        if ((flags & MPOL_MF_MOVE_ALL) ||
            (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
                if (!isolate_hugetlb(folio, qp->pagelist))
                        qp->nr_failed++;
unlock:
        spin_unlock(ptl);
        if (qp->nr_failed && strictly_unmovable(flags))
                return -EIO;
#endif
        return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long addr, unsigned long end)
{
        struct mmu_gather tlb;
        long nr_updated;

        tlb_gather_mmu(&tlb, vma->vm_mm);

        nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
        if (nr_updated > 0)
                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

        tlb_finish_mmu(&tlb);

        return nr_updated;
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * test_walk callback of the queue_pages walkers, run once per vma.
 *
 * Returns -EFAULT when the requested range has a hole and
 * MPOL_MF_DISCONTIG_OK is not set, 1 when this vma's pages need not be
 * scanned, and 0 when they do.
 */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
                                struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        const int holes_ok = !!(flags & MPOL_MF_DISCONTIG_OK);
        struct vm_area_struct *next;

        /* range check first */
        VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

        if (!qp->first) {
                qp->first = vma;
                /* hole at head side of range */
                if (!holes_ok && qp->start < vma->vm_start)
                        return -EFAULT;
        }

        next = find_vma(vma->vm_mm, vma->vm_end);
        if (!holes_ok && vma->vm_end < qp->end) {
                /* hole at middle or tail of range */
                if (!next || vma->vm_end < next->vm_start)
                        return -EFAULT;
        }

        /*
         * Need check MPOL_MF_STRICT to return -EIO if possible
         * regardless of vma_migratable
         */
        if (!vma_migratable(vma) && !(flags & MPOL_MF_STRICT))
                return 1;

        /*
         * Check page nodes, and queue pages to move, in the current vma.
         * But if no moving, and no strict checking, the scan can be skipped.
         */
        return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ?
                0 : 1;
}

/*
 * Walker callbacks used by queue_pages_range() when MPOL_MF_WRLOCK is not
 * set; configured with PGWALK_RDLOCK.
 */
static const struct mm_walk_ops queue_pages_walk_ops = {
        .hugetlb_entry                = queue_folios_hugetlb,
        .pmd_entry                = queue_folios_pte_range,
        .test_walk                = queue_pages_test_walk,
        .walk_lock                = PGWALK_RDLOCK,
};

/*
 * Same callbacks as queue_pages_walk_ops, but configured with
 * PGWALK_WRLOCK; queue_pages_range() picks this variant when
 * MPOL_MF_WRLOCK is set.
 */
static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
        .hugetlb_entry                = queue_folios_hugetlb,
        .pmd_entry                = queue_folios_pte_range,
        .test_walk                = queue_pages_test_walk,
        .walk_lock                = PGWALK_WRLOCK,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are not on the required set of @nodes,
 * and migration is allowed, they are isolated and queued to @pagelist.
 *
 * queue_pages_range() may return:
 * 0 - all pages already on the right node, or successfully queued for moving
 *     (or neither strict checking nor moving requested: only range checking).
 * >0 - this number of misplaced folios could not be queued for moving
 *      (a hugetlbfs page or a transparent huge page being counted as 1).
 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
 */
static long
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                nodemask_t *nodes, unsigned long flags,
                struct list_head *pagelist)
{
        const struct mm_walk_ops *ops;
        struct queue_pages qp = {
                .start = start,
                .end = end,
                .nmask = nodes,
                .flags = flags,
                .pagelist = pagelist,
                .first = NULL,
        };
        int err;

        /* MPOL_MF_WRLOCK selects the variant that write-locks walked vmas. */
        if (flags & MPOL_MF_WRLOCK)
                ops = &queue_pages_lock_vma_walk_ops;
        else
                ops = &queue_pages_walk_ops;

        err = walk_page_range(mm, start, end, ops, &qp);

        /* whole range in hole: overrides whatever the walk returned */
        if (!qp.first)
                return -EFAULT;

        if (err)
                return err;

        return qp.nr_failed;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
                                struct mempolicy *pol)
{
        struct mempolicy *replacement;
        struct mempolicy *previous;
        int err;

        vma_assert_write_locked(vma);

        /* The vma gets its own copy, never @pol itself. */
        replacement = mpol_dup(pol);
        if (IS_ERR(replacement))
                return PTR_ERR(replacement);

        /* Let the vma's own ->set_policy hook, if any, accept or veto it. */
        if (vma->vm_ops && vma->vm_ops->set_policy) {
                err = vma->vm_ops->set_policy(vma, replacement);
                if (err) {
                        mpol_put(replacement);
                        return err;
                }
        }

        previous = vma->vm_policy;
        vma->vm_policy = replacement; /* protected by mmap_lock */
        mpol_put(previous);

        return 0;
}

/*
 * Split or merge the VMA (if required) and apply the new policy.
 *
 * The part of @vma that overlaps [@start, @end) is handed to
 * vma_modify_policy() and then receives @new_pol.  *@prev is updated to the
 * VMA the caller should treat as the predecessor for the next iteration.
 *
 * Returns 0 on success (including when @vma already has an equal policy) or
 * the error from vma_modify_policy() / vma_replace_policy().
 */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
                struct vm_area_struct **prev, unsigned long start,
                unsigned long end, struct mempolicy *new_pol)
{
        unsigned long lo = vma->vm_start;
        unsigned long hi = min(end, vma->vm_end);
        struct vm_area_struct *res;

        /* The range begins inside this VMA: clamp and make it the "prev". */
        if (start > lo) {
                *prev = vma;
                lo = start;
        }

        /* Nothing to change when the policy is already equal. */
        if (mpol_equal(vma->vm_policy, new_pol)) {
                *prev = vma;
                return 0;
        }

        res = vma_modify_policy(vmi, *prev, vma, lo, hi, new_pol);
        if (IS_ERR(res))
                return PTR_ERR(res);

        *prev = res;
        return vma_replace_policy(res, new_pol);
}

/*
 * Set the process memory policy.
 *
 * A policy is built from @mode, @flags and @nodes and installed as
 * current->mempolicy under task_lock(); the policy it replaces is released.
 *
 * Returns 0 on success, -ENOMEM when no scratch nodemask is available, or
 * the error reported by mpol_new() / mpol_set_nodemask().
 */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
                             nodemask_t *nodes)
{
        struct mempolicy *pol, *prev = NULL;
        NODEMASK_SCRATCH(scratch);
        int ret;

        if (!scratch)
                return -ENOMEM;

        pol = mpol_new(mode, flags, nodes);
        if (IS_ERR(pol)) {
                ret = PTR_ERR(pol);
                goto free_scratch;
        }

        task_lock(current);
        ret = mpol_set_nodemask(pol, nodes, scratch);
        if (!ret) {
                prev = current->mempolicy;
                current->mempolicy = pol;
                /* Reset the task's interleave state for the new policy. */
                if (pol && (pol->mode == MPOL_INTERLEAVE ||
                            pol->mode == MPOL_WEIGHTED_INTERLEAVE)) {
                        current->il_prev = MAX_NUMNODES-1;
                        current->il_weight = 0;
                }
        }
        task_unlock(current);

        /* Failure: drop the unused new policy.  Success: drop the old one. */
        mpol_put(ret ? pol : prev);

free_scratch:
        NODEMASK_SCRATCH_FREE(scratch);
        return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 *
 * *@nodes is always cleared first; it stays empty for the default policy and
 * for MPOL_LOCAL, and receives pol->nodes for every other known mode.  An
 * unknown mode is a BUG().
 */
static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
{
        nodes_clear(*nodes);
        if (pol == &default_policy)
                return;

        switch (pol->mode) {
        case MPOL_LOCAL:
                /* return empty node mask for local allocation */
                return;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_WEIGHTED_INTERLEAVE:
                *nodes = pol->nodes;
                return;
        default:
                BUG();
        }
}

/*
 * Report the node id of the page backing @addr.
 *
 * The page is pinned with get_user_pages_fast() just long enough to read its
 * node id.  When nothing was pinned, the (non-positive) return value of
 * get_user_pages_fast() is passed back unchanged.  @mm is not used here.
 */
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *page = NULL;
        int pinned;
        int nid;

        pinned = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &page);
        if (pinned <= 0)
                return pinned;

        nid = page_to_nid(page);
        put_page(page);
        return nid;
}

/*
 * Retrieve NUMA policy
 *
 * @policy: output; receives the policy mode (plus user-visible mode flags),
 *          or a node id when MPOL_F_NODE is given
 * @nmask:  output nodemask; skipped when NULL, except in the
 *          MPOL_F_MEMS_ALLOWED path where it is written unconditionally
 * @addr:   address to query with MPOL_F_ADDR; must be 0 otherwise
 * @flags:  any of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED
 *
 * Returns 0 on success, -EINVAL for an unsupported flag combination or a
 * non-zero @addr without MPOL_F_ADDR, -EFAULT when no VMA contains @addr, or
 * the negative error from lookup_node().
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
                             unsigned long addr, unsigned long flags)
{
        int err;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

        if (flags &
                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
                return -EINVAL;

        /* MPOL_F_MEMS_ALLOWED is exclusive with the other two flags. */
        if (flags & MPOL_F_MEMS_ALLOWED) {
                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
                        return -EINVAL;
                *policy = 0;        /* just so it's initialized */
                task_lock(current);
                *nmask  = cpuset_current_mems_allowed;
                task_unlock(current);
                return 0;
        }

        if (flags & MPOL_F_ADDR) {
                pgoff_t ilx;                /* ignored here */
                /*
                 * Do NOT fall back to task policy if the
                 * vma/shared policy at addr is NULL.  We
                 * want to return MPOL_DEFAULT in this case.
                 */
                mmap_read_lock(mm);
                vma = vma_lookup(mm, addr);
                if (!vma) {
                        mmap_read_unlock(mm);
                        return -EFAULT;
                }
                /* From here on a non-NULL "vma" means mmap_lock is held. */
                pol = __get_vma_policy(vma, addr, &ilx);
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;        /* indicates default behavior */

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        /*
                         * Take a refcount on the mpol, because we are about to
                         * drop the mmap_lock, after which only "pol" remains
                         * valid, "vma" is stale.
                         */
                        pol_refcount = pol;
                        vma = NULL;
                        mpol_get(pol);
                        mmap_read_unlock(mm);
                        /* Report the node of the page backing addr. */
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        *policy = err;
                } else if (pol == current->mempolicy &&
                                pol->mode == MPOL_INTERLEAVE) {
                        /* Task interleave policy: report the next node. */
                        *policy = next_node_in(current->il_prev, pol->nodes);
                } else if (pol == current->mempolicy &&
                                pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
                        /*
                         * With a non-zero il_weight il_prev itself is
                         * reported, otherwise the node following it.
                         */
                        if (current->il_weight)
                                *policy = current->il_prev;
                        else
                                *policy = next_node_in(current->il_prev,
                                                       pol->nodes);
                } else {
                        /* MPOL_F_NODE alone needs a task interleave policy. */
                        err = -EINVAL;
                        goto out;
                }
        } else {
                *policy = pol == &default_policy ? MPOL_DEFAULT :
                                                pol->mode;
                /*
                 * Internal mempolicy flags must be masked off before exposing
                 * the policy to userspace.
                 */
                *policy |= (pol->flags & MPOL_MODE_FLAGS);
        }

        err = 0;
        if (nmask) {
                /* Prefer the nodemask stored as the user supplied it. */
                if (mpol_store_user_nodemask(pol)) {
                        *nmask = pol->w.user_nodemask;
                } else {
                        task_lock(current);
                        get_policy_nodemask(pol, nmask);
                        task_unlock(current);
                }
        }

 out:
        /*
         * NOTE(review): presumably releases the extra reference a shared
         * policy lookup returns with - confirm against mpol_cond_put().
         */
        mpol_cond_put(pol);
        /* "vma" is only still set when mmap_lock has not been dropped yet. */
        if (vma)
                mmap_read_unlock(mm);
        /* Drop the reference taken before calling lookup_node(). */
        if (pol_refcount)
                mpol_put(pol_refcount);
        return err;
}

#ifdef CONFIG_MIGRATION
/*
 * Try to isolate @folio from the LRU and queue it on @foliolist for
 * migration.
 *
 * Returns false only when isolation was attempted and failed; a folio that
 * is deliberately skipped because it looks shared still yields true.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags)
{
        /*
         * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
         * Choosing not to migrate a shared folio is not counted as a failure.
         *
         * See folio_likely_mapped_shared() on possible imprecision when we
         * cannot easily detect if a folio is shared.
         */
        if (!(flags & MPOL_MF_MOVE_ALL) && folio_likely_mapped_shared(folio))
                return true;

        /*
         * Non-movable folio may reach here.  And, there may be
         * temporary off LRU folios or non-LRU movable folios.
         * Treat them as unmovable folios since they can't be
         * isolated, so they can't be moved at the moment.
         */
        if (!folio_isolate_lru(folio))
                return false;

        list_add_tail(&folio->lru, foliolist);
        node_stat_mod_folio(folio,
                NR_ISOLATED_ANON + folio_is_file_lru(folio),
                folio_nr_pages(folio));
        return true;
}

/*
 * Migrate pages from one node to a target node.
 *
 * Every folio of @mm found on node @source is isolated and handed to
 * migrate_pages(); replacement pages are requested with nid = @dest and
 * __GFP_THISNODE.  @flags must contain MPOL_MF_MOVE or MPOL_MF_MOVE_ALL.
 *
 * Returns error or the number of pages not migrated.  An @mm without any
 * VMA has nothing to migrate and yields 0.
 */
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
                            int flags)
{
        nodemask_t nmask;
        struct vm_area_struct *vma;
        LIST_HEAD(pagelist);
        long nr_failed;
        long err = 0;
        struct migration_target_control mtc = {
                .nid = dest,
                .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
                .reason = MR_SYSCALL,
        };

        nodes_clear(nmask);
        node_set(source, nmask);

        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));

        mmap_read_lock(mm);
        vma = find_vma(mm, 0);
        if (!vma) {
                /*
                 * The mm has no VMA at all, so there is no start address to
                 * walk from and nothing that could be migrated.
                 */
                mmap_read_unlock(mm);
                return 0;
        }

        /*
         * This does not migrate the range, but isolates all pages that
         * need migration.  Between passing in the full user address
         * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
         * but passes back the count of pages which could not be isolated.
         */
        nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
                                      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
        mmap_read_unlock(mm);

        if (!list_empty(&pagelist)) {
                err = migrate_pages(&pagelist, alloc_migration_target, NULL,
                        (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
                /* Anything still on the list was not migrated: put it back. */
                if (err)
                        putback_movable_pages(&pagelist);
        }

        /* Add the isolation failures unless migrate_pages() hard-failed. */
        if (err >= 0)
                err += nr_failed;
        return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * @mm:    address space whose pages are migrated
 * @from:  source nodes
 * @to:    destination nodes; node_remap() maps each source onto one of them
 * @flags: passed through to migrate_to_node()
 *
 * Returns the number of pages that could not be moved (capped at INT_MAX),
 * or the negative error of the migrate_to_node() call that failed.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        long nr_failed = 0;
        long err = 0;
        nodemask_t tmp;

        lru_cache_disable();

        /*
         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
         * bit in 'tmp', and return that <source, dest> pair for migration.
         * The pair of nodemasks 'to' and 'from' define the map.
         *
         * If no pair of bits is found that way, fallback to picking some
         * pair of 'source' and 'dest' bits that are not the same.  If the
         * 'source' and 'dest' bits are the same, this represents a node
         * that will be migrating to itself, so no pages need move.
         *
         * If no bits are left in 'tmp', or if all remaining bits left
         * in 'tmp' correspond to the same bit in 'to', return false
         * (nothing left to migrate).
         *
         * This lets us pick a pair of nodes to migrate between, such that
         * if possible the dest node is not already occupied by some other
         * source node, minimizing the risk of overloading the memory on a
         * node that would happen if we migrated incoming memory to a node
         * before migrating outgoing memory from that same node.
         *
         * A single scan of tmp is sufficient.  As we go, we remember the
         * most recent <s, d> pair that moved (s != d).  If we find a pair
         * that not only moved, but what's better, moved to an empty slot
         * (d is not set in tmp), then we break out then, with that pair.
         * Otherwise when we finish scanning tmp, we at least have the
         * most recent <s, d> pair that moved.  If we get all the way through
         * the scan of tmp without finding any node that moved, much less
         * moved to an empty node, then there is nothing left worth migrating.
         */

        tmp = *from;
        while (!nodes_empty(tmp)) {
                int s, d;
                int source = NUMA_NO_NODE;
                int dest = 0;

                for_each_node_mask(s, tmp) {

                        /*
                         * do_migrate_pages() tries to maintain the relative
                         * node relationship of the pages established between
                         * threads and memory areas.
                         *
                         * However if the number of source nodes is not equal to
                         * the number of destination nodes we can not preserve
                         * this node relative relationship.  In that case, skip
                         * copying memory from a node that is in the destination
                         * mask.
                         *
                         * Example: [2,3,4] -> [3,4,5] moves everything.
                         *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
                         */

                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
                                                (node_isset(s, *to)))
                                continue;

                        d = node_remap(s, *from, *to);
                        if (s == d)
                                continue;

                        source = s;        /* Node moved. Memorize */
                        dest = d;

                        /* dest not in remaining from nodes? */
                        if (!node_isset(dest, tmp))
                                break;
                }
                /* The scan found no node that needs to move: done. */
                if (source == NUMA_NO_NODE)
                        break;

                node_clear(source, tmp);
                err = migrate_to_node(mm, source, dest, flags);
                /* Positive: pages left behind; negative: stop right away. */
                if (err > 0)
                        nr_failed += err;
                if (err < 0)
                        break;
        }

        lru_cache_enable();
        if (err < 0)
                return err;
        /* The accumulated count is a long; clamp it to the int return type. */
        return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}

/*
 * Allocate a new folio for page migration, according to NUMA mempolicy.
 *
 * @src:     folio that is about to be migrated
 * @private: a struct migration_mpol pointer cast to unsigned long, carrying
 *           the policy and the base interleave index
 *
 * Returns the folio handed back by the hugetlb or mempolicy allocator.
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
                                                    unsigned long private)
{
        struct migration_mpol *ctl = (struct migration_mpol *)private;
        struct mempolicy *pol = ctl->pol;
        unsigned int order = folio_order(src);
        /* Interleave index: caller's base plus the folio's index in units of its size. */
        pgoff_t ilx = ctl->ilx + (src->index >> order);
        int nid = numa_node_id();
        gfp_t gfp;

        if (folio_test_hugetlb(src)) {
                struct hstate *h = folio_hstate(src);
                nodemask_t *allowed;

                gfp = htlb_alloc_mask(h);
                /* policy_nodemask() may update nid; call it before using nid. */
                allowed = policy_nodemask(gfp, pol, ilx, &nid);
                return alloc_hugetlb_folio_nodemask(h, nid, allowed, gfp,
                                htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
        }

        gfp = folio_test_large(src) ?
                GFP_TRANSHUGE :
                (GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP);

        return page_rmappable_folio(alloc_pages_mpol(gfp, order, pol, ilx, nid));
}
#else

/*
 * Stub used when CONFIG_MIGRATION is not set: no folio can be queued for
 * migration, so every request is reported as failed.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags)
{
        return false;
}

/*
 * Stub used when CONFIG_MIGRATION is not set: migrating pages between
 * nodesets is not available, always returns -ENOSYS.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        return -ENOSYS;
}

/*
 * Stub used when CONFIG_MIGRATION is not set: never provides a migration
 * target folio.
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
                                                    unsigned long private)
{
        return NULL;
}
#endif

/*
 * Apply a memory policy to the range [@start, @start + @len) of the current
 * mm and, when MPOL_MF_MOVE / MPOL_MF_MOVE_ALL is given, migrate misplaced
 * folios according to the new policy.
 *
 * @start:      page aligned start address
 * @len:        length of the range; rounded up to a page boundary
 * @mode:       policy mode
 * @mode_flags: policy mode flags
 * @nmask:      nodes the policy applies to
 * @flags:      MPOL_MF_* flags
 *
 * Returns 0 on success (also for an empty range), -EINVAL for invalid flags,
 * an unaligned @start or a wrapping range, -EPERM for MPOL_MF_MOVE_ALL
 * without CAP_SYS_NICE, -ENOMEM when no scratch nodemask is available, -EIO
 * when MPOL_MF_STRICT is set and folios were left misplaced, or an error from
 * mpol_new(), mpol_set_nodemask(), queue_pages_range() or mbind_range().
 */
static long do_mbind(unsigned long start, unsigned long len,
                     unsigned short mode, unsigned short mode_flags,
                     nodemask_t *nmask, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct vma_iterator vmi;
        struct migration_mpol mmpol;
        struct mempolicy *new;
        unsigned long end;
        long err;
        long nr_failed;
        LIST_HEAD(pagelist);

        if (flags & ~(unsigned long)MPOL_MF_VALID)
                return -EINVAL;
        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                return -EPERM;

        if (start & ~PAGE_MASK)
                return -EINVAL;

        /* MPOL_MF_STRICT is ignored for MPOL_DEFAULT. */
        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;

        len = PAGE_ALIGN(len);
        end = start + len;

        /* Reject a range that wraps around the address space. */
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        new = mpol_new(mode, mode_flags, nmask);
        if (IS_ERR(new))
                return PTR_ERR(new);

        /*
         * If we are using the default policy then operation
         * on discontinuous address spaces is okay after all
         */
        if (!new)
                flags |= MPOL_MF_DISCONTIG_OK;

        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                lru_cache_disable();
        {
                NODEMASK_SCRATCH(scratch);
                if (scratch) {
                        /*
                         * mmap_lock stays held for writing on success and is
                         * only dropped here when setting the nodemask fails.
                         */
                        mmap_write_lock(mm);
                        err = mpol_set_nodemask(new, nmask, scratch);
                        if (err)
                                mmap_write_unlock(mm);
                } else
                        err = -ENOMEM;
                NODEMASK_SCRATCH_FREE(scratch);
        }
        if (err)
                goto mpol_out;

        /*
         * Lock the VMAs before scanning for pages to migrate,
         * to ensure we don't miss a concurrently inserted page.
         */
        nr_failed = queue_pages_range(mm, start, end, nmask,
                        flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

        if (nr_failed < 0) {
                /* A negative value is an error code, not a failure count. */
                err = nr_failed;
                nr_failed = 0;
        } else {
                /* Install the new policy on every VMA in the range. */
                vma_iter_init(&vmi, mm, start);
                prev = vma_prev(&vmi);
                for_each_vma_range(vmi, vma, end) {
                        err = mbind_range(&vmi, vma, &prev, start, end, new);
                        if (err)
                                break;
                }
        }

        if (!err && !list_empty(&pagelist)) {
                /* Convert MPOL_DEFAULT's NULL to task or default policy */
                if (!new) {
                        new = get_task_policy(current);
                        mpol_get(new);
                }
                mmpol.pol = new;
                mmpol.ilx = 0;

                /*
                 * In the interleaved case, attempt to allocate on exactly the
                 * targeted nodes, for the first VMA to be migrated; for later
                 * VMAs, the nodes will still be interleaved from the targeted
                 * nodemask, but one by one may be selected differently.
                 */
                if (new->mode == MPOL_INTERLEAVE ||
                    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
                        struct folio *folio;
                        unsigned int order;
                        unsigned long addr = -EFAULT;

                        /* Use the first queued folio that is not a KSM folio. */
                        list_for_each_entry(folio, &pagelist, lru) {
                                if (!folio_test_ksm(folio))
                                        break;
                        }
                        /* Find the VMA in the range that maps that folio. */
                        if (!list_entry_is_head(folio, &pagelist, lru)) {
                                vma_iter_init(&vmi, mm, start);
                                for_each_vma_range(vmi, vma, end) {
                                        addr = page_address_in_vma(
                                                folio_page(folio, 0), vma);
                                        if (addr != -EFAULT)
                                                break;
                                }
                        }
                        if (addr != -EFAULT) {
                                order = folio_order(folio);
                                /* We already know the pol, but not the ilx */
                                mpol_cond_put(get_vma_policy(vma, addr, order,
                                                             &mmpol.ilx));
                                /* Set base from which to increment by index */
                                mmpol.ilx -= folio->index >> order;
                        }
                }
        }

        mmap_write_unlock(mm);

        if (!err && !list_empty(&pagelist)) {
                /*
                 * nr_failed is only tested for being non-zero below, so the
                 * migrate_pages() result is OR-ed in rather than added.
                 */
                nr_failed |= migrate_pages(&pagelist,
                                alloc_migration_target_by_mpol, NULL,
                                (unsigned long)&mmpol, MIGRATE_SYNC,
                                MR_MEMPOLICY_MBIND, NULL);
        }

        if (nr_failed && (flags & MPOL_MF_STRICT))
                err = -EIO;
        /* Return whatever is still isolated to where it came from. */
        if (!list_empty(&pagelist))
                putback_movable_pages(&pagelist);
mpol_out:
        mpol_put(new);
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                lru_cache_enable();
        return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/*
 * Copy a @maxnode bit bitmap from user space into @mask, using the compat
 * layout when called from a compat syscall.  Bits at and above @maxnode in
 * the last word are cleared.
 *
 * Returns 0 on success or -EFAULT when the user copy fails.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
                      unsigned long maxnode)
{
        unsigned long nlongs = BITS_TO_LONGS(maxnode);
        unsigned long tail_bits = maxnode % BITS_PER_LONG;
        int failed;

        if (in_compat_syscall()) {
                failed = compat_get_bitmap(mask,
                                (const compat_ulong_t __user *)nmask,
                                maxnode);
        } else {
                failed = copy_from_user(mask, nmask,
                                nlongs * sizeof(unsigned long));
        }
        if (failed)
                return -EFAULT;

        /* Drop the bits of the final word that lie beyond maxnode. */
        if (tail_bits)
                mask[nlongs - 1] &= (1UL << tail_bits) - 1;

        return 0;
}

/*
 * Copy a node mask from user space.
 *
 * @maxnode is decremented by one before being used as the number of bits.
 * A NULL @nmask or a resulting bit count of zero leaves *@nodes empty.
 *
 * Returns 0 on success, -EINVAL when the bit count exceeds
 * PAGE_SIZE*BITS_PER_BYTE or a bit beyond MAX_NUMNODES is set, and -EFAULT
 * when reading user memory fails.
 */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                     unsigned long maxnode)
{
        maxnode--;
        nodes_clear(*nodes);
        if (!nmask || maxnode == 0)
                return 0;
        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
                return -EINVAL;

        /*
         * When the user specified more nodes than supported just check
         * if the non supported part is all zero, one word at a time,
         * starting at the end.
         */
        while (maxnode > MAX_NUMNODES) {
                unsigned long chunk = min_t(unsigned long, maxnode, BITS_PER_LONG);
                unsigned long word;

                if (get_bitmap(&word, &nmask[(maxnode - 1) / BITS_PER_LONG], chunk))
                        return -EFAULT;

                if (maxnode - chunk < MAX_NUMNODES) {
                        /* Last step: ignore the bits below MAX_NUMNODES. */
                        maxnode = MAX_NUMNODES;
                        word &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
                } else {
                        maxnode -= chunk;
                }

                if (word)
                        return -EINVAL;
        }

        return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}

/*
 * Copy a kernel node mask to user space
 *
 * When the user buffer (sized from @maxnode) is larger than the kernel's
 * nodemask, the excess is zero-filled and only the kernel-sized part copied.
 *
 * Returns 0 on success, -EINVAL when the user buffer would exceed PAGE_SIZE
 * bytes, -EFAULT when writing user memory fails, or the result of
 * compat_put_bitmap() in a compat syscall.
 */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              nodemask_t *nodes)
{
        bool compat = in_compat_syscall();
        unsigned long user_bytes = ALIGN(maxnode-1, 64) / 8;
        unsigned int kern_bytes;

        if (compat)
                kern_bytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
        else
                kern_bytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);

        if (user_bytes > kern_bytes) {
                if (user_bytes > PAGE_SIZE)
                        return -EINVAL;
                /* Zero the tail of the user buffer the kernel mask can't fill. */
                if (clear_user((char __user *)mask + kern_bytes,
                               user_bytes - kern_bytes))
                        return -EFAULT;
                user_bytes = kern_bytes;
                maxnode = nr_node_ids;
        }

        if (compat)
                return compat_put_bitmap((compat_ulong_t __user *)mask,
                                         nodes_addr(*nodes), maxnode);

        if (copy_to_user(mask, nodes_addr(*nodes), user_bytes))
                return -EFAULT;
        return 0;
}

/*
 * Basic parameter sanity check used by both mbind() and set_mempolicy()
 *
 * Splits the combined value in *@mode into the bare mode (written back to
 * *@mode) and the mode flags (written to *@flags).  Both outputs are written
 * even when the check fails.
 *
 * Returns 0 when the combination is acceptable, -EINVAL otherwise.
 */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
        int m;

        *flags = *mode & MPOL_MODE_FLAGS;
        *mode &= ~MPOL_MODE_FLAGS;
        m = *mode;

        if ((unsigned int)m >= MPOL_MAX)
                return -EINVAL;

        /* Static and relative node remapping exclude each other. */
        if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
                return -EINVAL;

        if (!(*flags & MPOL_F_NUMA_BALANCING))
                return 0;

        /* MPOL_F_NUMA_BALANCING is only accepted for these two modes. */
        if (m != MPOL_BIND && m != MPOL_PREFERRED_MANY)
                return -EINVAL;

        *flags |= (MPOL_F_MOF | MPOL_F_MORON);
        return 0;
}

/*
 * Common entry for the mbind() system call: untag @start, split @mode into
 * mode and mode flags, fetch the nodemask from user space and hand
 * everything to do_mbind().
 *
 * Returns the result of do_mbind(), or the error from the mode check or the
 * nodemask copy.
 */
static long kernel_mbind(unsigned long start, unsigned long len,
                         unsigned long mode, const unsigned long __user *nmask,
                         unsigned long maxnode, unsigned int flags)
{
        nodemask_t nodes;
        unsigned short mode_flags;
        int lmode = mode;
        int ret;

        start = untagged_addr(start);

        ret = sanitize_mpol_flags(&lmode, &mode_flags);
        if (!ret)
                ret = get_nodes(&nodes, nmask, maxnode);
        if (ret)
                return ret;

        return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}

/*
 * set_mempolicy_home_node() - set the home node of the VMA policies in
 * [@start, @start + @len).
 *
 * @start:     page aligned start address (an address tag is stripped)
 * @len:       length of the range; rounded up to a page boundary
 * @home_node: node to record as home node; must be online
 * @flags:     must be 0, reserved for future extension
 *
 * VMAs without a policy are skipped.  Only MPOL_BIND and MPOL_PREFERRED_MANY
 * policies can take a home node.
 *
 * Returns 0 on success (also for an empty range), -EINVAL for bad arguments,
 * -ENOENT when no VMA in the range carries a policy, -EOPNOTSUPP when a VMA
 * has a policy of another mode, or the error from mpol_dup() /
 * mbind_range().
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
                unsigned long, home_node, unsigned long, flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct vma_iterator vmi;
        struct mempolicy *new, *old;
        unsigned long end;
        int err = -ENOENT;

        start = untagged_addr(start);
        if (start & ~PAGE_MASK)
                return -EINVAL;
        /*
         * flags is used for future extension if any.
         */
        if (flags != 0)
                return -EINVAL;

        /*
         * Check home_node is online to avoid accessing uninitialized
         * NODE_DATA.
         */
        if (home_node >= MAX_NUMNODES || !node_online(home_node))
                return -EINVAL;

        len = PAGE_ALIGN(len);
        end = start + len;

        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;
        mmap_write_lock(mm);
        /*
         * Position the iterator only now, so that it starts at the untagged
         * address rather than at the raw value user space passed in.
         */
        vma_iter_init(&vmi, mm, start);
        prev = vma_prev(&vmi);
        for_each_vma_range(vmi, vma, end) {
                /*
                 * If any vma in the range got policy other than MPOL_BIND
                 * or MPOL_PREFERRED_MANY we return error. We don't reset
                 * the home node for vmas we already updated before.
                 */
                old = vma_policy(vma);
                if (!old) {
                        prev = vma;
                        continue;
                }
                if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
                        err = -EOPNOTSUPP;
                        break;
                }
                new = mpol_dup(old);
                if (IS_ERR(new)) {
                        err = PTR_ERR(new);
                        break;
                }

                vma_start_write(vma);
                new->home_node = home_node;
                err = mbind_range(&vmi, vma, &prev, start, end, new);
                /* mbind_range() installs its own copy; drop ours. */
                mpol_put(new);
                if (err)
                        break;
        }
        mmap_write_unlock(mm);
        return err;
}

/* mbind() system call entry point: forwards all arguments to kernel_mbind(). */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
                unsigned long, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode, unsigned int, flags)
{
        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/*
 * Set the process memory policy
 *
 * Splits @mode into mode and mode flags, fetches the nodemask from user
 * space and passes the result to do_set_mempolicy().
 *
 * Returns the result of do_set_mempolicy(), or the error from the mode check
 * or the nodemask copy.
 */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
                                 unsigned long maxnode)
{
        nodemask_t nodes;
        unsigned short mode_flags;
        int lmode = mode;
        int ret;

        ret = sanitize_mpol_flags(&lmode, &mode_flags);
        if (!ret)
                ret = get_nodes(&nodes, nmask, maxnode);
        if (ret)
                return ret;

        return do_set_mempolicy(lmode, mode_flags, &nodes);
}

/*
 * set_mempolicy() system call entry point: forwards all arguments to
 * kernel_set_mempolicy().
 */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode)
{
        return kernel_set_mempolicy(mode, nmask, maxnode);
}

/*
 * Common entry for the migrate_pages() system call.
 *
 * @pid:       target process, 0 meaning the calling task
 * @maxnode:   size of the two user nodemasks
 * @old_nodes: user nodemask of the nodes to migrate from
 * @new_nodes: user nodemask of the nodes to migrate to
 *
 * Returns the result of do_migrate_pages() on success.  Errors: -ENOMEM when
 * no scratch nodemask is available, the get_nodes() error, -ESRCH when @pid
 * is not found, -EPERM when access is denied, -EINVAL when no permitted
 * destination node remains or the task has no mm, or the error from
 * security_task_movememory().
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
                                const unsigned long __user *old_nodes,
                                const unsigned long __user *new_nodes)
{
        struct mm_struct *mm = NULL;
        struct task_struct *task;
        nodemask_t task_nodes;
        int err;
        nodemask_t *old;
        nodemask_t *new;
        NODEMASK_SCRATCH(scratch);

        if (!scratch)
                return -ENOMEM;

        /* Both nodemasks live in the scratch area. */
        old = &scratch->mask1;
        new = &scratch->mask2;

        err = get_nodes(old, old_nodes, maxnode);
        if (err)
                goto out;

        err = get_nodes(new, new_nodes, maxnode);
        if (err)
                goto out;

        /* Find the mm_struct */
        rcu_read_lock();
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
                rcu_read_unlock();
                err = -ESRCH;
                goto out;
        }
        /* Hold a task reference; every exit from here must drop it. */
        get_task_struct(task);

        /* Default error for the exits below that do not set their own. */
        err = -EINVAL;

        /*
         * Check if this process has the right to modify the specified process.
         * Use the regular "ptrace_may_access()" checks.
         */
        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
                rcu_read_unlock();
                err = -EPERM;
                goto out_put;
        }
        rcu_read_unlock();

        task_nodes = cpuset_mems_allowed(task);
        /* Is the user allowed to access the target nodes? */
        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
                err = -EPERM;
                goto out_put;
        }

        /* Restrict the destination to nodes the caller itself may use. */
        task_nodes = cpuset_mems_allowed(current);
        nodes_and(*new, *new, task_nodes);
        if (nodes_empty(*new))
                goto out_put;

        err = security_task_movememory(task);
        if (err)
                goto out_put;

        /* The mm reference replaces the task reference from here on. */
        mm = get_task_mm(task);
        put_task_struct(task);

        if (!mm) {
                err = -EINVAL;
                goto out;
        }

        err = do_migrate_pages(mm, old, new,
                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

        mmput(mm);
out:
        NODEMASK_SCRATCH_FREE(scratch);

        return err;

out_put:
        put_task_struct(task);
        goto out;
}

/*
 * migrate_pages() system call entry point: forwards all arguments to
 * kernel_migrate_pages().
 */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
                const unsigned long __user *, old_nodes,
                const unsigned long __user *, new_nodes)
{
        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}

/*
 * Retrieve NUMA policy
 *
 * Queries the policy through do_get_mempolicy() and copies the results to
 * the user pointers that are non-NULL.
 *
 * Returns 0 on success, -EINVAL when @nmask is given but @maxnode is smaller
 * than nr_node_ids, -EFAULT when storing *@policy fails, or the error from
 * do_get_mempolicy() / copy_nodes_to_user().
 */
static int kernel_get_mempolicy(int __user *policy,
                                unsigned long __user *nmask,
                                unsigned long maxnode,
                                unsigned long addr,
                                unsigned long flags)
{
        nodemask_t nodes;
        int pval;
        int ret;

        /* The user buffer must be able to hold every possible node id. */
        if (nmask && maxnode < nr_node_ids)
                return -EINVAL;

        addr = untagged_addr(addr);

        ret = do_get_mempolicy(&pval, &nodes, addr, flags);
        if (ret)
                return ret;

        if (policy && put_user(pval, policy))
                return -EFAULT;

        if (!nmask)
                return ret;

        return copy_nodes_to_user(nmask, maxnode, &nodes);
}

/*
 * get_mempolicy() system call entry point: forwards all arguments to
 * kernel_get_mempolicy().
 */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
                unsigned long __user *, nmask, unsigned long, maxnode,
                unsigned long, addr, unsigned long, flags)
{
        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}

/*
 * Report whether the pages of @vma may be migrated.
 *
 * Returns false for VM_IO / VM_PFNMAP mappings, DAX mappings, hugetlb
 * mappings whose hstate does not support migration, and file mappings whose
 * mapping gfp mask selects a zone below policy_zone.  Returns true otherwise.
 */
bool vma_migratable(struct vm_area_struct *vma)
{
        if (vma->vm_flags & (VM_IO | VM_PFNMAP))
                return false;

        /*
         * DAX device mappings need predictable access latency, so they are
         * kept away from periodic faults.
         */
        if (vma_is_dax(vma))
                return false;

        if (is_vm_hugetlb_page(vma) &&
            !hugepage_migration_supported(hstate_vma(vma)))
                return false;

        /* mappings without a backing file have no zone restriction */
        if (!vma->vm_file)
                return true;

        /*
         * Migration allocates pages in the highest zone.  When the file's
         * mapping cannot allocate there, migration (at least from node to
         * node) is not possible.
         */
        return gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) >=
                policy_zone;
}

/*
 * Return the policy attached to @vma at @addr without any fallback.
 * *@ilx is reset to 0 first; when the VMA supplies a get_policy() vm_op,
 * that op provides the result (and may update *@ilx), otherwise
 * vma->vm_policy is returned as is (possibly NULL).
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                                   unsigned long addr, pgoff_t *ilx)
{
        *ilx = 0;

        if (vma->vm_ops && vma->vm_ops->get_policy)
                return vma->vm_ops->get_policy(vma, addr, ilx);

        return vma->vm_policy;
}

/*
 * get_vma_policy(@vma, @addr, @order, @ilx)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 * @order: 0, or appropriate huge_page_order for interleaving
 * @ilx: interleave index (output), meaningful only for MPOL_INTERLEAVE or
 *       MPOL_WEIGHTED_INTERLEAVE
 *
 * Returns the effective policy for @vma at @addr.  When the VMA has no
 * policy of its own, the result of get_task_policy(current) is used instead.
 * For the two interleave modes, the page offset of @addr (scaled by @order)
 * is added to *@ilx.
 *
 * Shared policies [those marked as MPOL_F_SHARED] carry an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  The caller is responsible for dropping that
 * extra reference.
 */
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                 unsigned long addr, int order, pgoff_t *ilx)
{
        struct mempolicy *pol = __get_vma_policy(vma, addr, ilx);

        if (!pol)
                pol = get_task_policy(current);

        /* only the interleave modes consume the index */
        if (pol->mode != MPOL_INTERLEAVE &&
            pol->mode != MPOL_WEIGHTED_INTERLEAVE)
                return pol;

        *ilx += vma->vm_pgoff >> order;
        *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
        return pol;
}

/*
 * Report whether the policy governing @vma has MPOL_F_MOF set.
 *
 * When the VMA provides a get_policy() vm_op, the policy at vma->vm_start is
 * queried and released again with mpol_cond_put(); a NULL result counts as
 * "flag not set".  Otherwise vma->vm_policy is consulted, falling back to
 * get_task_policy(current) when it is NULL.
 */
bool vma_policy_mof(struct vm_area_struct *vma)
{
        struct mempolicy *pol;
        pgoff_t unused_ilx;        /* output of get_policy(), not needed */
        bool mof;

        if (!vma->vm_ops || !vma->vm_ops->get_policy) {
                pol = vma->vm_policy;
                if (!pol)
                        pol = get_task_policy(current);

                return pol->flags & MPOL_F_MOF;
        }

        pol = vma->vm_ops->get_policy(vma, vma->vm_start, &unused_ilx);
        mof = pol && (pol->flags & MPOL_F_MOF);
        mpol_cond_put(pol);

        return mof;
}

bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
        enum zone_type dynamic_policy_zone = policy_zone;

        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

        /*
         * if policy->nodes has movable memory only,
         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
         *
         * policy->nodes is intersect with node_states[N_MEMORY].
         * so if the following test fails, it implies
         * policy->nodes has movable memory only.
         */
        if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
                dynamic_policy_zone = ZONE_MOVABLE;

        return zone >= dynamic_policy_zone;
}

/*
 * Pick the node for the current task's next weighted-interleave allocation.
 *
 * While current->il_weight is non-zero and current->il_prev is still in
 * policy->nodes, the same node is reused and its remaining weight is
 * decremented.  Otherwise the following node in policy->nodes is selected,
 * recorded in il_prev, and il_weight is reloaded from get_il_weight().
 * Returns MAX_NUMNODES unchanged if next_node_in() yields it.
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
        unsigned int cookie;
        unsigned int node;

        for (;;) {
                /* tsk->mems_allowed_seq detects a rebind to avoid miscount */
                cookie = read_mems_allowed_begin();
                node = current->il_prev;
                if (current->il_weight && node_isset(node, policy->nodes))
                        break;

                node = next_node_in(node, policy->nodes);
                if (read_mems_allowed_retry(cookie))
                        continue;
                if (node == MAX_NUMNODES)
                        return node;

                current->il_prev = node;
                current->il_weight = get_il_weight(node);
                break;
        }

        current->il_weight--;
        return node;
}

/*
 * Do dynamic interleaving for a process: return the node following
 * current->il_prev in policy->nodes and remember it in il_prev (unless the
 * result is not below MAX_NUMNODES).
 */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
        unsigned int cookie;
        unsigned int nid;

        /* tsk->mems_allowed_seq detects a rebind, preventing a miscount */
        for (;;) {
                cookie = read_mems_allowed_begin();
                nid = next_node_in(current->il_prev, policy->nodes);
                if (!read_mems_allowed_retry(cookie))
                        break;
        }

        if (nid >= MAX_NUMNODES)
                return nid;

        current->il_prev = nid;
        return nid;
}

/*
 * Depending on the memory policy, provide a node from which to allocate the
 * next slab entry.
 *
 * Outside task context, or when current has no mempolicy, numa_mem_id() is
 * returned.  An unknown policy mode is a BUG().
 */
unsigned int mempolicy_slab_node(void)
{
        int local_nid = numa_mem_id();
        struct mempolicy *pol;

        if (!in_task())
                return local_nid;

        pol = current->mempolicy;
        if (!pol)
                return local_nid;

        switch (pol->mode) {
        case MPOL_LOCAL:
                return local_nid;

        case MPOL_PREFERRED:
                return first_node(pol->nodes);

        case MPOL_INTERLEAVE:
                return interleave_nodes(pol);

        case MPOL_WEIGHTED_INTERLEAVE:
                return weighted_interleave_nodes(pol);

        case MPOL_BIND:
        case MPOL_PREFERRED_MANY:
        {
                /*
                 * Follow bind policy behavior and start allocation at the
                 * first node: take the first zone of the local fallback
                 * zonelist that lies within the policy's nodes.
                 */
                struct zonelist *zl =
                        &NODE_DATA(local_nid)->node_zonelists[ZONELIST_FALLBACK];
                struct zoneref *zref =
                        first_zones_zonelist(zl, gfp_zone(GFP_KERNEL),
                                             &pol->nodes);

                if (!zref->zone)
                        return local_nid;
                return zone_to_nid(zref->zone);
        }

        default:
                BUG();
        }
}

/*
 * Copy pol->nodes into the caller-provided @mask, bracketed by compiler
 * barriers, and return the number of nodes set in that copy (which may be
 * zero).  Callers then work on the private copy instead of pol->nodes.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
                                              nodemask_t *mask)
{
        /*
         * barrier stabilizes the nodemask locally so that it can be iterated
         * over safely without concern for changes. Allocators validate node
         * selection does not violate mems_allowed, so this is safe.
         */
        barrier();
        memcpy(mask, &pol->nodes, sizeof(nodemask_t));
        barrier();
        return nodes_weight(*mask);
}

/*
 * Do static weighted interleaving for interleave index @ilx.
 *
 * Works on a local snapshot of pol->nodes.  Each node's weight is read from
 * the RCU-protected iw_table; a missing table or a zero entry counts as
 * weight 1.  @ilx modulo the total weight is then walked across the nodes,
 * consuming each node's weight in turn, and the node on which the remainder
 * lands is returned.  With an empty snapshot, numa_node_id() is returned.
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
        nodemask_t snapshot;
        unsigned int offset, nr_nodes;
        unsigned int total = 0;
        u8 *table;
        u8 w;
        int nid;

        nr_nodes = read_once_policy_nodemask(pol, &snapshot);
        if (!nr_nodes)
                return numa_node_id();

        rcu_read_lock();
        table = rcu_dereference(iw_table);

        /* sum the weights of all nodes in the snapshot */
        for_each_node_mask(nid, snapshot) {
                /* no table or a zero entry means system default usage */
                w = table ? table[nid] : 1;
                if (!w)
                        w = 1;
                total += w;
        }

        /* walk the offset across the nodes, one node's weight at a time */
        offset = ilx % total;
        for (nid = first_node(snapshot); offset;
             nid = next_node_in(nid, snapshot)) {
                /* no table or a zero entry means system default usage */
                w = table ? table[nid] : 1;
                if (!w)
                        w = 1;
                if (offset < w)
                        break;
                offset -= w;
        }
        rcu_read_unlock();

        return nid;
}

/*
 * Do static interleaving for interleave index @ilx.  Returns the ilx'th
 * node of a local snapshot of pol->nodes (counting from ilx=0), wrapping
 * around when ilx exceeds the number of nodes in the snapshot.  With an
 * empty snapshot, numa_node_id() is returned.
 */
static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
        nodemask_t snapshot;
        unsigned int count, hops;
        int nid;

        count = read_once_policy_nodemask(pol, &snapshot);
        if (!count)
                return numa_node_id();

        nid = first_node(snapshot);
        for (hops = ilx % count; hops; hops--)
                nid = next_node(nid, snapshot);

        return nid;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation (or NULL for no filtering), and store the preferred node
 * id in *@nid when the policy overrides the value passed in.
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
                                   pgoff_t ilx, int *nid)
{
        nodemask_t *bind_mask = NULL;

        switch (pol->mode) {
        case MPOL_PREFERRED:
                /* the preferred node replaces the input node id */
                *nid = first_node(pol->nodes);
                return NULL;

        case MPOL_PREFERRED_MANY:
                if (pol->home_node != NUMA_NO_NODE)
                        *nid = pol->home_node;
                return &pol->nodes;

        case MPOL_BIND:
                /* Restrict to nodemask (but not on lower zones) */
                if (apply_policy_zone(pol, gfp_zone(gfp)) &&
                    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
                        bind_mask = &pol->nodes;
                if (pol->home_node != NUMA_NO_NODE)
                        *nid = pol->home_node;
                /*
                 * __GFP_THISNODE shouldn't even be used with the bind policy
                 * because we might easily break the expectation to stay on the
                 * requested node and not break the policy.
                 */
                WARN_ON_ONCE(gfp & __GFP_THISNODE);
                return bind_mask;

        case MPOL_INTERLEAVE:
                /* the interleave target replaces the input node id */
                if (ilx == NO_INTERLEAVE_INDEX)
                        *nid = interleave_nodes(pol);
                else
                        *nid = interleave_nid(pol, ilx);
                return NULL;

        case MPOL_WEIGHTED_INTERLEAVE:
                if (ilx == NO_INTERLEAVE_INDEX)
                        *nid = weighted_interleave_nodes(pol);
                else
                        *nid = weighted_interleave_nid(pol, ilx);
                return NULL;
        }

        /* any other mode: no mask, *nid left untouched */
        return NULL;
}

#ifdef CONFIG_HUGETLBFS
/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
 *
 * Returns a nid suitable for a huge page allocation and, through @mpol, a
 * pointer to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'bind' or 'prefer-many', *@nodemask receives a
 * pointer to the mempolicy's nodemask for filtering the zonelist; it is
 * set to whatever policy_nodemask() returns.
 */
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
                struct mempolicy **mpol, nodemask_t **nodemask)
{
        struct mempolicy *pol;
        pgoff_t ilx;
        int nid = numa_node_id();

        pol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
        *mpol = pol;
        *nodemask = policy_nodemask(gfp_flags, pol, ilx, &nid);

        return nid;
}

/*
 * init_nodemask_of_mempolicy
 *
 * If @mask is NULL or the current task's mempolicy is "default" [NULL],
 * return 'false'.  Otherwise fill @mask and return 'true' to indicate the
 * presence of a non-default mempolicy:
 *  - 'preferred', 'preferred-many', 'bind', 'interleave' and
 *    'weighted-interleave' copy the policy's nodemask into @mask;
 *  - 'local' initializes @mask to the single node numa_node_id().
 * Any other mode is a BUG().
 *
 * The policy is read under task_lock(current); no mpol_get/put reference
 * counting is done.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
        struct mempolicy *pol;

        if (!mask || !current->mempolicy)
                return false;

        task_lock(current);
        pol = current->mempolicy;
        switch (pol->mode) {
        case MPOL_LOCAL:
                init_nodemask_of_node(mask, numa_node_id());
                break;

        case MPOL_BIND:
        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                *mask = pol->nodes;
                break;

        default:
                BUG();
        }
        task_unlock(current);

        return true;
}
#endif

/*
 * mempolicy_in_oom_domain
 *
 * If tsk's mempolicy is "bind", report whether the policy nodemask
 * intersects @mask.  Every other case returns true: a NULL @mask, no
 * mempolicy at all, or any non-bind policy (including "interleave", as a
 * tsk with "interleave" policy may have memory allocated from all nodes in
 * the system).
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
bool mempolicy_in_oom_domain(struct task_struct *tsk,
                                        const nodemask_t *mask)
{
        struct mempolicy *pol;
        bool in_domain;

        if (!mask)
                return true;

        task_lock(tsk);
        pol = tsk->mempolicy;
        in_domain = !pol || pol->mode != MPOL_BIND ||
                    nodes_intersects(pol->nodes, *mask);
        task_unlock(tsk);

        return in_domain;
}

/*
 * Allocate pages for an MPOL_PREFERRED_MANY policy in two passes.
 *
 * The first pass is restricted to @nodemask, adds __GFP_NOWARN and strips
 * __GFP_DIRECT_RECLAIM and __GFP_NOFAIL so it may fail cheaply.  If it
 * fails, the second pass retries with the caller's @gfp and no nodemask,
 * i.e. on all nodes in the system.
 */
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
                                                int nid, nodemask_t *nodemask)
{
        gfp_t first_pass_gfp = (gfp | __GFP_NOWARN) &
                               ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
        struct page *page;

        page = __alloc_pages_noprof(first_pass_gfp, order, nid, nodemask);
        if (page)
                return page;

        return __alloc_pages_noprof(gfp, order, nid, NULL);
}

/**
 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
 * @gfp: GFP flags.
 * @order: Order of the page allocation.
 * @pol: Pointer to the NUMA mempolicy.
 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
 * @nid: Preferred node (usually numa_node_id() but @pol may override it).
 *
 * Return: The page on success or NULL if allocation fails.
 */
struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *pol, pgoff_t ilx, int nid)
{
        nodemask_t *allowed = policy_nodemask(gfp, pol, ilx, &nid);
        struct page *page;

        if (pol->mode == MPOL_PREFERRED_MANY)
                return alloc_pages_preferred_many(gfp, order, nid, allowed);

        /*
         * Filter "hugepage" allocations, unless they come from alloc_pages().
         *
         * For a hugepage allocation under a non-interleave policy which
         * allows the current node (or other explicitly preferred node), only
         * the current/preferred node is tried first, without falling back to
         * other nodes, as the cost of remote accesses would likely offset
         * THP benefits.
         *
         * If the policy is interleave or does not allow the current node in
         * its nodemask, the allocation goes the standard way.
         */
        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
            order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX &&
            pol->mode != MPOL_INTERLEAVE &&
            pol->mode != MPOL_WEIGHTED_INTERLEAVE &&
            (!allowed || node_isset(nid, *allowed))) {
                /*
                 * First, try to allocate THP only on local node, but
                 * don't reclaim unnecessarily, just compact.
                 */
                page = __alloc_pages_node_noprof(nid,
                        gfp | __GFP_THISNODE | __GFP_NORETRY, order);
                if (page || !(gfp & __GFP_DIRECT_RECLAIM))
                        return page;
                /*
                 * If hugepage allocations are configured to always
                 * synchronous compact or the vma has been madvised
                 * to prefer hugepage backing, retry allowing remote
                 * memory with both reclaim and compact as well.
                 */
        }

        page = __alloc_pages_noprof(gfp, order, nid, allowed);

        /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
        if (unlikely(pol->mode == MPOL_INTERLEAVE) && page &&
            static_branch_likely(&vm_numa_stat_key) &&
            page_to_nid(page) == nid) {
                preempt_disable();
                __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
                preempt_enable();
        }

        return page;
}

/**
 * vma_alloc_folio - Allocate a folio for a VMA.
 * @gfp: GFP flags.
 * @order: Order of the folio.
 * @vma: Pointer to VMA.
 * @addr: Virtual address of the allocation.  Must be inside @vma.
 * @hugepage: Unused (was: For hugepages try only preferred node if possible).
 *
 * Allocate a folio for a specific address in @vma, using the appropriate
 * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
 * VMA to prevent it from going away.  Should be used for all allocations
 * for folios that will be mapped into user space, excepting hugetlbfs, and
 * excepting where direct use of alloc_pages_mpol() is more appropriate.
 *
 * The policy obtained for the lookup is released with mpol_cond_put()
 * before returning.
 *
 * Return: The folio on success or NULL if allocation fails.
 */
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr, bool hugepage)
{
        pgoff_t ilx;
        struct mempolicy *pol = get_vma_policy(vma, addr, order, &ilx);
        struct page *page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order,
                                                    pol, ilx, numa_node_id());

        mpol_cond_put(pol);

        return page_rmappable_folio(page);
}
EXPORT_SYMBOL(vma_alloc_folio_noprof);

/**
 * alloc_pages - Allocate pages.
 * @gfp: GFP flags.
 * @order: Power of two of number of pages to allocate.
 *
 * Allocate 1 << @order contiguous pages.  The physical address of the
 * first page is naturally aligned (eg an order-3 allocation will be aligned
 * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
 * process is honoured when in process context; in interrupt context, or
 * when __GFP_THISNODE is set, the system default_policy is used instead.
 *
 * Context: Can be called from any context, providing the appropriate GFP
 * flags are used.
 * Return: The page on success or NULL if allocation fails.
 */
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
{
        struct mempolicy *pol;

        /*
         * Neither current->mempolicy nor the system default_policy needs
         * reference counting here.
         */
        if (in_interrupt() || (gfp & __GFP_THISNODE))
                pol = &default_policy;
        else
                pol = get_task_policy(current);

        return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX,
                                       numa_node_id());
}
EXPORT_SYMBOL(alloc_pages_noprof);

/*
 * Allocate 1 << @order pages with __GFP_COMP added to @gfp via
 * alloc_pages_noprof() and hand the result to page_rmappable_folio().
 */
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
        struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);

        return page_rmappable_folio(page);
}
EXPORT_SYMBOL(folio_alloc_noprof);

/*
 * Bulk-allocate up to @nr_pages pages into @page_array, spread over the
 * nodes of interleave policy @pol.
 *
 * Each node in pol->nodes is asked for nr_pages / nodes pages; the first
 * (nr_pages % nodes) requests get one extra page so the remainder is used
 * up.  The target node of every request comes from interleave_nodes(), so
 * current->il_prev advances as a side effect.
 *
 * Returns the number of pages actually stored in @page_array, which may be
 * fewer than @nr_pages.  Returns 0 without allocating when pol->nodes is
 * empty, matching the empty-nodemask handling in interleave_nid() and in
 * the weighted-interleave bulk allocator, and avoiding a division by zero.
 */
static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
{
        int nodes;
        unsigned long nr_pages_per_node;
        int delta;
        int i;
        unsigned long nr_allocated;
        unsigned long total_allocated = 0;

        nodes = nodes_weight(pol->nodes);
        /* nothing to interleave over; also guards the division below */
        if (!nodes)
                return 0;

        nr_pages_per_node = nr_pages / nodes;
        delta = nr_pages - nodes * nr_pages_per_node;

        for (i = 0; i < nodes; i++) {
                unsigned long want = nr_pages_per_node;

                /* hand out the remainder one page at a time */
                if (delta) {
                        want++;
                        delta--;
                }

                nr_allocated = alloc_pages_bulk_noprof(gfp,
                                interleave_nodes(pol), NULL,
                                want, NULL, page_array);

                page_array += nr_allocated;
                total_allocated += nr_allocated;
        }

        return total_allocated;
}

/*
 * Bulk-allocate up to @nr_pages pages into @page_array for a weighted
 * interleave policy, giving each node a share proportional to its weight.
 *
 * The function works in three steps:
 *  1. Take a stable local copy of pol->nodes; return 0 if it is empty.
 *  2. If the task still has unused weight (current->il_weight) on
 *     current->il_prev and that node is in the copy, allocate from it first.
 *     When that request covers everything, il_weight is reduced and the
 *     function returns.
 *  3. Copy the iw_table weights (zero or missing entries count as 1), split
 *     the remaining pages into full rounds over all nodes plus a partial
 *     round (delta), and issue one bulk request per node.
 *
 * On exit from step 3, current->il_prev / current->il_weight are set to the
 * node and remaining weight from which a later allocation should resume.
 *
 * Returns the number of pages stored in @page_array.  Note that the
 * temporary weights buffer is allocated with GFP_KERNEL, independent of
 * @gfp; if that allocation fails, only the pages obtained in step 2 are
 * returned.
 */
static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
{
        struct task_struct *me = current;
        unsigned int cpuset_mems_cookie;
        unsigned long total_allocated = 0;
        unsigned long nr_allocated = 0;
        unsigned long rounds;
        unsigned long node_pages, delta;
        u8 *table, *weights, weight;
        unsigned int weight_total = 0;
        unsigned long rem_pages = nr_pages;
        nodemask_t nodes;
        int nnodes, node;
        int resume_node = MAX_NUMNODES - 1;
        u8 resume_weight = 0;
        int prev_node;
        int i;

        if (!nr_pages)
                return 0;

        /* read the nodes onto the stack, retry if done during rebind */
        do {
                cpuset_mems_cookie = read_mems_allowed_begin();
                nnodes = read_once_policy_nodemask(pol, &nodes);
        } while (read_mems_allowed_retry(cpuset_mems_cookie));

        /* if the nodemask has become invalid, we cannot do anything */
        if (!nnodes)
                return 0;

        /* Continue allocating from most recent node and adjust the nr_pages */
        node = me->il_prev;
        weight = me->il_weight;
        if (weight && node_isset(node, nodes)) {
                node_pages = min(rem_pages, weight);
                nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
                                                  NULL, page_array);
                page_array += nr_allocated;
                total_allocated += nr_allocated;
                /* if that's all the pages, no need to interleave */
                if (rem_pages <= weight) {
                        me->il_weight -= rem_pages;
                        return total_allocated;
                }
                /* Otherwise we adjust remaining pages, continue from there */
                rem_pages -= weight;
        }
        /* clear active weight in case of an allocation failure */
        me->il_weight = 0;
        prev_node = node;

        /* create a local copy of node weights to operate on outside rcu */
        weights = kzalloc(nr_node_ids, GFP_KERNEL);
        if (!weights)
                return total_allocated;

        /* without a table the zeroed buffer is kept; zeros become 1 below */
        rcu_read_lock();
        table = rcu_dereference(iw_table);
        if (table)
                memcpy(weights, table, nr_node_ids);
        rcu_read_unlock();

        /* calculate total, detect system default usage */
        for_each_node_mask(node, nodes) {
                if (!weights[node])
                        weights[node] = 1;
                weight_total += weights[node];
        }

        /*
         * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
         * Track which node weighted interleave should resume from.
         *
         * if (rounds > 0) and (delta == 0), resume_node will always be
         * the node following prev_node and its weight.
         */
        rounds = rem_pages / weight_total;
        delta = rem_pages % weight_total;
        resume_node = next_node_in(prev_node, nodes);
        resume_weight = weights[resume_node];
        /* visit each node once, starting after the node used above */
        for (i = 0; i < nnodes; i++) {
                node = next_node_in(prev_node, nodes);
                weight = weights[node];
                node_pages = weight * rounds;
                /* If a delta exists, add this node's portion of the delta */
                if (delta > weight) {
                        node_pages += weight;
                        delta -= weight;
                } else if (delta) {
                        /* when delta is depleted, resume from that node */
                        node_pages += delta;
                        resume_node = node;
                        resume_weight = weight - delta;
                        delta = 0;
                }
                /* node_pages can be 0 if an allocation fails and rounds == 0 */
                if (!node_pages)
                        break;
                nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
                                                  NULL, page_array);
                page_array += nr_allocated;
                total_allocated += nr_allocated;
                if (total_allocated == nr_pages)
                        break;
                prev_node = node;
        }
        /* record where the next weighted-interleave allocation resumes */
        me->il_prev = resume_node;
        me->il_weight = resume_weight;
        kfree(weights);
        return total_allocated;
}

/*
 * Bulk-allocate up to @nr_pages pages into @page_array for an
 * MPOL_PREFERRED_MANY policy.
 *
 * The first request is restricted to pol->nodes around @nid, with
 * __GFP_NOWARN added and __GFP_DIRECT_RECLAIM / __GFP_NOFAIL removed.
 * Whatever is still missing is then requested with the caller's @gfp from
 * numa_node_id() without a nodemask.  Returns the total number of pages
 * allocated.
 */
static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
{
        gfp_t first_pass_gfp = (gfp | __GFP_NOWARN) &
                               ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
        unsigned long got;

        got = alloc_pages_bulk_noprof(first_pass_gfp, nid, &pol->nodes,
                                      nr_pages, NULL, page_array);
        if (got >= nr_pages)
                return got;

        return got + alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
                                             nr_pages - got, NULL,
                                             page_array + got);
}

/*
 * Bulk page allocation that takes the mempolicy into account, as needed in
 * some situations such as vmalloc.
 *
 * It can speed up memory allocation, especially for interleaved
 * allocations.
 *
 * The task policy is used unless running in interrupt context or
 * __GFP_THISNODE is set, in which case default_policy applies.  Interleave,
 * weighted-interleave and preferred-many policies are dispatched to their
 * dedicated helpers; every other mode issues a single bulk request using
 * the node and nodemask chosen by policy_nodemask().
 */
unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
                unsigned long nr_pages, struct page **page_array)
{
        struct mempolicy *pol;
        nodemask_t *nodemask;
        int nid;

        if (in_interrupt() || (gfp & __GFP_THISNODE))
                pol = &default_policy;
        else
                pol = get_task_policy(current);

        switch (pol->mode) {
        case MPOL_INTERLEAVE:
                return alloc_pages_bulk_array_interleave(gfp, pol,
                                                         nr_pages, page_array);
        case MPOL_WEIGHTED_INTERLEAVE:
                return alloc_pages_bulk_array_weighted_interleave(
                                  gfp, pol, nr_pages, page_array);
        case MPOL_PREFERRED_MANY:
                return alloc_pages_bulk_array_preferred_many(gfp,
                                numa_node_id(), pol, nr_pages, page_array);
        default:
                break;
        }

        /* policy_nodemask() may overwrite nid, so resolve it before use */
        nid = numa_node_id();
        nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);

        return alloc_pages_bulk_noprof(gfp, nid, nodemask,
                                       nr_pages, NULL, page_array);
}

/*
 * Duplicate the policy of @src with mpol_dup() and install the copy as the
 * policy of @dst.  Returns 0 on success, or the error encoded in the
 * mpol_dup() result, in which case @dst is left untouched.
 */
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
        struct mempolicy *copy = mpol_dup(src->vm_policy);

        if (!IS_ERR(copy)) {
                dst->vm_policy = copy;
                return 0;
        }

        return PTR_ERR(copy);
}

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */

/*
 * Slow path of a mempolicy duplicate.
 *
 * Allocates a new mempolicy from policy_cache and copies @old into it.  The
 * copy is rebound to cpuset_mems_allowed(current) when the current cpuset
 * is being rebound, and its reference count is set to 1.  Returns the new
 * policy, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        int is_task_policy;

        if (!new)
                return ERR_PTR(-ENOMEM);

        /* task's mempolicy is protected by alloc_lock */
        is_task_policy = (old == current->mempolicy);
        if (is_task_policy)
                task_lock(current);
        *new = *old;
        if (is_task_policy)
                task_unlock(current);

        if (current_cpuset_is_being_rebound()) {
                nodemask_t allowed = cpuset_mems_allowed(current);

                mpol_rebind_policy(new, &allowed);
        }

        atomic_set(&new->refcnt, 1);
        return new;
}

/*
 * Slow path of a mempolicy comparison.
 *
 * Two policies are equal when both are non-NULL, their mode, flags and
 * home_node match, their user nodemasks match (only checked when
 * mpol_store_user_nodemask(a) is true), and, for every mode other than
 * MPOL_LOCAL, their nodes match.  An unknown mode is a BUG().
 */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return false;

        if (a->mode != b->mode || a->flags != b->flags ||
            a->home_node != b->home_node)
                return false;

        if (mpol_store_user_nodemask(a) &&
            !nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
                return false;

        switch (a->mode) {
        case MPOL_LOCAL:
                return true;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
                return !!nodes_equal(a->nodes, b->nodes);
        default:
                BUG();
                return false;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * Look up the first element intersecting [start, end).  Returns NULL when
 * no element of the tree overlaps the range.  Caller holds sp->lock for
 * reading or for writing.
 */
static struct sp_node *sp_lookup(struct shared_policy *sp,
                                        pgoff_t start, pgoff_t end)
{
        struct rb_node *hit = sp->root.rb_node;
        struct rb_node *prev;

        /* descend until some node overlapping the range is found */
        while (hit) {
                struct sp_node *cur = rb_entry(hit, struct sp_node, nd);

                if (start >= cur->end)
                        hit = hit->rb_right;
                else if (end <= cur->start)
                        hit = hit->rb_left;
                else
                        break;
        }
        if (!hit)
                return NULL;

        /* step back while the predecessor still reaches past start */
        for (prev = rb_prev(hit); prev; prev = rb_prev(hit)) {
                if (rb_entry(prev, struct sp_node, nd)->end <= start)
                        break;
                hit = prev;
        }

        return rb_entry(hit, struct sp_node, nd);
}

/*
 * Insert a new shared policy node into the tree of @sp.  The tree is
 * descended by comparing range bounds; meeting a node that @new neither
 * starts before nor ends after is a BUG().  Caller holds sp->lock for
 * writing.
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **link = &sp->root.rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct sp_node *cur;

                parent = *link;
                cur = rb_entry(parent, struct sp_node, nd);
                if (new->start < cur->start)
                        link = &parent->rb_left;
                else if (new->end > cur->end)
                        link = &parent->rb_right;
                else
                        BUG();
        }

        rb_link_node(&new->nd, parent, link);
        rb_insert_color(&new->nd, &sp->root);
}

/*
 * Look up the shared policy covering page offset @idx.
 *
 * Returns the policy with a reference taken via mpol_get(), or NULL when
 * the tree is empty or no node covers @idx.
 */
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
                                                pgoff_t idx)
{
        struct mempolicy *found = NULL;
        struct sp_node *node;

        /* Empty tree: skip taking the lock altogether. */
        if (!sp->root.rb_node)
                return NULL;

        read_lock(&sp->lock);
        node = sp_lookup(sp, idx, idx + 1);
        if (node) {
                found = node->policy;
                mpol_get(found);
        }
        read_unlock(&sp->lock);

        return found;
}

/* Drop the node's policy reference, then return the node to sn_cache. */
static void sp_free(struct sp_node *n)
{
        struct mempolicy *pol = n->policy;

        mpol_put(pol);
        kmem_cache_free(sn_cache, n);
}

/**
 * mpol_misplaced - check whether current folio node is valid in policy
 *
 * @folio: folio to be checked
 * @vmf: structure describing the fault
 * @addr: virtual address in @vmf->vma for shared policy lookup and
 *        interleave policy
 *
 * Lookup current policy node id for vma,addr and "compare to" folio's
 * node id.  Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 * The caller must hold @vmf->ptl (asserted via lockdep below).
 *
 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
 * policy, or a suitable node ID to allocate a replacement folio from.
 */
int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
                   unsigned long addr)
{
        struct mempolicy *pol;
        pgoff_t ilx;
        struct zoneref *z;
        int curnid = folio_nid(folio);
        struct vm_area_struct *vma = vmf->vma;
        int thiscpu = raw_smp_processor_id();
        int thisnid = numa_node_id();
        int polnid = NUMA_NO_NODE;
        int ret = NUMA_NO_NODE;

        /*
         * Make sure ptl is held so that we don't preempt and we
         * have a stable smp processor id
         */
        lockdep_assert_held(vmf->ptl);
        pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
        /* Without MPOL_F_MOF the folio is never reported as misplaced. */
        if (!(pol->flags & MPOL_F_MOF))
                goto out;

        /* Work out the node the policy would pick (polnid) for this folio. */
        switch (pol->mode) {
        case MPOL_INTERLEAVE:
                polnid = interleave_nid(pol, ilx);
                break;

        case MPOL_WEIGHTED_INTERLEAVE:
                polnid = weighted_interleave_nid(pol, ilx);
                break;

        case MPOL_PREFERRED:
                /* Already on a node in the policy mask: not misplaced. */
                if (node_isset(curnid, pol->nodes))
                        goto out;
                polnid = first_node(pol->nodes);
                break;

        case MPOL_LOCAL:
                polnid = numa_node_id();
                break;

        case MPOL_BIND:
        case MPOL_PREFERRED_MANY:
                /*
                 * Even though MPOL_PREFERRED_MANY can allocate pages outside
                 * policy nodemask we don't allow numa migration to nodes
                 * outside policy nodemask for now. This is done so that if we
                 * want demotion to slow memory to happen, before allocating
                 * from some DRAM node say 'x', we will end up using a
                 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
                 * we should not promote to node 'x' from slow memory node.
                 */
                if (pol->flags & MPOL_F_MORON) {
                        /*
                         * Optimize placement among multiple nodes
                         * via NUMA balancing.  Only continue to the
                         * MPOL_F_MORON handling after the switch when the
                         * local node is inside the policy mask.
                         */
                        if (node_isset(thisnid, pol->nodes))
                                break;
                        goto out;
                }

                /*
                 * use current page if in policy nodemask,
                 * else select nearest allowed node, if any.
                 * If no allowed nodes, use current [!misplaced].
                 */
                if (node_isset(curnid, pol->nodes))
                        goto out;
                z = first_zones_zonelist(
                                node_zonelist(thisnid, GFP_HIGHUSER),
                                gfp_zone(GFP_HIGHUSER),
                                &pol->nodes);
                polnid = zone_to_nid(z->zone);
                break;

        default:
                BUG();
        }

        /* Migrate the folio towards the node whose CPU is referencing it */
        if (pol->flags & MPOL_F_MORON) {
                /* Overrides whatever node the switch above selected. */
                polnid = thisnid;

                if (!should_numa_migrate_memory(current, folio, curnid,
                                                thiscpu))
                        goto out;
        }

        /* Only report a target node when it differs from the current one. */
        if (curnid != polnid)
                ret = polnid;
out:
        mpol_cond_put(pol);

        return ret;
}

/*
 * Detach @task's mempolicy and release the (possibly last) reference to it.
 *
 * The pointer is cleared under task_lock() first and the reference is only
 * dropped afterwards, so that any allocation performed while the policy is
 * being freed (for instance by KASAN inside kmem_cache_free()) cannot see
 * a policy that is already gone.
 */
void mpol_put_task_policy(struct task_struct *task)
{
        struct mempolicy *detached;

        task_lock(task);
        detached = task->mempolicy;
        task->mempolicy = NULL;
        task_unlock(task);

        mpol_put(detached);
}

/* Unlink @n from the shared policy rb-tree and free it. */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        struct rb_root *tree = &sp->root;

        rb_erase(&n->nd, tree);
        sp_free(n);
}

/* Fill in @node so that it describes [start, end) governed by @pol. */
static void sp_node_init(struct sp_node *node, unsigned long start,
                        unsigned long end, struct mempolicy *pol)
{
        node->policy = pol;
        node->start = start;
        node->end = end;
}

/*
 * Allocate a node for [start, end) carrying a private duplicate of @pol
 * flagged MPOL_F_SHARED.  Returns NULL when either the node allocation or
 * the policy duplication fails.
 */
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                struct mempolicy *pol)
{
        struct mempolicy *copy;
        struct sp_node *node = kmem_cache_alloc(sn_cache, GFP_KERNEL);

        if (!node)
                return NULL;

        copy = mpol_dup(pol);
        if (!IS_ERR(copy)) {
                copy->flags |= MPOL_F_SHARED;
                sp_node_init(node, start, end, copy);
                return node;
        }

        /* Duplication failed: give the node back. */
        kmem_cache_free(sn_cache, node);
        return NULL;
}

/*
 * Replace a policy range.
 *
 * Removes or trims every existing node that overlaps [start, end) and then
 * inserts @new (if non-NULL) into the tree.  When an existing node spans the
 * whole new range it has to be split in two, which needs an extra sp_node
 * and mempolicy; those are allocated with the lock dropped and the whole
 * operation is then restarted.
 *
 * Returns 0 on success or -ENOMEM if the allocations needed for a split
 * fail.
 */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
                                 pgoff_t end, struct sp_node *new)
{
        struct sp_node *n;
        struct sp_node *n_new = NULL;
        struct mempolicy *mpol_new = NULL;
        int ret = 0;

restart:
        write_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                /* Fetch the successor first: n may be deleted below. */
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        /* Old node begins inside the new range. */
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                /* Need preallocated memory for the tail piece. */
                                if (!n_new)
                                        goto alloc_new;

                                /* Tail piece gets its own copy of the policy. */
                                *mpol_new = *n->policy;
                                atomic_set(&mpol_new->refcnt, 1);
                                sp_node_init(n_new, end, n->end, mpol_new);
                                n->end = start;
                                sp_insert(sp, n_new);
                                /* Ownership moved to the tree; don't free below. */
                                n_new = NULL;
                                mpol_new = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        write_unlock(&sp->lock);
        ret = 0;

err_out:
        /* Release any preallocation that ended up unused. */
        if (mpol_new)
                mpol_put(mpol_new);
        if (n_new)
                kmem_cache_free(sn_cache, n_new);

        return ret;

alloc_new:
        /* GFP_KERNEL allocations are done with the lock dropped. */
        write_unlock(&sp->lock);
        ret = -ENOMEM;
        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!n_new)
                goto err_out;
        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!mpol_new)
                goto err_out;
        atomic_set(&mpol_new->refcnt, 1);
        goto restart;
}

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol:  struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 *
 * Failures are not reported to the caller: on any error the tree is simply
 * left empty and the incoming reference on @mpol is still dropped.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
        int ret;

        sp->root = RB_ROOT;                /* empty tree == default mempolicy */
        rwlock_init(&sp->lock);

        if (mpol) {
                struct sp_node *sn;
                struct mempolicy *npol;
                NODEMASK_SCRATCH(scratch);

                if (!scratch)
                        goto put_mpol;

                /* contextualize the tmpfs mount point mempolicy to this file */
                npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
                if (IS_ERR(npol))
                        goto free_scratch; /* no valid nodemask intersection */

                /* mpol_set_nodemask() is called with current's task lock held */
                task_lock(current);
                ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
                task_unlock(current);
                if (ret)
                        goto put_npol;

                /* alloc node covering entire file; adds ref to file's npol */
                sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
                if (sn)
                        sp_insert(sp, sn);
put_npol:
                mpol_put(npol);        /* drop initial ref on file's npol */
free_scratch:
                NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
                mpol_put(mpol);        /* drop our incoming ref on sb mpol */
        }
}

/*
 * Apply @pol to the page-offset range backing @vma in the shared policy
 * tree.  A NULL @pol only removes whatever policies currently cover that
 * range.  Returns 0 on success or a negative errno.
 */
int mpol_set_shared_policy(struct shared_policy *sp,
                        struct vm_area_struct *vma, struct mempolicy *pol)
{
        unsigned long first = vma->vm_pgoff;
        unsigned long last = first + vma_pages(vma);
        struct sp_node *node = NULL;
        int err;

        if (pol) {
                node = sp_alloc(first, last, pol);
                if (!node)
                        return -ENOMEM;
        }

        err = shared_policy_replace(sp, first, last, node);
        /* On failure the node never made it into the tree: free it here. */
        if (err && node)
                sp_free(node);

        return err;
}

/*
 * Tear down an inode's shared policy store by deleting every node in the
 * tree.  Does nothing when the tree is already empty.
 */
void mpol_free_shared_policy(struct shared_policy *sp)
{
        struct rb_node *pos;
        struct rb_node *following;

        if (!sp->root.rb_node)
                return;

        write_lock(&sp->lock);
        for (pos = rb_first(&sp->root); pos; pos = following) {
                struct sp_node *victim = rb_entry(pos, struct sp_node, nd);

                /* Grab the successor before the node is erased and freed. */
                following = rb_next(pos);
                sp_delete(sp, victim);
        }
        write_unlock(&sp->lock);
}

#ifdef CONFIG_NUMA_BALANCING
/* Set by setup_numabalancing(): 1 forces enable, -1 forces disable, 0 unset. */
static int __initdata numabalancing_override;

/*
 * Decide the boot-time NUMA balancing state.  An explicit numa_balancing=
 * override always wins; otherwise the Kconfig default is applied, but only
 * when more than one node is online.
 */
static void __init check_numabalancing_enable(void)
{
        const bool enable_by_default =
                IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED);

        if (numabalancing_override) {
                set_numabalancing_state(numabalancing_override == 1);
                return;
        }

        if (num_online_nodes() <= 1)
                return;

        pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
                enable_by_default ? "Enabling" : "Disabling");
        set_numabalancing_state(enable_by_default);
}

/*
 * Parse the numa_balancing= boot parameter.  Accepts "enable" or "disable"
 * and records the choice in numabalancing_override.  Returns 1 when the
 * value was recognised, 0 (after a warning) otherwise.
 */
static int __init setup_numabalancing(char *str)
{
        int choice = 0;

        if (str) {
                if (strcmp(str, "enable") == 0)
                        choice = 1;
                else if (strcmp(str, "disable") == 0)
                        choice = -1;
        }

        if (!choice) {
                pr_warn("Unable to parse numa_balancing=\n");
                return 0;
        }

        numabalancing_override = choice;
        return 1;
}
__setup("numa_balancing=", setup_numabalancing);
#else
/* Stub used when CONFIG_NUMA_BALANCING is not set: nothing to do. */
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Boot-time setup for memory policies: create the slab caches, fill in the
 * per-node preferred policies, install an interleave policy for system
 * init and finally settle the NUMA balancing state.
 */
void __init numa_policy_init(void)
{
        nodemask_t il_nodes;
        unsigned long biggest_pages = 0;
        int biggest_nid = 0;
        int nid;

        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL);

        for_each_node(nid) {
                preferred_node_policy[nid] = (struct mempolicy) {
                        .refcnt = ATOMIC_INIT(1),
                        .mode = MPOL_PREFERRED,
                        .flags = MPOL_F_MOF | MPOL_F_MORON,
                        .nodes = nodemask_of_node(nid),
                };
        }

        /*
         * Interleave across every memory node of at least 16MB.  While
         * scanning, remember the largest node so it can serve as the
         * fallback when no node reaches that size.
         */
        nodes_clear(il_nodes);
        for_each_node_state(nid, N_MEMORY) {
                unsigned long pages = node_present_pages(nid);

                if (pages > biggest_pages) {
                        biggest_pages = pages;
                        biggest_nid = nid;
                }

                if ((pages << PAGE_SHIFT) >= (16 << 20))
                        node_set(nid, il_nodes);
        }

        /* Every node was too small: fall back to the largest one. */
        if (unlikely(nodes_empty(il_nodes)))
                node_set(biggest_nid, il_nodes);

        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &il_nodes))
                pr_err("%s: interleaving failed\n", __func__);

        check_numabalancing_enable();
}

/*
 * Reset policy of current process to default.
 *
 * The return value of do_set_mempolicy() is ignored.
 */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 *
 * policy_modes[] maps each MPOL_* mode to its textual name.  It is used by
 * mpol_parse_str() to recognise a mode name and by mpol_to_str() to print
 * one.
 */
static const char * const policy_modes[] =
{
        [MPOL_DEFAULT]    = "default",
        [MPOL_PREFERRED]  = "prefer",
        [MPOL_BIND]       = "bind",
        [MPOL_INTERLEAVE] = "interleave",
        [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
        [MPOL_LOCAL]      = "local",
        [MPOL_PREFERRED_MANY]  = "prefer (many)",
};

#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *        <mode>[=<flags>][:<nodelist>]
 *
 * @str is modified in place while parsing (the '=' and ':' separators are
 * temporarily replaced by NUL) and restored before returning.
 *
 * On success *@mpol is set to the new policy; for mode "default" it is set
 * to NULL.  On failure *@mpol is left untouched.
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        struct mempolicy *new = NULL;
        unsigned short mode_flags;
        nodemask_t nodes;
        char *nodelist = strchr(str, ':');
        char *flags = strchr(str, '=');
        int err = 1, mode;

        if (flags)
                *flags++ = '\0';        /* terminate mode string */

        if (nodelist) {
                /* NUL-terminate mode or flags string */
                *nodelist++ = '\0';
                if (nodelist_parse(nodelist, nodes))
                        goto out;
                /* Every listed node must be a node that has memory. */
                if (!nodes_subset(nodes, node_states[N_MEMORY]))
                        goto out;
        } else
                nodes_clear(nodes);

        mode = match_string(policy_modes, MPOL_MAX, str);
        if (mode < 0)
                goto out;

        /* Per-mode validation of the presence and shape of the nodelist. */
        switch (mode) {
        case MPOL_PREFERRED:
                /*
                 * Insist on a nodelist of one node only, although later
                 * we use first_node(nodes) to grab a single node, so here
                 * nodelist (or nodes) cannot be empty.
                 */
                if (nodelist) {
                        char *rest = nodelist;
                        while (isdigit(*rest))
                                rest++;
                        if (*rest)
                                goto out;
                        if (nodes_empty(nodes))
                                goto out;
                }
                break;
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                /*
                 * Default to online nodes with memory if no nodelist
                 */
                if (!nodelist)
                        nodes = node_states[N_MEMORY];
                break;
        case MPOL_LOCAL:
                /*
                 * Don't allow a nodelist;  mpol_new() checks flags
                 */
                if (nodelist)
                        goto out;
                break;
        case MPOL_DEFAULT:
                /*
                 * Insist on an empty nodelist.  Success here leaves
                 * new == NULL, which is what gets stored in *mpol.
                 */
                if (!nodelist)
                        err = 0;
                goto out;
        case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
                /*
                 * Insist on a nodelist
                 */
                if (!nodelist)
                        goto out;
        }

        mode_flags = 0;
        if (flags) {
                /*
                 * Currently, we only support two mutually exclusive
                 * mode flags.
                 */
                if (!strcmp(flags, "static"))
                        mode_flags |= MPOL_F_STATIC_NODES;
                else if (!strcmp(flags, "relative"))
                        mode_flags |= MPOL_F_RELATIVE_NODES;
                else
                        goto out;
        }

        new = mpol_new(mode, mode_flags, &nodes);
        if (IS_ERR(new))
                goto out;

        /*
         * Save nodes for mpol_to_str() to show the tmpfs mount options
         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
         */
        if (mode != MPOL_PREFERRED) {
                new->nodes = nodes;
        } else if (nodelist) {
                nodes_clear(new->nodes);
                node_set(first_node(nodes), new->nodes);
        } else {
                /* "prefer" without a nodelist is stored as MPOL_LOCAL. */
                new->mode = MPOL_LOCAL;
        }

        /*
         * Save nodes for contextualization: this will be used to "clone"
         * the mempolicy in a specific context [cpuset] at a later time.
         */
        new->w.user_nodemask = nodes;

        err = 0;

out:
        /* Restore string for error message */
        if (nodelist)
                *--nodelist = ':';
        if (flags)
                *--flags = '=';
        if (!err)
                *mpol = new;
        return err;
}
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * A NULL @pol, the default policy, or a policy with MPOL_F_MORON set is
 * printed as "default".
 *
 * Recommend a @maxlen of at least 32; note that the longest mode name,
 * "weighted interleave", together with the longest flag, "relative", already
 * takes 28 characters, so a larger buffer is needed to also display more
 * than a couple of node ids.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
        char *p = buffer;
        nodemask_t nodes = NODE_MASK_NONE;
        unsigned short mode = MPOL_DEFAULT;
        unsigned short flags = 0;

        if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
                mode = pol->mode;
                flags = pol->flags;
        }

        switch (mode) {
        case MPOL_DEFAULT:
        case MPOL_LOCAL:
                break;
        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                nodes = pol->nodes;
                break;
        default:
                WARN_ON_ONCE(1);
                snprintf(p, maxlen, "unknown");
                return;
        }

        /*
         * Use scnprintf() throughout: it returns the number of characters
         * actually stored, so @p can never advance past the end of @buffer
         * and the remaining-size expression below can never go negative
         * when the output is truncated.
         */
        p += scnprintf(p, maxlen, "%s", policy_modes[mode]);

        if (flags & MPOL_MODE_FLAGS) {
                p += scnprintf(p, buffer + maxlen - p, "=");

                /*
                 * Currently, the only defined flags are mutually exclusive
                 */
                if (flags & MPOL_F_STATIC_NODES)
                        p += scnprintf(p, buffer + maxlen - p, "static");
                else if (flags & MPOL_F_RELATIVE_NODES)
                        p += scnprintf(p, buffer + maxlen - p, "relative");
        }

        if (!nodes_empty(nodes))
                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
                               nodemask_pr_args(&nodes));
}

#ifdef CONFIG_SYSFS
/*
 * Per-node sysfs attribute for the weighted interleave weights.  The
 * show/store handlers recover this wrapper from the embedded kobj_attribute
 * with container_of() to learn which node the file refers to.
 */
struct iw_node_attr {
        struct kobj_attribute kobj_attr;        /* embedded sysfs attribute */
        int nid;                                /* node id this file controls */
};

/* sysfs read handler: print the interleave weight of the attribute's node. */
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
                         char *buf)
{
        struct iw_node_attr *wi_attr =
                container_of(attr, struct iw_node_attr, kobj_attr);
        u8 weight = get_il_weight(wi_attr->nid);

        return sysfs_emit(buf, "%d\n", weight);
}

/*
 * sysfs write handler: set the interleave weight of the attribute's node.
 *
 * An empty write stores a weight of 0.  A fresh table is built from a copy
 * of the current one (if any), published with rcu_assign_pointer() under
 * iw_table_lock, and the previous table is freed after synchronize_rcu().
 *
 * Returns @count on success, -EINVAL for an unparsable value or -ENOMEM.
 */
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
                          const char *buf, size_t count)
{
        struct iw_node_attr *wi_attr =
                container_of(attr, struct iw_node_attr, kobj_attr);
        bool blank = !count || sysfs_streq(buf, "");
        u8 weight = 0;
        u8 *table;
        u8 *prev;

        if (!blank && kstrtou8(buf, 0, &weight))
                return -EINVAL;

        table = kzalloc(nr_node_ids, GFP_KERNEL);
        if (!table)
                return -ENOMEM;

        mutex_lock(&iw_table_lock);
        prev = rcu_dereference_protected(iw_table,
                                         lockdep_is_held(&iw_table_lock));
        if (prev)
                memcpy(table, prev, nr_node_ids);
        table[wi_attr->nid] = weight;
        rcu_assign_pointer(iw_table, table);
        mutex_unlock(&iw_table_lock);

        /* Wait for readers of the previous table before freeing it. */
        synchronize_rcu();
        kfree(prev);

        return count;
}

/* Array of per-node attribute pointers, indexed by node id. */
static struct iw_node_attr **node_attrs;

/*
 * Remove one node's sysfs file from @parent and free the attribute along
 * with its name.  A NULL @node_attr is ignored.
 */
static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
                                  struct kobject *parent)
{
        if (node_attr) {
                sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
                kfree(node_attr->kobj_attr.attr.name);
                kfree(node_attr);
        }
}

/*
 * kobject release callback for the "weighted_interleave" directory.
 *
 * Runs once the last reference to @wi_kobj has been dropped.  Tears down
 * every per-node attribute and then frees the kobject itself, which was
 * allocated with kzalloc().  A release callback must free the object rather
 * than call kobject_put() on it: the reference count is already zero here,
 * so another put would underflow it and the memory would never be freed.
 */
static void sysfs_wi_release(struct kobject *wi_kobj)
{
        int i;

        for (i = 0; i < nr_node_ids; i++)
                sysfs_wi_node_release(node_attrs[i], wi_kobj);
        kfree(wi_kobj);
}

/*
 * kobj_type of the "weighted_interleave" directory: attributes go through
 * the generic kobj_sysfs_ops and teardown is done by sysfs_wi_release().
 */
static const struct kobj_type wi_ktype = {
        .sysfs_ops = &kobj_sysfs_ops,
        .release = sysfs_wi_release,
};

static int add_weight_node(int nid, struct kobject *wi_kobj)
{
        struct iw_node_attr *node_attr;
        char *name;

        node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
        if (!node_attr)
                return -ENOMEM;

        name = kasprintf(GFP_KERNEL, "node%d", nid);
        if (!name) {
                kfree(node_attr);
                return -ENOMEM;
        }

        sysfs_attr_init(&node_attr->kobj_attr.attr);
        node_attr->kobj_attr.attr.name = name;
        node_attr->kobj_attr.attr.mode = 0644;
        node_attr->kobj_attr.show = node_show;
        node_attr->kobj_attr.store = node_store;
        node_attr->nid = nid;

        if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
                kfree(node_attr->kobj_attr.attr.name);
                kfree(node_attr);
                pr_err("failed to add attribute to weighted_interleave\n");
                return -ENOMEM;
        }

        node_attrs[nid] = node_attr;
        return 0;
}

/*
 * Create the "weighted_interleave" directory under @root_kobj and populate
 * it with one attribute file per possible node.
 *
 * Returns 0 on success or a negative errno.  When adding a node attribute
 * fails, the directory kobject is dropped and the error is reported to the
 * caller instead of being swallowed.
 */
static int add_weighted_interleave_group(struct kobject *root_kobj)
{
        struct kobject *wi_kobj;
        int nid, err;

        wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
        if (!wi_kobj)
                return -ENOMEM;

        err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
                                   "weighted_interleave");
        if (err) {
                kfree(wi_kobj);
                return err;
        }

        for_each_node_state(nid, N_POSSIBLE) {
                err = add_weight_node(nid, wi_kobj);
                if (err) {
                        pr_err("failed to add sysfs [node%d]\n", nid);
                        /* Drops the only reference; ->release cleans up. */
                        kobject_put(wi_kobj);
                        return err;
                }
        }

        return 0;
}

static void mempolicy_kobj_release(struct kobject *kobj)
{
        u8 *old;

        mutex_lock(&iw_table_lock);
        old = rcu_dereference_protected(iw_table,
                                        lockdep_is_held(&iw_table_lock));
        rcu_assign_pointer(iw_table, NULL);
        mutex_unlock(&iw_table_lock);
        synchronize_rcu();
        kfree(old);
        kfree(node_attrs);
        kfree(kobj);
}

/* kobj_type of the "mempolicy" directory; only a release hook is needed. */
static const struct kobj_type mempolicy_ktype = {
        .release = mempolicy_kobj_release
};

/*
 * Create the "mempolicy" sysfs directory under mm_kobj together with its
 * weighted_interleave group.
 *
 * Returns 0 on success or a negative errno.  Once kobject_init_and_add()
 * has been called on the kobject, every failure path releases it with
 * kobject_put() so that mempolicy_kobj_release() frees the kobject and
 * node_attrs; a bare kfree() at that point would leak what the kobject core
 * allocated for it (such as its name).
 */
static int __init mempolicy_sysfs_init(void)
{
        int err;
        static struct kobject *mempolicy_kobj;

        mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
        if (!mempolicy_kobj) {
                err = -ENOMEM;
                goto err_out;
        }

        node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
                             GFP_KERNEL);
        if (!node_attrs) {
                err = -ENOMEM;
                goto mempol_out;
        }

        err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
                                   "mempolicy");
        if (err) {
                /* ->release frees both node_attrs and the kobject. */
                kobject_put(mempolicy_kobj);
                goto err_out;
        }

        err = add_weighted_interleave_group(mempolicy_kobj);
        if (err) {
                pr_err("mempolicy sysfs structure failed to initialize\n");
                kobject_put(mempolicy_kobj);
                return err;
        }

        return err;
mempol_out:
        /* The kobject was never initialised, so a plain kfree() is right. */
        kfree(mempolicy_kobj);
err_out:
        pr_err("failed to add mempolicy kobject to the system\n");
        return err;
}

late_initcall(mempolicy_sysfs_init);
#endif /* CONFIG_SYSFS */












































































































    3 



    3 
    3 



















































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/virtio.h>
#include <linux/spinlock.h>
#include <linux/virtio_config.h>
#include <linux/virtio_anchor.h>
#include <linux/module.h>
#include <linux/idr.h>
#include <linux/of.h>
#include <uapi/linux/virtio_ids.h>

/* ID allocator providing unique numbering for virtio devices. */
static DEFINE_IDA(virtio_index_ida);

/* sysfs "device" attribute: the virtio device ID as 0x-prefixed hex. */
static ssize_t device_show(struct device *_d,
                           struct device_attribute *attr, char *buf)
{
        struct virtio_device *vdev = dev_to_virtio(_d);
        unsigned int device_id = vdev->id.device;

        return sysfs_emit(buf, "0x%04x\n", device_id);
}
static DEVICE_ATTR_RO(device);

/* sysfs "vendor" attribute: the virtio vendor ID as 0x-prefixed hex. */
static ssize_t vendor_show(struct device *_d,
                           struct device_attribute *attr, char *buf)
{
        struct virtio_device *vdev = dev_to_virtio(_d);
        unsigned int vendor_id = vdev->id.vendor;

        return sysfs_emit(buf, "0x%04x\n", vendor_id);
}
static DEVICE_ATTR_RO(vendor);

/* sysfs "status" attribute: value reported by the config get_status op. */
static ssize_t status_show(struct device *_d,
                           struct device_attribute *attr, char *buf)
{
        struct virtio_device *vdev = dev_to_virtio(_d);
        unsigned int status = vdev->config->get_status(vdev);

        return sysfs_emit(buf, "0x%08x\n", status);
}
static DEVICE_ATTR_RO(status);

/* sysfs "modalias" attribute: "virtio:d<device>v<vendor>" in hex. */
static ssize_t modalias_show(struct device *_d,
                             struct device_attribute *attr, char *buf)
{
        struct virtio_device *vdev = dev_to_virtio(_d);
        unsigned int device_id = vdev->id.device;
        unsigned int vendor_id = vdev->id.vendor;

        return sysfs_emit(buf, "virtio:d%08Xv%08X\n", device_id, vendor_id);
}
static DEVICE_ATTR_RO(modalias);

/*
 * sysfs "features" attribute: one '0'/'1' character per feature bit,
 * followed by a newline.  A bitstring is used so the representation can
 * grow to an arbitrary length in future.
 */
static ssize_t features_show(struct device *_d,
                             struct device_attribute *attr, char *buf)
{
        struct virtio_device *vdev = dev_to_virtio(_d);
        const unsigned int nbits = sizeof(vdev->features) * 8;
        ssize_t written = 0;
        unsigned int bit;

        for (bit = 0; bit < nbits; bit++) {
                char c = __virtio_test_bit(vdev, bit) ? '1' : '0';

                written += sysfs_emit_at(buf, written, "%c", c);
        }
        written += sysfs_emit_at(buf, written, "\n");

        return written;
}
static DEVICE_ATTR_RO(features);

/*
 * NULL-terminated list of the sysfs attributes defined above (device, vendor,
 * status, modalias, features).  ATTRIBUTE_GROUPS() derives virtio_dev_groups
 * from it, which the bus type installs as .dev_groups.
 */
static struct attribute *virtio_dev_attrs[] = {
        &dev_attr_device.attr,
        &dev_attr_vendor.attr,
        &dev_attr_status.attr,
        &dev_attr_modalias.attr,
        &dev_attr_features.attr,
        NULL,
};
ATTRIBUTE_GROUPS(virtio_dev);

/*
 * Return 1 if the driver ID entry @id matches @dev, 0 otherwise.
 * VIRTIO_DEV_ANY_ID in either field of @id acts as a wildcard.
 */
static inline int virtio_id_match(const struct virtio_device *dev,
                                  const struct virtio_device_id *id)
{
        /* Device ID must be the wildcard or match exactly. */
        if (id->device != VIRTIO_DEV_ANY_ID && id->device != dev->id.device)
                return 0;

        /* Same rule for the vendor ID. */
        if (id->vendor == VIRTIO_DEV_ANY_ID)
                return 1;

        return id->vendor == dev->id.vendor;
}

/* This looks through all the IDs a driver claims to support.  If any of them
 * match, we return 1 and the kernel will call virtio_dev_probe(). */
static int virtio_dev_match(struct device *_dv, struct device_driver *_dr)
{
        unsigned int i;
        struct virtio_device *dev = dev_to_virtio(_dv);
        const struct virtio_device_id *ids;

        ids = drv_to_virtio(_dr)->id_table;
        for (i = 0; ids[i].device; i++)
                if (virtio_id_match(dev, &ids[i]))
                        return 1;
        return 0;
}

static int virtio_uevent(const struct device *_dv, struct kobj_uevent_env *env)
{
        const struct virtio_device *dev = dev_to_virtio(_dv);

        return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X",
                              dev->id.device, dev->id.vendor);
}

/*
 * Sanity check that the bound driver listed feature bit @fbit in its
 * feature_table or, when present, its feature_table_legacy.  Returns
 * quietly if so; otherwise hits BUG().
 */
void virtio_check_driver_offered_feature(const struct virtio_device *vdev,
                                         unsigned int fbit)
{
        struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver);
        unsigned int idx;

        idx = 0;
        while (idx < drv->feature_table_size) {
                if (drv->feature_table[idx] == fbit)
                        return;
                idx++;
        }

        if (drv->feature_table_legacy) {
                idx = 0;
                while (idx < drv->feature_table_size_legacy) {
                        if (drv->feature_table_legacy[idx] == fbit)
                                return;
                        idx++;
                }
        }

        /* Not offered in either table. */
        BUG();
}
EXPORT_SYMBOL_GPL(virtio_check_driver_offered_feature);

/*
 * Deliver a config-change event, or remember it for later.  The callers in
 * this file invoke it with dev->config_lock held.
 */
static void __virtio_config_changed(struct virtio_device *dev)
{
        struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);

        /* Notifications are off: record the event so enable can replay it. */
        if (!dev->config_enabled) {
                dev->config_change_pending = true;
                return;
        }

        if (drv && drv->config_changed)
                drv->config_changed(dev);
}

/*
 * Entry point for reporting a configuration change on @dev.  Takes
 * config_lock with interrupts saved/disabled around the actual handling.
 */
void virtio_config_changed(struct virtio_device *dev)
{
        unsigned long irqflags;

        spin_lock_irqsave(&dev->config_lock, irqflags);
        __virtio_config_changed(dev);
        spin_unlock_irqrestore(&dev->config_lock, irqflags);
}
EXPORT_SYMBOL_GPL(virtio_config_changed);

/*
 * Stop delivering config-change callbacks to the driver.  While disabled,
 * __virtio_config_changed() only records that a change is pending.
 */
static void virtio_config_disable(struct virtio_device *dev)
{
        spin_lock_irq(&dev->config_lock);
        dev->config_enabled = false;
        spin_unlock_irq(&dev->config_lock);
}

/*
 * Resume delivering config-change callbacks.  If a change was recorded while
 * delivery was disabled, replay it now, then clear the pending flag.
 */
static void virtio_config_enable(struct virtio_device *dev)
{
        spin_lock_irq(&dev->config_lock);
        /* Must be set first so the replay below reaches the driver. */
        dev->config_enabled = true;
        if (dev->config_change_pending)
                __virtio_config_changed(dev);
        dev->config_change_pending = false;
        spin_unlock_irq(&dev->config_lock);
}

/*
 * OR @status into the device status: read the current value through the
 * transport, then write back the combined value.  May sleep.
 */
void virtio_add_status(struct virtio_device *dev, unsigned int status)
{
        unsigned int current_status;

        might_sleep();

        current_status = dev->config->get_status(dev);
        dev->config->set_status(dev, current_status | status);
}
EXPORT_SYMBOL_GPL(virtio_add_status);

/*
 * Validate the negotiated features, then set FEATURES_OK.
 *
 * When virtio_check_mem_acc_cb() reports true, both VIRTIO_F_VERSION_1 and
 * VIRTIO_F_ACCESS_PLATFORM are required.  FEATURES_OK is only written (and
 * read back for confirmation) when VIRTIO_F_VERSION_1 was negotiated.
 *
 * Returns 0 on success, -ENODEV if a required feature is missing or the
 * device did not keep FEATURES_OK set.
 */
static int virtio_features_ok(struct virtio_device *dev)
{
        unsigned int readback;

        might_sleep();

        if (virtio_check_mem_acc_cb(dev)) {
                if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) {
                        dev_warn(&dev->dev,
                                 "device must provide VIRTIO_F_VERSION_1\n");
                        return -ENODEV;
                }

                if (!virtio_has_feature(dev, VIRTIO_F_ACCESS_PLATFORM)) {
                        dev_warn(&dev->dev,
                                 "device must provide VIRTIO_F_ACCESS_PLATFORM\n");
                        return -ENODEV;
                }
        }

        if (virtio_has_feature(dev, VIRTIO_F_VERSION_1)) {
                virtio_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);

                /* The device signals refusal by not keeping the bit set. */
                readback = dev->config->get_status(dev);
                if (!(readback & VIRTIO_CONFIG_S_FEATURES_OK)) {
                        dev_err(&dev->dev,
                                "virtio: device refuses features: %x\n",
                                readback);
                        return -ENODEV;
                }
        }

        return 0;
}

/**
 * virtio_reset_device - quiesce device for removal
 * @dev: the device to reset
 *
 * Prevents device from sending interrupts and accessing memory.
 *
 * Generally used for cleanup during driver / device removal.
 *
 * Once this has been invoked, caller must ensure that
 * virtqueue_notify / virtqueue_kick are not in progress.
 *
 * Note: this guarantees that vq callbacks are not in progress, however caller
 * is responsible for preventing access from other contexts, such as a system
 * call/workqueue/bh.  Invoking virtio_break_device then flushing any such
 * contexts is one way to handle that.
 */
void virtio_reset_device(struct virtio_device *dev)
{
#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
        /*
         * The below virtio_synchronize_cbs() guarantees that any
         * interrupt for this line arriving after
         * virtio_synchronize_cbs() has completed is guaranteed to see
         * vq->broken as true.
         */
        virtio_break_device(dev);
        virtio_synchronize_cbs(dev);
#endif

        /* Hand the actual reset to the transport. */
        dev->config->reset(dev);
}
EXPORT_SYMBOL_GPL(virtio_reset_device);

/*
 * Bus probe callback: negotiate features between the device and the matched
 * driver, then hand the device to the driver's probe().
 *
 * Sequence: set DRIVER status, compute the feature intersection, let the
 * transport finalize it, run the driver's optional validate(), set
 * FEATURES_OK, create the admin virtqueue if the transport has one, call
 * drv->probe(), ensure DRIVER_OK is set, run the optional scan() and enable
 * config-change delivery.
 *
 * Returns 0 on success.  On failure the FAILED status bit is added and the
 * error from the failing step is returned.
 */
static int virtio_dev_probe(struct device *_d)
{
        int err, i;
        struct virtio_device *dev = dev_to_virtio(_d);
        struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
        u64 device_features;
        u64 driver_features;
        u64 driver_features_legacy;

        /* We have a driver! */
        virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);

        /* Figure out what features the device supports. */
        device_features = dev->config->get_features(dev);

        /* Figure out what features the driver supports. */
        driver_features = 0;
        for (i = 0; i < drv->feature_table_size; i++) {
                unsigned int f = drv->feature_table[i];
                /* Features are tracked in a u64, so bits >= 64 cannot be represented. */
                BUG_ON(f >= 64);
                driver_features |= (1ULL << f);
        }

        /*
         * Some drivers have a separate feature table for legacy devices,
         * i.e. those that do not offer VIRTIO_F_VERSION_1.
         */
        if (drv->feature_table_legacy) {
                driver_features_legacy = 0;
                for (i = 0; i < drv->feature_table_size_legacy; i++) {
                        unsigned int f = drv->feature_table_legacy[i];
                        BUG_ON(f >= 64);
                        driver_features_legacy |= (1ULL << f);
                }
        } else {
                driver_features_legacy = driver_features;
        }

        /* Pick the table that matches what the device offers. */
        if (device_features & (1ULL << VIRTIO_F_VERSION_1))
                dev->features = driver_features & device_features;
        else
                dev->features = driver_features_legacy & device_features;

        /* When debugging, user may filter some features by hand. */
        virtio_debug_device_filter_features(dev);

        /* Transport features always preserved to pass to finalize_features. */
        for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++)
                if (device_features & (1ULL << i))
                        __virtio_set_bit(dev, i);

        err = dev->config->finalize_features(dev);
        if (err)
                goto err;

        if (drv->validate) {
                /* Snapshot so a change made by validate() can be detected. */
                u64 features = dev->features;

                err = drv->validate(dev);
                if (err)
                        goto err;

                /* Did validation change any features? Then write them again. */
                if (features != dev->features) {
                        err = dev->config->finalize_features(dev);
                        if (err)
                                goto err;
                }
        }

        err = virtio_features_ok(dev);
        if (err)
                goto err;

        if (dev->config->create_avq) {
                err = dev->config->create_avq(dev);
                if (err)
                        goto err;
        }

        err = drv->probe(dev);
        if (err)
                goto err_probe;

        /* If probe didn't do it, mark device DRIVER_OK ourselves. */
        if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK))
                virtio_device_ready(dev);

        if (drv->scan)
                drv->scan(dev);

        virtio_config_enable(dev);

        return 0;

err_probe:
        /* Undo create_avq() from above. */
        if (dev->config->destroy_avq)
                dev->config->destroy_avq(dev);
err:
        virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
        return err;

}

/*
 * Bus remove callback: detach the driver from the device.
 *
 * Config-change delivery is disabled before the driver's remove() runs.
 * Afterwards the admin virtqueue is destroyed (if the transport has one),
 * the status is re-set to ACKNOWLEDGE and the of_node reference is dropped.
 */
static void virtio_dev_remove(struct device *_d)
{
        struct virtio_device *dev = dev_to_virtio(_d);
        struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);

        virtio_config_disable(dev);

        drv->remove(dev);

        if (dev->config->destroy_avq)
                dev->config->destroy_avq(dev);

        /* Driver should have reset device. */
        WARN_ON_ONCE(dev->config->get_status(dev));

        /* Acknowledge the device's existence again. */
        virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);

        of_node_put(dev->dev.of_node);
}

/*
 * The "virtio" bus type.  Wires the match/uevent/probe/remove callbacks and
 * the per-device sysfs attribute groups defined in this file into the driver
 * core.
 */
static const struct bus_type virtio_bus = {
        .name  = "virtio",
        .match = virtio_dev_match,
        .dev_groups = virtio_dev_groups,
        .uevent = virtio_uevent,
        .probe = virtio_dev_probe,
        .remove = virtio_dev_remove,
};

/*
 * Register @driver on the virtio bus on behalf of module @owner.
 *
 * Returns the result of driver_register().
 */
int __register_virtio_driver(struct virtio_driver *driver, struct module *owner)
{
        /* Catch this early. */
        /* A non-zero table size with no table would be dereferenced at probe. */
        BUG_ON(driver->feature_table_size && !driver->feature_table);
        driver->driver.bus = &virtio_bus;
        driver->driver.owner = owner;

        return driver_register(&driver->driver);
}
EXPORT_SYMBOL_GPL(__register_virtio_driver);

/* Remove @driver from the virtio bus; counterpart of __register_virtio_driver(). */
void unregister_virtio_driver(struct virtio_driver *driver)
{
        driver_unregister(&driver->driver);
}
EXPORT_SYMBOL_GPL(unregister_virtio_driver);

/*
 * Attach a device-tree node to @dev, if the parent has a suitable one.
 *
 * The parent's single available child node is used when it is compatible
 * with "virtio,device<id>" (device ID in lowercase hex).  In that case the
 * node is stored in dev->dev.of_node and its reference is kept.
 *
 * Returns 0 on success, including when there is no parent node, no child
 * node or no compatible match; -EINVAL if the parent has more than one
 * available child; -ENODEV if the child could not be obtained.
 */
static int virtio_device_of_init(struct virtio_device *dev)
{
        struct device_node *parent = dev_of_node(dev->dev.parent);
        struct device_node *child;
        char compat[] = "virtio,deviceXXXXXXXX";
        int nchildren, written;

        if (!parent)
                return 0;

        nchildren = of_get_available_child_count(parent);
        if (!nchildren)
                return 0;

        /* There can be only 1 child node */
        if (WARN_ON(nchildren > 1))
                return -EINVAL;

        child = of_get_next_available_child(parent, NULL);
        if (WARN_ON(!child))
                return -ENODEV;

        written = snprintf(compat, sizeof(compat), "virtio,device%x",
                           dev->id.device);
        BUG_ON(written >= sizeof(compat));

        if (of_device_is_compatible(child, compat)) {
                /* Keep the reference; the node now belongs to the device. */
                dev->dev.of_node = child;
                return 0;
        }

        /*
         * On powerpc/pseries virtio devices are PCI devices so PCI
         * vendor/device ids play the role of the "compatible" property.
         * Simply don't init of_node in this case.
         */
        of_node_put(child);
        return 0;
}

/**
 * register_virtio_device - register virtio device
 * @dev        : virtio device to be registered
 *
 * On error, the caller must call put_device on &@dev->dev (and not kfree),
 * as another code path may have obtained a reference to @dev.
 *
 * Returns: 0 on success, -error on failure
 */
int register_virtio_device(struct virtio_device *dev)
{
        int err;

        dev->dev.bus = &virtio_bus;
        device_initialize(&dev->dev);

        /* Assign a unique device index and hence name. */
        err = ida_alloc(&virtio_index_ida, GFP_KERNEL);
        if (err < 0)
                goto out;

        /* A non-negative return value is the allocated index. */
        dev->index = err;
        err = dev_set_name(&dev->dev, "virtio%u", dev->index);
        if (err)
                goto out_ida_remove;

        err = virtio_device_of_init(dev);
        if (err)
                goto out_ida_remove;

        spin_lock_init(&dev->config_lock);
        /* Config-change delivery stays off until a driver has probed. */
        dev->config_enabled = false;
        dev->config_change_pending = false;

        INIT_LIST_HEAD(&dev->vqs);
        spin_lock_init(&dev->vqs_list_lock);

        /* We always start by resetting the device, in case a previous
         * driver messed it up.  This also tests that code path a little. */
        virtio_reset_device(dev);

        /* Acknowledge that we've seen the device. */
        virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);

        virtio_debug_device_init(dev);

        /*
         * device_add() causes the bus infrastructure to look for a matching
         * driver.
         */
        err = device_add(&dev->dev);
        if (err)
                goto out_of_node_put;

        return 0;

        /* Unwind in reverse order of acquisition. */
out_of_node_put:
        of_node_put(dev->dev.of_node);
out_ida_remove:
        ida_free(&virtio_index_ida, dev->index);
out:
        virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
        return err;
}
EXPORT_SYMBOL_GPL(register_virtio_device);

/* Return true if @dev is attached to the virtio bus defined in this file. */
bool is_virtio_device(struct device *dev)
{
        const struct bus_type *bus = dev->bus;

        return bus == &virtio_bus;
}
EXPORT_SYMBOL_GPL(is_virtio_device);

/*
 * Unregister @dev from the driver core and release its index.
 *
 * The index is copied to a local first so that it is still available for
 * ida_free() once device_unregister() has run.
 */
void unregister_virtio_device(struct virtio_device *dev)
{
        int index = dev->index; /* save for after device release */

        device_unregister(&dev->dev);
        virtio_debug_device_exit(dev);
        ida_free(&virtio_index_ida, index);
}
EXPORT_SYMBOL_GPL(unregister_virtio_device);

#ifdef CONFIG_PM_SLEEP
/*
 * Prepare @dev for suspend.
 *
 * Disables config-change delivery, remembers whether the device was in the
 * FAILED state, calls the driver's optional freeze() and finally destroys the
 * admin virtqueue if the transport provides one.
 *
 * Returns 0 on success.  If the driver's freeze() fails, config-change
 * delivery is re-enabled and its error code is returned.
 */
int virtio_device_freeze(struct virtio_device *dev)
{
        struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
        int ret = 0;

        virtio_config_disable(dev);

        dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;

        if (drv && drv->freeze)
                ret = drv->freeze(dev);

        if (ret) {
                virtio_config_enable(dev);
                return ret;
        }

        if (dev->config->destroy_avq)
                dev->config->destroy_avq(dev);

        return 0;
}
EXPORT_SYMBOL_GPL(virtio_device_freeze);

/*
 * Bring @dev back up after resume.
 *
 * Resets the device and replays the status handshake (ACKNOWLEDGE, the saved
 * FAILED bit, DRIVER, feature finalization, FEATURES_OK), re-creates the
 * admin virtqueue if the transport has one, calls the driver's optional
 * restore(), makes sure DRIVER_OK is set and re-enables config-change
 * delivery.
 *
 * Returns 0 on success or when no driver is bound.  On failure the FAILED
 * status bit is added and the error from the failing step is returned.
 */
int virtio_device_restore(struct virtio_device *dev)
{
        struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
        int ret;

        /* We always start by resetting the device, in case a previous
         * driver messed it up. */
        virtio_reset_device(dev);

        /* Acknowledge that we've seen the device. */
        virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);

        /* Maybe driver failed before freeze.
         * Restore the failed status, for debugging. */
        if (dev->failed)
                virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);

        /* Nothing more to restore without a bound driver. */
        if (!drv)
                return 0;

        /* We have a driver! */
        virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);

        ret = dev->config->finalize_features(dev);
        if (ret)
                goto err;

        ret = virtio_features_ok(dev);
        if (ret)
                goto err;

        if (dev->config->create_avq) {
                ret = dev->config->create_avq(dev);
                if (ret)
                        goto err;
        }

        if (drv->restore) {
                ret = drv->restore(dev);
                if (ret)
                        goto err_restore;
        }

        /* If restore didn't do it, mark device DRIVER_OK ourselves. */
        if (!(dev->config->get_status(dev) & VIRTIO_CONFIG_S_DRIVER_OK))
                virtio_device_ready(dev);

        virtio_config_enable(dev);

        return 0;

err_restore:
        /* Undo create_avq() from above. */
        if (dev->config->destroy_avq)
                dev->config->destroy_avq(dev);
err:
        virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
        return ret;
}
EXPORT_SYMBOL_GPL(virtio_device_restore);
#endif

/*
 * Core initcall: register the virtio bus and set up the debug support.
 * A failed bus registration is treated as fatal.
 */
static int virtio_init(void)
{
        int rc = bus_register(&virtio_bus);

        if (rc != 0)
                panic("virtio bus registration failed");

        virtio_debug_init();

        return 0;
}

/*
 * Module exit: tear down in the reverse order of virtio_init(), then release
 * whatever the device-index IDA still holds.
 */
static void __exit virtio_exit(void)
{
        virtio_debug_exit();
        bus_unregister(&virtio_bus);
        ida_destroy(&virtio_index_ida);
}
core_initcall(virtio_init);
module_exit(virtio_exit);

MODULE_LICENSE("GPL");




















    1 
    2 






















    4 
    4 






































    1 





    1 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
/*
 * Compatibility functions which bloat the callers too much to make inline.
 * All of the callers of these functions should be converted to use folios
 * eventually.
 */

#include <linux/migrate.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include "internal.h"

/* Page-based wrapper: returns folio_mapping() of the folio containing @page. */
struct address_space *page_mapping(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_mapping(folio);
}
EXPORT_SYMBOL(page_mapping);

/*
 * Page-based wrapper: unlocks the folio containing @page.
 *
 * The call is a plain statement rather than `return <void expression>;`,
 * which ISO C does not permit in a function returning void.
 */
void unlock_page(struct page *page)
{
        folio_unlock(page_folio(page));
}
EXPORT_SYMBOL(unlock_page);

/*
 * Page-based wrapper: ends writeback on the folio containing @page.
 *
 * The call is a plain statement rather than `return <void expression>;`,
 * which ISO C does not permit in a function returning void.
 */
void end_page_writeback(struct page *page)
{
        folio_end_writeback(page_folio(page));
}
EXPORT_SYMBOL(end_page_writeback);

/*
 * Page-based wrapper: calls folio_wait_writeback() on the folio containing
 * @page.
 *
 * The call is a plain statement rather than `return <void expression>;`,
 * which ISO C does not permit in a function returning void.
 */
void wait_on_page_writeback(struct page *page)
{
        folio_wait_writeback(page_folio(page));
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback);

/*
 * Page-based wrapper: calls folio_wait_stable() on the folio containing
 * @page.
 *
 * The call is a plain statement rather than `return <void expression>;`,
 * which ISO C does not permit in a function returning void.
 */
void wait_for_stable_page(struct page *page)
{
        folio_wait_stable(page_folio(page));
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);

/* Page-based wrapper: marks the folio containing @page as accessed. */
void mark_page_accessed(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_mark_accessed(folio);
}
EXPORT_SYMBOL(mark_page_accessed);

/* Page-based wrapper: starts writeback on the folio containing @page. */
void set_page_writeback(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_start_writeback(folio);
}
EXPORT_SYMBOL(set_page_writeback);

/*
 * Page-based wrapper: marks the folio containing @page dirty and passes
 * through the result of folio_mark_dirty().
 */
bool set_page_dirty(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_mark_dirty(folio);
}
EXPORT_SYMBOL(set_page_dirty);

/*
 * Page-based wrapper: passes through the result of
 * folio_clear_dirty_for_io() for the folio containing @page.
 */
bool clear_page_dirty_for_io(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_clear_dirty_for_io(folio);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);

/*
 * Page-based wrapper: passes through the result of
 * folio_redirty_for_writepage() for @wbc and the folio containing @page.
 */
bool redirty_page_for_writepage(struct writeback_control *wbc,
                struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_redirty_for_writepage(wbc, folio);
}
EXPORT_SYMBOL(redirty_page_for_writepage);

/*
 * Page-based wrapper: adds the folio containing @page to @mapping at @index
 * via filemap_add_folio() and passes through its return value.
 */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                pgoff_t index, gfp_t gfp)
{
        struct folio *folio = page_folio(page);

        return filemap_add_folio(mapping, folio, index, gfp);
}
EXPORT_SYMBOL(add_to_page_cache_lru);

/*
 * Page-based wrapper around __filemap_get_folio().
 *
 * Returns the page for @index within the folio that was found, or NULL when
 * __filemap_get_folio() returns an error pointer (the error code itself is
 * not passed on to the caller).
 */
noinline
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp)
{
        struct folio *found = __filemap_get_folio(mapping, index, fgp_flags,
                                                  gfp);

        if (!IS_ERR(found))
                return folio_file_page(found, index);

        return NULL;
}
EXPORT_SYMBOL(pagecache_get_page);

/*
 * Look up the page at @index via pagecache_get_page() with the
 * FGP_WRITEBEGIN flags and the mapping's own gfp mask.
 */
struct page *grab_cache_page_write_begin(struct address_space *mapping,
                                        pgoff_t index)
{
        gfp_t gfp = mapping_gfp_mask(mapping);

        return pagecache_get_page(mapping, index, FGP_WRITEBEGIN, gfp);
}
EXPORT_SYMBOL(grab_cache_page_write_begin);

/*
 * Page-based wrapper around folio_isolate_lru().
 *
 * Tail pages are rejected with a rate-limited warning and a false return.
 * Any other page is cast directly to its folio instead of going through
 * page_folio().
 */
bool isolate_lru_page(struct page *page)
{
        struct folio *folio;

        if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"))
                return false;

        folio = (struct folio *)page;
        return folio_isolate_lru(folio);
}

/* Page-based wrapper: calls folio_putback_lru() on the folio containing @page. */
void putback_lru_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_putback_lru(folio);
}







































































    1 











    1 



    5 



    1 
    1 





















    5 































    2 
    2 


































































    1 



















    2 





    2 



























    2 
    2 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/anon_inodes.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 *  Thanks to Arnd Bergmann for code review and suggestions.
 *  More changes for Thomas Gleixner suggestions.
 *
 */

#include <linux/cred.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/anon_inodes.h>
#include <linux/pseudo_fs.h>

#include <linux/uaccess.h>

static struct vfsmount *anon_inode_mnt __ro_after_init;
static struct inode *anon_inode_inode __ro_after_init;

/*
 * d_dname callback, called from d_path().  Formats the dentry's name as
 * "anon_inode:<name>" into @buffer.
 */
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        const unsigned char *class_name = dentry->d_name.name;

        return dynamic_dname(buffer, buflen, "anon_inode:%s", class_name);
}

/* Dentry operations for anon_inodefs: only the dynamic name hook is set. */
static const struct dentry_operations anon_inodefs_dentry_operations = {
        .d_dname        = anon_inodefs_dname,
};

/*
 * init_fs_context callback: set up a pseudo filesystem context with the
 * anon_inodefs magic number and install the dentry operations.
 *
 * Returns 0 on success, -ENOMEM if init_pseudo() returned NULL.
 */
static int anon_inodefs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *pctx;

        pctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
        if (pctx) {
                pctx->dops = &anon_inodefs_dentry_operations;
                return 0;
        }

        return -ENOMEM;
}

/*
 * Filesystem type for "anon_inodefs"; it is mounted internally by
 * anon_inode_init() via kern_mount().
 */
static struct file_system_type anon_inode_fs_type = {
        .name                = "anon_inodefs",
        .init_fs_context = anon_inodefs_init_fs_context,
        .kill_sb        = kill_anon_super,
};

/*
 * Allocate a fresh anonymous inode that is visible to LSMs.
 *
 * S_PRIVATE is cleared on the new inode and the
 * security_inode_init_security_anon() hook is called with @name and
 * @context_inode.
 *
 * Returns the inode, or an error pointer if allocation or the security hook
 * fails (the inode is released in the latter case).
 */
static struct inode *anon_inode_make_secure_inode(
        const char *name,
        const struct inode *context_inode)
{
        const struct qstr qname = QSTR_INIT(name, strlen(name));
        struct inode *inode;
        int rc;

        inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
        if (IS_ERR(inode))
                return inode;

        inode->i_flags &= ~S_PRIVATE;

        rc = security_inode_init_security_anon(inode, &qname, context_inode);
        if (!rc)
                return inode;

        iput(inode);
        return ERR_PTR(rc);
}

/*
 * Common implementation behind the anon_inode_*getfile() helpers.
 *
 * Takes a reference on @fops->owner (if any), picks the inode (a new secure
 * inode when @make_inode is true, otherwise the shared singleton) and
 * allocates a pseudo file on it.  Only the O_ACCMODE and O_NONBLOCK bits of
 * @flags are passed on.  @priv becomes the file's private_data.
 *
 * Returns the new file, or an error pointer.  On failure the inode and
 * module references taken here are dropped again.
 */
static struct file *__anon_inode_getfile(const char *name,
                                         const struct file_operations *fops,
                                         void *priv, int flags,
                                         const struct inode *context_inode,
                                         bool make_inode)
{
        struct file *file;
        struct inode *inode;

        if (fops->owner && !try_module_get(fops->owner))
                return ERR_PTR(-ENOENT);

        if (!make_inode) {
                inode = anon_inode_inode;
                if (IS_ERR(inode)) {
                        file = ERR_PTR(-ENODEV);
                        goto err_module;
                }
                /*
                 * The singleton's reference count is always greater than
                 * zero, so ihold() is safe.
                 */
                ihold(inode);
        } else {
                inode = anon_inode_make_secure_inode(name, context_inode);
                if (IS_ERR(inode)) {
                        file = ERR_CAST(inode);
                        goto err_module;
                }
        }

        file = alloc_file_pseudo(inode, anon_inode_mnt, name,
                                 flags & (O_ACCMODE | O_NONBLOCK), fops);
        if (IS_ERR(file)) {
                iput(inode);
                goto err_module;
        }

        file->f_mapping = inode->i_mapping;
        file->private_data = priv;

        return file;

err_module:
        module_put(fops->owner);
        return file;
}

/**
 * anon_inode_getfile - create a new file backed by the shared anonymous
 *                      inode, with a dentry that describes the "class" of
 *                      the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 *
 * Intended for files that do not need a full-fledged inode of their own.
 * Every file created through anon_inode_getfile() shares one inode, which
 * saves memory and avoids duplicating the file/inode/dentry setup code.
 *
 * Returns the newly created file* or an error pointer.
 */
struct file *anon_inode_getfile(const char *name,
                                const struct file_operations *fops,
                                void *priv, int flags)
{
        struct file *file;

        file = __anon_inode_getfile(name, fops, priv, flags, NULL, false);
        return file;
}
EXPORT_SYMBOL_GPL(anon_inode_getfile);

/**
 * anon_inode_getfile_fmode - create a new file backed by the shared anonymous
 *                      inode and OR extra bits into its f_mode
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 * @f_mode:  [in]    fmode
 *
 * Same as anon_inode_getfile(): the file is hooked onto the single shared
 * inode, which suits files that do not need a full-fledged inode and avoids
 * duplicating the file/inode/dentry setup.  In addition, @f_mode is OR-ed
 * into the new file's f_mode.
 *
 * Returns the newly created file* or an error pointer.
 */
struct file *anon_inode_getfile_fmode(const char *name,
                                const struct file_operations *fops,
                                void *priv, int flags, fmode_t f_mode)
{
        struct file *file;

        file = __anon_inode_getfile(name, fops, priv, flags, NULL, false);
        if (IS_ERR(file))
                return file;

        file->f_mode |= f_mode;
        return file;
}
EXPORT_SYMBOL_GPL(anon_inode_getfile_fmode);

/**
 * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new
 *                             !S_PRIVATE anon inode rather than reuse the
 *                             singleton anon inode and calls the
 *                             inode_init_security_anon() LSM hook.
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 * @context_inode:
 *           [in]    the logical relationship with the new inode (optional)
 *
 * A dedicated inode/file pair is useful in two situations:
 *
 * - the inode should carry its own security context, so that LSMs can
 *   enforce policy on its creation;
 *
 * - the caller needs a unique inode, for example to customize the size
 *   returned by fstat().
 *
 * The LSM may use @context_inode in inode_init_security_anon(), but no
 * reference to it is held.
 *
 * Returns the newly created file* or an error pointer.
 */
struct file *anon_inode_create_getfile(const char *name,
                                       const struct file_operations *fops,
                                       void *priv, int flags,
                                       const struct inode *context_inode)
{
        struct file *file;

        file = __anon_inode_getfile(name, fops, priv, flags,
                                    context_inode, true);
        return file;
}
EXPORT_SYMBOL_GPL(anon_inode_create_getfile);

/*
 * Common implementation behind anon_inode_getfd() and
 * anon_inode_create_getfd().
 *
 * Reserves an unused descriptor, creates the file through
 * __anon_inode_getfile() and installs it.  If file creation fails, the
 * reserved descriptor is released again.
 *
 * Returns the new descriptor, or a negative error code.
 */
static int __anon_inode_getfd(const char *name,
                              const struct file_operations *fops,
                              void *priv, int flags,
                              const struct inode *context_inode,
                              bool make_inode)
{
        struct file *file;
        int fd, err;

        fd = get_unused_fd_flags(flags);
        if (fd < 0)
                return fd;

        file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
                                    make_inode);
        if (IS_ERR(file)) {
                err = PTR_ERR(file);
                put_unused_fd(fd);
                return err;
        }

        fd_install(fd, file);
        return fd;
}

/**
 * anon_inode_getfd - create a new file descriptor whose file is backed by
 *                    the shared anonymous inode, with a dentry that
 *                    describes the "class" of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 *
 * Intended for files that do not need a full-fledged inode of their own.
 * Every file created through anon_inode_getfd() uses the same singleton
 * inode, which reduces memory use and avoids duplicating the
 * file/inode/dentry setup code.
 *
 * Returns a newly created file descriptor or an error code.
 */
int anon_inode_getfd(const char *name, const struct file_operations *fops,
                     void *priv, int flags)
{
        int fd;

        fd = __anon_inode_getfd(name, fops, priv, flags, NULL, false);
        return fd;
}
EXPORT_SYMBOL_GPL(anon_inode_getfd);

/**
 * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new
 * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls
 * the inode_init_security_anon() LSM hook.
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 * @context_inode:
 *           [in]    the logical relationship with the new inode (optional)
 *
 * Create a new anonymous inode and file pair.  This can be done for two
 * reasons:
 *
 * - for the inode to have its own security context, so that LSMs can enforce
 *   policy on the inode's creation;
 *
 * - if the caller needs a unique inode, for example in order to customize
 *   the size returned by fstat()
 *
 * The LSM may use @context_inode in inode_init_security_anon(), but a
 * reference to it is not held.
 *
 * Returns a newly created file descriptor or an error code.
 */
int anon_inode_create_getfd(const char *name, const struct file_operations *fops,
                            void *priv, int flags,
                            const struct inode *context_inode)
{
        int fd;

        /*
         * Same helper as anon_inode_getfd(), but forwarding the caller's
         * @context_inode and passing true as the last argument.
         */
        fd = __anon_inode_getfd(name, fops, priv, flags, context_inode, true);

        return fd;
}


/*
 * Init-time setup: kern_mount() the anon inode filesystem type and allocate
 * the inode kept in anon_inode_inode from that mount's superblock.  Either
 * failure panics, so the function only ever returns 0.
 */
static int __init anon_inode_init(void)
{
        anon_inode_mnt = kern_mount(&anon_inode_fs_type);
        if (IS_ERR(anon_inode_mnt)) {
                long err = PTR_ERR(anon_inode_mnt);

                panic("anon_inode_init() kernel mount failed (%ld)\n", err);
        }

        anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
        if (IS_ERR(anon_inode_inode)) {
                long err = PTR_ERR(anon_inode_inode);

                panic("anon_inode_init() inode allocation failed (%ld)\n", err);
        }

        return 0;
}

fs_initcall(anon_inode_init);




















































































    1 


    1 


    1 







    5 
    5 








































    5 


    4 



    5 



































    5 
    5 













    1 


    1 




    1 






    1 
    1 














    1 





    1 
    1 















    1 







    1 














    5 







    5 



















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
#include <trace/events/btrfs.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"

/*
 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other and they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->root_key.objectid.  This ensures that all special purpose
 * roots have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warning in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs update as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#if BTRFS_MAX_LEVEL != 8
#error
#endif

/*
 * Expands to one designated initializer for the names[] slot of @level.  The
 * name is assembled by string literal concatenation plus stringification of
 * @level, giving "btrfs-<stem>-0<level>".
 */
#define DEFINE_LEVEL(stem, level)                                        \
        .names[level] = "btrfs-" stem "-0" #level,

/* Expands to the names[] initializers for levels 0 through 7 of @stem. */
#define DEFINE_NAME(stem)                                                \
        DEFINE_LEVEL(stem, 0)                                                \
        DEFINE_LEVEL(stem, 1)                                                \
        DEFINE_LEVEL(stem, 2)                                                \
        DEFINE_LEVEL(stem, 3)                                                \
        DEFINE_LEVEL(stem, 4)                                                \
        DEFINE_LEVEL(stem, 5)                                                \
        DEFINE_LEVEL(stem, 6)                                                \
        DEFINE_LEVEL(stem, 7)

/*
 * One keyset per root objectid: a lockdep class key and a class name for each
 * tree level.  The table ends with the entry whose id is 0; lookups stop
 * there, so that entry also serves as the default for any objectid without a
 * dedicated keyset.
 */
static struct btrfs_lockdep_keyset {
        u64                        id;                /* root objectid */
        /* Longest entry: btrfs-block-group-00 */
        char                        names[BTRFS_MAX_LEVEL][24];
        struct lock_class_key        keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
        { .id = BTRFS_ROOT_TREE_OBJECTID,        DEFINE_NAME("root")        },
        { .id = BTRFS_EXTENT_TREE_OBJECTID,        DEFINE_NAME("extent")        },
        { .id = BTRFS_CHUNK_TREE_OBJECTID,        DEFINE_NAME("chunk")        },
        { .id = BTRFS_DEV_TREE_OBJECTID,        DEFINE_NAME("dev")        },
        { .id = BTRFS_CSUM_TREE_OBJECTID,        DEFINE_NAME("csum")        },
        { .id = BTRFS_QUOTA_TREE_OBJECTID,        DEFINE_NAME("quota")        },
        { .id = BTRFS_TREE_LOG_OBJECTID,        DEFINE_NAME("log")        },
        { .id = BTRFS_TREE_RELOC_OBJECTID,        DEFINE_NAME("treloc")        },
        { .id = BTRFS_DATA_RELOC_TREE_OBJECTID,        DEFINE_NAME("dreloc")        },
        { .id = BTRFS_UUID_TREE_OBJECTID,        DEFINE_NAME("uuid")        },
        { .id = BTRFS_FREE_SPACE_TREE_OBJECTID,        DEFINE_NAME("free-space") },
        { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
        { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
        /* Terminator and default keyset; must stay last. */
        { .id = 0,                                DEFINE_NAME("tree")        },
};

/* The helper macros are only meaningful for the table above. */
#undef DEFINE_LEVEL
#undef DEFINE_NAME

/*
 * Assign the lockdep class and name for eb->lock, chosen by the root
 * @objectid and the tree @level of the buffer.
 */
void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level)
{
        struct btrfs_lockdep_keyset *ks = btrfs_lockdep_keysets;

        ASSERT(level < ARRAY_SIZE(ks->keys));

        /*
         * Walk the table until the id matches or the terminating id 0 entry
         * is reached; the latter is used as the default keyset.
         */
        while (ks->id && ks->id != objectid)
                ks++;

        lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]);
}

/*
 * Re-apply the lockdep class of @eb, but only for roots that have
 * BTRFS_ROOT_RESET_LOCKDEP_CLASS set in their state bits.
 */
void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
{
        if (!test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
                return;

        btrfs_set_buffer_lockdep_class(btrfs_root_id(root), eb,
                                       btrfs_header_level(eb));
}

#endif

#ifdef CONFIG_BTRFS_DEBUG
/*
 * Store @owner in eb->lock_owner.  Callers in this file pass current->pid
 * after taking the write lock and 0 right before releasing it.
 */
static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
{
        eb->lock_owner = owner;
}
#else
/* Without CONFIG_BTRFS_DEBUG the owner is not tracked; this is a no-op. */
static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
#endif

/*
 * Extent buffer locking
 * =====================
 *
 * We use a rw_semaphore for tree locking, and the semantics are exactly the
 * same:
 *
 * - reader/writer exclusion
 * - writer/writer exclusion
 * - reader/reader sharing
 * - try-lock semantics for readers and writers
 *
 * The rwsem implementation does opportunistic spinning which reduces the
 * number of times the locking task needs to sleep.
 */

/*
 * Take the read side of eb->lock.
 *
 * @eb:         the extent buffer to lock
 * @nest:       lockdep nesting subclass passed to down_read_nested()
 *
 * A start timestamp is sampled only when the btrfs_tree_read_lock tracepoint
 * is enabled; otherwise 0 is handed to the tracepoint.
 */
void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
        const u64 start_ns = trace_btrfs_tree_read_lock_enabled() ?
                             ktime_get_ns() : 0;

        down_read_nested(&eb->lock, nest);
        trace_btrfs_tree_read_lock(eb, start_ns);
}

/*
 * Non-blocking attempt to take the read side of eb->lock.
 *
 * Return 1 if the lock was acquired, 0 if it was not.
 */
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
        if (!down_read_trylock(&eb->lock))
                return 0;

        trace_btrfs_try_tree_read_lock(eb);
        return 1;
}

/*
 * Non-blocking attempt to take the write side of eb->lock.  On success the
 * current task's pid is recorded as the lock owner.
 *
 * Return 1 if the lock was acquired, 0 if it was not.
 */
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
        if (!down_write_trylock(&eb->lock))
                return 0;

        btrfs_set_eb_lock_owner(eb, current->pid);
        trace_btrfs_try_tree_write_lock(eb);
        return 1;
}

/*
 * Release read lock.
 *
 * @eb:         the extent buffer whose read lock is dropped
 *
 * The tracepoint fires before up_read(), i.e. while the lock is still held.
 */
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
        trace_btrfs_tree_read_unlock(eb);
        up_read(&eb->lock);
}

/*
 * Take the write side of eb->lock.
 *
 * @eb:         the extent buffer to lock
 * @nest:       lockdep nesting subclass passed to down_write_nested()
 *
 * On return eb->lock is write locked and the current task's pid has been
 * recorded as the lock owner.  A start timestamp is sampled only when the
 * btrfs_tree_lock tracepoint is enabled; otherwise 0 is reported.
 */
void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
        __acquires(&eb->lock)
{
        const u64 start_ns = trace_btrfs_tree_lock_enabled() ?
                             ktime_get_ns() : 0;

        down_write_nested(&eb->lock, nest);
        btrfs_set_eb_lock_owner(eb, current->pid);
        trace_btrfs_tree_lock(eb, start_ns);
}

/*
 * Release the write lock.
 *
 * @eb:         the extent buffer whose write lock is dropped
 *
 * The tracepoint and the owner reset both happen before up_write(), i.e.
 * while the lock is still held.
 */
void btrfs_tree_unlock(struct extent_buffer *eb)
{
        trace_btrfs_tree_unlock(eb);
        /* Clear the recorded owner before another task can take the lock. */
        btrfs_set_eb_lock_owner(eb, 0);
        up_write(&eb->lock);
}

/*
 * Drop every lock recorded in @path from @level upwards to the top of the
 * path, unless the path asks for its locks to be kept (path->keep_locks).
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few corner
 * cases, such as COW of the block at slot zero in the node.  This helper
 * ignores those rules, so only call it when no further updates are going to
 * be made higher up in the tree.
 */
void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
        int lvl;

        if (path->keep_locks)
                return;

        for (lvl = level; lvl < BTRFS_MAX_LEVEL; lvl++) {
                /* Only slots that have a node and a recorded lock matter. */
                if (path->nodes[lvl] && path->locks[lvl]) {
                        btrfs_tree_unlock_rw(path->nodes[lvl], path->locks[lvl]);
                        path->locks[lvl] = 0;
                }
        }
}

/*
 * Grab a reference on the current root node and write lock it, retrying
 * until the buffer that got locked is still root->node.
 *
 * Return: root extent buffer with write lock held
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        for (;;) {
                eb = btrfs_root_node(root);

                btrfs_maybe_reset_lockdep_class(root, eb);
                btrfs_tree_lock(eb);
                if (eb == root->node)
                        return eb;

                /* root->node changed meanwhile: drop this buffer and retry. */
                btrfs_tree_unlock(eb);
                free_extent_buffer(eb);
        }
}

/*
 * Grab a reference on the current root node and read lock it, retrying
 * until the buffer that got locked is still root->node.
 *
 * Return: root extent buffer with read lock held
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        for (;;) {
                eb = btrfs_root_node(root);

                btrfs_maybe_reset_lockdep_class(root, eb);
                btrfs_tree_read_lock(eb);
                if (eb == root->node)
                        return eb;

                /* root->node changed meanwhile: drop this buffer and retry. */
                btrfs_tree_read_unlock(eb);
                free_extent_buffer(eb);
        }
}

/*
 * Nowait variant of btrfs_read_lock_root_node(): grab a reference on the
 * current root node and try to read lock it, retrying while root->node keeps
 * changing, but giving up as soon as a trylock fails.
 *
 * Return: root extent buffer with read lock held, or ERR_PTR(-EAGAIN) if the
 * lock could not be taken without blocking.
 */
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        for (;;) {
                eb = btrfs_root_node(root);
                if (!btrfs_try_tree_read_lock(eb)) {
                        /* Would block: drop our reference and bail out. */
                        free_extent_buffer(eb);
                        return ERR_PTR(-EAGAIN);
                }
                if (eb == root->node)
                        return eb;

                /* root->node changed meanwhile: drop this buffer and retry. */
                btrfs_tree_read_unlock(eb);
                free_extent_buffer(eb);
        }
}

/*
 * DREW locks
 * ==========
 *
 * DREW stands for double-reader-writer-exclusion lock. It's used in situations
 * where you want to provide A-B exclusion but not AA or BB.
 *
 * The current implementation gives more priority to readers. If a reader and a
 * writer both race to acquire their respective sides of the lock, the writer
 * yields its lock as soon as it detects a concurrent reader. Additionally,
 * if there are pending readers, no new writers are allowed to come in and
 * acquire the lock.
 */

/* Set both counters of @lock to zero and initialise both wait queues. */
void btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
        /* Reader side. */
        atomic_set(&lock->readers, 0);
        init_waitqueue_head(&lock->pending_readers);

        /* Writer side. */
        atomic_set(&lock->writers, 0);
        init_waitqueue_head(&lock->pending_writers);
}

/*
 * Try to take the writer side of @lock without sleeping.
 *
 * Return true if acquisition is successful, false otherwise
 */
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
{
        /* Cheap early exit: any registered reader blocks writers. */
        if (atomic_read(&lock->readers))
                return false;

        atomic_inc(&lock->writers);

        /* Ensure writers count is updated before we check for pending readers */
        smp_mb__after_atomic();
        if (atomic_read(&lock->readers)) {
                /* A reader showed up in between: undo our increment. */
                btrfs_drew_write_unlock(lock);
                return false;
        }

        return true;
}

/*
 * Take the writer side of @lock.  Each failed attempt sleeps on
 * pending_writers until the readers count reads zero, then tries again.
 */
void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
{
        while (!btrfs_drew_try_write_lock(lock)) {
                wait_event(lock->pending_writers, !atomic_read(&lock->readers));
        }
}

/*
 * Drop one writer from @lock; when the writers count reaches zero, wake the
 * tasks waiting on pending_readers.
 */
void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
        /*
         * atomic_dec_and_test() implies a full barrier, so woken up readers are
         * guaranteed to see the decrement.
         */
        if (atomic_dec_and_test(&lock->writers))
                wake_up(&lock->pending_readers);
}

/*
 * Take the reader side of @lock: register as a reader first, then sleep on
 * pending_readers until the writers count reads zero.
 */
void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
{
        atomic_inc(&lock->readers);

        /*
         * Ensure the pending reader count is perceived BEFORE this reader
         * goes to sleep in case of active writers. This guarantees new writers
         * won't be allowed and that the current reader will be woken up when
         * the last active writer finishes its jobs.
         */
        smp_mb__after_atomic();

        wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0);
}

/*
 * Drop one reader from @lock; when the readers count reaches zero, wake the
 * tasks waiting on pending_writers.
 */
void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
{
        /*
         * atomic_dec_and_test implies a full barrier, so woken up writers
         * are guaranteed to see the decrement
         */
        if (atomic_dec_and_test(&lock->readers))
                wake_up(&lock->pending_writers);
}







































































































































































































































































































































































































































































































    1 
    1 










    1 
    1 
























    1 




   21 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 




    1 







    1 






















    1 







    1 






































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
/* CPU control.
 * (C) 2001, 2002, 2003, 2004 Rusty Russell
 *
 * This code is licenced under the GPL.
 */
#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched/signal.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/isolation.h>
#include <linux/sched/task.h>
#include <linux/sched/smt.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <linux/irq.h>
#include <linux/nmi.h>
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
#include <linux/random.h>
#include <linux/cc_platform.h>

#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
#include <trace/events/cpuhp.h>

#include "smpboot.h"

/**
 * struct cpuhp_cpu_state - Per cpu hotplug state storage
 * @state:        The current cpu state
 * @target:        The target state
 * @fail:        State at which cpuhp_invoke_callback() returns -EAGAIN instead
 *                of running the callback; reset to CPUHP_INVALID once it
 *                has triggered
 * @thread:        Pointer to the hotplug thread
 * @should_run:        Thread should execute
 * @rollback:        Perform a rollback
 * @single:        Single callback invocation
 * @bringup:        Single callback bringup or teardown selector
 * @node:        Remote CPU node; for multi-instance, do a
 *                single entry callback for install/remove
 * @last:        For multi-instance rollback, remember how far we got
 * @cb_state:        The state for a single callback (install/uninstall)
 * @result:        Result of the operation
 * @ap_sync_state:        State for AP synchronization, holds
 *                        enum cpuhp_sync_state values
 * @done_up:        Signal completion to the issuer of the task for cpu-up
 * @done_down:        Signal completion to the issuer of the task for cpu-down
 */
struct cpuhp_cpu_state {
        enum cpuhp_state        state;
        enum cpuhp_state        target;
        enum cpuhp_state        fail;
#ifdef CONFIG_SMP
        struct task_struct        *thread;
        bool                        should_run;
        bool                        rollback;
        bool                        single;
        bool                        bringup;
        struct hlist_node        *node;
        struct hlist_node        *last;
        enum cpuhp_state        cb_state;
        int                        result;
        atomic_t                ap_sync_state;
        struct completion        done_up;
        struct completion        done_down;
#endif
};

/*
 * Per-CPU hotplug state. Only ->fail is given an explicit initial value;
 * CPUHP_INVALID is also what cpuhp_invoke_callback() resets it to after an
 * injected failure has triggered.
 */
static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
        .fail = CPUHP_INVALID,
};

#ifdef CONFIG_SMP
/*
 * Consulted by cpu_bootable(): a CPU which is not yet set in this mask is
 * still allowed to boot once.
 */
cpumask_t cpus_booted_once_mask;
#endif

#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
/* One lockdep map per hotplug direction: bringup and teardown. */
static struct lockdep_map cpuhp_state_up_map =
        STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
static struct lockdep_map cpuhp_state_down_map =
        STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);

/* Annotate acquisition of the lockdep map matching the direction. */
static inline void cpuhp_lock_acquire(bool bringup)
{
        if (bringup)
                lock_map_acquire(&cpuhp_state_up_map);
        else
                lock_map_acquire(&cpuhp_state_down_map);
}

/* Annotate release of the lockdep map matching the direction. */
static inline void cpuhp_lock_release(bool bringup)
{
        if (bringup)
                lock_map_release(&cpuhp_state_up_map);
        else
                lock_map_release(&cpuhp_state_down_map);
}
#else

/* Without lockdep (or on UP) the annotations compile away. */
static inline void cpuhp_lock_acquire(bool bringup)
{
}

static inline void cpuhp_lock_release(bool bringup)
{
}

#endif

/**
 * struct cpuhp_step - Hotplug state machine step
 * @name:        Name of the step
 * @startup:        Startup function of the step
 * @teardown:        Teardown function of the step
 * @cant_stop:        Bringup/teardown can't be stopped at this step
 * @multi_instance:        State has multiple instances which get added afterwards
 *
 * @startup and @teardown are unions: the ->single variant is invoked when
 * @multi_instance is false, the ->multi variant (which additionally takes
 * the instance node) when it is true. cpuhp_step_empty() tests ->single to
 * decide whether a callback is installed at all.
 */
struct cpuhp_step {
        const char                *name;
        union {
                int                (*single)(unsigned int cpu);
                int                (*multi)(unsigned int cpu,
                                         struct hlist_node *node);
        } startup;
        union {
                int                (*single)(unsigned int cpu);
                int                (*multi)(unsigned int cpu,
                                         struct hlist_node *node);
        } teardown;
        /* private: */
        /* Instances of a multi-instance state, walked by cpuhp_invoke_callback() */
        struct hlist_head        list;
        /* public: */
        bool                        cant_stop;
        bool                        multi_instance;
};

/* NOTE(review): users are outside this part of the file; presumably serializes state (un)installation - confirm */
static DEFINE_MUTEX(cpuhp_state_mutex);
/* Forward declaration of the step table indexed by enum cpuhp_state, see cpuhp_get_step() */
static struct cpuhp_step cpuhp_hp_states[];

/* Return the step descriptor stored for @state in cpuhp_hp_states[]. */
static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
{
        return &cpuhp_hp_states[state];
}

/*
 * Check whether @step has no callback installed for the requested direction:
 * the startup callback when @bringup is true, the teardown callback otherwise.
 */
static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
{
        if (bringup)
                return !step->startup.single;

        return !step->teardown.single;
}

/**
 * cpuhp_invoke_callback - Invoke the callbacks for a given state
 * @cpu:        The cpu for which the callback should be invoked
 * @state:        The state to do callbacks for
 * @bringup:        True if the bringup callback should be invoked
 * @node:        For multi-instance, do a single entry callback for install/remove
 * @lastp:        For multi-instance rollback, remember how far we got
 *
 * Called from cpu hotplug and from the state register machinery.
 *
 * If the per-cpu ->fail marker equals @state, the marker is cleared and
 * -EAGAIN is returned without invoking anything.
 *
 * For a multi-instance state transition (@node == NULL) the callback is
 * invoked on every instance. When an instance fails and @lastp is NULL, the
 * instances which were already handled are rolled back with the opposite
 * callback (if there is one) and the error of the failed instance is
 * returned. When @lastp is not NULL, the failing node is stored in *@lastp
 * and no rollback is done here.
 *
 * Return: %0 on success or a negative errno code
 */
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
                                 bool bringup, struct hlist_node *node,
                                 struct hlist_node **lastp)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct cpuhp_step *step = cpuhp_get_step(state);
        int (*cbm)(unsigned int cpu, struct hlist_node *node);
        int (*cb)(unsigned int cpu);
        int ret, cnt;

        /* One-shot failure injection for this state */
        if (st->fail == state) {
                st->fail = CPUHP_INVALID;
                return -EAGAIN;
        }

        /* Callers are expected to skip empty steps */
        if (cpuhp_step_empty(bringup, step)) {
                WARN_ON_ONCE(1);
                return 0;
        }

        if (!step->multi_instance) {
                WARN_ON_ONCE(lastp && *lastp);
                cb = bringup ? step->startup.single : step->teardown.single;

                trace_cpuhp_enter(cpu, st->target, state, cb);
                ret = cb(cpu);
                trace_cpuhp_exit(cpu, st->state, state, ret);
                return ret;
        }
        cbm = bringup ? step->startup.multi : step->teardown.multi;

        /* Single invocation for instance add/remove */
        if (node) {
                WARN_ON_ONCE(lastp && *lastp);
                trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
                ret = cbm(cpu, node);
                trace_cpuhp_exit(cpu, st->state, state, ret);
                return ret;
        }

        /* State transition. Invoke on all instances */
        cnt = 0;
        hlist_for_each(node, &step->list) {
                /* Stop at the instance a previous partial run ended on */
                if (lastp && node == *lastp)
                        break;

                trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
                ret = cbm(cpu, node);
                trace_cpuhp_exit(cpu, st->state, state, ret);
                if (ret) {
                        if (!lastp)
                                goto err;

                        *lastp = node;
                        return ret;
                }
                cnt++;
        }
        if (lastp)
                *lastp = NULL;
        return 0;
err:
        /* Rollback the instances if one failed */
        cbm = !bringup ? step->startup.multi : step->teardown.multi;
        if (!cbm)
                return ret;

        hlist_for_each(node, &step->list) {
                int rollback_ret;

                /* Only the first @cnt instances were handled successfully */
                if (!cnt--)
                        break;

                trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
                rollback_ret = cbm(cpu, node);
                trace_cpuhp_exit(cpu, st->state, state, rollback_ret);
                /*
                 * Rollback must not fail,
                 */
                WARN_ON_ONCE(rollback_ret);
        }
        /*
         * Report the error of the instance which failed. The rollback
         * results must not overwrite it, otherwise a successful rollback
         * would make the failed transition look like a success.
         */
        return ret;
}

#ifdef CONFIG_SMP
static bool cpuhp_is_ap_state(enum cpuhp_state state)
{
        /*
         * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
         * purposes as that state is handled explicitly in cpu_down.
         */
        return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
}

/* Block until the AP thread signals the completion matching the direction. */
static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
{
        if (bringup)
                wait_for_completion(&st->done_up);
        else
                wait_for_completion(&st->done_down);
}

/* Signal the completion matching the direction to whoever waits on it. */
static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
{
        if (bringup)
                complete(&st->done_up);
        else
                complete(&st->done_down);
}

/*
 * The former STARTING/DYING states. They run with IRQs disabled and must
 * not fail. Covers the range [CPUHP_AP_IDLE_DEAD, CPUHP_AP_ONLINE).
 */
static bool cpuhp_is_atomic_state(enum cpuhp_state state)
{
        return state >= CPUHP_AP_IDLE_DEAD && state < CPUHP_AP_ONLINE;
}

/*
 * Synchronization state management
 *
 * Values kept in cpuhp_cpu_state::ap_sync_state.
 */
enum cpuhp_sync_state {
        /* Reported by the AP through cpuhp_ap_report_dead(); also the zero value */
        SYNC_STATE_DEAD,
        /* Set by cpuhp_can_boot_ap() in preparation for booting the AP */
        SYNC_STATE_KICKED,
        /* Set by cpuhp_bp_sync_dead() unless the AP already reported dead */
        SYNC_STATE_SHOULD_DIE,
        /* Set by the AP in cpuhp_ap_sync_alive() */
        SYNC_STATE_ALIVE,
        /* Set on behalf of the control CPU to release an AP waiting in cpuhp_ap_sync_alive() */
        SYNC_STATE_SHOULD_ONLINE,
        /* NOTE(review): not written in this part of the file; presumably set when the AP is online - confirm */
        SYNC_STATE_ONLINE,
};

#ifdef CONFIG_HOTPLUG_CORE_SYNC
/**
 * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown
 * @state:        The synchronization state to set
 *
 * This is not a synchronization point, it merely stores the new state for
 * the current CPU. The store is done with atomic_xchg(), which implies a
 * full barrier, so the AP changes are visible before the control CPU
 * proceeds.
 */
static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state)
{
        /* The previous value is of no interest */
        (void)atomic_xchg(this_cpu_ptr(&cpuhp_state.ap_sync_state), state);
}

/*
 * Default (weak) polling step used by cpuhp_wait_for_sync_state() during the
 * initial busy-wait phase.
 */
void __weak arch_cpuhp_sync_state_poll(void)
{
        cpu_relax();
}

/*
 * Wait for the sync state of @cpu to become @state and then switch it
 * atomically to @next_state.
 *
 * The state is polled for the first millisecond and checked every one to
 * two milliseconds afterwards. The wait is abandoned after ten seconds.
 *
 * Return: true if the transition was made, false on timeout (in which case
 * the state is left unchanged).
 */
static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state,
                                      enum cpuhp_sync_state next_state)
{
        atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
        ktime_t start = ktime_get();
        ktime_t deadline = start + 10ULL * NSEC_PER_SEC;
        int cur = atomic_read(st);

        for (;;) {
                ktime_t now;

                if (cur == state) {
                        if (atomic_try_cmpxchg(st, &cur, next_state))
                                return true;
                        /* A failed cmpxchg refreshed 'cur'; evaluate it again */
                        continue;
                }

                now = ktime_get();
                if (now > deadline) {
                        /* Timeout. Leave the state unchanged */
                        return false;
                }

                if (now - start < NSEC_PER_MSEC) {
                        /* Poll for one millisecond */
                        arch_cpuhp_sync_state_poll();
                } else {
                        usleep_range_state(USEC_PER_MSEC, 2 * USEC_PER_MSEC, TASK_UNINTERRUPTIBLE);
                }
                cur = atomic_read(st);
        }
        return true;
}
#else  /* CONFIG_HOTPLUG_CORE_SYNC */
/* No-op variant used when CONFIG_HOTPLUG_CORE_SYNC is not set */
static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { }
#endif /* !CONFIG_HOTPLUG_CORE_SYNC */

#ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD
/**
 * cpuhp_ap_report_dead - Update synchronization state to DEAD
 *
 * No synchronization point. Just update of the synchronization state of the
 * CPU this is called on. The control CPU picks the state up in
 * cpuhp_bp_sync_dead().
 */
void cpuhp_ap_report_dead(void)
{
        cpuhp_ap_update_sync_state(SYNC_STATE_DEAD);
}

/*
 * Weak default, does nothing. Invoked by cpuhp_bp_sync_dead() once @cpu
 * reached SYNC_STATE_DEAD.
 */
void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { }

/*
 * Late CPU shutdown synchronization point. cpuhp_state::done_down cannot be
 * used here because the AP is not able to issue complete() at this stage.
 *
 * Marks @cpu as SYNC_STATE_SHOULD_DIE unless it reported dead already, then
 * waits for it to report SYNC_STATE_DEAD and lets the architecture clean up.
 */
static void cpuhp_bp_sync_dead(unsigned int cpu)
{
        atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
        int cur = atomic_read(st);

        /* CPU can have reported dead already. Don't overwrite that! */
        while (cur != SYNC_STATE_DEAD) {
                if (atomic_try_cmpxchg(st, &cur, SYNC_STATE_SHOULD_DIE))
                        break;
        }

        if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) {
                /* No further action possible. Emit message and give up. */
                pr_err("CPU%u failed to report dead state\n", cpu);
                return;
        }

        /* CPU reached dead state. Invoke the cleanup function */
        arch_cpuhp_cleanup_dead_cpu(cpu);
}
#else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */
/* No-op variant used when CONFIG_HOTPLUG_CORE_SYNC_DEAD is not set */
static inline void cpuhp_bp_sync_dead(unsigned int cpu) { }
#endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */

#ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL
/**
 * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive
 *
 * Sets the synchronization state of the calling CPU to SYNC_STATE_ALIVE and
 * then spins until the state reads SYNC_STATE_SHOULD_ONLINE, i.e. until the
 * BP has released it.
 */
void cpuhp_ap_sync_alive(void)
{
        atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);

        cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE);

        /* Wait for the control CPU to release it. */
        for (;;) {
                if (atomic_read(st) == SYNC_STATE_SHOULD_ONLINE)
                        break;
                cpu_relax();
        }
}

/*
 * Check whether @cpu is in a synchronization state from which it can be
 * booted and, if so, move it to SYNC_STATE_KICKED.
 *
 * Return: true if the state was switched to SYNC_STATE_KICKED, false if the
 * CPU is in a state which does not allow booting it.
 */
static bool cpuhp_can_boot_ap(unsigned int cpu)
{
        atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
        int cur = atomic_read(st);

        /* A failed cmpxchg refreshes 'cur', which is then checked again */
        do {
                switch (cur) {
                case SYNC_STATE_DEAD:
                        /* CPU is properly dead */
                case SYNC_STATE_KICKED:
                        /* CPU did not come up in previous attempt */
                case SYNC_STATE_ALIVE:
                        /* CPU is stuck cpuhp_ap_sync_alive(). */
                        break;
                default:
                        /* CPU failed to report online or dead and is in limbo state. */
                        return false;
                }
                /* Prepare for booting */
        } while (!atomic_try_cmpxchg(st, &cur, SYNC_STATE_KICKED));

        return true;
}

/*
 * Weak default, does nothing. Invoked by cpuhp_bp_sync_alive() after waiting
 * for @cpu to report alive, independent of the outcome.
 */
void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { }

/*
 * Early CPU bringup synchronization point. cpuhp_state::done_up cannot be
 * used here because the AP is not able to issue complete() this early in
 * the bringup.
 *
 * Return: 0 when @cpu reported alive and was released, -EIO on timeout.
 */
static int cpuhp_bp_sync_alive(unsigned int cpu)
{
        int ret;

        if (!IS_ENABLED(CONFIG_HOTPLUG_CORE_SYNC_FULL))
                return 0;

        if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) {
                ret = 0;
        } else {
                pr_err("CPU%u failed to report alive state\n", cpu);
                ret = -EIO;
        }

        /* Let the architecture cleanup the kick alive mechanics. */
        arch_cpuhp_cleanup_kick_cpu(cpu);
        return ret;
}
#else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */
/*
 * Variants used when CONFIG_HOTPLUG_CORE_SYNC_FULL is not set: there is
 * nothing to wait for and every CPU may be booted.
 */
static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; }
static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; }
#endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */

/*
 * Serializes the updates to cpu_online_mask, cpu_present_mask. Taken and
 * released through cpu_maps_update_begin()/cpu_maps_update_done().
 */
static DEFINE_MUTEX(cpu_add_remove_lock);
/* NOTE(review): writers are outside this part of the file; presumably true while tasks are frozen around hotplug - confirm */
bool cpuhp_tasks_frozen;
EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);

/*
 * The following two APIs (cpu_maps_update_begin/done) must be used when
 * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
 *
 * cpu_maps_update_begin() acquires cpu_add_remove_lock.
 */
void cpu_maps_update_begin(void)
{
        mutex_lock(&cpu_add_remove_lock);
}

/* Release cpu_add_remove_lock taken by cpu_maps_update_begin(). */
void cpu_maps_update_done(void)
{
        mutex_unlock(&cpu_add_remove_lock);
}

/*
 * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
 * Should always be manipulated under cpu_add_remove_lock.
 *
 * This is a nesting count: cpu_hotplug_disable() increments it and
 * cpu_hotplug_enable() decrements it.
 */
static int cpu_hotplug_disabled;

#ifdef CONFIG_HOTPLUG_CPU

/* The per-cpu rwsem behind cpus_read_lock()/cpus_write_lock() and friends */
DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);

/* Acquire cpu_hotplug_lock for reading. Pairs with cpus_read_unlock(). */
void cpus_read_lock(void)
{
        percpu_down_read(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_lock);

/*
 * Try to acquire cpu_hotplug_lock for reading.
 *
 * Return: the result of percpu_down_read_trylock()
 * (presumably non-zero when the lock was taken - confirm).
 */
int cpus_read_trylock(void)
{
        return percpu_down_read_trylock(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_trylock);

/* Release the read side of cpu_hotplug_lock. */
void cpus_read_unlock(void)
{
        percpu_up_read(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_unlock);

/* Acquire cpu_hotplug_lock for writing. Pairs with cpus_write_unlock(). */
void cpus_write_lock(void)
{
        percpu_down_write(&cpu_hotplug_lock);
}

/* Release the write side of cpu_hotplug_lock. */
void cpus_write_unlock(void)
{
        percpu_up_write(&cpu_hotplug_lock);
}

/*
 * Assert that cpu_hotplug_lock is held, but only once the system has reached
 * SYSTEM_RUNNING.
 */
void lockdep_assert_cpus_held(void)
{
        /*
         * Hotplug operations cannot happen before userspace starts running,
         * and some init codepaths knowingly do not take the hotplug lock.
         * That is valid, so stay quiet until reporting an unheld lock makes
         * sense.
         */
        if (system_state >= SYSTEM_RUNNING)
                percpu_rwsem_assert_held(&cpu_hotplug_lock);
}

#ifdef CONFIG_LOCKDEP
/* Return the result of percpu_rwsem_is_held() for cpu_hotplug_lock. */
int lockdep_is_cpus_held(void)
{
        return percpu_rwsem_is_held(&cpu_hotplug_lock);
}
#endif

/*
 * Lockdep-only annotation: record cpu_hotplug_lock as acquired through its
 * dep_map. Used by cpuhp_thread_fun(), which runs on the AP while the BP
 * holds the hotplug lock.
 */
static void lockdep_acquire_cpus_lock(void)
{
        rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
}

/* Lockdep-only annotation: undo lockdep_acquire_cpus_lock(). */
static void lockdep_release_cpus_lock(void)
{
        rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
}

/*
 * Wait for currently running CPU hotplug operations to complete (if any) and
 * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
 * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
 * hotplug path before performing hotplug operations. So acquiring that lock
 * guarantees mutual exclusion from any currently running hotplug operations.
 *
 * Calls nest: every call increments 'cpu_hotplug_disabled' and has to be
 * balanced by a call to cpu_hotplug_enable().
 */
void cpu_hotplug_disable(void)
{
        cpu_maps_update_begin();
        cpu_hotplug_disabled++;
        cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_disable);

/*
 * Drop one level of hotplug disabling. Warns once and leaves the counter
 * alone when hotplug is not disabled. Callers serialize via
 * cpu_add_remove_lock, see cpu_hotplug_enable().
 */
static void __cpu_hotplug_enable(void)
{
        if (!WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
                cpu_hotplug_disabled--;
}

/*
 * Counterpart of cpu_hotplug_disable(): decrement 'cpu_hotplug_disabled'
 * under cpu_add_remove_lock. An unbalanced call triggers a one-time warning
 * in __cpu_hotplug_enable().
 */
void cpu_hotplug_enable(void)
{
        cpu_maps_update_begin();
        __cpu_hotplug_enable();
        cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);

#else

/* Without CONFIG_HOTPLUG_CPU there is no cpu_hotplug_lock to annotate */
static void lockdep_acquire_cpus_lock(void)
{
}

static void lockdep_release_cpus_lock(void)
{
}

#endif        /* CONFIG_HOTPLUG_CPU */

/*
 * Architectures that need SMT-specific errata handling during SMT hotplug
 * should override this. The weak default does nothing.
 */
void __weak arch_smt_update(void) { }

#ifdef CONFIG_HOTPLUG_SMT

/* Current SMT control state; enabled unless changed by cpu_smt_disable() or cpu_smt_set_num_threads() */
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
/* Maximum number of threads as handed to cpu_smt_set_num_threads() */
static unsigned int cpu_smt_max_threads __ro_after_init;
/* Number of threads to bring up; UINT_MAX until limited by the functions below */
unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX;

/*
 * Disable SMT at boot. With @force the control state becomes
 * CPU_SMT_FORCE_DISABLED, otherwise CPU_SMT_DISABLED. In both cases the
 * number of threads is limited to one. Does nothing if SMT is not possible.
 */
void __init cpu_smt_disable(bool force)
{
        if (cpu_smt_possible()) {
                if (!force) {
                        pr_info("SMT: disabled\n");
                        cpu_smt_control = CPU_SMT_DISABLED;
                } else {
                        pr_info("SMT: Force disabled\n");
                        cpu_smt_control = CPU_SMT_FORCE_DISABLED;
                }
                cpu_smt_num_threads = 1;
        }
}

/*
 * Whether SMT is supported can only be decided after the full CPU
 * identification, hence this is called from architecture code.
 *
 * @num_threads: number of threads the architecture wants to bring up
 * @max_threads: maximum number of threads; 1 means SMT is not supported
 */
void __init cpu_smt_set_num_threads(unsigned int num_threads,
                                    unsigned int max_threads)
{
        WARN_ON(!num_threads || (num_threads > max_threads));

        if (max_threads == 1)
                cpu_smt_control = CPU_SMT_NOT_SUPPORTED;

        cpu_smt_max_threads = max_threads;

        if (cpu_smt_control == CPU_SMT_ENABLED) {
                /*
                 * SMT is enabled: take the architecture requested number
                 * of threads to bring up into account.
                 */
                if (num_threads < cpu_smt_num_threads)
                        cpu_smt_num_threads = num_threads;
        } else {
                /*
                 * SMT was disabled on the kernel command line or is not
                 * supported: use one thread for consistency.
                 */
                cpu_smt_num_threads = 1;
        }
}

/* Handle "nosmt" and "nosmt=force" on the kernel command line. */
static int __init smt_cmdline_disable(char *str)
{
        bool force = false;

        if (str && strcmp(str, "force") == 0)
                force = true;

        cpu_smt_disable(force);
        return 0;
}
early_param("nosmt", smt_cmdline_disable);

/*
 * For architectures supporting partial SMT states, check if the thread is
 * allowed. Otherwise this has already been checked through
 * cpu_smt_max_threads when setting the SMT level, so every thread is
 * reported as allowed.
 */
static inline bool cpu_smt_thread_allowed(unsigned int cpu)
{
#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
        return topology_smt_thread_allowed(cpu);
#else
        return true;
#endif
}

/*
 * Decide whether @cpu may be booted under the current SMT control state.
 *
 * Return: true if the CPU is allowed to boot, false otherwise.
 */
static inline bool cpu_bootable(unsigned int cpu)
{
        switch (cpu_smt_control) {
        case CPU_SMT_ENABLED:
                if (cpu_smt_thread_allowed(cpu))
                        return true;
                break;
        case CPU_SMT_NOT_IMPLEMENTED:
                /* All CPUs are bootable if controls are not configured */
        case CPU_SMT_NOT_SUPPORTED:
                /* All CPUs are bootable if CPU is not SMT capable */
                return true;
        default:
                break;
        }

        if (topology_is_primary_thread(cpu))
                return true;

        /*
         * On x86 all logical CPUs have to be booted at least once so that
         * the init code gets a chance to set CR4.MCE on each of them.
         * Otherwise a broadcasted MCE observing CR4.MCE=0b on any core
         * will shutdown the machine.
         */
        return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
}

/* Returns true if SMT is supported and not forcefully (irreversibly) disabled */
bool cpu_smt_possible(void)
{
        return !(cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
                 cpu_smt_control == CPU_SMT_NOT_SUPPORTED);
}
EXPORT_SYMBOL_GPL(cpu_smt_possible);

#else
/* Without CONFIG_HOTPLUG_SMT every CPU is bootable */
static inline bool cpu_bootable(unsigned int cpu) { return true; }
#endif

static inline enum cpuhp_state
cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
        enum cpuhp_state prev_state = st->state;
        bool bringup = st->state < target;

        st->rollback = false;
        st->last = NULL;

        st->target = target;
        st->single = false;
        st->bringup = bringup;
        if (cpu_dying(cpu) != !bringup)
                set_cpu_dying(cpu, !bringup);

        return prev_state;
}

/*
 * Turn a failed transition around: make @prev_state the new target and,
 * unless a rollback is already in progress, reverse the direction.
 */
static inline void
cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st,
                  enum cpuhp_state prev_state)
{
        bool was_bringup;

        st->target = prev_state;

        /*
         * Already rolling back. No need to invert the bringup value or to
         * change the current state.
         */
        if (st->rollback)
                return;

        st->rollback = true;
        was_bringup = st->bringup;

        /*
         * With st->last set, the partially handled multi_instance state has
         * to be undone first, so the current state is kept. Otherwise the
         * undo starts at the previous state.
         */
        if (st->last) {
                /* keep st->state */
        } else if (was_bringup) {
                st->state--;
        } else {
                st->state++;
        }

        st->bringup = !was_bringup;
        /* Rolling back a bringup means the CPU is dying, and vice versa */
        if (cpu_dying(cpu) != was_bringup)
                set_cpu_dying(cpu, was_bringup);
}

/*
 * Regular hotplug invocation of the AP hotplug thread: wake it up and wait
 * until it signals completion for the current direction.
 */
static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
{
        /* Target reached and no single callback requested: nothing to do */
        if (st->state == st->target && !st->single)
                return;

        st->result = 0;
        /*
         * The stores above have to be visible before should_run becomes
         * true. Pairs with the smp_mb() in cpuhp_thread_fun().
         */
        smp_mb();
        st->should_run = true;
        wake_up_process(st->thread);
        wait_for_ap_thread(st, st->bringup);
}

static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
                         enum cpuhp_state target)
{
        enum cpuhp_state prev_state;
        int ret;

        prev_state = cpuhp_set_state(cpu, st, target);
        __cpuhp_kick_ap(st);
        if ((ret = st->result)) {
                cpuhp_reset_state(cpu, st, prev_state);
                __cpuhp_kick_ap(st);
        }

        return ret;
}

/*
 * Wait for @cpu to finish its early bringup and unpark its hotplug thread.
 *
 * Return: 0 on success, -ECANCELED if the CPU did not come online or is not
 * allowed to stay up.
 */
static int bringup_wait_for_ap_online(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);

        /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
        wait_for_ap_thread(st, true);
        if (WARN_ON_ONCE((!cpu_online(cpu))))
                return -ECANCELED;

        /* Unpark the hotplug thread of the target cpu */
        kthread_unpark(st->thread);

        /*
         * SMT soft disabling on X86 requires bringing the CPU out of the
         * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
         * CPU marked itself as booted_once in notify_cpu_starting(), so
         * cpu_bootable() now returns false unless this is the primary
         * sibling.
         */
        return cpu_bootable(cpu) ? 0 : -ECANCELED;
}

#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
/*
 * Kick @cpu alive through the architecture code, provided its
 * synchronization state allows booting it.
 *
 * Return: -EAGAIN if the CPU cannot be booted right now, otherwise the
 * result of arch_cpuhp_kick_ap_alive().
 */
static int cpuhp_kick_ap_alive(unsigned int cpu)
{
        if (cpuhp_can_boot_ap(cpu))
                return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu));

        return -EAGAIN;
}

/*
 * Second half of the split startup: wait for the kicked AP to report alive
 * and online, then hand the remaining states to its hotplug thread.
 *
 * Return: 0 on success or a negative errno code.
 */
static int cpuhp_bringup_ap(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int ret;

        /*
         * Some architectures walk the irq descriptors to set up the vector
         * space for the cpu which comes online, so irq alloc/free has to be
         * prevented across the bringup.
         */
        irq_lock_sparse();

        ret = cpuhp_bp_sync_alive(cpu);
        if (!ret)
                ret = bringup_wait_for_ap_online(cpu);

        irq_unlock_sparse();

        /* Done on failure, or when nothing beyond ONLINE_IDLE was requested */
        if (ret || st->target <= CPUHP_AP_ONLINE_IDLE)
                return ret;

        return cpuhp_kick_ap(cpu, st, st->target);
}
#else
/*
 * Boot @cpu through __cpu_up(), wait for it to report alive and online and
 * hand the remaining states to its hotplug thread.
 *
 * Return: 0 on success, -EAGAIN if the CPU cannot be booted right now, or
 * another negative errno code.
 */
static int bringup_cpu(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct task_struct *idle = idle_thread_get(cpu);
        int ret;

        if (!cpuhp_can_boot_ap(cpu))
                return -EAGAIN;

        /*
         * Some architectures walk the irq descriptors to set up the vector
         * space for the cpu which comes online.
         *
         * Acquire the sparse irq lock to prevent irq alloc/free across the
         * bringup and hold it until the upcoming CPU completes the startup
         * in cpuhp_online_idle(). This avoids intermediate synchronization
         * points in the architecture code.
         */
        irq_lock_sparse();

        ret = __cpu_up(cpu, idle);
        if (!ret)
                ret = cpuhp_bp_sync_alive(cpu);
        if (!ret)
                ret = bringup_wait_for_ap_online(cpu);

        irq_unlock_sparse();

        /* Done on failure, or when nothing beyond ONLINE_IDLE was requested */
        if (ret || st->target <= CPUHP_AP_ONLINE_IDLE)
                return ret;

        return cpuhp_kick_ap(cpu, st, st->target);
}
#endif

/*
 * Clean up the active_mm of @cpu's idle task. Always returns 0.
 */
static int finish_cpu(unsigned int cpu)
{
        struct task_struct *idle_task = idle_thread_get(cpu);
        struct mm_struct *lazy_mm = idle_task->active_mm;

        /*
         * idle_task_exit() will have switched to &init_mm. Point the idle
         * task at &init_mm if it is not there yet and drop the reference
         * to the mm which was found.
         */
        if (lazy_mm != &init_mm)
                idle_task->active_mm = &init_mm;
        mmdrop_lazy_tlb(lazy_mm);
        return 0;
}

/*
 * Hotplug state machine related functions
 */

/*
 * Find the next state which has to be run on the way to @target, skipping
 * states without a callback for the direction.
 *
 * st->state is updated ahead of time as if the state stored in
 * @state_to_run had already been run: on bringup it is advanced to that
 * state, on teardown it is moved one below it.
 *
 * Return: true if @state_to_run holds a state to run, false if @target has
 * been reached.
 */
static bool cpuhp_next_state(bool bringup,
                             enum cpuhp_state *state_to_run,
                             struct cpuhp_cpu_state *st,
                             enum cpuhp_state target)
{
        for (;;) {
                if (bringup) {
                        if (st->state >= target)
                                return false;

                        st->state++;
                        *state_to_run = st->state;
                } else {
                        if (st->state <= target)
                                return false;

                        *state_to_run = st->state;
                        st->state--;
                }

                if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
                        return true;
        }
}

/*
 * Invoke the callbacks of all non-empty states between st->state and
 * @target in the direction given by @bringup.
 *
 * With @nofail false the walk stops at the first failing callback and its
 * error code is returned. With @nofail true a failure is only reported with
 * a warning, the walk continues and -1 is returned at the end.
 *
 * Return: 0 on success, otherwise as described above.
 */
static int __cpuhp_invoke_callback_range(bool bringup,
                                         unsigned int cpu,
                                         struct cpuhp_cpu_state *st,
                                         enum cpuhp_state target,
                                         bool nofail)
{
        enum cpuhp_state state;
        int ret = 0;

        while (cpuhp_next_state(bringup, &state, st, target)) {
                int err;

                err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
                if (!err)
                        continue;

                if (nofail) {
                        /*
                         * Report the state whose callback was invoked. On
                         * teardown cpuhp_next_state() has already moved
                         * st->state below it, so st->state would name the
                         * wrong step.
                         */
                        pr_warn("CPU %u %s state %s (%d) failed (%d)\n",
                                cpu, bringup ? "UP" : "DOWN",
                                cpuhp_get_step(state)->name,
                                state, err);
                        ret = -1;
                } else {
                        ret = err;
                        break;
                }
        }

        return ret;
}

/*
 * Run the callbacks between st->state and @target, stopping at the first
 * failure.
 *
 * Return: 0 on success or the error code of the failing callback.
 */
static inline int cpuhp_invoke_callback_range(bool bringup,
                                              unsigned int cpu,
                                              struct cpuhp_cpu_state *st,
                                              enum cpuhp_state target)
{
        return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false);
}

/*
 * Run the callbacks between st->state and @target without stopping on
 * failures. Failing callbacks are reported with a warning only.
 */
static inline void cpuhp_invoke_callback_range_nofail(bool bringup,
                                                      unsigned int cpu,
                                                      struct cpuhp_cpu_state *st,
                                                      enum cpuhp_state target)
{
        __cpuhp_invoke_callback_range(bringup, cpu, st, target, true);
}

/*
 * Tell whether a failed bringup of the CPU described by @st may be rolled
 * back.
 *
 * With CPU hotplug enabled this is always possible. Without it, taking the
 * CPU down is not possible because takedown_cpu() and the architecture and
 * subsystem specific mechanisms are not available. A CPU which would have
 * to be unplugged completely therefore stays around in its current state,
 * and rollback is only allowed up to CPUHP_BRINGUP_CPU.
 */
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
{
        return IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
               st->state <= CPUHP_BRINGUP_CPU;
}

/*
 * Run the startup callbacks of @cpu up to @target. On failure the states
 * which were brought up are torn down again, provided rollback is possible.
 *
 * Return: 0 on success or the error code of the failing startup callback.
 */
static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
                              enum cpuhp_state target)
{
        enum cpuhp_state prev_state = st->state;
        int ret = cpuhp_invoke_callback_range(true, cpu, st, target);

        if (!ret)
                return 0;

        pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n",
                 ret, cpu, cpuhp_get_step(st->state)->name,
                 st->state);

        cpuhp_reset_state(cpu, st, prev_state);
        /* The rollback itself is not expected to fail */
        if (can_rollback_cpu(st))
                WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
                                                    prev_state));
        return ret;
}

/*
 * The cpu hotplug threads manage the bringup and teardown of the cpus
 */
static int cpuhp_should_run(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);

        return st->should_run;
}

/*
 * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
 * callbacks when a state gets [un]installed at runtime.
 *
 * Each invocation of this function by the smpboot thread does a single AP
 * state callback.
 *
 * It has 3 modes of operation:
 *  - single: runs st->cb_state
 *  - up:     runs ++st->state, while st->state < st->target
 *  - down:   runs st->state--, while st->state > st->target
 *
 * When complete or on error, should_run is cleared and the completion is fired.
 */
static void cpuhp_thread_fun(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
        /* Direction is sampled once and used for the whole invocation */
        bool bringup = st->bringup;
        enum cpuhp_state state;

        /* Being invoked without pending work is unexpected; warn once and bail */
        if (WARN_ON_ONCE(!st->should_run))
                return;

        /*
         * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
         * that if we see ->should_run we also see the rest of the state.
         */
        smp_mb();

        /*
         * The BP holds the hotplug lock, but we're now running on the AP,
         * ensure that anybody asserting the lock is held, will actually find
         * it so.
         */
        lockdep_acquire_cpus_lock();
        cpuhp_lock_acquire(bringup);

        if (st->single) {
                /* Single mode: exactly one callback, so no further run is needed */
                state = st->cb_state;
                st->should_run = false;
        } else {
                /*
                 * Up/down mode: cpuhp_next_state() supplies the state to
                 * handle in 'state' and reports whether there is one at all.
                 */
                st->should_run = cpuhp_next_state(bringup, &state, st, st->target);
                if (!st->should_run)
                        goto end;
        }

        WARN_ON_ONCE(!cpuhp_is_ap_state(state));

        if (cpuhp_is_atomic_state(state)) {
                /* Atomic states have their callback invoked with interrupts disabled */
                local_irq_disable();
                st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
                local_irq_enable();

                /*
                 * STARTING/DYING must not fail!
                 */
                WARN_ON_ONCE(st->result);
        } else {
                st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
        }

        if (st->result) {
                /*
                 * If we fail on a rollback, we're up a creek without no
                 * paddle, no way forward, no way back. We lose, thanks for
                 * playing.
                 */
                WARN_ON_ONCE(st->rollback);
                /* An error ends the run; the completion below reports it */
                st->should_run = false;
        }

end:
        /* Drop the annotations in the reverse order of their acquisition */
        cpuhp_lock_release(bringup);
        lockdep_release_cpus_lock();

        /* Done or failed: fire the completion matching the direction */
        if (!st->should_run)
                complete_ap_thread(st, bringup);
}

/*
 * Invoke a single callback on a remote cpu.
 *
 * Returns 0 right away when @cpu is not online, otherwise the result of the
 * callback for @state.
 */
static int
cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
                         struct hlist_node *node)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int ret;

        if (!cpu_online(cpu))
                return 0;

        /* Acquire and immediately release the hotplug lock, both directions */
        cpuhp_lock_acquire(false);
        cpuhp_lock_release(false);

        cpuhp_lock_acquire(true);
        cpuhp_lock_release(true);

        /*
         * Early calls, made before the hotplug thread exists, run the
         * callback directly. Once up and running the hotplug thread is used.
         */
        if (!st->thread)
                return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);

        /* Describe the single callback to the hotplug thread ... */
        st->rollback = false;
        st->last = NULL;

        st->node = node;
        st->bringup = bringup;
        st->cb_state = state;
        st->single = true;

        /* ... and hand it over */
        __cpuhp_kick_ap(st);

        /*
         * A failure which left a partial result behind (st->last is set) is
         * rolled back by running the opposite direction.
         */
        ret = st->result;
        if (ret && st->last) {
                st->rollback = true;
                st->bringup = !bringup;

                __cpuhp_kick_ap(st);
        }

        /*
         * Clean up the leftovers so the next hotplug operation won't use
         * stale data.
         */
        st->last = NULL;
        st->node = NULL;
        return ret;
}

static int cpuhp_kick_ap_work(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        enum cpuhp_state prev_state = st->state;
        int ret;

        cpuhp_lock_acquire(false);
        cpuhp_lock_release(false);

        cpuhp_lock_acquire(true);
        cpuhp_lock_release(true);

        trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
        ret = cpuhp_kick_ap(cpu, st, st->target);
        trace_cpuhp_exit(cpu, st->state, prev_state, ret);

        return ret;
}

/*
 * Descriptor of the per-CPU hotplug threads, registered through
 * smpboot_register_percpu_thread() in cpuhp_threads_init(). The thread
 * pointer lives in cpuhp_state.thread, cpuhp_should_run() tells whether work
 * is pending and cpuhp_thread_fun() carries it out. The threads are named
 * "cpuhp/<cpu>".
 */
static struct smp_hotplug_thread cpuhp_threads = {
        .store                        = &cpuhp_state.thread,
        .thread_should_run        = cpuhp_should_run,
        .thread_fn                = cpuhp_thread_fun,
        .thread_comm                = "cpuhp/%u",
        .selfparking                = true,
};

/* Initialize the up and down completions of every possible CPU */
static __init void cpuhp_init_state(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);

                init_completion(&st->done_up);
                init_completion(&st->done_down);
        }
}

/*
 * Set up the per-CPU hotplug state, register the hotplug threads and unpark
 * the thread of the CPU executing this function.
 */
void __init cpuhp_threads_init(void)
{
        int err;

        cpuhp_init_state();

        /* Failing to register the hotplug threads is fatal */
        err = smpboot_register_percpu_thread(&cpuhp_threads);
        BUG_ON(err);

        kthread_unpark(this_cpu_read(cpuhp_state.thread));
}

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Fallback used when no arch_clear_mm_cpumask_cpu() has been defined before
 * this point: clear @cpu in the cpumask of @mm.
 */
#ifndef arch_clear_mm_cpumask_cpu
#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
#endif

/**
 * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
 * @cpu: a CPU id
 *
 * This function walks all processes, finds a valid mm struct for each one and
 * then clears a corresponding bit in mm's cpumask.  While this all sounds
 * trivial, there are various non-obvious corner cases, which this function
 * tries to solve in a safe manner.
 *
 * Also note that the function uses a somewhat relaxed locking scheme, so it may
 * be called only for an already offlined CPU.
 */
void clear_tasks_mm_cpumask(int cpu)
{
        struct task_struct *proc;

        /*
         * By the time this runs the CPU has been taken down and marked
         * offline, so no task gets this CPU newly set in its mm mask.
         * That makes rcu_read_lock() sufficient here; the full-fledged
         * tasklist_lock is not needed.
         */
        WARN_ON(cpu_online(cpu));
        rcu_read_lock();
        for_each_process(proc) {
                /*
                 * The main thread might have exited while other threads
                 * still have a valid mm. Look for one of those.
                 */
                struct task_struct *owner = find_lock_task_mm(proc);

                if (owner) {
                        arch_clear_mm_cpumask_cpu(cpu, owner->mm);
                        task_unlock(owner);
                }
        }
        rcu_read_unlock();
}

/*
 * Take this CPU down. Handed to stop_machine_cpuslocked() by takedown_cpu();
 * @_param is unused.
 *
 * Returns 0 on success or the negative error from __cpu_disable().
 */
static int take_cpu_down(void *_param)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
        enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
        int cpu = smp_processor_id();
        int ret;

        /* Ensure this CPU doesn't handle any more interrupts. */
        ret = __cpu_disable();
        if (ret < 0)
                return ret;

        /*
         * This is invoked from CPUHP_TEARDOWN_CPU. As the CPU is on its way
         * down the current state has to be CPUHP_TEARDOWN_CPU - 1.
         */
        WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));

        /* Run the former CPU_DYING callbacks. DYING must not fail! */
        cpuhp_invoke_callback_range_nofail(false, cpu, st, target);

        /* Park the stopper thread */
        stop_machine_park(cpu);

        return 0;
}

/*
 * Control-CPU side of taking @cpu down: park its hotplug thread, run
 * take_cpu_down() on it via stop_machine, wait for it to report
 * CPUHP_AP_IDLE_DEAD and then do the post-mortem cleanup.
 *
 * Returns 0 on success or the error from stop_machine_cpuslocked(), in which
 * case the hotplug thread has been unparked again.
 */
static int takedown_cpu(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int err;

        /* Park the smpboot threads */
        kthread_park(st->thread);

        /*
         * Prevent irq alloc/free while the dying cpu reorganizes the
         * interrupt affinities.
         */
        irq_lock_sparse();

        /*
         * So now all preempt/rcu users must observe !cpu_active().
         */
        err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
        if (err) {
                /* CPU refused to die */
                irq_unlock_sparse();
                /* Unpark the hotplug thread so we can rollback there */
                kthread_unpark(st->thread);
                return err;
        }
        /* take_cpu_down() succeeded, so the CPU must no longer be online */
        BUG_ON(cpu_online(cpu));

        /*
         * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed
         * all runnable tasks from the CPU, there's only the idle task left now
         * that the migration thread is done doing the stop_machine thing.
         *
         * Wait for the stop thread to go away.
         */
        wait_for_ap_thread(st, false);
        /* cpuhp_report_idle_dead() sets this state before the completion fires */
        BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);

        /* Interrupts are moved away from the dying cpu, reenable alloc/free */
        irq_unlock_sparse();

        hotplug_cpu__broadcast_tick_pull(cpu);
        /* This actually kills the CPU. */
        __cpu_die(cpu);

        cpuhp_bp_sync_dead(cpu);

        tick_cleanup_dead_cpu(cpu);

        /*
         * Callbacks must be re-integrated right away to the RCU state machine.
         * Otherwise an RCU callback could block a further teardown function
         * waiting for its completion.
         */
        rcutree_migrate_callbacks(cpu);

        return 0;
}

/*
 * Cross-CPU call target used by cpuhp_report_idle_dead(): @arg is the
 * struct cpuhp_cpu_state of the dead CPU whose teardown completion is fired.
 */
static void cpuhp_complete_idle_dead(void *arg)
{
        complete_ap_thread(arg, false);
}

/*
 * Called on the CPU going away: move its hotplug state from CPUHP_AP_OFFLINE
 * to CPUHP_AP_IDLE_DEAD and have an online CPU fire the teardown completion.
 */
void cpuhp_report_idle_dead(void)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
        unsigned int helper_cpu;

        BUG_ON(st->state != CPUHP_AP_OFFLINE);
        tick_assert_timekeeping_handover();
        rcutree_report_cpu_dead();
        st->state = CPUHP_AP_IDLE_DEAD;

        /*
         * complete() cannot be called here any more once
         * rcutree_report_cpu_dead() has run, so the first online CPU is
         * asked to do it on our behalf.
         */
        helper_cpu = cpumask_first(cpu_online_mask);
        smp_call_function_single(helper_cpu, cpuhp_complete_idle_dead, st, 0);
}

static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
                                enum cpuhp_state target)
{
        enum cpuhp_state prev_state = st->state;
        int ret = 0;

        ret = cpuhp_invoke_callback_range(false, cpu, st, target);
        if (ret) {
                pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n",
                         ret, cpu, cpuhp_get_step(st->state)->name,
                         st->state);

                cpuhp_reset_state(cpu, st, prev_state);

                if (st->state < prev_state)
                        WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
                                                            prev_state));
        }

        return ret;
}

/*
 * Take @cpu down to @target. Requires cpu_add_remove_lock to be held.
 *
 * @tasks_frozen is stored in cpuhp_tasks_frozen for the duration of the
 * operation. States above CPUHP_TEARDOWN_CPU are handled by kicking the
 * hotplug thread of @cpu; everything from CPUHP_TEARDOWN_CPU downwards is
 * run from here via cpuhp_down_callbacks().
 *
 * Returns 0 on success, -EBUSY if only one CPU is online, -EINVAL if @cpu is
 * not present, or the error of the failing step.
 */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
                           enum cpuhp_state target)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int prev_state, ret = 0;

        /* Never take down the only online CPU */
        if (num_online_cpus() == 1)
                return -EBUSY;

        if (!cpu_present(cpu))
                return -EINVAL;

        cpus_write_lock();

        cpuhp_tasks_frozen = tasks_frozen;

        prev_state = cpuhp_set_state(cpu, st, target);
        /*
         * If the current CPU state is in the range of the AP hotplug thread,
         * then we need to kick the thread.
         */
        if (st->state > CPUHP_TEARDOWN_CPU) {
                /* The AP thread must not go below CPUHP_TEARDOWN_CPU */
                st->target = max((int)target, CPUHP_TEARDOWN_CPU);
                ret = cpuhp_kick_ap_work(cpu);
                /*
                 * The AP side has done the error rollback already. Just
                 * return the error code..
                 */
                if (ret)
                        goto out;

                /*
                 * We might have stopped still in the range of the AP hotplug
                 * thread. Nothing to do anymore.
                 */
                if (st->state > CPUHP_TEARDOWN_CPU)
                        goto out;

                /* Restore the requested target for the remaining steps */
                st->target = target;
        }
        /*
         * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
         * to do the further cleanups.
         */
        ret = cpuhp_down_callbacks(cpu, st, target);
        if (ret && st->state < prev_state) {
                if (st->state == CPUHP_TEARDOWN_CPU) {
                        /*
                         * Failed with the CPU at CPUHP_TEARDOWN_CPU: reset
                         * towards the previous state and kick the AP thread.
                         */
                        cpuhp_reset_state(cpu, st, prev_state);
                        __cpuhp_kick_ap(st);
                } else {
                        /* Any other state reached on failure is only warned about */
                        WARN(1, "DEAD callback error for CPU%d", cpu);
                }
        }

out:
        cpus_write_unlock();
        /*
         * Do post unplug cleanup. This is still protected against
         * concurrent CPU hotplug via cpu_add_remove_lock.
         */
        lockup_detector_cleanup();
        arch_smt_update();
        return ret;
}

/*
 * Argument bundle passed through work_on_cpu() from cpu_down_maps_locked()
 * to __cpu_down_maps_locked().
 */
struct cpu_down_work {
        unsigned int                cpu;        /* CPU to take down */
        enum cpuhp_state        target;        /* hotplug state to take it down to */
};

static long __cpu_down_maps_locked(void *arg)
{
        struct cpu_down_work *work = arg;

        return _cpu_down(work->cpu, 0, work->target);
}

/*
 * Take @cpu down to @target by running _cpu_down() on another online
 * housekeeping CPU.
 *
 * Returns -EOPNOTSUPP when the platform has hotplug disabled, -EBUSY when
 * hotplug is currently disabled or no other suitable CPU exists, otherwise
 * the result of the offline operation.
 */
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
        struct cpu_down_work work = { .cpu = cpu, .target = target, };
        unsigned int ctrl_cpu;

        /*
         * A platform without hotplug support is reported explicitly so that
         * it can be told apart from a transient offlining failure.
         */
        if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
                return -EOPNOTSUPP;
        if (cpu_hotplug_disabled)
                return -EBUSY;

        /*
         * The control task must not run on the CPU which is about to go
         * offline, otherwise it could deadlock against cfs_b->period_timer.
         * Picking an online housekeeping CPU also keeps at least one of them
         * online, which avoids generating an empty sched_domain span.
         */
        for_each_cpu_and(ctrl_cpu, cpu_online_mask,
                         housekeeping_cpumask(HK_TYPE_DOMAIN)) {
                if (ctrl_cpu == cpu)
                        continue;

                return work_on_cpu(ctrl_cpu, __cpu_down_maps_locked, &work);
        }

        return -EBUSY;
}

/* Take @cpu down to @target inside a cpu_maps_update_begin()/done() section */
static int cpu_down(unsigned int cpu, enum cpuhp_state target)
{
        int ret;

        cpu_maps_update_begin();
        ret = cpu_down_maps_locked(cpu, target);
        cpu_maps_update_done();

        return ret;
}

/**
 * cpu_device_down - Bring down a cpu device
 * @dev: Pointer to the cpu device to offline
 *
 * This function is meant to be used by device core cpu subsystem only.
 *
 * Other subsystems should use remove_cpu() instead.
 *
 * Return: %0 on success or a negative errno code
 */
int cpu_device_down(struct device *dev)
{
        /* The device id is used as the CPU number */
        unsigned int cpu = dev->id;

        return cpu_down(cpu, CPUHP_OFFLINE);
}

/*
 * Offline @cpu through its cpu device, with the device hotplug lock held
 * around the operation. Returns what device_offline() returned.
 */
int remove_cpu(unsigned int cpu)
{
        struct device *dev;
        int ret;

        lock_device_hotplug();
        dev = get_cpu_device(cpu);
        ret = device_offline(dev);
        unlock_device_hotplug();

        return ret;
}
EXPORT_SYMBOL_GPL(remove_cpu);

/*
 * Take every online CPU except @primary_cpu offline. If @primary_cpu is not
 * online, the first online CPU is kept instead. CPU hotplug is left disabled
 * afterwards.
 */
void smp_shutdown_nonboot_cpus(unsigned int primary_cpu)
{
        unsigned int cpu;
        int error;

        cpu_maps_update_begin();

        /*
         * Make certain the cpu I'm about to reboot on is online.
         *
         * This is in line with what migrate_to_reboot_cpu() already does.
         */
        if (!cpu_online(primary_cpu))
                primary_cpu = cpumask_first(cpu_online_mask);

        for_each_online_cpu(cpu) {
                if (cpu == primary_cpu)
                        continue;

                error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
                if (error) {
                        /*
                         * Terminate the message with a newline so that it
                         * is a complete log record of its own.
                         */
                        pr_err("Failed to offline CPU%u - error=%d\n",
                                cpu, error);
                        break;
                }
        }

        /*
         * Ensure all but the reboot CPU are offline.
         */
        BUG_ON(num_online_cpus() > 1);

        /*
         * Make sure the CPUs won't be enabled by someone else after this
         * point. Kexec will reboot to a new kernel shortly resetting
         * everything along the way.
         */
        cpu_hotplug_disabled++;

        cpu_maps_update_done();
}

#else
/*
 * Without CONFIG_HOTPLUG_CPU there is no takedown_cpu() function; the name
 * expands to NULL instead.
 * NOTE(review): presumably consumed as a teardown callback in the hotplug
 * state table - confirm against its users.
 */
#define takedown_cpu                NULL
#endif /*CONFIG_HOTPLUG_CPU*/

/**
 * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
 * @cpu: cpu that just started
 *
 * It must be called by the arch code on the new cpu, before the new cpu
 * enables interrupts and before the "boot" cpu returns from __cpu_up().
 */
void notify_cpu_starting(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);

        rcutree_report_cpu_starting(cpu);        /* Enables RCU usage on this CPU. */
        cpumask_set_cpu(cpu, &cpus_booted_once_mask);

        /*
         * STARTING must not fail!
         */
        cpuhp_invoke_callback_range_nofail(true, cpu, st, target);
}

/*
 * Called from the idle task. Wake up the controlling task which brings the
 * hotplug thread of the upcoming CPU up and then delegates the rest of the
 * online bringup to the hotplug thread.
 */
void cpuhp_online_idle(enum cpuhp_state state)
{
        struct cpuhp_cpu_state *st;

        /* The boot cpu gets here with a different state: nothing to do */
        if (state != CPUHP_AP_ONLINE_IDLE)
                return;

        st = this_cpu_ptr(&cpuhp_state);

        cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE);

        /*
         * The stopper thread is unparked before the idle loop (and with it
         * scheduling) starts, so that the stopper task is always available.
         */
        stop_machine_unpark(smp_processor_id());

        /* Record the new state, then fire the bringup completion */
        st->state = CPUHP_AP_ONLINE_IDLE;
        complete_ap_thread(st, true);
}

/*
 * Bring @cpu up to @target. Requires cpu_add_remove_lock to be held.
 *
 * @tasks_frozen is stored in cpuhp_tasks_frozen for the duration of the
 * operation. If the CPU is already beyond CPUHP_BRINGUP_CPU the hotplug
 * thread of @cpu is kicked; the states up to CPUHP_BRINGUP_CPU are run from
 * here via cpuhp_up_callbacks().
 *
 * Returns 0 on success (including when the CPU already is at or beyond
 * @target), -EINVAL if @cpu is not present, or the error of the failing step.
 */
static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct task_struct *idle;
        int ret = 0;

        cpus_write_lock();

        if (!cpu_present(cpu)) {
                ret = -EINVAL;
                goto out;
        }

        /*
         * The caller of cpu_up() might have raced with another
         * caller. Nothing to do.
         */
        if (st->state >= target)
                goto out;

        if (st->state == CPUHP_OFFLINE) {
                /* Let it fail before we try to bring the cpu up */
                idle = idle_thread_get(cpu);
                if (IS_ERR(idle)) {
                        ret = PTR_ERR(idle);
                        goto out;
                }

                /*
                 * Reset stale stack state from the last time this CPU was online.
                 */
                scs_task_reset(idle);
                kasan_unpoison_task_stack(idle);
        }

        cpuhp_tasks_frozen = tasks_frozen;

        cpuhp_set_state(cpu, st, target);
        /*
         * If the current CPU state is in the range of the AP hotplug thread,
         * then we need to kick the thread once more.
         */
        if (st->state > CPUHP_BRINGUP_CPU) {
                ret = cpuhp_kick_ap_work(cpu);
                /*
                 * The AP side has done the error rollback already. Just
                 * return the error code..
                 */
                if (ret)
                        goto out;
        }

        /*
         * Try to reach the target state. We max out on the BP at
         * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
         * responsible for bringing it up to the target state.
         */
        target = min((int)target, CPUHP_BRINGUP_CPU);
        ret = cpuhp_up_callbacks(cpu, st, target);
out:
        cpus_write_unlock();
        /* Runs on every exit path, success or failure */
        arch_smt_update();
        return ret;
}

/*
 * Bring @cpu up to @target.
 *
 * Returns -EINVAL for a CPU which is not possible, the error of
 * try_online_node() if the node cannot be onlined, -EBUSY while CPU hotplug
 * is disabled, -EPERM if the CPU is not bootable, otherwise the result of
 * _cpu_up().
 */
static int cpu_up(unsigned int cpu, enum cpuhp_state target)
{
        int ret;

        if (!cpu_possible(cpu)) {
                pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
                       cpu);
                return -EINVAL;
        }

        ret = try_online_node(cpu_to_node(cpu));
        if (ret)
                return ret;

        cpu_maps_update_begin();

        if (cpu_hotplug_disabled)
                ret = -EBUSY;
        else if (!cpu_bootable(cpu))
                ret = -EPERM;
        else
                ret = _cpu_up(cpu, 0, target);

        cpu_maps_update_done();

        return ret;
}

/**
 * cpu_device_up - Bring up a cpu device
 * @dev: Pointer to the cpu device to online
 *
 * This function is meant to be used by device core cpu subsystem only.
 *
 * Other subsystems should use add_cpu() instead.
 *
 * Return: %0 on success or a negative errno code
 */
int cpu_device_up(struct device *dev)
{
        /* The device id is used as the CPU number */
        unsigned int cpu = dev->id;

        return cpu_up(cpu, CPUHP_ONLINE);
}

/*
 * Online @cpu through its cpu device, with the device hotplug lock held
 * around the operation. Returns what device_online() returned.
 */
int add_cpu(unsigned int cpu)
{
        struct device *dev;
        int ret;

        lock_device_hotplug();
        dev = get_cpu_device(cpu);
        ret = device_online(dev);
        unlock_device_hotplug();

        return ret;
}
EXPORT_SYMBOL_GPL(add_cpu);

/**
 * bringup_hibernate_cpu - Bring up the CPU that we hibernated on
 * @sleep_cpu: The cpu we hibernated on and should be brought up.
 *
 * On some architectures like arm64, we can hibernate on any CPU, but on
 * wake up the CPU we hibernated on might be offline as a side effect of
 * using maxcpus= for example.
 *
 * Return: %0 on success or a negative errno code
 */
int bringup_hibernate_cpu(unsigned int sleep_cpu)
{
        int err;

        /* Nothing to do when the CPU is online already */
        if (cpu_online(sleep_cpu))
                return 0;

        pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n");
        err = cpu_up(sleep_cpu, CPUHP_ONLINE);
        if (err)
                pr_err("Failed to bring hibernate-CPU up!\n");

        return err;
}

/*
 * Bring the CPUs in @mask up to @target, visiting at most @ncpus of them.
 *
 * @ncpus is counted down with a pre-decrement after each visited CPU, so it
 * has to be non-zero on entry: a zero would wrap around instead of stopping
 * the loop.
 */
static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus,
                                      enum cpuhp_state target)
{
        unsigned int cpu;

        for_each_cpu(cpu, mask) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                int err = cpu_up(cpu, target);

                /*
                 * On failure cpu_up() might have rolled back only as far as
                 * CPUHP_BP_KICK_AP for the final online. Finish the job
                 * here; this is a NOOP if the rollback was complete.
                 */
                if (err && can_rollback_cpu(st))
                        WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE));

                if (--ncpus == 0)
                        break;
        }
}

#ifdef CONFIG_HOTPLUG_PARALLEL
/*
 * Whether parallel bringup is to be attempted. Enabled by default; can be
 * overridden with "cpuhp.parallel=" on the command line and is cleared by
 * cpuhp_bringup_cpus_parallel() when the architecture setup reports false.
 */
static bool __cpuhp_parallel_bringup __ro_after_init = true;

/*
 * Early parameter handler for "cpuhp.parallel": parse @arg as a boolean into
 * __cpuhp_parallel_bringup. Returns the result of kstrtobool().
 */
static int __init parallel_bringup_parse_param(char *arg)
{
        return kstrtobool(arg, &__cpuhp_parallel_bringup);
}
early_param("cpuhp.parallel", parallel_bringup_parse_param);

/* SMT needs consideration as soon as cpu_smt_max_threads exceeds one */
static inline bool cpuhp_smt_aware(void)
{
        return cpu_smt_max_threads >= 2;
}

/* Accessor for cpu_primary_thread_mask */
static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
{
        const struct cpumask *primary = cpu_primary_thread_mask;

        return primary;
}

/*
 * On architectures which have enabled parallel bringup this invokes all BP
 * prepare states for each of the to be onlined APs first. The last state
 * sends the startup IPI to the APs. The APs proceed through the low level
 * bringup code in parallel and then wait for the control CPU to release
 * them one by one for the final onlining procedure.
 *
 * This avoids waiting for each AP to respond to the startup IPI in
 * CPUHP_BRINGUP_CPU.
 */
static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus)
{
        const struct cpumask *mask = cpu_present_mask;

        /*
         * Parallel bringup needs both the command line switch and the
         * architecture setup to agree; otherwise return false so that the
         * caller can fall back.
         */
        if (__cpuhp_parallel_bringup)
                __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup();
        if (!__cpuhp_parallel_bringup)
                return false;

        if (cpuhp_smt_aware()) {
                const struct cpumask *pmask = cpuhp_get_primary_thread_mask();
                static struct cpumask tmp_mask __initdata;

                /*
                 * x86 must not have SMT siblings stopped while the primary
                 * thread performs a microcode update (for various reasons),
                 * so bring the primary threads up first.
                 */
                cpumask_and(&tmp_mask, mask, pmask);
                /* First pass kicks the APs, second pass onlines them */
                cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP);
                cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE);
                /* Account for the online CPUs */
                ncpus -= num_online_cpus();
                if (!ncpus)
                        return true;
                /* Create the mask for secondary CPUs */
                cpumask_andnot(&tmp_mask, mask, pmask);
                mask = &tmp_mask;
        }

        /* Bring the not-yet started CPUs up */
        cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP);
        cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE);
        return true;
}
#else
/*
 * Without CONFIG_HOTPLUG_PARALLEL there is no parallel bringup. Returning
 * false makes bringup_nonboot_cpus() use the serialized bringup.
 */
static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; }
#endif /* CONFIG_HOTPLUG_PARALLEL */

/**
 * bringup_nonboot_cpus - Bring up the present CPUs during boot
 * @max_cpus: CPU budget handed to the bringup helpers; 0 means that no
 *            CPU is to be brought up at all
 *
 * Tries the parallel bringup first and falls back to bringing the present
 * CPUs up one after the other.
 */
void __init bringup_nonboot_cpus(unsigned int max_cpus)
{
        /*
         * A zero budget has to be handled here. cpuhp_bringup_mask() counts
         * its unsigned budget down with a pre-decrement, so a zero would
         * wrap around and online every present CPU instead of none.
         */
        if (!max_cpus)
                return;

        /* Try parallel bringup optimization if enabled */
        if (cpuhp_bringup_cpus_parallel(max_cpus))
                return;

        /* Full per CPU serialized bringup */
        cpuhp_bringup_mask(cpu_present_mask, max_cpus, CPUHP_ONLINE);
}

#ifdef CONFIG_PM_SLEEP_SMP
/*
 * CPUs taken down by freeze_secondary_cpus(); thaw_secondary_cpus() brings
 * them back up and clears the mask. Allocated by alloc_frozen_cpus().
 */
static cpumask_var_t frozen_cpus;

/*
 * Take all online CPUs except @primary down and record them in frozen_cpus.
 *
 * @primary == -1 selects the first online CPU, or a HK_TYPE_TIMER
 * housekeeping CPU if that one is not a housekeeping CPU. A @primary which
 * is not online is replaced by the first online CPU.
 *
 * CPU hotplug is left disabled on return, also on failure. Returns 0 on
 * success, -EBUSY when a wakeup is pending, or the error of _cpu_down().
 */
int freeze_secondary_cpus(int primary)
{
        int cpu, error = 0;

        cpu_maps_update_begin();
        if (primary != -1) {
                if (!cpu_online(primary))
                        primary = cpumask_first(cpu_online_mask);
        } else {
                primary = cpumask_first(cpu_online_mask);
                if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
                        primary = housekeeping_any_cpu(HK_TYPE_TIMER);
        }

        /*
         * All non-boot CPUs are taken down in one go, which avoids races
         * with userspace trying to use CPU hotplug at the same time.
         */
        cpumask_clear(frozen_cpus);

        pr_info("Disabling non-boot CPUs ...\n");
        for_each_online_cpu(cpu) {
                if (cpu == primary)
                        continue;

                if (pm_wakeup_pending()) {
                        pr_info("Wakeup pending. Abort CPU freeze\n");
                        error = -EBUSY;
                        break;
                }

                trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
                error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
                trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
                if (error) {
                        pr_err("Error taking CPU%d down: %d\n", cpu, error);
                        break;
                }

                /* Remember the CPU so that thaw_secondary_cpus() restores it */
                cpumask_set_cpu(cpu, frozen_cpus);
        }

        if (error)
                pr_err("Non-boot CPUs are not disabled\n");
        else
                BUG_ON(num_online_cpus() > 1);

        /*
         * Keep anybody else from enabling the CPUs. This is needed in the
         * failure case as well, because all users of freeze_secondary_cpus()
         * are supposed to call thaw_secondary_cpus() on their failure path.
         */
        cpu_hotplug_disabled++;

        cpu_maps_update_done();
        return error;
}

/*
 * Weak default, does nothing. Called by thaw_secondary_cpus() right before
 * the frozen CPUs are brought back up; a non-weak definition elsewhere takes
 * precedence.
 */
void __weak arch_thaw_secondary_cpus_begin(void)
{
}

/*
 * Weak default, does nothing. Called by thaw_secondary_cpus() right after
 * the loop bringing the frozen CPUs back up; a non-weak definition elsewhere
 * takes precedence.
 */
void __weak arch_thaw_secondary_cpus_end(void)
{
}

/*
 * Re-enable CPU hotplug and bring every CPU recorded in frozen_cpus back
 * online. A CPU failing to come up is logged and skipped; the mask is
 * cleared afterwards either way.
 */
void thaw_secondary_cpus(void)
{
        int cpu, error;

        cpu_maps_update_begin();

        /* Allow everyone to use the CPU hotplug again */
        __cpu_hotplug_enable();

        if (!cpumask_empty(frozen_cpus)) {
                pr_info("Enabling non-boot CPUs ...\n");

                arch_thaw_secondary_cpus_begin();

                for_each_cpu(cpu, frozen_cpus) {
                        trace_suspend_resume(TPS("CPU_ON"), cpu, true);
                        error = _cpu_up(cpu, 1, CPUHP_ONLINE);
                        trace_suspend_resume(TPS("CPU_ON"), cpu, false);

                        if (error)
                                pr_warn("Error taking CPU%d up: %d\n", cpu, error);
                        else
                                pr_info("CPU%d is up\n", cpu);
                }

                arch_thaw_secondary_cpus_end();

                cpumask_clear(frozen_cpus);
        }

        cpu_maps_update_done();
}

/* Allocate the zeroed frozen_cpus mask; returns 0 or -ENOMEM */
static int __init alloc_frozen_cpus(void)
{
        bool allocated = alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO);

        return allocated ? 0 : -ENOMEM;
}
core_initcall(alloc_frozen_cpus);

/*
 * When callbacks for CPU hotplug notifications are being executed, we must
 * ensure that the state of the system with respect to the tasks being frozen
 * or not, as reported by the notification, remains unchanged *throughout the
 * duration* of the execution of the callbacks.
 * Hence we need to prevent the freezer from racing with regular CPU hotplug.
 *
 * This synchronization is implemented by mutually excluding regular CPU
 * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
 * Hibernate notifications.
 */
static int
cpu_hotplug_pm_callback(struct notifier_block *nb,
                        unsigned long action, void *ptr)
{
        switch (action) {
        /* Suspend or hibernation is about to start: no regular hotplug */
        case PM_SUSPEND_PREPARE:
        case PM_HIBERNATION_PREPARE:
                cpu_hotplug_disable();
                return NOTIFY_OK;

        /* Suspend or hibernation is over: allow regular hotplug again */
        case PM_POST_SUSPEND:
        case PM_POST_HIBERNATION:
                cpu_hotplug_enable();
                return NOTIFY_OK;

        /* All other notifications are of no interest here */
        default:
                return NOTIFY_DONE;
        }
}


/*
 * Register cpu_hotplug_pm_callback() as PM notifier with priority 0.
 * Runs as a core_initcall and always returns 0.
 */
static int __init cpu_hotplug_pm_sync_init(void)
{
        /*
         * cpu_hotplug_pm_callback has higher priority than x86
         * bsp_pm_callback which depends on cpu_hotplug_pm_callback
         * to disable cpu hotplug to avoid cpu hotplug race.
         */
        pm_notifier(cpu_hotplug_pm_callback, 0);
        return 0;
}
core_initcall(cpu_hotplug_pm_sync_init);

#endif /* CONFIG_PM_SLEEP_SMP */

/*
 * NOTE(review): presumably the id of the boot CPU; it is neither written nor
 * read in the code visible here - confirm against its users.
 */
int __boot_cpu_id;

#endif /* CONFIG_SMP */

/* Boot processor state steps */
static struct cpuhp_step cpuhp_hp_states[] = {
        [CPUHP_OFFLINE] = {
                .name                        = "offline",
                .startup.single                = NULL,
                .teardown.single        = NULL,
        },
#ifdef CONFIG_SMP
        [CPUHP_CREATE_THREADS]= {
                .name                        = "threads:prepare",
                .startup.single                = smpboot_create_threads,
                .teardown.single        = NULL,
                .cant_stop                = true,
        },
        [CPUHP_PERF_PREPARE] = {
                .name                        = "perf:prepare",
                .startup.single                = perf_event_init_cpu,
                .teardown.single        = perf_event_exit_cpu,
        },
        [CPUHP_RANDOM_PREPARE] = {
                .name                        = "random:prepare",
                .startup.single                = random_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_WORKQUEUE_PREP] = {
                .name                        = "workqueue:prepare",
                .startup.single                = workqueue_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_HRTIMERS_PREPARE] = {
                .name                        = "hrtimers:prepare",
                .startup.single                = hrtimers_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_SMPCFD_PREPARE] = {
                .name                        = "smpcfd:prepare",
                .startup.single                = smpcfd_prepare_cpu,
                .teardown.single        = smpcfd_dead_cpu,
        },
        [CPUHP_RELAY_PREPARE] = {
                .name                        = "relay:prepare",
                .startup.single                = relay_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_RCUTREE_PREP] = {
                .name                        = "RCU/tree:prepare",
                .startup.single                = rcutree_prepare_cpu,
                .teardown.single        = rcutree_dead_cpu,
        },
        /*
         * On the tear-down path, timers_dead_cpu() must be invoked
         * before blk_mq_queue_reinit_notify() from notify_dead(),
         * otherwise a RCU stall occurs.
         */
        [CPUHP_TIMERS_PREPARE] = {
                .name                        = "timers:prepare",
                .startup.single                = timers_prepare_cpu,
                .teardown.single        = timers_dead_cpu,
        },

#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
        /*
         * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until
         * the next step will release it.
         */
        [CPUHP_BP_KICK_AP] = {
                .name                        = "cpu:kick_ap",
                .startup.single                = cpuhp_kick_ap_alive,
        },

        /*
         * Waits for the AP to reach cpuhp_ap_sync_alive() and then
         * releases it for the complete bringup.
         */
        [CPUHP_BRINGUP_CPU] = {
                .name                        = "cpu:bringup",
                .startup.single                = cpuhp_bringup_ap,
                .teardown.single        = finish_cpu,
                .cant_stop                = true,
        },
#else
        /*
         * All-in-one CPU bringup state which includes the kick alive.
         */
        [CPUHP_BRINGUP_CPU] = {
                .name                        = "cpu:bringup",
                .startup.single                = bringup_cpu,
                .teardown.single        = finish_cpu,
                .cant_stop                = true,
        },
#endif
        /* Final state before CPU kills itself */
        [CPUHP_AP_IDLE_DEAD] = {
                .name                        = "idle:dead",
        },
        /*
         * Last state before CPU enters the idle loop to die. Transient state
         * for synchronization.
         */
        [CPUHP_AP_OFFLINE] = {
                .name                        = "ap:offline",
                .cant_stop                = true,
        },
        /* First state is scheduler control. Interrupts are disabled */
        [CPUHP_AP_SCHED_STARTING] = {
                .name                        = "sched:starting",
                .startup.single                = sched_cpu_starting,
                .teardown.single        = sched_cpu_dying,
        },
        [CPUHP_AP_RCUTREE_DYING] = {
                .name                        = "RCU/tree:dying",
                .startup.single                = NULL,
                .teardown.single        = rcutree_dying_cpu,
        },
        [CPUHP_AP_SMPCFD_DYING] = {
                .name                        = "smpcfd:dying",
                .startup.single                = NULL,
                .teardown.single        = smpcfd_dying_cpu,
        },
        [CPUHP_AP_HRTIMERS_DYING] = {
                .name                        = "hrtimers:dying",
                .startup.single                = NULL,
                .teardown.single        = hrtimers_cpu_dying,
        },
        [CPUHP_AP_TICK_DYING] = {
                .name                        = "tick:dying",
                .startup.single                = NULL,
                .teardown.single        = tick_cpu_dying,
        },
        /* Entry state on starting. Interrupts enabled from here on. Transient
         * state for synchronization */
        [CPUHP_AP_ONLINE] = {
                .name                        = "ap:online",
        },
        /*
         * Handled on control processor until the plugged processor manages
         * this itself.
         */
        [CPUHP_TEARDOWN_CPU] = {
                .name                        = "cpu:teardown",
                .startup.single                = NULL,
                .teardown.single        = takedown_cpu,
                .cant_stop                = true,
        },

        [CPUHP_AP_SCHED_WAIT_EMPTY] = {
                .name                        = "sched:waitempty",
                .startup.single                = NULL,
                .teardown.single        = sched_cpu_wait_empty,
        },

        /* Handle smpboot threads park/unpark */
        [CPUHP_AP_SMPBOOT_THREADS] = {
                .name                        = "smpboot/threads:online",
                .startup.single                = smpboot_unpark_threads,
                .teardown.single        = smpboot_park_threads,
        },
        [CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
                .name                        = "irq/affinity:online",
                .startup.single                = irq_affinity_online_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_AP_PERF_ONLINE] = {
                .name                        = "perf:online",
                .startup.single                = perf_event_init_cpu,
                .teardown.single        = perf_event_exit_cpu,
        },
        [CPUHP_AP_WATCHDOG_ONLINE] = {
                .name                        = "lockup_detector:online",
                .startup.single                = lockup_detector_online_cpu,
                .teardown.single        = lockup_detector_offline_cpu,
        },
        [CPUHP_AP_WORKQUEUE_ONLINE] = {
                .name                        = "workqueue:online",
                .startup.single                = workqueue_online_cpu,
                .teardown.single        = workqueue_offline_cpu,
        },
        [CPUHP_AP_RANDOM_ONLINE] = {
                .name                        = "random:online",
                .startup.single                = random_online_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_AP_RCUTREE_ONLINE] = {
                .name                        = "RCU/tree:online",
                .startup.single                = rcutree_online_cpu,
                .teardown.single        = rcutree_offline_cpu,
        },
#endif
        /*
         * The dynamically registered state space is here
         */

#ifdef CONFIG_SMP
        /* Last state is scheduler control setting the cpu active */
        [CPUHP_AP_ACTIVE] = {
                .name                        = "sched:active",
                .startup.single                = sched_cpu_activate,
                .teardown.single        = sched_cpu_deactivate,
        },
#endif

        /* CPU is fully up and running. */
        [CPUHP_ONLINE] = {
                .name                        = "online",
                .startup.single                = NULL,
                .teardown.single        = NULL,
        },
};

/* Sanity check for callbacks */
static int cpuhp_cb_check(enum cpuhp_state state)
{
        if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE)
                return -EINVAL;
        return 0;
}

/*
 * Returns a free for dynamic slot assignment of the Online state. The states
 * are protected by the cpuhp_slot_states mutex and an empty slot is identified
 * by having no name assigned.
 */
static int cpuhp_reserve_state(enum cpuhp_state state)
{
        enum cpuhp_state i, end;
        struct cpuhp_step *step;

        switch (state) {
        case CPUHP_AP_ONLINE_DYN:
                step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN;
                end = CPUHP_AP_ONLINE_DYN_END;
                break;
        case CPUHP_BP_PREPARE_DYN:
                step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
                end = CPUHP_BP_PREPARE_DYN_END;
                break;
        default:
                return -EINVAL;
        }

        for (i = state; i <= end; i++, step++) {
                if (!step->name)
                        return i;
        }
        WARN(1, "No more dynamic states available for CPU hotplug\n");
        return -ENOSPC;
}

/*
 * (Un)install the callbacks of a hotplug state for further cpu hotplug
 * operations.
 *
 * @state:          State to modify, or the start of a dynamic range
 * @name:           Name of the step; NULL removes the state
 * @startup:        Startup callback (may be NULL)
 * @teardown:       Teardown callback (may be NULL)
 * @multi_instance: Whether the state carries multiple instances
 *
 * Return: 0 for a fixed state, the allocated positive state number when a
 * dynamic slot was reserved, -EBUSY if a named state would be overwritten,
 * or the error returned by cpuhp_reserve_state().
 */
static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
                                 int (*startup)(unsigned int cpu),
                                 int (*teardown)(unsigned int cpu),
                                 bool multi_instance)
{
        bool dyn_range = state == CPUHP_AP_ONLINE_DYN ||
                         state == CPUHP_BP_PREPARE_DYN;
        struct cpuhp_step *step;
        int slot = 0;

        /*
         * A NULL name means the state gets removed.
         *
         * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are themselves handed
         * out by the first allocation from their dynamic ranges. Allocating
         * on removal would therefore pick a new (already empty) slot and
         * clear that one, leaving the callbacks of the state which should be
         * cleared dangling. That wrecks the next hotplug operation, so a
         * slot is only reserved on installation.
         */
        if (name && dyn_range) {
                slot = cpuhp_reserve_state(state);
                if (slot < 0)
                        return slot;
                state = slot;
        }

        step = cpuhp_get_step(state);

        /* Refuse to install over a state which already has a name */
        if (name && step->name)
                return -EBUSY;

        step->startup.single = startup;
        step->teardown.single = teardown;
        step->name = name;
        step->multi_instance = multi_instance;
        INIT_HLIST_HEAD(&step->list);

        return slot;
}

static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
{
        return cpuhp_get_step(state)->teardown.single;
}

/*
 * Invoke the startup (@bringup true) or teardown (@bringup false) function
 * of a step, either on the AP or on the current CPU.
 *
 * Return: 0 if the step has nothing to do, otherwise the result of the
 * callback. A failing teardown hits BUG_ON().
 */
static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
                            struct hlist_node *node)
{
        struct cpuhp_step *step = cpuhp_get_step(state);
        int err;

        /*
         * Nothing to do for an empty step.
         * Relies on the union for multi_instance.
         */
        if (cpuhp_step_empty(bringup, step))
                return 0;

        /*
         * The non AP bound callbacks can fail on bringup. On teardown
         * e.g. module removal we crash for now.
         */
#ifdef CONFIG_SMP
        err = cpuhp_is_ap_state(state) ?
                cpuhp_invoke_ap_callback(cpu, state, bringup, node) :
                cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#else
        err = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#endif
        BUG_ON(err && !bringup);
        return err;
}

/*
 * Undo a partially completed state installation after a recoverable
 * failure on @failedcpu.
 *
 * Note: The teardown callbacks used for the rollback must not fail!
 */
static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
                                   struct hlist_node *node)
{
        int cpu;

        /* Only the present CPUs iterated before @failedcpu need a rollback */
        for_each_present_cpu(cpu) {
                struct cpuhp_cpu_state *st;
                int reached;

                if (cpu >= failedcpu)
                        break;

                st = per_cpu_ptr(&cpuhp_state, cpu);
                reached = st->state;

                /* The startup call was only issued on CPUs at or above @state */
                if (reached >= state)
                        cpuhp_issue_call(cpu, state, false, node);
        }
}

/*
 * Add an instance to a multi-instance state. The caller must hold the
 * cpus lock (asserted via lockdep).
 *
 * When @invoke is true and the state has a startup callback, the callback
 * is run for @node on every present CPU whose hotplug state is at or above
 * @state. On failure the already handled CPUs are rolled back, provided a
 * teardown callback exists, and @node is not added.
 *
 * Return: 0 on success, -EINVAL if @state is not a multi-instance state,
 * or the error code of the failing startup callback.
 */
int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
                                          struct hlist_node *node,
                                          bool invoke)
{
        struct cpuhp_step *step;
        int cpu, err = 0;

        lockdep_assert_cpus_held();

        step = cpuhp_get_step(state);
        if (!step->multi_instance)
                return -EINVAL;

        mutex_lock(&cpuhp_state_mutex);

        if (invoke && step->startup.multi) {
                /*
                 * Try to call the startup callback for each present cpu
                 * depending on the hotplug state of the cpu.
                 */
                for_each_present_cpu(cpu) {
                        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                        int reached = st->state;

                        if (reached < state)
                                continue;

                        err = cpuhp_issue_call(cpu, state, true, node);
                        if (!err)
                                continue;

                        if (step->teardown.multi)
                                cpuhp_rollback_install(cpu, state, node);
                        goto unlock;
                }
        }

        err = 0;
        hlist_add_head(node, &step->list);
unlock:
        mutex_unlock(&cpuhp_state_mutex);
        return err;
}

/*
 * Same as __cpuhp_state_add_instance_cpuslocked(), but takes and releases
 * the cpus read lock around the operation.
 */
int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
                               bool invoke)
{
        int err;

        cpus_read_lock();
        err = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
        cpus_read_unlock();

        return err;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);

/**
 * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
 * @state:                The state to setup
 * @name:                Name of the step
 * @invoke:                If true, the startup function is invoked for cpus where
 *                        cpu state >= @state
 * @startup:                startup callback function
 * @teardown:                teardown callback function
 * @multi_instance:        State is set up for multiple instances which get
 *                        added afterwards.
 *
 * The caller needs to hold cpus read locked while calling this function.
 * Return:
 *   On success:
 *      Positive state number if @state is CPUHP_AP_ONLINE_DYN or
 *      CPUHP_BP_PREPARE_DYN;
 *      0 for all other states
 *   On failure: proper (negative) error code
 */
int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
                                   const char *name, bool invoke,
                                   int (*startup)(unsigned int cpu),
                                   int (*teardown)(unsigned int cpu),
                                   bool multi_instance)
{
        int cpu, ret = 0;
        bool dynstate;

        lockdep_assert_cpus_held();

        if (cpuhp_cb_check(state) || !name)
                return -EINVAL;

        mutex_lock(&cpuhp_state_mutex);

        ret = cpuhp_store_callbacks(state, name, startup, teardown,
                                    multi_instance);

        /*
         * cpuhp_store_callbacks() allocates a slot for both dynamic ranges
         * and reports it as a positive return value. Both ranges must be
         * recognised here, otherwise the positive value is mistaken for a
         * failure below and the startup callbacks are never invoked.
         */
        dynstate = state == CPUHP_AP_ONLINE_DYN ||
                   state == CPUHP_BP_PREPARE_DYN;
        if (ret > 0 && dynstate) {
                state = ret;
                ret = 0;
        }

        if (ret || !invoke || !startup)
                goto out;

        /*
         * Try to call the startup callback for each present cpu
         * depending on the hotplug state of the cpu.
         */
        for_each_present_cpu(cpu) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                int cpustate = st->state;

                if (cpustate < state)
                        continue;

                ret = cpuhp_issue_call(cpu, state, true, NULL);
                if (ret) {
                        /* Undo the CPUs handled so far, then drop the state */
                        if (teardown)
                                cpuhp_rollback_install(cpu, state, NULL);
                        cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
                        goto out;
                }
        }
out:
        mutex_unlock(&cpuhp_state_mutex);
        /*
         * If the requested state is one of the dynamic ranges, return the
         * dynamically allocated state in case of success.
         */
        if (!ret && dynstate)
                return state;
        return ret;
}
EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);

/*
 * Same as __cpuhp_setup_state_cpuslocked(), but takes and releases the
 * cpus read lock around the operation. The return value is passed through
 * unchanged.
 */
int __cpuhp_setup_state(enum cpuhp_state state,
                        const char *name, bool invoke,
                        int (*startup)(unsigned int cpu),
                        int (*teardown)(unsigned int cpu),
                        bool multi_instance)
{
        int err;

        cpus_read_lock();
        err = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
                                             teardown, multi_instance);
        cpus_read_unlock();

        return err;
}
EXPORT_SYMBOL(__cpuhp_setup_state);

/*
 * Remove an instance from a multi-instance state.
 *
 * When @invoke is true and the state has a teardown callback, the callback
 * is run for @node on every present CPU whose hotplug state is at or above
 * @state before @node is unlinked.
 *
 * Return: 0 on success, -EINVAL if @state is not a multi-instance state.
 * An invalid @state triggers BUG_ON().
 */
int __cpuhp_state_remove_instance(enum cpuhp_state state,
                                  struct hlist_node *node, bool invoke)
{
        struct cpuhp_step *step = cpuhp_get_step(state);
        int cpu;

        BUG_ON(cpuhp_cb_check(state));

        if (!step->multi_instance)
                return -EINVAL;

        cpus_read_lock();
        mutex_lock(&cpuhp_state_mutex);

        if (invoke && cpuhp_get_teardown_cb(state)) {
                /*
                 * Call the teardown callback for each present cpu depending
                 * on the hotplug state of the cpu. This function is not
                 * allowed to fail currently!
                 */
                for_each_present_cpu(cpu) {
                        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                        int reached = st->state;

                        if (reached >= state)
                                cpuhp_issue_call(cpu, state, false, node);
                }
        }

        hlist_del(node);
        mutex_unlock(&cpuhp_state_mutex);
        cpus_read_unlock();

        return 0;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);

/**
 * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state
 * @state:        The state to remove
 * @invoke:        If true, the teardown function is invoked for cpus where
 *                cpu state >= @state
 *
 * The caller needs to hold cpus read locked while calling this function.
 * The teardown callback is currently not allowed to fail. Think
 * about module removal!
 *
 * For a multi-instance state no teardown is invoked here; a warning is
 * issued if instances are still attached.
 */
void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
{
        struct cpuhp_step *step = cpuhp_get_step(state);
        int cpu;

        BUG_ON(cpuhp_cb_check(state));

        lockdep_assert_cpus_held();

        mutex_lock(&cpuhp_state_mutex);

        if (step->multi_instance) {
                WARN(!hlist_empty(&step->list),
                     "Error: Removing state %d which has instances left.\n",
                     state);
        } else if (invoke && cpuhp_get_teardown_cb(state)) {
                /*
                 * Call the teardown callback for each present cpu depending
                 * on the hotplug state of the cpu. This function is not
                 * allowed to fail currently!
                 */
                for_each_present_cpu(cpu) {
                        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                        int reached = st->state;

                        if (reached >= state)
                                cpuhp_issue_call(cpu, state, false, NULL);
                }
        }

        /* A NULL name clears the state */
        cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
        mutex_unlock(&cpuhp_state_mutex);
}
EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);

/*
 * Same as __cpuhp_remove_state_cpuslocked(), but takes and releases the
 * cpus read lock around the operation.
 */
void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
{
        cpus_read_lock();
        __cpuhp_remove_state_cpuslocked(state, invoke);
        cpus_read_unlock();
}
EXPORT_SYMBOL(__cpuhp_remove_state);

#ifdef CONFIG_HOTPLUG_SMT
/* Flag the device of @cpu as offline and send a KOBJ_OFFLINE uevent. */
static void cpuhp_offline_cpu_device(unsigned int cpu)
{
        struct device *cpu_dev = get_cpu_device(cpu);

        cpu_dev->offline = true;
        /* Notify user space about the state change */
        kobject_uevent(&cpu_dev->kobj, KOBJ_OFFLINE);
}

/* Flag the device of @cpu as online and send a KOBJ_ONLINE uevent. */
static void cpuhp_online_cpu_device(unsigned int cpu)
{
        struct device *cpu_dev = get_cpu_device(cpu);

        cpu_dev->offline = false;
        /* Notify user space about the state change */
        kobject_uevent(&cpu_dev->kobj, KOBJ_ONLINE);
}

/*
 * Take online non-primary SMT threads down to CPUHP_OFFLINE and, if all of
 * them went down, record @ctrlval as the new SMT control state.
 *
 * Return: 0 on success, or the error of the first failing
 * cpu_down_maps_locked() call, in which case cpu_smt_control is left
 * untouched.
 */
int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
{
        int cpu, err = 0;

        cpu_maps_update_begin();
        for_each_online_cpu(cpu) {
                /*
                 * Primary threads are skipped. Disable can also be called
                 * with CPU_SMT_ENABLED when changing from a higher to lower
                 * number of SMT threads per core; threads which are still
                 * allowed are skipped in that case.
                 */
                if (topology_is_primary_thread(cpu) ||
                    (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)))
                        continue;

                err = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
                if (err)
                        break;
                /*
                 * As this needs to hold the cpu maps lock it's impossible
                 * to call device_offline() because that ends up calling
                 * cpu_down() which takes cpu maps lock. cpu maps lock
                 * needs to be held as this might race against in kernel
                 * abusers of the hotplug machinery (thermal management).
                 *
                 * So nothing would update device:offline state. That would
                 * leave the sysfs entry stale and prevent onlining after
                 * smt control has been changed to 'off' again. This is
                 * called under the sysfs hotplug lock, so it is properly
                 * serialized against the regular offline usage.
                 */
                cpuhp_offline_cpu_device(cpu);
        }

        /* Only commit the new control value if every offline succeeded */
        if (!err)
                cpu_smt_control = ctrlval;
        cpu_maps_update_done();

        return err;
}

/*
 * Set the SMT control state to CPU_SMT_ENABLED and bring up every present
 * CPU which is offline, sits on an online node and is an allowed SMT thread.
 *
 * Return: 0 on success, or the error of the first failing _cpu_up() call.
 */
int cpuhp_smt_enable(void)
{
        int cpu, err = 0;

        cpu_maps_update_begin();
        cpu_smt_control = CPU_SMT_ENABLED;
        for_each_present_cpu(cpu) {
                /*
                 * Skip online CPUs, CPUs on offline nodes and threads which
                 * are not allowed.
                 */
                if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)) ||
                    !cpu_smt_thread_allowed(cpu))
                        continue;

                err = _cpu_up(cpu, 0, CPUHP_ONLINE);
                if (err)
                        break;
                /* See comment in cpuhp_smt_disable() */
                cpuhp_online_cpu_device(cpu);
        }
        cpu_maps_update_done();

        return err;
}
#endif

#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
/*
 * sysfs show callback: emit the current hotplug state of the CPU identified
 * by @dev->id as a decimal number followed by a newline.
 *
 * Uses sysfs_emit() rather than sprintf() so the output is bounded to the
 * sysfs buffer, matching the other show callbacks in this file.
 */
static ssize_t state_show(struct device *dev,
                          struct device_attribute *attr, char *buf)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);

        return sysfs_emit(buf, "%d\n", st->state);
}
static DEVICE_ATTR_RO(state);

/*
 * sysfs store callback: parse a decimal target state from @buf and move the
 * CPU identified by @dev->id up or down to it.
 *
 * With CONFIG_CPU_HOTPLUG_STATE_CONTROL any state from CPUHP_OFFLINE to
 * CPUHP_ONLINE is accepted, otherwise only those two end points. Targets
 * without a name or marked cant_stop are rejected.
 *
 * Return: @count on success, a negative error code otherwise.
 */
static ssize_t target_store(struct device *dev, struct device_attribute *attr,
                            const char *buf, size_t count)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
        struct cpuhp_step *step;
        int target, err;

        err = kstrtoint(buf, 10, &target);
        if (err)
                return err;

#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL
        if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE)
                return -EINVAL;
#else
        if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE)
                return -EINVAL;
#endif

        err = lock_device_hotplug_sysfs();
        if (err)
                return err;

        /* The step table is only inspected under cpuhp_state_mutex */
        mutex_lock(&cpuhp_state_mutex);
        step = cpuhp_get_step(target);
        if (!step->name || step->cant_stop)
                err = -EINVAL;
        mutex_unlock(&cpuhp_state_mutex);

        if (!err) {
                if (st->state < target)
                        err = cpu_up(dev->id, target);
                else if (st->state > target)
                        err = cpu_down(dev->id, target);
                else if (WARN_ON(st->target != target))
                        st->target = target;
        }

        unlock_device_hotplug();
        return err ? err : count;
}

/*
 * sysfs show callback: emit the target hotplug state of the CPU identified
 * by @dev->id as a decimal number followed by a newline.
 *
 * Uses sysfs_emit() rather than sprintf() so the output is bounded to the
 * sysfs buffer, matching the other show callbacks in this file.
 */
static ssize_t target_show(struct device *dev,
                           struct device_attribute *attr, char *buf)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);

        return sysfs_emit(buf, "%d\n", st->target);
}
static DEVICE_ATTR_RW(target);

/*
 * sysfs store callback: parse a decimal state from @buf and record it in
 * the fail field of the CPU identified by @dev->id. Writing CPUHP_INVALID
 * is always accepted.
 *
 * Return: @count on success, a negative error code otherwise.
 */
static ssize_t fail_store(struct device *dev, struct device_attribute *attr,
                          const char *buf, size_t count)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
        struct cpuhp_step *step;
        bool has_callback;
        int fail, err;

        err = kstrtoint(buf, 10, &fail);
        if (err)
                return err;

        if (fail == CPUHP_INVALID) {
                st->fail = fail;
                return count;
        }

        if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
                return -EINVAL;

        /*
         * Cannot fail STARTING/DYING callbacks.
         */
        if (cpuhp_is_atomic_state(fail))
                return -EINVAL;

        /*
         * DEAD callbacks cannot fail...
         * ... neither can CPUHP_BRINGUP_CPU during hotunplug. The latter
         * triggering STARTING callbacks, a failure in this state would
         * hinder rollback.
         */
        if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU)
                return -EINVAL;

        /*
         * Cannot fail anything that doesn't have callbacks.
         */
        mutex_lock(&cpuhp_state_mutex);
        step = cpuhp_get_step(fail);
        has_callback = step->startup.single || step->teardown.single;
        mutex_unlock(&cpuhp_state_mutex);
        if (!has_callback)
                return -EINVAL;

        st->fail = fail;

        return count;
}

/*
 * sysfs show callback: emit the fail field of the CPU identified by
 * @dev->id as a decimal number followed by a newline.
 *
 * Uses sysfs_emit() rather than sprintf() so the output is bounded to the
 * sysfs buffer, matching the other show callbacks in this file.
 */
static ssize_t fail_show(struct device *dev,
                         struct device_attribute *attr, char *buf)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);

        return sysfs_emit(buf, "%d\n", st->fail);
}

static DEVICE_ATTR_RW(fail);

/*
 * NULL-terminated list of the per-CPU hotplug attributes (state, target,
 * fail) referenced by cpuhp_cpu_attr_group.
 */
static struct attribute *cpuhp_cpu_attrs[] = {
        &dev_attr_state.attr,
        &dev_attr_target.attr,
        &dev_attr_fail.attr,
        NULL
};

/*
 * Attribute group "hotplug" holding the per-CPU hotplug attributes. It is
 * created on each CPU device by cpuhp_sysfs_init().
 *
 * Only designated initializers are used: a trailing positional NULL would
 * bind to whichever member happens to follow .name in the structure.
 * Members not named here are zero-initialized anyway.
 */
static const struct attribute_group cpuhp_cpu_attr_group = {
        .attrs = cpuhp_cpu_attrs,
        .name = "hotplug",
};

/*
 * sysfs show callback: list every named hotplug state from CPUHP_OFFLINE to
 * CPUHP_ONLINE, one "<number>: <name>" line per state.
 *
 * The lines are appended with sysfs_emit_at() so that the accumulated
 * output can never run past the sysfs buffer, which an unbounded sprintf()
 * chain does not guarantee.
 *
 * Return: number of bytes written to @buf.
 */
static ssize_t states_show(struct device *dev,
                                 struct device_attribute *attr, char *buf)
{
        ssize_t res = 0;
        int i;

        /* The step names are only read under cpuhp_state_mutex */
        mutex_lock(&cpuhp_state_mutex);
        for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) {
                struct cpuhp_step *sp = cpuhp_get_step(i);

                /* States without a name are not listed */
                if (sp->name)
                        res += sysfs_emit_at(buf, res, "%3d: %s\n", i, sp->name);
        }
        mutex_unlock(&cpuhp_state_mutex);
        return res;
}
static DEVICE_ATTR_RO(states);

/*
 * NULL-terminated list of the hotplug attributes (states) referenced by
 * cpuhp_cpu_root_attr_group.
 */
static struct attribute *cpuhp_cpu_root_attrs[] = {
        &dev_attr_states.attr,
        NULL
};

/*
 * Attribute group "hotplug" holding the global states listing. It is
 * created on the cpu subsystem root device by cpuhp_sysfs_init().
 *
 * Only designated initializers are used: a trailing positional NULL would
 * bind to whichever member happens to follow .name in the structure.
 * Members not named here are zero-initialized anyway.
 */
static const struct attribute_group cpuhp_cpu_root_attr_group = {
        .attrs = cpuhp_cpu_root_attrs,
        .name = "hotplug",
};

#ifdef CONFIG_HOTPLUG_SMT

/*
 * Check whether @threads is an acceptable number of SMT threads.
 *
 * With CONFIG_SMT_NUM_THREADS_DYNAMIC any value from 1 up to
 * cpu_smt_max_threads is valid, otherwise only 1 or cpu_smt_max_threads.
 */
static bool cpu_smt_num_threads_valid(unsigned int threads)
{
        if (!IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC))
                return threads == 1 || threads == cpu_smt_max_threads;

        return threads >= 1 && threads <= cpu_smt_max_threads;
}

/*
 * Handle a write to the SMT control attribute.
 *
 * Accepted input is "on", "off", "forceoff" or a decimal thread count.
 * Depending on how the requested thread count compares to the current one,
 * SMT threads are brought up or taken down. A first request for "forceoff"
 * goes through the disable path even if the thread count is unchanged.
 *
 * Return: @count on success; -EPERM if SMT is force disabled, -ENODEV if
 * SMT is not supported, -EINVAL for unparsable or invalid input, or the
 * error of the lock/enable/disable operation.
 */
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
                    const char *buf, size_t count)
{
        int ctrlval, err, num_threads, prev_threads;
        bool force_off;

        if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
                return -EPERM;

        if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
                return -ENODEV;

        if (sysfs_streq(buf, "on")) {
                ctrlval = CPU_SMT_ENABLED;
                num_threads = cpu_smt_max_threads;
        } else if (sysfs_streq(buf, "off")) {
                ctrlval = CPU_SMT_DISABLED;
                num_threads = 1;
        } else if (sysfs_streq(buf, "forceoff")) {
                ctrlval = CPU_SMT_FORCE_DISABLED;
                num_threads = 1;
        } else if (!kstrtoint(buf, 10, &num_threads)) {
                /* A single thread means "off", any other valid count "on" */
                if (num_threads != 1 && !cpu_smt_num_threads_valid(num_threads))
                        return -EINVAL;
                ctrlval = num_threads == 1 ? CPU_SMT_DISABLED : CPU_SMT_ENABLED;
        } else {
                return -EINVAL;
        }

        err = lock_device_hotplug_sysfs();
        if (err)
                return err;

        prev_threads = cpu_smt_num_threads;
        cpu_smt_num_threads = num_threads;

        /* "forceoff" has to be applied even when the thread count is unchanged */
        force_off = ctrlval == CPU_SMT_FORCE_DISABLED &&
                    ctrlval != cpu_smt_control;

        if (num_threads > prev_threads)
                err = cpuhp_smt_enable();
        else if (num_threads < prev_threads || force_off)
                err = cpuhp_smt_disable(ctrlval);

        unlock_device_hotplug();
        return err ? err : count;
}

#else /* !CONFIG_HOTPLUG_SMT */
/*
 * Stub used when CONFIG_HOTPLUG_SMT is disabled: SMT control writes are
 * always rejected with -ENODEV.
 */
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
                    const char *buf, size_t count)
{
        return -ENODEV;
}
#endif /* CONFIG_HOTPLUG_SMT */

/*
 * Strings shown by control_show() for each SMT control value; the table is
 * indexed by cpu_smt_control.
 */
static const char *smt_states[] = {
        [CPU_SMT_ENABLED]                = "on",
        [CPU_SMT_DISABLED]                = "off",
        [CPU_SMT_FORCE_DISABLED]        = "forceoff",
        [CPU_SMT_NOT_SUPPORTED]                = "notsupported",
        [CPU_SMT_NOT_IMPLEMENTED]        = "notimplemented",
};

/*
 * sysfs show callback for the SMT control attribute.
 *
 * With CONFIG_HOTPLUG_SMT, if SMT is enabled but not all threads are
 * enabled, the number of enabled threads is shown. In every other case the
 * name of the current control state from smt_states[] is shown.
 */
static ssize_t control_show(struct device *dev,
                            struct device_attribute *attr, char *buf)
{
#ifdef CONFIG_HOTPLUG_SMT
        bool partial = cpu_smt_control == CPU_SMT_ENABLED &&
                       cpu_smt_num_threads != cpu_smt_max_threads;

        /* Partially enabled SMT is reported as a thread count */
        if (partial)
                return sysfs_emit(buf, "%d\n", cpu_smt_num_threads);
#endif

        return sysfs_emit(buf, "%s\n", smt_states[cpu_smt_control]);
}

/*
 * sysfs store callback for the SMT control attribute. All work is done by
 * __store_smt_control(), whose implementation depends on
 * CONFIG_HOTPLUG_SMT; its return value is passed through unchanged.
 */
static ssize_t control_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t count)
{
        return __store_smt_control(dev, attr, buf, count);
}
static DEVICE_ATTR_RW(control);

/*
 * sysfs show callback: emit the value of sched_smt_active() as a decimal
 * number followed by a newline.
 */
static ssize_t active_show(struct device *dev,
                           struct device_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%d\n", sched_smt_active());
}
static DEVICE_ATTR_RO(active);

/*
 * NULL-terminated list of the SMT attributes (control, active) referenced
 * by cpuhp_smt_attr_group.
 */
static struct attribute *cpuhp_smt_attrs[] = {
        &dev_attr_control.attr,
        &dev_attr_active.attr,
        NULL
};

/*
 * Attribute group "smt" holding the SMT attributes. It is created on the
 * cpu subsystem root device by cpu_smt_sysfs_init().
 *
 * Only designated initializers are used: a trailing positional NULL would
 * bind to whichever member happens to follow .name in the structure.
 * Members not named here are zero-initialized anyway.
 */
static const struct attribute_group cpuhp_smt_attr_group = {
        .attrs = cpuhp_smt_attrs,
        .name = "smt",
};

/*
 * Create the "smt" attribute group on the cpu subsystem root device.
 *
 * Return: result of sysfs_create_group(), or -ENODEV if the subsystem has
 * no root device.
 */
static int __init cpu_smt_sysfs_init(void)
{
        struct device *root = bus_get_dev_root(&cpu_subsys);
        int err;

        if (!root)
                return -ENODEV;

        err = sysfs_create_group(&root->kobj, &cpuhp_smt_attr_group);
        /* Drop the reference taken by bus_get_dev_root() */
        put_device(root);

        return err;
}

/*
 * Register the hotplug sysfs interface: the SMT group, the global "hotplug"
 * group on the cpu subsystem root device (if there is one) and the per-CPU
 * "hotplug" group on every possible CPU which has a device.
 *
 * Return: 0 on success, or the first error encountered. Groups created
 * before the failure are left in place.
 */
static int __init cpuhp_sysfs_init(void)
{
        struct device *root;
        int cpu, err;

        err = cpu_smt_sysfs_init();
        if (err)
                return err;

        root = bus_get_dev_root(&cpu_subsys);
        if (root) {
                err = sysfs_create_group(&root->kobj, &cpuhp_cpu_root_attr_group);
                put_device(root);
                if (err)
                        return err;
        }

        for_each_possible_cpu(cpu) {
                struct device *cpu_dev = get_cpu_device(cpu);

                /* CPUs without a device are silently skipped */
                if (cpu_dev) {
                        err = sysfs_create_group(&cpu_dev->kobj, &cpuhp_cpu_attr_group);
                        if (err)
                                return err;
                }
        }

        return 0;
}
device_initcall(cpuhp_sysfs_init);
#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */

/*
 * cpu_bit_bitmap[] is a special, "compressed" data structure that
 * represents the single-bit values 1 << nr for all NR_CPUS bits.
 *
 * It is used by cpumask_of() to get a constant address to a CPU
 * mask value that has a single bit set only.
 *
 * Layout as initialized below: row x + 1 has bit x set in its first word,
 * for x in [0, BITS_PER_LONG); every other word is zero.
 */

/*
 * cpu_bit_bitmap[0] is empty - so we can back into it.
 *
 * MASK_DECLARE_1(x) initializes word 0 of row x + 1 to 1UL << x; the
 * _2/_4/_8 variants expand to that many consecutive rows starting at x.
 */
#define MASK_DECLARE_1(x)        [x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x)        MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x)        MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x)        MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)

const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {

        MASK_DECLARE_8(0),        MASK_DECLARE_8(8),
        MASK_DECLARE_8(16),        MASK_DECLARE_8(24),
#if BITS_PER_LONG > 32
        /* Rows for bits 32..63 only exist on targets with wider longs */
        MASK_DECLARE_8(32),        MASK_DECLARE_8(40),
        MASK_DECLARE_8(48),        MASK_DECLARE_8(56),
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);

/* Constant bitmap of NR_CPUS bits, initialized to CPU_BITS_ALL. */
const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);

/*
 * Possible-CPU mask. With CONFIG_INIT_ALL_POSSIBLE it starts out as
 * CPU_BITS_ALL; otherwise it has no initializer and starts out zeroed.
 */
#ifdef CONFIG_INIT_ALL_POSSIBLE
struct cpumask __cpu_possible_mask __ro_after_init
        = {CPU_BITS_ALL};
#else
struct cpumask __cpu_possible_mask __ro_after_init;
#endif
EXPORT_SYMBOL(__cpu_possible_mask);

/* Online-CPU mask; updated by set_cpu_online() and init_cpu_online(). */
struct cpumask __cpu_online_mask __read_mostly;
EXPORT_SYMBOL(__cpu_online_mask);

/* Present-CPU mask; overwritten wholesale by init_cpu_present(). */
struct cpumask __cpu_present_mask __read_mostly;
EXPORT_SYMBOL(__cpu_present_mask);

/* Active-CPU mask. */
struct cpumask __cpu_active_mask __read_mostly;
EXPORT_SYMBOL(__cpu_active_mask);

/* Dying-CPU mask. */
struct cpumask __cpu_dying_mask __read_mostly;
EXPORT_SYMBOL(__cpu_dying_mask);

/*
 * Count of online CPUs; set_cpu_online() increments or decrements it
 * whenever it actually flips a bit in __cpu_online_mask.
 */
atomic_t __num_online_cpus __read_mostly;
EXPORT_SYMBOL(__num_online_cpus);

/* Overwrite __cpu_present_mask with a copy of @src. */
void init_cpu_present(const struct cpumask *src)
{
        cpumask_copy(&__cpu_present_mask, src);
}

/* Overwrite __cpu_possible_mask with a copy of @src. */
void init_cpu_possible(const struct cpumask *src)
{
        cpumask_copy(&__cpu_possible_mask, src);
}

/*
 * Overwrite __cpu_online_mask with a copy of @src.
 *
 * NOTE(review): only the mask is copied here; __num_online_cpus is not
 * adjusted to match - confirm callers account for that.
 */
void init_cpu_online(const struct cpumask *src)
{
        cpumask_copy(&__cpu_online_mask, src);
}

/*
 * Set or clear @cpu in __cpu_online_mask and keep __num_online_cpus in
 * step: the counter only moves when the bit actually changes state.
 */
void set_cpu_online(unsigned int cpu, bool online)
{
        /*
         * The counter is updated with atomic_inc/dec() because the reboot
         * and kexec code abuse this function by calling it from IPI/NMI
         * broadcasts while shutting down CPUs. Calls made by regular CPU
         * hotplug are properly serialized.
         *
         * Note that __num_online_cpus being an atomic_t does nothing for
         * readers which are not serialized against concurrent hotplug
         * operations.
         */
        if (!online) {
                if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
                        atomic_dec(&__num_online_cpus);
                return;
        }

        if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
                atomic_inc(&__num_online_cpus);
}

/*
 * Activate the first processor.
 *
 * Marks the CPU this runs on (as reported by smp_processor_id()) online,
 * active, present and possible. On CONFIG_SMP builds its id is also
 * recorded in __boot_cpu_id.
 */
void __init boot_cpu_init(void)
{
        int cpu = smp_processor_id();

        /* Mark the boot cpu "present", "online" etc for SMP and UP case */
        set_cpu_online(cpu, true);
        set_cpu_active(cpu, true);
        set_cpu_present(cpu, true);
        set_cpu_possible(cpu, true);

#ifdef CONFIG_SMP
        __boot_cpu_id = cpu;
#endif
}

/*
 * Must be called _AFTER_ setting up the per_cpu areas
 *
 * Sets the calling CPU's cpuhp_state.state and cpuhp_state.target to
 * CPUHP_ONLINE through this_cpu_write(). On CONFIG_SMP builds it also sets
 * the CPU's bit in cpus_booted_once_mask and its ap_sync_state to
 * SYNC_STATE_ONLINE.
 */
void __init boot_cpu_hotplug_init(void)
{
#ifdef CONFIG_SMP
        cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
        atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE);
#endif
        this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
        this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
}

#ifdef CONFIG_CPU_MITIGATIONS
/*
 * These are used for a global "mitigations=" cmdline option for toggling
 * optional CPU mitigations.
 */
enum cpu_mitigations {
        /* Selected by "mitigations=off". */
        CPU_MITIGATIONS_OFF,
        /* Selected by "mitigations=auto". */
        CPU_MITIGATIONS_AUTO,
        /* Selected by "mitigations=auto,nosmt". */
        CPU_MITIGATIONS_AUTO_NOSMT,
};

/* Current mode; stays CPU_MITIGATIONS_AUTO unless the cmdline changes it. */
static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;

/*
 * Parse the "mitigations=" early parameter.
 *
 * @arg: text following "mitigations=", or NULL when the parameter was
 *       given without a value.
 *
 * Recognised values are "off", "auto" and "auto,nosmt"; each selects the
 * matching cpu_mitigations mode. Anything else, including a missing value,
 * leaves the current mode untouched and is reported via pr_crit().
 *
 * Always returns 0.
 */
static int __init mitigations_parse_cmdline(char *arg)
{
        /*
         * A bare "mitigations" (no '=') reaches early parameter handlers
         * with a NULL value; bail out before strcmp() dereferences it.
         */
        if (!arg) {
                pr_crit("Missing value for mitigations=, system may still be vulnerable\n");
                return 0;
        }

        if (!strcmp(arg, "off"))
                cpu_mitigations = CPU_MITIGATIONS_OFF;
        else if (!strcmp(arg, "auto"))
                cpu_mitigations = CPU_MITIGATIONS_AUTO;
        else if (!strcmp(arg, "auto,nosmt"))
                cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
        else
                pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
                        arg);

        return 0;
}

/*
 * mitigations=off
 *
 * Returns true when the current mode is CPU_MITIGATIONS_OFF.
 */
bool cpu_mitigations_off(void)
{
        return cpu_mitigations == CPU_MITIGATIONS_OFF;
}
EXPORT_SYMBOL_GPL(cpu_mitigations_off);

/*
 * mitigations=auto,nosmt
 *
 * Returns true when the current mode is CPU_MITIGATIONS_AUTO_NOSMT.
 */
bool cpu_mitigations_auto_nosmt(void)
{
        return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
}
EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
#else
/*
 * Stub used when CONFIG_CPU_MITIGATIONS is not set: @arg is ignored and a
 * critical message is logged instead. Always returns 0.
 */
static int __init mitigations_parse_cmdline(char *arg)
{
        pr_crit("Kernel compiled without mitigations, ignoring 'mitigations'; system may still be vulnerable\n");
        return 0;
}
#endif
early_param("mitigations", mitigations_parse_cmdline);




















































































































































































































































































































































































































    3 
    3 




    3 
    3 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 












    2 


    3 


    3 








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 
    3 

    3 






















    3 











    3 








    3 




    3 


























    3 









    3 














    3 


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
// SPDX-License-Identifier: GPL-2.0+
/*
 *  Base port operations for 8250/16550-type serial ports
 *
 *  Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
 *  Split from 8250_core.c, Copyright (C) 2001 Russell King.
 *
 * A note about mapbase / membase
 *
 *  mapbase is the physical address of the IO port.
 *  membase is an 'ioremapped' cookie.
 */

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/console.h>
#include <linux/gpio/consumer.h>
#include <linux/sysrq.h>
#include <linux/delay.h>
#include <linux/platform_device.h>
#include <linux/tty.h>
#include <linux/ratelimit.h>
#include <linux/tty_flip.h>
#include <linux/serial.h>
#include <linux/serial_8250.h>
#include <linux/nmi.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
#include <linux/ktime.h>

#include <asm/io.h>
#include <asm/irq.h>

#include "8250.h"

/*
 * Debugging.
 *
 * DEBUG_AUTOCONF() takes printk-style arguments. It is compiled out (an
 * empty statement) by default; change the "#if 0" below to "#if 1" to
 * forward the arguments to printk().
 */
#if 0
#define DEBUG_AUTOCONF(fmt...)        printk(fmt)
#else
#define DEBUG_AUTOCONF(fmt...)        do { } while (0)
#endif

/*
 * Here we define the default xmit fifo size used for each type of UART.
 */
static const struct serial8250_config uart_config[] = {
        [PORT_UNKNOWN] = {
                .name                = "unknown",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
        },
        [PORT_8250] = {
                .name                = "8250",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
        },
        [PORT_16450] = {
                .name                = "16450",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
        },
        [PORT_16550] = {
                .name                = "16550",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
        },
        [PORT_16550A] = {
                .name                = "16550A",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 4, 8, 14},
                .flags                = UART_CAP_FIFO,
        },
        [PORT_CIRRUS] = {
                .name                = "Cirrus",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
        },
        [PORT_16650] = {
                .name                = "ST16650",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
                .flags                = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
        },
        [PORT_16650V2] = {
                .name                = "ST16650V2",
                .fifo_size        = 32,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 |
                                  UART_FCR_T_TRIG_00,
                .rxtrig_bytes        = {8, 16, 24, 28},
                .flags                = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
        },
        [PORT_16750] = {
                .name                = "TI16750",
                .fifo_size        = 64,
                .tx_loadsz        = 64,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 |
                                  UART_FCR7_64BYTE,
                .rxtrig_bytes        = {1, 16, 32, 56},
                .flags                = UART_CAP_FIFO | UART_CAP_SLEEP | UART_CAP_AFE,
        },
        [PORT_STARTECH] = {
                .name                = "Startech",
                .fifo_size        = 1,
                .tx_loadsz        = 1,
        },
        [PORT_16C950] = {
                .name                = "16C950/954",
                .fifo_size        = 128,
                .tx_loadsz        = 128,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01,
                .rxtrig_bytes        = {16, 32, 112, 120},
                /* UART_CAP_EFR breaks Billionton CF bluetooth card. */
                .flags                = UART_CAP_FIFO | UART_CAP_SLEEP,
        },
        [PORT_16654] = {
                .name                = "ST16654",
                .fifo_size        = 64,
                .tx_loadsz        = 32,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 |
                                  UART_FCR_T_TRIG_10,
                .rxtrig_bytes        = {8, 16, 56, 60},
                .flags                = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
        },
        [PORT_16850] = {
                .name                = "XR16850",
                .fifo_size        = 128,
                .tx_loadsz        = 128,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags                = UART_CAP_FIFO | UART_CAP_EFR | UART_CAP_SLEEP,
        },
        [PORT_RSA] = {
                .name                = "RSA",
                .fifo_size        = 2048,
                .tx_loadsz        = 2048,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11,
                .flags                = UART_CAP_FIFO,
        },
        [PORT_NS16550A] = {
                .name                = "NS16550A",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags                = UART_CAP_FIFO | UART_NATSEMI,
        },
        [PORT_XSCALE] = {
                .name                = "XScale",
                .fifo_size        = 32,
                .tx_loadsz        = 32,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags                = UART_CAP_FIFO | UART_CAP_UUE | UART_CAP_RTOIE,
        },
        [PORT_OCTEON] = {
                .name                = "OCTEON",
                .fifo_size        = 64,
                .tx_loadsz        = 64,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags                = UART_CAP_FIFO,
        },
        [PORT_U6_16550A] = {
                .name                = "U6_16550A",
                .fifo_size        = 64,
                .tx_loadsz        = 64,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags                = UART_CAP_FIFO | UART_CAP_AFE,
        },
        [PORT_TEGRA] = {
                .name                = "Tegra",
                .fifo_size        = 32,
                .tx_loadsz        = 8,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01 |
                                  UART_FCR_T_TRIG_01,
                .rxtrig_bytes        = {1, 4, 8, 14},
                .flags                = UART_CAP_FIFO | UART_CAP_RTOIE,
        },
        [PORT_XR17D15X] = {
                .name                = "XR17D15X",
                .fifo_size        = 64,
                .tx_loadsz        = 64,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags                = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR |
                                  UART_CAP_SLEEP,
        },
        [PORT_XR17V35X] = {
                .name                = "XR17V35X",
                .fifo_size        = 256,
                .tx_loadsz        = 256,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_11 |
                                  UART_FCR_T_TRIG_11,
                .flags                = UART_CAP_FIFO | UART_CAP_AFE | UART_CAP_EFR |
                                  UART_CAP_SLEEP,
        },
        [PORT_LPC3220] = {
                .name                = "LPC3220",
                .fifo_size        = 64,
                .tx_loadsz        = 32,
                .fcr                = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO |
                                  UART_FCR_R_TRIG_00 | UART_FCR_T_TRIG_00,
                .flags                = UART_CAP_FIFO,
        },
        [PORT_BRCM_TRUMANAGE] = {
                .name                = "TruManage",
                .fifo_size        = 1,
                .tx_loadsz        = 1024,
                .flags                = UART_CAP_HFIFO,
        },
        [PORT_8250_CIR] = {
                .name                = "CIR port"
        },
        [PORT_ALTR_16550_F32] = {
                .name                = "Altera 16550 FIFO32",
                .fifo_size        = 32,
                .tx_loadsz        = 32,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 8, 16, 30},
                .flags                = UART_CAP_FIFO | UART_CAP_AFE,
        },
        [PORT_ALTR_16550_F64] = {
                .name                = "Altera 16550 FIFO64",
                .fifo_size        = 64,
                .tx_loadsz        = 64,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 16, 32, 62},
                .flags                = UART_CAP_FIFO | UART_CAP_AFE,
        },
        [PORT_ALTR_16550_F128] = {
                .name                = "Altera 16550 FIFO128",
                .fifo_size        = 128,
                .tx_loadsz        = 128,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 32, 64, 126},
                .flags                = UART_CAP_FIFO | UART_CAP_AFE,
        },
        /*
         * tx_loadsz is set to 63-bytes instead of 64-bytes to implement
         * workaround of errata A-008006 which states that tx_loadsz should
         * be configured less than Maximum supported fifo bytes.
         */
        [PORT_16550A_FSL64] = {
                .name                = "16550A_FSL64",
                .fifo_size        = 64,
                .tx_loadsz        = 63,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 |
                                  UART_FCR7_64BYTE,
                .flags                = UART_CAP_FIFO | UART_CAP_NOTEMT,
        },
        [PORT_RT2880] = {
                .name                = "Palmchip BK-3103",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 4, 8, 14},
                .flags                = UART_CAP_FIFO,
        },
        [PORT_DA830] = {
                .name                = "TI DA8xx/66AK2x",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_DMA_SELECT | UART_FCR_ENABLE_FIFO |
                                  UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 4, 8, 14},
                .flags                = UART_CAP_FIFO | UART_CAP_AFE,
        },
        [PORT_MTK_BTIF] = {
                .name                = "MediaTek BTIF",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO |
                                  UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT,
                .flags                = UART_CAP_FIFO,
        },
        [PORT_NPCM] = {
                .name                = "Nuvoton 16550",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10 |
                                  UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT,
                .rxtrig_bytes        = {1, 4, 8, 14},
                .flags                = UART_CAP_FIFO,
        },
        [PORT_SUNIX] = {
                .name                = "Sunix",
                .fifo_size        = 128,
                .tx_loadsz        = 128,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .rxtrig_bytes        = {1, 32, 64, 112},
                .flags                = UART_CAP_FIFO | UART_CAP_SLEEP,
        },
        [PORT_ASPEED_VUART] = {
                .name                = "ASPEED VUART",
                .fifo_size        = 16,
                .tx_loadsz        = 16,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00,
                .rxtrig_bytes        = {1, 4, 8, 14},
                .flags                = UART_CAP_FIFO,
        },
        [PORT_MCHP16550A] = {
                .name           = "MCHP16550A",
                .fifo_size      = 256,
                .tx_loadsz      = 256,
                .fcr            = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01,
                .rxtrig_bytes   = {2, 66, 130, 194},
                .flags          = UART_CAP_FIFO,
        },
        [PORT_BCM7271] = {
                .name                = "Broadcom BCM7271 UART",
                .fifo_size        = 32,
                .tx_loadsz        = 32,
                .fcr                = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_01,
                .rxtrig_bytes        = {1, 8, 16, 30},
                .flags                = UART_CAP_FIFO | UART_CAP_AFE,
        },
};

/* Uart divisor latch read */
static u32 default_serial_dl_read(struct uart_8250_port *up)
{
        u32 low, high;

        /* Read DLL first, then DLM; keep only bits 0-7 of each value. */
        low = serial_in(up, UART_DLL) & 0xff;
        high = serial_in(up, UART_DLM) & 0xff;

        return (high << 8) | low;
}

/* Uart divisor latch write */
static void default_serial_dl_write(struct uart_8250_port *up, u32 value)
{
        const u32 low = value & 0xff;
        const u32 high = (value >> 8) & 0xff;

        /* Low byte goes to DLL, the next byte to DLM, in that order. */
        serial_out(up, UART_DLL, low);
        serial_out(up, UART_DLM, high);
}

static unsigned int hub6_serial_in(struct uart_port *p, int offset)
{
        const int reg = offset << p->regshift;

        /* Write the selector to the base port, then read from base + 1. */
        outb(p->hub6 - 1 + reg, p->iobase);
        return inb(p->iobase + 1);
}

static void hub6_serial_out(struct uart_port *p, int offset, int value)
{
        const int reg = offset << p->regshift;

        /* Write the selector to the base port, then the value to base + 1. */
        outb(p->hub6 - 1 + reg, p->iobase);
        outb(value, p->iobase + 1);
}

static unsigned int mem_serial_in(struct uart_port *p, int offset)
{
        /* 8-bit read from membase; the register index is scaled by regshift. */
        return readb(p->membase + (offset << p->regshift));
}

static void mem_serial_out(struct uart_port *p, int offset, int value)
{
        /* 8-bit write to membase; the register index is scaled by regshift. */
        writeb(value, p->membase + (offset << p->regshift));
}

static void mem16_serial_out(struct uart_port *p, int offset, int value)
{
        /* 16-bit write to membase; the register index is scaled by regshift. */
        writew(value, p->membase + (offset << p->regshift));
}

static unsigned int mem16_serial_in(struct uart_port *p, int offset)
{
        /* 16-bit read from membase; the register index is scaled by regshift. */
        return readw(p->membase + (offset << p->regshift));
}

static void mem32_serial_out(struct uart_port *p, int offset, int value)
{
        /* 32-bit write to membase; the register index is scaled by regshift. */
        writel(value, p->membase + (offset << p->regshift));
}

static unsigned int mem32_serial_in(struct uart_port *p, int offset)
{
        /* 32-bit read from membase; the register index is scaled by regshift. */
        return readl(p->membase + (offset << p->regshift));
}

static void mem32be_serial_out(struct uart_port *p, int offset, int value)
{
        /* 32-bit big-endian write; the register index is scaled by regshift. */
        iowrite32be(value, p->membase + (offset << p->regshift));
}

static unsigned int mem32be_serial_in(struct uart_port *p, int offset)
{
        /* 32-bit big-endian read; the register index is scaled by regshift. */
        return ioread32be(p->membase + (offset << p->regshift));
}

static unsigned int io_serial_in(struct uart_port *p, int offset)
{
        /* Port I/O read; the register index is scaled by regshift. */
        return inb(p->iobase + (offset << p->regshift));
}

static void io_serial_out(struct uart_port *p, int offset, int value)
{
        /* Port I/O write; the register index is scaled by regshift. */
        outb(value, p->iobase + (offset << p->regshift));
}

static int serial8250_default_handle_irq(struct uart_port *port);

static void set_io_from_upio(struct uart_port *p)
{
        struct uart_8250_port *up = up_to_u8250p(p);
        unsigned int (*reg_in)(struct uart_port *, int);
        void (*reg_out)(struct uart_port *, int, int);

        /* Pick the register accessor pair matching the port's I/O type. */
        switch (p->iotype) {
        case UPIO_HUB6:
                reg_in = hub6_serial_in;
                reg_out = hub6_serial_out;
                break;
        case UPIO_MEM:
                reg_in = mem_serial_in;
                reg_out = mem_serial_out;
                break;
        case UPIO_MEM16:
                reg_in = mem16_serial_in;
                reg_out = mem16_serial_out;
                break;
        case UPIO_MEM32:
                reg_in = mem32_serial_in;
                reg_out = mem32_serial_out;
                break;
        case UPIO_MEM32BE:
                reg_in = mem32be_serial_in;
                reg_out = mem32be_serial_out;
                break;
        default:
                /* Anything else falls back to plain port I/O. */
                reg_in = io_serial_in;
                reg_out = io_serial_out;
                break;
        }

        p->serial_in = reg_in;
        p->serial_out = reg_out;
        p->handle_irq = serial8250_default_handle_irq;

        /* Divisor latch access always starts out with the default helpers. */
        up->dl_read = default_serial_dl_read;
        up->dl_write = default_serial_dl_write;

        /* Remember loaded iotype */
        up->cur_iotype = p->iotype;
}

static void
serial_port_out_sync(struct uart_port *p, int offset, int value)
{
        /* The register write itself happens for every I/O type. */
        p->serial_out(p, offset, value);

        switch (p->iotype) {
        case UPIO_MEM:
        case UPIO_MEM16:
        case UPIO_MEM32:
        case UPIO_MEM32BE:
        case UPIO_AU:
                /*
                 * Follow the write with a read of LCR (safe, no
                 * side-effects); presumably this flushes the posted write.
                 */
                p->serial_in(p, UART_LCR);
                break;
        default:
                break;
        }
}

/*
 * FIFO support.
 */
static void serial8250_clear_fifos(struct uart_8250_port *p)
{
        /* Ports without the FIFO capability are left untouched. */
        if (!(p->capabilities & UART_CAP_FIFO))
                return;

        /* Enable the FIFOs, clear both directions, then write FCR as 0. */
        serial_out(p, UART_FCR, UART_FCR_ENABLE_FIFO);
        serial_out(p, UART_FCR,
                   UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT);
        serial_out(p, UART_FCR, 0);
}

static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t);
static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t);

/*
 * Clear the FIFOs via serial8250_clear_fifos(), then write the value
 * stored in p->fcr back to the FCR register.
 */
void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p)
{
        serial8250_clear_fifos(p);
        /* Written unconditionally, even if the port lacks UART_CAP_FIFO. */
        serial_out(p, UART_FCR, p->fcr);
}
EXPORT_SYMBOL_GPL(serial8250_clear_and_reinit_fifos);

void serial8250_rpm_get(struct uart_8250_port *p)
{
        /* Runtime PM is only touched on ports that set UART_CAP_RPM. */
        if (p->capabilities & UART_CAP_RPM)
                pm_runtime_get_sync(p->port.dev);
}
EXPORT_SYMBOL_GPL(serial8250_rpm_get);

void serial8250_rpm_put(struct uart_8250_port *p)
{
        /* Runtime PM is only touched on ports that set UART_CAP_RPM. */
        if (p->capabilities & UART_CAP_RPM) {
                pm_runtime_mark_last_busy(p->port.dev);
                pm_runtime_put_autosuspend(p->port.dev);
        }
}
EXPORT_SYMBOL_GPL(serial8250_rpm_put);

/**
 *        serial8250_em485_init() - put uart_8250_port into rs485 emulating
 *        @p:        uart_8250_port port instance
 *
 *        The function is used to start rs485 software emulating on the
 *        &struct uart_8250_port* @p. Namely, RTS is switched before/after
 *        transmission. The function is idempotent, so it is safe to call it
 *        multiple times.
 *
 *        The caller MUST enable interrupt on empty shift register before
 *        calling serial8250_em485_init(). This interrupt is not a part of
 *        8250 standard, but implementation defined.
 *
 *        The function is supposed to be called from .rs485_config callback
 *        or from any other callback protected with p->port.lock spinlock.
 *
 *        See also serial8250_em485_destroy()
 *
 *        Return 0 - success, -errno - otherwise
 */
static int serial8250_em485_init(struct uart_8250_port *p)
{
        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&p->port.lock);

        if (p->em485)
                goto deassert_rts;

        p->em485 = kmalloc(sizeof(struct uart_8250_em485), GFP_ATOMIC);
        if (!p->em485)
                return -ENOMEM;

        hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC,
                     HRTIMER_MODE_REL);
        hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC,
                     HRTIMER_MODE_REL);
        p->em485->stop_tx_timer.function = &serial8250_em485_handle_stop_tx;
        p->em485->start_tx_timer.function = &serial8250_em485_handle_start_tx;
        p->em485->port = p;
        p->em485->active_timer = NULL;
        p->em485->tx_stopped = true;

deassert_rts:
        if (p->em485->tx_stopped)
                p->rs485_stop_tx(p);

        return 0;
}

/**
 *        serial8250_em485_destroy() - put uart_8250_port into normal state
 *        @p:        uart_8250_port port instance
 *
 *        The function is used to stop rs485 software emulating on the
 *        &struct uart_8250_port* @p. The function is idempotent, so it is safe to
 *        call it multiple times.
 *
 *        The function is supposed to be called from .rs485_config callback
 *        or from any other callback protected with p->port.lock spinlock.
 *
 *        See also serial8250_em485_init()
 */
void serial8250_em485_destroy(struct uart_8250_port *p)
{
        if (!p->em485)
                return;

        hrtimer_cancel(&p->em485->start_tx_timer);
        hrtimer_cancel(&p->em485->stop_tx_timer);

        kfree(p->em485);
        p->em485 = NULL;
}
EXPORT_SYMBOL_GPL(serial8250_em485_destroy);

/*
 * RS485 settings exported for 8250 drivers that use the software em485
 * emulation.
 * NOTE(review): the value 1 in the two delay fields presumably marks the
 * field as supported rather than requesting a delay of 1 - confirm
 * against the serial core.
 */
struct serial_rs485 serial8250_em485_supported = {
        .flags = SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | SER_RS485_RTS_AFTER_SEND |
                 SER_RS485_TERMINATE_BUS | SER_RS485_RX_DURING_TX,
        .delay_rts_before_send = 1,
        .delay_rts_after_send = 1,
};
EXPORT_SYMBOL_GPL(serial8250_em485_supported);

/**
 * serial8250_em485_config() - generic ->rs485_config() callback
 * @port: uart port
 * @termios: termios structure
 * @rs485: rs485 settings
 *
 * Generic callback usable by 8250 uart drivers to activate rs485 settings
 * if the uart is incapable of driving RTS as a Transmit Enable signal in
 * hardware, relying on software emulation instead.
 */
int serial8250_em485_config(struct uart_port *port, struct ktermios *termios,
                            struct serial_rs485 *rs485)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        /*
         * serial8250_em485_init() and serial8250_em485_destroy() are both
         * idempotent, so no check of the current state is needed here.
         */
        if (!(rs485->flags & SER_RS485_ENABLED)) {
                serial8250_em485_destroy(up);
                return 0;
        }

        return serial8250_em485_init(up);
}
EXPORT_SYMBOL_GPL(serial8250_em485_config);

/*
 * These two wrappers ensure that serial8250_rpm_get_tx() can be called more
 * than once and serial8250_rpm_put_tx() will still disable RPM because the
 * fifo is empty and the HW can idle again.
 */
void serial8250_rpm_get_tx(struct uart_8250_port *p)
{
        if (!(p->capabilities & UART_CAP_RPM))
                return;

        /*
         * Set the TX-active flag; only the call that flips it from clear
         * to set takes the runtime PM reference.
         */
        if (xchg(&p->rpm_tx_active, 1))
                return;

        pm_runtime_get_sync(p->port.dev);
}
EXPORT_SYMBOL_GPL(serial8250_rpm_get_tx);

void serial8250_rpm_put_tx(struct uart_8250_port *p)
{
        if (!(p->capabilities & UART_CAP_RPM))
                return;

        /*
         * Clear the TX-active flag; only the call that flips it from set
         * to clear drops the runtime PM reference.
         */
        if (!xchg(&p->rpm_tx_active, 0))
                return;

        pm_runtime_mark_last_busy(p->port.dev);
        pm_runtime_put_autosuspend(p->port.dev);
}
EXPORT_SYMBOL_GPL(serial8250_rpm_put_tx);

/*
 * IER sleep support.  UARTs which have EFRs need the "extended
 * capability" bit enabled.  Note that on XR16C850s, we need to
 * reset LCR to write to IER.
 */
static void serial8250_set_sleep(struct uart_8250_port *p, int sleep)
{
        unsigned char saved_lcr = 0, saved_efr = 0;
        unsigned int has_efr;

        serial8250_rpm_get(p);

        if (!(p->capabilities & UART_CAP_SLEEP))
                goto out;

        /* Synchronize UART_IER access against the console. */
        uart_port_lock_irq(&p->port);

        has_efr = p->capabilities & UART_CAP_EFR;
        if (has_efr) {
                /* Save LCR/EFR, set EFR to ECB, and zero LCR for the IER write. */
                saved_lcr = serial_in(p, UART_LCR);
                saved_efr = serial_in(p, UART_EFR);
                serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
                serial_out(p, UART_EFR, UART_EFR_ECB);
                serial_out(p, UART_LCR, 0);
        }

        serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0);

        if (has_efr) {
                /* Put EFR and LCR back the way they were found. */
                serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
                serial_out(p, UART_EFR, saved_efr);
                serial_out(p, UART_LCR, saved_lcr);
        }

        uart_port_unlock_irq(&p->port);

out:
        serial8250_rpm_put(p);
}

static void serial8250_clear_IER(struct uart_8250_port *up)
{
        /* Ports with UART_CAP_UUE keep the UUE bit set; all others write 0. */
        serial_out(up, UART_IER,
                   (up->capabilities & UART_CAP_UUE) ? UART_IER_UUE : 0);
}

#ifdef CONFIG_SERIAL_8250_RSA
/*
 * Attempts to turn on the RSA FIFO.  Returns zero on failure.
 * We set the port uart clock rate if we succeed.
 */
static int __enable_rsa(struct uart_8250_port *up)
{
        unsigned char msr = serial_in(up, UART_RSA_MSR);
        int fifo_on = msr & UART_RSA_MSR_FIFO;

        if (!fifo_on) {
                /* FIFO bit is clear: try to set it and read back the result. */
                serial_out(up, UART_RSA_MSR, msr | UART_RSA_MSR_FIFO);
                msr = serial_in(up, UART_RSA_MSR);
                fifo_on = msr & UART_RSA_MSR_FIFO;
        }

        if (!fifo_on)
                return 0;

        /* FIFO bit is set, so switch the port to the RSA clock rate. */
        up->port.uartclk = SERIAL_RSA_BAUD_BASE * 16;
        return fifo_on;
}

static void enable_rsa(struct uart_8250_port *up)
{
        if (up->port.type != PORT_RSA)
                return;

        /* Only attempt the switch when not already on the RSA clock. */
        if (up->port.uartclk != SERIAL_RSA_BAUD_BASE * 16) {
                uart_port_lock_irq(&up->port);
                __enable_rsa(up);
                uart_port_unlock_irq(&up->port);
        }

        /* Write 0 to FRR once the port is (now) on the RSA clock. */
        if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16)
                serial_out(up, UART_RSA_FRR, 0);
}

/*
 * Attempts to turn off the RSA FIFO.  Nothing is returned; on success
 * the port clock is switched back to SERIAL_RSA_BAUD_BASE_LO * 16.
 * It is unknown why interrupts were disabled in here.  However,
 * this behaviour is preserved by taking the port lock around the
 * register accesses inside this function.
 */
static void disable_rsa(struct uart_8250_port *up)
{
        unsigned char msr;

        /* Only relevant for an RSA port currently running on the RSA clock. */
        if (up->port.type != PORT_RSA ||
            up->port.uartclk != SERIAL_RSA_BAUD_BASE * 16)
                return;

        uart_port_lock_irq(&up->port);

        msr = serial_in(up, UART_RSA_MSR);
        if (msr & UART_RSA_MSR_FIFO) {
                /* FIFO bit still set: clear it and read back the result. */
                serial_out(up, UART_RSA_MSR, msr & ~UART_RSA_MSR_FIFO);
                msr = serial_in(up, UART_RSA_MSR);
        }

        /* Drop to the low baud base only once the FIFO bit reads clear. */
        if (!(msr & UART_RSA_MSR_FIFO))
                up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16;

        uart_port_unlock_irq(&up->port);
}
#endif /* CONFIG_SERIAL_8250_RSA */

/*
 * This is a quickie test to see how big the FIFO is.
 * It doesn't work all the time, more's the pity.
 */
static int size_fifo(struct uart_8250_port *up)
{
        unsigned char saved_lcr, saved_fcr, saved_mcr;
        u32 saved_dl;
        int i, received;

        /* Save LCR/FCR/MCR, then enable and clear the FIFOs in loop mode. */
        saved_lcr = serial_in(up, UART_LCR);
        serial_out(up, UART_LCR, 0);
        saved_fcr = serial_in(up, UART_FCR);
        saved_mcr = serial8250_in_MCR(up);
        serial_out(up, UART_FCR,
                   UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT);
        serial8250_out_MCR(up, UART_MCR_LOOP);

        /* Save the divisor and program a divisor of 1 for the test. */
        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
        saved_dl = serial_dl_read(up);
        serial_dl_write(up, 0x0001);
        serial_out(up, UART_LCR, UART_LCR_WLEN8);

        /* Push 256 bytes into the transmitter and give them time to move. */
        for (i = 0; i < 256; i++)
                serial_out(up, UART_TX, i);
        mdelay(20);/* FIXME - schedule_timeout */

        /* Count how many bytes can be read back while LSR reports data. */
        received = 0;
        while ((serial_in(up, UART_LSR) & UART_LSR_DR) && received < 256) {
                serial_in(up, UART_RX);
                received++;
        }

        /* Restore FCR, MCR, the divisor and finally LCR. */
        serial_out(up, UART_FCR, saved_fcr);
        serial8250_out_MCR(up, saved_mcr);
        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
        serial_dl_write(up, saved_dl);
        serial_out(up, UART_LCR, saved_lcr);

        return received;
}

/*
 * Read UART ID using the divisor method - set DLL and DLM to zero
 * and the revision will be in DLL and device type in DLM.  We
 * preserve the device state across this.
 */
static unsigned int autoconfig_read_divisor_id(struct uart_8250_port *p)
{
        unsigned int saved_dl, chip_id;
        unsigned char saved_lcr = serial_in(p, UART_LCR);

        serial_out(p, UART_LCR, UART_LCR_CONF_MODE_A);

        /* Zero the divisor, read back what the chip reports, then restore. */
        saved_dl = serial_dl_read(p);
        serial_dl_write(p, 0);
        chip_id = serial_dl_read(p);
        serial_dl_write(p, saved_dl);

        serial_out(p, UART_LCR, saved_lcr);

        return chip_id;
}

/*
 * This is a helper routine to autodetect StarTech/Exar/Oxsemi UART's.
 * When this function is called we know it is at least a StarTech
 * 16650 V2, but it might be one of several StarTech UARTs, or one of
 * its clones.  (We treat the broken original StarTech 16650 V1 as a
 * 16550, and why not?  Startech doesn't seem to even acknowledge its
 * existence.)
 *
 * What evil have men's minds wrought...
 */
static void autoconfig_has_efr(struct uart_8250_port *up)
{
        unsigned int id1, id2, id3, rev;

        /*
         * Everything with an EFR has SLEEP
         */
        up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP;

        /*
         * First we check to see if it's an Oxford Semiconductor UART.
         *
         * We have to do this here because some non-National
         * Semiconductor clone chips lock up if you try writing to the
         * LSR register (which serial_icr_read does)
         */

        /*
         * Check for Oxford Semiconductor 16C950.
         *
         * EFR [4] must be set else this test fails.
         *
         * This shouldn't be necessary, but Mike Hudson (Exoray@isys.ca)
         * claims that it's needed for 952 dual UART's (which are not
         * recommended for new designs).
         */
        up->acr = 0;
        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
        serial_out(up, UART_EFR, UART_EFR_ECB);
        serial_out(up, UART_LCR, 0x00);
        /* Read the three ID bytes and the revision through the ICR. */
        id1 = serial_icr_read(up, UART_ID1);
        id2 = serial_icr_read(up, UART_ID2);
        id3 = serial_icr_read(up, UART_ID3);
        rev = serial_icr_read(up, UART_REV);

        DEBUG_AUTOCONF("950id=%02x:%02x:%02x:%02x ", id1, id2, id3, rev);

        /* ID 16:C9:50, 16:C9:52 or 16:C9:54 is treated as a 16C950. */
        if (id1 == 0x16 && id2 == 0xC9 &&
            (id3 == 0x50 || id3 == 0x52 || id3 == 0x54)) {
                up->port.type = PORT_16C950;

                /*
                 * Enable work around for the Oxford Semiconductor 952 rev B
                 * chip which causes it to seriously miscalculate baud rates
                 * when DLL is 0.
                 */
                if (id3 == 0x52 && rev == 0x01)
                        up->bugs |= UART_BUG_QUOT;
                return;
        }

        /*
         * We check for a XR16C850 by setting DLL and DLM to 0, and then
         * reading back DLL and DLM.  The chip type depends on the DLM
         * value read back:
         *  0x10 - XR16C850 and the DLL contains the chip revision.
         *  0x12 - XR16C2850.
         *  0x14 - XR16C854.
         */
        id1 = autoconfig_read_divisor_id(up);
        DEBUG_AUTOCONF("850id=%04x ", id1);

        /* The type byte is compared against bits 8 and up of the ID. */
        id2 = id1 >> 8;
        if (id2 == 0x10 || id2 == 0x12 || id2 == 0x14) {
                up->port.type = PORT_16850;
                return;
        }

        /*
         * It wasn't an XR16C850.
         *
         * We distinguish between the '654 and the '650 by counting
         * how many bytes are in the FIFO.  I'm using this for now,
         * since that's the technique that was sent to me in the
         * serial driver update, but I'm not convinced this works.
         * I've had problems doing this in the past.  -TYT
         */
        if (size_fifo(up) == 64)
                up->port.type = PORT_16654;
        else
                up->port.type = PORT_16650V2;
}

/*
 * We detected a chip without a FIFO.  Only two fall into
 * this category - the original 8250 and the 16450.  The
 * 16450 has a scratch register (accessible with LCR=0)
 */
static void autoconfig_8250(struct uart_8250_port *up)
{
        unsigned char saved_scr, echo_a5, echo_5a;

        /* Assume a plain 8250 until the scratch register test says otherwise. */
        up->port.type = PORT_8250;

        /* Write two test patterns to SCR, read each back, then restore SCR. */
        saved_scr = serial_in(up, UART_SCR);
        serial_out(up, UART_SCR, 0xa5);
        echo_a5 = serial_in(up, UART_SCR);
        serial_out(up, UART_SCR, 0x5a);
        echo_5a = serial_in(up, UART_SCR);
        serial_out(up, UART_SCR, saved_scr);

        /* Both patterns must read back unchanged for a 16450. */
        if (echo_a5 != 0xa5 || echo_5a != 0x5a)
                return;

        up->port.type = PORT_16450;
}

static int broken_efr(struct uart_8250_port *up)
{
        /*
         * Exar ST16C2550 "A2" devices incorrectly detect as
         * having an EFR, and report an ID of 0x0201.  See
         * http://linux.derkeiler.com/Mailing-Lists/Kernel/2004-11/4812.html
         *
         * The FIFO is only sized when the divisor ID already matches.
         */
        return autoconfig_read_divisor_id(up) == 0x0201 &&
               size_fifo(up) == 16;
}

/*
 * We know that the chip has FIFOs.  Does it have an EFR?  The
 * EFR is located in the same register position as the IIR and
 * we know the top two bits of the IIR are currently set.  The
 * EFR should contain zero.  Try to read the EFR.
 */
static void autoconfig_16550a(struct uart_8250_port *up)
{
        unsigned char status1, status2;
        unsigned int iersave;

        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&up->port.lock);

        /* Start from a plain 16550A; the probes below may refine the type. */
        up->port.type = PORT_16550A;
        up->capabilities |= UART_CAP_FIFO;

        /*
         * Variant probing only runs when it is built in or the port
         * explicitly asks for a full probe.
         */
        if (!IS_ENABLED(CONFIG_SERIAL_8250_16550A_VARIANTS) &&
            !(up->port.flags & UPF_FULL_PROBE))
                return;

        /*
         * Check for presence of the EFR when DLAB is set.
         * Only ST16C650V1 UARTs pass this test.
         */
        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
        if (serial_in(up, UART_EFR) == 0) {
                serial_out(up, UART_EFR, 0xA8);
                if (serial_in(up, UART_EFR) != 0) {
                        DEBUG_AUTOCONF("EFRv1 ");
                        up->port.type = PORT_16650;
                        up->capabilities |= UART_CAP_EFR | UART_CAP_SLEEP;
                } else {
                        /*
                         * The written value did not stick: try the 64-byte
                         * FIFO bit instead and see whether the IIR reports
                         * the 16750-style FIFO-enabled pattern.
                         */
                        serial_out(up, UART_LCR, 0);
                        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO |
                                   UART_FCR7_64BYTE);
                        status1 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750;
                        serial_out(up, UART_FCR, 0);
                        serial_out(up, UART_LCR, 0);

                        if (status1 == UART_IIR_FIFO_ENABLED_16750)
                                up->port.type = PORT_16550A_FSL64;
                        else
                                DEBUG_AUTOCONF("Motorola 8xxx DUART ");
                }
                /* Both branches finish by writing 0 to the EFR offset. */
                serial_out(up, UART_EFR, 0);
                return;
        }

        /*
         * Maybe it requires 0xbf to be written to the LCR.
         * (other ST16C650V2 UARTs, TI16C752A, etc)
         */
        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
        if (serial_in(up, UART_EFR) == 0 && !broken_efr(up)) {
                DEBUG_AUTOCONF("EFRv2 ");
                autoconfig_has_efr(up);
                return;
        }

        /*
         * Check for a National Semiconductor SuperIO chip.
         * Attempt to switch to bank 2, read the value of the LOOP bit
         * from EXCR1. Switch back to bank 0, change it in MCR. Then
         * switch back to bank 2, read it from EXCR1 again and check
         * it's changed. If so, set baud_base in EXCR2 to 921600. -- dwmw2
         */
        serial_out(up, UART_LCR, 0);
        status1 = serial8250_in_MCR(up);
        serial_out(up, UART_LCR, 0xE0);
        status2 = serial_in(up, 0x02); /* EXCR1 */

        /* Continue only if the LOOP bit reads the same in MCR and EXCR1. */
        if (!((status2 ^ status1) & UART_MCR_LOOP)) {
                serial_out(up, UART_LCR, 0);
                serial8250_out_MCR(up, status1 ^ UART_MCR_LOOP);
                serial_out(up, UART_LCR, 0xE0);
                status2 = serial_in(up, 0x02); /* EXCR1 */
                serial_out(up, UART_LCR, 0);
                /* Restore the original MCR value before judging the result. */
                serial8250_out_MCR(up, status1);

                if ((status2 ^ status1) & UART_MCR_LOOP) {
                        unsigned short quot;

                        serial_out(up, UART_LCR, 0xE0);

                        /*
                         * Scale the current divisor by 8.
                         * NOTE(review): presumably this keeps the baud rate
                         * unchanged when moving to the 921600 base set
                         * below - confirm.
                         */
                        quot = serial_dl_read(up);
                        quot <<= 3;

                        if (ns16550a_goto_highspeed(up))
                                serial_dl_write(up, quot);

                        serial_out(up, UART_LCR, 0);

                        up->port.uartclk = 921600*16;
                        up->port.type = PORT_NS16550A;
                        up->capabilities |= UART_NATSEMI;
                        return;
                }
        }

        /*
         * No EFR.  Try to detect a TI16750, which only sets bit 5 of
         * the IIR when 64 byte FIFO mode is enabled when DLAB is set.
         * Try setting it with and without DLAB set.  Cheap clones
         * set bit 5 without DLAB set.
         */
        serial_out(up, UART_LCR, 0);
        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE);
        status1 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750;
        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);

        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_A);
        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO | UART_FCR7_64BYTE);
        status2 = serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED_16750;
        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);

        serial_out(up, UART_LCR, 0);

        DEBUG_AUTOCONF("iir1=%d iir2=%d ", status1, status2);

        /* status1 was sampled without DLAB, status2 with DLAB set. */
        if (status1 == UART_IIR_FIFO_ENABLED_16550A &&
            status2 == UART_IIR_FIFO_ENABLED_16750) {
                up->port.type = PORT_16750;
                up->capabilities |= UART_CAP_AFE | UART_CAP_SLEEP;
                return;
        }

        /*
         * Try writing and reading the UART_IER_UUE bit (b6).
         * If it works, this is probably one of the Xscale platform's
         * internal UARTs.
         * We're going to explicitly set the UUE bit to 0 before
         * trying to write and read a 1 just to make sure it's not
         * already a 1 and maybe locked there before we even start.
         */
        iersave = serial_in(up, UART_IER);
        serial_out(up, UART_IER, iersave & ~UART_IER_UUE);
        if (!(serial_in(up, UART_IER) & UART_IER_UUE)) {
                /*
                 * OK it's in a known zero state, try writing and reading
                 * without disturbing the current state of the other bits.
                 */
                serial_out(up, UART_IER, iersave | UART_IER_UUE);
                if (serial_in(up, UART_IER) & UART_IER_UUE) {
                        /*
                         * It's an Xscale.
                         * We'll leave the UART_IER_UUE bit set to 1 (enabled).
                         */
                        DEBUG_AUTOCONF("Xscale ");
                        up->port.type = PORT_XSCALE;
                        up->capabilities |= UART_CAP_UUE | UART_CAP_RTOIE;
                        return;
                }
        } else {
                /*
                 * If we got here we couldn't force the IER_UUE bit to 0.
                 * Log it and continue.
                 */
                DEBUG_AUTOCONF("Couldn't force IER_UUE to 0 ");
        }
        /* Not an Xscale: put the IER back to the value found on entry. */
        serial_out(up, UART_IER, iersave);

        /*
         * We distinguish between 16550A and U6 16550A by counting
         * how many bytes are in the FIFO.
         */
        if (up->port.type == PORT_16550A && size_fifo(up) == 64) {
                up->port.type = PORT_U6_16550A;
                up->capabilities |= UART_CAP_AFE;
        }
}

/*
 * This routine is called by rs_init() to initialize a specific serial
 * port.  It determines what type of UART chip this serial port is
 * using: 8250, 16450, 16550, 16550A.  The important question is
 * whether this UART is a 16550A, since that determines whether we
 * can use its FIFO features.
 */
static void autoconfig(struct uart_8250_port *up)
{
        unsigned char status1, scratch, scratch2, scratch3;
        unsigned char save_lcr, save_mcr;
        struct uart_port *port = &up->port;
        unsigned long flags;
        unsigned int old_capabilities;

        if (!port->iobase && !port->mapbase && !port->membase)
                return;

        DEBUG_AUTOCONF("%s: autoconf (0x%04lx, 0x%p): ",
                       port->name, port->iobase, port->membase);

        /*
         * We really do need global IRQs disabled here - we're going to
         * be frobbing the chips IRQ enable register to see if it exists.
         *
         * Synchronize UART_IER access against the console.
         */
        uart_port_lock_irqsave(port, &flags);

        up->capabilities = 0;
        up->bugs = 0;

        if (!(port->flags & UPF_BUGGY_UART)) {
                /*
                 * Do a simple existence test first; if we fail this,
                 * there's no point trying anything else.
                 *
                 * 0x80 is used as a nonsense port to prevent against
                 * false positives due to ISA bus float.  The
                 * assumption is that 0x80 is a non-existent port;
                 * which should be safe since include/asm/io.h also
                 * makes this assumption.
                 *
                 * Note: this is safe as long as MCR bit 4 is clear
                 * and the device is in "PC" mode.
                 */
                scratch = serial_in(up, UART_IER);
                serial_out(up, UART_IER, 0);
#ifdef __i386__
                outb(0xff, 0x080);
#endif
                /*
                 * Mask out IER[7:4] bits for test as some UARTs (e.g. TL
                 * 16C754B) allow only to modify them if an EFR bit is set.
                 */
                scratch2 = serial_in(up, UART_IER) & UART_IER_ALL_INTR;
                serial_out(up, UART_IER, UART_IER_ALL_INTR);
#ifdef __i386__
                outb(0, 0x080);
#endif
                scratch3 = serial_in(up, UART_IER) & UART_IER_ALL_INTR;
                serial_out(up, UART_IER, scratch);
                if (scratch2 != 0 || scratch3 != UART_IER_ALL_INTR) {
                        /*
                         * We failed; there's nothing here
                         */
                        uart_port_unlock_irqrestore(port, flags);
                        DEBUG_AUTOCONF("IER test failed (%02x, %02x) ",
                                       scratch2, scratch3);
                        goto out;
                }
        }

        save_mcr = serial8250_in_MCR(up);
        save_lcr = serial_in(up, UART_LCR);

        /*
         * Check to see if a UART is really there.  Certain broken
         * internal modems based on the Rockwell chipset fail this
         * test, because they apparently don't implement the loopback
         * test mode.  So this test is skipped on the COM 1 through
         * COM 4 ports.  This *should* be safe, since no board
         * manufacturer would be stupid enough to design a board
         * that conflicts with COM 1-4 --- we hope!
         */
        if (!(port->flags & UPF_SKIP_TEST)) {
                serial8250_out_MCR(up, UART_MCR_LOOP | UART_MCR_OUT2 | UART_MCR_RTS);
                status1 = serial_in(up, UART_MSR) & UART_MSR_STATUS_BITS;
                serial8250_out_MCR(up, save_mcr);
                if (status1 != (UART_MSR_DCD | UART_MSR_CTS)) {
                        uart_port_unlock_irqrestore(port, flags);
                        DEBUG_AUTOCONF("LOOP test failed (%02x) ",
                                       status1);
                        goto out;
                }
        }

        /*
         * We're pretty sure there's a port here.  Lets find out what
         * type of port it is.  The IIR top two bits allows us to find
         * out if it's 8250 or 16450, 16550, 16550A or later.  This
         * determines what we test for next.
         *
         * We also initialise the EFR (if any) to zero for later.  The
         * EFR occupies the same register location as the FCR and IIR.
         */
        serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);
        serial_out(up, UART_EFR, 0);
        serial_out(up, UART_LCR, 0);

        serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO);

        switch (serial_in(up, UART_IIR) & UART_IIR_FIFO_ENABLED) {
        case UART_IIR_FIFO_ENABLED_8250:
                autoconfig_8250(up);
                break;
        case UART_IIR_FIFO_ENABLED_16550:
                port->type = PORT_16550;
                break;
        case UART_IIR_FIFO_ENABLED_16550A:
                autoconfig_16550a(up);
                break;
        default:
                port->type = PORT_UNKNOWN;
                break;
        }

#ifdef CONFIG_SERIAL_8250_RSA
        /*
         * Only probe for RSA ports if we got the region.
         */
        if (port->type == PORT_16550A && up->probe & UART_PROBE_RSA &&
            __enable_rsa(up))
                port->type = PORT_RSA;
#endif

        serial_out(up, UART_LCR, save_lcr);

        port->fifosize = uart_config[up->port.type].fifo_size;
        old_capabilities = up->capabilities;
        up->capabilities = uart_config[port->type].flags;
        up->tx_loadsz = uart_config[port->type].tx_loadsz;

        if (port->type == PORT_UNKNOWN)
                goto out_unlock;

        /*
         * Reset the UART.
         */
#ifdef CONFIG_SERIAL_8250_RSA
        if (port->type == PORT_RSA)
                serial_out(up, UART_RSA_FRR, 0);
#endif
        serial8250_out_MCR(up, save_mcr);
        serial8250_clear_fifos(up);
        serial_in(up, UART_RX);
        serial8250_clear_IER(up);

out_unlock:
        uart_port_unlock_irqrestore(port, flags);

        /*
         * Check if the device is a Fintek F81216A
         */
        if (port->type == PORT_16550A && port->iotype == UPIO_PORT)
                fintek_8250_probe(up);

        if (up->capabilities != old_capabilities) {
                dev_warn(port->dev, "detected caps %08x should be %08x\n",
                         old_capabilities, up->capabilities);
        }
out:
        DEBUG_AUTOCONF("iir=%d ", scratch);
        DEBUG_AUTOCONF("type=%s\n", uart_config[port->type].name);
}

/*
 * Probe which interrupt line the UART is wired to.
 *
 * IRQ probing is armed with probe_irq_on(), the UART is provoked into
 * raising an interrupt, and probe_irq_off() reports which line fired.
 * MCR, IER and (for UPF_FOURPORT ports) the ICP register are restored
 * afterwards.  The result is stored in port->irq; any non-positive
 * probe result is stored as 0.
 */
static void autoconfig_irq(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;
        unsigned char save_mcr, save_ier;
        unsigned char save_ICP = 0;
        unsigned int ICP = 0;
        unsigned long irqs;
        int irq;

        if (port->flags & UPF_FOURPORT) {
                /*
                 * ICP lives at offset 0x1f of the port's 0x20-aligned I/O
                 * block (presumably the board's interrupt control port -
                 * confirm against the board documentation).  Save it, write
                 * 0x80 and read it back.
                 */
                ICP = (port->iobase & 0xfe0) | 0x1f;
                save_ICP = inb_p(ICP);
                outb_p(0x80, ICP);
                inb_p(ICP);
        }

        /* forget possible initially masked and pending IRQ */
        probe_irq_off(probe_irq_on());
        save_mcr = serial8250_in_MCR(up);
        /* Synchronize UART_IER access against the console. */
        uart_port_lock_irq(port);
        save_ier = serial_in(up, UART_IER);
        uart_port_unlock_irq(port);
        serial8250_out_MCR(up, UART_MCR_OUT1 | UART_MCR_OUT2);

        /* Arm the real probe, then toggle MCR outputs to a known state. */
        irqs = probe_irq_on();
        serial8250_out_MCR(up, 0);
        udelay(10);
        if (port->flags & UPF_FOURPORT) {
                /* FOURPORT ports are driven without OUT2 here. */
                serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS);
        } else {
                serial8250_out_MCR(up,
                        UART_MCR_DTR | UART_MCR_RTS | UART_MCR_OUT2);
        }
        /* Synchronize UART_IER access against the console. */
        uart_port_lock_irq(port);
        serial_out(up, UART_IER, UART_IER_ALL_INTR);
        uart_port_unlock_irq(port);
        /*
         * Read the status registers (results discarded), then write a byte
         * to TX so that the UART has a reason to raise its interrupt.
         */
        serial_in(up, UART_LSR);
        serial_in(up, UART_RX);
        serial_in(up, UART_IIR);
        serial_in(up, UART_MSR);
        serial_out(up, UART_TX, 0xFF);
        udelay(20);
        irq = probe_irq_off(irqs);

        /* Restore everything that was modified for the probe. */
        serial8250_out_MCR(up, save_mcr);
        /* Synchronize UART_IER access against the console. */
        uart_port_lock_irq(port);
        serial_out(up, UART_IER, save_ier);
        uart_port_unlock_irq(port);

        if (port->flags & UPF_FOURPORT)
                outb_p(save_ICP, ICP);

        /* Only a positive probe result is accepted as an IRQ number. */
        port->irq = (irq > 0) ? irq : 0;
}

/*
 * Stop reception on @port: drop UART_LSR_DR from the read status mask and
 * mask the receive-data and receiver-line-status interrupts in the IER.
 * Must be called with the port lock held.
 */
static void serial8250_stop_rx(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);

        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&port->lock);

        serial8250_rpm_get(u8250);

        /* Update the software state first, then push the new IER out. */
        u8250->port.read_status_mask &= ~UART_LSR_DR;
        u8250->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
        serial_port_out(port, UART_IER, u8250->ier);

        serial8250_rpm_put(u8250);
}

/**
 * serial8250_em485_stop_tx() - generic ->rs485_stop_tx() callback
 * @p: uart 8250 port
 *
 * Generic callback usable by 8250 uart drivers to stop rs485 transmission.
 */
void serial8250_em485_stop_tx(struct uart_8250_port *p)
{
        unsigned char mcr = serial8250_in_MCR(p);

        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&p->port.lock);

        if (p->port.rs485.flags & SER_RS485_RTS_AFTER_SEND)
                mcr |= UART_MCR_RTS;
        else
                mcr &= ~UART_MCR_RTS;
        serial8250_out_MCR(p, mcr);

        /*
         * Empty the RX FIFO, we are not interested in anything
         * received during the half-duplex transmission.
         * Enable previously disabled RX interrupts.
         */
        if (!(p->port.rs485.flags & SER_RS485_RX_DURING_TX)) {
                serial8250_clear_and_reinit_fifos(p);

                p->ier |= UART_IER_RLSI | UART_IER_RDI;
                serial_port_out(&p->port, UART_IER, p->ier);
        }
}
EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx);

/*
 * hrtimer callback for the em485 stop_tx_timer.
 *
 * If the stop timer is still the active one, invoke the port's
 * ->rs485_stop_tx() callback and mark transmission as stopped; otherwise
 * do nothing.  The timer is never restarted from here.
 */
static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t)
{
        struct uart_8250_em485 *em485 =
                container_of(t, struct uart_8250_em485, stop_tx_timer);
        struct uart_8250_port *up = em485->port;
        unsigned long irqflags;

        serial8250_rpm_get(up);
        uart_port_lock_irqsave(&up->port, &irqflags);

        /* Another timer (or none) became active since we were armed. */
        if (em485->active_timer != &em485->stop_tx_timer)
                goto unlock;

        up->rs485_stop_tx(up);
        em485->active_timer = NULL;
        em485->tx_stopped = true;

unlock:
        uart_port_unlock_irqrestore(&up->port, irqflags);
        serial8250_rpm_put(up);

        return HRTIMER_NORESTART;
}

/*
 * Start @hrt in relative mode (HRTIMER_MODE_REL) with a timeout of @msec
 * milliseconds, converted with ms_to_ktime().
 */
static void start_hrtimer_ms(struct hrtimer *hrt, unsigned long msec)
{
        hrtimer_start(hrt, ms_to_ktime(msec), HRTIMER_MODE_REL);
}

/*
 * Finish an rs485 transmission, either immediately or after a delay.
 *
 * @stop_delay is in nanoseconds; the configured delay_rts_after_send
 * (milliseconds) is added on top of it.  With a zero total the port's
 * ->rs485_stop_tx() callback runs right away, otherwise stop_tx_timer is
 * armed and becomes the active timer.  The port lock must be held.
 */
static void __stop_tx_rs485(struct uart_8250_port *p, u64 stop_delay)
{
        struct uart_8250_em485 *em485 = p->em485;

        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&p->port.lock);

        stop_delay += (u64)p->port.rs485.delay_rts_after_send * NSEC_PER_MSEC;

        /*
         * rs485_stop_tx() is going to set RTS according to config
         * AND flush RX FIFO if required.
         */
        if (stop_delay == 0) {
                /* No delay requested: stop synchronously. */
                p->rs485_stop_tx(p);
                em485->active_timer = NULL;
                em485->tx_stopped = true;
                return;
        }

        /* Defer the stop to the timer callback. */
        em485->active_timer = &em485->stop_tx_timer;
        hrtimer_start(&em485->stop_tx_timer, ns_to_ktime(stop_delay), HRTIMER_MODE_REL);
}

/*
 * Stop transmitting on @p.
 *
 * For em485 ports the stop is only carried out once the holding register
 * is empty (THRE); see the comment in the body for how the shift register
 * is accounted for.  In every case that does not return early, the THRI
 * interrupt is cleared and, if it had been set, the TX runtime-PM
 * reference is dropped.
 */
static inline void __stop_tx(struct uart_8250_port *p)
{
        if (p->em485) {
                const u16 lsr = serial_lsr_in(p);
                u64 extra_ns;

                /* Holding register still busy: try again later. */
                if ((lsr & UART_LSR_THRE) == 0)
                        return;

                /*
                 * To provide required timing and allow FIFO transfer,
                 * __stop_tx_rs485() must be called only when both FIFO and
                 * shift register are empty. The device driver should either
                 * enable interrupt on TEMT or set UART_CAP_NOTEMT that will
                 * enlarge stop_tx_timer by the tx time of one frame to cover
                 * for emptying of the shift register.
                 */
                if (lsr & UART_LSR_TEMT) {
                        extra_ns = 0;
                } else if (p->capabilities & UART_CAP_NOTEMT) {
                        /*
                         * RTS might get deasserted too early with the normal
                         * frame timing formula. It seems to suggest THRE might
                         * get asserted already during tx of the stop bit
                         * rather than after it is fully sent.
                         * Roughly estimate 1 extra bit here with / 7.
                         */
                        extra_ns = p->port.frame_time + DIV_ROUND_UP(p->port.frame_time, 7);
                } else {
                        /* Shift register busy and no NOTEMT fallback. */
                        return;
                }

                __stop_tx_rs485(p, extra_ns);
        }

        if (serial8250_clear_THRI(p))
                serial8250_rpm_put_tx(p);
}

/*
 * Stop transmission on @port.  Runs the common __stop_tx() path and, on a
 * 16C950, additionally sets the TXDIS bit in the ACR so the transmitter
 * itself is disabled.
 */
static void serial8250_stop_tx(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);

        serial8250_rpm_get(u8250);

        __stop_tx(u8250);

        /*
         * We really want to stop the transmitter from sending.
         */
        if (port->type == PORT_16C950) {
                u8250->acr |= UART_ACR_TXDIS;
                serial_icr_write(u8250, UART_ACR, u8250->acr);
        }

        serial8250_rpm_put(u8250);
}

/*
 * Kick off transmission on @port.
 *
 * If a DMA channel is configured and its tx_dma() hook returns 0, nothing
 * else is done (presumably the DMA engine has taken the transfer - confirm
 * against the DMA implementation).  Otherwise the THRI interrupt is enabled
 * and, on a 16C950, a transmitter previously disabled through the ACR is
 * switched back on.
 */
static inline void __start_tx(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);

        if (u8250->dma && !u8250->dma->tx_dma(u8250))
                return;

        /*
         * THRI was newly enabled.  On parts flagged UART_BUG_TXEN, push
         * characters by hand if the holding register is already empty.
         */
        if (serial8250_set_THRI(u8250) && (u8250->bugs & UART_BUG_TXEN)) {
                if (serial_lsr_in(u8250) & UART_LSR_THRE)
                        serial8250_tx_chars(u8250);
        }

        /*
         * Re-enable the transmitter if we disabled it.
         */
        if (port->type != PORT_16C950)
                return;
        if (!(u8250->acr & UART_ACR_TXDIS))
                return;

        u8250->acr &= ~UART_ACR_TXDIS;
        serial_icr_write(u8250, UART_ACR, u8250->acr);
}

/**
 * serial8250_em485_start_tx() - generic ->rs485_start_tx() callback
 * @up: uart 8250 port
 *
 * Generic callback usable by 8250 uart drivers to start rs485 transmission.
 * Assumes that setting the RTS bit in the MCR register means RTS is high.
 * (Some chips use inverse semantics.)  Further assumes that reception is
 * stoppable by disabling the UART_IER_RDI interrupt.  (Some chips set the
 * UART_LSR_DR bit even when UART_IER_RDI is disabled, foiling this approach.)
 */
void serial8250_em485_start_tx(struct uart_8250_port *up)
{
        unsigned char mcr = serial8250_in_MCR(up);

        if (!(up->port.rs485.flags & SER_RS485_RX_DURING_TX))
                serial8250_stop_rx(&up->port);

        if (up->port.rs485.flags & SER_RS485_RTS_ON_SEND)
                mcr |= UART_MCR_RTS;
        else
                mcr &= ~UART_MCR_RTS;
        serial8250_out_MCR(up, mcr);
}
EXPORT_SYMBOL_GPL(serial8250_em485_start_tx);

/*
 * Prepare an em485 port for transmission.
 *
 * Returns false if start_tx_timer was set up to defer the TX start, true
 * if the caller may start transmitting right away.
 */
static bool start_tx_rs485(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);
        struct uart_8250_em485 *em485 = u8250->em485;

        /*
         * While serial8250_em485_handle_stop_tx() is a noop if
         * em485->active_timer != &em485->stop_tx_timer, it might happen that
         * the timer is still armed and triggers only after the current bunch
         * of chars is sent and em485->active_timer == &em485->stop_tx_timer
         * again. So cancel the timer. There is still a theoretical race
         * condition if the timer is already running and only comes around to
         * check for em485->active_timer when &em485->stop_tx_timer is armed
         * again.
         */
        if (em485->active_timer == &em485->stop_tx_timer)
                hrtimer_try_to_cancel(&em485->stop_tx_timer);

        em485->active_timer = NULL;

        /* Transmission is already under way: nothing to switch over. */
        if (!em485->tx_stopped)
                return true;

        em485->tx_stopped = false;
        u8250->rs485_start_tx(u8250);

        if (u8250->port.rs485.delay_rts_before_send > 0) {
                /* Hold off the first character until the delay elapsed. */
                em485->active_timer = &em485->start_tx_timer;
                start_hrtimer_ms(&em485->start_tx_timer,
                                 u8250->port.rs485.delay_rts_before_send);
                return false;
        }

        return true;
}

/*
 * hrtimer callback for the em485 start_tx_timer.
 *
 * If the start timer is still the active one, begin the deferred
 * transmission with __start_tx() and clear the active timer.  The timer is
 * never restarted from here.
 */
static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t)
{
        struct uart_8250_em485 *em485 =
                container_of(t, struct uart_8250_em485, start_tx_timer);
        struct uart_8250_port *up = em485->port;
        unsigned long irqflags;

        uart_port_lock_irqsave(&up->port, &irqflags);

        /* The start was cancelled or superseded in the meantime. */
        if (em485->active_timer != &em485->start_tx_timer)
                goto unlock;

        __start_tx(&up->port);
        em485->active_timer = NULL;

unlock:
        uart_port_unlock_irqrestore(&up->port, irqflags);

        return HRTIMER_NORESTART;
}

/*
 * Start transmission on @port if there is anything to send.
 *
 * Does nothing when there is neither an x_char nor data in the transmit
 * FIFO.  For em485 ports the actual start may be deferred to
 * start_tx_timer, in which case this function returns without calling
 * __start_tx().  The port lock must be held.
 */
static void serial8250_start_tx(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);
        struct uart_8250_em485 *em485 = u8250->em485;

        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&port->lock);

        if (!port->x_char && kfifo_is_empty(&port->state->port.xmit_fifo))
                return;

        serial8250_rpm_get_tx(u8250);

        if (em485) {
                /* A deferred start is already pending. */
                if (em485->active_timer == &em485->start_tx_timer)
                        return;
                /* The start was just deferred to start_tx_timer. */
                if (!start_tx_rs485(port))
                        return;
        }

        __start_tx(port);
}

/* Forward the throttle request to the port's ->throttle() hook. */
static void serial8250_throttle(struct uart_port *port)
{
        (*port->throttle)(port);
}

/* Forward the unthrottle request to the port's ->unthrottle() hook. */
static void serial8250_unthrottle(struct uart_port *port)
{
        (*port->unthrottle)(port);
}

/*
 * Disable modem-status interrupts: turn them off for the GPIO-backed modem
 * lines and clear UART_IER_MSI in the IER.  Does nothing on ports flagged
 * UART_BUG_NOMSR.  The port lock must be held.
 */
static void serial8250_disable_ms(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);

        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&port->lock);

        /* Skipped entirely when there are no MSR capabilities. */
        if (!(u8250->bugs & UART_BUG_NOMSR)) {
                mctrl_gpio_disable_ms(u8250->gpios);

                u8250->ier &= ~UART_IER_MSI;
                serial_port_out(port, UART_IER, u8250->ier);
        }
}

/*
 * Enable modem-status interrupts: turn them on for the GPIO-backed modem
 * lines and set UART_IER_MSI in the IER.  Does nothing on ports flagged
 * UART_BUG_NOMSR.  The port lock must be held.
 */
static void serial8250_enable_ms(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);

        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&port->lock);

        /* Skipped entirely when there are no MSR capabilities. */
        if (!(u8250->bugs & UART_BUG_NOMSR)) {
                mctrl_gpio_enable_ms(u8250->gpios);

                u8250->ier |= UART_IER_MSI;

                /* The register write is bracketed by a runtime-PM get/put. */
                serial8250_rpm_get(u8250);
                serial_port_out(port, UART_IER, u8250->ier);
                serial8250_rpm_put(u8250);
        }
}

/*
 * Receive one character from @up given the line status @lsr.
 *
 * Reads the RX register when UART_LSR_DR is set, merges any saved LSR
 * flags, updates the error counters, derives the TTY flag for the
 * character and hands it to the serial core.  Returns early, without
 * inserting the character, when the break or the character is consumed by
 * uart_handle_break() or uart_prepare_sysrq_char().
 */
void serial8250_read_char(struct uart_8250_port *up, u16 lsr)
{
        struct uart_port *port = &up->port;
        u8 flag = TTY_NORMAL;
        /*
         * Intel 82571 has a Serial Over Lan device that will
         * set UART_LSR_BI without setting UART_LSR_DR when
         * it receives a break. To avoid reading from the
         * receive buffer without UART_LSR_DR bit set, we
         * just force the read character to be 0
         */
        u8 ch = 0;

        if (likely(lsr & UART_LSR_DR))
                ch = serial_in(up, UART_RX);

        port->icount.rx++;

        /* Fold in status bits saved by earlier LSR reads. */
        lsr |= up->lsr_saved_flags;
        up->lsr_saved_flags = 0;

        if (unlikely(lsr & UART_LSR_BRK_ERROR_BITS)) {
                /* Statistics: a break overrides parity and framing errors. */
                if (lsr & UART_LSR_BI) {
                        lsr &= ~(UART_LSR_FE | UART_LSR_PE);
                        port->icount.brk++;
                        /*
                         * We do the SysRQ and SAK checking
                         * here because otherwise the break
                         * may get masked by ignore_status_mask
                         * or read_status_mask.
                         */
                        if (uart_handle_break(port))
                                return;
                } else if (lsr & UART_LSR_PE) {
                        port->icount.parity++;
                } else if (lsr & UART_LSR_FE) {
                        port->icount.frame++;
                }

                if (lsr & UART_LSR_OE)
                        port->icount.overrun++;

                /*
                 * Mask off conditions which should be ignored.
                 */
                lsr &= port->read_status_mask;

                /* Pick the flag from whatever survived the mask. */
                if (lsr & UART_LSR_BI) {
                        dev_dbg(port->dev, "handling break\n");
                        flag = TTY_BREAK;
                } else if (lsr & UART_LSR_PE) {
                        flag = TTY_PARITY;
                } else if (lsr & UART_LSR_FE) {
                        flag = TTY_FRAME;
                }
        }

        if (uart_prepare_sysrq_char(port, ch))
                return;

        uart_insert_char(port, lsr, UART_LSR_OE, ch, flag);
}
EXPORT_SYMBOL_GPL(serial8250_read_char);

/*
 * serial8250_rx_chars - Read characters. The first LSR value must be passed in.
 *
 * At most 256 characters are read per call; reading also stops as soon as
 * the LSR shows neither data ready nor a break.  The flip buffer is pushed
 * before returning.
 *
 * Returns LSR bits. The caller should rely only on non-Rx related LSR bits
 * (such as THRE) because the LSR value might come from an already consumed
 * character.
 */
u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr)
{
        struct uart_port *port = &up->port;
        int budget = 256;

        for (;;) {
                serial8250_read_char(up, lsr);

                /* Budget exhausted: return the LSR of the last character. */
                if (--budget == 0)
                        break;

                lsr = serial_in(up, UART_LSR);
                if (!(lsr & (UART_LSR_DR | UART_LSR_BI)))
                        break;
        }

        tty_flip_buffer_push(&port->state->port);
        return lsr;
}
EXPORT_SYMBOL_GPL(serial8250_rx_chars);

/*
 * Feed the transmitter of @up.
 *
 * A pending x_char is sent on its own.  If transmission is stopped or the
 * transmit FIFO is empty, TX is shut down.  Otherwise up to tx_loadsz
 * characters (always at least one attempt) are written to the TX register,
 * writers are woken when the FIFO drops below WAKEUP_CHARS, and TX is
 * stopped once the FIFO is drained unless the port has UART_CAP_RPM.
 */
void serial8250_tx_chars(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;
        struct tty_port *tport = &port->state->port;
        int budget;

        if (port->x_char) {
                uart_xchar_out(port, UART_TX);
                return;
        }
        if (uart_tx_stopped(port)) {
                serial8250_stop_tx(port);
                return;
        }
        if (kfifo_is_empty(&tport->xmit_fifo)) {
                __stop_tx(up);
                return;
        }

        budget = up->tx_loadsz;
        for (;;) {
                unsigned char c;

                if (!uart_fifo_get(port, &c))
                        break;

                serial_out(up, UART_TX, c);

                /*
                 * The Aspeed BMC virtual UARTs have a bug where data
                 * may get stuck in the BMC's Tx FIFO from bursts of
                 * writes on the APB interface.
                 *
                 * Delay back-to-back writes by a read cycle to avoid
                 * stalling the VUART. Read a register that won't have
                 * side-effects and discard the result.
                 */
                if (up->bugs & UART_BUG_TXRACE)
                        serial_in(up, UART_SCR);

                if ((up->capabilities & UART_CAP_HFIFO) &&
                    !uart_lsr_tx_empty(serial_in(up, UART_LSR)))
                        break;

                /* The BCM2835 MINI UART THRE bit is really a not-full bit. */
                if ((up->capabilities & UART_CAP_MINI) &&
                    !(serial_in(up, UART_LSR) & UART_LSR_THRE))
                        break;

                if (--budget <= 0)
                        break;
        }

        if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
                uart_write_wakeup(port);

        /*
         * With RPM enabled, we have to wait until the FIFO is empty before the
         * HW can go idle. So we get here once again with empty FIFO and disable
         * the interrupt and RPM in __stop_tx()
         */
        if (kfifo_is_empty(&tport->xmit_fifo) &&
            !(up->capabilities & UART_CAP_RPM))
                __stop_tx(up);
}
EXPORT_SYMBOL_GPL(serial8250_tx_chars);

/*
 * Read the MSR, merge in previously saved delta flags and, when modem
 * status interrupts are enabled and the port has state attached, account
 * for the line changes and wake waiters on delta_msr_wait.
 *
 * Returns the merged MSR value.  Caller holds uart port lock.
 */
unsigned int serial8250_modem_status(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;
        unsigned int msr = serial_in(up, UART_MSR);

        msr |= up->msr_saved_flags;
        up->msr_saved_flags = 0;

        /* Nothing changed, MSI is off, or nobody is attached to the port. */
        if (!(msr & UART_MSR_ANY_DELTA) || !(up->ier & UART_IER_MSI) ||
            port->state == NULL)
                return msr;

        if (msr & UART_MSR_TERI)
                port->icount.rng++;
        if (msr & UART_MSR_DDSR)
                port->icount.dsr++;
        if (msr & UART_MSR_DDCD)
                uart_handle_dcd_change(port, msr & UART_MSR_DCD);
        if (msr & UART_MSR_DCTS)
                uart_handle_cts_change(port, msr & UART_MSR_CTS);

        wake_up_interruptible(&port->state->port.delta_msr_wait);

        return msr;
}
EXPORT_SYMBOL_GPL(serial8250_modem_status);

/*
 * Decide how a receive event is handled on a DMA-capable port, based on
 * the interrupt cause in the low six bits of @iir.
 *
 * Returns false for a THRI cause, true after flushing the RX DMA, and
 * otherwise whatever the ->rx_dma() hook returns.  The caller reads
 * characters by PIO when the result is true.
 */
static bool handle_rx_dma(struct uart_8250_port *up, unsigned int iir)
{
        const unsigned int cause = iir & 0x3f;

        /*
         * Postpone DMA or not decision to IIR_RDI or IIR_RX_TIMEOUT
         * because it's impossible to do an informed decision about
         * that with IIR_THRI.
         *
         * This also fixes one known DMA Rx corruption issue where
         * DR is asserted but DMA Rx only gets a corrupted zero byte
         * (too early DR?).
         */
        if (cause == UART_IIR_THRI)
                return false;

        /*
         * Line status and RX timeout always flush; received-data does so
         * only while an RX DMA transfer is running.
         */
        if (cause == UART_IIR_RLSI || cause == UART_IIR_RX_TIMEOUT ||
            (cause == UART_IIR_RDI && up->dma->rx_running)) {
                serial8250_rx_dma_flush(up);
                return true;
        }

        return up->dma->rx_dma(up);
}

/*
 * This handles the interrupt from one port.
 *
 * @iir is the IIR value already read by the caller.  Returns 0 without
 * touching the port when @iir has UART_IIR_NO_INT set, and 1 after the
 * port has been serviced (receive, modem status, transmit - in that
 * order) under the port lock.
 */
int serial8250_handle_irq(struct uart_port *port, unsigned int iir)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        struct tty_port *tport = &port->state->port;
        bool skip_rx = false;
        unsigned long flags;
        u16 status;

        if (iir & UART_IIR_NO_INT)
                return 0;

        uart_port_lock_irqsave(port, &flags);

        status = serial_lsr_in(up);

        /*
         * If port is stopped and there are no error conditions in the
         * FIFO, then don't drain the FIFO, as this may lead to TTY buffer
         * overflow. Not servicing, RX FIFO would trigger auto HW flow
         * control when FIFO occupancy reaches preset threshold, thus
         * halting RX. This only works when auto HW flow control is
         * available.
         */
        if (!(status & (UART_LSR_FIFOE | UART_LSR_BRK_ERROR_BITS)) &&
            (port->status & (UPSTAT_AUTOCTS | UPSTAT_AUTORTS)) &&
            !(port->read_status_mask & UART_LSR_DR))
                skip_rx = true;

        if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) {
                struct irq_data *d;

                /* Report a wakeup event if this IRQ is a wakeup source. */
                d = irq_get_irq_data(port->irq);
                if (d && irqd_is_wakeup_set(d))
                        pm_wakeup_event(tport->tty->dev, 0);
                /*
                 * Read by PIO when there is no DMA, or when
                 * handle_rx_dma() returns true.
                 */
                if (!up->dma || handle_rx_dma(up, iir))
                        status = serial8250_rx_chars(up, status);
        }
        /* Return value unused: called for its accounting side effects. */
        serial8250_modem_status(up);
        /* Only service TX when the THRE interrupt is actually enabled. */
        if ((status & UART_LSR_THRE) && (up->ier & UART_IER_THRI)) {
                if (!up->dma || up->dma->tx_err)
                        serial8250_tx_chars(up);
                else if (!up->dma->tx_running)
                        __stop_tx(up);
        }

        uart_unlock_and_check_sysrq_irqrestore(port, flags);

        return 1;
}
EXPORT_SYMBOL_GPL(serial8250_handle_irq);

/*
 * Default per-port interrupt handler: read the IIR and pass it to
 * serial8250_handle_irq(), with a runtime-PM get/put around the whole
 * operation.  Returns the result of serial8250_handle_irq().
 */
static int serial8250_default_handle_irq(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);
        int handled;

        serial8250_rpm_get(u8250);
        handled = serial8250_handle_irq(port, serial_port_in(port, UART_IIR));
        serial8250_rpm_put(u8250);

        return handled;
}

/*
 * Newer 16550 compatible parts such as the SC16C650 & Altera 16550 Soft IP
 * have a programmable TX threshold that triggers the THRE interrupt in
 * the IIR register. In this case, the THRE interrupt indicates the FIFO
 * has space available. Load it up with tx_loadsz bytes.
 *
 * After the optional refill the IIR is read again and handed to
 * serial8250_handle_irq(), whose result is returned.
 */
static int serial8250_tx_threshold_handle_irq(struct uart_port *port)
{
        unsigned int cause = serial_port_in(port, UART_IIR) & UART_IIR_ID;

        /* TX Threshold IRQ triggered so load up FIFO */
        if (cause == UART_IIR_THRI) {
                struct uart_8250_port *u8250 = up_to_u8250p(port);
                unsigned long irqflags;

                uart_port_lock_irqsave(port, &irqflags);
                serial8250_tx_chars(u8250);
                uart_port_unlock_irqrestore(port, irqflags);
        }

        return serial8250_handle_irq(port, serial_port_in(port, UART_IIR));
}

/*
 * Report whether the transmitter is idle.
 *
 * Returns TIOCSER_TEMT when no TX DMA is running and the LSR reports the
 * transmitter as empty, 0 otherwise.
 */
static unsigned int serial8250_tx_empty(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);
        unsigned int temt = 0;
        unsigned long irqflags;

        serial8250_rpm_get(u8250);
        uart_port_lock_irqsave(port, &irqflags);

        /* While DMA is still running the LSR is not even consulted. */
        if (serial8250_tx_dma_running(u8250))
                goto unlock;

        if (uart_lsr_tx_empty(serial_lsr_in(u8250)))
                temt = TIOCSER_TEMT;

unlock:
        uart_port_unlock_irqrestore(port, irqflags);
        serial8250_rpm_put(u8250);

        return temt;
}

/*
 * Read the modem control lines of @port.
 *
 * The MSR is sampled through serial8250_modem_status() and converted to
 * TIOCM bits.  When the port has GPIO-backed modem lines, the value is
 * passed through mctrl_gpio_get() and its result is returned instead.
 */
unsigned int serial8250_do_get_mctrl(struct uart_port *port)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);
        unsigned int msr;
        unsigned int mctrl;

        serial8250_rpm_get(u8250);
        msr = serial8250_modem_status(u8250);
        serial8250_rpm_put(u8250);

        mctrl = serial8250_MSR_to_TIOCM(msr);

        if (!u8250->gpios)
                return mctrl;

        return mctrl_gpio_get(u8250->gpios, &mctrl);
}
EXPORT_SYMBOL_GPL(serial8250_do_get_mctrl);

/*
 * Return the modem control state, preferring a port-specific
 * ->get_mctrl() hook over the generic implementation.
 */
static unsigned int serial8250_get_mctrl(struct uart_port *port)
{
        if (!port->get_mctrl)
                return serial8250_do_get_mctrl(port);

        return port->get_mctrl(port);
}

/*
 * Program the MCR from the TIOCM bits in @mctrl, OR-ed with the MCR bits
 * kept in up->mcr.
 */
void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);
        unsigned char mcr = serial8250_TIOCM_to_MCR(mctrl) | u8250->mcr;

        serial8250_out_MCR(u8250, mcr);
}
EXPORT_SYMBOL_GPL(serial8250_do_set_mctrl);

/*
 * Set the modem control lines, preferring a port-specific ->set_mctrl()
 * hook over the generic implementation.  The request is ignored while
 * rs485 mode is enabled on the port.
 */
static void serial8250_set_mctrl(struct uart_port *port, unsigned int mctrl)
{
        if (port->rs485.flags & SER_RS485_ENABLED)
                return;

        if (!port->set_mctrl) {
                serial8250_do_set_mctrl(port, mctrl);
                return;
        }

        port->set_mctrl(port, mctrl);
}

/*
 * Start or stop sending a break: a @break_state of -1 sets UART_LCR_SBC,
 * any other value clears it.  The cached LCR is updated and written to the
 * hardware under the port lock.
 */
static void serial8250_break_ctl(struct uart_port *port, int break_state)
{
        struct uart_8250_port *u8250 = up_to_u8250p(port);
        unsigned long irqflags;

        serial8250_rpm_get(u8250);
        uart_port_lock_irqsave(port, &irqflags);

        u8250->lcr = (break_state == -1) ?
                (u8250->lcr | UART_LCR_SBC) : (u8250->lcr & ~UART_LCR_SBC);
        serial_port_out(port, UART_LCR, u8250->lcr);

        uart_port_unlock_irqrestore(port, irqflags);
        serial8250_rpm_put(u8250);
}

/*
 * Poll the LSR until every bit in @bits is set, giving up after 10000
 * polls spaced 1us apart.  The NMI watchdog is touched on every wait.
 */
static void wait_for_lsr(struct uart_8250_port *up, int bits)
{
        unsigned int remaining = 10000;
        unsigned int lsr;

        /* Wait up to 10ms for the character(s) to be sent. */
        while (lsr = serial_lsr_in(up), (lsr & bits) != bits) {
                if (--remaining == 0)
                        break;
                udelay(1);
                touch_nmi_watchdog();
        }
}

/*
 *        Wait for transmitter & holding register to empty
 *
 * After waiting for @bits in the LSR, ports with UPF_CONS_FLOW also wait
 * for CTS to be asserted, polling the MSR up to 1000000 times at 1us
 * intervals.  MSR bits selected by MSR_SAVE_FLAGS are accumulated in
 * msr_saved_flags on every poll.
 */
static void wait_for_xmitr(struct uart_8250_port *up, int bits)
{
        unsigned int remaining = 1000000;

        wait_for_lsr(up, bits);

        if (!(up->port.flags & UPF_CONS_FLOW))
                return;

        /* Wait up to 1s for flow control if necessary */
        while (remaining) {
                unsigned int msr = serial_in(up, UART_MSR);

                up->msr_saved_flags |= msr & MSR_SAVE_FLAGS;
                if (msr & UART_MSR_CTS)
                        return;

                udelay(1);
                touch_nmi_watchdog();
                remaining--;
        }
}

#ifdef CONFIG_CONSOLE_POLL
/*
 * Console polling routines for writing and reading from the uart while
 * in an interrupt or debug context.
 */

/*
 * Polled read of one character: returns the RX register contents when the
 * LSR reports data ready, NO_POLL_CHAR otherwise.
 */
static int serial8250_get_poll_char(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        int ch = NO_POLL_CHAR;
        u16 lsr;

        serial8250_rpm_get(up);

        lsr = serial_port_in(port, UART_LSR);
        if (lsr & UART_LSR_DR)
                ch = serial_port_in(port, UART_RX);

        serial8250_rpm_put(up);
        return ch;
}


/*
 * Polled write of one character @c with the UART's interrupts masked for
 * the duration of the transfer.
 */
static void serial8250_put_poll_char(struct uart_port *port,
                         unsigned char c)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        unsigned int saved_ier;

        /*
         * UART_IER access is normally synchronized against the console by
         * holding the port lock.  This function is only used by KDB/KGDB,
         * where taking the port lock may be impossible because all other
         * CPUs are quiesced; that same quiescence should make the lockless
         * access below safe.
         */

        serial8250_rpm_get(up);

        /* Remember the current IER, then disable the interrupts. */
        saved_ier = serial_port_in(port, UART_IER);
        serial8250_clear_IER(up);

        /* Wait for the transmitter, then send the character out. */
        wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
        serial_port_out(port, UART_TX, c);

        /* Wait for the transmitter to become empty, then restore the IER. */
        wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);
        serial_port_out(port, UART_IER, saved_ier);

        serial8250_rpm_put(up);
}

#endif /* CONFIG_CONSOLE_POLL */

/*
 * serial8250_do_startup - generic startup sequence for an 8250-style port
 * @port: port being started
 *
 * Fills in fifosize, tx_loadsz and capabilities from uart_config[] when the
 * port has not set them, performs the type-specific wake-up/reset steps
 * (16C950, DA830, RSA, XR16C850, Altera 16550), clears the FIFOs and the
 * interrupt registers, installs the interrupt via up->ops->setup_irq(),
 * probes for the THRE and TXEN quirks, programs the modem control lines and
 * optionally requests DMA.  Only the IER shadow (up->ier) is set here; the
 * comment near the end explains why the real enable is deferred.
 *
 * The whole sequence is bracketed by serial8250_rpm_get()/_put().
 *
 * Returns 0 on success, -ENODEV when the LSR still reads 0xff after the
 * registers were cleared (unless UPF_BUGGY_UART is set), or the error
 * returned by up->ops->setup_irq().
 */
int serial8250_do_startup(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        unsigned long flags;
        unsigned char iir;
        int retval;
        u16 lsr;

        /* Take defaults from the per-type table for anything left at zero */
        if (!port->fifosize)
                port->fifosize = uart_config[port->type].fifo_size;
        if (!up->tx_loadsz)
                up->tx_loadsz = uart_config[port->type].tx_loadsz;
        if (!up->capabilities)
                up->capabilities = uart_config[port->type].flags;
        up->mcr = 0;

        if (port->iotype != up->cur_iotype)
                set_io_from_upio(port);

        serial8250_rpm_get(up);
        if (port->type == PORT_16C950) {
                /*
                 * Wake up and initialize UART
                 *
                 * Synchronize UART_IER access against the console.
                 */
                uart_port_lock_irqsave(port, &flags);
                up->acr = 0;
                serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
                serial_port_out(port, UART_EFR, UART_EFR_ECB);
                serial_port_out(port, UART_IER, 0);
                serial_port_out(port, UART_LCR, 0);
                serial_icr_write(up, UART_CSR, 0); /* Reset the UART */
                serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
                serial_port_out(port, UART_EFR, UART_EFR_ECB);
                serial_port_out(port, UART_LCR, 0);
                uart_port_unlock_irqrestore(port, flags);
        }

        if (port->type == PORT_DA830) {
                /*
                 * Reset the port
                 *
                 * Synchronize UART_IER access against the console.
                 */
                uart_port_lock_irqsave(port, &flags);
                serial_port_out(port, UART_IER, 0);
                serial_port_out(port, UART_DA830_PWREMU_MGMT, 0);
                uart_port_unlock_irqrestore(port, flags);
                mdelay(10);

                /* Enable Tx, Rx and free run mode */
                serial_port_out(port, UART_DA830_PWREMU_MGMT,
                                UART_DA830_PWREMU_MGMT_UTRST |
                                UART_DA830_PWREMU_MGMT_URRST |
                                UART_DA830_PWREMU_MGMT_FREE);
        }

#ifdef CONFIG_SERIAL_8250_RSA
        /*
         * If this is an RSA port, see if we can kick it up to the
         * higher speed clock.
         */
        enable_rsa(up);
#endif

        /*
         * Clear the FIFO buffers and disable them.
         * (they will be reenabled in set_termios())
         */
        serial8250_clear_fifos(up);

        /*
         * Clear the interrupt registers.
         */
        serial_port_in(port, UART_LSR);
        serial_port_in(port, UART_RX);
        serial_port_in(port, UART_IIR);
        serial_port_in(port, UART_MSR);

        /*
         * At this point, there's no way the LSR could still be 0xff;
         * if it is, then bail out, because there's likely no UART
         * here.
         */
        if (!(port->flags & UPF_BUGGY_UART) &&
            (serial_port_in(port, UART_LSR) == 0xff)) {
                dev_info_ratelimited(port->dev, "LSR safety check engaged!\n");
                retval = -ENODEV;
                goto out;
        }

        /*
         * For a XR16C850, we need to set the trigger levels
         */
        if (port->type == PORT_16850) {
                unsigned char fctr;

                serial_out(up, UART_LCR, UART_LCR_CONF_MODE_B);

                fctr = serial_in(up, UART_FCTR) & ~(UART_FCTR_RX|UART_FCTR_TX);
                serial_port_out(port, UART_FCTR,
                                fctr | UART_FCTR_TRGD | UART_FCTR_RX);
                serial_port_out(port, UART_TRG, UART_TRG_96);
                serial_port_out(port, UART_FCTR,
                                fctr | UART_FCTR_TRGD | UART_FCTR_TX);
                serial_port_out(port, UART_TRG, UART_TRG_96);

                serial_port_out(port, UART_LCR, 0);
        }

        /*
         * For the Altera 16550 variants, set TX threshold trigger level.
         */
        if (((port->type == PORT_ALTR_16550_F32) ||
             (port->type == PORT_ALTR_16550_F64) ||
             (port->type == PORT_ALTR_16550_F128)) && (port->fifosize > 1)) {
                /* Bounds checking of TX threshold (valid 0 to fifosize-2) */
                if ((up->tx_loadsz < 2) || (up->tx_loadsz > port->fifosize)) {
                        dev_err(port->dev, "TX FIFO Threshold errors, skipping\n");
                } else {
                        serial_port_out(port, UART_ALTR_AFR,
                                        UART_ALTR_EN_TXFIFO_LW);
                        serial_port_out(port, UART_ALTR_TX_LOW,
                                        port->fifosize - up->tx_loadsz);
                        port->handle_irq = serial8250_tx_threshold_handle_irq;
                }
        }

        /* Check if we need to have shared IRQs */
        if (port->irq && (up->port.flags & UPF_SHARE_IRQ))
                up->port.irqflags |= IRQF_SHARED;

        retval = up->ops->setup_irq(up);
        if (retval)
                goto out;

        if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) {
                unsigned char iir1;

                /* Keep the shared interrupt line quiet while probing */
                if (port->irqflags & IRQF_SHARED)
                        disable_irq_nosync(port->irq);

                /*
                 * Test for UARTs that do not reassert THRE when the
                 * transmitter is idle and the interrupt has already
                 * been cleared.  Real 16550s should always reassert
                 * this interrupt whenever the transmitter is idle and
                 * the interrupt is enabled.  Delays are necessary to
                 * allow register changes to become visible.
                 *
                 * Synchronize UART_IER access against the console.
                 */
                uart_port_lock_irqsave(port, &flags);

                wait_for_xmitr(up, UART_LSR_THRE);
                serial_port_out_sync(port, UART_IER, UART_IER_THRI);
                udelay(1); /* allow THRE to set */
                iir1 = serial_port_in(port, UART_IIR);
                serial_port_out(port, UART_IER, 0);
                serial_port_out_sync(port, UART_IER, UART_IER_THRI);
                udelay(1); /* allow a working UART time to re-assert THRE */
                iir = serial_port_in(port, UART_IIR);
                serial_port_out(port, UART_IER, 0);

                uart_port_unlock_irqrestore(port, flags);

                if (port->irqflags & IRQF_SHARED)
                        enable_irq(port->irq);

                /*
                 * If the interrupt is not reasserted, or we otherwise
                 * don't trust the iir, setup a timer to kick the UART
                 * on a regular basis.
                 */
                if ((!(iir1 & UART_IIR_NO_INT) && (iir & UART_IIR_NO_INT)) ||
                    up->port.flags & UPF_BUG_THRE) {
                        up->bugs |= UART_BUG_THRE;
                }
        }

        up->ops->setup_timer(up);

        /*
         * Now, initialize the UART
         */
        serial_port_out(port, UART_LCR, UART_LCR_WLEN8);

        uart_port_lock_irqsave(port, &flags);
        if (up->port.flags & UPF_FOURPORT) {
                if (!up->port.irq)
                        up->port.mctrl |= TIOCM_OUT1;
        } else
                /*
                 * Most PC uarts need OUT2 raised to enable interrupts.
                 */
                if (port->irq)
                        up->port.mctrl |= TIOCM_OUT2;

        serial8250_set_mctrl(port, port->mctrl);

        /*
         * Serial over LAN (SoL) hack:
         * Intel 8257x Gigabit ethernet chips have a 16550 emulation, to be
         * used for Serial over LAN.  Those chips take longer than a normal
         * serial device to signal that transmit data was queued.  Because
         * of that, the test below generally fails.  One solution would be
         * to delay the reading of iir.  However, this is not reliable,
         * since the timeout is variable.  So simply skip the TX irq test
         * for such ports; this way UART_BUG_TXEN is never enabled for them.
         */
        if (up->port.quirks & UPQ_NO_TXEN_TEST)
                goto dont_test_tx_en;

        /*
         * Do a quick test to see if we receive an interrupt when we enable
         * the TX irq.
         */
        serial_port_out(port, UART_IER, UART_IER_THRI);
        lsr = serial_port_in(port, UART_LSR);
        iir = serial_port_in(port, UART_IIR);
        serial_port_out(port, UART_IER, 0);

        /* Transmitter empty but no interrupt pending: flag the TXEN bug */
        if (lsr & UART_LSR_TEMT && iir & UART_IIR_NO_INT) {
                if (!(up->bugs & UART_BUG_TXEN)) {
                        up->bugs |= UART_BUG_TXEN;
                        dev_dbg(port->dev, "enabling bad tx status workarounds\n");
                }
        } else {
                up->bugs &= ~UART_BUG_TXEN;
        }

dont_test_tx_en:
        uart_port_unlock_irqrestore(port, flags);

        /*
         * Clear the interrupt registers again for luck, and clear the
         * saved flags to avoid getting false values from polling
         * routines or the previous session.
         */
        serial_port_in(port, UART_LSR);
        serial_port_in(port, UART_RX);
        serial_port_in(port, UART_IIR);
        serial_port_in(port, UART_MSR);
        up->lsr_saved_flags = 0;
        up->msr_saved_flags = 0;

        /*
         * Request DMA channels for both RX and TX.
         */
        if (up->dma) {
                const char *msg = NULL;

                if (uart_console(port))
                        msg = "forbid DMA for kernel console";
                else if (serial8250_request_dma(up))
                        msg = "failed to request DMA";
                /* Either way, drop DMA for this port and say why */
                if (msg) {
                        dev_warn_ratelimited(port->dev, "%s\n", msg);
                        up->dma = NULL;
                }
        }

        /*
         * Set the IER shadow for rx interrupts but defer actual interrupt
         * enable until after the FIFOs are enabled; otherwise, an already-
         * active sender can swamp the interrupt handler with "too much work".
         */
        up->ier = UART_IER_RLSI | UART_IER_RDI;

        if (port->flags & UPF_FOURPORT) {
                unsigned int icp;
                /*
                 * Enable interrupts on the AST Fourport board
                 */
                icp = (port->iobase & 0xfe0) | 0x01f;
                outb_p(0x80, icp);
                inb_p(icp);
        }
        retval = 0;
out:
        /* Common exit: drop the runtime-PM reference taken above */
        serial8250_rpm_put(up);
        return retval;
}
EXPORT_SYMBOL_GPL(serial8250_do_startup);

/*
 * Start the port through its own startup() hook when one is installed,
 * otherwise through serial8250_do_startup().
 */
static int serial8250_startup(struct uart_port *port)
{
        if (!port->startup)
                return serial8250_do_startup(port);

        return port->startup(port);
}

/*
 * serial8250_do_shutdown - generic shutdown sequence for an 8250-style port
 * @port: port being shut down
 *
 * Masks the port's interrupts (IER and its shadow), waits for a running
 * handler with synchronize_irq(), releases DMA if it was in use, updates
 * the OUT1/OUT2 modem control bits, clears the break condition and the
 * FIFOs, and finally releases the interrupt through up->ops->release_irq().
 * Register accesses are bracketed by serial8250_rpm_get()/_put().
 */
void serial8250_do_shutdown(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        unsigned long flags;

        serial8250_rpm_get(up);
        /*
         * Disable interrupts from this port
         *
         * Synchronize UART_IER access against the console.
         */
        uart_port_lock_irqsave(port, &flags);
        up->ier = 0;
        serial_port_out(port, UART_IER, 0);
        uart_port_unlock_irqrestore(port, flags);

        synchronize_irq(port->irq);

        if (up->dma)
                serial8250_release_dma(up);

        uart_port_lock_irqsave(port, &flags);
        if (port->flags & UPF_FOURPORT) {
                /* reset interrupts on the AST Fourport board */
                inb((port->iobase & 0xfe0) | 0x1f);
                port->mctrl |= TIOCM_OUT1;
        } else
                port->mctrl &= ~TIOCM_OUT2;

        serial8250_set_mctrl(port, port->mctrl);
        uart_port_unlock_irqrestore(port, flags);

        /*
         * Disable break condition and FIFOs
         */
        serial_port_out(port, UART_LCR,
                        serial_port_in(port, UART_LCR) & ~UART_LCR_SBC);
        serial8250_clear_fifos(up);

#ifdef CONFIG_SERIAL_8250_RSA
        /*
         * Reset the RSA board back to 115kbps compat mode.
         */
        disable_rsa(up);
#endif

        /*
         * Read data port to reset things, and then unlink from
         * the IRQ chain.
         */
        serial_port_in(port, UART_RX);
        serial8250_rpm_put(up);

        up->ops->release_irq(up);
}
EXPORT_SYMBOL_GPL(serial8250_do_shutdown);

/*
 * Shut the port down through its own shutdown() hook when one is
 * installed, otherwise through serial8250_do_shutdown().
 */
static void serial8250_shutdown(struct uart_port *port)
{
        if (!port->shutdown) {
                serial8250_do_shutdown(port);
                return;
        }

        port->shutdown(port);
}

/*
 * Compute the divisor for @baud.  @frac is accepted for interface
 * compatibility with the get_divisor() hook and is not written here.
 */
static unsigned int serial8250_do_get_divisor(struct uart_port *port,
                                              unsigned int baud,
                                              unsigned int *frac)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        unsigned int divisor;

        /*
         * SMSC Super I/O chips accept "magic" divisors for baud rates above
         * baud_base.  Custom rates from clk/6 and clk/12 upwards are clamped
         * to clk/4 (divisor 0x8001) and clk/8 (divisor 0x8002) respectively.
         * These magic divisors reprogram the reference clock of the baud
         * rate generator, which is derived from the chip's 14.318MHz clock
         * input.
         *
         * According to the documentation, the magic divisors select base
         * frequencies of 7.3728MHz and 3.6864MHz for the extra 460800bps
         * and 230400bps rates, instead of the usual 1.8432MHz.  Empirical
         * evidence contradicts that.
         *
         * What actually happens is that bit 7 of the DLM register (bit 15
         * of the divisor) selects a prescaler for a base frequency that is
         * always 7.3728MHz.  With the bit clear the base frequency is
         * divided by 4 before reaching the Baud Rate Generator, giving the
         * usual arrangement where a divisor of 1 yields 115200bps.  With
         * the bit set, and high-speed operation enabled in the Serial Port
         * Mode Register in the Device Configuration Space, the base
         * frequency feeds the Baud Rate Generator directly, so divisors
         * 0x8001, 0x8002, 0x8003, 0x8004, etc. produce 460800bps,
         * 230400bps, 153600bps, 115200bps, etc.
         *
         * Only the low 15 bits of the divisor ever divide the baud base, so
         * 32767 is the largest usable divisor even though the
         * documentation says the programmable Baud Rate Generator can
         * divide the internal PLL clock by anything from 1 to 65535.
         */
        if (!(port->flags & UPF_MAGIC_MULTIPLIER))
                divisor = uart_get_divisor(port, baud);
        else if (baud >= port->uartclk / 6)
                divisor = 0x8001;
        else if (baud >= port->uartclk / 12)
                divisor = 0x8002;
        else
                divisor = uart_get_divisor(port, baud);

        /*
         * Oxford Semi 952 rev B workaround: bump the divisor when its low
         * byte is zero.
         */
        if ((up->bugs & UART_BUG_QUOT) && (divisor & 0xff) == 0)
                divisor++;

        return divisor;
}

/*
 * Obtain the divisor for @baud from the port's get_divisor() hook when one
 * is installed, otherwise from serial8250_do_get_divisor().
 */
static unsigned int serial8250_get_divisor(struct uart_port *port,
                                           unsigned int baud,
                                           unsigned int *frac)
{
        if (!port->get_divisor)
                return serial8250_do_get_divisor(port, baud, frac);

        return port->get_divisor(port, baud, frac);
}

/*
 * Build an LCR value from the termios control flags: the word length
 * comes from tty_get_char_size(), and the stop-bit and parity bits are
 * set according to CSTOPB, PARENB, PARODD and CMSPAR.  @up is not used.
 */
static unsigned char serial8250_compute_lcr(struct uart_8250_port *up,
                                            tcflag_t c_cflag)
{
        unsigned char lcr = UART_LCR_WLEN(tty_get_char_size(c_cflag));

        lcr |= (c_cflag & CSTOPB) ? UART_LCR_STOP : 0;
        lcr |= (c_cflag & PARENB) ? UART_LCR_PARITY : 0;
        /* The even-parity bit is set whenever PARODD is absent */
        lcr |= (c_cflag & PARODD) ? 0 : UART_LCR_EPAR;
        lcr |= (c_cflag & CMSPAR) ? UART_LCR_SPAR : 0;

        return lcr;
}

/*
 * Program the divisor latch with @quot.  @quot_frac is not used by this
 * generic implementation.
 */
void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud,
                               unsigned int quot, unsigned int quot_frac)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        /* Workaround to enable 115200 baud on OMAP1510 internal ports */
        if (is_omap1510_8250(up)) {
                int fast = (baud == 115200);

                if (fast)
                        quot = 1;
                serial_port_out(port, UART_OMAP_OSC_12M_SEL, fast);
        }

        /*
         * NatSemi parts are switched to bank 2 rather than bank 1 so that
         * EXCR2 is not reset; everything else just gets DLAB set.
         */
        serial_port_out(port, UART_LCR,
                        (up->capabilities & UART_NATSEMI) ?
                        0xe0 : (up->lcr | UART_LCR_DLAB));

        serial_dl_write(up, quot);
}
EXPORT_SYMBOL_GPL(serial8250_do_set_divisor);

/*
 * Program the divisor through the port's set_divisor() hook when one is
 * installed, otherwise through serial8250_do_set_divisor().
 */
static void serial8250_set_divisor(struct uart_port *port, unsigned int baud,
                                   unsigned int quot, unsigned int quot_frac)
{
        if (!port->set_divisor) {
                serial8250_do_set_divisor(port, baud, quot, quot_frac);
                return;
        }

        port->set_divisor(port, baud, quot, quot_frac);
}

/*
 * Work out the permitted baud rate range for the port and let
 * uart_get_baud_rate() pick the rate for @termios within it.
 */
static unsigned int serial8250_get_baud_rate(struct uart_port *port,
                                             struct ktermios *termios,
                                             const struct ktermios *old)
{
        unsigned int tolerance = port->uartclk / 100;
        unsigned int lowest = port->uartclk / 16 / UART_DIV_MAX;
        unsigned int clk_div = 16;

        /*
         * SMSC Super I/O chips with magic divisors support rates above
         * baud_base: allow the custom clk/4 and clk/8 rates, but exclude
         * divisor values beyond 32767, which are unavailable.
         */
        if (port->flags & UPF_MAGIC_MULTIPLIER) {
                lowest >>= 1;
                clk_div = 4;
        }

        /*
         * The upper limit carries a 1% tolerance so that uart clks which
         * are marginally slower than nominal still match standard baud
         * rates without causing transmission errors.
         */
        return uart_get_baud_rate(port, termios, old, lowest,
                                  (port->uartclk + tolerance) / clk_div);
}

/*
 * Record a new reference clock rate for the port and, when a tty is
 * attached and the tty port is initialized, reprogram the port through
 * serial8250_do_set_termios().
 *
 * Do not call this from within the uart port callbacks: it takes the tty
 * port mutex and would deadlock.  It is primarily meant for handling a
 * sudden reference clock rate change.
 */
void serial8250_update_uartclk(struct uart_port *port, unsigned int uartclk)
{
        struct tty_port *tport = &port->state->port;
        struct tty_struct *tty = tty_port_tty_get(tport);

        /* No tty attached: just store the new rate under the port mutex */
        if (!tty) {
                mutex_lock(&tport->mutex);
                port->uartclk = uartclk;
                mutex_unlock(&tport->mutex);
                return;
        }

        down_write(&tty->termios_rwsem);
        mutex_lock(&tport->mutex);

        if (port->uartclk != uartclk) {
                port->uartclk = uartclk;

                if (tty_port_initialized(tport))
                        serial8250_do_set_termios(port, &tty->termios, NULL);
        }

        mutex_unlock(&tport->mutex);
        up_write(&tty->termios_rwsem);
        tty_kref_put(tty);
}
EXPORT_SYMBOL_GPL(serial8250_update_uartclk);

/*
 * serial8250_do_set_termios - generic termios programming for an 8250 port
 * @port: port being configured
 * @termios: requested settings; c_cflag may be adjusted for UART_CAP_MINI
 *           ports and the baud rate is re-encoded with the value chosen
 * @old: previous settings, passed on to the baud rate lookup
 *
 * Computes the LCR value, baud rate and divisor outside the lock, then,
 * with the port lock held and a runtime-PM reference taken, updates the
 * cached LCR/FCR/MCR/IER values, the status masks and the port timeout and
 * writes IER, EFR (when present), the divisor, FCR, LCR and the modem
 * control lines to the hardware.
 */
void
serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios,
                          const struct ktermios *old)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        unsigned char cval;
        unsigned long flags;
        unsigned int baud, quot, frac = 0;

        /*
         * UART_CAP_MINI ports: drop the stop-bit/parity flags and turn a
         * CS5 or CS6 request into CS7.
         */
        if (up->capabilities & UART_CAP_MINI) {
                termios->c_cflag &= ~(CSTOPB | PARENB | PARODD | CMSPAR);
                if ((termios->c_cflag & CSIZE) == CS5 ||
                    (termios->c_cflag & CSIZE) == CS6)
                        termios->c_cflag = (termios->c_cflag & ~CSIZE) | CS7;
        }
        cval = serial8250_compute_lcr(up, termios->c_cflag);

        baud = serial8250_get_baud_rate(port, termios, old);
        quot = serial8250_get_divisor(port, baud, &frac);

        /*
         * Ok, we're now changing the port state.  Do it with
         * interrupts disabled.
         *
         * Synchronize UART_IER access against the console.
         */
        serial8250_rpm_get(up);
        uart_port_lock_irqsave(port, &flags);

        up->lcr = cval;                                        /* Save computed LCR */

        /* Below 2400 baud without DMA, select the UART_FCR_TRIGGER_1 level */
        if (up->capabilities & UART_CAP_FIFO && port->fifosize > 1) {
                if (baud < 2400 && !up->dma) {
                        up->fcr &= ~UART_FCR_TRIGGER_MASK;
                        up->fcr |= UART_FCR_TRIGGER_1;
                }
        }

        /*
         * MCR-based auto flow control.  When AFE is enabled, RTS will be
         * deasserted when the receive FIFO contains more characters than
         * the trigger, or the MCR RTS bit is cleared.
         */
        if (up->capabilities & UART_CAP_AFE) {
                up->mcr &= ~UART_MCR_AFE;
                if (termios->c_cflag & CRTSCTS)
                        up->mcr |= UART_MCR_AFE;
        }

        /*
         * Update the per-port timeout.
         */
        uart_update_timeout(port, termios->c_cflag, baud);

        /* LSR bits that are reported at all */
        port->read_status_mask = UART_LSR_OE | UART_LSR_THRE | UART_LSR_DR;
        if (termios->c_iflag & INPCK)
                port->read_status_mask |= UART_LSR_FE | UART_LSR_PE;
        if (termios->c_iflag & (IGNBRK | BRKINT | PARMRK))
                port->read_status_mask |= UART_LSR_BI;

        /*
         * Characters to ignore
         */
        port->ignore_status_mask = 0;
        if (termios->c_iflag & IGNPAR)
                port->ignore_status_mask |= UART_LSR_PE | UART_LSR_FE;
        if (termios->c_iflag & IGNBRK) {
                port->ignore_status_mask |= UART_LSR_BI;
                /*
                 * If we're ignoring parity and break indicators,
                 * ignore overruns too (for real raw support).
                 */
                if (termios->c_iflag & IGNPAR)
                        port->ignore_status_mask |= UART_LSR_OE;
        }

        /*
         * ignore all characters if CREAD is not set
         */
        if ((termios->c_cflag & CREAD) == 0)
                port->ignore_status_mask |= UART_LSR_DR;

        /*
         * CTS flow control flag and modem status interrupts
         */
        up->ier &= ~UART_IER_MSI;
        if (!(up->bugs & UART_BUG_NOMSR) &&
                        UART_ENABLE_MS(&up->port, termios->c_cflag))
                up->ier |= UART_IER_MSI;
        if (up->capabilities & UART_CAP_UUE)
                up->ier |= UART_IER_UUE;
        if (up->capabilities & UART_CAP_RTOIE)
                up->ier |= UART_IER_RTOIE;

        serial_port_out(port, UART_IER, up->ier);

        if (up->capabilities & UART_CAP_EFR) {
                unsigned char efr = 0;
                /*
                 * TI16C752/Startech hardware flow control.  FIXME:
                 * - TI16C752 requires control thresholds to be set.
                 * - UART_MCR_RTS is ineffective if auto-RTS mode is enabled.
                 */
                if (termios->c_cflag & CRTSCTS)
                        efr |= UART_EFR_CTS;

                serial_port_out(port, UART_LCR, UART_LCR_CONF_MODE_B);
                if (port->flags & UPF_EXAR_EFR)
                        serial_port_out(port, UART_XR_EFR, efr);
                else
                        serial_port_out(port, UART_EFR, efr);
        }

        serial8250_set_divisor(port, baud, quot, frac);

        /*
         * LCR DLAB must be set to enable 64-byte FIFO mode. If the FCR
         * is written without DLAB set, this mode will be disabled.
         */
        if (port->type == PORT_16750)
                serial_port_out(port, UART_FCR, up->fcr);

        serial_port_out(port, UART_LCR, up->lcr);        /* reset DLAB */
        if (port->type != PORT_16750) {
                /* emulated UARTs (Lucent Venus 167x) need two steps */
                if (up->fcr & UART_FCR_ENABLE_FIFO)
                        serial_port_out(port, UART_FCR, UART_FCR_ENABLE_FIFO);
                serial_port_out(port, UART_FCR, up->fcr);        /* set fcr */
        }
        serial8250_set_mctrl(port, port->mctrl);
        uart_port_unlock_irqrestore(port, flags);
        serial8250_rpm_put(up);

        /* Don't rewrite B0 */
        if (tty_termios_baud_rate(termios))
                tty_termios_encode_baud_rate(termios, baud, baud);
}
EXPORT_SYMBOL(serial8250_do_set_termios);

/*
 * Apply @termios through the port's set_termios() hook when one is
 * installed, otherwise through serial8250_do_set_termios().
 */
static void
serial8250_set_termios(struct uart_port *port, struct ktermios *termios,
                       const struct ktermios *old)
{
        if (!port->set_termios) {
                serial8250_do_set_termios(port, termios, old);
                return;
        }

        port->set_termios(port, termios, old);
}

/*
 * React to a line discipline change: for N_PPS set UPF_HARDPPS_CD and
 * enable modem status interrupts; for any other discipline clear the flag
 * and disable them unless UART_ENABLE_MS() says they are still wanted.
 */
void serial8250_do_set_ldisc(struct uart_port *port, struct ktermios *termios)
{
        if (termios->c_line != N_PPS) {
                port->flags &= ~UPF_HARDPPS_CD;
                if (UART_ENABLE_MS(port, termios->c_cflag))
                        return;

                uart_port_lock_irq(port);
                serial8250_disable_ms(port);
                uart_port_unlock_irq(port);
                return;
        }

        port->flags |= UPF_HARDPPS_CD;
        uart_port_lock_irq(port);
        serial8250_enable_ms(port);
        uart_port_unlock_irq(port);
}
EXPORT_SYMBOL_GPL(serial8250_do_set_ldisc);

/*
 * Forward a line discipline change to the port's set_ldisc() hook when one
 * is installed, otherwise to serial8250_do_set_ldisc().
 */
static void
serial8250_set_ldisc(struct uart_port *port, struct ktermios *termios)
{
        if (!port->set_ldisc) {
                serial8250_do_set_ldisc(port, termios);
                return;
        }

        port->set_ldisc(port, termios);
}

/*
 * Generic power management hook: request sleep for any non-zero @state and
 * wake-up for state 0.  @oldstate is not used.
 */
void serial8250_do_pm(struct uart_port *port, unsigned int state,
                      unsigned int oldstate)
{
        serial8250_set_sleep(up_to_u8250p(port), state != 0);
}
EXPORT_SYMBOL(serial8250_do_pm);

/*
 * Forward a power state change to the port's pm() hook when one is
 * installed, otherwise to serial8250_do_pm().
 */
static void
serial8250_pm(struct uart_port *port, unsigned int state,
              unsigned int oldstate)
{
        if (!port->pm) {
                serial8250_do_pm(port, state, oldstate);
                return;
        }

        port->pm(port, state, oldstate);
}

/*
 * Size of the port's register window: an explicit mapsize wins, otherwise
 * 0x16 (OMAP1) or 8 scaled by the register shift.
 */
static unsigned int serial8250_port_size(struct uart_8250_port *pt)
{
        int base;

        if (pt->port.mapsize)
                return pt->port.mapsize;

        base = is_omap1_8250(pt) ? 0x16 : 8;

        return base << pt->port.regshift;
}

/*
 * Resource handling.
 */
/*
 * Claim the memory or I/O region backing the port and, for memory-mapped
 * ports flagged UPF_IOREMAP, map it.
 *
 * Returns 0 on success (or for an iotype not handled here), -EINVAL when a
 * memory-mapped port has no mapbase, -EBUSY when the region is already
 * taken and -ENOMEM when ioremap() fails.
 */
static int serial8250_request_std_resource(struct uart_8250_port *up)
{
        unsigned int size = serial8250_port_size(up);
        struct uart_port *port = &up->port;

        switch (port->iotype) {
        case UPIO_AU:
        case UPIO_TSI:
        case UPIO_MEM32:
        case UPIO_MEM32BE:
        case UPIO_MEM16:
        case UPIO_MEM:
                if (!port->mapbase)
                        return -EINVAL;

                if (!request_mem_region(port->mapbase, size, "serial"))
                        return -EBUSY;

                if (!(port->flags & UPF_IOREMAP))
                        return 0;

                port->membase = ioremap(port->mapbase, size);
                if (port->membase)
                        return 0;

                /* Mapping failed: give the region back */
                release_mem_region(port->mapbase, size);
                return -ENOMEM;

        case UPIO_HUB6:
        case UPIO_PORT:
                if (!request_region(port->iobase, size, "serial"))
                        return -EBUSY;
                return 0;
        }

        return 0;
}

/*
 * Undo serial8250_request_std_resource(): unmap (UPF_IOREMAP ports) and
 * release the memory region, or release the I/O region.
 */
static void serial8250_release_std_resource(struct uart_8250_port *up)
{
        unsigned int size = serial8250_port_size(up);
        struct uart_port *port = &up->port;

        switch (port->iotype) {
        case UPIO_HUB6:
        case UPIO_PORT:
                release_region(port->iobase, size);
                return;

        case UPIO_AU:
        case UPIO_TSI:
        case UPIO_MEM32:
        case UPIO_MEM32BE:
        case UPIO_MEM16:
        case UPIO_MEM:
                break;

        default:
                return;
        }

        /* Memory-mapped port from here on */
        if (!port->mapbase)
                return;

        if (port->flags & UPF_IOREMAP) {
                iounmap(port->membase);
                port->membase = NULL;
        }

        release_mem_region(port->mapbase, size);
}

/* Release the port's resources via serial8250_release_std_resource(). */
static void serial8250_release_port(struct uart_port *port)
{
        serial8250_release_std_resource(up_to_u8250p(port));
}

/*
 * Claim the port's resources via serial8250_request_std_resource() and
 * pass its return value through.
 */
static int serial8250_request_port(struct uart_port *port)
{
        return serial8250_request_std_resource(up_to_u8250p(port));
}

/*
 * Translate the trigger bits of the cached FCR value (up->fcr) into a byte
 * count using the rxtrig_bytes[] table of the port's uart_config entry.
 *
 * Returns the byte count, or -EOPNOTSUPP when the table entry is zero.
 */
static int fcr_get_rxtrig_bytes(struct uart_8250_port *up)
{
        const struct serial8250_config *cfg = &uart_config[up->port.type];
        unsigned char trig = cfg->rxtrig_bytes[UART_FCR_R_TRIG_BITS(up->fcr)];

        if (!trig)
                return -EOPNOTSUPP;

        return trig;
}

/*
 * Convert a requested RX trigger level in bytes into FCR trigger bits.
 *
 * Returns -EOPNOTSUPP when the port type's rxtrig_bytes[] table has no entry
 * for the lowest trigger setting.  Otherwise the table is scanned upwards
 * and the setting just below the first entry larger than @bytes is chosen;
 * if no entry is larger, UART_FCR_R_TRIG_11 is returned.
 */
static int bytes_to_fcr_rxtrig(struct uart_8250_port *up, unsigned char bytes)
{
        const struct serial8250_config *cfg = &uart_config[up->port.type];
        int level;

        if (!cfg->rxtrig_bytes[UART_FCR_R_TRIG_BITS(UART_FCR_R_TRIG_00)])
                return -EOPNOTSUPP;

        for (level = 1; level < UART_FCR_R_TRIG_MAX_STATE; level++) {
                /* Use the nearest lower value */
                if (bytes < cfg->rxtrig_bytes[level])
                        return (level - 1) << UART_FCR_R_TRIG_SHIFT;
        }

        return UART_FCR_R_TRIG_11;
}

/*
 * Report the current RX trigger level in bytes for the uart behind @port.
 *
 * Returns -EINVAL unless the port has UART_CAP_FIFO and a fifosize greater
 * than one; otherwise returns the result of fcr_get_rxtrig_bytes().
 */
static int do_get_rxtrig(struct tty_port *port)
{
        struct uart_state *state = container_of(port, struct uart_state, port);
        struct uart_port *uport = state->uart_port;
        struct uart_8250_port *up = up_to_u8250p(uport);

        if ((up->capabilities & UART_CAP_FIFO) && uport->fifosize > 1)
                return fcr_get_rxtrig_bytes(up);

        return -EINVAL;
}

/* Run do_get_rxtrig() with port->mutex held and pass its result through. */
static int do_serial8250_get_rxtrig(struct tty_port *port)
{
        int ret;

        mutex_lock(&port->mutex);
        ret = do_get_rxtrig(port);
        mutex_unlock(&port->mutex);

        return ret;
}

/*
 * Show handler for the rx_trig_bytes attribute: prints the current RX
 * trigger level as a decimal number followed by a newline, or returns the
 * negative error code from do_serial8250_get_rxtrig().
 */
static ssize_t rx_trig_bytes_show(struct device *dev,
        struct device_attribute *attr, char *buf)
{
        struct tty_port *port = dev_get_drvdata(dev);
        int trig = do_serial8250_get_rxtrig(port);

        if (trig >= 0)
                return sysfs_emit(buf, "%d\n", trig);

        return trig;
}

/*
 * Program a new RX trigger level, given in bytes, into the uart behind @port.
 *
 * Returns -EINVAL unless the port has UART_CAP_FIFO and a fifosize greater
 * than one, or the error from bytes_to_fcr_rxtrig().  On success the FIFOs
 * are cleared, the trigger bits of the cached FCR value are replaced and the
 * result is written to the FCR register; 0 is returned.
 */
static int do_set_rxtrig(struct tty_port *port, unsigned char bytes)
{
        struct uart_state *state = container_of(port, struct uart_state, port);
        struct uart_port *uport = state->uart_port;
        struct uart_8250_port *up = up_to_u8250p(uport);
        int trig_bits;

        if (!(up->capabilities & UART_CAP_FIFO) || uport->fifosize <= 1)
                return -EINVAL;

        trig_bits = bytes_to_fcr_rxtrig(up, bytes);
        if (trig_bits < 0)
                return trig_bits;

        serial8250_clear_fifos(up);

        /* Swap only the trigger field; keep the other cached FCR bits. */
        up->fcr = (up->fcr & ~UART_FCR_TRIGGER_MASK) | (unsigned char)trig_bits;
        serial_out(up, UART_FCR, up->fcr);

        return 0;
}

/* Run do_set_rxtrig() with port->mutex held and pass its result through. */
static int do_serial8250_set_rxtrig(struct tty_port *port, unsigned char bytes)
{
        int err;

        mutex_lock(&port->mutex);
        err = do_set_rxtrig(port, bytes);
        mutex_unlock(&port->mutex);

        return err;
}

/*
 * Store handler for the rx_trig_bytes attribute.
 *
 * Parses @buf as a base-10 u8 and applies it as the new RX trigger level.
 * Returns @count on success, -EINVAL for an empty write, or the negative
 * error from kstrtou8() or do_serial8250_set_rxtrig().
 */
static ssize_t rx_trig_bytes_store(struct device *dev,
        struct device_attribute *attr, const char *buf, size_t count)
{
        struct tty_port *port = dev_get_drvdata(dev);
        unsigned char trig;
        int err;

        if (count == 0)
                return -EINVAL;

        err = kstrtou8(buf, 10, &trig);
        if (err >= 0)
                err = do_serial8250_set_rxtrig(port, trig);
        if (err < 0)
                return err;

        return count;
}

/*
 * Read/write "rx_trig_bytes" device attribute, served by
 * rx_trig_bytes_show() and rx_trig_bytes_store().
 */
static DEVICE_ATTR_RW(rx_trig_bytes);

/* NULL-terminated list of the 8250-specific device attributes. */
static struct attribute *serial8250_dev_attrs[] = {
        &dev_attr_rx_trig_bytes.attr,
        NULL
};

/*
 * Attribute group wrapping the list above; register_dev_spec_attr_grp()
 * attaches it to ports whose type has a non-zero rxtrig_bytes[0] entry.
 */
static struct attribute_group serial8250_dev_attr_group = {
        .attrs = serial8250_dev_attrs,
};

/*
 * Attach the 8250-specific attribute group to the port, but only when the
 * port type's uart_config entry has a non-zero first rxtrig_bytes[] value.
 */
static void register_dev_spec_attr_grp(struct uart_8250_port *up)
{
        if (!uart_config[up->port.type].rxtrig_bytes[0])
                return;

        up->port.attr_group = &serial8250_dev_attr_group;
}

/*
 * uart_ops .config_port hook.
 *
 * Claims the port's standard resources, then, depending on @flags, probes
 * the UART type (UART_CONFIG_TYPE) and the IRQ (UART_CONFIG_IRQ).  If the
 * type is still PORT_UNKNOWN afterwards the resources are released again.
 * Returns without doing anything if the resources cannot be claimed.
 */
static void serial8250_config_port(struct uart_port *port, int flags)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        /*
         * Find the region that we can probe for.  This in turn
         * tells us whether we can probe for the type of port.
         */
        if (serial8250_request_std_resource(up) < 0)
                return;

        /* Re-select the register accessors if the iotype has changed. */
        if (up->cur_iotype != port->iotype)
                set_io_from_upio(port);

        if (flags & UART_CONFIG_TYPE)
                autoconfig(up);

        /* HW bugs may trigger IRQ while IIR == NO_INT */
        if (port->type == PORT_TEGRA)
                up->bugs |= UART_BUG_NOMSR;

        if ((flags & UART_CONFIG_IRQ) && port->type != PORT_UNKNOWN)
                autoconfig_irq(up);

        if (port->type == PORT_UNKNOWN)
                serial8250_release_std_resource(up);

        register_dev_spec_attr_grp(up);
        up->fcr = uart_config[port->type].fcr;
}

/*
 * uart_ops .verify_port hook: sanity-check user supplied serial settings.
 *
 * Returns -EINVAL when the IRQ is outside [0, nr_irqs), baud_base is below
 * 9600, the type is outside the uart_config[] table, or the type is
 * PORT_CIRRUS or PORT_STARTECH.  Returns 0 otherwise.
 */
static int
serial8250_verify_port(struct uart_port *port, struct serial_struct *ser)
{
        if (ser->irq >= nr_irqs || ser->irq < 0)
                return -EINVAL;

        if (ser->baud_base < 9600)
                return -EINVAL;

        if (ser->type < PORT_UNKNOWN || ser->type >= ARRAY_SIZE(uart_config))
                return -EINVAL;

        if (ser->type == PORT_CIRRUS || ser->type == PORT_STARTECH)
                return -EINVAL;

        return 0;
}

/*
 * uart_ops .type hook: return the name from the port type's uart_config[]
 * entry; a type beyond the end of the table is reported as entry 0.
 */
static const char *serial8250_type(struct uart_port *port)
{
        int type = port->type;

        return uart_config[type >= ARRAY_SIZE(uart_config) ? 0 : type].name;
}

/*
 * uart_ops table for 8250 ports; serial8250_init_port() installs it as
 * port->ops.  The polling hooks are only present with CONFIG_CONSOLE_POLL.
 */
static const struct uart_ops serial8250_pops = {
        .tx_empty        = serial8250_tx_empty,
        .set_mctrl        = serial8250_set_mctrl,
        .get_mctrl        = serial8250_get_mctrl,
        .stop_tx        = serial8250_stop_tx,
        .start_tx        = serial8250_start_tx,
        .throttle        = serial8250_throttle,
        .unthrottle        = serial8250_unthrottle,
        .stop_rx        = serial8250_stop_rx,
        .enable_ms        = serial8250_enable_ms,
        .break_ctl        = serial8250_break_ctl,
        .startup        = serial8250_startup,
        .shutdown        = serial8250_shutdown,
        .set_termios        = serial8250_set_termios,
        .set_ldisc        = serial8250_set_ldisc,
        .pm                = serial8250_pm,
        .type                = serial8250_type,
        .release_port        = serial8250_release_port,
        .request_port        = serial8250_request_port,
        .config_port        = serial8250_config_port,
        .verify_port        = serial8250_verify_port,
#ifdef CONFIG_CONSOLE_POLL
        .poll_get_char = serial8250_get_poll_char,
        .poll_put_char = serial8250_put_poll_char,
#endif
};

/*
 * Put the generic part of an 8250 port into its initial state: lock
 * initialised, controller id 0, no pm callback, the 8250 uart_ops installed
 * and has_sysrq following CONFIG_SERIAL_8250_CONSOLE.
 */
void serial8250_init_port(struct uart_8250_port *up)
{
        spin_lock_init(&up->port.lock);

        up->port.ops = &serial8250_pops;
        up->port.pm = NULL;
        up->port.ctrl_id = 0;
        up->port.has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE);

        /* NOTE(review): 0xFF presumably means "no iotype set up yet" - confirm. */
        up->cur_iotype = 0xFF;
}
EXPORT_SYMBOL_GPL(serial8250_init_port);

void serial8250_set_defaults(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;

        if (up->port.flags & UPF_FIXED_TYPE) {
                unsigned int type = up->port.type;

                if (!up->port.fifosize)
                        up->port.fifosize = uart_config[type].fifo_size;
                if (!up->tx_loadsz)
                        up->tx_loadsz = uart_config[type].tx_loadsz;
                if (!up->capabilities)
                        up->capabilities = uart_config[type].flags;
        }

        set_io_from_upio(port);

        /* default dma handlers */
        if (up->dma) {
                if (!up->dma->tx_dma)
                        up->dma->tx_dma = serial8250_tx_dma;
                if (!up->dma->rx_dma)
                        up->dma->rx_dma = serial8250_rx_dma;
        }
}
EXPORT_SYMBOL_GPL(serial8250_set_defaults);

#ifdef CONFIG_SERIAL_8250_CONSOLE

/*
 * Per-character console output callback: wait on UART_LSR_THRE, then write
 * @ch to the transmit register.
 */
static void serial8250_console_putchar(struct uart_port *port, unsigned char ch)
{
        wait_for_xmitr(up_to_u8250p(port), UART_LSR_THRE);
        serial_port_out(port, UART_TX, ch);
}

/*
 * Restore the serial console when a hardware power-off has been detected.
 *
 * The line settings are taken from the console's saved cflag/ispeed/ospeed;
 * if the saved cflag is zero and a tty is attached, the tty's termios values
 * are used instead.  The baud divisor is then reprogrammed, LCR is rewritten
 * from the cached up->lcr and MCR from up->mcr with the DTR and RTS bits set.
 */
static void serial8250_console_restore(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;
        struct ktermios termios;
        unsigned int baud, quot, frac = 0;

        /* Only these three termios fields are filled in before use. */
        termios.c_cflag = port->cons->cflag;
        termios.c_ispeed = port->cons->ispeed;
        termios.c_ospeed = port->cons->ospeed;
        if (port->state->port.tty && termios.c_cflag == 0) {
                termios.c_cflag = port->state->port.tty->termios.c_cflag;
                termios.c_ispeed = port->state->port.tty->termios.c_ispeed;
                termios.c_ospeed = port->state->port.tty->termios.c_ospeed;
        }

        baud = serial8250_get_baud_rate(port, &termios, NULL);
        quot = serial8250_get_divisor(port, baud, &frac);

        serial8250_set_divisor(port, baud, quot, frac);
        serial_port_out(port, UART_LCR, up->lcr);
        serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS);
}

/*
 * Print a string to the serial port using the device FIFO.
 *
 * Output is sent in bursts of at most up->tx_loadsz register writes; before
 * each burst the function waits on UART_LSR_THRE.  Every '\n' in the input
 * is preceded by a '\r', and that extra write counts towards the burst.
 */
static void serial8250_console_fifo_write(struct uart_8250_port *up,
                                          const char *s, unsigned int count)
{
        const char *const last = s + count;
        const unsigned int burst = up->tx_loadsz;
        bool cr_sent = false;
        unsigned int written;

        while (s != last) {
                wait_for_lsr(up, UART_LSR_THRE);

                for (written = 0; written < burst && s != last; written++) {
                        /* A newline still waiting for its carriage return? */
                        bool need_cr = (*s == '\n') && !cr_sent;

                        /* The input pointer only advances on a real data byte. */
                        serial_out(up, UART_TX, need_cr ? '\r' : *s++);
                        cr_sent = need_cr;
                }
        }
}

/*
 * Print a string to the serial port trying not to disturb
 * any possible real use of the port...
 *
 * The console_lock must be held when we get here.
 *
 * Doing runtime PM is really a bad idea for the kernel console.
 * Thus, we assume the function is called when device is powered up.
 *
 * Outline: take the port lock (only a trylock while an oops is in
 * progress), save and clear IER, restore the port if the scratch register
 * canary no longer matches, write the string via the FIFO path or one
 * character at a time, wait for the transmitter to drain, then restore IER
 * and process any saved modem status.
 */
void serial8250_console_write(struct uart_8250_port *up, const char *s,
                              unsigned int count)
{
        struct uart_8250_em485 *em485 = up->em485;
        struct uart_port *port = &up->port;
        unsigned long flags;
        unsigned int ier, use_fifo;
        int locked = 1;

        touch_nmi_watchdog();

        /*
         * During an oops only try the lock; output goes ahead even if it
         * could not be taken, and "locked" records whether to unlock later.
         */
        if (oops_in_progress)
                locked = uart_port_trylock_irqsave(port, &flags);
        else
                uart_port_lock_irqsave(port, &flags);

        /*
         * First save the IER then disable the interrupts
         */
        ier = serial_port_in(port, UART_IER);
        serial8250_clear_IER(up);

        /* check scratch reg to see if port powered off during system sleep */
        if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) {
                serial8250_console_restore(up);
                up->canary = 0;
        }

        /* RS485 emulation: start the transmitter if stopped, then wait. */
        if (em485) {
                if (em485->tx_stopped)
                        up->rs485_start_tx(up);
                mdelay(port->rs485.delay_rts_before_send);
        }

        use_fifo = (up->capabilities & UART_CAP_FIFO) &&
                /*
                 * BCM283x requires to check the fifo
                 * after each byte.
                 */
                !(up->capabilities & UART_CAP_MINI) &&
                /*
                 * tx_loadsz contains the transmit fifo size
                 */
                up->tx_loadsz > 1 &&
                (up->fcr & UART_FCR_ENABLE_FIFO) &&
                port->state &&
                test_bit(TTY_PORT_INITIALIZED, &port->state->port.iflags) &&
                /*
                 * After we put a data in the fifo, the controller will send
                 * it regardless of the CTS state. Therefore, only use fifo
                 * if we don't use control flow.
                 */
                !(up->port.flags & UPF_CONS_FLOW);

        if (likely(use_fifo))
                serial8250_console_fifo_write(up, s, count);
        else
                uart_console_write(port, s, count, serial8250_console_putchar);

        /*
         * Finally, wait for transmitter to become empty
         * and restore the IER
         */
        wait_for_xmitr(up, UART_LSR_BOTH_EMPTY);

        /* RS485 emulation: wait, then stop the transmitter if tx_stopped is set. */
        if (em485) {
                mdelay(port->rs485.delay_rts_after_send);
                if (em485->tx_stopped)
                        up->rs485_stop_tx(up);
        }

        serial_port_out(port, UART_IER, ier);

        /*
         * The receive handling will happen properly because the
         * receive ready bit will still be set; it is not cleared
         * on read.  However, modem control will not, we must
         * call it if we have saved something in the saved flags
         * while processing with interrupts off.
         */
        if (up->msr_saved_flags)
                serial8250_modem_status(up);

        /* Unlock only if the lock was actually taken above. */
        if (locked)
                uart_port_unlock_irqrestore(port, flags);
}

/*
 * Work out the baud rate the UART is currently programmed for.
 *
 * LCR is rewritten with UART_LCR_DLAB set so the divisor latch bytes
 * (DLL/DLM) can be read, then restored to its previous value.  The baud
 * rate is the input clock divided by 16 and by the latch value.
 *
 * A latch value of zero (for example from a UART nobody has programmed
 * yet) would make that division trap, so 9600 baud is returned instead.
 */
static unsigned int probe_baud(struct uart_port *port)
{
        unsigned char lcr, dll, dlm;
        unsigned int quot;

        lcr = serial_port_in(port, UART_LCR);
        serial_port_out(port, UART_LCR, lcr | UART_LCR_DLAB);
        dll = serial_port_in(port, UART_DLL);
        dlm = serial_port_in(port, UART_DLM);
        serial_port_out(port, UART_LCR, lcr);

        quot = (dlm << 8) | dll;
        if (!quot)
                return 9600;

        return (port->uartclk / 16) / quot;
}

/*
 * Set up an 8250 port for console use.
 *
 * Line settings default to 9600 baud, 8 bits, no parity, no flow control.
 * They are overridden by @options when given; otherwise, if @probe is set,
 * the baud rate is read back from the hardware.  On success a runtime PM
 * reference is taken on port->dev when the port has a device.
 *
 * Returns -ENODEV when the port has neither an I/O base nor a mapped
 * register base, or the result of uart_set_options().
 */
int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
{
        int baud = 9600, bits = 8;
        int parity = 'n', flow = 'n';
        int err;

        if (!(port->iobase || port->membase))
                return -ENODEV;

        if (options)
                uart_parse_options(options, &baud, &parity, &bits, &flow);
        else if (probe)
                baud = probe_baud(port);

        err = uart_set_options(port, port->cons, baud, parity, bits, flow);
        if (!err && port->dev)
                pm_runtime_get_sync(port->dev);

        return err;
}

/*
 * Counterpart of serial8250_console_setup(): drop the runtime PM reference
 * on port->dev, if the port has a device.  Always returns 0.
 */
int serial8250_console_exit(struct uart_port *port)
{
        if (port->dev)
                pm_runtime_put_sync(port->dev);

        return 0;
}

#endif /* CONFIG_SERIAL_8250_CONSOLE */

MODULE_LICENSE("GPL");





























    1 












    1 
    1 



















    1 
    1 





    1 




































    1 




    1 
    1 































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
// SPDX-License-Identifier: GPL-2.0-or-later
/* Filesystem parameter parser.
 *
 * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/namei.h>
#include "internal.h"

/*
 * Spellings accepted by fs_param_is_bool() and the boolean value each maps
 * to.  The empty entry terminates the table.
 */
static const struct constant_table bool_names[] = {
        { "0",                false },
        { "1",                true },
        { "false",        false },
        { "no",                false },
        { "true",        true },
        { "yes",        true },
        { },
};

/*
 * Walk @tbl up to its NULL-name terminator and return the first entry whose
 * name equals @name, or NULL if there is none.
 */
static const struct constant_table *
__lookup_constant(const struct constant_table *tbl, const char *name)
{
        const struct constant_table *entry = tbl;

        while (entry->name) {
                if (!strcmp(name, entry->name))
                        return entry;
                entry++;
        }

        return NULL;
}

/**
 * lookup_constant - Look up a constant by name in an ordered table
 * @tbl: The table of constants to search.
 * @name: The name to look up.
 * @not_found: The value to return if the name is not found.
 *
 * Return: the value of the matching table entry, or @not_found.
 */
int lookup_constant(const struct constant_table *tbl, const char *name, int not_found)
{
        const struct constant_table *hit = __lookup_constant(tbl, name);

        if (!hit)
                return not_found;

        return hit->value;
}
EXPORT_SYMBOL(lookup_constant);

/* A parameter spec with no type callback describes a value-less flag. */
static inline bool is_flag(const struct fs_parameter_spec *p)
{
        return !p->type;
}

/*
 * Find the spec in @desc that matches @param's key.
 *
 * A spec with the same name and the same flag-ness as @param wins outright.
 * Failing that, a flag parameter spelled "no<name>" matches the spec <name>
 * if that spec carries fs_param_neg_with_no; *@negated is then set to true.
 * As a last resort the last same-named spec of the other flag-ness is
 * returned, or NULL if the name is unknown.
 */
static const struct fs_parameter_spec *fs_lookup_key(
        const struct fs_parameter_spec *desc,
        struct fs_parameter *param, bool *negated)
{
        const struct fs_parameter_spec *p, *other = NULL;
        const char *name = param->key;
        bool want_flag = param->type == fs_value_is_flag;

        *negated = false;

        for (p = desc; p->name; p++) {
                if (strcmp(p->name, name) == 0) {
                        if (likely(is_flag(p) == want_flag))
                                return p;
                        other = p;
                }
        }

        /* Only a flag with a non-empty name after a "no" prefix can be negated. */
        if (!want_flag || name[0] != 'n' || name[1] != 'o' || !name[2])
                return other;

        for (p = desc; p->name; p++) {
                if (strcmp(p->name, name + 2) == 0 &&
                    (p->flags & fs_param_neg_with_no)) {
                        *negated = true;
                        return p;
                }
        }

        return other;
}

/*
 * __fs_parse - Parse a filesystem configuration parameter
 * @log: The filesystem context to log errors through.
 * @desc: The parameter description to use.
 * @param: The parameter.
 * @result: Where to place the result of the parse
 *
 * Look @param up in @desc and attempt the conversion requested by the
 * matching spec.  @result->uint_64 is zeroed first and @result->negated
 * records whether a flag was given in its "no"-prefixed form.  For a flag
 * spec, @result->boolean is set to the opposite of @result->negated; for
 * any other spec the spec's type callback fills in @result.
 *
 * A warning is logged if the matching spec is marked deprecated.
 *
 * Returns the spec's option number if the parameter was matched and
 * converted, -ENOPARAM if no spec matched, the error from inval_plog() if a
 * value was supplied for a flag, or the non-zero result of the type
 * callback if the conversion failed.
 */
int __fs_parse(struct p_log *log,
             const struct fs_parameter_spec *desc,
             struct fs_parameter *param,
             struct fs_parse_result *result)
{
        const struct fs_parameter_spec *spec;

        result->uint_64 = 0;

        spec = fs_lookup_key(desc, param, &result->negated);
        if (!spec)
                return -ENOPARAM;

        if (spec->flags & fs_param_deprecated)
                warn_plog(log, "Deprecated parameter '%s'", param->key);

        /* Try to turn the type we were given into the type desired by the
         * parameter and give an error if we can't.
         */
        if (!is_flag(spec)) {
                int err = spec->type(log, spec, param, result);

                if (err)
                        return err;
                return spec->opt;
        }

        if (param->type != fs_value_is_flag)
                return inval_plog(log, "Unexpected value for '%s'",
                              param->key);

        result->boolean = !result->negated;
        return spec->opt;
}
EXPORT_SYMBOL(__fs_parse);

/**
 * fs_lookup_param - Look up a path referred to by a parameter
 * @fc: The filesystem context to log errors through.
 * @param: The parameter.
 * @want_bdev: T if want a blockdev
 * @flags: Pathwalk flags passed to filename_lookup()
 * @_path: The result of the lookup
 *
 * String parameters are turned into a filename with getname_kernel(), which
 * is put again before returning; filename parameters are used as they are.
 * Any other parameter type is rejected via invalf().
 *
 * Return: the result of filename_lookup(), or -ENOTBLK if @want_bdev is set
 * and the path found is not a block device (in which case @_path is put and
 * cleared).
 */
int fs_lookup_param(struct fs_context *fc,
                    struct fs_parameter *param,
                    bool want_bdev,
                    unsigned int flags,
                    struct path *_path)
{
        struct filename *fname;
        bool owns_fname;
        int err;

        switch (param->type) {
        case fs_value_is_filename:
                fname = param->name;
                owns_fname = false;
                break;
        case fs_value_is_string:
                fname = getname_kernel(param->string);
                if (IS_ERR(fname))
                        return PTR_ERR(fname);
                owns_fname = true;
                break;
        default:
                return invalf(fc, "%s: not usable as path", param->key);
        }

        err = filename_lookup(param->dirfd, fname, flags, _path, NULL);
        if (err < 0) {
                errorf(fc, "%s: Lookup failure for '%s'", param->key, fname->name);
        } else if (want_bdev &&
                   !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) {
                path_put(_path);
                _path->dentry = NULL;
                _path->mnt = NULL;
                errorf(fc, "%s: Non-blockdev passed as '%s'",
                       param->key, fname->name);
                err = -ENOTBLK;
        }

        /* Only a filename obtained here is ours to put. */
        if (owns_fname)
                putname(fname);
        return err;
}
EXPORT_SYMBOL(fs_lookup_param);

/*
 * Log "Bad value for '<key>'" through @log and return the error code
 * produced by inval_plog().
 */
static int fs_param_bad_value(struct p_log *log, struct fs_parameter *param)
{
        return inval_plog(log, "Bad value for '%s'", param->key);
}

/*
 * Type callback for boolean parameters.
 *
 * Accepts a string value found in bool_names[] and stores the result in
 * @result->boolean.  An empty string is accepted, leaving @result untouched,
 * when the spec has fs_param_can_be_empty.  Returns 0 on success or the
 * error from fs_param_bad_value().
 */
int fs_param_is_bool(struct p_log *log, const struct fs_parameter_spec *p,
                     struct fs_parameter *param, struct fs_parse_result *result)
{
        int val;

        if (param->type != fs_value_is_string)
                return fs_param_bad_value(log, param);

        if (param->string[0] == '\0' && (p->flags & fs_param_can_be_empty))
                return 0;

        val = lookup_constant(bool_names, param->string, -1);
        if (val != -1) {
                result->boolean = val;
                return 0;
        }

        return fs_param_bad_value(log, param);
}
EXPORT_SYMBOL(fs_param_is_bool);

/*
 * Type callback for unsigned 32-bit parameters.
 *
 * Parses the string value with kstrtouint() into @result->uint_32, using the
 * spec's data field as the number base.  An empty string is accepted,
 * leaving @result untouched, when the spec has fs_param_can_be_empty.
 * Returns 0 on success or the error from fs_param_bad_value().
 */
int fs_param_is_u32(struct p_log *log, const struct fs_parameter_spec *p,
                    struct fs_parameter *param, struct fs_parse_result *result)
{
        int base = (unsigned long)p->data;

        if (param->type != fs_value_is_string)
                return fs_param_bad_value(log, param);

        if (param->string[0] == '\0' && (p->flags & fs_param_can_be_empty))
                return 0;

        if (kstrtouint(param->string, base, &result->uint_32) >= 0)
                return 0;

        return fs_param_bad_value(log, param);
}
EXPORT_SYMBOL(fs_param_is_u32);

/*
 * Type callback for signed 32-bit parameters.
 *
 * Parses the string value with kstrtoint() (base 0) into @result->int_32.
 * An empty string is accepted, leaving @result untouched, when the spec has
 * fs_param_can_be_empty.  Returns 0 on success or the error from
 * fs_param_bad_value().
 */
int fs_param_is_s32(struct p_log *log, const struct fs_parameter_spec *p,
                    struct fs_parameter *param, struct fs_parse_result *result)
{
        if (param->type != fs_value_is_string)
                return fs_param_bad_value(log, param);

        if (param->string[0] == '\0' && (p->flags & fs_param_can_be_empty))
                return 0;

        if (kstrtoint(param->string, 0, &result->int_32) >= 0)
                return 0;

        return fs_param_bad_value(log, param);
}
EXPORT_SYMBOL(fs_param_is_s32);

/*
 * Type callback for unsigned 64-bit parameters.
 *
 * Parses the string value with kstrtoull() (base 0) into @result->uint_64.
 * An empty string is accepted, leaving @result untouched, when the spec has
 * fs_param_can_be_empty.  Returns 0 on success or the error from
 * fs_param_bad_value().
 */
int fs_param_is_u64(struct p_log *log, const struct fs_parameter_spec *p,
                    struct fs_parameter *param, struct fs_parse_result *result)
{
        if (param->type != fs_value_is_string)
                return fs_param_bad_value(log, param);

        if (param->string[0] == '\0' && (p->flags & fs_param_can_be_empty))
                return 0;

        if (kstrtoull(param->string, 0, &result->uint_64) >= 0)
                return 0;

        return fs_param_bad_value(log, param);
}
EXPORT_SYMBOL(fs_param_is_u64);

/*
 * Type callback for enumerated parameters.
 *
 * Looks the string value up in the constant table held in the spec's data
 * field and stores the matching value in @result->uint_32.  An empty string
 * is accepted, leaving @result untouched, when the spec has
 * fs_param_can_be_empty.  Returns 0 on success or the error from
 * fs_param_bad_value().
 */
int fs_param_is_enum(struct p_log *log, const struct fs_parameter_spec *p,
                     struct fs_parameter *param, struct fs_parse_result *result)
{
        const struct constant_table *match;

        if (param->type != fs_value_is_string)
                return fs_param_bad_value(log, param);

        if (param->string[0] == '\0' && (p->flags & fs_param_can_be_empty))
                return 0;

        match = __lookup_constant(p->data, param->string);
        if (match) {
                result->uint_32 = match->value;
                return 0;
        }

        return fs_param_bad_value(log, param);
}
EXPORT_SYMBOL(fs_param_is_enum);

/*
 * Type callback for string parameters.
 *
 * Accepts any string value; an empty string only when the spec has
 * fs_param_can_be_empty.  Nothing is written to @result.  Returns 0 on
 * success or the error from fs_param_bad_value().
 */
int fs_param_is_string(struct p_log *log, const struct fs_parameter_spec *p,
                       struct fs_parameter *param, struct fs_parse_result *result)
{
        if (param->type != fs_value_is_string)
                return fs_param_bad_value(log, param);

        if (*param->string || (p->flags & fs_param_can_be_empty))
                return 0;

        return fs_param_bad_value(log, param);
}
EXPORT_SYMBOL(fs_param_is_string);

/*
 * Type callback for blob parameters: accepts the parameter only if it
 * carries a blob value.  Nothing is written to @result.
 */
int fs_param_is_blob(struct p_log *log, const struct fs_parameter_spec *p,
                     struct fs_parameter *param, struct fs_parse_result *result)
{
        if (param->type == fs_value_is_blob)
                return 0;

        return fs_param_bad_value(log, param);
}
EXPORT_SYMBOL(fs_param_is_blob);

/*
 * Type callback for file descriptor parameters.
 *
 * The descriptor may be given as a string holding a number, or as a file
 * value, in which case param->dirfd is used.  Either way the number ends up
 * in @result->uint_32 and must not exceed INT_MAX.  Returns 0 on success or
 * the error from fs_param_bad_value().
 */
int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p,
                  struct fs_parameter *param, struct fs_parse_result *result)
{
        if (param->type == fs_value_is_string) {
                /*
                 * Reject an empty string unless fs_param_can_be_empty is
                 * set; anything that gets past that check must still parse
                 * as an unsigned int no larger than INT_MAX.
                 */
                if (*param->string || (p->flags & fs_param_can_be_empty)) {
                        if (kstrtouint(param->string, 0, &result->uint_32) >= 0 &&
                            result->uint_32 <= INT_MAX)
                                return 0;
                }
        } else if (param->type == fs_value_is_file) {
                result->uint_32 = param->dirfd;
                if (result->uint_32 <= INT_MAX)
                        return 0;
        }

        return fs_param_bad_value(log, param);
}
EXPORT_SYMBOL(fs_param_is_fd);

/*
 * Type callback for block device parameters.  Performs no validation or
 * conversion here: it always returns 0 and leaves @result untouched.
 */
int fs_param_is_blockdev(struct p_log *log, const struct fs_parameter_spec *p,
                  struct fs_parameter *param, struct fs_parse_result *result)
{
        return 0;
}
EXPORT_SYMBOL(fs_param_is_blockdev);

/*
 * Type callback for path parameters.  Performs no validation or conversion
 * here: it always returns 0 and leaves @result untouched.
 */
int fs_param_is_path(struct p_log *log, const struct fs_parameter_spec *p,
                     struct fs_parameter *param, struct fs_parse_result *result)
{
        return 0;
}
EXPORT_SYMBOL(fs_param_is_path);

#ifdef CONFIG_VALIDATE_FS_PARSER
/**
 * validate_constant_table - Validate a constant table
 * @tbl: The constant table to validate.
 * @tbl_size: The size of the table.
 * @low: The lowest permissible value.
 * @high: The highest permissible value.
 * @special: One special permissible value outside of the range.
 *
 * Every entry must have a name, names must be strictly ascending in strcmp()
 * order, and each value must lie in [@low, @high] or equal @special.  Each
 * violation is logged.  An empty table only draws a warning.
 *
 * Return: true if no violation was found, false otherwise.
 */
bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
                             int low, int high, int special)
{
        bool ok = true;
        size_t idx;

        if (!tbl_size) {
                pr_warn("VALIDATE C-TBL: Empty\n");
                return true;
        }

        for (idx = 0; idx < tbl_size; idx++) {
                const struct constant_table *cur = &tbl[idx];
                bool in_range = cur->value >= low && cur->value <= high;

                if (!cur->name) {
                        pr_err("VALIDATE C-TBL[%zu]: Null\n", idx);
                        ok = false;
                } else if (idx > 0 && tbl[idx - 1].name) {
                        /* Compare against the previous entry to check ordering. */
                        const char *prev_name = tbl[idx - 1].name;
                        int order = strcmp(prev_name, cur->name);

                        if (order == 0) {
                                pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n",
                                       idx, cur->name);
                                ok = false;
                        } else if (order > 0) {
                                pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n",
                                       idx, prev_name, cur->name);
                                ok = false;
                        }
                }

                if (!in_range && cur->value != special) {
                        pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n",
                               idx, cur->name, cur->value, low, high);
                        ok = false;
                }
        }

        return ok;
}

/**
 * fs_validate_description - Validate a parameter description
 * @name: The parameter name to search for.
 * @desc: The parameter description to validate.
 *
 * Logs an error for every spec that shares its name with an earlier spec of
 * the same flag-ness; @name is only used as a prefix in those messages.  A
 * flag and a non-flag spec may share a name.
 *
 * Return: true if no duplicate was found, false otherwise.
 */
bool fs_validate_description(const char *name,
        const struct fs_parameter_spec *desc)
{
        const struct fs_parameter_spec *cur, *earlier;
        bool ok = true;

        for (cur = desc; cur->name; cur++) {
                /* Check for duplicate parameter names */
                for (earlier = desc; earlier < cur; earlier++) {
                        if (strcmp(cur->name, earlier->name) != 0)
                                continue;
                        if (is_flag(cur) != is_flag(earlier))
                                continue;

                        pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n",
                               name, cur->name);
                        ok = false;
                }
        }

        return ok;
}
#endif /* CONFIG_VALIDATE_FS_PARSER */










































    1 






    1 






















    1 






    1 
    1 




















































    1 


    1 


    1 











    1 
















    1 






    1 







































    1 





    1 


    1 








    1 


    1 


    1 





























    1 








    1 




















































































































































































































































































































    1 



    1 

    1 






















































































































































































    1 






    1 
    1 

















    1 













































































































    1 





    1 







    1 
    1 







































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
#include "qgroup.h"
#include "subpage.h"
#include "file.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

/*
 * Return the exclusive end offset of @entry, saturated to (u64)-1 when
 * file_offset + num_bytes wraps around.
 */
static u64 entry_end(struct btrfs_ordered_extent *entry)
{
        const u64 end = entry->file_offset + entry->num_bytes;

        /* A sum smaller than the start means the addition overflowed. */
        return (end < entry->file_offset) ? (u64)-1 : end;
}

/*
 * Link @node into @root, keyed by @file_offset.
 *
 * Returns NULL when the node was inserted.  If an existing entry already
 * covers @file_offset, nothing is inserted and that entry's node is returned.
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
                                   struct rb_node *node)
{
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct btrfs_ordered_extent *cur;

                parent = *link;
                cur = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

                if (file_offset < cur->file_offset) {
                        link = &parent->rb_left;
                        continue;
                }
                if (file_offset >= entry_end(cur)) {
                        link = &parent->rb_right;
                        continue;
                }

                /* @file_offset lands inside an entry that is already there. */
                return parent;
        }

        rb_link_node(node, parent, link);
        rb_insert_color(node, root);
        return NULL;
}

/*
 * Look up the ordered extent whose range contains @file_offset.
 *
 * @root:        rb-tree of ordered extents to search.
 * @file_offset: Offset to look for.
 * @prev_ret:    Optional.  On a miss, receives a neighbouring node (see
 *               below), or NULL if the tree is empty.
 *
 * Returns the node whose [file_offset, entry_end()) range contains
 * @file_offset, or NULL if there is none.  On a miss with a non-NULL
 * @prev_ret, the last node visited during the descent is adjusted: first
 * forward over successors that end at or before @file_offset, then backward
 * while the candidate ends after @file_offset (stopping early when there is
 * no predecessor left).  @prev_ret is not written when a match is found.
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
                                     struct rb_node **prev_ret)
{
        struct rb_node *n = root->rb_node;
        struct rb_node *prev = NULL;
        struct rb_node *test;
        struct btrfs_ordered_extent *entry;
        struct btrfs_ordered_extent *prev_entry = NULL;

        /* Plain binary-search descent, remembering the last node visited. */
        while (n) {
                entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
                prev = n;
                prev_entry = entry;

                if (file_offset < entry->file_offset)
                        n = n->rb_left;
                else if (file_offset >= entry_end(entry))
                        n = n->rb_right;
                else
                        return n;
        }
        /* Miss, and the caller is not interested in a neighbour. */
        if (!prev_ret)
                return NULL;

        /*
         * Walk forward while the candidate ends at or before @file_offset,
         * but do not step onto a successor that ends after @file_offset.
         */
        while (prev && file_offset >= entry_end(prev_entry)) {
                test = rb_next(prev);
                if (!test)
                        break;
                prev_entry = rb_entry(test, struct btrfs_ordered_extent,
                                      rb_node);
                if (file_offset < entry_end(prev_entry))
                        break;

                prev = test;
        }
        /* prev_entry may point at a rejected successor; resync it with prev. */
        if (prev)
                prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
                                      rb_node);
        /* Walk backward while the candidate ends after @file_offset. */
        while (prev && file_offset < entry_end(prev_entry)) {
                test = rb_prev(prev);
                if (!test)
                        break;
                prev_entry = rb_entry(test, struct btrfs_ordered_extent,
                                      rb_node);
                prev = test;
        }
        *prev_ret = prev;
        return NULL;
}

/*
 * Return 1 if [file_offset, file_offset + len) intersects the file range
 * covered by @entry, 0 otherwise.
 */
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
                          u64 len)
{
        /* The queried range ends at or before the entry starts. */
        if (file_offset + len <= entry->file_offset)
                return 0;

        /* The entry ends at or before the queried range starts. */
        if (entry->file_offset + entry->num_bytes <= file_offset)
                return 0;

        return 1;
}

/*
 * Find the ordered extent node that contains @file_offset.  When no entry
 * contains it, fall back to the neighbouring node reported by
 * __tree_search().  A non-NULL result is remembered in
 * inode->ordered_tree_last and reused when a later lookup hits the same entry.
 */
static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode,
                                                  u64 file_offset)
{
        struct rb_node *cached = inode->ordered_tree_last;
        struct rb_node *neighbour = NULL;
        struct rb_node *found;

        /* Fast path: the previously returned node still covers the offset. */
        if (cached) {
                struct btrfs_ordered_extent *oe;

                oe = rb_entry(cached, struct btrfs_ordered_extent, rb_node);
                if (in_range(file_offset, oe->file_offset, oe->num_bytes))
                        return cached;
        }

        found = __tree_search(&inode->ordered_tree, file_offset, &neighbour);
        if (!found)
                found = neighbour;
        if (found)
                inode->ordered_tree_last = found;
        return found;
}

/*
 * Allocate and initialise an ordered extent for @inode.  The entry is not
 * inserted into any tree or list here.
 *
 * The qgroup data reservation of the range is handled first: it is freed for
 * NOCOW/PREALLOC extents and released for everything else; the amount
 * reported by either call is stored in the entry's qgroup_rsv.  The new entry
 * starts with a single reference, stores the result of igrab() on the VFS
 * inode, and the inode's outstanding extent count is bumped by one.
 *
 * Return: the new entry, or an ERR_PTR() carrying the qgroup error or -ENOMEM.
 */
static struct btrfs_ordered_extent *alloc_ordered_extent(
                        struct btrfs_inode *inode, u64 file_offset, u64 num_bytes,
                        u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes,
                        u64 offset, unsigned long flags, int compress_type)
{
        const unsigned long nocow_mask = (1 << BTRFS_ORDERED_NOCOW) |
                                         (1 << BTRFS_ORDERED_PREALLOC);
        struct btrfs_ordered_extent *oe;
        u64 reserved = 0;
        int err;

        /*
         * A nocow write can give its qgroup reservation back right now.  Any
         * other write releases it and passes the reserved number along so
         * that qgroup_record can free it.
         */
        if (flags & nocow_mask)
                err = btrfs_qgroup_free_data(inode, NULL, file_offset,
                                             num_bytes, &reserved);
        else
                err = btrfs_qgroup_release_data(inode, file_offset, num_bytes,
                                                &reserved);
        if (err < 0)
                return ERR_PTR(err);

        oe = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
        if (!oe)
                return ERR_PTR(-ENOMEM);

        /* Extent geometry and attributes. */
        oe->file_offset = file_offset;
        oe->num_bytes = num_bytes;
        oe->bytes_left = num_bytes;
        oe->ram_bytes = ram_bytes;
        oe->disk_bytenr = disk_bytenr;
        oe->disk_num_bytes = disk_num_bytes;
        oe->offset = offset;
        oe->compress_type = compress_type;
        oe->truncated_len = (u64)-1;
        oe->qgroup_rsv = reserved;
        oe->flags = flags;
        oe->inode = igrab(&inode->vfs_inode);

        /* Reference count, wait machinery and list linkage. */
        refcount_set(&oe->refs, 1);
        init_waitqueue_head(&oe->wait);
        init_completion(&oe->completion);
        INIT_LIST_HEAD(&oe->list);
        INIT_LIST_HEAD(&oe->log_list);
        INIT_LIST_HEAD(&oe->root_extent_list);
        INIT_LIST_HEAD(&oe->work_list);
        INIT_LIST_HEAD(&oe->bioc_list);

        /*
         * count_max_extents is not needed here: higher layers are assumed to
         * have done that work already, so this is the smallest the extent is
         * going to get.
         */
        spin_lock(&inode->lock);
        btrfs_mod_outstanding_extents(inode, 1);
        spin_unlock(&inode->lock);

        return oe;
}

/*
 * Publish a freshly allocated ordered extent: account its bytes in
 * fs_info->ordered_bytes, take a reference for the tree, link it into the
 * inode's ordered tree and append it to the root's list of ordered extents.
 * The root is added to the fs-wide ordered_roots list when this is its first
 * ordered extent.  An overlap with an existing tree entry is fatal.
 */
static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
{
        struct btrfs_inode *binode = BTRFS_I(entry->inode);
        struct btrfs_root *root = binode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct rb_node *clash;

        trace_btrfs_ordered_extent_add(binode, entry);

        percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes,
                                 fs_info->delalloc_batch);

        /* The tree holds its own reference on the entry. */
        refcount_inc(&entry->refs);

        spin_lock_irq(&binode->ordered_tree_lock);
        clash = tree_insert(&binode->ordered_tree, entry->file_offset,
                            &entry->rb_node);
        if (clash)
                btrfs_panic(fs_info, -EEXIST,
                                "inconsistency in ordered tree at offset %llu",
                                entry->file_offset);
        spin_unlock_irq(&binode->ordered_tree_lock);

        spin_lock(&root->ordered_extent_lock);
        list_add_tail(&entry->root_extent_list, &root->ordered_extents);
        if (++root->nr_ordered_extents == 1) {
                /* First ordered extent of this root: track the root too. */
                spin_lock(&fs_info->ordered_root_lock);
                BUG_ON(!list_empty(&root->ordered_root));
                list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
                spin_unlock(&fs_info->ordered_root_lock);
        }
        spin_unlock(&root->ordered_extent_lock);
}

/*
 * Allocate an ordered extent and add it to the per-inode tree.
 *
 * @inode:           Inode the extent belongs to.
 * @file_offset:     Logical file offset at which the extent starts.
 * @num_bytes:       Logical length of the extent in the file.
 * @ram_bytes:       Full length of the unencoded data.
 * @disk_bytenr:     Offset of the extent on disk.
 * @disk_num_bytes:  Size of the extent on disk.
 * @offset:          Offset into the unencoded data where file data starts.
 * @flags:           Extent type flags (1 << BTRFS_ORDERED_*); only bits in
 *                   BTRFS_ORDERED_TYPE_FLAGS are allowed.
 * @compress_type:   Compression algorithm used for the data.
 *
 * Most parameters mirror &struct btrfs_file_extent_item.  On success the
 * extent carries two references: one owned by the tree and one owned by the
 * caller through the returned pointer.
 *
 * Return: the new ordered extent, or an error pointer.
 */
struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
                        struct btrfs_inode *inode, u64 file_offset,
                        u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
                        u64 disk_num_bytes, u64 offset, unsigned long flags,
                        int compress_type)
{
        struct btrfs_ordered_extent *oe;

        ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);

        oe = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes,
                                  disk_bytenr, disk_num_bytes, offset, flags,
                                  compress_type);
        if (IS_ERR(oe))
                return oe;

        insert_ordered_extent(oe);
        return oe;
}

/*
 * Queue a struct btrfs_ordered_sum on the ordered extent's list of checksums,
 * to be inserted when the ordered extent is finished.  The list is modified
 * under the inode's ordered_tree_lock with interrupts disabled.
 */
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
                           struct btrfs_ordered_sum *sum)
{
        struct btrfs_inode *binode = BTRFS_I(entry->inode);

        spin_lock_irq(&binode->ordered_tree_lock);
        list_add_tail(&sum->list, &entry->list);
        spin_unlock_irq(&binode->ordered_tree_lock);
}

/*
 * Flag @ordered as having hit an IO error.  The first caller to set
 * BTRFS_ORDERED_IOERR also records -EIO on the inode's mapping; later callers
 * do nothing.
 */
void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
{
        /* Already flagged: the mapping error has been recorded before. */
        if (test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
                return;

        mapping_set_error(ordered->inode->i_mapping, -EIO);
}

/*
 * Work callback: recover the ordered extent that embeds @work and run its
 * completion through btrfs_finish_ordered_io().
 */
static void finish_ordered_fn(struct btrfs_work *work)
{
        struct btrfs_ordered_extent *oe =
                container_of(work, struct btrfs_ordered_extent, work);

        btrfs_finish_ordered_io(oe);
}

/*
 * Account @len finished bytes at @file_offset against @ordered and decide
 * whether its completion has to be queued.
 *
 * The caller must hold the inode's ordered_tree_lock.  With a non-NULL @page
 * the range must lie inside that page and still carry the Ordered bit,
 * otherwise nothing is accounted and false is returned; the bit is cleared
 * for the range.  A failed IO (@uptodate false) sets BTRFS_ORDERED_IOERR.
 *
 * Return: true when bytes_left reached zero; in that case
 * BTRFS_ORDERED_IO_DONE is set, waiters are woken and an extra reference is
 * taken on @ordered.  false otherwise.
 */
static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
                                      struct page *page, u64 file_offset,
                                      u64 len, bool uptodate)
{
        struct btrfs_inode *binode = BTRFS_I(ordered->inode);
        struct btrfs_fs_info *fs_info = binode->root->fs_info;

        lockdep_assert_held(&binode->ordered_tree_lock);

        if (page) {
                ASSERT(page->mapping);
                ASSERT(page_offset(page) <= file_offset);
                ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE);

                /*
                 * The Ordered (Private2) bit tells whether IO is still pending
                 * for the ordered extent in this range.  Without the bit there
                 * is nothing to account here and the range is skipped.
                 */
                if (!btrfs_folio_test_ordered(fs_info, page_folio(page),
                                              file_offset, len))
                        return false;
                btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len);
        }

        /* Safe to update the accounting from here on. */
        if (!WARN_ON_ONCE(len > ordered->bytes_left)) {
                ordered->bytes_left -= len;
        } else {
                btrfs_crit(fs_info,
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
                           btrfs_root_id(binode->root), btrfs_ino(binode),
                           ordered->file_offset, ordered->num_bytes,
                           len, ordered->bytes_left);
                ordered->bytes_left = 0;
        }

        if (!uptodate)
                set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);

        /* IO is still outstanding for this ordered extent. */
        if (ordered->bytes_left != 0)
                return false;

        /*
         * Every byte of the ordered extent has completed, so the finish
         * function must be queued by the caller.
         */
        set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags);
        cond_wake_up(&ordered->wait);
        refcount_inc(&ordered->refs);
        trace_btrfs_ordered_extent_mark_finished(binode, ordered);
        return true;
}

/*
 * Queue finish_ordered_fn() for @ordered.  Free space inodes use
 * fs_info->endio_freespace_worker, all other inodes use
 * fs_info->endio_write_workers.
 */
static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
{
        struct btrfs_inode *binode = BTRFS_I(ordered->inode);
        struct btrfs_fs_info *fs_info = binode->root->fs_info;
        struct btrfs_workqueue *wq;

        if (btrfs_is_free_space_inode(binode))
                wq = fs_info->endio_freespace_worker;
        else
                wq = fs_info->endio_write_workers;

        btrfs_init_work(&ordered->work, finish_ordered_fn, NULL);
        btrfs_queue_work(wq, &ordered->work);
}

/*
 * Account finished IO for a known ordered extent and queue its completion
 * work once all of its IO is done.
 *
 * Return: true if the completion work was queued by this call, false
 * otherwise.
 */
bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
                                 struct page *page, u64 file_offset, u64 len,
                                 bool uptodate)
{
        struct btrfs_inode *binode = BTRFS_I(ordered->inode);
        unsigned long irqflags;
        bool finished;

        trace_btrfs_finish_ordered_extent(binode, file_offset, len, uptodate);

        spin_lock_irqsave(&binode->ordered_tree_lock, irqflags);
        finished = can_finish_ordered_extent(ordered, page, file_offset, len,
                                             uptodate);
        spin_unlock_irqrestore(&binode->ordered_tree_lock, irqflags);

        /*
         * For a COW write, new extent maps were created for the range, and
         * they point to unwritten locations if an error happened either
         * before a bio was submitted or during IO.
         *
         * The ordered extent is already marked with BTRFS_ORDERED_IOERR and
         * its completion is queued below.  Completion, at
         * btrfs_finish_one_ordered(), drops the extent maps of the unwritten
         * extents.
         *
         * Completion runs in a work queue though, so a fast fsync may run
         * first.  With direct IO the fsync can start as soon as the inode is
         * unlocked, and completion is queued before that unlock.  With
         * buffered IO completion is queued when writeback finishes
         * (end_bbio_data_write()), so when the writeback was triggered by a
         * fast fsync, that fsync may begin logging before the ordered extent
         * completion has run.
         *
         * A fast fsync logs file extent items from the extent maps it finds
         * and, to keep latency low, does not wait for ordered extents to
         * complete.  If completion has not happened by the time it collects
         * extent maps, it logs file extent items pointing to unwritten
         * extents, which is a corruption if a crash follows and the log tree
         * is replayed.
         *
         * So flag the inode to make the next fast fsync wait for ordered
         * extents to complete before it starts logging.
         */
        if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
                set_bit(BTRFS_INODE_COW_WRITE_ERROR, &binode->runtime_flags);

        if (!finished)
                return false;

        btrfs_queue_ordered_fn(ordered);
        return true;
}

/*
 * Mark all ordered extents io inside the specified range finished.
 *
 * @inode:        Inode whose ordered extent tree is walked.
 * @page:         The involved page for the operation.
 *                 For uncompressed buffered IO, the page status also needs to be
 *                 updated to indicate whether the pending ordered io is finished.
 *                 Can be NULL for direct IO and compressed write.
 *                 For these cases, callers are ensured they won't execute the
 *                 endio function twice.
 * @file_offset:  Start of the finished IO range.
 * @num_bytes:    Length of the finished IO range.
 * @uptodate:     Passed through to can_finish_ordered_extent(), which flags
 *                the ordered extent with an IO error when this is false.
 *
 * This function is called for endio, thus the range must have ordered
 * extent(s) covering it.
 *
 * The range is walked one ordered extent at a time under the inode's
 * ordered_tree_lock.  Gaps not covered by any ordered extent are skipped.
 */
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
                                    struct page *page, u64 file_offset,
                                    u64 num_bytes, bool uptodate)
{
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;
        unsigned long flags;
        /* Cursor: everything in [file_offset, cur) has been handled. */
        u64 cur = file_offset;

        trace_btrfs_writepage_end_io_hook(inode, file_offset,
                                          file_offset + num_bytes - 1,
                                          uptodate);

        spin_lock_irqsave(&inode->ordered_tree_lock, flags);
        while (cur < file_offset + num_bytes) {
                /* Note: this local shadows the entry_end() helper. */
                u64 entry_end;
                u64 end;
                u32 len;

                node = ordered_tree_search(inode, cur);
                /* No ordered extents at all */
                if (!node)
                        break;

                entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
                entry_end = entry->file_offset + entry->num_bytes;
                /*
                 * |<-- OE --->|  |
                 *                  cur
                 * Go to next OE.
                 */
                if (cur >= entry_end) {
                        node = rb_next(node);
                        /* No more ordered extents, exit */
                        if (!node)
                                break;
                        entry = rb_entry(node, struct btrfs_ordered_extent,
                                         rb_node);

                        /* Go to next ordered extent and continue */
                        cur = entry->file_offset;
                        continue;
                }
                /*
                 * |        |<--- OE --->|
                 * cur
                 * Go to the start of OE.
                 */
                if (cur < entry->file_offset) {
                        cur = entry->file_offset;
                        continue;
                }

                /*
                 * Now we are definitely inside one ordered extent.
                 *
                 * |<--- OE --->|
                 *        |
                 *        cur
                 */
                /* Inclusive end: stop at the OE end or the range end. */
                end = min(entry->file_offset + entry->num_bytes,
                          file_offset + num_bytes) - 1;
                /* len is a u32, so the piece must fit in 32 bits. */
                ASSERT(end + 1 - cur < U32_MAX);
                len = end + 1 - cur;

                if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
                        /*
                         * The ordered extent is complete.  Queue its finish
                         * work with the tree lock dropped, then retake the
                         * lock to keep walking.
                         */
                        spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
                        btrfs_queue_ordered_fn(entry);
                        spin_lock_irqsave(&inode->ordered_tree_lock, flags);
                }
                cur += len;
        }
        spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
}

/*
 * Finish IO for one ordered extent across a given range.  The range can only
 * contain one ordered extent.
 *
 * @inode:       Inode the finished IO belongs to.
 * @cached:      Optional in/out pointer.  If *@cached is not NULL on entry,
 *               the tree search is skipped and that ordered extent is used
 *               directly.  When this call finishes the ordered extent,
 *               *@cached is set to it and a reference is taken for the caller.
 * @file_offset: File offset for the finished IO.
 * @io_size:     Length of the finished IO range.
 *
 * Return true if this call brought the ordered extent's bytes_left to zero
 * and was the one to set BTRFS_ORDERED_IO_DONE.
 * Return false otherwise, including when no ordered extent contains
 * @file_offset.
 *
 * NOTE: The range can NOT cross multiple ordered extents.
 * Thus caller should ensure the range doesn't cross ordered extents.
 */
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
                                    struct btrfs_ordered_extent **cached,
                                    u64 file_offset, u64 io_size)
{
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;
        unsigned long flags;
        bool finished = false;

        spin_lock_irqsave(&inode->ordered_tree_lock, flags);
        if (cached && *cached) {
                entry = *cached;
                goto have_entry;
        }

        node = ordered_tree_search(inode, file_offset);
        if (!node)
                goto out;

        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
        if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
                goto out;

        if (io_size > entry->bytes_left) {
                btrfs_crit(inode->root->fs_info,
                           "bad ordered accounting left %llu size %llu",
                       entry->bytes_left, io_size);
                /*
                 * Clamp instead of letting the unsigned counter wrap around,
                 * which would leave the ordered extent unable to ever reach
                 * zero.  This matches can_finish_ordered_extent().
                 */
                entry->bytes_left = 0;
        } else {
                entry->bytes_left -= io_size;
        }

        if (entry->bytes_left == 0) {
                /*
                 * Ensure only one caller can set the flag and report the
                 * ordered extent as finished.
                 */
                finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
                /* test_and_set_bit implies a barrier */
                cond_wake_up_nomb(&entry->wait);
        }
out:
        /* Hand the finished extent back with a reference for the caller. */
        if (finished && cached && entry) {
                *cached = entry;
                refcount_inc(&entry->refs);
                trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
        }
        spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
        return finished;
}

/*
 * Drop one reference on an ordered extent.  When the last reference goes
 * away, the inode reference is handed to btrfs_add_delayed_iput(), every
 * queued checksum is freed and the ordered extent itself is released.
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
        trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);

        /* Other references remain: nothing to tear down yet. */
        if (!refcount_dec_and_test(&entry->refs))
                return;

        ASSERT(list_empty(&entry->root_extent_list));
        ASSERT(list_empty(&entry->log_list));
        ASSERT(RB_EMPTY_NODE(&entry->rb_node));

        if (entry->inode)
                btrfs_add_delayed_iput(BTRFS_I(entry->inode));

        /* Free the checksums still attached to the extent. */
        while (!list_empty(&entry->list)) {
                struct btrfs_ordered_sum *sum;

                sum = list_entry(entry->list.next, struct btrfs_ordered_sum,
                                 list);
                list_del(&sum->list);
                kvfree(sum);
        }

        kmem_cache_free(btrfs_ordered_extent_cache, entry);
}

/*
 * remove an ordered extent from the tree.  No references are dropped
 * and waiters are woken up.
 *
 * @btrfs_inode: Inode that owns the ordered extent tree.
 * @entry:       Ordered extent to remove.
 *
 * Besides unlinking @entry from the inode's rb-tree and from the root's list
 * of ordered extents, this undoes the outstanding-extent and ordered_bytes
 * accounting, releases delalloc metadata (unless the root is the tree root),
 * marks the entry BTRFS_ORDERED_COMPLETE and, if the entry was flagged
 * BTRFS_ORDERED_PENDING, drops the running transaction's pending_ordered
 * count.
 */
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
                                 struct btrfs_ordered_extent *entry)
{
        struct btrfs_root *root = btrfs_inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct rb_node *node;
        bool pending;
        bool freespace_inode;

        /*
         * If this is a free space inode the thread has not acquired the ordered
         * extents lockdep map.
         */
        freespace_inode = btrfs_is_free_space_inode(btrfs_inode);

        btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
        /* This is paired with btrfs_alloc_ordered_extent. */
        spin_lock(&btrfs_inode->lock);
        btrfs_mod_outstanding_extents(btrfs_inode, -1);
        spin_unlock(&btrfs_inode->lock);
        if (root != fs_info->tree_root) {
                u64 release;

                /* Encoded extents release by on-disk size, others by file length. */
                if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
                        release = entry->disk_num_bytes;
                else
                        release = entry->num_bytes;
                btrfs_delalloc_release_metadata(btrfs_inode, release,
                                                test_bit(BTRFS_ORDERED_IOERR,
                                                         &entry->flags));
        }

        /* Undo the ordered_bytes accounting done when the extent was inserted. */
        percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
                                 fs_info->delalloc_batch);

        spin_lock_irq(&btrfs_inode->ordered_tree_lock);
        node = &entry->rb_node;
        rb_erase(node, &btrfs_inode->ordered_tree);
        RB_CLEAR_NODE(node);
        /* Do not leave the lookup cache pointing at the erased node. */
        if (btrfs_inode->ordered_tree_last == node)
                btrfs_inode->ordered_tree_last = NULL;
        set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
        pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
        spin_unlock_irq(&btrfs_inode->ordered_tree_lock);

        /*
         * The current running transaction is waiting on us, we need to let it
         * know that we're complete and wake it up.
         */
        if (pending) {
                struct btrfs_transaction *trans;

                /*
                 * The checks for trans are just a formality, it should be set,
                 * but if it isn't we don't want to deref/assert under the spin
                 * lock, so be nice and check if trans is set, but ASSERT() so
                 * if it isn't set a developer will notice.
                 */
                spin_lock(&fs_info->trans_lock);
                trans = fs_info->running_transaction;
                /* Pin the transaction so it stays valid after the lock is dropped. */
                if (trans)
                        refcount_inc(&trans->use_count);
                spin_unlock(&fs_info->trans_lock);

                ASSERT(trans || BTRFS_FS_ERROR(fs_info));
                if (trans) {
                        /* Last pending ordered extent: wake the waiter. */
                        if (atomic_dec_and_test(&trans->pending_ordered))
                                wake_up(&trans->pending_wait);
                        btrfs_put_transaction(trans);
                }
        }

        btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);

        spin_lock(&root->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);
        root->nr_ordered_extents--;

        trace_btrfs_ordered_extent_remove(btrfs_inode, entry);

        /* Last ordered extent of this root: drop the root from the fs-wide list. */
        if (!root->nr_ordered_extents) {
                spin_lock(&fs_info->ordered_root_lock);
                BUG_ON(list_empty(&root->ordered_root));
                list_del_init(&root->ordered_root);
                spin_unlock(&fs_info->ordered_root_lock);
        }
        spin_unlock(&root->ordered_extent_lock);
        /* Wake anyone sleeping on this ordered extent. */
        wake_up(&entry->wait);
        if (!freespace_inode)
                btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
}

/*
 * Work function for an ordered extent's flush_work: start and wait for the
 * ordered extent, then signal ->completion for whoever queued the work.
 */
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
        struct btrfs_ordered_extent *oe =
                container_of(work, struct btrfs_ordered_extent, flush_work);

        btrfs_start_ordered_extent(oe);
        complete(&oe->completion);
}

/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 *
 * @root:        root whose ordered_extents list is walked
 * @nr:          maximum number of ordered extents to queue and wait for;
 *               U64_MAX is never decremented, i.e. it means "no limit"
 * @range_start: start of the on-disk byte range of interest
 * @range_len:   length of that range
 *
 * Only ordered extents whose [disk_bytenr, disk_bytenr + disk_num_bytes)
 * range intersects [range_start, range_start + range_len) are flushed; the
 * others are left on the root's list untouched.
 *
 * Returns the number of ordered extents that were queued and waited for.
 */
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
                               const u64 range_start, const u64 range_len)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        LIST_HEAD(splice);
        LIST_HEAD(skipped);
        LIST_HEAD(works);
        struct btrfs_ordered_extent *ordered, *next;
        u64 count = 0;
        /*
         * Exclusive end of the range.
         * NOTE(review): this sum is not checked for u64 wrap-around; presumably
         * callers only pass ranges for which it cannot overflow - confirm.
         */
        const u64 range_end = range_start + range_len;

        /* The mutex is held across both the queueing and the waiting phase. */
        mutex_lock(&root->ordered_extent_mutex);
        spin_lock(&root->ordered_extent_lock);
        /* Work on a private copy of the list so the walk is bounded. */
        list_splice_init(&root->ordered_extents, &splice);
        while (!list_empty(&splice) && nr) {
                ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
                                           root_extent_list);

                /* No intersection with the requested range: park it aside. */
                if (range_end <= ordered->disk_bytenr ||
                    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
                        list_move_tail(&ordered->root_extent_list, &skipped);
                        cond_resched_lock(&root->ordered_extent_lock);
                        continue;
                }

                /*
                 * Put the extent back on the root's list and pin it with a
                 * reference before the spinlock is dropped.
                 */
                list_move_tail(&ordered->root_extent_list,
                               &root->ordered_extents);
                refcount_inc(&ordered->refs);
                spin_unlock(&root->ordered_extent_lock);

                /* Hand the flush to the flush_workers queue. */
                btrfs_init_work(&ordered->flush_work,
                                btrfs_run_ordered_extent_work, NULL);
                list_add_tail(&ordered->work_list, &works);
                btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);

                cond_resched();
                spin_lock(&root->ordered_extent_lock);
                if (nr != U64_MAX)
                        nr--;
                count++;
        }
        /* Return skipped and not-yet-visited extents to the root's list. */
        list_splice_tail(&skipped, &root->ordered_extents);
        list_splice_tail(&splice, &root->ordered_extents);
        spin_unlock(&root->ordered_extent_lock);

        /* Wait for every queued flush and drop the reference taken above. */
        list_for_each_entry_safe(ordered, next, &works, work_list) {
                list_del_init(&ordered->work_list);
                wait_for_completion(&ordered->completion);
                btrfs_put_ordered_extent(ordered);
                cond_resched();
        }
        mutex_unlock(&root->ordered_extent_mutex);

        return count;
}

/*
 * Wait for ordered extents on every root linked on fs_info->ordered_roots.
 *
 * @fs_info:     filesystem whose roots are walked
 * @nr:          upper bound on the number of ordered extents to wait for
 *               across all roots; U64_MAX means no limit
 * @range_start: passed through to btrfs_wait_ordered_extents()
 * @range_len:   passed through to btrfs_wait_ordered_extents()
 */
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
                             const u64 range_start, const u64 range_len)
{
        LIST_HEAD(pending_roots);

        mutex_lock(&fs_info->ordered_operations_mutex);
        spin_lock(&fs_info->ordered_root_lock);
        list_splice_init(&fs_info->ordered_roots, &pending_roots);
        while (nr && !list_empty(&pending_roots)) {
                struct btrfs_root *root;
                u64 waited;

                root = btrfs_grab_root(list_first_entry(&pending_roots,
                                                        struct btrfs_root,
                                                        ordered_root));
                BUG_ON(!root);
                /* Back onto the global list before the lock is released. */
                list_move_tail(&root->ordered_root, &fs_info->ordered_roots);
                spin_unlock(&fs_info->ordered_root_lock);

                waited = btrfs_wait_ordered_extents(root, nr, range_start,
                                                    range_len);
                btrfs_put_root(root);

                spin_lock(&fs_info->ordered_root_lock);
                /* An unlimited budget is never consumed. */
                if (nr != U64_MAX)
                        nr -= waited;
        }
        /* Roots that were not visited go back where they came from. */
        list_splice_tail(&pending_roots, &fs_info->ordered_roots);
        spin_unlock(&fs_info->ordered_root_lock);
        mutex_unlock(&fs_info->ordered_operations_mutex);
}

/*
 * Kick off IO for one ordered extent and block until it is complete.
 *
 * Unless the extent is flagged BTRFS_ORDERED_DIRECT, writeback is started on
 * the file range it covers; the caller then sleeps until
 * BTRFS_ORDERED_COMPLETE is set in the extent's flags.
 */
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
{
        struct btrfs_inode *inode = BTRFS_I(entry->inode);
        const u64 first_byte = entry->file_offset;
        const u64 last_byte = first_byte + entry->num_bytes - 1;
        bool is_freespace;

        trace_btrfs_ordered_extent_start(inode, entry);

        /* Free space inodes do not take the ordered extents lockdep map. */
        is_freespace = btrfs_is_free_space_inode(inode);

        /*
         * Pages in the range may be dirty, clean or under writeback.  Start
         * IO on the dirty ones now so the wait below does not depend on the
         * flusher thread getting around to them.
         */
        if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
                filemap_fdatawrite_range(inode->vfs_inode.i_mapping,
                                         first_byte, last_byte);

        if (!is_freespace)
                btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
        wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
}

/*
 * Wait for all ordered extents in a (possibly very large) byte range.
 *
 * @inode: inode whose ordered extents are waited for
 * @start: first byte of the range
 * @len:   length of the range; the inclusive end is clamped to OFFSET_MAX
 *
 * Returns a non-zero result of btrfs_fdatawrite_range() immediately,
 * otherwise the result of filemap_fdatawait_range() if that is non-zero,
 * otherwise -EIO if any ordered extent in the range had BTRFS_ORDERED_IOERR
 * set, otherwise 0.
 */
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
        struct btrfs_ordered_extent *ordered;
        u64 orig_end = OFFSET_MAX;
        u64 end;
        int ret_wb;
        int ret;

        /* Keep the OFFSET_MAX default if start + len wraps or exceeds it. */
        if (start + len >= start && start + len - 1 <= OFFSET_MAX)
                orig_end = start + len - 1;

        /* Start IO across the range first to instantiate delalloc extents. */
        ret = btrfs_fdatawrite_range(inode, start, orig_end);
        if (ret)
                return ret;

        /*
         * A writeback error is remembered rather than returned right away:
         * every ordered extent still in flight must be waited for first, so
         * that nobody can dirty the same page ranges and call writepages()
         * before they complete, which would fail with -EEXIST when the new
         * ordered extents are added to the ordered tree.
         */
        ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);

        /* Walk backwards from the end of the range, one extent at a time. */
        for (end = orig_end; ; end--) {
                bool outside;

                ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
                if (!ordered)
                        break;

                outside = ordered->file_offset > orig_end ||
                          ordered->file_offset + ordered->num_bytes <= start;
                if (outside) {
                        btrfs_put_ordered_extent(ordered);
                        break;
                }

                btrfs_start_ordered_extent(ordered);
                end = ordered->file_offset;
                /*
                 * Record an IO error but keep going, the remaining ordered
                 * extents in the range still have to complete.
                 */
                if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
                        ret = -EIO;
                btrfs_put_ordered_extent(ordered);

                if (end == 0 || end == start)
                        break;
        }

        if (ret_wb)
                return ret_wb;
        return ret;
}

/*
 * Find the ordered extent whose file range contains @file_offset.
 *
 * Returns the extent with an extra reference taken, or NULL when the tree
 * search finds nothing or the extent found does not cover @file_offset.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
                                                         u64 file_offset)
{
        struct btrfs_ordered_extent *found = NULL;
        struct rb_node *node;
        unsigned long flags;

        spin_lock_irqsave(&inode->ordered_tree_lock, flags);
        node = ordered_tree_search(inode, file_offset);
        if (node) {
                struct btrfs_ordered_extent *candidate;

                candidate = rb_entry(node, struct btrfs_ordered_extent, rb_node);
                if (in_range(file_offset, candidate->file_offset,
                             candidate->num_bytes)) {
                        refcount_inc(&candidate->refs);
                        trace_btrfs_ordered_extent_lookup(inode, candidate);
                        found = candidate;
                }
        }
        spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);

        return found;
}

/*
 * The DIO code tries to lock a wide area, so look for any ordered extent that
 * overlaps [@file_offset, @file_offset + @len) instead of only checking the
 * start of the range.
 *
 * Returns the overlapping ordered extent with an extra reference taken, or
 * NULL if none is found.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
                struct btrfs_inode *inode, u64 file_offset, u64 len)
{
        struct btrfs_ordered_extent *found = NULL;
        struct rb_node *node;

        spin_lock_irq(&inode->ordered_tree_lock);

        /* Nothing at the start of the range: retry from its end. */
        node = ordered_tree_search(inode, file_offset);
        if (!node)
                node = ordered_tree_search(inode, file_offset + len);

        for (; node; node = rb_next(node)) {
                struct btrfs_ordered_extent *cur;

                cur = rb_entry(node, struct btrfs_ordered_extent, rb_node);
                if (range_overlaps(cur, file_offset, len)) {
                        found = cur;
                        break;
                }

                /* Already past the range, later nodes cannot match either. */
                if (cur->file_offset >= file_offset + len)
                        break;
        }

        if (found) {
                refcount_inc(&found->refs);
                trace_btrfs_ordered_extent_lookup_range(inode, found);
        }
        spin_unlock_irq(&inode->ordered_tree_lock);

        return found;
}

/*
 * Append every ordered extent of @inode that is not yet flagged
 * BTRFS_ORDERED_LOGGED to @list, taking a reference on each.
 *
 * The tree is walked from rb_first() with rb_next(), so @list ends up sorted
 * by the file_offset of the ordered extents.  The inode must be locked.
 */
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
                                           struct list_head *list)
{
        struct rb_node *node;

        ASSERT(inode_is_locked(&inode->vfs_inode));

        spin_lock_irq(&inode->ordered_tree_lock);
        node = rb_first(&inode->ordered_tree);
        while (node) {
                struct btrfs_ordered_extent *oe =
                        rb_entry(node, struct btrfs_ordered_extent, rb_node);

                if (!test_bit(BTRFS_ORDERED_LOGGED, &oe->flags)) {
                        ASSERT(list_empty(&oe->log_list));
                        list_add_tail(&oe->log_list, list);
                        refcount_inc(&oe->refs);
                        trace_btrfs_ordered_extent_lookup_for_logging(inode, oe);
                }
                node = rb_next(node);
        }
        spin_unlock_irq(&inode->ordered_tree_lock);
}

/*
 * Look up the ordered extent that ordered_tree_search() yields for
 * @file_offset and return it with an extra reference taken.
 *
 * No containment check is done here, so the returned extent does not have to
 * cover @file_offset.  NULL is returned if none is found.
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
        struct btrfs_ordered_extent *found = NULL;
        struct rb_node *node;

        spin_lock_irq(&inode->ordered_tree_lock);
        node = ordered_tree_search(inode, file_offset);
        if (node) {
                found = rb_entry(node, struct btrfs_ordered_extent, rb_node);
                refcount_inc(&found->refs);
                trace_btrfs_ordered_extent_lookup_first(inode, found);
        }
        spin_unlock_irq(&inode->ordered_tree_lock);

        return found;
}

/*
 * Lookup the first ordered extent that overlaps the range
 * [@file_offset, @file_offset + @len).
 *
 * The difference between this and btrfs_lookup_first_ordered_extent() is
 * that this one won't return any ordered extent that does not overlap the range.
 * And the difference against btrfs_lookup_ordered_extent() is, this function
 * ensures the first ordered extent gets returned.
 *
 * Returns the ordered extent with an extra reference taken, or NULL when no
 * ordered extent overlaps the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
                        struct btrfs_inode *inode, u64 file_offset, u64 len)
{
        struct rb_node *node;
        struct rb_node *cur;
        struct rb_node *prev;
        struct rb_node *next;
        struct btrfs_ordered_extent *entry = NULL;

        spin_lock_irq(&inode->ordered_tree_lock);
        node = inode->ordered_tree.rb_node;
        /*
         * Here we don't want to use tree_search() which will use tree->last
         * and screw up the search order.
         * And __tree_search() can't return the adjacent ordered extents
         * either, thus here we do our own search.
         */
        while (node) {
                entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);

                if (file_offset < entry->file_offset) {
                        node = node->rb_left;
                } else if (file_offset >= entry_end(entry)) {
                        node = node->rb_right;
                } else {
                        /*
                         * Direct hit: @file_offset is at or after
                         * entry->file_offset and before entry_end(entry),
                         * so this ordered extent is the answer.
                         */
                        goto out;
                }
        }
        if (!entry) {
                /* Empty tree */
                goto out;
        }

        /*
         * No direct hit.  @entry is the last node visited by the descent, so
         * it sits next to where @file_offset would be in the tree.
         */
        cur = &entry->rb_node;
        /* We got an entry around @file_offset, check adjacent entries */
        if (entry->file_offset < file_offset) {
                prev = cur;
                next = rb_next(cur);
        } else {
                prev = rb_prev(cur);
                next = cur;
        }
        /* Check the lower neighbour first so the earliest overlap wins. */
        if (prev) {
                entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
                if (range_overlaps(entry, file_offset, len))
                        goto out;
        }
        if (next) {
                entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
                if (range_overlaps(entry, file_offset, len))
                        goto out;
        }
        /* No ordered extent in the range */
        entry = NULL;
out:
        /* The reference is taken while the tree lock is still held. */
        if (entry) {
                refcount_inc(&entry->refs);
                trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
        }

        spin_unlock_irq(&inode->ordered_tree_lock);
        return entry;
}

/*
 * Lock the passed range and make sure every pending ordered extent in it has
 * run to completion.
 *
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 *                locked range. It's the caller's responsibility to free the
 *                cached state.
 *
 * Always returns with the given range locked, and at that point no ordered
 * extent can be pending in it.
 */
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
                                        u64 end,
                                        struct extent_state **cached_state)
{
        struct extent_state *local_state = NULL;
        struct extent_state **statep = cached_state ? cached_state : &local_state;
        struct btrfs_ordered_extent *ordered;

        while (1) {
                lock_extent(&inode->io_tree, start, end, statep);
                ordered = btrfs_lookup_ordered_range(inode, start,
                                                     end - start + 1);
                if (!ordered)
                        break;

                /* Drop the lock, flush what was found, then try again. */
                unlock_extent(&inode->io_tree, start, end, statep);
                btrfs_start_ordered_extent(ordered);
                btrfs_put_ordered_extent(ordered);
        }

        /*
         * Without an external cached_state the extra ref taken for the local
         * one has to be dropped here, as it is never exposed to the caller.
         */
        if (!cached_state)
                refcount_dec(&local_state->refs);
}

/*
 * Nowait variant: try to lock the passed range, succeeding only if it holds
 * no ordered extents.
 *
 * Return true, with the range locked, if the lock was obtained and
 * btrfs_lookup_ordered_range() found no ordered extent in the range.
 * Return false otherwise; in that case the range is not left locked.
 */
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
                                  struct extent_state **cached_state)
{
        struct btrfs_ordered_extent *ordered;

        if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
                return false;

        ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
        if (ordered) {
                /* Something is still pending: back out instead of waiting. */
                btrfs_put_ordered_extent(ordered);
                unlock_extent(&inode->io_tree, start, end, cached_state);
                return false;
        }

        return true;
}

/*
 * Split out a new ordered extent for the first @len bytes of @ordered.
 *
 * @ordered: ordered extent to split; on success it is shrunk in place so that
 *           its file offset and disk bytenr both advance by @len
 * @len:     number of bytes to split off the front, must be smaller than
 *           ordered->num_bytes
 *
 * Return: the new ordered extent covering the first @len bytes,
 * ERR_PTR(-EINVAL) if the split is refused by one of the sanity checks, or
 * the error pointer returned by alloc_ordered_extent().
 */
struct btrfs_ordered_extent *btrfs_split_ordered_extent(
                        struct btrfs_ordered_extent *ordered, u64 len)
{
        struct btrfs_inode *inode = BTRFS_I(ordered->inode);
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 file_offset = ordered->file_offset;
        u64 disk_bytenr = ordered->disk_bytenr;
        unsigned long flags = ordered->flags;
        struct btrfs_ordered_sum *sum, *tmpsum;
        struct btrfs_ordered_extent *new;
        struct rb_node *node;
        u64 offset = 0;

        trace_btrfs_ordered_extent_split(inode, ordered);

        ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));

        /*
         * The entire bio must be covered by the ordered extent, but we can't
         * reduce the original extent to a zero length either.
         */
        if (WARN_ON_ONCE(len >= ordered->num_bytes))
                return ERR_PTR(-EINVAL);
        /* We cannot split partially completed ordered extents. */
        if (ordered->bytes_left) {
                ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
                if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
                        return ERR_PTR(-EINVAL);
        }
        /* We cannot split a compressed ordered extent. */
        if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
                return ERR_PTR(-EINVAL);

        /* The new extent starts at the original file offset / disk bytenr. */
        new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr,
                                   len, 0, flags, ordered->compress_type);
        if (IS_ERR(new))
                return new;

        /* One ref for the tree. */
        refcount_inc(&new->refs);

        /* Lock order: the root's ordered extent lock, then the inode's tree lock. */
        spin_lock_irq(&root->ordered_extent_lock);
        spin_lock(&inode->ordered_tree_lock);
        /* Remove from tree once */
        node = &ordered->rb_node;
        rb_erase(node, &inode->ordered_tree);
        RB_CLEAR_NODE(node);
        /* Do not leave the cached last-node pointer on the erased node. */
        if (inode->ordered_tree_last == node)
                inode->ordered_tree_last = NULL;

        /* Shrink the original so it now describes only the tail. */
        ordered->file_offset += len;
        ordered->disk_bytenr += len;
        ordered->num_bytes -= len;
        ordered->disk_num_bytes -= len;
        ordered->ram_bytes -= len;

        if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
                ASSERT(ordered->bytes_left == 0);
                new->bytes_left = 0;
        } else {
                ordered->bytes_left -= len;
        }

        /*
         * Distribute the truncated length: the tail keeps what exceeds @len,
         * otherwise the whole truncated length moves to the new extent.
         */
        if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) {
                if (ordered->truncated_len > len) {
                        ordered->truncated_len -= len;
                } else {
                        new->truncated_len = ordered->truncated_len;
                        ordered->truncated_len = 0;
                }
        }

        /*
         * Move checksum items from the front of the original's list to the
         * new extent until exactly @len bytes worth have been moved.
         */
        list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) {
                if (offset == len)
                        break;
                list_move_tail(&sum->list, &new->list);
                offset += sum->len;
        }

        /* Re-insert the node */
        node = tree_insert(&inode->ordered_tree, ordered->file_offset,
                           &ordered->rb_node);
        if (node)
                btrfs_panic(fs_info, -EEXIST,
                        "zoned: inconsistency in ordered tree at offset %llu",
                        ordered->file_offset);

        /* Insert the new front extent as well. */
        node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
        if (node)
                btrfs_panic(fs_info, -EEXIST,
                        "zoned: inconsistency in ordered tree at offset %llu",
                        new->file_offset);
        spin_unlock(&inode->ordered_tree_lock);

        /* Account the new extent on the root, still under the root's lock. */
        list_add_tail(&new->root_extent_list, &root->ordered_extents);
        root->nr_ordered_extents++;
        spin_unlock_irq(&root->ordered_extent_lock);
        return new;
}

/*
 * Create the slab cache used for struct btrfs_ordered_extent.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
int __init ordered_data_init(void)
{
        btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0);

        return btrfs_ordered_extent_cache ? 0 : -ENOMEM;
}

/* Destroy the slab cache that ordered_data_init() created. */
void __cold ordered_data_exit(void)
{
        kmem_cache_destroy(btrfs_ordered_extent_cache);
}









































































































    3 

    2 































































































    5 






























   14 
   10 










    1 
    2 
    3 

















    7 





















    6 








































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/writeback.h
 */
#ifndef WRITEBACK_H
#define WRITEBACK_H

#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/fs.h>
#include <linux/flex_proportions.h>
#include <linux/backing-dev-defs.h>
#include <linux/blk_types.h>
#include <linux/pagevec.h>

struct bio;

DECLARE_PER_CPU(int, dirty_throttle_leaks);

/*
 * The global dirty threshold is normally equal to the global dirty limit,
 * except when the system suddenly allocates a lot of anonymous memory and
 * knocks down the global dirty threshold quickly, in which case the global
 * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
 */
#define DIRTY_SCOPE                8

struct backing_dev_info;

/*
 * fs/fs-writeback.c
 *
 * Whether a writeback pass waits; stored in writeback_control.sync_mode.
 */
enum writeback_sync_modes {
        WB_SYNC_NONE,        /* Don't wait on anything */
        WB_SYNC_ALL,        /* Wait on every mapping */
};

/*
 * A control structure which tells the writeback code what to do.  These are
 * always on the stack, and hence need no locking.  They are always initialised
 * in a manner such that unspecified fields are set to zero.
 */
struct writeback_control {
        /* public fields that can be set and/or consumed by the caller: */
        long nr_to_write;                /* Write this many pages, and decrement
                                           this for each page written */
        long pages_skipped;                /* Pages which were not written */

        /*
         * For a_ops->writepages(): if start or end are non-zero then this is
         * a hint that the filesystem need only write out the pages inside that
         * byterange.  The byte at `end' is included in the writeout request.
         */
        loff_t range_start;
        loff_t range_end;

        enum writeback_sync_modes sync_mode;

        unsigned for_kupdate:1;                /* A kupdate writeback */
        unsigned for_background:1;        /* A background writeback */
        unsigned tagged_writepages:1;        /* tag-and-write to avoid livelock */
        unsigned for_reclaim:1;                /* Invoked from the page allocator */
        unsigned range_cyclic:1;        /* range_start is cyclic */
        unsigned for_sync:1;                /* sync(2) WB_SYNC_ALL writeback */
        unsigned unpinned_netfs_wb:1;        /* Cleared I_PINNING_NETFS_WB */

        /*
         * When writeback IOs are bounced through async layers, only the
         * initial synchronous phase should be accounted towards inode
         * cgroup ownership arbitration to avoid confusion.  Later stages
         * can set the following flag to disable the accounting.
         */
        unsigned no_cgroup_owner:1;

        /*
         * To enable batching of swap writes to non-block-device backends,
         * "swap_plug" can be set to point to a 'struct swap_iocb *'.  When
         * all swap writes have been submitted, if that swap_iocb is not
         * NULL, swap_write_unplug() should be called.
         */
        struct swap_iocb **swap_plug;

        /* internal fields used by the ->writepages implementation: */
        struct folio_batch fbatch;
        pgoff_t index;
        int saved_err;

#ifdef CONFIG_CGROUP_WRITEBACK
        struct bdi_writeback *wb;        /* wb this writeback is issued under */
        struct inode *inode;                /* inode being written out */

        /* foreign inode detection, see wbc_detach_inode() */
        int wb_id;                        /* current wb id */
        int wb_lcand_id;                /* last foreign candidate wb id */
        int wb_tcand_id;                /* this foreign candidate wb id */
        size_t wb_bytes;                /* bytes written by current wb */
        size_t wb_lcand_bytes;                /* bytes written by last candidate */
        size_t wb_tcand_bytes;                /* bytes written by this candidate */
#endif
};

/*
 * Translate a writeback_control into block layer request flags:
 * REQ_SYNC for WB_SYNC_ALL writeback, REQ_BACKGROUND for kupdate or
 * background writeback, and no flags otherwise.
 */
static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc)
{
        if (wbc->sync_mode == WB_SYNC_ALL)
                return REQ_SYNC;

        if (wbc->for_kupdate || wbc->for_background)
                return REQ_BACKGROUND;

        return 0;
}

/*
 * wbc_blkcg_css - blkcg css a writeback_control is charged to
 *
 * With CONFIG_CGROUP_WRITEBACK this is the blkcg_css of wbc->wb when a wb is
 * attached and blkcg_root_css otherwise; without it, always blkcg_root_css.
 */
#ifdef CONFIG_CGROUP_WRITEBACK
#define wbc_blkcg_css(wbc) \
        ((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css)
#else
#define wbc_blkcg_css(wbc)                (blkcg_root_css)
#endif /* CONFIG_CGROUP_WRITEBACK */

/*
 * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
 * and are measured against each other in.  There always is one global
 * domain, global_wb_domain, that every wb in the system is a member of.
 * This allows measuring the relative bandwidth of each wb to distribute
 * dirtyable memory accordingly.
 */
struct wb_domain {
        /* Protects dirty_limit_tstamp and dirty_limit, see below. */
        spinlock_t lock;

        /*
         * Scale the writeback cache size proportional to the relative
         * writeout speed.
         *
         * We do this by keeping a floating proportion between BDIs, based
         * on page writeback completions [end_page_writeback()]. Those
         * devices that write out pages fastest will get the larger share,
         * while the slower will get a smaller share.
         *
         * We use page writeout completions because we are interested in
         * getting rid of dirty pages. Having them written out is the
         * primary goal.
         *
         * We introduce a concept of time, a period over which we measure
         * these events, because demand can/will vary over time. The length
         * of this period itself is measured in page writeback completions.
         */
        struct fprop_global completions;
        struct timer_list period_timer;        /* timer for aging of completions */
        unsigned long period_time;

        /*
         * The dirtyable memory and dirty threshold could be suddenly
         * knocked down by a large amount (eg. on the startup of KVM in a
         * swapless system). This may throw the system into deep dirty
         * exceeded state and throttle heavy/light dirtiers alike. To
         * retain good responsiveness, maintain global_dirty_limit for
         * tracking slowly down to the knocked down dirty threshold.
         *
         * Both fields are protected by ->lock.
         */
        unsigned long dirty_limit_tstamp;
        unsigned long dirty_limit;
};

/**
 * wb_domain_size_changed - memory available to a wb_domain has changed
 * @dom: wb_domain of interest
 *
 * This function should be called when the amount of memory available to
 * @dom has changed.  It resets @dom's dirty limit parameters to prevent
 * the past values which don't match the current configuration from skewing
 * dirty throttling.  Without this, when memory size of a wb_domain is
 * greatly reduced, the dirty throttling logic may allow too many pages to
 * be dirtied leading to consecutive unnecessary OOMs and may get stuck in
 * that situation.
 *
 * Concretely, under @dom->lock, the timestamp is set to the current jiffies
 * and the dirty limit is reset to zero.
 */
static inline void wb_domain_size_changed(struct wb_domain *dom)
{
        spin_lock(&dom->lock);
        dom->dirty_limit_tstamp = jiffies;
        dom->dirty_limit = 0;
        spin_unlock(&dom->lock);
}

/*
 * fs/fs-writeback.c
 *
 * Declarations only; struct bdi_writeback is forward-declared here and left
 * opaque at this point in the header.
 */
struct bdi_writeback;
void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
                                                        enum wb_reason reason);
void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason);
void sync_inodes_sb(struct super_block *);
void wakeup_flusher_threads(enum wb_reason reason);
void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
                                enum wb_reason reason);
void inode_wait_for_writeback(struct inode *inode);
void inode_io_list_del(struct inode *inode);

/*
 * Sleep in TASK_UNINTERRUPTIBLE state until the __I_NEW bit of
 * inode->i_state is clear.
 */
static inline void wait_on_inode(struct inode *inode)
{
        wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/cgroup.h>
#include <linux/bio.h>

/* Cgroup writeback interface, declared only with CONFIG_CGROUP_WRITEBACK. */
void __inode_attach_wb(struct inode *inode, struct folio *folio);
/* Annotated as releasing inode->i_lock, so it is entered with it held. */
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                 struct inode *inode)
        __releases(&inode->i_lock);
void wbc_detach_inode(struct writeback_control *wbc);
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
                              size_t bytes);
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
                           enum wb_reason reason, struct wb_completion *done);
void cgroup_writeback_umount(void);
bool cleanup_offline_cgwb(struct bdi_writeback *wb);

/**
 * inode_attach_wb - make sure an inode has a wb associated with it
 * @inode: inode of interest
 * @folio: folio being dirtied (may be NULL)
 *
 * Does nothing when @inode already has a wb.  Otherwise the association is
 * delegated to __inode_attach_wb(), which uses the wb matching the memcg of
 * @folio or, if @folio is NULL, %current.  May be called w/ or w/o
 * @inode->i_lock.
 */
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
{
        /* Already attached: nothing to do. */
        if (inode->i_wb)
                return;

        __inode_attach_wb(inode, folio);
}

/**
 * inode_detach_wb - drop an inode's association with its wb
 * @inode: inode of interest
 *
 * @inode is being freed.  If it still has a wb, put the reference and clear
 * the pointer.  Warns once if the inode is not marked I_CLEAR at that point.
 */
static inline void inode_detach_wb(struct inode *inode)
{
        /* Never attached, or already detached. */
        if (!inode->i_wb)
                return;

        WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
        wb_put(inode->i_wb);
        inode->i_wb = NULL;
}

/**
 * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * This function is to be used by __filemap_fdatawrite_range(), which is an
 * alternative entry point into writeback code, and first ensures @inode is
 * associated with a bdi_writeback and attaches it to @wbc.
 *
 * Takes @inode->i_lock itself; the lock is released again by
 * wbc_attach_and_unlock_inode(), so it is not held on return.
 */
static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
                                               struct inode *inode)
{
        spin_lock(&inode->i_lock);
        /* NULL folio: inode_attach_wb() documents that %current is used. */
        inode_attach_wb(inode, NULL);
        /* Declared __releases(&inode->i_lock): this drops the lock taken above. */
        wbc_attach_and_unlock_inode(wbc, inode);
}

/**
 * wbc_init_bio - writeback specific initialization of bio
 * @wbc: writeback_control for the writeback in progress
 * @bio: bio to be initialized
 *
 * @bio belongs to the writeback that @wbc controls.  Apply the cgroup
 * writeback context to it by associating the bio with the blkcg css of
 * @wbc->wb.  Must be called after the bio has been associated with a device.
 */
static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
{
        /*
         * The pageout() path intentionally leaves @wbc unattached to the
         * inode being written out, so that it cannot block behind a slow
         * cgroup.  Ultimately, pageout() should kick off regular writeback
         * rather than writing things out itself.  With no wb there is
         * nothing to associate.
         */
        if (!wbc->wb)
                return;

        bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK stub: there is no per-inode wb to attach. */
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: nothing to detach. */
static inline void inode_detach_wb(struct inode *inode)
{
}

/*
 * !CONFIG_CGROUP_WRITEBACK stub: no attachment is performed, but the
 * __releases() contract is still honoured by dropping @inode->i_lock.
 */
static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                               struct inode *inode)
        __releases(&inode->i_lock)
{
        spin_unlock(&inode->i_lock);
}

/*
 * !CONFIG_CGROUP_WRITEBACK stub: does nothing; @inode->i_lock is neither
 * taken nor released here.
 */
static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
                                               struct inode *inode)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: no-op. */
static inline void wbc_detach_inode(struct writeback_control *wbc)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: @bio is left untouched. */
static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: no accounting is done. */
static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
                                            struct page *page, size_t bytes)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: no-op. */
static inline void cgroup_writeback_umount(void)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * mm/page-writeback.c
 */
/* laptop-mode hooks; laptop_mode_timer_fn() has a timer-callback signature. */
void laptop_io_completion(struct backing_dev_info *info);
void laptop_sync_completion(void);
void laptop_mode_timer_fn(struct timer_list *t);
bool node_dirty_ok(struct pglist_data *pgdat);
int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
/* wb_domain_exit() is only declared when cgroup writeback is configured. */
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom);
#endif

/* Declared here, defined elsewhere. */
extern struct wb_domain global_wb_domain;

/* These are exported to sysctl. */
extern unsigned int dirty_writeback_interval;
extern unsigned int dirty_expire_interval;
extern unsigned int dirtytime_expire_interval;
extern int laptop_mode;

/*
 * Handler with a sysctl proc-handler style signature; presumably backs
 * dirtytime_expire_interval (going by the name -- confirm at the definition).
 */
int dirtytime_interval_handler(struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos);

/* Threshold helpers; global_dirty_limits() returns both values via pointers. */
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
unsigned long cgwb_calc_thresh(struct bdi_writeback *wb);

void wb_update_bandwidth(struct bdi_writeback *wb);

/* Invoke balance dirty pages in async mode. */
/* Presumably passed in @flags of balance_dirty_pages_ratelimited_flags(). */
#define BDP_ASYNC 0x0001

void balance_dirty_pages_ratelimited(struct address_space *mapping);
int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
                unsigned int flags);

bool wb_over_bg_thresh(struct bdi_writeback *wb);

/* Takes the current @folio and returns a folio; errors go through @error. */
struct folio *writeback_iter(struct address_space *mapping,
                struct writeback_control *wbc, struct folio *folio, int *error);

/* Per-folio callback type accepted by write_cache_pages(). */
typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc,
                                void *data);

/* @data is an opaque pointer; writepage_t has a matching @data parameter. */
int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
                      void *data);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
void writeback_set_ratelimit(void);
void tag_pages_for_writeback(struct address_space *mapping,
                             pgoff_t start, pgoff_t end);

/* Dirtying / re-dirtying helpers; each returns bool. */
bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio);
bool folio_redirty_for_writepage(struct writeback_control *, struct folio *);
bool redirty_page_for_writepage(struct writeback_control *, struct page *);

/* Mark/clear pair operating on an inode. */
void sb_mark_inode_writeback(struct inode *inode);
void sb_clear_inode_writeback(struct inode *inode);

#endif                /* WRITEBACK_H */























    1 















    1 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#ifndef IOU_ALLOC_CACHE_H
#define IOU_ALLOC_CACHE_H

/*
 * Don't allow the cache to grow beyond this size.
 */
#define IO_ALLOC_CACHE_MAX        128

/*
 * Try to stash @entry in @cache.
 *
 * Returns true when the entry was stored.  Returns false, leaving the cache
 * untouched, when the cache already holds max_cached entries or when
 * kasan_mempool_poison_object() returns false for @entry.
 */
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
                                      void *entry)
{
        /* No room left. */
        if (cache->nr_cached >= cache->max_cached)
                return false;

        if (!kasan_mempool_poison_object(entry))
                return false;

        cache->entries[cache->nr_cached] = entry;
        cache->nr_cached++;
        return true;
}

/*
 * Pop the most recently cached entry from @cache, or return NULL when the
 * cache is empty.  The entry is unpoisoned for elem_size bytes before it is
 * handed back.
 */
static inline void *io_alloc_cache_get(struct io_alloc_cache *cache)
{
        void *entry;

        if (!cache->nr_cached)
                return NULL;

        /* Entries are stored stack-like: take the last one. */
        cache->nr_cached--;
        entry = cache->entries[cache->nr_cached];

        kasan_mempool_unpoison_object(entry, cache->elem_size);
        return entry;
}

/*
 * Set up @cache with room for @max_nr entries of @size bytes each.
 *
 * Note the inverted sense of the return value: false means the cache was
 * initialized properly, true means the entries array could not be allocated.
 */
static inline bool io_alloc_cache_init(struct io_alloc_cache *cache,
                                       unsigned max_nr, size_t size)
{
        cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL);
        if (!cache->entries)
                return true;

        cache->nr_cached = 0;
        cache->max_cached = max_nr;
        cache->elem_size = size;
        return false;
}

/*
 * Drain @cache, handing every cached entry to @free, then release the
 * entries array itself.  Does nothing when the array is not allocated;
 * ->entries is reset to NULL afterwards.
 */
static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
                                       void (*free)(const void *))
{
        void *entry;

        if (!cache->entries)
                return;

        for (entry = io_alloc_cache_get(cache); entry != NULL;
             entry = io_alloc_cache_get(cache))
                free(entry);

        kvfree(cache->entries);
        cache->entries = NULL;
}
#endif





























































































    2 





















    1 
































    1 









    1 











































    1 










    1 










    1 




















    1 














































    1 
























































    1 


















    1 
























    1 

























    1 







    1 
















































    1 







    1 



















    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
// SPDX-License-Identifier: GPL-2.0-or-later
/* Filesystem access-by-fd.
 *
 * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/anon_inodes.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <uapi/linux/mount.h>
#include "internal.h"
#include "mount.h"

/*
 * Allow the user to read back any error, warning or informational messages.
 *
 * Each call takes one message off the tail of the context's log ring under
 * fc->uapi_mutex.  Returns the message length on success, -ENODATA when the
 * ring is empty, -EMSGSIZE when @len is too small, -EFAULT when the copy to
 * userspace fails, or the error from mutex_lock_interruptible().  The slot is
 * consumed before the size check and the copy, so the message is dropped even
 * on -EMSGSIZE / -EFAULT.  @pos is not used.
 */
static ssize_t fscontext_read(struct file *file,
                              char __user *_buf, size_t len, loff_t *pos)
{
        struct fs_context *fc = file->private_data;
        struct fc_log *log = fc->log.log;
        unsigned int slot_mask = ARRAY_SIZE(log->buffer) - 1;
        char *msg;
        bool owned;
        ssize_t ret;
        int slot, msg_len;

        ret = mutex_lock_interruptible(&fc->uapi_mutex);
        if (ret < 0)
                return ret;

        if (log->head == log->tail) {
                mutex_unlock(&fc->uapi_mutex);
                return -ENODATA;
        }

        /* Detach the oldest message from the ring while holding the mutex. */
        slot = log->tail & slot_mask;
        msg = log->buffer[slot];
        owned = log->need_free & (1 << slot);
        log->buffer[slot] = NULL;
        log->need_free &= ~(1 << slot);
        log->tail++;
        mutex_unlock(&fc->uapi_mutex);

        msg_len = strlen(msg);
        if (msg_len > len)
                ret = -EMSGSIZE;
        else if (copy_to_user(_buf, msg, msg_len) != 0)
                ret = -EFAULT;
        else
                ret = msg_len;

        /* Only messages flagged in need_free are freed here. */
        if (owned)
                kfree(msg);
        return ret;
}

/*
 * ->release handler: drop the fs_context reference stored in
 * file->private_data, if there is one.  Always returns 0.  @inode is unused.
 */
static int fscontext_release(struct inode *inode, struct file *file)
{
        struct fs_context *fc = file->private_data;

        if (!fc)
                return 0;

        file->private_data = NULL;
        put_fs_context(fc);
        return 0;
}

/*
 * File operations for the anon-inode file created by fscontext_create_fd().
 * fsconfig() also compares f_op against this table to recognise such fds.
 */
const struct file_operations fscontext_fops = {
        .read                = fscontext_read,
        .release        = fscontext_release,
        .llseek                = no_llseek,
};

/*
 * Attach a filesystem context to a file and an fd.
 *
 * Returns the new fd, or a negative error.  On failure the reference on @fc
 * is dropped here, so the caller must not put it again.
 */
static int fscontext_create_fd(struct fs_context *fc, unsigned int o_flags)
{
        int fd = anon_inode_getfd("[fscontext]", &fscontext_fops, fc,
                                  O_RDWR | o_flags);

        if (fd >= 0)
                return fd;

        put_fs_context(fc);
        return fd;
}

static int fscontext_alloc_log(struct fs_context *fc)
{
        fc->log.log = kzalloc(sizeof(*fc->log.log), GFP_KERNEL);
        if (!fc->log.log)
                return -ENOMEM;
        refcount_set(&fc->log.log->usage, 1);
        fc->log.log->owner = fc->fs_type->owner;
        return 0;
}

/*
 * Open a filesystem by name so that it can be configured for mounting.
 *
 * We are allowed to specify a container in which the filesystem will be
 * opened, thereby indicating which namespaces will be used (notably, which
 * network namespace will be used for network filesystems).
 *
 * Returns a new fscontext fd, or -EPERM if may_mount() refuses, -EINVAL for
 * flags other than FSOPEN_CLOEXEC, -ENODEV for an unknown filesystem type, or
 * whatever error the name copy, context creation, log allocation or fd
 * creation reports.
 */
SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags)
{
        struct file_system_type *fs_type;
        struct fs_context *fc;
        const char *fs_name;
        unsigned int o_flags = 0;
        int ret;

        if (!may_mount())
                return -EPERM;

        if (flags & ~FSOPEN_CLOEXEC)
                return -EINVAL;

        fs_name = strndup_user(_fs_name, PAGE_SIZE);
        if (IS_ERR(fs_name))
                return PTR_ERR(fs_name);

        /* The name is only needed for the type lookup. */
        fs_type = get_fs_type(fs_name);
        kfree(fs_name);
        if (!fs_type)
                return -ENODEV;

        /* Our reference on the type is dropped once the context exists. */
        fc = fs_context_for_mount(fs_type, 0);
        put_filesystem(fs_type);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        fc->phase = FS_CONTEXT_CREATE_PARAMS;

        ret = fscontext_alloc_log(fc);
        if (ret < 0) {
                put_fs_context(fc);
                return ret;
        }

        if (flags & FSOPEN_CLOEXEC)
                o_flags = O_CLOEXEC;

        /* fscontext_create_fd() puts @fc itself if it fails. */
        return fscontext_create_fd(fc, o_flags);
}

/*
 * Pick a superblock into a context for reconfiguration.
 *
 * @path (relative to @dfd) must resolve to the root of a mount, otherwise
 * -EINVAL is returned.  Returns a new fscontext fd on success.
 */
SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags)
{
        struct fs_context *fc;
        struct path target;
        unsigned int lookup_flags = 0;
        int ret;

        if (!may_mount())
                return -EPERM;

        if ((flags & ~(FSPICK_CLOEXEC |
                       FSPICK_SYMLINK_NOFOLLOW |
                       FSPICK_NO_AUTOMOUNT |
                       FSPICK_EMPTY_PATH)) != 0)
                return -EINVAL;

        /* Follow symlinks and automounts unless the caller opted out. */
        if (!(flags & FSPICK_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (!(flags & FSPICK_NO_AUTOMOUNT))
                lookup_flags |= LOOKUP_AUTOMOUNT;
        if (flags & FSPICK_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        ret = user_path_at(dfd, path, lookup_flags, &target);
        if (ret < 0)
                return ret;

        /* Only the root of a mount may be picked. */
        if (target.mnt->mnt_root != target.dentry) {
                ret = -EINVAL;
                goto out_put_path;
        }

        fc = fs_context_for_reconfigure(target.dentry, 0, 0);
        if (IS_ERR(fc)) {
                ret = PTR_ERR(fc);
                goto out_put_path;
        }

        fc->phase = FS_CONTEXT_RECONF_PARAMS;

        ret = fscontext_alloc_log(fc);
        if (ret < 0) {
                put_fs_context(fc);
                goto out_put_path;
        }

        path_put(&target);
        /* fscontext_create_fd() puts @fc itself if it fails. */
        return fscontext_create_fd(fc, (flags & FSPICK_CLOEXEC) ? O_CLOEXEC : 0);

out_put_path:
        path_put(&target);
        return ret;
}

/*
 * Handle FSCONFIG_CMD_CREATE / FSCONFIG_CMD_CREATE_EXCL: create the
 * superblock for @fc.
 *
 * Only valid in the FS_CONTEXT_CREATE_PARAMS phase (-EBUSY otherwise).  On
 * success the phase becomes FS_CONTEXT_AWAITING_MOUNT; once creation has
 * started, any failure leaves the context in FS_CONTEXT_FAILED.
 */
static int vfs_cmd_create(struct fs_context *fc, bool exclusive)
{
        struct super_block *sb;
        int ret;

        if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
                return -EBUSY;

        if (!mount_capable(fc))
                return -EPERM;

        /* require the new mount api */
        if (exclusive && fc->ops == &legacy_fs_context_ops)
                return -EOPNOTSUPP;

        fc->phase = FS_CONTEXT_CREATING;
        fc->exclusive = exclusive;

        ret = vfs_get_tree(fc);
        if (ret)
                goto failed;

        sb = fc->root->d_sb;
        ret = security_sb_kern_mount(sb);
        if (unlikely(ret)) {
                fc_drop_locked(fc);
                goto failed;
        }

        /* vfs_get_tree() callchains will have grabbed @s_umount */
        up_write(&sb->s_umount);
        fc->phase = FS_CONTEXT_AWAITING_MOUNT;
        return 0;

failed:
        fc->phase = FS_CONTEXT_FAILED;
        return ret;
}

/*
 * Handle FSCONFIG_CMD_RECONFIGURE: apply the pending parameters to the
 * superblock that @fc refers to.
 *
 * Only valid in the FS_CONTEXT_RECONF_PARAMS phase (-EBUSY otherwise).
 * Requires CAP_SYS_ADMIN in the superblock's user namespace.  On failure the
 * context is left in FS_CONTEXT_FAILED; on success it is cleaned with
 * vfs_clean_context().
 */
static int vfs_cmd_reconfigure(struct fs_context *fc)
{
        struct super_block *sb;
        int ret;

        if (fc->phase != FS_CONTEXT_RECONF_PARAMS)
                return -EBUSY;

        fc->phase = FS_CONTEXT_RECONFIGURING;

        sb = fc->root->d_sb;
        if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
                ret = -EPERM;
                goto failed;
        }

        /* reconfigure_super() is called with s_umount held for writing. */
        down_write(&sb->s_umount);
        ret = reconfigure_super(fc);
        up_write(&sb->s_umount);
        if (ret)
                goto failed;

        vfs_clean_context(fc);
        return 0;

failed:
        fc->phase = FS_CONTEXT_FAILED;
        return ret;
}

/*
 * Check the state and apply the configuration.  Note that this function is
 * allowed to 'steal' the value by setting param->xxx to NULL before returning.
 */
static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
                               struct fs_parameter *param)
{
        int ret;

        ret = finish_clean_context(fc);
        if (ret)
                return ret;
        switch (cmd) {
        case FSCONFIG_CMD_CREATE:
                return vfs_cmd_create(fc, false);
        case FSCONFIG_CMD_CREATE_EXCL:
                return vfs_cmd_create(fc, true);
        case FSCONFIG_CMD_RECONFIGURE:
                return vfs_cmd_reconfigure(fc);
        default:
                if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
                    fc->phase != FS_CONTEXT_RECONF_PARAMS)
                        return -EBUSY;

                return vfs_parse_fs_param(fc, param);
        }
}

/**
 * sys_fsconfig - Set parameters and trigger actions on a context
 * @fd: The filesystem context to act upon
 * @cmd: The action to take
 * @_key: Where appropriate, the parameter key to set
 * @_value: Where appropriate, the parameter value to set
 * @aux: Additional information for the value
 *
 * This system call is used to set parameters on a context, including
 * superblock settings, data source and security labelling.
 *
 * Actions include triggering the creation of a superblock and the
 * reconfiguration of the superblock attached to the specified context.
 *
 * When setting a parameter, @cmd indicates the type of value being proposed
 * and @_key indicates the parameter to be altered.
 *
 * @_value and @aux are used to specify the value, should a value be required:
 *
 * (*) fsconfig_set_flag: No value is specified.  The parameter must be boolean
 *     in nature.  The key may be prefixed with "no" to invert the
 *     setting. @_value must be NULL and @aux must be 0.
 *
 * (*) fsconfig_set_string: A string value is specified.  The parameter can be
 *     expecting boolean, integer, string or take a path.  A conversion to an
 *     appropriate type will be attempted (which may include looking up as a
 *     path).  @_value points to a NUL-terminated string and @aux must be 0.
 *
 * (*) fsconfig_set_binary: A binary blob is specified.  @_value points to the
 *     blob and @aux indicates its size.  The parameter must be expecting a
 *     blob.
 *
 * (*) fsconfig_set_path: A non-empty path is specified.  The parameter must be
 *     expecting a path object.  @_value points to a NUL-terminated string that
 *     is the path and @aux is a file descriptor at which to start a relative
 *     lookup or AT_FDCWD.
 *
 * (*) fsconfig_set_path_empty: As fsconfig_set_path, but with AT_EMPTY_PATH
 *     implied.
 *
 * (*) fsconfig_set_fd: An open file descriptor is specified.  @_value must be
 *     NULL and @aux indicates the file descriptor.
 */
SYSCALL_DEFINE5(fsconfig,
                int, fd,
                unsigned int, cmd,
                const char __user *, _key,
                const void __user *, _value,
                int, aux)
{
        struct fs_context *fc;
        struct fd f;
        int ret;
        int lookup_flags = 0;

        struct fs_parameter param = {
                .type        = fs_value_is_undefined,
        };

        if (fd < 0)
                return -EINVAL;

        /*
         * Validate the @_key / @_value / @aux combination for @cmd before
         * touching the fd.  Unknown commands are rejected with -EOPNOTSUPP.
         */
        switch (cmd) {
        case FSCONFIG_SET_FLAG:
                if (!_key || _value || aux)
                        return -EINVAL;
                break;
        case FSCONFIG_SET_STRING:
                if (!_key || !_value || aux)
                        return -EINVAL;
                break;
        case FSCONFIG_SET_BINARY:
                /* @aux is the blob size: must be in 1..1MiB. */
                if (!_key || !_value || aux <= 0 || aux > 1024 * 1024)
                        return -EINVAL;
                break;
        case FSCONFIG_SET_PATH:
        case FSCONFIG_SET_PATH_EMPTY:
                /* @aux is a dirfd: AT_FDCWD or a non-negative descriptor. */
                if (!_key || !_value || (aux != AT_FDCWD && aux < 0))
                        return -EINVAL;
                break;
        case FSCONFIG_SET_FD:
                if (!_key || _value || aux < 0)
                        return -EINVAL;
                break;
        case FSCONFIG_CMD_CREATE:
        case FSCONFIG_CMD_CREATE_EXCL:
        case FSCONFIG_CMD_RECONFIGURE:
                /* Pure actions take no arguments at all. */
                if (_key || _value || aux)
                        return -EINVAL;
                break;
        default:
                return -EOPNOTSUPP;
        }

        f = fdget(fd);
        if (!f.file)
                return -EBADF;
        /* Only files using fscontext_fops carry an fs_context. */
        ret = -EINVAL;
        if (f.file->f_op != &fscontext_fops)
                goto out_f;

        fc = f.file->private_data;
        /* Legacy contexts reject binary, path and fd parameters. */
        if (fc->ops == &legacy_fs_context_ops) {
                switch (cmd) {
                case FSCONFIG_SET_BINARY:
                case FSCONFIG_SET_PATH:
                case FSCONFIG_SET_PATH_EMPTY:
                case FSCONFIG_SET_FD:
                        ret = -EOPNOTSUPP;
                        goto out_f;
                }
        }

        /* Copy in the key, bounded by the 256 passed to strndup_user(). */
        if (_key) {
                param.key = strndup_user(_key, 256);
                if (IS_ERR(param.key)) {
                        ret = PTR_ERR(param.key);
                        goto out_f;
                }
        }

        /* Fetch the value from userspace in the form that @cmd calls for. */
        switch (cmd) {
        case FSCONFIG_SET_FLAG:
                param.type = fs_value_is_flag;
                break;
        case FSCONFIG_SET_STRING:
                param.type = fs_value_is_string;
                param.string = strndup_user(_value, 256);
                if (IS_ERR(param.string)) {
                        ret = PTR_ERR(param.string);
                        goto out_key;
                }
                param.size = strlen(param.string);
                break;
        case FSCONFIG_SET_BINARY:
                param.type = fs_value_is_blob;
                param.size = aux;
                param.blob = memdup_user_nul(_value, aux);
                if (IS_ERR(param.blob)) {
                        ret = PTR_ERR(param.blob);
                        goto out_key;
                }
                break;
        case FSCONFIG_SET_PATH_EMPTY:
                lookup_flags = LOOKUP_EMPTY;
                fallthrough;
        case FSCONFIG_SET_PATH:
                param.type = fs_value_is_filename;
                param.name = getname_flags(_value, lookup_flags, NULL);
                if (IS_ERR(param.name)) {
                        ret = PTR_ERR(param.name);
                        goto out_key;
                }
                param.dirfd = aux;
                param.size = strlen(param.name->name);
                break;
        case FSCONFIG_SET_FD:
                param.type = fs_value_is_file;
                ret = -EBADF;
                param.file = fget(aux);
                if (!param.file)
                        goto out_key;
                param.dirfd = aux;
                break;
        default:
                break;
        }

        /*
         * Apply the command under the context's uapi_mutex.  If the lock
         * attempt is interrupted, its error is returned and the value is
         * still released below.
         */
        ret = mutex_lock_interruptible(&fc->uapi_mutex);
        if (ret == 0) {
                ret = vfs_fsconfig_locked(fc, cmd, &param);
                mutex_unlock(&fc->uapi_mutex);
        }

        /* Clean up our record of any value that we obtained from
         * userspace.  Note that the value may have been stolen by the LSM or
         * filesystem, in which case the value pointer will have been cleared.
         */
        switch (cmd) {
        case FSCONFIG_SET_STRING:
        case FSCONFIG_SET_BINARY:
                /*
                 * NOTE(review): SET_BINARY stored its buffer via param.blob;
                 * freeing it through param.string presumably relies on the
                 * two members sharing storage -- confirm against the
                 * definition of struct fs_parameter.
                 */
                kfree(param.string);
                break;
        case FSCONFIG_SET_PATH:
        case FSCONFIG_SET_PATH_EMPTY:
                if (param.name)
                        putname(param.name);
                break;
        case FSCONFIG_SET_FD:
                if (param.file)
                        fput(param.file);
                break;
        default:
                break;
        }
out_key:
        kfree(param.key);
out_f:
        fdput(f);
        return ret;
}



























   22 








   21 















    5 


    6 





    6 




    4 


    4 











   19 


    6 



    5 







    6 











   10 











    5 
   11 



   18 








    1 







   14 

    6 


    8 







    4 









    7 





    7 
























    1 
    3 









    3 














    3 


    3 

    3 


    3 

    3 
    3 























    4 












    5 

    5 











    2 
    2 
    2 

    2 



















    3 











    1 




    3 

























    2 

    2 



















    4 
   11 






    1 
    7 












    6 










    5 















    9 











    1 


    9 






   10 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_NOTIFY_H
#define _LINUX_FS_NOTIFY_H

/*
 * include/linux/fsnotify.h - generic hooks for filesystem notification, to
 * reduce in-source duplication from both dnotify and inotify.
 *
 * We don't compile any of this away in some complicated menagerie of ifdefs.
 * Instead, we rely on the code inside to optimize away as needed.
 *
 * (C) Copyright 2005 Robert Love
 */

#include <linux/fsnotify_backend.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <linux/bug.h>

/*
 * Are there any inode/mount/sb objects watched with priority prio or above?
 *
 * Answers false when @sb has no fsnotify info attached, i.e. no marks were
 * ever added to any object on this sb; otherwise reports whether the
 * watched_objects counter for @prio is non-zero.
 */
static inline bool fsnotify_sb_has_priority_watchers(struct super_block *sb,
                                                     int prio)
{
        struct fsnotify_sb_info *info = fsnotify_sb_info(sb);

        return info && atomic_long_read(&info->watched_objects[prio]) != 0;
}

/*
 * Are there any inode/mount/sb objects that are being watched at all?
 * Implemented as the priority check with @prio 0.
 */
static inline bool fsnotify_sb_has_watchers(struct super_block *sb)
{
        return fsnotify_sb_has_priority_watchers(sb, 0);
}

/*
 * Notify this @dir inode about a change in a child directory entry.
 * The directory entry may have turned positive or negative or its inode may
 * have changed (i.e. renamed over).
 *
 * Unlike fsnotify_parent(), the event will be reported regardless of the
 * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only
 * the child is interested and not the parent.
 *
 * Returns 0 without calling fsnotify() when nothing on @dir's superblock is
 * watched; otherwise returns the result of fsnotify().
 */
static inline int fsnotify_name(__u32 mask, const void *data, int data_type,
                                struct inode *dir, const struct qstr *name,
                                u32 cookie)
{
        if (fsnotify_sb_has_watchers(dir->i_sb))
                return fsnotify(mask, data, data_type, dir, name, NULL, cookie);

        return 0;
}

/*
 * Report @mask on @dir for the entry @dentry: a wrapper around
 * fsnotify_name() that passes the dentry as event data, its d_name as the
 * entry name and a cookie of 0.  The return value is discarded.
 */
static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry,
                                   __u32 mask)
{
        fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0);
}

/*
 * Report @mask on @inode itself (no directory, no name).  FS_ISDIR is added
 * to the mask when @inode is a directory.  Nothing is sent when no object on
 * the inode's superblock is watched.
 */
static inline void fsnotify_inode(struct inode *inode, __u32 mask)
{
        if (fsnotify_sb_has_watchers(inode->i_sb)) {
                __u32 event_mask = S_ISDIR(inode->i_mode) ? (mask | FS_ISDIR) : mask;

                fsnotify(event_mask, inode, FSNOTIFY_EVENT_INODE, NULL, NULL,
                         inode, 0);
        }
}

/*
 * Notify this dentry's parent about a child's events.
 *
 * Returns 0 when nothing on the superblock is watched.  The event goes
 * through __fsnotify_parent() unless the parent has to be skipped, in which
 * case it is reported on the child inode only.
 */
static inline int fsnotify_parent(struct dentry *dentry, __u32 mask,
                                  const void *data, int data_type)
{
        struct inode *inode = d_inode(dentry);
        bool child_only = false;

        if (!fsnotify_sb_has_watchers(inode->i_sb))
                return 0;

        if (S_ISDIR(inode->i_mode)) {
                mask |= FS_ISDIR;

                /* sb/mount marks are not interested in name of directory */
                child_only = !(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED);
        }

        /* disconnected dentry cannot notify parent */
        if (!child_only)
                child_only = IS_ROOT(dentry);

        if (child_only)
                return fsnotify(mask, data, data_type, NULL, NULL, inode, 0);

        return __fsnotify_parent(dentry, mask, data, data_type);
}

/*
 * Simple wrappers to consolidate calls to fsnotify_parent() when an event
 * is on a file/dentry.
 */
/*
 * fsnotify_dentry - report @mask for @dentry, passing the dentry itself as
 * the event data.  The return value of fsnotify_parent() is discarded.
 */
static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
{
        fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY);
}

/*
 * Report @mask for an open @file, using its path as the event data.
 *
 * Returns 0 without notifying when the file is marked FMODE_NONOTIFY, or
 * when @mask contains a permission event and the superblock has no watchers
 * at FSNOTIFY_PRIO_CONTENT.  Otherwise returns the fsnotify_parent() result.
 */
static inline int fsnotify_file(struct file *file, __u32 mask)
{
        const struct path *path = &file->f_path;

        if (file->f_mode & FMODE_NONOTIFY)
                return 0;

        /* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */
        if ((mask & ALL_FSNOTIFY_PERM_EVENTS) &&
            !fsnotify_sb_has_priority_watchers(path->dentry->d_sb,
                                               FSNOTIFY_PRIO_CONTENT))
                return 0;

        return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
}

#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
/*
 * fsnotify_file_area_perm - permission hook before access to file range
 */
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
                                          const loff_t *ppos, size_t count)
{
        /*
         * filesystem may be modified in the context of permission events
         * (e.g. by HSM filling a file on access), so sb freeze protection
         * must not be held.
         */
        lockdep_assert_once(file_write_not_started(file));

        /* Only read access raises FS_ACCESS_PERM; @ppos and @count are unused. */
        if (perm_mask & MAY_READ)
                return fsnotify_file(file, FS_ACCESS_PERM);

        return 0;
}

/*
 * fsnotify_file_perm - permission hook before file access
 */
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
{
        /* Same as the area hook, called with no position (NULL) and count 0. */
        return fsnotify_file_area_perm(file, perm_mask, NULL, 0);
}

/*
 * fsnotify_open_perm - permission hook before file open
 */
static inline int fsnotify_open_perm(struct file *file)
{
        int ret = 0;

        /* Opens for execution are checked against FS_OPEN_EXEC_PERM first. */
        if (file->f_flags & __FMODE_EXEC)
                ret = fsnotify_file(file, FS_OPEN_EXEC_PERM);

        /* A non-zero result from the exec check is returned as-is. */
        return ret ? ret : fsnotify_file(file, FS_OPEN_PERM);
}

#else
/* Stub used when CONFIG_FANOTIFY_ACCESS_PERMISSIONS is not set: always 0. */
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
                                          const loff_t *ppos, size_t count)
{
        return 0;
}

/* Stub used when CONFIG_FANOTIFY_ACCESS_PERMISSIONS is not set: always 0. */
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
{
        return 0;
}

/* Stub used when CONFIG_FANOTIFY_ACCESS_PERMISSIONS is not set: always 0. */
static inline int fsnotify_open_perm(struct file *file)
{
        return 0;
}
#endif

/*
 * fsnotify_link_count - inode's link count changed
 */
static inline void fsnotify_link_count(struct inode *inode)
{
        /* A link count change is reported as an FS_ATTRIB event on the inode. */
        fsnotify_inode(inode, FS_ATTRIB);
}

/*
 * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
 */
static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
                                 const struct qstr *old_name,
                                 int isdir, struct inode *target,
                                 struct dentry *moved)
{
        const struct qstr *new_name = &moved->d_name;
        struct inode *source = moved->d_inode;
        /* The same cookie is passed with the MOVED_FROM and MOVED_TO events. */
        u32 fs_cookie = fsnotify_get_cookie();
        /* Added to every name event below when a directory was moved. */
        const __u32 dir_flag = isdir ? FS_ISDIR : 0;

        /* Event with information about both old and new parent+name */
        fsnotify_name(FS_RENAME | dir_flag, moved, FSNOTIFY_EVENT_DENTRY,
                      old_dir, old_name, 0);

        fsnotify_name(FS_MOVED_FROM | dir_flag, source, FSNOTIFY_EVENT_INODE,
                      old_dir, old_name, fs_cookie);
        fsnotify_name(FS_MOVED_TO | dir_flag, source, FSNOTIFY_EVENT_INODE,
                      new_dir, new_name, fs_cookie);

        /* @target, when given, gets a link count notification. */
        if (target)
                fsnotify_link_count(target);

        fsnotify_inode(source, FS_MOVE_SELF);
        audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE);
}

/*
 * fsnotify_inode_delete - an inode is being evicted from cache, clean up is
 * needed.  Thin wrapper around __fsnotify_inode_delete().
 */
static inline void fsnotify_inode_delete(struct inode *inode)
{
        __fsnotify_inode_delete(inode);
}

/*
 * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed
 */
static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
        /* All the work is done by __fsnotify_vfsmount_delete(). */
        __fsnotify_vfsmount_delete(mnt);
}

/*
 * fsnotify_inoderemove - an inode is going away
 */
static inline void fsnotify_inoderemove(struct inode *inode)
{
        /* Send FS_DELETE_SELF first, then run the fsnotify inode cleanup. */
        fsnotify_inode(inode, FS_DELETE_SELF);
        __fsnotify_inode_delete(inode);
}

/*
 * fsnotify_create - 'name' was linked in
 *
 * Caller must make sure that dentry->d_name is stable.
 * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
 * ->d_inode later
 */
static inline void fsnotify_create(struct inode *dir, struct dentry *dentry)
{
        /* Record the new child for audit before sending the event. */
        audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);

        /* FS_CREATE is sent without FS_ISDIR; fsnotify_mkdir() adds it. */
        fsnotify_dirent(dir, dentry, FS_CREATE);
}

/*
 * fsnotify_link - a new hardlink to 'inode' was created in directory 'dir'
 *
 * Caller must make sure that new_dentry->d_name is stable.
 * Note: We also have to pass the linked inode pointer, as some filesystems
 *   leave new_dentry->d_inode NULL and instantiate the inode pointer later.
 */
static inline void fsnotify_link(struct inode *dir, struct inode *inode,
                                 struct dentry *new_dentry)
{
        /* The linked inode's link count changed. */
        fsnotify_link_count(inode);
        audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE);

        /* FS_CREATE for the new name in 'dir', with 'inode' as event data. */
        fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE,
                      dir, &new_dentry->d_name, 0);
}

/*
 * fsnotify_delete - @dentry was unlinked and unhashed
 *
 * Caller must make sure that dentry->d_name is stable.
 *
 * Note: unlike fsnotify_unlink(), we also have to pass the unlinked inode,
 * as this may be called after d_delete() and @dentry may be negative.
 */
static inline void fsnotify_delete(struct inode *dir, struct inode *inode,
                                   struct dentry *dentry)
{
        /* FS_ISDIR is added when the removed object is a directory. */
        const __u32 mask = S_ISDIR(inode->i_mode) ? (FS_DELETE | FS_ISDIR)
                                                  : FS_DELETE;

        /* The event carries @inode as data and @dentry's name under @dir. */
        fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir,
                      &dentry->d_name, 0);
}

/**
 * d_delete_notify - delete a dentry and call fsnotify_delete()
 * @dir: The directory passed to fsnotify_delete() as the parent
 * @dentry: The dentry to delete
 *
 * This helper is used to guarantee that the unlinked inode cannot be found
 * by lookup of this name after fsnotify_delete() event has been delivered.
 */
static inline void d_delete_notify(struct inode *dir, struct dentry *dentry)
{
        /* Fetch the inode before d_delete(); it is still needed afterwards. */
        struct inode *inode = d_inode(dentry);

        /* Hold a reference across d_delete() and the notification. */
        ihold(inode);
        d_delete(dentry);
        fsnotify_delete(dir, inode, dentry);
        iput(inode);
}

/*
 * fsnotify_unlink - 'name' was unlinked
 *
 * Caller must make sure that dentry->d_name is stable.
 */
static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
{
        /* A negative dentry triggers a one-time warning and no event. */
        if (!WARN_ON_ONCE(d_is_negative(dentry)))
                fsnotify_delete(dir, d_inode(dentry), dentry);
}

/*
 * fsnotify_mkdir - directory 'name' was created
 *
 * Caller must make sure that dentry->d_name is stable.
 * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
 * ->d_inode later
 */
static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry)
{
        /* Record the new child for audit before sending the event. */
        audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);

        /* Same event as fsnotify_create(), with FS_ISDIR added. */
        fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR);
}

/*
 * fsnotify_rmdir - directory 'name' was removed
 *
 * Caller must make sure that dentry->d_name is stable.
 */
static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry)
{
        /* A negative dentry triggers a one-time warning and no event. */
        if (!WARN_ON_ONCE(d_is_negative(dentry)))
                fsnotify_delete(dir, d_inode(dentry), dentry);
}

/*
 * fsnotify_access - file was read
 */
static inline void fsnotify_access(struct file *file)
{
        /* The return value of fsnotify_file() is discarded. */
        fsnotify_file(file, FS_ACCESS);
}

/*
 * fsnotify_modify - file was modified
 */
static inline void fsnotify_modify(struct file *file)
{
        /* The return value of fsnotify_file() is discarded. */
        fsnotify_file(file, FS_MODIFY);
}

/*
 * fsnotify_open - file was opened
 */
static inline void fsnotify_open(struct file *file)
{
        /* Opens for execution also report FS_OPEN_EXEC. */
        const __u32 mask = (file->f_flags & __FMODE_EXEC) ?
                                (FS_OPEN | FS_OPEN_EXEC) : FS_OPEN;

        fsnotify_file(file, mask);
}

/*
 * fsnotify_close - file was closed
 */
static inline void fsnotify_close(struct file *file)
{
        __u32 mask;

        /* Pick the close event by whether the file was opened for write. */
        if (file->f_mode & FMODE_WRITE)
                mask = FS_CLOSE_WRITE;
        else
                mask = FS_CLOSE_NOWRITE;

        fsnotify_file(file, mask);
}

/*
 * fsnotify_xattr - extended attributes were changed
 */
static inline void fsnotify_xattr(struct dentry *dentry)
{
        /* An xattr change is reported as FS_ATTRIB on the dentry. */
        fsnotify_dentry(dentry, FS_ATTRIB);
}

/*
 * fsnotify_change - notify_change event.  file was modified and/or metadata
 * was changed.
 */
static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
{
        /* Which of the two timestamps are being set. */
        const unsigned int times = ia_valid & (ATTR_ATIME | ATTR_MTIME);
        __u32 mask = 0;

        /* Owner, group and mode changes are all attribute changes. */
        if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
                mask |= FS_ATTRIB;

        /* A size change counts as a modification. */
        if (ia_valid & ATTR_SIZE)
                mask |= FS_MODIFY;

        if (times == (ATTR_ATIME | ATTR_MTIME))
                mask |= FS_ATTRIB;        /* both times implies a utime(s) call */
        else if (times == ATTR_ATIME)
                mask |= FS_ACCESS;
        else if (times == ATTR_MTIME)
                mask |= FS_MODIFY;

        /* No event when none of the handled attributes changed. */
        if (!mask)
                return;

        fsnotify_dentry(dentry, mask);
}

static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode,
                                    int error)
{
        struct fs_error_report report = {
                .error = error,
                .inode = inode,
                .sb = sb,
        };

        return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR,
                        NULL, NULL, NULL, 0);
}

#endif        /* _LINUX_FS_NOTIFY_H */





























    7 










































































































    2 






































































































    9 





    7 



    2 





















    9 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* I/O iterator iteration building functions.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_IOV_ITER_H
#define _LINUX_IOV_ITER_H

#include <linux/uio.h>
#include <linux/bvec.h>

/*
 * Step function types used by the iterate_*() helpers below.
 *
 * A step function is given the address of a segment (@iter_base), the amount
 * already iterated over (@progress), the segment length (@len) and the two
 * caller-supplied private pointers.  It returns the number of bytes of the
 * segment it did NOT process, so 0 means the segment was fully handled.
 *
 * iov_step_f takes a kernel address; iov_ustep_f takes a __user address.
 */
typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
                             void *priv, void *priv2);
typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len,
                              void *priv, void *priv2);

/*
 * Handle ITER_UBUF.
 *
 * The iterator holds a single user base pointer (iter->ubuf), so @step is
 * called exactly once for the whole request.  The iterator is advanced by
 * the number of bytes @step reports as processed, which is also returned.
 */
static __always_inline
size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                    iov_ustep_f step)
{
        void __user *addr = iter->ubuf + iter->iov_offset;
        size_t done = len - step(addr, 0, len, priv, priv2);

        iter->iov_offset += done;
        iter->count -= done;
        return done;
}

/*
 * Handle ITER_IOVEC.
 *
 * Walk the iovec array from the iterator's current position, calling @step
 * on each non-empty segment with a user address.  Stops when @len bytes have
 * been consumed or when a segment is left partly unconsumed.  The iterator's
 * segment pointer, segment count, offset and byte count are updated to the
 * new position.  Returns the number of bytes processed.
 */
static __always_inline
size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                     iov_ustep_f step)
{
        const struct iovec *p = iter->__iov;
        size_t progress = 0, skip = iter->iov_offset;

        do {
                size_t remain, consumed;
                /* Bytes left in this segment past @skip, capped at @len. */
                size_t part = min(len, p->iov_len - skip);

                if (likely(part)) {
                        remain = step(p->iov_base + skip, progress, part, priv, priv2);
                        consumed = part - remain;
                        progress += consumed;
                        skip += consumed;
                        len -= consumed;
                        /* Segment not used up: stop here and stay on it. */
                        if (skip < p->iov_len)
                                break;
                }
                /* Segment used up (or empty): move to the start of the next. */
                p++;
                skip = 0;
        } while (len);

        /* Write the new position back into the iterator. */
        iter->nr_segs -= p - iter->__iov;
        iter->__iov = p;
        iter->iov_offset = skip;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_KVEC.
 *
 * Walk the kvec array from the iterator's current position, calling @step
 * on each non-empty segment with a kernel address.  Stops when @len bytes
 * have been consumed or when a segment is left partly unconsumed.  The
 * iterator's segment pointer, segment count, offset and byte count are
 * updated to the new position.  Returns the number of bytes processed.
 */
static __always_inline
size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                    iov_step_f step)
{
        const struct kvec *p = iter->kvec;
        size_t progress = 0, skip = iter->iov_offset;

        do {
                size_t remain, consumed;
                /* Bytes left in this segment past @skip, capped at @len. */
                size_t part = min(len, p->iov_len - skip);

                if (likely(part)) {
                        remain = step(p->iov_base + skip, progress, part, priv, priv2);
                        consumed = part - remain;
                        progress += consumed;
                        skip += consumed;
                        len -= consumed;
                        /* Segment not used up: stop here and stay on it. */
                        if (skip < p->iov_len)
                                break;
                }
                /* Segment used up (or empty): move to the start of the next. */
                p++;
                skip = 0;
        } while (len);

        /* Write the new position back into the iterator. */
        iter->nr_segs -= p - iter->kvec;
        iter->kvec = p;
        iter->iov_offset = skip;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_BVEC.
 *
 * Walk the bio_vec array from the iterator's current position.  Each call to
 * @step covers at most the rest of one page: the page is mapped with
 * kmap_local_page() for the call and unmapped right after.  Stops when @len
 * bytes have been consumed or when @step leaves part of a chunk unprocessed.
 * Returns the number of bytes processed.
 */
static __always_inline
size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                    iov_step_f step)
{
        const struct bio_vec *p = iter->bvec;
        size_t progress = 0, skip = iter->iov_offset;

        do {
                size_t remain, consumed;
                /* Byte offset from the start of bv_page to the current position. */
                size_t offset = p->bv_offset + skip, part;
                /* Map the page that contains that offset. */
                void *kaddr = kmap_local_page(p->bv_page + offset / PAGE_SIZE);

                /* Limit the chunk by @len, the rest of the bvec, and the rest of the page. */
                part = min3(len,
                           (size_t)(p->bv_len - skip),
                           (size_t)(PAGE_SIZE - offset % PAGE_SIZE));
                remain = step(kaddr + offset % PAGE_SIZE, progress, part, priv, priv2);
                kunmap_local(kaddr);
                consumed = part - remain;
                len -= consumed;
                progress += consumed;
                skip += consumed;
                /* This bvec is used up: continue at the start of the next one. */
                if (skip >= p->bv_len) {
                        skip = 0;
                        p++;
                }
                /* The step function fell short: stop iterating. */
                if (remain)
                        break;
        } while (len);

        /* Write the new position back into the iterator. */
        iter->nr_segs -= p - iter->bvec;
        iter->bvec = p;
        iter->iov_offset = skip;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_XARRAY.
 *
 * Walk the folios stored in iter->xarray, starting at the index that holds
 * byte position xarray_start + iov_offset, under rcu_read_lock().  Each
 * folio is presented to @step in chunks of at most one page, mapped with
 * kmap_local_folio() for the duration of the call.  Iteration ends when @len
 * bytes have been consumed, when @step leaves part of a chunk unprocessed,
 * when the xarray walk ends, or when an unexpected entry (a value entry or a
 * hugetlb folio) is met.  Returns the number of bytes processed.
 */
static __always_inline
size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                      iov_step_f step)
{
        struct folio *folio;
        size_t progress = 0;
        /* Absolute byte position at which this iteration starts. */
        loff_t start = iter->xarray_start + iter->iov_offset;
        pgoff_t index = start / PAGE_SIZE;
        XA_STATE(xas, iter->xarray, index);

        rcu_read_lock();
        xas_for_each(&xas, folio, ULONG_MAX) {
                size_t remain, consumed, offset, part, flen;

                /* Skip entries for which xas_retry() asks for a retry. */
                if (xas_retry(&xas, folio))
                        continue;
                /* Value entries and hugetlb folios are not expected: warn and stop. */
                if (WARN_ON(xa_is_value(folio)))
                        break;
                if (WARN_ON(folio_test_hugetlb(folio)))
                        break;

                /* Position inside this folio, and bytes to take from it. */
                offset = offset_in_folio(folio, start + progress);
                flen = min(folio_size(folio) - offset, len);

                while (flen) {
                        void *base = kmap_local_folio(folio, offset);

                        /* Never run past the end of the mapped page. */
                        part = min_t(size_t, flen,
                                     PAGE_SIZE - offset_in_page(offset));
                        remain = step(base, progress, part, priv, priv2);
                        kunmap_local(base);

                        consumed = part - remain;
                        progress += consumed;
                        len -= consumed;

                        /* Short step or request satisfied: leave both loops. */
                        if (remain || len == 0)
                                goto out;
                        flen -= consumed;
                        offset += consumed;
                }
        }

out:
        rcu_read_unlock();
        /* Advance the iterator by what was processed. */
        iter->iov_offset += progress;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_DISCARD.
 *
 * No step function is called: the whole of @len is counted as processed and
 * taken off the iterator's byte count.  @priv, @priv2 and @step are unused.
 */
static __always_inline
size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                      iov_step_f step)
{
        iter->count -= len;
        return len;
}

/**
 * iterate_and_advance2 - Iterate over an iterator
 * @iter: The iterator to iterate over.
 * @len: The amount to iterate over.
 * @priv: Data for the step functions.
 * @priv2: More data for the step functions.
 * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
 * @step: Function for other iterators; given kernel addresses.
 *
 * Iterate over the next part of an iterator, up to the specified length.  The
 * buffer is presented in segments, which for kernel iteration are broken up by
 * physical pages and mapped, with the mapped address being presented.
 *
 * Two step functions, @step and @ustep, must be provided: one handles mapped
 * kernel addresses, the other is given user addresses, which have the
 * potential to fault since no pinning is performed.
 *
 * The step functions are passed the address and length of the segment, @priv,
 * @priv2 and the amount of data so far iterated over (which can, for example,
 * be added to @priv to point to the right part of a second buffer).  The step
 * functions should return the amount of the segment they didn't process (i.e.
 * 0 indicates complete processing).
 *
 * This function returns the amount of data processed (i.e. 0 means nothing
 * was processed and the value of @len means processed to completion).
 */
static __always_inline
size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
                            void *priv2, iov_ustep_f ustep, iov_step_f step)
{
        /* Never iterate past what the iterator still holds. */
        if (unlikely(len > iter->count))
                len = iter->count;
        if (unlikely(len == 0))
                return 0;

        /* User-backed iterators get @ustep; the rest get @step. */
        if (likely(iter_is_ubuf(iter)))
                return iterate_ubuf(iter, len, priv, priv2, ustep);
        else if (likely(iter_is_iovec(iter)))
                return iterate_iovec(iter, len, priv, priv2, ustep);
        else if (iov_iter_is_bvec(iter))
                return iterate_bvec(iter, len, priv, priv2, step);
        else if (iov_iter_is_kvec(iter))
                return iterate_kvec(iter, len, priv, priv2, step);
        else if (iov_iter_is_xarray(iter))
                return iterate_xarray(iter, len, priv, priv2, step);

        /* Any other iterator type is handled as a discard iterator. */
        return iterate_discard(iter, len, priv, priv2, step);
}

/**
 * iterate_and_advance - Iterate over an iterator
 * @iter: The iterator to iterate over.
 * @len: The amount to iterate over.
 * @priv: Data for the step functions.
 * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
 * @step: Function for other iterators; given kernel addresses.
 *
 * As iterate_and_advance2(), but priv2 is always NULL.
 */
static __always_inline
size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv,
                           iov_ustep_f ustep, iov_step_f step)
{
        /* Forward to the two-pointer variant with the second pointer unused. */
        return iterate_and_advance2(iter, len, priv, NULL, ustep, step);
}

#endif /* _LINUX_IOV_ITER_H */






















































































































































































































































































































































































































































































































































































































































    7 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SWAP_H
#define _LINUX_SWAP_H

#include <linux/spinlock.h>
#include <linux/linkage.h>
#include <linux/mmzone.h>
#include <linux/list.h>
#include <linux/memcontrol.h>
#include <linux/sched.h>
#include <linux/node.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/page-flags.h>
#include <uapi/linux/mempolicy.h>
#include <asm/page.h>

struct notifier_block;

struct bio;

struct pagevec;

#define SWAP_FLAG_PREFER        0x8000        /* set if swap priority specified */
#define SWAP_FLAG_PRIO_MASK        0x7fff
#define SWAP_FLAG_PRIO_SHIFT        0
#define SWAP_FLAG_DISCARD        0x10000 /* enable discard for swap */
#define SWAP_FLAG_DISCARD_ONCE        0x20000 /* discard swap area at swapon-time */
#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */

#define SWAP_FLAGS_VALID        (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
                                 SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
                                 SWAP_FLAG_DISCARD_PAGES)
#define SWAP_BATCH 64

/* Returns non-zero when the current task has PF_KSWAPD set in its flags. */
static inline int current_is_kswapd(void)
{
        return current->flags & PF_KSWAPD;
}

/*
 * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
 * be swapped to.  The swap type and the offset into that swap type are
 * encoded into pte's and into pgoff_t's in the swapcache.  Using five bits
 * for the type means that the maximum number of swapcache pages is 27 bits
 * on 32-bit-pgoff_t architectures.  And that assumes that the architecture packs
 * the type/offset into the pte as 5/27 as well.
 */
#define MAX_SWAPFILES_SHIFT        5

/*
 * Use some of the swap files numbers for other purposes. This
 * is a convenient way to hook into the VM to trigger special
 * actions on faults.
 */

/*
 * PTE markers are used to persist information onto PTEs that otherwise
 * should be a none pte.  As its name "PTE" hints, it should only be
 * applied to the leaves of pgtables.
 */
#define SWP_PTE_MARKER_NUM 1
#define SWP_PTE_MARKER     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
                            SWP_MIGRATION_NUM + SWP_DEVICE_NUM)

/*
 * Unaddressable device memory support. See include/linux/hmm.h and
 * Documentation/mm/hmm.rst. Short description is we need struct pages for
 * device memory that is unaddressable (inaccessible) by CPU, so that we can
 * migrate part of a process memory to device memory.
 *
 * When a page is migrated from CPU to device, we set the CPU page table entry
 * to a special SWP_DEVICE_{READ|WRITE} entry.
 *
 * When a page is mapped by the device for exclusive access we set the CPU page
 * table entries to special SWP_DEVICE_EXCLUSIVE_* entries.
 */
#ifdef CONFIG_DEVICE_PRIVATE
#define SWP_DEVICE_NUM 4
#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
#define SWP_DEVICE_EXCLUSIVE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
#define SWP_DEVICE_EXCLUSIVE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+3)
#else
#define SWP_DEVICE_NUM 0
#endif

/*
 * Page migration support.
 *
 * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and
 * indicates that the referenced (part of) an anonymous page is exclusive to
 * a single process. For SWP_MIGRATION_WRITE, that information is implicit:
 * (part of) an anonymous page that are mapped writable are exclusive to a
 * single process.
 */
#ifdef CONFIG_MIGRATION
#define SWP_MIGRATION_NUM 3
#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM)
#define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2)
#else
#define SWP_MIGRATION_NUM 0
#endif

/*
 * Handling of hardware poisoned pages with memory corruption.
 */
#ifdef CONFIG_MEMORY_FAILURE
#define SWP_HWPOISON_NUM 1
#define SWP_HWPOISON                MAX_SWAPFILES
#else
#define SWP_HWPOISON_NUM 0
#endif

#define MAX_SWAPFILES \
        ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
        SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
        SWP_PTE_MARKER_NUM)

/*
 * Magic header for a swap area. The first part of the union is
 * what the swap magic looks like for the old (limited to 128MB)
 * swap area format, the second part of the union adds - in the
 * old reserved area - some extra information. Note that the first
 * kilobyte is reserved for boot loader or disk label stuff...
 *
 * Having the magic at the end of the PAGE_SIZE makes detecting swap
 * areas somewhat tricky on machines that support multiple page sizes.
 * For 2.5 we'll probably want to move the magic to just beyond the
 * bootbits...
 */
union swap_header {
        /* View 1: the magic string occupies the last 10 bytes of the page. */
        struct {
                char reserved[PAGE_SIZE - 10];
                char magic[10];                        /* SWAP-SPACE or SWAPSPACE2 */
        } magic;
        /* View 2: extra information stored after the first kilobyte. */
        struct {
                char                bootbits[1024];        /* Space for disklabel etc. */
                __u32                version;
                __u32                last_page;
                __u32                nr_badpages;
                unsigned char        sws_uuid[16];
                unsigned char        sws_volume[16];
                __u32                padding[117];
                /* Start of the bad-page list; MAX_SWAP_BADPAGES gives its capacity. */
                __u32                badpages[1];
        } info;
};

/*
 * current->reclaim_state points to one of these when a task is running
 * memory reclaim
 */
struct reclaim_state {
        /*
         * pages reclaimed outside of LRU-based reclaim; incremented by
         * mm_account_reclaimed_pages()
         */
        unsigned long reclaimed;
#ifdef CONFIG_LRU_GEN
        /* per-thread mm walk data */
        struct lru_gen_mm_walk *mm_walk;
#endif
};

/*
 * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based
 * reclaim
 * @pages: number of pages reclaimed
 *
 * If the current process is undergoing a reclaim operation, increment the
 * number of reclaimed pages by @pages.
 */
static inline void mm_account_reclaimed_pages(unsigned long pages)
{
        if (current->reclaim_state)
                current->reclaim_state->reclaimed += pages;
}

#ifdef __KERNEL__

struct address_space;
struct sysinfo;
struct writeback_control;
struct zone;

/*
 * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
 * disk blocks.  A rbtree of swap extents maps the entire swapfile (Where the
 * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart
 * from setup, they're handled identically.
 *
 * We always assume that blocks are of size PAGE_SIZE.
 */
struct swap_extent {
        struct rb_node rb_node;                /* links the extent into the rbtree of extents */
        pgoff_t start_page;                /* presumably first swapfile page covered - confirm */
        pgoff_t nr_pages;                /* presumably number of pages covered - confirm */
        sector_t start_block;                /* presumably first disk block mapped - confirm */
};

/*
 * Max bad pages in the new format.
 */
#define MAX_SWAP_BADPAGES \
        ((offsetof(union swap_header, magic.magic) - \
          offsetof(union swap_header, info.badpages)) / sizeof(int))

/* Bit flags describing the state of a swap area. */
enum {
        SWP_USED        = (1 << 0),        /* is slot in swap_info[] used? */
        SWP_WRITEOK        = (1 << 1),        /* ok to write to this swap?        */
        SWP_DISCARDABLE = (1 << 2),        /* blkdev support discard */
        SWP_DISCARDING        = (1 << 3),        /* now discarding a free cluster */
        SWP_SOLIDSTATE        = (1 << 4),        /* blkdev seeks are cheap */
        SWP_CONTINUED        = (1 << 5),        /* swap_map has count continuation */
        SWP_BLKDEV        = (1 << 6),        /* it's a block device */
        SWP_ACTIVATED        = (1 << 7),        /* set after swap_activate success */
        SWP_FS_OPS        = (1 << 8),        /* swapfile operations go through fs */
        SWP_AREA_DISCARD = (1 << 9),        /* single-time swap area discards */
        SWP_PAGE_DISCARD = (1 << 10),        /* freed swap page-cluster discards */
        SWP_STABLE_WRITES = (1 << 11),        /* no overwrite PG_writeback pages */
        SWP_SYNCHRONOUS_IO = (1 << 12),        /* synchronous IO is efficient */
                                        /* add others here before... */
        SWP_SCANNING        = (1 << 14),        /* refcount in scan_swap_map */
};

#define SWAP_CLUSTER_MAX 32UL
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX

/* Bit flags stored in a swap_map byte (the two high bits) */
#define SWAP_HAS_CACHE        0x40        /* Flag page is cached, in first swap_map */
#define COUNT_CONTINUED        0x80        /* Flag swap_map continuation for full count */

/*
 * Special values in the first swap_map byte.  SWAP_MAP_MAX and SWAP_MAP_BAD
 * lie below SWAP_HAS_CACHE; SWAP_MAP_SHMEM (0xbf) is numerically
 * COUNT_CONTINUED | SWAP_MAP_BAD.
 */
#define SWAP_MAP_MAX        0x3e        /* Max count */
#define SWAP_MAP_BAD        0x3f        /* Note page is bad */
#define SWAP_MAP_SHMEM        0xbf        /* Owned by shmem/tmpfs */

/* Special value in each swap_map continuation byte */
#define SWAP_CONT_MAX        0x7f        /* Max count */

/*
 * We use this to track usage of a cluster. A cluster is a block of swap disk
 * space SWAPFILE_CLUSTER pages long that is naturally aligned on disk. All
 * free clusters are organized into a list. We fetch an entry from the list to
 * get a free cluster.
 *
 * The data field stores the next cluster if the cluster is free, or the
 * cluster usage counter otherwise. The flags field determines if a cluster is
 * free. This is protected by swap_info_struct.lock.
 *
 * NOTE(review): the structure also carries its own spinlock, documented below
 * as protecting these fields; confirm which lock is authoritative.
 */
struct swap_cluster_info {
        spinlock_t lock;        /*
                                 * Protects the swap_cluster_info fields
                                 * and the swap_info_struct->swap_map
                                 * elements that correspond to this swap
                                 * cluster
                                 */
        unsigned int data:24;        /* next cluster (if free) or usage count */
        unsigned int flags:8;        /* CLUSTER_FLAG_* bits */
};
#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */

/*
 * The first page in the swap file is the swap header, which is always marked
 * bad to prevent it from being allocated as an entry. This also prevents the
 * cluster to which it belongs from being marked free. Therefore 0 is safe to
 * use as a sentinel meaning "next is not valid" in percpu_cluster.
 */
#define SWAP_NEXT_INVALID        0

/*
 * Number of entries in percpu_cluster.next[]: PMD_ORDER + 1 when
 * CONFIG_THP_SWAP is enabled, otherwise a single entry.
 */
#ifdef CONFIG_THP_SWAP
#define SWAP_NR_ORDERS                (PMD_ORDER + 1)
#else
#define SWAP_NR_ORDERS                1
#endif

/*
 * We assign a cluster to each CPU, so each CPU can allocate swap entries from
 * its own cluster and swap out sequentially. The purpose is to optimize
 * swapout throughput.
 *
 * next[] has SWAP_NR_ORDERS entries; an entry equal to SWAP_NEXT_INVALID
 * means there is no valid next offset.
 */
struct percpu_cluster {
        unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
};

/*
 * A list of clusters described by a head and a tail swap_cluster_info.
 * NOTE(review): presumably head/tail reuse the data and flags fields to hold
 * cluster indices and CLUSTER_FLAG_NEXT_NULL - confirm in mm/swapfile.c.
 */
struct swap_cluster_list {
        struct swap_cluster_info head;
        struct swap_cluster_info tail;
};

/*
 * The in-memory structure used to track swap areas.
 *
 * avail_lists[] is a flexible array member and must stay last.
 */
struct swap_info_struct {
        struct percpu_ref users;        /* indicate and keep swap device valid. */
        unsigned long        flags;                /* SWP_USED etc: see above */
        signed short        prio;                /* swap priority of this type */
        struct plist_node list;                /* entry in swap_active_head */
        signed char        type;                /* strange name for an index */
        unsigned int        max;                /* extent of the swap_map */
        unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
        struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
        struct swap_cluster_list free_clusters; /* free clusters list */
        unsigned int lowest_bit;        /* index of first free in swap_map */
        unsigned int highest_bit;        /* index of last free in swap_map */
        unsigned int pages;                /* total of usable pages of swap */
        unsigned int inuse_pages;        /* number of those currently in use */
        unsigned int cluster_next;        /* likely index for next allocation */
        unsigned int cluster_nr;        /* countdown to next cluster search */
        unsigned int __percpu *cluster_next_cpu; /* percpu index for next allocation */
        struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
        struct rb_root swap_extent_root;/* root of the swap extent rbtree */
        struct block_device *bdev;        /* swap device or bdev of swap file */
        struct file *swap_file;                /* seldom referenced */
        struct completion comp;                /* seldom referenced */
        spinlock_t lock;                /*
                                         * protects map scan related fields like
                                         * swap_map, lowest_bit, highest_bit,
                                         * inuse_pages, cluster_next,
                                         * cluster_nr, lowest_alloc,
                                         * highest_alloc, free/discard cluster
                                         * list. Other fields are only changed
                                         * at swapon/swapoff, so are protected
                                         * by swap_lock. Changing flags needs
                                         * both this lock and swap_lock. If
                                         * both locks are needed, take
                                         * swap_lock first.
                                         *
                                         * NOTE(review): lowest_alloc and
                                         * highest_alloc are not members of
                                         * this struct as shown here; the
                                         * list above may be stale.
                                         */
        spinlock_t cont_lock;                /*
                                         * protect swap count continuation page
                                         * list.
                                         */
        struct work_struct discard_work; /* discard worker */
        struct swap_cluster_list discard_clusters; /* discard clusters list */
        struct plist_node avail_lists[]; /*
                                           * entries in swap_avail_heads, one
                                           * entry per node.
                                           * Must be last as the number of the
                                           * array is nr_node_ids, which is not
                                           * a fixed value so have to allocate
                                           * dynamically.
                                           * And it has to be an array so that
                                           * plist_for_each_* can work.
                                           */
};

/*
 * Return the swap entry for @page: the swap entry of the folio that contains
 * it, advanced by the page's index within that folio.
 */
static inline swp_entry_t page_swap_entry(struct page *page)
{
        struct folio *owner = page_folio(page);
        swp_entry_t result = owner->swap;

        /* Offset from the folio's first entry to this page's entry. */
        result.val += folio_page_idx(owner, page);

        return result;
}

/* linux/mm/workingset.c - declarations only; the definitions live there */
bool workingset_test_recent(void *shadow, bool file, bool *workingset);
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
void workingset_refault(struct folio *folio, void *shadow);
void workingset_activation(struct folio *folio);

/* linux/mm/page_alloc.c */
extern unsigned long totalreserve_pages;

/*
 * nr_free_pages() is a macro rather than an inline function because the
 * definition of global_zone_page_state is not available yet at this point.
 */
#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)


/* linux/mm/swap.c - declarations only; the definitions live there */
void lru_note_cost(struct lruvec *lruvec, bool file,
                   unsigned int nr_io, unsigned int nr_rotated);
void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
void mark_page_accessed(struct page *);
void folio_mark_accessed(struct folio *);

/* Counter read by lru_cache_disabled() and decremented by lru_cache_enable() */
extern atomic_t lru_disable_count;

/* Return true while lru_disable_count is non-zero. */
static inline bool lru_cache_disabled(void)
{
        return atomic_read(&lru_disable_count) != 0;
}

/*
 * Drop one count from lru_disable_count.
 * NOTE(review): presumably pairs with lru_cache_disable(), declared below -
 * confirm in mm/swap.c.
 */
static inline void lru_cache_enable(void)
{
        atomic_dec(&lru_disable_count);
}

/* Further linux/mm/swap.c entry points - declarations only */
extern void lru_cache_disable(void);
extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_cpu_zone(struct zone *zone);
extern void lru_add_drain_all(void);
void folio_deactivate(struct folio *folio);
void folio_mark_lazyfree(struct folio *folio);
extern void swap_setup(void);

/* linux/mm/vmscan.c - declarations only; the definitions live there */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                        gfp_t gfp_mask, nodemask_t *mask);

/*
 * NOTE(review): presumably bit flags for the reclaim_options argument of
 * try_to_free_mem_cgroup_pages() - confirm against mm/vmscan.c.
 */
#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                                  unsigned long nr_pages,
                                                  gfp_t gfp_mask,
                                                  unsigned int reclaim_options);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
                                                gfp_t gfp_mask, bool noswap,
                                                pg_data_t *pgdat,
                                                unsigned long *nr_scanned);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
long remove_mapping(struct address_space *mapping, struct folio *folio);

/*
 * With CONFIG_NUMA these are real variables defined elsewhere; without it,
 * node_reclaim_mode is the constant 0 and the two sysctl variables are not
 * declared at all.
 */
#ifdef CONFIG_NUMA
extern int node_reclaim_mode;
extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;
#else
#define node_reclaim_mode 0
#endif

/*
 * Return true when at least one of the RECLAIM_ZONE, RECLAIM_WRITE or
 * RECLAIM_UNMAP bits is set in node_reclaim_mode.
 */
static inline bool node_reclaim_enabled(void)
{
        return (node_reclaim_mode &
                (RECLAIM_UNMAP | RECLAIM_WRITE | RECLAIM_ZONE)) != 0;
}

/* Declarations only; each takes the arguments shown and returns nothing. */
void check_move_unevictable_folios(struct folio_batch *fbatch);

/* Per-node kswapd control; @nid is the node id. */
extern void __meminit kswapd_run(int nid);
extern void __meminit kswapd_stop(int nid);

#ifdef CONFIG_SWAP

/*
 * Swap extent setup, available only with CONFIG_SWAP.  The !CONFIG_SWAP
 * branch of this header provides an add_swap_extent() stub returning -EINVAL.
 */
int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
                unsigned long nr_pages, sector_t start_block);
int generic_swapfile_activate(struct swap_info_struct *, struct file *,
                sector_t *);

/* Return the global NR_SWAPCACHE node page-state counter. */
static inline unsigned long total_swapcache_pages(void)
{
        unsigned long nr_cached = global_node_page_state(NR_SWAPCACHE);

        return nr_cached;
}

/* Swap cache release helpers - declarations only */
void free_swap_cache(struct folio *folio);
void free_page_and_swap_cache(struct page *);
void free_pages_and_swap_cache(struct encoded_page **, int);
/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;        /* read by get_nr_swap_pages() */
extern long total_swap_pages;                /* compared in vm_swap_full() */
extern atomic_t nr_rotate_swap;
extern bool has_usable_swap(void);

/*
 * Is swap more than 50% full?  True when twice the nr_swap_pages counter is
 * still below total_swap_pages.  Used to release swapcache more aggressively.
 */
static inline bool vm_swap_full(void)
{
        long nr_free = atomic_long_read(&nr_swap_pages);

        return total_swap_pages > nr_free * 2;
}

/* Return the current value of the nr_swap_pages counter. */
static inline long get_nr_swap_pages(void)
{
        long nr_pages = atomic_long_read(&nr_swap_pages);

        return nr_pages;
}

/*
 * linux/mm/swapfile.c entry points - declarations only.  Several of these
 * have no-op or constant-returning stubs in the !CONFIG_SWAP branch below.
 */
extern void si_swapinfo(struct sysinfo *);
swp_entry_t folio_alloc_swap(struct folio *folio);
bool folio_free_swap(struct folio *folio);
void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t);
extern int swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t);
extern void swap_free(swp_entry_t);
extern void swapcache_free_entries(swp_entry_t *entries, int n);
extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
extern sector_t swapdev_block(int, pgoff_t);
extern int __swap_count(swp_entry_t entry);
extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
struct swap_info_struct *swp_swap_info(swp_entry_t entry);
struct backing_dev_info;        /* forward declaration */
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_folio_sector(struct folio *folio);

/*
 * Drop one reference on @si->users.
 * NOTE(review): presumably the counterpart of get_swap_device() - confirm.
 */
static inline void put_swap_device(struct swap_info_struct *si)
{
        percpu_ref_put(&si->users);
}

#else /* CONFIG_SWAP */
/* !CONFIG_SWAP stub: there is no swap info, always returns NULL. */
static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
        return NULL;
}

/* !CONFIG_SWAP stub: there is no swap device, always returns NULL. */
static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
        return NULL;
}

/* !CONFIG_SWAP stub: does nothing. */
static inline void put_swap_device(struct swap_info_struct *si)
{
}

/*
 * !CONFIG_SWAP replacements: every swap counter and predicate is a constant
 * zero.
 */
#define get_nr_swap_pages()                        0L
#define total_swap_pages                        0L
#define total_swapcache_pages()                        0UL
#define vm_swap_full()                                0

/* !CONFIG_SWAP: report zero free and zero total swap in @val. */
#define si_swapinfo(val) \
        do { (val)->freeswap = (val)->totalswap = 0; } while (0)
/*
 * Only sparc cannot include linux/pagemap.h in this file, so put_page() and
 * release_pages() are left undeclared here and these two are macros.
 *
 * Neither expansion ends in a semicolon: the caller supplies it.  A trailing
 * ';' inside the macro would expand to two statements and break use as an
 * un-braced if/else body.
 */
#define free_page_and_swap_cache(page) \
        put_page(page)
#define free_pages_and_swap_cache(pages, nr) \
        release_pages((pages), (nr))

/* !CONFIG_SWAP stub: does nothing. */
static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
}

/* !CONFIG_SWAP stub: does nothing. */
static inline void free_swap_cache(struct folio *folio)
{
}

/* !CONFIG_SWAP stub: always returns 0. */
static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
        return 0;
}

/* !CONFIG_SWAP stub: does nothing. */
static inline void swap_shmem_alloc(swp_entry_t swp)
{
}

/* !CONFIG_SWAP stub: always returns 0. */
static inline int swap_duplicate(swp_entry_t swp)
{
        return 0;
}

/* !CONFIG_SWAP stub: always returns 0. */
static inline int swapcache_prepare(swp_entry_t swp)
{
        return 0;
}

/* !CONFIG_SWAP stub: does nothing. */
static inline void swap_free(swp_entry_t swp)
{
}

/* !CONFIG_SWAP stub: does nothing. */
static inline void put_swap_folio(struct folio *folio, swp_entry_t swp)
{
}

/* !CONFIG_SWAP stub: the swap count is always 0. */
static inline int __swap_count(swp_entry_t entry)
{
        return 0;
}

/* !CONFIG_SWAP stub: the swap count is always 0; @si is ignored. */
static inline int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
        return 0;
}

/* !CONFIG_SWAP stub: the swap count is always 0. */
static inline int swp_swapcount(swp_entry_t entry)
{
        return 0;
}

static inline swp_entry_t folio_alloc_swap(struct folio *folio)
{
        swp_entry_t entry;
        entry.val = 0;
        return entry;
}

/* !CONFIG_SWAP stub: always returns false. */
static inline bool folio_free_swap(struct folio *folio)
{
        return false;
}

/* !CONFIG_SWAP stub: extents cannot be added; always fails with -EINVAL. */
static inline int add_swap_extent(struct swap_info_struct *sis,
                                  unsigned long start_page,
                                  unsigned long nr_pages, sector_t start_block)
{
        return -EINVAL;
}
#endif /* CONFIG_SWAP */

/* Single-entry convenience wrapper: free_swap_and_cache_nr() with nr == 1. */
static inline void free_swap_and_cache(swp_entry_t entry)
{
        free_swap_and_cache_nr(entry, 1);
}

#ifdef CONFIG_MEMCG
/*
 * Return the swappiness that applies to @memcg.
 *
 * The global vm_swappiness is used when the memory controller is on the
 * default hierarchy (cgroup2 has no per-cgroup swappiness), when memcg is
 * disabled, or when @memcg is the root group.  Otherwise the group's own
 * swappiness value is returned.
 */
static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
            mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
                return READ_ONCE(vm_swappiness);

        return READ_ONCE(memcg->swappiness);
}
#else
/* !CONFIG_MEMCG: @mem is ignored; always returns the global vm_swappiness. */
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
        return READ_ONCE(vm_swappiness);
}
#endif

#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);

/*
 * Forward to __folio_throttle_swaprate() unless the memory controller is
 * disabled, in which case this is a no-op.
 */
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
        if (!mem_cgroup_disabled())
                __folio_throttle_swaprate(folio, gfp);
}
#else
/*
 * Stub used unless CONFIG_SWAP, CONFIG_MEMCG and CONFIG_BLK_CGROUP are all
 * enabled: does nothing.
 */
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
}
#endif

#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry);

/*
 * Return 0 without doing anything when the memory controller is disabled;
 * otherwise return the result of __mem_cgroup_try_charge_swap().
 */
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
                swp_entry_t entry)
{
        return mem_cgroup_disabled() ?
                0 : __mem_cgroup_try_charge_swap(folio, entry);
}

extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);

/*
 * Forward to __mem_cgroup_uncharge_swap() unless the memory controller is
 * disabled, in which case this is a no-op.
 */
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
        if (!mem_cgroup_disabled())
                __mem_cgroup_uncharge_swap(entry, nr_pages);
}

/*
 * Per-memcg variants, declared only when CONFIG_MEMCG and CONFIG_SWAP are
 * both enabled.  The #else branch maps them to get_nr_swap_pages() and
 * vm_swap_full().
 */
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
extern bool mem_cgroup_swap_full(struct folio *folio);
#else
/* Stub (CONFIG_MEMCG or CONFIG_SWAP disabled): does nothing. */
static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
}

/* Stub (CONFIG_MEMCG or CONFIG_SWAP disabled): always returns 0. */
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
                                             swp_entry_t entry)
{
        return 0;
}

/* Stub (CONFIG_MEMCG or CONFIG_SWAP disabled): does nothing. */
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
                                            unsigned int nr_pages)
{
}

/*
 * Fallback (CONFIG_MEMCG or CONFIG_SWAP disabled): @memcg is ignored and the
 * global get_nr_swap_pages() value is returned.
 */
static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
        return get_nr_swap_pages();
}

/*
 * Fallback (CONFIG_MEMCG or CONFIG_SWAP disabled): @folio is ignored and the
 * global vm_swap_full() result is returned.
 */
static inline bool mem_cgroup_swap_full(struct folio *folio)
{
        return vm_swap_full();
}
#endif

#endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */





































































































































































































































   14 
   14 


















   18 









   18 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BACKING_DEV_DEFS_H
#define __LINUX_BACKING_DEV_DEFS_H

#include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/percpu_counter.h>
#include <linux/percpu-refcount.h>
#include <linux/flex_proportions.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/refcount.h>

struct page;
struct device;
struct dentry;

/*
 * Bits in bdi_writeback.state.  These are bit numbers, not masks; the state
 * word is documented as "always use atomic bitops on this".
 */
enum wb_state {
        WB_registered,                /* bdi_register() was done */
        WB_writeback_running,        /* Writeback is in progress */
        WB_has_dirty_io,        /* Dirty inodes on ->b_{dirty|io|more_io} */
        WB_start_all,                /* nr_pages == 0 (all) work pending */
};

/*
 * Indices into bdi_writeback.stat[]; NR_WB_STAT_ITEMS is the array size and
 * must stay last.
 */
enum wb_stat_item {
        WB_RECLAIMABLE,
        WB_WRITEBACK,
        WB_DIRTIED,
        WB_WRITTEN,
        NR_WB_STAT_ITEMS
};

/*
 * Evaluates to 8 * (1 + ilog2(nr_cpu_ids)), so it grows with the logarithm of
 * the number of CPU ids.
 * NOTE(review): presumably the percpu_counter batch size used for the wb
 * stat counters - confirm at the call sites.
 */
#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))

/*
 * Why some writeback work was initiated.  WB_REASON_MAX is the number of
 * reasons and must stay last.
 */
enum wb_reason {
        WB_REASON_BACKGROUND,
        WB_REASON_VMSCAN,
        WB_REASON_SYNC,
        WB_REASON_PERIODIC,
        WB_REASON_LAPTOP_TIMER,
        WB_REASON_FS_FREE_SPACE,
        /*
         * There is no bdi forker thread any more and the work is done by
         * an emergency worker.  However, this value is visible to userland
         * through tracepoints (TPs) and we'll be exposing exactly the same
         * information, so the name no longer matches what it describes.
         */
        WB_REASON_FORKER_THREAD,
        WB_REASON_FOREIGN_FLUSH,

        WB_REASON_MAX,
};

/*
 * Completion tracker for writeback work items: a counter plus the wait queue
 * that waiters sleep on.
 */
struct wb_completion {
        atomic_t                cnt;        /* initialised to 1 by __WB_COMPLETION_INIT */
        wait_queue_head_t        *waitq;        /* wait queue supplied at init time */
};

/* Compound-literal initializer: cnt starts at 1 and waitq points at @_waitq. */
#define __WB_COMPLETION_INIT(_waitq)        \
        (struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) }

/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro.  Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion().  Work
 * items which are waited upon aren't freed automatically on completion.
 *
 * WB_COMPLETION_INIT() binds the completion to @bdi's wb_waitq.
 */
#define WB_COMPLETION_INIT(bdi)                __WB_COMPLETION_INIT(&(bdi)->wb_waitq)

/* Define a struct wb_completion variable @cmpl initialised for @bdi. */
#define DEFINE_WB_COMPLETION(cmpl, bdi)        \
        struct wb_completion cmpl = WB_COMPLETION_INIT(bdi)

/*
 * Each wb (bdi_writeback) can perform writeback operations, is measured
 * and throttled, independently.  Without cgroup writeback, each bdi
 * (backing_dev_info) is served by its embedded bdi->wb.
 *
 * On the default hierarchy, blkcg implicitly enables memcg.  This allows
 * using memcg's page ownership for attributing writeback IOs, and every
 * memcg - blkcg combination can be served by its own wb by assigning a
 * dedicated wb to each memcg, which enables isolation across different
 * cgroups and propagation of IO back pressure down from the IO layer up to
 * the tasks which are generating the dirty pages to be written back.
 *
 * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
 * refcounted with the number of inodes attached to it, and pins the memcg
 * and the corresponding blkcg.  As the corresponding blkcg for a memcg may
 * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
 * is tested for blkcg after lookup and removed from index on mismatch so
 * that a new wb for the combination can be created.
 *
 * Each bdi_writeback that is not embedded into the backing_dev_info must hold
 * a reference to the parent backing_dev_info.  See cgwb_create() for details.
 */
struct bdi_writeback {
        struct backing_dev_info *bdi;        /* our parent bdi */

        unsigned long state;                /* Always use atomic bitops on this */
        unsigned long last_old_flush;        /* last old data flush */

        struct list_head b_dirty;        /* dirty inodes */
        struct list_head b_io;                /* parked for writeback */
        struct list_head b_more_io;        /* parked for more writeback */
        struct list_head b_dirty_time;        /* time stamps are dirty */
        spinlock_t list_lock;                /* protects the b_* lists */

        atomic_t writeback_inodes;        /* number of inodes under writeback */
        struct percpu_counter stat[NR_WB_STAT_ITEMS]; /* indexed by enum wb_stat_item */

        unsigned long bw_time_stamp;        /* last time write bw is updated */
        unsigned long dirtied_stamp;        /* NOTE(review): presumably pages dirtied at bw_time_stamp - confirm */
        unsigned long written_stamp;        /* pages written at bw_time_stamp */
        unsigned long write_bandwidth;        /* the estimated write bandwidth */
        unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */

        /*
         * The base dirty throttle rate, re-calculated on every 200ms.
         * All the bdi tasks' dirty rate will be curbed under it.
         * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
         * in small steps and is much more smooth/stable than the latter.
         */
        unsigned long dirty_ratelimit;
        unsigned long balanced_dirty_ratelimit;

        struct fprop_local_percpu completions;
        int dirty_exceeded;
        enum wb_reason start_all_reason;

        spinlock_t work_lock;                /* protects work_list & dwork scheduling */
        struct list_head work_list;
        struct delayed_work dwork;        /* work item used for writeback */
        struct delayed_work bw_dwork;        /* work item used for bandwidth estimate */

        struct list_head bdi_node;        /* anchored at bdi->wb_list */

#ifdef CONFIG_CGROUP_WRITEBACK
        struct percpu_ref refcnt;        /* used only for !root wb's */
        struct fprop_local_percpu memcg_completions;
        struct cgroup_subsys_state *memcg_css; /* the associated memcg */
        struct cgroup_subsys_state *blkcg_css; /* and blkcg */
        struct list_head memcg_node;        /* anchored at memcg->cgwb_list */
        struct list_head blkcg_node;        /* anchored at blkcg->cgwb_list */
        struct list_head b_attached;        /* attached inodes, protected by list_lock */
        struct list_head offline_node;        /* anchored at offline_cgwbs */

        /* release_work and rcu share storage */
        union {
                struct work_struct release_work;
                struct rcu_head rcu;
        };
#endif
};

/*
 * Per-backing-device state.  Embeds the root bdi_writeback (wb) and anchors
 * the list of all wbs that belong to this device.
 */
struct backing_dev_info {
        u64 id;
        struct rb_node rb_node; /* keyed by ->id */
        struct list_head bdi_list;
        unsigned long ra_pages;        /* max readahead in PAGE_SIZE units */
        unsigned long io_pages;        /* max allowed IO size */

        struct kref refcnt;        /* Reference counter for the structure */
        unsigned int capabilities; /* Device capabilities */
        unsigned int min_ratio;
        unsigned int max_ratio, max_prop_frac;

        /*
         * Sum of avg_write_bw of wbs with dirty inodes.  > 0 if there are
         * any dirty wbs, which is depended upon by bdi_has_dirty().
         */
        atomic_long_t tot_write_bandwidth;
        /*
         * Jiffies when last process was dirty throttled on this bdi. Used by
         * blk-wbt.
         */
        unsigned long last_bdp_sleep;

        struct bdi_writeback wb;  /* the root writeback info for this bdi */
        struct list_head wb_list; /* list of all wbs */
#ifdef CONFIG_CGROUP_WRITEBACK
        struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
        struct mutex cgwb_release_mutex;  /* protect shutdown of wb structs */
        struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
#endif
        wait_queue_head_t wb_waitq;        /* wait queue used by WB_COMPLETION_INIT() */

        struct device *dev;
        char dev_name[64];
        struct device *owner;

        struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS
        struct dentry *debug_dir;
#endif
};

/*
 * Cookie carrying a "locked" flag and a saved flags word.
 * NOTE(review): the users are outside this chunk; presumably it records
 * whether a lock was taken and the IRQ flags to restore - confirm.
 */
struct wb_lock_cookie {
        bool locked;
        unsigned long flags;
};

#ifdef CONFIG_CGROUP_WRITEBACK

/**
 * wb_tryget - try to increment a wb's refcount
 * @wb: bdi_writeback to get
 *
 * The root wb embedded in its bdi does not use the refcount, so this always
 * succeeds for it.
 *
 * Return: true for the root wb, otherwise the result of percpu_ref_tryget().
 */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
        if (wb == &wb->bdi->wb)
                return true;

        return percpu_ref_tryget(&wb->refcnt);
}

/**
 * wb_get - increment a wb's refcount
 * @wb: bdi_writeback to get
 *
 * Does nothing for the root wb embedded in its bdi.
 */
static inline void wb_get(struct bdi_writeback *wb)
{
        if (wb == &wb->bdi->wb)
                return;

        percpu_ref_get(&wb->refcnt);
}

/**
 * wb_put_many - decrement a wb's refcount by several references
 * @wb: bdi_writeback to put
 * @nr: number of references to put
 *
 * Does nothing for the root wb embedded in its bdi.  Warns once and returns
 * early when @wb has no bdi.
 */
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
{
        /*
         * A driver bug might cause a file to be removed before bdi was
         * initialized.
         */
        if (WARN_ON_ONCE(!wb->bdi))
                return;

        if (wb == &wb->bdi->wb)
                return;

        percpu_ref_put_many(&wb->refcnt, nr);
}

/**
 * wb_put - decrement a wb's refcount
 * @wb: bdi_writeback to put
 *
 * Equivalent to wb_put_many(@wb, 1).
 */
static inline void wb_put(struct bdi_writeback *wb)
{
        wb_put_many(wb, 1);
}

/**
 * wb_dying - is a wb dying?
 * @wb: bdi_writeback of interest
 *
 * Return: whether @wb is unlinked and being drained, as reported by
 * percpu_ref_is_dying() on its refcount.
 */
static inline bool wb_dying(struct bdi_writeback *wb)
{
        bool dying = percpu_ref_is_dying(&wb->refcnt);

        return dying;
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK stub: there is no refcount, always succeeds. */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
        return true;
}

/* !CONFIG_CGROUP_WRITEBACK stub: does nothing. */
static inline void wb_get(struct bdi_writeback *wb)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: does nothing. */
static inline void wb_put(struct bdi_writeback *wb)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: does nothing. */
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: a wb is never reported as dying. */
static inline bool wb_dying(struct bdi_writeback *wb)
{
        return false;
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

#endif        /* __LINUX_BACKING_DEV_DEFS_H */













































































































































































    5 











































    9 



    8 







































    5 




    6 
    5 






    5 






    6 



    1 

    2 








    2 















    5 



    5 





    5 
























    3 















    2 






    2 





























    1 








    1 






    1 















    5 












    3 

    3 














































    3 
    5 
    5 














    5 




    2 

    4 
    3 

















    6 











    3 


    3 

























    2 























    2 
























































































































































































    7 





    8 




    6 









    2 
    2 




    4 

    2 













    8 

    7 

















    6 




















    7 










    7 



    6 

    1 


    6 

    7 

    5 







    7 






    7 




    6 





    6 










    7 














    2 










    2 








    2 
    2 














   14 









   14 








   12 







    6 


    3 
    3 
    3 













   11 
    9 

   11 














   13 






































    3 




    1 
























    3 



    3 


    1 














    1 










    1 








    1 











    1 











    1 



















    1 














































    3 


    1 




    1 










    1 











    1 





















































    1 
    1 
    1 
































    8 










    9 


    9 
    1 






    7 
    4 







    5 








    1 














    1 


    7 

    1 



    7 






    2 










    2 
    4 



    5 






    2 















    4 
    4 
















    4 








    4 










    4 









    4 









    4 








































































































































































































































































































































































    5 







































































































































































    4 















    4 




    4 

    4 

    4 







    4 









    3 





    4 










    4 


































    4 

    1 
    4 

















    4 



















    4 








    4 



    4 

































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
// SPDX-License-Identifier: GPL-2.0
/*
 *  fs/ext4/extents_status.c
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *        Allison Henderson <achender@linux.vnet.ibm.com>
 *        Hugh Dickins <hughd@google.com>
 *        Zheng Liu <wenqing.lz@taobao.com>
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/list_sort.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "ext4.h"

#include <trace/events/ext4.h>

/*
 * According to previous discussion in Ext4 Developer Workshop, we
 * will introduce a new structure called io tree to track all extent
 * status in order to solve some problems that we have met
 * (e.g. Reservation space warning), and provide extent-level locking.
 * Delay extent tree is the first step to achieve this goal.  It was
 * originally built by Yongqiang Yang.  At that time it was called delay
 * extent tree, whose goal was only to track delayed extents in memory to
 * simplify the implementation of fiemap and bigalloc, and introduce
 * lseek SEEK_DATA/SEEK_HOLE support.  That is why it is still called
 * delay extent tree at the first commit.  But to better understand
 * what it does, it has been renamed to extent status tree.
 *
 * Step1:
 * Currently the first step has been done.  All delayed extents are
 * tracked in the tree.  It maintains the delayed extent when a delayed
 * allocation is issued, and the delayed extent is written out or
 * invalidated.  Therefore the implementation of fiemap and bigalloc
 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
 *
 * The following comment describes the implementation of extent
 * status tree and future works.
 *
 * Step2:
 * In this step all extent status are tracked by extent status tree.
 * Thus, we can first try to lookup a block mapping in this tree before
 * finding it in extent tree.  Hence, single extent cache can be removed
 * because extent status tree can do a better job.  Extents in status
 * tree are loaded on-demand.  Therefore, the extent status tree may not
 * contain all of the extents in a file.  Meanwhile we define a shrinker
 * to reclaim memory from extent status tree because fragmented extent
 * tree will make status tree cost too much memory.  written/unwritten/-
 * hole extents in the tree will be reclaimed by this shrinker when we
 * are under high memory pressure.  Delayed extents will not be
 * reclaimed because fiemap, bigalloc, and seek_data/hole need it.
 */

/*
 * Extent status tree implementation for ext4.
 *
 *
 * ==========================================================================
 * Extent status tree tracks all extent status.
 *
 * 1. Why we need to implement extent status tree?
 *
 * Without extent status tree, ext4 identifies a delayed extent by looking
 * up page cache, this has several deficiencies - complicated, buggy,
 * and inefficient code.
 *
 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
 * block or a range of blocks belongs to a delayed extent.
 *
 * Let us have a look at how they do without extent status tree.
 *   --        FIEMAP
 *        FIEMAP looks up page cache to identify delayed allocations from holes.
 *
 *   --        SEEK_HOLE/DATA
 *        SEEK_HOLE/DATA has the same problem as FIEMAP.
 *
 *   --        bigalloc
 *        bigalloc looks up page cache to figure out if a block is
 *        already under delayed allocation or not to determine whether
 *        quota reserving is needed for the cluster.
 *
 *   --        writeout
 *        Writeout looks up whole page cache to see if a buffer is
 *        mapped, If there are not very many delayed buffers, then it is
 *        time consuming.
 *
 * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
 * bigalloc and writeout can figure out if a block or a range of
 * blocks is under delayed allocation(belonged to a delayed extent) or
 * not by searching the extent tree.
 *
 *
 * ==========================================================================
 * 2. Ext4 extent status tree implementation
 *
 *   --        extent
 *        An extent is a range of blocks which are contiguous logically and
 *        physically.  Unlike extent in extent tree, this extent in ext4 is
 *        an in-memory struct, there is no corresponding on-disk data.  There
 *        is no limit on length of extent, so an extent can contain as many
 *        blocks as they are contiguous logically and physically.
 *
 *   --        extent status tree
 *        Every inode has an extent status tree and all allocation blocks
 *        are added to the tree with different status.  The extent in the
 *        tree are ordered by logical block no.
 *
 *   --        operations on an extent status tree
 *        There are three important operations on a delayed extent tree: finding
 *        the next extent, adding an extent (a range of blocks) and removing an
 *        extent.
 *
 *   --        race on a extent status tree
 *        Extent status tree is protected by inode->i_es_lock.
 *
 *   --        memory consumption
 *      Fragmented extent tree will make extent status tree cost too much
 *      memory.  Hence, we will reclaim written/unwritten/hole extents from
 *      the tree under a heavy memory pressure.
 *
 *
 * ==========================================================================
 * 3. Performance analysis
 *
 *   --        overhead
 *        1. There is a cache extent for write access, so if writes are
 *        not very random, adding space operations are in O(1) time.
 *
 *   --        gain
 *        2. Code is much simpler, more readable, more maintainable and
 *        more efficient.
 *
 *
 * ==========================================================================
 * 4. TODO list
 *
 *   -- Refactor delayed space reservation
 *
 *   -- Extent-level locking
 */

/* Slab cache that backs struct extent_status objects (see ext4_init_es()). */
static struct kmem_cache *ext4_es_cachep;
/*
 * Slab cache that backs struct pending_reservation objects; used by
 * __alloc_pending() and __free_pending().
 */
static struct kmem_cache *ext4_pending_cachep;

/* Forward declarations of static helpers defined elsewhere in this file. */
static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
                              struct extent_status *prealloc);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                              ext4_lblk_t end, int *reserved,
                              struct extent_status *prealloc);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
                       struct ext4_inode_info *locked_ei);
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
                            ext4_lblk_t len,
                            struct pending_reservation **prealloc);

/*
 * Create the slab cache used for struct extent_status objects.
 *
 * Returns 0 on success, or -ENOMEM if the cache could not be created.
 */
int __init ext4_init_es(void)
{
        ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);

        return ext4_es_cachep ? 0 : -ENOMEM;
}

/* Tear down the extent_status slab cache set up by ext4_init_es(). */
void ext4_exit_es(void)
{
        kmem_cache_destroy(ext4_es_cachep);
}

/*
 * Put an extent status tree into its empty state: no cached extent and
 * an empty rb-tree root.
 */
void ext4_es_init_tree(struct ext4_es_tree *tree)
{
        tree->cache_es = NULL;
        tree->root = RB_ROOT;
}

#ifdef ES_DEBUG__
/*
 * Debug helper: dump every extent in @inode's extent status tree, in tree
 * order, as "[lblk/len) pblock status".  Compiled out (as an empty macro)
 * unless ES_DEBUG__ is defined.
 */
static void ext4_es_print_tree(struct inode *inode)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct rb_node *node;

        printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
        for (node = rb_first(&tree->root); node; node = rb_next(node)) {
                struct extent_status *es =
                        rb_entry(node, struct extent_status, rb_node);

                printk(KERN_DEBUG " [%u/%u) %llu %x",
                       es->es_lblk, es->es_len,
                       ext4_es_pblock(es), ext4_es_status(es));
        }
        printk(KERN_DEBUG "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif

/*
 * Return the last logical block covered by @es (es_lblk + es_len - 1).
 * Triggers BUG_ON() if es_lblk + es_len wraps below es_lblk.
 */
static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
        ext4_lblk_t past_end = es->es_lblk + es->es_len;

        BUG_ON(past_end < es->es_lblk);
        return past_end - 1;
}

/*
 * Search the tree for the extent that contains logical block @lblk.
 *
 * Returns the extent covering @lblk if there is one.  Otherwise returns the
 * first extent that starts after @lblk, or NULL when the tree is empty or
 * no extent lies beyond @lblk.
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
                                              ext4_lblk_t lblk)
{
        struct rb_node *cur = root->rb_node;
        struct extent_status *last = NULL;

        /* Standard descent; remember the last extent we compared against. */
        while (cur) {
                last = rb_entry(cur, struct extent_status, rb_node);
                if (lblk < last->es_lblk) {
                        cur = cur->rb_left;
                        continue;
                }
                if (lblk <= ext4_es_end(last))
                        return last;
                cur = cur->rb_right;
        }

        /* Empty tree: nothing to report. */
        if (!last)
                return NULL;

        /* The descent stopped just right of @lblk: that extent is next. */
        if (lblk < last->es_lblk)
                return last;

        /* The descent stopped just left of @lblk: its successor is next. */
        if (lblk > ext4_es_end(last)) {
                cur = rb_next(&last->rb_node);
                if (cur)
                        return rb_entry(cur, struct extent_status, rb_node);
        }

        return NULL;
}

/*
 * __es_find_extent_range - find extent with specified status within block
 *                          range or next extent following block range in
 *                          extents status tree
 *
 * @inode - file containing the range
 * @matching_fn - pointer to function that matches extents with desired status
 * @lblk - logical block defining start of range
 * @end - logical block defining end of range
 * @es - extent found, if any
 *
 * Find the first extent within the block range specified by @lblk and @end
 * in the extents status tree that satisfies @matching_fn.  If a match
 * is found, it's returned in @es.  If not, and a matching extent is found
 * beyond the block range, it's returned in @es.  If no match is found, an
 * extent is returned in @es whose es_lblk, es_len, and es_pblk components
 * are 0.
 *
 * This function takes no lock itself; the exported wrapper
 * ext4_es_find_extent_range() calls it with i_es_lock held for reading.
 */
static void __es_find_extent_range(struct inode *inode,
                                   int (*matching_fn)(struct extent_status *es),
                                   ext4_lblk_t lblk, ext4_lblk_t end,
                                   struct extent_status *es)
{
        struct ext4_es_tree *tree = NULL;
        struct extent_status *es1 = NULL;
        struct rb_node *node;

        WARN_ON(es == NULL);
        WARN_ON(end < lblk);

        tree = &EXT4_I(inode)->i_es_tree;

        /* see if the extent has been cached */
        /* Start from the "nothing found" result; overwritten on a match. */
        es->es_lblk = es->es_len = es->es_pblk = 0;
        es1 = READ_ONCE(tree->cache_es);
        if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
                es_debug("%u cached by [%u/%u) %llu %x\n",
                         lblk, es1->es_lblk, es1->es_len,
                         ext4_es_pblock(es1), ext4_es_status(es1));
                goto out;
        }

        /* Cache miss: extent containing @lblk, or the next one after it. */
        es1 = __es_tree_search(&tree->root, lblk);

out:
        /*
         * The first candidate does not match: walk forward in tree order
         * until a matching extent is found, the tree is exhausted, or an
         * extent starts past @end (in which case the search gives up).
         * Note the first candidate itself is not checked against @end.
         */
        if (es1 && !matching_fn(es1)) {
                while ((node = rb_next(&es1->rb_node)) != NULL) {
                        es1 = rb_entry(node, struct extent_status, rb_node);
                        if (es1->es_lblk > end) {
                                es1 = NULL;
                                break;
                        }
                        if (matching_fn(es1))
                                break;
                }
        }

        /*
         * Re-check the match: when the walk above ran off the end of the
         * tree, es1 still points at the last, non-matching extent.
         */
        if (es1 && matching_fn(es1)) {
                /* Remember the hit so the next lookup can try it first. */
                WRITE_ONCE(tree->cache_es, es1);
                es->es_lblk = es1->es_lblk;
                es->es_len = es1->es_len;
                es->es_pblk = es1->es_pblk;
        }

}

/*
 * Locking for __es_find_extent_range() for external use
 *
 * @inode - file containing the range
 * @matching_fn - pointer to function that matches extents with desired status
 * @lblk - logical block defining start of range
 * @end - logical block defining end of range
 * @es - extent found, if any; must not be NULL
 *
 * Runs __es_find_extent_range() with i_es_lock held for reading.  When
 * EXT4_FC_REPLAY is set in s_mount_state the tree is not consulted at all
 * and @es is returned with es_lblk, es_len and es_pblk all 0, i.e. the same
 * "no match" result the search itself reports.
 */
void ext4_es_find_extent_range(struct inode *inode,
                               int (*matching_fn)(struct extent_status *es),
                               ext4_lblk_t lblk, ext4_lblk_t end,
                               struct extent_status *es)
{
        /*
         * Publish the "nothing found" result before the early return below,
         * so callers that inspect @es never read uninitialized fields.
         */
        es->es_lblk = es->es_len = es->es_pblk = 0;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        trace_ext4_es_find_extent_range_enter(inode, lblk);

        read_lock(&EXT4_I(inode)->i_es_lock);
        __es_find_extent_range(inode, matching_fn, lblk, end, es);
        read_unlock(&EXT4_I(inode)->i_es_lock);

        trace_ext4_es_find_extent_range_exit(inode, es);
}

/*
 * __es_scan_range - search block range for block with specified status
 *                   in extents status tree
 *
 * @inode - file containing the range
 * @matching_fn - pointer to function that matches extents with desired status
 * @start - logical block defining start of range
 * @end - logical block defining end of range
 *
 * Returns true if at least one block in the range [@start, @end] belongs to
 * an extent accepted by @matching_fn, and false otherwise.  Should only be
 * called by code that has taken i_es_lock.
 */
static bool __es_scan_range(struct inode *inode,
                            int (*matching_fn)(struct extent_status *es),
                            ext4_lblk_t start, ext4_lblk_t end)
{
        struct extent_status found;
        bool covers_start, begins_inside;

        __es_find_extent_range(inode, matching_fn, start, end, &found);

        /* A zero length means no matching extent in the tree. */
        if (found.es_len == 0)
                return false;

        /* The matching extent straddles the first block of the range... */
        covers_start = found.es_lblk <= start &&
                       start < found.es_lblk + found.es_len;
        /* ...or it begins somewhere inside the range. */
        begins_inside = start <= found.es_lblk && found.es_lblk <= end;

        return covers_start || begins_inside;
}
/*
 * Locking for __es_scan_range() for external use
 *
 * Takes i_es_lock for reading around the scan.  Returns false without
 * scanning when EXT4_FC_REPLAY is set in s_mount_state.
 */
bool ext4_es_scan_range(struct inode *inode,
                        int (*matching_fn)(struct extent_status *es),
                        ext4_lblk_t lblk, ext4_lblk_t end)
{
        bool hit = false;

        if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) {
                read_lock(&EXT4_I(inode)->i_es_lock);
                hit = __es_scan_range(inode, matching_fn, lblk, end);
                read_unlock(&EXT4_I(inode)->i_es_lock);
        }

        return hit;
}

/*
 * __es_scan_clu - search cluster for block with specified status in
 *                 extents status tree
 *
 * @inode - file containing the cluster
 * @matching_fn - pointer to function that matches extents with desired status
 * @lblk - logical block in cluster to be searched
 *
 * Returns true if at least one block in the cluster containing @lblk belongs
 * to an extent accepted by @matching_fn, and false if not.  The cluster is
 * the s_cluster_ratio blocks starting at EXT4_LBLK_CMASK(sbi, lblk).  Should
 * only be called by code that has taken i_es_lock.
 */
static bool __es_scan_clu(struct inode *inode,
                          int (*matching_fn)(struct extent_status *es),
                          ext4_lblk_t lblk)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t first = EXT4_LBLK_CMASK(sbi, lblk);
        ext4_lblk_t last = first + sbi->s_cluster_ratio - 1;

        return __es_scan_range(inode, matching_fn, first, last);
}

/*
 * Locking for __es_scan_clu() for external use
 *
 * Takes i_es_lock for reading around the scan.  Returns false without
 * scanning when EXT4_FC_REPLAY is set in s_mount_state.
 */
bool ext4_es_scan_clu(struct inode *inode,
                      int (*matching_fn)(struct extent_status *es),
                      ext4_lblk_t lblk)
{
        bool hit = false;

        if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) {
                read_lock(&EXT4_I(inode)->i_es_lock);
                hit = __es_scan_clu(inode, matching_fn, lblk);
                read_unlock(&EXT4_I(inode)->i_es_lock);
        }

        return hit;
}

/*
 * Link @inode onto the superblock's s_es_list (if it is not on it already)
 * and bump s_es_nr_inode.  The list and counter are updated under s_es_lock.
 */
static void ext4_es_list_add(struct inode *inode)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);

        /* Unlocked peek: nothing to do if the inode is already linked. */
        if (!list_empty(&ei->i_es_list))
                return;

        spin_lock(&sbi->s_es_lock);
        /* Re-test now that s_es_lock is held, before linking the inode in. */
        if (!list_empty(&ei->i_es_list))
                goto unlock;

        list_add_tail(&ei->i_es_list, &sbi->s_es_list);
        sbi->s_es_nr_inode++;
unlock:
        spin_unlock(&sbi->s_es_lock);
}

/*
 * Unlink @inode from the superblock's s_es_list (if it is on it) and drop
 * s_es_nr_inode, all under s_es_lock.  Warns once if the counter goes
 * negative.
 */
static void ext4_es_list_del(struct inode *inode)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);

        spin_lock(&sbi->s_es_lock);
        if (list_empty(&ei->i_es_list))
                goto unlock;

        list_del_init(&ei->i_es_list);
        sbi->s_es_nr_inode--;
        WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
unlock:
        spin_unlock(&sbi->s_es_lock);
}

/*
 * Allocate a pending_reservation from ext4_pending_cachep.
 *
 * With @nofail set the object is zeroed and allocated with
 * GFP_KERNEL | __GFP_NOFAIL; otherwise a plain GFP_ATOMIC allocation is
 * attempted, which may return NULL.
 */
static inline struct pending_reservation *__alloc_pending(bool nofail)
{
        if (nofail)
                return kmem_cache_zalloc(ext4_pending_cachep,
                                         GFP_KERNEL | __GFP_NOFAIL);

        return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
}

/* Hand a pending_reservation back to ext4_pending_cachep. */
static inline void __free_pending(struct pending_reservation *pr)
{
        kmem_cache_free(ext4_pending_cachep, pr);
}

/*
 * Returns true if we cannot fail to allocate memory for this extent_status
 * entry and cannot reclaim it until its status changes.
 *
 * Currently that is exactly the delayed extents: fiemap, bigalloc, and
 * seek_data/hole need to use them.
 */
static inline bool ext4_es_must_keep(struct extent_status *es)
{
        return ext4_es_is_delayed(es);
}

/*
 * Allocate an extent_status from ext4_es_cachep.
 *
 * With @nofail set the object is zeroed and allocated with
 * GFP_KERNEL | __GFP_NOFAIL; otherwise a plain GFP_ATOMIC allocation is
 * attempted, which may return NULL.
 */
static inline struct extent_status *__es_alloc_extent(bool nofail)
{
        if (nofail)
                return kmem_cache_zalloc(ext4_es_cachep,
                                         GFP_KERNEL | __GFP_NOFAIL);

        return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
}

/*
 * Fill in a freshly allocated extent_status and account for it.
 *
 * @inode - inode that owns the extent
 * @es - extent to initialise
 * @lblk, @len, @pblk - values stored into es_lblk, es_len and es_pblk
 *
 * Every extent is counted in i_es_all_nr / es_stats_all_cnt.  Extents that
 * ext4_es_must_keep() does not pin are also counted in i_es_shk_nr /
 * es_stats_shk_cnt, and the first such extent puts the inode on the
 * superblock's list via ext4_es_list_add().
 */
static void ext4_es_init_extent(struct inode *inode, struct extent_status *es,
                ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk)
{
        bool reclaimable;

        es->es_lblk = lblk;
        es->es_len = len;
        es->es_pblk = pblk;

        /* Must be evaluated after es_pblk is set; see ext4_es_must_keep(). */
        reclaimable = !ext4_es_must_keep(es);

        /* We never try to reclaim a must kept extent, so we don't count it. */
        if (reclaimable) {
                if (EXT4_I(inode)->i_es_shk_nr++ == 0)
                        ext4_es_list_add(inode);
                percpu_counter_inc(&EXT4_SB(inode->i_sb)->
                                        s_es_stats.es_stats_shk_cnt);
        }

        EXT4_I(inode)->i_es_all_nr++;
        percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
}

/* Hand an extent_status back to ext4_es_cachep; does no accounting. */
static inline void __es_free_extent(struct extent_status *es)
{
        kmem_cache_free(ext4_es_cachep, es);
}

static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
        EXT4_I(inode)->i_es_all_nr--;
        percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

        /* Decrease the shrink counter when we can reclaim the extent. */
        if (!ext4_es_must_keep(es)) {
                BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
                if (!--EXT4_I(inode)->i_es_shk_nr)
                        ext4_es_list_del(inode);
                percpu_counter_dec(&EXT4_SB(inode->i_sb)->
                                        s_es_stats.es_stats_shk_cnt);
        }

        __es_free_extent(es);
}

/*
 * Check whether or not two extents can be merged
 *
 * @es1 - the extent that comes first in logical block order
 * @es2 - the extent that would be appended to @es1
 *
 * Condition:
 *  - both extents have the same type (ext4_es_type())
 *  - the combined length does not exceed EXT_MAX_BLOCKS (warns otherwise)
 *  - logical block number is contiguous: @es2 starts right after @es1
 *  - for written/unwritten extents, physical block number is contiguous
 *  - hole extents, and delayed extents without unwritten status, need no
 *    physical contiguity
 *
 * Returns 1 if the extents can be merged, 0 otherwise.
 */
static int ext4_es_can_be_merged(struct extent_status *es1,
                                 struct extent_status *es2)
{
        if (ext4_es_type(es1) != ext4_es_type(es2))
                return 0;

        /* Widen to 64 bits so the sum of the two lengths cannot wrap. */
        if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
                /* Lengths and the limit are unsigned: print them with %u. */
                pr_warn("ES assertion failed when merging extents. "
                        "The sum of lengths of es1 (%u) and es2 (%u) "
                        "is bigger than allowed file size (%u)\n",
                        es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
                WARN_ON(1);
                return 0;
        }

        if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
                return 0;

        if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
            (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
                return 1;

        if (ext4_es_is_hole(es1))
                return 1;

        /* we need to check delayed extent is without unwritten status */
        if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
                return 1;

        return 0;
}

/*
 * Try to fold @es into the extent that precedes it in @inode's tree.
 *
 * On a merge the predecessor grows by es_len, inherits the referenced flag
 * if @es had it, and @es is erased from the tree and freed.  Returns the
 * surviving extent: the predecessor after a merge, otherwise @es unchanged.
 */
static struct extent_status *
ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct rb_node *prev_node = rb_prev(&es->rb_node);
        struct extent_status *prev;

        if (!prev_node)
                return es;

        prev = rb_entry(prev_node, struct extent_status, rb_node);
        if (!ext4_es_can_be_merged(prev, es))
                return es;

        prev->es_len += es->es_len;
        if (ext4_es_is_referenced(es))
                ext4_es_set_referenced(prev);
        rb_erase(&es->rb_node, &tree->root);
        ext4_es_free_extent(inode, es);

        return prev;
}

/*
 * Try to fold the extent that follows @es in @inode's tree into @es.
 *
 * On a merge @es grows by the successor's es_len, inherits the referenced
 * flag if the successor had it, and the successor is erased from the tree
 * and freed.  Always returns @es.
 */
static struct extent_status *
ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct rb_node *next_node = rb_next(&es->rb_node);
        struct extent_status *next;

        if (!next_node)
                return es;

        next = rb_entry(next_node, struct extent_status, rb_node);
        if (!ext4_es_can_be_merged(es, next))
                return es;

        es->es_len += next->es_len;
        if (ext4_es_is_referenced(next))
                ext4_es_set_referenced(es);
        rb_erase(next_node, &tree->root);
        ext4_es_free_extent(inode, next);

        return es;
}

#ifdef ES_AGGRESSIVE_TEST
#include "ext4_extents.h"        /* Needed when ES_AGGRESSIVE_TEST is defined */

/*
 * ext4_es_insert_extent_ext_check - debug cross-check for extent-mapped files
 *
 * @inode - file the extent status entry is being inserted for
 * @es - extent status entry about to be inserted
 *
 * Looks up the extent found at @es->es_lblk through ext4_find_extent() and
 * emits a warning when it is inconsistent with @es.  The function only
 * reports; a failed lookup is silently ignored.
 *
 * Logical block numbers and extent status lengths are printed with %u, like
 * the es_debug() calls in this file, so that large values are not shown as
 * negative numbers.
 */
static void ext4_es_insert_extent_ext_check(struct inode *inode,
                                            struct extent_status *es)
{
        struct ext4_ext_path *path = NULL;
        struct ext4_extent *ex;
        ext4_lblk_t ee_block;
        ext4_fsblk_t ee_start;
        unsigned short ee_len;
        int depth, ee_status, es_status;

        path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
        if (IS_ERR(path))
                return;

        depth = ext_depth(inode);
        ex = path[depth].p_ext;

        if (ex) {

                ee_block = le32_to_cpu(ex->ee_block);
                ee_start = ext4_ext_pblock(ex);
                ee_len = ext4_ext_get_actual_len(ex);

                /* 1 means unwritten, 0 means written */
                ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
                es_status = ext4_es_is_unwritten(es) ? 1 : 0;

                /*
                 * Make sure ex and es do not overlap when we try to insert
                 * a delayed/hole extent.
                 */
                if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
                        if (in_range(es->es_lblk, ee_block, ee_len)) {
                                pr_warn("ES insert assertion failed for "
                                        "inode: %lu we can find an extent "
                                        "at block [%u/%d/%llu/%c], but we "
                                        "want to add a delayed/hole extent "
                                        "[%u/%u/%llu/%x]\n",
                                        inode->i_ino, ee_block, ee_len,
                                        ee_start, ee_status ? 'u' : 'w',
                                        es->es_lblk, es->es_len,
                                        ext4_es_pblock(es), ext4_es_status(es));
                        }
                        goto out;
                }

                /*
                 * We don't check ee_block == es->es_lblk, etc. because es
                 * might be a part of whole extent, vice versa.
                 */
                if (es->es_lblk < ee_block ||
                    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "ex_status [%u/%d/%llu/%c] != "
                                "es_status [%u/%u/%llu/%c]\n", inode->i_ino,
                                ee_block, ee_len, ee_start,
                                ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
                                ext4_es_pblock(es), es_status ? 'u' : 'w');
                        goto out;
                }

                /* position matches; the written/unwritten state must too */
                if (ee_status ^ es_status) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "ex_status [%u/%d/%llu/%c] != "
                                "es_status [%u/%u/%llu/%c]\n", inode->i_ino,
                                ee_block, ee_len, ee_start,
                                ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
                                ext4_es_pblock(es), es_status ? 'u' : 'w');
                }
        } else {
                /*
                 * We can't find an extent on disk.  So we need to make sure
                 * that we don't want to add a written/unwritten extent.
                 */
                if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "can't find an extent at block %u but we want "
                                "to add a written/unwritten extent "
                                "[%u/%u/%llu/%x]\n", inode->i_ino,
                                es->es_lblk, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                }
        }
out:
        ext4_free_ext_path(path);
}

static void ext4_es_insert_extent_ind_check(struct inode *inode,
                                            struct extent_status *es)
{
        struct ext4_map_blocks map;
        int retval;

        /*
         * Here we call ext4_ind_map_blocks to lookup a block mapping because
         * 'Indirect' structure is defined in indirect.c.  So we couldn't
         * access direct/indirect tree from outside.  It is too dirty to define
         * this function in indirect.c file.
         */

        map.m_lblk = es->es_lblk;
        map.m_len = es->es_len;

        retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
        if (retval > 0) {
                if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
                        /*
                         * We want to add a delayed/hole extent but this
                         * block has been allocated.
                         */
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "We can find blocks but we want to add a "
                                "delayed/hole extent [%d/%d/%llu/%x]\n",
                                inode->i_ino, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                        return;
                } else if (ext4_es_is_written(es)) {
                        if (retval != es->es_len) {
                                pr_warn("ES insert assertion failed for "
                                        "inode: %lu retval %d != es_len %d\n",
                                        inode->i_ino, retval, es->es_len);
                                return;
                        }
                        if (map.m_pblk != ext4_es_pblock(es)) {
                                pr_warn("ES insert assertion failed for "
                                        "inode: %lu m_pblk %llu != "
                                        "es_pblk %llu\n",
                                        inode->i_ino, map.m_pblk,
                                        ext4_es_pblock(es));
                                return;
                        }
                } else {
                        /*
                         * We don't need to check unwritten extent because
                         * indirect-based file doesn't have it.
                         */
                        BUG();
                }
        } else if (retval == 0) {
                if (ext4_es_is_written(es)) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "We can't find the block but we want to add "
                                "a written extent [%d/%d/%llu/%x]\n",
                                inode->i_ino, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                        return;
                }
        }
}

/*
 * Debug-only consistency check run before an extent status entry is
 * inserted: dispatches to the extent-based or the indirect-based checker
 * depending on whether the inode has EXT4_INODE_EXTENTS set.
 */
static inline void ext4_es_insert_extent_check(struct inode *inode,
                                               struct extent_status *es)
{
        /*
         * The caller holds i_data_sem, so there is no race to worry
         * about here.
         */
        BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));

        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ext4_es_insert_extent_ind_check(inode, es);
                return;
        }

        ext4_es_insert_extent_ext_check(inode, es);
}
#else
/*
 * Variant used when ES_AGGRESSIVE_TEST is not defined: the insert-time
 * consistency check is an empty function.
 */
static inline void ext4_es_insert_extent_check(struct inode *inode,
                                               struct extent_status *es)
{
}
#endif

/*
 * __es_insert_extent - link a new extent into an inode's extent status tree
 *
 * @inode - file owning the tree
 * @newes - description of the extent to add
 * @prealloc - entry to use instead of allocating one, or NULL
 *
 * Walks the tree looking for the insertion point.  If @newes can be merged
 * with an extent met on the way, that extent is grown in place (and then
 * merged further with its own neighbour if possible) and no new entry is
 * needed.  Otherwise a new entry is initialised from @newes and linked in.
 * In both cases the resulting extent becomes tree->cache_es.
 *
 * Returns 0 on success and -ENOMEM if no entry could be obtained.  A @newes
 * that starts inside an existing extent triggers BUG().
 */
static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
                              struct extent_status *prealloc)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct rb_node **link = &tree->root.rb_node;
        struct rb_node *parent = NULL;
        struct extent_status *cur;

        while (*link) {
                parent = *link;
                cur = rb_entry(parent, struct extent_status, rb_node);

                if (newes->es_lblk < cur->es_lblk) {
                        if (!ext4_es_can_be_merged(newes, cur)) {
                                link = &parent->rb_left;
                                continue;
                        }
                        /*
                         * newes does not overlap cur, so the start of cur
                         * can simply be moved back to cover it.
                         */
                        cur->es_lblk = newes->es_lblk;
                        cur->es_len += newes->es_len;
                        if (ext4_es_is_written(cur) ||
                            ext4_es_is_unwritten(cur))
                                ext4_es_store_pblock(cur, newes->es_pblk);
                        cur = ext4_es_try_to_merge_left(inode, cur);
                        goto out;
                }

                if (newes->es_lblk <= ext4_es_end(cur)) {
                        /* newes starts inside an existing extent */
                        BUG();
                        return -EINVAL;
                }

                if (!ext4_es_can_be_merged(cur, newes)) {
                        link = &parent->rb_right;
                        continue;
                }
                cur->es_len += newes->es_len;
                cur = ext4_es_try_to_merge_right(inode, cur);
                goto out;
        }

        /* no merge candidate: a fresh entry is needed */
        cur = prealloc ? prealloc : __es_alloc_extent(false);
        if (!cur)
                return -ENOMEM;
        ext4_es_init_extent(inode, cur, newes->es_lblk, newes->es_len,
                            newes->es_pblk);

        rb_link_node(&cur->rb_node, parent, link);
        rb_insert_color(&cur->rb_node, &tree->root);

out:
        tree->cache_es = cur;
        return 0;
}

/*
 * ext4_es_insert_extent() adds information to an inode's extent
 * status tree.
 *
 * @inode - file the extent belongs to
 * @lblk - first logical block of the extent
 * @len - length of the extent in blocks; a zero length is a no-op
 * @pblk - physical block stored together with @status in the new entry
 * @status - EXTENT_STATUS_* bits describing the extent
 *
 * Any information already held for the range is removed and the new extent
 * is then inserted, both under i_es_lock.  When a step fails the lock is
 * dropped, spare entries are allocated and the whole sequence is run again
 * until it succeeds.  The function returns immediately if the mount state
 * has EXT4_FC_REPLAY set.
 */
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
                           ext4_lblk_t len, ext4_fsblk_t pblk,
                           unsigned int status)
{
        struct extent_status newes;
        ext4_lblk_t end = lblk + len - 1;
        /* results of the remove, insert and revise-pending steps */
        int err1 = 0, err2 = 0, err3 = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        /* spare entries handed to the remove (es1) and insert (es2) steps */
        struct extent_status *es1 = NULL;
        struct extent_status *es2 = NULL;
        struct pending_reservation *pr = NULL;
        bool revise_pending = false;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
                 lblk, len, pblk, status, inode->i_ino);

        if (!len)
                return;

        /* lblk + len - 1 must not wrap around */
        BUG_ON(end < lblk);

        if ((status & EXTENT_STATUS_DELAYED) &&
            (status & EXTENT_STATUS_WRITTEN)) {
                ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
                                " delayed and written which can potentially "
                                " cause data loss.", lblk, len);
                WARN_ON(1);
        }

        newes.es_lblk = lblk;
        newes.es_len = len;
        ext4_es_store_pblock_status(&newes, pblk, status);
        trace_ext4_es_insert_extent(inode, &newes);

        ext4_es_insert_extent_check(inode, &newes);

        /*
         * Pending reservations are only revised when the cluster ratio is
         * greater than one, delayed allocation is enabled and the extent
         * being inserted is written or unwritten.
         */
        revise_pending = sbi->s_cluster_ratio > 1 &&
                         test_opt(inode->i_sb, DELALLOC) &&
                         (status & (EXTENT_STATUS_WRITTEN |
                                    EXTENT_STATUS_UNWRITTEN));
retry:
        /*
         * On a retry, allocate outside i_es_lock whatever the failed step
         * and the steps after it may need.
         * NOTE(review): the 'true' argument presumably requests an
         * allocation that cannot fail - confirm against __es_alloc_extent()
         * and __alloc_pending().
         */
        if (err1 && !es1)
                es1 = __es_alloc_extent(true);
        if ((err1 || err2) && !es2)
                es2 = __es_alloc_extent(true);
        if ((err1 || err2 || err3) && revise_pending && !pr)
                pr = __alloc_pending(true);
        write_lock(&EXT4_I(inode)->i_es_lock);

        /* step 1: drop whatever the tree already holds for the range */
        err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
        if (err1 != 0)
                goto error;
        /* Free preallocated extent if it didn't get used. */
        if (es1) {
                if (!es1->es_len)
                        __es_free_extent(es1);
                es1 = NULL;
        }

        /*
         * step 2: insert the new extent.  An -ENOMEM is ignored when
         * ext4_es_must_keep() reports that the extent need not be kept.
         */
        err2 = __es_insert_extent(inode, &newes, es2);
        if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
                err2 = 0;
        if (err2 != 0)
                goto error;
        /* Free preallocated extent if it didn't get used. */
        if (es2) {
                if (!es2->es_len)
                        __es_free_extent(es2);
                es2 = NULL;
        }

        /* step 3: revise pending reservations when required (see above) */
        if (revise_pending) {
                err3 = __revise_pending(inode, lblk, len, &pr);
                if (err3 != 0)
                        goto error;
                /* free the spare pending entry if one is still held */
                if (pr) {
                        __free_pending(pr);
                        pr = NULL;
                }
        }
error:
        /* reached on success as well; any non-zero result forces a retry */
        write_unlock(&EXT4_I(inode)->i_es_lock);
        if (err1 || err2 || err3)
                goto retry;

        ext4_es_print_tree(inode);
        return;
}

/*
 * ext4_es_cache_extent() records an extent in the extent status tree, but
 * only when the tree holds nothing for any block of the range yet.  The
 * result of the insertion itself is not checked.  Nothing is done if the
 * mount state has EXT4_FC_REPLAY set or if @len is zero.
 */
void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
                          ext4_lblk_t len, ext4_fsblk_t pblk,
                          unsigned int status)
{
        struct extent_status newes;
        struct extent_status *first;
        ext4_lblk_t last = lblk + len - 1;
        bool range_known;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        newes.es_lblk = lblk;
        newes.es_len = len;
        ext4_es_store_pblock_status(&newes, pblk, status);
        trace_ext4_es_cache_extent(inode, &newes);

        if (len == 0)
                return;

        BUG_ON(last < lblk);

        write_lock(&EXT4_I(inode)->i_es_lock);

        /* is there an extent starting at or before the last block? */
        first = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
        range_known = first && first->es_lblk <= last;
        if (!range_known)
                __es_insert_extent(inode, &newes, NULL);

        write_unlock(&EXT4_I(inode)->i_es_lock);
}

/*
 * ext4_es_lookup_extent() looks up an extent in extent status tree.
 *
 * @inode - file to search
 * @lblk - logical block to look for
 * @next_lblk - if not NULL and an extent is found, receives the first block
 *              of the following extent in the tree, or 0 if there is none
 * @es - receives a copy of the extent found; zeroed once the search starts
 *
 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
 *
 * Return: 1 on found, 0 on not
 */
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
                          ext4_lblk_t *next_lblk,
                          struct extent_status *es)
{
        struct ext4_es_tree *tree;
        struct ext4_es_stats *stats;
        struct extent_status *hit = NULL;
        struct rb_node *node;
        int found = 0;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return 0;

        trace_ext4_es_lookup_extent_enter(inode, lblk);
        es_debug("lookup extent in block %u\n", lblk);

        tree = &EXT4_I(inode)->i_es_tree;
        read_lock(&EXT4_I(inode)->i_es_lock);

        es->es_lblk = es->es_len = es->es_pblk = 0;

        /* try tree->cache_es first, then fall back to walking the tree */
        hit = READ_ONCE(tree->cache_es);
        if (hit && in_range(lblk, hit->es_lblk, hit->es_len)) {
                es_debug("%u cached by [%u/%u)\n",
                         lblk, hit->es_lblk, hit->es_len);
                found = 1;
        } else {
                for (node = tree->root.rb_node; node; ) {
                        hit = rb_entry(node, struct extent_status, rb_node);
                        if (lblk < hit->es_lblk) {
                                node = node->rb_left;
                        } else if (lblk > ext4_es_end(hit)) {
                                node = node->rb_right;
                        } else {
                                found = 1;
                                break;
                        }
                }
        }

        stats = &EXT4_SB(inode->i_sb)->s_es_stats;
        if (!found) {
                percpu_counter_inc(&stats->es_stats_cache_misses);
        } else {
                BUG_ON(!hit);
                es->es_lblk = hit->es_lblk;
                es->es_len = hit->es_len;
                es->es_pblk = hit->es_pblk;
                if (!ext4_es_is_referenced(hit))
                        ext4_es_set_referenced(hit);
                percpu_counter_inc(&stats->es_stats_cache_hits);
                if (next_lblk) {
                        node = rb_next(&hit->rb_node);
                        if (node) {
                                hit = rb_entry(node, struct extent_status,
                                               rb_node);
                                *next_lblk = hit->es_lblk;
                        } else {
                                *next_lblk = 0;
                        }
                }
        }

        read_unlock(&EXT4_I(inode)->i_es_lock);

        trace_ext4_es_lookup_extent_exit(inode, es, found);
        return found;
}

/*
 * Running state used while counting the reservations released by removing a
 * block range from the extent status tree.  Set up by init_rsvd(), updated
 * by count_rsvd() and consumed by get_rsvd().  Every field except ndelonly
 * is only initialised and used when the cluster ratio is greater than one.
 */
struct rsvd_count {
        /* running tally: blocks when the cluster ratio is 1, else clusters */
        int ndelonly;
        /* true once first_do_lblk has been recorded */
        bool first_do_lblk_found;
        /* first block of the first delonly extent seen in the range */
        ext4_lblk_t first_do_lblk;
        /* last delonly block of the range seen so far */
        ext4_lblk_t last_do_lblk;
        /* extent holding the block left of the removed range, or NULL */
        struct extent_status *left_es;
        /* true while a partial cluster at the end of an extent is tracked */
        bool partial;
        /* logical cluster of the partial cluster being tracked */
        ext4_lblk_t lclu;
};

/*
 * init_rsvd - prepare the reserved count data before a block range is
 *             removed from a file's extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @es - pointer to first extent in range, must not be NULL
 * @rc - pointer to reserved count data
 */
static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
                      struct extent_status *es, struct rsvd_count *rc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct rb_node *prev;

        rc->ndelonly = 0;

        /* everything below is bigalloc-only state */
        if (sbi->s_cluster_ratio <= 1)
                return;

        /*
         * no delonly block has been seen in the range yet and there is no
         * partial cluster to track
         */
        rc->first_do_lblk_found = false;
        rc->partial = false;

        /*
         * remember the extent containing the block to the left of the
         * region to be removed, if any
         */
        if (lblk > es->es_lblk) {
                rc->left_es = es;
                return;
        }

        prev = rb_prev(&es->rb_node);
        if (prev)
                rc->left_es = rb_entry(prev, struct extent_status, rb_node);
        else
                rc->left_es = NULL;
}

/*
 * count_rsvd - add the clusters holding delayed and not unwritten (delonly)
 *              blocks of a range within one extent to the running tally
 *              kept in rsvd_count
 *
 * @inode - file containing extent
 * @lblk - first block in range
 * @len - length of range in blocks
 * @es - pointer to extent containing clusters to be counted
 * @rc - pointer to reserved count data
 *
 * Extents that are not delonly are ignored.  Partial clusters at the start
 * and end of extents are tracked across calls so that a cluster spanning
 * adjacent extents is counted only once.
 */
static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
                       struct extent_status *es, struct rsvd_count *rc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t first, last, whole;

        if (!ext4_es_is_delonly(es))
                return;

        WARN_ON(len <= 0);

        /* without bigalloc every block is its own cluster */
        if (sbi->s_cluster_ratio == 1) {
                rc->ndelonly += (int) len;
                return;
        }

        /* bigalloc: clamp the range to the extent */
        first = lblk;
        if (first < es->es_lblk)
                first = es->es_lblk;
        last = lblk + (ext4_lblk_t) len - 1;
        if (last > ext4_es_end(es))
                last = ext4_es_end(es);

        /* remember where the first delonly extent seen begins */
        if (!rc->first_do_lblk_found) {
                rc->first_do_lblk = first;
                rc->first_do_lblk_found = true;
        }

        /* and where the delonly region seen so far ends */
        rc->last_do_lblk = last;

        /*
         * a tracked partial cluster that this extent does not start in
         * is complete: count it and stop tracking
         */
        if (rc->partial && (rc->lclu != EXT4_B2C(sbi, first))) {
                rc->ndelonly++;
                rc->partial = false;
        }

        /*
         * the range starts inside a cluster and reaches that cluster's
         * last block: count the cluster and move to the next boundary
         */
        if (EXT4_LBLK_COFF(sbi, first) != 0 &&
            last >= EXT4_LBLK_CFILL(sbi, first)) {
                rc->ndelonly++;
                rc->partial = false;
                first = EXT4_LBLK_CFILL(sbi, first) + 1;
        }

        /* count the whole clusters that fit in what is left */
        if ((first + sbi->s_cluster_ratio - 1) <= last) {
                whole = (last - first + 1) >> sbi->s_cluster_bits;
                rc->ndelonly += whole;
                first += whole << sbi->s_cluster_bits;
        }

        /*
         * blocks left over at the end form a partial cluster; start
         * tracking it unless one is being tracked already
         */
        if (!rc->partial && first <= last) {
                rc->partial = true;
                rc->lclu = EXT4_B2C(sbi, first);
        }
}

/*
 * __pr_tree_search - look up a pending cluster reservation
 *
 * @root - root of pending reservation tree
 * @lclu - logical cluster to search for
 *
 * Returns the pending reservation for @lclu when there is one.  Failing
 * that, returns the reservation for the next higher cluster in the tree,
 * or NULL when no such reservation exists.
 */
static struct pending_reservation *__pr_tree_search(struct rb_root *root,
                                                    ext4_lblk_t lclu)
{
        struct pending_reservation *last = NULL;
        struct rb_node *cur;

        for (cur = root->rb_node; cur; ) {
                last = rb_entry(cur, struct pending_reservation, rb_node);
                if (lclu == last->lclu)
                        return last;
                cur = (lclu < last->lclu) ? cur->rb_left : cur->rb_right;
        }

        /* empty tree */
        if (!last)
                return NULL;

        /* the walk ended on a neighbour of where @lclu would be */
        if (lclu < last->lclu)
                return last;
        if (lclu > last->lclu) {
                cur = rb_next(&last->rb_node);
                if (cur)
                        return rb_entry(cur, struct pending_reservation,
                                        rb_node);
        }
        return NULL;
}

/*
 * get_rsvd - calculates and returns the number of cluster reservations to be
 *              released when removing a block range from the extent status tree
 *              and releases any pending reservations within the range
 *
 * @inode - file containing block range
 * @end - last block in range
 * @right_es - pointer to extent containing next block beyond end or NULL
 * @rc - pointer to reserved count data
 *
 * The number of reservations to be released is equal to the number of
 * clusters containing delayed and not unwritten (delonly) blocks within
 * the range, minus the number of clusters still containing delonly blocks
 * at the ends of the range, and minus the number of pending reservations
 * within the range.
 *
 * When the cluster ratio is 1 none of these adjustments apply and the tally
 * accumulated in @rc->ndelonly is returned as is.  Otherwise @rc->ndelonly
 * is adjusted in place and pending reservations found in the searched
 * cluster range are erased from the inode's pending tree and freed.
 */
static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
                             struct extent_status *right_es,
                             struct rsvd_count *rc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct pending_reservation *pr;
        struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
        struct rb_node *node;
        ext4_lblk_t first_lclu, last_lclu;
        bool left_delonly, right_delonly, count_pending;
        struct extent_status *es;

        if (sbi->s_cluster_ratio > 1) {
                /* count any remaining partial cluster */
                if (rc->partial)
                        rc->ndelonly++;

                /* no delonly cluster in the range: nothing to release */
                if (rc->ndelonly == 0)
                        return 0;

                first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
                last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);

                /*
                 * decrease the delonly count by the number of clusters at the
                 * ends of the range that still contain delonly blocks -
                 * these clusters still need to be reserved
                 */
                left_delonly = right_delonly = false;

                /*
                 * walk left from the range over the extents that reach into
                 * the cluster of the first delonly block
                 */
                es = rc->left_es;
                while (es && ext4_es_end(es) >=
                       EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
                        if (ext4_es_is_delonly(es)) {
                                rc->ndelonly--;
                                left_delonly = true;
                                break;
                        }
                        node = rb_prev(&es->rb_node);
                        if (!node)
                                break;
                        es = rb_entry(node, struct extent_status, rb_node);
                }
                /*
                 * the right end is skipped when the delonly blocks lie in a
                 * single cluster that the left side already accounted for
                 */
                if (right_es && (!left_delonly || first_lclu != last_lclu)) {
                        if (end < ext4_es_end(right_es)) {
                                es = right_es;
                        } else {
                                node = rb_next(&right_es->rb_node);
                                es = node ? rb_entry(node, struct extent_status,
                                                     rb_node) : NULL;
                        }
                        /*
                         * walk right over the extents that start within the
                         * cluster of the last delonly block
                         */
                        while (es && es->es_lblk <=
                               EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
                                if (ext4_es_is_delonly(es)) {
                                        rc->ndelonly--;
                                        right_delonly = true;
                                        break;
                                }
                                node = rb_next(&es->rb_node);
                                if (!node)
                                        break;
                                es = rb_entry(node, struct extent_status,
                                              rb_node);
                        }
                }

                /*
                 * Determine the block range that should be searched for
                 * pending reservations, if any.  Clusters on the ends of the
                 * original removed range containing delonly blocks are
                 * excluded.  They've already been accounted for and it's not
                 * possible to determine if an associated pending reservation
                 * should be released with the information available in the
                 * extents status tree.
                 */
                if (first_lclu == last_lclu) {
                        if (left_delonly | right_delonly)
                                count_pending = false;
                        else
                                count_pending = true;
                } else {
                        if (left_delonly)
                                first_lclu++;
                        if (right_delonly)
                                last_lclu--;
                        if (first_lclu <= last_lclu)
                                count_pending = true;
                        else
                                count_pending = false;
                }

                /*
                 * a pending reservation found between first_lclu and last_lclu
                 * represents an allocated cluster that contained at least one
                 * delonly block, so the delonly total must be reduced by one
                 * for each pending reservation found and released
                 */
                if (count_pending) {
                        pr = __pr_tree_search(&tree->root, first_lclu);
                        while (pr && pr->lclu <= last_lclu) {
                                rc->ndelonly--;
                                /* fetch the successor before pr is erased */
                                node = rb_next(&pr->rb_node);
                                rb_erase(&pr->rb_node, &tree->root);
                                __free_pending(pr);
                                if (!node)
                                        break;
                                pr = rb_entry(node, struct pending_reservation,
                                              rb_node);
                        }
                }
        }
        return rc->ndelonly;
}


/*
 * __es_remove_extent - removes block range from extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @end - last block in range
 * @reserved - number of cluster reservations released
 * @prealloc - pre-allocated es to avoid memory allocation failures
 *
 * If @reserved is not NULL and delayed allocation is enabled, counts
 * block/cluster reservations freed by removing range and if bigalloc
 * enabled cancels pending reservations as needed. Returns 0 on success,
 * error code on failure.
 *
 * @reserved is only written when at least one extent overlaps the range;
 * the visible caller that passes a non-NULL pointer zeroes it first.
 */
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                              ext4_lblk_t end, int *reserved,
                              struct extent_status *prealloc)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct rb_node *node;
        struct extent_status *es;
        /* snapshot of the first overlapping extent before it is modified */
        struct extent_status orig_es;
        ext4_lblk_t len1, len2;
        ext4_fsblk_t block;
        int err = 0;
        bool count_reserved = true;
        struct rsvd_count rc;

        if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
                count_reserved = false;

        /* nothing to do unless an extent overlaps [lblk, end] */
        es = __es_tree_search(&tree->root, lblk);
        if (!es)
                goto out;
        if (es->es_lblk > end)
                goto out;

        /* Simply invalidate cache_es. */
        tree->cache_es = NULL;
        if (count_reserved)
                init_rsvd(inode, lblk, es, &rc);

        orig_es.es_lblk = es->es_lblk;
        orig_es.es_len = es->es_len;
        orig_es.es_pblk = es->es_pblk;

        /*
         * len1: blocks of the first extent that lie before the range
         * len2: blocks of the first extent that lie beyond the range
         */
        len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
        len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
        if (len1 > 0)
                es->es_len = len1;
        if (len2 > 0) {
                if (len1 > 0) {
                        /*
                         * The range sits strictly inside the extent: keep
                         * the head in es and insert the tail as a new
                         * extent.
                         */
                        struct extent_status newes;

                        newes.es_lblk = end + 1;
                        newes.es_len = len2;
                        /*
                         * placeholder, used unless the original extent is
                         * written or unwritten
                         * NOTE(review): presumably a recognisable poison
                         * value - confirm
                         */
                        block = 0x7FDEADBEEFULL;
                        if (ext4_es_is_written(&orig_es) ||
                            ext4_es_is_unwritten(&orig_es))
                                block = ext4_es_pblock(&orig_es) +
                                        orig_es.es_len - len2;
                        ext4_es_store_pblock_status(&newes, block,
                                                    ext4_es_status(&orig_es));
                        err = __es_insert_extent(inode, &newes, prealloc);
                        if (err) {
                                /*
                                 * A tail that need not be kept is simply
                                 * dropped and the removal counts as done.
                                 */
                                if (!ext4_es_must_keep(&newes))
                                        return 0;

                                /* undo the truncation and report the error */
                                es->es_lblk = orig_es.es_lblk;
                                es->es_len = orig_es.es_len;
                                goto out;
                        }
                } else {
                        /* the range covers the head: keep only the tail */
                        es->es_lblk = end + 1;
                        es->es_len = len2;
                        if (ext4_es_is_written(es) ||
                            ext4_es_is_unwritten(es)) {
                                block = orig_es.es_pblk + orig_es.es_len - len2;
                                ext4_es_store_pblock(es, block);
                        }
                }
                if (count_reserved)
                        count_rsvd(inode, orig_es.es_lblk + len1,
                                   orig_es.es_len - len1 - len2, &orig_es, &rc);
                goto out_get_reserved;
        }

        /* the first extent keeps its head; continue with its successor */
        if (len1 > 0) {
                if (count_reserved)
                        count_rsvd(inode, lblk, orig_es.es_len - len1,
                                   &orig_es, &rc);
                node = rb_next(&es->rb_node);
                if (node)
                        es = rb_entry(node, struct extent_status, rb_node);
                else
                        es = NULL;
        }

        /* erase every extent that lies completely inside the range */
        while (es && ext4_es_end(es) <= end) {
                if (count_reserved)
                        count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
                /* fetch the successor before es is erased and freed */
                node = rb_next(&es->rb_node);
                rb_erase(&es->rb_node, &tree->root);
                ext4_es_free_extent(inode, es);
                if (!node) {
                        es = NULL;
                        break;
                }
                es = rb_entry(node, struct extent_status, rb_node);
        }

        /* a last extent straddling @end loses its head */
        if (es && es->es_lblk < end + 1) {
                ext4_lblk_t orig_len = es->es_len;

                len1 = ext4_es_end(es) - end;
                if (count_reserved)
                        count_rsvd(inode, es->es_lblk, orig_len - len1,
                                   es, &rc);
                es->es_lblk = end + 1;
                es->es_len = len1;
                if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
                        block = es->es_pblk + orig_len - len1;
                        ext4_es_store_pblock(es, block);
                }
        }

out_get_reserved:
        if (count_reserved)
                *reserved = get_rsvd(inode, end, es, &rc);
out:
        return err;
}

/*
 * ext4_es_remove_extent - removes block range from extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @len - number of blocks to remove
 *
 * Reduces block/cluster reservation count and for bigalloc cancels pending
 * reservations as needed.  The removal is repeated, with a spare extent
 * allocated before the lock is retaken, until it succeeds.  Nothing is done
 * if the mount state has EXT4_FC_REPLAY set or if @len is zero.
 */
void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                           ext4_lblk_t len)
{
        struct extent_status *spare = NULL;
        ext4_lblk_t end;
        int released = 0;
        int err = 0;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        trace_ext4_es_remove_extent(inode, lblk, len);
        es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
                 lblk, len, inode->i_ino);

        if (len == 0)
                return;

        end = lblk + len - 1;
        BUG_ON(end < lblk);

        do {
                /* a previous attempt failed: come back with a spare extent */
                if (err && !spare)
                        spare = __es_alloc_extent(true);
                /*
                 * ext4_clear_inode() depends on us taking i_es_lock
                 * unconditionally so that we are sure __es_shrink() is done
                 * with the inode before it is reclaimed.
                 */
                write_lock(&EXT4_I(inode)->i_es_lock);
                err = __es_remove_extent(inode, lblk, end, &released, spare);
                /* Free preallocated extent if it didn't get used. */
                if (spare) {
                        if (!spare->es_len)
                                __es_free_extent(spare);
                        spare = NULL;
                }
                write_unlock(&EXT4_I(inode)->i_es_lock);
        } while (err);

        ext4_es_print_tree(inode);
        ext4_da_release_space(inode, released);
}

/*
 * __es_shrink - reclaim extent status entries from inodes on sbi->s_es_list
 *
 * @sbi - superblock info whose inode list is walked
 * @nr_to_scan - scan budget; decremented by es_reclaim_extents()
 * @locked_ei - inode that must not be trylocked here, or NULL
 *              (NOTE(review): presumably the caller already holds its
 *              i_es_lock - confirm against callers)
 *
 * Walks at most s_es_nr_inode list entries, rotating each visited inode to
 * the tail of the list.  On the first pass inodes flagged
 * EXT4_STATE_EXT_PRECACHED are skipped; if that pass reclaimed nothing and
 * something was skipped, one more pass is made that includes them.  If
 * still nothing was reclaimed and @locked_ei is set, @locked_ei itself is
 * reclaimed from.  Scan time and shrunk-count statistics are updated before
 * returning.
 *
 * Returns the number of extents reclaimed.
 */
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
                       struct ext4_inode_info *locked_ei)
{
        struct ext4_inode_info *ei;
        struct ext4_es_stats *es_stats;
        ktime_t start_time;
        u64 scan_time;
        int nr_to_walk;
        int nr_shrunk = 0;
        int retried = 0, nr_skipped = 0;

        es_stats = &sbi->s_es_stats;
        start_time = ktime_get();

retry:
        spin_lock(&sbi->s_es_lock);
        nr_to_walk = sbi->s_es_nr_inode;
        while (nr_to_walk-- > 0) {
                if (list_empty(&sbi->s_es_list)) {
                        spin_unlock(&sbi->s_es_lock);
                        goto out;
                }
                ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
                                      i_es_list);
                /* Move the inode to the tail */
                list_move_tail(&ei->i_es_list, &sbi->s_es_list);

                /*
                 * Normally we try hard to avoid shrinking precached inodes,
                 * but we will as a last resort.
                 */
                if (!retried && ext4_test_inode_state(&ei->vfs_inode,
                                                EXT4_STATE_EXT_PRECACHED)) {
                        nr_skipped++;
                        continue;
                }

                /* Never block here: skip inodes whose lock is contended. */
                if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
                        nr_skipped++;
                        continue;
                }
                /*
                 * Now we hold i_es_lock which protects us from inode reclaim
                 * freeing inode under us
                 */
                spin_unlock(&sbi->s_es_lock);

                nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
                write_unlock(&ei->i_es_lock);

                /* Budget exhausted; s_es_lock is not held at this point. */
                if (nr_to_scan <= 0)
                        goto out;
                spin_lock(&sbi->s_es_lock);
        }
        spin_unlock(&sbi->s_es_lock);

        /*
         * If we skipped any inodes, and we weren't able to make any
         * forward progress, try again to scan precached inodes.
         */
        if ((nr_shrunk == 0) && nr_skipped && !retried) {
                retried++;
                goto retry;
        }

        /* Last resort: reclaim from the inode that was excluded above. */
        if (locked_ei && nr_shrunk == 0)
                nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);

out:
        scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
        /*
         * Running averages: a new sample gets weight 1/4 and the previous
         * value 3/4.  A zero previous value is replaced by the sample.
         */
        if (likely(es_stats->es_stats_scan_time))
                es_stats->es_stats_scan_time = (scan_time +
                                es_stats->es_stats_scan_time*3) / 4;
        else
                es_stats->es_stats_scan_time = scan_time;
        if (scan_time > es_stats->es_stats_max_scan_time)
                es_stats->es_stats_max_scan_time = scan_time;
        if (likely(es_stats->es_stats_shrunk))
                es_stats->es_stats_shrunk = (nr_shrunk +
                                es_stats->es_stats_shrunk*3) / 4;
        else
                es_stats->es_stats_shrunk = nr_shrunk;

        trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
                             nr_skipped, retried);
        return nr_shrunk;
}

/*
 * ext4_es_count - count_objects callback of the extent status shrinker
 *
 * @shrink - shrinker whose private_data points at the ext4_sb_info
 * @sc - shrink control; only its nr_to_scan is used, for tracing
 *
 * Returns the current (non-negative) approximate value of the
 * es_stats_shk_cnt per-cpu counter.
 */
static unsigned long ext4_es_count(struct shrinker *shrink,
                                   struct shrink_control *sc)
{
        struct ext4_sb_info *sbi = shrink->private_data;
        unsigned long nr_reclaimable;

        nr_reclaimable = percpu_counter_read_positive(
                                &sbi->s_es_stats.es_stats_shk_cnt);
        trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr_reclaimable);

        return nr_reclaimable;
}

/*
 * ext4_es_scan - scan_objects callback of the extent status shrinker
 *
 * @shrink - shrinker whose private_data points at the ext4_sb_info
 * @sc - shrink control supplying the scan budget (nr_to_scan)
 *
 * Runs __es_shrink() over the filesystem's inodes with no inode excluded
 * and emits trace events before and after, each carrying the current
 * es_stats_shk_cnt reading.
 *
 * Returns the number of extents __es_shrink() reclaimed.
 */
static unsigned long ext4_es_scan(struct shrinker *shrink,
                                  struct shrink_control *sc)
{
        struct ext4_sb_info *sbi = shrink->private_data;
        int budget = sc->nr_to_scan;
        int shk_cnt;
        int reclaimed;

        /* Counter snapshot before shrinking, for the trace event only. */
        shk_cnt = percpu_counter_read_positive(
                                &sbi->s_es_stats.es_stats_shk_cnt);
        trace_ext4_es_shrink_scan_enter(sbi->s_sb, budget, shk_cnt);

        reclaimed = __es_shrink(sbi, budget, NULL);

        /* Counter snapshot after shrinking, for the trace event only. */
        shk_cnt = percpu_counter_read_positive(
                                &sbi->s_es_stats.es_stats_shk_cnt);
        trace_ext4_es_shrink_scan_exit(sbi->s_sb, reclaimed, shk_cnt);

        return reclaimed;
}

/*
 * ext4_seq_es_shrinker_info_show - print extent status shrinker statistics
 *
 * @seq - seq_file whose ->private holds the super_block
 * @v - iterator token; output is produced only for SEQ_START_TOKEN
 *
 * Prints the object, reclaimable-object and cache hit/miss counters, the
 * averaged scan time and shrunk count, and - when at least one inode is on
 * s_es_list - the inode with the most extent status objects together with
 * the maximum scan time.
 *
 * Always returns 0.
 */
int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
{
        struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
        struct ext4_es_stats *es_stats = &sbi->s_es_stats;
        struct ext4_inode_info *ei, *max = NULL;
        unsigned int inode_cnt = 0;
        unsigned long max_ino = 0;
        unsigned int max_all_nr = 0, max_shk_nr = 0;

        if (v != SEQ_START_TOKEN)
                return 0;

        /* here we just find an inode that has the max nr. of objects */
        spin_lock(&sbi->s_es_lock);
        list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
                inode_cnt++;
                if (!max || max->i_es_all_nr < ei->i_es_all_nr)
                        max = ei;
        }
        /*
         * Copy what we want to print while s_es_lock still pins the inode
         * on the list.  Once the lock is dropped the inode may leave the
         * list and be freed, so 'max' must not be dereferenced afterwards.
         */
        if (max) {
                max_ino = max->vfs_inode.i_ino;
                max_all_nr = max->i_es_all_nr;
                max_shk_nr = max->i_es_shk_nr;
        }
        spin_unlock(&sbi->s_es_lock);

        seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
                   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
                   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
        seq_printf(seq, "  %lld/%lld cache hits/misses\n",
                   percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
                   percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
        /* inode_cnt is unsigned, so print it with %u */
        if (inode_cnt)
                seq_printf(seq, "  %u inodes on list\n", inode_cnt);

        /* Statistics are kept in nanoseconds; report microseconds. */
        seq_printf(seq, "average:\n  %llu us scan time\n",
            div_u64(es_stats->es_stats_scan_time, 1000));
        seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
        if (inode_cnt)
                seq_printf(seq,
                    "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
                    "  %llu us max scan time\n",
                    max_ino, max_all_nr, max_shk_nr,
                    div_u64(es_stats->es_stats_max_scan_time, 1000));

        return 0;
}

int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
        int err;

        /* Make sure we have enough bits for physical block number */
        BUILD_BUG_ON(ES_SHIFT < 48);
        INIT_LIST_HEAD(&sbi->s_es_list);
        sbi->s_es_nr_inode = 0;
        spin_lock_init(&sbi->s_es_lock);
        sbi->s_es_stats.es_stats_shrunk = 0;
        err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
                                  GFP_KERNEL);
        if (err)
                return err;
        err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
                                  GFP_KERNEL);
        if (err)
                goto err1;
        sbi->s_es_stats.es_stats_scan_time = 0;
        sbi->s_es_stats.es_stats_max_scan_time = 0;
        err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
        if (err)
                goto err2;
        err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
        if (err)
                goto err3;

        sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id);
        if (!sbi->s_es_shrinker) {
                err = -ENOMEM;
                goto err4;
        }

        sbi->s_es_shrinker->scan_objects = ext4_es_scan;
        sbi->s_es_shrinker->count_objects = ext4_es_count;
        sbi->s_es_shrinker->private_data = sbi;

        shrinker_register(sbi->s_es_shrinker);

        return 0;
err4:
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
err3:
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
err2:
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
err1:
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
        return err;
}

/*
 * ext4_es_unregister_shrinker - tear down extent status shrinking
 *
 * @sbi - superblock info set up by ext4_es_register_shrinker()
 *
 * Teardown mirrors setup in reverse.  The shrinker is freed first so that
 * its callbacks (ext4_es_count()/ext4_es_scan(), which read
 * es_stats_shk_cnt) are still registered only while the per-cpu counters
 * exist; the counters are destroyed afterwards.
 */
void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
        shrinker_free(sbi->s_es_shrinker);
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
}

/*
 * es_do_reclaim_extents - reclaim extents of one inode up to a block limit
 *
 * @ei - inode whose extent status tree is scanned
 * @end - last logical block of the interval to scan
 * @nr_to_scan - remaining scan budget; decremented once per visited extent
 * @nr_shrunk - incremented once per reclaimed extent
 *
 * Scanning starts at ei->i_es_shrink_lblk.  Extents for which
 * ext4_es_must_keep() is true are left alone.  A referenced extent only has
 * its referenced state cleared; any other extent is erased from the tree
 * and freed.  ei->i_es_shrink_lblk is updated to where the next scan should
 * continue.
 *
 * Returns 1 if the scan budget ran out, 0 if the end of the tree or of the
 * interval was reached.
 */
static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
                                 int *nr_to_scan, int *nr_shrunk)
{
        struct ext4_es_tree *tree = &ei->i_es_tree;
        struct extent_status *es;
        struct rb_node *next;

        es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);

        while (es && *nr_to_scan > 0) {
                if (es->es_lblk > end) {
                        /* Past the interval: resume right behind it. */
                        ei->i_es_shrink_lblk = end + 1;
                        return 0;
                }

                (*nr_to_scan)--;
                /* Look up the successor first; es may be freed below. */
                next = rb_next(&es->rb_node);

                if (!ext4_es_must_keep(es)) {
                        if (ext4_es_is_referenced(es)) {
                                /* Spare it this time, drop the reference mark. */
                                ext4_es_clear_referenced(es);
                        } else {
                                rb_erase(&es->rb_node, &tree->root);
                                ext4_es_free_extent(&ei->vfs_inode, es);
                                (*nr_shrunk)++;
                        }
                }

                if (next)
                        es = rb_entry(next, struct extent_status, rb_node);
                else
                        es = NULL;
        }

        if (!es) {
                /* Ran off the end of the tree: wrap to the beginning. */
                ei->i_es_shrink_lblk = 0;
                return 0;
        }

        /* Budget exhausted: remember where to pick up next time. */
        ei->i_es_shrink_lblk = es->es_lblk;
        return 1;
}

/*
 * es_reclaim_extents - reclaim extent status entries from a single inode
 *
 * @ei - inode to reclaim from
 * @nr_to_scan - remaining scan budget, updated by es_do_reclaim_extents()
 *
 * Does nothing when the inode has no shrinkable entries (i_es_shk_nr is
 * zero).  A rate-limited warning is logged when the inode is flagged
 * EXT4_STATE_EXT_PRECACHED.  The first pass runs from the saved
 * i_es_shrink_lblk with EXT_MAX_BLOCKS as the limit; if that pass did not
 * exhaust the budget and the starting block was not 0, a second pass is run
 * with the limit set to the block in front of the original starting block.
 * The tree's cache_es pointer is reset afterwards.
 *
 * Returns the number of extents reclaimed.
 */
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
{
        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);
        struct inode *inode = &ei->vfs_inode;
        ext4_lblk_t resume_lblk = ei->i_es_shrink_lblk;
        int reclaimed = 0;
        int budget_exhausted;

        if (!ei->i_es_shk_nr)
                return 0;

        if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED)) {
                if (__ratelimit(&_rs))
                        ext4_warning(inode->i_sb, "forced shrink of precached extents");
        }

        budget_exhausted = es_do_reclaim_extents(ei, EXT_MAX_BLOCKS,
                                                 nr_to_scan, &reclaimed);
        if (!budget_exhausted && resume_lblk != 0)
                es_do_reclaim_extents(ei, resume_lblk - 1, nr_to_scan,
                                      &reclaimed);

        ei->i_es_tree.cache_es = NULL;
        return reclaimed;
}

/*
 * ext4_clear_inode_es - drop all discretionary extent status entries
 *
 * @inode - file whose extent status tree is pruned
 *
 * Called to support EXT4_IOC_CLEAR_ES_CACHE.  Only entries for which
 * ext4_es_must_keep() is false are removed; the remaining ones must stay
 * for proper operation.  Also resets the tree's cache_es pointer and clears
 * EXT4_STATE_EXT_PRECACHED on the inode.  Runs under i_es_lock held for
 * writing.
 */
void ext4_clear_inode_es(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_es_tree *tree = &ei->i_es_tree;
        struct rb_node *cur, *next;

        write_lock(&ei->i_es_lock);
        tree->cache_es = NULL;
        for (cur = rb_first(&tree->root); cur; cur = next) {
                struct extent_status *es =
                        rb_entry(cur, struct extent_status, rb_node);

                /* Grab the successor before es is possibly erased. */
                next = rb_next(cur);
                if (ext4_es_must_keep(es))
                        continue;
                rb_erase(&es->rb_node, &tree->root);
                ext4_es_free_extent(inode, es);
        }
        ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
        write_unlock(&ei->i_es_lock);
}

#ifdef ES_DEBUG__
/*
 * ext4_print_pending_tree - dump an inode's pending reservations
 *
 * @inode - file whose pending reservation tree is printed
 *
 * Debug helper: prints the logical cluster number of every entry of the
 * inode's pending tree, in tree order, at KERN_DEBUG level.  Compiles to
 * nothing unless ES_DEBUG__ is defined.
 */
static void ext4_print_pending_tree(struct inode *inode)
{
        struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
        struct rb_node *node;

        printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
        for (node = rb_first(&tree->root); node; node = rb_next(node)) {
                struct pending_reservation *pr =
                        rb_entry(node, struct pending_reservation, rb_node);

                printk(KERN_DEBUG " %u", pr->lclu);
        }
        printk(KERN_DEBUG "\n");
}
#else
#define ext4_print_pending_tree(inode)
#endif

/*
 * ext4_init_pending - create the slab cache for pending reservations
 *
 * Returns 0 on success and -ENOMEM if the cache cannot be created.
 */
int __init ext4_init_pending(void)
{
        ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT);

        return ext4_pending_cachep ? 0 : -ENOMEM;
}

/*
 * ext4_exit_pending - destroy the slab cache created by ext4_init_pending()
 */
void ext4_exit_pending(void)
{
        kmem_cache_destroy(ext4_pending_cachep);
}

/*
 * ext4_init_pending_tree - initialize a pending reservation tree as empty
 *
 * @tree - tree to initialize; its root is set to RB_ROOT
 */
void ext4_init_pending_tree(struct ext4_pending_tree *tree)
{
        tree->root = RB_ROOT;
}

/*
 * __get_pending - look up a pending reservation by logical cluster
 *
 * @inode - file containing the pending cluster reservation
 * @lclu - logical cluster of interest
 *
 * Descends the inode's pending tree, which is keyed by logical cluster
 * number.  Must be called holding i_es_lock.
 *
 * Returns the matching pending reservation, or NULL if the cluster has no
 * entry in the tree.
 */
static struct pending_reservation *__get_pending(struct inode *inode,
                                                 ext4_lblk_t lclu)
{
        struct rb_node *node = EXT4_I(inode)->i_pending_tree.root.rb_node;

        while (node) {
                struct pending_reservation *pr =
                        rb_entry(node, struct pending_reservation, rb_node);

                if (lclu == pr->lclu)
                        return pr;
                /* Smaller keys live in the left subtree, larger in the right. */
                node = (lclu < pr->lclu) ? node->rb_left : node->rb_right;
        }

        return NULL;
}

/*
 * __insert_pending - add a pending cluster reservation to an inode's set
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the cluster to be added
 * @prealloc - preallocated pending entry; consumed (and set to NULL) only
 *             when a new entry actually has to be inserted
 *
 * The reservation is keyed by the logical cluster containing @lblk.  If no
 * preallocated entry is supplied, one is allocated with
 * __alloc_pending(false).
 *
 * Returns 0 on successful insertion, or when the cluster already has a
 * pending reservation, and -ENOMEM if a needed allocation fails.
 */
static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
                            struct pending_reservation **prealloc)
{
        struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
        ext4_lblk_t lclu = EXT4_B2C(EXT4_SB(inode->i_sb), lblk);
        struct rb_node **link = &tree->root.rb_node;
        struct rb_node *parent = NULL;
        struct pending_reservation *pr;

        /* Find the link where the new node would hang, or an existing entry. */
        while (*link) {
                parent = *link;
                pr = rb_entry(parent, struct pending_reservation, rb_node);

                if (lclu == pr->lclu)
                        return 0;       /* pending reservation already inserted */

                if (lclu < pr->lclu)
                        link = &parent->rb_left;
                else
                        link = &parent->rb_right;
        }

        if (likely(*prealloc == NULL)) {
                pr = __alloc_pending(false);
                if (!pr)
                        return -ENOMEM;
        } else {
                /* Take ownership of the caller's spare entry. */
                pr = *prealloc;
                *prealloc = NULL;
        }
        pr->lclu = lclu;

        rb_link_node(&pr->rb_node, parent, link);
        rb_insert_color(&pr->rb_node, &tree->root);

        return 0;
}

/*
 * __remove_pending - drop a pending cluster reservation from an inode's set
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the pending cluster reservation to be removed
 *
 * Looks up the reservation for the cluster containing @lblk, erases it
 * from the pending tree and frees it.  Silently does nothing if the
 * cluster has no pending reservation.
 */
static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct pending_reservation *pr;

        pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
        if (!pr)
                return;

        rb_erase(&pr->rb_node, &EXT4_I(inode)->i_pending_tree.root);
        __free_pending(pr);
}

/*
 * ext4_remove_pending - locked wrapper around __remove_pending()
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the pending cluster reservation to be removed
 *
 * Takes i_es_lock for writing around the removal so that code outside this
 * file can drop a pending reservation.
 */
void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
{
        struct ext4_inode_info *info = EXT4_I(inode);

        write_lock(&info->i_es_lock);
        __remove_pending(inode, lblk);
        write_unlock(&info->i_es_lock);
}

/*
 * ext4_is_pending - test whether a cluster has a pending reservation
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the cluster
 *
 * Performs the lookup under i_es_lock held for reading.
 *
 * Returns true if the cluster containing @lblk is in the inode's set of
 * pending reservations, and false if not.
 */
bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_lblk_t lclu = EXT4_B2C(EXT4_SB(inode->i_sb), lblk);
        bool found;

        read_lock(&ei->i_es_lock);
        found = __get_pending(inode, lclu) != NULL;
        read_unlock(&ei->i_es_lock);

        return found;
}

/*
 * ext4_es_insert_delayed_block - adds a delayed block to the extents status
 *                                tree, adding a pending reservation where
 *                                needed
 *
 * @inode - file containing the newly added block
 * @lblk - logical block to be added
 * @allocated - indicates whether a physical cluster has been allocated for
 *              the logical cluster that contains the block
 *
 * Returns immediately when EXT4_FC_REPLAY is set in the superblock's
 * s_mount_state.  Otherwise, under i_es_lock, the block at @lblk is first
 * removed from the tree, a one-block EXTENT_STATUS_DELAYED extent is
 * inserted, and - if @allocated - a pending reservation is inserted for
 * @lblk.  If any step fails, the lock is dropped, the objects that step
 * needs are preallocated and the whole sequence is retried until it
 * succeeds; no error is reported to the caller.
 */
void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
                                  bool allocated)
{
        struct extent_status newes;
        int err1 = 0, err2 = 0, err3 = 0;
        struct extent_status *es1 = NULL;
        struct extent_status *es2 = NULL;
        struct pending_reservation *pr = NULL;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
                 lblk, inode->i_ino);

        /* Describe the single delayed block to be inserted. */
        newes.es_lblk = lblk;
        newes.es_len = 1;
        ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
        trace_ext4_es_insert_delayed_block(inode, &newes, allocated);

        ext4_es_insert_extent_check(inode, &newes);

retry:
        /*
         * On a retry, preallocate outside the lock whatever the failed step
         * and the steps after it may need: es1 for the removal, es2 for the
         * insertion, pr for the pending reservation.
         */
        if (err1 && !es1)
                es1 = __es_alloc_extent(true);
        if ((err1 || err2) && !es2)
                es2 = __es_alloc_extent(true);
        if ((err1 || err2 || err3) && allocated && !pr)
                pr = __alloc_pending(true);
        write_lock(&EXT4_I(inode)->i_es_lock);

        err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
        if (err1 != 0)
                goto error;
        /* Free preallocated extent if it didn't get used. */
        if (es1) {
                if (!es1->es_len)
                        __es_free_extent(es1);
                es1 = NULL;
        }

        err2 = __es_insert_extent(inode, &newes, es2);
        if (err2 != 0)
                goto error;
        /* Free preallocated extent if it didn't get used. */
        if (es2) {
                if (!es2->es_len)
                        __es_free_extent(es2);
                es2 = NULL;
        }

        if (allocated) {
                err3 = __insert_pending(inode, lblk, &pr);
                if (err3 != 0)
                        goto error;
                /* __insert_pending() clears pr when it consumed the spare. */
                if (pr) {
                        __free_pending(pr);
                        pr = NULL;
                }
        }
error:
        /* Reached on success as well; the err values decide what follows. */
        write_unlock(&EXT4_I(inode)->i_es_lock);
        if (err1 || err2 || err3)
                goto retry;

        ext4_es_print_tree(inode);
        ext4_print_pending_tree(inode);
        return;
}

/*
 * __es_delayed_clu - count clusters holding delayed-only blocks in a range
 *
 * @inode - file containing block range
 * @start - logical block defining start of range
 * @end - logical block defining end of range
 *
 * Walks the extent status tree from @start and, for every extent for which
 * ext4_es_is_delonly() is true, counts the clusters spanned by the part of
 * the extent that lies inside [@start, @end].  A cluster that is only
 * partly covered is counted as a whole cluster.  A cluster shared by the
 * end of one counted extent and the start of the next is counted only
 * once.
 *
 * Returns the number of such clusters.
 */
static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
                                     ext4_lblk_t end)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct extent_status *es;
        struct rb_node *next;
        /* wider than ext4_lblk_t, so ~0ULL can never equal a real cluster */
        unsigned long long prev_lclu = ~0ULL;
        unsigned int nr_clu = 0;

        es = __es_tree_search(&tree->root, start);

        while (es && es->es_lblk <= end) {
                if (ext4_es_is_delonly(es)) {
                        ext4_lblk_t lo, hi;
                        ext4_lblk_t first_lclu, last_lclu;

                        /* Clip the extent to the requested range. */
                        lo = (es->es_lblk <= start) ? start : es->es_lblk;
                        hi = (ext4_es_end(es) >= end) ? end : ext4_es_end(es);

                        first_lclu = EXT4_B2C(sbi, lo);
                        last_lclu = EXT4_B2C(sbi, hi);

                        nr_clu += last_lclu - first_lclu + 1;
                        /* First cluster was already counted last time. */
                        if (first_lclu == prev_lclu)
                                nr_clu--;
                        prev_lclu = last_lclu;
                }

                next = rb_next(&es->rb_node);
                if (next)
                        es = rb_entry(next, struct extent_status, rb_node);
                else
                        es = NULL;
        }

        return nr_clu;
}

/*
 * ext4_es_delayed_clu - locked wrapper around __es_delayed_clu()
 *
 * @inode - file containing block range
 * @lblk - logical block defining start of range
 * @len - number of blocks in range
 *
 * Counts, under i_es_lock held for reading, the clusters in the range that
 * contain delayed-only blocks as determined by __es_delayed_clu().  Warns
 * if the range wraps around the logical block space.
 *
 * Returns the cluster count, or 0 for an empty range.
 */
unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t len)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_lblk_t last;
        unsigned int nr_clu;

        if (!len)
                return 0;

        last = lblk + len - 1;
        WARN_ON(last < lblk);

        read_lock(&ei->i_es_lock);
        nr_clu = __es_delayed_clu(inode, lblk, last);
        read_unlock(&ei->i_es_lock);

        return nr_clu;
}

/*
 * __revise_pending - makes, cancels, or leaves unchanged pending cluster
 *                    reservations for a specified block range depending
 *                    upon the presence or absence of delayed blocks
 *                    outside the range within clusters at the ends of the
 *                    range
 *
 * @inode - file containing the range
 * @lblk - logical block defining the start of range
 * @len  - length of range in blocks
 * @prealloc - preallocated pending entry
 *
 * Used after a newly allocated extent is added to the extents status tree.
 * Requires that the extents in the range have either written or unwritten
 * status.  Must be called while holding i_es_lock.
 *
 * Returns 0 on success (including a zero-length range) or the negative
 * error returned by __insert_pending().
 */
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
                            ext4_lblk_t len,
                            struct pending_reservation **prealloc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t end = lblk + len - 1;
        ext4_lblk_t first, last;
        /* Set when the scan helper reports a match below/above the range. */
        bool f_del = false, l_del = false;
        int ret = 0;

        if (len == 0)
                return 0;

        /*
         * Two cases - block range within single cluster and block range
         * spanning two or more clusters.  Note that a cluster belonging
         * to a range starting and/or ending on a cluster boundary is treated
         * as if it does not contain a delayed extent.  The new range may
         * have allocated space for previously delayed blocks out to the
         * cluster boundary, requiring that any pre-existing pending
         * reservation be canceled.  Because this code only looks at blocks
         * outside the range, it should revise pending reservations
         * correctly even if the extent represented by the range can't be
         * inserted in the extents status tree due to ENOSPC.
         */

        if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
                /*
                 * Single cluster: check the blocks in front of the range
                 * first, and only if nothing is found there check the
                 * blocks behind it.
                 */
                first = EXT4_LBLK_CMASK(sbi, lblk);
                if (first != lblk)
                        f_del = __es_scan_range(inode, &ext4_es_is_delonly,
                                                first, lblk - 1);
                if (f_del) {
                        ret = __insert_pending(inode, first, prealloc);
                        if (ret < 0)
                                goto out;
                } else {
                        last = EXT4_LBLK_CMASK(sbi, end) +
                               sbi->s_cluster_ratio - 1;
                        if (last != end)
                                l_del = __es_scan_range(inode,
                                                        &ext4_es_is_delonly,
                                                        end + 1, last);
                        if (l_del) {
                                ret = __insert_pending(inode, last, prealloc);
                                if (ret < 0)
                                        goto out;
                        } else
                                __remove_pending(inode, last);
                }
        } else {
                /*
                 * Several clusters: the first and the last cluster are
                 * handled independently of each other.
                 */
                first = EXT4_LBLK_CMASK(sbi, lblk);
                if (first != lblk)
                        f_del = __es_scan_range(inode, &ext4_es_is_delonly,
                                                first, lblk - 1);
                if (f_del) {
                        ret = __insert_pending(inode, first, prealloc);
                        if (ret < 0)
                                goto out;
                } else
                        __remove_pending(inode, first);

                last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
                if (last != end)
                        l_del = __es_scan_range(inode, &ext4_es_is_delonly,
                                                end + 1, last);
                if (l_del) {
                        ret = __insert_pending(inode, last, prealloc);
                        if (ret < 0)
                                goto out;
                } else
                        __remove_pending(inode, last);
        }
out:
        return ret;
}


































































































































































































    3 













































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008 IBM Corporation
 *
 * Author: Mimi Zohar <zohar@us.ibm.com>
 *
 * File: ima_api.c
 *        Implements must_appraise_or_measure, collect_measurement,
 *        appraise_measurement, store_measurement and store_template.
 */
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/xattr.h>
#include <linux/evm.h>
#include <linux/fsverity.h>

#include "ima.h"

/*
 * ima_free_template_entry - release a template entry and everything it owns
 *
 * Frees the data buffer of each of the entry's template fields (as many as
 * its template descriptor declares), then the digests array, then the
 * entry itself.  @entry must not be NULL.
 */
void ima_free_template_entry(struct ima_template_entry *entry)
{
        int field = 0;

        while (field < entry->template_desc->num_fields) {
                kfree(entry->template_data[field].data);
                field++;
        }

        kfree(entry->digests);
        kfree(entry);
}

/*
 * ima_alloc_init_template - create and initialize a new template entry
 *
 * Uses @desc when given, otherwise the descriptor returned by
 * ima_template_desc_current(). Every field of the descriptor is initialized
 * from @event_data and the accumulated length is kept in template_data_len.
 *
 * Returns 0 with *@entry pointing at the new entry. On any failure *@entry
 * is set to NULL, nothing stays allocated, and the error code is returned.
 */
int ima_alloc_init_template(struct ima_event_data *event_data,
                            struct ima_template_entry **entry,
                            struct ima_template_desc *desc)
{
        struct ima_template_desc *template_desc;
        struct ima_template_entry *new_entry;
        struct tpm_digest *digests;
        int idx, rc;

        template_desc = desc ? desc : ima_template_desc_current();

        new_entry = kzalloc(struct_size(new_entry, template_data,
                                        template_desc->num_fields), GFP_NOFS);
        *entry = new_entry;
        if (!new_entry)
                return -ENOMEM;

        digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots,
                          sizeof(*digests), GFP_NOFS);
        if (!digests) {
                kfree(new_entry);
                *entry = NULL;
                return -ENOMEM;
        }

        new_entry->digests = digests;
        new_entry->template_desc = template_desc;

        for (idx = 0; idx < template_desc->num_fields; idx++) {
                u32 field_len;

                rc = template_desc->fields[idx]->field_init(event_data,
                                        &new_entry->template_data[idx]);
                if (rc != 0) {
                        /* Frees the fields initialized so far as well. */
                        ima_free_template_entry(new_entry);
                        *entry = NULL;
                        return rc;
                }

                /* Each field contributes its length word plus its data. */
                field_len = new_entry->template_data[idx].len;
                new_entry->template_data_len += sizeof(field_len);
                new_entry->template_data_len += field_len;
        }

        return 0;
}

/*
 * ima_store_template - store ima template measurements
 *
 * Calculate the hash of a template entry, add the template entry
 * to an ordered list of measurement entries maintained inside the kernel,
 * and also update the aggregate integrity value (maintained inside the
 * configured TPM PCR) over the hashes of the current list of measurement
 * entries.
 *
 * Applications retrieve the current kernel-held measurement list through
 * the securityfs entries in /sys/kernel/security/ima. The signed aggregate
 * TPM PCR (called quote) can be retrieved using a TPM user space library
 * and is used to validate the measurement list.
 *
 * Returns 0 on success, error code otherwise
 */
int ima_store_template(struct ima_template_entry *entry,
                       int violation, struct inode *inode,
                       const unsigned char *filename, int pcr)
{
        static const char op[] = "add_template_measure";
        static const char audit_cause[] = "hashing_error";
        char *template_name = entry->template_desc->name;
        int rc;

        /* The template hash is only calculated for non-violation entries. */
        rc = violation ? 0 : ima_calc_field_array_hash(entry->template_data,
                                                       entry);
        if (rc < 0) {
                integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode,
                                    template_name, op,
                                    audit_cause, rc, 0);
                return rc;
        }

        entry->pcr = pcr;
        return ima_add_template_entry(entry, violation, op, inode, filename);
}

/*
 * ima_add_violation - add violation to measurement list.
 *
 * Violations are flagged in the measurement list with zero hash values.
 * By extending the PCR with 0xFF's instead of with zeroes, the PCR
 * value is invalidated.
 */
void ima_add_violation(struct file *file, const unsigned char *filename,
                       struct ima_iint_cache *iint, const char *op,
                       const char *cause)
{
        struct inode *inode = file_inode(file);
        struct ima_template_entry *entry;
        struct ima_event_data event_data = { .iint = iint,
                                             .file = file,
                                             .filename = filename,
                                             .violation = cause };
        int result;

        /* can overflow, only indicator */
        atomic_long_inc(&ima_htable.violations);

        if (ima_alloc_init_template(&event_data, &entry, NULL) < 0) {
                /* Any template setup failure is audited as -ENOMEM. */
                result = -ENOMEM;
        } else {
                result = ima_store_template(entry, 1, inode, filename,
                                            CONFIG_IMA_MEASURE_PCR_IDX);
                /* The entry is only freed here when storing it failed. */
                if (result < 0)
                        ima_free_template_entry(entry);
        }

        /* The audit record is emitted on success and on failure alike. */
        integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename,
                            op, cause, result, 0);
}

/**
 * ima_get_action - appraise & measure decision based on policy.
 * @idmap: idmap of the mount the inode was found from
 * @inode: pointer to the inode associated with the object being validated
 * @cred: pointer to credentials structure to validate
 * @secid: secid of the task being validated
 * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXEC,
 *        MAY_APPEND)
 * @func: caller identifier
 * @pcr: pointer filled in if matched measure policy sets pcr=
 * @template_desc: pointer filled in if matched measure policy sets template=
 * @func_data: func specific data, may be NULL
 * @allowed_algos: allowlist of hash algorithms for the IMA xattr
 *
 * The policy is defined in terms of keypairs:
 *                subj=, obj=, type=, func=, mask=, fsmagic=
 *        subj,obj, and type: are LSM specific.
 *        func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK
 *        | KEXEC_CMDLINE | KEY_CHECK | CRITICAL_DATA | SETXATTR_CHECK
 *        | MMAP_CHECK_REQPROT
 *        mask: contains the permission mask
 *        fsmagic: hex value
 *
 * Returns IMA_MEASURE, IMA_APPRAISE mask.
 *
 */
int ima_get_action(struct mnt_idmap *idmap, struct inode *inode,
                   const struct cred *cred, u32 secid, int mask,
                   enum ima_hooks func, int *pcr,
                   struct ima_template_desc **template_desc,
                   const char *func_data, unsigned int *allowed_algos)
{
        /* Restrict the candidate actions to the bits set in ima_policy_flag. */
        int flags = ima_policy_flag &
                    (IMA_MEASURE | IMA_AUDIT | IMA_APPRAISE | IMA_HASH);

        return ima_match_policy(idmap, inode, cred, secid, func, mask,
                                flags, pcr, template_desc, func_data,
                                allowed_algos);
}

static bool ima_get_verity_digest(struct ima_iint_cache *iint,
                                  struct inode *inode,
                                  struct ima_max_digest_data *hash)
{
        enum hash_algo alg;
        int digest_len;

        /*
         * On failure, 'measure' policy rules will result in a file data
         * hash containing 0's.
         */
        digest_len = fsverity_get_digest(inode, hash->digest, NULL, &alg);
        if (digest_len == 0)
                return false;

        /*
         * Unlike in the case of actually calculating the file hash, in
         * the fsverity case regardless of the hash algorithm, return
         * the verity digest to be included in the measurement list. A
         * mismatch between the verity algorithm and the xattr signature
         * algorithm, if one exists, will be detected later.
         */
        hash->hdr.algo = alg;
        hash->hdr.length = digest_len;
        return true;
}

/*
 * ima_collect_measurement - collect file measurement
 * @iint:   integrity cache entry that receives the digest, version and flags
 * @file:   file being measured
 * @buf:    in-memory copy of the data to hash, or NULL to hash through @file
 * @size:   number of bytes in @buf
 * @algo:   hash algorithm used when the digest is calculated here
 * @modsig: appended signature to collect alongside the digest, may be NULL
 *
 * Calculate the file hash, if it doesn't already exist,
 * storing the measurement and i_version in the iint.
 *
 * A failure to query the change cookie is not a collection failure: the
 * stored i_version is simply left at 0. In particular it must not leak into
 * the return value of the fs-verity path, which does not otherwise assign
 * the result on success.
 *
 * Must be called with iint->mutex held.
 *
 * Return 0 on success, error code otherwise
 */
int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file,
                            void *buf, loff_t size, enum hash_algo algo,
                            struct modsig *modsig)
{
        const char *audit_cause = "failed";
        struct inode *inode = file_inode(file);
        struct inode *real_inode = d_real_inode(file_dentry(file));
        struct ima_max_digest_data hash;
        /* Header view of 'hash', in the form the hash helpers expect. */
        struct ima_digest_data *hash_hdr = container_of(&hash.hdr,
                                                struct ima_digest_data, hdr);
        struct name_snapshot filename;
        struct kstat stat;
        int result = 0;
        int length;
        void *tmpbuf;
        u64 i_version = 0;

        /*
         * Always collect the modsig, because IMA might have already collected
         * the file digest without collecting the modsig in a previous
         * measurement rule.
         */
        if (modsig)
                ima_collect_modsig(modsig, buf, size);

        if (iint->flags & IMA_COLLECTED)
                goto out;

        /*
         * Detecting file change is based on i_version. On filesystems
         * which do not support i_version, support was originally limited
         * to an initial measurement/appraisal/audit, but was modified to
         * assume the file changed.
         *
         * The getattr status is deliberately not stored in 'result': only
         * the digest calculation below decides whether collection failed.
         */
        if (!vfs_getattr_nosec(&file->f_path, &stat, STATX_CHANGE_COOKIE,
                               AT_STATX_SYNC_AS_STAT) &&
            (stat.result_mask & STATX_CHANGE_COOKIE))
                i_version = stat.change_cookie;
        hash.hdr.algo = algo;
        hash.hdr.length = hash_digest_size[algo];

        /* Initialize hash digest to 0's in case of failure */
        memset(&hash.digest, 0, sizeof(hash.digest));

        if (iint->flags & IMA_VERITY_REQUIRED) {
                if (!ima_get_verity_digest(iint, inode, &hash)) {
                        audit_cause = "no-verity-digest";
                        result = -ENODATA;
                }
        } else if (buf) {
                result = ima_calc_buffer_hash(buf, size, hash_hdr);
        } else {
                result = ima_calc_file_hash(file, hash_hdr);
        }

        /*
         * -EBADF and -EINVAL still fall through and store the digest, but
         * IMA_COLLECTED is not set for them below.
         */
        if (result && result != -EBADF && result != -EINVAL)
                goto out;

        /* Resize via a temporary so iint->ima_hash survives a failure. */
        length = sizeof(hash.hdr) + hash.hdr.length;
        tmpbuf = krealloc(iint->ima_hash, length, GFP_NOFS);
        if (!tmpbuf) {
                result = -ENOMEM;
                goto out;
        }

        iint->ima_hash = tmpbuf;
        memcpy(iint->ima_hash, &hash, length);
        if (real_inode == inode)
                iint->real_inode.version = i_version;
        else
                integrity_inode_attrs_store(&iint->real_inode, i_version,
                                            real_inode);

        /* Possibly temporary failure due to type of read (eg. O_DIRECT) */
        if (!result)
                iint->flags |= IMA_COLLECTED;
out:
        if (result) {
                if (file->f_flags & O_DIRECT)
                        audit_cause = "failed(directio)";

                /* Snapshot the name so it stays stable while it is logged. */
                take_dentry_name_snapshot(&filename, file->f_path.dentry);

                integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode,
                                    filename.name.name, "collect_data",
                                    audit_cause, result, 0);

                release_dentry_name_snapshot(&filename);
        }
        return result;
}

/*
 * ima_store_measurement - store file measurement
 * @iint:          integrity cache entry of the file
 * @file:          file being measured
 * @filename:      name recorded in the event data and audit message
 * @xattr_value:   xattr data passed on to the template fields
 * @xattr_len:     length of @xattr_value
 * @modsig:        appended signature passed on to the template fields,
 *                 may be NULL
 * @pcr:           PCR index the entry is stored for
 * @template_desc: template descriptor handed to ima_alloc_init_template()
 *
 * Create an "ima" template and then store the template by calling
 * ima_store_template.
 *
 * We only get here if the inode has not already been measured,
 * but the measurement could already exist:
 *        - multiple copies of the same file on either the same or
 *          different filesystems.
 *        - the inode was previously flushed as well as the iint info,
 *          containing the hashing info.
 *
 * Must be called with iint->mutex held.
 */
void ima_store_measurement(struct ima_iint_cache *iint, struct file *file,
                           const unsigned char *filename,
                           struct evm_ima_xattr_data *xattr_value,
                           int xattr_len, const struct modsig *modsig, int pcr,
                           struct ima_template_desc *template_desc)
{
        static const char op[] = "add_template_measure";
        static const char audit_cause[] = "ENOMEM";
        /*
         * Build the mask as unsigned long: shifting a plain int 0x1 is
         * undefined for pcr >= 31 and sign-extends when widened.
         * NOTE(review): assumes iint->measured_pcrs is no wider than
         * unsigned long - confirm against its declaration.
         */
        const unsigned long pcr_mask = 1UL << pcr;
        struct inode *inode = file_inode(file);
        struct ima_template_entry *entry;
        struct ima_event_data event_data = { .iint = iint,
                                             .file = file,
                                             .filename = filename,
                                             .xattr_value = xattr_value,
                                             .xattr_len = xattr_len,
                                             .modsig = modsig };
        int violation = 0;
        int result;

        /*
         * We still need to store the measurement in the case of MODSIG because
         * we only have its contents to put in the list at the time of
         * appraisal, but a file measurement from earlier might already exist in
         * the measurement list.
         */
        if ((iint->measured_pcrs & pcr_mask) && !modsig)
                return;

        result = ima_alloc_init_template(&event_data, &entry, template_desc);
        if (result < 0) {
                integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename,
                                    op, audit_cause, result, 0);
                return;
        }

        result = ima_store_template(entry, violation, inode, filename, pcr);
        /* -EEXIST also counts as measured; O_DIRECT opens are never marked. */
        if ((!result || result == -EEXIST) && !(file->f_flags & O_DIRECT)) {
                iint->flags |= IMA_MEASURED;
                iint->measured_pcrs |= pcr_mask;
        }
        /* The entry is freed here on any store error, -EEXIST included. */
        if (result < 0)
                ima_free_template_entry(entry);
}

/*
 * ima_audit_measurement - emit an audit record with the file hash
 *
 * Logs @filename together with "<algo name>:<hex digest>" taken from
 * iint->ima_hash, then sets IMA_AUDITED. Does nothing if IMA_AUDITED is
 * already set; silently gives up if either allocation fails.
 */
void ima_audit_measurement(struct ima_iint_cache *iint,
                           const unsigned char *filename)
{
        const char *algo_name = hash_algo_name[iint->ima_hash->algo];
        struct audit_buffer *ab;
        char *hex;
        int pos;

        if (iint->flags & IMA_AUDITED)
                return;

        /* Two hex characters per digest byte plus the terminator. */
        hex = kzalloc((iint->ima_hash->length * 2) + 1, GFP_KERNEL);
        if (!hex)
                return;

        for (pos = 0; pos < iint->ima_hash->length; pos++)
                hex_byte_pack(&hex[pos * 2], iint->ima_hash->digest[pos]);
        hex[pos * 2] = '\0';

        ab = audit_log_start(audit_context(), GFP_KERNEL,
                             AUDIT_INTEGRITY_RULE);
        if (ab) {
                audit_log_format(ab, "file=");
                audit_log_untrustedstring(ab, filename);
                audit_log_format(ab, " hash=\"%s:%s\"", algo_name, hex);

                audit_log_task_info(ab);
                audit_log_end(ab);

                iint->flags |= IMA_AUDITED;
        }

        kfree(hex);
}

/*
 * ima_d_path - return a pointer to the full pathname
 *
 * Attempt to return a pointer to the full pathname for use in the
 * IMA measurement list, IMA audit records, and auditing logs.
 *
 * On failure, return a pointer to a copy of the filename, not dname.
 * Returning a pointer to dname, could result in using the pointer
 * after the memory has been freed.
 */
const char *ima_d_path(const struct path *path, char **pathbuf, char *namebuf)
{
        struct name_snapshot filename;
        char *pathname = NULL;

        *pathbuf = __getname();
        if (*pathbuf) {
                pathname = d_absolute_path(path, *pathbuf, PATH_MAX);
                if (IS_ERR(pathname)) {
                        __putname(*pathbuf);
                        *pathbuf = NULL;
                        pathname = NULL;
                }
        }

        if (!pathname) {
                take_dentry_name_snapshot(&filename, path->dentry);
                strscpy(namebuf, filename.name.name, NAME_MAX);
                release_dentry_name_snapshot(&filename);

                pathname = namebuf;
        }

        return pathname;
}












































































































































































































































































































































































































    3 























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
/*
 *  linux/include/linux/console.h
 *
 *  Copyright (C) 1993        Hamish Macdonald
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file COPYING in the main directory of this archive
 * for more details.
 *
 * Changed:
 * 10-Mar-94: Arno Griffioen: Conversion for vt100 emulator port from PC LINUX
 */

#ifndef _LINUX_CONSOLE_H_
#define _LINUX_CONSOLE_H_ 1

#include <linux/atomic.h>
#include <linux/bits.h>
#include <linux/rculist.h>
#include <linux/types.h>
#include <linux/vesa.h>

struct vc_data;
struct console_font_op;
struct console_font;
struct module;
struct tty_struct;
struct notifier_block;

/* Scroll direction, passed as @dir to the consw::con_scroll callback. */
enum con_scroll {
        SM_UP,
        SM_DOWN,
};

enum vc_intensity;

/**
 * struct consw - callbacks for consoles
 *
 * @owner:      the module to get references of when this console is used
 * @con_startup: set up the console and return its name (like VGA, EGA, ...)
 * @con_init:   initialize the console on @vc. @init is true for the very first
 *                call on this @vc.
 * @con_deinit: deinitialize the console from @vc.
 * @con_clear:  erase @count characters at [@x, @y] on @vc. @count >= 1.
 * @con_putc:   emit one character with attributes @ca to [@x, @y] on @vc.
 *                (optional -- @con_putcs would be called instead)
 * @con_putcs:  emit @count characters with attributes @s to [@x, @y] on @vc.
 * @con_cursor: enable/disable cursor depending on @enable
 * @con_scroll: move lines from @top to @bottom in direction @dir by @lines.
 *                Return true if no generic handling should be done.
 *                Invoked by csi_M and printing to the console.
 * @con_switch: notifier about the console switch; it is supposed to return
 *                true if a redraw is needed.
 * @con_blank:  blank/unblank the console. The target mode is passed in @blank.
 *                @mode_switch is set if changing from/to text/graphics. The hook
 *                is supposed to return true if a redraw is needed.
 * @con_font_set: set console @vc font to @font with height @vpitch. @flags can
 *                be %KD_FONT_FLAG_DONT_RECALC. (optional)
 * @con_font_get: fetch the current font on @vc of height @vpitch into @font.
 *                (optional)
 * @con_font_default: set default font on @vc. @name can be %NULL or font name
 *                to search for. @font can be filled back. (optional)
 * @con_resize:        resize the @vc console to @width x @height. @from_user is true
 *                when this change comes from the user space.
 * @con_set_palette: sets the palette of the console @vc to @table (optional)
 * @con_scrolldelta: the contents of the console should be scrolled by @lines.
 *                     Invoked by user. (optional)
 * @con_set_origin: set origin (see &vc_data::vc_origin) of the @vc. If not
 *                provided or returns false, the origin is set to
 *                @vc->vc_screenbuf. (optional)
 * @con_save_screen: save screen content into @vc->vc_screenbuf. Called e.g.
 *                upon entering graphics. (optional)
 * @con_build_attr: build attributes based on @color, @intensity and other
 *                parameters. The result is used for both normal and erase
 *                characters. (optional)
 * @con_invert_region: invert a region of length @count on @vc starting at @p.
 *                (optional)
 * @con_debug_enter: prepare the console for the debugger. This includes, but
 *                is not limited to, unblanking the console, loading an
 *                appropriate palette, and allowing debugger generated output.
 *                (optional)
 * @con_debug_leave: restore the console to its pre-debug state as closely as
 *                possible. (optional)
 */
struct consw {
        struct module *owner;

        /* Driver set-up and tear-down */
        const char *(*con_startup)(void);
        void        (*con_init)(struct vc_data *vc, bool init);
        void        (*con_deinit)(struct vc_data *vc);

        /* Character output; note that coordinates are passed as (y, x) */
        void        (*con_clear)(struct vc_data *vc, unsigned int y,
                             unsigned int x, unsigned int count);
        void        (*con_putc)(struct vc_data *vc, u16 ca, unsigned int y,
                            unsigned int x);
        void        (*con_putcs)(struct vc_data *vc, const u16 *s,
                             unsigned int count, unsigned int ypos,
                             unsigned int xpos);
        void        (*con_cursor)(struct vc_data *vc, bool enable);

        /* Hooks with a bool result; its meaning is given in the kernel-doc */
        bool        (*con_scroll)(struct vc_data *vc, unsigned int top,
                        unsigned int bottom, enum con_scroll dir,
                        unsigned int lines);
        bool        (*con_switch)(struct vc_data *vc);
        bool        (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank,
                             bool mode_switch);

        /* Font handling (all three are optional) */
        int        (*con_font_set)(struct vc_data *vc,
                                const struct console_font *font,
                                unsigned int vpitch, unsigned int flags);
        int        (*con_font_get)(struct vc_data *vc, struct console_font *font,
                        unsigned int vpitch);
        int        (*con_font_default)(struct vc_data *vc,
                        struct console_font *font, const char *name);

        int     (*con_resize)(struct vc_data *vc, unsigned int width,
                              unsigned int height, bool from_user);

        /* Everything below is documented as optional */
        void        (*con_set_palette)(struct vc_data *vc,
                        const unsigned char *table);
        void        (*con_scrolldelta)(struct vc_data *vc, int lines);
        bool        (*con_set_origin)(struct vc_data *vc);
        void        (*con_save_screen)(struct vc_data *vc);
        u8        (*con_build_attr)(struct vc_data *vc, u8 color,
                        enum vc_intensity intensity,
                        bool blink, bool underline, bool reverse, bool italic);
        void        (*con_invert_region)(struct vc_data *vc, u16 *p, int count);
        void        (*con_debug_enter)(struct vc_data *vc);
        void        (*con_debug_leave)(struct vc_data *vc);
};

extern const struct consw *conswitchp;

extern const struct consw dummy_con;        /* dummy console buffer */
extern const struct consw vga_con;        /* VGA text console */
extern const struct consw newport_con;        /* SGI Newport console  */

struct screen_info;
/*
 * vgacon_register_screen() is only declared as a real function when
 * CONFIG_VGA_CONSOLE is set. Otherwise it is an empty inline stub, so
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_VGA_CONSOLE
void vgacon_register_screen(struct screen_info *si);
#else
static inline void vgacon_register_screen(struct screen_info *si) { }
#endif

int con_is_bound(const struct consw *csw);
int do_unregister_con_driver(const struct consw *csw);
int do_take_over_console(const struct consw *sw, int first, int last, int deflt);
void give_up_console(const struct consw *sw);
/*
 * Debugger entry/exit hooks. They are only real functions when CONFIG_VT is
 * set; without it both are empty inline stubs. Note the asymmetry:
 * con_debug_enter() takes the vc, con_debug_leave() takes no argument.
 */
#ifdef CONFIG_VT
void con_debug_enter(struct vc_data *vc);
void con_debug_leave(void);
#else
static inline void con_debug_enter(struct vc_data *vc) { }
static inline void con_debug_leave(void) { }
#endif

/*
 * The interface for a console, or any other device that wants to capture
 * console messages (printer driver?)
 */

/**
 * enum cons_flags - General console flags
 * @CON_PRINTBUFFER:        Used by newly registered consoles to avoid duplicate
 *                        output of messages that were already shown by boot
 *                        consoles or read by userspace via syslog() syscall.
 * @CON_CONSDEV:        Indicates that the console driver is backing
 *                        /dev/console.
 * @CON_ENABLED:        Indicates if a console is allowed to print records. If
 *                        false, the console also will not advance to later
 *                        records.
 * @CON_BOOT:                Marks the console driver as early console driver which
 *                        is used during boot before the real driver becomes
 *                        available. It will be automatically unregistered
 *                        when the real console driver is registered unless
 *                        "keep_bootcon" parameter is used.
 * @CON_ANYTIME:        A misnamed historical flag which tells the core code
 *                        that the legacy @console::write callback can be invoked
 *                        on a CPU which is marked OFFLINE. That is misleading as
 *                        it suggests that there is no contextual limit for
 *                        invoking the callback. The original motivation was
 *                        readiness of the per-CPU areas.
 * @CON_BRL:                Indicates a braille device which is exempt from
 *                        receiving the printk spam for obvious reasons.
 * @CON_EXTENDED:        The console supports the extended output format of
 *                        /dev/kmesg which requires a larger output buffer.
 * @CON_SUSPENDED:        Indicates if a console is suspended. If true, the
 *                        printing callbacks must not be called.
 * @CON_NBCON:                Console can operate outside of the legacy style console_lock
 *                        constraints.
 *
 * Each flag occupies its own bit, so flags can be combined in a bitmask.
 */
enum cons_flags {
        CON_PRINTBUFFER                = BIT(0),
        CON_CONSDEV                = BIT(1),
        CON_ENABLED                = BIT(2),
        CON_BOOT                = BIT(3),
        CON_ANYTIME                = BIT(4),
        CON_BRL                        = BIT(5),
        CON_EXTENDED                = BIT(6),
        CON_SUSPENDED                = BIT(7),
        CON_NBCON                = BIT(8),
};

/**
 * struct nbcon_state - console state for nbcon consoles
 * @atom:        Compound of the state fields for atomic operations
 *
 * @req_prio:                The priority of a handover request
 * @prio:                The priority of the current owner
 * @unsafe:                Console is busy in a non takeover region
 * @unsafe_takeover:        A hostile takeover in an unsafe state happened in the
 *                        past. The console cannot be safe until re-initialized.
 * @cpu:                The CPU on which the owner runs
 *
 * To be used for reading and preparing of the value stored in the nbcon
 * state variable @console::nbcon_state.
 *
 * The @prio and @req_prio fields are particularly important to allow
 * spin-waiting to timeout and give up without the risk of a waiter being
 * assigned the lock after giving up.
 */
struct nbcon_state {
        /*
         * The anonymous union overlays the individual bit-fields on @atom,
         * so the whole state can be read or written as one unsigned int.
         */
        union {
                unsigned int        atom;
                struct {
                        /* 2 + 2 + 1 + 1 + 24 = 30 bits in total */
                        unsigned int prio                :  2;
                        unsigned int req_prio                :  2;
                        unsigned int unsafe                :  1;
                        unsigned int unsafe_takeover        :  1;
                        unsigned int cpu                : 24;
                };
        };
};

/*
 * The nbcon_state struct is used to easily create and interpret values that
 * are stored in the @console::nbcon_state variable. Ensure this struct stays
 * within the size boundaries of the atomic variable's underlying type in
 * order to avoid any accidental truncation.
 */
static_assert(sizeof(struct nbcon_state) <= sizeof(int));

/**
 * enum nbcon_prio - console owner priority for nbcon consoles
 * @NBCON_PRIO_NONE:                Unused
 * @NBCON_PRIO_NORMAL:                Normal (non-emergency) usage
 * @NBCON_PRIO_EMERGENCY:        Emergency output (WARN/OOPS...)
 * @NBCON_PRIO_PANIC:                Panic output
 * @NBCON_PRIO_MAX:                The number of priority levels
 *
 * A higher priority context can takeover the console when it is
 * in the safe state. The final attempt to flush consoles in panic()
 * can be allowed to do so even in an unsafe state (Hope and pray).
 */
enum nbcon_prio {
        /* Real priorities are 0..3, which fit a 2-bit field. */
        NBCON_PRIO_NONE = 0,
        NBCON_PRIO_NORMAL,
        NBCON_PRIO_EMERGENCY,
        NBCON_PRIO_PANIC,
        /* Count of the levels above (4), not a priority itself. */
        NBCON_PRIO_MAX,
};

struct console;
struct printk_buffers;

/**
 * struct nbcon_context - Context for console acquire/release
 * @console:                        The associated console
 * @spinwait_max_us:                Limit for spin-wait acquire
 * @prio:                        Priority of the context
 * @allow_unsafe_takeover:        Allow performing takeover even if unsafe. Can
 *                                be used only with NBCON_PRIO_PANIC @prio. It
 *                                might cause a system freeze when the console
 *                                is used later.
 * @backlog:                        Ringbuffer has pending records
 * @pbufs:                        Pointer to the text buffer for this context
 * @seq:                        The sequence number to print for this context
 */
/*
 * Members are grouped by who fills them in: the caller, the emit path, or
 * the acquire path. The two flags are single-bit bit-fields.
 */
struct nbcon_context {
        /* members set by caller */
        struct console                *console;
        unsigned int                spinwait_max_us;
        enum nbcon_prio                prio;
        unsigned int                allow_unsafe_takeover        : 1;

        /* members set by emit */
        unsigned int                backlog                        : 1;

        /* members set by acquire */
        struct printk_buffers        *pbufs;
        u64                        seq;
};

/**
 * struct nbcon_write_context - Context handed to the nbcon write callbacks
 * @ctxt:                The core console context
 * @outbuf:                Pointer to the text buffer for output
 * @len:                Length to write
 * @unsafe_takeover:        If a hostile takeover in an unsafe state has occurred
 *
 * This is the type of the second argument of the write_atomic() callback
 * in struct console.
 */
struct nbcon_write_context {
        /* marked __private: not meant to be accessed directly by users of this struct */
        struct nbcon_context        __private ctxt;
        char                        *outbuf;
        unsigned int                len;
        bool                        unsafe_takeover;
};

/**
 * struct console - The console descriptor structure
 * @name:                The name of the console driver (fixed 16-byte buffer)
 * @write:                Write callback to output messages (Optional)
 * @read:                Read callback for console input (Optional)
 * @device:                The underlying TTY device driver (Optional)
 * @unblank:                Callback to unblank the console (Optional)
 * @setup:                Callback for initializing the console (Optional)
 * @exit:                Callback for teardown of the console (Optional)
 * @match:                Callback for matching a console (Optional)
 * @flags:                Console flags. See enum cons_flags
 * @index:                Console index, e.g. port number
 * @cflag:                TTY control mode flags
 * @ispeed:                TTY input speed
 * @ospeed:                TTY output speed
 * @seq:                Sequence number of the next ringbuffer record to print
 * @dropped:                Number of unreported dropped ringbuffer records
 * @data:                Driver private data
 * @node:                hlist node for the console list
 *
 * @write_atomic:        Write callback for atomic context
 * @nbcon_state:        State for nbcon consoles
 * @nbcon_seq:                Sequence number of the next record for nbcon to print
 * @pbufs:                Pointer to nbcon private buffer
 *
 * @flags of a registered console is read with console_srcu_read_flags()
 * and written with console_srcu_write_flags(). @node links the console
 * into console_list, which the for_each_console*() iterators walk.
 */
struct console {
        char                        name[16];
        void                        (*write)(struct console *co, const char *s, unsigned int count);
        int                        (*read)(struct console *co, char *s, unsigned int count);
        struct tty_driver        *(*device)(struct console *co, int *index);
        void                        (*unblank)(void);
        int                        (*setup)(struct console *co, char *options);
        int                        (*exit)(struct console *co);
        int                        (*match)(struct console *co, char *name, int idx, char *options);
        short                        flags;
        short                        index;
        int                        cflag;
        uint                        ispeed;
        uint                        ospeed;
        u64                        seq;
        unsigned long                dropped;
        void                        *data;
        struct hlist_node        node;

        /* nbcon console specific members */
        bool                        (*write_atomic)(struct console *con,
                                                struct nbcon_write_context *wctxt);
        /* the two __private members are not meant for direct access */
        atomic_t                __private nbcon_state;
        atomic_long_t                __private nbcon_seq;
        struct printk_buffers        *pbufs;
};

#ifdef CONFIG_LOCKDEP
/* Out-of-line assertion, available only when lockdep is configured. */
extern void lockdep_assert_console_list_lock_held(void);
#else
/* Without CONFIG_LOCKDEP the assertion is an empty stub. */
static inline void lockdep_assert_console_list_lock_held(void) { }
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* Out-of-line check, available only with CONFIG_DEBUG_LOCK_ALLOC. */
extern bool console_srcu_read_lock_is_held(void);
#else
/* Without CONFIG_DEBUG_LOCK_ALLOC the lock is always reported as held. */
static inline bool console_srcu_read_lock_is_held(void)
{
        return true;
}
#endif

/*
 * Console SRCU read-side lock/unlock pair. The @cookie taken by unlock is
 * presumably the value returned by console_srcu_read_lock() -- confirm
 * against the implementation.
 */
extern int console_srcu_read_lock(void);
extern void console_srcu_read_unlock(int cookie);

/* Console list lock; annotated as acquiring/releasing console_mutex. */
extern void console_list_lock(void) __acquires(console_mutex);
extern void console_list_unlock(void) __releases(console_mutex);

/* Head of the list walked by for_each_console() and for_each_console_srcu(). */
extern struct hlist_head console_list;

/**
 * console_srcu_read_flags - Read a console's flags without taking a lock
 * @con:        console whose flags are to be read
 *
 * Wraps the access in READ_ONCE() and data_race(). The READ_ONCE() pairs
 * with the WRITE_ONCE() in console_srcu_write_flags(), which is how the
 * @flags of registered consoles are modified.
 *
 * Intended solely for lockless walks of the console list under srcu.
 *
 * Context: Any context.
 * Return: The current value of @con->flags.
 */
static inline short console_srcu_read_flags(const struct console *con)
{
        short flags;

        WARN_ON_ONCE(!console_srcu_read_lock_is_held());

        /*
         * The lockless read still yields a consistent value: at most
         * one CPU modifies console->flags, and that CPU uses only
         * read-modify-write operations to do so.
         */
        flags = data_race(READ_ONCE(con->flags));

        return flags;
}

/**
 * console_srcu_write_flags - Store new flags for a registered console
 * @con:        console whose flags are to be updated
 * @flags:        value to store in @con->flags
 *
 * For registered consoles only. The caller must hold the
 * console_list_lock, which is asserted via lockdep.
 *
 * Context: Any context.
 */
static inline void console_srcu_write_flags(struct console *con, short flags)
{
        lockdep_assert_console_list_lock_held();

        /* Pairs with the READ_ONCE() in console_srcu_read_flags(). */
        WRITE_ONCE(con->flags, flags);
}

/*
 * Same check as console_is_registered(), for callers that already hold
 * the console_list_lock.
 */
static inline bool console_is_registered_locked(const struct console *con)
{
        const struct hlist_node *link = &con->node;

        lockdep_assert_console_list_lock_held();

        /* The console counts as registered while its node is hashed. */
        return !hlist_unhashed(link);
}

/**
 * console_is_registered - Check if the console is registered
 * @con:        struct console pointer of console to check
 *
 * Takes the console list lock around the check, so the caller must not
 * already hold it; use console_is_registered_locked() in that case.
 *
 * A false result for a console that used to be registered means that
 * its unregistration has fully completed, including the exit() callback
 * that runs after removal from the console list.
 *
 * Context: Process context. May sleep while acquiring console list lock.
 * Return: true if the console is in the console list, otherwise false.
 */
static inline bool console_is_registered(const struct console *con)
{
        bool registered;

        console_list_lock();
        registered = console_is_registered_locked(con);
        console_list_unlock();

        return registered;
}

/**
 * for_each_console_srcu() - Iterator over registered consoles
 * @con:        struct console pointer used as loop cursor
 *
 * Although SRCU guarantees the console list will be consistent, the
 * struct console fields may be updated by other CPUs while iterating.
 *
 * Requires console_srcu_read_lock to be held. Can be invoked from
 * any context.
 *
 * console_srcu_read_lock_is_held() is handed to
 * hlist_for_each_entry_srcu() as its condition argument. Without
 * CONFIG_DEBUG_LOCK_ALLOC that helper is a stub that always returns
 * true.
 */
#define for_each_console_srcu(con)                                        \
        hlist_for_each_entry_srcu(con, &console_list, node,                \
                                  console_srcu_read_lock_is_held())

/**
 * for_each_console() - Iterator over registered consoles
 * @con:        struct console pointer used as loop cursor
 *
 * The console list and the &console.flags are immutable while iterating.
 *
 * Requires console_list_lock to be held.
 *
 * The lock assertion is evaluated in the init clause of the for
 * statement, so the macro expands to exactly one statement. It therefore
 * stays correct when used as the unbraced body of an if/else or of
 * another loop. The iteration itself is the open-coded equivalent of
 * hlist_for_each_entry(con, &console_list, node).
 */
#define for_each_console(con)                                                \
        for (lockdep_assert_console_list_lock_held(),                        \
             con = hlist_entry_safe(console_list.first,                        \
                                    typeof(*(con)), node);                \
             con;                                                        \
             con = hlist_entry_safe((con)->node.next,                        \
                                    typeof(*(con)), node))

#ifdef CONFIG_PRINTK
extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt);
extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
#else
/* Without CONFIG_PRINTK these are stubs that always report false. */
static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
{
        return false;
}

static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
{
        return false;
}

static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
{
        return false;
}
#endif

/* NOTE(review): presumably set when a console was given on the command line -- confirm. */
extern int console_set_on_cmdline;
/* NOTE(review): presumably the registered early console, if any -- confirm. */
extern struct console *early_console;

/* Mode argument of console_flush_on_panic(). */
enum con_flush_mode {
        CONSOLE_FLUSH_PENDING,
        CONSOLE_REPLAY_ALL,
};

/* Console registration. */
extern int add_preferred_console(const char *name, const short idx, char *options);
extern void console_force_preferred_locked(struct console *con);
extern void register_console(struct console *);
extern int unregister_console(struct console *);
/* Console lock; is_console_locked() is what WARN_CONSOLE_UNLOCKED() tests. */
extern void console_lock(void);
extern int console_trylock(void);
extern void console_unlock(void);
extern void console_conditional_schedule(void);
extern void console_unblank(void);
/* @mode is one of enum con_flush_mode. */
extern void console_flush_on_panic(enum con_flush_mode mode);
extern struct tty_driver *console_device(int *);
extern void console_stop(struct console *);
extern void console_start(struct console *);
extern int is_console_locked(void);
extern int braille_register_console(struct console *, int index,
                char *console_options, char *braille_options);
extern int braille_unregister_console(struct console *);
/* console_sysfs_notify() is an empty stub when CONFIG_TTY is not set. */
#ifdef CONFIG_TTY
extern void console_sysfs_notify(void);
#else
static inline void console_sysfs_notify(void)
{ }
#endif
extern bool console_suspend_enabled;

/* Suspend and resume console messages over PM events */
extern void suspend_console(void);
extern void resume_console(void);

int mda_console_init(void);

void vcs_make_sysfs(int index);
void vcs_remove_sysfs(int index);

/*
 * Some debug stub to catch some of the obvious races in the VT code.
 *
 * Warns when all three hold: ignore_console_lock_warning reads as zero,
 * is_console_locked() returns zero, and oops_in_progress is zero.
 */
#define WARN_CONSOLE_UNLOCKED()                                                \
        WARN_ON(!atomic_read(&ignore_console_lock_warning) &&                \
                !is_console_locked() && !oops_in_progress)
/*
 * Increment ignore_console_lock_warning if you need to quiet
 * WARN_CONSOLE_UNLOCKED() for debugging purposes.
 */
extern atomic_t ignore_console_lock_warning;

extern void console_init(void);

/* For deferred console takeover */
void dummycon_register_output_notifier(struct notifier_block *nb);
void dummycon_unregister_output_notifier(struct notifier_block *nb);

#endif /* _LINUX_CONSOLE_H */
















































































































    1 






















    1 


    1 




















































    1 


    1 










    1 

    1 


















    1 


    1 














    1 


    1 







































    1 

























    1 




    1 





    1 















    1 
    1 
    1 

















    1 





    1 








    1 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/fat/cache.c
 *
 *  Written 1992,1993 by Werner Almesberger
 *
 *  Mar 1999. AV. Changed cache, so that it uses the starting cluster instead
 *        of inode number.
 *  May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers.
 */

#include <linux/slab.h>
#include "fat.h"

/*
 * Value returned by fat_max_cache(), which fat_cache_add() uses as the
 * cap on MSDOS_I(inode)->nr_caches. This must be > 0.
 */
#define FAT_MAX_CACHE        8

/*
 * One cached run of contiguous clusters. Entries are linked into the
 * per-inode list MSDOS_I(inode)->cache_lru. A run covers file clusters
 * fcluster .. fcluster + nr_contig (see fat_cache_lookup()).
 */
struct fat_cache {
        struct list_head cache_list;        /* link in the inode's cache_lru list */
        int nr_contig;        /* number of contiguous clusters */
        int fcluster;        /* cluster number in the file. */
        int dcluster;        /* cluster number on disk. */
};

/*
 * Caller-side description of a run, filled by fat_cache_lookup() or
 * cache_init() and handed to fat_cache_add().
 */
struct fat_cache_id {
        /* FAT_CACHE_VALID, or the inode's cache_valid_id at lookup time */
        unsigned int id;
        int nr_contig;        /* starts at 0; cache_contiguous() increments it */
        int fcluster;        /* -1 marks a dummy entry that fat_cache_add() ignores */
        int dcluster;
};

/* Upper bound on cache entries for @inode; currently the same for every inode. */
static inline int fat_max_cache(struct inode *inode)
{
        const int max_entries = FAT_MAX_CACHE;

        return max_entries;
}

/* Slab cache for struct fat_cache; created in fat_cache_init(), destroyed in fat_cache_destroy(). */
static struct kmem_cache *fat_cache_cachep;

/* Slab constructor passed to kmem_cache_create(): set up the list linkage. */
static void init_once(void *foo)
{
        struct fat_cache *entry = foo;

        INIT_LIST_HEAD(&entry->cache_list);
}

/*
 * Create the slab cache used for struct fat_cache objects.
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
int __init fat_cache_init(void)
{
        fat_cache_cachep = kmem_cache_create("fat_cache",
                                             sizeof(struct fat_cache),
                                             0, SLAB_RECLAIM_ACCOUNT,
                                             init_once);

        return fat_cache_cachep ? 0 : -ENOMEM;
}

/* Destroy the slab cache created by fat_cache_init(). */
void fat_cache_destroy(void)
{
        struct kmem_cache *slab = fat_cache_cachep;

        kmem_cache_destroy(slab);
}

/* Allocate one cache entry with GFP_NOFS; the result is whatever kmem_cache_alloc() returns. */
static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
{
        struct fat_cache *entry;

        entry = kmem_cache_alloc(fat_cache_cachep, GFP_NOFS);
        return entry;
}

/*
 * Give @cache back to the slab cache. The entry must already be off any
 * list; freeing a still-linked entry trips the BUG_ON().
 */
static inline void fat_cache_free(struct fat_cache *cache)
{
        struct list_head *link = &cache->cache_list;

        BUG_ON(!list_empty(link));
        kmem_cache_free(fat_cache_cachep, cache);
}

static inline void fat_cache_update_lru(struct inode *inode,
                                        struct fat_cache *cache)
{
        if (MSDOS_I(inode)->cache_lru.next != &cache->cache_list)
                list_move(&cache->cache_list, &MSDOS_I(inode)->cache_lru);
}

static int fat_cache_lookup(struct inode *inode, int fclus,
                            struct fat_cache_id *cid,
                            int *cached_fclus, int *cached_dclus)
{
        static struct fat_cache nohit = { .fcluster = 0, };

        struct fat_cache *hit = &nohit, *p;
        int offset = -1;

        spin_lock(&MSDOS_I(inode)->cache_lru_lock);
        list_for_each_entry(p, &MSDOS_I(inode)->cache_lru, cache_list) {
                /* Find the cache of "fclus" or nearest cache. */
                if (p->fcluster <= fclus && hit->fcluster < p->fcluster) {
                        hit = p;
                        if ((hit->fcluster + hit->nr_contig) < fclus) {
                                offset = hit->nr_contig;
                        } else {
                                offset = fclus - hit->fcluster;
                                break;
                        }
                }
        }
        if (hit != &nohit) {
                fat_cache_update_lru(inode, hit);

                cid->id = MSDOS_I(inode)->cache_valid_id;
                cid->nr_contig = hit->nr_contig;
                cid->fcluster = hit->fcluster;
                cid->dcluster = hit->dcluster;
                *cached_fclus = cid->fcluster + offset;
                *cached_dclus = cid->dcluster + offset;
        }
        spin_unlock(&MSDOS_I(inode)->cache_lru_lock);

        return offset;
}

/*
 * Look for an existing entry that starts at the same file cluster as
 * @new. If one exists, extend its nr_contig when @new covers more and
 * return it; otherwise return NULL. The callers in this file invoke it
 * with cache_lru_lock held.
 */
static struct fat_cache *fat_cache_merge(struct inode *inode,
                                         struct fat_cache_id *new)
{
        struct list_head *lru = &MSDOS_I(inode)->cache_lru;
        struct fat_cache *ent;

        list_for_each_entry(ent, lru, cache_list) {
                if (ent->fcluster != new->fcluster)
                        continue;

                /* Same position in the chain must map to the same disk cluster. */
                BUG_ON(ent->dcluster != new->dcluster);
                if (ent->nr_contig < new->nr_contig)
                        ent->nr_contig = new->nr_contig;
                return ent;
        }

        return NULL;
}

/*
 * Record the run described by @new in the inode's cluster cache.
 *
 * Dummy entries (fcluster == -1) and entries whose id no longer matches
 * the inode's cache_valid_id are ignored. If an entry starting at the
 * same file cluster exists it is merged; otherwise a new entry is
 * allocated while fewer than fat_max_cache() are accounted for, else the
 * entry at the tail of the LRU list is reused. The affected entry ends
 * up at the front of the LRU.
 */
static void fat_cache_add(struct inode *inode, struct fat_cache_id *new)
{
        struct fat_cache *cache, *tmp;

        if (new->fcluster == -1) /* dummy cache */
                return;

        spin_lock(&MSDOS_I(inode)->cache_lru_lock);
        if (new->id != FAT_CACHE_VALID &&
            new->id != MSDOS_I(inode)->cache_valid_id)
                goto out;        /* this cache was invalidated */

        cache = fat_cache_merge(inode, new);
        if (cache == NULL) {
                if (MSDOS_I(inode)->nr_caches < fat_max_cache(inode)) {
                        /* Reserve the slot before dropping the lock to allocate. */
                        MSDOS_I(inode)->nr_caches++;
                        spin_unlock(&MSDOS_I(inode)->cache_lru_lock);

                        tmp = fat_cache_alloc(inode);
                        if (!tmp) {
                                spin_lock(&MSDOS_I(inode)->cache_lru_lock);
                                MSDOS_I(inode)->nr_caches--;
                                spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
                                return;
                        }

                        spin_lock(&MSDOS_I(inode)->cache_lru_lock);
                        /* The list may have changed while the lock was dropped. */
                        cache = fat_cache_merge(inode, new);
                        if (cache != NULL) {
                                MSDOS_I(inode)->nr_caches--;
                                fat_cache_free(tmp);
                                goto out_update_lru;
                        }
                        cache = tmp;
                } else {
                        struct list_head *p = MSDOS_I(inode)->cache_lru.prev;

                        /*
                         * nr_caches also counts slots reserved by tasks
                         * that dropped the lock to allocate, so the list
                         * can still be empty here. In that case ->prev is
                         * the list head itself, not a fat_cache; skip
                         * caching rather than overwrite the inode info.
                         */
                        if (p == &MSDOS_I(inode)->cache_lru)
                                goto out;
                        cache = list_entry(p, struct fat_cache, cache_list);
                }
                cache->fcluster = new->fcluster;
                cache->dcluster = new->dcluster;
                cache->nr_contig = new->nr_contig;
        }
out_update_lru:
        fat_cache_update_lru(inode, cache);
out:
        spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
}

/*
 * Drop every cache entry of @inode and start a new cache generation.
 *
 * Cache invalidation occurs rarely, thus the LRU chain is not updated. It
 * fixes itself after a while.
 */
static void __fat_cache_inval_inode(struct inode *inode)
{
        struct msdos_inode_info *info = MSDOS_I(inode);

        while (!list_empty(&info->cache_lru)) {
                struct fat_cache *victim;

                victim = list_entry(info->cache_lru.next,
                                    struct fat_cache, cache_list);
                list_del_init(&victim->cache_list);
                info->nr_caches--;
                fat_cache_free(victim);
        }

        /*
         * Bump the generation so copies taken before this point are
         * discarded by fat_cache_add(); skip the reserved value
         * FAT_CACHE_VALID.
         */
        if (++info->cache_valid_id == FAT_CACHE_VALID)
                info->cache_valid_id++;
}

void fat_cache_inval_inode(struct inode *inode)
{
        spin_lock(&MSDOS_I(inode)->cache_lru_lock);
        __fat_cache_inval_inode(inode);
        spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
}

/*
 * Extend the run in @cid by one cluster and report whether @dclus is the
 * disk cluster that the extended run predicts. Note that nr_contig is
 * incremented even when the answer is no.
 */
static inline int cache_contiguous(struct fat_cache_id *cid, int dclus)
{
        int expected;

        cid->nr_contig += 1;
        expected = cid->dcluster + cid->nr_contig;

        return expected == dclus;
}

static inline void cache_init(struct fat_cache_id *cid, int fclus, int dclus)
{
        cid->id = FAT_CACHE_VALID;
        cid->fcluster = fclus;
        cid->dcluster = dclus;
        cid->nr_contig = 0;
}

/*
 * Follow the cluster chain of @inode up to file cluster @cluster.
 *
 * *@fclus and *@dclus receive the last file/disk cluster pair reached.
 * The walk starts from the nearest cached position, and the run that was
 * traversed is added to the cache on success or when the chain ends.
 *
 * Returns 0 when @cluster was reached, FAT_ENT_EOF when the chain ended
 * first, or a negative errno (-EIO for a corrupt chain, or the error
 * from fat_ent_read()).
 */
int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
{
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        const int limit = sb->s_maxbytes >> sbi->cluster_bits;
        struct fat_entry fatent;
        struct fat_cache_id cid;
        int nr;

        BUG_ON(MSDOS_I(inode)->i_start == 0);

        *fclus = 0;
        *dclus = MSDOS_I(inode)->i_start;
        if (!fat_valid_entry(sbi, *dclus)) {
                fat_fs_error_ratelimit(sb,
                        "%s: invalid start cluster (i_pos %lld, start %08x)",
                        __func__, MSDOS_I(inode)->i_pos, *dclus);
                return -EIO;
        }
        if (cluster == 0)
                return 0;

        /*
         * On a cache miss use a dummy id that is never contiguous; it is
         * replaced by cache_init() on the first step of the walk.
         */
        if (fat_cache_lookup(inode, cluster, &cid, fclus, dclus) < 0)
                cache_init(&cid, -1, -1);

        fatent_init(&fatent);
        for (;;) {
                if (*fclus >= cluster) {
                        /* Target reached. */
                        nr = 0;
                        fat_cache_add(inode, &cid);
                        break;
                }

                /* prevent the infinite loop of cluster chain */
                if (*fclus > limit) {
                        fat_fs_error_ratelimit(sb,
                                "%s: detected the cluster chain loop (i_pos %lld)",
                                __func__, MSDOS_I(inode)->i_pos);
                        nr = -EIO;
                        break;
                }

                nr = fat_ent_read(inode, &fatent, *dclus);
                if (nr < 0)
                        break;

                if (nr == FAT_ENT_FREE) {
                        fat_fs_error_ratelimit(sb,
                                "%s: invalid cluster chain (i_pos %lld)",
                                __func__, MSDOS_I(inode)->i_pos);
                        nr = -EIO;
                        break;
                }

                if (nr == FAT_ENT_EOF) {
                        /* Chain ended early: cache what we have, return EOF. */
                        fat_cache_add(inode, &cid);
                        break;
                }

                (*fclus)++;
                *dclus = nr;
                if (!cache_contiguous(&cid, *dclus))
                        cache_init(&cid, *fclus, *dclus);
        }

        fatent_brelse(&fatent);
        return nr;
}

/*
 * Map file cluster @cluster of @inode to its disk cluster.
 *
 * Returns the disk cluster, 0 if the inode has no start cluster, or a
 * negative errno. Asking for a cluster beyond the end of the chain is
 * reported as a filesystem error and yields -EIO.
 */
static int fat_bmap_cluster(struct inode *inode, int cluster)
{
        int fclus, dclus;
        int err;

        if (!MSDOS_I(inode)->i_start)
                return 0;

        err = fat_get_cluster(inode, cluster, &fclus, &dclus);
        if (err < 0)
                return err;
        if (err != FAT_ENT_EOF)
                return dclus;

        fat_fs_error(inode->i_sb, "%s: request beyond EOF (i_pos %lld)",
                     __func__, MSDOS_I(inode)->i_pos);
        return -EIO;
}

/*
 * Translate file block @sector of @inode into a disk block.
 *
 * When the containing cluster is mapped, *@bmap is set to the disk block
 * and *@mapped_blocks to the number of blocks left in that cluster,
 * capped at @last_block - @sector. When it is not mapped both outputs
 * are left untouched.
 *
 * Returns 0 on success or the negative errno from fat_bmap_cluster().
 */
int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
                           sector_t last_block,
                           unsigned long *mapped_blocks, sector_t *bmap)
{
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        int file_clus = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
        int in_clus = sector & (sbi->sec_per_clus - 1);
        int disk_clus;

        disk_clus = fat_bmap_cluster(inode, file_clus);
        if (disk_clus < 0)
                return disk_clus;
        if (!disk_clus)
                return 0;

        *bmap = fat_clus_to_blknr(sbi, disk_clus) + in_clus;
        *mapped_blocks = sbi->sec_per_clus - in_clus;
        if (*mapped_blocks > last_block - sector)
                *mapped_blocks = last_block - sector;

        return 0;
}

/*
 * Check whether block @sector lies at or beyond the end of @inode.
 *
 * *@last_block is set to the block count derived from i_size. If
 * @sector is beyond that and @create is set, it is recomputed from
 * ->mmu_private and the check is repeated.
 *
 * Returns 1 if @sector is past the end, 0 otherwise.
 */
static int is_exceed_eof(struct inode *inode, sector_t sector,
                         sector_t *last_block, int create)
{
        struct super_block *sb = inode->i_sb;
        const unsigned long blocksize = sb->s_blocksize;
        const unsigned char blocksize_bits = sb->s_blocksize_bits;

        *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
        if (sector < *last_block)
                return 0;

        if (!create)
                return 1;

        /*
         * ->mmu_private may only be accessed on the allocation path
         * (the caller must hold ->i_mutex).
         */
        *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
                >> blocksize_bits;

        return sector >= *last_block;
}

/*
 * Map file block @sector of @inode to a disk block.
 *
 * *@phys and *@mapped_blocks are zeroed first and stay zero when the
 * block is not mapped or lies beyond the end of the file. The root
 * directory of a non-FAT32 volume is mapped directly from dir_start,
 * one block at a time. With @from_bmap the end of file is taken from
 * i_blocks, otherwise from is_exceed_eof().
 *
 * Returns 0, or a negative errno from fat_get_mapped_cluster().
 */
int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
             unsigned long *mapped_blocks, int create, bool from_bmap)
{
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        sector_t last_block;

        *phys = 0;
        *mapped_blocks = 0;

        if (!is_fat32(sbi) && (inode->i_ino == MSDOS_ROOT_INO)) {
                if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) {
                        *phys = sector + sbi->dir_start;
                        *mapped_blocks = 1;
                }
                return 0;
        }

        if (from_bmap) {
                last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
                if (sector >= last_block)
                        return 0;
        } else if (is_exceed_eof(inode, sector, &last_block, create)) {
                return 0;
        }

        return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
                                      phys);
}










































































































































































































































































































































    5 














































































































































































































































































































































































































































































































































    2 







    2 









    2 

















































































































































































































































































































































































































































































































































































































































































































































































    2 
















    2 

















    2 









    2 
    2 



































































    2 



































    2 



    2 























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"

/*
 * HOW DOES SPACE RESERVATION WORK
 *
 * If you want to know about delalloc specifically, there is a separate comment
 * for that with the delalloc code.  This comment is about how the whole system
 * works generally.
 *
 * BASIC CONCEPTS
 *
 *   1) space_info.  This is the ultimate arbiter of how much space we can use.
 *   There's a description of the bytes_ fields with the struct declaration,
 *   refer to that for specifics on each field.  Suffice it to say that for
 *   reservations we care about total_bytes - SUM(space_info->bytes_) when
 *   determining if there is space to make an allocation.  There is a space_info
 *   for METADATA, SYSTEM, and DATA areas.
 *
 *   2) block_rsv's.  These are basically buckets for every different type of
 *   metadata reservation we have.  You can see the comment in the block_rsv
 *   code on the rules for each type, but generally block_rsv->reserved is how
 *   much space is accounted for in space_info->bytes_may_use.
 *
 *   3) btrfs_calc*_size.  These are the worst case calculations we used based
 *   on the number of items we will want to modify.  We have one for changing
 *   items, and one for inserting new items.  Generally we use these helpers to
 *   determine the size of the block reserves, and then use the actual bytes
 *   values to adjust the space_info counters.
 *
 * MAKING RESERVATIONS, THE NORMAL CASE
 *
 *   We call into either btrfs_reserve_data_bytes() or
 *   btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
 *   num_bytes we want to reserve.
 *
 *   ->reserve
 *     space_info->bytes_may_use += num_bytes
 *
 *   ->extent allocation
 *     Call btrfs_add_reserved_bytes() which does
 *     space_info->bytes_may_use -= num_bytes
 *     space_info->bytes_reserved += extent_bytes
 *
 *   ->insert reference
 *     Call btrfs_update_block_group() which does
 *     space_info->bytes_reserved -= extent_bytes
 *     space_info->bytes_used += extent_bytes
 *
 * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
 *
 *   Assume we are unable to simply make the reservation because we do not have
 *   enough space
 *
 *   -> __reserve_bytes
 *     create a reserve_ticket with ->bytes set to our reservation, add it to
 *     the tail of space_info->tickets, kick async flush thread
 *
 *   ->handle_reserve_ticket
 *     wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
 *     on the ticket.
 *
 *   -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
 *     Flushes various things attempting to free up space.
 *
 *   -> btrfs_try_granting_tickets()
 *     This is called by anything that either subtracts space from
 *     space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
 *     space_info->total_bytes.  This loops through the ->priority_tickets and
 *     then the ->tickets list checking to see if the reservation can be
 *     completed.  If it can the space is added to space_info->bytes_may_use and
 *     the ticket is woken up.
 *
 *   -> ticket wakeup
 *     Check if ->bytes == 0, if it does we got our reservation and we can carry
 *     on, if not return the appropriate error (ENOSPC, but can be EINTR if we
 *     were interrupted.)
 *
 * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
 *
 *   Same as the above, except we add ourselves to the
 *   space_info->priority_tickets, and we do not use ticket->wait, we simply
 *   call flush_space() ourselves for the states that are safe for us to call
 *   without deadlocking and hope for the best.
 *
 * THE FLUSHING STATES
 *
 *   Generally speaking we will have two cases for each state, a "nice" state
 *   and an "ALL THE THINGS" state.  In btrfs we delay a lot of work in order to
 *   reduce the locking overhead on the various trees, and even to keep from
 *   doing any work at all in the case of delayed refs.  Each of these delayed
 *   things however holds reservations, and so letting them run allows us to
 *   reclaim space so we can make new reservations.
 *
 *   FLUSH_DELAYED_ITEMS
 *     Every inode has a delayed item to update the inode.  Take a simple write
 *     for example, we would update the inode item at write time to update the
 *     mtime, and then again at finish_ordered_io() time in order to update the
 *     isize or bytes.  We keep these delayed items to coalesce these operations
 *     into a single operation done on demand.  These are an easy way to reclaim
 *     metadata space.
 *
 *   FLUSH_DELALLOC
 *     Look at the delalloc comment to get an idea of how much space is reserved
 *     for delayed allocation.  We can reclaim some of this space simply by
 *     running delalloc, but usually we need to wait for ordered extents to
 *     reclaim the bulk of this space.
 *
 *   FLUSH_DELAYED_REFS
 *     We have a block reserve for the outstanding delayed refs space, and every
 *     delayed ref operation holds a reservation.  Running these is a quick way
 *     to reclaim space, but we want to hold this until the end because COW can
 *     churn a lot and we can avoid making some extent tree modifications if we
 *     are able to delay for as long as possible.
 *
 *   ALLOC_CHUNK
 *     We will skip this the first time through space reservation, because of
 *     overcommit and we don't want to have a lot of useless metadata space when
 *     our worst case reservations will likely never come true.
 *
 *   RUN_DELAYED_IPUTS
 *     If we're freeing inodes we're likely freeing checksums, file extent
 *     items, and extent tree items.  Loads of space could be freed up by these
 *     operations, however they won't be usable until the transaction commits.
 *
 *   COMMIT_TRANS
 *     This will commit the transaction.  Historically we had a lot of logic
 *     surrounding whether or not we'd commit the transaction, but this was born
 *     out of a pre-tickets era where we could end up committing the transaction
 *     thousands of times in a row without making progress.  Now thanks to our
 *     ticketing system we know if we're not making progress and can error
 *     everybody out after a few commits rather than burning the disk hoping for
 *     a different answer.
 *
 * OVERCOMMIT
 *
 *   Because we hold so many reservations for metadata we will allow you to
 *   reserve more space than is currently free in the currently allocated
 *   metadata space.  This only happens with metadata, data does not allow
 *   overcommitting.
 *
 *   You can see the current logic for when we allow overcommit in
 *   btrfs_can_overcommit(), but it only applies to unallocated space.  If there
 *   is no unallocated space to be had, all reservations are kept within the
 *   free space in the allocated metadata chunks.
 *
 *   Because of overcommitting, you generally want to use the
 *   btrfs_can_overcommit() logic for metadata allocations, as it does the right
 *   thing with or without extra unallocated space.
 */

/*
 * Add up the byte counters of @s_info that count as consumed space.
 *
 * @s_info:           space info to inspect, must not be NULL
 * @may_use_included: when true, bytes_may_use is part of the total as well
 *
 * Returns bytes_used + bytes_reserved + bytes_pinned + bytes_readonly +
 * bytes_zone_unusable, plus bytes_may_use when requested.
 */
u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
                          bool may_use_included)
{
        u64 total;

        ASSERT(s_info);

        total = s_info->bytes_used;
        total += s_info->bytes_reserved;
        total += s_info->bytes_pinned;
        total += s_info->bytes_readonly;
        total += s_info->bytes_zone_unusable;
        if (may_use_included)
                total += s_info->bytes_may_use;

        return total;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        list_for_each_entry(found, head, list)
                found->full = 0;
}

/*
 * Default background reclaim threshold, in percent, for zoned filesystems.
 * Block groups with more than this percentage of unusable space will be
 * scheduled for background reclaim.  create_space_info() stores it in
 * space_info->bg_reclaim_threshold when the filesystem is zoned.
 */
#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH                        (75)

/*
 * Pick the default chunk size for a space_info of the given type.
 *
 * Zoned filesystems always use the zone size.  Otherwise DATA (including
 * mixed DATA+METADATA) gets BTRFS_MAX_DATA_CHUNK_SIZE, SYSTEM gets 32M, and
 * METADATA gets 1G when fs_devices->total_rw_bytes exceeds 50G and 256M when
 * it does not.
 */
static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
{
        u64 size;

        if (btrfs_is_zoned(fs_info))
                return fs_info->zone_size;

        ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);

        if (flags & BTRFS_BLOCK_GROUP_DATA) {
                size = BTRFS_MAX_DATA_CHUNK_SIZE;
        } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) {
                size = SZ_32M;
        } else if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G) {
                /* BTRFS_BLOCK_GROUP_METADATA on a large filesystem. */
                size = SZ_1G;
        } else {
                /* BTRFS_BLOCK_GROUP_METADATA on a small filesystem. */
                size = SZ_256M;
        }

        return size;
}

/*
 * Set the default chunk size of @space_info to @chunk_size.
 *
 * The store is done with WRITE_ONCE().
 * NOTE(review): presumably because the field is also read without holding
 * space_info->lock - confirm against the readers.
 */
void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
                                        u64 chunk_size)
{
        WRITE_ONCE(space_info->chunk_size, chunk_size);
}

/*
 * Allocate and initialize a space_info for the block group type(s) in @flags,
 * register it with sysfs and link it into @info->space_info.  A space_info
 * that carries the DATA bit is also remembered in @info->data_sinfo.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error reported
 * by btrfs_sysfs_add_space_info_type().
 */
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
        struct btrfs_space_info *sinfo;
        int raid;
        int err;

        sinfo = kzalloc(sizeof(*sinfo), GFP_NOFS);
        if (!sinfo)
                return -ENOMEM;

        /* Locks and list heads first, then the scalar defaults. */
        init_rwsem(&sinfo->groups_sem);
        spin_lock_init(&sinfo->lock);
        for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++)
                INIT_LIST_HEAD(&sinfo->block_groups[raid]);
        INIT_LIST_HEAD(&sinfo->ro_bgs);
        INIT_LIST_HEAD(&sinfo->tickets);
        INIT_LIST_HEAD(&sinfo->priority_tickets);

        sinfo->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        sinfo->force_alloc = CHUNK_ALLOC_NO_FORCE;
        sinfo->clamp = 1;
        btrfs_update_space_info_chunk_size(sinfo, calc_chunk_size(info, flags));

        if (btrfs_is_zoned(info))
                sinfo->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;

        /*
         * NOTE(review): sinfo is not freed here on failure; presumably the
         * sysfs helper drops the kobject reference (and with it the
         * allocation) itself - confirm before adding a kfree().
         */
        err = btrfs_sysfs_add_space_info_type(info, sinfo);
        if (err)
                return err;

        list_add(&sinfo->list, &info->space_info);
        if (flags & BTRFS_BLOCK_GROUP_DATA)
                info->data_sinfo = sinfo;

        return 0;
}

/*
 * Create the initial set of space_infos for @fs_info.
 *
 * SYSTEM is always created first.  With the MIXED_GROUPS incompat feature a
 * single METADATA|DATA space_info follows, otherwise separate METADATA and
 * DATA space_infos are created, in that order.
 *
 * Returns 0 on success, -EINVAL if the super block copy has no root set, or
 * the first error returned by create_space_info().
 */
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
        struct btrfs_super_block *sb = fs_info->super_copy;
        u64 incompat;
        int err;

        if (!btrfs_super_root(sb))
                return -EINVAL;

        incompat = btrfs_super_incompat_flags(sb);

        err = create_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
        if (err)
                return err;

        if (incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
                return create_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA |
                                                  BTRFS_BLOCK_GROUP_DATA);

        err = create_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
        if (err)
                return err;

        return create_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
}

/*
 * Account @block_group in the space_info matching its flags and link it onto
 * that space_info's per-RAID-index block group list.
 *
 * The byte counters are updated under space_info->lock, after which pending
 * tickets get a chance to be granted.  The list insertion is done separately
 * under groups_sem held for writing.
 */
void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
                                struct btrfs_block_group *block_group)
{
        const int factor = btrfs_bg_type_to_factor(block_group->flags);
        struct btrfs_space_info *sinfo;
        int raid_index;

        sinfo = btrfs_find_space_info(info, block_group->flags);
        ASSERT(sinfo);

        spin_lock(&sinfo->lock);
        /* Logical counters. */
        sinfo->total_bytes += block_group->length;
        sinfo->bytes_used += block_group->used;
        sinfo->bytes_readonly += block_group->bytes_super;
        sinfo->bytes_zone_unusable += block_group->zone_unusable;
        /* Counters scaled by the profile factor. */
        sinfo->disk_total += block_group->length * factor;
        sinfo->disk_used += block_group->used * factor;
        /* A non-empty block group means there is room again. */
        if (block_group->length > 0)
                sinfo->full = 0;
        btrfs_try_granting_tickets(info, sinfo);
        spin_unlock(&sinfo->lock);

        block_group->space_info = sinfo;

        raid_index = btrfs_bg_flags_to_raid_index(block_group->flags);
        down_write(&sinfo->groups_sem);
        list_add_tail(&block_group->list, &sinfo->block_groups[raid_index]);
        up_write(&sinfo->groups_sem);
}

/*
 * Look up the space_info responsible for the block group type in @flags.
 *
 * Only the BTRFS_BLOCK_GROUP_TYPE_MASK bits of @flags are considered, and any
 * overlap with a space_info's flags counts as a match (so a DATA lookup also
 * finds a mixed DATA+METADATA space_info).
 *
 * Returns the first matching space_info on @info->space_info, or NULL.
 */
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
                                               u64 flags)
{
        const u64 type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        struct btrfs_space_info *sinfo;

        list_for_each_entry(sinfo, &info->space_info, list)
                if (sinfo->flags & type)
                        return sinfo;

        return NULL;
}

/*
 * Estimate how many bytes of unallocated device space @space_info may
 * overcommit into.
 *
 * @fs_info:    filesystem the space_info belongs to
 * @space_info: SYSTEM or METADATA space_info asking to overcommit
 * @flush:      flush mode of the reservation; BTRFS_RESERVE_FLUSH_ALL gets a
 *              smaller share than every other mode
 *
 * Returns 0 when nothing is left after setting one data chunk aside.
 */
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
                          struct btrfs_space_info *space_info,
                          enum btrfs_reserve_flush_enum flush)
{
        struct btrfs_space_info *data_sinfo;
        u64 alloc_profile;
        u64 free_bytes;
        u64 data_reserve;

        alloc_profile = (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) ?
                        btrfs_system_alloc_profile(fs_info) :
                        btrfs_metadata_alloc_profile(fs_info);

        /*
         * With dup, raid1 or raid10 only half of the free space is actually
         * usable.  For raid56 the space info used doesn't include the parity
         * drive, so the math needs no adjustment there.
         */
        free_bytes = atomic64_read(&fs_info->free_chunk_space);
        free_bytes = div_u64(free_bytes, btrfs_bg_type_to_factor(alloc_profile));
        if (!free_bytes)
                return 0;

        /*
         * space_info->chunk_size is the "optimal" chunk size based on the fs
         * size, but the actual chunk allocation strips it down further to no
         * more than 10% of the disk or 1G, whichever is smaller.  Mirror that
         * here to get the size of one data chunk.
         */
        data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
        data_reserve = min(data_sinfo->chunk_size,
                           mult_perc(fs_info->fs_devices->total_rw_bytes, 10));
        data_reserve = min_t(u64, data_reserve, SZ_1G);

        /*
         * Data reservations are assumed to equal actual usage and consume
         * block groups right away, so space we overcommitted into could be
         * taken by a data allocation immediately, leaving us in a bind when
         * the file system is nearly full.
         *
         * Keep one data chunk out of the pool: when relatively empty this
         * barely limits overcommit, and when nearly full it preserves some
         * metadata wiggle room.
         */
        if (free_bytes <= data_reserve)
                return 0;
        free_bytes -= data_reserve;

        /*
         * If we can flush everything, don't overcommit too much: allow 1/8 of
         * the space.  Otherwise allow overcommitting up to 1/2 of it.
         */
        return free_bytes >> (flush == BTRFS_RESERVE_FLUSH_ALL ? 3 : 1);
}

/*
 * Decide whether reserving @bytes more from @space_info is acceptable when
 * overcommitting into unallocated space is taken into account.
 *
 * Returns 1 if the current usage (including bytes_may_use) plus @bytes stays
 * strictly below total_bytes plus the overcommit allowance, 0 otherwise.
 * Space infos carrying the DATA bit (data and mixed mode) never overcommit.
 */
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
                         struct btrfs_space_info *space_info, u64 bytes,
                         enum btrfs_reserve_flush_enum flush)
{
        u64 in_use;
        u64 allowance;

        if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
                return 0;

        in_use = btrfs_space_info_used(space_info, true);
        allowance = calc_available_free_space(fs_info, space_info, flush);

        return (in_use + bytes < space_info->total_bytes + allowance) ? 1 : 0;
}

/*
 * Unlink @ticket from whatever ticket list it is on and subtract its
 * outstanding bytes from @space_info->reclaim_size.  Does nothing when the
 * ticket is not on a list.
 */
static void remove_ticket(struct btrfs_space_info *space_info,
                          struct reserve_ticket *ticket)
{
        /* Already unlinked: nothing left to account for. */
        if (list_empty(&ticket->list))
                return;

        list_del_init(&ticket->list);
        ASSERT(space_info->reclaim_size >= ticket->bytes);
        space_info->reclaim_size -= ticket->bytes;
}

/*
 * Grant as many queued reservation tickets as now fit.
 *
 * This is for space that was already accounted in space_info->bytes_may_use,
 * i.e. basically when space is returned from block_rsv's.  The caller must
 * hold @space_info->lock.
 *
 * The priority_tickets list is served first (checked against overcommit with
 * BTRFS_RESERVE_NO_FLUSH), then the regular tickets list (checked with
 * BTRFS_RESERVE_FLUSH_ALL).  Each list is processed strictly in order and
 * processing of a list stops at the first ticket that cannot be satisfied.
 */
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
                                struct btrfs_space_info *space_info)
{
        int pass;

        lockdep_assert_held(&space_info->lock);

        for (pass = 0; pass < 2; pass++) {
                struct list_head *queue;
                enum btrfs_reserve_flush_enum flush;

                if (pass == 0) {
                        queue = &space_info->priority_tickets;
                        flush = BTRFS_RESERVE_NO_FLUSH;
                } else {
                        queue = &space_info->tickets;
                        flush = BTRFS_RESERVE_FLUSH_ALL;
                }

                while (!list_empty(queue)) {
                        u64 used = btrfs_space_info_used(space_info, true);
                        struct reserve_ticket *ticket;

                        ticket = list_first_entry(queue, struct reserve_ticket,
                                                  list);

                        /* Head of the queue doesn't fit, leave the rest be. */
                        if (used + ticket->bytes > space_info->total_bytes &&
                            !btrfs_can_overcommit(fs_info, space_info,
                                                  ticket->bytes, flush))
                                break;

                        /* Hand the space over and wake up the waiter. */
                        btrfs_space_info_update_bytes_may_use(fs_info,
                                                              space_info,
                                                              ticket->bytes);
                        remove_ticket(space_info, ticket);
                        ticket->bytes = 0;
                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
                }
        }
}

/*
 * Log the size and reserved counters of the block reserve embedded in
 * @fs_info under the member name @rsv_name.  The member name itself is used as
 * the message prefix, and the counters are read under the reserve's spinlock.
 */
#define DUMP_BLOCK_RSV(fs_info, rsv_name)                                \
do {                                                                        \
        struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;                \
        spin_lock(&__rsv->lock);                                        \
        btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",        \
                   __rsv->size, __rsv->reserved);                        \
        spin_unlock(&__rsv->lock);                                        \
} while (0)

/*
 * Map the type flags of @space_info to a printable name.  Only the exact flag
 * combinations SYSTEM, DATA, METADATA and DATA|METADATA are recognized;
 * anything else yields "UNKNOWN".
 */
static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
{
        if (space_info->flags == BTRFS_BLOCK_GROUP_SYSTEM)
                return "SYSTEM";
        if (space_info->flags == (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))
                return "DATA+METADATA";
        if (space_info->flags == BTRFS_BLOCK_GROUP_DATA)
                return "DATA";
        if (space_info->flags == BTRFS_BLOCK_GROUP_METADATA)
                return "METADATA";

        return "UNKNOWN";
}

/*
 * Log size/reserved of the block reserves embedded in @fs_info: global,
 * trans, chunk, delayed and delayed refs.  Each reserve is printed under its
 * own lock via DUMP_BLOCK_RSV().
 */
static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
{
        DUMP_BLOCK_RSV(fs_info, global_block_rsv);
        DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
        DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
        DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
        DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}

static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *info)
{
        const char *flag_str = space_info_flag_to_str(info);
        lockdep_assert_held(&info->lock);

        /* The free space could be negative in case of overcommit */
        btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
                   flag_str,
                   (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
                   info->full ? "" : "not ");
        btrfs_info(fs_info,
"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
                info->total_bytes, info->bytes_used, info->bytes_pinned,
                info->bytes_reserved, info->bytes_may_use,
                info->bytes_readonly, info->bytes_zone_unusable);
}

/*
 * Log the state of @info and of the block reserves in @fs_info, optionally
 * followed by every block group attached to @info.
 *
 * @fs_info:           filesystem the messages are logged for
 * @info:              space info to dump
 * @bytes:             passed through to btrfs_dump_free_space() for each
 *                     block group
 * @dump_block_groups: when zero, only the summary is printed
 *
 * Takes @info->lock for the summary and @info->groups_sem (read) while walking
 * the block group lists; each block group is sampled under its own lock.
 */
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
                           struct btrfs_space_info *info, u64 bytes,
                           int dump_block_groups)
{
        struct btrfs_block_group *bg;
        u64 avail_sum = 0;
        int raid;

        spin_lock(&info->lock);
        __btrfs_dump_space_info(fs_info, info);
        dump_global_block_rsv(fs_info);
        spin_unlock(&info->lock);

        if (!dump_block_groups)
                return;

        down_read(&info->groups_sem);
        for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) {
                list_for_each_entry(bg, &info->block_groups[raid], list) {
                        u64 bg_avail;

                        spin_lock(&bg->lock);
                        bg_avail = bg->length - bg->used - bg->pinned -
                                   bg->reserved - bg->delalloc_bytes -
                                   bg->bytes_super - bg->zone_unusable;
                        btrfs_info(fs_info,
"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s",
                                   bg->start, bg->length, bg->used, bg->pinned,
                                   bg->reserved, bg->delalloc_bytes,
                                   bg->bytes_super, bg->zone_unusable,
                                   bg_avail, bg->ro ? "[readonly]" : "");
                        spin_unlock(&bg->lock);

                        btrfs_dump_free_space(bg, bytes);
                        avail_sum += bg_avail;
                }
        }
        up_read(&info->groups_sem);

        btrfs_info(fs_info, "%llu bytes available across all block groups", avail_sum);
}

/*
 * Convert a byte count into a number of items, using the metadata size
 * btrfs_calc_insert_metadata_size() reports for inserting one item as the
 * per-item cost.  The result is rounded down but never less than 1.
 */
static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info,
                                        u64 to_reclaim)
{
        const u64 per_item = btrfs_calc_insert_metadata_size(fs_info, 1);
        const u64 count = div64_u64(to_reclaim, per_item);

        return count ? count : 1;
}

/*
 * NOTE(review): no user of this constant is visible in the part of the file
 * reviewed here - confirm it is still referenced before keeping it.
 */
#define EXTENT_SIZE_PER_ITEM        SZ_256K

/*
 * Shrink the metadata reservation held for delalloc by starting delalloc
 * writeback and, optionally, waiting on ordered extents.
 *
 * @fs_info:      filesystem whose delalloc/ordered byte counters are consulted
 * @space_info:   space info whose ticket lists decide whether to keep going
 * @to_reclaim:   number of bytes we would like to reclaim, U64_MAX to flush as
 *                much as possible
 * @wait_ordered: wait on ordered extents after starting writeback
 * @for_preempt:  do a single pass only and never force @wait_ordered on
 *
 * Makes at most three passes and stops early once no tickets are pending on
 * @space_info, or when the short sleep between passes ends early.
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info,
                            struct btrfs_space_info *space_info,
                            u64 to_reclaim, bool wait_ordered,
                            bool for_preempt)
{
        struct btrfs_trans_handle *trans;
        u64 delalloc_bytes;
        u64 ordered_bytes;
        u64 items;
        long time_left;
        int loops;

        delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
        ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
        /* Nothing dirty and nothing in flight, so nothing to reclaim. */
        if (delalloc_bytes == 0 && ordered_bytes == 0)
                return;

        /*
         * Work out how many items to hand to btrfs_wait_ordered_roots() below;
         * to_reclaim itself bounds the number of pages flushed per pass.
         */
        if (to_reclaim == U64_MAX) {
                items = U64_MAX;
        } else {
                /*
                 * to_reclaim is set to however much metadata we need to
                 * reclaim, but reclaiming that much data doesn't really track
                 * exactly.  What we really want to do is reclaim full inode's
                 * worth of reservations, however that's not available to us
                 * here.  We will take a fraction of the delalloc bytes for our
                 * flushing loops and hope for the best.  Delalloc will expand
                 * the amount we write to cover an entire dirty extent, which
                 * will reclaim the metadata reservation for that range.  If
                 * it's not enough subsequent flush stages will be more
                 * aggressive.
                 */
                to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
                items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
        }

        /*
         * A non-NULL handle makes the loop below sleep briefly instead of
         * waiting on ordered extents.
         * NOTE(review): presumably because waiting on ordered extents while
         * holding a transaction handle could deadlock - confirm.
         */
        trans = current->journal_info;

        /*
         * If we are doing more ordered than delalloc we need to just wait on
         * ordered extents, otherwise we'll waste time trying to flush delalloc
         * that likely won't give us the space back we need.
         */
        if (ordered_bytes > delalloc_bytes && !for_preempt)
                wait_ordered = true;

        loops = 0;
        while ((delalloc_bytes || ordered_bytes) && loops < 3) {
                /* Pages to write this pass, clamped to what fits in a long. */
                u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
                long nr_pages = min_t(u64, temp, LONG_MAX);
                int async_pages;

                btrfs_start_delalloc_roots(fs_info, nr_pages, true);

                /*
                 * We need to make sure any outstanding async pages are now
                 * processed before we continue.  This is because things like
                 * sync_inode() try to be smart and skip writing if the inode is
                 * marked clean.  We don't use filemap_fwrite for flushing
                 * because we want to control how many pages we write out at a
                 * time, thus this is the only safe way to make sure we've
                 * waited for outstanding compressed workers to have started
                 * their jobs and thus have ordered extents set up properly.
                 *
                 * This exists because we do not want to wait for each
                 * individual inode to finish its async work, we simply want to
                 * start the IO on everybody, and then come back here and wait
                 * for all of the async work to catch up.  Once we're done with
                 * that we know we'll have ordered extents for everything and we
                 * can decide if we wait for that or not.
                 *
                 * If we choose to replace this in the future, make absolutely
                 * sure that the proper waiting is being done in the async case,
                 * as there have been bugs in that area before.
                 */
                async_pages = atomic_read(&fs_info->async_delalloc_pages);
                if (!async_pages)
                        goto skip_async;

                /*
                 * We don't want to wait forever, if we wrote less pages in this
                 * loop than we have outstanding, only wait for that number of
                 * pages, otherwise we can wait for all async pages to finish
                 * before continuing.
                 */
                if (async_pages > nr_pages)
                        async_pages -= nr_pages;
                else
                        async_pages = 0;
                wait_event(fs_info->async_submit_wait,
                           atomic_read(&fs_info->async_delalloc_pages) <=
                           async_pages);
skip_async:
                loops++;
                if (wait_ordered && !trans) {
                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
                } else {
                        /*
                         * Sleep for one jiffy; a non-zero remainder means the
                         * sleep ended early, in which case we give up.
                         */
                        time_left = schedule_timeout_killable(1);
                        if (time_left)
                                break;
                }

                /*
                 * If we are for preemption we just want a one-shot of delalloc
                 * flushing so we can stop flushing if we decide we don't need
                 * to anymore.
                 */
                if (for_preempt)
                        break;

                /* Nobody is waiting for space anymore, so stop flushing. */
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets) &&
                    list_empty(&space_info->priority_tickets)) {
                        spin_unlock(&space_info->lock);
                        break;
                }
                spin_unlock(&space_info->lock);

                /* Refresh the counters that drive the loop condition. */
                delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
                ordered_bytes = percpu_counter_sum_positive(
                                                &fs_info->ordered_bytes);
        }
}

/*
 * Run one step of the space flushing state machine.
 *
 * @fs_info:     the filesystem
 * @space_info:  space info being flushed (its flags are used for chunk
 *               allocation and for the tracepoint)
 * @num_bytes:   amount the caller would like reclaimed; overridden with
 *               U64_MAX for FLUSH_DELALLOC_FULL
 * @state:       which flushing action to perform
 * @for_preempt: forwarded to shrink_delalloc() and to the tracepoint
 *
 * Flushing is advisory and may fail for a number of reasons.  Nothing is
 * returned; the result code only reaches the tracepoint, so callers have to
 * look at @space_info afterwards to learn whether anything was gained.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
                        struct btrfs_space_info *space_info, u64 num_bytes,
                        enum btrfs_flush_state state, bool for_preempt)
{
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_trans_handle *handle;
        u64 profile;
        int nr_items;
        int err = 0;

        switch (state) {
        case FLUSH_DELAYED_ITEMS_NR:
        case FLUSH_DELAYED_ITEMS:
                /* -1 is used for the unbounded FLUSH_DELAYED_ITEMS state. */
                nr_items = -1;
                if (state == FLUSH_DELAYED_ITEMS_NR)
                        nr_items = calc_reclaim_items_nr(fs_info, num_bytes) * 2;

                handle = btrfs_join_transaction_nostart(tree_root);
                if (IS_ERR(handle)) {
                        /* -ENOENT from the join is not reported as an error. */
                        err = PTR_ERR(handle);
                        if (err == -ENOENT)
                                err = 0;
                        break;
                }
                err = btrfs_run_delayed_items_nr(handle, nr_items);
                btrfs_end_transaction(handle);
                break;

        case FLUSH_DELALLOC:
        case FLUSH_DELALLOC_WAIT:
        case FLUSH_DELALLOC_FULL:
                if (state == FLUSH_DELALLOC_FULL)
                        num_bytes = U64_MAX;
                /* Every delalloc state except plain FLUSH_DELALLOC waits. */
                shrink_delalloc(fs_info, space_info, num_bytes,
                                state != FLUSH_DELALLOC, for_preempt);
                break;

        case FLUSH_DELAYED_REFS_NR:
        case FLUSH_DELAYED_REFS:
                handle = btrfs_join_transaction_nostart(tree_root);
                if (IS_ERR(handle)) {
                        err = PTR_ERR(handle);
                        if (err == -ENOENT)
                                err = 0;
                        break;
                }
                /* The result of running delayed refs is deliberately unused. */
                btrfs_run_delayed_refs(handle,
                                       state == FLUSH_DELAYED_REFS_NR ? num_bytes : 0);
                btrfs_end_transaction(handle);
                break;

        case ALLOC_CHUNK:
        case ALLOC_CHUNK_FORCE:
                handle = btrfs_join_transaction(tree_root);
                if (IS_ERR(handle)) {
                        err = PTR_ERR(handle);
                        break;
                }
                profile = btrfs_get_alloc_profile(fs_info, space_info->flags);
                err = btrfs_chunk_alloc(handle, profile,
                                        (state == ALLOC_CHUNK_FORCE) ?
                                        CHUNK_ALLOC_FORCE : CHUNK_ALLOC_NO_FORCE);
                btrfs_end_transaction(handle);

                /* Positive results and -ENOSPC are both reported as 0. */
                if (err > 0 || err == -ENOSPC)
                        err = 0;
                break;

        case RUN_DELAYED_IPUTS:
                /*
                 * Pending delayed iputs may release a good amount of pinned
                 * space, so run them and wait for them to finish before any
                 * later state looks at pinned bytes.
                 */
                btrfs_run_delayed_iputs(fs_info);
                btrfs_wait_on_delayed_iputs(fs_info);
                break;

        case COMMIT_TRANS:
                ASSERT(current->journal_info == NULL);
                /*
                 * Do not start a new transaction here: attach to the running
                 * one, or wait for an in-progress commit to complete.  A
                 * nostart join is not suitable because it only waits for the
                 * transaction to become unblocked (TRANS_STATE_UNBLOCKED),
                 * not for it to fully commit.
                 */
                handle = btrfs_attach_transaction_barrier(tree_root);
                if (IS_ERR(handle)) {
                        err = PTR_ERR(handle);
                        if (err == -ENOENT)
                                err = 0;
                        break;
                }
                err = btrfs_commit_transaction(handle);
                break;

        default:
                err = -ENOSPC;
                break;
        }

        trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
                                err, for_preempt);
}

/*
 * Work out how many metadata bytes the flushing code should try to reclaim.
 *
 * The starting point is space_info->reclaim_size.  If the space currently in
 * use exceeds total_bytes plus whatever calc_available_free_space() reports,
 * that overage is added on top.
 *
 * Must be called with space_info->lock held.
 */
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
                                 struct btrfs_space_info *space_info)
{
        u64 reclaim;
        u64 capacity;
        u64 in_use;

        lockdep_assert_held(&space_info->lock);

        reclaim = space_info->reclaim_size;
        capacity = space_info->total_bytes +
                   calc_available_free_space(fs_info, space_info,
                                             BTRFS_RESERVE_FLUSH_ALL);
        in_use = btrfs_space_info_used(space_info, true);

        /*
         * We may be flushing because the free space suddenly shrank and we are
         * now well over-committed.  Fold the overage in so the flushing state
         * machine gets an appropriate amount of pressure.
         */
        if (in_use > capacity)
                reclaim += in_use - capacity;

        return reclaim;
}

/*
 * Decide whether preemptive (background) flushing is worthwhile for
 * @space_info right now.
 *
 * Must be called with space_info->lock held.
 *
 * Returns true when the reclaimable amount has reached the clamped free space
 * threshold and the filesystem is neither closing nor being remounted.
 */
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info)
{
        const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
        const u64 full_thresh = mult_perc(space_info->total_bytes, 90);
        u64 ordered, delalloc;
        u64 flushable;
        u64 committed;
        u64 free_thresh;
        u64 reclaimable;

        lockdep_assert_held(&space_info->lock);

        /* When we are simply full, async reclaim only slows things down. */
        if (space_info->bytes_used + space_info->bytes_reserved +
            global_rsv_size >= full_thresh)
                return false;

        flushable = space_info->bytes_may_use + space_info->bytes_pinned;

        /*
         * Skip flushing when everything flushable belongs to the global rsv,
         * or when what is left for other reservations is at most 128MiB
         * (1/4 of the maximum global rsv size): there is not much that needs
         * flushing in that case.
         */
        if (flushable <= global_rsv_size ||
            flushable - global_rsv_size <= SZ_128M)
                return false;

        /* Tickets are queued, do not compete with the async flushers. */
        if (space_info->reclaim_size)
                return false;

        /*
         * Start flushing once more than half of the free space is taken up by
         * reservations or pinned bytes.
         *
         * The traditional check would be
         *
         *   if (used >= ((total_bytes + avail) / 2))
         *     return 1;
         *
         * but that does not behave the way we want.  With more than 50% of
         * the space_info consumed by bytes_used and nothing available, the
         * background flusher would run constantly.  Instead it should kick in
         * when the reclaimable space exceeds the clamped free space.
         *
         * The clamp ranges from 2^1 to 2^8, which in practice means:
         *
         * Amount of RAM        Minimum threshold       Maximum threshold
         *
         *        256GiB                     1GiB                  128GiB
         *        128GiB                   512MiB                   64GiB
         *         64GiB                   256MiB                   32GiB
         *         32GiB                   128MiB                   16GiB
         *         16GiB                    64MiB                    8GiB
         *
         * That is the range the threshold falls in, i.e. how much delalloc is
         * needed before the background flusher starts.
         */
        free_thresh = calc_available_free_space(fs_info, space_info,
                                                BTRFS_RESERVE_FLUSH_ALL);
        committed = space_info->bytes_used + space_info->bytes_reserved +
                    space_info->bytes_readonly + global_rsv_size;
        if (space_info->total_bytes > committed)
                free_thresh += space_info->total_bytes - committed;
        free_thresh >>= space_info->clamp;

        /*
         * More ordered bytes than delalloc bytes means either heavy DIO or
         * little delalloc waiting around.  Preemptive flushing only helps by
         * freeing space before tickets have to wait; preemptively waiting on
         * ordered extents gains nothing, since reservations tied up in them
         * just force writers to wait on ordered extents anyway.
         *
         * So when ordered is the larger of the two, count only the block
         * reserves we can reclaim from directly.  A metadata-heavy workload
         * will still be heavy enough to warrant preemptive flushing, while
         * heavy DIO or ordered reservations would only waste time.
         *
         * Ordered is halved before the comparison to be sure we really are
         * dominated by ordered extents.  Otherwise we could start flushing,
         * reach delalloc == ordered, and stop while several gigabytes of
         * delalloc remain to be flushed.
         */
        ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
        delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);

        reclaimable = space_info->bytes_pinned;
        if (delalloc > ordered)
                reclaimable += space_info->bytes_may_use - global_rsv_size;
        else
                reclaimable += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) +
                               btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv);

        if (reclaimable < free_thresh)
                return false;
        if (btrfs_fs_closing(fs_info))
                return false;
        return !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
}

/*
 * Try to satisfy @ticket by taking its bytes out of the global block reserve.
 *
 * Only tickets flagged with ->steal qualify, and only when the global reserve
 * is accounted in @space_info.  The steal is refused if it would leave the
 * reserve with less than 10% of its size.
 *
 * On success the ticket is removed from its list, its byte count is zeroed,
 * its waiter is woken and space_info->tickets_id is bumped.
 *
 * Returns true if the ticket was satisfied, false otherwise.
 */
static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
                                  struct btrfs_space_info *space_info,
                                  struct reserve_ticket *ticket)
{
        struct btrfs_block_rsv *rsv = &fs_info->global_block_rsv;
        bool stolen = false;
        u64 keep_bytes;

        if (!ticket->steal || rsv->space_info != space_info)
                return false;

        spin_lock(&rsv->lock);
        keep_bytes = mult_perc(rsv->size, 10);
        if (rsv->reserved >= keep_bytes + ticket->bytes) {
                rsv->reserved -= ticket->bytes;
                /* Remove the ticket while ->bytes still holds its size. */
                remove_ticket(space_info, ticket);
                ticket->bytes = 0;
                wake_up(&ticket->wait);
                space_info->tickets_id++;
                if (rsv->reserved < rsv->size)
                        rsv->full = 0;
                stolen = true;
        }
        spin_unlock(&rsv->lock);

        return stolen;
}

/*
 * We've exhausted our flushing, start failing tickets.
 *
 * @fs_info - fs_info for this fs
 * @space_info - the space info we were flushing
 *
 * We call this when we've exhausted our flushing ability and haven't made
 * progress in satisfying tickets.  The reservation code handles tickets in
 * order, so if there is a large ticket first and then smaller ones we could
 * very well satisfy the smaller tickets.  This will attempt to wake up any
 * tickets in the list to catch this case.
 *
 * This function returns true if it was able to make progress by clearing out
 * other tickets, or if it stumbles across a ticket that was smaller than the
 * first ticket.
 */
static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
                                   struct btrfs_space_info *space_info)
{
        struct reserve_ticket *ticket;
        u64 tickets_id = space_info->tickets_id;
        const bool aborted = BTRFS_FS_ERROR(fs_info);

        trace_btrfs_fail_all_tickets(fs_info, space_info);

        if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
                btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
                __btrfs_dump_space_info(fs_info, space_info);
        }

        while (!list_empty(&space_info->tickets) &&
               tickets_id == space_info->tickets_id) {
                ticket = list_first_entry(&space_info->tickets,
                                          struct reserve_ticket, list);

                if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
                        return true;

                if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                        btrfs_info(fs_info, "failing ticket with %llu bytes",
                                   ticket->bytes);

                remove_ticket(space_info, ticket);
                if (aborted)
                        ticket->error = -EIO;
                else
                        ticket->error = -ENOSPC;
                wake_up(&ticket->wait);

                /*
                 * We're just throwing tickets away, so more flushing may not
                 * trip over btrfs_try_granting_tickets, so we need to call it
                 * here to see if we can make progress with the next ticket in
                 * the list.
                 */
                if (!aborted)
                        btrfs_try_granting_tickets(fs_info, space_info);
        }
        return (tickets_id != space_info->tickets_id);
}

/*
 * Work function for normal (ticket driven) metadata flushing.  This path may
 * block for as long as it takes: it keeps looping over the flush states for as
 * long as progress is being made.  Progress is detected by a change of
 * space_info->tickets_id between two passes of the loop.
 *
 * @work: the async_reclaim_work member embedded in a struct btrfs_fs_info
 *
 * space_info->flush is cleared on every path where the worker stops, except
 * when the loop ends with tickets still queued after a restart was requested.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
        struct btrfs_fs_info *fs_info;
        struct btrfs_space_info *space_info;
        u64 to_reclaim;
        enum btrfs_flush_state flush_state;
        int commit_cycles = 0;
        u64 last_tickets_id;

        fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
        /* Nothing to reclaim: mark flushing as finished and stop. */
        if (!to_reclaim) {
                space_info->flush = 0;
                spin_unlock(&space_info->lock);
                return;
        }
        last_tickets_id = space_info->tickets_id;
        spin_unlock(&space_info->lock);

        flush_state = FLUSH_DELAYED_ITEMS_NR;
        do {
                /* flush_space() is called without space_info->lock held. */
                flush_space(fs_info, space_info, to_reclaim, flush_state, false);
                spin_lock(&space_info->lock);
                /* All tickets are gone, we are done. */
                if (list_empty(&space_info->tickets)) {
                        space_info->flush = 0;
                        spin_unlock(&space_info->lock);
                        return;
                }
                to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
                                                              space_info);
                if (last_tickets_id == space_info->tickets_id) {
                        /* No ticket activity: escalate to the next state. */
                        flush_state++;
                } else {
                        /* Progress was made: restart from the first state. */
                        last_tickets_id = space_info->tickets_id;
                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                        if (commit_cycles)
                                commit_cycles--;
                }

                /*
                 * We do not want to empty the system of delalloc unless we're
                 * under heavy pressure, so allow one trip through the flushing
                 * logic before we start doing a FLUSH_DELALLOC_FULL.
                 */
                if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
                        flush_state++;

                /*
                 * We don't want to force a chunk allocation until we've tried
                 * pretty hard to reclaim space.  Think of the case where we
                 * freed up a bunch of space and so have a lot of pinned space
                 * to reclaim.  We would rather use that than possibly create a
                 * underutilized metadata chunk.  So if this is our first run
                 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
                 * commit the transaction.  If nothing has changed the next go
                 * around then we can force a chunk allocation.
                 */
                if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
                        flush_state++;

                /*
                 * We stepped past COMMIT_TRANS, i.e. a full cycle finished
                 * without progress.  The first two such cycles just restart.
                 * After that, try failing tickets: if that helps, restart,
                 * otherwise clear ->flush and leave flush_state past
                 * COMMIT_TRANS so the loop condition below terminates.
                 */
                if (flush_state > COMMIT_TRANS) {
                        commit_cycles++;
                        if (commit_cycles > 2) {
                                if (maybe_fail_all_tickets(fs_info, space_info)) {
                                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                                        commit_cycles--;
                                } else {
                                        space_info->flush = 0;
                                }
                        } else {
                                flush_state = FLUSH_DELAYED_ITEMS_NR;
                        }
                }
                spin_unlock(&space_info->lock);
        } while (flush_state <= COMMIT_TRANS);
}

/*
 * This handles pre-flushing of metadata space before we get to the point that
 * we need to start blocking threads on tickets.  The logic here is different
 * from the other flush paths because it doesn't rely on tickets to tell us how
 * much we need to flush, instead it attempts to keep us below the 80% full
 * watermark of space by flushing whichever reservation pool is currently the
 * largest.
 *
 * @work: the preempt_reclaim_work member embedded in a struct btrfs_fs_info
 *
 * The loop runs for as long as need_preemptive_reclaim() returns true.  Each
 * pass picks one flush state under space_info->lock, drops the lock around
 * flush_space() and retakes it before the condition is evaluated again.
 *
 * NOTE(review): the 80% figure does not appear in this function; the actual
 * trigger is whatever need_preemptive_reclaim() decides - confirm the number.
 */
static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
{
        struct btrfs_fs_info *fs_info;
        struct btrfs_space_info *space_info;
        struct btrfs_block_rsv *delayed_block_rsv;
        struct btrfs_block_rsv *delayed_refs_rsv;
        struct btrfs_block_rsv *global_rsv;
        struct btrfs_block_rsv *trans_rsv;
        /* Number of passes through the flushing loop below. */
        int loops = 0;

        fs_info = container_of(work, struct btrfs_fs_info,
                               preempt_reclaim_work);
        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
        delayed_block_rsv = &fs_info->delayed_block_rsv;
        delayed_refs_rsv = &fs_info->delayed_refs_rsv;
        global_rsv = &fs_info->global_block_rsv;
        trans_rsv = &fs_info->trans_block_rsv;

        spin_lock(&space_info->lock);
        while (need_preemptive_reclaim(fs_info, space_info)) {
                enum btrfs_flush_state flush;
                u64 delalloc_size = 0;
                u64 to_reclaim, block_rsv_size;
                const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);

                loops++;

                /*
                 * We don't have a precise counter for the metadata being
                 * reserved for delalloc, so we'll approximate it by subtracting
                 * out the block rsv's space from the bytes_may_use.  If that
                 * amount is higher than the individual reserves, then we can
                 * assume it's tied up in delalloc reservations.
                 */
                block_rsv_size = global_rsv_size +
                        btrfs_block_rsv_reserved(delayed_block_rsv) +
                        btrfs_block_rsv_reserved(delayed_refs_rsv) +
                        btrfs_block_rsv_reserved(trans_rsv);
                if (block_rsv_size < space_info->bytes_may_use)
                        delalloc_size = space_info->bytes_may_use - block_rsv_size;

                /*
                 * We don't want to include the global_rsv in our calculation,
                 * because that's space we can't touch.  Subtract it from the
                 * block_rsv_size for the next checks.
                 */
                block_rsv_size -= global_rsv_size;

                /*
                 * We really want to avoid flushing delalloc too much, as it
                 * could result in poor allocation patterns, so only flush it if
                 * it's larger than the rest of the pools combined.
                 *
                 * Otherwise pick, in this order: a transaction commit when
                 * pinned bytes exceed both delayed reserves combined, delayed
                 * items when that reserve is the larger of the two, and
                 * delayed refs as the fallback.
                 */
                if (delalloc_size > block_rsv_size) {
                        to_reclaim = delalloc_size;
                        flush = FLUSH_DELALLOC;
                } else if (space_info->bytes_pinned >
                           (btrfs_block_rsv_reserved(delayed_block_rsv) +
                            btrfs_block_rsv_reserved(delayed_refs_rsv))) {
                        to_reclaim = space_info->bytes_pinned;
                        flush = COMMIT_TRANS;
                } else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
                           btrfs_block_rsv_reserved(delayed_refs_rsv)) {
                        to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv);
                        flush = FLUSH_DELAYED_ITEMS_NR;
                } else {
                        to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv);
                        flush = FLUSH_DELAYED_REFS_NR;
                }

                /* Flushing is done without space_info->lock held. */
                spin_unlock(&space_info->lock);

                /*
                 * We don't want to reclaim everything, just a portion, so scale
                 * down the to_reclaim by 1/4.  If it takes us down to 0,
                 * reclaim 1 items worth.
                 */
                to_reclaim >>= 2;
                if (!to_reclaim)
                        to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
                flush_space(fs_info, space_info, to_reclaim, flush, true);
                cond_resched();
                spin_lock(&space_info->lock);
        }

        /*
         * We only went through once, back off our clamping.  The clamp is
         * lowered by one but never below 1, and only if no tickets are
         * currently accounted in reclaim_size.
         */
        if (loops == 1 && !space_info->reclaim_size)
                space_info->clamp = max(1, space_info->clamp - 1);
        trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
        spin_unlock(&space_info->lock);
}

/*
 * Flush states used for data space reclaim, in the order they are tried.  The
 * reasoning behind each entry:
 *
 * FLUSH_DELALLOC_FULL:
 *   Space is freed from flushing delalloc in one of two ways.
 *
 *   1) compression is on and we allocate less space than we reserved
 *   2) we are overwriting existing space
 *
 *   For #1 that extra space is reclaimed as soon as the delalloc pages are
 *   COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
 *   length to ->bytes_reserved, and subtracts the reserved space from
 *   ->bytes_may_use.
 *
 *   For #2 this is trickier.  Once the ordered extent runs we will drop the
 *   extent in the range we are overwriting, which creates a delayed ref for
 *   that freed extent.  This however is not reclaimed until the transaction
 *   commits, thus the next stages.
 *
 * RUN_DELAYED_IPUTS
 *   If we are freeing inodes, we want to make sure all delayed iputs have
 *   completed, because they could have been on an inode with i_nlink == 0, and
 *   thus have been truncated and freed up space.  But again this space is not
 *   immediately re-usable, it comes in the form of a delayed ref, which must be
 *   run and then the transaction must be committed.
 *
 * COMMIT_TRANS
 *   This is where we reclaim all of the pinned space generated by running the
 *   iputs
 *
 * ALLOC_CHUNK_FORCE
 *   For data we start with alloc chunk force, however we could have been full
 *   before, and then the transaction commit could have freed new block groups,
 *   so if we now have space to allocate do the force chunk allocation.
 */
static const enum btrfs_flush_state data_flush_states[] = {
        FLUSH_DELALLOC_FULL,
        RUN_DELAYED_IPUTS,
        COMMIT_TRANS,
        ALLOC_CHUNK_FORCE,
};

/*
 * Work function for ticket driven data space reclaim.
 *
 * @work: the async_data_reclaim_work member embedded in a struct btrfs_fs_info
 *
 * Phase one forces chunk allocations for as long as the data space_info is
 * not marked full.  Phase two walks data_flush_states[], restarting from the
 * first entry whenever space_info->tickets_id changed, and fails the queued
 * tickets once the whole list was tried with the space_info full.
 *
 * The worker stops as soon as the ticket list is empty.  If the filesystem is
 * in an error state all tickets are failed and the worker bails out.
 */
static void btrfs_async_reclaim_data_space(struct work_struct *work)
{
        struct btrfs_fs_info *fs_info;
        struct btrfs_space_info *space_info;
        u64 last_tickets_id;
        /* Index into data_flush_states[]. */
        enum btrfs_flush_state flush_state = 0;

        fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
        space_info = fs_info->data_sinfo;

        spin_lock(&space_info->lock);
        /* No tickets queued, nothing to do. */
        if (list_empty(&space_info->tickets)) {
                space_info->flush = 0;
                spin_unlock(&space_info->lock);
                return;
        }
        last_tickets_id = space_info->tickets_id;
        spin_unlock(&space_info->lock);

        /* Phase one: keep forcing chunk allocation until we are full. */
        while (!space_info->full) {
                flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets)) {
                        space_info->flush = 0;
                        spin_unlock(&space_info->lock);
                        return;
                }

                /* Something happened, fail everything and bail. */
                if (BTRFS_FS_ERROR(fs_info))
                        goto aborted_fs;
                last_tickets_id = space_info->tickets_id;
                spin_unlock(&space_info->lock);
        }

        /* Phase two: step through the data flush states. */
        while (flush_state < ARRAY_SIZE(data_flush_states)) {
                flush_space(fs_info, space_info, U64_MAX,
                            data_flush_states[flush_state], false);
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets)) {
                        space_info->flush = 0;
                        spin_unlock(&space_info->lock);
                        return;
                }

                if (last_tickets_id == space_info->tickets_id) {
                        /* No ticket activity: move on to the next state. */
                        flush_state++;
                } else {
                        /* Progress was made: start over. */
                        last_tickets_id = space_info->tickets_id;
                        flush_state = 0;
                }

                /*
                 * Every state was tried.  If we are full, fail tickets and
                 * only restart when that made progress; otherwise ->flush is
                 * cleared and flush_state stays out of range, which ends the
                 * loop.  If we are not full, simply start over.
                 */
                if (flush_state >= ARRAY_SIZE(data_flush_states)) {
                        if (space_info->full) {
                                if (maybe_fail_all_tickets(fs_info, space_info))
                                        flush_state = 0;
                                else
                                        space_info->flush = 0;
                        } else {
                                flush_state = 0;
                        }

                        /* Something happened, fail everything and bail. */
                        if (BTRFS_FS_ERROR(fs_info))
                                goto aborted_fs;

                }
                spin_unlock(&space_info->lock);
        }
        return;

        /* Reached with space_info->lock held; it is released below. */
aborted_fs:
        maybe_fail_all_tickets(fs_info, space_info);
        space_info->flush = 0;
        spin_unlock(&space_info->lock);
}

/*
 * Bind each of the three space reclaim work items embedded in @fs_info to its
 * worker function.
 */
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
        /* Preemptive metadata flushing. */
        INIT_WORK(&fs_info->preempt_reclaim_work,
                  btrfs_preempt_reclaim_metadata_space);

        /* Ticket driven metadata reclaim. */
        INIT_WORK(&fs_info->async_reclaim_work,
                  btrfs_async_reclaim_metadata_space);

        /* Ticket driven data reclaim. */
        INIT_WORK(&fs_info->async_data_reclaim_work,
                  btrfs_async_reclaim_data_space);
}

/*
 * Short, ordered list of flush states: delayed items (bounded, then all) and
 * a non-forced chunk allocation.
 *
 * NOTE(review): presumably handed to priority_reclaim_metadata_space() as its
 * @states array for priority reservations - the call site is not in view
 * here, confirm.
 */
static const enum btrfs_flush_state priority_flush_states[] = {
        FLUSH_DELAYED_ITEMS_NR,
        FLUSH_DELAYED_ITEMS,
        ALLOC_CHUNK,
};

/*
 * Longer, ordered list of flush states: delayed items, delayed refs, the three
 * delalloc states, a non-forced chunk allocation and finally a transaction
 * commit.
 *
 * NOTE(review): the name suggests this is the @states array used when
 * reserving space for inode eviction - the call site is not in view here,
 * confirm.
 */
static const enum btrfs_flush_state evict_flush_states[] = {
        FLUSH_DELAYED_ITEMS_NR,
        FLUSH_DELAYED_ITEMS,
        FLUSH_DELAYED_REFS_NR,
        FLUSH_DELAYED_REFS,
        FLUSH_DELALLOC,
        FLUSH_DELALLOC_WAIT,
        FLUSH_DELALLOC_FULL,
        ALLOC_CHUNK,
        COMMIT_TRANS,
};

/*
 * Synchronously flush metadata space on behalf of a single priority ticket.
 *
 * @fs_info:    the filesystem
 * @space_info: space info the ticket is queued on
 * @ticket:     the ticket to satisfy
 * @states:     flush states to run, in order
 * @states_nr:  number of entries in @states
 *
 * Each state is run once, with space_info->lock dropped around flush_space().
 * The function returns as soon as the ticket has been satisfied
 * (ticket->bytes == 0).  If every state was tried without success, the ticket
 * is failed or satisfied from the global rsv, and the remaining tickets get a
 * chance to be granted.
 */
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
                                struct btrfs_space_info *space_info,
                                struct reserve_ticket *ticket,
                                const enum btrfs_flush_state *states,
                                int states_nr)
{
        u64 to_reclaim;
        int i;

        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);

        /*
         * On this priority path to_reclaim can still be non-zero even though
         * our ticket is done: only the priority tickets may have been
         * satisfied, leaving non priority tickets on the list.  So check
         * ->bytes rather than to_reclaim.
         */
        if (ticket->bytes == 0)
                goto out_unlock;

        for (i = 0; i < states_nr; i++) {
                spin_unlock(&space_info->lock);
                flush_space(fs_info, space_info, to_reclaim, states[i], false);
                spin_lock(&space_info->lock);
                if (ticket->bytes == 0)
                        goto out_unlock;
        }

        /*
         * Try to steal from the global rsv, unless the fs went into error
         * mode because a transaction was aborted while flushing above.  In
         * that case fail with the abort error rather than reporting success
         * through a steal: the caller then fails immediately instead of later
         * when it tries to modify the fs, which makes -ENOSPC problems easier
         * to debug.
         */
        if (BTRFS_FS_ERROR(fs_info)) {
                ticket->error = BTRFS_FS_ERROR(fs_info);
                remove_ticket(space_info, ticket);
        } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
                ticket->error = -ENOSPC;
                remove_ticket(space_info, ticket);
        }

        /*
         * We may have been a large ticket sitting in front of a smaller one
         * that fits in the available space now, so try granting tickets.
         */
        btrfs_try_granting_tickets(fs_info, space_info);

out_unlock:
        spin_unlock(&space_info->lock);
}

/*
 * Synchronously reclaim data space on behalf of a single priority ticket.
 *
 * Forced chunk allocation is repeated for as long as @space_info is not full
 * and the ticket is still unsatisfied.  Once the space_info is full the
 * ticket is failed with -ENOSPC and the remaining tickets get a chance to be
 * granted.
 */
static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
                                        struct btrfs_space_info *space_info,
                                        struct reserve_ticket *ticket)
{
        spin_lock(&space_info->lock);

        /* The ticket may already have been granted before we got the lock. */
        while (ticket->bytes != 0) {
                if (space_info->full) {
                        ticket->error = -ENOSPC;
                        remove_ticket(space_info, ticket);
                        btrfs_try_granting_tickets(fs_info, space_info);
                        break;
                }

                /* Flushing is done without space_info->lock held. */
                spin_unlock(&space_info->lock);
                flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
                spin_lock(&space_info->lock);
        }

        spin_unlock(&space_info->lock);
}

/*
 * Sleep until a queued ticket is either satisfied or failed.
 *
 * @fs_info:    the filesystem (not referenced in this function body)
 * @space_info: space info the ticket belongs to; the ticket state is examined
 *              with space_info->lock held
 * @ticket:     the ticket to wait on
 *
 * Loops while ticket->bytes is non-zero and ticket->error is unset, sleeping
 * on ticket->wait in TASK_KILLABLE state.  If prepare_to_wait_event() returns
 * non-zero, the ticket is removed and fails with -EINTR.
 */
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
                                struct btrfs_space_info *space_info,
                                struct reserve_ticket *ticket)

{
        DEFINE_WAIT(wait);
        int ret = 0;

        spin_lock(&space_info->lock);
        while (ticket->bytes > 0 && ticket->error == 0) {
                ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
                if (ret) {
                        /*
                         * Delete us from the list. After we unlock the space
                         * info, we don't want the async reclaim job to reserve
                         * space for this ticket. If that would happen, then the
                         * ticket's task would not know that space was reserved
                         * despite getting an error, resulting in a space leak
                         * (bytes_may_use counter of our space_info).
                         */
                        remove_ticket(space_info, ticket);
                        ticket->error = -EINTR;
                        break;
                }
                /* Drop the lock while sleeping; it is retaken before re-checking. */
                spin_unlock(&space_info->lock);

                schedule();

                finish_wait(&ticket->wait, &wait);
                spin_lock(&space_info->lock);
        }
        spin_unlock(&space_info->lock);
}

/*
 * Drive a queued ticket to completion and report how it ended.
 *
 * @fs_info:    the filesystem
 * @space_info: space info for the reservation
 * @ticket:     ticket for the reservation
 * @start_ns:   timestamp when the reservation started
 * @orig_bytes: amount of bytes originally reserved
 * @flush:      how much we can flush
 *
 * Depending on @flush this either waits for the ticket to be resolved or runs
 * one of the priority reclaim loops on behalf of the ticket, then emits the
 * reserve-ticket tracepoint.
 *
 * Return: the value of ticket->error once the flushing/waiting has finished.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
                                 struct btrfs_space_info *space_info,
                                 struct reserve_ticket *ticket,
                                 u64 start_ns, u64 orig_bytes,
                                 enum btrfs_reserve_flush_enum flush)
{
        int error;

        switch (flush) {
        case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
                priority_reclaim_data_space(fs_info, space_info, ticket);
                break;
        case BTRFS_RESERVE_FLUSH_EVICT:
                priority_reclaim_metadata_space(fs_info, space_info, ticket,
                                                evict_flush_states,
                                                ARRAY_SIZE(evict_flush_states));
                break;
        case BTRFS_RESERVE_FLUSH_LIMIT:
                priority_reclaim_metadata_space(fs_info, space_info, ticket,
                                                priority_flush_states,
                                                ARRAY_SIZE(priority_flush_states));
                break;
        case BTRFS_RESERVE_FLUSH_ALL_STEAL:
        case BTRFS_RESERVE_FLUSH_ALL:
        case BTRFS_RESERVE_FLUSH_DATA:
                /* These modes just sleep until the ticket is resolved. */
                wait_reserve_ticket(fs_info, space_info, ticket);
                break;
        default:
                ASSERT(0);
                break;
        }

        error = ticket->error;

        /* Whichever path ran, the ticket must be off its list by now. */
        ASSERT(list_empty(&ticket->list));
        /*
         * A granted ticket (bytes == 0) must never carry an error: callers
         * treat an error as "nothing was reserved" and would not release the
         * space, leaking it.
         */
        ASSERT(!(ticket->bytes == 0 && ticket->error));
        trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
                                   start_ns, flush, ticket->error);
        return error;
}

/*
 * This returns true if this flush state will go through the ordinary flushing
 * code.
 */
static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
{
        return        (flush == BTRFS_RESERVE_FLUSH_ALL) ||
                (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
}

/*
 * Bump space_info->clamp by one (capped at 8) when the filesystem has fewer
 * ordered bytes than delalloc bytes.
 */
static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
                                       struct btrfs_space_info *space_info)
{
        const u64 ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
        const u64 delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);

        /*
         * Clamping only helps us keep pace with buffered writers dirtying
         * pages: writing out delalloc does not free space 1:1 the way flushing
         * delayed refs or delayed nodes does.  When ordered already matches or
         * exceeds delalloc we are keeping up, so leave the clamp alone.
         */
        if (ordered_bytes >= delalloc_bytes)
                return;

        space_info->clamp = min(space_info->clamp + 1, 8);
}

static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
{
        return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
                flush == BTRFS_RESERVE_FLUSH_EVICT);
}

/*
 * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to
 * fail as quickly as possible.
 */
static inline bool can_ticket(enum btrfs_reserve_flush_enum flush)
{
        return (flush != BTRFS_RESERVE_NO_FLUSH &&
                flush != BTRFS_RESERVE_FLUSH_EMERGENCY);
}

/*
 * Try to reserve bytes from the block_rsv's space.
 *
 * @fs_info:    the filesystem
 * @space_info: space info we want to allocate from
 * @orig_bytes: number of bytes we want
 * @flush:      whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 *
 * Return: 0 if the reservation was accounted right away through
 * btrfs_space_info_update_bytes_may_use(); -ENOSPC if it could not be and
 * @flush does not allow queueing a ticket (see can_ticket()); otherwise the
 * result of handle_reserve_ticket() for the ticket queued here.
 */
static int __reserve_bytes(struct btrfs_fs_info *fs_info,
                           struct btrfs_space_info *space_info, u64 orig_bytes,
                           enum btrfs_reserve_flush_enum flush)
{
        struct work_struct *async_work;
        /* Lives on this stack frame; only filled in if the fast path fails. */
        struct reserve_ticket ticket;
        u64 start_ns = 0;
        u64 used;
        /* Pessimistic default, cleared once a reservation goes through. */
        int ret = -ENOSPC;
        bool pending_tickets;

        ASSERT(orig_bytes);
        /*
         * If we have a transaction handle (current->journal_info != NULL), then
         * the flush method can be neither BTRFS_RESERVE_FLUSH_ALL* nor
         * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those
         * flushing methods can trigger transaction commits.
         */
        if (current->journal_info) {
                /* One assert per line for easier debugging. */
                ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL);
                ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL);
                ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT);
        }

        /* FLUSH_DATA reservations kick the data reclaim worker, others the generic one. */
        if (flush == BTRFS_RESERVE_FLUSH_DATA)
                async_work = &fs_info->async_data_reclaim_work;
        else
                async_work = &fs_info->async_reclaim_work;

        spin_lock(&space_info->lock);
        used = btrfs_space_info_used(space_info, true);

        /*
         * We don't want NO_FLUSH allocations to jump everybody, they can
         * generally handle ENOSPC in a different way, so treat them the same as
         * normal flushers when it comes to skipping pending tickets.
         */
        if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
                pending_tickets = !list_empty(&space_info->tickets) ||
                        !list_empty(&space_info->priority_tickets);
        else
                pending_tickets = !list_empty(&space_info->priority_tickets);

        /*
         * Carry on if we have enough space (short-circuit) OR call
         * can_overcommit() to ensure we can overcommit to continue.
         */
        if (!pending_tickets &&
            ((used + orig_bytes <= space_info->total_bytes) ||
             btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
                btrfs_space_info_update_bytes_may_use(fs_info, space_info,
                                                      orig_bytes);
                ret = 0;
        }

        /*
         * Things are dire, we need to make a reservation so we don't abort.  We
         * will let this reservation go through as long as we have actual space
         * left to allocate for the block.
         */
        if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
                /* Recompute usage, this time passing false as the second argument. */
                used = btrfs_space_info_used(space_info, false);
                if (used + orig_bytes <= space_info->total_bytes) {
                        btrfs_space_info_update_bytes_may_use(fs_info, space_info,
                                                              orig_bytes);
                        ret = 0;
                }
        }

        /*
         * If we couldn't make a reservation then setup our reservation ticket
         * and kick the async worker if it's not already running.
         *
         * If we are a priority flusher then we just need to add our ticket to
         * the list and we will do our own flushing further down.
         */
        if (ret && can_ticket(flush)) {
                ticket.bytes = orig_bytes;
                ticket.error = 0;
                space_info->reclaim_size += ticket.bytes;
                init_waitqueue_head(&ticket.wait);
                ticket.steal = can_steal(flush);
                /* Only pay for the timestamp when the tracepoint is enabled. */
                if (trace_btrfs_reserve_ticket_enabled())
                        start_ns = ktime_get_ns();

                if (flush == BTRFS_RESERVE_FLUSH_ALL ||
                    flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
                    flush == BTRFS_RESERVE_FLUSH_DATA) {
                        list_add_tail(&ticket.list, &space_info->tickets);
                        if (!space_info->flush) {
                                /*
                                 * We were forced to add a reserve ticket, so
                                 * our preemptive flushing is unable to keep
                                 * up.  Clamp down on the threshold for the
                                 * preemptive flushing in order to keep up with
                                 * the workload.
                                 */
                                maybe_clamp_preempt(fs_info, space_info);

                                space_info->flush = 1;
                                trace_btrfs_trigger_flush(fs_info,
                                                          space_info->flags,
                                                          orig_bytes, flush,
                                                          "enospc");
                                queue_work(system_unbound_wq, async_work);
                        }
                } else {
                        list_add_tail(&ticket.list,
                                      &space_info->priority_tickets);
                }
        } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
                /*
                 * We will do the space reservation dance during log replay,
                 * which means we won't have fs_info->fs_root set, so don't do
                 * the async reclaim as we will panic.
                 */
                if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
                    !work_busy(&fs_info->preempt_reclaim_work) &&
                    need_preemptive_reclaim(fs_info, space_info)) {
                        trace_btrfs_trigger_flush(fs_info, space_info->flags,
                                                  orig_bytes, flush, "preempt");
                        queue_work(system_unbound_wq,
                                   &fs_info->preempt_reclaim_work);
                }
        }
        spin_unlock(&space_info->lock);
        /* In both of these cases no ticket was queued above. */
        if (!ret || !can_ticket(flush))
                return ret;

        return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
                                     orig_bytes, flush);
}

/*
 * Reserve metadata bytes in a space_info.
 *
 * @fs_info:    the filesystem
 * @space_info: the space_info we're allocating for
 * @orig_bytes: number of bytes we want
 * @flush:      whether or not we can flush to make our reservation
 *
 * Thin wrapper around __reserve_bytes().  When that fails with -ENOSPC the
 * "space_info:enospc" tracepoint is emitted and, if the ENOSPC_DEBUG option is
 * set, the space_info is dumped.
 *
 * Return: whatever __reserve_bytes() returned.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
                                 struct btrfs_space_info *space_info,
                                 u64 orig_bytes,
                                 enum btrfs_reserve_flush_enum flush)
{
        const int ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush);

        /* Nothing extra to report unless we ran out of space. */
        if (ret != -ENOSPC)
                return ret;

        trace_btrfs_space_reservation(fs_info, "space_info:enospc",
                                      space_info->flags, orig_bytes, 1);
        if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0);

        return ret;
}

/*
 * Reserve data bytes for an allocation.
 *
 * @fs_info: the filesystem
 * @bytes:   number of bytes we need
 * @flush:   how we are allowed to flush
 *
 * Reserves @bytes from fs_info->data_sinfo through __reserve_bytes().  Only
 * FLUSH_DATA, FLUSH_FREE_SPACE_INODE and NO_FLUSH are accepted, and FLUSH_DATA
 * must not be used while current->journal_info is set.  On -ENOSPC the
 * "space_info:enospc" tracepoint is emitted and, if the ENOSPC_DEBUG option is
 * set, the data space_info is dumped.
 *
 * Return: whatever __reserve_bytes() returned.
 */
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
                             enum btrfs_reserve_flush_enum flush)
{
        struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
        int ret;

        ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
               flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
               flush == BTRFS_RESERVE_NO_FLUSH);
        ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);

        ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
        if (ret != -ENOSPC)
                return ret;

        /* Out of space: leave a trace and optionally dump the state. */
        trace_btrfs_space_reservation(fs_info, "space_info:enospc",
                                      data_sinfo->flags, bytes, 1);
        if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);

        return ret;
}

/*
 * Used when a transaction is aborted because of ENOSPC: print every
 * space_info of the filesystem (each under its own lock) followed by the
 * global block reserve.
 */
__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
{
        struct btrfs_space_info *sinfo;

        btrfs_info(fs_info, "dumping space info:");

        list_for_each_entry(sinfo, &fs_info->space_info, list) {
                spin_lock(&sinfo->lock);
                __btrfs_dump_space_info(fs_info, sinfo);
                spin_unlock(&sinfo->lock);
        }

        dump_global_block_rsv(fs_info);
}

/*
 * Add up the unused space (length - used) of every read-only block group on
 * @sinfo's ro_bgs list.  Each group's contribution is multiplied by the factor
 * from btrfs_bg_type_to_factor(), so mirrors are taken into account.
 *
 * Return: the accumulated free bytes, 0 if the list is empty.
 */
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
        struct btrfs_block_group *bg;
        u64 total = 0;

        /* This feeds df, so an unlocked (racy) emptiness check is fine. */
        if (list_empty(&sinfo->ro_bgs))
                return total;

        spin_lock(&sinfo->lock);
        list_for_each_entry(bg, &sinfo->ro_bgs, ro_list) {
                spin_lock(&bg->lock);
                /* Only count groups that are still marked read-only. */
                if (bg->ro) {
                        int factor = btrfs_bg_type_to_factor(bg->flags);

                        total += (bg->length - bg->used) * factor;
                }
                spin_unlock(&bg->lock);
        }
        spin_unlock(&sinfo->lock);

        return total;
}






















































































































































































































































































    2 













    1 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM cgroup

#if !defined(_TRACE_CGROUP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CGROUP_H

#include <linux/cgroup.h>
#include <linux/tracepoint.h>

/*
 * Event class for tracepoints that take a struct cgroup_root.
 *
 * Records the root's hierarchy_id (as "root"), its subsys_mask (as "ss_mask")
 * and its name, and prints them as "root=... ss_mask=... name=...".
 */
DECLARE_EVENT_CLASS(cgroup_root,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        u16,                ss_mask                        )
                __string(        name,                root->name                )
        ),

        TP_fast_assign(
                __entry->root = root->hierarchy_id;
                __entry->ss_mask = root->subsys_mask;
                __assign_str(name);
        ),

        TP_printk("root=%d ss_mask=%#x name=%s",
                  __entry->root, __entry->ss_mask, __get_str(name))
);

/*
 * Events of the cgroup_root class: cgroup_setup_root, cgroup_destroy_root and
 * cgroup_remount.  All three share the class prototype and output format.
 */
DEFINE_EVENT(cgroup_root, cgroup_setup_root,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root)
);

DEFINE_EVENT(cgroup_root, cgroup_destroy_root,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root)
);

DEFINE_EVENT(cgroup_root, cgroup_remount,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root)
);

/*
 * Event class for tracepoints that take a cgroup and its path string.
 *
 * Records the hierarchy_id of the cgroup's root, the value of
 * cgroup_id(cgrp), the cgroup's level and a copy of @path, printed as
 * "root=... id=... level=... path=...".
 */
DECLARE_EVENT_CLASS(cgroup,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        int,                level                        )
                __field(        u64,                id                        )
                __string(        path,                path                        )
        ),

        TP_fast_assign(
                __entry->root = cgrp->root->hierarchy_id;
                __entry->id = cgroup_id(cgrp);
                __entry->level = cgrp->level;
                __assign_str(path);
        ),

        TP_printk("root=%d id=%llu level=%d path=%s",
                  __entry->root, __entry->id, __entry->level, __get_str(path))
);

/*
 * Events of the cgroup class: cgroup_mkdir, cgroup_rmdir, cgroup_release,
 * cgroup_rename, cgroup_freeze and cgroup_unfreeze.  They all share the class
 * prototype (cgroup + path) and output format.
 */
DEFINE_EVENT(cgroup, cgroup_mkdir,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

DEFINE_EVENT(cgroup, cgroup_rmdir,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

DEFINE_EVENT(cgroup, cgroup_release,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

DEFINE_EVENT(cgroup, cgroup_rename,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

DEFINE_EVENT(cgroup, cgroup_freeze,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

DEFINE_EVENT(cgroup, cgroup_unfreeze,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/*
 * Event class for task migration tracepoints.
 *
 * Records the destination cgroup (hierarchy_id of its root, cgroup_id() and
 * level), a copy of @path, and the task's pid and comm.  The @threadgroup
 * argument is part of the prototype but is not stored in the entry.
 */
DECLARE_EVENT_CLASS(cgroup_migrate,

        TP_PROTO(struct cgroup *dst_cgrp, const char *path,
                 struct task_struct *task, bool threadgroup),

        TP_ARGS(dst_cgrp, path, task, threadgroup),

        TP_STRUCT__entry(
                __field(        int,                dst_root                )
                __field(        int,                dst_level                )
                __field(        u64,                dst_id                        )
                __field(        int,                pid                        )
                __string(        dst_path,        path                        )
                __string(        comm,                task->comm                )
        ),

        TP_fast_assign(
                __entry->dst_root = dst_cgrp->root->hierarchy_id;
                __entry->dst_id = cgroup_id(dst_cgrp);
                __entry->dst_level = dst_cgrp->level;
                __assign_str(dst_path);
                __entry->pid = task->pid;
                __assign_str(comm);
        ),

        TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s",
                  __entry->dst_root, __entry->dst_id, __entry->dst_level,
                  __get_str(dst_path), __entry->pid, __get_str(comm))
);

/*
 * Events of the cgroup_migrate class: cgroup_attach_task and
 * cgroup_transfer_tasks.  Both share the class prototype and output format.
 */
DEFINE_EVENT(cgroup_migrate, cgroup_attach_task,

        TP_PROTO(struct cgroup *dst_cgrp, const char *path,
                 struct task_struct *task, bool threadgroup),

        TP_ARGS(dst_cgrp, path, task, threadgroup)
);

DEFINE_EVENT(cgroup_migrate, cgroup_transfer_tasks,

        TP_PROTO(struct cgroup *dst_cgrp, const char *path,
                 struct task_struct *task, bool threadgroup),

        TP_ARGS(dst_cgrp, path, task, threadgroup)
);

/*
 * Event class for notifications that carry an integer value.
 *
 * Records the same cgroup identification as the cgroup class (root
 * hierarchy_id, cgroup_id(), level, path) plus @val, printed as
 * "root=... id=... level=... path=... val=...".
 */
DECLARE_EVENT_CLASS(cgroup_event,

        TP_PROTO(struct cgroup *cgrp, const char *path, int val),

        TP_ARGS(cgrp, path, val),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        int,                level                        )
                __field(        u64,                id                        )
                __string(        path,                path                        )
                __field(        int,                val                        )
        ),

        TP_fast_assign(
                __entry->root = cgrp->root->hierarchy_id;
                __entry->id = cgroup_id(cgrp);
                __entry->level = cgrp->level;
                __assign_str(path);
                __entry->val = val;
        ),

        TP_printk("root=%d id=%llu level=%d path=%s val=%d",
                  __entry->root, __entry->id, __entry->level, __get_str(path),
                  __entry->val)
);

/*
 * Events of the cgroup_event class: cgroup_notify_populated and
 * cgroup_notify_frozen.  Both share the class prototype and output format.
 */
DEFINE_EVENT(cgroup_event, cgroup_notify_populated,

        TP_PROTO(struct cgroup *cgrp, const char *path, int val),

        TP_ARGS(cgrp, path, val)
);

DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,

        TP_PROTO(struct cgroup *cgrp, const char *path, int val),

        TP_ARGS(cgrp, path, val)
);

/*
 * Event class for the rstat lock tracepoints.
 *
 * Records the cgroup identification (root hierarchy_id, cgroup_id(), level)
 * together with the @cpu and @contended arguments, printed as
 * "root=... id=... level=... cpu=... lock contended:...".  No path string is
 * recorded for this class.
 */
DECLARE_EVENT_CLASS(cgroup_rstat,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        int,                level                        )
                __field(        u64,                id                        )
                __field(        int,                cpu                        )
                __field(        bool,                contended                )
        ),

        TP_fast_assign(
                __entry->root = cgrp->root->hierarchy_id;
                __entry->id = cgroup_id(cgrp);
                __entry->level = cgrp->level;
                __entry->cpu = cpu;
                __entry->contended = contended;
        ),

        TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
                  __entry->root, __entry->id, __entry->level,
                  __entry->cpu, __entry->contended)
);

/*
 * Events of the cgroup_rstat class.  All of them share the class prototype
 * (cgroup, cpu, contended) and output format; they are split below into the
 * ones related to the global lock and the ones related to the per-CPU lock.
 */

/* Related to global: cgroup_rstat_lock */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/* Related to per CPU: cgroup_rstat_cpu_lock */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

#endif /* _TRACE_CGROUP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>































































   22 
   23 





    1 
    2 




    9 













































    2 
    7 














































































   14 

    8 
















    6 


















   13 










































































































































































































    1 






    2 































































    1 
    1 




    4 


































    1 















    2 


    1 


    1 



















    1 











    1 


    1 

























































    3 







    3 

    3 

    2 








































    2 





    1 



    2 

    2 






    3 



    1 

    3 







































    1 


    1 



    1 






























    1 





















    1 
















    2 
    3 




















































    8 





   12 
   12 



   11 














    9 




































    8 
    6 










    1 
    6 









    7 


    8 

    7 







    8 


    6 



    5 




    7 
























    6 




























    8 




    8 

























    8 







    8 














    6 


    8 








































   10 
   10 

   11 







    9 

    4 








    9 


    9 




























   21 

   22 






























    3 

    1 





    3 
    3 























































   14 












   13 










   14 












   21 




   22 


   20 

   18 
   17 




   20 

















   20 
   23 

   13 

   14 

   14 

   21 





















   22 


   21 
   22 


    8 







    1 




    1 
    1 




























    4 












    4 
    4 























































































    3 




    3 




















    3 




    2 

























    3 


    3 







    2 












    6 









    2 



    6 


    1 

    1 
    1 

    1 





























    2 



    2 







    2 











































    2 


    4 


    2 

    3 


































    2 






    2 




































    2 














    1 












    2 





    2 


    2 

    1 





    2 




    2 

    2 



    2 




    2 





    1 
























































    1 



    1 









    1 


















    1 









































































    3 









    3 







    3 



    2 
    1 




    3 

    1 
    2 




    2 




    2 

    1 




    1 





    1 
    1 













    3 




    3 






    1 
    1 





    6 












    3 


    6 


    4 
    4 



    3 







    2 
    5 


































    4 
    4 


    4 




















































































































    1 










    1 


    1 


















    1 
    1 

    1 








    1 









    1 
    1 















    1 
    1 
    1 







    1 
    1 



    1 

































































































































































    2 
    2 























    1 












    2 









    2 



    2 



























































    2 





    2 














































   10 












    7 
    5 













   10 







    8 
    2 







    2 























    4 
    4 
    4 













    4 
    4 









    4 












    4 








    2 




    4 


    3 

    4 






























    4 




    4 
    4 

























    4 









    4 






























   12 








    9 






   11 
   10 






    4 




    3 
    3 

























    1 
    1 























    1 


































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <trace/events/block.h>
#include <linux/fscrypt.h>
#include <linux/fsverity.h>
#include <linux/sched/isolation.h>

#include "internal.h"

/* Static helpers that are referenced before their definitions. */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
                          enum rw_hint hint, struct writeback_control *wbc);

/*
 * Convert a list node embedded in a buffer_head as ->b_assoc_buffers back
 * into a pointer to the containing buffer_head.
 */
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

/*
 * Record an access to @bh: fire the block_touch_buffer tracepoint and mark
 * the folio referenced by bh->b_folio as accessed.
 */
inline void touch_buffer(struct buffer_head *bh)
{
        trace_block_touch_buffer(bh);
        folio_mark_accessed(bh->b_folio);
}
EXPORT_SYMBOL(touch_buffer);

/*
 * Acquire the BH_Lock bit in @bh's b_state via wait_on_bit_lock_io(),
 * waiting in TASK_UNINTERRUPTIBLE state.
 */
void __lock_buffer(struct buffer_head *bh)
{
        wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

/*
 * Release the BH_Lock bit in @bh's b_state and wake anyone waiting on that
 * bit.
 */
void unlock_buffer(struct buffer_head *bh)
{
        clear_bit_unlock(BH_Lock, &bh->b_state);
        /* Full barrier between clearing the bit and waking its waiters. */
        smp_mb__after_atomic();
        wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);

/*
 * Report through @dirty and @writeback whether the folio has dirty buffers
 * or buffers under writeback.  If all the buffers are unlocked and clean
 * then the folio_test_dirty information is stale.  If any of the buffers
 * are locked, it is assumed they are locked for IO, so they are reported
 * as writeback.
 *
 * The folio must be locked.  Both outputs are false when the folio has no
 * buffers attached.
 */
void buffer_check_dirty_writeback(struct folio *folio,
                                     bool *dirty, bool *writeback)
{
        struct buffer_head *first, *cur;

        *dirty = false;
        *writeback = false;

        BUG_ON(!folio_test_locked(folio));

        first = folio_buffers(folio);
        if (first == NULL)
                return;

        if (folio_test_writeback(folio))
                *writeback = true;

        /* Visit each buffer on the circular b_this_page ring exactly once. */
        cur = first;
        for (;;) {
                if (buffer_locked(cur))
                        *writeback = true;

                if (buffer_dirty(cur))
                        *dirty = true;

                cur = cur->b_this_page;
                if (cur == first)
                        break;
        }
}

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 *
 * The wait is on the BH_Lock bit of b_state, using wait_on_bit_io() in
 * TASK_UNINTERRUPTIBLE state.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
        wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);

/*
 * Log a rate-limited KERN_ERR message for failed I/O on @bh, naming the
 * device and the logical block.  @msg is appended verbatim to the message.
 * Nothing is printed when the buffer has BH_Quiet set.
 */
static void buffer_io_error(struct buffer_head *bh, char *msg)
{
        if (test_bit(BH_Quiet, &bh->b_state))
                return;

        printk_ratelimited(KERN_ERR
                "Buffer I/O error on dev %pg, logical block %llu%s\n",
                bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
}

/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.  It records the outcome of the read in the buffer's
 * up-to-date flag and then unlocks the buffer.
 *
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
        if (!uptodate) {
                /* This happens, due to failed read-ahead attempts. */
                clear_buffer_uptodate(bh);
        } else {
                set_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
}

/*
 * Default synchronous end-of-IO handler for reads.  Marks the buffer
 * up-to-date (or clears that flag when @uptodate is zero), unlocks it,
 * and then drops a buffer reference with put_bh().
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
        __end_buffer_read_notouch(bh, uptodate);
        put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);

/*
 * Synchronous end-of-IO handler for writes.  On success the buffer is
 * marked up-to-date.  On failure the error is logged, recorded with
 * mark_buffer_write_io_error(), and the up-to-date flag is cleared.
 * In both cases the buffer is then unlocked and a reference is dropped.
 */
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
        if (!uptodate) {
                buffer_io_error(bh, ", lost sync page write");
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
        } else {
                set_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
        put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);

/*
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers.  To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * i_private_lock.
 *
 * Hack idea: for the blockdev mapping, i_private_lock contention
 * may be quite high.  This code could TryLock the page, and if that
 * succeeds, there is no need to take i_private_lock.
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
        struct address_space *bd_mapping = bdev->bd_mapping;
        const int blkbits = bd_mapping->host->i_blkbits;
        struct buffer_head *ret = NULL;
        pgoff_t index;
        struct buffer_head *bh;
        struct buffer_head *head;
        struct folio *folio;
        int all_mapped = 1;
        static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);

        /* Byte offset of @block in the device, converted to a page index. */
        index = ((loff_t)block << blkbits) / PAGE_SIZE;
        folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
        if (IS_ERR(folio))
                goto out;

        spin_lock(&bd_mapping->i_private_lock);
        head = folio_buffers(folio);
        if (!head)
                goto out_unlock;
        /* Scan the folio's buffer ring for a mapped buffer at @block. */
        bh = head;
        do {
                if (!buffer_mapped(bh))
                        all_mapped = 0;
                else if (bh->b_blocknr == block) {
                        /* Found it: return it with an extra reference. */
                        ret = bh;
                        get_bh(bh);
                        goto out_unlock;
                }
                bh = bh->b_this_page;
        } while (bh != head);

        /* we might be here because some of the buffers on this page are
         * not mapped.  This is due to various races between
         * file io on the block device and getblk.  It gets dealt with
         * elsewhere, don't buffer_error if we had some unmapped buffers
         */
        ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
        /*
         * The loop ended with bh == head, so the fields printed below are
         * those of the folio's first buffer.
         */
        if (all_mapped && __ratelimit(&last_warned)) {
                printk("__find_get_block_slow() failed. block=%llu, "
                       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
                       "device %pg blocksize: %d\n",
                       (unsigned long long)block,
                       (unsigned long long)bh->b_blocknr,
                       bh->b_state, bh->b_size, bdev,
                       1 << blkbits);
        }
out_unlock:
        spin_unlock(&bd_mapping->i_private_lock);
        /* Drop the folio reference taken by __filemap_get_folio(). */
        folio_put(folio);
out:
        return ret;
}

/*
 * Per-buffer completion for an async folio read.  Records the outcome in
 * @bh's up-to-date flag, clears its async-read flag and unlocks it.  If no
 * buffer of the folio still has the async-read flag set, the folio read is
 * ended with the combined up-to-date state of all its buffers.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct folio *folio;
        int folio_uptodate = 1;

        BUG_ON(!buffer_async_read(bh));

        folio = bh->b_folio;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                clear_buffer_uptodate(bh);
                buffer_io_error(bh, ", async page read");
                folio_set_error(folio);
        }

        /*
         * Be _very_ careful from here on. Bad things can happen if
         * two buffer heads end IO at almost the same time and both
         * decide that the page is now completely done.
         */
        first = folio_buffers(folio);
        /* Completions are serialised on the first buffer's b_uptodate_lock. */
        spin_lock_irqsave(&first->b_uptodate_lock, flags);
        clear_buffer_async_read(bh);
        unlock_buffer(bh);
        /* Walk the whole ring, starting with @bh itself. */
        tmp = bh;
        do {
                if (!buffer_uptodate(tmp))
                        folio_uptodate = 0;
                if (buffer_async_read(tmp)) {
                        /* Another buffer's read is still outstanding. */
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        } while (tmp != bh);
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);

        folio_end_read(folio, folio_uptodate);
        return;

still_busy:
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        return;
}

/*
 * Context for deferring post-read processing of a buffer (decrypt_bh() and
 * verify_bh()) to a workqueue.
 */
struct postprocess_bh_ctx {
        /* Work item; the handlers recover the context from it. */
        struct work_struct work;
        /* Buffer whose async read is being completed. */
        struct buffer_head *bh;
};

/*
 * Workqueue handler: run fs-verity verification over the data of the
 * buffer recorded in the postprocess context, complete the async read
 * with the verification result, and free the context.
 */
static void verify_bh(struct work_struct *work)
{
        struct postprocess_bh_ctx *pp;
        struct buffer_head *bh;
        bool ok;

        pp = container_of(work, struct postprocess_bh_ctx, work);
        bh = pp->bh;

        ok = fsverity_verify_blocks(bh->b_folio, bh->b_size, bh_offset(bh));
        end_buffer_async_read(bh, ok);
        kfree(pp);
}

static bool need_fsverity(struct buffer_head *bh)
{
        struct folio *folio = bh->b_folio;
        struct inode *inode = folio->mapping->host;

        return fsverity_active(inode) &&
                /* needed by ext4 */
                folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
}

/*
 * Workqueue handler: decrypt the data of the buffer recorded in the
 * postprocess context.  If decryption succeeded and the buffer also needs
 * fs-verity verification, the same context is requeued for verify_bh();
 * otherwise the async read is completed and the context is freed here.
 */
static void decrypt_bh(struct work_struct *work)
{
        struct postprocess_bh_ctx *pp;
        struct buffer_head *bh;
        int err;

        pp = container_of(work, struct postprocess_bh_ctx, work);
        bh = pp->bh;

        err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
                                               bh_offset(bh));
        if (err != 0 || !need_fsverity(bh)) {
                /* No verity step follows: finish the read right here. */
                end_buffer_async_read(bh, err == 0);
                kfree(pp);
                return;
        }

        /*
         * We use different work queues for decryption and for verity
         * because verity may require reading metadata pages that need
         * decryption, and we shouldn't recurse to the same workqueue.
         */
        INIT_WORK(&pp->work, verify_bh);
        fsverity_enqueue_verify_work(&pp->work);
}

/*
 * I/O completion handler for block_read_full_folio() - pages
 * which come unlocked at the end of I/O.
 *
 * When the read succeeded and the data needs fscrypt decryption and/or
 * fs-verity verification, that work is handed to a workqueue and the read
 * is completed from there.  If the context for that work cannot be
 * allocated, the read is completed as failed.
 */
static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{
        struct inode *inode = bh->b_folio->mapping->host;
        bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
        bool verify = need_fsverity(bh);
        struct postprocess_bh_ctx *ctx;

        /* Failed read, or nothing to post-process: complete immediately. */
        if (!uptodate || (!decrypt && !verify)) {
                end_buffer_async_read(bh, uptodate);
                return;
        }

        /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
        ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
        if (!ctx) {
                end_buffer_async_read(bh, 0);
                return;
        }

        ctx->bh = bh;
        if (decrypt) {
                INIT_WORK(&ctx->work, decrypt_bh);
                fscrypt_enqueue_decrypt_work(&ctx->work);
        } else {
                INIT_WORK(&ctx->work, verify_bh);
                fsverity_enqueue_verify_work(&ctx->work);
        }
}

/*
 * Completion handler for block_write_full_folio() - folios which are unlocked
 * during I/O, and which have the writeback flag cleared upon I/O completion.
 */
static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct folio *folio;

        BUG_ON(!buffer_async_write(bh));

        folio = bh->b_folio;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                /* Log and record the write failure on buffer and folio. */
                buffer_io_error(bh, ", lost async page write");
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
                folio_set_error(folio);
        }

        first = folio_buffers(folio);
        /* Completions are serialised on the first buffer's b_uptodate_lock. */
        spin_lock_irqsave(&first->b_uptodate_lock, flags);

        clear_buffer_async_write(bh);
        unlock_buffer(bh);
        /* Check every other buffer on the ring for an outstanding write. */
        tmp = bh->b_this_page;
        while (tmp != bh) {
                if (buffer_async_write(tmp)) {
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        }
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        /* No buffer is still marked async-write: end folio writeback. */
        folio_end_writeback(folio);
        return;

still_busy:
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        return;
}

/*
 * If a page's buffers are under async readin (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O reads against
 * any of the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
        /* Route completion to the async-read handler and flag the buffer. */
        bh->b_end_io = end_buffer_async_read_io;
        set_buffer_async_read(bh);
}

/*
 * Install @handler as @bh's I/O completion callback and set the buffer's
 * async-write flag.
 */
static void mark_buffer_async_write_endio(struct buffer_head *bh,
                                          bh_end_io_t *handler)
{
        bh->b_end_io = handler;
        set_buffer_async_write(bh);
}

/*
 * Mark @bh for async write with end_buffer_async_write() as its completion
 * handler.
 */
void mark_buffer_async_write(struct buffer_head *bh)
{
        mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->i_private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for i_private_list is via the i_private_lock in the address_space
 * which backs the buffers.  Which is different from the address_space 
 * against which the buffers are listed.  So for a particular address_space,
 * mapping->i_private_lock does *not* protect mapping->i_private_list!  In fact,
 * mapping->i_private_list will always be protected by the backing blockdev's
 * ->i_private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->i_private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->i_private_list via these
 * utility functions are free to use i_private_lock and i_private_list for
 * whatever they want.  The only requirement is that list_empty(i_private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's i_private_lock must be held
 *
 * Unlinks @bh from the ->b_assoc_buffers list it is on and clears
 * ->b_assoc_map.  Warns if ->b_assoc_map was not set on entry.
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
        list_del_init(&bh->b_assoc_buffers);
        WARN_ON(!bh->b_assoc_map);
        bh->b_assoc_map = NULL;
}

/*
 * Return nonzero when @inode has buffers queued on its
 * i_data.i_private_list, and zero when that list is empty.
 */
int inode_has_buffers(struct inode *inode)
{
        struct list_head *assoc = &inode->i_data.i_private_list;

        return !list_empty(assoc);
}

/*
 * Wait for already-submitted I/O on a list of buffers, in support of
 * O_SYNC io.  Nothing new is queued for write here: only buffers that are
 * currently locked are waited upon, so dirty buffers which have not yet
 * been queued for write are not flushed by this function.
 *
 * The intended usage for O_SYNC writes is to queue each buffer with
 * write_dirty_buffer() as it is dirtied, and then come here to wait for
 * completion.
 *
 * @lock protects @list.  It is dropped around every wait, so after each
 * buffer waited upon the scan starts over from the tail of the list.
 *
 * Returns 0, or -EIO if any buffer that was waited upon is not uptodate
 * afterwards.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct list_head *pos;
        int ret = 0;

        spin_lock(lock);
        for (;;) {
                struct buffer_head *busy = NULL;

                /* Scan from the tail for the first locked buffer */
                list_for_each_prev(pos, list) {
                        struct buffer_head *bh = BH_ENTRY(pos);

                        if (buffer_locked(bh)) {
                                busy = bh;
                                break;
                        }
                }
                if (!busy)
                        break;

                /* Hold a reference across the window where @lock is dropped */
                get_bh(busy);
                spin_unlock(lock);
                wait_on_buffer(busy);
                if (!buffer_uptodate(busy))
                        ret = -EIO;
                brelse(busy);
                spin_lock(lock);
        }
        spin_unlock(lock);
        return ret;
}

/**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping whose associated buffers should be written
 *
 * Convenience helper for fsync(): starts I/O against the buffers queued on
 * mapping->i_private_list and waits for that I/O.  @mapping is the file or
 * directory which needs those buffers written for its fsync() to succeed.
 *
 * Return: 0 if @mapping has no buffer mapping recorded in ->i_private_data
 * or its list is empty; otherwise the result of fsync_buffers_list().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
        struct address_space *buffer_mapping = mapping->i_private_data;

        /* No buffer mapping was ever recorded for this mapping */
        if (!buffer_mapping)
                return 0;
        if (list_empty(&mapping->i_private_list))
                return 0;

        /* The list is walked under the *buffer* mapping's i_private_lock */
        return fsync_buffers_list(&buffer_mapping->i_private_lock,
                                  &mapping->i_private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);

/**
 * generic_buffers_fsync_noflush - generic buffer fsync implementation
 * for simple filesystems with no inode lock
 *
 * @file:        file to synchronize
 * @start:        start offset in bytes
 * @end:        end offset in bytes (inclusive)
 * @datasync:        only synchronize essential metadata if true
 *
 * Generic fsync method for simple filesystems which track all non-inode
 * metadata in the buffers list hanging off the address_space structure.
 * The data range is written and waited upon, the associated buffers are
 * synced, and the inode itself is written if it is dirty.
 *
 * Return: a failure of the initial data writeback is returned immediately.
 * Otherwise the first non-zero result among sync_mapping_buffers(),
 * sync_inode_metadata() and file_check_and_advance_wb_err(), or 0.
 */
int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
                                  bool datasync)
{
        struct inode *inode = file->f_mapping->host;
        int ret;
        int err;

        ret = file_write_and_wait_range(file, start, end);
        if (ret)
                return ret;

        ret = sync_mapping_buffers(inode->i_mapping);

        /*
         * Write the inode only when it carries some I_DIRTY_ALL state and,
         * for a datasync, only when I_DIRTY_DATASYNC is among it.
         */
        if ((inode->i_state & I_DIRTY_ALL) &&
            (!datasync || (inode->i_state & I_DIRTY_DATASYNC))) {
                err = sync_inode_metadata(inode, 1);
                if (!ret)
                        ret = err;
        }

        /* check and advance again to catch errors after syncing out buffers */
        err = file_check_and_advance_wb_err(file);
        if (!ret)
                ret = err;
        return ret;
}
EXPORT_SYMBOL(generic_buffers_fsync_noflush);

/**
 * generic_buffers_fsync - generic buffer fsync implementation
 * for simple filesystems with no inode lock
 *
 * @file:        file to synchronize
 * @start:        start offset in bytes
 * @end:        end offset in bytes (inclusive)
 * @datasync:        only synchronize essential metadata if true
 *
 * Same as generic_buffers_fsync_noflush(), for simple filesystems which
 * track all non-inode metadata in the buffers list hanging off the
 * address_space structure, but additionally issues a flush to the
 * superblock's block device once the sync has succeeded.
 *
 * Return: the error from generic_buffers_fsync_noflush() if it failed,
 * otherwise the result of blkdev_issue_flush().
 */
int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
                          bool datasync)
{
        struct inode *inode = file->f_mapping->host;
        int err = generic_buffers_fsync_noflush(file, start, end, datasync);

        /* No flush is issued when the sync itself already failed */
        if (err)
                return err;
        return blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_buffers_fsync);

/*
 * Called after block `bblock' has recently been written and is known to
 * have been a buffer_boundary() buffer.  In that case the block at
 * `bblock + 1' is probably a dirty indirect block, so look it up and, if
 * it is dirty, schedule it for IO.  This lets indirect blocks merge nicely
 * with their data.
 *
 * Nothing happens if block `bblock + 1' is not found by __find_get_block().
 */
void write_boundary_block(struct block_device *bdev,
                        sector_t bblock, unsigned blocksize)
{
        struct buffer_head *next;

        next = __find_get_block(bdev, bblock + 1, blocksize);
        if (!next)
                return;

        if (buffer_dirty(next))
                write_dirty_buffer(next, 0);
        /* Drop the reference obtained by the lookup */
        put_bh(next);
}

/*
 * Dirty @bh and, unless it is already associated with a mapping, queue it
 * at the tail of @inode's mapping->i_private_list.
 *
 * The first call for a given inode mapping records the buffer's backing
 * mapping in ->i_private_data; every later call BUG()s if the buffer
 * belongs to a different backing mapping.  The list manipulation is done
 * under the backing mapping's i_private_lock.
 */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
        struct address_space *mapping = inode->i_mapping;
        struct address_space *buffer_mapping = bh->b_folio->mapping;

        mark_buffer_dirty(bh);

        if (mapping->i_private_data) {
                BUG_ON(mapping->i_private_data != buffer_mapping);
        } else {
                mapping->i_private_data = buffer_mapping;
        }

        /* Lockless check: an already-associated buffer is not re-queued */
        if (bh->b_assoc_map)
                return;

        spin_lock(&buffer_mapping->i_private_lock);
        list_move_tail(&bh->b_assoc_buffers, &mapping->i_private_list);
        bh->b_assoc_map = mapping;
        spin_unlock(&buffer_mapping->i_private_lock);
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);

/**
 * block_dirty_folio - Mark a folio as dirty.
 * @mapping: The address space containing this folio.
 * @folio: The folio to mark dirty.
 *
 * A ->dirty_folio implementation for filesystems which use buffer_heads
 * (some of them need to do a little work of their own before calling it).
 * Filesystems which do not use buffer_heads should call
 * filemap_dirty_folio() instead.
 *
 * If the folio has buffers, all of them are set dirty here so that the
 * dirty state of the folio and of its buffers stays coherent.  Buffers
 * added to a dirty folio are created dirty.
 *
 * Ordering matters: the buffers are dirtied first, the folio second.
 * That leaves a small window in which writeback may see a clean folio
 * with dirty buffers, which is fine.  The opposite order would let
 * writeback clear the folio dirty flag, find only clean buffers, and
 * leave us with dirty buffers on a clean folio that sits on the dirty
 * folio list.
 *
 * i_private_lock is held while walking the folio's buffer list to lock
 * against try_to_free_buffers(); it also prevents clean buffers from
 * being added to the folio after it was set dirty.
 *
 * Context: May only be called from process context.  Does not sleep.
 * Caller must ensure that @folio cannot be truncated during this call,
 * typically by holding the folio lock or having a page in the folio
 * mapped and holding the page table lock.
 *
 * Return: True if the folio was dirtied; false if it was already dirtied.
 */
bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
{
        struct buffer_head *first, *cur;
        bool newly_dirty;

        spin_lock(&mapping->i_private_lock);
        first = folio_buffers(folio);
        if (first) {
                /* The buffers form a ring through b_this_page */
                set_buffer_dirty(first);
                for (cur = first->b_this_page; cur != first;
                     cur = cur->b_this_page)
                        set_buffer_dirty(cur);
        }

        /*
         * Lock out page's memcg migration to keep PageDirty
         * synchronized with per-memcg dirty page counters.
         */
        folio_memcg_lock(folio);
        newly_dirty = !folio_test_set_dirty(folio);
        spin_unlock(&mapping->i_private_lock);

        if (!newly_dirty) {
                folio_memcg_unlock(folio);
                return newly_dirty;
        }

        __folio_mark_dirty(folio, mapping, 1);
        folio_memcg_unlock(folio);

        /* The inode is marked dirty only after the memcg lock is released */
        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        return newly_dirty;
}
EXPORT_SYMBOL(block_dirty_folio);

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 *
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 *
 * @lock: spinlock protecting @list; it is dropped and retaken around
 *        every write submission and every wait.
 * @list: the buffers to sync, linked through b_assoc_buffers.
 *
 * Returns -EIO if any buffer waited upon in the second stage was not
 * uptodate afterwards; otherwise the result of osync_buffers_list().
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct list_head tmp;
        struct address_space *mapping;
        int err = 0, err2;
        struct blk_plug plug;

        INIT_LIST_HEAD(&tmp);
        blk_start_plug(&plug);

        /*
         * Stage 1: empty @list.  Buffers that are dirty or locked are moved
         * to the private list 'tmp' (keeping their b_assoc_map); buffers
         * that are neither are simply dropped from the list.
         */
        spin_lock(lock);
        while (!list_empty(list)) {
                bh = BH_ENTRY(list->next);
                /* Remember the owner: __remove_assoc_queue() clears it */
                mapping = bh->b_assoc_map;
                __remove_assoc_queue(bh);
                /* Avoid race with mark_buffer_dirty_inode() which does
                 * a lockless check and we rely on seeing the dirty bit */
                smp_mb();
                if (buffer_dirty(bh) || buffer_locked(bh)) {
                        list_add(&bh->b_assoc_buffers, &tmp);
                        bh->b_assoc_map = mapping;
                        if (buffer_dirty(bh)) {
                                /* Pin bh while @lock is dropped for the write */
                                get_bh(bh);
                                spin_unlock(lock);
                                /*
                                 * Ensure any pending I/O completes so that
                                 * write_dirty_buffer() actually writes the
                                 * current contents - it is a noop if I/O is
                                 * still in flight on potentially older
                                 * contents.
                                 */
                                write_dirty_buffer(bh, REQ_SYNC);

                                /*
                                 * Kick off IO for the previous mapping. Note
                                 * that we will not run the very last mapping,
                                 * wait_on_buffer() will do that for us
                                 * through sync_buffer().
                                 *
                                 * NOTE(review): the statement below only
                                 * drops the reference taken above; confirm
                                 * whether this comment is still accurate.
                                 */
                                brelse(bh);
                                spin_lock(lock);
                        }
                }
        }

        /* The plug is finished with @lock released */
        spin_unlock(lock);
        blk_finish_plug(&plug);
        spin_lock(lock);

        /*
         * Stage 2: drain 'tmp' from its tail.  Entries were added at the
         * head, so this visits them in the order they were queued.
         */
        while (!list_empty(&tmp)) {
                bh = BH_ENTRY(tmp.prev);
                get_bh(bh);
                mapping = bh->b_assoc_map;
                __remove_assoc_queue(bh);
                /* Avoid race with mark_buffer_dirty_inode() which does
                 * a lockless check and we rely on seeing the dirty bit */
                smp_mb();
                if (buffer_dirty(bh)) {
                        /* Dirty again: put it back on its mapping's list */
                        list_add(&bh->b_assoc_buffers,
                                 &mapping->i_private_list);
                        bh->b_assoc_map = mapping;
                }
                spin_unlock(lock);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh))
                        err = -EIO;
                brelse(bh);
                spin_lock(lock);
        }

        spin_unlock(lock);
        /* Final pass: wait on anything refiled onto @list and still locked */
        err2 = osync_buffers_list(lock, list);
        /* An error from stage 2 takes precedence over one from the osync */
        if (err)
                return err;
        else
                return err2;
}

/*
 * Drop every buffer, dirty or not, from the given inode's associated
 * buffers list.  We are probably unmounting the fs, but that doesn't mean
 * a sync() has already been done; the buffers are just unlinked from the
 * inode list, nothing is written.
 *
 * NOTE: the lock taken is the i_private_lock of the mapping recorded in
 * the inode mapping's ->i_private_data, i.e. the inode's blockdev's
 * mapping.  That assumes all the buffers are against the blockdev, which
 * is not true for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
        struct address_space *mapping, *buffer_mapping;
        struct list_head *list;

        if (!inode_has_buffers(inode))
                return;

        mapping = &inode->i_data;
        list = &mapping->i_private_list;
        buffer_mapping = mapping->i_private_data;

        spin_lock(&buffer_mapping->i_private_lock);
        while (!list_empty(list))
                __remove_assoc_queue(BH_ENTRY(list->next));
        spin_unlock(&buffer_mapping->i_private_lock);
}
EXPORT_SYMBOL(invalidate_inode_buffers);

/*
 * Strip clean buffers from the front of the inode's buffer list.  This is
 * called when we're trying to free the inode itself, since those buffers
 * can pin it.
 *
 * The walk stops at the first dirty buffer, which stays on the list
 * together with everything queued behind it.
 *
 * Returns true (1) if all buffers were removed, or if there were none to
 * begin with; 0 if a dirty buffer was encountered.
 */
int remove_inode_buffers(struct inode *inode)
{
        struct address_space *mapping, *buffer_mapping;
        struct list_head *list;
        int all_removed = 1;

        if (!inode_has_buffers(inode))
                return all_removed;

        mapping = &inode->i_data;
        list = &mapping->i_private_list;
        buffer_mapping = mapping->i_private_data;

        spin_lock(&buffer_mapping->i_private_lock);
        while (all_removed && !list_empty(list)) {
                struct buffer_head *bh = BH_ENTRY(list->next);

                if (buffer_dirty(bh))
                        all_removed = 0;
                else
                        __remove_assoc_queue(bh);
        }
        spin_unlock(&buffer_mapping->i_private_lock);

        return all_removed;
}

/*
 * Allocate the buffer_heads needed to cover @folio with buffers of @size
 * bytes each, using @gfp for every allocation.  The buffers are chained
 * through bh->b_this_page (NULL-terminated, lowest folio offset first) and
 * each one is linked to its part of the folio with folio_set_bh().
 *
 * The folio's memcg is made the active memcg for the duration of the
 * allocations and the previous one is restored before returning.
 *
 * Returns the first buffer of the chain.  Returns NULL if a buffer_head
 * could not be allocated, in which case everything allocated so far has
 * been freed again, or if @size is larger than the folio.
 */
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
                                        gfp_t gfp)
{
        struct buffer_head *chain = NULL;
        struct mem_cgroup *memcg, *old_memcg;
        long offset;

        /* The folio lock pins the memcg */
        memcg = folio_memcg(folio);
        old_memcg = set_active_memcg(memcg);

        /* Work backwards from the end of the folio, prepending each buffer */
        offset = folio_size(folio);
        while ((offset -= size) >= 0) {
                struct buffer_head *bh = alloc_buffer_head(gfp);

                if (!bh) {
                        /* In case anything failed, free everything we got */
                        while (chain) {
                                bh = chain;
                                chain = chain->b_this_page;
                                free_buffer_head(bh);
                        }
                        break;
                }

                bh->b_blocknr = -1;
                bh->b_size = size;
                bh->b_this_page = chain;
                chain = bh;

                /* Link the buffer to its folio */
                folio_set_bh(bh, folio, offset);
        }

        set_active_memcg(old_memcg);
        return chain;
}
EXPORT_SYMBOL_GPL(folio_alloc_buffers);

/*
 * Page-based wrapper around folio_alloc_buffers(): allocates buffers of
 * @size bytes for the folio containing @page.  Allocations use
 * GFP_NOFS | __GFP_ACCOUNT, with __GFP_NOFAIL added when @retry is true.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
                                       bool retry)
{
        gfp_t gfp = retry ? (GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL)
                          : (GFP_NOFS | __GFP_ACCOUNT);

        return folio_alloc_buffers(page_folio(page), size, gfp);
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);

/*
 * Turn the NULL-terminated b_this_page chain starting at @head into a ring
 * by pointing its last buffer back at @head, then attach @head to @folio
 * as the folio's private data.  @head must not be NULL.
 */
static inline void link_dev_buffers(struct folio *folio,
                struct buffer_head *head)
{
        struct buffer_head *tail = head;

        /* Walk to the last buffer of the chain */
        while (tail->b_this_page)
                tail = tail->b_this_page;

        tail->b_this_page = head;
        folio_attach_private(folio, head);
}

/*
 * Compute the block limit of @bdev for blocks of @size bytes: the device
 * size in bytes shifted right by blksize_bits(@size).  If
 * bdev_nr_bytes() reports 0, an all-ones sector_t is returned instead.
 */
static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
{
        loff_t bytes = bdev_nr_bytes(bdev);

        if (!bytes)
                return ~((sector_t)0);

        return bytes >> blksize_bits(size);
}

/*
 * Initialise the state of a blockdev folio's buffers.
 *
 * Each buffer on @folio that is not yet mapped is pointed at @bdev, given
 * its block number (consecutive numbers, starting at the folio position
 * divided by @size), marked uptodate if the folio is uptodate, and marked
 * mapped if its block number is below the device's block limit.  Buffers
 * that are already mapped are left untouched.
 *
 * Returns the block limit from blkdev_max_block().
 */
static sector_t folio_init_buffers(struct folio *folio,
                struct block_device *bdev, unsigned size)
{
        struct buffer_head *first = folio_buffers(folio);
        struct buffer_head *cur = first;
        bool folio_uptodate = folio_test_uptodate(folio);
        sector_t blocknr = div_u64(folio_pos(folio), size);
        sector_t limit = blkdev_max_block(bdev, size);

        do {
                if (!buffer_mapped(cur)) {
                        cur->b_bdev = bdev;
                        cur->b_blocknr = blocknr;
                        cur->b_private = NULL;
                        cur->b_end_io = NULL;
                        if (folio_uptodate)
                                set_buffer_uptodate(cur);
                        /* Blocks at or beyond the limit stay unmapped */
                        if (blocknr < limit)
                                set_buffer_mapped(cur);
                }
                cur = cur->b_this_page;
                blocknr++;
        } while (cur != first);

        /*
         * Caller needs to validate requested block against end of device.
         */
        return limit;
}

/*
 * Create the page-cache folio that contains the requested block, and make
 * sure it carries buffers of the requested size.
 *
 * This is used purely for blockdev mappings.
 *
 * Returns false if we have a failure which cannot be cured by retrying
 * without sleeping.  Returns true if we succeeded, or the caller should
 * retry.  Concretely the result is "@block < end_block", where end_block
 * is the device block limit on success, stays 0 when buffer_heads could
 * not be allocated, and is all-ones when stale buffers of another size
 * could not be freed.
 */
static bool grow_dev_folio(struct block_device *bdev, sector_t block,
                pgoff_t index, unsigned size, gfp_t gfp)
{
        struct address_space *mapping = bdev->bd_mapping;
        struct buffer_head *bh;
        struct folio *folio;
        sector_t end_block = 0;

        folio = __filemap_get_folio(mapping, index,
                        FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
        if (IS_ERR(folio))
                return false;

        bh = folio_buffers(folio);
        if (bh && bh->b_size == size) {
                /* Buffers of the right size are already attached */
                end_block = folio_init_buffers(folio, bdev, size);
        } else if (bh && !try_to_free_buffers(folio)) {
                /*
                 * Retrying may succeed; for example the folio may finish
                 * writeback, or buffers may be cleaned.  This should not
                 * happen very often; maybe we have old buffers attached to
                 * this blockdev's page cache and we're trying to change
                 * the block size?
                 */
                end_block = ~0ULL;
        } else {
                /* No buffers, or the wrongly sized ones were just freed */
                bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
                if (bh) {
                        /*
                         * Link the folio to the buffers and initialise
                         * them.  Take the lock to be atomic wrt
                         * __find_get_block(), which does not run under
                         * the folio lock.
                         */
                        spin_lock(&mapping->i_private_lock);
                        link_dev_buffers(folio, bh);
                        end_block = folio_init_buffers(folio, bdev, size);
                        spin_unlock(&mapping->i_private_lock);
                }
        }

        folio_unlock(folio);
        folio_put(folio);
        return block < end_block;
}

/*
 * Create buffers for the folio holding the given block of a block device.
 *
 * The byte position of the block (@block * @size) must be computable
 * without overflow and must not exceed MAX_LFS_FILESIZE; otherwise an
 * error is logged and false is returned to signal a permanent error.
 * In all other cases the result of grow_dev_folio() is returned.
 */
static bool grow_buffers(struct block_device *bdev, sector_t block,
                unsigned size, gfp_t gfp)
{
        loff_t pos;

        /*
         * Only a block which lies inside our maximum possible pagecache
         * index can be given a folio.
         */
        if (!check_mul_overflow(block, (sector_t)size, &pos) &&
            pos <= MAX_LFS_FILESIZE) {
                /* Create a folio with the proper size buffers */
                return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
        }

        printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
                __func__, (unsigned long long)block,
                bdev);
        return false;
}

/*
 * Slow path of bdev_getblk(): look the block up in the buffer cache and,
 * for as long as it is not there, create buffers for it and look again.
 *
 * @size must be a multiple of the device's logical block size and lie
 * within [512, PAGE_SIZE]; otherwise the problem is logged together with
 * a stack dump and NULL is returned.
 *
 * Returns the buffer_head found by __find_get_block(), or NULL when @size
 * is invalid or grow_buffers() reports a permanent error.
 */
static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block,
             unsigned size, gfp_t gfp)
{
        /* Size must be multiple of hard sectorsize */
        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
                        (size < 512 || size > PAGE_SIZE))) {
                /*
                 * Both values are unsigned, so print them with %u: a bogus
                 * huge size must not show up as a negative number in the
                 * very message that reports it.
                 */
                printk(KERN_ERR "getblk(): invalid block size %u requested\n",
                                        size);
                printk(KERN_ERR "logical block size: %u\n",
                                        bdev_logical_block_size(bdev));

                dump_stack();
                return NULL;
        }

        for (;;) {
                struct buffer_head *bh;

                bh = __find_get_block(bdev, block, size);
                if (bh)
                        return bh;

                /* Not cached yet: create the buffers, then look again */
                if (!grow_buffers(bdev, block, size, gfp))
                        return NULL;
        }
}

/*
 * The relationship between dirty buffers and dirty pages:
 *
 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
 * the page is tagged dirty in the page cache.
 *
 * At all times, the dirtiness of the buffers represents the dirtiness of
 * subsections of the page.  If the page has buffers, the page dirty bit is
 * merely a hint about the true dirty state.
 *
 * When a page is set dirty in its entirety, all its buffers are marked dirty
 * (if the page has buffers).
 *
 * When a buffer is marked dirty, its page is dirtied, but the page's other
 * buffers are not.
 *
 * Also.  When blockdev buffers are explicitly read with bread(), they
 * individually become uptodate.  But their backing page remains not
 * uptodate - even if all of its buffers are uptodate.  A subsequent
 * block_read_full_folio() against that folio will discover all the uptodate
 * buffers, will set the folio uptodate and will perform no I/O.
 */

/**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 * @bh: the buffer_head to mark dirty
 *
 * mark_buffer_dirty() sets the dirty bit against the buffer.  If the
 * buffer was not dirty before, it then sets its backing folio dirty; if
 * the folio was not dirty before either and has a mapping, the folio is
 * tagged dirty in the page cache via __folio_mark_dirty() and the
 * mapping's host inode is marked dirty with I_DIRTY_PAGES.
 *
 * A buffer that is not uptodate triggers a one-time warning.
 *
 * The previous documentation of this function states that it is atomic
 * and takes bh->b_folio->mapping->i_private_lock, the i_pages lock and
 * mapping->host->i_lock; none of these is taken directly in this
 * function, so presumably they are taken by the helpers it calls.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
        struct address_space *mapping = NULL;
        struct folio *folio;

        WARN_ON_ONCE(!buffer_uptodate(bh));

        trace_block_dirty_buffer(bh);

        /*
         * Very *carefully* optimize the it-is-already-dirty case.
         *
         * Don't let the final "is it dirty" escape to before we
         * perhaps modified the buffer.
         */
        if (buffer_dirty(bh)) {
                smp_mb();
                if (buffer_dirty(bh))
                        return;
        }

        /* The dirty bit was already set: nothing further to do */
        if (test_set_buffer_dirty(bh))
                return;

        folio = bh->b_folio;
        folio_memcg_lock(folio);
        if (!folio_test_set_dirty(folio)) {
                mapping = folio->mapping;
                if (mapping)
                        __folio_mark_dirty(folio, mapping, 0);
        }
        folio_memcg_unlock(folio);

        /* Only set when this call dirtied a folio that has a mapping */
        if (mapping)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
EXPORT_SYMBOL(mark_buffer_dirty);

/*
 * Record a write I/O error against @bh: set the buffer's write_io_error
 * flag and report -EIO to the mapping of its folio (if any), to its
 * associated mapping (if any), and to the s_wb_err of the superblock of
 * the associated mapping's host inode.
 */
void mark_buffer_write_io_error(struct buffer_head *bh)
{
        set_buffer_write_io_error(bh);

        /* FIXME: do we need to set this in both places? */
        if (bh->b_folio) {
                if (bh->b_folio->mapping)
                        mapping_set_error(bh->b_folio->mapping, -EIO);
        }

        if (!bh->b_assoc_map)
                return;

        mapping_set_error(bh->b_assoc_map, -EIO);
        errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO);
}
EXPORT_SYMBOL(mark_buffer_write_io_error);

/**
 * __brelse - Release a buffer.
 * @bh: The buffer to release.
 *
 * This variant of brelse() can be called if @bh is guaranteed to not be NULL.
 *
 * Drops one reference with put_bh().  If the reference count is already
 * zero, nothing is dropped and a warning is emitted instead.
 */
void __brelse(struct buffer_head *bh)
{
        if (!atomic_read(&bh->b_count)) {
                WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
                return;
        }

        put_bh(bh);
}
EXPORT_SYMBOL(__brelse);

/**
 * __bforget - Discard any dirty data in a buffer.
 * @bh: The buffer to forget.
 *
 * This variant of bforget() can be called if @bh is guaranteed to not
 * be NULL.
 *
 * Clears the buffer's dirty bit, takes the buffer off its associated
 * buffers list if it is on one, and finally releases the caller's
 * reference with __brelse().
 */
void __bforget(struct buffer_head *bh)
{
        clear_buffer_dirty(bh);

        if (bh->b_assoc_map) {
                struct address_space *owner = bh->b_folio->mapping;

                /* The list is protected by the backing mapping's lock */
                spin_lock(&owner->i_private_lock);
                list_del_init(&bh->b_assoc_buffers);
                bh->b_assoc_map = NULL;
                spin_unlock(&owner->i_private_lock);
        }

        __brelse(bh);
}
EXPORT_SYMBOL(__bforget);

/*
 * Bring @bh uptodate by reading it from disk, unless it turns out to be
 * uptodate already once the buffer lock is held.
 *
 * Returns @bh if it is uptodate on return.  If the read leaves it not
 * uptodate, the buffer is released with brelse() and NULL is returned.
 */
static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
        lock_buffer(bh);
        if (buffer_uptodate(bh)) {
                /* Became uptodate while we were waiting for the lock */
                unlock_buffer(bh);
                return bh;
        }

        /*
         * Extra reference for the I/O; presumably dropped by
         * end_buffer_read_sync() on completion - confirm.
         */
        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(REQ_OP_READ, bh);
        wait_on_buffer(bh);
        if (!buffer_uptodate(bh)) {
                brelse(bh);
                return NULL;
        }
        return bh;
}

/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPUs' LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

/* Number of buffer_head slots in each CPU's LRU */
#define BH_LRU_SIZE        16

/* One CPU's LRU of buffer_heads; bhs[0] holds the newest entry */
struct bh_lru {
        struct buffer_head *bhs[BH_LRU_SIZE];
};

/* The per-CPU LRU instances, statically initialised with NULL slots */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * Protection of the local CPU's LRU: with CONFIG_SMP local interrupts are
 * disabled around LRU accesses, otherwise only preemption is disabled.
 * Note that bh_lru_unlock() enables interrupts unconditionally on SMP.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()        local_irq_disable()
#define bh_lru_unlock()        local_irq_enable()
#else
#define bh_lru_lock()        preempt_disable()
#define bh_lru_unlock()        preempt_enable()
#endif

/*
 * Sanity check made before taking the bh LRU lock: BUG() if interrupts are
 * currently disabled.  Presumably this is because bh_lru_unlock() would
 * re-enable them unconditionally on SMP.
 */
static inline void check_irqs_on(void)
{
        /* Compiles to nothing when irqs_disabled is not defined as a macro */
#ifdef irqs_disabled
        BUG_ON(irqs_disabled());
#endif
}

/*
 * Put a buffer_head at the front of this cpu's LRU.
 *
 * If @bh is already in the LRU it is moved to the front and the entries
 * that were ahead of it shift down by one.  Otherwise it is inserted at
 * the front with an extra reference, every entry shifts down by one, and
 * the entry pushed off the back (if any) is released.
 *
 * Nothing is installed while the LRU cache is disabled or when the
 * current CPU is isolated.
 */
static void bh_lru_install(struct buffer_head *bh)
{
        struct buffer_head *carry = bh;
        struct bh_lru *lru;
        int slot;

        check_irqs_on();
        bh_lru_lock();

        /*
         * the refcount of buffer_head in bh_lru prevents dropping the
         * attached page(i.e., try_to_free_buffers) so it could cause
         * failing page migration.
         * Skip putting upcoming bh into bh_lru until migration is done.
         */
        if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) {
                bh_lru_unlock();
                return;
        }

        lru = this_cpu_ptr(&bh_lrus);
        for (slot = 0; slot < BH_LRU_SIZE; slot++) {
                /* Store the carried entry here and carry the old one on */
                struct buffer_head *displaced = lru->bhs[slot];

                lru->bhs[slot] = carry;
                carry = displaced;

                if (carry == bh) {
                        /* @bh was already cached; no new reference needed */
                        bh_lru_unlock();
                        return;
                }
        }

        /* @bh is new to this LRU: the LRU holds its own reference */
        get_bh(bh);
        bh_lru_unlock();
        /* Whatever fell off the back loses its LRU reference */
        brelse(carry);
}

/*
 * Search this cpu's LRU for a buffer_head matching @bdev, @block and @size.
 * On a hit the entry is moved to the head of the LRU and returned with an
 * additional reference.  Returns NULL on a miss, or when the current CPU
 * is isolated.
 */
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *ret = NULL;
        unsigned int i;

        check_irqs_on();
        bh_lru_lock();
        if (cpu_is_isolated(smp_processor_id())) {
                bh_lru_unlock();
                return NULL;
        }
        for (i = 0; i < BH_LRU_SIZE; i++) {
                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);

                if (!bh || bh->b_blocknr != block || bh->b_bdev != bdev ||
                    bh->b_size != size)
                        continue;

                if (i) {
                        /* Shift the entries ahead of the hit down by one */
                        for (; i; i--)
                                __this_cpu_write(bh_lrus.bhs[i],
                                        __this_cpu_read(bh_lrus.bhs[i - 1]));
                        __this_cpu_write(bh_lrus.bhs[0], bh);
                }
                /* Reference for the caller */
                get_bh(bh);
                ret = bh;
                break;
        }
        bh_lru_unlock();
        return ret;
}

/*
 * Find the buffer_head for the given block: try this cpu's LRU first and
 * fall back to a pagecache lookup.  A buffer found in the LRU is marked
 * as accessed; a buffer found by the pagecache lookup is installed in the
 * LRU.  Returns NULL if the buffer is not present.
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh;

        bh = lookup_bh_lru(bdev, block, size);
        if (bh) {
                touch_buffer(bh);
                return bh;
        }

        /* __find_get_block_slow will mark the page accessed */
        bh = __find_get_block_slow(bdev, block);
        if (bh)
                bh_lru_install(bh);

        return bh;
}
EXPORT_SYMBOL(__find_get_block);

/**
 * bdev_getblk - Get a buffer_head in a block device's buffer cache.
 * @bdev: The block device.
 * @block: The block number.
 * @size: The size of buffer_heads for this @bdev.
 * @gfp: The memory allocation flags to use.
 *
 * The buffer is looked up first and only created, through
 * __getblk_slow(), if it is not present yet.
 *
 * The returned buffer head has its reference count incremented, but is
 * not locked.  The caller should call brelse() when it has finished
 * with the buffer.  The buffer may not be uptodate.  If needed, the
 * caller can bring it uptodate either by reading it or overwriting it.
 *
 * Return: The buffer head, or NULL if it could not be created, for
 * example because memory could not be allocated.
 */
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
                unsigned size, gfp_t gfp)
{
        struct buffer_head *found;

        found = __find_get_block(bdev, block, size);

        /* Annotated even on a cache hit, where nothing is allocated */
        might_alloc(gfp);
        if (!found)
                found = __getblk_slow(bdev, block, size, gfp);

        return found;
}
EXPORT_SYMBOL(bdev_getblk);

/*
 * Start asynchronous read-ahead on a block.
 *
 * The buffer is obtained with GFP_NOWAIT | __GFP_MOVABLE; if that fails
 * the read-ahead is silently skipped.  The reference obtained here is
 * dropped again before returning.
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh;

        bh = bdev_getblk(bdev, block, size, GFP_NOWAIT | __GFP_MOVABLE);
        if (likely(bh)) {
                bh_readahead(bh, REQ_RAHEAD);
                brelse(bh);
        }
}
EXPORT_SYMBOL(__breadahead);

/**
 * __bread_gfp() - Read a block.
 * @bdev: The block device to read from.
 * @block: Block number in units of block size.
 * @size: The block size of this device in bytes.
 * @gfp: Not page allocation flags; see below.
 *
 * You are not expected to call this function.  You should use one of
 * sb_bread(), sb_bread_unmovable() or __bread().
 *
 * Read a specified block, and return the buffer head that refers to it.
 * If @gfp is 0, the memory will be allocated using the block device's
 * default GFP flags.  If @gfp is __GFP_MOVABLE, the memory may be
 * allocated from a movable area.  Do not pass in a complete set of
 * GFP flags.
 *
 * The returned buffer head has its refcount increased.  The caller should
 * call brelse() when it has finished with the buffer.
 *
 * Context: May sleep waiting for I/O.
 * Return: NULL if the block was unreadable.
 */
struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block,
                unsigned size, gfp_t gfp)
{
        struct buffer_head *bh;

        /*
         * Combine the caller's bits with the block device mapping's GFP
         * constraint (minus __GFP_FS).  __GFP_NOFAIL is added as well: we
         * prefer looping in the allocator rather than here, at least that
         * code knows what it's doing.
         */
        gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS) |
               __GFP_NOFAIL;

        bh = bdev_getblk(bdev, block, size, gfp);

        /* Only go to the read path when the buffer is not already uptodate. */
        if (likely(bh) && !buffer_uptodate(bh))
                return __bread_slow(bh);

        return bh;
}
EXPORT_SYMBOL(__bread_gfp);

/*
 * Release every buffer head held in the given LRU and empty all its slots.
 */
static void __invalidate_bh_lrus(struct bh_lru *b)
{
        int slot = 0;

        while (slot < BH_LRU_SIZE) {
                brelse(b->bhs[slot]);
                b->bhs[slot] = NULL;
                slot++;
        }
}
/*
 * invalidate_bh_lrus() is called rarely - but not only at unmount.
 * This doesn't race because it runs on each CPU either in IRQ context
 * or with preemption disabled.
 */
static void invalidate_bh_lru(void *arg)
{
        struct bh_lru *lru;

        /* @arg is unused; the get/put pair brackets access to this CPU's LRU. */
        lru = &get_cpu_var(bh_lrus);
        __invalidate_bh_lrus(lru);
        put_cpu_var(bh_lrus);
}

/*
 * Report whether the buffer-head LRU of @cpu holds at least one buffer.
 * @dummy is unused.
 */
bool has_bh_in_lru(int cpu, void *dummy)
{
        struct bh_lru *lru = per_cpu_ptr(&bh_lrus, cpu);
        int slot = 0;

        /* Advance past empty slots; stop at the first occupied one. */
        while (slot < BH_LRU_SIZE && !lru->bhs[slot])
                slot++;

        return slot < BH_LRU_SIZE;
}

/*
 * Empty the buffer-head LRUs: has_bh_in_lru() is the per-CPU predicate and
 * invalidate_bh_lru() the per-CPU action handed to on_each_cpu_cond().
 */
void invalidate_bh_lrus(void)
{
        /* NOTE(review): the final argument (1) is presumably "wait" - confirm */
        on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);

/*
 * It's called from workqueue context so we need a bh_lru_lock to close
 * the race with preemption/irq.
 */
void invalidate_bh_lrus_cpu(void)
{
        struct bh_lru *b;

        bh_lru_lock();
        b = this_cpu_ptr(&bh_lrus);
        __invalidate_bh_lrus(b);
        bh_lru_unlock();
}

/*
 * Point @bh at @folio and set up its data pointer for the given byte
 * @offset into the folio.  @offset must lie inside the folio.
 */
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
                  unsigned long offset)
{
        bh->b_folio = folio;
        BUG_ON(offset >= folio_size(folio));

        if (!folio_test_highmem(folio)) {
                bh->b_data = folio_address(folio) + offset;
                return;
        }

        /*
         * Highmem folio: store the bare offset as the pointer value.
         * This catches illegal uses and preserves the offset.
         */
        bh->b_data = (char *)(0 + offset);
}
EXPORT_SYMBOL(folio_set_bh);

/*
 * Called when truncating a buffer on a page completely.
 */

/* Buffer state bits that are cleared when a buffer is invalidated */
#define BUFFER_FLAGS_DISCARD \
        ((1 << BH_Mapped) | (1 << BH_New) | (1 << BH_Req) | \
         (1 << BH_Delay) | (1 << BH_Unwritten))

static void discard_buffer(struct buffer_head *bh)
{
        unsigned long old_state;

        lock_buffer(bh);
        clear_buffer_dirty(bh);
        bh->b_bdev = NULL;

        /*
         * Atomically strip the BUFFER_FLAGS_DISCARD bits from b_state,
         * retrying until the compare-and-exchange succeeds.
         * NOTE(review): relies on try_cmpxchg() refreshing old_state on
         * failure - confirm.
         */
        old_state = READ_ONCE(bh->b_state);
        while (!try_cmpxchg(&bh->b_state, &old_state,
                            old_state & ~BUFFER_FLAGS_DISCARD))
                continue;

        unlock_buffer(bh);
}

/**
 * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
 * @folio: The folio which is affected.
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * block_invalidate_folio() is called when all or part of the folio has been
 * invalidated by a truncate operation.
 *
 * block_invalidate_folio() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
        struct buffer_head *head, *bh;
        size_t stop = length + offset;
        size_t curr_off;

        BUG_ON(!folio_test_locked(folio));

        /* The range must fit inside the folio and must not have wrapped. */
        BUG_ON(stop > folio_size(folio) || stop < length);

        head = folio_buffers(folio);
        if (!head)
                return;

        bh = head;
        curr_off = 0;
        do {
                /* Fetch the successor before the buffer is touched. */
                struct buffer_head *next = bh->b_this_page;
                size_t next_off = curr_off + bh->b_size;

                /*
                 * This buffer extends past the end of the range: we are
                 * done, and the folio's buffers are not released.
                 */
                if (next_off > stop)
                        return;

                /* Discard only buffers that start at or after @offset. */
                if (offset <= curr_off)
                        discard_buffer(bh);

                curr_off = next_off;
                bh = next;
        } while (bh != head);

        /*
         * We release buffers only if the entire folio is being invalidated.
         * The get_block cached value has been unconditionally invalidated,
         * so real IO is not possible anymore.
         */
        if (length == folio_size(folio))
                filemap_release_folio(folio, 0);
}
EXPORT_SYMBOL(block_invalidate_folio);

/*
 * We attach and possibly dirty the buffers atomically wrt
 * block_dirty_folio() via i_private_lock.  try_to_free_buffers
 * is already excluded via the folio lock.
 */
struct buffer_head *create_empty_buffers(struct folio *folio,
                unsigned long blocksize, unsigned long b_state)
{
        gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
        struct buffer_head *head, *tail, *bh;

        head = folio_alloc_buffers(folio, blocksize, gfp);

        /*
         * The allocator hands back a NULL-terminated chain.  Apply the
         * caller's initial state bits to every buffer while looking for the
         * last one, then link the last one back to the first to form a ring.
         */
        tail = head;
        tail->b_state |= b_state;
        while (tail->b_this_page) {
                tail = tail->b_this_page;
                tail->b_state |= b_state;
        }
        tail->b_this_page = head;

        spin_lock(&folio->mapping->i_private_lock);
        if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
                /* Propagate the folio's dirty/uptodate state to each buffer. */
                bh = head;
                do {
                        if (folio_test_dirty(folio))
                                set_buffer_dirty(bh);
                        if (folio_test_uptodate(folio))
                                set_buffer_uptodate(bh);
                        bh = bh->b_this_page;
                } while (bh != head);
        }
        folio_attach_private(folio, head);
        spin_unlock(&folio->mapping->i_private_lock);

        return head;
}
EXPORT_SYMBOL(create_empty_buffers);

/**
 * clean_bdev_aliases: clean a range of buffers in block device
 * @bdev: Block device to clean buffers in
 * @block: Start of a range of blocks to clean
 * @len: Number of blocks to clean
 *
 * We are taking a range of blocks for data and we don't want writeback of any
 * buffer-cache aliases starting from return from this function and until the
 * moment when something will explicitly mark the buffer dirty (hopefully that
 * will not happen until we will free that block ;-) We don't even need to mark
 * it not-uptodate - nobody can expect anything from a newly allocated buffer
 * anyway. We used to use unmap_buffer() for such invalidation, but that was
 * wrong. We definitely don't want to mark the alias unmapped, for example - it
 * would confuse anyone who might pick it with bread() afterwards...
 *
 * Also..  Note that bforget() doesn't lock the buffer.  So there can be
 * writeout I/O going on against recently-freed buffers.  We don't wait on that
 * I/O in bforget() - it's more efficient to wait on the I/O only if we really
 * need to.  That happens here.
 */
/*
 * Walk one folio's ring of buffers and clean every mapped buffer whose
 * block number lies in [block, block + len).  The walk stops early at the
 * first mapped buffer at or beyond the end of the range.
 */
static void clean_bdev_aliases_ring(struct buffer_head *head, sector_t block,
                sector_t len)
{
        struct buffer_head *bh = head;

        do {
                if (buffer_mapped(bh) && bh->b_blocknr >= block) {
                        if (bh->b_blocknr >= block + len)
                                break;
                        clear_buffer_dirty(bh);
                        wait_on_buffer(bh);
                        clear_buffer_req(bh);
                }
                bh = bh->b_this_page;
        } while (bh != head);
}

void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
        struct address_space *bd_mapping = bdev->bd_mapping;
        const int blkbits = bd_mapping->host->i_blkbits;
        struct folio_batch fbatch;
        pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE;
        pgoff_t end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE;
        int i, count;

        folio_batch_init(&fbatch);
        while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
                count = folio_batch_count(&fbatch);
                for (i = 0; i < count; i++) {
                        struct folio *folio = fbatch.folios[i];
                        struct buffer_head *head;

                        /* Unlocked peek: folios without buffers are skipped. */
                        if (!folio_buffers(folio))
                                continue;
                        /*
                         * We use the folio lock instead of
                         * bd_mapping->i_private_lock to pin buffers here
                         * since we can afford to sleep and it scales better
                         * than a global spinlock.
                         */
                        folio_lock(folio);
                        /* Recheck when the folio is locked which pins bhs */
                        head = folio_buffers(folio);
                        if (head)
                                clean_bdev_aliases_ring(head, block, len);
                        folio_unlock(folio);
                }
                folio_batch_release(&fbatch);
                cond_resched();
                /* End of range already reached? */
                if (index > end || !index)
                        break;
        }
}
EXPORT_SYMBOL(clean_bdev_aliases);

/*
 * Return the buffers attached to a locked folio, creating them first if the
 * folio has none.  New buffers are sized from the inode's block size and
 * start out with the @b_state bits set.
 */
static struct buffer_head *folio_create_buffers(struct folio *folio,
                                                struct inode *inode,
                                                unsigned int b_state)
{
        struct buffer_head *head;

        BUG_ON(!folio_test_locked(folio));

        head = folio_buffers(folio);
        if (head)
                return head;

        return create_empty_buffers(folio,
                        1 << READ_ONCE(inode->i_blkbits), b_state);
}

/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *        Mapped        Uptodate        Meaning
 *
 *        No        No                "unknown" - must do get_block()
 *        No        Yes                "hole" - zero-filled
 *        Yes        No                "allocated" - allocated on disk, not read in
 *        Yes        Yes                "valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */

/*
 * While block_write_full_folio is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_folio() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.   This only can happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * If block_write_full_folio() is called with wbc->sync_mode ==
 * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
 * causes the writes to be flagged as synchronous writes.
 */
/*
 * Write out the dirty buffers of @folio.
 *
 * @inode:     inode whose size and block size bound the work
 * @folio:     folio to write; it is unlocked before this function returns
 * @get_block: callback used to map dirty buffers that are unmapped or delayed
 * @wbc:       writeback control; supplies the sync mode and write flags
 *
 * The work is done in three passes over the folio's ring of buffers:
 * map the dirty buffers, lock them and mark them for async write, then
 * submit them.  If get_block() fails, the "recover" path submits the
 * buffers that are already mapped and dirty and records the error.
 *
 * Returns 0 on success or the error returned by get_block().
 */
int __block_write_full_folio(struct inode *inode, struct folio *folio,
                        get_block_t *get_block, struct writeback_control *wbc)
{
        int err;
        sector_t block;
        sector_t last_block;
        struct buffer_head *bh, *head;
        size_t blocksize;
        /* Number of buffers handed to submit_bh_wbc() */
        int nr_underway = 0;
        blk_opf_t write_flags = wbc_to_write_flags(wbc);

        /* Buffers created here start out both dirty and uptodate. */
        head = folio_create_buffers(folio, inode,
                                    (1 << BH_Dirty) | (1 << BH_Uptodate));

        /*
         * Be very careful.  We have no exclusion from block_dirty_folio
         * here, and the (potentially unmapped) buffers may become dirty at
         * any time.  If a buffer becomes dirty here after we've inspected it
         * then we just miss that fact, and the folio stays dirty.
         *
         * Buffers outside i_size may be dirtied by block_dirty_folio;
         * handle that here by just cleaning them.
         */

        bh = head;
        blocksize = bh->b_size;

        /* First block of the folio, and the last block inside i_size. */
        block = div_u64(folio_pos(folio), blocksize);
        last_block = div_u64(i_size_read(inode) - 1, blocksize);

        /*
         * Get all the dirty buffers mapped to disk addresses and
         * handle any aliases from the underlying blockdev's mapping.
         */
        do {
                if (block > last_block) {
                        /*
                         * mapped buffers outside i_size will occur, because
                         * this folio can be outside i_size when there is a
                         * truncate in progress.
                         */
                        /*
                         * The buffer was zeroed by block_write_full_folio()
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
                           buffer_dirty(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                goto recover;
                        clear_buffer_delay(bh);
                        if (buffer_new(bh)) {
                                /* blockdev mappings never come here */
                                clear_buffer_new(bh);
                                clean_bdev_bh_alias(bh);
                        }
                }
                bh = bh->b_this_page;
                block++;
        } while (bh != head);

        /*
         * Second pass (bh is back at head): lock each mapped buffer and mark
         * the dirty ones for async write.  Note that "continue" in this
         * do-while jumps to the loop condition, which is what advances bh.
         */
        do {
                if (!buffer_mapped(bh))
                        continue;
                /*
                 * If it's a fully non-blocking write attempt and we cannot
                 * lock the buffer then redirty the folio.  Note that this can
                 * potentially cause a busy-wait loop from writeback threads
                 * and kswapd activity, but those code paths have their own
                 * higher-level throttling.
                 */
                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else if (!trylock_buffer(bh)) {
                        folio_redirty_for_writepage(wbc, folio);
                        continue;
                }
                if (test_clear_buffer_dirty(bh)) {
                        mark_buffer_async_write_endio(bh,
                                end_buffer_async_write);
                } else {
                        /* Buffer was clean after all: nothing to write. */
                        unlock_buffer(bh);
                }
        } while ((bh = bh->b_this_page) != head);

        /*
         * The folio and its buffers are protected by the writeback flag,
         * so we can drop the bh refcounts early.
         */
        BUG_ON(folio_test_writeback(folio));
        folio_start_writeback(folio);

        /*
         * Third pass: submit every buffer marked for async write.  The next
         * pointer is read before submission, not after.
         */
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
                                      inode->i_write_hint, wbc);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        folio_unlock(folio);

        err = 0;
done:
        if (nr_underway == 0) {
                /*
                 * The folio was marked dirty, but the buffers were
                 * clean.  Someone wrote them back by hand with
                 * write_dirty_buffer/submit_bh.  A rare case.
                 */
                folio_end_writeback(folio);

                /*
                 * The folio and buffer_heads can be released at any time from
                 * here on.
                 */
        }
        return err;

recover:
        /*
         * ENOSPC, or some other error.  We may already have added some
         * blocks to the file, so we need to write these out to avoid
         * exposing stale data.
         * The folio is currently locked and not marked for writeback
         */
        bh = head;
        /* Recovery: lock and submit the mapped buffers */
        do {
                if (buffer_mapped(bh) && buffer_dirty(bh) &&
                    !buffer_delay(bh)) {
                        lock_buffer(bh);
                        mark_buffer_async_write_endio(bh,
                                end_buffer_async_write);
                } else {
                        /*
                         * The buffer may have been set dirty during
                         * attachment to a dirty folio.
                         */
                        clear_buffer_dirty(bh);
                }
        } while ((bh = bh->b_this_page) != head);
        /* Record the failure on both the folio and its mapping. */
        folio_set_error(folio);
        BUG_ON(folio_test_writeback(folio));
        mapping_set_error(folio->mapping, err);
        folio_start_writeback(folio);
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        clear_buffer_dirty(bh);
                        submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
                                      inode->i_write_hint, wbc);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        folio_unlock(folio);
        /* Share the nr_underway handling; err still holds the failure. */
        goto done;
}
EXPORT_SYMBOL(__block_write_full_folio);

/*
 * If a folio has any new buffers, zero them out here, and mark them uptodate
 * and dirty so they'll be written out (in order to prevent uninitialised
 * block data from leaking). And clear the new bit.
 */
void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
{
        struct buffer_head *head, *bh;
        size_t block_start = 0;
        size_t block_end;

        BUG_ON(!folio_test_locked(folio));

        head = folio_buffers(folio);
        if (!head)
                return;

        bh = head;
        do {
                block_end = block_start + bh->b_size;

                /* Only new buffers that overlap [from, to) are touched. */
                if (buffer_new(bh) && block_end > from && block_start < to) {
                        if (!folio_test_uptodate(folio)) {
                                /* Zero just the overlapping part. */
                                folio_zero_segment(folio,
                                                   max(from, block_start),
                                                   min(to, block_end));
                                set_buffer_uptodate(bh);
                        }

                        clear_buffer_new(bh);
                        mark_buffer_dirty(bh);
                }

                block_start = block_end;
                bh = bh->b_this_page;
        } while (bh != head);
}
EXPORT_SYMBOL(folio_zero_new_buffers);

/*
 * Translate an iomap extent into buffer state for the file block @block.
 * Returns 0 on success, or -EIO when the extent ends before the block, the
 * extent type is not recognised, or a block device would get a new buffer.
 */
static int
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
                const struct iomap *iomap)
{
        loff_t pos = (loff_t)block << inode->i_blkbits;

        bh->b_bdev = iomap->bdev;

        /*
         * @block gives the file offset we need to map, while the iomap
         * records where the mapping starts.  If the mapping ends before
         * the current block, leave the buffer unmapped and let the caller
         * deal with it.
         */
        if (pos >= iomap->offset + iomap->length)
                return -EIO;

        switch (iomap->type) {
        case IOMAP_HOLE:
                /*
                 * A buffer that is not up to date, or that lies beyond the
                 * current EOF, is marked new so that sub-block zeroing is
                 * executed if necessary.
                 */
                if (!buffer_uptodate(bh) || pos >= i_size_read(inode))
                        set_buffer_new(bh);
                break;
        case IOMAP_DELALLOC:
                if (!buffer_uptodate(bh) || pos >= i_size_read(inode))
                        set_buffer_new(bh);
                set_buffer_uptodate(bh);
                set_buffer_mapped(bh);
                set_buffer_delay(bh);
                break;
        case IOMAP_UNWRITTEN:
                /*
                 * For unwritten regions, the parts of the block we are not
                 * writing to must always be zeroed.  Marking the buffer new
                 * ensures this.
                 */
                set_buffer_new(bh);
                set_buffer_unwritten(bh);
                fallthrough;
        case IOMAP_MAPPED:
                if ((iomap->flags & IOMAP_F_NEW) ||
                    pos >= i_size_read(inode)) {
                        /*
                         * This can happen if truncating the block device
                         * races with the check in the caller, as i_size
                         * updates on block devices aren't synchronized by
                         * i_rwsem.
                         */
                        if (S_ISBLK(inode->i_mode))
                                return -EIO;
                        set_buffer_new(bh);
                }
                bh->b_blocknr = (iomap->addr + pos - iomap->offset) >>
                                inode->i_blkbits;
                set_buffer_mapped(bh);
                break;
        default:
                WARN_ON_ONCE(1);
                return -EIO;
        }

        return 0;
}

/*
 * Prepare the buffers of a locked folio for a write of @len bytes at file
 * position @pos.
 *
 * @folio:     locked folio being written to
 * @pos:       file position of the write; its offset within the folio is used
 * @len:       length of the write; must not run past the end of the folio
 * @get_block: block-mapping callback; when NULL, @iomap is used instead
 * @iomap:     extent passed to iomap_to_bh() when @get_block is NULL
 *
 * Buffers overlapping the write are mapped if necessary.  Buffers that are
 * only partly covered by the write and are not uptodate are read in.
 *
 * Returns 0 on success, the mapping error, or -EIO if a read did not bring
 * a buffer uptodate.  On error, folio_zero_new_buffers() is called for the
 * written range.
 */
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
                get_block_t *get_block, const struct iomap *iomap)
{
        size_t from = offset_in_folio(folio, pos);
        size_t to = from + len;
        struct inode *inode = folio->mapping->host;
        size_t block_start, block_end;
        sector_t block;
        int err = 0;
        size_t blocksize;
        /*
         * wait[] collects buffers with reads in flight.  Two slots suffice:
         * only the buffer straddling @from and the one straddling @to can be
         * partly covered by the write.
         */
        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

        BUG_ON(!folio_test_locked(folio));
        BUG_ON(to > folio_size(folio));
        BUG_ON(from > to);

        head = folio_create_buffers(folio, inode, 0);
        blocksize = head->b_size;
        block = div_u64(folio_pos(folio), blocksize);

        /*
         * Walk the ring once.  "!block_start" lets the first iteration run
         * even though bh == head at that point.
         */
        for (bh = head, block_start = 0; bh != head || !block_start;
            block++, block_start=block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                /* Buffer entirely outside the write: just sync uptodate. */
                if (block_end <= from || block_start >= to) {
                        if (folio_test_uptodate(folio)) {
                                if (!buffer_uptodate(bh))
                                        set_buffer_uptodate(bh);
                        }
                        continue;
                }
                if (buffer_new(bh))
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        if (get_block)
                                err = get_block(inode, block, bh, 1);
                        else
                                err = iomap_to_bh(inode, block, bh, iomap);
                        if (err)
                                break;

                        if (buffer_new(bh)) {
                                clean_bdev_bh_alias(bh);
                                if (folio_test_uptodate(folio)) {
                                        clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
                                        mark_buffer_dirty(bh);
                                        continue;
                                }
                                /*
                                 * Partly covered new buffer: zero the parts
                                 * outside the write, i.e. [to, block_end)
                                 * and [block_start, from).
                                 * NOTE(review): assumes folio_zero_segments()
                                 * ignores an empty segment - confirm.
                                 */
                                if (block_end > to || block_start < from)
                                        folio_zero_segments(folio,
                                                to, block_end,
                                                block_start, from);
                                continue;
                        }
                }
                if (folio_test_uptodate(folio)) {
                        if (!buffer_uptodate(bh))
                                set_buffer_uptodate(bh);
                        continue; 
                }
                /*
                 * The buffer is partly covered by the write and holds no
                 * valid data yet: start a read and remember to wait for it.
                 */
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                     (block_start < from || block_end > to)) {
                        bh_read_nowait(bh, 0);
                        *wait_bh++=bh;
                }
        }
        /*
         * If we issued read requests - let them complete.
         */
        while(wait_bh > wait) {
                wait_on_buffer(*--wait_bh);
                if (!buffer_uptodate(*wait_bh))
                        err = -EIO;
        }
        if (unlikely(err))
                folio_zero_new_buffers(folio, from, to);
        return err;
}

/*
 * Page-based entry point: look up the folio containing @page and prepare
 * it for the write.  No iomap is supplied.
 */
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
                get_block_t *get_block)
{
        struct folio *folio = page_folio(page);

        return __block_write_begin_int(folio, pos, len, get_block, NULL);
}
EXPORT_SYMBOL(__block_write_begin);

/*
 * Commit a write covering [from, to) of @folio: mark the buffers it overlaps
 * uptodate and dirty, clear the new flag on every buffer, and mark the folio
 * uptodate if no buffer outside the range was left not uptodate.
 */
static void __block_commit_write(struct folio *folio, size_t from, size_t to)
{
        struct buffer_head *head = folio_buffers(folio);
        struct buffer_head *bh = head;
        unsigned blocksize = bh->b_size;
        size_t block_start = 0;
        size_t block_end;
        bool partial = false;

        do {
                bool outside;

                block_end = block_start + blocksize;
                outside = block_end <= from || block_start >= to;

                if (!outside) {
                        set_buffer_uptodate(bh);
                        mark_buffer_dirty(bh);
                } else if (!buffer_uptodate(bh)) {
                        /* An untouched buffer still lacks valid data. */
                        partial = true;
                }

                if (buffer_new(bh))
                        clear_buffer_new(bh);

                block_start = block_end;
                bh = bh->b_this_page;
        } while (bh != head);

        /*
         * If this is a partial write which happened to make all buffers
         * uptodate then we can optimize away a bogus read_folio() for
         * the next read(). Here we 'discover' whether the folio went
         * uptodate as a result of this (potentially partial) write.
         */
        if (!partial)
                folio_mark_uptodate(folio);
}

/*
 * block_write_begin takes care of the basic task of block allocation and
 * bringing partial write blocks uptodate first.
 *
 * The filesystem needs to handle block truncation upon failure.
 */
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
                struct page **pagep, get_block_t *get_block)
{
        pgoff_t index = pos >> PAGE_SHIFT;
        struct page *page;
        int status;

        /* Without a page there is nothing to hand back; *pagep is untouched. */
        page = grab_cache_page_write_begin(mapping, index);
        if (!page)
                return -ENOMEM;

        status = __block_write_begin(page, pos, len, get_block);
        if (unlikely(status)) {
                /* Preparation failed: drop the page and hand back NULL. */
                unlock_page(page);
                put_page(page);
                *pagep = NULL;
                return status;
        }

        *pagep = page;
        return status;
}
EXPORT_SYMBOL(block_write_begin);

int block_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct folio *folio = page_folio(page);
        size_t first = pos - folio_pos(folio);

        if (unlikely(copied < len)) {
                /*
                 * Short write.  Buffers that were written in full are now
                 * uptodate, so a read_folio cannot overwrite them.  A buffer
                 * that was only partly written is not marked uptodate, and a
                 * later read_folio could destroy that partial data.
                 *
                 * Do the simplest thing: treat any short write to a
                 * non-uptodate folio as a zero-length write, and force the
                 * caller to redo the whole thing.
                 */
                copied = folio_test_uptodate(folio) ? copied : 0;

                folio_zero_new_buffers(folio, first + copied, first + len);
        }
        flush_dcache_folio(folio);

        /* This could be a short (even 0-length) commit */
        __block_commit_write(folio, first, first + copied);

        return copied;
}
EXPORT_SYMBOL(block_write_end);

int generic_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct page *page, void *fsdata)
{
        struct inode *inode = mapping->host;
        loff_t old_size = inode->i_size;
        bool i_size_changed;

        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

        /*
         * No need to use i_size_read() here, the i_size cannot change under us
         * because we hold i_rwsem.
         *
         * But it's important to update i_size while still holding page lock:
         * page writeout could otherwise come in and zero beyond i_size.
         */
        i_size_changed = pos + copied > inode->i_size;
        if (i_size_changed)
                i_size_write(inode, pos + copied);

        unlock_page(page);
        put_page(page);

        /* The write started beyond the old end of file. */
        if (old_size < pos)
                pagecache_isize_extended(inode, old_size, pos);

        /*
         * Don't mark the inode dirty under page lock. First, it unnecessarily
         * makes the holding time of page lock longer. Second, it forces lock
         * ordering of page lock and transaction start for journaling
         * filesystems.
         */
        if (i_size_changed)
                mark_inode_dirty(inode);

        return copied;
}
EXPORT_SYMBOL(generic_write_end);

/*
 * block_is_partially_uptodate checks whether buffers within a folio are
 * uptodate or not.
 *
 * Returns true if all buffers which correspond to the specified part
 * of the folio are uptodate.
 */
bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
        unsigned block_start, block_end, blocksize;
        unsigned to;
        struct buffer_head *head, *bh;

        head = folio_buffers(folio);
        if (!head)
                return false;

        blocksize = head->b_size;
        /* Clamp the range so that it ends inside the folio. */
        to = from + min_t(unsigned, folio_size(folio) - from, count);

        /* A range that touches both the first and the last block is rejected. */
        if (from < blocksize && to > folio_size(folio) - blocksize)
                return false;

        bh = head;
        block_start = 0;
        do {
                block_end = block_start + blocksize;
                if (block_end > from && block_start < to) {
                        /* One stale buffer inside the range settles it. */
                        if (!buffer_uptodate(bh))
                                return false;
                        /* This buffer reaches the end of the range: done. */
                        if (block_end >= to)
                                return true;
                }
                block_start = block_end;
                bh = bh->b_this_page;
        } while (bh != head);

        return true;
}
EXPORT_SYMBOL(block_is_partially_uptodate);

/*
 * Generic "read_folio" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the folio asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * folio once IO has completed.
 */
int block_read_full_folio(struct folio *folio, get_block_t *get_block)
{
        struct inode *inode = folio->mapping->host;
        sector_t iblock, lblock;
        /* arr[] collects the buffers that still need to be read (nr entries). */
        struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
        size_t blocksize;
        int nr, i;
        int fully_mapped = 1;
        bool page_error = false;
        loff_t limit = i_size_read(inode);

        /* This is needed for ext4. */
        if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
                limit = inode->i_sb->s_maxbytes;

        VM_BUG_ON_FOLIO(folio_test_large(folio), folio);

        head = folio_create_buffers(folio, inode, 0);
        blocksize = head->b_size;

        /*
         * iblock: file block number of the folio's first buffer.
         * lblock: number of blocks needed to cover "limit" bytes (rounded
         * up); blocks at or beyond it are not passed to get_block().
         */
        iblock = div_u64(folio_pos(folio), blocksize);
        lblock = div_u64(limit + blocksize - 1, blocksize);
        bh = head;
        nr = 0;
        i = 0;

        /*
         * Stage one: walk every buffer of the folio and decide whether it
         * needs I/O.  Note that each "continue" jumps to the loop condition,
         * which is what advances i, iblock and bh.
         */
        do {
                if (buffer_uptodate(bh))
                        continue;

                if (!buffer_mapped(bh)) {
                        int err = 0;

                        fully_mapped = 0;
                        if (iblock < lblock) {
                                WARN_ON(bh->b_size != blocksize);
                                err = get_block(inode, iblock, bh, 0);
                                if (err) {
                                        folio_set_error(folio);
                                        page_error = true;
                                }
                        }
                        /*
                         * Still unmapped (beyond the limit, or get_block()
                         * left it unmapped): zero this block's part of the
                         * folio.  It is only marked uptodate when
                         * get_block() did not fail.
                         */
                        if (!buffer_mapped(bh)) {
                                folio_zero_range(folio, i * blocksize,
                                                blocksize);
                                if (!err)
                                        set_buffer_uptodate(bh);
                                continue;
                        }
                        /*
                         * get_block() might have updated the buffer
                         * synchronously
                         */
                        if (buffer_uptodate(bh))
                                continue;
                }
                /* Mapped but not uptodate: queue it for reading. */
                arr[nr++] = bh;
        } while (i++, iblock++, (bh = bh->b_this_page) != head);

        if (fully_mapped)
                folio_set_mappedtodisk(folio);

        if (!nr) {
                /*
                 * All buffers are uptodate or get_block() returned an
                 * error when trying to map them - we can finish the read.
                 */
                folio_end_read(folio, !page_error);
                return 0;
        }

        /* Stage two: lock the buffers */
        for (i = 0; i < nr; i++) {
                bh = arr[i];
                lock_buffer(bh);
                mark_buffer_async_read(bh);
        }

        /*
         * Stage 3: start the IO.  Check for uptodateness
         * inside the buffer lock in case another process reading
         * the underlying blockdev brought it uptodate (the sct fix).
         */
        for (i = 0; i < nr; i++) {
                bh = arr[i];
                if (buffer_uptodate(bh))
                        end_buffer_async_read(bh, 1);
                else
                        submit_bh(REQ_OP_READ, bh);
        }
        /* Always 0 here: I/O results are reported through the end_io path. */
        return 0;
}
EXPORT_SYMBOL(block_read_full_folio);

/*
 * Utility function for filesystems that need to do work on expanding
 * truncates.  Uses the filesystem's own pagecache write path (a zero-length
 * write_begin/write_end pair at @size) to let the filesystem deal with the
 * hole.
 *
 * Returns 0 on success or a negative errno from inode_newsize_ok(),
 * ->write_begin() or ->write_end().
 */
int generic_cont_expand_simple(struct inode *inode, loff_t size)
{
        struct address_space *mapping = inode->i_mapping;
        const struct address_space_operations *aops = mapping->a_ops;
        struct page *page;
        void *fsdata = NULL;
        int ret = inode_newsize_ok(inode, size);

        if (ret)
                return ret;

        ret = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
        if (ret)
                return ret;

        /* A zero-length write must never report copied bytes. */
        ret = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
        BUG_ON(ret > 0);

        return ret;
}
EXPORT_SYMBOL(generic_cont_expand_simple);

/*
 * Zero-fill the file between *bytes and @pos, one page at a time, by issuing
 * ->write_begin() / zero_user() / ->write_end() for each page.
 *
 * @file:    passed through to the address_space operations
 * @mapping: address space being extended
 * @pos:     position up to which the file must be zeroed
 * @bytes:   in/out position the zeroing starts from; rounded up to a
 *           block boundary here when it is not block aligned
 *
 * Returns 0 on success, a negative errno from the address_space operations,
 * or -EINTR if a fatal signal is pending.
 *
 * NOTE(review): the first loop only makes progress if *bytes advances between
 * iterations; presumably the filesystem's ->write_end() updates it - confirm
 * against callers.
 */
static int cont_expand_zero(struct file *file, struct address_space *mapping,
                            loff_t pos, loff_t *bytes)
{
        struct inode *inode = mapping->host;
        const struct address_space_operations *aops = mapping->a_ops;
        unsigned int blocksize = i_blocksize(inode);
        struct page *page;
        void *fsdata = NULL;
        pgoff_t index, curidx;
        loff_t curpos;
        unsigned zerofrom, offset, len;
        int err = 0;

        /* Page index and in-page offset of the target position. */
        index = pos >> PAGE_SHIFT;
        offset = pos & ~PAGE_MASK;

        /*
         * Whole pages before the one containing @pos: curpos/curidx are
         * re-read from *bytes on every pass of the loop condition.
         */
        while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
                zerofrom = curpos & ~PAGE_MASK;
                /*
                 * Not block aligned: round *bytes up to the next block
                 * boundary (assumes blocksize is a power of two).
                 */
                if (zerofrom & (blocksize-1)) {
                        *bytes |= (blocksize-1);
                        (*bytes)++;
                }
                /* Zero from curpos to the end of its page. */
                len = PAGE_SIZE - zerofrom;

                err = aops->write_begin(file, mapping, curpos, len,
                                            &page, &fsdata);
                if (err)
                        goto out;
                zero_user(page, zerofrom, len);
                err = aops->write_end(file, mapping, curpos, len, len,
                                                page, fsdata);
                if (err < 0)
                        goto out;
                /* A short write_end is treated as a fatal inconsistency. */
                BUG_ON(err != len);
                err = 0;

                balance_dirty_pages_ratelimited(mapping);

                if (fatal_signal_pending(current)) {
                        err = -EINTR;
                        goto out;
                }
        }

        /* page covers the boundary, find the boundary offset */
        if (index == curidx) {
                zerofrom = curpos & ~PAGE_MASK;
                /* if we will expand the thing last block will be filled */
                if (offset <= zerofrom) {
                        goto out;
                }
                if (zerofrom & (blocksize-1)) {
                        *bytes |= (blocksize-1);
                        (*bytes)++;
                }
                /* Zero only the gap between curpos and @pos in this page. */
                len = offset - zerofrom;

                err = aops->write_begin(file, mapping, curpos, len,
                                            &page, &fsdata);
                if (err)
                        goto out;
                zero_user(page, zerofrom, len);
                err = aops->write_end(file, mapping, curpos, len, len,
                                                page, fsdata);
                if (err < 0)
                        goto out;
                BUG_ON(err != len);
                err = 0;
        }
out:
        return err;
}

/*
 * For moronic filesystems that do not allow holes in file.
 * We may have to extend the file.
 */
int cont_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block, loff_t *bytes)
{
        unsigned int block_mask = i_blocksize(mapping->host) - 1;
        unsigned int in_page;
        int ret;

        /* First zero-fill everything between *bytes and the write position. */
        ret = cont_expand_zero(file, mapping, pos, bytes);
        if (ret)
                return ret;

        /*
         * If this write extends past *bytes and *bytes is not block aligned,
         * round *bytes up to the next block boundary before the real write.
         */
        in_page = *bytes & ~PAGE_MASK;
        if (pos + len > *bytes && (in_page & block_mask)) {
                *bytes |= block_mask;
                (*bytes)++;
        }

        return block_write_begin(mapping, pos, len, pagep, get_block);
}
EXPORT_SYMBOL(cont_write_begin);

/*
 * Page-based entry point: look up the folio containing @page and hand the
 * [@from, @to) range to __block_commit_write().
 */
void block_commit_write(struct page *page, unsigned from, unsigned to)
{
        __block_commit_write(page_folio(page), from, to);
}
EXPORT_SYMBOL(block_commit_write);

/*
 * block_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
 * truncate writes the inode size before removing pages, once we have the
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 *
 * Direct callers of this function should protect against filesystem freezing
 * using sb_start_pagefault() - sb_end_pagefault() functions.
 */
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
                         get_block_t get_block)
{
        struct inode *inode = file_inode(vma->vm_file);
        struct folio *folio = page_folio(vmf->page);
        unsigned long len;
        loff_t isize;
        int err;

        folio_lock(folio);
        isize = i_size_read(inode);

        /* We overload EFAULT to mean page got truncated */
        if (folio->mapping != inode->i_mapping || folio_pos(folio) >= isize) {
                err = -EFAULT;
                goto unlock;
        }

        /* Prepare the whole folio, or only the part in front of EOF. */
        len = folio_size(folio);
        if (folio_pos(folio) + len > isize)
                len = isize - folio_pos(folio);

        err = __block_write_begin_int(folio, 0, len, get_block, NULL);
        if (likely(!err)) {
                __block_commit_write(folio, 0, len);

                /* Success: the folio is returned dirty and still locked. */
                folio_mark_dirty(folio);
                folio_wait_stable(folio);
                return 0;
        }

unlock:
        folio_unlock(folio);
        return err;
}
EXPORT_SYMBOL(block_page_mkwrite);

/*
 * block_truncate_page - zero the tail of the block that contains @from.
 *
 * @mapping:   address space of the file being truncated
 * @from:      new end position; bytes from here to the end of its block
 *             are zeroed in the page cache
 * @get_block: filesystem callback used to map the block if needed
 *
 * Nothing is done when @from is block aligned or when the block turns out to
 * be a hole.  If the block is mapped but not uptodate (and neither delayed
 * nor unwritten) it is read first so that the part in front of @from is
 * preserved.  The buffer is marked dirty after zeroing.
 *
 * Returns 0 on success or a negative errno (folio lookup, get_block() or
 * read failure).
 */
int block_truncate_page(struct address_space *mapping,
                        loff_t from, get_block_t *get_block)
{
        pgoff_t index = from >> PAGE_SHIFT;
        unsigned blocksize;
        sector_t iblock;
        size_t offset, length, pos;
        struct inode *inode = mapping->host;
        struct folio *folio;
        struct buffer_head *bh;
        int err = 0;

        blocksize = i_blocksize(inode);
        length = from & (blocksize - 1);

        /* Block boundary? Nothing to do */
        if (!length)
                return 0;

        /* Number of bytes from @from to the end of its block. */
        length = blocksize - length;
        /* File block number of the first block in page @index. */
        iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits;

        folio = filemap_grab_folio(mapping, index);
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        bh = folio_buffers(folio);
        if (!bh)
                bh = create_empty_buffers(folio, blocksize, 0);

        /* Find the buffer that contains "offset" */
        offset = offset_in_folio(folio, from);
        pos = blocksize;
        while (offset >= pos) {
                bh = bh->b_this_page;
                iblock++;
                pos += blocksize;
        }

        if (!buffer_mapped(bh)) {
                WARN_ON(bh->b_size != blocksize);
                err = get_block(inode, iblock, bh, 0);
                if (err)
                        goto unlock;
                /* unmapped? It's a hole - nothing to do */
                if (!buffer_mapped(bh))
                        goto unlock;
        }

        /* Ok, it's mapped. Make sure it's up-to-date */
        if (folio_test_uptodate(folio))
                set_buffer_uptodate(bh);

        if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
                err = bh_read(bh, 0);
                /* Uhhuh. Read error. Complain and punt. */
                if (err < 0)
                        goto unlock;
                /*
                 * Only negative values are errors here.  Do not leak a
                 * non-negative bh_read() status to the caller as the result
                 * of a successful truncate.
                 * NOTE(review): bh_read() is believed to return 1 when the
                 * buffer was already uptodate - confirm against its
                 * definition.
                 */
                err = 0;
        }

        folio_zero_range(folio, offset, length);
        mark_buffer_dirty(bh);

unlock:
        folio_unlock(folio);
        folio_put(folio);

        return err;
}
EXPORT_SYMBOL(block_truncate_page);

/*
 * The generic ->writepage function for buffer-backed address_spaces
 */
int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
                void *get_block)
{
        struct inode * const inode = folio->mapping->host;
        loff_t eof = i_size_read(inode);

        /* Does any part of the folio lie at or beyond i_size? */
        if (folio_pos(folio) + folio_size(folio) > eof) {
                /* Entirely beyond i_size (truncate in progress): drop it. */
                if (folio_pos(folio) >= eof) {
                        folio_unlock(folio);
                        return 0; /* don't care */
                }

                /*
                 * The folio straddles i_size.  The part past i_size has to be
                 * zeroed on each and every writepage invocation because the
                 * folio may be mmapped: a file is mapped in multiples of the
                 * page size, the remainder of the last page is zeroed when
                 * mapped, and writes to that region must not reach the file.
                 */
                folio_zero_segment(folio, offset_in_folio(folio, eof),
                                folio_size(folio));
        }

        return __block_write_full_folio(inode, folio, get_block, wbc);
}

/*
 * Map file block @block through @get_block (without allocation) using a
 * temporary on-stack buffer_head and return the resulting b_blocknr.
 *
 * The return value of get_block() is ignored; since the temporary
 * buffer_head starts out zeroed, 0 is returned when get_block() does not
 * fill in a block number.
 */
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
                            get_block_t *get_block)
{
        struct inode *host = mapping->host;
        struct buffer_head map_bh = {
                .b_size = i_blocksize(host),
        };

        get_block(host, block, &map_bh, 0);

        return map_bh.b_blocknr;
}
EXPORT_SYMBOL(generic_block_bmap);

/*
 * bio completion handler for buffer_head I/O: propagate BIO_QUIET to the
 * buffer, invoke the buffer's own b_end_io with the success status, and
 * drop the bio.
 */
static void end_bio_bh_io_sync(struct bio *bio)
{
        struct buffer_head *owner = bio->bi_private;
        int uptodate;

        if (unlikely(bio_flagged(bio, BIO_QUIET)))
                set_bit(BH_Quiet, &owner->b_state);

        /* bi_status == 0 means the I/O succeeded. */
        uptodate = !bio->bi_status;
        owner->b_end_io(owner, uptodate);

        bio_put(bio);
}

/*
 * Build a single-segment bio for @bh and submit it.
 *
 * @opf:        request operation plus flags
 * @bh:         buffer to do I/O on; must be locked, mapped, have b_end_io
 *              set and be neither delayed nor unwritten (enforced by BUG_ON)
 * @write_hint: stored in the bio's bi_write_hint
 * @wbc:        optional writeback_control; when non-NULL the bio is
 *              initialised from it and the buffer is accounted via
 *              wbc_account_cgroup_owner()
 *
 * Completion is reported through end_bio_bh_io_sync(), which calls
 * bh->b_end_io.
 */
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
                          enum rw_hint write_hint,
                          struct writeback_control *wbc)
{
        const enum req_op op = opf & REQ_OP_MASK;
        struct bio *bio;

        BUG_ON(!buffer_locked(bh));
        BUG_ON(!buffer_mapped(bh));
        BUG_ON(!bh->b_end_io);
        BUG_ON(buffer_delay(bh));
        BUG_ON(buffer_unwritten(bh));

        /*
         * Only clear out a write error when rewriting
         */
        if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
                clear_buffer_write_io_error(bh);

        /* Carry the buffer's meta/prio markings over to the request. */
        if (buffer_meta(bh))
                opf |= REQ_META;
        if (buffer_prio(bh))
                opf |= REQ_PRIO;

        bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);

        fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);

        /* Convert the block number into 512-byte sector units. */
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio->bi_write_hint = write_hint;

        __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));

        bio->bi_end_io = end_bio_bh_io_sync;
        bio->bi_private = bh;

        /* Take care of bh's that straddle the end of the device */
        guard_bio_eod(bio);

        if (wbc) {
                wbc_init_bio(wbc, bio);
                wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
        }

        submit_bio(bio);
}

/*
 * Submit I/O for @bh with operation/flags @opf.  Thin wrapper around
 * submit_bh_wbc() that passes WRITE_LIFE_NOT_SET as the write hint and no
 * writeback_control.
 */
void submit_bh(blk_opf_t opf, struct buffer_head *bh)
{
        submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
}
EXPORT_SYMBOL(submit_bh);

/*
 * Start writeout of @bh if it is dirty; does not wait for completion.
 *
 * The buffer is locked and its dirty bit cleared.  If it was dirty, a
 * reference is taken and a write is submitted with end_buffer_write_sync as
 * the completion handler (the buffer is left locked for the I/O).  If it was
 * already clean, it is simply unlocked again.
 */
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
        lock_buffer(bh);

        if (test_clear_buffer_dirty(bh)) {
                bh->b_end_io = end_buffer_write_sync;
                get_bh(bh);
                submit_bh(REQ_OP_WRITE | op_flags, bh);
                return;
        }

        /* Nothing to write. */
        unlock_buffer(bh);
}
EXPORT_SYMBOL(write_dirty_buffer);

/*
 * For a data-integrity writeout, we need to wait upon any in-progress I/O
 * and then start new I/O and then wait upon it.  The caller must have a ref on
 * the buffer_head.
 */
int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
        WARN_ON(atomic_read(&bh->b_count) < 1);
        lock_buffer(bh);

        /* Already clean: nothing to write. */
        if (!test_clear_buffer_dirty(bh)) {
                unlock_buffer(bh);
                return 0;
        }

        /*
         * The bh should be mapped, but it might not be if the
         * device was hot-removed. Not much we can do but fail the I/O.
         */
        if (!buffer_mapped(bh)) {
                unlock_buffer(bh);
                return -EIO;
        }

        /* Submit the write and wait for it to finish. */
        get_bh(bh);
        bh->b_end_io = end_buffer_write_sync;
        submit_bh(REQ_OP_WRITE | op_flags, bh);
        wait_on_buffer(bh);

        return buffer_uptodate(bh) ? 0 : -EIO;
}
EXPORT_SYMBOL(__sync_dirty_buffer);

/*
 * Write out @bh if it is dirty and wait for the write to complete, using
 * REQ_SYNC as the only extra request flag.  Returns whatever
 * __sync_dirty_buffer() returns (0 or -EIO).
 */
int sync_dirty_buffer(struct buffer_head *bh)
{
        return __sync_dirty_buffer(bh, REQ_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);

/*
 * A buffer is "busy" when it has a non-zero reference count or is dirty or
 * locked.  Returns non-zero in that case, zero otherwise.
 */
static inline int buffer_busy(struct buffer_head *bh)
{
        const unsigned long busy_bits = (1 << BH_Dirty) | (1 << BH_Lock);

        return atomic_read(&bh->b_count) | (bh->b_state & busy_bits);
}

/*
 * Detach all buffers from @folio if none of them is busy.
 *
 * On success the (circular) buffer list is handed back through
 * @buffers_to_free for the caller to free, and true is returned.  If any
 * buffer is busy nothing is changed and false is returned.
 */
static bool
drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
{
        struct buffer_head *const first = folio_buffers(folio);
        struct buffer_head *cur = first;
        struct buffer_head *following;

        /* Pass 1: refuse if any buffer in the ring is still in use. */
        do {
                if (buffer_busy(cur))
                        return false;
                cur = cur->b_this_page;
        } while (cur != first);

        /* Pass 2: unhook each buffer from its association list, if any. */
        do {
                following = cur->b_this_page;
                if (cur->b_assoc_map)
                        __remove_assoc_queue(cur);
                cur = following;
        } while (cur != first);

        *buffers_to_free = first;
        folio_detach_private(folio);
        return true;
}

/**
 * try_to_free_buffers - Release buffers attached to this folio.
 * @folio: The folio.
 *
 * If any buffers are in use (dirty, under writeback, elevated refcount),
 * no buffers will be freed.
 *
 * If the folio is dirty but all the buffers are clean then we need to
 * be sure to mark the folio clean as well.  This is because the folio
 * may be against a block device, and a later reattachment of buffers
 * to a dirty folio will set *all* buffers dirty.  Which would corrupt
 * filesystem data on the same device.
 *
 * The same applies to regular filesystem folios: if all the buffers are
 * clean then we set the folio clean and proceed.  To do that, we require
 * total exclusion from block_dirty_folio().  That is obtained with
 * i_private_lock.
 *
 * Exclusion against try_to_free_buffers may be obtained by either
 * locking the folio or by holding its mapping's i_private_lock.
 *
 * Context: Process context.  @folio must be locked.  Will not sleep.
 * Return: true if all buffers attached to this folio were freed.
 */
bool try_to_free_buffers(struct folio *folio)
{
        struct address_space * const mapping = folio->mapping;
        struct buffer_head *doomed = NULL;
        bool freed;

        BUG_ON(!folio_test_locked(folio));
        if (folio_test_writeback(folio))
                return false;

        if (mapping) {
                spin_lock(&mapping->i_private_lock);
                freed = drop_buffers(folio, &doomed);

                /*
                 * A filesystem that writes its buffers by hand (eg ext3) can
                 * leave clean buffers against a dirty folio, and truncate's
                 * discard_buffer marks all of a folio's buffers clean.  In
                 * both cases the folio is cleaned here, otherwise the VM
                 * would never notice that any IO happened.
                 *
                 * i_private_lock must be held over this entire operation in
                 * order to synchronise against block_dirty_folio and prevent
                 * the dirty bit from being lost.
                 */
                if (freed)
                        folio_cancel_dirty(folio);
                spin_unlock(&mapping->i_private_lock);
        } else {
                /* No mapping (can this still happen?): nothing to lock. */
                freed = drop_buffers(folio, &doomed);
        }

        /* Free the detached ring outside the spinlock. */
        if (doomed) {
                struct buffer_head *bh = doomed;
                struct buffer_head *following;

                do {
                        following = bh->b_this_page;
                        free_buffer_head(bh);
                        bh = following;
                } while (bh != doomed);
        }

        return freed;
}
EXPORT_SYMBOL(try_to_free_buffers);

/*
 * Buffer-head allocation
 */
/* Slab cache that buffer_heads are allocated from; set up in buffer_init(). */
static struct kmem_cache *bh_cachep __ro_after_init;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 */
static unsigned long max_buffer_heads __ro_after_init;

/*
 * Non-zero when the summed per-CPU live buffer_head count was above
 * max_buffer_heads at the last recalculation (see recalc_bh_state()).
 */
int buffer_heads_over_limit;

/* Per-CPU bookkeeping of live buffer_heads. */
struct bh_accounting {
        int nr;                        /* Number of live bh's */
        int ratelimit;                /* Limit cacheline bouncing */
};

/* One zero-initialised accounting record per CPU. */
static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};

static void recalc_bh_state(void)
{
        int i;
        int tot = 0;

        if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
                return;
        __this_cpu_write(bh_accounting.ratelimit, 0);
        for_each_online_cpu(i)
                tot += per_cpu(bh_accounting, i).nr;
        buffer_heads_over_limit = (tot > max_buffer_heads);
}

/*
 * Allocate a zeroed buffer_head from bh_cachep with @gfp_flags, initialise
 * its association list and uptodate lock, and account for it in this CPU's
 * live count.  Returns NULL if the allocation fails.
 */
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
        struct buffer_head *bh = kmem_cache_zalloc(bh_cachep, gfp_flags);

        if (!bh)
                return NULL;

        INIT_LIST_HEAD(&bh->b_assoc_buffers);
        spin_lock_init(&bh->b_uptodate_lock);

        /* The per-CPU counters are updated with preemption disabled. */
        preempt_disable();
        __this_cpu_inc(bh_accounting.nr);
        recalc_bh_state();
        preempt_enable();

        return bh;
}
EXPORT_SYMBOL(alloc_buffer_head);

/*
 * Return @bh to the buffer_head slab cache and drop it from this CPU's live
 * count.  The buffer must no longer be on an association list.
 */
void free_buffer_head(struct buffer_head *bh)
{
        BUG_ON(!list_empty(&bh->b_assoc_buffers));
        kmem_cache_free(bh_cachep, bh);
        /* The per-CPU counters are updated with preemption disabled. */
        preempt_disable();
        __this_cpu_dec(bh_accounting.nr);
        recalc_bh_state();
        preempt_enable();
}
EXPORT_SYMBOL(free_buffer_head);

static int buffer_exit_cpu_dead(unsigned int cpu)
{
        int i;
        struct bh_lru *b = &per_cpu(bh_lrus, cpu);

        for (i = 0; i < BH_LRU_SIZE; i++) {
                brelse(b->bhs[i]);
                b->bhs[i] = NULL;
        }
        this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
        per_cpu(bh_accounting, cpu).nr = 0;
        return 0;
}

/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
        /* Fast path: already uptodate, no locking needed. */
        if (buffer_uptodate(bh))
                return 1;

        lock_buffer(bh);

        /* Still not uptodate: return with the buffer locked. */
        if (!buffer_uptodate(bh))
                return 0;

        /* Became uptodate while we waited for the lock. */
        unlock_buffer(bh);
        return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);

/**
 * __bh_read - Submit read for a locked buffer
 * @bh: struct buffer_head
 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
 * @wait: wait until reading finish
 *
 * Returns zero on success or don't wait, and -EIO on error.
 */
int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
        BUG_ON(!buffer_locked(bh));

        /* Take a reference for the I/O and submit the read. */
        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(REQ_OP_READ | op_flags, bh);

        /* Asynchronous caller: the result is not known yet. */
        if (!wait)
                return 0;

        wait_on_buffer(bh);
        return buffer_uptodate(bh) ? 0 : -EIO;
}
EXPORT_SYMBOL(__bh_read);

/**
 * __bh_read_batch - Submit read for a batch of unlocked buffers
 * @nr: entry number of the buffer batch
 * @bhs: a batch of struct buffer_head
 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
 * @force_lock: force to get a lock on the buffer if set, otherwise drops any
 *              buffer that cannot lock.
 *
 * Buffers that are already uptodate are skipped.  Reads are only submitted,
 * never waited for, and no status is returned; completion is handled by
 * end_buffer_read_sync.
 */
void __bh_read_batch(int nr, struct buffer_head *bhs[],
                     blk_opf_t op_flags, bool force_lock)
{
        int i;

        for (i = 0; i < nr; i++) {
                struct buffer_head *bh = bhs[i];

                /* Unlocked check first to avoid taking the lock needlessly. */
                if (buffer_uptodate(bh))
                        continue;

                if (force_lock)
                        lock_buffer(bh);
                else
                        if (!trylock_buffer(bh))
                                continue;

                /* Re-check under the lock: it may have become uptodate. */
                if (buffer_uptodate(bh)) {
                        unlock_buffer(bh);
                        continue;
                }

                bh->b_end_io = end_buffer_read_sync;
                get_bh(bh);
                submit_bh(REQ_OP_READ | op_flags, bh);
        }
}
EXPORT_SYMBOL(__bh_read_batch);

/*
 * Boot-time setup of the buffer_head subsystem: create the buffer_head slab
 * cache, compute max_buffer_heads, and register the CPU-dead hotplug
 * callback.
 */
void __init buffer_init(void)
{
        unsigned long bh_per_page = PAGE_SIZE / sizeof(struct buffer_head);
        unsigned long budget_pages;
        int err;

        bh_cachep = KMEM_CACHE(buffer_head,
                                SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);

        /*
         * Limit the bh occupancy to 10% of ZONE_NORMAL
         */
        budget_pages = (nr_free_buffer_pages() * 10) / 100;
        max_buffer_heads = budget_pages * bh_per_page;

        err = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
                                        NULL, buffer_exit_cpu_dead);
        WARN_ON(err < 0);
}

































    1 







































































    1 

    1 

















    1 

































































































































































































































































































































































    1 





    1 















    1 




























































































































































































































    1 















































    1 





















    1 



















    1 















    1 






    1 





















    1 






    1 









    1 





























    1 































    1 



















    1 









    1 










    1 










    1 











    1 




    1 















    1 






    1 



























    1 











































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2015 Facebook.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "locking.h"
#include "free-space-tree.h"
#include "transaction.h"
#include "block-group.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"

/*
 * Forward declaration: __remove_from_free_space_tree() calls this when the
 * block group still has BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE set, which is before
 * the point where the function is defined in this file.
 */
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
                                        struct btrfs_block_group *block_group,
                                        struct btrfs_path *path);

static struct btrfs_root *btrfs_free_space_root(
                                struct btrfs_block_group *block_group)
{
        struct btrfs_key key = {
                .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
                .type = BTRFS_ROOT_ITEM_KEY,
                .offset = 0,
        };

        if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2))
                key.offset = block_group->global_root_id;
        return btrfs_global_root(block_group->fs_info, &key);
}

/*
 * Compute the extent-count thresholds at which @cache switches between the
 * extent representation and the bitmap representation of its free space.
 *
 * Emits a warning (but still proceeds) if the block group length is zero.
 */
void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
{
        const size_t item_size = sizeof(struct btrfs_item);
        u32 range_per_bitmap;
        u64 bitmap_count, bitmap_bytes;

        if (WARN_ON(cache->length == 0))
                btrfs_warn(cache->fs_info, "block group %llu length is zero",
                           cache->start);

        /*
         * Bitmaps win once the space needed to store the extents as
         * individual items exceeds the space needed to store every bitmap
         * item covering the block group.
         */
        range_per_bitmap = cache->fs_info->sectorsize *
                           BTRFS_FREE_SPACE_BITMAP_BITS;
        bitmap_count = div_u64(cache->length + range_per_bitmap - 1,
                               range_per_bitmap);
        bitmap_bytes = bitmap_count *
                       (item_size + BTRFS_FREE_SPACE_BITMAP_SIZE);
        cache->bitmap_high_thresh = div_u64(bitmap_bytes, item_size);

        /*
         * Keep a gap of 100 extents between the two thresholds so the block
         * group does not flip back and forth between formats.
         */
        cache->bitmap_low_thresh = cache->bitmap_high_thresh > 100 ?
                                   cache->bitmap_high_thresh - 100 : 0;
}

static int add_new_free_space_info(struct btrfs_trans_handle *trans,
                                   struct btrfs_block_group *block_group,
                                   struct btrfs_path *path)
{
        struct btrfs_root *root = btrfs_free_space_root(block_group);
        struct btrfs_free_space_info *info;
        struct btrfs_key key;
        struct extent_buffer *leaf;
        int ret;

        key.objectid = block_group->start;
        key.type = BTRFS_FREE_SPACE_INFO_KEY;
        key.offset = block_group->length;

        ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        info = btrfs_item_ptr(leaf, path->slots[0],
                              struct btrfs_free_space_info);
        btrfs_set_free_space_extent_count(leaf, info, 0);
        btrfs_set_free_space_flags(leaf, info, 0);
        btrfs_mark_buffer_dirty(trans, leaf);

        ret = 0;
out:
        btrfs_release_path(path);
        return ret;
}

EXPORT_FOR_TESTS
struct btrfs_free_space_info *search_free_space_info(
                struct btrfs_trans_handle *trans,
                struct btrfs_block_group *block_group,
                struct btrfs_path *path, int cow)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_root *root = btrfs_free_space_root(block_group);
        struct btrfs_key key;
        int ret;

        key.objectid = block_group->start;
        key.type = BTRFS_FREE_SPACE_INFO_KEY;
        key.offset = block_group->length;

        ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
        if (ret < 0)
                return ERR_PTR(ret);
        if (ret != 0) {
                btrfs_warn(fs_info, "missing free space info for %llu",
                           block_group->start);
                ASSERT(0);
                return ERR_PTR(-ENOENT);
        }

        return btrfs_item_ptr(path->nodes[0], path->slots[0],
                              struct btrfs_free_space_info);
}

/*
 * Same as btrfs_search_slot(), except that the path ends up on the item with
 * the greatest key that is less than @key.
 *
 * Callers pass a key that is not expected to exist. Finding an exact match,
 * or landing on slot 0 (no previous item in the leaf), trips ASSERT(0) and
 * yields -EIO. Errors from btrfs_search_slot() are passed through.
 */
static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_key *key, struct btrfs_path *p,
                                  int ins_len, int cow)
{
        int ret = btrfs_search_slot(trans, root, key, p, ins_len, cow);

        if (ret < 0)
                return ret;

        if (ret == 0 || p->slots[0] == 0) {
                ASSERT(0);
                return -EIO;
        }

        /* Step back from the insertion position to the previous item. */
        p->slots[0]--;
        return 0;
}

/*
 * Number of bytes needed for a bitmap with one bit per sector that covers
 * @size bytes of disk space, rounded up to a whole byte.
 */
static inline u32 free_space_bitmap_size(const struct btrfs_fs_info *fs_info,
                                         u64 size)
{
        const u64 nr_sectors = size >> fs_info->sectorsize_bits;

        return DIV_ROUND_UP(nr_sectors, BITS_PER_BYTE);
}

/*
 * Allocate a zeroed bitmap of at least @bitmap_size bytes, with the size
 * rounded up to a multiple of sizeof(unsigned long).
 *
 * Returns NULL on allocation failure. The result is freed with kvfree() by
 * the callers in this file.
 */
static unsigned long *alloc_bitmap(u32 bitmap_size)
{
        const u32 alloc_size = round_up(bitmap_size, sizeof(unsigned long));
        unsigned long *bitmap;
        unsigned int nofs_flag;

        /*
         * kvmalloc() does not work with GFP_NOFS, yet recursing into the
         * filesystem must be avoided because the free space bitmap can be
         * modified in the critical section of a transaction commit. So do a
         * GFP_KERNEL allocation inside a NOFS scope instead.
         *
         * TODO: push the memalloc_nofs_{save,restore}() to the caller where we
         * know that recursion is unsafe.
         */
        nofs_flag = memalloc_nofs_save();
        bitmap = kvzalloc(alloc_size, GFP_KERNEL);
        memalloc_nofs_restore(nofs_flag);

        return bitmap;
}

static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
{
        u8 *p = ((u8 *)map) + BIT_BYTE(start);
        const unsigned int size = start + len;
        int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
        u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);

        while (len - bits_to_set >= 0) {
                *p |= mask_to_set;
                len -= bits_to_set;
                bits_to_set = BITS_PER_BYTE;
                mask_to_set = ~0;
                p++;
        }
        if (len) {
                mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
                *p |= mask_to_set;
        }
}

/*
 * Convert the free space of @block_group from extent items to bitmap items.
 *
 * The conversion works in three steps:
 *  1. Walk the block group's items backwards from its end, record every free
 *     space extent in an in-memory bitmap and delete the extent items.
 *  2. Set BTRFS_FREE_SPACE_USING_BITMAPS in the free space info item and
 *     check that the number of extents seen matches the recorded count.
 *  3. Write the in-memory bitmap back as a series of bitmap items.
 *
 * Returns 0 on success. On failure a negative errno is returned (-ENOMEM if
 * the temporary bitmap cannot be allocated, -EIO on inconsistent tree
 * contents, or an error from the tree operations) and the transaction is
 * aborted.
 */
EXPORT_FOR_TESTS
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
                                  struct btrfs_block_group *block_group,
                                  struct btrfs_path *path)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root = btrfs_free_space_root(block_group);
        struct btrfs_free_space_info *info;
        struct btrfs_key key, found_key;
        struct extent_buffer *leaf;
        unsigned long *bitmap;
        char *bitmap_cursor;
        u64 start, end;
        u64 bitmap_range, i;
        u32 bitmap_size, flags, expected_extent_count;
        u32 extent_count = 0;
        int done = 0, nr;
        int ret;

        bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
        bitmap = alloc_bitmap(bitmap_size);
        if (!bitmap) {
                ret = -ENOMEM;
                goto out;
        }

        start = block_group->start;
        end = block_group->start + block_group->length;

        /* Largest possible key that can belong to this block group. */
        key.objectid = end - 1;
        key.type = (u8)-1;
        key.offset = (u64)-1;

        while (!done) {
                ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
                if (ret)
                        goto out;

                leaf = path->nodes[0];
                nr = 0;
                /*
                 * Bias the slot by one: inside the loop slots[0] - 1 is the
                 * item being inspected, and once the loop ends slots[0] is the
                 * first of the nr items to delete.
                 */
                path->slots[0]++;
                while (path->slots[0] > 0) {
                        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);

                        if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
                                /* Reached the start of the block group's items. */
                                ASSERT(found_key.objectid == block_group->start);
                                ASSERT(found_key.offset == block_group->length);
                                done = 1;
                                break;
                        } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
                                u64 first, last;

                                ASSERT(found_key.objectid >= start);
                                ASSERT(found_key.objectid < end);
                                ASSERT(found_key.objectid + found_key.offset <= end);

                                /* Sector indexes relative to the block group. */
                                first = div_u64(found_key.objectid - start,
                                                fs_info->sectorsize);
                                last = div_u64(found_key.objectid + found_key.offset - start,
                                               fs_info->sectorsize);
                                le_bitmap_set(bitmap, first, last - first);

                                extent_count++;
                                nr++;
                                path->slots[0]--;
                        } else {
                                /*
                                 * Unexpected item type. This branch does not
                                 * advance the slot, so if ASSERT() does not
                                 * stop execution (this file pairs ASSERT(0)
                                 * with an error return elsewhere) the loop
                                 * would never terminate. Fail the conversion
                                 * instead.
                                 */
                                ASSERT(0);
                                ret = -EIO;
                                goto out;
                        }
                }

                ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
                if (ret)
                        goto out;
                btrfs_release_path(path);
        }

        info = search_free_space_info(trans, block_group, path, 1);
        if (IS_ERR(info)) {
                ret = PTR_ERR(info);
                goto out;
        }
        leaf = path->nodes[0];
        flags = btrfs_free_space_flags(leaf, info);
        flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
        btrfs_set_free_space_flags(leaf, info, flags);
        expected_extent_count = btrfs_free_space_extent_count(leaf, info);
        btrfs_mark_buffer_dirty(trans, leaf);
        btrfs_release_path(path);

        if (extent_count != expected_extent_count) {
                btrfs_err(fs_info,
                          "incorrect extent count for %llu; counted %u, expected %u",
                          block_group->start, extent_count,
                          expected_extent_count);
                ASSERT(0);
                ret = -EIO;
                goto out;
        }

        /*
         * Emit one bitmap item per bitmap_range bytes of the block group (the
         * last one may be shorter), copying the matching slice of the
         * in-memory bitmap into each item.
         */
        bitmap_cursor = (char *)bitmap;
        bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
        i = start;
        while (i < end) {
                unsigned long ptr;
                u64 extent_size;
                u32 data_size;

                extent_size = min(end - i, bitmap_range);
                data_size = free_space_bitmap_size(fs_info, extent_size);

                key.objectid = i;
                key.type = BTRFS_FREE_SPACE_BITMAP_KEY;
                key.offset = extent_size;

                ret = btrfs_insert_empty_item(trans, root, path, &key,
                                              data_size);
                if (ret)
                        goto out;

                leaf = path->nodes[0];
                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
                write_extent_buffer(leaf, bitmap_cursor, ptr,
                                    data_size);
                btrfs_mark_buffer_dirty(trans, leaf);
                btrfs_release_path(path);

                i += extent_size;
                bitmap_cursor += data_size;
        }

        ret = 0;
out:
        kvfree(bitmap);
        if (ret)
                btrfs_abort_transaction(trans, ret);
        return ret;
}

/*
 * Convert the free space of @block_group from bitmap items to extent items.
 *
 * The conversion works in three steps:
 *  1. Walk the block group's items backwards from its end, copy every bitmap
 *     item into an in-memory bitmap and delete the bitmap items.
 *  2. Clear BTRFS_FREE_SPACE_USING_BITMAPS in the free space info item.
 *  3. Insert one extent item for each run of set bits in the in-memory
 *     bitmap and check the number of runs against the recorded extent count.
 *
 * Returns 0 on success. On failure a negative errno is returned (-ENOMEM if
 * the temporary bitmap cannot be allocated, -EIO on inconsistent tree
 * contents, or an error from the tree operations) and the transaction is
 * aborted.
 */
EXPORT_FOR_TESTS
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
                                  struct btrfs_block_group *block_group,
                                  struct btrfs_path *path)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root = btrfs_free_space_root(block_group);
        struct btrfs_free_space_info *info;
        struct btrfs_key key, found_key;
        struct extent_buffer *leaf;
        unsigned long *bitmap;
        u64 start, end;
        u32 bitmap_size, flags, expected_extent_count;
        unsigned long nrbits, start_bit, end_bit;
        u32 extent_count = 0;
        int done = 0, nr;
        int ret;

        bitmap_size = free_space_bitmap_size(fs_info, block_group->length);
        bitmap = alloc_bitmap(bitmap_size);
        if (!bitmap) {
                ret = -ENOMEM;
                goto out;
        }

        start = block_group->start;
        end = block_group->start + block_group->length;

        /* Largest possible key that can belong to this block group. */
        key.objectid = end - 1;
        key.type = (u8)-1;
        key.offset = (u64)-1;

        while (!done) {
                ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
                if (ret)
                        goto out;

                leaf = path->nodes[0];
                nr = 0;
                /*
                 * Bias the slot by one: inside the loop slots[0] - 1 is the
                 * item being inspected, and once the loop ends slots[0] is the
                 * first of the nr items to delete.
                 */
                path->slots[0]++;
                while (path->slots[0] > 0) {
                        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);

                        if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
                                /* Reached the start of the block group's items. */
                                ASSERT(found_key.objectid == block_group->start);
                                ASSERT(found_key.offset == block_group->length);
                                done = 1;
                                break;
                        } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
                                unsigned long ptr;
                                char *bitmap_cursor;
                                u32 bitmap_pos, data_size;

                                ASSERT(found_key.objectid >= start);
                                ASSERT(found_key.objectid < end);
                                ASSERT(found_key.objectid + found_key.offset <= end);

                                /*
                                 * Byte offset of this item's data inside the
                                 * in-memory bitmap: one bit per sector, so one
                                 * byte per BITS_PER_BYTE sectors.
                                 */
                                bitmap_pos = div_u64(found_key.objectid - start,
                                                     fs_info->sectorsize *
                                                     BITS_PER_BYTE);
                                bitmap_cursor = ((char *)bitmap) + bitmap_pos;
                                data_size = free_space_bitmap_size(fs_info,
                                                                found_key.offset);

                                ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
                                read_extent_buffer(leaf, bitmap_cursor, ptr,
                                                   data_size);

                                nr++;
                                path->slots[0]--;
                        } else {
                                /*
                                 * Unexpected item type. This branch does not
                                 * advance the slot, so if ASSERT() does not
                                 * stop execution (this file pairs ASSERT(0)
                                 * with an error return elsewhere) the loop
                                 * would never terminate. Fail the conversion
                                 * instead.
                                 */
                                ASSERT(0);
                                ret = -EIO;
                                goto out;
                        }
                }

                ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
                if (ret)
                        goto out;
                btrfs_release_path(path);
        }

        info = search_free_space_info(trans, block_group, path, 1);
        if (IS_ERR(info)) {
                ret = PTR_ERR(info);
                goto out;
        }
        leaf = path->nodes[0];
        flags = btrfs_free_space_flags(leaf, info);
        flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
        btrfs_set_free_space_flags(leaf, info, flags);
        expected_extent_count = btrfs_free_space_extent_count(leaf, info);
        btrfs_mark_buffer_dirty(trans, leaf);
        btrfs_release_path(path);

        /* Each run of set bits [start_bit, end_bit) becomes one extent item. */
        nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
        start_bit = find_next_bit_le(bitmap, nrbits, 0);

        while (start_bit < nrbits) {
                end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit);
                ASSERT(start_bit < end_bit);

                key.objectid = start + start_bit * block_group->fs_info->sectorsize;
                key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
                key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize;

                /* Extent items carry no data; the key holds start and length. */
                ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
                if (ret)
                        goto out;
                btrfs_release_path(path);

                extent_count++;

                start_bit = find_next_bit_le(bitmap, nrbits, end_bit);
        }

        if (extent_count != expected_extent_count) {
                btrfs_err(fs_info,
                          "incorrect extent count for %llu; counted %u, expected %u",
                          block_group->start, extent_count,
                          expected_extent_count);
                ASSERT(0);
                ret = -EIO;
                goto out;
        }

        ret = 0;
out:
        kvfree(bitmap);
        if (ret)
                btrfs_abort_transaction(trans, ret);
        return ret;
}

/*
 * Add @new_extents (which may be negative) to the extent count stored in the
 * free space info item of @block_group, then convert the block group between
 * the extent and bitmap formats if the new count crosses a threshold.
 *
 * Returns 0 on success (including when @new_extents is 0 and nothing is
 * touched), the error from looking up the info item, or the result of the
 * format conversion.
 */
static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
                                          struct btrfs_block_group *block_group,
                                          struct btrfs_path *path,
                                          int new_extents)
{
        struct btrfs_free_space_info *info;
        struct extent_buffer *leaf;
        u32 flags;
        u32 extent_count;

        if (new_extents == 0)
                return 0;

        info = search_free_space_info(trans, block_group, path, 1);
        if (IS_ERR(info))
                return PTR_ERR(info);

        leaf = path->nodes[0];
        flags = btrfs_free_space_flags(leaf, info);
        extent_count = btrfs_free_space_extent_count(leaf, info) + new_extents;

        btrfs_set_free_space_extent_count(leaf, info, extent_count);
        btrfs_mark_buffer_dirty(trans, leaf);
        btrfs_release_path(path);

        if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
                /* Few enough extents again: go back to extent items. */
                if (extent_count < block_group->bitmap_low_thresh)
                        return convert_free_space_to_extents(trans,
                                                             block_group, path);
        } else if (extent_count > block_group->bitmap_high_thresh) {
                /* Too many extents: bitmaps are the smaller representation. */
                return convert_free_space_to_bitmaps(trans, block_group, path);
        }

        return 0;
}

/*
 * Test the bit for the sector at byte @offset in the bitmap item that @path
 * currently points to.
 *
 * The item must be a free space bitmap that covers @offset (both asserted).
 *
 * Returns 1 if the bit is set, 0 otherwise.
 */
EXPORT_FOR_TESTS
int free_space_test_bit(struct btrfs_block_group *block_group,
                        struct btrfs_path *path, u64 offset)
{
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_key key;
        u64 bitmap_start, bitmap_end;
        unsigned long data_ptr, bit_nr;

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);

        bitmap_start = key.objectid;
        bitmap_end = key.objectid + key.offset;
        ASSERT(offset >= bitmap_start && offset < bitmap_end);

        data_ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
        /* One bit per sector, counted from the start of this bitmap. */
        bit_nr = div_u64(offset - bitmap_start,
                         block_group->fs_info->sectorsize);

        return !!extent_buffer_test_bit(leaf, data_ptr, bit_nr);
}

/*
 * Set (@bit != 0) or clear (@bit == 0) the bits for the range
 * [*start, *start + *size) in the bitmap item that @path points to.
 *
 * Only the part of the range that lies inside this bitmap is handled. On
 * return *start is advanced past the handled part and *size is reduced by the
 * same amount, so a non-zero *size means the range continues in the next
 * bitmap.
 */
static void free_space_set_bits(struct btrfs_trans_handle *trans,
                                struct btrfs_block_group *block_group,
                                struct btrfs_path *path, u64 *start, u64 *size,
                                int bit)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct extent_buffer *leaf = path->nodes[0];
        struct btrfs_key key;
        u64 range_end = *start + *size;
        u64 bitmap_start, bitmap_end;
        unsigned long data_ptr, first_bit, last_bit;

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);

        bitmap_start = key.objectid;
        bitmap_end = key.objectid + key.offset;
        ASSERT(*start >= bitmap_start && *start < bitmap_end);
        ASSERT(range_end > bitmap_start);

        /* Clamp the range to the end of this bitmap. */
        if (range_end > bitmap_end)
                range_end = bitmap_end;

        data_ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
        first_bit = (*start - bitmap_start) >> fs_info->sectorsize_bits;
        last_bit = (range_end - bitmap_start) >> fs_info->sectorsize_bits;

        if (!bit)
                extent_buffer_bitmap_clear(leaf, data_ptr, first_bit,
                                           last_bit - first_bit);
        else
                extent_buffer_bitmap_set(leaf, data_ptr, first_bit,
                                         last_bit - first_bit);
        btrfs_mark_buffer_dirty(trans, leaf);

        /* Report back how much of the range is still left to process. */
        *size -= range_end - *start;
        *start = range_end;
}

/*
 * Advance @p to the item following the current bitmap item.
 *
 * btrfs_next_item() cannot be used by modify_free_space_bitmap() because
 * btrfs_next_leaf() does not get the path for writing. The elaborate tree
 * walking of btrfs_next_leaf() is not needed anyway, since the key being
 * looked for is known: when the current item is the last one in its leaf, a
 * fresh search is done just below the highest possible key at the offset
 * where the current bitmap ends.
 *
 * Returns 0 on success or the error from btrfs_search_prev_slot().
 */
static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root, struct btrfs_path *p)
{
        struct btrfs_key key;

        if (p->slots[0] + 1 >= btrfs_header_nritems(p->nodes[0])) {
                /* Last item of the leaf: look the next bitmap up by key. */
                btrfs_item_key_to_cpu(p->nodes[0], &key, p->slots[0]);
                btrfs_release_path(p);

                key.objectid += key.offset;
                key.type = (u8)-1;
                key.offset = (u64)-1;

                return btrfs_search_prev_slot(trans, root, &key, p, 0, 1);
        }

        /* The next item is in the same leaf. */
        p->slots[0]++;
        return 0;
}

/*
 * Update the bitmap items of @block_group for the range
 * [@start, @start + @size).
 *
 * If @remove is 1, free space is being removed and the bits are cleared. If
 * @remove is 0, free space is being added and the bits are set. The bits
 * immediately before and after the range are sampled to work out how the
 * number of free space extents changes, and the extent count is updated
 * accordingly.
 *
 * Returns 0 on success or a negative errno.
 */
static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
                                    struct btrfs_block_group *block_group,
                                    struct btrfs_path *path,
                                    u64 start, u64 size, int remove)
{
        struct btrfs_root *root = btrfs_free_space_root(block_group);
        const u64 end = start + size;
        const int has_prev_block = start > block_group->start;
        struct btrfs_key key;
        u64 probe = start;
        u64 cur_start = start;
        u64 cur_size = size;
        int prev_bit = -1;
        int next_bit = -1;
        int free_neighbors;
        int new_extents;
        int ret;

        /*
         * Position the path on the bitmap holding the block right before the
         * range when that block belongs to the block group, otherwise on the
         * bitmap holding the start of the range.
         */
        if (has_prev_block)
                probe -= block_group->fs_info->sectorsize;

        key.objectid = probe;
        key.type = (u8)-1;
        key.offset = (u64)-1;

        ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
        if (ret)
                return ret;

        if (has_prev_block) {
                prev_bit = free_space_test_bit(block_group, path, probe);

                /*
                 * The previous block may live in the bitmap before the one
                 * that holds the start of the range; move forward if so.
                 */
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                if (start >= key.objectid + key.offset) {
                        ret = free_space_next_bitmap(trans, root, path);
                        if (ret)
                                return ret;
                }
        }

        /*
         * Walk every bitmap the range overlaps, setting or clearing bits.
         * free_space_set_bits() consumes cur_start/cur_size as it goes.
         */
        for (;;) {
                free_space_set_bits(trans, block_group, path, &cur_start,
                                    &cur_size, !remove);
                if (cur_size == 0)
                        break;

                ret = free_space_next_bitmap(trans, root, path);
                if (ret)
                        return ret;
        }

        /*
         * Sample the bit of the block right after the range, provided that
         * block is still inside the block group.
         */
        if (end < block_group->start + block_group->length) {
                /* That block may be in the following bitmap. */
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                if (end >= key.objectid + key.offset) {
                        ret = free_space_next_bitmap(trans, root, path);
                        if (ret)
                                return ret;
                }

                next_bit = free_space_test_bit(block_group, path, end);
        }

        /*
         * Removing: one extent goes away, but each free neighbor is a
         * leftover piece that remains an extent of its own.
         * Adding: one extent appears, but it merges with each free neighbor.
         */
        free_neighbors = (prev_bit == 1) + (next_bit == 1);
        if (remove)
                new_extents = free_neighbors - 1;
        else
                new_extents = 1 - free_neighbors;

        btrfs_release_path(path);
        return update_free_space_extent_count(trans, block_group, path,
                                              new_extents);
}

/*
 * Remove the range [@start, @start + @size) from the free space extent item
 * of @block_group that contains it, and update the extent count.
 *
 * Returns 0 on success or a negative errno.
 */
static int remove_free_space_extent(struct btrfs_trans_handle *trans,
                                    struct btrfs_block_group *block_group,
                                    struct btrfs_path *path,
                                    u64 start, u64 size)
{
        struct btrfs_root *root = btrfs_free_space_root(block_group);
        const u64 end = start + size;
        struct btrfs_key key = {
                .objectid = start,
                .type = (u8)-1,
                .offset = (u64)-1,
        };
        u64 extent_start, extent_end;
        int new_extents = -1;
        int ret;

        ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
        if (ret)
                return ret;

        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
        ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);

        extent_start = key.objectid;
        extent_end = key.objectid + key.offset;
        ASSERT(start >= extent_start && end <= extent_end);

        /*
         * The extent found contains the space being removed. Four cases:
         *
         * 1. The whole extent is used: delete its key; the extent count
         *    drops by one.
         * 2. A part at the beginning is used: delete the key and insert one
         *    for the leftover at the end. The count does not change.
         * 3. A part at the end is used: delete the key and insert one for
         *    the leftover at the beginning. The count does not change.
         * 4. A part in the middle is used: delete the key and insert one for
         *    the leftover on each side. One extent became two, so the count
         *    goes up by one, which may require converting the block group to
         *    bitmaps.
         */

        /* All cases: the existing key goes away. */
        ret = btrfs_del_item(trans, root, path);
        if (ret)
                return ret;

        /* Cases 3 and 4: leftover in front of the removed range. */
        if (extent_start < start) {
                key.objectid = extent_start;
                key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
                key.offset = start - extent_start;

                btrfs_release_path(path);
                ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
                if (ret)
                        return ret;
                new_extents++;
        }

        /* Cases 2 and 4: leftover behind the removed range. */
        if (extent_end > end) {
                key.objectid = end;
                key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
                key.offset = extent_end - end;

                btrfs_release_path(path);
                ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
                if (ret)
                        return ret;
                new_extents++;
        }

        btrfs_release_path(path);
        return update_free_space_extent_count(trans, block_group, path,
                                              new_extents);
}

/*
 * Remove the range [@start, @start + @size) from the free space recorded for
 * @block_group, using whichever format (bitmaps or extents) the block group
 * currently has.
 *
 * If the block group is still flagged BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, its
 * free space is added to the tree first.
 *
 * Returns 0 on success or a negative errno.
 */
EXPORT_FOR_TESTS
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
                                  struct btrfs_block_group *block_group,
                                  struct btrfs_path *path, u64 start, u64 size)
{
        struct btrfs_free_space_info *info;
        u32 flags;
        int ret;

        if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
                ret = __add_block_group_free_space(trans, block_group, path);
                if (ret)
                        return ret;
        }

        /* Read-only lookup (no transaction, no COW) just to get the flags. */
        info = search_free_space_info(NULL, block_group, path, 0);
        if (IS_ERR(info))
                return PTR_ERR(info);
        flags = btrfs_free_space_flags(path->nodes[0], info);
        btrfs_release_path(path);

        if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS))
                return remove_free_space_extent(trans, block_group, path,
                                                start, size);

        return modify_free_space_bitmap(trans, block_group, path,
                                        start, size, 1);
}

/*
 * Remove the range described by @start and @size from the free space tree.
 *
 * Does nothing and returns 0 when the FREE_SPACE_TREE compat_ro bit is not
 * set. Otherwise looks up the block group containing @start and performs the
 * removal under that block group's free_space_lock.
 *
 * Any non-zero result (including -ENOMEM for the path allocation and -ENOENT
 * for a missing block group) aborts the transaction before being returned.
 */
int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
                                u64 start, u64 size)
{
        struct btrfs_block_group *bg;
        struct btrfs_path *path;
        int ret = -ENOMEM;

        if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                goto out;

        bg = btrfs_lookup_block_group(trans->fs_info, start);
        if (!bg) {
                ASSERT(0);
                ret = -ENOENT;
                goto out;
        }

        mutex_lock(&bg->free_space_lock);
        ret = __remove_from_free_space_tree(trans, bg, path, start, size);
        mutex_unlock(&bg->free_space_lock);

        /* Drop the reference taken by the lookup. */
        btrfs_put_block_group(bg);
out:
        btrfs_free_path(path);
        if (ret)
                btrfs_abort_transaction(trans, ret);
        return ret;
}

/*
 * Add the free range described by @start and @size to the free space tree in
 * extent format, merging it with any free space extent that ends exactly at
 * @start or begins exactly at @start + @size.
 *
 * @path is released before each search/insert and before the final extent
 * count update.
 *
 * Returns the result of update_free_space_extent_count() when every tree
 * operation succeeded, otherwise the first non-zero value returned by a
 * search, delete or insert.
 */
static int add_free_space_extent(struct btrfs_trans_handle *trans,
                                 struct btrfs_block_group *block_group,
                                 struct btrfs_path *path,
                                 u64 start, u64 size)
{
        struct btrfs_root *root = btrfs_free_space_root(block_group);
        struct btrfs_key key, new_key;
        u64 found_start, found_end;
        u64 end = start + size;
        /*
         * Net change in the extent count: +1 for the key inserted below,
         * decremented once for every neighbor that gets absorbed.
         */
        int new_extents = 1;
        int ret;

        /*
         * We are adding a new extent of free space, but we need to merge
         * extents. There are four cases here:
         *
         * 1. The new extent does not have any immediate neighbors to merge
         * with: add the new key and increment the free space extent count. We
         * may need to convert the block group to bitmaps as a result.
         * 2. The new extent has an immediate neighbor before it: remove the
         * previous key and insert a new key combining both of them. There is no
         * net change in the number of extents.
         * 3. The new extent has an immediate neighbor after it: remove the next
         * key and insert a new key combining both of them. There is no net
         * change in the number of extents.
         * 4. The new extent has immediate neighbors on both sides: remove both
         * of the keys and insert a new key combining all of them. Where we used
         * to have two extents, we now have one, so decrement the extent count.
         */

        new_key.objectid = start;
        new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
        new_key.offset = size;

        /* Search for a neighbor on the left. */
        /* Nothing in this block group can precede its first byte. */
        if (start == block_group->start)
                goto right;
        /* Maximal type and offset for objectid start - 1. */
        key.objectid = start - 1;
        key.type = (u8)-1;
        key.offset = (u64)-1;

        ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
        if (ret)
                goto out;

        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

        /*
         * Not an extent item: the assertion expects the free space info item,
         * in which case there is no left neighbor to merge with.
         */
        if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
                ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
                btrfs_release_path(path);
                goto right;
        }

        found_start = key.objectid;
        found_end = key.objectid + key.offset;
        ASSERT(found_start >= block_group->start &&
               found_end > block_group->start);
        ASSERT(found_start < start && found_end <= start);

        /*
         * Delete the neighbor on the left and absorb it into the new key (cases
         * 2 and 4).
         */
        if (found_end == start) {
                ret = btrfs_del_item(trans, root, path);
                if (ret)
                        goto out;
                new_key.objectid = found_start;
                new_key.offset += key.offset;
                new_extents--;
        }
        btrfs_release_path(path);

right:
        /* Search for a neighbor on the right. */
        /* The new range reaches the end of the block group: nothing follows. */
        if (end == block_group->start + block_group->length)
                goto insert;
        /* Maximal type and offset for objectid end. */
        key.objectid = end;
        key.type = (u8)-1;
        key.offset = (u64)-1;

        ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
        if (ret)
                goto out;

        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

        /* Same as on the left: a non-extent item means no right neighbor. */
        if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
                ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
                btrfs_release_path(path);
                goto insert;
        }

        found_start = key.objectid;
        found_end = key.objectid + key.offset;
        ASSERT(found_start >= block_group->start &&
               found_end > block_group->start);
        /* The found extent lies entirely before or entirely after the range. */
        ASSERT((found_start < start && found_end <= start) ||
               (found_start >= end && found_end > end));

        /*
         * Delete the neighbor on the right and absorb it into the new key
         * (cases 3 and 4).
         */
        if (found_start == end) {
                ret = btrfs_del_item(trans, root, path);
                if (ret)
                        goto out;
                new_key.offset += key.offset;
                new_extents--;
        }
        btrfs_release_path(path);

insert:
        /* Insert the new key (cases 1-4). */
        ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0);
        if (ret)
                goto out;

        btrfs_release_path(path);
        /* new_extents is +1, 0 or -1 depending on how many neighbors merged. */
        ret = update_free_space_extent_count(trans, block_group, path,
                                             new_extents);

out:
        return ret;
}

/*
 * Add the range described by @start and @size to the free space recorded for
 * @block_group, dispatching on the representation named by the flags of the
 * block group's free space info item.
 *
 * If the block group still carries BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, its free
 * space is added first through __add_block_group_free_space().
 *
 * @path is released after the flags have been read and is then handed to the
 * bitmap or extent helper.
 *
 * Returns the helper's result, or the error from the deferred add or from the
 * free space info lookup.
 */
EXPORT_FOR_TESTS
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
                             struct btrfs_block_group *block_group,
                             struct btrfs_path *path, u64 start, u64 size)
{
        struct btrfs_free_space_info *fsi;
        u32 fsi_flags;

        if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
                int err = __add_block_group_free_space(trans, block_group, path);

                if (err)
                        return err;
        }

        fsi = search_free_space_info(NULL, block_group, path, 0);
        if (IS_ERR(fsi))
                return PTR_ERR(fsi);

        /* Read the flags while the leaf is still held, then drop the path. */
        fsi_flags = btrfs_free_space_flags(path->nodes[0], fsi);
        btrfs_release_path(path);

        if (!(fsi_flags & BTRFS_FREE_SPACE_USING_BITMAPS))
                return add_free_space_extent(trans, block_group, path, start,
                                             size);

        /* Final argument 0: presumably selects "add" -- see the helper. */
        return modify_free_space_bitmap(trans, block_group, path,
                                        start, size, 0);
}

/*
 * Add the range described by @start and @size to the free space tree.
 *
 * Does nothing and returns 0 when the FREE_SPACE_TREE compat_ro bit is not
 * set. Otherwise looks up the block group containing @start and performs the
 * addition under that block group's free_space_lock.
 *
 * Any non-zero result (including -ENOMEM for the path allocation and -ENOENT
 * for a missing block group) aborts the transaction before being returned.
 */
int add_to_free_space_tree(struct btrfs_trans_handle *trans,
                           u64 start, u64 size)
{
        struct btrfs_block_group *bg;
        struct btrfs_path *path;
        int ret = -ENOMEM;

        if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                goto out;

        bg = btrfs_lookup_block_group(trans->fs_info, start);
        if (!bg) {
                ASSERT(0);
                ret = -ENOENT;
                goto out;
        }

        mutex_lock(&bg->free_space_lock);
        ret = __add_to_free_space_tree(trans, bg, path, start, size);
        mutex_unlock(&bg->free_space_lock);

        /* Drop the reference taken by the lookup. */
        btrfs_put_block_group(bg);
out:
        btrfs_free_path(path);
        if (ret)
                btrfs_abort_transaction(trans, ret);
        return ret;
}

/*
 * Populate the free space tree by walking the extent tree. Operations on the
 * extent tree that happen as a result of writes to the free space tree will go
 * through the normal add/remove hooks.
 *
 * A free space info item is added for @block_group first; then, with the block
 * group's free_space_lock held, every gap between consecutive extent/metadata
 * items inside the block group (and the gap up to the block group's end) is
 * added as free space.
 *
 * Returns 0 on success, -ENOMEM if a path cannot be allocated, or the error
 * from the failing tree operation.
 */
static int populate_free_space_tree(struct btrfs_trans_handle *trans,
                                    struct btrfs_block_group *block_group)
{
        struct btrfs_root *extent_root;
        struct btrfs_path *path, *path2;
        struct btrfs_key key;
        u64 start, end;
        int ret;

        /* path walks the extent tree; path2 is used for free space tree updates. */
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = READA_FORWARD;

        path2 = btrfs_alloc_path();
        if (!path2) {
                btrfs_free_path(path);
                return -ENOMEM;
        }

        /* Done before taking free_space_lock; failure skips the unlock below. */
        ret = add_new_free_space_info(trans, block_group, path2);
        if (ret)
                goto out;

        mutex_lock(&block_group->free_space_lock);

        /*
         * Iterate through all of the extent and metadata items in this block
         * group, adding the free space between them and the free space at the
         * end. Note that EXTENT_ITEM and METADATA_ITEM are less than
         * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's
         * contained in.
         */
        key.objectid = block_group->start;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = 0;

        extent_root = btrfs_extent_root(trans->fs_info, key.objectid);
        ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
        if (ret < 0)
                goto out_locked;
        ASSERT(ret == 0);

        /*
         * start tracks the first byte not yet known to be allocated; it begins
         * at the block group start and is moved past each extent seen.
         */
        start = block_group->start;
        end = block_group->start + block_group->length;
        while (1) {
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
                    key.type == BTRFS_METADATA_ITEM_KEY) {
                        /* First extent at or beyond the block group end: done. */
                        if (key.objectid >= end)
                                break;

                        /* The gap [start, key.objectid) is free space. */
                        if (start < key.objectid) {
                                ret = __add_to_free_space_tree(trans,
                                                               block_group,
                                                               path2, start,
                                                               key.objectid -
                                                               start);
                                if (ret)
                                        goto out_locked;
                        }
                        /*
                         * Skip past this extent: metadata items span one
                         * nodesize, extent items span key.offset bytes.
                         */
                        start = key.objectid;
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
                                start += trans->fs_info->nodesize;
                        else
                                start += key.offset;
                } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        /* A different block group's item ends the walk. */
                        if (key.objectid != block_group->start)
                                break;
                }

                ret = btrfs_next_item(extent_root, path);
                if (ret < 0)
                        goto out_locked;
                /* A positive return stops the walk without being an error. */
                if (ret)
                        break;
        }
        /* Whatever remains up to the block group end is free space. */
        if (start < end) {
                ret = __add_to_free_space_tree(trans, block_group, path2,
                                               start, end - start);
                if (ret)
                        goto out_locked;
        }

        ret = 0;
out_locked:
        mutex_unlock(&block_group->free_space_lock);
out:
        btrfs_free_path(path2);
        btrfs_free_path(path);
        return ret;
}

/*
 * Create the free space tree root and populate it for every block group in
 * fs_info->block_group_cache_tree, all within one transaction.
 *
 * BTRFS_FS_CREATING_FREE_SPACE_TREE and BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED are
 * set for the duration of the work. On success the FREE_SPACE_TREE and
 * FREE_SPACE_TREE_VALID compat_ro bits are set before the commit and the
 * untrusted flag is cleared after it. On failure the transaction is aborted
 * and ended, and both flags are cleared.
 *
 * Returns the commit result on success or the error of the failing step.
 */
int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *free_space_root;
        struct btrfs_trans_handle *trans;
        struct rb_node *node;
        int ret;

        trans = btrfs_start_transaction(fs_info->tree_root, 0);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
        set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);

        free_space_root = btrfs_create_tree(trans,
                                            BTRFS_FREE_SPACE_TREE_OBJECTID);
        if (IS_ERR(free_space_root)) {
                ret = PTR_ERR(free_space_root);
                goto abort;
        }

        ret = btrfs_global_root_insert(free_space_root);
        if (ret) {
                /* The root never became reachable; drop our reference. */
                btrfs_put_root(free_space_root);
                goto abort;
        }

        for (node = rb_first_cached(&fs_info->block_group_cache_tree); node;
             node = rb_next(node)) {
                struct btrfs_block_group *bg;

                bg = rb_entry(node, struct btrfs_block_group, cache_node);
                ret = populate_free_space_tree(trans, bg);
                if (ret)
                        goto abort;
        }

        btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
        btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
        clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
        ret = btrfs_commit_transaction(trans);

        /*
         * Once the transaction is committed, reads of the commit root are
         * safe, so caching from the free space tree may be trusted again.
         */
        clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
        return ret;

abort:
        btrfs_abort_transaction(trans, ret);
        btrfs_end_transaction(trans);
        clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
        clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
        return ret;
}

static int clear_free_space_tree(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        int nr;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = 0;
        key.type = 0;
        key.offset = 0;

        while (1) {
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0)
                        goto out;

                nr = btrfs_header_nritems(path->nodes[0]);
                if (!nr)
                        break;

                path->slots[0] = 0;
                ret = btrfs_del_items(trans, root, path, 0, nr);
                if (ret)
                        goto out;

                btrfs_release_path(path);
        }

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
{
        struct btrfs_trans_handle *trans;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_key key = {
                .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
                .type = BTRFS_ROOT_ITEM_KEY,
                .offset = 0,
        };
        struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key);
        int ret;

        trans = btrfs_start_transaction(tree_root, 0);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
        btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);

        ret = clear_free_space_tree(trans, free_space_root);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                btrfs_end_transaction(trans);
                return ret;
        }

        ret = btrfs_del_root(trans, &free_space_root->root_key);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                btrfs_end_transaction(trans);
                return ret;
        }

        btrfs_global_root_delete(free_space_root);

        spin_lock(&fs_info->trans_lock);
        list_del(&free_space_root->dirty_list);
        spin_unlock(&fs_info->trans_lock);

        btrfs_tree_lock(free_space_root->node);
        btrfs_clear_buffer_dirty(trans, free_space_root->node);
        btrfs_tree_unlock(free_space_root->node);
        btrfs_free_tree_block(trans, btrfs_root_id(free_space_root),
                              free_space_root->node, 0, 1);

        btrfs_put_root(free_space_root);

        return btrfs_commit_transaction(trans);
}

int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
{
        struct btrfs_trans_handle *trans;
        struct btrfs_key key = {
                .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
                .type = BTRFS_ROOT_ITEM_KEY,
                .offset = 0,
        };
        struct btrfs_root *free_space_root = btrfs_global_root(fs_info, &key);
        struct rb_node *node;
        int ret;

        trans = btrfs_start_transaction(free_space_root, 1);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);
        set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);

        ret = clear_free_space_tree(trans, free_space_root);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                btrfs_end_transaction(trans);
                return ret;
        }

        node = rb_first_cached(&fs_info->block_group_cache_tree);
        while (node) {
                struct btrfs_block_group *block_group;

                block_group = rb_entry(node, struct btrfs_block_group,
                                       cache_node);
                ret = populate_free_space_tree(trans, block_group);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        btrfs_end_transaction(trans);
                        return ret;
                }
                node = rb_next(node);
        }

        btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
        btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID);
        clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags);

        ret = btrfs_commit_transaction(trans);
        clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags);
        return ret;
}

/*
 * Perform the deferred free space setup for @block_group: clear
 * BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, add the block group's free space info
 * item, then record the whole block group range as free.
 *
 * The flag is cleared before any tree operation, so it stays cleared even if
 * one of them fails.
 *
 * Returns 0 on success or the error of the first failing step.
 */
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
                                        struct btrfs_block_group *block_group,
                                        struct btrfs_path *path)
{
        int err;

        clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);

        err = add_new_free_space_info(trans, block_group, path);
        if (!err)
                err = __add_to_free_space_tree(trans, block_group, path,
                                               block_group->start,
                                               block_group->length);
        return err;
}

/*
 * Add @block_group to the free space tree if that is still pending.
 *
 * Returns 0 immediately when the FREE_SPACE_TREE compat_ro bit is not set.
 * Otherwise, under the block group's free_space_lock, the deferred add runs
 * only if BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE is still set.
 *
 * A non-zero result (including -ENOMEM for the path allocation) aborts the
 * transaction before being returned.
 */
int add_block_group_free_space(struct btrfs_trans_handle *trans,
                               struct btrfs_block_group *block_group)
{
        struct btrfs_path *path = NULL;
        int ret = 0;

        if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
                return 0;

        mutex_lock(&block_group->free_space_lock);
        if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
                path = btrfs_alloc_path();
                if (path)
                        ret = __add_block_group_free_space(trans, block_group,
                                                           path);
                else
                        ret = -ENOMEM;
        }
        /* path is still NULL here when nothing needed to be done. */
        btrfs_free_path(path);
        mutex_unlock(&block_group->free_space_lock);

        if (ret)
                btrfs_abort_transaction(trans, ret);
        return ret;
}

/*
 * Delete every free space tree item belonging to @block_group: its extent or
 * bitmap items and finally its free space info item.
 *
 * Returns 0 without touching the tree when the FREE_SPACE_TREE compat_ro bit
 * is not set or when the block group was never added to the tree. Any non-zero
 * result (including -ENOMEM for the path allocation) aborts the transaction
 * before being returned.
 */
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
                                  struct btrfs_block_group *block_group)
{
        struct btrfs_root *root = btrfs_free_space_root(block_group);
        struct btrfs_path *path;
        struct btrfs_key key, found_key;
        struct extent_buffer *leaf;
        u64 start, end;
        int done = 0, nr;
        int ret;

        if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
                return 0;

        if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
                /* We never added this block group to the free space tree. */
                return 0;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        start = block_group->start;
        end = block_group->start + block_group->length;

        /* Maximal type and offset for the last byte of the block group. */
        key.objectid = end - 1;
        key.type = (u8)-1;
        key.offset = (u64)-1;

        /*
         * Each pass repeats the same search, then walks backwards through the
         * leaf it lands in, counting this block group's items, and deletes
         * them in one batch. The pass that reaches the free space info item is
         * the last one.
         */
        while (!done) {
                ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
                if (ret)
                        goto out;

                leaf = path->nodes[0];
                nr = 0;
                /*
                 * Bias the slot by one so the item examined is always at
                 * slots[0] - 1 and the loop can stop at slot 0.
                 */
                path->slots[0]++;
                while (path->slots[0] > 0) {
                        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);

                        if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
                                ASSERT(found_key.objectid == block_group->start);
                                ASSERT(found_key.offset == block_group->length);
                                /* Info item reached: include it and finish. */
                                done = 1;
                                nr++;
                                path->slots[0]--;
                                break;
                        } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY ||
                                   found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
                                ASSERT(found_key.objectid >= start);
                                ASSERT(found_key.objectid < end);
                                ASSERT(found_key.objectid + found_key.offset <= end);
                                nr++;
                                path->slots[0]--;
                        } else {
                                /*
                                 * NOTE(review): unexpected key type. This
                                 * branch does not change path->slots[0], so
                                 * forward progress relies on ASSERT(0) not
                                 * returning -- confirm for builds without
                                 * assertions.
                                 */
                                ASSERT(0);
                        }
                }

                /* slots[0] now names the lowest slot counted; delete nr items from it. */
                ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
                if (ret)
                        goto out;
                btrfs_release_path(path);
        }

        ret = 0;
out:
        btrfs_free_path(path);
        if (ret)
                btrfs_abort_transaction(trans, ret);
        return ret;
}

/*
 * Load a block group's free space from its bitmap items.
 *
 * Iterates the items following the current position of @path until a free
 * space info item or the end of the tree is reached. Every bitmap is scanned
 * in steps of fs_info->sectorsize; each run of set bits is passed to
 * btrfs_add_new_free_space(). The number of runs found must equal
 * @expected_extent_count.
 *
 * Returns 0 on success, -EIO on an extent count mismatch, or the error from
 * btrfs_next_item() or btrfs_add_new_free_space().
 */
static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
                                   struct btrfs_path *path,
                                   u32 expected_extent_count)
{
        struct btrfs_block_group *block_group;
        struct btrfs_fs_info *fs_info;
        struct btrfs_root *root;
        struct btrfs_key key;
        /* prev_bit persists across bitmap items, so a run may span several. */
        int prev_bit = 0, bit;
        /* Initialize to silence GCC. */
        u64 extent_start = 0;
        u64 end, offset;
        u64 total_found = 0;
        u32 extent_count = 0;
        int ret;

        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;
        root = btrfs_free_space_root(block_group);

        end = block_group->start + block_group->length;

        while (1) {
                ret = btrfs_next_item(root, path);
                if (ret < 0)
                        goto out;
                /* Positive return: no further item to visit. */
                if (ret)
                        break;

                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

                /* A free space info item ends this block group's items. */
                if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
                        break;

                ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
                ASSERT(key.objectid < end && key.objectid + key.offset <= end);

                offset = key.objectid;
                while (offset < key.objectid + key.offset) {
                        bit = free_space_test_bit(block_group, path, offset);
                        if (prev_bit == 0 && bit == 1) {
                                /* 0 -> 1: a free run begins here. */
                                extent_start = offset;
                        } else if (prev_bit == 1 && bit == 0) {
                                /* 1 -> 0: the run [extent_start, offset) ended. */
                                u64 space_added;

                                ret = btrfs_add_new_free_space(block_group,
                                                               extent_start,
                                                               offset,
                                                               &space_added);
                                if (ret)
                                        goto out;
                                total_found += space_added;
                                /*
                                 * Wake waiters on caching_ctl->wait each time
                                 * more than CACHING_CTL_WAKE_UP has been added
                                 * since the last wake-up.
                                 */
                                if (total_found > CACHING_CTL_WAKE_UP) {
                                        total_found = 0;
                                        wake_up(&caching_ctl->wait);
                                }
                                extent_count++;
                        }
                        prev_bit = bit;
                        offset += fs_info->sectorsize;
                }
        }
        /* A run still open after the last bit extends to the block group end. */
        if (prev_bit == 1) {
                ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL);
                if (ret)
                        goto out;
                extent_count++;
        }

        if (extent_count != expected_extent_count) {
                btrfs_err(fs_info,
                          "incorrect extent count for %llu; counted %u, expected %u",
                          block_group->start, extent_count,
                          expected_extent_count);
                ASSERT(0);
                ret = -EIO;
                goto out;
        }

        ret = 0;
out:
        return ret;
}

/*
 * Load a block group's free space from its extent items.
 *
 * Iterates the items following the current position of @path until a free
 * space info item or the end of the tree is reached, passing each extent
 * (key.objectid up to key.objectid + key.offset) to btrfs_add_new_free_space().
 * The number of extents seen must equal @expected_extent_count.
 *
 * Returns 0 on success, -EIO on an extent count mismatch, or the error from
 * btrfs_next_item() or btrfs_add_new_free_space().
 */
static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
                                   struct btrfs_path *path,
                                   u32 expected_extent_count)
{
        struct btrfs_block_group *bg = caching_ctl->block_group;
        struct btrfs_fs_info *fs_info = bg->fs_info;
        struct btrfs_root *root = btrfs_free_space_root(bg);
        u64 bg_end = bg->start + bg->length;
        u64 found_since_wake = 0;
        u32 nr_extents = 0;
        int ret;

        for (;;) {
                struct btrfs_key key;
                u64 space_added;

                ret = btrfs_next_item(root, path);
                if (ret < 0)
                        return ret;
                /* Positive return: no further item to visit. */
                if (ret > 0)
                        break;

                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

                /* A free space info item ends this block group's items. */
                if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
                        break;

                ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
                ASSERT(key.objectid < bg_end && key.objectid + key.offset <= bg_end);

                ret = btrfs_add_new_free_space(bg, key.objectid,
                                               key.objectid + key.offset,
                                               &space_added);
                if (ret)
                        return ret;

                /*
                 * Wake waiters on caching_ctl->wait each time more than
                 * CACHING_CTL_WAKE_UP has been added since the last wake-up.
                 */
                found_since_wake += space_added;
                if (found_since_wake > CACHING_CTL_WAKE_UP) {
                        found_since_wake = 0;
                        wake_up(&caching_ctl->wait);
                }
                nr_extents++;
        }

        if (nr_extents == expected_extent_count)
                return 0;

        btrfs_err(fs_info,
                  "incorrect extent count for %llu; counted %u, expected %u",
                  bg->start, nr_extents,
                  expected_extent_count);
        ASSERT(0);
        return -EIO;
}

/*
 * Load the free space of caching_ctl->block_group from the free space tree.
 *
 * The path searches the commit root without locking and with forward
 * readahead. The free space info item supplies the expected extent count and
 * the flags that decide between the bitmap and the extent loader.
 *
 * Returns the loader's result, -ENOMEM if no path could be allocated, or the
 * error from the free space info lookup.
 */
int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
{
        struct btrfs_block_group *bg = caching_ctl->block_group;
        struct btrfs_free_space_info *fsi;
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * As with caching_thread() and the extent tree, avoid deadlocking on
         * the free space tree.
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
        path->reada = READA_FORWARD;

        fsi = search_free_space_info(NULL, bg, path, 0);
        if (IS_ERR(fsi)) {
                ret = PTR_ERR(fsi);
        } else {
                u32 extent_count = btrfs_free_space_extent_count(path->nodes[0], fsi);
                u32 flags = btrfs_free_space_flags(path->nodes[0], fsi);

                /*
                 * The path still points at the free space info item, so the
                 * loaders simply iterate forward from there.
                 */
                if (flags & BTRFS_FREE_SPACE_USING_BITMAPS)
                        ret = load_free_space_bitmaps(caching_ctl, path,
                                                      extent_count);
                else
                        ret = load_free_space_extents(caching_ctl, path,
                                                      extent_count);
        }

        btrfs_free_path(path);
        return ret;
}












































    2 
    2 




































    1 

    4 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_POLL_H
#define _LINUX_POLL_H


#include <linux/compiler.h>
#include <linux/ktime.h>
#include <linux/wait.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <uapi/linux/poll.h>
#include <uapi/linux/eventpoll.h>

/*
 * ~832 bytes of stack space used max in sys_select/sys_poll before allocating
 * additional memory.
 *
 * MAX_STACK_ALLOC is the total budget. FRONTEND_STACK_ALLOC (aliased as
 * SELECT_STACK_ALLOC and POLL_STACK_ALLOC) is the share taken out of it first;
 * the remainder, WQUEUES_STACK_ALLOC, determines how many poll_table_entry
 * structures fit inline (N_INLINE_POLL_ENTRIES, which sizes
 * poll_wqueues.inline_entries).
 */
#define MAX_STACK_ALLOC 832
#define FRONTEND_STACK_ALLOC        256
#define SELECT_STACK_ALLOC        FRONTEND_STACK_ALLOC
#define POLL_STACK_ALLOC        FRONTEND_STACK_ALLOC
#define WQUEUES_STACK_ALLOC        (MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC)
#define N_INLINE_POLL_ENTRIES        (WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry))

/* Event mask vfs_poll() reports for files that have no ->poll method. */
#define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)

struct poll_table_struct;

/*
 * Structures and helpers for f_op->poll implementations.
 *
 * poll_queue_proc is the callback type stored in poll_table::_qproc;
 * poll_wait() invokes it with the file, the wait queue head and the poll
 * table itself.
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

/*
 * Do not touch the structure directly, use the access functions
 * poll_does_not_wait() and poll_requested_events() instead.
 *
 * _qproc: callback invoked by poll_wait(); NULL means poll will not wait.
 * _key:   event mask returned by poll_requested_events(); init_poll_funcptr()
 *         sets it to all ones (all events enabled).
 */
typedef struct poll_table_struct {
        poll_queue_proc _qproc;
        __poll_t _key;
} poll_table;

/*
 * Pass @wait_address to the queueing callback of poll table @p.
 *
 * Nothing happens unless @p, its _qproc callback and @wait_address are all
 * non-NULL; otherwise the callback is invoked as _qproc(@filp, @wait_address, @p).
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
        /* Bail out when there is no table, no callback or no queue. */
        if (!p || !p->_qproc || !wait_address)
                return;

        p->_qproc(filp, wait_address, p);
}

/*
 * Report whether poll is guaranteed not to wait, i.e. @p is NULL or carries
 * no queueing callback. That is the situation once the poll() of another
 * file descriptor in the set has already produced an event, so waiting is
 * no longer needed.
 */
static inline bool poll_does_not_wait(const poll_table *p)
{
        /* Waiting is only possible with both a table and a callback. */
        return !(p && p->_qproc);
}

/*
 * Return the set of events the application wants to poll for: the table's
 * _key, or every event bit set when @p is NULL.
 *
 * Drivers can use this to decide whether a DMA transfer has to be started
 * implicitly on poll(); typically that is only wanted when the application
 * is actually polling for POLLIN and/or POLLOUT.
 */
static inline __poll_t poll_requested_events(const poll_table *p)
{
        /* No table: treat all events as requested. */
        if (!p)
                return ~(__poll_t)0;

        return p->_key;
}

/*
 * Initialise poll table @pt: install @qproc as its queueing callback and
 * request every event (all bits of _key set).
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
        /* all events enabled */
        pt->_key = ~(__poll_t)0;
        pt->_qproc = qproc;
}

/*
 * Return true when @file's file operations provide a ->poll method.
 */
static inline bool file_can_poll(struct file *file)
{
        /* Collapse the method pointer to a plain truth value. */
        return !!file->f_op->poll;
}

/*
 * Poll @file through its ->poll method, passing @pt along.
 *
 * Returns whatever ->poll returns, or DEFAULT_POLLMASK when the file's
 * f_op has no ->poll method.
 */
static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
        __poll_t events;

        if (unlikely(!file->f_op->poll))
                events = DEFAULT_POLLMASK;
        else
                events = file->f_op->poll(file, pt);

        return events;
}

/*
 * One poll wait-queue record. struct poll_wqueues embeds
 * N_INLINE_POLL_ENTRIES of these in its inline_entries[] array, and
 * sizeof(struct poll_table_entry) determines that count.
 *
 * NOTE(review): nothing in this header reads or writes these fields; the
 * per-field roles are to be confirmed against the select/poll implementation.
 */
struct poll_table_entry {
        struct file *filp;
        __poll_t key;
        wait_queue_entry_t wait;
        wait_queue_head_t *wait_address;
};

/*
 * Structures and helpers for select/poll syscall
 *
 * struct poll_wqueues bundles a poll_table with inline storage for
 * N_INLINE_POLL_ENTRIES poll_table_entry records (sized from
 * WQUEUES_STACK_ALLOC). It is the object taken by poll_initwait() and
 * poll_freewait().
 *
 * NOTE(review): the roles of table, polling_task, triggered, error and
 * inline_index are not visible in this header — confirm against the
 * implementation before relying on them.
 */
struct poll_wqueues {
        poll_table pt;
        struct poll_table_page *table;
        struct task_struct *polling_task;
        int triggered;
        int error;
        int inline_index;
        /* Inline entry storage; see N_INLINE_POLL_ENTRIES for the sizing. */
        struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

/*
 * Helpers implemented outside this header.
 * NOTE(review): by name, poll_initwait()/poll_freewait() set up and tear
 * down a struct poll_wqueues — confirm against their definitions.
 */
extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
extern u64 select_estimate_accuracy(struct timespec64 *tv);

/* Largest positive s64 value divided by HZ, minus one. */
#define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)

extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                           fd_set __user *exp, struct timespec64 *end_time);

extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec,
                                   long nsec);

/*
 * __MAP(v, from, to) - move the bits of @v selected by mask @from to the
 * position of mask @to.
 *
 * When @from < @to the masked value is multiplied by @to/@from, otherwise it
 * is divided by @from/@to; equal masks leave the bit where it is. The result
 * is 0 when @v has no bit of @from set. Presumably @from and @to are
 * single-bit masks, so the scaling amounts to a shift — confirm for any new
 * user.
 *
 * Every parameter is parenthesised so that compound arguments (e.g. "a | b")
 * expand with the intended precedence. Arguments are evaluated more than
 * once, so they must be free of side effects.
 */
#define __MAP(v, from, to) \
        ((from) < (to) ? ((v) & (from)) * ((to)/(from)) : ((v) & (from)) / ((from)/(to)))

/*
 * Translate an event mask in the EPOLL* bit layout into the 16-bit POLL*
 * layout. Only the flags listed below are carried over; each one is moved
 * from its EPOLL<X> position to its POLL<X> position with __MAP().
 */
static inline __u16 mangle_poll(__poll_t val)
{
        __u16 v = (__force __u16)val;
        __u16 mangled = 0;

#define M(X) __MAP(v, (__force __u16)EPOLL##X, POLL##X)
        mangled |= M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL);
        mangled |= M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND);
        mangled |= M(HUP) | M(RDHUP) | M(MSG);
#undef M
        return mangled;
}

/*
 * Inverse of mangle_poll(): translate a 16-bit mask in the POLL* bit layout
 * into the EPOLL* layout. Only the flags listed below are carried over; each
 * one is moved from its POLL<X> position to its EPOLL<X> position with
 * __MAP().
 */
static inline __poll_t demangle_poll(u16 val)
{
        __poll_t demangled = 0;

#define M(X) (__force __poll_t)__MAP(val, POLL##X, (__force __u16)EPOLL##X)
        demangled |= M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL);
        demangled |= M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND);
        demangled |= M(HUP) | M(RDHUP) | M(MSG);
#undef M
        return demangled;
}
#undef __MAP


#endif /* _LINUX_POLL_H */






































    3 
































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ipi

#if !defined(_TRACE_IPI_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IPI_H

#include <linux/tracepoint.h>

/**
 * ipi_raise - called when a smp cross call is made
 *
 * @mask: mask of recipient CPUs for the IPI
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string.
 *
 * The event stores only the @reason pointer, not a copy of the string, while
 * @mask is copied into the event as a bitmask of nr_cpumask_bits bits.
 * Output format: "target_mask=<mask> (<reason>)".
 */
TRACE_EVENT(ipi_raise,

        TP_PROTO(const struct cpumask *mask, const char *reason),

        TP_ARGS(mask, reason),

        TP_STRUCT__entry(
                __bitmask(target_cpus, nr_cpumask_bits)
                __field(const char *, reason)
        ),

        TP_fast_assign(
                __assign_bitmask(target_cpus, cpumask_bits(mask), nr_cpumask_bits);
                __entry->reason = reason;
        ),

        TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason)
);

/**
 * ipi_send_cpu - trace event recording an IPI aimed at one CPU
 *
 * @cpu: CPU number stored in the event
 * @callsite: address stored as the call site (cast to a pointer for printing)
 * @callback: function pointer stored as the callback
 *
 * Output format: "cpu=<n> callsite=<symbol> callback=<symbol>"; both
 * addresses are printed with %pS.
 *
 * NOTE(review): the exact point at which callers emit this event is not
 * visible in this header.
 */
TRACE_EVENT(ipi_send_cpu,

        TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback),

        TP_ARGS(cpu, callsite, callback),

        TP_STRUCT__entry(
                __field(unsigned int, cpu)
                __field(void *, callsite)
                __field(void *, callback)
        ),

        TP_fast_assign(
                __entry->cpu = cpu;
                __entry->callsite = (void *)callsite;
                __entry->callback = callback;
        ),

        TP_printk("cpu=%u callsite=%pS callback=%pS",
                  __entry->cpu, __entry->callsite, __entry->callback)
);

/**
 * ipi_send_cpumask - trace event recording an IPI aimed at a set of CPUs
 *
 * @cpumask: CPU mask copied into the event
 * @callsite: address stored as the call site (cast to a pointer for printing)
 * @callback: function pointer stored as the callback
 *
 * Output format: "cpumask=<mask> callsite=<symbol> callback=<symbol>"; both
 * addresses are printed with %pS.
 *
 * NOTE(review): the exact point at which callers emit this event is not
 * visible in this header.
 */
TRACE_EVENT(ipi_send_cpumask,

        TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback),

        TP_ARGS(cpumask, callsite, callback),

        TP_STRUCT__entry(
                __cpumask(cpumask)
                __field(void *, callsite)
                __field(void *, callback)
        ),

        TP_fast_assign(
                __assign_cpumask(cpumask, cpumask_bits(cpumask));
                __entry->callsite = (void *)callsite;
                __entry->callback = callback;
        ),

        TP_printk("cpumask=%s callsite=%pS callback=%pS",
                  __get_cpumask(cpumask), __entry->callsite, __entry->callback)
);

/*
 * ipi_handler - event class shared by the ipi_entry and ipi_exit events.
 *
 * Stores only the @reason pointer (the string itself is not copied) and
 * prints it as "(<reason>)".
 */
DECLARE_EVENT_CLASS(ipi_handler,

        TP_PROTO(const char *reason),

        TP_ARGS(reason),

        TP_STRUCT__entry(
                __field(const char *, reason)
        ),

        TP_fast_assign(
                __entry->reason = reason;
        ),

        TP_printk("(%s)", __entry->reason)
);

/**
 * ipi_entry - called immediately before the IPI handler
 *
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string, ideally the same as used with trace_ipi_raise
 * for that IPI.
 *
 * Defined as an instance of the ipi_handler event class, so it records and
 * prints @reason exactly as that class does.
 */
DEFINE_EVENT(ipi_handler, ipi_entry,

        TP_PROTO(const char *reason),

        TP_ARGS(reason)
);

/**
 * ipi_exit - called immediately after the IPI handler returns
 *
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string, ideally the same as used with trace_ipi_raise for
 * that IPI.
 *
 * Defined as an instance of the ipi_handler event class, so it records and
 * prints @reason exactly as that class does.
 */
DEFINE_EVENT(ipi_handler, ipi_exit,

        TP_PROTO(const char *reason),

        TP_ARGS(reason)
);

#endif /* _TRACE_IPI_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 


































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
// SPDX-License-Identifier: GPL-2.0
/*
 * ring buffer based function tracer
 *
 * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
 *
 * Originally taken from the RT patch by:
 *    Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Based on code from the latency_tracer, that is:
 *  Copyright (C) 2004-2006 Ingo Molnar
 *  Copyright (C) 2004 Nadia Yvette Chambers
 */
#include <linux/ring_buffer.h>
#include <linux/utsname.h>
#include <linux/stacktrace.h>
#include <linux/writeback.h>
#include <linux/kallsyms.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/irqflags.h>
#include <linux/debugfs.h>
#include <linux/tracefs.h>
#include <linux/pagemap.h>
#include <linux/hardirq.h>
#include <linux/linkage.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/splice.h>
#include <linux/kdebug.h>
#include <linux/string.h>
#include <linux/mount.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/panic_notifier.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
#include <linux/trace.h>
#include <linux/sched/clock.h>
#include <linux/sched/rt.h>
#include <linux/fsnotify.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>

#include <asm/setup.h> /* COMMAND_LINE_SIZE */

#include "trace.h"
#include "trace_output.h"

#ifdef CONFIG_FTRACE_STARTUP_TEST
/*
 * We need to change this state when a selftest is running.
 * A selftest will look into the ring buffer to count the
 * entries inserted during the selftest, but concurrent insertions
 * into the ring buffer, such as from trace_printk, could occur
 * at the same time, giving false positive or negative results.
 */
static bool __read_mostly tracing_selftest_running;

/*
 * If boot-time tracing including tracers/events via kernel cmdline
 * is running, we do not want to run SELFTEST.
 */
bool __read_mostly tracing_selftest_disabled;

/*
 * Turn the startup selftests off. The reason is logged only on the call
 * that actually flips the flag; later calls are silent no-ops.
 */
void __init disable_tracing_selftest(const char *reason)
{
        /* Already disabled: nothing to change and nothing to log again */
        if (tracing_selftest_disabled)
                return;

        tracing_selftest_disabled = true;
        pr_info("Ftrace startup test is disabled due to %s\n", reason);
}
#else
#define tracing_selftest_running        0
#define tracing_selftest_disabled        0
#endif

/* Pipe tracepoints to printk */
static struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
static bool tracepoint_printk_stop_on_boot __initdata;
static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);

/* For tracers that don't implement custom flags */
static struct tracer_opt dummy_tracer_opt[] = {
        { }
};

/*
 * No-op set_flag handler that goes with dummy_tracer_opt: every
 * argument is ignored and 0 is always returned.
 */
static int
dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
        return 0;
}

/*
 * To prevent the comm cache from being overwritten when no
 * tracing is active, only save the comm when a trace event
 * occurred.
 */
DEFINE_PER_CPU(bool, trace_taskinfo_save);

/*
 * Kill all tracing for good (never come back).
 * It is initialized to 1 but will turn to zero if the initialization
 * of the tracer is successful. But that is the only place that sets
 * this back to zero.
 */
static int tracing_disabled = 1;

cpumask_var_t __read_mostly        tracing_buffer_mask;

/*
 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
 *
 * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
 * is set, then ftrace_dump is called. This will output the contents
 * of the ftrace buffers to the console.  This is very useful for
 * capturing traces that lead to crashes and outputting them to a
 * serial console.
 *
 * It is default off, but you can enable it with either specifying
 * "ftrace_dump_on_oops" in the kernel command line, or setting
 * /proc/sys/kernel/ftrace_dump_on_oops
 * Set 1 if you want to dump buffers of all CPUs
 * Set 2 if you want to dump the buffer of the CPU that triggered oops
 * Set instance name if you want to dump the specific trace instance
 * Dumping multiple instances is also supported; instance names are
 * separated by commas.
 */
/* Set to string format zero to disable by default */
char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0";

/* When set, tracing will stop when a WARN*() is hit */
int __disable_trace_on_warning;

#ifdef CONFIG_TRACE_EVAL_MAP_FILE
/* Map of enums to their values, for "eval_map" file */
struct trace_eval_map_head {
        struct module                        *mod;
        unsigned long                        length;
};

union trace_eval_map_item;

struct trace_eval_map_tail {
        /*
         * "end" is first and points to NULL as it must be different
         * than "mod" or "eval_string"
         */
        union trace_eval_map_item        *next;
        const char                        *end;        /* points to NULL */
};

static DEFINE_MUTEX(trace_eval_mutex);

/*
 * The trace_eval_maps are saved in an array with two extra elements,
 * one at the beginning, and one at the end. The beginning item contains
 * the count of the saved maps (head.length), and the module they
 * belong to if not built in (head.mod). The ending item contains a
 * pointer to the next array of saved eval_map items.
 */
union trace_eval_map_item {
        struct trace_eval_map                map;
        struct trace_eval_map_head        head;
        struct trace_eval_map_tail        tail;
};

static union trace_eval_map_item *trace_eval_maps;
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */

int tracing_set_tracer(struct trace_array *tr, const char *buf);
static void ftrace_trace_userstack(struct trace_array *tr,
                                   struct trace_buffer *buffer,
                                   unsigned int trace_ctx);

/* Storage for the tracer name passed with "ftrace=" on the command line */
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
/* Set to bootup_tracer_buf once "ftrace=" has been parsed */
static char *default_bootup_tracer;

/* Set by "alloc_snapshot" / "ftrace_boot_snapshot" given without "=<value>" */
static bool allocate_snapshot;
/* Set whenever "ftrace_boot_snapshot" appears on the command line */
static bool snapshot_at_boot;

/* "trace_instance=" values, each followed by a tab, in the order parsed */
static char boot_instance_info[COMMAND_LINE_SIZE] __initdata;
/* Offset into boot_instance_info where the next value is appended */
static int boot_instance_index;

/* "alloc_snapshot=" / "ftrace_boot_snapshot=" values, each followed by a tab */
static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata;
/* Offset into boot_snapshot_info where the next value is appended */
static int boot_snapshot_index;

/*
 * Handler for the "ftrace=" boot parameter: copy the requested tracer
 * name (truncated to MAX_TRACER_SIZE) into bootup_tracer_buf, point
 * default_bootup_tracer at it and flag the top-level trace array's ring
 * buffer as expanded. Always returns 1.
 */
static int __init set_cmdline_ftrace(char *str)
{
        strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
        default_bootup_tracer = bootup_tracer_buf;
        /* We are using ftrace early, expand it */
        trace_set_ring_buffer_expanded(NULL);
        return 1;
}
__setup("ftrace=", set_cmdline_ftrace);

/*
 * Tell whether a dump on oops has been requested: returns 0 when the
 * ftrace_dump_on_oops setting is exactly the string "0", 1 otherwise.
 */
int ftrace_dump_on_oops_enabled(void)
{
        return strcmp("0", ftrace_dump_on_oops) != 0;
}

/*
 * Handler for the "ftrace_dump_on_oops" boot parameter. @str is the text
 * that follows the parameter name:
 *
 *   ""          - store "1"
 *   ",<rest>"   - store "1" immediately followed by ",<rest>"
 *   "=<value>"  - store "<value>"
 *
 * Returns 1 for the forms above and 0 for anything else.
 */
static int __init set_ftrace_dump_on_oops(char *str)
{
        switch (*str) {
        case '\0':
                strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE);
                return 1;
        case ',':
                /* Keep the comma-led remainder right behind the leading "1" */
                strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE);
                strscpy(ftrace_dump_on_oops + 1, str, MAX_TRACER_SIZE - 1);
                return 1;
        case '=':
                /* Skip the '=' itself and keep only the value */
                strscpy(ftrace_dump_on_oops, str + 1, MAX_TRACER_SIZE);
                return 1;
        default:
                return 0;
        }
}
__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);

/*
 * Handler for the "traceoff_on_warning" boot parameter. The option is
 * switched on unless the text after the name is exactly "=0" or "=off".
 * Always returns 1.
 */
static int __init stop_trace_on_warning(char *str)
{
        /* Explicitly turned off: leave the flag untouched */
        if (!strcmp(str, "=0") || !strcmp(str, "=off"))
                return 1;

        __disable_trace_on_warning = 1;
        return 1;
}
__setup("traceoff_on_warning", stop_trace_on_warning);

/*
 * Handler for the "alloc_snapshot" boot parameter (also called by
 * boot_snapshot() for "ftrace_boot_snapshot").
 *
 * With "=<value>", "<value>\t" is appended to boot_snapshot_info and
 * boot_snapshot_index is advanced past it. Without a value,
 * allocate_snapshot is set and the top-level ring buffer is flagged as
 * expanded.
 *
 * Returns 1 when handled, or -1 when <value>, its tab separator and the
 * terminating NUL do not all fit in the space left in boot_snapshot_info.
 */
static int __init boot_alloc_snapshot(char *str)
{
        char *slot = boot_snapshot_info + boot_snapshot_index;
        int left = sizeof(boot_snapshot_info) - boot_snapshot_index;
        int ret;

        if (str[0] == '=') {
                str++;
                /*
                 * snprintf() below needs strlen(str) + 2 bytes: the value,
                 * the '\t' separator and the NUL. Checking only for the
                 * value would let the separator be truncated while the
                 * returned length still counts it.
                 */
                if (strlen(str) + 1 >= left)
                        return -1;

                ret = snprintf(slot, left, "%s\t", str);
                boot_snapshot_index += ret;
        } else {
                allocate_snapshot = true;
                /* We also need the main ring buffer expanded */
                trace_set_ring_buffer_expanded(NULL);
        }
        return 1;
}
__setup("alloc_snapshot", boot_alloc_snapshot);


/*
 * Handler for the "ftrace_boot_snapshot" boot parameter: record that the
 * parameter was given, then pass @str on to boot_alloc_snapshot(). The
 * result of that call is not checked; this handler always returns 1.
 */
static int __init boot_snapshot(char *str)
{
        snapshot_at_boot = true;
        boot_alloc_snapshot(str);
        return 1;
}
__setup("ftrace_boot_snapshot", boot_snapshot);


/*
 * Handler for the "trace_instance=" boot parameter: append "<str>\t" to
 * boot_instance_info and advance boot_instance_index past it.
 *
 * Returns 1 when stored, or -1 when @str, its tab separator and the
 * terminating NUL do not all fit in the space left in boot_instance_info.
 */
static int __init boot_instance(char *str)
{
        char *slot = boot_instance_info + boot_instance_index;
        int left = sizeof(boot_instance_info) - boot_instance_index;
        int ret;

        /*
         * snprintf() below needs strlen(str) + 2 bytes: the value, the
         * '\t' separator and the NUL. Checking only for the value would
         * let the separator be truncated while the returned length still
         * counts it.
         */
        if (strlen(str) + 1 >= left)
                return -1;

        ret = snprintf(slot, left, "%s\t", str);
        boot_instance_index += ret;

        return 1;
}
__setup("trace_instance=", boot_instance);


static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;

/*
 * Handler for the "trace_options=" boot parameter: keep a copy of the
 * option string (truncated to MAX_TRACER_SIZE) in trace_boot_options_buf.
 * Always returns 1.
 */
static int __init set_trace_boot_options(char *str)
{
        strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
        return 1;
}
__setup("trace_options=", set_trace_boot_options);

static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata;
static char *trace_boot_clock __initdata;

/*
 * Handler for the "trace_clock=" boot parameter: copy the clock name
 * (truncated to MAX_TRACER_SIZE) into trace_boot_clock_buf and point
 * trace_boot_clock at it. Always returns 1.
 */
static int __init set_trace_boot_clock(char *str)
{
        strscpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
        trace_boot_clock = trace_boot_clock_buf;
        return 1;
}
__setup("trace_clock=", set_trace_boot_clock);

/*
 * Handler for the "tp_printk" boot parameter. tracepoint_printk is set
 * unless the text after the name is exactly "=0" or "=off".
 *
 * Returns 0 (not handled) when the text starts with '_', 1 otherwise.
 */
static int __init set_tracepoint_printk(char *str)
{
        /* Ignore the "tp_printk_stop_on_boot" param */
        if (*str == '_')
                return 0;

        /* Explicitly turned off: leave the flag untouched */
        if (!strcmp(str, "=0") || !strcmp(str, "=off"))
                return 1;

        tracepoint_printk = 1;
        return 1;
}
__setup("tp_printk", set_tracepoint_printk);

/*
 * Handler for the "tp_printk_stop_on_boot" boot parameter: only records
 * that the parameter was given (@str is not inspected). Always returns 1.
 */
static int __init set_tracepoint_printk_stop(char *str)
{
        tracepoint_printk_stop_on_boot = true;
        return 1;
}
__setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop);

/*
 * Convert nanoseconds to microseconds. 500 is added before the division
 * by 1000 so the result is rounded to the nearest microsecond instead of
 * being truncated.
 */
unsigned long long ns2usecs(u64 nsec)
{
        nsec += 500;
        do_div(nsec, 1000);
        return nsec;
}

/*
 * Hand one ring buffer event to @export, but only if @export asked for
 * this kind of event (some bit of @flag is set in export->flags). The
 * export's write() callback receives the event payload and its length.
 */
static void
trace_process_export(struct trace_export *export,
               struct ring_buffer_event *event, int flag)
{
        struct trace_entry *payload;
        unsigned int len;

        /* This export is not interested in this event type */
        if (!(export->flags & flag))
                return;

        payload = ring_buffer_event_data(event);
        len = ring_buffer_event_length(event);
        export->write(export, payload, len);
}

/* Held while exports are added to or removed from ftrace_exports_list */
static DEFINE_MUTEX(ftrace_export_lock);

/* Head of the list of registered exports, walked by ftrace_exports() */
static struct trace_export __rcu *ftrace_exports_list __read_mostly;

/*
 * One static key per export type. Each key is incremented when an export
 * carrying the matching TRACE_EXPORT_* flag is enabled and decremented
 * when such an export is disabled.
 */
static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled);
static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled);
static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled);

/*
 * Increment the static key of every export type (function, event,
 * marker) that @export has requested in its flags.
 */
static inline void ftrace_exports_enable(struct trace_export *export)
{
        if (export->flags & TRACE_EXPORT_FUNCTION)
                static_branch_inc(&trace_function_exports_enabled);

        if (export->flags & TRACE_EXPORT_EVENT)
                static_branch_inc(&trace_event_exports_enabled);

        if (export->flags & TRACE_EXPORT_MARKER)
                static_branch_inc(&trace_marker_exports_enabled);
}

/*
 * Counterpart of ftrace_exports_enable(): decrement the static key of
 * every export type (function, event, marker) set in @export's flags.
 */
static inline void ftrace_exports_disable(struct trace_export *export)
{
        if (export->flags & TRACE_EXPORT_FUNCTION)
                static_branch_dec(&trace_function_exports_enabled);

        if (export->flags & TRACE_EXPORT_EVENT)
                static_branch_dec(&trace_event_exports_enabled);

        if (export->flags & TRACE_EXPORT_MARKER)
                static_branch_dec(&trace_marker_exports_enabled);
}

/*
 * Offer @event to every registered export. The list is walked with
 * preemption disabled; each export decides via @flag whether it wants
 * the event (see trace_process_export()).
 */
static void ftrace_exports(struct ring_buffer_event *event, int flag)
{
        struct trace_export *cur;

        preempt_disable_notrace();

        for (cur = rcu_dereference_raw_check(ftrace_exports_list);
             cur;
             cur = rcu_dereference_raw_check(cur->next))
                trace_process_export(cur, event, flag);

        preempt_enable_notrace();
}

/*
 * Insert @export at the head of @list. The new entry's next pointer is
 * published before the list head is switched over to it.
 */
static inline void
add_trace_export(struct trace_export **list, struct trace_export *export)
{
        rcu_assign_pointer(export->next, *list);
        /*
         * We are entering export into the list but another
         * CPU might be walking that list. We need to make sure
         * the export->next pointer is valid before another CPU sees
         * the export pointer included into the list.
         */
        rcu_assign_pointer(*list, export);
}

/*
 * Unlink @export from @list.
 *
 * Returns 0 when @export was found and unlinked, -1 when it is not on
 * the list.
 */
static inline int
rm_trace_export(struct trace_export **list, struct trace_export *export)
{
        struct trace_export **p;

        /* Walk the link pointers so the match can be unlinked in place */
        for (p = list; *p != NULL; p = &(*p)->next)
                if (*p == export)
                        break;

        /* Reached the end of the list without finding @export */
        if (*p != export)
                return -1;

        rcu_assign_pointer(*p, (*p)->next);

        return 0;
}

/*
 * Enable the static keys matching @export's flags, then link @export
 * into @list.
 */
static inline void
add_ftrace_export(struct trace_export **list, struct trace_export *export)
{
        ftrace_exports_enable(export);

        add_trace_export(list, export);
}

/*
 * Unlink @export from @list and drop the static key counts that were
 * taken for it when it was added.
 *
 * Returns 0 when @export was found and removed, -1 when it was not on
 * the list. In the latter case the static keys are left alone: nothing
 * was taken for an export that was never added, so decrementing them
 * would unbalance the counts held on behalf of other exports.
 */
static inline int
rm_ftrace_export(struct trace_export **list, struct trace_export *export)
{
        int ret;

        ret = rm_trace_export(list, export);
        if (!ret)
                ftrace_exports_disable(export);

        return ret;
}

/*
 * Register @export so that it is offered trace data. The export must
 * provide a write() callback.
 *
 * Returns 0 on success, or -1 (with a one-time warning) when
 * export->write is NULL.
 */
int register_ftrace_export(struct trace_export *export)
{
        if (WARN_ON_ONCE(!export->write))
                return -1;

        mutex_lock(&ftrace_export_lock);

        add_ftrace_export(&ftrace_exports_list, export);

        mutex_unlock(&ftrace_export_lock);

        return 0;
}
EXPORT_SYMBOL_GPL(register_ftrace_export);

/*
 * Remove @export from the list of registered exports, under
 * ftrace_export_lock.
 *
 * Returns the result of rm_ftrace_export(): 0 when @export was removed,
 * -1 when it was not on the list.
 */
int unregister_ftrace_export(struct trace_export *export)
{
        int ret;

        mutex_lock(&ftrace_export_lock);

        ret = rm_ftrace_export(&ftrace_exports_list, export);

        mutex_unlock(&ftrace_export_lock);

        return ret;
}
EXPORT_SYMBOL_GPL(unregister_ftrace_export);

/* trace_flags holds trace_options default values */
#define TRACE_DEFAULT_FLAGS                                                \
        (FUNCTION_DEFAULT_FLAGS |                                        \
         TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |                        \
         TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO |                \
         TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |                        \
         TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS |                        \
         TRACE_ITER_HASH_PTR)

/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK |                        \
               TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD)

/* trace_flags that are default zero for instances */
#define ZEROED_TRACE_FLAGS \
        (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK)

/*
 * The global_trace is the descriptor that holds the top-level tracing
 * buffers for the live tracing.
 */
static struct trace_array global_trace = {
        .trace_flags = TRACE_DEFAULT_FLAGS,
};

/*
 * Flag the ring buffer of @tr as expanded. A NULL @tr selects the
 * top-level global_trace.
 */
void trace_set_ring_buffer_expanded(struct trace_array *tr)
{
        struct trace_array *target = tr ? tr : &global_trace;

        target->ring_buffer_expanded = true;
}

LIST_HEAD(ftrace_trace_arrays);

/*
 * Take a reference on @this_tr, provided it is currently listed in
 * ftrace_trace_arrays. The lookup and the increment are done under
 * trace_types_lock.
 *
 * Returns 0 when the reference was taken, -ENODEV when @this_tr is not
 * on the list.
 */
int trace_array_get(struct trace_array *this_tr)
{
        struct trace_array *iter;
        int ret = -ENODEV;

        mutex_lock(&trace_types_lock);
        list_for_each_entry(iter, &ftrace_trace_arrays, list) {
                if (iter != this_tr)
                        continue;

                iter->ref++;
                ret = 0;
                break;
        }
        mutex_unlock(&trace_types_lock);

        return ret;
}

/*
 * Drop one reference on @this_tr. Warns if the count is already zero,
 * but decrements it regardless. No locking is done here;
 * trace_array_put() calls this with trace_types_lock held.
 */
static void __trace_array_put(struct trace_array *this_tr)
{
        WARN_ON(!this_tr->ref);
        this_tr->ref--;
}

/**
 * trace_array_put - Decrement the reference counter for this trace array.
 * @this_tr : pointer to the trace array
 *
 * NOTE: Use this when we no longer need the trace array returned by
 * trace_array_get_by_name(). This ensures the trace array can be later
 * destroyed.
 *
 * A NULL @this_tr is accepted and ignored. The decrement is done with
 * trace_types_lock held.
 */
void trace_array_put(struct trace_array *this_tr)
{
        if (!this_tr)
                return;

        mutex_lock(&trace_types_lock);
        __trace_array_put(this_tr);
        mutex_unlock(&trace_types_lock);
}
EXPORT_SYMBOL_GPL(trace_array_put);

/*
 * Checks shared by the open paths of tracing files.
 *
 * Returns the error from security_locked_down(LOCKDOWN_TRACEFS) if that
 * check fails, -ENODEV if tracing is disabled or a reference on @tr
 * cannot be taken, and 0 otherwise. When @tr is non-NULL and 0 is
 * returned, a reference on @tr has been taken via trace_array_get().
 */
int tracing_check_open_get_tr(struct trace_array *tr)
{
        int ret;

        ret = security_locked_down(LOCKDOWN_TRACEFS);
        if (ret)
                return ret;

        if (tracing_disabled)
                return -ENODEV;

        /* A NULL @tr means there is no trace array to pin */
        if (tr && trace_array_get(tr) < 0)
                return -ENODEV;

        return 0;
}

/*
 * Apply the filter of @call (if it has one) to the record @rec. When the
 * record does not match, the already reserved @event is discarded from
 * @buffer.
 *
 * Returns 1 if the event was discarded, 0 if it is to be kept.
 */
int call_filter_check_discard(struct trace_event_call *call, void *rec,
                              struct trace_buffer *buffer,
                              struct ring_buffer_event *event)
{
        /* Common case: no filter attached to this event */
        if (!unlikely(call->flags & TRACE_EVENT_FL_FILTERED))
                return 0;

        /* Filtered, and the record passes the filter */
        if (filter_match_preds(call->filter, rec))
                return 0;

        __trace_event_discard_commit(buffer, event);
        return 1;
}

/**
 * trace_find_filtered_pid - check if a pid exists in a filtered_pid list
 * @filtered_pids: The list of pids to check
 * @search_pid: The PID to find in @filtered_pids
 *
 * Thin wrapper around trace_pid_list_is_set().
 *
 * Return: true if @search_pid is found in @filtered_pids, and false
 * otherwise.
 */
bool
trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
{
        return trace_pid_list_is_set(filtered_pids, search_pid);
}

/**
 * trace_ignore_this_task - should a task be ignored for tracing
 * @filtered_pids: The list of pids to check
 * @filtered_no_pids: The list of pids not to be traced
 * @task: The task that should be ignored if not filtered
 *
 * Checks if @task should be traced or not from @filtered_pids.
 * Returns true if @task should *NOT* be traced.
 * Returns false if @task should be traced.
 */
bool
trace_ignore_this_task(struct trace_pid_list *filtered_pids,
                       struct trace_pid_list *filtered_no_pids,
                       struct task_struct *task)
{
        /*
         * If filtered_no_pids is not empty, and the task's pid is listed
         * in filtered_no_pids, then return true.
         * Otherwise, if filtered_pids is empty, that means we can
         * trace all tasks. If it has content, then only trace pids
         * within filtered_pids.
         */

        return (filtered_pids &&
                !trace_find_filtered_pid(filtered_pids, task->pid)) ||
                (filtered_no_pids &&
                 trace_find_filtered_pid(filtered_no_pids, task->pid));
}

/**
 * trace_filter_add_remove_task - Add or remove a task from a pid_list
 * @pid_list: The list to modify (nothing is done when NULL)
 * @self: The current task for fork or NULL for exit
 * @task: The task to add or remove
 *
 * On fork (@self is set), @task is added to @pid_list only when @self is
 * itself listed, so children are tracked only for listed parents. On
 * exit (@self is NULL), the pid of @task is removed from @pid_list.
 */
void trace_filter_add_remove_task(struct trace_pid_list *pid_list,
                                  struct task_struct *self,
                                  struct task_struct *task)
{
        if (!pid_list)
                return;

        /* Exit: "self" is NULL, drop the task from the list */
        if (!self) {
                trace_pid_list_clear(pid_list, task->pid);
                return;
        }

        /* Fork: only add the new task if the forking task is listed */
        if (trace_find_filtered_pid(pid_list, self->pid))
                trace_pid_list_set(pid_list, task->pid);
}

/**
 * trace_pid_next - Used for seq_file to get to the next pid of a pid_list
 * @pid_list: The pid list to show
 * @v: The last pid that was shown (+1 the actual pid to let zero be displayed)
 * @pos: The position of the file
 *
 * This is used by the seq_file "next" operation to iterate the pids
 * listed in a trace_pid_list structure. @pos is advanced by one on every
 * call, including the call that finds no further pid.
 *
 * Return: the next pid + 1, encoded as a pointer, as we want to display
 * pid of zero, but NULL would stop the iteration. NULL when there is no
 * further pid.
 */
void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos)
{
        long pid = (unsigned long)v;
        unsigned int next;

        (*pos)++;

        /* pid already is +1 of the actual previous bit */
        if (trace_pid_list_next(pid_list, pid, &next) < 0)
                return NULL;

        pid = next;

        /* Return pid + 1 to allow zero to be represented */
        return (void *)(pid + 1);
}

/**
 * trace_pid_start - Used for seq_file to start reading pid lists
 * @pid_list: The pid list to show
 * @pos: The position of the file
 *
 * This is used by seq_file "start" operation to start the iteration
 * of listing pids. Starting from the first pid in @pid_list, it steps
 * forward with trace_pid_next() until *@pos entries have been skipped
 * or the list runs out. @pos itself is not modified.
 *
 * Return: the pid + 1, encoded as a pointer, as we want to display pid
 * of zero, but NULL would stop the iteration. NULL when the list is
 * empty or has no entry at position *@pos.
 */
void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos)
{
        unsigned long pid;
        unsigned int first;
        loff_t l = 0;

        if (trace_pid_list_first(pid_list, &first) < 0)
                return NULL;

        pid = first;

        /* Return pid + 1 so that zero can be the exit value */
        /* l is a private position counter bumped by trace_pid_next() */
        for (pid++; pid && l < *pos;
             pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l))
                ;
        return (void *)pid;
}

/**
 * trace_pid_show - show the current pid in seq_file processing
 * @m: The seq_file structure to write into
 * @v: A void pointer of the pid (+1) value to display
 *
 * Can be directly used by seq_file operations to display the current
 * pid value. The +1 applied by trace_pid_start() / trace_pid_next() is
 * undone before the pid is printed on its own line.
 *
 * Return: always 0.
 */
int trace_pid_show(struct seq_file *m, void *v)
{
        unsigned long pid = (unsigned long)v - 1;

        seq_printf(m, "%lu\n", pid);
        return 0;
}

/* 128 should be much more than enough */
#define PID_BUF_SIZE                127

/**
 * trace_pid_write - build a new pid list from an old one plus user input
 * @filtered_pids: The current pid list to start from (may be NULL)
 * @new_pid_list: Where the resulting list is stored on success
 * @ubuf: User buffer holding the pids to add
 * @cnt: Number of bytes available in @ubuf
 *
 * A fresh list is allocated, filled with the pids already in
 * @filtered_pids, and then extended with every pid parsed from @ubuf.
 * @filtered_pids itself is never modified. If the resulting list holds
 * no pid at all, it is freed and NULL is stored in @new_pid_list.
 *
 * Return: the number of bytes consumed from @ubuf on success. On failure
 * the new list is freed, @new_pid_list is left untouched and a negative
 * value is returned: -ENOMEM if the parser or the list cannot be
 * allocated, -EINVAL if a token is not a valid number, -1 if
 * trace_pid_list_set() rejects a pid, or the error from trace_get_user().
 */
int trace_pid_write(struct trace_pid_list *filtered_pids,
                    struct trace_pid_list **new_pid_list,
                    const char __user *ubuf, size_t cnt)
{
        struct trace_pid_list *pid_list;
        struct trace_parser parser;
        unsigned long val;
        int nr_pids = 0;
        ssize_t read = 0;
        ssize_t ret;
        loff_t pos;
        pid_t pid;

        if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1))
                return -ENOMEM;

        /*
         * Always recreate a new array. The write is an all or nothing
         * operation. Always create a new array when adding new pids by
         * the user. If the operation fails, then the current list is
         * not modified.
         */
        pid_list = trace_pid_list_alloc();
        if (!pid_list) {
                trace_parser_put(&parser);
                return -ENOMEM;
        }

        if (filtered_pids) {
                /* copy the current bits to the new max */
                ret = trace_pid_list_first(filtered_pids, &pid);
                while (!ret) {
                        trace_pid_list_set(pid_list, pid);
                        ret = trace_pid_list_next(filtered_pids, pid + 1, &pid);
                        nr_pids++;
                }
        }

        /* The copy loop above ends with ret != 0; that is not an error */
        ret = 0;
        while (cnt > 0) {

                pos = 0;

                /* Pull the next token out of the user buffer */
                ret = trace_get_user(&parser, ubuf, cnt, &pos);
                if (ret < 0)
                        break;

                read += ret;
                ubuf += ret;
                cnt -= ret;

                /* Nothing parsed in this round; ret is >= 0 so this is not a failure */
                if (!trace_parser_loaded(&parser))
                        break;

                /* Preset the error for the kstrtoul() failure below */
                ret = -EINVAL;
                if (kstrtoul(parser.buffer, 0, &val))
                        break;

                pid = (pid_t)val;

                if (trace_pid_list_set(pid_list, pid) < 0) {
                        ret = -1;
                        break;
                }
                nr_pids++;

                trace_parser_clear(&parser);
                ret = 0;
        }
        trace_parser_put(&parser);

        /* All or nothing: throw the new list away on any failure */
        if (ret < 0) {
                trace_pid_list_free(pid_list);
                return ret;
        }

        if (!nr_pids) {
                /* Cleared the list of pids */
                trace_pid_list_free(pid_list);
                pid_list = NULL;
        }

        *new_pid_list = pid_list;

        return read;
}

/*
 * Current timestamp for @buf: the ring buffer's own time stamp,
 * normalized for @cpu, or trace_clock_local() while @buf has no ring
 * buffer yet.
 */
static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu)
{
        u64 ts;

        /* Early boot up does not have a buffer yet */
        if (!buf->buffer)
                return trace_clock_local();

        ts = ring_buffer_time_stamp(buf->buffer);
        ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);

        return ts;
}

/*
 * Current timestamp of the top-level (global_trace) buffer for @cpu;
 * see buffer_ftrace_now().
 */
u64 ftrace_now(int cpu)
{
        return buffer_ftrace_now(&global_trace.array_buffer, cpu);
}

/**
 * tracing_is_enabled - Show if global_trace has been enabled
 *
 * Shows if the global trace has been enabled or not. It uses the
 * mirror flag "buffer_disabled" to be used in fast paths such as for
 * the irqsoff tracer. But it may be inaccurate due to races. If you
 * need to know the accurate state, use tracing_is_on() which is a little
 * slower, but accurate.
 *
 * Return: 1 when global_trace.buffer_disabled is clear, 0 when it is set.
 */
int tracing_is_enabled(void)
{
        /*
         * For quick access (irqsoff uses this in fast path), just
         * return the mirror variable of the state of the ring buffer.
         * It's a little racy, but we don't really care.
         */
        smp_rmb();
        return !global_trace.buffer_disabled;
}

/*
 * trace_buf_size is the size in bytes that is allocated
 * for a buffer. Note, the number of bytes is always rounded
 * to page size.
 *
 * This number is purposely set to a low number of 16384.
 * If the dump on oops happens, it will be much appreciated
 * to not have to wait for all that output. Anyway this can be
 * boot time and run time configurable.
 */
#define TRACE_BUF_SIZE_DEFAULT        1441792UL /* 16384 * 88 (sizeof(entry)) */

static unsigned long                trace_buf_size = TRACE_BUF_SIZE_DEFAULT;

/* trace_types holds a linked list of the available tracers. */
static struct tracer                *trace_types __read_mostly;

/*
 * trace_types_lock is used to protect the trace_types list.
 * In this file it is also held while arming a snapshot
 * (see the lockdep assertion in tracing_arm_snapshot_locked()).
 */
DEFINE_MUTEX(trace_types_lock);

/*
 * Serialize access to the ring buffer.
 *
 * The ring buffer serializes readers, but that is only low-level protection.
 * The validity of the events (as returned by ring_buffer_peek() etc.)
 * is not protected by the ring buffer.
 *
 * The content of events may become garbage if we allow another process to
 * consume these events concurrently:
 *   A) The page of the consumed events may become a normal page
 *      (not a reader page) in the ring buffer, and this page will be
 *      rewritten by the events producer.
 *   B) The page of the consumed events may become a page for splice_read,
 *      and this page will be returned to the system.
 *
 * These primitives allow multiple processes to access different per-cpu
 * ring buffers concurrently.
 *
 * These primitives don't distinguish read-only and read-consume access.
 * Multiple read-only accesses are also serialized.
 */

#ifdef CONFIG_SMP
/*
 * all_cpu_access_lock: taken for write to access the whole ring buffer
 * (RING_BUFFER_ALL_CPUS), and for read when accessing a single cpu.
 * cpu_access_lock: per-cpu mutex serializing access to one cpu's buffer.
 */
static DECLARE_RWSEM(all_cpu_access_lock);
static DEFINE_PER_CPU(struct mutex, cpu_access_lock);

/*
 * Acquire access to the ring buffer of @cpu, or to the whole ring
 * buffer when @cpu is RING_BUFFER_ALL_CPUS.
 */
static inline void trace_access_lock(int cpu)
{
        if (cpu == RING_BUFFER_ALL_CPUS) {
                /* Exclusive access to every cpu buffer at once. */
                down_write(&all_cpu_access_lock);
                return;
        }

        /* First keep out any trace_access_lock(RING_BUFFER_ALL_CPUS) user. */
        down_read(&all_cpu_access_lock);

        /* Then keep out other users of this particular @cpu buffer. */
        mutex_lock(&per_cpu(cpu_access_lock, cpu));
}

/*
 * Release what trace_access_lock() acquired for @cpu. The per-cpu case
 * drops its locks in the reverse order of acquisition.
 */
static inline void trace_access_unlock(int cpu)
{
        if (cpu != RING_BUFFER_ALL_CPUS) {
                mutex_unlock(&per_cpu(cpu_access_lock, cpu));
                up_read(&all_cpu_access_lock);
                return;
        }

        up_write(&all_cpu_access_lock);
}

/* Initialize the per-cpu access mutex of every possible cpu. */
static inline void trace_access_lock_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                mutex_init(&per_cpu(cpu_access_lock, cpu));
}

#else

/* !CONFIG_SMP: a single mutex serializes every ring buffer access. */
static DEFINE_MUTEX(access_lock);

/* !CONFIG_SMP variant: @cpu is ignored, one mutex covers everything. */
static inline void trace_access_lock(int cpu)
{
        mutex_lock(&access_lock);
        (void)cpu;
}

/* !CONFIG_SMP variant: @cpu is ignored, release the single access mutex. */
static inline void trace_access_unlock(int cpu)
{
        mutex_unlock(&access_lock);
        (void)cpu;
}

/* !CONFIG_SMP variant: access_lock is statically defined, nothing to set up. */
static inline void trace_access_lock_init(void)
{
}

#endif

#ifdef CONFIG_STACKTRACE
static void __ftrace_trace_stack(struct trace_buffer *buffer,
                                 unsigned int trace_ctx,
                                 int skip, struct pt_regs *regs);
static inline void ftrace_trace_stack(struct trace_array *tr,
                                      struct trace_buffer *buffer,
                                      unsigned int trace_ctx,
                                      int skip, struct pt_regs *regs);

#else
/* !CONFIG_STACKTRACE stub: does nothing. */
static inline void __ftrace_trace_stack(struct trace_buffer *buffer,
                                        unsigned int trace_ctx,
                                        int skip, struct pt_regs *regs)
{
}
/*
 * !CONFIG_STACKTRACE stub: does nothing.
 *
 * @trace_ctx is 'unsigned int' so that this stub has the same signature
 * as the CONFIG_STACKTRACE prototype of ftrace_trace_stack().
 */
static inline void ftrace_trace_stack(struct trace_array *tr,
                                      struct trace_buffer *buffer,
                                      unsigned int trace_ctx,
                                      int skip, struct pt_regs *regs)
{
}

#endif

/*
 * Fill in the generic trace entry header that lives in the data area
 * of @event, using @type and @trace_ctx.
 */
static __always_inline void
trace_event_setup(struct ring_buffer_event *event,
                  int type, unsigned int trace_ctx)
{
        tracing_generic_entry_update(ring_buffer_event_data(event),
                                     type, trace_ctx);
}

/*
 * Reserve @len bytes in @buffer and, on success, initialize the generic
 * entry header of the reserved event with @type and @trace_ctx.
 *
 * Returns the reserved event, or NULL if the reservation failed.
 */
static __always_inline struct ring_buffer_event *
__trace_buffer_lock_reserve(struct trace_buffer *buffer,
                          int type,
                          unsigned long len,
                          unsigned int trace_ctx)
{
        struct ring_buffer_event *reserved;

        reserved = ring_buffer_lock_reserve(buffer, len);
        if (!reserved)
                return NULL;

        trace_event_setup(reserved, type, trace_ctx);
        return reserved;
}

/*
 * Turn recording on for @tr: enable the ring buffer (when one has been
 * allocated) and clear the "buffer_disabled" mirror flag.
 */
void tracer_tracing_on(struct trace_array *tr)
{
        struct trace_buffer *buffer = tr->array_buffer.buffer;

        if (buffer)
                ring_buffer_record_on(buffer);

        /*
         * The mirror flag is consulted when no buffer has been allocated
         * yet, and by tracers (like irqsoff) that only want a cheap hint
         * about whether the ring buffer is disabled. Those users tolerate
         * the race where a record still happens after disabling; in their
         * fast path being fast matters more than being accurate.
         */
        tr->buffer_disabled = 0;

        /* Make the flag seen by readers */
        smp_wmb();
}

/**
 * tracing_on - enable tracing buffers
 *
 * Re-enables recording into the global trace buffers, e.g. after
 * they have been disabled with tracing_off().
 */
void tracing_on(void)
{
        struct trace_array *tr = &global_trace;

        tracer_tracing_on(tr);
}
EXPORT_SYMBOL_GPL(tracing_on);


/*
 * Commit @event to @buffer.
 *
 * Events reserved directly in the ring buffer are committed with
 * ring_buffer_unlock_commit(). If @event is this cpu's temporary
 * buffered event, its payload is instead copied into @buffer with
 * ring_buffer_write() and the temporary buffer is released.
 */
static __always_inline void
__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event)
{
        __this_cpu_write(trace_taskinfo_save, true);

        if (this_cpu_read(trace_buffered_event) != event) {
                ring_buffer_unlock_commit(buffer);
                return;
        }

        /*
         * This is the temp buffer, we need to commit fully.
         * Length is in event->array[0], the data follows it.
         */
        ring_buffer_write(buffer, event->array[0], &event->array[1]);

        /* Release the temp buffer */
        this_cpu_dec(trace_buffered_event_cnt);

        /* ring_buffer_unlock_commit() enables preemption */
        preempt_enable_notrace();
}

/*
 * __trace_array_puts - write a string into the trace buffer of @tr
 * @tr:    The trace array to write to
 * @ip:    The address of the caller
 * @str:   The string to write
 * @size:  The number of bytes of @str to copy
 *
 * Records @str as a TRACE_PRINT event. A trailing newline is appended
 * when the copied text does not already end with one, and the result is
 * always NUL terminated.
 *
 * Returns @size on success. Returns 0 if nothing was written: printk
 * tracing is off for @tr, a selftest is running on the global trace,
 * tracing is disabled, or the event could not be reserved.
 */
int __trace_array_puts(struct trace_array *tr, unsigned long ip,
                       const char *str, int size)
{
        struct ring_buffer_event *event;
        struct trace_buffer *buffer;
        struct print_entry *entry;
        unsigned int trace_ctx;
        int alloc;

        if (!(tr->trace_flags & TRACE_ITER_PRINTK))
                return 0;

        if (unlikely(tracing_selftest_running && tr == &global_trace))
                return 0;

        if (unlikely(tracing_disabled))
                return 0;

        alloc = sizeof(*entry) + size + 2; /* possible \n added */

        trace_ctx = tracing_gen_ctx();
        buffer = tr->array_buffer.buffer;
        ring_buffer_nest_start(buffer);
        event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
                                            trace_ctx);
        if (!event) {
                size = 0;
                goto out;
        }

        entry = ring_buffer_event_data(event);
        entry->ip = ip;

        memcpy(&entry->buf, str, size);

        /*
         * Add a newline if necessary. An empty string has no last
         * character to look at, so do not index buf[size - 1] when
         * size is 0; just emit the newline.
         */
        if (!size || entry->buf[size - 1] != '\n') {
                entry->buf[size] = '\n';
                entry->buf[size + 1] = '\0';
        } else
                entry->buf[size] = '\0';

        __buffer_unlock_commit(buffer, event);
        ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
 out:
        ring_buffer_nest_end(buffer);
        return size;
}
EXPORT_SYMBOL_GPL(__trace_array_puts);

/**
 * __trace_puts - write a constant string into the trace buffer.
 * @ip:           The address of the caller
 * @str:   The constant string to write
 * @size:  The size of the string.
 *
 * Writes into the global trace array; see __trace_array_puts() for the
 * return value.
 */
int __trace_puts(unsigned long ip, const char *str, int size)
{
        struct trace_array *tr = &global_trace;

        return __trace_array_puts(tr, ip, str, size);
}
EXPORT_SYMBOL_GPL(__trace_puts);

/**
 * __trace_bputs - write the pointer to a constant string into trace buffer
 * @ip:           The address of the caller
 * @str:   The constant string whose address is recorded
 *
 * Only the pointer @str is stored in the TRACE_BPUTS event, not a copy
 * of the text.
 *
 * Returns 1 if the event was written, 0 otherwise.
 */
int __trace_bputs(unsigned long ip, const char *str)
{
        struct ring_buffer_event *event;
        struct trace_buffer *buffer;
        struct bputs_entry *entry;
        unsigned int trace_ctx;
        int written = 0;

        if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
                return 0;

        if (unlikely(tracing_selftest_running || tracing_disabled))
                return 0;

        trace_ctx = tracing_gen_ctx();
        buffer = global_trace.array_buffer.buffer;

        ring_buffer_nest_start(buffer);

        event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS,
                                            sizeof(struct bputs_entry),
                                            trace_ctx);
        if (event) {
                entry = ring_buffer_event_data(event);
                entry->ip = ip;
                entry->str = str;

                __buffer_unlock_commit(buffer, event);
                ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL);

                written = 1;
        }

        ring_buffer_nest_end(buffer);
        return written;
}
EXPORT_SYMBOL_GPL(__trace_bputs);

#ifdef CONFIG_TRACER_SNAPSHOT
/*
 * Take a (possibly conditional) snapshot of @tr.
 *
 * @cond_data is passed through to update_max_tr(). The snapshot is
 * refused, with an explanatory message written into the trace, when
 * called from NMI context, when no snapshot buffer is allocated (in
 * which case tracing is also turned off), when the current tracer uses
 * the max buffer itself, or when the buffer is memory mapped.
 */
static void tracing_snapshot_instance_cond(struct trace_array *tr,
                                           void *cond_data)
{
        struct tracer *tracer = tr->current_trace;
        unsigned long flags;

        if (in_nmi()) {
                trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
                trace_array_puts(tr, "*** snapshot is being ignored        ***\n");
                return;
        }

        if (!tr->allocated_snapshot) {
                trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
                trace_array_puts(tr, "*** stopping trace here!   ***\n");
                tracer_tracing_off(tr);
                return;
        }

        /* Note, snapshot can not be used when the tracer uses it */
        if (tracer->use_max_tr) {
                trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
                trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
                return;
        }

        if (tr->mapped) {
                trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
                trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
                return;
        }

        /* update_max_tr() is called with interrupts disabled */
        local_irq_save(flags);
        update_max_tr(tr, current, smp_processor_id(), cond_data);
        local_irq_restore(flags);
}

/* Take an unconditional snapshot of @tr (no cond_data is passed). */
void tracing_snapshot_instance(struct trace_array *tr)
{
        tracing_snapshot_instance_cond(tr, NULL);
}

/**
 * tracing_snapshot - take a snapshot of the current buffer.
 *
 * Swaps the snapshot buffer with the live tracing buffer of the global
 * trace array. This lets a caller capture the live trace when some
 * condition is triggered while tracing continues.
 *
 * The snapshot buffer has to be allocated beforehand, either with
 * tracing_snapshot_alloc(), or manually
 * with: echo 1 > /sys/kernel/tracing/snapshot
 *
 * If the snapshot buffer is not allocated, tracing is stopped instead,
 * basically making a permanent snapshot.
 */
void tracing_snapshot(void)
{
        tracing_snapshot_instance(&global_trace);
}
EXPORT_SYMBOL_GPL(tracing_snapshot);

/**
 * tracing_snapshot_cond - conditionally take a snapshot of the current buffer.
 * @tr:                The tracing instance to snapshot
 * @cond_data:        The data to be tested conditionally, and possibly saved
 *
 * This is the same as tracing_snapshot() except that the snapshot is
 * conditional - the snapshot will only happen if the
 * cond_snapshot.update() implementation receiving the cond_data
 * returns true, which means that the trace array's cond_snapshot
 * update() operation used the cond_data to determine whether the
 * snapshot should be taken, and if it was, presumably saved it along
 * with the snapshot.
 */
void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
{
        /* Thin exported wrapper around the static instance helper. */
        tracing_snapshot_instance_cond(tr, cond_data);
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond);

/**
 * tracing_cond_snapshot_data - get the user data associated with a snapshot
 * @tr:                The tracing instance
 *
 * When the user enables a conditional snapshot using
 * tracing_snapshot_cond_enable(), the user-defined cond_data is saved
 * with the snapshot.  This accessor is used to retrieve it.
 *
 * Should not be called from cond_snapshot.update(), since it takes
 * the tr->max_lock lock, which the code calling
 * cond_snapshot.update() has already done.
 *
 * Returns the cond_data associated with the trace array's snapshot.
 */
void *tracing_cond_snapshot_data(struct trace_array *tr)
{
        void *cond_data = NULL;

        local_irq_disable();
        arch_spin_lock(&tr->max_lock);

        if (tr->cond_snapshot)
                cond_data = tr->cond_snapshot->cond_data;

        arch_spin_unlock(&tr->max_lock);
        local_irq_enable();

        return cond_data;
}
EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);

static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
                                        struct array_buffer *size_buf, int cpu_id);
static void set_buffer_entries(struct array_buffer *buf, unsigned long val);

/*
 * Allocate the snapshot (max) buffer of @tr if that has not been done
 * yet, giving it the same sub-buffer order and size as the main buffer.
 *
 * Returns 0 on success or if already allocated, a negative error
 * otherwise.
 */
int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
        int order;
        int ret;

        if (tr->allocated_snapshot)
                return 0;

        /* Make the snapshot buffer have the same order as main buffer */
        order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
        ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
        if (ret < 0)
                return ret;

        /* allocate spare buffer */
        ret = resize_buffer_duplicate_size(&tr->max_buffer,
                                           &tr->array_buffer,
                                           RING_BUFFER_ALL_CPUS);
        if (ret < 0)
                return ret;

        tr->allocated_snapshot = true;
        return 0;
}

/*
 * Give back the memory of @tr's snapshot buffer and mark the snapshot
 * as no longer allocated.
 */
static void free_snapshot(struct trace_array *tr)
{
        struct trace_buffer *max = tr->max_buffer.buffer;

        /*
         * The ring buffer itself is not freed. It is shrunk instead,
         * because the max_tr ring buffer carries state (e.g.
         * ring->clock) that we want to preserve.
         */
        ring_buffer_subbuf_order_set(max, 0);
        ring_buffer_resize(max, 1, RING_BUFFER_ALL_CPUS);
        set_buffer_entries(&tr->max_buffer, 1);
        tracing_reset_online_cpus(&tr->max_buffer);

        tr->allocated_snapshot = false;
}

static int tracing_arm_snapshot_locked(struct trace_array *tr)
{
        int ret;

        lockdep_assert_held(&trace_types_lock);

        spin_lock(&tr->snapshot_trigger_lock);
        if (tr->snapshot == UINT_MAX || tr->mapped) {
                spin_unlock(&tr->snapshot_trigger_lock);
                return -EBUSY;
        }

        tr->snapshot++;
        spin_unlock(&tr->snapshot_trigger_lock);

        ret = tracing_alloc_snapshot_instance(tr);
        if (ret) {
                spin_lock(&tr->snapshot_trigger_lock);
                tr->snapshot--;
                spin_unlock(&tr->snapshot_trigger_lock);
        }

        return ret;
}

/*
 * Arm the snapshot of @tr: wrapper that takes trace_types_lock around
 * tracing_arm_snapshot_locked() and returns its result.
 */
int tracing_arm_snapshot(struct trace_array *tr)
{
        int ret;

        mutex_lock(&trace_types_lock);
        ret = tracing_arm_snapshot_locked(tr);
        mutex_unlock(&trace_types_lock);

        return ret;
}

/*
 * Drop one snapshot reference of @tr. Warns, and leaves the count
 * alone, if there is no reference to drop.
 */
void tracing_disarm_snapshot(struct trace_array *tr)
{
        spin_lock(&tr->snapshot_trigger_lock);

        if (WARN_ON(!tr->snapshot)) {
                spin_unlock(&tr->snapshot_trigger_lock);
                return;
        }

        tr->snapshot--;
        spin_unlock(&tr->snapshot_trigger_lock);
}

/**
 * tracing_alloc_snapshot - allocate snapshot buffer.
 *
 * Allocates the snapshot buffer of the global trace array if it is not
 * allocated yet. No snapshot is taken.
 *
 * Intended for setting up the snapshot buffer ahead of time for events
 * that can't sleep but need to be able to trigger a snapshot.
 *
 * Returns 0 on success, a negative error (after warning) otherwise.
 */
int tracing_alloc_snapshot(void)
{
        int ret = tracing_alloc_snapshot_instance(&global_trace);

        WARN_ON(ret < 0);

        return ret;
}
EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);

/**
 * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
 *
 * Like tracing_snapshot(), but the snapshot buffer is allocated first
 * if needed. Only use this where sleeping is safe, as the allocation
 * may sleep.
 *
 * The snapshot swaps the snapshot buffer with the live tracing buffer,
 * so the live trace can be captured when some condition is triggered
 * while tracing continues. Nothing is snapshotted if the allocation
 * fails.
 */
void tracing_snapshot_alloc(void)
{
        if (tracing_alloc_snapshot() < 0)
                return;

        tracing_snapshot();
}
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);

/**
 * tracing_snapshot_cond_enable - enable conditional snapshot for an instance
 * @tr:                The tracing instance
 * @cond_data:        User data to associate with the snapshot
 * @update:        Implementation of the cond_snapshot update function
 *
 * Creates a cond_snapshot holding @cond_data and @update and installs
 * it on @tr. Fails with -EBUSY if the current tracer is already using
 * a snapshot or if a conditional snapshot is already enabled for the
 * instance.
 *
 * Returns 0 if successful, error otherwise.
 */
int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
                                 cond_update_fn_t update)
{
        struct cond_snapshot *cond_snapshot;
        int ret;

        cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL);
        if (!cond_snapshot)
                return -ENOMEM;

        cond_snapshot->cond_data = cond_data;
        cond_snapshot->update = update;

        mutex_lock(&trace_types_lock);

        /*
         * Two reasons to be busy: the tracer owns the max buffer, or a
         * cond_snapshot is already installed.
         *
         * The cond_snapshot can only change to NULL without the
         * trace_types_lock. We don't care if we race with it going
         * to NULL, but we want to make sure that it's not set to
         * something other than NULL when we get here, which we can
         * do safely with only holding the trace_types_lock and not
         * having to take the max_lock.
         */
        if (tr->current_trace->use_max_tr || tr->cond_snapshot) {
                ret = -EBUSY;
                goto fail_unlock;
        }

        ret = tracing_arm_snapshot_locked(tr);
        if (ret)
                goto fail_unlock;

        /* Publish the new cond_snapshot under max_lock. */
        local_irq_disable();
        arch_spin_lock(&tr->max_lock);
        tr->cond_snapshot = cond_snapshot;
        arch_spin_unlock(&tr->max_lock);
        local_irq_enable();

        mutex_unlock(&trace_types_lock);

        return 0;

 fail_unlock:
        mutex_unlock(&trace_types_lock);
        kfree(cond_snapshot);
        return ret;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);

/**
 * tracing_snapshot_cond_disable - disable conditional snapshot for an instance
 * @tr:                The tracing instance
 *
 * Check whether the conditional snapshot for the given instance is
 * enabled; if so, free the cond_snapshot associated with it and drop
 * the snapshot reference that tracing_snapshot_cond_enable() took,
 * otherwise return -EINVAL.
 *
 * Returns 0 if successful, error otherwise.
 */
int tracing_snapshot_cond_disable(struct trace_array *tr)
{
        int ret = 0;

        local_irq_disable();
        arch_spin_lock(&tr->max_lock);

        if (!tr->cond_snapshot)
                ret = -EINVAL;
        else {
                kfree(tr->cond_snapshot);
                tr->cond_snapshot = NULL;
        }

        arch_spin_unlock(&tr->max_lock);
        local_irq_enable();

        /*
         * Only disarm when a conditional snapshot was actually removed.
         * The reference is taken by a successful cond_enable; dropping
         * it on the -EINVAL path would steal another user's reference
         * (or trip the WARN_ON in tracing_disarm_snapshot()).
         */
        if (!ret)
                tracing_disarm_snapshot(tr);

        return ret;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#else
/* !CONFIG_TRACER_SNAPSHOT stub: only warns (once) that snapshots are unavailable. */
void tracing_snapshot(void)
{
        WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
}
EXPORT_SYMBOL_GPL(tracing_snapshot);
/* !CONFIG_TRACER_SNAPSHOT stub: arguments are ignored, only warns (once). */
void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
{
        WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used");
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
/* !CONFIG_TRACER_SNAPSHOT stub: warns (once) and always fails with -ENODEV. */
int tracing_alloc_snapshot(void)
{
        WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used");
        return -ENODEV;
}
EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
/* !CONFIG_TRACER_SNAPSHOT stub: forwards to the tracing_snapshot() stub for its warning. */
void tracing_snapshot_alloc(void)
{
        /* Give warning */
        tracing_snapshot();
}
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
/* !CONFIG_TRACER_SNAPSHOT stub: there is never any cond_data, returns NULL. */
void *tracing_cond_snapshot_data(struct trace_array *tr)
{
        return NULL;
}
EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
/* !CONFIG_TRACER_SNAPSHOT stub: conditional snapshots cannot be enabled, returns -ENODEV. */
int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update)
{
        return -ENODEV;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
/* !CONFIG_TRACER_SNAPSHOT stub: nothing to disable, always returns 0. */
int tracing_snapshot_cond_disable(struct trace_array *tr)
{
        return 0;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
/*
 * !CONFIG_TRACER_SNAPSHOT replacements for the static helpers:
 * freeing is a no-op and arming always evaluates to -EBUSY.
 */
#define free_snapshot(tr)        do { } while (0)
#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; })
#endif /* CONFIG_TRACER_SNAPSHOT */

/*
 * Turn recording off for @tr: disable the ring buffer (when one has
 * been allocated) and set the "buffer_disabled" mirror flag.
 */
void tracer_tracing_off(struct trace_array *tr)
{
        struct trace_buffer *buffer = tr->array_buffer.buffer;

        if (buffer)
                ring_buffer_record_off(buffer);

        /*
         * The mirror flag is consulted when no buffer has been allocated
         * yet, and by tracers (like irqsoff) that only want a cheap hint
         * about whether the ring buffer is disabled. Those users tolerate
         * the race where a record still happens after disabling; in their
         * fast path being fast matters more than being accurate.
         */
        tr->buffer_disabled = 1;

        /* Make the flag seen by readers */
        smp_wmb();
}

/**
 * tracing_off - turn off tracing buffers
 *
 * Stops the global tracing buffers from recording data. Any overhead
 * caused by the tracers themselves is not removed; this simply makes
 * all recording to the ring buffers fail.
 */
void tracing_off(void)
{
        struct trace_array *tr = &global_trace;

        tracer_tracing_off(tr);
}
EXPORT_SYMBOL_GPL(tracing_off);

/*
 * If __disable_trace_on_warning is set, note the reason in the global
 * trace buffer and turn tracing off.
 */
void disable_trace_on_warning(void)
{
        if (!__disable_trace_on_warning)
                return;

        trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_,
                               "Disabling tracing due to warning\n");
        tracing_off();
}

/**
 * tracer_tracing_is_on - show real state of ring buffer enabled
 * @tr : the trace array to know if ring buffer is enabled
 *
 * Reports the real state of the ring buffer when one is allocated;
 * otherwise falls back to the "buffer_disabled" mirror flag.
 */
bool tracer_tracing_is_on(struct trace_array *tr)
{
        struct trace_buffer *buffer = tr->array_buffer.buffer;

        if (!buffer)
                return !tr->buffer_disabled;

        return ring_buffer_record_is_set_on(buffer);
}

/**
 * tracing_is_on - show state of ring buffers enabled
 *
 * Returns the result of tracer_tracing_is_on() for the global trace array.
 */
int tracing_is_on(void)
{
        struct trace_array *tr = &global_trace;

        return tracer_tracing_is_on(tr);
}
EXPORT_SYMBOL_GPL(tracing_is_on);

/*
 * Handler for the "trace_buf_size=" boot parameter: parse the size
 * with memparse() and store it in trace_buf_size, clamped to a minimum.
 *
 * Returns 1 when the option was consumed, 0 when no value was given.
 */
static int __init set_buf_size(char *str)
{
        unsigned long buf_size;

        if (!str)
                return 0;

        buf_size = memparse(str, &str);

        /*
         * nr_entries can not be zero and the startup
         * tests require some buffer space. Therefore
         * ensure we have at least 4096 bytes of buffer.
         */
        if (buf_size < 4096UL)
                buf_size = 4096UL;
        trace_buf_size = buf_size;

        return 1;
}
__setup("trace_buf_size=", set_buf_size);

/*
 * Handler for the "tracing_thresh=" boot parameter: the parsed number
 * is multiplied by 1000 before being stored in tracing_thresh.
 *
 * Returns 1 when the option was consumed, 0 when it was missing or
 * could not be parsed.
 */
static int __init set_tracing_thresh(char *str)
{
        unsigned long threshold;

        if (!str)
                return 0;

        if (kstrtoul(str, 0, &threshold) < 0)
                return 0;

        tracing_thresh = threshold * 1000;
        return 1;
}
__setup("tracing_thresh=", set_tracing_thresh);

/* Convert nanoseconds to whole microseconds (the remainder is dropped). */
unsigned long nsecs_to_usecs(unsigned long nsecs)
{
        unsigned long usecs = nsecs / 1000;

        return usecs;
}

/*
 * TRACE_FLAGS is defined as a tuple matching bit masks with strings.
 * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that
 * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list
 * of strings in the order that the evals (enum) were defined.
 */
#undef C
#define C(a, b) b

/*
 * Option names, generated from TRACE_FLAGS with C(a, b) expanding to
 * the string 'b'. The list is NULL terminated.
 * These must match the bit positions in trace_iterator_flags.
 */
static const char *trace_options[] = {
        TRACE_FLAGS
        NULL
};

/*
 * Table of the selectable trace clocks: the function that reads the
 * clock, its name, and whether its values are in nanoseconds.
 * trace_clock_in_ns() indexes this table with tr->clock_id.
 * Architectures can append entries through ARCH_TRACE_CLOCKS.
 */
static struct {
        u64 (*func)(void);
        const char *name;
        int in_ns;                /* is this clock in nanoseconds? */
} trace_clocks[] = {
        { trace_clock_local,                "local",        1 },
        { trace_clock_global,                "global",        1 },
        { trace_clock_counter,                "counter",        0 },
        { trace_clock_jiffies,                "uptime",        0 },
        { trace_clock,                        "perf",                1 },
        { ktime_get_mono_fast_ns,        "mono",                1 },
        { ktime_get_raw_fast_ns,        "mono_raw",        1 },
        { ktime_get_boot_fast_ns,        "boot",                1 },
        { ktime_get_tai_fast_ns,        "tai",                1 },
        ARCH_TRACE_CLOCKS
};

/* Tell whether the clock currently selected for @tr counts in nanoseconds. */
bool trace_clock_in_ns(struct trace_array *tr)
{
        return trace_clocks[tr->clock_id].in_ns ? true : false;
}

/*
 * trace_parser_get_init - gets the buffer for trace parser
 *
 * Zeroes @parser and allocates a buffer of @size bytes for it.
 *
 * Returns 0 on success, 1 if the allocation failed.
 */
int trace_parser_get_init(struct trace_parser *parser, int size)
{
        void *buf;

        memset(parser, 0, sizeof(*parser));

        buf = kmalloc(size, GFP_KERNEL);
        parser->buffer = buf;
        if (!buf)
                return 1;

        parser->size = size;
        return 0;
}

/*
 * trace_parser_put - frees the buffer for trace parser
 *
 * The buffer pointer is cleared so that @parser no longer refers to
 * the freed memory.
 */
void trace_parser_put(struct trace_parser *parser)
{
        void *old = parser->buffer;

        parser->buffer = NULL;
        kfree(old);
}

/*
 * trace_get_user - reads the user input string separated by space
 * (matched by isspace(ch))
 * @parser: parser state; receives the token in parser->buffer
 * @ubuf:   user space buffer to read from
 * @cnt:    number of bytes available in @ubuf
 * @ppos:   file position; advanced by the number of bytes read
 *
 * For each string found the 'struct trace_parser' is updated,
 * and the function returns. A token that is cut off by the end of
 * @ubuf sets parser->cont, so that the next call continues it.
 *
 * Returns number of bytes read, the get_user() error if reading from
 * user space failed, or -EINVAL if the token does not fit in the
 * parser buffer.
 *
 * See kernel/trace/trace.h for 'struct trace_parser' details.
 */
int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
        size_t cnt, loff_t *ppos)
{
        char ch;
        size_t read = 0;
        ssize_t ret;

        /* A write at position 0 starts from a clean parser. */
        if (!*ppos)
                trace_parser_clear(parser);

        ret = get_user(ch, ubuf++);
        if (ret)
                goto out;

        read++;
        cnt--;

        /*
         * The parser is not finished with the last write,
         * continue reading the user input without skipping spaces.
         */
        if (!parser->cont) {
                /* skip white space */
                while (cnt && isspace(ch)) {
                        ret = get_user(ch, ubuf++);
                        if (ret)
                                goto out;
                        read++;
                        cnt--;
                }

                /* A new token starts at the beginning of the buffer. */
                parser->idx = 0;

                /* only spaces were written */
                if (isspace(ch) || !ch) {
                        *ppos += read;
                        ret = read;
                        goto out;
                }
        }

        /* read the non-space input */
        while (cnt && !isspace(ch) && ch) {
                /* Always keep one byte free for the terminating '\0'. */
                if (parser->idx < parser->size - 1)
                        parser->buffer[parser->idx++] = ch;
                else {
                        ret = -EINVAL;
                        goto out;
                }
                ret = get_user(ch, ubuf++);
                if (ret)
                        goto out;
                read++;
                cnt--;
        }

        /* We either got finished input or we have to wait for another call. */
        if (isspace(ch) || !ch) {
                parser->buffer[parser->idx] = 0;
                parser->cont = false;
        } else if (parser->idx < parser->size - 1) {
                /* Input ran out mid-token: store the last char and continue later. */
                parser->cont = true;
                parser->buffer[parser->idx++] = ch;
                /* Make sure the parsed string always terminates with '\0'. */
                parser->buffer[parser->idx] = 0;
        } else {
                ret = -EINVAL;
                goto out;
        }

        *ppos += read;
        ret = read;

out:
        return ret;
}

/*
 * Copy up to @cnt not-yet-read bytes of @s into @buf and advance the
 * read position of @s accordingly.
 *
 * Returns the number of bytes copied, or -EBUSY if there is nothing
 * left to read.
 *
 * TODO add a seq_buf_to_buffer()
 */
static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
        int remaining;

        if (trace_seq_used(s) <= s->readpos)
                return -EBUSY;

        remaining = trace_seq_used(s) - s->readpos;
        if (cnt > remaining)
                cnt = remaining;

        memcpy(buf, &s->buffer[s->readpos], cnt);
        s->readpos += cnt;

        return cnt;
}

/*
 * Tracing threshold. The "tracing_thresh=" boot parameter stores its
 * value here multiplied by 1000 (see set_tracing_thresh()).
 */
unsigned long __read_mostly        tracing_thresh;

#ifdef CONFIG_TRACER_MAX_TRACE
static const struct file_operations tracing_max_lat_fops;

#ifdef LATENCY_FS_NOTIFY

/*
 * Workqueue used to deliver the max latency fsnotify events. It is
 * allocated by latency_fsnotify_init() and is NULL until then.
 */
static struct workqueue_struct *fsnotify_wq;

/* Work handler: raise FS_MODIFY on the max latency file of the owning trace array. */
static void latency_fsnotify_workfn(struct work_struct *work)
{
        struct trace_array *tr;

        tr = container_of(work, struct trace_array, fsnotify_work);
        fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
}

/* irq_work handler: hand the notification over to the fsnotify workqueue. */
static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
{
        struct trace_array *tr;

        tr = container_of(iwork, struct trace_array, fsnotify_irqwork);
        queue_work(fsnotify_wq, &tr->fsnotify_work);
}

/*
 * Set up the notification work items of @tr and create its
 * "tracing_max_latency" file under @d_tracer.
 */
static void trace_create_maxlat_file(struct trace_array *tr,
                                     struct dentry *d_tracer)
{
        struct dentry *dentry;

        INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn);
        init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq);

        dentry = trace_create_file("tracing_max_latency", TRACE_MODE_WRITE,
                                   d_tracer, tr, &tracing_max_lat_fops);
        tr->d_max_latency = dentry;
}

/*
 * Allocate the workqueue used for max latency notifications.
 *
 * Returns 0 on success, -ENOMEM if the workqueue could not be allocated.
 */
__init static int latency_fsnotify_init(void)
{
        fsnotify_wq = alloc_workqueue("tr_max_lat_wq",
                                      WQ_UNBOUND | WQ_HIGHPRI, 0);
        if (fsnotify_wq)
                return 0;

        pr_err("Unable to allocate tr_max_lat_wq\n");
        return -ENOMEM;
}

late_initcall_sync(latency_fsnotify_init);

/*
 * Request an fsnotify event for the max latency file of @tr. Does
 * nothing while the notification workqueue has not been allocated.
 */
void latency_fsnotify(struct trace_array *tr)
{
        /*
         * We cannot call queue_work(&tr->fsnotify_work) from here because it's
         * possible that we are called from __schedule() or do_idle(), which
         * could cause a deadlock. Go through irq_work instead.
         */
        if (fsnotify_wq)
                irq_work_queue(&tr->fsnotify_irqwork);
}

#else /* !LATENCY_FS_NOTIFY */

#define trace_create_maxlat_file(tr, d_tracer)                                \
        trace_create_file("tracing_max_latency", TRACE_MODE_WRITE,        \
                          d_tracer, tr, &tracing_max_lat_fops)

#endif

/*
 * Copy the new maximum trace into the separate maximum-trace
 * structure. (this way the maximum trace is permanently saved,
 * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
 *
 * @tr:  trace array whose max buffer metadata is updated
 * @tsk: task whose comm, pid, uid and scheduling attributes are recorded
 * @cpu: cpu whose per-cpu trace data supplies the timestamps
 */
static void
__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
        struct array_buffer *trace_buf = &tr->array_buffer;
        struct array_buffer *max_buf = &tr->max_buffer;
        struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
        struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);

        max_buf->cpu = cpu;
        max_buf->time_start = data->preempt_timestamp;

        max_data->saved_latency = tr->max_latency;
        max_data->critical_start = data->critical_start;
        max_data->critical_end = data->critical_end;

        /*
         * strscpy() always NUL terminates the destination, unlike
         * strncpy(), which is deprecated for NUL-terminated strings.
         */
        strscpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
        max_data->pid = tsk->pid;
        /*
         * If tsk == current, then use current_uid(), as that does not use
         * RCU. The irq tracer can be called out of RCU scope.
         */
        if (tsk == current)
                max_data->uid = current_uid();
        else
                max_data->uid = task_uid(tsk);

        max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
        max_data->policy = tsk->policy;
        max_data->rt_priority = tsk->rt_priority;

        /* record this tasks comm */
        tracing_record_cmdline(tsk);
        latency_fsnotify(tr);
}

/**
 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
 * @tr: tracer
 * @tsk: the task with the latency
 * @cpu: The cpu that initiated the trace.
 * @cond_data: User data associated with a conditional snapshot
 *
 * Flip the buffers between the @tr and the max_tr and record information
 * about which task was the cause of this latency.
 *
 * Nothing is done while @tr is stopped (non-zero stop_count) or when no
 * snapshot buffer has been allocated. A warning is issued if interrupts
 * are not disabled on entry.
 */
void
update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
              void *cond_data)
{
        if (tr->stop_count)
                return;

        WARN_ON_ONCE(!irqs_disabled());

        if (!tr->allocated_snapshot) {
                /* Only the nop tracer should hit this when disabling */
                WARN_ON_ONCE(tr->current_trace != &nop_trace);
                return;
        }

        /* max_lock is held across the swap and the bookkeeping update */
        arch_spin_lock(&tr->max_lock);

        /* Inherit the recordable setting from array_buffer */
        if (ring_buffer_record_is_set_on(tr->array_buffer.buffer))
                ring_buffer_record_on(tr->max_buffer.buffer);
        else
                ring_buffer_record_off(tr->max_buffer.buffer);

#ifdef CONFIG_TRACER_SNAPSHOT
        /*
         * A conditional snapshot can veto the swap: if its update()
         * callback returns false, drop the lock and leave the buffers
         * as they are.
         */
        if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) {
                arch_spin_unlock(&tr->max_lock);
                return;
        }
#endif
        /* Only the buffer pointers are exchanged, not the array_buffers */
        swap(tr->array_buffer.buffer, tr->max_buffer.buffer);

        __update_max_tr(tr, tsk, cpu);

        arch_spin_unlock(&tr->max_lock);

        /* Any waiters on the old snapshot buffer need to wake up */
        ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
}

/**
 * update_max_tr_single - only copy one trace over, and reset the rest
 * @tr: tracer
 * @tsk: task with the latency
 * @cpu: the cpu of the buffer to copy.
 *
 * Flip the trace of a single CPU buffer between the @tr and the max_tr.
 *
 * Nothing is done while @tr is stopped (non-zero stop_count) or when no
 * snapshot buffer has been allocated. A warning is issued if interrupts
 * are not disabled on entry. The latency bookkeeping is updated even when
 * the per-CPU swap itself fails.
 */
void
update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
        int ret;

        if (tr->stop_count)
                return;

        WARN_ON_ONCE(!irqs_disabled());
        if (!tr->allocated_snapshot) {
                /* Only the nop tracer should hit this when disabling */
                WARN_ON_ONCE(tr->current_trace != &nop_trace);
                return;
        }

        /* max_lock is held across the swap and the bookkeeping update */
        arch_spin_lock(&tr->max_lock);

        ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu);

        if (ret == -EBUSY) {
                /*
                 * We failed to swap the buffer due to a commit taking
                 * place on this CPU. We fail to record, but we reset
                 * the max trace buffer (no one writes directly to it)
                 * and flag that it failed.
                 * Another reason is resize is in progress.
                 */
                trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
                        "Failed to swap buffers due to commit or resize in progress\n");
        }

        /* -EAGAIN and -EBUSY are tolerated; any other error is unexpected */
        WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);

        __update_max_tr(tr, tsk, cpu);
        arch_spin_unlock(&tr->max_lock);
}

#endif /* CONFIG_TRACER_MAX_TRACE */

/*
 * Context handed to wait_pipe_cond() while wait_on_pipe() is waiting on
 * the ring buffer.
 */
struct pipe_wait {
        /* The iterator on whose behalf the wait is performed */
        struct trace_iterator                *iter;
        /* Value of iter->wait_index sampled just before the wait started */
        int                                wait_index;
};

/*
 * Condition callback given to ring_buffer_wait(). @data is the
 * struct pipe_wait prepared by wait_on_pipe().
 *
 * Returns true once the iterator's wait_index differs from the value that
 * was recorded before waiting, or once the iterator has been closed.
 */
static bool wait_pipe_cond(void *data)
{
        struct pipe_wait *pwait = data;
        struct trace_iterator *iter = pwait->iter;
        int cur_index = atomic_read_acquire(&iter->wait_index);

        return cur_index != pwait->wait_index || iter->closed;
}

/*
 * Wait for data on the buffer that @iter reads from.
 *
 * Returns 0 straight away when @iter has a static buffer iterator for its
 * cpu_file; otherwise returns whatever ring_buffer_wait() returns. @full
 * is passed through to ring_buffer_wait() unchanged.
 */
static int wait_on_pipe(struct trace_iterator *iter, int full)
{
        struct pipe_wait pwait;
        int status;

        /* Iterators are static, they should be filled or empty */
        if (trace_buffer_iter(iter, iter->cpu_file))
                return 0;

        /* Remember the current wait_index so a later change ends the wait */
        pwait.iter = iter;
        pwait.wait_index = atomic_read_acquire(&iter->wait_index);

        status = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file,
                                  full, wait_pipe_cond, &pwait);

#ifdef CONFIG_TRACER_MAX_TRACE
        /*
         * Make sure this is still the snapshot buffer, as if a snapshot were
         * to happen, this would now be the main buffer.
         */
        if (iter->snapshot)
                iter->array_buffer = &iter->tr->max_buffer;
#endif
        return status;
}

#ifdef CONFIG_FTRACE_STARTUP_TEST
/*
 * False until init_trace_selftests() runs; while false,
 * run_tracer_selftest() postpones tests instead of running them.
 */
static bool selftests_can_run;

/* One postponed selftest: a tracer whose selftest still has to be run. */
struct trace_selftests {
        /* Link in postponed_selftests */
        struct list_head                list;
        /* The tracer to test */
        struct tracer                        *type;
};

/* Tests queued by save_selftest() and drained by init_trace_selftests() */
static LIST_HEAD(postponed_selftests);

/*
 * Queue the selftest of @type on postponed_selftests so that it can be
 * run later.
 *
 * Returns 0 on success or -ENOMEM if the list entry cannot be allocated.
 */
static int save_selftest(struct tracer *type)
{
        struct trace_selftests *entry = kmalloc(sizeof(*entry), GFP_KERNEL);

        if (!entry)
                return -ENOMEM;

        entry->type = type;
        list_add(&entry->list, &postponed_selftests);

        return 0;
}

/*
 * Run the selftest of @type against global_trace.
 *
 * Returns 0 when the test passed or was not run at all (no selftest
 * callback, selftests disabled, or tracing turned off), the result of
 * save_selftest() when the test has to be postponed because selftests
 * cannot run yet, and -1 when the selftest failed.
 */
static int run_tracer_selftest(struct tracer *type)
{
        struct trace_array *tr = &global_trace;
        struct tracer *saved_tracer = tr->current_trace;
        int ret;

        if (!type->selftest || tracing_selftest_disabled)
                return 0;

        /*
         * If a tracer registers early in boot up (before scheduling is
         * initialized and such), then do not run its selftests yet.
         * Instead, run it a little later in the boot process.
         */
        if (!selftests_can_run)
                return save_selftest(type);

        if (!tracing_is_on()) {
                pr_warn("Selftest for tracer %s skipped due to tracing disabled\n",
                        type->name);
                return 0;
        }

        /*
         * Run a selftest on this tracer.
         * Here we reset the trace buffer, and set the current
         * tracer to be this tracer. The tracer can then run some
         * internal tracing to verify that everything is in order.
         * If we fail, we do not register this tracer.
         */
        tracing_reset_online_cpus(&tr->array_buffer);

        tr->current_trace = type;

#ifdef CONFIG_TRACER_MAX_TRACE
        if (type->use_max_tr) {
                /* If we expanded the buffers, make sure the max is expanded too */
                if (tr->ring_buffer_expanded)
                        ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
                                           RING_BUFFER_ALL_CPUS);
                tr->allocated_snapshot = true;
        }
#endif

        /* the test is responsible for initializing and enabling */
        pr_info("Testing tracer %s: ", type->name);
        ret = type->selftest(type, tr);
        /* the test is responsible for resetting too */
        tr->current_trace = saved_tracer;
        if (ret) {
                /*
                 * Note: on failure the max-buffer state set up above
                 * (allocated_snapshot, expanded size) is not undone.
                 */
                printk(KERN_CONT "FAILED!\n");
                /* Add the warning after printing 'FAILED' */
                WARN_ON(1);
                return -1;
        }
        /* Only reset on passing, to avoid touching corrupted buffers */
        tracing_reset_online_cpus(&tr->array_buffer);

#ifdef CONFIG_TRACER_MAX_TRACE
        if (type->use_max_tr) {
                tr->allocated_snapshot = false;

                /* Shrink the max buffer again */
                if (tr->ring_buffer_expanded)
                        ring_buffer_resize(tr->max_buffer.buffer, 1,
                                           RING_BUFFER_ALL_CPUS);
        }
#endif

        printk(KERN_CONT "PASSED\n");
        return 0;
}

/*
 * Run the selftest of @type with tracing_selftest_running set for the
 * duration of the test. Returns what run_tracer_selftest() returns.
 */
static int do_run_tracer_selftest(struct tracer *type)
{
        int result;

        /*
         * Tests can take a long time, especially if they are run one after the
         * other, as does happen during bootup when all the tracers are
         * registered. This could cause the soft lockup watchdog to trigger.
         */
        cond_resched();

        tracing_selftest_running = true;
        result = run_tracer_selftest(type);
        tracing_selftest_running = false;

        return result;
}

static __init int init_trace_selftests(void)
{
        struct trace_selftests *p, *n;
        struct tracer *t, **last;
        int ret;

        selftests_can_run = true;

        mutex_lock(&trace_types_lock);

        if (list_empty(&postponed_selftests))
                goto out;

        pr_info("Running postponed tracer tests:\n");

        tracing_selftest_running = true;
        list_for_each_entry_safe(p, n, &postponed_selftests, list) {
                /* This loop can take minutes when sanitizers are enabled, so
                 * lets make sure we allow RCU processing.
                 */
                cond_resched();
                ret = run_tracer_selftest(p->type);
                /* If the test fails, then warn and remove from available_tracers */
                if (ret < 0) {
                        WARN(1, "tracer: %s failed selftest, disabling\n",
                             p->type->name);
                        last = &trace_types;
                        for (t = trace_types; t; t = t->next) {
                                if (t == p->type) {
                                        *last = t->next;
                                        break;
                                }
                                last = &t->next;
                        }
                }
                list_del(&p->list);
                kfree(p);
        }
        tracing_selftest_running = false;

 out:
        mutex_unlock(&trace_types_lock);

        return 0;
}
core_initcall(init_trace_selftests);
#else
/* Stub used when CONFIG_FTRACE_STARTUP_TEST is not set: always returns 0. */
static inline int run_tracer_selftest(struct tracer *type)
{
        return 0;
}
/* Stub used when CONFIG_FTRACE_STARTUP_TEST is not set: always returns 0. */
static inline int do_run_tracer_selftest(struct tracer *type)
{
        return 0;
}
#endif /* CONFIG_FTRACE_STARTUP_TEST */

static void add_tracer_options(struct trace_array *tr, struct tracer *t);

static void __init apply_trace_boot_options(void);

/**
 * register_tracer - register a tracer with the ftrace system.
 * @type: the plugin for the tracer
 *
 * Register a new plugin tracer. The tracer's selftest is run (or
 * postponed) before it is added to the list of tracers. If @type is the
 * tracer named by default_bootup_tracer it is also started right away.
 *
 * Returns 0 on success; -1 if the tracer has no name, a name that is too
 * long, a name that is already registered, or a failing selftest; -EPERM
 * if tracefs is locked down; -ENOMEM if an allocation fails.
 */
int __init register_tracer(struct tracer *type)
{
        struct tracer *t;
        int ret = 0;

        if (!type->name) {
                pr_info("Tracer must have a name\n");
                return -1;
        }

        if (strlen(type->name) >= MAX_TRACER_SIZE) {
                pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
                return -1;
        }

        if (security_locked_down(LOCKDOWN_TRACEFS)) {
                pr_warn("Can not register tracer %s due to lockdown\n",
                        type->name);
                return -EPERM;
        }

        mutex_lock(&trace_types_lock);

        /* Refuse a second tracer with the same name */
        for (t = trace_types; t; t = t->next) {
                if (strcmp(type->name, t->name) != 0)
                        continue;

                pr_info("Tracer %s already registered\n", type->name);
                ret = -1;
                goto out;
        }

        if (!type->set_flag)
                type->set_flag = &dummy_set_flag;

        if (type->flags) {
                if (!type->flags->opts)
                        type->flags->opts = dummy_tracer_opt;
        } else {
                /* allocate a dummy tracer_flags */
                type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL);
                if (!type->flags) {
                        ret = -ENOMEM;
                        goto out;
                }
                type->flags->val = 0;
                type->flags->opts = dummy_tracer_opt;
        }

        /* store the tracer for __set_tracer_option */
        type->flags->trace = type;

        ret = do_run_tracer_selftest(type);
        if (ret < 0)
                goto out;

        /* Push the tracer on the front of the list */
        type->next = trace_types;
        trace_types = type;
        add_tracer_options(&global_trace, type);

 out:
        mutex_unlock(&trace_types_lock);

        /* Everything below runs without trace_types_lock held */
        if (ret || !default_bootup_tracer)
                return ret;

        if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
                return ret;

        printk(KERN_INFO "Starting tracer '%s'\n", type->name);
        /* Do we want this tracer to start on bootup? */
        tracing_set_tracer(&global_trace, type->name);
        default_bootup_tracer = NULL;

        apply_trace_boot_options();

        /* disable other selftests, since this will break it. */
        disable_tracing_selftest("running a tracer");

        return ret;
}

/*
 * Reset the ring buffer of @buf for a single @cpu. Recording is disabled
 * around the reset. Does nothing if @buf has no ring buffer.
 */
static void tracing_reset_cpu(struct array_buffer *buf, int cpu)
{
        struct trace_buffer *rb = buf->buffer;

        if (!rb)
                return;

        ring_buffer_record_disable(rb);

        /* Make sure all commits have finished */
        synchronize_rcu();

        ring_buffer_reset_cpu(rb, cpu);
        ring_buffer_record_enable(rb);
}

/*
 * Reset the ring buffer of @buf for all online CPUs and restart its
 * time_start stamp. Recording is disabled around the reset. Does nothing
 * if @buf has no ring buffer.
 */
void tracing_reset_online_cpus(struct array_buffer *buf)
{
        struct trace_buffer *rb = buf->buffer;

        if (!rb)
                return;

        ring_buffer_record_disable(rb);

        /* Make sure all commits have finished */
        synchronize_rcu();

        buf->time_start = buffer_ftrace_now(buf, buf->cpu);
        ring_buffer_reset_online_cpus(rb);

        ring_buffer_record_enable(rb);
}

/*
 * Reset the buffers of every trace array that has clear_trace set, and
 * clear that flag.
 *
 * Must have trace_types_lock held.
 */
void tracing_reset_all_online_cpus_unlocked(void)
{
        struct trace_array *tr;

        lockdep_assert_held(&trace_types_lock);

        list_for_each_entry(tr, &ftrace_trace_arrays, list) {
                if (tr->clear_trace) {
                        tr->clear_trace = false;
                        tracing_reset_online_cpus(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
                        tracing_reset_online_cpus(&tr->max_buffer);
#endif
                }
        }
}

/*
 * Locked variant of tracing_reset_all_online_cpus_unlocked(): takes
 * trace_types_lock around the call.
 */
void tracing_reset_all_online_cpus(void)
{
        mutex_lock(&trace_types_lock);
        tracing_reset_all_online_cpus_unlocked();
        mutex_unlock(&trace_types_lock);
}

/*
 * Return the stop count of global_trace; non-zero means tracing is
 * currently stopped. The value is read without taking start_lock.
 */
int is_tracing_stopped(void)
{
        return global_trace.stop_count;
}

/*
 * Drop one stop request on @tr and, once the last one is gone, re-enable
 * recording on its buffers. Does nothing when tracing_disabled is set.
 */
static void tracing_start_tr(struct trace_array *tr)
{
        struct trace_buffer *buffer;
        unsigned long flags;

        if (tracing_disabled)
                return;

        raw_spin_lock_irqsave(&tr->start_lock, flags);
        /* Only the transition to zero re-enables recording */
        if (--tr->stop_count) {
                if (WARN_ON_ONCE(tr->stop_count < 0)) {
                        /* Someone screwed up their debugging */
                        tr->stop_count = 0;
                }
                goto out;
        }

        /* Prevent the buffers from switching */
        arch_spin_lock(&tr->max_lock);

        buffer = tr->array_buffer.buffer;
        if (buffer)
                ring_buffer_record_enable(buffer);

#ifdef CONFIG_TRACER_MAX_TRACE
        buffer = tr->max_buffer.buffer;
        if (buffer)
                ring_buffer_record_enable(buffer);
#endif

        arch_spin_unlock(&tr->max_lock);

 out:
        raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}

/**
 * tracing_start - quick start of the tracer
 *
 * If tracing is enabled but was stopped by tracing_stop,
 * this will start the tracer back up.
 *
 * Operates on global_trace.
 */
void tracing_start(void)
{
        /* No "return <expr>" here: this function returns void */
        tracing_start_tr(&global_trace);
}

/*
 * Add one stop request on @tr. The first request disables recording on
 * its buffers; further requests only bump the count.
 */
static void tracing_stop_tr(struct trace_array *tr)
{
        struct trace_buffer *buffer;
        unsigned long flags;

        raw_spin_lock_irqsave(&tr->start_lock, flags);
        /* Already stopped: just account for the additional request */
        if (tr->stop_count++)
                goto out;

        /* Prevent the buffers from switching */
        arch_spin_lock(&tr->max_lock);

        buffer = tr->array_buffer.buffer;
        if (buffer)
                ring_buffer_record_disable(buffer);

#ifdef CONFIG_TRACER_MAX_TRACE
        buffer = tr->max_buffer.buffer;
        if (buffer)
                ring_buffer_record_disable(buffer);
#endif

        arch_spin_unlock(&tr->max_lock);

 out:
        raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}

/**
 * tracing_stop - quick stop of the tracer
 *
 * Light weight way to stop tracing. Use in conjunction with
 * tracing_start.
 *
 * Operates on global_trace.
 */
void tracing_stop(void)
{
        /* No "return <expr>" here: this function returns void */
        tracing_stop_tr(&global_trace);
}

/*
 * Common return-value helper for functions that print into a trace_seq:
 * yields TRACE_TYPE_PARTIAL_LINE if @s overflowed and TRACE_TYPE_HANDLED
 * otherwise, so that all such functions stay in sync.
 */
enum print_line_t trace_handle_return(struct trace_seq *s)
{
        if (trace_seq_has_overflowed(s))
                return TRACE_TYPE_PARTIAL_LINE;

        return TRACE_TYPE_HANDLED;
}
EXPORT_SYMBOL_GPL(trace_handle_return);

/*
 * Return the migration_disabled value of the current task when
 * CONFIG_SMP is defined, and 0 otherwise.
 */
static unsigned short migration_disable_value(void)
{
#if defined(CONFIG_SMP)
        return current->migration_disabled;
#else
        return 0;
#endif
}

/*
 * Build the trace context word for the current execution context.
 *
 * @irqs_status supplies the initial flag bits; NMI, hardirq, softirq,
 * BH-off and the two need-resched flags are added based on the current
 * state. Layout of the result:
 *
 *   bits 16 and up: flags
 *   bits 4-7:       migration disable value, capped at 0xf
 *   bits 0-3:       low byte of preempt_count(), capped at 0xf
 */
unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
{
        unsigned int pc = preempt_count();
        unsigned int ctx_flags = irqs_status;
        unsigned int preempt_depth;
        unsigned int migrate_depth;

        if (pc & NMI_MASK)
                ctx_flags |= TRACE_FLAG_NMI;
        if (pc & HARDIRQ_MASK)
                ctx_flags |= TRACE_FLAG_HARDIRQ;
        if (in_serving_softirq())
                ctx_flags |= TRACE_FLAG_SOFTIRQ;
        if (softirq_count() >> (SOFTIRQ_SHIFT + 1))
                ctx_flags |= TRACE_FLAG_BH_OFF;

        if (tif_need_resched())
                ctx_flags |= TRACE_FLAG_NEED_RESCHED;
        if (test_preempt_need_resched())
                ctx_flags |= TRACE_FLAG_PREEMPT_RESCHED;

        /* Both counters are squeezed into one nibble each */
        preempt_depth = min_t(unsigned int, pc & 0xff, 0xf);
        migrate_depth = min_t(unsigned int, migration_disable_value(), 0xf);

        return (ctx_flags << 16) | preempt_depth | (migrate_depth << 4);
}

/*
 * Non-static wrapper around __trace_buffer_lock_reserve(); all arguments
 * are passed through unchanged and its result is returned as is.
 */
struct ring_buffer_event *
trace_buffer_lock_reserve(struct trace_buffer *buffer,
                          int type,
                          unsigned long len,
                          unsigned int trace_ctx)
{
        return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx);
}

/*
 * Per-CPU temporary event buffer: set by trace_buffered_event_enable()
 * and reset to NULL by trace_buffered_event_disable().
 */
DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
/*
 * Per-CPU usage count for that buffer. The buffer is only handed out when
 * an increment takes this count to exactly 1.
 */
DEFINE_PER_CPU(int, trace_buffered_event_cnt);
/* Number of enable calls not yet matched by a disable call */
static int trace_buffered_event_ref;

/**
 * trace_buffered_event_enable - enable buffering events
 *
 * When events are being filtered, it is quicker to use a temporary
 * buffer to write the event data into if there's a likely chance
 * that it will not be committed. The discard of the ring buffer
 * is not as fast as committing, and is much slower than copying
 * a commit.
 *
 * When an event is to be filtered, allocate per cpu buffers to
 * write the event data into, and if the event is filtered and discarded
 * it is simply dropped, otherwise, the entire data is to be committed
 * in one shot.
 *
 * Only the first caller allocates; later callers just take a reference.
 * Warns if event_mutex is not locked.
 */
void trace_buffered_event_enable(void)
{
        struct ring_buffer_event *ev;
        struct page *pg;
        int cpu;

        WARN_ON_ONCE(!mutex_is_locked(&event_mutex));

        /* Buffers already exist if this is not the first reference */
        if (trace_buffered_event_ref++)
                return;

        for_each_tracing_cpu(cpu) {
                pg = alloc_pages_node(cpu_to_node(cpu),
                                      GFP_KERNEL | __GFP_NORETRY, 0);
                /* This is just an optimization and can handle failures */
                if (!pg) {
                        pr_err("Failed to allocate event buffer\n");
                        break;
                }

                /* Only the event header needs to start out zeroed */
                ev = page_address(pg);
                memset(ev, 0, sizeof(*ev));

                per_cpu(trace_buffered_event, cpu) = ev;

                /* Sanity check: both per-CPU accessors must agree */
                preempt_disable();
                if (cpu == smp_processor_id() &&
                    __this_cpu_read(trace_buffered_event) !=
                    per_cpu(trace_buffered_event, cpu))
                        WARN_ON_ONCE(1);
                preempt_enable();
        }
}

/*
 * Per-CPU callback run through on_each_cpu_mask() at the end of
 * trace_buffered_event_disable(): decrement this CPU's
 * trace_buffered_event_cnt. @data is unused.
 */
static void enable_trace_buffered_event(void *data)
{
        /* Probably not needed, but do it anyway */
        smp_rmb();
        this_cpu_dec(trace_buffered_event_cnt);
}

/*
 * Per-CPU callback run through on_each_cpu_mask() at the start of
 * trace_buffered_event_disable(): increment this CPU's
 * trace_buffered_event_cnt so the buffer counts as in use. @data is
 * unused.
 */
static void disable_trace_buffered_event(void *data)
{
        this_cpu_inc(trace_buffered_event_cnt);
}

/**
 * trace_buffered_event_disable - disable buffering events
 *
 * When a filter is removed, it is faster to not use the buffered
 * events, and to commit directly into the ring buffer. Free up
 * the temp buffers when there are no more users. This requires
 * special synchronization with current events.
 *
 * Warns if event_mutex is not locked, or if called without a matching
 * enable (reference count already zero).
 */
void trace_buffered_event_disable(void)
{
        int cpu;

        WARN_ON_ONCE(!mutex_is_locked(&event_mutex));

        if (WARN_ON_ONCE(!trace_buffered_event_ref))
                return;

        /* Other users remain: keep the buffers */
        if (--trace_buffered_event_ref)
                return;

        /* For each CPU, set the buffer as used. */
        on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event,
                         NULL, true);

        /* Wait for all current users to finish */
        synchronize_rcu();

        for_each_tracing_cpu(cpu) {
                free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
                per_cpu(trace_buffered_event, cpu) = NULL;
        }

        /*
         * Wait for all CPUs that potentially started checking if they can use
         * their event buffer only after the previous synchronize_rcu() call and
         * they still read a valid pointer from trace_buffered_event. It must be
         * ensured they don't see cleared trace_buffered_event_cnt else they
         * could wrongly decide to use the pointed-to buffer which is now freed.
         */
        synchronize_rcu();

        /* For each CPU, relinquish the buffer */
        on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL,
                         true);
}

/*
 * Fallback buffer used by trace_event_buffer_lock_reserve() when the
 * normal reserve fails and the event file has EVENT_FILE_FL_TRIGGER_COND
 * set.
 */
static struct trace_buffer *temp_buffer;

/*
 * Reserve space for an event of type @type and @len bytes on behalf of
 * @trace_file.
 *
 * *@current_rb is set to the ring buffer of the file's trace_array, or to
 * temp_buffer when the trigger fallback at the end is taken.
 *
 * When the per-CPU buffered event is handed out instead of a ring buffer
 * reservation, the function returns with preemption still disabled.
 *
 * Returns the event to fill in, or NULL if no reservation succeeded.
 */
struct ring_buffer_event *
trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
                          struct trace_event_file *trace_file,
                          int type, unsigned long len,
                          unsigned int trace_ctx)
{
        struct ring_buffer_event *entry;
        struct trace_array *tr = trace_file->tr;
        int val;

        *current_rb = tr->array_buffer.buffer;

        if (!tr->no_filter_buffering_ref &&
            (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED))) {
                preempt_disable_notrace();
                /*
                 * Filtering is on, so try to use the per cpu buffer first.
                 * This buffer will simulate a ring_buffer_event,
                 * where the type_len is zero and the array[0] will
                 * hold the full length.
                 * (see include/linux/ring-buffer.h for details on
                 *  how the ring_buffer_event is structured).
                 *
                 * Using a temp buffer during filtering and copying it
                 * on a matched filter is quicker than writing directly
                 * into the ring buffer and then discarding it when
                 * it doesn't match. That is because the discard
                 * requires several atomic operations to get right.
                 * Copying on match and doing nothing on a failed match
                 * is still quicker than no copy on match, but having
                 * to discard out of the ring buffer on a failed match.
                 */
                if ((entry = __this_cpu_read(trace_buffered_event))) {
                        /* Room left in the page after the event header */
                        int max_len = PAGE_SIZE - struct_size(entry, array, 1);

                        val = this_cpu_inc_return(trace_buffered_event_cnt);

                        /*
                         * Preemption is disabled, but interrupts and NMIs
                         * can still come in now. If that happens after
                         * the above increment, then it will have to go
                         * back to the old method of allocating the event
                         * on the ring buffer, and if the filter fails, it
                         * will have to call ring_buffer_discard_commit()
                         * to remove it.
                         *
                         * Need to also check the unlikely case that the
                         * length is bigger than the temp buffer size.
                         * If that happens, then the reserve is pretty much
                         * guaranteed to fail, as the ring buffer currently
                         * only allows events less than a page. But that may
                         * change in the future, so let the ring buffer reserve
                         * handle the failure in that case.
                         */
                        if (val == 1 && likely(len <= max_len)) {
                                trace_event_setup(entry, type, trace_ctx);
                                entry->array[0] = len;
                                /* Return with preemption disabled */
                                return entry;
                        }
                        /* Buffer busy or event too large: undo the increment */
                        this_cpu_dec(trace_buffered_event_cnt);
                }
                /* __trace_buffer_lock_reserve() disables preemption */
                preempt_enable_notrace();
        }

        entry = __trace_buffer_lock_reserve(*current_rb, type, len,
                                            trace_ctx);
        /*
         * If tracing is off, but we have triggers enabled
         * we still need to look at the event data. Use the temp_buffer
         * to store the trace event for the trigger to use. It's recursive
         * safe and will not be recorded anywhere.
         */
        if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
                *current_rb = temp_buffer;
                entry = __trace_buffer_lock_reserve(*current_rb, type, len,
                                                    trace_ctx);
        }
        return entry;
}
EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);

/* Held by output_printk() while it formats and prints through tracepoint_print_iter */
static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock);
/* Held by tracepoint_printk_sysctl() while it updates tracepoint_printk and its static key */
static DEFINE_MUTEX(tracepoint_printk_mutex);

/*
 * Format the event held in @fbuffer with its event's trace() callback and
 * send the resulting text to printk().
 *
 * Nothing is printed if the event has no trace() callback, if its file is
 * soft disabled, or if the file is filtered and the entry does not match
 * the filter.
 */
static void output_printk(struct trace_event_buffer *fbuffer)
{
        struct trace_iterator *iter = tracepoint_print_iter;
        struct trace_event_file *file;
        struct trace_event_call *call;
        unsigned long irqflags;

        /* We should never get here if iter is NULL */
        if (WARN_ON_ONCE(!iter))
                return;

        file = fbuffer->trace_file;
        call = file->event_call;
        if (!call || !call->event.funcs || !call->event.funcs->trace)
                return;

        if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags))
                return;

        if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
            !filter_match_preds(file->filter, fbuffer->entry))
                return;

        /* The iterator is shared, so serialise its use */
        raw_spin_lock_irqsave(&tracepoint_iter_lock, irqflags);

        trace_seq_init(&iter->seq);
        iter->ent = fbuffer->entry;
        call->event.funcs->trace(iter, 0, &call->event);
        /* NUL-terminate the sequence before handing it to printk() */
        trace_seq_putc(&iter->seq, 0);
        printk("%s", iter->seq.buffer);

        raw_spin_unlock_irqrestore(&tracepoint_iter_lock, irqflags);
}

/*
 * sysctl handler for tracepoint_printk. The value is read or written via
 * proc_dointvec(); when it changes, the tracepoint_printk_key static key
 * is enabled or disabled to match.
 *
 * Returns the result of proc_dointvec().
 */
int tracepoint_printk_sysctl(struct ctl_table *table, int write,
                             void *buffer, size_t *lenp,
                             loff_t *ppos)
{
        int prev_value;
        int ret;

        mutex_lock(&tracepoint_printk_mutex);
        prev_value = tracepoint_printk;

        ret = proc_dointvec(table, write, buffer, lenp, ppos);

        /*
         * This will force exiting early, as tracepoint_printk
         * is always zero when tracepoint_printk_iter is not allocated
         */
        if (!tracepoint_print_iter)
                tracepoint_printk = 0;

        /* Only touch the static key when the setting really changed */
        if (prev_value != tracepoint_printk) {
                if (tracepoint_printk)
                        static_key_enable(&tracepoint_printk_key.key);
                else
                        static_key_disable(&tracepoint_printk_key.key);
        }

        mutex_unlock(&tracepoint_printk_mutex);

        return ret;
}

/*
 * Finish the event described by @fbuffer.
 *
 * If __event_trigger_test_discard() returns non-zero, the print, export
 * and commit steps are skipped. Otherwise the event is optionally printed
 * (tracepoint_printk_key), optionally exported
 * (trace_event_exports_enabled) and then committed. In both cases
 * event_triggers_post_call() runs when tt was set to something other than
 * ETT_NONE.
 */
void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
{
        enum event_trigger_type tt = ETT_NONE;
        struct trace_event_file *file = fbuffer->trace_file;

        if (__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event,
                        fbuffer->entry, &tt))
                goto discard;

        if (static_key_false(&tracepoint_printk_key.key))
                output_printk(fbuffer);

        if (static_branch_unlikely(&trace_event_exports_enabled))
                ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT);

        trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer,
                        fbuffer->event, fbuffer->trace_ctx, fbuffer->regs);

discard:
        if (tt)
                event_triggers_post_call(file, tt);

}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);

/*
 * Number of stack frames trace_buffer_unlock_commit_regs() asks to skip
 * when it is called without pt_regs.
 *
 * Skip 3:
 *
 *   trace_buffer_unlock_commit_regs()
 *   trace_event_buffer_commit()
 *   trace_event_raw_event_xxx()
 */
# define STACK_SKIP 3

/*
 * Commit @event to @buffer, then record the kernel stack and the user
 * stack for it.
 *
 * With @regs set the stack is recorded from those registers and no frames
 * are skipped; without them STACK_SKIP frames are skipped.
 */
void trace_buffer_unlock_commit_regs(struct trace_array *tr,
                                     struct trace_buffer *buffer,
                                     struct ring_buffer_event *event,
                                     unsigned int trace_ctx,
                                     struct pt_regs *regs)
{
        /*
         * If regs is not set, then skip the necessary functions.
         * Note, we can still get here via blktrace, wakeup tracer
         * and mmiotrace, but that's ok if they lose a function or
         * two. They are not that meaningful.
         */
        int skip = regs ? 0 : STACK_SKIP;

        __buffer_unlock_commit(buffer, event);

        ftrace_trace_stack(tr, buffer, trace_ctx, skip, regs);
        ftrace_trace_userstack(tr, buffer, trace_ctx);
}

/*
 * Similar to trace_buffer_unlock_commit_regs() but do not dump stack:
 * only the commit of @event to @buffer is performed.
 */
void
trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
                                   struct ring_buffer_event *event)
{
        __buffer_unlock_commit(buffer, event);
}

/*
 * Record a TRACE_FN event holding @ip and @parent_ip in the buffer of @tr.
 *
 * Nothing is recorded if the reservation fails. If
 * call_filter_check_discard() returns non-zero the event is neither
 * exported nor committed here.
 */
void
trace_function(struct trace_array *tr, unsigned long ip, unsigned long
               parent_ip, unsigned int trace_ctx)
{
        struct trace_buffer *buffer = tr->array_buffer.buffer;
        struct ring_buffer_event *event;
        struct ftrace_entry *entry;

        event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
                                            trace_ctx);
        if (!event)
                return;

        entry = ring_buffer_event_data(event);
        entry->ip = ip;
        entry->parent_ip = parent_ip;

        if (call_filter_check_discard(&event_function, entry, buffer, event))
                return;

        if (static_branch_unlikely(&trace_function_exports_enabled))
                ftrace_exports(event, TRACE_EXPORT_FUNCTION);
        __buffer_unlock_commit(buffer, event);
}

#ifdef CONFIG_STACKTRACE

/* Allow 4 levels of nesting: normal, softirq, irq, NMI */
#define FTRACE_KSTACK_NESTING        4

/* Number of unsigned long slots in the scratch stack of one nesting level. */
#define FTRACE_KSTACK_ENTRIES        (PAGE_SIZE / FTRACE_KSTACK_NESTING)

/* Scratch area a single stack trace is saved into before it is copied out. */
struct ftrace_stack {
        unsigned long                calls[FTRACE_KSTACK_ENTRIES];
};


/* One scratch stack for each supported nesting level. */
struct ftrace_stacks {
        struct ftrace_stack        stacks[FTRACE_KSTACK_NESTING];
};

/* Per-CPU scratch stacks used by __ftrace_trace_stack(). */
static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
/*
 * Per-CPU count of scratch stacks currently claimed; its pre-increment
 * value is the index into ftrace_stacks.stacks[] that the claimer uses.
 */
static DEFINE_PER_CPU(int, ftrace_stack_reserve);

/*
 * Save a kernel stack trace into this CPU's scratch stack for the current
 * nesting level, then copy it into a TRACE_STACK event reserved in @buffer.
 *
 * @buffer:    ring buffer the TRACE_STACK event is reserved in
 * @trace_ctx: trace context passed on to the event reservation
 * @skip:      number of leading frames to leave out of the trace
 * @regs:      if non-NULL, the trace is taken from these registers via
 *             stack_trace_save_regs(); otherwise stack_trace_save() is used
 *
 * Runs with preemption disabled for the whole time the per-CPU scratch
 * stack is claimed. If all FTRACE_KSTACK_NESTING levels are already in
 * use, a one-time warning is emitted and nothing is recorded.
 */
static void __ftrace_trace_stack(struct trace_buffer *buffer,
                                 unsigned int trace_ctx,
                                 int skip, struct pt_regs *regs)
{
        struct trace_event_call *call = &event_kernel_stack;
        struct ring_buffer_event *event;
        unsigned int size, nr_entries;
        struct ftrace_stack *fstack;
        struct stack_entry *entry;
        int stackidx;

        /*
         * Add one, for this function and the call to save_stack_trace()
         * If regs is set, then these functions will not be in the way.
         */
#ifndef CONFIG_UNWINDER_ORC
        if (!regs)
                skip++;
#endif

        preempt_disable_notrace();

        /* Claim the next nesting level; stackidx is the pre-increment count. */
        stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;

        /* This should never happen. If it does, yell once and skip */
        if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING))
                goto out;

        /*
         * The above __this_cpu_inc_return() is 'atomic' cpu local. An
         * interrupt will either see the value pre increment or post
         * increment. If the interrupt happens pre increment it will have
         * restored the counter when it returns.  We just need a barrier to
         * keep gcc from moving things around.
         */
        barrier();

        fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx;
        size = ARRAY_SIZE(fstack->calls);

        if (regs) {
                nr_entries = stack_trace_save_regs(regs, fstack->calls,
                                                   size, skip);
        } else {
                nr_entries = stack_trace_save(fstack->calls, size, skip);
        }

        /* Size the event for exactly the entries that were captured. */
        event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
                                    struct_size(entry, caller, nr_entries),
                                    trace_ctx);
        if (!event)
                goto out;
        entry = ring_buffer_event_data(event);

        entry->size = nr_entries;
        memcpy(&entry->caller, fstack->calls,
               flex_array_size(entry, caller, nr_entries));

        if (!call_filter_check_discard(call, entry, buffer, event))
                __buffer_unlock_commit(buffer, event);

 out:
        /* Again, don't let gcc optimize things here */
        barrier();
        /* Release the nesting level claimed above (also on the warn path). */
        __this_cpu_dec(ftrace_stack_reserve);
        preempt_enable_notrace();

}

/*
 * Record a kernel stack trace into @buffer, but only when @tr has the
 * TRACE_ITER_STACKTRACE flag set.
 */
static inline void ftrace_trace_stack(struct trace_array *tr,
                                      struct trace_buffer *buffer,
                                      unsigned int trace_ctx,
                                      int skip, struct pt_regs *regs)
{
        if (tr->trace_flags & TRACE_ITER_STACKTRACE)
                __ftrace_trace_stack(buffer, trace_ctx, skip, regs);
}

/*
 * Record a kernel stack trace into the buffer of @tr regardless of the
 * stacktrace flag, taking care of contexts where RCU is not watching.
 */
void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,
                   int skip)
{
        struct trace_buffer *buffer = tr->array_buffer.buffer;

        if (!rcu_is_watching()) {
                if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY)))
                        return;

                /*
                 * An NMI enables RCU through ct_nmi_enter(). Getting here
                 * from NMI context therefore means the NMI hit somewhere
                 * critical, and ct_irq_enter() must not be called from NMI.
                 */
                if (unlikely(in_nmi()))
                        return;

                ct_irq_enter_irqson();
                __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
                ct_irq_exit_irqson();
                return;
        }

        __ftrace_trace_stack(buffer, trace_ctx, skip, NULL);
}

/**
 * trace_dump_stack - write a kernel stack back trace to the global trace buffer
 * @skip: how many of the innermost functions (helper handlers) to leave out
 *
 * Does nothing while tracing is disabled or the tracing selftests run.
 */
void trace_dump_stack(int skip)
{
        struct trace_buffer *buffer;

        if (tracing_disabled || tracing_selftest_running)
                return;

#ifndef CONFIG_UNWINDER_ORC
        /* Account for this function's own frame. */
        skip++;
#endif
        buffer = global_trace.array_buffer.buffer;
        __ftrace_trace_stack(buffer, tracing_gen_ctx(), skip, NULL);
}
EXPORT_SYMBOL_GPL(trace_dump_stack);

#ifdef CONFIG_USER_STACKTRACE_SUPPORT
/* Per-CPU recursion guard: non-zero while a user stack trace is being saved. */
static DEFINE_PER_CPU(int, user_stack_count);

/*
 * Record a TRACE_USER_STACK event holding the current task's tgid and its
 * user-space stack trace, if @tr has TRACE_ITER_USERSTACKTRACE set.
 *
 * Nothing is recorded from NMI context, or when this CPU is already in the
 * middle of saving a user stack.
 */
static void
ftrace_trace_userstack(struct trace_array *tr,
                       struct trace_buffer *buffer, unsigned int trace_ctx)
{
        struct trace_event_call *call = &event_user_stack;
        struct ring_buffer_event *event;
        struct userstack_entry *entry;

        if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE))
                return;

        /*
         * NMIs can not handle page faults, even with fix ups.
         * The save user stack can (and often does) fault.
         */
        if (unlikely(in_nmi()))
                return;

        /*
         * prevent recursion, since the user stack tracing may
         * trigger other kernel events.
         */
        preempt_disable();
        if (__this_cpu_read(user_stack_count))
                goto out;

        __this_cpu_inc(user_stack_count);

        event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
                                            sizeof(*entry), trace_ctx);
        if (!event)
                goto out_drop_count;
        entry        = ring_buffer_event_data(event);

        entry->tgid                = current->tgid;
        /* Zero all slots first so that unused ones are well defined. */
        memset(&entry->caller, 0, sizeof(entry->caller));

        stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);
        if (!call_filter_check_discard(call, entry, buffer, event))
                __buffer_unlock_commit(buffer, event);

 out_drop_count:
        __this_cpu_dec(user_stack_count);
 out:
        preempt_enable();
}
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
/* Built without CONFIG_USER_STACKTRACE_SUPPORT: nothing to record. */
static void
ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer,
                       unsigned int trace_ctx)
{
}
#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */

#endif /* CONFIG_STACKTRACE */

/*
 * Store the 64-bit @delta in @entry as two 32-bit halves:
 * top_delta_ts gets bits 63..32, bottom_delta_ts gets bits 31..0.
 */
static inline void func_repeats_set_delta_ts(struct func_repeats_entry *entry,
                                             unsigned long long delta)
{
        entry->top_delta_ts = delta >> 32;
        entry->bottom_delta_ts = delta & U32_MAX;
}

/*
 * Write a TRACE_FUNC_REPEATS event describing @last_info: the repeated
 * function, its parent, the repeat count, and how long before this event's
 * time stamp the last call happened.
 */
void trace_last_func_repeats(struct trace_array *tr,
                             struct trace_func_repeats *last_info,
                             unsigned int trace_ctx)
{
        struct trace_buffer *buffer = tr->array_buffer.buffer;
        struct ring_buffer_event *event;
        struct func_repeats_entry *entry;
        u64 elapsed;

        event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS,
                                            sizeof(*entry), trace_ctx);
        if (!event)
                return;

        elapsed = ring_buffer_event_time_stamp(buffer, event) -
                  last_info->ts_last_call;

        entry = ring_buffer_event_data(event);
        func_repeats_set_delta_ts(entry, elapsed);
        entry->count = last_info->count;
        entry->parent_ip = last_info->parent_ip;
        entry->ip = last_info->ip;

        __buffer_unlock_commit(buffer, event);
}

/*
 * created for use with alloc_percpu
 *
 * Scratch space that the trace printk paths format into before copying
 * the result into the ring buffer.
 */
struct trace_buffer_struct {
        /* Buffers handed out on this CPU: get_trace_buf() ++, put_trace_buf() -- */
        int nesting;
        /* One TRACE_BUF_SIZE scratch buffer per nesting level (at most 4). */
        char buffer[4][TRACE_BUF_SIZE];
};

/* NULL until alloc_percpu_trace_buffer() has succeeded. */
static struct trace_buffer_struct __percpu *trace_percpu_buffer;

/*
 * Hand out this CPU's scratch buffer for the current nesting level,
 * which is what makes lockless recording possible. Returns NULL when the
 * per-CPU buffers were never allocated or all four levels are in use.
 * Every successful call must be paired with put_trace_buf().
 */
static char *get_trace_buf(void)
{
        struct trace_buffer_struct *pcpu = this_cpu_ptr(trace_percpu_buffer);

        if (!trace_percpu_buffer)
                return NULL;

        if (pcpu->nesting >= 4)
                return NULL;

        pcpu->nesting++;

        /* Interrupts must see nesting incremented before we use the buffer */
        barrier();

        return pcpu->buffer[pcpu->nesting - 1];
}

/*
 * Give back the scratch buffer obtained from get_trace_buf() by dropping
 * this CPU's nesting count by one.
 */
static void put_trace_buf(void)
{
        /* Don't let the decrement of nesting leak before this */
        barrier();
        this_cpu_dec(trace_percpu_buffer->nesting);
}

/*
 * Allocate the per-CPU trace printk scratch buffers unless that was
 * already done. Returns 0 on success (or if already allocated) and
 * -ENOMEM if the allocation fails.
 */
static int alloc_percpu_trace_buffer(void)
{
        struct trace_buffer_struct __percpu *buffers;

        if (!trace_percpu_buffer) {
                buffers = alloc_percpu(struct trace_buffer_struct);
                if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
                        return -ENOMEM;

                trace_percpu_buffer = buffers;
        }

        return 0;
}

/* Set to 1 once trace_printk_init_buffers() has completed successfully. */
static int buffers_allocated;

/*
 * One-time setup for trace_printk(): allocate the per-CPU scratch buffers,
 * print the "debug kernel" banner, expand the global trace buffers and,
 * when the global ring buffer already exists, start cmdline recording.
 */
void trace_printk_init_buffers(void)
{
        /* Already set up, or the scratch buffers could not be allocated. */
        if (buffers_allocated || alloc_percpu_trace_buffer())
                return;

        /* trace_printk() is for debug use only. Don't use it in production. */

        pr_warn("\n");
        pr_warn("**********************************************************\n");
        pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
        pr_warn("**                                                      **\n");
        pr_warn("** trace_printk() being used. Allocating extra memory.  **\n");
        pr_warn("**                                                      **\n");
        pr_warn("** This means that this is a DEBUG kernel and it is     **\n");
        pr_warn("** unsafe for production use.                           **\n");
        pr_warn("**                                                      **\n");
        pr_warn("** If you see this message and you are not debugging    **\n");
        pr_warn("** the kernel, report this immediately to your vendor!  **\n");
        pr_warn("**                                                      **\n");
        pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
        pr_warn("**********************************************************\n");

        /* Expand the buffers to set size */
        tracing_update_buffers(&global_trace);

        buffers_allocated = 1;

        /*
         * Modules may call trace_printk_init_buffers() too. When the
         * global_trace buffer already exists at this point, the caller
         * was module code and cmdline recording has to be started
         * directly from here.
         */
        if (global_trace.array_buffer.buffer)
                tracing_start_cmdline_record();
}
EXPORT_SYMBOL_GPL(trace_printk_init_buffers);

/* Start cmdline (comm) recording if the trace_printk buffers were set up. */
void trace_printk_start_comm(void)
{
        if (buffers_allocated)
                tracing_start_cmdline_record();
}

/*
 * Start (@enabled non-zero) or stop (@enabled zero) cmdline recording.
 * Does nothing unless the trace_printk buffers were set up.
 */
static void trace_printk_start_stop_comm(int enabled)
{
        if (!buffers_allocated)
                return;

        if (!enabled) {
                tracing_stop_cmdline_record();
                return;
        }

        tracing_start_cmdline_record();
}

/**
 * trace_vbprintk - write binary msg to tracing buffer
 * @ip:    The address of the caller
 * @fmt:   The string format to write to the buffer
 * @args:  Arguments for @fmt
 *
 * The arguments are first encoded by vbin_printf() into a per-CPU scratch
 * buffer and then copied, together with @ip and the @fmt pointer, into a
 * TRACE_BPRINT event in the global trace buffer.
 *
 * Return: 0 if tracing is disabled, the selftests are running or no scratch
 * buffer is available; otherwise the value returned by vbin_printf(), even
 * when no event ended up being written.
 */
int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
{
        struct trace_event_call *call = &event_bprint;
        struct ring_buffer_event *event;
        struct trace_buffer *buffer;
        struct trace_array *tr = &global_trace;
        struct bprint_entry *entry;
        unsigned int trace_ctx;
        char *tbuffer;
        int len = 0, size;

        if (unlikely(tracing_selftest_running || tracing_disabled))
                return 0;

        /* Don't pollute graph traces with trace_vprintk internals */
        pause_graph_tracing();

        trace_ctx = tracing_gen_ctx();
        preempt_disable_notrace();

        tbuffer = get_trace_buf();
        if (!tbuffer) {
                len = 0;
                goto out_nobuffer;
        }

        /* The scratch buffer size is given to vbin_printf() in 32-bit words. */
        len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args);

        /* Did not fit in the scratch buffer, or vbin_printf() failed. */
        if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0)
                goto out_put;

        size = sizeof(*entry) + sizeof(u32) * len;
        buffer = tr->array_buffer.buffer;
        ring_buffer_nest_start(buffer);
        event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
                                            trace_ctx);
        if (!event)
                goto out;
        entry = ring_buffer_event_data(event);
        entry->ip                        = ip;
        entry->fmt                        = fmt;

        memcpy(entry->buf, tbuffer, sizeof(u32) * len);
        if (!call_filter_check_discard(call, entry, buffer, event)) {
                __buffer_unlock_commit(buffer, event);
                /*
                 * NOTE(review): the skip count of 6 presumably hides the
                 * trace_printk call chain from the stack trace - confirm.
                 */
                ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
        }

out:
        ring_buffer_nest_end(buffer);
out_put:
        put_trace_buf();

out_nobuffer:
        preempt_enable_notrace();
        unpause_graph_tracing();

        return len;
}
EXPORT_SYMBOL_GPL(trace_vbprintk);

/*
 * Format @fmt/@args as text into a per-CPU scratch buffer and copy the
 * resulting string (including its terminating NUL) into a TRACE_PRINT
 * event in @buffer, tagged with @ip.
 *
 * Returns 0 if tracing is disabled or no scratch buffer is available;
 * otherwise the length reported by vscnprintf(), even when the event could
 * not be reserved.
 */
__printf(3, 0)
static int
__trace_array_vprintk(struct trace_buffer *buffer,
                      unsigned long ip, const char *fmt, va_list args)
{
        struct trace_event_call *call = &event_print;
        struct ring_buffer_event *event;
        int len = 0, size;
        struct print_entry *entry;
        unsigned int trace_ctx;
        char *tbuffer;

        if (tracing_disabled)
                return 0;

        /* Don't pollute graph traces with trace_vprintk internals */
        pause_graph_tracing();

        trace_ctx = tracing_gen_ctx();
        preempt_disable_notrace();


        tbuffer = get_trace_buf();
        if (!tbuffer) {
                len = 0;
                goto out_nobuffer;
        }

        len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);

        /* Room for the entry header, the text and its terminating NUL. */
        size = sizeof(*entry) + len + 1;
        ring_buffer_nest_start(buffer);
        event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
                                            trace_ctx);
        if (!event)
                goto out;
        entry = ring_buffer_event_data(event);
        entry->ip = ip;

        memcpy(&entry->buf, tbuffer, len + 1);
        if (!call_filter_check_discard(call, entry, buffer, event)) {
                __buffer_unlock_commit(buffer, event);
                /*
                 * Note that the stacktrace flag consulted here is the one of
                 * global_trace, whichever instance @buffer belongs to.
                 */
                ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL);
        }

out:
        ring_buffer_nest_end(buffer);
        put_trace_buf();

out_nobuffer:
        preempt_enable_notrace();
        unpause_graph_tracing();

        return len;
}

/*
 * va_list flavour of printing into the buffer of @tr. Writes to the
 * top-level buffer are dropped (returning 0) while the selftests run.
 */
__printf(3, 0)
int trace_array_vprintk(struct trace_array *tr,
                        unsigned long ip, const char *fmt, va_list args)
{
        struct trace_buffer *buffer;

        if (tr == &global_trace && tracing_selftest_running)
                return 0;

        buffer = tr->array_buffer.buffer;
        return __trace_array_vprintk(buffer, ip, fmt, args);
}

/**
 * trace_array_printk - Print a message to a specific instance
 * @tr: The instance trace_array descriptor
 * @ip: The instruction pointer that this is called from.
 * @fmt: The format to print (printf format)
 *
 * If a subsystem sets up its own instance, they have the right to
 * printk strings into their tracing instance buffer using this
 * function. Note, this function will not write into the top level
 * buffer (use trace_printk() for that), as writing into the top level
 * buffer should only have events that can be individually disabled.
 * trace_printk() is only used for debugging a kernel, and should not
 * be ever incorporated in normal use.
 *
 * trace_array_printk() can be used, as it will not add noise to the
 * top level tracing buffer.
 *
 * Note, trace_array_init_printk() must be called on @tr before this
 * can be used.
 *
 * Return: -ENOENT if @tr is NULL; 0 if @tr is the top level trace array
 * or @tr does not have TRACE_ITER_PRINTK set; otherwise the value returned
 * by trace_array_vprintk().
 */
/*
 * This function is variadic, so the variable arguments start at position 4
 * and must be checked against @fmt; a first-to-check index of 0 is only
 * meant for the va_list flavours.
 */
__printf(3, 4)
int trace_array_printk(struct trace_array *tr,
                       unsigned long ip, const char *fmt, ...)
{
        int ret;
        va_list ap;

        if (!tr)
                return -ENOENT;

        /* This is only allowed for created instances */
        if (tr == &global_trace)
                return 0;

        if (!(tr->trace_flags & TRACE_ITER_PRINTK))
                return 0;

        va_start(ap, fmt);
        ret = trace_array_vprintk(tr, ip, fmt, ap);
        va_end(ap);
        return ret;
}
EXPORT_SYMBOL_GPL(trace_array_printk);

/**
 * trace_array_init_printk - Initialize buffers for trace_array_printk()
 * @tr: The trace array to initialize the buffers for
 *
 * trace_array_printk() writes only into instances, which makes it fine to
 * keep in the kernel (unlike trace_printk()). Call this on a trace_array
 * before using trace_array_printk() on it.
 *
 * Return: -ENOENT if @tr is NULL, -EINVAL if @tr is the top level trace
 * array, otherwise the result of allocating the per-CPU printk buffers.
 */
int trace_array_init_printk(struct trace_array *tr)
{
        if (!tr)
                return -ENOENT;

        /* This is only allowed for created instances */
        return tr == &global_trace ? -EINVAL : alloc_percpu_trace_buffer();
}
EXPORT_SYMBOL_GPL(trace_array_init_printk);

/*
 * printf-style write straight into @buffer. Returns 0 without writing
 * when global_trace does not have TRACE_ITER_PRINTK set.
 */
__printf(3, 4)
int trace_array_printk_buf(struct trace_buffer *buffer,
                           unsigned long ip, const char *fmt, ...)
{
        va_list ap;
        int ret = 0;

        if (global_trace.trace_flags & TRACE_ITER_PRINTK) {
                va_start(ap, fmt);
                ret = __trace_array_vprintk(buffer, ip, fmt, ap);
                va_end(ap);
        }

        return ret;
}

/* va_list flavour of printing into the top level (global) trace array. */
__printf(2, 0)
int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
        struct trace_array *tr = &global_trace;

        return trace_array_vprintk(tr, ip, fmt, args);
}
EXPORT_SYMBOL_GPL(trace_vprintk);

/*
 * Step @iter forward by one entry: bump its index and, when the current
 * CPU has a ring buffer iterator, advance that as well.
 */
static void trace_iterator_increment(struct trace_iterator *iter)
{
        struct ring_buffer_iter *buf_iter;

        buf_iter = trace_buffer_iter(iter, iter->cpu);
        iter->idx++;

        if (!buf_iter)
                return;

        ring_buffer_iter_advance(buf_iter);
}

/*
 * Look at, without consuming, the next event of @cpu.
 *
 * On success the event's time stamp is stored through @ts, its length in
 * iter->ent_size, and its payload is returned. When there is no event,
 * iter->ent_size is set to 0 and NULL is returned. With a ring buffer
 * iterator, *@lost_events (if given) becomes (unsigned long)-1 when
 * events were dropped and 0 otherwise.
 */
static struct trace_entry *
peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
                unsigned long *lost_events)
{
        struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);
        struct ring_buffer_event *event;

        if (!buf_iter) {
                event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts,
                                         lost_events);
        } else {
                event = ring_buffer_iter_peek(buf_iter, ts);
                if (lost_events) {
                        if (ring_buffer_iter_dropped(buf_iter))
                                *lost_events = (unsigned long)-1;
                        else
                                *lost_events = 0;
                }
        }

        if (!event) {
                iter->ent_size = 0;
                return NULL;
        }

        iter->ent_size = ring_buffer_event_length(event);
        return ring_buffer_event_data(event);
}

/*
 * Find the next entry to show without consuming it.
 *
 * For a per-CPU trace file only that CPU is peeked at. Otherwise every
 * tracing CPU with a non-empty buffer is peeked at and the entry with the
 * smallest time stamp wins; its CPU, time stamp and lost-event count are
 * reported through @ent_cpu, @ent_ts and @missing_events (each optional),
 * and iter->ent_size is set to its size.
 */
static struct trace_entry *
__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
                  unsigned long *missing_events, u64 *ent_ts)
{
        struct trace_buffer *buffer = iter->array_buffer->buffer;
        struct trace_entry *candidate, *best = NULL;
        unsigned long lost_events = 0, best_lost = 0;
        u64 best_ts = 0, ts;
        int best_cpu = -1;
        int best_size = 0;
        int cpu;

        /* A per_cpu trace file: peek directly instead of scanning all CPUs. */
        if (iter->cpu_file > RING_BUFFER_ALL_CPUS) {
                int only_cpu = iter->cpu_file;

                if (ring_buffer_empty_cpu(buffer, only_cpu))
                        return NULL;

                candidate = peek_next_entry(iter, only_cpu, ent_ts,
                                            missing_events);
                if (ent_cpu)
                        *ent_cpu = only_cpu;

                return candidate;
        }

        for_each_tracing_cpu(cpu) {

                if (ring_buffer_empty_cpu(buffer, cpu))
                        continue;

                candidate = peek_next_entry(iter, cpu, &ts, &lost_events);
                if (!candidate)
                        continue;

                /* Keep the current best unless this one is strictly older. */
                if (best && ts >= best_ts)
                        continue;

                best = candidate;
                best_cpu = cpu;
                best_ts = ts;
                best_lost = lost_events;
                best_size = iter->ent_size;
        }

        iter->ent_size = best_size;

        if (ent_cpu)
                *ent_cpu = best_cpu;

        if (ent_ts)
                *ent_ts = best_ts;

        if (missing_events)
                *missing_events = best_lost;

        return best;
}

#define STATIC_FMT_BUF_SIZE        128
static char static_fmt_buf[STATIC_FMT_BUF_SIZE];

/*
 * Grow iter->fmt by STATIC_FMT_BUF_SIZE bytes.
 *
 * Returns the (possibly moved) buffer, or NULL when it must not or could
 * not be grown, in which case iter->fmt and iter->fmt_size are untouched.
 */
char *trace_iter_expand_format(struct trace_iterator *iter)
{
        char *grown;

        /*
         * With tp_printk iter->tr is NULL and this may run where calling
         * krealloc() is not safe; the static buffer can not be resized
         * either.
         */
        if (!iter->tr || iter->fmt == static_fmt_buf)
                return NULL;

        grown = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE,
                         GFP_KERNEL);
        if (!grown)
                return NULL;

        iter->fmt_size += STATIC_FMT_BUF_SIZE;
        iter->fmt = grown;

        return grown;
}

/*
 * Decide whether @str, referenced by a %s of the event being printed, may
 * be dereferenced. @star/@len describe a "%.*s" precision: a zero length
 * string is always fine as nothing gets read.
 */
static bool trace_safe_str(struct trace_iterator *iter, const char *str,
                           bool star, int len)
{
        unsigned long addr = (unsigned long)str;
        unsigned long ent_start = (unsigned long)iter->ent;
        unsigned long seq_start = (unsigned long)iter->tmp_seq.buffer;
        struct trace_event_call *call;
        struct trace_event *trace_event;

        if (star && !len)
                return true;

        /* Inside the event's own data */
        if (addr >= ent_start && addr < ent_start + iter->ent_size)
                return true;

        /* Inside the temporary seq buffer */
        if (addr >= seq_start && addr < seq_start + TRACE_SEQ_BUFFER_SIZE)
                return true;

        /* Core rodata is never freed; tracepoint strings are fine too */
        if (is_kernel_rodata(addr) || trace_is_tracepoint_string(str))
                return true;

        /*
         * What is left is a module event pointing at data in its own
         * module core, which is OK as well.
         */
        if (!iter->ent)
                return false;

        trace_event = ftrace_find_event(iter->ent->type);
        if (!trace_event)
                return false;

        call = container_of(trace_event, struct trace_event_call, event);
        if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module)
                return false;

        /* rodata would be preferable, but module core will do */
        return within_module_core(addr, call->module);
}

/*
 * Incremented by test_can_verify() when the string verifier can not work;
 * trace_check_vprintf() then prints without checking any %s argument.
 */
static DEFINE_STATIC_KEY_FALSE(trace_no_verify);

/*
 * Probe whether vsnprintf() advances the caller's va_list.
 *
 * The verifier relies on vsnprintf() consuming arguments from the very
 * va_list object it is handed, i.e. on va_list being passed by reference.
 * Some architectures (x86_32 for one) pass it by value, so the caller's
 * va_list stays where it was and the verifier would have to understand
 * every conversion vsnprintf() supports. The verifier is disabled there.
 *
 * Returns the int fetched from the va_list right after vsnprintf() has
 * formatted a single "%d" from it.
 */
static int test_can_verify_check(const char *fmt, ...)
{
        char scratch[16];
        va_list args;
        int next;

        va_start(args, fmt);
        vsnprintf(scratch, sizeof(scratch), "%d", args);
        next = va_arg(args, int);
        va_end(args);

        return next;
}

/*
 * Run the va_list probe with the arguments 0 and 1. Getting 0 back means
 * vsnprintf() did not advance our va_list, so the event string verifier
 * is switched off.
 */
static void test_can_verify(void)
{
        if (test_can_verify_check("%d %d", 0, 1))
                return;

        pr_info("trace event string verifier disabled\n");
        static_branch_inc(&trace_no_verify);
}

/**
 * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer
 * @iter: The iterator that holds the seq buffer and the event being printed
 * @fmt: The format used to print the event
 * @ap: The va_list holding the data to print from @fmt.
 *
 * This writes the data into the @iter->seq buffer using the data from
 * @fmt and @ap. If the format has a %s, then the source of the string
 * is examined to make sure it is safe to print, otherwise it will
 * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string
 * pointer.
 *
 * The format is processed piecewise: everything in front of a %s is
 * copied into @iter->fmt and printed with @ap, then the string argument
 * is pulled from @ap, checked and printed on its own. The checking is
 * skipped entirely when the verifier is disabled or when @iter uses the
 * static format buffer.
 */
void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
                         va_list ap)
{
        const char *p = fmt;
        const char *str;
        int i, j;

        if (WARN_ON_ONCE(!fmt))
                return;

        if (static_branch_unlikely(&trace_no_verify))
                goto print;

        /* Don't bother checking when doing a ftrace_dump() */
        if (iter->fmt == static_fmt_buf)
                goto print;

        while (*p) {
                bool star = false;
                int len = 0;

                j = 0;

                /*
                 * We only care about %s and variants
                 *
                 * When the loop breaks out, p[i] is the '%' of such a
                 * conversion and p[i+j] is its 's'.
                 */
                for (i = 0; p[i]; i++) {
                        if (i + 1 >= iter->fmt_size) {
                                /*
                                 * If we can't expand the copy buffer,
                                 * just print it.
                                 */
                                if (!trace_iter_expand_format(iter))
                                        goto print;
                        }

                        /* Step over a backslash together with what follows it */
                        if (p[i] == '\\' && p[i+1]) {
                                i++;
                                continue;
                        }
                        if (p[i] == '%') {
                                /* Need to test cases like %08.*s */
                                for (j = 1; p[i+j]; j++) {
                                        if (isdigit(p[i+j]) ||
                                            p[i+j] == '.')
                                                continue;
                                        if (p[i+j] == '*') {
                                                star = true;
                                                continue;
                                        }
                                        break;
                                }
                                if (p[i+j] == 's')
                                        break;
                                star = false;
                        }
                        j = 0;
                }
                /* If no %s found then just print normally */
                if (!p[i])
                        break;

                /* Copy up to the %s, and print that */
                strncpy(iter->fmt, p, i);
                iter->fmt[i] = '\0';
                trace_seq_vprintf(&iter->seq, iter->fmt, ap);

                /*
                 * If iter->seq is full, the above call no longer guarantees
                 * that ap is in sync with fmt processing, and further calls
                 * to va_arg() can return wrong positional arguments.
                 *
                 * Ensure that ap is no longer used in this case.
                 */
                if (iter->seq.full) {
                        p = "";
                        break;
                }

                /* A '*' takes its int argument ahead of the string pointer */
                if (star)
                        len = va_arg(ap, int);

                /* The ap now points to the string data of the %s */
                str = va_arg(ap, const char *);

                /*
                 * If you hit this warning, it is likely that the
                 * trace event in question used %s on a string that
                 * was saved at the time of the event, but may not be
                 * around when the trace is read. Use __string(),
                 * __assign_str() and __get_str() helpers in the TRACE_EVENT()
                 * instead. See samples/trace_events/trace-events-sample.h
                 * for reference.
                 */
                if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
                              "fmt: '%s' current_buffer: '%s'",
                              fmt, seq_buf_str(&iter->seq.seq))) {
                        int ret;

                        /* Try to safely read the string */
                        if (star) {
                                /* Clamp len so the copy and its NUL fit in iter->fmt */
                                if (len + 1 > iter->fmt_size)
                                        len = iter->fmt_size - 1;
                                if (len < 0)
                                        len = 0;
                                ret = copy_from_kernel_nofault(iter->fmt, str, len);
                                iter->fmt[len] = 0;
                                star = false;
                        } else {
                                ret = strncpy_from_kernel_nofault(iter->fmt, str,
                                                                  iter->fmt_size);
                        }
                        /* Show the address, plus the content if it could be read */
                        if (ret < 0)
                                trace_seq_printf(&iter->seq, "(0x%px)", str);
                        else
                                trace_seq_printf(&iter->seq, "(0x%px:%s)",
                                                 str, iter->fmt);
                        str = "[UNSAFE-MEMORY]";
                        strcpy(iter->fmt, "%s");
                } else {
                        /* Safe: reuse the original conversion, '%' through 's' */
                        strncpy(iter->fmt, p + i, j + 1);
                        iter->fmt[j+1] = '\0';
                }
                if (star)
                        trace_seq_printf(&iter->seq, iter->fmt, len, str);
                else
                        trace_seq_printf(&iter->seq, iter->fmt, str);

                /* Continue right behind the 's' of the conversion just handled */
                p += i + j + 1;
        }
 print:
        /* Whatever is left has no %s that gets checked; print it in one go */
        if (*p)
                trace_seq_vprintf(&iter->seq, p, ap);
}

/**
 * trace_event_format - rewrite bare %p conversions of an event format as %px
 * @iter: iterator whose ->fmt scratch buffer receives the rewritten format
 * @fmt: the print format of the event
 *
 * When @iter has no trace_array attached, or its trace_array has
 * TRACE_ITER_HASH_PTR set, @fmt is returned untouched. Otherwise @fmt is
 * copied into @iter->fmt with every "%p" that is not followed by an
 * alphanumeric character turned into "%px"; "%%" is copied through as is.
 *
 * Return: the rewritten format held in @iter->fmt, or @fmt itself when no
 * rewriting is done (see above), @fmt is empty, or the scratch buffer could
 * not be grown.
 */
const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
{
        const char *p, *new_fmt;
        char *q;

        if (WARN_ON_ONCE(!fmt))
                return fmt;

        if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR)
                return fmt;

        /*
         * Nothing to rewrite. This also avoids storing the terminating NUL
         * below into a scratch buffer that may not have been sized yet.
         */
        if (!*fmt)
                return fmt;

        p = fmt;
        new_fmt = q = iter->fmt;
        while (*p) {
                /*
                 * One pass of this loop emits at most three characters
                 * ("%px"), and the terminating NUL is stored after the loop
                 * without any further check. Keep four bytes available so
                 * that the NUL can never land one past the buffer.
                 */
                if (unlikely(q - new_fmt + 4 > iter->fmt_size)) {
                        if (!trace_iter_expand_format(iter))
                                return fmt;

                        /* The buffer may have moved: rebase the write cursor */
                        q += iter->fmt - new_fmt;
                        new_fmt = iter->fmt;
                }

                *q++ = *p++;

                /* Replace %p with %px */
                if (p[-1] == '%') {
                        if (p[0] == '%') {
                                /* "%%": copy the pair so the second '%' is not re-parsed */
                                *q++ = *p++;
                        } else if (p[0] == 'p' && !isalnum(p[1])) {
                                *q++ = *p++;
                                *q++ = 'x';
                        }
                }
        }
        *q = '\0';

        return new_fmt;
}

#define STATIC_TEMP_BUF_SIZE        128
static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4);

/*
 * Find the next real entry, without updating the iterator itself.
 *
 * The current entry is first copied into iter->temp, because peeking at
 * the ring buffer may invalidate what iter->ent points at. Returns NULL
 * when there is no next entry or the current one could not be saved.
 */
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
                                          int *ent_cpu, u64 *ent_ts)
{
        /* __find_next_entry will reset ent_size */
        int saved_size = iter->ent_size;
        struct trace_entry *next;

        /*
         * From ftrace_dump() iter->temp is static_temp_buf rather than
         * kmalloc'ed memory, so an entry bigger than that buffer can not
         * be saved and NULL is returned. The only user is the marker that
         * gets added when two consecutive events have time stamps far
         * apart, see trace_print_lat_context().
         */
        if (iter->temp == static_temp_buf && saved_size > STATIC_TEMP_BUF_SIZE)
                return NULL;

        /*
         * __find_next_entry() may reach ring_buffer_peek() through
         * peek_next_entry(), after which the contents of iter->ent are
         * undefined. Take the copy of iter->ent now.
         */
        if (iter->ent && iter->ent != iter->temp) {
                if ((!iter->temp || iter->temp_size < iter->ent_size) &&
                    !WARN_ON_ONCE(iter->temp == static_temp_buf)) {
                        void *copy = kmalloc(iter->ent_size, GFP_KERNEL);

                        if (!copy)
                                return NULL;

                        kfree(iter->temp);
                        iter->temp_size = iter->ent_size;
                        iter->temp = copy;
                }
                memcpy(iter->temp, iter->ent, iter->ent_size);
                iter->ent = iter->temp;
        }

        next = __find_next_entry(iter, ent_cpu, NULL, ent_ts);

        /* Put back the original ent_size */
        iter->ent_size = saved_size;

        return next;
}

/*
 * Find the next real entry, and increment the iterator to the next entry.
 *
 * Stores the entry in iter->ent (filling iter->cpu, iter->lost_events and
 * iter->ts along the way). Returns @iter when an entry was found and NULL
 * otherwise.
 */
void *trace_find_next_entry_inc(struct trace_iterator *iter)
{
        struct trace_entry *next;

        next = __find_next_entry(iter, &iter->cpu,
                                 &iter->lost_events, &iter->ts);
        iter->ent = next;

        if (next)
                trace_iterator_increment(iter);

        if (!iter->ent)
                return NULL;

        return iter;
}

/*
 * Call ring_buffer_consume() on the buffer of iter->cpu, passing
 * iter->ts and iter->lost_events as its output arguments.
 */
static void trace_consume(struct trace_iterator *iter)
{
        ring_buffer_consume(iter->array_buffer->buffer,
                            iter->cpu,
                            &iter->ts,
                            &iter->lost_events);
}

/*
 * seq_file ->next callback: bump *@pos and walk the iterator forward
 * until its index reaches the position that was requested on entry.
 *
 * Returns the iterator (as the seq_file cursor) while entries remain,
 * or NULL when the trace is exhausted or the request would move
 * backwards.
 */
static void *s_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct trace_iterator *iter = m->private;
        int target = (int)*pos;
        void *cursor;

        WARN_ON_ONCE(iter->leftover);

        (*pos)++;

        /* can't go backwards */
        if (iter->idx > target)
                return NULL;

        /* A negative index means no entry has been fetched yet */
        cursor = (iter->idx < 0) ? trace_find_next_entry_inc(iter) : iter;

        while (cursor && iter->idx < target)
                cursor = trace_find_next_entry_inc(iter);

        iter->pos = *pos;

        return cursor;
}

void tracing_iter_reset(struct trace_iterator *iter, int cpu)
{
        struct ring_buffer_iter *buf_iter;
        unsigned long entries = 0;
        u64 ts;

        per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = 0;

        buf_iter = trace_buffer_iter(iter, cpu);
        if (!buf_iter)
                return;

        ring_buffer_iter_reset(buf_iter);

        /*
         * We could have the case with the max latency tracers
         * that a reset never took place on a cpu. This is evident
         * by the timestamp being before the start of the buffer.
         */
        while (ring_buffer_iter_peek(buf_iter, &ts)) {
                if (ts >= iter->array_buffer->time_start)
                        break;
                entries++;
                ring_buffer_iter_advance(buf_iter);
        }

        per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries;
}

/*
 * seq_file ->start callback for the trace output.
 *
 * The current tracer is copied to avoid a global locking
 * all around.
 *
 * Positions the iterator at *@pos and returns it as the seq_file cursor
 * (NULL when there is nothing at that position). On that path the
 * function returns with trace_event_read_lock() and
 * trace_access_lock(cpu_file) held; s_stop() drops them.
 *
 * With CONFIG_TRACER_MAX_TRACE, returns ERR_PTR(-EBUSY) - without taking
 * those locks - when this is a snapshot iterator and the tracer has
 * use_max_tr set.
 */
static void *s_start(struct seq_file *m, loff_t *pos)
{
        struct trace_iterator *iter = m->private;
        struct trace_array *tr = iter->tr;
        int cpu_file = iter->cpu_file;
        void *p = NULL;
        loff_t l = 0;
        int cpu;

        /* Follow a tracer switch that happened since the last read */
        mutex_lock(&trace_types_lock);
        if (unlikely(tr->current_trace != iter->trace)) {
                /* Close iter->trace before switching to the new current tracer */
                if (iter->trace->close)
                        iter->trace->close(iter);
                iter->trace = tr->current_trace;
                /* Reopen the new current tracer */
                if (iter->trace->open)
                        iter->trace->open(iter);
        }
        mutex_unlock(&trace_types_lock);

#ifdef CONFIG_TRACER_MAX_TRACE
        if (iter->snapshot && iter->trace->use_max_tr)
                return ERR_PTR(-EBUSY);
#endif

        if (*pos != iter->pos) {
                /* Not a continuation of the previous read: start over */
                iter->ent = NULL;
                iter->cpu = 0;
                iter->idx = -1;

                if (cpu_file == RING_BUFFER_ALL_CPUS) {
                        for_each_tracing_cpu(cpu)
                                tracing_iter_reset(iter, cpu);
                } else
                        tracing_iter_reset(iter, cpu_file);

                iter->leftover = 0;
                /* Walk forward from the beginning until *pos is reached */
                for (p = iter; p && l < *pos; p = s_next(m, p, &l))
                        ;

        } else {
                /*
                 * If we overflowed the seq_file before, then we want
                 * to just reuse the trace_seq buffer again.
                 */
                if (iter->leftover)
                        p = iter;
                else {
                        /* s_next() increments l back to *pos */
                        l = *pos - 1;
                        p = s_next(m, p, &l);
                }
        }

        trace_event_read_lock();
        trace_access_lock(cpu_file);
        return p;
}

/*
 * seq_file ->stop callback: release the locks taken at the end of
 * s_start(), in the reverse order of their acquisition.
 */
static void s_stop(struct seq_file *m, void *p)
{
        struct trace_iterator *iter = m->private;

#ifdef CONFIG_TRACER_MAX_TRACE
        /* Same condition under which s_start() returns before locking */
        if (iter->snapshot && iter->trace->use_max_tr)
                return;
#endif

        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();
}

/*
 * Compute the entry counts of @cpu's buffer in @buf.
 *
 * @entries receives the number of entries in the buffer, minus any
 * skipped entries. @total receives the same value when entries were
 * skipped, and otherwise the entry count plus the overrun count.
 */
static void
get_total_entries_cpu(struct array_buffer *buf, unsigned long *total,
                      unsigned long *entries, int cpu)
{
        unsigned long in_buffer = ring_buffer_entries_cpu(buf->buffer, cpu);
        unsigned long skipped = per_cpu_ptr(buf->data, cpu)->skipped_entries;

        if (skipped) {
                /*
                 * If this buffer has skipped entries, then we hold all
                 * entries for the trace and we need to ignore the
                 * ones before the time stamp. The total is then the
                 * same as the entries.
                 */
                in_buffer -= skipped;
                *total = in_buffer;
        } else {
                *total = in_buffer +
                         ring_buffer_overrun_cpu(buf->buffer, cpu);
        }

        *entries = in_buffer;
}

/*
 * Sum the per-cpu results of get_total_entries_cpu() over every
 * tracing CPU into @total and @entries.
 */
static void
get_total_entries(struct array_buffer *buf,
                  unsigned long *total, unsigned long *entries)
{
        unsigned long cpu_total, cpu_entries;
        int cpu;

        *entries = 0;
        *total = 0;

        for_each_tracing_cpu(cpu) {
                get_total_entries_cpu(buf, &cpu_total, &cpu_entries, cpu);
                *total += cpu_total;
                *entries += cpu_entries;
        }
}

/*
 * Return the entry count that get_total_entries_cpu() reports for @cpu
 * in the array buffer of @tr. A NULL @tr selects global_trace.
 */
unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu)
{
        struct trace_array *target = tr ? tr : &global_trace;
        unsigned long entries;
        unsigned long total;

        get_total_entries_cpu(&target->array_buffer, &total, &entries, cpu);

        return entries;
}

/*
 * Return the entry count that get_total_entries() reports for the
 * array buffer of @tr. A NULL @tr selects global_trace.
 */
unsigned long trace_total_entries(struct trace_array *tr)
{
        struct trace_array *target = tr ? tr : &global_trace;
        unsigned long entries;
        unsigned long total;

        get_total_entries(&target->array_buffer, &total, &entries);

        return entries;
}

/* Emit the column legend that precedes latency-format trace output. */
static void print_lat_help_header(struct seq_file *m)
{
        static const char legend[] =
                "#                    _------=> CPU#            \n"
                "#                   / _-----=> irqs-off/BH-disabled\n"
                "#                  | / _----=> need-resched    \n"
                "#                  || / _---=> hardirq/softirq \n"
                "#                  ||| / _--=> preempt-depth   \n"
                "#                  |||| / _-=> migrate-disable \n"
                "#                  ||||| /     delay           \n"
                "#  cmd     pid     |||||| time  |   caller     \n"
                "#     \\   /        ||||||  \\    |    /       \n";

        seq_puts(m, legend);
}

/*
 * Print the "entries-in-buffer/entries-written" summary line for @buf,
 * together with the number of online CPUs.
 */
static void print_event_info(struct array_buffer *buf, struct seq_file *m)
{
        unsigned long written;
        unsigned long in_buffer;

        get_total_entries(buf, &written, &in_buffer);

        seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu   #P:%d\n",
                   in_buffer, written, num_online_cpus());
        seq_puts(m, "#\n");
}

/*
 * Print the event summary and the two-line column header of the default
 * output format. A TGID column is added when TRACE_ITER_RECORD_TGID is
 * set in @flags.
 */
static void print_func_help_header(struct array_buffer *buf, struct seq_file *m,
                                   unsigned int flags)
{
        bool tgid = flags & TRACE_ITER_RECORD_TGID;
        const char *tgid_title = "";
        const char *tgid_ruler = "";

        if (tgid) {
                tgid_title = "   TGID   ";
                tgid_ruler = "     |    ";
        }

        print_event_info(buf, m);

        seq_printf(m, "#           TASK-PID    %s CPU#     TIMESTAMP  FUNCTION\n", tgid_title);
        seq_printf(m, "#              | |      %s   |         |         |\n", tgid_ruler);
}

/*
 * Print the event summary and the column header that includes the
 * irq/preempt flag legend. The header is widened to make room for a
 * TGID column when TRACE_ITER_RECORD_TGID is set in @flags.
 */
static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m,
                                       unsigned int flags)
{
        static const char pad[] = "            ";
        bool tgid = flags & TRACE_ITER_RECORD_TGID;
        int width;

        /* Number of characters taken from the padding / TGID strings */
        if (tgid)
                width = 12;
        else
                width = 2;

        print_event_info(buf, m);

        seq_printf(m, "#                            %.*s  _-----=> irqs-off/BH-disabled\n", width, pad);
        seq_printf(m, "#                            %.*s / _----=> need-resched\n", width, pad);
        seq_printf(m, "#                            %.*s| / _---=> hardirq/softirq\n", width, pad);
        seq_printf(m, "#                            %.*s|| / _--=> preempt-depth\n", width, pad);
        seq_printf(m, "#                            %.*s||| / _-=> migrate-disable\n", width, pad);
        seq_printf(m, "#                            %.*s|||| /     delay\n", width, pad);
        seq_printf(m, "#           TASK-PID  %.*s CPU#  |||||  TIMESTAMP  FUNCTION\n", width, "     TGID   ");
        seq_printf(m, "#              | |    %.*s   |   |||||     |         |\n", width, "       |    ");
}

/*
 * Print the banner of a latency-format trace to @m: tracer name and
 * kernel release, the saved latency with entry counts and preemption
 * model, the task information recorded in the per-cpu data of
 * buf->cpu and, when a critical section start was recorded, the symbols
 * where it started and ended.
 */
void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
        /* Note: the symbol flags are taken from global_trace, not iter->tr */
        unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK);
        struct array_buffer *buf = iter->array_buffer;
        struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
        struct tracer *type = iter->trace;
        unsigned long entries;
        unsigned long total;
        const char *name = type->name;

        get_total_entries(buf, &total, &entries);

        seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
                   name, init_utsname()->release);
        seq_puts(m, "# -----------------------------------"
                 "---------------------------------\n");
        seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |"
                   " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
                   nsecs_to_usecs(data->saved_latency),
                   entries,
                   total,
                   buf->cpu,
                   preempt_model_none()      ? "server" :
                   preempt_model_voluntary() ? "desktop" :
                   preempt_model_full()      ? "preempt" :
                   preempt_model_rt()        ? "preempt_rt" :
                   "unknown",
                   /* These are reserved for later use */
                   0, 0, 0, 0);
        /* Close the parenthesis opened above, with the CPU count on SMP */
#ifdef CONFIG_SMP
        seq_printf(m, " #P:%d)\n", num_online_cpus());
#else
        seq_puts(m, ")\n");
#endif
        seq_puts(m, "#    -----------------\n");
        seq_printf(m, "#    | task: %.16s-%d "
                   "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
                   data->comm, data->pid,
                   from_kuid_munged(seq_user_ns(m), data->uid), data->nice,
                   data->policy, data->rt_priority);
        seq_puts(m, "#    -----------------\n");

        if (data->critical_start) {
                /*
                 * The symbols are formatted into iter->seq and then
                 * handed to the seq_file with trace_print_seq().
                 */
                seq_puts(m, "#  => started at: ");
                seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
                trace_print_seq(m, &iter->seq);
                seq_puts(m, "\n#  => ended at:   ");
                seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
                trace_print_seq(m, &iter->seq);
                seq_puts(m, "\n#\n");
        }

        seq_puts(m, "#\n");
}

static void test_cpu_buff_start(struct trace_iterator *iter)
{
        struct trace_seq *s = &iter->seq;
        struct trace_array *tr = iter->tr;

        if (!(tr->trace_flags & TRACE_ITER_ANNOTATE))
                return;

        if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
                return;

        if (cpumask_available(iter->started) &&
            cpumask_test_cpu(iter->cpu, iter->started))
                return;

        if (per_cpu_ptr(iter->array_buffer->data, iter->cpu)->skipped_entries)
                return;

        if (cpumask_available(iter->started))
                cpumask_set_cpu(iter->cpu, iter->started);

        /* Don't print started cpu buffer for the first entry of the trace */
        if (iter->idx > 1)
                trace_seq_printf(s, "##### CPU %u buffer started ####\n",
                                iter->cpu);
}

/*
 * Print the current entry in the default text format: optional context
 * columns followed by the output of the event's trace() callback, or of
 * print_event_fields() when TRACE_ITER_FIELDS is set. Entries of an
 * unregistered type are printed as "Unknown type".
 */
static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
{
        struct trace_array *tr = iter->tr;
        struct trace_seq *s = &iter->seq;
        struct trace_entry *entry = iter->ent;
        unsigned long sym_flags = tr->trace_flags & TRACE_ITER_SYM_MASK;
        struct trace_event *event;

        test_cpu_buff_start(iter);

        event = ftrace_find_event(entry->type);

        if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
                if (iter->iter_flags & TRACE_FILE_LAT_FMT)
                        trace_print_lat_context(iter);
                else
                        trace_print_context(iter);
        }

        if (trace_seq_has_overflowed(s))
                return TRACE_TYPE_PARTIAL_LINE;

        if (!event) {
                trace_seq_printf(s, "Unknown type %d\n", entry->type);
                return trace_handle_return(s);
        }

        if (tr->trace_flags & TRACE_ITER_FIELDS)
                return print_event_fields(iter, event);

        return event->funcs->trace(iter, sym_flags, event);
}

/*
 * Print the current entry in raw format: optional "pid cpu timestamp"
 * prefix followed by the output of the event's raw() callback. Entries
 * of an unregistered type are printed as "<type> ?".
 */
static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
{
        struct trace_seq *s = &iter->seq;
        struct trace_entry *entry = iter->ent;
        struct trace_event *event;

        if (iter->tr->trace_flags & TRACE_ITER_CONTEXT_INFO)
                trace_seq_printf(s, "%d %d %llu ",
                                 entry->pid, iter->cpu, iter->ts);

        if (trace_seq_has_overflowed(s))
                return TRACE_TYPE_PARTIAL_LINE;

        event = ftrace_find_event(entry->type);
        if (!event) {
                trace_seq_printf(s, "%d ?\n", entry->type);
                return trace_handle_return(s);
        }

        return event->funcs->raw(iter, 0, event);
}

static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
{
        struct trace_array *tr = iter->tr;
        struct trace_seq *s = &iter->seq;
        unsigned char newline = '\n';
        struct trace_entry *entry;
        struct trace_event *event;

        entry = iter->ent;

        if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
                SEQ_PUT_HEX_FIELD(s, entry->pid);
                SEQ_PUT_HEX_FIELD(s, iter->cpu);
                SEQ_PUT_HEX_FIELD(s, iter->ts);
                if (trace_seq_has_overflowed(s))
                        return TRACE_TYPE_PARTIAL_LINE;
        }

        event = ftrace_find_event(entry->type);
        if (event) {
                enum print_line_t ret = event->funcs->hex(iter, 0, event);
                if (ret != TRACE_TYPE_HANDLED)
                        return ret;
        }

        SEQ_PUT_FIELD(s, newline);

        return trace_handle_return(s);
}

/*
 * Print the current entry in binary format: optional raw pid, cpu and
 * timestamp fields followed by the output of the event's binary()
 * callback. An unregistered type is reported as handled.
 */
static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
{
        struct trace_seq *s = &iter->seq;
        struct trace_entry *entry = iter->ent;
        struct trace_event *event;

        if (iter->tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
                SEQ_PUT_FIELD(s, entry->pid);
                SEQ_PUT_FIELD(s, iter->cpu);
                SEQ_PUT_FIELD(s, iter->ts);
                if (trace_seq_has_overflowed(s))
                        return TRACE_TYPE_PARTIAL_LINE;
        }

        event = ftrace_find_event(entry->type);
        if (!event)
                return TRACE_TYPE_HANDLED;

        return event->funcs->binary(iter, 0, event);
}

int trace_empty(struct trace_iterator *iter)
{
        struct ring_buffer_iter *buf_iter;
        int cpu;

        /* If we are looking at one CPU buffer, only check that one */
        if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
                cpu = iter->cpu_file;
                buf_iter = trace_buffer_iter(iter, cpu);
                if (buf_iter) {
                        if (!ring_buffer_iter_empty(buf_iter))
                                return 0;
                } else {
                        if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu))
                                return 0;
                }
                return 1;
        }

        for_each_tracing_cpu(cpu) {
                buf_iter = trace_buffer_iter(iter, cpu);
                if (buf_iter) {
                        if (!ring_buffer_iter_empty(buf_iter))
                                return 0;
                } else {
                        if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu))
                                return 0;
                }
        }

        return 1;
}

/*  Called with trace_event_read_lock() held. */
enum print_line_t print_trace_line(struct trace_iterator *iter)
{
        struct trace_array *tr = iter->tr;
        unsigned long trace_flags = tr->trace_flags;
        enum print_line_t ret;

        if (iter->lost_events) {
                if (iter->lost_events == (unsigned long)-1)
                        trace_seq_printf(&iter->seq, "CPU:%d [LOST EVENTS]\n",
                                         iter->cpu);
                else
                        trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
                                         iter->cpu, iter->lost_events);
                if (trace_seq_has_overflowed(&iter->seq))
                        return TRACE_TYPE_PARTIAL_LINE;
        }

        if (iter->trace && iter->trace->print_line) {
                ret = iter->trace->print_line(iter);
                if (ret != TRACE_TYPE_UNHANDLED)
                        return ret;
        }

        if (iter->ent->type == TRACE_BPUTS &&
                        trace_flags & TRACE_ITER_PRINTK &&
                        trace_flags & TRACE_ITER_PRINTK_MSGONLY)
                return trace_print_bputs_msg_only(iter);

        if (iter->ent->type == TRACE_BPRINT &&
                        trace_flags & TRACE_ITER_PRINTK &&
                        trace_flags & TRACE_ITER_PRINTK_MSGONLY)
                return trace_print_bprintk_msg_only(iter);

        if (iter->ent->type == TRACE_PRINT &&
                        trace_flags & TRACE_ITER_PRINTK &&
                        trace_flags & TRACE_ITER_PRINTK_MSGONLY)
                return trace_print_printk_msg_only(iter);

        if (trace_flags & TRACE_ITER_BIN)
                return print_bin_fmt(iter);

        if (trace_flags & TRACE_ITER_HEX)
                return print_hex_fmt(iter);

        if (trace_flags & TRACE_ITER_RAW)
                return print_raw_fmt(iter);

        return print_trace_fmt(iter);
}

void trace_latency_header(struct seq_file *m)
{
        struct trace_iterator *iter = m->private;
        struct trace_array *tr = iter->tr;

        /* print nothing if the buffers are empty */
        if (trace_empty(iter))
                return;

        if (iter->iter_flags & TRACE_FILE_LAT_FMT)
                print_trace_header(m, iter);

        if (!(tr->trace_flags & TRACE_ITER_VERBOSE))
                print_lat_help_header(m);
}

/*
 * Print the header used when the tracer supplies no print_header()
 * callback. Nothing is printed without TRACE_ITER_CONTEXT_INFO. In
 * latency format the trace banner is printed (unless the buffers are
 * empty); the column legends are skipped for verbose output.
 */
void trace_default_header(struct seq_file *m)
{
        struct trace_iterator *iter = m->private;
        struct trace_array *tr = iter->tr;
        unsigned long trace_flags = tr->trace_flags;

        if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
                return;

        if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
                /* print nothing if the buffers are empty */
                if (trace_empty(iter))
                        return;

                print_trace_header(m, iter);
                if (!(trace_flags & TRACE_ITER_VERBOSE))
                        print_lat_help_header(m);
                return;
        }

        if (trace_flags & TRACE_ITER_VERBOSE)
                return;

        if (trace_flags & TRACE_ITER_IRQ_INFO)
                print_func_help_header_irq(iter->array_buffer,
                                           m, trace_flags);
        else
                print_func_help_header(iter->array_buffer, m,
                                       trace_flags);
}

/* Print a warning banner to @m when ftrace_is_dead() reports true. */
static void test_ftrace_alive(struct seq_file *m)
{
        if (ftrace_is_dead())
                seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"
                            "#          MAY BE MISSING FUNCTION EVENTS\n");
}

#ifdef CONFIG_TRACER_MAX_TRACE
/* Print the usage text of the top-level (all CPUs) snapshot file. */
static void show_snapshot_main_help(struct seq_file *m)
{
        static const char usage[] =
                "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
                "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
                "#                      Takes a snapshot of the main buffer.\n"
                "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
                "#                      (Doesn't have to be '2' works with any number that\n"
                "#                       is not a '0' or '1')\n";

        seq_puts(m, usage);
}

/*
 * Print the usage text of a per-cpu snapshot file. The description of
 * "echo 1" depends on CONFIG_RING_BUFFER_ALLOW_SWAP.
 */
static void show_snapshot_percpu_help(struct seq_file *m)
{
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
        static const char take_usage[] =
                "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
                "#                      Takes a snapshot of the main buffer for this cpu.\n";
#else
        static const char take_usage[] =
                "# echo 1 > snapshot : Not supported with this kernel.\n"
                "#                     Must use main snapshot file to allocate.\n";
#endif
        static const char clear_usage[] =
                "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
                "#                      (Doesn't have to be '2' works with any number that\n"
                "#                       is not a '0' or '1')\n";

        seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
        seq_puts(m, take_usage);
        seq_puts(m, clear_usage);
}

/*
 * Print the snapshot state (allocated or freed) followed by the usage
 * text matching the kind of snapshot file that @iter was opened on.
 */
static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
{
        const char *state;

        if (iter->tr->allocated_snapshot)
                state = "#\n# * Snapshot is allocated *\n#\n";
        else
                state = "#\n# * Snapshot is freed *\n#\n";
        seq_puts(m, state);

        seq_puts(m, "# Snapshot commands:\n");

        if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
                show_snapshot_percpu_help(m);
                return;
        }

        show_snapshot_main_help(m);
}
#else
/*
 * Empty stand-in used when CONFIG_TRACER_MAX_TRACE is not set, so that
 * callers need no #ifdef of their own. Should never be called.
 */
static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
#endif

/*
 * seq_file ->show callback.
 *
 * With no current entry the header is printed. Otherwise the current
 * entry is formatted into iter->seq - unless output left over from a
 * previous call is still pending there - and iter->seq is copied to
 * the seq_file. Always returns 0.
 */
static int s_show(struct seq_file *m, void *v)
{
        struct trace_iterator *iter = v;
        int ret;

        if (!iter->ent) {
                if (iter->tr) {
                        seq_printf(m, "# tracer: %s\n", iter->trace->name);
                        seq_puts(m, "#\n");
                        test_ftrace_alive(m);
                }

                if (iter->snapshot && trace_empty(iter))
                        print_snapshot_help(m, iter);
                else if (iter->trace && iter->trace->print_header)
                        iter->trace->print_header(m);
                else
                        trace_default_header(m);

                return 0;
        }

        /*
         * If we filled the seq_file buffer earlier, iter->seq still
         * holds that line and we want to just show it now. Only
         * format a new line when nothing is left over.
         */
        if (!iter->leftover) {
                ret = print_trace_line(iter);
                if (ret == TRACE_TYPE_PARTIAL_LINE) {
                        iter->seq.full = 0;
                        trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
                }
        }

        /*
         * If we overflow the seq_file buffer, then it will
         * ask us for this data again at start up.
         * Use that instead.
         *  The result is 0 if seq_file write succeeded.
         *        -1 otherwise.
         * For a replayed leftover it should be zero, but you never know.
         */
        iter->leftover = trace_print_seq(m, &iter->seq);

        return 0;
}

/*
 * Decode the CPU number that is stored, offset by one, in
 * inode->i_cdev. A NULL i_cdev selects RING_BUFFER_ALL_CPUS.
 *
 * Should be used after trace_array_get(), trace_types_lock
 * ensures that i_cdev was already initialized.
 */
static inline int tracing_get_cpu(struct inode *inode)
{
        /* See trace_create_cpu_file() */
        if (!inode->i_cdev)
                return RING_BUFFER_ALL_CPUS;

        return (long)inode->i_cdev - 1;
}

/* seq_file callbacks installed by __tracing_open() */
static const struct seq_operations tracer_seq_ops = {
        .start = s_start,
        .stop  = s_stop,
        .next  = s_next,
        .show  = s_show,
};

/*
 * Release everything that hangs off @iter, but not @iter itself: the
 * iterator can be allocated and freed in different ways, so that part
 * is left to the caller. The only requirement on all those allocations
 * is that every field starts out zeroed (kzalloc), because freeing
 * works with either allocated content or NULL.
 */
static void free_trace_iter_content(struct trace_iterator *iter)
{
        mutex_destroy(&iter->mutex);
        free_cpumask_var(iter->started);

        kfree(iter->buffer_iter);
        kfree(iter->temp);

        /* The fmt is either NULL, allocated or points to static_fmt_buf */
        if (iter->fmt == static_fmt_buf)
                return;

        kfree(iter->fmt);
}

/*
 * Create and initialize the trace iterator behind a seq_file open.
 *
 * The iterator is allocated as the seq_file private data of @file, is
 * bound to the trace array found in inode->i_private and to its current
 * tracer, and gets one ring buffer iterator per CPU being read. With
 * @snapshot set (and CONFIG_TRACER_MAX_TRACE) the max buffer is read
 * instead of the main array buffer.
 *
 * Returns the iterator, ERR_PTR(-ENODEV) when tracing is disabled, or
 * ERR_PTR(-ENOMEM) when an allocation fails.
 */
static struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
        struct trace_array *tr = inode->i_private;
        struct trace_iterator *iter;
        int cpu;

        if (tracing_disabled)
                return ERR_PTR(-ENODEV);

        iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter));
        if (!iter)
                return ERR_PTR(-ENOMEM);

        iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter),
                                    GFP_KERNEL);
        if (!iter->buffer_iter)
                goto release;

        /*
         * trace_find_next_entry() may need to save off iter->ent.
         * It will place it into the iter->temp buffer. As most
         * events are less than 128, allocate a buffer of that size.
         * If one is greater, then trace_find_next_entry() will
         * allocate a new buffer to adjust for the bigger iter->ent.
         * It's not critical if it fails to get allocated here.
         */
        iter->temp = kmalloc(128, GFP_KERNEL);
        if (iter->temp)
                iter->temp_size = 128;

        /*
         * trace_event_printf() may need to modify given format
         * string to replace %p with %px so that it shows real address
         * instead of hash value. However, that is only for the event
         * tracing, other tracer may not need. Defer the allocation
         * until it is needed.
         */
        iter->fmt = NULL;
        iter->fmt_size = 0;

        /* Held until the iterator is fully set up (or setup has failed) */
        mutex_lock(&trace_types_lock);
        iter->trace = tr->current_trace;

        if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
                goto fail;

        iter->tr = tr;

#ifdef CONFIG_TRACER_MAX_TRACE
        /* Currently only the top directory has a snapshot */
        if (tr->current_trace->print_max || snapshot)
                iter->array_buffer = &tr->max_buffer;
        else
#endif
                iter->array_buffer = &tr->array_buffer;
        iter->snapshot = snapshot;
        /* s_start() compares *pos with iter->pos; -1 matches no position */
        iter->pos = -1;
        iter->cpu_file = tracing_get_cpu(inode);
        mutex_init(&iter->mutex);

        /* Notify the tracer early; before we stop tracing. */
        if (iter->trace->open)
                iter->trace->open(iter);

        /* Annotate start of buffers if we had overruns */
        if (ring_buffer_overruns(iter->array_buffer->buffer))
                iter->iter_flags |= TRACE_FILE_ANNOTATE;

        /* Output in nanoseconds only if we are using a clock in nanoseconds. */
        if (trace_clocks[tr->clock_id].in_ns)
                iter->iter_flags |= TRACE_FILE_TIME_IN_NS;

        /*
         * If pause-on-trace is enabled, then stop the trace while
         * dumping, unless this is the "snapshot" file
         */
        if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE))
                tracing_stop_tr(tr);

        /*
         * Prepare every per-cpu iterator first, synchronize once, and
         * only then start and reset them.
         */
        if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
                for_each_tracing_cpu(cpu) {
                        iter->buffer_iter[cpu] =
                                ring_buffer_read_prepare(iter->array_buffer->buffer,
                                                         cpu, GFP_KERNEL);
                }
                ring_buffer_read_prepare_sync();
                for_each_tracing_cpu(cpu) {
                        ring_buffer_read_start(iter->buffer_iter[cpu]);
                        tracing_iter_reset(iter, cpu);
                }
        } else {
                cpu = iter->cpu_file;
                iter->buffer_iter[cpu] =
                        ring_buffer_read_prepare(iter->array_buffer->buffer,
                                                 cpu, GFP_KERNEL);
                ring_buffer_read_prepare_sync();
                ring_buffer_read_start(iter->buffer_iter[cpu]);
                tracing_iter_reset(iter, cpu);
        }

        mutex_unlock(&trace_types_lock);

        return iter;

        /* Reached with trace_types_lock held */
 fail:
        mutex_unlock(&trace_types_lock);
        free_trace_iter_content(iter);
        /* Reached without the lock, before anything else was allocated */
release:
        seq_release_private(inode, file);
        return ERR_PTR(-ENOMEM);
}

/*
 * Generic open: after tracing_check_open_get_tr(NULL) succeeds, hand
 * inode->i_private to the file as its private data. Returns 0 or the
 * error reported by the check.
 */
int tracing_open_generic(struct inode *inode, struct file *filp)
{
        int err = tracing_check_open_get_tr(NULL);

        if (err)
                return err;

        filp->private_data = inode->i_private;

        return 0;
}

/* Report whether the tracing_disabled flag is set. */
bool tracing_is_disabled(void)
{
        if (tracing_disabled)
                return true;

        return false;
}

/*
 * Open and update trace_array ref count.
 * Must have the current trace_array passed to it (through
 * inode->i_private). Returns 0 or the error reported by
 * tracing_check_open_get_tr().
 */
int tracing_open_generic_tr(struct inode *inode, struct file *filp)
{
        struct trace_array *tr = inode->i_private;
        int err = tracing_check_open_get_tr(tr);

        if (err)
                return err;

        filp->private_data = inode->i_private;

        return 0;
}

/*
 * Open handler for inodes whose private pointer is a trace_event_file.
 * Updates the ref count of the trace_array the file belongs to and, unless
 * the file is already marked freed, takes a reference on the file itself.
 *
 * Returns 0 on success, -ENODEV if the event file is marked for removal,
 * or the error from tracing_check_open_get_tr().
 */
int tracing_open_file_tr(struct inode *inode, struct file *filp)
{
        struct trace_event_file *file = inode->i_private;
        int ret;

        ret = tracing_check_open_get_tr(file->tr);
        if (ret)
                return ret;

        mutex_lock(&event_mutex);

        /* Fail if the file is marked for removal */
        if (file->flags & EVENT_FILE_FL_FREED)
                ret = -ENODEV;

        if (ret)
                trace_array_put(file->tr);
        else
                event_file_get(file);

        mutex_unlock(&event_mutex);

        if (!ret)
                filp->private_data = inode->i_private;

        return ret;
}

/*
 * Release handler matching tracing_open_file_tr(): drops the trace_array
 * reference and the trace_event_file reference. Always returns 0.
 */
int tracing_release_file_tr(struct inode *inode, struct file *filp)
{
        struct trace_event_file *event_file = inode->i_private;

        trace_array_put(event_file->tr);
        event_file_put(event_file);
        return 0;
}

/*
 * Release handler that first drops the references via
 * tracing_release_file_tr() and then finishes with single_release(),
 * whose result is returned.
 */
int tracing_single_release_file_tr(struct inode *inode, struct file *filp)
{
        int ret;

        tracing_release_file_tr(inode, filp);
        ret = single_release(inode, filp);

        return ret;
}

/*
 * Open handler that calls stream_open() on the file (its return value is
 * not checked) and then performs the generic trace_array open.
 */
static int tracing_mark_open(struct inode *inode, struct file *filp)
{
        int ret;

        stream_open(inode, filp);
        ret = tracing_open_generic_tr(inode, filp);

        return ret;
}

/*
 * Release handler used by tracing_fops.
 *
 * A file opened without FMODE_READ only needs its trace_array reference
 * dropped. A reader additionally has a trace_iterator stored as the
 * seq_file private data; its per-CPU ring buffer iterators are finished,
 * the tracer's close() hook (if any) is called, and the iterator content
 * is freed. Always returns 0.
 */
static int tracing_release(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        struct seq_file *m = file->private_data;
        struct trace_iterator *iter;
        int cpu;

        /* Writes do not use seq_file */
        if (!(file->f_mode & FMODE_READ)) {
                trace_array_put(tr);
                return 0;
        }

        iter = m->private;

        mutex_lock(&trace_types_lock);

        for_each_tracing_cpu(cpu) {
                if (!iter->buffer_iter[cpu])
                        continue;
                ring_buffer_read_finish(iter->buffer_iter[cpu]);
        }

        if (iter->trace && iter->trace->close)
                iter->trace->close(iter);

        /* reenable tracing if it was previously enabled */
        if (tr->stop_count && !iter->snapshot)
                tracing_start_tr(tr);

        __trace_array_put(tr);

        mutex_unlock(&trace_types_lock);

        free_trace_iter_content(iter);
        seq_release_private(inode, file);

        return 0;
}

/*
 * Release handler that drops the reference on the trace_array stored in
 * the inode's private pointer. Always returns 0.
 */
int tracing_release_generic_tr(struct inode *inode, struct file *file)
{
        trace_array_put(inode->i_private);

        return 0;
}

/*
 * Release handler for single_open() based files: drops the trace_array
 * reference, then returns the result of single_release().
 */
static int tracing_single_release_tr(struct inode *inode, struct file *file)
{
        trace_array_put(inode->i_private);

        return single_release(inode, file);
}

/*
 * Open handler used by tracing_fops.
 *
 * Opening for write with O_TRUNC resets the buffer (one CPU or all online
 * CPUs, depending on what tracing_get_cpu() reports for the inode).
 * Opening for read sets up an iterator through __tracing_open().
 *
 * Returns 0 on success or a negative error; on a negative result the
 * trace_array reference taken at the start is dropped again.
 */
static int tracing_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        struct trace_iterator *iter;
        int ret;

        ret = tracing_check_open_get_tr(tr);
        if (ret)
                return ret;

        /* If this file was open for write, then erase contents */
        if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
                struct array_buffer *trace_buf = &tr->array_buffer;
                int cpu = tracing_get_cpu(inode);

#ifdef CONFIG_TRACER_MAX_TRACE
                if (tr->current_trace->print_max)
                        trace_buf = &tr->max_buffer;
#endif

                if (cpu != RING_BUFFER_ALL_CPUS)
                        tracing_reset_cpu(trace_buf, cpu);
                else
                        tracing_reset_online_cpus(trace_buf);
        }

        /* Nothing more to set up unless the file is readable */
        if (!(file->f_mode & FMODE_READ))
                return ret;

        iter = __tracing_open(inode, file, false);
        if (IS_ERR(iter)) {
                ret = PTR_ERR(iter);
                if (ret < 0)
                        trace_array_put(tr);
                return ret;
        }

        if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
                iter->iter_flags |= TRACE_FILE_LAT_FMT;

        return ret;
}

/*
 * Not every tracer is suitable for instance buffers.
 * A tracer may be used when it explicitly allows instances, or
 * unconditionally when the array is the global (toplevel) one.
 */
static bool
trace_ok_for_array(struct tracer *t, struct trace_array *tr)
{
        return t->allow_instances || (tr->flags & TRACE_ARRAY_FL_GLOBAL);
}

/*
 * Starting at @t, walk the ->next chain and return the first tracer that
 * @tr may use, or NULL when the chain ends without a match.
 */
static struct tracer *
get_tracer_for_array(struct trace_array *tr, struct tracer *t)
{
        for (; t; t = t->next) {
                if (trace_ok_for_array(t, tr))
                        break;
        }

        return t;
}

/*
 * seq_file ->next: bump the position and move on to the next tracer that
 * the trace_array in m->private may use. A NULL @v is passed back as is.
 */
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct trace_array *tr = m->private;
        struct tracer *cur = v;

        ++*pos;

        if (!cur)
                return cur;

        return get_tracer_for_array(tr, cur->next);
}

/*
 * seq_file ->start: take trace_types_lock (t_stop() releases it) and step
 * through the usable tracers until position *pos is reached or the list
 * runs out.
 */
static void *t_start(struct seq_file *m, loff_t *pos)
{
        struct trace_array *tr = m->private;
        struct tracer *t;
        loff_t idx = 0;

        mutex_lock(&trace_types_lock);

        t = get_tracer_for_array(tr, trace_types);
        while (t && idx < *pos)
                t = t_next(m, t, &idx);

        return t;
}

/* seq_file ->stop: release trace_types_lock, which t_start() acquired. */
static void t_stop(struct seq_file *m, void *p)
{
        mutex_unlock(&trace_types_lock);
}

/*
 * seq_file ->show: print one tracer name, followed by a space when the
 * tracer has a ->next entry and by a newline otherwise. Always returns 0.
 */
static int t_show(struct seq_file *m, void *v)
{
        struct tracer *t = v;

        if (t) {
                seq_puts(m, t->name);
                seq_putc(m, t->next ? ' ' : '\n');
        }

        return 0;
}

/* seq_file callbacks that list the tracers usable by a trace_array */
static const struct seq_operations show_traces_seq_ops = {
        .start = t_start,
        .stop = t_stop,
        .next = t_next,
        .show = t_show,
};

/*
 * Open handler for show_traces_fops: updates the trace_array ref count,
 * opens the seq_file and stores the trace_array as its private data.
 * The trace_array reference is dropped again if seq_open() fails.
 */
static int show_traces_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        int ret;

        ret = tracing_check_open_get_tr(tr);
        if (ret)
                return ret;

        ret = seq_open(file, &show_traces_seq_ops);
        if (ret) {
                trace_array_put(tr);
                return ret;
        }

        /* seq_open() left the seq_file in file->private_data */
        ((struct seq_file *)file->private_data)->private = tr;

        return 0;
}

/*
 * Release handler for show_traces_fops: drops the trace_array reference
 * and returns the result of seq_release().
 */
static int show_traces_release(struct inode *inode, struct file *file)
{
        trace_array_put(inode->i_private);

        return seq_release(inode, file);
}

/*
 * Write handler that ignores the data: @ubuf is never read and all
 * @count bytes are reported as written.
 */
static ssize_t
tracing_write_stub(struct file *filp, const char __user *ubuf,
                   size_t count, loff_t *ppos)
{
        return count;
}

/*
 * llseek handler shared by tracing files.
 *
 * Files opened for reading are seq_file based, so the seek is delegated
 * to seq_lseek(). For any other file the position is simply reset to 0.
 *
 * Returns the resulting offset, or the negative error from seq_lseek().
 */
loff_t tracing_lseek(struct file *file, loff_t offset, int whence)
{
        /*
         * Keep the result in a loff_t: seq_lseek() returns loff_t, and
         * storing it in an int would truncate offsets above INT_MAX.
         */
        loff_t ret;

        if (file->f_mode & FMODE_READ)
                ret = seq_lseek(file, offset, whence);
        else
                file->f_pos = ret = 0;

        return ret;
}

/* File operations built on tracing_open()/tracing_release() and seq_file reads */
static const struct file_operations tracing_fops = {
        .open = tracing_open,
        .release = tracing_release,
        .read = seq_read,
        .read_iter = seq_read_iter,
        .splice_read = copy_splice_read,
        .write = tracing_write_stub,
        .llseek = tracing_lseek,
};

/* Read-only seq_file operations backed by show_traces_seq_ops */
static const struct file_operations show_traces_fops = {
        .open = show_traces_open,
        .release = show_traces_release,
        .read = seq_read,
        .llseek = seq_lseek,
};

/*
 * Read handler that formats tr->tracing_cpumask with "%*pb" plus a
 * newline and copies it to user space.
 *
 * Returns the number of bytes copied, -ENOMEM if the temporary buffer
 * cannot be allocated, or -EINVAL if @count is not larger than the
 * formatted length.
 */
static ssize_t
tracing_cpumask_read(struct file *filp, char __user *ubuf,
                     size_t count, loff_t *ppos)
{
        struct trace_array *tr = file_inode(filp)->i_private;
        char *mask_str;
        int len;

        /* Dry run to learn the formatted length; +1 for the trailing NUL */
        len = 1 + snprintf(NULL, 0, "%*pb\n",
                           cpumask_pr_args(tr->tracing_cpumask));
        mask_str = kmalloc(len, GFP_KERNEL);
        if (!mask_str)
                return -ENOMEM;

        len = snprintf(mask_str, len, "%*pb\n",
                       cpumask_pr_args(tr->tracing_cpumask));
        if (len < count)
                count = simple_read_from_buffer(ubuf, count, ppos,
                                                mask_str, len);
        else
                count = -EINVAL;

        kfree(mask_str);

        return count;
}

/*
 * Install @tracing_cpumask_new as the tracing cpumask of @tr.
 *
 * With interrupts disabled and tr->max_lock held, every tracing CPU whose
 * bit differs between the old and the new mask gets its per-CPU disabled
 * counter adjusted and ring buffer recording switched accordingly. The
 * new mask is copied into tr->tracing_cpumask afterwards.
 *
 * Returns 0 on success or -EINVAL when @tr is NULL.
 */
int tracing_set_cpumask(struct trace_array *tr,
                        cpumask_var_t tracing_cpumask_new)
{
        int cpu;

        if (!tr)
                return -EINVAL;

        local_irq_disable();
        arch_spin_lock(&tr->max_lock);
        for_each_tracing_cpu(cpu) {
                int was_set = cpumask_test_cpu(cpu, tr->tracing_cpumask);
                int will_set = cpumask_test_cpu(cpu, tracing_cpumask_new);

                /*
                 * Increase/decrease the disabled counter if we are
                 * about to flip a bit in the cpumask:
                 */
                if (was_set && !will_set) {
                        atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
                        ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
#ifdef CONFIG_TRACER_MAX_TRACE
                        ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu);
#endif
                } else if (!was_set && will_set) {
                        atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
                        ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
#ifdef CONFIG_TRACER_MAX_TRACE
                        ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu);
#endif
                }
        }
        arch_spin_unlock(&tr->max_lock);
        local_irq_enable();

        cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);

        return 0;
}

/*
 * Write handler that parses a cpumask from user space and applies it with
 * tracing_set_cpumask().
 *
 * Returns @count on success, -ENOMEM if the temporary mask cannot be
 * allocated, or the error from parsing or from tracing_set_cpumask().
 */
static ssize_t
tracing_cpumask_write(struct file *filp, const char __user *ubuf,
                      size_t count, loff_t *ppos)
{
        struct trace_array *tr = file_inode(filp)->i_private;
        cpumask_var_t tracing_cpumask_new;
        int err;

        if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
                return -ENOMEM;

        err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
        if (!err)
                err = tracing_set_cpumask(tr, tracing_cpumask_new);

        /* The temporary mask is released on success and failure alike */
        free_cpumask_var(tracing_cpumask_new);

        if (err)
                return err;

        return count;
}

/* File operations for reading and writing a trace_array's tracing cpumask */
static const struct file_operations tracing_cpumask_fops = {
        .open = tracing_open_generic_tr,
        .release = tracing_release_generic_tr,
        .read = tracing_cpumask_read,
        .write = tracing_cpumask_write,
        .llseek = generic_file_llseek,
};

static int tracing_trace_options_show(struct seq_file *m, void *v)
{
        struct tracer_opt *trace_opts;
        struct trace_array *tr = m->private;
        u32 tracer_flags;
        int i;

        mutex_lock(&trace_types_lock);
        tracer_flags = tr->current_trace->flags->val;
        trace_opts = tr->current_trace->flags->opts;

        for (i = 0; trace_options[i]; i++) {
                if (tr->trace_flags & (1 << i))
                        seq_printf(m, "%s\n", trace_options[i]);
                else
                        seq_printf(m, "no%s\n", trace_options[i]);
        }

        for (i = 0; trace_opts[i].name; i++) {
                if (tracer_flags & trace_opts[i].bit)
                        seq_printf(m, "%s\n", trace_opts[i].name);
                else
                        seq_printf(m, "no%s\n", trace_opts[i].name);
        }
        mutex_unlock(&trace_types_lock);

        return 0;
}

/*
 * Ask the tracer's set_flag() callback to accept the change to @opts and,
 * if it returns 0, record the new state of the bit in tracer_flags->val.
 * @neg non-zero means the option is being cleared.
 *
 * Returns 0 on success or the callback's non-zero result.
 */
static int __set_tracer_option(struct trace_array *tr,
                               struct tracer_flags *tracer_flags,
                               struct tracer_opt *opts, int neg)
{
        struct tracer *trace = tracer_flags->trace;
        int ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);

        if (ret)
                return ret;

        if (!neg)
                tracer_flags->val |= opts->bit;
        else
                tracer_flags->val &= ~opts->bit;

        return 0;
}

/* Try to assign a tracer specific option */
static int set_tracer_option(struct trace_array *tr, char *cmp, int neg)
{
        struct tracer *trace = tr->current_trace;
        struct tracer_flags *tracer_flags = trace->flags;
        struct tracer_opt *opts = NULL;
        int i;

        for (i = 0; tracer_flags->opts[i].name; i++) {
                opts = &tracer_flags->opts[i];

                if (strcmp(cmp, opts->name) == 0)
                        return __set_tracer_option(tr, trace->flags, opts, neg);
        }

        return -EINVAL;
}

/*
 * Some tracers require overwrite to stay enabled.
 * Returns -1 when an enabled tracer is asked to clear
 * TRACE_ITER_OVERWRITE, and 0 in every other case.
 */
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
{
        if (!tracer->enabled)
                return 0;

        if (!(mask & TRACE_ITER_OVERWRITE))
                return 0;

        return set ? 0 : -1;
}

/*
 * Set or clear the option @mask in tr->trace_flags and run the follow-up
 * action tied to that option.
 *
 * Returns 0 on success (also when the flag already has the requested
 * state), -EINVAL when the current tracer's flag_changed() callback
 * rejects the change, or -ENOMEM when trace_alloc_tgid_map() fails.
 */
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
        if (mask == TRACE_ITER_RECORD_TGID || mask == TRACE_ITER_RECORD_CMD)
                lockdep_assert_held(&event_mutex);

        /* do nothing if flag is already set */
        if (!(tr->trace_flags & mask) == !enabled)
                return 0;

        /* Give the tracer a chance to approve the change */
        if (tr->current_trace->flag_changed &&
            tr->current_trace->flag_changed(tr, mask, !!enabled))
                return -EINVAL;

        if (!enabled)
                tr->trace_flags &= ~mask;
        else
                tr->trace_flags |= mask;

        if (mask == TRACE_ITER_RECORD_CMD)
                trace_event_enable_cmd_record(enabled);

        if (mask == TRACE_ITER_RECORD_TGID) {
                /* Clear the flag again if the tgid map allocation fails */
                if (trace_alloc_tgid_map() < 0) {
                        tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
                        return -ENOMEM;
                }

                trace_event_enable_tgid_record(enabled);
        }

        if (mask == TRACE_ITER_EVENT_FORK)
                trace_event_follow_fork(tr, enabled);

        if (mask == TRACE_ITER_FUNC_FORK)
                ftrace_pid_follow_fork(tr, enabled);

        if (mask == TRACE_ITER_OVERWRITE) {
                ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled);
#ifdef CONFIG_TRACER_MAX_TRACE
                ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
#endif
        }

        if (mask == TRACE_ITER_PRINTK) {
                trace_printk_start_stop_comm(enabled);
                trace_printk_control(enabled);
        }

        return 0;
}

/*
 * Apply one option string to @tr. A leading "no" clears the option
 * instead of setting it. The name is first looked up in trace_options[];
 * if it is not found there, the current tracer's own options are tried.
 *
 * @option is stripped in place while it is parsed; a trailing whitespace
 * character that stripping replaced with '\0' is turned back into a space
 * before returning.
 *
 * Returns the result of set_tracer_flag() or set_tracer_option().
 */
int trace_set_options(struct trace_array *tr, char *option)
{
        size_t orig_len = strlen(option);
        size_t cur_len;
        char *cmp;
        int neg = 0;
        int idx;
        int len;
        int ret;

        cmp = strstrip(option);

        len = str_has_prefix(cmp, "no");
        if (len)
                neg = 1;

        cmp += len;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        idx = match_string(trace_options, -1, cmp);
        /* If no option could be set, test the specific tracer options */
        if (idx >= 0)
                ret = set_tracer_flag(tr, 1 << idx, !neg);
        else
                ret = set_tracer_option(tr, cmp, neg);

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);

        /*
         * If the first trailing whitespace is replaced with '\0' by strstrip,
         * turn it back into a space.
         */
        cur_len = strlen(option);
        if (cur_len < orig_len)
                option[cur_len] = ' ';

        return ret;
}

/*
 * Split trace_boot_options_buf at commas and apply every non-empty entry
 * to the global trace array. The buffer is restored as the loop goes, so
 * it holds its original content again when the function returns.
 */
static void __init apply_trace_boot_options(void)
{
        char *buf = trace_boot_options_buf;
        char *option;

        for (option = strsep(&buf, ","); option; option = strsep(&buf, ",")) {
                if (*option)
                        trace_set_options(&global_trace, option);

                /* Put back the comma to allow this to be called again */
                if (buf)
                        buf[-1] = ',';
        }
}

/*
 * Write handler that copies one option string (at most 63 bytes) from
 * user space and applies it with trace_set_options().
 *
 * Returns @cnt on success, -EINVAL when the input does not fit the local
 * buffer, -EFAULT when the copy fails, or the error from
 * trace_set_options().
 */
static ssize_t
tracing_trace_options_write(struct file *filp, const char __user *ubuf,
                        size_t cnt, loff_t *ppos)
{
        struct seq_file *m = filp->private_data;
        struct trace_array *tr = m->private;
        char buf[64];
        int ret;

        /* One byte is reserved for the terminating NUL */
        if (cnt >= sizeof(buf))
                return -EINVAL;

        if (copy_from_user(buf, ubuf, cnt))
                return -EFAULT;

        buf[cnt] = '\0';

        ret = trace_set_options(tr, buf);
        if (ret >= 0) {
                *ppos += cnt;
                return cnt;
        }

        return ret;
}

/*
 * Open handler for tracing_iter_fops: updates the trace_array ref count
 * and sets up a single_open() seq_file around
 * tracing_trace_options_show(). The reference is dropped again when
 * single_open() returns a negative value.
 */
static int tracing_trace_options_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        int ret = tracing_check_open_get_tr(tr);

        if (ret)
                return ret;

        ret = single_open(file, tracing_trace_options_show, tr);
        if (ret < 0)
                trace_array_put(tr);

        return ret;
}

/* File operations for listing and changing the trace options */
static const struct file_operations tracing_iter_fops = {
        .open = tracing_trace_options_open,
        .release = tracing_single_release_tr,
        .read = seq_read,
        .write = tracing_trace_options_write,
        .llseek = seq_lseek,
};

/*
 * Help text handed out verbatim by tracing_readme_read().
 * Individual sections are compiled in or out by the CONFIG_* guards
 * below, so the visible text depends on the kernel configuration.
 * NOTE(review): userspace tools may pattern-match this text to detect
 * supported features - confirm before rewording any line.
 */
static const char readme_msg[] =
        "tracing mini-HOWTO:\n\n"
        "# echo 0 > tracing_on : quick way to disable tracing\n"
        "# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
        " Important files:\n"
        "  trace\t\t\t- The static contents of the buffer\n"
        "\t\t\t  To clear the buffer write into this file: echo > trace\n"
        "  trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
        "  current_tracer\t- function and latency tracers\n"
        "  available_tracers\t- list of configured tracers for current_tracer\n"
        "  error_log\t- error log for failed commands (that support it)\n"
        "  buffer_size_kb\t- view and modify size of per cpu buffer\n"
        "  buffer_total_size_kb  - view total size of all cpu buffers\n\n"
        "  trace_clock\t\t- change the clock used to order events\n"
        "       local:   Per cpu clock but may not be synced across CPUs\n"
        "      global:   Synced across CPUs but slows tracing down.\n"
        "     counter:   Not a clock, but just an increment\n"
        "      uptime:   Jiffy counter from time of boot\n"
        "        perf:   Same clock that perf events use\n"
#ifdef CONFIG_X86_64
        "     x86-tsc:   TSC cycle counter\n"
#endif
        "\n  timestamp_mode\t- view the mode used to timestamp events\n"
        "       delta:   Delta difference against a buffer-wide timestamp\n"
        "    absolute:   Absolute (standalone) timestamp\n"
        "\n  trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
        "\n  trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
        "  tracing_cpumask\t- Limit which CPUs to trace\n"
        "  instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
        "\t\t\t  Remove sub-buffer with rmdir\n"
        "  trace_options\t\t- Set format or modify how tracing happens\n"
        "\t\t\t  Disable an option by prefixing 'no' to the\n"
        "\t\t\t  option name\n"
        "  saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
#ifdef CONFIG_DYNAMIC_FTRACE
        "\n  available_filter_functions - list of functions that can be filtered on\n"
        "  set_ftrace_filter\t- echo function name in here to only trace these\n"
        "\t\t\t  functions\n"
        "\t     accepts: func_full_name or glob-matching-pattern\n"
        "\t     modules: Can select a group via module\n"
        "\t      Format: :mod:<module-name>\n"
        "\t     example: echo :mod:ext3 > set_ftrace_filter\n"
        "\t    triggers: a command to perform when function is hit\n"
        "\t      Format: <function>:<trigger>[:count]\n"
        "\t     trigger: traceon, traceoff\n"
        "\t\t      enable_event:<system>:<event>\n"
        "\t\t      disable_event:<system>:<event>\n"
#ifdef CONFIG_STACKTRACE
        "\t\t      stacktrace\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
        "\t\t      snapshot\n"
#endif
        "\t\t      dump\n"
        "\t\t      cpudump\n"
        "\t     example: echo do_fault:traceoff > set_ftrace_filter\n"
        "\t              echo do_trap:traceoff:3 > set_ftrace_filter\n"
        "\t     The first one will disable tracing every time do_fault is hit\n"
        "\t     The second will disable tracing at most 3 times when do_trap is hit\n"
        "\t       The first time do trap is hit and it disables tracing, the\n"
        "\t       counter will decrement to 2. If tracing is already disabled,\n"
        "\t       the counter will not decrement. It only decrements when the\n"
        "\t       trigger did work\n"
        "\t     To remove trigger without count:\n"
        "\t       echo '!<function>:<trigger> > set_ftrace_filter\n"
        "\t     To remove trigger with a count:\n"
        "\t       echo '!<function>:<trigger>:0 > set_ftrace_filter\n"
        "  set_ftrace_notrace\t- echo function name in here to never trace.\n"
        "\t    accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
        "\t    modules: Can select a group via module command :mod:\n"
        "\t    Does not accept triggers\n"
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_TRACER
        "  set_ftrace_pid\t- Write pid(s) to only function trace those pids\n"
        "\t\t    (function)\n"
        "  set_ftrace_notrace_pid\t- Write pid(s) to not function trace those pids\n"
        "\t\t    (function)\n"
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        "  set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
        "  set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n"
        "  max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
        "\n  snapshot\t\t- Like 'trace' but shows the content of the static\n"
        "\t\t\t  snapshot buffer. Read the contents for more\n"
        "\t\t\t  information\n"
#endif
#ifdef CONFIG_STACK_TRACER
        "  stack_trace\t\t- Shows the max stack trace when active\n"
        "  stack_max_size\t- Shows current max stack size that was traced\n"
        "\t\t\t  Write into this file to reset the max size (trigger a\n"
        "\t\t\t  new trace)\n"
#ifdef CONFIG_DYNAMIC_FTRACE
        "  stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n"
        "\t\t\t  traces\n"
#endif
#endif /* CONFIG_STACK_TRACER */
#ifdef CONFIG_DYNAMIC_EVENTS
        "  dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n"
        "\t\t\t  Write into this file to define/undefine new trace events.\n"
#endif
#ifdef CONFIG_KPROBE_EVENTS
        "  kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n"
        "\t\t\t  Write into this file to define/undefine new trace events.\n"
#endif
#ifdef CONFIG_UPROBE_EVENTS
        "  uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n"
        "\t\t\t  Write into this file to define/undefine new trace events.\n"
#endif
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) || \
    defined(CONFIG_FPROBE_EVENTS)
        "\t  accepts: event-definitions (one definition per line)\n"
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
        "\t   Format: p[:[<group>/][<event>]] <place> [<args>]\n"
        "\t           r[maxactive][:[<group>/][<event>]] <place> [<args>]\n"
#endif
#ifdef CONFIG_FPROBE_EVENTS
        "\t           f[:[<group>/][<event>]] <func-name>[%return] [<args>]\n"
        "\t           t[:[<group>/][<event>]] <tracepoint> [<args>]\n"
#endif
#ifdef CONFIG_HIST_TRIGGERS
        "\t           s:[synthetic/]<event> <field> [<field>]\n"
#endif
        "\t           e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>] [if <filter>]\n"
        "\t           -:[<group>/][<event>]\n"
#ifdef CONFIG_KPROBE_EVENTS
        "\t    place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
  "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n"
#endif
#ifdef CONFIG_UPROBE_EVENTS
  "   place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n"
#endif
        "\t     args: <name>=fetcharg[:type]\n"
        "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
        "\t           $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
        "\t           <argname>[->field[->field|.field...]],\n"
#endif
#else
        "\t           $stack<index>, $stack, $retval, $comm,\n"
#endif
        "\t           +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
        "\t     kernel return probes support: $retval, $arg<N>, $comm\n"
        "\t     type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
        "\t           b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
        "\t           symstr, %pd/%pD, <type>\\[<array-size>\\]\n"
#ifdef CONFIG_HIST_TRIGGERS
        "\t    field: <stype> <name>;\n"
        "\t    stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
        "\t           [unsigned] char/int/long\n"
#endif
        "\t    efield: For event probes ('e' types), the field is on of the fields\n"
        "\t            of the <attached-group>/<attached-event>.\n"
#endif
        "  events/\t\t- Directory containing all trace event subsystems:\n"
        "      enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
        "  events/<system>/\t- Directory containing all trace events for <system>:\n"
        "      enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n"
        "\t\t\t  events\n"
        "      filter\t\t- If set, only events passing filter are traced\n"
        "  events/<system>/<event>/\t- Directory containing control files for\n"
        "\t\t\t  <event>:\n"
        "      enable\t\t- Write 0/1 to enable/disable tracing of <event>\n"
        "      filter\t\t- If set, only events passing filter are traced\n"
        "      trigger\t\t- If set, a command to perform when event is hit\n"
        "\t    Format: <trigger>[:count][if <filter>]\n"
        "\t   trigger: traceon, traceoff\n"
        "\t            enable_event:<system>:<event>\n"
        "\t            disable_event:<system>:<event>\n"
#ifdef CONFIG_HIST_TRIGGERS
        "\t            enable_hist:<system>:<event>\n"
        "\t            disable_hist:<system>:<event>\n"
#endif
#ifdef CONFIG_STACKTRACE
        "\t\t    stacktrace\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
        "\t\t    snapshot\n"
#endif
#ifdef CONFIG_HIST_TRIGGERS
        "\t\t    hist (see below)\n"
#endif
        "\t   example: echo traceoff > events/block/block_unplug/trigger\n"
        "\t            echo traceoff:3 > events/block/block_unplug/trigger\n"
        "\t            echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n"
        "\t                  events/block/block_unplug/trigger\n"
        "\t   The first disables tracing every time block_unplug is hit.\n"
        "\t   The second disables tracing the first 3 times block_unplug is hit.\n"
        "\t   The third enables the kmalloc event the first 3 times block_unplug\n"
        "\t     is hit and has value of greater than 1 for the 'nr_rq' event field.\n"
        "\t   Like function triggers, the counter is only decremented if it\n"
        "\t    enabled or disabled tracing.\n"
        "\t   To remove a trigger without a count:\n"
        "\t     echo '!<trigger> > <system>/<event>/trigger\n"
        "\t   To remove a trigger with a count:\n"
        "\t     echo '!<trigger>:0 > <system>/<event>/trigger\n"
        "\t   Filters can be ignored when removing a trigger.\n"
#ifdef CONFIG_HIST_TRIGGERS
        "      hist trigger\t- If set, event hits are aggregated into a hash table\n"
        "\t    Format: hist:keys=<field1[,field2,...]>\n"
        "\t            [:<var1>=<field|var_ref|numeric_literal>[,<var2>=...]]\n"
        "\t            [:values=<field1[,field2,...]>]\n"
        "\t            [:sort=<field1[,field2,...]>]\n"
        "\t            [:size=#entries]\n"
        "\t            [:pause][:continue][:clear]\n"
        "\t            [:name=histname1]\n"
        "\t            [:nohitcount]\n"
        "\t            [:<handler>.<action>]\n"
        "\t            [if <filter>]\n\n"
        "\t    Note, special fields can be used as well:\n"
        "\t            common_timestamp - to record current timestamp\n"
        "\t            common_cpu - to record the CPU the event happened on\n"
        "\n"
        "\t    A hist trigger variable can be:\n"
        "\t        - a reference to a field e.g. x=current_timestamp,\n"
        "\t        - a reference to another variable e.g. y=$x,\n"
        "\t        - a numeric literal: e.g. ms_per_sec=1000,\n"
        "\t        - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n"
        "\n"
        "\t    hist trigger arithmetic expressions support addition(+), subtraction(-),\n"
        "\t    multiplication(*) and division(/) operators. An operand can be either a\n"
        "\t    variable reference, field or numeric literal.\n"
        "\n"
        "\t    When a matching event is hit, an entry is added to a hash\n"
        "\t    table using the key(s) and value(s) named, and the value of a\n"
        "\t    sum called 'hitcount' is incremented.  Keys and values\n"
        "\t    correspond to fields in the event's format description.  Keys\n"
        "\t    can be any field, or the special string 'common_stacktrace'.\n"
        "\t    Compound keys consisting of up to two fields can be specified\n"
        "\t    by the 'keys' keyword.  Values must correspond to numeric\n"
        "\t    fields.  Sort keys consisting of up to two fields can be\n"
        "\t    specified using the 'sort' keyword.  The sort direction can\n"
        "\t    be modified by appending '.descending' or '.ascending' to a\n"
        "\t    sort field.  The 'size' parameter can be used to specify more\n"
        "\t    or fewer than the default 2048 entries for the hashtable size.\n"
        "\t    If a hist trigger is given a name using the 'name' parameter,\n"
        "\t    its histogram data will be shared with other triggers of the\n"
        "\t    same name, and trigger hits will update this common data.\n\n"
        "\t    Reading the 'hist' file for the event will dump the hash\n"
        "\t    table in its entirety to stdout.  If there are multiple hist\n"
        "\t    triggers attached to an event, there will be a table for each\n"
        "\t    trigger in the output.  The table displayed for a named\n"
        "\t    trigger will be the same as any other instance having the\n"
        "\t    same name.  The default format used to display a given field\n"
        "\t    can be modified by appending any of the following modifiers\n"
        "\t    to the field name, as applicable:\n\n"
        "\t            .hex        display a number as a hex value\n"
        "\t            .sym        display an address as a symbol\n"
        "\t            .sym-offset display an address as a symbol and offset\n"
        "\t            .execname   display a common_pid as a program name\n"
        "\t            .syscall    display a syscall id as a syscall name\n"
        "\t            .log2       display log2 value rather than raw number\n"
        "\t            .buckets=size  display values in groups of size rather than raw number\n"
        "\t            .usecs      display a common_timestamp in microseconds\n"
        "\t            .percent    display a number of percentage value\n"
        "\t            .graph      display a bar-graph of a value\n\n"
        "\t    The 'pause' parameter can be used to pause an existing hist\n"
        "\t    trigger or to start a hist trigger but not log any events\n"
        "\t    until told to do so.  'continue' can be used to start or\n"
        "\t    restart a paused hist trigger.\n\n"
        "\t    The 'clear' parameter will clear the contents of a running\n"
        "\t    hist trigger and leave its current paused/active state\n"
        "\t    unchanged.\n\n"
        "\t    The 'nohitcount' (or NOHC) parameter will suppress display of\n"
        "\t    raw hitcount in the histogram.\n\n"
        "\t    The enable_hist and disable_hist triggers can be used to\n"
        "\t    have one event conditionally start and stop another event's\n"
        "\t    already-attached hist trigger.  The syntax is analogous to\n"
        "\t    the enable_event and disable_event triggers.\n\n"
        "\t    Hist trigger handlers and actions are executed whenever a\n"
        "\t    a histogram entry is added or updated.  They take the form:\n\n"
        "\t        <handler>.<action>\n\n"
        "\t    The available handlers are:\n\n"
        "\t        onmatch(matching.event)  - invoke on addition or update\n"
        "\t        onmax(var)               - invoke if var exceeds current max\n"
        "\t        onchange(var)            - invoke action if var changes\n\n"
        "\t    The available actions are:\n\n"
        "\t        trace(<synthetic_event>,param list)  - generate synthetic event\n"
        "\t        save(field,...)                      - save current event fields\n"
#ifdef CONFIG_TRACER_SNAPSHOT
        "\t        snapshot()                           - snapshot the trace buffer\n\n"
#endif
#ifdef CONFIG_SYNTH_EVENTS
        "  events/synthetic_events\t- Create/append/remove/show synthetic events\n"
        "\t  Write into this file to define/undefine new synthetic events.\n"
        "\t     example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n"
#endif
#endif
;

/*
 * Read handler for the README file: hands the requested window of the
 * static readme_msg text to user space via simple_read_from_buffer().
 */
static ssize_t
tracing_readme_read(struct file *filp, char __user *ubuf,
                       size_t cnt, loff_t *ppos)
{
        const size_t msg_len = strlen(readme_msg);

        return simple_read_from_buffer(ubuf, cnt, ppos, readme_msg, msg_len);
}

/* File operations for the README file; no write handler is provided. */
static const struct file_operations tracing_readme_fops = {
        .open   = tracing_open_generic,
        .llseek = generic_file_llseek,
        .read   = tracing_readme_read,
};

#ifdef CONFIG_TRACE_EVAL_MAP_FILE
/*
 * Normalize @ptr so that it refers to a real map entry.
 *
 * An item whose map.eval_string is NULL is treated as the tail of its
 * array.  In that case follow tail.next to the next array and skip that
 * array's head item.  Returns NULL when there is no next array.
 */
static union trace_eval_map_item *
update_eval_map(union trace_eval_map_item *ptr)
{
        union trace_eval_map_item *next_head;

        /* Already on a real entry: nothing to skip. */
        if (ptr->map.eval_string)
                return ptr;

        next_head = ptr->tail.next;
        if (!next_head)
                return NULL;

        /* Step over the head item to the first real entry. */
        return next_head + 1;
}

/*
 * seq_file ->next callback: bump *pos and return the map entry that
 * follows @v, or NULL when the list is exhausted.
 */
static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos)
{
        union trace_eval_map_item *item = v;

        (*pos)++;

        /*
         * Paranoid! If @v points at a tail with nothing after it we must
         * not step past it.  This really should never happen.
         */
        item = update_eval_map(item);
        if (WARN_ON_ONCE(!item))
                return NULL;

        /* Advance one entry, hopping to the next array if we hit a tail. */
        return update_eval_map(item + 1);
}

/*
 * seq_file ->start callback: take trace_eval_mutex and walk forward to the
 * entry at index *pos.  The mutex is left held on return (including when
 * NULL is returned); eval_map_stop() releases it.
 */
static void *eval_map_start(struct seq_file *m, loff_t *pos)
{
        union trace_eval_map_item *item;
        loff_t idx = 0;

        mutex_lock(&trace_eval_mutex);

        item = trace_eval_maps;
        if (!item)
                return NULL;

        /* Skip the head item of the first array. */
        item++;

        while (item && idx < *pos)
                item = eval_map_next(m, item, &idx);

        return item;
}

/*
 * seq_file ->stop callback: releases trace_eval_mutex, which
 * eval_map_start() acquires before walking the eval map list.
 */
static void eval_map_stop(struct seq_file *m, void *v)
{
        mutex_unlock(&trace_eval_mutex);
}

static int eval_map_show(struct seq_file *m, void *v)
{
        union trace_eval_map_item *ptr = v;

        seq_printf(m, "%s %ld (%s)\n",
                   ptr->map.eval_string, ptr->map.eval_value,
                   ptr->map.system);

        return 0;
}

/* seq_file iterator used to walk the eval map list. */
static const struct seq_operations tracing_eval_map_seq_ops = {
        .start  = eval_map_start,
        .stop   = eval_map_stop,
        .next   = eval_map_next,
        .show   = eval_map_show,
};

/*
 * Open handler for the eval map file.  Fails with the error from
 * tracing_check_open_get_tr(NULL) if that check does not pass, otherwise
 * sets up the seq_file iterator.
 */
static int tracing_eval_map_open(struct inode *inode, struct file *filp)
{
        int err = tracing_check_open_get_tr(NULL);

        return err ? err : seq_open(filp, &tracing_eval_map_seq_ops);
}

/* File operations for the eval map file; reading goes through seq_file. */
static const struct file_operations tracing_eval_map_fops = {
        .open    = tracing_eval_map_open,
        .release = seq_release,
        .read    = seq_read,
        .llseek  = seq_lseek,
};

/*
 * Given the head item of an eval map array, return its tail item.  The
 * array is laid out as: head, head.length map entries, tail.
 */
static inline union trace_eval_map_item *
trace_eval_jmp_to_tail(union trace_eval_map_item *ptr)
{
        return &ptr[ptr->head.length + 1];
}

/*
 * Append a copy of the @len eval maps starting at @start to the global
 * trace_eval_maps list so that they show up in the eval map file.
 *
 * Each array on the list holds the maps plus a head and a tail item: the
 * head records the owning module and the number of maps, the tail points
 * to the next array.  On allocation failure a warning is printed and the
 * maps are simply not added.
 */
static void
trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
                           int len)
{
        struct trace_eval_map **stop = start + len;
        struct trace_eval_map **map;
        union trace_eval_map_item *map_array;
        union trace_eval_map_item *slot;
        union trace_eval_map_item *tail;

        /* len maps + one head + one tail */
        map_array = kmalloc_array(len + 2, sizeof(*map_array), GFP_KERNEL);
        if (!map_array) {
                pr_warn("Unable to allocate trace eval mapping\n");
                return;
        }

        mutex_lock(&trace_eval_mutex);

        if (trace_eval_maps) {
                /* Find the tail of the last array and chain the new one. */
                tail = trace_eval_jmp_to_tail(trace_eval_maps);
                while (tail->tail.next)
                        tail = trace_eval_jmp_to_tail(tail->tail.next);
                tail->tail.next = map_array;
        } else {
                trace_eval_maps = map_array;
        }

        map_array->head.mod = mod;
        map_array->head.length = len;

        /* Copy the maps in right after the head. */
        slot = map_array + 1;
        for (map = start; (unsigned long)map < (unsigned long)stop; map++) {
                slot->map = **map;
                slot++;
        }

        /* The zeroed item after the last map is the tail. */
        memset(slot, 0, sizeof(*slot));

        mutex_unlock(&trace_eval_mutex);
}

/*
 * Create the "eval_map" file under @d_tracer with TRACE_MODE_READ
 * permissions, served by tracing_eval_map_fops.  No private data is
 * attached to the file (NULL is passed).
 */
static void trace_create_eval_file(struct dentry *d_tracer)
{
        trace_create_file("eval_map", TRACE_MODE_READ, d_tracer,
                          NULL, &tracing_eval_map_fops);
}

#else /* CONFIG_TRACE_EVAL_MAP_FILE */
/*
 * Without CONFIG_TRACE_EVAL_MAP_FILE there is no eval map file, so both
 * helpers compile down to empty stubs.
 */
static inline void trace_create_eval_file(struct dentry *d_tracer) { }
static inline void trace_insert_eval_map_file(struct module *mod,
                              struct trace_eval_map **start, int len) { }
#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */

/*
 * Register @len eval maps starting at @start on behalf of @mod: update
 * the trace events with them and add them to the eval map file list.
 * Does nothing when @len is not positive.
 */
static void trace_insert_eval_map(struct module *mod,
                                  struct trace_eval_map **start, int len)
{
        if (len <= 0)
                return;

        trace_event_eval_update(start, len);
        trace_insert_eval_map_file(mod, start, len);
}

/*
 * Read handler that reports the name of the instance's current tracer,
 * followed by a newline.  The name is formatted under trace_types_lock.
 */
static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
                       size_t cnt, loff_t *ppos)
{
        char buf[MAX_TRACER_SIZE+2];
        struct trace_array *tr = filp->private_data;
        int len;

        mutex_lock(&trace_types_lock);
        len = sprintf(buf, "%s\n", tr->current_trace->name);
        mutex_unlock(&trace_types_lock);

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}

/*
 * tracer_init - reset the trace buffer and initialize a tracer
 * @t: tracer to initialize; t->init is called unconditionally, so it must
 *     be non-NULL
 * @tr: trace array the tracer is being attached to
 *
 * Resets tr->array_buffer via tracing_reset_online_cpus() and then returns
 * whatever t->init(tr) returns.
 */
int tracer_init(struct tracer *t, struct trace_array *tr)
{
        tracing_reset_online_cpus(&tr->array_buffer);
        return t->init(tr);
}

static void set_buffer_entries(struct array_buffer *buf, unsigned long val)
{
        int cpu;

        for_each_tracing_cpu(cpu)
                per_cpu_ptr(buf->data, cpu)->entries = val;
}

/*
 * Refresh the recorded entries count of @buf from the ring buffer's
 * current size.  For RING_BUFFER_ALL_CPUS the size of CPU 0 is applied to
 * every tracing CPU; otherwise only @cpu is updated.
 */
static void update_buffer_entries(struct array_buffer *buf, int cpu)
{
        if (cpu != RING_BUFFER_ALL_CPUS) {
                per_cpu_ptr(buf->data, cpu)->entries =
                        ring_buffer_size(buf->buffer, cpu);
                return;
        }

        set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0));
}

#ifdef CONFIG_TRACER_MAX_TRACE
/*
 * Resize @trace_buf's ring buffer to the per-CPU entries recorded in
 * @size_buf, for one CPU or for all tracing CPUs.
 *
 * On each successful resize the recorded entries of @trace_buf are
 * updated to match.  In the all-CPUs case the loop stops at the first
 * failure.  Returns the last ring_buffer_resize() result.
 */
static int resize_buffer_duplicate_size(struct array_buffer *trace_buf,
                                        struct array_buffer *size_buf, int cpu_id)
{
        unsigned long entries;
        int cpu, ret = 0;

        if (cpu_id != RING_BUFFER_ALL_CPUS) {
                entries = per_cpu_ptr(size_buf->data, cpu_id)->entries;
                ret = ring_buffer_resize(trace_buf->buffer, entries, cpu_id);
                if (ret == 0)
                        per_cpu_ptr(trace_buf->data, cpu_id)->entries = entries;
                return ret;
        }

        for_each_tracing_cpu(cpu) {
                entries = per_cpu_ptr(size_buf->data, cpu)->entries;
                ret = ring_buffer_resize(trace_buf->buffer, entries, cpu);
                if (ret < 0)
                        break;
                per_cpu_ptr(trace_buf->data, cpu)->entries = entries;
        }

        return ret;
}
#endif /* CONFIG_TRACER_MAX_TRACE */

/*
 * Resize the ring buffer of @tr to @size for @cpu (or for all CPUs when
 * @cpu is RING_BUFFER_ALL_CPUS).
 *
 * Tracing on @tr is stopped for the duration of the resize and restarted
 * before returning.  With CONFIG_TRACER_MAX_TRACE and an allocated
 * snapshot, the max buffer is resized as well; if that fails, the main
 * buffer is resized back to its previously recorded sizes.
 *
 * Returns 0 if the buffers are not allocated yet, otherwise the result of
 * the last ring_buffer_resize() call (negative on failure).  The callers
 * visible in this file hold trace_types_lock around the call.
 */
static int __tracing_resize_ring_buffer(struct trace_array *tr,
                                        unsigned long size, int cpu)
{
        int ret;

        /*
         * If kernel or user changes the size of the ring buffer
         * we use the size that was given, and we can forget about
         * expanding it later.
         */
        trace_set_ring_buffer_expanded(tr);

        /* May be called before buffers are initialized */
        if (!tr->array_buffer.buffer)
                return 0;

        /* Do not allow tracing while resizing ring buffer */
        tracing_stop_tr(tr);

        ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu);
        if (ret < 0)
                goto out_start;

#ifdef CONFIG_TRACER_MAX_TRACE
        /* No snapshot buffer allocated: only the main buffer needs updating */
        if (!tr->allocated_snapshot)
                goto out;

        ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
        if (ret < 0) {
                /*
                 * Roll the main buffer back.  Passing array_buffer as both
                 * arguments is intentional: its recorded per-cpu entries
                 * have not been refreshed yet (update_buffer_entries() is
                 * only called further down), so they still hold the sizes
                 * from before this resize.
                 */
                int r = resize_buffer_duplicate_size(&tr->array_buffer,
                                                     &tr->array_buffer, cpu);
                if (r < 0) {
                        /*
                         * AARGH! We are left with different
                         * size max buffer!!!!
                         * The max buffer is our "snapshot" buffer.
                         * When a tracer needs a snapshot (one of the
                         * latency tracers), it swaps the max buffer
                         * with the saved snap shot. We succeeded to
                         * update the size of the main buffer, but failed to
                         * update the size of the max buffer. But when we tried
                         * to reset the main buffer to the original size, we
                         * failed there too. This is very unlikely to
                         * happen, but if it does, warn and kill all
                         * tracing.
                         */
                        WARN_ON(1);
                        tracing_disabled = 1;
                }
                goto out_start;
        }

        update_buffer_entries(&tr->max_buffer, cpu);

 out:
#endif /* CONFIG_TRACER_MAX_TRACE */

        /* Record the new size of the main buffer */
        update_buffer_entries(&tr->array_buffer, cpu);
 out_start:
        tracing_start_tr(tr);
        return ret;
}

/*
 * Resize the ring buffer of @tr to @size for @cpu_id (or for all CPUs
 * when @cpu_id is RING_BUFFER_ALL_CPUS), under trace_types_lock.
 *
 * Returns -EINVAL if a specific CPU is requested that is not set in
 * tracing_buffer_mask, -ENOMEM if the resize fails, and otherwise the
 * non-negative result of __tracing_resize_ring_buffer().
 */
ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
                                  unsigned long size, int cpu_id)
{
        int ret;

        mutex_lock(&trace_types_lock);

        /* A specific CPU must be enabled in the tracing buffer mask. */
        if (cpu_id != RING_BUFFER_ALL_CPUS &&
            !cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
                ret = -EINVAL;
        } else {
                ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
                if (ret < 0)
                        ret = -ENOMEM;
        }

        mutex_unlock(&trace_types_lock);

        return ret;
}


/**
 * tracing_update_buffers - used by tracing facility to expand ring buffers
 * @tr: The tracing instance
 *
 * To save memory on systems that have tracing configured in but never use
 * it, the ring buffers start out at a minimum size. Once a user starts to
 * use the tracing facility, they need to grow to their default size.
 *
 * This function is to be called when a tracer is about to be used.
 *
 * Return: 0 if the buffers were already expanded, otherwise the result of
 * resizing them to trace_buf_size.
 */
int tracing_update_buffers(struct trace_array *tr)
{
        int ret;

        mutex_lock(&trace_types_lock);
        if (tr->ring_buffer_expanded)
                ret = 0;
        else
                ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
                                                   RING_BUFFER_ALL_CPUS);
        mutex_unlock(&trace_types_lock);

        return ret;
}

/* Forward declarations; add_tracer_options() below calls create_trace_option_files(). */
struct trace_option_dentry;

static void
create_trace_option_files(struct trace_array *tr, struct tracer *tracer);

/*
 * Used to clear out the tracer before deletion of an instance.
 * Must have trace_types_lock held.
 */
static void tracing_set_nop(struct trace_array *tr)
{
        if (tr->current_trace == &nop_trace)
                return;

        tr->current_trace->enabled--;

        if (tr->current_trace->reset)
                tr->current_trace->reset(tr);

        tr->current_trace = &nop_trace;
}

/* Gate checked by add_tracer_options(); it is not set anywhere in this section. */
static bool tracer_options_updated;

/*
 * Create the option files of tracer @t for instance @tr, but only once
 * the instance directory exists and update_tracer_options has finished.
 */
static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
        if (!tr->dir || !tracer_options_updated)
                return;

        create_trace_option_files(tr, t);
}

/*
 * tracing_set_tracer - make the tracer named @buf the current tracer of @tr
 * @tr: trace array (instance) to switch
 * @buf: name of a registered tracer
 *
 * Expands the ring buffer first if that has not happened yet, then tears
 * down the current tracer and initializes the new one, all under
 * trace_types_lock.
 *
 * Returns 0 on success (including when the tracer is already current, or
 * when a noboot tracer is ignored during boot), -EINVAL if no tracer has
 * that name or it is not allowed for @tr, -EBUSY if a conditional snapshot
 * is in use or trace pipe readers hold a reference, or the error from the
 * buffer resize, snapshot arming or tracer init.
 *
 * Note that once the old tracer has been reset, a later failure leaves
 * @tr with nop_trace as its current tracer.
 */
int tracing_set_tracer(struct trace_array *tr, const char *buf)
{
        struct tracer *t;
#ifdef CONFIG_TRACER_MAX_TRACE
        bool had_max_tr;
#endif
        int ret = 0;

        mutex_lock(&trace_types_lock);

        if (!tr->ring_buffer_expanded) {
                ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
                                                RING_BUFFER_ALL_CPUS);
                if (ret < 0)
                        goto out;
                ret = 0;
        }

        /* Look the tracer up by name in the list of registered tracers */
        for (t = trace_types; t; t = t->next) {
                if (strcmp(t->name, buf) == 0)
                        break;
        }
        if (!t) {
                ret = -EINVAL;
                goto out;
        }
        /* Nothing to do if it is already the current tracer */
        if (t == tr->current_trace)
                goto out;

#ifdef CONFIG_TRACER_SNAPSHOT
        /* A tracer using the max buffer cannot start while a cond snapshot is set */
        if (t->use_max_tr) {
                local_irq_disable();
                arch_spin_lock(&tr->max_lock);
                if (tr->cond_snapshot)
                        ret = -EBUSY;
                arch_spin_unlock(&tr->max_lock);
                local_irq_enable();
                if (ret)
                        goto out;
        }
#endif
        /* Some tracers won't work on kernel command line */
        if (system_state < SYSTEM_RUNNING && t->noboot) {
                pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
                        t->name);
                goto out;
        }

        /* Some tracers are only allowed for the top level buffer */
        if (!trace_ok_for_array(t, tr)) {
                ret = -EINVAL;
                goto out;
        }

        /* If trace pipe files are being read, we can't change the tracer */
        if (tr->trace_ref) {
                ret = -EBUSY;
                goto out;
        }

        trace_branch_disable();

        /* Point of no return: tear down the current tracer */
        tr->current_trace->enabled--;

        if (tr->current_trace->reset)
                tr->current_trace->reset(tr);

#ifdef CONFIG_TRACER_MAX_TRACE
        had_max_tr = tr->current_trace->use_max_tr;

        /* Current trace needs to be nop_trace before synchronize_rcu */
        tr->current_trace = &nop_trace;

        if (had_max_tr && !t->use_max_tr) {
                /*
                 * We need to make sure that the update_max_tr sees that
                 * current_trace changed to nop_trace to keep it from
                 * swapping the buffers after we resize it.
                 * The update_max_tr is called from interrupts disabled
                 * so waiting for a grace period with synchronize_rcu()
                 * is sufficient.
                 */
                synchronize_rcu();
                free_snapshot(tr);
                tracing_disarm_snapshot(tr);
        }

        if (!had_max_tr && t->use_max_tr) {
                ret = tracing_arm_snapshot_locked(tr);
                if (ret)
                        goto out;
        }
#else
        tr->current_trace = &nop_trace;
#endif

        if (t->init) {
                ret = tracer_init(t, tr);
                if (ret) {
#ifdef CONFIG_TRACER_MAX_TRACE
                        /* Undo the snapshot arming done for the new tracer */
                        if (t->use_max_tr)
                                tracing_disarm_snapshot(tr);
#endif
                        goto out;
                }
        }

        tr->current_trace = t;
        tr->current_trace->enabled++;
        trace_branch_enable(tr);
 out:
        mutex_unlock(&trace_types_lock);

        return ret;
}

/*
 * Write handler that selects the current tracer by name.
 *
 * At most MAX_TRACER_SIZE bytes of the user buffer are looked at; the name
 * is stripped of surrounding whitespace before being handed to
 * tracing_set_tracer().  On success the full original write length is
 * consumed and returned.
 */
static ssize_t
tracing_set_trace_write(struct file *filp, const char __user *ubuf,
                        size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        char buf[MAX_TRACER_SIZE+1];
        size_t ret = cnt;       /* report the whole write as consumed */
        int err;

        if (cnt > MAX_TRACER_SIZE)
                cnt = MAX_TRACER_SIZE;

        if (copy_from_user(buf, ubuf, cnt))
                return -EFAULT;

        buf[cnt] = 0;

        err = tracing_set_tracer(tr, strim(buf));
        if (err)
                return err;

        *ppos += ret;

        return ret;
}

/*
 * Common read helper for values kept in nanoseconds: formats *@ptr, converted
 * with nsecs_to_usecs(), as a decimal number followed by a newline and copies
 * it to user space.  The special value (unsigned long)-1 is printed as "-1"
 * rather than being converted.
 */
static ssize_t
tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
                   size_t cnt, loff_t *ppos)
{
        char buf[64];
        int r;

        r = snprintf(buf, sizeof(buf), "%ld\n",
                     *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));
        /* snprintf() returns the untruncated length; never read past buf */
        if (r > sizeof(buf))
                r = sizeof(buf);
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}

/*
 * Common write helper for values kept in nanoseconds: parses a base-10
 * unsigned number from user space, multiplies it by 1000 and stores the
 * result in *@ptr.  Returns @cnt on success or the parse error.
 */
static ssize_t
tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
                    size_t cnt, loff_t *ppos)
{
        unsigned long usecs;
        int err;

        err = kstrtoul_from_user(ubuf, cnt, 10, &usecs);
        if (err)
                return err;

        *ptr = usecs * 1000;

        return cnt;
}

/* Read handler reporting the global tracing_thresh via tracing_nsecs_read(). */
static ssize_t
tracing_thresh_read(struct file *filp, char __user *ubuf,
                    size_t cnt, loff_t *ppos)
{
        return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos);
}

/*
 * Write handler for the global tracing_thresh.  After storing the new
 * value, the current tracer's update_thresh callback (if any) is invoked.
 * Everything runs under trace_types_lock.
 *
 * Returns @cnt on success, or the negative error from parsing or from the
 * update_thresh callback.
 */
static ssize_t
tracing_thresh_write(struct file *filp, const char __user *ubuf,
                     size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        int ret;

        mutex_lock(&trace_types_lock);

        ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
        if (ret >= 0) {
                if (tr->current_trace->update_thresh)
                        ret = tr->current_trace->update_thresh(tr);
                if (ret >= 0)
                        ret = cnt;
        }

        mutex_unlock(&trace_types_lock);

        return ret;
}

#ifdef CONFIG_TRACER_MAX_TRACE

/* Read handler reporting the instance's max_latency via tracing_nsecs_read(). */
static ssize_t
tracing_max_lat_read(struct file *filp, char __user *ubuf,
                     size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        unsigned long *max_lat = &tr->max_latency;

        return tracing_nsecs_read(max_lat, ubuf, cnt, ppos);
}

/* Write handler updating the instance's max_latency via tracing_nsecs_write(). */
static ssize_t
tracing_max_lat_write(struct file *filp, const char __user *ubuf,
                      size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        unsigned long *max_lat = &tr->max_latency;

        return tracing_nsecs_write(max_lat, ubuf, cnt, ppos);
}

#endif

/*
 * Claim the trace pipe for @cpu in tr->pipe_cpumask.
 *
 * RING_BUFFER_ALL_CPUS can only be claimed while no CPU is claimed, and it
 * then marks every CPU.  A single CPU can only be claimed while its bit is
 * clear.  Returns 0 on success, -EBUSY if the pipe is already claimed.
 */
static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
{
        if (cpu == RING_BUFFER_ALL_CPUS) {
                if (!cpumask_empty(tr->pipe_cpumask))
                        return -EBUSY;

                cpumask_setall(tr->pipe_cpumask);
                return 0;
        }

        if (cpumask_test_cpu(cpu, tr->pipe_cpumask))
                return -EBUSY;

        cpumask_set_cpu(cpu, tr->pipe_cpumask);
        return 0;
}

/*
 * Release the claim taken by open_pipe_on_cpu() for @cpu.  Warns if the
 * mask does not show the claim being released.
 */
static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
{
        if (cpu != RING_BUFFER_ALL_CPUS) {
                WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
                cpumask_clear_cpu(cpu, tr->pipe_cpumask);
                return;
        }

        WARN_ON(!cpumask_full(tr->pipe_cpumask));
        cpumask_clear(tr->pipe_cpumask);
}

/*
 * Open handler for the consuming trace pipe.
 *
 * Claims the pipe for the CPU encoded in @inode, allocates and sets up a
 * trace_iterator stored in filp->private_data, and takes a trace_ref on
 * the instance (tracing_set_tracer() refuses to switch tracers while
 * trace_ref is non-zero).
 *
 * Returns 0 on success, the error from tracing_check_open_get_tr(),
 * -EBUSY if the pipe is already claimed for that CPU, or -ENOMEM.
 */
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
        struct trace_array *tr = inode->i_private;
        struct trace_iterator *iter;
        int cpu;
        int ret;

        ret = tracing_check_open_get_tr(tr);
        if (ret)
                return ret;

        mutex_lock(&trace_types_lock);
        cpu = tracing_get_cpu(inode);
        ret = open_pipe_on_cpu(tr, cpu);
        if (ret)
                goto fail_pipe_on_cpu;

        /* create a buffer to store the information to pass to userspace */
        iter = kzalloc(sizeof(*iter), GFP_KERNEL);
        if (!iter) {
                ret = -ENOMEM;
                goto fail_alloc_iter;
        }

        trace_seq_init(&iter->seq);
        iter->trace = tr->current_trace;

        if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
                ret = -ENOMEM;
                goto fail;
        }

        /* trace pipe does not show start of buffer */
        cpumask_setall(iter->started);

        if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
                iter->iter_flags |= TRACE_FILE_LAT_FMT;

        /* Output in nanoseconds only if we are using a clock in nanoseconds. */
        if (trace_clocks[tr->clock_id].in_ns)
                iter->iter_flags |= TRACE_FILE_TIME_IN_NS;

        iter->tr = tr;
        iter->array_buffer = &tr->array_buffer;
        iter->cpu_file = cpu;
        mutex_init(&iter->mutex);
        filp->private_data = iter;

        if (iter->trace->pipe_open)
                iter->trace->pipe_open(iter);

        nonseekable_open(inode, filp);

        tr->trace_ref++;

        mutex_unlock(&trace_types_lock);
        /* ret is still 0 here from the successful open_pipe_on_cpu() */
        return ret;

        /* Error unwind: undo the steps above in reverse order */
fail:
        kfree(iter);
fail_alloc_iter:
        close_pipe_on_cpu(tr, cpu);
fail_pipe_on_cpu:
        /* presumably drops the reference taken by tracing_check_open_get_tr() */
        __trace_array_put(tr);
        mutex_unlock(&trace_types_lock);
        return ret;
}

/*
 * Release handler for the trace pipe: undoes tracing_open_pipe().
 *
 * Under trace_types_lock it drops the trace_ref, lets the tracer run its
 * pipe_close callback and releases the per-CPU pipe claim.  The iterator
 * is then freed and the reference on the trace array is put.
 * Always returns 0.
 */
static int tracing_release_pipe(struct inode *inode, struct file *file)
{
        struct trace_iterator *iter = file->private_data;
        struct trace_array *tr = inode->i_private;

        mutex_lock(&trace_types_lock);

        tr->trace_ref--;

        if (iter->trace->pipe_close)
                iter->trace->pipe_close(iter);
        close_pipe_on_cpu(tr, iter->cpu_file);
        mutex_unlock(&trace_types_lock);

        /* Free what the iterator owns before freeing the iterator itself */
        free_trace_iter_content(iter);
        kfree(iter);

        trace_array_put(tr);

        return 0;
}

/*
 * Poll helper for trace files.
 *
 * Reports the file as readable right away when the iterator has a static
 * buffer iterator for its CPU file, or when the instance is in blocking
 * mode (TRACE_ITER_BLOCK).  Otherwise defers to the ring buffer's poll
 * wait using the instance's buffer_percent.
 */
static __poll_t
trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
{
        struct trace_array *tr = iter->tr;

        /* Iterators are static, they should be filled or empty */
        if (trace_buffer_iter(iter, iter->cpu_file))
                return EPOLLIN | EPOLLRDNORM;

        /* Always select as readable when in blocking mode */
        if (tr->trace_flags & TRACE_ITER_BLOCK)
                return EPOLLIN | EPOLLRDNORM;

        return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file,
                                     filp, poll_table, tr->buffer_percent);
}

/* Poll handler for the trace pipe: polls the iterator stored in the file. */
static __poll_t
tracing_poll_pipe(struct file *filp, poll_table *poll_table)
{
        return trace_poll(filp->private_data, filp, poll_table);
}

/*
 * Wait until the trace pipe has something to read.
 * Must be called with iter->mutex held; the mutex is dropped while
 * sleeping and re-taken before returning.
 *
 * Returns 1 when the caller should go on and read (data is available, or
 * tracing was turned off after something had been read), -EAGAIN for an
 * empty pipe on a non-blocking file, or the error from wait_on_pipe().
 */
static int tracing_wait_pipe(struct file *filp)
{
        struct trace_iterator *iter = filp->private_data;
        int err;

        while (trace_empty(iter)) {

                if (filp->f_flags & O_NONBLOCK)
                        return -EAGAIN;

                /*
                 * We block until we read something and tracing is disabled.
                 * We still block if tracing is disabled, but we have never
                 * read anything. This allows a user to cat this file, and
                 * then enable tracing. But after we have read something,
                 * we give an EOF when tracing is again disabled.
                 *
                 * iter->pos will be 0 if we haven't read anything.
                 */
                if (!tracer_tracing_is_on(iter->tr) && iter->pos)
                        return 1;

                mutex_unlock(&iter->mutex);
                err = wait_on_pipe(iter, 0);
                mutex_lock(&iter->mutex);

                if (err)
                        return err;
        }

        return 1;
}

/*
 * Consumer reader.
 *
 * Read handler for the trace pipe.  Entries are formatted into iter->seq
 * and consumed from the ring buffer as they are printed.  Data left in
 * iter->seq by a previous call is returned first.
 *
 * Returns the number of bytes copied to @ubuf, 0 when the pipe is still
 * empty after tracing_wait_pipe() says to proceed, or a negative error.
 */
static ssize_t
tracing_read_pipe(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
{
        struct trace_iterator *iter = filp->private_data;
        ssize_t sret;

        /*
         * Avoid more than one consumer on a single file descriptor
         * This is just a matter of traces coherency, the ring buffer itself
         * is protected.
         */
        mutex_lock(&iter->mutex);

        /* return any leftover data */
        sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
        if (sret != -EBUSY)
                goto out;

        trace_seq_init(&iter->seq);

        /* Give the tracer's own read callback the first shot */
        if (iter->trace->read) {
                sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
                if (sret)
                        goto out;
        }

waitagain:
        sret = tracing_wait_pipe(filp);
        if (sret <= 0)
                goto out;

        /* stop when tracing is finished */
        if (trace_empty(iter)) {
                sret = 0;
                goto out;
        }

        /* Never ask for more than fits in the trace_seq buffer */
        if (cnt >= TRACE_SEQ_BUFFER_SIZE)
                cnt = TRACE_SEQ_BUFFER_SIZE - 1;

        /* reset all but tr, trace, and overruns */
        trace_iterator_reset(iter);
        cpumask_clear(iter->started);
        trace_seq_init(&iter->seq);

        trace_event_read_lock();
        trace_access_lock(iter->cpu_file);
        while (trace_find_next_entry_inc(iter) != NULL) {
                enum print_line_t ret;
                int save_len = iter->seq.seq.len;

                ret = print_trace_line(iter);
                if (ret == TRACE_TYPE_PARTIAL_LINE) {
                        /*
                         * If a single print_trace_line() fills the entire
                         * trace_seq in one shot, trace_seq_to_user() will
                         * return -EBUSY because save_len == 0.  In this case
                         * we need to consume the event; otherwise the loop
                         * would peek at the same event again next time,
                         * resulting in an infinite loop.
                         */
                        if (save_len == 0) {
                                iter->seq.full = 0;
                                trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
                                trace_consume(iter);
                                break;
                        }

                        /* In other cases, don't print partial lines */
                        iter->seq.seq.len = save_len;
                        break;
                }
                if (ret != TRACE_TYPE_NO_CONSUME)
                        trace_consume(iter);

                /* Enough output collected to satisfy this read */
                if (trace_seq_used(&iter->seq) >= cnt)
                        break;

                /*
                 * Setting the full flag means we reached the trace_seq buffer
                 * size and we should leave by partial output condition above.
                 * One of the trace_seq_* functions is not used properly.
                 */
                WARN_ONCE(iter->seq.full, "full flag set for trace type %d",
                          iter->ent->type);
        }
        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();

        /* Now copy what we have to the user */
        sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
        /* Everything handed out: start the next read with an empty seq */
        if (iter->seq.readpos >= trace_seq_used(&iter->seq))
                trace_seq_init(&iter->seq);

        /*
         * If there was nothing to send to user, in spite of consuming trace
         * entries, go back to wait for more entries.
         */
        if (sret == -EBUSY)
                goto waitagain;

out:
        mutex_unlock(&iter->mutex);

        return sret;
}

/*
 * spd_release callback used by tracing_splice_read_pipe(): frees the page
 * at index @idx of the splice descriptor.
 */
static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
                                     unsigned int idx)
{
        __free_page(spd->pages[idx]);
}

/*
 * Format trace entries into iter->seq until the seq overflows, the
 * remaining byte budget @rem is used up, or there are no more entries.
 * Entries that are fully printed are consumed from the ring buffer.
 *
 * Returns the budget that is left; 0 tells the caller to stop filling
 * pages (budget exhausted or no further entry).
 */
static size_t
tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
{
        size_t count;
        int save_len;
        int ret;

        /*
         * Seq buffer is page-sized, exactly what we need.
         * NOTE(review): the read path in this file clamps against
         * TRACE_SEQ_BUFFER_SIZE rather than PAGE_SIZE - confirm that the
         * two are really the same size.
         */
        for (;;) {
                save_len = iter->seq.seq.len;
                ret = print_trace_line(iter);

                /* Line did not fit: drop it and leave it for the next page */
                if (trace_seq_has_overflowed(&iter->seq)) {
                        iter->seq.seq.len = save_len;
                        break;
                }

                /*
                 * This should not be hit, because it should only
                 * be set if the iter->seq overflowed. But check it
                 * anyway to be safe.
                 */
                if (ret == TRACE_TYPE_PARTIAL_LINE) {
                        iter->seq.seq.len = save_len;
                        break;
                }

                /* Bytes this line added; drop the line if over budget */
                count = trace_seq_used(&iter->seq) - save_len;
                if (rem < count) {
                        rem = 0;
                        iter->seq.seq.len = save_len;
                        break;
                }

                if (ret != TRACE_TYPE_NO_CONSUME)
                        trace_consume(iter);
                rem -= count;
                if (!trace_find_next_entry_inc(iter))        {
                        rem = 0;
                        iter->ent = NULL;
                        break;
                }
        }

        return rem;
}

/*
 * Splice handler for the trace pipe.
 *
 * Formats trace entries page by page (up to @len bytes and at most
 * spd.nr_pages_max pages) and hands the filled pages to @pipe.
 *
 * Returns the result of splice_to_pipe() when at least one page was
 * filled, 0 when none was, -ENOMEM if the splice descriptor cannot be
 * grown, -EFAULT if no entry can be found after waiting, or the result of
 * the tracer's splice_read callback / tracing_wait_pipe() when those end
 * the call early.
 */
static ssize_t tracing_splice_read_pipe(struct file *filp,
                                        loff_t *ppos,
                                        struct pipe_inode_info *pipe,
                                        size_t len,
                                        unsigned int flags)
{
        struct page *pages_def[PIPE_DEF_BUFFERS];
        struct partial_page partial_def[PIPE_DEF_BUFFERS];
        struct trace_iterator *iter = filp->private_data;
        struct splice_pipe_desc spd = {
                .pages                = pages_def,
                .partial        = partial_def,
                .nr_pages        = 0, /* This gets updated below. */
                .nr_pages_max        = PIPE_DEF_BUFFERS,
                .ops                = &default_pipe_buf_ops,
                .spd_release        = tracing_spd_release_pipe,
        };
        ssize_t ret;
        size_t rem;
        unsigned int i;

        if (splice_grow_spd(pipe, &spd))
                return -ENOMEM;

        mutex_lock(&iter->mutex);

        /* Give the tracer's own splice_read callback the first shot */
        if (iter->trace->splice_read) {
                ret = iter->trace->splice_read(iter, filp,
                                               ppos, pipe, len, flags);
                if (ret)
                        goto out_err;
        }

        ret = tracing_wait_pipe(filp);
        if (ret <= 0)
                goto out_err;

        if (!iter->ent && !trace_find_next_entry_inc(iter)) {
                ret = -EFAULT;
                goto out_err;
        }

        trace_event_read_lock();
        trace_access_lock(iter->cpu_file);

        /* Fill as many pages as possible. */
        for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
                spd.pages[i] = alloc_page(GFP_KERNEL);
                if (!spd.pages[i])
                        break;

                rem = tracing_fill_pipe_page(rem, iter);

                /*
                 * Copy the data into the page, so we can start over.
                 * The destination is a single page, so never copy more
                 * than PAGE_SIZE bytes even if the trace_seq holds more;
                 * otherwise the copy would run past the end of the page.
                 */
                ret = trace_seq_to_buffer(&iter->seq,
                                          page_address(spd.pages[i]),
                                          min((size_t)trace_seq_used(&iter->seq),
                                              (size_t)PAGE_SIZE));
                if (ret < 0) {
                        __free_page(spd.pages[i]);
                        break;
                }
                spd.partial[i].offset = 0;
                /* Advertise what was actually copied into the page. */
                spd.partial[i].len = ret;

                trace_seq_init(&iter->seq);
        }

        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();
        mutex_unlock(&iter->mutex);

        spd.nr_pages = i;

        if (i)
                ret = splice_to_pipe(pipe, &spd);
        else
                ret = 0;
out:
        splice_shrink_spd(&spd);
        return ret;

out_err:
        mutex_unlock(&iter->mutex);
        goto out;
}

/*
 * Read handler reporting the ring buffer size in KB for the CPU selected by
 * the inode.  For RING_BUFFER_ALL_CPUS a single number is printed only when
 * every tracing CPU has the same size; otherwise "X" is printed.  While the
 * buffer has not been expanded yet, the size it would expand to
 * (trace_buf_size) is shown as well.
 */
static ssize_t
tracing_entries_read(struct file *filp, char __user *ubuf,
                     size_t cnt, loff_t *ppos)
{
        struct inode *inode = file_inode(filp);
        struct trace_array *tr = inode->i_private;
        int cpu = tracing_get_cpu(inode);
        char buf[64];
        int r = 0;

        mutex_lock(&trace_types_lock);

        if (cpu != RING_BUFFER_ALL_CPUS) {
                r = sprintf(buf, "%lu\n",
                            per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10);
        } else {
                unsigned long size = 0;
                bool uniform = true;
                int c;

                /* Compare every tracing CPU against the first size seen. */
                for_each_tracing_cpu(c) {
                        if (!size)
                                size = per_cpu_ptr(tr->array_buffer.data, c)->entries;
                        if (per_cpu_ptr(tr->array_buffer.data, c)->entries != size) {
                                uniform = false;
                                break;
                        }
                }

                if (!uniform)
                        r = sprintf(buf, "X\n");
                else if (tr->ring_buffer_expanded)
                        r = sprintf(buf, "%lu\n", size >> 10);
                else
                        r = sprintf(buf, "%lu (expanded: %lu)\n",
                                    size >> 10, trace_buf_size >> 10);
        }

        mutex_unlock(&trace_types_lock);

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}

/*
 * Write handler that resizes the ring buffer of the CPU selected by the
 * inode.  The user supplies the new size as a decimal number of KB; zero is
 * rejected with -EINVAL.  Returns @cnt on success or a negative errno.
 */
static ssize_t
tracing_entries_write(struct file *filp, const char __user *ubuf,
                      size_t cnt, loff_t *ppos)
{
        struct inode *inode = file_inode(filp);
        struct trace_array *tr = inode->i_private;
        unsigned long kb;
        int err;

        err = kstrtoul_from_user(ubuf, cnt, 10, &kb);
        if (err)
                return err;

        /* A size of zero is not acceptable here. */
        if (kb == 0)
                return -EINVAL;

        /* The resize helper takes bytes, the user wrote KB. */
        err = tracing_resize_ring_buffer(tr, kb << 10, tracing_get_cpu(inode));
        if (err < 0)
                return err;

        *ppos += cnt;
        return cnt;
}

/*
 * Read handler reporting the sum, in KB, of the ring buffer sizes of all
 * tracing CPUs.  While the buffer has not been expanded, the total it would
 * expand to (trace_buf_size per CPU) is appended.
 */
static ssize_t
tracing_total_entries_read(struct file *filp, char __user *ubuf,
                                size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        unsigned long total_kb = 0, expanded_kb = 0;
        char buf[64];
        int cpu, len;

        mutex_lock(&trace_types_lock);

        for_each_tracing_cpu(cpu) {
                total_kb += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10;
                if (!tr->ring_buffer_expanded)
                        expanded_kb += trace_buf_size >> 10;
        }

        if (!tr->ring_buffer_expanded)
                len = sprintf(buf, "%lu (expanded: %lu)\n", total_kb, expanded_kb);
        else
                len = sprintf(buf, "%lu\n", total_kb);

        mutex_unlock(&trace_types_lock);

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}

/*
 * Write handler that accepts and discards anything written.  The data is
 * never inspected; the handler only exists so that tools such as "echo" do
 * not see an error.  The file position is advanced by @cnt and @cnt is
 * returned.
 */
static ssize_t
tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
                          size_t cnt, loff_t *ppos)
{
        *ppos += cnt;
        return cnt;
}

/*
 * Release handler: optionally turns tracing off (when the
 * TRACE_ITER_STOP_ON_FREE flag is set), shrinks the ring buffer of all CPUs
 * to size 0 and drops the trace_array reference.  Always returns 0; the
 * result of the resize is not checked.
 */
static int
tracing_free_buffer_release(struct inode *inode, struct file *filp)
{
        struct trace_array *tr = inode->i_private;

        /* Stop tracing first if the instance asked for it. */
        if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) {
                tracer_tracing_off(tr);
        }

        /* Shrink every per-CPU buffer down to nothing. */
        tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);

        trace_array_put(tr);
        return 0;
}

#define TRACE_MARKER_MAX_SIZE                4096

/*
 * tracing_mark_write - record a user supplied string as a TRACE_PRINT event
 * @filp: open file; ->private_data is the trace_array to write into
 * @ubuf: user buffer holding the marker text
 * @cnt:  number of bytes in @ubuf (clamped to TRACE_MARKER_MAX_SIZE)
 * @fpos: file position (not used)
 *
 * The text is copied straight into a reserved ring buffer event.  A trailing
 * '\n' is appended when the text does not already end in one, and the string
 * is always '\0' terminated.  If the user copy faults, "<faulted>" is
 * recorded instead and -EFAULT is returned.
 *
 * Returns the number of bytes recorded, -EINVAL when tracing or markers are
 * disabled or @cnt is negative when viewed as ssize_t, -EBADF when no event
 * could be reserved, or -EFAULT when copying from user space failed.
 */
static ssize_t
tracing_mark_write(struct file *filp, const char __user *ubuf,
                                        size_t cnt, loff_t *fpos)
{
        struct trace_array *tr = filp->private_data;
        struct ring_buffer_event *event;
        enum event_trigger_type tt = ETT_NONE;
        struct trace_buffer *buffer;
        struct print_entry *entry;
        int meta_size;
        ssize_t written;
        size_t size;
        int len;

/* Used in tracing_mark_raw_write() as well */
#define FAULTED_STR "<faulted>"
#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */

        if (tracing_disabled)
                return -EINVAL;

        if (!(tr->trace_flags & TRACE_ITER_MARKERS))
                return -EINVAL;

        if ((ssize_t)cnt < 0)
                return -EINVAL;

        if (cnt > TRACE_MARKER_MAX_SIZE)
                cnt = TRACE_MARKER_MAX_SIZE;

        meta_size = sizeof(*entry) + 2;  /* add '\0' and possible '\n' */
 again:
        size = cnt + meta_size;

        /* If less than "<faulted>", then make sure we can still add that */
        if (cnt < FAULTED_SIZE)
                size += FAULTED_SIZE - cnt;

        buffer = tr->array_buffer.buffer;
        event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
                                            tracing_gen_ctx());
        if (unlikely(!event)) {
                /*
                 * If the size was greater than what was allowed, then
                 * make it smaller and try again.
                 */
                if (size > ring_buffer_max_event_size(buffer)) {
                        /* cnt < FAULTED size should never be bigger than max */
                        if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
                                return -EBADF;
                        cnt = ring_buffer_max_event_size(buffer) - meta_size;
                        /* The above should only happen once */
                        if (WARN_ON_ONCE(cnt + meta_size == size))
                                return -EBADF;
                        goto again;
                }

                /* Ring buffer disabled, return as if not open for write */
                return -EBADF;
        }

        entry = ring_buffer_event_data(event);
        entry->ip = _THIS_IP_;

        len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
        if (len) {
                /* The copy faulted: record the placeholder text instead. */
                memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
                cnt = FAULTED_SIZE;
                written = -EFAULT;
        } else
                written = cnt;

        if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
                /* do not add \n before testing triggers, but add \0 */
                entry->buf[cnt] = '\0';
                tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event);
        }

        /*
         * Append '\n' unless the text already ends in one.  A zero length
         * write has no last character, so do not look at buf[cnt - 1] then;
         * the reserved size always leaves room for "\n\0".
         */
        if (!cnt || entry->buf[cnt - 1] != '\n') {
                entry->buf[cnt] = '\n';
                entry->buf[cnt + 1] = '\0';
        } else
                entry->buf[cnt] = '\0';

        if (static_branch_unlikely(&trace_marker_exports_enabled))
                ftrace_exports(event, TRACE_EXPORT_MARKER);
        __buffer_unlock_commit(buffer, event);

        if (tt)
                event_triggers_post_call(tr->trace_marker_file, tt);

        return written;
}

/*
 * Write handler that records a binary marker as a TRACE_RAW_DATA event.
 * The user data is copied starting at entry->id, so it must be at least
 * sizeof(unsigned int) bytes long.
 *
 * Returns @cnt on success, -EINVAL when tracing or markers are disabled,
 * the data is too short or the event would exceed the maximum event size,
 * -EBADF when no event could be reserved, or -EFAULT when the user copy
 * failed (an id of -1 followed by "<faulted>" is recorded in that case).
 */
static ssize_t
tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
                                        size_t cnt, loff_t *fpos)
{
        struct trace_array *tr = filp->private_data;
        struct ring_buffer_event *event;
        struct trace_buffer *buffer;
        struct raw_data_entry *entry;
        ssize_t written;
        int size;
        int len;

/* Room needed to record the fault placeholder: the text plus an int id */
#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))

        if (tracing_disabled)
                return -EINVAL;

        if (!(tr->trace_flags & TRACE_ITER_MARKERS))
                return -EINVAL;

        /* The marker must at least have a tag id */
        if (cnt < sizeof(unsigned int))
                return -EINVAL;

        size = sizeof(*entry) + cnt;
        /* Make sure the fault placeholder fits even for a short write */
        if (cnt < FAULT_SIZE_ID)
                size += FAULT_SIZE_ID - cnt;

        buffer = tr->array_buffer.buffer;

        if (size > ring_buffer_max_event_size(buffer))
                return -EINVAL;

        event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
                                            tracing_gen_ctx());
        if (!event)
                /* Ring buffer disabled, return as if not open for write */
                return -EBADF;

        entry = ring_buffer_event_data(event);

        /* The user data supplies both the id and the payload after it */
        len = __copy_from_user_inatomic(&entry->id, ubuf, cnt);
        if (len) {
                /* Copy faulted: still commit an event, marked with id -1 */
                entry->id = -1;
                memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
                written = -EFAULT;
        } else
                written = cnt;

        __buffer_unlock_commit(buffer, event);

        return written;
}

/*
 * seq_file show callback: prints the names of all entries of trace_clocks[]
 * on one line, separated by spaces, with the clock selected by tr->clock_id
 * enclosed in square brackets.
 */
static int tracing_clock_show(struct seq_file *m, void *v)
{
        struct trace_array *tr = m->private;
        int i;

        for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
                const char *sep = i ? " " : "";
                const char *open = "";
                const char *close = "";

                if (i == tr->clock_id) {
                        open = "[";
                        close = "]";
                }

                seq_printf(m, "%s%s%s%s", sep, open, trace_clocks[i].name, close);
        }
        seq_putc(m, '\n');

        return 0;
}

/*
 * Select the trace clock named @clockstr for @tr.
 *
 * The name is looked up in trace_clocks[]; -EINVAL is returned when it is
 * not found.  On success the clock is installed on the ring buffer(s) and
 * the buffers are reset, all under trace_types_lock.  Returns 0 on success.
 */
int tracing_set_clock(struct trace_array *tr, const char *clockstr)
{
        int idx = -1;
        int i;

        for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
                if (!strcmp(trace_clocks[i].name, clockstr)) {
                        idx = i;
                        break;
                }
        }
        if (idx < 0)
                return -EINVAL;

        mutex_lock(&trace_types_lock);

        tr->clock_id = idx;

        ring_buffer_set_clock(tr->array_buffer.buffer, trace_clocks[idx].func);

        /*
         * Timestamps taken with the old clock cannot be compared with ones
         * from the new clock, so throw the old contents away.
         */
        tracing_reset_online_cpus(&tr->array_buffer);

#ifdef CONFIG_TRACER_MAX_TRACE
        if (tr->max_buffer.buffer)
                ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[idx].func);
        tracing_reset_online_cpus(&tr->max_buffer);
#endif

        mutex_unlock(&trace_types_lock);

        return 0;
}

/*
 * Write handler that selects a trace clock by name.  The input must be
 * shorter than 64 bytes; surrounding whitespace is stripped before the name
 * is handed to tracing_set_clock().  Returns @cnt on success, -EINVAL for
 * over-long input, -EFAULT on a failed user copy, or the error returned by
 * tracing_set_clock().
 */
static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
                                   size_t cnt, loff_t *fpos)
{
        struct seq_file *m = filp->private_data;
        struct trace_array *tr = m->private;
        char name[64];
        int err;

        /* Leave room for the terminating NUL. */
        if (cnt >= sizeof(name))
                return -EINVAL;

        if (copy_from_user(name, ubuf, cnt))
                return -EFAULT;
        name[cnt] = 0;

        err = tracing_set_clock(tr, strstrip(name));
        if (err)
                return err;

        *fpos += cnt;
        return cnt;
}

/*
 * Open handler: takes a reference on the trace_array and sets up a
 * single-record seq_file backed by tracing_clock_show().  The reference is
 * dropped again if single_open() fails.
 */
static int tracing_clock_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        int err;

        err = tracing_check_open_get_tr(tr);
        if (err)
                return err;

        err = single_open(file, tracing_clock_show, inode->i_private);
        if (err >= 0)
                return err;

        trace_array_put(tr);
        return err;
}

/*
 * seq_file show callback: prints both timestamp modes ("delta" and
 * "absolute") with the one reported by ring_buffer_time_stamp_abs() in
 * square brackets.  The query is made under trace_types_lock.
 */
static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
{
        struct trace_array *tr = m->private;

        mutex_lock(&trace_types_lock);

        if (!ring_buffer_time_stamp_abs(tr->array_buffer.buffer))
                seq_puts(m, "[delta] absolute\n");
        else
                seq_puts(m, "delta [absolute]\n");

        mutex_unlock(&trace_types_lock);

        return 0;
}

/*
 * Open handler: takes a reference on the trace_array and sets up a
 * single-record seq_file backed by tracing_time_stamp_mode_show().  The
 * reference is dropped again if single_open() fails.
 */
static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        int err;

        err = tracing_check_open_get_tr(tr);
        if (err)
                return err;

        err = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
        if (err >= 0)
                return err;

        trace_array_put(tr);
        return err;
}

/*
 * Return a timestamp for @rbe.  When @rbe is this CPU's trace_buffered_event
 * the current time of @buffer is used instead of asking the ring buffer for
 * the event's own timestamp.
 */
u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe)
{
        if (rbe != this_cpu_read(trace_buffered_event))
                return ring_buffer_event_time_stamp(buffer, rbe);

        return ring_buffer_time_stamp(buffer);
}

/*
 * Take (@set true) or drop (@set false) a reference on
 * tr->no_filter_buffering_ref, under trace_types_lock.
 *
 * Returns 0 on success.  Dropping a reference when the count is already
 * zero triggers a one-time warning and returns -EINVAL.
 */
int tracing_set_filter_buffering(struct trace_array *tr, bool set)
{
        int ret = 0;

        mutex_lock(&trace_types_lock);

        if (set) {
                tr->no_filter_buffering_ref++;
        } else if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) {
                ret = -EINVAL;
        } else {
                --tr->no_filter_buffering_ref;
        }

        mutex_unlock(&trace_types_lock);

        return ret;
}

/*
 * Per-open state stored in filp->private_data by tracing_buffers_open()
 * (snapshot_raw_open() reads it back from there).
 *
 * @iter is the embedded trace iterator; snapshot_raw_open() points its
 * array_buffer at the max buffer and marks it as a snapshot.
 * NOTE(review): the remaining fields are not used in this part of the file;
 * presumably they describe a spare read page (its CPU and size) and how far
 * into it the reader has got -- confirm against tracing_buffers_read().
 */
struct ftrace_buffer_info {
        struct trace_iterator        iter;
        void                        *spare;
        unsigned int                spare_cpu;
        unsigned int                spare_size;
        unsigned int                read;
};

#ifdef CONFIG_TRACER_SNAPSHOT
/*
 * Open handler for the snapshot file.
 *
 * Readers get a full iterator from __tracing_open().  Write-only opens get a
 * stub seq_file whose ->private is a zeroed trace_iterator carrying only the
 * trace_array, the max buffer and the CPU of the file, which is all
 * tracing_snapshot_write() needs.  The trace_array reference taken at the
 * start is dropped again on any failure.
 */
static int tracing_snapshot_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        struct trace_iterator *iter;
        struct seq_file *m;
        int ret;

        ret = tracing_check_open_get_tr(tr);
        if (ret)
                return ret;

        if (file->f_mode & FMODE_READ) {
                iter = __tracing_open(inode, file, true);
                if (IS_ERR(iter))
                        ret = PTR_ERR(iter);
                goto out;
        }

        /* Writes still need the seq_file to hold the private data */
        m = kzalloc(sizeof(*m), GFP_KERNEL);
        if (!m) {
                ret = -ENOMEM;
                goto out;
        }

        iter = kzalloc(sizeof(*iter), GFP_KERNEL);
        if (!iter) {
                kfree(m);
                ret = -ENOMEM;
                goto out;
        }

        iter->tr = tr;
        iter->array_buffer = &tr->max_buffer;
        iter->cpu_file = tracing_get_cpu(inode);
        m->private = iter;
        file->private_data = m;
        ret = 0;
out:
        if (ret < 0)
                trace_array_put(tr);

        return ret;
}

/*
 * Callback passed to smp_call_function_single() by tracing_snapshot_write():
 * calls update_max_tr_single() for the CPU it runs on.  @tr is the
 * trace_array, passed as an opaque pointer.
 */
static void tracing_swap_cpu_buffer(void *tr)
{
        struct trace_array *array = tr;

        update_max_tr_single(array, current, smp_processor_id());
}

/*
 * Write handler for the snapshot file.  The user writes a decimal number:
 *
 *   0     - free the snapshot buffer (only valid on the all-CPUs file)
 *   1     - make sure the snapshot buffer is sized like the main buffer,
 *           arm it, and swap it with the main buffer (a per-CPU swap is
 *           only allowed with CONFIG_RING_BUFFER_ALLOW_SWAP)
 *   other - clear the snapshot buffer contents if it is allocated
 *
 * Returns @cnt on success, -EBUSY when the current tracer uses the max
 * buffer or a conditional snapshot is active, -EINVAL for a request that is
 * not valid on a per-CPU file, or another negative errno from the helpers.
 */
static ssize_t
tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
                       loff_t *ppos)
{
        struct seq_file *m = filp->private_data;
        struct trace_iterator *iter = m->private;
        struct trace_array *tr = iter->tr;
        unsigned long val;
        int ret;

        ret = tracing_update_buffers(tr);
        if (ret < 0)
                return ret;

        ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
        if (ret)
                return ret;

        mutex_lock(&trace_types_lock);

        if (tr->current_trace->use_max_tr) {
                ret = -EBUSY;
                goto out;
        }

        /* tr->cond_snapshot is checked under max_lock with IRQs off */
        local_irq_disable();
        arch_spin_lock(&tr->max_lock);
        if (tr->cond_snapshot)
                ret = -EBUSY;
        arch_spin_unlock(&tr->max_lock);
        local_irq_enable();
        if (ret)
                goto out;

        switch (val) {
        case 0:
                if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
                        ret = -EINVAL;
                        break;
                }
                if (tr->allocated_snapshot)
                        free_snapshot(tr);
                break;
        case 1:
/* Only allow per-cpu swap if the ring buffer supports it */
#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
                if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
                        ret = -EINVAL;
                        break;
                }
#endif
                if (tr->allocated_snapshot) {
                        ret = resize_buffer_duplicate_size(&tr->max_buffer,
                                        &tr->array_buffer, iter->cpu_file);
                        /*
                         * Do not swap when the resize failed; previously
                         * this result was overwritten below and lost.
                         */
                        if (ret < 0)
                                break;
                }

                ret = tracing_arm_snapshot_locked(tr);
                if (ret)
                        break;

                /* Now, we're going to swap */
                if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
                        local_irq_disable();
                        update_max_tr(tr, current, smp_processor_id(), NULL);
                        local_irq_enable();
                } else {
                        smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer,
                                                 (void *)tr, 1);
                }
                tracing_disarm_snapshot(tr);
                break;
        default:
                if (tr->allocated_snapshot) {
                        if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
                                tracing_reset_online_cpus(&tr->max_buffer);
                        else
                                tracing_reset_cpu(&tr->max_buffer, iter->cpu_file);
                }
                break;
        }

        if (ret >= 0) {
                *ppos += cnt;
                ret = cnt;
        }
out:
        mutex_unlock(&trace_types_lock);
        return ret;
}

/*
 * Release handler for the snapshot file.  tracing_release() is always
 * called; for write-only opens the stub seq_file and the iterator allocated
 * by tracing_snapshot_open() are freed here as well.
 */
static int tracing_snapshot_release(struct inode *inode, struct file *file)
{
        /* Grab the pointer before tracing_release() runs. */
        struct seq_file *stub = file->private_data;
        int ret = tracing_release(inode, file);

        if (file->f_mode & FMODE_READ)
                return ret;

        /* If write only, the seq_file is just a stub */
        if (stub) {
                kfree(stub->private);
                kfree(stub);
        }

        return 0;
}

static int tracing_buffers_open(struct inode *inode, struct file *filp);
static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
                                    size_t count, loff_t *ppos);
static int tracing_buffers_release(struct inode *inode, struct file *file);
static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                   struct pipe_inode_info *pipe, size_t len, unsigned int flags);

/*
 * Open handler for the raw snapshot file.  Reuses tracing_buffers_open() and
 * then redirects the iterator to the max (snapshot) buffer.  Fails with
 * -EBUSY when the iterator's tracer uses the max buffer itself.
 */
static int snapshot_raw_open(struct inode *inode, struct file *filp)
{
        struct ftrace_buffer_info *info;
        int ret;

        /* The following checks for tracefs lockdown */
        ret = tracing_buffers_open(inode, filp);
        if (ret < 0)
                return ret;

        info = filp->private_data;

        if (!info->iter.trace->use_max_tr) {
                info->iter.snapshot = true;
                info->iter.array_buffer = &info->iter.tr->max_buffer;
                return ret;
        }

        /* Undo the open: the tracer owns the max buffer. */
        tracing_buffers_release(inode, filp);
        return -EBUSY;
}

#endif /* CONFIG_TRACER_SNAPSHOT */


/* File operations wiring up tracing_thresh_read()/tracing_thresh_write(). */
static const struct file_operations tracing_thresh_fops = {
        .open   = tracing_open_generic,
        .llseek = generic_file_llseek,
        .read   = tracing_thresh_read,
        .write  = tracing_thresh_write,
};

#ifdef CONFIG_TRACER_MAX_TRACE
/* File operations wiring up tracing_max_lat_read()/tracing_max_lat_write(). */
static const struct file_operations tracing_max_lat_fops = {
        .open    = tracing_open_generic_tr,
        .release = tracing_release_generic_tr,
        .llseek  = generic_file_llseek,
        .read    = tracing_max_lat_read,
        .write   = tracing_max_lat_write,
};
#endif

/* File operations wiring up tracing_set_trace_read()/tracing_set_trace_write(). */
static const struct file_operations set_tracer_fops = {
        .open    = tracing_open_generic_tr,
        .release = tracing_release_generic_tr,
        .llseek  = generic_file_llseek,
        .read    = tracing_set_trace_read,
        .write   = tracing_set_trace_write,
};

/* File operations for the pipe-style trace reader (read, poll and splice). */
static const struct file_operations tracing_pipe_fops = {
        .open        = tracing_open_pipe,
        .release     = tracing_release_pipe,
        .llseek      = no_llseek,
        .read        = tracing_read_pipe,
        .splice_read = tracing_splice_read_pipe,
        .poll        = tracing_poll_pipe,
};

/* File operations wiring up tracing_entries_read()/tracing_entries_write(). */
static const struct file_operations tracing_entries_fops = {
        .open    = tracing_open_generic_tr,
        .release = tracing_release_generic_tr,
        .llseek  = generic_file_llseek,
        .read    = tracing_entries_read,
        .write   = tracing_entries_write,
};

/* Read-only file operations wiring up tracing_total_entries_read(). */
static const struct file_operations tracing_total_entries_fops = {
        .open    = tracing_open_generic_tr,
        .release = tracing_release_generic_tr,
        .llseek  = generic_file_llseek,
        .read    = tracing_total_entries_read,
};

/*
 * Write-only file operations; the real work happens on close, in
 * tracing_free_buffer_release().
 */
static const struct file_operations tracing_free_buffer_fops = {
        .open    = tracing_open_generic_tr,
        .release = tracing_free_buffer_release,
        .write   = tracing_free_buffer_write,
};

/* Write-only file operations wiring up tracing_mark_write(). */
static const struct file_operations tracing_mark_fops = {
        .open    = tracing_mark_open,
        .release = tracing_release_generic_tr,
        .write   = tracing_mark_write,
};

/* Write-only file operations wiring up tracing_mark_raw_write(). */
static const struct file_operations tracing_mark_raw_fops = {
        .open    = tracing_mark_open,
        .release = tracing_release_generic_tr,
        .write   = tracing_mark_raw_write,
};

/*
 * File operations for the trace clock selector: seq_file based reads
 * (see tracing_clock_open()) plus tracing_clock_write().
 */
static const struct file_operations trace_clock_fops = {
        .open    = tracing_clock_open,
        .release = tracing_single_release_tr,
        .llseek  = seq_lseek,
        .read    = seq_read,
        .write   = tracing_clock_write,
};

/* Read-only seq_file operations; see tracing_time_stamp_mode_open(). */
static const struct file_operations trace_time_stamp_mode_fops = {
        .open    = tracing_time_stamp_mode_open,
        .release = tracing_single_release_tr,
        .llseek  = seq_lseek,
        .read    = seq_read,
};

#ifdef CONFIG_TRACER_SNAPSHOT
/*
 * File operations for the snapshot file: seq_file reads plus
 * tracing_snapshot_write().
 */
static const struct file_operations snapshot_fops = {
        .open    = tracing_snapshot_open,
        .release = tracing_snapshot_release,
        .llseek  = tracing_lseek,
        .read    = seq_read,
        .write   = tracing_snapshot_write,
};

/*
 * File operations for the raw snapshot file: the tracing_buffers_*
 * handlers, opened through snapshot_raw_open().
 */
static const struct file_operations snapshot_raw_fops = {
        .open        = snapshot_raw_open,
        .release     = tracing_buffers_release,
        .llseek      = no_llseek,
        .read        = tracing_buffers_read,
        .splice_read = tracing_buffers_splice_read,
};

#endif /* CONFIG_TRACER_SNAPSHOT */

/*
 * trace_min_max_write - Write a u64 value to a trace_min_max_param struct
 * @filp: The active open file structure
 * @ubuf: The userspace provided buffer holding the value to write
 * @cnt: The number of bytes available in @ubuf
 * @ppos: The current "file" position
 *
 * Write interface for a struct trace_min_max_param.  filp->private_data
 * must point to such a structure; it says where the value is stored, the
 * optional minimum and maximum it must lie between, and an optional lock
 * taken around the check and the store.
 *
 * Returns @cnt on success, -EFAULT when there is no parameter structure,
 * -EINVAL when the value is out of range, or the parse error.
 */
static ssize_t
trace_min_max_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos)
{
        struct trace_min_max_param *param = filp->private_data;
        u64 val;
        int err;

        if (!param)
                return -EFAULT;

        err = kstrtoull_from_user(ubuf, cnt, 10, &val);
        if (err)
                return err;

        if (param->lock)
                mutex_lock(param->lock);

        /* Range check and store happen together under the lock. */
        if ((param->min && val < *param->min) ||
            (param->max && val > *param->max))
                err = -EINVAL;
        else
                *param->val = val;

        if (param->lock)
                mutex_unlock(param->lock);

        return err ? err : cnt;
}

/*
 * trace_min_max_read - Read a u64 value from a trace_min_max_param struct
 * @filp: The active open file structure
 * @ubuf: The userspace provided buffer to copy the value into
 * @cnt: The maximum number of bytes to read
 * @ppos: The current "file" position
 *
 * Read interface for a struct trace_min_max_param.  filp->private_data must
 * point to such a structure with valid data; the value is printed as a
 * decimal number followed by a newline.  Returns -EFAULT when there is no
 * parameter structure.
 */
static ssize_t
trace_min_max_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
        struct trace_min_max_param *param = filp->private_data;
        char text[U64_STR_SIZE];
        u64 val;
        int len;

        if (!param)
                return -EFAULT;

        val = *param->val;

        /* Never hand out more than the local buffer can hold. */
        if (cnt > sizeof(text))
                cnt = sizeof(text);

        len = snprintf(text, sizeof(text), "%llu\n", val);

        return simple_read_from_buffer(ubuf, cnt, ppos, text, len);
}

/* File operations wiring up trace_min_max_read()/trace_min_max_write(). */
const struct file_operations trace_min_max_fops = {
        .open  = tracing_open_generic,
        .read  = trace_min_max_read,
        .write = trace_min_max_write,
};

#define TRACING_LOG_ERRS_MAX        8
#define TRACING_LOG_LOC_MAX        128

#define CMD_PREFIX "  Command: "

/*
 * What went wrong for one error log entry.  The text itself is not stored:
 * errs[type] is looked up when the entry is shown.
 */
struct err_info {
        const char        **errs;        /* ptr to loc-specific array of err strings */
        u8                type;        /* index into errs -> specific err string */
        u16                pos;        /* caret position */
        u64                ts;        /* local_clock() value taken when the error was logged */
};

/*
 * One entry of a trace_array's error log.  Entries are linked on
 * tr->err_log through @list; @cmd is allocated separately and freed by
 * free_tracing_log_err().
 */
struct tracing_log_err {
        struct list_head        list;
        struct err_info                info;
        char                        loc[TRACING_LOG_LOC_MAX]; /* err location */
        char                        *cmd;                     /* what caused err */
};

static DEFINE_MUTEX(tracing_err_log_lock);

/*
 * Allocate a zeroed error log entry together with a zeroed command buffer
 * of @len bytes.  Returns the entry, or ERR_PTR(-ENOMEM) if either
 * allocation fails.
 */
static struct tracing_log_err *alloc_tracing_log_err(int len)
{
        struct tracing_log_err *entry = kzalloc(sizeof(*entry), GFP_KERNEL);

        if (!entry)
                return ERR_PTR(-ENOMEM);

        entry->cmd = kzalloc(len, GFP_KERNEL);
        if (entry->cmd)
                return entry;

        kfree(entry);
        return ERR_PTR(-ENOMEM);
}

/* Free an error log entry and the command buffer it owns. */
static void free_tracing_log_err(struct tracing_log_err *err)
{
        char *cmd = err->cmd;

        kfree(cmd);
        kfree(err);
}

/*
 * Get an error log entry with a fresh command buffer of @len bytes.
 *
 * While fewer than TRACING_LOG_ERRS_MAX entries exist a new one is
 * allocated and counted.  After that the first entry on tr->err_log is
 * recycled: it is unlinked and its command buffer replaced.  In both cases
 * the returned entry is not on the list.  Returns ERR_PTR(-ENOMEM) on
 * allocation failure.
 */
static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr,
                                                   int len)
{
        struct tracing_log_err *err;
        char *cmd;

        if (tr->n_err_log_entries >= TRACING_LOG_ERRS_MAX) {
                /* Log is full: allocate first so failure leaves it intact. */
                cmd = kzalloc(len, GFP_KERNEL);
                if (!cmd)
                        return ERR_PTR(-ENOMEM);

                err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
                kfree(err->cmd);
                err->cmd = cmd;
                list_del(&err->list);

                return err;
        }

        err = alloc_tracing_log_err(len);
        if (!IS_ERR(err))
                tr->n_err_log_entries++;

        return err;
}

/**
 * err_pos - find the position of a string within a command for error careting
 * @cmd: The tracing command that caused the error
 * @str: The string to position the caret at within @cmd
 *
 * Looks up the first occurrence of @str inside @cmd.  The result is meant
 * to be handed to tracing_log_err() as the caret position.  An empty @cmd
 * triggers a warning.
 *
 * Returns the index within @cmd of the first occurrence of @str or 0
 * if @str was not found.
 */
unsigned int err_pos(char *cmd, const char *str)
{
        char *hit;

        if (WARN_ON(!strlen(cmd)))
                return 0;

        hit = strstr(cmd, str);

        return hit ? hit - cmd : 0;
}

/**
 * tracing_log_err - write an error to the tracing error log
 * @tr: The associated trace array for the error (NULL for top level array)
 * @loc: A string describing where the error occurred
 * @cmd: The tracing command that caused the error
 * @errs: The array of loc-specific static error strings
 * @type: The index into errs[], which produces the specific static err string
 * @pos: The position the caret should be placed in the cmd
 *
 * Writes an error into tracing/error_log of the form:
 *
 * <loc>: error: <text>
 *   Command: <cmd>
 *              ^
 *
 * tracing/error_log is a small log file holding the last
 * TRACING_LOG_ERRS_MAX errors (8).  No memory is allocated for it until a
 * tracing error actually happens.  The log can be cleared, and its memory
 * freed, by writing the empty string to it in truncation mode, i.e.
 * echo > tracing/error_log.
 *
 * NOTE: @errs and @type together select a static error string.  Only the
 * pointer is stored, the string is not copied, so it must stay valid.  See
 * existing callers for examples of how such static strings are typically
 * defined for use with tracing_log_err().
 *
 * If memory for the entry cannot be allocated the error is silently dropped.
 */
void tracing_log_err(struct trace_array *tr,
                     const char *loc, const char *cmd,
                     const char **errs, u8 type, u16 pos)
{
        struct tracing_log_err *err;
        int len;

        if (!tr)
                tr = &global_trace;

        /* Room for "\n" CMD_PREFIX <cmd> "\n" and the terminator. */
        len = sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;

        mutex_lock(&tracing_err_log_lock);

        err = get_tracing_log_err(tr, len);
        if (IS_ERR(err))
                goto unlock;

        snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
        snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);

        err->info.errs = errs;
        err->info.type = type;
        err->info.pos = pos;
        err->info.ts = local_clock();

        list_add_tail(&err->list, &tr->err_log);
unlock:
        mutex_unlock(&tracing_err_log_lock);
}

/*
 * Unlink and free every entry on tr->err_log and reset the entry count,
 * all under tracing_err_log_lock.
 */
static void clear_tracing_err_log(struct trace_array *tr)
{
        struct tracing_log_err *cur, *tmp;

        mutex_lock(&tracing_err_log_lock);

        list_for_each_entry_safe(cur, tmp, &tr->err_log, list) {
                list_del(&cur->list);
                free_tracing_log_err(cur);
        }
        tr->n_err_log_entries = 0;

        mutex_unlock(&tracing_err_log_lock);
}

/*
 * seq_file start callback: takes tracing_err_log_lock (released by the stop
 * callback) and positions the iteration at *pos within tr->err_log.
 */
static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos)
{
        struct trace_array *tr = m->private;
        struct list_head *log = &tr->err_log;

        mutex_lock(&tracing_err_log_lock);

        return seq_list_start(log, *pos);
}

/* seq_file next callback: advance to the following entry of tr->err_log. */
static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct trace_array *tr = m->private;
        struct list_head *log = &tr->err_log;

        return seq_list_next(v, log, pos);
}

/*
 * seq_file stop callback: releases the lock taken by
 * tracing_err_log_seq_start().
 */
static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
{
        mutex_unlock(&tracing_err_log_lock);
}

/*
 * Print the caret line for an error: enough spaces to skip CMD_PREFIX plus
 * @pos characters of the command, then "^" and a newline.
 */
static void tracing_err_log_show_pos(struct seq_file *m, u16 pos)
{
        unsigned int pad = sizeof(CMD_PREFIX) - 1 + pos;

        while (pad--)
                seq_putc(m, ' ');

        seq_puts(m, "^\n");
}

/*
 * seq_file show callback: prints one error log entry as a timestamp
 * (seconds.microseconds), the location, the error text selected by
 * errs[type], the stored command and finally the caret line.  A NULL entry
 * prints nothing.
 */
static int tracing_err_log_seq_show(struct seq_file *m, void *v)
{
        struct tracing_log_err *err = v;
        const char *err_text;
        u64 sec;
        u32 nsec;

        if (!err)
                return 0;

        err_text = err->info.errs[err->info.type];

        /* do_div() leaves the quotient in sec and returns the remainder. */
        sec = err->info.ts;
        nsec = do_div(sec, NSEC_PER_SEC);

        seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000,
                   err->loc, err_text);
        seq_printf(m, "%s", err->cmd);
        tracing_err_log_show_pos(m, err->info.pos);

        return 0;
}

/*
 * seq_file iterator over a trace array's error log. ->start takes
 * tracing_err_log_lock and ->stop releases it.
 */
static const struct seq_operations tracing_err_log_seq_ops = {
        .start  = tracing_err_log_seq_start,
        .next   = tracing_err_log_seq_next,
        .stop   = tracing_err_log_seq_stop,
        .show   = tracing_err_log_seq_show
};

/*
 * Open handler for the error log file.
 *
 * Takes a reference on the trace array via tracing_check_open_get_tr().
 * Opening for write with O_TRUNC clears the log. Opening for read sets up
 * the seq_file iterator; if that fails the reference is dropped again.
 *
 * Returns 0 on success or a negative errno.
 */
static int tracing_err_log_open(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;
        struct seq_file *m;
        int ret;

        ret = tracing_check_open_get_tr(tr);
        if (ret)
                return ret;

        /* If this file was opened for write, then erase contents */
        if ((file->f_flags & O_TRUNC) && (file->f_mode & FMODE_WRITE))
                clear_tracing_err_log(tr);

        /* Write-only opens need no iterator */
        if (!(file->f_mode & FMODE_READ))
                return 0;

        ret = seq_open(file, &tracing_err_log_seq_ops);
        if (ret) {
                trace_array_put(tr);
                return ret;
        }

        m = file->private_data;
        m->private = tr;

        return 0;
}

/*
 * Write handler for the error log file: the data is ignored and the full
 * @count is reported as written. Clearing of the log happens at open time
 * (see tracing_err_log_open()), not here.
 */
static ssize_t tracing_err_log_write(struct file *file,
                                     const char __user *buffer,
                                     size_t count, loff_t *ppos)
{
        return count;
}

/*
 * Release handler for the error log file: drop the trace array reference
 * held by this open file and, if the file was opened for reading, release
 * the seq_file state set up in tracing_err_log_open(). Always returns 0.
 */
static int tracing_err_log_release(struct inode *inode, struct file *file)
{
        struct trace_array *tr = inode->i_private;

        trace_array_put(tr);

        /* seq_open() was only called for readers */
        if (file->f_mode & FMODE_READ)
                seq_release(inode, file);

        return 0;
}

/*
 * File operations for the error log file. Reads go through seq_file;
 * writes are accepted but discarded by tracing_err_log_write().
 */
static const struct file_operations tracing_err_log_fops = {
        .open           = tracing_err_log_open,
        .write                = tracing_err_log_write,
        .read           = seq_read,
        .llseek         = tracing_lseek,
        .release        = tracing_err_log_release,
};

/*
 * Open handler for the raw ring buffer file.
 *
 * Takes a reference on the trace array, allocates the per-open
 * ftrace_buffer_info, fills in its iterator under trace_types_lock and
 * raises tr->trace_ref. The info is stored in filp->private_data.
 *
 * Returns 0 on success or a negative errno.
 */
static int tracing_buffers_open(struct inode *inode, struct file *filp)
{
        struct trace_array *tr = inode->i_private;
        struct ftrace_buffer_info *info;
        int ret;

        ret = tracing_check_open_get_tr(tr);
        if (ret)
                return ret;

        info = kvzalloc(sizeof(*info), GFP_KERNEL);
        if (!info) {
                /* Undo the reference taken above */
                trace_array_put(tr);
                return -ENOMEM;
        }

        mutex_lock(&trace_types_lock);

        info->iter.tr                = tr;
        info->iter.cpu_file        = tracing_get_cpu(inode);
        info->iter.trace        = tr->current_trace;
        info->iter.array_buffer = &tr->array_buffer;
        info->spare                = NULL;
        /* Force reading ring buffer for first read */
        info->read                = (unsigned int)-1;

        filp->private_data = info;

        tr->trace_ref++;

        mutex_unlock(&trace_types_lock);

        ret = nonseekable_open(inode, filp);
        /*
         * NOTE(review): this failure path only drops the trace array
         * reference; info stays attached to filp and tr->trace_ref stays
         * raised. Confirm nonseekable_open() cannot fail here.
         */
        if (ret < 0)
                trace_array_put(tr);

        return ret;
}

/* ->poll handler: defer to trace_poll() on this open file's iterator. */
static __poll_t
tracing_buffers_poll(struct file *filp, poll_table *poll_table)
{
        struct ftrace_buffer_info *info = filp->private_data;

        return trace_poll(&info->iter, filp, poll_table);
}

/*
 * Read raw ring buffer data for this file's cpu into @ubuf.
 *
 * Data is pulled from the ring buffer into the per-open spare page
 * (info->spare) and copied to user space from there. info->read records how
 * much of the spare page has already been handed out, so one sub-buffer may
 * be consumed across several read() calls.
 *
 * Returns the number of bytes copied. Returns 0 when @count is 0, when the
 * page read failed although the buffer is not empty, or when the buffer is
 * empty and the file has been flushed (iter->closed). Returns -EAGAIN for a
 * non-blocking read of an empty buffer, -EBUSY when a snapshot file is read
 * while the current tracer uses the max buffer, -EFAULT when nothing could
 * be copied to user space, or the error from the spare page allocation or
 * from wait_on_pipe().
 */
static ssize_t
tracing_buffers_read(struct file *filp, char __user *ubuf,
                     size_t count, loff_t *ppos)
{
        struct ftrace_buffer_info *info = filp->private_data;
        struct trace_iterator *iter = &info->iter;
        void *trace_data;
        int page_size;
        ssize_t ret = 0;
        ssize_t size;

        if (!count)
                return 0;

#ifdef CONFIG_TRACER_MAX_TRACE
        if (iter->snapshot && iter->tr->current_trace->use_max_tr)
                return -EBUSY;
#endif

        page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);

        /* Make sure the spare matches the current sub buffer size */
        if (info->spare) {
                if (page_size != info->spare_size) {
                        ring_buffer_free_read_page(iter->array_buffer->buffer,
                                                   info->spare_cpu, info->spare);
                        info->spare = NULL;
                }
        }

        if (!info->spare) {
                info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
                                                          iter->cpu_file);
                if (IS_ERR(info->spare)) {
                        ret = PTR_ERR(info->spare);
                        info->spare = NULL;
                } else {
                        info->spare_cpu = iter->cpu_file;
                        info->spare_size = page_size;
                }
        }
        if (!info->spare)
                return ret;

        /* Do we have previous read data to read? */
        if (info->read < page_size)
                goto read;

 again:
        trace_access_lock(iter->cpu_file);
        ret = ring_buffer_read_page(iter->array_buffer->buffer,
                                    info->spare,
                                    count,
                                    iter->cpu_file, 0);
        trace_access_unlock(iter->cpu_file);

        if (ret < 0) {
                /*
                 * Only wait for more data while the file is still open.
                 * tracing_buffers_flush() sets iter->closed and wakes the
                 * waiters; a reader that finds the buffer empty after that
                 * must return instead of going back to wait.
                 */
                if (trace_empty(iter) && !iter->closed) {
                        if ((filp->f_flags & O_NONBLOCK))
                                return -EAGAIN;

                        ret = wait_on_pipe(iter, 0);
                        if (ret)
                                return ret;

                        goto again;
                }
                return 0;
        }

        /* A fresh page was read: start handing it out from the beginning */
        info->read = 0;
 read:
        size = page_size - info->read;
        if (size > count)
                size = count;
        trace_data = ring_buffer_read_page_data(info->spare);
        /* copy_to_user() returns the number of bytes it could NOT copy */
        ret = copy_to_user(ubuf, trace_data + info->read, size);
        if (ret == size)
                return -EFAULT;

        /* Report a partial copy as a short read */
        size -= ret;

        *ppos += size;
        info->read += size;

        return size;
}

/*
 * ->flush handler: mark this open file's iterator as closed, advance its
 * wait_index and wake any waiters on the file's ring buffer / cpu.
 * Always returns 0.
 */
static int tracing_buffers_flush(struct file *file, fl_owner_t id)
{
        struct ftrace_buffer_info *info = file->private_data;
        struct trace_iterator *iter = &info->iter;

        iter->closed = true;
        /* Make sure the waiters see the new wait_index */
        (void)atomic_fetch_inc_release(&iter->wait_index);

        ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);

        return 0;
}

/*
 * Release handler for the raw ring buffer file. Under trace_types_lock it
 * lowers tr->trace_ref, drops the trace array reference, frees the spare
 * read page if one is held, and frees the per-open info. Always returns 0.
 */
static int tracing_buffers_release(struct inode *inode, struct file *file)
{
        struct ftrace_buffer_info *info = file->private_data;
        struct trace_iterator *iter = &info->iter;

        mutex_lock(&trace_types_lock);

        iter->tr->trace_ref--;

        __trace_array_put(iter->tr);

        /* The spare page is freed against the cpu it was allocated for */
        if (info->spare)
                ring_buffer_free_read_page(iter->array_buffer->buffer,
                                           info->spare_cpu, info->spare);
        kvfree(info);

        mutex_unlock(&trace_types_lock);

        return 0;
}

/*
 * Reference-counted handle for one ring buffer read page that has been
 * handed to a pipe by the splice path. The page is returned to the ring
 * buffer when the last reference is dropped (see buffer_ref_release()).
 */
struct buffer_ref {
        struct trace_buffer        *buffer;        /* ring buffer the page belongs to */
        void                        *page;        /* read page from ring_buffer_alloc_read_page() */
        int                        cpu;                /* cpu the page was allocated for */
        refcount_t                refcount;        /* number of holders of this handle */
};

/*
 * Drop one reference on @ref. When the last reference goes away the read
 * page is given back to the ring buffer and the handle itself is freed.
 */
static void buffer_ref_release(struct buffer_ref *ref)
{
        if (refcount_dec_and_test(&ref->refcount)) {
                ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
                kfree(ref);
        }
}

/*
 * Pipe buffer ->release: drop the reference that @buf holds on its
 * buffer_ref and clear the private field so it no longer points at it.
 */
static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
                                    struct pipe_buffer *buf)
{
        buffer_ref_release((struct buffer_ref *)buf->private);
        buf->private = 0;
}

/*
 * Pipe buffer ->get: take an additional reference on the buffer_ref behind
 * @buf. Refuses, returning false, once the count exceeds INT_MAX/2.
 */
static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
                                struct pipe_buffer *buf)
{
        struct buffer_ref *ref = (struct buffer_ref *)buf->private;

        if (refcount_read(&ref->refcount) <= INT_MAX/2) {
                refcount_inc(&ref->refcount);
                return true;
        }

        return false;
}

/*
 * Pipe buffer operations for a buffer: both callbacks operate on the
 * buffer_ref stored in pipe_buffer->private.
 */
static const struct pipe_buf_operations buffer_pipe_buf_ops = {
        .release                = buffer_pipe_buf_release,
        .get                        = buffer_pipe_buf_get,
};

/*
 * Callback from splice_to_pipe(), if we need to release some pages
 * at the end of the spd in case we error'ed out in filling the pipe.
 */
static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
        struct buffer_ref *ref =
                (struct buffer_ref *)spd->partial[i].private;

        buffer_ref_release(ref);
        spd->partial[i].private = 0;
}

/*
 * ->splice_read handler: move whole ring buffer sub-buffers for this file's
 * cpu into @pipe without copying through user space.
 *
 * *ppos must be aligned to the sub-buffer size and @len must be at least
 * one sub-buffer; @len is rounded down to a multiple of the sub-buffer
 * size. Each sub-buffer read is wrapped in a buffer_ref and attached to the
 * splice descriptor.
 *
 * If nothing could be read, the call returns a pending error if there is
 * one, returns -EAGAIN for non-blocking callers, and otherwise waits once
 * and retries a single time.
 *
 * Returns the result of splice_to_pipe(), 0 when no data was collected
 * after waking up (or tracing is off), or a negative errno.
 */
static ssize_t
tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                            struct pipe_inode_info *pipe, size_t len,
                            unsigned int flags)
{
        struct ftrace_buffer_info *info = file->private_data;
        struct trace_iterator *iter = &info->iter;
        struct partial_page partial_def[PIPE_DEF_BUFFERS];
        struct page *pages_def[PIPE_DEF_BUFFERS];
        struct splice_pipe_desc spd = {
                .pages                = pages_def,
                .partial        = partial_def,
                .nr_pages_max        = PIPE_DEF_BUFFERS,
                .ops                = &buffer_pipe_buf_ops,
                .spd_release        = buffer_spd_release,
        };
        struct buffer_ref *ref;
        bool woken = false;
        int page_size;
        int entries, i;
        ssize_t ret = 0;

#ifdef CONFIG_TRACER_MAX_TRACE
        if (iter->snapshot && iter->tr->current_trace->use_max_tr)
                return -EBUSY;
#endif

        page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
        /* NOTE(review): the mask tests assume page_size is a power of two */
        if (*ppos & (page_size - 1))
                return -EINVAL;

        if (len & (page_size - 1)) {
                if (len < page_size)
                        return -EINVAL;
                /* Round down to a whole number of sub-buffers */
                len &= (~(page_size - 1));
        }

        if (splice_grow_spd(pipe, &spd))
                return -ENOMEM;

 again:
        trace_access_lock(iter->cpu_file);
        entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);

        /* One sub-buffer per iteration, while there is room, length and data */
        for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) {
                struct page *page;
                int r;

                ref = kzalloc(sizeof(*ref), GFP_KERNEL);
                if (!ref) {
                        ret = -ENOMEM;
                        break;
                }

                refcount_set(&ref->refcount, 1);
                ref->buffer = iter->array_buffer->buffer;
                ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
                if (IS_ERR(ref->page)) {
                        ret = PTR_ERR(ref->page);
                        ref->page = NULL;
                        kfree(ref);
                        break;
                }
                ref->cpu = iter->cpu_file;

                r = ring_buffer_read_page(ref->buffer, ref->page,
                                          len, iter->cpu_file, 1);
                if (r < 0) {
                        /* Nothing was read: undo this iteration, keep earlier pages */
                        ring_buffer_free_read_page(ref->buffer, ref->cpu,
                                                   ref->page);
                        kfree(ref);
                        break;
                }

                page = virt_to_page(ring_buffer_read_page_data(ref->page));

                spd.pages[i] = page;
                spd.partial[i].len = page_size;
                spd.partial[i].offset = 0;
                /* Ownership of ref moves to the splice descriptor */
                spd.partial[i].private = (unsigned long)ref;
                spd.nr_pages++;
                *ppos += page_size;

                entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
        }

        trace_access_unlock(iter->cpu_file);
        /* i counts the pages successfully filled in the loop above */
        spd.nr_pages = i;

        /* did we read anything? */
        if (!spd.nr_pages) {

                /* Report an allocation failure from the loop */
                if (ret)
                        goto out;

                /* Already waited once: return without data (ret is 0) */
                if (woken)
                        goto out;

                ret = -EAGAIN;
                if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
                        goto out;

                ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
                if (ret)
                        goto out;

                /* No need to wait after waking up when tracing is off */
                if (!tracer_tracing_is_on(iter->tr))
                        goto out;

                /* Iterate one more time to collect any new data then exit */
                woken = true;

                goto again;
        }

        ret = splice_to_pipe(pipe, &spd);
out:
        splice_shrink_spd(&spd);

        return ret;
}

static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct ftrace_buffer_info *info = file->private_data;
        struct trace_iterator *iter = &info->iter;
        int err;

        if (cmd == TRACE_MMAP_IOCTL_GET_READER) {
                if (!(file->f_flags & O_NONBLOCK)) {
                        err = ring_buffer_wait(iter->array_buffer->buffer,
                                               iter->cpu_file,
                                               iter->tr->buffer_percent,
                                               NULL, NULL);
                        if (err)
                                return err;
                }

                return ring_buffer_map_get_reader(iter->array_buffer->buffer,
                                                  iter->cpu_file);
        } else if (cmd) {
                return -ENOTTY;
        }

        /*
         * An ioctl call with cmd 0 to the ring buffer file will wake up all
         * waiters
         */
        mutex_lock(&trace_types_lock);

        /* Make sure the waiters see the new wait_index */
        (void)atomic_fetch_inc_release(&iter->wait_index);

        ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);

        mutex_unlock(&trace_types_lock);
        return 0;
}

#ifdef CONFIG_TRACER_MAX_TRACE
/*
 * Account one more user-space mapping of @tr's buffer in tr->mapped.
 *
 * Returns 0 on success, or -EBUSY when tr->snapshot is set or the counter
 * has reached UINT_MAX.
 */
static int get_snapshot_map(struct trace_array *tr)
{
        int err = 0;

        /*
         * Called with mmap_lock held. lockdep would be unhappy if we would now
         * take trace_types_lock. Instead use the specific
         * snapshot_trigger_lock.
         */
        spin_lock(&tr->snapshot_trigger_lock);

        if (tr->snapshot || tr->mapped == UINT_MAX)
                err = -EBUSY;
        else
                tr->mapped++;

        spin_unlock(&tr->snapshot_trigger_lock);

        /* Wait for update_max_tr() to observe iter->tr->mapped */
        /* NOTE(review): tr->mapped is re-read here outside the spinlock */
        if (tr->mapped == 1)
                synchronize_rcu();

        return err;

}
/*
 * Undo one get_snapshot_map(): decrement tr->mapped under
 * snapshot_trigger_lock. Warns, and leaves the counter alone, if it is
 * already zero.
 */
static void put_snapshot_map(struct trace_array *tr)
{
        spin_lock(&tr->snapshot_trigger_lock);
        if (!WARN_ON(!tr->mapped))
                tr->mapped--;
        spin_unlock(&tr->snapshot_trigger_lock);
}
#else
/* Without CONFIG_TRACER_MAX_TRACE there is nothing to account: no-op stubs. */
static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
static inline void put_snapshot_map(struct trace_array *tr) { }
#endif

/*
 * VMA ->close: unmap the ring buffer for this file's cpu (warning if that
 * reports an error) and drop the mapping count taken in
 * tracing_buffers_mmap().
 */
static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
{
        struct ftrace_buffer_info *info = vma->vm_file->private_data;
        struct trace_iterator *iter = &info->iter;
        int err;

        err = ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file);
        WARN_ON(err);

        put_snapshot_map(iter->tr);
}

/* VMA operations installed by tracing_buffers_mmap(); only ->close is set. */
static const struct vm_operations_struct tracing_buffers_vmops = {
        .close                = tracing_buffers_mmap_close,
};

/*
 * ->mmap handler: account the mapping with get_snapshot_map(), then map
 * the ring buffer of this file's cpu into @vma. If the mapping fails the
 * accounting is undone. Returns 0 on success or a negative errno.
 */
static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
{
        struct ftrace_buffer_info *info = filp->private_data;
        struct trace_iterator *iter = &info->iter;
        int err;

        err = get_snapshot_map(iter->tr);
        if (err)
                return err;

        err = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma);
        if (err)
                put_snapshot_map(iter->tr);

        /* Installed whether or not ring_buffer_map() succeeded */
        vma->vm_ops = &tracing_buffers_vmops;

        return err;
}

/*
 * File operations for the raw ring buffer file (registered as
 * "trace_pipe_raw" in tracing_init_tracefs_percpu()). The file is not
 * seekable.
 */
static const struct file_operations tracing_buffers_fops = {
        .open                = tracing_buffers_open,
        .read                = tracing_buffers_read,
        .poll                = tracing_buffers_poll,
        .release        = tracing_buffers_release,
        .flush                = tracing_buffers_flush,
        .splice_read        = tracing_buffers_splice_read,
        .unlocked_ioctl = tracing_buffers_ioctl,
        .llseek                = no_llseek,
        .mmap                = tracing_buffers_mmap,
};

/*
 * Read handler for the per-cpu "stats" file.
 *
 * Formats the ring buffer counters of the file's cpu (entries, overruns,
 * bytes, oldest and current timestamps, dropped and read events) into a
 * temporary trace_seq and copies the requested window to user space.
 *
 * Returns the number of bytes copied, or a negative errno (-ENOMEM when the
 * temporary buffer cannot be allocated).
 */
static ssize_t
tracing_stats_read(struct file *filp, char __user *ubuf,
                   size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(filp);
        struct trace_array *tr = inode->i_private;
        struct array_buffer *abuf = &tr->array_buffer;
        int cpu = tracing_get_cpu(inode);
        unsigned long long ts;
        unsigned long rem;
        unsigned long val;
        struct trace_seq *seq;
        ssize_t ret;

        seq = kmalloc(sizeof(*seq), GFP_KERNEL);
        if (!seq)
                return -ENOMEM;

        trace_seq_init(seq);

        val = ring_buffer_entries_cpu(abuf->buffer, cpu);
        trace_seq_printf(seq, "entries: %ld\n", val);

        val = ring_buffer_overrun_cpu(abuf->buffer, cpu);
        trace_seq_printf(seq, "overrun: %ld\n", val);

        val = ring_buffer_commit_overrun_cpu(abuf->buffer, cpu);
        trace_seq_printf(seq, "commit overrun: %ld\n", val);

        val = ring_buffer_bytes_cpu(abuf->buffer, cpu);
        trace_seq_printf(seq, "bytes: %ld\n", val);

        if (!trace_clocks[tr->clock_id].in_ns) {
                /* counter or tsc mode for trace_clock */
                trace_seq_printf(seq, "oldest event ts: %llu\n",
                                 ring_buffer_oldest_event_ts(abuf->buffer, cpu));

                trace_seq_printf(seq, "now ts: %llu\n",
                                 ring_buffer_time_stamp(abuf->buffer));
        } else {
                /* local or global for trace_clock: print as seconds.usecs */
                ts = ns2usecs(ring_buffer_oldest_event_ts(abuf->buffer, cpu));
                rem = do_div(ts, USEC_PER_SEC);
                trace_seq_printf(seq, "oldest event ts: %5llu.%06lu\n",
                                 ts, rem);

                ts = ns2usecs(ring_buffer_time_stamp(abuf->buffer));
                rem = do_div(ts, USEC_PER_SEC);
                trace_seq_printf(seq, "now ts: %5llu.%06lu\n", ts, rem);
        }

        val = ring_buffer_dropped_events_cpu(abuf->buffer, cpu);
        trace_seq_printf(seq, "dropped events: %ld\n", val);

        val = ring_buffer_read_events_cpu(abuf->buffer, cpu);
        trace_seq_printf(seq, "read events: %ld\n", val);

        ret = simple_read_from_buffer(ubuf, count, ppos,
                                      seq->buffer, trace_seq_used(seq));

        kfree(seq);

        return ret;
}

/* File operations for the read-only per-cpu "stats" file. */
static const struct file_operations tracing_stats_fops = {
        .open                = tracing_open_generic_tr,
        .read                = tracing_stats_read,
        .llseek                = generic_file_llseek,
        .release        = tracing_release_generic_tr,
};

#ifdef CONFIG_DYNAMIC_FTRACE

/*
 * Read handler reporting the dynamic ftrace totals: ftrace_update_tot_cnt,
 * ftrace_number_of_pages and ftrace_number_of_groups, formatted on a
 * single line.
 *
 * Returns the number of bytes copied to @ubuf, or a negative errno
 * (-ENOMEM when the scratch buffer cannot be allocated).
 */
static ssize_t
tracing_read_dyn_info(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
{
        /* 256 should be plenty to hold the amount needed */
        const size_t buf_len = 256;
        ssize_t ret;
        char *buf;
        int len;

        buf = kmalloc(buf_len, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        len = scnprintf(buf, buf_len, "%ld pages:%ld groups: %ld\n",
                        ftrace_update_tot_cnt,
                        ftrace_number_of_pages,
                        ftrace_number_of_groups);

        ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, len);

        kfree(buf);

        return ret;
}

/* File operations for the read-only dynamic ftrace info file. */
static const struct file_operations tracing_dyn_info_fops = {
        .open                = tracing_open_generic,
        .read                = tracing_read_dyn_info,
        .llseek                = generic_file_llseek,
};
#endif /* CONFIG_DYNAMIC_FTRACE */

#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
/*
 * Function probe callback for the uncounted "snapshot" command: calls
 * tracing_snapshot_instance() for @tr on every invocation. The remaining
 * arguments are unused.
 */
static void
ftrace_snapshot(unsigned long ip, unsigned long parent_ip,
                struct trace_array *tr, struct ftrace_probe_ops *ops,
                void *data)
{
        tracing_snapshot_instance(tr);
}

/*
 * Function probe callback for the counted "snapshot" command.
 *
 * If the mapper in @data holds a counter for @ip, the snapshot is only
 * taken while that counter is positive, and the counter is decremented for
 * each snapshot. Without a counter the snapshot is always taken.
 */
static void
ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip,
                      struct trace_array *tr, struct ftrace_probe_ops *ops,
                      void *data)
{
        struct ftrace_func_mapper *mapper = data;
        long *remaining;

        remaining = mapper ? (long *)ftrace_func_mapper_find_ip(mapper, ip) : NULL;

        if (remaining) {
                /* Budget for this ip is used up */
                if (*remaining <= 0)
                        return;

                --*remaining;
        }

        tracing_snapshot_instance(tr);
}

/*
 * Probe ->print callback: emit "<symbol>:snapshot" for @ip followed by
 * either ":count=<n>" when the mapper in @data holds a counter for @ip, or
 * ":unlimited" otherwise. Always returns 0.
 */
static int
ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
                      struct ftrace_probe_ops *ops, void *data)
{
        struct ftrace_func_mapper *mapper = data;
        long *remaining = NULL;

        seq_printf(m, "%ps:", (void *)ip);
        seq_puts(m, "snapshot");

        if (mapper)
                remaining = (long *)ftrace_func_mapper_find_ip(mapper, ip);

        if (!remaining) {
                seq_puts(m, ":unlimited\n");
                return 0;
        }

        seq_printf(m, ":count=%ld\n", *remaining);

        return 0;
}

/*
 * Probe ->init callback: make sure *data holds a function mapper
 * (allocating one on first use) and record @init_data for @ip in it.
 *
 * Returns the result of ftrace_func_mapper_add_ip(), or -ENOMEM when the
 * mapper cannot be allocated.
 */
static int
ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr,
                     unsigned long ip, void *init_data, void **data)
{
        struct ftrace_func_mapper *map = *data;

        if (map)
                return ftrace_func_mapper_add_ip(map, ip, init_data);

        map = allocate_ftrace_func_mapper();
        if (!map)
                return -ENOMEM;

        /* Publish the new mapper so later calls reuse it */
        *data = map;

        return ftrace_func_mapper_add_ip(map, ip, init_data);
}

/*
 * Probe ->free callback. With a non-zero @ip only that ip's entry is
 * removed from the mapper in @data; with @ip == 0 the whole mapper is
 * freed, if there is one.
 */
static void
ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr,
                     unsigned long ip, void *data)
{
        struct ftrace_func_mapper *map = data;

        if (ip) {
                ftrace_func_mapper_remove_ip(map, ip);
                return;
        }

        if (map)
                free_ftrace_func_mapper(map, NULL);
}

/* Probe ops used when the "snapshot" command is given without a parameter. */
static struct ftrace_probe_ops snapshot_probe_ops = {
        .func                        = ftrace_snapshot,
        .print                        = ftrace_snapshot_print,
};

/*
 * Probe ops used when the "snapshot" command is given with a parameter;
 * ->init and ->free manage the per-ip counter mapper.
 */
static struct ftrace_probe_ops snapshot_count_probe_ops = {
        .func                        = ftrace_count_snapshot,
        .print                        = ftrace_snapshot_print,
        .init                        = ftrace_snapshot_init,
        .free                        = ftrace_snapshot_free,
};

/*
 * Handler for the "snapshot" ftrace function command.
 *
 * A @glob starting with '!' unregisters the probe (and disarms the
 * snapshot on success). Otherwise the snapshot is armed and a probe is
 * registered on @glob; when @param starts with a number, that number is
 * passed to the probe as its count, else the count is (void *)-1.
 *
 * Returns 0 on success or a negative errno: -ENODEV without a trace array,
 * -EINVAL when @enable is not set, or the error from parsing, arming or
 * registering.
 */
static int
ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
                               char *glob, char *cmd, char *param, int enable)
{
        struct ftrace_probe_ops *ops;
        void *count = (void *)-1;
        char *number;
        int ret;

        if (!tr)
                return -ENODEV;

        /* hash funcs only work with set_ftrace_filter */
        if (!enable)
                return -EINVAL;

        /* Must be chosen before strsep() below modifies param */
        if (param)
                ops = &snapshot_count_probe_ops;
        else
                ops = &snapshot_probe_ops;

        if (glob[0] == '!') {
                ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
                if (!ret)
                        tracing_disarm_snapshot(tr);

                return ret;
        }

        if (param) {
                number = strsep(&param, ":");

                if (strlen(number)) {
                        /*
                         * We use the callback data field (which is a pointer)
                         * as our counter.
                         */
                        ret = kstrtoul(number, 0, (unsigned long *)&count);
                        if (ret)
                                return ret;
                }
        }

        ret = tracing_arm_snapshot(tr);
        if (ret < 0)
                return ret;

        ret = register_ftrace_function_probe(glob, tr, ops, count);
        if (ret < 0) {
                tracing_disarm_snapshot(tr);
                return ret;
        }

        /* Non-negative results are all reported as plain success */
        return 0;
}

/* Binds the "snapshot" command name to its handler. */
static struct ftrace_func_command ftrace_snapshot_cmd = {
        .name                        = "snapshot",
        .func                        = ftrace_trace_snapshot_callback,
};

/* Register the "snapshot" ftrace command; returns register_ftrace_command()'s result. */
static __init int register_snapshot_cmd(void)
{
        return register_ftrace_command(&ftrace_snapshot_cmd);
}
#else
/* Stub used unless both CONFIG_TRACER_SNAPSHOT and CONFIG_DYNAMIC_FTRACE are set. */
static inline __init int register_snapshot_cmd(void) { return 0; }
#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */

/*
 * Return the dentry to use as parent for files of @tr.
 *
 * Returns ERR_PTR(-ENODEV) (with a warning) when @tr has no directory,
 * NULL for the global trace array, and tr->dir for all others.
 */
static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
        if (WARN_ON(!tr->dir))
                return ERR_PTR(-ENODEV);

        /*
         * Top directory uses NULL as the parent;
         * all sub buffers have a descriptor.
         */
        return (tr->flags & TRACE_ARRAY_FL_GLOBAL) ? NULL : tr->dir;
}

/*
 * Return the "per_cpu" directory of @tr, creating it on first use.
 * @cpu is only used in the failure message. Returns NULL when the parent
 * dentry is unavailable or the directory could not be created.
 */
static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
{
        struct dentry *parent;

        if (!tr->percpu_dir) {
                parent = tracing_get_dentry(tr);
                if (IS_ERR(parent))
                        return NULL;

                tr->percpu_dir = tracefs_create_dir("per_cpu", parent);

                MEM_FAIL(!tr->percpu_dir,
                          "Could not create tracefs directory 'per_cpu/%d'\n", cpu);
        }

        return tr->percpu_dir;
}

/*
 * Create a tracefs file like trace_create_file() and additionally store
 * @cpu + 1 in the new inode's i_cdev field (see tracing_get_cpu()).
 * Returns the new dentry, or NULL on failure.
 */
static struct dentry *
trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
                      void *data, long cpu, const struct file_operations *fops)
{
        struct dentry *dentry;

        dentry = trace_create_file(name, mode, parent, data, fops);
        if (!dentry)
                return dentry;

        /* See tracing_get_cpu() */
        d_inode(dentry)->i_cdev = (void *)(cpu + 1);

        return dentry;
}

/*
 * Create the "per_cpu/cpu<N>" directory for @cpu under @tr and populate it
 * with the per-cpu tracing files. Returns silently if the per_cpu
 * directory is unavailable; warns if the cpu directory cannot be created.
 */
static void
tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
{
        char name[30]; /* 30 characters should be more than enough */
        struct dentry *percpu_root;
        struct dentry *cpu_root;

        percpu_root = tracing_dentry_percpu(tr, cpu);
        if (!percpu_root)
                return;

        snprintf(name, sizeof(name), "cpu%ld", cpu);

        cpu_root = tracefs_create_dir(name, percpu_root);
        if (!cpu_root) {
                pr_warn("Could not create tracefs '%s' entry\n", name);
                return;
        }

        /* per cpu trace_pipe */
        trace_create_cpu_file("trace_pipe", TRACE_MODE_READ, cpu_root,
                              tr, cpu, &tracing_pipe_fops);

        /* per cpu trace */
        trace_create_cpu_file("trace", TRACE_MODE_WRITE, cpu_root,
                              tr, cpu, &tracing_fops);

        /* per cpu raw ring buffer access */
        trace_create_cpu_file("trace_pipe_raw", TRACE_MODE_READ, cpu_root,
                              tr, cpu, &tracing_buffers_fops);

        /* per cpu ring buffer statistics */
        trace_create_cpu_file("stats", TRACE_MODE_READ, cpu_root,
                              tr, cpu, &tracing_stats_fops);

        trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, cpu_root,
                              tr, cpu, &tracing_entries_fops);

#ifdef CONFIG_TRACER_SNAPSHOT
        trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, cpu_root,
                              tr, cpu, &snapshot_fops);

        trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, cpu_root,
                              tr, cpu, &snapshot_raw_fops);
#endif
}

#ifdef CONFIG_FTRACE_SELFTEST
/* Let selftest have access to static functions in this file */
#include "trace_selftest.c"
#endif

/*
 * Read handler for a tracer-specific option file: reports "1\n" when the
 * option's bit is set in the tracer flags and "0\n" otherwise.
 */
static ssize_t
trace_options_read(struct file *filp, char __user *ubuf, size_t cnt,
                        loff_t *ppos)
{
        struct trace_option_dentry *topt = filp->private_data;
        char *buf = (topt->flags->val & topt->opt->bit) ? "1\n" : "0\n";

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
}

/*
 * Write handler for a tracer-specific option file.
 *
 * Accepts "0" or "1". When the requested value differs from the option's
 * current state, __set_tracer_option() is called under trace_types_lock.
 *
 * Returns @cnt on success, -EINVAL for any other value, or the error from
 * parsing or from __set_tracer_option().
 */
static ssize_t
trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
                         loff_t *ppos)
{
        struct trace_option_dentry *topt = filp->private_data;
        unsigned long val;
        bool is_set;
        int ret;

        ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
        if (ret)
                return ret;

        /* Only 0 and 1 are valid */
        if (val > 1)
                return -EINVAL;

        is_set = !!(topt->flags->val & topt->opt->bit);
        if (is_set != val) {
                mutex_lock(&trace_types_lock);
                /* The last argument is the negation of the requested value */
                ret = __set_tracer_option(topt->tr, topt->flags,
                                          topt->opt, !val);
                mutex_unlock(&trace_types_lock);
                if (ret)
                        return ret;
        }

        *ppos += cnt;

        return cnt;
}

/*
 * Open handler for a tracer-specific option file: take a reference on the
 * option's trace array and stash the trace_option_dentry in
 * filp->private_data. Returns 0 or the error from
 * tracing_check_open_get_tr().
 */
static int tracing_open_options(struct inode *inode, struct file *filp)
{
        struct trace_option_dentry *topt = inode->i_private;
        int err = tracing_check_open_get_tr(topt->tr);

        if (err)
                return err;

        filp->private_data = topt;

        return 0;
}

/*
 * Release handler for a tracer-specific option file: drop the trace array
 * reference held by this open file. Always returns 0.
 */
static int tracing_release_options(struct inode *inode, struct file *file)
{
        struct trace_option_dentry *topt = file->private_data;

        trace_array_put(topt->tr);
        return 0;
}

/* File operations for tracer-specific option files (see create_trace_option_file()). */
static const struct file_operations trace_options_fops = {
        .open = tracing_open_options,
        .read = trace_options_read,
        .write = trace_options_write,
        .llseek        = generic_file_llseek,
        .release = tracing_release_options,
};

/*
 * A core trace option file needs two things: the trace_array it belongs to
 * and the index of the flag bit it represents. Both are carried by a single
 * pointer into the trace_array's trace_flags_index[] character array, where
 * every element holds its own index (index[0] == 0, index[1] == 1, etc).
 * The address of the matching element is what the option file read/write
 * callbacks receive.
 *
 * get_tr_index() recovers both values as follows.
 *
 * The index is simply the byte the pointer refers to:
 *
 *   idx = *ptr;
 *
 * Stepping back by that index reaches the first element of the array:
 *
 *   ptr - idx == &index[0]
 *
 * and a container_of() on that address yields the trace_array descriptor.
 */
static void get_tr_index(void *data, struct trace_array **ptr,
                         unsigned int *pindex)
{
        unsigned int idx = *(unsigned char *)data;

        *pindex = idx;
        *ptr = container_of(data - idx, struct trace_array,
                            trace_flags_index);
}

/*
 * Read handler for a core trace option file: reports "1\n" when the flag
 * bit selected by the file is set in tr->trace_flags and "0\n" otherwise.
 */
static ssize_t
trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
                        loff_t *ppos)
{
        struct trace_array *tr;
        unsigned int index;
        char *buf;

        /* private_data points into tr->trace_flags_index[] */
        get_tr_index(filp->private_data, &tr, &index);

        buf = (tr->trace_flags & (1 << index)) ? "1\n" : "0\n";

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
}

/*
 * Write handler for a core trace option file.
 *
 * Accepts "0" or "1" and applies it to the flag bit selected by the file
 * through set_tracer_flag(), with event_mutex and trace_types_lock held.
 *
 * Returns @cnt on success, -EINVAL for any other value, or the error from
 * parsing or from set_tracer_flag().
 */
static ssize_t
trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
                         loff_t *ppos)
{
        struct trace_array *tr;
        unsigned int index;
        unsigned long val;
        int ret;

        /* private_data points into tr->trace_flags_index[] */
        get_tr_index(filp->private_data, &tr, &index);

        ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
        if (ret)
                return ret;

        /* Only 0 and 1 are valid */
        if (val > 1)
                return -EINVAL;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        ret = set_tracer_flag(tr, 1 << index, val);

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);

        if (ret < 0)
                return ret;

        *ppos += cnt;

        return cnt;
}

/*
 * File operations for the per-flag files that
 * create_trace_option_core_file() places in the "options" directory.
 * The read/write handlers interpret filp->private_data as the address
 * of a trace_flags_index[] slot (see get_tr_index()).
 */
static const struct file_operations trace_options_core_fops = {
        .open = tracing_open_generic,
        .read = trace_options_core_read,
        .write = trace_options_core_write,
        .llseek = generic_file_llseek,
};

/*
 * Thin wrapper around tracefs_create_file() that logs a warning when the
 * entry could not be created. The result of tracefs_create_file() is
 * returned unchanged, so callers see NULL on failure.
 */
struct dentry *trace_create_file(const char *name,
                                 umode_t mode,
                                 struct dentry *parent,
                                 void *data,
                                 const struct file_operations *fops)
{
        struct dentry *entry = tracefs_create_file(name, mode, parent,
                                                   data, fops);

        if (entry)
                return entry;

        pr_warn("Could not create tracefs '%s' entry\n", name);
        return entry;
}


/*
 * Return the "options" directory of @tr, creating it (and caching it in
 * tr->options) on first use. Returns NULL if the parent dentry cannot be
 * obtained or the directory cannot be created.
 */
static struct dentry *trace_options_init_dentry(struct trace_array *tr)
{
        struct dentry *parent;

        if (!tr->options) {
                parent = tracing_get_dentry(tr);
                if (IS_ERR(parent))
                        return NULL;

                tr->options = tracefs_create_dir("options", parent);
                if (!tr->options)
                        pr_warn("Could not create tracefs directory 'options'\n");
        }

        /* Still NULL here if the creation above failed */
        return tr->options;
}

/*
 * Fill in @topt for one tracer-specific option and create its file in
 * the "options" directory of @tr. Does nothing if that directory is not
 * available; topt->entry receives whatever trace_create_file() returns.
 */
static void
create_trace_option_file(struct trace_array *tr,
                         struct trace_option_dentry *topt,
                         struct tracer_flags *flags,
                         struct tracer_opt *opt)
{
        struct dentry *dir = trace_options_init_dentry(tr);

        if (dir) {
                topt->tr = tr;
                topt->opt = opt;
                topt->flags = flags;

                topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE,
                                                dir, topt,
                                                &trace_options_fops);
        }
}

/*
 * Create one option file per entry in tracer->flags->opts for @tr and
 * remember the allocated trace_option_dentry array in tr->topts[].
 * Silently returns when the tracer has no options, is not allowed for
 * this array, was already registered, or an allocation fails.
 */
static void
create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
{
        struct trace_option_dentry *topts;
        struct trace_options *slots;
        struct tracer_flags *flags;
        struct tracer_opt *opts;
        int nr_opts = 0;
        int i;

        if (!tracer)
                return;

        flags = tracer->flags;
        if (!flags)
                return;

        opts = flags->opts;
        if (!opts)
                return;

        /*
         * If this is an instance, only create flags for tracers
         * the instance may have.
         */
        if (!trace_ok_for_array(tracer, tr))
                return;

        /* Make sure there's no duplicate flags. */
        for (i = 0; i < tr->nr_topts; i++) {
                if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags))
                        return;
        }

        /* The option table is terminated by an entry with a NULL name */
        while (opts[nr_opts].name)
                nr_opts++;

        topts = kcalloc(nr_opts + 1, sizeof(*topts), GFP_KERNEL);
        if (!topts)
                return;

        /* Grow the per-array table by one slot; the old table survives failure */
        slots = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1),
                         GFP_KERNEL);
        if (!slots) {
                kfree(topts);
                return;
        }

        slots[tr->nr_topts].tracer = tracer;
        slots[tr->nr_topts].topts = topts;
        tr->topts = slots;
        tr->nr_topts++;

        for (i = 0; i < nr_opts; i++) {
                create_trace_option_file(tr, &topts[i], flags, &opts[i]);
                MEM_FAIL(topts[i].entry == NULL,
                          "Failed to create trace option: %s",
                          opts[i].name);
        }
}

/*
 * Create the file for core trace option @option in the "options"
 * directory of @tr. The file's data pointer is the address of
 * tr->trace_flags_index[@index], which get_tr_index() later decodes.
 * Returns NULL if the directory is unavailable or creation fails.
 */
static struct dentry *
create_trace_option_core_file(struct trace_array *tr,
                              const char *option, long index)
{
        struct dentry *dir = trace_options_init_dentry(tr);
        void *slot;

        if (!dir)
                return NULL;

        slot = (void *)&tr->trace_flags_index[index];

        return trace_create_file(option, TRACE_MODE_WRITE, dir, slot,
                                 &trace_options_core_fops);
}

static void create_trace_options_dir(struct trace_array *tr)
{
        struct dentry *t_options;
        bool top_level = tr == &global_trace;
        int i;

        t_options = trace_options_init_dentry(tr);
        if (!t_options)
                return;

        for (i = 0; trace_options[i]; i++) {
                if (top_level ||
                    !((1 << i) & TOP_LEVEL_TRACE_FLAGS))
                        create_trace_option_core_file(tr, trace_options[i], i);
        }
}

/*
 * Read handler for "tracing_on": prints the result of
 * tracer_tracing_is_on() for the array as a decimal followed by a
 * newline.
 */
static ssize_t
rb_simple_read(struct file *filp, char __user *ubuf,
               size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        char buf[64];
        int len;

        len = sprintf(buf, "%d\n", tracer_tracing_is_on(tr));

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}

/*
 * Write handler for "tracing_on": a non-zero value turns tracing on,
 * zero turns it off. When the requested state differs from the current
 * one, the current tracer's start()/stop() callback (if any) is invoked
 * as well; turning tracing off also wakes ring buffer waiters.
 */
static ssize_t
rb_simple_write(struct file *filp, const char __user *ubuf,
                size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        struct trace_buffer *buffer = tr->array_buffer.buffer;
        unsigned long val;
        int ret;

        ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
        if (ret)
                return ret;

        /* Without a buffer there is nothing to switch */
        if (!buffer)
                goto done;

        mutex_lock(&trace_types_lock);
        if (!!val != tracer_tracing_is_on(tr)) {
                if (val) {
                        tracer_tracing_on(tr);
                        if (tr->current_trace->start)
                                tr->current_trace->start(tr);
                } else {
                        tracer_tracing_off(tr);
                        if (tr->current_trace->stop)
                                tr->current_trace->stop(tr);
                        /* Wake up any waiters */
                        ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS);
                }
        }
        mutex_unlock(&trace_types_lock);

 done:
        (*ppos)++;

        return cnt;
}

/*
 * File operations for the "tracing_on" file (registered in
 * init_tracer_tracefs()): read reports and write changes whether the
 * array's ring buffer is recording.
 */
static const struct file_operations rb_simple_fops = {
        .open                = tracing_open_generic_tr,
        .read                = rb_simple_read,
        .write                = rb_simple_write,
        .release        = tracing_release_generic_tr,
        .llseek                = default_llseek,
};

/*
 * Read handler for "buffer_percent": prints tr->buffer_percent as a
 * decimal followed by a newline.
 */
static ssize_t
buffer_percent_read(struct file *filp, char __user *ubuf,
                    size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        char buf[64];
        int percent = tr->buffer_percent;
        int len = sprintf(buf, "%d\n", percent);

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}

/*
 * Write handler for "buffer_percent": stores a decimal value in the
 * range 0..100 in tr->buffer_percent. Larger values yield -EINVAL.
 */
static ssize_t
buffer_percent_write(struct file *filp, const char __user *ubuf,
                     size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        unsigned long percent;
        int err;

        err = kstrtoul_from_user(ubuf, cnt, 10, &percent);
        if (err)
                return err;

        if (percent > 100)
                return -EINVAL;

        tr->buffer_percent = percent;
        ++*ppos;

        return cnt;
}

/*
 * File operations for the "buffer_percent" file (registered in
 * init_tracer_tracefs()); reads and writes tr->buffer_percent.
 */
static const struct file_operations buffer_percent_fops = {
        .open                = tracing_open_generic_tr,
        .read                = buffer_percent_read,
        .write                = buffer_percent_write,
        .release        = tracing_release_generic_tr,
        .llseek                = default_llseek,
};

/*
 * Read handler for "buffer_subbuf_size_kb": reports the ring buffer
 * sub-buffer size in KB, derived from the sub-buffer order as
 * (PAGE_SIZE << order) / 1024.
 */
static ssize_t
buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        char buf[64];
        size_t kb;
        int order;
        int len;

        order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);

        kb = PAGE_SIZE << order;
        kb /= 1024;

        len = sprintf(buf, "%zd\n", kb);

        return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}

/*
 * Write handler for "buffer_subbuf_size_kb": change the ring buffer
 * sub-buffer size.
 *
 * The user writes a decimal size in KB. It is rounded up to a power-of-two
 * number of system pages (the "order") and must lie between 1 and 128
 * pages. Tracing on @tr is stopped while the order is changed and
 * restarted before returning.
 *
 * Returns @cnt on success (including when the order is already the
 * requested one), -EINVAL for an out-of-range size, or the error reported
 * by kstrtoul_from_user() / ring_buffer_subbuf_order_set().
 */
static ssize_t
buffer_subbuf_size_write(struct file *filp, const char __user *ubuf,
                         size_t cnt, loff_t *ppos)
{
        struct trace_array *tr = filp->private_data;
        unsigned long val;
        int old_order;
        int order;
        int pages;
        int ret;

        ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
        if (ret)
                return ret;

        /*
         * Limit between 1 and 128 system pages. The check is done on the
         * KB value before scaling: "val * 1024" can wrap an unsigned long
         * and the page count below is narrowed to an int, so a huge
         * request could otherwise alias to a small, valid-looking order
         * and be silently accepted.
         */
        if (!val || val > (PAGE_SIZE << 7) / 1024)
                return -EINVAL;

        val *= 1024; /* value passed in is in KB */

        /* pages is now in 1..128, hence order is in 0..7 */
        pages = DIV_ROUND_UP(val, PAGE_SIZE);
        order = fls(pages - 1);

        /* Do not allow tracing while changing the order of the ring buffer */
        tracing_stop_tr(tr);

        old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
        if (old_order == order)
                goto out;

        ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order);
        if (ret)
                goto out;

#ifdef CONFIG_TRACER_MAX_TRACE

        if (!tr->allocated_snapshot)
                goto out_max;

        ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
        if (ret) {
                /* Put back the old order */
                cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
                if (WARN_ON_ONCE(cnt)) {
                        /*
                         * AARGH! We are left with different orders!
                         * The max buffer is our "snapshot" buffer.
                         * When a tracer needs a snapshot (one of the
                         * latency tracers), it swaps the max buffer
                         * with the saved snap shot. We succeeded to
                         * update the order of the main buffer, but failed to
                         * update the order of the max buffer. But when we tried
                         * to reset the main buffer to the original size, we
                         * failed there too. This is very unlikely to
                         * happen, but if it does, warn and kill all
                         * tracing.
                         */
                        tracing_disabled = 1;
                }
                /* ret is non-zero here, so "out" overwrites cnt with it */
                goto out;
        }
 out_max:
#endif
        (*ppos)++;
 out:
        if (ret)
                cnt = ret;
        tracing_start_tr(tr);
        return cnt;
}

/*
 * File operations for the "buffer_subbuf_size_kb" file (registered in
 * init_tracer_tracefs()); reads and changes the ring buffer sub-buffer
 * size, expressed in KB.
 */
static const struct file_operations buffer_subbuf_size_fops = {
        .open                = tracing_open_generic_tr,
        .read                = buffer_subbuf_size_read,
        .write                = buffer_subbuf_size_write,
        .release        = tracing_release_generic_tr,
        .llseek                = default_llseek,
};

/*
 * Dentry of the tracefs "instances" directory. It is set by
 * create_trace_instances(); while it is still NULL,
 * trace_array_create_systems() does not create a directory for new arrays.
 */
static struct dentry *trace_instance_dir;

/* Defined further down; declared here for trace_array_create_dir() */
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);

static int
allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size)
{
        enum ring_buffer_flags rb_flags;

        rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;

        buf->tr = tr;

        buf->buffer = ring_buffer_alloc(size, rb_flags);
        if (!buf->buffer)
                return -ENOMEM;

        buf->data = alloc_percpu(struct trace_array_cpu);
        if (!buf->data) {
                ring_buffer_free(buf->buffer);
                buf->buffer = NULL;
                return -ENOMEM;
        }

        /* Allocate the first page for all buffers */
        set_buffer_entries(&tr->array_buffer,
                           ring_buffer_size(tr->array_buffer.buffer, 0));

        return 0;
}

/*
 * Release the ring buffer and per-CPU data of @buf and clear both
 * pointers. A @buf without a ring buffer is left untouched.
 */
static void free_trace_buffer(struct array_buffer *buf)
{
        if (!buf->buffer)
                return;

        ring_buffer_free(buf->buffer);
        buf->buffer = NULL;

        free_percpu(buf->data);
        buf->data = NULL;
}

/*
 * Allocate the main buffer of @tr and, with CONFIG_TRACER_MAX_TRACE, its
 * max (snapshot) buffer. The max buffer gets the full @size only when
 * allocate_snapshot is set; that request is consumed (reset to false)
 * once it has been honoured.
 *
 * Returns 0 on success or a negative errno with nothing left allocated.
 */
static int allocate_trace_buffers(struct trace_array *tr, int size)
{
        int err = allocate_trace_buffer(tr, &tr->array_buffer, size);

        if (err)
                return err;

#ifdef CONFIG_TRACER_MAX_TRACE
        err = allocate_trace_buffer(tr, &tr->max_buffer,
                                    allocate_snapshot ? size : 1);
        if (MEM_FAIL(err, "Failed to allocate trace buffer\n")) {
                free_trace_buffer(&tr->array_buffer);
                return -ENOMEM;
        }

        tr->allocated_snapshot = allocate_snapshot;
        allocate_snapshot = false;
#endif

        return 0;
}

/*
 * Free the main buffer of @tr and, with CONFIG_TRACER_MAX_TRACE, its max
 * buffer as well. A NULL @tr is accepted and ignored.
 */
static void free_trace_buffers(struct trace_array *tr)
{
        if (tr) {
                free_trace_buffer(&tr->array_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
                free_trace_buffer(&tr->max_buffer);
#endif
        }
}

/*
 * Fill tr->trace_flags_index[] so that every slot holds its own index.
 * The trace option files rely on this to recover both the flag bit and
 * the owning trace_array from a single slot pointer.
 */
static void init_trace_flags_index(struct trace_array *tr)
{
        int slot = 0;

        while (slot < TRACE_FLAGS_MAX_SIZE) {
                tr->trace_flags_index[slot] = slot;
                slot++;
        }
}

/*
 * Call add_tracer_options() on @tr for every tracer on the trace_types
 * list.
 */
static void __update_tracer_options(struct trace_array *tr)
{
        struct tracer *tracer = trace_types;

        while (tracer) {
                add_tracer_options(tr, tracer);
                tracer = tracer->next;
        }
}

/*
 * Locked variant of __update_tracer_options(): takes trace_types_lock,
 * records in tracer_options_updated that the update has been run, and
 * adds the options of every registered tracer to @tr.
 */
static void update_tracer_options(struct trace_array *tr)
{
        mutex_lock(&trace_types_lock);
        tracer_options_updated = true;
        __update_tracer_options(tr);
        mutex_unlock(&trace_types_lock);
}

/*
 * Look up a trace array by name on ftrace_trace_arrays. Arrays without a
 * name are skipped. Returns the match or NULL; no reference is taken.
 *
 * Must have trace_types_lock held.
 */
struct trace_array *trace_array_find(const char *instance)
{
        struct trace_array *tr;

        list_for_each_entry(tr, &ftrace_trace_arrays, list) {
                if (!tr->name)
                        continue;
                if (!strcmp(tr->name, instance))
                        return tr;
        }

        return NULL;
}

/*
 * Look up a trace array by name and, if found, take a reference on it
 * (tr->ref++). The lookup and the increment both happen under
 * trace_types_lock. Returns the array or NULL when no array has that
 * name.
 */
struct trace_array *trace_array_find_get(const char *instance)
{
        struct trace_array *tr;

        mutex_lock(&trace_types_lock);
        tr = trace_array_find(instance);
        if (tr)
                tr->ref++;
        mutex_unlock(&trace_types_lock);

        return tr;
}

/*
 * Create the tracefs directory of instance @tr under trace_instance_dir
 * and populate it: event files first, then the generic tracer files and
 * the tracer option files.
 *
 * Returns 0 on success, -EINVAL if the directory cannot be created, or
 * the error from event_trace_add_tracer() (the directory is removed
 * again in that case).
 */
static int trace_array_create_dir(struct trace_array *tr)
{
        int err;

        tr->dir = tracefs_create_dir(tr->name, trace_instance_dir);
        if (!tr->dir)
                return -EINVAL;

        err = event_trace_add_tracer(tr->dir, tr);
        if (!err) {
                init_tracer_tracefs(tr, tr->dir);
                __update_tracer_options(tr);
                return 0;
        }

        tracefs_remove(tr->dir);
        return err;
}

/*
 * Allocate and set up a new trace array instance called @name.
 *
 * @systems, when non-NULL, is duplicated into tr->system_names.
 *
 * On success the array is linked onto ftrace_trace_arrays with its
 * reference count incremented once, and a pointer to it is returned.
 * On failure everything allocated so far is released and an ERR_PTR()
 * is returned (-ENOMEM unless trace_array_create_dir() reported a
 * different error).
 *
 * The callers visible in this file hold both event_mutex and
 * trace_types_lock around this call.
 */
static struct trace_array *
trace_array_create_systems(const char *name, const char *systems)
{
        struct trace_array *tr;
        int ret;

        /* Every failure before trace_array_create_dir() reports -ENOMEM */
        ret = -ENOMEM;
        tr = kzalloc(sizeof(*tr), GFP_KERNEL);
        if (!tr)
                return ERR_PTR(ret);

        tr->name = kstrdup(name, GFP_KERNEL);
        if (!tr->name)
                goto out_free_tr;

        if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
                goto out_free_tr;

        if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
                goto out_free_tr;

        if (systems) {
                tr->system_names = kstrdup_const(systems, GFP_KERNEL);
                if (!tr->system_names)
                        goto out_free_tr;
        }

        /* Start from the global flags, minus those in ZEROED_TRACE_FLAGS */
        tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;

        cpumask_copy(tr->tracing_cpumask, cpu_all_mask);

        raw_spin_lock_init(&tr->start_lock);

        tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
#ifdef CONFIG_TRACER_MAX_TRACE
        spin_lock_init(&tr->snapshot_trigger_lock);
#endif
        tr->current_trace = &nop_trace;

        INIT_LIST_HEAD(&tr->systems);
        INIT_LIST_HEAD(&tr->events);
        INIT_LIST_HEAD(&tr->hist_vars);
        INIT_LIST_HEAD(&tr->err_log);

        if (allocate_trace_buffers(tr, trace_buf_size) < 0)
                goto out_free_tr;

        /* The ring buffer is expanded by default */
        trace_set_ring_buffer_expanded(tr);

        if (ftrace_allocate_ftrace_ops(tr) < 0)
                goto out_free_tr;

        ftrace_init_trace_array(tr);

        init_trace_flags_index(tr);

        /*
         * If the "instances" directory does not exist yet, only add the
         * events for now; create_trace_instances() creates the directory
         * of every named array once it has set trace_instance_dir.
         */
        if (trace_instance_dir) {
                ret = trace_array_create_dir(tr);
                if (ret)
                        goto out_free_tr;
        } else
                __trace_early_add_events(tr);

        list_add(&tr->list, &ftrace_trace_arrays);

        /* Reference held by the list; __remove_instance() expects ref == 1 */
        tr->ref++;

        return tr;

        /*
         * Single unwind label for every failure point: tr was zeroed by
         * kzalloc(), so members not set up yet are still NULL/zero here.
         */
 out_free_tr:
        ftrace_free_ftrace_ops(tr);
        free_trace_buffers(tr);
        free_cpumask_var(tr->pipe_cpumask);
        free_cpumask_var(tr->tracing_cpumask);
        kfree_const(tr->system_names);
        kfree(tr->name);
        kfree(tr);

        return ERR_PTR(ret);
}

/*
 * Create a trace array instance called @name without a systems list
 * (NULL is passed for it). Returns the new array or an ERR_PTR(), as
 * trace_array_create_systems() does.
 */
static struct trace_array *trace_array_create(const char *name)
{
        return trace_array_create_systems(name, NULL);
}

/*
 * mkdir callback of the "instances" directory: create a trace array
 * called @name. Returns 0 on success, -EEXIST if an array with that name
 * already exists, or the error from trace_array_create().
 */
static int instance_mkdir(const char *name)
{
        int ret;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        if (trace_array_find(name))
                ret = -EEXIST;
        else
                ret = PTR_ERR_OR_ZERO(trace_array_create(name));

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);

        return ret;
}

/**
 * trace_array_get_by_name - Create/Lookup a trace array, given its name.
 * @name: The name of the trace array to be looked up/created.
 * @systems: A list of systems to create event directories for (NULL for all)
 *
 * If a trace array called @name already exists it is reused, otherwise a
 * new one is created.
 *
 * Returns pointer to trace array with given name.
 * NULL, if it cannot be created.
 *
 * NOTE: The reference counter of the returned trace array is incremented,
 * which keeps it from being freed while in use. Call trace_array_put()
 * once the trace array is no longer needed. To free the trace_array,
 * call trace_array_destroy() after the trace_array_put(), or let user
 * space delete it from the tracefs instances directory. User space can
 * not delete it before trace_array_put() has been called.
 *
 */
struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
{
        struct trace_array *tr = NULL;
        struct trace_array *iter;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        list_for_each_entry(iter, &ftrace_trace_arrays, list) {
                if (iter->name && strcmp(iter->name, name) == 0) {
                        tr = iter;
                        break;
                }
        }

        /* Not found: create it, mapping any error to NULL */
        if (!tr) {
                tr = trace_array_create_systems(name, systems);
                if (IS_ERR(tr))
                        tr = NULL;
        }

        if (tr)
                tr->ref++;

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);
        return tr;
}
EXPORT_SYMBOL_GPL(trace_array_get_by_name);

/*
 * Tear down and free the trace array @tr.
 *
 * Refuses with -EBUSY while anyone besides the list holds a reference
 * (tr->ref > 1) or while tr->trace_ref is non-zero with a current tracer
 * set. Otherwise the array is unlinked, its flags and tracer are reset,
 * its tracefs directory and buffers are released and @tr itself is
 * freed; @tr must not be used after a 0 return.
 *
 * The callers visible in this file hold event_mutex and trace_types_lock.
 */
static int __remove_instance(struct trace_array *tr)
{
        int i;

        /* Reference counter for a newly created trace array = 1. */
        if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
                return -EBUSY;

        list_del(&tr->list);

        /* Disable all the flags that were enabled coming in */
        for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
                if ((1 << i) & ZEROED_TRACE_FLAGS)
                        set_tracer_flag(tr, 1 << i, 0);
        }

        tracing_set_nop(tr);
        clear_ftrace_function_probes(tr);
        event_trace_del_tracer(tr);
        ftrace_clear_pids(tr);
        ftrace_destroy_function_files(tr);
        tracefs_remove(tr->dir);
        free_percpu(tr->last_func_repeats);
        free_trace_buffers(tr);
        clear_tracing_err_log(tr);

        /* Free each tracer's option array, then the table holding them */
        for (i = 0; i < tr->nr_topts; i++) {
                kfree(tr->topts[i].topts);
        }
        kfree(tr->topts);

        free_cpumask_var(tr->pipe_cpumask);
        free_cpumask_var(tr->tracing_cpumask);
        kfree_const(tr->system_names);
        kfree(tr->name);
        kfree(tr);

        return 0;
}

/*
 * Destroy the trace array @this_tr if it is still on
 * ftrace_trace_arrays.
 *
 * Returns -EINVAL for a NULL pointer, -ENODEV if the array is not on the
 * list, otherwise the result of __remove_instance().
 */
int trace_array_destroy(struct trace_array *this_tr)
{
        struct trace_array *tr;
        int ret = -ENODEV;

        if (!this_tr)
                return -EINVAL;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        /* Making sure trace array exists before destroying it. */
        list_for_each_entry(tr, &ftrace_trace_arrays, list) {
                if (tr != this_tr)
                        continue;

                ret = __remove_instance(tr);
                break;
        }

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);

        return ret;
}
EXPORT_SYMBOL_GPL(trace_array_destroy);

/*
 * rmdir callback of the "instances" directory: remove the trace array
 * called @name. Returns -ENODEV if there is no such array, otherwise the
 * result of __remove_instance().
 */
static int instance_rmdir(const char *name)
{
        struct trace_array *tr;
        int ret;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        tr = trace_array_find(name);
        ret = tr ? __remove_instance(tr) : -ENODEV;

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);

        return ret;
}

/*
 * Create the "instances" directory under @d_tracer, with
 * instance_mkdir()/instance_rmdir() as its mkdir/rmdir callbacks, and
 * then create the directory of every named trace array already on
 * ftrace_trace_arrays. Such arrays had no directory created in
 * trace_array_create_systems() because trace_instance_dir was still
 * NULL at that point.
 */
static __init void create_trace_instances(struct dentry *d_tracer)
{
        struct trace_array *tr;

        trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer,
                                                         instance_mkdir,
                                                         instance_rmdir);
        if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n"))
                return;

        mutex_lock(&event_mutex);
        mutex_lock(&trace_types_lock);

        list_for_each_entry(tr, &ftrace_trace_arrays, list) {
                /* Arrays without a name get no instance directory */
                if (!tr->name)
                        continue;
                /* Give up on the remaining arrays after the first failure */
                if (MEM_FAIL(trace_array_create_dir(tr) < 0,
                             "Failed to create instance directory\n"))
                        break;
        }

        mutex_unlock(&trace_types_lock);
        mutex_unlock(&event_mutex);
}

/*
 * Create the per-trace-array control files for @tr inside @d_tracer.
 *
 * Called with tr->dir for instances (trace_array_create_dir()) and with a
 * NULL @d_tracer for global_trace (tracer_init_tracefs_work_func()).
 * Besides creating files it also sets tr->trace_marker_file and
 * initialises tr->buffer_percent to 50. The return values of the
 * individual trace_create_file() calls are not checked here.
 */
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
{
        int cpu;

        trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
                        tr, &show_traces_fops);

        trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer,
                        tr, &set_tracer_fops);

        trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer,
                          tr, &tracing_cpumask_fops);

        trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer,
                          tr, &tracing_iter_fops);

        trace_create_file("trace", TRACE_MODE_WRITE, d_tracer,
                          tr, &tracing_fops);

        trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer,
                          tr, &tracing_pipe_fops);

        trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer,
                          tr, &tracing_entries_fops);

        trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer,
                          tr, &tracing_total_entries_fops);

        /* Mode 0200: write-only for the owner */
        trace_create_file("free_buffer", 0200, d_tracer,
                          tr, &tracing_free_buffer_fops);

        /* Mode 0220: write-only for owner and group */
        trace_create_file("trace_marker", 0220, d_tracer,
                          tr, &tracing_mark_fops);

        /* Look up the "ftrace:print" event file of this array */
        tr->trace_marker_file = __find_event_file(tr, "ftrace", "print");

        trace_create_file("trace_marker_raw", 0220, d_tracer,
                          tr, &tracing_mark_raw_fops);

        trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr,
                          &trace_clock_fops);

        trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
                          tr, &rb_simple_fops);

        trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
                          &trace_time_stamp_mode_fops);

        /* Default value reported by the "buffer_percent" file */
        tr->buffer_percent = 50;

        trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
                        tr, &buffer_percent_fops);

        trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
                          tr, &buffer_subbuf_size_fops);

        create_trace_options_dir(tr);

#ifdef CONFIG_TRACER_MAX_TRACE
        trace_create_maxlat_file(tr, d_tracer);
#endif

        if (ftrace_create_function_files(tr, d_tracer))
                MEM_FAIL(1, "Could not allocate function filter files");

#ifdef CONFIG_TRACER_SNAPSHOT
        trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
                          tr, &snapshot_fops);
#endif

        trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
                          tr, &tracing_err_log_fops);

        for_each_tracing_cpu(cpu)
                tracing_init_tracefs_percpu(tr, cpu);

        ftrace_init_tracefs(tr, d_tracer);
}

/*
 * Automount callback for the debugfs "tracing" directory.
 *
 * To maintain backward compatibility for tools that mount debugfs to get
 * to the tracing facility, tracefs is automatically mounted to the
 * debugfs/tracing directory. Returns the new mount with an extra
 * reference taken via mntget(), or NULL when tracefs is not available or
 * the submount fails.
 */
static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
        struct file_system_type *fstype = get_fs_type("tracefs");
        struct vfsmount *mnt;

        if (!fstype)
                return NULL;

        mnt = vfs_submount(mntpt, fstype, "tracefs", NULL);
        /* The filesystem type reference is dropped whether or not it worked */
        put_filesystem(fstype);

        if (IS_ERR(mnt))
                return NULL;

        mntget(mnt);
        return mnt;
}

/**
 * tracing_init_dentry - initialize top level trace array
 *
 * This is called before files or directories are created in the tracing
 * directory; in this file it is reached from tracer_init_tracefs(), which
 * is registered with fs_initcall(). On first use it sets up the
 * debugfs "tracing" automount point and stores it in global_trace.dir;
 * later calls find tr->dir already set and return early.
 *
 * Returns 0 on success (including when already initialized), -EPERM if
 * tracefs is locked down, or -ENODEV if tracefs is not initialized yet.
 * The result of debugfs_create_automount() is not checked.
 */
int tracing_init_dentry(void)
{
        struct trace_array *tr = &global_trace;

        if (security_locked_down(LOCKDOWN_TRACEFS)) {
                pr_warn("Tracing disabled due to lockdown\n");
                return -EPERM;
        }

        /* The top level trace array uses  NULL as parent */
        if (tr->dir)
                return 0;

        if (WARN_ON(!tracefs_initialized()))
                return -ENODEV;

        /*
         * As there may still be users that expect the tracing
         * files to exist in debugfs/tracing, we must automount
         * the tracefs file system there, so older tools still
         * work with the newer kernel.
         */
        tr->dir = debugfs_create_automount("tracing", NULL,
                                           trace_automount, NULL);

        return 0;
}

/*
 * Bounds of the built-in array of trace_eval_map pointers; the number of
 * entries is the difference between the two (see eval_map_work_func()).
 */
extern struct trace_eval_map *__start_ftrace_eval_maps[];
extern struct trace_eval_map *__stop_ftrace_eval_maps[];

/*
 * Boot-time workqueue and work items: eval_map_work inserts the built-in
 * eval maps, tracerfs_init_work populates tracefs. All three are
 * __initdata, and the workqueue is destroyed in trace_eval_sync().
 */
static struct workqueue_struct *eval_map_wq __initdata;
static struct work_struct eval_map_work __initdata;
static struct work_struct tracerfs_init_work __initdata;

/*
 * Work function that registers the built-in eval maps (the entries
 * between __start_ftrace_eval_maps and __stop_ftrace_eval_maps).
 * @work is not used.
 */
static void __init eval_map_work_func(struct work_struct *work)
{
        int nr_maps = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;

        /* A NULL module means the maps are not owned by a module */
        trace_insert_eval_map(NULL, __start_ftrace_eval_maps, nr_maps);
}

/*
 * subsys initcall: allocate eval_map_wq and queue the eval map insertion
 * on it. If the workqueue cannot be allocated the insertion is done
 * synchronously instead and -ENOMEM is returned.
 */
static int __init trace_eval_init(void)
{
        INIT_WORK(&eval_map_work, eval_map_work_func);

        eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0);
        if (eval_map_wq) {
                queue_work(eval_map_wq, &eval_map_work);
                return 0;
        }

        pr_err("Unable to allocate eval_map_wq\n");
        /* Do work here */
        eval_map_work_func(&eval_map_work);
        return -ENOMEM;
}

subsys_initcall(trace_eval_init);

/*
 * late_initcall_sync: destroy eval_map_wq, if it was allocated, so that
 * the work queued on it (eval_map_work and, via tracer_init_tracefs(),
 * tracerfs_init_work) has finished. Always returns 0.
 */
static int __init trace_eval_sync(void)
{
        /* Make sure the eval map updates are finished */
        if (eval_map_wq)
                destroy_workqueue(eval_map_wq);
        return 0;
}

late_initcall_sync(trace_eval_sync);


#ifdef CONFIG_MODULES
/*
 * Register the eval maps of module @mod, if it has any.
 *
 * Modules with bad taint do not have events created, so their enums are
 * not bothered with either.
 */
static void trace_module_add_evals(struct module *mod)
{
        if (mod->num_trace_evals && !trace_module_has_bad_taint(mod))
                trace_insert_eval_map(mod, mod->trace_evals,
                                      mod->num_trace_evals);
}

#ifdef CONFIG_TRACE_EVAL_MAP_FILE
/*
 * Unlink and free the eval map block that belongs to module @mod.
 *
 * trace_eval_maps is walked block by block under trace_eval_mutex: each
 * block begins with a head item (carrying the owning module) and
 * trace_eval_jmp_to_tail() yields the block's tail item, whose
 * tail.next points at the following block. Nothing happens if @mod has
 * no eval maps or no block owned by @mod is found.
 */
static void trace_module_remove_evals(struct module *mod)
{
        union trace_eval_map_item *map;
        /* Address of the link that points at the block under inspection */
        union trace_eval_map_item **last = &trace_eval_maps;

        if (!mod->num_trace_evals)
                return;

        mutex_lock(&trace_eval_mutex);

        map = trace_eval_maps;

        while (map) {
                if (map->head.mod == mod)
                        break;
                /* Not ours: hop to this block's tail and follow its link */
                map = trace_eval_jmp_to_tail(map);
                last = &map->tail.next;
                map = map->tail.next;
        }
        if (!map)
                goto out;

        /* Make the previous link skip the block, then free the block */
        *last = trace_eval_jmp_to_tail(map)->tail.next;
        kfree(map);
 out:
        mutex_unlock(&trace_eval_mutex);
}
#else
/* Without CONFIG_TRACE_EVAL_MAP_FILE this is a no-op */
static inline void trace_module_remove_evals(struct module *mod) { }
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */

/*
 * Module notifier callback: add a module's eval maps when it is coming
 * and remove them when it is going. Other states are ignored. Always
 * returns NOTIFY_OK.
 */
static int trace_module_notify(struct notifier_block *self,
                               unsigned long val, void *data)
{
        struct module *mod = data;

        if (val == MODULE_STATE_COMING)
                trace_module_add_evals(mod);
        else if (val == MODULE_STATE_GOING)
                trace_module_remove_evals(mod);

        return NOTIFY_OK;
}

/*
 * Module notifier for eval map handling; registered from
 * tracer_init_tracefs_work_func() when CONFIG_MODULES is set.
 */
static struct notifier_block trace_module_nb = {
        .notifier_call = trace_module_notify,
        .priority = 0,
};
#endif /* CONFIG_MODULES */

/*
 * Populate the top level tracing directory: event files, the per-array
 * files of global_trace, the files that exist only at the top level,
 * the "instances" directory, and finally the tracer option files.
 *
 * Runs either as work queued on eval_map_wq or, when that workqueue does
 * not exist, directly from tracer_init_tracefs() with @work == NULL.
 * @work is not used.
 */
static __init void tracer_init_tracefs_work_func(struct work_struct *work)
{

        event_trace_init();

        /* A NULL parent dentry is used for the top level trace array */
        init_tracer_tracefs(&global_trace, NULL);
        ftrace_init_tracefs_toplevel(&global_trace, NULL);

        trace_create_file("tracing_thresh", TRACE_MODE_WRITE, NULL,
                        &global_trace, &tracing_thresh_fops);

        trace_create_file("README", TRACE_MODE_READ, NULL,
                        NULL, &tracing_readme_fops);

        trace_create_file("saved_cmdlines", TRACE_MODE_READ, NULL,
                        NULL, &tracing_saved_cmdlines_fops);

        trace_create_file("saved_cmdlines_size", TRACE_MODE_WRITE, NULL,
                          NULL, &tracing_saved_cmdlines_size_fops);

        trace_create_file("saved_tgids", TRACE_MODE_READ, NULL,
                        NULL, &tracing_saved_tgids_fops);

        trace_create_eval_file(NULL);

#ifdef CONFIG_MODULES
        register_module_notifier(&trace_module_nb);
#endif

#ifdef CONFIG_DYNAMIC_FTRACE
        trace_create_file("dyn_ftrace_total_info", TRACE_MODE_READ, NULL,
                        NULL, &tracing_dyn_info_fops);
#endif

        create_trace_instances(NULL);

        update_tracer_options(&global_trace);
}

/*
 * fs initcall: set up the top level tracing dentry and populate tracefs.
 * The population work is queued on eval_map_wq when that workqueue
 * exists and run synchronously otherwise. Always returns 0, also when
 * tracing_init_dentry() fails (nothing more is set up in that case).
 */
static __init int tracer_init_tracefs(void)
{
        trace_access_lock_init();

        if (tracing_init_dentry())
                return 0;

        if (!eval_map_wq) {
                tracer_init_tracefs_work_func(NULL);
        } else {
                INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func);
                queue_work(eval_map_wq, &tracerfs_init_work);
        }

        rv_init_interface();

        return 0;
}

fs_initcall(tracer_init_tracefs);

/* Defined below; declared here so the notifier blocks can refer to it */
static int trace_die_panic_handler(struct notifier_block *self,
                                unsigned long ev, void *unused);

/*
 * Two notifier blocks sharing one callback; the callback distinguishes
 * them by comparing its @self argument against &trace_die_notifier.
 */
static struct notifier_block trace_panic_notifier = {
        .notifier_call = trace_die_panic_handler,
        .priority = INT_MAX - 1,
};

static struct notifier_block trace_die_notifier = {
        .notifier_call = trace_die_panic_handler,
        .priority = INT_MAX - 1,
};

/*
 * Shared die/panic notifier callback: dump the ftrace buffer when
 * ftrace_dump_on_oops is enabled.
 *
 * It is meant to run early so the trace does not fill up with irrelevant
 * information (such as other panic notifier functions); it is the 2nd to
 * run, after hung_task/rcu_stall warnings get disabled (to prevent
 * potential log flooding).
 *
 * When invoked through trace_die_notifier only DIE_OOPS events trigger a
 * dump. Always returns NOTIFY_DONE.
 */
static int trace_die_panic_handler(struct notifier_block *self,
                                unsigned long ev, void *unused)
{
        /* The die notifier requires DIE_OOPS to trigger */
        if (ftrace_dump_on_oops_enabled() &&
            (self != &trace_die_notifier || ev == DIE_OOPS))
                ftrace_dump(DUMP_PARAM);

        return NOTIFY_DONE;
}

/*
 * printk is set to max of 1024, we really don't need it that big.
 * Nothing should be printing 1000 characters anyway.
 *
 * trace_printk_seq() clamps the sequence length to this value before
 * handing the buffer to printk().
 */
#define TRACE_MAX_PRINT                1000

/*
 * Define here KERN_TRACE so that we have one place to modify
 * it if we decide to change what log level the ftrace dump
 * should be at.
 */
#define KERN_TRACE                KERN_EMERG

/*
 * Print the contents of a trace_seq with printk() at KERN_TRACE level
 * and then reinitialize the sequence.
 *
 * The length is first limited to TRACE_MAX_PRINT and then, as an extra
 * safety net, to one less than the sequence's own size, so that the
 * terminating NUL written below always lands inside the buffer.
 */
void trace_printk_seq(struct trace_seq *s)
{
        /* Probably should print a warning here. */
        if (s->seq.len > TRACE_MAX_PRINT)
                s->seq.len = TRACE_MAX_PRINT;

        /*
         * More paranoid code. Although the buffer size is set to
         * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just
         * an extra layer of protection.
         */
        if (WARN_ON_ONCE(s->seq.len >= s->seq.size))
                s->seq.len = s->seq.size - 1;

        /* Should already be NUL terminated, but we are paranoid. */
        s->buffer[s->seq.len] = '\0';

        printk(KERN_TRACE "%s", s->buffer);

        trace_seq_init(s);
}

/*
 * Prepare @iter for reading @tr without allocating any memory.
 *
 * The iterator is pointed at @tr, its current tracer and its main array
 * buffer, and set to cover all CPUs.  The tracer's open() callback, when
 * there is one, runs before the flags and the scratch buffers are set up.
 * The temp and fmt scratch areas use the static buffers because kmalloc
 * can not be used for them.
 */
static void trace_init_iter(struct trace_iterator *iter, struct trace_array *tr)
{
        iter->tr = tr;
        iter->trace = tr->current_trace;
        iter->array_buffer = &tr->array_buffer;
        iter->cpu_file = RING_BUFFER_ALL_CPUS;

        if (iter->trace) {
                if (iter->trace->open)
                        iter->trace->open(iter);
        }

        /* Annotate start of buffers if we had overruns */
        if (ring_buffer_overruns(iter->array_buffer->buffer))
                iter->iter_flags |= TRACE_FILE_ANNOTATE;

        /* Output in nanoseconds only if we are using a clock in nanoseconds. */
        if (trace_clocks[iter->tr->clock_id].in_ns)
                iter->iter_flags |= TRACE_FILE_TIME_IN_NS;

        /* Can not use kmalloc for iter.temp and iter.fmt */
        iter->temp_size = STATIC_TEMP_BUF_SIZE;
        iter->temp = static_temp_buf;
        iter->fmt_size = STATIC_FMT_BUF_SIZE;
        iter->fmt = static_fmt_buf;
}

/* Initialize @iter for reading the global trace array. */
void trace_init_global_iter(struct trace_iterator *iter)
{
        struct trace_array *global = &global_trace;

        trace_init_iter(iter, global);
}

/*
 * Dump the contents of one trace array to the console with printk().
 *
 * @tr:        trace array to dump
 * @dump_mode: DUMP_ORIG dumps only the buffer of the CPU this runs on;
 *             any other value dumps the buffers of all CPUs
 *
 * Tracing on @tr is switched off and stays off afterwards.  The dump runs
 * with local interrupts disabled and with the per-CPU "disabled" counters
 * raised for the duration of the dump.  Entries are consumed from the
 * buffer as they are printed, unless the print routine reports
 * TRACE_TYPE_NO_CONSUME.
 *
 * The iterator is a static object, so this function must not run
 * concurrently with itself; ftrace_dump() guards its calls with the
 * dump_running counter.
 */
static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_mode)
{
        /* use static because iter can be a bit big for the stack */
        static struct trace_iterator iter;
        unsigned int old_userobj;
        unsigned long flags;
        int cnt = 0, cpu;

        /*
         * Always turn off tracing when we dump.
         * We don't need to show trace output of what happens
         * between multiple crashes.
         *
         * If the user does a sysrq-z, then they can re-enable
         * tracing with echo 1 > tracing_on.
         */
        tracer_tracing_off(tr);

        local_irq_save(flags);

        /* Simulate the iterator */
        trace_init_iter(&iter, tr);

        /* Raise the per-CPU disabled counters; they are dropped again at the end. */
        for_each_tracing_cpu(cpu) {
                atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
        }

        /* Remember the userobj flag so it can be restored after the dump. */
        old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ;

        /* don't look at user memory in panic mode */
        tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;

        if (dump_mode == DUMP_ORIG)
                iter.cpu_file = raw_smp_processor_id();
        else
                iter.cpu_file = RING_BUFFER_ALL_CPUS;

        if (tr == &global_trace)
                printk(KERN_TRACE "Dumping ftrace buffer:\n");
        else
                printk(KERN_TRACE "Dumping ftrace instance %s buffer:\n", tr->name);

        /* Did function tracer already get disabled? */
        if (ftrace_is_dead()) {
                printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
                printk("#          MAY BE MISSING FUNCTION EVENTS\n");
        }

        /*
         * We need to stop all tracing on all CPUS to read
         * the next buffer. This is a bit expensive, but is
         * not done often. We fill all what we can read,
         * and then release the locks again.
         */

        while (!trace_empty(&iter)) {

                /* Print the opening separator once, before the first entry. */
                if (!cnt)
                        printk(KERN_TRACE "---------------------------------\n");

                cnt++;

                trace_iterator_reset(&iter);
                iter.iter_flags |= TRACE_FILE_LAT_FMT;

                if (trace_find_next_entry_inc(&iter) != NULL) {
                        int ret;

                        ret = print_trace_line(&iter);
                        if (ret != TRACE_TYPE_NO_CONSUME)
                                trace_consume(&iter);
                }
                /* A long dump must not trip the NMI watchdog. */
                touch_nmi_watchdog();

                /* Flush what was formatted for this entry and reset the seq. */
                trace_printk_seq(&iter.seq);
        }

        if (!cnt)
                printk(KERN_TRACE "   (ftrace buffer empty)\n");
        else
                printk(KERN_TRACE "---------------------------------\n");

        /* Restore the userobj flag saved above. */
        tr->trace_flags |= old_userobj;

        for_each_tracing_cpu(cpu) {
                atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
        }
        local_irq_restore(flags);
}

static void ftrace_dump_by_param(void)
{
        bool first_param = true;
        char dump_param[MAX_TRACER_SIZE];
        char *buf, *token, *inst_name;
        struct trace_array *tr;

        strscpy(dump_param, ftrace_dump_on_oops, MAX_TRACER_SIZE);
        buf = dump_param;

        while ((token = strsep(&buf, ",")) != NULL) {
                if (first_param) {
                        first_param = false;
                        if (!strcmp("0", token))
                                continue;
                        else if (!strcmp("1", token)) {
                                ftrace_dump_one(&global_trace, DUMP_ALL);
                                continue;
                        }
                        else if (!strcmp("2", token) ||
                          !strcmp("orig_cpu", token)) {
                                ftrace_dump_one(&global_trace, DUMP_ORIG);
                                continue;
                        }
                }

                inst_name = strsep(&token, "=");
                tr = trace_array_find(inst_name);
                if (!tr) {
                        printk(KERN_TRACE "Instance %s not found\n", inst_name);
                        continue;
                }

                if (token && (!strcmp("2", token) ||
                          !strcmp("orig_cpu", token)))
                        ftrace_dump_one(tr, DUMP_ORIG);
                else
                        ftrace_dump_one(tr, DUMP_ALL);
        }
}

/*
 * Dump the ftrace buffers to the console.
 *
 * @oops_dump_mode: DUMP_ALL or DUMP_ORIG dump the global buffer in that
 *                  mode, DUMP_PARAM follows the ftrace_dump_on_oops
 *                  parameter and DUMP_NONE does nothing.  Any other value
 *                  is reported and handled like DUMP_ALL.
 *
 * Only one dump may run at a time; a caller that arrives while another
 * dump is in progress returns without dumping anything.
 */
void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
{
        static atomic_t dump_running;

        /* Only allow one dump user at a time. */
        if (atomic_inc_return(&dump_running) == 1) {
                switch (oops_dump_mode) {
                case DUMP_NONE:
                        break;
                case DUMP_PARAM:
                        ftrace_dump_by_param();
                        break;
                case DUMP_ALL:
                case DUMP_ORIG:
                        ftrace_dump_one(&global_trace, oops_dump_mode);
                        break;
                default:
                        printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
                        ftrace_dump_one(&global_trace, DUMP_ALL);
                        break;
                }
        }

        /* Drop our count whether or not we were the one dumping. */
        atomic_dec(&dump_running);
}
EXPORT_SYMBOL_GPL(ftrace_dump);

/* Size of the kernel bounce buffer used by trace_parse_run_command(). */
#define WRITE_BUFSIZE  4096

/*
 * Split a user space write into lines and run @createfn on each of them.
 *
 * @file:     not referenced by this function
 * @buffer:   user space buffer holding the commands
 * @count:    number of bytes available in @buffer
 * @ppos:     not referenced by this function
 * @createfn: callback invoked with each NUL terminated line, after
 *            anything from a '#' onwards has been cut off
 *
 * The data is copied in chunks of at most WRITE_BUFSIZE - 1 bytes.  A
 * line that is cut by the end of a chunk is re-read at the start of the
 * next chunk; a line that does not fit in a whole chunk is rejected.
 *
 * Return: the number of bytes consumed on success, -ENOMEM if the bounce
 * buffer can not be allocated, -EFAULT if the copy from user space fails,
 * -EINVAL if a line is too long, or the first non-zero value returned by
 * @createfn.
 */
ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
                                size_t count, loff_t *ppos,
                                int (*createfn)(const char *))
{
        char *kbuf, *buf, *tmp;
        int ret = 0;
        size_t done = 0;
        size_t size;

        kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
        if (!kbuf)
                return -ENOMEM;

        while (done < count) {
                size = count - done;

                /* Leave room for the terminating NUL added below. */
                if (size >= WRITE_BUFSIZE)
                        size = WRITE_BUFSIZE - 1;

                if (copy_from_user(kbuf, buffer + done, size)) {
                        ret = -EFAULT;
                        goto out;
                }
                kbuf[size] = '\0';
                buf = kbuf;
                do {
                        tmp = strchr(buf, '\n');
                        if (tmp) {
                                /* Complete line: terminate it and count the newline. */
                                *tmp = '\0';
                                size = tmp - buf + 1;
                        } else {
                                size = strlen(buf);
                                if (done + size < count) {
                                        /*
                                         * More input follows, so this line was cut by
                                         * the chunk boundary.  Unless it started the
                                         * chunk, leave "done" untouched and re-read it
                                         * at the start of the next chunk.
                                         */
                                        if (buf != kbuf)
                                                break;
                                        /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
                                        pr_warn("Line length is too long: Should be less than %d\n",
                                                WRITE_BUFSIZE - 2);
                                        ret = -EINVAL;
                                        goto out;
                                }
                        }
                        done += size;

                        /* Remove comments */
                        tmp = strchr(buf, '#');

                        if (tmp)
                                *tmp = '\0';

                        ret = createfn(buf);
                        if (ret)
                                goto out;
                        buf += size;

                } while (done < count);
        }
        ret = done;

out:
        kfree(kbuf);

        return ret;
}

#ifdef CONFIG_TRACER_MAX_TRACE
/*
 * Decide whether the instance called @name was asked to get a snapshot
 * buffer at boot.
 *
 * boot_snapshot_info is searched for @name as a complete, tab terminated
 * entry: either at the very start of the string, or anywhere later where
 * it is also preceded by a tab.  (Presumably the list is built from the
 * kernel command line elsewhere in this file; that code is not visible
 * here.)
 *
 * Return: true if @name is found in the list; false if it is not, if no
 * boot snapshot entries were recorded, or if the temporary search string
 * can not be allocated.
 */
__init static bool tr_needs_alloc_snapshot(const char *name)
{
        size_t len = strlen(name);
        char *test;
        bool ret;

        if (!boot_snapshot_index)
                return false;

        /* First entry of the list: nothing precedes it, a tab follows it. */
        if (strncmp(name, boot_snapshot_info, len) == 0 &&
            boot_snapshot_info[len] == '\t')
                return true;

        /* Room for the leading tab, the name, the trailing tab and the NUL. */
        test = kmalloc(len + 3, GFP_KERNEL);
        if (!test)
                return false;

        /*
         * Later entries are delimited by tabs on both sides.  The instance
         * needs a snapshot when the delimited name IS present in the list.
         */
        sprintf(test, "\t%s\t", name);
        ret = strstr(boot_snapshot_info, test) != NULL;
        kfree(test);
        return ret;
}

/*
 * Request a snapshot buffer for the instance @name if the boot
 * parameters asked for one.
 */
__init static void do_allocate_snapshot(const char *name)
{
        /*
         * When allocate_snapshot is set, the next call to
         * allocate_trace_buffers() (called by trace_array_get_by_name())
         * will allocate the snapshot buffer. That will also clear
         * this flag.
         */
        if (tr_needs_alloc_snapshot(name))
                allocate_snapshot = true;
}
#else
/* Without CONFIG_TRACER_MAX_TRACE there is nothing to do for @name. */
static inline void do_allocate_snapshot(const char *name) { }
#endif

/*
 * Create the trace instances recorded in boot_instance_info and enable
 * the events listed for each of them.
 *
 * The string is walked as a list of tab separated entries.  Each entry
 * is itself comma separated: the first field is the instance name and
 * every following field is handed to early_enable_events() for that
 * instance.  An instance that can not be created is reported by name and
 * skipped.
 */
__init static void enable_instances(void)
{
        struct trace_array *tr;
        char *curr_str;
        char *str;
        char *tok;

        /* A tab is always appended */
        boot_instance_info[boot_instance_index - 1] = '\0';
        str = boot_instance_info;

        while ((curr_str = strsep(&str, "\t"))) {

                /*
                 * Peel off the instance name.  After this, curr_str points
                 * at the remaining fields, or is NULL when there are none.
                 */
                tok = strsep(&curr_str, ",");

                if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
                        do_allocate_snapshot(tok);

                tr = trace_array_get_by_name(tok, NULL);
                if (!tr) {
                        /* Report the instance name, not the leftover fields. */
                        pr_warn("Failed to create instance buffer %s\n", tok);
                        continue;
                }
                /* Allow user space to delete it */
                trace_array_put(tr);

                while ((tok = strsep(&curr_str, ","))) {
                        early_enable_events(tr, tok, true);
                }
        }
}

/*
 * Allocate and initialize everything the global trace array needs and
 * switch tracing on.
 *
 * Resources are acquired in a fixed order and released in the reverse
 * order through the goto labels at the bottom when a step fails.  Once
 * allocate_trace_buffers() has succeeded, no later step jumps to the
 * unwind labels.
 *
 * Return: 0 on success, -EPERM when tracefs is locked down, the negative
 * value from cpuhp_setup_state_multi() when that call fails, or -ENOMEM
 * for the other failures.
 */
__init static int tracer_alloc_buffers(void)
{
        int ring_buf_size;
        int ret = -ENOMEM;


        if (security_locked_down(LOCKDOWN_TRACEFS)) {
                pr_warn("Tracing disabled due to lockdown\n");
                return -EPERM;
        }

        /*
         * Make sure we don't accidentally add more trace options
         * than we have bits for.
         */
        BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE);

        if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
                goto out;

        if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL))
                goto out_free_buffer_mask;

        /* Only allocate trace_printk buffers if a trace_printk exists */
        if (&__stop___trace_bprintk_fmt != &__start___trace_bprintk_fmt)
                /* Must be called before global_trace.buffer is allocated */
                trace_printk_init_buffers();

        /* To save memory, keep the ring buffer size to its minimum */
        if (global_trace.ring_buffer_expanded)
                ring_buf_size = trace_buf_size;
        else
                ring_buf_size = 1;

        cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
        cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask);

        raw_spin_lock_init(&global_trace.start_lock);

        /*
         * The prepare callbacks allocates some memory for the ring buffer. We
         * don't free the buffer if the CPU goes down. If we were to free
         * the buffer, then the user would lose any trace that was in the
         * buffer. The memory will be removed once the "instance" is removed.
         */
        ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE,
                                      "trace/RB:prepare", trace_rb_cpu_prepare,
                                      NULL);
        if (ret < 0)
                goto out_free_cpumask;
        /* Used for event triggers */
        /* ret was overwritten above; the failures below report -ENOMEM again. */
        ret = -ENOMEM;
        temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
        if (!temp_buffer)
                goto out_rm_hp_state;

        if (trace_create_savedcmd() < 0)
                goto out_free_temp_buffer;

        if (!zalloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
                goto out_free_savedcmd;

        /* TODO: make the number of buffers hot pluggable with CPUS */
        if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
                MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
                goto out_free_pipe_cpumask;
        }
        if (global_trace.buffer_disabled)
                tracing_off();

        /*
         * A bad boot clock name is not fatal: it is only warned about, and
         * the success path below returns 0 regardless of ret.
         */
        if (trace_boot_clock) {
                ret = tracing_set_clock(&global_trace, trace_boot_clock);
                if (ret < 0)
                        pr_warn("Trace clock %s not defined, going back to default\n",
                                trace_boot_clock);
        }

        /*
         * register_tracer() might reference current_trace, so it
         * needs to be set before we register anything. This is
         * just a bootstrap of current_trace anyway.
         */
        global_trace.current_trace = &nop_trace;

        global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
#ifdef CONFIG_TRACER_MAX_TRACE
        spin_lock_init(&global_trace.snapshot_trigger_lock);
#endif
        ftrace_init_global_array_ops(&global_trace);

        init_trace_flags_index(&global_trace);

        register_tracer(&nop_trace);

        /* Function tracing may start here (via kernel command line) */
        init_function_trace();

        /* All seems OK, enable tracing */
        tracing_disabled = 0;

        /* Hook the die/panic handler into both notifier chains. */
        atomic_notifier_chain_register(&panic_notifier_list,
                                       &trace_panic_notifier);

        register_die_notifier(&trace_die_notifier);

        global_trace.flags = TRACE_ARRAY_FL_GLOBAL;

        INIT_LIST_HEAD(&global_trace.systems);
        INIT_LIST_HEAD(&global_trace.events);
        INIT_LIST_HEAD(&global_trace.hist_vars);
        INIT_LIST_HEAD(&global_trace.err_log);
        list_add(&global_trace.list, &ftrace_trace_arrays);

        apply_trace_boot_options();

        register_snapshot_cmd();

        test_can_verify();

        return 0;

        /* Error unwinding: each label undoes one step, in reverse order. */
out_free_pipe_cpumask:
        free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
        trace_free_saved_cmdlines_buffer();
out_free_temp_buffer:
        ring_buffer_free(temp_buffer);
out_rm_hp_state:
        cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE);
out_free_cpumask:
        free_cpumask_var(global_trace.tracing_cpumask);
out_free_buffer_mask:
        free_cpumask_var(tracing_buffer_mask);
out:
        return ret;
}

/*
 * Take the boot time snapshot, if one was requested.
 *
 * With CONFIG_TRACER_MAX_TRACE and snapshot_at_boot set, every trace
 * array that has an allocated snapshot buffer gets a snapshot taken and
 * a marker line written into its trace.  Otherwise this does nothing.
 */
void __init ftrace_boot_snapshot(void)
{
#ifdef CONFIG_TRACER_MAX_TRACE
        struct trace_array *tr;

        if (!snapshot_at_boot)
                return;

        list_for_each_entry(tr, &ftrace_trace_arrays, list) {
                if (tr->allocated_snapshot) {
                        tracing_snapshot_instance(tr);
                        trace_array_puts(tr, "** Boot snapshot taken **\n");
                }
        }
#endif
}

/*
 * Early tracing setup.
 *
 * When tracepoint_printk is set, the iterator it needs is allocated; the
 * feature is switched off again if that allocation fails, and its static
 * key is enabled otherwise.  After that the trace buffers are allocated
 * and the events are initialized.
 */
void __init early_trace_init(void)
{
        if (tracepoint_printk) {
                tracepoint_print_iter =
                        kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);

                if (!MEM_FAIL(!tracepoint_print_iter,
                              "Failed to allocate trace iterator\n"))
                        static_key_enable(&tracepoint_printk_key.key);
                else
                        tracepoint_printk = 0;
        }

        tracer_alloc_buffers();

        init_events();
}

/*
 * Initialize the trace events and, if any instances were recorded at
 * boot (boot_instance_index is non-zero), create and enable them.
 */
void __init trace_init(void)
{
        trace_event_init();

        if (!boot_instance_index)
                return;

        enable_instances();
}

/*
 * Forget the boot time default tracer if it is still set.
 *
 * The default tracer at boot buffer is an init section.
 * This function is called in lateinit. If we did not
 * find the boot tracer, then clear it out, to prevent
 * later registration from accessing the buffer that is
 * about to be freed.
 */
__init static void clear_boot_tracer(void)
{
        if (default_bootup_tracer) {
                printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n",
                       default_bootup_tracer);
                default_bootup_tracer = NULL;
        }
}

#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
/*
 * Switch the global trace array to the "global" clock when no clock was
 * chosen at boot and the sched clock turned out to be unstable.  Nothing
 * is changed when tracefs is locked down; that case is only warned about.
 */
__init static void tracing_set_default_clock(void)
{
        /* sched_clock_stable() is determined in late_initcall */
        if (trace_boot_clock || sched_clock_stable())
                return;

        if (security_locked_down(LOCKDOWN_TRACEFS)) {
                pr_warn("Can not set tracing clock due to lockdown\n");
                return;
        }

        printk(KERN_WARNING
               "Unstable clock detected, switching default tracing clock to \"global\"\n"
               "If you want to keep using the local clock, then add:\n"
               "  \"trace_clock=local\"\n"
               "on the kernel command line\n");
        tracing_set_clock(&global_trace, "global");
}
#else
/* Without CONFIG_HAVE_UNSTABLE_SCHED_CLOCK the default clock is left alone. */
static inline void tracing_set_default_clock(void) { }
#endif

/*
 * Late tracing initialization, run as a late_initcall_sync.
 *
 * Turns tracepoint_printk off again when it was asked to stop at boot,
 * then settles the default trace clock and clears any boot tracer that
 * is still pending.
 *
 * Return: always 0.
 */
__init static int late_trace_init(void)
{
        bool stop_printk = tracepoint_printk && tracepoint_printk_stop_on_boot;

        if (stop_printk) {
                static_key_disable(&tracepoint_printk_key.key);
                tracepoint_printk = 0;
        }

        tracing_set_default_clock();
        clear_boot_tracer();

        return 0;
}

late_initcall_sync(late_trace_init);




























































    2 











































































































































































































































































































































































   10 











































































































































































































































































































































































































































































































































































































































































































































































































   35 






   13 








   31 








   18 






    7 
    6 







   13 


























































































































































































































































































   21 



































































   19 





    8 

   16 
   18 
   18 




























































































    3 



    3 






    1 

    2 




    2 








    4 





    1 

    4 
    5 


























    2 

    3 


    1 

    4 

    3 
    2 















































































































    2 


    1 





    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
/* SPDX-License-Identifier: GPL-2.0+ */
#ifndef _LINUX_XARRAY_H
#define _LINUX_XARRAY_H
/*
 * eXtensible Arrays
 * Copyright (c) 2017 Microsoft Corporation
 * Author: Matthew Wilcox <willy@infradead.org>
 *
 * See Documentation/core-api/xarray.rst for how to use the XArray.
 */

#include <linux/bitmap.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/kconfig.h>
#include <linux/limits.h>
#include <linux/lockdep.h>
#include <linux/rcupdate.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct list_lru;

/*
 * The bottom two bits of the entry determine how the XArray interprets
 * the contents:
 *
 * 00: Pointer entry
 * 10: Internal entry
 * x1: Value entry or tagged pointer
 *
 * Attempting to store internal entries in the XArray is a bug.
 *
 * Most internal entries are pointers to the next node in the tree.
 * The following internal entries have a special meaning:
 *
 * 0-62: Sibling entries
 * 256: Retry entry
 * 257: Zero entry
 *
 * Errors are also represented as internal entries, but use the negative
 * space (-4094 to -2).  They're never stored in the slots array; only
 * returned by the normal API.
 */

/*
 * Number of bits available to the integer held in a value entry: one bit
 * of the long is taken by the low "value" marker bit set by xa_mk_value().
 */
#define BITS_PER_XA_VALUE        (BITS_PER_LONG - 1)

/**
 * xa_mk_value() - Create an XArray entry from an integer.
 * @v: Value to store in XArray.
 *
 * The integer is shifted up by one bit and the bottom bit is set to mark
 * the entry as a value.  A warning is raised if @v has its top bit set,
 * because that bit is lost by the shift.
 *
 * Context: Any context.
 * Return: An entry suitable for storing in the XArray.
 */
static inline void *xa_mk_value(unsigned long v)
{
        unsigned long encoded;

        WARN_ON((long)v < 0);

        encoded = (v << 1) | 1;
        return (void *)encoded;
}

/**
 * xa_to_value() - Get value stored in an XArray entry.
 * @entry: XArray entry.
 *
 * Drops the bottom marker bit by shifting the entry down by one.
 *
 * Context: Any context.
 * Return: The value stored in the XArray entry.
 */
static inline unsigned long xa_to_value(const void *entry)
{
        unsigned long bits = (unsigned long)entry;

        return bits >> 1;
}

/**
 * xa_is_value() - Determine if an entry is a value.
 * @entry: XArray entry.
 *
 * An entry is a value when its bottom bit is set.
 *
 * Context: Any context.
 * Return: True if the entry is a value, false if it is a pointer.
 */
static inline bool xa_is_value(const void *entry)
{
        unsigned long bits = (unsigned long)entry;

        return (bits & 1) != 0;
}

/**
 * xa_tag_pointer() - Combine a plain pointer and a tag into an entry.
 * @p: Plain pointer.
 * @tag: Tag value (0, 1 or 3).
 *
 * Users who would rather tag their pointers than store value entries
 * may do so with one of three tags (0, 1 and 3).  Tags are unrelated to
 * xa_mark_t: they are not replicated up through the array and cannot be
 * searched for.  The tag is ORed into the pointer as given; it is not
 * range-checked here.
 *
 * Context: Any context.
 * Return: An XArray entry.
 */
static inline void *xa_tag_pointer(void *p, unsigned long tag)
{
        unsigned long bits = (unsigned long)p;

        bits |= tag;

        return (void *)bits;
}

/**
 * xa_untag_pointer() - Strip the tag from a tagged-pointer entry.
 * @entry: XArray entry.
 *
 * Call this on an entry that was stored as a tagged pointer to recover
 * the plain pointer; the two low bits are cleared.
 *
 * Context: Any context.
 * Return: A pointer.
 */
static inline void *xa_untag_pointer(void *entry)
{
        const unsigned long tag_mask = 3UL;

        return (void *)((unsigned long)entry & ~tag_mask);
}

/**
 * xa_pointer_tag() - Get the tag stored in an XArray entry.
 * @entry: XArray entry.
 *
 * If you have stored a tagged pointer in the XArray, call this function
 * to get the tag of that pointer.  The entry is only inspected, never
 * written through, so it is accepted as a pointer to const in the same
 * way as xa_is_value() and xa_to_value().
 *
 * Context: Any context.
 * Return: A tag (the two low bits of @entry).
 */
static inline unsigned int xa_pointer_tag(const void *entry)
{
        return (unsigned long)entry & 3UL;
}

/*
 * xa_mk_internal() - Encode a number as an internal entry.
 * @v: Value to turn into an internal entry.
 *
 * Internal entries serve several purposes.  Values 0-255 are set aside
 * for sibling entries (the current code only uses 0-62), 256 is the
 * retry entry and 257 is the reserved / zero entry.  Negative values
 * represent errnos.  In some situations node pointers are tagged as
 * internal entries as well.
 *
 * Context: Any context.
 * Return: An XArray internal entry corresponding to this value.
 */
static inline void *xa_mk_internal(unsigned long v)
{
        unsigned long bits = v << 2;

        /* Low two bits 10 identify an internal entry. */
        bits |= 2;

        return (void *)bits;
}

/*
 * xa_to_internal() - Decode the number held in an internal entry.
 * @entry: XArray entry.
 *
 * The shift is unsigned, so no sign extension takes place.
 *
 * Context: Any context.
 * Return: The value which was stored in the internal entry.
 */
static inline unsigned long xa_to_internal(const void *entry)
{
        unsigned long bits = (unsigned long)entry;

        return bits >> 2;
}

/*
 * xa_is_internal() - Test whether an entry is an internal entry.
 * @entry: XArray entry.
 *
 * An internal entry has its two low bits equal to binary 10.
 *
 * Context: Any context.
 * Return: %true if the entry is an internal entry.
 */
static inline bool xa_is_internal(const void *entry)
{
        unsigned long low_bits = (unsigned long)entry & 3;

        return low_bits == 2;
}

/*
 * Internal entry 257: the reserved / zero entry.  The normal API reports
 * a slot holding it as NULL; use xa_is_zero() to test for it.
 */
#define XA_ZERO_ENTRY                xa_mk_internal(257)

/**
 * xa_is_zero() - Test whether an entry is the zero entry.
 * @entry: Entry retrieved from the XArray
 *
 * A slot containing a zero entry reads back as NULL through the normal
 * API, so zero entries are only visible to users of the advanced API.
 *
 * Return: %true if the entry is a zero entry.
 */
static inline bool xa_is_zero(const void *entry)
{
        const void *zero_entry = XA_ZERO_ENTRY;

        return unlikely(entry == zero_entry);
}

/**
 * xa_is_err() - Test whether an XArray result encodes an error.
 * @entry: Result from calling an XArray function
 *
 * When an XArray operation cannot complete, it returns a special value
 * indicating an error.  This function reports whether that happened;
 * use xa_err() to find out which error it was.
 *
 * Context: Any context.
 * Return: %true if the entry indicates an error.
 */
static inline bool xa_is_err(const void *entry)
{
        /*
         * Errors are internal entries built from negative numbers, so
         * they sit at or above the encoding of -MAX_ERRNO.
         */
        bool is_errno = xa_is_internal(entry) &&
                        entry >= xa_mk_internal(-MAX_ERRNO);

        return unlikely(is_errno);
}

/**
 * xa_err() - Convert an XArray result into an errno.
 * @entry: Result from calling an XArray function.
 *
 * When an XArray operation cannot complete, it returns a special
 * pointer value which encodes an errno.  This function extracts that
 * errno, and yields 0 when the pointer does not represent one.
 *
 * Context: Any context.
 * Return: A negative errno or 0.
 */
static inline int xa_err(void *entry)
{
        if (!xa_is_err(entry))
                return 0;

        /* Shift as a signed long; xa_to_internal() would not sign-extend. */
        return (long)entry >> 2;
}

/**
 * struct xa_limit - Represents a range of IDs.
 * @min: The lowest ID to allocate (inclusive).
 * @max: The maximum ID to allocate (inclusive).
 *
 * This structure is used either directly or via the XA_LIMIT() macro
 * to communicate the range of IDs that are valid for allocation.
 * Three common ranges are predefined for you:
 * * xa_limit_32b        - [0 - UINT_MAX]
 * * xa_limit_31b        - [0 - INT_MAX]
 * * xa_limit_16b        - [0 - USHRT_MAX]
 *
 * Note that @max is declared before @min, so a positional initialiser
 * would be in (max, min) order.  XA_LIMIT() uses designated initialisers
 * and is therefore not affected by the field order.
 */
struct xa_limit {
        u32 max;
        u32 min;
};

/*
 * Build a struct xa_limit compound literal from an inclusive
 * (_min, _max) pair.  The fields are set by name.
 */
#define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max }

/* Predefined ranges: each starts at 0 and ends at the named maximum. */
#define xa_limit_32b        XA_LIMIT(0, UINT_MAX)
#define xa_limit_31b        XA_LIMIT(0, INT_MAX)
#define xa_limit_16b        XA_LIMIT(0, USHRT_MAX)

/*
 * xa_mark_t is a __bitwise type, so each constant below needs a __force
 * cast from its plain integer value.
 *
 * XA_MARK_0..XA_MARK_2 are the three marks.  XA_PRESENT is not one of
 * them: it is the filter handed to xa_find() / xa_find_after() by
 * xa_for_each_range() to select present entries.  XA_MARK_MAX names the
 * highest-numbered mark and XA_FREE_MARK is the mark used to build
 * XA_FLAGS_ALLOC.
 */
typedef unsigned __bitwise xa_mark_t;
#define XA_MARK_0                ((__force xa_mark_t)0U)
#define XA_MARK_1                ((__force xa_mark_t)1U)
#define XA_MARK_2                ((__force xa_mark_t)2U)
#define XA_PRESENT                ((__force xa_mark_t)8U)
#define XA_MARK_MAX                XA_MARK_2
#define XA_FREE_MARK                XA_MARK_0

/*
 * Lock type selectors.  The numeric values are reused directly as
 * xa_flags bits through XA_FLAGS_LOCK_IRQ and XA_FLAGS_LOCK_BH, so they
 * must not be renumbered independently of those flags.
 */
enum xa_lock_type {
        XA_LOCK_IRQ = 1,
        XA_LOCK_BH = 2,
};

/*
 * Values for xa_flags.  The radix tree stores its GFP flags in the xa_flags,
 * and we remain compatible with that.
 *
 * The fixed flags below occupy bits 0-5 (values 1 to 32).  The per-mark
 * bits produced by XA_FLAGS_MARK() start at bit __GFP_BITS_SHIFT.
 */
#define XA_FLAGS_LOCK_IRQ        ((__force gfp_t)XA_LOCK_IRQ)
#define XA_FLAGS_LOCK_BH        ((__force gfp_t)XA_LOCK_BH)
#define XA_FLAGS_TRACK_FREE        ((__force gfp_t)4U)
#define XA_FLAGS_ZERO_BUSY        ((__force gfp_t)8U)
#define XA_FLAGS_ALLOC_WRAPPED        ((__force gfp_t)16U)
#define XA_FLAGS_ACCOUNT        ((__force gfp_t)32U)
/* One bit per mark: bit (__GFP_BITS_SHIFT + mark) of xa_flags. */
#define XA_FLAGS_MARK(mark)        ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
                                                (__force unsigned)(mark)))

/* ALLOC is for a normal 0-based alloc.  ALLOC1 is for a 1-based alloc. */
#define XA_FLAGS_ALLOC        (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK))
#define XA_FLAGS_ALLOC1        (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY)

/**
 * struct xarray - The anchor of the XArray.
 * @xa_lock: Lock that protects the contents of the XArray.
 *
 * To use the xarray, define it statically or embed it in your data structure.
 * It is a very small data structure, so it does not usually make sense to
 * allocate it separately and keep a pointer to it in your data structure.
 *
 * You may use the xa_lock to protect your own data structures as well.
 */
/*
 * If all of the entries in the array are NULL, @xa_head is a NULL pointer.
 * If the only non-NULL entry in the array is at index 0, @xa_head is that
 * entry.  If any other entry in the array is non-NULL, @xa_head points
 * to an @xa_node.
 */
struct xarray {
        spinlock_t        xa_lock;
/* private: The rest of the data structure is not to be used directly. */
        /* XA_FLAGS_* bits; set by xa_init_flags(), tested by xa_marked(). */
        gfp_t                xa_flags;
        /* NULL while the array is empty; this is what xa_empty() checks. */
        void __rcu *        xa_head;
};

/*
 * Static initialiser for a struct xarray: an unlocked spinlock, the
 * caller-supplied flags and an empty (NULL) head.  @name must be the
 * variable being initialised, because the lock initialiser is built from
 * the expression name.xa_lock.
 */
#define XARRAY_INIT(name, flags) {                                \
        .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock),                \
        .xa_flags = flags,                                        \
        .xa_head = NULL,                                        \
}

/**
 * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags.
 * @name: Identifier to use as the name of your XArray (a C variable name,
 *        not a string literal).
 * @flags: XA_FLAG values.
 *
 * This is intended for file scope definitions of XArrays.  It declares
 * and initialises an empty XArray with the chosen name and flags.  It is
 * equivalent to calling xa_init_flags() on the array, but it does the
 * initialisation at compile time instead of at run time.
 *
 * The expansion has no storage-class specifier and no trailing semicolon,
 * so the caller may prefix it with ``static`` and must terminate it.
 */
#define DEFINE_XARRAY_FLAGS(name, flags)                                \
        struct xarray name = XARRAY_INIT(name, flags)

/**
 * DEFINE_XARRAY() - Define an XArray.
 * @name: Identifier to use as the name of your XArray (a C variable name,
 *        not a string literal).
 *
 * This is intended for file scope definitions of XArrays.  It declares
 * and initialises an empty XArray with the chosen name.  It is equivalent
 * to calling xa_init() on the array, but it does the initialisation at
 * compile time instead of at run time.  It expands to
 * DEFINE_XARRAY_FLAGS() with flags of 0.
 */
#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0)

/**
 * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0.
 * @name: Identifier to use as the name of your XArray (a C variable name,
 *        not a string literal).
 *
 * This is intended for file scope definitions of allocating XArrays.
 * It expands to DEFINE_XARRAY_FLAGS() with %XA_FLAGS_ALLOC.
 * See also DEFINE_XARRAY().
 */
#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC)

/**
 * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1.
 * @name: Identifier to use as the name of your XArray (a C variable name,
 *        not a string literal).
 *
 * This is intended for file scope definitions of allocating XArrays.
 * It expands to DEFINE_XARRAY_FLAGS() with %XA_FLAGS_ALLOC1.
 * See also DEFINE_XARRAY().
 */
#define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1)

/*
 * Out-of-line entry points of the normal API; their definitions are not
 * part of this header.
 *
 * xa_find() and xa_find_after() are declared nonnull(2): the @index
 * pointer must not be NULL.  The xa_for_each*() macros pass the address
 * of their index variable here together with the upper search bound and
 * the mark (or XA_PRESENT) to filter on.
 */
void *xa_load(struct xarray *, unsigned long index);
void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *xa_erase(struct xarray *, unsigned long index);
void *xa_store_range(struct xarray *, unsigned long first, unsigned long last,
                        void *entry, gfp_t);
bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
void *xa_find(struct xarray *xa, unsigned long *index,
                unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
void *xa_find_after(struct xarray *xa, unsigned long *index,
                unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
unsigned int xa_extract(struct xarray *, void **dst, unsigned long start,
                unsigned long max, unsigned int n, xa_mark_t);
void xa_destroy(struct xarray *);

/**
 * xa_init_flags() - Set up an empty XArray with the given flags.
 * @xa: XArray.
 * @flags: XA_FLAG values.
 *
 * Use this instead of xa_init() when the XArray needs special flags,
 * for example because its lock has to be taken from interrupt context.
 *
 * Context: Any context.
 */
static inline void xa_init_flags(struct xarray *xa, gfp_t flags)
{
        spin_lock_init(&xa->xa_lock);

        /* A NULL head means the array holds no entries. */
        xa->xa_head = NULL;
        xa->xa_flags = flags;
}

/**
 * xa_init() - Initialise an empty XArray.
 * @xa: XArray.
 *
 * An empty XArray is full of NULL entries.
 *
 * Context: Any context.
 */
static inline void xa_init(struct xarray *xa)
{
        xa_init_flags(xa, 0);
}

/**
 * xa_empty() - Test whether an array has no present entries.
 * @xa: XArray.
 *
 * The array is empty exactly when its head pointer is NULL.
 *
 * Context: Any context.
 * Return: %true if the array contains only NULL pointers.
 */
static inline bool xa_empty(const struct xarray *xa)
{
        return !xa->xa_head;
}

/**
 * xa_marked() - Test whether any entry in this array carries a mark.
 * @xa: Array
 * @mark: Mark value
 *
 * The answer is read from the per-mark bit kept in the array's flags.
 *
 * Context: Any context.
 * Return: %true if any entry has this mark set.
 */
static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark)
{
        const gfp_t mark_bit = XA_FLAGS_MARK(mark);

        return xa->xa_flags & mark_bit;
}

/**
 * xa_for_each_range() - Iterate over a portion of an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @start: First index to retrieve from array.
 * @last: Last index to retrieve from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you
 * want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set
 * to NULL and @index will have a value less than or equal to @last.
 *
 * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_range() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each() iterator instead.
 * The xas_for_each() iterator will expand into more inline code than
 * xa_for_each_range().
 *
 * @index and @entry are assigned to by the macro and so must be lvalues.
 * @start is evaluated once; @xa and @last are evaluated again on every
 * step, so they should be free of side effects.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_range(xa, index, entry, start, last)                \
        for (index = start,                                                \
             entry = xa_find(xa, &index, last, XA_PRESENT);                \
             entry;                                                        \
             entry = xa_find_after(xa, &index, last, XA_PRESENT))

/**
 * xa_for_each_start() - Iterate over a portion of an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @start: First index to retrieve from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you
 * want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set
 * to NULL and @index will have a value less than or equal to the upper
 * bound of the search, which is ULONG_MAX for this macro.
 *
 * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_start() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each() iterator instead.
 * The xas_for_each() iterator will expand into more inline code than
 * xa_for_each_start().
 *
 * This is xa_for_each_range() with @last fixed at ULONG_MAX.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_start(xa, index, entry, start) \
        xa_for_each_range(xa, index, entry, start, ULONG_MAX)

/**
 * xa_for_each() - Iterate over present entries in an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you want
 * to skip or reprocess indices.  It is safe to modify the array during the
 * iteration.  At the end of the iteration, @entry will be set to NULL and
 * @index will have a value less than or equal to the upper bound of the
 * search, which is ULONG_MAX for this macro.
 *
 * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).  xa_for_each()
 * will spin if it hits a retry entry; if you intend to see retry entries,
 * you should use the xas_for_each() iterator instead.  The xas_for_each()
 * iterator will expand into more inline code than xa_for_each().
 *
 * This is xa_for_each_start() with @start fixed at 0.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each(xa, index, entry) \
        xa_for_each_start(xa, index, entry, 0)

/**
 * xa_for_each_marked() - Iterate over marked entries in an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @filter: Selection criterion.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  The iteration will skip all entries in the array
 * which do not match @filter.  You may modify @index during the iteration
 * if you want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set to
 * NULL and @index will have a value less than or equal to the upper bound
 * of the search, which is ULONG_MAX for this macro.
 *
 * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n).
 * You have to handle your own locking with xas_for_each(), and if you have
 * to unlock after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_marked() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each_marked() iterator
 * instead.  The xas_for_each_marked() iterator will expand into more inline
 * code than xa_for_each_marked().
 *
 * The walk always starts at index 0.  @index and @entry are assigned to by
 * the macro and so must be lvalues; @xa and @filter are evaluated again on
 * every step, so they should be free of side effects.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_marked(xa, index, entry, filter) \
        for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \
             entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter))

/*
 * Locking helpers.  Each macro applies the spinlock primitive with the
 * matching suffix to the xa_lock embedded in the array, so @xa is a
 * pointer to a struct xarray.  The remaining arguments (@flags,
 * @subclass) are passed through to the spinlock primitive unchanged.
 */
#define xa_trylock(xa)                spin_trylock(&(xa)->xa_lock)
#define xa_lock(xa)                spin_lock(&(xa)->xa_lock)
#define xa_unlock(xa)                spin_unlock(&(xa)->xa_lock)
#define xa_lock_bh(xa)                spin_lock_bh(&(xa)->xa_lock)
#define xa_unlock_bh(xa)        spin_unlock_bh(&(xa)->xa_lock)
#define xa_lock_irq(xa)                spin_lock_irq(&(xa)->xa_lock)
#define xa_unlock_irq(xa)        spin_unlock_irq(&(xa)->xa_lock)
#define xa_lock_irqsave(xa, flags) \
                                spin_lock_irqsave(&(xa)->xa_lock, flags)
#define xa_unlock_irqrestore(xa, flags) \
                                spin_unlock_irqrestore(&(xa)->xa_lock, flags)
#define xa_lock_nested(xa, subclass) \
                                spin_lock_nested(&(xa)->xa_lock, subclass)
#define xa_lock_bh_nested(xa, subclass) \
                                spin_lock_bh_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irq_nested(xa, subclass) \
                                spin_lock_irq_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irqsave_nested(xa, flags, subclass) \
                spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass)

/*
 * Versions of the normal API which require the caller to hold the
 * xa_lock.  If the GFP flags allow it, they will drop the lock to
 * allocate memory, then reacquire it afterwards.  These functions
 * may also re-enable interrupts if the XArray flags indicate the
 * locking should be interrupt safe.
 *
 * The inline xa_*(), xa_*_bh() and xa_*_irq() wrappers in this header
 * take the lock in the corresponding mode, call one of these functions
 * and release the lock again.
 */
void *__xa_erase(struct xarray *, unsigned long index);
void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old,
                void *entry, gfp_t);
int __must_check __xa_insert(struct xarray *, unsigned long index,
                void *entry, gfp_t);
int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry,
                struct xa_limit, gfp_t);
int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry,
                struct xa_limit, u32 *next, gfp_t);
void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);

/**
 * xa_store_bh() - Store an entry in the XArray with softirqs disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Behaves like xa_store(), except that softirqs are disabled for as long
 * as the array lock is held.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
static inline void *xa_store_bh(struct xarray *xa, unsigned long index,
                void *entry, gfp_t gfp)
{
        void *prev;

        might_alloc(gfp);

        xa_lock_bh(xa);
        prev = __xa_store(xa, index, entry, gfp);
        xa_unlock_bh(xa);

        return prev;
}

/**
 * xa_store_irq() - Store an entry in the XArray with interrupts disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Behaves like xa_store(), except that interrupts are disabled for as
 * long as the array lock is held.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
static inline void *xa_store_irq(struct xarray *xa, unsigned long index,
                void *entry, gfp_t gfp)
{
        void *prev;

        might_alloc(gfp);

        xa_lock_irq(xa);
        prev = __xa_store(xa, index, entry, gfp);
        xa_unlock_irq(xa);

        return prev;
}

/**
 * xa_erase_bh() - Remove an entry from the XArray with softirqs disabled.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Once this function has returned, a load from @index yields %NULL.
 * When @index belongs to a multi-index entry, every index it covers is
 * erased and none of them remains part of a multi-index entry.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: The entry which used to be at this index.
 */
static inline void *xa_erase_bh(struct xarray *xa, unsigned long index)
{
        void *removed;

        xa_lock_bh(xa);
        removed = __xa_erase(xa, index);
        xa_unlock_bh(xa);

        return removed;
}

/**
 * xa_erase_irq() - Remove an entry from the XArray with interrupts disabled.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Once this function has returned, a load from @index yields %NULL.
 * When @index belongs to a multi-index entry, every index it covers is
 * erased and none of them remains part of a multi-index entry.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: The entry which used to be at this index.
 */
static inline void *xa_erase_irq(struct xarray *xa, unsigned long index)
{
        void *removed;

        xa_lock_irq(xa);
        removed = __xa_erase(xa, index);
        xa_unlock_irq(xa);

        return removed;
}

/**
 * xa_cmpxchg() - Replace an entry in the XArray if it matches.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * The entry at @index is replaced with @entry only when it currently
 * equals @old.  The exchange succeeded if the returned value equals @old.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep
 * if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *prev;

        might_alloc(gfp);

        xa_lock(xa);
        prev = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock(xa);

        return prev;
}

/**
 * xa_cmpxchg_bh() - Replace an entry if it matches, with softirqs disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * Behaves like xa_cmpxchg(), except that softirqs are disabled for as
 * long as the array lock is held.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *prev;

        might_alloc(gfp);

        xa_lock_bh(xa);
        prev = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock_bh(xa);

        return prev;
}

/**
 * xa_cmpxchg_irq() - Replace an entry if it matches, with interrupts disabled.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * Behaves like xa_cmpxchg(), except that interrupts are disabled for as
 * long as the array lock is held.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *prev;

        might_alloc(gfp);

        xa_lock_irq(xa);
        prev = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock_irq(xa);

        return prev;
}

/**
 * xa_insert() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * If no entry is present, inserting a NULL entry stores a reserved entry
 * (like xa_reserve()).  An insert fails when a reserved entry is already
 * present, even though a load from this index returns NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock(xa);

        return ret;
}

/**
 * xa_insert_bh() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * If no entry is present, inserting a NULL entry stores a reserved entry
 * (like xa_reserve()).  An insert fails when a reserved entry is already
 * present, even though a load from this index returns NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert_bh(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_bh(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock_bh(xa);

        return ret;
}

/**
 * xa_insert_irq() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * If no entry is present, inserting a NULL entry stores a reserved entry
 * (like xa_reserve()).  An insert fails when a reserved entry is already
 * present, even though a load from this index returns NULL.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert_irq(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_irq(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock_irq(xa);

        return ret;
}

/**
 * xa_alloc() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa between @limit.min and @limit.max,
 * writes its index through the @id pointer and then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline __must_check int xa_alloc(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock(xa);

        return ret;
}

/**
 * xa_alloc_bh() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa between @limit.min and @limit.max,
 * writes its index through the @id pointer and then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_bh(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock_bh(xa);

        return ret;
}

/**
 * xa_alloc_irq() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa between @limit.min and @limit.max,
 * writes its index through the @id pointer and then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock_irq(xa);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock_irq(xa);

        return ret;
}

/**
 * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa between @limit.min and @limit.max,
 * writes its index through the @id pointer and then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search for an empty entry begins at @next and wraps around if
 * necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        xa_lock(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock(xa);

        return ret;
}

/**
 * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search for an empty entry will start at @next and will wrap
 * around if necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        /* Annotate that this call may allocate using @gfp. */
        might_alloc(gfp);

        /* Same as xa_alloc_cyclic(), but the lock is taken with the _bh variant. */
        xa_lock_bh(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock_bh(xa);

        return ret;
}

/**
 * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search for an empty entry will start at @next and will wrap
 * around if necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int ret;

        /* Annotate that this call may allocate using @gfp. */
        might_alloc(gfp);

        /* Same as xa_alloc_cyclic(), but the lock is taken with the _irq variant. */
        xa_lock_irq(xa);
        ret = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock_irq(xa);

        return ret;
}

/**
 * xa_reserve() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * Ensures there is somewhere to store an entry at @index in the array.
 * If there is already something stored at @index, this function does
 * nothing.  If there was nothing there, the entry is marked as reserved.
 * Loading from a reserved entry returns a %NULL pointer.
 *
 * If you do not use the entry that you have reserved, call xa_release()
 * or xa_erase() to free any unnecessary memory.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * May sleep if the @gfp flags permit.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Replace an empty slot with the zero entry; anything else is kept. */
        void *old = xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(old);
}

/**
 * xa_reserve_bh() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * A softirq-disabling version of xa_reserve().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Replace an empty slot with the zero entry; anything else is kept. */
        void *old = xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(old);
}

/**
 * xa_reserve_irq() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * An interrupt-disabling version of xa_reserve().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Replace an empty slot with the zero entry; anything else is kept. */
        void *old = xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(old);
}

/**
 * xa_release() - Release a reserved entry.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * After calling xa_reserve(), you can call this function to release the
 * reservation.  If the entry at @index has been stored to, this function
 * will do nothing.
 */
static inline void xa_release(struct xarray *xa, unsigned long index)
{
        /* Only a slot still holding the reservation marker is cleared. */
        void *reserved = XA_ZERO_ENTRY;

        xa_cmpxchg(xa, index, reserved, NULL, 0);
}

/* Everything below here is the Advanced API.  Proceed with caution. */

/*
 * The xarray is constructed out of a set of 'chunks' of pointers.  Choosing
 * the best chunk size requires some tradeoffs.  A power of two recommends
 * itself so that we can walk the tree based purely on shifts and masks.
 * Generally, the larger the better; as the number of slots per level of the
 * tree increases, the less tall the tree needs to be.  But that needs to be
 * balanced against the memory consumption of each node.  On a 64-bit system,
 * xa_node is currently 576 bytes, and we get 7 of them per 4kB page.  If we
 * doubled the number of slots per node, we'd get only 3 nodes per 4kB page.
 */
/* log2 of the slots per node; may be predefined before this point. */
#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT                (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6)
#endif
/* Number of slots in each xa_node (the size of its ->slots array). */
#define XA_CHUNK_SIZE                (1UL << XA_CHUNK_SHIFT)
/* Mask that extracts a slot offset from a (shifted) index. */
#define XA_CHUNK_MASK                (XA_CHUNK_SIZE - 1)
/* Number of per-node mark bitmaps. */
#define XA_MAX_MARKS                3
/* Number of longs needed for one bitmap with a bit per slot. */
#define XA_MARK_LONGS                BITS_TO_LONGS(XA_CHUNK_SIZE)

/*
 * @count is the count of every non-NULL element in the ->slots array
 * whether that is a value entry, a retry entry, a user pointer,
 * a sibling entry or a pointer to the next level of the tree.
 * @nr_values is the count of every element in ->slots which is
 * either a value entry or a sibling of a value entry.
 */
/* One chunk of the tree: XA_CHUNK_SIZE slots plus per-slot mark bitmaps. */
struct xa_node {
        unsigned char        shift;                /* Bits remaining in each slot */
        unsigned char        offset;                /* Slot offset in parent */
        unsigned char        count;                /* Total entry count */
        unsigned char        nr_values;        /* Value entry count */
        struct xa_node __rcu *parent;        /* NULL at top of tree */
        struct xarray        *array;                /* The array we belong to */
        /* These two members share storage. */
        union {
                struct list_head private_list;        /* For tree user */
                struct rcu_head        rcu_head;        /* Used when freeing node */
        };
        /* The entries themselves; see the @count description above. */
        void __rcu        *slots[XA_CHUNK_SIZE];
        /* Two names for the same bitmaps: one bitmap per mark, one bit per slot. */
        union {
                unsigned long        tags[XA_MAX_MARKS][XA_MARK_LONGS];
                unsigned long        marks[XA_MAX_MARKS][XA_MARK_LONGS];
        };
};

/* Dump helpers used by the debug assertions below; implemented elsewhere. */
void xa_dump(const struct xarray *);
void xa_dump_node(const struct xa_node *);

/*
 * Debug assertions.  With XA_DEBUG defined, a true condition dumps the
 * array (or the node, if one was supplied) and then calls BUG().
 * Without XA_DEBUG both macros expand to an empty statement, so the
 * condition is not evaluated.
 */
#ifdef XA_DEBUG
#define XA_BUG_ON(xa, x) do {                                        \
                if (x) {                                        \
                        xa_dump(xa);                                \
                        BUG();                                        \
                }                                                \
        } while (0)
#define XA_NODE_BUG_ON(node, x) do {                                \
                if (x) {                                        \
                        if (node) xa_dump_node(node);                \
                        BUG();                                        \
                }                                                \
        } while (0)
#else
#define XA_BUG_ON(xa, x)        do { } while (0)
#define XA_NODE_BUG_ON(node, x)        do { } while (0)
#endif

/* Private */
static inline void *xa_head(const struct xarray *xa)
{
        void *head;

        /* Checked dereference: holding the xa_lock is also accepted. */
        head = rcu_dereference_check(xa->xa_head,
                                     lockdep_is_held(&xa->xa_lock));
        return head;
}

/* Private */
static inline void *xa_head_locked(const struct xarray *xa)
{
        void *head;

        /* Protected dereference: the lockdep condition is holding the xa_lock. */
        head = rcu_dereference_protected(xa->xa_head,
                                         lockdep_is_held(&xa->xa_lock));
        return head;
}

/* Private */
static inline void *xa_entry(const struct xarray *xa,
                                const struct xa_node *node, unsigned int offset)
{
        void *slot;

        /* Debug builds reject an offset beyond the node's slots. */
        XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);

        slot = rcu_dereference_check(node->slots[offset],
                                     lockdep_is_held(&xa->xa_lock));
        return slot;
}

/* Private */
static inline void *xa_entry_locked(const struct xarray *xa,
                                const struct xa_node *node, unsigned int offset)
{
        void *slot;

        /* Debug builds reject an offset beyond the node's slots. */
        XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);

        slot = rcu_dereference_protected(node->slots[offset],
                                         lockdep_is_held(&xa->xa_lock));
        return slot;
}

/* Private */
static inline struct xa_node *xa_parent(const struct xarray *xa,
                                        const struct xa_node *node)
{
        struct xa_node *up;

        /* Checked dereference: holding the xa_lock is also accepted. */
        up = rcu_dereference_check(node->parent,
                                   lockdep_is_held(&xa->xa_lock));
        return up;
}

/* Private */
static inline struct xa_node *xa_parent_locked(const struct xarray *xa,
                                        const struct xa_node *node)
{
        struct xa_node *up;

        /* Protected dereference: the lockdep condition is holding the xa_lock. */
        up = rcu_dereference_protected(node->parent,
                                       lockdep_is_held(&xa->xa_lock));
        return up;
}

/* Private */
static inline void *xa_mk_node(const struct xa_node *node)
{
        /* Tag the node address by setting bit 1. */
        unsigned long bits = (unsigned long)node;

        return (void *)(bits | 2);
}

/* Private */
static inline struct xa_node *xa_to_node(const void *entry)
{
        /* Undo xa_mk_node(): remove the tag value of 2. */
        unsigned long bits = (unsigned long)entry;

        return (struct xa_node *)(bits - 2);
}

/* Private */
static inline bool xa_is_node(const void *entry)
{
        if (!xa_is_internal(entry))
                return false;

        /* Internal entries at or below 4096 are not node pointers. */
        return (unsigned long)entry > 4096;
}

/* Private */
static inline void *xa_mk_sibling(unsigned int offset)
{
        /* A sibling entry is the internal entry whose value is @offset. */
        return xa_mk_internal(offset);
}

/* Private */
static inline unsigned long xa_to_sibling(const void *entry)
{
        /* Inverse of xa_mk_sibling(): recover the slot offset. */
        return xa_to_internal(entry);
}

/**
 * xa_is_sibling() - Is the entry a sibling entry?
 * @entry: Entry retrieved from the XArray
 *
 * Return: %true if the entry is a sibling entry.
 */
static inline bool xa_is_sibling(const void *entry)
{
        /* Without CONFIG_XARRAY_MULTI nothing is ever reported as a sibling. */
        if (!IS_ENABLED(CONFIG_XARRAY_MULTI))
                return false;
        if (!xa_is_internal(entry))
                return false;

        return entry < xa_mk_sibling(XA_CHUNK_SIZE - 1);
}

/* Internal entry 256: the marker that xa_is_retry() tests for. */
#define XA_RETRY_ENTRY                xa_mk_internal(256)

/**
 * xa_is_retry() - Is the entry a retry entry?
 * @entry: Entry retrieved from the XArray
 *
 * Return: %true if the entry is a retry entry.
 */
static inline bool xa_is_retry(const void *entry)
{
        const void *retry = XA_RETRY_ENTRY;

        /* Hinted as the uncommon case. */
        return unlikely(entry == retry);
}

/**
 * xa_is_advanced() - Is the entry only permitted for the advanced API?
 * @entry: Entry to be stored in the XArray.
 *
 * Return: %true if the entry cannot be stored by the normal API.
 */
static inline bool xa_is_advanced(const void *entry)
{
        if (!xa_is_internal(entry))
                return false;

        /* Internal entries up to and including the retry entry qualify. */
        return entry <= XA_RETRY_ENTRY;
}

/**
 * typedef xa_update_node_t - A callback function from the XArray.
 * @node: The node which is being processed
 *
 * This function is called every time the XArray updates the count of
 * present and value entries in a node.  It allows advanced users to
 * maintain the private_list in the node.
 *
 * Context: The xa_lock is held and interrupts may be disabled.
 *            Implementations should not drop the xa_lock, nor re-enable
 *            interrupts.
 */
typedef void (*xa_update_node_t)(struct xa_node *node);

void xa_delete_node(struct xa_node *, xa_update_node_t);

/*
 * The xa_state is opaque to its users.  It contains various different pieces
 * of state involved in the current operation on the XArray.  It should be
 * declared on the stack and passed between the various internal routines.
 * The various elements in it should not be accessed directly, but only
 * through the provided accessor functions.  The below documentation is for
 * the benefit of those working on the code, not for users of the XArray.
 *
 * @xa_node usually points to the xa_node containing the slot we're operating
 * on (and @xa_offset is the offset in the slots array).  If there is a
 * single entry in the array at index 0, there are no allocated xa_nodes to
 * point to, and so we store %NULL in @xa_node.  @xa_node is set to
 * the value %XAS_RESTART if the xa_state is not walked to the correct
 * position in the tree of nodes for this operation.  If an error occurs
 * during an operation, it is set to an %XAS_ERROR value.  If we run off the
 * end of the allocated nodes, it is set to %XAS_BOUNDS.
 */
struct xa_state {
        struct xarray *xa;                /* Array being operated on */
        unsigned long xa_index;                /* Index of interest */
        unsigned char xa_shift;                /* Set from the order by xas_set_order() */
        unsigned char xa_sibs;                /* Set from the order by xas_set_order() */
        unsigned char xa_offset;        /* Offset in @xa_node's slots array */
        unsigned char xa_pad;                /* Helps gcc generate better code */
        struct xa_node *xa_node;        /* Node, NULL, XAS_* value or error */
        /* NOTE(review): starts as NULL; its use is not visible here - confirm. */
        struct xa_node *xa_alloc;
        xa_update_node_t xa_update;        /* Callback set by xas_set_update() */
        struct list_lru *xa_lru;        /* Set by xas_set_lru() */
};

/*
 * We encode errnos in the xas->xa_node.  If an error has happened, we need to
 * drop the lock to fix it, and once we've done so the xa_state is invalid.
 *
 * XA_ERROR() shifts the errno left by two bits and sets bit 1.  The
 * argument is parenthesised so that any expression (for example a
 * conditional) is converted and shifted as a whole.
 * XAS_BOUNDS and XAS_RESTART are the fixed special values 1 and 3.
 */
#define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)(errno) << 2) | 2UL))
#define XAS_BOUNDS        ((struct xa_node *)1UL)
#define XAS_RESTART        ((struct xa_node *)3UL)

/*
 * Initialiser for a struct xa_state.  The caller supplies the array,
 * index, shift and sibling count; every other field starts out zero or
 * NULL, and the state starts in the XAS_RESTART position.
 */
#define __XA_STATE(array, index, shift, sibs)  {        \
        .xa = array,                                        \
        .xa_index = index,                                \
        .xa_shift = shift,                                \
        .xa_sibs = sibs,                                \
        .xa_offset = 0,                                        \
        .xa_pad = 0,                                        \
        .xa_node = XAS_RESTART,                                \
        .xa_alloc = NULL,                                \
        .xa_update = NULL,                                \
        .xa_lru = NULL,                                        \
}

/**
 * XA_STATE() - Declare an XArray operation state.
 * @name: Name of this operation state (usually xas).
 * @array: Array to operate on.
 * @index: Initial index of interest.
 *
 * Declare and initialise an xa_state on the stack.  The shift and
 * sibling count are both zero; use XA_STATE_ORDER() to specify an order.
 */
#define XA_STATE(name, array, index)                                \
        struct xa_state name = __XA_STATE(array, index, 0, 0)

/**
 * XA_STATE_ORDER() - Declare an XArray operation state.
 * @name: Name of this operation state (usually xas).
 * @array: Array to operate on.
 * @index: Initial index of interest.
 * @order: Order of entry.
 *
 * Declare and initialise an xa_state on the stack.  This variant of
 * XA_STATE() allows you to specify the 'order' of the element you
 * want to operate on.  @index is rounded down to a multiple of
 * 2^@order.  @index and @order are evaluated more than once, so they
 * must not have side effects.
 */
#define XA_STATE_ORDER(name, array, index, order)                \
        struct xa_state name = __XA_STATE(array,                \
                        ((index) >> (order)) << (order),        \
                        (order) - ((order) % XA_CHUNK_SHIFT),        \
                        (1U << ((order) % XA_CHUNK_SHIFT)) - 1)

/*
 * Convenience wrappers: each one forwards to the xa_*() operation of the
 * same name on the array referenced by the xa_state.
 */
#define xas_marked(xas, mark)        xa_marked((xas)->xa, (mark))
#define xas_trylock(xas)        xa_trylock((xas)->xa)
#define xas_lock(xas)                xa_lock((xas)->xa)
#define xas_unlock(xas)                xa_unlock((xas)->xa)
#define xas_lock_bh(xas)        xa_lock_bh((xas)->xa)
#define xas_unlock_bh(xas)        xa_unlock_bh((xas)->xa)
#define xas_lock_irq(xas)        xa_lock_irq((xas)->xa)
#define xas_unlock_irq(xas)        xa_unlock_irq((xas)->xa)
#define xas_lock_irqsave(xas, flags) \
                                xa_lock_irqsave((xas)->xa, flags)
#define xas_unlock_irqrestore(xas, flags) \
                                xa_unlock_irqrestore((xas)->xa, flags)

/**
 * xas_error() - Return an errno stored in the xa_state.
 * @xas: XArray operation state.
 *
 * Return: 0 if no error has been noted.  A negative errno if one has.
 */
static inline int xas_error(const struct xa_state *xas)
{
        /* Errors live in the node pointer; xa_err() decodes them. */
        void *state = xas->xa_node;

        return xa_err(state);
}

/**
 * xas_set_err() - Note an error in the xa_state.
 * @xas: XArray operation state.
 * @err: Negative error number.
 *
 * Only call this function with a negative @err; zero or positive errors
 * will probably not behave the way you think they should.  If you want
 * to clear the error from an xa_state, use xas_reset().
 */
static inline void xas_set_err(struct xa_state *xas, long err)
{
        /* The errno replaces whatever position the state was holding. */
        struct xa_node *encoded = XA_ERROR(err);

        xas->xa_node = encoded;
}

/**
 * xas_invalid() - Is the xas in a retry or error state?
 * @xas: XArray operation state.
 *
 * Return: %true if the xas cannot be used for operations.
 */
static inline bool xas_invalid(const struct xa_state *xas)
{
        /* Either of the two low bits being set marks a special value. */
        unsigned long low_bits = (unsigned long)xas->xa_node & 3;

        return low_bits != 0;
}

/**
 * xas_valid() - Is the xas a valid cursor into the array?
 * @xas: XArray operation state.
 *
 * Return: %true if the xas can be used for operations.
 */
static inline bool xas_valid(const struct xa_state *xas)
{
        /* Exactly the complement of xas_invalid(). */
        if (xas_invalid(xas))
                return false;

        return true;
}

/**
 * xas_is_node() - Does the xas point to a node?
 * @xas: XArray operation state.
 *
 * Return: %true if the xas currently references a node.
 */
static inline bool xas_is_node(const struct xa_state *xas)
{
        if (!xas_valid(xas))
                return false;

        /* A valid state with a NULL node does not reference a node. */
        return xas->xa_node != NULL;
}

/* True if the pointer is something other than a node */
static inline bool xas_not_node(struct xa_node *node)
{
        /* NULL is not a node. */
        if (!node)
                return true;

        /* Neither is anything with one of the two low bits set. */
        return ((unsigned long)node & 3) != 0;
}

/* True if the node represents RESTART or an error */
static inline bool xas_frozen(struct xa_node *node)
{
        /* Bit 1 is set in XAS_RESTART and in every XA_ERROR() value. */
        unsigned long bit1 = (unsigned long)node & 2;

        return bit1 != 0;
}

/* True if the node represents head-of-tree, RESTART or BOUNDS */
static inline bool xas_top(struct xa_node *node)
{
        /* NULL, XAS_BOUNDS (1) and XAS_RESTART (3) all compare <= XAS_RESTART. */
        struct xa_node *limit = XAS_RESTART;

        return node <= limit;
}

/**
 * xas_reset() - Reset an XArray operation state.
 * @xas: XArray operation state.
 *
 * Resets the error or walk state of the @xas so future walks of the
 * array will start from the root.  Use this if you have dropped the
 * xarray lock and want to reuse the xa_state.
 *
 * Context: Any context.
 */
static inline void xas_reset(struct xa_state *xas)
{
        /* Overwrites any error or position; only the node field is changed. */
        xas->xa_node = XAS_RESTART;
}

/**
 * xas_retry() - Retry the operation if appropriate.
 * @xas: XArray operation state.
 * @entry: Entry from xarray.
 *
 * The advanced functions may sometimes return an internal entry, such as
 * a retry entry or a zero entry.  This function sets up the @xas to restart
 * the walk from the head of the array if needed.
 *
 * Context: Any context.
 * Return: true if the operation needs to be retried.
 */
static inline bool xas_retry(struct xa_state *xas, const void *entry)
{
        /* A zero entry asks for a retry without resetting the walk. */
        if (xa_is_zero(entry))
                return true;

        /* A retry entry restarts the walk from the head of the array. */
        if (xa_is_retry(entry)) {
                xas_reset(xas);
                return true;
        }

        return false;
}

void *xas_load(struct xa_state *);
void *xas_store(struct xa_state *, void *entry);
void *xas_find(struct xa_state *, unsigned long max);
void *xas_find_conflict(struct xa_state *);

bool xas_get_mark(const struct xa_state *, xa_mark_t);
void xas_set_mark(const struct xa_state *, xa_mark_t);
void xas_clear_mark(const struct xa_state *, xa_mark_t);
void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t);
void xas_init_marks(const struct xa_state *);

bool xas_nomem(struct xa_state *, gfp_t);
void xas_destroy(struct xa_state *);
void xas_pause(struct xa_state *);

void xas_create_range(struct xa_state *);

/*
 * Multi-index support.  With CONFIG_XARRAY_MULTI the real implementations
 * are provided elsewhere; otherwise the inline stubs below are used.
 */
#ifdef CONFIG_XARRAY_MULTI
int xa_get_order(struct xarray *, unsigned long index);
int xas_get_order(struct xa_state *xas);
void xas_split(struct xa_state *, void *entry, unsigned int order);
void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
#else
/* Stub: the reported order is always 0. */
static inline int xa_get_order(struct xarray *xa, unsigned long index)
{
        return 0;
}

/* Stub: the reported order is always 0. */
static inline int xas_get_order(struct xa_state *xas)
{
        return 0;
}

/* Stub: @order is ignored and @entry is simply stored. */
static inline void xas_split(struct xa_state *xas, void *entry,
                unsigned int order)
{
        xas_store(xas, entry);
}

/* Stub: does nothing. */
static inline void xas_split_alloc(struct xa_state *xas, void *entry,
                unsigned int order, gfp_t gfp)
{
}
#endif

/**
 * xas_reload() - Refetch an entry from the xarray.
 * @xas: XArray operation state.
 *
 * Use this function to check that a previously loaded entry still has
 * the same value.  This is useful for the lockless pagecache lookup where
 * we walk the array with only the RCU lock to protect us, lock the page,
 * then check that the page hasn't moved since we looked it up.
 *
 * The caller guarantees that @xas is still valid.  If it may be in an
 * error or restart state, call xas_load() instead.
 *
 * Return: The entry at this location in the xarray.
 */
static inline void *xas_reload(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        void *entry;
        /*
         * xa_entry() takes the offset as unsigned int.  Plain char has
         * implementation-defined signedness, so it is avoided here.
         */
        unsigned int offset;

        /* No node: the entry is stored directly in the head of the array. */
        if (!node)
                return xa_head(xas->xa);
        if (IS_ENABLED(CONFIG_XARRAY_MULTI)) {
                /* Recompute the slot from the index, then follow a sibling. */
                offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK;
                entry = xa_entry(xas->xa, node, offset);
                if (!xa_is_sibling(entry))
                        return entry;
                offset = xa_to_sibling(entry);
        } else {
                offset = xas->xa_offset;
        }
        return xa_entry(xas->xa, node, offset);
}

/**
 * xas_set() - Set up XArray operation state for a different index.
 * @xas: XArray operation state.
 * @index: New index into the XArray.
 *
 * Move the operation state to refer to a different index.  This will
 * have the effect of starting a walk from the top; see xas_next()
 * to move to an adjacent index.
 */
static inline void xas_set(struct xa_state *xas, unsigned long index)
{
        xas->xa_index = index;
        /* Restart so the next operation walks down to the new index. */
        xas_reset(xas);
}

/**
 * xas_advance() - Skip over sibling entries.
 * @xas: XArray operation state.
 * @index: Index of last sibling entry.
 *
 * Move the operation state to refer to the last sibling entry.
 * This is useful for loops that normally want to see sibling
 * entries but sometimes want to skip them.  Use xas_set() if you
 * want to move to an index which is not part of this entry.
 */
static inline void xas_advance(struct xa_state *xas, unsigned long index)
{
        unsigned char shift = 0;

        /* Use the current node's shift only when the state references a node. */
        if (xas_is_node(xas))
                shift = xas->xa_node->shift;

        xas->xa_offset = (index >> shift) & XA_CHUNK_MASK;
        xas->xa_index = index;
}

/**
 * xas_set_order() - Set up XArray operation state for a multislot entry.
 * @xas: XArray operation state.
 * @index: Target of the operation.
 * @order: Entry occupies 2^@order indices.
 */
static inline void xas_set_order(struct xa_state *xas, unsigned long index,
                                        unsigned int order)
{
#ifdef CONFIG_XARRAY_MULTI
        /* Remainder of @order after removing whole multiples of the chunk shift. */
        unsigned int partial = order % XA_CHUNK_SHIFT;

        /* Round @index down to the order; orders of BITS_PER_LONG or more give 0. */
        if (order < BITS_PER_LONG)
                xas->xa_index = (index >> order) << order;
        else
                xas->xa_index = 0;
        xas->xa_shift = order - partial;
        xas->xa_sibs = (1 << partial) - 1;
        xas_reset(xas);
#else
        /* Without multi-index support only order 0 is accepted. */
        BUG_ON(order > 0);
        xas_set(xas, index);
#endif
}

/**
 * xas_set_update() - Set up XArray operation state for a callback.
 * @xas: XArray operation state.
 * @update: Function to call when updating a node.
 *
 * The XArray can notify a caller after it has updated an xa_node.
 * This is advanced functionality and is only needed by the page
 * cache and swap cache.
 */
static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update)
{
        /* Only recorded here; this function does not call it. */
        xas->xa_update = update;
}

/*
 * xas_set_lru() - Record a list_lru in the XArray operation state.
 * @xas: XArray operation state.
 * @lru: The list_lru to store in @xas.
 *
 * NOTE(review): how the stored list_lru is consumed is not visible in
 * this header - confirm against the implementation.
 */
static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru)
{
        xas->xa_lru = lru;
}

/**
 * xas_next_entry() - Advance iterator to next present entry.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 *
 * xas_next_entry() is an inline function to optimise xarray traversal for
 * speed.  It is equivalent to calling xas_find(), and will call xas_find()
 * for all the hard cases.
 *
 * Return: The next present entry after the one currently referred to by @xas.
 */
static inline void *xas_next_entry(struct xa_state *xas, unsigned long max)
{
        struct xa_node *leaf = xas->xa_node;
        void *found;

        /*
         * The fast path needs a real node with shift 0 whose offset
         * agrees with the index; anything else goes to xas_find().
         */
        if (unlikely(xas_not_node(leaf)))
                return xas_find(xas, max);
        if (unlikely(leaf->shift ||
                        xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)))
                return xas_find(xas, max);

        for (;;) {
                /* At @max or at the last slot of the node: hand over. */
                if (unlikely(xas->xa_index >= max))
                        return xas_find(xas, max);
                if (unlikely(xas->xa_offset == XA_CHUNK_MASK))
                        return xas_find(xas, max);

                found = xa_entry(xas->xa, leaf, xas->xa_offset + 1);
                /* Internal entries are a hard case, and the state is not advanced. */
                if (unlikely(xa_is_internal(found)))
                        return xas_find(xas, max);

                xas->xa_offset++;
                xas->xa_index++;
                if (found)
                        return found;
        }
}

/* Private */
static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance,
                xa_mark_t mark)
{
        unsigned long *bitmap = xas->xa_node->marks[(__force unsigned)mark];
        unsigned int start = xas->xa_offset;
        unsigned long word;

        /* With @advance the search begins after the current slot. */
        if (advance)
                start++;

        /* General case: the bitmap is not exactly one long wide. */
        if (XA_CHUNK_SIZE != BITS_PER_LONG)
                return find_next_bit(bitmap, XA_CHUNK_SIZE, start);

        /* Single-word case: nothing lies at or beyond the chunk size. */
        if (start >= XA_CHUNK_SIZE)
                return XA_CHUNK_SIZE;

        /* Drop the bits below @start and take the lowest one remaining. */
        word = *bitmap & (~0UL << start);
        if (word)
                return __ffs(word);
        return XA_CHUNK_SIZE;
}

/**
 * xas_next_marked() - Advance iterator to next marked entry.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 * @mark: Mark to search for.
 *
 * xas_next_marked() is an inline function to optimise xarray traversal for
 * speed.  It is equivalent to calling xas_find_marked(), and will call
 * xas_find_marked() for all the hard cases.
 *
 * Return: The next marked entry after the one currently referred to by @xas.
 */
static inline void *xas_next_marked(struct xa_state *xas, unsigned long max,
                                                                xa_mark_t mark)
{
        struct xa_node *leaf = xas->xa_node;
        unsigned int slot;
        void *found;

        /* The fast path only handles a real node with shift 0. */
        if (unlikely(xas_not_node(leaf) || leaf->shift))
                return xas_find_marked(xas, max, mark);

        /* Next marked slot in this node, or XA_CHUNK_SIZE if there is none. */
        slot = xas_find_chunk(xas, true, mark);
        xas->xa_offset = slot;
        xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + slot;

        if (xas->xa_index > max)
                return NULL;
        if (slot == XA_CHUNK_SIZE)
                return xas_find_marked(xas, max, mark);

        /* A marked but empty slot is left to the slow path. */
        found = xa_entry(xas->xa, leaf, slot);
        return found ? found : xas_find_marked(xas, max, mark);
}

/*
 * Batch size for long iterations: if iterating while holding a lock,
 * drop the lock and reschedule every %XA_CHECK_SCHED loops.
 */
enum {
        XA_CHECK_SCHED = 4096,
};

/**
 * xas_for_each() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 * @max: Maximum index to retrieve from array.
 *
 * The loop body will be executed for each entry present in the xarray
 * between the current xas position and @max.  @entry will be set to
 * the entry retrieved from the xarray.  It is safe to delete entries
 * from the array in the loop body.  You should hold either the RCU lock
 * or the xa_lock while iterating.  If you need to drop the lock, call
 * xas_pause() first.
 *
 * The first entry is located with xas_find() and each later one with
 * xas_next_entry(); the loop ends when either returns %NULL.
 */
#define xas_for_each(xas, entry, max) \
        for (entry = xas_find(xas, max); entry; \
             entry = xas_next_entry(xas, max))

/**
 * xas_for_each_marked() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 * @max: Maximum index to retrieve from array.
 * @mark: Mark to search for.
 *
 * The loop body will be executed for each marked entry in the xarray
 * between the current xas position and @max.  @entry will be set to
 * the entry retrieved from the xarray.  It is safe to delete entries
 * from the array in the loop body.  You should hold either the RCU lock
 * or the xa_lock while iterating.  If you need to drop the lock, call
 * xas_pause() first.
 *
 * The first entry is located with xas_find_marked() and each later one
 * with xas_next_marked(); the loop ends when either returns %NULL.
 */
#define xas_for_each_marked(xas, entry, max, mark) \
        for (entry = xas_find_marked(xas, max, mark); entry; \
             entry = xas_next_marked(xas, max, mark))

/**
 * xas_for_each_conflict() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 *
 * The loop body will be executed for each entry in the XArray that
 * lies within the range specified by @xas.  If the loop terminates
 * normally, @entry will be %NULL.  The user may break out of the loop,
 * which will leave @entry set to the conflicting entry.  The caller
 * may also call xas_set_err() to exit the loop while setting an error
 * to record the reason.
 */
#define xas_for_each_conflict(xas, entry) \
        while ((entry = xas_find_conflict(xas)))

void *__xas_next(struct xa_state *);
void *__xas_prev(struct xa_state *);

/**
 * xas_prev() - Move iterator to previous index.
 * @xas: XArray operation state.
 *
 * If the @xas was in an error state, it will remain in an error state
 * and this function will return %NULL.  If the @xas has never been walked,
 * it will have the effect of calling xas_load().  Otherwise one will be
 * subtracted from the index and the state will be walked to the correct
 * location in the array for the next operation.
 *
 * If the iterator was referencing index 0, this function wraps
 * around to %ULONG_MAX.
 *
 * Return: The entry at the new index.  This may be %NULL or an internal
 * entry.
 */
static inline void *xas_prev(struct xa_state *xas)
{
        struct xa_node *leaf = xas->xa_node;

        /* Not positioned on a node: take the out-of-line path. */
        if (unlikely(xas_not_node(leaf)))
                return __xas_prev(xas);
        /* Non-zero shift, or already at slot 0 of this node: same. */
        if (unlikely(leaf->shift || xas->xa_offset == 0))
                return __xas_prev(xas);

        /* Fast path: step back one slot within the same node. */
        xas->xa_offset--;
        xas->xa_index--;
        return xa_entry(xas->xa, leaf, xas->xa_offset);
}

/**
 * xas_next() - Move state to next index.
 * @xas: XArray operation state.
 *
 * If the @xas was in an error state, it will remain in an error state
 * and this function will return %NULL.  If the @xas has never been walked,
 * it will have the effect of calling xas_load().  Otherwise one will be
 * added to the index and the state will be walked to the correct
 * location in the array for the next operation.
 *
 * If the iterator was referencing index %ULONG_MAX, this function wraps
 * around to 0.
 *
 * Return: The entry at the new index.  This may be %NULL or an internal
 * entry.
 */
static inline void *xas_next(struct xa_state *xas)
{
        struct xa_node *leaf = xas->xa_node;

        /* Not positioned on a node: take the out-of-line path. */
        if (unlikely(xas_not_node(leaf)))
                return __xas_next(xas);
        /* Non-zero shift, or already at the last slot of this node: same. */
        if (unlikely(leaf->shift || xas->xa_offset == XA_CHUNK_MASK))
                return __xas_next(xas);

        /* Fast path: step forward one slot within the same node. */
        xas->xa_offset++;
        xas->xa_index++;
        return xa_entry(xas->xa, leaf, xas->xa_offset);
}

#endif /* _LINUX_XARRAY_H */





























































































































































    1 















































































































































































    1 








    1 



    1 


    1 


    1 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 * Copyright (C) 2014 Fujitsu.  All rights reserved.
 */

#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
#include <trace/events/btrfs.h>
#include "async-thread.h"

/* Bit numbers used with set_bit()/test_bit() on btrfs_work::flags. */
enum {
        /*
         * Set by btrfs_work_helper() after work->func() has returned, for
         * works that have an ordered_func.
         */
        WORK_DONE_BIT,
        /*
         * Set by run_ordered_work() when it claims a work item, right before
         * calling its ordered_func.
         */
        WORK_ORDER_DONE_BIT,
};

/* Value of btrfs_workqueue::thresh that disables the threshold hooks. */
#define NO_THRESHOLD (-1)
/*
 * Default threshold: btrfs_alloc_workqueue() substitutes it for thresh == 0
 * and disables thresholding for any smaller value.
 */
#define DFT_THRESHOLD (32)

/*
 * A kernel workqueue wrapped with the state needed for ordered completion
 * callbacks and for adjusting max_active from the number of pending works.
 */
struct btrfs_workqueue {
        /* Underlying kernel workqueue the work items are queued on */
        struct workqueue_struct *normal_wq;

        /* File system this workqueue services */
        struct btrfs_fs_info *fs_info;

        /* List head pointing to ordered work list */
        struct list_head ordered_list;

        /* Spinlock for ordered_list */
        spinlock_t list_lock;

        /*
         * Thresholding related variables.
         *
         * pending is incremented when a work is queued and decremented right
         * before it executes; it is only maintained when thresh is not
         * NO_THRESHOLD.
         */
        atomic_t pending;

        /* Upper limit of concurrent workers */
        int limit_active;

        /* Current number of concurrent workers */
        int current_active;

        /* Threshold to change current_active */
        int thresh;
        /* Call counter, kept modulo thresh / 4 by thresh_exec_hook() */
        unsigned int count;
        /* Held by thresh_exec_hook() while updating count and current_active */
        spinlock_t thres_lock;
};

/* Return the filesystem recorded in @wq when it was initialized. */
struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq)
{
        struct btrfs_fs_info *owner = wq->fs_info;

        return owner;
}

/*
 * Return the filesystem of the workqueue @work was last queued on.
 * Dereferences work->wq, which is assigned by btrfs_queue_work().
 */
struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
{
        const struct btrfs_workqueue *queue = work->wq;

        return queue->fs_info;
}

/*
 * Report whether @wq has more than twice its threshold of pending works.
 *
 * A queue without a threshold is never reported as congested: its pending
 * counter is not maintained. Supporting that case (e.g. by comparing against
 * num_online_cpus()) would require moving the atomic_inc/dec out of the
 * thresh == NO_THRESHOLD early returns in thresh_queue/exec_hook, so it is
 * postponed until someone needs it.
 */
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
{
        return wq->thresh != NO_THRESHOLD &&
               atomic_read(&wq->pending) > wq->thresh * 2;
}

/*
 * Set up the parts of @wq common to regular and ordered workqueues: both
 * spinlocks, the empty ordered list, a zero pending count and the owning
 * filesystem. The concurrency fields are left to the caller.
 */
static void btrfs_init_workqueue(struct btrfs_workqueue *wq,
                                 struct btrfs_fs_info *fs_info)
{
        spin_lock_init(&wq->list_lock);
        spin_lock_init(&wq->thres_lock);
        INIT_LIST_HEAD(&wq->ordered_list);
        atomic_set(&wq->pending, 0);
        wq->fs_info = fs_info;
}

/*
 * Allocate a btrfs workqueue named "btrfs-<name>".
 *
 * @limit_active is the upper bound for max_active. @thresh selects the
 * pending-work threshold: 0 means DFT_THRESHOLD, and anything below
 * DFT_THRESHOLD disables thresholding.
 *
 * Returns the new workqueue, or NULL if either allocation fails.
 */
struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
                                              const char *name, unsigned int flags,
                                              int limit_active, int thresh)
{
        struct btrfs_workqueue *wq;

        wq = kzalloc(sizeof(*wq), GFP_KERNEL);
        if (!wq)
                return NULL;

        btrfs_init_workqueue(wq, fs_info);
        wq->limit_active = limit_active;

        if (!thresh)
                thresh = DFT_THRESHOLD;

        if (thresh >= DFT_THRESHOLD) {
                /*
                 * Threshold-able queue: start with the minimal max_active to
                 * save resources and let concurrency grow on demand.
                 */
                wq->thresh = thresh;
                wq->current_active = 1;
        } else {
                /* A low threshold is not worth tracking; run at the limit. */
                wq->thresh = NO_THRESHOLD;
                wq->current_active = limit_active;
        }

        wq->normal_wq = alloc_workqueue("btrfs-%s", flags, wq->current_active,
                                        name);
        if (wq->normal_wq) {
                trace_btrfs_workqueue_alloc(wq, name);
                return wq;
        }

        kfree(wq);
        return NULL;
}

/*
 * Allocate a btrfs workqueue backed by an ordered kernel workqueue named
 * "btrfs-<name>". Concurrency is fixed at 1 and thresholding is disabled.
 *
 * Returns the new workqueue, or NULL if either allocation fails.
 */
struct btrfs_workqueue *btrfs_alloc_ordered_workqueue(
                                struct btrfs_fs_info *fs_info, const char *name,
                                unsigned int flags)
{
        struct btrfs_workqueue *wq = kzalloc(sizeof(*wq), GFP_KERNEL);

        if (!wq)
                return NULL;

        btrfs_init_workqueue(wq, fs_info);

        /* Ordered workqueues don't allow @max_active adjustments. */
        wq->thresh = NO_THRESHOLD;
        wq->current_active = 1;
        wq->limit_active = 1;

        wq->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name);
        if (wq->normal_wq) {
                trace_btrfs_workqueue_alloc(wq, name);
                return wq;
        }

        kfree(wq);
        return NULL;
}

/*
 * Threshold hook run from btrfs_queue_work(): count one more pending work.
 *
 * This hook WILL be called in IRQ handler context, so
 * workqueue_set_max_active MUST NOT be called from it.
 */
static inline void thresh_queue_hook(struct btrfs_workqueue *wq)
{
        if (wq->thresh != NO_THRESHOLD)
                atomic_inc(&wq->pending);
}

/*
 * Threshold hook run right before a work is executed.
 *
 * It is called in kthread context, so this is where
 * workqueue_set_max_active is called. The hook drops the pending count
 * and, depending on how that count compares with the threshold, moves
 * current_active up or down by one within [1, limit_active].
 */
static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
{
        int changed = 0;
        int target;
        long queued;

        if (wq->thresh == NO_THRESHOLD)
                return;

        atomic_dec(&wq->pending);

        spin_lock(&wq->thres_lock);
        /*
         * wq->count is used to limit the calling frequency of
         * workqueue_set_max_active: nothing is recalculated on the call
         * where the counter wraps back to zero.
         */
        wq->count = (wq->count + 1) % (wq->thresh / 4);
        if (wq->count) {
                /*
                 * The pending count may change right after it is read; that
                 * is fine, the new max_active does not need to be exact.
                 */
                queued = atomic_read(&wq->pending);

                target = wq->current_active;
                if (queued > wq->thresh)
                        target++;
                if (queued < wq->thresh / 2)
                        target--;
                target = clamp_val(target, 1, wq->limit_active);

                if (target != wq->current_active) {
                        wq->current_active = target;
                        changed = 1;
                }
        }
        spin_unlock(&wq->thres_lock);

        /* Applied outside the spinlock, using the value stored in @wq. */
        if (changed)
                workqueue_set_max_active(wq->normal_wq, wq->current_active);
}

/*
 * Run the ordered completion functions of @wq in queueing order.
 *
 * Walks wq->ordered_list from the head and stops at the first work that is
 * either not done yet (WORK_DONE_BIT clear) or already claimed by another
 * caller (WORK_ORDER_DONE_BIT already set). For every work it claims, it
 * calls work->ordered_func(work, false) without the list lock held, removes
 * the work from the list, and then calls work->ordered_func(work, true).
 *
 * @self is the work item the calling worker is executing; its
 * ordered_func(self, true) call is deferred until the loop has finished.
 */
static void run_ordered_work(struct btrfs_workqueue *wq,
                             struct btrfs_work *self)
{
        struct list_head *list = &wq->ordered_list;
        struct btrfs_work *work;
        spinlock_t *lock = &wq->list_lock;
        unsigned long flags;
        bool free_self = false;

        while (1) {
                spin_lock_irqsave(lock, flags);
                if (list_empty(list))
                        break;
                work = list_entry(list->next, struct btrfs_work,
                                  ordered_list);
                if (!test_bit(WORK_DONE_BIT, &work->flags))
                        break;
                /*
                 * Orders all subsequent loads after reading WORK_DONE_BIT,
                 * paired with the smp_mb__before_atomic in btrfs_work_helper
                 * this guarantees that the ordered function will see all
                 * updates from ordinary work function.
                 */
                smp_rmb();

                /*
                 * we are going to call the ordered done function, but
                 * we leave the work item on the list as a barrier so
                 * that later work items that are done don't have their
                 * functions called before this one returns
                 */
                if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
                        break;
                trace_btrfs_ordered_sched(work);
                spin_unlock_irqrestore(lock, flags);
                work->ordered_func(work, false);

                /* now take the lock again and drop our item from the list */
                spin_lock_irqsave(lock, flags);
                list_del(&work->ordered_list);
                spin_unlock_irqrestore(lock, flags);

                if (work == self) {
                        /*
                         * This is the work item that the worker is currently
                         * executing.
                         *
                         * The kernel workqueue code guarantees non-reentrancy
                         * of work items. I.e., if a work item with the same
                         * address and work function is queued twice, the second
                         * execution is blocked until the first one finishes. A
                         * work item may be freed and recycled with the same
                         * work function; the workqueue code assumes that the
                         * original work item cannot depend on the recycled work
                         * item in that case (see find_worker_executing_work()).
                         *
                         * Note that different types of Btrfs work can depend on
                         * each other, and one type of work on one Btrfs
                         * filesystem may even depend on the same type of work
                         * on another Btrfs filesystem via, e.g., a loop device.
                         * Therefore, we must not allow the current work item to
                         * be recycled until we are really done, otherwise we
                         * break the above assumption and can deadlock.
                         */
                        free_self = true;
                } else {
                        /*
                         * We don't want to call the ordered free functions with
                         * the lock held.
                         */
                        work->ordered_func(work, true);
                        /* NB: work must not be dereferenced past this point. */
                        trace_btrfs_all_work_done(wq->fs_info, work);
                }
        }
        /* Every break above leaves the loop with the list lock still held. */
        spin_unlock_irqrestore(lock, flags);

        if (free_self) {
                self->ordered_func(self, true);
                /* NB: self must not be dereferenced past this point. */
                trace_btrfs_all_work_done(wq->fs_info, self);
        }
}

/*
 * Work function that btrfs_init_work() installs for every btrfs_work.
 *
 * Runs the threshold hook and then work->func(). If the work has an
 * ordered_func, it marks the work done and runs the ordered completions of
 * its workqueue; otherwise it only emits the "all work done" trace event.
 */
static void btrfs_work_helper(struct work_struct *normal_work)
{
        struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
                                               normal_work);
        struct btrfs_workqueue *wq = work->wq;
        int need_order = 0;

        /*
         * We should not touch things inside work in the following cases:
         * 1) after work->func() if it has no ordered_func(..., true) to free
         *    Since the struct is freed in work->func().
         * 2) after setting WORK_DONE_BIT
         *    The work may be freed in other threads almost instantly.
         * So we save the needed things here.
         */
        if (work->ordered_func)
                need_order = 1;

        trace_btrfs_work_sched(work);
        thresh_exec_hook(wq);
        work->func(work);
        if (need_order) {
                /*
                 * Ensures all memory accesses done in the work function are
                 * ordered before setting the WORK_DONE_BIT, so that the thread
                 * which is going to execute the ordered work sees them.
                 * Pairs with the smp_rmb in run_ordered_work.
                 */
                smp_mb__before_atomic();
                set_bit(WORK_DONE_BIT, &work->flags);
                run_ordered_work(wq, work);
        } else {
                /* NB: work must not be dereferenced past this point. */
                trace_btrfs_all_work_done(wq->fs_info, work);
        }
}

/*
 * Prepare @work for queueing: clear its flags, reset its ordered-list link,
 * bind it to btrfs_work_helper() and record the two callbacks.
 *
 * @func is the main work function. @ordered_func may be NULL, in which case
 * the work takes no part in ordered completion.
 */
void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
                     btrfs_ordered_func_t ordered_func)
{
        work->flags = 0;
        INIT_LIST_HEAD(&work->ordered_list);
        INIT_WORK(&work->normal_work, btrfs_work_helper);
        work->ordered_func = ordered_func;
        work->func = func;
}

void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work)
{
        unsigned long flags;

        work->wq = wq;
        thresh_queue_hook(wq);
        if (work->ordered_func) {
                spin_lock_irqsave(&wq->list_lock, flags);
                list_add_tail(&work->ordered_list, &wq->ordered_list);
                spin_unlock_irqrestore(&wq->list_lock, flags);
        }
        trace_btrfs_work_queued(work);
        queue_work(wq->normal_wq, &work->normal_work);
}

/*
 * Destroy the underlying kernel workqueue and free @wq.
 * A NULL @wq is accepted and ignored.
 */
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
        if (wq) {
                destroy_workqueue(wq->normal_wq);
                trace_btrfs_workqueue_destroy(wq);
                kfree(wq);
        }
}

/*
 * Update the upper bound used when current_active is recalculated.
 * Only the stored limit changes here. A NULL @wq is accepted and ignored.
 */
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
{
        if (!wq)
                return;

        wq->limit_active = limit_active;
}

/*
 * Flush the kernel workqueue backing @wq.
 * Unlike btrfs_destroy_workqueue(), @wq must not be NULL.
 */
void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
{
        struct workqueue_struct *backing = wq->normal_wq;

        flush_workqueue(backing);
}







































    3 





    2 






    2 
    3 




















































































































































































































    3 







    2 


    2 

    3 







    3 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/filesystems.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  table of configured filesystems
 */

#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs_parser.h>

/*
 * Handling of the list of filesystem drivers.
 * Rules:
 *        Insertion into, removal from and scanning of the list are protected
 *        by file_systems_lock (a rwlock).
 *        A module must call unregister_filesystem() while it is being unloaded.
 *        We can access the fields of a list element if:
 *                1) file_systems_lock is held, or
 *                2) we hold a reference to the module.
 *        The latter can be guaranteed by a call to try_module_get(); if it
 *        returned 0 we must skip the element, otherwise we got the reference.
 *        Once the reference is obtained we can drop the lock.
 */

static struct file_system_type *file_systems;
static DEFINE_RWLOCK(file_systems_lock);

/*
 * Take an additional reference on the module owning @fs and return @fs.
 *
 * WARNING: This can be used only if we _already_ own a reference.
 */
struct file_system_type *get_filesystem(struct file_system_type *fs)
{
        struct module *owner = fs->owner;

        __module_get(owner);
        return fs;
}

/* Drop one reference on the module owning @fs. */
void put_filesystem(struct file_system_type *fs)
{
        struct module *owner = fs->owner;

        module_put(owner);
}

/*
 * Look up the filesystem whose name is exactly the first @len bytes of @name.
 *
 * Returns the address of the link that points at the matching entry, or the
 * address of the terminating NULL link when nothing matches, so the result
 * can be used both to test for a match and to append a new entry.
 * The function takes no lock itself.
 */
static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
        struct file_system_type **slot = &file_systems;

        while (*slot) {
                const char *candidate = (*slot)->name;

                if (!strncmp(candidate, name, len) && candidate[len] == '\0')
                        return slot;
                slot = &(*slot)->next;
        }
        return slot;
}

/**
 *        register_filesystem - register a new filesystem
 *        @fs: the file system structure
 *
 *        Appends @fs to the list of file systems the kernel knows about for
 *        mount and other syscalls. Returns 0 on success, -EINVAL if the
 *        parameter description of @fs fails validation, or -EBUSY if @fs is
 *        already linked into a list or a file system of the same name is
 *        registered.
 *
 *        The &struct file_system_type that is passed is linked into the kernel
 *        structures and must not be freed until the file system has been
 *        unregistered.
 */

int register_filesystem(struct file_system_type * fs)
{
        struct file_system_type **slot;
        int err;

        if (fs->parameters &&
            !fs_validate_description(fs->name, fs->parameters))
                return -EINVAL;

        /* A '.' in a filesystem name is treated as a fatal error. */
        BUG_ON(strchr(fs->name, '.'));
        if (fs->next)
                return -EBUSY;

        write_lock(&file_systems_lock);
        slot = find_filesystem(fs->name, strlen(fs->name));
        if (*slot) {
                err = -EBUSY;
        } else {
                *slot = fs;
                err = 0;
        }
        write_unlock(&file_systems_lock);

        return err;
}

EXPORT_SYMBOL(register_filesystem);

/**
 *        unregister_filesystem - unregister a file system
 *        @fs: filesystem to unregister
 *
 *        Unlinks a previously registered file system from the kernel's list
 *        and then calls synchronize_rcu(). Returns 0 on success, or -EINVAL
 *        if @fs is not on the list.
 *
 *        Once this function has returned the &struct file_system_type structure
 *        may be freed or reused.
 */

int unregister_filesystem(struct file_system_type * fs)
{
        struct file_system_type **link;

        write_lock(&file_systems_lock);
        for (link = &file_systems; *link; link = &(*link)->next) {
                if (*link != fs)
                        continue;

                *link = fs->next;
                fs->next = NULL;
                write_unlock(&file_systems_lock);
                synchronize_rcu();
                return 0;
        }
        write_unlock(&file_systems_lock);

        return -EINVAL;
}

EXPORT_SYMBOL(unregister_filesystem);

#ifdef CONFIG_SYSFS_SYSCALL
/*
 * sysfs(1): return the position in the filesystem list of the entry whose
 * name equals the user-supplied string @__name, -EINVAL if there is no such
 * entry, or the error reported by getname().
 */
static int fs_index(const char __user * __name)
{
        struct file_system_type *fs;
        struct filename *fname;
        int pos = 0;
        int ret = -EINVAL;

        fname = getname(__name);
        if (IS_ERR(fname))
                return PTR_ERR(fname);

        read_lock(&file_systems_lock);
        for (fs = file_systems; fs; fs = fs->next) {
                if (!strcmp(fs->name, fname->name)) {
                        ret = pos;
                        break;
                }
                pos++;
        }
        read_unlock(&file_systems_lock);

        putname(fname);
        return ret;
}

/*
 * sysfs(2): copy the NUL-terminated name of the filesystem at position @index
 * to the user buffer @buf.
 *
 * Returns 0 on success, -EINVAL if there is no such entry or its module
 * reference could not be taken, and -EFAULT if the copy to user space fails.
 */
static int fs_name(unsigned int index, char __user * buf)
{
        struct file_system_type *fs;
        int len;
        int res = 0;

        read_lock(&file_systems_lock);
        fs = file_systems;
        while (fs) {
                /*
                 * @index is unsigned: if the reference cannot be taken at
                 * position 0 the counter wraps, so no later entry matches.
                 */
                if (index == 0 && try_module_get(fs->owner))
                        break;
                fs = fs->next;
                index--;
        }
        read_unlock(&file_systems_lock);

        if (!fs)
                return -EINVAL;

        /* OK, we got the reference, so we can safely block */
        len = strlen(fs->name) + 1;
        if (copy_to_user(buf, fs->name, len))
                res = -EFAULT;
        put_filesystem(fs);

        return res;
}

/* sysfs(3): return the number of entries currently on the filesystem list. */
static int fs_maxindex(void)
{
        struct file_system_type *fs;
        int total = 0;

        read_lock(&file_systems_lock);
        fs = file_systems;
        while (fs) {
                total++;
                fs = fs->next;
        }
        read_unlock(&file_systems_lock);

        return total;
}

/*
 * Whee.. Weird sysv syscall.
 *
 * @option selects the operation: 1 maps a filesystem name (@arg1) to its list
 * index, 2 copies the name at index @arg1 to the user buffer @arg2, 3 returns
 * the number of filesystems. Any other option yields -EINVAL.
 */
SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
{
        switch (option) {
        case 1:
                return fs_index((const char __user *) arg1);
        case 2:
                return fs_name(arg1, (char __user *) arg2);
        case 3:
                return fs_maxindex();
        default:
                return -EINVAL;
        }
}
#endif

/*
 * Pack the names of all filesystems flagged FS_REQUIRES_DEV into @buf as
 * consecutive NUL-terminated strings.
 *
 * @size is the capacity of @buf. Copying stops, with a warning, at the first
 * name that no longer fits. Returns the number of names copied.
 */
int __init list_bdev_fs_names(char *buf, size_t size)
{
        struct file_system_type *fs;
        int copied = 0;

        read_lock(&file_systems_lock);
        for (fs = file_systems; fs; fs = fs->next) {
                size_t need;

                if (fs->fs_flags & FS_REQUIRES_DEV) {
                        /* Name plus its terminating NUL. */
                        need = strlen(fs->name) + 1;
                        if (need > size) {
                                pr_warn("%s: truncating file system list\n", __func__);
                                break;
                        }
                        memcpy(buf, fs->name, need);
                        buf += need;
                        size -= need;
                        copied++;
                }
        }
        read_unlock(&file_systems_lock);

        return copied;
}

#ifdef CONFIG_PROC_FS
/*
 * seq_file show callback for the "filesystems" proc entry.
 *
 * Emits one line per registered filesystem: a first column that is "nodev"
 * unless the filesystem has FS_REQUIRES_DEV set, a tab, and the name.
 * Always returns 0; @v is unused.
 */
static int filesystems_proc_show(struct seq_file *m, void *v)
{
        struct file_system_type *fs;

        read_lock(&file_systems_lock);
        for (fs = file_systems; fs; fs = fs->next) {
                const char *kind;

                if (fs->fs_flags & FS_REQUIRES_DEV)
                        kind = "";
                else
                        kind = "nodev";
                seq_printf(m, "%s\t%s\n", kind, fs->name);
        }
        read_unlock(&file_systems_lock);

        return 0;
}

/*
 * Create the "filesystems" proc entry, served by filesystems_proc_show().
 * The result of proc_create_single() is not checked; this always returns 0.
 */
static int __init proc_filesystems_init(void)
{
        proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
        return 0;
}
module_init(proc_filesystems_init);
#endif

/*
 * Find the filesystem named by the first @len bytes of @name and take a
 * reference on its module.
 *
 * Returns the filesystem, or NULL if it is not registered or the module
 * reference could not be obtained.
 */
static struct file_system_type *__get_fs_type(const char *name, int len)
{
        struct file_system_type *found;

        read_lock(&file_systems_lock);
        found = *find_filesystem(name, len);
        if (found) {
                if (!try_module_get(found->owner))
                        found = NULL;
        }
        read_unlock(&file_systems_lock);

        return found;
}

/*
 * Look up a filesystem type by name, requesting the "fs-<name>" module if it
 * is not registered yet.
 *
 * Only the part of @name before the first '.' is used for the lookup. If
 * @name does contain a '.', the filesystem must have FS_HAS_SUBTYPE set;
 * otherwise the reference is dropped again and NULL is returned.
 *
 * Returns the filesystem with a module reference held, or NULL.
 */
struct file_system_type *get_fs_type(const char *name)
{
        const char *dot = strchr(name, '.');
        struct file_system_type *fs;
        int len;

        if (dot)
                len = dot - name;
        else
                len = strlen(name);

        fs = __get_fs_type(name, len);
        if (!fs) {
                if (request_module("fs-%.*s", len, name) == 0) {
                        fs = __get_fs_type(name, len);
                        if (!fs)
                                pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n",
                                             len, name);
                }
        }

        if (fs && dot && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
                put_filesystem(fs);
                fs = NULL;
        }

        return fs;
}

EXPORT_SYMBOL(get_fs_type);







































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_EXTENT_MAP_H
#define BTRFS_EXTENT_MAP_H

#include <linux/compiler_types.h>
#include <linux/rwlock_types.h>
#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include "misc.h"
#include "extent_map.h"
#include "compression.h"

struct btrfs_inode;
struct btrfs_fs_info;

/*
 * Reserved u64 values near the top of the range.
 * EXTENT_MAP_HOLE and EXTENT_MAP_INLINE are stored in
 * extent_map::block_start for holes and inline extents respectively.
 * NOTE(review): the use of EXTENT_MAP_LAST_BYTE is not visible in this
 * header - confirm against its users.
 */
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)

/* bits for the extent_map::flags field */
enum {
        /* this entry not yet on disk, don't free it */
        ENUM_BIT(EXTENT_FLAG_PINNED),
        /*
         * Compression type of the extent, see extent_map_set_compression()
         * and extent_map_compression().
         */
        ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB),
        ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO),
        ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD),
        /* pre-allocated extent */
        ENUM_BIT(EXTENT_FLAG_PREALLOC),
        /* Logging this extent */
        ENUM_BIT(EXTENT_FLAG_LOGGING),
        /* This em is merged from two or more physically adjacent ems */
        ENUM_BIT(EXTENT_FLAG_MERGED),
};

/*
 * This structure represents file extents and holes.
 *
 * Unlike on-disk file extent items, extent maps can be merged to save memory.
 * This means members only match file extent items before any merging.
 *
 * Keep this structure as compact as possible, as we can have really large
 * amounts of allocated extent maps at any time.
 */
struct extent_map {
        /* Tree linkage; extent_map_in_tree() tests whether it is in use. */
        struct rb_node rb_node;

        /* All of these are in bytes. */

        /* File offset matching the offset of a BTRFS_EXTENT_ITEM_KEY key. */
        u64 start;

        /*
         * Length of the file extent.
         *
         * For non-inlined file extents it's btrfs_file_extent_item::num_bytes.
         * For inline extents it's sectorsize, since inline data starts at
         * offsetof(struct btrfs_file_extent_item, disk_bytenr) thus
         * btrfs_file_extent_item::num_bytes is not valid.
         */
        u64 len;

        /*
         * The file offset of the original file extent before splitting.
         *
         * This is an in-memory only member, matching
         * extent_map::start - btrfs_file_extent_item::offset for
         * regular/preallocated extents. EXTENT_MAP_HOLE otherwise.
         */
        u64 orig_start;

        /*
         * The full on-disk extent length, matching
         * btrfs_file_extent_item::disk_num_bytes.
         */
        u64 orig_block_len;

        /*
         * The decompressed size of the whole on-disk extent, matching
         * btrfs_file_extent_item::ram_bytes.
         */
        u64 ram_bytes;

        /*
         * The on-disk logical bytenr for the file extent.
         *
         * For compressed extents it matches btrfs_file_extent_item::disk_bytenr.
         * For uncompressed extents it matches
         * btrfs_file_extent_item::disk_bytenr + btrfs_file_extent_item::offset
         *
         * For holes it is EXTENT_MAP_HOLE and for inline extents it is
         * EXTENT_MAP_INLINE.
         */
        u64 block_start;

        /*
         * The on-disk length for the file extent.
         *
         * For compressed extents it matches btrfs_file_extent_item::disk_num_bytes.
         * For uncompressed extents it matches extent_map::len.
         * For holes and inline extents it's -1 and shouldn't be used.
         */
        u64 block_len;

        /*
         * Generation of the extent map, for merged em it's the highest
         * generation of all merged ems.
         * For non-merged extents, it's from btrfs_file_extent_item::generation.
         */
        u64 generation;
        /* Bitmask of EXTENT_FLAG_* values. */
        u32 flags;
        /* Reference count of this extent map. */
        refcount_t refs;
        /*
         * List linkage.
         * NOTE(review): presumably links the map into
         * extent_map_tree::modified_extents - confirm against the users.
         */
        struct list_head list;
};

/*
 * Container for a set of extent maps, initialized by extent_map_tree_init().
 * NOTE(review): what each member holds and what the lock covers is defined
 * by the implementation, which is not part of this header - confirm there.
 */
struct extent_map_tree {
        /* Cached rb-tree root; presumably indexes extent_map::rb_node */
        struct rb_root_cached map;
        /* List head; presumably chains extent_map::list entries */
        struct list_head modified_extents;
        /* Reader/writer lock; presumably protects this tree */
        rwlock_t lock;
};

struct btrfs_inode;

/*
 * Record compression @type in em->flags by setting the matching
 * EXTENT_FLAG_COMPRESS_* bit. Any other type leaves the flags untouched;
 * bits that are already set are never cleared.
 */
static inline void extent_map_set_compression(struct extent_map *em,
                                              enum btrfs_compression_type type)
{
        switch (type) {
        case BTRFS_COMPRESS_ZLIB:
                em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
                break;
        case BTRFS_COMPRESS_LZO:
                em->flags |= EXTENT_FLAG_COMPRESS_LZO;
                break;
        case BTRFS_COMPRESS_ZSTD:
                em->flags |= EXTENT_FLAG_COMPRESS_ZSTD;
                break;
        default:
                break;
        }
}

static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em)
{
        if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB)
                return BTRFS_COMPRESS_ZLIB;

        if (em->flags & EXTENT_FLAG_COMPRESS_LZO)
                return BTRFS_COMPRESS_LZO;

        if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD)
                return BTRFS_COMPRESS_ZSTD;

        return BTRFS_COMPRESS_NONE;
}

/*
 * More efficient way to determine if extent is compressed, instead of using
 * 'extent_map_compression() != BTRFS_COMPRESS_NONE'.
 */
static inline bool extent_map_is_compressed(const struct extent_map *em)
{
        return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB |
                             EXTENT_FLAG_COMPRESS_LZO |
                             EXTENT_FLAG_COMPRESS_ZSTD)) != 0;
}

/* Return 1 if the rb_node of @em is not marked empty, 0 otherwise. */
static inline int extent_map_in_tree(const struct extent_map *em)
{
        if (RB_EMPTY_NODE(&em->rb_node))
                return 0;

        return 1;
}

/*
 * Return the first byte offset past @em (start + len), saturating at
 * (u64)-1 when the sum would wrap around.
 */
static inline u64 extent_map_end(const struct extent_map *em)
{
        const u64 end = em->start + em->len;

        /* An unsigned sum smaller than an operand means it overflowed. */
        return end < em->start ? (u64)-1 : end;
}

/*
 * Extent map API. These are declarations only; the definitions are not part
 * of this header.
 */

/* Tree setup, lookup and modification. */
void extent_map_tree_init(struct extent_map_tree *tree);
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
                                         u64 start, u64 len);
void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em);
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
                     u64 new_logical);

/* Allocation and release of extent maps, and module init/exit. */
struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void __cold extent_map_exit(void);
/* Per-inode operations on ranges of extent maps. */
int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
                                         u64 start, u64 len);
int btrfs_add_extent_mapping(struct btrfs_inode *inode,
                             struct extent_map **em_in, u64 start, u64 len);
void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
                                 u64 start, u64 end,
                                 bool skip_pinned);
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
                                   struct extent_map *new_em,
                                   bool modified);
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);

#endif






























































































































































































































































































































































































































   16 







   13 









































































































































































































































































































































































































































































































































    7 



































































































































































    5 
    5 







    5 





























































































    4 




    4 






































































































































    2 
    1 






    1 
    1 
































    1 
    1 









    2 









    4 







    2 
    2 





    4 
    4 









    2 
    2 




    1 
    1 




    2 
    2 



    1 
    1 
































   13 




















    8 




    7 



























































    1 

















    1 








    1 

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock LSM - Filesystem management and hooks
 *
 * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
 * Copyright © 2018-2020 ANSSI
 * Copyright © 2021-2022 Microsoft Corporation
 * Copyright © 2022 Günther Noack <gnoack3000@gmail.com>
 * Copyright © 2023-2024 Google LLC
 */

#include <asm/ioctls.h>
#include <kunit/test.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/bits.h>
#include <linux/compiler_types.h>
#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/limits.h>
#include <linux/list.h>
#include <linux/lsm_hooks.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/path.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/types.h>
#include <linux/wait_bit.h>
#include <linux/workqueue.h>
#include <uapi/linux/fiemap.h>
#include <uapi/linux/landlock.h>

#include "common.h"
#include "cred.h"
#include "fs.h"
#include "limits.h"
#include "object.h"
#include "ruleset.h"
#include "setup.h"

/* Underlying object management */

/*
 * release_inode - Drop the inode reference held by a Landlock object
 *
 * @object: Object being released; @object->underobj is its inode, or NULL.
 *
 * Entered with @object->lock held (see the __releases() annotation) and
 * returns with it released on every path.  When @object->underobj is NULL
 * there is nothing to put and the function only unlocks.  Otherwise the inode
 * is detached from @object, landlock_inode(inode)->object is reset to NULL and
 * the inode reference is dropped with iput().  The superblock's inode_refs
 * counter is raised around the iput() call and its waiters are woken up when
 * it drops back to zero.
 */
static void release_inode(struct landlock_object *const object)
        __releases(object->lock)
{
        struct inode *const inode = object->underobj;
        struct super_block *sb;

        /* No inode is attached anymore: only the lock has to be released. */
        if (!inode) {
                spin_unlock(&object->lock);
                return;
        }

        /*
         * Protects against concurrent use by hook_sb_delete() of the reference
         * to the underlying inode.
         */
        object->underobj = NULL;
        /*
         * Makes sure that if the filesystem is concurrently unmounted,
         * hook_sb_delete() will wait for us to finish iput().
         */
        sb = inode->i_sb;
        atomic_long_inc(&landlock_superblock(sb)->inode_refs);
        spin_unlock(&object->lock);
        /*
         * Because object->underobj was not NULL, hook_sb_delete() and
         * get_inode_object() guarantee that it is safe to reset
         * landlock_inode(inode)->object while it is not NULL.  It is therefore
         * not necessary to lock inode->i_lock.
         */
        rcu_assign_pointer(landlock_inode(inode)->object, NULL);
        /*
         * Now, new rules can safely be tied to @inode with get_inode_object().
         */

        iput(inode);
        /*
         * Undoes the increment above; whoever waits on inode_refs is woken up
         * once the counter reaches zero.
         */
        if (atomic_long_dec_and_test(&landlock_superblock(sb)->inode_refs))
                wake_up_var(&landlock_superblock(sb)->inode_refs);
}

/* Operations of Landlock objects whose underlying object is an inode. */
static const struct landlock_object_underops landlock_fs_underops = {
        .release = release_inode,
};

/* IOCTL helpers */

/**
 * is_masked_device_ioctl - Tell whether an IOCTL command is exempt from
 * Landlock's IOCTL restriction on device files.
 *
 * @cmd: The IOCTL command that is about to be run.
 *
 * An IOCTL on a device file normally requires the
 * LANDLOCK_ACCESS_FS_IOCTL_DEV right.  A command is nevertheless
 * blanket-permitted when both of these hold:
 *
 * 1. It is handled by do_vfs_ioctl() in fs/ioctl.c, and not by
 *    f_ops->unlocked_ioctl() or f_ops->compat_ioctl().
 *
 * 2. Invoking it on a device is harmless.
 *
 * Commands that make no sense for devices are permitted too when the
 * do_vfs_ioctl() implementation answers with a more conventional error code.
 *
 * Whenever a new IOCTL command is implemented in do_vfs_ioctl(), it should be
 * considered for inclusion in this list.
 *
 * Returns: true if Landlock can not restrict @cmd on device files, false if
 * @cmd is guarded by the access right.
 */
static __attribute_const__ bool is_masked_device_ioctl(const unsigned int cmd)
{
        switch (cmd) {
        /*
         * Close-on-exec flag of the FD, and buffered-IO and async flags of
         * the file.  fcntl(2) offers the same operations, and Landlock
         * permits them unconditionally.
         */
        case FIOCLEX:
        case FIONCLEX:
        case FIONBIO:
        case FIOASYNC:
                return true;

        /*
         * Size query for a regular file, directory, or link.  Permitted
         * because it always returns -ENOTTY for other file types.
         */
        case FIOQSIZE:
                return true;

        /*
         * Freeze and thaw of the file system the file belongs to (requires
         * CAP_SYS_ADMIN).  These act on the superblock, not on the file
         * itself, and are reachable through any other file or directory of
         * the same file system, so permitting them is safe.
         */
        case FIFREEZE:
        case FITHAW:
                return true;

        /*
         * Block allocation query.  Only meaningful for regular files and not
         * implemented by devices, hence harmless.
         */
        case FS_IOC_FIEMAP:
                return true;

        /*
         * Block size query of the file system.  Acts on the superblock, not
         * on the file itself, and is reachable through any other file or
         * directory of the same file system, so permitting it is safe.
         */
        case FIGETBSZ:
                return true;

        /*
         * Storage sharing ("reflink") between source and destination FDs, on
         * file systems which support it.  Only applies to regular files and
         * is harmless for device files.
         */
        case FICLONE:
        case FICLONERANGE:
        case FIDEDUPERANGE:
                return true;

        /*
         * Both act on the file system superblock, not on the specific file,
         * and are therefore reachable through any other file of the same
         * file system as well.
         */
        case FS_IOC_GETFSUUID:
        case FS_IOC_GETFSSYSFSPATH:
                return true;

        /*
         * Not listed on purpose, because they are forwarded to device
         * implementations:
         *
         * - FIONREAD, FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_IOC_FSGETXATTR
         *   and FS_IOC_FSSETXATTR;
         *
         * - the file_ioctl() commands: FIBMAP, FS_IOC_RESVSP,
         *   FS_IOC_RESVSP64, FS_IOC_UNRESVSP, FS_IOC_UNRESVSP64 and
         *   FS_IOC_ZERO_RANGE.
         */

        /* Everything else is guarded by the access right. */
        default:
                return false;
        }
}

/*
 * is_masked_device_ioctl_compat - compat-mode counterpart of
 * is_masked_device_ioctl().
 *
 * IOCTL commands that get special handling in compat mode must be treated
 * like their non-compat equivalents.  Any command without such handling is
 * delegated to is_masked_device_ioctl().
 */
static __attribute_const__ bool
is_masked_device_ioctl_compat(const unsigned int cmd)
{
        switch (cmd) {
        /*
         * FS_IOC32_GETFLAGS and FS_IOC32_SETFLAGS are forwarded to their
         * device implementations, so they stay guarded.
         */
        case FS_IOC32_GETFLAGS:
        case FS_IOC32_SETFLAGS:
#ifdef CONFIG_X86_64
        /*
         * FS_IOC_RESVSP_32, FS_IOC_RESVSP64_32, FS_IOC_UNRESVSP_32,
         * FS_IOC_UNRESVSP64_32 and FS_IOC_ZERO_RANGE_32 are not
         * blanket-permitted either, for consistency with their non-compat
         * variants.
         */
        case FS_IOC_RESVSP_32:
        case FS_IOC_RESVSP64_32:
        case FS_IOC_UNRESVSP_32:
        case FS_IOC_UNRESVSP64_32:
        case FS_IOC_ZERO_RANGE_32:
#endif
                return false;

        /* FICLONE is permitted, exactly as in the non-compat variant. */
        case FICLONE:
                return true;

        default:
                return is_masked_device_ioctl(cmd);
        }
}

/* Ruleset management */

/*
 * get_inode_object - Get, or create, the Landlock object tied to an inode
 *
 * @inode: Inode whose landlock_inode()->object pointer is looked up.
 *
 * If an object is already tied to @inode, its usage counter is incremented
 * and the object is returned.  Otherwise a new object is created with
 * landlock_create_object(), published in landlock_inode(@inode)->object under
 * @inode->i_lock, and returned; a reference on @inode is taken with ihold()
 * at that point.
 *
 * Returns the object, or the error pointer from landlock_create_object().
 */
static struct landlock_object *get_inode_object(struct inode *const inode)
{
        struct landlock_object *object, *new_object;
        struct landlock_inode_security *inode_sec = landlock_inode(inode);

        rcu_read_lock();
retry:
        /* Every jump to this label is made with the RCU read lock held. */
        object = rcu_dereference(inode_sec->object);
        if (object) {
                if (likely(refcount_inc_not_zero(&object->usage))) {
                        rcu_read_unlock();
                        return object;
                }
                /*
                 * We are racing with release_inode(), the object is going
                 * away.  Wait for release_inode(), then retry.
                 */
                spin_lock(&object->lock);
                spin_unlock(&object->lock);
                goto retry;
        }
        rcu_read_unlock();

        /*
         * If there is no object tied to @inode, then create a new one (without
         * holding any locks).
         */
        new_object = landlock_create_object(&landlock_fs_underops, inode);
        if (IS_ERR(new_object))
                return new_object;

        /*
         * Protects against concurrent calls to get_inode_object() or
         * hook_sb_delete().
         */
        spin_lock(&inode->i_lock);
        if (unlikely(rcu_access_pointer(inode_sec->object))) {
                /* Someone else just created the object, bail out and retry. */
                spin_unlock(&inode->i_lock);
                /* new_object was never published, so it is freed directly. */
                kfree(new_object);

                rcu_read_lock();
                goto retry;
        }

        /*
         * @inode will be released by hook_sb_delete() on its superblock
         * shutdown, or by release_inode() when no more ruleset references the
         * related object.
         */
        ihold(inode);
        rcu_assign_pointer(inode_sec->object, new_object);
        spin_unlock(&inode->i_lock);
        return new_object;
}

/*
 * All access rights that can be tied to files, i.e. to non-directories.  Any
 * filesystem access right outside of this mask only makes sense for
 * directories.
 */
/* clang-format off */
#define ACCESS_FILE ( \
        LANDLOCK_ACCESS_FS_EXECUTE | \
        LANDLOCK_ACCESS_FS_WRITE_FILE | \
        LANDLOCK_ACCESS_FS_READ_FILE | \
        LANDLOCK_ACCESS_FS_TRUNCATE | \
        LANDLOCK_ACCESS_FS_IOCTL_DEV)
/* clang-format on */

/*
 * Inserts into @ruleset a rule keyed by the inode object behind @path.
 *
 * @path: Should have been checked by get_path_from_fd().
 *
 * Returns -EINVAL if @path is not a directory and @access_rights contains
 * rights outside of ACCESS_FILE, or if @ruleset does not have exactly one
 * layer; the error of get_inode_object() if no object can be obtained;
 * otherwise the result of landlock_insert_rule().
 */
int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
                            const struct path *const path,
                            access_mask_t access_rights)
{
        struct landlock_id id = {
                .type = LANDLOCK_KEY_INODE,
        };
        int ret;

        /* A non-directory can only be given rights that make sense for it. */
        if (!d_is_dir(path->dentry)) {
                if ((access_rights | ACCESS_FILE) != ACCESS_FILE)
                        return -EINVAL;
        }
        if (WARN_ON_ONCE(ruleset->num_layers != 1))
                return -EINVAL;

        /*
         * Turns relative access rights into absolute ones: every filesystem
         * right missing from layer 0's access mask is added to the rule.
         */
        access_rights |= LANDLOCK_MASK_ACCESS_FS &
                         ~landlock_get_fs_access_mask(ruleset, 0);

        id.key.object = get_inode_object(d_backing_inode(path->dentry));
        if (IS_ERR(id.key.object))
                return PTR_ERR(id.key.object);

        mutex_lock(&ruleset->lock);
        ret = landlock_insert_rule(ruleset, id, access_rights);
        mutex_unlock(&ruleset->lock);

        /*
         * The reference obtained above is dropped whatever @ret is:
         * landlock_insert_rule() increments the refcount for the new object
         * if needed, so no error check is required here.
         */
        landlock_put_object(id.key.object);
        return ret;
}

/* Access-control management */

/*
 * Looks up, in @domain, the rule keyed by the Landlock object of @dentry's
 * backing inode.
 *
 * The returned rule lives as long as @domain does.
 *
 * Returns NULL if @dentry is negative or if no rule is found.
 */
static const struct landlock_rule *
find_rule(const struct landlock_ruleset *const domain,
          const struct dentry *const dentry)
{
        struct landlock_id id = {
                .type = LANDLOCK_KEY_INODE,
        };
        const struct landlock_rule *found;
        const struct inode *backing;

        /* A leaf that does not exist has no inode, hence no rule. */
        if (d_is_negative(dentry))
                return NULL;

        backing = d_backing_inode(dentry);

        /* The object pointer is only dereferenced under the RCU read lock. */
        rcu_read_lock();
        id.key.object = rcu_dereference(landlock_inode(backing)->object);
        found = landlock_find_rule(domain, id);
        rcu_read_unlock();

        return found;
}

/*
 * Allows access to pseudo filesystems that will never be mountable (e.g.
 * sockfs, pipefs), but can still be reachable through
 * /proc/<pid>/fd/<file-descriptor>
 */
static bool is_nouser_or_private(const struct dentry *dentry)
{
        return (dentry->d_sb->s_flags & SB_NOUSER) ||
               (d_is_positive(dentry) &&
                unlikely(IS_PRIVATE(d_backing_inode(dentry))));
}

/*
 * Returns the union of the raw filesystem access masks of all the layers of
 * @domain.
 */
static access_mask_t
get_raw_handled_fs_accesses(const struct landlock_ruleset *const domain)
{
        access_mask_t handled = 0;
        size_t layer;

        for (layer = 0; layer < domain->num_layers; layer++) {
                const access_mask_t layer_mask =
                        landlock_get_raw_fs_access_mask(domain, layer);

                handled |= layer_mask;
        }
        return handled;
}

/*
 * Same as get_raw_handled_fs_accesses(), with the access rights that are
 * initially denied by default (LANDLOCK_ACCESS_FS_INITIALLY_DENIED) always
 * included.
 */
static access_mask_t
get_handled_fs_accesses(const struct landlock_ruleset *const domain)
{
        const access_mask_t raw = get_raw_handled_fs_accesses(domain);

        return raw | LANDLOCK_ACCESS_FS_INITIALLY_DENIED;
}

/*
 * Returns @domain if it is not NULL and at least one of its layers has a
 * non-empty raw filesystem access mask, NULL otherwise.
 */
static const struct landlock_ruleset *
get_fs_domain(const struct landlock_ruleset *const domain)
{
        if (!domain)
                return NULL;

        return get_raw_handled_fs_accesses(domain) ? domain : NULL;
}

/* Applies get_fs_domain() to the domain of the current task. */
static const struct landlock_ruleset *get_current_fs_domain(void)
{
        const struct landlock_ruleset *const current_dom =
                landlock_get_current_domain();

        return get_fs_domain(current_dom);
}

/*
 * Checks that a destination file hierarchy is at least as restricted as a
 * source file hierarchy.  This is only used for link and rename actions.
 *
 * For every access right, the layers denying it for the source (i.e. denied
 * by both the source's parent and the source itself) must also deny it for
 * the destination parent.  Access rights outside of ACCESS_FILE are only
 * checked for a child that is a directory.
 *
 * @layer_masks_child2: Optional child masks.  When set, the same check is
 * also done the other way around (RENAME_EXCHANGE).
 *
 * Returns true if no check fails, false otherwise.
 */
static bool no_more_access(
        const layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS],
        const layer_mask_t (*const layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS],
        const bool child1_is_directory,
        const layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS],
        const layer_mask_t (*const layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS],
        const bool child2_is_directory)
{
        unsigned long bit;

        for (bit = 0; bit < ARRAY_SIZE(*layer_masks_parent2); bit++) {
                /* Rights outside of ACCESS_FILE only matter for directories. */
                const bool file_right = (BIT_ULL(bit) & ACCESS_FILE) != 0;

                if (child1_is_directory || file_right) {
                        /*
                         * The destination restrictions must be a superset of
                         * the source ones (i.e. inherited access rights
                         * without child exceptions):
                         * restrictions(parent2) >= restrictions(child1)
                         */
                        const layer_mask_t src =
                                (*layer_masks_parent1)[bit] &
                                (*layer_masks_child1)[bit];
                        const layer_mask_t dst = (*layer_masks_parent2)[bit];

                        if (src & ~dst)
                                return false;
                }

                /* Without a second child there is no exchange to check. */
                if (!layer_masks_child2)
                        continue;

                if (child2_is_directory || file_right) {
                        /*
                         * Inverted restrictions for RENAME_EXCHANGE:
                         * restrictions(parent1) >= restrictions(child2)
                         */
                        const layer_mask_t src =
                                (*layer_masks_parent2)[bit] &
                                (*layer_masks_child2)[bit];
                        const layer_mask_t dst = (*layer_masks_parent1)[bit];

                        if (src & ~dst)
                                return false;
                }
        }
        return true;
}

/*
 * Shorthands for KUnit expectations on no_more_access(); they rely on a
 * variable named "test" being in scope and are #undef'ed after the test.
 */
#define NMA_TRUE(...) KUNIT_EXPECT_TRUE(test, no_more_access(__VA_ARGS__))
#define NMA_FALSE(...) KUNIT_EXPECT_FALSE(test, no_more_access(__VA_ARGS__))

#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST

/*
 * KUnit test for no_more_access().
 *
 * Fixture naming: the letters tell which access rights have a non-empty
 * layer mask (r: READ_FILE, x: EXECUTE, m: MAKE_REG), the digits which layer
 * bits are set for them.  allows_all has every mask empty.
 *
 * Argument order of NMA_TRUE()/NMA_FALSE(): parent1, child1,
 * child1_is_directory, parent2, child2 (or NULL), child2_is_directory.
 */
static void test_no_more_access(struct kunit *const test)
{
        const layer_mask_t rx0[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
                [BIT_INDEX(LANDLOCK_ACCESS_FS_READ_FILE)] = BIT_ULL(0),
        };
        const layer_mask_t mx0[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
                [BIT_INDEX(LANDLOCK_ACCESS_FS_MAKE_REG)] = BIT_ULL(0),
        };
        const layer_mask_t x0[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
        };
        const layer_mask_t x1[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(1),
        };
        const layer_mask_t x01[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0) |
                                                          BIT_ULL(1),
        };
        const layer_mask_t allows_all[LANDLOCK_NUM_ACCESS_FS] = {};

        /* Checks without restriction. */
        NMA_TRUE(&x0, &allows_all, false, &allows_all, NULL, false);
        NMA_TRUE(&allows_all, &x0, false, &allows_all, NULL, false);
        NMA_FALSE(&x0, &x0, false, &allows_all, NULL, false);

        /*
         * Checks that we can only refer a file if no more access could be
         * inherited.
         */
        NMA_TRUE(&x0, &x0, false, &rx0, NULL, false);
        NMA_TRUE(&rx0, &rx0, false, &rx0, NULL, false);
        NMA_FALSE(&rx0, &rx0, false, &x0, NULL, false);
        NMA_FALSE(&rx0, &rx0, false, &x1, NULL, false);

        /* Checks allowed referring with different nested domains. */
        NMA_TRUE(&x0, &x1, false, &x0, NULL, false);
        NMA_TRUE(&x1, &x0, false, &x0, NULL, false);
        NMA_TRUE(&x0, &x01, false, &x0, NULL, false);
        NMA_TRUE(&x0, &x01, false, &rx0, NULL, false);
        NMA_TRUE(&x01, &x0, false, &x0, NULL, false);
        NMA_TRUE(&x01, &x0, false, &rx0, NULL, false);
        NMA_FALSE(&x01, &x01, false, &x0, NULL, false);

        /* Checks that file access rights are also enforced for a directory. */
        NMA_FALSE(&rx0, &rx0, true, &x0, NULL, false);

        /* Checks that directory access rights don't impact file referring... */
        NMA_TRUE(&mx0, &mx0, false, &x0, NULL, false);
        /* ...but only directory referring. */
        NMA_FALSE(&mx0, &mx0, true, &x0, NULL, false);

        /* Checks directory exchange. */
        NMA_TRUE(&mx0, &mx0, true, &mx0, &mx0, true);
        NMA_TRUE(&mx0, &mx0, true, &mx0, &x0, true);
        NMA_FALSE(&mx0, &mx0, true, &x0, &mx0, true);
        NMA_FALSE(&mx0, &mx0, true, &x0, &x0, true);
        NMA_FALSE(&mx0, &mx0, true, &x1, &x1, true);

        /* Checks file exchange with directory access rights... */
        NMA_TRUE(&mx0, &mx0, false, &mx0, &mx0, false);
        NMA_TRUE(&mx0, &mx0, false, &mx0, &x0, false);
        NMA_TRUE(&mx0, &mx0, false, &x0, &mx0, false);
        NMA_TRUE(&mx0, &mx0, false, &x0, &x0, false);
        /* ...and with file access rights. */
        NMA_TRUE(&rx0, &rx0, false, &rx0, &rx0, false);
        NMA_TRUE(&rx0, &rx0, false, &rx0, &x0, false);
        NMA_FALSE(&rx0, &rx0, false, &x0, &rx0, false);
        NMA_FALSE(&rx0, &rx0, false, &x0, &x0, false);
        NMA_FALSE(&rx0, &rx0, false, &x1, &x1, false);

        /*
         * Allowing the following requests should not be a security risk
         * because domain 0 denies execute access, and domain 1 is always
         * nested with domain 0.  However, adding an exception for this case
         * would mean to check all nested domains to make sure none can get
         * more privileges (e.g. processes only sandboxed by domain 0).
         * Moreover, this behavior (i.e. composition of N domains) could then
         * be inconsistent compared to domain 1's ruleset alone (e.g. it might
         * be denied to link/rename with domain 1's ruleset, whereas it would
         * be allowed if nested on top of domain 0).  Another drawback would be
         * to create a covert channel that could enable sandboxed processes to
         * infer most of the filesystem restrictions from their domain.  To
         * make it simple, efficient, safe, and more consistent, this case is
         * always denied.
         */
        NMA_FALSE(&x1, &x1, false, &x0, NULL, false);
        NMA_FALSE(&x1, &x1, false, &rx0, NULL, false);
        NMA_FALSE(&x1, &x1, true, &x0, NULL, false);
        NMA_FALSE(&x1, &x1, true, &rx0, NULL, false);

        /* Checks the same case of exclusive domains with a file... */
        NMA_TRUE(&x1, &x1, false, &x01, NULL, false);
        NMA_FALSE(&x1, &x1, false, &x01, &x0, false);
        NMA_FALSE(&x1, &x1, false, &x01, &x01, false);
        NMA_FALSE(&x1, &x1, false, &x0, &x0, false);
        /* ...and with a directory. */
        NMA_FALSE(&x1, &x1, false, &x0, &x0, true);
        NMA_FALSE(&x1, &x1, true, &x0, &x0, false);
        NMA_FALSE(&x1, &x1, true, &x0, &x0, true);
}

#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */

#undef NMA_TRUE
#undef NMA_FALSE

/*
 * Clears every entry of @layer_masks whose access right is not part of
 * @access_request.
 *
 * Returns true if the request is allowed, i.e. if all the entries of
 * @layer_masks are zero afterwards, false otherwise.  A NULL @layer_masks
 * triggers a one-time warning and is reported as allowed.
 */
static bool
scope_to_request(const access_mask_t access_request,
                 layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
{
        /* The bit iterator below works on an unsigned long. */
        const unsigned long requested = access_request;
        unsigned long bit;

        if (WARN_ON_ONCE(!layer_masks))
                return true;

        /* Drops the masks of the access rights that were not requested. */
        for_each_clear_bit(bit, &requested, ARRAY_SIZE(*layer_masks)) {
                (*layer_masks)[bit] = 0;
        }

        /* Any remaining non-zero byte belongs to a requested access right. */
        return memchr_inv(layer_masks, 0, sizeof(*layer_masks)) == NULL;
}

#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST

/*
 * KUnit test: requesting EXECUTE against all-empty layer masks is allowed,
 * and the EXECUTE and WRITE_FILE entries are still zero afterwards.
 */
static void test_scope_to_request_with_exec_none(struct kunit *const test)
{
        /* Allows everything. */
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};

        /* Checks and scopes with execute. */
        KUNIT_EXPECT_TRUE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE,
                                                 &layer_masks));
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]);
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]);
}

/*
 * KUnit test: requesting EXECUTE while its layer mask is non-empty is not
 * allowed; the EXECUTE entry is kept and the WRITE_FILE entry, which was not
 * requested, is cleared.
 */
static void test_scope_to_request_with_exec_some(struct kunit *const test)
{
        /* Denies execute and write. */
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
                [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1),
        };

        /* Checks and scopes with execute. */
        KUNIT_EXPECT_FALSE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE,
                                                  &layer_masks));
        KUNIT_EXPECT_EQ(test, BIT_ULL(0),
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]);
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]);
}

/*
 * KUnit test: an empty access request is allowed and clears the EXECUTE and
 * WRITE_FILE entries even though both started non-empty.
 */
static void test_scope_to_request_without_access(struct kunit *const test)
{
        /* Denies execute and write. */
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
                [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1),
        };

        /* Checks and scopes without access request. */
        KUNIT_EXPECT_TRUE(test, scope_to_request(0, &layer_masks));
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]);
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]);
}

#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */

/*
 * Returns true if at least one access right of @access_request, other than
 * LANDLOCK_ACCESS_FS_REFER, has a non-empty entry in @layer_masks.
 *
 * Returns false otherwise, including when @layer_masks is NULL.
 */
static bool
is_eacces(const layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS],
          const access_mask_t access_request)
{
        /* LANDLOCK_ACCESS_FS_REFER alone must return -EXDEV. */
        const unsigned long checked = access_request &
                                      ~LANDLOCK_ACCESS_FS_REFER;
        unsigned long bit;

        if (!layer_masks)
                return false;

        for_each_set_bit(bit, &checked, ARRAY_SIZE(*layer_masks))
                if ((*layer_masks)[bit])
                        return true;

        return false;
}

/*
 * Test shorthands: expect is_eacces() to return true (IE_TRUE) or false
 * (IE_FALSE) for the given arguments.  Both rely on a `test` variable being
 * in scope where they are expanded.
 */
#define IE_TRUE(...) KUNIT_EXPECT_TRUE(test, is_eacces(__VA_ARGS__))
#define IE_FALSE(...) KUNIT_EXPECT_FALSE(test, is_eacces(__VA_ARGS__))

#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST

/*
 * With an all-zero matrix (no access has any layer set), is_eacces() is
 * expected to return false whatever the request.
 */
static void test_is_eacces_with_none(struct kunit *const test)
{
        const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};

        IE_FALSE(&layer_masks, 0);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE);
}

/*
 * With only the LANDLOCK_ACCESS_FS_REFER entry set, is_eacces() is expected
 * to return false for every tested request: REFER is masked out of the
 * request, and the other requested accesses have no layer set.
 */
static void test_is_eacces_with_refer(struct kunit *const test)
{
        const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_REFER)] = BIT_ULL(0),
        };

        IE_FALSE(&layer_masks, 0);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE);
}

/*
 * With only the LANDLOCK_ACCESS_FS_WRITE_FILE entry set, is_eacces() is
 * expected to return true only when the request contains WRITE_FILE.
 */
static void test_is_eacces_with_write(struct kunit *const test)
{
        const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(0),
        };

        /* Requests that do not include the denied access. */
        IE_FALSE(&layer_masks, 0);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE);

        /* The request matching the denied access. */
        IE_TRUE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE);
}

#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */

#undef IE_TRUE
#undef IE_FALSE

/**
 * is_access_to_paths_allowed - Check accesses for requests with a common path
 *
 * @domain: Domain to check against.
 * @path: File hierarchy to walk through.
 * @access_request_parent1: Accesses to check, once @layer_masks_parent1 is
 *     equal to @layer_masks_parent2 (if any).  This is tied to the unique
 *     requested path for most actions, or the source in case of a refer action
 *     (i.e. rename or link), or the source and destination in case of
 *     RENAME_EXCHANGE.
 * @layer_masks_parent1: Pointer to a matrix of layer masks per access
 *     masks, identifying the layers that forbid a specific access.  Bits from
 *     this matrix can be unset according to the @path walk.  An empty matrix
 *     means that @domain allows all possible Landlock accesses (i.e. not only
 *     those identified by @access_request_parent1).  This matrix can
 *     initially refer to domain layer masks and, when the accesses for the
 *     destination and source are the same, to requested layer masks.
 * @dentry_child1: Dentry to the initial child of the parent1 path.  This
 *     pointer must be NULL for non-refer actions (i.e. not link nor rename).
 * @access_request_parent2: Similar to @access_request_parent1 but for a
 *     request involving a source and a destination.  This refers to the
 *     destination, except in case of RENAME_EXCHANGE where it also refers to
 *     the source.  Must be set to 0 when using a simple path request.
 * @layer_masks_parent2: Similar to @layer_masks_parent1 but for a refer
 *     action.  This must be NULL otherwise.
 * @dentry_child2: Dentry to the initial child of the parent2 path.  This
 *     pointer is only set for RENAME_EXCHANGE actions and must be NULL
 *     otherwise.
 *
 * This helper first checks that the destination has a superset of restrictions
 * compared to the source (if any) for a common path.  Because of
 * RENAME_EXCHANGE actions, source and destinations may be swapped.  It then
 * checks that the collected accesses and the remaining ones are enough to
 * allow the request.
 *
 * Returns:
 * - true if the access request is granted;
 * - false otherwise.
 */
static bool is_access_to_paths_allowed(
        const struct landlock_ruleset *const domain,
        const struct path *const path,
        const access_mask_t access_request_parent1,
        layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS],
        const struct dentry *const dentry_child1,
        const access_mask_t access_request_parent2,
        layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS],
        const struct dentry *const dentry_child2)
{
        bool allowed_parent1 = false, allowed_parent2 = false, is_dom_check,
             child1_is_directory = true, child2_is_directory = true;
        struct path walker_path;
        access_mask_t access_masked_parent1, access_masked_parent2;
        /* Backing storage for the child matrices, only used when needed. */
        layer_mask_t _layer_masks_child1[LANDLOCK_NUM_ACCESS_FS],
                _layer_masks_child2[LANDLOCK_NUM_ACCESS_FS];
        /* Stay NULL unless the corresponding child dentry is provided. */
        layer_mask_t(*layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS] = NULL,
        (*layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS] = NULL;

        /* Nothing is requested: trivially allowed. */
        if (!access_request_parent1 && !access_request_parent2)
                return true;
        /* Invalid arguments are reported once and then treated as allowed. */
        if (WARN_ON_ONCE(!domain || !path))
                return true;
        /* Dentries matched by is_nouser_or_private() are not restricted. */
        if (is_nouser_or_private(path->dentry))
                return true;
        if (WARN_ON_ONCE(domain->num_layers < 1 || !layer_masks_parent1))
                return false;

        if (unlikely(layer_masks_parent2)) {
                /* A double (refer) request requires the first child. */
                if (WARN_ON_ONCE(!dentry_child1))
                        return false;
                /*
                 * For a double request, first check for potential privilege
                 * escalation by looking at domain handled accesses (which are
                 * a superset of the meaningful requested accesses).
                 */
                access_masked_parent1 = access_masked_parent2 =
                        get_handled_fs_accesses(domain);
                is_dom_check = true;
        } else {
                /* A simple request must not come with child dentries. */
                if (WARN_ON_ONCE(dentry_child1 || dentry_child2))
                        return false;
                /* For a simple request, only check for requested accesses. */
                access_masked_parent1 = access_request_parent1;
                access_masked_parent2 = access_request_parent2;
                is_dom_check = false;
        }

        /*
         * Builds the matrix of each provided child: it is initialized for
         * LANDLOCK_MASK_ACCESS_FS and then unmasked with the rule looked up
         * for the child dentry itself.
         */
        if (unlikely(dentry_child1)) {
                landlock_unmask_layers(
                        find_rule(domain, dentry_child1),
                        landlock_init_layer_masks(
                                domain, LANDLOCK_MASK_ACCESS_FS,
                                &_layer_masks_child1, LANDLOCK_KEY_INODE),
                        &_layer_masks_child1, ARRAY_SIZE(_layer_masks_child1));
                layer_masks_child1 = &_layer_masks_child1;
                child1_is_directory = d_is_dir(dentry_child1);
        }
        if (unlikely(dentry_child2)) {
                landlock_unmask_layers(
                        find_rule(domain, dentry_child2),
                        landlock_init_layer_masks(
                                domain, LANDLOCK_MASK_ACCESS_FS,
                                &_layer_masks_child2, LANDLOCK_KEY_INODE),
                        &_layer_masks_child2, ARRAY_SIZE(_layer_masks_child2));
                layer_masks_child2 = &_layer_masks_child2;
                child2_is_directory = d_is_dir(dentry_child2);
        }

        /* Works on a referenced copy of @path, released after the loop. */
        walker_path = *path;
        path_get(&walker_path);
        /*
         * We need to walk through all the hierarchy to not miss any relevant
         * restriction.
         */
        while (true) {
                struct dentry *parent_dentry;
                const struct landlock_rule *rule;

                /*
                 * If at least all accesses allowed on the destination are
                 * already allowed on the source, respectively if there are at
                 * least as many restrictions on the destination as on the
                 * source, then we can safely refer files from the source to
                 * the destination without risking a privilege escalation.
                 * This also applies in the case of RENAME_EXCHANGE, which
                 * implies checks in both directions.  This is crucial for
                 * standalone multilayered security policies.  Furthermore,
                 * this helps policy writers avoid shooting themselves in the
                 * foot.
                 */
                if (unlikely(is_dom_check &&
                             no_more_access(
                                     layer_masks_parent1, layer_masks_child1,
                                     child1_is_directory, layer_masks_parent2,
                                     layer_masks_child2,
                                     child2_is_directory))) {
                        allowed_parent1 = scope_to_request(
                                access_request_parent1, layer_masks_parent1);
                        allowed_parent2 = scope_to_request(
                                access_request_parent2, layer_masks_parent2);

                        /* Stops when all accesses are granted. */
                        if (allowed_parent1 && allowed_parent2)
                                break;

                        /*
                         * Now, downgrades the remaining checks from domain
                         * handled accesses to requested accesses.
                         */
                        is_dom_check = false;
                        access_masked_parent1 = access_request_parent1;
                        access_masked_parent2 = access_request_parent2;
                }

                /*
                 * Applies the rule found for the current dentry (if any) to
                 * both matrices.
                 */
                rule = find_rule(domain, walker_path.dentry);
                allowed_parent1 = landlock_unmask_layers(
                        rule, access_masked_parent1, layer_masks_parent1,
                        ARRAY_SIZE(*layer_masks_parent1));
                allowed_parent2 = landlock_unmask_layers(
                        rule, access_masked_parent2, layer_masks_parent2,
                        ARRAY_SIZE(*layer_masks_parent2));

                /* Stops when a rule from each layer grants access. */
                if (allowed_parent1 && allowed_parent2)
                        break;
jump_up:
                /* At the root of the current mount: try to cross it upward. */
                if (walker_path.dentry == walker_path.mnt->mnt_root) {
                        if (follow_up(&walker_path)) {
                                /* Ignores hidden mount points. */
                                goto jump_up;
                        } else {
                                /*
                                 * Stops at the real root.  Denies access
                                 * because not all layers have granted access.
                                 */
                                break;
                        }
                }
                if (unlikely(IS_ROOT(walker_path.dentry))) {
                        /*
                         * Stops at disconnected root directories.  Only allows
                         * access to internal filesystems (e.g. nsfs, which is
                         * reachable through /proc/<pid>/ns/<namespace>).
                         */
                        allowed_parent1 = allowed_parent2 =
                                !!(walker_path.mnt->mnt_flags & MNT_INTERNAL);
                        break;
                }
                /*
                 * Moves the walker one level up: the parent reference is taken
                 * before the reference on the current dentry is dropped.
                 */
                parent_dentry = dget_parent(walker_path.dentry);
                dput(walker_path.dentry);
                walker_path.dentry = parent_dentry;
        }
        path_put(&walker_path);

        return allowed_parent1 && allowed_parent2;
}

/*
 * Checks a simple (single path) request against @domain.
 *
 * Returns 0 if @access_request is granted for @path, -EACCES otherwise.
 */
static int check_access_path(const struct landlock_ruleset *const domain,
                             const struct path *const path,
                             access_mask_t access_request)
{
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
        bool granted;

        /* The request is replaced by what landlock_init_layer_masks() returns. */
        access_request = landlock_init_layer_masks(
                domain, access_request, &layer_masks, LANDLOCK_KEY_INODE);

        /* No child dentry and no second parent: this is not a refer action. */
        granted = is_access_to_paths_allowed(domain, path, access_request,
                                             &layer_masks, NULL, 0, NULL,
                                             NULL);
        return granted ? 0 : -EACCES;
}

/*
 * Same as check_access_path() but against the filesystem domain returned by
 * get_current_fs_domain().  Without such a domain, access is allowed (0).
 */
static int current_check_access_path(const struct path *const path,
                                     const access_mask_t access_request)
{
        const struct landlock_ruleset *const dom = get_current_fs_domain();

        return dom ? check_access_path(dom, path, access_request) : 0;
}

/*
 * Maps the file type encoded in @mode to the matching LANDLOCK_ACCESS_FS_MAKE_*
 * access right.  An unknown file type triggers a one-time warning and maps to
 * no access right (0).
 */
static access_mask_t get_mode_access(const umode_t mode)
{
        access_mask_t make_access = 0;

        switch (mode & S_IFMT) {
        case 0:
                /* A zero mode translates to S_IFREG. */
        case S_IFREG:
                make_access = LANDLOCK_ACCESS_FS_MAKE_REG;
                break;
        case S_IFDIR:
                make_access = LANDLOCK_ACCESS_FS_MAKE_DIR;
                break;
        case S_IFLNK:
                make_access = LANDLOCK_ACCESS_FS_MAKE_SYM;
                break;
        case S_IFCHR:
                make_access = LANDLOCK_ACCESS_FS_MAKE_CHAR;
                break;
        case S_IFBLK:
                make_access = LANDLOCK_ACCESS_FS_MAKE_BLOCK;
                break;
        case S_IFIFO:
                make_access = LANDLOCK_ACCESS_FS_MAKE_FIFO;
                break;
        case S_IFSOCK:
                make_access = LANDLOCK_ACCESS_FS_MAKE_SOCK;
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }
        return make_access;
}

/*
 * Returns the access right needed to remove @dentry: none (0) for a negative
 * dentry, LANDLOCK_ACCESS_FS_REMOVE_DIR for a directory, and
 * LANDLOCK_ACCESS_FS_REMOVE_FILE for anything else.
 */
static access_mask_t maybe_remove(const struct dentry *const dentry)
{
        /* Nothing exists there, hence nothing would be removed. */
        if (d_is_negative(dentry))
                return 0;
        if (d_is_dir(dentry))
                return LANDLOCK_ACCESS_FS_REMOVE_DIR;
        return LANDLOCK_ACCESS_FS_REMOVE_FILE;
}

/**
 * collect_domain_accesses - Walk through a file path and collect accesses
 *
 * @domain: Domain to check against.
 * @mnt_root: Last directory to check.
 * @dir: Directory to start the walk from.
 * @layer_masks_dom: Where to store the collected accesses.
 *
 * This helper is useful to begin a path walk from the @dir directory to a
 * @mnt_root directory used as a mount point.  This mount point is the common
 * ancestor between the source and the destination of a renamed and linked
 * file.  While walking from @dir to @mnt_root, we record all the domain's
 * allowed accesses in @layer_masks_dom.
 *
 * This is similar to is_access_to_paths_allowed() but much simpler because it
 * only handles walking on the same mount point and only checks one set of
 * accesses.
 *
 * Returns:
 * - true if all the domain access rights are allowed for @dir;
 * - false if the walk reached @mnt_root.
 */
static bool collect_domain_accesses(
        const struct landlock_ruleset *const domain,
        const struct dentry *const mnt_root, struct dentry *dir,
        layer_mask_t (*const layer_masks_dom)[LANDLOCK_NUM_ACCESS_FS])
{
        unsigned long handled_accesses;
        bool all_allowed;

        /* Invalid arguments are reported once and then treated as allowed. */
        if (WARN_ON_ONCE(!domain || !mnt_root || !dir || !layer_masks_dom))
                return true;
        if (is_nouser_or_private(dir))
                return true;

        handled_accesses = landlock_init_layer_masks(
                domain, LANDLOCK_MASK_ACCESS_FS, layer_masks_dom,
                LANDLOCK_KEY_INODE);

        /* A reference is held on whichever dentry @dir currently designates. */
        dget(dir);
        for (;;) {
                struct dentry *upper;

                /*
                 * Unmasks the layers according to the rule found for @dir.
                 * The walk stops as soon as all handled accesses are allowed
                 * by at least one rule in each layer.
                 */
                all_allowed = landlock_unmask_layers(
                        find_rule(domain, dir), handled_accesses,
                        layer_masks_dom, ARRAY_SIZE(*layer_masks_dom));
                if (all_allowed)
                        break;

                /* We should not reach a root other than @mnt_root. */
                if (dir == mnt_root || WARN_ON_ONCE(IS_ROOT(dir)))
                        break;

                /* Takes the parent before releasing the current dentry. */
                upper = dget_parent(dir);
                dput(dir);
                dir = upper;
        }
        dput(dir);
        return all_allowed;
}

/**
 * current_check_refer_path - Check if a rename or link action is allowed
 *
 * @old_dentry: File or directory requested to be moved or linked.
 * @new_dir: Destination parent directory.
 * @new_dentry: Destination file or directory.
 * @removable: Sets to true if it is a rename operation.
 * @exchange: Sets to true if it is a rename operation with RENAME_EXCHANGE.
 *
 * Because of its unprivileged constraints, Landlock relies on file hierarchies
 * (and not only inodes) to tie access rights to files.  Being able to link or
 * rename a file hierarchy brings some challenges.  Indeed, moving or linking a
 * file (i.e. creating a new reference to an inode) can have an impact on the
 * actions allowed for a set of files if it would change its parent directory
 * (i.e. reparenting).
 *
 * To avoid trivial access right bypasses, Landlock first checks if the file or
 * directory requested to be moved would gain new access rights inherited from
 * its new hierarchy.  Before returning any error, Landlock then checks that
 * the parent source hierarchy and the destination hierarchy would allow the
 * link or rename action.  If it is not the case, an error with EACCES is
 * returned to inform user space that there is no way to remove or create the
 * requested source file type.  If it should be allowed but the new inherited
 * access rights would be greater than the source access rights, then the
 * kernel returns an error with EXDEV.  Prioritizing EACCES over EXDEV enables
 * user space to abort the whole operation if there is no way to do it, or to
 * manually copy the source to the destination if this remains allowed, e.g.
 * because file creation is allowed on the destination directory but not direct
 * linking.
 *
 * To achieve this goal, the kernel needs to compare two file hierarchies: the
 * one identifying the source file or directory (including itself), and the
 * destination one.  This can be seen as a multilayer partial ordering problem.
 * The kernel walks through these paths and collects in a matrix the access
 * rights that are denied per layer.  These matrices are then compared to see
 * if the destination one has more (or the same) restrictions as the source
 * one.  If this is the case, the requested action will not return EXDEV, which
 * doesn't mean the action is allowed.  The parent hierarchy of the source
 * (i.e. parent directory), and the destination hierarchy must also be checked
 * to verify that they explicitly allow such action (i.e.  referencing,
 * creation and potentially removal rights).  The kernel implementation is then
 * required to rely on potentially four matrices of access rights: one for the
 * source file or directory (i.e. the child), a potentially other one for the
 * other source/destination (in case of RENAME_EXCHANGE), one for the source
 * parent hierarchy and a last one for the destination hierarchy.  These
 * ephemeral matrices take some space on the stack, which limits the number of
 * layers to a deemed reasonable number: 16.
 *
 * Returns:
 * - 0 if access is allowed;
 * - -EXDEV if @old_dentry would inherit new access rights from @new_dir;
 * - -EACCES if file removal or creation is denied.
 */
static int current_check_refer_path(struct dentry *const old_dentry,
                                    const struct path *const new_dir,
                                    struct dentry *const new_dentry,
                                    const bool removable, const bool exchange)
{
        const struct landlock_ruleset *const dom = get_current_fs_domain();
        bool allow_parent1, allow_parent2;
        access_mask_t access_request_parent1, access_request_parent2;
        struct path mnt_dir;
        struct dentry *old_parent;
        /* parent1 is the source hierarchy, parent2 the destination one. */
        layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS] = {},
                     layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS] = {};

        /* No filesystem domain for the current task: nothing to check. */
        if (!dom)
                return 0;
        if (WARN_ON_ONCE(dom->num_layers < 1))
                return -EACCES;
        if (unlikely(d_is_negative(old_dentry)))
                return -ENOENT;
        if (exchange) {
                if (unlikely(d_is_negative(new_dentry)))
                        return -ENOENT;
                /*
                 * With RENAME_EXCHANGE, @new_dentry ends up in the source
                 * parent, which must then allow creating its file type.
                 */
                access_request_parent1 =
                        get_mode_access(d_backing_inode(new_dentry)->i_mode);
        } else {
                access_request_parent1 = 0;
        }
        /* The destination must allow creating the file type of @old_dentry. */
        access_request_parent2 =
                get_mode_access(d_backing_inode(old_dentry)->i_mode);
        if (removable) {
                /*
                 * A rename removes @old_dentry from its parent and, if
                 * @new_dentry is not negative, removes it as well.
                 */
                access_request_parent1 |= maybe_remove(old_dentry);
                access_request_parent2 |= maybe_remove(new_dentry);
        }

        /* The mount points are the same for old and new paths, cf. EXDEV. */
        if (old_dentry->d_parent == new_dir->dentry) {
                /*
                 * The LANDLOCK_ACCESS_FS_REFER access right is not required
                 * for same-directory refer actions (i.e. no reparenting).
                 */
                access_request_parent1 = landlock_init_layer_masks(
                        dom, access_request_parent1 | access_request_parent2,
                        &layer_masks_parent1, LANDLOCK_KEY_INODE);
                if (is_access_to_paths_allowed(
                            dom, new_dir, access_request_parent1,
                            &layer_masks_parent1, NULL, 0, NULL, NULL))
                        return 0;
                return -EACCES;
        }

        /* Reparenting: both sides additionally need the refer right. */
        access_request_parent1 |= LANDLOCK_ACCESS_FS_REFER;
        access_request_parent2 |= LANDLOCK_ACCESS_FS_REFER;

        /* Saves the common mount point. */
        mnt_dir.mnt = new_dir->mnt;
        mnt_dir.dentry = new_dir->mnt->mnt_root;

        /*
         * old_dentry may be the root of the common mount point and
         * !IS_ROOT(old_dentry) at the same time (e.g. with open_tree() and
         * OPEN_TREE_CLONE).  We do not need to call dget(old_parent) because
         * we keep a reference to old_dentry.
         */
        old_parent = (old_dentry == mnt_dir.dentry) ? old_dentry :
                                                      old_dentry->d_parent;

        /* new_dir->dentry is equal to new_dentry->d_parent */
        allow_parent1 = collect_domain_accesses(dom, mnt_dir.dentry, old_parent,
                                                &layer_masks_parent1);
        allow_parent2 = collect_domain_accesses(
                dom, mnt_dir.dentry, new_dir->dentry, &layer_masks_parent2);

        /*
         * Both walks ended with all the domain access rights allowed: the
         * action is granted without any further comparison.
         */
        if (allow_parent1 && allow_parent2)
                return 0;

        /*
         * To be able to compare source and destination domain access rights,
         * take into account the @old_dentry access rights aggregated with its
         * parent access rights.  This will be useful to compare with the
         * destination parent access rights.
         */
        if (is_access_to_paths_allowed(
                    dom, &mnt_dir, access_request_parent1, &layer_masks_parent1,
                    old_dentry, access_request_parent2, &layer_masks_parent2,
                    exchange ? new_dentry : NULL))
                return 0;

        /*
         * This prioritizes EACCES over EXDEV for all actions, including
         * renames with RENAME_EXCHANGE.
         */
        if (likely(is_eacces(&layer_masks_parent1, access_request_parent1) ||
                   is_eacces(&layer_masks_parent2, access_request_parent2)))
                return -EACCES;

        /*
         * Gracefully forbids reparenting if the destination directory
         * hierarchy is not a superset of restrictions of the source directory
         * hierarchy, or if LANDLOCK_ACCESS_FS_REFER is not allowed by the
         * source or the destination.
         */
        return -EXDEV;
}

/* Inode hooks */

/*
 * Sanity check only: warns (once) if @inode still has a Landlock object
 * attached.  Nothing is released here.
 */
static void hook_inode_free_security(struct inode *const inode)
{
        /*
         * All inodes must already have been untied from their object by
         * release_inode() or hook_sb_delete().
         */
        WARN_ON_ONCE(landlock_inode(inode)->object);
}

/* Super-block hooks */

/*
 * Release the inodes used in a security policy.
 *
 * Walks the inode list of @sb and, for each inode still tied to a Landlock
 * object, unties it and drops the inode reference held on behalf of that
 * object.  Then waits until the superblock's inode_refs counter reaches zero.
 *
 * Cf. fsnotify_unmount_inodes() and invalidate_inodes()
 */
static void hook_sb_delete(struct super_block *const sb)
{
        struct inode *inode, *prev_inode = NULL;

        if (!landlock_initialized)
                return;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                struct landlock_object *object;

                /* Only handles referenced inodes. */
                if (!atomic_read(&inode->i_count))
                        continue;

                /*
                 * Protects against concurrent modification of inode (e.g.
                 * from get_inode_object()).
                 */
                spin_lock(&inode->i_lock);
                /*
                 * Checks I_FREEING and I_WILL_FREE to protect against a race
                 * condition when release_inode() just called iput(), which
                 * could lead to a NULL dereference of inode->security or a
                 * second call to iput() for the same Landlock object.  Also
                 * checks I_NEW because such inode cannot be tied to an object.
                 */
                if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                rcu_read_lock();
                object = rcu_dereference(landlock_inode(inode)->object);
                if (!object) {
                        /* This inode is not tied to any Landlock object. */
                        rcu_read_unlock();
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                /* Keeps a reference to this inode until the next loop walk. */
                __iget(inode);
                spin_unlock(&inode->i_lock);

                /*
                 * If there is no concurrent release_inode() ongoing, then we
                 * are in charge of calling iput() on this inode, otherwise we
                 * will just wait for it to finish.
                 */
                spin_lock(&object->lock);
                if (object->underobj == inode) {
                        object->underobj = NULL;
                        spin_unlock(&object->lock);
                        rcu_read_unlock();

                        /*
                         * Because object->underobj was not NULL,
                         * release_inode() and get_inode_object() guarantee
                         * that it is safe to reset
                         * landlock_inode(inode)->object while it is not NULL.
                         * It is therefore not necessary to lock inode->i_lock.
                         */
                        rcu_assign_pointer(landlock_inode(inode)->object, NULL);
                        /*
                         * At this point, we own the ihold() reference that was
                         * originally set up by get_inode_object() and the
                         * __iget() reference that we just set in this loop
                         * walk.  Therefore the following call to iput() will
                         * not sleep nor drop the inode because there are now
                         * at least two references to it.
                         */
                        iput(inode);
                } else {
                        /* Someone else already untied (or is untying) it. */
                        spin_unlock(&object->lock);
                        rcu_read_unlock();
                }

                if (prev_inode) {
                        /*
                         * At this point, we still own the __iget() reference
                         * that we just set in this loop walk.  Therefore we
                         * can drop the list lock and know that the inode won't
                         * disappear from under us until the next loop walk.
                         */
                        spin_unlock(&sb->s_inode_list_lock);
                        /*
                         * We can now actually put the inode reference from the
                         * previous loop walk, which is not needed anymore.
                         */
                        iput(prev_inode);
                        cond_resched();
                        spin_lock(&sb->s_inode_list_lock);
                }
                prev_inode = inode;
        }
        spin_unlock(&sb->s_inode_list_lock);

        /* Puts the inode reference from the last loop walk, if any. */
        if (prev_inode)
                iput(prev_inode);
        /* Waits for pending iput() in release_inode(). */
        wait_var_event(&landlock_superblock(sb)->inode_refs,
                       !atomic_long_read(&landlock_superblock(sb)->inode_refs));
}

/*
 * Because a Landlock security policy is defined according to the filesystem
 * topology (i.e. the mount namespace), changing it may grant access to files
 * not previously allowed.
 *
 * To make it simple, deny any filesystem topology modification by landlocked
 * processes.  Non-landlocked processes may still change the namespace of a
 * landlocked process, but this kind of threat must be handled by a system-wide
 * access-control security policy.
 *
 * This could be lifted in the future if Landlock can safely handle mount
 * namespace updates requested by a landlocked process.  Indeed, we could
 * update the current domain (which is currently read-only) by taking into
 * account the accesses of the source and the destination of a new mount point.
 * However, it would also require to make all the child domains dynamically
 * inherit these new constraints.  Anyway, for backward compatibility reasons,
 * a dedicated user space option would be required (e.g. as a ruleset flag).
 */
static int hook_sb_mount(const char *const dev_name,
                         const struct path *const path, const char *const type,
                         const unsigned long flags, void *const data)
{
        /*
         * Mounting is refused with -EPERM as soon as the current task has a
         * Landlock filesystem domain; the arguments are not inspected.
         */
        return get_current_fs_domain() ? -EPERM : 0;
}

/*
 * Moving a mount is a filesystem topology change: it is refused with -EPERM
 * when the current task has a Landlock filesystem domain, allowed otherwise.
 */
static int hook_move_mount(const struct path *const from_path,
                           const struct path *const to_path)
{
        return get_current_fs_domain() ? -EPERM : 0;
}

/*
 * Unmounting can uncover a file hierarchy that the mount point was hiding,
 * and thereby give access to files that were previously out of reach.  It is
 * then refused with -EPERM when the current task has a Landlock filesystem
 * domain.
 */
static int hook_sb_umount(struct vfsmount *const mnt, const int flags)
{
        return get_current_fs_domain() ? -EPERM : 0;
}

/*
 * Remounting is refused with -EPERM when the current task has a Landlock
 * filesystem domain, and allowed otherwise.
 */
static int hook_sb_remount(struct super_block *const sb, void *const mnt_opts)
{
        return get_current_fs_domain() ? -EPERM : 0;
}

/*
 * pivot_root(2), like mount(2), changes the current mount namespace.  It must
 * then be forbidden for a landlocked process.
 *
 * However, chroot(2) may be allowed because it only changes the relative root
 * directory of the current process.  Moreover, it can be used to restrict the
 * view of the filesystem.
 */
static int hook_sb_pivotroot(const struct path *const old_path,
                             const struct path *const new_path)
{
        /*
         * Refused with -EPERM as soon as the current task has a Landlock
         * filesystem domain; the paths are not inspected.
         */
        return get_current_fs_domain() ? -EPERM : 0;
}

/* Path hooks */

static int hook_path_link(struct dentry *const old_dentry,
                          const struct path *const new_dir,
                          struct dentry *const new_dentry)
{
        return current_check_refer_path(old_dentry, new_dir, new_dentry, false,
                                        false);
}

static int hook_path_rename(const struct path *const old_dir,
                            struct dentry *const old_dentry,
                            const struct path *const new_dir,
                            struct dentry *const new_dentry,
                            const unsigned int flags)
{
        /* old_dir refers to old_dentry->d_parent and new_dir->mnt */
        return current_check_refer_path(old_dentry, new_dir, new_dentry, true,
                                        !!(flags & RENAME_EXCHANGE));
}

static int hook_path_mkdir(const struct path *const dir,
                           struct dentry *const dentry, const umode_t mode)
{
        return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_DIR);
}

static int hook_path_mknod(const struct path *const dir,
                           struct dentry *const dentry, const umode_t mode,
                           const unsigned int dev)
{
        const struct landlock_ruleset *const domain = get_current_fs_domain();

        /*
         * Without a domain nothing is checked.  Otherwise the access right to
         * check on the parent directory is derived from the requested mode by
         * get_mode_access().
         */
        return domain ? check_access_path(domain, dir, get_mode_access(mode)) :
                        0;
}

static int hook_path_symlink(const struct path *const dir,
                             struct dentry *const dentry,
                             const char *const old_name)
{
        return current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_SYM);
}

static int hook_path_unlink(const struct path *const dir,
                            struct dentry *const dentry)
{
        return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_FILE);
}

static int hook_path_rmdir(const struct path *const dir,
                           struct dentry *const dentry)
{
        return current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_DIR);
}

static int hook_path_truncate(const struct path *const path)
{
        return current_check_access_path(path, LANDLOCK_ACCESS_FS_TRUNCATE);
}

/* File hooks */

/**
 * get_required_file_open_access - Get access needed to open a file
 *
 * @file: File being opened.
 *
 * Returns the access rights that are required for opening the given file,
 * depending on the file type and open mode.
 */
static access_mask_t
get_required_file_open_access(const struct file *const file)
{
        access_mask_t access = 0;

        if (file->f_mode & FMODE_READ) {
                /* A directory can only be opened in read mode. */
                if (S_ISDIR(file_inode(file)->i_mode))
                        return LANDLOCK_ACCESS_FS_READ_DIR;
                access = LANDLOCK_ACCESS_FS_READ_FILE;
        }
        if (file->f_mode & FMODE_WRITE)
                access |= LANDLOCK_ACCESS_FS_WRITE_FILE;
        /* __FMODE_EXEC is indeed part of f_flags, not f_mode. */
        if (file->f_flags & __FMODE_EXEC)
                access |= LANDLOCK_ACCESS_FS_EXECUTE;
        return access;
}

/*
 * Initializes the Landlock state of a newly allocated struct file by marking
 * every filesystem access right as allowed.  Always returns 0.
 */
static int hook_file_alloc_security(struct file *const file)
{
        /*
         * Grants all access rights, even if most of them are not checked later
         * on. It is more consistent.
         *
         * Notably, file descriptors for regular files can also be acquired
         * without going through the file_open hook, for example when using
         * memfd_create(2).
         *
         * hook_file_open() overwrites this default with the rights computed
         * at open time.
         */
        landlock_file(file)->allowed_access = LANDLOCK_MASK_ACCESS_FS;
        return 0;
}

static bool is_device(const struct file *const file)
{
        const struct inode *inode = file_inode(file);

        return S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode);
}

/*
 * Checks that the domain attached to the opener's credentials grants the
 * rights needed to open @file, and records in the struct file which rights
 * (required and optional) were granted, for later checks on the opened file.
 *
 * Returns 0 if there is no domain or if every required right is granted,
 * -EACCES otherwise.
 */
static int hook_file_open(struct file *const file)
{
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
        access_mask_t required, extra, wanted, granted;
        const struct landlock_ruleset *const dom =
                get_fs_domain(landlock_cred(file->f_cred)->domain);

        if (!dom)
                return 0;

        /*
         * get_required_file_open_access() may return 0 because a file may be
         * opened with O_PATH.  This case will be handled with a future
         * Landlock evolution.
         */
        required = get_required_file_open_access(file);

        /*
         * More rights are looked up than open() itself needs, so that later
         * operations on the opened file can be authorized from this result.
         */
        extra = LANDLOCK_ACCESS_FS_TRUNCATE;
        if (is_device(file))
                extra |= LANDLOCK_ACCESS_FS_IOCTL_DEV;

        wanted = required | extra;

        if (is_access_to_paths_allowed(
                    dom, &file->f_path,
                    landlock_init_layer_masks(dom, wanted, &layer_masks,
                                              LANDLOCK_KEY_INODE),
                    &layer_masks, NULL, 0, NULL, NULL)) {
                /* Everything that was asked for is granted. */
                granted = wanted;
        } else {
                const unsigned long wanted_bits = wanted;
                unsigned long bit;

                /*
                 * Derive the granted rights from layer_masks: a requested
                 * right is kept only if no layer still vetoes it.
                 */
                granted = 0;
                for_each_set_bit(bit, &wanted_bits, ARRAY_SIZE(layer_masks)) {
                        if (layer_masks[bit])
                                continue;
                        granted |= BIT_ULL(bit);
                }
        }

        /*
         * Operations on an already opened file (i.e. ftruncate()) are decided
         * by the rights held at open() time, so the relevant subset is stored
         * in the opened struct file.
         */
        landlock_file(file)->allowed_access = granted;

        return (required & granted) == required ? 0 : -EACCES;
}

static int hook_file_truncate(struct file *const file)
{
        /*
         * Allows truncation if the truncate right was available at the time of
         * opening the file, to get a consistent access check as for read, write
         * and execute operations.
         *
         * Note: For checks done based on the file's Landlock allowed access, we
         * enforce them independently of whether the current thread is in a
         * Landlock domain, so that open files passed between independent
         * processes retain their behaviour.
         */
        if (landlock_file(file)->allowed_access & LANDLOCK_ACCESS_FS_TRUNCATE)
                return 0;
        return -EACCES;
}

static int hook_file_ioctl(struct file *file, unsigned int cmd,
                           unsigned long arg)
{
        const access_mask_t granted = landlock_file(file)->allowed_access;

        /*
         * Whether IOCTL may be used on an opened file is decided by the
         * access rights recorded in it by hook_file_open().
         *
         * The command is allowed when that right was granted, when the file
         * is not a device, or when is_masked_device_ioctl() accepts the
         * command.
         */
        if ((granted & LANDLOCK_ACCESS_FS_IOCTL_DEV) || !is_device(file) ||
            is_masked_device_ioctl(cmd))
                return 0;

        return -EACCES;
}

static int hook_file_ioctl_compat(struct file *file, unsigned int cmd,
                                  unsigned long arg)
{
        const access_mask_t granted = landlock_file(file)->allowed_access;

        /*
         * Whether IOCTL may be used on an opened file is decided by the
         * access rights recorded in it by hook_file_open().
         *
         * The command is allowed when that right was granted, when the file
         * is not a device, or when is_masked_device_ioctl_compat() accepts
         * the command.
         */
        if ((granted & LANDLOCK_ACCESS_FS_IOCTL_DEV) || !is_device(file) ||
            is_masked_device_ioctl_compat(cmd))
                return 0;

        return -EACCES;
}

/*
 * Table binding each LSM hook name to its Landlock filesystem handler.  It is
 * handed to security_add_hooks() by landlock_add_fs_hooks().  Entries are
 * grouped as: inode, superblock/mount, path, and file hooks.
 */
static struct security_hook_list landlock_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(inode_free_security, hook_inode_free_security),

        LSM_HOOK_INIT(sb_delete, hook_sb_delete),
        LSM_HOOK_INIT(sb_mount, hook_sb_mount),
        LSM_HOOK_INIT(move_mount, hook_move_mount),
        LSM_HOOK_INIT(sb_umount, hook_sb_umount),
        LSM_HOOK_INIT(sb_remount, hook_sb_remount),
        LSM_HOOK_INIT(sb_pivotroot, hook_sb_pivotroot),

        LSM_HOOK_INIT(path_link, hook_path_link),
        LSM_HOOK_INIT(path_rename, hook_path_rename),
        LSM_HOOK_INIT(path_mkdir, hook_path_mkdir),
        LSM_HOOK_INIT(path_mknod, hook_path_mknod),
        LSM_HOOK_INIT(path_symlink, hook_path_symlink),
        LSM_HOOK_INIT(path_unlink, hook_path_unlink),
        LSM_HOOK_INIT(path_rmdir, hook_path_rmdir),
        LSM_HOOK_INIT(path_truncate, hook_path_truncate),

        LSM_HOOK_INIT(file_alloc_security, hook_file_alloc_security),
        LSM_HOOK_INIT(file_open, hook_file_open),
        LSM_HOOK_INIT(file_truncate, hook_file_truncate),
        LSM_HOOK_INIT(file_ioctl, hook_file_ioctl),
        LSM_HOOK_INIT(file_ioctl_compat, hook_file_ioctl_compat),
};

/*
 * Registers every entry of landlock_hooks with the LSM framework under
 * landlock_lsmid.  Marked __init.
 */
__init void landlock_add_fs_hooks(void)
{
        security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks),
                           &landlock_lsmid);
}

#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST

/* clang-format off */
/* KUnit cases of this file; the empty initializer terminates the list. */
static struct kunit_case test_cases[] = {
        KUNIT_CASE(test_no_more_access),
        KUNIT_CASE(test_scope_to_request_with_exec_none),
        KUNIT_CASE(test_scope_to_request_with_exec_some),
        KUNIT_CASE(test_scope_to_request_without_access),
        KUNIT_CASE(test_is_eacces_with_none),
        KUNIT_CASE(test_is_eacces_with_refer),
        KUNIT_CASE(test_is_eacces_with_write),
        {}
};
/* clang-format on */

/* KUnit suite "landlock_fs", made of the cases listed in test_cases. */
static struct kunit_suite test_suite = {
        .name = "landlock_fs",
        .test_cases = test_cases,
};

/* Hand the suite to KUnit. */
kunit_test_suite(test_suite);

#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */












































































































































































    9 
   12 






































































































































    2 






    2 















    1 


















    2 

































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Fast and scalable bitmaps.
 *
 * Copyright (C) 2016 Facebook
 * Copyright (C) 2013-2014 Jens Axboe
 */

#ifndef __LINUX_SCALE_BITMAP_H
#define __LINUX_SCALE_BITMAP_H

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/minmax.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <linux/wait.h>

struct seq_file;

/**
 * struct sbitmap_word - Word in a &struct sbitmap.
 *
 * Both the structure and its @cleared member carry
 * ____cacheline_aligned_in_smp, so @word and @cleared do not share a
 * cacheline on SMP builds.
 */
struct sbitmap_word {
        /**
         * @word: word holding free bits
         */
        unsigned long word;

        /**
         * @cleared: word holding cleared bits
         */
        unsigned long cleared ____cacheline_aligned_in_smp;
} ____cacheline_aligned_in_smp;

/**
 * struct sbitmap - Scalable bitmap.
 *
 * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This
 * trades off higher memory usage for better scalability.
 */
struct sbitmap {
        /**
         * @depth: Number of bits used in the whole bitmap.
         */
        unsigned int depth;

        /**
         * @shift: log2(number of bits used per word)
         */
        unsigned int shift;

        /**
         * @map_nr: Number of words (cachelines) being used for the bitmap.
         */
        unsigned int map_nr;

        /**
         * @round_robin: Allocate bits in strict round-robin order.
         */
        bool round_robin;

        /**
         * @map: Allocated bitmap.
         */
        struct sbitmap_word *map;

        /**
         * @alloc_hint: Cache of last successfully allocated or freed bit.
         *
         * This is per-cpu, which allows multiple users to stick to different
         * cachelines until the map is exhausted. May be NULL, in which case
         * sbitmap_put() does not record a hint.
         */
        unsigned int __percpu *alloc_hint;
};

/*
 * Number of wait queue slots that sbq_index_inc() cycles through.  It must
 * stay a power of two because sbq_index_inc() wraps the index by masking with
 * (SBQ_WAIT_QUEUES - 1).
 */
#define SBQ_WAIT_QUEUES 8
/*
 * NOTE(review): not used within this header; presumably the upper bound for
 * sbitmap_queue.wake_batch -- confirm against the implementation.
 */
#define SBQ_WAKE_BATCH 8

/**
 * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue.
 *
 * The structure is ____cacheline_aligned_in_smp, so consecutive elements of
 * the &sbitmap_queue.ws array start on separate cachelines on SMP builds.
 */
struct sbq_wait_state {
        /**
         * @wait: Wait queue.
         */
        wait_queue_head_t wait;
} ____cacheline_aligned_in_smp;

/**
 * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free
 * bits.
 *
 * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to
 * avoid contention on the wait queue spinlock. This ensures that we don't hit a
 * scalability wall when we run out of free bits and have to start putting tasks
 * to sleep.
 */
struct sbitmap_queue {
        /**
         * @sb: Scalable bitmap.
         */
        struct sbitmap sb;

        /**
         * @wake_batch: Number of bits which must be freed before we wake up any
         * waiters.
         */
        unsigned int wake_batch;

        /**
         * @wake_index: Next wait queue in @ws to wake up.
         */
        atomic_t wake_index;

        /**
         * @ws: Wait queues.
         */
        struct sbq_wait_state *ws;

        /**
         * @ws_active: Count of currently active ws waitqueues.
         */
        atomic_t ws_active;

        /**
         * @min_shallow_depth: The minimum shallow depth which may be passed to
         * sbitmap_queue_get_shallow()
         */
        unsigned int min_shallow_depth;

        /**
         * @completion_cnt: Number of bits cleared passed to the
         * wakeup function.
         */
        atomic_t completion_cnt;

        /**
         * @wakeup_cnt: Number of thread wake ups issued.
         */
        atomic_t wakeup_cnt;
};

/**
 * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node.
 * @sb: Bitmap to initialize.
 * @depth: Number of bits to allocate.
 * @shift: Use 2^@shift bits per word in the bitmap; if a negative number is
 *         given, a good default is chosen.
 * @flags: Allocation flags.
 * @node: Memory node to allocate on.
 * @round_robin: If true, be stricter about allocation order; always allocate
 *               starting from the last allocated bit. This is less efficient
 *               than the default behavior (false).
 * @alloc_hint: If true, apply percpu hint for where to start searching for
 *              a free bit.
 *
 * Return: Zero on success or negative errno on failure.
 */
int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
                      gfp_t flags, int node, bool round_robin, bool alloc_hint);

/* sbitmap internal helper */
static inline unsigned int __map_depth(const struct sbitmap *sb, int index)
{
        if (index == sb->map_nr - 1)
                return sb->depth - (index << sb->shift);
        return 1U << sb->shift;
}

/**
 * sbitmap_free() - Free memory used by a &struct sbitmap.
 * @sb: Bitmap to free.
 *
 * Releases the per-cpu allocation hint and the word array, then resets both
 * pointers to NULL. Clearing @sb->alloc_hint as well as @sb->map keeps a
 * repeated call from freeing the per-cpu hint twice, and makes sbitmap_put()
 * skip the hint update instead of writing through a stale pointer.
 */
static inline void sbitmap_free(struct sbitmap *sb)
{
        free_percpu(sb->alloc_hint);
        sb->alloc_hint = NULL;
        kvfree(sb->map);
        sb->map = NULL;
}

/**
 * sbitmap_resize() - Resize a &struct sbitmap.
 * @sb: Bitmap to resize.
 * @depth: New number of bits to resize to.
 *
 * Doesn't reallocate anything. It's up to the caller to ensure that the new
 * depth doesn't exceed the depth that the sb was initialized with.
 */
void sbitmap_resize(struct sbitmap *sb, unsigned int depth);

/**
 * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap.
 * @sb: Bitmap to allocate from.
 *
 * This operation provides acquire barrier semantics if it succeeds.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
int sbitmap_get(struct sbitmap *sb);

/**
 * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
 * limiting the depth used from each word.
 * @sb: Bitmap to allocate from.
 * @shallow_depth: The maximum number of bits to allocate from a single word.
 *
 * This rather specific operation allows for having multiple users with
 * different allocation limits. E.g., there can be a high-priority class that
 * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
 * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
 * class can only allocate half of the total bits in the bitmap, preventing it
 * from starving out the high-priority class.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth);

/**
 * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
 * @sb: Bitmap to check.
 *
 * Return: true if any bit in the bitmap is set, false otherwise.
 */
bool sbitmap_any_bit_set(const struct sbitmap *sb);

/* Index into sb->map of the word that holds bit number @bitnr. */
#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
/* Position of bit number @bitnr inside its word (the low sb->shift bits). */
#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))

/*
 * Callback type for the sbitmap iterators: called with the bitmap, the number
 * of a set bit, and the caller's data pointer.  Returning false stops the
 * iteration.
 */
typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);

/**
 * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
 * @start: Where to start the iteration.
 * @sb: Bitmap to iterate over.
 * @fn: Callback. Should return true to continue or false to break early.
 * @data: Pointer to pass to callback.
 *
 * This is inline even though it's non-trivial so that the function calls to the
 * callback will hopefully get optimized away.
 */
static inline void __sbitmap_for_each_set(struct sbitmap *sb,
                                          unsigned int start,
                                          sb_for_each_fn fn, void *data)
{
        unsigned int index;
        unsigned int nr;
        /* Bits covered so far; the walk ends once sb->depth bits were seen. */
        unsigned int scanned = 0;

        /* An out-of-range starting point falls back to bit 0. */
        if (start >= sb->depth)
                start = 0;
        index = SB_NR_TO_INDEX(sb, start);
        nr = SB_NR_TO_BIT(sb, start);

        while (scanned < sb->depth) {
                unsigned long word;
                /*
                 * Bits to cover in this word: what is left of the word from
                 * nr onwards, capped by what is left of the whole walk.
                 */
                unsigned int depth = min_t(unsigned int,
                                           __map_depth(sb, index) - nr,
                                           sb->depth - scanned);

                scanned += depth;
                /* Bits already marked in ->cleared are not reported. */
                word = sb->map[index].word & ~sb->map[index].cleared;
                if (!word)
                        goto next;

                /*
                 * On the first iteration of the outer loop, we need to add the
                 * bit offset back to the size of the word for find_next_bit().
                 * On all other iterations, nr is zero, so this is a noop.
                 */
                depth += nr;
                while (1) {
                        nr = find_next_bit(&word, depth, nr);
                        if (nr >= depth)
                                break;
                        /* The callback gets the bitmap-wide bit number. */
                        if (!fn(sb, (index << sb->shift) + nr, data))
                                return;

                        nr++;
                }
next:
                /* Later words are scanned from bit 0; wrap past the last word. */
                nr = 0;
                if (++index >= sb->map_nr)
                        index = 0;
        }
}

/**
 * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
 * @sb: Bitmap to iterate over.
 * @fn: Callback. Should return true to continue or false to break early.
 * @data: Pointer to pass to callback.
 *
 * Same as __sbitmap_for_each_set() with the iteration starting at bit 0.
 */
static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
                                        void *data)
{
        const unsigned int first_bit = 0;

        __sbitmap_for_each_set(sb, first_bit, fn, data);
}

static inline unsigned long *__sbitmap_word(struct sbitmap *sb,
                                            unsigned int bitnr)
{
        return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word;
}

/* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */

/* set_bit() applied to bit number @bitnr of the bitmap. */
static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr)
{
        unsigned long *const word = __sbitmap_word(sb, bitnr);

        set_bit(SB_NR_TO_BIT(sb, bitnr), word);
}

/* clear_bit() applied to bit number @bitnr of the bitmap. */
static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
{
        unsigned long *const word = __sbitmap_word(sb, bitnr);

        clear_bit(SB_NR_TO_BIT(sb, bitnr), word);
}

/*
 * This one is special, since it doesn't actually clear the bit, rather it
 * sets the corresponding bit in the ->cleared mask instead. Paired with
 * the caller doing sbitmap_deferred_clear() if a given index is full, which
 * will clear the previously freed entries in the corresponding ->word.
 */
static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
{
        unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;

        set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
}

/*
 * Counterpart of sbitmap_get(): marks the bit as cleared (deferred) and, when
 * applicable, records it as the allocation hint of the current CPU.
 */
static inline void sbitmap_put(struct sbitmap *sb, unsigned int bitnr)
{
        sbitmap_deferred_clear_bit(sb, bitnr);

        /*
         * No hint is recorded without a hint array, in round-robin mode, or
         * for a bit number beyond the current depth.
         */
        if (unlikely(!sb->alloc_hint || sb->round_robin || bitnr >= sb->depth))
                return;

        *raw_cpu_ptr(sb->alloc_hint) = bitnr;
}

/* test_bit() applied to bit number @bitnr of the bitmap (->word only). */
static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
{
        unsigned long *const word = __sbitmap_word(sb, bitnr);

        return test_bit(SB_NR_TO_BIT(sb, bitnr), word);
}

/*
 * Picks the default log2(bits per word) for a bitmap of @depth bits, starting
 * from a full unsigned long per word.
 */
static inline int sbitmap_calculate_shift(unsigned int depth)
{
        int shift = ilog2(BITS_PER_LONG);

        /*
         * With fewer than 4 bits there is no point in tuning: it is not going
         * to work optimally anyway, so keep the full word.
         */
        if (depth < 4)
                return shift;

        /*
         * For a small bitmap, reduce the number of bits per word so that the
         * bits are spread over at least a few cachelines.
         */
        while ((4U << shift) > depth)
                shift--;

        return shift;
}

/**
 * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file.
 * @sb: Bitmap to show.
 * @m: struct seq_file to write to.
 *
 * This is intended for debugging. The format may change at any time.
 */
void sbitmap_show(struct sbitmap *sb, struct seq_file *m);


/**
 * sbitmap_weight() - Return how many set and not cleared bits in a &struct
 * sbitmap.
 * @sb: Bitmap to check.
 *
 * Return: Number of bits that are set and not cleared.
 */
unsigned int sbitmap_weight(const struct sbitmap *sb);

/**
 * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct
 * seq_file.
 * @sb: Bitmap to show.
 * @m: struct seq_file to write to.
 *
 * This is intended for debugging. The output isn't guaranteed to be internally
 * consistent.
 */
void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m);

/**
 * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific
 * memory node.
 * @sbq: Bitmap queue to initialize.
 * @depth: See sbitmap_init_node().
 * @shift: See sbitmap_init_node().
 * @round_robin: See sbitmap_init_node().
 * @flags: Allocation flags.
 * @node: Memory node to allocate on.
 *
 * Return: Zero on success or negative errno on failure.
 */
int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
                            int shift, bool round_robin, gfp_t flags, int node);

/**
 * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue.
 *
 * @sbq: Bitmap queue to free.
 *
 * Frees the wait queue array first, then the embedded &struct sbitmap.
 */
static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
{
        struct sbitmap *const bitmap = &sbq->sb;

        kfree(sbq->ws);
        sbitmap_free(bitmap);
}

/**
 * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
 * @sbq: Bitmap queue to recalculate wake batch.
 * @users: Number of shares.
 *
 * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
 * by depth. This interface is for HCTX shared tags or queue shared tags.
 */
void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
                                            unsigned int users);

/**
 * sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
 * @sbq: Bitmap queue to resize.
 * @depth: New number of bits to resize to.
 *
 * Like sbitmap_resize(), this doesn't reallocate anything. It has to do
 * some extra work on the &struct sbitmap_queue, so it's not safe to just
 * resize the underlying &struct sbitmap.
 */
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);

/**
 * __sbitmap_queue_get() - Try to allocate a free bit from a &struct
 * sbitmap_queue with preemption already disabled.
 * @sbq: Bitmap queue to allocate from.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
int __sbitmap_queue_get(struct sbitmap_queue *sbq);

/**
 * __sbitmap_queue_get_batch() - Try to allocate a batch of free bits
 * @sbq: Bitmap queue to allocate from.
 * @nr_tags: number of tags requested
 * @offset: offset to add to returned bits
 *
 * Return: Mask of allocated tags, 0 if none are found. Each tag allocated is
 * a bit in the mask returned, and the caller must add @offset to the value to
 * get the absolute tag value.
 */
unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
                                        unsigned int *offset);

/**
 * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
 * sbitmap_queue, limiting the depth used from each word, with preemption
 * already disabled.
 * @sbq: Bitmap queue to allocate from.
 * @shallow_depth: The maximum number of bits to allocate from a single word.
 * See sbitmap_get_shallow().
 *
 * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
 * initializing @sbq.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
                              unsigned int shallow_depth);

/**
 * sbitmap_queue_get() - Try to allocate a free bit from a &struct
 * sbitmap_queue.
 * @sbq: Bitmap queue to allocate from.
 * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
 *       sbitmap_queue_clear()).
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
                                    unsigned int *cpu)
{
        int tag;

        /*
         * __sbitmap_queue_get() is documented as running with preemption
         * already disabled, hence the get_cpu()/put_cpu() bracket around it.
         */
        *cpu = get_cpu();
        tag = __sbitmap_queue_get(sbq);
        put_cpu();

        return tag;
}

/**
 * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the
 * minimum shallow depth that will be used.
 * @sbq: Bitmap queue in question.
 * @min_shallow_depth: The minimum shallow depth that will be passed to
 * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
 *
 * sbitmap_queue_clear() batches wakeups as an optimization. The batch size
 * depends on the depth of the bitmap. Since the shallow allocation functions
 * effectively operate with a different depth, the shallow depth must be taken
 * into account when calculating the batch size. This function must be called
 * with the minimum shallow depth that will be used. Failure to do so can result
 * in missed wakeups.
 */
void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
                                     unsigned int min_shallow_depth);

/**
 * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
 * &struct sbitmap_queue.
 * @sbq: Bitmap to free from.
 * @nr: Bit number to free.
 * @cpu: CPU the bit was allocated on.
 */
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
                         unsigned int cpu);

/**
 * sbitmap_queue_clear_batch() - Free a batch of allocated bits in a
 * &struct sbitmap_queue.
 * @sbq: Bitmap to free from.
 * @offset: offset for each tag in array
 * @tags: array of tags
 * @nr_tags: number of tags in array
 */
void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
                                int *tags, int nr_tags);

/*
 * Returns the wait queue index following @index, wrapping by masking with
 * (SBQ_WAIT_QUEUES - 1).
 */
static inline int sbq_index_inc(int index)
{
        const int wrap_mask = SBQ_WAIT_QUEUES - 1;

        return (index + 1) & wrap_mask;
}

/*
 * Advances *@index to the next wait queue index with a single cmpxchg.  The
 * result of the cmpxchg is not checked, so nothing is retried if *@index
 * changed in the meantime.
 */
static inline void sbq_index_atomic_inc(atomic_t *index)
{
        const int seen = atomic_read(index);

        atomic_cmpxchg(index, seen, sbq_index_inc(seen));
}

/**
 * sbq_wait_ptr() - Get the next wait queue to use for a &struct
 * sbitmap_queue.
 * @sbq: Bitmap queue to wait on.
 * @wait_index: A counter per "user" of @sbq.
 *
 * Return: The wait queue selected by the current value of @wait_index, which
 * is then advanced for the next call.
 */
static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
                                                  atomic_t *wait_index)
{
        /* Sample the slot before moving the counter on. */
        const int slot = atomic_read(wait_index);

        sbq_index_atomic_inc(wait_index);
        return &sbq->ws[slot];
}

/**
 * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct
 * sbitmap_queue.
 * @sbq: Bitmap queue to wake up.
 */
void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);

/**
 * sbitmap_queue_wake_up() - Wake up some of the waiters in one waitqueue
 * on a &struct sbitmap_queue.
 * @sbq: Bitmap queue to wake up.
 * @nr: Number of bits cleared.
 */
void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr);

/**
 * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct
 * seq_file.
 * @sbq: Bitmap queue to show.
 * @m: struct seq_file to write to.
 *
 * This is intended for debugging. The format may change at any time.
 */
void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);

/*
 * Wait entry passed to the sbitmap wait helpers declared in this header
 * (sbitmap_prepare_to_wait(), sbitmap_add_wait_queue(), ...): a regular
 * wait_queue_entry paired with a back-pointer to a sbitmap_queue.
 */
struct sbq_wait {
        struct sbitmap_queue *sbq;        /* if set, sbq_wait is accounted */
        struct wait_queue_entry wait;
};

/*
 * Define a struct sbq_wait variable called @name with its initial state:
 * .sbq starts out NULL, and the embedded wait entry gets the current task as
 * .private, autoremove_wake_function as its wake callback and a
 * self-referencing (empty) list head.
 */
#define DEFINE_SBQ_WAIT(name)                                                        \
        struct sbq_wait name = {                                                \
                .sbq = NULL,                                                        \
                .wait = {                                                        \
                        .private        = current,                                \
                        .func                = autoremove_wake_function,                \
                        .entry                = LIST_HEAD_INIT((name).wait.entry),        \
                }                                                                \
        }

/*
 * Wrapper around prepare_to_wait_exclusive(), which maintains some extra
 * internal state.
 */
void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
                                struct sbq_wait_state *ws,
                                struct sbq_wait *sbq_wait, int state);

/*
 * Must be paired with sbitmap_prepare_to_wait().
 */
void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
                                struct sbq_wait *sbq_wait);

/*
 * Wrapper around add_wait_queue(), which maintains some extra internal state
 */
void sbitmap_add_wait_queue(struct sbitmap_queue *sbq,
                            struct sbq_wait_state *ws,
                            struct sbq_wait *sbq_wait);

/*
 * Must be paired with sbitmap_add_wait_queue()
 */
void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait);

#endif /* __LINUX_SCALE_BITMAP_H */













































































































































































































    3 














    3 































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _SCSI_DISK_H
#define _SCSI_DISK_H

/*
 * More than enough for everybody ;)  The huge number of majors
 * is a leftover from 16bit dev_t days, we don't really need that
 * much numberspace.
 */
#define SD_MAJORS        16

/*
 * Time out in seconds for disks and Magneto-opticals (which are slower).
 */
#define SD_TIMEOUT                (30 * HZ)
#define SD_MOD_TIMEOUT                (75 * HZ)
/*
 * Flush timeout is a multiplier over the standard device timeout which is
 * user modifiable via sysfs but initially set to SD_TIMEOUT
 */
#define SD_FLUSH_TIMEOUT_MULTIPLIER        2
#define SD_WRITE_SAME_TIMEOUT        (120 * HZ)

/*
 * Number of allowed retries
 */
#define SD_MAX_RETRIES                5
#define SD_PASSTHROUGH_RETRIES        1
#define SD_MAX_MEDIUM_TIMEOUTS        2

/*
 * Size of the initial data buffer for mode and read capacity data
 */
#define SD_BUF_SIZE                512

/*
 * Number of sectors at the end of the device to avoid multi-sector
 * accesses to in the case of last_sector_bug
 */
#define SD_LAST_BUGGY_SECTORS        8

/* Sizing constants for extended CDBs and the pool they are allocated from. */
enum {
        SD_EXT_CDB_SIZE = 32,        /* Extended CDB size */
        SD_MEMPOOL_SIZE = 2,        /* CDB pool size */
};

/*
 * Transfer-size limits. NOTE(review): going by the names these are counts of
 * logical blocks, with the WS10/WS16 values being the caps for
 * WRITE SAME(10) and WRITE SAME(16) - confirm against the users in sd.c.
 */
enum {
        SD_DEF_XFER_BLOCKS = 0xffff,
        SD_MAX_XFER_BLOCKS = 0xffffffff,
        SD_MAX_WS10_BLOCKS = 0xffff,
        SD_MAX_WS16_BLOCKS = 0x7fffff,
};

/*
 * Logical block provisioning (discard) strategies.
 * NOTE(review): presumably the values stored in scsi_disk.provisioning_mode -
 * confirm against the users in sd.c.
 */
enum {
        SD_LBP_FULL = 0,        /* Full logical block provisioning */
        SD_LBP_UNMAP,                /* Use UNMAP command */
        SD_LBP_WS16,                /* Use WRITE SAME(16) with UNMAP bit */
        SD_LBP_WS10,                /* Use WRITE SAME(10) with UNMAP bit */
        SD_LBP_ZERO,                /* Use WRITE SAME(10) with zero payload */
        SD_LBP_DISABLE,                /* Discard disabled due to failed cmd */
};

/*
 * Strategies for writing zeroes to the device.
 * NOTE(review): presumably the values stored in scsi_disk.zeroing_mode -
 * confirm against the users in sd.c.
 */
enum {
        SD_ZERO_WRITE = 0,        /* Use WRITE(10/16) command */
        SD_ZERO_WS,                /* Use WRITE SAME(10/16) command */
        SD_ZERO_WS16_UNMAP,        /* Use WRITE SAME(16) with UNMAP */
        SD_ZERO_WS10_UNMAP,        /* Use WRITE SAME(10) with UNMAP */
};

/**
 * struct zoned_disk_info - Specific properties of a ZBC SCSI device.
 * @nr_zones: number of zones.
 * @zone_blocks: number of logical blocks per zone.
 *
 * This data structure holds the ZBC SCSI device properties that are retrieved
 * twice: a first time before the gendisk capacity is known and a second time
 * after the gendisk capacity is known.
 */
struct zoned_disk_info {
        u32                nr_zones;        /* number of zones */
        u32                zone_blocks;        /* logical blocks per zone */
};

/*
 * Per-disk state kept by the SCSI disk driver.
 *
 * Ties together the underlying scsi_device, the gendisk and a sysfs-only
 * struct device, and carries transfer/provisioning limits plus a set of
 * bit-field flags. The zone-related members exist only when
 * CONFIG_BLK_DEV_ZONED is enabled.
 */
struct scsi_disk {
        struct scsi_device *device;

        /*
         * disk_dev is used to show attributes in /sys/class/scsi_disk/,
         * but otherwise not really needed.  Do not use for refcounting.
         */
        struct device        disk_dev;
        struct gendisk        *disk;
        struct opal_dev *opal_dev;
#ifdef CONFIG_BLK_DEV_ZONED
        /* Updated during revalidation before the gendisk capacity is known. */
        struct zoned_disk_info        early_zone_info;
        /* Updated during revalidation after the gendisk capacity is known. */
        struct zoned_disk_info        zone_info;
        u32                zones_optimal_open;
        u32                zones_optimal_nonseq;
        u32                zones_max_open;
        /*
         * Either zero or a power of two. If not zero it means that the offset
         * between zone starting LBAs is constant.
         */
        u32                zone_starting_lba_gran;
#endif
        atomic_t        openers;
        sector_t        capacity;        /* size in logical blocks */
        int                max_retries;
        /*
         * NOTE(review): the *_blocks limits below are presumably expressed in
         * logical blocks, like capacity - confirm against sd.c.
         */
        u32                min_xfer_blocks;
        u32                max_xfer_blocks;
        u32                opt_xfer_blocks;
        u32                max_ws_blocks;
        u32                max_unmap_blocks;
        u32                unmap_granularity;
        u32                unmap_alignment;
        u32                index;
        unsigned int        physical_block_size;
        unsigned int        max_medium_access_timeouts;
        unsigned int        medium_access_timed_out;
                        /* number of permanent streams */
        u16                permanent_stream_count;
        u8                media_present;
        u8                write_prot;
        u8                protection_type;/* Data Integrity Field */
        u8                provisioning_mode;
        u8                zeroing_mode;
        u8                nr_actuators;                /* Number of actuators */
        bool                suspended;        /* Disk is suspended (stopped) */
        /*
         * Bit-field flags. rc_basis and zoned are two bits wide; every other
         * flag is a single bit.
         */
        unsigned        ATO : 1;        /* state of disk ATO bit */
        unsigned        cache_override : 1; /* temp override of WCE,RCD */
        unsigned        WCE : 1;        /* state of disk WCE bit */
        unsigned        RCD : 1;        /* state of disk RCD bit, unused */
        unsigned        DPOFUA : 1;        /* state of disk DPOFUA bit */
        unsigned        first_scan : 1;
        unsigned        lbpme : 1;
        unsigned        lbprz : 1;
        unsigned        lbpu : 1;
        unsigned        lbpws : 1;
        unsigned        lbpws10 : 1;
        unsigned        lbpvpd : 1;
        unsigned        ws10 : 1;
        unsigned        ws16 : 1;
        unsigned        rc_basis: 2;
        unsigned        zoned: 2;
        unsigned        urswrz : 1;
        unsigned        security : 1;
        unsigned        ignore_medium_access_errors : 1;
        unsigned        rscs : 1; /* reduced stream control support */
};
/* Map a pointer to an embedded disk_dev member back to its struct scsi_disk. */
#define to_scsi_disk(obj) container_of(obj, struct scsi_disk, disk_dev)

/* Fetch the struct scsi_disk stored in a gendisk's private_data pointer. */
static inline struct scsi_disk *scsi_disk(struct gendisk *disk)
{
        struct scsi_disk *sdkp = disk->private_data;

        return sdkp;
}

/*
 * Log a message for a scsi_disk.
 *
 * When (sdsk)->disk is set the message goes through sdev_prefix_printk()
 * with the gendisk's disk_name as prefix; otherwise it falls back to plain
 * sdev_printk() on the underlying device.
 *
 * The expansion is a bare conditional expression and @sdsk is evaluated more
 * than once, so pass an argument without side effects.
 */
#define sd_printk(prefix, sdsk, fmt, a...)                                \
        (sdsk)->disk ?                                                        \
              sdev_prefix_printk(prefix, (sdsk)->device,                \
                                 (sdsk)->disk->disk_name, fmt, ##a) :        \
              sdev_printk(prefix, (sdsk)->device, fmt, ##a)

/*
 * Like sd_printk(), but only emits the message while the disk's first_scan
 * flag is set. Wrapped in do { } while (0) so it behaves as one statement.
 */
#define sd_first_printk(prefix, sdsk, fmt, a...)                        \
        do {                                                                \
                if ((sdsk)->first_scan)                                        \
                        sd_printk(prefix, sdsk, fmt, ##a);                \
        } while (0)

/*
 * Tell whether a command's opcode is one of the listed read, write, verify,
 * cache-synchronize, write-same or unmap opcodes.
 *
 * Returns 1 for a listed opcode and 0 for anything else.
 */
static inline int scsi_medium_access_command(struct scsi_cmnd *scmd)
{
        int medium_access = 0;

        switch (scmd->cmnd[0]) {
        case VARIABLE_LENGTH_CMD:
                /* Only for this opcode is byte 9 consulted (the *_32 forms). */
                switch (scmd->cmnd[9]) {
                case READ_32:
                case VERIFY_32:
                case WRITE_32:
                case WRITE_SAME_32:
                        medium_access = 1;
                        break;
                default:
                        break;
                }
                break;
        case READ_6:
        case READ_10:
        case READ_12:
        case READ_16:
        case SYNCHRONIZE_CACHE:
        case VERIFY:
        case VERIFY_12:
        case VERIFY_16:
        case WRITE_6:
        case WRITE_10:
        case WRITE_12:
        case WRITE_16:
        case WRITE_SAME:
        case WRITE_SAME_16:
        case UNMAP:
                medium_access = 1;
                break;
        default:
                break;
        }

        return medium_access;
}

/*
 * Scale a logical-block count up by shifting left by
 * ilog2(sector_size) - 9, i.e. into 512-byte units (assumes sector_size is a
 * power of two no smaller than 512 - TODO confirm).
 */
static inline sector_t logical_to_sectors(struct scsi_device *sdev, sector_t blocks)
{
        const int shift = ilog2(sdev->sector_size) - 9;

        return blocks << shift;
}

/*
 * Multiply a logical-block count by the device's sector_size.
 *
 * The product is narrowed to unsigned int on return, so the caller must keep
 * the result within that range.
 */
static inline unsigned int logical_to_bytes(struct scsi_device *sdev, sector_t blocks)
{
        sector_t bytes = blocks * sdev->sector_size;

        return bytes;
}

/*
 * Scale a byte count down by shifting right by ilog2(sector_size); any
 * remainder below one logical block is dropped.
 */
static inline sector_t bytes_to_logical(struct scsi_device *sdev, unsigned int bytes)
{
        const int shift = ilog2(sdev->sector_size);

        return bytes >> shift;
}

/*
 * Inverse of the left shift used by logical_to_sectors(): shift a sector
 * number right by ilog2(sector_size) - 9, dropping any remainder.
 */
static inline sector_t sectors_to_logical(struct scsi_device *sdev, sector_t sector)
{
        const int shift = ilog2(sdev->sector_size) - 9;

        return sector >> shift;
}

#ifdef CONFIG_BLK_DEV_INTEGRITY

extern void sd_dif_config_host(struct scsi_disk *);

#else /* CONFIG_BLK_DEV_INTEGRITY */

/* No-op stand-in used when CONFIG_BLK_DEV_INTEGRITY is not enabled. */
static inline void sd_dif_config_host(struct scsi_disk *disk)
{
}

#endif /* CONFIG_BLK_DEV_INTEGRITY */

/*
 * Non-zero when the disk's zoned field equals 1 or the underlying device
 * type is TYPE_ZBC; zero otherwise.
 */
static inline int sd_is_zoned(struct scsi_disk *sdkp)
{
        if (sdkp->zoned == 1)
                return 1;

        return sdkp->device->type == TYPE_ZBC;
}

#ifdef CONFIG_BLK_DEV_ZONED

int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]);
int sd_zbc_revalidate_zones(struct scsi_disk *sdkp);
blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
                                         unsigned char op, bool all);
unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
                             struct scsi_sense_hdr *sshdr);
int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
                unsigned int nr_zones, report_zones_cb cb, void *data);

#else /* CONFIG_BLK_DEV_ZONED */

/* Stub for builds without CONFIG_BLK_DEV_ZONED: does nothing, returns 0. */
static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
{
        return 0;
}

/* Stub for builds without CONFIG_BLK_DEV_ZONED: does nothing, returns 0. */
static inline int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
{
        return 0;
}

/*
 * Stub for builds without CONFIG_BLK_DEV_ZONED: no command is set up and
 * every call reports BLK_STS_TARGET.
 */
static inline blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
                                                       unsigned char op,
                                                       bool all)
{
        return BLK_STS_TARGET;
}

/*
 * Stub for builds without CONFIG_BLK_DEV_ZONED: performs no zone-specific
 * completion handling and hands @good_bytes back unchanged.
 */
static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd,
                        unsigned int good_bytes, struct scsi_sense_hdr *sshdr)
{
        return good_bytes;
}

/* Without CONFIG_BLK_DEV_ZONED the report-zones callback name is just NULL. */
#define sd_zbc_report_zones NULL

#endif /* CONFIG_BLK_DEV_ZONED */

void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr);
void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result);

#endif /* _SCSI_DISK_H */





























































































































































































































































































































































































    3 






















































































































































    3 






























    3 
















    3 





























    3 
    3 


    3 






    3 




    3 
    3 


    3 






    3 








    3 



















































    3 




    3 







































    3 

    3 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 




    3 























    3 



    3 


    3 








































































































    3 



    3 












    3 






    3 













































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
// SPDX-License-Identifier: GPL-2.0-or-later
/* Virtio ring implementation.
 *
 *  Copyright 2007 Rusty Russell IBM Corporation
 */
#include <linux/virtio.h>
#include <linux/virtio_ring.h>
#include <linux/virtio_config.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/dma-mapping.h>
#include <linux/kmsan.h>
#include <linux/spinlock.h>
#include <xen/xen.h>

#ifdef DEBUG
/*
 * For development, we want to crash whenever the ring is screwed.
 *
 * Logs the message through dev_err() on the virtqueue's device, prefixed
 * with the virtqueue name, and then hits BUG(). @_vq is evaluated twice.
 */
#define BAD_RING(_vq, fmt, args...)                                \
        do {                                                        \
                dev_err(&(_vq)->vq.vdev->dev,                        \
                        "%s:"fmt, (_vq)->vq.name, ##args);        \
                BUG();                                                \
        } while (0)
/* Caller is supposed to guarantee no reentry. */
#define START_USE(_vq)                                                \
        do {                                                        \
                if ((_vq)->in_use)                                \
                        panic("%s:in_use = %i\n",                \
                              (_vq)->vq.name, (_vq)->in_use);        \
                (_vq)->in_use = __LINE__;                        \
        } while (0)
#define END_USE(_vq) \
        do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while(0)
#define LAST_ADD_TIME_UPDATE(_vq)                                \
        do {                                                        \
                ktime_t now = ktime_get();                        \
                                                                \
                /* No kick or get, with .1 second between?  Warn. */ \
                if ((_vq)->last_add_time_valid)                        \
                        WARN_ON(ktime_to_ms(ktime_sub(now,        \
                                (_vq)->last_add_time)) > 100);        \
                (_vq)->last_add_time = now;                        \
                (_vq)->last_add_time_valid = true;                \
        } while (0)
#define LAST_ADD_TIME_CHECK(_vq)                                \
        do {                                                        \
                if ((_vq)->last_add_time_valid) {                \
                        WARN_ON(ktime_to_ms(ktime_sub(ktime_get(), \
                                      (_vq)->last_add_time)) > 100); \
                }                                                \
        } while (0)
#define LAST_ADD_TIME_INVALID(_vq)                                \
        ((_vq)->last_add_time_valid = false)
#else
#define BAD_RING(_vq, fmt, args...)                                \
        do {                                                        \
                dev_err(&_vq->vq.vdev->dev,                        \
                        "%s:"fmt, (_vq)->vq.name, ##args);        \
                (_vq)->broken = true;                                \
        } while (0)
#define START_USE(vq)
#define END_USE(vq)
#define LAST_ADD_TIME_UPDATE(vq)
#define LAST_ADD_TIME_CHECK(vq)
#define LAST_ADD_TIME_INVALID(vq)
#endif

/*
 * Driver-side state kept per descriptor of a split ring.  Only the entry
 * at a buffer's head index is filled in when a buffer is added.
 */
struct vring_desc_state_split {
        void *data;                        /* Data for callback. */
        /*
         * Indirect descriptor table, if any.  When the buffer is not
         * indirect, this slot holds the caller-supplied context pointer
         * instead.
         */
        struct vring_desc *indir_desc;
};

/* Driver-side state kept per descriptor of a packed ring. */
struct vring_desc_state_packed {
        void *data;                        /* Data for callback. */
        struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */
        u16 num;                        /* Descriptor list length. */
        u16 last;                        /* The last desc state in a list. */
};

/*
 * Driver-private copy of the fields written into a ring descriptor.
 * The unmap helpers and the free-list walk read these values instead of
 * the descriptor itself.
 */
struct vring_desc_extra {
        dma_addr_t addr;                /* Descriptor DMA addr. */
        u32 len;                        /* Descriptor length. */
        u16 flags;                        /* Descriptor flags. */
        u16 next;                        /* The next desc state in a list. */
};

/* State specific to a split virtqueue. */
struct vring_virtqueue_split {
        /* Actual memory layout for this queue. */
        struct vring vring;

        /* Last written value to avail->flags */
        u16 avail_flags_shadow;

        /*
         * Last written value to avail->idx in
         * guest byte order.
         */
        u16 avail_idx_shadow;

        /* Per-descriptor state. */
        struct vring_desc_state_split *desc_state;
        struct vring_desc_extra *desc_extra;

        /* DMA address and size information */
        dma_addr_t queue_dma_addr;
        size_t queue_size_in_bytes;

        /*
         * The parameters for creating vrings are reserved for creating new
         * vring.
         */
        u32 vring_align;
        bool may_reduce_num;
};

/* State specific to a packed virtqueue. */
struct vring_virtqueue_packed {
        /* Actual memory layout for this queue. */
        struct {
                unsigned int num;
                struct vring_packed_desc *desc;
                struct vring_packed_desc_event *driver;
                struct vring_packed_desc_event *device;
        } vring;

        /* Driver ring wrap counter. */
        bool avail_wrap_counter;

        /* Avail used flags. */
        u16 avail_used_flags;

        /* Index of the next avail descriptor. */
        u16 next_avail_idx;

        /*
         * Last written value to driver->flags in
         * guest byte order.
         */
        u16 event_flags_shadow;

        /* Per-descriptor state. */
        struct vring_desc_state_packed *desc_state;
        struct vring_desc_extra *desc_extra;

        /* DMA address and size information */
        dma_addr_t ring_dma_addr;
        dma_addr_t driver_event_dma_addr;
        dma_addr_t device_event_dma_addr;
        size_t ring_size_in_bytes;
        size_t event_size_in_bytes;
};

/*
 * Ring-backed virtqueue.  Embeds the generic struct virtqueue (see
 * to_vvq()) and carries either split-ring or packed-ring state in the
 * anonymous union, selected by @packed_ring.
 */
struct vring_virtqueue {
        struct virtqueue vq;

        /* Is this a packed ring? */
        bool packed_ring;

        /* Is DMA API used? */
        bool use_dma_api;

        /* Can we use weak barriers? */
        bool weak_barriers;

        /* Other side has made a mess, don't try any more. */
        bool broken;

        /* Host supports indirect buffers */
        bool indirect;

        /* Host publishes avail event idx */
        bool event;

        /* Do DMA mapping by driver */
        bool premapped;

        /* Do unmap or not for desc. Just when premapped is False and
         * use_dma_api is true, this is true.
         */
        bool do_unmap;

        /* Head of free buffer list. */
        unsigned int free_head;
        /* Number we've added since last sync. */
        unsigned int num_added;

        /* Last used index we've seen.
         * for split ring, it just contains last used index
         * for packed ring:
         * bits up to VRING_PACKED_EVENT_F_WRAP_CTR include the last used index.
         * bits from VRING_PACKED_EVENT_F_WRAP_CTR include the used wrap counter.
         */
        u16 last_used_idx;

        /* Hint for event idx: already triggered no need to disable. */
        bool event_triggered;

        union {
                /* Available for split ring */
                struct vring_virtqueue_split split;

                /* Available for packed ring */
                struct vring_virtqueue_packed packed;
        };

        /* How to notify other side. FIXME: commonalize hcalls! */
        bool (*notify)(struct virtqueue *vq);

        /* DMA, allocation, and size information */
        bool we_own_ring;

        /* Device used for doing DMA */
        struct device *dma_dev;

#ifdef DEBUG
        /* They're supposed to lock for us. */
        unsigned int in_use;

        /* Figure out if their kicks are too delayed. */
        bool last_add_time_valid;
        ktime_t last_add_time;
#endif
};

/* Forward declarations of static helpers referenced before their definitions. */
static struct virtqueue *__vring_new_virtqueue(unsigned int index,
                                               struct vring_virtqueue_split *vring_split,
                                               struct virtio_device *vdev,
                                               bool weak_barriers,
                                               bool context,
                                               bool (*notify)(struct virtqueue *),
                                               void (*callback)(struct virtqueue *),
                                               const char *name,
                                               struct device *dma_dev);
static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num);
static void vring_free(struct virtqueue *_vq);

/*
 * Helpers.
 */

/* Get the struct vring_virtqueue that embeds the given struct virtqueue. */
#define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq)

static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
                                   unsigned int total_sg)
{
        /*
         * If the host supports indirect descriptor tables, and we have multiple
         * buffers, then go indirect. FIXME: tune this threshold
         */
        return (vq->indirect && total_sg > 1 && vq->vq.num_free);
}

/*
 * Modern virtio devices have feature bits to specify whether they need a
 * quirk and bypass the IOMMU. If not there, just use the DMA API.
 *
 * If there, the interaction between virtio and DMA API is messy.
 *
 * On most systems with virtio, physical addresses match bus addresses,
 * and it doesn't particularly matter whether we use the DMA API.
 *
 * On some systems, including Xen and any system with a physical device
 * that speaks virtio behind a physical IOMMU, we must use the DMA API
 * for virtio DMA to work at all.
 *
 * On other systems, including SPARC and PPC64, virtio-pci devices are
 * enumerated as though they are behind an IOMMU, but the virtio host
 * ignores the IOMMU, so we must either pretend that the IOMMU isn't
 * there or somehow map everything as the identity.
 *
 * For the time being, we preserve historic behavior and bypass the DMA
 * API.
 *
 * TODO: install a per-device DMA ops structure that does the right thing
 * taking into account all the above quirks, and use the DMA API
 * unconditionally on data path.
 */

static bool vring_use_dma_api(const struct virtio_device *vdev)
{
        /*
         * A device without the DMA quirk always goes through the DMA API.
         *
         * With the quirk we are left to guess.  In theory, it's possible
         * to have a buggy QEMU-supposed emulated Q35 IOMMU and Xen enabled
         * at the same time.  On such a configuration, virtio has never
         * worked and will not work without an even larger kludge.
         * Instead, enable the DMA API if we're a Xen guest, which at least
         * allows all of the sensible Xen configurations to work correctly.
         */
        return !virtio_has_dma_quirk(vdev) || xen_domain();
}

/*
 * Report the largest single DMA mapping usable for @vdev.
 *
 * Returns SIZE_MAX when the DMA API is bypassed for this device; otherwise
 * the limit reported by dma_max_mapping_size() for the parent device.
 */
size_t virtio_max_dma_size(const struct virtio_device *vdev)
{
        if (!vring_use_dma_api(vdev))
                return SIZE_MAX;

        return dma_max_mapping_size(vdev->dev.parent);
}
EXPORT_SYMBOL_GPL(virtio_max_dma_size);

/*
 * Allocate @size bytes of ring memory for @vdev and store the address the
 * device should use in @dma_handle.
 *
 * Uses dma_alloc_coherent() on @dma_dev when the DMA API is in use, and
 * page-exact allocation with a physical address otherwise.  Returns the
 * CPU address of the memory, or NULL on failure.
 */
static void *vring_alloc_queue(struct virtio_device *vdev, size_t size,
                               dma_addr_t *dma_handle, gfp_t flag,
                               struct device *dma_dev)
{
        phys_addr_t phys_addr;
        void *queue;

        if (vring_use_dma_api(vdev))
                return dma_alloc_coherent(dma_dev, size, dma_handle, flag);

        queue = alloc_pages_exact(PAGE_ALIGN(size), flag);
        if (!queue)
                return NULL;

        phys_addr = virt_to_phys(queue);
        *dma_handle = (dma_addr_t)phys_addr;

        /*
         * Sanity check: make sure we didn't truncate the address.  The
         * only arches I can find that have 64-bit phys_addr_t but 32-bit
         * dma_addr_t are certain non-highmem MIPS and x86 configurations,
         * but these configurations should never allocate physical pages
         * above 32 bits, so this is fine.  Just in case, throw a warning
         * and abort if we end up with an unrepresentable address.
         */
        if (WARN_ON_ONCE(*dma_handle != phys_addr)) {
                free_pages_exact(queue, PAGE_ALIGN(size));
                return NULL;
        }

        return queue;
}

/*
 * Release ring memory obtained from vring_alloc_queue(), using the
 * release routine that matches the allocation path chosen for @vdev.
 */
static void vring_free_queue(struct virtio_device *vdev, size_t size,
                             void *queue, dma_addr_t dma_handle,
                             struct device *dma_dev)
{
        if (!vring_use_dma_api(vdev)) {
                free_pages_exact(queue, PAGE_ALIGN(size));
                return;
        }

        dma_free_coherent(dma_dev, size, queue, dma_handle);
}

/*
 * Return the device to use for DMA operations on @vq.
 *
 * The DMA ops on various arches are rather gnarly right now, and making
 * all of the arch DMA ops work on the vring device itself is a mess, so
 * the device stored in the virtqueue is used instead.
 */
static struct device *vring_dma_dev(const struct vring_virtqueue *vq)
{
        struct device *dev = vq->dma_dev;

        return dev;
}

/*
 * Map one scatterlist entry and store the resulting address in @addr.
 *
 * Premapped queues use the DMA address already present in @sg.  Without
 * the DMA API the physical address of the entry is used.  Otherwise the
 * page is mapped through the DMA API.
 *
 * Returns 0 on success or -ENOMEM if the DMA mapping failed.
 */
static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg,
                            enum dma_data_direction direction, dma_addr_t *addr)
{
        struct device *dev;

        if (vq->premapped) {
                *addr = sg_dma_address(sg);
                return 0;
        }

        if (!vq->use_dma_api) {
                /*
                 * If DMA is not used, KMSAN doesn't know that the scatterlist
                 * is initialized by the hardware. Explicitly check/unpoison it
                 * depending on the direction.
                 */
                kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction);
                *addr = (dma_addr_t)sg_phys(sg);
                return 0;
        }

        /*
         * dma_map_sg is not usable here: we don't use scatterlists in the
         * way it expects (we don't guarantee that the scatterlist will
         * exist for the lifetime of the mapping).
         */
        dev = vring_dma_dev(vq);
        *addr = dma_map_page(dev, sg_page(sg), sg->offset, sg->length,
                             direction);

        return dma_mapping_error(dev, *addr) ? -ENOMEM : 0;
}

/*
 * Map @size bytes at @cpu_addr for the device.  Returns a DMA API mapping
 * when the DMA API is in use, and the plain physical address otherwise.
 */
static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
                                   void *cpu_addr, size_t size,
                                   enum dma_data_direction direction)
{
        if (vq->use_dma_api)
                return dma_map_single(vring_dma_dev(vq),
                                      cpu_addr, size, direction);

        return (dma_addr_t)virt_to_phys(cpu_addr);
}

/*
 * Check whether @addr denotes a failed mapping.  Without the DMA API no
 * mapping was performed, so this always reports success (0).
 */
static int vring_mapping_error(const struct vring_virtqueue *vq,
                               dma_addr_t addr)
{
        if (vq->use_dma_api)
                return dma_mapping_error(vring_dma_dev(vq), addr);

        return 0;
}

/*
 * Reset the ring-independent bookkeeping of @vq for a ring that holds
 * @num descriptors.
 */
static void virtqueue_init(struct vring_virtqueue *vq, u32 num)
{
#ifdef DEBUG
        vq->in_use = 0;
        vq->last_add_time_valid = false;
#endif

        vq->num_added = 0;
        vq->event_triggered = false;

        /* A packed ring starts at index 0 with the wrap counter bit set. */
        vq->last_used_idx = vq->packed_ring ?
                            (1 << VRING_PACKED_EVENT_F_WRAP_CTR) : 0;

        vq->vq.num_free = num;
}


/*
 * Split ring specific functions - *_split().
 */

static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq,
                                           const struct vring_desc *desc)
{
        u16 flags;

        if (!vq->do_unmap)
                return;

        flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);

        dma_unmap_page(vring_dma_dev(vq),
                       virtio64_to_cpu(vq->vq.vdev, desc->addr),
                       virtio32_to_cpu(vq->vq.vdev, desc->len),
                       (flags & VRING_DESC_F_WRITE) ?
                       DMA_FROM_DEVICE : DMA_TO_DEVICE);
}

static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
                                          unsigned int i)
{
        struct vring_desc_extra *extra = vq->split.desc_extra;
        u16 flags;

        flags = extra[i].flags;

        if (flags & VRING_DESC_F_INDIRECT) {
                if (!vq->use_dma_api)
                        goto out;

                dma_unmap_single(vring_dma_dev(vq),
                                 extra[i].addr,
                                 extra[i].len,
                                 (flags & VRING_DESC_F_WRITE) ?
                                 DMA_FROM_DEVICE : DMA_TO_DEVICE);
        } else {
                if (!vq->do_unmap)
                        goto out;

                dma_unmap_page(vring_dma_dev(vq),
                               extra[i].addr,
                               extra[i].len,
                               (flags & VRING_DESC_F_WRITE) ?
                               DMA_FROM_DEVICE : DMA_TO_DEVICE);
        }

out:
        return extra[i].next;
}

/*
 * Allocate an indirect descriptor table with @total_sg entries.
 *
 * Every entry's next field is pre-linked to the following index.
 * Returns NULL if the allocation fails.
 */
static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
                                               unsigned int total_sg,
                                               gfp_t gfp)
{
        struct vring_desc *table;
        unsigned int idx;

        /*
         * The descriptors must live in lowmem: otherwise virt_to_phys
         * would hand us bogus addresses for the virtqueue.
         */
        gfp &= ~__GFP_HIGHMEM;

        table = kmalloc_array(total_sg, sizeof(*table), gfp);
        if (!table)
                return NULL;

        for (idx = 0; idx < total_sg; idx++)
                table[idx].next = cpu_to_virtio16(_vq->vdev, idx + 1);

        return table;
}

/*
 * Fill descriptor @i of table @desc with @addr, @len and @flags.
 *
 * For a ring descriptor (!@indirect) the next index comes from desc_extra,
 * is written into the descriptor, and the values are mirrored into
 * desc_extra.  For an indirect table entry the next index is read back
 * from the entry itself and desc_extra is left untouched.
 *
 * Returns the index of the next descriptor to fill.
 */
static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
                                                    struct vring_desc *desc,
                                                    unsigned int i,
                                                    dma_addr_t addr,
                                                    unsigned int len,
                                                    u16 flags,
                                                    bool indirect)
{
        struct vring_virtqueue *vring = to_vvq(vq);
        struct vring_desc_extra *extra = vring->split.desc_extra;
        struct vring_desc *slot = &desc[i];
        u16 next;

        slot->flags = cpu_to_virtio16(vq->vdev, flags);
        slot->addr = cpu_to_virtio64(vq->vdev, addr);
        slot->len = cpu_to_virtio32(vq->vdev, len);

        if (indirect)
                return virtio16_to_cpu(vq->vdev, slot->next);

        next = extra[i].next;
        slot->next = cpu_to_virtio16(vq->vdev, next);

        extra[i].addr = addr;
        extra[i].len = len;
        extra[i].flags = flags;

        return next;
}

/*
 * Add a buffer, described by the scatterlists in @sgs, to a split ring.
 *
 * @_vq:      the virtqueue to add to
 * @sgs:      array of scatterlists; the first @out_sgs are mapped
 *            DMA_TO_DEVICE, the following @in_sgs DMA_FROM_DEVICE
 * @total_sg: number of descriptors the buffer needs; must be non-zero
 * @out_sgs:  number of device-readable scatterlists
 * @in_sgs:   number of device-writable scatterlists
 * @data:     token stored for the head descriptor; must be non-NULL
 * @ctx:      context pointer stored in place of the indirect table when
 *            the buffer is not indirect; must be NULL if the queue
 *            supports indirect descriptors
 * @gfp:      allocation flags for the indirect table
 *
 * Returns 0 on success, -EIO if the queue is marked broken, -ENOSPC if
 * there are not enough free descriptors, or -ENOMEM if a DMA mapping
 * fails.
 */
static inline int virtqueue_add_split(struct virtqueue *_vq,
                                      struct scatterlist *sgs[],
                                      unsigned int total_sg,
                                      unsigned int out_sgs,
                                      unsigned int in_sgs,
                                      void *data,
                                      void *ctx,
                                      gfp_t gfp)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        struct scatterlist *sg;
        struct vring_desc *desc;
        unsigned int i, n, avail, descs_used, prev, err_idx;
        int head;
        bool indirect;

        START_USE(vq);

        BUG_ON(data == NULL);
        BUG_ON(ctx && vq->indirect);

        if (unlikely(vq->broken)) {
                END_USE(vq);
                return -EIO;
        }

        LAST_ADD_TIME_UPDATE(vq);

        BUG_ON(total_sg == 0);

        head = vq->free_head;

        /*
         * Try an indirect table first.  If its allocation fails, desc is
         * NULL and the direct-descriptor path below is taken instead.
         */
        if (virtqueue_use_indirect(vq, total_sg))
                desc = alloc_indirect_split(_vq, total_sg, gfp);
        else {
                desc = NULL;
                WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect);
        }

        if (desc) {
                /* Use a single buffer which doesn't continue */
                indirect = true;
                /* Set up rest to use this indirect table. */
                i = 0;
                descs_used = 1;
        } else {
                indirect = false;
                desc = vq->split.vring.desc;
                i = head;
                descs_used = total_sg;
        }

        if (unlikely(vq->vq.num_free < descs_used)) {
                pr_debug("Can't add buf len %i - avail = %i\n",
                         descs_used, vq->vq.num_free);
                /* FIXME: for historical reasons, we force a notify here if
                 * there are outgoing parts to the buffer.  Presumably the
                 * host should service the ring ASAP. */
                if (out_sgs)
                        vq->notify(&vq->vq);
                if (indirect)
                        kfree(desc);
                END_USE(vq);
                return -ENOSPC;
        }

        /* Device-readable entries first ... */
        for (n = 0; n < out_sgs; n++) {
                for (sg = sgs[n]; sg; sg = sg_next(sg)) {
                        dma_addr_t addr;

                        if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr))
                                goto unmap_release;

                        prev = i;
                        /* Note that we trust indirect descriptor
                         * table since it use stream DMA mapping.
                         */
                        i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length,
                                                     VRING_DESC_F_NEXT,
                                                     indirect);
                }
        }
        /* ... then device-writable entries. */
        for (; n < (out_sgs + in_sgs); n++) {
                for (sg = sgs[n]; sg; sg = sg_next(sg)) {
                        dma_addr_t addr;

                        if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr))
                                goto unmap_release;

                        prev = i;
                        /* Note that we trust indirect descriptor
                         * table since it use stream DMA mapping.
                         */
                        i = virtqueue_add_desc_split(_vq, desc, i, addr,
                                                     sg->length,
                                                     VRING_DESC_F_NEXT |
                                                     VRING_DESC_F_WRITE,
                                                     indirect);
                }
        }
        /* Last one doesn't continue. */
        desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
        if (!indirect && vq->do_unmap)
                vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &=
                        ~VRING_DESC_F_NEXT;

        if (indirect) {
                /* Now that the indirect table is filled in, map it. */
                dma_addr_t addr = vring_map_single(
                        vq, desc, total_sg * sizeof(struct vring_desc),
                        DMA_TO_DEVICE);
                if (vring_mapping_error(vq, addr)) {
                        /*
                         * Premapped buffers were not mapped by us, so
                         * there is nothing to unmap: just free the table.
                         */
                        if (vq->premapped)
                                goto free_indirect;

                        goto unmap_release;
                }

                virtqueue_add_desc_split(_vq, vq->split.vring.desc,
                                         head, addr,
                                         total_sg * sizeof(struct vring_desc),
                                         VRING_DESC_F_INDIRECT,
                                         false);
        }

        /* We're using some buffers from the free list. */
        vq->vq.num_free -= descs_used;

        /* Update free pointer */
        if (indirect)
                vq->free_head = vq->split.desc_extra[head].next;
        else
                vq->free_head = i;

        /* Store token and indirect buffer state. */
        vq->split.desc_state[head].data = data;
        if (indirect)
                vq->split.desc_state[head].indir_desc = desc;
        else
                vq->split.desc_state[head].indir_desc = ctx;

        /* Put entry in available array (but don't update avail->idx until they
         * do sync). */
        avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
        vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);

        /* Descriptors and available array need to be set before we expose the
         * new available array entries. */
        virtio_wmb(vq->weak_barriers);
        vq->split.avail_idx_shadow++;
        vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
                                                vq->split.avail_idx_shadow);
        vq->num_added++;

        pr_debug("Added buffer head %i to %p\n", head, vq);
        END_USE(vq);

        /* This is very unlikely, but theoretically possible.  Kick
         * just in case. */
        if (unlikely(vq->num_added == (1 << 16) - 1))
                virtqueue_kick(_vq);

        return 0;

unmap_release:
        /*
         * Undo the mappings made so far: walk the chain from its start and
         * stop at the descriptor index that had been reached when the
         * failure occurred.
         */
        err_idx = i;

        if (indirect)
                i = 0;
        else
                i = head;

        for (n = 0; n < total_sg; n++) {
                if (i == err_idx)
                        break;
                if (indirect) {
                        vring_unmap_one_split_indirect(vq, &desc[i]);
                        i = virtio16_to_cpu(_vq->vdev, desc[i].next);
                } else
                        i = vring_unmap_one_split(vq, i);
        }

free_indirect:
        if (indirect)
                kfree(desc);

        END_USE(vq);
        return -ENOMEM;
}

/*
 * Decide whether the other side must be notified about the buffers added
 * since the previous call, and reset the count of added buffers.
 *
 * With event index support the decision is made by vring_need_event()
 * from the avail event value and the old/new avail indices; otherwise it
 * depends on the VRING_USED_F_NO_NOTIFY flag in the used ring.
 *
 * Returns true if a kick is needed.
 */
static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 new, old;
        bool needs_kick;

        START_USE(vq);
        /* We need to expose available array entries before checking avail
         * event. */
        virtio_mb(vq->weak_barriers);

        /* u16 arithmetic: old is the avail index before the recent adds. */
        old = vq->split.avail_idx_shadow - vq->num_added;
        new = vq->split.avail_idx_shadow;
        vq->num_added = 0;

        LAST_ADD_TIME_CHECK(vq);
        LAST_ADD_TIME_INVALID(vq);

        if (vq->event) {
                needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev,
                                        vring_avail_event(&vq->split.vring)),
                                              new, old);
        } else {
                needs_kick = !(vq->split.vring.used->flags &
                                        cpu_to_virtio16(_vq->vdev,
                                                VRING_USED_F_NO_NOTIFY));
        }
        END_USE(vq);
        return needs_kick;
}

/*
 * Return the descriptor chain starting at @head to the free list.
 *
 * Clears the stored token, unmaps every descriptor of the chain, links the
 * chain in front of the current free list and updates num_free.  If the
 * queue supports indirect descriptors and an indirect table is recorded
 * for @head, its entries are unmapped (when do_unmap is set) and the table
 * is freed.  Otherwise, if @ctx is non-NULL, the context pointer stored
 * for @head is returned through it.
 */
static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
                             void **ctx)
{
        unsigned int i, j;
        __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);

        /* Clear data ptr. */
        vq->split.desc_state[head].data = NULL;

        /* Put back on free list: unmap first-level descriptors and find end */
        i = head;

        while (vq->split.vring.desc[i].flags & nextflag) {
                vring_unmap_one_split(vq, i);
                i = vq->split.desc_extra[i].next;
                vq->vq.num_free++;
        }

        /* i is now the last descriptor of the chain. */
        vring_unmap_one_split(vq, i);
        vq->split.desc_extra[i].next = vq->free_head;
        vq->free_head = head;

        /* Plus final descriptor */
        vq->vq.num_free++;

        if (vq->indirect) {
                struct vring_desc *indir_desc =
                                vq->split.desc_state[head].indir_desc;
                u32 len;

                /* Free the indirect table, if any, now that it's unmapped. */
                if (!indir_desc)
                        return;

                /* Size in bytes of the indirect table, as saved at add time. */
                len = vq->split.desc_extra[head].len;

                BUG_ON(!(vq->split.desc_extra[head].flags &
                                VRING_DESC_F_INDIRECT));
                BUG_ON(len == 0 || len % sizeof(struct vring_desc));

                if (vq->do_unmap) {
                        for (j = 0; j < len / sizeof(struct vring_desc); j++)
                                vring_unmap_one_split_indirect(vq, &indir_desc[j]);
                }

                kfree(indir_desc);
                vq->split.desc_state[head].indir_desc = NULL;
        } else if (ctx) {
                *ctx = vq->split.desc_state[head].indir_desc;
        }
}

static bool more_used_split(const struct vring_virtqueue *vq)
{
        return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev,
                        vq->split.vring.used->idx);
}

/*
 * Fetch the next used buffer from a split ring.
 *
 * @_vq: the virtqueue
 * @len: receives the length reported in the used ring entry; it is written
 *       before the entry's id is validated
 * @ctx: passed on to detach_buf_split() to receive the stored context
 *
 * Returns the token that was stored when the buffer was added, or NULL if
 * the queue is broken, no used buffer is pending, or the used entry is
 * invalid (in which case BAD_RING() is invoked).
 */
static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
                                         unsigned int *len,
                                         void **ctx)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        void *ret;
        unsigned int i;
        u16 last_used;

        START_USE(vq);

        if (unlikely(vq->broken)) {
                END_USE(vq);
                return NULL;
        }

        if (!more_used_split(vq)) {
                pr_debug("No more buffers in queue\n");
                END_USE(vq);
                return NULL;
        }

        /* Only get used array entries after they have been exposed by host. */
        virtio_rmb(vq->weak_barriers);

        last_used = (vq->last_used_idx & (vq->split.vring.num - 1));
        i = virtio32_to_cpu(_vq->vdev,
                        vq->split.vring.used->ring[last_used].id);
        *len = virtio32_to_cpu(_vq->vdev,
                        vq->split.vring.used->ring[last_used].len);

        /*
         * The two error returns below skip END_USE(): with DEBUG, BAD_RING()
         * ends in BUG(); without DEBUG, END_USE() expands to nothing.
         */
        if (unlikely(i >= vq->split.vring.num)) {
                BAD_RING(vq, "id %u out of range\n", i);
                return NULL;
        }
        if (unlikely(!vq->split.desc_state[i].data)) {
                BAD_RING(vq, "id %u is not a head!\n", i);
                return NULL;
        }

        /* detach_buf_split clears data, so grab it now. */
        ret = vq->split.desc_state[i].data;
        detach_buf_split(vq, i, ctx);
        vq->last_used_idx++;
        /* If we expect an interrupt for the next entry, tell host
         * by writing event index and flush out the write before
         * the read in the next get_buf call. */
        if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
                virtio_store_mb(vq->weak_barriers,
                                &vring_used_event(&vq->split.vring),
                                cpu_to_virtio16(_vq->vdev, vq->last_used_idx));

        LAST_ADD_TIME_INVALID(vq);

        END_USE(vq);
        return ret;
}

static void virtqueue_disable_cb_split(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
                vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;

                /*
                 * If device triggered an event already it won't trigger one again:
                 * no need to disable.
                 */
                if (vq->event_triggered)
                        return;

                if (vq->event)
                        /* TODO: this is a hack. Figure out a cleaner value to write. */
                        vring_used_event(&vq->split.vring) = 0x0;
                else
                        vq->split.vring.avail->flags =
                                cpu_to_virtio16(_vq->vdev,
                                                vq->split.avail_flags_shadow);
        }
}

static unsigned int virtqueue_enable_cb_prepare_split(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 last_used_idx;

        START_USE(vq);

        /* We optimistically turn back on interrupts, then check if there was
         * more to do. */
        /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
         * either clear the flags bit or point the event index at the next
         * entry. Always do both to keep code simple. */
        if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
                vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
                if (!vq->event)
                        vq->split.vring.avail->flags =
                                cpu_to_virtio16(_vq->vdev,
                                                vq->split.avail_flags_shadow);
        }
        vring_used_event(&vq->split.vring) = cpu_to_virtio16(_vq->vdev,
                        last_used_idx = vq->last_used_idx);
        END_USE(vq);
        return last_used_idx;
}

static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned int last_used_idx)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev,
                        vq->split.vring.used->idx);
}

/*
 * Re-enable used-buffer notifications on a split virtqueue, asking for the
 * notification to be deferred until roughly three quarters of the currently
 * outstanding buffers have been used.
 *
 * Returns false when, after publishing the new event index, the ring's used
 * index is already more than the computed threshold ahead of last_used_idx;
 * returns true otherwise.
 */
static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 bufs;

        START_USE(vq);

        /* We optimistically turn back on interrupts, then check if there was
         * more to do. */
        /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
         * either clear the flags bit or point the event index at the next
         * entry. Always update the event index to keep code simple. */
        if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
                vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
                if (!vq->event)
                        vq->split.vring.avail->flags =
                                cpu_to_virtio16(_vq->vdev,
                                                vq->split.avail_flags_shadow);
        }
        /* TODO: tune this threshold */
        /* 3/4 of the buffers made available but not yet consumed here. */
        bufs = (u16)(vq->split.avail_idx_shadow - vq->last_used_idx) * 3 / 4;

        /* Publish the deferred event index; store followed by a barrier. */
        virtio_store_mb(vq->weak_barriers,
                        &vring_used_event(&vq->split.vring),
                        cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));

        /*
         * Re-check after the store: if the used index has already moved
         * past the threshold, tell the caller via a false return.
         */
        if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->split.vring.used->idx)
                                        - vq->last_used_idx) > bufs)) {
                END_USE(vq);
                return false;
        }

        END_USE(vq);
        return true;
}

/*
 * Detach and return the token of the first descriptor-state slot that still
 * holds one, rewinding the avail index by one for it.
 *
 * Returns NULL once no slot holds a token; at that point every descriptor
 * is expected to be free.
 */
static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        unsigned int slot;
        void *token;

        START_USE(vq);

        for (slot = 0; slot < vq->split.vring.num; slot++) {
                /* detach_buf_split clears data, so grab it now. */
                token = vq->split.desc_state[slot].data;
                if (!token)
                        continue;

                detach_buf_split(vq, slot, NULL);

                /* Take the buffer back out of the avail ring accounting. */
                vq->split.avail_idx_shadow--;
                vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev,
                                vq->split.avail_idx_shadow);

                END_USE(vq);
                return token;
        }

        /* That should have freed everything. */
        BUG_ON(vq->vq.num_free != vq->split.vring.num);

        END_USE(vq);
        return NULL;
}

/*
 * Reset the shadow avail flags/index of @vring_split and, when @vq has no
 * callback, mark notifications as unwanted.
 */
static void virtqueue_vring_init_split(struct vring_virtqueue_split *vring_split,
                                       struct vring_virtqueue *vq)
{
        vring_split->avail_flags_shadow = 0;
        vring_split->avail_idx_shadow = 0;

        /* A callback is installed: leave notifications enabled. */
        if (vq->vq.callback)
                return;

        /* No callback?  Tell other side not to bother us. */
        vring_split->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;

        /* With vq->event set the ring flags are left untouched. */
        if (vq->event)
                return;

        vring_split->vring.avail->flags = cpu_to_virtio16(vq->vq.vdev,
                                        vring_split->avail_flags_shadow);
}

/*
 * Return an existing split ring to its initial state: zero the flags, the
 * indices and the event words of both the avail and used areas, then redo
 * the generic and split-specific virtqueue initialisation.
 */
static void virtqueue_reinit_split(struct vring_virtqueue *vq)
{
        int num = vq->split.vring.num;

        /* Avail area, including the event word stored past its last entry. */
        vq->split.vring.avail->flags = 0;
        vq->split.vring.avail->idx = 0;
        vq->split.vring.avail->ring[num] = 0;

        /* Used area, including the event word stored past its last entry. */
        vq->split.vring.used->flags = 0;
        vq->split.vring.used->idx = 0;
        *(__virtio16 *)&(vq->split.vring.used->ring[num]) = 0;

        virtqueue_init(vq, num);
        virtqueue_vring_init_split(&vq->split, vq);
}

/*
 * Install @vring_split as the split-ring state of @vq (copied by value) and
 * restart the free list at descriptor 0.
 */
static void virtqueue_vring_attach_split(struct vring_virtqueue *vq,
                                         struct vring_virtqueue_split *vring_split)
{
        /* Put everything in free lists. */
        vq->free_head = 0;

        vq->split = *vring_split;
}

/*
 * Allocate the per-descriptor state and extra arrays for a split ring of
 * vring_split->vring.num entries and store them in @vring_split.
 *
 * The state array is zeroed.  Returns 0 on success or -ENOMEM when either
 * allocation fails, in which case nothing is stored and nothing is leaked.
 */
static int vring_alloc_state_extra_split(struct vring_virtqueue_split *vring_split)
{
        u32 num = vring_split->vring.num;
        struct vring_desc_state_split *state;
        struct vring_desc_extra *extra;

        state = kmalloc_array(num, sizeof(*state), GFP_KERNEL);
        if (!state)
                return -ENOMEM;

        extra = vring_alloc_desc_extra(num);
        if (!extra) {
                kfree(state);
                return -ENOMEM;
        }

        memset(state, 0, num * sizeof(*state));

        vring_split->desc_state = state;
        vring_split->desc_extra = extra;

        return 0;
}

/*
 * Release everything hanging off @vring_split: the ring memory itself and
 * the desc_state / desc_extra arrays.
 */
static void vring_free_split(struct vring_virtqueue_split *vring_split,
                             struct virtio_device *vdev, struct device *dma_dev)
{
        /* Ring memory, using the size and DMA address recorded for it. */
        vring_free_queue(vdev,
                         vring_split->queue_size_in_bytes,
                         vring_split->vring.desc,
                         vring_split->queue_dma_addr,
                         dma_dev);

        /* Per-descriptor bookkeeping arrays. */
        kfree(vring_split->desc_extra);
        kfree(vring_split->desc_state);
}

/*
 * Allocate ring memory for a split virtqueue and initialise @vring_split
 * with it.
 *
 * @num must be a power of two.  While the ring would be larger than one
 * page, allocation is attempted without warnings and, if @may_reduce_num is
 * set, @num is halved after each failure.  Once the ring fits in a page (or
 * @num was small to begin with) one final allocation is attempted.
 *
 * Returns 0 on success, -EINVAL for a bad @num, -ENOMEM when no memory could
 * be obtained.
 */
static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split,
                                   struct virtio_device *vdev,
                                   u32 num,
                                   unsigned int vring_align,
                                   bool may_reduce_num,
                                   struct device *dma_dev)
{
        dma_addr_t dma_addr;
        void *queue = NULL;

        /* We assume num is a power of 2. */
        if (!is_power_of_2(num)) {
                dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
                return -EINVAL;
        }

        /* TODO: allocate each queue chunk individually */
        while (num && vring_size(num, vring_align) > PAGE_SIZE) {
                queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
                                          &dma_addr,
                                          GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
                                          dma_dev);
                if (queue)
                        break;

                /* Caller insists on the requested size: give up. */
                if (!may_reduce_num)
                        return -ENOMEM;

                num /= 2;
        }

        if (!num)
                return -ENOMEM;

        if (!queue) {
                /* Try to get a single page. You are my only hope! */
                queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
                                          &dma_addr, GFP_KERNEL | __GFP_ZERO,
                                          dma_dev);
                if (!queue)
                        return -ENOMEM;
        }

        vring_init(&vring_split->vring, num, queue, vring_align);

        /* Remember what is needed to free or re-create the ring later. */
        vring_split->queue_dma_addr = dma_addr;
        vring_split->queue_size_in_bytes = vring_size(num, vring_align);
        vring_split->vring_align = vring_align;
        vring_split->may_reduce_num = may_reduce_num;

        return 0;
}

/*
 * Allocate ring memory and build a split virtqueue on top of it.
 *
 * On success the returned queue is flagged as owning its ring.  Returns
 * NULL if either the ring allocation or the virtqueue construction fails;
 * in the latter case the freshly allocated ring is released again.
 */
static struct virtqueue *vring_create_virtqueue_split(
        unsigned int index,
        unsigned int num,
        unsigned int vring_align,
        struct virtio_device *vdev,
        bool weak_barriers,
        bool may_reduce_num,
        bool context,
        bool (*notify)(struct virtqueue *),
        void (*callback)(struct virtqueue *),
        const char *name,
        struct device *dma_dev)
{
        struct vring_virtqueue_split vring_split = {};
        struct virtqueue *vq;

        if (vring_alloc_queue_split(&vring_split, vdev, num, vring_align,
                                    may_reduce_num, dma_dev))
                return NULL;

        vq = __vring_new_virtqueue(index, &vring_split, vdev, weak_barriers,
                                   context, notify, callback, name, dma_dev);
        if (vq) {
                to_vvq(vq)->we_own_ring = true;
                return vq;
        }

        /* Construction failed: undo the ring allocation. */
        vring_free_split(&vring_split, vdev, dma_dev);
        return NULL;
}

/*
 * Replace the split ring of @_vq with a newly allocated one of @num entries.
 *
 * The new ring and its state arrays are fully allocated before the old ring
 * is freed.  On any allocation failure the existing ring is re-initialised
 * in place and -ENOMEM is returned (whatever the underlying error code was).
 * Returns 0 on success.
 */
static int virtqueue_resize_split(struct virtqueue *_vq, u32 num)
{
        struct vring_virtqueue_split vring_split = {};
        struct vring_virtqueue *vq = to_vvq(_vq);
        struct virtio_device *vdev = _vq->vdev;

        if (vring_alloc_queue_split(&vring_split, vdev, num,
                                    vq->split.vring_align,
                                    vq->split.may_reduce_num,
                                    vring_dma_dev(vq))) {
                virtqueue_reinit_split(vq);
                return -ENOMEM;
        }

        if (vring_alloc_state_extra_split(&vring_split)) {
                /* Drop the half-built replacement, keep the old ring. */
                vring_free_split(&vring_split, vdev, vring_dma_dev(vq));
                virtqueue_reinit_split(vq);
                return -ENOMEM;
        }

        /* Everything is in hand: retire the old ring and switch over. */
        vring_free(&vq->vq);

        virtqueue_vring_init_split(&vring_split, vq);

        virtqueue_init(vq, vring_split.vring.num);
        virtqueue_vring_attach_split(vq, &vring_split);

        return 0;
}


/*
 * Packed ring specific functions - *_packed().
 */
/* Extract the wrap-counter bit from a combined last_used_idx value. */
static bool packed_used_wrap_counter(u16 last_used_idx)
{
        return (last_used_idx >> VRING_PACKED_EVENT_F_WRAP_CTR) & 1;
}

/*
 * Extract the index part of a combined last_used_idx value, i.e. every bit
 * below the wrap-counter bit.
 */
static u16 packed_last_used(u16 last_used_idx)
{
        return last_used_idx & ((1 << VRING_PACKED_EVENT_F_WRAP_CTR) - 1);
}

/*
 * Undo the DMA mapping recorded in one desc_extra entry.
 *
 * Entries flagged INDIRECT are unmapped with dma_unmap_single() whenever
 * the DMA API is in use; all other entries are unmapped with
 * dma_unmap_page() only when vq->do_unmap is set.  The direction follows
 * the WRITE flag of the entry.
 */
static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
                                     const struct vring_desc_extra *extra)
{
        u16 flags = extra->flags;

        if (flags & VRING_DESC_F_INDIRECT) {
                if (vq->use_dma_api)
                        dma_unmap_single(vring_dma_dev(vq),
                                         extra->addr, extra->len,
                                         (flags & VRING_DESC_F_WRITE) ?
                                         DMA_FROM_DEVICE : DMA_TO_DEVICE);
                return;
        }

        if (!vq->do_unmap)
                return;

        dma_unmap_page(vring_dma_dev(vq),
                       extra->addr, extra->len,
                       (flags & VRING_DESC_F_WRITE) ?
                       DMA_FROM_DEVICE : DMA_TO_DEVICE);
}

/*
 * Undo the DMA page mapping described by a packed descriptor, reading the
 * address, length and flags from the (little-endian) descriptor itself.
 * Does nothing unless vq->do_unmap is set.
 */
static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
                                    const struct vring_packed_desc *desc)
{
        if (vq->do_unmap) {
                u16 flags = le16_to_cpu(desc->flags);

                dma_unmap_page(vring_dma_dev(vq),
                               le64_to_cpu(desc->addr),
                               le32_to_cpu(desc->len),
                               (flags & VRING_DESC_F_WRITE) ?
                               DMA_FROM_DEVICE : DMA_TO_DEVICE);
        }
}

/*
 * Allocate an (uninitialised) indirect descriptor table with @total_sg
 * entries.  Returns NULL on allocation failure.
 */
static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg,
                                                       gfp_t gfp)
{
        /*
         * We require lowmem mappings for the descriptors because
         * otherwise virt_to_phys will give us bogus addresses in the
         * virtqueue.
         */
        return kmalloc_array(total_sg, sizeof(struct vring_packed_desc),
                             gfp & ~__GFP_HIGHMEM);
}

/*
 * Add a buffer to a packed virtqueue through an indirect descriptor table.
 *
 * @vq:       the virtqueue
 * @sgs:      array of scatterlists; the first @out_sgs are mapped
 *            DMA_TO_DEVICE, the following @in_sgs DMA_FROM_DEVICE
 * @total_sg: number of entries allocated for the indirect table
 * @data:     token stored for the buffer
 * @gfp:      allocation flags for the indirect table
 *
 * A single ring descriptor is consumed; it points at a freshly allocated
 * table holding one entry per scatterlist element.
 *
 * Returns 0 on success, -ENOSPC if no ring descriptor is free, -ENOMEM if
 * the table cannot be allocated or a mapping fails.
 *
 * This helper deliberately does not call END_USE(): it runs inside the
 * START_USE()/END_USE() section of virtqueue_add_packed(), which calls
 * END_USE() itself for every result other than -ENOMEM and otherwise goes
 * on using the queue for the direct fallback.  Ending the section here as
 * well would unbalance that pairing.
 */
static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
                                         struct scatterlist *sgs[],
                                         unsigned int total_sg,
                                         unsigned int out_sgs,
                                         unsigned int in_sgs,
                                         void *data,
                                         gfp_t gfp)
{
        struct vring_packed_desc *desc;
        struct scatterlist *sg;
        unsigned int i, n, err_idx;
        u16 head, id;
        dma_addr_t addr;

        head = vq->packed.next_avail_idx;
        desc = alloc_indirect_packed(total_sg, gfp);
        if (!desc)
                return -ENOMEM;

        if (unlikely(vq->vq.num_free < 1)) {
                pr_debug("Can't add buf len 1 - avail = 0\n");
                kfree(desc);
                return -ENOSPC;
        }

        i = 0;
        id = vq->free_head;
        BUG_ON(id == vq->packed.vring.num);

        /* Fill the indirect table, one entry per scatterlist element. */
        for (n = 0; n < out_sgs + in_sgs; n++) {
                for (sg = sgs[n]; sg; sg = sg_next(sg)) {
                        if (vring_map_one_sg(vq, sg, n < out_sgs ?
                                             DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr))
                                goto unmap_release;

                        desc[i].flags = cpu_to_le16(n < out_sgs ?
                                                0 : VRING_DESC_F_WRITE);
                        desc[i].addr = cpu_to_le64(addr);
                        desc[i].len = cpu_to_le32(sg->length);
                        i++;
                }
        }

        /* Now that the indirect table is filled in, map it. */
        addr = vring_map_single(vq, desc,
                        total_sg * sizeof(struct vring_packed_desc),
                        DMA_TO_DEVICE);
        if (vring_mapping_error(vq, addr)) {
                /* Premapped buffers were not mapped here: only free. */
                if (vq->premapped)
                        goto free_desc;

                goto unmap_release;
        }

        vq->packed.vring.desc[head].addr = cpu_to_le64(addr);
        vq->packed.vring.desc[head].len = cpu_to_le32(total_sg *
                                sizeof(struct vring_packed_desc));
        vq->packed.vring.desc[head].id = cpu_to_le16(id);

        /* Remember the table mapping so it can be undone on detach. */
        if (vq->use_dma_api) {
                vq->packed.desc_extra[id].addr = addr;
                vq->packed.desc_extra[id].len = total_sg *
                                sizeof(struct vring_packed_desc);
                vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT |
                                                  vq->packed.avail_used_flags;
        }

        /*
         * A driver MUST NOT make the first descriptor in the list
         * available before all subsequent descriptors comprising
         * the list are made available.
         */
        virtio_wmb(vq->weak_barriers);
        vq->packed.vring.desc[head].flags = cpu_to_le16(VRING_DESC_F_INDIRECT |
                                                vq->packed.avail_used_flags);

        /* We're using some buffers from the free list. */
        vq->vq.num_free -= 1;

        /* Update free pointer */
        n = head + 1;
        if (n >= vq->packed.vring.num) {
                /* Ring wrapped: flip the wrap counter and avail/used bits. */
                n = 0;
                vq->packed.avail_wrap_counter ^= 1;
                vq->packed.avail_used_flags ^=
                                1 << VRING_PACKED_DESC_F_AVAIL |
                                1 << VRING_PACKED_DESC_F_USED;
        }
        vq->packed.next_avail_idx = n;
        vq->free_head = vq->packed.desc_extra[id].next;

        /* Store token and indirect buffer state. */
        vq->packed.desc_state[id].num = 1;
        vq->packed.desc_state[id].data = data;
        vq->packed.desc_state[id].indir_desc = desc;
        vq->packed.desc_state[id].last = id;

        vq->num_added += 1;

        pr_debug("Added buffer head %i to %p\n", head, vq);

        return 0;

unmap_release:
        /* Unmap every table entry that was filled in before the failure. */
        err_idx = i;

        for (i = 0; i < err_idx; i++)
                vring_unmap_desc_packed(vq, &desc[i]);

free_desc:
        kfree(desc);

        return -ENOMEM;
}

/*
 * Add a buffer to a packed virtqueue.
 *
 * @_vq:      the virtqueue
 * @sgs:      array of scatterlists; the first @out_sgs are mapped
 *            DMA_TO_DEVICE, the following @in_sgs DMA_FROM_DEVICE and
 *            flagged VRING_DESC_F_WRITE
 * @total_sg: number of descriptors the buffer needs; must be non-zero
 * @data:     token stored for the buffer; must be non-NULL
 * @ctx:      stored alongside the token in the indir_desc slot; must be
 *            NULL when the queue uses indirect descriptors
 * @gfp:      allocation flags, passed on to the indirect path
 *
 * The indirect path is tried first when virtqueue_use_indirect() says so;
 * only an -ENOMEM result from it falls back to direct descriptors.
 *
 * Returns 0 on success, -EIO if the queue is broken or a mapping fails on
 * the direct path, -ENOSPC if there are not enough free descriptors, or the
 * indirect path's result when that is anything other than -ENOMEM.
 */
static inline int virtqueue_add_packed(struct virtqueue *_vq,
                                       struct scatterlist *sgs[],
                                       unsigned int total_sg,
                                       unsigned int out_sgs,
                                       unsigned int in_sgs,
                                       void *data,
                                       void *ctx,
                                       gfp_t gfp)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        struct vring_packed_desc *desc;
        struct scatterlist *sg;
        unsigned int i, n, c, descs_used, err_idx;
        __le16 head_flags, flags;
        u16 head, id, prev, curr, avail_used_flags;
        int err;

        START_USE(vq);

        BUG_ON(data == NULL);
        BUG_ON(ctx && vq->indirect);

        if (unlikely(vq->broken)) {
                END_USE(vq);
                return -EIO;
        }

        LAST_ADD_TIME_UPDATE(vq);

        BUG_ON(total_sg == 0);

        if (virtqueue_use_indirect(vq, total_sg)) {
                err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs,
                                                    in_sgs, data, gfp);
                if (err != -ENOMEM) {
                        END_USE(vq);
                        return err;
                }

                /* fall back on direct */
        }

        /* Snapshot ring position and flags so a failure can roll back. */
        head = vq->packed.next_avail_idx;
        avail_used_flags = vq->packed.avail_used_flags;

        WARN_ON_ONCE(total_sg > vq->packed.vring.num && !vq->indirect);

        desc = vq->packed.vring.desc;
        i = head;
        descs_used = total_sg;

        if (unlikely(vq->vq.num_free < descs_used)) {
                pr_debug("Can't add buf len %i - avail = %i\n",
                         descs_used, vq->vq.num_free);
                END_USE(vq);
                return -ENOSPC;
        }

        id = vq->free_head;
        BUG_ON(id == vq->packed.vring.num);

        /*
         * i walks ring slots (wrapping at vring.num); curr walks the
         * desc_extra free list starting at id; c counts elements so the
         * last one is written without VRING_DESC_F_NEXT.
         */
        curr = id;
        c = 0;
        for (n = 0; n < out_sgs + in_sgs; n++) {
                for (sg = sgs[n]; sg; sg = sg_next(sg)) {
                        dma_addr_t addr;

                        if (vring_map_one_sg(vq, sg, n < out_sgs ?
                                             DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr))
                                goto unmap_release;

                        flags = cpu_to_le16(vq->packed.avail_used_flags |
                                    (++c == total_sg ? 0 : VRING_DESC_F_NEXT) |
                                    (n < out_sgs ? 0 : VRING_DESC_F_WRITE));
                        /*
                         * The head descriptor's flags are held back and
                         * only written after the barrier below.
                         */
                        if (i == head)
                                head_flags = flags;
                        else
                                desc[i].flags = flags;

                        desc[i].addr = cpu_to_le64(addr);
                        desc[i].len = cpu_to_le32(sg->length);
                        desc[i].id = cpu_to_le16(id);

                        /* Record the mapping for later unmapping. */
                        if (unlikely(vq->use_dma_api)) {
                                vq->packed.desc_extra[curr].addr = addr;
                                vq->packed.desc_extra[curr].len = sg->length;
                                vq->packed.desc_extra[curr].flags =
                                        le16_to_cpu(flags);
                        }
                        prev = curr;
                        curr = vq->packed.desc_extra[curr].next;

                        /* Ring wrapped: flip the avail/used flag bits. */
                        if ((unlikely(++i >= vq->packed.vring.num))) {
                                i = 0;
                                vq->packed.avail_used_flags ^=
                                        1 << VRING_PACKED_DESC_F_AVAIL |
                                        1 << VRING_PACKED_DESC_F_USED;
                        }
                }
        }

        /* Ending at or before the start slot means the ring wrapped. */
        if (i <= head)
                vq->packed.avail_wrap_counter ^= 1;

        /* We're using some buffers from the free list. */
        vq->vq.num_free -= descs_used;

        /* Update free pointer */
        vq->packed.next_avail_idx = i;
        vq->free_head = curr;

        /* Store token. */
        vq->packed.desc_state[id].num = descs_used;
        vq->packed.desc_state[id].data = data;
        vq->packed.desc_state[id].indir_desc = ctx;
        vq->packed.desc_state[id].last = prev;

        /*
         * A driver MUST NOT make the first descriptor in the list
         * available before all subsequent descriptors comprising
         * the list are made available.
         */
        virtio_wmb(vq->weak_barriers);
        vq->packed.vring.desc[head].flags = head_flags;
        vq->num_added += descs_used;

        pr_debug("Added buffer head %i to %p\n", head, vq);
        END_USE(vq);

        return 0;

unmap_release:
        /*
         * A mapping failed at ring slot err_idx.  Restore the flags saved
         * on entry and unmap what was mapped so far, re-walking the ring
         * from head and the free list from free_head (neither was updated
         * yet at this point).
         */
        err_idx = i;
        i = head;
        curr = vq->free_head;

        vq->packed.avail_used_flags = avail_used_flags;

        for (n = 0; n < total_sg; n++) {
                if (i == err_idx)
                        break;
                vring_unmap_extra_packed(vq, &vq->packed.desc_extra[curr]);
                curr = vq->packed.desc_extra[curr].next;
                i++;
                if (i >= vq->packed.vring.num)
                        i = 0;
        }

        END_USE(vq);
        return -EIO;
}

/*
 * Decide whether the device needs to be notified about the buffers added
 * since the previous call, and reset the num_added counter.
 *
 * Returns true when a kick is needed.  If the device's event flags are not
 * VRING_PACKED_EVENT_FLAG_DESC, a kick is needed unless they are
 * VRING_PACKED_EVENT_FLAG_DISABLE; otherwise the decision is made by
 * vring_need_event() on the device's event offset.
 */
static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 new, old, off_wrap, flags, wrap_counter, event_idx;
        bool needs_kick;
        /* Lets off_wrap and flags be read together as one 32-bit load. */
        union {
                struct {
                        __le16 off_wrap;
                        __le16 flags;
                };
                u32 u32;
        } snapshot;

        START_USE(vq);

        /*
         * We need to expose the new flags value before checking notification
         * suppressions.
         */
        virtio_mb(vq->weak_barriers);

        /* [old, new) is the range of avail indices added since last time. */
        old = vq->packed.next_avail_idx - vq->num_added;
        new = vq->packed.next_avail_idx;
        vq->num_added = 0;

        snapshot.u32 = *(u32 *)vq->packed.vring.device;
        flags = le16_to_cpu(snapshot.flags);

        LAST_ADD_TIME_CHECK(vq);
        LAST_ADD_TIME_INVALID(vq);

        if (flags != VRING_PACKED_EVENT_FLAG_DESC) {
                needs_kick = (flags != VRING_PACKED_EVENT_FLAG_DISABLE);
                goto out;
        }

        off_wrap = le16_to_cpu(snapshot.off_wrap);

        /* Split the device's off_wrap into wrap counter and event index. */
        wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR;
        event_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR);
        /* Different wrap counter: shift the index back by one ring size. */
        if (wrap_counter != vq->packed.avail_wrap_counter)
                event_idx -= vq->packed.vring.num;

        needs_kick = vring_need_event(event_idx, new, old);
out:
        END_USE(vq);
        return needs_kick;
}

/*
 * Release the buffer identified by @id on a packed virtqueue.
 *
 * The token is cleared, the buffer's descriptor chain is pushed back onto
 * the free list and num_free is increased accordingly.  When the DMA API is
 * in use, each desc_extra entry of the chain is unmapped.
 *
 * For queues using indirect descriptors, the indirect table (if any) is
 * unmapped entry by entry when vq->do_unmap is set, and then freed.  For
 * other queues the stored context is handed back through @ctx when @ctx is
 * non-NULL.
 */
static void detach_buf_packed(struct vring_virtqueue *vq,
                              unsigned int id, void **ctx)
{
        struct vring_desc_state_packed *state = &vq->packed.desc_state[id];
        struct vring_packed_desc *desc;
        unsigned int i, curr;

        /* Clear data ptr. */
        state->data = NULL;

        /* Splice the chain [id .. state->last] onto the free list head. */
        vq->packed.desc_extra[state->last].next = vq->free_head;
        vq->free_head = id;
        vq->vq.num_free += state->num;

        if (unlikely(vq->use_dma_api)) {
                for (i = 0, curr = id; i < state->num;
                     i++, curr = vq->packed.desc_extra[curr].next)
                        vring_unmap_extra_packed(vq,
                                                 &vq->packed.desc_extra[curr]);
        }

        if (!vq->indirect) {
                if (ctx)
                        *ctx = state->indir_desc;
                return;
        }

        /* Free the indirect table, if any, now that it's unmapped. */
        desc = state->indir_desc;
        if (!desc)
                return;

        if (vq->do_unmap) {
                u32 len = vq->packed.desc_extra[id].len;
                unsigned int nr = len / sizeof(struct vring_packed_desc);

                for (i = 0; i < nr; i++)
                        vring_unmap_desc_packed(vq, &desc[i]);
        }

        kfree(desc);
        state->indir_desc = NULL;
}

/*
 * Check whether the descriptor at ring slot @idx counts as used: its AVAIL
 * and USED flag bits must be equal to each other and to @used_wrap_counter.
 */
static inline bool is_used_desc_packed(const struct vring_virtqueue *vq,
                                       u16 idx, bool used_wrap_counter)
{
        u16 flags = le16_to_cpu(vq->packed.vring.desc[idx].flags);
        bool avail = flags & (1 << VRING_PACKED_DESC_F_AVAIL);
        bool used = flags & (1 << VRING_PACKED_DESC_F_USED);

        if (avail != used)
                return false;

        return used == used_wrap_counter;
}

/*
 * Report whether the descriptor at the current last-used position has been
 * marked used.  last_used_idx is read once and then split into its index
 * and wrap-counter parts.
 */
static bool more_used_packed(const struct vring_virtqueue *vq)
{
        u16 last_used_idx = READ_ONCE(vq->last_used_idx);

        return is_used_desc_packed(vq,
                                   packed_last_used(last_used_idx),
                                   packed_used_wrap_counter(last_used_idx));
}

/*
 * Fetch the next used buffer from a packed virtqueue.
 *
 * @_vq: the virtqueue
 * @len: receives the length field of the used descriptor
 * @ctx: passed on to detach_buf_packed()
 *
 * Returns the token stored for the buffer, or NULL if the queue is broken,
 * nothing has been used, or the used descriptor carries an invalid id.
 */
static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
                                          unsigned int *len,
                                          void **ctx)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 last_used, id, last_used_idx;
        bool used_wrap_counter;
        void *ret;

        START_USE(vq);

        if (unlikely(vq->broken)) {
                END_USE(vq);
                return NULL;
        }

        if (!more_used_packed(vq)) {
                pr_debug("No more buffers in queue\n");
                END_USE(vq);
                return NULL;
        }

        /* Only get used elements after they have been exposed by host. */
        virtio_rmb(vq->weak_barriers);

        /* Split last_used_idx into ring position and wrap counter. */
        last_used_idx = READ_ONCE(vq->last_used_idx);
        used_wrap_counter = packed_used_wrap_counter(last_used_idx);
        last_used = packed_last_used(last_used_idx);
        id = le16_to_cpu(vq->packed.vring.desc[last_used].id);
        *len = le32_to_cpu(vq->packed.vring.desc[last_used].len);

        /* Validate the id read from the ring before indexing with it. */
        if (unlikely(id >= vq->packed.vring.num)) {
                BAD_RING(vq, "id %u out of range\n", id);
                return NULL;
        }
        if (unlikely(!vq->packed.desc_state[id].data)) {
                BAD_RING(vq, "id %u is not a head!\n", id);
                return NULL;
        }

        /* detach_buf_packed clears data, so grab it now. */
        ret = vq->packed.desc_state[id].data;
        detach_buf_packed(vq, id, ctx);

        /* Advance past this buffer's descriptors, wrapping if needed. */
        last_used += vq->packed.desc_state[id].num;
        if (unlikely(last_used >= vq->packed.vring.num)) {
                last_used -= vq->packed.vring.num;
                used_wrap_counter ^= 1;
        }

        /* Recombine position and wrap counter into one value. */
        last_used = (last_used | (used_wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR));
        WRITE_ONCE(vq->last_used_idx, last_used);

        /*
         * If we expect an interrupt for the next entry, tell host
         * by writing event index and flush out the write before
         * the read in the next get_buf call.
         */
        if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC)
                virtio_store_mb(vq->weak_barriers,
                                &vq->packed.vring.driver->off_wrap,
                                cpu_to_le16(vq->last_used_idx));

        LAST_ADD_TIME_INVALID(vq);

        END_USE(vq);
        return ret;
}

/*
 * Suppress used-buffer notifications on a packed virtqueue.
 *
 * The shadow event flags are switched to DISABLE; the driver event area in
 * the ring is only written when the shadow was not already DISABLE and no
 * event has been recorded as triggered.
 */
static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        /* Shadow already says "disabled": nothing left to do. */
        if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DISABLE)
                return;

        vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE;

        /*
         * If device triggered an event already it won't trigger one again:
         * no need to disable.
         */
        if (vq->event_triggered)
                return;

        vq->packed.vring.driver->flags =
                cpu_to_le16(vq->packed.event_flags_shadow);
}

/*
 * Re-enable used-buffer notifications on a packed virtqueue.
 *
 * When vq->event is set, the driver's event offset is pointed at the
 * current last_used_idx.  If the shadow flags were DISABLE they are
 * switched to DESC (vq->event set) or ENABLE and written to the ring.
 *
 * Returns vq->last_used_idx.
 */
static unsigned int virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        START_USE(vq);

        /*
         * We optimistically turn back on interrupts, then check if there was
         * more to do.
         */

        if (vq->event) {
                vq->packed.vring.driver->off_wrap =
                        cpu_to_le16(vq->last_used_idx);
                /*
                 * We need to update event offset and event wrap
                 * counter first before updating event flags.
                 */
                virtio_wmb(vq->weak_barriers);
        }

        /* Only touch the ring flags if notifications were disabled. */
        if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DISABLE) {
                vq->packed.event_flags_shadow = vq->event ?
                                VRING_PACKED_EVENT_FLAG_DESC :
                                VRING_PACKED_EVENT_FLAG_ENABLE;
                vq->packed.vring.driver->flags =
                                cpu_to_le16(vq->packed.event_flags_shadow);
        }

        END_USE(vq);
        return vq->last_used_idx;
}

/*
 * Report whether the descriptor identified by @off_wrap has been used.
 *
 * @off_wrap combines a ring index (bits below the wrap-counter bit) with a
 * wrap counter (the bits from the wrap-counter position upwards).
 */
static bool virtqueue_poll_packed(struct virtqueue *_vq, u16 off_wrap)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        bool wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR;
        u16 used_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR);

        return is_used_desc_packed(vq, used_idx, wrap_counter);
}

/*
 * Re-enable used-buffer notifications on a packed virtqueue, asking (when
 * vq->event is set) for the notification to be deferred until roughly three
 * quarters of the descriptors currently in flight have been used.
 *
 * Returns false if, after the event suppression area has been updated, the
 * descriptor at the current last-used position is already marked used;
 * returns true otherwise.
 */
static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 used_idx, wrap_counter, last_used_idx;
        u16 bufs;

        START_USE(vq);

        /*
         * We optimistically turn back on interrupts, then check if there was
         * more to do.
         */

        if (vq->event) {
                /* TODO: tune this threshold */
                /* 3/4 of the descriptors that are not on the free list. */
                bufs = (vq->packed.vring.num - vq->vq.num_free) * 3 / 4;
                last_used_idx = READ_ONCE(vq->last_used_idx);
                wrap_counter = packed_used_wrap_counter(last_used_idx);

                /* Target position, wrapping around the ring if needed. */
                used_idx = packed_last_used(last_used_idx) + bufs;
                if (used_idx >= vq->packed.vring.num) {
                        used_idx -= vq->packed.vring.num;
                        wrap_counter ^= 1;
                }

                vq->packed.vring.driver->off_wrap = cpu_to_le16(used_idx |
                        (wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR));

                /*
                 * We need to update event offset and event wrap
                 * counter first before updating event flags.
                 */
                virtio_wmb(vq->weak_barriers);
        }

        /* Only touch the ring flags if notifications were disabled. */
        if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DISABLE) {
                vq->packed.event_flags_shadow = vq->event ?
                                VRING_PACKED_EVENT_FLAG_DESC :
                                VRING_PACKED_EVENT_FLAG_ENABLE;
                vq->packed.vring.driver->flags =
                                cpu_to_le16(vq->packed.event_flags_shadow);
        }

        /*
         * We need to update event suppression structure first
         * before re-checking for more used buffers.
         */
        virtio_mb(vq->weak_barriers);

        /* Re-check the current last-used slot, not the deferred target. */
        last_used_idx = READ_ONCE(vq->last_used_idx);
        wrap_counter = packed_used_wrap_counter(last_used_idx);
        used_idx = packed_last_used(last_used_idx);
        if (is_used_desc_packed(vq, used_idx, wrap_counter)) {
                END_USE(vq);
                return false;
        }

        END_USE(vq);
        return true;
}

/*
 * Detach and return the token of the first descriptor-state slot that still
 * holds one.
 *
 * Returns NULL once no slot holds a token; at that point every descriptor
 * is expected to be free.
 */
static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        unsigned int slot;
        void *token;

        START_USE(vq);

        for (slot = 0; slot < vq->packed.vring.num; slot++) {
                /* detach_buf clears data, so grab it now. */
                token = vq->packed.desc_state[slot].data;
                if (!token)
                        continue;

                detach_buf_packed(vq, slot, NULL);
                END_USE(vq);
                return token;
        }

        /* That should have freed everything. */
        BUG_ON(vq->vq.num_free != vq->packed.vring.num);

        END_USE(vq);
        return NULL;
}

/*
 * Allocate and pre-link the per-descriptor "extra" array.
 *
 * @num: number of entries to allocate.
 *
 * The array is zeroed and every entry except the last gets ->next set to
 * the index of its successor; the last entry keeps the zeroed ->next.
 *
 * Returns the new array, or NULL if the allocation fails. The caller owns
 * the memory and releases it with kfree().
 */
static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num)
{
        struct vring_desc_extra *desc_extra;
        unsigned int i;

        desc_extra = kmalloc_array(num, sizeof(struct vring_desc_extra),
                                   GFP_KERNEL);
        if (!desc_extra)
                return NULL;

        memset(desc_extra, 0, num * sizeof(struct vring_desc_extra));

        /*
         * Compare "i + 1 < num" rather than "i < num - 1": num is unsigned,
         * so "num - 1" would wrap to UINT_MAX for num == 0 and the loop
         * would write far beyond the allocation.
         */
        for (i = 0; i + 1 < num; i++)
                desc_extra[i].next = i + 1;

        return desc_extra;
}

/*
 * Release everything a vring_virtqueue_packed may hold.
 *
 * Each of the three ring areas (device event, driver event, descriptor
 * ring) is only handed to vring_free_queue() when its pointer is set, so
 * this can be used on a partially populated structure. The desc_state and
 * desc_extra arrays are passed to kfree() unconditionally.
 */
static void vring_free_packed(struct vring_virtqueue_packed *vring_packed,
                              struct virtio_device *vdev,
                              struct device *dma_dev)
{
        /* Ring areas, released in the reverse of their allocation order. */
        if (vring_packed->vring.device) {
                vring_free_queue(vdev, vring_packed->event_size_in_bytes,
                                 vring_packed->vring.device,
                                 vring_packed->device_event_dma_addr,
                                 dma_dev);
        }

        if (vring_packed->vring.driver) {
                vring_free_queue(vdev, vring_packed->event_size_in_bytes,
                                 vring_packed->vring.driver,
                                 vring_packed->driver_event_dma_addr,
                                 dma_dev);
        }

        if (vring_packed->vring.desc) {
                vring_free_queue(vdev, vring_packed->ring_size_in_bytes,
                                 vring_packed->vring.desc,
                                 vring_packed->ring_dma_addr,
                                 dma_dev);
        }

        /* Driver-side bookkeeping arrays. */
        kfree(vring_packed->desc_state);
        kfree(vring_packed->desc_extra);
}

/*
 * Allocate the three memory areas of a packed ring: the descriptor ring
 * with @num entries, the driver event structure and the device event
 * structure. Each area is recorded in @vring_packed as soon as it has been
 * allocated, so a later failure can release the earlier ones through
 * vring_free_packed().
 *
 * Returns 0 on success or -ENOMEM if any allocation fails.
 */
static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed,
                                    struct virtio_device *vdev,
                                    u32 num, struct device *dma_dev)
{
        const size_t desc_bytes = num * sizeof(struct vring_packed_desc);
        const size_t event_bytes = sizeof(struct vring_packed_desc_event);
        struct vring_packed_desc_event *drv_event, *dev_event;
        dma_addr_t desc_dma, drv_event_dma, dev_event_dma;
        struct vring_packed_desc *desc_ring;

        /* Descriptor ring. */
        desc_ring = vring_alloc_queue(vdev, desc_bytes, &desc_dma,
                                      GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
                                      dma_dev);
        if (!desc_ring)
                goto err;

        vring_packed->vring.desc = desc_ring;
        vring_packed->ring_dma_addr = desc_dma;
        vring_packed->ring_size_in_bytes = desc_bytes;

        /* Driver event suppression structure. */
        drv_event = vring_alloc_queue(vdev, event_bytes, &drv_event_dma,
                                      GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
                                      dma_dev);
        if (!drv_event)
                goto err;

        vring_packed->vring.driver = drv_event;
        vring_packed->event_size_in_bytes = event_bytes;
        vring_packed->driver_event_dma_addr = drv_event_dma;

        /* Device event suppression structure. */
        dev_event = vring_alloc_queue(vdev, event_bytes, &dev_event_dma,
                                      GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
                                      dma_dev);
        if (!dev_event)
                goto err;

        vring_packed->vring.device = dev_event;
        vring_packed->device_event_dma_addr = dev_event_dma;

        vring_packed->vring.num = num;

        return 0;

err:
        /* Releases whichever areas were recorded before the failure. */
        vring_free_packed(vring_packed, vdev, dma_dev);
        return -ENOMEM;
}

/*
 * Allocate the zeroed desc_state array and the pre-linked desc_extra array
 * for a packed ring of vring_packed->vring.num entries.
 *
 * @vring_packed is only updated when both allocations succeed.
 *
 * Returns 0 on success or -ENOMEM on allocation failure.
 */
static int vring_alloc_state_extra_packed(struct vring_virtqueue_packed *vring_packed)
{
        u32 num = vring_packed->vring.num;
        struct vring_desc_state_packed *desc_state;
        struct vring_desc_extra *desc_extra;

        desc_state = kmalloc_array(num, sizeof(*desc_state), GFP_KERNEL);
        if (!desc_state)
                return -ENOMEM;

        memset(desc_state, 0, num * sizeof(*desc_state));

        desc_extra = vring_alloc_desc_extra(num);
        if (!desc_extra) {
                /* Undo the first allocation before reporting failure. */
                kfree(desc_state);
                return -ENOMEM;
        }

        vring_packed->desc_state = desc_state;
        vring_packed->desc_extra = desc_extra;

        return 0;
}

/*
 * Put the driver-side bookkeeping of a packed ring into its initial state.
 *
 * @callback: whether the queue has a callback. Without one, the event
 * flags shadow is set to VRING_PACKED_EVENT_FLAG_DISABLE and written to
 * the driver event structure; with one, the shadow is left at 0.
 */
static void virtqueue_vring_init_packed(struct vring_virtqueue_packed *vring_packed,
                                        bool callback)
{
        vring_packed->next_avail_idx = 0;
        vring_packed->avail_wrap_counter = 1;
        vring_packed->avail_used_flags = 1 << VRING_PACKED_DESC_F_AVAIL;

        if (callback) {
                vring_packed->event_flags_shadow = 0;
                return;
        }

        /* Nobody wants to hear about used buffers: tell the other side. */
        vring_packed->event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE;
        vring_packed->vring.driver->flags =
                cpu_to_le16(vring_packed->event_flags_shadow);
}

/*
 * Install a prepared packed ring into @vq: the whole structure is copied
 * into vq->packed and the free list head is reset to descriptor 0.
 */
static void virtqueue_vring_attach_packed(struct vring_virtqueue *vq,
                                          struct vring_virtqueue_packed *vring_packed)
{
        /* The free list starts at the first descriptor. */
        vq->free_head = 0;

        vq->packed = *vring_packed;
}

/*
 * Return an existing packed ring to its freshly created state without
 * reallocating it: clear the ring memory, then redo the generic and
 * packed-specific initialisation.
 */
static void virtqueue_reinit_packed(struct vring_virtqueue *vq)
{
        /*
         * The descriptor flags have to be cleared as well; see
         * is_used_desc_packed() for the reason.
         */
        memset(vq->packed.vring.desc, 0, vq->packed.ring_size_in_bytes);

        /* Both event suppression structures start from zero again. */
        memset(vq->packed.vring.driver, 0, vq->packed.event_size_in_bytes);
        memset(vq->packed.vring.device, 0, vq->packed.event_size_in_bytes);

        virtqueue_init(vq, vq->packed.vring.num);
        virtqueue_vring_init_packed(&vq->packed, !!vq->vq.callback);
}

/*
 * Create a packed-ring virtqueue whose ring memory is allocated here.
 *
 * Allocates the ring areas, the vring_virtqueue itself and the
 * per-descriptor state/extra arrays, initialises the queue and appends it
 * to vdev->vqs under vdev->vqs_list_lock.
 *
 * Note: @vring_align and @may_reduce_num are not used by this function.
 *
 * Returns the new virtqueue, or NULL if any allocation fails; whatever had
 * been allocated up to that point is released again.
 */
static struct virtqueue *vring_create_virtqueue_packed(
        unsigned int index,
        unsigned int num,
        unsigned int vring_align,
        struct virtio_device *vdev,
        bool weak_barriers,
        bool may_reduce_num,
        bool context,
        bool (*notify)(struct virtqueue *),
        void (*callback)(struct virtqueue *),
        const char *name,
        struct device *dma_dev)
{
        struct vring_virtqueue_packed vring_packed = {};
        struct vring_virtqueue *vq;
        int err;

        /* On failure this helper has already released its own allocations. */
        if (vring_alloc_queue_packed(&vring_packed, vdev, num, dma_dev))
                goto err_ring;

        vq = kmalloc(sizeof(*vq), GFP_KERNEL);
        if (!vq)
                goto err_vq;

        vq->vq.callback = callback;
        vq->vq.vdev = vdev;
        vq->vq.name = name;
        vq->vq.index = index;
        vq->vq.reset = false;
        /* The ring memory was allocated above, so this queue owns it. */
        vq->we_own_ring = true;
        vq->notify = notify;
        vq->weak_barriers = weak_barriers;
        /* With hardened notifications the queue starts out marked broken. */
#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
        vq->broken = true;
#else
        vq->broken = false;
#endif
        vq->packed_ring = true;
        vq->dma_dev = dma_dev;
        vq->use_dma_api = vring_use_dma_api(vdev);
        vq->premapped = false;
        vq->do_unmap = vq->use_dma_api;

        /* Indirect descriptors are not used when a context is requested. */
        vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
                !context;
        vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);

        /* VIRTIO_F_ORDER_PLATFORM overrides the caller's weak_barriers. */
        if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
                vq->weak_barriers = false;

        err = vring_alloc_state_extra_packed(&vring_packed);
        if (err)
                goto err_state_extra;

        virtqueue_vring_init_packed(&vring_packed, !!callback);

        virtqueue_init(vq, num);
        virtqueue_vring_attach_packed(vq, &vring_packed);

        /* Make the new queue visible on the device's queue list. */
        spin_lock(&vdev->vqs_list_lock);
        list_add_tail(&vq->vq.list, &vdev->vqs);
        spin_unlock(&vdev->vqs_list_lock);
        return &vq->vq;

        /* Unwind in reverse order of allocation. */
err_state_extra:
        kfree(vq);
err_vq:
        vring_free_packed(&vring_packed, vdev, dma_dev);
err_ring:
        return NULL;
}

/*
 * Replace the packed ring of @_vq with a newly allocated one of @num
 * entries.
 *
 * The new ring is fully allocated before the old one is freed. If either
 * allocation fails, the existing ring is kept and reinitialised in place.
 *
 * Returns 0 on success or -ENOMEM if the new ring could not be allocated.
 */
static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num)
{
        struct vring_virtqueue_packed new_ring = {};
        struct vring_virtqueue *vq = to_vvq(_vq);
        struct virtio_device *vdev = _vq->vdev;

        if (vring_alloc_queue_packed(&new_ring, vdev, num, vring_dma_dev(vq)))
                goto keep_old_ring;

        if (vring_alloc_state_extra_packed(&new_ring)) {
                /* Drop the ring areas that were already allocated. */
                vring_free_packed(&new_ring, vdev, vring_dma_dev(vq));
                goto keep_old_ring;
        }

        /* Everything new is in place: the old ring can go now. */
        vring_free(&vq->vq);

        virtqueue_vring_init_packed(&new_ring, !!vq->vq.callback);

        virtqueue_init(vq, new_ring.vring.num);
        virtqueue_vring_attach_packed(vq, &new_ring);

        return 0;

keep_old_ring:
        virtqueue_reinit_packed(vq);
        return -ENOMEM;
}

/*
 * Stop a virtqueue through the transport and hand every buffer still
 * attached to it to @recycle.
 *
 * Returns 0 on success, -EPERM if the ring is not owned by the virtqueue,
 * -ENOENT if the transport lacks either of the disable_vq_and_reset /
 * enable_vq_after_reset operations, or the error returned by
 * disable_vq_and_reset().
 */
static int virtqueue_disable_and_recycle(struct virtqueue *_vq,
                                         void (*recycle)(struct virtqueue *vq, void *buf))
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        struct virtio_device *vdev = vq->vq.vdev;
        void *unused;
        int ret;

        if (!vq->we_own_ring)
                return -EPERM;

        /* Both halves of the reset cycle must exist before starting one. */
        if (!vdev->config->disable_vq_and_reset ||
            !vdev->config->enable_vq_after_reset)
                return -ENOENT;

        ret = vdev->config->disable_vq_and_reset(_vq);
        if (ret)
                return ret;

        for (unused = virtqueue_detach_unused_buf(_vq); unused != NULL;
             unused = virtqueue_detach_unused_buf(_vq))
                recycle(_vq, unused);

        return 0;
}

/*
 * Ask the transport to re-enable a virtqueue after a reset.
 *
 * Any non-zero result from enable_vq_after_reset() is reported as -EBUSY;
 * otherwise 0 is returned.
 */
static int virtqueue_enable_after_reset(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        struct virtio_device *vdev = vq->vq.vdev;

        return vdev->config->enable_vq_after_reset(_vq) ? -EBUSY : 0;
}

/*
 * Generic functions and exported symbols.
 */

/*
 * Common entry point for adding a buffer: forwards all arguments to the
 * packed or the split implementation, depending on the ring layout of the
 * virtqueue, and returns that implementation's result.
 */
static inline int virtqueue_add(struct virtqueue *_vq,
                                struct scatterlist *sgs[],
                                unsigned int total_sg,
                                unsigned int out_sgs,
                                unsigned int in_sgs,
                                void *data,
                                void *ctx,
                                gfp_t gfp)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (vq->packed_ring)
                return virtqueue_add_packed(_vq, sgs, total_sg,
                                            out_sgs, in_sgs, data, ctx, gfp);

        return virtqueue_add_split(_vq, sgs, total_sg,
                                   out_sgs, in_sgs, data, ctx, gfp);
}

/**
 * virtqueue_add_sgs - expose buffers to other end
 * @_vq: the struct virtqueue we're talking about.
 * @sgs: array of terminated scatterlists.
 * @out_sgs: the number of scatterlists readable by other side
 * @in_sgs: the number of scatterlists which are writable (after readable ones)
 * @data: the token identifying the buffer.
 * @gfp: how to do memory allocations (if necessary).
 *
 * The entries of all @out_sgs + @in_sgs lists are counted first and the
 * total is passed on together with the lists; no context is attached.
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
 */
int virtqueue_add_sgs(struct virtqueue *_vq,
                      struct scatterlist *sgs[],
                      unsigned int out_sgs,
                      unsigned int in_sgs,
                      void *data,
                      gfp_t gfp)
{
        const unsigned int nr_lists = out_sgs + in_sgs;
        unsigned int total_sg = 0;
        unsigned int list;

        /* Walk each list up to its terminator to learn the entry count. */
        for (list = 0; list < nr_lists; list++) {
                struct scatterlist *entry = sgs[list];

                while (entry) {
                        total_sg++;
                        entry = sg_next(entry);
                }
        }

        return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs,
                             data, NULL, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_sgs);

/**
 * virtqueue_add_outbuf - expose output buffers to other end
 * @vq: the struct virtqueue we're talking about.
 * @sg: scatterlist (must be well-formed and terminated!)
 * @num: the number of entries in @sg readable by other side
 * @data: the token identifying the buffer.
 * @gfp: how to do memory allocations (if necessary).
 *
 * The scatterlist is submitted as a single readable list with no writable
 * lists and no context.
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
 */
int virtqueue_add_outbuf(struct virtqueue *vq,
                         struct scatterlist *sg, unsigned int num,
                         void *data,
                         gfp_t gfp)
{
        /* One-element list array: a single device-readable scatterlist. */
        struct scatterlist *sgs[] = { sg };

        return virtqueue_add(vq, sgs, num, 1, 0, data, NULL, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);

/**
 * virtqueue_add_inbuf - expose input buffers to other end
 * @vq: the struct virtqueue we're talking about.
 * @sg: scatterlist (must be well-formed and terminated!)
 * @num: the number of entries in @sg writable by other side
 * @data: the token identifying the buffer.
 * @gfp: how to do memory allocations (if necessary).
 *
 * The scatterlist is submitted as a single writable list with no readable
 * lists and no context.
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
 */
int virtqueue_add_inbuf(struct virtqueue *vq,
                        struct scatterlist *sg, unsigned int num,
                        void *data,
                        gfp_t gfp)
{
        /* One-element list array: a single device-writable scatterlist. */
        struct scatterlist *sgs[] = { sg };

        return virtqueue_add(vq, sgs, num, 0, 1, data, NULL, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);

/**
 * virtqueue_add_inbuf_ctx - expose input buffers to other end
 * @vq: the struct virtqueue we're talking about.
 * @sg: scatterlist (must be well-formed and terminated!)
 * @num: the number of entries in @sg writable by other side
 * @data: the token identifying the buffer.
 * @ctx: extra context for the token
 * @gfp: how to do memory allocations (if necessary).
 *
 * Same submission as virtqueue_add_inbuf(), except that @ctx is passed
 * along with the token.
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
 */
int virtqueue_add_inbuf_ctx(struct virtqueue *vq,
                        struct scatterlist *sg, unsigned int num,
                        void *data,
                        void *ctx,
                        gfp_t gfp)
{
        /* One-element list array: a single device-writable scatterlist. */
        struct scatterlist *sgs[] = { sg };

        return virtqueue_add(vq, sgs, num, 0, 1, data, ctx, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);

/**
 * virtqueue_dma_dev - get the dma dev
 * @_vq: the struct virtqueue we're talking about.
 *
 * Returns the dma dev, which can be used with the DMA API, or NULL when
 * the virtqueue does not use the DMA API.
 */
struct device *virtqueue_dma_dev(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        return vq->use_dma_api ? vring_dma_dev(vq) : NULL;
}
EXPORT_SYMBOL_GPL(virtqueue_dma_dev);

/**
 * virtqueue_kick_prepare - first half of split virtqueue_kick call.
 * @_vq: the struct virtqueue
 *
 * Instead of virtqueue_kick(), you can do:
 *        if (virtqueue_kick_prepare(vq))
 *                virtqueue_notify(vq);
 *
 * This is sometimes useful because the virtqueue_kick_prepare() needs
 * to be serialized, but the actual virtqueue_notify() call does not.
 *
 * Returns the result of the packed or split implementation, whichever
 * matches the ring layout of @_vq.
 */
bool virtqueue_kick_prepare(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (vq->packed_ring)
                return virtqueue_kick_prepare_packed(_vq);

        return virtqueue_kick_prepare_split(_vq);
}
EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);

/**
 * virtqueue_notify - second half of split virtqueue_kick call.
 * @_vq: the struct virtqueue
 *
 * This does not need to be serialized.
 *
 * A failed notification marks the queue as broken.
 *
 * Returns false if host notify failed or queue is broken, otherwise true.
 */
bool virtqueue_notify(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (unlikely(vq->broken))
                return false;

        /* Prod the other side so it learns about the changes. */
        if (vq->notify(_vq))
                return true;

        vq->broken = true;
        return false;
}
EXPORT_SYMBOL_GPL(virtqueue_notify);

/**
 * virtqueue_kick - update after add_buf
 * @vq: the struct virtqueue
 *
 * After one or more virtqueue_add_* calls, invoke this to kick
 * the other side.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 *
 * Returns false if kick failed, otherwise true. When
 * virtqueue_kick_prepare() reports that no notification is needed, none
 * is sent and true is returned.
 */
bool virtqueue_kick(struct virtqueue *vq)
{
        /* Short-circuit: only notify when the prepare step asks for it. */
        return !virtqueue_kick_prepare(vq) || virtqueue_notify(vq);
}
EXPORT_SYMBOL_GPL(virtqueue_kick);

/**
 * virtqueue_get_buf_ctx - get the next used buffer
 * @_vq: the struct virtqueue we're talking about.
 * @len: the length written into the buffer
 * @ctx: extra context for the token
 *
 * If the device wrote data into the buffer, @len will be set to the
 * amount written.  This means you don't need to clear the buffer
 * beforehand to ensure there's no data leakage in the case of short
 * writes.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 *
 * Returns NULL if there are no used buffers, or the "data" token
 * handed to virtqueue_add_*().
 */
void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
                            void **ctx)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        /* Dispatch on the ring layout of this virtqueue. */
        if (vq->packed_ring)
                return virtqueue_get_buf_ctx_packed(_vq, len, ctx);

        return virtqueue_get_buf_ctx_split(_vq, len, ctx);
}
EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);

/**
 * virtqueue_get_buf - get the next used buffer, without context
 * @_vq: the struct virtqueue we're talking about.
 * @len: the length written into the buffer
 *
 * Calls virtqueue_get_buf_ctx() with a NULL context pointer and returns
 * its result.
 */
void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
{
        void *token = virtqueue_get_buf_ctx(_vq, len, NULL);

        return token;
}
EXPORT_SYMBOL_GPL(virtqueue_get_buf);
/**
 * virtqueue_disable_cb - disable callbacks
 * @_vq: the struct virtqueue we're talking about.
 *
 * Note that this is not necessarily synchronous, hence unreliable and only
 * useful as an optimization.
 *
 * Unlike other operations, this need not be serialized.
 */
void virtqueue_disable_cb(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        /* Dispatch on the ring layout of this virtqueue. */
        if (!vq->packed_ring) {
                virtqueue_disable_cb_split(_vq);
                return;
        }

        virtqueue_disable_cb_packed(_vq);
}
EXPORT_SYMBOL_GPL(virtqueue_disable_cb);

/**
 * virtqueue_enable_cb_prepare - restart callbacks after disable_cb
 * @_vq: the struct virtqueue we're talking about.
 *
 * This re-enables callbacks; it returns current queue state
 * in an opaque unsigned value. This value should be later tested by
 * virtqueue_poll, to detect a possible race between the driver checking for
 * more work, and enabling callbacks.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
unsigned int virtqueue_enable_cb_prepare(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        /* The flag is only written when it is currently set. */
        if (vq->event_triggered)
                vq->event_triggered = false;

        if (vq->packed_ring)
                return virtqueue_enable_cb_prepare_packed(_vq);

        return virtqueue_enable_cb_prepare_split(_vq);
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);

/**
 * virtqueue_poll - query pending used buffers
 * @_vq: the struct virtqueue we're talking about.
 * @last_used_idx: virtqueue state (from call to virtqueue_enable_cb_prepare).
 *
 * Returns "true" if there are pending used buffers in the queue.
 * A broken queue always reports "false".
 *
 * This does not need to be serialized.
 */
bool virtqueue_poll(struct virtqueue *_vq, unsigned int last_used_idx)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (unlikely(vq->broken))
                return false;

        /* Full barrier before the ring is inspected. */
        virtio_mb(vq->weak_barriers);

        if (vq->packed_ring)
                return virtqueue_poll_packed(_vq, last_used_idx);

        return virtqueue_poll_split(_vq, last_used_idx);
}
EXPORT_SYMBOL_GPL(virtqueue_poll);

/**
 * virtqueue_enable_cb - restart callbacks after disable_cb.
 * @_vq: the struct virtqueue we're talking about.
 *
 * This re-enables callbacks; it returns "false" if there are pending
 * buffers in the queue, to detect a possible race between the driver
 * checking for more work, and enabling callbacks.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
bool virtqueue_enable_cb(struct virtqueue *_vq)
{
        unsigned int last_used_idx = virtqueue_enable_cb_prepare(_vq);

        return !virtqueue_poll(_vq, last_used_idx);
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb);

/**
 * virtqueue_enable_cb_delayed - restart callbacks after disable_cb.
 * @_vq: the struct virtqueue we're talking about.
 *
 * This re-enables callbacks but hints to the other side to delay
 * interrupts until most of the available buffers have been processed;
 * it returns "false" if there are many pending buffers in the queue,
 * to detect a possible race between the driver checking for more work,
 * and enabling callbacks.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        /* The flag is only written when it is currently set. */
        if (vq->event_triggered)
                vq->event_triggered = false;

        if (vq->packed_ring)
                return virtqueue_enable_cb_delayed_packed(_vq);

        return virtqueue_enable_cb_delayed_split(_vq);
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);

/**
 * virtqueue_detach_unused_buf - detach first unused buffer
 * @_vq: the struct virtqueue we're talking about.
 *
 * Returns NULL or the "data" token handed to virtqueue_add_*().
 * This is not valid on an active queue; it is useful for device
 * shutdown or the reset queue.
 */
void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        /* Dispatch on the ring layout of this virtqueue. */
        if (vq->packed_ring)
                return virtqueue_detach_unused_buf_packed(_vq);

        return virtqueue_detach_unused_buf_split(_vq);
}
EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);

/*
 * Report the result of more_used_packed() or more_used_split(), whichever
 * matches the ring layout of @vq.
 */
static inline bool more_used(const struct vring_virtqueue *vq)
{
        if (vq->packed_ring)
                return more_used_packed(vq);

        return more_used_split(vq);
}

/**
 * vring_interrupt - notify a virtqueue on an interrupt
 * @irq: the IRQ number (ignored)
 * @_vq: the struct virtqueue to notify
 *
 * Calls the callback function of @_vq to process the virtqueue
 * notification.
 *
 * Returns IRQ_NONE when the queue has no used buffers, and also when the
 * queue is broken and CONFIG_VIRTIO_HARDEN_NOTIFICATION is enabled.
 * Returns IRQ_HANDLED otherwise, including for a broken queue without
 * that option, in which case the callback is not invoked.
 */
irqreturn_t vring_interrupt(int irq, void *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        /* Nothing was used by the other side: not our interrupt. */
        if (!more_used(vq)) {
                pr_debug("virtqueue interrupt with no work for %p\n", vq);
                return IRQ_NONE;
        }

        /* A broken queue never reaches the callback. */
        if (unlikely(vq->broken)) {
#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
                dev_warn_once(&vq->vq.vdev->dev,
                              "virtio vring IRQ raised before DRIVER_OK");
                return IRQ_NONE;
#else
                return IRQ_HANDLED;
#endif
        }

        /* Just a hint for performance: so it's ok that this can be racy! */
        if (vq->event)
                vq->event_triggered = true;

        pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
        /* Queues created without a callback are simply acknowledged. */
        if (vq->vq.callback)
                vq->vq.callback(&vq->vq);

        return IRQ_HANDLED;
}
EXPORT_SYMBOL_GPL(vring_interrupt);

/*
 * Create a split-ring virtqueue on top of a ring described by
 * @vring_split. Only available for split ring: NULL is returned when
 * VIRTIO_F_RING_PACKED has been negotiated.
 *
 * The ring memory itself is not allocated here (we_own_ring is false);
 * only the vring_virtqueue and the per-descriptor state/extra arrays are.
 * The new queue is appended to vdev->vqs under vdev->vqs_list_lock.
 *
 * Returns the new virtqueue, or NULL on failure.
 */
static struct virtqueue *__vring_new_virtqueue(unsigned int index,
                                               struct vring_virtqueue_split *vring_split,
                                               struct virtio_device *vdev,
                                               bool weak_barriers,
                                               bool context,
                                               bool (*notify)(struct virtqueue *),
                                               void (*callback)(struct virtqueue *),
                                               const char *name,
                                               struct device *dma_dev)
{
        struct vring_virtqueue *vq;
        int err;

        if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED))
                return NULL;

        vq = kmalloc(sizeof(*vq), GFP_KERNEL);
        if (!vq)
                return NULL;

        vq->packed_ring = false;
        vq->vq.callback = callback;
        vq->vq.vdev = vdev;
        vq->vq.name = name;
        vq->vq.index = index;
        vq->vq.reset = false;
        /* The ring was handed in by the caller, so it is not ours to free. */
        vq->we_own_ring = false;
        vq->notify = notify;
        vq->weak_barriers = weak_barriers;
        /* With hardened notifications the queue starts out marked broken. */
#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION
        vq->broken = true;
#else
        vq->broken = false;
#endif
        vq->dma_dev = dma_dev;
        vq->use_dma_api = vring_use_dma_api(vdev);
        vq->premapped = false;
        vq->do_unmap = vq->use_dma_api;

        /* Indirect descriptors are not used when a context is requested. */
        vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
                !context;
        vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);

        /* VIRTIO_F_ORDER_PLATFORM overrides the caller's weak_barriers. */
        if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
                vq->weak_barriers = false;

        err = vring_alloc_state_extra_split(vring_split);
        if (err) {
                kfree(vq);
                return NULL;
        }

        virtqueue_vring_init_split(vring_split, vq);

        virtqueue_init(vq, vring_split->vring.num);
        virtqueue_vring_attach_split(vq, vring_split);

        /* Make the new queue visible on the device's queue list. */
        spin_lock(&vdev->vqs_list_lock);
        list_add_tail(&vq->vq.list, &vdev->vqs);
        spin_unlock(&vdev->vqs_list_lock);
        return &vq->vq;
}

/*
 * Create a virtqueue, choosing the packed or split implementation from the
 * VIRTIO_F_RING_PACKED feature bit. The parent of the virtio device is
 * passed on as the DMA device. Returns whatever the chosen implementation
 * returns.
 */
struct virtqueue *vring_create_virtqueue(
        unsigned int index,
        unsigned int num,
        unsigned int vring_align,
        struct virtio_device *vdev,
        bool weak_barriers,
        bool may_reduce_num,
        bool context,
        bool (*notify)(struct virtqueue *),
        void (*callback)(struct virtqueue *),
        const char *name)
{
        struct device *dma_dev = vdev->dev.parent;

        if (!virtio_has_feature(vdev, VIRTIO_F_RING_PACKED))
                return vring_create_virtqueue_split(index, num, vring_align,
                                vdev, weak_barriers, may_reduce_num,
                                context, notify, callback, name, dma_dev);

        return vring_create_virtqueue_packed(index, num, vring_align,
                        vdev, weak_barriers, may_reduce_num,
                        context, notify, callback, name, dma_dev);
}
EXPORT_SYMBOL_GPL(vring_create_virtqueue);

/*
 * Create a virtqueue with an explicitly supplied DMA device, choosing the
 * packed or split implementation from the VIRTIO_F_RING_PACKED feature
 * bit. Returns whatever the chosen implementation returns.
 */
struct virtqueue *vring_create_virtqueue_dma(
        unsigned int index,
        unsigned int num,
        unsigned int vring_align,
        struct virtio_device *vdev,
        bool weak_barriers,
        bool may_reduce_num,
        bool context,
        bool (*notify)(struct virtqueue *),
        void (*callback)(struct virtqueue *),
        const char *name,
        struct device *dma_dev)
{
        if (!virtio_has_feature(vdev, VIRTIO_F_RING_PACKED))
                return vring_create_virtqueue_split(index, num, vring_align,
                                vdev, weak_barriers, may_reduce_num,
                                context, notify, callback, name, dma_dev);

        return vring_create_virtqueue_packed(index, num, vring_align,
                        vdev, weak_barriers, may_reduce_num,
                        context, notify, callback, name, dma_dev);
}
EXPORT_SYMBOL_GPL(vring_create_virtqueue_dma);

/**
 * virtqueue_resize - resize the vring of vq
 * @_vq: the struct virtqueue we're talking about.
 * @num: new ring num
 * @recycle: callback to recycle unused buffers
 *
 * When a new vring really has to be created, the current vq is first put
 * into the reset state and @recycle is called for every buffer that is no
 * longer used. The old vring is only released after the new vring has been
 * created successfully. The vq is re-enabled afterwards whether or not the
 * resize itself succeeded.
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error.
 * 0: success.
 * -ENOMEM: Failed to allocate a new ring, fall back to the original ring size.
 *  vq can still work normally
 * -EBUSY: Failed to sync with device, vq may not work properly
 * -ENOENT: Transport or device not supported
 * -E2BIG/-EINVAL: num error
 * -EPERM: Operation not permitted
 *
 * If both the resize and the re-enable fail, the re-enable error (-EBUSY)
 * is reported, since it is the one after which the vq may not work.
 */
int virtqueue_resize(struct virtqueue *_vq, u32 num,
                     void (*recycle)(struct virtqueue *vq, void *buf))
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        int err, err_reset;

        if (num > vq->vq.num_max)
                return -E2BIG;

        if (!num)
                return -EINVAL;

        /* Already at the requested size: nothing to do. */
        if ((vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num) == num)
                return 0;

        err = virtqueue_disable_and_recycle(_vq, recycle);
        if (err)
                return err;

        if (vq->packed_ring)
                err = virtqueue_resize_packed(_vq, num);
        else
                err = virtqueue_resize_split(_vq, num);

        /*
         * The queue has to be re-enabled even when the resize failed, but
         * the resize result must not be lost: without it a failed
         * allocation would be reported to the caller as success.
         */
        err_reset = virtqueue_enable_after_reset(_vq);
        if (err_reset)
                return err_reset;

        return err;
}
EXPORT_SYMBOL_GPL(virtqueue_resize);

/**
 * virtqueue_set_dma_premapped - set the vring premapped mode
 * @_vq: the struct virtqueue we're talking about.
 *
 * Enable the premapped mode of the vq.
 *
 * The vring in premapped mode does not do dma internally, so the driver must
 * do dma mapping in advance. The driver must pass the dma_address through
 * dma_address of scatterlist. When the driver got a used buffer from
 * the vring, it has to unmap the dma address.
 *
 * This function must be called immediately after creating the vq, or after vq
 * reset, and before adding any buffers to it.
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error.
 * 0: success.
 * -EINVAL: too late to enable premapped mode, the vq already contains buffers.
 */
int virtqueue_set_dma_premapped(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        int ret = -EINVAL;
        u32 ring_num;

        START_USE(vq);

        if (vq->packed_ring)
                ring_num = vq->packed.vring.num;
        else
                ring_num = vq->split.vring.num;

        /* Only switch modes while every descriptor is still free. */
        if (ring_num == vq->vq.num_free) {
                vq->premapped = true;
                vq->do_unmap = false;
                ret = 0;
        }

        END_USE(vq);

        return ret;
}
EXPORT_SYMBOL_GPL(virtqueue_set_dma_premapped);

/**
 * virtqueue_reset - detach and recycle all unused buffers
 * @_vq: the struct virtqueue we're talking about.
 * @recycle: callback to recycle unused buffers
 *
 * The vq is disabled, its unused buffers are passed to @recycle, the
 * existing ring is reinitialised in place and the vq is enabled again.
 *
 * Caller must ensure we don't call this with other virtqueue operations
 * at the same time (except where noted).
 *
 * Returns zero or a negative error.
 * 0: success.
 * -EBUSY: Failed to sync with device, vq may not work properly
 * -ENOENT: Transport or device not supported
 * -EPERM: Operation not permitted
 */
int virtqueue_reset(struct virtqueue *_vq,
                    void (*recycle)(struct virtqueue *vq, void *buf))
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        int ret = virtqueue_disable_and_recycle(_vq, recycle);

        if (ret)
                return ret;

        /* Reinitialise the ring according to its layout. */
        if (!vq->packed_ring)
                virtqueue_reinit_split(vq);
        else
                virtqueue_reinit_packed(vq);

        return virtqueue_enable_after_reset(_vq);
}
EXPORT_SYMBOL_GPL(virtqueue_reset);

/*
 * Create a split-ring virtqueue on top of caller-provided ring memory
 * (@pages). Only available for split ring: NULL is returned when
 * VIRTIO_F_RING_PACKED has been negotiated. The parent of the virtio
 * device is passed on as the DMA device.
 */
struct virtqueue *vring_new_virtqueue(unsigned int index,
                                      unsigned int num,
                                      unsigned int vring_align,
                                      struct virtio_device *vdev,
                                      bool weak_barriers,
                                      bool context,
                                      void *pages,
                                      bool (*notify)(struct virtqueue *vq),
                                      void (*callback)(struct virtqueue *vq),
                                      const char *name)
{
        struct vring_virtqueue_split vring_split = {};
        struct device *dma_dev = vdev->dev.parent;

        if (!virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
                /* Lay the split ring out over the caller's memory. */
                vring_init(&vring_split.vring, num, pages, vring_align);

                return __vring_new_virtqueue(index, &vring_split, vdev,
                                             weak_barriers, context, notify,
                                             callback, name, dma_dev);
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(vring_new_virtqueue);

/*
 * Release the memory attached to a virtqueue.
 *
 * Ring memory is only released when the vq owns it (we_own_ring). For a
 * packed ring the descriptor state arrays are also only freed in that case,
 * whereas for a split ring they are always freed.
 */
static void vring_free(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (vq->packed_ring) {
                if (!vq->we_own_ring)
                        return;

                /* Descriptor ring. */
                vring_free_queue(vq->vq.vdev,
                                 vq->packed.ring_size_in_bytes,
                                 vq->packed.vring.desc,
                                 vq->packed.ring_dma_addr,
                                 vring_dma_dev(vq));

                /* Driver event suppression area. */
                vring_free_queue(vq->vq.vdev,
                                 vq->packed.event_size_in_bytes,
                                 vq->packed.vring.driver,
                                 vq->packed.driver_event_dma_addr,
                                 vring_dma_dev(vq));

                /* Device event suppression area. */
                vring_free_queue(vq->vq.vdev,
                                 vq->packed.event_size_in_bytes,
                                 vq->packed.vring.device,
                                 vq->packed.device_event_dma_addr,
                                 vring_dma_dev(vq));

                kfree(vq->packed.desc_state);
                kfree(vq->packed.desc_extra);
                return;
        }

        if (vq->we_own_ring)
                vring_free_queue(vq->vq.vdev,
                                 vq->split.queue_size_in_bytes,
                                 vq->split.vring.desc,
                                 vq->split.queue_dma_addr,
                                 vring_dma_dev(vq));

        kfree(vq->split.desc_state);
        kfree(vq->split.desc_extra);
}

void vring_del_virtqueue(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        spin_lock(&vq->vq.vdev->vqs_list_lock);
        list_del(&_vq->list);
        spin_unlock(&vq->vq.vdev->vqs_list_lock);

        vring_free(_vq);

        kfree(vq);
}
EXPORT_SYMBOL_GPL(vring_del_virtqueue);

u32 vring_notification_data(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        u16 next;

        if (vq->packed_ring)
                next = (vq->packed.next_avail_idx &
                                ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR))) |
                        vq->packed.avail_wrap_counter <<
                                VRING_PACKED_EVENT_F_WRAP_CTR;
        else
                next = vq->split.avail_idx_shadow;

        return next << 16 | _vq->index;
}
EXPORT_SYMBOL_GPL(vring_notification_data);

/*
 * Manipulates transport-specific feature bits: every bit in the range
 * [VIRTIO_TRANSPORT_F_START, VIRTIO_TRANSPORT_F_END) that is not listed
 * below is cleared on @vdev.
 */
void vring_transport_features(struct virtio_device *vdev)
{
        unsigned int bit;

        for (bit = VIRTIO_TRANSPORT_F_START; bit < VIRTIO_TRANSPORT_F_END;
             bit++) {
                switch (bit) {
                case VIRTIO_RING_F_INDIRECT_DESC:
                case VIRTIO_RING_F_EVENT_IDX:
                case VIRTIO_F_VERSION_1:
                case VIRTIO_F_ACCESS_PLATFORM:
                case VIRTIO_F_RING_PACKED:
                case VIRTIO_F_ORDER_PLATFORM:
                case VIRTIO_F_NOTIFICATION_DATA:
                        /* Known bit: leave it untouched. */
                        break;
                default:
                        /* We don't understand this bit. */
                        __virtio_clear_bit(vdev, bit);
                        break;
                }
        }
}
EXPORT_SYMBOL_GPL(vring_transport_features);

/**
 * virtqueue_get_vring_size - return the size of the virtqueue's vring
 * @_vq: the struct virtqueue containing the vring of interest.
 *
 * Returns the number of entries of the packed or split vring, whichever the
 * queue uses.  This is mainly used for boasting to userspace.  Unlike other
 * operations, this need not be serialized.
 */
unsigned int virtqueue_get_vring_size(const struct virtqueue *_vq)
{
        const struct vring_virtqueue *vq = to_vvq(_vq);

        if (vq->packed_ring)
                return vq->packed.vring.num;

        return vq->split.vring.num;
}
EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);

/*
 * This function should only be called by the core, not directly by the driver.
 */
void __virtqueue_break(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        /* Pairs with READ_ONCE() in virtqueue_is_broken(). */
        WRITE_ONCE(vq->broken, true);
}
EXPORT_SYMBOL_GPL(__virtqueue_break);

/*
 * This function should only be called by the core, not directly by the driver.
 */
void __virtqueue_unbreak(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        /* Pairs with READ_ONCE() in virtqueue_is_broken(). */
        WRITE_ONCE(vq->broken, false);
}
EXPORT_SYMBOL_GPL(__virtqueue_unbreak);

bool virtqueue_is_broken(const struct virtqueue *_vq)
{
        const struct vring_virtqueue *vq = to_vvq(_vq);

        return READ_ONCE(vq->broken);
}
EXPORT_SYMBOL_GPL(virtqueue_is_broken);

/*
 * Flag every virtqueue of @dev as broken.
 *
 * This should prevent the device from being used, allowing drivers to
 * recover.  You may need to grab appropriate locks to flush.
 */
void virtio_break_device(struct virtio_device *dev)
{
        struct virtqueue *cur;

        spin_lock(&dev->vqs_list_lock);
        list_for_each_entry(cur, &dev->vqs, list) {
                /* Pairs with READ_ONCE() in virtqueue_is_broken(). */
                WRITE_ONCE(to_vvq(cur)->broken, true);
        }
        spin_unlock(&dev->vqs_list_lock);
}
EXPORT_SYMBOL_GPL(virtio_break_device);

/*
 * Clear the broken flag on every virtqueue of @dev.
 *
 * This should allow the device to be used by the driver. You may
 * need to grab appropriate locks to flush the write to
 * vq->broken. This should only be used in some specific cases, e.g.
 * probing and restoring. This function should only be called by the
 * core, not directly by the driver.
 */
void __virtio_unbreak_device(struct virtio_device *dev)
{
        struct virtqueue *cur;

        spin_lock(&dev->vqs_list_lock);
        list_for_each_entry(cur, &dev->vqs, list) {
                /* Pairs with READ_ONCE() in virtqueue_is_broken(). */
                WRITE_ONCE(to_vvq(cur)->broken, false);
        }
        spin_unlock(&dev->vqs_list_lock);
}
EXPORT_SYMBOL_GPL(__virtio_unbreak_device);

/*
 * Return the DMA address of the ring: ring_dma_addr for a packed ring,
 * queue_dma_addr for a split ring. BUGs if the vq does not own its ring.
 */
dma_addr_t virtqueue_get_desc_addr(const struct virtqueue *_vq)
{
        const struct vring_virtqueue *vq = to_vvq(_vq);

        BUG_ON(!vq->we_own_ring);

        return vq->packed_ring ? vq->packed.ring_dma_addr
                               : vq->split.queue_dma_addr;
}
EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);

/*
 * Return the DMA address of the driver-side area: the driver event area of
 * a packed ring, or the avail ring of a split ring (queue base plus the
 * offset of avail from desc). BUGs if the vq does not own its ring.
 */
dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *_vq)
{
        const struct vring_virtqueue *vq = to_vvq(_vq);
        const char *ring_base, *avail;

        BUG_ON(!vq->we_own_ring);

        if (vq->packed_ring)
                return vq->packed.driver_event_dma_addr;

        ring_base = (const char *)vq->split.vring.desc;
        avail = (const char *)vq->split.vring.avail;

        return vq->split.queue_dma_addr + (avail - ring_base);
}
EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);

/*
 * Return the DMA address of the device-side area: the device event area of
 * a packed ring, or the used ring of a split ring (queue base plus the
 * offset of used from desc). BUGs if the vq does not own its ring.
 */
dma_addr_t virtqueue_get_used_addr(const struct virtqueue *_vq)
{
        const struct vring_virtqueue *vq = to_vvq(_vq);
        const char *ring_base, *used;

        BUG_ON(!vq->we_own_ring);

        if (vq->packed_ring)
                return vq->packed.device_event_dma_addr;

        ring_base = (const char *)vq->split.vring.desc;
        used = (const char *)vq->split.vring.used;

        return vq->split.queue_dma_addr + (used - ring_base);
}
EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);

/* Only available for split ring */
const struct vring *virtqueue_get_vring(const struct virtqueue *vq)
{
        return &to_vvq(vq)->split.vring;
}
EXPORT_SYMBOL_GPL(virtqueue_get_vring);

/**
 * virtqueue_dma_map_single_attrs - map DMA for _vq
 * @_vq: the struct virtqueue we're talking about.
 * @ptr: the pointer of the buffer to do dma
 * @size: the size of the buffer to do dma
 * @dir: DMA direction
 * @attrs: DMA Attrs
 *
 * The caller calls this to do dma mapping in advance. The DMA address can be
 * passed to this _vq when it is in pre-mapped mode.
 *
 * When the vq does not use the DMA API, the physical address of @ptr is
 * returned instead of a mapping.
 *
 * return DMA address. Caller should check that by virtqueue_dma_mapping_error().
 */
dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr,
                                          size_t size,
                                          enum dma_data_direction dir,
                                          unsigned long attrs)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (vq->use_dma_api)
                return dma_map_single_attrs(vring_dma_dev(vq), ptr, size,
                                            dir, attrs);

        return (dma_addr_t)virt_to_phys(ptr);
}
EXPORT_SYMBOL_GPL(virtqueue_dma_map_single_attrs);

/**
 * virtqueue_dma_unmap_single_attrs - unmap DMA for _vq
 * @_vq: the struct virtqueue we're talking about.
 * @addr: the dma address to unmap
 * @size: the size of the buffer
 * @dir: DMA direction
 * @attrs: DMA Attrs
 *
 * Unmap the address that is mapped by the virtqueue_dma_map_* APIs.
 * Nothing is done when the vq does not use the DMA API.
 */
void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr,
                                      size_t size, enum dma_data_direction dir,
                                      unsigned long attrs)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (vq->use_dma_api)
                dma_unmap_single_attrs(vring_dma_dev(vq), addr, size, dir,
                                       attrs);
}
EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_single_attrs);

/**
 * virtqueue_dma_mapping_error - check dma address
 * @_vq: the struct virtqueue we're talking about.
 * @addr: DMA address
 *
 * Returns 0 if the DMA address is valid (always the case when the vq does
 * not use the DMA API); any other value means the address is invalid.
 */
int virtqueue_dma_mapping_error(struct virtqueue *_vq, dma_addr_t addr)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (vq->use_dma_api)
                return dma_mapping_error(vring_dma_dev(vq), addr);

        return 0;
}
EXPORT_SYMBOL_GPL(virtqueue_dma_mapping_error);

/**
 * virtqueue_dma_need_sync - check a dma address needs sync
 * @_vq: the struct virtqueue we're talking about.
 * @addr: DMA address
 *
 * Check if the dma address mapped by the virtqueue_dma_map_* APIs needs to be
 * synchronized. Always false when the vq does not use the DMA API.
 *
 * return bool
 */
bool virtqueue_dma_need_sync(struct virtqueue *_vq, dma_addr_t addr)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        if (vq->use_dma_api)
                return dma_need_sync(vring_dma_dev(vq), addr);

        return false;
}
EXPORT_SYMBOL_GPL(virtqueue_dma_need_sync);

/**
 * virtqueue_dma_sync_single_range_for_cpu - dma sync for cpu
 * @_vq: the struct virtqueue we're talking about.
 * @addr: DMA address
 * @offset: DMA address offset
 * @size: buf size for sync
 * @dir: DMA direction
 *
 * Before calling this function, use virtqueue_dma_need_sync() to confirm that
 * the DMA address really needs to be synchronized. Nothing is done when the
 * vq does not use the DMA API.
 */
void virtqueue_dma_sync_single_range_for_cpu(struct virtqueue *_vq,
                                             dma_addr_t addr,
                                             unsigned long offset, size_t size,
                                             enum dma_data_direction dir)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        struct device *dma_dev = vring_dma_dev(vq);

        if (vq->use_dma_api)
                dma_sync_single_range_for_cpu(dma_dev, addr, offset, size,
                                              dir);
}
EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_cpu);

/**
 * virtqueue_dma_sync_single_range_for_device - dma sync for device
 * @_vq: the struct virtqueue we're talking about.
 * @addr: DMA address
 * @offset: DMA address offset
 * @size: buf size for sync
 * @dir: DMA direction
 *
 * Before calling this function, use virtqueue_dma_need_sync() to confirm that
 * the DMA address really needs to be synchronized. Nothing is done when the
 * vq does not use the DMA API.
 */
void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq,
                                                dma_addr_t addr,
                                                unsigned long offset, size_t size,
                                                enum dma_data_direction dir)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
        struct device *dma_dev = vring_dma_dev(vq);

        if (vq->use_dma_api)
                dma_sync_single_range_for_device(dma_dev, addr, offset, size,
                                                 dir);
}
EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_device);

MODULE_LICENSE("GPL");
























































































































































































































































































    5 














    5 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/acl.c
 *
 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
 */

#include <linux/quotaops.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
#include "acl.h"

/*
 * Convert from filesystem to in-memory representation.
 *
 * Returns NULL for a NULL @value or an ACL with no entries, the decoded ACL
 * on success, or ERR_PTR(-EINVAL / -ENOMEM) on a malformed buffer or
 * allocation failure.
 */
static struct posix_acl *
ext4_acl_from_disk(const void *value, size_t size)
{
        const char *cur = value;
        const char *end = cur + size;
        struct posix_acl *acl;
        int count, i;

        if (!value)
                return NULL;
        if (size < sizeof(ext4_acl_header))
                return ERR_PTR(-EINVAL);
        if (((const ext4_acl_header *)cur)->a_version !=
            cpu_to_le32(EXT4_ACL_VERSION))
                return ERR_PTR(-EINVAL);
        cur += sizeof(ext4_acl_header);

        count = ext4_acl_count(size);
        if (count < 0)
                return ERR_PTR(-EINVAL);
        if (!count)
                return NULL;

        acl = posix_acl_alloc(count, GFP_NOFS);
        if (!acl)
                return ERR_PTR(-ENOMEM);

        for (i = 0; i < count; i++) {
                const ext4_acl_entry *disk = (const ext4_acl_entry *)cur;
                struct posix_acl_entry *mem = &acl->a_entries[i];

                /* At least the short form (tag + perm) must fit. */
                if (cur + sizeof(ext4_acl_entry_short) > end)
                        goto fail;
                mem->e_tag = le16_to_cpu(disk->e_tag);
                mem->e_perm = le16_to_cpu(disk->e_perm);

                switch (mem->e_tag) {
                case ACL_USER_OBJ:
                case ACL_GROUP_OBJ:
                case ACL_MASK:
                case ACL_OTHER:
                        /* Short entries carry no id. */
                        cur += sizeof(ext4_acl_entry_short);
                        break;

                case ACL_USER:
                        cur += sizeof(ext4_acl_entry);
                        if (cur > end)
                                goto fail;
                        mem->e_uid = make_kuid(&init_user_ns,
                                               le32_to_cpu(disk->e_id));
                        break;

                case ACL_GROUP:
                        cur += sizeof(ext4_acl_entry);
                        if (cur > end)
                                goto fail;
                        mem->e_gid = make_kgid(&init_user_ns,
                                               le32_to_cpu(disk->e_id));
                        break;

                default:
                        goto fail;
                }
        }

        /* The entries must consume the buffer exactly. */
        if (cur != end)
                goto fail;
        return acl;

fail:
        posix_acl_release(acl);
        return ERR_PTR(-EINVAL);
}

/*
 * Convert from in-memory to filesystem representation.
 *
 * Stores the encoded length in @size and returns a kmalloc()ed buffer, or
 * ERR_PTR(-ENOMEM) on allocation failure, or ERR_PTR(-EINVAL) if an entry
 * has an unknown tag.
 */
static void *
ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
{
        ext4_acl_header *hdr;
        char *out;
        size_t i;

        *size = ext4_acl_size(acl->a_count);
        /* Sized as if every entry used the long form. */
        hdr = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
                        sizeof(ext4_acl_entry), GFP_NOFS);
        if (!hdr)
                return ERR_PTR(-ENOMEM);

        hdr->a_version = cpu_to_le32(EXT4_ACL_VERSION);
        out = (char *)hdr + sizeof(ext4_acl_header);

        for (i = 0; i < acl->a_count; i++) {
                const struct posix_acl_entry *src = &acl->a_entries[i];
                ext4_acl_entry *dst = (ext4_acl_entry *)out;

                dst->e_tag  = cpu_to_le16(src->e_tag);
                dst->e_perm = cpu_to_le16(src->e_perm);

                switch (src->e_tag) {
                case ACL_USER_OBJ:
                case ACL_GROUP_OBJ:
                case ACL_MASK:
                case ACL_OTHER:
                        /* Short form: no id field is written. */
                        out += sizeof(ext4_acl_entry_short);
                        break;

                case ACL_USER:
                        dst->e_id = cpu_to_le32(
                                from_kuid(&init_user_ns, src->e_uid));
                        out += sizeof(ext4_acl_entry);
                        break;

                case ACL_GROUP:
                        dst->e_id = cpu_to_le32(
                                from_kgid(&init_user_ns, src->e_gid));
                        out += sizeof(ext4_acl_entry);
                        break;

                default:
                        goto fail;
                }
        }
        return (char *)hdr;

fail:
        kfree(hdr);
        return ERR_PTR(-EINVAL);
}

/*
 * Inode operation get_posix_acl().
 *
 * Reads the ACL xattr selected by @type and decodes it. Returns
 * ERR_PTR(-ECHILD) when called in RCU mode, NULL when the xattr lookup
 * reports -ENODATA or -ENOSYS.
 *
 * inode->i_rwsem: don't care
 */
struct posix_acl *
ext4_get_acl(struct inode *inode, int type, bool rcu)
{
        struct posix_acl *acl;
        char *buf = NULL;
        int name_index;
        int len;

        if (rcu)
                return ERR_PTR(-ECHILD);

        switch (type) {
        case ACL_TYPE_ACCESS:
                name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
                break;
        case ACL_TYPE_DEFAULT:
                name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
                break;
        default:
                BUG();
        }

        /* First call only queries the size, second one fetches the data. */
        len = ext4_xattr_get(inode, name_index, "", NULL, 0);
        if (len > 0) {
                buf = kmalloc(len, GFP_NOFS);
                if (!buf)
                        return ERR_PTR(-ENOMEM);
                len = ext4_xattr_get(inode, name_index, "", buf, len);
        }

        if (len == -ENODATA || len == -ENOSYS)
                acl = NULL;
        else if (len > 0)
                acl = ext4_acl_from_disk(buf, len);
        else
                acl = ERR_PTR(len);

        kfree(buf);
        return acl;
}

/*
 * Set the access or default ACL of an inode.
 *
 * A NULL @acl is passed on to the xattr layer as a NULL value of size 0.
 * A default ACL on a non-directory yields -EACCES (or 0 if @acl is NULL).
 * On success the cached ACL of @inode is updated.
 *
 * inode->i_rwsem: down unless called from ext4_new_inode
 */
static int
__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
             struct posix_acl *acl, int xattr_flags)
{
        void *blob = NULL;
        size_t blob_len = 0;
        int name_index;
        int err;

        switch (type) {
        case ACL_TYPE_ACCESS:
                name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
                break;

        case ACL_TYPE_DEFAULT:
                if (!S_ISDIR(inode->i_mode))
                        return acl ? -EACCES : 0;
                name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
                break;

        default:
                return -EINVAL;
        }

        if (acl) {
                blob = ext4_acl_to_disk(acl, &blob_len);
                if (IS_ERR(blob))
                        return (int)PTR_ERR(blob);
        }

        err = ext4_xattr_set_handle(handle, inode, name_index, "",
                                    blob, blob_len, xattr_flags);
        kfree(blob);
        if (err)
                return err;

        set_cached_acl(inode, type, acl);
        return 0;
}

int
ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
             struct posix_acl *acl, int type)
{
        handle_t *handle;
        int error, credits, retries = 0;
        size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0;
        struct inode *inode = d_inode(dentry);
        umode_t mode = inode->i_mode;
        int update_mode = 0;

        error = dquot_initialize(inode);
        if (error)
                return error;
retry:
        error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */,
                                       &credits);
        if (error)
                return error;

        handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        if ((type == ACL_TYPE_ACCESS) && acl) {
                error = posix_acl_update_mode(idmap, inode, &mode, &acl);
                if (error)
                        goto out_stop;
                if (mode != inode->i_mode)
                        update_mode = 1;
        }

        error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */);
        if (!error && update_mode) {
                inode->i_mode = mode;
                inode_set_ctime_current(inode);
                error = ext4_mark_inode_dirty(handle, inode);
        }
out_stop:
        ext4_journal_stop(handle);
        if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
        return error;
}

/*
 * Initialize the ACLs of a new inode. Called from ext4_new_inode.
 *
 * The default and access ACLs produced by posix_acl_create() are stored with
 * XATTR_CREATE and then released; a missing ACL resets the corresponding
 * cached pointer on the inode to NULL. The access ACL is not written if
 * storing the default ACL failed.
 *
 * dir->i_rwsem: down
 * inode->i_rwsem: up (access to inode is still exclusive)
 */
int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
        struct posix_acl *default_acl, *acl;
        int error;

        error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
        if (error)
                return error;

        if (!default_acl) {
                inode->i_default_acl = NULL;
        } else {
                error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
                                       default_acl, XATTR_CREATE);
                posix_acl_release(default_acl);
        }

        if (!acl) {
                inode->i_acl = NULL;
        } else {
                if (!error)
                        error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
                                               acl, XATTR_CREATE);
                posix_acl_release(acl);
        }

        return error;
}






















































































    1 





    1 




































































































































































    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "bio.h"
#include "compression.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"

/*
 * Upper bound on the number of @size-byte checksums stored in one item:
 * the leaf data size for @r (as given by BTRFS_LEAF_DATA_SIZE()), less the
 * space of two struct btrfs_item headers, divided by @size, minus one.
 */
#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
                                   sizeof(struct btrfs_item) * 2) / \
                                  size) - 1))

/* __MAX_CSUM_ITEMS() clamped to at most PAGE_SIZE (compared as u32). */
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
                                       PAGE_SIZE))

/*
 * Update an inode's disk_i_size in a way that is safe for the filesystem's
 * feature set.
 *
 * @inode:      inode whose disk_i_size is updated
 * @new_i_size: size to use as the upper limit, or 0 to use i_size_read()
 *
 * When the NO_HOLES incompat feature is set, disk_i_size is simply set to the
 * chosen size: a file with holes that are not described by hole file extent
 * items is valid there.
 *
 * Without NO_HOLES only the area that is contiguous from file offset 0 may be
 * exposed, otherwise disk_i_size could be moved past a gap that has no file
 * extent item.  The contiguous area is looked up in the inode's file extent
 * tree; if there is none starting at offset 0, disk_i_size becomes 0.
 *
 * A non-zero @new_i_size is meant for truncate, where i_size_read() can not
 * be used as the limit yet.
 *
 * The update is done under inode->lock.
 */
void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        u64 start, end, i_size;
        int ret;

        spin_lock(&inode->lock);

        if (new_i_size)
                i_size = new_i_size;
        else
                i_size = i_size_read(&inode->vfs_inode);

        if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
                ret = find_contiguous_extent_bit(inode->file_extent_tree, 0,
                                                 &start, &end, EXTENT_DIRTY);
                /* Only a range that begins at offset 0 may be exposed. */
                if (ret || start != 0)
                        i_size = 0;
                else
                        i_size = min(i_size, end + 1);
        }
        inode->disk_i_size = i_size;

        spin_unlock(&inode->lock);
}

/*
 * Record that a range of a file is now backed by a file extent item.
 *
 * @inode: inode being modified
 * @start: start file offset of the inserted file extent
 * @len:   logical length of the file extent item
 *
 * To be called when a new file extent is inserted where none existed before.
 * Replacing an existing file extent does not require it, but calling it more
 * than once for the same range is harmless.
 *
 * @start and @len must describe the file extent item and are therefore
 * expected to be sectorsize aligned.
 *
 * Nothing is tracked for an empty range or when NO_HOLES is enabled; 0 is
 * returned in both cases.  Otherwise returns the result of set_extent_bit().
 */
int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
                                      u64 len)
{
        u64 last_byte;

        if (!len)
                return 0;

        ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));

        if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
                return 0;

        /* The extent bit range is inclusive of its last byte. */
        last_byte = start + len - 1;
        return set_extent_bit(inode->file_extent_tree, start, last_byte,
                              EXTENT_DIRTY, NULL);
}

/*
 * Record that a range of a file is no longer backed by a file extent item.
 *
 * @inode: inode being modified
 * @start: start file offset of the file extent being dropped
 * @len:   logical length of the file extent item, or (u64)-1
 *
 * To be called when a file extent is dropped, e.g. on truncate.  It is not
 * needed when a file extent is merely replaced, such as after COWing it.
 *
 * @start and @len must describe the file extent item and are therefore
 * expected to be sectorsize aligned; a @len of (u64)-1 is also accepted.
 *
 * Nothing is tracked for an empty range or when NO_HOLES is enabled; 0 is
 * returned in both cases.  Otherwise returns the result of clear_extent_bit().
 */
int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
                                        u64 len)
{
        u64 last_byte;

        if (!len)
                return 0;

        ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
               len == (u64)-1);

        if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
                return 0;

        /* The extent bit range is inclusive of its last byte. */
        last_byte = start + len - 1;
        return clear_extent_bit(inode->file_extent_tree, start, last_byte,
                                EXTENT_DIRTY, NULL);
}

/*
 * Convert a sectorsize aligned number of data bytes into the number of bytes
 * its checksums occupy (one checksum of fs_info->csum_size per sector).
 */
static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes)
{
        u32 nr_sectors;

        ASSERT(IS_ALIGNED(bytes, fs_info->sectorsize));

        nr_sectors = bytes >> fs_info->sectorsize_bits;
        return nr_sectors * fs_info->csum_size;
}

/*
 * Convert a number of checksum bytes (a multiple of fs_info->csum_size) into
 * the number of data bytes those checksums cover (one sector per checksum).
 */
static size_t csum_size_to_bytes(const struct btrfs_fs_info *fs_info, u32 csum_size)
{
        const u32 sectorsize_bits = fs_info->sectorsize_bits;

        ASSERT(IS_ALIGNED(csum_size, fs_info->csum_size));

        return (csum_size / fs_info->csum_size) << sectorsize_bits;
}

/*
 * Largest number of data bytes whose checksums, together with the
 * struct btrfs_ordered_sum header, still fit in PAGE_SIZE.
 */
static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info)
{
        /* Room left for checksums once the header is accounted for. */
        const unsigned long avail = PAGE_SIZE - sizeof(struct btrfs_ordered_sum);
        u32 max_csum_size;

        /* Only whole checksums count. */
        max_csum_size = round_down(avail, fs_info->csum_size);

        return csum_size_to_bytes(fs_info, max_csum_size);
}

/*
 * Calculate the total size needed to allocate for an ordered sum structure
 * spanning @bytes in the file.
 */
static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
{
        return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes);
}

/*
 * Insert a file extent item describing a hole.
 *
 * @trans:     transaction handle
 * @root:      root the item is inserted into
 * @objectid:  objectid used for the item key
 * @pos:       file offset used for the item key
 * @num_bytes: length of the hole, stored as both num_bytes and ram_bytes
 *
 * The item is a regular (BTRFS_FILE_EXTENT_REG) extent with disk_bytenr and
 * disk_num_bytes set to 0, no compression, encryption or other encoding, and
 * the generation of the running transaction.
 *
 * Return 0 on success, -ENOMEM if the path can not be allocated, or the
 * negative error from btrfs_insert_empty_item().
 */
int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             u64 objectid, u64 pos, u64 num_bytes)
{
        struct btrfs_key file_key = {
                .objectid = objectid,
                .type = BTRFS_EXTENT_DATA_KEY,
                .offset = pos,
        };
        struct btrfs_file_extent_item *item;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_insert_empty_item(trans, root, path, &file_key,
                                      sizeof(*item));
        if (ret >= 0) {
                leaf = path->nodes[0];
                item = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_file_extent_item);

                /* Plain regular extent, nothing encoded. */
                btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
                btrfs_set_file_extent_generation(leaf, item, trans->transid);
                btrfs_set_file_extent_compression(leaf, item, 0);
                btrfs_set_file_extent_encryption(leaf, item, 0);
                btrfs_set_file_extent_other_encoding(leaf, item, 0);

                /* No on-disk location: this is what makes it a hole. */
                btrfs_set_file_extent_disk_bytenr(leaf, item, 0);
                btrfs_set_file_extent_disk_num_bytes(leaf, item, 0);
                btrfs_set_file_extent_offset(leaf, item, 0);
                btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
                btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes);

                btrfs_mark_buffer_dirty(trans, leaf);
        }

        btrfs_free_path(path);
        return ret;
}

/*
 * Look up the checksum of the sector at logical address @bytenr.
 *
 * @trans:  transaction handle passed to btrfs_search_slot(), may be NULL
 * @root:   root to search in
 * @path:   path used for the search; left pointing at the examined item
 * @bytenr: logical address of the sector
 * @cow:    passed to btrfs_search_slot() as its cow argument
 *
 * Return a pointer, inside the leaf's csum item, to the checksum for @bytenr.
 * On failure an ERR_PTR() is returned:
 *   - the negative error of btrfs_search_slot(),
 *   - -EFBIG if @bytenr is the sector right after the end of the previous
 *     csum item,
 *   - -ENOENT if no csum item covers @bytenr otherwise.
 */
static struct btrfs_csum_item *
btrfs_lookup_csum(struct btrfs_trans_handle *trans,
                  struct btrfs_root *root,
                  struct btrfs_path *path,
                  u64 bytenr, int cow)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        const u32 csum_size = fs_info->csum_size;
        struct btrfs_key file_key;
        struct btrfs_key found_key;
        struct btrfs_csum_item *item;
        struct extent_buffer *leaf;
        u64 csum_offset = 0;
        int csums_in_item;
        int ret;

        file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
        file_key.type = BTRFS_EXTENT_CSUM_KEY;
        file_key.offset = bytenr;

        ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
        if (ret < 0)
                return ERR_PTR(ret);

        leaf = path->nodes[0];
        if (ret > 0) {
                /*
                 * No item starts exactly at @bytenr, so the checksum can only
                 * live in the item just before the returned slot.
                 */
                if (path->slots[0] == 0)
                        return ERR_PTR(-ENOENT);

                path->slots[0]--;
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                if (found_key.type != BTRFS_EXTENT_CSUM_KEY)
                        return ERR_PTR(-ENOENT);

                /* Index of the wanted checksum within the previous item. */
                csum_offset = (bytenr - found_key.offset) >>
                                fs_info->sectorsize_bits;
                csums_in_item = btrfs_item_size(leaf, path->slots[0]);
                csums_in_item /= csum_size;

                /* Exactly one past the end of the item. */
                if (csum_offset == csums_in_item)
                        return ERR_PTR(-EFBIG);
                /* Further past the end of the item. */
                if (csum_offset > csums_in_item)
                        return ERR_PTR(-ENOENT);
        }

        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
        return (struct btrfs_csum_item *)((unsigned char *)item +
                                          csum_offset * csum_size);
}

/*
 * Search @root for the file extent item keyed by (@objectid,
 * BTRFS_EXTENT_DATA_KEY, @offset).
 *
 * @mod selects how the tree is searched: a negative value searches with an
 * insertion length of -1, any non-zero value requests COW, and 0 does a plain
 * read-only search.
 *
 * Returns whatever btrfs_search_slot() returns.
 */
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid,
                             u64 offset, int mod)
{
        struct btrfs_key file_key;
        int ins_len = 0;
        int cow = 0;

        if (mod < 0)
                ins_len = -1;
        if (mod != 0)
                cow = 1;

        file_key.objectid = objectid;
        file_key.type = BTRFS_EXTENT_DATA_KEY;
        file_key.offset = offset;

        return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
}

/*
 * Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and
 * store the result to @dst.
 *
 * @fs_info:     filesystem the range belongs to
 * @path:        search path; if it already points at an item whose range
 *               covers @disk_bytenr that item is reused, otherwise the path is
 *               released and a new lookup is done
 * @disk_bytenr: sectorsize aligned logical start of the range
 * @len:         sectorsize aligned length of the range
 * @dst:         destination buffer for the checksums that were found
 *
 * Only the checksums stored in a single csum item are copied, so fewer sectors
 * than requested may be returned.
 *
 * Return >0 for the number of sectors we found.
 * Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum
 * for it. Caller may want to try next sector until one range is hit.
 * Return <0 for fatal error.
 */
static int search_csum_tree(struct btrfs_fs_info *fs_info,
                            struct btrfs_path *path, u64 disk_bytenr,
                            u64 len, u8 *dst)
{
        struct btrfs_root *csum_root;
        struct btrfs_csum_item *item = NULL;
        struct btrfs_key key;
        const u32 sectorsize = fs_info->sectorsize;
        const u32 csum_size = fs_info->csum_size;
        u32 itemsize;
        int ret;
        u64 csum_start;
        u64 csum_len;

        ASSERT(IS_ALIGNED(disk_bytenr, sectorsize) &&
               IS_ALIGNED(len, sectorsize));

        /* Check if the current csum item covers disk_bytenr */
        if (path->nodes[0]) {
                /*
                 * NOTE(review): @item is the start of the cached item here,
                 * with no offset applied for @disk_bytenr, whereas
                 * btrfs_lookup_csum() below returns a pointer already advanced
                 * to the matching checksum.  The read at "found" therefore
                 * looks right for this cached case only when
                 * disk_bytenr == csum_start - confirm callers guarantee that.
                 */
                item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                      struct btrfs_csum_item);
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);

                /* Logical range covered by the checksums of this item. */
                csum_start = key.offset;
                csum_len = (itemsize / csum_size) * sectorsize;

                if (in_range(disk_bytenr, csum_start, csum_len))
                        goto found;
        }

        /* Current item doesn't contain the desired range, search again */
        btrfs_release_path(path);
        csum_root = btrfs_csum_root(fs_info, disk_bytenr);
        item = btrfs_lookup_csum(NULL, csum_root, path, disk_bytenr, 0);
        if (IS_ERR(item)) {
                ret = PTR_ERR(item);
                goto out;
        }
        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
        itemsize = btrfs_item_size(path->nodes[0], path->slots[0]);

        csum_start = key.offset;
        csum_len = (itemsize / csum_size) * sectorsize;
        ASSERT(in_range(disk_bytenr, csum_start, csum_len));

found:
        /*
         * Number of sectors from disk_bytenr up to the end of the item or the
         * end of the requested range, whichever comes first.
         */
        ret = (min(csum_start + csum_len, disk_bytenr + len) -
                   disk_bytenr) >> fs_info->sectorsize_bits;
        read_extent_buffer(path->nodes[0], dst, (unsigned long)item,
                        ret * csum_size);
out:
        /*
         * -ENOENT and -EFBIG from btrfs_lookup_csum() mean there is no csum
         * for this sector; report that as 0 sectors found, not as an error.
         */
        if (ret == -ENOENT || ret == -EFBIG)
                ret = 0;
        return ret;
}

/*
 * Lookup the checksum for the read bio in csum tree.
 *
 * @bbio: the read bio; on success bbio->csum points at the checksums, one per
 *        sector of the bio, using either bbio->csum_inline or a kmalloc'ed
 *        array when the inline buffer is too small.
 *
 * Nothing is looked up (and BLK_STS_OK is returned) when the inode has
 * BTRFS_INODE_NODATASUM set or the filesystem has BTRFS_FS_STATE_NO_CSUMS set.
 *
 * Sectors without a checksum get a zeroed checksum; for the data relocation
 * root they are additionally marked EXTENT_NODATASUM in the inode's io_tree,
 * for any other root a rate limited warning is printed.
 *
 * Return: BLK_STS_RESOURCE if allocating memory fails, the status produced by
 * errno_to_blk_status() if searching the csum tree fails (bbio->csum is then
 * released and set to NULL), BLK_STS_OK otherwise.
 */
blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
{
        struct btrfs_inode *inode = bbio->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct bio *bio = &bbio->bio;
        struct btrfs_path *path;
        const u32 sectorsize = fs_info->sectorsize;
        const u32 csum_size = fs_info->csum_size;
        u32 orig_len = bio->bi_iter.bi_size;
        u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
        const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
        blk_status_t ret = BLK_STS_OK;
        u32 bio_offset = 0;

        if ((inode->flags & BTRFS_INODE_NODATASUM) ||
            test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
                return BLK_STS_OK;

        /*
         * This function is only called for read bio.
         *
         * This means two things:
         * - All our csums should only be in csum tree
         *   No ordered extents csums, as ordered extents are only for write
         *   path.
         * - No need to bother any other info from bvec
         *   Since we're looking up csums, the only important info is the
         *   disk_bytenr and the length, which can be extracted from bi_iter
         *   directly.
         */
        ASSERT(bio_op(bio) == REQ_OP_READ);
        path = btrfs_alloc_path();
        if (!path)
                return BLK_STS_RESOURCE;

        /* Use the inline buffer when all checksums of the bio fit in it. */
        if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
                bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
                if (!bbio->csum) {
                        btrfs_free_path(path);
                        return BLK_STS_RESOURCE;
                }
        } else {
                bbio->csum = bbio->csum_inline;
        }

        /*
         * If requested number of sectors is larger than one leaf can contain,
         * kick the readahead for csum tree.
         */
        if (nblocks > fs_info->csums_per_leaf)
                path->reada = READA_FORWARD;

        /*
         * the free space stuff is only read when it hasn't been
         * updated in the current transaction.  So, we can safely
         * read from the commit root and sidestep a nasty deadlock
         * between reading the free space cache and updating the csum tree.
         */
        if (btrfs_is_free_space_inode(inode)) {
                path->search_commit_root = 1;
                path->skip_locking = 1;
        }

        /*
         * Walk the bio; each iteration covers as many sectors as one csum
         * item provides (or a single sector when no csum exists for it).
         */
        while (bio_offset < orig_len) {
                int count;
                u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset;
                u8 *csum_dst = bbio->csum +
                        (bio_offset >> fs_info->sectorsize_bits) * csum_size;

                count = search_csum_tree(fs_info, path, cur_disk_bytenr,
                                         orig_len - bio_offset, csum_dst);
                if (count < 0) {
                        ret = errno_to_blk_status(count);
                        /* Only the heap allocated buffer must be freed. */
                        if (bbio->csum != bbio->csum_inline)
                                kfree(bbio->csum);
                        bbio->csum = NULL;
                        break;
                }

                /*
                 * We didn't find a csum for this range.  We need to make sure
                 * we complain loudly about this, because we are not NODATASUM.
                 *
                 * However for the DATA_RELOC inode we could potentially be
                 * relocating data extents for a NODATASUM inode, so the inode
                 * itself won't be marked with NODATASUM, but the extent we're
                 * copying is in fact NODATASUM.  If we don't find a csum we
                 * assume this is the case.
                 */
                if (count == 0) {
                        /* Zero the csum and move on by exactly one sector. */
                        memset(csum_dst, 0, csum_size);
                        count = 1;

                        if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) {
                                u64 file_offset = bbio->file_offset + bio_offset;

                                set_extent_bit(&inode->io_tree, file_offset,
                                               file_offset + sectorsize - 1,
                                               EXTENT_NODATASUM, NULL);
                        } else {
                                btrfs_warn_rl(fs_info,
                        "csum hole found for disk bytenr range [%llu, %llu)",
                                cur_disk_bytenr, cur_disk_bytenr + sectorsize);
                        }
                }
                bio_offset += count * sectorsize;
        }

        btrfs_free_path(path);
        return ret;
}

/*
 * Search for checksums for a given logical range.
 *
 * @root:                The root where to look for checksums.
 * @start:                Logical address of target checksum range.
 * @end:                End offset (inclusive) of the target checksum range.
 * @list:                List for adding each checksum that was found.
 *                        Can be NULL in case the caller only wants to check if
 *                        there are any checksums for the range.
 * @nowait:                Indicate if the search must be non-blocking or not.
 *
 * Checksums are added to @list as struct btrfs_ordered_sum entries, each
 * covering at most max_ordered_sum_bytes() of data.  When @list is NULL the
 * search stops at the first csum item overlapping the range.
 *
 * Return < 0 on error, 0 if no checksums were found, or 1 if checksums were
 * found.  On error the entries on @list are freed.
 */
int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
                            struct list_head *list, bool nowait)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key key;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_ordered_sum *sums;
        struct btrfs_csum_item *item;
        int ret;
        bool found_csums = false;

        ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
               IS_ALIGNED(end + 1, fs_info->sectorsize));

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        path->nowait = nowait;

        key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
        key.offset = start;
        key.type = BTRFS_EXTENT_CSUM_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0 && path->slots[0] > 0) {
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);

                /*
                 * There are two cases we can hit here for the previous csum
                 * item:
                 *
                 *                |<- search range ->|
                 *        |<- csum item ->|
                 *
                 * Or
                 *                                |<- search range ->|
                 *        |<- csum item ->|
                 *
                 * Check if the previous csum item covers the leading part of
                 * the search range.  If so we have to start from previous csum
                 * item.
                 */
                if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
                    key.type == BTRFS_EXTENT_CSUM_KEY) {
                        if (bytes_to_csum_size(fs_info, start - key.offset) <
                            btrfs_item_size(leaf, path->slots[0] - 1))
                                path->slots[0]--;
                }
        }

        /* Walk csum items forward until the range is exhausted. */
        while (start <= end) {
                u64 csum_end;

                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                goto out;
                        /* No more leaves. */
                        if (ret > 0)
                                break;
                        leaf = path->nodes[0];
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
                    key.type != BTRFS_EXTENT_CSUM_KEY ||
                    key.offset > end)
                        break;

                /* Skip the part of the range that has no checksums. */
                if (key.offset > start)
                        start = key.offset;

                /* End (exclusive) of the logical range this item covers. */
                csum_end = key.offset + csum_size_to_bytes(fs_info,
                                        btrfs_item_size(leaf, path->slots[0]));
                if (csum_end <= start) {
                        path->slots[0]++;
                        continue;
                }

                found_csums = true;
                /* The caller only wants to know whether checksums exist. */
                if (!list)
                        goto out;

                csum_end = min(csum_end, end + 1);
                item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                      struct btrfs_csum_item);
                /* Copy this item's checksums out in bounded chunks. */
                while (start < csum_end) {
                        unsigned long offset;
                        size_t size;

                        size = min_t(size_t, csum_end - start,
                                     max_ordered_sum_bytes(fs_info));
                        sums = kzalloc(btrfs_ordered_sum_size(fs_info, size),
                                       GFP_NOFS);
                        if (!sums) {
                                ret = -ENOMEM;
                                goto out;
                        }

                        sums->logical = start;
                        sums->len = size;

                        /* Byte offset of @start's checksum within the item. */
                        offset = bytes_to_csum_size(fs_info, start - key.offset);

                        read_extent_buffer(path->nodes[0],
                                           sums->sums,
                                           ((unsigned long)item) + offset,
                                           bytes_to_csum_size(fs_info, size));

                        start += size;
                        list_add_tail(&sums->list, list);
                }
                path->slots[0]++;
        }
out:
        btrfs_free_path(path);
        if (ret < 0) {
                /*
                 * NOTE(review): this frees every entry on @list, not only the
                 * ones added by this call, and neither unlinks them nor
                 * reinitializes the list head - callers presumably must not
                 * walk @list after an error; confirm.
                 */
                if (list) {
                        struct btrfs_ordered_sum *tmp_sums;

                        list_for_each_entry_safe(sums, tmp_sums, list, list)
                                kfree(sums);
                }

                return ret;
        }

        return found_csums ? 1 : 0;
}

/*
 * Do the same work as btrfs_lookup_csums_list(), the difference is in how
 * we return the result.
 *
 * @root:        The root where to look for checksums.
 * @path:        Path to use for the search.  May be NULL, in which case a
 *               path is allocated and freed here.  A non-NULL path that
 *               already points at a csum item starting at or before @start is
 *               reused and searched forward from its current slot.
 * @start:       Logical address of target checksum range.
 * @end:         End offset (inclusive) of the target checksum range.
 * @csum_buf:    Buffer receiving the checksums; the checksum of a sector is
 *               stored at the position matching its offset from @start.
 * @csum_bitmap: Bitmap receiving one set bit per sector that has a checksum,
 *               bit 0 being the sector at @start.
 *
 * This version will set the corresponding bits in @csum_bitmap to represent
 * that there is a csum found.
 * Each bit represents a sector. Thus caller should ensure @csum_buf passed
 * in is large enough to contain all csums.
 *
 * Return 0 on success (whether or not any checksum was found), < 0 on error.
 */
int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path,
                              u64 start, u64 end, u8 *csum_buf,
                              unsigned long *csum_bitmap)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_csum_item *item;
        const u64 orig_start = start;
        bool free_path = false;
        int ret;

        ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
               IS_ALIGNED(end + 1, fs_info->sectorsize));

        if (!path) {
                path = btrfs_alloc_path();
                if (!path)
                        return -ENOMEM;
                free_path = true;
        }

        /* Check if we can reuse the previous path. */
        if (path->nodes[0]) {
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

                if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
                    key.type == BTRFS_EXTENT_CSUM_KEY &&
                    key.offset <= start)
                        goto search_forward;
                btrfs_release_path(path);
        }

        key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
        key.type = BTRFS_EXTENT_CSUM_KEY;
        key.offset = start;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto fail;
        if (ret > 0 && path->slots[0] > 0) {
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);

                /*
                 * There are two cases we can hit here for the previous csum
                 * item:
                 *
                 *                |<- search range ->|
                 *        |<- csum item ->|
                 *
                 * Or
                 *                                |<- search range ->|
                 *        |<- csum item ->|
                 *
                 * Check if the previous csum item covers the leading part of
                 * the search range.  If so we have to start from previous csum
                 * item.
                 */
                if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
                    key.type == BTRFS_EXTENT_CSUM_KEY) {
                        if (bytes_to_csum_size(fs_info, start - key.offset) <
                            btrfs_item_size(leaf, path->slots[0] - 1))
                                path->slots[0]--;
                }
        }

search_forward:
        /* Walk csum items forward until the range is exhausted. */
        while (start <= end) {
                u64 csum_end;

                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                goto fail;
                        /* No more leaves. */
                        if (ret > 0)
                                break;
                        leaf = path->nodes[0];
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
                    key.type != BTRFS_EXTENT_CSUM_KEY ||
                    key.offset > end)
                        break;

                /* Skip the part of the range that has no checksums. */
                if (key.offset > start)
                        start = key.offset;

                /* End (exclusive) of the logical range this item covers. */
                csum_end = key.offset + csum_size_to_bytes(fs_info,
                                        btrfs_item_size(leaf, path->slots[0]));
                if (csum_end <= start) {
                        path->slots[0]++;
                        continue;
                }

                csum_end = min(csum_end, end + 1);
                item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                      struct btrfs_csum_item);
                while (start < csum_end) {
                        unsigned long offset;
                        size_t size;
                        /* Slot in @csum_buf for the sector at @start. */
                        u8 *csum_dest = csum_buf + bytes_to_csum_size(fs_info,
                                                start - orig_start);

                        size = min_t(size_t, csum_end - start, end + 1 - start);

                        /* Byte offset of @start's checksum within the item. */
                        offset = bytes_to_csum_size(fs_info, start - key.offset);

                        read_extent_buffer(path->nodes[0], csum_dest,
                                           ((unsigned long)item) + offset,
                                           bytes_to_csum_size(fs_info, size));

                        /* Flag every sector we just copied a checksum for. */
                        bitmap_set(csum_bitmap,
                                (start - orig_start) >> fs_info->sectorsize_bits,
                                size >> fs_info->sectorsize_bits);

                        start += size;
                }
                path->slots[0]++;
        }
        ret = 0;
fail:
        /* A caller supplied path is left as is for possible reuse. */
        if (free_path)
                btrfs_free_path(path);
        return ret;
}

/*
 * Calculate checksums of the data contained inside a bio.
 *
 * One checksum is computed per sector of every segment of the bio and stored
 * in a newly allocated struct btrfs_ordered_sum, which is recorded in
 * bbio->sums and added to the bio's ordered extent.
 *
 * Return BLK_STS_RESOURCE if the allocation fails, 0 otherwise.
 */
blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
{
        struct btrfs_ordered_extent *ordered = bbio->ordered;
        struct btrfs_inode *inode = bbio->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
        struct bio *bio = &bbio->bio;
        struct btrfs_ordered_sum *sums;
        struct bvec_iter iter;
        struct bio_vec bvec;
        unsigned int nr_blocks;
        unsigned nofs_flag;
        char *kaddr;
        int csum_off = 0;
        int blk;

        /* The allocation must not recurse into the filesystem. */
        nofs_flag = memalloc_nofs_save();
        sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
                       GFP_KERNEL);
        memalloc_nofs_restore(nofs_flag);

        if (!sums)
                return BLK_STS_RESOURCE;

        INIT_LIST_HEAD(&sums->list);
        sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
        sums->len = bio->bi_iter.bi_size;

        shash->tfm = fs_info->csum_shash;

        bio_for_each_segment(bvec, bio, iter) {
                /* Number of sectors in this segment, rounded up. */
                nr_blocks = BTRFS_BYTES_TO_BLKS(fs_info,
                                                bvec.bv_len + fs_info->sectorsize
                                                - 1);

                for (blk = 0; blk < nr_blocks; blk++) {
                        kaddr = bvec_kmap_local(&bvec);
                        crypto_shash_digest(shash,
                                            kaddr + (blk * fs_info->sectorsize),
                                            fs_info->sectorsize,
                                            sums->sums + csum_off);
                        kunmap_local(kaddr);
                        /* Next checksum goes right after this one. */
                        csum_off += fs_info->csum_size;
                }
        }

        bbio->sums = sums;
        btrfs_add_ordered_sum(ordered, sums);
        return 0;
}

/*
 * Nodatasum I/O on zoned file systems still needs a btrfs_ordered_sum so that
 * the updated logical address can be recorded on Zone Append completion.
 * For that case allocate only the structure itself, with an empty sums array,
 * fill in the bio's length and logical address and add it to the ordered
 * extent.
 *
 * Return BLK_STS_RESOURCE if the allocation fails, 0 otherwise.
 */
blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
{
        bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS);
        if (!bbio->sums)
                return BLK_STS_RESOURCE;

        /* Describe the range of the bio; no checksums follow. */
        bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
        bbio->sums->len = bbio->bio.bi_iter.bi_size;

        btrfs_add_ordered_sum(bbio->ordered, bbio->sums);

        return 0;
}

/*
 * Trim one csum item that partially overlaps a range being removed.
 *
 * @key must describe the csum item @path points to, and that item must
 * overlap the range starting at @bytenr and spanning @len bytes.
 *
 * The item must neither lie entirely inside the range nor contain the whole
 * range: only an overlap at the tail or at the head of the item is handled,
 * anything else hits BUG().
 *
 * The item is shrunk with btrfs_truncate_item() and, when its head is cut
 * off, its key is moved to the end of the range.
 */
static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
                                       struct btrfs_path *path,
                                       struct btrfs_key *key,
                                       u64 bytenr, u64 len)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        const u32 csum_size = fs_info->csum_size;
        u32 blocksize_bits = fs_info->sectorsize_bits;
        u64 end_byte = bytenr + len;
        u64 csum_end;
        u32 new_size;

        /* End (exclusive) of the logical range covered by the item. */
        csum_end = btrfs_item_size(path->nodes[0], path->slots[0]) / csum_size;
        csum_end <<= blocksize_bits;
        csum_end += key->offset;

        if (key->offset < bytenr && csum_end <= end_byte) {
                /*
                 *         [ bytenr - len ]
                 *         [   ]
                 *   [csum     ]
                 *
                 * The tail of the item is inside the range: keep only the
                 * checksums in front of bytenr.
                 */
                new_size = (bytenr - key->offset) >> blocksize_bits;
                new_size *= csum_size;
                btrfs_truncate_item(trans, path, new_size, 1);
                return;
        }

        if (key->offset >= bytenr && csum_end > end_byte &&
            end_byte > key->offset) {
                /*
                 *         [ bytenr - len ]
                 *                 [ ]
                 *                 [csum     ]
                 *
                 * The head of the item is inside the range: keep only the
                 * checksums past the end of the range and re-key the item.
                 */
                new_size = (csum_end - end_byte) >> blocksize_bits;
                new_size *= csum_size;

                btrfs_truncate_item(trans, path, new_size, 0);

                key->offset = end_byte;
                btrfs_set_item_key_safe(trans, path, key);
                return;
        }

        /* Any other kind of overlap violates the contract above. */
        BUG();
}

/*
 * Delete the csum items from the csum tree for a given range of bytes.
 *
 * @trans:  transaction handle; also the source of fs_info
 * @root:   tree to delete from; asserted to be the csum tree or a log tree
 * @bytenr: first byte of the range whose checksums are removed
 * @len:    length of the range in bytes
 *
 * The range [bytenr, bytenr + len) is processed from its end towards its
 * start. Each loop iteration looks up the csum item at or just before the
 * last byte of the range and then either deletes whole items, splits an item
 * that strictly contains the range, or truncates an item that partially
 * overlaps it.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the negative
 * error returned by btrfs_search_slot(), btrfs_del_items() or
 * btrfs_split_item() (other than -EAGAIN, which triggers a retry).
 */
int btrfs_del_csums(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, u64 bytenr, u64 len)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_path *path;
        struct btrfs_key key;
        u64 end_byte = bytenr + len;
        u64 csum_end;
        struct extent_buffer *leaf;
        int ret = 0;
        const u32 csum_size = fs_info->csum_size;
        u32 blocksize_bits = fs_info->sectorsize_bits;

        ASSERT(btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID ||
               btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        while (1) {
                /* Always search for the last byte of the range. */
                key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
                key.offset = end_byte - 1;
                key.type = BTRFS_EXTENT_CSUM_KEY;

                /*
                 * NOTE(review): ins_len == -1 and cow == 1 presumably prepare
                 * the path for removing/modifying items - confirm against
                 * btrfs_search_slot().
                 */
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0) {
                        /*
                         * No exact match: step back to the previous slot in
                         * this leaf, or stop if there is none.
                         */
                        ret = 0;
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                } else if (ret < 0) {
                        break;
                }

                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

                /* Not a csum item: nothing (more) to delete. */
                if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
                    key.type != BTRFS_EXTENT_CSUM_KEY) {
                        break;
                }

                if (key.offset >= end_byte)
                        break;

                /* First byte past the data range covered by this item. */
                csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size;
                csum_end <<= blocksize_bits;
                csum_end += key.offset;

                /* this csum ends before we start, we're done */
                if (csum_end <= bytenr)
                        break;

                /* delete the entire item, it is inside our range */
                if (key.offset >= bytenr && csum_end <= end_byte) {
                        int del_nr = 1;

                        /*
                         * Check how many csum items preceding this one in this
                         * leaf correspond to our range and then delete them all
                         * at once.
                         */
                        if (key.offset > bytenr && path->slots[0] > 0) {
                                int slot = path->slots[0] - 1;

                                while (slot >= 0) {
                                        struct btrfs_key pk;

                                        btrfs_item_key_to_cpu(leaf, &pk, slot);
                                        if (pk.offset < bytenr ||
                                            pk.type != BTRFS_EXTENT_CSUM_KEY ||
                                            pk.objectid !=
                                            BTRFS_EXTENT_CSUM_OBJECTID)
                                                break;
                                        /*
                                         * Extend the batch backwards: the
                                         * deletion starts at this slot and
                                         * key.offset tracks the lowest offset
                                         * being deleted.
                                         */
                                        path->slots[0] = slot;
                                        del_nr++;
                                        key.offset = pk.offset;
                                        slot--;
                                }
                        }
                        ret = btrfs_del_items(trans, root, path,
                                              path->slots[0], del_nr);
                        if (ret)
                                break;
                        /* The deleted items reached the start of the range. */
                        if (key.offset == bytenr)
                                break;
                } else if (key.offset < bytenr && csum_end > end_byte) {
                        unsigned long offset;
                        unsigned long shift_len;
                        unsigned long item_offset;
                        /*
                         *        [ bytenr - len ]
                         *     [csum                ]
                         *
                         * Our bytes are in the middle of the csum,
                         * we need to split this item and insert a new one.
                         *
                         * But we can't drop the path because the
                         * csum could change, get removed, extended etc.
                         *
                         * The trick here is the max size of a csum item leaves
                         * enough room in the tree block for a single
                         * item header.  So, we split the item in place,
                         * adding a new header pointing to the existing
                         * bytes.  Then we loop around again and we have
                         * a nicely formed csum item that we can neatly
                         * truncate.
                         */
                        /* Byte offset, inside the item, of the csum for bytenr. */
                        offset = (bytenr - key.offset) >> blocksize_bits;
                        offset *= csum_size;

                        /* Number of csum bytes that describe the range. */
                        shift_len = (len >> blocksize_bits) * csum_size;

                        item_offset = btrfs_item_ptr_offset(leaf,
                                                            path->slots[0]);

                        /* Zero the checksums of the range before splitting. */
                        memzero_extent_buffer(leaf, item_offset + offset,
                                             shift_len);
                        key.offset = bytenr;

                        /*
                         * btrfs_split_item returns -EAGAIN when the
                         * item changed size or key
                         */
                        ret = btrfs_split_item(trans, root, path, &key, offset);
                        if (ret && ret != -EAGAIN) {
                                btrfs_abort_transaction(trans, ret);
                                break;
                        }
                        /* Success or -EAGAIN: go around the loop again. */
                        ret = 0;

                        /* The key is fully re-initialized at the top of the loop. */
                        key.offset = end_byte - 1;
                } else {
                        /* Partial overlap at one end of the item. */
                        truncate_one_csum(trans, path, &key, bytenr, len);
                        /*
                         * truncate_one_csum() only rewrites key.offset when
                         * it cuts the front of an item; an offset still below
                         * bytenr means the item's tail was cut, and the loop
                         * stops here.
                         */
                        if (key.offset < bytenr)
                                break;
                }
                btrfs_release_path(path);
        }
        btrfs_free_path(path);
        return ret;
}

/*
 * Report the key offset of the item that follows the one @path points to.
 *
 * If the following item is a csum item, *@next_offset is set to its key
 * offset. If it is some other kind of item, or btrfs_next_leaf() reports that
 * there is no further leaf, *@next_offset is set to (u64)-1.
 *
 * Returns 0 on success or the negative error from btrfs_next_leaf().
 */
static int find_next_csum_offset(struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 u64 *next_offset)
{
        const u32 nritems = btrfs_header_nritems(path->nodes[0]);
        struct btrfs_key next_key;
        int slot = path->slots[0] + 1;

        /* The following item lives in the next leaf, if there is one. */
        if (nritems == 0 || slot >= nritems) {
                int ret = btrfs_next_leaf(root, path);

                if (ret < 0)
                        return ret;
                if (ret > 0) {
                        *next_offset = (u64)-1;
                        return 0;
                }
                slot = path->slots[0];
        }

        btrfs_item_key_to_cpu(path->nodes[0], &next_key, slot);

        if (next_key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
            next_key.type == BTRFS_EXTENT_CSUM_KEY)
                *next_offset = next_key.offset;
        else
                *next_offset = (u64)-1;

        return 0;
}

/*
 * Write the checksums carried by @sums into the tree @root.
 *
 * @trans: transaction handle
 * @root:  tree receiving the checksums; a log tree gets extra care when
 *         extending items (see the comment in the extend_csum section)
 * @sums:  checksums to store: sums->logical is the start of the data range,
 *         sums->len its length in bytes and sums->sums the checksum bytes
 *
 * The function loops (label "again") until checksums for all sums->len bytes
 * have been written. Each pass handles the position
 * bytenr = sums->logical + total_bytes in one of three ways:
 *   - an existing item already has a slot for bytenr: overwrite in place
 *     (label "found");
 *   - an existing item ends right where bytenr starts: try to grow it
 *     (label "extend_csum");
 *   - otherwise: insert a new item keyed at bytenr (label "insert").
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or a negative
 * error from the lookup, search, or insertion helpers.
 */
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_ordered_sum *sums)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key file_key;
        struct btrfs_key found_key;
        struct btrfs_path *path;
        struct btrfs_csum_item *item;
        struct btrfs_csum_item *item_end;
        struct extent_buffer *leaf = NULL;
        u64 next_offset;
        u64 total_bytes = 0;    /* data bytes whose checksums are written */
        u64 csum_offset;        /* index of bytenr's checksum inside the item */
        u64 bytenr;
        u32 ins_size;
        int index = 0;          /* byte position inside sums->sums */
        int found_next;         /* non-zero once next_offset is valid */
        int ret;
        const u32 csum_size = fs_info->csum_size;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
again:
        /* Start of one pass: reset per-pass state and build the target key. */
        next_offset = (u64)-1;
        found_next = 0;
        bytenr = sums->logical + total_bytes;
        file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
        file_key.offset = bytenr;
        file_key.type = BTRFS_EXTENT_CSUM_KEY;

        item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
        if (!IS_ERR(item)) {
                /*
                 * An item already holds a checksum slot for bytenr: item
                 * points at that slot, item_end at the end of the item data.
                 */
                ret = 0;
                leaf = path->nodes[0];
                item_end = btrfs_item_ptr(leaf, path->slots[0],
                                          struct btrfs_csum_item);
                item_end = (struct btrfs_csum_item *)((char *)item_end +
                           btrfs_item_size(leaf, path->slots[0]));
                goto found;
        }
        ret = PTR_ERR(item);
        /* Only -EFBIG and -ENOENT are handled below; anything else is fatal. */
        if (ret != -EFBIG && ret != -ENOENT)
                goto out;

        if (ret == -EFBIG) {
                u32 item_size;
                /* we found one, but it isn't big enough yet */
                leaf = path->nodes[0];
                item_size = btrfs_item_size(leaf, path->slots[0]);
                if ((item_size / csum_size) >=
                    MAX_CSUM_ITEMS(fs_info, csum_size)) {
                        /* already at max size, make a new one */
                        goto insert;
                }
        } else {
                /* We didn't find a csum item, insert one. */
                ret = find_next_csum_offset(root, path, &next_offset);
                if (ret < 0)
                        goto out;
                found_next = 1;
                goto insert;
        }

        /*
         * At this point, we know the tree has a checksum item that ends at an
         * offset matching the start of the checksum range we want to insert.
         * We try to extend that item as much as possible and then add as many
         * checksums to it as they fit.
         *
         * First check if the leaf has enough free space for at least one
         * checksum. If it has go directly to the item extension code, otherwise
         * release the path and do a search for insertion before the extension.
         */
        if (btrfs_leaf_free_space(leaf) >= csum_size) {
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                csum_offset = (bytenr - found_key.offset) >>
                        fs_info->sectorsize_bits;
                goto extend_csum;
        }

        /*
         * Search again, this time asking for csum_size bytes of room.
         * search_for_extension is set only for the duration of this search.
         */
        btrfs_release_path(path);
        path->search_for_extension = 1;
        ret = btrfs_search_slot(trans, root, &file_key, path,
                                csum_size, 1);
        path->search_for_extension = 0;
        if (ret < 0)
                goto out;

        if (ret > 0) {
                /* No exact match: use the previous item, if this leaf has one. */
                if (path->slots[0] == 0)
                        goto insert;
                path->slots[0]--;
        }

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
        csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits;

        /* The candidate must be a csum item that bytenr can still fit into. */
        if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
            found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
            csum_offset >= MAX_CSUM_ITEMS(fs_info, csum_size)) {
                goto insert;
        }

extend_csum:
        /* Only extend when bytenr's checksum would directly follow the item. */
        if (csum_offset == btrfs_item_size(leaf, path->slots[0]) /
            csum_size) {
                int extend_nr;
                u64 tmp;
                u32 diff;

                /* Number of checksums still to be written, at least one. */
                tmp = sums->len - total_bytes;
                tmp >>= fs_info->sectorsize_bits;
                WARN_ON(tmp < 1);
                extend_nr = max_t(int, 1, tmp);

                /*
                 * A log tree can already have checksum items with a subset of
                 * the checksums we are trying to log. This can happen after
                 * doing a sequence of partial writes into prealloc extents and
                 * fsyncs in between, with a full fsync logging a larger subrange
                 * of an extent for which a previous fast fsync logged a smaller
                 * subrange. And this happens in particular due to merging file
                 * extent items when we complete an ordered extent for a range
                 * covered by a prealloc extent - this is done at
                 * btrfs_mark_extent_written().
                 *
                 * So if we try to extend the previous checksum item, which has
                 * a range that ends at the start of the range we want to insert,
                 * make sure we don't extend beyond the start offset of the next
                 * checksum item. If we are at the last item in the leaf, then
                 * forget the optimization of extending and add a new checksum
                 * item - it is not worth the complexity of releasing the path,
                 * getting the first key for the next leaf, repeat the btree
                 * search, etc, because log trees are temporary anyway and it
                 * would only save a few bytes of leaf space.
                 */
                if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) {
                        if (path->slots[0] + 1 >=
                            btrfs_header_nritems(path->nodes[0])) {
                                ret = find_next_csum_offset(root, path, &next_offset);
                                if (ret < 0)
                                        goto out;
                                found_next = 1;
                                goto insert;
                        }

                        ret = find_next_csum_offset(root, path, &next_offset);
                        if (ret < 0)
                                goto out;

                        /* Cap the extension at the start of the next csum item. */
                        tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits;
                        if (tmp <= INT_MAX)
                                extend_nr = min_t(int, extend_nr, tmp);
                }

                /*
                 * diff becomes the number of bytes to grow the item by: the
                 * wanted size capped at the maximum item size, minus the
                 * current size, capped at the leaf's free space and rounded
                 * down to a whole number of checksums.
                 */
                diff = (csum_offset + extend_nr) * csum_size;
                diff = min(diff,
                           MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);

                diff = diff - btrfs_item_size(leaf, path->slots[0]);
                diff = min_t(u32, btrfs_leaf_free_space(leaf), diff);
                diff /= csum_size;
                diff *= csum_size;

                btrfs_extend_item(trans, path, diff);
                ret = 0;
                goto csum;
        }

insert:
        /* Insert a new item keyed at bytenr; its first checksum is at index 0. */
        btrfs_release_path(path);
        csum_offset = 0;
        if (found_next) {
                u64 tmp;

                /*
                 * Size the new item for the remaining checksums, but not past
                 * the start of the next csum item, with a minimum of one
                 * checksum and a maximum of MAX_CSUM_ITEMS.
                 */
                tmp = sums->len - total_bytes;
                tmp >>= fs_info->sectorsize_bits;
                tmp = min(tmp, (next_offset - file_key.offset) >>
                                         fs_info->sectorsize_bits);

                tmp = max_t(u64, 1, tmp);
                tmp = min_t(u64, tmp, MAX_CSUM_ITEMS(fs_info, csum_size));
                ins_size = csum_size * tmp;
        } else {
                /* No information about the next item: room for one checksum. */
                ins_size = csum_size;
        }
        ret = btrfs_insert_empty_item(trans, root, path, &file_key,
                                      ins_size);
        if (ret < 0)
                goto out;
        leaf = path->nodes[0];
csum:
        /* Locate the slot for bytenr and the end of the item data. */
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
        item_end = (struct btrfs_csum_item *)((unsigned char *)item +
                                      btrfs_item_size(leaf, path->slots[0]));
        item = (struct btrfs_csum_item *)((unsigned char *)item +
                                          csum_offset * csum_size);
found:
        /*
         * Copy as many of the remaining checksums as fit between item and
         * item_end, then advance the cursors into sums->sums and the data
         * range.
         */
        ins_size = (u32)(sums->len - total_bytes) >> fs_info->sectorsize_bits;
        ins_size *= csum_size;
        ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
                              ins_size);
        write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
                            ins_size);

        index += ins_size;
        ins_size /= csum_size;
        total_bytes += ins_size * fs_info->sectorsize;

        btrfs_mark_buffer_dirty(trans, path->nodes[0]);
        if (total_bytes < sums->len) {
                /* More checksums to store: start another pass. */
                btrfs_release_path(path);
                cond_resched();
                goto again;
        }
out:
        btrfs_free_path(path);
        return ret;
}

void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
                                     const struct btrfs_path *path,
                                     struct btrfs_file_extent_item *fi,
                                     struct extent_map *em)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_root *root = inode->root;
        struct extent_buffer *leaf = path->nodes[0];
        const int slot = path->slots[0];
        struct btrfs_key key;
        u64 extent_start;
        u64 bytenr;
        u8 type = btrfs_file_extent_type(leaf, fi);
        int compress_type = btrfs_file_extent_compression(leaf, fi);

        btrfs_item_key_to_cpu(leaf, &key, slot);
        extent_start = key.offset;
        em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
        em->generation = btrfs_file_extent_generation(leaf, fi);
        if (type == BTRFS_FILE_EXTENT_REG ||
            type == BTRFS_FILE_EXTENT_PREALLOC) {
                em->start = extent_start;
                em->len = btrfs_file_extent_end(path) - extent_start;
                em->orig_start = extent_start -
                        btrfs_file_extent_offset(leaf, fi);
                em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
                if (bytenr == 0) {
                        em->block_start = EXTENT_MAP_HOLE;
                        return;
                }
                if (compress_type != BTRFS_COMPRESS_NONE) {
                        extent_map_set_compression(em, compress_type);
                        em->block_start = bytenr;
                        em->block_len = em->orig_block_len;
                } else {
                        bytenr += btrfs_file_extent_offset(leaf, fi);
                        em->block_start = bytenr;
                        em->block_len = em->len;
                        if (type == BTRFS_FILE_EXTENT_PREALLOC)
                                em->flags |= EXTENT_FLAG_PREALLOC;
                }
        } else if (type == BTRFS_FILE_EXTENT_INLINE) {
                /* Tree-checker has ensured this. */
                ASSERT(extent_start == 0);

                em->block_start = EXTENT_MAP_INLINE;
                em->start = 0;
                em->len = fs_info->sectorsize;
                /*
                 * Initialize orig_start and block_len with the same values
                 * as in inode.c:btrfs_get_extent().
                 */
                em->orig_start = EXTENT_MAP_HOLE;
                em->block_len = (u64)-1;
                extent_map_set_compression(em, compress_type);
        } else {
                btrfs_err(fs_info,
                          "unknown file extent item type %d, inode %llu, offset %llu, "
                          "root %llu", type, btrfs_ino(inode), extent_start,
                          btrfs_root_id(root));
        }
}

/*
 * Return the exclusive end offset of the file extent item @path points to.
 *
 * For an inline extent the result is the filesystem sector size; for any other
 * extent type it is the item's key offset plus its num_bytes field. The item
 * is asserted to have the BTRFS_EXTENT_DATA_KEY key type.
 */
u64 btrfs_file_extent_end(const struct btrfs_path *path)
{
        const struct extent_buffer *leaf = path->nodes[0];
        const int slot = path->slots[0];
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;

        btrfs_item_key_to_cpu(leaf, &key, slot);
        ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);

        if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE)
                return leaf->fs_info->sectorsize;

        return key.offset + btrfs_file_extent_num_bytes(leaf, fi);
}

























































    3 






    3 







    3 

    1 
    1 





    2 




































































    2 




    3 

    3 




    3 

















































































































    2 



    3 








    3 
    1 
    3 









    3 


    2 
    1 












    2 




















































    3 


    3 

    2 







    3 


















    2 


    2 








    2 


    1 


    2 
    2 

    2 





    2 









    2 
    2 











    1 









    2 





    1 



    1 









    1 






    2 

    2 


    1 














    2 















































    1 

    2 
    2 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/page-io.c
 *
 * This contains the new page_io functions for ext4
 *
 * Written by Theodore Ts'o, 2010.
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

/* Slab cache for struct ext4_io_end objects, created in ext4_init_pageio(). */
static struct kmem_cache *io_end_cachep;
/* Slab cache for struct ext4_io_end_vec objects, created in ext4_init_pageio(). */
static struct kmem_cache *io_end_vec_cachep;

/*
 * Create the slab caches used by this file.
 *
 * Returns 0 on success or -ENOMEM if either cache could not be created; in
 * the latter case no cache is left allocated.
 */
int __init ext4_init_pageio(void)
{
        io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
        if (!io_end_cachep)
                goto err;

        io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
        if (!io_end_vec_cachep)
                goto err_destroy_io_end;

        return 0;

err_destroy_io_end:
        kmem_cache_destroy(io_end_cachep);
err:
        return -ENOMEM;
}

/* Destroy the two slab caches created by ext4_init_pageio(). */
void ext4_exit_pageio(void)
{
        kmem_cache_destroy(io_end_cachep);
        kmem_cache_destroy(io_end_vec_cachep);
}

/*
 * Allocate a zeroed io_end_vec and append it to @io_end's list_vec.
 *
 * Returns the new vector, or ERR_PTR(-ENOMEM) if the allocation failed.
 */
struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
{
        struct ext4_io_end_vec *vec;

        vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
        if (vec == NULL)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&vec->list);
        list_add_tail(&vec->list, &io_end->list_vec);

        return vec;
}

/* Unlink and free every io_end_vec queued on @io_end's list_vec. */
static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
{
        struct list_head *vecs = &io_end->list_vec;

        /* Pop from the front until the list is empty. */
        while (!list_empty(vecs)) {
                struct ext4_io_end_vec *vec;

                vec = list_first_entry(vecs, struct ext4_io_end_vec, list);
                list_del(&vec->list);
                kmem_cache_free(io_end_vec_cachep, vec);
        }
}

/*
 * Return the most recently appended io_end_vec of @io_end.
 *
 * The list must not be empty; an empty list triggers BUG_ON().
 */
struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
{
        struct list_head *vecs = &io_end->list_vec;

        BUG_ON(list_empty(vecs));

        return list_last_entry(vecs, struct ext4_io_end_vec, list);
}

/*
 * Print an buffer I/O error compatible with the fs/buffer.c.  This
 * provides compatibility with dmesg scrapers that look for a specific
 * buffer I/O error message.  We really need a unified error reporting
 * structure to userspace ala Digital Unix's uerf system, but it's
 * probably not going to happen in my lifetime, due to LKML politics...
 */
static void buffer_io_error(struct buffer_head *bh)
{
        printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n",
                       bh->b_bdev,
                        (unsigned long long)bh->b_blocknr);
}

/*
 * Finish writeback for every folio segment carried by @bio.
 *
 * For each folio: record a bio error on the folio's mapping, clear the
 * async_write flag on the buffers that lie fully inside this bio's segment
 * (flagging and logging them on error), and end writeback on the folio once
 * none of its other buffers is still marked async_write.
 */
static void ext4_finish_bio(struct bio *bio)
{
        struct folio_iter fi;

        bio_for_each_folio_all(fi, bio) {
                struct folio *folio = fi.folio;
                struct folio *io_folio = NULL;
                struct buffer_head *bh, *head;
                /* Byte range of this folio that the bio segment covers. */
                size_t bio_start = fi.offset;
                size_t bio_end = bio_start + fi.length;
                /* Buffers outside the segment that are still async_write. */
                unsigned under_io = 0;
                unsigned long flags;

                /*
                 * For an fscrypt bounce folio, remember it in io_folio and
                 * continue with the pagecache folio it stands in for.
                 */
                if (fscrypt_is_bounce_folio(folio)) {
                        io_folio = folio;
                        folio = fscrypt_pagecache_folio(folio);
                }

                if (bio->bi_status) {
                        int err = blk_status_to_errno(bio->bi_status);
                        mapping_set_error(folio->mapping, err);
                }
                bh = head = folio_buffers(folio);
                /*
                 * We check all buffers in the folio under b_uptodate_lock
                 * to avoid races with other end io clearing async_write flags
                 */
                spin_lock_irqsave(&head->b_uptodate_lock, flags);
                do {
                        /*
                         * Buffer not fully inside this segment: leave it
                         * alone, but note whether it is still under I/O.
                         * "continue" jumps to the loop condition below, which
                         * advances bh.
                         */
                        if (bh_offset(bh) < bio_start ||
                            bh_offset(bh) + bh->b_size > bio_end) {
                                if (buffer_async_write(bh))
                                        under_io++;
                                continue;
                        }
                        clear_buffer_async_write(bh);
                        if (bio->bi_status) {
                                set_buffer_write_io_error(bh);
                                buffer_io_error(bh);
                        }
                } while ((bh = bh->b_this_page) != head);
                spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
                if (!under_io) {
                        /*
                         * NOTE(review): io_folio is NULL when the folio was not
                         * a bounce folio; this relies on
                         * fscrypt_free_bounce_page() tolerating the resulting
                         * pointer - confirm.
                         */
                        fscrypt_free_bounce_page(&io_folio->page);
                        folio_end_writeback(folio);
                }
        }
}

/*
 * Tear down @io_end: finish and drop every bio chained on it, free its
 * io_end_vec list and return the io_end itself to its slab cache.
 *
 * The io_end must already be off any list and must no longer carry the
 * EXT4_IO_END_UNWRITTEN flag; a leftover handle triggers a warning.
 */
static void ext4_release_io_end(ext4_io_end_t *io_end)
{
        struct bio *bio;

        BUG_ON(!list_empty(&io_end->list));
        BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
        WARN_ON(io_end->handle);

        /* The bios are chained through their bi_private pointers. */
        bio = io_end->bio;
        while (bio) {
                struct bio *next_bio = bio->bi_private;

                ext4_finish_bio(bio);
                bio_put(bio);
                bio = next_bio;
        }

        ext4_free_io_end_vec(io_end);
        kmem_cache_free(io_end_cachep, io_end);
}

/*
 * Check a range of space and convert unwritten extents to written. Note that
 * we are protected from truncate touching same part of extent tree by the
 * fact that truncate code waits for all DIO to finish (thus exclusion from
 * direct IO is achieved) and also waits for PageWriteback bits. Thus we
 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
 * completed (writeback is ended when the io_end is released below).
 *
 * The io_end's handle is consumed by the conversion, and the io_end itself is
 * released before returning. Returns the result of the conversion.
 */
static int ext4_end_io_end(ext4_io_end_t *io_end)
{
        struct inode *inode = io_end->inode;
        struct super_block *sb = inode->i_sb;
        handle_t *handle = io_end->handle;
        int ret;

        ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
                   "list->prev 0x%p\n",
                   io_end, inode->i_ino, io_end->list.next, io_end->list.prev);

        /* The conversion below uses up the handle, so detach it first. */
        io_end->handle = NULL;
        ret = ext4_convert_unwritten_io_end_vec(handle, io_end);

        /* Stay quiet about failures on a filesystem that was shut down. */
        if (ret < 0 && !ext4_forced_shutdown(sb)) {
                ext4_msg(sb, KERN_EMERG,
                         "failed to convert unwritten extents to written "
                         "extents -- potential data loss!  "
                         "(inode %lu, error %d)", inode->i_ino, ret);
        }

        ext4_clear_io_unwritten_flag(io_end);
        ext4_release_io_end(io_end);

        return ret;
}

/*
 * Debug helper: print every io_end queued on @head together with its list
 * neighbours. Compiles to an empty function unless EXT4FS_DEBUG is defined.
 */
static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef        EXT4FS_DEBUG
        ext4_io_end_t *io_end;

        if (list_empty(head))
                return;

        ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
        list_for_each_entry(io_end, head, list) {
                /* Neighbouring list nodes, viewed as io_end structures. */
                ext4_io_end_t *prev_io = container_of(io_end->list.prev,
                                                      ext4_io_end_t, list);
                ext4_io_end_t *next_io = container_of(io_end->list.next,
                                                      ext4_io_end_t, list);

                ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
                            io_end, inode->i_ino, prev_io, next_io);
        }
#endif
}

/*
 * Queue @io_end on its inode's list of completed I/Os awaiting unwritten
 * extent conversion. The conversion work item is queued when the list was
 * empty before this addition.
 */
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
        struct inode *inode = io_end->inode;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        unsigned long flags;

        /* Only reserved conversions from writeback should enter here */
        WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
        WARN_ON(!io_end->handle && sbi->s_journal);

        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        if (list_empty(&ei->i_rsv_conversion_list))
                queue_work(sbi->rsv_conversion_wq, &ei->i_rsv_conversion_work);
        list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}

/*
 * Process every io_end queued on @head for @inode.
 *
 * The list is detached under i_completed_io_lock and then each io_end is
 * handed to ext4_end_io_end() without the lock held.
 *
 * Returns 0 if every io_end was processed successfully, otherwise the first
 * error encountered (later entries are still processed).
 */
static int ext4_do_flush_completed_IO(struct inode *inode,
                                      struct list_head *head)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct list_head unwritten;
        unsigned long flags;
        int first_err = 0;

        /* Move the pending entries onto a private list. */
        spin_lock_irqsave(&ei->i_completed_io_lock, flags);
        dump_completed_IO(inode, head);
        list_replace_init(head, &unwritten);
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);

        while (!list_empty(&unwritten)) {
                ext4_io_end_t *io_end;
                int err;

                io_end = list_first_entry(&unwritten, ext4_io_end_t, list);
                BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
                list_del_init(&io_end->list);

                err = ext4_end_io_end(io_end);
                if (unlikely(err && !first_err))
                        first_err = err;
        }

        return first_err;
}

/*
 * Work function for completed I/O: converts unwritten extents to written
 * ones for every io_end queued on the owning inode's conversion list.
 */
void ext4_end_io_rsv_work(struct work_struct *work)
{
        struct ext4_inode_info *ei;

        /* The work item is embedded in the ext4 inode info. */
        ei = container_of(work, struct ext4_inode_info, i_rsv_conversion_work);

        ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
}

/*
 * Allocate a zeroed io_end from io_end_cachep for @inode using @flags.
 *
 * On success the io_end has both list heads initialised and a reference
 * count of one.  Returns NULL if the allocation fails.
 */
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
        ext4_io_end_t *io_end;

        io_end = kmem_cache_zalloc(io_end_cachep, flags);
        if (!io_end)
                return NULL;

        io_end->inode = inode;
        INIT_LIST_HEAD(&io_end->list);
        INIT_LIST_HEAD(&io_end->list_vec);
        refcount_set(&io_end->count, 1);

        return io_end;
}

/*
 * Drop one reference on @io_end.  When the last reference goes away, an
 * io_end flagged EXT4_IO_END_UNWRITTEN with a non-empty list_vec is handed to
 * ext4_add_complete_io() for deferred processing; any other io_end is
 * released immediately.
 */
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
        if (!refcount_dec_and_test(&io_end->count))
                return;

        if ((io_end->flag & EXT4_IO_END_UNWRITTEN) &&
            !list_empty(&io_end->list_vec)) {
                ext4_add_complete_io(io_end);
                return;
        }

        ext4_release_io_end(io_end);
}

/*
 * Drop one reference on @io_end.  When the last reference goes away, an
 * io_end flagged EXT4_IO_END_UNWRITTEN is converted synchronously with
 * io_end->handle, and the io_end is then released.
 *
 * Returns the result of ext4_convert_unwritten_io_end_vec() if a conversion
 * was performed, otherwise 0.
 */
int ext4_put_io_end(ext4_io_end_t *io_end)
{
        int err = 0;

        if (!refcount_dec_and_test(&io_end->count))
                return 0;

        if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
                err = ext4_convert_unwritten_io_end_vec(io_end->handle, io_end);
                io_end->handle = NULL;
                ext4_clear_io_unwritten_flag(io_end);
        }
        ext4_release_io_end(io_end);

        return err;
}

/* Take an additional reference on @io_end and return the same pointer. */
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
{
        refcount_inc(&io_end->count);
        return io_end;
}

/*
 * BIO completion function for page writeback.
 *
 * A bio without an io_end in bi_private triggers a one-time warning and is
 * finished and released directly.  Otherwise any I/O error is logged and
 * recorded on the inode's mapping, and the bio's io_end reference is dropped:
 * for EXT4_IO_END_UNWRITTEN io_ends the bio is first chained onto io_end->bio,
 * for all others the bio is finished and released here.
 */
static void ext4_end_bio(struct bio *bio)
{
        ext4_io_end_t *io_end = bio->bi_private;
        sector_t bi_sector = bio->bi_iter.bi_sector;

        if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %llu len %u err %d\n",
                      bio->bi_bdev,
                      (unsigned long long)bi_sector,
                      (unsigned)bio_sectors(bio),
                      bio->bi_status)) {
                ext4_finish_bio(bio);
                bio_put(bio);
                return;
        }
        bio->bi_end_io = NULL;

        if (bio->bi_status) {
                struct inode *inode = io_end->inode;

                /* Sectors are 512 bytes, hence the (i_blkbits - 9) shift. */
                ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
                             "(starting block %llu)",
                             bio->bi_status, inode->i_ino,
                             (unsigned long long)bi_sector >>
                             (inode->i_blkbits - 9));
                mapping_set_error(inode->i_mapping,
                                  blk_status_to_errno(bio->bi_status));
        }

        if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
                /*
                 * Link bio into list hanging from io_end. We have to do it
                 * atomically as bio completions can be racing against each
                 * other.
                 */
                bio->bi_private = xchg(&io_end->bio, bio);
                ext4_put_io_end_defer(io_end);
        } else {
                /*
                 * Drop io_end reference early. Inode can get freed once
                 * we finish the bio.
                 */
                ext4_put_io_end_defer(io_end);
                ext4_finish_bio(bio);
                bio_put(bio);
        }
}

void ext4_io_submit(struct ext4_io_submit *io)
{
        struct bio *bio = io->io_bio;

        if (bio) {
                if (io->io_wbc->sync_mode == WB_SYNC_ALL)
                        io->io_bio->bi_opf |= REQ_SYNC;
                submit_bio(io->io_bio);
        }
        io->io_bio = NULL;
}

/*
 * Reset @io to an empty submission context bound to @wbc: no bio under
 * construction and no io_end attached.
 */
void ext4_io_submit_init(struct ext4_io_submit *io,
                         struct writeback_control *wbc)
{
        io->io_bio = NULL;
        io->io_end = NULL;
        io->io_wbc = wbc;
}

/*
 * Start a new write bio in @io positioned at the block of @bh.  The bio
 * completes through ext4_end_bio() and holds its own reference on
 * io->io_end via bi_private.
 */
static void io_submit_init_bio(struct ext4_io_submit *io,
                               struct buffer_head *bh)
{
        struct bio *new_bio;

        /*
         * bio_alloc will _always_ be able to allocate a bio if
         * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
         */
        new_bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO);
        fscrypt_set_bio_crypt_ctx_bh(new_bio, bh, GFP_NOIO);

        /* Convert the block number into 512-byte sectors. */
        new_bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        new_bio->bi_end_io = ext4_end_bio;
        new_bio->bi_private = ext4_get_io_end(io->io_end);

        io->io_bio = new_bio;
        io->io_next_block = bh->b_blocknr;
        wbc_init_bio(io->io_wbc, new_bio);
}

/*
 * Add the data of @bh (taken from @io_folio) to the bio being built in @io.
 *
 * The pending bio is submitted first when @bh does not continue at
 * io->io_next_block or cannot be merged according to
 * fscrypt_mergeable_bio_bh().  Whenever bio_add_folio() rejects the buffer,
 * the current bio is submitted and the add is retried on a fresh one.
 * Writeback is accounted against @folio, not @io_folio.
 */
static void io_submit_add_bh(struct ext4_io_submit *io,
                             struct inode *inode,
                             struct folio *folio,
                             struct folio *io_folio,
                             struct buffer_head *bh)
{
        if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
                           !fscrypt_mergeable_bio_bh(io->io_bio, bh)))
                ext4_io_submit(io);

        for (;;) {
                if (io->io_bio == NULL)
                        io_submit_init_bio(io, bh);
                if (bio_add_folio(io->io_bio, io_folio, bh->b_size,
                                  bh_offset(bh)))
                        break;
                /* The add was rejected: flush this bio and start over. */
                ext4_io_submit(io);
        }

        wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size);
        io->io_next_block++;
}

/*
 * Mark the writable buffers of a locked folio for async write and add them to
 * the bio being built in @io.
 *
 * @io:    submission context the buffers are added to
 * @folio: folio to write; must be locked and must not be under writeback
 *         (both conditions are BUG_ON-checked)
 * @len:   number of bytes, counted from the start of the folio, to write;
 *         the remainder of the folio is zeroed and buffers starting at or
 *         beyond @len are only cleaned, never submitted
 *
 * Returns 0 on success, including the case where no buffer needed to be
 * submitted.  If the fscrypt bounce page cannot be obtained, the folio is
 * redirtied, the buffers already marked for async write are made dirty
 * again, and the error is returned.
 */
int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
                size_t len)
{
        struct folio *io_folio = folio;
        struct inode *inode = folio->mapping->host;
        unsigned block_start;
        struct buffer_head *bh, *head;
        int ret = 0;
        int nr_to_submit = 0;
        struct writeback_control *wbc = io->io_wbc;
        bool keep_towrite = false;

        BUG_ON(!folio_test_locked(folio));
        BUG_ON(folio_test_writeback(folio));

        /*
         * Comments copied from block_write_full_folio:
         *
         * The folio straddles i_size.  It must be zeroed out on each and every
         * writepage invocation because it may be mmapped.  "A file is mapped
         * in multiples of the page size.  For a file that is not a multiple of
         * the page size, the remaining memory is zeroed when mapped, and
         * writes to that region are not written out to the file."
         */
        if (len < folio_size(folio))
                folio_zero_segment(folio, len, folio_size(folio));
        /*
         * In the first loop we prepare and mark buffers to submit. We have to
         * mark all buffers in the folio before submitting so that
         * folio_end_writeback() cannot be called from ext4_end_bio() when IO
         * on the first buffer finishes and we are still working on submitting
         * the second buffer.
         */
        bh = head = folio_buffers(folio);
        do {
                block_start = bh_offset(bh);
                /* Buffer lies entirely past @len: nothing to write for it. */
                if (block_start >= len) {
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                        /* 'continue' jumps to the loop condition, advancing bh */
                        continue;
                }
                if (!buffer_dirty(bh) || buffer_delay(bh) ||
                    !buffer_mapped(bh) || buffer_unwritten(bh)) {
                        /* A hole? We can safely clear the dirty bit */
                        if (!buffer_mapped(bh))
                                clear_buffer_dirty(bh);
                        /*
                         * Keeping dirty some buffer we cannot write? Make sure
                         * to redirty the folio and keep TOWRITE tag so that
                         * racing WB_SYNC_ALL writeback does not skip the folio.
                         * This happens e.g. when doing writeout for
                         * transaction commit or when journalled data is not
                         * yet committed.
                         */
                        if (buffer_dirty(bh) ||
                            (buffer_jbd(bh) && buffer_jbddirty(bh))) {
                                if (!folio_test_dirty(folio))
                                        folio_redirty_for_writepage(wbc, folio);
                                keep_towrite = true;
                        }
                        continue;
                }
                /* This buffer will be written: tag it and count it. */
                if (buffer_new(bh))
                        clear_buffer_new(bh);
                set_buffer_async_write(bh);
                clear_buffer_dirty(bh);
                nr_to_submit++;
        } while ((bh = bh->b_this_page) != head);

        /*
         * Nothing to submit? Return without starting writeback.
         * NOTE(review): the folio is not unlocked anywhere in this function;
         * presumably the caller does that - confirm.
         */
        if (!nr_to_submit)
                return 0;

        /* Rewind to the first buffer for the passes below. */
        bh = head = folio_buffers(folio);

        /*
         * If any blocks are being written to an encrypted file, encrypt them
         * into a bounce page.  For simplicity, just encrypt until the last
         * block which might be needed.  This may cause some unneeded blocks
         * (e.g. holes) to be unnecessarily encrypted, but this is rare and
         * can't happen in the common case of blocksize == PAGE_SIZE.
         */
        if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
                gfp_t gfp_flags = GFP_NOFS;
                unsigned int enc_bytes = round_up(len, i_blocksize(inode));
                struct page *bounce_page;

                /*
                 * Since bounce page allocation uses a mempool, we can only use
                 * a waiting mask (i.e. request guaranteed allocation) on the
                 * first page of the bio.  Otherwise it can deadlock.
                 */
                if (io->io_bio)
                        gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
        retry_encrypt:
                bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page,
                                        enc_bytes, 0, gfp_flags);
                if (IS_ERR(bounce_page)) {
                        ret = PTR_ERR(bounce_page);
                        /*
                         * On -ENOMEM retry with GFP_NOFS: after submitting the
                         * pending bio if there is one, or with __GFP_NOFAIL
                         * added when there is none and this is WB_SYNC_ALL.
                         */
                        if (ret == -ENOMEM &&
                            (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
                                gfp_t new_gfp_flags = GFP_NOFS;
                                if (io->io_bio)
                                        ext4_io_submit(io);
                                else
                                        new_gfp_flags |= __GFP_NOFAIL;
                                memalloc_retry_wait(gfp_flags);
                                gfp_flags = new_gfp_flags;
                                goto retry_encrypt;
                        }

                        /* Give up: undo the async-write marking done above. */
                        printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
                        folio_redirty_for_writepage(wbc, folio);
                        do {
                                if (buffer_async_write(bh)) {
                                        clear_buffer_async_write(bh);
                                        set_buffer_dirty(bh);
                                }
                                bh = bh->b_this_page;
                        } while (bh != head);

                        return ret;
                }
                /* From here on the data is taken from the bounce page. */
                io_folio = page_folio(bounce_page);
        }

        __folio_start_writeback(folio, keep_towrite);

        /* Now submit buffers to write */
        do {
                if (!buffer_async_write(bh))
                        continue;
                io_submit_add_bh(io, inode, folio, io_folio, bh);
        } while ((bh = bh->b_this_page) != head);

        return 0;
}






























   15 













   12 
   14 


   14 







   15 











    5 

    5 


    5 


    5 
    5 













   15 

   11 


   15 
   13 































































































































































































































   15 




   15 
   15 

   13 



















    7 





    6 
    7 
    7 

































    2 















    1 


























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H

#include <linux/atomic.h>
#include <linux/huge_mm.h>
#include <linux/mm_types.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/userfaultfd_k.h>
#include <linux/swapops.h>

/**
 * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
 * @folio: The folio to test.
 *
 * We would like to get this info without a page flag, but the state
 * needs to survive until the folio is last deleted from the LRU, which
 * could be as far down as __page_cache_release.
 *
 * Return: An integer (not a boolean!) used to sort a folio onto the
 * right LRU list and to account folios correctly.
 * 1 if @folio is a regular filesystem backed page cache folio
 * or a lazily freed anonymous folio (e.g. via MADV_FREE).
 * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise
 * ram or swap backed folio.
 */
static inline int folio_is_file_lru(struct folio *folio)
{
        /* Any folio that is not swap-backed counts as file LRU. */
        return !folio_test_swapbacked(folio);
}

/* Page-based wrapper: looks up the folio of @page and asks folio_is_file_lru(). */
static inline int page_is_file_lru(struct page *page)
{
        return folio_is_file_lru(page_folio(page));
}

/*
 * Adjust the size counters of LRU list @lru by @nr_pages, both on @lruvec
 * and on zone @zid of the lruvec's node.  The lruvec's lru_lock must be held,
 * and @nr_pages is expected to fit in an int (warned about once otherwise).
 */
static __always_inline void __update_lru_size(struct lruvec *lruvec,
                                enum lru_list lru, enum zone_type zid,
                                long nr_pages)
{
        struct pglist_data *node = lruvec_pgdat(lruvec);

        lockdep_assert_held(&lruvec->lru_lock);
        WARN_ON_ONCE(nr_pages != (int)nr_pages);

        /* Per-lruvec counter first, then the per-zone counter. */
        __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
        __mod_zone_page_state(&node->node_zones[zid], NR_ZONE_LRU_BASE + lru,
                              nr_pages);
}

/*
 * Adjust the size of LRU list @lru by @nr_pages: updates the lruvec and zone
 * counters and, when CONFIG_MEMCG is set, the memory cgroup LRU size too.
 */
static __always_inline void update_lru_size(struct lruvec *lruvec,
                                enum lru_list lru, enum zone_type zid,
                                long nr_pages)
{
        __update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
        /* Only compiled in when memory cgroups are enabled. */
        mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}

/**
 * __folio_clear_lru_flags - Clear page lru flags before releasing a page.
 * @folio: The folio that was on lru and now has a zero reference.
 */
static __always_inline void __folio_clear_lru_flags(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio);

        __folio_clear_lru(folio);

        /*
         * A folio that is both active and unevictable shouldn't happen, so
         * in that case the flags are left alone for bad_page() to report.
         */
        if (!folio_test_active(folio) || !folio_test_unevictable(folio)) {
                __folio_clear_active(folio);
                __folio_clear_unevictable(folio);
        }
}

/**
 * folio_lru_list - Which LRU list should a folio be on?
 * @folio: The folio to test.
 *
 * Return: The LRU list a folio should be on, as an index
 * into the array of LRU lists.
 */
static __always_inline enum lru_list folio_lru_list(struct folio *folio)
{
        enum lru_list lru;

        /* Active and unevictable at the same time is never valid. */
        VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);

        if (folio_test_unevictable(folio))
                return LRU_UNEVICTABLE;

        /* Start from the inactive list of the right type ... */
        if (folio_is_file_lru(folio))
                lru = LRU_INACTIVE_FILE;
        else
                lru = LRU_INACTIVE_ANON;

        /* ... and step to its active counterpart if the folio is active. */
        return folio_test_active(folio) ? lru + LRU_ACTIVE : lru;
}

#ifdef CONFIG_LRU_GEN

#ifdef CONFIG_LRU_GEN_ENABLED
/*
 * CONFIG_LRU_GEN_ENABLED variant: tests the LRU_GEN_CORE entry of the
 * lru_gen_caps static keys, declared here as default-true keys, using the
 * "likely" branch hint.
 */
static inline bool lru_gen_enabled(void)
{
        DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);

        return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
}
#else
/*
 * !CONFIG_LRU_GEN_ENABLED variant: tests the LRU_GEN_CORE entry of the
 * lru_gen_caps static keys, declared here as default-false keys, using the
 * "unlikely" branch hint.
 */
static inline bool lru_gen_enabled(void)
{
        DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);

        return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
}
#endif

/* Report the in_lru_fault flag of the current task. */
static inline bool lru_gen_in_fault(void)
{
        return current->in_lru_fault;
}

/* Map a sequence number to a generation index in [0, MAX_NR_GENS). */
static inline int lru_gen_from_seq(unsigned long seq)
{
        return seq % MAX_NR_GENS;
}

/* Map a sequence number to a history index in [0, NR_HIST_GENS). */
static inline int lru_hist_from_seq(unsigned long seq)
{
        return seq % NR_HIST_GENS;
}

/* Convert a reference count to a tier index, computed as order_base_2(refs + 1). */
static inline int lru_tier_from_refs(int refs)
{
        /* Sanity check on the upper bound of @refs. */
        VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));

        /* see the comment in folio_lru_refs() */
        return order_base_2(refs + 1);
}

static inline int folio_lru_refs(struct folio *folio)
{
        unsigned long flags = READ_ONCE(folio->flags);
        bool workingset = flags & BIT(PG_workingset);

        /*
         * Return the number of accesses beyond PG_referenced, i.e., N-1 if the
         * total number of accesses is N>1, since N=0,1 both map to the first
         * tier. lru_tier_from_refs() will account for this off-by-one. Also see
         * the comment on MAX_NR_TIERS.
         */
        return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset;
}

/*
 * Decode the generation stored in folio->flags.  The field holds gen + 1, so
 * an all-zero field yields -1.
 */
static inline int folio_lru_gen(struct folio *folio)
{
        unsigned long gen_bits = READ_ONCE(folio->flags) & LRU_GEN_MASK;

        return (gen_bits >> LRU_GEN_PGOFF) - 1;
}

static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
        unsigned long max_seq = lruvec->lrugen.max_seq;

        VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);

        /* see the comment on MIN_NR_GENS */
        return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
}

/*
 * Account for @folio moving from generation @old_gen to @new_gen.
 *
 * A generation of -1 means "not on any generation": old_gen == -1 is an
 * addition, new_gen == -1 a deletion; both being -1 is invalid.  The
 * per-generation nr_pages counters are adjusted by the folio's page count,
 * and the classic inactive/active LRU size counters are kept in step through
 * __update_lru_size(), treating a generation as "active" according to
 * lru_gen_is_active().
 */
static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
                                       int old_gen, int new_gen)
{
        int type = folio_is_file_lru(folio);
        int zone = folio_zonenum(folio);
        int delta = folio_nr_pages(folio);
        /* type is 0 or 1, so this selects the inactive anon or file list */
        enum lru_list lru = type * LRU_INACTIVE_FILE;
        struct lru_gen_folio *lrugen = &lruvec->lrugen;

        VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
        VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
        VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);

        /* Per-generation page counts. */
        if (old_gen >= 0)
                WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
                           lrugen->nr_pages[old_gen][type][zone] - delta);
        if (new_gen >= 0)
                WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
                           lrugen->nr_pages[new_gen][type][zone] + delta);

        /* addition */
        if (old_gen < 0) {
                if (lru_gen_is_active(lruvec, new_gen))
                        lru += LRU_ACTIVE;
                __update_lru_size(lruvec, lru, zone, delta);
                return;
        }

        /* deletion */
        if (new_gen < 0) {
                if (lru_gen_is_active(lruvec, old_gen))
                        lru += LRU_ACTIVE;
                __update_lru_size(lruvec, lru, zone, -delta);
                return;
        }

        /* promotion */
        if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
                __update_lru_size(lruvec, lru, zone, -delta);
                __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
        }

        /* demotion requires isolation, e.g., lru_deactivate_fn() */
        VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}

/*
 * Try to add @folio to the generation lists of @lruvec.
 *
 * @reclaiming selects tail insertion and biases the folio towards the oldest
 * generation.  Returns false without touching the folio when it is
 * unevictable or when lrugen->enabled is not set; returns true once the
 * folio has been tagged with its generation, accounted and linked.
 */
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        unsigned long seq;
        unsigned long flags;
        int gen = folio_lru_gen(folio);
        int type = folio_is_file_lru(folio);
        int zone = folio_zonenum(folio);
        struct lru_gen_folio *lrugen = &lruvec->lrugen;

        /* The folio must not already carry a generation. */
        VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);

        if (folio_test_unevictable(folio) || !lrugen->enabled)
                return false;
        /*
         * There are four common cases for this page:
         * 1. If it's hot, i.e., freshly faulted in, add it to the youngest
         *    generation, and it's protected over the rest below.
         * 2. If it can't be evicted immediately, i.e., a dirty page pending
         *    writeback, add it to the second youngest generation.
         * 3. If it should be evicted first, e.g., cold and clean from
         *    folio_rotate_reclaimable(), add it to the oldest generation.
         * 4. Everything else falls between 2 & 3 above and is added to the
         *    second oldest generation if it's considered inactive, or the
         *    oldest generation otherwise. See lru_gen_is_active().
         */
        if (folio_test_active(folio))
                seq = lrugen->max_seq;
        else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
                 (folio_test_reclaim(folio) &&
                  (folio_test_dirty(folio) || folio_test_writeback(folio))))
                seq = lrugen->max_seq - 1;
        else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq)
                seq = lrugen->min_seq[type];
        else
                seq = lrugen->min_seq[type] + 1;

        gen = lru_gen_from_seq(seq);
        /* The flags field stores gen + 1 so that zero means "no generation". */
        flags = (gen + 1UL) << LRU_GEN_PGOFF;
        /* see the comment on MIN_NR_GENS about PG_active */
        set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);

        lru_gen_update_size(lruvec, folio, -1, gen);
        /* for folio_rotate_reclaimable() */
        if (reclaiming)
                list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
        else
                list_add(&folio->lru, &lrugen->folios[gen][type][zone]);

        return true;
}

/*
 * Try to remove @folio from the generation lists of @lruvec.
 *
 * Returns false if the folio carries no generation.  Otherwise the
 * generation bits are cleared (setting PG_active instead when the folio sat
 * in an active generation and @reclaiming is false), the sizes are updated,
 * the folio is unlinked, and true is returned.
 */
static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        unsigned long new_bits = 0;
        unsigned long prev;
        int gen = folio_lru_gen(folio);

        if (gen < 0)
                return false;

        VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);

        /* for folio_migrate_flags() */
        if (!reclaiming && lru_gen_is_active(lruvec, gen))
                new_bits = BIT(PG_active);

        /* Re-derive the generation from the value set_mask_bits() hands back. */
        prev = set_mask_bits(&folio->flags, LRU_GEN_MASK, new_bits);
        gen = ((prev & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;

        lru_gen_update_size(lruvec, folio, gen, -1);
        list_del(&folio->lru);

        return true;
}

#else /* !CONFIG_LRU_GEN */

/* !CONFIG_LRU_GEN stub: always reports false. */
static inline bool lru_gen_enabled(void)
{
        return false;
}

/* !CONFIG_LRU_GEN stub: always reports false. */
static inline bool lru_gen_in_fault(void)
{
        return false;
}

/* !CONFIG_LRU_GEN stub: adds nothing and returns false. */
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        return false;
}

/* !CONFIG_LRU_GEN stub: removes nothing and returns false. */
static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        return false;
}

#endif /* CONFIG_LRU_GEN */

/*
 * Add @folio to the head of its LRU list in @lruvec.  If lru_gen_add_folio()
 * takes the folio, nothing else is done; otherwise the list size is updated
 * and the folio is linked, except that unevictable folios are only counted.
 */
static __always_inline
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
        /* Must be sampled before lru_gen_add_folio() gets to modify the flags. */
        enum lru_list lru = folio_lru_list(folio);

        if (lru_gen_add_folio(lruvec, folio, false))
                return;

        update_lru_size(lruvec, lru, folio_zonenum(folio),
                        folio_nr_pages(folio));
        if (lru == LRU_UNEVICTABLE)
                return;

        list_add(&folio->lru, &lruvec->lists[lru]);
}

/*
 * Add @folio to the tail of its LRU list in @lruvec.  If lru_gen_add_folio()
 * takes the folio (with reclaiming set), nothing else is done; otherwise the
 * list size is updated and the folio is linked at the tail.
 */
static __always_inline
void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
        /* Must be sampled before lru_gen_add_folio() gets to modify the flags. */
        enum lru_list lru = folio_lru_list(folio);

        if (!lru_gen_add_folio(lruvec, folio, true)) {
                update_lru_size(lruvec, lru, folio_zonenum(folio),
                                folio_nr_pages(folio));
                /* This is not expected to be used on LRU_UNEVICTABLE */
                list_add_tail(&folio->lru, &lruvec->lists[lru]);
        }
}

/*
 * Remove @folio from its LRU list in @lruvec.  If lru_gen_del_folio()
 * handles the folio, nothing else is done; otherwise the folio is unlinked
 * (unevictable folios are not on a list) and the list size is reduced.
 */
static __always_inline
void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
        /* Must be sampled before lru_gen_del_folio() gets to modify the flags. */
        enum lru_list lru = folio_lru_list(folio);

        if (!lru_gen_del_folio(lruvec, folio, false)) {
                if (lru != LRU_UNEVICTABLE)
                        list_del(&folio->lru);
                update_lru_size(lruvec, lru, folio_zonenum(folio),
                                -folio_nr_pages(folio));
        }
}

#ifdef CONFIG_ANON_VMA_NAME
/* mmap_lock should be read-locked */
/* Take a reference on @anon_name; a NULL name is ignored. */
static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
{
        if (anon_name)
                kref_get(&anon_name->kref);
}

/*
 * Drop a reference on @anon_name; a NULL name is ignored.  anon_vma_name_free
 * is passed to kref_put() as the release callback.
 */
static inline void anon_vma_name_put(struct anon_vma_name *anon_name)
{
        if (anon_name)
                kref_put(&anon_name->kref, anon_vma_name_free);
}

/*
 * Return a name equivalent to @anon_name for a new user: normally the same
 * object with an extra reference, or a freshly allocated copy of the string
 * once the reference count has reached REFCOUNT_MAX.
 */
static inline
struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
{
        /* Prevent anon_name refcount saturation early on */
        if (kref_read(&anon_name->kref) >= REFCOUNT_MAX)
                return anon_vma_name_alloc(anon_name->name);

        anon_vma_name_get(anon_name);
        return anon_name;
}

/*
 * Give @new_vma the anonymous name of @orig_vma, if it has one.  When
 * @orig_vma has no name, @new_vma->anon_name is left untouched.
 */
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
                                     struct vm_area_struct *new_vma)
{
        struct anon_vma_name *src_name = anon_vma_name(orig_vma);

        if (!src_name)
                return;

        new_vma->anon_name = anon_vma_name_reuse(src_name);
}

/* Drop the reference @vma holds on its anonymous name, if any. */
static inline void free_anon_vma_name(struct vm_area_struct *vma)
{
        /*
         * Not using anon_vma_name because it generates a warning if mmap_lock
         * is not held, which might be the case here.
         */
        anon_vma_name_put(vma->anon_name);
}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
                                    struct anon_vma_name *anon_name2)
{
        if (anon_name1 == anon_name2)
                return true;

        return anon_name1 && anon_name2 &&
                !strcmp(anon_name1->name, anon_name2->name);
}

#else /* CONFIG_ANON_VMA_NAME */
/* !CONFIG_ANON_VMA_NAME stubs: the reference and copy helpers do nothing. */
static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
                                     struct vm_area_struct *new_vma) {}
static inline void free_anon_vma_name(struct vm_area_struct *vma) {}

/* !CONFIG_ANON_VMA_NAME stub: any two names compare equal. */
static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
                                    struct anon_vma_name *anon_name2)
{
        return true;
}

#endif  /* CONFIG_ANON_VMA_NAME */

/* Reset the pending-TLB-flush counter of @mm to zero. */
static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
        atomic_set(&mm->tlb_flush_pending, 0);
}

/* Increment the pending-TLB-flush counter of @mm. */
static inline void inc_tlb_flush_pending(struct mm_struct *mm)
{
        atomic_inc(&mm->tlb_flush_pending);
        /*
         * The only time this value is relevant is when there are indeed pages
         * to flush. And we'll only flush pages after changing them, which
         * requires the PTL.
         *
         * So the ordering here is:
         *
         *        atomic_inc(&mm->tlb_flush_pending);
         *        spin_lock(&ptl);
         *        ...
         *        set_pte_at();
         *        spin_unlock(&ptl);
         *
         *                                spin_lock(&ptl)
         *                                mm_tlb_flush_pending();
         *                                ....
         *                                spin_unlock(&ptl);
         *
         *        flush_tlb_range();
         *        atomic_dec(&mm->tlb_flush_pending);
         *
         * Where the increment is constrained by the PTL unlock, it thus
         * ensures that the increment is visible if the PTE modification is
         * visible. After all, if there is no PTE modification, nobody cares
         * about TLB flushes either.
         *
         * This very much relies on users (mm_tlb_flush_pending() and
         * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
         * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
         * locks (PPC) the unlock of one doesn't order against the lock of
         * another PTL.
         *
         * The decrement is ordered by the flush_tlb_range(), such that
         * mm_tlb_flush_pending() will not return false unless all flushes have
         * completed.
         */
}

/* Decrement the pending-TLB-flush counter of @mm. */
static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
        /*
         * See inc_tlb_flush_pending().
         *
         * This cannot be smp_mb__before_atomic() because smp_mb() simply does
         * not order against TLB invalidate completion, which is what we need.
         *
         * Therefore we must rely on tlb_flush_*() to guarantee order.
         */
        atomic_dec(&mm->tlb_flush_pending);
}

/* Tell whether the pending-TLB-flush counter of @mm is non-zero. */
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
        /*
         * Must be called after having acquired the PTL; orders against that
         * PTLs release and therefore ensures that if we observe the modified
         * PTE we must also observe the increment from inc_tlb_flush_pending().
         *
         * That is, it only guarantees to return true if there is a flush
         * pending for _this_ PTL.
         */
        return atomic_read(&mm->tlb_flush_pending) != 0;
}

/* Tell whether more than one TLB flush is pending on @mm (counter > 1). */
static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
{
        /*
         * Similar to mm_tlb_flush_pending(), we must have acquired the PTL
         * for which there is a TLB flush pending in order to guarantee
         * we've seen both that PTE modification and the increment.
         *
         * (no requirement on actually still holding the PTL, that is irrelevant)
         */
        return atomic_read(&mm->tlb_flush_pending) > 1;
}

#ifdef CONFIG_MMU
/*
 * Computes the pte marker to copy from the given source entry into dst_vma.
 * If no marker should be copied, returns 0.
 * The caller should insert a new pte created with make_pte_marker().
 */
static inline pte_marker copy_pte_marker(
                swp_entry_t entry, struct vm_area_struct *dst_vma)
{
        const pte_marker src_bits = pte_marker_get(entry);
        /* The poison bit is carried over unconditionally. */
        pte_marker result = src_bits & PTE_MARKER_POISONED;

        /*
         * The uffd-wp bit is carried over only when the source has it set
         * and userfaultfd_wp() reports true for the destination vma.
         */
        if (!(src_bits & PTE_MARKER_UFFD_WP))
                return result;
        if (!userfaultfd_wp(dst_vma))
                return result;

        return result | PTE_MARKER_UFFD_WP;
}
#endif

/*
 * If this pte is wr-protected by uffd-wp in any form, arm the special pte to
 * replace a none pte.  NOTE!  This should only be called when *pte is already
 * cleared so we will never accidentally replace something valuable.  Meanwhile
 * a none pte also means we are not demoting the pte, so a tlb flush is not
 * needed.  E.g., when the pte was cleared the caller should have taken care of
 * the tlb flush.
 *
 * Must be called with pgtable lock held so that no thread will see the none
 * pte, and if they see it, they'll fault and serialize at the pgtable lock.
 *
 * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled.
 */
static inline void
pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
                              pte_t *pte, pte_t pteval)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
        bool present_wp, swap_wp;

        /* The slot is expected to be none on entry; warn once otherwise. */
        WARN_ON_ONCE(!pte_none(ptep_get(pte)));

        /*
         * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole
         * thing, because when zapping either it means it's dropping the
         * page, or in TTU where the present pte will be quickly replaced
         * with a swap pte.  There's no way of leaking the bit.
         */
        if (vma_is_anonymous(vma))
                return;
        if (!userfaultfd_wp(vma))
                return;

        /* Case 1: the old value was a present pte carrying the uffd-wp bit. */
        present_wp = pte_present(pteval) && pte_uffd_wp(pteval);

        /*
         * Case 2: the old value was a uffd-wp wr-protected swap pte.  Note:
         * this should even cover an existing pte marker with uffd-wp bit set.
         */
        swap_wp = pte_swp_uffd_wp_any(pteval);

        /* Either case arms the uffd-wp marker in the emptied slot. */
        if (unlikely(present_wp || swap_wp))
                set_pte_at(vma->vm_mm, addr, pte,
                           make_pte_marker(PTE_MARKER_UFFD_WP));
#endif
}

/*
 * Return false when the vma has VM_SEQ_READ or VM_RAND_READ set, or when it
 * has a backing file whose f_mode carries FMODE_NOREUSE; true otherwise.
 */
static inline bool vma_has_recency(struct vm_area_struct *vma)
{
        bool access_hinted = vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ);

        if (access_hinted)
                return false;

        return !(vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE));
}

#endif











































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: common low-level thread information accessors
 *
 * Copyright (C) 2002  David Howells (dhowells@redhat.com)
 * - Incorporating suggestions made by Linus Torvalds
 */

#ifndef _LINUX_THREAD_INFO_H
#define _LINUX_THREAD_INFO_H

#include <linux/types.h>
#include <linux/limits.h>
#include <linux/bug.h>
#include <linux/restart_block.h>
#include <linux/errno.h>

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
 * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
 * including <asm/current.h> can cause a circular dependency on some platforms.
 */
#include <asm/current.h>
/*
 * current_thread_info() is `current` reinterpreted as a struct thread_info
 * pointer.
 * NOTE(review): presumably relies on thread_info being the first member of
 * task_struct in this configuration -- confirm.
 */
#define current_thread_info() ((struct thread_info *)current)
#endif

#include <linux/bitops.h>

/*
 * For per-arch arch_within_stack_frames() implementations, defined in
 * asm/thread_info.h.
 */
enum {
        /* Values are spelled out explicitly; BAD_STACK is the only negative one. */
        BAD_STACK  = -1,
        NOT_STACK  = 0,
        GOOD_FRAME = 1,
        GOOD_STACK = 2,
};

#ifdef CONFIG_GENERIC_ENTRY
/*
 * Bit numbers for syscall work items.  The enumerators are numbered
 * implicitly from 0 in declaration order.
 */
enum syscall_work_bit {
        SYSCALL_WORK_BIT_SECCOMP,
        SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT,
        SYSCALL_WORK_BIT_SYSCALL_TRACE,
        SYSCALL_WORK_BIT_SYSCALL_EMU,
        SYSCALL_WORK_BIT_SYSCALL_AUDIT,
        SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
        SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
};

/* Mask form of each bit number above, built with BIT(). */
#define SYSCALL_WORK_SECCOMP                BIT(SYSCALL_WORK_BIT_SECCOMP)
#define SYSCALL_WORK_SYSCALL_TRACEPOINT        BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
#define SYSCALL_WORK_SYSCALL_TRACE        BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
#define SYSCALL_WORK_SYSCALL_EMU        BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
#define SYSCALL_WORK_SYSCALL_AUDIT        BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
#define SYSCALL_WORK_SYSCALL_EXIT_TRAP        BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
#endif

#include <asm/thread_info.h>

#ifdef __KERNEL__

/*
 * Default arch_set_restart_data(): expands to an empty statement unless an
 * earlier header has already defined it.
 */
#ifndef arch_set_restart_data
#define arch_set_restart_data(restart) do { } while (0)
#endif

/*
 * Store @fn as the restart handler of @restart, invoke the
 * arch_set_restart_data() hook on it, and return -ERESTART_RESTARTBLOCK.
 */
static inline long set_restart_fn(struct restart_block *restart,
                                        long (*fn)(struct restart_block *))
{
        restart->fn = fn;
        /* The arch hook runs after ->fn has been written. */
        arch_set_restart_data(restart);
        return -ERESTART_RESTARTBLOCK;
}

/* THREAD_ALIGN defaults to THREAD_SIZE unless it has already been defined. */
#ifndef THREAD_ALIGN
#define THREAD_ALIGN        THREAD_SIZE
#endif

/* GFP mask: GFP_KERNEL_ACCOUNT combined with __GFP_ZERO. */
#define THREADINFO_GFP                (GFP_KERNEL_ACCOUNT | __GFP_ZERO)

/*
 * flag set/clear/test wrappers
 * - pass TIF_xxxx constants to these functions
 */

/* Set bit number @flag in @ti->flags via set_bit(). */
static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        set_bit(flag, word);
}

/* Clear bit number @flag in @ti->flags via clear_bit(). */
static inline void clear_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        clear_bit(flag, word);
}

/* Force bit @flag of @ti->flags to @value: set it when true, clear it when false. */
static inline void update_ti_thread_flag(struct thread_info *ti, int flag,
                                         bool value)
{
        if (!value) {
                clear_ti_thread_flag(ti, flag);
                return;
        }

        set_ti_thread_flag(ti, flag);
}

/* Apply test_and_set_bit() to bit @flag of @ti->flags and return its result. */
static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        return test_and_set_bit(flag, word);
}

/* Apply test_and_clear_bit() to bit @flag of @ti->flags and return its result. */
static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        return test_and_clear_bit(flag, word);
}

/* Return the result of test_bit() on bit @flag of @ti->flags. */
static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        return test_bit(flag, word);
}

/*
 * This may be used in noinstr code, and needs to be __always_inline to prevent
 * inadvertent instrumentation.
 */
static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti)
{
        /* Single READ_ONCE() snapshot of the whole flags word. */
        unsigned long snapshot = READ_ONCE(ti->flags);

        return snapshot;
}

/*
 * Convenience forms of the *_ti_thread_flag() helpers that operate on
 * current_thread_info().
 */
#define set_thread_flag(flag) \
        set_ti_thread_flag(current_thread_info(), flag)
#define clear_thread_flag(flag) \
        clear_ti_thread_flag(current_thread_info(), flag)
#define update_thread_flag(flag, value) \
        update_ti_thread_flag(current_thread_info(), flag, value)
#define test_and_set_thread_flag(flag) \
        test_and_set_ti_thread_flag(current_thread_info(), flag)
#define test_and_clear_thread_flag(flag) \
        test_and_clear_ti_thread_flag(current_thread_info(), flag)
#define test_thread_flag(flag) \
        test_ti_thread_flag(current_thread_info(), flag)
#define read_thread_flags() \
        read_ti_thread_flags(current_thread_info())

/* Same read, but on the thread_info obtained from task_thread_info(t). */
#define read_task_thread_flags(t) \
        read_ti_thread_flags(task_thread_info(t))

/*
 * syscall_work accessors.  @fl is a bare suffix that is token-pasted onto a
 * prefix: SYSCALL_WORK_BIT_ with CONFIG_GENERIC_ENTRY, TIF_ without it.
 */
#ifdef CONFIG_GENERIC_ENTRY
/* Generic entry: bit operations on the thread_info ->syscall_work word. */
#define set_syscall_work(fl) \
        set_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define test_syscall_work(fl) \
        test_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define clear_syscall_work(fl) \
        clear_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)

/* Same operations on the thread_info of task @t. */
#define set_task_syscall_work(t, fl) \
        set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#define test_task_syscall_work(t, fl) \
        test_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#define clear_task_syscall_work(t, fl) \
        clear_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)

#else /* CONFIG_GENERIC_ENTRY */

/* No generic entry: the same names map onto the TIF_* thread flag helpers. */
#define set_syscall_work(fl)                                                \
        set_ti_thread_flag(current_thread_info(), TIF_##fl)
#define test_syscall_work(fl) \
        test_ti_thread_flag(current_thread_info(), TIF_##fl)
#define clear_syscall_work(fl) \
        clear_ti_thread_flag(current_thread_info(), TIF_##fl)

#define set_task_syscall_work(t, fl) \
        set_ti_thread_flag(task_thread_info(t), TIF_##fl)
#define test_task_syscall_work(t, fl) \
        test_ti_thread_flag(task_thread_info(t), TIF_##fl)
#define clear_task_syscall_work(t, fl) \
        clear_ti_thread_flag(task_thread_info(t), TIF_##fl)
#endif /* !CONFIG_GENERIC_ENTRY */

#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H

/*
 * Test TIF_NEED_RESCHED in the current thread's flags.  This variant goes
 * through arch_test_bit() rather than test_bit().
 */
static __always_inline bool tif_need_resched(void)
{
        unsigned long *word = (unsigned long *)(&current_thread_info()->flags);

        return arch_test_bit(TIF_NEED_RESCHED, word);
}

#else

/* Test TIF_NEED_RESCHED in the current thread's flags using test_bit(). */
static __always_inline bool tif_need_resched(void)
{
        unsigned long *word = (unsigned long *)(&current_thread_info()->flags);

        return test_bit(TIF_NEED_RESCHED, word);
}

#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */

#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
/*
 * Fallback used when CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES is not set: none
 * of the arguments are inspected and the result is always 0.
 */
static inline int arch_within_stack_frames(const void * const stack,
                                           const void * const stackend,
                                           const void *obj, unsigned long len)
{
        const int verdict = 0;

        return verdict;
}
#endif

#ifdef CONFIG_HARDENED_USERCOPY
extern void __check_object_size(const void *ptr, unsigned long n,
                                        bool to_user);

/*
 * Forward to __check_object_size() unless the compiler can prove @n is a
 * compile-time constant, in which case nothing is done.
 */
static __always_inline void check_object_size(const void *ptr, unsigned long n,
                                              bool to_user)
{
        if (__builtin_constant_p(n))
                return;

        __check_object_size(ptr, n, to_user);
}
#else
/* Without CONFIG_HARDENED_USERCOPY the check is an empty stub. */
static inline void check_object_size(const void *ptr, unsigned long n,
                                     bool to_user)
{
}
#endif /* CONFIG_HARDENED_USERCOPY */

extern void __compiletime_error("copy source size is too small")
__bad_copy_from(void);
extern void __compiletime_error("copy destination size is too small")
__bad_copy_to(void);

void __copy_overflow(int size, unsigned long count);

/* Call __copy_overflow(@size, @count) only when CONFIG_BUG is enabled. */
static inline void copy_overflow(int size, unsigned long count)
{
        if (!IS_ENABLED(CONFIG_BUG))
                return;

        __copy_overflow(size, count);
}

/*
 * Validate a copy of @bytes bytes to or from @addr.
 *
 * Returns false when the compiler-known size of the object at @addr is
 * smaller than @bytes, or when @bytes exceeds INT_MAX (warning once).
 * Otherwise runs check_object_size() and returns true.
 *
 * @is_source selects which compile-time error stub is used and is passed on
 * as the third argument of check_object_size().
 */
static __always_inline __must_check bool
check_copy_size(const void *addr, size_t bytes, bool is_source)
{
        /* A negative sz disables the size comparison below. */
        int sz = __builtin_object_size(addr, 0);
        if (unlikely(sz >= 0 && sz < bytes)) {
                /* Length not a compile-time constant: report via copy_overflow(). */
                if (!__builtin_constant_p(bytes))
                        copy_overflow(sz, bytes);
                /* Constant length: call a stub declared with __compiletime_error(). */
                else if (is_source)
                        __bad_copy_from();
                else
                        __bad_copy_to();
                return false;
        }
        if (WARN_ON_ONCE(bytes > INT_MAX))
                return false;
        check_object_size(addr, bytes, is_source);
        return true;
}

#ifndef arch_setup_new_exec
/* Empty default, used when arch_setup_new_exec is not already defined as a macro. */
static inline void arch_setup_new_exec(void)
{
}
#endif

void arch_task_cache_init(void); /* for CONFIG_SH */
void arch_release_task_struct(struct task_struct *tsk);
int arch_dup_task_struct(struct task_struct *dst,
                                struct task_struct *src);

#endif        /* __KERNEL__ */

#endif /* _LINUX_THREAD_INFO_H */









































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MBCACHE_H
#define _LINUX_MBCACHE_H

#include <linux/hash.h>
#include <linux/list_bl.h>
#include <linux/list.h>
#include <linux/atomic.h>
#include <linux/fs.h>

struct mb_cache;

/* Cache entry flags */
enum {
        /* Values written out explicitly. */
        MBE_REFERENCED_B = 0,
        MBE_REUSABLE_B   = 1,
};

/*
 * One cache entry: a (key, value) pair linked into both a cache-wide list
 * and a hash chain, with a reference count and a flags word.
 */
struct mb_cache_entry {
        /* List of entries in cache - protected by cache->c_list_lock */
        struct list_head        e_list;
        /*
         * Hash table list - protected by hash chain bitlock. The entry is
         * guaranteed to be hashed while e_refcnt > 0.
         */
        struct hlist_bl_node        e_hash_list;
        /*
         * Entry refcount. Once it reaches zero, entry is unhashed and freed.
         * While refcount > 0, the entry is guaranteed to stay in the hash and
         * e.g. mb_cache_entry_try_delete() will fail.
         */
        atomic_t                e_refcnt;
        /* Key in hash - stable during lifetime of the entry */
        u32                        e_key;
        /*
         * Flags word.
         * NOTE(review): presumably indexed by the MBE_*_B bit numbers --
         * confirm against the users of e_flags.
         */
        unsigned long                e_flags;
        /* User provided value - stable during lifetime of the entry */
        u64                        e_value;
};

struct mb_cache *mb_cache_create(int bucket_bits);
void mb_cache_destroy(struct mb_cache *cache);

int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
                          u64 value, bool reusable);
void __mb_cache_entry_free(struct mb_cache *cache,
                           struct mb_cache_entry *entry);
void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);
/*
 * Drop one reference on @entry.
 *
 * When the count reaches zero the entry is handed to
 * __mb_cache_entry_free().  When one or two references remain, waiters on
 * &entry->e_refcnt are woken with wake_up_var().
 */
static inline void mb_cache_entry_put(struct mb_cache *cache,
                                      struct mb_cache_entry *entry)
{
        unsigned int remaining = atomic_dec_return(&entry->e_refcnt);

        if (remaining == 0) {
                __mb_cache_entry_free(cache, entry);
                return;
        }

        if (remaining <= 2)
                wake_up_var(&entry->e_refcnt);
}

struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
                                                    u32 key, u64 value);
struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
                                          u64 value);
struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
                                                 u32 key);
struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
                                                struct mb_cache_entry *entry);
void mb_cache_entry_touch(struct mb_cache *cache,
                          struct mb_cache_entry *entry);

#endif        /* _LINUX_MBCACHE_H */



























































    1 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM notifier

#if !defined(_TRACE_NOTIFIERS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NOTIFIERS_H

#include <linux/tracepoint.h>

/*
 * notifier_info - event class shared by the notifier events defined with
 * DEFINE_EVENT(notifier_info, ...) in this header
 *
 * Records a single pointer, @cb, and prints it with the "%ps" format.
 */
DECLARE_EVENT_CLASS(notifier_info,

        TP_PROTO(void *cb),

        TP_ARGS(cb),

        TP_STRUCT__entry(
                __field(void *, cb)
        ),

        TP_fast_assign(
                __entry->cb = cb;
        ),

        TP_printk("%ps", __entry->cb)
);

/*
 * notifier_register - called upon notifier callback registration
 *
 * Event of the notifier_info class.
 *
 * @cb:                callback pointer
 */
DEFINE_EVENT(notifier_info, notifier_register,

        TP_PROTO(void *cb),

        TP_ARGS(cb)
);

/*
 * notifier_unregister - called upon notifier callback unregistration
 *
 * Event of the notifier_info class.
 *
 * @cb:                callback pointer
 */
DEFINE_EVENT(notifier_info, notifier_unregister,

        TP_PROTO(void *cb),

        TP_ARGS(cb)
);

/*
 * notifier_run - called upon notifier callback execution
 *
 * Event of the notifier_info class.
 *
 * @cb:                callback pointer
 */
DEFINE_EVENT(notifier_info, notifier_run,

        TP_PROTO(void *cb),

        TP_ARGS(cb)
);

#endif /* _TRACE_NOTIFIERS_H */

/* This part must be outside protection */
#include <trace/define_trace.h>




































































































































































































































































    4 






    4 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGETLB_H
#define _LINUX_HUGETLB_H

#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/fs.h>
#include <linux/hugetlb_inline.h>
#include <linux/cgroup.h>
#include <linux/page_ref.h>
#include <linux/list.h>
#include <linux/kref.h>
#include <linux/pgtable.h>
#include <linux/gfp.h>
#include <linux/userfaultfd_k.h>

struct ctl_table;
struct user_struct;
struct mmu_gather;
struct node;

/*
 * Fallback definitions used when CONFIG_ARCH_HAS_HUGEPD is not set:
 * hugepd_t wraps a single unsigned long, is_hugepd() is always 0, and
 * __hugepd() builds a hugepd_t from a raw value.
 */
#ifndef CONFIG_ARCH_HAS_HUGEPD
typedef struct { unsigned long pd; } hugepd_t;
#define is_hugepd(hugepd) (0)
#define __hugepd(x) ((hugepd_t) { (x) })
#endif

void free_huge_folio(struct folio *folio);

#ifdef CONFIG_HUGETLB_PAGE

#include <linux/pagemap.h>
#include <linux/shm.h>
#include <asm/tlbflush.h>

/*
 * For a HugeTLB page, there is more metadata to save in the struct page. But
 * the head struct page cannot meet our needs, so we have to abuse other tail
 * struct pages to store the metadata.
 */
#define __NR_USED_SUBPAGE 3

/*
 * Huge page accounting for one subpool: maximum/minimum limits, current
 * usage and reserved pages, tied to a single hstate.
 */
struct hugepage_subpool {
        /* NOTE(review): presumably protects the counters below -- confirm. */
        spinlock_t lock;
        /* NOTE(review): looks like a reference/user count -- confirm. */
        long count;
        long max_hpages;        /* Maximum huge pages or -1 if no maximum. */
        long used_hpages;        /* Used count against maximum, includes */
                                /* both allocated and reserved pages. */
        struct hstate *hstate;        /* hstate this subpool is associated with. */
        long min_hpages;        /* Minimum huge pages or -1 if no minimum. */
        long rsv_hpages;        /* Pages reserved against global pool to */
                                /* satisfy minimum size. */
};

/*
 * Reservation map: a kref-counted container holding a list of regions plus
 * a cache of spare region entries.
 */
struct resv_map {
        struct kref refs;
        spinlock_t lock;
        /* List of regions in this map. */
        struct list_head regions;
        /* NOTE(review): presumably counts in-flight region additions -- confirm. */
        long adds_in_progress;
        /* Cached region entries and how many of them there are. */
        struct list_head region_cache;
        long region_cache_count;
        struct rw_semaphore rw_sema;
#ifdef CONFIG_CGROUP_HUGETLB
        /*
         * On private mappings, the counter to uncharge reservations is stored
         * here. If these fields are 0, then either the mapping is shared, or
         * cgroup accounting is disabled for this resv_map.
         */
        struct page_counter *reservation_counter;
        unsigned long pages_per_hpage;
        struct cgroup_subsys_state *css;
#endif
};

/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 *                    across the pages in a mapping.
 *
 * The region data structures are embedded into a resv_map and protected
 * by a resv_map's lock.  The set of regions within the resv_map represent
 * reservations for huge pages, or huge pages that have already been
 * instantiated within the map.  The from and to elements are huge page
 * indices into the associated mapping.  from indicates the starting index
 * of the region.  to represents the first index past the end of the region.
 *
 * For example, a file region structure with from == 0 and to == 4 represents
 * four huge pages in a mapping.  It is important to note that the to element
 * represents the first element past the end of the region. This is used in
 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
 *
 * Interval notation of the form [from, to) will be used to indicate that
 * the endpoint from is inclusive and to is exclusive.
 */
/* One [from, to) region of huge page indices, linked into a list. */
struct file_region {
        struct list_head link;
        long from;        /* First index of the region (inclusive). */
        long to;        /* First index past the end of the region (exclusive). */
#ifdef CONFIG_CGROUP_HUGETLB
        /*
         * On shared mappings, each reserved region appears as a struct
         * file_region in resv_map. These fields hold the info needed to
         * uncharge each reservation.
         */
        struct page_counter *reservation_counter;
        struct cgroup_subsys_state *css;
#endif
};

/* A kref-counted rw-semaphore together with a pointer to a vma. */
struct hugetlb_vma_lock {
        struct kref refs;
        struct rw_semaphore rw_sema;
        /* NOTE(review): presumably the vma this lock belongs to -- confirm. */
        struct vm_area_struct *vma;
};

extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);

extern spinlock_t hugetlb_lock;
extern int hugetlb_max_hstate __read_mostly;
/*
 * Iterate @h over the hstates[] array, from its first element up to (but
 * not including) index hugetlb_max_hstate.
 */
#define for_each_hstate(h) \
        for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
                                                long min_hpages);
void hugepage_put_subpool(struct hugepage_subpool *spool);

void hugetlb_dup_vma_private(struct vm_area_struct *vma);
void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
int move_hugetlb_page_tables(struct vm_area_struct *vma,
                             struct vm_area_struct *new_vma,
                             unsigned long old_addr, unsigned long new_addr,
                             unsigned long len);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
                            struct vm_area_struct *, struct vm_area_struct *);
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
                                      unsigned long address, unsigned int flags,
                                      unsigned int *page_mask);
void unmap_hugepage_range(struct vm_area_struct *,
                          unsigned long, unsigned long, struct page *,
                          zap_flags_t);
void __unmap_hugepage_range(struct mmu_gather *tlb,
                          struct vm_area_struct *vma,
                          unsigned long start, unsigned long end,
                          struct page *ref_page, zap_flags_t zap_flags);
void hugetlb_report_meminfo(struct seq_file *);
int hugetlb_report_node_meminfo(char *buf, int len, int nid);
void hugetlb_show_meminfo_node(int nid);
unsigned long hugetlb_total_pages(void);
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
#ifdef CONFIG_USERFAULTFD
int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
                             struct vm_area_struct *dst_vma,
                             unsigned long dst_addr,
                             unsigned long src_addr,
                             uffd_flags_t flags,
                             struct folio **foliop);
#endif /* CONFIG_USERFAULTFD */
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
                                                struct vm_area_struct *vma,
                                                vm_flags_t vm_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                                                long freed);
bool isolate_hugetlb(struct folio *folio, struct list_head *list);
int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                bool *migratable_cleared);
void folio_putback_active_hugetlb(struct folio *folio);
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);

pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, pud_t *pud);
bool hugetlbfs_pagecache_present(struct hstate *h,
                                 struct vm_area_struct *vma,
                                 unsigned long address);

struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio);

extern int sysctl_hugetlb_shm_group;
extern struct list_head huge_boot_pages[MAX_NUMNODES];

/* arch callbacks */

#ifndef CONFIG_HIGHPTE
/*
 * pte_offset_huge() and pte_alloc_huge() are helpers for those architectures
 * which may go down to the lowest PTE level in their huge_pte_offset() and
 * huge_pte_alloc(): to avoid reliance on pte_offset_map() without pte_unmap().
 */
/* Thin wrapper: returns whatever pte_offset_kernel(@pmd, @address) yields. */
static inline pte_t *pte_offset_huge(pmd_t *pmd, unsigned long address)
{
        pte_t *ptep = pte_offset_kernel(pmd, address);

        return ptep;
}
/*
 * Run pte_alloc() on @pmd; return NULL when it reports non-zero, otherwise
 * the entry found by pte_offset_huge() for @address.
 */
static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd,
                                    unsigned long address)
{
        if (pte_alloc(mm, pmd))
                return NULL;

        return pte_offset_huge(pmd, address);
}
#endif

/* Listed under the "arch callbacks" heading above; only the prototype lives here. */
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz);
/*
 * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE.
 * Returns the pte_t* if found, or NULL if the address is not mapped.
 *
 * IMPORTANT: we should normally not directly call this function, instead
 * this is only a common interface to implement arch-specific
 * walker. Please use hugetlb_walk() instead, because that will attempt to
 * verify the locking for you.
 *
 * Since this function will walk all the pgtable pages (including not only
 * high-level pgtable page, but also PUD entry that can be unshared
 * concurrently for VM_SHARED), the caller of this function is
 * responsible for its thread safety.  One can follow these rules:
 *
 *  (1) For private mappings: pmd unsharing is not possible, so holding the
 *      mmap_lock for either read or write is sufficient. Most callers
 *      already hold the mmap_lock, so normally, no special action is
 *      required.
 *
 *  (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged
 *      pgtable page can go away from under us!  It can be done by a pmd
 *      unshare with a follow up munmap() on the other process), then we
 *      need either:
 *
 *     (2.1) hugetlb vma lock read or write held, to make sure pmd unshare
 *           won't happen upon the range (it also makes sure the pte_t we
 *           read is the right and stable one), or,
 *
 *     (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make
 *           sure even if unshare happened the racy unmap() will wait until
 *           i_mmap_rwsem is released.
 *
 * Option (2.1) is the safest, which guarantees pte stability from pmd
 * sharing pov, until the vma lock is released.  Option (2.2) doesn't protect
 * a concurrent pmd unshare, but it makes sure the pgtable page is safe to
 * access.
 */
pte_t *huge_pte_offset(struct mm_struct *mm,
                       unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
/*
 * The next two have inline stubs in the !CONFIG_HUGETLB_PAGE branch:
 * huge_pmd_unshare() returns 0 there and
 * adjust_range_if_pmd_sharing_possible() leaves *start and *end untouched.
 */
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long addr, pte_t *ptep);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end);

/*
 * Out-of-line workers; the inline hugetlb_zap_begin()/hugetlb_zap_end()
 * wrappers below call them only for hugetlb VMAs.
 */
extern void __hugetlb_zap_begin(struct vm_area_struct *vma,
                                unsigned long *begin, unsigned long *end);
extern void __hugetlb_zap_end(struct vm_area_struct *vma,
                              struct zap_details *details);

/*
 * Calls __hugetlb_zap_begin() when @vma is a hugetlb VMA and does nothing
 * for any other VMA.  @start and @end are passed through by pointer.
 */
static inline void hugetlb_zap_begin(struct vm_area_struct *vma,
                                     unsigned long *start, unsigned long *end)
{
        if (!is_vm_hugetlb_page(vma))
                return;

        __hugetlb_zap_begin(vma, start, end);
}

/*
 * Calls __hugetlb_zap_end() when @vma is a hugetlb VMA and does nothing
 * for any other VMA.
 */
static inline void hugetlb_zap_end(struct vm_area_struct *vma,
                                   struct zap_details *details)
{
        if (!is_vm_hugetlb_page(vma))
                return;

        __hugetlb_zap_end(vma, details);
}

/*
 * Prototypes for the hugetlb_vma_* locking calls and a few related helpers.
 * In the !CONFIG_HUGETLB_PAGE branch below the lock/unlock/assert calls are
 * empty inlines, hugetlb_vma_trylock_write() returns 1 and
 * hugetlb_change_protection() returns 0.
 */
void hugetlb_vma_lock_read(struct vm_area_struct *vma);
void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
void hugetlb_vma_lock_write(struct vm_area_struct *vma);
void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
void hugetlb_vma_lock_release(struct kref *kref);
long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot,
                unsigned long cp_flags);
bool is_hugetlb_entry_migration(pte_t pte);
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);

#else /* !CONFIG_HUGETLB_PAGE */

/* !CONFIG_HUGETLB_PAGE stub: intentionally empty. */
static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: intentionally empty. */
static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: always reports 0 pages. */
static inline unsigned long hugetlb_total_pages(void)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: takes no lock and always returns NULL. */
static inline struct address_space *hugetlb_folio_mapping_lock_write(
                                                        struct folio *folio)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: does nothing and always returns 0. */
static inline int huge_pmd_unshare(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long addr, pte_t *ptep)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: leaves *start and *end unmodified. */
static inline void adjust_range_if_pmd_sharing_possible(
                                struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
{
}

/* !CONFIG_HUGETLB_PAGE stub: intentionally empty; *start and *end are untouched. */
static inline void hugetlb_zap_begin(
                                struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
{
}

/* !CONFIG_HUGETLB_PAGE stub: intentionally empty. */
static inline void hugetlb_zap_end(
                                struct vm_area_struct *vma,
                                struct zap_details *details)
{
}

/*
 * !CONFIG_HUGETLB_PAGE stub: calls BUG() if ever reached.  The trailing
 * "return 0" is presumably there only to satisfy the int return type.
 */
static inline int copy_hugetlb_page_range(struct mm_struct *dst,
                                          struct mm_struct *src,
                                          struct vm_area_struct *dst_vma,
                                          struct vm_area_struct *src_vma)
{
        BUG();
        return 0;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: calls BUG() if ever reached.  The trailing
 * "return 0" is presumably there only to satisfy the int return type.
 */
static inline int move_hugetlb_page_tables(struct vm_area_struct *vma,
                                           struct vm_area_struct *new_vma,
                                           unsigned long old_addr,
                                           unsigned long new_addr,
                                           unsigned long len)
{
        BUG();
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: writes nothing to @m. */
static inline void hugetlb_report_meminfo(struct seq_file *m)
{
}

/* !CONFIG_HUGETLB_PAGE stub: leaves @buf untouched and returns 0. */
static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: intentionally empty. */
static inline void hugetlb_show_meminfo_node(int nid)
{
}

/* !CONFIG_HUGETLB_PAGE stub: every range is rejected with -EINVAL. */
static inline int prepare_hugepage_range(struct file *file,
                                unsigned long addr, unsigned long len)
{
        return -EINVAL;
}

/* !CONFIG_HUGETLB_PAGE stub: no lock is taken. */
static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: nothing to release. */
static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no lock is taken. */
static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: nothing to release. */
static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: unconditionally returns 1 without locking anything. */
static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
{
        return 1;
}

/* !CONFIG_HUGETLB_PAGE stub: performs no check. */
static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: always answers 0 for any range. */
static inline int is_hugepage_only_range(struct mm_struct *mm,
                                        unsigned long addr, unsigned long len)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: calls BUG() if ever reached. */
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        BUG();
}

#ifdef CONFIG_USERFAULTFD
/*
 * !CONFIG_HUGETLB_PAGE stub, only compiled with CONFIG_USERFAULTFD: calls
 * BUG() if ever reached.  The trailing "return 0" is presumably there only
 * to satisfy the int return type.
 */
static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
                                           struct vm_area_struct *dst_vma,
                                           unsigned long dst_addr,
                                           unsigned long src_addr,
                                           uffd_flags_t flags,
                                           struct folio **foliop)
{
        BUG();
        return 0;
}
#endif /* CONFIG_USERFAULTFD */

/* !CONFIG_HUGETLB_PAGE stub: no page table walk; always returns NULL. */
static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
                                        unsigned long sz)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: @list is untouched; always returns false. */
static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list)
{
        return false;
}

/* !CONFIG_HUGETLB_PAGE stub: *hugetlb is not written; always returns 0. */
static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: *migratable_cleared is not written; always returns 0. */
static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: intentionally empty. */
static inline void folio_putback_active_hugetlb(struct folio *folio)
{
}

/* !CONFIG_HUGETLB_PAGE stub: neither folio is touched. */
static inline void move_hugetlb_state(struct folio *old_folio,
                                        struct folio *new_folio, int reason)
{
}

/* !CONFIG_HUGETLB_PAGE stub: changes nothing and returns 0. */
static inline long hugetlb_change_protection(
                        struct vm_area_struct *vma, unsigned long address,
                        unsigned long end, pgprot_t newprot,
                        unsigned long cp_flags)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: calls BUG() if ever reached. */
static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
                        struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, struct page *ref_page,
                        zap_flags_t zap_flags)
{
        BUG();
}

/*
 * !CONFIG_HUGETLB_PAGE stub: calls BUG() if ever reached.  The trailing
 * "return 0" is presumably there only to satisfy the return type.
 */
static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long address,
                        unsigned int flags)
{
        BUG();
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: intentionally empty. */
static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }

#endif /* !CONFIG_HUGETLB_PAGE */

#ifndef pgd_write
/*
 * Fallback used only when no pgd_write macro was defined earlier: calls
 * BUG() if ever reached.  The trailing "return 0" is presumably there only
 * to satisfy the int return type.
 */
static inline int pgd_write(pgd_t pgd)
{
        BUG();
        return 0;
}
#endif

#define HUGETLB_ANON_FILE "anon_hugepage"

/*
 * Two distinct inode kinds that differ in whether shmfs accounting rules
 * apply.  NOTE(review): presumably passed as creat_flags to
 * hugetlb_file_setup() - confirm against the callers.
 */
enum {
        /*
         * The file will be used as an shm file so shmfs accounting rules
         * apply
         */
        HUGETLB_SHMFS_INODE     = 1,
        /*
         * The file is being created on the internal vfs mount and shmfs
         * accounting rules do not apply
         */
        HUGETLB_ANONHUGE_INODE  = 2,
};

#ifdef CONFIG_HUGETLBFS
/*
 * Per-superblock hugetlbfs data.  HUGETLBFS_SB() retrieves it from
 * sb->s_fs_info, and hstate_inode() reads ->hstate through it.
 */
struct hugetlbfs_sb_info {
        long        max_inodes;   /* inodes allowed */
        long        free_inodes;  /* inodes free */
        spinlock_t        stat_lock;        /* NOTE(review): presumably guards the inode counters - confirm */
        struct hstate *hstate;        /* returned by hstate_inode() for inodes on this sb */
        struct hugepage_subpool *spool;
        kuid_t        uid;
        kgid_t        gid;
        umode_t mode;
};

/* Returns the hugetlbfs private data stored in @sb->s_fs_info. */
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
{
        return sb->s_fs_info;
}

/*
 * hugetlbfs inode wrapper: the VFS inode is embedded so that HUGETLBFS_I()
 * can recover this structure with container_of().
 */
struct hugetlbfs_inode_info {
        struct inode vfs_inode;
        unsigned int seals;
};

/*
 * Maps a VFS inode back to the hugetlbfs_inode_info that embeds it as
 * ->vfs_inode.  Only meaningful for inodes that really are embedded that way.
 */
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
        return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}

/*
 * CONFIG_HUGETLBFS declarations.  Without CONFIG_HUGETLBFS,
 * hugetlb_file_setup() is an inline that returns ERR_PTR(-ENOSYS).
 */
extern const struct vm_operations_struct hugetlb_vm_ops;
struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
                                int creat_flags, int page_size_log);

/*
 * True when the file's file_operations carry FOP_HUGE_PAGES in fop_flags.
 * Dereferences file->f_op unconditionally, so @file and its f_op must be
 * valid.
 */
static inline bool is_file_hugepages(const struct file *file)
{
        return file->f_op->fop_flags & FOP_HUGE_PAGES;
}

static inline struct hstate *hstate_inode(struct inode *i)
{
        return HUGETLBFS_SB(i->i_sb)->hstate;
}
#else /* !CONFIG_HUGETLBFS */

/* !CONFIG_HUGETLBFS: constant false; the argument is never evaluated. */
#define is_file_hugepages(file)                        false
/* !CONFIG_HUGETLBFS stub: no file is created; always returns ERR_PTR(-ENOSYS). */
static inline struct file *
hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
                int creat_flags, int page_size_log)
{
        return ERR_PTR(-ENOSYS);
}

/* !CONFIG_HUGETLBFS stub: there is no hstate to report; always returns NULL. */
static inline struct hstate *hstate_inode(struct inode *i)
{
        return NULL;
}
#endif /* !CONFIG_HUGETLBFS */

/*
 * hugetlb_get_unmapped_area() is only declared when
 * HAVE_ARCH_HUGETLB_UNMAPPED_AREA is defined; the generic_ variant below is
 * declared unconditionally and takes the same parameters.
 */
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags);
#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */

unsigned long
generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                  unsigned long len, unsigned long pgoff,
                                  unsigned long flags);

/*
 * hugetlb page specific state flags.  These flags are located in page.private
 * of the hugetlb head page.  Functions created via the below macros should be
 * used to manipulate these flags.
 *
 * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at
 *        allocation time.  Cleared when page is fully instantiated.  Free
 *        routine checks flag to restore a reservation on error paths.
 *        Synchronization:  Examined or modified by code that knows it has
 *        the only reference to page.  i.e. After allocation but before use
 *        or when the page is being freed.
 * HPG_migratable  - Set after a newly allocated page is added to the page
 *        cache and/or page tables.  Indicates the page is a candidate for
 *        migration.
 *        Synchronization:  Initially set after new page allocation with no
 *        locking.  When examined and modified during migration processing
 *        (isolate, migrate, putback) the hugetlb_lock is held.
 * HPG_temporary - Set on a page that is temporarily allocated from the buddy
 *        allocator.  Typically used for migration target pages when no pages
 *        are available in the pool.  The hugetlb free page path will
 *        immediately free pages with this flag set to the buddy allocator.
 *        Synchronization: Can be set after huge page allocation from buddy when
 *        code knows it has only reference.  All other examinations and
 *        modifications require hugetlb_lock.
 * HPG_freed - Set when page is on the free lists.
 *        Synchronization: hugetlb_lock held for examination and modification.
 * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
 * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page
 *     that is not tracked by raw_hwp_page list.
 */
/*
 * Each enumerator is a bit number; the helpers generated by HPAGEFLAG() pass
 * it to test_bit()/set_bit()/clear_bit().
 */
enum hugetlb_page_flags {
        HPG_restore_reserve = 0,
        HPG_migratable,
        HPG_temporary,
        HPG_freed,
        HPG_vmemmap_optimized,
        HPG_raw_hwp_unreliable,
        __NR_HPAGEFLAGS,        /* count of the flags above; must stay last */
};

/*
 * Macros to create test, set and clear function definitions for
 * hugetlb specific page flags.
 */
#ifdef CONFIG_HUGETLB_PAGE
/*
 * CONFIG_HUGETLB_PAGE variant.  Emits folio_test_hugetlb_<flname>() and
 * HPage<uname>(); both test_bit() the HPG_<flname> bit in the ->private word
 * of the folio or page.
 */
#define TESTHPAGEFLAG(uname, flname)                                \
static __always_inline                                                \
bool folio_test_hugetlb_##flname(struct folio *folio)                \
        {        void *private = &folio->private;                \
                return test_bit(HPG_##flname, private);                \
        }                                                        \
static inline int HPage##uname(struct page *page)                \
        { return test_bit(HPG_##flname, &(page->private)); }

/*
 * CONFIG_HUGETLB_PAGE variant.  Emits folio_set_hugetlb_<flname>() and
 * SetHPage<uname>(); both set_bit() the HPG_<flname> bit in the ->private
 * word of the folio or page.
 */
#define SETHPAGEFLAG(uname, flname)                                \
static __always_inline                                                \
void folio_set_hugetlb_##flname(struct folio *folio)                \
        {        void *private = &folio->private;                \
                set_bit(HPG_##flname, private);                        \
        }                                                        \
static inline void SetHPage##uname(struct page *page)                \
        { set_bit(HPG_##flname, &(page->private)); }

/*
 * CONFIG_HUGETLB_PAGE variant.  Emits folio_clear_hugetlb_<flname>() and
 * ClearHPage<uname>(); both clear_bit() the HPG_<flname> bit in the
 * ->private word of the folio or page.
 */
#define CLEARHPAGEFLAG(uname, flname)                                \
static __always_inline                                                \
void folio_clear_hugetlb_##flname(struct folio *folio)                \
        {        void *private = &folio->private;                \
                clear_bit(HPG_##flname, private);                \
        }                                                        \
static inline void ClearHPage##uname(struct page *page)                \
        { clear_bit(HPG_##flname, &(page->private)); }
#else
/* !CONFIG_HUGETLB_PAGE variant: both generated test helpers return 0. */
#define TESTHPAGEFLAG(uname, flname)                                \
static inline bool                                                \
folio_test_hugetlb_##flname(struct folio *folio)                \
        { return 0; }                                                \
static inline int HPage##uname(struct page *page)                \
        { return 0; }

/* !CONFIG_HUGETLB_PAGE variant: both generated set helpers are empty. */
#define SETHPAGEFLAG(uname, flname)                                \
static inline void                                                \
folio_set_hugetlb_##flname(struct folio *folio)                 \
        { }                                                        \
static inline void SetHPage##uname(struct page *page)                \
        { }

/* !CONFIG_HUGETLB_PAGE variant: both generated clear helpers are empty. */
#define CLEARHPAGEFLAG(uname, flname)                                \
static inline void                                                \
folio_clear_hugetlb_##flname(struct folio *folio)                \
        { }                                                        \
static inline void ClearHPage##uname(struct page *page)                \
        { }
#endif

/*
 * One invocation emits the test, set and clear helpers for a flag.  The last
 * line ends in a line continuation, so the blank line that follows the macro
 * is part of the definition and must stay blank.
 */
#define HPAGEFLAG(uname, flname)                                \
        TESTHPAGEFLAG(uname, flname)                                \
        SETHPAGEFLAG(uname, flname)                                \
        CLEARHPAGEFLAG(uname, flname)                                \

/*
 * Create functions associated with hugetlb page flags
 */
/*
 * Each line expands to six helpers, e.g. for (Migratable, migratable):
 * folio_{test,set,clear}_hugetlb_migratable() plus HPageMigratable(),
 * SetHPageMigratable() and ClearHPageMigratable().
 */
HPAGEFLAG(RestoreReserve, restore_reserve)
HPAGEFLAG(Migratable, migratable)
HPAGEFLAG(Temporary, temporary)
HPAGEFLAG(Freed, freed)
HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)

#ifdef CONFIG_HUGETLB_PAGE

#define HSTATE_NAME_LEN 32
/* Defines one hugetlb page size */
struct hstate {
        struct mutex resize_lock;
        int next_nid_to_alloc;
        int next_nid_to_free;
        /* Page size is PAGE_SIZE << order; see huge_page_size(). */
        unsigned int order;
        unsigned int demote_order;
        /* Returned verbatim by huge_page_mask(). */
        unsigned long mask;
        unsigned long max_huge_pages;
        unsigned long nr_huge_pages;
        unsigned long free_huge_pages;
        unsigned long resv_huge_pages;
        unsigned long surplus_huge_pages;
        unsigned long nr_overcommit_huge_pages;
        struct list_head hugepage_activelist;
        /*
         * The arrays below are sized MAX_NUMNODES.
         * NOTE(review): presumably one slot per NUMA node id - confirm.
         */
        struct list_head hugepage_freelists[MAX_NUMNODES];
        unsigned int max_huge_pages_node[MAX_NUMNODES];
        unsigned int nr_huge_pages_node[MAX_NUMNODES];
        unsigned int free_huge_pages_node[MAX_NUMNODES];
        unsigned int surplus_huge_pages_node[MAX_NUMNODES];
#ifdef CONFIG_CGROUP_HUGETLB
        /* cgroup control files */
        struct cftype cgroup_files_dfl[8];
        struct cftype cgroup_files_legacy[10];
#endif
        /* Fixed-size buffer of HSTATE_NAME_LEN bytes. */
        char name[HSTATE_NAME_LEN];
};

/*
 * List node that pairs a list linkage with the hstate it belongs to.
 * NOTE(review): presumably queued on huge_boot_pages[] during boot - confirm.
 */
struct huge_bootmem_page {
        struct list_head list;
        struct hstate *hstate;
};

/*
 * Prototypes only.  In the !CONFIG_HUGETLB_PAGE branch,
 * isolate_or_dissolve_huge_page() is an inline returning -ENOMEM and the two
 * alloc_hugetlb_folio*() functions are inlines returning NULL.
 */
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                unsigned long addr, int avoid_reserve);
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
                                nodemask_t *nmask, gfp_t gfp_mask,
                                bool allow_alloc_fallback);
int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
                        pgoff_t idx);
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
                                unsigned long address, struct folio *folio);

/* arch callback */
/* All but size_to_hstate() are marked __init. */
int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
int __init alloc_bootmem_huge_page(struct hstate *h, int nid);
bool __init hugetlb_node_alloc_supported(void);

void __init hugetlb_add_hstate(unsigned order);
bool __init arch_hugetlb_valid_size(unsigned long size);
/* Has a NULL-returning inline stub in the !CONFIG_HUGETLB_PAGE branch. */
struct hstate *size_to_hstate(unsigned long size);

/* If HUGE_MAX_HSTATE was not defined earlier, fall back to a single hstate. */
#ifndef HUGE_MAX_HSTATE
#define HUGE_MAX_HSTATE 1
#endif

extern struct hstate hstates[HUGE_MAX_HSTATE];
extern unsigned int default_hstate_idx;

/* The default hstate is the hstates[] slot selected by default_hstate_idx. */
#define default_hstate (hstates[default_hstate_idx])

/* Reads the folio's _hugetlb_subpool field. */
static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio)
{
        return folio->_hugetlb_subpool;
}

/* Stores @subpool in the folio's _hugetlb_subpool field. */
static inline void hugetlb_set_folio_subpool(struct folio *folio,
                                        struct hugepage_subpool *subpool)
{
        folio->_hugetlb_subpool = subpool;
}

/* Resolves the hstate of @f through its inode (file_inode() + hstate_inode()). */
static inline struct hstate *hstate_file(struct file *f)
{
        struct inode *inode = file_inode(f);

        return hstate_inode(inode);
}

/*
 * Maps a log2 page size to an hstate:
 *   0                       -> the default hstate
 *   below BITS_PER_LONG     -> size_to_hstate(1UL << page_size_log)
 *   BITS_PER_LONG or above  -> NULL
 */
static inline struct hstate *hstate_sizelog(int page_size_log)
{
        if (page_size_log == 0)
                return &default_hstate;

        /* The shift below would not fit in an unsigned long. */
        if (page_size_log >= BITS_PER_LONG)
                return NULL;

        return size_to_hstate(1UL << page_size_log);
}

/*
 * Resolves the hstate of @vma through its backing file.  vma->vm_file is
 * passed straight to hstate_file(), so it has to be a valid file.
 */
static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
{
        return hstate_file(vma->vm_file);
}

/*
 * Size in bytes of one huge page of @h: PAGE_SIZE << order.  PAGE_SIZE is
 * cast to unsigned long first so the shift is carried out in that type.
 */
static inline unsigned long huge_page_size(const struct hstate *h)
{
        return (unsigned long)PAGE_SIZE << h->order;
}

/* Both have PAGE_SIZE-returning inline stubs in the !CONFIG_HUGETLB_PAGE branch. */
extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma);

extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma);

/* Returns the mask stored in @h; nothing is computed here. */
static inline unsigned long huge_page_mask(struct hstate *h)
{
        return h->mask;
}

/* Returns the order stored in @h. */
static inline unsigned int huge_page_order(struct hstate *h)
{
        return h->order;
}

/* Shift corresponding to the huge page size of @h: order + PAGE_SHIFT. */
static inline unsigned huge_page_shift(struct hstate *h)
{
        return h->order + PAGE_SHIFT;
}

/* An hstate counts as gigantic when its order is above MAX_PAGE_ORDER. */
static inline bool hstate_is_gigantic(struct hstate *h)
{
        return huge_page_order(h) > MAX_PAGE_ORDER;
}

/* Number of base pages in one huge page of @h: 1 << order. */
static inline unsigned int pages_per_huge_page(const struct hstate *h)
{
        return 1 << h->order;
}

/* Huge page size of @h expressed in units of 512 bytes. */
static inline unsigned int blocks_per_huge_page(struct hstate *h)
{
        return huge_page_size(h) / 512;
}

/*
 * Calls filemap_lock_folio() on @mapping with @idx shifted left by the huge
 * page order of @h, and returns whatever that call returns.
 */
static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
                                struct address_space *mapping, pgoff_t idx)
{
        pgoff_t index = idx << huge_page_order(h);

        return filemap_lock_folio(mapping, index);
}

#include <asm/hugetlb.h>

#ifndef is_hugepage_only_range
/*
 * Generic fallback, compiled only when <asm/hugetlb.h> did not define
 * is_hugepage_only_range: always answers 0 for any range.
 */
static inline int is_hugepage_only_range(struct mm_struct *mm,
                                        unsigned long addr, unsigned long len)
{
        return 0;
}
#define is_hugepage_only_range is_hugepage_only_range
#endif

#ifndef arch_clear_hugetlb_flags
/* Generic fallback when no arch_clear_hugetlb_flags is defined: does nothing. */
static inline void arch_clear_hugetlb_flags(struct folio *folio) { }
#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
#endif

#ifndef arch_make_huge_pte
/*
 * Generic fallback when no arch_make_huge_pte is defined: applies
 * pte_mkhuge() to @entry.  @shift and @flags are ignored here.
 */
static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift,
                                       vm_flags_t flags)
{
        return pte_mkhuge(entry);
}
#endif

/*
 * Looks up the hstate matching the size of @folio via size_to_hstate().
 * Passing a folio that is not hugetlb trips VM_BUG_ON_FOLIO().
 */
static inline struct hstate *folio_hstate(struct folio *folio)
{
        unsigned long bytes;

        VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);

        bytes = folio_size(folio);
        return size_to_hstate(bytes);
}

/*
 * Page shift of hstates[@index]: its order plus PAGE_SHIFT.  @index is used
 * unchecked, so the caller must keep it within the hstates[] array.
 */
static inline unsigned hstate_index_to_shift(unsigned index)
{
        return hstates[index].order + PAGE_SHIFT;
}

/*
 * Index of @h within hstates[], obtained by pointer subtraction.  @h must
 * therefore point into that array.
 */
static inline int hstate_index(struct hstate *h)
{
        return h - hstates;
}

/* Prototypes; both are inlines returning 0 in the !CONFIG_HUGETLB_PAGE branch. */
int dissolve_free_hugetlb_folio(struct folio *folio);
int dissolve_free_hugetlb_folios(unsigned long start_pfn,
                                    unsigned long end_pfn);

/*
 * folio_clear_hugetlb_hwpoison() is an out-of-line function with
 * CONFIG_MEMORY_FAILURE and an empty inline without it.
 */
#ifdef CONFIG_MEMORY_FAILURE
extern void folio_clear_hugetlb_hwpoison(struct folio *folio);
#else
static inline void folio_clear_hugetlb_hwpoison(struct folio *folio)
{
}
#endif

#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
#ifndef arch_hugetlb_migration_supported
/*
 * Generic answer used when the architecture enables hugepage migration but
 * does not define its own arch_hugetlb_migration_supported: migration is
 * supported only for huge pages whose shift is PMD_SHIFT, PUD_SHIFT or
 * PGDIR_SHIFT.
 */
static inline bool arch_hugetlb_migration_supported(struct hstate *h)
{
        const unsigned int shift = huge_page_shift(h);

        return shift == PMD_SHIFT ||
               shift == PUD_SHIFT ||
               shift == PGDIR_SHIFT;
}
#endif
#else
/* Without CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION no hstate is migratable. */
static inline bool arch_hugetlb_migration_supported(struct hstate *h)
{
        return false;
}
#endif

/* Thin wrapper: defers entirely to arch_hugetlb_migration_supported(). */
static inline bool hugepage_migration_supported(struct hstate *h)
{
        return arch_hugetlb_migration_supported(h);
}

/*
 * Movability check is different as compared to migration check.
 * It determines whether or not a huge page should be placed on
 * movable zone or not. Movability of any huge page should be
 * required only if huge page size is supported for migration.
 * There won't be any reason for the huge page to be movable if
 * it is not migratable to start with. Also the size of the huge
 * page should be large enough to be placed under a movable zone
 * and still feasible enough to be migratable. Just the presence
 * in movable zone does not make the migration feasible.
 *
 * So even though large huge page sizes like the gigantic ones
 * are migratable they should not be movable because it's not
 * feasible to migrate them from movable zone.
 */
static inline bool hugepage_movable_supported(struct hstate *h)
{
        /* Movable only if migration is supported and the page is not gigantic. */
        return hugepage_migration_supported(h) && !hstate_is_gigantic(h);
}

/*
 * Base gfp mask for allocating huge pages of @h: GFP_HIGHUSER_MOVABLE when
 * hugepage_movable_supported() says so (which in turn depends on migration
 * support), GFP_HIGHUSER otherwise.
 */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
        return hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE
                                             : GFP_HIGHUSER;
}

/*
 * Starts from htlb_alloc_mask(@h) and carries over exactly two bits from the
 * caller's @gfp_mask, if set there: __GFP_THISNODE (some callers want to
 * enforce the node) and __GFP_NOWARN.  All other bits of @gfp_mask are
 * dropped.
 */
static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
{
        const gfp_t kept = gfp_mask & (__GFP_THISNODE | __GFP_NOWARN);

        return htlb_alloc_mask(h) | kept;
}

/*
 * Tells whether an allocation made for migration @reason may fall back to
 * other nodes.
 *
 * Memory offline, memory failure and the migration syscalls are allowed to
 * fall back for lack of a better choice, even though that might break the
 * per-node hugetlb pool.  All other cases will set __GFP_THISNODE to avoid
 * breaking the per-node hugetlb pool.
 */
static inline bool htlb_allow_alloc_fallback(int reason)
{
        switch (reason) {
        case MR_MEMORY_HOTPLUG:
        case MR_MEMORY_FAILURE:
        case MR_SYSCALL:
        case MR_MEMPOLICY_MBIND:
                return true;
        default:
                return false;
        }
}

/*
 * Picks the spinlock for a huge PTE of @h in @mm: PMD-sized pages use the
 * lock chosen by pmd_lockptr(), every other size uses mm->page_table_lock.
 * A huge page size equal to PAGE_SIZE trips VM_BUG_ON().
 */
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
                                           struct mm_struct *mm, pte_t *pte)
{
        const unsigned long sz = huge_page_size(h);

        if (sz == PMD_SIZE)
                return pmd_lockptr(mm, (pmd_t *) pte);

        VM_BUG_ON(sz == PAGE_SIZE);

        return &mm->page_table_lock;
}

#ifndef hugepages_supported
/*
 * Some platforms decide whether they support huge pages at boot
 * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0
 * when there is no such support.  This default therefore treats a
 * non-zero HPAGE_SHIFT as "supported".
 */
#define hugepages_supported() (HPAGE_SHIFT != 0)
#endif

/* Prototype; the !CONFIG_HUGETLB_PAGE branch defines an empty inline instead. */
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);

/* Resets mm->hugetlb_usage to 0. */
static inline void hugetlb_count_init(struct mm_struct *mm)
{
        atomic_long_set(&mm->hugetlb_usage, 0);
}

/* Atomically adds @l to mm->hugetlb_usage. */
static inline void hugetlb_count_add(long l, struct mm_struct *mm)
{
        atomic_long_add(l, &mm->hugetlb_usage);
}

/* Atomically subtracts @l from mm->hugetlb_usage. */
static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
{
        atomic_long_sub(l, &mm->hugetlb_usage);
}

#ifndef huge_ptep_modify_prot_start
#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
/*
 * Generic fallback: returns the result of huge_ptep_get_and_clear() on the
 * entry at @ptep in vma->vm_mm.
 */
static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
                                                unsigned long addr, pte_t *ptep)
{
        return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
}
#endif

#ifndef huge_ptep_modify_prot_commit
#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit
/*
 * Generic fallback: installs @pte at @ptep with set_huge_pte_at(), passing
 * the huge page size of the VMA's hstate.  @old_pte is not used here.
 */
static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
                                                unsigned long addr, pte_t *ptep,
                                                pte_t old_pte, pte_t pte)
{
        struct hstate *h = hstate_vma(vma);

        set_huge_pte_at(vma->vm_mm, addr, ptep, pte, huge_page_size(h));
}
#endif

/*
 * Declared only with CONFIG_NUMA in this branch; the !CONFIG_HUGETLB_PAGE
 * branch below defines both as empty inlines regardless of CONFIG_NUMA.
 */
#ifdef CONFIG_NUMA
void hugetlb_register_node(struct node *node);
void hugetlb_unregister_node(struct node *node);
#endif

/*
 * Check if a given raw @page in a hugepage is HWPOISON.
 */
bool is_raw_hwpoison_page_in_hugepage(struct page *page);

#else        /* CONFIG_HUGETLB_PAGE */
struct hstate {};

/* !CONFIG_HUGETLB_PAGE stub: always returns NULL. */
static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: no lookup is done; always returns NULL. */
static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
                                struct address_space *mapping, pgoff_t idx)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: @list is untouched; always returns -ENOMEM. */
static inline int isolate_or_dissolve_huge_page(struct page *page,
                                                struct list_head *list)
{
        return -ENOMEM;
}

/* !CONFIG_HUGETLB_PAGE stub: allocates nothing; always returns NULL. */
static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           int avoid_reserve)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: allocates nothing; always returns NULL. */
static inline struct folio *
alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
                        nodemask_t *nmask, gfp_t gfp_mask,
                        bool allow_alloc_fallback)
{
        return NULL;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: always returns 0.
 * NOTE(review): the prototype in the CONFIG_HUGETLB_PAGE branch takes
 * (struct hstate *h, int nid) while this stub takes only @h - confirm
 * whether the differing parameter lists are intentional.
 */
static inline int __alloc_bootmem_huge_page(struct hstate *h)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns NULL. */
static inline struct hstate *hstate_file(struct file *f)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: no size maps to an hstate; always returns NULL. */
static inline struct hstate *hstate_sizelog(int page_size_log)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns NULL. */
static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns NULL. */
static inline struct hstate *folio_hstate(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: no size maps to an hstate; always returns NULL. */
static inline struct hstate *size_to_hstate(unsigned long size)
{
        return NULL;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: the "huge" page size degenerates to PAGE_SIZE.
 * @h is never dereferenced.  The parameter is const-qualified to match the
 * CONFIG_HUGETLB_PAGE version, so callers holding a const struct hstate *
 * build the same way in both configurations.
 */
static inline unsigned long huge_page_size(const struct hstate *h)
{
        return PAGE_SIZE;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns PAGE_MASK. */
static inline unsigned long huge_page_mask(struct hstate *h)
{
        return PAGE_MASK;
}

/* !CONFIG_HUGETLB_PAGE stub: every VMA reports PAGE_SIZE. */
static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
        return PAGE_SIZE;
}

/* !CONFIG_HUGETLB_PAGE stub: every VMA reports PAGE_SIZE. */
static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
        return PAGE_SIZE;
}

/* !CONFIG_HUGETLB_PAGE stub: order is always 0. */
static inline unsigned int huge_page_order(struct hstate *h)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns PAGE_SHIFT. */
static inline unsigned int huge_page_shift(struct hstate *h)
{
        return PAGE_SHIFT;
}

/* !CONFIG_HUGETLB_PAGE stub: nothing is gigantic; always returns false. */
static inline bool hstate_is_gigantic(struct hstate *h)
{
        return false;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: a "huge" page is a single base page, so this
 * always returns 1.  @h is never dereferenced.  The parameter is
 * const-qualified to match the CONFIG_HUGETLB_PAGE version, so callers
 * holding a const struct hstate * build the same way in both configurations.
 */
static inline unsigned int pages_per_huge_page(const struct hstate *h)
{
        return 1;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns 0. */
static inline unsigned hstate_index_to_shift(unsigned index)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns 0. */
static inline int hstate_index(struct hstate *h)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: nothing to dissolve; always returns 0. */
static inline int dissolve_free_hugetlb_folio(struct folio *folio)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: nothing to dissolve; always returns 0. */
static inline int dissolve_free_hugetlb_folios(unsigned long start_pfn,
                                           unsigned long end_pfn)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns false. */
static inline bool hugepage_migration_supported(struct hstate *h)
{
        return false;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns false. */
static inline bool hugepage_movable_supported(struct hstate *h)
{
        return false;
}

/* !CONFIG_HUGETLB_PAGE stub: returns an empty (0) gfp mask. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: ignores @gfp_mask and returns an empty (0) mask. */
static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: fallback is never allowed; always returns false. */
static inline bool htlb_allow_alloc_fallback(int reason)
{
        return false;
}

/* !CONFIG_HUGETLB_PAGE stub: always the mm-wide page_table_lock; @h and @pte are ignored. */
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
                                           struct mm_struct *mm, pte_t *pte)
{
        return &mm->page_table_lock;
}

/* !CONFIG_HUGETLB_PAGE stub: intentionally empty. */
static inline void hugetlb_count_init(struct mm_struct *mm)
{
}

/* !CONFIG_HUGETLB_PAGE stub: writes nothing to @f. */
static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
{
}

/* !CONFIG_HUGETLB_PAGE stub: intentionally empty. */
static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
{
}

/*
 * !CONFIG_HUGETLB_PAGE stub: only reads and returns the entry at @ptep
 * (through ptep_get() with CONFIG_MMU, by plain dereference otherwise).
 * Despite the name, nothing is cleared or flushed here.
 */
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
                                          unsigned long addr, pte_t *ptep)
{
#ifdef CONFIG_MMU
        return ptep_get(ptep);
#else
        return *ptep;
#endif
}

/* !CONFIG_HUGETLB_PAGE stub: @ptep is not written. */
static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                                   pte_t *ptep, pte_t pte, unsigned long sz)
{
}

/* !CONFIG_HUGETLB_PAGE stub: intentionally empty. */
static inline void hugetlb_register_node(struct node *node)
{
}

/* !CONFIG_HUGETLB_PAGE stub: intentionally empty. */
static inline void hugetlb_unregister_node(struct node *node)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no lookup is done; always returns false. */
static inline bool hugetlbfs_pagecache_present(
    struct hstate *h, struct vm_area_struct *vma, unsigned long address)
{
        return false;
}
#endif        /* CONFIG_HUGETLB_PAGE */

/*
 * Look up the spinlock that huge_pte_lockptr() selects for @pte, acquire it,
 * and hand it back so the caller can release it later.
 */
static inline spinlock_t *huge_pte_lock(struct hstate *h,
                                        struct mm_struct *mm, pte_t *pte)
{
        spinlock_t *lock = huge_pte_lockptr(h, mm, pte);

        spin_lock(lock);

        return lock;
}

/*
 * hugetlb_cma_reserve() is only declared (and defined elsewhere) when both
 * CONFIG_HUGETLB_PAGE and CONFIG_CMA are enabled.  In every other
 * configuration callers get the empty __init stub below, so they need no
 * #ifdef of their own.
 */
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
extern void __init hugetlb_cma_reserve(int order);
#else
static inline __init void hugetlb_cma_reserve(int order)
{
}
#endif

#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
/*
 * Report whether the page containing @pte has more than one reference,
 * which this helper treats as "shared".
 */
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
        struct page *table_page = virt_to_page(pte);

        return page_count(table_page) > 1;
}
#else
/* Without CONFIG_ARCH_WANT_HUGE_PMD_SHARE nothing is ever reported shared. */
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
        return false;
}
#endif

bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);

#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
/*
 * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
 * implement this.
 */
#define flush_hugetlb_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#endif

static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
}

bool __vma_private_lock(struct vm_area_struct *vma);

/*
 * Safe version of huge_pte_offset() to check the locks.  See comments
 * above huge_pte_offset().
 *
 * @vma:  VMA being walked; its vm_mm is the mm handed to huge_pte_offset().
 * @addr: address passed through to huge_pte_offset().
 * @sz:   size passed through to huge_pte_offset().
 *
 * Returns whatever huge_pte_offset() returns.  The only extra work is a
 * debug assertion, compiled in solely when CONFIG_HUGETLB_PAGE,
 * CONFIG_ARCH_WANT_HUGE_PMD_SHARE and CONFIG_LOCKDEP are all enabled.
 */
static inline pte_t *
hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
{
#if defined(CONFIG_HUGETLB_PAGE) && \
        defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
        struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

        /*
         * If pmd sharing possible, locking needed to safely walk the
         * hugetlb pgtables.  More information can be found at the comment
         * above huge_pte_offset() in the same file.
         *
         * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP.
         */
        /* Warn (once) if neither the vma lock nor i_mmap_rwsem is held. */
        if (__vma_shareable_lock(vma))
                WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) &&
                             !lockdep_is_held(
                                 &vma->vm_file->f_mapping->i_mmap_rwsem));
#endif
        return huge_pte_offset(vma->vm_mm, addr, sz);
}

#endif /* _LINUX_HUGETLB_H */






































































    7 





    6 





   10 












    8 






    5 




    4 






































   31 




    2 






    8 






    5 
















   32 



    2 



   32 

   10 
   10 

   30 
    4 









   30 
   30 


   31 


























   30 


   17 




   31 



























   20 


























   16 

   18 

























    2 



    2 





















    9 
   12 





















    9 





   10 



    3 































   11 













    5 

    8 











    3 

    4 
    3 

    1 

    1 




    1 





    1 
    1 

    1 





















    4 


    3 





    3 




    1 



    1 



    4 




















































   11 




    6 


    6 
    2 




    8 



    5 




    4 











    5 
    2 

    5 








    4 
    1 




    5 






















   19 








   13 

   11 

   12 


    9 

   12 







   16 







   21 


    7 


    6 



    5 



   10 
   10 




























































    1 






   17 
   17 
    4 
















   19 












    4 





   20 

   21 





    6 
   17 
   18 


   18 
    4 










   19 

   19 




   17 




    4 


    4 

    4 






   18 
































   12 






   11 
    8 


   10 


   10 
    7 













    8 



    3 


    7 
    6 

    5 



    5 



    6 















    3 







    4 



    3 




























































































































































































































































    1 



    1 

    1 









































    6 


    2 





    2 



    7 

    4 




    5 

    5 

    4 




    4 

    1 



    4 


    3 






























   10 






    3 


    3 


    4 

   12 

    4 


    8 





    5 


    4 



    5 





    5 
    5 
    3 





    5 


    5 

    1 


    4 



    5 


    4 


    2 



    7 


















   14 










    8 

    4 

   11 

    9 








   10 

































   25 



   25 





   24 







    1 

    2 































































    2 










    1 



    2 





















    2 




























    1 










    1 




    1 








































































































































































































































































































   10 




   11 












    1 




    1 


























































































    1 



    1 




    1 

    1 

    1 





















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
// SPDX-License-Identifier: GPL-2.0+
/*
 * XArray implementation
 * Copyright (c) 2017-2018 Microsoft Corporation
 * Copyright (c) 2018-2020 Oracle
 * Author: Matthew Wilcox <willy@infradead.org>
 */

#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include "radix-tree.h"

/*
 * Coding conventions in this file:
 *
 * @xa is used to refer to the entire xarray.
 * @xas is the 'xarray operation state'.  It may be either a pointer to
 * an xa_state, or an xa_state stored on the stack.  This is an unfortunate
 * ambiguity.
 * @index is the index of the entry being operated on
 * @mark is an xa_mark_t; a small number indicating one of the mark bits.
 * @node refers to an xa_node; usually the primary one being operated on by
 * this function.
 * @offset is the index into the slots array inside an xa_node.
 * @parent refers to the @xa_node closer to the head than @node.
 * @entry refers to something stored in a slot in the xarray
 */

static inline unsigned int xa_lock_type(const struct xarray *xa)
{
        return (__force unsigned int)xa->xa_flags & 3;
}

/*
 * Take the xarray lock using the variant selected by @lock_type:
 * the irq variant for XA_LOCK_IRQ, the bh variant for XA_LOCK_BH, and the
 * plain lock for any other value.
 */
static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type)
{
        if (lock_type == XA_LOCK_IRQ) {
                xas_lock_irq(xas);
                return;
        }
        if (lock_type == XA_LOCK_BH) {
                xas_lock_bh(xas);
                return;
        }
        xas_lock(xas);
}

/*
 * Release the xarray lock using the variant selected by @lock_type;
 * mirrors xas_lock_type().
 */
static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type)
{
        if (lock_type == XA_LOCK_IRQ) {
                xas_unlock_irq(xas);
                return;
        }
        if (lock_type == XA_LOCK_BH) {
                xas_unlock_bh(xas);
                return;
        }
        xas_unlock(xas);
}

/* True if @xa was set up with XA_FLAGS_TRACK_FREE. */
static inline bool xa_track_free(const struct xarray *xa)
{
        return (xa->xa_flags & XA_FLAGS_TRACK_FREE) != 0;
}

/* True if @xa was set up with XA_FLAGS_ZERO_BUSY. */
static inline bool xa_zero_busy(const struct xarray *xa)
{
        return (xa->xa_flags & XA_FLAGS_ZERO_BUSY) != 0;
}

/*
 * Record @mark in the array-wide flags.  The flags word is only written
 * when the bit is not already set.
 */
static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark)
{
        if (xa->xa_flags & XA_FLAGS_MARK(mark))
                return;

        xa->xa_flags |= XA_FLAGS_MARK(mark);
}

/*
 * Drop @mark from the array-wide flags.  The flags word is only written
 * when the bit is currently set.
 */
static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark)
{
        if (!(xa->xa_flags & XA_FLAGS_MARK(mark)))
                return;

        xa->xa_flags &= ~(XA_FLAGS_MARK(mark));
}

/* Return the bitmap inside @node that holds the per-slot bits for @mark. */
static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark)
{
        unsigned int which = (__force unsigned)mark;

        return node->marks[which];
}

/* Test whether slot @offset of @node currently has @mark set. */
static inline bool node_get_mark(struct xa_node *node, unsigned int offset,
                                 xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        return test_bit(offset, bits);
}

/*
 * Set @mark on slot @offset of @node using the non-atomic bit helper.
 * Returns the previous state of the bit, i.e. true if it was already set.
 */
static inline bool node_set_mark(struct xa_node *node, unsigned int offset,
                                 xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        return __test_and_set_bit(offset, bits);
}

/*
 * Clear @mark on slot @offset of @node using the non-atomic bit helper.
 * Returns the previous state of the bit, i.e. true if it had been set.
 */
static inline bool node_clear_mark(struct xa_node *node, unsigned int offset,
                                   xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        return __test_and_clear_bit(offset, bits);
}

/* True if at least one of the XA_CHUNK_SIZE slots in @node carries @mark. */
static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        return !bitmap_empty(bits, XA_CHUNK_SIZE);
}

/* Set @mark on every one of the XA_CHUNK_SIZE slots in @node. */
static inline void node_mark_all(struct xa_node *node, xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        bitmap_fill(bits, XA_CHUNK_SIZE);
}

/*
 * Advance an xa_mark_t lvalue to the next mark value.  The arithmetic is
 * done on the unsigned representation and cast back with __force.  @mark is
 * evaluated twice, so it must be a plain lvalue without side effects.
 */
#define mark_inc(mark) do { \
        mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \
} while (0)

/*
 * xas_squash_marks() - Merge all marks to the first entry
 * @xas: Array operation state.
 *
 * Set a mark on the first entry if any entry has it set.  Clear marks on
 * all sibling entries.
 */
static void xas_squash_marks(const struct xa_state *xas)
{
        unsigned int mark = 0;
        unsigned int limit = xas->xa_offset + xas->xa_sibs + 1;

        if (!xas->xa_sibs)
                return;

        do {
                unsigned long *marks = xas->xa_node->marks[mark];
                if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit)
                        continue;
                __set_bit(xas->xa_offset, marks);
                bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs);
        } while (mark++ != (__force unsigned)XA_MARK_MAX);
}

/*
 * Slot number inside @node that @index falls into: drop the bits below the
 * node's shift, then keep only the chunk-sized low part.
 */
static unsigned int get_offset(unsigned long index, struct xa_node *node)
{
        unsigned long above_shift = index >> node->shift;

        return above_shift & XA_CHUNK_MASK;
}

/* Recompute @xas->xa_offset from the current xa_index and xa_node. */
static void xas_set_offset(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        xas->xa_offset = get_offset(xas->xa_index, node);
}

/*
 * Reposition xa_index onto slot @offset of the current node (forwards for a
 * find, backwards for a sibling slot): the bits at and below this node's
 * chunk are cleared, then @offset is inserted at the node's shift.
 */
static void xas_move_index(struct xa_state *xas, unsigned long offset)
{
        unsigned int shift = xas->xa_node->shift;
        unsigned long index = xas->xa_index;

        index &= (~XA_CHUNK_MASK) << shift;
        index += offset << shift;
        xas->xa_index = index;
}

/* Step to the next slot in the current node and update xa_index to match. */
static void xas_next_offset(struct xa_state *xas)
{
        xas_move_index(xas, ++xas->xa_offset);
}

/*
 * Put @xas into the XAS_BOUNDS state and return NULL, so callers can write
 * "return set_bounds(xas);" on their out-of-range paths.
 */
static void *set_bounds(struct xa_state *xas)
{
        xas->xa_node = XAS_BOUNDS;
        return NULL;
}

/*
 * Starts a walk.  If the @xas is already valid, we assume that it's on
 * the right path and just return where we've got to.  If we're in an
 * error state, return NULL.  If the index is outside the current scope
 * of the xarray, return NULL without changing @xas->xa_node.  Otherwise
 * set @xas->xa_node to NULL and return the current head of the array.
 */
static void *xas_start(struct xa_state *xas)
{
        void *entry;

        if (xas_valid(xas))
                return xas_reload(xas);
        if (xas_error(xas))
                return NULL;

        entry = xa_head(xas->xa);
        if (!xa_is_node(entry)) {
                if (xas->xa_index)
                        return set_bounds(xas);
        } else {
                if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK)
                        return set_bounds(xas);
        }

        xas->xa_node = NULL;
        return entry;
}

/*
 * Move @xas one level down into @node and return the entry found in the
 * slot that xa_index maps to.
 *
 * On return xas->xa_node is @node and xas->xa_offset is the slot the entry
 * was finally read from (after resolving sibling entries).
 */
static __always_inline void *xas_descend(struct xa_state *xas,
                                        struct xa_node *node)
{
        unsigned int offset = get_offset(xas->xa_index, node);
        void *entry = xa_entry(xas->xa, node, offset);

        xas->xa_node = node;
        /* A sibling entry names another slot; follow it and reload. */
        while (xa_is_sibling(entry)) {
                offset = xa_to_sibling(entry);
                entry = xa_entry(xas->xa, node, offset);
                /*
                 * Above the bottom level, a node pointer found via a sibling
                 * slot is turned into a retry entry instead of being
                 * returned.  NOTE(review): presumably this covers a
                 * concurrent modification of the slot - confirm.
                 */
                if (node->shift && xa_is_node(entry))
                        entry = XA_RETRY_ENTRY;
        }

        xas->xa_offset = offset;
        return entry;
}

/**
 * xas_load() - Load an entry from the XArray (advanced).
 * @xas: XArray operation state.
 *
 * Starts (or resumes) a walk and descends node by node until the slot for
 * xa_index at the bottom level is reached, leaving @xas positioned on it.
 * If @xas is in an error state, nothing is walked and %NULL is returned.
 * xas_load() never expands the tree.
 *
 * If the xa_state is set up to operate on a multi-index entry, the walk
 * stops before descending into any node whose shift is smaller than
 * xa_shift, so xas_load() may return %NULL or an internal entry even if
 * there are entries present within the range specified by @xas.
 *
 * Context: Any context.  The caller should hold the xa_lock or the RCU lock.
 * Return: Usually an entry in the XArray, but see description for exceptions.
 */
void *xas_load(struct xa_state *xas)
{
        void *entry;

        for (entry = xas_start(xas); xa_is_node(entry); ) {
                struct xa_node *node = xa_to_node(entry);

                if (xas->xa_shift > node->shift)
                        break;

                entry = xas_descend(xas, node);
                if (!node->shift)
                        break;
        }

        return entry;
}
EXPORT_SYMBOL_GPL(xas_load);

/* Sentinel stored in node->array by xa_node_free() before the RCU free. */
#define XA_RCU_FREE        ((struct xarray *)1)

/*
 * Queue @node for freeing via call_rcu().  The node must no longer be on a
 * private list; its ->array is overwritten with XA_RCU_FREE first.
 */
static void xa_node_free(struct xa_node *node)
{
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        node->array = XA_RCU_FREE;
        call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}

/*
 * xas_destroy() - Free any resources allocated during the XArray operation.
 * @xas: XArray operation state.
 *
 * Frees every node on the xa_alloc chain (linked through ->parent) directly,
 * without going through call_rcu(), and leaves xa_alloc NULL.  Most users
 * will not need to call this function; xas_nomem() calls it for you.
 */
void xas_destroy(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_alloc;

        while (node) {
                struct xa_node *next;

                XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
                /* Fetch the link before the node is freed. */
                next = rcu_dereference_raw(node->parent);
                radix_tree_node_rcu_free(&node->rcu_head);
                node = next;
                xas->xa_alloc = node;
        }
}

/**
 * xas_nomem() - Allocate memory if needed.
 * @xas: XArray operation state.
 * @gfp: Memory allocation flags.
 *
 * While the lock is held, new nodes are allocated with GFP_NOWAIT, which
 * usually succeeds.  When it does not, @xas is left in the -ENOMEM error
 * state; the caller should then drop the lock and call xas_nomem().  If
 * xas_nomem() returns true, the caller should retry the operation.
 *
 * Forward progress is guaranteed because the one node allocated here is
 * parked in the xa_state, where xas_alloc() will find it.  More nodes will
 * likely be found in the slab allocator, but we do not tie them up here.
 *
 * If @xas is not in the -ENOMEM state, any node still parked in it is
 * released and false is returned.
 *
 * Return: true if memory was needed, and was successfully allocated.
 */
bool xas_nomem(struct xa_state *xas, gfp_t gfp)
{
        struct xa_node *node;

        if (xas->xa_node != XA_ERROR(-ENOMEM)) {
                xas_destroy(xas);
                return false;
        }

        if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                gfp |= __GFP_ACCOUNT;

        node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
        xas->xa_alloc = node;
        if (!node)
                return false;

        node->parent = NULL;
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        /* Clear the error so that the caller's retry starts a fresh walk. */
        xas->xa_node = XAS_RESTART;
        return true;
}
EXPORT_SYMBOL_GPL(xas_nomem);

/*
 * __xas_nomem() - Drop locks and allocate memory if needed.
 * @xas: XArray operation state.
 * @gfp: Memory allocation flags.
 *
 * Internal variant of xas_nomem(), called with the xa_lock held.  When @gfp
 * allows blocking, the lock is released around the allocation and taken
 * again afterwards, using the lock flavour recorded in the array's flags.
 *
 * Return: true if memory was needed, and was successfully allocated.
 */
static bool __xas_nomem(struct xa_state *xas, gfp_t gfp)
        __must_hold(xas->xa->xa_lock)
{
        unsigned int lock_type = xa_lock_type(xas->xa);
        struct xa_node *node;
        bool may_block;

        if (xas->xa_node != XA_ERROR(-ENOMEM)) {
                xas_destroy(xas);
                return false;
        }

        if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                gfp |= __GFP_ACCOUNT;

        may_block = gfpflags_allow_blocking(gfp);
        if (may_block)
                xas_unlock_type(xas, lock_type);
        node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
        xas->xa_alloc = node;
        if (may_block)
                xas_lock_type(xas, lock_type);

        if (!node)
                return false;

        node->parent = NULL;
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        xas->xa_node = XAS_RESTART;
        return true;
}

/*
 * Notify the user of @xas that @node has been modified.  If an xa_update
 * callback is installed it is invoked with @node; otherwise the node is
 * asserted not to be on any private list.
 */
static void xas_update(struct xa_state *xas, struct xa_node *node)
{
        if (!xas->xa_update) {
                XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
                return;
        }

        xas->xa_update(node);
}

/*
 * Obtain and initialise a new node with the given @shift, to be placed below
 * the node @xas currently points at (xa_node; NULL means no parent).
 *
 * A node parked in xas->xa_alloc is consumed first; otherwise one is
 * allocated with GFP_NOWAIT | __GFP_NOWARN (plus __GFP_ACCOUNT for
 * accounted arrays).  On allocation failure @xas is put into the -ENOMEM
 * error state.  When there is a parent, its count is incremented and the
 * new node's offset is taken from xas->xa_offset; the caller remains
 * responsible for storing the node in a slot.
 *
 * Return: the new node, or NULL if @xas is invalid or allocation failed.
 */
static void *xas_alloc(struct xa_state *xas, unsigned int shift)
{
        struct xa_node *parent = xas->xa_node;
        struct xa_node *node = xas->xa_alloc;

        if (xas_invalid(xas))
                return NULL;

        if (!node) {
                gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;

                if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                        gfp |= __GFP_ACCOUNT;

                node = kmem_cache_alloc_lru(radix_tree_node_cachep,
                                            xas->xa_lru, gfp);
                if (!node) {
                        xas_set_err(xas, -ENOMEM);
                        return NULL;
                }
        } else {
                /* Use up the preallocated node. */
                xas->xa_alloc = NULL;
        }

        if (parent) {
                node->offset = xas->xa_offset;
                parent->count++;
                XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE);
                xas_update(xas, parent);
        }

        XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));

        node->shift = shift;
        node->count = 0;
        node->nr_values = 0;
        RCU_INIT_POINTER(node->parent, xas->xa_node);
        node->array = xas->xa;

        return node;
}

#ifdef CONFIG_XARRAY_MULTI
/*
 * Number of indices covered by @xas: (xa_sibs + 1) slots, each spanning
 * 1 << xa_shift indices.
 */
static unsigned long xas_size(const struct xa_state *xas)
{
        unsigned long nr_slots = xas->xa_sibs + 1UL;

        return nr_slots << xas->xa_shift;
}
#endif

/*
 * Work out the largest index that must exist in the tree in order to add
 * the entry described by @xas.  For a plain single-index state this is just
 * xa_index.  For a multi-index state it is the last index of the covered
 * range; and because a multi-index entry cannot be stored at index 0, a
 * range that begins at 0 is bumped by one more.
 */
static unsigned long xas_max(struct xa_state *xas)
{
        unsigned long max = xas->xa_index;

#ifdef CONFIG_XARRAY_MULTI
        if (xas->xa_shift || xas->xa_sibs) {
                unsigned long span_mask = xas_size(xas) - 1;
                unsigned long last = max | span_mask;

                /* last == span_mask means the range starts at index 0. */
                max = (last == span_mask) ? last + 1 : last;
        }
#endif

        return max;
}

/*
 * Largest index the array can hold without growing, given its head @entry:
 * 0 when the head is not a node, otherwise the last index covered by the
 * head node's XA_CHUNK_SIZE slots at its shift.
 */
static unsigned long max_index(void *entry)
{
        if (xa_is_node(entry))
                return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1;

        return 0;
}

/*
 * Reduce the height of the tree, starting from the node in xas->xa_node.
 *
 * While the node holds exactly one entry and that entry sits in slot 0, the
 * entry is installed as the new head of the array and the node is freed.
 * The loop continues with the new head if it is itself a node.
 */
static void xas_shrink(struct xa_state *xas)
{
        struct xarray *xa = xas->xa;
        struct xa_node *node = xas->xa_node;

        for (;;) {
                void *entry;

                XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
                /* Only a node with a single entry can be collapsed. */
                if (node->count != 1)
                        break;
                /* ... and that entry has to be the one in slot 0. */
                entry = xa_entry_locked(xa, node, 0);
                if (!entry)
                        break;
                /* Above the bottom level, only a child node is hoisted. */
                if (!xa_is_node(entry) && node->shift)
                        break;
                /* In a ZERO_BUSY array a zero entry becomes a NULL head. */
                if (xa_is_zero(entry) && xa_zero_busy(xa))
                        entry = NULL;
                xas->xa_node = XAS_BOUNDS;

                RCU_INIT_POINTER(xa->xa_head, entry);
                /* Keep the array-wide free mark in step with slot 0. */
                if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK))
                        xa_mark_clear(xa, XA_FREE_MARK);

                node->count = 0;
                node->nr_values = 0;
                /*
                 * NOTE(review): presumably the retry entry is for RCU readers
                 * that still hold a reference to the old node - confirm.
                 */
                if (!xa_is_node(entry))
                        RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY);
                xas_update(xas, node);
                xa_node_free(node);
                if (!xa_is_node(entry))
                        break;
                /* The hoisted child is now the top node: it has no parent. */
                node = xa_to_node(entry);
                node->parent = NULL;
        }
}

/*
 * xas_delete_node() - Attempt to delete an xa_node
 * @xas: Array operation state.
 *
 * Attempts to delete @xas->xa_node.  Nothing is freed while the node's
 * ->count is non-zero.  After freeing an empty node the walk moves up to
 * its parent, which loses one entry and may in turn become empty.  If the
 * node the walk stops at has no parent, xas_shrink() is called.
 */
static void xas_delete_node(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        for (;;) {
                struct xa_node *parent;

                XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
                /* Stop at the first node that still holds entries. */
                if (node->count)
                        break;

                /* Re-point @xas at the parent slot before freeing the node. */
                parent = xa_parent_locked(xas->xa, node);
                xas->xa_node = parent;
                xas->xa_offset = node->offset;
                xa_node_free(node);

                /* The top node was freed: the array is now empty. */
                if (!parent) {
                        xas->xa->xa_head = NULL;
                        xas->xa_node = XAS_BOUNDS;
                        return;
                }

                parent->slots[xas->xa_offset] = NULL;
                parent->count--;
                XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE);
                node = parent;
                xas_update(xas, node);
        }

        if (!node->parent)
                xas_shrink(xas);
}

/**
 * xas_free_nodes() - Free this node and all nodes that it references
 * @xas: Array operation state.
 * @top: Node to free
 *
 * This node has been removed from the tree.  We must now free it and all
 * of its subnodes.  There may be RCU walkers with references into the tree,
 * so we must replace all entries with retry markers.
 *
 * The subtree is walked without recursion: the walk descends into child
 * nodes first, and once the last slot of a node has been handled the node
 * is freed and the walk resumes in the parent at the slot after it.
 */
static void xas_free_nodes(struct xa_state *xas, struct xa_node *top)
{
        unsigned int offset = 0;
        struct xa_node *node = top;

        for (;;) {
                void *entry = xa_entry_locked(xas->xa, node, offset);

                /* Descend into a child node, starting at its slot 0. */
                if (node->shift && xa_is_node(entry)) {
                        node = xa_to_node(entry);
                        offset = 0;
                        continue;
                }
                /* Any other non-NULL entry is replaced by a retry marker. */
                if (entry)
                        RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY);
                offset++;
                /* Every slot handled: free this node and climb to the parent. */
                while (offset == XA_CHUNK_SIZE) {
                        struct xa_node *parent;

                        /* Read the parent and offset before freeing the node. */
                        parent = xa_parent_locked(xas->xa, node);
                        offset = node->offset + 1;
                        node->count = 0;
                        node->nr_values = 0;
                        xas_update(xas, node);
                        xa_node_free(node);
                        /* Freeing @top itself ends the walk. */
                        if (node == top)
                                return;
                        node = parent;
                }
        }
}

/*
 * xas_expand adds nodes to the head of the tree until it has reached
 * sufficient height to be able to contain @xas->xa_index
 *
 * @xas: XArray operation state.
 * @head: The current head entry of the array; may be NULL, a node entry
 *        or a non-node entry.
 *
 * If @head is NULL nothing is allocated: the function only reports the
 * shift a tree would need in order to hold xas_max(@xas).
 *
 * Otherwise new nodes are stacked on top of @head, each taking the old
 * head in slot 0, until max_index() of the head covers xas_max(@xas).
 * On return @xas->xa_node is the last node added, the pre-existing head
 * node if no node had to be added, or NULL if @head is not a node and no
 * node was added.
 *
 * Return: The shift computed for the tree (0 if everything fits in the
 * head), or -ENOMEM if xas_alloc() failed.
 */
static int xas_expand(struct xa_state *xas, void *head)
{
        struct xarray *xa = xas->xa;
        struct xa_node *node = NULL;
        unsigned int shift = 0;
        unsigned long max = xas_max(xas);

        if (!head) {
                /* Empty array: only compute the required height */
                if (max == 0)
                        return 0;
                while ((max >> shift) >= XA_CHUNK_SIZE)
                        shift += XA_CHUNK_SHIFT;
                return shift + XA_CHUNK_SHIFT;
        } else if (xa_is_node(head)) {
                /* A node placed above the head is one level higher */
                node = xa_to_node(head);
                shift = node->shift + XA_CHUNK_SHIFT;
        }
        xas->xa_node = NULL;

        while (max > max_index(head)) {
                xa_mark_t mark = 0;

                XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
                node = xas_alloc(xas, shift);
                if (!node)
                        return -ENOMEM;

                /* The old head becomes the only entry, in slot 0 */
                node->count = 1;
                if (xa_is_value(head))
                        node->nr_values = 1;
                RCU_INIT_POINTER(node->slots[0], head);

                /*
                 * Carry the array-level marks over to slot 0 of the new
                 * node.  For a free-tracking array, XA_FREE_MARK is set on
                 * all slots; if the array was not marked free, the mark is
                 * cleared again on slot 0 and set on the array.
                 */
                for (;;) {
                        if (xa_track_free(xa) && mark == XA_FREE_MARK) {
                                node_mark_all(node, XA_FREE_MARK);
                                if (!xa_marked(xa, XA_FREE_MARK)) {
                                        node_clear_mark(node, 0, XA_FREE_MARK);
                                        xa_mark_set(xa, XA_FREE_MARK);
                                }
                        } else if (xa_marked(xa, mark)) {
                                node_set_mark(node, 0, mark);
                        }
                        if (mark == XA_MARK_MAX)
                                break;
                        mark_inc(mark);
                }

                /*
                 * Now that the new node is fully initialised, we can add
                 * it to the tree
                 */
                if (xa_is_node(head)) {
                        xa_to_node(head)->offset = 0;
                        rcu_assign_pointer(xa_to_node(head)->parent, node);
                }
                head = xa_mk_node(node);
                rcu_assign_pointer(xa->xa_head, head);
                xas_update(xas, node);

                shift += XA_CHUNK_SHIFT;
        }

        xas->xa_node = node;
        return shift;
}

/*
 * xas_create() - Create a slot to store an entry in.
 * @xas: XArray operation state.
 * @allow_root: %true if we can store the entry in the root directly
 *
 * Most users will not need to call this function directly, as it is called
 * by xas_store().  It is useful for doing conditional store operations
 * (see the xa_cmpxchg() implementation for an example).
 *
 * The descent stops once the current shift is no greater than
 * @xas->xa_shift, when a non-node entry is found in the way, or when a
 * node allocation fails.
 *
 * Return: If the slot already existed, returns the contents of this slot.
 * If the slot was newly created, returns %NULL.  If it failed to create the
 * slot, returns %NULL and indicates the error in @xas.
 */
static void *xas_create(struct xa_state *xas, bool allow_root)
{
        struct xarray *xa = xas->xa;
        void *entry;
        void __rcu **slot;
        struct xa_node *node = xas->xa_node;
        int shift;
        unsigned int order = xas->xa_shift;

        if (xas_top(node)) {
                /* Not walked yet: start at the head, growing the tree first */
                entry = xa_head_locked(xa);
                xas->xa_node = NULL;
                if (!entry && xa_zero_busy(xa))
                        entry = XA_ZERO_ENTRY;
                shift = xas_expand(xas, entry);
                if (shift < 0)
                        return NULL;
                /* Force a node to be created if the root may not be used */
                if (!shift && !allow_root)
                        shift = XA_CHUNK_SHIFT;
                /* xas_expand() may have replaced the head; read it again */
                entry = xa_head_locked(xa);
                slot = &xa->xa_head;
        } else if (xas_error(xas)) {
                return NULL;
        } else if (node) {
                /* Already positioned inside a node: continue from there */
                unsigned int offset = xas->xa_offset;

                shift = node->shift;
                entry = xa_entry_locked(xa, node, offset);
                slot = &node->slots[offset];
        } else {
                /* Positioned at the head entry itself */
                shift = 0;
                entry = xa_head_locked(xa);
                slot = &xa->xa_head;
        }

        /* Descend, allocating the missing nodes, until we reach @order */
        while (shift > order) {
                shift -= XA_CHUNK_SHIFT;
                if (!entry) {
                        node = xas_alloc(xas, shift);
                        if (!node)
                                break;
                        if (xa_track_free(xa))
                                node_mark_all(node, XA_FREE_MARK);
                        rcu_assign_pointer(*slot, xa_mk_node(node));
                } else if (xa_is_node(entry)) {
                        node = xa_to_node(entry);
                } else {
                        /* A non-node entry occupies this slot; stop here */
                        break;
                }
                entry = xas_descend(xas, node);
                slot = &node->slots[xas->xa_offset];
        }

        return entry;
}

/**
 * xas_create_range() - Ensure that stores to this range will succeed
 * @xas: XArray operation state.
 *
 * Creates all of the slots in the range covered by @xas.  Sets @xas to
 * create single-index entries and positions it at the beginning of the
 * range.  This is for the benefit of users which have not yet been
 * converted to use multi-index entries.
 *
 * If creating a slot fails, the error is left in @xas and the original
 * xa_shift, xa_sibs and xa_index are restored.
 */
void xas_create_range(struct xa_state *xas)
{
        /* Saved so that they can be put back on the way out */
        unsigned long index = xas->xa_index;
        unsigned char shift = xas->xa_shift;
        unsigned char sibs = xas->xa_sibs;

        /* Move to the last index of the range and switch to single-index */
        xas->xa_index |= ((sibs + 1UL) << shift) - 1;
        if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift)
                xas->xa_offset |= sibs;
        xas->xa_shift = 0;
        xas->xa_sibs = 0;

        /* Work backwards through the range, one chunk of indices at a time */
        for (;;) {
                xas_create(xas, true);
                if (xas_error(xas))
                        goto restore;
                /* Done once the chunk containing the first index exists */
                if (xas->xa_index <= (index | XA_CHUNK_MASK))
                        goto success;
                xas->xa_index -= XA_CHUNK_SIZE;

                /*
                 * Step to the previous slot of the parent, climbing further
                 * for as long as the node just left was in slot 0 and its
                 * shift is below the range's original shift.
                 */
                for (;;) {
                        struct xa_node *node = xas->xa_node;
                        if (node->shift >= shift)
                                break;
                        xas->xa_node = xa_parent_locked(xas->xa, node);
                        xas->xa_offset = node->offset - 1;
                        if (node->offset != 0)
                                break;
                }
        }

restore:
        xas->xa_shift = shift;
        xas->xa_sibs = sibs;
        xas->xa_index = index;
        return;
success:
        /* xa_shift and xa_sibs deliberately stay 0 (single-index) here */
        xas->xa_index = index;
        if (xas->xa_node)
                xas_set_offset(xas);
}
EXPORT_SYMBOL_GPL(xas_create_range);

static void update_node(struct xa_state *xas, struct xa_node *node,
                int count, int values)
{
        if (!node || (!count && !values))
                return;

        node->count += count;
        node->nr_values += values;
        XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
        XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE);
        xas_update(xas, node);
        if (count < 0)
                xas_delete_node(xas);
}

/**
 * xas_store() - Store this entry in the XArray.
 * @xas: XArray operation state.
 * @entry: New entry.
 *
 * If @xas is operating on a multi-index entry, the entry returned by this
 * function is essentially meaningless (it may be an internal entry or it
 * may be %NULL, even if there are non-NULL entries at some of the indices
 * covered by the range).  This is not a problem for any current users,
 * and can be changed if needed.
 *
 * Storing a non-NULL @entry creates the slot with xas_create(); storing
 * %NULL only looks the slot up with xas_load().  If @xas is left in an
 * invalid state by that step, nothing is stored.
 *
 * Return: The old entry at this index.
 */
void *xas_store(struct xa_state *xas, void *entry)
{
        struct xa_node *node;
        void __rcu **slot = &xas->xa->xa_head;
        unsigned int offset, max;
        /* Net change in occupied slots / value entries of @node */
        int count = 0;
        int values = 0;
        void *first, *next;
        bool value = xa_is_value(entry);

        if (entry) {
                /* Node and zero entries may not be stored in the root */
                bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry);
                first = xas_create(xas, allow_root);
        } else {
                first = xas_load(xas);
        }

        if (xas_invalid(xas))
                return first;
        node = xas->xa_node;
        /* The walk stopped above the requested order: ignore the siblings */
        if (node && (xas->xa_shift < node->shift))
                xas->xa_sibs = 0;
        /* Storing the entry which is already there is a no-op */
        if ((first == entry) && !xas->xa_sibs)
                return first;

        next = first;
        offset = xas->xa_offset;
        max = xas->xa_offset + xas->xa_sibs;
        if (node) {
                slot = &node->slots[offset];
                if (xas->xa_sibs)
                        xas_squash_marks(xas);
        }
        if (!entry)
                xas_init_marks(xas);

        for (;;) {
                /*
                 * Must clear the marks before setting the entry to NULL,
                 * otherwise xas_for_each_marked may find a NULL entry and
                 * stop early.  rcu_assign_pointer contains a release barrier
                 * so the mark clearing will appear to happen before the
                 * entry is set to NULL.
                 */
                rcu_assign_pointer(*slot, entry);
                /* A replaced subtree is no longer reachable; free it */
                if (xa_is_node(next) && (!node || node->shift))
                        xas_free_nodes(xas, xa_to_node(next));
                /* Stored directly in the head: there is no node to account */
                if (!node)
                        break;
                count += !next - !entry;
                values += !xa_is_value(first) - !value;
                if (entry) {
                        if (offset == max)
                                break;
                        /* Slots after the first one hold sibling entries */
                        if (!xa_is_sibling(entry))
                                entry = xa_mk_sibling(xas->xa_offset);
                } else {
                        if (offset == XA_CHUNK_MASK)
                                break;
                }
                next = xa_entry_locked(xas->xa, node, ++offset);
                if (!xa_is_sibling(next)) {
                        /*
                         * When erasing, stop at the first non-sibling slot
                         * beyond the range being stored.
                         */
                        if (!entry && (offset > max))
                                break;
                        first = next;
                }
                slot++;
        }

        update_node(xas, node, count, values);
        return first;
}
EXPORT_SYMBOL_GPL(xas_store);

/**
 * xas_get_mark() - Returns the state of this mark.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * When @xas points at the head rather than into a node, the array-level
 * mark is reported.
 *
 * Return: true if the mark is set, false if the mark is clear or @xas
 * is in an error state.
 */
bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *node;

        if (xas_invalid(xas))
                return false;

        node = xas->xa_node;
        if (node)
                return node_get_mark(node, xas->xa_offset, mark);
        return xa_marked(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_get_mark);

/**
 * xas_set_mark() - Sets the mark on this entry and its parents.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * Sets the specified mark on this entry, then climbs towards the head
 * setting it on each ancestor's slot.  The climb stops as soon as
 * node_set_mark() returns true; if it reaches the top, the mark is set on
 * the array itself.  Does nothing if @xas has not been walked to an entry,
 * or is in an error state.
 */
void xas_set_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *cur;
        unsigned int pos;

        if (xas_invalid(xas))
                return;

        pos = xas->xa_offset;
        for (cur = xas->xa_node; cur; cur = xa_parent_locked(xas->xa, cur)) {
                /*
                 * Presumably a true return means the mark was already set
                 * here, in which case the ancestors already carry it.
                 */
                if (node_set_mark(cur, pos, mark))
                        return;
                pos = cur->offset;
        }

        if (!xa_marked(xas->xa, mark))
                xa_mark_set(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_set_mark);

/**
 * xas_clear_mark() - Clears the mark on this entry and its parents.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * Clears the specified mark on this entry, then climbs towards the head
 * clearing it on each ancestor's slot.  The climb stops at the first node
 * where node_clear_mark() returns false or where another slot still has
 * the mark; if it reaches the top, the mark is cleared on the array.
 * Does nothing if @xas has not been walked to an entry, or is in an
 * error state.
 */
void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *cur;
        unsigned int pos;

        if (xas_invalid(xas))
                return;

        pos = xas->xa_offset;
        for (cur = xas->xa_node; cur; cur = xa_parent_locked(xas->xa, cur)) {
                if (!node_clear_mark(cur, pos, mark) ||
                    node_any_mark(cur, mark))
                        return;
                pos = cur->offset;
        }

        if (xa_marked(xas->xa, mark))
                xa_mark_clear(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_clear_mark);

/**
 * xas_init_marks() - Initialise all marks for the entry
 * @xas: Array operations state.
 *
 * Puts every mark of the entry specified by @xas into its initial state:
 * XA_FREE_MARK is set when the array tracks free entries, and every other
 * mark is cleared.
 *
 * Each mark is handled by a separate call to xas_set_mark() or
 * xas_clear_mark(), so the tree may be climbed several times; this is
 * not as efficient as it could be.
 */
void xas_init_marks(const struct xa_state *xas)
{
        xa_mark_t cur = 0;

        while (1) {
                if (xa_track_free(xas->xa) && cur == XA_FREE_MARK)
                        xas_set_mark(xas, cur);
                else
                        xas_clear_mark(xas, cur);

                if (cur == XA_MARK_MAX)
                        return;
                mark_inc(cur);
        }
}
EXPORT_SYMBOL_GPL(xas_init_marks);

#ifdef CONFIG_XARRAY_MULTI
static unsigned int node_get_marks(struct xa_node *node, unsigned int offset)
{
        unsigned int marks = 0;
        xa_mark_t mark = XA_MARK_0;

        for (;;) {
                if (node_get_mark(node, offset, mark))
                        marks |= 1 << (__force unsigned int)mark;
                if (mark == XA_MARK_MAX)
                        break;
                mark_inc(mark);
        }

        return marks;
}

/*
 * node_mark_slots() - Set a mark on the first slot of each entry in a node.
 * @node: Node to mark.
 * @sibs: Number of sibling slots following each entry.
 * @mark: Mark number.
 *
 * With no siblings every slot gets the mark.  Otherwise only every
 * (@sibs + 1)th slot, starting with slot 0, is marked.
 */
static inline void node_mark_slots(struct xa_node *node, unsigned int sibs,
                xa_mark_t mark)
{
        int pos;

        if (!sibs) {
                node_mark_all(node, mark);
                return;
        }

        for (pos = 0; pos < XA_CHUNK_SIZE; pos += sibs + 1)
                node_set_mark(node, pos, mark);
}

static void node_set_marks(struct xa_node *node, unsigned int offset,
                        struct xa_node *child, unsigned int sibs,
                        unsigned int marks)
{
        xa_mark_t mark = XA_MARK_0;

        for (;;) {
                if (marks & (1 << (__force unsigned int)mark)) {
                        node_set_mark(node, offset, mark);
                        if (child)
                                node_mark_slots(child, sibs, mark);
                }
                if (mark == XA_MARK_MAX)
                        break;
                mark_inc(mark);
        }
}

/**
 * xas_split_alloc() - Allocate memory for splitting an entry.
 * @xas: XArray operation state.
 * @entry: New entry which will be stored in the array.
 * @order: Current entry order.
 * @gfp: Memory allocation flags.
 *
 * This function should be called before calling xas_split().
 * If necessary, it will allocate new nodes (and fill them with @entry)
 * to prepare for the upcoming split of an entry of @order size into
 * entries of the order stored in the @xas.
 *
 * Only a single level of new nodes is allocated, so @order must be
 * less than @xas->xa_shift + 2 * XA_CHUNK_SHIFT.  A larger @order triggers
 * a warning and is reported as -ENOMEM in @xas.  The same error is set,
 * after releasing anything already allocated, if a node allocation fails.
 *
 * Context: May sleep if @gfp flags permit.
 */
void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order,
                gfp_t gfp)
{
        /* One new node is needed per slot occupied by the old entry */
        unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
        unsigned int mask = xas->xa_sibs;

        /*
         * XXX: no support for splitting really large entries yet.
         * An entry whose order is two or more whole levels above the new
         * order would need two levels of new nodes, while the loop below
         * only builds one; the boundary value itself must be rejected too.
         */
        if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT <= order))
                goto nomem;
        /* Old and new entries live in the same node: nothing to allocate */
        if (xas->xa_shift + XA_CHUNK_SHIFT > order)
                return;

        do {
                unsigned int i;
                void *sibling = NULL;
                struct xa_node *node;

                node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
                if (!node)
                        goto nomem;
                node->array = xas->xa;
                /* Pre-fill: @entry at each aligned slot, siblings after it */
                for (i = 0; i < XA_CHUNK_SIZE; i++) {
                        if ((i & mask) == 0) {
                                RCU_INIT_POINTER(node->slots[i], entry);
                                sibling = xa_mk_sibling(i);
                        } else {
                                RCU_INIT_POINTER(node->slots[i], sibling);
                        }
                }
                /* Chain the spare nodes through ->parent off xas->xa_alloc */
                RCU_INIT_POINTER(node->parent, xas->xa_alloc);
                xas->xa_alloc = node;
        } while (sibs-- > 0);

        return;
nomem:
        xas_destroy(xas);
        xas_set_err(xas, -ENOMEM);
}
EXPORT_SYMBOL_GPL(xas_split_alloc);

/**
 * xas_split() - Split a multi-index entry into smaller entries.
 * @xas: XArray operation state.
 * @entry: New entry to store in the array.
 * @order: Current entry order.
 *
 * The size of the new entries is set in @xas.  The value in @entry is
 * copied to all the replacement entries.
 *
 * When the new entries belong one level below the node holding the old
 * entry, the nodes queued on @xas->xa_alloc are consumed, so
 * xas_split_alloc() must have been called beforehand.  Does nothing if the
 * lookup leaves @xas at the top of the tree.
 *
 * Context: Any context.  The caller should hold the xa_lock.
 */
void xas_split(struct xa_state *xas, void *entry, unsigned int order)
{
        /* Number of extra slots the old entry occupies in its node */
        unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
        unsigned int offset, marks;
        struct xa_node *node;
        void *curr = xas_load(xas);
        int values = 0;

        node = xas->xa_node;
        if (xas_top(node))
                return;

        /* Remember the old entry's marks so that the new entries get them */
        marks = node_get_marks(node, xas->xa_offset);

        /* Walk the old entry's slots from the last one down to the first */
        offset = xas->xa_offset + sibs;
        do {
                if (xas->xa_shift < node->shift) {
                        /* Replace this slot with a pre-filled child node */
                        struct xa_node *child = xas->xa_alloc;

                        xas->xa_alloc = rcu_dereference_raw(child->parent);
                        child->shift = node->shift - XA_CHUNK_SHIFT;
                        child->offset = offset;
                        child->count = XA_CHUNK_SIZE;
                        child->nr_values = xa_is_value(entry) ?
                                        XA_CHUNK_SIZE : 0;
                        RCU_INIT_POINTER(child->parent, node);
                        node_set_marks(node, offset, child, xas->xa_sibs,
                                        marks);
                        rcu_assign_pointer(node->slots[offset],
                                        xa_mk_node(child));
                        /* A node pointer is not a value entry */
                        if (xa_is_value(curr))
                                values--;
                        xas_update(xas, child);
                } else {
                        /* Same node: store @entry plus its sibling slots */
                        unsigned int canon = offset - xas->xa_sibs;

                        node_set_marks(node, canon, NULL, 0, marks);
                        rcu_assign_pointer(node->slots[canon], entry);
                        while (offset > canon)
                                rcu_assign_pointer(node->slots[offset--],
                                                xa_mk_sibling(canon));
                        values += (xa_is_value(entry) - xa_is_value(curr)) *
                                        (xas->xa_sibs + 1);
                }
        } while (offset-- > xas->xa_offset);

        node->nr_values += values;
        xas_update(xas, node);
}
EXPORT_SYMBOL_GPL(xas_split);
#endif

/**
 * xas_pause() - Pause a walk to drop a lock.
 * @xas: XArray operation state.
 *
 * Some users need to pause a walk and drop the lock they're holding in
 * order to yield to a higher priority thread or carry out an operation
 * on an entry.  Those users should call this function before they drop
 * the lock.  It resets the @xas to be suitable for the next iteration
 * of the loop after the user has reacquired the lock.  If most entries
 * found during a walk require you to call xas_pause(), the xa_for_each()
 * iterator may be more appropriate.
 *
 * On return @xas is in the restart state and @xas->xa_index is the first
 * index after the entry just processed.  If advancing the index wraps it
 * to 0, @xas is put in the bounds state instead.  Does nothing if @xas is
 * invalid.
 *
 * Note that xas_pause() only works for forward iteration.  If a user needs
 * to pause a reverse iteration, we will need a xas_pause_rev().
 */
void xas_pause(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        if (xas_invalid(xas))
                return;

        xas->xa_node = XAS_RESTART;
        if (node) {
                unsigned long offset = xas->xa_offset;

                /* Skip the slots which are siblings of the current entry */
                while (++offset < XA_CHUNK_SIZE) {
                        if (!xa_is_sibling(xa_entry(xas->xa, node, offset)))
                                break;
                }
                /*
                 * xa_index may point into the middle of the range covered
                 * by the current slot.  Drop the bits below node->shift
                 * before advancing, otherwise the new index overshoots the
                 * start of the next slot and indices are skipped.
                 */
                xas->xa_index &= ~0UL << node->shift;
                xas->xa_index += (offset - xas->xa_offset) << node->shift;
                if (xas->xa_index == 0)
                        xas->xa_node = XAS_BOUNDS;
        } else {
                xas->xa_index++;
        }
}
EXPORT_SYMBOL_GPL(xas_pause);

/*
 * __xas_prev() - Find the previous entry in the XArray.
 * @xas: XArray operation state.
 *
 * Helper function for xas_prev() which handles all the complex cases
 * out of line.
 */
void *__xas_prev(struct xa_state *xas)
{
        void *entry;

        if (!xas_frozen(xas->xa_node))
                xas->xa_index--;
        if (!xas->xa_node)
                return set_bounds(xas);
        if (xas_not_node(xas->xa_node))
                return xas_load(xas);

        if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
                xas->xa_offset--;

        while (xas->xa_offset == 255) {
                xas->xa_offset = xas->xa_node->offset - 1;
                xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                if (!xas->xa_node)
                        return set_bounds(xas);
        }

        for (;;) {
                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                if (!xa_is_node(entry))
                        return entry;

                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
        }
}
EXPORT_SYMBOL_GPL(__xas_prev);

/*
 * __xas_next() - Find the next entry in the XArray.
 * @xas: XArray operation state.
 *
 * Helper function for xas_next() which handles all the complex cases
 * out of line.
 */
void *__xas_next(struct xa_state *xas)
{
        void *entry;

        if (!xas_frozen(xas->xa_node))
                xas->xa_index++;
        if (!xas->xa_node)
                return set_bounds(xas);
        if (xas_not_node(xas->xa_node))
                return xas_load(xas);

        if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node))
                xas->xa_offset++;

        while (xas->xa_offset == XA_CHUNK_SIZE) {
                xas->xa_offset = xas->xa_node->offset + 1;
                xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                if (!xas->xa_node)
                        return set_bounds(xas);
        }

        for (;;) {
                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                if (!xa_is_node(entry))
                        return entry;

                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
        }
}
EXPORT_SYMBOL_GPL(__xas_next);

/**
 * xas_find() - Find the next present entry in the XArray.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 *
 * If the @xas has not yet been walked to an entry, return the entry
 * which has an index >= xas.xa_index.  If it has been walked, the entry
 * currently being pointed at has been processed, and so we move to the
 * next entry.
 *
 * If no entry is found and the array is smaller than @max, the iterator
 * is set to the smallest index not yet in the array.  This allows @xas
 * to be immediately passed to xas_store().
 *
 * Return: The entry, if found, otherwise %NULL.
 */
void *xas_find(struct xa_state *xas, unsigned long max)
{
        void *entry;

        /* Nothing more to find after an error or once out of bounds */
        if (xas_error(xas) || xas->xa_node == XAS_BOUNDS)
                return NULL;
        if (xas->xa_index > max)
                return set_bounds(xas);

        if (!xas->xa_node) {
                /* The head entry was already processed: next index is 1 */
                xas->xa_index = 1;
                return set_bounds(xas);
        } else if (xas->xa_node == XAS_RESTART) {
                /* Not walked yet: look at the entry at xa_index itself */
                entry = xas_load(xas);
                if (entry || xas_not_node(xas->xa_node))
                        return entry;
        } else if (!xas->xa_node->shift &&
                    xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) {
                /* Bring xa_offset back in line with xa_index */
                xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1;
        }

        xas_next_offset(xas);

        while (xas->xa_node && (xas->xa_index <= max)) {
                /* End of this node: continue after it in the parent */
                if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
                        xas->xa_offset = xas->xa_node->offset + 1;
                        xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                        continue;
                }

                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                if (xa_is_node(entry)) {
                        /* Descend into the child, starting at its first slot */
                        xas->xa_node = xa_to_node(entry);
                        xas->xa_offset = 0;
                        continue;
                }
                /* Sibling entries are never returned to the caller */
                if (entry && !xa_is_sibling(entry))
                        return entry;

                xas_next_offset(xas);
        }

        /* Walked off the top of the tree */
        if (!xas->xa_node)
                xas->xa_node = XAS_BOUNDS;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find);

/**
 * xas_find_marked() - Find the next marked entry in the XArray.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 * @mark: Mark number to search for.
 *
 * If the @xas has not yet been walked to an entry, return the marked entry
 * which has an index >= xas.xa_index.  If it has been walked, the entry
 * currently being pointed at has been processed, and so we return the
 * first marked entry with an index > xas.xa_index.
 *
 * If no marked entry is found and the array is smaller than @max, @xas is
 * set to the bounds state and xas->xa_index is set to the smallest index
 * not yet in the array.  This allows @xas to be immediately passed to
 * xas_store().
 *
 * If no entry is found before @max is reached, @xas is set to the restart
 * state.
 *
 * Sibling entries are internal to the XArray and are never returned.
 *
 * Return: The entry, if found, otherwise %NULL.
 */
void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
{
        /* Whether the search starts after, rather than at, xa_offset */
        bool advance = true;
        unsigned int offset;
        void *entry;

        if (xas_error(xas))
                return NULL;
        if (xas->xa_index > max)
                goto max;

        if (!xas->xa_node) {
                /* The head entry was already processed: next index is 1 */
                xas->xa_index = 1;
                goto out;
        } else if (xas_top(xas->xa_node)) {
                /* Not walked yet: start from the head */
                advance = false;
                entry = xa_head(xas->xa);
                xas->xa_node = NULL;
                if (xas->xa_index > max_index(entry))
                        goto out;
                if (!xa_is_node(entry)) {
                        if (xa_marked(xas->xa, mark))
                                return entry;
                        xas->xa_index = 1;
                        goto out;
                }
                xas->xa_node = xa_to_node(entry);
                xas->xa_offset = xas->xa_index >> xas->xa_node->shift;
        }

        while (xas->xa_index <= max) {
                /* End of this node: continue after it in the parent */
                if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
                        xas->xa_offset = xas->xa_node->offset + 1;
                        xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                        if (!xas->xa_node)
                                break;
                        advance = false;
                        continue;
                }

                /* Start the search from the canonical slot of a sibling */
                if (!advance) {
                        entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                        if (xa_is_sibling(entry)) {
                                xas->xa_offset = xa_to_sibling(entry);
                                xas_move_index(xas, xas->xa_offset);
                        }
                }

                offset = xas_find_chunk(xas, advance, mark);
                if (offset > xas->xa_offset) {
                        advance = false;
                        xas_move_index(xas, offset);
                        /* Mind the wrap */
                        if ((xas->xa_index - 1) >= max)
                                goto max;
                        xas->xa_offset = offset;
                        if (offset == XA_CHUNK_SIZE)
                                continue;
                }

                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK))
                        continue;
                /*
                 * A sibling entry is an internal entry and must not be
                 * handed to the caller; go round again, which restarts the
                 * search from its canonical slot.
                 */
                if (xa_is_sibling(entry))
                        continue;
                if (!xa_is_node(entry))
                        return entry;
                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
        }

out:
        if (xas->xa_index > max)
                goto max;
        return set_bounds(xas);
max:
        xas->xa_node = XAS_RESTART;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_marked);

/**
 * xas_find_conflict() - Find the next present entry in a range.
 * @xas: XArray operation state.
 *
 * The @xas describes both a range and a position within that range.
 *
 * On the first call (with @xas not yet walked) the entry at the current
 * position is checked; later calls continue with the slots after it.
 * When the range is exhausted inside a node of the range's own shift,
 * @xas->xa_offset is moved back by @xas->xa_sibs before returning.
 *
 * Context: Any context.  Expects xa_lock to be held.
 * Return: The next entry in the range covered by @xas or %NULL.
 */
void *xas_find_conflict(struct xa_state *xas)
{
        void *curr;

        if (xas_error(xas))
                return NULL;

        if (!xas->xa_node)
                return NULL;

        if (xas_top(xas->xa_node)) {
                /* First call: walk down to the entry at xa_index */
                curr = xas_start(xas);
                if (!curr)
                        return NULL;
                while (xa_is_node(curr)) {
                        struct xa_node *node = xa_to_node(curr);
                        curr = xas_descend(xas, node);
                }
                if (curr)
                        return curr;
        }

        /* The tree has no node as deep as the range: nothing conflicts */
        if (xas->xa_node->shift > xas->xa_shift)
                return NULL;

        for (;;) {
                if (xas->xa_node->shift == xas->xa_shift) {
                        /* Reached the last slot of the range in this node */
                        if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs)
                                break;
                } else if (xas->xa_offset == XA_CHUNK_MASK) {
                        /* Finished a lower node: climb back to its parent */
                        xas->xa_offset = xas->xa_node->offset;
                        xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node);
                        if (!xas->xa_node)
                                break;
                        continue;
                }
                curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset);
                if (xa_is_sibling(curr))
                        continue;
                /* Follow node entries down through their first slot */
                while (xa_is_node(curr)) {
                        xas->xa_node = xa_to_node(curr);
                        xas->xa_offset = 0;
                        curr = xa_entry_locked(xas->xa, xas->xa_node, 0);
                }
                if (curr)
                        return curr;
        }
        xas->xa_offset -= xas->xa_sibs;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_conflict);

/**
 * xa_load() - Load an entry from an XArray.
 * @xa: XArray.
 * @index: index into array.
 *
 * The lookup is repeated for as long as xas_retry() asks for it.  A zero
 * entry is reported to the caller as %NULL.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The entry at @index in @xa.
 */
void *xa_load(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        void *found;

        rcu_read_lock();
        for (;;) {
                found = xas_load(&xas);
                if (xa_is_zero(found))
                        found = NULL;
                if (!xas_retry(&xas, found))
                        break;
        }
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL(xa_load);

/*
 * Convert the outcome of an operation into the value returned to API callers:
 * a zero entry becomes NULL, and when @xas reports an error the contents of
 * xas->xa_node are returned in place of @curr.
 */
static void *xas_result(struct xa_state *xas, void *curr)
{
        if (xa_is_zero(curr))
                return NULL;

        return xas_error(xas) ? xas->xa_node : curr;
}

/**
 * __xa_erase() - Remove the entry at an index; caller holds the lock.
 * @xa: XArray.
 * @index: Index into array.
 *
 * Once this function has returned, a load from @index yields %NULL.
 * When @index belongs to a multi-index entry, every index of that entry
 * is erased and none of them remains part of a multi-index entry.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 * Return: The entry which used to be at this index.
 */
void *__xa_erase(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        void *old = xas_store(&xas, NULL);

        return xas_result(&xas, old);
}
EXPORT_SYMBOL(__xa_erase);

/**
 * xa_erase() - Remove the entry at an index.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Locking wrapper around __xa_erase().  Once this function has returned,
 * a load from @index yields %NULL.  When @index belongs to a multi-index
 * entry, every index of that entry is erased and none of them remains
 * part of a multi-index entry.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * Return: The entry which used to be at this index.
 */
void *xa_erase(struct xarray *xa, unsigned long index)
{
        void *old;

        xa_lock(xa);
        old = __xa_erase(xa, index);
        xa_unlock(xa);

        return old;
}
EXPORT_SYMBOL(xa_erase);

/**
 * __xa_store() - Store an entry at an index; caller holds the lock.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * The xa_lock must already be held by the caller.  If memory has to be
 * allocated, the lock is dropped for the allocation and taken again
 * afterwards.
 *
 * On an array that tracks free entries, a %NULL @entry is stored as
 * %XA_ZERO_ENTRY and the free mark is cleared after each store attempt.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);
        void *old;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return XA_ERROR(-EINVAL);
        if (xa_track_free(xa) && !entry)
                entry = XA_ZERO_ENTRY;

        /* Repeat the store for as long as __xas_nomem() asks for a retry. */
        for (;;) {
                old = xas_store(&xas, entry);
                if (xa_track_free(xa))
                        xas_clear_mark(&xas, XA_FREE_MARK);
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_result(&xas, old);
}
EXPORT_SYMBOL(__xa_store);

/**
 * xa_store() - Store an entry at an index.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Locking wrapper around __xa_store().  Once this function has returned,
 * loads from @index yield @entry.  A store into an existing multi-index
 * entry updates the entry of every index it covers.  The marks associated
 * with @index are left alone unless @entry is %NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * May sleep if the @gfp flags permit.
 * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry
 * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation
 * failed.
 */
void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        void *old;

        xa_lock(xa);
        old = __xa_store(xa, index, entry, gfp);
        xa_unlock(xa);

        return old;
}
EXPORT_SYMBOL(xa_store);

/**
 * __xa_cmpxchg() - Replace the entry at an index if it matches @old.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * @entry is stored only when the entry currently loaded from @index equals
 * @old.  The xa_lock must already be held by the caller.  If memory has to
 * be allocated, the lock is dropped for the allocation and taken again
 * afterwards.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);
        void *present;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return XA_ERROR(-EINVAL);

        for (;;) {
                present = xas_load(&xas);
                if (present == old) {
                        xas_store(&xas, entry);
                        /*
                         * Filling a previously empty slot on a free-tracking
                         * array: the slot is no longer free.
                         */
                        if (xa_track_free(xa) && entry && !present)
                                xas_clear_mark(&xas, XA_FREE_MARK);
                }
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_result(&xas, present);
}
EXPORT_SYMBOL(__xa_cmpxchg);

/**
 * __xa_insert() - Store an entry only where none is present yet.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * A %NULL @entry is stored as a reserved entry (like xa_reserve()) when
 * the index is empty.  The insert fails when a reserved entry is already
 * present, even though a load from that index returns NULL.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);
        void *present;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;
        if (!entry)
                entry = XA_ZERO_ENTRY;

        for (;;) {
                present = xas_load(&xas);
                if (present) {
                        /* The slot is occupied: refuse to overwrite it. */
                        xas_set_err(&xas, -EBUSY);
                } else {
                        xas_store(&xas, entry);
                        if (xa_track_free(xa))
                                xas_clear_mark(&xas, XA_FREE_MARK);
                }
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_insert);

#ifdef CONFIG_XARRAY_MULTI
/*
 * Point @xas at @first and derive xa_shift and xa_sibs from the range
 * [@first, @last].  xa_store_range() calls this before each xas_store()
 * and then advances its start index by xas_size().
 */
static void xas_set_range(struct xa_state *xas, unsigned long first,
                unsigned long last)
{
        unsigned int shift = 0;
        /* Number of indices that follow @first inside the range. */
        unsigned long sibs = last - first;
        /* XA_CHUNK_MASK doubles as "no remainder recorded yet". */
        unsigned int offset = XA_CHUNK_MASK;

        xas_set(xas, first);

        /*
         * Move up one level (XA_CHUNK_SHIFT bits) per iteration while @first
         * is aligned to a whole chunk at the current level.
         */
        while ((first & XA_CHUNK_MASK) == 0) {
                /* Fewer than XA_CHUNK_MASK slots follow: stay at this level. */
                if (sibs < XA_CHUNK_MASK)
                        break;
                /*
                 * Exactly XA_CHUNK_MASK slots follow, but an earlier
                 * iteration recorded a remainder below XA_CHUNK_MASK.
                 */
                if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK))
                        break;
                shift += XA_CHUNK_SHIFT;
                /* Record the low-order remainder once; it gates the test above. */
                if (offset == XA_CHUNK_MASK)
                        offset = sibs & XA_CHUNK_MASK;
                sibs >>= XA_CHUNK_SHIFT;
                first >>= XA_CHUNK_SHIFT;
        }

        /* From here on, offset is the slot of @first at the chosen level. */
        offset = first & XA_CHUNK_MASK;
        /* Keep offset + sibs within a single chunk. */
        if (offset + sibs > XA_CHUNK_MASK)
                sibs = XA_CHUNK_MASK - offset;
        /* Drop one slot if the covered span would otherwise end beyond @last. */
        if ((((first + sibs + 1) << shift) - 1) > last)
                sibs -= 1;

        xas->xa_shift = shift;
        xas->xa_sibs = sibs;
}

/**
 * xa_store_range() - Store this entry at a range of indices in the XArray.
 * @xa: XArray.
 * @first: First index to affect.
 * @last: Last index to affect.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * After this function returns, loads from any index between @first and @last,
 * inclusive will return @entry.
 * Storing into an existing multi-index entry updates the entry of every index.
 * The marks associated with the affected indices are unaffected unless
 * @entry is %NULL.
 *
 * Context: Process context.  Takes and releases the xa_lock.  May sleep
 * if the @gfp flags permit.
 * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in
 * an XArray, or xa_err(-ENOMEM) if memory allocation failed.
 */
void *xa_store_range(struct xarray *xa, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, 0);

        if (WARN_ON_ONCE(xa_is_internal(entry)))
                return XA_ERROR(-EINVAL);
        /* An inverted range is rejected without a warning. */
        if (last < first)
                return XA_ERROR(-EINVAL);

        /* The whole locked section is repeated while xas_nomem() asks for it. */
        do {
                xas_lock(&xas);
                if (entry) {
                        /*
                         * Before storing a non-NULL entry, call xas_create()
                         * for @last.  The order is the lowest set bit of
                         * last + 1, or BITS_PER_LONG when last + 1 wraps to 0.
                         */
                        unsigned int order = BITS_PER_LONG;
                        if (last + 1)
                                order = __ffs(last + 1);
                        xas_set_order(&xas, last, order);
                        xas_create(&xas, true);
                        if (xas_error(&xas))
                                goto unlock;
                }
                /*
                 * Store the range piece by piece: xas_set_range() picks the
                 * piece starting at @first, and @first then moves past it.
                 * @first keeps its advanced value across retries of the
                 * outer loop.
                 * NOTE(review): if first + xas_size() wraps around
                 * ULONG_MAX the loop condition remains true -- confirm how
                 * last == ULONG_MAX is meant to be handled.
                 */
                do {
                        xas_set_range(&xas, first, last);
                        xas_store(&xas, entry);
                        if (xas_error(&xas))
                                goto unlock;
                        first += xas_size(&xas);
                } while (first <= last);
unlock:
                xas_unlock(&xas);
        } while (xas_nomem(&xas, gfp));

        return xas_result(&xas, NULL);
}
EXPORT_SYMBOL(xa_store_range);

/**
 * xas_get_order() - Report the order of the entry @xas points at.
 * @xas: XArray operation state.
 *
 * Must be called after xas_load(), with @xas not in an error state.
 * The order is the shift of the current node plus the number of times the
 * slot at a power-of-two distance from the current offset holds a sibling
 * entry.
 *
 * Return: A number between 0 and 63 indicating the order of the entry.
 */
int xas_get_order(struct xa_state *xas)
{
        unsigned int slot;
        int order;

        if (!xas->xa_node)
                return 0;

        /* Probe offset + 1, + 2, + 4, ... while those slots are siblings. */
        for (order = 0; ; order++) {
                slot = xas->xa_offset + (1 << order);
                if (slot >= XA_CHUNK_SIZE ||
                    !xa_is_sibling(xa_entry(xas->xa, xas->xa_node, slot)))
                        break;
        }

        return order + xas->xa_node->shift;
}
EXPORT_SYMBOL_GPL(xas_get_order);

/**
 * xa_get_order() - Report the order of the entry at an index.
 * @xa: XArray.
 * @index: Index of the entry.
 *
 * Loads @index under the RCU read lock and, if an entry is present, asks
 * xas_get_order() for its order.  An empty index reports order 0.
 *
 * Return: A number between 0 and 63 indicating the order of the entry.
 */
int xa_get_order(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        int order;

        rcu_read_lock();
        order = xas_load(&xas) ? xas_get_order(&xas) : 0;
        rcu_read_unlock();

        return order;
}
EXPORT_SYMBOL(xa_get_order);
#endif /* CONFIG_XARRAY_MULTI */

/**
 * __xa_alloc() - Store an entry at a free index chosen by the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range for allocated ID.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa between @limit.min and @limit.max,
 * writes its index through @id, then stores @entry at that index.
 * A concurrent lookup will not see an uninitialised @id.
 *
 * The xarray must have been initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
int __xa_alloc(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, gfp_t gfp)
{
        XA_STATE(xas, xa, 0);

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;
        if (WARN_ON_ONCE(!xa_track_free(xa)))
                return -EINVAL;

        if (!entry)
                entry = XA_ZERO_ENTRY;

        for (;;) {
                /* Every attempt searches again from the bottom of the range. */
                xas.xa_index = limit.min;
                xas_find_marked(&xas, limit.max, XA_FREE_MARK);
                if (xas.xa_node != XAS_RESTART)
                        *id = xas.xa_index;
                else
                        xas_set_err(&xas, -EBUSY);
                xas_store(&xas, entry);
                xas_clear_mark(&xas, XA_FREE_MARK);
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_alloc);

/**
 * __xa_alloc_cyclic() - Store an entry at a free index, searching cyclically.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Looks for an empty entry in @xa between @limit.min and @limit.max,
 * writes its index through @id, then stores @entry at that index.
 * A concurrent lookup will not see an uninitialised @id.
 * The search starts at @next and wraps around to @limit.min if necessary.
 *
 * The xarray must have been initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        u32 lowest = limit.min;
        int status;

        /* First pass: search from *next (or the lower limit, if higher). */
        limit.min = max(lowest, *next);
        status = __xa_alloc(xa, id, entry, limit, gfp);
        if (status == 0 && (xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED)) {
                /* A wrap recorded by an earlier call is reported now. */
                xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED;
                status = 1;
        }

        /* Second pass: start over from the bottom of the range. */
        if (status < 0 && limit.min > lowest) {
                limit.min = lowest;
                status = __xa_alloc(xa, id, entry, limit, gfp);
                if (status == 0)
                        status = 1;
        }

        if (status < 0)
                return status;

        *next = *id + 1;
        /* The u32 counter overflowed: remember the wrap for the next call. */
        if (*next == 0)
                xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED;
        return status;
}
EXPORT_SYMBOL(__xa_alloc_cyclic);

/**
 * __xa_set_mark() - Set a mark on an entry; caller holds the lock.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Nothing happens when the entry at @index is %NULL: a mark cannot be
 * set on a %NULL entry.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 */
void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);

        if (!xas_load(&xas))
                return;

        xas_set_mark(&xas, mark);
}
EXPORT_SYMBOL(__xa_set_mark);

/**
 * __xa_clear_mark() - Clear a mark on an entry; caller holds the lock.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * The mark is only touched when the entry loaded from @index is not %NULL.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 */
void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);

        if (!xas_load(&xas))
                return;

        xas_clear_mark(&xas, mark);
}
EXPORT_SYMBOL(__xa_clear_mark);

/**
 * xa_get_mark() - Test whether a mark is set on the entry at an index.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Only the RCU read lock is taken, so the answer may already be stale
 * when this function returns.  Use a lock if a stable result is needed.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: True if the entry at @index has this mark set, false if it doesn't.
 */
bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);
        bool marked = false;
        void *entry;

        rcu_read_lock();
        entry = xas_start(&xas);
        /* Keep descending for as long as the mark is set at this level. */
        while (xas_get_mark(&xas, mark)) {
                if (!xa_is_node(entry)) {
                        /* Reached a non-node entry with the mark still set. */
                        marked = true;
                        break;
                }
                entry = xas_descend(&xas, xa_to_node(entry));
        }
        rcu_read_unlock();

        return marked;
}
EXPORT_SYMBOL(xa_get_mark);

/**
 * xa_set_mark() - Set this mark on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Locking wrapper: calls __xa_set_mark() with the xa_lock held.
 * Attempting to set a mark on a %NULL entry does not succeed.
 *
 * Context: Process context.  Takes and releases the xa_lock.
 */
void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        xa_lock(xa);
        __xa_set_mark(xa, index, mark);
        xa_unlock(xa);
}
EXPORT_SYMBOL(xa_set_mark);

/**
 * xa_clear_mark() - Clear this mark on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Locking wrapper: calls __xa_clear_mark() with the xa_lock held.
 * Clearing a mark always succeeds.
 *
 * Context: Process context.  Takes and releases the xa_lock.
 */
void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        xa_lock(xa);
        __xa_clear_mark(xa, index, mark);
        xa_unlock(xa);
}
EXPORT_SYMBOL(xa_clear_mark);

/**
 * xa_find() - Find the first matching entry at or after an index.
 * @xa: XArray.
 * @indexp: Pointer to an index.
 * @max: Maximum index to search to.
 * @filter: Selection criterion.
 *
 * Returns the entry in @xa that matches @filter and has the lowest index
 * that is at least @indexp and no more than @max.  When an entry is found,
 * @indexp is updated to the index of that entry.  Only the RCU read lock
 * protects the search, so entries being added at the same time may be
 * missed.  An %XA_RETRY_ENTRY is never returned; use xas_find() if retry
 * entries need to be seen.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The entry, if found, otherwise %NULL.
 */
void *xa_find(struct xarray *xa, unsigned long *indexp,
                        unsigned long max, xa_mark_t filter)
{
        XA_STATE(xas, xa, *indexp);
        /* Filter values below XA_MAX_MARKS select a marked search. */
        const bool by_mark = (__force unsigned int)filter < XA_MAX_MARKS;
        void *entry;

        rcu_read_lock();
        do {
                entry = by_mark ? xas_find_marked(&xas, max, filter) :
                                  xas_find(&xas, max);
        } while (xas_retry(&xas, entry));
        rcu_read_unlock();

        if (!entry)
                return NULL;

        *indexp = xas.xa_index;
        return entry;
}
EXPORT_SYMBOL(xa_find);

/*
 * Report whether the part of xas->xa_index that lies within the current
 * node is greater than the first index covered by the current slot.
 * Always false without CONFIG_XARRAY_MULTI or when @xas has no node.
 */
static bool xas_sibling(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        unsigned long mask, slot_start;

        if (!IS_ENABLED(CONFIG_XARRAY_MULTI))
                return false;
        if (!node)
                return false;

        mask = (XA_CHUNK_SIZE << node->shift) - 1;
        slot_start = (unsigned long)xas->xa_offset << node->shift;

        return (xas->xa_index & mask) > slot_start;
}

/**
 * xa_find_after() - Find the first matching entry after an index.
 * @xa: XArray.
 * @indexp: Pointer to an index.
 * @max: Maximum index to search to.
 * @filter: Selection criterion.
 *
 * Returns the entry in @xa that matches @filter and has the lowest index
 * that is above @indexp and no more than @max.  When an entry is found,
 * @indexp is updated to the index of that entry.  Only the RCU read lock
 * protects the search, so entries being added at the same time may be
 * missed.  An %XA_RETRY_ENTRY is never returned; use xas_find() if retry
 * entries need to be seen.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The pointer, if found, otherwise %NULL.
 */
void *xa_find_after(struct xarray *xa, unsigned long *indexp,
                        unsigned long max, xa_mark_t filter)
{
        XA_STATE(xas, xa, *indexp + 1);
        /* Filter values below XA_MAX_MARKS select a marked search. */
        const bool by_mark = (__force unsigned int)filter < XA_MAX_MARKS;
        void *entry;

        /* *indexp + 1 wrapped around to 0: nothing can follow it. */
        if (!xas.xa_index)
                return NULL;

        rcu_read_lock();
        do {
                entry = by_mark ? xas_find_marked(&xas, max, filter) :
                                  xas_find(&xas, max);
                if (xas_invalid(&xas))
                        break;
                /* Search again on a sibling hit or when a retry is requested. */
        } while (xas_sibling(&xas) || xas_retry(&xas, entry));
        rcu_read_unlock();

        if (!entry)
                return NULL;

        *indexp = xas.xa_index;
        return entry;
}
EXPORT_SYMBOL(xa_find_after);

/*
 * Copy up to @n entries found by xas_for_each() up to index @max into @dst,
 * skipping entries for which xas_retry() is true.  Runs under the RCU read
 * lock and returns the number of entries copied.
 */
static unsigned int xas_extract_present(struct xa_state *xas, void **dst,
                        unsigned long max, unsigned int n)
{
        unsigned int copied = 0;
        void *entry;

        rcu_read_lock();
        xas_for_each(xas, entry, max) {
                if (!xas_retry(xas, entry)) {
                        dst[copied] = entry;
                        if (++copied == n)
                                break;
                }
        }
        rcu_read_unlock();

        return copied;
}

/*
 * Copy up to @n entries found by xas_for_each_marked() for @mark, up to index
 * @max, into @dst, skipping entries for which xas_retry() is true.  Runs under
 * the RCU read lock and returns the number of entries copied.
 */
static unsigned int xas_extract_marked(struct xa_state *xas, void **dst,
                        unsigned long max, unsigned int n, xa_mark_t mark)
{
        unsigned int copied = 0;
        void *entry;

        rcu_read_lock();
        xas_for_each_marked(xas, entry, max, mark) {
                if (!xas_retry(xas, entry)) {
                        dst[copied] = entry;
                        if (++copied == n)
                                break;
                }
        }
        rcu_read_unlock();

        return copied;
}

/**
 * xa_extract() - Copy selected entries from the XArray into a normal array.
 * @xa: The source XArray to copy from.
 * @dst: The buffer to copy entries into.
 * @start: The first index in the XArray eligible to be selected.
 * @max: The last index in the XArray eligible to be selected.
 * @n: The maximum number of entries to copy.
 * @filter: Selection criterion.
 *
 * At most @n entries matching @filter are copied out of the XArray; their
 * indices lie between @start and @max, inclusive.
 *
 * @filter is either an XArray mark value, which selects the entries carrying
 * that mark, or %XA_PRESENT, which selects every entry that is not %NULL.
 *
 * The result is not necessarily a snapshot of the XArray at one moment in
 * time.  For example, if another thread stores to index 5, then index 10,
 * xa_extract() may return the old contents of index 5 and the new contents
 * of index 10.  Indices that are not modified while this function runs are
 * not skipped.
 *
 * For stronger guarantees, hold the xa_lock across calls to this function
 * to prevent concurrent modification.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The number of entries copied.
 */
unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start,
                        unsigned long max, unsigned int n, xa_mark_t filter)
{
        XA_STATE(xas, xa, start);

        if (n == 0)
                return 0;

        /* Filter values of XA_MAX_MARKS and above select every present entry. */
        if ((__force unsigned int)filter >= XA_MAX_MARKS)
                return xas_extract_present(&xas, dst, max, n);

        return xas_extract_marked(&xas, dst, max, n, filter);
}
EXPORT_SYMBOL(xa_extract);

/**
 * xa_delete_node() - Private interface for workingset code.
 * @node: Node to be removed from the tree.
 * @update: Function to call to update ancestor nodes.
 *
 * Builds an operation state that addresses @node's slot in its parent and
 * stores %NULL there.
 *
 * Context: xa_lock must be held on entry and will not be released.
 */
void xa_delete_node(struct xa_node *node, xa_update_node_t update)
{
        /* Shift of the level above @node. */
        unsigned int parent_shift = node->shift + XA_CHUNK_SHIFT;
        struct xa_state xas = {
                .xa = node->array,
                .xa_index = (unsigned long)node->offset << parent_shift,
                .xa_shift = parent_shift,
                .xa_offset = node->offset,
                .xa_node = xa_parent_locked(node->array, node),
                .xa_update = update,
        };

        xas_store(&xas, NULL);
}
EXPORT_SYMBOL_GPL(xa_delete_node);        /* For the benefit of the test suite */

/**
 * xa_destroy() - Free all internal data structures.
 * @xa: XArray.
 *
 * When this function returns, the XArray is empty and all memory allocated
 * for its internal data structures has been freed.  The objects referenced
 * by the XArray are not freed; that remains the caller's responsibility.
 *
 * Context: Any context.  Takes and releases the xa_lock, interrupt-safe.
 */
void xa_destroy(struct xarray *xa)
{
        XA_STATE(xas, xa, 0);
        unsigned long flags;
        void *head;

        xas.xa_node = NULL;
        xas_lock_irqsave(&xas, flags);
        /* Detach the old head before resetting marks and freeing nodes. */
        head = xa_head_locked(xa);
        RCU_INIT_POINTER(xa->xa_head, NULL);
        xas_init_marks(&xas);
        if (xa_zero_busy(xa))
                xa_mark_clear(xa, XA_FREE_MARK);
        /* lockdep checks we're still holding the lock in xas_free_nodes() */
        if (xa_is_node(head))
                xas_free_nodes(&xas, xa_to_node(head));
        xas_unlock_irqrestore(&xas, flags);
}
EXPORT_SYMBOL(xa_destroy);

#ifdef XA_DEBUG
/*
 * Print the header fields and every mark word of @node on one log line.
 * A NULL @node prints nothing; a pointer with either of its two low bits
 * set is printed as an address only and is not dereferenced.
 */
void xa_dump_node(const struct xa_node *node)
{
        unsigned int mark, word;

        if (!node)
                return;
        if ((unsigned long)node & 3) {
                pr_cont("node %px\n", node);
                return;
        }

        pr_cont("node %px %s %d parent %px shift %d count %d values %d "
                "array %px list %px %px marks",
                node, node->parent ? "offset" : "max", node->offset,
                node->parent, node->shift, node->count, node->nr_values,
                node->array, node->private_list.prev, node->private_list.next);
        for (mark = 0; mark < XA_MAX_MARKS; mark++) {
                for (word = 0; word < XA_MARK_LONGS; word++)
                        pr_cont(" %lx", node->marks[mark][word]);
        }
        pr_cont("\n");
}

/*
 * Print the index prefix for a dumped entry: a single index when @shift is
 * zero, "0-<max>" when @shift is at least BITS_PER_LONG, and otherwise the
 * range from @index to @index with its low @shift bits set.
 */
void xa_dump_index(unsigned long index, unsigned int shift)
{
        if (shift == 0) {
                pr_info("%lu: ", index);
                return;
        }
        if (shift >= BITS_PER_LONG) {
                pr_info("0-%lu: ", ~0UL);
                return;
        }

        pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1));
}

/*
 * Print @entry, prefixed with the index range it covers.  A node entry with
 * a non-zero @shift is dumped with xa_dump_node() and then each of its slots
 * is dumped recursively.  NULL entries print nothing.
 */
void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift)
{
        if (!entry)
                return;

        xa_dump_index(index, shift);

        if (xa_is_node(entry)) {
                struct xa_node *node;
                unsigned long slot;

                if (shift == 0) {
                        pr_cont("%px\n", entry);
                        return;
                }

                node = xa_to_node(entry);
                xa_dump_node(node);
                for (slot = 0; slot < XA_CHUNK_SIZE; slot++)
                        xa_dump_entry(node->slots[slot],
                                      index + (slot << node->shift),
                                      node->shift);
                return;
        }

        if (xa_is_value(entry)) {
                pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry),
                                                xa_to_value(entry), entry);
                return;
        }
        if (!xa_is_internal(entry)) {
                pr_cont("%px\n", entry);
                return;
        }

        /* Only internal entries remain from here on. */
        if (xa_is_retry(entry))
                pr_cont("retry (%ld)\n", xa_to_internal(entry));
        else if (xa_is_sibling(entry))
                pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry));
        else if (xa_is_zero(entry))
                pr_cont("zero (%ld)\n", xa_to_internal(entry));
        else
                pr_cont("UNKNOWN ENTRY (%px)\n", entry);
}

/*
 * Print a summary line for @xa (head pointer, flags and whether marks 0-2
 * are set), then dump the head entry with xa_dump_entry().
 */
void xa_dump(const struct xarray *xa)
{
        void *head = xa->xa_head;
        unsigned int shift;

        pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, head,
                        xa->xa_flags, xa_marked(xa, XA_MARK_0),
                        xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2));

        /* A node at the head is dumped one level above its own shift. */
        shift = xa_is_node(head) ?
                        xa_to_node(head)->shift + XA_CHUNK_SHIFT : 0;
        xa_dump_entry(head, 0, shift);
}
#endif













































    1 












    1 


    1 



    1 
    1 


    1 





















    1 

    1 

    1 


    1 










    1 









    1 

    1 


    1 



    1 
























    1 


















    1 
















    1 








    1 





































    1 










    1 














    1 































    1 





    1 


























    1 


















    1 

















    1 












    1 








    1 



    1 
    1 


    1 
    1 


    1 





















    1 
    1 




    1 
    1 


    1 
    1 



    1 



























    1 
    1 












    1 












    1 





    1 









    1 

















    1 



















    1 



    1 
















































































































































































































































































































































    1 

    1 























































    1 







    1 



    1 























































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/fat/dir.c
 *
 *  directory handling functions for fat-based filesystems
 *
 *  Written 1992,1993 by Werner Almesberger
 *
 *  Hidden files 1995 by Albert Cahalan <albert@ccs.neu.edu> <adc@coe.neu.edu>
 *
 *  VFAT extensions by Gordon Chaffee <chaffee@plateau.cs.berkeley.edu>
 *  Merged with msdos fs by Henrik Storner <storner@osiris.ping.dk>
 *  Rewritten for constant inumbers. Plugged buffer overrun in readdir(). AV
 *  Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de>
 */

#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include "fat.h"

/*
 * Maximum buffer size, in bytes, of a converted short name.
 * [(MSDOS_NAME + '.') * max one char + nul]
 * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
 */
#define FAT_MAX_SHORT_SIZE        ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
/*
 * Maximum buffer size of unicode chars from slots.
 * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
 *
 * FAT_MAX_UNI_CHARS counts wchar_t elements (including the nul);
 * FAT_MAX_UNI_SIZE is the same quantity expressed in bytes.
 */
#define FAT_MAX_UNI_CHARS        ((MSDOS_SLOTS - 1) * 13 + 1)
#define FAT_MAX_UNI_SIZE        (FAT_MAX_UNI_CHARS * sizeof(wchar_t))

/* Map 'A'..'Z' to 'a'..'z'; every other byte value is returned unchanged. */
static inline unsigned char fat_tolower(unsigned char c)
{
        if (c < 'A' || c > 'Z')
                return c;

        return c + ('a' - 'A');
}

static inline loff_t fat_make_i_pos(struct super_block *sb,
                                    struct buffer_head *bh,
                                    struct msdos_dir_entry *de)
{
        return ((loff_t)bh->b_blocknr << MSDOS_SB(sb)->dir_per_block_bits)
                | (de - (struct msdos_dir_entry *)bh->b_data);
}

/*
 * Start read-ahead for all sectors of a directory cluster.
 *
 * Nothing is done unless @iblock is the first sector of a cluster, the
 * cluster has more than one sector, and @dir is not the FAT12/FAT16 root
 * directory. Read-ahead is issued only when the buffer for @phys is not
 * already present and uptodate.
 */
static inline void fat_dir_readahead(struct inode *dir, sector_t iblock,
                                     sector_t phys)
{
        struct super_block *sb = dir->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct buffer_head *cached;
        int i;

        /* A one-sector cluster has nothing further to read ahead. */
        if (sbi->sec_per_clus == 1)
                return;
        /* Only act on the first sector of a cluster. */
        if (iblock & (sbi->sec_per_clus - 1))
                return;
        /* root dir of FAT12/FAT16 */
        if (dir->i_ino == MSDOS_ROOT_INO && !is_fat32(sbi))
                return;

        cached = sb_find_get_block(sb, phys);
        if (!cached || !buffer_uptodate(cached)) {
                for (i = 0; i < sbi->sec_per_clus; i++)
                        sb_breadahead(sb, phys + i);
        }
        /* brelse() is reached with NULL when no buffer was found. */
        brelse(cached);
}

/*
 * Slow path of fat_get_entry(): read the block that contains offset *pos of
 * directory @dir and point *de at the entry located there.
 *
 * Any buffer passed in through *bh is brelse'd first. On success 0 is
 * returned, *bh holds the buffer that was read and *pos has been advanced by
 * one directory entry. When fat_bmap() fails or reports no physical block
 * (beyond EOF or error) -1 is returned and *bh is NULL.
 *
 * A block that cannot be read is reported and skipped: *pos is moved to the
 * start of the following block and the lookup is retried.
 */
static int fat__get_entry(struct inode *dir, loff_t *pos,
                          struct buffer_head **bh, struct msdos_dir_entry **de)
{
        struct super_block *sb = dir->i_sb;
        unsigned long mapped_blocks;
        sector_t phys, iblock;
        int err, offset;

        for (;;) {
                brelse(*bh);
                *bh = NULL;

                iblock = *pos >> sb->s_blocksize_bits;
                err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
                if (err || !phys)
                        return -1;        /* beyond EOF or error */

                fat_dir_readahead(dir, iblock, phys);

                *bh = sb_bread(sb, phys);
                if (*bh)
                        break;

                fat_msg_ratelimit(sb, KERN_ERR,
                        "Directory bread(block %llu) failed", (llu)phys);
                /* skip this block */
                *pos = (iblock + 1) << sb->s_blocksize_bits;
        }

        offset = *pos & (sb->s_blocksize - 1);
        *de = (struct msdos_dir_entry *)((*bh)->b_data + offset);
        *pos += sizeof(struct msdos_dir_entry);

        return 0;
}

/*
 * Advance to the next directory entry.
 *
 * If *bh and *de are both set and *de is not the last entry of the block,
 * simply step *de and *pos forward by one entry and return 0. In every other
 * case defer to fat__get_entry().
 */
static inline int fat_get_entry(struct inode *dir, loff_t *pos,
                                struct buffer_head **bh,
                                struct msdos_dir_entry **de)
{
        struct msdos_dir_entry *first;

        if (!*bh || !*de)
                return fat__get_entry(dir, pos, bh, de);

        first = (struct msdos_dir_entry *)(*bh)->b_data;
        if (*de - first >= MSDOS_SB(dir->i_sb)->dir_per_block - 1)
                return fat__get_entry(dir, pos, bh, de);

        /* Fast path: the next entry lives in the buffer we already hold. */
        (*de)++;
        *pos += sizeof(struct msdos_dir_entry);
        return 0;
}

/*
 * Convert a nul-terminated wchar_t string to bytes using @nls.
 *
 * Each character is converted with nls->uni2char(). A character that cannot
 * be converted is written as '?', or, when the unicode_xlate mount option is
 * 1, as a colon followed by the four hexadecimal digits of its value (a
 * colon is normally invalid on a vfat filesystem, so it serves as an escape).
 *
 * Conversion stops once no more than NLS_MAX_CHARSET_SIZE bytes of @len
 * remain, so a single conversion can never overrun the output buffer; a
 * warning is logged if input was left unconverted. The output is always
 * nul-terminated. Returns the number of bytes written, excluding the nul.
 */
static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
                       const wchar_t *uni, int len, struct nls_table *nls)
{
        const int xlate = MSDOS_SB(sb)->options.unicode_xlate;
        const wchar_t *src = uni;
        unsigned char *dst = ascii;

        for (; *src && ((len - NLS_MAX_CHARSET_SIZE) > 0); src++) {
                wchar_t wc = *src;
                int n = nls->uni2char(wc, dst, NLS_MAX_CHARSET_SIZE);

                if (n > 0) {
                        dst += n;
                        len -= n;
                        continue;
                }

                if (xlate != 1) {
                        *dst++ = '?';
                        len--;
                        continue;
                }

                /* Escape as ":XXXX". */
                *dst++ = ':';
                dst = hex_byte_pack(dst, wc >> 8);
                dst = hex_byte_pack(dst, wc);
                len -= 5;
        }

        if (unlikely(*src)) {
                fat_msg(sb, KERN_WARNING,
                        "filename was truncated while converting.");
        }

        *dst = 0;
        return dst - ascii;
}

/*
 * Convert the wchar_t string @uni into @buf (at most @size bytes).
 * With the utf8 mount option the string is converted by utf16s_to_utf8s();
 * otherwise by uni16_to_x8() using the nls_io table.
 */
static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni,
                                unsigned char *buf, int size)
{
        struct msdos_sb_info *sbi = MSDOS_SB(sb);

        if (!sbi->options.utf8)
                return uni16_to_x8(sb, buf, uni, size, sbi->nls_io);

        return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, UTF16_HOST_ENDIAN,
                               buf, size);
}

/*
 * Convert one character at @c (at most @clen bytes) to unicode via
 * t->char2uni(). If the conversion fails, *uni becomes a question mark and
 * the character is treated as one byte long. Returns the value reported by
 * char2uni(), or 1 after a failure.
 */
static inline int
fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni)
{
        int n = t->char2uni(c, clen, uni);

        if (n >= 0)
                return n;

        *uni = 0x003f;        /* a question mark */
        return 1;
}

/*
 * Like fat_short2uni(), but a character that char2uni() reports as at most
 * one byte long is first mapped through t->charset2lower (falling back to
 * the original byte when the table entry is 0) and then converted. Longer
 * characters are stored as converted. Any conversion failure yields a
 * question mark with a length of 1.
 */
static inline int
fat_short2lower_uni(struct nls_table *t, unsigned char *c,
                    int clen, wchar_t *uni)
{
        unsigned char lower;
        wchar_t decoded;
        int n;

        n = t->char2uni(c, clen, &decoded);
        if (n > 1) {
                /* Multi-byte character: no case mapping is applied. */
                *uni = decoded;
                return n;
        }

        if (n >= 0) {
                lower = t->charset2lower[*c];
                if (!lower)
                        lower = *c;

                n = t->char2uni(&lower, 1, uni);
                if (n >= 0)
                        return n;
        }

        *uni = 0x003f;        /* a question mark */
        return 1;
}

/*
 * Convert one short-name character to unicode according to the shortname
 * display option @opt. The option bits are tested in this order:
 *   VFAT_SFN_DISPLAY_LOWER - always lowercase;
 *   VFAT_SFN_DISPLAY_WIN95 - never lowercase;
 *   VFAT_SFN_DISPLAY_WINNT - lowercase only when @lower is nonzero;
 *   none of the above      - never lowercase.
 */
static inline int
fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size,
                  wchar_t *uni_buf, unsigned short opt, int lower)
{
        int want_lower;

        if (opt & VFAT_SFN_DISPLAY_LOWER)
                want_lower = 1;
        else if (opt & VFAT_SFN_DISPLAY_WIN95)
                want_lower = 0;
        else
                want_lower = (opt & VFAT_SFN_DISPLAY_WINNT) && lower;

        if (want_lower)
                return fat_short2lower_uni(nls, buf, buf_size, uni_buf);

        return fat_short2uni(nls, buf, buf_size, uni_buf);
}

/*
 * Return nonzero when names @a and @b are equal. Names of different length
 * never match. The comparison is bytewise when the name_check option is 's'
 * and goes through nls_strnicmp() with the nls_io table otherwise.
 */
static inline int fat_name_match(struct msdos_sb_info *sbi,
                                 const unsigned char *a, int a_len,
                                 const unsigned char *b, int b_len)
{
        if (a_len != b_len)
                return 0;

        if (sbi->options.name_check == 's')
                return memcmp(a, b, a_len) == 0;

        return nls_strnicmp(sbi->nls_io, a, b, a_len) == 0;
}

/* Positive (non-error) status codes returned by fat_parse_long(). */
enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, };

/**
 * fat_parse_long - Parse extended directory entry.
 *
 * @dir: Pointer to the inode that represents the directory.
 * @pos: On input, contains the starting position to read from.
 *       On output, updated with the new position.
 * @bh: Pointer to the buffer head that may be used for reading directory
 *         entries. May be updated.
 * @de: On input, points to the current directory entry.
 *      On output, points to the next directory entry.
 * @unicode: Pointer to a buffer where the parsed Unicode long filename will be
 *              stored. If *@unicode is NULL, a buffer is allocated here with
 *              __getname() and left in *@unicode.
 * @nr_slots: Pointer to a variable that will store the number of longname
 *               slots found. Reset to 0 when the checksum of the entry that
 *               follows the slots does not match the slots' alias_checksum.
 *
 * This function returns zero on success, negative value on error, or one of
 * the following:
 *
 * %PARSE_INVALID - Directory entry is invalid.
 * %PARSE_NOT_LONGNAME - Directory entry does not contain longname.
 * %PARSE_EOF - Directory has no more entries.
 *
 * On allocation failure -ENOMEM is returned and *@bh has already been
 * passed to brelse() (the pointer itself is not cleared).
 */
static int fat_parse_long(struct inode *dir, loff_t *pos,
                          struct buffer_head **bh, struct msdos_dir_entry **de,
                          wchar_t **unicode, unsigned char *nr_slots)
{
        struct msdos_dir_slot *ds;
        unsigned char id, slot, slots, alias_checksum;

        /* Allocate the name buffer lazily, on the first long entry seen. */
        if (!*unicode) {
                *unicode = __getname();
                if (!*unicode) {
                        brelse(*bh);
                        return -ENOMEM;
                }
        }
parse_long:
        /*
         * The entry we start from must have bit 0x40 set in its id; the
         * remaining bits give the number of slots in the sequence.
         */
        ds = (struct msdos_dir_slot *)*de;
        id = ds->id;
        if (!(id & 0x40))
                return PARSE_INVALID;
        slots = id & ~0x40;
        if (slots > 20 || !slots)        /* ceil(256 * 2 / 26) */
                return PARSE_INVALID;
        *nr_slots = slots;
        alias_checksum = ds->alias_checksum;

        /* Slots are visited from the highest sequence number down to 1. */
        slot = slots;
        while (1) {
                int offset;

                /* Each slot carries 13 characters (5 + 6 + 2). */
                slot--;
                offset = slot * 13;
                fat16_towchar(*unicode + offset, ds->name0_4, 5);
                fat16_towchar(*unicode + offset + 5, ds->name5_10, 6);
                fat16_towchar(*unicode + offset + 11, ds->name11_12, 2);

                /* Terminate the name after the characters of the 0x40 slot. */
                if (ds->id & 0x40)
                        (*unicode)[offset + 13] = 0;
                if (fat_get_entry(dir, pos, bh, de) < 0)
                        return PARSE_EOF;
                if (slot == 0)
                        break;
                ds = (struct msdos_dir_slot *)*de;
                if (ds->attr != ATTR_EXT)
                        return PARSE_NOT_LONGNAME;
                /*
                 * Unexpected sequence number or checksum: restart parsing
                 * with this entry as the beginning of a new sequence.
                 */
                if ((ds->id & ~0x40) != slot)
                        goto parse_long;
                if (ds->alias_checksum != alias_checksum)
                        goto parse_long;
        }
        /* *de is now the entry that follows the last slot. */
        if ((*de)->name[0] == DELETED_FLAG)
                return PARSE_INVALID;
        if ((*de)->attr == ATTR_EXT)
                goto parse_long;
        if (IS_FREE((*de)->name) || ((*de)->attr & ATTR_VOLUME))
                return PARSE_INVALID;
        /* Checksum mismatch: report no slots so the long name is not used. */
        if (fat_checksum((*de)->name) != alias_checksum)
                *nr_slots = 0;

        return 0;
}

/**
 * fat_parse_short - Parse MS-DOS (short) directory entry.
 * @sb:                superblock
 * @de:                directory entry to parse
 * @name:        FAT_MAX_SHORT_SIZE array in which to place extracted name
 * @dot_hidden:        Nonzero == prepend '.' to names with ATTR_HIDDEN
 *
 * The 8-character base and 3-character extension are converted separately
 * and joined with a '.'. Trailing spaces of either part are not counted,
 * and the '.' is dropped when the extension contributes nothing.
 *
 * On a non-vfat mount the bytes are written directly to @name (lowercased
 * unless the nocase option is set). On a vfat mount the name is first built
 * as unicode and then converted into @name with fat_uni_to_x8().
 *
 * Returns the number of characters extracted into 'name'; 0 means no name
 * could be extracted.
 */
static int fat_parse_short(struct super_block *sb,
                           const struct msdos_dir_entry *de,
                           unsigned char *name, int dot_hidden)
{
        const struct msdos_sb_info *sbi = MSDOS_SB(sb);
        int isvfat = sbi->options.isvfat;
        int nocase = sbi->options.nocase;
        unsigned short opt_shortname = sbi->options.shortname;
        struct nls_table *nls_disk = sbi->nls_disk;
        wchar_t uni_name[14];
        unsigned char c, work[MSDOS_NAME];
        unsigned char *ptname = name;
        int chi, chl, i, j, k;
        int dotoffset = 0;
        int name_len = 0, uni_len = 0;

        /* msdos mounts only: hidden files get a leading '.' if requested. */
        if (!isvfat && dot_hidden && (de->attr & ATTR_HIDDEN)) {
                *ptname++ = '.';
                dotoffset = 1;
        }

        memcpy(work, de->name, sizeof(work));
        /* For an explanation of the special treatment of 0x05 in
         * filenames, see msdos_format_name in namei_msdos.c
         */
        if (work[0] == 0x05)
                work[0] = 0xE5;

        /* Filename */
        /* i indexes work[]/ptname[], j indexes uni_name[]. */
        for (i = 0, j = 0; i < 8;) {
                c = work[i];
                if (!c)
                        break;
                chl = fat_shortname2uni(nls_disk, &work[i], 8 - i,
                                        &uni_name[j++], opt_shortname,
                                        de->lcase & CASE_LOWER_BASE);
                if (chl <= 1) {
                        /* Single-byte character. */
                        if (!isvfat)
                                ptname[i] = nocase ? c : fat_tolower(c);
                        i++;
                        /* Lengths only advance on non-space characters. */
                        if (c != ' ') {
                                name_len = i;
                                uni_len  = j;
                        }
                } else {
                        /* Multi-byte character: copy its bytes unchanged. */
                        uni_len = j;
                        if (isvfat)
                                i += min(chl, 8-i);
                        else {
                                for (chi = 0; chi < chl && i < 8; chi++, i++)
                                        ptname[i] = work[i];
                        }
                        if (chl)
                                name_len = i;
                }
        }

        /* Rewind to just past the last counted character and add the '.'. */
        i = name_len;
        j = uni_len;
        fat_short2uni(nls_disk, ".", 1, &uni_name[j++]);
        if (!isvfat)
                ptname[i] = '.';
        i++;

        /* Extension */
        /* k indexes work[]; i and j continue as output indices. */
        for (k = 8; k < MSDOS_NAME;) {
                c = work[k];
                if (!c)
                        break;
                chl = fat_shortname2uni(nls_disk, &work[k], MSDOS_NAME - k,
                                        &uni_name[j++], opt_shortname,
                                        de->lcase & CASE_LOWER_EXT);
                if (chl <= 1) {
                        k++;
                        if (!isvfat)
                                ptname[i] = nocase ? c : fat_tolower(c);
                        i++;
                        if (c != ' ') {
                                name_len = i;
                                uni_len  = j;
                        }
                } else {
                        uni_len = j;
                        if (isvfat) {
                                int offset = min(chl, MSDOS_NAME-k);
                                k += offset;
                                i += offset;
                        } else {
                                for (chi = 0; chi < chl && k < MSDOS_NAME;
                                     chi++, i++, k++) {
                                                ptname[i] = work[k];
                                }
                        }
                        if (chl)
                                name_len = i;
                }
        }

        if (name_len > 0) {
                /* Account for the '.' prepended to hidden files, if any. */
                name_len += dotoffset;

                /* vfat: the result comes from the unicode form instead. */
                if (sbi->options.isvfat) {
                        uni_name[uni_len] = 0x0000;
                        name_len = fat_uni_to_x8(sb, uni_name, name,
                                                 FAT_MAX_SHORT_SIZE);
                }
        }

        return name_len;
}

/*
 * Search directory @inode for an entry whose short name or long name
 * matches @name (@name_len bytes), comparing with fat_name_match().
 *
 * Return values: negative -> error/not found, 0 -> found.
 *
 * -ENOENT is returned when the directory is exhausted without a match; a
 * negative status from fat_parse_long() is returned as is.
 *
 * On success @sinfo is filled in: slot_off is the offset of the first slot
 * of the record, nr_slots counts the long-name slots plus the short entry,
 * de/bh refer to the short entry and the buffer containing it (the buffer
 * is handed over in sinfo->bh and is not released here), and i_pos is
 * computed by fat_make_i_pos().
 */
int fat_search_long(struct inode *inode, const unsigned char *name,
                    int name_len, struct fat_slot_info *sinfo)
{
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct buffer_head *bh = NULL;
        struct msdos_dir_entry *de;
        unsigned char nr_slots;
        wchar_t *unicode = NULL;
        unsigned char bufname[FAT_MAX_SHORT_SIZE];
        loff_t cpos = 0;
        int err, len;

        err = -ENOENT;
        while (1) {
                if (fat_get_entry(inode, &cpos, &bh, &de) == -1)
                        goto end_of_dir;
parse_record:
                nr_slots = 0;
                /* Skip deleted, volume-label and free entries. */
                if (de->name[0] == DELETED_FLAG)
                        continue;
                if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME))
                        continue;
                if (de->attr != ATTR_EXT && IS_FREE(de->name))
                        continue;
                if (de->attr == ATTR_EXT) {
                        int status = fat_parse_long(inode, &cpos, &bh, &de,
                                                    &unicode, &nr_slots);
                        if (status < 0) {
                                /* fat_parse_long() already did brelse(bh). */
                                err = status;
                                goto end_of_dir;
                        } else if (status == PARSE_INVALID)
                                continue;
                        else if (status == PARSE_NOT_LONGNAME)
                                /* de is a different kind of entry: re-examine it. */
                                goto parse_record;
                        else if (status == PARSE_EOF)
                                goto end_of_dir;
                }

                /* Never prepend '.' to hidden files here.
                 * That is done only for msdos mounts (and only when
                 * 'dotsOK=yes'); if we are executing here, it is in the
                 * context of a vfat mount.
                 */
                len = fat_parse_short(sb, de, bufname, 0);
                if (len == 0)
                        continue;

                /* Compare shortname */
                if (fat_name_match(sbi, name, name_len, bufname, len))
                        goto found;

                if (nr_slots) {
                        /*
                         * The converted long name is placed in the same
                         * __getname() buffer, after the unicode characters.
                         */
                        void *longname = unicode + FAT_MAX_UNI_CHARS;
                        int size = PATH_MAX - FAT_MAX_UNI_SIZE;

                        /* Compare longname */
                        len = fat_uni_to_x8(sb, unicode, longname, size);
                        if (fat_name_match(sbi, name, name_len, longname, len))
                                goto found;
                }
        }

found:
        nr_slots++;        /* include the de */
        /* cpos is past the short entry; step back over the whole record. */
        sinfo->slot_off = cpos - nr_slots * sizeof(*de);
        sinfo->nr_slots = nr_slots;
        sinfo->de = de;
        sinfo->bh = bh;
        sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de);
        err = 0;
end_of_dir:
        if (unicode)
                __putname(unicode);

        return err;
}
EXPORT_SYMBOL_GPL(fat_search_long);

/*
 * State shared between the readdir ioctl path and its dir_context actor.
 * The actor receives a pointer to @ctx and recovers this structure with
 * container_of().
 */
struct fat_ioctl_filldir_callback {
        struct dir_context ctx;
        /* userspace buffer the actor copies names into */
        void __user *dirent;
        /* 0 until an entry was handled, then 1, or -EFAULT on a failed copy */
        int result;
        /* for dir ioctl */
        /* long and short name, set by __fat_readdir() when both are wanted */
        const char *longname;
        int long_len;
        const char *shortname;
        int short_len;
};

/*
 * Common directory iteration used by readdir and by the readdir ioctls.
 *
 * @inode:      directory being read
 * @file:       open file, passed on to the dir_emit_dot*() helpers
 * @ctx:        dir context; ctx->pos is the position to resume from and is
 *              updated as entries are consumed
 * @short_only: nonzero to skip long-name parsing and emit short names only
 * @both:       if non-NULL, receives both the long and the short name of an
 *              entry that has long-name slots (the name passed to the actor
 *              is then NULL)
 *
 * Runs with sbi->s_lock held. Returns 0, -ENOENT when the starting position
 * is not aligned to a directory entry, or a negative status from
 * fat_parse_long().
 */
static int __fat_readdir(struct inode *inode, struct file *file,
                         struct dir_context *ctx, int short_only,
                         struct fat_ioctl_filldir_callback *both)
{
        struct super_block *sb = inode->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct buffer_head *bh;
        struct msdos_dir_entry *de;
        unsigned char nr_slots;
        wchar_t *unicode = NULL;
        unsigned char bufname[FAT_MAX_SHORT_SIZE];
        int isvfat = sbi->options.isvfat;
        const char *fill_name = NULL;
        int fake_offset = 0;
        loff_t cpos;
        int short_len = 0, fill_len = 0;
        int ret = 0;

        mutex_lock(&sbi->s_lock);

        cpos = ctx->pos;
        /* Fake . and .. for the root directory. */
        if (inode->i_ino == MSDOS_ROOT_INO) {
                if (!dir_emit_dots(file, ctx))
                        goto out;
                /*
                 * Position 2 is the first real entry of the root directory:
                 * scan from offset 0 and remember that reported positions
                 * must not fall below 2.
                 */
                if (ctx->pos == 2) {
                        fake_offset = 1;
                        cpos = 0;
                }
        }
        /* The position must fall on a directory entry boundary. */
        if (cpos & (sizeof(struct msdos_dir_entry) - 1)) {
                ret = -ENOENT;
                goto out;
        }

        bh = NULL;
get_new:
        if (fat_get_entry(inode, &cpos, &bh, &de) == -1)
                goto end_of_dir;
parse_record:
        nr_slots = 0;
        /*
         * Check for long filename entry, but if short_only, we don't
         * need to parse long filename.
         */
        if (isvfat && !short_only) {
                if (de->name[0] == DELETED_FLAG)
                        goto record_end;
                if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME))
                        goto record_end;
                if (de->attr != ATTR_EXT && IS_FREE(de->name))
                        goto record_end;
        } else {
                if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name))
                        goto record_end;
        }

        if (isvfat && de->attr == ATTR_EXT) {
                int status = fat_parse_long(inode, &cpos, &bh, &de,
                                            &unicode, &nr_slots);
                if (status < 0) {
                        /* fat_parse_long() already did brelse(bh). */
                        bh = NULL;
                        ret = status;
                        goto end_of_dir;
                } else if (status == PARSE_INVALID)
                        goto record_end;
                else if (status == PARSE_NOT_LONGNAME)
                        goto parse_record;
                else if (status == PARSE_EOF)
                        goto end_of_dir;

                if (nr_slots) {
                        /*
                         * The converted long name is placed in the same
                         * __getname() buffer, after the unicode characters.
                         */
                        void *longname = unicode + FAT_MAX_UNI_CHARS;
                        int size = PATH_MAX - FAT_MAX_UNI_SIZE;
                        int len = fat_uni_to_x8(sb, unicode, longname, size);

                        fill_name = longname;
                        fill_len = len;
                        /* !both && !short_only, so we don't need shortname. */
                        if (!both)
                                goto start_filldir;

                        short_len = fat_parse_short(sb, de, bufname,
                                                    sbi->options.dotsOK);
                        if (short_len == 0)
                                goto record_end;
                        /* hack for fat_ioctl_filldir() */
                        both->longname = fill_name;
                        both->long_len = fill_len;
                        both->shortname = bufname;
                        both->short_len = short_len;
                        /* A NULL name tells the actor to use 'both'. */
                        fill_name = NULL;
                        fill_len = 0;
                        goto start_filldir;
                }
        }

        /* No usable long name: emit the short name. */
        short_len = fat_parse_short(sb, de, bufname, sbi->options.dotsOK);
        if (short_len == 0)
                goto record_end;

        fill_name = bufname;
        fill_len = short_len;

start_filldir:
        /*
         * Report the position of the start of this record (long-name slots
         * included). If emitting fails below, ctx->pos is left at this value.
         */
        ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
        if (fake_offset && ctx->pos < 2)
                ctx->pos = 2;

        if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
                if (!dir_emit_dot(file, ctx))
                        goto fill_failed;
        } else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
                if (!dir_emit_dotdot(file, ctx))
                        goto fill_failed;
        } else {
                unsigned long inum;
                loff_t i_pos = fat_make_i_pos(sb, bh, de);
                struct inode *tmp = fat_iget(sb, i_pos);
                /*
                 * Use the number of the inode fat_iget() returns for this
                 * entry; otherwise obtain one from iunique().
                 */
                if (tmp) {
                        inum = tmp->i_ino;
                        iput(tmp);
                } else
                        inum = iunique(sb, MSDOS_ROOT_INO);
                if (!dir_emit(ctx, fill_name, fill_len, inum,
                            (de->attr & ATTR_DIR) ? DT_DIR : DT_REG))
                        goto fill_failed;
        }

record_end:
        /* The record is consumed: move past it and fetch the next one. */
        fake_offset = 0;
        ctx->pos = cpos;
        goto get_new;

end_of_dir:
        if (fake_offset && cpos < 2)
                ctx->pos = 2;
        else
                ctx->pos = cpos;
fill_failed:
        brelse(bh);
        if (unicode)
                __putname(unicode);
out:
        mutex_unlock(&sbi->s_lock);

        return ret;
}

/*
 * Directory iteration entry point taking (file, ctx): runs __fat_readdir()
 * on the file's inode with long names enabled and no ioctl callback data.
 */
static int fat_readdir(struct file *file, struct dir_context *ctx)
{
        struct inode *inode = file_inode(file);

        return __fat_readdir(inode, file, ctx, 0, NULL);
}

/*
 * FAT_IOCTL_FILLDIR_FUNC - define a dir_context actor named @func that
 * copies one directory entry into a userspace array of two
 * 'struct dirent_type'.
 *
 * d1 (buf->dirent) receives the short name and d2 (d1 + 1) the long name.
 * When @name is non-NULL only one name is available: it is stored in d1,
 * and d2 gets an empty name and a zero d_reclen. When @name is NULL the
 * names come from buf->shortname and buf->longname, and d2 additionally
 * receives @ino and @offset. Names are truncated to fit d_name, and
 * d_reclen is set to the stored name length.
 *
 * Only the first call does any work: buf->result is incremented then, and
 * later calls return false immediately. If a copy to userspace fails,
 * buf->result is set to -EFAULT and false is returned.
 */
#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type)                           \
static bool func(struct dir_context *ctx, const char *name, int name_len,  \
                             loff_t offset, u64 ino, unsigned int d_type)  \
{                                                                           \
        struct fat_ioctl_filldir_callback *buf =                           \
                container_of(ctx, struct fat_ioctl_filldir_callback, ctx); \
        struct dirent_type __user *d1 = buf->dirent;                           \
        struct dirent_type __user *d2 = d1 + 1;                                   \
                                                                           \
        if (buf->result)                                                   \
                return false;                                                   \
        buf->result++;                                                           \
                                                                           \
        if (name != NULL) {                                                   \
                /* dirent has only short name */                           \
                if (name_len >= sizeof(d1->d_name))                           \
                        name_len = sizeof(d1->d_name) - 1;                   \
                                                                           \
                if (put_user(0, &d2->d_name[0])                        ||           \
                    put_user(0, &d2->d_reclen)                        ||           \
                    copy_to_user(d1->d_name, name, name_len)        ||           \
                    put_user(0, d1->d_name + name_len)                ||           \
                    put_user(name_len, &d1->d_reclen))                           \
                        goto efault;                                           \
        } else {                                                           \
                /* dirent has short and long name */                           \
                const char *longname = buf->longname;                           \
                int long_len = buf->long_len;                                   \
                const char *shortname = buf->shortname;                           \
                int short_len = buf->short_len;                                   \
                                                                           \
                if (long_len >= sizeof(d1->d_name))                           \
                        long_len = sizeof(d1->d_name) - 1;                   \
                if (short_len >= sizeof(d1->d_name))                           \
                        short_len = sizeof(d1->d_name) - 1;                   \
                                                                           \
                if (copy_to_user(d2->d_name, longname, long_len)        || \
                    put_user(0, d2->d_name + long_len)                        || \
                    put_user(long_len, &d2->d_reclen)                        || \
                    put_user(ino, &d2->d_ino)                                || \
                    put_user(offset, &d2->d_off)                        || \
                    copy_to_user(d1->d_name, shortname, short_len)        || \
                    put_user(0, d1->d_name + short_len)                        || \
                    put_user(short_len, &d1->d_reclen))                           \
                        goto efault;                                           \
        }                                                                   \
        return true;                                                           \
efault:                                                                           \
        buf->result = -EFAULT;                                                   \
        return false;                                                           \
}

FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)

/*
 * Run one directory-read pass on behalf of the VFAT readdir ioctls.
 *
 * @dirent is the user buffer made available to @filldir through the
 * callback context.  The inode is held with a shared lock while
 * __fat_readdir() runs, and file->f_pos is refreshed from the context
 * position afterwards.
 *
 * Returns -ENOENT when the directory is dead, a negative value from
 * __fat_readdir() on failure, otherwise the result stored in the
 * callback context.
 */
static int fat_ioctl_readdir(struct inode *inode, struct file *file,
                             void __user *dirent, filldir_t filldir,
                             int short_only, int both)
{
        struct fat_ioctl_filldir_callback cb = {
                .ctx.actor = filldir,
                .dirent = dirent,
                .result = 0,
        };
        int err = -ENOENT;

        inode_lock_shared(inode);
        cb.ctx.pos = file->f_pos;
        if (!IS_DEADDIR(inode)) {
                err = __fat_readdir(inode, file, &cb.ctx, short_only,
                                    both ? &cb : NULL);
                file->f_pos = cb.ctx.pos;
        }
        inode_unlock_shared(inode);

        /* a non-negative readdir status is replaced by the callback result */
        return err < 0 ? err : cb.result;
}

/*
 * ioctl entry point for FAT directories.
 *
 * VFAT_IOCTL_READDIR_SHORT and VFAT_IOCTL_READDIR_BOTH are served here;
 * every other command is forwarded to fat_generic_ioctl().
 */
static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
                          unsigned long arg)
{
        struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
        int short_only, both;

        if (cmd == VFAT_IOCTL_READDIR_SHORT) {
                short_only = 1;
                both = 0;
        } else if (cmd == VFAT_IOCTL_READDIR_BOTH) {
                short_only = 0;
                both = 1;
        } else {
                return fat_generic_ioctl(filp, cmd, arg);
        }

        /*
         * Strictly speaking this put_user() is unnecessary, but older
         * code did not return the right value, so applications inspect
         * d_reclen to detect EOF.  Pre-clear it for them.
         */
        if (put_user(0, &d1->d_reclen))
                return -EFAULT;

        return fat_ioctl_readdir(file_inode(filp), filp, d1,
                                 fat_ioctl_filldir, short_only, both);
}

#ifdef CONFIG_COMPAT
/*
 * ioctl numbers recognised by the compat entry point; they are encoded
 * with the size of struct compat_dirent[2].
 */
#define        VFAT_IOCTL_READDIR_BOTH32        _IOR('r', 1, struct compat_dirent[2])
#define        VFAT_IOCTL_READDIR_SHORT32        _IOR('r', 2, struct compat_dirent[2])

/*
 * Instantiate fat_compat_ioctl_filldir(), the filldir callback used by
 * fat_compat_dir_ioctl(), for the compat_dirent layout.
 */
FAT_IOCTL_FILLDIR_FUNC(fat_compat_ioctl_filldir, compat_dirent)

/*
 * Compat ioctl entry point for FAT directories.
 *
 * VFAT_IOCTL_READDIR_SHORT32 and VFAT_IOCTL_READDIR_BOTH32 are served
 * here using the compat_dirent callback; every other command is
 * forwarded to fat_generic_ioctl().
 */
static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
                                 unsigned long arg)
{
        struct compat_dirent __user *d1 = compat_ptr(arg);
        int short_only, both;

        if (cmd == VFAT_IOCTL_READDIR_SHORT32) {
                short_only = 1;
                both = 0;
        } else if (cmd == VFAT_IOCTL_READDIR_BOTH32) {
                short_only = 0;
                both = 1;
        } else {
                return fat_generic_ioctl(filp, cmd, arg);
        }

        /*
         * Strictly speaking this put_user() is unnecessary, but older
         * code did not return the right value, so applications inspect
         * d_reclen to detect EOF.  Pre-clear it for them.
         */
        if (put_user(0, &d1->d_reclen))
                return -EFAULT;

        return fat_ioctl_readdir(file_inode(filp), filp, d1,
                                 fat_compat_ioctl_filldir, short_only, both);
}
#endif /* CONFIG_COMPAT */

/* File operation table for FAT directories. */
const struct file_operations fat_dir_operations = {
        /* generic helpers */
        .read           = generic_read_dir,
        .llseek         = generic_file_llseek,
        /* FAT-specific handlers */
        .iterate_shared = fat_readdir,
        .fsync          = fat_file_fsync,
        .unlocked_ioctl = fat_dir_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = fat_compat_dir_ioctl,
#endif
};

/*
 * Advance through @dir with fat_get_entry() until an entry is found that
 * is neither free nor flagged ATTR_VOLUME.
 *
 * Returns 0 with *de pointing at that entry, or -ENOENT once
 * fat_get_entry() reports failure.
 */
static int fat_get_short_entry(struct inode *dir, loff_t *pos,
                               struct buffer_head **bh,
                               struct msdos_dir_entry **de)
{
        for (;;) {
                if (fat_get_entry(dir, pos, bh, de) < 0)
                        return -ENOENT;

                /* skip: free entry or long name entry or volume label */
                if (IS_FREE((*de)->name))
                        continue;
                if ((*de)->attr & ATTR_VOLUME)
                        continue;

                return 0;
        }
}

/*
 * Locate the ".." entry of @dir.
 *
 * A ".." entry cannot supply the "struct fat_slot_info" information for
 * the inode, nor a usable i_pos, so only the raw entry (*de) and the
 * buffer it was found through (*bh) are handed back.
 *
 * This walks the on-disk entries of the directory; callers must hold
 * whatever locks are needed to keep the directory from changing.
 *
 * Returns 0 when found, -ENOENT otherwise.
 */
int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
                         struct msdos_dir_entry **de)
{
        loff_t pos = 0;

        *de = NULL;
        for (;;) {
                if (fat_get_short_entry(dir, &pos, bh, de) < 0)
                        return -ENOENT;
                if (strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME) == 0)
                        return 0;
        }
}
EXPORT_SYMBOL_GPL(fat_get_dotdot_entry);

/*
 * Check whether @dir holds anything besides "." and "..".
 *
 * Returns 0 if no other short entry is found, -ENOTEMPTY otherwise.
 */
int fat_dir_empty(struct inode *dir)
{
        struct buffer_head *bh = NULL;
        struct msdos_dir_entry *de;
        loff_t cpos = 0;
        int result = 0;

        /* stop scanning as soon as one foreign entry has been seen */
        while (!result && fat_get_short_entry(dir, &cpos, &bh, &de) >= 0) {
                if (!strncmp(de->name, MSDOS_DOT, MSDOS_NAME))
                        continue;
                if (!strncmp(de->name, MSDOS_DOTDOT, MSDOS_NAME))
                        continue;
                result = -ENOTEMPTY;
        }
        brelse(bh);

        return result;
}
EXPORT_SYMBOL_GPL(fat_dir_empty);

/*
 * Count the entries of @dir that carry ATTR_DIR, i.e. its
 * sub-directories.  May be run on directories that are still being
 * created.
 */
int fat_subdirs(struct inode *dir)
{
        struct buffer_head *bh = NULL;
        struct msdos_dir_entry *de;
        loff_t cpos = 0;
        int nr_dirs = 0;

        for (;;) {
                if (fat_get_short_entry(dir, &cpos, &bh, &de) < 0)
                        break;
                if (de->attr & ATTR_DIR)
                        nr_dirs++;
        }
        brelse(bh);

        return nr_dirs;
}

/*
 * Search @dir for the short entry whose on-disk name equals @name
 * (@name is the already formatted, MSDOS_NAME-byte name).
 *
 * On a match @sinfo is filled in (slot_off rewound to the matching
 * entry, nr_slots = 1, i_pos) and 0 is returned; sinfo->bh and sinfo->de
 * are left as set by fat_get_short_entry().  Returns -ENOENT when no
 * entry matches.
 */
int fat_scan(struct inode *dir, const unsigned char *name,
             struct fat_slot_info *sinfo)
{
        struct super_block *sb = dir->i_sb;

        sinfo->bh = NULL;
        sinfo->slot_off = 0;
        for (;;) {
                if (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh,
                                        &sinfo->de) < 0)
                        return -ENOENT;
                if (strncmp(sinfo->de->name, name, MSDOS_NAME) != 0)
                        continue;

                /* slot_off has already moved past the entry; step back */
                sinfo->slot_off -= sizeof(*sinfo->de);
                sinfo->nr_slots = 1;
                sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de);
                return 0;
        }
}
EXPORT_SYMBOL_GPL(fat_scan);

/*
 * Search @dir for the short entry whose start value, as read by
 * fat_get_start(), equals @i_logstart.
 *
 * On a match @sinfo is filled in (slot_off rewound to the matching
 * entry, nr_slots = 1, i_pos) and 0 is returned; sinfo->bh and sinfo->de
 * are left as set by fat_get_short_entry().  Returns -ENOENT when no
 * entry matches.
 */
int fat_scan_logstart(struct inode *dir, int i_logstart,
                      struct fat_slot_info *sinfo)
{
        struct super_block *sb = dir->i_sb;

        sinfo->bh = NULL;
        sinfo->slot_off = 0;
        for (;;) {
                if (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh,
                                        &sinfo->de) < 0)
                        return -ENOENT;
                if (fat_get_start(MSDOS_SB(sb), sinfo->de) != i_logstart)
                        continue;

                /* slot_off has already moved past the entry; step back */
                sinfo->slot_off -= sizeof(*sinfo->de);
                sinfo->nr_slots = 1;
                sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de);
                return 0;
        }
}

/*
 * Mark @nr_slots consecutive directory slots, starting at byte offset
 * @pos in @dir, as deleted.
 *
 * The slots are processed one block at a time: each touched buffer is
 * dirtied, synced when the directory is DIRSYNC, and released.
 *
 * Returns 0 on success, -EIO if fat_get_entry() fails, or the error
 * from sync_dirty_buffer().
 */
static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
{
        struct super_block *sb = dir->i_sb;
        int err = 0;

        while (nr_slots != 0) {
                struct buffer_head *bh = NULL;
                struct msdos_dir_entry *de, *block_end;
                int cleared = 0;

                if (fat_get_entry(dir, &pos, &bh, &de) < 0)
                        return -EIO;

                /* wipe as many slots as fit in this block */
                block_end = (struct msdos_dir_entry *)(bh->b_data +
                                                       sb->s_blocksize);
                for (; nr_slots != 0 && de < block_end; de++) {
                        de->name[0] = DELETED_FLAG;
                        nr_slots--;
                        cleared++;
                }

                mark_buffer_dirty_inode(bh, dir);
                if (IS_DIRSYNC(dir))
                        err = sync_dirty_buffer(bh);
                brelse(bh);
                if (err)
                        return err;

                /* pos is *next* de's position, so this does `- sizeof(de)' */
                pos += (cleared * sizeof(*de)) - sizeof(*de);
        }

        return 0;
}

/*
 * Delete the directory entry described by @sinfo from @dir.
 *
 * sinfo->de and sinfo->bh are taken over (and cleared in @sinfo); the
 * buffer is released here.  Slots are marked deleted walking backwards
 * from the short-name slot within its block; slots that live in earlier
 * blocks are handled by __fat_remove_entries().
 *
 * Returns the error from sync_dirty_buffer() if the first stage fails,
 * otherwise 0 -- a failure to wipe the remaining long-name slots is only
 * logged.
 */
int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
{
        struct super_block *sb = dir->i_sb;
        struct msdos_dir_entry *slot = sinfo->de;
        struct buffer_head *bh = sinfo->bh;
        struct msdos_dir_entry *block_start;
        int remaining = sinfo->nr_slots;
        int err = 0;

        sinfo->de = NULL;
        sinfo->bh = NULL;

        /*
         * First stage: Remove the shortname. By this, the directory
         * entry is removed.
         */
        block_start = (struct msdos_dir_entry *)bh->b_data;
        for (; remaining && slot >= block_start; slot--, remaining--)
                slot->name[0] = DELETED_FLAG;

        mark_buffer_dirty_inode(bh, dir);
        if (IS_DIRSYNC(dir))
                err = sync_dirty_buffer(bh);
        brelse(bh);
        if (err)
                return err;
        inode_inc_iversion(dir);

        /*
         * Second stage: remove the remaining longname slots.
         * (This directory entry is already removed, and so return
         * the success)
         */
        if (remaining &&
            __fat_remove_entries(dir, sinfo->slot_off, remaining)) {
                fat_msg(sb, KERN_WARNING,
                       "Couldn't remove the long name slots");
        }

        fat_truncate_time(dir, NULL, S_ATIME|S_MTIME);
        if (!IS_DIRSYNC(dir))
                mark_inode_dirty(dir);
        else
                (void)fat_sync_inode(dir);

        return 0;
}
EXPORT_SYMBOL_GPL(fat_remove_entries);

/*
 * Zero-fill the blocks of one cluster that have not been written yet.
 *
 * @dir:     directory inode the buffers are dirtied against
 * @blknr:   first block of the cluster
 * @nr_used: number of leading blocks to leave alone; zeroing starts at
 *           @blknr + @nr_used
 * @bhs:     array of buffer heads; filling starts at index @nr_used, so
 *           the entries below that are presumably already populated by
 *           the caller (they are synced/released here together with the
 *           new ones) -- confirm against callers
 * @nr_bhs:  capacity of @bhs
 *
 * Returns 0 on success, -ENOMEM if sb_getblk() fails, or the error from
 * fat_sync_bhs() on a DIRSYNC directory.  On error every buffer currently
 * held in @bhs is dropped with bforget().
 */
static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
                              struct buffer_head **bhs, int nr_bhs)
{
        struct super_block *sb = dir->i_sb;
        sector_t last_blknr = blknr + MSDOS_SB(sb)->sec_per_clus;
        int err, i, n;

        /* Zeroing the unused blocks on this cluster */
        blknr += nr_used;
        n = nr_used;
        while (blknr < last_blknr) {
                bhs[n] = sb_getblk(sb, blknr);
                if (!bhs[n]) {
                        err = -ENOMEM;
                        goto error;
                }
                /* Avoid race with userspace read via bdev */
                lock_buffer(bhs[n]);
                memset(bhs[n]->b_data, 0, sb->s_blocksize);
                set_buffer_uptodate(bhs[n]);
                unlock_buffer(bhs[n]);
                mark_buffer_dirty_inode(bhs[n], dir);

                n++;
                blknr++;
                /* array full: write out (DIRSYNC only), release, and reuse it */
                if (n == nr_bhs) {
                        if (IS_DIRSYNC(dir)) {
                                err = fat_sync_bhs(bhs, n);
                                if (err)
                                        goto error;
                        }
                        for (i = 0; i < n; i++)
                                brelse(bhs[i]);
                        n = 0;
                }
        }
        /* flush and release whatever is left in the partially filled array */
        if (IS_DIRSYNC(dir)) {
                err = fat_sync_bhs(bhs, n);
                if (err)
                        goto error;
        }
        for (i = 0; i < n; i++)
                brelse(bhs[i]);

        return 0;

error:
        /* discard the buffers still held in bhs[0..n) */
        for (i = 0; i < n; i++)
                bforget(bhs[i]);
        return err;
}

/*
 * Allocate one cluster for a new directory under @dir and initialise it
 * with the "." and ".." entries, time-stamped from @ts.
 *
 * "." is pointed at the newly allocated cluster and ".." at
 * MSDOS_I(dir)->i_logstart.  The rest of the first block and the
 * remaining blocks of the cluster are zeroed.
 *
 * Returns the new cluster number on success or a negative errno; the
 * cluster is freed again if anything fails after it was allocated.
 */
int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts)
{
        struct super_block *sb = dir->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct buffer_head *bhs[MAX_BUF_PER_PAGE];
        struct msdos_dir_entry *de;
        sector_t blknr;
        __le16 date, time;
        u8 time_cs;
        int err, cluster;

        err = fat_alloc_clusters(dir, &cluster, 1);
        if (err)
                goto error;

        blknr = fat_clus_to_blknr(sbi, cluster);
        bhs[0] = sb_getblk(sb, blknr);
        if (!bhs[0]) {
                err = -ENOMEM;
                goto error_free;
        }

        fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);

        de = (struct msdos_dir_entry *)bhs[0]->b_data;
        /* Avoid race with userspace read via bdev */
        lock_buffer(bhs[0]);
        /* filling the new directory slots ("." and ".." entries) */
        memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME);
        memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME);
        de->attr = de[1].attr = ATTR_DIR;
        de[0].lcase = de[1].lcase = 0;
        de[0].time = de[1].time = time;
        de[0].date = de[1].date = date;
        if (sbi->options.isvfat) {
                /* extra timestamps */
                de[0].ctime = de[1].ctime = time;
                de[0].ctime_cs = de[1].ctime_cs = time_cs;
                de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = date;
        } else {
                /* without the vfat option the extra timestamp fields are cleared */
                de[0].ctime = de[1].ctime = 0;
                de[0].ctime_cs = de[1].ctime_cs = 0;
                de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0;
        }
        fat_set_start(&de[0], cluster);
        fat_set_start(&de[1], MSDOS_I(dir)->i_logstart);
        de[0].size = de[1].size = 0;
        /* clear everything in this block after the two entries */
        memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
        set_buffer_uptodate(bhs[0]);
        unlock_buffer(bhs[0]);
        mark_buffer_dirty_inode(bhs[0], dir);

        /* bhs[0] counts as the one used block; zero the rest of the cluster */
        err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
        if (err)
                goto error_free;

        return cluster;

error_free:
        fat_free_clusters(dir, cluster);
error:
        return err;
}
EXPORT_SYMBOL_GPL(fat_alloc_new_dir);

/*
 * Allocate fresh cluster(s) and write @nr_slots directory slots into them.
 *
 * @dir:        directory the clusters are allocated for
 * @slots:      source data, @nr_slots * sizeof(struct msdos_dir_entry) bytes
 * @nr_slots:   number of slots to write
 * @nr_cluster: out: number of clusters allocated (more than 2 triggers
 *              BUG_ON)
 * @de, @bh:    out: the last slot written and the buffer holding it; an
 *              extra reference is taken on *bh with get_bh()
 * @i_pos:      out: fat_make_i_pos() of that last slot
 *
 * The clusters are not linked into the directory here.
 *
 * Returns the first allocated cluster number on success or a negative
 * errno; on failure the allocated clusters are freed.
 */
static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
                               int *nr_cluster, struct msdos_dir_entry **de,
                               struct buffer_head **bh, loff_t *i_pos)
{
        struct super_block *sb = dir->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct buffer_head *bhs[MAX_BUF_PER_PAGE];
        sector_t blknr, start_blknr, last_blknr;
        unsigned long size, copy;
        int err, i, n, offset, cluster[2];

        /*
         * The minimum cluster size is 512bytes, and maximum entry
         * size is 32*slots (672bytes).  So, iff the cluster size is
         * 512bytes, we may need two clusters.
         */
        size = nr_slots * sizeof(struct msdos_dir_entry);
        *nr_cluster = (size + (sbi->cluster_size - 1)) >> sbi->cluster_bits;
        BUG_ON(*nr_cluster > 2);

        err = fat_alloc_clusters(dir, cluster, *nr_cluster);
        if (err)
                goto error;

        /*
         * First stage: Fill the directory entry.  NOTE: This cluster
         * is not referenced from any inode yet, so updates order is
         * not important.
         */
        i = n = copy = 0;
        do {
                start_blknr = blknr = fat_clus_to_blknr(sbi, cluster[i]);
                last_blknr = start_blknr + sbi->sec_per_clus;
                while (blknr < last_blknr) {
                        bhs[n] = sb_getblk(sb, blknr);
                        if (!bhs[n]) {
                                err = -ENOMEM;
                                goto error_nomem;
                        }

                        /* fill the directory entry */
                        copy = min(size, sb->s_blocksize);
                        /* Avoid race with userspace read via bdev */
                        lock_buffer(bhs[n]);
                        memcpy(bhs[n]->b_data, slots, copy);
                        set_buffer_uptodate(bhs[n]);
                        unlock_buffer(bhs[n]);
                        mark_buffer_dirty_inode(bhs[n], dir);
                        slots += copy;
                        size -= copy;
                        /* all data written: n stays on the last buffer used */
                        if (!size)
                                break;
                        n++;
                        blknr++;
                }
        } while (++i < *nr_cluster);

        /* clear the tail of the last block, then publish the last slot */
        memset(bhs[n]->b_data + copy, 0, sb->s_blocksize - copy);
        offset = copy - sizeof(struct msdos_dir_entry);
        get_bh(bhs[n]);
        *bh = bhs[n];
        *de = (struct msdos_dir_entry *)((*bh)->b_data + offset);
        *i_pos = fat_make_i_pos(sb, *bh, *de);

        /* Second stage: clear the rest of cluster, and write outs */
        err = fat_zeroed_cluster(dir, start_blknr, ++n, bhs, MAX_BUF_PER_PAGE);
        if (err)
                goto error_free;

        return cluster[0];

error_free:
        /*
         * Drop the extra reference taken above.  n is reset so the loop
         * below does not touch bhs[] again after fat_zeroed_cluster()
         * failed.
         */
        brelse(*bh);
        *bh = NULL;
        n = 0;
error_nomem:
        for (i = 0; i < n; i++)
                bforget(bhs[i]);
        fat_free_clusters(dir, cluster[0]);
error:
        return err;
}

/*
 * Add @nr_slots directory slots, taken from @slots, to @dir.
 *
 * The directory is first scanned for a run of consecutive free slots.
 * Whatever part of the request fits into that run is written there; any
 * remainder goes into newly allocated cluster(s), which are then chained
 * onto the directory and accounted in i_size / mmu_private.
 *
 * On success returns 0 with @sinfo describing the new entry (nr_slots,
 * slot_off, de, bh, i_pos).  On failure returns a negative errno; slots
 * that were already written into free space are marked deleted again.
 */
int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
                    struct fat_slot_info *sinfo)
{
        struct super_block *sb = dir->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */
        struct msdos_dir_entry *de;
        int err, free_slots, i, nr_bhs;
        loff_t pos, i_pos;

        sinfo->nr_slots = nr_slots;

        /* First stage: search free directory entries */
        free_slots = nr_bhs = 0;
        bh = prev = NULL;
        pos = 0;
        err = -ENOSPC;
        while (fat_get_entry(dir, &pos, &bh, &de) > -1) {
                /* check the maximum size of directory */
                if (pos >= FAT_MAX_DIR_SIZE)
                        goto error;

                if (IS_FREE(de->name)) {
                        /* pin each distinct buffer of the current free run once */
                        if (prev != bh) {
                                get_bh(bh);
                                bhs[nr_bhs] = prev = bh;
                                nr_bhs++;
                        }
                        free_slots++;
                        if (free_slots == nr_slots)
                                goto found;
                } else {
                        /* run broken by a used slot: unpin and start over */
                        for (i = 0; i < nr_bhs; i++)
                                brelse(bhs[i]);
                        prev = NULL;
                        free_slots = nr_bhs = 0;
                }
        }
        /* end of directory reached without a large enough free run */
        if (dir->i_ino == MSDOS_ROOT_INO) {
                if (!is_fat32(sbi))
                        goto error;
        } else if (MSDOS_I(dir)->i_start == 0) {
                fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)",
                       MSDOS_I(dir)->i_pos);
                err = -EIO;
                goto error;
        }

found:
        err = 0;
        /* rewind pos to the first free slot; nr_slots becomes the shortfall */
        pos -= free_slots * sizeof(*de);
        nr_slots -= free_slots;
        if (free_slots) {
                /*
                 * Second stage: filling the free entries with new entries.
                 * NOTE: If this slots has shortname, first, we write
                 * the long name slots, then write the short name.
                 */
                int size = free_slots * sizeof(*de);
                int offset = pos & (sb->s_blocksize - 1);
                /*
                 * When everything fits (nr_slots == 0) the last buffer is
                 * held back here and filled by the short-name step below.
                 */
                int long_bhs = nr_bhs - (nr_slots == 0);

                /* Fill the long name slots. */
                for (i = 0; i < long_bhs; i++) {
                        int copy = min_t(int, sb->s_blocksize - offset, size);
                        memcpy(bhs[i]->b_data + offset, slots, copy);
                        mark_buffer_dirty_inode(bhs[i], dir);
                        offset = 0;
                        slots += copy;
                        size -= copy;
                }
                if (long_bhs && IS_DIRSYNC(dir))
                        err = fat_sync_bhs(bhs, long_bhs);
                if (!err && i < nr_bhs) {
                        /* Fill the short name slot. */
                        int copy = min_t(int, sb->s_blocksize - offset, size);
                        memcpy(bhs[i]->b_data + offset, slots, copy);
                        mark_buffer_dirty_inode(bhs[i], dir);
                        if (IS_DIRSYNC(dir))
                                err = sync_dirty_buffer(bhs[i]);
                }
                /* drop the references taken with get_bh() during the scan */
                for (i = 0; i < nr_bhs; i++)
                        brelse(bhs[i]);
                if (err)
                        goto error_remove;
        }

        if (nr_slots) {
                int cluster, nr_cluster;

                /*
                 * Third stage: allocate the cluster for new entries.
                 * And initialize the cluster with new entries, then
                 * add the cluster to dir.
                 */
                cluster = fat_add_new_entries(dir, slots, nr_slots, &nr_cluster,
                                              &de, &bh, &i_pos);
                if (cluster < 0) {
                        err = cluster;
                        goto error_remove;
                }
                err = fat_chain_add(dir, cluster, nr_cluster);
                if (err) {
                        fat_free_clusters(dir, cluster);
                        goto error_remove;
                }
                /* i_size not cluster aligned: report it and round up */
                if (dir->i_size & (sbi->cluster_size - 1)) {
                        fat_fs_error(sb, "Odd directory size");
                        dir->i_size = (dir->i_size + sbi->cluster_size - 1)
                                & ~((loff_t)sbi->cluster_size - 1);
                }
                dir->i_size += nr_cluster << sbi->cluster_bits;
                MSDOS_I(dir)->mmu_private += nr_cluster << sbi->cluster_bits;
        }
        /* de/bh are the last slot written and the buffer that holds it */
        sinfo->slot_off = pos;
        sinfo->de = de;
        sinfo->bh = bh;
        sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de);

        return 0;

error:
        /* scan-stage failure: release the walk buffer and the pinned run */
        brelse(bh);
        for (i = 0; i < nr_bhs; i++)
                brelse(bhs[i]);
        return err;

error_remove:
        /* undo: mark the slots already written into free space as deleted */
        brelse(bh);
        if (free_slots)
                __fat_remove_entries(dir, pos, free_slots);
        return err;
}
EXPORT_SYMBOL_GPL(fat_add_entries);












    2 




















































































    2 




























    1 















































    2 










    2 























    1 
    2 



























    2 










































    2 





    2 





    2 























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/rmap.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include "internal.h"

/*
 * End the walk with page_vma_mapped_walk_done() and report "no mapping",
 * so callers can simply write `return not_found(pvmw);`.
 */
static inline bool not_found(struct page_vma_mapped_walk *pvmw)
{
        page_vma_mapped_walk_done(pvmw);
        return false;
}

/*
 * Map the PTE for pvmw->address and decide whether it is worth checking.
 *
 * With PVMW_SYNC the PTE is mapped and locked unconditionally through
 * pte_offset_map_lock().  Otherwise it is mapped without taking the lock,
 * inspected, and the lock is only taken when the entry is a candidate:
 *  - PVMW_MIGRATION set: any swap-type PTE;
 *  - PVMW_MIGRATION clear: a present PTE, or a device-private /
 *    device-exclusive swap entry.
 *
 * @ptlp receives the page table lock that corresponds to the mapped PTE.
 *
 * Returns true with pvmw->pte mapped and pvmw->ptl held.  Returns false
 * otherwise; on the unlocked path pvmw->pte may then still be set while
 * no lock is held.
 */
static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp)
{
        pte_t ptent;

        if (pvmw->flags & PVMW_SYNC) {
                /* Use the stricter lookup */
                pvmw->pte = pte_offset_map_lock(pvmw->vma->vm_mm, pvmw->pmd,
                                                pvmw->address, &pvmw->ptl);
                *ptlp = pvmw->ptl;
                return !!pvmw->pte;
        }

        /*
         * It is important to return the ptl corresponding to pte,
         * in case *pvmw->pmd changes underneath us; so we need to
         * return it even when choosing not to lock, in case caller
         * proceeds to loop over next ptes, and finds a match later.
         * Though, in most cases, page lock already protects this.
         */
        pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd,
                                          pvmw->address, ptlp);
        if (!pvmw->pte)
                return false;

        /* unlocked peek at the entry to filter out obvious non-matches */
        ptent = ptep_get(pvmw->pte);

        if (pvmw->flags & PVMW_MIGRATION) {
                if (!is_swap_pte(ptent))
                        return false;
        } else if (is_swap_pte(ptent)) {
                swp_entry_t entry;
                /*
                 * Handle un-addressable ZONE_DEVICE memory.
                 *
                 * We get here when we are trying to unmap a private
                 * device page from the process address space. Such
                 * page is not CPU accessible and thus is mapped as
                 * a special swap entry, nonetheless it still does
                 * count as a valid regular mapping for the page
                 * (and is accounted as such in page maps count).
                 *
                 * So handle this special case as if it was a normal
                 * page mapping ie lock CPU page table and return true.
                 *
                 * For more details on device private memory see HMM
                 * (include/linux/hmm.h or mm/hmm.c).
                 */
                entry = pte_to_swp_entry(ptent);
                if (!is_device_private_entry(entry) &&
                    !is_device_exclusive_entry(entry))
                        return false;
        } else if (!pte_present(ptent)) {
                return false;
        }
        /* candidate entry: take the lock that was reported through ptlp */
        pvmw->ptl = *ptlp;
        spin_lock(pvmw->ptl);
        return true;
}

/**
 * check_pte - check if [@pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) is
 * mapped at the @pvmw->pte
 * @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range
 * for checking
 *
 * page_vma_mapped_walk() found a place where the pfn range is
 * *potentially* mapped; check_pte() has to validate this.
 *
 * @pvmw->pte may point to an empty PTE, a swap PTE or a PTE pointing to
 * an arbitrary page.
 *
 * If the PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains
 * a migration (or device-exclusive) entry whose pfn lies in
 * [@pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages).
 *
 * If the PVMW_MIGRATION flag is not set, returns true if @pvmw->pte is a
 * present PTE, or a device-private / device-exclusive swap entry, whose
 * pfn lies in [@pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages).
 *
 * Otherwise, return false.
 */
static bool check_pte(struct page_vma_mapped_walk *pvmw)
{
        pte_t ptent = ptep_get(pvmw->pte);
        swp_entry_t entry;
        unsigned long pfn;

        if (!is_swap_pte(ptent)) {
                /* a migration walk only ever matches swap-type entries */
                if (pvmw->flags & PVMW_MIGRATION)
                        return false;
                if (!pte_present(ptent))
                        return false;

                pfn = pte_pfn(ptent);
        } else {
                entry = pte_to_swp_entry(ptent);

                if (pvmw->flags & PVMW_MIGRATION) {
                        if (!is_migration_entry(entry) &&
                            !is_device_exclusive_entry(entry))
                                return false;
                } else {
                        /* Handle un-addressable ZONE_DEVICE memory */
                        if (!is_device_private_entry(entry) &&
                            !is_device_exclusive_entry(entry))
                                return false;
                }

                pfn = swp_offset_pfn(entry);
        }

        return (pfn - pvmw->pfn) < pvmw->nr_pages;
}

/*
 * Returns true if the HPAGE_PMD_NR pfns starting at @pfn overlap the
 * range [pvmw->pfn, pvmw->pfn + pvmw->nr_pages).  Both comparisons use
 * inclusive last-pfn values so that no "end" value needs to be formed
 * that could overflow.
 */
static bool check_pmd(unsigned long pfn, struct page_vma_mapped_walk *pvmw)
{
        return (pfn + HPAGE_PMD_NR - 1) >= pvmw->pfn &&
               pfn <= pvmw->pfn + pvmw->nr_pages - 1;
}

/*
 * Advance pvmw->address to the next @size-aligned boundary (@size is
 * used as a mask, so it is expected to be a power of two).  If the
 * addition wraps to 0, the address is pinned at ULONG_MAX instead.
 */
static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
{
        unsigned long next = (pvmw->address + size) & ~(size - 1);

        pvmw->address = next ? next : ULONG_MAX;
}

/**
 * page_vma_mapped_walk - check if @pvmw->pfn is mapped in @pvmw->vma at
 * @pvmw->address
 * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags
 * must be set. pmd, pte and ptl must be NULL.
 *
 * NOTE(review): the body reads @pvmw->pfn and @pvmw->nr_pages rather than a
 * page pointer, so the "page" wording above looks stale - confirm against
 * the struct definition.
 *
 * Returns true if the page is mapped in the vma. @pvmw->pmd and @pvmw->pte point
 * to relevant page table entries. @pvmw->ptl is locked. @pvmw->address is
 * adjusted if needed (for PTE-mapped THPs).
 *
 * If @pvmw->pmd is set but @pvmw->pte is not, you have found PMD-mapped page
 * (usually THP). For PTE-mapped THP, you should run page_vma_mapped_walk() in
 * a loop to find all PTEs that map the THP.
 *
 * For HugeTLB pages, @pvmw->pte is set to the relevant page table entry
 * regardless of which page table level the page is mapped at. @pvmw->pmd is
 * NULL.
 *
 * Returns false if there are no more page table entries for the page in
 * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped.
 *
 * If you need to stop the walk before page_vma_mapped_walk() returned false,
 * use page_vma_mapped_walk_done(). It will do the housekeeping.
 */
bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
{
        struct vm_area_struct *vma = pvmw->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long end;
        spinlock_t *ptl;
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t pmde;

        /* The only possible pmd mapping has been handled on last iteration */
        if (pvmw->pmd && !pvmw->pte)
                return not_found(pvmw);

        /* HugeTLB VMAs: one hugetlb_walk() lookup, no level-by-level walk. */
        if (unlikely(is_vm_hugetlb_page(vma))) {
                struct hstate *hstate = hstate_vma(vma);
                unsigned long size = huge_page_size(hstate);
                /* The only possible mapping was handled on last iteration */
                if (pvmw->pte)
                        return not_found(pvmw);
                /*
                 * All callers that get here will already hold the
                 * i_mmap_rwsem.  Therefore, no additional locks need to be
                 * taken before calling hugetlb_walk().
                 */
                pvmw->pte = hugetlb_walk(vma, pvmw->address, size);
                if (!pvmw->pte)
                        return false;

                pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte);
                if (!check_pte(pvmw))
                        return not_found(pvmw);
                return true;
        }

        end = vma_address_end(pvmw);
        /*
         * A non-NULL pte on entry means an earlier call already returned a
         * PTE-level match; resume with the PTE that follows it.
         */
        if (pvmw->pte)
                goto next_pte;
restart:
        do {
                /*
                 * Descend pgd -> p4d -> pud.  When a level is not present,
                 * skip to the next boundary of that level and retry.
                 */
                pgd = pgd_offset(mm, pvmw->address);
                if (!pgd_present(*pgd)) {
                        step_forward(pvmw, PGDIR_SIZE);
                        continue;
                }
                p4d = p4d_offset(pgd, pvmw->address);
                if (!p4d_present(*p4d)) {
                        step_forward(pvmw, P4D_SIZE);
                        continue;
                }
                pud = pud_offset(p4d, pvmw->address);
                if (!pud_present(*pud)) {
                        step_forward(pvmw, PUD_SIZE);
                        continue;
                }

                pvmw->pmd = pmd_offset(pud, pvmw->address);
                /*
                 * Make sure the pmd value isn't cached in a register by the
                 * compiler and used as a stale value after we've observed a
                 * subsequent update.
                 */
                pmde = pmdp_get_lockless(pvmw->pmd);

                if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) ||
                    (pmd_present(pmde) && pmd_devmap(pmde))) {
                        /* Re-read the pmd under its lock before trusting it. */
                        pvmw->ptl = pmd_lock(mm, pvmw->pmd);
                        pmde = *pvmw->pmd;
                        if (!pmd_present(pmde)) {
                                swp_entry_t entry;

                                if (!thp_migration_supported() ||
                                    !(pvmw->flags & PVMW_MIGRATION))
                                        return not_found(pvmw);
                                entry = pmd_to_swp_entry(pmde);
                                if (!is_migration_entry(entry) ||
                                    !check_pmd(swp_offset_pfn(entry), pvmw))
                                        return not_found(pvmw);
                                return true;
                        }
                        if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) {
                                if (pvmw->flags & PVMW_MIGRATION)
                                        return not_found(pvmw);
                                if (!check_pmd(pmd_pfn(pmde), pvmw))
                                        return not_found(pvmw);
                                return true;
                        }
                        /* THP pmd was split under us: handle on pte level */
                        spin_unlock(pvmw->ptl);
                        pvmw->ptl = NULL;
                } else if (!pmd_present(pmde)) {
                        /*
                         * If PVMW_SYNC, take and drop THP pmd lock so that we
                         * cannot return prematurely, while zap_huge_pmd() has
                         * cleared *pmd but not decremented compound_mapcount().
                         */
                        if ((pvmw->flags & PVMW_SYNC) &&
                            thp_vma_suitable_order(vma, pvmw->address,
                                                   PMD_ORDER) &&
                            (pvmw->nr_pages >= HPAGE_PMD_NR)) {
                                spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);

                                spin_unlock(ptl);
                        }
                        step_forward(pvmw, PMD_SIZE);
                        continue;
                }
                /*
                 * map_pte() failed: restart the walk if no PTE ended up
                 * mapped, otherwise move on to the next PTE.
                 */
                if (!map_pte(pvmw, &ptl)) {
                        if (!pvmw->pte)
                                goto restart;
                        goto next_pte;
                }
this_pte:
                if (check_pte(pvmw))
                        return true;
next_pte:
                /* Advance one page at a time, skipping empty (pte_none) slots. */
                do {
                        pvmw->address += PAGE_SIZE;
                        if (pvmw->address >= end)
                                return not_found(pvmw);
                        /* Did we cross page table boundary? */
                        if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) {
                                if (pvmw->ptl) {
                                        spin_unlock(pvmw->ptl);
                                        pvmw->ptl = NULL;
                                }
                                pte_unmap(pvmw->pte);
                                pvmw->pte = NULL;
                                goto restart;
                        }
                        pvmw->pte++;
                } while (pte_none(ptep_get(pvmw->pte)));

                /*
                 * No lock held yet: take the one passed back through
                 * map_pte(pvmw, &ptl) - presumably the PTE lock; confirm in
                 * map_pte().
                 */
                if (!pvmw->ptl) {
                        pvmw->ptl = ptl;
                        spin_lock(pvmw->ptl);
                }
                goto this_pte;
        } while (pvmw->address < end);

        return false;
}

#ifdef CONFIG_MEMORY_FAILURE
/**
 * page_mapped_in_vma - check whether a page is really mapped in a VMA
 * @page: the page to test
 * @vma: the VMA to test
 *
 * Runs a single-page walk (nr_pages = 1, PVMW_SYNC) starting at the address
 * vma_address() computes for the page's offset.
 *
 * Return: The address the page is mapped at if the page is in the range
 * covered by the VMA and present in the page table.  If the page is
 * outside the VMA or not present, returns -EFAULT.
 * Only valid for normal file or anonymous VMAs.
 */
unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
{
        struct folio *folio = page_folio(page);
        pgoff_t pgoff = folio->index + folio_page_idx(folio, page);
        struct page_vma_mapped_walk pvmw = {
                .pfn = page_to_pfn(page),
                .nr_pages = 1,
                .vma = vma,
                .flags = PVMW_SYNC,
        };

        pvmw.address = vma_address(vma, pgoff, 1);
        if (pvmw.address != -EFAULT) {
                if (!page_vma_mapped_walk(&pvmw))
                        return -EFAULT;
                /* Found: let the walker release what it still holds. */
                page_vma_mapped_walk_done(&pvmw);
        }

        /* Either the mapped address or the -EFAULT from vma_address(). */
        return pvmw.address;
}
#endif


























    2 
    4 
    3 











































































































































    2 






















































































































































    5 






    2 









    5 



















    4 





    1 






















    1 




    4 





    1 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Definitions for diskquota-operations. When diskquota is configured these
 * macros expand to the right source-code.
 *
 * Author:  Marco van Wieringen <mvw@planets.elm.net>
 */
#ifndef _LINUX_QUOTAOPS_
#define _LINUX_QUOTAOPS_

#include <linux/fs.h>

/*
 * Flag bits for the 'flags' argument of __dquot_alloc_space() and
 * __dquot_free_space().  In the !CONFIG_QUOTA stubs below, RESERVE skips
 * the inode byte-count update; the meaning of WARN and NOFAIL is defined by
 * the real implementation, which is not in this header.
 */
#define DQUOT_SPACE_WARN        0x1
#define DQUOT_SPACE_RESERVE        0x2
#define DQUOT_SPACE_NOFAIL        0x4

/* Return the quota_info embedded in superblock @sb (its s_dquot member). */
static inline struct quota_info *sb_dqopt(struct super_block *sb)
{
        struct quota_info *dqopt = &sb->s_dquot;

        return dqopt;
}

/* i_mutex must being held */
static inline bool is_quota_modification(struct mnt_idmap *idmap,
                                         struct inode *inode, struct iattr *ia)
{
        return ((ia->ia_valid & ATTR_SIZE) ||
                i_uid_needs_update(idmap, ia, inode) ||
                i_gid_needs_update(idmap, ia, inode));
}

#if defined(CONFIG_QUOTA)

/*
 * quota_error() - convenience wrapper around __quota_error() that supplies
 * the calling function's name (__func__) as the 'func' argument.
 */
#define quota_error(sb, fmt, args...) \
        __quota_error((sb), __func__, fmt , ## args)

/* printf-style: the format string is argument 3, varargs start at 4. */
extern __printf(3, 4)
void __quota_error(struct super_block *sb, const char *func,
                   const char *fmt, ...);

/*
 * declaration of quota_function calls in kernel.
 */
int dquot_initialize(struct inode *inode);
bool dquot_initialize_needed(struct inode *inode);
void dquot_drop(struct inode *inode);
struct dquot *dqget(struct super_block *sb, struct kqid qid);
/*
 * Take an additional reference on @dquot and return it.  Warns (once) if
 * the count is currently zero or the dquot is not marked DQ_ACTIVE_B, i.e.
 * if nobody else appears to hold an active reference.
 */
static inline struct dquot *dqgrab(struct dquot *dquot)
{
        WARN_ON_ONCE(atomic_read(&dquot->dq_count) == 0);
        WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags));

        atomic_inc(&dquot->dq_count);

        return dquot;
}

/*
 * A dquot counts as busy when its DQ_MOD_B flag is set or its reference
 * count is above zero.
 */
static inline bool dquot_is_busy(struct dquot *dquot)
{
        return test_bit(DQ_MOD_B, &dquot->dq_flags) ||
               atomic_read(&dquot->dq_count) > 0;
}

void dqput(struct dquot *dquot);
int dquot_scan_active(struct super_block *sb,
                      int (*fn)(struct dquot *dquot, unsigned long priv),
                      unsigned long priv);
struct dquot *dquot_alloc(struct super_block *sb, int type);
void dquot_destroy(struct dquot *dquot);

int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags);
void __dquot_free_space(struct inode *inode, qsize_t number, int flags);

int dquot_alloc_inode(struct inode *inode);

void dquot_claim_space_nodirty(struct inode *inode, qsize_t number);
void dquot_free_inode(struct inode *inode);
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number);

int dquot_disable(struct super_block *sb, int type, unsigned int flags);
/*
 * Suspend quotas on remount RO.
 *
 * Thin wrapper: calls dquot_disable() with DQUOT_SUSPENDED and returns its
 * result unchanged.
 */
static inline int dquot_suspend(struct super_block *sb, int type)
{
        return dquot_disable(sb, type, DQUOT_SUSPENDED);
}
int dquot_resume(struct super_block *sb, int type);

int dquot_commit(struct dquot *dquot);
int dquot_acquire(struct dquot *dquot);
int dquot_release(struct dquot *dquot);
int dquot_commit_info(struct super_block *sb, int type);
int dquot_get_next_id(struct super_block *sb, struct kqid *qid);
int dquot_mark_dquot_dirty(struct dquot *dquot);

int dquot_file_open(struct inode *inode, struct file *file);

int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
        unsigned int flags);
int dquot_load_quota_inode(struct inode *inode, int type, int format_id,
        unsigned int flags);
int dquot_quota_on(struct super_block *sb, int type, int format_id,
        const struct path *path);
int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
         int format_id, int type);
int dquot_quota_off(struct super_block *sb, int type);
int dquot_writeback_dquots(struct super_block *sb, int type);
int dquot_quota_sync(struct super_block *sb, int type);
int dquot_get_state(struct super_block *sb, struct qc_state *state);
int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii);
int dquot_get_dqblk(struct super_block *sb, struct kqid id,
                struct qc_dqblk *di);
int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id,
                struct qc_dqblk *di);
int dquot_set_dqblk(struct super_block *sb, struct kqid id,
                struct qc_dqblk *di);

int __dquot_transfer(struct inode *inode, struct dquot **transfer_to);
int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode,
                   struct iattr *iattr);

static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type)
{
        return sb_dqopt(sb)->info + type;
}

/*
 * Functions for checking status of quota
 */

static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type)
{
        return sb_dqopt(sb)->flags &
                                dquot_state_flag(DQUOT_USAGE_ENABLED, type);
}

static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type)
{
        return sb_dqopt(sb)->flags &
                                dquot_state_flag(DQUOT_LIMITS_ENABLED, type);
}

static inline bool sb_has_quota_suspended(struct super_block *sb, int type)
{
        return sb_dqopt(sb)->flags &
                                dquot_state_flag(DQUOT_SUSPENDED, type);
}

static inline unsigned sb_any_quota_suspended(struct super_block *sb)
{
        return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_SUSPENDED);
}

/*
 * Does kernel know about any quota information for given sb + type?
 *
 * Implemented as an alias of sb_has_quota_usage_enabled().
 */
static inline bool sb_has_quota_loaded(struct super_block *sb, int type)
{
        /* Currently if anything is on, then quota usage is on as well */
        return sb_has_quota_usage_enabled(sb, type);
}

static inline unsigned sb_any_quota_loaded(struct super_block *sb)
{
        return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_USAGE_ENABLED);
}

static inline bool sb_has_quota_active(struct super_block *sb, int type)
{
        return sb_has_quota_loaded(sb, type) &&
               !sb_has_quota_suspended(sb, type);
}

/*
 * Operations supported for diskquotas.
 */
extern const struct dquot_operations dquot_operations;
extern const struct quotactl_ops dquot_quotactl_sysfile_ops;

#else

/* !CONFIG_QUOTA stub: quota usage is never enabled, always returns 0. */
static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type)
{
        return 0;
}

/* !CONFIG_QUOTA stub: quota limits are never enabled, always returns 0. */
static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type)
{
        return 0;
}

/* !CONFIG_QUOTA stub: quota is never suspended, always returns 0. */
static inline int sb_has_quota_suspended(struct super_block *sb, int type)
{
        return 0;
}

/* !CONFIG_QUOTA stub: no quota type is ever suspended, always returns 0. */
static inline int sb_any_quota_suspended(struct super_block *sb)
{
        return 0;
}

/*
 * Does kernel know about any quota information for given sb + type?
 * !CONFIG_QUOTA stub: never, always returns 0.
 */
static inline int sb_has_quota_loaded(struct super_block *sb, int type)
{
        return 0;
}

/* !CONFIG_QUOTA stub: no quota type is ever loaded, always returns 0. */
static inline int sb_any_quota_loaded(struct super_block *sb)
{
        return 0;
}

/* !CONFIG_QUOTA stub: quota is never active, always returns 0. */
static inline int sb_has_quota_active(struct super_block *sb, int type)
{
        return 0;
}

/* !CONFIG_QUOTA stub: nothing to initialize, reports success (0). */
static inline int dquot_initialize(struct inode *inode)
{
        return 0;
}

/* !CONFIG_QUOTA stub: initialization is never needed, returns false. */
static inline bool dquot_initialize_needed(struct inode *inode)
{
        return false;
}

/* !CONFIG_QUOTA stub: no-op. */
static inline void dquot_drop(struct inode *inode)
{
}

/* !CONFIG_QUOTA stub: no accounting is done, reports success (0). */
static inline int dquot_alloc_inode(struct inode *inode)
{
        return 0;
}

/* !CONFIG_QUOTA stub: no-op. */
static inline void dquot_free_inode(struct inode *inode)
{
}

/* !CONFIG_QUOTA stub: nothing to transfer, reports success (0). */
static inline int dquot_transfer(struct mnt_idmap *idmap,
                                 struct inode *inode, struct iattr *iattr)
{
        return 0;
}

/*
 * !CONFIG_QUOTA stub: no quota accounting.  Unless DQUOT_SPACE_RESERVE is
 * set in @flags, @number bytes are added to the inode's byte count.
 * Always returns 0.
 */
static inline int __dquot_alloc_space(struct inode *inode, qsize_t number,
                int flags)
{
        if (flags & DQUOT_SPACE_RESERVE)
                return 0;

        inode_add_bytes(inode, number);
        return 0;
}

/*
 * !CONFIG_QUOTA stub: no quota accounting.  Unless DQUOT_SPACE_RESERVE is
 * set in @flags, @number bytes are subtracted from the inode's byte count.
 */
static inline void __dquot_free_space(struct inode *inode, qsize_t number,
                int flags)
{
        if (flags & DQUOT_SPACE_RESERVE)
                return;

        inode_sub_bytes(inode, number);
}

/* !CONFIG_QUOTA stub: only adds @number bytes to the inode's byte count. */
static inline void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
        inode_add_bytes(inode, number);
}

/*
 * !CONFIG_QUOTA stub: only subtracts @number bytes from the inode's byte
 * count and returns 0.
 *
 * NOTE(review): the CONFIG_QUOTA prototype of this function is declared
 * void, while this stub returns int.  The return types should probably be
 * unified - confirm no caller relies on the int result.
 */
static inline int dquot_reclaim_space_nodirty(struct inode *inode,
                                              qsize_t number)
{
        inode_sub_bytes(inode, number);
        return 0;
}

/* !CONFIG_QUOTA stub: nothing to disable, reports success (0). */
static inline int dquot_disable(struct super_block *sb, int type,
                unsigned int flags)
{
        return 0;
}

/* !CONFIG_QUOTA stub: nothing to suspend, reports success (0). */
static inline int dquot_suspend(struct super_block *sb, int type)
{
        return 0;
}

/* !CONFIG_QUOTA stub: nothing to resume, reports success (0). */
static inline int dquot_resume(struct super_block *sb, int type)
{
        return 0;
}

/* !CONFIG_QUOTA: dquot_file_open is simply an alias of generic_file_open. */
#define dquot_file_open                generic_file_open

/* !CONFIG_QUOTA stub: nothing to write back, reports success (0). */
static inline int dquot_writeback_dquots(struct super_block *sb, int type)
{
        return 0;
}

#endif /* CONFIG_QUOTA */

/*
 * Allocate @nr bytes for @inode via __dquot_alloc_space() with
 * DQUOT_SPACE_WARN.  Does not mark the inode dirty itself.  Returns the
 * result of __dquot_alloc_space().
 */
static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr)
{
        return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN);
}

/*
 * Allocate @nr bytes for @inode with DQUOT_SPACE_WARN | DQUOT_SPACE_NOFAIL,
 * ignoring the return value, then mark the inode dirty (sync variant).
 */
static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr)
{
        const int flags = DQUOT_SPACE_WARN | DQUOT_SPACE_NOFAIL;

        __dquot_alloc_space(inode, nr, flags);
        mark_inode_dirty_sync(inode);
}

/*
 * Allocate @nr bytes for @inode and, on success, mark the inode dirty.
 * Returns 0 on success or the error from dquot_alloc_space_nodirty().
 */
static inline int dquot_alloc_space(struct inode *inode, qsize_t nr)
{
        int err = dquot_alloc_space_nodirty(inode, nr);

        if (err)
                return err;

        /*
         * Mark inode fully dirty. Since we are allocating blocks, inode
         * would become fully dirty soon anyway and it reportedly
         * reduces lock contention.
         */
        mark_inode_dirty(inode);
        return 0;
}

/*
 * Block-count variant of dquot_alloc_space_nodirty(): @nr blocks are
 * converted to bytes by shifting left by inode->i_blkbits.
 */
static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        return dquot_alloc_space_nodirty(inode, bytes);
}

/*
 * Block-count variant of dquot_alloc_space_nofail(): @nr blocks are
 * converted to bytes by shifting left by inode->i_blkbits.
 */
static inline void dquot_alloc_block_nofail(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_alloc_space_nofail(inode, bytes);
}

/*
 * Block-count variant of dquot_alloc_space(): @nr blocks are converted to
 * bytes by shifting left by inode->i_blkbits.
 */
static inline int dquot_alloc_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        return dquot_alloc_space(inode, bytes);
}

/*
 * Allocate @nr blocks (converted to bytes via inode->i_blkbits) with no
 * flags set, i.e. without DQUOT_SPACE_WARN.  Does not dirty the inode.
 */
static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        return __dquot_alloc_space(inode, bytes, 0);
}

/*
 * Like dquot_prealloc_block_nodirty(), but on success also marks the inode
 * dirty (sync variant).  Returns 0 or the error from the allocation.
 */
static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr)
{
        int err = dquot_prealloc_block_nodirty(inode, nr);

        if (err)
                return err;

        mark_inode_dirty_sync(inode);
        return 0;
}

/*
 * Reserve @nr blocks (converted to bytes via inode->i_blkbits) by calling
 * __dquot_alloc_space() with DQUOT_SPACE_WARN | DQUOT_SPACE_RESERVE.
 */
static inline int dquot_reserve_block(struct inode *inode, qsize_t nr)
{
        const int flags = DQUOT_SPACE_WARN | DQUOT_SPACE_RESERVE;
        qsize_t bytes = nr << inode->i_blkbits;

        return __dquot_alloc_space(inode, bytes, flags);
}

/*
 * Claim @nr blocks (converted to bytes via inode->i_blkbits) through
 * dquot_claim_space_nodirty(), then mark the inode dirty (sync variant).
 */
static inline void dquot_claim_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_claim_space_nodirty(inode, bytes);
        mark_inode_dirty_sync(inode);
}

/*
 * Reclaim @nr blocks (converted to bytes via inode->i_blkbits) through
 * dquot_reclaim_space_nodirty(), then mark the inode dirty (sync variant).
 */
static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_reclaim_space_nodirty(inode, bytes);
        mark_inode_dirty_sync(inode);
}

/*
 * Free @nr bytes for @inode via __dquot_free_space() with no flags set.
 * Does not mark the inode dirty itself.
 */
static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr)
{
        __dquot_free_space(inode, nr, 0);
}

/* Free @nr bytes for @inode, then mark the inode dirty (sync variant). */
static inline void dquot_free_space(struct inode *inode, qsize_t nr)
{
        __dquot_free_space(inode, nr, 0);
        mark_inode_dirty_sync(inode);
}

/*
 * Block-count variant of dquot_free_space_nodirty(): @nr blocks are
 * converted to bytes by shifting left by inode->i_blkbits.
 */
static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_free_space_nodirty(inode, bytes);
}

/*
 * Block-count variant of dquot_free_space(): @nr blocks are converted to
 * bytes by shifting left by inode->i_blkbits.
 */
static inline void dquot_free_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_free_space(inode, bytes);
}

/*
 * Release a reservation of @nr blocks (converted to bytes via
 * inode->i_blkbits) by calling __dquot_free_space() with
 * DQUOT_SPACE_RESERVE.
 */
static inline void dquot_release_reservation_block(struct inode *inode,
                qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        __dquot_free_space(inode, bytes, DQUOT_SPACE_RESERVE);
}

unsigned int qtype_enforce_flag(int type);

#endif /* _LINUX_QUOTAOPS_ */























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 












    1 



























    1 



    1 




































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
// SPDX-License-Identifier: GPL-2.0
/*
 * Basic worker thread pool for io_uring
 *
 * Copyright (C) 2019 Jens Axboe
 *
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sched/signal.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/rculist_nulls.h>
#include <linux/cpu.h>
#include <linux/task_work.h>
#include <linux/audit.h>
#include <linux/mmu_context.h>
#include <uapi/linux/io_uring.h>

#include "io-wq.h"
#include "slist.h"
#include "io_uring.h"

/* 5 * HZ jiffies, i.e. five seconds; the users are outside this view. */
#define WORKER_IDLE_TIMEOUT        (5 * HZ)

/*
 * Bit numbers (not masks) for io_worker->flags; they are tested with
 * test_bit() below.
 */
enum {
        IO_WORKER_F_UP                = 0,        /* up and active */
        IO_WORKER_F_RUNNING        = 1,        /* account as running */
        IO_WORKER_F_FREE        = 2,        /* worker on free list */
        IO_WORKER_F_BOUND        = 3,        /* is doing bounded work */
};

/* Bit numbers for io_wq->state (see io_wq_worker_stopped()). */
enum {
        IO_WQ_BIT_EXIT                = 0,        /* wq exiting */
};

/* Bit numbers, presumably for io_wq_acct->flags - TODO confirm at the users. */
enum {
        IO_ACCT_STALLED_BIT        = 0,        /* stalled on hash */
};

/*
 * One for each thread in a wq pool
 */
struct io_worker {
        /* Reference count: io_worker_get() takes, io_worker_release() drops. */
        refcount_t ref;
        int create_index;
        /* IO_WORKER_F_* bits */
        unsigned long flags;
        /* Unlinked with hlist_nulls_del_rcu() on exit if IO_WORKER_F_FREE is set. */
        struct hlist_nulls_node nulls_node;
        /* Unlinked with list_del_rcu() under wq->lock on exit. */
        struct list_head all_list;
        struct task_struct *task;
        /* The io_wq this worker belongs to. */
        struct io_wq *wq;

        struct io_wq_work *cur_work;
        raw_spinlock_t lock;

        /* Completed by io_worker_release() when ref drops to zero. */
        struct completion ref_done;

        /* Bit 0 is released with clear_bit_unlock() in io_worker_cancel_cb(). */
        unsigned long create_state;
        /* task_work callback head; identified by io_task_worker_match(). */
        struct callback_head create_work;

        union {
                struct rcu_head rcu;
                struct work_struct work;
        };
};

/*
 * Hash order follows the word size: 6 (64 buckets) when BITS_PER_LONG is
 * 64, otherwise 5 (32 buckets), so the bucket count equals the number of
 * bits in a long for those two sizes.
 */
#if BITS_PER_LONG == 64
#define IO_WQ_HASH_ORDER        6
#else
#define IO_WQ_HASH_ORDER        5
#endif

/* Number of hash buckets; sizes io_wq->hash_tail[]. */
#define IO_WQ_NR_HASH_BUCKETS        (1u << IO_WQ_HASH_ORDER)

/*
 * Per-class worker accounting; struct io_wq holds one per IO_WQ_ACCT_*
 * index.
 */
struct io_wq_acct {
        /* Decremented under wq->lock in io_worker_cancel_cb(). */
        unsigned nr_workers;
        unsigned max_workers;
        int index;
        /* Atomic; decremented in io_worker_cancel_cb(). */
        atomic_t nr_running;
        raw_spinlock_t lock;
        struct io_wq_work_list work_list;
        /* presumably holds IO_ACCT_*_BIT bits - TODO confirm */
        unsigned long flags;
};

/* Indices into io_wq->acct[]; IO_WQ_ACCT_NR is the array size. */
enum {
        IO_WQ_ACCT_BOUND,
        IO_WQ_ACCT_UNBOUND,
        IO_WQ_ACCT_NR,
};

/*
 * Per io_wq state
 */
struct io_wq {
        /* IO_WQ_BIT_* bits */
        unsigned long state;

        free_work_fn *free_work;
        io_wq_work_fn *do_work;

        struct io_wq_hash *hash;

        /* Dropped via io_worker_ref_put(); reaching zero completes worker_done. */
        atomic_t worker_refs;
        struct completion worker_done;

        struct hlist_node cpuhp_node;

        /* Task whose task_work is searched for pending worker creation. */
        struct task_struct *task;

        /* One accounting slot per IO_WQ_ACCT_* class. */
        struct io_wq_acct acct[IO_WQ_ACCT_NR];

        /* lock protects access to elements below */
        raw_spinlock_t lock;

        struct hlist_nulls_head free_list;
        struct list_head all_list;

        struct wait_queue_entry wait;

        struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];

        cpumask_var_t cpu_mask;
};

/* CPU hotplug state for io-wq; it is assigned outside this view. */
static enum cpuhp_state io_wq_online;

/*
 * Argument bundle for cancellation (passed as 'match' to
 * io_acct_cancel_pending_work()).
 * NOTE(review): field meanings below are inferred from names - confirm
 * against the cancel paths.
 */
struct io_cb_cancel_data {
        work_cancel_fn *fn;        /* match callback */
        void *data;                /* presumably the opaque argument for fn */
        int nr_running;                /* presumably count of running matches */
        int nr_pending;                /* presumably count of pending matches */
        bool cancel_all;
};

static bool create_io_worker(struct io_wq *wq, int index);
static void io_wq_dec_running(struct io_worker *worker);
static bool io_acct_cancel_pending_work(struct io_wq *wq,
                                        struct io_wq_acct *acct,
                                        struct io_cb_cancel_data *match);
static void create_worker_cb(struct callback_head *cb);
static void io_wq_cancel_tw_create(struct io_wq *wq);

/*
 * Try to take a reference on @worker.  Returns false if the refcount was
 * already zero, in which case no reference is taken.
 */
static bool io_worker_get(struct io_worker *worker)
{
        return refcount_inc_not_zero(&worker->ref);
}

/*
 * Drop a reference on @worker; the final put signals worker->ref_done.
 */
static void io_worker_release(struct io_worker *worker)
{
        if (!refcount_dec_and_test(&worker->ref))
                return;

        complete(&worker->ref_done);
}

/* Pick the bound or unbound accounting slot of @wq. */
static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound)
{
        if (bound)
                return &wq->acct[IO_WQ_ACCT_BOUND];
        return &wq->acct[IO_WQ_ACCT_UNBOUND];
}

static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq,
                                                  struct io_wq_work *work)
{
        return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND));
}

static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker)
{
        return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags));
}

/*
 * Drop one reference from wq->worker_refs; the final put completes
 * wq->worker_done.
 */
static void io_worker_ref_put(struct io_wq *wq)
{
        if (!atomic_dec_and_test(&wq->worker_refs))
                return;

        complete(&wq->worker_done);
}

/*
 * Tell whether the io-wq that the current worker task belongs to has
 * IO_WQ_BIT_EXIT set. Warns once and reports "stopped" when called from a
 * task that is not an io-wq worker.
 */
bool io_wq_worker_stopped(void)
{
        struct io_worker *self;

        if (WARN_ON_ONCE(!io_wq_current_is_worker()))
                return true;

        self = current->worker_private;
        return test_bit(IO_WQ_BIT_EXIT, &self->wq->state);
}

/*
 * Clean up after a worker-creation task_work of @worker that was canceled
 * before it ran (io_worker_exit() calls this once per canceled item).
 * Drops the nr_running and nr_workers counts, the wq worker reference, the
 * create_state ownership bit and the worker reference tied to that item.
 */
static void io_worker_cancel_cb(struct io_worker *worker)
{
        struct io_wq_acct *acct = io_wq_get_acct(worker);
        struct io_wq *wq = worker->wq;

        atomic_dec(&acct->nr_running);
        /* nr_workers is protected by wq->lock */
        raw_spin_lock(&wq->lock);
        acct->nr_workers--;
        raw_spin_unlock(&wq->lock);
        io_worker_ref_put(wq);
        /* create_work is free to be reused again */
        clear_bit_unlock(0, &worker->create_state);
        io_worker_release(worker);
}

/*
 * task_work match callback: true if @cb is a create_worker_cb item that is
 * embedded in the worker passed in via @data.
 */
static bool io_task_worker_match(struct callback_head *cb, void *data)
{
        if (cb->func == create_worker_cb) {
                struct io_worker *owner;

                owner = container_of(cb, struct io_worker, create_work);
                return owner == data;
        }

        return false;
}

/*
 * Final teardown of the calling worker thread. Cancels any creation
 * task_work still queued on wq->task on behalf of this worker, waits for
 * all other references to the worker to go away, unlinks it from the wq
 * lists, frees it after an RCU grace period and exits the thread.
 * Does not return.
 */
static void io_worker_exit(struct io_worker *worker)
{
        struct io_wq *wq = worker->wq;

        /* undo every create_worker_cb item that is still pending for us */
        while (1) {
                struct callback_head *cb = task_work_cancel_match(wq->task,
                                                io_task_worker_match, worker);

                if (!cb)
                        break;
                io_worker_cancel_cb(worker);
        }

        /* drop our own reference, then wait until everyone else has too */
        io_worker_release(worker);
        wait_for_completion(&worker->ref_done);

        raw_spin_lock(&wq->lock);
        /* only on the free list if IO_WORKER_F_FREE is set */
        if (test_bit(IO_WORKER_F_FREE, &worker->flags))
                hlist_nulls_del_rcu(&worker->nulls_node);
        list_del_rcu(&worker->all_list);
        raw_spin_unlock(&wq->lock);
        io_wq_dec_running(worker);
        /*
         * this worker is a goner, clear ->worker_private to avoid any
         * inc/dec running calls that could happen as part of exit from
         * touching 'worker'.
         */
        current->worker_private = NULL;

        /* lists above are walked under RCU, so defer the actual free */
        kfree_rcu(worker, rcu);
        io_worker_ref_put(wq);
        do_exit(0);
}

static inline bool __io_acct_run_queue(struct io_wq_acct *acct)
{
        return !test_bit(IO_ACCT_STALLED_BIT, &acct->flags) &&
                !wq_list_empty(&acct->work_list);
}

/*
 * Check for runnable work under acct->lock. Returns true with acct->lock
 * still held when there is work to do; otherwise the lock is dropped again
 * and false is returned.
 */
static inline bool io_acct_run_queue(struct io_wq_acct *acct)
        __acquires(&acct->lock)
{
        raw_spin_lock(&acct->lock);
        if (!__io_acct_run_queue(acct)) {
                raw_spin_unlock(&acct->lock);
                return false;
        }

        return true;
}

/*
 * Look through the free list for an idle worker that belongs to @acct and
 * wake it. Returns true if one was woken; on false the caller has to
 * create a worker itself.
 */
static bool io_wq_activate_free_worker(struct io_wq *wq,
                                        struct io_wq_acct *acct)
        __must_hold(RCU)
{
        struct hlist_nulls_node *pos;
        struct io_worker *candidate;

        /*
         * A worker may sit on free_list while it is already exiting; such
         * a worker can no longer be referenced, so skip it and keep
         * looking.
         */
        hlist_nulls_for_each_entry_rcu(candidate, pos, &wq->free_list, nulls_node) {
                bool usable;

                if (!io_worker_get(candidate))
                        continue;

                usable = io_wq_get_acct(candidate) == acct;
                /*
                 * If the worker is already running, it's either starting
                 * or finishing work. In either case, if it does go to
                 * sleep, a new task gets kicked off for this work anyway.
                 */
                if (usable)
                        wake_up_process(candidate->task);
                io_worker_release(candidate);
                if (usable)
                        return true;
        }

        return false;
}

/*
 * We need a worker. If we're below the max number of workers for @acct,
 * account for a new one and create it.
 *
 * Returns true if the worker limit has already been reached (nothing is
 * created), otherwise the result of create_io_worker().
 */
static bool io_wq_create_worker(struct io_wq *wq, struct io_wq_acct *acct)
{
        /*
         * Most likely an attempt to queue unbounded work on an io_wq that
         * wasn't setup with any unbounded workers.
         */
        if (unlikely(!acct->max_workers))
                pr_warn_once("io-wq is not configured for unbound workers\n");

        /* nr_workers is protected by wq->lock */
        raw_spin_lock(&wq->lock);
        if (acct->nr_workers >= acct->max_workers) {
                raw_spin_unlock(&wq->lock);
                return true;
        }
        acct->nr_workers++;
        raw_spin_unlock(&wq->lock);
        /* create_io_worker() drops both of these again if it fails */
        atomic_inc(&acct->nr_running);
        atomic_inc(&wq->worker_refs);
        return create_io_worker(wq, acct->index);
}

static void io_wq_inc_running(struct io_worker *worker)
{
        struct io_wq_acct *acct = io_wq_get_acct(worker);

        atomic_inc(&acct->nr_running);
}

/*
 * task_work callback queued through io_queue_worker_create(). Creates a
 * new worker for the account recorded in ->create_index if that account is
 * still below its worker limit; otherwise gives back the nr_running count
 * and wq reference. In both cases the requesting worker's create_state bit
 * and reference are released.
 */
static void create_worker_cb(struct callback_head *cb)
{
        struct io_worker *worker = container_of(cb, struct io_worker,
                                                create_work);
        struct io_wq *wq = worker->wq;
        struct io_wq_acct *acct = &wq->acct[worker->create_index];
        bool below_limit;

        raw_spin_lock(&wq->lock);
        below_limit = acct->nr_workers < acct->max_workers;
        if (below_limit)
                acct->nr_workers++;
        raw_spin_unlock(&wq->lock);

        if (!below_limit) {
                atomic_dec(&acct->nr_running);
                io_worker_ref_put(wq);
        } else {
                create_io_worker(wq, worker->create_index);
        }

        clear_bit_unlock(0, &worker->create_state);
        io_worker_release(worker);
}

/*
 * Queue @func as task_work on wq->task, using @worker's embedded
 * create_work, to get a worker created for @acct.
 *
 * Returns true if the task_work was added. On any failure one
 * acct->nr_running count and one wq->worker_refs reference are dropped
 * and false is returned.
 */
static bool io_queue_worker_create(struct io_worker *worker,
                                   struct io_wq_acct *acct,
                                   task_work_func_t func)
{
        struct io_wq *wq = worker->wq;

        /* raced with exit, just ignore create call */
        if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
                goto fail;
        if (!io_worker_get(worker))
                goto fail;
        /*
         * create_state manages ownership of create_work/index. We should
         * only need one entry per worker, as the worker going to sleep
         * will trigger the condition, and waking will clear it once it
         * runs the task_work.
         */
        if (test_bit(0, &worker->create_state) ||
            test_and_set_bit_lock(0, &worker->create_state))
                goto fail_release;

        /* temporary wq reference, dropped again on both paths below */
        atomic_inc(&wq->worker_refs);
        init_task_work(&worker->create_work, func);
        worker->create_index = acct->index;
        if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
                /*
                 * EXIT may have been set after checking it above, check after
                 * adding the task_work and remove any creation item if it is
                 * now set. wq exit does that too, but we can have added this
                 * work item after we canceled in io_wq_exit_workers().
                 */
                if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
                        io_wq_cancel_tw_create(wq);
                io_worker_ref_put(wq);
                return true;
        }
        /* task_work_add() failed: unwind in reverse order of acquisition */
        io_worker_ref_put(wq);
        clear_bit_unlock(0, &worker->create_state);
fail_release:
        io_worker_release(worker);
fail:
        atomic_dec(&acct->nr_running);
        io_worker_ref_put(wq);
        return false;
}

/*
 * @worker stops running. If that leaves its account with no running worker
 * while there is still runnable work queued, request creation of another
 * worker via task_work.
 */
static void io_wq_dec_running(struct io_worker *worker)
{
        struct io_wq_acct *acct = io_wq_get_acct(worker);

        if (!test_bit(IO_WORKER_F_UP, &worker->flags))
                return;

        /*
         * Only act when we dropped the last running count and the queue has
         * work; io_acct_run_queue() returns with acct->lock held then.
         */
        if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
                raw_spin_unlock(&acct->lock);
                /* taken on behalf of the worker that is to be created */
                atomic_inc(&acct->nr_running);
                atomic_inc(&worker->wq->worker_refs);
                io_queue_worker_create(worker, acct, create_worker_cb);
        }
}

/*
 * Worker is about to process work: if it is currently marked free, clear
 * that state and take it off the free list (under wq->lock).
 */
static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker)
{
        if (!test_bit(IO_WORKER_F_FREE, &worker->flags))
                return;

        clear_bit(IO_WORKER_F_FREE, &worker->flags);
        raw_spin_lock(&wq->lock);
        hlist_nulls_del_init_rcu(&worker->nulls_node);
        raw_spin_unlock(&wq->lock);
}

/*
 * Worker has nothing to do and is going to sleep: mark it free and put it
 * at the head of the free list, unless it is marked free already.
 */
static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker)
        __must_hold(wq->lock)
{
        if (test_bit(IO_WORKER_F_FREE, &worker->flags))
                return;

        set_bit(IO_WORKER_F_FREE, &worker->flags);
        hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list);
}

/*
 * Extract the hash stored in the bits of @work->flags above
 * IO_WQ_HASH_SHIFT. The value is whatever those bits hold; this helper
 * does not check whether the work is actually hashed.
 */
static inline unsigned int io_get_work_hash(struct io_wq_work *work)
{
        unsigned int hash = work->flags >> IO_WQ_HASH_SHIFT;

        return hash;
}

/*
 * Register @wq on the shared hash waitqueue so that it gets woken when a
 * hash bit is released. Nothing is done if the wait entry is already
 * queued.
 *
 * Returns true if bit @hash was found clear right after queueing; in that
 * case the entry is removed again and the task state is set back to
 * TASK_RUNNING. Returns false otherwise.
 */
static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash)
{
        bool ret = false;

        spin_lock_irq(&wq->hash->wait.lock);
        if (list_empty(&wq->wait.entry)) {
                __add_wait_queue(&wq->hash->wait, &wq->wait);
                /* re-check after queueing: the bit may have been cleared */
                if (!test_bit(hash, &wq->hash->map)) {
                        __set_current_state(TASK_RUNNING);
                        list_del_init(&wq->wait.entry);
                        ret = true;
                }
        }
        spin_unlock_irq(&wq->hash->wait.lock);
        return ret;
}

/*
 * Pick the next work item @worker may run from @acct's list.
 *
 * Unhashed work is removed and returned right away. For hashed work the
 * hash bit is claimed first; on success the whole run of items sharing
 * that hash is cut from the list and its first item returned.
 *
 * Returns NULL if nothing can be run. If the walk was blocked by a hash
 * that is already in use, the account is marked stalled and acct->lock is
 * temporarily dropped and re-taken while registering on the hash
 * waitqueue.
 */
static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct,
                                           struct io_worker *worker)
        __must_hold(acct->lock)
{
        struct io_wq_work_node *node, *prev;
        struct io_wq_work *work, *tail;
        /* -1U: no busy hash encountered so far */
        unsigned int stall_hash = -1U;
        struct io_wq *wq = worker->wq;

        wq_list_for_each(node, prev, &acct->work_list) {
                unsigned int hash;

                work = container_of(node, struct io_wq_work, list);

                /* not hashed, can run anytime */
                if (!io_wq_is_hashed(work)) {
                        wq_list_del(&acct->work_list, node, prev);
                        return work;
                }

                hash = io_get_work_hash(work);
                /* all items with this hash lie in [work, tail] */
                tail = wq->hash_tail[hash];

                /* hashed, can run if not already running */
                if (!test_and_set_bit(hash, &wq->hash->map)) {
                        wq->hash_tail[hash] = NULL;
                        wq_list_cut(&acct->work_list, &tail->list, prev);
                        return work;
                }
                /* remember only the first hash we got blocked on */
                if (stall_hash == -1U)
                        stall_hash = hash;
                /* fast forward to a next hash, for-each will fix up @prev */
                node = &tail->list;
        }

        if (stall_hash != -1U) {
                bool unstalled;

                /*
                 * Set this before dropping the lock to avoid racing with new
                 * work being added and clearing the stalled bit.
                 */
                set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
                raw_spin_unlock(&acct->lock);
                unstalled = io_wait_on_hash(wq, stall_hash);
                raw_spin_lock(&acct->lock);
                /* the hash got released meanwhile, undo the stall marking */
                if (unstalled) {
                        clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
                        if (wq_has_sleeper(&wq->hash->wait))
                                wake_up(&wq->hash->wait);
                }
        }

        return NULL;
}

static void io_assign_current_work(struct io_worker *worker,
                                   struct io_wq_work *work)
{
        if (work) {
                io_run_task_work();
                cond_resched();
        }

        raw_spin_lock(&worker->lock);
        worker->cur_work = work;
        raw_spin_unlock(&worker->lock);
}

/*
 * Run queued work for @acct on @worker until no runnable work is left.
 * Each item picked from the list is run together with the rest of its
 * hash chain and any unhashed work linked to it.
 *
 * Called with acct->lock held, drops it before returning
 */
static void io_worker_handle_work(struct io_wq_acct *acct,
                                  struct io_worker *worker)
        __releases(&acct->lock)
{
        struct io_wq *wq = worker->wq;
        /* sampled once on entry; see the IO_WQ_WORK_CANCEL marking below */
        bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);

        do {
                struct io_wq_work *work;

                /*
                 * If we got some work, mark us as busy. If we didn't, but
                 * the list isn't empty, it means we stalled on hashed work.
                 * Mark us stalled so we don't keep looking for work when we
                 * can't make progress, any work completion or insertion will
                 * clear the stalled flag.
                 */
                work = io_get_next_work(acct, worker);
                if (work) {
                        /*
                         * Make sure cancelation can find this, even before
                         * it becomes the active work. That avoids a window
                         * where the work has been removed from our general
                         * work list, but isn't yet discoverable as the
                         * current work item for this worker.
                         */
                        raw_spin_lock(&worker->lock);
                        worker->cur_work = work;
                        raw_spin_unlock(&worker->lock);
                }

                raw_spin_unlock(&acct->lock);

                if (!work)
                        break;

                __io_worker_busy(wq, worker);

                io_assign_current_work(worker, work);
                __set_current_state(TASK_RUNNING);

                /* handle a whole dependent link */
                do {
                        struct io_wq_work *next_hashed, *linked;
                        /*
                         * Only hashed work owns a bit in wq->hash->map. For
                         * unhashed work the upper flag bits are not a hash,
                         * so use -1U to make the release below a no-op
                         * instead of clearing a bit we never took.
                         */
                        unsigned int hash = io_wq_is_hashed(work) ?
                                        io_get_work_hash(work) : -1U;

                        next_hashed = wq_next_work(work);

                        /* wq is exiting: unbound work is run as canceled */
                        if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND))
                                work->flags |= IO_WQ_WORK_CANCEL;
                        wq->do_work(work);
                        io_assign_current_work(worker, NULL);

                        linked = wq->free_work(work);
                        work = next_hashed;
                        /* chain done: run unhashed linked work directly */
                        if (!work && linked && !io_wq_is_hashed(linked)) {
                                work = linked;
                                linked = NULL;
                        }
                        io_assign_current_work(worker, work);
                        /* everything else goes through the regular queue */
                        if (linked)
                                io_wq_enqueue(wq, linked);

                        if (hash != -1U && !next_hashed) {
                                /* serialize hash clear with wake_up() */
                                spin_lock_irq(&wq->hash->wait.lock);
                                clear_bit(hash, &wq->hash->map);
                                clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
                                spin_unlock_irq(&wq->hash->wait.lock);
                                if (wq_has_sleeper(&wq->hash->wait))
                                        wake_up(&wq->hash->wait);
                        }
                } while (work);

                /* unlocked peek; retake the lock only if there is more */
                if (!__io_acct_run_queue(acct))
                        break;
                raw_spin_lock(&acct->lock);
        } while (1);
}

/*
 * Thread function of an io-wq worker; @data is the struct io_worker.
 *
 * Loops until IO_WQ_BIT_EXIT is set on the wq: runs all runnable work of
 * its account, then parks itself on the free list and sleeps for up to
 * WORKER_IDLE_TIMEOUT. A worker whose sleep timed out exits on the next
 * iteration unless it is the last worker of the account and still runs on
 * a CPU inside wq->cpu_mask. Ends in io_worker_exit(), which does not
 * return.
 */
static int io_wq_worker(void *data)
{
        struct io_worker *worker = data;
        struct io_wq_acct *acct = io_wq_get_acct(worker);
        struct io_wq *wq = worker->wq;
        bool exit_mask = false, last_timeout = false;
        char buf[TASK_COMM_LEN];

        set_mask_bits(&worker->flags, 0,
                      BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING));

        /* name the thread after the pid of the task owning the wq */
        snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
        set_task_comm(current, buf);

        while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
                long ret;

                set_current_state(TASK_INTERRUPTIBLE);

                /*
                 * If we have work to do, io_acct_run_queue() returns with
                 * the acct->lock held. If not, it will drop it.
                 */
                while (io_acct_run_queue(acct))
                        io_worker_handle_work(acct, worker);

                raw_spin_lock(&wq->lock);
                /*
                 * Last sleep timed out. Exit if we're not the last worker,
                 * or if someone modified our affinity.
                 */
                if (last_timeout && (exit_mask || acct->nr_workers > 1)) {
                        acct->nr_workers--;
                        raw_spin_unlock(&wq->lock);
                        __set_current_state(TASK_RUNNING);
                        break;
                }
                last_timeout = false;
                __io_worker_idle(wq, worker);
                raw_spin_unlock(&wq->lock);
                /* task_work was run, look for new work before sleeping */
                if (io_run_task_work())
                        continue;
                ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
                if (signal_pending(current)) {
                        struct ksignal ksig;

                        /* leave the loop only if get_signal() returns non-zero */
                        if (!get_signal(&ksig))
                                continue;
                        break;
                }
                /* a zero return means the full timeout elapsed */
                if (!ret) {
                        last_timeout = true;
                        exit_mask = !cpumask_test_cpu(raw_smp_processor_id(),
                                                        wq->cpu_mask);
                }
        }

        /* wq is exiting: make one more pass over still-queued work */
        if (test_bit(IO_WQ_BIT_EXIT, &wq->state) && io_acct_run_queue(acct))
                io_worker_handle_work(acct, worker);

        io_worker_exit(worker);
        return 0;
}

/*
 * Called when a worker is scheduled in. If @tsk is an io-wq worker that is
 * up and not yet flagged as running, flag it and count it as running.
 */
void io_wq_worker_running(struct task_struct *tsk)
{
        struct io_worker *worker = tsk->worker_private;

        if (!worker ||
            !test_bit(IO_WORKER_F_UP, &worker->flags) ||
            test_bit(IO_WORKER_F_RUNNING, &worker->flags))
                return;

        set_bit(IO_WORKER_F_RUNNING, &worker->flags);
        io_wq_inc_running(worker);
}

/*
 * Called when a worker is going to sleep. If @tsk is an io-wq worker that
 * is up and flagged as running, clear the flag and drop its running count;
 * io_wq_dec_running() then requests a replacement worker if this was the
 * last running one and work is pending.
 */
void io_wq_worker_sleeping(struct task_struct *tsk)
{
        struct io_worker *worker = tsk->worker_private;

        if (!worker ||
            !test_bit(IO_WORKER_F_UP, &worker->flags) ||
            !test_bit(IO_WORKER_F_RUNNING, &worker->flags))
                return;

        clear_bit(IO_WORKER_F_RUNNING, &worker->flags);
        io_wq_dec_running(worker);
}

/*
 * Finish setting up a freshly created worker thread @tsk for @worker:
 * link task and worker to each other, apply the wq CPU mask, add the
 * worker to the free list and the list of all workers (marked free), and
 * only then let the new task start running.
 */
static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker,
                               struct task_struct *tsk)
{
        tsk->worker_private = worker;
        worker->task = tsk;
        set_cpus_allowed_ptr(tsk, wq->cpu_mask);

        /* both lists are modified under wq->lock */
        raw_spin_lock(&wq->lock);
        hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list);
        list_add_tail_rcu(&worker->all_list, &wq->all_list);
        set_bit(IO_WORKER_F_FREE, &worker->flags);
        raw_spin_unlock(&wq->lock);
        wake_up_new_task(tsk);
}

/* Match predicate that accepts every work item; @work and @data are unused. */
static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
{
        return true;
}

/*
 * Decide whether a failed worker thread creation should be retried.
 * @err is the negative errno from the failed attempt. Only -EAGAIN and the
 * -ERESTART* codes listed below are retried, and never while the current
 * task has a fatal signal pending.
 */
static inline bool io_should_retry_thread(long err)
{
        /*
         * Prevent perpetual task_work retry, if the task (or its group) is
         * exiting.
         */
        if (fatal_signal_pending(current))
                return false;

        return err == -EAGAIN ||
               err == -ERESTARTSYS ||
               err == -ERESTARTNOINTR ||
               err == -ERESTARTNOHAND;
}

/*
 * task_work callback that retries creating the thread for an already
 * allocated worker (queued via io_workqueue_create()).
 *
 * On success the worker is initialized and started. On a non-retryable
 * error the accounting is undone and the worker freed; if that leaves the
 * account without any worker, all of its pending work is canceled. On a
 * retryable error another attempt is scheduled through worker->work.
 */
static void create_worker_cont(struct callback_head *cb)
{
        struct io_worker *worker;
        struct task_struct *tsk;
        struct io_wq *wq;

        worker = container_of(cb, struct io_worker, create_work);
        /* create_work may be queued again from here on */
        clear_bit_unlock(0, &worker->create_state);
        wq = worker->wq;
        tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
        if (!IS_ERR(tsk)) {
                io_init_new_worker(wq, worker, tsk);
                io_worker_release(worker);
                return;
        } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
                struct io_wq_acct *acct = io_wq_get_acct(worker);

                atomic_dec(&acct->nr_running);
                raw_spin_lock(&wq->lock);
                acct->nr_workers--;
                /* no worker left for this account to run its pending work */
                if (!acct->nr_workers) {
                        struct io_cb_cancel_data match = {
                                .fn                = io_wq_work_match_all,
                                .cancel_all        = true,
                        };

                        raw_spin_unlock(&wq->lock);
                        /* each call cancels at most one item, so loop */
                        while (io_acct_cancel_pending_work(wq, acct, &match))
                                ;
                } else {
                        raw_spin_unlock(&wq->lock);
                }
                io_worker_ref_put(wq);
                kfree(worker);
                return;
        }

        /* re-create attempts grab a new worker ref, drop the existing one */
        io_worker_release(worker);
        schedule_work(&worker->work);
}

/*
 * Workqueue handler used for retrying worker thread creation: queues
 * create_worker_cont() as task_work for the worker embedding @work. If
 * that cannot be queued, the worker is freed.
 */
static void io_workqueue_create(struct work_struct *work)
{
        struct io_worker *worker = container_of(work, struct io_worker, work);

        if (io_queue_worker_create(worker, io_wq_get_acct(worker),
                                   create_worker_cont))
                return;

        kfree(worker);
}

/*
 * Allocate a worker for account @index of @wq and create its thread.
 *
 * Returns true if the thread was created or a retry has been scheduled.
 * Returns false if allocation failed or thread creation failed with a
 * non-retryable error; in that case one nr_running count, one nr_workers
 * count and one wq->worker_refs reference are dropped.
 */
static bool create_io_worker(struct io_wq *wq, int index)
{
        struct io_wq_acct *acct = &wq->acct[index];
        struct io_worker *worker;
        struct task_struct *tsk;

        __set_current_state(TASK_RUNNING);

        worker = kzalloc(sizeof(*worker), GFP_KERNEL);
        if (!worker) {
                /* shared failure path, also reached via goto from below */
fail:
                atomic_dec(&acct->nr_running);
                raw_spin_lock(&wq->lock);
                acct->nr_workers--;
                raw_spin_unlock(&wq->lock);
                io_worker_ref_put(wq);
                return false;
        }

        /* initial reference, dropped by the worker in io_worker_exit() */
        refcount_set(&worker->ref, 1);
        worker->wq = wq;
        raw_spin_lock_init(&worker->lock);
        init_completion(&worker->ref_done);

        if (index == IO_WQ_ACCT_BOUND)
                set_bit(IO_WORKER_F_BOUND, &worker->flags);

        tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
        if (!IS_ERR(tsk)) {
                io_init_new_worker(wq, worker, tsk);
        } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
                kfree(worker);
                goto fail;
        } else {
                /* retryable error: try again later from a workqueue */
                INIT_WORK(&worker->work, io_workqueue_create);
                schedule_work(&worker->work);
        }

        return true;
}

/*
 * Walk wq->all_list and call @func(worker, @data) for every worker that can
 * still be referenced and has a task. The walk stops as soon as @func
 * returns true. Returns the result of the last @func call, or false if it
 * was never called.
 */
static bool io_wq_for_each_worker(struct io_wq *wq,
                                  bool (*func)(struct io_worker *, void *),
                                  void *data)
{
        struct io_worker *cur;
        bool stop = false;

        list_for_each_entry_rcu(cur, &wq->all_list, all_list) {
                /* worker is exiting, skip it */
                if (!io_worker_get(cur))
                        continue;

                /* no task if node is/was offline */
                if (cur->task)
                        stop = func(cur, data);
                io_worker_release(cur);
                if (stop)
                        break;
        }

        return stop;
}

/*
 * io_wq_for_each_worker() callback: set the notify signal on the worker's
 * task and wake it up. Always returns false so the walk continues.
 */
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
        struct task_struct *tsk = worker->task;

        __set_notify_signal(tsk);
        wake_up_process(tsk);
        return false;
}

/*
 * Run @work, and every further item returned by ->free_work() for it, with
 * IO_WQ_WORK_CANCEL set in its flags.
 */
static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
{
        for (;;) {
                work->flags |= IO_WQ_WORK_CANCEL;
                wq->do_work(work);
                work = wq->free_work(work);
                if (!work)
                        break;
        }
}

/*
 * Add @work to the pending list of its account. Unhashed work goes to the
 * tail of the list. Hashed work becomes the new tail for its hash and is
 * inserted right after the previous tail for that hash, or appended to the
 * list if there was none.
 */
static void io_wq_insert_work(struct io_wq *wq, struct io_wq_work *work)
{
        struct io_wq_acct *acct = io_work_get_acct(wq, work);

        if (io_wq_is_hashed(work)) {
                unsigned int hash = io_get_work_hash(work);
                struct io_wq_work *old_tail = wq->hash_tail[hash];

                wq->hash_tail[hash] = work;
                if (old_tail) {
                        wq_list_add_after(&work->list, &old_tail->list,
                                          &acct->work_list);
                        return;
                }
        }

        wq_list_add_tail(&work->list, &acct->work_list);
}

/* Match predicate: true only for the one work item that @data points to. */
static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
{
        return (void *)work == data;
}

/*
 * Queue @work on @wq for execution by a worker.
 *
 * The work is run in canceled state right away if the wq is exiting or the
 * work is already marked IO_WQ_WORK_CANCEL. Otherwise it is added to the
 * pending list of its account and an idle worker is woken; if none is
 * available a new worker may be created. Should creating the very first
 * worker of the account fail, the work is canceled again.
 */
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
{
        struct io_wq_acct *acct = io_work_get_acct(wq, work);
        /* snapshot taken before the work is queued and can be picked up */
        unsigned long work_flags = work->flags;
        struct io_cb_cancel_data match = {
                .fn                = io_wq_work_match_item,
                .data                = work,
                .cancel_all        = false,
        };
        bool do_create;

        /*
         * If io-wq is exiting for this task, or if the request has explicitly
         * been marked as one that should not get executed, cancel it here.
         */
        if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
            (work->flags & IO_WQ_WORK_CANCEL)) {
                io_run_cancel(work, wq);
                return;
        }

        raw_spin_lock(&acct->lock);
        io_wq_insert_work(wq, work);
        /* new work was inserted, the account may make progress again */
        clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
        raw_spin_unlock(&acct->lock);

        rcu_read_lock();
        do_create = !io_wq_activate_free_worker(wq, acct);
        rcu_read_unlock();

        /*
         * No idle worker was woken: create one if the work is marked
         * concurrent or no worker of this account is currently running.
         */
        if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
            !atomic_read(&acct->nr_running))) {
                bool did_create;

                did_create = io_wq_create_worker(wq, acct);
                if (likely(did_create))
                        return;

                /* creation failed, but existing workers can pick this up */
                raw_spin_lock(&wq->lock);
                if (acct->nr_workers) {
                        raw_spin_unlock(&wq->lock);
                        return;
                }
                raw_spin_unlock(&wq->lock);

                /* fatal condition, failed to create the first worker */
                io_acct_cancel_pending_work(wq, acct, &match);
        }
}

/*
 * Mark @work as hashed, with the hash derived from the pointer @val.
 * Work items that hash to the same value will not be done in parallel.
 * Used to limit concurrent writes, generally hashed by inode.
 */
void io_wq_hash_work(struct io_wq_work *work, void *val)
{
        unsigned int bucket = hash_ptr(val, IO_WQ_HASH_ORDER);
        unsigned int hash_flags;

        /* the hash value lives in the flag bits above IO_WQ_HASH_SHIFT */
        hash_flags = IO_WQ_WORK_HASHED | (bucket << IO_WQ_HASH_SHIFT);
        work->flags |= hash_flags;
}

/*
 * If @work is non-NULL and accepted by @match, flag it as canceled and set
 * the notify signal on @worker's task. Returns true if the work matched.
 */
static bool __io_wq_worker_cancel(struct io_worker *worker,
                                  struct io_cb_cancel_data *match,
                                  struct io_wq_work *work)
{
        if (!work || !match->fn(work, match->data))
                return false;

        work->flags |= IO_WQ_WORK_CANCEL;
        __set_notify_signal(worker->task);
        return true;
}

static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
        struct io_cb_cancel_data *match = data;

        /*
         * Hold the lock to avoid ->cur_work going out of scope, caller
         * may dereference the passed in work.
         */
        raw_spin_lock(&worker->lock);
        if (__io_wq_worker_cancel(worker, match, worker->cur_work))
                match->nr_running++;
        raw_spin_unlock(&worker->lock);

        return match->nr_running && !match->cancel_all;
}

/*
 * Unlink pending @work from the work list of its account; @prev is the
 * list node preceding it (NULL if there is none). If @work was the
 * recorded tail of its hash, the tail moves to the previous item when that
 * one carries the same hash, and is cleared otherwise.
 */
static inline void io_wq_remove_pending(struct io_wq *wq,
                                         struct io_wq_work *work,
                                         struct io_wq_work_node *prev)
{
        struct io_wq_acct *acct = io_work_get_acct(wq, work);
        unsigned int hash = io_get_work_hash(work);

        if (io_wq_is_hashed(work) && work == wq->hash_tail[hash]) {
                struct io_wq_work *new_tail = NULL;

                if (prev) {
                        struct io_wq_work *before;

                        before = container_of(prev, struct io_wq_work, list);
                        if (before && io_get_work_hash(before) == hash)
                                new_tail = before;
                }
                wq->hash_tail[hash] = new_tail;
        }

        wq_list_del(&acct->work_list, &work->list, prev);
}

/*
 * Cancel the first pending work item of @acct accepted by @match: remove
 * it from the list, run it in canceled state with acct->lock dropped, and
 * count it in match->nr_pending.
 *
 * Returns true if an item was canceled, false if nothing matched. At most
 * one item is handled per call; callers loop to cancel more.
 */
static bool io_acct_cancel_pending_work(struct io_wq *wq,
                                        struct io_wq_acct *acct,
                                        struct io_cb_cancel_data *match)
{
        struct io_wq_work_node *node, *prev;
        struct io_wq_work *work;

        raw_spin_lock(&acct->lock);
        wq_list_for_each(node, prev, &acct->work_list) {
                work = container_of(node, struct io_wq_work, list);
                if (!match->fn(work, match->data))
                        continue;
                io_wq_remove_pending(wq, work, prev);
                /* the cancelation itself runs without acct->lock held */
                raw_spin_unlock(&acct->lock);
                io_run_cancel(work, wq);
                match->nr_pending++;
                /* not safe to continue after unlock */
                return true;
        }
        raw_spin_unlock(&acct->lock);

        return false;
}

/*
 * Cancel pending work accepted by @match across the accounts of @wq,
 * starting with the bound one. Without ->cancel_all this stops after the
 * first canceled item; with it, the scan restarts from the first account
 * after every cancelation until no account has a match left.
 */
static void io_wq_cancel_pending_work(struct io_wq *wq,
                                      struct io_cb_cancel_data *match)
{
        int idx = 0;

        while (idx < IO_WQ_ACCT_NR) {
                struct io_wq_acct *acct = io_get_acct(wq, idx == 0);

                if (!io_acct_cancel_pending_work(wq, acct, match)) {
                        idx++;
                        continue;
                }
                if (!match->cancel_all)
                        break;
                /* something was canceled, rescan from the start */
                idx = 0;
        }
}

/*
 * Try to cancel work accepted by @match that workers of @wq are currently
 * running. The worker list is walked inside an RCU read-side section;
 * hits are counted in match->nr_running by io_wq_worker_cancel().
 */
static void io_wq_cancel_running_work(struct io_wq *wq,
                                       struct io_cb_cancel_data *match)
{
        rcu_read_lock();
        io_wq_for_each_worker(wq, io_wq_worker_cancel, match);
        rcu_read_unlock();
}

/*
 * Cancel work on @wq for which @cancel(work, @data) returns true. With
 * @cancel_all false the search stops at the first match, otherwise every
 * matching item is canceled.
 *
 * Returns IO_WQ_CANCEL_RUNNING if a matching item was found running on a
 * worker, IO_WQ_CANCEL_OK if matches were only found pending, and
 * IO_WQ_CANCEL_NOTFOUND if nothing matched.
 */
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
                                  void *data, bool cancel_all)
{
        struct io_cb_cancel_data match = {
                .fn             = cancel,
                .data           = data,
                .cancel_all     = cancel_all,
        };

        /*
         * First check pending list, if we're lucky we can just remove it
         * from there. CANCEL_OK means that the work is returned as-new,
         * no completion will be posted for it.
         *
         * Then check if a free (going busy) or busy worker has the work
         * currently running. If we find it there, we'll return CANCEL_RUNNING
         * as an indication that we attempt to signal cancellation. The
         * completion will run normally in this case.
         *
         * The pending scan takes each acct->lock itself (see
         * io_acct_cancel_pending_work()); only the running scan is done
         * while holding wq->lock.
         */
        io_wq_cancel_pending_work(wq, &match);
        if (match.nr_pending && !match.cancel_all)
                return IO_WQ_CANCEL_OK;

        raw_spin_lock(&wq->lock);
        io_wq_cancel_running_work(wq, &match);
        raw_spin_unlock(&wq->lock);

        /* a running match takes precedence over pending ones */
        if (match.nr_running)
                return IO_WQ_CANCEL_RUNNING;
        if (match.nr_pending)
                return IO_WQ_CANCEL_OK;
        return IO_WQ_CANCEL_NOTFOUND;
}

/*
 * Wait queue callback installed as wq->wait.func. Takes the entry off its
 * wait list, then for every accounting class whose IO_ACCT_STALLED_BIT was
 * set, clears the bit and activates a free worker. Always returns 1.
 */
static int io_wq_hash_wake(struct wait_queue_entry *wait, unsigned mode,
                            int sync, void *key)
{
        struct io_wq *wq = container_of(wait, struct io_wq, wait);
        int idx;

        list_del_init(&wait->entry);

        rcu_read_lock();
        for (idx = 0; idx < IO_WQ_ACCT_NR; idx++) {
                struct io_wq_acct *acct = &wq->acct[idx];

                /* Only classes that were marked stalled need a kick */
                if (!test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
                        continue;
                io_wq_activate_free_worker(wq, acct);
        }
        rcu_read_unlock();

        return 1;
}

struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
        int ret, i;
        struct io_wq *wq;

        if (WARN_ON_ONCE(!data->free_work || !data->do_work))
                return ERR_PTR(-EINVAL);
        if (WARN_ON_ONCE(!bounded))
                return ERR_PTR(-EINVAL);

        wq = kzalloc(sizeof(struct io_wq), GFP_KERNEL);
        if (!wq)
                return ERR_PTR(-ENOMEM);

        refcount_inc(&data->hash->refs);
        wq->hash = data->hash;
        wq->free_work = data->free_work;
        wq->do_work = data->do_work;

        ret = -ENOMEM;

        if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL))
                goto err;
        cpumask_copy(wq->cpu_mask, cpu_possible_mask);
        wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
        wq->acct[IO_WQ_ACCT_UNBOUND].max_workers =
                                task_rlimit(current, RLIMIT_NPROC);
        INIT_LIST_HEAD(&wq->wait.entry);
        wq->wait.func = io_wq_hash_wake;
        for (i = 0; i < IO_WQ_ACCT_NR; i++) {
                struct io_wq_acct *acct = &wq->acct[i];

                acct->index = i;
                atomic_set(&acct->nr_running, 0);
                INIT_WQ_LIST(&acct->work_list);
                raw_spin_lock_init(&acct->lock);
        }

        raw_spin_lock_init(&wq->lock);
        INIT_HLIST_NULLS_HEAD(&wq->free_list, 0);
        INIT_LIST_HEAD(&wq->all_list);

        wq->task = get_task_struct(data->task);
        atomic_set(&wq->worker_refs, 1);
        init_completion(&wq->worker_done);
        ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
        if (ret)
                goto err;

        return wq;
err:
        io_wq_put_hash(data->hash);
        free_cpumask_var(wq->cpu_mask);
        kfree(wq);
        return ERR_PTR(ret);
}

/*
 * Match callback for task_work_cancel_match(): accept @cb only if it is one
 * of the worker creation callbacks (create_worker_cb or create_worker_cont)
 * and the io_worker embedding it belongs to the io_wq passed in @data.
 */
static bool io_task_work_match(struct callback_head *cb, void *data)
{
        if (cb->func == create_worker_cb || cb->func == create_worker_cont) {
                struct io_worker *worker;

                worker = container_of(cb, struct io_worker, create_work);
                return worker->wq == data;
        }

        return false;
}

/*
 * Flag @wq as exiting by setting IO_WQ_BIT_EXIT in wq->state.
 * io_wq_put_and_exit() warns if this bit has not been set beforehand.
 */
void io_wq_exit_start(struct io_wq *wq)
{
        set_bit(IO_WQ_BIT_EXIT, &wq->state);
}

/*
 * Cancel every worker creation task_work still queued on wq->task for @wq,
 * running io_worker_cancel_cb() on the worker that embeds each cancelled
 * callback.
 */
static void io_wq_cancel_tw_create(struct io_wq *wq)
{
        for (;;) {
                struct callback_head *cb;
                struct io_worker *worker;

                cb = task_work_cancel_match(wq->task, io_task_work_match, wq);
                if (cb == NULL)
                        break;

                worker = container_of(cb, struct io_worker, create_work);
                io_worker_cancel_cb(worker);
                /*
                 * The worker is only allocated (and therefore only needs to
                 * be freed) when the callback is the continuation helper.
                 */
                if (cb->func == create_worker_cont)
                        kfree(worker);
        }
}

/*
 * Shut down the workers of @wq. A no-op when wq->task is already NULL.
 *
 * Queued worker-creation task_work is cancelled first, then the workers are
 * visited with io_wq_worker_wake. After that a worker reference is put
 * (presumably the initial worker_refs of 1 set in io_wq_create() - confirm
 * against io_worker_ref_put()) and the function blocks until worker_done is
 * completed. Finally the wq's wait entry is unlinked under the hash wait
 * lock and the task reference is released.
 */
static void io_wq_exit_workers(struct io_wq *wq)
{
        if (!wq->task)
                return;

        io_wq_cancel_tw_create(wq);

        rcu_read_lock();
        io_wq_for_each_worker(wq, io_wq_worker_wake, NULL);
        rcu_read_unlock();
        io_worker_ref_put(wq);
        wait_for_completion(&wq->worker_done);

        spin_lock_irq(&wq->hash->wait.lock);
        list_del_init(&wq->wait.entry);
        spin_unlock_irq(&wq->hash->wait.lock);

        put_task_struct(wq->task);
        /* Clearing ->task makes a repeated call return early */
        wq->task = NULL;
}

/*
 * Final teardown of @wq: remove its CPU hotplug instance, run a cancel_all
 * pass over the pending work with io_wq_work_match_all as the matcher, then
 * free the cpumask, put the hash and free the io_wq itself.
 */
static void io_wq_destroy(struct io_wq *wq)
{
        struct io_cb_cancel_data match = {
                .fn                = io_wq_work_match_all,
                .cancel_all        = true,
        };

        cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
        io_wq_cancel_pending_work(wq, &match);
        free_cpumask_var(wq->cpu_mask);
        io_wq_put_hash(wq->hash);
        kfree(wq);
}

/*
 * Stop the workers of @wq and free it. Warns once if IO_WQ_BIT_EXIT has not
 * been set in wq->state before this is called.
 */
void io_wq_put_and_exit(struct io_wq *wq)
{
        WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state));

        io_wq_exit_workers(wq);
        io_wq_destroy(wq);
}

/* Argument bundle handed to io_wq_worker_affinity() for each worker */
struct online_data {
        /* CPU whose bit is set or cleared in the cpu mask */
        unsigned int cpu;
        /* true: set the bit for @cpu, false: clear it */
        bool online;
};

static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
{
        struct online_data *od = data;

        if (od->online)
                cpumask_set_cpu(od->cpu, worker->wq->cpu_mask);
        else
                cpumask_clear_cpu(od->cpu, worker->wq->cpu_mask);
        return false;
}

/*
 * Common helper for the CPU hotplug callbacks: run io_wq_worker_affinity
 * over the workers of @wq for @cpu, inside an RCU read-side critical
 * section. @online selects whether the CPU is added or removed. Returns 0.
 */
static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online)
{
        struct online_data od;

        od.cpu = cpu;
        od.online = online;

        rcu_read_lock();
        io_wq_for_each_worker(wq, io_wq_worker_affinity, &od);
        rcu_read_unlock();

        return 0;
}

/*
 * Hotplug callback passed to cpuhp_setup_state_multi() in io_wq_init().
 * Resolves @node to the io_wq embedding it and records @cpu as online.
 */
static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
{
        struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);

        return __io_wq_cpu_online(wq, cpu, true);
}

/*
 * Hotplug callback passed to cpuhp_setup_state_multi() in io_wq_init().
 * Resolves @node to the io_wq embedding it and records @cpu as offline.
 */
static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
        struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);

        return __io_wq_cpu_online(wq, cpu, false);
}

/*
 * Replace the cpu mask of the io_wq attached to @tctx. A NULL @mask resets
 * it to cpu_possible_mask.
 *
 * Returns 0 on success, or -EINVAL if @tctx is NULL or has no io_wq.
 */
int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask)
{
        if (!tctx)
                return -EINVAL;
        if (!tctx->io_wq)
                return -EINVAL;

        rcu_read_lock();
        if (!mask)
                cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask);
        else
                cpumask_copy(tctx->io_wq->cpu_mask, mask);
        rcu_read_unlock();

        return 0;
}

/*
 * Update the worker limits of @wq. @new_count holds one entry per accounting
 * class (IO_WQ_ACCT_NR entries in total). Each entry is first capped at the
 * caller's RLIMIT_NPROC; a non-zero entry then becomes the new limit of its
 * class, while a zero entry leaves that limit untouched.
 *
 * On return every entry of @new_count has been overwritten with the previous
 * limit of its class. Always returns 0.
 */
int io_wq_max_workers(struct io_wq *wq, int *new_count)
{
        int prev[IO_WQ_ACCT_NR];
        int i;

        BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND   != (int) IO_WQ_BOUND);
        BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
        BUILD_BUG_ON((int) IO_WQ_ACCT_NR      != 2);

        /* Cap the requested values before any lock is taken */
        for (i = 0; i < IO_WQ_ACCT_NR; i++) {
                if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
                        new_count[i] = task_rlimit(current, RLIMIT_NPROC);
        }

        rcu_read_lock();
        raw_spin_lock(&wq->lock);
        for (i = 0; i < IO_WQ_ACCT_NR; i++) {
                struct io_wq_acct *acct = &wq->acct[i];

                /* Old limit is reported with a floor of zero */
                prev[i] = max_t(int, acct->max_workers, 0);
                if (new_count[i] != 0)
                        acct->max_workers = new_count[i];
        }
        raw_spin_unlock(&wq->lock);
        rcu_read_unlock();

        /* Hand the previous limits back through the caller's array */
        for (i = 0; i < IO_WQ_ACCT_NR; i++)
                new_count[i] = prev[i];

        return 0;
}

/*
 * Register the "io-wq/online" multi-instance CPU hotplug state and remember
 * the returned state number in io_wq_online. Returns 0 on success or the
 * negative error from cpuhp_setup_state_multi().
 */
static __init int io_wq_init(void)
{
        int state;

        state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
                                        io_wq_cpu_online, io_wq_cpu_offline);
        if (state < 0)
                return state;

        io_wq_online = state;
        return 0;
}
subsys_initcall(io_wq_init);


























































































































    8 



   12 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
// SPDX-License-Identifier: GPL-2.0
/*
 *  gendisk handling
 *
 * Portions Copyright (C) 2020 Christoph Hellwig
 */

#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/fs.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/log2.h>
#include <linux/pm_runtime.h>
#include <linux/badblocks.h>
#include <linux/part_stat.h>
#include <linux/blktrace_api.h>

#include "blk-throttle.h"
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
#include "blk-cgroup.h"

static struct kobject *block_depr;

/*
 * Unique, monotonically increasing sequential number associated with block
 * devices instances (i.e. incremented each time a device is attached).
 * Associating uevents with block devices in userspace is difficult and racy:
 * the uevent netlink socket is lossy, and on slow and overloaded systems has
 * a very high latency.
 * Block devices do not have exclusive owners in userspace, any process can set
 * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0
 * can be reused again and again).
 * A userspace process setting up a block device and watching for its events
 * cannot thus reliably tell whether an event relates to the device it just set
 * up or another earlier instance with the same name.
 * This sequential number allows userspace processes to solve this problem, and
 * uniquely associate a uevent with the lifetime of a device.
 */
static atomic64_t diskseq;

/* for extended dynamic devt allocation, currently only one major is used */
#define NR_EXT_DEVT                (1 << MINORBITS)
static DEFINE_IDA(ext_devt_ida);

/*
 * Set the capacity of @disk to @sectors by updating the sector count of its
 * part0 block device.
 */
void set_capacity(struct gendisk *disk, sector_t sectors)
{
        bdev_set_nr_sectors(disk->part0, sectors);
}
EXPORT_SYMBOL(set_capacity);

/*
 * Update the capacity of @disk to @size and, where appropriate, tell
 * userspace about it. A RESIZE=1 change uevent is only sent when both the
 * old and the new capacity are non-zero. Returns true if a uevent was sent,
 * otherwise false.
 */
bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
{
        char *envp[] = { "RESIZE=1", NULL };
        sector_t capacity = get_capacity(disk);

        set_capacity(disk, size);

        /*
         * Logging and the uevent are reserved for gendisks that are alive
         * and visible to the user. That keeps the log and udev quiet while
         * the initial capacity is being set during probing.
         */
        if (size == capacity)
                return false;
        if (!disk_live(disk))
                return false;
        if (disk->flags & GENHD_FL_HIDDEN)
                return false;

        pr_info("%s: detected capacity change from %lld to %lld\n",
                disk->disk_name, capacity, size);

        /*
         * For historical reasons no uevent is sent when the device changes
         * to or from being empty.
         */
        if (capacity == 0 || size == 0)
                return false;

        kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
        return true;
}
EXPORT_SYMBOL_GPL(set_capacity_and_notify);

/*
 * Fill @stat with the sum of the per-CPU statistics of @part over all
 * possible CPUs. @stat is zeroed first.
 */
static void part_stat_read_all(struct block_device *part,
                struct disk_stats *stat)
{
        int cpu;

        memset(stat, 0, sizeof(*stat));

        for_each_possible_cpu(cpu) {
                struct disk_stats *pcpu = per_cpu_ptr(part->bd_stats, cpu);
                int sgrp;

                for (sgrp = 0; sgrp < NR_STAT_GROUPS; sgrp++) {
                        stat->ios[sgrp] += pcpu->ios[sgrp];
                        stat->merges[sgrp] += pcpu->merges[sgrp];
                        stat->sectors[sgrp] += pcpu->sectors[sgrp];
                        stat->nsecs[sgrp] += pcpu->nsecs[sgrp];
                }

                stat->io_ticks += pcpu->io_ticks;
        }
}

/*
 * Return the combined in_flight[0] + in_flight[1] count of @part, summed
 * over all possible CPUs. A sum that is negative when read as an int is
 * reported as 0.
 */
unsigned int part_in_flight(struct block_device *part)
{
        unsigned int total = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                total += part_stat_local_read_cpu(part, in_flight[0], cpu);
                total += part_stat_local_read_cpu(part, in_flight[1], cpu);
        }

        return (int)total < 0 ? 0 : total;
}

/*
 * Store the in_flight[0] and in_flight[1] counts of @part, each summed over
 * all possible CPUs, into @inflight[0] and @inflight[1]. A sum that is
 * negative when read as an int is stored as 0.
 */
static void part_in_flight_rw(struct block_device *part,
                unsigned int inflight[2])
{
        int cpu, rw;

        inflight[0] = 0;
        inflight[1] = 0;

        for_each_possible_cpu(cpu) {
                inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
                inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
        }

        for (rw = 0; rw < 2; rw++) {
                if ((int)inflight[rw] < 0)
                        inflight[rw] = 0;
        }
}

/*
 * Can be deleted altogether. Later.
 *
 * Hash table of registered block majors. Each bucket is a singly linked
 * chain of blk_major_name entries, indexed through major_to_index().
 */
#define BLKDEV_MAJOR_HASH_SIZE 255
static struct blk_major_name {
        /* next entry in the same hash bucket, NULL at the end of the chain */
        struct blk_major_name *next;
        /* registered major number */
        int major;
        /* name given at registration, truncated to fit */
        char name[16];
#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
        /* probe callback supplied to __register_blkdev() */
        void (*probe)(dev_t devt);
#endif
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
/* held across the whole of __register_blkdev() and unregister_blkdev() */
static DEFINE_MUTEX(major_names_lock);
/* taken around walks and updates of the bucket chains */
static DEFINE_SPINLOCK(major_names_spinlock);

/*
 * index in the above - for now: assume no multimajor ranges
 *
 * Maps @major to its bucket in major_names[] by reducing it modulo
 * BLKDEV_MAJOR_HASH_SIZE.
 */
static inline int major_to_index(unsigned major)
{
        return major % BLKDEV_MAJOR_HASH_SIZE;
}

#ifdef CONFIG_PROC_FS
/*
 * Print "<major> <name>" to @seqf for every registered entry whose major
 * equals @offset. The bucket chain is walked under major_names_spinlock.
 */
void blkdev_show(struct seq_file *seqf, off_t offset)
{
        struct blk_major_name *entry;

        spin_lock(&major_names_spinlock);
        entry = major_names[major_to_index(offset)];
        while (entry) {
                if (entry->major == offset)
                        seq_printf(seqf, "%3d %s\n", entry->major, entry->name);
                entry = entry->next;
        }
        spin_unlock(&major_names_spinlock);
}
#endif /* CONFIG_PROC_FS */

/**
 * __register_blkdev - register a new block device
 *
 * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
 *         @major = 0, try to allocate any unused major number.
 * @name: the name of the new block device as a zero terminated string
 * @probe: pre-devtmpfs / pre-udev callback used to create disks when their
 *           pre-created device node is accessed. When a probe call uses
 *           add_disk() and it fails the driver must cleanup resources. This
 *           interface may soon be removed.
 *
 * The @name must be unique within the system.
 *
 * What is returned depends on @major:
 *
 *  - for an explicit major in range [1..BLKDEV_MAJOR_MAX-1] the function
 *    returns zero on success, or a negative error code
 *  - for @major = 0 (any unused major) the function returns the allocated
 *    major number in range [1..BLKDEV_MAJOR_MAX-1], or a negative error code
 *
 * See Documentation/admin-guide/devices.txt for the list of allocated
 * major numbers.
 *
 * Use register_blkdev instead for any new code.
 */
int __register_blkdev(unsigned int major, const char *name,
                void (*probe)(dev_t devt))
{
        struct blk_major_name **slot, *entry;
        int index, ret = 0;

        mutex_lock(&major_names_lock);

        /* temporary */
        if (major == 0) {
                /* Search downwards for an empty bucket; bucket 0 is never used */
                index = ARRAY_SIZE(major_names) - 1;
                while (index > 0 && major_names[index] != NULL)
                        index--;

                if (index == 0) {
                        printk("%s: failed to get major for %s\n",
                               __func__, name);
                        ret = -EBUSY;
                        goto out;
                }
                major = index;
                ret = major;
        }

        if (major >= BLKDEV_MAJOR_MAX) {
                pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
                       __func__, major, BLKDEV_MAJOR_MAX-1, name);

                ret = -EINVAL;
                goto out;
        }

        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry) {
                ret = -ENOMEM;
                goto out;
        }

        entry->next = NULL;
        entry->major = major;
        strscpy(entry->name, name, sizeof(entry->name));
#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
        entry->probe = probe;
#endif
        index = major_to_index(major);

        /* Append to the bucket chain unless the major is already taken */
        spin_lock(&major_names_spinlock);
        slot = &major_names[index];
        while (*slot && (*slot)->major != major)
                slot = &(*slot)->next;
        if (*slot)
                ret = -EBUSY;
        else
                *slot = entry;
        spin_unlock(&major_names_spinlock);

        if (ret < 0) {
                printk("register_blkdev: cannot get major %u for %s\n",
                       major, name);
                kfree(entry);
        }
out:
        mutex_unlock(&major_names_lock);
        return ret;
}
EXPORT_SYMBOL(__register_blkdev);

/*
 * Remove the registration of @major. The entry is only unlinked and freed
 * when it exists and its stored name equals @name; otherwise a warning is
 * raised and nothing is removed.
 */
void unregister_blkdev(unsigned int major, const char *name)
{
        struct blk_major_name **slot;
        struct blk_major_name *victim = NULL;
        int index = major_to_index(major);

        mutex_lock(&major_names_lock);
        spin_lock(&major_names_spinlock);

        slot = &major_names[index];
        while (*slot && (*slot)->major != major)
                slot = &(*slot)->next;

        if (*slot && strcmp((*slot)->name, name) == 0) {
                victim = *slot;
                *slot = victim->next;
        } else {
                WARN_ON(1);
        }

        spin_unlock(&major_names_spinlock);
        mutex_unlock(&major_names_lock);

        /* Freed outside the locks; victim stays NULL if nothing matched */
        kfree(victim);
}

EXPORT_SYMBOL(unregister_blkdev);

/*
 * Allocate a minor number in [0, NR_EXT_DEVT - 1] from ext_devt_ida.
 * Returns the minor, or a negative error code; -ENOSPC from the IDA is
 * reported to the caller as -EBUSY.
 */
int blk_alloc_ext_minor(void)
{
        int minor = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT - 1,
                                    GFP_KERNEL);

        return minor == -ENOSPC ? -EBUSY : minor;
}

/* Give @minor back to ext_devt_ida, the IDA used by blk_alloc_ext_minor(). */
void blk_free_ext_minor(unsigned int minor)
{
        ida_free(&ext_devt_ida, minor);
}

/*
 * Send the uevent @action for the block devices in @disk's partition table.
 * Partitions with zero sectors are skipped, as are devices whose kobject
 * reference count has already dropped to zero.
 */
void disk_uevent(struct gendisk *disk, enum kobject_action action)
{
        struct block_device *part;
        unsigned long idx;

        rcu_read_lock();
        xa_for_each(&disk->part_tbl, idx, part) {
                if (bdev_is_partition(part) && !bdev_nr_sectors(part))
                        continue;
                if (!kobject_get_unless_zero(&part->bd_device.kobj))
                        continue;

                /*
                 * The reference taken above keeps the device around while
                 * the RCU read lock is dropped for the uevent call.
                 */
                rcu_read_unlock();
                kobject_uevent(bdev_kobj(part), action);
                put_device(&part->bd_device);
                rcu_read_lock();
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(disk_uevent);

/*
 * Request a partition scan of @disk by setting GD_NEED_PART_SCAN and then
 * opening the whole-disk device with @mode (minus BLK_OPEN_EXCL).
 *
 * Returns 0 on success, -EINVAL if disk_has_partscan() is false for @disk,
 * -EBUSY if partitions are currently open, or the error returned by
 * bd_prepare_to_claim() or by the open.
 */
int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
{
        struct file *file;
        int ret = 0;

        if (!disk_has_partscan(disk))
                return -EINVAL;
        if (disk->open_partitions)
                return -EBUSY;

        /*
         * If the device is opened exclusively by current thread already, it's
         * safe to scan partitions, otherwise, use bd_prepare_to_claim() to
         * synchronize with other exclusive openers and other partition
         * scanners.
         */
        if (!(mode & BLK_OPEN_EXCL)) {
                ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions,
                                          NULL);
                if (ret)
                        return ret;
        }

        set_bit(GD_NEED_PART_SCAN, &disk->state);
        file = bdev_file_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL,
                                     NULL, NULL);
        if (IS_ERR(file))
                ret = PTR_ERR(file);
        else
                fput(file);

        /*
         * If the open above failed early, GD_NEED_PART_SCAN is still set,
         * and this would cause re-assembling a partitioned raid device to
         * create partitions for the underlying disk.
         */
        clear_bit(GD_NEED_PART_SCAN, &disk->state);
        /* Release the claim taken above for non-exclusive callers */
        if (!(mode & BLK_OPEN_EXCL))
                bd_abort_claiming(disk->part0, disk_scan_partitions);
        return ret;
}

/**
 * device_add_disk - add disk information to kernel list
 * @parent: parent device for the disk
 * @disk: per-device partitioning information
 * @groups: Additional per-device sysfs groups
 *
 * This function registers the partitioning information in @disk
 * with the kernel.
 *
 * Return: 0 on success, or a negative error code. On failure every step
 * completed so far is undone before returning.
 */
int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
                                 const struct attribute_group **groups)

{
        struct device *ddev = disk_to_dev(disk);
        int ret;

        /* Only makes sense for bio-based to set ->poll_bio */
        if (queue_is_mq(disk->queue) && disk->fops->poll_bio)
                return -EINVAL;

        /*
         * The disk queue should now be all set with enough information about
         * the device for the elevator code to pick an adequate default
         * elevator if one is needed, that is, for devices requesting queue
         * registration.
         */
        elevator_init_mq(disk->queue);

        /* Mark bdev as having a submit_bio, if needed */
        if (disk->fops->submit_bio)
                bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO);

        /*
         * If the driver provides an explicit major number it also must provide
         * the number of minors numbers supported, and those will be used to
         * setup the gendisk.
         * Otherwise just allocate the device numbers for both the whole device
         * and all partitions from the extended dev_t space.
         */
        ret = -EINVAL;
        if (disk->major) {
                if (WARN_ON(!disk->minors))
                        goto out_exit_elevator;

                /* Too many minors is not fatal: complain and clamp */
                if (disk->minors > DISK_MAX_PARTS) {
                        pr_err("block: can't allocate more than %d partitions\n",
                                DISK_MAX_PARTS);
                        disk->minors = DISK_MAX_PARTS;
                }
                /* The whole minor range must fit below MINORMASK + 1 */
                if (disk->first_minor > MINORMASK ||
                    disk->minors > MINORMASK + 1 ||
                    disk->first_minor + disk->minors > MINORMASK + 1)
                        goto out_exit_elevator;
        } else {
                if (WARN_ON(disk->minors))
                        goto out_exit_elevator;

                ret = blk_alloc_ext_minor();
                if (ret < 0)
                        goto out_exit_elevator;
                disk->major = BLOCK_EXT_MAJOR;
                disk->first_minor = ret;
        }

        /* delay uevents, until we scanned partition table */
        dev_set_uevent_suppress(ddev, 1);

        ddev->parent = parent;
        ddev->groups = groups;
        dev_set_name(ddev, "%s", disk->disk_name);
        /* Hidden disks are added without a devt */
        if (!(disk->flags & GENHD_FL_HIDDEN))
                ddev->devt = MKDEV(disk->major, disk->first_minor);
        ret = device_add(ddev);
        if (ret)
                goto out_free_ext_minor;

        ret = disk_alloc_events(disk);
        if (ret)
                goto out_device_del;

        ret = sysfs_create_link(block_depr, &ddev->kobj,
                                kobject_name(&ddev->kobj));
        if (ret)
                goto out_device_del;

        /*
         * avoid probable deadlock caused by allocating memory with
         * GFP_KERNEL in runtime_resume callback of its all ancestor
         * devices
         */
        pm_runtime_set_memalloc_noio(ddev, true);

        disk->part0->bd_holder_dir =
                kobject_create_and_add("holders", &ddev->kobj);
        if (!disk->part0->bd_holder_dir) {
                ret = -ENOMEM;
                goto out_del_block_link;
        }
        disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
        if (!disk->slave_dir) {
                ret = -ENOMEM;
                goto out_put_holder_dir;
        }

        ret = blk_register_queue(disk);
        if (ret)
                goto out_put_slave_dir;

        if (!(disk->flags & GENHD_FL_HIDDEN)) {
                ret = bdi_register(disk->bdi, "%u:%u",
                                   disk->major, disk->first_minor);
                if (ret)
                        goto out_unregister_queue;
                bdi_set_owner(disk->bdi, ddev);
                ret = sysfs_create_link(&ddev->kobj,
                                        &disk->bdi->dev->kobj, "bdi");
                if (ret)
                        goto out_unregister_bdi;

                /* Make sure the first partition scan will proceed */
                if (get_capacity(disk) && disk_has_partscan(disk))
                        set_bit(GD_NEED_PART_SCAN, &disk->state);

                bdev_add(disk->part0, ddev->devt);
                /* The scan result is deliberately not treated as an error */
                if (get_capacity(disk))
                        disk_scan_partitions(disk, BLK_OPEN_READ);

                /*
                 * Announce the disk and partitions after all partitions are
                 * created. (for hidden disks uevents remain suppressed forever)
                 */
                dev_set_uevent_suppress(ddev, 0);
                disk_uevent(disk, KOBJ_ADD);
        } else {
                /*
                 * Even if the block_device for a hidden gendisk is not
                 * registered, it needs to have a valid bd_dev so that the
                 * freeing of the dynamic major works.
                 */
                disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor);
        }

        disk_update_readahead(disk);
        disk_add_events(disk);
        set_bit(GD_ADDED, &disk->state);
        return 0;

        /*
         * Error unwinding: each label undoes the setup steps that had
         * completed before the corresponding failure, in reverse order.
         */
out_unregister_bdi:
        if (!(disk->flags & GENHD_FL_HIDDEN))
                bdi_unregister(disk->bdi);
out_unregister_queue:
        blk_unregister_queue(disk);
        rq_qos_exit(disk->queue);
out_put_slave_dir:
        kobject_put(disk->slave_dir);
        disk->slave_dir = NULL;
out_put_holder_dir:
        kobject_put(disk->part0->bd_holder_dir);
out_del_block_link:
        sysfs_remove_link(block_depr, dev_name(ddev));
        pm_runtime_set_memalloc_noio(ddev, false);
out_device_del:
        device_del(ddev);
out_free_ext_minor:
        /* Only minors from the extended dev_t space were allocated here */
        if (disk->major == BLOCK_EXT_MAJOR)
                blk_free_ext_minor(disk->first_minor);
out_exit_elevator:
        if (disk->queue->elevator)
                elevator_exit(disk->queue);
        return ret;
}
EXPORT_SYMBOL(device_add_disk);

/*
 * Call bdev_mark_dead() on every block device found in @disk's partition
 * table, forwarding @surprise to each call.  Must be called without
 * disk->open_mutex held.
 */
static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
{
        struct block_device *bdev;
        unsigned long idx;

        /*
         * On surprise disk removal, bdev_mark_dead() may call into file
         * systems below. Make it clear that we're expecting to not hold
         * disk->open_mutex.
         */
        lockdep_assert_not_held(&disk->open_mutex);

        rcu_read_lock();
        xa_for_each(&disk->part_tbl, idx, bdev) {
                /* Skip entries whose device refcount has already hit zero. */
                if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
                        continue;
                /* The reference taken above pins bdev while RCU is dropped. */
                rcu_read_unlock();

                bdev_mark_dead(bdev, surprise);

                put_device(&bdev->bd_device);
                /* Re-enter the RCU read side before the walk continues. */
                rcu_read_lock();
        }
        rcu_read_unlock();
}

/*
 * Set GD_DEAD on @disk.  Only the caller that actually flips the bit goes on
 * to flag the queue as dying (when the disk owns it), zero the capacity and
 * start draining the queue; later calls return immediately.
 */
static void __blk_mark_disk_dead(struct gendisk *disk)
{
        /*
         * Fail any new I/O.
         */
        if (test_and_set_bit(GD_DEAD, &disk->state))
                return;

        /* Only a queue owned by this disk is marked dying. */
        if (test_bit(GD_OWNS_QUEUE, &disk->state))
                blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue);

        /*
         * Stop buffered writers from dirtying pages that can't be written out.
         */
        set_capacity(disk, 0);

        /*
         * Prevent new I/O from crossing bio_queue_enter().
         */
        blk_queue_start_drain(disk->queue);
}

/**
 * blk_mark_disk_dead - mark a disk as dead
 * @disk: disk to mark as dead
 *
 * Mark a disk as dead (e.g. surprise removed) so that it does not accept any
 * new I/O, then report the death to every block device of the disk as a
 * surprise removal.
 */
void blk_mark_disk_dead(struct gendisk *disk)
{
        __blk_mark_disk_dead(disk);
        blk_report_disk_dead(disk, true);
}
EXPORT_SYMBOL_GPL(blk_mark_disk_dead);

/**
 * del_gendisk - remove the gendisk
 * @disk: the struct gendisk to remove
 *
 * Removes the gendisk and all its associated resources. This deletes the
 * partitions associated with the gendisk, and unregisters the associated
 * request_queue.
 *
 * This is the counter to the respective __device_add_disk() call.
 *
 * The final removal of the struct gendisk happens when its refcount reaches 0
 * with put_disk(), which should be called after del_gendisk(), if
 * __device_add_disk() was used.
 *
 * Drivers exist which depend on the release of the gendisk to be synchronous,
 * it should not be deferred.
 *
 * Context: can sleep
 */
void del_gendisk(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct block_device *part;
        unsigned long idx;

        might_sleep();

        /* Refuse (with a one-time warning) a visible disk that is not live. */
        if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN)))
                return;

        disk_del_events(disk);

        /*
         * Prevent new openers by unlinking the bdev inode.
         */
        mutex_lock(&disk->open_mutex);
        xa_for_each(&disk->part_tbl, idx, part)
                bdev_unhash(part);
        mutex_unlock(&disk->open_mutex);

        /*
         * Tell the file system to write back all dirty data and shut down if
         * it hasn't been notified earlier.
         */
        if (!test_bit(GD_DEAD, &disk->state))
                blk_report_disk_dead(disk, false);
        __blk_mark_disk_dead(disk);

        /*
         * Drop all partitions now that the disk is marked dead.
         */
        mutex_lock(&disk->open_mutex);
        /* The walk starts at index 1, so the entry at index 0 is kept. */
        xa_for_each_start(&disk->part_tbl, idx, part, 1)
                drop_partition(part);
        mutex_unlock(&disk->open_mutex);

        if (!(disk->flags & GENHD_FL_HIDDEN)) {
                sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");

                /*
                 * Unregister bdi before releasing device numbers (as they can
                 * get reused and we'd get clashes in sysfs).
                 */
                bdi_unregister(disk->bdi);
        }

        blk_unregister_queue(disk);

        /* Drop the holder/slave sysfs directories. */
        kobject_put(disk->part0->bd_holder_dir);
        kobject_put(disk->slave_dir);
        disk->slave_dir = NULL;

        /* Reset the whole-disk statistics before the device is deleted. */
        part_stat_set_all(disk->part0, 0);
        disk->part0->bd_stamp = 0;
        sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
        pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
        device_del(disk_to_dev(disk));

        blk_mq_freeze_queue_wait(q);

        blk_throtl_cancel_bios(disk);

        blk_sync_queue(q);
        blk_flush_integrity();

        if (queue_is_mq(q))
                blk_mq_cancel_work_sync(q);

        /* Elevator and rq-qos teardown happens with the queue quiesced. */
        blk_mq_quiesce_queue(q);
        if (q->elevator) {
                mutex_lock(&q->sysfs_lock);
                elevator_exit(q);
                mutex_unlock(&q->sysfs_lock);
        }
        rq_qos_exit(q);
        blk_mq_unquiesce_queue(q);

        /*
         * If the disk does not own the queue, allow using passthrough requests
         * again.  Else leave the queue frozen to fail all I/O.
         */
        if (!test_bit(GD_OWNS_QUEUE, &disk->state)) {
                blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
                __blk_mq_unfreeze_queue(q, true);
        } else {
                if (queue_is_mq(q))
                        blk_mq_exit_queue(q);
        }
}
EXPORT_SYMBOL(del_gendisk);

/**
 * invalidate_disk - invalidate the disk
 * @disk: the struct gendisk to invalidate
 *
 * Helper that invalidates the whole-disk block device of @disk, clears the
 * writeback error recorded on its mapping and sets the disk capacity to
 * zero, so that the disk can be reused by the driver.
 *
 * Context: can sleep
 */
void invalidate_disk(struct gendisk *disk)
{
        invalidate_bdev(disk->part0);
        disk->part0->bd_mapping->wb_err = 0;
        set_capacity(disk, 0);
}
EXPORT_SYMBOL(invalidate_disk);

/* sysfs access to bad-blocks list. */
/*
 * sysfs "badblocks" read handler: emits the disk's bad-blocks list, or a
 * lone newline when the disk has no bad-blocks table.
 */
static ssize_t disk_badblocks_show(struct device *dev,
                                   struct device_attribute *attr,
                                   char *page)
{
        struct gendisk *disk = dev_to_disk(dev);

        if (disk->bb)
                return badblocks_show(disk->bb, page, 0);

        return sprintf(page, "\n");
}

/*
 * sysfs "badblocks" write handler: hands the written text to
 * badblocks_store(), or fails with -ENXIO when the disk has no bad-blocks
 * table.
 */
static ssize_t disk_badblocks_store(struct device *dev,
                                    struct device_attribute *attr,
                                    const char *page, size_t len)
{
        struct gendisk *disk = dev_to_disk(dev);

        if (disk->bb)
                return badblocks_store(disk->bb, page, len, 0);

        return -ENXIO;
}

#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
/*
 * Legacy autoload hook for @devt.  If a registered major with a matching
 * number has a ->probe callback, call it (under major_names_lock) and stop.
 * Otherwise fall back to request_module() with the
 * "block-major-<major>-<minor>" alias and, if that returns a positive
 * value, the old-style "block-major-<major>" alias.
 */
void blk_request_module(dev_t devt)
{
        unsigned int major = MAJOR(devt);
        struct blk_major_name *entry;

        mutex_lock(&major_names_lock);
        for (entry = major_names[major_to_index(major)]; entry;
             entry = entry->next) {
                if (entry->major != major || !entry->probe)
                        continue;

                entry->probe(devt);
                mutex_unlock(&major_names_lock);
                return;
        }
        mutex_unlock(&major_names_lock);

        if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
                /* Make old-style 2.4 aliases work */
                request_module("block-major-%d", MAJOR(devt));
}
#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */

#ifdef CONFIG_PROC_FS
/* iterator */
/*
 * seq_file ->start: allocate a class iterator over the disks in block_class,
 * stash it in seqf->private and advance it to the entry at position *pos.
 *
 * Returns the gendisk at that position, NULL when the iterator runs out
 * first, or ERR_PTR(-ENOMEM) if the iterator cannot be allocated.  On the
 * NULL path the iterator stays in seqf->private for disk_seqf_stop().
 */
static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
{
        struct class_dev_iter *iter = kmalloc(sizeof(*iter), GFP_KERNEL);
        struct device *dev;
        loff_t skip = *pos;

        if (!iter)
                return ERR_PTR(-ENOMEM);

        seqf->private = iter;
        class_dev_iter_init(iter, &block_class, NULL, &disk_type);

        for (;;) {
                dev = class_dev_iter_next(iter);
                if (!dev)
                        return NULL;
                if (!skip--)
                        break;
        }

        return dev_to_disk(dev);
}

/*
 * seq_file ->next: bump *pos and return the next gendisk from the iterator
 * kept in seqf->private, or NULL once the iterator is exhausted.
 */
static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
{
        struct device *next;

        ++*pos;
        next = class_dev_iter_next(seqf->private);
        if (!next)
                return NULL;

        return dev_to_disk(next);
}

/*
 * seq_file ->stop: tear down and free the iterator stored in seqf->private,
 * if there is one, and clear the pointer.
 */
static void disk_seqf_stop(struct seq_file *seqf, void *v)
{
        struct class_dev_iter *iter = seqf->private;

        /* stop is called even after start failed :-( */
        if (!iter)
                return;

        class_dev_iter_exit(iter);
        kfree(iter);
        seqf->private = NULL;
}

/*
 * ->start for /proc/partitions: same as disk_seqf_start(), but additionally
 * prints the column header when a valid first entry is returned for
 * position 0.
 */
static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
{
        void *first = disk_seqf_start(seqf, pos);

        if (IS_ERR_OR_NULL(first) || *pos)
                return first;

        seq_puts(seqf, "major minor  #blocks  name\n\n");
        return first;
}

/*
 * ->show for /proc/partitions: print one line (major, minor, sector count
 * halved, name) for every non-empty block device of the disk.  Disks with
 * zero capacity and hidden disks produce no output.  Always returns 0.
 */
static int show_partition(struct seq_file *seqf, void *v)
{
        struct gendisk *sgp = v;
        struct block_device *part;
        unsigned long idx;

        if (!get_capacity(sgp))
                return 0;
        if (sgp->flags & GENHD_FL_HIDDEN)
                return 0;

        rcu_read_lock();
        xa_for_each(&sgp->part_tbl, idx, part) {
                if (bdev_nr_sectors(part))
                        seq_printf(seqf, "%4d  %7d %10llu %pg\n",
                                   MAJOR(part->bd_dev), MINOR(part->bd_dev),
                                   bdev_nr_sectors(part) >> 1, part);
        }
        rcu_read_unlock();

        return 0;
}

/*
 * seq_file operations for the "partitions" proc entry: the shared disk
 * iterator, with a start hook that also prints the header line.
 */
static const struct seq_operations partitions_op = {
        .start        = show_partition_start,
        .next        = disk_seqf_next,
        .stop        = disk_seqf_stop,
        .show        = show_partition
};
#endif

/*
 * Init-time setup: register block_class, call blk_dev_init(), register the
 * "blkext" major and create the top-level "block" kobject.
 *
 * Returns 0, or the error from class_register() if that fails.
 */
static int __init genhd_device_init(void)
{
        int error;

        error = class_register(&block_class);
        if (unlikely(error))
                return error;
        blk_dev_init();

        /* NOTE(review): the return value of register_blkdev() is ignored. */
        register_blkdev(BLOCK_EXT_MAJOR, "blkext");

        /* create top-level block dir */
        /* NOTE(review): the result is not checked for NULL here. */
        block_depr = kobject_create_and_add("block", NULL);
        return 0;
}

subsys_initcall(genhd_device_init);

static ssize_t disk_range_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%d\n", disk->minors);
}

static ssize_t disk_ext_range_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%d\n",
                (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS);
}

static ssize_t disk_removable_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%d\n",
                       (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
}

static ssize_t disk_hidden_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%d\n",
                       (disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
}

/* sysfs "ro": prints 1 when get_disk_ro() reports read-only, else 0. */
static ssize_t disk_ro_show(struct device *dev,
                            struct device_attribute *attr, char *buf)
{
        int ro = get_disk_ro(dev_to_disk(dev)) ? 1 : 0;

        return sprintf(buf, "%d\n", ro);
}

/* sysfs "size": prints the sector count of the device's block_device. */
ssize_t part_size_show(struct device *dev,
                       struct device_attribute *attr, char *buf)
{
        struct block_device *bdev = dev_to_bdev(dev);

        return sprintf(buf, "%llu\n", bdev_nr_sectors(bdev));
}

/*
 * sysfs "stat": prints one line of I/O statistics for the block device.
 *
 * Field order: read ios/merges/sectors/ms, write ios/merges/sectors/ms,
 * in-flight count, io_ticks in ms, summed time in ms over all four groups,
 * discard ios/merges/sectors/ms, flush ios/ms.  io_ticks is refreshed first
 * when requests are in flight.
 */
ssize_t part_stat_show(struct device *dev,
                       struct device_attribute *attr, char *buf)
{
        struct block_device *bdev = dev_to_bdev(dev);
        unsigned int read_ms, write_ms, discard_ms, flush_ms, total_ms;
        unsigned int inflight = part_in_flight(bdev);
        struct disk_stats stat;

        if (inflight) {
                part_stat_lock();
                update_io_ticks(bdev, jiffies, true);
                part_stat_unlock();
        }
        part_stat_read_all(bdev, &stat);

        /* Per-group times are kept in nanoseconds; report milliseconds. */
        read_ms = (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC);
        write_ms = (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC);
        discard_ms = (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
                                           NSEC_PER_MSEC);
        flush_ms = (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC);
        total_ms = (unsigned int)div_u64(stat.nsecs[STAT_READ] +
                                         stat.nsecs[STAT_WRITE] +
                                         stat.nsecs[STAT_DISCARD] +
                                         stat.nsecs[STAT_FLUSH],
                                         NSEC_PER_MSEC);

        return sprintf(buf,
                "%8lu %8lu %8llu %8u "
                "%8lu %8lu %8llu %8u "
                "%8u %8u %8u "
                "%8lu %8lu %8llu %8u "
                "%8lu %8u"
                "\n",
                stat.ios[STAT_READ],
                stat.merges[STAT_READ],
                (unsigned long long)stat.sectors[STAT_READ],
                read_ms,
                stat.ios[STAT_WRITE],
                stat.merges[STAT_WRITE],
                (unsigned long long)stat.sectors[STAT_WRITE],
                write_ms,
                inflight,
                jiffies_to_msecs(stat.io_ticks),
                total_ms,
                stat.ios[STAT_DISCARD],
                stat.merges[STAT_DISCARD],
                (unsigned long long)stat.sectors[STAT_DISCARD],
                discard_ms,
                stat.ios[STAT_FLUSH],
                flush_ms);
}

/*
 * sysfs "inflight": prints the two in-flight counters of the block device,
 * gathered through the blk-mq helper for mq queues and through
 * part_in_flight_rw() otherwise.
 */
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
{
        struct block_device *bdev = dev_to_bdev(dev);
        struct request_queue *q = bdev_get_queue(bdev);
        unsigned int inflight[2];

        if (!queue_is_mq(q))
                part_in_flight_rw(bdev, inflight);
        else
                blk_mq_in_flight_rw(q, bdev, inflight);

        return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
}

/*
 * sysfs "capability": deprecated attribute.  Warns once per boot via
 * dev_warn_once() and always reports "0".
 */
static ssize_t disk_capability_show(struct device *dev,
                                    struct device_attribute *attr, char *buf)
{
        dev_warn_once(dev, "the capability attribute has been deprecated.\n");
        return sprintf(buf, "0\n");
}

static ssize_t disk_alignment_offset_show(struct device *dev,
                                          struct device_attribute *attr,
                                          char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0));
}

/*
 * sysfs "discard_alignment": prints the discard alignment of the whole-disk
 * block device.
 *
 * This used to call bdev_alignment_offset(), so the attribute duplicated
 * "alignment_offset" instead of reporting the discard alignment.  Use
 * bdev_discard_alignment() (declared in <linux/blkdev.h>), which returns an
 * unsigned int, hence the "%u" format.
 */
static ssize_t disk_discard_alignment_show(struct device *dev,
                                           struct device_attribute *attr,
                                           char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%u\n", bdev_discard_alignment(disk->part0));
}

static ssize_t diskseq_show(struct device *dev,
                            struct device_attribute *attr, char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%llu\n", disk->diskseq);
}

/* sysfs "partscan": prints the result of disk_has_partscan() for the disk. */
static ssize_t partscan_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct gendisk *disk = dev_to_disk(dev);

        return sprintf(buf, "%u\n", disk_has_partscan(disk));
}

/*
 * Device attributes for a disk.  All are read-only (0444) except
 * "badblocks", which is 0644 and also has a store handler.
 */
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);

#ifdef CONFIG_FAIL_MAKE_REQUEST
/* sysfs "make-it-fail" read: prints the BD_MAKE_IT_FAIL flag of the bdev. */
ssize_t part_fail_show(struct device *dev,
                       struct device_attribute *attr, char *buf)
{
        struct block_device *bdev = dev_to_bdev(dev);

        return sprintf(buf, "%d\n", bdev_test_flag(bdev, BD_MAKE_IT_FAIL));
}

/*
 * sysfs "make-it-fail" write: parse a decimal integer from @buf and set
 * BD_MAKE_IT_FAIL when it is non-zero, clear it when it is zero.  Empty or
 * unparsable input leaves the flag alone.  Always returns @count.
 */
ssize_t part_fail_store(struct device *dev,
                        struct device_attribute *attr,
                        const char *buf, size_t count)
{
        int val;

        if (count == 0 || sscanf(buf, "%d", &val) <= 0)
                return count;

        if (val)
                bdev_set_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL);
        else
                bdev_clear_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL);

        return count;
}

/* "make-it-fail" attribute (0644); declared with __ATTR as the name has dashes. */
static struct device_attribute dev_attr_fail =
        __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
#endif /* CONFIG_FAIL_MAKE_REQUEST */

#ifdef CONFIG_FAIL_IO_TIMEOUT
/* "io-timeout-fail" attribute (0644); its handlers are defined outside this chunk. */
static struct device_attribute dev_attr_fail_timeout =
        __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
#endif

/*
 * NULL-terminated list of the attributes in the disk attribute group.  The
 * fault-injection entries are only present when the matching config option
 * is enabled.
 */
static struct attribute *disk_attrs[] = {
        &dev_attr_range.attr,
        &dev_attr_ext_range.attr,
        &dev_attr_removable.attr,
        &dev_attr_hidden.attr,
        &dev_attr_ro.attr,
        &dev_attr_size.attr,
        &dev_attr_alignment_offset.attr,
        &dev_attr_discard_alignment.attr,
        &dev_attr_capability.attr,
        &dev_attr_stat.attr,
        &dev_attr_inflight.attr,
        &dev_attr_badblocks.attr,
        &dev_attr_events.attr,
        &dev_attr_events_async.attr,
        &dev_attr_events_poll_msecs.attr,
        &dev_attr_diskseq.attr,
        &dev_attr_partscan.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
        &dev_attr_fail.attr,
#endif
#ifdef CONFIG_FAIL_IO_TIMEOUT
        &dev_attr_fail_timeout.attr,
#endif
        NULL
};

static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
{
        struct device *dev = container_of(kobj, typeof(*dev), kobj);
        struct gendisk *disk = dev_to_disk(dev);

        if (a == &dev_attr_badblocks.attr && !disk->bb)
                return 0;
        return a->mode;
}

/* The main disk attribute group; per-attribute visibility via disk_visible(). */
static struct attribute_group disk_attr_group = {
        .attrs = disk_attrs,
        .is_visible = disk_visible,
};

/*
 * NULL-terminated list of attribute groups used by disk_type.  The blktrace
 * and integrity groups are only included when configured in.
 */
static const struct attribute_group *disk_attr_groups[] = {
        &disk_attr_group,
#ifdef CONFIG_BLK_DEV_IO_TRACE
        &blk_trace_attr_group,
#endif
#ifdef CONFIG_BLK_DEV_INTEGRITY
        &blk_integrity_attr_group,
#endif
        NULL
};

/**
 * disk_release - releases all allocated resources of the gendisk
 * @dev: the device representing this disk
 *
 * This function releases all allocated resources of the gendisk.
 *
 * Drivers which used __device_add_disk() have a gendisk with a request_queue
 * assigned. Since the request_queue sits on top of the gendisk for these
 * drivers we also call blk_put_queue() for them, and we expect the
 * request_queue refcount to reach 0 at this point, and so the request_queue
 * will also be freed prior to the disk.
 *
 * Context: can sleep
 */
static void disk_release(struct device *dev)
{
        struct gendisk *disk = dev_to_disk(dev);

        might_sleep();
        /* A disk that is still live should never get here. */
        WARN_ON_ONCE(disk_live(disk));

        blk_trace_remove(disk->queue);

        /*
         * To undo the all initialization from blk_mq_init_allocated_queue in
         * case of a probe failure where add_disk is never called we have to
         * call blk_mq_exit_queue here. We can't do this for the more common
         * teardown case (yet) as the tagset can be gone by the time the disk
         * is released once it was added.
         */
        if (queue_is_mq(disk->queue) &&
            test_bit(GD_OWNS_QUEUE, &disk->state) &&
            !test_bit(GD_ADDED, &disk->state))
                blk_mq_exit_queue(disk->queue);

        blkcg_exit_disk(disk);

        bioset_exit(&disk->bio_split);

        disk_release_events(disk);
        kfree(disk->random);
        disk_free_zone_resources(disk);
        xa_destroy(&disk->part_tbl);

        /* Break the queue's back-pointer before dropping our queue reference. */
        disk->queue->disk = NULL;
        blk_put_queue(disk->queue);

        /* ->free_disk is optional and only called for disks that were added. */
        if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk)
                disk->fops->free_disk(disk);

        bdev_drop(disk->part0);        /* frees the disk */
}

static int block_uevent(const struct device *dev, struct kobj_uevent_env *env)
{
        const struct gendisk *disk = dev_to_disk(dev);

        return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
}

/* The "block" device class; uevents for its devices go through block_uevent(). */
const struct class block_class = {
        .name                = "block",
        .dev_uevent        = block_uevent,
};

/*
 * ->devnode for disk_type: defers to the driver's ->devnode callback when
 * one is provided, otherwise returns NULL.
 */
static char *block_devnode(const struct device *dev, umode_t *mode,
                           kuid_t *uid, kgid_t *gid)
{
        struct gendisk *disk = dev_to_disk(dev);

        if (!disk->fops->devnode)
                return NULL;

        return disk->fops->devnode(disk, mode);
}

/*
 * Device type for whole disks: supplies the sysfs attribute groups, the
 * release callback and the devnode hook.
 */
const struct device_type disk_type = {
        .name                = "disk",
        .groups                = disk_attr_groups,
        .release        = disk_release,
        .devnode        = block_devnode,
};

#ifdef CONFIG_PROC_FS
/*
 * aggregate disk stat collector.  Uses the same stats that the sysfs
 * entries do, above, but makes them available through one seq_file.
 *
 * The output looks suspiciously like /proc/partitions with a bunch of
 * extra fields.
 */
/*
 * ->show for the "diskstats" proc entry: print one statistics line for each
 * block device of the disk passed in @v.  Partitions with zero sectors are
 * skipped.  Always returns 0.
 */
static int diskstats_show(struct seq_file *seqf, void *v)
{
        struct gendisk *gp = v;
        struct block_device *hd;
        unsigned int inflight;
        struct disk_stats stat;
        unsigned long idx;

        /*
        if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
                seq_puts(seqf,        "major minor name"
                                "     rio rmerge rsect ruse wio wmerge "
                                "wsect wuse running use aveq"
                                "\n\n");
        */

        rcu_read_lock();
        xa_for_each(&gp->part_tbl, idx, hd) {
                /* Empty partitions are not reported; the whole disk always is. */
                if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
                        continue;

                inflight = part_in_flight(hd);
                /* Refresh io_ticks first when requests are in flight. */
                if (inflight) {
                        part_stat_lock();
                        update_io_ticks(hd, jiffies, true);
                        part_stat_unlock();
                }
                part_stat_read_all(hd, &stat);
                /*
                 * Columns: major, minor, name; read ios/merges/sectors/ms;
                 * write ios/merges/sectors/ms; in-flight, io_ticks in ms,
                 * summed time in ms; discard ios/merges/sectors/ms;
                 * flush ios/ms.
                 */
                seq_printf(seqf, "%4d %7d %pg "
                           "%lu %lu %lu %u "
                           "%lu %lu %lu %u "
                           "%u %u %u "
                           "%lu %lu %lu %u "
                           "%lu %u"
                           "\n",
                           MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
                           stat.ios[STAT_READ],
                           stat.merges[STAT_READ],
                           stat.sectors[STAT_READ],
                           (unsigned int)div_u64(stat.nsecs[STAT_READ],
                                                        NSEC_PER_MSEC),
                           stat.ios[STAT_WRITE],
                           stat.merges[STAT_WRITE],
                           stat.sectors[STAT_WRITE],
                           (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
                                                        NSEC_PER_MSEC),
                           inflight,
                           jiffies_to_msecs(stat.io_ticks),
                           (unsigned int)div_u64(stat.nsecs[STAT_READ] +
                                                 stat.nsecs[STAT_WRITE] +
                                                 stat.nsecs[STAT_DISCARD] +
                                                 stat.nsecs[STAT_FLUSH],
                                                        NSEC_PER_MSEC),
                           stat.ios[STAT_DISCARD],
                           stat.merges[STAT_DISCARD],
                           stat.sectors[STAT_DISCARD],
                           (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
                                                 NSEC_PER_MSEC),
                           stat.ios[STAT_FLUSH],
                           (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
                                                 NSEC_PER_MSEC)
                        );
        }
        rcu_read_unlock();

        return 0;
}

/* seq_file operations for the "diskstats" proc entry, built on the shared disk iterator. */
static const struct seq_operations diskstats_op = {
        .start        = disk_seqf_start,
        .next        = disk_seqf_next,
        .stop        = disk_seqf_stop,
        .show        = diskstats_show
};

/*
 * Create the "diskstats" and "partitions" proc entries.  Always returns 0.
 * NOTE(review): the results of proc_create_seq() are not checked.
 */
static int __init proc_genhd_init(void)
{
        proc_create_seq("diskstats", 0, NULL, &diskstats_op);
        proc_create_seq("partitions", 0, NULL, &partitions_op);
        return 0;
}
module_init(proc_genhd_init);
#endif /* CONFIG_PROC_FS */

/*
 * Look up partition @partno in @disk's partition table under RCU and return
 * its device number, or 0 when there is no such entry.
 */
dev_t part_devt(struct gendisk *disk, u8 partno)
{
        struct block_device *part;
        dev_t devt;

        rcu_read_lock();
        part = xa_load(&disk->part_tbl, partno);
        devt = part ? part->bd_dev : 0;
        rcu_read_unlock();

        return devt;
}

/*
 * Allocate and initialize a gendisk for request queue @q on NUMA node
 * @node_id.  @lkclass is passed to lockdep_init_map() for the disk's
 * lockdep map.
 *
 * Sets up the bio split set, the bdi, the whole-disk block device (stored at
 * index 0 of the partition table), blkcg state, and the embedded device
 * (class block_class, type disk_type), and links @q and the disk to each
 * other.
 *
 * Returns the new disk, or NULL on failure with everything set up so far
 * undone.
 */
struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
                struct lock_class_key *lkclass)
{
        struct gendisk *disk;

        disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
        if (!disk)
                return NULL;

        if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
                goto out_free_disk;

        disk->bdi = bdi_alloc(node_id);
        if (!disk->bdi)
                goto out_free_bioset;

        /* bdev_alloc() might need the queue, set before the first call */
        disk->queue = q;

        disk->part0 = bdev_alloc(disk, 0);
        if (!disk->part0)
                goto out_free_bdi;

        disk->node_id = node_id;
        mutex_init(&disk->open_mutex);
        xa_init(&disk->part_tbl);
        /* The whole-disk device occupies slot 0 of the partition table. */
        if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
                goto out_destroy_part_tbl;

        if (blkcg_init_disk(disk))
                goto out_erase_part0;

        disk_init_zone_resources(disk);
        rand_initialize_disk(disk);
        disk_to_dev(disk)->class = &block_class;
        disk_to_dev(disk)->type = &disk_type;
        device_initialize(disk_to_dev(disk));
        inc_diskseq(disk);
        q->disk = disk;
        lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
        INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
        return disk;

        /* Error unwinding: each label undoes one setup step, in reverse order. */
out_erase_part0:
        xa_erase(&disk->part_tbl, 0);
out_destroy_part_tbl:
        xa_destroy(&disk->part_tbl);
        /* Detach part0 from the disk before dropping it; the disk is freed below. */
        disk->part0->bd_disk = NULL;
        bdev_drop(disk->part0);
out_free_bdi:
        bdi_put(disk->bdi);
out_free_bioset:
        bioset_exit(&disk->bio_split);
out_free_disk:
        kfree(disk);
        return NULL;
}

/*
 * Allocate a request queue (using @lim, or zeroed default limits when @lim
 * is NULL) together with a gendisk on node @node, and mark the disk as
 * owning the queue.
 *
 * Returns the disk, the ERR_PTR from blk_alloc_queue() if the queue cannot
 * be allocated, or ERR_PTR(-ENOMEM) if the disk cannot be allocated.
 */
struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
                struct lock_class_key *lkclass)
{
        struct queue_limits default_lim = { };
        struct request_queue *q;
        struct gendisk *disk;

        if (!lim)
                lim = &default_lim;

        q = blk_alloc_queue(lim, node);
        if (IS_ERR(q))
                return ERR_CAST(q);

        disk = __alloc_disk_node(q, node, lkclass);
        if (disk) {
                set_bit(GD_OWNS_QUEUE, &disk->state);
                return disk;
        }

        /* No disk: give the freshly allocated queue back. */
        blk_put_queue(q);
        return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(__blk_alloc_disk);

/**
 * put_disk - decrements the gendisk refcount
 * @disk: the struct gendisk to decrement the refcount for
 *
 * This decrements the refcount for the struct gendisk. When this reaches 0
 * we'll have disk_release() called.
 *
 * Note: for blk-mq disk put_disk must be called before freeing the tag_set
 * when handling probe errors (that is before add_disk() is called).
 *
 * Context: Any context, but the last reference must not be dropped from
 *          atomic context.
 */
void put_disk(struct gendisk *disk)
{
        /* A NULL disk is accepted and ignored. */
        if (!disk)
                return;

        put_device(disk_to_dev(disk));
}
EXPORT_SYMBOL(put_disk);

/*
 * Send a KOBJ_CHANGE uevent for @gd carrying "DISK_RO=1" when @ro is
 * non-zero and "DISK_RO=0" otherwise.
 */
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
        char event[] = "DISK_RO=1";
        char *envp[] = { event, NULL };

        /* The final character before the terminator is the 0/1 digit. */
        if (ro == 0)
                event[sizeof(event) - 2] = '0';

        kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
}

/**
 * set_disk_ro - set a gendisk read-only
 * @disk:        gendisk to operate on
 * @read_only:        %true to set the disk read-only, %false set the disk read/write
 *
 * This function is used to indicate whether a given disk device should have its
 * read-only flag set. set_disk_ro() is typically used by device drivers to
 * indicate whether the underlying physical device is write-protected.
 */
void set_disk_ro(struct gendisk *disk, bool read_only)
{
        bool was_read_only;

        /* Atomically move GD_READ_ONLY to the requested state. */
        if (read_only)
                was_read_only = test_and_set_bit(GD_READ_ONLY, &disk->state);
        else
                was_read_only = test_and_clear_bit(GD_READ_ONLY, &disk->state);

        /* Nothing changed, so there is nothing to announce. */
        if (was_read_only == read_only)
                return;

        set_disk_ro_uevent(disk, read_only);
}
EXPORT_SYMBOL(set_disk_ro);

/*
 * Atomically increment the file-level diskseq counter and store the new
 * value in @disk->diskseq.
 */
void inc_diskseq(struct gendisk *disk)
{
        disk->diskseq = atomic64_inc_return(&diskseq);
}







































































































































































































































































































































































































































































































    1 






























    2 






    2 

















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
// SPDX-License-Identifier: GPL-2.0-or-later
/* Task credentials management - see Documentation/security/credentials.rst
 *
 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#define pr_fmt(fmt) "CRED: " fmt

#include <linux/export.h>
#include <linux/cred.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/coredump.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/init_task.h>
#include <linux/security.h>
#include <linux/binfmts.h>
#include <linux/cn_proc.h>
#include <linux/uidgid.h>

/*
 * kdebug() - debug tracing helper used throughout this file.
 *
 * The "#if 0" branch is the variant that really prints (prefixing the message
 * with current->comm and current->pid); it is compiled out.  The active
 * variant places a no_printk() call under "if (0)", so the call is never
 * executed.
 * NOTE(review): presumably this keeps the format string and its arguments
 * visible to the compiler for checking while emitting nothing - confirm
 * against the definition of no_printk().
 */
#if 0
#define kdebug(FMT, ...)                                                \
        printk("[%-5.5s%5u] " FMT "\n",                                        \
               current->comm, current->pid, ##__VA_ARGS__)
#else
#define kdebug(FMT, ...)                                                \
do {                                                                        \
        if (0)                                                                \
                no_printk("[%-5.5s%5u] " FMT "\n",                        \
                          current->comm, current->pid, ##__VA_ARGS__);        \
} while (0)
#endif

/* Slab cache that struct cred objects are allocated from; set up by cred_init() */
static struct kmem_cache *cred_jar;

/* init to 2 - one for init_task, one to ensure it is never freed */
static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };

/*
 * The initial credentials for the initial task
 */
struct cred init_cred = {
        .usage                        = ATOMIC_INIT(4),
        .uid                        = GLOBAL_ROOT_UID,
        .gid                        = GLOBAL_ROOT_GID,
        .suid                        = GLOBAL_ROOT_UID,
        .sgid                        = GLOBAL_ROOT_GID,
        .euid                        = GLOBAL_ROOT_UID,
        .egid                        = GLOBAL_ROOT_GID,
        .fsuid                        = GLOBAL_ROOT_UID,
        .fsgid                        = GLOBAL_ROOT_GID,
        .securebits                = SECUREBITS_DEFAULT,
        .cap_inheritable        = CAP_EMPTY_SET,
        .cap_permitted                = CAP_FULL_SET,
        .cap_effective                = CAP_FULL_SET,
        .cap_bset                = CAP_FULL_SET,
        .user                        = INIT_USER,
        .user_ns                = &init_user_ns,
        .group_info                = &init_groups,
        .ucounts                = &init_ucounts,
};

/*
 * The RCU callback to actually dispose of a set of credentials
 */
static void put_cred_rcu(struct rcu_head *rcu)
{
        struct cred *dead = container_of(rcu, struct cred, rcu);

        kdebug("put_cred_rcu(%p)", dead);

        /* a cred reaching this point must not have any reference left */
        if (atomic_long_read(&dead->usage))
                panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
                      dead, atomic_long_read(&dead->usage));

        security_cred_free(dead);

        /* release the keys the cred was holding */
        key_put(dead->session_keyring);
        key_put(dead->process_keyring);
        key_put(dead->thread_keyring);
        key_put(dead->request_key_auth);

        /* release group list, user, ucounts and user namespace */
        if (dead->group_info)
                put_group_info(dead->group_info);
        free_uid(dead->user);
        if (dead->ucounts)
                put_ucounts(dead->ucounts);
        put_user_ns(dead->user_ns);

        /* finally hand the object itself back to the slab cache */
        kmem_cache_free(cred_jar, dead);
}

/**
 * __put_cred - Destroy a set of credentials
 * @cred: The record to release
 *
 * Destroy a set of credentials on which no references remain.
 */
void __put_cred(struct cred *cred)
{
        kdebug("__put_cred(%p{%ld})", cred,
               atomic_long_read(&cred->usage));

        /* must be unreferenced and must not be in use by the current task */
        BUG_ON(atomic_long_read(&cred->usage) != 0);
        BUG_ON(cred == current->cred);
        BUG_ON(cred == current->real_cred);

        if (!cred->non_rcu) {
                /* normal case: defer the teardown to an RCU callback */
                call_rcu(&cred->rcu, put_cred_rcu);
                return;
        }

        /* cred is flagged non_rcu: tear it down immediately */
        put_cred_rcu(&cred->rcu);
}
EXPORT_SYMBOL(__put_cred);

/*
 * Clean up a task's credentials when it exits
 */
void exit_creds(struct task_struct *tsk)
{
        struct cred *objective, *subjective;

        kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred,
               atomic_long_read(&tsk->cred->usage));

        /* detach both credential pointers from the task first */
        objective = (struct cred *) tsk->real_cred;
        tsk->real_cred = NULL;

        subjective = (struct cred *) tsk->cred;
        tsk->cred = NULL;

        if (objective != subjective) {
                put_cred(objective);
                put_cred(subjective);
        } else {
                /* one object backed both pointers: drop its two refs at once */
                put_cred_many(subjective, 2);
        }

#ifdef CONFIG_KEYS_REQUEST_CACHE
        key_put(tsk->cached_requested_key);
        tsk->cached_requested_key = NULL;
#endif
}

/**
 * get_task_cred - Get another task's objective credentials
 * @task: The task to query
 *
 * Get the objective credentials of a task, pinning them so that they can't go
 * away.  Accessing a task's credentials directly is not permitted.
 *
 * The caller must also make sure task doesn't get deleted, either by holding a
 * ref on task or by holding tasklist_lock to prevent it from being unlinked.
 */
const struct cred *get_task_cred(struct task_struct *task)
{
        const struct cred *cred;

        rcu_read_lock();

        /* re-read the task's creds until a reference is successfully taken */
        for (;;) {
                cred = __task_cred(task);
                BUG_ON(!cred);
                if (get_cred_rcu(cred))
                        break;
        }

        rcu_read_unlock();
        return cred;
}
EXPORT_SYMBOL(get_task_cred);

/*
 * Allocate blank credentials, such that the credentials can be filled in at a
 * later date without risk of ENOMEM.
 */
struct cred *cred_alloc_blank(void)
{
        struct cred *blank = kmem_cache_zalloc(cred_jar, GFP_KERNEL);

        if (!blank)
                return NULL;

        /* the caller receives the one and only reference */
        atomic_long_set(&blank->usage, 1);
        if (security_cred_alloc_blank(blank, GFP_KERNEL_ACCOUNT) >= 0)
                return blank;

        /* the security module refused: throw the zeroed cred away again */
        abort_creds(blank);
        return NULL;
}

/**
 * prepare_creds - Prepare a new set of credentials for modification
 *
 * Prepare a new set of task credentials for modification.  A task's creds
 * shouldn't generally be modified directly, therefore this function is used to
 * prepare a new copy, which the caller then modifies and then commits by
 * calling commit_creds().
 *
 * Preparation involves making a copy of the objective creds for modification.
 *
 * Returns a pointer to the new creds-to-be if successful, NULL otherwise.
 *
 * Call commit_creds() or abort_creds() to clean up.
 */
struct cred *prepare_creds(void)
{
        const struct cred *old;
        struct cred *new;

        new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
        if (!new)
                return NULL;

        kdebug("prepare_creds() alloc %p", new);

        /* begin with a byte-for-byte copy of the current task's ->cred */
        old = current->cred;
        memcpy(new, old, sizeof(*new));

        /* the copy is an object of its own: reset its flag and refcount */
        new->non_rcu = 0;
        atomic_long_set(&new->usage, 1);

        /* take a reference on each object the copied pointers refer to */
        get_group_info(new->group_info);
        get_uid(new->user);
        get_user_ns(new->user_ns);

#ifdef CONFIG_KEYS
        key_get(new->session_keyring);
        key_get(new->process_keyring);
        key_get(new->thread_keyring);
        key_get(new->request_key_auth);
#endif

#ifdef CONFIG_SECURITY
        /* the copied security pointer is not carried over */
        new->security = NULL;
#endif

        new->ucounts = get_ucounts(new->ucounts);
        if (!new->ucounts ||
            security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) {
                abort_creds(new);
                return NULL;
        }

        return new;
}
EXPORT_SYMBOL(prepare_creds);

/*
 * Prepare credentials for current to perform an execve()
 * - The caller must hold ->cred_guard_mutex
 */
struct cred *prepare_exec_creds(void)
{
        struct cred *new;

        new = prepare_creds();
        if (!new)
                return new;

#ifdef CONFIG_KEYS
        /* newly exec'd tasks don't get a thread keyring */
        key_put(new->thread_keyring);
        new->thread_keyring = NULL;

        /* inherit the session keyring; new process keyring */
        key_put(new->process_keyring);
        new->process_keyring = NULL;
#endif

        new->suid = new->fsuid = new->euid;
        new->sgid = new->fsgid = new->egid;

        return new;
}

/*
 * Copy credentials for the new process created by fork()
 *
 * We share if we can, but under some circumstances we have to generate a new
 * set.
 *
 * The new process gets the current process's subjective credentials as its
 * objective and subjective credentials
 */
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
        const bool is_thread = clone_flags & CLONE_THREAD;
        bool share = is_thread;
        struct cred *new;
        int ret;

#ifdef CONFIG_KEYS_REQUEST_CACHE
        p->cached_requested_key = NULL;
#endif

#ifdef CONFIG_KEYS
        /* creds that carry a thread keyring are never shared */
        share = share && !p->cred->thread_keyring;
#endif

        if (share) {
                /* new thread: reuse the existing creds, taking two more refs */
                p->real_cred = get_cred_many(p->cred, 2);
                kdebug("share_creds(%p{%ld})",
                       p->cred, atomic_long_read(&p->cred->usage));
                inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
                return 0;
        }

        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        if (clone_flags & CLONE_NEWUSER) {
                ret = create_user_ns(new);
                if (ret >= 0)
                        ret = set_cred_ucounts(new);
                if (ret < 0) {
                        put_cred(new);
                        return ret;
                }
        }

#ifdef CONFIG_KEYS
        /*
         * A thread keyring is never inherited as is: drop it, and give a new
         * thread a fresh one if the parent had one.
         */
        if (new->thread_keyring) {
                key_put(new->thread_keyring);
                new->thread_keyring = NULL;
                if (is_thread)
                        install_thread_keyring_to_cred(new);
        }

        /*
         * The process keyring is only shared between the threads in a
         * process; anything outside of those threads doesn't inherit it.
         */
        if (!is_thread) {
                key_put(new->process_keyring);
                new->process_keyring = NULL;
        }
#endif

        /* install the new creds as both objective and subjective creds */
        p->real_cred = get_cred(new);
        p->cred = p->real_cred;
        inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        return 0;
}

/*
 * Decide whether the capabilities of @subset are covered by those of @set,
 * taking the user namespaces of both credentials into account.
 */
static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
{
        const struct user_namespace *wanted = set->user_ns;
        const struct user_namespace *ns = subset->user_ns;

        /* Same user namespace: compare the permitted sets directly. */
        if (wanted == ns)
                return cap_issubset(subset->cap_permitted, set->cap_permitted);

        /*
         * Different user namespaces: walk up from @subset's namespace
         * towards init_user_ns.  @subset counts as a subset only if some
         * namespace on that path is a direct child of @set's namespace and
         * is owned by set->euid.
         */
        while (ns != &init_user_ns) {
                if (ns->parent == wanted && uid_eq(ns->owner, set->euid))
                        return true;
                ns = ns->parent;
        }

        return false;
}

/**
 * commit_creds - Install new credentials upon the current task
 * @new: The credentials to be assigned
 *
 * Install a new set of credentials to the current task, using RCU to replace
 * the old set.  Both the objective and the subjective credentials pointers are
 * updated.  This function may not be called if the subjective credentials are
 * in an overridden state.
 *
 * This function eats the caller's reference to the new credentials.
 *
 * Always returns 0 thus allowing this function to be tail-called at the end
 * of, say, sys_setgid().
 */
int commit_creds(struct cred *new)
{
        struct task_struct *task = current;
        const struct cred *old = task->real_cred;

        kdebug("commit_creds(%p{%ld})", new,
               atomic_long_read(&new->usage));

        /*
         * The subjective creds must be the same object as the objective
         * creds (no override in force) and @new must hold at least one ref.
         */
        BUG_ON(task->cred != old);
        BUG_ON(atomic_long_read(&new->usage) < 1);

        get_cred(new); /* we will require a ref for the subj creds too */

        /* dumpability changes */
        if (!uid_eq(old->euid, new->euid) ||
            !gid_eq(old->egid, new->egid) ||
            !uid_eq(old->fsuid, new->fsuid) ||
            !gid_eq(old->fsgid, new->fsgid) ||
            !cred_cap_issubset(old, new)) {
                if (task->mm)
                        set_dumpable(task->mm, suid_dumpable);
                task->pdeath_signal = 0;
                /*
                 * If a task drops privileges and becomes nondumpable,
                 * the dumpability change must become visible before
                 * the credential change; otherwise, a __ptrace_may_access()
                 * racing with this change may be able to attach to a task it
                 * shouldn't be able to attach to (as if the task had dropped
                 * privileges without becoming nondumpable).
                 * Pairs with a read barrier in __ptrace_may_access().
                 */
                smp_wmb();
        }

        /* alter the thread keyring */
        if (!uid_eq(new->fsuid, old->fsuid))
                key_fsuid_changed(new);
        if (!gid_eq(new->fsgid, old->fsgid))
                key_fsgid_changed(new);

        /* do it
         * RLIMIT_NPROC limits on user->processes have already been checked
         * in set_user().
         */
        /*
         * When the user or the user namespace changes, the process count is
         * charged to the new ucounts before the pointers are published and
         * uncharged from the old ucounts afterwards.
         */
        if (new->user != old->user || new->user_ns != old->user_ns)
                inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
        /* publish @new as both the objective and the subjective creds */
        rcu_assign_pointer(task->real_cred, new);
        rcu_assign_pointer(task->cred, new);
        if (new->user != old->user || new->user_ns != old->user_ns)
                dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);

        /* send notifications */
        /* a UID event if any of uid/euid/suid/fsuid changed */
        if (!uid_eq(new->uid,   old->uid)  ||
            !uid_eq(new->euid,  old->euid) ||
            !uid_eq(new->suid,  old->suid) ||
            !uid_eq(new->fsuid, old->fsuid))
                proc_id_connector(task, PROC_EVENT_UID);

        /* a GID event if any of gid/egid/sgid/fsgid changed */
        if (!gid_eq(new->gid,   old->gid)  ||
            !gid_eq(new->egid,  old->egid) ||
            !gid_eq(new->sgid,  old->sgid) ||
            !gid_eq(new->fsgid, old->fsgid))
                proc_id_connector(task, PROC_EVENT_GID);

        /* release the old obj and subj refs both */
        put_cred_many(old, 2);
        return 0;
}
EXPORT_SYMBOL(commit_creds);

/**
 * abort_creds - Discard a set of credentials and unlock the current task
 * @new: The credentials that were going to be applied
 *
 * Discard a set of credentials that were under construction and unlock the
 * current task.
 */
void abort_creds(struct cred *new)
{
        kdebug("abort_creds(%p{%ld})", new,
               atomic_long_read(&new->usage));

        /* the caller must still hold a reference on @new ... */
        BUG_ON(atomic_long_read(&new->usage) < 1);
        /* ... which is dropped here */
        put_cred(new);
}
EXPORT_SYMBOL(abort_creds);

/**
 * override_creds - Override the current process's subjective credentials
 * @new: The credentials to be assigned
 *
 * Install a set of temporary override subjective credentials on the current
 * process, returning the old set for later reversion.
 */
const struct cred *override_creds(const struct cred *new)
{
        const struct cred *saved = current->cred;

        kdebug("override_creds(%p{%ld})", new,
               atomic_long_read(&new->usage));

        /*
         * The reference is deliberately taken with get_new_cred() and not
         * with get_cred(), so the 'non_rcu' flag is left as it is: the cred
         * only goes into the thread-synchronous '->cred' pointer, not into
         * '->real_cred', which other threads can see under RCU.
         */
        get_new_cred((struct cred *)new);
        rcu_assign_pointer(current->cred, new);

        kdebug("override_creds() = %p{%ld}", saved,
               atomic_long_read(&saved->usage));

        /* hand back the previous subjective creds for revert_creds() */
        return saved;
}
EXPORT_SYMBOL(override_creds);

/**
 * revert_creds - Revert a temporary subjective credentials override
 * @old: The credentials to be restored
 *
 * Revert a temporary set of override subjective credentials to an old set,
 * discarding the override set.
 */
void revert_creds(const struct cred *old)
{
        const struct cred *temporary = current->cred;

        kdebug("revert_creds(%p{%ld})", old,
               atomic_long_read(&old->usage));

        /* put @old back in place, then drop the ref held on the override */
        rcu_assign_pointer(current->cred, old);
        put_cred(temporary);
}
EXPORT_SYMBOL(revert_creds);

/**
 * cred_fscmp - Compare two credentials with respect to filesystem access.
 * @a: The first credential
 * @b: The second credential
 *
 * cred_fscmp() will return zero if both credentials have the same
 * fsuid, fsgid, and supplementary groups.  That is, if they will both
 * provide the same access to files based on mode/uid/gid.
 * If the credentials are different, then either -1 or 1 will
 * be returned depending on whether @a comes before or after @b
 * respectively in an arbitrary, but stable, ordering of credentials.
 *
 * Return: -1, 0, or 1 depending on comparison
 */
int cred_fscmp(const struct cred *a, const struct cred *b)
{
        struct group_info *ga, *gb;
        int g;

        if (a == b)
                return 0;
        if (uid_lt(a->fsuid, b->fsuid))
                return -1;
        if (uid_gt(a->fsuid, b->fsuid))
                return 1;

        if (gid_lt(a->fsgid, b->fsgid))
                return -1;
        if (gid_gt(a->fsgid, b->fsgid))
                return 1;

        ga = a->group_info;
        gb = b->group_info;
        if (ga == gb)
                return 0;
        if (ga == NULL)
                return -1;
        if (gb == NULL)
                return 1;
        if (ga->ngroups < gb->ngroups)
                return -1;
        if (ga->ngroups > gb->ngroups)
                return 1;

        for (g = 0; g < ga->ngroups; g++) {
                if (gid_lt(ga->gid[g], gb->gid[g]))
                        return -1;
                if (gid_gt(ga->gid[g], gb->gid[g]))
                        return 1;
        }
        return 0;
}
EXPORT_SYMBOL(cred_fscmp);

/*
 * Make new->ucounts match new->user_ns and new->uid.
 *
 * Returns 0 on success (including when nothing had to change) or -EAGAIN
 * if alloc_ucounts() fails.
 */
int set_cred_ucounts(struct cred *new)
{
        struct ucounts *previous = new->ucounts;
        struct ucounts *replacement;

        /*
         * Fast path: alloc_ucounts() uses locks for table lookups, so skip
         * it when the current ucounts already match.
         */
        if (previous->ns == new->user_ns && uid_eq(previous->uid, new->uid))
                return 0;

        replacement = alloc_ucounts(new->user_ns, new->uid);
        if (!replacement)
                return -EAGAIN;

        /* switch over and release the reference on the old ucounts */
        new->ucounts = replacement;
        put_ucounts(previous);

        return 0;
}

/*
 * initialise the credentials stuff
 */
void __init cred_init(void)
{
        /* allocate a slab in which we can store credentials */
        /* the result is stored in cred_jar, which the allocators here use */
        cred_jar = KMEM_CACHE(cred,
                              SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}

/**
 * prepare_kernel_cred - Prepare a set of credentials for a kernel service
 * @daemon: A userspace daemon to be used as a reference
 *
 * Prepare a set of credentials for a kernel service.  This can then be used to
 * override a task's own credentials so that work can be done on behalf of that
 * task that requires a different subjective context.
 *
 * @daemon is used to provide a base cred, with the security data derived from
 * that; if this is "&init_task", they'll be set to 0, no groups, full
 * capabilities, and no keys.
 *
 * The caller may change these controls afterwards if desired.
 *
 * Returns the new credentials or NULL if out of memory.
 */
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
        const struct cred *base;
        struct cred *kcred;

        if (WARN_ON_ONCE(!daemon))
                return NULL;

        kcred = kmem_cache_alloc(cred_jar, GFP_KERNEL);
        if (!kcred)
                return NULL;

        kdebug("prepare_kernel_cred() alloc %p", kcred);

        /* pin the daemon's creds while they serve as the template */
        base = get_task_cred(daemon);

        *kcred = *base;
        kcred->non_rcu = 0;
        atomic_long_set(&kcred->usage, 1);

        /* take a reference on each object the copied pointers refer to */
        get_uid(kcred->user);
        get_user_ns(kcred->user_ns);
        get_group_info(kcred->group_info);

#ifdef CONFIG_KEYS
        /* no keys are carried over from the template */
        kcred->session_keyring = NULL;
        kcred->process_keyring = NULL;
        kcred->thread_keyring = NULL;
        kcred->request_key_auth = NULL;
        kcred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
#endif

#ifdef CONFIG_SECURITY
        kcred->security = NULL;
#endif

        kcred->ucounts = get_ucounts(kcred->ucounts);
        if (!kcred->ucounts ||
            security_prepare_creds(kcred, base, GFP_KERNEL_ACCOUNT) < 0) {
                /* failure: discard the partly built cred */
                put_cred(kcred);
                kcred = NULL;
        }

        /* the template reference is released on success and failure alike */
        put_cred(base);
        return kcred;
}
EXPORT_SYMBOL(prepare_kernel_cred);

/**
 * set_security_override - Set the security ID in a set of credentials
 * @new: The credentials to alter
 * @secid: The LSM security ID to set
 *
 * Set the LSM security ID in a set of credentials so that the subjective
 * security is overridden when an alternative set of credentials is used.
 */
int set_security_override(struct cred *new, u32 secid)
{
        /* thin wrapper: the result of security_kernel_act_as() is returned as is */
        return security_kernel_act_as(new, secid);
}
EXPORT_SYMBOL(set_security_override);

/**
 * set_security_override_from_ctx - Set the security ID in a set of credentials
 * @new: The credentials to alter
 * @secctx: The LSM security context to generate the security ID from.
 *
 * Set the LSM security ID in a set of credentials so that the subjective
 * security is overridden when an alternative set of credentials is used.  The
 * security ID is specified in string form as a security context to be
 * interpreted by the LSM.
 */
int set_security_override_from_ctx(struct cred *new, const char *secctx)
{
        u32 secid;
        int ret;

        /* translate the textual context into a security ID first */
        ret = security_secctx_to_secid(secctx, strlen(secctx), &secid);

        /* a translation error is passed back unchanged */
        return ret < 0 ? ret : set_security_override(new, secid);
}
EXPORT_SYMBOL(set_security_override_from_ctx);

/**
 * set_create_files_as - Set the LSM file create context in a set of credentials
 * @new: The credentials to alter
 * @inode: The inode to take the context from
 *
 * Change the LSM file creation context in a set of credentials to be the same
 * as the object context of the specified inode, so that the new inodes have
 * the same MAC context as that inode.
 */
int set_create_files_as(struct cred *new, struct inode *inode)
{
        /* both owner IDs of the inode must be valid */
        if (!(uid_valid(inode->i_uid) && gid_valid(inode->i_gid)))
                return -EINVAL;

        /* take over the inode's owner as the filesystem IDs */
        new->fsuid = inode->i_uid;
        new->fsgid = inode->i_gid;

        return security_kernel_create_files_as(new, inode);
}
EXPORT_SYMBOL(set_create_files_as);
































































































































































































































































































































































































































































































































































































    1 
























































    1 




    1 










    1 
































    1 
    1 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991-1998  Linus Torvalds
 * Re-organised Feb 1998 Russell King
 * Copyright (C) 2020 Christoph Hellwig
 */
#include <linux/fs.h>
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/vmalloc.h>
#include <linux/sysfs.h>
#include <linux/raid/detect.h>
#include "check.h"

/*
 * NULL-terminated table of partition format probe functions.
 * check_partition() calls the entries in the order listed here and stops at
 * the first one that returns a positive value, so the order is significant
 * (see the "must come before" notes below).
 */
static int (*const check_part[])(struct parsed_partitions *) = {
        /*
         * Probe partition formats with tables at disk address 0
         * that also have an ADFS boot block at 0xdc0.
         */
#ifdef CONFIG_ACORN_PARTITION_ICS
        adfspart_check_ICS,
#endif
#ifdef CONFIG_ACORN_PARTITION_POWERTEC
        adfspart_check_POWERTEC,
#endif
#ifdef CONFIG_ACORN_PARTITION_EESOX
        adfspart_check_EESOX,
#endif

        /*
         * Now move on to formats that only have partition info at
         * disk address 0xdc0.  Since these may also have stale
         * PC/BIOS partition tables, they need to come before
         * the msdos entry.
         */
#ifdef CONFIG_ACORN_PARTITION_CUMANA
        adfspart_check_CUMANA,
#endif
#ifdef CONFIG_ACORN_PARTITION_ADFS
        adfspart_check_ADFS,
#endif

#ifdef CONFIG_CMDLINE_PARTITION
        cmdline_partition,
#endif
#ifdef CONFIG_EFI_PARTITION
        efi_partition,                /* this must come before msdos */
#endif
#ifdef CONFIG_SGI_PARTITION
        sgi_partition,
#endif
#ifdef CONFIG_LDM_PARTITION
        ldm_partition,                /* this must come before msdos */
#endif
#ifdef CONFIG_MSDOS_PARTITION
        msdos_partition,
#endif
#ifdef CONFIG_OSF_PARTITION
        osf_partition,
#endif
#ifdef CONFIG_SUN_PARTITION
        sun_partition,
#endif
#ifdef CONFIG_AMIGA_PARTITION
        amiga_partition,
#endif
#ifdef CONFIG_ATARI_PARTITION
        atari_partition,
#endif
#ifdef CONFIG_MAC_PARTITION
        mac_partition,
#endif
#ifdef CONFIG_ULTRIX_PARTITION
        ultrix_partition,
#endif
#ifdef CONFIG_IBM_PARTITION
        ibm_partition,
#endif
#ifdef CONFIG_KARMA_PARTITION
        karma_partition,
#endif
#ifdef CONFIG_SYSV68_PARTITION
        sysv68_partition,
#endif
        /* end-of-table marker tested by the loop in check_partition() */
        NULL
};

/*
 * Allocate a zeroed parsed_partitions state together with a zeroed parts
 * array of DISK_MAX_PARTS entries.  @hd is not used by this function.
 *
 * Returns the new state, or NULL if either allocation fails.
 */
static struct parsed_partitions *allocate_partitions(struct gendisk *hd)
{
        const int nr = DISK_MAX_PARTS;
        struct parsed_partitions *state;

        state = kzalloc(sizeof(*state), GFP_KERNEL);
        if (!state)
                return NULL;

        state->parts = vzalloc(array_size(nr, sizeof(state->parts[0])));
        if (state->parts) {
                state->limit = nr;
                return state;
        }

        /* no parts array: give the state structure back as well */
        kfree(state);
        return NULL;
}

/*
 * Release a state obtained from allocate_partitions(): the parts array
 * first, then the structure itself.
 */
static void free_partitions(struct parsed_partitions *state)
{
        /* ->parts must be read before @state itself is freed */
        vfree(state->parts);
        kfree(state);
}

/*
 * Run the check_part[] probes against @hd.
 *
 * Returns the filled-in state if a probe returned a positive value, NULL if
 * an allocation failed, and otherwise ERR_PTR() of the recorded error (which
 * is 0 when no probe reported an error).
 */
static struct parsed_partitions *check_partition(struct gendisk *hd)
{
        struct parsed_partitions *state;
        int idx, res = 0, err = 0;

        state = allocate_partitions(hd);
        if (!state)
                return NULL;

        /* one page that collects the text printed at the end */
        state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
        if (!state->pp_buf) {
                free_partitions(state);
                return NULL;
        }
        state->pp_buf[0] = '\0';

        state->disk = hd;
        snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name);
        snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
        /* a disk name that ends in a digit is replaced by just "p" */
        if (isdigit(state->name[strlen(state->name)-1]))
                sprintf(state->name, "p");

        for (idx = 0; check_part[idx]; idx++) {
                /* every probe starts from a cleared parts array */
                memset(state->parts, 0, state->limit * sizeof(state->parts[0]));
                res = check_part[idx](state);
                if (res > 0)
                        break;
                if (res < 0) {
                        /*
                         * We have hit an I/O error which we don't report now.
                         * But record it, and let the others do their job.
                         */
                        err = res;
                        res = 0;
                }
        }

        if (res > 0) {
                /* a probe succeeded: print the summary and keep the state */
                printk(KERN_INFO "%s", state->pp_buf);

                free_page((unsigned long)state->pp_buf);
                return state;
        }

        if (state->access_beyond_eod)
                err = -ENOSPC;

        /*
         * The partition is unrecognized. So report I/O errors if there were any
         */
        if (err) {
                strlcat(state->pp_buf,
                        " unable to read partition table\n", PAGE_SIZE);
                printk(KERN_INFO "%s", state->pp_buf);
        }

        free_page((unsigned long)state->pp_buf);
        free_partitions(state);
        return ERR_PTR(err);
}

/*
 * sysfs "partition" attribute: print the partition number of the block
 * device behind @dev, followed by a newline.
 *
 * Returns the number of bytes written to @buf.
 */
static ssize_t part_partition_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
{
        /* sysfs_emit() is the bounded formatter for sysfs show() callbacks */
        return sysfs_emit(buf, "%d\n", bdev_partno(dev_to_bdev(dev)));
}

/*
 * sysfs "start" attribute: print the bd_start_sect value of the block
 * device behind @dev, followed by a newline.
 *
 * Returns the number of bytes written to @buf.
 */
static ssize_t part_start_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        /* sysfs_emit() is the bounded formatter for sysfs show() callbacks */
        return sysfs_emit(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect);
}

/*
 * sysfs "ro" attribute: print the result of bdev_read_only() for the block
 * device behind @dev, followed by a newline.
 *
 * Returns the number of bytes written to @buf.
 */
static ssize_t part_ro_show(struct device *dev,
                            struct device_attribute *attr, char *buf)
{
        /* sysfs_emit() is the bounded formatter for sysfs show() callbacks */
        return sysfs_emit(buf, "%d\n", bdev_read_only(dev_to_bdev(dev)));
}

/*
 * sysfs "alignment_offset" attribute: print the result of
 * bdev_alignment_offset() for the block device behind @dev, followed by a
 * newline.
 *
 * Returns the number of bytes written to @buf.
 */
static ssize_t part_alignment_offset_show(struct device *dev,
                                          struct device_attribute *attr, char *buf)
{
        /* sysfs_emit() is the bounded formatter for sysfs show() callbacks */
        return sysfs_emit(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev)));
}

/*
 * sysfs "discard_alignment" show callback: emits the value returned by
 * bdev_discard_alignment() as "%u\n".
 */
static ssize_t part_discard_alignment_show(struct device *dev,
                                           struct device_attribute *attr, char *buf)
{
        struct block_device *bdev = dev_to_bdev(dev);

        return sprintf(buf, "%u\n", bdev_discard_alignment(bdev));
}

/*
 * Device attributes of a partition.  All of them are read-only (mode 0444,
 * no store callback).  The show callbacks for "size", "stat" and "inflight"
 * are not defined in this part of the file.
 */
static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);
static DEVICE_ATTR(start, 0444, part_start_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
static DEVICE_ATTR(ro, 0444, part_ro_show, NULL);
static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
/*
 * Writable (0644) "make-it-fail" attribute, only built with
 * CONFIG_FAIL_MAKE_REQUEST.  NOTE(review): presumably spelled out with
 * __ATTR() because the dashed name cannot form a C identifier - confirm.
 */
static struct device_attribute dev_attr_fail =
        __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
#endif

/*
 * NULL-terminated list of the partition attributes declared above; referenced
 * by part_attr_group.  The "make-it-fail" entry is only present when
 * CONFIG_FAIL_MAKE_REQUEST is enabled.
 */
static struct attribute *part_attrs[] = {
        &dev_attr_partition.attr,
        &dev_attr_start.attr,
        &dev_attr_size.attr,
        &dev_attr_ro.attr,
        &dev_attr_alignment_offset.attr,
        &dev_attr_discard_alignment.attr,
        &dev_attr_stat.attr,
        &dev_attr_inflight.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
        &dev_attr_fail.attr,
#endif
        NULL
};

/* Attribute group wrapping part_attrs; it sets no group name. */
static const struct attribute_group part_attr_group = {
        .attrs = part_attrs,
};

/*
 * NULL-terminated list of attribute groups used by part_type.  The blk_trace
 * group is only included when CONFIG_BLK_DEV_IO_TRACE is enabled.
 */
static const struct attribute_group *part_attr_groups[] = {
        &part_attr_group,
#ifdef CONFIG_BLK_DEV_IO_TRACE
        &blk_trace_attr_group,
#endif
        NULL
};

static void part_release(struct device *dev)
{
        put_disk(dev_to_bdev(dev)->bd_disk);
        bdev_drop(dev_to_bdev(dev));
}

/*
 * uevent callback of part_type: always adds PARTN=<partition number> and, if
 * meta info with a non-empty volume name is attached, PARTNAME=<volname>.
 * The result of add_uevent_var() is not checked; always returns 0.
 */
static int part_uevent(const struct device *dev, struct kobj_uevent_env *env)
{
        const struct block_device *bdev = dev_to_bdev(dev);

        add_uevent_var(env, "PARTN=%u", bdev_partno(bdev));

        if (!bdev->bd_meta_info || !bdev->bd_meta_info->volname[0])
                return 0;

        add_uevent_var(env, "PARTNAME=%s", bdev->bd_meta_info->volname);
        return 0;
}

/*
 * Device type assigned to every partition device in add_partition(); ties
 * together the sysfs attribute groups and the release and uevent callbacks
 * defined above.
 */
const struct device_type part_type = {
        .name                = "partition",
        .groups                = part_attr_groups,
        .release        = part_release,
        .uevent                = part_uevent,
};

void drop_partition(struct block_device *part)
{
        lockdep_assert_held(&part->bd_disk->open_mutex);

        xa_erase(&part->bd_disk->part_tbl, bdev_partno(part));
        kobject_put(part->bd_holder_dir);

        device_del(&part->bd_device);
        put_device(&part->bd_device);
}

/*
 * Show callback of the "whole_disk" attribute.  It writes nothing to @buf and
 * returns 0; add_partition() creates this file for partitions flagged
 * ADDPART_FLAG_WHOLEDISK.
 */
static ssize_t whole_disk_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        return 0;
}
static const DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL);

/*
 * Create and register the block device for partition @partno of @disk,
 * covering @len sectors starting at sector @start.
 *
 * Must be called either with open_mutex held, before a disk can be opened or
 * after all disk users are gone.
 *
 * @flags: if ADDPART_FLAG_WHOLEDISK is set, the "whole_disk" sysfs file is
 *         created in addition to the regular attributes.
 * @info:  optional meta info; when non-NULL a private copy is stored in
 *         bd_meta_info.
 *
 * Returns the new block device on success, or an ERR_PTR():
 *   -EINVAL  @partno is not below DISK_MAX_PARTS
 *   -ENXIO   bdev_is_zoned() reports the whole disk as zoned
 *   -EBUSY   an entry for @partno already exists in disk->part_tbl
 *   -ENOMEM  bdev_alloc(), kmemdup() or the "holders" kobject failed
 *   or the error returned by blk_alloc_ext_minor(), device_add(),
 *   device_create_file() or xa_insert().
 */
static struct block_device *add_partition(struct gendisk *disk, int partno,
                                sector_t start, sector_t len, int flags,
                                struct partition_meta_info *info)
{
        dev_t devt = MKDEV(0, 0);
        struct device *ddev = disk_to_dev(disk);
        struct device *pdev;
        struct block_device *bdev;
        const char *dname;
        int err;

        lockdep_assert_held(&disk->open_mutex);

        if (partno >= DISK_MAX_PARTS)
                return ERR_PTR(-EINVAL);

        /*
         * Partitions are not supported on zoned block devices that are used as
         * such.
         */
        if (bdev_is_zoned(disk->part0)) {
                pr_warn("%s: partitions not supported on host managed zoned block device\n",
                        disk->disk_name);
                return ERR_PTR(-ENXIO);
        }

        /* refuse to create a partition number that is already in the table */
        if (xa_load(&disk->part_tbl, partno))
                return ERR_PTR(-EBUSY);

        /* ensure we always have a reference to the whole disk */
        get_device(disk_to_dev(disk));

        err = -ENOMEM;
        bdev = bdev_alloc(disk, partno);
        if (!bdev)
                goto out_put_disk;

        bdev->bd_start_sect = start;
        bdev_set_nr_sectors(bdev, len);

        /*
         * Name the partition device after the disk: insert a 'p' separator
         * when the disk name itself ends in a digit.
         */
        pdev = &bdev->bd_device;
        dname = dev_name(ddev);
        if (isdigit(dname[strlen(dname) - 1]))
                dev_set_name(pdev, "%sp%d", dname, partno);
        else
                dev_set_name(pdev, "%s%d", dname, partno);

        device_initialize(pdev);
        pdev->class = &block_class;
        pdev->type = &part_type;
        pdev->parent = ddev;

        /* in consecutive minor range? */
        if (bdev_partno(bdev) < disk->minors) {
                devt = MKDEV(disk->major, disk->first_minor + bdev_partno(bdev));
        } else {
                /* otherwise allocate a minor under BLOCK_EXT_MAJOR */
                err = blk_alloc_ext_minor();
                if (err < 0)
                        goto out_put;
                devt = MKDEV(BLOCK_EXT_MAJOR, err);
        }
        pdev->devt = devt;

        if (info) {
                err = -ENOMEM;
                bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL);
                if (!bdev->bd_meta_info)
                        goto out_put;
        }

        /* delay uevent until 'holders' subdir is created */
        dev_set_uevent_suppress(pdev, 1);
        err = device_add(pdev);
        if (err)
                goto out_put;

        err = -ENOMEM;
        bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj);
        if (!bdev->bd_holder_dir)
                goto out_del;

        dev_set_uevent_suppress(pdev, 0);
        if (flags & ADDPART_FLAG_WHOLEDISK) {
                err = device_create_file(pdev, &dev_attr_whole_disk);
                if (err)
                        goto out_del;
        }

        /* everything is up and running, commence */
        err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL);
        if (err)
                goto out_del;
        bdev_add(bdev, devt);

        /* suppress uevent if the disk suppresses it */
        if (!dev_get_uevent_suppress(ddev))
                kobject_uevent(&pdev->kobj, KOBJ_ADD);
        return bdev;

        /* device_add() succeeded: undo it before dropping the reference */
out_del:
        kobject_put(bdev->bd_holder_dir);
        device_del(pdev);
        /*
         * NOTE(review): presumably the final put_device() ends up in
         * part_release(), which also puts the disk - confirm.
         */
out_put:
        put_device(pdev);
        return ERR_PTR(err);
        /* no bdev was allocated: only the disk reference taken above is held */
out_put_disk:
        put_disk(disk);
        return ERR_PTR(err);
}

/*
 * Check whether the sector range [@start, @start + @length) intersects any
 * partition of @disk (table entries from index 1 onwards), ignoring the
 * partition numbered @skip_partno.  The table is walked under
 * rcu_read_lock().
 *
 * Returns true as soon as one overlapping partition is found.
 */
static bool partition_overlaps(struct gendisk *disk, sector_t start,
                sector_t length, int skip_partno)
{
        struct block_device *cur;
        unsigned long slot;
        bool found = false;

        rcu_read_lock();
        xa_for_each_start(&disk->part_tbl, slot, cur, 1) {
                if (bdev_partno(cur) == skip_partno)
                        continue;
                /* requested range begins at or after the end of this one */
                if (start >= cur->bd_start_sect + bdev_nr_sectors(cur))
                        continue;
                /* requested range ends at or before the start of this one */
                if (start + length <= cur->bd_start_sect)
                        continue;

                found = true;
                break;
        }
        rcu_read_unlock();

        return found;
}

/*
 * Add partition @partno spanning @length sectors from @start to @disk.
 * Takes disk->open_mutex for the duration of the call.
 *
 * Returns 0 on success, -ENXIO if the disk is not live, -EINVAL if the disk
 * has GENHD_FL_NO_PART set, -EBUSY if the range overlaps an existing
 * partition, or the error reported by add_partition().
 */
int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
                sector_t length)
{
        int ret;

        mutex_lock(&disk->open_mutex);
        if (!disk_live(disk)) {
                ret = -ENXIO;
        } else if (disk->flags & GENHD_FL_NO_PART) {
                ret = -EINVAL;
        } else if (partition_overlaps(disk, start, length, -1)) {
                ret = -EBUSY;
        } else {
                struct block_device *part;

                part = add_partition(disk, partno, start, length,
                                ADDPART_FLAG_NONE, NULL);
                ret = PTR_ERR_OR_ZERO(part);
        }
        mutex_unlock(&disk->open_mutex);

        return ret;
}

/*
 * Delete partition @partno of @disk.  Takes disk->open_mutex for the
 * duration of the call.
 *
 * Returns 0 on success, -ENXIO if no such partition exists, or -EBUSY if the
 * partition still has openers.
 */
int bdev_del_partition(struct gendisk *disk, int partno)
{
        struct block_device *part;
        int ret;

        mutex_lock(&disk->open_mutex);

        part = xa_load(&disk->part_tbl, partno);
        if (!part) {
                ret = -ENXIO;
        } else if (atomic_read(&part->bd_openers)) {
                ret = -EBUSY;
        } else {
                /*
                 * @part->bd_openers was just seen to be zero, so
                 * @part->bd_holder{_ops} can't be set, and with
                 * @disk->open_mutex held nobody can claim the device.
                 *
                 * Hence there is no need to call
                 * @part->bd_holder_ops->mark_dead() here; simply invalidate
                 * the partition and delete it.
                 */
                bdev_unhash(part);
                invalidate_bdev(part);
                drop_partition(part);
                ret = 0;
        }

        mutex_unlock(&disk->open_mutex);
        return ret;
}

/*
 * Change the size of partition @partno of @disk to @length sectors.  The
 * start sector cannot be changed: @start must equal the current start.
 * Takes disk->open_mutex for the duration of the call.
 *
 * Returns 0 on success, -ENXIO if no such partition exists, -EINVAL if
 * @start differs from the partition's start sector, or -EBUSY if the new
 * range would overlap another partition.
 */
int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
                sector_t length)
{
        struct block_device *part;
        int ret;

        mutex_lock(&disk->open_mutex);

        part = xa_load(&disk->part_tbl, partno);
        if (!part) {
                ret = -ENXIO;
        } else if (start != part->bd_start_sect) {
                ret = -EINVAL;
        } else if (partition_overlaps(disk, start, length, partno)) {
                ret = -EBUSY;
        } else {
                bdev_set_nr_sectors(part, length);
                ret = 0;
        }

        mutex_unlock(&disk->open_mutex);
        return ret;
}

/*
 * Try to unlock the native capacity of @disk, finishing the log line the
 * caller has started (both messages are KERN_CONT).
 *
 * The driver callback is invoked at most once per disk: the
 * GD_NATIVE_CAPACITY state bit records that it has been tried.  Returns true
 * if the callback was invoked, false if the driver has no such callback or
 * the bit was already set.
 */
static bool disk_unlock_native_capacity(struct gendisk *disk)
{
        if (disk->fops->unlock_native_capacity &&
            !test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) {
                printk(KERN_CONT "enabling native capacity\n");
                disk->fops->unlock_native_capacity(disk);
                return true;
        }

        printk(KERN_CONT "truncated\n");
        return false;
}

/*
 * Add parsed partition @p of @state to @disk.
 *
 * Entries with size zero are skipped.  A partition starting beyond the end
 * of the disk is dropped, and one extending beyond it is truncated to the
 * end of the disk, unless the native capacity could be unlocked.
 *
 * Returns false if the native capacity was unlocked, so that the caller can
 * request a rescan; returns true in every other case, including when the
 * partition could not be added.
 */
static bool blk_add_partition(struct gendisk *disk,
                struct parsed_partitions *state, int p)
{
        sector_t size = state->parts[p].size;
        sector_t from = state->parts[p].from;
        struct block_device *part;

        if (!size)
                return true;

        if (from >= get_capacity(disk)) {
                printk(KERN_WARNING
                       "%s: p%d start %llu is beyond EOD, ",
                       disk->disk_name, p, (unsigned long long) from);
                if (disk_unlock_native_capacity(disk))
                        return false;
                return true;
        }

        /*
         * from < capacity holds here, so the subtraction cannot wrap.
         * Comparing this way (rather than from + size > capacity) also
         * catches bogus table entries whose from + size overflows sector_t.
         */
        if (size > get_capacity(disk) - from) {
                printk(KERN_WARNING
                       "%s: p%d size %llu extends beyond EOD, ",
                       disk->disk_name, p, (unsigned long long) size);

                if (disk_unlock_native_capacity(disk))
                        return false;

                /*
                 * We can not ignore partitions of broken tables created by for
                 * example camera firmware, but we limit them to the end of the
                 * disk to avoid creating invalid block devices.
                 */
                size = get_capacity(disk) - from;
        }

        part = add_partition(disk, p, from, size, state->parts[p].flags,
                             &state->parts[p].info);
        if (IS_ERR(part)) {
                /* -ENXIO is not logged here; any other failure is */
                if (PTR_ERR(part) != -ENXIO)
                        printk(KERN_ERR " %s: p%d could not be added: %pe\n",
                               disk->disk_name, p, part);
                /* never fall through: @part is an error pointer */
                return true;
        }

        if (IS_BUILTIN(CONFIG_BLK_DEV_MD) &&
            (state->parts[p].flags & ADDPART_FLAG_RAID))
                md_autodetect_dev(part->bd_dev);

        return true;
}

/*
 * Read the partition table of @disk and add every partition found.
 *
 * Returns 0 on success or when there is nothing to do (partition scanning
 * not enabled, check_partition() returned NULL, or the disk is zoned), -EIO
 * if check_partition() failed, and -EAGAIN if the native capacity was
 * unlocked and the caller should scan again.
 */
static int blk_add_partitions(struct gendisk *disk)
{
        struct parsed_partitions *state;
        int ret = 0;
        int p;

        if (!disk_has_partscan(disk))
                return 0;

        state = check_partition(disk);
        if (!state)
                return 0;

        if (IS_ERR(state)) {
                /*
                 * I/O error reading the partition table.  If we tried to read
                 * beyond EOD, retry after unlocking the native capacity.
                 */
                if (PTR_ERR(state) != -ENOSPC)
                        return -EIO;

                printk(KERN_WARNING "%s: partition table beyond EOD, ",
                       disk->disk_name);
                return disk_unlock_native_capacity(disk) ? -EAGAIN : -EIO;
        }

        /*
         * Partitions are not supported on host managed zoned block devices.
         */
        if (bdev_is_zoned(disk->part0)) {
                pr_warn("%s: ignoring partition table on host managed zoned block device\n",
                        disk->disk_name);
                goto out_free_state;
        }

        /*
         * If we read beyond EOD, try unlocking native capacity even if the
         * partition table was successfully read as we could be missing some
         * partitions.
         */
        if (state->access_beyond_eod) {
                printk(KERN_WARNING
                       "%s: partition table partially beyond EOD, ",
                       disk->disk_name);
                if (disk_unlock_native_capacity(disk)) {
                        ret = -EAGAIN;
                        goto out_free_state;
                }
        }

        /* tell userspace that the media / partition table may have changed */
        kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);

        for (p = 1; p < state->limit; p++) {
                if (blk_add_partition(disk, state, p))
                        continue;

                /* native capacity was unlocked: ask for a rescan */
                ret = -EAGAIN;
                break;
        }

out_free_state:
        free_partitions(state);
        return ret;
}

/*
 * Drop all partitions of @disk and, if the disk still has a non-zero
 * capacity, scan the partition table again.
 *
 * @invalidate: when true, the capacity may first be reset to zero (see the
 *              comment in the body), and a KOBJ_CHANGE uevent is sent if the
 *              capacity is zero afterwards.
 *
 * The caller must hold disk->open_mutex (asserted via lockdep).
 *
 * Returns 0 on success, -ENXIO if the disk is not live, -EBUSY if any
 * partition is open, or the error from blk_add_partitions().  -EAGAIN from
 * blk_add_partitions() is never returned; it restarts the scan instead.
 */
int bdev_disk_changed(struct gendisk *disk, bool invalidate)
{
        struct block_device *part;
        unsigned long idx;
        int ret = 0;

        lockdep_assert_held(&disk->open_mutex);

        if (!disk_live(disk))
                return -ENXIO;

rescan:
        if (disk->open_partitions)
                return -EBUSY;
        sync_blockdev(disk->part0);
        invalidate_bdev(disk->part0);

        /* tear down every existing partition (table entries from index 1) */
        xa_for_each_start(&disk->part_tbl, idx, part, 1) {
                /*
                 * Remove the block device from the inode hash, so that
                 * it cannot be looked up any more even when openers
                 * still hold references.
                 */
                bdev_unhash(part);

                /*
                 * If @disk->open_partitions isn't elevated but there's
                 * still an active holder of that block device things
                 * are broken.
                 */
                WARN_ON_ONCE(atomic_read(&part->bd_openers));
                invalidate_bdev(part);
                drop_partition(part);
        }
        clear_bit(GD_NEED_PART_SCAN, &disk->state);

        /*
         * Historically we only set the capacity to zero for devices that
         * support partitions (independent of actually having partitions
         * created).  Doing that is rather inconsistent, but changing it broke
         * legacy udisks polling for legacy ide-cdrom devices.  Use the crude
         * check below to get the sane behavior for most devices while not
         * breaking userspace for this particular setup.
         */
        if (invalidate) {
                if (!(disk->flags & GENHD_FL_NO_PART) ||
                    !(disk->flags & GENHD_FL_REMOVABLE))
                        set_capacity(disk, 0);
        }

        if (get_capacity(disk)) {
                ret = blk_add_partitions(disk);
                /* -EAGAIN: start over from the rescan label */
                if (ret == -EAGAIN)
                        goto rescan;
        } else if (invalidate) {
                /*
                 * Tell userspace that the media / partition table may have
                 * changed.
                 */
                kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
        }

        return ret;
}
/*
 * Only exported for loop and dasd for historic reasons.  Don't use in new
 * code!
 */
EXPORT_SYMBOL_GPL(bdev_disk_changed);

/*
 * Read sector @n of the disk described by @state through the mapping of the
 * whole-disk block device.
 *
 * On success the folio is stored in @p->v and a pointer to the sector's data
 * inside it is returned.  On failure @p->v is set to NULL and NULL is
 * returned; if @n lies at or beyond the disk capacity,
 * state->access_beyond_eod is set as well.
 */
void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p)
{
        struct address_space *mapping = state->disk->part0->bd_mapping;
        struct folio *folio;

        if (n >= get_capacity(state->disk)) {
                state->access_beyond_eod = true;
                p->v = NULL;
                return NULL;
        }

        folio = read_mapping_folio(mapping, n >> PAGE_SECTORS_SHIFT, NULL);
        if (IS_ERR(folio)) {
                p->v = NULL;
                return NULL;
        }

        p->v = folio;
        return folio_address(folio) + offset_in_folio(folio, n * SECTOR_SIZE);
}






















































































































































































































































































































































































































































































    4 

    1 














    3 











































































































    2 









    2 








    1 













    4 










    4 












    2 














    4 









    2 
    4 













    4 




    4 





    4 























































































    1 




    1 

    1 















    2 



    1 



    2 



    1 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 














































































    1 






    1 

















































































































































































































































































































































































    3 
    3 





































































































































































    2 





























































    4 




    2 
    2 



    2 

    2 










    2 






    4 
    4 









    4 



    2 
    4 


    4 
























    1 



    1 











    4 







    1 
    1 


    4 











































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/journal.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Generic filesystem journal-writing code; part of the ext2fs
 * journaling system.
 *
 * This file manages journals: areas of disk reserved for logging
 * transactional updates.  This includes the kernel journaling thread
 * which is responsible for scheduling updates to the log.
 *
 * We do not actually manage the physical storage of the journal in this
 * file: that is left to a per-journal policy function, which allows us
 * to store the journal within a filesystem-specified area for ext2
 * journaling (ext2 can use a reserved inode for storing the log).
 */

#include <linux/module.h>
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/freezer.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/poison.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/math64.h>
#include <linux/hash.h>
#include <linux/log2.h>
#include <linux/vmalloc.h>
#include <linux/backing-dev.h>
#include <linux/bitops.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>

#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>

#include <linux/uaccess.h>
#include <asm/page.h>

#ifdef CONFIG_JBD2_DEBUG
static ushort jbd2_journal_enable_debug __read_mostly;

module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
#endif

/* Transaction/handle level entry points exported to other modules. */
EXPORT_SYMBOL(jbd2_journal_extend);
EXPORT_SYMBOL(jbd2_journal_stop);
EXPORT_SYMBOL(jbd2_journal_lock_updates);
EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
EXPORT_SYMBOL(jbd2_journal_set_triggers);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_forget);
EXPORT_SYMBOL(jbd2_journal_flush);
EXPORT_SYMBOL(jbd2_journal_revoke);

/* Journal level entry points and the jbd2 inode cache, also exported. */
EXPORT_SYMBOL(jbd2_journal_init_dev);
EXPORT_SYMBOL(jbd2_journal_init_inode);
EXPORT_SYMBOL(jbd2_journal_check_used_features);
EXPORT_SYMBOL(jbd2_journal_check_available_features);
EXPORT_SYMBOL(jbd2_journal_set_features);
EXPORT_SYMBOL(jbd2_journal_load);
EXPORT_SYMBOL(jbd2_journal_destroy);
EXPORT_SYMBOL(jbd2_journal_abort);
EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err);
EXPORT_SYMBOL(jbd2_journal_clear_err);
EXPORT_SYMBOL(jbd2_log_wait_commit);
EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidate_folio);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers);
EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
EXPORT_SYMBOL(jbd2_inode_cache);

/* Forward declaration of a file-local helper. */
static int jbd2_journal_create_slab(size_t slab_size);

#ifdef CONFIG_JBD2_DEBUG
/*
 * Print one KERN_DEBUG message prefixed with the caller's file, function and
 * line number.  Messages whose @level is above the jbd2_debug module
 * parameter are silently discarded.
 */
void __jbd2_debug(int level, const char *file, const char *func,
                  unsigned int line, const char *fmt, ...)
{
        if (level <= jbd2_journal_enable_debug) {
                struct va_format vaf;
                va_list ap;

                va_start(ap, fmt);
                /* %pV expands a format string plus its va_list in place. */
                vaf.fmt = fmt;
                vaf.va = &ap;
                printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
                va_end(ap);
        }
}
#endif

/* Checksumming functions */
static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
{
        /*
         * The checksum is computed over the whole superblock with the
         * s_checksum field reading as zero, so stash the stored value, clear
         * the field for the computation and put it back afterwards.
         */
        const __be32 saved = sb->s_checksum;
        __u32 crc;

        sb->s_checksum = 0;
        crc = jbd2_chksum(j, ~0, (char *)sb, sizeof(*sb));
        sb->s_checksum = saved;

        /* Hand the result back in big-endian form. */
        return cpu_to_be32(crc);
}

/*
 * Commit timer callback.  The timer is embedded in the journal as
 * j_commit_timer, so recover the owning journal from it and wake the task
 * recorded in j_task.
 */
static void commit_timeout(struct timer_list *t)
{
        journal_t *owner = from_timer(owner, t, j_commit_timer);

        wake_up_process(owner->j_task);
}

/*
 * kjournald2: The main thread function used to manage a logging device
 * journal.
 *
 * This kernel thread is responsible for two things:
 *
 * 1) COMMIT:  Every so often we need to commit the current state of the
 *    filesystem to disk.  The journal thread is responsible for writing
 *    all of the metadata buffers to disk. If a fast commit is ongoing
 *    journal thread waits until it's done and then continues from
 *    there on.
 *
 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
 *    of the data in that part of the log has been rewritten elsewhere on
 *    the disk.  Flushing these old buffers to reclaim space in the log is
 *    known as checkpointing, and this thread is responsible for that job.
 */

/*
 * Thread body.  @arg is the journal_t this thread serves.  The function loops
 * until JBD2_UNMOUNT is set in j_flags and then returns 0.  j_state_lock is
 * held for writing at the "loop" label and is dropped around every call that
 * commits, freezes or sleeps.
 */
static int kjournald2(void *arg)
{
        journal_t *journal = arg;
        transaction_t *transaction;

        /*
         * Set up an interval timer which can be used to trigger a commit wakeup
         * after the commit interval expires
         */
        timer_setup(&journal->j_commit_timer, commit_timeout, 0);

        set_freezable();

        /*
         * Record that the journal thread is running.  The wake-up releases
         * jbd2_journal_start_thread(), which waits on j_wait_done_commit for
         * j_task to become non-NULL.
         */
        journal->j_task = current;
        wake_up(&journal->j_wait_done_commit);

        /*
         * Make sure that no allocations from this kernel thread will ever
         * recurse to the fs layer because we are responsible for the
         * transaction commit and any fs involvement might get stuck waiting for
         * the transaction commit.
         */
        memalloc_nofs_save();

        /*
         * And now, wait forever for commit wakeup events.
         */
        write_lock(&journal->j_state_lock);

loop:
        /* Unmount requested: leave the loop with j_state_lock still held. */
        if (journal->j_flags & JBD2_UNMOUNT)
                goto end_loop;

        jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n",
                journal->j_commit_sequence, journal->j_commit_request);

        /* A commit has been requested that has not been performed yet. */
        if (journal->j_commit_sequence != journal->j_commit_request) {
                jbd2_debug(1, "OK, requests differ\n");
                write_unlock(&journal->j_state_lock);
                del_timer_sync(&journal->j_commit_timer);
                jbd2_journal_commit_transaction(journal);
                write_lock(&journal->j_state_lock);
                goto loop;
        }

        /* Nothing pending: let anyone waiting for a commit re-check. */
        wake_up(&journal->j_wait_done_commit);
        if (freezing(current)) {
                /*
                 * The simpler the better. Flushing journal isn't a
                 * good idea, because that depends on threads that may
                 * be already stopped.
                 */
                jbd2_debug(1, "Now suspending kjournald2\n");
                write_unlock(&journal->j_state_lock);
                try_to_freeze();
                write_lock(&journal->j_state_lock);
        } else {
                /*
                 * We assume on resume that commits are already there,
                 * so we don't sleep
                 */
                DEFINE_WAIT(wait);
                int should_sleep = 1;

                /*
                 * Queue ourselves on j_wait_commit first and only then
                 * re-check the three wake-up conditions, all still under
                 * j_state_lock.
                 */
                prepare_to_wait(&journal->j_wait_commit, &wait,
                                TASK_INTERRUPTIBLE);
                if (journal->j_commit_sequence != journal->j_commit_request)
                        should_sleep = 0;
                transaction = journal->j_running_transaction;
                if (transaction && time_after_eq(jiffies,
                                                transaction->t_expires))
                        should_sleep = 0;
                if (journal->j_flags & JBD2_UNMOUNT)
                        should_sleep = 0;
                if (should_sleep) {
                        write_unlock(&journal->j_state_lock);
                        schedule();
                        write_lock(&journal->j_state_lock);
                }
                finish_wait(&journal->j_wait_commit, &wait);
        }

        jbd2_debug(1, "kjournald2 wakes\n");

        /*
         * Were we woken up by a commit wakeup event?  If the running
         * transaction has reached t_expires, request its commit so the next
         * pass through the loop performs it.
         */
        transaction = journal->j_running_transaction;
        if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
                journal->j_commit_request = transaction->t_tid;
                jbd2_debug(1, "woke because of timeout\n");
        }
        goto loop;

end_loop:
        /*
         * Clearing j_task and waking j_wait_done_commit releases
         * journal_kill_thread(), which waits for j_task to become NULL.
         */
        del_timer_sync(&journal->j_commit_timer);
        journal->j_task = NULL;
        wake_up(&journal->j_wait_done_commit);
        jbd2_debug(1, "Journal thread exiting.\n");
        write_unlock(&journal->j_state_lock);
        return 0;
}

/*
 * Spawn the "jbd2/<devname>" commit thread for @journal and wait until it has
 * announced itself.  Returns 0 on success or the error encoded in the pointer
 * returned by kthread_run().
 */
static int jbd2_journal_start_thread(journal_t *journal)
{
        struct task_struct *worker =
                kthread_run(kjournald2, journal, "jbd2/%s",
                            journal->j_devname);

        if (IS_ERR(worker))
                return PTR_ERR(worker);

        /* kjournald2() stores itself in j_task and then wakes this queue. */
        wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
        return 0;
}

/*
 * Ask the commit thread to exit by setting JBD2_UNMOUNT, then keep waking it
 * and waiting until it has cleared j_task.
 */
static void journal_kill_thread(journal_t *journal)
{
        write_lock(&journal->j_state_lock);
        journal->j_flags |= JBD2_UNMOUNT;

        for (;;) {
                if (!journal->j_task)
                        break;

                /* The lock must be dropped before waking and sleeping. */
                write_unlock(&journal->j_state_lock);
                wake_up(&journal->j_wait_commit);
                wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
                write_lock(&journal->j_state_lock);
        }
        write_unlock(&journal->j_state_lock);
}

/*
 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
 *
 * Writes a metadata buffer to a given disk block.  The actual IO is not
 * performed but a new buffer_head is constructed which labels the data
 * to be written with the correct destination disk block.
 *
 * Any magic-number escaping which needs to be done will cause a
 * copy-out here.  If the buffer happens to start with the
 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
 * magic number is only written to the log for descriptor blocks.  In
 * this case, we copy the data and replace the first word with 0, and we
 * return a result code which indicates that this buffer needs to be
 * marked as an escaped buffer in the corresponding log descriptor
 * block.  The missing word can then be restored when the block is read
 * during recovery.
 *
 * If the source buffer has already been modified by a new transaction
 * since we took the last commit snapshot, we use the frozen copy of
 * that data for IO. If we end up using the existing buffer_head's data
 * for the write, then we have to make sure nobody modifies it while the
 * IO is in progress. do_get_write_access() handles this.
 *
 * The buffer_head to be used for IO is handed back through @bh_out.
 *
 *
 * Return value:
 *  <0: Error
 * >=0: Finished OK
 *
 * On success:
 * Bit 0 set == escape performed on the data
 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
 */

/*
 * @transaction: committing transaction the buffer is filed on
 * @jh_in:       journal head of the metadata buffer to be logged
 * @bh_out:      receives the newly built buffer_head that carries the IO
 * @blocknr:     destination block for the new buffer_head
 *
 * Returns -ENOMEM if the copy-out buffer cannot be allocated; otherwise a
 * non-negative value with bit 0 set when the data was escaped and bit 1 set
 * when frozen (copied-out) data is used for the IO.
 */
int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
                                  struct journal_head  *jh_in,
                                  struct buffer_head **bh_out,
                                  sector_t blocknr)
{
        int need_copy_out = 0;
        int done_copy_out = 0;
        int do_escape = 0;
        char *mapped_data;
        struct buffer_head *new_bh;
        struct folio *new_folio;
        unsigned int new_offset;
        struct buffer_head *bh_in = jh2bh(jh_in);
        journal_t *journal = transaction->t_journal;

        /*
         * The buffer really shouldn't be locked: only the current committing
         * transaction is allowed to write it, so nobody else is allowed
         * to do any IO.
         *
         * akpm: except if we're journalling data, and write() output is
         * also part of a shared mapping, and another thread has
         * decided to launch a writepage() against this buffer.
         */
        J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));

        new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);

        /* keep subsequent assertions sane */
        atomic_set(&new_bh->b_count, 1);

        spin_lock(&jh_in->b_state_lock);
repeat:
        /*
         * If a new transaction has already done a buffer copy-out, then
         * we use that version of the data for the commit.
         */
        if (jh_in->b_frozen_data) {
                done_copy_out = 1;
                new_folio = virt_to_folio(jh_in->b_frozen_data);
                new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
        } else {
                new_folio = jh2bh(jh_in)->b_folio;
                new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data);
        }

        mapped_data = kmap_local_folio(new_folio, new_offset);
        /*
         * Fire data frozen trigger if data already wasn't frozen.  Do this
         * before checking for escaping, as the trigger may modify the magic
         * offset.  If a copy-out happens afterwards, it will have the correct
         * data in the buffer.
         */
        if (!done_copy_out)
                jbd2_buffer_frozen_trigger(jh_in, mapped_data,
                                           jh_in->b_triggers);

        /*
         * Check for escaping: a block that begins with the journal magic
         * number must not be written to the log as it stands.
         */
        if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) {
                need_copy_out = 1;
                do_escape = 1;
        }
        kunmap_local(mapped_data);

        /*
         * Do we need to do a data copy?
         */
        if (need_copy_out && !done_copy_out) {
                char *tmp;

                /* Drop the state lock around the allocation. */
                spin_unlock(&jh_in->b_state_lock);
                tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
                if (!tmp) {
                        /*
                         * brelse() only drops the reference taken above; it
                         * does not give the head back to the allocator.
                         * Free it explicitly, otherwise it is leaked.
                         */
                        brelse(new_bh);
                        free_buffer_head(new_bh);
                        return -ENOMEM;
                }
                spin_lock(&jh_in->b_state_lock);
                /*
                 * Somebody else installed frozen data while the lock was
                 * dropped: discard our buffer and start over with theirs.
                 */
                if (jh_in->b_frozen_data) {
                        jbd2_free(tmp, bh_in->b_size);
                        goto repeat;
                }

                jh_in->b_frozen_data = tmp;
                memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size);

                new_folio = virt_to_folio(tmp);
                new_offset = offset_in_folio(new_folio, tmp);
                done_copy_out = 1;

                /*
                 * This isn't strictly necessary, as we're using frozen
                 * data for the escaping, but it keeps consistency with
                 * b_frozen_data usage.
                 */
                jh_in->b_frozen_triggers = jh_in->b_triggers;
        }

        /*
         * Did we need to do an escaping?  Now we've done all the
         * copying, we can finally do so.
         */
        if (do_escape) {
                mapped_data = kmap_local_folio(new_folio, new_offset);
                *((unsigned int *)mapped_data) = 0;
                kunmap_local(mapped_data);
        }

        /* Point the new head at the chosen data and label its target. */
        folio_set_bh(new_bh, new_folio, new_offset);
        new_bh->b_size = bh_in->b_size;
        new_bh->b_bdev = journal->j_dev;
        new_bh->b_blocknr = blocknr;
        new_bh->b_private = bh_in;
        set_buffer_mapped(new_bh);
        set_buffer_dirty(new_bh);

        *bh_out = new_bh;

        /*
         * The to-be-written buffer needs to get moved to the io queue,
         * and the original buffer whose contents we are shadowing or
         * copying is moved to the transaction's shadow queue.
         */
        JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
        spin_unlock(&journal->j_list_lock);
        set_buffer_shadow(bh_in);
        spin_unlock(&jh_in->b_state_lock);

        return do_escape | (done_copy_out << 1);
}

/*
 * Allocation code for the journal file.  Manage the space left in the
 * journal, so that we can begin checkpointing when appropriate.
 */

/*
 * Request a commit of transaction @target and wake the commit thread.
 * Must be called with j_state_lock held for writing.
 * Returns 1 if a commit was newly requested, 0 otherwise.
 */
static int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
        transaction_t *running = journal->j_running_transaction;

        /* This tid has already been requested: nothing more to do. */
        if (journal->j_commit_request == target)
                return 0;

        /*
         * Only the running transaction can be the subject of a new request;
         * any other tid has to be an old one.
         */
        if (running && running->t_tid == target) {
                /*
                 * Record the request and wake the commit thread; the commit
                 * itself is not performed here.
                 */
                journal->j_commit_request = target;
                jbd2_debug(1, "JBD2: requesting commit %u/%u\n",
                          journal->j_commit_request,
                          journal->j_commit_sequence);
                running->t_requested = jiffies;
                wake_up(&journal->j_wait_commit);
                return 1;
        }

        if (!tid_geq(journal->j_commit_request, target)) {
                /*
                 * Should never happen; if it does, keep the evidence before
                 * kjournald loops and pushes j_commit_sequence out of sight.
                 */
                WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
                          journal->j_commit_request,
                          journal->j_commit_sequence,
                          target, running ? running->t_tid : 0);
        }
        return 0;
}

/*
 * Locked wrapper around __jbd2_log_start_commit(): takes j_state_lock for
 * writing, requests the commit of @tid and passes the helper's result back.
 */
int jbd2_log_start_commit(journal_t *journal, tid_t tid)
{
        int started;

        write_lock(&journal->j_state_lock);
        started = __jbd2_log_start_commit(journal, tid);
        write_unlock(&journal->j_state_lock);

        return started;
}

/*
 * Force out and wait for any uncommitted transaction.  The running
 * transaction is only forced when the caller holds no handle
 * (current->journal_info is NULL), since forcing it otherwise would deadlock;
 * in that case the committing transaction, if any, is waited for instead.
 *
 * Returns: <0 on error,
 *           0 if there was nothing to commit,
 *           1 if a transaction was committed successfully.
 */
static int __jbd2_journal_force_commit(journal_t *journal)
{
        transaction_t *txn;
        bool kick = false;
        tid_t tid;
        int err;

        read_lock(&journal->j_state_lock);
        if (journal->j_running_transaction && !current->journal_info) {
                txn = journal->j_running_transaction;
                /* Only request the commit if nobody has done so already. */
                kick = !tid_geq(journal->j_commit_request, txn->t_tid);
        } else {
                txn = journal->j_committing_transaction;
        }

        if (!txn) {
                /* Nothing to commit */
                read_unlock(&journal->j_state_lock);
                return 0;
        }
        tid = txn->t_tid;
        read_unlock(&journal->j_state_lock);

        if (kick)
                jbd2_log_start_commit(journal, tid);

        err = jbd2_log_wait_commit(journal, tid);
        return err ? err : 1;
}

/**
 * jbd2_journal_force_commit_nested - Force and wait upon a commit if the
 * calling process is not within a transaction.
 *
 * @journal: journal to force
 *
 * Used to force out undo-protected data which contains bitmaps when the
 * filesystem is running out of space.
 *
 * Returns true if progress was made, i.e. a transaction was committed.
 */
int jbd2_journal_force_commit_nested(journal_t *journal)
{
        return __jbd2_journal_force_commit(journal) > 0;
}

/**
 * jbd2_journal_force_commit() - force any uncommitted transactions
 * @journal: journal to force
 *
 * The caller wants an unconditional commit and therefore must not hold an
 * active handle (this is asserted), since forcing the running transaction
 * from inside a handle would deadlock.
 *
 * Returns 0 when there was nothing to commit or the commit succeeded, or a
 * negative error code.
 */
int jbd2_journal_force_commit(journal_t *journal)
{
        int ret;

        J_ASSERT(!current->journal_info);
        ret = __jbd2_journal_force_commit(journal);

        /* The helper reports success as 1; callers here expect 0. */
        return ret > 0 ? 0 : ret;
}

/*
 * Start a commit of the running transaction, if there is one.  Returns true
 * when a transaction is going to be committed or is already committing, and
 * stores its tid in *@ptid when @ptid is non-NULL.
 */
int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
{
        transaction_t *txn;
        int found = 0;

        write_lock(&journal->j_state_lock);
        txn = journal->j_running_transaction;
        if (txn) {
                /* Make sure the running transaction's commit is scheduled. */
                __jbd2_log_start_commit(journal, txn->t_tid);
        } else {
                /*
                 * No running transaction: one that has already started
                 * committing still has to be waited for by the caller.
                 */
                txn = journal->j_committing_transaction;
        }

        if (txn) {
                found = 1;
                if (ptid)
                        *ptid = txn->t_tid;
        }
        write_unlock(&journal->j_state_lock);

        return found;
}

/*
 * Return 1 if transaction @tid has not yet sent the barrier request that
 * goes with its commit.  A return of 0 means it may or may not have been
 * sent.  Used to avoid sending the barrier twice in common cases.
 */
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
{
        transaction_t *committing;
        int will_send = 0;

        if (!(journal->j_flags & JBD2_BARRIER))
                return 0;

        read_lock(&journal->j_state_lock);
        /* Transaction already committed? */
        if (tid_geq(journal->j_commit_sequence, tid))
                goto unlock;

        committing = journal->j_committing_transaction;
        if (!committing || committing->t_tid != tid) {
                /* Not committing yet, so the barrier is still to come. */
                will_send = 1;
        } else if (journal->j_fs_dev != journal->j_dev) {
                /*
                 * Separate journal device: a flush of the fs device is still
                 * to come only if one is needed and the commit has not yet
                 * reached the data flush state.
                 */
                will_send = committing->t_need_data_flush &&
                            committing->t_state < T_COMMIT_DFLUSH;
        } else {
                /* Same device: check against the journal flush state. */
                will_send = committing->t_state < T_COMMIT_JFLUSH;
        }
unlock:
        read_unlock(&journal->j_state_lock);
        return will_send;
}
EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);

/*
 * Block until the commit of transaction @tid has completed.
 * The caller may not hold the journal lock.
 * Returns 0, or -EIO if the journal is found aborted afterwards.
 */
int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
{
        read_lock(&journal->j_state_lock);
#ifdef CONFIG_PROVE_LOCKING
        /*
         * Callers that have made sure the transaction is already committing
         * can no longer block on open handles, so skip the lockdep
         * annotation for them.
         */
        if (tid_gt(tid, journal->j_commit_sequence) &&
            !(journal->j_committing_transaction &&
              journal->j_committing_transaction->t_tid == tid)) {
                read_unlock(&journal->j_state_lock);
                jbd2_might_wait_for_commit(journal);
                read_lock(&journal->j_state_lock);
        }
#endif
#ifdef CONFIG_JBD2_DEBUG
        /* Waiting for a tid nobody has requested yet is reported. */
        if (!tid_geq(journal->j_commit_request, tid)) {
                printk(KERN_ERR
                       "%s: error: j_commit_request=%u, tid=%u\n",
                       __func__, journal->j_commit_request, tid);
        }
#endif
        while (tid_gt(tid, journal->j_commit_sequence)) {
                jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
                                  tid, journal->j_commit_sequence);
                /* Drop the lock, prod the commit thread and sleep. */
                read_unlock(&journal->j_state_lock);
                wake_up(&journal->j_wait_commit);
                wait_event(journal->j_wait_done_commit,
                                !tid_gt(tid, journal->j_commit_sequence));
                read_lock(&journal->j_state_lock);
        }
        read_unlock(&journal->j_state_lock);

        return unlikely(is_journal_aborted(journal)) ? -EIO : 0;
}

/*
 * Start a fast commit for transaction @tid.
 *
 * Returns:
 *   0         a new fast commit was started; JBD2_FAST_COMMIT_ONGOING is set
 *             and journal updates are locked.
 *   -EIO      the journal has been aborted.
 *   -EINVAL   no full jbd2 commit has been performed yet.
 *   -EALREADY a fast commit is not needed: either @tid has already been
 *             committed, or a fast or full commit was ongoing.  In the
 *             latter case the caller sleeps on j_fc_wait until woken before
 *             this value is returned.
 */
int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
{
        if (unlikely(is_journal_aborted(journal)))
                return -EIO;
        /*
         * Fast commits only allowed if at least one full commit has
         * been processed.
         */
        if (!journal->j_stats.ts_tid)
                return -EINVAL;

        write_lock(&journal->j_state_lock);
        /*
         * Transaction ids wrap around, so they must be compared with
         * tid_geq() like everywhere else in this file; a plain "<=" gives
         * the wrong answer once the sequence number wraps.
         */
        if (tid_geq(journal->j_commit_sequence, tid)) {
                write_unlock(&journal->j_state_lock);
                return -EALREADY;
        }

        if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
            (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) {
                DEFINE_WAIT(wait);

                /* Queue on j_fc_wait before dropping the lock, then sleep. */
                prepare_to_wait(&journal->j_fc_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                write_unlock(&journal->j_state_lock);
                schedule();
                finish_wait(&journal->j_fc_wait, &wait);
                return -EALREADY;
        }
        journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
        write_unlock(&journal->j_state_lock);
        jbd2_journal_lock_updates(journal);

        return 0;
}
EXPORT_SYMBOL(jbd2_fc_begin_commit);

/*
 * Finish a fast commit: unlock updates, run the cleanup callback if one is
 * registered, clear JBD2_FAST_COMMIT_ONGOING and wake waiters on j_fc_wait.
 * With @fallback set, JBD2_FULL_COMMIT_ONGOING is raised under the same lock
 * hold and transaction @tid is then completed, so that it is committed
 * before any other fast commit can start.
 */
static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{
        jbd2_journal_unlock_updates(journal);
        if (journal->j_fc_cleanup_callback)
                journal->j_fc_cleanup_callback(journal, 0, tid);

        write_lock(&journal->j_state_lock);
        journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
        if (fallback)
                journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
        write_unlock(&journal->j_state_lock);

        wake_up(&journal->j_fc_wait);

        if (!fallback)
                return 0;
        return jbd2_complete_transaction(journal, tid);
}

/* End a fast commit without falling back to a full commit (tid 0 is passed). */
int jbd2_fc_end_commit(journal_t *journal)
{
        const bool fallback = false;

        return __jbd2_fc_end_commit(journal, 0, fallback);
}
EXPORT_SYMBOL(jbd2_fc_end_commit);

/*
 * End a fast commit and fall back to a full commit of the transaction that
 * is running at the time of the call (tid 0 if there is none).
 */
int jbd2_fc_end_commit_fallback(journal_t *journal)
{
        transaction_t *running;
        tid_t tid = 0;

        read_lock(&journal->j_state_lock);
        running = journal->j_running_transaction;
        if (running)
                tid = running->t_tid;
        read_unlock(&journal->j_state_lock);

        return __jbd2_fc_end_commit(journal, tid, true);
}
EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);

/*
 * Return 1 when the transaction with the given tid has already committed,
 * i.e. it is neither the running nor the committing transaction; return 0
 * otherwise.
 */
int jbd2_transaction_committed(journal_t *journal, tid_t tid)
{
        const transaction_t *running, *committing;
        bool in_flight;

        read_lock(&journal->j_state_lock);
        running = journal->j_running_transaction;
        committing = journal->j_committing_transaction;
        in_flight = (running && running->t_tid == tid) ||
                    (committing && committing->t_tid == tid);
        read_unlock(&journal->j_state_lock);

        return !in_flight;
}
EXPORT_SYMBOL(jbd2_transaction_committed);

/*
 * When this function returns, the transaction corresponding to @tid will be
 * completed.  If that transaction is currently running, its commit is
 * requested (unless it already has been) before waiting for it.  A tid that
 * belongs to neither the running nor the committing transaction is stale and
 * therefore by definition already completed, so 0 is returned at once.
 */
int jbd2_complete_transaction(journal_t *journal, tid_t tid)
{
        bool is_running, is_committing, must_request;

        read_lock(&journal->j_state_lock);
        is_running = journal->j_running_transaction &&
                     journal->j_running_transaction->t_tid == tid;
        is_committing = !is_running &&
                        journal->j_committing_transaction &&
                        journal->j_committing_transaction->t_tid == tid;
        /* Running, but nobody has asked for its commit yet. */
        must_request = is_running && journal->j_commit_request != tid;
        read_unlock(&journal->j_state_lock);

        if (!is_running && !is_committing)
                return 0;

        if (must_request)
                jbd2_log_start_commit(journal, tid);

        return jbd2_log_wait_commit(journal, tid);
}
EXPORT_SYMBOL(jbd2_complete_transaction);

/*
 * Log buffer allocation routines:
 */

/*
 * Take the block at the current log head, advance the head (wrapping from
 * j_last back to j_first) and charge one block against j_free, all under
 * the j_state_lock write lock.  The logical block is then translated with
 * jbd2_journal_bmap(), whose result is returned; the mapped block number
 * is stored in @retp.
 */
int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
{
        unsigned long log_block;

        write_lock(&journal->j_state_lock);
        J_ASSERT(journal->j_free > 1);

        log_block = journal->j_head++;
        journal->j_free--;
        if (journal->j_head == journal->j_last)
                journal->j_head = journal->j_first;
        write_unlock(&journal->j_state_lock);

        return jbd2_journal_bmap(journal, log_block, retp);
}

/*
 * Map one fast commit buffer for use by the file system.
 *
 * On success the buffer head is stored in both @bh_out and the journal's
 * j_fc_wbuf[] slot for this block, and 0 is returned.  @bh_out is set to
 * NULL on every failure path: -EINVAL when the fast commit area has no
 * block left, the jbd2_journal_bmap() error when mapping fails, or -ENOMEM
 * when __getblk() returns no buffer.
 */
int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
{
        struct buffer_head *fc_bh;
        unsigned long long phys;
        unsigned long log_block;
        int slot;
        int err;

        *bh_out = NULL;

        /* No fast commit block left to hand out. */
        if (!(journal->j_fc_off + journal->j_fc_first < journal->j_fc_last))
                return -EINVAL;

        /* The slot is consumed here, before mapping and allocation. */
        slot = journal->j_fc_off;
        log_block = journal->j_fc_first + slot;
        journal->j_fc_off++;

        err = jbd2_journal_bmap(journal, log_block, &phys);
        if (err)
                return err;

        fc_bh = __getblk(journal->j_dev, phys, journal->j_blocksize);
        if (!fc_bh)
                return -ENOMEM;

        journal->j_fc_wbuf[slot] = fc_bh;
        *bh_out = fc_bh;

        return 0;
}
EXPORT_SYMBOL(jbd2_fc_get_buf);

/*
 * Wait for I/O completion on the @num_blks most recently handed out fast
 * commit buffers (those allocated by jbd2_fc_get_buf), dropping the
 * reference on each buffer that completed successfully.
 *
 * Returns 0 when every buffer is uptodate after the wait, or -EIO at the
 * first one that is not.
 */
int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
{
        int end = journal->j_fc_off;
        int first = end - num_blks;
        int idx;

        /*
         * Wait in reverse order to minimize chances of us being woken up
         * before all IOs have completed.
         */
        for (idx = end - 1; idx >= first; idx--) {
                struct buffer_head *bh = journal->j_fc_wbuf[idx];

                wait_on_buffer(bh);
                if (likely(buffer_uptodate(bh))) {
                        put_bh(bh);
                        journal->j_fc_wbuf[idx] = NULL;
                        continue;
                }

                /*
                 * Point j_fc_off just past the failed buffer so that
                 * jbd2_fc_release_bufs can release it and the remaining
                 * buffer heads.
                 */
                journal->j_fc_off = idx + 1;
                return -EIO;
        }

        return 0;
}
EXPORT_SYMBOL(jbd2_fc_wait_bufs);

/*
 * Drop the references still held in j_fc_wbuf[], walking down from slot
 * j_fc_off - 1 and stopping at the first empty slot.  Always returns 0.
 */
int jbd2_fc_release_bufs(journal_t *journal)
{
        int idx = journal->j_fc_off;

        while (idx-- > 0) {
                struct buffer_head *bh = journal->j_fc_wbuf[idx];

                if (!bh)
                        break;
                put_bh(bh);
                journal->j_fc_wbuf[idx] = NULL;
        }

        return 0;
}
EXPORT_SYMBOL(jbd2_fc_release_bufs);

/*
 * Conversion of logical to physical block numbers for the journal.
 *
 * Three cases, tried in this order:
 *  - the journal supplies its own j_bmap callback: use it;
 *  - the journal lives in an inode: map through bmap(), and abort the
 *    journal with -EIO if the block cannot be found;
 *  - otherwise (external journal) the blocks are identity-mapped, so this
 *    is a no-op.  If needed, we can use j_blk_offset - everything is ready.
 *
 * Returns 0 with the physical block in @retp, or a negative error, in
 * which case @retp is left untouched.
 */
int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
                 unsigned long long *retp)
{
        sector_t block = blocknr;
        int err;

        if (journal->j_bmap) {
                err = journal->j_bmap(journal, &block);
                if (!err)
                        *retp = block;
                return err;
        }

        if (!journal->j_inode) {
                *retp = blocknr; /* +journal->j_blk_offset */
                return 0;
        }

        if (bmap(journal->j_inode, &block) || !block) {
                printk(KERN_ALERT "%s: journal block not found "
                                "at offset %lu on %s\n",
                       __func__, blocknr, journal->j_devname);
                err = -EIO;
                jbd2_journal_abort(journal, err);
                return err;
        }

        *retp = block;
        return 0;
}

/*
 * We play buffer_head aliasing tricks to write data/metadata blocks to
 * the journal without copying their contents, but journal descriptor
 * blocks need bona fide buffers of their own, which is what this function
 * hands out: the next log block, zeroed and stamped with a journal header
 * carrying @type and the transaction's tid.  One outstanding credit of
 * @transaction is consumed.  Returns NULL if no log block can be mapped or
 * no buffer can be obtained.
 *
 * After the caller has finished modifying the buffer's contents they
 * really should run flush_dcache_page(bh->b_page).  But we don't bother
 * doing that, so there will be coherency problems with mmaps of blockdevs
 * which hold live JBD-controlled filesystems.
 */
struct buffer_head *
jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
{
        journal_t *journal = transaction->t_journal;
        unsigned long long phys;
        struct buffer_head *bh;
        journal_header_t *hdr;

        if (jbd2_journal_next_log_block(journal, &phys))
                return NULL;

        bh = __getblk(journal->j_dev, phys, journal->j_blocksize);
        if (!bh)
                return NULL;

        atomic_dec(&transaction->t_outstanding_credits);

        lock_buffer(bh);
        memset(bh->b_data, 0, journal->j_blocksize);

        hdr = (journal_header_t *)bh->b_data;
        hdr->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
        hdr->h_blocktype = cpu_to_be32(type);
        hdr->h_sequence = cpu_to_be32(transaction->t_tid);

        set_buffer_uptodate(bh);
        unlock_buffer(bh);
        BUFFER_TRACE(bh, "return this buffer");

        return bh;
}

/*
 * Store the checksum of descriptor block @bh in the block tail that
 * occupies the last bytes of the block.  The checksum covers the whole
 * block, computed with the tail's checksum field zeroed.  Does nothing
 * unless jbd2_journal_has_csum_v2or3() holds for the journal.
 */
void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
{
        struct jbd2_journal_block_tail *tail;
        char *block_end;
        __u32 csum;

        if (!jbd2_journal_has_csum_v2or3(j))
                return;

        block_end = bh->b_data + j->j_blocksize;
        tail = (struct jbd2_journal_block_tail *)(block_end - sizeof(*tail));

        tail->t_checksum = 0;
        csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
        tail->t_checksum = cpu_to_be32(csum);
}

/*
 * Return the tid of the oldest transaction in the journal and the journal
 * block where that transaction starts.
 *
 * If the journal is currently empty, return instead the ID of the next
 * transaction that will be written and the block where it will start.
 *
 * The return value is 0 if the journal tail cannot be pushed any further,
 * 1 if it can.
 */
int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
                              unsigned long *block)
{
        transaction_t *oldest;
        int can_advance;

        read_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);

        /* Checkpointed transactions come first, then the committing one. */
        oldest = journal->j_checkpoint_transactions;
        if (!oldest)
                oldest = journal->j_committing_transaction;

        if (oldest) {
                *tid = oldest->t_tid;
                *block = oldest->t_log_start;
        } else {
                transaction_t *running = journal->j_running_transaction;

                /* Nothing logged yet: the tail sits at the log head. */
                if (running)
                        *tid = running->t_tid;
                else
                        *tid = journal->j_transaction_sequence;
                *block = journal->j_head;
        }

        can_advance = tid_gt(*tid, journal->j_tail_sequence);

        spin_unlock(&journal->j_list_lock);
        read_unlock(&journal->j_state_lock);

        return can_advance;
}

/*
 * Update the log tail both in the journal structure and in the on-disk
 * journal superblock.  This function does not check whether the passed
 * information really pushes the log tail further.  It is the caller's
 * responsibility to make sure the provided log tail is valid (e.g. by
 * holding j_checkpoint_mutex all the time between computing the log tail
 * and calling this function, as jbd2_cleanup_journal_tail() does).
 *
 * Returns 0 on success, or the error from the superblock update, in which
 * case the in-memory tail is left unchanged.
 *
 * Requires j_checkpoint_mutex
 */
int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
{
        unsigned long freed;
        int err;

        BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));

        /*
         * We cannot afford for write to remain in drive's caches since as
         * soon as we update j_tail, next transaction can start reusing
         * journal space and if we lose sb update during power failure we'd
         * replay old transaction with possibly newly overwritten data.
         */
        err = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
        if (err)
                return err;

        write_lock(&journal->j_state_lock);

        /* Blocks between the old and new tail, allowing for wrap-around. */
        freed = block - journal->j_tail;
        if (block < journal->j_tail)
                freed += journal->j_last - journal->j_first;

        trace_jbd2_update_log_tail(journal, tid, block, freed);
        jbd2_debug(1,
                  "Cleaning journal tail from %u to %u (offset %lu), "
                  "freeing %lu\n",
                  journal->j_tail_sequence, tid, block, freed);

        journal->j_free += freed;
        journal->j_tail_sequence = tid;
        journal->j_tail = block;

        write_unlock(&journal->j_state_lock);

        return 0;
}

/*
 * Variant of __jbd2_update_log_tail that takes j_checkpoint_mutex itself
 * and only updates the tail when @tid is newer than the current tail
 * sequence, so it is safe against races with other threads updating the
 * log tail.
 */
void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
{
        struct mutex *ckpt_mutex = &journal->j_checkpoint_mutex;

        mutex_lock_io(ckpt_mutex);
        if (tid_gt(tid, journal->j_tail_sequence))
                __jbd2_update_log_tail(journal, tid, block);
        mutex_unlock(ckpt_mutex);
}

/*
 * Per-open state of a journal's proc "info" file.  Allocated in
 * jbd2_seq_info_open() and freed in jbd2_seq_info_release().
 */
struct jbd2_stats_proc_session {
        /* Journal the file was opened for. */
        journal_t *journal;
        /* Private copy of journal->j_stats taken at open time. */
        struct transaction_stats_s *stats;
        /* NOTE(review): start and max are not referenced by the nearby code. */
        int start;
        int max;
};

/*
 * seq_file ->start callback: the file has a single record, returned as
 * SEQ_START_TOKEN at position 0; any other position ends the sequence.
 */
static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;

        return SEQ_START_TOKEN;
}

/*
 * seq_file ->next callback: advance the position and report that there is
 * no further record.
 */
static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;

        return NULL;
}

static int jbd2_seq_info_show(struct seq_file *seq, void *v)
{
        struct jbd2_stats_proc_session *s = seq->private;

        if (v != SEQ_START_TOKEN)
                return 0;
        seq_printf(seq, "%lu transactions (%lu requested), "
                   "each up to %u blocks\n",
                   s->stats->ts_tid, s->stats->ts_requested,
                   s->journal->j_max_transaction_buffers);
        if (s->stats->ts_tid == 0)
                return 0;
        seq_printf(seq, "average: \n  %ums waiting for transaction\n",
            jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
        seq_printf(seq, "  %ums request delay\n",
            (s->stats->ts_requested == 0) ? 0 :
            jiffies_to_msecs(s->stats->run.rs_request_delay /
                             s->stats->ts_requested));
        seq_printf(seq, "  %ums running transaction\n",
            jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
        seq_printf(seq, "  %ums transaction was being locked\n",
            jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
        seq_printf(seq, "  %ums flushing data (in ordered mode)\n",
            jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
        seq_printf(seq, "  %ums logging transaction\n",
            jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
        seq_printf(seq, "  %lluus average transaction commit time\n",
                   div_u64(s->journal->j_average_commit_time, 1000));
        seq_printf(seq, "  %lu handles per transaction\n",
            s->stats->run.rs_handle_count / s->stats->ts_tid);
        seq_printf(seq, "  %lu blocks per transaction\n",
            s->stats->run.rs_blocks / s->stats->ts_tid);
        seq_printf(seq, "  %lu logged blocks per transaction\n",
            s->stats->run.rs_blocks_logged / s->stats->ts_tid);
        return 0;
}

/*
 * seq_file ->stop callback.  Intentionally empty: jbd2_seq_info_start()
 * only returns a token and acquires nothing that would need releasing.
 */
static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
{
}

/* Iterator callbacks handed to seq_open() by jbd2_seq_info_open(). */
static const struct seq_operations jbd2_seq_info_ops = {
        .start  = jbd2_seq_info_start,
        .next   = jbd2_seq_info_next,
        .stop   = jbd2_seq_info_stop,
        .show   = jbd2_seq_info_show,
};

/*
 * Open handler of the proc "info" file.  Allocates a session, copies the
 * journal's statistics into it under j_history_lock and attaches it to
 * the seq_file as private data.
 *
 * Returns 0 on success, -ENOMEM if either allocation fails, or the error
 * from seq_open(); nothing stays allocated on failure.
 */
static int jbd2_seq_info_open(struct inode *inode, struct file *file)
{
        journal_t *journal = pde_data(inode);
        struct jbd2_stats_proc_session *sess;
        struct seq_file *m;
        int stats_size = sizeof(struct transaction_stats_s);
        int err;

        sess = kmalloc(sizeof(*sess), GFP_KERNEL);
        if (!sess)
                return -ENOMEM;

        sess->stats = kmalloc(stats_size, GFP_KERNEL);
        if (!sess->stats) {
                err = -ENOMEM;
                goto free_session;
        }

        /* Take the snapshot under the history lock. */
        spin_lock(&journal->j_history_lock);
        memcpy(sess->stats, &journal->j_stats, stats_size);
        sess->journal = journal;
        spin_unlock(&journal->j_history_lock);

        err = seq_open(file, &jbd2_seq_info_ops);
        if (err)
                goto free_stats;

        m = file->private_data;
        m->private = sess;
        return 0;

free_stats:
        kfree(sess->stats);
free_session:
        kfree(sess);
        return err;
}

/*
 * Release handler of the proc "info" file: free the session attached by
 * jbd2_seq_info_open(), then let seq_release() tear down the seq_file.
 */
static int jbd2_seq_info_release(struct inode *inode, struct file *file)
{
        struct seq_file *m = file->private_data;
        struct jbd2_stats_proc_session *sess = m->private;

        kfree(sess->stats);
        kfree(sess);

        return seq_release(inode, file);
}

/*
 * File operations of the per-journal proc "info" file.  Open and release
 * manage the statistics session; reading and seeking use seq_read and
 * seq_lseek directly.
 */
static const struct proc_ops jbd2_info_proc_ops = {
        .proc_open        = jbd2_seq_info_open,
        .proc_read        = seq_read,
        .proc_lseek        = seq_lseek,
        .proc_release        = jbd2_seq_info_release,
};

/* Parent proc directory under which each journal's stats directory is made. */
static struct proc_dir_entry *proc_jbd2_stats;

/*
 * Create the journal's proc directory, named after j_devname, and the
 * "info" file inside it.  Failure is not reported to the caller; if the
 * directory cannot be created the file is simply not added.
 */
static void jbd2_stats_proc_init(journal_t *journal)
{
        struct proc_dir_entry *dir;

        dir = proc_mkdir(journal->j_devname, proc_jbd2_stats);
        journal->j_proc_entry = dir;
        if (!dir)
                return;

        proc_create_data("info", S_IRUGO, dir, &jbd2_info_proc_ops, journal);
}

/*
 * Remove the proc entries set up by jbd2_stats_proc_init(): the "info"
 * file first, then the per-journal directory that contains it.
 */
static void jbd2_stats_proc_exit(journal_t *journal)
{
        remove_proc_entry("info", journal->j_proc_entry);
        remove_proc_entry(journal->j_devname, proc_jbd2_stats);
}

/* Smallest size a descriptor tag can have. */
static int jbd2_min_tag_size(void)
{
        /*
         * A tag with 32-bit block numbers does not use the last four bytes
         * of the structure, so they are not counted.
         */
        const int unused_tail = 4;

        return (int)sizeof(journal_block_tag_t) - unused_tail;
}

/**
 * jbd2_journal_shrink_scan()
 * @shrink: shrinker to work on
 * @sc: reclaim request to process
 *
 * Scan the checkpointed buffers on the checkpoint list and release their
 * journal_heads.  The checkpoint counter is sampled before and after the
 * scan for the tracepoints only.
 *
 * Returns the count reported by jbd2_journal_shrink_checkpoint_list().
 */
static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
                                              struct shrink_control *sc)
{
        journal_t *journal = shrink->private_data;
        unsigned long remaining = sc->nr_to_scan;
        unsigned long jh_count;
        unsigned long freed;

        jh_count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
        trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, jh_count);

        freed = jbd2_journal_shrink_checkpoint_list(journal, &remaining);

        jh_count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count);
        trace_jbd2_shrink_scan_exit(journal, remaining, freed, jh_count);

        return freed;
}

/**
 * jbd2_journal_shrink_count()
 * @shrink: shrinker to work on
 * @sc: reclaim request to process
 *
 * Report the number of checkpoint buffers on the checkpoint list, as read
 * from the journal's j_checkpoint_jh_count counter.
 */
static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
                                               struct shrink_control *sc)
{
        journal_t *journal = shrink->private_data;
        unsigned long jh_count =
                percpu_counter_read_positive(&journal->j_checkpoint_jh_count);

        trace_jbd2_shrink_count(journal, sc->nr_to_scan, jh_count);

        return jh_count;
}

/*
 * If journal init or create aborts, release the superblock buffer and set
 * the journal's pointer to it to NULL, so that the journal destroy does
 * not write back a bogus superblock.
 */
static void journal_fail_superblock(journal_t *journal)
{
        brelse(journal->j_sb_buffer);
        journal->j_sb_buffer = NULL;
}

/*
 * Check the superblock for a given journal, performing initial
 * validation of the format.
 *
 * Returns 0 when the superblock passes every check, -EINVAL for a
 * structural or feature mismatch, the crypto_alloc_shash() error when the
 * crc32c driver cannot be loaded, or -EFSBADCRC when the superblock
 * checksum does not match.
 */
static int journal_check_superblock(journal_t *journal)
{
        journal_superblock_t *sb = journal->j_superblock;
        int num_fc_blks;
        int err = -EINVAL;

        /* Magic number and block size must match what the journal expects. */
        if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
            sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
                printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
                return err;
        }

        /* Only the V1 and V2 superblock formats are recognised. */
        if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 &&
            be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) {
                printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
                return err;
        }

        /* The recorded journal length must fit in the space we were given. */
        if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) {
                printk(KERN_WARNING "JBD2: journal file too short\n");
                return err;
        }

        /* The first log block must be non-zero and inside the journal. */
        if (be32_to_cpu(sb->s_first) == 0 ||
            be32_to_cpu(sb->s_first) >= journal->j_total_len) {
                printk(KERN_WARNING
                        "JBD2: Invalid start block of journal: %u\n",
                        be32_to_cpu(sb->s_first));
                return err;
        }

        /*
         * If this is a V2 superblock, then we have to check the
         * features flags on it.
         */
        if (!jbd2_format_support_feature(journal))
                return 0;

        /* Refuse any ro-compat or incompat feature bit we do not know. */
        if ((sb->s_feature_ro_compat &
                        ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
            (sb->s_feature_incompat &
                        ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
                printk(KERN_WARNING "JBD2: Unrecognised features on journal\n");
                return err;
        }

        /*
         * The journal must hold at least JBD2_MIN_JOURNAL_BLOCKS blocks in
         * addition to the fast commit blocks, if fast commit is enabled.
         */
        num_fc_blks = jbd2_has_feature_fast_commit(journal) ?
                                jbd2_journal_get_num_fc_blks(sb) : 0;
        if (be32_to_cpu(sb->s_maxlen) < JBD2_MIN_JOURNAL_BLOCKS ||
            be32_to_cpu(sb->s_maxlen) - JBD2_MIN_JOURNAL_BLOCKS < num_fc_blks) {
                printk(KERN_ERR "JBD2: journal file too short %u,%d\n",
                       be32_to_cpu(sb->s_maxlen), num_fc_blks);
                return err;
        }

        if (jbd2_has_feature_csum2(journal) &&
            jbd2_has_feature_csum3(journal)) {
                /* Can't have checksum v2 and v3 at the same time! */
                printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
                       "at the same time!\n");
                return err;
        }

        if (jbd2_journal_has_csum_v2or3_feature(journal) &&
            jbd2_has_feature_checksum(journal)) {
                /* Can't have checksum v1 and v2 on at the same time! */
                printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
                       "at the same time!\n");
                return err;
        }

        /* Load the checksum driver */
        if (jbd2_journal_has_csum_v2or3_feature(journal)) {
                if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) {
                        printk(KERN_ERR "JBD2: Unknown checksum type\n");
                        return err;
                }

                journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
                if (IS_ERR(journal->j_chksum_driver)) {
                        printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
                        err = PTR_ERR(journal->j_chksum_driver);
                        /* Leave NULL, not an ERR_PTR, in the journal. */
                        journal->j_chksum_driver = NULL;
                        return err;
                }
                /*
                 * Check superblock checksum.  On mismatch the driver
                 * stays allocated in j_chksum_driver; it is not freed
                 * here.
                 */
                if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
                        printk(KERN_ERR "JBD2: journal checksum error\n");
                        err = -EFSBADCRC;
                        return err;
                }
        }

        return 0;
}

/*
 * Number of revoke records that fit in one journal block: the block size
 * minus the revoke header (and minus the block tail when
 * jbd2_journal_has_csum_v2or3() holds), divided by the record size, which
 * is 8 bytes with the 64bit feature and 4 bytes otherwise.
 */
static int journal_revoke_records_per_block(journal_t *journal)
{
        int usable = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
        int record_size = jbd2_has_feature_64bit(journal) ? 8 : 4;

        if (jbd2_journal_has_csum_v2or3(journal))
                usable -= sizeof(struct jbd2_journal_block_tail);

        return usable / record_size;
}

/*
 * Load the on-disk journal superblock and read the key fields into the
 * journal_t.
 *
 * Returns 0 on success, -EIO if the superblock block cannot be read, or
 * the error from journal_check_superblock(), in which case the superblock
 * buffer is released again.
 */
static int journal_load_superblock(journal_t *journal)
{
        struct buffer_head *sb_bh;
        journal_superblock_t *sb;
        u32 maxlen;
        int err;

        sb_bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset,
                                 journal->j_blocksize);
        if (!sb_bh || bh_read(sb_bh, 0) < 0) {
                pr_err("%s: Cannot read journal superblock\n", __func__);
                brelse(sb_bh);
                return -EIO;
        }

        sb = (journal_superblock_t *)sb_bh->b_data;
        journal->j_sb_buffer = sb_bh;
        journal->j_superblock = sb;

        err = journal_check_superblock(journal);
        if (err) {
                journal_fail_superblock(journal);
                return err;
        }

        maxlen = be32_to_cpu(sb->s_maxlen);

        journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
        journal->j_tail = be32_to_cpu(sb->s_start);
        journal->j_first = be32_to_cpu(sb->s_first);
        journal->j_errno = be32_to_cpu(sb->s_errno);
        journal->j_last = maxlen;

        /* Never use more blocks than the superblock says the journal has. */
        if (maxlen < journal->j_total_len)
                journal->j_total_len = maxlen;

        /* Precompute checksum seed for all metadata */
        if (jbd2_journal_has_csum_v2or3(journal))
                journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
                                                   sizeof(sb->s_uuid));

        journal->j_revoke_records_per_block =
                                journal_revoke_records_per_block(journal);

        if (!jbd2_has_feature_fast_commit(journal))
                return 0;

        /* The fast commit area is carved out of the end of the journal. */
        journal->j_fc_last = maxlen;
        journal->j_last = journal->j_fc_last -
                          jbd2_journal_get_num_fc_blks(sb);
        journal->j_fc_first = journal->j_last + 1;
        journal->j_fc_off = 0;

        return 0;
}


/*
 * Management for journal control blocks: functions to create and
 * destroy journal_t structures, and to initialise and read existing
 * journal blocks from disk.  */

/* First: create and setup a journal_t object in memory.  We initialise
 * very few fields yet: that has to wait until we have created the
 * journal structures from scratch, or loaded them from disk.
 *
 * Returns the new journal, already registered with the shrinker core, or
 * an ERR_PTR() on failure, in which case everything set up so far has
 * been released again.
 */

static journal_t *journal_init_common(struct block_device *bdev,
                        struct block_device *fs_dev,
                        unsigned long long start, int len, int blocksize)
{
        static struct lock_class_key jbd2_trans_commit_key;
        journal_t *journal;
        int err;
        int n;

        /*
         * Zeroed allocation: err_cleanup below runs on a partially set up
         * journal and relies on untouched fields being zero/NULL.
         */
        journal = kzalloc(sizeof(*journal), GFP_KERNEL);
        if (!journal)
                return ERR_PTR(-ENOMEM);

        journal->j_blocksize = blocksize;
        journal->j_dev = bdev;
        journal->j_fs_dev = fs_dev;
        journal->j_blk_offset = start;
        journal->j_total_len = len;
        jbd2_init_fs_dev_write_error(journal);

        /* Read and validate the on-disk superblock before anything else. */
        err = journal_load_superblock(journal);
        if (err)
                goto err_cleanup;

        init_waitqueue_head(&journal->j_wait_transaction_locked);
        init_waitqueue_head(&journal->j_wait_done_commit);
        init_waitqueue_head(&journal->j_wait_commit);
        init_waitqueue_head(&journal->j_wait_updates);
        init_waitqueue_head(&journal->j_wait_reserved);
        init_waitqueue_head(&journal->j_fc_wait);
        mutex_init(&journal->j_abort_mutex);
        mutex_init(&journal->j_barrier);
        mutex_init(&journal->j_checkpoint_mutex);
        spin_lock_init(&journal->j_revoke_lock);
        spin_lock_init(&journal->j_list_lock);
        spin_lock_init(&journal->j_history_lock);
        rwlock_init(&journal->j_state_lock);

        journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
        journal->j_min_batch_time = 0;
        journal->j_max_batch_time = 15000; /* 15ms */
        atomic_set(&journal->j_reserved_credits, 0);
        lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
                         &jbd2_trans_commit_key, 0);

        /* The journal is marked for error until we succeed with recovery! */
        journal->j_flags = JBD2_ABORT;

        /* Set up a default-sized revoke table for the new mount. */
        err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
        if (err)
                goto err_cleanup;

        /*
         * journal descriptor can store up to n blocks, we need enough
         * buffers to write out full descriptor block.
         */
        err = -ENOMEM;
        n = journal->j_blocksize / jbd2_min_tag_size();
        journal->j_wbufsize = n;
        journal->j_fc_wbuf = NULL;
        journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
                                        GFP_KERNEL);
        if (!journal->j_wbuf)
                goto err_cleanup;

        err = percpu_counter_init(&journal->j_checkpoint_jh_count, 0,
                                  GFP_KERNEL);
        if (err)
                goto err_cleanup;

        journal->j_shrink_transaction = NULL;

        journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)",
                                             MAJOR(bdev->bd_dev),
                                             MINOR(bdev->bd_dev));
        if (!journal->j_shrinker) {
                err = -ENOMEM;
                goto err_cleanup;
        }

        journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan;
        journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
        /*
         * NOTE(review): j_max_transaction_buffers is not assigned anywhere
         * in this function (journal_reset() sets it later), so this looks
         * like it copies the zero from kzalloc - confirm that is intended.
         */
        journal->j_shrinker->batch = journal->j_max_transaction_buffers;
        journal->j_shrinker->private_data = journal;

        shrinker_register(journal->j_shrinker);

        return journal;

err_cleanup:
        /*
         * Shared unwind for every failure above, whatever stage was
         * reached; presumably each helper tolerates state that was never
         * initialised (still zero/NULL) - verify before adding new steps.
         */
        percpu_counter_destroy(&journal->j_checkpoint_jh_count);
        if (journal->j_chksum_driver)
                crypto_free_shash(journal->j_chksum_driver);
        kfree(journal->j_wbuf);
        jbd2_journal_destroy_revoke(journal);
        journal_fail_superblock(journal);
        kfree(journal);
        return ERR_PTR(err);
}

/* jbd2_journal_init_dev and jbd2_journal_init_inode:
 *
 * Create a journal structure assigned some fixed set of disk blocks to
 * the journal.  We don't actually touch those disk blocks yet, but we
 * need to set up all of the mapping information to tell the journaling
 * system where the journal blocks are.
 *
 */

/**
 *  journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
 *  @bdev: Block device on which to create the journal
 *  @fs_dev: Device which hold journalled filesystem for this journal.
 *  @start: Block nr Start of journal.
 *  @len:  Length of the journal in blocks.
 *  @blocksize: blocksize of journalling device
 *
 *  Returns: a newly created journal_t *, or the error pointer produced by
 *  journal_init_common() on failure.
 *
 *  jbd2_journal_init_dev creates a journal which maps a fixed contiguous
 *  range of blocks on an arbitrary block device.  The journal's device
 *  name is derived from the block device, with any '/' replaced by '!',
 *  and is used to name its proc statistics directory.
 *
 */
journal_t *jbd2_journal_init_dev(struct block_device *bdev,
                        struct block_device *fs_dev,
                        unsigned long long start, int len, int blocksize)
{
        journal_t *journal = journal_init_common(bdev, fs_dev, start, len,
                                                 blocksize);

        if (IS_ERR(journal))
                return ERR_CAST(journal);

        snprintf(journal->j_devname, sizeof(journal->j_devname),
                 "%pg", journal->j_dev);
        strreplace(journal->j_devname, '/', '!');

        jbd2_stats_proc_init(journal);

        return journal;
}

/**
 *  journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode.
 *  @inode: An inode to create the journal in
 *
 * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
 * the journal.  The inode must exist already, must support bmap() and
 * must have all data blocks preallocated.
 */
journal_t *jbd2_journal_init_inode(struct inode *inode)
{
        journal_t *journal;
        sector_t blocknr;
        int err = 0;

        blocknr = 0;
        err = bmap(inode, &blocknr);
        if (err || !blocknr) {
                pr_err("%s: Cannot locate journal superblock\n", __func__);
                return err ? ERR_PTR(err) : ERR_PTR(-EINVAL);
        }

        jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
                  inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
                  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);

        journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev,
                        blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits,
                        inode->i_sb->s_blocksize);
        if (IS_ERR(journal))
                return ERR_CAST(journal);

        journal->j_inode = inode;
        snprintf(journal->j_devname, sizeof(journal->j_devname),
                 "%pg-%lu", journal->j_dev, journal->j_inode->i_ino);
        strreplace(journal->j_devname, '/', '!');
        jbd2_stats_proc_init(journal);

        return journal;
}

/*
 * Given a journal_t structure, initialise the various fields for
 * startup of a new journaling session.  We use this both when creating
 * a journal, and after recovering an old journal to reset it for
 * subsequent use.
 *
 * Returns -EINVAL if the journal described by the superblock is too
 * short, otherwise the result of jbd2_journal_start_thread().
 */

static int journal_reset(journal_t *journal)
{
        journal_superblock_t *sb = journal->j_superblock;
        unsigned long long first, last;

        first = be32_to_cpu(sb->s_first);
        last = be32_to_cpu(sb->s_maxlen);
        if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
                printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
                       first, last);
                journal_fail_superblock(journal);
                return -EINVAL;
        }

        journal->j_first = first;
        journal->j_last = last;

        /*
         * Keep a previously recorded head only when JBD2_CYCLE_RECORD is
         * set and the head lies in [first, last); otherwise start at the
         * first log block.
         */
        if (journal->j_head != 0 && journal->j_flags & JBD2_CYCLE_RECORD) {
                /*
                 * Disable the cycled recording mode if the journal head block
                 * number is not correct.
                 */
                if (journal->j_head < first || journal->j_head >= last) {
                        printk(KERN_WARNING "JBD2: Incorrect Journal head block %lu, "
                               "disable journal_cycle_record\n",
                               journal->j_head);
                        journal->j_head = journal->j_first;
                }
        } else {
                journal->j_head = journal->j_first;
        }
        /* The log starts out empty: tail equals head, every block is free. */
        journal->j_tail = journal->j_head;
        journal->j_free = journal->j_last - journal->j_first;

        journal->j_tail_sequence = journal->j_transaction_sequence;
        journal->j_commit_sequence = journal->j_transaction_sequence - 1;
        journal->j_commit_request = journal->j_commit_sequence;

        journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal);

        /*
         * Now that journal recovery is done, turn fast commits off here. This
         * way, if fast commit was enabled before the crash but if now FS has
         * disabled it, we don't enable fast commits.
         */
        jbd2_clear_feature_fast_commit(journal);

        /*
         * As a special case, if the on-disk copy is already marked as needing
         * no recovery (s_start == 0), then we can safely defer the superblock
         * update until the next commit by setting JBD2_FLUSHED.  This avoids
         * attempting a write to a potential-readonly device.
         */
        if (sb->s_start == 0) {
                jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb "
                        "(start %ld, seq %u, errno %d)\n",
                        journal->j_tail, journal->j_tail_sequence,
                        journal->j_errno);
                journal->j_flags |= JBD2_FLUSHED;
        } else {
                /* Lock here to make assertions happy... */
                mutex_lock_io(&journal->j_checkpoint_mutex);
                /*
                 * Update log tail information. We use REQ_FUA since new
                 * transaction will start reusing journal space and so we
                 * must make sure information about current log tail is on
                 * disk before that.
                 *
                 * NOTE(review): the return value of the superblock update
                 * is not checked here - confirm that is intentional.
                 */
                jbd2_journal_update_sb_log_tail(journal,
                                                journal->j_tail_sequence,
                                                journal->j_tail, REQ_FUA);
                mutex_unlock(&journal->j_checkpoint_mutex);
        }
        return jbd2_journal_start_thread(journal);
}

/*
 * Submit the in-memory journal superblock for writing and wait for the
 * I/O to finish.
 *
 * Locking contract: the caller hands us journal->j_sb_buffer already
 * locked, and the buffer is unlocked again by the time we return.
 *
 * Returns 0 on success, or -EIO if the buffer is no longer mapped or the
 * write failed.  On a failed write the journal is aborted unless it is
 * aborted already.
 */
static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
{
        journal_superblock_t *sb = journal->j_superblock;
        struct buffer_head *bh = journal->j_sb_buffer;

        /* An unmapped buffer means the block device got invalidated */
        if (!buffer_mapped(bh)) {
                unlock_buffer(bh);
                return -EIO;
        }

        /*
         * The high priority request flags are always added so that the
         * write is exempt from block layer QOS policies such as writeback
         * throttling.  Without barriers enabled, FUA/PREFLUSH are dropped.
         */
        write_flags |= JBD2_JOURNAL_REQ_FLAGS;
        if (!(journal->j_flags & JBD2_BARRIER))
                write_flags &= ~(REQ_FUA | REQ_PREFLUSH);

        trace_jbd2_write_superblock(journal, write_flags);

        if (buffer_write_io_error(bh)) {
                /*
                 * An earlier superblock write failed.  The device may
                 * have been unplugged, or the error may have been
                 * transient and the block remapped since.  All we can do
                 * is report it, reset the buffer state and try again.
                 */
                printk(KERN_ERR "JBD2: previous I/O error detected "
                       "for journal superblock update for %s.\n",
                       journal->j_devname);
                clear_buffer_write_io_error(bh);
                set_buffer_uptodate(bh);
        }

        /* Refresh the superblock checksum when v2/v3 checksums are in use */
        if (jbd2_journal_has_csum_v2or3(journal))
                sb->s_checksum = jbd2_superblock_csum(journal, sb);

        get_bh(bh);
        bh->b_end_io = end_buffer_write_sync;
        submit_bh(REQ_OP_WRITE | write_flags, bh);
        wait_on_buffer(bh);

        if (!buffer_write_io_error(bh))
                return 0;

        /* The write failed: reset the buffer state, report and abort */
        clear_buffer_write_io_error(bh);
        set_buffer_uptodate(bh);
        printk(KERN_ERR "JBD2: I/O error when updating journal superblock for %s.\n",
                        journal->j_devname);
        if (!is_journal_aborted(journal))
                jbd2_journal_abort(journal, -EIO);

        return -EIO;
}

/**
 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
 * @journal: The journal to update.
 * @tail_tid: TID of the new transaction at the tail of the log
 * @tail_block: The first block of the transaction at the tail of the log
 * @write_flags: Flags for the journal sb write operation
 *
 * Store the new log tail in the journal superblock and write it out,
 * waiting for the IO to complete.  The caller must hold
 * j_checkpoint_mutex.
 *
 * Return: 0 on success, -EIO if the journal is (or has just been) aborted,
 * or the error reported by the superblock write.
 */
int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
                                    unsigned long tail_block,
                                    blk_opf_t write_flags)
{
        journal_superblock_t *sb = journal->j_superblock;
        int err;

        if (is_journal_aborted(journal))
                return -EIO;

        /* A write error on the fs device aborts the journal */
        if (jbd2_check_fs_dev_write_error(journal)) {
                jbd2_journal_abort(journal, -EIO);
                return -EIO;
        }

        BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
        jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
                  tail_block, tail_tid);

        /* The buffer lock taken here is released by the write helper */
        lock_buffer(journal->j_sb_buffer);
        sb->s_sequence = cpu_to_be32(tail_tid);
        sb->s_start    = cpu_to_be32(tail_block);

        err = jbd2_write_superblock(journal, write_flags);
        if (err)
                return err;

        /* The tail is on disk now, so the log is no longer empty */
        write_lock(&journal->j_state_lock);
        WARN_ON(!sb->s_sequence);
        journal->j_flags &= ~JBD2_FLUSHED;
        write_unlock(&journal->j_state_lock);

        return 0;
}

/**
 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
 * @journal: The journal to update.
 * @write_flags: Flags for the journal sb write operation
 *
 * Set the dynamic superblock fields so that the journal reads as empty
 * (s_start == 0) and write the superblock, waiting for IO to complete.
 * Nothing is written if the superblock already says the journal is empty.
 * The caller must hold j_checkpoint_mutex.
 */
static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags)
{
        journal_superblock_t *sb = journal->j_superblock;
        bool restore_fast_commit;

        BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));

        lock_buffer(journal->j_sb_buffer);
        if (sb->s_start == 0) {
                /* Already marked empty, nothing to write */
                unlock_buffer(journal->j_sb_buffer);
                return;
        }

        jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
                  journal->j_tail_sequence);

        sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
        sb->s_start    = cpu_to_be32(0);
        sb->s_head     = cpu_to_be32(journal->j_head);

        /*
         * A clean journal does not need the fast commit flag on disk, and
         * leaving it out keeps the file system compatible with older
         * kernels.  Drop the flag for the write and restore it afterwards.
         */
        restore_fast_commit = jbd2_has_feature_fast_commit(journal);
        if (restore_fast_commit)
                jbd2_clear_feature_fast_commit(journal);

        jbd2_write_superblock(journal, write_flags);

        if (restore_fast_commit)
                jbd2_set_feature_fast_commit(journal);

        /* The log is empty now: record that in the journal flags */
        write_lock(&journal->j_state_lock);
        journal->j_flags |= JBD2_FLUSHED;
        write_unlock(&journal->j_state_lock);
}

/**
 * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock)
 * @journal: The journal to erase.
 * @flags: A discard/zeroout request is sent for each physically contiguous
 *        region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or
 *        JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation
 *        to perform.
 *
 * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes
 * will be explicitly written if no hardware offload is available, see
 * blkdev_issue_zeroout for more details.
 *
 * Return: the result of the final blkdev_issue_flush() on success, -EINVAL
 * if @flags is empty, contains unknown bits or requests both operations,
 * -EOPNOTSUPP if discard is requested but the device reports no discard
 * capacity, or the error from block mapping or from the discard/zeroout
 * request.
 */
static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
{
        int err = 0;
        unsigned long block, log_offset; /* logical */
        unsigned long long phys_block, block_start, block_stop; /* physical */
        loff_t byte_start, byte_stop, byte_count;

        /* flags must be set to either discard or zeroout */
        if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags ||
                        ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
                        (flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
                return -EINVAL;

        if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
            !bdev_max_discard_sectors(journal->j_dev))
                return -EOPNOTSUPP;

        /*
         * lookup block mapping and issue discard/zeroout for each
         * contiguous region
         */
        log_offset = be32_to_cpu(journal->j_superblock->s_first);
        block_start =  ~0ULL;
        for (block = log_offset; block < journal->j_total_len; block++) {
                err = jbd2_journal_bmap(journal, block, &phys_block);
                if (err) {
                        pr_err("JBD2: bad block at offset %lu", block);
                        return err;
                }

                /*
                 * ~0ULL marks "no region open": start a new one at this
                 * block.  block_stop is the last block of the region
                 * (inclusive), so it starts one below block_start.
                 */
                if (block_start == ~0ULL) {
                        block_start = phys_block;
                        block_stop = block_start - 1;
                }

                /*
                 * last block not contiguous with current block,
                 * process last contiguous region and return to this block on
                 * next loop
                 */
                if (phys_block != block_stop + 1) {
                        block--;
                } else {
                        block_stop++;
                        /*
                         * if this isn't the last block of journal,
                         * no need to process now because next block may also
                         * be part of this contiguous region
                         */
                        if (block != journal->j_total_len - 1)
                                continue;
                }

                /*
                 * end of contiguous region or this is last block of journal,
                 * take care of the region
                 */
                byte_start = block_start * journal->j_blocksize;
                byte_count = (block_stop - block_start + 1) *
                                journal->j_blocksize;
                /*
                 * truncate_inode_pages_range() takes an inclusive end
                 * offset, so point byte_stop at the last byte of the last
                 * block.  Using the first byte of that block would leave
                 * its cached contents in place.
                 */
                byte_stop = byte_start + byte_count - 1;

                truncate_inode_pages_range(journal->j_dev->bd_mapping,
                                byte_start, byte_stop);

                if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
                        err = blkdev_issue_discard(journal->j_dev,
                                        byte_start >> SECTOR_SHIFT,
                                        byte_count >> SECTOR_SHIFT,
                                        GFP_NOFS);
                } else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) {
                        err = blkdev_issue_zeroout(journal->j_dev,
                                        byte_start >> SECTOR_SHIFT,
                                        byte_count >> SECTOR_SHIFT,
                                        GFP_NOFS, 0);
                }

                if (unlikely(err != 0)) {
                        pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu",
                                        err, block_start, block_stop);
                        return err;
                }

                /* reset start and stop after processing a region */
                block_start = ~0ULL;
        }

        return blkdev_issue_flush(journal->j_dev);
}

/**
 * jbd2_journal_update_sb_errno() - Update error in the journal.
 * @journal: The journal to update.
 *
 * Copy the journal's current errno into the superblock and write the
 * superblock to disk with REQ_FUA, waiting for IO to complete.  An errno
 * of -ESHUTDOWN is stored as 0.
 */
void jbd2_journal_update_sb_errno(journal_t *journal)
{
        journal_superblock_t *sb = journal->j_superblock;
        int recorded;

        /* The buffer lock taken here is released by the write helper */
        lock_buffer(journal->j_sb_buffer);

        recorded = journal->j_errno;
        if (recorded == -ESHUTDOWN)
                recorded = 0;

        jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", recorded);
        sb->s_errno    = cpu_to_be32(recorded);

        jbd2_write_superblock(journal, REQ_FUA);
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);

/**
 * jbd2_journal_load() - Read journal from disk.
 * @journal: Journal to act on.
 *
 * Given a journal_t structure which tells us which disk blocks contain
 * a journal, run recovery and then reset the dynamic journal state so the
 * in-memory structures are ready for use.
 *
 * Return: 0 on success with JBD2_LOADED set, -EFSCORRUPTED if recovery
 * flagged a failed commit, or the error from slab creation, recovery or
 * journal reset.
 */
int jbd2_journal_load(journal_t *journal)
{
        int ret;

        /* Make sure a slab exists for this journal's block size */
        ret = jbd2_journal_create_slab(
                        be32_to_cpu(journal->j_superblock->s_blocksize));
        if (ret)
                return ret;

        /*
         * The recovery code decides for itself whether anything needs to
         * be replayed from the journal.
         */
        ret = jbd2_journal_recover(journal);
        if (ret) {
                pr_warn("JBD2: journal recovery failed\n");
                return ret;
        }

        if (journal->j_failed_commit) {
                printk(KERN_ERR "JBD2: journal transaction %u on %s "
                       "is corrupt.\n", journal->j_failed_commit,
                       journal->j_devname);
                return -EFSCORRUPTED;
        }

        /*
         * Drop the JBD2_ABORT flag set up in journal_init_common so that
         * the log tail information can be updated with the newest seq.
         */
        journal->j_flags &= ~JBD2_ABORT;

        /*
         * The dynamic journal bits are done with: reinitialise the dynamic
         * contents of the superblock in memory and reset them on disk.
         */
        ret = journal_reset(journal);
        if (ret) {
                pr_warn("JBD2: journal reset failed\n");
                return ret;
        }

        journal->j_flags |= JBD2_LOADED;
        return 0;
}

/**
 * jbd2_journal_destroy() - Release a journal_t structure.
 * @journal: Journal to act on.
 *
 * Release a journal_t structure once it is no longer in use by the
 * journaled object.  The commit thread is stopped, any running transaction
 * is committed, outstanding checkpoint transactions are flushed (or
 * dropped if checkpointing fails), the journal is marked empty on disk
 * unless it has been aborted, and finally all resources including
 * @journal itself are freed.  @journal must not be used afterwards.
 *
 * Return <0 if we couldn't clean up the journal: the checkpoint error, or
 * -EIO if the journal was aborted while a superblock buffer was attached.
 */
int jbd2_journal_destroy(journal_t *journal)
{
        int err = 0;

        /* Wait for the commit thread to wake up and die. */
        journal_kill_thread(journal);

        /* Force a final log commit */
        if (journal->j_running_transaction)
                jbd2_journal_commit_transaction(journal);

        /* Force any old transactions to disk */

        /*
         * j_list_lock is held while inspecting the checkpoint list and
         * dropped around each checkpoint pass, which takes
         * j_checkpoint_mutex instead.  The loop always exits with
         * j_list_lock held.
         */
        spin_lock(&journal->j_list_lock);
        while (journal->j_checkpoint_transactions != NULL) {
                spin_unlock(&journal->j_list_lock);
                mutex_lock_io(&journal->j_checkpoint_mutex);
                err = jbd2_log_do_checkpoint(journal);
                mutex_unlock(&journal->j_checkpoint_mutex);
                /*
                 * If checkpointing failed, just free the buffers to avoid
                 * looping forever
                 */
                if (err) {
                        jbd2_journal_destroy_checkpoint(journal);
                        spin_lock(&journal->j_list_lock);
                        break;
                }
                spin_lock(&journal->j_list_lock);
        }

        /* By now no transaction of any kind may remain on the journal */
        J_ASSERT(journal->j_running_transaction == NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);
        J_ASSERT(journal->j_checkpoint_transactions == NULL);
        spin_unlock(&journal->j_list_lock);

        /*
         * OK, all checkpoint transactions have been checked, now check the
         * writeback errseq of fs dev and abort the journal if some buffer
         * failed to write back to the original location, otherwise the
         * filesystem may become inconsistent.
         */
        if (!is_journal_aborted(journal) &&
            jbd2_check_fs_dev_write_error(journal))
                jbd2_journal_abort(journal, -EIO);

        if (journal->j_sb_buffer) {
                if (!is_journal_aborted(journal)) {
                        mutex_lock_io(&journal->j_checkpoint_mutex);

                        /*
                         * Advance the transaction sequence and use the new
                         * value as the tail sequence written to the
                         * superblock below.
                         */
                        write_lock(&journal->j_state_lock);
                        journal->j_tail_sequence =
                                ++journal->j_transaction_sequence;
                        write_unlock(&journal->j_state_lock);

                        jbd2_mark_journal_empty(journal, REQ_PREFLUSH | REQ_FUA);
                        mutex_unlock(&journal->j_checkpoint_mutex);
                } else
                        /* Aborted journal: leave the superblock alone */
                        err = -EIO;
                brelse(journal->j_sb_buffer);
        }

        /* Tear down the remaining pieces, each only if it was set up */
        if (journal->j_shrinker) {
                percpu_counter_destroy(&journal->j_checkpoint_jh_count);
                shrinker_free(journal->j_shrinker);
        }
        if (journal->j_proc_entry)
                jbd2_stats_proc_exit(journal);
        iput(journal->j_inode);
        if (journal->j_revoke)
                jbd2_journal_destroy_revoke(journal);
        if (journal->j_chksum_driver)
                crypto_free_shash(journal->j_chksum_driver);
        kfree(journal->j_fc_wbuf);
        kfree(journal->j_wbuf);
        kfree(journal);

        return err;
}


/**
 * jbd2_journal_check_used_features() - Check if features specified are used.
 * @journal: Journal to check.
 * @compat: bitmask of compatible features
 * @ro: bitmask of features that force read-only mount
 * @incompat: bitmask of incompatible features
 *
 * Check whether every requested feature bit is set in the journal
 * superblock.  An empty request is trivially satisfied.
 *
 * Return: 1 if the journal uses all of the given features, 0 otherwise
 * (including when the journal format does not support features at all).
 **/

int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
                                 unsigned long ro, unsigned long incompat)
{
        journal_superblock_t *sb;

        /* Nothing requested: nothing can be missing */
        if (!compat && !ro && !incompat)
                return 1;
        if (!jbd2_format_support_feature(journal))
                return 0;

        sb = journal->j_superblock;

        /* Each mask must be fully contained in the on-disk feature word */
        if ((be32_to_cpu(sb->s_feature_compat) & compat) != compat)
                return 0;
        if ((be32_to_cpu(sb->s_feature_ro_compat) & ro) != ro)
                return 0;
        if ((be32_to_cpu(sb->s_feature_incompat) & incompat) != incompat)
                return 0;

        return 1;
}

/**
 * jbd2_journal_check_available_features() - Check feature set in journalling layer
 * @journal: Journal to check.
 * @compat: bitmask of compatible features
 * @ro: bitmask of features that force read-only mount
 * @incompat: bitmask of incompatible features
 *
 * Check whether the journaling code knows every one of the given features
 * and the journal format can carry feature flags.  An empty request is
 * trivially satisfied.
 *
 * Return: 1 if all given features can be used on this journal, 0 otherwise.
 */

int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat,
                                      unsigned long ro, unsigned long incompat)
{
        /* Nothing requested: nothing can be unsupported */
        if (!compat && !ro && !incompat)
                return 1;

        if (!jbd2_format_support_feature(journal))
                return 0;

        /* Reject any bit that falls outside the known feature masks */
        if ((compat & JBD2_KNOWN_COMPAT_FEATURES) != compat)
                return 0;
        if ((ro & JBD2_KNOWN_ROCOMPAT_FEATURES) != ro)
                return 0;
        if ((incompat & JBD2_KNOWN_INCOMPAT_FEATURES) != incompat)
                return 0;

        return 1;
}

/*
 * Carve the fast commit area out of the end of the journal.
 *
 * Allocates the fast commit write buffer array and shrinks the regular
 * journal area (j_last, j_free, j_max_transaction_buffers) so that the
 * trailing blocks are reserved for fast commits.
 *
 * Returns 0 on success, -ENOSPC if the journal is too small to hold the
 * fast commit area plus JBD2_MIN_JOURNAL_BLOCKS, or -ENOMEM if the buffer
 * array cannot be allocated.
 */
static int
jbd2_journal_initialize_fast_commit(journal_t *journal)
{
        journal_superblock_t *sb = journal->j_superblock;
        unsigned long long num_fc_blks;

        num_fc_blks = jbd2_journal_get_num_fc_blks(sb);
        /*
         * Compare before subtracting: the subtraction is done in unsigned
         * arithmetic, so a fast commit area larger than the journal would
         * wrap around to a huge value and slip past the minimum-size check.
         */
        if (num_fc_blks > journal->j_last ||
            journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS)
                return -ENOSPC;

        /* Are we called twice? */
        WARN_ON(journal->j_fc_wbuf != NULL);
        journal->j_fc_wbuf = kmalloc_array(num_fc_blks,
                                sizeof(struct buffer_head *), GFP_KERNEL);
        if (!journal->j_fc_wbuf)
                return -ENOMEM;

        /*
         * The fast commit area takes the last num_fc_blks blocks of the
         * journal; the regular area now ends just before it.
         */
        journal->j_fc_wbufsize = num_fc_blks;
        journal->j_fc_last = journal->j_last;
        journal->j_last = journal->j_fc_last - num_fc_blks;
        journal->j_fc_first = journal->j_last + 1;
        journal->j_fc_off = 0;
        journal->j_free = journal->j_last - journal->j_first;
        journal->j_max_transaction_buffers =
                jbd2_journal_get_max_txn_bufs(journal);

        return 0;
}

/**
 * jbd2_journal_set_features() - Mark a given journal feature in the superblock
 * @journal: Journal to act on.
 * @compat: bitmask of compatible features
 * @ro: bitmask of features that force read-only mount
 * @incompat: bitmask of incompatible features
 *
 * Mark a given journal feature as present on the
 * superblock.  Returns true if the requested features could be set.
 *
 * Only the in-memory superblock is modified; this function does not write
 * it to disk.  A request for v2 checksums is upgraded to v3, and v1
 * checksums are dropped from the request when v3 is asked for as well.
 *
 * Return: 1 if all requested features are set (or were set already), 0 if
 * a feature is not available, fast commit could not be initialised, or the
 * crc32c driver could not be loaded.
 */

int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
                          unsigned long ro, unsigned long incompat)
{
/*
 * True when feature f is requested and not yet set in the superblock.
 * Both helpers read the local variables sb, compat and incompat.
 */
#define INCOMPAT_FEATURE_ON(f) \
                ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
#define COMPAT_FEATURE_ON(f) \
                ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
        journal_superblock_t *sb;

        /* Everything requested is already in use: nothing to do */
        if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
                return 1;

        if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
                return 0;

        /* If enabling v2 checksums, turn on v3 instead */
        if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
                incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
                incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
        }

        /* Asking for checksumming v3 and v1?  Only give them v3. */
        if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
            compat & JBD2_FEATURE_COMPAT_CHECKSUM)
                compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;

        jbd2_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
                  compat, ro, incompat);

        sb = journal->j_superblock;

        /*
         * Fast commit needs its area carved out of the journal first.
         * NOTE(review): if the checksum driver load below fails, the
         * journal geometry changed here is not rolled back in this
         * function - confirm whether callers rely on that.
         */
        if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) {
                if (jbd2_journal_initialize_fast_commit(journal)) {
                        pr_err("JBD2: Cannot enable fast commits.\n");
                        return 0;
                }
        }

        /* Load the checksum driver if necessary */
        if ((journal->j_chksum_driver == NULL) &&
            INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
                journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
                if (IS_ERR(journal->j_chksum_driver)) {
                        printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
                        /* Do not leave an error pointer behind */
                        journal->j_chksum_driver = NULL;
                        return 0;
                }
                /* Precompute checksum seed for all metadata */
                journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
                                                   sizeof(sb->s_uuid));
        }

        /* The feature words are updated under the superblock buffer lock */
        lock_buffer(journal->j_sb_buffer);

        /* If enabling v3 checksums, update superblock */
        if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
                sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
                sb->s_feature_compat &=
                        ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
        }

        /* If enabling v1 checksums, downgrade superblock */
        if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
                sb->s_feature_incompat &=
                        ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
                                     JBD2_FEATURE_INCOMPAT_CSUM_V3);

        sb->s_feature_compat    |= cpu_to_be32(compat);
        sb->s_feature_ro_compat |= cpu_to_be32(ro);
        sb->s_feature_incompat  |= cpu_to_be32(incompat);
        unlock_buffer(journal->j_sb_buffer);
        /* Recompute the cached revoke records per block for the new feature set */
        journal->j_revoke_records_per_block =
                                journal_revoke_records_per_block(journal);

        return 1;
#undef COMPAT_FEATURE_ON
#undef INCOMPAT_FEATURE_ON
}

/**
 * jbd2_journal_clear_features() - Clear a given journal feature in the
 *                                     superblock
 * @journal: Journal to act on.
 * @compat: bitmask of compatible features
 * @ro: bitmask of features that force read-only mount
 * @incompat: bitmask of incompatible features
 *
 * Remove the given feature bits from the in-memory journal superblock and
 * recompute the number of revoke records per block.
 */
void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
                                unsigned long ro, unsigned long incompat)
{
        journal_superblock_t *sb = journal->j_superblock;

        jbd2_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
                  compat, ro, incompat);

        /* Feature words are stored big-endian: convert masks first */
        sb->s_feature_compat    &= ~cpu_to_be32(compat);
        sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
        sb->s_feature_incompat  &= ~cpu_to_be32(incompat);

        journal->j_revoke_records_per_block =
                                journal_revoke_records_per_block(journal);
}
EXPORT_SYMBOL(jbd2_journal_clear_features);

/**
 * jbd2_journal_flush() - Flush journal
 * @journal: Journal to act on.
 * @flags: optional operation on the journal blocks after the flush (see below)
 *
 * Flush all data for a given journal to disk and empty the journal.
 * Filesystems can use this when remounting readonly to ensure that
 * recovery does not need to happen on remount. Optionally, a discard or zeroout
 * can be issued on the journal blocks after flushing.
 *
 * flags:
 *        JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks
 *        JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks
 *
 * Return: 0 on success, -EIO if the journal is aborted, or the error from
 * checkpointing, journal tail cleanup or the optional erase.
 */
int jbd2_journal_flush(journal_t *journal, unsigned int flags)
{
        transaction_t *txn = NULL;
        tid_t wait_tid = 0;
        int err = 0;

        /* Force everything buffered to the log... */
        write_lock(&journal->j_state_lock);
        if (journal->j_running_transaction) {
                txn = journal->j_running_transaction;
                __jbd2_log_start_commit(journal, txn->t_tid);
        } else if (journal->j_committing_transaction) {
                txn = journal->j_committing_transaction;
        }
        /* Sample the tid while the state lock is still held */
        if (txn)
                wait_tid = txn->t_tid;
        write_unlock(&journal->j_state_lock);

        /* Wait for the log commit to complete... */
        if (txn)
                jbd2_log_wait_commit(journal, wait_tid);

        /*
         * ...and flush everything in the log out to disk.  The list lock
         * is dropped around each checkpoint pass and retaken before the
         * list is looked at again.
         */
        spin_lock(&journal->j_list_lock);
        while (journal->j_checkpoint_transactions != NULL) {
                spin_unlock(&journal->j_list_lock);
                mutex_lock_io(&journal->j_checkpoint_mutex);
                err = jbd2_log_do_checkpoint(journal);
                mutex_unlock(&journal->j_checkpoint_mutex);
                spin_lock(&journal->j_list_lock);
                if (err)
                        break;
        }
        spin_unlock(&journal->j_list_lock);

        if (is_journal_aborted(journal))
                return -EIO;

        mutex_lock_io(&journal->j_checkpoint_mutex);
        if (!err) {
                err = jbd2_cleanup_journal_tail(journal);
                if (err < 0) {
                        mutex_unlock(&journal->j_checkpoint_mutex);
                        return err;
                }
                /* Non-negative results are not errors */
                err = 0;
        }

        /*
         * Finally, mark the journal as really needing no recovery.
         * This sets s_start==0 in the underlying superblock, which is
         * the magic code for a fully-recovered superblock.  Any future
         * commits of data to the journal will restore the current
         * s_start value.
         */
        jbd2_mark_journal_empty(journal, REQ_FUA);

        if (flags)
                err = __jbd2_journal_erase(journal, flags);

        mutex_unlock(&journal->j_checkpoint_mutex);

        /* The journal must be completely idle and empty at this point */
        write_lock(&journal->j_state_lock);
        J_ASSERT(!journal->j_running_transaction);
        J_ASSERT(!journal->j_committing_transaction);
        J_ASSERT(!journal->j_checkpoint_transactions);
        J_ASSERT(journal->j_head == journal->j_tail);
        J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
        write_unlock(&journal->j_state_lock);

        return err;
}

/**
 * jbd2_journal_wipe() - Wipe journal contents
 * @journal: Journal to act on.
 * @write: flag (see below)
 *
 * Wipe out all of the contents of a journal, safely.  This will produce
 * a warning if the journal contains any valid recovery information.
 * Must be called between journal_init_*() and jbd2_journal_load().
 *
 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
 * we merely suppress recovery.
 *
 * Return: 0 if the journal tail is unset (nothing to wipe), otherwise the
 * result of jbd2_journal_skip_recovery().
 */

int jbd2_journal_wipe(journal_t *journal, int write)
{
        const char *action;
        int err;

        J_ASSERT (!(journal->j_flags & JBD2_LOADED));

        /* No tail recorded: there is nothing to wipe */
        if (!journal->j_tail)
                return 0;

        action = write ? "Clearing" : "Ignoring";
        printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
                action);

        err = jbd2_journal_skip_recovery(journal);
        if (!write)
                return err;

        /* The mutex is held only to satisfy the callee's assertion */
        mutex_lock_io(&journal->j_checkpoint_mutex);
        jbd2_mark_journal_empty(journal, REQ_FUA);
        mutex_unlock(&journal->j_checkpoint_mutex);

        return err;
}

/**
 * jbd2_journal_abort () - Shutdown the journal immediately.
 * @journal: the journal to shutdown.
 * @errno:   an error number to record in the journal indicating
 *           the reason for the shutdown.
 *
 * Perform a complete, immediate shutdown of the ENTIRE
 * journal (not of a single transaction).  This operation cannot be
 * undone without closing and reopening the journal.
 *
 * The jbd2_journal_abort function is intended to support higher level error
 * recovery mechanisms such as the ext2/ext3 remount-readonly error
 * mode.
 *
 * Journal abort has very specific semantics.  Any existing dirty,
 * unjournaled buffers in the main filesystem will still be written to
 * disk by bdflush, but the journaling mechanism will be suspended
 * immediately and no further transaction commits will be honoured.
 *
 * Any dirty, journaled buffers will be written back to disk without
 * hitting the journal.  Atomicity cannot be guaranteed on an aborted
 * filesystem, but we _do_ attempt to leave as much data as possible
 * behind for fsck to use for cleanup.
 *
 * Any attempt to get a new transaction handle on a journal which is in
 * ABORT state will just result in an -EROFS error return.  A
 * jbd2_journal_stop on an existing handle will return -EIO if we have
 * entered abort state during the update.
 *
 * Recursive transactions are not disturbed by journal abort until the
 * final jbd2_journal_stop, which will receive the -EIO error.
 *
 * Finally, the jbd2_journal_abort call allows the caller to supply an errno
 * which will be recorded (if possible) in the journal superblock.  This
 * allows a client to record failure conditions in the middle of a
 * transaction without having to complete the transaction to record the
 * failure to disk.  ext3_error, for example, now uses this
 * functionality.
 *
 */

void jbd2_journal_abort(journal_t *journal, int errno)
{
        transaction_t *transaction;

        /*
         * Lock the aborting procedure until everything is done, this avoid
         * races between filesystem's error handling flow (e.g. ext4_abort()),
         * ensure panic after the error info is written into journal's
         * superblock.
         */
        mutex_lock(&journal->j_abort_mutex);
        /*
         * ESHUTDOWN always takes precedence because a file system check
         * caused by any other journal abort error is not required after
         * a shutdown triggered.
         *
         * So if the journal is already aborted, the only thing left to do
         * is to replace a previously recorded non-ESHUTDOWN errno with
         * -ESHUTDOWN and write that to the superblock.
         */
        write_lock(&journal->j_state_lock);
        if (journal->j_flags & JBD2_ABORT) {
                /* Sample the old errno while j_state_lock is still held */
                int old_errno = journal->j_errno;

                write_unlock(&journal->j_state_lock);
                if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) {
                        journal->j_errno = errno;
                        jbd2_journal_update_sb_errno(journal);
                }
                mutex_unlock(&journal->j_abort_mutex);
                return;
        }

        /*
         * Mark the abort as occurred and start current running transaction
         * to release all journaled buffer.
         */
        pr_err("Aborting journal on device %s.\n", journal->j_devname);

        /* First abort: flag and errno are set under j_state_lock */
        journal->j_flags |= JBD2_ABORT;
        journal->j_errno = errno;
        transaction = journal->j_running_transaction;
        if (transaction)
                __jbd2_log_start_commit(journal, transaction->t_tid);
        write_unlock(&journal->j_state_lock);

        /*
         * Record errno to the journal super block, so that fsck and jbd2
         * layer could realise that a filesystem check is needed.
         */
        jbd2_journal_update_sb_errno(journal);
        mutex_unlock(&journal->j_abort_mutex);
}

/**
 * jbd2_journal_errno() - returns the journal's error state.
 * @journal: journal to examine.
 *
 * This is the errno number set with jbd2_journal_abort(), the last
 * time the journal was mounted - if the journal was stopped
 * without calling abort this will be 0.
 *
 * If the journal has been aborted on this mount time -EROFS will
 * be returned.
 */
int jbd2_journal_errno(journal_t *journal)
{
        int err;

        /* Flag and errno are read together under the state lock */
        read_lock(&journal->j_state_lock);
        err = (journal->j_flags & JBD2_ABORT) ? -EROFS : journal->j_errno;
        read_unlock(&journal->j_state_lock);

        return err;
}

/**
 * jbd2_journal_clear_err() - clears the journal's error state
 * @journal: journal to act on.
 *
 * An error must be cleared or acked to take a FS out of readonly
 * mode.
 *
 * Return: 0 after resetting the journal errno, or -EROFS (leaving the
 * errno untouched) if the journal has been aborted.
 */
int jbd2_journal_clear_err(journal_t *journal)
{
        int err = -EROFS;

        write_lock(&journal->j_state_lock);
        if (!(journal->j_flags & JBD2_ABORT)) {
                journal->j_errno = 0;
                err = 0;
        }
        write_unlock(&journal->j_state_lock);

        return err;
}

/**
 * jbd2_journal_ack_err() - Ack journal err.
 * @journal: journal to act on.
 *
 * An error must be cleared or acked to take a FS out of readonly
 * mode.  JBD2_ACK_ERR is only raised when an errno is actually
 * recorded; the update is done under the write side of j_state_lock.
 */
void jbd2_journal_ack_err(journal_t *journal)
{
        write_lock(&journal->j_state_lock);
        if (journal->j_errno != 0)
                journal->j_flags = journal->j_flags | JBD2_ACK_ERR;
        write_unlock(&journal->j_state_lock);
}

/*
 * Number of filesystem blocks that fit in one page for @inode's
 * superblock, i.e. 2^(PAGE_SHIFT - s_blocksize_bits).
 * NOTE(review): assumes the block size never exceeds the page size,
 * otherwise the shift count would be negative - confirm with callers.
 */
int jbd2_journal_blocks_per_page(struct inode *inode)
{
        int shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;

        return 1 << shift;
}

/*
 * helper functions to deal with 32 or 64bit block numbers.
 *
 * journal_tag_bytes() returns the size in bytes of one block tag for
 * @journal, as selected by the csum3, csum2 and 64bit features.
 */
size_t journal_tag_bytes(journal_t *journal)
{
        size_t bytes = sizeof(journal_block_tag_t);

        /* csum3 uses its own tag structure; nothing else applies. */
        if (jbd2_has_feature_csum3(journal))
                return sizeof(journal_block_tag3_t);

        /* csum2 grows the tag by sizeof(__u16). */
        if (jbd2_has_feature_csum2(journal))
                bytes += sizeof(__u16);

        /* Without the 64bit feature the tag is sizeof(__u32) smaller. */
        if (!jbd2_has_feature_64bit(journal))
                bytes -= sizeof(__u32);

        return bytes;
}

/*
 * JBD memory management
 *
 * These functions are used to allocate block-sized chunks of memory
 * used for making copies of buffer_head data.  Very often it will be
 * page-sized chunks of data, but sometimes it will be in
 * sub-page-size chunks.  (For example, 16k pages on Power systems
 * with a 4k block file system.)  For blocks smaller than a page, we
 * use a SLAB allocator.  There are slab caches for each block size,
 * which are allocated at mount time, if necessary, and we only free
 * (all of) the slab caches when/if the jbd2 module is unloaded.  For
 * this reason we don't need a mutex to protect access to
 * jbd2_slab[] when allocating or releasing memory; one is needed only
 * in jbd2_journal_create_slab().
 */
/* Number of slots in jbd2_slab[] and jbd2_slab_names[]. */
#define JBD2_MAX_SLABS 8
/* Slot i is looked up as order_base_2(size) - 10 (see get_slab()). */
static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];

/* Cache name for each slot, in the same order as jbd2_slab[]. */
static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
        "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
        "jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
};


/* Destroy every slot of jbd2_slab[] and reset the slot to NULL. */
static void jbd2_journal_destroy_slabs(void)
{
        struct kmem_cache **slot = jbd2_slab;
        struct kmem_cache **end = jbd2_slab + JBD2_MAX_SLABS;

        for (; slot < end; slot++) {
                kmem_cache_destroy(*slot);
                *slot = NULL;
        }
}

/*
 * Make sure the slab cache used for @size-byte blocks exists.
 *
 * Nothing is created when @size equals PAGE_SIZE.  Creation is
 * serialised by a function-local mutex, and an already populated slot
 * is left as it is.
 *
 * Returns 0 on success (including "already there"), -EINVAL when @size
 * maps beyond the last slot, and -ENOMEM when the cache cannot be
 * created.
 */
static int jbd2_journal_create_slab(size_t size)
{
        static DEFINE_MUTEX(jbd2_slab_create_mutex);
        int idx = order_base_2(size) - 10;
        size_t bytes;

        if (size == PAGE_SIZE)
                return 0;

        if (idx >= JBD2_MAX_SLABS)
                return -EINVAL;

        /* Sizes that map below the first slot share slot 0. */
        if (unlikely(idx < 0))
                idx = 0;

        mutex_lock(&jbd2_slab_create_mutex);
        if (!jbd2_slab[idx]) {
                /* Object size and alignment are both the slot's size. */
                bytes = 1 << (idx+10);
                jbd2_slab[idx] = kmem_cache_create(jbd2_slab_names[idx], bytes,
                                                   bytes, 0, NULL);
                mutex_unlock(&jbd2_slab_create_mutex);
                if (jbd2_slab[idx])
                        return 0;

                printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
                return -ENOMEM;
        }
        mutex_unlock(&jbd2_slab_create_mutex);

        return 0;        /* Already created */
}

/*
 * Return the slab cache serving @size-byte blocks.  BUGs if @size maps
 * beyond the last slot or if the slot has not been created.
 */
static struct kmem_cache *get_slab(size_t size)
{
        int idx = order_base_2(size) - 10;
        struct kmem_cache *cache;

        BUG_ON(idx >= JBD2_MAX_SLABS);
        /* Sizes that map below the first slot share slot 0. */
        if (unlikely(idx < 0))
                idx = 0;

        cache = jbd2_slab[idx];
        BUG_ON(cache == NULL);
        return cache;
}

/*
 * Allocate a @size-byte chunk.  @size must be a power of two; chunks
 * smaller than a page come from the matching slab cache, anything else
 * from the page allocator.  The result (possibly NULL) is returned to
 * the caller.
 */
void *jbd2_alloc(size_t size, gfp_t flags)
{
        void *mem;

        /* Must be a power of 2 */
        BUG_ON(size & (size-1));

        if (size >= PAGE_SIZE)
                mem = (void *)__get_free_pages(flags, get_order(size));
        else
                mem = kmem_cache_alloc(get_slab(size), flags);

        /*
         * The chunk has to be aligned to its own size.  SLUB has got
         * this wrong in the past, which can corrupt user data.
         */
        BUG_ON(((unsigned long) mem) & (size-1));

        return mem;
}

/*
 * Release a chunk obtained from jbd2_alloc().  @size must be the size
 * that was passed to jbd2_alloc(): it selects between the slab cache
 * (smaller than a page) and the page allocator, mirroring the choice
 * made at allocation time.
 */
void jbd2_free(void *ptr, size_t size)
{
        if (size < PAGE_SIZE)
                kmem_cache_free(get_slab(size), ptr);
        else
                free_pages((unsigned long)ptr, get_order(size));
}

/*
 * Journal_head storage management
 */
/* Slab cache that backs every struct journal_head allocation. */
static struct kmem_cache *jbd2_journal_head_cache;
#ifdef CONFIG_JBD2_DEBUG
/* Debug-only count of live journal_heads; reported at module exit if non-zero. */
static atomic_t nr_journal_heads = ATOMIC_INIT(0);
#endif

/*
 * Create the journal_head slab cache.  Returns 0 on success or -ENOMEM
 * if the cache could not be created.
 */
static int __init jbd2_journal_init_journal_head_cache(void)
{
        J_ASSERT(!jbd2_journal_head_cache);

        jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
                                sizeof(struct journal_head),
                                0,                /* align */
                                SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
                                NULL);                /* ctor */
        if (jbd2_journal_head_cache)
                return 0;

        printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
        return -ENOMEM;
}

/* Destroy the journal_head slab cache and forget the pointer. */
static void jbd2_journal_destroy_journal_head_cache(void)
{
        struct kmem_cache *cache = jbd2_journal_head_cache;

        kmem_cache_destroy(cache);
        jbd2_journal_head_cache = NULL;
}

/*
 * journal_head splicing and dicing
 */
/*
 * Allocate a zeroed journal_head with its b_state_lock initialised.
 * A failed GFP_NOFS attempt is logged and retried with __GFP_NOFAIL.
 */
static struct journal_head *journal_alloc_journal_head(void)
{
        struct journal_head *jh;

#ifdef CONFIG_JBD2_DEBUG
        atomic_inc(&nr_journal_heads);
#endif
        jh = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
        if (!jh) {
                jbd2_debug(1, "out of memory for journal_head\n");
                pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
                jh = kmem_cache_zalloc(jbd2_journal_head_cache,
                                GFP_NOFS | __GFP_NOFAIL);
        }
        if (!jh)
                return jh;

        spin_lock_init(&jh->b_state_lock);
        return jh;
}

/*
 * Give @jh back to the journal_head cache.  Debug builds also drop the
 * live-object count and poison the memory first.
 */
static void journal_free_journal_head(struct journal_head *jh)
{
#ifdef CONFIG_JBD2_DEBUG
        atomic_dec(&nr_journal_heads);
        memset(jh, JBD2_POISON_FREE, sizeof(struct journal_head));
#endif
        kmem_cache_free(jbd2_journal_head_cache, jh);
}

/*
 * A journal_head is attached to a buffer_head whenever JBD has an
 * interest in the buffer.
 *
 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
 * is set.  This bit is tested in core kernel code where we need to take
 * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
 * there.
 *
 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
 *
 * When a buffer has its BH_JBD bit set it is immune from being released by
 * core kernel code, mainly via ->b_count.
 *
 * A journal_head is detached from its buffer_head when the journal_head's
 * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
 * transaction (b_cp_transaction) hold their references to b_jcount.
 *
 * Various places in the kernel want to attach a journal_head to a buffer_head
 * _before_ attaching the journal_head to a transaction.  To protect the
 * journal_head in this situation, jbd2_journal_add_journal_head elevates the
 * journal_head's b_jcount refcount by one.  The caller must call
 * jbd2_journal_put_journal_head() to undo this.
 *
 * So the typical usage would be:
 *
 *        (Attach a journal_head if needed.  Increments b_jcount)
 *        struct journal_head *jh = jbd2_journal_add_journal_head(bh);
 *        ...
 *      (Get another reference for transaction)
 *        jbd2_journal_grab_journal_head(bh);
 *        jh->b_transaction = xxx;
 *        (Put original reference)
 *        jbd2_journal_put_journal_head(jh);
 */

/*
 * Give a buffer_head a journal_head.
 *
 * May sleep.
 */
struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
{
        struct journal_head *jh;
        struct journal_head *new_jh = NULL;

repeat:
        /*
         * Allocate a candidate journal_head before taking the
         * journal_head lock, and only if the buffer does not appear to
         * have one yet.
         */
        if (!buffer_jbd(bh))
                new_jh = journal_alloc_journal_head();

        jbd_lock_bh_journal_head(bh);
        /* Re-check under the lock: the unlocked test above may be stale. */
        if (buffer_jbd(bh)) {
                jh = bh2jh(bh);
        } else {
                J_ASSERT_BH(bh,
                        (atomic_read(&bh->b_count) > 0) ||
                        (bh->b_folio && bh->b_folio->mapping));

                /*
                 * No journal_head attached and none pre-allocated
                 * (the unlocked check saw one attached): start over.
                 */
                if (!new_jh) {
                        jbd_unlock_bh_journal_head(bh);
                        goto repeat;
                }

                jh = new_jh;
                new_jh = NULL;                /* We consumed it */
                /* Link bh and jh to each other and pin bh via get_bh(). */
                set_buffer_jbd(bh);
                bh->b_private = jh;
                jh->b_bh = bh;
                get_bh(bh);
                BUFFER_TRACE(bh, "added journal_head");
        }
        /* Reference handed to the caller. */
        jh->b_jcount++;
        jbd_unlock_bh_journal_head(bh);
        /* A pre-allocated journal_head that was not needed is freed. */
        if (new_jh)
                journal_free_journal_head(new_jh);
        return bh->b_private;
}

/*
 * Take a reference on the journal_head attached to @bh, if there is
 * one.  Returns the journal_head with b_jcount raised, or NULL when the
 * buffer has no journal_head.
 */
struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
{
        struct journal_head *found;

        jbd_lock_bh_journal_head(bh);
        if (!buffer_jbd(bh)) {
                found = NULL;
        } else {
                found = bh2jh(bh);
                found->b_jcount++;
        }
        jbd_unlock_bh_journal_head(bh);

        return found;
}
EXPORT_SYMBOL(jbd2_journal_grab_journal_head);

/*
 * Detach the journal_head from @bh: clear bh->b_private, the
 * journal_head's back pointer and the buffer's jbd bit.  The
 * journal_head itself is not freed here.  The only caller in view
 * invokes this with the journal_head lock held.
 */
static void __journal_remove_journal_head(struct buffer_head *bh)
{
        struct journal_head *jh = bh2jh(bh);

        /* The journal_head must not belong to any transaction or list. */
        J_ASSERT_JH(jh, jh->b_transaction == NULL);
        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
        J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
        J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
        J_ASSERT_BH(bh, buffer_jbd(bh));
        J_ASSERT_BH(bh, jh2bh(jh) == bh);
        BUFFER_TRACE(bh, "remove journal_head");

        /* Unlink before dropping the lock */
        bh->b_private = NULL;
        jh->b_bh = NULL;        /* debug, really */
        clear_buffer_jbd(bh);
}

/*
 * Free @jh.  Any b_frozen_data / b_committed_data copy still attached
 * is freed too (with a warning); @b_size is the size those copies are
 * freed with.
 */
static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
{
        if (jh->b_frozen_data != NULL) {
                printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
                jbd2_free(jh->b_frozen_data, b_size);
        }

        if (jh->b_committed_data != NULL) {
                printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
                jbd2_free(jh->b_committed_data, b_size);
        }

        journal_free_journal_head(jh);
}

/*
 * Drop one reference on @jh.  When the count reaches zero the
 * journal_head is detached from its buffer_head and released, and the
 * buffer reference it held is dropped.
 */
void jbd2_journal_put_journal_head(struct journal_head *jh)
{
        struct buffer_head *bh = jh2bh(jh);

        jbd_lock_bh_journal_head(bh);
        J_ASSERT_JH(jh, jh->b_jcount > 0);
        --jh->b_jcount;
        if (jh->b_jcount) {
                /* Still referenced elsewhere; nothing more to do. */
                jbd_unlock_bh_journal_head(bh);
                return;
        }

        /* Last reference: unlink under the lock, free outside it. */
        __journal_remove_journal_head(bh);
        jbd_unlock_bh_journal_head(bh);
        journal_release_journal_head(jh, bh->b_size);
        __brelse(bh);
}
EXPORT_SYMBOL(jbd2_journal_put_journal_head);

/*
 * Initialize jbd inode head: bind @jinode to @inode and reset its
 * transaction pointers, flags, dirty range and list linkage.
 */
void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
{
        INIT_LIST_HEAD(&jinode->i_list);

        jinode->i_vfs_inode = inode;
        jinode->i_transaction = NULL;
        jinode->i_next_transaction = NULL;

        jinode->i_flags = 0;
        jinode->i_dirty_start = 0;
        jinode->i_dirty_end = 0;
}

/*
 * Function to be called before we start removing inode from memory (i.e.,
 * clear_inode() is a fine place to be called from). It removes inode from
 * transaction's lists.
 */
void jbd2_journal_release_jbd_inode(journal_t *journal,
                                    struct jbd2_inode *jinode)
{
        /* Nothing to do without a journal. */
        if (!journal)
                return;
restart:
        spin_lock(&journal->j_list_lock);
        /* A commit is writing out this inode - we have to wait for it. */
        if (jinode->i_flags & JI_COMMIT_RUNNING) {
                wait_queue_head_t *wq;
                DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
                wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
                /*
                 * Queue ourselves before dropping j_list_lock, then
                 * sleep; the flag is re-examined from scratch afterwards.
                 */
                prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
                spin_unlock(&journal->j_list_lock);
                schedule();
                finish_wait(wq, &wait.wq_entry);
                goto restart;
        }

        /* Unlink from the transaction's inode list, if on one. */
        if (jinode->i_transaction) {
                list_del(&jinode->i_list);
                jinode->i_transaction = NULL;
        }
        spin_unlock(&journal->j_list_lock);
}


#ifdef CONFIG_PROC_FS

#define JBD2_STATS_PROC_NAME "fs/jbd2"

/* Create the JBD2_STATS_PROC_NAME proc directory and remember it. */
static void __init jbd2_create_jbd_stats_proc_entry(void)
{
        proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);
}

/* Remove the proc directory again, provided it was created. */
static void __exit jbd2_remove_jbd_stats_proc_entry(void)
{
        if (!proc_jbd2_stats)
                return;

        remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
}

#else

/* Without procfs both helpers compile to nothing. */
#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)

#endif

/* Slab caches for jbd2_journal_handle and jbd2_inode objects. */
struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;

/* Create the jbd2_inode slab cache; 0 on success, -ENOMEM on failure. */
static int __init jbd2_journal_init_inode_cache(void)
{
        J_ASSERT(!jbd2_inode_cache);

        jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
        if (jbd2_inode_cache)
                return 0;

        pr_emerg("JBD2: failed to create inode cache\n");
        return -ENOMEM;
}

/* Create the handle slab cache; 0 on success, -ENOMEM on failure. */
static int __init jbd2_journal_init_handle_cache(void)
{
        J_ASSERT(!jbd2_handle_cache);

        jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
        if (jbd2_handle_cache)
                return 0;

        printk(KERN_EMERG "JBD2: failed to create handle cache\n");
        return -ENOMEM;
}

/* Destroy the jbd2_inode slab cache and forget the pointer. */
static void jbd2_journal_destroy_inode_cache(void)
{
        struct kmem_cache *cache = jbd2_inode_cache;

        kmem_cache_destroy(cache);
        jbd2_inode_cache = NULL;
}

/* Destroy the handle slab cache and forget the pointer. */
static void jbd2_journal_destroy_handle_cache(void)
{
        struct kmem_cache *cache = jbd2_handle_cache;

        kmem_cache_destroy(cache);
        jbd2_handle_cache = NULL;
}

/*
 * Module startup and shutdown
 */

/*
 * Create all jbd2 caches in a fixed order, stopping at the first
 * failure.  Returns 0 or the error of the step that failed; cleaning up
 * the caches that were already created is left to the caller.
 */
static int __init journal_init_caches(void)
{
        int err;

        err = jbd2_journal_init_revoke_record_cache();
        if (err)
                return err;

        err = jbd2_journal_init_revoke_table_cache();
        if (err)
                return err;

        err = jbd2_journal_init_journal_head_cache();
        if (err)
                return err;

        err = jbd2_journal_init_handle_cache();
        if (err)
                return err;

        err = jbd2_journal_init_inode_cache();
        if (err)
                return err;

        return jbd2_journal_init_transaction_cache();
}

/*
 * Tear down every cache set up by journal_init_caches(), plus the
 * per-block-size slabs.  journal_init() also calls this after a partial
 * initialisation failure.
 */
static void jbd2_journal_destroy_caches(void)
{
        jbd2_journal_destroy_revoke_record_cache();
        jbd2_journal_destroy_revoke_table_cache();
        jbd2_journal_destroy_journal_head_cache();
        jbd2_journal_destroy_handle_cache();
        jbd2_journal_destroy_inode_cache();
        jbd2_journal_destroy_transaction_cache();
        jbd2_journal_destroy_slabs();
}

/*
 * Module entry point: create the caches and, on success, the stats proc
 * directory.  On failure whatever was created is destroyed again and
 * the error is returned.
 */
static int __init journal_init(void)
{
        int err;

        /* The journal superblock structure must be exactly 1024 bytes. */
        BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);

        err = journal_init_caches();
        if (err) {
                jbd2_journal_destroy_caches();
                return err;
        }

        jbd2_create_jbd_stats_proc_entry();
        return 0;
}

/*
 * Module exit: report leaked journal_heads (debug builds only), then
 * remove the proc directory and destroy all caches.
 */
static void __exit journal_exit(void)
{
#ifdef CONFIG_JBD2_DEBUG
        int leaked = atomic_read(&nr_journal_heads);

        if (leaked != 0)
                printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", leaked);
#endif
        jbd2_remove_jbd_stats_proc_entry();
        jbd2_journal_destroy_caches();
}

MODULE_LICENSE("GPL");
module_init(journal_init);
module_exit(journal_exit);




































































































    1 











    1 




    1 

































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/proc/array.c
 *
 *  Copyright (C) 1992  by Linus Torvalds
 *  based on ideas by Darren Senn
 *
 * Fixes:
 * Michael. K. Johnson: stat,statm extensions.
 *                      <johnsonm@stolaf.edu>
 *
 * Pauline Middelink :  Made cmdline,envline only break at '\0's, to
 *                      make sure SET_PROCTITLE works. Also removed
 *                      bad '!' which forced address recalculation for
 *                      EVERY character on the current page.
 *                      <middelin@polyware.iaf.nl>
 *
 * Danny ter Haar    :        added cpuinfo
 *                        <dth@cistron.nl>
 *
 * Alessandro Rubini :  profile extension.
 *                      <rubini@ipvvis.unipv.it>
 *
 * Jeff Tranter      :  added BogoMips field to cpuinfo
 *                      <Jeff_Tranter@Mitel.COM>
 *
 * Bruno Haible      :  remove 4K limit for the maps file
 *                        <haible@ma2s2.mathematik.uni-karlsruhe.de>
 *
 * Yves Arrouye      :  remove removal of trailing spaces in get_array.
 *                        <Yves.Arrouye@marin.fdn.fr>
 *
 * Jerome Forissier  :  added per-CPU time information to /proc/stat
 *                      and /proc/<pid>/cpu extension
 *                      <forissier@isia.cma.fr>
 *                        - Incorporation and non-SMP safe operation
 *                        of forissier patch in 2.1.78 by
 *                        Hans Marcus <crowbar@concepts.nl>
 *
 * aeb@cwi.nl        :  /proc/partitions
 *
 *
 * Alan Cox             :  security fixes.
 *                        <alan@lxorguk.ukuu.org.uk>
 *
 * Al Viro           :  safe handling of mm_struct
 *
 * Gerhard Wichert   :  added BIGMEM support
 * Siemens AG           <Gerhard.Wichert@pdb.siemens.de>
 *
 * Al Viro & Jeff Garzik :  moved most of the thing into base.c and
 *                         :  proc_misc.c. The rest may eventually go into
 *                         :  base.c too.
 */

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/time_namespace.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/tty.h>
#include <linux/string.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/task.h>
#include <linux/sched/cputime.h>
#include <linux/proc_fs.h>
#include <linux/ioport.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/signal.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/times.h>
#include <linux/cpuset.h>
#include <linux/rcupdate.h>
#include <linux/delayacct.h>
#include <linux/seq_file.h>
#include <linux/pid_namespace.h>
#include <linux/prctl.h>
#include <linux/ptrace.h>
#include <linux/string_helpers.h>
#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>

#include <asm/processor.h>
#include "internal.h"

/*
 * Emit the name of task @p into @m.  Workqueue workers and kernel
 * threads get their names from dedicated helpers, every other task from
 * __get_task_comm().  With @escape set the name is written through
 * seq_escape_str(), otherwise verbatim.
 */
void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
{
        char name[64];

        /*
         * Workqueue workers are kernel threads too, so PF_WQ_WORKER
         * has to be tested ahead of PF_KTHREAD.
         */
        if (p->flags & PF_WQ_WORKER)
                wq_worker_comm(name, sizeof(name), p);
        else if (p->flags & PF_KTHREAD)
                get_kthread_comm(name, sizeof(name), p);
        else
                __get_task_comm(name, sizeof(name), p);

        if (!escape) {
                seq_printf(m, "%.64s", name);
                return;
        }

        seq_escape_str(m, name, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
}

/*
 * The task state array is a strange "bitmap" of
 * reasons to sleep. Thus "running" is zero, and
 * you can test for combinations of others with
 * simple bit tests.
 */
static const char * const task_state_array[] = {
        /*
         * Indexed by task_state_index(); get_task_state() checks at
         * build time that there are 1 + ilog2(TASK_REPORT_MAX) entries.
         */

        /* states in TASK_REPORT: */
        "R (running)",                /* 0x00 */
        "S (sleeping)",                /* 0x01 */
        "D (disk sleep)",        /* 0x02 */
        "T (stopped)",                /* 0x04 */
        "t (tracing stop)",        /* 0x08 */
        "X (dead)",                /* 0x10 */
        "Z (zombie)",                /* 0x20 */
        "P (parked)",                /* 0x40 */

        /* states beyond TASK_REPORT: */
        "I (idle)",                /* 0x80 */
};

/* Return the task_state_array[] string selected by task_state_index(@tsk). */
static inline const char *get_task_state(struct task_struct *tsk)
{
        /* Keep the table in step with the number of reportable states. */
        BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
        return task_state_array[task_state_index(tsk)];
}

/*
 * Write the identity part of a task status report into @m: umask,
 * state, the various ids, credentials, fd table size, supplementary
 * groups, per-namespace ids (CONFIG_PID_NS) and the kthread flag.
 * Pid numbers are translated relative to @ns, uids/gids relative to the
 * seq_file's user namespace.
 */
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *p)
{
        struct user_namespace *user_ns = seq_user_ns(m);
        struct group_info *group_info;
        int g, umask = -1;
        struct task_struct *tracer;
        const struct cred *cred;
        pid_t ppid, tpid = 0, tgid, ngid;
        unsigned int max_fds = 0;

        /* Gather everything that needs RCU / task_lock first, print later. */
        rcu_read_lock();
        ppid = pid_alive(p) ?
                task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;

        tracer = ptrace_parent(p);
        if (tracer)
                tpid = task_pid_nr_ns(tracer, ns);

        tgid = task_tgid_nr_ns(p, ns);
        ngid = task_numa_group_id(p);
        /* Takes a reference; dropped by put_cred() below. */
        cred = get_task_cred(p);

        task_lock(p);
        if (p->fs)
                umask = p->fs->umask;
        if (p->files)
                max_fds = files_fdtable(p->files)->max_fds;
        task_unlock(p);
        rcu_read_unlock();

        /* umask stays -1 (and the line is omitted) when the task has no fs. */
        if (umask >= 0)
                seq_printf(m, "Umask:\t%#04o\n", umask);
        seq_puts(m, "State:\t");
        seq_puts(m, get_task_state(p));

        seq_put_decimal_ull(m, "\nTgid:\t", tgid);
        seq_put_decimal_ull(m, "\nNgid:\t", ngid);
        seq_put_decimal_ull(m, "\nPid:\t", pid_nr_ns(pid, ns));
        seq_put_decimal_ull(m, "\nPPid:\t", ppid);
        seq_put_decimal_ull(m, "\nTracerPid:\t", tpid);
        /* Uid/Gid lines: real, effective, saved, filesystem. */
        seq_put_decimal_ull(m, "\nUid:\t", from_kuid_munged(user_ns, cred->uid));
        seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->euid));
        seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->suid));
        seq_put_decimal_ull(m, "\t", from_kuid_munged(user_ns, cred->fsuid));
        seq_put_decimal_ull(m, "\nGid:\t", from_kgid_munged(user_ns, cred->gid));
        seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->egid));
        seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid));
        seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid));
        seq_put_decimal_ull(m, "\nFDSize:\t", max_fds);

        seq_puts(m, "\nGroups:\t");
        group_info = cred->group_info;
        for (g = 0; g < group_info->ngroups; g++)
                seq_put_decimal_ull(m, g ? " " : "",
                                from_kgid_munged(user_ns, group_info->gid[g]));
        put_cred(cred);
        /* Trailing space shouldn't have been added in the first place. */
        seq_putc(m, ' ');

#ifdef CONFIG_PID_NS
        /* One column per namespace level from @ns down to the pid's own. */
        seq_puts(m, "\nNStgid:");
        for (g = ns->level; g <= pid->level; g++)
                seq_put_decimal_ull(m, "\t", task_tgid_nr_ns(p, pid->numbers[g].ns));
        seq_puts(m, "\nNSpid:");
        for (g = ns->level; g <= pid->level; g++)
                seq_put_decimal_ull(m, "\t", task_pid_nr_ns(p, pid->numbers[g].ns));
        seq_puts(m, "\nNSpgid:");
        for (g = ns->level; g <= pid->level; g++)
                seq_put_decimal_ull(m, "\t", task_pgrp_nr_ns(p, pid->numbers[g].ns));
        seq_puts(m, "\nNSsid:");
        for (g = ns->level; g <= pid->level; g++)
                seq_put_decimal_ull(m, "\t", task_session_nr_ns(p, pid->numbers[g].ns));
#endif
        seq_putc(m, '\n');

        seq_printf(m, "Kthread:\t%c\n", p->flags & PF_KTHREAD ? '1' : '0');
}

/*
 * Print @header followed by @set as hex digits, highest-numbered
 * signals first, and a newline.  Each digit packs four consecutive
 * signals, the lowest-numbered one in bit 0.
 */
void render_sigset_t(struct seq_file *m, const char *header,
                                sigset_t *set)
{
        int base = _NSIG;

        seq_puts(m, header);

        do {
                int nibble = 0;
                int bit;

                base -= 4;
                for (bit = 0; bit < 4; bit++) {
                        if (sigismember(set, base + bit + 1))
                                nibble |= 1 << bit;
                }
                seq_putc(m, hex_asc[nibble]);
        } while (base >= 4);

        seq_putc(m, '\n');
}

/*
 * Walk @p's signal action table and add each signal to @sigign when its
 * handler is SIG_IGN, or to @sigcatch when its handler is anything
 * other than SIG_IGN and SIG_DFL.
 */
static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *sigign,
                                    sigset_t *sigcatch)
{
        struct k_sigaction *actions = p->sighand->action;
        int sig;

        /* Signal numbers start at 1; table slots start at 0. */
        for (sig = 1; sig <= _NSIG; sig++) {
                struct k_sigaction *act = &actions[sig - 1];

                if (act->sa.sa_handler == SIG_IGN)
                        sigaddset(sigign, sig);
                else if (act->sa.sa_handler != SIG_DFL)
                        sigaddset(sigcatch, sig);
        }
}

/*
 * Write the signal part of a task status report into @m: thread count,
 * signal queue usage/limit and the pending, shared-pending, blocked,
 * ignored and caught signal sets.  If the sighand lock cannot be taken,
 * zero counts and empty sets are reported.
 */
static inline void task_sig(struct seq_file *m, struct task_struct *p)
{
        unsigned long flags;
        sigset_t pending, shpending, blocked, ignored, caught;
        int num_threads = 0;
        unsigned int qsize = 0;
        unsigned long qlim = 0;

        /* Defaults reported when lock_task_sighand() fails below. */
        sigemptyset(&pending);
        sigemptyset(&shpending);
        sigemptyset(&blocked);
        sigemptyset(&ignored);
        sigemptyset(&caught);

        /* Snapshot everything under the sighand lock; print afterwards. */
        if (lock_task_sighand(p, &flags)) {
                pending = p->pending.signal;
                shpending = p->signal->shared_pending.signal;
                blocked = p->blocked;
                collect_sigign_sigcatch(p, &ignored, &caught);
                num_threads = get_nr_threads(p);
                rcu_read_lock();  /* FIXME: is this correct? */
                qsize = get_rlimit_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
                rcu_read_unlock();
                qlim = task_rlimit(p, RLIMIT_SIGPENDING);
                unlock_task_sighand(p, &flags);
        }

        seq_put_decimal_ull(m, "Threads:\t", num_threads);
        seq_put_decimal_ull(m, "\nSigQ:\t", qsize);
        seq_put_decimal_ull(m, "/", qlim);

        /* render them all */
        render_sigset_t(m, "\nSigPnd:\t", &pending);
        render_sigset_t(m, "ShdPnd:\t", &shpending);
        render_sigset_t(m, "SigBlk:\t", &blocked);
        render_sigset_t(m, "SigIgn:\t", &ignored);
        render_sigset_t(m, "SigCgt:\t", &caught);
}

/* Print @header, then the capability set @a via seq_put_hex_ll(), then a newline. */
static void render_cap_t(struct seq_file *m, const char *header,
                        kernel_cap_t *a)
{
        seq_puts(m, header);
        /* NOTE(review): the 16 is presumably the hex digit width - confirm against seq_put_hex_ll(). */
        seq_put_hex_ll(m, NULL, a->val, 16);
        seq_putc(m, '\n');
}

/*
 * Write the five capability sets of @p into @m.  The sets are copied
 * out of the task's credentials inside an RCU read-side section and
 * printed after it ends.
 */
static inline void task_cap(struct seq_file *m, struct task_struct *p)
{
        kernel_cap_t inheritable;
        kernel_cap_t permitted;
        kernel_cap_t effective;
        kernel_cap_t bounding;
        kernel_cap_t ambient;
        const struct cred *cred;

        rcu_read_lock();
        cred = __task_cred(p);
        inheritable = cred->cap_inheritable;
        permitted = cred->cap_permitted;
        effective = cred->cap_effective;
        bounding = cred->cap_bset;
        ambient = cred->cap_ambient;
        rcu_read_unlock();

        render_cap_t(m, "CapInh:\t", &inheritable);
        render_cap_t(m, "CapPrm:\t", &permitted);
        render_cap_t(m, "CapEff:\t", &effective);
        render_cap_t(m, "CapBnd:\t", &bounding);
        render_cap_t(m, "CapAmb:\t", &ambient);
}

/* Text shown for a PR_SPEC_STORE_BYPASS state returned by arch_prctl_spec_ctrl_get(). */
static const char *task_ssb_status_text(int state)
{
        switch (state) {
        case -EINVAL:
                return "unknown";
        case PR_SPEC_NOT_AFFECTED:
                return "not vulnerable";
        case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
                return "thread force mitigated";
        case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
                return "thread mitigated";
        case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
                return "thread vulnerable";
        case PR_SPEC_DISABLE:
                return "globally mitigated";
        default:
                return "vulnerable";
        }
}

/* Text shown for a PR_SPEC_INDIRECT_BRANCH state returned by arch_prctl_spec_ctrl_get(). */
static const char *task_ib_status_text(int state)
{
        switch (state) {
        case -EINVAL:
                return "unsupported";
        case PR_SPEC_NOT_AFFECTED:
                return "not affected";
        case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
                return "conditional force disabled";
        case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
                return "conditional disabled";
        case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
                return "conditional enabled";
        case PR_SPEC_ENABLE:
                return "always enabled";
        case PR_SPEC_DISABLE:
                return "always disabled";
        default:
                return "unknown";
        }
}

/*
 * Write the NoNewPrivs value, the seccomp fields (when configured) and
 * the two speculation-control status lines of @p into @m.
 */
static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
{
        seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
#ifdef CONFIG_SECCOMP
        seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
#ifdef CONFIG_SECCOMP_FILTER
        seq_put_decimal_ull(m, "\nSeccomp_filters:\t",
                            atomic_read(&p->seccomp.filter_count));
#endif
#endif
        seq_puts(m, "\nSpeculation_Store_Bypass:\t");
        seq_puts(m, task_ssb_status_text(
                        arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)));

        seq_puts(m, "\nSpeculationIndirectBranch:\t");
        seq_puts(m, task_ib_status_text(
                        arch_prctl_spec_ctrl_get(p, PR_SPEC_INDIRECT_BRANCH)));
        seq_putc(m, '\n');
}

/* Print the voluntary and non-voluntary context switch counters of @p. */
static inline void task_context_switch_counts(struct seq_file *m,
                                                struct task_struct *p)
{
        unsigned long long voluntary = p->nvcsw;
        unsigned long long involuntary = p->nivcsw;

        seq_put_decimal_ull(m, "voluntary_ctxt_switches:\t", voluntary);
        seq_put_decimal_ull(m, "\nnonvoluntary_ctxt_switches:\t", involuntary);
        seq_putc(m, '\n');
}

/*
 * Print @task's cpus_mask twice: once with the bitmap format (%*pb) and
 * once with the list format (%*pbl).
 */
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
{
        const struct cpumask *allowed = &task->cpus_mask;

        seq_printf(m, "Cpus_allowed:\t%*pb\n", cpumask_pr_args(allowed));
        seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", cpumask_pr_args(allowed));
}

/* Print 1 when the task's signal struct has a core_state set, else 0. */
static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
{
        unsigned long long dumping = task->signal->core_state ? 1 : 0;

        seq_put_decimal_ull(m, "CoreDumping:\t", dumping);
        seq_putc(m, '\n');
}

/*
 * Print whether transparent huge pages are usable for @mm: the feature must
 * be configured in and MMF_DISABLE_THP must not be set in mm->flags.
 */
static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
{
        /* The flag is only inspected when THP support is built in. */
        bool usable = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
                      !test_bit(MMF_DISABLE_THP, &mm->flags);

        seq_printf(m, "THP_enabled:\t%d\n", usable);
}

/* Print the value returned by mm_untag_mask() for @mm in hexadecimal. */
static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm)
{
        unsigned long mask = mm_untag_mask(mm);

        seq_printf(m, "untag_mask:\t%#lx\n", mask);
}

/*
 * Default, empty implementation of the per-architecture status hook.
 * It is declared __weak so that another definition of the same symbol can
 * replace it at link time; this version prints nothing.
 */
__weak void arch_proc_pid_thread_features(struct seq_file *m,
                                          struct task_struct *task)
{
}

/*
 * Produce the multi-line status report for @task into @m.
 *
 * The memory-related sections are emitted only if a reference on the
 * task's mm could be obtained; that reference is dropped again as soon as
 * those sections have been written.  Always returns 0.
 */
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
{
        struct mm_struct *mm;

        mm = get_task_mm(task);

        seq_puts(m, "Name:\t");
        proc_task_name(m, task, true);
        seq_putc(m, '\n');

        task_state(m, ns, pid, task);

        if (mm) {
                /* Sections that need the address space. */
                task_mem(m, mm);
                task_core_dumping(m, task);
                task_thp_status(m, mm);
                task_untag_mask(m, mm);
                mmput(mm);
        }

        /* Sections that only need the task itself. */
        task_sig(m, task);
        task_cap(m, task);
        task_seccomp(m, task);
        task_cpus_allowed(m, task);
        cpuset_task_status_allowed(m, task);
        task_context_switch_counts(m, task);
        arch_proc_pid_thread_features(m, task);

        return 0;
}

/*
 * Write the single-line, space-separated "stat" record for @task into @m.
 *
 * @m:     seq_file receiving the output
 * @ns:    pid namespace used to translate every pid-like value
 * @pid:   pid whose number (in @ns) is printed as the first field
 * @task:  task being described
 * @whole: non-zero to report thread-group-wide values (fault counts, CPU
 *         times, group exit code); zero to report @task alone
 *
 * Fields that could reveal addresses or the exit code are only filled in
 * when the ptrace access check below succeeds; otherwise fixed placeholder
 * values are printed so the number of fields stays the same.
 *
 * Always returns 0.
 */
static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task, int whole)
{
        unsigned long vsize, eip, esp, wchan = 0;
        int priority, nice;
        int tty_pgrp = -1, tty_nr = 0;
        sigset_t sigign, sigcatch;
        char state;
        pid_t ppid = 0, pgid = -1, sid = -1;
        int num_threads = 0;
        int permitted;
        struct mm_struct *mm;
        unsigned long long start_time;
        unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt;
        u64 cutime, cstime, cgtime, utime, stime, gtime;
        unsigned long rsslim = 0;
        unsigned long flags;
        int exit_code = task->exit_code;
        struct signal_struct *sig = task->signal;
        unsigned int seq = 1;

        state = *get_task_state(task);
        vsize = eip = esp = 0;
        /* Decides below whether sensitive fields are shown or replaced. */
        permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT);
        /* Reference on the mm is held until the mmput() at the very end. */
        mm = get_task_mm(task);
        if (mm) {
                vsize = task_vsize(mm);
                /*
                 * esp and eip are intentionally zeroed out.  There is no
                 * non-racy way to read them without freezing the task.
                 * Programs that need reliable values can use ptrace(2).
                 *
                 * The only exception is if the task is core dumping because
                 * a program is not able to use ptrace(2) in that case. It is
                 * safe because the task has stopped executing permanently.
                 */
                if (permitted && (task->flags & (PF_EXITING|PF_DUMPCORE))) {
                        if (try_get_task_stack(task)) {
                                eip = KSTK_EIP(task);
                                esp = KSTK_ESP(task);
                                put_task_stack(task);
                        }
                }
        }

        sigemptyset(&sigign);
        sigemptyset(&sigcatch);

        /*
         * Gather tty, thread count, signal dispositions, RSS limit and the
         * session/parent/group ids under the sighand lock.  If the lock
         * cannot be taken, the defaults from the declarations are printed.
         */
        if (lock_task_sighand(task, &flags)) {
                if (sig->tty) {
                        struct pid *pgrp = tty_get_pgrp(sig->tty);
                        tty_pgrp = pid_nr_ns(pgrp, ns);
                        put_pid(pgrp);
                        tty_nr = new_encode_dev(tty_devnum(sig->tty));
                }

                num_threads = get_nr_threads(task);
                collect_sigign_sigcatch(task, &sigign, &sigcatch);

                rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);

                /* For a whole-group report prefer the group exit code. */
                if (whole) {
                        if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
                                exit_code = sig->group_exit_code;
                }

                sid = task_session_nr_ns(task, ns);
                ppid = task_tgid_nr_ns(task->real_parent, ns);
                pgid = task_pgrp_nr_ns(task, ns);

                unlock_task_sighand(task, &flags);
        }

        /* wchan is only a 0/1 flag here; see the comment further down. */
        if (permitted && (!whole || num_threads < 2))
                wchan = !task_is_running(task);

        /*
         * Read the accumulated counters protected by sig->stats_lock.  The
         * loop body is repeated for as long as need_seqretry() asks for it.
         */
        do {
                seq++; /* 2 on the 1st/lockless path, otherwise odd */
                flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);

                cmin_flt = sig->cmin_flt;
                cmaj_flt = sig->cmaj_flt;
                cutime = sig->cutime;
                cstime = sig->cstime;
                cgtime = sig->cgtime;

                if (whole) {
                        struct task_struct *t;

                        /* Start from the totals kept in the signal struct ... */
                        min_flt = sig->min_flt;
                        maj_flt = sig->maj_flt;
                        gtime = sig->gtime;

                        /* ... and add the contribution of every thread. */
                        rcu_read_lock();
                        __for_each_thread(sig, t) {
                                min_flt += t->min_flt;
                                maj_flt += t->maj_flt;
                                gtime += task_gtime(t);
                        }
                        rcu_read_unlock();
                }
        } while (need_seqretry(&sig->stats_lock, seq));
        done_seqretry_irqrestore(&sig->stats_lock, seq, flags);

        if (whole) {
                thread_group_cputime_adjusted(task, &utime, &stime);
        } else {
                /* Single-thread report: take the values from @task only. */
                task_cputime_adjusted(task, &utime, &stime);
                min_flt = task->min_flt;
                maj_flt = task->maj_flt;
                gtime = task_gtime(task);
        }

        /* scale priority and nice values from timeslices to -20..20 */
        /* to make it look like a "normal" Unix priority/nice value  */
        priority = task_prio(task);
        nice = task_nice(task);

        /* apply timens offset for boottime and convert nsec -> ticks */
        start_time =
                nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime));

        /* From here on the fields are emitted in their fixed order. */
        seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
        seq_puts(m, " (");
        proc_task_name(m, task, false);
        seq_puts(m, ") ");
        seq_putc(m, state);
        seq_put_decimal_ll(m, " ", ppid);
        seq_put_decimal_ll(m, " ", pgid);
        seq_put_decimal_ll(m, " ", sid);
        seq_put_decimal_ll(m, " ", tty_nr);
        seq_put_decimal_ll(m, " ", tty_pgrp);
        seq_put_decimal_ull(m, " ", task->flags);
        seq_put_decimal_ull(m, " ", min_flt);
        seq_put_decimal_ull(m, " ", cmin_flt);
        seq_put_decimal_ull(m, " ", maj_flt);
        seq_put_decimal_ull(m, " ", cmaj_flt);
        seq_put_decimal_ull(m, " ", nsec_to_clock_t(utime));
        seq_put_decimal_ull(m, " ", nsec_to_clock_t(stime));
        seq_put_decimal_ll(m, " ", nsec_to_clock_t(cutime));
        seq_put_decimal_ll(m, " ", nsec_to_clock_t(cstime));
        seq_put_decimal_ll(m, " ", priority);
        seq_put_decimal_ll(m, " ", nice);
        seq_put_decimal_ll(m, " ", num_threads);
        seq_put_decimal_ull(m, " ", 0);
        seq_put_decimal_ull(m, " ", start_time);
        seq_put_decimal_ull(m, " ", vsize);
        seq_put_decimal_ull(m, " ", mm ? get_mm_rss(mm) : 0);
        seq_put_decimal_ull(m, " ", rsslim);
        /* Code/stack addresses: real values only when permitted. */
        seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->start_code : 1) : 0);
        seq_put_decimal_ull(m, " ", mm ? (permitted ? mm->end_code : 1) : 0);
        seq_put_decimal_ull(m, " ", (permitted && mm) ? mm->start_stack : 0);
        seq_put_decimal_ull(m, " ", esp);
        seq_put_decimal_ull(m, " ", eip);
        /* The signal information here is obsolete.
         * It must be decimal for Linux 2.0 compatibility.
         * Use /proc/#/status for real-time signals.
         */
        seq_put_decimal_ull(m, " ", task->pending.signal.sig[0] & 0x7fffffffUL);
        seq_put_decimal_ull(m, " ", task->blocked.sig[0] & 0x7fffffffUL);
        seq_put_decimal_ull(m, " ", sigign.sig[0] & 0x7fffffffUL);
        seq_put_decimal_ull(m, " ", sigcatch.sig[0] & 0x7fffffffUL);

        /*
         * We used to output the absolute kernel address, but that's an
         * information leak - so instead we show a 0/1 flag here, to signal
         * to user-space whether there's a wchan field in /proc/PID/wchan.
         *
         * This works with older implementations of procps as well.
         */
        seq_put_decimal_ull(m, " ", wchan);

        seq_put_decimal_ull(m, " ", 0);
        seq_put_decimal_ull(m, " ", 0);
        seq_put_decimal_ll(m, " ", task->exit_signal);
        seq_put_decimal_ll(m, " ", task_cpu(task));
        seq_put_decimal_ull(m, " ", task->rt_priority);
        seq_put_decimal_ull(m, " ", task->policy);
        seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task));
        seq_put_decimal_ull(m, " ", nsec_to_clock_t(gtime));
        seq_put_decimal_ll(m, " ", nsec_to_clock_t(cgtime));

        /* Seven mm layout fields, or seven zeroes when hidden/unavailable. */
        if (mm && permitted) {
                seq_put_decimal_ull(m, " ", mm->start_data);
                seq_put_decimal_ull(m, " ", mm->end_data);
                seq_put_decimal_ull(m, " ", mm->start_brk);
                seq_put_decimal_ull(m, " ", mm->arg_start);
                seq_put_decimal_ull(m, " ", mm->arg_end);
                seq_put_decimal_ull(m, " ", mm->env_start);
                seq_put_decimal_ull(m, " ", mm->env_end);
        } else
                seq_puts(m, " 0 0 0 0 0 0 0");

        if (permitted)
                seq_put_decimal_ll(m, " ", exit_code);
        else
                seq_puts(m, " 0");

        seq_putc(m, '\n');
        /* Drop the reference taken by get_task_mm() above. */
        if (mm)
                mmput(mm);
        return 0;
}

/* Per-thread variant: report @task alone (do_task_stat() with whole == 0). */
int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
{
        const int whole = 0;

        return do_task_stat(m, ns, pid, task, whole);
}

/* Thread-group variant: aggregate over the group (whole == 1). */
int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
{
        const int whole = 1;

        return do_task_stat(m, ns, pid, task, whole);
}

/*
 * Write the seven-field memory summary line for @task.
 *
 * Without an mm a line of seven zeroes is written.  Otherwise the numbers
 * come from task_statm(); the fifth and seventh fields are always 0.
 * Always returns 0.
 */
int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
                        struct pid *pid, struct task_struct *task)
{
        unsigned long total;
        unsigned long rss = 0;
        unsigned long shr = 0;
        unsigned long txt = 0;
        unsigned long dat = 0;
        struct mm_struct *mm;

        mm = get_task_mm(task);
        if (!mm) {
                seq_write(m, "0 0 0 0 0 0 0\n", 14);
                return 0;
        }

        /* All values are collected before the mm reference is dropped. */
        total = task_statm(mm, &shr, &txt, &dat, &rss);
        mmput(mm);

        /*
         * Open-coded for speed; the resulting line is equivalent to
         * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
         *               size, resident, shared, text, data);
         */
        seq_put_decimal_ull(m, "", total);
        seq_put_decimal_ull(m, " ", rss);
        seq_put_decimal_ull(m, " ", shr);
        seq_put_decimal_ull(m, " ", txt);
        seq_put_decimal_ull(m, " ", 0);
        seq_put_decimal_ull(m, " ", dat);
        seq_put_decimal_ull(m, " ", 0);
        seq_putc(m, '\n');

        return 0;
}

#ifdef CONFIG_PROC_CHILDREN
/*
 * Find the child to report at position @pos of the children list of the
 * task behind @inode.
 *
 * @inode:    proc inode identifying the parent task
 * @pid_prev: pid returned by the previous step, or NULL on the first call
 * @pos:      zero-based index of the wanted child, used by the slow path
 *
 * The whole lookup runs with tasklist_lock held for reading.  On success
 * the returned pid carries a reference taken with get_pid(); NULL is
 * returned when the parent is gone or there is no further child.
 */
static struct pid *
get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
{
        struct task_struct *start, *task;
        struct pid *pid = NULL;

        read_lock(&tasklist_lock);

        start = pid_task(proc_pid(inode), PIDTYPE_PID);
        if (!start)
                goto out;

        /*
         * Lets try to continue searching first, this gives
         * us significant speedup on children-rich processes.
         */
        if (pid_prev) {
                task = pid_task(pid_prev, PIDTYPE_PID);
                /*
                 * The fast path is only valid if the previous task still
                 * exists, is still a child of start and is still linked
                 * on a sibling list.
                 */
                if (task && task->real_parent == start &&
                    !(list_empty(&task->sibling))) {
                        /* Previous child was the last one: nothing more. */
                        if (list_is_last(&task->sibling, &start->children))
                                goto out;
                        /* Otherwise step to the entry following it. */
                        task = list_first_entry(&task->sibling,
                                                struct task_struct, sibling);
                        pid = get_pid(task_pid(task));
                        goto out;
                }
        }

        /*
         * Slow search case.
         *
         * We might miss some children here if children
         * are exited while we were not holding the lock,
         * but it was never promised to be accurate that
         * much.
         *
         * "Just suppose that the parent sleeps, but N children
         *  exit after we printed their tids. Now the slow paths
         *  skips N extra children, we miss N tasks." (c)
         *
         * So one need to stop or freeze the leader and all
         * its children to get a precise result.
         */
        list_for_each_entry(task, &start->children, sibling) {
                /* Count down until the pos-th entry is reached. */
                if (pos-- == 0) {
                        pid = get_pid(task_pid(task));
                        break;
                }
        }

out:
        read_unlock(&tasklist_lock);
        return pid;
}

static int children_seq_show(struct seq_file *seq, void *v)
{
        struct inode *inode = file_inode(seq->file);

        seq_printf(seq, "%d ", pid_nr_ns(v, proc_pid_ns(inode->i_sb)));
        return 0;
}

/* Begin iteration: look up the child at *@pos with no previous pid. */
static void *children_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct inode *inode = file_inode(seq->file);

        return get_children_pid(inode, NULL, *pos);
}

/*
 * Advance the iteration: fetch the child after @v, release the reference
 * held on @v, bump *@pos and hand back the new pid (NULL at the end).
 */
static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct inode *inode = file_inode(seq->file);
        struct pid *next;

        /* @v must still be referenced while it serves as the hint. */
        next = get_children_pid(inode, v, *pos + 1);
        put_pid(v);

        *pos += 1;
        return next;
}

/* End of iteration: release the reference on the current pid, if any. */
static void children_seq_stop(struct seq_file *seq, void *v)
{
        struct pid *current_pid = v;

        put_pid(current_pid);
}

/* seq_file iterator callbacks for the children listing. */
static const struct seq_operations children_seq_ops = {
        .start = children_seq_start,
        .stop  = children_seq_stop,
        .next  = children_seq_next,
        .show  = children_seq_show,
};

/* Open handler: attach the children iterator to @file via seq_open(). */
static int children_seq_open(struct inode *inode, struct file *file)
{
        int ret = seq_open(file, &children_seq_ops);

        return ret;
}

/* File operations for the children listing; all but open are seq_file helpers. */
const struct file_operations proc_tid_children_operations = {
        .open    = children_seq_open,
        .release = seq_release,
        .read    = seq_read,
        .llseek  = seq_lseek,
};
#endif /* CONFIG_PROC_CHILDREN */




























































































































   12 





















   19 


   20 


























    1 









    1 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
// SPDX-License-Identifier: GPL-2.0
#include <linux/err.h>
#include <linux/bug.h>
#include <linux/atomic.h>
#include <linux/errseq.h>
#include <linux/log2.h>

/*
 * An errseq_t is a way of recording errors in one place, and allowing any
 * number of "subscribers" to tell whether it has changed since a previous
 * point where it was sampled.
 *
 * It's implemented as an unsigned 32-bit value. The low order bits are
 * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits
 * are used as a counter. This is done with atomics instead of locking so that
 * these functions can be called from any context.
 *
 * The general idea is for consumers to sample an errseq_t value. That value
 * can later be used to tell whether any new errors have occurred since that
 * sampling was done.
 *
 * Note that there is a risk of collisions if new errors are being recorded
 * frequently, since we have so few bits to use as a counter.
 *
 * To mitigate this, one bit is used as a flag to tell whether the value has
 * been sampled since a new value was recorded. That allows us to avoid bumping
 * the counter if no one has sampled it since the last time an error was
 * recorded.
 *
 * A new errseq_t should always be zeroed out.  An errseq_t value of all zeroes
 * is the special (but common) case where there has never been an error. An all
 * zero value thus serves as the "epoch" if one wishes to know whether there
 * has ever been an error set since it was first initialized.
 */

/*
 * The low bits are designated for error code (max of MAX_ERRNO).
 * ERRSEQ_SHIFT is the number of those bits, i.e. log2(MAX_ERRNO + 1).
 */
#define ERRSEQ_SHIFT                ilog2(MAX_ERRNO + 1)

/*
 * This bit is used as a flag to indicate whether the value has been seen.
 * It is the first bit above the error-code bits.
 */
#define ERRSEQ_SEEN                (1 << ERRSEQ_SHIFT)

/*
 * The lowest bit of the counter, which sits directly above the SEEN flag.
 * Adding this value to an errseq_t increments the counter by one.
 */
#define ERRSEQ_CTR_INC                (1 << (ERRSEQ_SHIFT + 1))

/**
 * errseq_set - set an errseq_t for later reporting
 * @eseq: errseq_t field that should be set
 * @err: error to set (must be between -1 and -MAX_ERRNO)
 *
 * This function sets the error in @eseq, and increments the sequence counter
 * if the last sequence was sampled at some point in the past.
 *
 * Any error set will always overwrite an existing error.
 *
 * Return: The previous value, primarily for debugging purposes. The
 * return value should not be used as a previously sampled value in later
 * calls as it will not have the SEEN flag set.
 */
errseq_t errseq_set(errseq_t *eseq, int err)
{
        errseq_t cur, old;

        /* MAX_ERRNO must be able to serve as a mask */
        BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1);

        /*
         * Take the starting snapshot.  It is also the value returned when
         * the range check below rejects @err.
         */
        old = READ_ONCE(*eseq);

        /*
         * Ensure the error code actually fits where we want it to go. If it
         * doesn't then just throw a warning and don't record anything. We
         * also don't accept zero here as that would effectively clear a
         * previous error.
         */
        if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO),
                                "err = %d\n", err))
                return old;

        /* Lockless update: recompute and retry until the value is in place. */
        for (;;) {
                errseq_t new;

                /* Clear out error bits and set new error */
                new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err;

                /* Only increment if someone has looked at it */
                if (old & ERRSEQ_SEEN)
                        new += ERRSEQ_CTR_INC;

                /* If there would be no change, then call it done */
                if (new == old) {
                        cur = new;
                        break;
                }

                /* Try to swap the new value into place */
                cur = cmpxchg(eseq, old, new);

                /*
                 * Call it success if we did the swap or someone else beat us
                 * to it for the same value.
                 */
                if (likely(cur == old || cur == new))
                        break;

                /* Raced with an update, try again */
                old = cur;
        }
        return cur;
}
EXPORT_SYMBOL(errseq_set);

/**
 * errseq_sample() - Grab current errseq_t value.
 * @eseq: Pointer to errseq_t to be sampled.
 *
 * This function allows callers to initialise their errseq_t variable.
 * If the error has been "seen", new callers will not see an old error.
 * If there is an unseen error in @eseq, the caller of this function will
 * see it the next time it checks for an error.
 *
 * Context: Any context.
 * Return: The current errseq value.
 */
errseq_t errseq_sample(errseq_t *eseq)
{
        errseq_t snapshot = READ_ONCE(*eseq);

        /*
         * An error nobody has seen yet must still be reported to this
         * caller, so hand back 0 as the sample point in that case.
         */
        return (snapshot & ERRSEQ_SEEN) ? snapshot : 0;
}
EXPORT_SYMBOL(errseq_sample);

/**
 * errseq_check() - Has an error occurred since a particular sample point?
 * @eseq: Pointer to errseq_t value to be checked.
 * @since: Previously-sampled errseq_t from which to check.
 *
 * Grab the value that eseq points to, and see if it has changed @since
 * the given value was sampled. The @since value is not advanced, so there
 * is no need to mark the value as seen.
 *
 * Return: The latest error set in the errseq_t or 0 if it hasn't changed.
 */
int errseq_check(errseq_t *eseq, errseq_t since)
{
        errseq_t now = READ_ONCE(*eseq);

        /* A changed value is the rare case; report its error portion. */
        if (unlikely(now != since))
                return -(now & MAX_ERRNO);

        return 0;
}
EXPORT_SYMBOL(errseq_check);

/**
 * errseq_check_and_advance() - Check an errseq_t and advance to current value.
 * @eseq: Pointer to value being checked and reported.
 * @since: Pointer to previously-sampled errseq_t to check against and advance.
 *
 * Grab the eseq value, and see whether it matches the value that @since
 * points to. If it does, then just return 0.
 *
 * If it doesn't, then the value has changed. Set the "seen" flag, and try to
 * swap it into place as the new eseq value. Then, set that value as the new
 * "since" value, and return whatever the error portion is set to.
 *
 * Note that no locking is provided here for concurrent updates to the "since"
 * value. The caller must provide that if necessary. Because of this, callers
 * may want to do a lockless errseq_check before taking the lock and calling
 * this.
 *
 * Return: Negative errno if one has been stored, or 0 if no new error has
 * occurred.
 */
int errseq_check_and_advance(errseq_t *eseq, errseq_t *since)
{
        errseq_t cur, marked;

        /*
         * Callers normally go through the inline wrapper first, so the
         * common "nothing changed" case is handled without taking whatever
         * lock protects the "since" value.
         */
        cur = READ_ONCE(*eseq);
        if (cur == *since)
                return 0;

        /*
         * The value moved on.  Mark it as seen and try to publish that.
         *
         * The result of the swap is deliberately ignored: if it fails,
         * either a writer changed the value (bumped the counter or reset
         * the error) or another reader set the "seen" flag already.  Both
         * are fine, and "since" is advanced based on what was read here.
         */
        marked = cur | ERRSEQ_SEEN;
        if (marked != cur)
                cmpxchg(eseq, cur, marked);

        *since = marked;
        return -(marked & MAX_ERRNO);
}
EXPORT_SYMBOL(errseq_check_and_advance);
























































































































































    5 















    5 

















    5 






























































































    3 











    4 

    3 























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_EXTENT_IO_H
#define BTRFS_EXTENT_IO_H

#include <linux/rbtree.h>
#include <linux/refcount.h>
#include <linux/fiemap.h>
#include <linux/btrfs_tree.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/rwsem.h>
#include <linux/list.h>
#include <linux/slab.h>
#include "compression.h"
#include "messages.h"
#include "ulist.h"
#include "misc.h"

struct page;
struct file;
struct folio;
struct inode;
struct fiemap_extent_info;
struct readahead_control;
struct address_space;
struct writeback_control;
struct extent_io_tree;
struct extent_map_tree;
struct extent_state;
struct btrfs_block_group;
struct btrfs_fs_info;
struct btrfs_inode;
struct btrfs_root;
struct btrfs_trans_handle;
struct btrfs_tree_parent_check;

/*
 * Extent buffer state flags.  The enumerators take consecutive values
 * starting at 0.
 * NOTE(review): presumably these are bit numbers within
 * extent_buffer::bflags - confirm against the users.
 */
enum {
        EXTENT_BUFFER_UPTODATE,
        EXTENT_BUFFER_DIRTY,
        EXTENT_BUFFER_CORRUPT,
        /* this got triggered by readahead */
        EXTENT_BUFFER_READAHEAD,
        EXTENT_BUFFER_TREE_REF,
        EXTENT_BUFFER_STALE,
        EXTENT_BUFFER_WRITEBACK,
        /* read IO error */
        EXTENT_BUFFER_READ_ERR,
        EXTENT_BUFFER_UNMAPPED,
        EXTENT_BUFFER_IN_TREE,
        /* write IO error */
        EXTENT_BUFFER_WRITE_ERR,
        /* Indicate the extent buffer is written zeroed out (for zoned) */
        EXTENT_BUFFER_ZONED_ZEROOUT,
        /* Indicate that extent buffer pages are being read */
        EXTENT_BUFFER_READING,
};

/*
 * These are flags for __process_pages_contig.
 * NOTE(review): ENUM_BIT() is defined outside this file; presumably it gives
 * each name its own bit so the flags can be OR-ed together - confirm.
 */
enum {
        ENUM_BIT(PAGE_UNLOCK),
        /* Page starts writeback, clear dirty bit and set writeback bit */
        ENUM_BIT(PAGE_START_WRITEBACK),
        ENUM_BIT(PAGE_END_WRITEBACK),
        ENUM_BIT(PAGE_SET_ORDERED),
};

/*
 * Folio private values.  Every page that is controlled by the extent map has
 * folio private set to this value.
 */
#define EXTENT_FOLIO_PRIVATE                        1

/*
 * The extent buffer bitmap operations are done with byte granularity instead of
 * word granularity for two reasons:
 * 1. The bitmaps must be little-endian on disk.
 * 2. Bitmap items are not guaranteed to be aligned to a word and therefore a
 *    single word in a bitmap may straddle two pages in the extent buffer.
 */
/* Index of the byte that contains bit number @nr. */
#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
/* Mask with every bit of a single byte set. */
#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
/*
 * Mask for the first byte of a range: selects the bits from position
 * (@start modulo BITS_PER_BYTE) up to the top bit of the byte.
 */
#define BITMAP_FIRST_BYTE_MASK(start) \
        ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
/*
 * Mask for the last byte of a range of @nbits bits: selects the low
 * (@nbits modulo BITS_PER_BYTE) bits, or the whole byte when @nbits is a
 * multiple of BITS_PER_BYTE.
 */
#define BITMAP_LAST_BYTE_MASK(nbits) \
        (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))


int __init extent_buffer_init_cachep(void);
void __cold extent_buffer_free_cachep(void);

/*
 * Number of PAGE_SIZE units in the largest metadata block; this is the
 * capacity of extent_buffer::folios[].
 */
#define INLINE_EXTENT_BUFFER_PAGES     (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
/*
 * State of one extent buffer, including the folios that back it.
 */
struct extent_buffer {
        /*
         * Added to in-buffer offsets by get_eb_offset_in_folio().
         * NOTE(review): presumably the logical byte address of the buffer -
         * confirm.
         */
        u64 start;
        /* NOTE(review): presumably the buffer length in bytes - confirm. */
        u32 len;
        /*
         * Used as a mask (folio_size - 1) by offset_in_eb_folio(), which
         * asserts that it is non-zero.
         */
        u32 folio_size;
        unsigned long bflags;
        struct btrfs_fs_info *fs_info;

        /*
         * The address where the eb can be accessed without any cross-page handling.
         * This can be NULL if not possible.
         */
        void *addr;

        spinlock_t refs_lock;
        atomic_t refs;
        int read_mirror;
        /* >= 0 if eb belongs to a log tree, -1 otherwise */
        s8 log_index;
        /* Right-shift that turns an in-buffer offset into a folios[] index. */
        u8 folio_shift;
        struct rcu_head rcu_head;

        struct rw_semaphore lock;

        /*
         * Pointers to all the folios of the extent buffer.
         *
         * For now the folio is always order 0 (aka, a single page).
         */
        struct folio *folios[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
        /* The following members exist only in CONFIG_BTRFS_DEBUG builds. */
        struct list_head leak_list;
        pid_t lock_owner;
#endif
};

/*
 * Bundles a writeback control, an extent buffer and (for zoned mode) the
 * block group that buffer resides in.
 * NOTE(review): presumably passed along the extent buffer write path -
 * confirm against the users.
 */
struct btrfs_eb_write_context {
        struct writeback_control *wbc;
        struct extent_buffer *eb;
        /* Block group @eb resides in. Only used for zoned mode. */
        struct btrfs_block_group *zoned_bg;
};

/*
 * Reduce @start to an offset within one folio of @eb by masking it with
 * (folio_size - 1).  The folio size must already be set (non-zero).
 */
static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb,
                                               u64 start)
{
        u32 low_bits;

        ASSERT(eb->folio_size);
        low_bits = eb->folio_size - 1;

        return start & low_bits;
}

/*
 * Translate an offset inside an extent buffer into an offset inside the
 * folio backing it.
 *
 * @eb:         target extent buffer
 * @offset:     offset inside the extent buffer
 *
 * Covers both the sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
 */
static inline size_t get_eb_offset_in_folio(const struct extent_buffer *eb,
                                            unsigned long offset)
{
        struct folio *first_folio = eb->folios[0];

        /*
         * Why eb->start is always added:
         *
         * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE
         *    Whether the eb is backed by one large folio or by several page
         *    sized folios, eb->start is aligned to the folio size, so adding
         *    it does not change the result.
         *
         * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE
         *    There is a single page sized folio which may hold several
         *    extent buffers, so eb->start is needed to find where this
         *    eb's data lives inside the folio.
         */
        return offset_in_folio(first_folio, offset + eb->start);
}

/*
 * Return the index into eb->folios[] of the folio that holds @offset
 * (an offset inside the extent buffer).
 */
static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb,
                                               unsigned long offset)
{
        unsigned int shift = eb->folio_shift;

        /*
         * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE
         *    - One large folio covering the whole eb: folio_shift is large
         *      enough that the result is always 0.
         *    - Several page sized folios: folio_shift is PAGE_SHIFT, which
         *      yields the right index.
         *
         * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE
         *    The single folio is page sized, so the result is always 0.
         */
        return offset >> shift;
}

/*
 * Records how many bytes, and which ranges, were set/cleared by an operation.
 *
 * Initialize with extent_changeset_init() (or allocate with
 * extent_changeset_alloc()); tear down with extent_changeset_release() or
 * extent_changeset_free().
 */
struct extent_changeset {
        /* How many bytes are set/cleared in this operation */
        u64 bytes_changed;

        /* Changed ranges */
        struct ulist range_changed;
};

/* Reset the byte counter of @changeset and initialize its range list. */
static inline void extent_changeset_init(struct extent_changeset *changeset)
{
        struct ulist *ranges = &changeset->range_changed;

        changeset->bytes_changed = 0;
        ulist_init(ranges);
}

/*
 * Allocate (GFP_KERNEL) and initialize a changeset.
 *
 * Returns the new changeset, or NULL if the allocation failed.
 */
static inline struct extent_changeset *extent_changeset_alloc(void)
{
        struct extent_changeset *changeset = kmalloc(sizeof(*changeset), GFP_KERNEL);

        if (changeset)
                extent_changeset_init(changeset);

        return changeset;
}

/*
 * Zero the byte counter and release the range list of @changeset.
 * The changeset structure itself is not freed. A NULL @changeset is ignored.
 */
static inline void extent_changeset_release(struct extent_changeset *changeset)
{
        if (changeset) {
                changeset->bytes_changed = 0;
                ulist_release(&changeset->range_changed);
        }
}

/*
 * Release the contents of @changeset and then free the structure itself.
 * A NULL @changeset is ignored.
 */
static inline void extent_changeset_free(struct extent_changeset *changeset)
{
        if (changeset) {
                extent_changeset_release(changeset);
                kfree(changeset);
        }
}

/*
 * Page release helpers. Declared only; behavior is defined elsewhere.
 */
bool try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);

/*
 * Data and btree read/write entry points (folio read, writepages, readahead,
 * fiemap). Declared only; see the definitions for exact semantics.
 */
int btrfs_read_folio(struct file *file, struct folio *folio);
void extent_write_locked_range(struct inode *inode, struct page *locked_page,
                               u64 start, u64 end, struct writeback_control *wbc,
                               bool pages_dirty);
int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
                            struct writeback_control *wbc);
void btrfs_readahead(struct readahead_control *rac);
int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
                  u64 start, u64 len);
int set_folio_extent_mapped(struct folio *folio);
int set_page_extent_mapped(struct page *page);
void clear_page_extent_mapped(struct page *page);

/*
 * Extent buffer allocation, lookup and release. Declared only.
 */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
                                          u64 start, u64 owner_root, int level);
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
                                                  u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
                                                u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
                                         u64 start);
void free_extent_buffer(struct extent_buffer *eb);
void free_extent_buffer_stale(struct extent_buffer *eb);
/*
 * Wait modes; presumably the values accepted by the @wait argument of
 * read_extent_buffer_pages() below - confirm against the definition.
 */
#define WAIT_NONE        0
#define WAIT_COMPLETE        1
#define WAIT_PAGE_LOCK        2
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
                             struct btrfs_tree_parent_check *parent_check);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
/* Tree block readahead helpers. Declared only. */
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
                                u64 bytenr, u64 owner_root, u64 gen, int level);
void btrfs_readahead_node_child(struct extent_buffer *node, int slot);

/* Return the number of pages spanned by @eb; never less than one. */
static inline int num_extent_pages(const struct extent_buffer *eb)
{
        unsigned int nr_pages = eb->len >> PAGE_SHIFT;

        /*
         * With sectorsize == PAGE_SIZE the nodesize is always aligned to the
         * sectorsize, so the shift alone gives the page count.
         *
         * With sectorsize < PAGE_SIZE the nodesize may be smaller than
         * PAGE_SIZE, making the shift yield 0; report one page in that case.
         */
        return nr_pages ? nr_pages : 1;
}

/*
 * Return the number of folios backing @eb.
 *
 * This is only known at runtime, by looking at eb::folios[0]: the eb is
 * backed either by one folio covering all of it (nodesize <= PAGE_SIZE, or a
 * high order folio), or by several single-page folios.
 */
static inline int num_extent_folios(const struct extent_buffer *eb)
{
        /* Order 0 first folio: one folio per page. */
        if (!folio_order(eb->folios[0]))
                return num_extent_pages(eb);

        /* A higher order first folio covers the whole eb. */
        return 1;
}

/* Return non-zero if the EXTENT_BUFFER_UPTODATE bit is set in eb->bflags. */
static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
{
        const unsigned long *flags = &eb->bflags;

        return test_bit(EXTENT_BUFFER_UPTODATE, flags);
}

/*
 * Byte-range accessors for extent buffer contents (compare, read, read to
 * user space, write). @start and @len are presumably a byte offset and
 * length inside the buffer - confirm against the definitions.
 */
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
                         unsigned long start, unsigned long len);
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
                        unsigned long start,
                        unsigned long len);
int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
                                       void __user *dst, unsigned long start,
                                       unsigned long len);
void write_extent_buffer(const struct extent_buffer *eb, const void *src,
                         unsigned long start, unsigned long len);

/*
 * Write BTRFS_FSID_SIZE bytes from @chunk_tree_uuid into @eb at the offset of
 * btrfs_header::chunk_tree_uuid.
 */
static inline void write_extent_buffer_chunk_tree_uuid(
                const struct extent_buffer *eb, const void *chunk_tree_uuid)
{
        const unsigned long uuid_offset =
                offsetof(struct btrfs_header, chunk_tree_uuid);

        write_extent_buffer(eb, chunk_tree_uuid, uuid_offset, BTRFS_FSID_SIZE);
}

/*
 * Write BTRFS_FSID_SIZE bytes from @fsid into @eb at the offset of
 * btrfs_header::fsid.
 */
static inline void write_extent_buffer_fsid(const struct extent_buffer *eb,
                                            const void *fsid)
{
        const unsigned long fsid_offset = offsetof(struct btrfs_header, fsid);

        write_extent_buffer(eb, fsid, fsid_offset, BTRFS_FSID_SIZE);
}

/*
 * Copy/move/zero helpers operating on extent buffer contents. Declared only;
 * the offsets and lengths are presumably in bytes - confirm.
 */
void copy_extent_buffer_full(const struct extent_buffer *dst,
                             const struct extent_buffer *src);
void copy_extent_buffer(const struct extent_buffer *dst,
                        const struct extent_buffer *src,
                        unsigned long dst_offset, unsigned long src_offset,
                        unsigned long len);
void memcpy_extent_buffer(const struct extent_buffer *dst,
                          unsigned long dst_offset, unsigned long src_offset,
                          unsigned long len);
void memmove_extent_buffer(const struct extent_buffer *dst,
                           unsigned long dst_offset, unsigned long src_offset,
                           unsigned long len);
void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
                           unsigned long len);
/*
 * Bitmap helpers for a bitmap stored inside an extent buffer. Presumably
 * @start locates the bitmap and @pos/@len are bit positions - confirm.
 */
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
                           unsigned long pos);
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
                              unsigned long pos, unsigned long len);
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
                                unsigned long start, unsigned long pos,
                                unsigned long len);
/* State changes (dirty / uptodate) and range helpers. Declared only. */
void set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                                  struct page *locked_page,
                                  struct extent_state **cached,
                                  u32 bits_to_clear, unsigned long page_ops);
int extent_invalidate_folio(struct extent_io_tree *tree,
                            struct folio *folio, size_t offset);
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
                              struct extent_buffer *buf);

/* Bulk page/folio array allocators. Declared only. */
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
                           gfp_t extra_gfp);
int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
                            gfp_t extra_gfp);

/* Only declared when the sanity tests are configured in */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
                             struct page *locked_page, u64 *start,
                             u64 *end);
#endif
/* Note: declared unconditionally, outside the sanity test #ifdef above */
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
                                               u64 start);

/* The leak check compiles to an empty statement without CONFIG_BTRFS_DEBUG */
#ifdef CONFIG_BTRFS_DEBUG
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info);
#else
#define btrfs_extent_buffer_leak_debug_check(fs_info)        do {} while (0)
#endif

#endif

























































































































































































































































































































































































































































































































    5 











































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#ifndef BTRFS_CTREE_H
#define BTRFS_CTREE_H

#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/mutex.h>
#include <linux/wait.h>
#include <linux/list.h>
#include <linux/atomic.h>
#include <linux/xarray.h>
#include <linux/refcount.h>
#include <uapi/linux/btrfs_tree.h>
#include "locking.h"
#include "fs.h"
#include "accessors.h"
#include "extent-io-tree.h"

struct extent_buffer;
struct btrfs_block_rsv;
struct btrfs_trans_handle;
struct btrfs_block_group;

/* Readahead modes stored in struct btrfs_path::reada */
enum {
        /* Presumably: no readahead (the zero value) */
        READA_NONE,
        /* Presumably: readahead towards lower keys - confirm */
        READA_BACK,
        /* Presumably: readahead towards higher keys - confirm */
        READA_FORWARD,
        /*
         * Similar to READA_FORWARD but unlike it:
         *
         * 1) It will trigger readahead even for leaves that are not close to
         *    each other on disk;
         * 2) It also triggers readahead for nodes;
         * 3) During a search, even when a node or leaf is already in memory, it
         *    will still trigger readahead for other nodes and leaves that follow
         *    it.
         *
         * This is meant to be used only when we know we are iterating over the
         * entire tree or a very large part of it.
         */
        READA_FORWARD_ALWAYS,
};

/*
 * A btrfs_path remembers the path taken from the root down to the leaf.
 * Level 0 is always the leaf, and the higher entries of nodes[] point to
 * any other levels that are present.
 *
 * The slots array records the index of the item or block pointer
 * used while walking the tree.
 */
struct btrfs_path {
        struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
        int slots[BTRFS_MAX_LEVEL];
        /* If there is real range locking, this locks field will change */
        u8 locks[BTRFS_MAX_LEVEL];
        /* One of the READA_* values */
        u8 reada;
        /* Keep some upper locks as we walk down */
        u8 lowest_level;

        /*
         * Set by btrfs_split_item, tells search_slot to keep all locks
         * and to force calls to keep space in the nodes
         */
        unsigned int search_for_split:1;
        unsigned int keep_locks:1;
        unsigned int skip_locking:1;
        unsigned int search_commit_root:1;
        unsigned int need_commit_sem:1;
        unsigned int skip_release_on_error:1;
        /*
         * Indicate that new item (btrfs_search_slot) is extending already
         * existing item and ins_len contains only the data size and not item
         * header (ie. sizeof(struct btrfs_item) is not included).
         */
        unsigned int search_for_extension:1;
        /* Stop search if any locks need to be taken (for read) */
        unsigned int nowait:1;
};

/*
 * State bits of a btrfs root. The enumerators are bit numbers; presumably
 * they index struct btrfs_root::state - confirm at the users.
 */
enum {
        /*
         * btrfs_record_root_in_trans is a multi-step process, and it can race
         * with the balancing code. But the race is very small, and only the
         * first time the root is added to each transaction. So IN_TRANS_SETUP
         * is used to tell us when more checks are required.
         */
        BTRFS_ROOT_IN_TRANS_SETUP,

        /*
         * Set if tree blocks of this root can be shared by other roots.
         * Only subvolume trees and their reloc trees have this bit set.
         * Conflicts with TRACK_DIRTY bit.
         *
         * This affects two things:
         *
         * - How balance works
         *   For shareable roots, we need to use reloc tree and do path
         *   replacement for balance, and need various pre/post hooks for
         *   snapshot creation to handle them.
         *
         *   While for non-shareable trees, we just simply do a tree search
         *   with COW.
         *
         * - How dirty roots are tracked
         *   For shareable roots, btrfs_record_root_in_trans() is needed to
         *   track them, while non-subvolume roots have TRACK_DIRTY bit, they
         *   don't need to set this manually.
         */
        BTRFS_ROOT_SHAREABLE,
        BTRFS_ROOT_TRACK_DIRTY,
        BTRFS_ROOT_IN_RADIX,
        BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
        BTRFS_ROOT_DEFRAG_RUNNING,
        BTRFS_ROOT_FORCE_COW,
        BTRFS_ROOT_MULTI_LOG_TASKS,
        BTRFS_ROOT_DIRTY,
        BTRFS_ROOT_DELETING,

        /*
         * Reloc tree is orphan, only kept here for qgroup delayed subtree scan.
         *
         * Set for the subvolume tree owning the reloc tree.
         */
        BTRFS_ROOT_DEAD_RELOC_TREE,
        /* Mark dead root stored on device whose cleanup needs to be resumed */
        BTRFS_ROOT_DEAD_TREE,
        /* The root has a log tree. Used for subvolume roots and the tree root. */
        BTRFS_ROOT_HAS_LOG_TREE,
        /* Qgroup flushing is in progress */
        BTRFS_ROOT_QGROUP_FLUSHING,
        /* We started the orphan cleanup for this root. */
        BTRFS_ROOT_ORPHAN_CLEANUP,
        /* This root has a drop operation that was started previously. */
        BTRFS_ROOT_UNFINISHED_DROP,
        /* This reloc root needs to have its buffers lockdep class reset. */
        BTRFS_ROOT_RESET_LOCKDEP_CLASS,
};

/*
 * Record swapped tree blocks of a subvolume tree for delayed subtree trace
 * code. For details check the comment in fs/btrfs/qgroup.c.
 */
struct btrfs_qgroup_swapped_blocks {
        spinlock_t lock;
        /*
         * Presumably caches whether any of blocks[] below is non-empty, i.e.
         * the RB_EMPTY_ROOT() state of those trees (the previous comment read
         * "RM_EMPTY_ROOT() of above blocks[]") - confirm at the users.
         */
        bool swapped;
        /* One rb-tree root per tree level */
        struct rb_root blocks[BTRFS_MAX_LEVEL];
};

/*
 * In-memory representation of a btrfs tree root.
 *
 * NOTE(review): the previous comment here said "extent_root is used for all
 * allocations and for the extent tree extent_root root", but this struct has
 * no extent_root member; that remark looks stale.
 */
struct btrfs_root {
        struct rb_node rb_node;

        struct extent_buffer *node;

        struct extent_buffer *commit_root;
        struct btrfs_root *log_root;
        struct btrfs_root *reloc_root;

        /* Presumably a bitmap of the BTRFS_ROOT_* state bits - confirm */
        unsigned long state;
        struct btrfs_root_item root_item;
        struct btrfs_key root_key;
        struct btrfs_fs_info *fs_info;
        struct extent_io_tree dirty_log_pages;

        struct mutex objectid_mutex;

        spinlock_t accounting_lock;
        struct btrfs_block_rsv *block_rsv;

        struct mutex log_mutex;
        wait_queue_head_t log_writer_wait;
        wait_queue_head_t log_commit_wait[2];
        struct list_head log_ctxs[2];
        /* Used only for log trees of subvolumes, not for the log root tree */
        atomic_t log_writers;
        atomic_t log_commit[2];
        /* Used only for log trees of subvolumes, not for the log root tree */
        atomic_t log_batch;
        /*
         * Protected by the 'log_mutex' lock but can be read without holding
         * that lock to avoid unnecessary lock contention, in which case it
         * should be read using btrfs_get_root_log_transid() except if it's a
         * log tree in which case it can be directly accessed. Updates to this
         * field should always use btrfs_set_root_log_transid(), except for log
         * trees where the field can be updated directly.
         */
        int log_transid;
        /* Updated whether or not the commit succeeds */
        int log_transid_committed;
        /*
         * Only updated when the commit succeeds. Use
         * btrfs_get_root_last_log_commit() and btrfs_set_root_last_log_commit()
         * to access this field.
         */
        int last_log_commit;
        pid_t log_start_pid;

        u64 last_trans;

        u64 free_objectid;

        struct btrfs_key defrag_progress;
        struct btrfs_key defrag_max;

        /* The dirty list is only used by non-shareable roots */
        struct list_head dirty_list;

        struct list_head root_list;

        spinlock_t inode_lock;
        /* Red-black tree that keeps track of in-memory inodes */
        struct rb_root inode_tree;

        /*
         * Xarray that keeps track of delayed nodes of every inode, protected
         * by @inode_lock.
         */
        struct xarray delayed_nodes;
        /*
         * Right now this just gets used so that a root has its own devid
         * for stat. It may be used for more later.
         */
        dev_t anon_dev;

        spinlock_t root_item_lock;
        refcount_t refs;

        struct mutex delalloc_mutex;
        spinlock_t delalloc_lock;
        /*
         * All of the inodes that have delalloc bytes. It is possible for
         * this list to be empty even when there are still dirty data=ordered
         * extents waiting to finish IO.
         */
        struct list_head delalloc_inodes;
        struct list_head delalloc_root;
        u64 nr_delalloc_inodes;

        struct mutex ordered_extent_mutex;
        /*
         * This is used by the balancing code to wait for all the pending
         * ordered extents.
         */
        spinlock_t ordered_extent_lock;

        /*
         * All of the data=ordered extents pending writeback.
         * These can span multiple transactions and basically include
         * every dirty data page that isn't from nodatacow.
         */
        struct list_head ordered_extents;
        struct list_head ordered_root;
        u64 nr_ordered_extents;

        /*
         * Not empty if this subvolume root has gone through tree block swap
         * (relocation)
         *
         * Will be used by reloc_control::dirty_subvol_roots.
         */
        struct list_head reloc_dirty_list;

        /*
         * Number of currently running SEND ioctls to prevent
         * manipulation with the read-only status via SUBVOL_SETFLAGS
         */
        int send_in_progress;
        /*
         * Number of currently running deduplication operations that have a
         * destination inode belonging to this root. Protected by the lock
         * root_item_lock.
         */
        int dedupe_in_progress;
        /* For exclusion of snapshot creation and nocow writes */
        struct btrfs_drew_lock snapshot_lock;

        atomic_t snapshot_force_cow;

        /* For qgroup metadata reserved space */
        spinlock_t qgroup_meta_rsv_lock;
        u64 qgroup_meta_rsv_pertrans;
        u64 qgroup_meta_rsv_prealloc;
        wait_queue_head_t qgroup_flush_wait;

        /* Number of active swapfiles */
        atomic_t nr_swapfiles;

        /* Record pairs of swapped blocks for qgroup */
        struct btrfs_qgroup_swapped_blocks swapped_blocks;

        /* Used only by log trees, when logging csum items */
        struct extent_io_tree log_csum_range;

        /* Used in simple quotas, track root during relocation. */
        u64 relocation_src_root;

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
        /* Only present when the sanity tests are configured in */
        u64 alloc_bytenr;
#endif

#ifdef CONFIG_BTRFS_DEBUG
        /* Debug-only list linkage; presumably for root leak detection */
        struct list_head leak_list;
#endif
};

/* Return true if BTRFS_ROOT_SUBVOL_RDONLY is set in the root item flags. */
static inline bool btrfs_root_readonly(const struct btrfs_root *root)
{
        /*
         * root_item::flags is little-endian, so convert the constant (done
         * at compile time) rather than the field.
         */
        if (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY))
                return true;

        return false;
}

/* Return true if BTRFS_ROOT_SUBVOL_DEAD is set in the root item flags. */
static inline bool btrfs_root_dead(const struct btrfs_root *root)
{
        /*
         * root_item::flags is little-endian, so convert the constant (done
         * at compile time) rather than the field.
         */
        if (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD))
                return true;

        return false;
}

/* Return the id of @root, i.e. the objectid of its root key. */
static inline u64 btrfs_root_id(const struct btrfs_root *root)
{
        const u64 root_id = root->root_key.objectid;

        return root_id;
}

/* Read root->log_transid with READ_ONCE(), for readers not holding log_mutex. */
static inline int btrfs_get_root_log_transid(const struct btrfs_root *root)
{
        const int transid = READ_ONCE(root->log_transid);

        return transid;
}

/* Store @log_transid into root->log_transid with WRITE_ONCE(). */
static inline void btrfs_set_root_log_transid(struct btrfs_root *root, int log_transid)
{
        int *transid_field = &root->log_transid;

        WRITE_ONCE(*transid_field, log_transid);
}

/* Read root->last_log_commit with READ_ONCE(). */
static inline int btrfs_get_root_last_log_commit(const struct btrfs_root *root)
{
        const int commit_id = READ_ONCE(root->last_log_commit);

        return commit_id;
}

/* Store @commit_id into root->last_log_commit with WRITE_ONCE(). */
static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int commit_id)
{
        int *commit_field = &root->last_log_commit;

        WRITE_ONCE(*commit_field, commit_id);
}

/*
 * Structure that conveys information about an extent that is going to replace
 * all the extents in a file range.
 */
struct btrfs_replace_extent_info {
        /*
         * Location/length of the extent on disk, the data range inside it and
         * the target file offset. Presumably all in bytes - confirm.
         */
        u64 disk_offset;
        u64 disk_len;
        u64 data_offset;
        u64 data_len;
        u64 file_offset;
        /* Pointer to a file extent item of type regular or prealloc. */
        char *extent_buf;
        /*
         * Set to true when attempting to replace a file range with a new extent
         * described by this structure, set to false when attempting to clone an
         * existing extent into a file range.
         */
        bool is_new_extent;
        /* Indicate if we should update the inode's mtime and ctime. */
        bool update_times;
        /* Meaningful only if is_new_extent is true. */
        int qgroup_reserved;
        /*
         * Meaningful only if is_new_extent is true.
         * Used to track how many extent items we have already inserted in a
         * subvolume tree that refer to the extent described by this structure,
         * so that we know when to create a new delayed ref or update an existing
         * one.
         */
        int insertions;
};

/*
 * Arguments for btrfs_drop_extents().
 *
 * The members up to 'extent_item_size' are filled in by the caller; the
 * members from 'drop_end' on are results written by btrfs_drop_extents().
 */
struct btrfs_drop_extents_args {
        /* Input parameters */

        /*
         * If NULL, btrfs_drop_extents() will allocate and free its own path.
         * If 'replace_extent' is true, this must not be NULL. Also the path
         * is always released except if 'replace_extent' is true and
         * btrfs_drop_extents() sets 'extent_inserted' to true, in which case
         * the path is kept locked.
         */
        struct btrfs_path *path;
        /* Start offset of the range to drop extents from */
        u64 start;
        /* End (exclusive, last byte + 1) of the range to drop extents from */
        u64 end;
        /* If true drop all the extent maps in the range */
        bool drop_cache;
        /*
         * If true it means we want to insert a new extent after dropping all
         * the extents in the range. If this is true, the 'extent_item_size'
         * parameter must be set as well and the 'extent_inserted' field will
         * be set to true by btrfs_drop_extents() if it could insert the new
         * extent.
         * Note: when this is set to true the path must not be NULL.
         */
        bool replace_extent;
        /*
         * Used if 'replace_extent' is true. Size of the file extent item to
         * insert after dropping all existing extents in the range.
         */
        u32 extent_item_size;

        /* Output parameters */

        /*
         * Set to the minimum between the input parameter 'end' and the end
         * (exclusive, last byte + 1) of the last dropped extent. This is always
         * set even if btrfs_drop_extents() returns an error.
         */
        u64 drop_end;
        /*
         * The number of allocated bytes found in the range. This can be smaller
         * than the range's length when there are holes in the range.
         */
        u64 bytes_found;
        /*
         * Only set if 'replace_extent' is true. Set to true if we were able
         * to insert a replacement extent after dropping all extents in the
         * range, otherwise set to false by btrfs_drop_extents().
         * Also, if btrfs_drop_extents() has set this to true it means it
         * returned with the path locked, otherwise if it has set this to
         * false it has returned with the path released.
         */
        bool extent_inserted;
};

/*
 * Per-file private state. Presumably attached to an open file - confirm at
 * the users; nothing in this header shows where it is stored.
 */
struct btrfs_file_private {
        /* Presumably a buffer used while filling directory entries - confirm */
        void *filldir_buf;
        u64 last_index;
        /* Presumably an extent state cached across llseek calls - confirm */
        struct extent_state *llseek_cached_state;
};

/* Bytes available in a tree block after the header: nodesize minus the header size. */
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
        const u32 header_size = sizeof(struct btrfs_header);

        return info->nodesize - header_size;
}

/* Leaf data size minus the size of one struct btrfs_item. */
static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info)
{
        const u32 item_header_size = sizeof(struct btrfs_item);

        return BTRFS_LEAF_DATA_SIZE(info) - item_header_size;
}

/* Number of struct btrfs_key_ptr entries that fit in the space after the header. */
static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info)
{
        const u32 usable_bytes = BTRFS_LEAF_DATA_SIZE(info);

        return usable_bytes / sizeof(struct btrfs_key_ptr);
}

/* Maximum item size minus the size of one struct btrfs_dir_item. */
static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
{
        const u32 dir_item_size = sizeof(struct btrfs_dir_item);

        return BTRFS_MAX_ITEM_SIZE(info) - dir_item_size;
}

/*
 * Convert a byte count to a block count by shifting right by
 * fs_info->sectorsize_bits; any partial block is dropped by the shift.
 */
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
                                ((bytes) >> (fs_info)->sectorsize_bits)

/* Return the gfp constraint of @mapping with __GFP_FS masked out. */
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
        const gfp_t without_fs = ~__GFP_FS;

        return mapping_gfp_constraint(mapping, without_fs);
}

/*
 * Extent unpin/discard/trim entry points. Declared only; they are not
 * defined in this header.
 */
void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
                         u64 num_bytes, u64 *actual_bytes);
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);

/* ctree.c */
int __init btrfs_ctree_init(void);
void __cold btrfs_ctree_exit(void);

/* Key search inside one extent buffer; the result slot is returned via @slot */
int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
                     const struct btrfs_key *key, int *slot);

/* Compare two keys that are already in CPU byte order */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);

#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk_key,
                                  const struct btrfs_key *k2)
{
        const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

        return btrfs_comp_cpu_keys(k1, k2);
}

#else

/* Compare two keys in a memcmp fashion. */
static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk,
                                  const struct btrfs_key *k2)
{
        struct btrfs_key k1;

        btrfs_disk_key_to_cpu(&k1, disk);

        return btrfs_comp_cpu_keys(&k1, k2);
}

#endif

/*
 * Tree iteration and key helpers. Declared only; see the definitions for
 * return value conventions.
 */
int btrfs_previous_item(struct btrfs_root *root,
                        struct btrfs_path *path, u64 min_objectid,
                        int type);
int btrfs_previous_extent_item(struct btrfs_root *root,
                        struct btrfs_path *path, u64 min_objectid);
void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
                             struct btrfs_path *path,
                             const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
                        struct btrfs_key *key, int lowest_level,
                        u64 min_trans);
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
                         struct btrfs_path *path,
                         u64 min_trans);
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
                                           int slot);

/*
 * Copy-on-write of tree blocks. The new block is returned through @cow_ret.
 */
int btrfs_cow_block(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, struct extent_buffer *buf,
                    struct extent_buffer *parent, int parent_slot,
                    struct extent_buffer **cow_ret,
                    enum btrfs_lock_nesting nest);
int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          struct extent_buffer *buf,
                          struct extent_buffer *parent, int parent_slot,
                          struct extent_buffer **cow_ret,
                          u64 search_start, u64 empty_size,
                          enum btrfs_lock_nesting nest);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      struct extent_buffer *buf,
                      struct extent_buffer **cow_ret, u64 new_root_objectid);
bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct extent_buffer *buf);
/* Pointer and item manipulation at the position described by @path */
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_trans_handle *trans,
                       struct btrfs_path *path, u32 data_size);
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
                         struct btrfs_path *path, u32 new_size, int from_end);
int btrfs_split_item(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_path *path,
                     const struct btrfs_key *new_key,
                     unsigned long split_offset);
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         struct btrfs_path *path,
                         const struct btrfs_key *new_key);
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
                u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key);
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      const struct btrfs_key *key, struct btrfs_path *p,
                      int ins_len, int cow);
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
                          struct btrfs_path *p, u64 time_seq);
int btrfs_search_slot_for_read(struct btrfs_root *root,
                               const struct btrfs_key *key,
                               struct btrfs_path *p, int find_higher,
                               int return_any);
void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);

int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                   struct btrfs_path *path, int slot, int nr);
/*
 * Delete a single item: the one at the leaf slot currently recorded in
 * @path (path->slots[0]). Returns whatever btrfs_del_items() returns.
 */
static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path)
{
        int ret;

        ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
        return ret;
}

/*
 * Describes a batch of items to insert in a btree. This is used by
 * btrfs_insert_empty_items(). The single-item helper
 * btrfs_insert_empty_item() builds a one-entry batch (nr == 1) on its stack
 * and passes it to the same function.
 */
struct btrfs_item_batch {
        /*
         * Pointer to an array containing the keys of the items to insert (in
         * sorted order).
         */
        const struct btrfs_key *keys;
        /* Pointer to an array containing the data size for each item to insert. */
        const u32 *data_sizes;
        /*
         * The sum of data sizes for all items. The caller can compute this while
         * setting up the data_sizes array, so it ends up being more efficient
         * than having btrfs_insert_empty_items() or setup_item_for_insert()
         * doing it, as it would avoid an extra loop over a potentially large
         * array, and in the case of setup_item_for_insert(), we would be doing
         * it while holding a write lock on a leaf and often on upper level nodes
         * too, unnecessarily increasing the size of a critical section.
         */
        u32 total_data_size;
        /* Size of the keys and data_sizes arrays (number of items in the batch). */
        int nr;
};

void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 const struct btrfs_key *key,
                                 u32 data_size);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path,
                             const struct btrfs_item_batch *batch);

/*
 * Insert one empty item of @data_size bytes with key @key. The key and size
 * are wrapped in a one-entry batch that is handed to
 * btrfs_insert_empty_items(); its return value is passed back unchanged.
 */
static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          const struct btrfs_key *key,
                                          u32 data_size)
{
        struct btrfs_item_batch single = {
                .keys = key,
                .data_sizes = &data_size,
                .total_data_size = data_size,
                .nr = 1,
        };

        return btrfs_insert_empty_items(trans, root, path, &single);
}

int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
                        u64 time_seq);

int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
                           struct btrfs_path *path);

int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
                              struct btrfs_path *path);

/*
 * Iterate over the items of @root starting at @key, storing the key of each
 * item visited in @found_key.
 *
 * @root:        The tree to search.
 * @key:        The key we are looking for (start of the iteration).
 * @found_key:        Will hold the key of the item found on each iteration.
 * @path:        Holds the current slot/leaf.
 * @iter_ret:        Contains the value returned from btrfs_search_slot or
 *                 btrfs_get_next_valid_item, whichever was executed last.
 *
 * The initial lookup calls btrfs_search_slot() with a NULL transaction handle,
 * ins_len 0 and cow 0. After each loop body, (path)->slots[0] is incremented
 * and btrfs_get_next_valid_item() is called again.
 *
 * The @iter_ret is an output variable that will contain the return value of
 * btrfs_search_slot, if it encountered an error, or the value returned from
 * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid
 * slot was found, 1 if there were no more leaves, and <0 if there was an error.
 *
 * It's recommended to use a separate variable for iter_ret and then use it to
 * set the function return value so there's no confusion of the 0/1/errno
 * values stemming from btrfs_search_slot.
 *
 * Note that @root, @path, @found_key and @iter_ret are expanded more than once
 * by the macro, so they must be expressions without side effects; @iter_ret
 * must additionally be an assignable lvalue.
 */
#define btrfs_for_each_slot(root, key, found_key, path, iter_ret)                \
        for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0);        \
                (iter_ret) >= 0 &&                                                \
                (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \
                (path)->slots[0]++                                                \
        )

int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq);

/*
 * Search the tree again to find a leaf with greater keys. This is
 * btrfs_next_old_leaf() called with a time_seq of 0.
 *
 * Returns 0 if it found something or 1 if there are no greater leaves.
 * Returns < 0 on error.
 */
static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
        int ret;

        ret = btrfs_next_old_leaf(root, path, 0);
        return ret;
}

/*
 * Advance @p to the next item. This is btrfs_next_old_item() called with a
 * time_seq of 0; its return value is passed back unchanged.
 */
static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
{
        int ret;

        ret = btrfs_next_old_item(root, p, 0);
        return ret;
}
int btrfs_leaf_free_space(const struct extent_buffer *leaf);

/*
 * Return 1 if @rootid is BTRFS_FS_TREE_OBJECTID, or if it is at or above
 * BTRFS_FIRST_FREE_OBJECTID (compared as signed 64-bit values) and
 * btrfs_qgroup_level() reports 0 for it. Return 0 otherwise.
 */
static inline int is_fstree(u64 rootid)
{
        if (rootid == BTRFS_FS_TREE_OBJECTID)
                return 1;

        /* Signed comparison, as in the range check this replaces. */
        if ((s64)rootid < (s64)BTRFS_FIRST_FREE_OBJECTID)
                return 0;

        return btrfs_qgroup_level(rootid) ? 0 : 1;
}

/*
 * Return true when the objectid in @root's root key is
 * BTRFS_DATA_RELOC_TREE_OBJECTID.
 */
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
{
        if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
                return false;

        return true;
}

u16 btrfs_csum_type_size(u16 type);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
const char *btrfs_super_csum_driver(u16 csum_type);
size_t __attribute_const__ btrfs_get_num_csums(void);

/*
 * We use page status Private2 to indicate there is an ordered extent with
 * unfinished IO.
 *
 * Rename the Private2 accessors to Ordered, to improve readability.
 */
#define PageOrdered(page)                PagePrivate2(page)
#define SetPageOrdered(page)                SetPagePrivate2(page)
#define ClearPageOrdered(page)                ClearPagePrivate2(page)
#define folio_test_ordered(folio)        folio_test_private_2(folio)
#define folio_set_ordered(folio)        folio_set_private_2(folio)
#define folio_clear_ordered(folio)        folio_clear_private_2(folio)

#endif






































































































































































































































    3 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PIPE_FS_I_H
#define _LINUX_PIPE_FS_I_H

#define PIPE_DEF_BUFFERS        16

#define PIPE_BUF_FLAG_LRU        0x01        /* page is on the LRU */
#define PIPE_BUF_FLAG_ATOMIC        0x02        /* was atomically mapped */
#define PIPE_BUF_FLAG_GIFT        0x04        /* page is a gift */
#define PIPE_BUF_FLAG_PACKET        0x08        /* read() as a packet */
#define PIPE_BUF_FLAG_CAN_MERGE        0x10        /* can merge buffers */
#define PIPE_BUF_FLAG_WHOLE        0x20        /* read() must return entire buffer or error */
#ifdef CONFIG_WATCH_QUEUE
#define PIPE_BUF_FLAG_LOSS        0x40        /* Message loss happened after this buffer */
#endif

/**
 *        struct pipe_buffer - a linux kernel pipe buffer
 *        @page: the page containing the data for the pipe buffer
 *        @offset: offset of data inside the @page
 *        @len: length of data inside the @page
 *        @ops: operations associated with this buffer. See @pipe_buf_operations.
 *                pipe_buf_release() sets this to NULL before calling ->release().
 *        @flags: pipe buffer flags. See the PIPE_BUF_FLAG_* definitions above.
 *        @private: private data owned by the ops.
 **/
struct pipe_buffer {
        struct page *page;
        unsigned int offset, len;
        const struct pipe_buf_operations *ops;
        unsigned int flags;
        unsigned long private;
};

/**
 *        struct pipe_inode_info - a linux kernel pipe
 *        @mutex: mutex protecting the whole thing
 *        @rd_wait: reader wait point in case of empty pipe
 *        @wr_wait: writer wait point in case of full pipe
 *        @head: The point of buffer production
 *        @tail: The point of buffer consumption
 *        @note_loss: The next read() should insert a data-lost message
 *                (only present when CONFIG_WATCH_QUEUE is set)
 *        @max_usage: The maximum number of slots that may be used in the ring
 *        @ring_size: total number of buffers (should be a power of 2, since
 *                pipe_buf() maps a slot to an index with "slot & (ring_size - 1)")
 *        @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
 *        @tmp_page: cached released page
 *        @readers: number of current readers of this pipe
 *        @writers: number of current writers of this pipe
 *        @files: number of struct file referring this pipe (protected by ->i_lock)
 *        @r_counter: reader counter
 *        @w_counter: writer counter
 *        @poll_usage: is this pipe used for epoll, which has crazy wakeups?
 *        @fasync_readers: reader side fasync
 *        @fasync_writers: writer side fasync
 *        @bufs: the circular array of pipe buffers
 *        @user: the user who created this pipe
 *        @watch_queue: If this pipe is a watch_queue, this is the stuff for that
 *                (only present when CONFIG_WATCH_QUEUE is set)
 **/
struct pipe_inode_info {
        struct mutex mutex;
        wait_queue_head_t rd_wait, wr_wait;
        unsigned int head;
        unsigned int tail;
        unsigned int max_usage;
        unsigned int ring_size;
        unsigned int nr_accounted;
        unsigned int readers;
        unsigned int writers;
        unsigned int files;
        unsigned int r_counter;
        unsigned int w_counter;
        bool poll_usage;
#ifdef CONFIG_WATCH_QUEUE
        bool note_loss;
#endif
        struct page *tmp_page;
        struct fasync_struct *fasync_readers;
        struct fasync_struct *fasync_writers;
        struct pipe_buffer *bufs;
        struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
        struct watch_queue *watch_queue;
#endif
};

/*
 * Note on the nesting of these functions:
 *
 * ->confirm()
 *        ->try_steal()
 *
 * That is, ->try_steal() must be called on a confirmed buffer.  See below for
 * the meaning of each operation.  Also see the kerneldoc in fs/pipe.c for the
 * pipe and generic variants of these hooks.
 *
 * ->confirm() and ->try_steal() may be left NULL: the pipe_buf_confirm() and
 * pipe_buf_try_steal() wrappers check for that.  ->release() and ->get() are
 * called unconditionally by pipe_buf_release() and pipe_buf_get().
 */
struct pipe_buf_operations {
        /*
         * ->confirm() verifies that the data in the pipe buffer is there
         * and that the contents are good. If the pages in the pipe belong
         * to a file system, we may need to wait for IO completion in this
         * hook. Returns 0 for good, or a negative error value in case of
         * error.  If not present all pages are considered good.
         */
        int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

        /*
         * When the contents of this pipe buffer has been completely
         * consumed by a reader, ->release() is called.
         */
        void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

        /*
         * Attempt to take ownership of the pipe buffer and its contents.
         * ->try_steal() returns %true for success, in which case the contents
         * of the pipe (the buf->page) is locked and now completely owned by the
         * caller. The page may then be transferred to a different mapping, the
         * most often used case is insertion into different file address space
         * cache.  If not present, pipe_buf_try_steal() reports failure.
         */
        bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

        /*
         * Get a reference to the pipe buffer.  Returns %true if the
         * reference was successfully obtained.
         */
        bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

/**
 * pipe_has_watch_queue - Check whether the pipe is a watch_queue,
 * i.e. it was created with O_NOTIFICATION_PIPE
 * @pipe: The pipe to check
 *
 * Without CONFIG_WATCH_QUEUE there is no watch_queue member and the answer
 * is always false.
 *
 * Return: true if pipe is a watch queue, false otherwise.
 */
static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe)
{
#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue)
                return true;
#endif
        return false;
}

/**
 * pipe_empty - Return true if the pipe is empty
 * @head: The pipe ring head pointer
 * @tail: The pipe ring tail pointer
 *
 * The pipe is empty exactly when no slots lie between @tail and @head.
 */
static inline bool pipe_empty(unsigned int head, unsigned int tail)
{
        const unsigned int used = head - tail;

        return used == 0;
}

/**
 * pipe_occupancy - Return number of slots used in the pipe
 * @head: The pipe ring head pointer
 * @tail: The pipe ring tail pointer
 *
 * The result is the unsigned difference @head - @tail.
 */
static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
{
        unsigned int used = head;

        used -= tail;
        return used;
}

/**
 * pipe_full - Return true if the pipe is full
 * @head: The pipe ring head pointer
 * @tail: The pipe ring tail pointer
 * @limit: The maximum amount of slots available.
 *
 * The pipe counts as full once pipe_occupancy() reaches or exceeds @limit.
 */
static inline bool pipe_full(unsigned int head, unsigned int tail,
                             unsigned int limit)
{
        const unsigned int used = pipe_occupancy(head, tail);

        return used >= limit;
}

/**
 * pipe_buf - Return the pipe buffer for the specified slot in the pipe ring
 * @pipe: The pipe to access
 * @slot: The slot of interest
 *
 * @slot is reduced to an array index by masking it with ring_size - 1.
 */
static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe,
                                           unsigned int slot)
{
        const unsigned int mask = pipe->ring_size - 1;

        return pipe->bufs + (slot & mask);
}

/**
 * pipe_head_buf - Return the pipe buffer at the head of the pipe ring
 * @pipe: The pipe to access
 *
 * Equivalent to pipe_buf() on the pipe's current head value.
 */
static inline struct pipe_buffer *pipe_head_buf(const struct pipe_inode_info *pipe)
{
        const unsigned int head = pipe->head;

        return pipe_buf(pipe, head);
}

/**
 * pipe_buf_get - get a reference to a pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to get a reference to
 *
 * Return: %true if the reference was successfully obtained.
 */
static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
                                struct pipe_buffer *buf)
{
        return buf->ops->get(pipe, buf);
}

/**
 * pipe_buf_release - put a reference to a pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to put a reference to
 *
 * The buffer's ops pointer is cleared before its ->release() operation runs.
 */
static inline void pipe_buf_release(struct pipe_inode_info *pipe,
                                    struct pipe_buffer *buf)
{
        const struct pipe_buf_operations *saved_ops;

        /* Remember the ops first: buf->ops is reset before ->release() is called. */
        saved_ops = buf->ops;
        buf->ops = NULL;

        saved_ops->release(pipe, buf);
}

/**
 * pipe_buf_confirm - verify contents of the pipe buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to confirm
 *
 * Return: 0 when the buffer has no ->confirm() operation, otherwise whatever
 * ->confirm() returns.
 */
static inline int pipe_buf_confirm(struct pipe_inode_info *pipe,
                                   struct pipe_buffer *buf)
{
        if (buf->ops->confirm)
                return buf->ops->confirm(pipe, buf);

        return 0;
}

/**
 * pipe_buf_try_steal - attempt to take ownership of a pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to attempt to steal
 *
 * Return: false when the buffer has no ->try_steal() operation, otherwise
 * whatever ->try_steal() returns.
 */
static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        if (buf->ops->try_steal)
                return buf->ops->try_steal(pipe, buf);

        return false;
}

/**
 * pipe_discard_from - release every buffer added after a saved head position
 * @pipe:        the pipe to trim
 * @old_head:        head value recorded before the buffers were added
 *
 * Steps @pipe->head back down to @old_head, calling pipe_buf_release() on
 * each buffer passed on the way.  Nothing is done if head is not ahead of
 * @old_head.
 */
static inline void pipe_discard_from(struct pipe_inode_info *pipe,
                unsigned int old_head)
{
        unsigned int mask = pipe->ring_size - 1;

        /*
         * head is a free-running unsigned index (slots are derived by
         * masking), so compare by signed distance rather than by magnitude:
         * a plain "head > old_head" stops too early, leaving buffers
         * unreleased, if head wrapped past zero after old_head was recorded.
         */
        while ((int)(pipe->head - old_head) > 0)
                pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]);
}

/* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
   memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
#define PIPE_SIZE                PAGE_SIZE

/* Pipe lock and unlock operations */
void pipe_lock(struct pipe_inode_info *);
void pipe_unlock(struct pipe_inode_info *);
void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);

/* Wait for a pipe to be readable/writable while dropping the pipe lock */
void pipe_wait_readable(struct pipe_inode_info *);
void pipe_wait_writable(struct pipe_inode_info *);

struct pipe_inode_info *alloc_pipe_info(void);
void free_pipe_info(struct pipe_inode_info *);

/* Generic pipe buffer ops functions */
bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
bool generic_pipe_buf_try_steal(struct pipe_inode_info *, struct pipe_buffer *);
void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);

extern const struct pipe_buf_operations nosteal_pipe_buf_ops;

unsigned long account_pipe_buffers(struct user_struct *user,
                                   unsigned long old, unsigned long new);
bool too_many_pipe_buffers_soft(unsigned long user_bufs);
bool too_many_pipe_buffers_hard(unsigned long user_bufs);
bool pipe_is_unprivileged_user(void);

/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots);
long pipe_fcntl(struct file *, unsigned int, unsigned int arg);
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice);

int create_pipe_files(struct file **, int);
unsigned int round_pipe_size(unsigned int size);

#endif







































































































































































    1 




























    5 





















    5 
























    1 





















    1 









































   18 





















    1 
















































    1 






    2 


































    1 















    2 







    3 









































    3 




























    3 


























    3 

































































    4 


































    2 
































































    2 















    1 

















































    2 






















    1 




















    6 







































    6 









































    3 






























    3 


























    3 






















    2 











































    6 


































































    2 







































































    1 










    3 










    1 


























    1 


































    4 




























    1 



















































    1 






    1 













    1 










































































































































































    3 






    3 



















































































































    6 






    1 










































    6 






    1 



















































    5 





























   18 







































































































    2 


































































    2 





























    3 














































    3 


































































    3 





























    3 









































































    7 





    2 





    4 
























    5 





















    5 





























   14 





















   13 

























































































    1 













































































    4 































































































































    1 






















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ext4

#if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EXT4_H

#include <linux/writeback.h>
#include <linux/tracepoint.h>

struct ext4_allocation_context;
struct ext4_allocation_request;
struct ext4_extent;
struct ext4_prealloc_space;
struct ext4_inode_info;
struct mpage_da_data;
struct ext4_map_blocks;
struct extent_status;
struct ext4_fsmap;
struct partial_cluster;

/*
 * Convert a pointer to a VFS inode into a pointer to the
 * struct ext4_inode_info that embeds it as its vfs_inode member.
 */
#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))

/*
 * Decode a mask of EXT4_MB_* flags with __print_flags(): every set bit
 * that appears in the table is printed by its short name, with "|" as
 * the separator between names.
 */
#define show_mballoc_flags(flags) __print_flags(flags, "|",        \
        { EXT4_MB_HINT_MERGE,                "HINT_MERGE" },                \
        { EXT4_MB_HINT_RESERVED,        "HINT_RESV" },                \
        { EXT4_MB_HINT_METADATA,        "HINT_MDATA" },                \
        { EXT4_MB_HINT_FIRST,                "HINT_FIRST" },                \
        { EXT4_MB_HINT_BEST,                "HINT_BEST" },                \
        { EXT4_MB_HINT_DATA,                "HINT_DATA" },                \
        { EXT4_MB_HINT_NOPREALLOC,        "HINT_NOPREALLOC" },        \
        { EXT4_MB_HINT_GROUP_ALLOC,        "HINT_GRP_ALLOC" },        \
        { EXT4_MB_HINT_GOAL_ONLY,        "HINT_GOAL_ONLY" },        \
        { EXT4_MB_HINT_TRY_GOAL,        "HINT_TRY_GOAL" },        \
        { EXT4_MB_DELALLOC_RESERVED,        "DELALLOC_RESV" },        \
        { EXT4_MB_STREAM_ALLOC,                "STREAM_ALLOC" },        \
        { EXT4_MB_USE_ROOT_BLOCKS,        "USE_ROOT_BLKS" },        \
        { EXT4_MB_USE_RESERVED,                "USE_RESV" },                \
        { EXT4_MB_STRICT_CHECK,                "STRICT_CHECK" })

/*
 * Decode a mask of EXT4_GET_BLOCKS_* flags (plus EXT4_EX_NOCACHE) with
 * __print_flags(), printing the matching names separated by "|".
 */
#define show_map_flags(flags) __print_flags(flags, "|",                        \
        { EXT4_GET_BLOCKS_CREATE,                "CREATE" },                \
        { EXT4_GET_BLOCKS_UNWRIT_EXT,                "UNWRIT" },                \
        { EXT4_GET_BLOCKS_DELALLOC_RESERVE,        "DELALLOC" },                \
        { EXT4_GET_BLOCKS_PRE_IO,                "PRE_IO" },                \
        { EXT4_GET_BLOCKS_CONVERT,                "CONVERT" },                \
        { EXT4_GET_BLOCKS_METADATA_NOFAIL,        "METADATA_NOFAIL" },        \
        { EXT4_GET_BLOCKS_NO_NORMALIZE,                "NO_NORMALIZE" },        \
        { EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,        "CONVERT_UNWRITTEN" },  \
        { EXT4_GET_BLOCKS_ZERO,                        "ZERO" },                \
        { EXT4_GET_BLOCKS_IO_SUBMIT,                "IO_SUBMIT" },                \
        { EXT4_EX_NOCACHE,                        "EX_NOCACHE" })

/*
 * __print_flags() requires that all enum values be wrapped in the
 * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
 * ring buffer.
 *
 * The four BH_* enumerators exported here are presumably the ones the
 * EXT4_MAP_* masks decoded by show_mflags() are built from; those
 * definitions are outside this header -- confirm against ext4.h.
 */
TRACE_DEFINE_ENUM(BH_New);
TRACE_DEFINE_ENUM(BH_Mapped);
TRACE_DEFINE_ENUM(BH_Unwritten);
TRACE_DEFINE_ENUM(BH_Boundary);

/*
 * Decode a mask of EXT4_MAP_* flags as single letters (N, M, U, B).
 * The separator is the empty string, so the letters of all set flags
 * are printed back to back.
 */
#define show_mflags(flags) __print_flags(flags, "",        \
        { EXT4_MAP_NEW,                "N" },                        \
        { EXT4_MAP_MAPPED,        "M" },                        \
        { EXT4_MAP_UNWRITTEN,        "U" },                        \
        { EXT4_MAP_BOUNDARY,        "B" })

/*
 * Decode a mask of EXT4_FREE_BLOCKS_* flags with __print_flags(),
 * printing the matching names separated by "|".
 */
#define show_free_flags(flags) __print_flags(flags, "|",        \
        { EXT4_FREE_BLOCKS_METADATA,                "METADATA" },        \
        { EXT4_FREE_BLOCKS_FORGET,                "FORGET" },        \
        { EXT4_FREE_BLOCKS_VALIDATED,                "VALIDATED" },        \
        { EXT4_FREE_BLOCKS_NO_QUOT_UPDATE,        "NO_QUOTA" },        \
        { EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER,"1ST_CLUSTER" },\
        { EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER,        "LAST_CLUSTER" })

/*
 * Export the ES_*_B enumerators with TRACE_DEFINE_ENUM.  They are
 * presumably the bit positions behind the EXTENT_STATUS_* masks decoded
 * by show_extent_status(); the definitions are not in this header --
 * confirm against extents_status.h.
 */
TRACE_DEFINE_ENUM(ES_WRITTEN_B);
TRACE_DEFINE_ENUM(ES_UNWRITTEN_B);
TRACE_DEFINE_ENUM(ES_DELAYED_B);
TRACE_DEFINE_ENUM(ES_HOLE_B);
TRACE_DEFINE_ENUM(ES_REFERENCED_B);

/*
 * Decode a mask of EXTENT_STATUS_* flags as single letters
 * (W, U, D, H, R), printed back to back because the separator is the
 * empty string.
 */
#define show_extent_status(status) __print_flags(status, "",        \
        { EXTENT_STATUS_WRITTEN,        "W" },                        \
        { EXTENT_STATUS_UNWRITTEN,        "U" },                        \
        { EXTENT_STATUS_DELAYED,        "D" },                        \
        { EXTENT_STATUS_HOLE,                "H" },                        \
        { EXTENT_STATUS_REFERENCED,        "R" })

/*
 * Decode a mask of FALLOC_FL_* mode bits with __print_flags(), printing
 * the matching names separated by "|".  Only the five flags listed in
 * the table are given names here.
 */
#define show_falloc_mode(mode) __print_flags(mode, "|",                \
        { FALLOC_FL_KEEP_SIZE,                "KEEP_SIZE"},                \
        { FALLOC_FL_PUNCH_HOLE,                "PUNCH_HOLE"},                \
        { FALLOC_FL_NO_HIDE_STALE,        "NO_HIDE_STALE"},        \
        { FALLOC_FL_COLLAPSE_RANGE,        "COLLAPSE_RANGE"},        \
        { FALLOC_FL_ZERO_RANGE,                "ZERO_RANGE"})

/*
 * Export the EXT4_FC_REASON_* enumerators with TRACE_DEFINE_ENUM so the
 * symbolic table in show_fc_reason() can refer to them.
 * EXT4_FC_REASON_MAX is exported as well although show_fc_reason() has
 * no name for it.
 */
TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_NOMEM);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_SWAP_BOOT);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RESIZE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);

/*
 * Translate a single EXT4_FC_REASON_* value into its name with
 * __print_symbolic().  Unlike the __print_flags() helpers in this file,
 * the argument is matched as one value, not as a bit mask.
 */
#define show_fc_reason(reason)                                                \
        __print_symbolic(reason,                                        \
                { EXT4_FC_REASON_XATTR,                "XATTR"},                \
                { EXT4_FC_REASON_CROSS_RENAME,        "CROSS_RENAME"},        \
                { EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \
                { EXT4_FC_REASON_NOMEM,        "NO_MEM"},                        \
                { EXT4_FC_REASON_SWAP_BOOT,        "SWAP_BOOT"},                \
                { EXT4_FC_REASON_RESIZE,        "RESIZE"},                \
                { EXT4_FC_REASON_RENAME_DIR,        "RENAME_DIR"},                \
                { EXT4_FC_REASON_FALLOC_RANGE,        "FALLOC_RANGE"},        \
                { EXT4_FC_REASON_INODE_JOURNAL_DATA,        "INODE_JOURNAL_DATA"}, \
                { EXT4_FC_REASON_ENCRYPTED_FILENAME,        "ENCRYPTED_FILENAME"})

/*
 * Export the CR_* enumerators with TRACE_DEFINE_ENUM so the symbolic
 * table in show_criteria() can refer to them.
 */
TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED);
TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST);
TRACE_DEFINE_ENUM(CR_BEST_AVAIL_LEN);
TRACE_DEFINE_ENUM(CR_GOAL_LEN_SLOW);
TRACE_DEFINE_ENUM(CR_ANY_FREE);

/*
 * Translate a single CR_* value into a string with __print_symbolic().
 * Each printed name is the enumerator's own identifier.
 */
#define show_criteria(cr)                                               \
        __print_symbolic(cr,                                            \
                         { CR_POWER2_ALIGNED, "CR_POWER2_ALIGNED" },        \
                         { CR_GOAL_LEN_FAST, "CR_GOAL_LEN_FAST" },      \
                         { CR_BEST_AVAIL_LEN, "CR_BEST_AVAIL_LEN" },    \
                         { CR_GOAL_LEN_SLOW, "CR_GOAL_LEN_SLOW" },      \
                         { CR_ANY_FREE, "CR_ANY_FREE" })

/*
 * Tracepoint ext4_other_inode_update_time.
 *
 * @inode:    inode whose device, number, uid, gid and mode are captured
 * @orig_ino: a second inode number stored verbatim next to them; how it
 *            relates to @inode is up to the caller (not visible here)
 *
 * Output: "dev MAJ,MIN orig_ino N ino N mode 0OCTAL uid N gid N".
 */
TRACE_EVENT(ext4_other_inode_update_time,
        TP_PROTO(struct inode *inode, ino_t orig_ino),

        TP_ARGS(inode, orig_ino),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        ino_t,        orig_ino                )
                __field(        uid_t,        uid                        )
                __field(        gid_t,        gid                        )
                __field(        __u16, mode                        )
        ),

        TP_fast_assign(
                __entry->orig_ino = orig_ino;
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->uid        = i_uid_read(inode);
                __entry->gid        = i_gid_read(inode);
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->orig_ino,
                  (unsigned long) __entry->ino, __entry->mode,
                  __entry->uid, __entry->gid)
);

/*
 * Tracepoint ext4_free_inode.
 *
 * @inode: inode to snapshot
 *
 * Captures the inode's device, inode number, uid, gid, i_blocks and
 * i_mode.  Output:
 * "dev MAJ,MIN ino N mode 0OCTAL uid N gid N blocks N".
 */
TRACE_EVENT(ext4_free_inode,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        uid_t,        uid                        )
                __field(        gid_t,        gid                        )
                __field(        __u64, blocks                        )
                __field(        __u16, mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->uid        = i_uid_read(inode);
                __entry->gid        = i_gid_read(inode);
                __entry->blocks        = inode->i_blocks;
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->mode,
                  __entry->uid, __entry->gid, __entry->blocks)
);

/*
 * Tracepoint ext4_request_inode.
 *
 * @dir:  directory inode; its device and inode number are recorded
 * @mode: mode value supplied by the caller
 *
 * @mode is an int but is stored in a __u16 field, so only its low
 * 16 bits end up in the trace record.
 * Output: "dev MAJ,MIN dir N mode 0OCTAL".
 */
TRACE_EVENT(ext4_request_inode,
        TP_PROTO(struct inode *dir, int mode),

        TP_ARGS(dir, mode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        dir                        )
                __field(        __u16, mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = dir->i_sb->s_dev;
                __entry->dir        = dir->i_ino;
                __entry->mode        = mode;
        ),

        TP_printk("dev %d,%d dir %lu mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->dir, __entry->mode)
);

/*
 * Tracepoint ext4_allocate_inode.
 *
 * @inode: inode whose device and inode number are recorded
 * @dir:   directory inode; only its inode number is recorded
 * @mode:  mode value supplied by the caller (int, truncated to the
 *         __u16 record field)
 *
 * The device is taken from @inode's superblock, not from @dir's.
 * Output: "dev MAJ,MIN ino N dir N mode 0OCTAL".
 */
TRACE_EVENT(ext4_allocate_inode,
        TP_PROTO(struct inode *inode, struct inode *dir, int mode),

        TP_ARGS(inode, dir, mode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        ino_t,        dir                        )
                __field(        __u16,        mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->dir        = dir->i_ino;
                __entry->mode        = mode;
        ),

        TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->dir, __entry->mode)
);

/*
 * Tracepoint ext4_evict_inode.
 *
 * @inode: inode to snapshot
 *
 * Captures the device, inode number and i_nlink (stored as int).
 * Output: "dev MAJ,MIN ino N nlink N".
 */
TRACE_EVENT(ext4_evict_inode,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        nlink                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->nlink        = inode->i_nlink;
        ),

        TP_printk("dev %d,%d ino %lu nlink %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->nlink)
);

/*
 * Tracepoint ext4_drop_inode.
 *
 * @inode: inode whose device and inode number are recorded
 * @drop:  integer supplied by the caller and stored unchanged
 *
 * Output: "dev MAJ,MIN ino N drop N".
 */
TRACE_EVENT(ext4_drop_inode,
        TP_PROTO(struct inode *inode, int drop),

        TP_ARGS(inode, drop),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        drop                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->drop        = drop;
        ),

        TP_printk("dev %d,%d ino %lu drop %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->drop)
);

/*
 * Tracepoint ext4_nfs_commit_metadata.
 *
 * @inode: inode whose device and inode number are recorded
 *
 * Output: "dev MAJ,MIN ino N".
 */
TRACE_EVENT(ext4_nfs_commit_metadata,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
        ),

        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * Tracepoint ext4_mark_inode_dirty.
 *
 * @inode: inode whose device and inode number are recorded
 * @IP:    code address supplied by the caller; stored as an unsigned
 *         long and printed through %pS under the label "caller"
 *
 * Output: "dev MAJ,MIN ino N caller SYMBOL".
 */
TRACE_EVENT(ext4_mark_inode_dirty,
        TP_PROTO(struct inode *inode, unsigned long IP),

        TP_ARGS(inode, IP),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(unsigned long,        ip                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->ip        = IP;
        ),

        TP_printk("dev %d,%d ino %lu caller %pS",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, (void *)__entry->ip)
);

/*
 * Tracepoint ext4_begin_ordered_truncate.
 *
 * @inode:    inode whose device and inode number are recorded
 * @new_size: loff_t value supplied by the caller, stored unchanged
 *
 * Output: "dev MAJ,MIN ino N new_size N" (new_size printed signed).
 */
TRACE_EVENT(ext4_begin_ordered_truncate,
        TP_PROTO(struct inode *inode, loff_t new_size),

        TP_ARGS(inode, new_size),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        new_size                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->new_size        = new_size;
        ),

        TP_printk("dev %d,%d ino %lu new_size %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->new_size)
);

/*
 * Event class ext4__write_begin: the shared record layout and format
 * for the events instantiated from it with DEFINE_EVENT().
 *
 * @inode: inode whose device and inode number are recorded
 * @pos:   loff_t position supplied by the caller
 * @len:   unsigned length supplied by the caller
 *
 * Output: "dev MAJ,MIN ino N pos N len N".
 */
DECLARE_EVENT_CLASS(ext4__write_begin,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len),

        TP_ARGS(inode, pos, len),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        pos                        )
                __field(        unsigned int, len                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pos        = pos;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len)
);

/* Event ext4_write_begin: an instance of the ext4__write_begin class. */
DEFINE_EVENT(ext4__write_begin, ext4_write_begin,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len),

        TP_ARGS(inode, pos, len)
);

/* Event ext4_da_write_begin: an instance of the ext4__write_begin class. */
DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len),

        TP_ARGS(inode, pos, len)
);

/*
 * Event class ext4__write_end: the shared record layout and format for
 * the events instantiated from it with DEFINE_EVENT().
 *
 * @inode:  inode whose device and inode number are recorded
 * @pos:    loff_t position supplied by the caller
 * @len:    unsigned length supplied by the caller
 * @copied: unsigned count supplied by the caller
 *
 * Output: "dev MAJ,MIN ino N pos N len N copied N".
 */
DECLARE_EVENT_CLASS(ext4__write_end,
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                        unsigned int copied),

        TP_ARGS(inode, pos, len, copied),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        pos                        )
                __field(        unsigned int, len                )
                __field(        unsigned int, copied                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pos        = pos;
                __entry->len        = len;
                __entry->copied        = copied;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len, __entry->copied)
);

/* Event ext4_write_end: an instance of the ext4__write_end class. */
DEFINE_EVENT(ext4__write_end, ext4_write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied)
);

/* Event ext4_journalled_write_end: an instance of the ext4__write_end class. */
DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied)
);

/* Event ext4_da_write_end: an instance of the ext4__write_end class. */
DEFINE_EVENT(ext4__write_end, ext4_da_write_end,

        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,
                 unsigned int copied),

        TP_ARGS(inode, pos, len, copied)
);

/*
 * Tracepoint ext4_writepages.
 *
 * @inode: inode whose device and inode number are recorded; its
 *         i_mapping->writeback_index is captured as well
 * @wbc:   writeback control; nr_to_write, pages_skipped, range_start,
 *         range_end, sync_mode, for_kupdate and range_cyclic are copied
 *         out of it
 *
 * for_kupdate and range_cyclic are stored in char fields and printed
 * with %d.  The printed field order (see TP_printk) differs from the
 * record layout: writeback_index is printed last.
 */
TRACE_EVENT(ext4_writepages,
        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        long,        nr_to_write                )
                __field(        long,        pages_skipped                )
                __field(        loff_t,        range_start                )
                __field(        loff_t,        range_end                )
                __field(       pgoff_t,        writeback_index                )
                __field(        int,        sync_mode                )
                __field(        char,        for_kupdate                )
                __field(        char,        range_cyclic                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->range_start        = wbc->range_start;
                __entry->range_end        = wbc->range_end;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->for_kupdate        = wbc->for_kupdate;
                __entry->range_cyclic        = wbc->range_cyclic;
        ),

        TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
                  "range_start %lld range_end %lld sync_mode %d "
                  "for_kupdate %d range_cyclic %d writeback_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->nr_to_write,
                  __entry->pages_skipped, __entry->range_start,
                  __entry->range_end, __entry->sync_mode,
                  __entry->for_kupdate, __entry->range_cyclic,
                  (unsigned long) __entry->writeback_index)
);

/*
 * Tracepoint ext4_da_write_pages.
 *
 * @inode:      inode whose device and inode number are recorded
 * @first_page: page index supplied by the caller, stored unchanged
 * @wbc:        writeback control; only nr_to_write and sync_mode are
 *              copied out of it
 *
 * Output: "dev MAJ,MIN ino N first_page N nr_to_write N sync_mode N".
 */
TRACE_EVENT(ext4_da_write_pages,
        TP_PROTO(struct inode *inode, pgoff_t first_page,
                 struct writeback_control *wbc),

        TP_ARGS(inode, first_page, wbc),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(      pgoff_t,        first_page                )
                __field(         long,        nr_to_write                )
                __field(          int,        sync_mode                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->first_page        = first_page;
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->sync_mode        = wbc->sync_mode;
        ),

        TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
                  "sync_mode %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->first_page,
                  __entry->nr_to_write, __entry->sync_mode)
);

/*
 * Tracepoint ext4_da_write_pages_extent.
 *
 * @inode: inode whose device and inode number are recorded
 * @map:   block mapping; its m_lblk, m_len and m_flags are copied into
 *         the record
 *
 * The flags are printed as letters through show_mflags().
 * Output: "dev MAJ,MIN ino N lblk N len N flags LETTERS".
 */
TRACE_EVENT(ext4_da_write_pages_extent,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),

        TP_ARGS(inode, map),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        lblk                        )
                __field(        __u32,        len                        )
                __field(        __u32,        flags                        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->lblk                = map->m_lblk;
                __entry->len                = map->m_len;
                __entry->flags                = map->m_flags;
        ),

        TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->lblk, __entry->len,
                  show_mflags(__entry->flags))
);

/*
 * Tracepoint ext4_writepages_result.
 *
 * @inode:         inode whose device and inode number are recorded; its
 *                 i_mapping->writeback_index is captured as well
 * @wbc:           writeback control; pages_skipped and sync_mode are
 *                 copied out of it
 * @ret:           integer result supplied by the caller
 * @pages_written: integer count supplied by the caller
 *
 * Output: "dev MAJ,MIN ino N ret N pages_written N pages_skipped N
 * sync_mode N writeback_index N".
 */
TRACE_EVENT(ext4_writepages_result,
        TP_PROTO(struct inode *inode, struct writeback_control *wbc,
                        int ret, int pages_written),

        TP_ARGS(inode, wbc, ret, pages_written),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        ret                        )
                __field(        int,        pages_written                )
                __field(        long,        pages_skipped                )
                __field(       pgoff_t,        writeback_index                )
                __field(        int,        sync_mode                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->ret                = ret;
                __entry->pages_written        = pages_written;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->sync_mode        = wbc->sync_mode;
        ),

        TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
                  "sync_mode %d writeback_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->ret,
                  __entry->pages_written, __entry->pages_skipped,
                  __entry->sync_mode,
                  (unsigned long) __entry->writeback_index)
);

/*
 * Event class ext4__folio_op: the shared record layout and format for
 * the events instantiated from it with DEFINE_EVENT().
 *
 * @inode: inode whose device and inode number are recorded
 * @folio: folio whose index is recorded
 *
 * The inode is passed explicitly; folio->mapping is not consulted.
 * Output: "dev MAJ,MIN ino N folio_index N".
 */
DECLARE_EVENT_CLASS(ext4__folio_op,
        TP_PROTO(struct inode *inode, struct folio *folio),

        TP_ARGS(inode, folio),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        pgoff_t, index                        )

        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->index        = folio->index;
        ),

        TP_printk("dev %d,%d ino %lu folio_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->index)
);

/* Event ext4_read_folio: an instance of the ext4__folio_op class. */
DEFINE_EVENT(ext4__folio_op, ext4_read_folio,

        TP_PROTO(struct inode *inode, struct folio *folio),

        TP_ARGS(inode, folio)
);

/* Event ext4_release_folio: an instance of the ext4__folio_op class. */
DEFINE_EVENT(ext4__folio_op, ext4_release_folio,

        TP_PROTO(struct inode *inode, struct folio *folio),

        TP_ARGS(inode, folio)
);

/*
 * Event class ext4_invalidate_folio_op: the shared record layout and
 * format for the events instantiated from it with DEFINE_EVENT().
 *
 * @folio:  folio whose index is recorded; the device and inode number
 *          are read through folio->mapping->host, so that chain is
 *          dereferenced whenever the event is enabled
 * @offset: size_t offset supplied by the caller
 * @length: size_t length supplied by the caller
 *
 * Output: "dev MAJ,MIN ino N folio_index N offset N length N".
 */
DECLARE_EVENT_CLASS(ext4_invalidate_folio_op,
        TP_PROTO(struct folio *folio, size_t offset, size_t length),

        TP_ARGS(folio, offset, length),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        pgoff_t, index                        )
                __field(        size_t, offset                        )
                __field(        size_t, length                        )
        ),

        TP_fast_assign(
                __entry->dev        = folio->mapping->host->i_sb->s_dev;
                __entry->ino        = folio->mapping->host->i_ino;
                __entry->index        = folio->index;
                __entry->offset        = offset;
                __entry->length        = length;
        ),

        TP_printk("dev %d,%d ino %lu folio_index %lu offset %zu length %zu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->index,
                  __entry->offset, __entry->length)
);

/*
 * ext4_invalidate_folio: instance of the ext4_invalidate_folio_op class
 * (folio, offset, length).
 */
DEFINE_EVENT(ext4_invalidate_folio_op, ext4_invalidate_folio,
        TP_PROTO(struct folio *folio, size_t offset, size_t length),

        TP_ARGS(folio, offset, length)
);

/*
 * ext4_journalled_invalidate_folio: instance of the
 * ext4_invalidate_folio_op class (folio, offset, length).
 */
DEFINE_EVENT(ext4_invalidate_folio_op, ext4_journalled_invalidate_folio,
        TP_PROTO(struct folio *folio, size_t offset, size_t length),

        TP_ARGS(folio, offset, length)
);

/*
 * ext4_discard_blocks: records the device of @sb together with the @blk
 * and @count arguments, both stored as 64-bit values.
 */
TRACE_EVENT(ext4_discard_blocks,
        TP_PROTO(struct super_block *sb, unsigned long long blk,
                        unsigned long long count),

        TP_ARGS(sb, blk, count),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u64,        blk                        )
                __field(        __u64,        count                        )

        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->blk        = blk;
                __entry->count        = count;
        ),

        TP_printk("dev %d,%d blk %llu count %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->blk, __entry->count)
);

/*
 * Event class for tracepoints taking an allocation context and a
 * preallocation space.
 *
 * The device comes from ac->ac_sb and the inode number from ac->ac_inode
 * (dereferenced unconditionally); pa_pstart, pa_lstart and pa_len are
 * copied from @pa.
 */
DECLARE_EVENT_CLASS(ext4__mb_new_pa,
        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),

        TP_ARGS(ac, pa),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        pa_pstart                )
                __field(        __u64,        pa_lstart                )
                __field(        __u32,        pa_len                        )

        ),

        TP_fast_assign(
                __entry->dev                = ac->ac_sb->s_dev;
                __entry->ino                = ac->ac_inode->i_ino;
                __entry->pa_pstart        = pa->pa_pstart;
                __entry->pa_lstart        = pa->pa_lstart;
                __entry->pa_len                = pa->pa_len;
        ),

        /* Printed order is pstart, len, lstart (not the field order). */
        TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
);

/* ext4_mb_new_inode_pa: instance of the ext4__mb_new_pa class (ac, pa). */
DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa,

        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),

        TP_ARGS(ac, pa)
);

/* ext4_mb_new_group_pa: instance of the ext4__mb_new_pa class (ac, pa). */
DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa,

        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),

        TP_ARGS(ac, pa)
);

/*
 * ext4_mb_release_inode_pa: records the device and inode number reached
 * through pa->pa_inode, plus the @block and @count arguments.
 */
TRACE_EVENT(ext4_mb_release_inode_pa,
        TP_PROTO(struct ext4_prealloc_space *pa,
                 unsigned long long block, unsigned int count),

        TP_ARGS(pa, block, count),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        block                        )
                __field(        __u32,        count                        )

        ),

        TP_fast_assign(
                __entry->dev                = pa->pa_inode->i_sb->s_dev;
                __entry->ino                = pa->pa_inode->i_ino;
                __entry->block                = block;
                __entry->count                = count;
        ),

        TP_printk("dev %d,%d ino %lu block %llu count %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->block, __entry->count)
);

/*
 * ext4_mb_release_group_pa: records the device of @sb and the pa_pstart /
 * pa_len members of @pa.  No inode number is recorded.
 */
TRACE_EVENT(ext4_mb_release_group_pa,
        TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa),

        TP_ARGS(sb, pa),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u64,        pa_pstart                )
                __field(        __u32,        pa_len                        )

        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->pa_pstart        = pa->pa_pstart;
                __entry->pa_len                = pa->pa_len;
        ),

        TP_printk("dev %d,%d pstart %llu len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->pa_pstart, __entry->pa_len)
);

/*
 * ext4_discard_preallocations: records the device and inode number of
 * @inode and the @len argument.
 */
TRACE_EVENT(ext4_discard_preallocations,
        TP_PROTO(struct inode *inode, unsigned int len),

        TP_ARGS(inode, len),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        unsigned int,        len                )

        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->len        = len;
        ),

        /* Note the "len:" label carries a colon, unlike the other labels. */
        TP_printk("dev %d,%d ino %lu len: %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->len)
);

/*
 * ext4_mb_discard_preallocations: records the device of @sb and the
 * @needed argument.
 */
TRACE_EVENT(ext4_mb_discard_preallocations,
        TP_PROTO(struct super_block *sb, int needed),

        TP_ARGS(sb, needed),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        int,        needed                        )

        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->needed        = needed;
        ),

        TP_printk("dev %d,%d needed %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->needed)
);

/*
 * ext4_request_blocks: snapshot of an allocation request.
 *
 * Copies len, logical, goal, lleft, lright, pleft, pright and flags from
 * @ar, and takes the device and inode number from ar->inode.  The flags
 * value is printed through show_mballoc_flags().
 */
TRACE_EVENT(ext4_request_blocks,
        TP_PROTO(struct ext4_allocation_request *ar),

        TP_ARGS(ar),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        unsigned int, len                )
                __field(        __u32,  logical                        )
                __field(        __u32,        lleft                        )
                __field(        __u32,        lright                        )
                __field(        __u64,        goal                        )
                __field(        __u64,        pleft                        )
                __field(        __u64,        pright                        )
                __field(        unsigned int, flags                )
        ),

        TP_fast_assign(
                __entry->dev        = ar->inode->i_sb->s_dev;
                __entry->ino        = ar->inode->i_ino;
                __entry->len        = ar->len;
                __entry->logical = ar->logical;
                __entry->goal        = ar->goal;
                __entry->lleft        = ar->lleft;
                __entry->lright        = ar->lright;
                __entry->pleft        = ar->pleft;
                __entry->pright        = ar->pright;
                __entry->flags        = ar->flags;
        ),

        /*
         * The "logical" field is labelled "lblk" in the output.  The format
         * string ends with a trailing space.
         */
        TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu "
                  "lleft %u lright %u pleft %llu pright %llu ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags),
                  __entry->len, __entry->logical, __entry->goal,
                  __entry->lleft, __entry->lright, __entry->pleft,
                  __entry->pright)
);

/*
 * ext4_allocate_blocks: records the same request fields as
 * ext4_request_blocks (copied from @ar), plus the @block argument.
 * The flags value is printed through show_mballoc_flags().
 */
TRACE_EVENT(ext4_allocate_blocks,
        TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block),

        TP_ARGS(ar, block),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        block                        )
                __field(        unsigned int, len                )
                __field(        __u32,  logical                        )
                __field(        __u32,        lleft                        )
                __field(        __u32,        lright                        )
                __field(        __u64,        goal                        )
                __field(        __u64,        pleft                        )
                __field(        __u64,        pright                        )
                __field(        unsigned int, flags                )
        ),

        TP_fast_assign(
                __entry->dev        = ar->inode->i_sb->s_dev;
                __entry->ino        = ar->inode->i_ino;
                __entry->block        = block;
                __entry->len        = ar->len;
                __entry->logical = ar->logical;
                __entry->goal        = ar->goal;
                __entry->lleft        = ar->lleft;
                __entry->lright        = ar->lright;
                __entry->pleft        = ar->pleft;
                __entry->pright        = ar->pright;
                __entry->flags        = ar->flags;
        ),

        /* The "logical" field is labelled "lblk" in the output. */
        TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u "
                  "goal %llu lleft %u lright %u pleft %llu pright %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags),
                  __entry->len, __entry->block, __entry->logical,
                  __entry->goal,  __entry->lleft, __entry->lright,
                  __entry->pleft, __entry->pright)
);

/*
 * ext4_free_blocks: records the device, inode number and i_mode of @inode
 * together with the @block, @count and @flags arguments.
 *
 * The mode is printed in octal and the flags through show_free_flags().
 */
TRACE_EVENT(ext4_free_blocks,
        TP_PROTO(struct inode *inode, __u64 block, unsigned long count,
                 int flags),

        TP_ARGS(inode, block, count, flags),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        block                        )
                __field(        unsigned long,        count                )
                __field(        int,        flags                        )
                __field(        __u16,        mode                        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->block                = block;
                __entry->count                = count;
                __entry->flags                = flags;
                __entry->mode                = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode, __entry->block, __entry->count,
                  show_free_flags(__entry->flags))
);

/*
 * ext4_sync_file_enter: records the device, the inode number of the file's
 * dentry, the inode number of that dentry's parent, and @datasync.
 */
TRACE_EVENT(ext4_sync_file_enter,
        TP_PROTO(struct file *file, int datasync),

        TP_ARGS(file, datasync),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        ino_t,        parent                        )
                __field(        int,        datasync                )
        ),

        TP_fast_assign(
                /* Everything below is derived from the file's dentry. */
                struct dentry *dentry = file->f_path.dentry;

                __entry->dev                = dentry->d_sb->s_dev;
                __entry->ino                = d_inode(dentry)->i_ino;
                __entry->datasync        = datasync;
                __entry->parent                = d_inode(dentry->d_parent)->i_ino;
        ),

        /* The format string ends with a trailing space. */
        TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->parent, __entry->datasync)
);

/*
 * ext4_sync_file_exit: records the device and inode number of @inode and
 * the @ret argument.
 */
TRACE_EVENT(ext4_sync_file_exit,
        TP_PROTO(struct inode *inode, int ret),

        TP_ARGS(inode, ret),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        ret                        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->ret                = ret;
        ),

        TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret)
);

/* ext4_sync_fs: records the device of @sb and the @wait argument. */
TRACE_EVENT(ext4_sync_fs,
        TP_PROTO(struct super_block *sb, int wait),

        TP_ARGS(sb, wait),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        int,        wait                        )

        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->wait        = wait;
        ),

        TP_printk("dev %d,%d wait %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->wait)
);

/*
 * ext4_alloc_da_blocks: records the device and inode number of @inode and
 * the current EXT4_I(inode)->i_reserved_data_blocks value.
 */
TRACE_EVENT(ext4_alloc_da_blocks,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field( unsigned int,        data_blocks                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
        ),

        /* The data_blocks field is labelled "reserved_data_blocks". */
        TP_printk("dev %d,%d ino %lu reserved_data_blocks %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->data_blocks)
);

/*
 * ext4_mballoc_alloc: snapshot of an allocation context.
 *
 * Three extents are recorded, each as logical/start/group/len:
 *   orig_*   from ac->ac_o_ex
 *   goal_*   from ac->ac_g_ex
 *   result_* from ac->ac_f_ex
 * plus the ac_found, ac_groups_scanned, ac_buddy, ac_flags, ac_tail and
 * ac_criteria members.  The device and inode number come from
 * ac->ac_inode, which is dereferenced unconditionally.
 */
TRACE_EVENT(ext4_mballoc_alloc,
        TP_PROTO(struct ext4_allocation_context *ac),

        TP_ARGS(ac),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u32,         orig_logical                )
                __field(          int,        orig_start                )
                __field(        __u32,         orig_group                )
                __field(          int,        orig_len                )
                __field(        __u32,         goal_logical                )
                __field(          int,        goal_start                )
                __field(        __u32,         goal_group                )
                __field(          int,        goal_len                )
                __field(        __u32,         result_logical                )
                __field(          int,        result_start                )
                __field(        __u32,         result_group                )
                __field(          int,        result_len                )
                __field(        __u16,        found                        )
                __field(        __u16,        groups                        )
                __field(        __u16,        buddy                        )
                __field(        __u16,        flags                        )
                __field(        __u16,        tail                        )
                __field(        __u8,        cr                        )
        ),

        TP_fast_assign(
                __entry->dev                = ac->ac_inode->i_sb->s_dev;
                __entry->ino                = ac->ac_inode->i_ino;
                __entry->orig_logical        = ac->ac_o_ex.fe_logical;
                __entry->orig_start        = ac->ac_o_ex.fe_start;
                __entry->orig_group        = ac->ac_o_ex.fe_group;
                __entry->orig_len        = ac->ac_o_ex.fe_len;
                __entry->goal_logical        = ac->ac_g_ex.fe_logical;
                __entry->goal_start        = ac->ac_g_ex.fe_start;
                __entry->goal_group        = ac->ac_g_ex.fe_group;
                __entry->goal_len        = ac->ac_g_ex.fe_len;
                __entry->result_logical        = ac->ac_f_ex.fe_logical;
                __entry->result_start        = ac->ac_f_ex.fe_start;
                __entry->result_group        = ac->ac_f_ex.fe_group;
                __entry->result_len        = ac->ac_f_ex.fe_len;
                __entry->found                = ac->ac_found;
                __entry->flags                = ac->ac_flags;
                __entry->groups                = ac->ac_groups_scanned;
                __entry->buddy                = ac->ac_buddy;
                __entry->tail                = ac->ac_tail;
                __entry->cr                = ac->ac_criteria;
        ),

        /*
         * Each extent is printed as group/start/len@logical.  "blks" is the
         * found field and "grps" the groups field.  "broken" prints
         * 1 << buddy when buddy is non-zero and 0 otherwise.
         */
        TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
                  "result %u/%d/%u@%u blks %u grps %u cr %s flags %s "
                  "tail %u broken %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->orig_group, __entry->orig_start,
                  __entry->orig_len, __entry->orig_logical,
                  __entry->goal_group, __entry->goal_start,
                  __entry->goal_len, __entry->goal_logical,
                  __entry->result_group, __entry->result_start,
                  __entry->result_len, __entry->result_logical,
                  __entry->found, __entry->groups, show_criteria(__entry->cr),
                  show_mballoc_flags(__entry->flags), __entry->tail,
                  __entry->buddy ? 1 << __entry->buddy : 0)
);

/*
 * ext4_mballoc_prealloc: records two extents from an allocation context,
 * each as logical/start/group/len:
 *   orig_*   from ac->ac_o_ex
 *   result_* from ac->ac_b_ex
 * Note that ext4_mballoc_alloc takes its "result" from ac_f_ex instead.
 * The device and inode number come from ac->ac_inode.
 */
TRACE_EVENT(ext4_mballoc_prealloc,
        TP_PROTO(struct ext4_allocation_context *ac),

        TP_ARGS(ac),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u32,         orig_logical                )
                __field(          int,        orig_start                )
                __field(        __u32,         orig_group                )
                __field(          int,        orig_len                )
                __field(        __u32,         result_logical                )
                __field(          int,        result_start                )
                __field(        __u32,         result_group                )
                __field(          int,        result_len                )
        ),

        TP_fast_assign(
                __entry->dev                = ac->ac_inode->i_sb->s_dev;
                __entry->ino                = ac->ac_inode->i_ino;
                __entry->orig_logical        = ac->ac_o_ex.fe_logical;
                __entry->orig_start        = ac->ac_o_ex.fe_start;
                __entry->orig_group        = ac->ac_o_ex.fe_group;
                __entry->orig_len        = ac->ac_o_ex.fe_len;
                __entry->result_logical        = ac->ac_b_ex.fe_logical;
                __entry->result_start        = ac->ac_b_ex.fe_start;
                __entry->result_group        = ac->ac_b_ex.fe_group;
                __entry->result_len        = ac->ac_b_ex.fe_len;
        ),

        /* Each extent is printed as group/start/len@logical. */
        TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->orig_group, __entry->orig_start,
                  __entry->orig_len, __entry->orig_logical,
                  __entry->result_group, __entry->result_start,
                  __entry->result_len, __entry->result_logical)
);

/*
 * Event class for tracepoints describing an extent as (group, start, len)
 * on a given superblock.  @inode may be NULL, in which case the inode
 * number is recorded as 0.
 */
DECLARE_EVENT_CLASS(ext4__mballoc,
        TP_PROTO(struct super_block *sb,
                 struct inode *inode,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, inode, group, start, len),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(          int,        result_start                )
                __field(        __u32,         result_group                )
                __field(          int,        result_len                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                /* Tolerate a NULL inode: record inode number 0. */
                __entry->ino                = inode ? inode->i_ino : 0;
                __entry->result_start        = start;
                __entry->result_group        = group;
                __entry->result_len        = len;
        ),

        /*
         * The extent is printed as group/start/len.  The format string ends
         * with a trailing space.
         */
        TP_printk("dev %d,%d inode %lu extent %u/%d/%d ",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->result_group, __entry->result_start,
                  __entry->result_len)
);

/*
 * ext4_mballoc_discard: instance of the ext4__mballoc class
 * (sb, inode, group, start, len).
 */
DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard,

        TP_PROTO(struct super_block *sb,
                 struct inode *inode,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, inode, group, start, len)
);

/*
 * ext4_mballoc_free: instance of the ext4__mballoc class
 * (sb, inode, group, start, len).
 */
DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free,

        TP_PROTO(struct super_block *sb,
                 struct inode *inode,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, inode, group, start, len)
);

/*
 * ext4_forget: records the device, inode number and i_mode of @inode
 * together with the @is_metadata and @block arguments.  The mode is
 * printed in octal.
 */
TRACE_EVENT(ext4_forget,
        TP_PROTO(struct inode *inode, int is_metadata, __u64 block),

        TP_ARGS(inode, is_metadata, block),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        block                        )
                __field(        int,        is_metadata                )
                __field(        __u16,        mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->block        = block;
                __entry->is_metadata = is_metadata;
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode, __entry->is_metadata, __entry->block)
);

/*
 * ext4_da_update_reserve_space: records the device, inode number, i_mode
 * and i_blocks of @inode, the current
 * EXT4_I(inode)->i_reserved_data_blocks value, and the @used_blocks and
 * @quota_claim arguments.
 */
TRACE_EVENT(ext4_da_update_reserve_space,
        TP_PROTO(struct inode *inode, int used_blocks, int quota_claim),

        TP_ARGS(inode, used_blocks, quota_claim),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        i_blocks                )
                __field(        int,        used_blocks                )
                __field(        int,        reserved_data_blocks        )
                __field(        int,        quota_claim                )
                __field(        __u16,        mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->i_blocks = inode->i_blocks;
                __entry->used_blocks = used_blocks;
                __entry->reserved_data_blocks =
                                EXT4_I(inode)->i_reserved_data_blocks;
                __entry->quota_claim = quota_claim;
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d "
                  "reserved_data_blocks %d quota_claim %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode, __entry->i_blocks,
                  __entry->used_blocks, __entry->reserved_data_blocks,
                  __entry->quota_claim)
);

/*
 * ext4_da_reserve_space: records the device, inode number, i_mode and
 * i_blocks of @inode and the current
 * EXT4_I(inode)->i_reserved_data_blocks value.
 */
TRACE_EVENT(ext4_da_reserve_space,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        i_blocks                )
                __field(        int,        reserved_data_blocks        )
                __field(        __u16,  mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->i_blocks = inode->i_blocks;
                __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu "
                  "reserved_data_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode, __entry->i_blocks,
                  __entry->reserved_data_blocks)
);

/*
 * ext4_da_release_space: records the device, inode number, i_mode and
 * i_blocks of @inode, the current
 * EXT4_I(inode)->i_reserved_data_blocks value, and the @freed_blocks
 * argument.
 */
TRACE_EVENT(ext4_da_release_space,
        TP_PROTO(struct inode *inode, int freed_blocks),

        TP_ARGS(inode, freed_blocks),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        i_blocks                )
                __field(        int,        freed_blocks                )
                __field(        int,        reserved_data_blocks        )
                __field(        __u16,  mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->i_blocks = inode->i_blocks;
                __entry->freed_blocks = freed_blocks;
                __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                __entry->mode        = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d "
                  "reserved_data_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode, __entry->i_blocks,
                  __entry->freed_blocks, __entry->reserved_data_blocks)
);

/*
 * Event class for bitmap-load tracepoints: records the device of @sb and
 * the @group argument.  @group is passed as unsigned long but stored in a
 * __u32 field.
 */
DECLARE_EVENT_CLASS(ext4__bitmap_load,
        TP_PROTO(struct super_block *sb, unsigned long group),

        TP_ARGS(sb, group),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u32,        group                        )

        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->group        = group;
        ),

        TP_printk("dev %d,%d group %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->group)
);

/* ext4_mb_bitmap_load: instance of the ext4__bitmap_load class (sb, group). */
DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load,

        TP_PROTO(struct super_block *sb, unsigned long group),

        TP_ARGS(sb, group)
);

/*
 * ext4_mb_buddy_bitmap_load: instance of the ext4__bitmap_load class
 * (sb, group).
 */
DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load,

        TP_PROTO(struct super_block *sb, unsigned long group),

        TP_ARGS(sb, group)
);

/*
 * ext4_load_inode_bitmap: instance of the ext4__bitmap_load class
 * (sb, group).
 */
DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap,

        TP_PROTO(struct super_block *sb, unsigned long group),

        TP_ARGS(sb, group)
);

/*
 * ext4_read_block_bitmap_load: records the device of @sb, the @group
 * argument (stored in a __u32 field) and the @prefetch flag.
 */
TRACE_EVENT(ext4_read_block_bitmap_load,
        TP_PROTO(struct super_block *sb, unsigned long group, bool prefetch),

        TP_ARGS(sb, group, prefetch),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u32,        group                        )
                __field(        bool,        prefetch                )

        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->group        = group;
                __entry->prefetch = prefetch;
        ),

        /* The bool prefetch flag is printed as an integer via %d. */
        TP_printk("dev %d,%d group %u prefetch %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->group, __entry->prefetch)
);

/*
 * Event class for tracepoints taking (inode, offset, len, mode).
 * Records the device and inode number of @inode and the three remaining
 * arguments; the mode is printed through show_falloc_mode().
 */
DECLARE_EVENT_CLASS(ext4__fallocate_mode,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),

        TP_ARGS(inode, offset, len, mode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        offset                        )
                __field(        loff_t, len                        )
                __field(        int,        mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->offset        = offset;
                __entry->len        = len;
                __entry->mode        = mode;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset, __entry->len,
                  show_falloc_mode(__entry->mode))
);

/*
 * ext4_fallocate_enter: instance of the ext4__fallocate_mode class
 * (inode, offset, len, mode).
 */
DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter,

        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),

        TP_ARGS(inode, offset, len, mode)
);

/*
 * ext4_punch_hole: instance of the ext4__fallocate_mode class
 * (inode, offset, len, mode).
 */
DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole,

        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),

        TP_ARGS(inode, offset, len, mode)
);

/*
 * ext4_zero_range: instance of the ext4__fallocate_mode class
 * (inode, offset, len, mode).
 */
DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,

        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),

        TP_ARGS(inode, offset, len, mode)
);

/*
 * ext4_fallocate_exit: records the device and inode number of @inode and
 * the @offset, @max_blocks and @ret arguments.  @offset is stored in the
 * "pos" field and @max_blocks in the "blocks" field.
 */
TRACE_EVENT(ext4_fallocate_exit,
        TP_PROTO(struct inode *inode, loff_t offset,
                 unsigned int max_blocks, int ret),

        TP_ARGS(inode, offset, max_blocks, ret),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        pos                        )
                __field(        unsigned int,        blocks                )
                __field(        int,         ret                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pos        = offset;
                __entry->blocks        = max_blocks;
                __entry->ret        = ret;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->blocks,
                  __entry->ret)
);

/*
 * ext4_unlink_enter: records the device of @dentry's superblock, the inode
 * number and i_size of @dentry's inode, and the inode number of @parent.
 *
 * d_inode(dentry) is dereferenced without a NULL check, so the event
 * relies on the caller passing a dentry that has an inode.
 */
TRACE_EVENT(ext4_unlink_enter,
        TP_PROTO(struct inode *parent, struct dentry *dentry),

        TP_ARGS(parent, dentry),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        ino_t,        parent                        )
                __field(        loff_t,        size                        )
        ),

        TP_fast_assign(
                __entry->dev                = dentry->d_sb->s_dev;
                __entry->ino                = d_inode(dentry)->i_ino;
                __entry->parent                = parent->i_ino;
                __entry->size                = d_inode(dentry)->i_size;
        ),

        /* Note: 'size' is printed before 'parent', unlike the field order. */
        TP_printk("dev %d,%d ino %lu size %lld parent %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->size,
                  (unsigned long) __entry->parent)
);

/*
 * ext4_unlink_exit: records the device of @dentry's superblock, the inode
 * number of @dentry's inode and @ret.
 *
 * d_inode(dentry) is dereferenced without a NULL check, so the event
 * relies on the caller passing a dentry that has an inode.
 */
TRACE_EVENT(ext4_unlink_exit,
        TP_PROTO(struct dentry *dentry, int ret),

        TP_ARGS(dentry, ret),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        int,        ret                        )
        ),

        TP_fast_assign(
                __entry->dev                = dentry->d_sb->s_dev;
                __entry->ino                = d_inode(dentry)->i_ino;
                __entry->ret                = ret;
        ),

        TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret)
);

/*
 * ext4__truncate: event class taking only an inode. Records the device,
 * the inode number and the inode's i_blocks value at the time of the event.
 */
DECLARE_EVENT_CLASS(ext4__truncate,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        __u64,                blocks                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->blocks        = inode->i_blocks;
        ),

        TP_printk("dev %d,%d ino %lu blocks %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->blocks)
);

/*
 * ext4_truncate_enter: tracepoint instantiated from the ext4__truncate
 * event class (device, inode number, i_blocks).
 */
DEFINE_EVENT(ext4__truncate, ext4_truncate_enter,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * ext4_truncate_exit: tracepoint instantiated from the ext4__truncate
 * event class (device, inode number, i_blocks).
 */
DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * ext4_ext_convert_to_initialized_enter: records the requested mapping and
 * the extent it applies to.
 *
 * 'map' supplies m_lblk and m_len.
 * 'ux' is the unwritten extent; its starting logical block (ee_block,
 * converted from little-endian), actual length and physical block are
 * recorded as u_lblk, u_len and u_pblk.
 */
TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
                 struct ext4_extent *ux),

        TP_ARGS(inode, map, ux),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        m_lblk        )
                __field(        unsigned,        m_len        )
                __field(        ext4_lblk_t,        u_lblk        )
                __field(        unsigned,        u_len        )
                __field(        ext4_fsblk_t,        u_pblk        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->m_lblk                = map->m_lblk;
                __entry->m_len                = map->m_len;
                __entry->u_lblk                = le32_to_cpu(ux->ee_block);
                __entry->u_len                = ext4_ext_get_actual_len(ux);
                __entry->u_pblk                = ext4_ext_pblock(ux);
        ),

        TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u "
                  "u_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->m_lblk, __entry->m_len,
                  __entry->u_lblk, __entry->u_len, __entry->u_pblk)
);

/*
 * ext4_ext_convert_to_initialized_fastpath: records the requested mapping
 * together with the two extents involved.
 *
 * 'map' supplies m_lblk and m_len.
 * 'ux' is the unwritten extent.
 * 'ix' is the initialized extent to which blocks are transferred.
 *
 * For each extent the starting logical block (ee_block, converted from
 * little-endian), the actual length and the physical block are recorded,
 * prefixed u_ and i_ respectively.
 */
TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
                 struct ext4_extent *ux, struct ext4_extent *ix),

        TP_ARGS(inode, map, ux, ix),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        m_lblk        )
                __field(        unsigned,        m_len        )
                __field(        ext4_lblk_t,        u_lblk        )
                __field(        unsigned,        u_len        )
                __field(        ext4_fsblk_t,        u_pblk        )
                __field(        ext4_lblk_t,        i_lblk        )
                __field(        unsigned,        i_len        )
                __field(        ext4_fsblk_t,        i_pblk        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->m_lblk                = map->m_lblk;
                __entry->m_len                = map->m_len;
                __entry->u_lblk                = le32_to_cpu(ux->ee_block);
                __entry->u_len                = ext4_ext_get_actual_len(ux);
                __entry->u_pblk                = ext4_ext_pblock(ux);
                __entry->i_lblk                = le32_to_cpu(ix->ee_block);
                __entry->i_len                = ext4_ext_get_actual_len(ix);
                __entry->i_pblk                = ext4_ext_pblock(ix);
        ),

        /* The last fragment carries no trailing space. */
        TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u "
                  "u_lblk %u u_len %u u_pblk %llu "
                  "i_lblk %u i_len %u i_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->m_lblk, __entry->m_len,
                  __entry->u_lblk, __entry->u_len, __entry->u_pblk,
                  __entry->i_lblk, __entry->i_len, __entry->i_pblk)
);

/*
 * ext4__map_blocks_enter: event class recording the device, inode number,
 * starting logical block @lblk, @len and @flags of a block-mapping request.
 * The flags are stored raw and decoded with show_map_flags() when printed.
 */
DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned int len, unsigned int flags),

        TP_ARGS(inode, lblk, len, flags),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        flags                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->lblk        = lblk;
                __entry->len        = len;
                __entry->flags        = flags;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len, show_map_flags(__entry->flags))
);

/*
 * ext4_ext_map_blocks_enter: tracepoint instantiated from the
 * ext4__map_blocks_enter event class (inode, lblk, len, flags).
 */
DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned len, unsigned flags),

        TP_ARGS(inode, lblk, len, flags)
);

/*
 * ext4_ind_map_blocks_enter: tracepoint instantiated from the
 * ext4__map_blocks_enter event class (inode, lblk, len, flags).
 */
DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned len, unsigned flags),

        TP_ARGS(inode, lblk, len, flags)
);

/*
 * ext4__map_blocks_exit: event class recording the outcome of a
 * block-mapping request: the device, inode number, the request @flags, the
 * m_pblk, m_lblk, m_len and m_flags members of @map, and @ret.
 *
 * When printed, 'flags' is decoded with show_map_flags() and 'mflags' (the
 * map's m_flags) with show_mflags().
 */
DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
        TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map,
                 int ret),

        TP_ARGS(inode, flags, map, ret),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        unsigned int,        flags                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        mflags                )
                __field(        int,                ret                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->flags        = flags;
                __entry->pblk        = map->m_pblk;
                __entry->lblk        = map->m_lblk;
                __entry->len        = map->m_len;
                __entry->mflags        = map->m_flags;
                __entry->ret        = ret;
        ),

        TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u "
                  "mflags %s ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  show_map_flags(__entry->flags), __entry->lblk, __entry->pblk,
                  __entry->len, show_mflags(__entry->mflags), __entry->ret)
);

/*
 * ext4_ext_map_blocks_exit: tracepoint instantiated from the
 * ext4__map_blocks_exit event class (inode, flags, map, ret).
 */
DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit,
        TP_PROTO(struct inode *inode, unsigned flags,
                 struct ext4_map_blocks *map, int ret),

        TP_ARGS(inode, flags, map, ret)
);

/*
 * ext4_ind_map_blocks_exit: tracepoint instantiated from the
 * ext4__map_blocks_exit event class (inode, flags, map, ret).
 */
DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit,
        TP_PROTO(struct inode *inode, unsigned flags,
                 struct ext4_map_blocks *map, int ret),

        TP_ARGS(inode, flags, map, ret)
);

/*
 * ext4_ext_load_extent: records the device, inode number, logical block
 * @lblk and physical block @pblk.
 */
TRACE_EVENT(ext4_ext_load_extent,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk),

        TP_ARGS(inode, lblk, pblk),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        ext4_lblk_t,        lblk                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->pblk        = pblk;
                __entry->lblk        = lblk;
        ),

        /* lblk is printed before pblk although pblk is stored first. */
        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->pblk)
);

/*
 * ext4_load_inode: records the device of @sb and the inode number @ino.
 * Unlike most events in this file it takes a superblock and a bare inode
 * number rather than a struct inode.
 */
TRACE_EVENT(ext4_load_inode,
        TP_PROTO(struct super_block *sb, unsigned long ino),

        TP_ARGS(sb, ino),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                )
                __field(        ino_t,        ino                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->ino                = ino;
        ),

        /* The argument is cast to unsigned long, so it is printed with %lu. */
        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * ext4_journal_start_sb: records the device of @sb, the @blocks,
 * @rsv_blocks, @revoke_creds and @type values, and the caller address @IP.
 * The address is stored as an unsigned long and printed with %pS.
 */
TRACE_EVENT(ext4_journal_start_sb,
        TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks,
                 int revoke_creds, int type, unsigned long IP),

        TP_ARGS(sb, blocks, rsv_blocks, revoke_creds, type, IP),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        unsigned long,        ip                )
                __field(        int,                blocks                )
                __field(        int,                rsv_blocks        )
                __field(        int,                revoke_creds        )
                __field(        int,                type                )
        ),

        TP_fast_assign(
                __entry->dev                 = sb->s_dev;
                __entry->ip                 = IP;
                __entry->blocks                 = blocks;
                __entry->rsv_blocks         = rsv_blocks;
                __entry->revoke_creds         = revoke_creds;
                __entry->type                 = type;
        ),

        TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d,"
                  " type %d, caller %pS", MAJOR(__entry->dev),
                  MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks,
                  __entry->revoke_creds, __entry->type, (void *)__entry->ip)
);

/*
 * ext4_journal_start_inode: same payload as ext4_journal_start_sb, but
 * keyed by an inode: records the device and inode number of @inode, the
 * @blocks, @rsv_blocks, @revoke_creds and @type values, and the caller
 * address @IP (printed with %pS).
 *
 * Here 'ino' is declared as unsigned long (not ino_t) and is therefore
 * printed without a cast.
 */
TRACE_EVENT(ext4_journal_start_inode,
        TP_PROTO(struct inode *inode, int blocks, int rsv_blocks,
                 int revoke_creds, int type, unsigned long IP),

        TP_ARGS(inode, blocks, rsv_blocks, revoke_creds, type, IP),

        TP_STRUCT__entry(
                __field(        unsigned long,        ino                )
                __field(        dev_t,                dev                )
                __field(        unsigned long,        ip                )
                __field(        int,                blocks                )
                __field(        int,                rsv_blocks        )
                __field(        int,                revoke_creds        )
                __field(        int,                type                )
        ),

        TP_fast_assign(
                __entry->dev                 = inode->i_sb->s_dev;
                __entry->ip                 = IP;
                __entry->blocks                 = blocks;
                __entry->rsv_blocks         = rsv_blocks;
                __entry->revoke_creds         = revoke_creds;
                __entry->type                 = type;
                __entry->ino                 = inode->i_ino;
        ),

        TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d,"
                  " type %d, ino %lu, caller %pS", MAJOR(__entry->dev),
                  MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks,
                  __entry->revoke_creds, __entry->type, __entry->ino,
                  (void *)__entry->ip)
);

/*
 * ext4_journal_start_reserved: records the device of @sb, the @blocks
 * value and the caller address @IP (stored as an unsigned long and printed
 * with %pS).
 */
TRACE_EVENT(ext4_journal_start_reserved,
        TP_PROTO(struct super_block *sb, int blocks, unsigned long IP),

        TP_ARGS(sb, blocks, IP),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(unsigned long,        ip                        )
                __field(          int,        blocks                        )
        ),

        TP_fast_assign(
                __entry->dev                 = sb->s_dev;
                __entry->ip                 = IP;
                __entry->blocks                 = blocks;
        ),

        /*
         * Same "blocks %d, ... caller %pS" layout as the other
         * ext4_journal_start_* events: the comma follows the value.
         */
        TP_printk("dev %d,%d blocks %d, caller %pS",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->blocks, (void *)__entry->ip)
);

/*
 * ext4__trim: event class recording a block range within a block group:
 * the device of @sb, @group, @start and @len.
 *
 * Unlike the other events in this file, the device is split into major and
 * minor numbers at record time and stored as two ints, rather than being
 * stored as a dev_t and split when printed.
 */
DECLARE_EVENT_CLASS(ext4__trim,
        TP_PROTO(struct super_block *sb,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len),

        TP_STRUCT__entry(
                __field(        int,        dev_major                )
                __field(        int,        dev_minor                )
                __field(        __u32,         group                        )
                __field(        int,        start                        )
                __field(        int,        len                        )
        ),

        TP_fast_assign(
                __entry->dev_major        = MAJOR(sb->s_dev);
                __entry->dev_minor        = MINOR(sb->s_dev);
                __entry->group                = group;
                __entry->start                = start;
                __entry->len                = len;
        ),

        TP_printk("dev %d,%d group %u, start %d, len %d",
                  __entry->dev_major, __entry->dev_minor,
                  __entry->group, __entry->start, __entry->len)
);

/*
 * ext4_trim_extent: tracepoint instantiated from the ext4__trim event
 * class (sb, group, start, len).
 */
DEFINE_EVENT(ext4__trim, ext4_trim_extent,

        TP_PROTO(struct super_block *sb,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len)
);

/*
 * ext4_trim_all_free: tracepoint instantiated from the ext4__trim event
 * class (sb, group, start, len).
 */
DEFINE_EVENT(ext4__trim, ext4_trim_all_free,

        TP_PROTO(struct super_block *sb,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len)
);

/*
 * ext4_ext_handle_unwritten_extents: records the device and inode number of
 * @inode, the m_lblk, m_pblk and m_len members of @map, the request @flags
 * (decoded with show_map_flags() when printed), @allocated and @newblock.
 */
TRACE_EVENT(ext4_ext_handle_unwritten_extents,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags,
                 unsigned int allocated, ext4_fsblk_t newblock),

        TP_ARGS(inode, map, flags, allocated, newblock),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        int,                flags                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        allocated        )
                __field(        ext4_fsblk_t,        newblk                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->flags                = flags;
                __entry->lblk                = map->m_lblk;
                __entry->pblk                = map->m_pblk;
                __entry->len                = map->m_len;
                __entry->allocated        = allocated;
                __entry->newblk                = newblock;
        ),

        /* 'allocated' is an unsigned int, so it is printed with %u. */
        TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s "
                  "allocated %u newblock %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
                  __entry->len, show_map_flags(__entry->flags),
                  (unsigned int) __entry->allocated,
                  (unsigned long long) __entry->newblk)
);

/*
 * ext4_get_implied_cluster_alloc_exit: records the device of @sb, the
 * m_flags, m_lblk, m_pblk and m_len members of @map, and @ret.
 *
 * The 'flags' field here holds the map's m_flags and is decoded with
 * show_mflags() when printed.
 */
TRACE_EVENT(ext4_get_implied_cluster_alloc_exit,
        TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret),

        TP_ARGS(sb, map, ret),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        unsigned int,        flags        )
                __field(        ext4_lblk_t,        lblk        )
                __field(        ext4_fsblk_t,        pblk        )
                __field(        unsigned int,        len        )
                __field(        int,                ret        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->flags        = map->m_flags;
                __entry->lblk        = map->m_lblk;
                __entry->pblk        = map->m_pblk;
                __entry->len        = map->m_len;
                __entry->ret        = ret;
        ),

        TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->lblk, (unsigned long long) __entry->pblk,
                  __entry->len, show_mflags(__entry->flags), __entry->ret)
);

/*
 * ext4_ext_show_extent: records the device and inode number of @inode and
 * one extent described by its logical block @lblk, physical block @pblk
 * and length @len.
 */
TRACE_EVENT(ext4_ext_show_extent,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
                 unsigned short len),

        TP_ARGS(inode, lblk, pblk, len),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_fsblk_t,        pblk        )
                __field(        ext4_lblk_t,        lblk        )
                __field(        unsigned short,        len        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pblk        = pblk;
                __entry->lblk        = lblk;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->lblk,
                  (unsigned long long) __entry->pblk,
                  (unsigned short) __entry->len)
);

/*
 * ext4_remove_blocks: records a block-removal range and its context:
 *  - the device and inode number of @inode;
 *  - the extent @ex: starting logical block (ee_block, converted from
 *    little-endian), physical block and actual length;
 *  - the range @from .. @to;
 *  - the pclu, lblk and state members of the partial cluster @pc.
 *
 * NOTE(review): @to is declared ext4_fsblk_t in the prototype but is stored
 * in an ext4_lblk_t field and printed with %u; the prototype type looks
 * like a typo for ext4_lblk_t -- confirm against the callers.
 */
TRACE_EVENT(ext4_remove_blocks,
        TP_PROTO(struct inode *inode, struct ext4_extent *ex,
                 ext4_lblk_t from, ext4_fsblk_t to,
                 struct partial_cluster *pc),

        TP_ARGS(inode, ex, from, to, pc),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        from        )
                __field(        ext4_lblk_t,        to        )
                __field(        ext4_fsblk_t,        ee_pblk        )
                __field(        ext4_lblk_t,        ee_lblk        )
                __field(        unsigned short,        ee_len        )
                __field(        ext4_fsblk_t,        pc_pclu        )
                __field(        ext4_lblk_t,        pc_lblk        )
                __field(        int,                pc_state)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->from                = from;
                __entry->to                = to;
                __entry->ee_pblk        = ext4_ext_pblock(ex);
                __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
                __entry->ee_len                = ext4_ext_get_actual_len(ex);
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
        ),

        /*
         * The two literals are concatenated by the compiler, so the first
         * one must end with a space to keep "]" and "from" apart.
         * pc_pclu is printed as a signed value (%lld).
         */
        TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u] "
                  "from %u to %u partial [pclu %lld lblk %u state %d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->ee_lblk,
                  (unsigned long long) __entry->ee_pblk,
                  (unsigned short) __entry->ee_len,
                  (unsigned) __entry->from,
                  (unsigned) __entry->to,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state)
);

/*
 * ext4_ext_rm_leaf: records the device and inode number of @inode, the
 * starting logical block @start, the extent @ex (starting logical block
 * converted from little-endian, physical block, actual length) and the
 * pclu, lblk and state members of the partial cluster @pc.
 *
 * NOTE(review): ee_len is declared as (signed) short here but is cast to
 * unsigned short when printed, and the sibling ext4_remove_blocks event
 * declares it unsigned short.
 */
TRACE_EVENT(ext4_ext_rm_leaf,
        TP_PROTO(struct inode *inode, ext4_lblk_t start,
                 struct ext4_extent *ex,
                 struct partial_cluster *pc),

        TP_ARGS(inode, start, ex, pc),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        start        )
                __field(        ext4_lblk_t,        ee_lblk        )
                __field(        ext4_fsblk_t,        ee_pblk        )
                __field(        short,                ee_len        )
                __field(        ext4_fsblk_t,        pc_pclu        )
                __field(        ext4_lblk_t,        pc_lblk        )
                __field(        int,                pc_state)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->start                = start;
                __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
                __entry->ee_pblk        = ext4_ext_pblock(ex);
                __entry->ee_len                = ext4_ext_get_actual_len(ex);
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
        ),

        /*
         * The two literals are concatenated by the compiler, so the first
         * one must end with a space to keep "]" and "partial" apart.
         * pc_pclu is printed as a signed value (%lld).
         */
        TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u] "
                  "partial [pclu %lld lblk %u state %d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start,
                  (unsigned) __entry->ee_lblk,
                  (unsigned long long) __entry->ee_pblk,
                  (unsigned short) __entry->ee_len,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state)
);

/*
 * ext4_ext_rm_idx: records the device and inode number of @inode and the
 * physical block @pblk, printed as 'index_pblk'.
 */
TRACE_EVENT(ext4_ext_rm_idx,
        TP_PROTO(struct inode *inode, ext4_fsblk_t pblk),

        TP_ARGS(inode, pblk),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_fsblk_t,        pblk        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pblk        = pblk;
        ),

        TP_printk("dev %d,%d ino %lu index_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long long) __entry->pblk)
);

/*
 * ext4_ext_remove_space: records the device and inode number of @inode,
 * the logical block range @start .. @end and @depth.
 * The 'start' field is printed under the label "since".
 */
TRACE_EVENT(ext4_ext_remove_space,
        TP_PROTO(struct inode *inode, ext4_lblk_t start,
                 ext4_lblk_t end, int depth),

        TP_ARGS(inode, start, end, depth),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        start        )
                __field(        ext4_lblk_t,        end        )
                __field(        int,                depth        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->start        = start;
                __entry->end        = end;
                __entry->depth        = depth;
        ),

        TP_printk("dev %d,%d ino %lu since %u end %u depth %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start,
                  (unsigned) __entry->end,
                  __entry->depth)
);

/*
 * ext4_ext_remove_space_done: records the same range information as
 * ext4_ext_remove_space (device, inode number, @start, @end, @depth) plus
 * the pclu, lblk and state members of the partial cluster @pc and
 * @eh_entries.
 *
 * @eh_entries is passed in little-endian form (__le16) and converted with
 * le16_to_cpu() before being stored; it is printed as
 * "remaining_entries". The 'start' field is printed under the label
 * "since".
 */
TRACE_EVENT(ext4_ext_remove_space_done,
        TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end,
                 int depth, struct partial_cluster *pc, __le16 eh_entries),

        TP_ARGS(inode, start, end, depth, pc, eh_entries),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        start                )
                __field(        ext4_lblk_t,        end                )
                __field(        int,                depth                )
                __field(        ext4_fsblk_t,        pc_pclu                )
                __field(        ext4_lblk_t,        pc_lblk                )
                __field(        int,                pc_state        )
                __field(        unsigned short,        eh_entries        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->start                = start;
                __entry->end                = end;
                __entry->depth                = depth;
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
                __entry->eh_entries        = le16_to_cpu(eh_entries);
        ),

        TP_printk("dev %d,%d ino %lu since %u end %u depth %d "
                  "partial [pclu %lld lblk %u state %d] "
                  "remaining_entries %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start,
                  (unsigned) __entry->end,
                  __entry->depth,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state,
                  (unsigned short) __entry->eh_entries)
);

/*
 * ext4__es_extent: event class describing one extent_status entry @es of
 * @inode. Records the device, inode number, es_lblk, es_len, the physical
 * block as returned by ext4_es_show_pblock() and the status as returned by
 * ext4_es_status().
 *
 * The status is stored in a single char and decoded with
 * show_extent_status() when printed. The range is printed as
 * "[lblk/len)".
 */
DECLARE_EVENT_CLASS(ext4__es_extent,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_lblk_t,        len                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        char, status        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = es->es_lblk;
                __entry->len        = es->es_len;
                __entry->pblk        = ext4_es_show_pblock(es);
                __entry->status        = ext4_es_status(es);
        ),

        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len,
                  __entry->pblk, show_extent_status(__entry->status))
);

/*
 * ext4_es_insert_extent: tracepoint instantiated from the ext4__es_extent
 * event class (inode, extent_status entry).
 */
DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es)
);

/*
 * ext4_es_cache_extent: tracepoint instantiated from the ext4__es_extent
 * event class (inode, extent_status entry).
 */
DEFINE_EVENT(ext4__es_extent, ext4_es_cache_extent,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es)
);

/*
 * ext4_es_remove_extent: records the device and inode number of @inode and
 * the range @lblk / @len, printed as "[lblk/len)".
 *
 * NOTE(review): @lblk and @len arrive as ext4_lblk_t but are stored in
 * loff_t fields and printed with %lld, unlike the ext4__es_extent class,
 * which stores them as ext4_lblk_t and prints %u. Changing the field types
 * would alter the recorded event layout, so they are left as they are.
 */
TRACE_EVENT(ext4_es_remove_extent,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len),

        TP_ARGS(inode, lblk, len),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        lblk                        )
                __field(        loff_t,        len                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = lblk;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len)
);

/*
 * Tracepoint taking an inode and a logical block number.
 *
 * Records the superblock device, the inode number and lblk.
 */
TRACE_EVENT(ext4_es_find_extent_range_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk),

        TP_ARGS(inode, lblk),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = lblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->lblk)
);

/*
 * Tracepoint taking an inode and an extent_status entry.
 *
 * Records the superblock device, the inode number, the entry's es_lblk and
 * es_len, the result of ext4_es_show_pblock(es) and the result of
 * ext4_es_status(es).  Field list and print format match the
 * ext4__es_extent event class, but this event is declared standalone.
 */
TRACE_EVENT(ext4_es_find_extent_range_exit,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_lblk_t,        len                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        char, status        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = es->es_lblk;
                __entry->len        = es->es_len;
                __entry->pblk        = ext4_es_show_pblock(es);
                __entry->status        = ext4_es_status(es);
        ),

        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len,
                  __entry->pblk, show_extent_status(__entry->status))
);

/*
 * Tracepoint taking an inode and a logical block number.
 *
 * Records the superblock device, the inode number and lblk.
 */
TRACE_EVENT(ext4_es_lookup_extent_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk),

        TP_ARGS(inode, lblk),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = lblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->lblk)
);

/*
 * Tracepoint taking an inode, an extent_status entry and a "found" flag.
 *
 * Records the superblock device, the inode number, the entry's es_lblk and
 * es_len, the result of ext4_es_show_pblock(es), the result of
 * ext4_es_status(es) and found.  All of these are stored unconditionally;
 * the found flag only affects how the record is printed.
 */
TRACE_EVENT(ext4_es_lookup_extent_exit,
        TP_PROTO(struct inode *inode, struct extent_status *es,
                 int found),

        TP_ARGS(inode, es, found),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_lblk_t,        len                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        char,                status                )
                __field(        int,                found                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = es->es_lblk;
                __entry->len        = es->es_len;
                __entry->pblk        = ext4_es_show_pblock(es);
                __entry->status        = ext4_es_status(es);
                __entry->found        = found;
        ),

        /*
         * When found is zero the physical block and the status are printed
         * as 0 instead of the recorded values; lblk and len are always
         * printed as recorded.
         */
        TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->found,
                  __entry->lblk, __entry->len,
                  __entry->found ? __entry->pblk : 0,
                  show_extent_status(__entry->found ? __entry->status : 0))
);

/*
 * Event class taking a super_block and two integer counters.
 *
 * Records the superblock device, nr_to_scan and cache_cnt exactly as
 * passed in.
 */
DECLARE_EVENT_CLASS(ext4__es_shrink_enter,
        TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        int,        nr_to_scan                )
                __field(        int,        cache_cnt                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->nr_to_scan        = nr_to_scan;
                __entry->cache_cnt        = cache_cnt;
        ),

        TP_printk("dev %d,%d nr_to_scan %d cache_cnt %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nr_to_scan, __entry->cache_cnt)
);

/*
 * Instances of the ext4__es_shrink_enter class.  Both take
 * (sb, nr_to_scan, cache_cnt) and share the class's record layout and print
 * format; they differ only in event name.
 */
DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count,
        TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt)
);

DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter,
        TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt)
);

/*
 * Tracepoint taking a super_block and two integer counters.
 *
 * Records the superblock device, nr_shrunk and cache_cnt exactly as
 * passed in.
 */
TRACE_EVENT(ext4_es_shrink_scan_exit,
        TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt),

        TP_ARGS(sb, nr_shrunk, cache_cnt),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        int,        nr_shrunk                )
                __field(        int,        cache_cnt                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->nr_shrunk        = nr_shrunk;
                __entry->cache_cnt        = cache_cnt;
        ),

        TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nr_shrunk, __entry->cache_cnt)
);

/*
 * Tracepoint taking an inode, an offset and a length (both loff_t).
 *
 * Records the superblock device, the inode number, offset and len exactly
 * as passed in.
 */
TRACE_EVENT(ext4_collapse_range,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len),

        TP_ARGS(inode, offset, len),

        TP_STRUCT__entry(
                __field(dev_t,        dev)
                __field(ino_t,        ino)
                __field(loff_t,        offset)
                __field(loff_t, len)
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->offset        = offset;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset, __entry->len)
);

/*
 * Tracepoint taking an inode, an offset and a length (both loff_t).
 *
 * Records the superblock device, the inode number, offset and len exactly
 * as passed in.  Field list and print format are the same as
 * ext4_collapse_range.
 */
TRACE_EVENT(ext4_insert_range,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len),

        TP_ARGS(inode, offset, len),

        TP_STRUCT__entry(
                __field(dev_t,        dev)
                __field(ino_t,        ino)
                __field(loff_t,        offset)
                __field(loff_t, len)
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->offset        = offset;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset, __entry->len)
);

/*
 * Tracepoint taking a super_block, a shrunk count, a scan time, a skipped
 * count and a retried flag.
 *
 * Records the superblock device, nr_shrunk, nr_skipped and retried as
 * passed in.  scan_time is not stored as passed: it is divided by 1000
 * first.
 */
TRACE_EVENT(ext4_es_shrink,
        TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
                 int nr_skipped, int retried),

        TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        int,                nr_shrunk        )
                __field(        unsigned long long, scan_time        )
                __field(        int,                nr_skipped        )
                __field(        int,                retried                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->nr_shrunk        = nr_shrunk;
                /* NOTE(review): presumably ns -> us; confirm the caller's unit. */
                __entry->scan_time        = div_u64(scan_time, 1000);
                __entry->nr_skipped        = nr_skipped;
                __entry->retried        = retried;
        ),

        TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu "
                  "nr_skipped %d retried %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk,
                  __entry->scan_time, __entry->nr_skipped, __entry->retried)
);

/*
 * Tracepoint taking an inode, an extent_status entry and an "allocated"
 * flag.
 *
 * Records the superblock device, the inode number, the entry's es_lblk and
 * es_len, the result of ext4_es_show_pblock(es), the result of
 * ext4_es_status(es) and the allocated flag.  The bool is printed with %d.
 */
TRACE_EVENT(ext4_es_insert_delayed_block,
        TP_PROTO(struct inode *inode, struct extent_status *es,
                 bool allocated),

        TP_ARGS(inode, es, allocated),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_lblk_t,        len                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        char,                status                )
                __field(        bool,                allocated        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->lblk                = es->es_lblk;
                __entry->len                = es->es_len;
                __entry->pblk                = ext4_es_show_pblock(es);
                __entry->status                = ext4_es_status(es);
                __entry->allocated        = allocated;
        ),

        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
                  "allocated %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len,
                  __entry->pblk, show_extent_status(__entry->status),
                  __entry->allocated)
);

/* fsmap traces */
/*
 * Event class taking a super_block and five scalar values: keydev, agno,
 * bno, len and owner.
 *
 * Records the device number of the superblock's block device
 * (sb->s_bdev->bd_dev, not sb->s_dev as most other events here), keydev
 * after decoding it with new_decode_dev(), and agno, bno, len and owner as
 * passed in.  Both devices are printed as "major:minor".
 *
 * The format string deliberately has no trailing newline: the trace core
 * terminates every event line itself, so a "\n" here would emit an empty
 * line after each event.
 */
DECLARE_EVENT_CLASS(ext4_fsmap_class,
        TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len,
                 u64 owner),
        TP_ARGS(sb, keydev, agno, bno, len, owner),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(dev_t, keydev)
                __field(u32, agno)
                __field(u64, bno)
                __field(u64, len)
                __field(u64, owner)
        ),
        TP_fast_assign(
                __entry->dev = sb->s_bdev->bd_dev;
                __entry->keydev = new_decode_dev(keydev);
                __entry->agno = agno;
                __entry->bno = bno;
                __entry->len = len;
                __entry->owner = owner;
        ),
        /* owner is a u64 field but is printed as signed (%lld). */
        TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  MAJOR(__entry->keydev), MINOR(__entry->keydev),
                  __entry->agno,
                  __entry->bno,
                  __entry->len,
                  __entry->owner)
)
/* Declare one named event of ext4_fsmap_class with the class prototype. */
#define DEFINE_FSMAP_EVENT(name) \
DEFINE_EVENT(ext4_fsmap_class, name, \
        TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, \
                 u64 owner), \
        TP_ARGS(sb, keydev, agno, bno, len, owner))
DEFINE_FSMAP_EVENT(ext4_fsmap_low_key);
DEFINE_FSMAP_EVENT(ext4_fsmap_high_key);
DEFINE_FSMAP_EVENT(ext4_fsmap_mapping);

/*
 * Event class taking a super_block and a struct ext4_fsmap record.
 *
 * Records the device number of the superblock's block device
 * (sb->s_bdev->bd_dev), fsmap->fmr_device after decoding it with
 * new_decode_dev(), and fmr_physical, fmr_length, fmr_owner and fmr_flags
 * copied from the record.  Both devices are printed as "major:minor".
 *
 * The format string deliberately has no trailing newline: the trace core
 * terminates every event line itself, so a "\n" here would emit an empty
 * line after each event.
 */
DECLARE_EVENT_CLASS(ext4_getfsmap_class,
        TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap),
        TP_ARGS(sb, fsmap),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(dev_t, keydev)
                __field(u64, block)
                __field(u64, len)
                __field(u64, owner)
                __field(u64, flags)
        ),
        TP_fast_assign(
                __entry->dev = sb->s_bdev->bd_dev;
                __entry->keydev = new_decode_dev(fsmap->fmr_device);
                __entry->block = fsmap->fmr_physical;
                __entry->len = fsmap->fmr_length;
                __entry->owner = fsmap->fmr_owner;
                __entry->flags = fsmap->fmr_flags;
        ),
        /* owner is a u64 field but is printed as signed (%lld). */
        TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  MAJOR(__entry->keydev), MINOR(__entry->keydev),
                  __entry->block,
                  __entry->len,
                  __entry->owner,
                  __entry->flags)
)
/* Declare one named event of ext4_getfsmap_class with the class prototype. */
#define DEFINE_GETFSMAP_EVENT(name) \
DEFINE_EVENT(ext4_getfsmap_class, name, \
        TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), \
        TP_ARGS(sb, fsmap))
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key);
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key);
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping);

/*
 * Tracepoint taking a super_block and a flags value.
 *
 * Records the superblock device and flags.
 *
 * NOTE(review): the argument is unsigned long but the record field is
 * unsigned (int), so where unsigned long is wider the upper bits are
 * dropped on assignment.  Confirm callers only pass small values.
 */
TRACE_EVENT(ext4_shutdown,
        TP_PROTO(struct super_block *sb, unsigned long flags),

        TP_ARGS(sb, flags),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(     unsigned,        flags                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->flags        = flags;
        ),

        TP_printk("dev %d,%d flags %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->flags)
);

/*
 * Tracepoint taking a super_block, a function name and a line number.
 *
 * Records the superblock device, the function pointer value and the line.
 * Only the pointer is stored, not a copy of the string; it is dereferenced
 * with %s when the event is printed.
 *
 * NOTE(review): this relies on the string still being valid at print time
 * (e.g. a string literal such as __func__) - confirm against callers.
 */
TRACE_EVENT(ext4_error,
        TP_PROTO(struct super_block *sb, const char *function,
                 unsigned int line),

        TP_ARGS(sb, function, line),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field( const char *,        function                )
                __field(     unsigned,        line                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->function = function;
                __entry->line        = line;
        ),

        TP_printk("dev %d,%d function %s line %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->function, __entry->line)
);

/*
 * Tracepoint taking a super_block, two group numbers and an I/O count.
 *
 * Records the superblock device, group, next and prefetch_ios (stored in
 * the "ios" field), each as a __u32.
 */
TRACE_EVENT(ext4_prefetch_bitmaps,
            TP_PROTO(struct super_block *sb, ext4_group_t group,
                     ext4_group_t next, unsigned int prefetch_ios),

        TP_ARGS(sb, group, next, prefetch_ios),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u32,        group                        )
                __field(        __u32,        next                        )
                __field(        __u32,        ios                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->group        = group;
                __entry->next        = next;
                __entry->ios        = prefetch_ios;
        ),

        TP_printk("dev %d,%d group %u next %u ios %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->group, __entry->next, __entry->ios)
);

/*
 * Tracepoint taking a super_block and a group number.
 *
 * Records the superblock device and the group (as a __u32).
 */
TRACE_EVENT(ext4_lazy_itable_init,
            TP_PROTO(struct super_block *sb, ext4_group_t group),

        TP_ARGS(sb, group),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u32,        group                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->group        = group;
        ),

        TP_printk("dev %d,%d group %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group)
);

/*
 * Tracepoint taking a super_block, an error code and an offset.
 *
 * Records the superblock device, error and off exactly as passed in.
 */
TRACE_EVENT(ext4_fc_replay_scan,
        TP_PROTO(struct super_block *sb, int error, int off),

        TP_ARGS(sb, error, off),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, error)
                __field(int, off)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->error = error;
                __entry->off = off;
        ),

        TP_printk("dev %d,%d error %d, off %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->error, __entry->off)
);

/*
 * Tracepoint taking a super_block, a tag, an inode number and two
 * caller-defined integers.
 *
 * Records the superblock device, tag, ino, priv1 and priv2 exactly as
 * passed in (all as int).  priv1 and priv2 are printed under the labels
 * "data1" and "data2".
 */
TRACE_EVENT(ext4_fc_replay,
        TP_PROTO(struct super_block *sb, int tag, int ino, int priv1, int priv2),

        TP_ARGS(sb, tag, ino, priv1, priv2),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, tag)
                __field(int, ino)
                __field(int, priv1)
                __field(int, priv2)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->tag = tag;
                __entry->ino = ino;
                __entry->priv1 = priv1;
                __entry->priv2 = priv2;
        ),

        TP_printk("dev %d,%d: tag %d, ino %d, data1 %d, data2 %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->tag, __entry->ino, __entry->priv1, __entry->priv2)
);

/*
 * Tracepoint taking a super_block and a transaction id.
 *
 * Records the superblock device and commit_tid (stored in the "tid" field).
 */
TRACE_EVENT(ext4_fc_commit_start,
        TP_PROTO(struct super_block *sb, tid_t commit_tid),

        TP_ARGS(sb, commit_tid),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(tid_t, tid)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->tid = commit_tid;
        ),

        TP_printk("dev %d,%d tid %u", MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->tid)
);

/*
 * Tracepoint taking a super_block, a block count, a reason code and a
 * transaction id.
 *
 * Records the superblock device, nblks, reason and commit_tid as passed in.
 * It additionally snapshots three counters read from
 * EXT4_SB(sb)->s_fc_stats at the time the event fires: fc_num_commits,
 * fc_ineligible_commits and fc_numblks.
 */
TRACE_EVENT(ext4_fc_commit_stop,
            TP_PROTO(struct super_block *sb, int nblks, int reason,
                     tid_t commit_tid),

        TP_ARGS(sb, nblks, reason, commit_tid),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, nblks)
                __field(int, reason)
                __field(int, num_fc)
                __field(int, num_fc_ineligible)
                __field(int, nblks_agg)
                __field(tid_t, tid)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->nblks = nblks;
                __entry->reason = reason;
                /* The remaining counters come from the superblock, not the caller. */
                __entry->num_fc = EXT4_SB(sb)->s_fc_stats.fc_num_commits;
                __entry->num_fc_ineligible =
                        EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits;
                __entry->nblks_agg = EXT4_SB(sb)->s_fc_stats.fc_numblks;
                __entry->tid = commit_tid;
        ),

        TP_printk("dev %d,%d nblks %d, reason %d, fc = %d, ineligible = %d, agg_nblks %d, tid %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nblks, __entry->reason, __entry->num_fc,
                  __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid)
);

/*
 * Expands to the two TP_printk arguments that fill one "%s:%u" pair: the
 * name produced by show_fc_reason() for @reason, followed by the recorded
 * counter for that reason.
 */
#define FC_REASON_NAME_STAT(reason)                                        \
        show_fc_reason(reason),                                                \
        __entry->fc_ineligible_rc[reason]

/*
 * Tracepoint taking only a super_block; everything recorded is read from
 * EXT4_SB(sb)->s_fc_stats.
 *
 * Records the superblock device, a copy of the first EXT4_FC_REASON_MAX
 * entries of fc_ineligible_reason_count[], and the fc_num_commits,
 * fc_ineligible_commits and fc_numblks counters.
 *
 * The print format lists ten reason/count pairs explicitly, so it must be
 * kept in step with the reasons named in the argument list below.
 */
TRACE_EVENT(ext4_fc_stats,
        TP_PROTO(struct super_block *sb),

        TP_ARGS(sb),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __array(unsigned int, fc_ineligible_rc, EXT4_FC_REASON_MAX)
                __field(unsigned long, fc_commits)
                __field(unsigned long, fc_ineligible_commits)
                __field(unsigned long, fc_numblks)
        ),

        TP_fast_assign(
                int i;

                __entry->dev = sb->s_dev;
                for (i = 0; i < EXT4_FC_REASON_MAX; i++) {
                        __entry->fc_ineligible_rc[i] =
                                EXT4_SB(sb)->s_fc_stats.fc_ineligible_reason_count[i];
                }
                __entry->fc_commits = EXT4_SB(sb)->s_fc_stats.fc_num_commits;
                __entry->fc_ineligible_commits =
                        EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits;
                __entry->fc_numblks = EXT4_SB(sb)->s_fc_stats.fc_numblks;
        ),

        /*
         * Adjacent string literals are concatenated, so the reason list needs
         * its own trailing ", " to keep the last count from running into
         * "num_commits".
         */
        TP_printk("dev %d,%d fc ineligible reasons:\n"
                  "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, "
                  "num_commits:%lu, ineligible: %lu, numblks: %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_ENCRYPTED_FILENAME),
                  __entry->fc_commits, __entry->fc_ineligible_commits,
                  __entry->fc_numblks)
);

/*
 * Event class taking a journal handle, an inode, a dentry and a result
 * code.
 *
 * Records the inode's superblock device, the tid of the handle's
 * transaction (handle->h_transaction->t_tid), the inode number, the
 * i_sync_tid from the inode's ext4_inode_info, and ret (stored as "error").
 * The dentry argument is part of the prototype but nothing is recorded
 * from it.
 */
DECLARE_EVENT_CLASS(ext4_fc_track_dentry,

        TP_PROTO(handle_t *handle, struct inode *inode,
                 struct dentry *dentry, int ret),

        TP_ARGS(handle, inode, dentry, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(tid_t, t_tid)
                __field(ino_t, i_ino)
                __field(tid_t, i_sync_tid)
                __field(int, error)
        ),

        TP_fast_assign(
                struct ext4_inode_info *ei = EXT4_I(inode);

                __entry->dev = inode->i_sb->s_dev;
                __entry->t_tid = handle->h_transaction->t_tid;
                __entry->i_ino = inode->i_ino;
                __entry->i_sync_tid = ei->i_sync_tid;
                __entry->error = ret;
        ),

        /*
         * i_ino is an ino_t; cast it to match %lu, as the other ext4 events
         * that print an inode number do.
         */
        TP_printk("dev %d,%d, t_tid %u, ino %lu, i_sync_tid %u, error %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->t_tid, (unsigned long) __entry->i_ino,
                  __entry->i_sync_tid, __entry->error
        )
);

/*
 * Declare one event of the ext4_fc_track_dentry class.  The event name is
 * formed by pasting __type onto "ext4_fc_track_", so the three uses below
 * define ext4_fc_track_create, ext4_fc_track_link and ext4_fc_track_unlink.
 */
#define DEFINE_EVENT_CLASS_DENTRY(__type)                                \
DEFINE_EVENT(ext4_fc_track_dentry, ext4_fc_track_##__type,                \
        TP_PROTO(handle_t *handle, struct inode *inode,                        \
                 struct dentry *dentry, int ret),                        \
        TP_ARGS(handle, inode, dentry, ret)                                \
)

DEFINE_EVENT_CLASS_DENTRY(create);
DEFINE_EVENT_CLASS_DENTRY(link);
DEFINE_EVENT_CLASS_DENTRY(unlink);

/*
 * Tracepoint taking a journal handle, an inode and a result code.
 *
 * Records the inode's superblock device, the tid of the handle's
 * transaction (handle->h_transaction->t_tid), the inode number, the
 * i_sync_tid from the inode's ext4_inode_info, and ret (stored as "error").
 */
TRACE_EVENT(ext4_fc_track_inode,
        TP_PROTO(handle_t *handle, struct inode *inode, int ret),

        TP_ARGS(handle, inode, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(tid_t, t_tid)
                __field(ino_t, i_ino)
                __field(tid_t, i_sync_tid)
                __field(int, error)
        ),

        TP_fast_assign(
                struct ext4_inode_info *ei = EXT4_I(inode);

                __entry->dev = inode->i_sb->s_dev;
                __entry->t_tid = handle->h_transaction->t_tid;
                __entry->i_ino = inode->i_ino;
                __entry->i_sync_tid = ei->i_sync_tid;
                __entry->error = ret;
        ),

        /*
         * i_ino is an ino_t; cast it to match %lu, as the other ext4 events
         * that print an inode number do.
         */
        TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->t_tid, (unsigned long) __entry->i_ino,
                  __entry->i_sync_tid, __entry->error)
        );

/*
 * Tracepoint taking a journal handle, an inode, a start/end pair and a
 * result code.
 *
 * Records the inode's superblock device, the tid of the handle's
 * transaction (handle->h_transaction->t_tid), the inode number, the
 * i_sync_tid from the inode's ext4_inode_info, start, end, and ret (stored
 * as "error").  Note that the print format emits error before start/end,
 * which differs from the field order in the record.
 */
TRACE_EVENT(ext4_fc_track_range,
        TP_PROTO(handle_t *handle, struct inode *inode,
                 long start, long end, int ret),

        TP_ARGS(handle, inode, start, end, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(tid_t, t_tid)
                __field(ino_t, i_ino)
                __field(tid_t, i_sync_tid)
                __field(long, start)
                __field(long, end)
                __field(int, error)
        ),

        TP_fast_assign(
                struct ext4_inode_info *ei = EXT4_I(inode);

                __entry->dev = inode->i_sb->s_dev;
                __entry->t_tid = handle->h_transaction->t_tid;
                __entry->i_ino = inode->i_ino;
                __entry->i_sync_tid = ei->i_sync_tid;
                __entry->start = start;
                __entry->end = end;
                __entry->error = ret;
        ),

        /*
         * i_ino is an ino_t; cast it to match %lu, as the other ext4 events
         * that print an inode number do.
         */
        TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d, start %ld, end %ld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->t_tid, (unsigned long) __entry->i_ino,
                  __entry->i_sync_tid,
                  __entry->error, __entry->start, __entry->end)
        );

/*
 * Tracepoint taking a journal, a "full" flag and a transaction id.
 *
 * The super_block is obtained from journal->j_private.  Records that
 * superblock's device, journal->j_fc_off, full and tid.
 */
TRACE_EVENT(ext4_fc_cleanup,
        TP_PROTO(journal_t *journal, int full, tid_t tid),

        TP_ARGS(journal, full, tid),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, j_fc_off)
                __field(int, full)
                __field(tid_t, tid)
        ),

        TP_fast_assign(
                /* j_private is used here as the owning super_block pointer. */
                struct super_block *sb = journal->j_private;

                __entry->dev = sb->s_dev;
                __entry->j_fc_off = journal->j_fc_off;
                __entry->full = full;
                __entry->tid = tid;
        ),

        TP_printk("dev %d,%d, j_fc_off %d, full %d, tid %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->j_fc_off, __entry->full, __entry->tid)
        );

/*
 * Tracepoint taking a super_block, a filesystem block number and a flags
 * value.
 *
 * Records the superblock device, fsblk and flags exactly as passed in.
 */
TRACE_EVENT(ext4_update_sb,
        TP_PROTO(struct super_block *sb, ext4_fsblk_t fsblk,
                 unsigned int flags),

        TP_ARGS(sb, fsblk, flags),

        TP_STRUCT__entry(
                __field(dev_t,                dev)
                __field(ext4_fsblk_t,        fsblk)
                __field(unsigned int,        flags)
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->fsblk        = fsblk;
                __entry->flags        = flags;
        ),

        TP_printk("dev %d,%d fsblk %llu flags %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->fsblk, __entry->flags)
);

#endif /* _TRACE_EXT4_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

























































   37 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM x86_fpu

#if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FPU_H

#include <linux/tracepoint.h>

/*
 * Event class shared by all x86_fpu tracepoints; takes a struct fpu pointer.
 *
 * Records the fpu pointer itself, whether the current thread has
 * TIF_NEED_FPU_LOAD set, and - when the CPU reports X86_FEATURE_OSXSAVE -
 * the xfeatures and xcomp_bv words from the XSAVE header of fpu->fpstate.
 * Without OSXSAVE both words are recorded as 0; they are always printed, so
 * they must always be assigned.
 */
DECLARE_EVENT_CLASS(x86_fpu,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu),

        TP_STRUCT__entry(
                __field(struct fpu *, fpu)
                __field(bool, load_fpu)
                __field(u64, xfeatures)
                __field(u64, xcomp_bv)
                ),

        TP_fast_assign(
                __entry->fpu                = fpu;
                __entry->load_fpu        = test_thread_flag(TIF_NEED_FPU_LOAD);
                if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
                        __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures;
                        __entry->xcomp_bv  = fpu->fpstate->regs.xsave.header.xcomp_bv;
                } else {
                        /* No XSAVE header to read: record zeros, not leftovers. */
                        __entry->xfeatures = 0;
                        __entry->xcomp_bv  = 0;
                }
        ),
        TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx",
                        __entry->fpu,
                        __entry->load_fpu,
                        __entry->xfeatures,
                        __entry->xcomp_bv
        )
);

/*
 * Named instances of the x86_fpu event class.  Every event below takes the
 * same single struct fpu pointer and shares the class's record layout and
 * print format; they differ only in event name.
 */
DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_after_save,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_before_restore,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_after_restore,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_dropped,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_copy_src,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed,
        TP_PROTO(struct fpu *fpu),
        TP_ARGS(fpu)
);

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH asm/trace/
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE fpu
#endif /* _TRACE_FPU_H */

/* This part must be outside protection */
#include <trace/define_trace.h>


























































    1 













    1 




    1 






    1 






    1 

    1 




















    1 

    1 









    1 



    1 










































































































    1 




    1 
    1 






    1 



















    1 






    1 


    1 
    1 










    1 


    1 
    1 



    1 


    1 













    1 




    1 








    1 











    1 
    1 









    1 

    1 


    1 
    1 







    1 

    1 








    1 
    1 





















    1 













    1 

    1 

    1 








    1 



    1 
    1 




    1 



























































    1 

    1 










































    1 














    1 




















    1 




























    1 








    1 












    1 

















    1 









    1 
    1 














    1 


    1 










    1 







    1 


























    1 





















    1 






    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016-2021 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/pagemap.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
#include "trace.h"

#include "../internal.h"

/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
/*
 * CALLER_COMP: if still set when the last bio completes,
 * iomap_dio_bio_end_io() hands completion to the issuer through
 * iocb->dio_complete.  iomap_dio_bio_iter() clears it for I/O that needs
 * extra work at completion time.
 */
#define IOMAP_DIO_CALLER_COMP        (1U << 26)
/*
 * INLINE_COMP: completion may run directly from iomap_dio_bio_end_io().
 * Set for reads.
 */
#define IOMAP_DIO_INLINE_COMP        (1U << 27)
/*
 * WRITE_THROUGH: the request is a candidate for FUA writes.  Cleared by
 * iomap_dio_bio_opflags() as soon as a bio is built without REQ_FUA.
 */
#define IOMAP_DIO_WRITE_THROUGH        (1U << 28)
/*
 * NEED_SYNC: iomap_dio_complete() calls generic_write_sync() after a
 * successful transfer.
 */
#define IOMAP_DIO_NEED_SYNC        (1U << 29)
/* WRITE: the request is a write (set when the iov_iter is not a READ). */
#define IOMAP_DIO_WRITE                (1U << 30)
/*
 * DIRTY: set for reads into user-backed iterators; the bio's pages are
 * marked dirty at submission and go through bio_check_pages_dirty() at
 * completion.
 */
#define IOMAP_DIO_DIRTY                (1U << 31)

/*
 * Per-request state for one iomap direct I/O.  Allocated in __iomap_dio_rw()
 * and freed by iomap_dio_complete().
 */
struct iomap_dio {
        /* the kiocb this request was issued for */
        struct kiocb                *iocb;
        /* optional filesystem hooks (bio_set, submit_io, end_io); may be NULL */
        const struct iomap_dio_ops *dops;
        /* inode size sampled at submission; used to clamp short reads */
        loff_t                        i_size;
        /* bytes accounted to this request so far by the per-mapping helpers */
        loff_t                        size;
        /* starts at 1; +1 per submitted bio, dropped in the bio end_io */
        atomic_t                ref;
        /* IOMAP_DIO_* flags, both the private ones above and public ones */
        unsigned                flags;
        /* first error recorded for the request (see iomap_dio_set_error()) */
        int                        error;
        /* bytes already transferred by an earlier partial attempt */
        size_t                        done_before;
        /* if true, the last bio completion only wakes submit.waiter */
        bool                        wait_for_completion;

        union {
                /* used during submission and for synchronous completion: */
                struct {
                        /* caller's iov_iter, consumed as bios are built */
                        struct iov_iter                *iter;
                        /* submitting task, woken on synchronous completion */
                        struct task_struct        *waiter;
                } submit;

                /* used for aio completion: */
                struct {
                        /* runs iomap_dio_complete_work() */
                        struct work_struct        work;
                } aio;
        };
};

/*
 * Allocate a bio for the mapping's block device.  A filesystem-supplied
 * bio_set in dio->dops takes precedence over the default bio allocator.
 */
static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
                struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf)
{
        const struct iomap_dio_ops *dops = dio->dops;

        /* No private bio_set: use the generic allocator. */
        if (!dops || !dops->bio_set)
                return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);

        return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL,
                                dops->bio_set);
}

/*
 * Take a dio reference for @bio and send it down, either through the
 * filesystem's ->submit_io hook or straight to submit_bio().
 */
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
                struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
        const struct iomap_dio_ops *dops = dio->dops;
        struct kiocb *iocb = dio->iocb;

        /* Dropped again by the bio's end_io handler. */
        atomic_inc(&dio->ref);

        /* Sync dio can't be polled reliably */
        if (iocb->ki_flags & IOCB_HIPRI) {
                if (!is_sync_kiocb(iocb)) {
                        bio_set_polled(bio, iocb);
                        WRITE_ONCE(iocb->private, bio);
                }
        }

        if (!dops || !dops->submit_io) {
                submit_bio(bio);
                return;
        }

        dops->submit_io(iter, bio, pos);
}

ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
        const struct iomap_dio_ops *dops = dio->dops;
        struct kiocb *iocb = dio->iocb;
        loff_t offset = iocb->ki_pos;
        ssize_t ret = dio->error;

        if (dops && dops->end_io)
                ret = dops->end_io(iocb, dio->size, ret, dio->flags);

        if (likely(!ret)) {
                ret = dio->size;
                /* check for short read */
                if (offset + ret > dio->i_size &&
                    !(dio->flags & IOMAP_DIO_WRITE))
                        ret = dio->i_size - offset;
        }

        /*
         * Try again to invalidate clean pages which might have been cached by
         * non-direct readahead, or faulted in by get_user_pages() if the source
         * of the write was an mmap'ed region of the file we're writing.  Either
         * one is a pretty crazy thing to do, so we don't support it 100%.  If
         * this invalidation fails, tough, the write still worked...
         *
         * And this page cache invalidation has to be after ->end_io(), as some
         * filesystems convert unwritten extents to real allocations in
         * ->end_io() when necessary, otherwise a racing buffer read would cache
         * zeros from unwritten extents.
         */
        if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
                kiocb_invalidate_post_direct_write(iocb, dio->size);

        inode_dio_end(file_inode(iocb->ki_filp));

        if (ret > 0) {
                iocb->ki_pos += ret;

                /*
                 * If this is a DSYNC write, make sure we push it to stable
                 * storage now that we've written data.
                 */
                if (dio->flags & IOMAP_DIO_NEED_SYNC)
                        ret = generic_write_sync(iocb, ret);
                if (ret > 0)
                        ret += dio->done_before;
        }
        trace_iomap_dio_complete(iocb, dio->error, ret);
        kfree(dio);
        return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);

/*
 * iocb->dio_complete callback: @data is the struct iomap_dio that
 * iomap_dio_bio_end_io() stored in iocb->private.
 */
static ssize_t iomap_dio_deferred_complete(void *data)
{
        struct iomap_dio *dio = data;

        return iomap_dio_complete(dio);
}

static void iomap_dio_complete_work(struct work_struct *work)
{
        struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
        struct kiocb *iocb = dio->iocb;

        iocb->ki_complete(iocb, iomap_dio_complete(dio));
}

/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
        cmpxchg(&dio->error, 0, ret);
}

/*
 * bi_end_io handler for every bio submitted on behalf of a dio
 * (bio->bi_private is the struct iomap_dio).
 *
 * Records the bio's error, if any, and drops the reference taken in
 * iomap_dio_submit_bio().  Only the completion that drops the last reference
 * goes on to finish the request, by exactly one of: waking the synchronous
 * waiter, completing inline, deferring to the issuer via ->dio_complete, or
 * queueing work on the superblock's s_dio_done_wq.  In all cases the bio's
 * pages and the bio itself are released at the end.
 */
void iomap_dio_bio_end_io(struct bio *bio)
{
        struct iomap_dio *dio = bio->bi_private;
        /*
         * Sampled up front: on the inline path below iomap_dio_complete()
         * has already freed the dio by the time release_bio is reached.
         */
        bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
        struct kiocb *iocb = dio->iocb;

        if (bio->bi_status)
                iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
        /* not the last reference: other bios or the submitter still hold one */
        if (!atomic_dec_and_test(&dio->ref))
                goto release_bio;

        /*
         * Synchronous dio, task itself will handle any completion work
         * that needs after IO. All we need to do is wake the task.
         */
        if (dio->wait_for_completion) {
                struct task_struct *waiter = dio->submit.waiter;

                WRITE_ONCE(dio->submit.waiter, NULL);
                blk_wake_io_task(waiter);
                goto release_bio;
        }

        /*
         * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
         */
        if (dio->flags & IOMAP_DIO_INLINE_COMP) {
                WRITE_ONCE(iocb->private, NULL);
                iomap_dio_complete_work(&dio->aio.work);
                goto release_bio;
        }

        /*
         * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
         * our completion that way to avoid an async punt to a workqueue.
         */
        if (dio->flags & IOMAP_DIO_CALLER_COMP) {
                /* only polled IO cares about private cleared */
                iocb->private = dio;
                iocb->dio_complete = iomap_dio_deferred_complete;

                /*
                 * Invoke ->ki_complete() directly. We've assigned our
                 * dio_complete callback handler, and since the issuer set
                 * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
                 * notice ->dio_complete being set and will defer calling that
                 * handler until it can be done from a safe task context.
                 *
                 * Note that the 'res' being passed in here is not important
                 * for this case. The actual completion value of the request
                 * will be gotten from dio_complete when that is run by the
                 * issuer.
                 */
                iocb->ki_complete(iocb, 0);
                goto release_bio;
        }

        /*
         * Async DIO completion that requires filesystem level completion work
         * gets punted to a work queue to complete as the operation may require
         * more IO to be issued to finalise filesystem metadata changes or
         * guarantee data integrity.
         */
        INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
        queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
                        &dio->aio.work);
release_bio:
        /*
         * NOTE(review): bio_check_pages_dirty() presumably takes over
         * releasing the pages and dropping the bio on the dirty path, since
         * only the else branch does so explicitly - confirm against the
         * block layer.
         */
        if (should_dirty) {
                bio_check_pages_dirty(bio);
        } else {
                bio_release_pages(bio, false);
                bio_put(bio);
        }
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);

/*
 * Submit a single-segment write of @len bytes taken from the zero page,
 * targeting file position @pos within the current mapping.  The bio
 * completes through iomap_dio_bio_end_io() like any other dio bio.
 */
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
                loff_t pos, unsigned len)
{
        struct inode *inode = file_inode(dio->iocb->ki_filp);
        struct bio *zero_bio;

        zero_bio = iomap_dio_alloc_bio(iter, dio, 1,
                                       REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
        fscrypt_set_bio_crypt_ctx(zero_bio, inode, pos >> inode->i_blkbits,
                                  GFP_KERNEL);

        zero_bio->bi_end_io = iomap_dio_bio_end_io;
        zero_bio->bi_private = dio;
        zero_bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);

        __bio_add_page(zero_bio, ZERO_PAGE(0), len, 0);
        iomap_dio_submit_bio(iter, dio, zero_bio, pos);
}

/*
 * Work out the operation flags for a data bio of this dio.
 *
 * Reads get a bare REQ_OP_READ.  Writes are REQ_OP_WRITE | REQ_SYNC |
 * REQ_IDLE, plus REQ_FUA when @use_fua is set.  As a side effect, a write
 * issued without FUA clears IOMAP_DIO_WRITE_THROUGH in the dio.
 */
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
                const struct iomap *iomap, bool use_fua)
{
        if (!(dio->flags & IOMAP_DIO_WRITE))
                return REQ_OP_READ;

        if (!use_fua) {
                /* one non-FUA bio is enough to lose write-through */
                dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
                return REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
        }

        return REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_FUA;
}

/*
 * Issue block I/O for the part of the request covered by the current
 * mapping.
 *
 * The dio's iov_iter is temporarily truncated to the mapping length, one or
 * more bios are built from it and submitted, and sub-block head/tail ranges
 * are zeroed with extra bios where required.  Along the way the dio flags
 * and the kiocb's IOCB_HIPRI flag are adjusted to reflect what kind of
 * completion is still permissible.
 *
 * Returns the number of bytes submitted from the iter for this mapping if
 * that is non-zero; otherwise 0 or a negative errno (-EINVAL when position,
 * length or the iter are not suitably aligned for the block device).
 */
static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
                struct iomap_dio *dio)
{
        const struct iomap *iomap = &iter->iomap;
        struct inode *inode = iter->inode;
        unsigned int fs_block_size = i_blocksize(inode), pad;
        loff_t length = iomap_length(iter);
        loff_t pos = iter->pos;
        blk_opf_t bio_opf;
        struct bio *bio;
        bool need_zeroout = false;
        bool use_fua = false;
        int nr_pages, ret = 0;
        size_t copied = 0;
        size_t orig_count;

        /* position and length must be logical-block aligned, as must the iter */
        if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
            !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
                return -EINVAL;

        /*
         * IOMAP_DIO_UNWRITTEN and IOMAP_DIO_COW are not among the private
         * flags defined in this file.
         * NOTE(review): presumably the public flags from iomap.h that get
         * passed on to ->end_io via dio->flags - confirm.
         */
        if (iomap->type == IOMAP_UNWRITTEN) {
                dio->flags |= IOMAP_DIO_UNWRITTEN;
                need_zeroout = true;
        }

        if (iomap->flags & IOMAP_F_SHARED)
                dio->flags |= IOMAP_DIO_COW;

        if (iomap->flags & IOMAP_F_NEW) {
                need_zeroout = true;
        } else if (iomap->type == IOMAP_MAPPED) {
                /*
                 * Use a FUA write if we need datasync semantics, this is a pure
                 * data IO that doesn't require any metadata updates (including
                 * after IO completion such as unwritten extent conversion) and
                 * the underlying device either supports FUA or doesn't have
                 * a volatile write cache. This allows us to avoid cache flushes
                 * on IO completion. If we can't use writethrough and need to
                 * sync, disable in-task completions as dio completion will
                 * need to call generic_write_sync() which will do a blocking
                 * fsync / cache flush call.
                 */
                if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
                    (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
                    (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
                        use_fua = true;
                else if (dio->flags & IOMAP_DIO_NEED_SYNC)
                        dio->flags &= ~IOMAP_DIO_CALLER_COMP;
        }

        /*
         * Save the original count and trim the iter to just the extent we
         * are operating on right now.  The iter will be re-expanded once
         * we are done.
         */
        orig_count = iov_iter_count(dio->submit.iter);
        iov_iter_truncate(dio->submit.iter, length);

        /* nothing left in the iter for this extent: return ret (still 0) */
        if (!iov_iter_count(dio->submit.iter))
                goto out;

        /*
         * We can only do deferred completion for pure overwrites that
         * don't require additional IO at completion. This rules out
         * writes that need zeroing or extent conversion, extend
         * the file size, or issue journal IO or cache flushes
         * during completion processing.
         */
        if (need_zeroout ||
            ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
            ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
                dio->flags &= ~IOMAP_DIO_CALLER_COMP;

        /*
         * The rules for polled IO completions follow the guidelines as the
         * ones we set for inline and deferred completions. If none of those
         * are available for this IO, clear the polled flag.
         */
        if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
                dio->iocb->ki_flags &= ~IOCB_HIPRI;

        if (need_zeroout) {
                /* zero out from the start of the block to the write offset */
                pad = pos & (fs_block_size - 1);
                if (pad)
                        iomap_dio_zero(iter, dio, pos - pad, pad);
        }

        /*
         * Set the operation flags early so that bio_iov_iter_get_pages
         * can set up the page vector appropriately for a ZONE_APPEND
         * operation.
         */
        bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);

        nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
        do {
                size_t n;
                /*
                 * An error has been recorded for the dio (e.g. by a bio that
                 * already completed): rewind the iter by what this call
                 * consumed, report no progress and stop submitting.
                 */
                if (dio->error) {
                        iov_iter_revert(dio->submit.iter, copied);
                        copied = ret = 0;
                        goto out;
                }

                bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
                fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
                                          GFP_KERNEL);
                bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
                bio->bi_write_hint = inode->i_write_hint;
                bio->bi_ioprio = dio->iocb->ki_ioprio;
                bio->bi_private = dio;
                bio->bi_end_io = iomap_dio_bio_end_io;

                ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
                if (unlikely(ret)) {
                        /*
                         * We have to stop part way through an IO. We must fall
                         * through to the sub-block tail zeroing here, otherwise
                         * this short IO may expose stale data in the tail of
                         * the block we haven't written data to.
                         */
                        bio_put(bio);
                        goto zero_tail;
                }

                /* bytes actually attached to this bio */
                n = bio->bi_iter.bi_size;
                if (dio->flags & IOMAP_DIO_WRITE) {
                        task_io_account_write(n);
                } else {
                        if (dio->flags & IOMAP_DIO_DIRTY)
                                bio_set_pages_dirty(bio);
                }

                dio->size += n;
                copied += n;

                /* non-zero means more bios are needed for this extent */
                nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
                                                 BIO_MAX_VECS);
                /*
                 * We can only poll for single bio I/Os.
                 */
                if (nr_pages)
                        dio->iocb->ki_flags &= ~IOCB_HIPRI;
                iomap_dio_submit_bio(iter, dio, bio, pos);
                pos += n;
        } while (nr_pages);

        /*
         * We need to zeroout the tail of a sub-block write if the extent type
         * requires zeroing or the write extends beyond EOF. If we don't zero
         * the block tail in the latter case, we can expose stale data via mmap
         * reads of the EOF block.
         */
zero_tail:
        if (need_zeroout ||
            ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
                /* zero out from the end of the write to the end of the block */
                pad = pos & (fs_block_size - 1);
                if (pad)
                        iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
        }
out:
        /* Undo iter limitation to current extent */
        iov_iter_reexpand(dio->submit.iter, orig_count - copied);
        /* progress wins over a late error: report bytes if any were issued */
        if (copied)
                return copied;
        return ret;
}

/*
 * Satisfy a read from a range with no data to fetch by zero-filling the
 * destination iter for the length of the current mapping.
 *
 * Returns the number of bytes zeroed, or -EFAULT if nothing could be zeroed.
 */
static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
                struct iomap_dio *dio)
{
        loff_t zeroed;

        zeroed = iov_iter_zero(iomap_length(iter), dio->submit.iter);
        dio->size += zeroed;

        return zeroed ? zeroed : -EFAULT;
}

static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
                struct iomap_dio *dio)
{
        const struct iomap *iomap = &iomi->iomap;
        struct iov_iter *iter = dio->submit.iter;
        void *inline_data = iomap_inline_data(iomap, iomi->pos);
        loff_t length = iomap_length(iomi);
        loff_t pos = iomi->pos;
        size_t copied;

        if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
                return -EIO;

        if (dio->flags & IOMAP_DIO_WRITE) {
                loff_t size = iomi->inode->i_size;

                if (pos > size)
                        memset(iomap_inline_data(iomap, size), 0, pos - size);
                copied = copy_from_iter(inline_data, length, iter);
                if (copied) {
                        if (pos + copied > size)
                                i_size_write(iomi->inode, pos + copied);
                        mark_inode_dirty(iomi->inode);
                }
        } else {
                copied = copy_to_iter(inline_data, length, iter);
        }
        dio->size += copied;
        if (!copied)
                return -EFAULT;
        return copied;
}

/*
 * Dispatch the current mapping to the helper that matches its type.
 *
 * Mapped extents, and unwritten extents being written, go through bios.
 * Holes, and unwritten extents being read, are zero-filled.  Inline data is
 * copied directly.  A write into a hole, a delalloc mapping or an unknown
 * type is rejected with -EIO.
 */
static loff_t iomap_dio_iter(const struct iomap_iter *iter,
                struct iomap_dio *dio)
{
        switch (iter->iomap.type) {
        case IOMAP_MAPPED:
                return iomap_dio_bio_iter(iter, dio);
        case IOMAP_UNWRITTEN:
                if (dio->flags & IOMAP_DIO_WRITE)
                        return iomap_dio_bio_iter(iter, dio);
                return iomap_dio_hole_iter(iter, dio);
        case IOMAP_HOLE:
                if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
                        return -EIO;
                return iomap_dio_hole_iter(iter, dio);
        case IOMAP_INLINE:
                return iomap_dio_inline_iter(iter, dio);
        case IOMAP_DELALLOC:
                /*
                 * Nothing serialises DIO against mmap() access.  If a
                 * page_mkwrite slips in between the writeback and the
                 * iomap_iter() call of the DIO path, the DELALLOC block it
                 * allocated is what shows up here.
                 */
                pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
                                    dio->iocb->ki_filp, current->comm);
                return -EIO;
        default:
                WARN_ON_ONCE(1);
                return -EIO;
        }
}

/*
 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
 * is being issued as AIO or not.  This allows us to optimise pure data writes
 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
 * REQ_FLUSH post write. This is slightly tricky because a single request here
 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
 * may be pure data writes. In that case, we still need to do a full data sync
 * completion.
 *
 * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
 * __iomap_dio_rw can return a partial result if it encounters a non-resident
 * page in @iter after preparing a transfer.  In that case, the non-resident
 * pages can be faulted in and the request resumed with @done_before set to the
 * number of bytes previously transferred.  The request will then complete with
 * the correct total number of bytes transferred; this is essential for
 * completing partial requests asynchronously.
 *
 * Returns -ENOTBLK in case of a page invalidation failure for writes.  The
 * caller needs to fall back to buffered I/O in this case.
 */
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
                unsigned int dio_flags, void *private, size_t done_before)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        struct iomap_iter iomi = {
                .inode                = inode,
                .pos                = iocb->ki_pos,
                .len                = iov_iter_count(iter),
                .flags                = IOMAP_DIRECT,
                .private        = private,
        };
        /*
         * Wait for the I/O inside this function for sync kiocbs, or when the
         * caller passed IOMAP_DIO_FORCE_WAIT.
         */
        bool wait_for_completion =
                is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
        struct blk_plug plug;
        struct iomap_dio *dio;
        loff_t ret = 0;

        trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);

        /* Zero-length request: nothing is allocated or submitted. */
        if (!iomi.len)
                return NULL;

        dio = kmalloc(sizeof(*dio), GFP_KERNEL);
        if (!dio)
                return ERR_PTR(-ENOMEM);

        /*
         * The dio starts with a single reference, which this function drops
         * with atomic_dec_and_test() once submission is finished.
         */
        dio->iocb = iocb;
        atomic_set(&dio->ref, 1);
        dio->size = 0;
        dio->i_size = i_size_read(inode);
        dio->dops = dops;
        dio->error = 0;
        dio->flags = 0;
        dio->done_before = done_before;

        dio->submit.iter = iter;
        dio->submit.waiter = current;

        if (iocb->ki_flags & IOCB_NOWAIT)
                iomi.flags |= IOMAP_NOWAIT;

        if (iov_iter_rw(iter) == READ) {
                /* reads can always complete inline */
                dio->flags |= IOMAP_DIO_INLINE_COMP;

                /*
                 * A read starting at or beyond i_size transfers nothing: ret
                 * is still 0 here, so out_free_dio returns NULL.
                 */
                if (iomi.pos >= dio->i_size)
                        goto out_free_dio;

                if (user_backed_iter(iter))
                        dio->flags |= IOMAP_DIO_DIRTY;

                ret = kiocb_write_and_wait(iocb, iomi.len);
                if (ret)
                        goto out_free_dio;
        } else {
                iomi.flags |= IOMAP_WRITE;
                dio->flags |= IOMAP_DIO_WRITE;

                /*
                 * Flag as supporting deferred completions, if the issuer
                 * groks it. This can avoid a workqueue punt for writes.
                 * We may later clear this flag if we need to do other IO
                 * as part of this IO completion.
                 */
                if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
                        dio->flags |= IOMAP_DIO_CALLER_COMP;

                if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
                        /*
                         * Preload -EAGAIN: it is what gets returned when the
                         * range starts at or beyond i_size, or extends past
                         * it.
                         */
                        ret = -EAGAIN;
                        if (iomi.pos >= dio->i_size ||
                            iomi.pos + iomi.len > dio->i_size)
                                goto out_free_dio;
                        iomi.flags |= IOMAP_OVERWRITE_ONLY;
                }

                /* for data sync or sync, we need sync completion processing */
                if (iocb_is_dsync(iocb)) {
                        dio->flags |= IOMAP_DIO_NEED_SYNC;

                       /*
                        * For datasync only writes, we optimistically try using
                        * WRITE_THROUGH for this IO. This flag requires either
                        * FUA writes through the device's write cache, or a
                        * normal write to a device without a volatile write
                        * cache. For the former, Any non-FUA write that occurs
                        * will clear this flag, hence we know before completion
                        * whether a cache flush is necessary.
                        */
                        if (!(iocb->ki_flags & IOCB_SYNC))
                                dio->flags |= IOMAP_DIO_WRITE_THROUGH;
                }

                /*
                 * Try to invalidate cache pages for the range we are writing.
                 * If this invalidation fails, let the caller fall back to
                 * buffered I/O.
                 */
                ret = kiocb_invalidate_pages(iocb, iomi.len);
                if (ret) {
                        /*
                         * -EAGAIN is passed through unchanged; every other
                         * failure is traced and reported as -ENOTBLK.
                         */
                        if (ret != -EAGAIN) {
                                trace_iomap_dio_invalidate_fail(inode, iomi.pos,
                                                                iomi.len);
                                ret = -ENOTBLK;
                        }
                        goto out_free_dio;
                }

                /*
                 * For writes that are not waited for here, set up the
                 * superblock's s_dio_done_wq if it does not exist yet.
                 */
                if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
                        ret = sb_init_dio_done_wq(inode->i_sb);
                        if (ret < 0)
                                goto out_free_dio;
                }
        }

        inode_dio_begin(inode);

        blk_start_plug(&plug);
        while ((ret = iomap_iter(&iomi, ops)) > 0) {
                iomi.processed = iomap_dio_iter(&iomi, dio);

                /*
                 * We can only poll for single bio I/Os.
                 */
                iocb->ki_flags &= ~IOCB_HIPRI;
        }

        blk_finish_plug(&plug);

        /*
         * We only report that we've read data up to i_size.
         * Revert iter to a state corresponding to that as some callers (such
         * as the splice code) rely on it.
         */
        if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
                iov_iter_revert(iter, iomi.pos - dio->i_size);

        /*
         * Partial results are allowed and some bytes (dio->size) were already
         * set up before -EFAULT was hit: treat the submission as successful
         * and, unless IOCB_NOWAIT is set, wait for completion here.
         */
        if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
                if (!(iocb->ki_flags & IOCB_NOWAIT))
                        wait_for_completion = true;
                ret = 0;
        }

        /* magic error code to fall back to buffered I/O */
        if (ret == -ENOTBLK) {
                wait_for_completion = true;
                ret = 0;
        }
        if (ret < 0)
                iomap_dio_set_error(dio, ret);

        /*
         * If all the writes we issued were already written through to the
         * media, we don't need to flush the cache on IO completion. Clear the
         * sync flag for this case.
         */
        if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
                dio->flags &= ~IOMAP_DIO_NEED_SYNC;

        /*
         * We are about to drop our additional submission reference, which
         * might be the last reference to the dio.  There are three different
         * ways we can progress here:
         *
         *  (a) If this is the last reference we will always complete and free
         *        the dio ourselves.
         *  (b) If this is not the last reference, and we serve an asynchronous
         *        iocb, we must never touch the dio after the decrement, the
         *        I/O completion handler will complete and free it.
         *  (c) If this is not the last reference, but we serve a synchronous
         *        iocb, the I/O completion handler will wake us up on the drop
         *        of the final reference, and we will complete and free it here
         *        after we got woken by the I/O completion handler.
         */
        dio->wait_for_completion = wait_for_completion;
        if (!atomic_dec_and_test(&dio->ref)) {
                if (!wait_for_completion) {
                        trace_iomap_dio_rw_queued(inode, iomi.pos, iomi.len);
                        return ERR_PTR(-EIOCBQUEUED);
                }

                /*
                 * Case (c): sleep uninterruptibly until dio->submit.waiter
                 * reads as NULL.
                 * NOTE(review): the store that clears submit.waiter is not in
                 * this function; presumably done by the completion handler.
                 */
                for (;;) {
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        if (!READ_ONCE(dio->submit.waiter))
                                break;

                        blk_io_schedule();
                }
                __set_current_state(TASK_RUNNING);
        }

        return dio;

        /*
         * Early exit taken before inode_dio_begin(): free the dio and return
         * the error as an ERR_PTR, or NULL when ret is 0.
         */
out_free_dio:
        kfree(dio);
        if (ret)
                return ERR_PTR(ret);
        return NULL;
}
EXPORT_SYMBOL_GPL(__iomap_dio_rw);

/*
 * Convenience wrapper around __iomap_dio_rw(): submit the request and, when a
 * dio is handed back, finish it with iomap_dio_complete().  A NULL result is
 * reported as 0 and an error pointer as its negative errno.
 */
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
                unsigned int dio_flags, void *private, size_t done_before)
{
        struct iomap_dio *submitted = __iomap_dio_rw(iocb, iter, ops, dops,
                                                     dio_flags, private,
                                                     done_before);

        if (!IS_ERR_OR_NULL(submitted))
                return iomap_dio_complete(submitted);

        return PTR_ERR_OR_ZERO(submitted);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);





































































    4 








    4 








    4 






















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SIGNAL_H
#define _LINUX_SIGNAL_H

#include <linux/bug.h>
#include <linux/list.h>
#include <linux/signal_types.h>
#include <linux/string.h>

struct task_struct;

/* for sysctl */
extern int print_fatal_signals;

/* Copy the complete kernel_siginfo_t at @from over the one at @to. */
static inline void copy_siginfo(kernel_siginfo_t *to,
                                const kernel_siginfo_t *from)
{
        memcpy(to, from, sizeof(*to));
}

/* Zero every byte of the kernel_siginfo_t at @info. */
static inline void clear_siginfo(kernel_siginfo_t *info)
{
        memset(info, 0, sizeof(*info));
}

/* Number of bytes by which struct siginfo is larger than struct kernel_siginfo. */
#define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo))

/*
 * Fill the external siginfo_t at @to from the kernel-internal @from: the
 * kernel_siginfo bytes are copied to the start of @to and the remaining
 * SI_EXPANSION_SIZE bytes behind them are zeroed.
 */
static inline void copy_siginfo_to_external(siginfo_t *to,
                                            const kernel_siginfo_t *from)
{
        char *expansion = (char *)to + sizeof(struct kernel_siginfo);

        memcpy(to, from, sizeof(*from));
        memset(expansion, 0, SI_EXPANSION_SIZE);
}

int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from);
int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from);

/*
 * Layout identifiers for a siginfo; this enum is the return type of
 * siginfo_layout(sig, si_code).
 * NOTE(review): each SIL_* value presumably names which siginfo union member
 * is in use for the given signal/si_code pair - confirm against the
 * siginfo_layout() implementation.
 */
enum siginfo_layout {
        SIL_KILL,
        SIL_TIMER,
        SIL_POLL,
        SIL_FAULT,
        SIL_FAULT_TRAPNO,
        SIL_FAULT_MCEERR,
        SIL_FAULT_BNDERR,
        SIL_FAULT_PKUERR,
        SIL_FAULT_PERF_EVENT,
        SIL_CHLD,
        SIL_RT,
        SIL_SYS,
};

enum siginfo_layout siginfo_layout(unsigned sig, int si_code);

/*
 * Define some primitives to manipulate sigset_t.
 */

#ifndef __HAVE_ARCH_SIG_BITOPS
#include <linux/bitops.h>

/* We don't use <linux/bitops.h> for these because there is no need to
   be atomic.  */
/*
 * Set the bit for signal number @_sig in @set.  Signal numbers are 1-based,
 * so signal N lives at bit N - 1.  Plain (non-atomic) read-modify-write.
 */
static inline void sigaddset(sigset_t *set, int _sig)
{
        unsigned long bit = _sig - 1;

        if (_NSIG_WORDS != 1) {
                set->sig[bit / _NSIG_BPW] |= 1UL << (bit % _NSIG_BPW);
                return;
        }
        set->sig[0] |= 1UL << bit;
}

/*
 * Clear the bit for signal number @_sig in @set.  Signal numbers are 1-based,
 * so signal N lives at bit N - 1.  Plain (non-atomic) read-modify-write.
 */
static inline void sigdelset(sigset_t *set, int _sig)
{
        unsigned long bit = _sig - 1;

        if (_NSIG_WORDS != 1) {
                set->sig[bit / _NSIG_BPW] &= ~(1UL << (bit % _NSIG_BPW));
                return;
        }
        set->sig[0] &= ~(1UL << bit);
}

/*
 * Return 1 if the bit for signal number @_sig is set in @set, 0 otherwise.
 * Signal numbers are 1-based, so signal N lives at bit N - 1.
 */
static inline int sigismember(sigset_t *set, int _sig)
{
        unsigned long bit = _sig - 1;

        if (_NSIG_WORDS == 1)
                return (set->sig[0] >> bit) & 1;

        return (set->sig[bit / _NSIG_BPW] >> (bit % _NSIG_BPW)) & 1;
}

#endif /* __HAVE_ARCH_SIG_BITOPS */

/*
 * Return 1 when no bit is set in any word of @set, 0 otherwise.  Only 1-, 2-
 * and 4-word sets are supported; any other _NSIG_WORDS trips BUILD_BUG().
 */
static inline int sigisemptyset(sigset_t *set)
{
        switch (_NSIG_WORDS) {
        case 1:
                return !set->sig[0];
        case 2:
                return !(set->sig[0] | set->sig[1]);
        case 4:
                return !(set->sig[0] | set->sig[1] |
                         set->sig[2] | set->sig[3]);
        default:
                BUILD_BUG();
                return 0;
        }
}

/*
 * Return 1 when @set1 and @set2 hold identical words, 0 otherwise.  Word
 * counts other than 1, 2 or 4 are not handled and always yield 0.
 */
static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2)
{
        unsigned int word;

        if (_NSIG_WORDS != 1 && _NSIG_WORDS != 2 && _NSIG_WORDS != 4)
                return 0;

        for (word = 0; word < _NSIG_WORDS; word++) {
                if (set1->sig[word] != set2->sig[word])
                        return 0;
        }
        return 1;
}

/*
 * Single-bit unsigned long mask for signal number @sig: signal N maps to
 * bit N - 1.
 */
#define sigmask(sig)        (1UL << ((sig) - 1))

#ifndef __HAVE_ARCH_SIG_SETOPS

/*
 * Generate "static inline void name(r, a, b)" which stores
 * op(a->sig[i], b->sig[i]) into r->sig[i] for every word of the set.
 *
 * The switch enters at the case matching _NSIG_WORDS and falls through
 * towards case 1, so a 4-word set handles words 3..0 and a 2-word set words
 * 1..0.  Any other word count trips BUILD_BUG().
 */
#define _SIG_SET_BINOP(name, op)                                        \
static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \
{                                                                        \
        unsigned long a0, a1, a2, a3, b0, b1, b2, b3;                        \
                                                                        \
        switch (_NSIG_WORDS) {                                                \
        case 4:                                                                \
                a3 = a->sig[3]; a2 = a->sig[2];                                \
                b3 = b->sig[3]; b2 = b->sig[2];                                \
                r->sig[3] = op(a3, b3);                                        \
                r->sig[2] = op(a2, b2);                                        \
                fallthrough;                                                \
        case 2:                                                                \
                a1 = a->sig[1]; b1 = b->sig[1];                                \
                r->sig[1] = op(a1, b1);                                        \
                fallthrough;                                                \
        case 1:                                                                \
                a0 = a->sig[0]; b0 = b->sig[0];                                \
                r->sig[0] = op(a0, b0);                                        \
                break;                                                        \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
}

/* sigorsets(r, a, b): r = a | b, word by word. */
#define _sig_or(x,y)        ((x) | (y))
_SIG_SET_BINOP(sigorsets, _sig_or)

/* sigandsets(r, a, b): r = a & b, word by word. */
#define _sig_and(x,y)        ((x) & (y))
_SIG_SET_BINOP(sigandsets, _sig_and)

/* sigandnsets(r, a, b): r = a & ~b, word by word. */
#define _sig_andn(x,y)        ((x) & ~(y))
_SIG_SET_BINOP(sigandnsets, _sig_andn)

/* The generator and its operator helpers are not needed past this point. */
#undef _SIG_SET_BINOP
#undef _sig_or
#undef _sig_and
#undef _sig_andn

/*
 * Generate "static inline void name(set)" which replaces every word of @set
 * with op(word), in place.  As with the binary generator, the switch enters
 * at the case matching _NSIG_WORDS and falls through towards case 1; any
 * other word count trips BUILD_BUG().
 */
#define _SIG_SET_OP(name, op)                                                \
static inline void name(sigset_t *set)                                        \
{                                                                        \
        switch (_NSIG_WORDS) {                                                \
        case 4:        set->sig[3] = op(set->sig[3]);                                \
                set->sig[2] = op(set->sig[2]);                                \
                fallthrough;                                                \
        case 2:        set->sig[1] = op(set->sig[1]);                                \
                fallthrough;                                                \
        case 1:        set->sig[0] = op(set->sig[0]);                                \
                    break;                                                \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
}

/* signotset(set): invert every bit of @set in place. */
#define _sig_not(x)        (~(x))
_SIG_SET_OP(signotset, _sig_not)

/* The generator and its operator helper are not needed past this point. */
#undef _SIG_SET_OP
#undef _sig_not

/*
 * Clear every bit of @set.  One- and two-word sets are cleared with direct
 * stores; any other size is cleared with memset().
 */
static inline void sigemptyset(sigset_t *set)
{
        if (_NSIG_WORDS == 1) {
                set->sig[0] = 0;
        } else if (_NSIG_WORDS == 2) {
                set->sig[1] = 0;
                set->sig[0] = 0;
        } else {
                memset(set, 0, sizeof(sigset_t));
        }
}

/*
 * Set every bit of @set.  One- and two-word sets are filled with direct
 * stores; any other size is filled with memset().
 */
static inline void sigfillset(sigset_t *set)
{
        if (_NSIG_WORDS == 1) {
                set->sig[0] = -1;
        } else if (_NSIG_WORDS == 2) {
                set->sig[1] = -1;
                set->sig[0] = -1;
        } else {
                memset(set, -1, sizeof(sigset_t));
        }
}

/* Some extensions for manipulating the low 32 signals in particular.  */

/* OR @mask into the first word (sig[0]) of @set; other words are untouched. */
static inline void sigaddsetmask(sigset_t *set, unsigned long mask)
{
        set->sig[0] |= mask;
}

/* Clear the bits of @mask in the first word (sig[0]) of @set. */
static inline void sigdelsetmask(sigset_t *set, unsigned long mask)
{
        set->sig[0] &= ~mask;
}

/* Return 1 if any bit of @mask is set in the first word of @set, else 0. */
static inline int sigtestsetmask(sigset_t *set, unsigned long mask)
{
        return (set->sig[0] & mask) != 0;
}

/*
 * Initialise @set so that its first word equals @mask and every remaining
 * word is zero.
 */
static inline void siginitset(sigset_t *set, unsigned long mask)
{
        set->sig[0] = mask;

        if (_NSIG_WORDS == 1)
                return;

        if (_NSIG_WORDS == 2) {
                set->sig[1] = 0;
                return;
        }

        memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1));
}

/*
 * Initialise @set so that its first word equals ~@mask and every remaining
 * word has all bits set.
 */
static inline void siginitsetinv(sigset_t *set, unsigned long mask)
{
        set->sig[0] = ~mask;

        if (_NSIG_WORDS == 1)
                return;

        if (_NSIG_WORDS == 2) {
                set->sig[1] = -1;
                return;
        }

        memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1));
}

#endif /* __HAVE_ARCH_SIG_SETOPS */

/*
 * Put @sig into its initial state: the signal set is emptied and the list
 * head is initialised with INIT_LIST_HEAD().
 */
static inline void init_sigpending(struct sigpending *sig)
{
        sigemptyset(&sig->signal);
        INIT_LIST_HEAD(&sig->list);
}

extern void flush_sigqueue(struct sigpending *queue);

/*
 * Return 1 when @sig does not exceed _NSIG and 0 otherwise.  Callers should
 * use this helper rather than comparing against _NSIG themselves.
 */
static inline int valid_signal(unsigned long sig)
{
        if (sig > _NSIG)
                return 0;

        return 1;
}

struct timespec;
struct pt_regs;
enum pid_type;

extern int next_signal(struct sigpending *pending, sigset_t *mask);
extern int do_send_sig_info(int sig, struct kernel_siginfo *info,
                                struct task_struct *p, enum pid_type type);
extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
                               struct task_struct *p, enum pid_type type);
extern int send_signal_locked(int sig, struct kernel_siginfo *info,
                              struct task_struct *p, enum pid_type type);
extern int sigprocmask(int, sigset_t *, sigset_t *);
extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
extern int show_unhandled_signals;

extern bool get_signal(struct ksignal *ksig);
extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping);
extern void exit_signals(struct task_struct *tsk);
extern void kernel_sigaction(int, __sighandler_t);

#define SIG_KTHREAD ((__force __sighandler_t)2)
#define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3)

/* Install the SIG_KTHREAD handler value for @sig via kernel_sigaction(). */
static inline void allow_signal(int sig)
{
        /*
         * Kernel threads handle their own signals. Let the signal code
         * know it'll be handled, so that they don't get converted to
         * SIGKILL or just silently dropped.
         */
        kernel_sigaction(sig, SIG_KTHREAD);
}

/*
 * Install the SIG_KTHREAD_KERNEL handler value for @sig via
 * kernel_sigaction().
 */
static inline void allow_kernel_signal(int sig)
{
        /*
         * Kernel threads handle their own signals. Let the signal code
         * know signals sent by the kernel will be handled, so that they
         * don't get silently dropped.
         */
        kernel_sigaction(sig, SIG_KTHREAD_KERNEL);
}

/* Set the action for @sig to SIG_IGN via kernel_sigaction(). */
static inline void disallow_signal(int sig)
{
        kernel_sigaction(sig, SIG_IGN);
}

extern struct kmem_cache *sighand_cachep;

extern bool unhandled_signal(struct task_struct *tsk, int sig);

/*
 * In POSIX a signal is sent either to a specific thread (Linux task)
 * or to the process as a whole (Linux thread group).  How the signal
 * is sent determines whether it's to one thread or the whole group,
 * which determines which signal mask(s) are involved in blocking it
 * from being delivered until later.  When the signal is delivered,
 * either it's caught or ignored by a user handler or it has a default
 * effect that applies to the whole thread group (POSIX process).
 *
 * The possible effects an unblocked signal set to SIG_DFL can have are:
 *   ignore        - Nothing Happens
 *   terminate        - kill the process, i.e. all threads in the group,
 *                   similar to exit_group.  The group leader (only) reports
 *                  WIFSIGNALED status to its parent.
 *   coredump        - write a core dump file describing all threads using
 *                  the same mm and then kill all those threads
 *   stop         - stop all the threads in the group, i.e. TASK_STOPPED state
 *
 * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
 * Other signals, when not blocked and set to SIG_DFL, behave as follows.
 * The job control signals also have other special effects.
 *
 *        +--------------------+------------------+
 *        |  POSIX signal      |  default action  |
 *        +--------------------+------------------+
 *        |  SIGHUP            |  terminate        |
 *        |  SIGINT            |        terminate        |
 *        |  SIGQUIT           |        coredump         |
 *        |  SIGILL            |        coredump         |
 *        |  SIGTRAP           |        coredump         |
 *        |  SIGABRT/SIGIOT    |        coredump         |
 *        |  SIGBUS            |        coredump         |
 *        |  SIGFPE            |        coredump         |
 *        |  SIGKILL           |        terminate(+)        |
 *        |  SIGUSR1           |        terminate        |
 *        |  SIGSEGV           |        coredump         |
 *        |  SIGUSR2           |        terminate        |
 *        |  SIGPIPE           |        terminate        |
 *        |  SIGALRM           |        terminate        |
 *        |  SIGTERM           |        terminate        |
 *        |  SIGCHLD           |        ignore           |
 *        |  SIGCONT           |        ignore(*)        |
 *        |  SIGSTOP           |        stop(*)(+)          |
 *        |  SIGTSTP           |        stop(*)          |
 *        |  SIGTTIN           |        stop(*)          |
 *        |  SIGTTOU           |        stop(*)          |
 *        |  SIGURG            |        ignore           |
 *        |  SIGXCPU           |        coredump         |
 *        |  SIGXFSZ           |        coredump         |
 *        |  SIGVTALRM         |        terminate        |
 *        |  SIGPROF           |        terminate        |
 *        |  SIGPOLL/SIGIO     |        terminate        |
 *        |  SIGSYS/SIGUNUSED  |        coredump         |
 *        |  SIGSTKFLT         |        terminate        |
 *        |  SIGWINCH          |        ignore           |
 *        |  SIGPWR            |        terminate        |
 *        |  SIGRTMIN-SIGRTMAX |        terminate       |
 *        +--------------------+------------------+
 *        |  non-POSIX signal  |  default action  |
 *        +--------------------+------------------+
 *        |  SIGEMT            |  coredump        |
 *        +--------------------+------------------+
 *
 * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
 * (*) Special job control effects:
 * When SIGCONT is sent, it resumes the process (all threads in the group)
 * from TASK_STOPPED state and also clears any pending/queued stop signals
 * (any of those marked with "stop(*)").  This happens regardless of blocking,
 * catching, or ignoring SIGCONT.  When any stop signal is sent, it clears
 * any pending/queued SIGCONT signals; this happens regardless of blocking,
 * catching, or ignoring the stop signal, though (except for SIGSTOP) the
 * default action of stopping the process may happen later or never.
 */

#ifdef SIGEMT
#define SIGEMT_MASK        rt_sigmask(SIGEMT)
#else
#define SIGEMT_MASK        0
#endif

#if SIGRTMIN > BITS_PER_LONG
#define rt_sigmask(sig)        (1ULL << ((sig)-1))
#else
#define rt_sigmask(sig)        sigmask(sig)
#endif

#define siginmask(sig, mask) \
        ((sig) > 0 && (sig) < SIGRTMIN && (rt_sigmask(sig) & (mask)))

#define SIG_KERNEL_ONLY_MASK (\
        rt_sigmask(SIGKILL)   |  rt_sigmask(SIGSTOP))

#define SIG_KERNEL_STOP_MASK (\
        rt_sigmask(SIGSTOP)   |  rt_sigmask(SIGTSTP)   | \
        rt_sigmask(SIGTTIN)   |  rt_sigmask(SIGTTOU)   )

#define SIG_KERNEL_COREDUMP_MASK (\
        rt_sigmask(SIGQUIT)   |  rt_sigmask(SIGILL)    | \
        rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGABRT)   | \
        rt_sigmask(SIGFPE)    |  rt_sigmask(SIGSEGV)   | \
        rt_sigmask(SIGBUS)    |  rt_sigmask(SIGSYS)    | \
        rt_sigmask(SIGXCPU)   |  rt_sigmask(SIGXFSZ)   | \
        SIGEMT_MASK                                       )

#define SIG_KERNEL_IGNORE_MASK (\
        rt_sigmask(SIGCONT)   |  rt_sigmask(SIGCHLD)   | \
        rt_sigmask(SIGWINCH)  |  rt_sigmask(SIGURG)    )

#define SIG_SPECIFIC_SICODES_MASK (\
        rt_sigmask(SIGILL)    |  rt_sigmask(SIGFPE)    | \
        rt_sigmask(SIGSEGV)   |  rt_sigmask(SIGBUS)    | \
        rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGCHLD)   | \
        rt_sigmask(SIGPOLL)   |  rt_sigmask(SIGSYS)    | \
        SIGEMT_MASK                                    )

#define sig_kernel_only(sig)                siginmask(sig, SIG_KERNEL_ONLY_MASK)
#define sig_kernel_coredump(sig)        siginmask(sig, SIG_KERNEL_COREDUMP_MASK)
#define sig_kernel_ignore(sig)                siginmask(sig, SIG_KERNEL_IGNORE_MASK)
#define sig_kernel_stop(sig)                siginmask(sig, SIG_KERNEL_STOP_MASK)
#define sig_specific_sicodes(sig)        siginmask(sig, SIG_SPECIFIC_SICODES_MASK)

#define sig_fatal(t, signr) \
        (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
         (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)

void signals_init(void);

int restore_altstack(const stack_t __user *);
int __save_altstack(stack_t __user *, unsigned long);

/*
 * Store the current task's alternate signal stack settings (sas_ss_sp,
 * sas_ss_flags and sas_ss_size) into the user stack_t at @uss, handing @label
 * to each unsafe_put_user() call.  @sp is accepted but not used.
 *
 * The expansion ends in "while (0)" without a semicolon on purpose: the
 * caller's own ';' terminates the statement, so the macro also works as the
 * body of an unbraced if/else.
 */
#define unsafe_save_altstack(uss, sp, label) do { \
        stack_t __user *__uss = (uss); \
        struct task_struct *t = current; \
        unsafe_put_user((void __user *)t->sas_ss_sp, &__uss->ss_sp, label); \
        unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \
        unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \
} while (0)

/*
 * Without CONFIG_DYNAMIC_SIGFRAME every alternate stack size is accepted;
 * with it, the check is provided out of line.
 */
#ifndef CONFIG_DYNAMIC_SIGFRAME
static inline bool sigaltstack_size_valid(size_t size)
{
        return true;
}
#else
bool sigaltstack_size_valid(size_t ss_size);
#endif /* CONFIG_DYNAMIC_SIGFRAME */

#ifdef CONFIG_PROC_FS
struct seq_file;
extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
#endif

#ifndef arch_untagged_si_addr
/*
 * Given a fault address and a signal and si_code which correspond to the
 * _sigfault union member, returns the address that must appear in si_addr if
 * the signal handler does not have SA_EXPOSE_TAGBITS enabled in sa_flags.
 *
 * This is the fallback used when no arch_untagged_si_addr macro is defined:
 * it returns @addr unchanged and ignores @sig and @si_code.
 */
static inline void __user *arch_untagged_si_addr(void __user *addr,
                                                 unsigned long sig,
                                                 unsigned long si_code)
{
        return addr;
}
#endif

#endif /* _LINUX_SIGNAL_H */








































































































































    2 

































































    6 
















































    1 




    7 





























































































































   16 



























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H

#include <linux/sched/coredump.h>
#include <linux/mm_types.h>

#include <linux/fs.h> /* only for vma_is_dax() */

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
void huge_pmd_set_accessed(struct vm_fault *vmf);
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
                  struct vm_area_struct *vma);

/*
 * huge_pud_set_accessed() has an out-of-line implementation only when
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD is set; otherwise it is an empty
 * inline so that callers need no #ifdef of their own.
 */
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
#else
static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
}
#endif

vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf);
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                           pmd_t *pmd, unsigned long addr, unsigned long next);
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd,
                 unsigned long addr);
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud,
                 unsigned long addr);
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                   unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd);
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
                    unsigned long cp_flags);

vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);

/*
 * Bit numbers (not masks) within transparent_hugepage_flags.  Test a flag
 * with (1 << FLAG), as hugepage_global_enabled(), hugepage_global_always()
 * and transparent_hugepage_use_zero_page() in this header do.
 */
enum transparent_hugepage_flag {
        TRANSPARENT_HUGEPAGE_UNSUPPORTED,
        TRANSPARENT_HUGEPAGE_FLAG,
        TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
        TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
};

struct kobject;
struct kobj_attribute;

ssize_t single_hugepage_flag_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count,
                                   enum transparent_hugepage_flag flag);
ssize_t single_hugepage_flag_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf,
                                  enum transparent_hugepage_flag flag);
extern struct kobj_attribute shmem_enabled_attr;

/*
 * Mask of all large folio orders supported for anonymous THP; all orders up to
 * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
 * (which is a limitation of the THP implementation).
 */
#define THP_ORDERS_ALL_ANON        ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))

/*
 * Mask of all large folio orders supported for file THP.
 */
#define THP_ORDERS_ALL_FILE        (BIT(PMD_ORDER) | BIT(PUD_ORDER))

/*
 * Mask of all large folio orders supported for THP.
 */
#define THP_ORDERS_ALL                (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)

#define TVA_SMAPS                (1 << 0)        /* Will be used for procfs */
#define TVA_IN_PF                (1 << 1)        /* Page fault handler */
#define TVA_ENFORCE_SYSFS        (1 << 2)        /* Obey sysfs configuration */

/*
 * Single-order form of thp_vma_allowable_orders(): passes BIT(order) as the
 * candidate set and collapses the returned bitfield to 0 or 1.
 */
#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
        (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))

/*
 * Huge page geometry at PMD and PUD level.  With
 * CONFIG_PGTABLE_HAS_HUGE_LEAVES the shifts are the page-table shifts;
 * without it they expand to a BUILD_BUG() statement expression, and every
 * macro derived from them below inherits that.
 */
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
#define HPAGE_PMD_SHIFT PMD_SHIFT
#define HPAGE_PUD_SHIFT PUD_SHIFT
#else
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
#endif

/* PMD-level huge page: order, number of base pages, address mask, byte size. */
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
/* Uses HPAGE_PMD_SIZE from the next line; fine, macros expand at point of use. */
#define HPAGE_PMD_MASK        (~(HPAGE_PMD_SIZE - 1))
#define HPAGE_PMD_SIZE        ((1UL) << HPAGE_PMD_SHIFT)

/* PUD-level huge page: order, number of base pages, address mask, byte size. */
#define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT)
#define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER)
/* Uses HPAGE_PUD_SIZE from the next line; fine, macros expand at point of use. */
#define HPAGE_PUD_MASK        (~(HPAGE_PUD_SIZE - 1))
#define HPAGE_PUD_SIZE        ((1UL) << HPAGE_PUD_SHIFT)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

extern unsigned long transparent_hugepage_flags;
extern unsigned long huge_anon_orders_always;
extern unsigned long huge_anon_orders_madvise;
extern unsigned long huge_anon_orders_inherit;

/*
 * True when either TRANSPARENT_HUGEPAGE_FLAG or
 * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG is set in transparent_hugepage_flags.
 */
static inline bool hugepage_global_enabled(void)
{
        const unsigned long enabled_bits =
                (1<<TRANSPARENT_HUGEPAGE_FLAG) |
                (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);

        return transparent_hugepage_flags & enabled_bits;
}

/* True when TRANSPARENT_HUGEPAGE_FLAG is set in transparent_hugepage_flags. */
static inline bool hugepage_global_always(void)
{
        const unsigned long always_bit = 1<<TRANSPARENT_HUGEPAGE_FLAG;

        return transparent_hugepage_flags & always_bit;
}

/*
 * True when THP is enabled by any of the switches consulted here: the global
 * flags, or a non-empty "always" or "madvise" anon order mask.
 *
 * This answers for both anon and file-backed memory, so a globally enabled
 * setting must yield true even if every anon size is set to never.  For the
 * same reason huge_anon_orders_inherit is not consulted.
 */
static inline bool hugepage_flags_enabled(void)
{
        if (hugepage_global_enabled())
                return true;
        if (huge_anon_orders_always)
                return true;
        return huge_anon_orders_madvise != 0;
}

/*
 * Return the highest order present in the @orders bitfield, computed as
 * fls_long(orders) - 1.
 * NOTE(review): presumably -1 for an empty bitfield (fls convention) - confirm.
 */
static inline int highest_order(unsigned long orders)
{
        int last_set = fls_long(orders);

        return last_set - 1;
}

/*
 * Drop order @prev from the set *@orders (updated in place) and return
 * highest_order() of what is left.
 */
static inline int next_order(unsigned long *orders, int prev)
{
        unsigned long remaining = *orders & ~BIT(prev);

        *orders = remaining;
        return highest_order(remaining);
}

/*
 * Decide whether a hugepage of the given @order could be mapped around @addr
 * in @vma.  Two conditions are checked:
 *   - File-backed vma only: the vma's starting page number minus vm_pgoff
 *     must be a multiple of the hugepage's page count.  A hugepage is
 *     order-aligned within the file, so unless order-aligned addresses in the
 *     vma correspond to order-aligned file offsets it cannot be mapped.
 *   - Every vma: the order-aligned range containing @addr must lie entirely
 *     within [vm_start, vm_end).
 */
static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
                unsigned long addr, int order)
{
        const unsigned long size = PAGE_SIZE << order;
        unsigned long start, end;

        /* Anonymous vmas have no file offset to line up with. */
        if (!vma_is_anonymous(vma) &&
            !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
                        size >> PAGE_SHIFT))
                return false;

        start = ALIGN_DOWN(addr, size);
        end = start + size;

        return start >= vma->vm_start && end <= vma->vm_end;
}

/*
 * Reduce the @orders bitfield to the orders that are suitable for a hugepage
 * around @addr in @vma, as judged by thp_vma_suitable_order().  The surviving
 * orders are returned as a bitfield.
 */
static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
                unsigned long addr, unsigned long orders)
{
        int order;

        /*
         * Walk from the highest order downwards, discarding each order that
         * fails the alignment checks.  The first order that passes ends the
         * walk: every lower order must then pass as well.
         */
        for (order = highest_order(orders); orders;
             order = next_order(&orders, order)) {
                if (thp_vma_suitable_order(vma, addr, order))
                        break;
        }

        return orders;
}

/*
 * True when @vma is file-backed, CONFIG_READ_ONLY_THP_FOR_FS is enabled, and
 * the backing inode is a regular file that is not open for write.
 */
static inline bool file_thp_enabled(struct vm_area_struct *vma)
{
        struct inode *inode;

        if (!vma->vm_file)
                return false;
        if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
                return false;

        inode = vma->vm_file->f_inode;
        if (inode_is_open_for_write(inode))
                return false;

        return S_ISREG(inode->i_mode);
}

unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
                                         unsigned long vm_flags,
                                         unsigned long tva_flags,
                                         unsigned long orders);

/**
 * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
 * @vma:  the vm area to check
 * @vm_flags: use these vm_flags instead of vma->vm_flags
 * @tva_flags: Which TVA flags to honour
 * @orders: bitfield of all orders to consider
 *
 * Intersects the requested hugepage orders with the orders allowed for @vma.
 * An order is encoded as the bit at its own position (bit-2 is order-2, bit-3
 * is order-3, and so on).  Order-0 is never considered a hugepage order.
 *
 * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage
 * orders are allowed.
 */
static inline
unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
                                       unsigned long vm_flags,
                                       unsigned long tva_flags,
                                       unsigned long orders)
{
        unsigned long enabled;
        bool madvised;

        /* The early filter applies only to anon vmas with sysfs enforcement. */
        if (!(tva_flags & TVA_ENFORCE_SYSFS) || !vma_is_anonymous(vma))
                return __thp_vma_allowable_orders(vma, vm_flags, tva_flags,
                                                  orders);

        /*
         * Optimization: build the mask of anon orders enabled for this vma and
         * return early if none of the requested orders is in it.
         */
        madvised = vm_flags & VM_HUGEPAGE;
        enabled = READ_ONCE(huge_anon_orders_always);
        if (madvised)
                enabled |= READ_ONCE(huge_anon_orders_madvise);
        if (hugepage_global_always() ||
            (madvised && hugepage_global_enabled()))
                enabled |= READ_ONCE(huge_anon_orders_inherit);

        orders &= enabled;
        if (!orders)
                return 0;

        return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
}

/*
 * Counter kinds kept per folio order; used as the second index of
 * mthp_stat.stats[][].  __MTHP_STAT_COUNT is the number of kinds, not a
 * counter itself.
 */
enum mthp_stat_item {
        MTHP_STAT_ANON_FAULT_ALLOC,
        MTHP_STAT_ANON_FAULT_FALLBACK,
        MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
        MTHP_STAT_SWPOUT,
        MTHP_STAT_SWPOUT_FALLBACK,
        __MTHP_STAT_COUNT
};

/*
 * Counter table indexed as stats[order][item] (see count_mthp_stat()).  The
 * first dimension has ilog2(MAX_PTRS_PER_PTE) + 1 rows.
 */
struct mthp_stat {
        unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
};

#ifdef CONFIG_SYSFS
DECLARE_PER_CPU(struct mthp_stat, mthp_stats);

/*
 * Increment this CPU's @item counter for folios of the given @order.  Orders
 * outside 1..PMD_ORDER are silently ignored.
 */
static inline void count_mthp_stat(int order, enum mthp_stat_item item)
{
        if (order > 0 && order <= PMD_ORDER)
                this_cpu_inc(mthp_stats.stats[order][item]);
}
#else
/* Without CONFIG_SYSFS mthp_stats is not declared; counting is a no-op. */
static inline void count_mthp_stat(int order, enum mthp_stat_item item)
{
}
#endif

/*
 * Non-zero when TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG is set in
 * transparent_hugepage_flags.
 */
#define transparent_hugepage_use_zero_page()                                \
        (transparent_hugepage_flags &                                        \
         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags);
unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags,
                vm_flags_t vm_flags);

bool can_split_folio(struct folio *folio, int *pextra_pins);
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
                unsigned int new_order);
/*
 * Convenience wrapper: calls split_huge_page_to_list_to_order() on @page with
 * no list (NULL) and a target order of 0, and returns its result.
 */
static inline int split_huge_page(struct page *page)
{
        return split_huge_page_to_list_to_order(page, NULL, 0);
}
void deferred_split_folio(struct folio *folio);

void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct folio *folio);

/*
 * split_huge_pmd - split the huge mapping at a pmd entry, if there is one
 * @__vma: handed through to __split_huge_pmd()
 * @__pmd: pointer to the pmd entry; evaluated exactly once
 * @__address: handed through to __split_huge_pmd()
 *
 * Calls __split_huge_pmd() with freeze == false and no folio when the entry
 * is a swap pmd, a transparent huge pmd or a devmap pmd; otherwise does
 * nothing.  The cached ____pmd is what gets passed down, so an argument
 * expression with side effects is not evaluated a second time.
 */
#define split_huge_pmd(__vma, __pmd, __address)                                \
        do {                                                                \
                pmd_t *____pmd = (__pmd);                                \
                if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)        \
                                        || pmd_devmap(*____pmd))        \
                        __split_huge_pmd(__vma, ____pmd, __address,        \
                                                false, NULL);                \
        }  while (0)


void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
                bool freeze, struct folio *folio);

void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
                unsigned long address);

/*
 * split_huge_pud - split the huge mapping at a pud entry, if there is one
 * @__vma: handed through to __split_huge_pud()
 * @__pud: pointer to the pud entry; evaluated exactly once
 * @__address: handed through to __split_huge_pud()
 *
 * Calls __split_huge_pud() when the entry is a transparent huge pud or a
 * devmap pud; otherwise does nothing.  The cached ____pud is what gets passed
 * down, so an argument expression with side effects is not evaluated a second
 * time.
 */
#define split_huge_pud(__vma, __pud, __address)                                \
        do {                                                                \
                pud_t *____pud = (__pud);                                \
                if (pud_trans_huge(*____pud)                                \
                                        || pud_devmap(*____pud))        \
                        __split_huge_pud(__vma, ____pud, __address);        \
        }  while (0)

int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
                     int advice);
int madvise_collapse(struct vm_area_struct *vma,
                     struct vm_area_struct **prev,
                     unsigned long start, unsigned long end);
void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
                           unsigned long end, long adjust_next);
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma);

/* Non-zero when @pmd is neither empty (pmd_none) nor present (pmd_present). */
static inline int is_swap_pmd(pmd_t pmd)
{
        if (pmd_none(pmd))
                return 0;

        return !pmd_present(pmd);
}

/*
 * Return the result of __pmd_trans_huge_lock() when *@pmd is a swap pmd, a
 * transparent huge pmd or a devmap pmd, and NULL for any other entry.
 *
 * mmap_lock must be held on entry.
 */
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
                struct vm_area_struct *vma)
{
        if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))
                return NULL;

        return __pmd_trans_huge_lock(pmd, vma);
}
/*
 * Return the result of __pud_trans_huge_lock() when *@pud is a transparent
 * huge pud or a devmap pud, and NULL for any other entry.
 */
static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
                struct vm_area_struct *vma)
{
        if (!pud_trans_huge(*pud) && !pud_devmap(*pud))
                return NULL;

        return __pud_trans_huge_lock(pud, vma);
}

/**
 * folio_test_pmd_mappable - Can we map this folio with a PMD?
 * @folio: The folio to test
 *
 * Return: true if the folio's order is at least HPAGE_PMD_ORDER.
 */
static inline bool folio_test_pmd_mappable(struct folio *folio)
{
        return folio_order(folio) >= HPAGE_PMD_ORDER;
}

struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, int flags, struct dev_pagemap **pgmap);

vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);

extern struct folio *huge_zero_folio;
extern unsigned long huge_zero_pfn;

/* True when @folio is the folio currently published in huge_zero_folio. */
static inline bool is_huge_zero_folio(const struct folio *folio)
{
        const struct folio *zero = READ_ONCE(huge_zero_folio);

        return folio == zero;
}

/* True when @pmd is present and its pfn equals huge_zero_pfn. */
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
        if (!pmd_present(pmd))
                return false;

        return pmd_pfn(pmd) == READ_ONCE(huge_zero_pfn);
}

/* Always returns false; @pud is not inspected. */
static inline bool is_huge_zero_pud(pud_t pud)
{
        return false;
}

struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);

#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))

/* True when CONFIG_ARCH_ENABLE_THP_MIGRATION is enabled in this build. */
static inline bool thp_migration_supported(void)
{
        return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
}

#else /* CONFIG_TRANSPARENT_HUGEPAGE */

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: no folio is reported as PMD-mappable. */
static inline bool folio_test_pmd_mappable(struct folio *folio)
{
        return false;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: no order is ever suitable. */
static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
                unsigned long addr, int order)
{
        return false;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: returns the empty order set. */
static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
                unsigned long addr, unsigned long orders)
{
        return 0;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: returns the empty order set. */
static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
                                        unsigned long vm_flags,
                                        unsigned long tva_flags,
                                        unsigned long orders)
{
        return 0;
}

#define transparent_hugepage_flags 0UL

#define thp_get_unmapped_area        NULL

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always returns 0. */
static inline unsigned long
thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
                              unsigned long len, unsigned long pgoff,
                              unsigned long flags, vm_flags_t vm_flags)
{
        return 0;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always false; *@pextra_pins is not written. */
static inline bool
can_split_folio(struct folio *folio, int *pextra_pins)
{
        return false;
}
/* !CONFIG_TRANSPARENT_HUGEPAGE stub: does nothing and returns 0. */
static inline int
split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
                unsigned int new_order)
{
        return 0;
}
/* !CONFIG_TRANSPARENT_HUGEPAGE stub: does nothing and returns 0. */
static inline int split_huge_page(struct page *page)
{
        return 0;
}
/* !CONFIG_TRANSPARENT_HUGEPAGE stub: no-op. */
static inline void deferred_split_folio(struct folio *folio) {}
/*
 * !CONFIG_TRANSPARENT_HUGEPAGE: expands to an empty statement; none of the
 * arguments is evaluated.
 */
#define split_huge_pmd(__vma, __pmd, __address)        \
        do { } while (0)

/* !CONFIG_TRANSPARENT_HUGEPAGE stubs: no-ops. */
static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct folio *folio) {}
static inline void split_huge_pmd_address(struct vm_area_struct *vma,
                unsigned long address, bool freeze, struct folio *folio) {}

/*
 * !CONFIG_TRANSPARENT_HUGEPAGE: expands to an empty statement; none of the
 * arguments is evaluated.
 */
#define split_huge_pud(__vma, __pmd, __address)        \
        do { } while (0)

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always fails with -EINVAL. */
static inline int hugepage_madvise(struct vm_area_struct *vma,
                                   unsigned long *vm_flags, int advice)
{
        return -EINVAL;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always fails with -EINVAL. */
static inline int madvise_collapse(struct vm_area_struct *vma,
                                   struct vm_area_struct **prev,
                                   unsigned long start, unsigned long end)
{
        return -EINVAL;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: no-op. */
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                         unsigned long start,
                                         unsigned long end,
                                         long adjust_next)
{
}
/* !CONFIG_TRANSPARENT_HUGEPAGE stub: no pmd is reported as a swap pmd. */
static inline int is_swap_pmd(pmd_t pmd)
{
        return 0;
}
/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always returns NULL. */
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
                struct vm_area_struct *vma)
{
        return NULL;
}
/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always returns NULL. */
static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
                struct vm_area_struct *vma)
{
        return NULL;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: does nothing and returns 0. */
static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
        return 0;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always false. */
static inline bool is_huge_zero_folio(const struct folio *folio)
{
        return false;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always false. */
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
        return false;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always false. */
static inline bool is_huge_zero_pud(pud_t pud)
{
        return false;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: no-op. */
static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
{
        return;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always returns NULL; *@pgmap is not written. */
static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
        unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
        return NULL;
}

/* !CONFIG_TRANSPARENT_HUGEPAGE stub: always false. */
static inline bool thp_migration_supported(void)
{
        return false;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Folio front end for split_huge_page_to_list_to_order(): passes the folio's
 * embedded page (&folio->page) together with @list and @new_order, and
 * returns that function's result.  Note that @new_order is an int here but
 * an unsigned int in the callee.
 */
static inline int split_folio_to_list_to_order(struct folio *folio,
                struct list_head *list, int new_order)
{
        return split_huge_page_to_list_to_order(&folio->page, list, new_order);
}

/* Same as split_folio_to_list_to_order() with no list (NULL). */
static inline int split_folio_to_order(struct folio *folio, int new_order)
{
        return split_folio_to_list_to_order(folio, NULL, new_order);
}

#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0)
#define split_folio(f) split_folio_to_order(f, 0)

#endif /* _LINUX_HUGE_MM_H */

























































































































































































































   16 




   16 


   14 






























   14 
    9 






   15 
















































































































































































































































   14 







   15 





   14 





















   13 

   14 

    1 

   14 



   15 
















   13 





    5 
    5 


    4 



    4 

























   14 






   15 












   14 


   15 
















    2 
    2 


    2 

    1 

































































































































































    2 































































































































































    9 








































   33 














   35 


   34 
   33 
   34 










    1 







































































































    9 

   32 



   23 
   25 




   11 
   11 














    4 
    5 



   10 
   10 


    9 







    4 


































    2 

















    2 

    2 

    2 

    2 



    2 



    2 























































































    2 















    2 







    2 








    1 




    1 















    1 


    1 







    1 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/file.c
 *
 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 *  Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <net/sock.h>

#include "internal.h"

/*
 * Cap applied to fdtable sizes (see alloc_fdtable()); defaults to 1024*1024.
 * NOTE(review): sysctl_nr_open_min/_max are presumably the bounds accepted
 * when the sysctl is written - confirm against the sysctl table.
 */
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
/*
 * The smaller of INT_MAX and (largest size_t / pointer size), with the low
 * bits masked off so that the value is a multiple of BITS_PER_LONG.
 */
unsigned int sysctl_nr_open_max =
        __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;

/*
 * Release an fdtable and the arrays hanging off it.
 *
 * Only ->open_fds is freed among the bitmaps: alloc_fdtable() carves
 * ->open_fds, ->close_on_exec and ->full_fds_bits out of one allocation whose
 * base address is ->open_fds.
 */
static void __free_fdtable(struct fdtable *fdt)
{
        kvfree(fdt->fd);
        kvfree(fdt->open_fds);
        kfree(fdt);
}

/*
 * rcu_head-based entry point: recover the fdtable that embeds @rcu and free
 * it with __free_fdtable().
 */
static void free_fdtable_rcu(struct rcu_head *rcu)
{
        struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);

        __free_fdtable(fdt);
}

/*
 * Sizing of a second-level bitmap that holds one bit per long of an nr-bit
 * bitmap: BITBIT_NR() is its length in longs, BITBIT_SIZE() in bytes.
 */
#define BITBIT_NR(nr)        BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)        (BITBIT_NR(nr) * sizeof(long))

/*
 * Transfer the first 'count' fd bits of each bitmap (open_fds, close_on_exec
 * and full_fds_bits) from the old table into the new one, zeroing whatever
 * space is left over in the new table.  File pointers are not copied here.
 * Called with the files spinlock held for write.
 *
 * Sizes are computed in whole bytes (count / BITS_PER_BYTE).
 * NOTE(review): callers presumably pass a count that is a multiple of
 * BITS_PER_LONG - confirm.
 */
static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
                            unsigned int count)
{
        unsigned int keep = count / BITS_PER_BYTE;
        unsigned int zero = (nfdt->max_fds - count) / BITS_PER_BYTE;
        unsigned int keep_full, zero_full;

        /* Per-fd bitmaps: one bit per descriptor. */
        memcpy(nfdt->open_fds, ofdt->open_fds, keep);
        memset((char *)nfdt->open_fds + keep, 0, zero);
        memcpy(nfdt->close_on_exec, ofdt->close_on_exec, keep);
        memset((char *)nfdt->close_on_exec + keep, 0, zero);

        /* Second-level bitmap: sized with BITBIT_SIZE() instead. */
        keep_full = BITBIT_SIZE(count);
        zero_full = BITBIT_SIZE(nfdt->max_fds) - keep_full;
        memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, keep_full);
        memset((char *)nfdt->full_fds_bits + keep_full, 0, zero_full);
}

/*
 * Populate a new, at least equally large table from the old one: every file
 * pointer slot and all bitmap bits are carried over, and the additional space
 * in the new table is zeroed.  Called with the files spinlock held for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
        const size_t slot = sizeof(struct file *);
        size_t kept, zeroed;

        /* The new table must never be smaller than the old one. */
        BUG_ON(nfdt->max_fds < ofdt->max_fds);

        kept = ofdt->max_fds * slot;
        zeroed = (nfdt->max_fds - ofdt->max_fds) * slot;
        memcpy(nfdt->fd, ofdt->fd, kept);
        memset((char *)nfdt->fd + kept, 0, zeroed);

        copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}

/*
 * Allocate an fdtable able to hold more than 'nr' descriptors, subject to the
 * sysctl_nr_open cap.  Returns the new table, or NULL if any allocation
 * fails.  The fd array and the bitmaps are allocated here but not
 * initialized.
 *
 * The bitmap allocations have to be a multiple of BITS_PER_LONG.  Partly that
 * is because some places walk them in 'unsigned long' chunks, but mainly it
 * is how Linux kernel bitmaps are defined: they are "bits in an array of
 * unsigned long", not "bits in an array of bytes".
 *
 * The ALIGN(nr, BITS_PER_LONG) below is for clarity only: having just
 * multiplied by "1024/sizeof(ptr)", the low bits are already known to be
 * clear. Clang seems to realize that, gcc ends up being confused.
 *
 * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
 * let's consider it documentation (and maybe a test-case for gcc to improve
 * its code generation ;)
 */
static struct fdtable * alloc_fdtable(unsigned int nr)
{
        const size_t fds_per_chunk = 1024 / sizeof(struct file *);
        struct fdtable *fdt;
        void *fd_array, *bitmaps;
        size_t bitmap_bytes;

        /*
         * Pick the number of fds this table will support.  The steps follow
         * the size of the fd array, which grows far faster than the other
         * dynamic data: it starts at 1024 bytes and doubles from there, so
         * that it fits comfortable page-tuned chunks.
         */
        nr /= fds_per_chunk;
        nr = roundup_pow_of_two(nr + 1);
        nr *= fds_per_chunk;
        nr = ALIGN(nr, BITS_PER_LONG);

        /*
         * This may push nr *below* the value passed in, if sysctl_nr_open was
         * lowered between the check in expand_files() and now.  The caller
         * deals with that; it is cheaper that way.
         *
         * nr is kept a multiple of BITS_PER_LONG, otherwise the bitmap
         * handling below becomes unpleasant, to put it mildly...
         */
        if (unlikely(nr > sysctl_nr_open))
                nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;

        fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
        if (!fdt)
                return NULL;
        fdt->max_fds = nr;

        fd_array = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
        if (!fd_array)
                goto free_table;
        fdt->fd = fd_array;

        /*
         * One allocation backs all three bitmaps, laid out back to back:
         * open_fds, close_on_exec, then full_fds_bits.  It is never smaller
         * than L1_CACHE_BYTES.
         */
        bitmap_bytes = max_t(size_t,
                             2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr),
                             L1_CACHE_BYTES);
        bitmaps = kvmalloc(bitmap_bytes, GFP_KERNEL_ACCOUNT);
        if (!bitmaps)
                goto free_fd_array;
        fdt->open_fds = bitmaps;
        fdt->close_on_exec = bitmaps + nr / BITS_PER_BYTE;
        fdt->full_fds_bits = bitmaps + 2 * (nr / BITS_PER_BYTE);

        return fdt;

free_fd_array:
        kvfree(fdt->fd);
free_table:
        kfree(fdt);
        return NULL;
}

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable and both fd array and fdset, of
 * the given size.
 * Return <0 error code on error; 1 on successful completion.
 *   -ENOMEM: the new table could not be allocated.
 *   -EMFILE: the allocated table is still too small to hold descriptor @nr.
 * The files->file_lock should be held on entry, and will be held on exit.
 * Note that the lock is dropped while the new table is being allocated.
 */
static int expand_fdtable(struct files_struct *files, unsigned int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        struct fdtable *new_fdt, *cur_fdt;

        /* Allocate with the spinlock released; it is retaken below. */
        spin_unlock(&files->file_lock);
        new_fdt = alloc_fdtable(nr);

        /* make sure all fd_install() have seen resize_in_progress
         * or have finished their rcu_read_lock_sched() section.
         */
        if (atomic_read(&files->count) > 1)
                synchronize_rcu();

        spin_lock(&files->file_lock);
        /* The allocation result is only checked once the lock is held again. */
        if (!new_fdt)
                return -ENOMEM;
        /*
         * extremely unlikely race - sysctl_nr_open decreased between the check in
         * caller and alloc_fdtable().  Cheaper to catch it here...
         */
        if (unlikely(new_fdt->max_fds <= nr)) {
                __free_fdtable(new_fdt);
                return -EMFILE;
        }
        cur_fdt = files_fdtable(files);
        BUG_ON(nr < cur_fdt->max_fds);
        copy_fdtable(new_fdt, cur_fdt);
        /* Publish the fully populated table. */
        rcu_assign_pointer(files->fdt, new_fdt);
        /*
         * The table embedded in files_struct is never freed on its own; any
         * other old table is handed to call_rcu() for deferred freeing.
         */
        if (cur_fdt != &files->fdtab)
                call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
        /* coupled with smp_rmb() in fd_install() */
        smp_wmb();
        return 1;
}

/*
 * Expand files.
 * This function will expand the file structures, if the requested size exceeds
 * the current capacity and there is room for expansion.
 * Return <0 error code on error; 0 when nothing done; 1 when files were
 * expanded and execution may have blocked.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_files(struct files_struct *files, unsigned int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        int ret = 0;

        for (;;) {
                struct fdtable *fdt = files_fdtable(files);

                /* Big enough already: report whether we slept on the way. */
                if (fdt->max_fds > nr)
                        return ret;

                /* Growing up to or past sysctl_nr_open is refused. */
                if (sysctl_nr_open <= nr)
                        return -EMFILE;

                if (likely(!files->resize_in_progress))
                        break;

                /*
                 * A resize is already in flight: wait for it with the lock
                 * dropped, remember that we may have blocked, and re-check
                 * everything from the top.
                 */
                spin_unlock(&files->file_lock);
                ret = 1;
                wait_event(files->resize_wait, !files->resize_in_progress);
                spin_lock(&files->file_lock);
        }

        /* We are the resizer now; flag it for the duration of the expansion. */
        files->resize_in_progress = true;
        ret = expand_fdtable(files, nr);
        files->resize_in_progress = false;

        wake_up_all(&files->resize_wait);
        return ret;
}

/* Mark descriptor @fd as close-on-exec in @fdt. */
static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
        unsigned long *cloexec_map = fdt->close_on_exec;

        __set_bit(fd, cloexec_map);
}

/*
 * Drop the close-on-exec mark of descriptor @fd in @fdt.  The bit is only
 * written when it is currently set.
 */
static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
        unsigned long *cloexec_map = fdt->close_on_exec;

        if (!test_bit(fd, cloexec_map))
                return;

        __clear_bit(fd, cloexec_map);
}

/*
 * Mark descriptor @fd as in use in @fdt.  If that fills the open_fds word
 * containing @fd, record the word as full in full_fds_bits.
 */
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{
        const unsigned int word = fd / BITS_PER_LONG;

        __set_bit(fd, fdt->open_fds);

        if (fdt->open_fds[word] == ~0UL)
                __set_bit(word, fdt->full_fds_bits);
}

/*
 * Mark descriptor @fd as free in @fdt; the open_fds word holding it can no
 * longer be full, so its full_fds_bits bit is cleared as well.
 */
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
        const unsigned int word = fd / BITS_PER_LONG;

        __clear_bit(fd, fdt->open_fds);
        __clear_bit(word, fdt->full_fds_bits);
}

/* Report whether descriptor @fd is marked in use in @fdt's open_fds bitmap. */
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
        const unsigned long *open_map = fdt->open_fds;

        return test_bit(fd, open_map);
}

/*
 * Return the number of descriptor slots, in whole BITS_PER_LONG units, needed
 * to cover the highest non-empty word of @fdt's open_fds bitmap.  The result
 * is never less than BITS_PER_LONG, even when no descriptor is open.
 */
static unsigned int count_open_files(struct fdtable *fdt)
{
        unsigned int word = fdt->max_fds / BITS_PER_LONG;

        /* Walk down from the top until a word with any bit set is found. */
        while (word > 0) {
                word--;
                if (fdt->open_fds[word])
                        break;
        }

        return (word + 1) * BITS_PER_LONG;
}

/*
 * Note that a sane fdtable size always has to be a multiple of
 * BITS_PER_LONG, since we have bitmaps that are sized by this.
 *
 * 'max_fds' will normally already be properly aligned, but it
 * turns out that in the close_range() -> __close_range() ->
 * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
 * up having a 'max_fds' value that isn't already aligned.
 *
 * Rather than make close_range() have to worry about this,
 * just make that BITS_PER_LONG alignment be part of a sane
 * fdtable size. Because that's really what it is.
 */
static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
{
        /* Never consider fewer than NR_OPEN_DEFAULT descriptors. */
        const unsigned int limit =
                max_fds < NR_OPEN_DEFAULT ? NR_OPEN_DEFAULT : max_fds;
        const unsigned int in_use = count_open_files(fdt);
        const unsigned int wanted = min(in_use, limit);

        /* Bitmaps are handled in whole longs, so round the size up. */
        return ALIGN(wanted, BITS_PER_LONG);
}

/*
 * Allocate a new files structure and copy contents from the
 * passed in files structure.
 *
 * @oldf:    files_struct to duplicate; its file_lock is taken here while the
 *           table is sized and copied.
 * @max_fds: bound on how many descriptors to carry over, as interpreted by
 *           sane_fdtable_size().
 * @errorp:  set to -ENOMEM or -EMFILE on failure.
 *
 * errorp will be valid only when the returned files_struct is NULL.
 */
struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
{
        struct files_struct *newf;
        struct file **old_fds, **new_fds;
        unsigned int open_files, i;
        struct fdtable *old_fdt, *new_fdt;

        *errorp = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
        if (!newf)
                goto out;

        atomic_set(&newf->count, 1);

        /* Start out with the fdtable embedded in the new files_struct. */
        spin_lock_init(&newf->file_lock);
        newf->resize_in_progress = false;
        init_waitqueue_head(&newf->resize_wait);
        newf->next_fd = 0;
        new_fdt = &newf->fdtab;
        new_fdt->max_fds = NR_OPEN_DEFAULT;
        new_fdt->close_on_exec = newf->close_on_exec_init;
        new_fdt->open_fds = newf->open_fds_init;
        new_fdt->full_fds_bits = newf->full_fds_bits_init;
        new_fdt->fd = &newf->fd_array[0];

        spin_lock(&oldf->file_lock);
        old_fdt = files_fdtable(oldf);
        open_files = sane_fdtable_size(old_fdt, max_fds);

        /*
         * Check whether we need to allocate a larger fd array and fd set.
         * The lock is dropped for the allocation, so the required size is
         * recomputed after relocking and the loop repeats if it has grown.
         */
        while (unlikely(open_files > new_fdt->max_fds)) {
                spin_unlock(&oldf->file_lock);

                /* Discard a table allocated by a previous pass of this loop. */
                if (new_fdt != &newf->fdtab)
                        __free_fdtable(new_fdt);

                new_fdt = alloc_fdtable(open_files - 1);
                if (!new_fdt) {
                        *errorp = -ENOMEM;
                        goto out_release;
                }

                /* beyond sysctl_nr_open; nothing to do */
                if (unlikely(new_fdt->max_fds < open_files)) {
                        __free_fdtable(new_fdt);
                        *errorp = -EMFILE;
                        goto out_release;
                }

                /*
                 * Reacquire the oldf lock and refetch the pointer to its fd
                 * table: it may have been replaced by a bigger one while the
                 * lock was dropped, and we need the latest pointer.
                 */
                spin_lock(&oldf->file_lock);
                old_fdt = files_fdtable(oldf);
                open_files = sane_fdtable_size(old_fdt, max_fds);
        }

        copy_fd_bitmaps(new_fdt, old_fdt, open_files);

        old_fds = old_fdt->fd;
        new_fds = new_fdt->fd;

        /* Copy the first open_files slots, taking a reference on each file. */
        for (i = open_files; i != 0; i--) {
                struct file *f = *old_fds++;
                if (f) {
                        get_file(f);
                } else {
                        /*
                         * The fd may be claimed in the fd bitmap but not yet
                         * instantiated in the files array if a sibling thread
                         * is partway through open().  So make sure that this
                         * fd is available to the new process.
                         */
                        __clear_open_fd(open_files - i, new_fdt);
                }
                rcu_assign_pointer(*new_fds++, f);
        }
        spin_unlock(&oldf->file_lock);

        /* clear the remainder */
        memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

        rcu_assign_pointer(newf->fdt, new_fdt);

        return newf;

out_release:
        /* Any separately allocated fdtable has already been freed above. */
        kmem_cache_free(files_cachep, newf);
out:
        return NULL;
}

/*
 * Close every file still installed in @files and hand back its fdtable so
 * the caller can free it.
 */
static struct fdtable *close_files(struct files_struct * files)
{
        /*
         * No RCU or ->file_lock protection is needed to dereference the
         * table here: we hold the last reference to the files structure.
         */
        struct fdtable *fdt = rcu_dereference_raw(files->fdt);
        unsigned int word;

        /* Visit the open_fds bitmap one long at a time. */
        for (word = 0; word * BITS_PER_LONG < fdt->max_fds; word++) {
                unsigned long pending = fdt->open_fds[word];
                unsigned int fd = word * BITS_PER_LONG;

                for (; pending; pending >>= 1, fd++) {
                        struct file *file;

                        if (!(pending & 1))
                                continue;

                        /* Detach the file from its slot before closing it. */
                        file = xchg(&fdt->fd[fd], NULL);
                        if (!file)
                                continue;

                        filp_close(file, files);
                        cond_resched();
                }
        }

        return fdt;
}

/*
 * Drop one reference on @files.  When the count reaches zero, all files are
 * closed and the structure is released.
 */
void put_files_struct(struct files_struct *files)
{
        struct fdtable *fdt;

        if (!atomic_dec_and_test(&files->count))
                return;

        fdt = close_files(files);

        /* Only a table that is not embedded in @files is freed on its own. */
        if (fdt != &files->fdtab)
                __free_fdtable(fdt);
        kmem_cache_free(files_cachep, files);
}

/*
 * Detach @tsk from its files_struct, if it has one, and drop the task's
 * reference on it.
 */
void exit_files(struct task_struct *tsk)
{
        struct files_struct *files = tsk->files;

        if (!files)
                return;

        /* tsk->files is cleared under task_lock(). */
        task_lock(tsk);
        tsk->files = NULL;
        task_unlock(tsk);

        put_files_struct(files);
}

/*
 * Statically initialised files_struct.  It starts with one reference and its
 * fdt pointer refers to its own embedded fdtab, which in turn uses the
 * embedded fd_array and *_init bitmaps with max_fds set to NR_OPEN_DEFAULT.
 */
struct files_struct init_files = {
        .count                = ATOMIC_INIT(1),
        .fdt                = &init_files.fdtab,
        .fdtab                = {
                .max_fds        = NR_OPEN_DEFAULT,
                .fd                = &init_files.fd_array[0],
                .close_on_exec        = init_files.close_on_exec_init,
                .open_fds        = init_files.open_fds_init,
                .full_fds_bits        = init_files.full_fds_bits_init,
        },
        .file_lock        = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
        .resize_wait        = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};

/*
 * Find the lowest free descriptor in @fdt that is >= @start.  Returns
 * fdt->max_fds if every open_fds word from @start's word onwards is marked
 * full.
 */
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
        /* always multiple of BITS_PER_LONG */
        const unsigned int limit = fdt->max_fds;
        unsigned int first_fd;

        /*
         * Use the summary bitmap to jump over completely full words: locate
         * the first word at or after @start's word that still has room and
         * convert its index back into a descriptor number.
         */
        first_fd = find_next_zero_bit(fdt->full_fds_bits,
                                      limit / BITS_PER_LONG,
                                      start / BITS_PER_LONG) * BITS_PER_LONG;
        if (first_fd >= limit)
                return limit;

        /* Never search below the position the caller asked for. */
        return find_next_zero_bit(fdt->open_fds, limit,
                                  first_fd > start ? first_fd : start);
}

/*
 * allocate a file descriptor, mark it busy.
 */
static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
        struct files_struct *files = current->files;
        struct fdtable *fdt;
        unsigned int fd;
        int error;

        spin_lock(&files->file_lock);

        for (;;) {
                fdt = files_fdtable(files);

                /* Begin at @start or the cached hint, whichever is higher. */
                fd = start < files->next_fd ? files->next_fd : start;
                if (fd < fdt->max_fds)
                        fd = find_next_fd(fdt, fd);

                /*
                 * N.B. For clone tasks sharing a files structure, this test
                 * will limit the total number of files that can be opened.
                 */
                if (fd >= end) {
                        error = -EMFILE;
                        goto unlock;
                }

                error = expand_files(files, fd);
                if (error < 0)
                        goto unlock;

                /*
                 * A nonzero result means the table was expanded and we may
                 * have blocked, so the search has to be redone.
                 */
                if (!error)
                        break;
        }

        if (start <= files->next_fd)
                files->next_fd = fd + 1;

        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        error = fd;
#if 1
        /* Sanity check: a freshly allocated slot must not hold a file. */
        if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
                printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
                rcu_assign_pointer(fdt->fd[fd], NULL);
        }
#endif

unlock:
        spin_unlock(&files->file_lock);
        return error;
}

/*
 * Allocate the lowest free descriptor below @nofile for the current task.
 * Returns the descriptor, or a negative error code from alloc_fd().
 */
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
        const unsigned int lowest = 0;

        return alloc_fd(lowest, nofile, flags);
}

/*
 * Like __get_unused_fd_flags(), with the upper bound taken from the caller's
 * RLIMIT_NOFILE.
 */
int get_unused_fd_flags(unsigned flags)
{
        const unsigned long nofile = rlimit(RLIMIT_NOFILE);

        return __get_unused_fd_flags(flags, nofile);
}
EXPORT_SYMBOL(get_unused_fd_flags);

/*
 * Give descriptor @fd back to @files: clear its bitmap bits and lower the
 * next_fd search hint if @fd is below it.
 */
static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
        __clear_open_fd(fd, files_fdtable(files));

        if (files->next_fd > fd)
                files->next_fd = fd;
}

/*
 * Release descriptor @fd in the current task's table, taking file_lock
 * around the update.
 */
void put_unused_fd(unsigned int fd)
{
        struct files_struct *cur = current->files;

        spin_lock(&cur->file_lock);
        __put_unused_fd(cur, fd);
        spin_unlock(&cur->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array.  At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us.  We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() do it, _really_ bad things
 * will follow.
 *
 * This consumes the "file" refcount, so callers should treat it
 * as if they had called fput(file).
 */

void fd_install(unsigned int fd, struct file *file)
{
        struct files_struct *files = current->files;
        struct fdtable *fdt;

        /* Files flagged FMODE_BACKING are refused: warn once and bail out. */
        if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
                return;

        rcu_read_lock_sched();

        /*
         * Slow path: a table resize is under way, so leave the RCU-sched
         * section and install under file_lock instead.
         */
        if (unlikely(files->resize_in_progress)) {
                rcu_read_unlock_sched();
                spin_lock(&files->file_lock);
                fdt = files_fdtable(files);
                /* The slot must still be empty. */
                BUG_ON(fdt->fd[fd] != NULL);
                rcu_assign_pointer(fdt->fd[fd], file);
                spin_unlock(&files->file_lock);
                return;
        }
        /* coupled with smp_wmb() in expand_fdtable() */
        smp_rmb();
        /* Fast path: install into the current table without taking file_lock. */
        fdt = rcu_dereference_sched(files->fdt);
        BUG_ON(fdt->fd[fd] != NULL);
        rcu_assign_pointer(fdt->fd[fd], file);
        rcu_read_unlock_sched();
}

EXPORT_SYMBOL(fd_install);

/**
 * file_close_fd_locked - return file associated with fd
 * @files: file struct to retrieve file from
 * @fd: file descriptor to retrieve file for
 *
 * Doesn't take a separate reference count.
 *
 * Context: files_lock must be held.
 *
 * Returns: The file associated with @fd (NULL if @fd is not open)
 */
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
{
        struct fdtable *fdt = files_fdtable(files);
        struct file *file;

        lockdep_assert_held(&files->file_lock);

        /* Descriptors outside the table are not open by definition. */
        if (fd >= fdt->max_fds)
                return NULL;

        /* Clamp the index under speculation before touching the array. */
        fd = array_index_nospec(fd, fdt->max_fds);
        file = fdt->fd[fd];
        if (!file)
                return NULL;

        /* Empty the slot and hand the descriptor number back. */
        rcu_assign_pointer(fdt->fd[fd], NULL);
        __put_unused_fd(files, fd);

        return file;
}

/*
 * Close descriptor @fd of the current task.  Returns -EBADF if @fd is not
 * open, otherwise whatever filp_close() returns.
 */
int close_fd(unsigned fd)
{
        struct files_struct *cur = current->files;
        struct file *victim;

        spin_lock(&cur->file_lock);
        victim = file_close_fd_locked(cur, fd);
        spin_unlock(&cur->file_lock);

        /* The file itself is closed only after file_lock has been dropped. */
        return victim ? filp_close(victim, cur) : -EBADF;
}
EXPORT_SYMBOL(close_fd); /* for ksys_close() */

/**
 * last_fd - return last valid index into fd table
 * @fdt: File descriptor table.
 *
 * Context: Either rcu read lock or files_lock must be held.
 *
 * Returns: Last valid index into fdtable.
 */
static inline unsigned last_fd(struct fdtable *fdt)
{
        const unsigned int nr_slots = fdt->max_fds;

        /* Slots are numbered from zero, so the last one is max_fds - 1. */
        return nr_slots - 1;
}

/*
 * Set the close-on-exec bit for every descriptor from @fd to @max_fd
 * inclusive, with @max_fd clamped to the last slot of the current table.
 */
static inline void __range_cloexec(struct files_struct *cur_fds,
                                   unsigned int fd, unsigned int max_fd)
{
        struct fdtable *fdt;
        unsigned int last;

        spin_lock(&cur_fds->file_lock);

        /* The table size is only read with file_lock held. */
        fdt = files_fdtable(cur_fds);
        last = last_fd(fdt);
        if (max_fd > last)
                max_fd = last;

        if (max_fd >= fd)
                bitmap_set(fdt->close_on_exec, fd, 1 + max_fd - fd);

        spin_unlock(&cur_fds->file_lock);
}

/*
 * Close every open descriptor from @fd to @max_fd inclusive, with @max_fd
 * clamped to the last slot of the table.  file_lock is dropped around each
 * filp_close() and whenever a reschedule is due.
 */
static inline void __range_close(struct files_struct *files, unsigned int fd,
                                 unsigned int max_fd)
{
        unsigned int last;

        spin_lock(&files->file_lock);
        last = last_fd(files_fdtable(files));
        if (last < max_fd)
                max_fd = last;

        while (fd <= max_fd) {
                struct file *file = file_close_fd_locked(files, fd);

                fd++;

                /* Nothing to close and no need to yield: keep going. */
                if (!file && !need_resched())
                        continue;

                spin_unlock(&files->file_lock);
                if (file)
                        filp_close(file, files);
                cond_resched();
                spin_lock(&files->file_lock);
        }

        spin_unlock(&files->file_lock);
}

/**
 * __close_range() - Close all file descriptors in a given range.
 *
 * @fd:     starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags:  CLOSE_RANGE flags.
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 */
int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
{
        struct task_struct *me = current;
        struct files_struct *cur_fds = me->files, *fds = NULL;

        if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
                return -EINVAL;

        if (fd > max_fd)
                return -EINVAL;

        if (flags & CLOSE_RANGE_UNSHARE) {
                int ret;
                unsigned int max_unshare_fds = NR_OPEN_MAX;

                /*
                 * If the caller requested all fds to be made cloexec we always
                 * copy all of the file descriptors since they still want to
                 * use them.
                 */
                if (!(flags & CLOSE_RANGE_CLOEXEC)) {
                        /*
                         * If the requested range is greater than the current
                         * maximum, we're closing everything so only copy all
                         * file descriptors beneath the lowest file descriptor.
                         */
                        rcu_read_lock();
                        if (max_fd >= last_fd(files_fdtable(cur_fds)))
                                max_unshare_fds = fd;
                        rcu_read_unlock();
                }

                ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
                if (ret)
                        return ret;

                /*
                 * We used to share our file descriptor table, and have now
                 * created a private one, make sure we're using it below.
                 */
                if (fds)
                        swap(cur_fds, fds);
        }

        if (flags & CLOSE_RANGE_CLOEXEC)
                __range_cloexec(cur_fds, fd, max_fd);
        else
                __range_close(cur_fds, fd, max_fd);

        if (fds) {
                /*
                 * We're done closing the files we were supposed to. Time to install
                 * the new file descriptor table and drop the old one.
                 */
                task_lock(me);
                me->files = cur_fds;
                task_unlock(me);
                put_files_struct(fds);
        }

        return 0;
}

/**
 * file_close_fd - return file associated with fd
 * @fd: file descriptor to retrieve file for
 *
 * Doesn't take a separate reference count.
 *
 * Returns: The file associated with @fd (NULL if @fd is not open)
 */
struct file *file_close_fd(unsigned int fd)
{
        struct files_struct *cur = current->files;
        struct file *detached;

        /* file_close_fd_locked() requires file_lock to be held. */
        spin_lock(&cur->file_lock);
        detached = file_close_fd_locked(cur, fd);
        spin_unlock(&cur->file_lock);

        return detached;
}

/*
 * Close every descriptor of @files that is marked close-on-exec and clear
 * the close-on-exec bitmap along the way.
 */
void do_close_on_exec(struct files_struct *files)
{
        struct fdtable *fdt;
        unsigned word;

        /* exec unshares first */
        spin_lock(&files->file_lock);
        for (word = 0; ; word++) {
                unsigned fd = word * BITS_PER_LONG;
                unsigned long pending;

                /* Refetch the table for each bitmap word. */
                fdt = files_fdtable(files);
                if (fd >= fdt->max_fds)
                        break;

                pending = fdt->close_on_exec[word];
                if (pending) {
                        /* Consume the whole word up front, then act on our copy. */
                        fdt->close_on_exec[word] = 0;

                        while (pending) {
                                struct file *file =
                                        (pending & 1) ? fdt->fd[fd] : NULL;

                                if (file) {
                                        rcu_assign_pointer(fdt->fd[fd], NULL);
                                        __put_unused_fd(files, fd);
                                        /* The close happens with file_lock dropped. */
                                        spin_unlock(&files->file_lock);
                                        filp_close(file, files);
                                        cond_resched();
                                        spin_lock(&files->file_lock);
                                }

                                fd++;
                                pending >>= 1;
                        }
                }
        }
        spin_unlock(&files->file_lock);
}

/*
 * Make one attempt to take a reference on the file currently stored in *@f.
 *
 * Returns NULL if the slot is empty, ERR_PTR(-EAGAIN) if the reference could
 * not be taken or the slot no longer points at the same file afterwards, and
 * otherwise the file with its f_count incremented.
 */
static struct file *__get_file_rcu(struct file __rcu **f)
{
        struct file __rcu *file;
        struct file __rcu *file_reloaded;
        struct file __rcu *file_reloaded_cmp;

        file = rcu_dereference_raw(*f);
        if (!file)
                return NULL;

        /* A count of zero cannot be revived. */
        if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
                return ERR_PTR(-EAGAIN);

        /* Read the slot again now that we hold a reference. */
        file_reloaded = rcu_dereference_raw(*f);

        /*
         * Ensure that all accesses have a dependency on the load from
         * rcu_dereference_raw() above so we get correct ordering
         * between reuse/allocation and the pointer check below.
         */
        file_reloaded_cmp = file_reloaded;
        OPTIMIZER_HIDE_VAR(file_reloaded_cmp);

        /*
         * atomic_long_inc_not_zero() above provided a full memory
         * barrier when we acquired a reference.
         *
         * This is paired with the write barrier from assigning to the
         * __rcu protected file pointer so that if that pointer still
         * matches the current file, we know we have successfully
         * acquired a reference to the right file.
         *
         * If the pointers don't match the file has been reallocated by
         * SLAB_TYPESAFE_BY_RCU.
         */
        if (file == file_reloaded_cmp)
                return file_reloaded;

        /* Wrong file: give back the reference we just took. */
        fput(file);
        return ERR_PTR(-EAGAIN);
}

/**
 * get_file_rcu - try to get a reference to a file under rcu
 * @f: the file to get a reference on
 *
 * This function tries to get a reference on @f carefully verifying that
 * @f hasn't been reused.
 *
 * This function should rarely have to be used and only by users who
 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 *
 * Return: Returns @f with the reference count increased or NULL.
 */
struct file *get_file_rcu(struct file __rcu **f)
{
        struct file __rcu *file;

        /* Keep retrying while the helper reports an error pointer. */
        do {
                file = __get_file_rcu(f);
        } while (IS_ERR(file));

        return file;
}
EXPORT_SYMBOL_GPL(get_file_rcu);

/**
 * get_file_active - try to get a reference to a file
 * @f: the file to get a reference on
 *
 * In contrast to get_file_rcu() the pointer itself isn't part of the
 * reference counting.
 *
 * This function should rarely have to be used and only by users who
 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 *
 * Return: Returns @f with the reference count increased or NULL.
 */
struct file *get_file_active(struct file **f)
{
        struct file __rcu *file;

        /* A single attempt, made inside an RCU read-side section. */
        rcu_read_lock();
        file = __get_file_rcu(f);
        rcu_read_unlock();

        /* Failure to take the reference is reported as NULL, not an error. */
        return IS_ERR(file) ? NULL : file;
}
EXPORT_SYMBOL_GPL(get_file_active);

/*
 * Look up @fd in @files without taking file_lock and return the file with a
 * reference taken on it.
 *
 * Returns NULL if @fd is outside the table, if the slot is empty, or if any
 * bit of @mask is set in the file's f_mode.  The lookup is retried whenever
 * the reference cannot be taken or the table entry changed underneath us.
 *
 * Callers in this file either hold rcu_read_lock() themselves or state that
 * their own caller must hold it.
 */
static inline struct file *__fget_files_rcu(struct files_struct *files,
       unsigned int fd, fmode_t mask)
{
        for (;;) {
                struct file *file;
                struct fdtable *fdt = rcu_dereference_raw(files->fdt);
                struct file __rcu **fdentry;
                unsigned long nospec_mask;

                /* Mask is a 0 for invalid fd's, ~0 for valid ones */
                nospec_mask = array_index_mask_nospec(fd, fdt->max_fds);

                /*
                 * fdentry points to the 'fd' offset, or fdt->fd[0].
                 * Loading from fdt->fd[0] is always safe, because the
                 * array always exists.
                 */
                fdentry = fdt->fd + (fd & nospec_mask);

                /* Do the load, then mask any invalid result */
                file = rcu_dereference_raw(*fdentry);
                file = (void *)(nospec_mask & (unsigned long)file);
                if (unlikely(!file))
                        return NULL;

                /*
                 * Ok, we have a file pointer that was valid at
                 * some point, but it might have become stale since.
                 *
                 * We need to confirm it by incrementing the refcount
                 * and then check the lookup again.
                 *
                 * atomic_long_inc_not_zero() gives us a full memory
                 * barrier. We only really need an 'acquire' one to
                 * protect the loads below, but we don't have that.
                 */
                if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
                        continue;

                /*
                 * Such a race can take two forms:
                 *
                 *  (a) the file ref already went down to zero and the
                 *      file hasn't been reused yet or the file count
                 *      isn't zero but the file has already been reused.
                 *
                 *  (b) the file table entry has changed under us.
                 *       Note that we don't need to re-check the 'fdt->fd'
                 *       pointer having changed, because it always goes
                 *       hand-in-hand with 'fdt'.
                 *
                 * If so, we need to put our ref and try again.
                 */
                if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
                    unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
                        fput(file);
                        continue;
                }

                /*
                 * This isn't the file we're looking for or we're not
                 * allowed to get a reference to it.
                 */
                if (unlikely(file->f_mode & mask)) {
                        fput(file);
                        return NULL;
                }

                /*
                 * Ok, we have a ref to the file, and checked that it
                 * still exists.
                 */
                return file;
        }
}

/*
 * Look up @fd in @files and take a reference on the file, wrapping the
 * lockless lookup in its own RCU read-side section.
 */
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
                                 fmode_t mask)
{
        struct file *found;

        rcu_read_lock();
        found = __fget_files_rcu(files, fd, mask);
        rcu_read_unlock();

        return found;
}

/* Same as __fget_files(), applied to the current task's table. */
static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
        struct files_struct *cur = current->files;

        return __fget_files(cur, fd, mask);
}

/*
 * Take a reference on the file installed at @fd in the current task.
 * Returns NULL if there is none or if the file has FMODE_PATH set.
 */
struct file *fget(unsigned int fd)
{
        struct file *file = __fget(fd, FMODE_PATH);

        return file;
}
EXPORT_SYMBOL(fget);

/*
 * Like fget(), but with an empty f_mode mask, so no file is rejected on
 * account of its mode.
 */
struct file *fget_raw(unsigned int fd)
{
        struct file *file = __fget(fd, 0);

        return file;
}
EXPORT_SYMBOL(fget_raw);

/*
 * Take a reference on the file installed at @fd in @task's table.
 * Returns NULL if @task has no files_struct or @fd is not open there.
 */
struct file *fget_task(struct task_struct *task, unsigned int fd)
{
        struct files_struct *files;
        struct file *file;

        /* task->files is only looked at under task_lock(). */
        task_lock(task);
        files = task->files;
        file = files ? __fget_files(files, fd, 0) : NULL;
        task_unlock(task);

        return file;
}

/*
 * Lockless lookup of @fd in the current task's table, returning the file
 * with a reference taken or NULL.  No RCU read lock is taken here.
 */
struct file *lookup_fdget_rcu(unsigned int fd)
{
        struct files_struct *cur = current->files;

        return __fget_files_rcu(cur, fd, 0);
}
EXPORT_SYMBOL_GPL(lookup_fdget_rcu);

/*
 * Lockless lookup of @fd in @task's table, returning the file with a
 * reference taken, or NULL if @task has no files_struct or @fd is not open.
 *
 * Must be called with rcu_read_lock held.
 */
struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
{
        struct files_struct *files;
        struct file *file;

        task_lock(task);
        files = task->files;
        file = files ? __fget_files_rcu(files, fd, 0) : NULL;
        task_unlock(task);

        return file;
}

/*
 * Starting at *@ret_fd, find the first descriptor of @task that has a file
 * installed and return that file with a reference taken.  *@ret_fd is
 * updated to the descriptor where the scan stopped.  Returns NULL when no
 * further open descriptor exists or @task has no files_struct.
 *
 * Must be called with rcu_read_lock held.
 */
struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
{
        struct files_struct *files;
        struct file *file = NULL;
        unsigned int fd = *ret_fd;

        task_lock(task);
        files = task->files;
        if (files) {
                /* The table bound is re-read on every step of the scan. */
                while (fd < files_fdtable(files)->max_fds) {
                        file = __fget_files_rcu(files, fd, 0);
                        if (file)
                                break;
                        fd++;
                }
        }
        task_unlock(task);

        *ret_fd = fd;
        return file;
}

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 *
 * @fd:   descriptor to look up in current->files
 * @mask: f_mode bits that disqualify a file; on the unshared path a file
 *        with any of these bits set is treated as absent, on the shared
 *        path the mask is forwarded to __fget_files()
 *
 * Returns 0 when no usable file was found.  Otherwise returns the file
 * pointer cast to unsigned long, with FDPUT_FPUT OR'ed in when the file was
 * obtained through __fget_files() (the shared-table path).
 */
static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
        struct files_struct *files = current->files;
        struct file *file;

        /*
         * If another thread is concurrently calling close_fd() followed
         * by put_files_struct(), we must not observe the old table
         * entry combined with the new refcount - otherwise we could
         * return a file that is concurrently being freed.
         *
         * atomic_read_acquire() pairs with atomic_dec_and_test() in
         * put_files_struct().
         */
        if (likely(atomic_read_acquire(&files->count) == 1)) {
                /* Sole user of the table: raw lookup, no FDPUT_FPUT flag. */
                file = files_lookup_fd_raw(files, fd);
                if (!file || unlikely(file->f_mode & mask))
                        return 0;
                return (unsigned long)file;
        } else {
                /* Table is shared: go through __fget_files() and flag it. */
                file = __fget_files(files, fd, mask);
                if (!file)
                        return 0;
                return FDPUT_FPUT | (unsigned long)file;
        }
}
unsigned long __fdget(unsigned int fd)
{
        return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(__fdget);

unsigned long __fdget_raw(unsigned int fd)
{
        return __fget_light(fd, 0);
}

/*
 * Try to avoid f_pos locking. We only need it if the
 * file is marked for FMODE_ATOMIC_POS, and it can be
 * accessed multiple ways.
 *
 * Always do it for directories, because pidfd_getfd()
 * can make a file accessible even if it otherwise would
 * not be, and for directories this is a correctness
 * issue, not a "POSIX requirement".
 */
static inline bool file_needs_f_pos_lock(struct file *file)
{
        return (file->f_mode & FMODE_ATOMIC_POS) &&
                (file_count(file) > 1 || file->f_op->iterate_shared);
}

/*
 * __fdget() variant that also takes f_pos_lock when
 * file_needs_f_pos_lock() says so.  In that case FDPUT_POS_UNLOCK is
 * OR'ed into the returned word.
 */
unsigned long __fdget_pos(unsigned int fd)
{
        unsigned long word = __fdget(fd);
        /* The two low bits of the word carry flags, not address bits. */
        struct file *f = (struct file *)(word & ~3);

        if (!f || !file_needs_f_pos_lock(f))
                return word;

        word |= FDPUT_POS_UNLOCK;
        mutex_lock(&f->f_pos_lock);
        return word;
}

/* Release the f_pos_lock mutex of @f. */
void __f_unlock_pos(struct file *f)
{
        struct mutex *pos_lock = &f->f_pos_lock;

        mutex_unlock(pos_lock);
}

/*
 * We only lock f_pos if we have threads or if the file might be
 * shared with another process. In both cases we'll have an elevated
 * file count (done either by fdget() or by fork()).
 */

/*
 * Set (@flag non-zero) or clear (@flag zero) the close-on-exec bit of @fd
 * in the current task's descriptor table, under files->file_lock.
 */
void set_close_on_exec(unsigned int fd, int flag)
{
        struct files_struct *table = current->files;
        struct fdtable *fdt;

        spin_lock(&table->file_lock);
        fdt = files_fdtable(table);
        if (!flag)
                __clear_close_on_exec(fd, fdt);
        else
                __set_close_on_exec(fd, fdt);
        spin_unlock(&table->file_lock);
}

/*
 * Report the result of close_on_exec() for @fd in the current task's
 * descriptor table, sampled inside an RCU read-side section.
 */
bool get_close_on_exec(unsigned int fd)
{
        bool cloexec;

        rcu_read_lock();
        cloexec = close_on_exec(fd, current->files);
        rcu_read_unlock();

        return cloexec;
}

/*
 * Install @file at descriptor @fd of @files, replacing whatever was there.
 *
 * Entered with files->file_lock held; the lock is dropped on every return
 * path (see the __releases annotation).  Applies or clears close-on-exec
 * according to O_CLOEXEC in @flags.
 *
 * Returns @fd on success, or -EBUSY if the slot is marked open but has no
 * file installed yet.
 */
static int do_dup2(struct files_struct *files,
        struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
{
        struct file *tofree;
        struct fdtable *fdt;

        /*
         * We need to detect attempts to do dup2() over allocated but still
         * not finished descriptor.  NB: OpenBSD avoids that at the price of
         * extra work in their equivalent of fget() - they insert struct
         * file immediately after grabbing descriptor, mark it larval if
         * more work (e.g. actual opening) is needed and make sure that
         * fget() treats larval files as absent.  Potentially interesting,
         * but while extra work in fget() is trivial, locking implications
         * and amount of surgery on open()-related paths in VFS are not.
         * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
         * deadlocks in rather amusing ways, AFAICS.  All of that is out of
         * scope of POSIX or SUS, since neither considers shared descriptor
         * tables and this condition does not arise without those.
         */
        fdt = files_fdtable(files);
        tofree = fdt->fd[fd];
        /* Slot reserved (open bit set) but still empty: refuse. */
        if (!tofree && fd_is_open(fd, fdt))
                goto Ebusy;
        /* Take the table's reference before publishing the pointer. */
        get_file(file);
        rcu_assign_pointer(fdt->fd[fd], file);
        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        spin_unlock(&files->file_lock);

        /* Close the displaced file only after the lock has been dropped. */
        if (tofree)
                filp_close(tofree, files);

        return fd;

Ebusy:
        spin_unlock(&files->file_lock);
        return -EBUSY;
}

/*
 * Put @file at descriptor @fd of the current task.
 *
 * A NULL @file means "close @fd" and is delegated to close_fd().
 * Otherwise @fd must be below RLIMIT_NOFILE (-EBADF if not), the table is
 * grown as needed and the install is handed to do_dup2(), whose return
 * value is passed through unchanged.
 */
int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
        struct files_struct *files = current->files;
        int err;

        if (!file)
                return close_fd(fd);

        if (fd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        err = expand_files(files, fd);
        if (unlikely(err < 0)) {
                spin_unlock(&files->file_lock);
                return err;
        }

        /* do_dup2() releases files->file_lock itself. */
        return do_dup2(files, file, fd, flags);
}

/**
 * receive_fd() - Install received file into file descriptor table
 * @file: struct file that was received from another process
 * @ufd: __user pointer to write new fd number to
 * @o_flags: the O_* flags to apply to the new fd entry
 *
 * Runs the security_file_receive() check, reserves an unused descriptor,
 * optionally reports its number to userspace through @ufd (skipped when
 * @ufd is NULL), and finally installs @file there.
 *
 * The helper takes its own reference on @file; the caller's reference is
 * left untouched.
 *
 * Returns the newly installed fd or a negative value on error.
 */
int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
        int fd;
        int err = security_file_receive(file);

        if (err)
                return err;

        fd = get_unused_fd_flags(o_flags);
        if (fd < 0)
                return fd;

        if (ufd) {
                err = put_user(fd, ufd);
                if (err) {
                        /* Userspace copy failed: give the slot back. */
                        put_unused_fd(fd);
                        return err;
                }
        }

        fd_install(fd, get_file(file));
        __receive_sock(file);
        return fd;
}
EXPORT_SYMBOL_GPL(receive_fd);

/**
 * receive_fd_replace() - Install received file at a specific descriptor
 * @new_fd: descriptor number to install @file at
 * @file: struct file that was received from another process
 * @o_flags: the O_* flags to apply to the fd entry
 *
 * Runs the security_file_receive() check, installs @file at @new_fd via
 * replace_fd() and then calls __receive_sock() on it.
 *
 * Returns @new_fd on success or a negative value on error.
 */
int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
        int error;

        error = security_file_receive(file);
        if (error)
                return error;

        /*
         * replace_fd() passes through do_dup2()'s result, which is the
         * descriptor number on success.  Only a negative value signals
         * failure; treating any non-zero value as an error would skip
         * __receive_sock() for every @new_fd other than 0.
         */
        error = replace_fd(new_fd, file, o_flags);
        if (error < 0)
                return error;
        __receive_sock(file);
        return new_fd;
}

/*
 * Common implementation behind the dup3 syscall (and dup2 when the two
 * descriptors differ).
 *
 * Returns do_dup2()'s result on the success path, -EINVAL for unknown
 * flags or oldfd == newfd, -EBADF for a bad oldfd/newfd, or another
 * negative error from expand_files().
 */
static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
        int err = -EBADF;
        struct file *file;
        struct files_struct *files = current->files;

        /* O_CLOEXEC is the only flag accepted. */
        if ((flags & ~O_CLOEXEC) != 0)
                return -EINVAL;

        if (unlikely(oldfd == newfd))
                return -EINVAL;

        if (newfd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        /*
         * Grow the table first, but look at its result only after oldfd
         * has been validated: a bad oldfd is reported as -EBADF regardless
         * of whether the expansion succeeded.
         */
        err = expand_files(files, newfd);
        file = files_lookup_fd_locked(files, oldfd);
        if (unlikely(!file))
                goto Ebadf;
        if (unlikely(err < 0)) {
                /* -EMFILE from expand_files() is reported as -EBADF here. */
                if (err == -EMFILE)
                        goto Ebadf;
                goto out_unlock;
        }
        /* do_dup2() drops files->file_lock. */
        return do_dup2(files, file, newfd, flags);

Ebadf:
        err = -EBADF;
out_unlock:
        spin_unlock(&files->file_lock);
        return err;
}

/* dup3 syscall entry point: all of the work happens in ksys_dup3(). */
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
        int ret = ksys_dup3(oldfd, newfd, flags);

        return ret;
}

/*
 * dup2 syscall entry point.
 *
 * When the two descriptors differ this is ksys_dup3() without flags.  When
 * they are equal nothing is duplicated: the call only verifies that @oldfd
 * currently resolves to a file and returns it, or -EBADF otherwise.
 */
SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
        struct files_struct *files;
        struct file *f;

        if (likely(newfd != oldfd))
                return ksys_dup3(oldfd, newfd, 0);

        /* corner case: newfd == oldfd */
        files = current->files;
        rcu_read_lock();
        f = __fget_files_rcu(files, oldfd, 0);
        rcu_read_unlock();
        if (!f)
                return -EBADF;

        /* The lookup was only a probe; drop what it returned. */
        fput(f);
        return (int)oldfd;
}

/*
 * dup syscall entry point: fetch the file behind @fildes with fget_raw()
 * and install it at a freshly reserved descriptor.
 *
 * Returns the new descriptor, -EBADF if @fildes has no file, or the error
 * from get_unused_fd_flags().
 */
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
        struct file *file = fget_raw(fildes);
        int fd;

        if (!file)
                return -EBADF;

        fd = get_unused_fd_flags(0);
        if (fd < 0) {
                /* No slot available: release the file obtained above. */
                fput(file);
                return fd;
        }

        fd_install(fd, file);
        return fd;
}

/*
 * Install @file at a descriptor obtained from alloc_fd(@from,
 * RLIMIT_NOFILE, @flags).
 *
 * Returns the descriptor on success, -EINVAL when @from is not below
 * RLIMIT_NOFILE, or the negative value returned by alloc_fd().
 */
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
        unsigned long limit = rlimit(RLIMIT_NOFILE);
        int fd;

        if (from >= limit)
                return -EINVAL;

        fd = alloc_fd(from, limit, flags);
        if (fd < 0)
                return fd;

        get_file(file);
        fd_install(fd, file);
        return fd;
}

/*
 * Call @f(@p, file, fd) for every populated slot of @files, starting at
 * descriptor @n and walking upwards, with files->file_lock held.
 *
 * The walk stops at the first non-zero return from @f and that value is
 * returned; otherwise the result is 0.  A NULL @files yields 0.
 */
int iterate_fd(struct files_struct *files, unsigned n,
                int (*f)(const void *, struct file *, unsigned),
                const void *p)
{
        struct fdtable *fdt;
        int res = 0;

        if (!files)
                return 0;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        while (n < fdt->max_fds) {
                struct file *file;

                file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
                if (file) {
                        res = f(p, file, n);
                        if (res)
                                break;
                }
                n++;
        }
        spin_unlock(&files->file_lock);

        return res;
}
EXPORT_SYMBOL(iterate_fd);











































   22 


















    3 













    3 











































   35 








































    7 

    7 









   31 













   29 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
#include <linux/lockref.h>

#if USE_CMPXCHG_LOCKREF

/*
 * Lockless fast path shared by the lockref helpers below.
 *
 * The macro expects a variable named "lockref" in the calling scope.  It
 * snapshots lockref->lock_count into "old" and, for as long as the
 * spinlock embedded in that snapshot reads as unlocked, copies it to
 * "new", runs CODE (which may modify "new", "return", or "break" out of
 * the loop) and tries to swap the new value in.  SUCCESS runs when the
 * swap succeeded.  After 100 failed attempts the loop gives up and
 * control falls through to whatever follows the macro.
 *
 * Note that the "cmpxchg()" reloads the "old" value for the
 * failure case.
 */
#define CMPXCHG_LOOP(CODE, SUCCESS) do {                                        \
        int retry = 100;                                                        \
        struct lockref old;                                                        \
        BUILD_BUG_ON(sizeof(old) != 8);                                                \
        old.lock_count = READ_ONCE(lockref->lock_count);                        \
        while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) {          \
                struct lockref new = old;                                        \
                CODE                                                                \
                if (likely(try_cmpxchg64_relaxed(&lockref->lock_count,                \
                                                 &old.lock_count,                \
                                                 new.lock_count))) {                \
                        SUCCESS;                                                \
                }                                                                \
                if (!--retry)                                                        \
                        break;                                                        \
        }                                                                        \
} while (0)

#else

/* No cmpxchg fast path: callers go straight to their spinlock fallback. */
#define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)

#endif

/**
 * lockref_get - Increments reference count unconditionally
 * @lockref: pointer to lockref structure
 *
 * The caller must already hold a reference to the object, which
 * guarantees the count cannot be zero when this runs.
 */
void lockref_get(struct lockref *lockref)
{
        /* Lockless attempt; a successful swap ends the call. */
        CMPXCHG_LOOP(
                new.count += 1;
        ,
                return;
        );

        /* Fast path unavailable or exhausted: bump under the spinlock. */
        spin_lock(&lockref->lock);
        lockref->count += 1;
        spin_unlock(&lockref->lock);
}
EXPORT_SYMBOL(lockref_get);

/**
 * lockref_get_not_zero - Increments count unless the count is 0 or dead
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count was zero
 */
int lockref_get_not_zero(struct lockref *lockref)
{
        int retval;

        CMPXCHG_LOOP(
                new.count++;
                if (old.count <= 0)
                        return 0;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        retval = 0;
        if (lockref->count > 0) {
                lockref->count++;
                retval = 1;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_get_not_zero);

/**
 * lockref_put_not_zero - Decrements count unless count <= 1 before decrement
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count would become zero
 */
int lockref_put_not_zero(struct lockref *lockref)
{
        int retval;

        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 1)
                        return 0;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        retval = 0;
        if (lockref->count > 1) {
                lockref->count--;
                retval = 1;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_put_not_zero);

/**
 * lockref_put_return - Decrement reference count if possible
 * @lockref: pointer to lockref structure
 *
 * Decrement the reference count and return the new value.
 * If the lockref was dead or locked, return an error.
 */
int lockref_put_return(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 0)
                        return -1;
        ,
                return new.count;
        );
        return -1;
}
EXPORT_SYMBOL(lockref_put_return);

/**
 * lockref_put_or_lock - decrements count unless count <= 1 before decrement
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
 */
int lockref_put_or_lock(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 1)
                        break;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        if (lockref->count <= 1)
                return 0;
        lockref->count--;
        spin_unlock(&lockref->lock);
        return 1;
}
EXPORT_SYMBOL(lockref_put_or_lock);

/**
 * lockref_mark_dead - mark lockref dead
 * @lockref: pointer to lockref structure
 *
 * Must be called with @lockref->lock held (asserted below).  The count is
 * set to a negative value, which is what lockref_get_not_dead() and the
 * "count <= 0" checks in the other helpers test for.
 */
void lockref_mark_dead(struct lockref *lockref)
{
        assert_spin_locked(&lockref->lock);
        /* Any negative count reads as "dead" to the helpers in this file. */
        lockref->count = -128;
}
EXPORT_SYMBOL(lockref_mark_dead);

/**
 * lockref_get_not_dead - Increments count unless the ref is dead
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if lockref was dead
 */
int lockref_get_not_dead(struct lockref *lockref)
{
        int retval;

        CMPXCHG_LOOP(
                new.count++;
                if (old.count < 0)
                        return 0;
        ,
                return 1;
        );

        spin_lock(&lockref->lock);
        retval = 0;
        if (lockref->count >= 0) {
                lockref->count++;
                retval = 1;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_get_not_dead);



































































































































































































































































































































































































































    1 































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *
 * Copyright (C) 2011 Novell Inc.
 */

#include <linux/kernel.h>
#include <linux/uuid.h>
#include <linux/fs.h>
#include <linux/fsverity.h>
#include <linux/namei.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include "ovl_entry.h"

#undef pr_fmt
#define pr_fmt(fmt) "overlayfs: " fmt

/*
 * Bit flags describing an overlay path.  The values are distinct single
 * bits, so a path type may carry any combination of them; test them with
 * the OVL_TYPE_*() macros below rather than comparing for equality.
 */
enum ovl_path_type {
        __OVL_PATH_UPPER        = (1 << 0),
        __OVL_PATH_MERGE        = (1 << 1),
        __OVL_PATH_ORIGIN        = (1 << 2),
};

/* Each macro evaluates to non-zero when the corresponding bit is set. */
#define OVL_TYPE_UPPER(type)        ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type)        ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_ORIGIN(type)        ((type) & __OVL_PATH_ORIGIN)

/*
 * Overlay xattr name prefixes, built by string-literal concatenation of
 * the generic trusted/user xattr prefix with "overlay.".  Every *_LEN is
 * the prefix length without the terminating NUL (sizeof - 1).
 */
#define OVL_XATTR_NAMESPACE "overlay."
#define OVL_XATTR_TRUSTED_PREFIX XATTR_TRUSTED_PREFIX OVL_XATTR_NAMESPACE
#define OVL_XATTR_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1)
#define OVL_XATTR_USER_PREFIX XATTR_USER_PREFIX OVL_XATTR_NAMESPACE
#define OVL_XATTR_USER_PREFIX_LEN (sizeof(OVL_XATTR_USER_PREFIX) - 1)

/*
 * Escape prefixes: the escape prefix is itself "overlay.", so the escaped
 * forms expand to the trusted/user overlay prefix followed by a second
 * "overlay." component.
 */
#define OVL_XATTR_ESCAPE_PREFIX OVL_XATTR_NAMESPACE
#define OVL_XATTR_ESCAPE_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_PREFIX) - 1)
#define OVL_XATTR_ESCAPE_TRUSTED_PREFIX OVL_XATTR_TRUSTED_PREFIX OVL_XATTR_ESCAPE_PREFIX
#define OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_TRUSTED_PREFIX) - 1)
#define OVL_XATTR_ESCAPE_USER_PREFIX OVL_XATTR_USER_PREFIX OVL_XATTR_ESCAPE_PREFIX
#define OVL_XATTR_ESCAPE_USER_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_USER_PREFIX) - 1)

/*
 * Identifiers for the xattrs overlayfs uses.  The values are used as the
 * row index into ovl_xattr_table[] by ovl_xattr(), so the order here has
 * to match that table.
 */
enum ovl_xattr {
        OVL_XATTR_OPAQUE,
        OVL_XATTR_REDIRECT,
        OVL_XATTR_ORIGIN,
        OVL_XATTR_IMPURE,
        OVL_XATTR_NLINK,
        OVL_XATTR_UPPER,
        OVL_XATTR_UUID,
        OVL_XATTR_METACOPY,
        OVL_XATTR_PROTATTR,
        OVL_XATTR_XWHITEOUT,
};

/*
 * Per-inode flags.
 * NOTE(review): presumably these are bit numbers in an overlay inode
 * flags word rather than masks - confirm against the users.
 */
enum ovl_inode_flag {
        /* Pure upper dir that may contain non pure upper entries */
        OVL_IMPURE,
        /* Non-merge dir that may contain whiteout entries */
        OVL_WHITEOUTS,
        OVL_INDEX,
        OVL_UPPERDATA,
        /* Inode number will remain constant over copy up. */
        OVL_CONST_INO,
        OVL_HAS_DIGEST,
        OVL_VERIFIED_DIGEST,
};

/*
 * Per-entry flags.
 * NOTE(review): presumably bit numbers in an overlay dentry flags word -
 * confirm against the users.
 */
enum ovl_entry_flag {
        OVL_E_UPPER_ALIAS,
        OVL_E_OPAQUE,
        OVL_E_CONNECTED,
        /* Lower stack may contain xwhiteout entries */
        OVL_E_XWHITEOUTS,
};

enum {
        OVL_REDIRECT_OFF,        /* "off" mode is never used. In effect        */
        OVL_REDIRECT_FOLLOW,        /* ...it translates to either "follow"        */
        OVL_REDIRECT_NOFOLLOW,        /* ...or "nofollow".                        */
        OVL_REDIRECT_ON,
};

enum {
        OVL_UUID_OFF,
        OVL_UUID_NULL,
        OVL_UUID_AUTO,
        OVL_UUID_ON,
};

enum {
        OVL_XINO_OFF,
        OVL_XINO_AUTO,
        OVL_XINO_ON,
};

enum {
        OVL_VERITY_OFF,
        OVL_VERITY_ON,
        OVL_VERITY_REQUIRE,
};

/*
 * The tuple (fh,uuid) is a universally unique identifier for a copy up
 * origin, where:
 * origin.fh        - exported file handle of the lower file
 * origin.uuid        - uuid of the lower filesystem
 */
#define OVL_FH_VERSION        0
#define OVL_FH_MAGIC        0xfb

/* CPU byte order required for fid decoding:  */
#define OVL_FH_FLAG_BIG_ENDIAN        (1 << 0)
#define OVL_FH_FLAG_ANY_ENDIAN        (1 << 1)
/* Is the real inode encoded in fid an upper inode? */
#define OVL_FH_FLAG_PATH_UPPER        (1 << 2)

/* Mask of every flag bit defined above. */
#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN | \
                         OVL_FH_FLAG_PATH_UPPER)

/*
 * Value of the BIG_ENDIAN flag bit that corresponds to the CPU this is
 * compiled for: clear on little-endian, set on big-endian.
 */
#if defined(__LITTLE_ENDIAN)
#define OVL_FH_FLAG_CPU_ENDIAN 0
#elif defined(__BIG_ENDIAN)
#define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN
#else
#error Endianness not defined
#endif

/* The type used to be returned by overlay exportfs for misaligned fid */
#define OVL_FILEID_V0        0xfb
/* The type returned by overlay exportfs for 32bit aligned fid */
#define OVL_FILEID_V1        0xf8

/*
 * On-disk format for "origin" file handle.
 *
 * Packed: five single-byte fields, then the filesystem uuid, then the
 * variable-length fid as a flexible array member.
 */
struct ovl_fb {
        u8 version;        /* 0 */
        u8 magic;        /* 0xfb */
        u8 len;                /* size of this header + size of fid */
        u8 flags;        /* OVL_FH_FLAG_* */
        u8 type;        /* fid_type of fid */
        uuid_t uuid;        /* uuid of filesystem */
        u32 fid[];        /* file identifier should be 32bit aligned in-memory */
} __packed;

/*
 * In-memory and on-wire format for overlay file handle.
 *
 * The three leading padding bytes shift the embedded struct ovl_fb so that
 * its fid[] member lands on a 32-bit boundary (assuming uuid_t is 16 bytes:
 * 3 + 5 + 16 = 24).  "buf" aliases the same bytes as "fb" for raw access.
 */
struct ovl_fh {
        u8 padding[3];        /* make sure fb.fid is 32bit aligned */
        union {
                struct ovl_fb fb;
                DECLARE_FLEX_ARRAY(u8, buf);
        };
} __packed;

/*
 * OVL_FH_WIRE_OFFSET: offset of the embedded struct ovl_fb inside
 *                     struct ovl_fh, i.e. the size of the leading padding.
 * OVL_FH_LEN(fh):     that offset plus the length recorded in fh->fb.len.
 * OVL_FH_FID_OFFSET:  offset of fb.fid[] from the start of struct ovl_fh.
 */
#define OVL_FH_WIRE_OFFSET        offsetof(struct ovl_fh, fb)
#define OVL_FH_LEN(fh)                (OVL_FH_WIRE_OFFSET + (fh)->fb.len)
#define OVL_FH_FID_OFFSET        (OVL_FH_WIRE_OFFSET + \
                                 offsetof(struct ovl_fb, fid))

/* On-disk format for "metacopy" xattr (if non-zero size) */
struct ovl_metacopy {
        u8 version;        /* 0 */
        u8 len;         /* size of this header + used digest bytes */
        u8 flags;
        u8 digest_algo;        /* FS_VERITY_HASH_ALG_* constant, 0 for no digest */
        u8 digest[FS_VERITY_MAX_DIGEST_SIZE];  /* Only the used part on disk */
} __packed;

/*
 * MAX_SIZE: the full structure, digest buffer included.
 * MIN_SIZE: the header alone, i.e. the structure without any digest bytes.
 * INIT:     initializer with version 0 and len set to the header size, so
 *           it describes a metacopy record that carries no digest.
 */
#define OVL_METACOPY_MAX_SIZE (sizeof(struct ovl_metacopy))
#define OVL_METACOPY_MIN_SIZE (OVL_METACOPY_MAX_SIZE - FS_VERITY_MAX_DIGEST_SIZE)
#define OVL_METACOPY_INIT { 0, OVL_METACOPY_MIN_SIZE }

/*
 * Number of digest bytes recorded in @metacopy: its len field minus the
 * header size, or 0 when len is smaller than the header.
 */
static inline int ovl_metadata_digest_size(const struct ovl_metacopy *metacopy)
{
        const int total = (int)metacopy->len;

        if (metacopy->len >= OVL_METACOPY_MIN_SIZE)
                return total - OVL_METACOPY_MIN_SIZE;

        return 0;
}

/* No atime modification on underlying */
#define OVL_OPEN_FLAGS (O_NOATIME)

/* One row per enum ovl_xattr value, one column per userxattr setting. */
extern const char *const ovl_xattr_table[][2];

/*
 * Return the xattr name for @ox: row @ox of ovl_xattr_table, column picked
 * by ofs->config.userxattr.
 */
static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox)
{
        const char *const *names = ovl_xattr_table[ox];

        return names[ofs->config.userxattr];
}

/*
 * Change attributes of an upper object through the upper mount's idmapping.
 *
 * Ownership has to be expressed the way the upper layer stores it.  If the
 * upper mount idmaps files that are stored on-disk as owned by id 1001 to
 * id 1000, stat via the upper mount reports id 1000.  To make stat report
 * id 1000 after a chown on such a mount, the value written to disk - i.e.
 * the value stored in ia_*id - must be 1001, and the mount mapping helper
 * takes care of mapping 1000 to 1001.
 * The mnt idmapping helpers are nops if the upper layer isn't idmapped.
 */
static inline int ovl_do_notify_change(struct ovl_fs *ofs,
                                       struct dentry *upperdentry,
                                       struct iattr *attr)
{
        int ret;

        ret = notify_change(ovl_upper_mnt_idmap(ofs), upperdentry, attr, NULL);
        return ret;
}

/*
 * vfs_rmdir() on @dentry in @dir using the upper mount's idmapping; the
 * result is logged with pr_debug() and returned.
 */
static inline int ovl_do_rmdir(struct ovl_fs *ofs,
                               struct inode *dir, struct dentry *dentry)
{
        int ret;

        ret = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry);
        pr_debug("rmdir(%pd2) = %i\n", dentry, ret);

        return ret;
}

/*
 * vfs_unlink() on @dentry in @dir using the upper mount's idmapping (no
 * delegated inode is passed); the result is logged and returned.
 */
static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir,
                                struct dentry *dentry)
{
        int ret;

        ret = vfs_unlink(ovl_upper_mnt_idmap(ofs), dir, dentry, NULL);
        pr_debug("unlink(%pd2) = %i\n", dentry, ret);

        return ret;
}

/*
 * vfs_link() of @old_dentry to @new_dentry in @dir using the upper mount's
 * idmapping; the result is logged and returned.
 */
static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry,
                              struct inode *dir, struct dentry *new_dentry)
{
        int ret;

        ret = vfs_link(old_dentry, ovl_upper_mnt_idmap(ofs), dir,
                       new_dentry, NULL);
        pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, ret);

        return ret;
}

/*
 * vfs_create() of @dentry in @dir with @mode using the upper mount's
 * idmapping (last argument always true); the result is logged and returned.
 */
static inline int ovl_do_create(struct ovl_fs *ofs,
                                struct inode *dir, struct dentry *dentry,
                                umode_t mode)
{
        int ret;

        ret = vfs_create(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, true);
        pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, ret);

        return ret;
}

/*
 * vfs_mkdir() of @dentry in @dir with @mode using the upper mount's
 * idmapping; the result is logged and returned.
 */
static inline int ovl_do_mkdir(struct ovl_fs *ofs,
                               struct inode *dir, struct dentry *dentry,
                               umode_t mode)
{
        int ret;

        ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
        pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, ret);

        return ret;
}

/*
 * vfs_mknod() of @dentry in @dir with @mode and @dev using the upper
 * mount's idmapping; the result is logged and returned.
 */
static inline int ovl_do_mknod(struct ovl_fs *ofs,
                               struct inode *dir, struct dentry *dentry,
                               umode_t mode, dev_t dev)
{
        int ret;

        ret = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev);
        pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, ret);

        return ret;
}

/*
 * vfs_symlink() creating @dentry in @dir with target @oldname using the
 * upper mount's idmapping; the result is logged and returned.
 */
static inline int ovl_do_symlink(struct ovl_fs *ofs,
                                 struct inode *dir, struct dentry *dentry,
                                 const char *oldname)
{
        int ret;

        ret = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname);
        pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, ret);

        return ret;
}

/*
 * vfs_getxattr() of @name on @path using the idmapping of path->mnt.
 *
 * Warns if the dentry and the mount of @path belong to different
 * superblocks.  The result is logged together with at most the first 48
 * bytes of the value, and returned unchanged.
 */
static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name,
                                      void *value, size_t size)
{
        int err;
        int len = 0;

        WARN_ON(path->dentry->d_sb != path->mnt->mnt_sb);

        err = vfs_getxattr(mnt_idmap(path->mnt), path->dentry,
                           name, value, size);
        /* Dump value bytes only if a buffer was given and data came back. */
        if (value && err > 0)
                len = err;

        pr_debug("getxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n",
                 path->dentry, name, min(len, 48), value, size, err);
        return err;
}

/*
 * Read overlay xattr @ox from @upperdentry: pairs the dentry with the
 * upper mount to form a path and forwards to ovl_do_getxattr().
 */
static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs,
                                         struct dentry *upperdentry,
                                         enum ovl_xattr ox, void *value,
                                         size_t size)
{
        const char *name = ovl_xattr(ofs, ox);
        struct path where = {
                .mnt = ovl_upper_mnt(ofs),
                .dentry = upperdentry,
        };

        return ovl_do_getxattr(&where, name, value, size);
}

/*
 * Read overlay xattr @ox from @path: resolves the xattr name with
 * ovl_xattr() and forwards to ovl_do_getxattr().
 */
static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs,
                                         const struct path *path,
                                         enum ovl_xattr ox, void *value,
                                         size_t size)
{
        const char *name = ovl_xattr(ofs, ox);

        return ovl_do_getxattr(path, name, value, size);
}

/*
 * vfs_setxattr() of @name on @dentry using the upper mount's idmapping.
 * The result is logged together with at most the first 48 bytes of the
 * value, and returned.
 */
static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
                                  const char *name, const void *value,
                                  size_t size, int flags)
{
        int ret;

        ret = vfs_setxattr(ovl_upper_mnt_idmap(ofs), dentry, name,
                           value, size, flags);
        pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n",
                 dentry, name, min((int)size, 48), value, size, flags, ret);

        return ret;
}

/*
 * Set overlay xattr @ox on @dentry: resolves the xattr name with
 * ovl_xattr() and forwards to ovl_do_setxattr() with flags 0.
 */
static inline int ovl_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
                               enum ovl_xattr ox, const void *value,
                               size_t size)
{
        const char *name = ovl_xattr(ofs, ox);

        return ovl_do_setxattr(ofs, dentry, name, value, size, 0);
}

/*
 * Remove xattr @name from @dentry via vfs_removexattr() with the upper mount
 * idmap of @ofs, trace the outcome, and return the VFS result.
 */
static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
                                     const char *name)
{
        int err;

        err = vfs_removexattr(ovl_upper_mnt_idmap(ofs), dentry, name);
        pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);

        return err;
}

/* Remove overlay xattr @ox from @dentry; name lookup goes through ovl_xattr(). */
static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
                                  enum ovl_xattr ox)
{
        const char *xattr_name = ovl_xattr(ofs, ox);

        return ovl_do_removexattr(ofs, dentry, xattr_name);
}

/* Forward to vfs_set_acl() using the idmap of the upper mount of @ofs. */
static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry,
                                 const char *acl_name, struct posix_acl *acl)
{
        int err = vfs_set_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name, acl);

        return err;
}

/* Forward to vfs_remove_acl() using the idmap of the upper mount of @ofs. */
static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry,
                                    const char *acl_name)
{
        int err = vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name);

        return err;
}

static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
                                struct dentry *olddentry, struct inode *newdir,
                                struct dentry *newdentry, unsigned int flags)
{
        int err;
        struct renamedata rd = {
                .old_mnt_idmap        = ovl_upper_mnt_idmap(ofs),
                .old_dir         = olddir,
                .old_dentry         = olddentry,
                .new_mnt_idmap        = ovl_upper_mnt_idmap(ofs),
                .new_dir         = newdir,
                .new_dentry         = newdentry,
                .flags                 = flags,
        };

        pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
        err = vfs_rename(&rd);
        if (err) {
                pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
                         olddentry, newdentry, err);
        }
        return err;
}

/*
 * Call vfs_whiteout() for @dentry in @dir with the upper mount idmap of
 * @ofs, trace the outcome, and return the VFS result.
 */
static inline int ovl_do_whiteout(struct ovl_fs *ofs,
                                  struct inode *dir, struct dentry *dentry)
{
        int err;

        err = vfs_whiteout(ovl_upper_mnt_idmap(ofs), dir, dentry);
        pr_debug("whiteout(%pd2) = %i\n", dentry, err);

        return err;
}

/*
 * Open a tmpfile under @dentry on the upper mount of @ofs through
 * kernel_tmpfile_open(), write-only with O_LARGEFILE and the current
 * credentials.
 *
 * The value from kernel_tmpfile_open() is returned untouched; its
 * PTR_ERR_OR_ZERO() form is only used for the debug trace.
 */
static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs,
                                          struct dentry *dentry, umode_t mode)
{
        struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry };
        struct file *file;
        int status;

        file = kernel_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path,
                                   mode, O_LARGEFILE | O_WRONLY,
                                   current_cred());
        status = PTR_ERR_OR_ZERO(file);
        pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, status);

        return file;
}

/*
 * Look up @name (of length @len) under @base with lookup_one(), using the
 * idmap of the upper mount of @ofs.  Returns the lookup_one() result.
 */
static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs,
                                              const char *name,
                                              struct dentry *base, int len)
{
        struct dentry *found;

        found = lookup_one(ovl_upper_mnt_idmap(ofs), name, base, len);
        return found;
}

static inline bool ovl_open_flags_need_copy_up(int flags)
{
        if (!flags)
                return false;

        return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC));
}

/*
 * Fetch attributes for @path: vfs_getattr_nosec() when AT_GETATTR_NOSEC is
 * set in @flags, vfs_getattr() otherwise.  Arguments are passed unchanged.
 */
static inline int ovl_do_getattr(const struct path *path, struct kstat *stat,
                                 u32 request_mask, unsigned int flags)
{
        if (!(flags & AT_GETATTR_NOSEC))
                return vfs_getattr(path, stat, request_mask, flags);

        return vfs_getattr_nosec(path, stat, request_mask, flags);
}

/*
 * util.c
 *
 * Declarations only: per this section marker the definitions are expected
 * to live in util.c, outside this header.
 */
int ovl_get_write_access(struct dentry *dentry);
void ovl_put_write_access(struct dentry *dentry);
void ovl_start_write(struct dentry *dentry);
void ovl_end_write(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
const struct cred *ovl_override_creds(struct super_block *sb);

static inline const struct cred *ovl_creds(struct super_block *sb)
{
        return OVL_FS(sb)->creator_cred;
}

/*
 * Further declarations without bodies in this header.
 * NOTE(review): they follow the util.c section marker above, so presumably
 * they are defined there too -- confirm.
 */
int ovl_can_decode_fh(struct super_block *sb);
struct dentry *ovl_indexdir(struct super_block *sb);
bool ovl_index_all(struct super_block *sb);
bool ovl_verify_lower(struct super_block *sb);
/* ovl_stack_*: take a struct ovl_path array plus its element count @n */
struct ovl_path *ovl_stack_alloc(unsigned int n);
void ovl_stack_cpy(struct ovl_path *dst, struct ovl_path *src, unsigned int n);
void ovl_stack_put(struct ovl_path *stack, unsigned int n);
void ovl_stack_free(struct ovl_path *stack, unsigned int n);
struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
void ovl_free_entry(struct ovl_entry *oe);
bool ovl_dentry_remote(struct dentry *dentry);
void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *realdentry);
void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry,
                           struct ovl_entry *oe);
void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry,
                           struct ovl_entry *oe, unsigned int mask);
bool ovl_dentry_weird(struct dentry *dentry);
/* ovl_path_*: the void variants fill the caller-supplied @path */
enum ovl_path_type ovl_path_type(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
void ovl_path_lowerdata(struct dentry *dentry, struct path *path);
struct inode *ovl_i_path_real(struct inode *inode, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path);
/* Accessors returning a dentry or layer for an overlay dentry or inode */
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_lowerdata(struct dentry *dentry);
int ovl_dentry_set_lowerdata(struct dentry *dentry, struct ovl_path *datapath);
const struct ovl_layer *ovl_i_layer_lower(struct inode *inode);
const struct ovl_layer *ovl_layer_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_i_dentry_upper(struct inode *inode);
/* Accessors returning an inode for an overlay inode */
struct inode *ovl_inode_upper(struct inode *inode);
struct inode *ovl_inode_lower(struct inode *inode);
struct inode *ovl_inode_lowerdata(struct inode *inode);
struct inode *ovl_inode_real(struct inode *inode);
struct inode *ovl_inode_realdata(struct inode *inode);
const char *ovl_lowerdata_redirect(struct inode *inode);
struct ovl_dir_cache *ovl_dir_cache(struct inode *inode);
void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache);
/* Per-dentry flag helpers; note the (flag, dentry) argument order */
void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry);
void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry);
bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
bool ovl_dentry_is_whiteout(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry);
bool ovl_dentry_has_xwhiteouts(struct dentry *dentry);
void ovl_dentry_set_xwhiteouts(struct dentry *dentry);
void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs,
                              const struct ovl_layer *layer);
bool ovl_dentry_has_upper_alias(struct dentry *dentry);
void ovl_dentry_set_upper_alias(struct dentry *dentry);
bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags);
bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags);
bool ovl_has_upperdata(struct inode *inode);
void ovl_set_upperdata(struct inode *inode);
const char *ovl_dentry_get_redirect(struct dentry *dentry);
void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
void ovl_dir_modified(struct dentry *dentry, bool impurity);
u64 ovl_inode_version_get(struct inode *inode);
bool ovl_is_whiteout(struct dentry *dentry);
bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path);
struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
bool ovl_already_copied_up(struct dentry *dentry, int flags);
/* Returns a single char; an inline helper in this header compares it to 'y' */
char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path,
                           enum ovl_xattr ox);
bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
                         const struct path *upperpath);

/*
 * Whiteout test for a dentry on the upper layer: builds a path from
 * @upperdentry and the upper mount of @ofs and asks ovl_path_is_whiteout().
 */
static inline bool ovl_upper_is_whiteout(struct ovl_fs *ofs,
                                         struct dentry *upperdentry)
{
        struct path upperpath = {
                .mnt = ovl_upper_mnt(ofs),
                .dentry = upperdentry,
        };
        bool whiteout = ovl_path_is_whiteout(ofs, &upperpath);

        return whiteout;
}

/*
 * Origin xattr check for a dentry on the upper layer: builds a path from
 * @upperdentry and the upper mount of @ofs and defers to
 * ovl_path_check_origin_xattr().
 */
static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs,
                                          struct dentry *upperdentry)
{
        struct path upperpath = {
                .mnt = ovl_upper_mnt(ofs),
                .dentry = upperdentry,
        };
        bool has_origin = ovl_path_check_origin_xattr(ofs, &upperpath);

        return has_origin;
}

/*
 * More declarations without bodies in this header.
 * NOTE(review): still inside the util.c section as far as the markers in
 * this header show -- confirm against util.c.
 */
int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
                       enum ovl_xattr ox, const void *value, size_t size,
                       int xerr);
int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry);
bool ovl_inuse_trylock(struct dentry *dentry);
void ovl_inuse_unlock(struct dentry *dentry);
bool ovl_is_inuse(struct dentry *dentry);
bool ovl_need_index(struct dentry *dentry);
int ovl_nlink_start(struct dentry *dentry);
void ovl_nlink_end(struct dentry *dentry);
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
/* Metacopy, redirect and verity xattr helpers */
int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path,
                             struct ovl_metacopy *data);
int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d,
                           struct ovl_metacopy *metacopy);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding);
int ovl_ensure_verity_loaded(struct path *path);
int ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path,
                         u8 *digest_buf, int *buf_length);
int ovl_validate_verity(struct ovl_fs *ofs,
                        struct path *metapath,
                        struct path *datapath);
int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src,
                          struct ovl_metacopy *metacopy);
int ovl_sync_status(struct ovl_fs *ofs);

/* Set bit @flag in the flags word of the overlay inode behind @inode. */
static inline void ovl_set_flag(unsigned long flag, struct inode *inode)
{
        unsigned long *flagword = &OVL_I(inode)->flags;

        set_bit(flag, flagword);
}

/* Clear bit @flag in the flags word of the overlay inode behind @inode. */
static inline void ovl_clear_flag(unsigned long flag, struct inode *inode)
{
        unsigned long *flagword = &OVL_I(inode)->flags;

        clear_bit(flag, flagword);
}

/* Report whether bit @flag is set in the overlay inode flags of @inode. */
static inline bool ovl_test_flag(unsigned long flag, struct inode *inode)
{
        unsigned long *flagword = &OVL_I(inode)->flags;

        return test_bit(flag, flagword);
}

/*
 * True when ovl_get_dir_xattr_val() returns 'y' for OVL_XATTR_IMPURE on
 * @upperdentry, looked up on the upper mount of the overlay behind @sb.
 */
static inline bool ovl_is_impuredir(struct super_block *sb,
                                    struct dentry *upperdentry)
{
        struct ovl_fs *ofs = OVL_FS(sb);
        struct path upperpath = {
                .mnt = ovl_upper_mnt(ofs),
                .dentry = upperdentry,
        };
        char val = ovl_get_dir_xattr_val(ofs, &upperpath, OVL_XATTR_IMPURE);

        return val == 'y';
}

/* Return the ovl_get_dir_xattr_val() result for OVL_XATTR_OPAQUE on @path. */
static inline char ovl_get_opaquedir_val(struct ovl_fs *ofs,
                                         const struct path *path)
{
        char val = ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE);

        return val;
}

/* True unless the configured redirect mode is OVL_REDIRECT_NOFOLLOW. */
static inline bool ovl_redirect_follow(struct ovl_fs *ofs)
{
        const bool nofollow =
                ofs->config.redirect_mode == OVL_REDIRECT_NOFOLLOW;

        return !nofollow;
}

static inline bool ovl_redirect_dir(struct ovl_fs *ofs)
{
        return ofs->config.redirect_mode == OVL_REDIRECT_ON;
}

/* True unless the configured uuid mode is OVL_UUID_OFF. */
static inline bool ovl_origin_uuid(struct ovl_fs *ofs)
{
        const bool uuid_off = ofs->config.uuid == OVL_UUID_OFF;

        return !uuid_off;
}

static inline bool ovl_has_fsid(struct ovl_fs *ofs)
{
        return ofs->config.uuid == OVL_UUID_ON ||
               ofs->config.uuid == OVL_UUID_AUTO;
}

/*
 * xino=auto: best effort to keep all inodes on the same st_dev and d_ino
 * consistent with st_ino.
 * xino=on: same effort, but a failure is warned about.
 *
 * This helper reports the xino=on case.
 */
static inline bool ovl_xino_warn(struct ovl_fs *ofs)
{
        const bool xino_on = ofs->config.xino == OVL_XINO_ON;

        return xino_on;
}

/*
 * To avoid regressions in existing setups with overlay lower offline changes,
 * we allow lower changes only if none of the new features are used.
 */
static inline bool ovl_allow_offline_changes(struct ovl_fs *ofs)
{
        return (!ofs->config.index && !ofs->config.metacopy &&
                !ovl_redirect_dir(ofs) && !ovl_xino_warn(ofs));
}

/* All layers on same fs?  Encoded as xino_mode being exactly 0. */
static inline bool ovl_same_fs(struct ovl_fs *ofs)
{
        return !ofs->xino_mode;
}

/* All overlay inodes have same st_dev? */
static inline bool ovl_same_dev(struct ovl_fs *ofs)
{
        return ofs->xino_mode >= 0;
}

/* Return xino_mode when ovl_same_dev() holds, 0 otherwise. */
static inline unsigned int ovl_xino_bits(struct ovl_fs *ofs)
{
        if (!ovl_same_dev(ofs))
                return 0;

        return ofs->xino_mode;
}

/* Take the mutex embedded in the overlay inode of @inode. */
static inline void ovl_inode_lock(struct inode *inode)
{
        struct mutex *ilock = &OVL_I(inode)->lock;

        mutex_lock(ilock);
}

/*
 * Interruptible variant of ovl_inode_lock(); returns whatever
 * mutex_lock_interruptible() returns.
 */
static inline int ovl_inode_lock_interruptible(struct inode *inode)
{
        struct mutex *ilock = &OVL_I(inode)->lock;

        return mutex_lock_interruptible(ilock);
}

/* Release the mutex embedded in the overlay inode of @inode. */
static inline void ovl_inode_unlock(struct inode *inode)
{
        struct mutex *ilock = &OVL_I(inode)->lock;

        mutex_unlock(ilock);
}


/* namei.c */
/* Called by ovl_check_fh_len() with the length left after the wire offset. */
int ovl_check_fb_len(struct ovl_fb *fb, int fb_len);

/*
 * Validate the length of an overlay file handle.
 *
 * @fh:     file handle to check
 * @fh_len: number of bytes available at @fh
 *
 * Returns -EINVAL when @fh_len is negative or smaller than struct ovl_fh,
 * otherwise the result of ovl_check_fb_len() on the embedded fb with the
 * length reduced by OVL_FH_WIRE_OFFSET.
 */
static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len)
{
        /*
         * Reject negative lengths explicitly: comparing a negative int with
         * sizeof() converts it to a huge unsigned value, which would slip
         * past the minimum-size check.
         */
        if (fh_len < 0 || (size_t)fh_len < sizeof(struct ovl_fh))
                return -EINVAL;

        return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET);
}

/*
 * Declarations without bodies in this header.
 * NOTE(review): they sit in the namei.c section, so presumably they are
 * defined there -- confirm.
 */
struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
                                  struct vfsmount *mnt, bool connected);
int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
                        struct dentry *upperdentry, struct ovl_path **stackp);
/* Wrapped by ovl_verify_origin_fh() in this header */
int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
                      enum ovl_xattr ox, const struct ovl_fh *fh,
                      bool is_upper, bool set);
/* Wrapped by ovl_verify_origin() and ovl_verify_upper() in this header */
int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry,
                            enum ovl_xattr ox, struct dentry *real,
                            bool is_upper, bool set);
struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
                               bool connected);
int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index);
int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name);
int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
                       struct qstr *name);
struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
                                struct dentry *origin, bool verify);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path,
                  const struct ovl_layer **layer);
int ovl_verify_lowerdata(struct dentry *dentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
                          unsigned int flags);
bool ovl_lower_positive(struct dentry *dentry);

/*
 * ovl_verify_set_fh() specialised for OVL_XATTR_ORIGIN on @upper, with
 * is_upper fixed to false; @fh and @set are passed through.
 */
static inline int ovl_verify_origin_fh(struct ovl_fs *ofs, struct dentry *upper,
                                       const struct ovl_fh *fh, bool set)
{
        const bool is_upper = false;

        return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, fh,
                                 is_upper, set);
}

/*
 * ovl_verify_origin_xattr() specialised for OVL_XATTR_ORIGIN on @upper with
 * @origin as the real dentry and is_upper fixed to false.
 */
static inline int ovl_verify_origin(struct ovl_fs *ofs, struct dentry *upper,
                                    struct dentry *origin, bool set)
{
        const bool is_upper = false;

        return ovl_verify_origin_xattr(ofs, upper, OVL_XATTR_ORIGIN, origin,
                                       is_upper, set);
}

/*
 * ovl_verify_origin_xattr() specialised for OVL_XATTR_UPPER on @index with
 * @upper as the real dentry and is_upper fixed to true.
 */
static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index,
                                   struct dentry *upper, bool set)
{
        const bool is_upper = true;

        return ovl_verify_origin_xattr(ofs, index, OVL_XATTR_UPPER, upper,
                                       is_upper, set);
}

/*
 * readdir.c
 *
 * Declarations only; per this section marker the definitions are expected
 * in readdir.c.
 */
extern const struct file_operations ovl_dir_operations;
struct file *ovl_dir_real_file(const struct file *file, bool want_upper);
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
                           struct list_head *list);
void ovl_cache_free(struct list_head *list);
void ovl_dir_cache_free(struct inode *inode);
int ovl_check_d_type_supported(const struct path *realpath);
int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
                        struct vfsmount *mnt, struct dentry *dentry, int level);
int ovl_indexdir_cleanup(struct ovl_fs *ofs);

/*
 * Can we iterate real dir directly?
 *
 * Non-merge dir may contain whiteouts from a time it was a merge upper, before
 * lower dir was removed under it and possibly before it was rotated from upper
 * to lower layer.
 */
static inline bool ovl_dir_is_real(struct inode *dir)
{
        return !ovl_test_flag(OVL_WHITEOUTS, dir);
}

/*
 * inode.c
 *
 * Declarations only; per this section marker the definitions are expected
 * in inode.c.
 */
int ovl_set_nlink_upper(struct dentry *dentry);
int ovl_set_nlink_lower(struct dentry *dentry);
unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
                           struct dentry *upperdentry,
                           unsigned int fallback);
int ovl_permission(struct mnt_idmap *idmap, struct inode *inode,
                   int mask);

/*
 * POSIX ACL entry points.  With CONFIG_FS_POSIX_ACL the real helpers are
 * declared; without it the three operation names become NULL and
 * ovl_get_acl_path() is a stub returning NULL.
 */
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap,
                                 struct inode *inode, int type,
                                 bool rcu, bool noperm);

/* do_ovl_get_acl() on @inode with nop_mnt_idmap and noperm == true. */
static inline struct posix_acl *ovl_get_inode_acl(struct inode *inode, int type,
                                                  bool rcu)
{
        const bool noperm = true;

        return do_ovl_get_acl(&nop_mnt_idmap, inode, type, rcu, noperm);
}

/* do_ovl_get_acl() on the inode of @dentry, with rcu and noperm both false. */
static inline struct posix_acl *ovl_get_acl(struct mnt_idmap *idmap,
                                            struct dentry *dentry, int type)
{
        struct inode *inode = d_inode(dentry);

        return do_ovl_get_acl(idmap, inode, type, false, false);
}

int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                struct posix_acl *acl, int type);
struct posix_acl *ovl_get_acl_path(const struct path *path,
                                   const char *acl_name, bool noperm);
#else
#define ovl_get_inode_acl        NULL
#define ovl_get_acl                NULL
#define ovl_set_acl                NULL

/* Stub: ignores its arguments and always yields NULL. */
static inline struct posix_acl *ovl_get_acl_path(const struct path *path,
                                                 const char *acl_name,
                                                 bool noperm)
{
        struct posix_acl *none = NULL;

        return none;
}
#endif

int ovl_update_time(struct inode *inode, int flags);
bool ovl_is_private_xattr(struct super_block *sb, const char *name);

/*
 * Parameter bundle passed by pointer to ovl_inode_init() and
 * ovl_get_inode() (see the prototypes below).
 * NOTE(review): field meanings are not visible in this header; the names
 * suggest the upper dentry, the overlay entry and redirect strings for the
 * inode being set up -- confirm against inode.c.
 */
struct ovl_inode_params {
        struct inode *newinode;
        struct dentry *upperdentry;
        struct ovl_entry *oe;
        bool index;
        char *redirect;
        char *lowerdata_redirect;
};
void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
                    unsigned long ino, int fsid);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
                               bool is_upper);
bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir);
struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir);
struct inode *ovl_get_inode(struct super_block *sb,
                            struct ovl_inode_params *oip);
void ovl_copyattr(struct inode *to);

/* vfs inode flags copied from real to ovl inode (used by ovl_copyflags()) */
#define OVL_COPY_I_FLAGS_MASK        (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE)
/* vfs inode flags read from overlay.protattr xattr to ovl inode */
#define OVL_PROT_I_FLAGS_MASK        (S_APPEND | S_IMMUTABLE)

/*
 * fileattr flags copied from lower to upper inode on copy up.
 * We cannot copy up immutable/append-only flags, because that would prevent
 * linking temp inode to upper dir, so we store them in xattr instead.
 *
 * The *_FS_* masks are built from FS_*_FL bits and the *_FSX_* masks from
 * the FS_XFLAG_* bits of the same names; the COPY masks cover sync/noatime
 * and the PROT masks cover append/immutable.
 */
#define OVL_COPY_FS_FLAGS_MASK        (FS_SYNC_FL | FS_NOATIME_FL)
#define OVL_COPY_FSX_FLAGS_MASK        (FS_XFLAG_SYNC | FS_XFLAG_NOATIME)
#define OVL_PROT_FS_FLAGS_MASK  (FS_APPEND_FL | FS_IMMUTABLE_FL)
#define OVL_PROT_FSX_FLAGS_MASK (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE)

/*
 * Declarations only.
 * NOTE(review): presumably these handle the overlay.protattr xattr
 * mentioned next to the OVL_PROT_* masks -- confirm.
 */
void ovl_check_protattr(struct inode *inode, struct dentry *upper);
int ovl_set_protattr(struct inode *inode, struct dentry *upper,
                      struct fileattr *fa);

/*
 * Copy the i_flags bits selected by OVL_COPY_I_FLAGS_MASK from @from to @to
 * via inode_set_flags(), which receives the same mask as the set of bits it
 * may change.
 */
static inline void ovl_copyflags(struct inode *from, struct inode *to)
{
        const unsigned int mask = OVL_COPY_I_FLAGS_MASK;
        unsigned int copied = from->i_flags & mask;

        inode_set_flags(to, copied, mask);
}

/*
 * dir.c
 *
 * Declarations only; per this section marker the definitions are expected
 * in dir.c.
 */
extern const struct inode_operations ovl_dir_inode_operations;
int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
                             struct dentry *dentry);
/*
 * Creation attributes passed by pointer to ovl_create_real() and
 * ovl_create_temp() below.
 * NOTE(review): which fields apply to which object type is not visible
 * here -- confirm against dir.c.
 */
struct ovl_cattr {
        dev_t rdev;
        umode_t mode;
        const char *link;
        struct dentry *hardlink;
};

/*
 * Pointer to a temporary struct ovl_cattr (compound literal) with only
 * .mode set to @m; the remaining fields are zero-initialised.
 */
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })

int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
                   struct dentry **newdentry, umode_t mode);
struct dentry *ovl_create_real(struct ovl_fs *ofs,
                               struct inode *dir, struct dentry *newdentry,
                               struct ovl_cattr *attr);
int ovl_cleanup(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry);
struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir);
struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
                               struct ovl_cattr *attr);

/*
 * The section markers below name the source file each group of
 * declarations is expected to be defined in.
 */

/* file.c */
extern const struct file_operations ovl_file_operations;
int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa);
int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int ovl_fileattr_set(struct mnt_idmap *idmap,
                     struct dentry *dentry, struct fileattr *fa);

/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_with_data(struct dentry *dentry);
int ovl_maybe_copy_up(struct dentry *dentry, int flags);
int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentry *new);
int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat);
struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
                                  bool is_upper);
struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin);
int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
                      struct dentry *upper);

/* export.c */
extern const struct export_operations ovl_export_operations;
extern const struct export_operations ovl_export_fid_operations;

/* super.c */
int ovl_fill_super(struct super_block *sb, struct fs_context *fc);

/* Will this overlay be forced to mount/remount ro? */
static inline bool ovl_force_readonly(struct ovl_fs *ofs)
{
        return (!ovl_upper_mnt(ofs) || !ofs->workdir);
}

/*
 * xattr.c
 *
 * Declarations only; per this section marker the definitions are expected
 * in xattr.c.
 */

const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs);
int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                struct iattr *attr);
int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
                struct kstat *stat, u32 request_mask, unsigned int flags);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);













































































































































































    2 









    1 




    1 






































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
// SPDX-License-Identifier: GPL-2.0
/*
 *  fs/ext4/extents_status.h
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *        Allison Henderson <achender@linux.vnet.ibm.com>
 *        Zheng Liu <wenqing.lz@taobao.com>
 *
 */

#ifndef _EXT4_EXTENTS_STATUS_H
#define _EXT4_EXTENTS_STATUS_H

/*
 * Turn on ES_DEBUG__ to get lots of info about extent status operations.
 *
 * With ES_DEBUG__ defined es_debug() expands to printk(); otherwise it
 * expands to no_printk() with the same arguments.
 */
#ifdef ES_DEBUG__
#define es_debug(fmt, ...)        printk(fmt, ##__VA_ARGS__)
#else
#define es_debug(fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * With ES_AGGRESSIVE_TEST defined, the result of es caching will be
 * checked with old map_block's result.
 *
 * NOTE(review): the macro defined below carries a trailing "__", unlike the
 * name in the sentence above -- presumably that spelling keeps the test
 * disabled; confirm which name the .c file checks.
 */
#define ES_AGGRESSIVE_TEST__

/*
 * These flags live in the high bits of extent_status.es_pblk
 *
 * Each *_B enumerator is a bit index; the EXTENT_STATUS_* macros turn it
 * into a bit mask.  ES_FLAGS, being last, equals the number of flag bits
 * and is used to compute ES_SHIFT.
 */
enum {
        ES_WRITTEN_B,
        ES_UNWRITTEN_B,
        ES_DELAYED_B,
        ES_HOLE_B,
        ES_REFERENCED_B,
        ES_FLAGS
};

#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)

#define EXTENT_STATUS_WRITTEN        (1 << ES_WRITTEN_B)
#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
#define EXTENT_STATUS_DELAYED        (1 << ES_DELAYED_B)
#define EXTENT_STATUS_HOLE        (1 << ES_HOLE_B)
#define EXTENT_STATUS_REFERENCED        (1 << ES_REFERENCED_B)

#define ES_TYPE_MASK        ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
                          EXTENT_STATUS_UNWRITTEN | \
                          EXTENT_STATUS_DELAYED | \
                          EXTENT_STATUS_HOLE) << ES_SHIFT)

/* Forward declarations; this header does not define these types. */
struct ext4_sb_info;
struct ext4_extent;

/*
 * One cached extent.  es_pblk holds both the physical block number (low
 * bits) and the status flags (bits from ES_SHIFT upwards); use the
 * ext4_es_*() accessors below rather than reading it directly.
 */
struct extent_status {
        struct rb_node rb_node;
        ext4_lblk_t es_lblk;        /* first logical block extent covers */
        ext4_lblk_t es_len;        /* length of extent in blocks */
        ext4_fsblk_t es_pblk;        /* first physical block */
};

/* Red-black tree of struct extent_status plus a pointer to a cached entry. */
struct ext4_es_tree {
        struct rb_root root;
        struct extent_status *cache_es;        /* recently accessed extent */
};

/*
 * Extent status statistics.
 * NOTE(review): the exact meaning of each counter is not visible in this
 * header; the names suggest shrinker activity, cache hit/miss counts and
 * scan timings -- confirm against extents_status.c.
 */
struct ext4_es_stats {
        unsigned long es_stats_shrunk;
        struct percpu_counter es_stats_cache_hits;
        struct percpu_counter es_stats_cache_misses;
        u64 es_stats_scan_time;
        u64 es_stats_max_scan_time;
        struct percpu_counter es_stats_all_cnt;
        struct percpu_counter es_stats_shk_cnt;
};

/*
 * Pending cluster reservations for bigalloc file systems
 *
 * A cluster with a pending reservation is a logical cluster shared by at
 * least one extent in the extents status tree with delayed and unwritten
 * status and at least one other written or unwritten extent.  The
 * reservation is said to be pending because a cluster reservation would
 * have to be taken in the event all blocks in the cluster shared with
 * written or unwritten extents were deleted while the delayed and
 * unwritten blocks remained.
 *
 * The set of pending cluster reservations is an auxiliary data structure
 * used with the extents status tree to implement reserved cluster/block
 * accounting for bigalloc file systems.  The set is kept in memory and
 * records all pending cluster reservations.
 *
 * Its primary function is to avoid the need to read extents from the
 * disk when invalidating pages as a result of a truncate, punch hole, or
 * collapse range operation.  Page invalidation requires a decrease in the
 * reserved cluster count if it results in the removal of all delayed
 * and unwritten extents (blocks) from a cluster that is not shared with a
 * written or unwritten extent, and no decrease otherwise.  Determining
 * whether the cluster is shared can be done by searching for a pending
 * reservation on it.
 *
 * Secondarily, it provides a potentially faster method for determining
 * whether the reserved cluster count should be increased when a physical
 * cluster is deallocated as a result of a truncate, punch hole, or
 * collapse range operation.  The necessary information is also present
 * in the extents status tree, but might be more rapidly accessed in
 * the pending reservation set in many cases due to smaller size.
 *
 * The pending cluster reservation set is implemented as a red-black tree
 * with the goal of minimizing per page search time overhead.
 */

/* One node of the pending reservation red-black tree. */
struct pending_reservation {
        struct rb_node rb_node;
        ext4_lblk_t lclu;        /* presumably the logical cluster number -- confirm */
};

/* Root of the red-black tree holding struct pending_reservation nodes. */
struct ext4_pending_tree {
        struct rb_root root;
};

/* Extent status tree API; declarations only, no bodies in this header. */
extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);

/* @status takes unshifted EXTENT_STATUS_* bits (see ext4_es_store_status()) */
extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
                                  ext4_lblk_t len, ext4_fsblk_t pblk,
                                  unsigned int status);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t len, ext4_fsblk_t pblk,
                                 unsigned int status);
extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                                  ext4_lblk_t len);
/*
 * The match callbacks below have the signature of the ext4_es_is_*()
 * predicates defined in this header.
 */
extern void ext4_es_find_extent_range(struct inode *inode,
                                      int (*match_fn)(struct extent_status *es),
                                      ext4_lblk_t lblk, ext4_lblk_t end,
                                      struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t *next_lblk,
                                 struct extent_status *es);
extern bool ext4_es_scan_range(struct inode *inode,
                               int (*matching_fn)(struct extent_status *es),
                               ext4_lblk_t lblk, ext4_lblk_t end);
extern bool ext4_es_scan_clu(struct inode *inode,
                             int (*matching_fn)(struct extent_status *es),
                             ext4_lblk_t lblk);

static inline unsigned int ext4_es_status(struct extent_status *es)
{
        return es->es_pblk >> ES_SHIFT;
}

static inline unsigned int ext4_es_type(struct extent_status *es)
{
        return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT;
}

/* 1 if EXTENT_STATUS_WRITTEN is set in the type of @es, else 0. */
static inline int ext4_es_is_written(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_WRITTEN);
}

/* 1 if EXTENT_STATUS_UNWRITTEN is set in the type of @es, else 0. */
static inline int ext4_es_is_unwritten(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN);
}

/* 1 if EXTENT_STATUS_DELAYED is set in the type of @es, else 0. */
static inline int ext4_es_is_delayed(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_DELAYED);
}

/* 1 if EXTENT_STATUS_HOLE is set in the type of @es, else 0. */
static inline int ext4_es_is_hole(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_HOLE);
}

/* 1 if @es is written or unwritten, else 0. */
static inline int ext4_es_is_mapped(struct extent_status *es)
{
        if (ext4_es_is_written(es))
                return 1;

        return ext4_es_is_unwritten(es) ? 1 : 0;
}

/* 1 if @es is delayed and not also unwritten, else 0. */
static inline int ext4_es_is_delonly(struct extent_status *es)
{
        if (!ext4_es_is_delayed(es))
                return 0;

        return !ext4_es_is_unwritten(es);
}

static inline void ext4_es_set_referenced(struct extent_status *es)
{
        es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
}

static inline void ext4_es_clear_referenced(struct extent_status *es)
{
        es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT);
}

/* Return 1 when the status bits include EXTENT_STATUS_REFERENCED, else 0. */
static inline int ext4_es_is_referenced(struct extent_status *es)
{
        if (ext4_es_status(es) & EXTENT_STATUS_REFERENCED)
                return 1;
        return 0;
}

/* Return es_pblk with the ES_MASK (status) bits stripped off. */
static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
{
        ext4_fsblk_t pblock = es->es_pblk & ~ES_MASK;

        return pblock;
}

/*
 * Physical block number for display: an all-ones block field (~ES_MASK)
 * is reported as 0, any other value is returned unchanged.
 */
static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es)
{
        ext4_fsblk_t pblock = ext4_es_pblock(es);

        if (pblock == ~ES_MASK)
                return 0;
        return pblock;
}

/*
 * Replace the physical block part of es_pblk with @pb while keeping the
 * bits covered by ES_MASK as they were.
 */
static inline void ext4_es_store_pblock(struct extent_status *es,
                                        ext4_fsblk_t pb)
{
        ext4_fsblk_t merged;

        merged = (es->es_pblk & ES_MASK) | (pb & ~ES_MASK);
        es->es_pblk = merged;
}

/*
 * Replace the ES_MASK (status) part of es_pblk with @status while keeping
 * the physical block bits as they were.
 */
static inline void ext4_es_store_status(struct extent_status *es,
                                        unsigned int status)
{
        es->es_pblk = (es->es_pblk & ~ES_MASK) |
                      (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK);
}

/*
 * Overwrite es_pblk in one go: @pb supplies the bits outside ES_MASK and
 * @status (shifted up by ES_SHIFT) supplies the bits inside it.
 */
static inline void ext4_es_store_pblock_status(struct extent_status *es,
                                               ext4_fsblk_t pb,
                                               unsigned int status)
{
        es->es_pblk = (pb & ~ES_MASK) |
                      (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK);
}

/*
 * Per-superblock shrinker registration; declared here, defined elsewhere.
 * NOTE(review): presumably paired calls for setup/teardown - confirm
 * against the callers.
 */
extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);

/* seq_file show callback; declared here, defined elsewhere. */
extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);

/*
 * Pending-tree and delayed-block interfaces; declared here, defined
 * elsewhere.  ext4_init_pending() is marked __init.
 */
extern int __init ext4_init_pending(void);
extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
                                         bool allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
                                        ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);

#endif /* _EXT4_EXTENTS_STATUS_H */



















































    1 









    7 
    7 




















    8 




    8 
    8 

    8 



    7 




    8 



    8 






















    1 









    1 



    1 
    1 
































































































































































































    1 












    1 




    1 











    1 



















    1 

    1 
    1 


    1 






























    1 


















    1 
    1 




















    1 




















    1 







    1 









    1 




















    1 










    1 





    1 















    1 





    1 



    1 

















    1 







    1 


    1 










    1 





    1 



















    1 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/dir.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/dir.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 directory handling functions
 *
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *
 * Hash Tree Directory indexing (c) 2001  Daniel Phillips
 *
 */

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include "ext4.h"
#include "xattr.h"

static int ext4_dx_readdir(struct file *, struct dir_context *);

/**
 * is_dx_dir() - decide whether a directory should be read via the htree path
 * @inode: directory inode
 *
 * The filesystem must have the dir_index feature.  On top of that the
 * directory qualifies if it already carries EXT4_INODE_INDEX, if its size
 * is exactly one block (it could still be converted to htree), or if it
 * stores its entries as inline data.
 *
 * Return 1 if it is a dx dir, 0 if not
 */
static int is_dx_dir(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        if (!ext4_has_feature_dir_index(inode->i_sb))
                return 0;

        if (ext4_test_inode_flag(inode, EXT4_INODE_INDEX))
                return 1;
        if ((inode->i_size >> sb->s_blocksize_bits) == 1)
                return 1;
        if (ext4_has_inline_data(inode))
                return 1;

        return 0;
}

/*
 * Tell whether @de is one of the special entries: a one- or two-byte name
 * starting with '.' whose second byte is '.' or NUL (i.e. "." / ".."), or
 * an entry whose file type marks it as the directory checksum record.
 */
static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de)
{
        bool dot_entry;

        /* A name_len of 0 never counts as "." or "..". */
        dot_entry = (de->name_len > 0) && (de->name_len <= 2) &&
                    (de->name[0] == '.') &&
                    (de->name[1] == '.' || de->name[1] == '\0');

        return dot_entry || de->file_type == EXT4_FT_DIR_CSUM;
}

/*
 * Validate one directory entry against the buffer that contains it.
 *
 * Return 0 if the directory entry is OK, and 1 if there is a problem
 *
 * Note: this is the opposite of what ext2 and ext3 historically returned...
 *
 * bh passed here can be an inode block or a dir data block, depending
 * on the inode inline data flag.
 *
 * @function/@line: call site, forwarded to the error reporter.
 * @dir:    directory inode the entry belongs to.
 * @filp:   open file for the directory, or NULL; selects which error
 *          reporter is used.
 * @de:     entry to check; must point inside @buf.
 * @bh:     buffer head, used only for the block number in the report.
 * @buf:    start of the region holding the entries.
 * @size:   length of @buf in bytes.
 * @offset: offset value printed in the error message.
 *
 * The checks run in a fixed order and the first failure determines the
 * message that is reported.
 */
int __ext4_check_dir_entry(const char *function, unsigned int line,
                           struct inode *dir, struct file *filp,
                           struct ext4_dir_entry_2 *de,
                           struct buffer_head *bh, char *buf, int size,
                           unsigned int offset)
{
        const char *error_msg = NULL;
        /* Record length as decoded from its on-disk encoding. */
        const int rlen = ext4_rec_len_from_disk(de->rec_len,
                                                dir->i_sb->s_blocksize);
        /* Offset within buf of the first byte after this entry. */
        const int next_offset = ((char *) de - buf) + rlen;
        bool fake = is_fake_dir_entry(de);
        bool has_csum = ext4_has_metadata_csum(dir->i_sb);

        /*
         * NOTE(review): ext4_dir_rec_len() is given NULL instead of dir for
         * fake entries (and, in the block-end check, when metadata checksums
         * are enabled); presumably that selects the minimal record size
         * without per-directory extras - confirm against ext4_dir_rec_len().
         */
        if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
                error_msg = "rec_len is smaller than minimal";
        else if (unlikely(rlen % 4 != 0))
                error_msg = "rec_len % 4 != 0";
        else if (unlikely(rlen < ext4_dir_rec_len(de->name_len,
                                                        fake ? NULL : dir)))
                error_msg = "rec_len is too small for name_len";
        else if (unlikely(next_offset > size))
                error_msg = "directory entry overrun";
        else if (unlikely(next_offset > size - ext4_dir_rec_len(1,
                                                  has_csum ? NULL : dir) &&
                          next_offset != size))
                error_msg = "directory entry too close to block end";
        else if (unlikely(le32_to_cpu(de->inode) >
                        le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
                error_msg = "inode out of bounds";
        else
                return 0;

        /* Report through the file when we have one, else through the inode. */
        if (filp)
                ext4_error_file(filp, function, line, bh->b_blocknr,
                                "bad entry in directory: %s - offset=%u, "
                                "inode=%u, rec_len=%d, size=%d fake=%d",
                                error_msg, offset, le32_to_cpu(de->inode),
                                rlen, size, fake);
        else
                ext4_error_inode(dir, function, line, bh->b_blocknr,
                                "bad entry in directory: %s - offset=%u, "
                                "inode=%u, rec_len=%d, size=%d fake=%d",
                                 error_msg, offset, le32_to_cpu(de->inode),
                                 rlen, size, fake);

        return 1;
}

/*
 * Directory iteration entry point (installed as .iterate_shared).
 *
 * Tries the htree reader first for directories that qualify, then the
 * inline-data reader, and finally walks the directory block by block,
 * emitting every entry with a non-zero inode number through dir_emit().
 *
 * Returns 0 on success (including when dir_emit() stops the walk early)
 * or a negative errno.
 */
static int ext4_readdir(struct file *file, struct dir_context *ctx)
{
        unsigned int offset;
        int i;
        struct ext4_dir_entry_2 *de;
        int err;
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        struct buffer_head *bh = NULL;
        struct fscrypt_str fstr = FSTR_INIT(NULL, 0);

        err = fscrypt_prepare_readdir(inode);
        if (err)
                return err;

        if (is_dx_dir(inode)) {
                err = ext4_dx_readdir(file, ctx);
                /* Only ERR_BAD_DX_DIR falls back to the linear scan below. */
                if (err != ERR_BAD_DX_DIR)
                        return err;

                /* Can we just clear INDEX flag to ignore htree information? */
                if (!ext4_has_metadata_csum(sb)) {
                        /*
                         * We don't set the inode dirty flag since it's not
                         * critical that it gets flushed back to the disk.
                         */
                        ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
                }
        }

        if (ext4_has_inline_data(inode)) {
                int has_inline_data = 1;
                err = ext4_read_inline_dir(file, ctx,
                                           &has_inline_data);
                /*
                 * The helper may clear has_inline_data; in that case fall
                 * through to the block-based walk.
                 */
                if (has_inline_data)
                        return err;
        }

        /* Encrypted directories need a buffer for the presentable names. */
        if (IS_ENCRYPTED(inode)) {
                err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr);
                if (err < 0)
                        return err;
        }

        /* bh is NULL at the top of every iteration. */
        while (ctx->pos < inode->i_size) {
                struct ext4_map_blocks map;

                if (fatal_signal_pending(current)) {
                        err = -ERESTARTSYS;
                        goto errout;
                }
                cond_resched();
                /* Split pos into a byte offset in the block and a block number. */
                offset = ctx->pos & (sb->s_blocksize - 1);
                map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
                map.m_len = 1;
                err = ext4_map_blocks(NULL, inode, &map, 0);
                /* Nothing mapped here: skip ahead by the reported length. */
                if (err == 0) {
                        /* m_len should never be zero but let's avoid
                         * an infinite loop if it somehow is */
                        if (map.m_len == 0)
                                map.m_len = 1;
                        ctx->pos += map.m_len * sb->s_blocksize;
                        continue;
                }
                if (err > 0) {
                        /* Kick off readahead on the block device, then read the block. */
                        pgoff_t index = map.m_pblk >>
                                        (PAGE_SHIFT - inode->i_blkbits);
                        if (!ra_has_index(&file->f_ra, index))
                                page_cache_sync_readahead(
                                        sb->s_bdev->bd_mapping,
                                        &file->f_ra, file,
                                        index, 1);
                        file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
                        bh = ext4_bread(NULL, inode, map.m_lblk, 0);
                        if (IS_ERR(bh)) {
                                err = PTR_ERR(bh);
                                bh = NULL;
                                goto errout;
                        }
                }

                /*
                 * Reached with bh == NULL when the mapping call returned a
                 * negative value or ext4_bread() returned NULL: the block
                 * is skipped rather than treated as fatal.
                 */
                if (!bh) {
                        /* corrupt size?  Maybe no more blocks to read */
                        if (ctx->pos > inode->i_blocks << 9)
                                break;
                        ctx->pos += sb->s_blocksize - offset;
                        continue;
                }

                /* Check the checksum */
                if (!buffer_verified(bh) &&
                    !ext4_dirblock_csum_verify(inode, bh)) {
                        EXT4_ERROR_FILE(file, 0, "directory fails checksum "
                                        "at offset %llu",
                                        (unsigned long long)ctx->pos);
                        /* Skip the bad block and keep going with the next one. */
                        ctx->pos += sb->s_blocksize - offset;
                        brelse(bh);
                        bh = NULL;
                        continue;
                }
                set_buffer_verified(bh);

                /* If the dir block has changed since the last call to
                 * readdir(2), then we might be pointing to an invalid
                 * dirent right now.  Scan from the start of the block
                 * to make sure. */
                if (!inode_eq_iversion(inode, file->f_version)) {
                        for (i = 0; i < sb->s_blocksize && i < offset; ) {
                                de = (struct ext4_dir_entry_2 *)
                                        (bh->b_data + i);
                                /* It's too expensive to do a full
                                 * dirent test each time round this
                                 * loop, but we do have to test at
                                 * least that it is non-zero.  A
                                 * failure will be detected in the
                                 * dirent test below. */
                                if (ext4_rec_len_from_disk(de->rec_len,
                                        sb->s_blocksize) < ext4_dir_rec_len(1,
                                                                        inode))
                                        break;
                                i += ext4_rec_len_from_disk(de->rec_len,
                                                            sb->s_blocksize);
                        }
                        /* Resume at the entry boundary found by the rescan. */
                        offset = i;
                        ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
                                | offset;
                        file->f_version = inode_query_iversion(inode);
                }

                /* Emit the entries of this block, starting at offset. */
                while (ctx->pos < inode->i_size
                       && offset < sb->s_blocksize) {
                        de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
                        if (ext4_check_dir_entry(inode, file, de, bh,
                                                 bh->b_data, bh->b_size,
                                                 offset)) {
                                /*
                                 * On error, skip to the next block
                                 */
                                ctx->pos = (ctx->pos |
                                                (sb->s_blocksize - 1)) + 1;
                                break;
                        }
                        offset += ext4_rec_len_from_disk(de->rec_len,
                                        sb->s_blocksize);
                        /* Entries with inode number 0 are not reported. */
                        if (le32_to_cpu(de->inode)) {
                                if (!IS_ENCRYPTED(inode)) {
                                        /*
                                         * A false return from dir_emit() ends
                                         * the walk without advancing pos, so
                                         * this entry is retried next time.
                                         */
                                        if (!dir_emit(ctx, de->name,
                                            de->name_len,
                                            le32_to_cpu(de->inode),
                                            get_dtype(sb, de->file_type)))
                                                goto done;
                                } else {
                                        int save_len = fstr.len;
                                        struct fscrypt_str de_name =
                                                        FSTR_INIT(de->name,
                                                                de->name_len);

                                        /* Directory is encrypted */
                                        err = fscrypt_fname_disk_to_usr(inode,
                                                EXT4_DIRENT_HASH(de),
                                                EXT4_DIRENT_MINOR_HASH(de),
                                                &de_name, &fstr);
                                        /*
                                         * Emit from a copy of fstr and put
                                         * fstr.len back so the buffer can be
                                         * reused for the next entry.
                                         */
                                        de_name = fstr;
                                        fstr.len = save_len;
                                        if (err)
                                                goto errout;
                                        if (!dir_emit(ctx,
                                            de_name.name, de_name.len,
                                            le32_to_cpu(de->inode),
                                            get_dtype(sb, de->file_type)))
                                                goto done;
                                }
                        }
                        ctx->pos += ext4_rec_len_from_disk(de->rec_len,
                                                sb->s_blocksize);
                }
                /*
                 * NOTE(review): stops the walk when dir_relax_shared()
                 * returns false; its exact semantics are not visible here.
                 */
                if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode))
                        goto done;
                brelse(bh);
                bh = NULL;
        }
done:
        err = 0;
errout:
        /* Common exit: release the name buffer and any block still held. */
        fscrypt_fname_free_buffer(&fstr);
        brelse(bh);
        return err;
}

/*
 * Nonzero when the caller should be treated as using the 32-bit API:
 * with CONFIG_COMPAT this is decided per call by in_compat_syscall(),
 * otherwise it is fixed by the kernel's BITS_PER_LONG.
 */
static inline int is_32bit_api(void)
{
#ifndef CONFIG_COMPAT
        return (BITS_PER_LONG == 32);
#else
        return in_compat_syscall();
#endif
}

/*
 * These functions convert from the major/minor hash to an f_pos
 * value for dx directories
 *
 * Upper layers (for example NFS) should specify FMODE_32BITHASH or
 * FMODE_64BITHASH explicitly. On the other hand, ext4 may also be mounted
 * directly on both 32-bit and 64-bit nodes; in that case neither
 * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
 */
static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
{
        /* Explicit 32-bit mode, or no explicit mode and a 32-bit caller. */
        const bool hash32 = (filp->f_mode & FMODE_32BITHASH) ||
                (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api());
        __u64 pos;

        if (hash32)
                return major >> 1;

        /* 64-bit form: (major >> 1) in the upper word, minor in the lower. */
        pos = (__u64)(major >> 1) << 32;
        pos |= (__u64)minor;
        return pos;
}

static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
{
        /* Explicit 32-bit mode, or no explicit mode and a 32-bit caller. */
        const bool hash32 = (filp->f_mode & FMODE_32BITHASH) ||
                (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api());

        /* 64-bit positions keep the major hash in the upper word. */
        if (!hash32)
                return ((pos >> 32) << 1) & 0xffffffff;

        return (pos << 1) & 0xffffffff;
}

static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
{
        /* Explicit 32-bit mode, or no explicit mode and a 32-bit caller. */
        const bool hash32 = (filp->f_mode & FMODE_32BITHASH) ||
                (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api());

        /* Only 64-bit positions carry a minor hash (in the lower word). */
        if (!hash32)
                return pos & 0xffffffff;

        return 0;
}

/*
 * Return 32- or 64-bit end-of-file for dx directories
 */
static inline loff_t ext4_get_htree_eof(struct file *filp)
{
        if ((filp->f_mode & FMODE_32BITHASH) ||
            (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
                return EXT4_HTREE_EOF_32BIT;
        else
                return EXT4_HTREE_EOF_64BIT;
}


/*
 * llseek for directories.
 *
 * In an htree directory the "offset" is a filename hash rather than a
 * byte offset, and a 64-bit hash can lie far beyond the normal offset
 * limits.  Such directories therefore go through
 * generic_file_llseek_size() with the maximum hash passed as both the
 * size limit and the end-of-file value.
 *
 * Non-htree directories use ext4_llseek(), which already picks the
 * proper maximum offset.
 */
static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *dir = file->f_mapping->host;
        const int htree = is_dx_dir(dir);
        const loff_t max_hash = ext4_get_htree_eof(file);
        loff_t new_pos;

        if (likely(htree)) {
                new_pos = generic_file_llseek_size(file, offset, whence,
                                                   max_hash, max_hash);
        } else {
                new_pos = ext4_llseek(file, offset, whence);
        }

        /* Make f_version differ from the inode's current version. */
        file->f_version = inode_peek_iversion(dir) - 1;
        return new_pos;
}

/*
 * This structure holds the nodes of the red-black tree used to store
 * the directory entry in hash order.
 *
 * Entries are allocated by ext4_htree_store_dirent() with room for the
 * name plus one extra zeroed byte, and released by free_rb_tree_fname().
 */
struct fname {
        /* Major hash; primary sort key in the tree. */
        __u32                hash;
        /* Minor hash; breaks ties between equal major hashes. */
        __u32                minor_hash;
        /* Linkage into the per-file rb tree. */
        struct rb_node        rb_hash;
        /* Chain of further entries with identical hash and minor_hash. */
        struct fname        *next;
        /* Inode number, already converted to CPU byte order. */
        __u32                inode;
        /* Length of name[] in bytes. */
        __u8                name_len;
        /* file_type copied from the directory entry. */
        __u8                file_type;
        /* Entry name (flexible array member). */
        char                name[];
};

/*
 * This function implements a non-recursive way of freeing all of the
 * nodes in the red-black tree.
 */
static void free_rb_tree_fname(struct rb_root *root)
{
        struct fname *fname, *next;

        rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
                while (fname) {
                        struct fname *old = fname;
                        fname = fname->next;
                        kfree(old);
                }

        *root = RB_ROOT;
}


/*
 * Allocate a zeroed dir_private_info and seed its current major/minor
 * hash from @pos.  Returns NULL when the allocation fails.
 */
static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
                                                           loff_t pos)
{
        struct dir_private_info *info = kzalloc(sizeof(*info), GFP_KERNEL);

        if (info) {
                info->curr_hash = pos2maj_hash(filp, pos);
                info->curr_minor_hash = pos2min_hash(filp, pos);
        }
        return info;
}

/* Free all cached names held in @p's tree, then @p itself. */
void ext4_htree_free_dir_info(struct dir_private_info *p)
{
        struct rb_root *cached = &p->root;

        free_rb_tree_fname(cached);
        kfree(p);
}

/*
 * Given a directory entry, enter it into the fname rb tree.
 *
 * When filename encryption is enabled, the dirent will hold the
 * encrypted filename, while the htree will hold the decrypted filename.
 * The decrypted filename is passed in via the ent_name parameter.
 */
int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                             __u32 minor_hash,
                            struct ext4_dir_entry_2 *dirent,
                            struct fscrypt_str *ent_name)
{
        struct dir_private_info *info = dir_file->private_data;
        struct rb_node **link = &info->root.rb_node;
        struct rb_node *parent = NULL;
        struct fname *entry;
        int alloc_len;

        /* Zeroed allocation with one spare byte after the name. */
        alloc_len = sizeof(struct fname) + ent_name->len + 1;
        entry = kzalloc(alloc_len, GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        entry->hash = hash;
        entry->minor_hash = minor_hash;
        entry->inode = le32_to_cpu(dirent->inode);
        entry->name_len = ent_name->len;
        entry->file_type = dirent->file_type;
        memcpy(entry->name, ent_name->name, ent_name->len);

        /* Descend to the insertion point, ordered by (hash, minor_hash). */
        while (*link) {
                struct fname *cur;

                parent = *link;
                cur = rb_entry(parent, struct fname, rb_hash);

                /*
                 * Full collision on both hashes (rare): the new entry is
                 * not put in the tree but chained right behind the
                 * existing tree node.
                 */
                if (entry->hash == cur->hash &&
                    entry->minor_hash == cur->minor_hash) {
                        entry->next = cur->next;
                        cur->next = entry;
                        return 0;
                }

                if (entry->hash < cur->hash ||
                    (entry->hash == cur->hash &&
                     entry->minor_hash < cur->minor_hash))
                        link = &parent->rb_left;
                else
                        link = &parent->rb_right;
        }

        rb_link_node(&entry->rb_hash, parent, link);
        rb_insert_color(&entry->rb_hash, &info->root);
        return 0;
}



/*
 * Helper for ext4_dx_readdir: emit @fname and everything chained behind
 * it through ->next.  (The chain normally has a single element; more only
 * appear when both hashes collide.)
 *
 * ctx->pos is set from the hash of the first entry.  Returns 1 when
 * dir_emit() refuses an entry, after remembering that entry in
 * info->extra_fname so the caller can resume there; returns 0 otherwise,
 * including the (logged) case of a NULL @fname.
 */
static int call_filldir(struct file *file, struct dir_context *ctx,
                        struct fname *fname)
{
        struct dir_private_info *info = file->private_data;
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        struct fname *cur;

        if (!fname) {
                ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
                         "called with null fname?!?", __func__, __LINE__,
                         inode->i_ino, current->comm);
                return 0;
        }

        ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);

        for (cur = fname; cur; cur = cur->next) {
                if (dir_emit(ctx, cur->name, cur->name_len, cur->inode,
                             get_dtype(sb, cur->file_type)))
                        continue;

                info->extra_fname = cur;
                return 1;
        }
        return 0;
}

/*
 * readdir for htree directories.
 *
 * Entries are cached in a per-file rb tree (file->private_data) sorted by
 * hash and handed out in that order.  The tree is refilled through
 * ext4_htree_fill_tree() whenever it runs dry or the directory's version
 * no longer matches file->f_version.
 *
 * Returns 0 on success or end of directory, or the negative value
 * returned by ext4_htree_fill_tree() (or -ENOMEM).
 */
static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
{
        struct dir_private_info *info = file->private_data;
        struct inode *inode = file_inode(file);
        struct fname *fname;
        int ret = 0;

        /* First call on this file: create the state, seeded from ctx->pos. */
        if (!info) {
                info = ext4_htree_create_dir_info(file, ctx->pos);
                if (!info)
                        return -ENOMEM;
                file->private_data = info;
        }

        if (ctx->pos == ext4_get_htree_eof(file))
                return 0;        /* EOF */

        /* Some one has messed with f_pos; reset the world */
        if (info->last_pos != ctx->pos) {
                free_rb_tree_fname(&info->root);
                info->curr_node = NULL;
                info->extra_fname = NULL;
                info->curr_hash = pos2maj_hash(file, ctx->pos);
                info->curr_minor_hash = pos2min_hash(file, ctx->pos);
        }

        /*
         * If there are any leftover names on the hash collision
         * chain, return them first.
         */
        if (info->extra_fname) {
                if (call_filldir(file, ctx, info->extra_fname))
                        goto finished;
                info->extra_fname = NULL;
                /* Jumps into the loop body below, past the refill step. */
                goto next_node;
        } else if (!info->curr_node)
                info->curr_node = rb_first(&info->root);

        while (1) {
                /*
                 * Fill the rbtree if we have no more entries,
                 * or the inode has changed since we last read in the
                 * cached entries.
                 */
                if ((!info->curr_node) ||
                    !inode_eq_iversion(inode, file->f_version)) {
                        info->curr_node = NULL;
                        free_rb_tree_fname(&info->root);
                        file->f_version = inode_query_iversion(inode);
                        ret = ext4_htree_fill_tree(file, info->curr_hash,
                                                   info->curr_minor_hash,
                                                   &info->next_hash);
                        if (ret < 0)
                                goto finished;
                        /* A return of 0 is treated as end of directory. */
                        if (ret == 0) {
                                ctx->pos = ext4_get_htree_eof(file);
                                break;
                        }
                        info->curr_node = rb_first(&info->root);
                }

                fname = rb_entry(info->curr_node, struct fname, rb_hash);
                info->curr_hash = fname->hash;
                info->curr_minor_hash = fname->minor_hash;
                /* Nonzero means dir_emit() refused an entry: stop here. */
                if (call_filldir(file, ctx, fname))
                        break;
        next_node:
                info->curr_node = rb_next(info->curr_node);
                if (info->curr_node) {
                        fname = rb_entry(info->curr_node, struct fname,
                                         rb_hash);
                        info->curr_hash = fname->hash;
                        info->curr_minor_hash = fname->minor_hash;
                } else {
                        /*
                         * Cached batch exhausted: a next_hash of ~0 is
                         * treated as end of directory, otherwise the next
                         * refill starts from next_hash.
                         */
                        if (info->next_hash == ~0) {
                                ctx->pos = ext4_get_htree_eof(file);
                                break;
                        }
                        info->curr_hash = info->next_hash;
                        info->curr_minor_hash = 0;
                }
        }
finished:
        /* Remember where we stopped so an outside change of f_pos is noticed. */
        info->last_pos = ctx->pos;
        return ret < 0 ? ret : 0;
}

/*
 * ->release for directories: drop the htree readdir state attached to the
 * file, if any was ever created.  Always returns 0.
 */
static int ext4_release_dir(struct inode *inode, struct file *filp)
{
        struct dir_private_info *info = filp->private_data;

        if (info)
                ext4_htree_free_dir_info(info);

        return 0;
}

/*
 * Validate every directory entry in the @buf_size bytes at @buf.
 *
 * The entries are walked by their rec_len.  Returns -EFSCORRUPTED as soon
 * as one entry fails ext4_check_dir_entry(), or if the walk ends past the
 * end of the buffer; returns 0 when the entries tile the buffer exactly.
 */
int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
                      int buf_size)
{
        char *const end = (char *)buf + buf_size;
        char *cursor = buf;
        unsigned int offset = 0;

        while (cursor < end) {
                struct ext4_dir_entry_2 *de =
                        (struct ext4_dir_entry_2 *)cursor;
                int rlen;

                if (ext4_check_dir_entry(dir, NULL, de, bh,
                                         buf, buf_size, offset))
                        return -EFSCORRUPTED;

                rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
                cursor += rlen;
                offset += rlen;
        }

        if (cursor > end)
                return -EFSCORRUPTED;

        return 0;
}

/* File operations table for ext4 directories. */
const struct file_operations ext4_dir_operations = {
        /* reading and positioning */
        .read                = generic_read_dir,
        .iterate_shared        = ext4_readdir,
        .llseek                = ext4_dir_llseek,
        /* ioctl entry points */
        .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl        = ext4_compat_ioctl,
#endif
        /* sync and teardown */
        .fsync                = ext4_sync_file,
        .release        = ext4_release_dir,
};






























































































    2 





















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGE_REF_H
#define _LINUX_PAGE_REF_H

#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/tracepoint-defs.h>

/*
 * Tracepoints for page reference count changes.  The helpers in this
 * header test each one with page_ref_tracepoint_active() before calling
 * the matching __page_ref_*() hook.
 */
DECLARE_TRACEPOINT(page_ref_set);
DECLARE_TRACEPOINT(page_ref_mod);
DECLARE_TRACEPOINT(page_ref_mod_and_test);
DECLARE_TRACEPOINT(page_ref_mod_and_return);
DECLARE_TRACEPOINT(page_ref_mod_unless);
DECLARE_TRACEPOINT(page_ref_freeze);
DECLARE_TRACEPOINT(page_ref_unfreeze);

/*
 * Tracing hooks used by the refcount helpers below.
 *
 * With CONFIG_DEBUG_PAGE_REF, page_ref_tracepoint_active() expands to
 * tracepoint_enabled() and the __page_ref_*() hooks are external
 * functions.  Without it, the check is the constant false and the hooks
 * are empty inline functions.
 */
#ifdef CONFIG_DEBUG_PAGE_REF

/*
 * Ideally we would want to use the trace_<tracepoint>_enabled() helper
 * functions. But due to include header file issues, that is not
 * feasible. Instead we have to open code the static key functions.
 *
 * See trace_##name##_enabled(void) in include/linux/tracepoint.h
 */
#define page_ref_tracepoint_active(t) tracepoint_enabled(t)

/* Hook implementations live outside this header. */
extern void __page_ref_set(struct page *page, int v);
extern void __page_ref_mod(struct page *page, int v);
extern void __page_ref_mod_and_test(struct page *page, int v, int ret);
extern void __page_ref_mod_and_return(struct page *page, int v, int ret);
extern void __page_ref_mod_unless(struct page *page, int v, int u);
extern void __page_ref_freeze(struct page *page, int v, int ret);
extern void __page_ref_unfreeze(struct page *page, int v);

#else

/* Debugging disabled: the check is constant false, the hooks do nothing. */
#define page_ref_tracepoint_active(t) false

static inline void __page_ref_set(struct page *page, int v)
{
}
static inline void __page_ref_mod(struct page *page, int v)
{
}
static inline void __page_ref_mod_and_test(struct page *page, int v, int ret)
{
}
static inline void __page_ref_mod_and_return(struct page *page, int v, int ret)
{
}
static inline void __page_ref_mod_unless(struct page *page, int v, int u)
{
}
static inline void __page_ref_freeze(struct page *page, int v, int ret)
{
}
static inline void __page_ref_unfreeze(struct page *page, int v)
{
}

#endif

/* Read the current value of @page's _refcount field. */
static inline int page_ref_count(const struct page *page)
{
        int refs = atomic_read(&page->_refcount);

        return refs;
}

/**
 * folio_ref_count - Read the reference count of a folio.
 * @folio: The folio to query.
 *
 * folio_get() normally raises this count and folio_put() lowers it.
 * References are typically held by:
 *
 * - page table entries (one reference each)
 * - the page cache
 * - private filesystem data
 * - the LRU list
 * - pipes
 * - direct IO that references this page in the process address space
 *
 * Return: The folio's current reference count.
 */
static inline int folio_ref_count(const struct folio *folio)
{
        const struct page *head = &folio->page;

        return page_ref_count(head);
}

/* Reference count of the folio that page_folio() returns for @page. */
static inline int page_count(const struct page *page)
{
        const struct folio *owner = page_folio(page);

        return folio_ref_count(owner);
}

/* Store @v into @page's _refcount, then report it to the page_ref_set hook. */
static inline void set_page_count(struct page *page, int v)
{
        atomic_set(&page->_refcount, v);

        if (!page_ref_tracepoint_active(page_ref_set))
                return;

        __page_ref_set(page, v);
}

/* Folio wrapper around set_page_count(), applied to the folio's page member. */
static inline void folio_set_count(struct folio *folio, int v)
{
        struct page *head = &folio->page;

        set_page_count(head, v);
}

/*
 * Give @page a reference count of one.  Used to set up the count before
 * the page is freed into the page allocator for the first time (boot or
 * memory hotplug).
 */
static inline void init_page_count(struct page *page)
{
        const int initial_refs = 1;

        set_page_count(page, initial_refs);
}

/* Add @nr to @page's refcount and report the delta to the page_ref_mod hook. */
static inline void page_ref_add(struct page *page, int nr)
{
        atomic_add(nr, &page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;

        __page_ref_mod(page, nr);
}

/* Folio wrapper around page_ref_add(). */
static inline void folio_ref_add(struct folio *folio, int nr)
{
        struct page *head = &folio->page;

        page_ref_add(head, nr);
}

/*
 * Subtract @nr from @page's refcount; the page_ref_mod hook receives the
 * delta as -@nr.
 */
static inline void page_ref_sub(struct page *page, int nr)
{
        atomic_sub(nr, &page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;

        __page_ref_mod(page, -nr);
}

/* Folio wrapper around page_ref_sub(). */
static inline void folio_ref_sub(struct folio *folio, int nr)
{
        struct page *head = &folio->page;

        page_ref_sub(head, nr);
}

/*
 * Subtract @nr from the folio's _refcount and return what
 * atomic_sub_return() yields.  The page_ref_mod_and_return hook gets the
 * folio's page, the delta (-@nr) and that value.
 */
static inline int folio_ref_sub_return(struct folio *folio, int nr)
{
        const int after = atomic_sub_return(nr, &folio->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(&folio->page, -nr, after);

        return after;
}

/* Increment @page's refcount; the page_ref_mod hook is told the delta is 1. */
static inline void page_ref_inc(struct page *page)
{
        atomic_inc(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;

        __page_ref_mod(page, 1);
}

/* Folio wrapper around page_ref_inc(). */
static inline void folio_ref_inc(struct folio *folio)
{
        struct page *head = &folio->page;

        page_ref_inc(head);
}

/* Decrement @page's refcount; the page_ref_mod hook is told the delta is -1. */
static inline void page_ref_dec(struct page *page)
{
        atomic_dec(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;

        __page_ref_mod(page, -1);
}

/* Folio wrapper around page_ref_dec(). */
static inline void folio_ref_dec(struct folio *folio)
{
        struct page *head = &folio->page;

        page_ref_dec(head);
}

/*
 * Subtract @nr from @page's refcount and return the result of
 * atomic_sub_and_test().  The page_ref_mod_and_test hook receives the
 * delta (-@nr) together with that result.
 */
static inline int page_ref_sub_and_test(struct page *page, int nr)
{
        const int outcome = atomic_sub_and_test(nr, &page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_test))
                __page_ref_mod_and_test(page, -nr, outcome);

        return outcome;
}

/* Folio wrapper around page_ref_sub_and_test(). */
static inline int folio_ref_sub_and_test(struct folio *folio, int nr)
{
        struct page *head = &folio->page;

        return page_ref_sub_and_test(head, nr);
}

/*
 * Increment @page's refcount and return what atomic_inc_return() yields;
 * the page_ref_mod_and_return hook sees a delta of 1 and that value.
 */
static inline int page_ref_inc_return(struct page *page)
{
        const int after = atomic_inc_return(&page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(page, 1, after);

        return after;
}

/* Folio wrapper around page_ref_inc_return(). */
static inline int folio_ref_inc_return(struct folio *folio)
{
        struct page *head = &folio->page;

        return page_ref_inc_return(head);
}

/*
 * Decrement @page's refcount and return the result of
 * atomic_dec_and_test(); the page_ref_mod_and_test hook sees a delta of
 * -1 and that result.
 */
static inline int page_ref_dec_and_test(struct page *page)
{
        const int outcome = atomic_dec_and_test(&page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_test))
                __page_ref_mod_and_test(page, -1, outcome);

        return outcome;
}

/* Folio wrapper around page_ref_dec_and_test(). */
static inline int folio_ref_dec_and_test(struct folio *folio)
{
        struct page *head = &folio->page;

        return page_ref_dec_and_test(head);
}

/*
 * Decrement @page's refcount and return what atomic_dec_return() yields;
 * the page_ref_mod_and_return hook sees a delta of -1 and that value.
 */
static inline int page_ref_dec_return(struct page *page)
{
        const int after = atomic_dec_return(&page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(page, -1, after);

        return after;
}

/* Folio wrapper around page_ref_dec_return(). */
static inline int folio_ref_dec_return(struct folio *folio)
{
        struct page *head = &folio->page;

        return page_ref_dec_return(head);
}

/*
 * Call atomic_add_unless() on @page's refcount with @nr and @u and return
 * its result.  Note that the page_ref_mod_unless hook is passed that
 * result as its last argument, not @u.
 */
static inline bool page_ref_add_unless(struct page *page, int nr, int u)
{
        const bool added = atomic_add_unless(&page->_refcount, nr, u);

        if (page_ref_tracepoint_active(page_ref_mod_unless))
                __page_ref_mod_unless(page, nr, added);

        return added;
}

/* Folio wrapper around page_ref_add_unless(). */
static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u)
{
        struct page *head = &folio->page;

        return page_ref_add_unless(head, nr, u);
}

/**
 * folio_try_get - Try to take one reference on a folio.
 * @folio: The folio.
 *
 * Use this when you hold no reference to the folio yet.  The attempt can
 * fail, for instance because the folio was freed after you obtained the
 * pointer, or because it is frozen for splitting or migration.
 *
 * Implemented as folio_ref_add_unless(folio, 1, 0).
 *
 * Return: True if the reference count was successfully incremented.
 */
static inline bool folio_try_get(struct folio *folio)
{
        const bool got_ref = folio_ref_add_unless(folio, 1, 0);

        return got_ref;
}

/*
 * Try to add @count references to @folio.
 *
 * Without CONFIG_TINY_RCU this is folio_ref_add_unless(folio, count, 0)
 * and the function returns false when that fails.  With CONFIG_TINY_RCU
 * the count is added unconditionally, after debug checks, and the function
 * always returns true.
 */
static inline bool folio_ref_try_add_rcu(struct folio *folio, int count)
{
#ifndef CONFIG_TINY_RCU
        /* Failure means the folio has been freed, or will be freed. */
        if (unlikely(!folio_ref_add_unless(folio, count, 0)))
                return false;
#else
        /*
         * The caller promises that the folio is never freed from interrupt
         * context, so (on !SMP) having preemption disabled is sufficient,
         * and TINY_RCU already provides that.
         */
# ifdef CONFIG_PREEMPT_COUNT
        VM_BUG_ON(!in_atomic() && !irqs_disabled());
# endif
        VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio);
        folio_ref_add(folio, count);
#endif
        return true;
}

/**
 * folio_try_get_rcu - Try to take one reference on a folio.
 * @folio: The folio.
 *
 * A variant of folio_try_get() optimised for non-SMP kernels.  It may be
 * used instead of folio_try_get() when the rcu_read_lock() taken for the
 * lookup is still held and the page's refcount is known not to drop to
 * zero in interrupt context.
 *
 * Users include get_user_pages_fast() (pages are not unmapped from
 * interrupt context) and page cache lookups (pages are not truncated from
 * interrupt context).  Pages are also never frozen for splitting or
 * migration in interrupt context.
 *
 * It is equally usable under a lock that keeps pages from being frozen and
 * removed, e.g. the i_pages lock for the page cache, or the mmap_lock or
 * page table lock for page tables.  The call then always succeeds and a
 * plain folio_get() would have worked, but sharing one function between
 * locked and RCU-protected callers is sometimes more convenient.
 *
 * Return: True if the reference count was successfully incremented.
 */
static inline bool folio_try_get_rcu(struct folio *folio)
{
        const bool got_ref = folio_ref_try_add_rcu(folio, 1);

        return got_ref;
}

/*
 * Attempt to replace a refcount of exactly @count with 0 using
 * atomic_cmpxchg().  Returns 1 when the value read back equals @count,
 * 0 otherwise.  The page_ref_freeze hook receives @count and that result.
 */
static inline int page_ref_freeze(struct page *page, int count)
{
        const int seen = atomic_cmpxchg(&page->_refcount, count, 0);
        const int frozen = likely(seen == count);

        if (page_ref_tracepoint_active(page_ref_freeze))
                __page_ref_freeze(page, count, frozen);

        return frozen;
}

/* Folio wrapper around page_ref_freeze(). */
static inline int folio_ref_freeze(struct folio *folio, int count)
{
        struct page *head = &folio->page;

        return page_ref_freeze(head, count);
}

/*
 * Set @page's refcount to @count with atomic_set_release().  Debug checks
 * require page_count(page) to be 0 on entry and @count to be non-zero.
 */
static inline void page_ref_unfreeze(struct page *page, int count)
{
        VM_BUG_ON_PAGE(page_count(page) != 0, page);
        VM_BUG_ON(count == 0);

        atomic_set_release(&page->_refcount, count);

        if (!page_ref_tracepoint_active(page_ref_unfreeze))
                return;

        __page_ref_unfreeze(page, count);
}

/* Folio wrapper around page_ref_unfreeze(). */
static inline void folio_ref_unfreeze(struct folio *folio, int count)
{
        struct page *head = &folio->page;

        page_ref_unfreeze(head, count);
}
#endif































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PKEYS_H
#define _ASM_X86_PKEYS_H

/*
 * Number of protection keys: 16 when X86_FEATURE_OSPKE is enabled,
 * otherwise 1.
 *
 * If more than 16 keys are ever supported, a thorough audit
 * will be necessary to ensure that the types that store key
 * numbers and masks have sufficient capacity.
 */
#define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1)

/*
 * Defined outside this header.
 * NOTE(review): presumably applies the access rights in @init_val to @pkey
 * for @tsk -- confirm against the definition.
 */
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
                unsigned long init_val);

/* True when the X86_FEATURE_OSPKE CPU feature is enabled. */
static inline bool arch_pkeys_enabled(void)
{
        const bool ospke = cpu_feature_enabled(X86_FEATURE_OSPKE);

        return ospke;
}

/*
 * Try to set one protection key aside as the execute-only key for @mm.
 *
 * Without X86_FEATURE_OSPKE this returns ARCH_DEFAULT_PKEY; otherwise the
 * request is handed to __execute_only_pkey().
 */
extern int __execute_only_pkey(struct mm_struct *mm);
static inline int execute_only_pkey(struct mm_struct *mm)
{
        return cpu_feature_enabled(X86_FEATURE_OSPKE) ?
                __execute_only_pkey(mm) : ARCH_DEFAULT_PKEY;
}

/*
 * Returns 0 when X86_FEATURE_OSPKE is not enabled; otherwise returns
 * whatever __arch_override_mprotect_pkey() decides for @vma, @prot and
 * @pkey.
 */
extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
                int prot, int pkey);
static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
                int prot, int pkey)
{
        return cpu_feature_enabled(X86_FEATURE_OSPKE) ?
                __arch_override_mprotect_pkey(vma, prot, pkey) : 0;
}

/* Union of the four VM_PKEY_BIT* flags (the same mask vma_pkey() uses). */
#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)

/*
 * Accessors for the per-mm pkey allocation bitmap
 * (mm->context.pkey_allocation_map); bit N set means pkey N is allocated.
 *
 * @mm and @pkey are parenthesized in the expansions so that arguments such
 * as "&some_mm" or "a & b" expand as the caller intended.
 *
 * mm_pkey_allocation_map(mm):       lvalue for the bitmap of @mm
 * mm_set_pkey_allocated(mm, pkey):  set bit @pkey
 * mm_set_pkey_free(mm, pkey):       clear bit @pkey
 */
#define mm_pkey_allocation_map(mm)        ((mm)->context.pkey_allocation_map)
#define mm_set_pkey_allocated(mm, pkey) do {                \
        mm_pkey_allocation_map(mm) |= (1U << (pkey));        \
} while (0)
#define mm_set_pkey_free(mm, pkey) do {                        \
        mm_pkey_allocation_map(mm) &= ~(1U << (pkey));        \
} while (0)

/*
 * Report whether @pkey counts as allocated for @mm.
 *
 * "Allocated" pkeys are the ones handed out by pkey_alloc(), plus pkey 0,
 * which is allocated implicitly when the mm is created.
 */
static inline
bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
{
        /* Reject anything outside [0, arch_max_pkey()). */
        if (pkey < 0 || pkey >= arch_max_pkey())
                return false;

        /*
         * The exec-only pkey has its bit set in the allocation map, but
         * user interfaces such as mprotect_pkey() must not see it.
         */
        if (mm->context.execute_only_pkey == pkey)
                return false;

        return (mm_pkey_allocation_map(mm) & (1U << pkey)) != 0;
}

/*
 * Allocate the lowest pkey whose bit is clear in @mm's allocation map.
 *
 * Returns the key number on success, or -1 when every key below
 * arch_max_pkey() is already marked allocated.
 */
static inline
int mm_pkey_alloc(struct mm_struct *mm)
{
        /*
         * Note: this is the single place where a pkey is checked for
         * validity as far as the hardware is concerned.  The rest of the
         * kernel trusts that only good, valid pkeys come out of here.
         */
        const u16 full_map = ((1U << arch_max_pkey()) - 1);
        int key;

        /*
         * A completely used map needs special handling because ffz()
         * has undefined behavior when there are no zero bits.
         */
        if (mm_pkey_allocation_map(mm) != full_map) {
                key = ffz(mm_pkey_allocation_map(mm));
                mm_set_pkey_allocated(mm, key);
                return key;
        }

        return -1;
}

/*
 * Clear @pkey's bit in @mm's allocation map.
 *
 * Returns 0 on success, or -EINVAL when mm_pkey_is_allocated() rejects
 * @pkey.
 */
static inline
int mm_pkey_free(struct mm_struct *mm, int pkey)
{
        if (mm_pkey_is_allocated(mm, pkey)) {
                mm_set_pkey_free(mm, pkey);
                return 0;
        }

        return -EINVAL;
}

/*
 * Extract the pkey stored in @vma's vm_flags: keep the four VM_PKEY_BIT*
 * bits and shift them down by VM_PKEY_SHIFT.
 */
static inline int vma_pkey(struct vm_area_struct *vma)
{
        const unsigned long pkey_bits = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
                                        VM_PKEY_BIT2 | VM_PKEY_BIT3;
        const unsigned long masked = vma->vm_flags & pkey_bits;

        return masked >> VM_PKEY_SHIFT;
}

#endif /*_ASM_X86_PKEYS_H */














































































































































































    2 




    2 









    2 




    4 












    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_WAIT_H
#define _LINUX_WAIT_H
/*
 * Linux wait queue related types and methods
 */
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/spinlock.h>

#include <asm/current.h>

/* Short type name for struct wait_queue_entry. */
typedef struct wait_queue_entry wait_queue_entry_t;

/*
 * Signature of the callback stored in wait_queue_entry::func.
 * default_wake_function() is the callback installed by
 * init_waitqueue_entry() and __WAITQUEUE_INITIALIZER().
 */
typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);

/*
 * wait_queue_entry::flags
 *
 * Within this header, WQ_FLAG_EXCLUSIVE is set by the *_exclusive() add
 * helpers, and WQ_FLAG_PRIORITY is tested by __add_wait_queue() so that a
 * new entry is inserted after the leading run of priority entries.
 */
#define WQ_FLAG_EXCLUSIVE        0x01
#define WQ_FLAG_WOKEN                0x02
#define WQ_FLAG_CUSTOM                0x04
#define WQ_FLAG_DONE                0x08
#define WQ_FLAG_PRIORITY        0x10

/*
 * A single wait-queue entry structure:
 */
struct wait_queue_entry {
        /* Bitmask of WQ_FLAG_* values. */
        unsigned int                flags;
        /*
         * Opaque pointer: init_waitqueue_entry() stores the task here,
         * init_waitqueue_func_entry() stores NULL.
         */
        void                        *private;
        /* Wake-up callback for this entry. */
        wait_queue_func_t        func;
        /* Linkage into wait_queue_head::head. */
        struct list_head        entry;
};

/* Head of a wait queue: a spinlock plus the list of queued entries. */
struct wait_queue_head {
        spinlock_t                lock;
        /* List of struct wait_queue_entry, linked through their 'entry' member. */
        struct list_head        head;
};
typedef struct wait_queue_head wait_queue_head_t;

struct task_struct;

/*
 * Macros for declaration and initialisation of the datatypes
 */

/*
 * Static initializer for a wait_queue_entry: @tsk becomes ->private, the
 * callback is default_wake_function() and the list linkage is
 * { NULL, NULL }.  @name is accepted but not used in the expansion.
 */
#define __WAITQUEUE_INITIALIZER(name, tsk) {                                        \
        .private        = tsk,                                                        \
        .func                = default_wake_function,                                \
        .entry                = { NULL, NULL } }

/* Define a wait_queue_entry called @name, initialized as above. */
#define DECLARE_WAITQUEUE(name, tsk)                                                \
        struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk)

/*
 * Static initializer for a wait_queue_head called @name: the lock is
 * initialized with __SPIN_LOCK_UNLOCKED() and the list with
 * LIST_HEAD_INIT().
 */
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) {                                        \
        .lock                = __SPIN_LOCK_UNLOCKED(name.lock),                        \
        .head                = LIST_HEAD_INIT(name.head) }

/* Define a wait_queue_head called @name, initialized as above. */
#define DECLARE_WAIT_QUEUE_HEAD(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)

/* Out-of-line initializer used by init_waitqueue_head(); defined elsewhere. */
extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *);

/*
 * Initialize @wq_head at run time.  Every expansion site gets its own
 * static lock_class_key, and the stringified argument is passed as the
 * name.
 */
#define init_waitqueue_head(wq_head)                                                \
        do {                                                                        \
                static struct lock_class_key __key;                                \
                                                                                \
                __init_waitqueue_head((wq_head), #wq_head, &__key);                \
        } while (0)

/*
 * Declare a wait queue head with automatic storage.
 *
 * With CONFIG_LOCKDEP the head is initialized by running
 * init_waitqueue_head() inside a statement expression; otherwise this is
 * the same as DECLARE_WAIT_QUEUE_HEAD().
 */
#ifdef CONFIG_LOCKDEP
# define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
        ({ init_waitqueue_head(&name); name; })
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name)
#else
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
#endif

/*
 * Set up @wq_entry for task @p: flags cleared, ->private pointing at @p,
 * and default_wake_function() as the callback.  The list linkage is left
 * untouched.
 */
static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p)
{
        wq_entry->func = default_wake_function;
        wq_entry->private = p;
        wq_entry->flags = 0;
}

/*
 * Set up @wq_entry with a caller-supplied callback @func: flags cleared
 * and ->private set to NULL.  The list linkage is left untouched.
 */
static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
        wq_entry->func = func;
        wq_entry->private = NULL;
        wq_entry->flags = 0;
}

/**
 * waitqueue_active -- lockless check for queued waiters
 * @wq_head: the waitqueue to inspect
 *
 * returns true when the wait list holds at least one entry
 *
 * NOTE: no lock is taken here, so this needs care; using it incorrectly
 * _will_ cause sporadic and non-obvious failures.
 *
 * Call it either with wait_queue_head::lock held, or on the wakeup side
 * after an additional smp_mb(), like::
 *
 *      CPU0 - waker                    CPU1 - waiter
 *
 *                                      for (;;) {
 *      @cond = true;                     prepare_to_wait(&wq_head, &wait, state);
 *      smp_mb();                         // smp_mb() from set_current_state()
 *      if (waitqueue_active(wq_head))         if (@cond)
 *        wake_up(wq_head);                      break;
 *                                        schedule();
 *                                      }
 *                                      finish_wait(&wq_head, &wait);
 *
 * Without the explicit smp_mb(), the load done by waitqueue_active() can be
 * hoisted above the store to @cond, so the waker may see an empty wait
 * list while the waiter has not yet seen @cond.
 *
 * Note too that this 'optimization' swaps a spin_lock() for an smp_mb(),
 * and with an uncontended lock the two cost roughly the same.
 */
static inline int waitqueue_active(struct wait_queue_head *wq_head)
{
        struct list_head *waiters = &wq_head->head;

        return !list_empty(waiters);
}

/**
 * wq_has_single_sleeper - check whether exactly one entry is queued
 * @wq_head: wait queue head
 *
 * Returns true if wq_head has exactly one sleeper on its list.
 *
 * The caveats described for waitqueue_active() apply here as well.
 */
static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head)
{
        struct list_head *waiters = &wq_head->head;

        return list_is_singular(waiters);
}

/**
 * wq_has_sleeper - check whether any process is waiting
 * @wq_head: wait queue head
 *
 * Returns true if wq_head has waiting processes.
 *
 * The caveats described for waitqueue_active() apply here as well.
 */
static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
{
        bool has_waiters;

        /*
         * Stay in sync with add_wait_queue()'s modifications to the wait
         * queue.  This barrier is meant to pair with one on the waiting
         * side.
         */
        smp_mb();
        has_waiters = waitqueue_active(wq_head);

        return has_waiters;
}

/*
 * Out-of-line add/remove operations, defined outside this header.
 * NOTE(review): presumably these take wq_head->lock themselves, unlike the
 * __-prefixed inline helpers below -- confirm against the definitions.
 */
extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);

/*
 * Link @wq_entry into @wq_head right after the leading run of entries
 * that carry WQ_FLAG_PRIORITY, or at the very front when the first entry
 * has no such flag or the list is empty.  No locking is done here.
 */
static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        struct list_head *insert_after = &wq_head->head;
        struct wait_queue_entry *cur;

        list_for_each_entry(cur, &wq_head->head, entry) {
                if (cur->flags & WQ_FLAG_PRIORITY)
                        insert_after = &cur->entry;
                else
                        break;
        }

        list_add(&wq_entry->entry, insert_after);
}

/*
 * Used for wake-one threads: mark @wq_entry with WQ_FLAG_EXCLUSIVE, then
 * queue it through __add_wait_queue().
 */
static inline void
__add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        wq_entry->flags = wq_entry->flags | WQ_FLAG_EXCLUSIVE;

        __add_wait_queue(wq_head, wq_entry);
}

/* Append @wq_entry at the tail of @wq_head's list.  No locking is done here. */
static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        struct list_head *waiters = &wq_head->head;

        list_add_tail(&wq_entry->entry, waiters);
}

/* Mark @wq_entry with WQ_FLAG_EXCLUSIVE, then append it at the tail. */
static inline void
__add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        wq_entry->flags = wq_entry->flags | WQ_FLAG_EXCLUSIVE;

        __add_wait_queue_entry_tail(wq_head, wq_entry);
}

/*
 * Unlink @wq_entry from whichever list it is on using list_del().
 * @wq_head is not referenced by this function.
 */
static inline void
__remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        struct list_head *link = &wq_entry->entry;

        list_del(link);
}

/*
 * Out-of-line wake-up primitives, defined outside this header.  The
 * wake_up*() macros below are thin wrappers that fill in the mode, nr and
 * key arguments.
 */
int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
void __wake_up_pollfree(struct wait_queue_head *wq_head);

/*
 * Convenience wrappers with mode TASK_NORMAL and a NULL key.  The plain
 * variants pass nr == 1, the *_nr variants pass the caller's @nr and the
 * *_all variants pass nr == 0.
 */
#define wake_up(x)                        __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr)                __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x)                        __wake_up(x, TASK_NORMAL, 0, NULL)
#define wake_up_locked(x)                __wake_up_locked((x), TASK_NORMAL, 1)
#define wake_up_all_locked(x)                __wake_up_locked((x), TASK_NORMAL, 0)

/* Same wrappers, but with mode TASK_INTERRUPTIBLE. */
#define wake_up_interruptible(x)        __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr)        __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
#define wake_up_interruptible_all(x)        __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
#define wake_up_interruptible_sync(x)        __wake_up_sync((x), TASK_INTERRUPTIBLE)

/*
 * Wakeup macros to be used to report events to the targets.
 *
 * poll_to_key() converts a __poll_t mask into the void * key argument of
 * the __wake_up*() functions by casting through uintptr_t; key_to_poll()
 * is the reverse conversion.  Each wake_up*_poll() macro passes the
 * converted mask @m as the key.
 */
#define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m))
#define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))
#define wake_up_poll(x, m)                                                        \
        __wake_up(x, TASK_NORMAL, 1, poll_to_key(m))
#define wake_up_poll_on_current_cpu(x, m)                                        \
        __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m))
#define wake_up_locked_poll(x, m)                                                \
        __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m))
#define wake_up_interruptible_poll(x, m)                                        \
        __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m))
#define wake_up_interruptible_sync_poll(x, m)                                        \
        __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
#define wake_up_interruptible_sync_poll_locked(x, m)                                \
        __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))

/**
 * wake_up_pollfree - announce that a polled waitqueue is about to go away
 * @wq_head: the wait queue head
 *
 * A few ->poll() implementations use a waitqueue whose lifetime follows a
 * task instead of the 'struct file' being polled.  In those rare cases this
 * function must be called before the waitqueue is freed, so that
 * non-blocking polls (e.g. epoll) learn that the queue is going away.
 *
 * The caller must additionally RCU-delay freeing the wait_queue_head, e.g.
 * with an explicit synchronize_rcu() or call_rcu(), or by means of
 * SLAB_TYPESAFE_BY_RCU.
 */
static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
{
        /*
         * The queue lock is skipped when the list looks empty, for
         * performance.  That can race with someone who is removing the
         * last entry and still holds the queue lock while we carry on.
         * Such removers are required to hold rcu_read_lock(), so going
         * ahead with an RCU-delayed free remains safe.
         */
        if (!waitqueue_active(wq_head))
                return;

        __wake_up_pollfree(wq_head);
}

/*
 * Evaluate @condition exactly once, together with a variable named __ret
 * that must exist in the scope where the macro is expanded.
 *
 * If @condition is true while __ret is 0, __ret is set to 1.  The macro's
 * value is true when @condition is true or __ret is 0.
 *
 * NOTE(review): __ret presumably holds the remaining timeout of the
 * enclosing wait_event_*timeout() macro -- confirm at the call sites.
 */
#define ___wait_cond_timeout(condition)                                                \
({                                                                                \
        bool __cond = (condition);                                                \
        if (__cond && !__ret)                                                        \
                __ret = 1;                                                        \
        __cond || !__ret;                                                        \
})

/*
 * True when a wait in @state has to honour the value returned by
 * prepare_to_wait_event(): either @state is not a compile-time constant, or
 * it contains TASK_INTERRUPTIBLE or TASK_WAKEKILL.
 *
 * @state is parenthesized before masking so that a compound argument such as
 * "A | B" is tested as a whole; without the parentheses it would parse as
 * "A | (B & mask)" because '&' binds tighter than '|'.
 */
#define ___wait_is_interruptible(state)                                 \
        (!__builtin_constant_p(state) ||                                \
         ((state) & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))

/*
 * Defined elsewhere.  ___wait_event() calls it to set up its on-stack wait
 * entry, passing WQ_FLAG_EXCLUSIVE or 0 as @flags.
 */
extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);

/*
 * The below macro ___wait_event() has an explicit shadow of the __ret
 * variable when used from the wait_event_*() macros.
 *
 * This is so that both can use the ___wait_cond_timeout() construct
 * to wrap the condition.
 *
 * The type inconsistency of the wait_event_*() __ret variable is also
 * on purpose; we use long where we can return timeout values and int
 * otherwise.
 */

#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)                \
({                                                                                \
        __label__ __out;                                                        \
        struct wait_queue_entry __wq_entry;                                        \
        long __ret = ret;        /* explicit shadow */                                \
                                                                                \
        init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);        \
        for (;;) {                                                                \
                long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
                                                                                \
                if (condition)                                                        \
                        break;                                                        \
                                                                                \
                if (___wait_is_interruptible(state) && __int) {                        \
                        __ret = __int;                                                \
                        goto __out;                                                \
                }                                                                \
                                                                                \
                cmd;                                                                \
        }                                                                        \
        finish_wait(&wq_head, &__wq_entry);                                        \
__out:        __ret;                                                                        \
})

/*
 * Waiting loop behind wait_event(): a non-exclusive TASK_UNINTERRUPTIBLE
 * ___wait_event() that calls schedule() between checks of @condition.  The
 * value of ___wait_event() is discarded.
 */
#define __wait_event(wq_head, condition)                                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            schedule())

/**
 * wait_event - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event(wq_head, condition)                                  \
do {                                                                    \
        might_sleep();                                                  \
        /* Enter the waiting loop only if @condition does not hold. */  \
        if (!(condition))                                               \
                __wait_event(wq_head, condition);                       \
} while (0)

/*
 * Waiting loop behind io_wait_event(): same arguments to ___wait_event() as
 * __wait_event(), except that io_schedule() is called between checks of
 * @condition.  The value of ___wait_event() is discarded.
 */
#define __io_wait_event(wq_head, condition)                                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            io_schedule())

/*
 * io_wait_event() -- like wait_event() but with io_schedule()
 */
#define io_wait_event(wq_head, condition)                               \
do {                                                                    \
        might_sleep();                                                  \
        /* Enter the waiting loop only if @condition does not hold. */  \
        if (!(condition))                                               \
                __io_wait_event(wq_head, condition);                    \
} while (0)

/*
 * Waiting loop behind wait_event_freezable(): a non-exclusive
 * ___wait_event() in state (TASK_INTERRUPTIBLE|TASK_FREEZABLE) that calls
 * schedule() between checks of @condition.  Evaluates to the ___wait_event()
 * result.
 */
#define __wait_event_freezable(wq_head, condition)                                \
        ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE),        \
                        0, 0, schedule())

/**
 * wait_event_freezable - sleep (or freeze) until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute
 * to system load) until the @condition evaluates to true. The
 * @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event_freezable(wq_head, condition)                        \
({                                                                      \
        int __ret;                                                      \
                                                                        \
        might_sleep();                                                  \
        if (!(condition))                                               \
                __ret = __wait_event_freezable(wq_head, condition);     \
        else                                                            \
                __ret = 0;      /* already true: nothing to wait for */ \
        __ret;                                                          \
})

/*
 * Waiting loop behind wait_event_timeout(): a non-exclusive
 * TASK_UNINTERRUPTIBLE ___wait_event() whose condition is wrapped in
 * ___wait_cond_timeout().  The __ret named here is the variable that
 * ___wait_event() declares; it starts at @timeout and is overwritten with
 * the return value of schedule_timeout() on every iteration.
 */
#define __wait_event_timeout(wq_head, condition, timeout)                        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_UNINTERRUPTIBLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_timeout(wq_head, condition, timeout)                                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_timeout(wq_head, condition, timeout);        \
        __ret;                                                                        \
})

/*
 * Waiting loop behind wait_event_freezable_timeout(): like
 * __wait_event_timeout() but in state (TASK_INTERRUPTIBLE|TASK_FREEZABLE).
 * The __ret named here is the variable that ___wait_event() declares; it
 * starts at @timeout and is overwritten with the return value of
 * schedule_timeout() on every iteration.
 */
#define __wait_event_freezable_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout,                \
                      __ret = schedule_timeout(__ret))

/*
 * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
 * increasing load and is freezable.
 */
#define wait_event_freezable_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \
        __ret;                                                                        \
})

/*
 * Waiting loop behind wait_event_exclusive_cmd(): an exclusive
 * TASK_UNINTERRUPTIBLE ___wait_event() that runs @cmd1, schedule() and @cmd2,
 * in that order, on each iteration in which @condition is false.  The value
 * of ___wait_event() is discarded.
 */
#define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2)                \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0,        \
                            cmd1; schedule(); cmd2)
/*
 * Just like wait_event_cmd(), except it sets exclusive flag
 */
#define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2)        \
do {                                                                    \
        /* Enter the waiting loop only if @condition does not hold. */  \
        if (!(condition))                                               \
                __wait_event_exclusive_cmd(wq_head, condition,          \
                                           cmd1, cmd2);                 \
} while (0)

/*
 * Waiting loop behind wait_event_cmd(): a non-exclusive TASK_UNINTERRUPTIBLE
 * ___wait_event() that runs @cmd1, schedule() and @cmd2, in that order, on
 * each iteration in which @condition is false.  The value of ___wait_event()
 * is discarded.
 */
#define __wait_event_cmd(wq_head, condition, cmd1, cmd2)                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            cmd1; schedule(); cmd2)

/**
 * wait_event_cmd - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @cmd1: the command will be executed before sleep
 * @cmd2: the command will be executed after sleep
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event_cmd(wq_head, condition, cmd1, cmd2)                  \
do {                                                                    \
        /* Enter the waiting loop only if @condition does not hold. */  \
        if (!(condition))                                               \
                __wait_event_cmd(wq_head, condition, cmd1, cmd2);       \
} while (0)

/*
 * Waiting loop behind wait_event_interruptible(): a non-exclusive
 * TASK_INTERRUPTIBLE ___wait_event() that calls schedule() between checks of
 * @condition.  Evaluates to the ___wait_event() result.
 */
#define __wait_event_interruptible(wq_head, condition)                                \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                      schedule())

/**
 * wait_event_interruptible - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible(wq_head, condition)                    \
({                                                                      \
        int __ret;                                                      \
                                                                        \
        might_sleep();                                                  \
        if (!(condition))                                               \
                __ret = __wait_event_interruptible(wq_head, condition); \
        else                                                            \
                __ret = 0;      /* already true: nothing to wait for */ \
        __ret;                                                          \
})

/*
 * Waiting loop behind wait_event_interruptible_timeout(): like
 * __wait_event_timeout() but in state TASK_INTERRUPTIBLE.  The __ret named
 * here is the variable that ___wait_event() declares; it starts at @timeout
 * and is overwritten with the return value of schedule_timeout() on every
 * iteration.
 */
#define __wait_event_interruptible_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_INTERRUPTIBLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
 * interrupted by a signal.
 */
#define wait_event_interruptible_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_interruptible_timeout(wq_head,                \
                                                condition, timeout);                \
        __ret;                                                                        \
})

/*
 * Waiting loop shared by wait_event_hrtimeout() and
 * wait_event_interruptible_hrtimeout().
 *
 * An on-stack hrtimer_sleeper is initialised for CLOCK_MONOTONIC in
 * HRTIMER_MODE_REL.  It is only armed when @timeout is not KTIME_MAX, using
 * current->timer_slack_ns as the expiry range.  Note that @timeout is
 * evaluated twice in that case.
 *
 * The statements passed as ___wait_event()'s cmd run on every iteration in
 * which @condition is false: when __t.task is NULL they store -ETIME and
 * 'break' out of the loop, otherwise they call schedule().  That
 * "__ret = -ETIME" assigns the __ret declared inside ___wait_event() (the
 * explicit shadow), which then becomes the value of ___wait_event().
 * NOTE(review): presumably the sleeper's expiry handler is what clears
 * __t.task -- confirm.
 *
 * The timer is cancelled and destroyed before the result is returned.
 */
#define __wait_event_hrtimeout(wq_head, condition, timeout, state)                \
({                                                                                \
        int __ret = 0;                                                                \
        struct hrtimer_sleeper __t;                                                \
                                                                                \
        hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC,                        \
                                      HRTIMER_MODE_REL);                        \
        if ((timeout) != KTIME_MAX) {                                                \
                hrtimer_set_expires_range_ns(&__t.timer, timeout,                \
                                        current->timer_slack_ns);                \
                hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL);                \
        }                                                                        \
                                                                                \
        __ret = ___wait_event(wq_head, condition, state, 0, 0,                        \
                if (!__t.task) {                                                \
                        __ret = -ETIME;                                                \
                        break;                                                        \
                }                                                                \
                schedule());                                                        \
                                                                                \
        hrtimer_cancel(&__t.timer);                                                \
        destroy_hrtimer_on_stack(&__t.timer);                                        \
        __ret;                                                                        \
})

/**
 * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, as a ktime_t
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true or the @timeout elapses.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function returns 0 if @condition became true, or -ETIME if the timeout
 * elapsed.
 */
#define wait_event_hrtimeout(wq_head, condition, timeout)               \
({                                                                      \
        int __ret;                                                      \
                                                                        \
        might_sleep();                                                  \
        if (!(condition))                                               \
                __ret = __wait_event_hrtimeout(wq_head, condition,      \
                                               timeout,                 \
                                               TASK_UNINTERRUPTIBLE);   \
        else                                                            \
                __ret = 0;      /* already true: nothing to wait for */ \
        __ret;                                                          \
})

/**
 * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, as a ktime_t
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function returns 0 if @condition became true, -ERESTARTSYS if it was
 * interrupted by a signal, or -ETIME if the timeout elapsed.
 */
#define wait_event_interruptible_hrtimeout(wq, condition, timeout)      \
({                                                                      \
        long __ret;                                                     \
                                                                        \
        might_sleep();                                                  \
        if (!(condition))                                               \
                __ret = __wait_event_hrtimeout(wq, condition, timeout,  \
                                               TASK_INTERRUPTIBLE);     \
        else                                                            \
                __ret = 0;      /* already true: nothing to wait for */ \
        __ret;                                                          \
})

/*
 * Waiting loop behind wait_event_interruptible_exclusive(): an exclusive
 * TASK_INTERRUPTIBLE ___wait_event() that calls schedule() between checks of
 * @condition.  Evaluates to the ___wait_event() result.
 */
#define __wait_event_interruptible_exclusive(wq, condition)                        \
        ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0,                        \
                      schedule())

/*
 * Like wait_event_interruptible(), except that the waiting loop is
 * __wait_event_interruptible_exclusive(), which waits with the exclusive
 * flag set.  Evaluates to 0 if @condition was already true, otherwise to the
 * result of the waiting loop.
 */
#define wait_event_interruptible_exclusive(wq, condition)               \
({                                                                      \
        int __ret;                                                      \
                                                                        \
        might_sleep();                                                  \
        if (!(condition))                                               \
                __ret = __wait_event_interruptible_exclusive(wq,        \
                                                        condition);     \
        else                                                            \
                __ret = 0;                                              \
        __ret;                                                          \
})

/*
 * Waiting loop behind wait_event_killable_exclusive(): an exclusive
 * TASK_KILLABLE ___wait_event() that calls schedule() between checks of
 * @condition.  Evaluates to the ___wait_event() result.
 */
#define __wait_event_killable_exclusive(wq, condition)                                \
        ___wait_event(wq, condition, TASK_KILLABLE, 1, 0,                        \
                      schedule())

/*
 * Exclusive wait in TASK_KILLABLE state.  Evaluates to 0 if @condition was
 * already true, otherwise to the result of
 * __wait_event_killable_exclusive().
 */
#define wait_event_killable_exclusive(wq, condition)                    \
({                                                                      \
        int __ret;                                                      \
                                                                        \
        might_sleep();                                                  \
        if (!(condition))                                               \
                __ret = __wait_event_killable_exclusive(wq, condition); \
        else                                                            \
                __ret = 0;                                              \
        __ret;                                                          \
})


/*
 * Waiting loop behind wait_event_freezable_exclusive(): an exclusive
 * ___wait_event() in state (TASK_INTERRUPTIBLE|TASK_FREEZABLE) that calls
 * schedule() between checks of @condition.  Evaluates to the ___wait_event()
 * result.
 */
#define __wait_event_freezable_exclusive(wq, condition)                                \
        ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\
                        schedule())

/*
 * Like wait_event_freezable(), except that the waiting loop is
 * __wait_event_freezable_exclusive(), which waits with the exclusive flag
 * set.  Evaluates to 0 if @condition was already true, otherwise to the
 * result of the waiting loop.
 */
#define wait_event_freezable_exclusive(wq, condition)                   \
({                                                                      \
        int __ret;                                                      \
                                                                        \
        might_sleep();                                                  \
        if (!(condition))                                               \
                __ret = __wait_event_freezable_exclusive(wq, condition);\
        else                                                            \
                __ret = 0;                                              \
        __ret;                                                          \
})

/**
 * wait_event_idle - wait for a condition without contributing to system load
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 */
#define wait_event_idle(wq_head, condition)                                        \
do {                                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, schedule());        \
} while (0)

/**
 * wait_event_idle_exclusive - wait for a condition without contributing to system load
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag
 * set thus if other processes wait on the same list, when this
 * process is woken further processes are not considered.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 */
#define wait_event_idle_exclusive(wq_head, condition)                                \
do {                                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, schedule());        \
} while (0)

/*
 * Waiting loop behind wait_event_idle_timeout(): like __wait_event_timeout()
 * but in state TASK_IDLE.  The __ret named here is the variable that
 * ___wait_event() declares; it starts at @timeout and is overwritten with
 * the return value of schedule_timeout() on every iteration.
 */
#define __wait_event_idle_timeout(wq_head, condition, timeout)                        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_IDLE, 0, timeout,                                        \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_idle_timeout(wq_head, condition, timeout)                        \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_idle_timeout(wq_head, condition, timeout);        \
        __ret;                                                                        \
})

/*
 * Waiting loop behind wait_event_idle_exclusive_timeout(): like
 * __wait_event_idle_timeout() but with the exclusive argument of
 * ___wait_event() set to 1.  The __ret named here is the variable that
 * ___wait_event() declares; it starts at @timeout and is overwritten with
 * the return value of schedule_timeout() on every iteration.
 */
#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_IDLE, 1, timeout,                                        \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag
 * set thus if other processes wait on the same list, when this
 * process is woken further processes are not considered.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_idle_exclusive_timeout(wq_head, condition, timeout);\
        __ret;                                                                        \
})

/*
 * Wait helpers defined elsewhere.  The wait_event_interruptible*_locked()
 * and *_locked_irq() macros in this file pass them as the @fn argument of
 * __wait_event_interruptible_locked(), which stops waiting as soon as the
 * helper returns non-zero.
 */
extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *);
extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);

/*
 * Common body of the wait_event_interruptible*_locked*() macros.
 *
 * @wq:        the waitqueue head (an lvalue; its address is taken)
 * @condition: expression checked after every successful call of @fn
 * @exclusive: non-zero to set WQ_FLAG_EXCLUSIVE on the wait entry
 * @fn:        wait helper called as fn(&wq, &entry); this file passes
 *             do_wait_intr or do_wait_intr_irq
 *
 * Calls @fn until it returns non-zero or @condition becomes true, then
 * removes the entry from @wq and sets the task state back to TASK_RUNNING.
 * Evaluates to the last value returned by @fn.
 */
#define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \
({                                                                      \
        int __ret;                                                      \
        DEFINE_WAIT(__wait);                                            \
                                                                        \
        if (exclusive)                                                  \
                __wait.flags |= WQ_FLAG_EXCLUSIVE;                      \
        for (;;) {                                                      \
                __ret = fn(&(wq), &__wait);                             \
                /* @condition is only looked at when @fn returned 0 */  \
                if (__ret || (condition))                               \
                        break;                                          \
        }                                                               \
        __remove_wait_queue(&(wq), &__wait);                            \
        __set_current_state(TASK_RUNNING);                              \
        __ret;                                                          \
})


/**
 * wait_event_interruptible_locked - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock()/spin_unlock()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_locked(wq, condition)                  \
        (!(condition)                                                   \
         ? __wait_event_interruptible_locked(wq, condition, 0,          \
                                             do_wait_intr)              \
         : 0)

/**
 * wait_event_interruptible_locked_irq - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_locked_irq(wq, condition)              \
        (!(condition)                                                   \
         ? __wait_event_interruptible_locked(wq, condition, 0,          \
                                             do_wait_intr_irq)          \
         : 0)

/**
 * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock()/spin_unlock()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
 * set, thus if other processes wait on the same list, when this
 * process is woken further processes are not considered.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_exclusive_locked(wq, condition)                \
        ((condition)                                                                \
         ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr))

/**
 * wait_event_interruptible_exclusive_locked_irq - sleep exclusively until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock held.  The spinlock is released
 * while sleeping, but @condition is always tested with the lock held,
 * and the lock is held again when this macro exits.
 *
 * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * The process is put on the wait queue with the WQ_FLAG_EXCLUSIVE flag
 * set, so that when other processes are waiting on the list and this
 * process is woken up, no further processes are considered.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_exclusive_locked_irq(wq, condition)                \
        ((condition)                                                                \
         ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq))


/*
 * Slow path of wait_event_killable(), which only expands this after its
 * own first test of @condition failed.  Hands @wq and @condition to
 * ___wait_event() with TASK_KILLABLE as the sleep state and schedule()
 * as the sleeping step.
 *
 * NOTE(review): the two literal 0 arguments are presumably "not exclusive"
 * and the initial return value - confirm against ___wait_event().
 */
#define __wait_event_killable(wq, condition)                                        \
        ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule())

/**
 * wait_event_killable - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_KILLABLE) until the
 * @condition evaluates to true or a kill signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * @condition is tested once before sleeping; if it already holds, the
 * macro yields 0 without queueing on @wq_head.  Because @condition is
 * expanded more than once it may be evaluated several times.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a
 * kill signal and 0 if @condition evaluated to true.
 */
#define wait_event_killable(wq_head, condition)                                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_killable(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_state(): same shape as __wait_event_killable()
 * but with the sleep state supplied by the caller in @state instead of
 * being fixed.  schedule() is passed as the sleeping step.
 *
 * NOTE(review): the two literal 0 arguments are presumably "not exclusive"
 * and the initial return value - confirm against ___wait_event().
 */
#define __wait_event_state(wq, condition, state)                                \
        ___wait_event(wq, condition, state, 0, 0, schedule())

/**
 * wait_event_state - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @state: state to sleep in
 *
 * The process is put to sleep (@state) until the @condition evaluates to true
 * or a signal is received (when allowed by @state).  The @condition is checked
 * each time the waitqueue @wq_head is woken up.
 *
 * @condition is tested once before sleeping; if it already holds, the
 * macro yields 0 without queueing on @wq_head.  Because @condition is
 * expanded more than once it may be evaluated several times.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * (when allowed by @state) and 0 if @condition evaluated to true.
 */
#define wait_event_state(wq_head, condition, state)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_state(wq_head, condition, state);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_killable_timeout().  The condition is wrapped in
 * ___wait_cond_timeout() and @timeout is handed to ___wait_event() ahead of
 * the sleeping step, which stores the result of schedule_timeout() back
 * into __ret.
 *
 * NOTE(review): __ret here is not declared by this macro; it presumably
 * names a variable inside ___wait_event() that starts out as @timeout -
 * confirm against that macro before renaming anything.
 */
#define __wait_event_killable_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_KILLABLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_KILLABLE) until the
 * @condition evaluates to true or a kill signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
 * interrupted by a kill signal.
 *
 * Only kill signals interrupt this process.
 */
#define wait_event_killable_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_killable_timeout(wq_head,                        \
                                                condition, timeout);                \
        __ret;                                                                        \
})


/*
 * Slow path of wait_event_lock_irq() and wait_event_lock_irq_cmd().
 *
 * Sleeps in TASK_UNINTERRUPTIBLE via ___wait_event().  The sleeping step
 * releases @lock with spin_unlock_irq(), runs @cmd (which may be empty),
 * calls schedule() and then re-takes @lock with spin_lock_irq().
 *
 * The result of ___wait_event() is discarded by the (void) cast, so this
 * macro yields no value.
 */
#define __wait_event_lock_irq(wq_head, condition, lock, cmd)                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            spin_unlock_irq(&lock);                                \
                            cmd;                                                \
                            schedule();                                                \
                            spin_lock_irq(&lock))

/**
 * wait_event_lock_irq_cmd - sleep until a condition gets true. The
 *                             condition is checked under the lock. This
 *                             is expected to be called with the lock
 *                             taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before cmd
 *          and schedule() and reacquired afterwards.
 * @cmd: a command which is invoked outside the critical section before
 *         sleep
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before invoking the cmd and going to sleep and is reacquired
 * afterwards.
 *
 * This macro is a statement, not an expression: it does not produce a
 * return value.  If @condition already holds, neither @cmd nor the
 * unlock/lock pair is executed.
 */
#define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd)                        \
do {                                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event_lock_irq(wq_head, condition, lock, cmd);                        \
} while (0)

/**
 * wait_event_lock_irq - sleep until a condition gets true. The
 *                         condition is checked under the lock. This
 *                         is expected to be called with the lock
 *                         taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 */
#define wait_event_lock_irq(wq_head, condition, lock)                                \
do {                                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event_lock_irq(wq_head, condition, lock, );                        \
} while (0)


/*
 * Slow path of wait_event_interruptible_lock_irq() and
 * wait_event_interruptible_lock_irq_cmd().
 *
 * Sleeps in TASK_INTERRUPTIBLE via ___wait_event().  The sleeping step
 * releases @lock with spin_unlock_irq(), runs @cmd (which may be empty),
 * calls schedule() and then re-takes @lock with spin_lock_irq().
 *
 * Unlike __wait_event_lock_irq() the value of ___wait_event() is kept, so
 * callers can assign the result.
 */
#define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd)        \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                      spin_unlock_irq(&lock);                                        \
                      cmd;                                                        \
                      schedule();                                                \
                      spin_lock_irq(&lock))

/**
 * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
 *                The condition is checked under the lock. This is expected to
 *                be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before cmd and
 *          schedule() and reacquired afterwards.
 * @cmd: a command which is invoked outside the critical section before
 *         sleep
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before invoking the cmd and going to sleep and is reacquired
 * afterwards.  If @condition already holds on entry, neither @cmd nor
 * the unlock/lock pair is executed.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd)        \
({                                                                                \
        int __ret = 0;                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_lock_irq(wq_head,                \
                                                condition, lock, cmd);                \
        __ret;                                                                        \
})

/**
 * wait_event_interruptible_lock_irq - sleep until a condition gets true.
 *                The condition is checked under the lock. This is expected
 *                to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_lock_irq(wq_head, condition, lock)                \
({                                                                                \
        int __ret = 0;                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_lock_irq(wq_head,                \
                                                condition, lock,);                \
        __ret;                                                                        \
})

/*
 * Slow path shared by wait_event_lock_irq_timeout() and
 * wait_event_interruptible_lock_irq_timeout().
 *
 * Sleeps in @state via ___wait_event(), with the condition wrapped in
 * ___wait_cond_timeout() and @timeout passed ahead of the sleeping step.
 * That step releases @lock with spin_unlock_irq(), stores the result of
 * schedule_timeout() back into __ret and re-takes @lock with
 * spin_lock_irq().
 *
 * The definition deliberately carries no trailing semicolon, so the
 * expansion is a single expression and callers supply their own
 * terminator (as both users of this macro already do).
 *
 * NOTE(review): __ret is not declared here; it presumably names a variable
 * inside ___wait_event() - confirm against that macro before renaming.
 */
#define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state)        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      state, 0, timeout,                                        \
                      spin_unlock_irq(&lock);                                        \
                      __ret = schedule_timeout(__ret);                                \
                      spin_lock_irq(&lock))

/**
 * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
 *                true or a timeout elapses. The condition is checked under
 *                the lock. This is expected to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true, a signal is received, or the @timeout
 * elapses. The @condition is checked each time the waitqueue @wq_head
 * is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * The macro returns 0 if the @timeout elapsed, -ERESTARTSYS if it
 * was interrupted by a signal, and the remaining jiffies otherwise
 * if the condition evaluated to true before the timeout elapsed.
 */
#define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock,        \
                                                  timeout)                        \
({                                                                                \
        long __ret = timeout;                                                        \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_lock_irq_timeout(                                \
                                        wq_head, condition, lock, timeout,        \
                                        TASK_INTERRUPTIBLE);                        \
        __ret;                                                                        \
})

/*
 * wait_event_lock_irq_timeout - uninterruptible counterpart of
 * wait_event_interruptible_lock_irq_timeout().
 *
 * Takes the same @wq_head, @condition, @lock and @timeout (in jiffies)
 * and expands to the same helper, but sleeps in TASK_UNINTERRUPTIBLE
 * instead of TASK_INTERRUPTIBLE.  Like its sibling it is written to be
 * called with @lock held; the helper drops the lock around
 * schedule_timeout() and re-takes it afterwards.
 *
 * Yields a long.  NOTE(review): the value presumably follows the same
 * convention as the interruptible variant (0 on timeout, remaining
 * jiffies if @condition became true first) without the -ERESTARTSYS
 * case - confirm against ___wait_event()/___wait_cond_timeout().
 */
#define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_lock_irq_timeout(                                \
                                        wq_head, condition, lock, timeout,        \
                                        TASK_UNINTERRUPTIBLE);                        \
        __ret;                                                                        \
})

/*
 * Waitqueues which are removed from the waitqueue_head at wakeup time
 *
 * Declarations only; the implementations are not part of this header.
 * The first four helpers take both the wait_queue_head and the
 * wait_queue_entry being queued on it; wait_woken() takes only the entry.
 * NOTE(review): return-value semantics are not visible from here - consult
 * the implementations before relying on them.
 */
void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
/*
 * Wake callbacks sharing one signature.  autoremove_wake_function is the
 * callback that DEFINE_WAIT() and init_wait() install in the entry's
 * ->func member.
 */
int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);

/*
 * Define a struct wait_queue_entry variable called @name whose ->private
 * is the current task, whose wake callback ->func is @function, and whose
 * list linkage ->entry is set up with LIST_HEAD_INIT().
 *
 * Members not named in the initializer (e.g. ->flags, which init_wait()
 * sets explicitly) receive the usual zero value from the designated
 * initializer.
 */
#define DEFINE_WAIT_FUNC(name, function)                                        \
        struct wait_queue_entry name = {                                        \
                .private        = current,                                        \
                .func                = function,                                        \
                .entry                = LIST_HEAD_INIT((name).entry),                        \
        }

/* Shorthand for DEFINE_WAIT_FUNC() with autoremove_wake_function as callback. */
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)

/*
 * Run-time initialisation of an existing wait queue entry.  @wait is a
 * pointer to the entry.  Sets ->private to the current task, installs
 * autoremove_wake_function as ->func, re-initialises the ->entry list
 * linkage and clears ->flags.
 */
#define init_wait(wait)                                                                \
        do {                                                                        \
                (wait)->private = current;                                        \
                (wait)->func = autoremove_wake_function;                        \
                INIT_LIST_HEAD(&(wait)->entry);                                        \
                (wait)->flags = 0;                                                \
        } while (0)

/*
 * task_call_f: callback type taking a task and an opaque @arg, returning int.
 * task_call_func() takes a task, such a callback and the opaque argument.
 * NOTE(review): presumably it invokes @func on @p and passes its result
 * back; locking and context guarantees are not visible from this header.
 */
typedef int (*task_call_f)(struct task_struct *p, void *arg);
extern int task_call_func(struct task_struct *p, task_call_f func, void *arg);

#endif /* _LINUX_WAIT_H */






























    1 































    1 













    1 







    1 










    2 




    1 





    1 








    1 














    1 






    1 



    1 






    1 



































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "tctx.h"

static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
                                        struct task_struct *task)
{
        struct io_wq_hash *hash;
        struct io_wq_data data;
        unsigned int concurrency;

        mutex_lock(&ctx->uring_lock);
        hash = ctx->hash_map;
        if (!hash) {
                hash = kzalloc(sizeof(*hash), GFP_KERNEL);
                if (!hash) {
                        mutex_unlock(&ctx->uring_lock);
                        return ERR_PTR(-ENOMEM);
                }
                refcount_set(&hash->refs, 1);
                init_waitqueue_head(&hash->wait);
                ctx->hash_map = hash;
        }
        mutex_unlock(&ctx->uring_lock);

        data.hash = hash;
        data.task = task;
        data.free_work = io_wq_free_work;
        data.do_work = io_wq_submit_work;

        /* Do QD, or 4 * CPUS, whatever is smallest */
        concurrency = min(ctx->sq_entries, 4 * num_online_cpus());

        return io_wq_create(concurrency, &data);
}

/*
 * Free the io_uring task context hanging off @tsk and clear the pointer.
 *
 * Warns (once each) if the context still has xarray entries, an io_wq or
 * cached references at this point, then tears it down regardless.
 */
void __io_uring_free(struct task_struct *tsk)
{
        struct io_uring_task *uring_task = tsk->io_uring;

        /* Nothing should still be attached by the time we get here. */
        WARN_ON_ONCE(!xa_empty(&uring_task->xa));
        WARN_ON_ONCE(uring_task->io_wq);
        WARN_ON_ONCE(uring_task->cached_refs);

        percpu_counter_destroy(&uring_task->inflight);
        kfree(uring_task);
        tsk->io_uring = NULL;
}

/*
 * Allocate and initialise an io_uring task context for @task, including
 * its inflight counter and io-wq offload pool (created for @ctx), and
 * attach it as task->io_uring.
 *
 * Returns 0 on success or a negative errno; on failure everything that
 * was set up so far is released and task->io_uring is left untouched.
 */
__cold int io_uring_alloc_task_context(struct task_struct *task,
                                       struct io_ring_ctx *ctx)
{
        struct io_uring_task *new_tctx;
        int err;

        new_tctx = kzalloc(sizeof(*new_tctx), GFP_KERNEL);
        if (unlikely(!new_tctx))
                return -ENOMEM;

        err = percpu_counter_init(&new_tctx->inflight, 0, GFP_KERNEL);
        if (unlikely(err))
                goto free_tctx;

        new_tctx->io_wq = io_init_wq_offload(ctx, task);
        if (IS_ERR(new_tctx->io_wq)) {
                err = PTR_ERR(new_tctx->io_wq);
                goto destroy_counter;
        }

        xa_init(&new_tctx->xa);
        init_waitqueue_head(&new_tctx->wait);
        atomic_set(&new_tctx->in_cancel, 0);
        atomic_set(&new_tctx->inflight_tracked, 0);
        task->io_uring = new_tctx;
        init_llist_head(&new_tctx->task_list);
        init_task_work(&new_tctx->task_work, tctx_task_work);
        return 0;

destroy_counter:
        percpu_counter_destroy(&new_tctx->inflight);
free_tctx:
        kfree(new_tctx);
        return err;
}

/*
 * Make sure the current task has an io_uring task context and that @ctx
 * is recorded in it.
 *
 * A missing task context is allocated first; if the ring has io-wq limits
 * set they are applied to the freshly created io_wq.  A node linking the
 * task to @ctx is then stored in the task's xarray (keyed by the ctx
 * pointer) and added to ctx->tctx_list under ctx->uring_lock, unless one
 * is already present.
 *
 * Returns 0 on success or a negative errno.
 */
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
        const unsigned long key = (unsigned long)ctx;
        struct io_uring_task *tctx = current->io_uring;
        struct io_tctx_node *new_node;
        int err;

        if (unlikely(!tctx)) {
                err = io_uring_alloc_task_context(current, ctx);
                if (unlikely(err))
                        return err;

                tctx = current->io_uring;
                if (ctx->iowq_limits_set) {
                        unsigned int limits[2];

                        limits[0] = ctx->iowq_limits[0];
                        limits[1] = ctx->iowq_limits[1];

                        err = io_wq_max_workers(tctx->io_wq, limits);
                        if (err)
                                return err;
                }
        }

        /* This ring is already tracked for the current task. */
        if (xa_load(&tctx->xa, key))
                return 0;

        new_node = kmalloc(sizeof(*new_node), GFP_KERNEL);
        if (!new_node)
                return -ENOMEM;
        new_node->ctx = ctx;
        new_node->task = current;

        err = xa_err(xa_store(&tctx->xa, key, new_node, GFP_KERNEL));
        if (err) {
                kfree(new_node);
                return err;
        }

        mutex_lock(&ctx->uring_lock);
        list_add(&new_node->ctx_node, &ctx->tctx_list);
        mutex_unlock(&ctx->uring_lock);

        return 0;
}

/*
 * Submission-side wrapper around __io_uring_add_tctx_node().
 *
 * For rings created with IORING_SETUP_SINGLE_ISSUER, any task other than
 * ctx->submitter_task is refused with -EEXIST.  On success @ctx is also
 * remembered as the current task's ->last ring.
 *
 * Returns 0 on success or a negative errno.
 */
int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx)
{
        int err;

        if ((ctx->flags & IORING_SETUP_SINGLE_ISSUER) &&
            current != ctx->submitter_task)
                return -EEXIST;

        err = __io_uring_add_tctx_node(ctx);
        if (!err)
                current->io_uring->last = ctx;

        return err;
}

/*
 * Remove this io_uring_file -> task mapping.
 */
__cold void io_uring_del_tctx_node(unsigned long index)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_tctx_node *node;

        if (!tctx)
                return;
        node = xa_erase(&tctx->xa, index);
        if (!node)
                return;

        WARN_ON_ONCE(current != node->task);
        WARN_ON_ONCE(list_empty(&node->ctx_node));

        mutex_lock(&node->ctx->uring_lock);
        list_del(&node->ctx_node);
        mutex_unlock(&node->ctx->uring_lock);

        if (tctx->last == node->ctx)
                tctx->last = NULL;
        kfree(node);
}

__cold void io_uring_clean_tctx(struct io_uring_task *tctx)
{
        struct io_wq *wq = tctx->io_wq;
        struct io_tctx_node *node;
        unsigned long index;

        xa_for_each(&tctx->xa, index, node) {
                io_uring_del_tctx_node(index);
                cond_resched();
        }
        if (wq) {
                /*
                 * Must be after io_uring_del_tctx_node() (removes nodes under
                 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
                 */
                io_wq_put_and_exit(wq);
                tctx->io_wq = NULL;
        }
}

/*
 * Drop the file reference held by every occupied registered-ring slot of
 * the current task and mark the slots empty.
 */
void io_uring_unreg_ringfd(void)
{
        struct io_uring_task *tctx = current->io_uring;
        int slot;

        for (slot = 0; slot < IO_RINGFD_REG_MAX; slot++) {
                struct file *ring_file = tctx->registered_rings[slot];

                if (!ring_file)
                        continue;

                fput(ring_file);
                tctx->registered_rings[slot] = NULL;
        }
}

/*
 * Store @file in the first free registered-ring slot of @tctx within
 * [@start, @end).
 *
 * Returns the index of the slot used, or -EBUSY if every slot in the
 * range is occupied.  No file reference is taken or dropped here.
 */
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
                                     int start, int end)
{
        int slot = start;

        while (slot < end) {
                /* Clamp the index before it is used to address the table. */
                slot = array_index_nospec(slot, IO_RINGFD_REG_MAX);
                if (!tctx->registered_rings[slot]) {
                        tctx->registered_rings[slot] = file;
                        return slot;
                }
                slot++;
        }

        return -EBUSY;
}

/*
 * Look up @fd, check that it refers to an io_uring file and register it
 * in a free slot of @tctx within [@start, @end).
 *
 * Returns the slot index on success.  On failure the file reference taken
 * here is dropped again and a negative errno is returned: -EBADF for a
 * bad descriptor, -EOPNOTSUPP for a non-io_uring file, or the error from
 * io_ring_add_registered_file().
 */
static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
                                     int start, int end)
{
        struct file *ring_file = fget(fd);
        int slot;

        if (!ring_file)
                return -EBADF;

        if (io_is_uring_fops(ring_file))
                slot = io_ring_add_registered_file(tctx, ring_file, start, end);
        else
                slot = -EOPNOTSUPP;

        /* Any failure past fget() must give the reference back. */
        if (slot < 0)
                fput(ring_file);
        return slot;
}

/*
 * Register ring fds so that io_uring_enter() can skip fdget/fdput on
 * every call. @__arg points to @nr_args entries of
 * struct io_uring_rsrc_update: ->data carries the ring fd and ->offset the
 * wanted slot, or -1U to let the kernel pick any free slot. The slot
 * actually used is written back to ->offset.
 *
 * Returns the number of entries successfully processed, or < 0 on error
 * if none were processed.
 */
int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
                       unsigned nr_args)
{
        struct io_uring_rsrc_update __user *uarg = __arg;
        struct io_uring_rsrc_update upd;
        struct io_uring_task *tctx;
        int err, i;

        if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
                return -EINVAL;

        /*
         * __io_uring_add_tctx_node() acquires ctx->uring_lock on its own,
         * so release it around the call and take it back afterwards.
         */
        mutex_unlock(&ctx->uring_lock);
        err = __io_uring_add_tctx_node(ctx);
        mutex_lock(&ctx->uring_lock);
        if (err)
                return err;

        tctx = current->io_uring;
        for (i = 0; i < nr_args; i++) {
                int first, limit;

                if (copy_from_user(&upd, &uarg[i], sizeof(upd))) {
                        err = -EFAULT;
                        break;
                }

                /* Reserved field set, or an explicit slot out of range. */
                if (upd.resv ||
                    (upd.offset != -1U && upd.offset >= IO_RINGFD_REG_MAX)) {
                        err = -EINVAL;
                        break;
                }

                if (upd.offset == -1U) {
                        /* Caller has no preference: search the whole table. */
                        first = 0;
                        limit = IO_RINGFD_REG_MAX;
                } else {
                        /* Caller asked for exactly one slot. */
                        first = upd.offset;
                        limit = first + 1;
                }

                err = io_ring_add_registered_fd(tctx, upd.data, first, limit);
                if (err < 0)
                        break;

                /* Report the chosen slot; undo the registration on failure. */
                upd.offset = err;
                if (copy_to_user(&uarg[i], &upd, sizeof(upd))) {
                        fput(tctx->registered_rings[upd.offset]);
                        tctx->registered_rings[upd.offset] = NULL;
                        err = -EFAULT;
                        break;
                }
        }

        return i ? i : err;
}

/*
 * Unregister previously registered ring fds of the current task.
 * @__arg points to @nr_args entries of struct io_uring_rsrc_update; for
 * each one ->offset names the slot to release, while ->resv and ->data
 * must be zero. Empty slots are skipped silently.
 *
 * Returns the number of entries processed before stopping, or a negative
 * errno if the very first entry failed. Returns 0 when the task has no
 * io_uring context at all.
 */
int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
                         unsigned nr_args)
{
        struct io_uring_rsrc_update __user *uarg = __arg;
        struct io_uring_task *tctx = current->io_uring;
        struct io_uring_rsrc_update upd;
        int err = 0, i;

        if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
                return -EINVAL;
        if (!tctx)
                return 0;

        for (i = 0; i < nr_args; i++) {
                struct file *ring_file;

                if (copy_from_user(&upd, &uarg[i], sizeof(upd))) {
                        err = -EFAULT;
                        break;
                }
                if (upd.resv || upd.data || upd.offset >= IO_RINGFD_REG_MAX) {
                        err = -EINVAL;
                        break;
                }

                /* Clamp the index before it is used to address the table. */
                upd.offset = array_index_nospec(upd.offset, IO_RINGFD_REG_MAX);
                ring_file = tctx->registered_rings[upd.offset];
                if (!ring_file)
                        continue;

                fput(ring_file);
                tctx->registered_rings[upd.offset] = NULL;
        }

        return i ? i : err;
}











    6 






























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM percpu

#if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PERCPU_H

#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

/*
 * percpu_alloc_percpu - tracepoint describing one percpu allocation.
 *
 * Records the call site, the reserved/is_atomic flags, the requested size and
 * alignment, the base address and offset, the resulting percpu pointer, the
 * number of bytes allocated and the gfp flags.  gfp_flags is stored as a plain
 * unsigned long and rendered symbolically with show_gfp_flags() when printed.
 * NOTE(review): presumably emitted on the successful allocation path; confirm
 * at the call site.
 */
TRACE_EVENT(percpu_alloc_percpu,

        TP_PROTO(unsigned long call_site,
                 bool reserved, bool is_atomic, size_t size,
                 size_t align, void *base_addr, int off,
                 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),

        TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off,
                ptr, bytes_alloc, gfp_flags),

        TP_STRUCT__entry(
                __field(        unsigned long,                call_site        )
                __field(        bool,                        reserved        )
                __field(        bool,                        is_atomic        )
                __field(        size_t,                        size                )
                __field(        size_t,                        align                )
                __field(        void *,                        base_addr        )
                __field(        int,                        off                )
                __field(        void __percpu *,        ptr                )
                __field(        size_t,                        bytes_alloc        )
                __field(        unsigned long,                gfp_flags        )
        ),
        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->reserved        = reserved;
                __entry->is_atomic        = is_atomic;
                __entry->size                = size;
                __entry->align                = align;
                __entry->base_addr        = base_addr;
                __entry->off                = off;
                __entry->ptr                = ptr;
                __entry->bytes_alloc        = bytes_alloc;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
        ),

        TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s",
                  (void *)__entry->call_site,
                  __entry->reserved, __entry->is_atomic,
                  __entry->size, __entry->align,
                  __entry->base_addr, __entry->off, __entry->ptr,
                  __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags))
);

/*
 * percpu_free_percpu - tracepoint recording a base address, an offset and
 * the percpu pointer, printed as "base_addr=... off=... ptr=...".
 * NOTE(review): presumably emitted when a percpu allocation is freed; confirm
 * at the call site.
 */
TRACE_EVENT(percpu_free_percpu,

        TP_PROTO(void *base_addr, int off, void __percpu *ptr),

        TP_ARGS(base_addr, off, ptr),

        TP_STRUCT__entry(
                __field(        void *,                        base_addr        )
                __field(        int,                        off                )
                __field(        void __percpu *,        ptr                )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
                __entry->off                = off;
                __entry->ptr                = ptr;
        ),

        TP_printk("base_addr=%p off=%d ptr=%p",
                __entry->base_addr, __entry->off, __entry->ptr)
);

/*
 * percpu_alloc_percpu_fail - tracepoint recording the parameters of a percpu
 * allocation request: the reserved and is_atomic flags plus the requested
 * size and alignment.
 * NOTE(review): the name suggests it is emitted when the allocation fails;
 * confirm at the call site.
 */
TRACE_EVENT(percpu_alloc_percpu_fail,

        TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align),

        TP_ARGS(reserved, is_atomic, size, align),

        TP_STRUCT__entry(
                __field(        bool,        reserved        )
                __field(        bool,        is_atomic        )
                __field(        size_t,        size                )
                __field(        size_t, align                )
        ),

        TP_fast_assign(
                __entry->reserved        = reserved;
                __entry->is_atomic        = is_atomic;
                __entry->size                = size;
                __entry->align                = align;
        ),

        TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu",
                  __entry->reserved, __entry->is_atomic,
                  __entry->size, __entry->align)
);

/*
 * percpu_create_chunk - tracepoint recording a single base address, printed
 * as "base_addr=...".
 * NOTE(review): presumably emitted when a percpu chunk is created; confirm at
 * the call site.
 */
TRACE_EVENT(percpu_create_chunk,

        TP_PROTO(void *base_addr),

        TP_ARGS(base_addr),

        TP_STRUCT__entry(
                __field(        void *, base_addr        )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
        ),

        TP_printk("base_addr=%p", __entry->base_addr)
);

/*
 * percpu_destroy_chunk - tracepoint recording a single base address, printed
 * as "base_addr=...".
 * NOTE(review): presumably emitted when a percpu chunk is destroyed; confirm
 * at the call site.
 */
TRACE_EVENT(percpu_destroy_chunk,

        TP_PROTO(void *base_addr),

        TP_ARGS(base_addr),

        TP_STRUCT__entry(
                __field(        void *,        base_addr        )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
        ),

        TP_printk("base_addr=%p", __entry->base_addr)
);

#endif /* _TRACE_PERCPU_H */

#include <trace/define_trace.h>











































    1 






    1 

















































































    1 



















    1 




















    1 










    1 











    1 






    1 



    1 






























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/minix/bitmap.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Modified for 680x0 by Hamish Macdonald
 * Fixed for 680x0 by Andreas Schwab
 */

/* bitmap.c contains the code that handles the inode and block bitmaps */

#include "minix.h"
#include <linux/buffer_head.h>
#include <linux/bitops.h>
#include <linux/sched.h>

static DEFINE_SPINLOCK(bitmap_lock);

/*
 * Count the clear (free) bits in the bitmap blocks referenced by @map.
 *
 * A bitmap consists of blocks filled with 16-bit words; a set bit means busy
 * and a clear bit means free.  Byte order does not matter here because only
 * the number of zero bits is counted.
 *
 * Every block needed to hold @numbits bits is scanned in full, so bits past
 * @numbits in the final block are included in the total.
 */
static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits)
{
        const unsigned words_per_block = blocksize / 2;
        unsigned nblocks = DIV_ROUND_UP(numbits, blocksize * 8);
        __u32 free_bits = 0;
        unsigned b, w;

        for (b = 0; b < nblocks; b++) {
                const __u16 *word = (__u16 *)map[b]->b_data;

                for (w = 0; w < words_per_block; w++)
                        free_bits += 16 - hweight16(word[w]);
        }

        return free_bits;
}

/*
 * Mark data zone @block of @inode's filesystem as free by clearing its bit in
 * the zone bitmap and dirtying the bitmap buffer.
 *
 * A block outside the data zone range, or one whose bitmap buffer index is
 * beyond s_zmap_blocks, is reported with printk() and otherwise ignored.
 */
void minix_free_block(struct inode *inode, unsigned long block)
{
        struct super_block *sb = inode->i_sb;
        struct minix_sb_info *sbi = minix_sb(sb);
        struct buffer_head *bh;
        /* log2 of the number of bits held by one bitmap block */
        int shift = sb->s_blocksize_bits + 3;
        unsigned long zone, bit, map_idx;

        if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) {
                printk("Trying to free block not in datazone\n");
                return;
        }

        /* Bitmap positions are offset by one from the first data zone. */
        zone = block - sbi->s_firstdatazone + 1;
        bit = zone & ((1<<shift) - 1);
        map_idx = zone >> shift;
        if (map_idx >= sbi->s_zmap_blocks) {
                printk("minix_free_block: nonexistent bitmap buffer\n");
                return;
        }

        bh = sbi->s_zmap[map_idx];
        spin_lock(&bitmap_lock);
        if (!minix_test_and_clear_bit(bit, bh->b_data)) {
                printk("minix_free_block (%s:%lu): bit already cleared\n",
                       sb->s_id, block);
        }
        spin_unlock(&bitmap_lock);
        mark_buffer_dirty(bh);
}

int minix_new_block(struct inode * inode)
{
        struct minix_sb_info *sbi = minix_sb(inode->i_sb);
        int bits_per_zone = 8 * inode->i_sb->s_blocksize;
        int i;

        for (i = 0; i < sbi->s_zmap_blocks; i++) {
                struct buffer_head *bh = sbi->s_zmap[i];
                int j;

                spin_lock(&bitmap_lock);
                j = minix_find_first_zero_bit(bh->b_data, bits_per_zone);
                if (j < bits_per_zone) {
                        minix_set_bit(j, bh->b_data);
                        spin_unlock(&bitmap_lock);
                        mark_buffer_dirty(bh);
                        j += i * bits_per_zone + sbi->s_firstdatazone-1;
                        if (j < sbi->s_firstdatazone || j >= sbi->s_nzones)
                                break;
                        return j;
                }
                spin_unlock(&bitmap_lock);
        }
        return 0;
}

/*
 * Return the free-bit count of the zone bitmap of @sb, shifted left by
 * s_log_zone_size.
 */
unsigned long minix_count_free_blocks(struct super_block *sb)
{
        struct minix_sb_info *sbi = minix_sb(sb);
        u32 bits = sbi->s_nzones - sbi->s_firstdatazone + 1;
        u32 free_zones = count_free(sbi->s_zmap, sb->s_blocksize, bits);

        return free_zones << sbi->s_log_zone_size;
}

/*
 * Read the on-disk V1 inode number @ino of @sb.
 *
 * On success, *@bh receives the buffer holding the inode block (the caller
 * releases it with brelse()) and a pointer to the raw inode inside that
 * buffer is returned.
 *
 * On failure (inode number 0 or above s_ninodes, or the block cannot be
 * read) a message is printed, NULL is returned and *@bh is NULL.  *@bh is
 * cleared up front, as minix_V2_raw_inode() does, so that a caller never
 * sees a stale or uninitialised buffer pointer on the bad-inode-number path.
 */
struct minix_inode *
minix_V1_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh)
{
        int block;
        struct minix_sb_info *sbi = minix_sb(sb);
        struct minix_inode *p;

        *bh = NULL;
        if (!ino || ino > sbi->s_ninodes) {
                printk("Bad inode number on dev %s: %ld is out of range\n",
                       sb->s_id, (long)ino);
                return NULL;
        }
        /* Inode numbers start at 1; make it a zero-based table index. */
        ino--;
        /*
         * The inode table follows two leading blocks and the inode and zone
         * bitmaps.
         */
        block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks +
                 ino / MINIX_INODES_PER_BLOCK;
        *bh = sb_bread(sb, block);
        if (!*bh) {
                printk("Unable to read inode block\n");
                return NULL;
        }
        p = (void *)(*bh)->b_data;
        return p + ino % MINIX_INODES_PER_BLOCK;
}

/*
 * Read the on-disk V2 inode number @ino of @sb.
 *
 * On success, *@bh receives the buffer holding the inode block and a pointer
 * to the raw inode inside it is returned.  On failure (bad inode number or
 * unreadable block) a message is printed, NULL is returned and *@bh is NULL.
 */
struct minix2_inode *
minix_V2_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh)
{
        struct minix_sb_info *sbi = minix_sb(sb);
        int per_block = sb->s_blocksize / sizeof(struct minix2_inode);
        struct minix2_inode *table;
        int block;

        *bh = NULL;
        if (ino == 0 || ino > sbi->s_ninodes) {
                printk("Bad inode number on dev %s: %ld is out of range\n",
                       sb->s_id, (long)ino);
                return NULL;
        }

        /* Inode numbers start at 1; make it a zero-based table index. */
        ino--;
        block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks +
                 ino / per_block;

        *bh = sb_bread(sb, block);
        if (*bh == NULL) {
                printk("Unable to read inode block\n");
                return NULL;
        }

        table = (void *)(*bh)->b_data;
        return &table[ino % per_block];
}

/*
 * Zero the link count and mode in the on-disk copy of a deleted inode.
 *
 * The raw inode is fetched with the V1 or V2 reader depending on
 * INODE_VERSION().  If a buffer was obtained it is marked dirty and released.
 */
static void minix_clear_inode(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        struct buffer_head *bh = NULL;

        if (INODE_VERSION(inode) == MINIX_V1) {
                struct minix_inode *v1;

                v1 = minix_V1_raw_inode(sb, inode->i_ino, &bh);
                if (v1) {
                        v1->i_nlinks = 0;
                        v1->i_mode = 0;
                }
        } else {
                struct minix2_inode *v2;

                v2 = minix_V2_raw_inode(sb, inode->i_ino, &bh);
                if (v2) {
                        v2->i_nlinks = 0;
                        v2->i_mode = 0;
                }
        }

        if (!bh)
                return;

        mark_buffer_dirty(bh);
        brelse(bh);
}

/*
 * Release @inode's number: clear the on-disk inode, then clear its bit in the
 * inode bitmap and dirty the bitmap buffer.
 *
 * An inode number of 0, one above s_ninodes, or one whose bitmap block index
 * is beyond s_imap_blocks is reported with printk() and nothing is changed.
 */
void minix_free_inode(struct inode * inode)
{
        struct super_block *sb = inode->i_sb;
        struct minix_sb_info *sbi = minix_sb(inode->i_sb);
        struct buffer_head *bh;
        /* log2 of the number of bits held by one bitmap block */
        int shift = sb->s_blocksize_bits + 3;
        unsigned long ino = inode->i_ino;
        unsigned long bit, map_idx;

        if (ino == 0 || ino > sbi->s_ninodes) {
                printk("minix_free_inode: inode 0 or nonexistent inode\n");
                return;
        }

        bit = ino & ((1<<shift) - 1);
        map_idx = ino >> shift;
        if (map_idx >= sbi->s_imap_blocks) {
                printk("minix_free_inode: nonexistent imap in superblock\n");
                return;
        }

        /* Wipe the on-disk copy before giving the number back. */
        minix_clear_inode(inode);

        bh = sbi->s_imap[map_idx];
        spin_lock(&bitmap_lock);
        if (!minix_test_and_clear_bit(bit, bh->b_data)) {
                printk("minix_free_inode: bit %lu already cleared\n", bit);
        }
        spin_unlock(&bitmap_lock);
        mark_buffer_dirty(bh);
}

/*
 * Allocate a new in-memory inode on @dir's filesystem and reserve an inode
 * number for it in the inode bitmap.
 *
 * Returns the initialised, hashed and dirtied inode on success.  Returns
 * ERR_PTR(-ENOMEM) if new_inode() fails and ERR_PTR(-ENOSPC) if no clear bit
 * is found, the chosen bit turns out to be set already, or the resulting
 * inode number is 0 or greater than s_ninodes.
 */
struct inode *minix_new_inode(const struct inode *dir, umode_t mode)
{
        struct super_block *sb = dir->i_sb;
        struct minix_sb_info *sbi = minix_sb(sb);
        struct inode *inode = new_inode(sb);
        struct buffer_head * bh;
        int bits_per_zone = 8 * sb->s_blocksize;
        unsigned long j;
        int i;

        if (!inode)
                return ERR_PTR(-ENOMEM);
        /* Start in the "nothing found" state in case there are no imap blocks. */
        j = bits_per_zone;
        bh = NULL;
        spin_lock(&bitmap_lock);
        /* Look for the first clear bit across the inode bitmap blocks. */
        for (i = 0; i < sbi->s_imap_blocks; i++) {
                bh = sbi->s_imap[i];
                j = minix_find_first_zero_bit(bh->b_data, bits_per_zone);
                if (j < bits_per_zone)
                        break;
        }
        if (!bh || j >= bits_per_zone) {
                spin_unlock(&bitmap_lock);
                iput(inode);
                return ERR_PTR(-ENOSPC);
        }
        if (minix_test_and_set_bit(j, bh->b_data)) {        /* shouldn't happen */
                spin_unlock(&bitmap_lock);
                printk("minix_new_inode: bit already set\n");
                iput(inode);
                return ERR_PTR(-ENOSPC);
        }
        spin_unlock(&bitmap_lock);
        mark_buffer_dirty(bh);
        /* Convert the position within block i into a global inode number. */
        j += i * bits_per_zone;
        /* Note: the bit set above is left set on this failure path. */
        if (!j || j > sbi->s_ninodes) {
                iput(inode);
                return ERR_PTR(-ENOSPC);
        }
        inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
        inode->i_ino = j;
        simple_inode_init_ts(inode);
        inode->i_blocks = 0;
        memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u));
        insert_inode_hash(inode);
        mark_inode_dirty(inode);

        return inode;
}

/*
 * Return the free-bit count of the inode bitmap of @sb, scanning enough
 * blocks to cover s_ninodes + 1 bits.
 */
unsigned long minix_count_free_inodes(struct super_block *sb)
{
        struct minix_sb_info *sbi = minix_sb(sb);

        return count_free(sbi->s_imap, sb->s_blocksize, sbi->s_ninodes + 1);
}























































    1 


























    1 



    1 







































    1 









    1 





    1 

















    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"
#include "super.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 *
 * Records are allocated from btrfs_inode_defrag_cachep and kept in the
 * fs_info->defrag_inodes red-black tree, ordered by (root, ino).
 */
struct inode_defrag {
        /* Linkage in the fs_info->defrag_inodes tree */
        struct rb_node rb_node;
        /* Inode number */
        u64 ino;
        /*
         * Transid where the defrag was added, we search for extents newer than
         * this.
         */
        u64 transid;

        /* Root objectid */
        u64 root;

        /*
         * The extent size threshold for autodefrag.
         *
         * This value is different for compressed/non-compressed extents, thus
         * needs to be passed from higher layer.
         * (aka, inode_should_defrag())
         */
        u32 extent_thresh;
};

/*
 * Three-way comparison of two defrag records, keyed first on root objectid
 * and then on inode number.  Returns 1, -1 or 0.
 */
static int __compare_inode_defrag(struct inode_defrag *defrag1,
                                  struct inode_defrag *defrag2)
{
        /* The root objectid is the major key. */
        if (defrag1->root != defrag2->root)
                return defrag1->root > defrag2->root ? 1 : -1;

        /* Same root: fall back to the inode number. */
        if (defrag1->ino != defrag2->ino)
                return defrag1->ino > defrag2->ino ? 1 : -1;

        return 0;
}

/*
 * Insert a defrag record for @inode into the fs_info->defrag_inodes tree.
 * The caller must already hold the lock protecting that tree.
 *
 * If a record for the same (root, ino) already exists, nothing is inserted:
 * the existing record's transid is lowered to @defrag's if that one is older,
 * its extent_thresh becomes the smaller of the two, and -EEXIST is returned.
 * @defrag is not freed here; that is left to the caller.
 *
 * Returns 0 after a successful insertion, which also sets
 * BTRFS_INODE_IN_DEFRAG on @inode.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
                                    struct inode_defrag *defrag)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct rb_node **link = &fs_info->defrag_inodes.rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct inode_defrag *entry;
                int cmp;

                parent = *link;
                entry = rb_entry(parent, struct inode_defrag, rb_node);

                cmp = __compare_inode_defrag(defrag, entry);
                if (cmp < 0) {
                        link = &parent->rb_left;
                        continue;
                }
                if (cmp > 0) {
                        link = &parent->rb_right;
                        continue;
                }

                /*
                 * Already queued.  When re-adding for an older defrag run,
                 * keep the lowest transid and the smallest threshold.
                 */
                if (defrag->transid < entry->transid)
                        entry->transid = defrag->transid;
                entry->extent_thresh = min(defrag->extent_thresh,
                                           entry->extent_thresh);
                return -EEXIST;
        }

        set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
        rb_link_node(&defrag->rb_node, parent, link);
        rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
        return 0;
}

/*
 * Return 1 when the AUTO_DEFRAG option is set and the filesystem is not
 * closing, 0 otherwise.
 */
static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
        return btrfs_test_opt(fs_info, AUTO_DEFRAG) &&
               !btrfs_fs_closing(fs_info);
}

/*
 * Queue @inode for auto defrag, if auto defrag is currently wanted and the
 * inode is not flagged as queued already.
 *
 * The record's transid is taken from @trans when one is given, otherwise from
 * the root's last_trans.  Returns -ENOMEM if the record cannot be allocated
 * and 0 in every other case, including when nothing was queued.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
                           struct btrfs_inode *inode, u32 extent_thresh)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct inode_defrag *defrag;
        u64 transid;

        if (!__need_auto_defrag(fs_info))
                return 0;

        /* Unlocked fast path; rechecked under the lock below. */
        if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
                return 0;

        transid = trans ? trans->transid : inode->root->last_trans;

        defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
        if (!defrag)
                return -ENOMEM;

        defrag->ino = btrfs_ino(inode);
        defrag->transid = transid;
        defrag->root = btrfs_root_id(root);
        defrag->extent_thresh = extent_thresh;

        spin_lock(&fs_info->defrag_inodes_lock);
        /*
         * The flag may be clear even though a record exists: if the inode was
         * evicted from memory after being queued and then read again, the new
         * in-memory inode does not carry IN_DEFRAG.  In that case the insert
         * finds the existing record and our new one is dropped.
         */
        if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags) ||
            __btrfs_add_inode_defrag(inode, defrag))
                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
        spin_unlock(&fs_info->defrag_inodes_lock);

        return 0;
}

/*
 * Pick the defragable inode that we want, if it doesn't exist, we will get the
 * next one.
 *
 * Searches fs_info->defrag_inodes for the record keyed (@root, @ino).  If
 * there is no exact match, the record with the next larger key is chosen.
 * The chosen record is removed from the tree under defrag_inodes_lock and
 * returned; NULL is returned when no such record exists.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
                        struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
        struct inode_defrag *entry = NULL;
        struct inode_defrag tmp;
        struct rb_node *p;
        struct rb_node *parent = NULL;
        int ret;

        /* Only the key fields of tmp are used by the comparison. */
        tmp.ino = ino;
        tmp.root = root;

        spin_lock(&fs_info->defrag_inodes_lock);
        p = fs_info->defrag_inodes.rb_node;
        while (p) {
                parent = p;
                entry = rb_entry(parent, struct inode_defrag, rb_node);

                ret = __compare_inode_defrag(&tmp, entry);
                if (ret < 0)
                        p = parent->rb_left;
                else if (ret > 0)
                        p = parent->rb_right;
                else
                        goto out;
        }

        /*
         * No exact match: entry is the last node visited.  If the key sorts
         * after it, step to its in-order successor (possibly none); otherwise
         * the last node visited is already the next larger record.
         */
        if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
                parent = rb_next(parent);
                if (parent)
                        entry = rb_entry(parent, struct inode_defrag, rb_node);
                else
                        entry = NULL;
        }
out:
        /* parent is the tree node of entry whenever entry is non-NULL. */
        if (entry)
                rb_erase(parent, &fs_info->defrag_inodes);
        spin_unlock(&fs_info->defrag_inodes_lock);
        return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
        struct inode_defrag *defrag;
        struct rb_node *node;

        spin_lock(&fs_info->defrag_inodes_lock);
        node = rb_first(&fs_info->defrag_inodes);
        while (node) {
                rb_erase(node, &fs_info->defrag_inodes);
                defrag = rb_entry(node, struct inode_defrag, rb_node);
                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

                cond_resched_lock(&fs_info->defrag_inodes_lock);

                node = rb_first(&fs_info->defrag_inodes);
        }
        spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH        1024

/*
 * Defragment the inode described by @defrag, BTRFS_DEFRAG_BATCH at a time,
 * looking the inode up again for every batch.
 *
 * The loop stops when the filesystem is remounting, auto defrag is no longer
 * wanted, the root or inode lookup fails, the current position reaches the
 * inode size, or btrfs_defrag_file() returns a negative value.
 *
 * @defrag is always freed before returning.  Returns the last value stored
 * in ret: 0 if no batch ran, a negative errno from a failed lookup or defrag
 * call, otherwise the result of the last btrfs_defrag_file() call.
 */
static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
                                    struct inode_defrag *defrag)
{
        struct btrfs_root *inode_root;
        struct inode *inode;
        struct btrfs_ioctl_defrag_range_args range;
        int ret = 0;
        u64 cur = 0;

again:
        if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
                goto cleanup;
        if (!__need_auto_defrag(fs_info))
                goto cleanup;

        /* Get the inode */
        inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
        if (IS_ERR(inode_root)) {
                ret = PTR_ERR(inode_root);
                goto cleanup;
        }

        inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
        /* The root reference is only needed for the lookup itself. */
        btrfs_put_root(inode_root);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
                goto cleanup;
        }

        /* Nothing left to do once the position is at or past the file size. */
        if (cur >= i_size_read(inode)) {
                iput(inode);
                goto cleanup;
        }

        /* Do a chunk of defrag */
        /*
         * NOTE(review): the flag is cleared before defragging, presumably so
         * the inode can be queued again while this pass runs; confirm.
         */
        clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
        memset(&range, 0, sizeof(range));
        range.len = (u64)-1;
        range.start = cur;
        range.extent_thresh = defrag->extent_thresh;

        sb_start_write(fs_info->sb);
        ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
                                       BTRFS_DEFRAG_BATCH);
        sb_end_write(fs_info->sb);
        iput(inode);

        if (ret < 0)
                goto cleanup;

        /*
         * Always advance by at least one sector.  NOTE(review): range.start
         * is presumably updated by btrfs_defrag_file() to where it stopped;
         * confirm against that function.
         */
        cur = max(cur + fs_info->sectorsize, range.start);
        goto again;

cleanup:
        kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
        return ret;
}

/*
 * Walk the queued defrag records in (root, ino) order and defragment each
 * inode, wrapping around to the start of the tree once the end is reached.
 *
 * Stops when the filesystem is remounting, auto defrag is no longer wanted,
 * or a search from the very beginning finds no record.  Always returns 0.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
        u64 next_root = 0;
        u64 next_ino = 0;

        atomic_inc(&fs_info->defrag_running);
        for (;;) {
                struct inode_defrag *defrag;

                /* Pause the auto defragger. */
                if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
                    !__need_auto_defrag(fs_info))
                        break;

                /* Find the next inode to defrag. */
                defrag = btrfs_pick_defrag_inode(fs_info, next_root, next_ino);
                if (!defrag) {
                        /* Searched from the start and found nothing: done. */
                        if (!next_root && !next_ino)
                                break;

                        /* Hit the end of the tree, restart from the top. */
                        next_root = 0;
                        next_ino = 0;
                        continue;
                }

                /* Remember where to resume before the record is consumed. */
                next_ino = defrag->ino + 1;
                next_root = defrag->root;

                __btrfs_run_defrag_inode(fs_info, defrag);
        }
        atomic_dec(&fs_info->defrag_running);

        /*
         * During unmount, the transaction_wait queue is used to wait for the
         * defragger to stop.
         */
        wake_up(&fs_info->transaction_wait);
        return 0;
}

/*
 * Tell whether two block addresses lie near each other, used by defrag.
 *
 * The gap is measured from the end of the lower block (its address plus
 * @blocksize) to the start of the higher one and has to be below SZ_32K.
 * Two identical addresses are not reported as close.
 */
static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
        u64 gap;

        if (blocknr == other)
                return false;

        if (blocknr < other)
                gap = other - (blocknr + blocksize);
        else
                gap = blocknr - (other + blocksize);

        return gap < SZ_32K;
}

/*
 * Go through all the leaves pointed to by a node and reallocate them so that
 * disk order is close to key order.
 *
 * @trans:      transaction handle; it must be the currently running
 *              transaction and match the fs generation, otherwise the
 *              transaction is aborted and -EUCLEAN is returned
 * @root:       tree root being processed; provides fs_info and is passed on to
 *              btrfs_force_cow_block()
 * @parent:     node whose child block pointers are examined
 * @start_slot: first slot of @parent to look at
 * @last_ret:   in/out allocation hint: its value seeds the search start given
 *              to btrfs_force_cow_block() and it is updated to the start of
 *              every block that gets reallocated
 * @progress:   slots are skipped until the first one whose key is not smaller
 *              than this key
 *
 * A child is left where it is when close_blocks() considers it close to the
 * block of the previous slot or, failing that, of the next slot.  Any other
 * child is read, write locked and COWed to a new location.
 *
 * Return 0 on success (also when @parent has at most one item), or a negative
 * errno: -EUCLEAN for an unexpected transaction, the error from reading a
 * child, or the error from btrfs_force_cow_block().
 */
static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct extent_buffer *parent,
                              int start_slot, u64 *last_ret,
                              struct btrfs_key *progress)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        const u32 blocksize = fs_info->nodesize;
        const int end_slot = btrfs_header_nritems(parent) - 1;
        u64 search_start = *last_ret;
        u64 last_block = 0;
        int ret = 0;
        bool progress_passed = false;

        /*
         * COWing must happen through a running transaction, which always
         * matches the current fs generation (it's a transaction with a state
         * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
         * into error state to prevent the commit of any transaction.
         */
        if (unlikely(trans->transaction != fs_info->running_transaction ||
                     trans->transid != fs_info->generation)) {
                btrfs_abort_transaction(trans, -EUCLEAN);
                btrfs_crit(fs_info,
"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
                           parent->start, btrfs_root_id(root), trans->transid,
                           fs_info->running_transaction->transid,
                           fs_info->generation);
                return -EUCLEAN;
        }

        /* A node with a single child has nothing to put in order. */
        if (btrfs_header_nritems(parent) <= 1)
                return 0;

        for (int i = start_slot; i <= end_slot; i++) {
                struct extent_buffer *cur;
                struct btrfs_disk_key disk_key;
                u64 blocknr;
                u64 other;
                bool close = true;

                /*
                 * Skip everything before @progress. Once one slot has reached
                 * it, the key comparison is not done any more.
                 */
                btrfs_node_key(parent, &disk_key, i);
                if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
                        continue;

                progress_passed = true;
                blocknr = btrfs_node_blockptr(parent, i);
                /* The first block we look at is the initial placement hint. */
                if (last_block == 0)
                        last_block = blocknr;

                /*
                 * Check the previous sibling first and the next one only if
                 * that failed. Slot 0 has no previous sibling, so @close keeps
                 * its initial value of true and slot 0 is never reallocated.
                 */
                if (i > 0) {
                        other = btrfs_node_blockptr(parent, i - 1);
                        close = close_blocks(blocknr, other, blocksize);
                }
                if (!close && i < end_slot) {
                        other = btrfs_node_blockptr(parent, i + 1);
                        close = close_blocks(blocknr, other, blocksize);
                }
                if (close) {
                        last_block = blocknr;
                        continue;
                }

                cur = btrfs_read_node_slot(parent, i);
                if (IS_ERR(cur))
                        return PTR_ERR(cur);
                /* Without a hint from the caller, aim next to the last block. */
                if (search_start == 0)
                        search_start = last_block;

                /*
                 * COW the child while holding its tree lock. The min() value
                 * is capped at 16 blocks and shrinks to the number of slots
                 * left after this one.
                 * NOTE(review): presumably the size of the free area to look
                 * for -- confirm against btrfs_force_cow_block().
                 */
                btrfs_tree_lock(cur);
                ret = btrfs_force_cow_block(trans, root, cur, parent, i,
                                            &cur, search_start,
                                            min(16 * blocksize,
                                                (end_slot - i) * blocksize),
                                            BTRFS_NESTING_COW);
                if (ret) {
                        btrfs_tree_unlock(cur);
                        free_extent_buffer(cur);
                        break;
                }
                /* @cur is now the new copy; continue allocating after it. */
                search_start = cur->start;
                last_block = cur->start;
                *last_ret = search_start;
                btrfs_tree_unlock(cur);
                free_extent_buffer(cur);
        }
        return ret;
}

/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to get key order to
 * better reflect disk order.
 *
 * Each call handles the children of a single level 1 node, resuming from
 * root->defrag_progress.
 *
 * Return -EAGAIN if a next key was found and saved in root->defrag_progress
 * and root->defrag_max is still ahead of it (btrfs_defrag_root() then calls
 * again), 0 when there is nothing (more) to do, or another negative errno on
 * failure. Whenever the return value is not -EAGAIN, root->defrag_progress is
 * reset to zero.
 */

static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
{
        struct btrfs_path *path = NULL;
        struct btrfs_key key;
        int ret = 0;
        int wret;
        int level;
        int next_key_ret = 0;
        u64 last_ret = 0;

        /* Only roots flagged BTRFS_ROOT_SHAREABLE are defragged. */
        if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
                goto out;

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        level = btrfs_header_level(root->node);

        /* The root is a leaf, there are no child pointers to reorder. */
        if (level == 0)
                goto out;

        if (root->defrag_progress.objectid == 0) {
                struct extent_buffer *root_node;
                u32 nritems;

                /*
                 * Fresh start: remember the last key of the root node as the
                 * end marker (defrag_max) and begin at the zero key.
                 */
                root_node = btrfs_lock_root_node(root);
                nritems = btrfs_header_nritems(root_node);
                root->defrag_max.objectid = 0;
                /* from above we know this is not a leaf */
                btrfs_node_key_to_cpu(root_node, &root->defrag_max,
                                      nritems - 1);
                btrfs_tree_unlock(root_node);
                free_extent_buffer(root_node);
                memset(&key, 0, sizeof(key));
        } else {
                /* Resume from where the previous call stopped. */
                memcpy(&key, &root->defrag_progress, sizeof(key));
        }

        path->keep_locks = 1;

        /*
         * First locate the next key at or after @key; a positive return is
         * treated as "nothing left to defrag".
         */
        ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = 0;
                goto out;
        }
        btrfs_release_path(path);
        /*
         * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
         * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
         * later deadlock (attempting to write lock an already write locked
         * leaf).
         */
        path->lowest_level = 1;
        wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

        if (wret < 0) {
                ret = wret;
                goto out;
        }
        /* No level 1 node in the path, nothing to reallocate. */
        if (!path->nodes[1]) {
                ret = 0;
                goto out;
        }
        /*
         * The node at level 1 must always be locked when our path has
         * keep_locks set and lowest_level is 1, regardless of the value of
         * path->slots[1].
         */
        ASSERT(path->locks[1] != 0);
        ret = btrfs_realloc_node(trans, root,
                                 path->nodes[1], 0,
                                 &last_ret,
                                 &root->defrag_progress);
        if (ret) {
                /* -EAGAIN is reserved for "call me again", see below. */
                WARN_ON(ret == -EAGAIN);
                goto out;
        }
        /*
         * Now that we reallocated the node we can find the next key. Note that
         * btrfs_find_next_key() can release our path and do another search
         * without COWing, this is because even with path->keep_locks = 1,
         * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
         * node when path->slots[node_level - 1] does not point to the last
         * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
         * we search for the next key after reallocating our node.
         */
        path->slots[1] = btrfs_header_nritems(path->nodes[1]);
        next_key_ret = btrfs_find_next_key(root, path, &key, 1,
                                           BTRFS_OLDEST_GENERATION);
        if (next_key_ret == 0) {
                /* More to do: save the resume point and ask to be called again. */
                memcpy(&root->defrag_progress, &key, sizeof(key));
                ret = -EAGAIN;
        }
out:
        btrfs_free_path(path);
        if (ret == -EAGAIN) {
                /*
                 * Keep going only while defrag_max is ahead of the progress
                 * key, otherwise report completion.
                 * NOTE(review): the three fields are compared independently
                 * rather than as one ordered (objectid, type, offset) key, so
                 * this can also keep going when defrag_max is behind the
                 * progress key -- confirm this is intended.
                 */
                if (root->defrag_max.objectid > root->defrag_progress.objectid)
                        goto done;
                if (root->defrag_max.type > root->defrag_progress.type)
                        goto done;
                if (root->defrag_max.offset > root->defrag_progress.offset)
                        goto done;
                ret = 0;
        }
done:
        /* Finished or failed: the next run starts from the beginning. */
        if (ret != -EAGAIN)
                memset(&root->defrag_progress, 0,
                       sizeof(root->defrag_progress));

        return ret;
}

/*
 * Defrag the btree of @root by calling btrfs_defrag_leaves() repeatedly, one
 * transaction handle per call, for as long as it returns -EAGAIN.
 *
 * The BTRFS_ROOT_DEFRAG_RUNNING bit in root->state ensures only one defrag of
 * a root runs at a time; if it is already set this returns 0 right away.
 *
 * Return: 0 if another defrag is already running on @root or once the tree
 * has been fully processed, the error from btrfs_start_transaction() or
 * btrfs_defrag_leaves(), or -EAGAIN if the loop was left early because the
 * filesystem is closing or the defrag was cancelled.
 */
int btrfs_defrag_root(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret;

        if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
                return 0;

        for (;;) {
                struct btrfs_trans_handle *handle;

                handle = btrfs_start_transaction(root, 0);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        break;
                }

                ret = btrfs_defrag_leaves(handle, root);

                /* Wrap up this step and give others a chance to run. */
                btrfs_end_transaction(handle);
                btrfs_btree_balance_dirty(fs_info);
                cond_resched();

                /* Stop on unmount, on completion and on any real error. */
                if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
                        break;

                if (btrfs_defrag_cancelled(fs_info)) {
                        btrfs_debug(fs_info, "defrag_root cancelled");
                        ret = -EAGAIN;
                        break;
                }
        }

        clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
        return ret;
}

/*
 * Defrag specific helper to get an extent map.
 *
 * @inode:      inode to look up the file extent for
 * @start:      file offset the returned extent map has to cover
 * @newer_than: if non-zero, minimal generation used for the tree search
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than given
 *   minimal generation threshold (@newer_than).
 *
 * Return valid em if we find a file extent matching the requirement. If the
 * first file extent found starts after @start, the em describes the hole
 * between @start and that extent instead.
 * Return NULL if we can not find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
                                            u64 start, u64 newer_than)
{
        struct btrfs_root *root = inode->root;
        struct btrfs_file_extent_item *fi;
        /* On-stack path, released with btrfs_release_path() on every exit. */
        struct btrfs_path path = { 0 };
        struct extent_map *em;
        struct btrfs_key key;
        u64 ino = btrfs_ino(inode);
        int ret;

        em = alloc_extent_map();
        if (!em) {
                ret = -ENOMEM;
                goto err;
        }

        key.objectid = ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = start;

        if (newer_than) {
                ret = btrfs_search_forward(root, &key, &path, newer_than);
                if (ret < 0)
                        goto err;
                /* Can't find anything newer */
                if (ret > 0)
                        goto not_found;
        } else {
                ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
                if (ret < 0)
                        goto err;
        }
        if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
                /*
                 * If btrfs_search_slot() makes path to point beyond nritems,
                 * we should not have an empty leaf, as this inode must at
                 * least have its INODE_ITEM.
                 */
                ASSERT(btrfs_header_nritems(path.nodes[0]));
                path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
        }
        btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
        /* Perfect match, no need to go one slot back */
        if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
            key.offset == start)
                goto iterate;

        /*
         * We didn't find a perfect match. If the slot holds a file extent of
         * our inode, go one slot back so the iteration below starts at the
         * previous item, which may be the one covering @start.
         */
        if (path.slots[0] > 0) {
                /*
                 * NOTE(review): @key was already loaded from this very slot
                 * just above, so this re-read looks redundant.
                 */
                btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
                if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
                        path.slots[0]--;
        }

iterate:
        /* Iterate through the path to find a file extent covering @start */
        while (true) {
                u64 extent_end;

                /* Past the last item of this leaf, move on to the next one. */
                if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
                        goto next;

                btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

                /*
                 * We may go one slot back to INODE_REF/XATTR item, then
                 * need to go forward until we reach an EXTENT_DATA.
                 * But we should still have the correct ino as key.objectid.
                 */
                if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
                        goto next;

                /* It's beyond our target range, definitely no extent found */
                if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
                        goto not_found;

                /*
                 *        |        |<- File extent ->|
                 *        \- start
                 *
                 * This means there is a hole between start and key.offset.
                 */
                if (key.offset > start) {
                        em->start = start;
                        em->orig_start = start;
                        em->block_start = EXTENT_MAP_HOLE;
                        em->len = key.offset - start;
                        break;
                }

                fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
                                    struct btrfs_file_extent_item);
                extent_end = btrfs_file_extent_end(&path);

                /*
                 *        |<- file extent ->|        |
                 *                                \- start
                 *
                 * We haven't reached start, search next slot.
                 */
                if (extent_end <= start)
                        goto next;

                /* Now this extent covers @start, convert it to em */
                btrfs_extent_item_to_extent_map(inode, &path, fi, em);
                break;
next:
                /* Advance one item; a positive return means no item is left. */
                ret = btrfs_next_item(root, &path);
                if (ret < 0)
                        goto err;
                if (ret > 0)
                        goto not_found;
        }
        btrfs_release_path(&path);
        return em;

not_found:
        btrfs_release_path(&path);
        free_extent_map(em);
        return NULL;

err:
        /* Also reached with em == NULL when the allocation failed. */
        btrfs_release_path(&path);
        free_extent_map(em);
        return ERR_PTR(ret);
}

/*
 * Find the extent map covering @start for defrag purposes.
 *
 * The in-memory extent map tree is consulted first.  A cached map is used
 * unless it is a merged one that could still pass the @newer_than filter; in
 * that case, or when nothing is cached, the file extent is read through
 * defrag_get_extent().
 *
 * @inode:      inode to look in
 * @start:      file offset to look up
 * @newer_than: generation filter, 0 if the generation does not matter
 * @locked:     true if the caller already holds the extent lock for the
 *              range, so it must not be taken again here
 *
 * Return: an extent map the caller has to release with free_extent_map(), or
 * NULL if none was found or the lookup failed.
 */
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
                                               u64 newer_than, bool locked)
{
        struct btrfs_inode *binode = BTRFS_I(inode);
        struct extent_map_tree *em_tree = &binode->extent_tree;
        struct extent_io_tree *io_tree = &binode->io_tree;
        const u32 sectorsize = binode->root->fs_info->sectorsize;
        struct extent_state *cached = NULL;
        struct extent_map *em;
        u64 end;

        /* Fast path: try the cached maps without the full extent lock. */
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, sectorsize);
        read_unlock(&em_tree->lock);

        if (em) {
                /*
                 * A merged map hides the original extents, so it has to be
                 * looked up again. It can still be trusted when @newer_than
                 * is 0 (generation is irrelevant) or when its generation is
                 * below @newer_than (it will be rejected anyway).
                 */
                const bool need_research = (em->flags & EXTENT_FLAG_MERGED) &&
                                           newer_than &&
                                           em->generation >= newer_than;

                if (!need_research)
                        return em;

                free_extent_map(em);
        }

        /* Slow path: get the big lock and read the metadata. */
        end = start + sectorsize - 1;
        if (!locked)
                lock_extent(io_tree, start, end, &cached);
        em = defrag_get_extent(binode, start, newer_than);
        if (!locked)
                unlock_extent(io_tree, start, end, &cached);

        /* Errors are reported to the caller as "no extent". */
        return IS_ERR(em) ? NULL : em;
}

/*
 * Largest size an extent of the same kind as @em can have:
 * BTRFS_MAX_COMPRESSED for a compressed extent, fs_info->max_extent_size
 * otherwise.
 */
static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
                                   const struct extent_map *em)
{
        return extent_map_is_compressed(em) ? BTRFS_MAX_COMPRESSED :
                                              fs_info->max_extent_size;
}

/*
 * Check whether the extent following @em is worth merging with.
 *
 * @inode:         inode owning the extents
 * @em:            extent map currently considered for defrag
 * @extent_thresh: a next extent of at least this length is not a target
 * @newer_than:    a next extent with a smaller generation is not a target
 * @locked:        true if the caller already holds the extent lock for the
 *                 range; forwarded to defrag_lookup_extent()
 *
 * Return: true if @em is not the last extent of the file and the next extent
 * is a regular (not hole, not preallocated) extent that is below its own
 * maximum capacity, new enough and smaller than @extent_thresh; false
 * otherwise.
 */
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
                                     u32 extent_thresh, u64 newer_than, bool locked)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct extent_map *next;
        bool ret = false;

        /* This is the last extent */
        if (em->start + em->len >= i_size_read(inode))
                return false;

        /*
         * Here we need to pass @newer_than when checking the next extent, or
         * we will hit a case we mark current extent for defrag, but the next
         * one will not be a target.
         * This will just cause extra IO without really reducing the fragments.
         */
        next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
        /* No more em or hole */
        if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
                goto out;
        if (next->flags & EXTENT_FLAG_PREALLOC)
                goto out;
        /*
         * If the next extent is at its max capacity, defragging current extent
         * makes no sense, as the total number of extents won't change.
         *
         * The limit has to be the one of @next, not of @em: whether @next can
         * still grow depends on whether @next itself is compressed, and
         * defrag_collect_targets() applies the same per-extent limit when it
         * gets to @next. Using the limit of @em would report a full
         * compressed @next as mergeable behind an uncompressed @em.
         */
        if (next->len >= get_extent_max_capacity(fs_info, next))
                goto out;
        /* Skip older extent */
        if (next->generation < newer_than)
                goto out;
        /* Also check extent size */
        if (next->len >= extent_thresh)
                goto out;

        ret = true;
out:
        /* @next may be NULL here, which free_extent_map() is given as is. */
        free_extent_map(next);
        return ret;
}

/*
 * Prepare one folio to be defragged.
 *
 * @inode: inode whose page cache is used
 * @index: page cache index of the folio to get (created if missing)
 *
 * This will ensure:
 *
 * - Returned folio is locked and has been set up properly.
 * - No ordered extent exists in the folio range.
 * - The folio is uptodate.
 *
 * NOTE: Caller should also wait for folio writeback after the cluster is
 * prepared, here we don't do writeback wait for each folio.
 *
 * Return the locked folio on success, or an ERR_PTR(): the error from
 * __filemap_get_folio() or set_folio_extent_mapped(), -ETXTBSY if a large
 * folio is found, -EIO if the folio is not uptodate after reading it.
 */
static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
{
        struct address_space *mapping = inode->vfs_inode.i_mapping;
        gfp_t mask = btrfs_alloc_write_mask(mapping);
        u64 page_start = (u64)index << PAGE_SHIFT;
        u64 page_end = page_start + PAGE_SIZE - 1;
        struct extent_state *cached_state = NULL;
        struct folio *folio;
        int ret;

again:
        /* Find or create the folio; FGP_LOCK returns it locked. */
        folio = __filemap_get_folio(mapping, index,
                                    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
        if (IS_ERR(folio))
                return folio;

        /*
         * Since we can defragment files opened read-only, we can encounter
         * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
         * can't do I/O using huge pages yet, so return an error for now.
         * Filesystem transparent huge pages are typically only used for
         * executables that explicitly enable them, so this isn't very
         * restrictive.
         */
        if (folio_test_large(folio)) {
                folio_unlock(folio);
                folio_put(folio);
                return ERR_PTR(-ETXTBSY);
        }

        ret = set_folio_extent_mapped(folio);
        if (ret < 0) {
                folio_unlock(folio);
                folio_put(folio);
                return ERR_PTR(ret);
        }

        /* Wait for any existing ordered extent in the range */
        while (1) {
                struct btrfs_ordered_extent *ordered;

                /* The extent lock is only held for the lookup itself. */
                lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
                ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
                unlock_extent(&inode->io_tree, page_start, page_end,
                              &cached_state);
                if (!ordered)
                        break;

                /*
                 * Drop the folio lock around btrfs_start_ordered_extent() and
                 * take it again afterwards.
                 */
                folio_unlock(folio);
                btrfs_start_ordered_extent(ordered);
                btrfs_put_ordered_extent(ordered);
                folio_lock(folio);
                /*
                 * We unlocked the folio above, so we need to check if it was
                 * released or not. If so, start over with a fresh lookup.
                 */
                if (folio->mapping != mapping || !folio->private) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto again;
                }
        }

        /*
         * Now the folio range has no ordered extent any more.  Read the folio
         * to make it uptodate.
         */
        if (!folio_test_uptodate(folio)) {
                /*
                 * The return value is not checked, a failed read is caught by
                 * the uptodate test below.
                 * NOTE(review): the folio is locked again right after, so the
                 * read presumably returns with it unlocked -- confirm against
                 * btrfs_read_folio().
                 */
                btrfs_read_folio(NULL, folio);
                folio_lock(folio);
                /* Same "was it released meanwhile" check as above. */
                if (folio->mapping != mapping || !folio->private) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto again;
                }
                if (!folio_test_uptodate(folio)) {
                        folio_unlock(folio);
                        folio_put(folio);
                        return ERR_PTR(-EIO);
                }
        }
        return folio;
}

/*
 * One contiguous file range selected for defrag, as collected by
 * defrag_collect_targets().
 */
struct defrag_target_range {
        /* Link in the target list, entries are appended in file offset order */
        struct list_head list;
        /* File offset where the range starts */
        u64 start;
        /* Length of the range in bytes, grown when an adjacent extent is merged in */
        u64 len;
};

/*
 * Collect all valid target extents.
 *
 * @inode:         inode to scan
 * @start:         file offset to lookup
 * @len:           length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *                 will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *                 if true, @extent_thresh will be ignored and all regular
 *                 file extents meeting @newer_than will be targets.
 * @locked:        if the range has already held extent lock
 * @target_list:   list of targets file extents
 * @last_scanned_ret: optional (may be NULL). On success it is raised, never
 *                 lowered, to the offset the scan got to: where the scan
 *                 stopped if the last extent looked at was not a target,
 *                 @start + @len otherwise.
 *
 * Adjacent targets are merged into a single defrag_target_range entry.
 *
 * Return 0 on success. Return -ENOMEM if a list entry can not be allocated,
 * in which case every entry on @target_list is freed.
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
                                  u64 start, u64 len, u32 extent_thresh,
                                  u64 newer_than, bool do_compress,
                                  bool locked, struct list_head *target_list,
                                  u64 *last_scanned_ret)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        bool last_is_target = false;
        u64 cur = start;
        int ret = 0;

        while (cur < start + len) {
                struct extent_map *em;
                struct defrag_target_range *new;
                bool next_mergeable = true;
                u64 range_len;

                last_is_target = false;
                em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
                /* No extent map (or a lookup error), stop scanning here. */
                if (!em)
                        break;

                /*
                 * If the file extent is an inlined one, we may still want to
                 * defrag it (fallthrough) if it will cause a regular extent.
                 * This is for users who want to convert inline extents to
                 * regular ones through max_inline= mount option.
                 */
                if (em->block_start == EXTENT_MAP_INLINE &&
                    em->len <= inode->root->fs_info->max_inline)
                        goto next;

                /* Skip holes and preallocated extents. */
                if (em->block_start == EXTENT_MAP_HOLE ||
                    (em->flags & EXTENT_FLAG_PREALLOC))
                        goto next;

                /* Skip older extent */
                if (em->generation < newer_than)
                        goto next;

                /* This em is under writeback, no need to defrag */
                if (em->generation == (u64)-1)
                        goto next;

                /*
                 * Our start offset might be in the middle of an existing extent
                 * map, so take that into account.
                 */
                range_len = em->len - (cur - em->start);
                /*
                 * If this range of the extent map is already flagged for delalloc,
                 * skip it, because:
                 *
                 * 1) We could deadlock later, when trying to reserve space for
                 *    delalloc, because in case we can't immediately reserve space
                 *    the flusher can start delalloc and wait for the respective
                 *    ordered extents to complete. The deadlock would happen
                 *    because we do the space reservation while holding the range
                 *    locked, and starting writeback, or finishing an ordered
                 *    extent, requires locking the range;
                 *
                 * 2) If there's delalloc there, it means there's dirty pages for
                 *    which writeback has not started yet (we clean the delalloc
                 *    flag when starting writeback and after creating an ordered
                 *    extent). If we mark pages in an adjacent range for defrag,
                 *    then we will have a larger contiguous range for delalloc,
                 *    very likely resulting in a larger extent after writeback is
                 *    triggered (except in a case of free space fragmentation).
                 */
                if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
                                          EXTENT_DELALLOC))
                        goto next;

                /*
                 * For do_compress case, we want to compress all valid file
                 * extents, thus no @extent_thresh or mergeable check.
                 */
                if (do_compress)
                        goto add;

                /* Skip too large extent */
                if (em->len >= extent_thresh)
                        goto next;

                /*
                 * Skip extents already at its max capacity, this is mostly for
                 * compressed extents, which max cap is only 128K.
                 */
                if (em->len >= get_extent_max_capacity(fs_info, em))
                        goto next;

                /*
                 * Normally there are no more extents after an inline one, thus
                 * @next_mergeable will normally be false and not defragged.
                 * So if an inline extent passed all above checks, just add it
                 * for defrag, and be converted to regular extents.
                 */
                if (em->block_start == EXTENT_MAP_INLINE)
                        goto add;

                next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
                                                extent_thresh, newer_than, locked);
                if (!next_mergeable) {
                        struct defrag_target_range *last;

                        /* Empty target list, no way to merge with last entry */
                        if (list_empty(target_list))
                                goto next;
                        last = list_entry(target_list->prev,
                                          struct defrag_target_range, list);
                        /* Not mergeable with last entry */
                        if (last->start + last->len != cur)
                                goto next;

                        /* Mergeable, fall through to add it to @target_list. */
                }

add:
                last_is_target = true;
                /* Clamp the target to the end of the requested range. */
                range_len = min(extent_map_end(em), start + len) - cur;
                /*
                 * This one is a good target, check if it can be merged into
                 * last range of the target list.
                 */
                if (!list_empty(target_list)) {
                        struct defrag_target_range *last;

                        last = list_entry(target_list->prev,
                                          struct defrag_target_range, list);
                        ASSERT(last->start + last->len <= cur);
                        if (last->start + last->len == cur) {
                                /* Mergeable, enlarge the last entry */
                                last->len += range_len;
                                goto next;
                        }
                        /* Fall through to allocate a new entry */
                }

                /* Allocate new defrag_target_range */
                new = kmalloc(sizeof(*new), GFP_NOFS);
                if (!new) {
                        /* The "next" label is bypassed, drop the em here. */
                        free_extent_map(em);
                        ret = -ENOMEM;
                        break;
                }
                new->start = cur;
                new->len = range_len;
                list_add_tail(&new->list, target_list);

next:
                /* Continue right after this extent map and drop our reference. */
                cur = extent_map_end(em);
                free_extent_map(em);
        }
        if (ret < 0) {
                struct defrag_target_range *entry;
                struct defrag_target_range *tmp;

                /* On failure the caller gets back an empty list. */
                list_for_each_entry_safe(entry, tmp, target_list, list) {
                        list_del_init(&entry->list);
                        kfree(entry);
                }
        }
        if (!ret && last_scanned_ret) {
                /*
                 * If the last extent is not a target, the caller can skip to
                 * the end of that extent.
                 * Otherwise, we can only go the end of the specified range.
                 */
                if (!last_is_target)
                        *last_scanned_ret = max(cur, *last_scanned_ret);
                else
                        *last_scanned_ret = max(start + len, *last_scanned_ret);
        }
        return ret;
}

/*
 * Upper bound, in bytes, on the page span handled by a single
 * defrag_one_range() call: that function asserts that its page count does
 * not exceed CLUSTER_SIZE / PAGE_SIZE.
 */
#define CLUSTER_SIZE        (SZ_256K)
/* The bound above is only meaningful if it is a whole number of pages. */
static_assert(PAGE_ALIGNED(CLUSTER_SIZE));

/*
 * Defrag one contiguous target range.
 *
 * @inode:        target inode
 * @target:       target range to defrag (byte offset and length)
 * @folios:       locked folios covering the defrag range
 * @nr_pages:     number of entries in @folios
 * @cached_state: extent state cache handed to the extent bit helpers
 *
 * Caller should ensure:
 *
 * - Folios are prepared
 *   Folios should be locked, no ordered extent in the folios range,
 *   no writeback.
 *
 * - Extent bits are locked
 *
 * Return 0 on success, or the negative value returned by
 * btrfs_delalloc_reserve_space() when the reservation fails (nothing else
 * is touched in that case).
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
                                    struct defrag_target_range *target,
                                    struct folio **folios, int nr_pages,
                                    struct extent_state **cached_state)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct extent_changeset *data_reserved = NULL;
        const u64 start = target->start;
        const u64 len = target->len;
        /* Page indexes of the last and first byte of the target */
        unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
        unsigned long start_index = start >> PAGE_SHIFT;
        /* Page index of folios[0]; array slots are relative to it */
        unsigned long first_index = folios[0]->index;
        int ret = 0;
        int i;

        /* The folio array must reach at least up to the target's last page */
        ASSERT(last_index - first_index + 1 <= nr_pages);

        ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
        if (ret < 0)
                return ret;
        /*
         * Drop any existing delalloc/accounting/defrag bits on the range,
         * then mark the whole range as delalloc + defrag.
         */
        clear_extent_bit(&inode->io_tree, start, start + len - 1,
                         EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
                         EXTENT_DEFRAG, cached_state);
        set_extent_bit(&inode->io_tree, start, start + len - 1,
                       EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);

        /* Update the folio status for every folio overlapping the target */
        for (i = start_index - first_index; i <= last_index - first_index; i++) {
                folio_clear_checked(folios[i]);
                btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len);
        }
        btrfs_delalloc_release_extents(inode, len);
        extent_changeset_free(data_reserved);

        return ret;
}

/*
 * Defrag the targets found in one range, after pinning its folios and
 * locking its extent range.
 *
 * @inode:            target inode
 * @start:            byte offset of the range, sector aligned (asserted)
 * @len:              byte length of the range, sector aligned (asserted);
 *                    the range may span at most CLUSTER_SIZE / PAGE_SIZE
 *                    pages (asserted)
 * @extent_thresh:    passed unchanged to defrag_collect_targets()
 * @newer_than:       passed unchanged to defrag_collect_targets()
 * @do_compress:      passed unchanged to defrag_collect_targets()
 * @last_scanned_ret: passed to defrag_collect_targets(), which may advance it
 *
 * Steps: prepare one folio per page, wait for writeback on all of them, lock
 * the page-aligned extent range, re-collect the targets under the lock, run
 * defrag_one_locked_target() on each, then release everything in reverse.
 *
 * Return a negative errno on failure, otherwise the result of the last
 * helper call (expected to be 0).
 */
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
                            u32 extent_thresh, u64 newer_than, bool do_compress,
                            u64 *last_scanned_ret)
{
        struct extent_state *cached_state = NULL;
        struct defrag_target_range *entry;
        struct defrag_target_range *tmp;
        LIST_HEAD(target_list);
        struct folio **folios;
        const u32 sectorsize = inode->root->fs_info->sectorsize;
        u64 last_index = (start + len - 1) >> PAGE_SHIFT;
        u64 start_index = start >> PAGE_SHIFT;
        unsigned int nr_pages = last_index - start_index + 1;
        int ret = 0;
        int i;

        ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
        ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

        folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
        if (!folios)
                return -ENOMEM;

        /* Prepare all folios */
        for (i = 0; i < nr_pages; i++) {
                folios[i] = defrag_prepare_one_folio(inode, start_index + i);
                if (IS_ERR(folios[i])) {
                        ret = PTR_ERR(folios[i]);
                        /*
                         * Shrink the count so the cleanup loop only touches
                         * the folios that were successfully prepared.
                         */
                        nr_pages = i;
                        goto free_folios;
                }
        }
        for (i = 0; i < nr_pages; i++)
                folio_wait_writeback(folios[i]);

        /* Lock the extent range covering all the pages (page aligned) */
        lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
                    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
                    &cached_state);
        /*
         * Now we have a consistent view about the extent map, re-check
         * which range really needs to be defragged.
         *
         * And this time we have extent locked already, pass @locked = true
         * so that we won't relock the extent range and cause deadlock.
         */
        ret = defrag_collect_targets(inode, start, len, extent_thresh,
                                     newer_than, do_compress, true,
                                     &target_list, last_scanned_ret);
        if (ret < 0)
                goto unlock_extent;

        /* Stop at the first target that fails; @ret carries its error */
        list_for_each_entry(entry, &target_list, list) {
                ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
                                               &cached_state);
                if (ret < 0)
                        break;
        }

        /* The target list is private to this call, free it in all cases */
        list_for_each_entry_safe(entry, tmp, &target_list, list) {
                list_del_init(&entry->list);
                kfree(entry);
        }
unlock_extent:
        unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
                      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
                      &cached_state);
free_folios:
        /* Unlock and drop the reference on every prepared folio */
        for (i = 0; i < nr_pages; i++) {
                folio_unlock(folios[i]);
                folio_put(folios[i]);
        }
        kfree(folios);
        return ret;
}

/*
 * Defrag all the targets found inside one cluster.
 *
 * @inode:             target inode
 * @ra:                readahead state; when non-NULL, readahead is issued
 *                     for each target before it is defragged
 * @start:             byte offset of the cluster
 * @len:               byte length of the cluster
 * @extent_thresh:     passed unchanged to the target collection and
 *                     defrag_one_range()
 * @newer_than:        same as above
 * @do_compress:       same as above
 * @sectors_defragged: in/out counter, increased by the sector count of each
 *                     range handed to defrag_one_range()
 * @max_sectors:       when non-zero, upper limit for *@sectors_defragged
 * @last_scanned_ret:  in/out file offset, must not be NULL; read to skip
 *                     stale targets and raised to at least @start + @len
 *                     when no error occurred
 *
 * The targets are first collected without the extent lock (@locked = false);
 * defrag_one_range() re-checks each of them under the lock.
 *
 * Return <0 on error, 1 when the @max_sectors limit was reached, otherwise
 * the non-negative result of the last step.
 */
static int defrag_one_cluster(struct btrfs_inode *inode,
                              struct file_ra_state *ra,
                              u64 start, u32 len, u32 extent_thresh,
                              u64 newer_than, bool do_compress,
                              unsigned long *sectors_defragged,
                              unsigned long max_sectors,
                              u64 *last_scanned_ret)
{
        const u32 sectorsize = inode->root->fs_info->sectorsize;
        struct defrag_target_range *entry;
        struct defrag_target_range *tmp;
        LIST_HEAD(target_list);
        int ret;

        ret = defrag_collect_targets(inode, start, len, extent_thresh,
                                     newer_than, do_compress, false,
                                     &target_list, NULL);
        if (ret < 0)
                goto out;

        list_for_each_entry(entry, &target_list, list) {
                u32 range_len = entry->len;

                /* Reached or beyond the limit */
                if (max_sectors && *sectors_defragged >= max_sectors) {
                        ret = 1;
                        break;
                }

                /* Trim the range so that the limit is not exceeded */
                if (max_sectors)
                        range_len = min_t(u32, range_len,
                                (max_sectors - *sectors_defragged) * sectorsize);

                /*
                 * If defrag_one_range() has updated last_scanned_ret,
                 * our range may already be invalid (e.g. hole punched).
                 * Skip if our range is before last_scanned_ret, as there is
                 * no need to defrag the range anymore.
                 */
                if (entry->start + range_len <= *last_scanned_ret)
                        continue;

                /* Readahead over the pages spanned by this (trimmed) range */
                if (ra)
                        page_cache_sync_readahead(inode->vfs_inode.i_mapping,
                                ra, NULL, entry->start >> PAGE_SHIFT,
                                ((entry->start + range_len - 1) >> PAGE_SHIFT) -
                                (entry->start >> PAGE_SHIFT) + 1);
                /*
                 * Here we may not defrag any range if holes are punched before
                 * we locked the pages.
                 * But that's fine, it only affects the @sectors_defragged
                 * accounting.
                 */
                ret = defrag_one_range(inode, entry->start, range_len,
                                       extent_thresh, newer_than, do_compress,
                                       last_scanned_ret);
                if (ret < 0)
                        break;
                *sectors_defragged += range_len >>
                                      inode->root->fs_info->sectorsize_bits;
        }
out:
        /* The target list is private to this call, free it in all cases */
        list_for_each_entry_safe(entry, tmp, &target_list, list) {
                list_del_init(&entry->list);
                kfree(entry);
        }
        /* Without error the whole cluster counts as scanned */
        if (ret >= 0)
                *last_scanned_ret = max(*last_scanned_ret, start + len);
        return ret;
}

/*
 * Entry point to file defragmentation.
 *
 * @inode:         inode to be defragged
 * @ra:            readahead state (can be NULL, in which case a temporary
 *                 one is allocated on a best-effort basis)
 * @range:         defrag options including range and flags
 * @newer_than:    minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *                 will be defragged.
 *
 * Return <0 for error:
 *   -EINVAL  if range->start is at or beyond i_size, or compression is
 *            requested with an out-of-range compress_type
 *   -EAGAIN  if the defrag was cancelled
 *   -ETXTBSY if the inode is a swapfile
 *   or whatever defrag_one_cluster() failed with.
 * Return >=0 for the number of sectors defragged, and range->start will be updated
 * to indicate the file offset where next defrag should be started at.
 * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
 *  defragging all the range).
 *
 * Note that once at least one sector has been defragged the sector count is
 * returned even if a later cluster hit an error.
 */
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
                      struct btrfs_ioctl_defrag_range_args *range,
                      u64 newer_than, unsigned long max_to_defrag)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        unsigned long sectors_defragged = 0;
        u64 isize = i_size_read(inode);
        u64 cur;
        u64 last_byte;
        bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
        bool ra_allocated = false;
        /* Compression type used when the caller asks for compression only */
        int compress_type = BTRFS_COMPRESS_ZLIB;
        int ret = 0;
        u32 extent_thresh = range->extent_thresh;
        pgoff_t start_index;

        /* Nothing to do for an empty file */
        if (isize == 0)
                return 0;

        if (range->start >= isize)
                return -EINVAL;

        if (do_compress) {
                if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
                        return -EINVAL;
                if (range->compress_type)
                        compress_type = range->compress_type;
        }

        /* A zero threshold selects the 256K default */
        if (extent_thresh == 0)
                extent_thresh = SZ_256K;

        /* The comparison is false when len is 0 or start + len overflows */
        if (range->start + range->len > range->start) {
                /* Got a specific range */
                last_byte = min(isize, range->start + range->len);
        } else {
                /* Defrag until file end */
                last_byte = isize;
        }

        /* Align the range; from here on last_byte is inclusive */
        cur = round_down(range->start, fs_info->sectorsize);
        last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

        /*
         * If we were not given a ra, allocate a readahead context. As
         * readahead is just an optimization, defrag will work without it so
         * we don't error out.
         */
        if (!ra) {
                ra_allocated = true;
                ra = kzalloc(sizeof(*ra), GFP_KERNEL);
                if (ra)
                        file_ra_state_init(ra, inode->i_mapping);
        }

        /*
         * Make writeback start from the beginning of the range, so that the
         * defrag range can be written sequentially.
         */
        start_index = cur >> PAGE_SHIFT;
        if (start_index < inode->i_mapping->writeback_index)
                inode->i_mapping->writeback_index = start_index;

        /* Walk the range one cluster at a time */
        while (cur < last_byte) {
                const unsigned long prev_sectors_defragged = sectors_defragged;
                u64 last_scanned = cur;
                u64 cluster_end;

                if (btrfs_defrag_cancelled(fs_info)) {
                        ret = -EAGAIN;
                        break;
                }

                /* We want the cluster end at page boundary when possible */
                cluster_end = (((cur >> PAGE_SHIFT) +
                               (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
                cluster_end = min(cluster_end, last_byte);

                /* The inode lock is taken and dropped for each cluster */
                btrfs_inode_lock(BTRFS_I(inode), 0);
                if (IS_SWAPFILE(inode)) {
                        ret = -ETXTBSY;
                        btrfs_inode_unlock(BTRFS_I(inode), 0);
                        break;
                }
                /* Superblock no longer active: stop without touching @ret */
                if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
                        btrfs_inode_unlock(BTRFS_I(inode), 0);
                        break;
                }
                if (do_compress)
                        BTRFS_I(inode)->defrag_compress = compress_type;
                ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
                                cluster_end + 1 - cur, extent_thresh,
                                newer_than, do_compress, &sectors_defragged,
                                max_to_defrag, &last_scanned);

                /* Only throttle when this cluster actually dirtied something */
                if (sectors_defragged > prev_sectors_defragged)
                        balance_dirty_pages_ratelimited(inode->i_mapping);

                btrfs_inode_unlock(BTRFS_I(inode), 0);
                if (ret < 0)
                        break;
                /* Resume after the cluster, or further if more was scanned */
                cur = max(cluster_end + 1, last_scanned);
                /* A positive value means the @max_to_defrag limit was hit */
                if (ret > 0) {
                        ret = 0;
                        break;
                }
                cond_resched();
        }

        /* Only free the readahead state if it was allocated here */
        if (ra_allocated)
                kfree(ra);
        /*
         * Update range.start for autodefrag, this will indicate where to start
         * in next run.
         */
        range->start = cur;
        if (sectors_defragged) {
                /*
                 * We have defragged some sectors, for compression case they
                 * need to be written back immediately.
                 */
                if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
                        filemap_flush(inode->i_mapping);
                        /* Flush a second time if async extents are pending */
                        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                                     &BTRFS_I(inode)->runtime_flags))
                                filemap_flush(inode->i_mapping);
                }
                if (range->compress_type == BTRFS_COMPRESS_LZO)
                        btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
                else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
                        btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
                /* Overrides any error recorded by the loop above */
                ret = sectors_defragged;
        }
        /* Reset the per-inode defrag compression setting under the lock */
        if (do_compress) {
                btrfs_inode_lock(BTRFS_I(inode), 0);
                BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
                btrfs_inode_unlock(BTRFS_I(inode), 0);
        }
        return ret;
}

/*
 * Destroy the slab cache created by btrfs_auto_defrag_init().
 */
void __cold btrfs_auto_defrag_exit(void)
{
        kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

/*
 * Create the "btrfs_inode_defrag" slab cache, sized for struct inode_defrag.
 *
 * Return 0 on success, -ENOMEM if the cache could not be created.
 */
int __init btrfs_auto_defrag_init(void)
{
        btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
                                        sizeof(struct inode_defrag), 0, 0, NULL);

        return btrfs_inode_defrag_cachep ? 0 : -ENOMEM;
}























    1 








    1 



























































































































    1 





    1 




    1 




































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/minix/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include "minix.h"

/*
 * Add a directory entry for @inode under @dentry's name and bind the two.
 *
 * On failure the inode's link count is decremented and the inode reference
 * is dropped, so the caller must not touch @inode afterwards.
 *
 * Return 0 on success or the error from minix_add_link().
 */
static int add_nondir(struct dentry *dentry, struct inode *inode)
{
        int rc = minix_add_link(dentry, inode);

        if (rc) {
                /* Could not link it in: undo the link count and reference */
                inode_dec_link_count(inode);
                iput(inode);
                return rc;
        }

        d_instantiate(dentry, inode);
        return 0;
}

/*
 * Look up @dentry's name in directory @dir.
 *
 * Names longer than the superblock's s_namelen are rejected with
 * -ENAMETOOLONG.  When no entry is found, a NULL inode is spliced in.
 */
static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags)
{
        struct inode *found = NULL;
        ino_t ino;

        if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen)
                return ERR_PTR(-ENAMETOOLONG);

        /* An inode number of 0 means the name does not exist */
        ino = minix_inode_by_name(dentry);
        if (ino != 0)
                found = minix_iget(dir->i_sb, ino);

        return d_splice_alias(found, dentry);
}

/*
 * Create a new non-directory inode of type @mode in @dir and link it in
 * under @dentry's name.  @idmap is not used.
 *
 * Return -EINVAL if @rdev fails old_valid_dev(), the error from
 * minix_new_inode() if allocation fails, otherwise the add_nondir() result.
 */
static int minix_mknod(struct mnt_idmap *idmap, struct inode *dir,
                       struct dentry *dentry, umode_t mode, dev_t rdev)
{
        struct inode *node;

        if (!old_valid_dev(rdev))
                return -EINVAL;

        node = minix_new_inode(dir, mode);
        if (!IS_ERR(node)) {
                minix_set_inode(node, rdev);
                mark_inode_dirty(node);
                return add_nondir(dentry, node);
        }

        return PTR_ERR(node);
}

/*
 * Create an unnamed temporary file in @dir and attach it to @file.
 * @idmap is not used.
 *
 * The result of minix_new_inode() (0 or its error) is handed to
 * finish_open_simple(), whose return value is returned.
 */
static int minix_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
                         struct file *file, umode_t mode)
{
        struct inode *tmp;
        int status = 0;

        tmp = minix_new_inode(dir, mode);
        if (IS_ERR(tmp)) {
                status = PTR_ERR(tmp);
        } else {
                minix_set_inode(tmp, 0);
                mark_inode_dirty(tmp);
                d_tmpfile(file, tmp);
        }

        return finish_open_simple(file, status);
}

/*
 * Create a regular file: implemented as minix_mknod() with a device number
 * of 0.  @idmap and @excl are not used here; nop_mnt_idmap is passed down
 * instead of @idmap.
 */
static int minix_create(struct mnt_idmap *idmap, struct inode *dir,
                        struct dentry *dentry, umode_t mode, bool excl)
{
        return minix_mknod(&nop_mnt_idmap, dir, dentry, mode, 0);
}

/*
 * Create a symbolic link named by @dentry in @dir pointing at @symname.
 * @idmap is not used.
 *
 * Return -ENAMETOOLONG if the target (including its terminating NUL) does
 * not fit in one filesystem block, the error from minix_new_inode() or
 * page_symlink() on failure, otherwise the add_nondir() result.
 */
static int minix_symlink(struct mnt_idmap *idmap, struct inode *dir,
                         struct dentry *dentry, const char *symname)
{
        /* Target length including the terminating NUL */
        int len = strlen(symname) + 1;
        struct inode *link;
        int err;

        if (len > dir->i_sb->s_blocksize)
                return -ENAMETOOLONG;

        link = minix_new_inode(dir, S_IFLNK | 0777);
        if (IS_ERR(link))
                return PTR_ERR(link);

        minix_set_inode(link, 0);
        err = page_symlink(link, symname, len);
        if (unlikely(err))
                goto drop_inode;

        return add_nondir(dentry, link);

drop_inode:
        /* Writing the target failed: release the freshly created inode */
        inode_dec_link_count(link);
        iput(link);
        return err;
}

/*
 * Create a hard link: give the inode behind @old_dentry an additional name,
 * @dentry.  @dir is not used directly.
 *
 * Return the add_nondir() result; on failure add_nondir() undoes the link
 * count increment and the reference taken here.
 */
static int minix_link(struct dentry * old_dentry, struct inode * dir,
        struct dentry *dentry)
{
        struct inode *target = d_inode(old_dentry);

        inode_set_ctime_current(target);
        /* One more name, and one more reference for the new dentry */
        inode_inc_link_count(target);
        ihold(target);

        return add_nondir(dentry, target);
}

/*
 * Create a directory named by @dentry inside @dir.  @idmap is not used.
 *
 * The parent's link count is increased and the new inode's link count is
 * increased once more on top of whatever minix_new_inode() set up.  Both
 * are rolled back if populating or linking the new directory fails.
 *
 * Return 0 on success or a negative error.
 */
static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                       struct dentry *dentry, umode_t mode)
{
        struct inode *child;
        int err;

        child = minix_new_inode(dir, S_IFDIR | mode);
        if (IS_ERR(child))
                return PTR_ERR(child);

        inode_inc_link_count(dir);
        minix_set_inode(child, 0);
        inode_inc_link_count(child);

        err = minix_make_empty(child, dir);
        if (!err)
                err = minix_add_link(dentry, child);
        if (!err) {
                d_instantiate(dentry, child);
                return 0;
        }

        /*
         * Failure: drop both links of the new directory (the one added
         * above and, presumably, the initial one from minix_new_inode()
         * -- TODO confirm), release it, then undo the parent's increment.
         */
        inode_dec_link_count(child);
        inode_dec_link_count(child);
        iput(child);
        inode_dec_link_count(dir);
        return err;
}

/*
 * Remove the directory entry for @dentry from @dir and drop one link from
 * the inode it referred to.
 *
 * Return -ENOENT if the entry cannot be found, the error from
 * minix_delete_entry() on failure, 0 on success.
 */
static int minix_unlink(struct inode * dir, struct dentry *dentry)
{
        struct inode *victim = d_inode(dentry);
        struct minix_dir_entry *de;
        struct page *page;
        int err;

        de = minix_find_entry(dentry, &page);
        if (!de)
                return -ENOENT;

        err = minix_delete_entry(de, page);
        /* The directory page is released whether or not the delete worked */
        unmap_and_put_page(page, de);
        if (!err) {
                /* Give the inode the same ctime as the directory */
                inode_set_ctime_to_ts(victim, inode_get_ctime(dir));
                inode_dec_link_count(victim);
        }

        return err;
}

/*
 * Remove the directory named by @dentry from @dir.
 *
 * Return -ENOTEMPTY if minix_empty_dir() says the directory is not empty,
 * the error from minix_unlink() on failure, 0 on success.
 */
static int minix_rmdir(struct inode * dir, struct dentry *dentry)
{
        struct inode *victim = d_inode(dentry);
        int err;

        if (!minix_empty_dir(victim))
                return -ENOTEMPTY;

        err = minix_unlink(dir, dentry);
        if (err)
                return err;

        /* Besides the entry itself, drop one more link on each side */
        inode_dec_link_count(dir);
        inode_dec_link_count(victim);
        return 0;
}

/*
 * Rename @old_dentry in @old_dir to @new_dentry in @new_dir.
 *
 * Only RENAME_NOREPLACE is accepted in @flags; anything else yields -EINVAL.
 * @idmap is not used.
 *
 * If the target name already exists its directory entry is redirected to the
 * source inode and the replaced inode loses a link (two when a directory is
 * being moved over it); otherwise a new entry is added.  The old entry is
 * then deleted and, when a directory is moved, its ".." entry is pointed at
 * @new_dir.
 *
 * Return 0 on success, otherwise a negative error:
 *   -ENOENT    the source entry, or an expected target entry, was not found
 *   -EIO       the ".." entry of a source directory could not be read
 *   -ENOTEMPTY a directory is being moved over a non-empty directory
 *   or the error of one of the directory modification helpers.
 */
static int minix_rename(struct mnt_idmap *idmap,
                        struct inode *old_dir, struct dentry *old_dentry,
                        struct inode *new_dir, struct dentry *new_dentry,
                        unsigned int flags)
{
        struct inode * old_inode = d_inode(old_dentry);
        struct inode * new_inode = d_inode(new_dentry);
        struct page * dir_page = NULL;
        struct minix_dir_entry * dir_de = NULL;
        struct page * old_page;
        struct minix_dir_entry * old_de;
        int err = -ENOENT;

        if (flags & ~RENAME_NOREPLACE)
                return -EINVAL;

        old_de = minix_find_entry(old_dentry, &old_page);
        if (!old_de)
                goto out;

        /* Moving a directory: its ".." entry will need updating later */
        if (S_ISDIR(old_inode->i_mode)) {
                err = -EIO;
                dir_de = minix_dotdot(old_inode, &dir_page);
                if (!dir_de)
                        goto out_old;
        }

        if (new_inode) {
                struct page * new_page;
                struct minix_dir_entry * new_de;

                /* A directory may only replace an empty directory */
                err = -ENOTEMPTY;
                if (dir_de && !minix_empty_dir(new_inode))
                        goto out_dir;

                err = -ENOENT;
                new_de = minix_find_entry(new_dentry, &new_page);
                if (!new_de)
                        goto out_dir;
                err = minix_set_link(new_de, new_page, old_inode);
                /*
                 * Release the page exactly like every other page returned
                 * by minix_find_entry() in this file; the previous
                 * kunmap() + put_page() pair used a different unmap
                 * primitive than the rest of the code.
                 */
                unmap_and_put_page(new_page, new_de);
                if (err)
                        goto out_dir;
                inode_set_ctime_current(new_inode);
                /* A replaced directory loses one extra link */
                if (dir_de)
                        drop_nlink(new_inode);
                inode_dec_link_count(new_inode);
        } else {
                err = minix_add_link(new_dentry, old_inode);
                if (err)
                        goto out_dir;
                /* The moved directory's ".." will reference new_dir */
                if (dir_de)
                        inode_inc_link_count(new_dir);
        }

        err = minix_delete_entry(old_de, old_page);
        if (err)
                goto out_dir;

        mark_inode_dirty(old_inode);

        if (dir_de) {
                /* Repoint ".." and drop the old parent's link on success */
                err = minix_set_link(dir_de, dir_page, new_dir);
                if (!err)
                        inode_dec_link_count(old_dir);
        }
out_dir:
        if (dir_de)
                unmap_and_put_page(dir_page, dir_de);
out_old:
        unmap_and_put_page(old_page, old_de);
out:
        return err;
}

/*
 * Inode operations for minix directories.
 */
const struct inode_operations minix_dir_inode_operations = {
        /* Name resolution and attributes */
        .lookup         = minix_lookup,
        .getattr        = minix_getattr,

        /* Creating new names */
        .create         = minix_create,
        .mknod          = minix_mknod,
        .mkdir          = minix_mkdir,
        .symlink        = minix_symlink,
        .link           = minix_link,
        .tmpfile        = minix_tmpfile,

        /* Removing and moving names */
        .unlink         = minix_unlink,
        .rmdir          = minix_rmdir,
        .rename         = minix_rename,
};


































































































































































































































































































































































































































































































































































































    3 
    1 











    3 


    2 
    1 

    3 



    2 

    2 


    2 

    2 








    2 

    2 








    2 



    2 



    2 














































    2 




























































    4 




    2 
    3 

    4 
    4 
    4 

    2 


    3 






    2 















    2 


    2 


    2 


    4 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008 IBM Corporation
 * Author: Mimi Zohar <zohar@us.ibm.com>
 *
 * ima_policy.c
 *        - initialize default measure policy rules
 */

#include <linux/init.h>
#include <linux/list.h>
#include <linux/kernel_read_file.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/parser.h>
#include <linux/slab.h>
#include <linux/rculist.h>
#include <linux/seq_file.h>
#include <linux/ima.h>

#include "ima.h"

/*
 * flags definitions
 *
 * Bits stored in ima_rule_entry.flags.  Each bit says that the matching
 * condition field of the rule is in use:
 *
 *   IMA_FUNC            - compare rule->func with the calling hook
 *   IMA_MASK            - rule->mask must equal the requested mask
 *   IMA_INMASK          - rule->mask must share at least one bit with the
 *                         requested mask
 *   IMA_FSMAGIC         - compare rule->fsmagic with the superblock magic
 *   IMA_FSNAME          - compare rule->fsname with the filesystem type name
 *   IMA_FSUUID          - compare rule->fsuuid with the superblock UUID
 *   IMA_UID / IMA_EUID  - apply rule->uid_op to the credential's uid / euid
 *   IMA_GID / IMA_EGID  - apply rule->gid_op to the credential's gid / egid
 *   IMA_FOWNER          - apply rule->fowner_op to the inode owner
 *   IMA_FGROUP          - apply rule->fgroup_op to the inode group
 *   IMA_PCR             - rule->pcr is reported to the caller on a match
 *   IMA_VALIDATE_ALGOS  - rule->allowed_algos is reported for appraise rules
 *
 * NOTE(review): IMA_KEYRINGS and IMA_LABEL are not tested in the matching
 * code reviewed here; presumably they mark rule->keyrings / rule->label as
 * set - confirm against the rule parser.
 */
#define IMA_FUNC        0x0001
#define IMA_MASK        0x0002
#define IMA_FSMAGIC        0x0004
#define IMA_UID                0x0008
#define IMA_FOWNER        0x0010
#define IMA_FSUUID        0x0020
#define IMA_INMASK        0x0040
#define IMA_EUID        0x0080
#define IMA_PCR                0x0100
#define IMA_FSNAME        0x0200
#define IMA_KEYRINGS        0x0400
#define IMA_LABEL        0x0800
#define IMA_VALIDATE_ALGOS        0x1000
#define IMA_GID                0x2000
#define IMA_EGID        0x4000
#define IMA_FGROUP        0x8000

/*
 * Rule action values, stored in ima_rule_entry.action.
 *
 * Every DONT_* value is its positive counterpart shifted left by one bit
 * (MEASURE/DONT_MEASURE, APPRAISE/DONT_APPRAISE, HASH/DONT_HASH).
 * ima_match_policy() depends on that layout when it builds its action mask
 * as "flags | (flags << 1)".  AUDIT has no DONT_* counterpart here.
 */
#define UNKNOWN                0
#define MEASURE                0x0001        /* same as IMA_MEASURE */
#define DONT_MEASURE        0x0002
#define APPRAISE        0x0004        /* same as IMA_APPRAISE */
#define DONT_APPRAISE        0x0008
#define AUDIT                0x0040
#define HASH                0x0100
#define DONT_HASH        0x0200

/*
 * True when @a cannot be used as a bit index into
 * ima_iint_cache.measured_pcrs: it is negative, or not smaller than the
 * width of that field in bits.  Note that @a is evaluated twice.
 */
#define INVALID_PCR(a) (((a) < 0) || \
        (a) >= (sizeof_field(struct ima_iint_cache, measured_pcrs) * 8))

/*
 * Global policy state.  None of these variables is read or written in the
 * part of the file reviewed for these comments, so their exact roles are
 * deliberately not described here.
 * NOTE(review): ima_policy_flag presumably caches the actions present in
 * the active policy, and temp_/build_ima_appraise the appraise sub-actions
 * of pending / built-in rules - confirm against their users.
 */
int ima_policy_flag;
static int temp_ima_appraise;
static int build_ima_appraise __ro_after_init;

/* NOTE(review): not referenced in the reviewed part of the file. */
atomic_t ima_setxattr_allowed_hash_algorithms;

/* Number of LSM condition slots per rule; one per enum lsm_rule_types value. */
#define MAX_LSM_RULES 6
/*
 * Index into ima_rule_entry.lsm[].  In ima_match_rules() the LSM_OBJ_*
 * slots are matched against the inode's secid and the LSM_SUBJ_* slots
 * against the secid passed in by the caller.
 */
enum lsm_rule_types { LSM_OBJ_USER, LSM_OBJ_ROLE, LSM_OBJ_TYPE,
        LSM_SUBJ_USER, LSM_SUBJ_ROLE, LSM_SUBJ_TYPE
};

/*
 * Values for ima_policy: ORIGINAL_TCB is selected by the "ima_tcb" boot
 * option, DEFAULT_TCB by the "tcb" token of "ima_policy=".
 */
enum policy_types { ORIGINAL_TCB = 1, DEFAULT_TCB };

/* NOTE(review): not used in the reviewed part of the file. */
enum policy_rule_list { IMA_DEFAULT_POLICY = 1, IMA_CUSTOM_POLICY };

/*
 * A list of option strings split out of one '|' separated rule value.
 *
 * As built by ima_alloc_rule_opt_list(), all strings live in one
 * allocation whose start is items[0]; the other entries point into that
 * same buffer and must not be freed individually.
 */
struct ima_rule_opt_list {
        size_t count;        /* number of valid entries in items[] */
        char *items[] __counted_by(count);
};

/*
 * These comparators are needed nowhere outside of ima so just define them here.
 * This pattern should hopefully never be needed outside of ima.
 */
/* True when the raw value of @vfsuid is strictly above the raw value of @kuid. */
static inline bool vfsuid_gt_kuid(vfsuid_t vfsuid, kuid_t kuid)
{
        return __kuid_val(kuid) < __vfsuid_val(vfsuid);
}

/* True when the raw value of @vfsgid is strictly above the raw value of @kgid. */
static inline bool vfsgid_gt_kgid(vfsgid_t vfsgid, kgid_t kgid)
{
        return __kgid_val(kgid) < __vfsgid_val(vfsgid);
}

/* True when the raw value of @vfsuid is strictly below the raw value of @kuid. */
static inline bool vfsuid_lt_kuid(vfsuid_t vfsuid, kuid_t kuid)
{
        return __kuid_val(kuid) > __vfsuid_val(vfsuid);
}

/* True when the raw value of @vfsgid is strictly below the raw value of @kgid. */
static inline bool vfsgid_lt_kgid(vfsgid_t vfsgid, kgid_t kgid)
{
        return __kgid_val(kgid) > __vfsgid_val(vfsgid);
}

/*
 * One policy rule.
 *
 * @action holds one of the MEASURE/DONT_MEASURE/... values and @flags says
 * which of the condition fields below take part in matching (see the IMA_*
 * flag bits).  A condition field is only meaningful when its flag is set.
 */
struct ima_rule_entry {
        struct list_head list;
        int action;
        unsigned int flags;
        enum ima_hooks func;
        int mask;
        unsigned long fsmagic;
        uuid_t fsuuid;
        kuid_t uid;
        kgid_t gid;
        kuid_t fowner;
        kgid_t fgroup;
        bool (*uid_op)(kuid_t cred_uid, kuid_t rule_uid);    /* Handlers for operators       */
        bool (*gid_op)(kgid_t cred_gid, kgid_t rule_gid);
        bool (*fowner_op)(vfsuid_t vfsuid, kuid_t rule_uid); /* vfsuid_eq_kuid(), vfsuid_gt_kuid(), vfsuid_lt_kuid() */
        bool (*fgroup_op)(vfsgid_t vfsgid, kgid_t rule_gid); /* vfsgid_eq_kgid(), vfsgid_gt_kgid(), vfsgid_lt_kgid() */
        int pcr;
        unsigned int allowed_algos; /* bitfield of allowed hash algorithms */
        /*
         * LSM conditions, indexed by enum lsm_rule_types.  A slot with
         * args_p set but rule == NULL makes the whole rule fail to match
         * in ima_match_rules().
         */
        struct {
                void *rule;        /* LSM file metadata specific */
                char *args_p;        /* audit value */
                int type;        /* audit type */
        } lsm[MAX_LSM_RULES];
        char *fsname;
        struct ima_rule_opt_list *keyrings; /* Measure keys added to these keyrings */
        struct ima_rule_opt_list *label; /* Measure data grouped under this label */
        /* Not owned by the rule; ima_free_rule() leaves it alone. */
        struct ima_template_desc *template;
};

/*
 * Sanity check in case the kernel gains more hash algorithms than can
 * fit in an unsigned int: ima_rule_entry.allowed_algos needs one bit per
 * algorithm.
 */
static_assert(
        8 * sizeof(unsigned int) >= HASH_ALGO__LAST,
        "The bitfield allowed_algos in ima_rule_entry is too small to contain all the supported hash algorithms, consider using a bigger type");

/*
 * Without LSM specific knowledge, the default policy can only be
 * written in terms of .action, .func, .mask, .fsmagic, .uid, .gid,
 * .fowner, and .fgroup
 */

/*
 * The minimum rule set to allow for full TCB coverage.  Measures all files
 * opened or mmap for exec and everything read by root.  Dangerous because
 * normal users can easily run the machine out of memory simply building
 * and running executables.
 */
/*
 * Filesystems that are never measured.  Every entry is a DONT_MEASURE
 * rule keyed only on the superblock magic number (IMA_FSMAGIC).
 */
static struct ima_rule_entry dont_measure_rules[] __ro_after_init = {
        {.action = DONT_MEASURE, .fsmagic = PROC_SUPER_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = SYSFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = DEBUGFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = TMPFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = DEVPTS_SUPER_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = BINFMTFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = SECURITYFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = SELINUX_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = SMACK_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = CGROUP_SUPER_MAGIC,
         .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = CGROUP2_SUPER_MAGIC,
         .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = NSFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE, .fsmagic = EFIVARFS_MAGIC, .flags = IMA_FSMAGIC}
};

/*
 * Measurement rules of the original TCB policy: files mmapped or executed
 * with MAY_EXEC, files opened with a mask of exactly MAY_READ by uid 0,
 * kernel modules and firmware.
 * NOTE(review): presumably installed when ima_policy == ORIGINAL_TCB; the
 * code that selects the table is outside the reviewed part of the file.
 */
static struct ima_rule_entry original_measurement_rules[] __ro_after_init = {
        {.action = MEASURE, .func = MMAP_CHECK, .mask = MAY_EXEC,
         .flags = IMA_FUNC | IMA_MASK},
        {.action = MEASURE, .func = BPRM_CHECK, .mask = MAY_EXEC,
         .flags = IMA_FUNC | IMA_MASK},
        {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ,
         .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq,
         .flags = IMA_FUNC | IMA_MASK | IMA_UID},
        {.action = MEASURE, .func = MODULE_CHECK, .flags = IMA_FUNC},
        {.action = MEASURE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC},
};

/*
 * Measurement rules of the default TCB policy.  Compared with
 * original_measurement_rules, the root read rules use IMA_INMASK (MAY_READ
 * only has to be contained in the requested mask) and exist once for the
 * effective uid (IMA_EUID) and once for the real uid (IMA_UID); policy
 * files (POLICY_CHECK) are measured as well.
 * NOTE(review): presumably installed when ima_policy == DEFAULT_TCB; the
 * code that selects the table is outside the reviewed part of the file.
 */
static struct ima_rule_entry default_measurement_rules[] __ro_after_init = {
        {.action = MEASURE, .func = MMAP_CHECK, .mask = MAY_EXEC,
         .flags = IMA_FUNC | IMA_MASK},
        {.action = MEASURE, .func = BPRM_CHECK, .mask = MAY_EXEC,
         .flags = IMA_FUNC | IMA_MASK},
        {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ,
         .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq,
         .flags = IMA_FUNC | IMA_INMASK | IMA_EUID},
        {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ,
         .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq,
         .flags = IMA_FUNC | IMA_INMASK | IMA_UID},
        {.action = MEASURE, .func = MODULE_CHECK, .flags = IMA_FUNC},
        {.action = MEASURE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC},
        {.action = MEASURE, .func = POLICY_CHECK, .flags = IMA_FUNC},
};

/*
 * Built-in appraisal rules: a list of filesystems exempt from appraisal
 * (matched by superblock magic), followed by the rules that do appraise.
 * Policy files must carry a signature when CONFIG_IMA_WRITE_POLICY is set,
 * and every file owned by uid 0 is appraised; with
 * CONFIG_IMA_APPRAISE_SIGNED_INIT those root-owned files additionally
 * require a signature.
 */
static struct ima_rule_entry default_appraise_rules[] __ro_after_init = {
        {.action = DONT_APPRAISE, .fsmagic = PROC_SUPER_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = SYSFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = DEBUGFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = TMPFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = RAMFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = DEVPTS_SUPER_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = BINFMTFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = SECURITYFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = SELINUX_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = SMACK_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = NSFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = EFIVARFS_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = CGROUP_SUPER_MAGIC, .flags = IMA_FSMAGIC},
        {.action = DONT_APPRAISE, .fsmagic = CGROUP2_SUPER_MAGIC, .flags = IMA_FSMAGIC},
#ifdef CONFIG_IMA_WRITE_POLICY
        {.action = APPRAISE, .func = POLICY_CHECK,
        .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
#endif
#ifndef CONFIG_IMA_APPRAISE_SIGNED_INIT
        {.action = APPRAISE, .fowner = GLOBAL_ROOT_UID, .fowner_op = &vfsuid_eq_kuid,
         .flags = IMA_FOWNER},
#else
        /* force signature */
        {.action = APPRAISE, .fowner = GLOBAL_ROOT_UID, .fowner_op = &vfsuid_eq_kuid,
         .flags = IMA_FOWNER | IMA_DIGSIG_REQUIRED},
#endif
};

/*
 * Appraisal rules selected at build time.  Each
 * CONFIG_IMA_APPRAISE_REQUIRE_*_SIGS option contributes one rule that
 * requires a signature for the corresponding hook; with none of the
 * options enabled the table is empty.
 */
static struct ima_rule_entry build_appraise_rules[] __ro_after_init = {
#ifdef CONFIG_IMA_APPRAISE_REQUIRE_MODULE_SIGS
        {.action = APPRAISE, .func = MODULE_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
#endif
#ifdef CONFIG_IMA_APPRAISE_REQUIRE_FIRMWARE_SIGS
        {.action = APPRAISE, .func = FIRMWARE_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
#endif
#ifdef CONFIG_IMA_APPRAISE_REQUIRE_KEXEC_SIGS
        {.action = APPRAISE, .func = KEXEC_KERNEL_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
#endif
#ifdef CONFIG_IMA_APPRAISE_REQUIRE_POLICY_SIGS
        {.action = APPRAISE, .func = POLICY_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
#endif
};

/*
 * Signature-required appraisal rules for kernel modules, firmware, kexec
 * kernel images and policy files.
 * NOTE(review): presumably installed for the "secure_boot" token of
 * "ima_policy="; the code that selects the table is outside the reviewed
 * part of the file.
 */
static struct ima_rule_entry secure_boot_rules[] __ro_after_init = {
        {.action = APPRAISE, .func = MODULE_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
        {.action = APPRAISE, .func = FIRMWARE_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
        {.action = APPRAISE, .func = KEXEC_KERNEL_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
        {.action = APPRAISE, .func = POLICY_CHECK,
         .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
};

/*
 * Single rule that measures all CRITICAL_DATA events; it carries no label
 * list, so ima_match_rule_data() accepts every label.
 */
static struct ima_rule_entry critical_data_rules[] __ro_after_init = {
        {.action = MEASURE, .func = CRITICAL_DATA, .flags = IMA_FUNC},
};

/* An array of architecture specific rules */
static struct ima_rule_entry *arch_policy_entry __ro_after_init;

/*
 * Rule lists.  ima_rules is the RCU-protected pointer that
 * ima_match_policy() walks; it starts out pointing at ima_default_rules.
 * ima_lsm_update_rules() refreshes the entries on ima_policy_rules.
 * NOTE(review): ima_temp_rules is not used in the reviewed part of the
 * file; presumably it stages rules of a policy that is being loaded.
 */
static LIST_HEAD(ima_default_rules);
static LIST_HEAD(ima_policy_rules);
static LIST_HEAD(ima_temp_rules);
static struct list_head __rcu *ima_rules = (struct list_head __rcu *)(&ima_default_rules);

/* Built-in measurement policy chosen on the command line (enum policy_types). */
static int ima_policy __initdata;

/*
 * Handler for the "ima_tcb" boot option: select the original TCB
 * measurement policy unless a measurement policy was already chosen.
 * Always reports the option as handled.
 */
static int __init default_measure_policy_setup(char *str)
{
        if (!ima_policy)
                ima_policy = ORIGINAL_TCB;

        return 1;
}
__setup("ima_tcb", default_measure_policy_setup);

/*
 * Boot-time selections made by policy_setup() ("ima_policy=" tokens
 * appraise_tcb, secure_boot, critical_data and fail_securely).
 * ima_fail_unverifiable_sigs outlives init: ima_match_policy() reads it to
 * add IMA_FAIL_UNVERIFIABLE_SIGS to appraise decisions.
 */
static bool ima_use_appraise_tcb __initdata;
static bool ima_use_secure_boot __initdata;
static bool ima_use_critical_data __initdata;
static bool ima_fail_unverifiable_sigs __ro_after_init;
/*
 * Handler for the "ima_policy=" boot option.
 *
 * @str is split on ' ', '|' and '\n'.  Recognised tokens:
 *   tcb            - select DEFAULT_TCB unless a measurement policy has
 *                    already been selected
 *   appraise_tcb   - set ima_use_appraise_tcb
 *   secure_boot    - set ima_use_secure_boot
 *   critical_data  - set ima_use_critical_data
 *   fail_securely  - set ima_fail_unverifiable_sigs
 * Anything else is logged as an unknown policy.
 *
 * Always returns 1 (option handled).
 */
static int __init policy_setup(char *str)
{
        char *p;

        while ((p = strsep(&str, " |\n")) != NULL) {
                /*
                 * strsep() never returns a token containing a separator,
                 * but adjacent or trailing separators yield empty tokens;
                 * skip those instead of reporting them as unknown policies.
                 */
                if (*p == '\0')
                        continue;

                if (strcmp(p, "tcb") == 0) {
                        /*
                         * A measurement policy chosen earlier wins, but
                         * "tcb" is still a known token and must not be
                         * reported as "not found".
                         */
                        if (!ima_policy)
                                ima_policy = DEFAULT_TCB;
                } else if (strcmp(p, "appraise_tcb") == 0) {
                        ima_use_appraise_tcb = true;
                } else if (strcmp(p, "secure_boot") == 0) {
                        ima_use_secure_boot = true;
                } else if (strcmp(p, "critical_data") == 0) {
                        ima_use_critical_data = true;
                } else if (strcmp(p, "fail_securely") == 0) {
                        ima_fail_unverifiable_sigs = true;
                } else {
                        pr_err("policy \"%s\" not found\n", p);
                }
        }

        return 1;
}
__setup("ima_policy=", policy_setup);

/*
 * Handler for the "ima_appraise_tcb" boot option: request the built-in
 * appraise policy.  The option's value is ignored.
 */
static int __init default_appraise_policy_setup(char *str)
{
        ima_use_appraise_tcb = true;

        return 1;
}
__setup("ima_appraise_tcb", default_appraise_policy_setup);

static struct ima_rule_opt_list *ima_alloc_rule_opt_list(const substring_t *src)
{
        struct ima_rule_opt_list *opt_list;
        size_t count = 0;
        char *src_copy;
        char *cur, *next;
        size_t i;

        src_copy = match_strdup(src);
        if (!src_copy)
                return ERR_PTR(-ENOMEM);

        next = src_copy;
        while ((cur = strsep(&next, "|"))) {
                /* Don't accept an empty list item */
                if (!(*cur)) {
                        kfree(src_copy);
                        return ERR_PTR(-EINVAL);
                }
                count++;
        }

        /* Don't accept an empty list */
        if (!count) {
                kfree(src_copy);
                return ERR_PTR(-EINVAL);
        }

        opt_list = kzalloc(struct_size(opt_list, items, count), GFP_KERNEL);
        if (!opt_list) {
                kfree(src_copy);
                return ERR_PTR(-ENOMEM);
        }
        opt_list->count = count;

        /*
         * strsep() has already replaced all instances of '|' with '\0',
         * leaving a byte sequence of NUL-terminated strings. Reference each
         * string with the array of items.
         *
         * IMPORTANT: Ownership of the allocated buffer is transferred from
         * src_copy to the first element in the items array. To free the
         * buffer, kfree() must only be called on the first element of the
         * array.
         */
        for (i = 0, cur = src_copy; i < count; i++) {
                opt_list->items[i] = cur;
                cur = strchr(cur, '\0') + 1;
        }

        return opt_list;
}

/*
 * Release an option list created by ima_alloc_rule_opt_list().
 * A NULL @opt_list is accepted and ignored.
 */
static void ima_free_rule_opt_list(struct ima_rule_opt_list *opt_list)
{
        /* The string buffer is owned through the first item only. */
        if (opt_list && opt_list->count) {
                kfree(opt_list->items[0]);
                opt_list->count = 0;
        }

        /* kfree(NULL) is a no-op, so no separate NULL check is needed. */
        kfree(opt_list);
}

/*
 * Release, for every LSM slot of @entry, both the LSM rule and the
 * argument string it was built from.
 */
static void ima_lsm_free_rule(struct ima_rule_entry *entry)
{
        int slot = 0;

        while (slot < MAX_LSM_RULES) {
                ima_filter_rule_free(entry->lsm[slot].rule);
                kfree(entry->lsm[slot].args_p);
                slot++;
        }
}

/*
 * Free a rule together with everything it owns: the filesystem name, the
 * keyrings and label option lists, the LSM rules and their argument
 * strings.  A NULL @entry is accepted and ignored.
 */
static void ima_free_rule(struct ima_rule_entry *entry)
{
        if (!entry)
                return;

        /*
         * entry->template->fields may be allocated in ima_parse_rule() but that
         * reference is owned by the corresponding ima_template_desc element in
         * the defined_templates list and cannot be freed here
         */
        kfree(entry->fsname);
        ima_free_rule_opt_list(entry->keyrings);
        /*
         * entry->label is the same kind of option list as entry->keyrings
         * and was previously leaked here.  ima_free_rule_opt_list()
         * tolerates NULL, so rules without a label are unaffected.
         * NOTE(review): assumes the parser allocates label with
         * ima_alloc_rule_opt_list(), like keyrings - confirm.
         */
        ima_free_rule_opt_list(entry->label);
        ima_lsm_free_rule(entry);
        kfree(entry);
}

/*
 * Duplicate @entry and rebuild its LSM rules.
 *
 * @entry: rule to copy
 * @gfp:   allocation flags for the copy and for the LSM rule setup
 *
 * Everything except the LSM rules is a shallow copy: pointers (including
 * the lsm[].args_p strings) are shared with @entry.  Each slot that has an
 * argument string gets a newly initialised LSM rule; a slot whose rule
 * could not be set up is left NULL and a warning is logged.
 *
 * Returns the copy, or NULL if it could not be allocated.
 */
static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry,
                                                gfp_t gfp)
{
        struct ima_rule_entry *copy;
        int idx;

        copy = kmemdup(entry, sizeof(*copy), gfp);
        if (!copy)
                return NULL;

        /* Start from empty LSM slots; only used ones are filled in below. */
        memset(copy->lsm, 0, sizeof(copy->lsm));

        for (idx = 0; idx < MAX_LSM_RULES; idx++) {
                char *args = entry->lsm[idx].args_p;

                if (args) {
                        copy->lsm[idx].type = entry->lsm[idx].type;
                        copy->lsm[idx].args_p = args;

                        ima_filter_rule_init(copy->lsm[idx].type, Audit_equal,
                                             copy->lsm[idx].args_p,
                                             &copy->lsm[idx].rule,
                                             gfp);
                        if (!copy->lsm[idx].rule)
                                pr_warn("rule for LSM \'%s\' is undefined\n",
                                        copy->lsm[idx].args_p);
                }
        }

        return copy;
}

/*
 * Replace @entry on its list with a copy whose LSM rules have been
 * re-initialised, then dispose of @entry.
 *
 * Returns 0 on success or -ENOMEM if the copy could not be allocated, in
 * which case @entry is left untouched.
 */
static int ima_lsm_update_rule(struct ima_rule_entry *entry)
{
        struct ima_rule_entry *fresh = ima_lsm_copy_rule(entry, GFP_KERNEL);
        int slot;

        if (!fresh)
                return -ENOMEM;

        /* Publish the copy and wait until no reader can still see @entry. */
        list_replace_rcu(&entry->list, &fresh->list);
        synchronize_rcu();

        /*
         * The copy shares every pointer of @entry except the LSM rules, so
         * only those rules and the old structure itself are released here;
         * everything else now belongs to the copy.
         */
        for (slot = 0; slot < MAX_LSM_RULES; slot++)
                ima_filter_rule_free(entry->lsm[slot].rule);
        kfree(entry);

        return 0;
}

static bool ima_rule_contains_lsm_cond(struct ima_rule_entry *entry)
{
        int i;

        for (i = 0; i < MAX_LSM_RULES; i++)
                if (entry->lsm[i].args_p)
                        return true;

        return false;
}

/*
 * After an LSM policy reload the LSM rules held by IMA refer to the old,
 * stale LSM policy.  Walk ima_policy_rules and rebuild every rule that has
 * an LSM condition.  The walk stops at the first rule that cannot be
 * rebuilt; the error is logged and later rules are left as they were.
 */
static void ima_lsm_update_rules(void)
{
        struct ima_rule_entry *cur, *next;

        list_for_each_entry_safe(cur, next, &ima_policy_rules, list) {
                int err;

                if (!ima_rule_contains_lsm_cond(cur))
                        continue;

                err = ima_lsm_update_rule(cur);
                if (err) {
                        pr_err("lsm rule update error %d\n", err);
                        return;
                }
        }
}

/*
 * Notifier callback: on LSM_POLICY_CHANGE refresh the LSM based IMA rules
 * and return NOTIFY_OK; every other event is ignored with NOTIFY_DONE.
 * @nb and @lsm_data are unused.
 */
int ima_lsm_policy_change(struct notifier_block *nb, unsigned long event,
                          void *lsm_data)
{
        if (event == LSM_POLICY_CHANGE) {
                ima_lsm_update_rules();
                return NOTIFY_OK;
        }

        return NOTIFY_DONE;
}

/**
 * ima_match_rule_data - match hook specific data against a rule's list
 * @rule: a pointer to a rule
 * @func_data: data to look up in the rule's keyrings or label list
 * @cred: a pointer to a credentials structure for user validation
 *
 * Only KEY_CHECK rules (list: keyrings) and CRITICAL_DATA rules (list:
 * label) can match.  A rule with a uid condition must first match
 * @cred->uid.  A rule without a list matches any @func_data; otherwise
 * @func_data must be non-NULL and equal to one of the list items.
 *
 * Returns true if func_data matches one in the rule, false otherwise.
 */
static bool ima_match_rule_data(struct ima_rule_entry *rule,
                                const char *func_data,
                                const struct cred *cred)
{
        const struct ima_rule_opt_list *wanted;
        size_t idx;

        if ((rule->flags & IMA_UID) && !rule->uid_op(cred->uid, rule->uid))
                return false;

        switch (rule->func) {
        case KEY_CHECK:
                wanted = rule->keyrings;
                break;
        case CRITICAL_DATA:
                wanted = rule->label;
                break;
        default:
                return false;
        }

        /* No list on the rule: everything for this hook matches. */
        if (!wanted)
                return true;

        if (!func_data)
                return false;

        for (idx = 0; idx < wanted->count; idx++) {
                if (strcmp(wanted->items[idx], func_data) == 0)
                        return true;
        }

        return false;
}

/**
 * ima_match_rules - determine whether an inode matches the policy rule.
 * @rule: a pointer to a rule
 * @idmap: idmap of the mount the inode was found from
 * @inode: a pointer to an inode
 * @cred: a pointer to a credentials structure for user validation
 * @secid: the secid of the task to be validated
 * @func: LIM hook identifier
 * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
 * @func_data: func specific data, may be NULL
 *
 * Every condition enabled in @rule->flags and every configured LSM
 * condition has to hold.  For POST_SETATTR the func and mask conditions
 * are not enforced.  KEY_CHECK and CRITICAL_DATA are decided by
 * ima_match_rule_data() alone.
 *
 * If the LSM reports a rule as stale (-ESTALE), a temporary copy of @rule
 * with re-initialised LSM rules is used for the rest of the match and is
 * freed before returning.  If that copy cannot be allocated the rule is
 * treated as not matching.
 *
 * Returns true on rule match, false on failure.
 */
static bool ima_match_rules(struct ima_rule_entry *rule,
                            struct mnt_idmap *idmap,
                            struct inode *inode, const struct cred *cred,
                            u32 secid, enum ima_hooks func, int mask,
                            const char *func_data)
{
        int i;
        bool result = false;
        struct ima_rule_entry *lsm_rule = rule;
        bool rule_reinitialized = false;

        if ((rule->flags & IMA_FUNC) &&
            (rule->func != func && func != POST_SETATTR))
                return false;

        switch (func) {
        case KEY_CHECK:
        case CRITICAL_DATA:
                return ((rule->func == func) &&
                        ima_match_rule_data(rule, func_data, cred));
        default:
                break;
        }

        if ((rule->flags & IMA_MASK) &&
            (rule->mask != mask && func != POST_SETATTR))
                return false;
        if ((rule->flags & IMA_INMASK) &&
            (!(rule->mask & mask) && func != POST_SETATTR))
                return false;
        if ((rule->flags & IMA_FSMAGIC)
            && rule->fsmagic != inode->i_sb->s_magic)
                return false;
        if ((rule->flags & IMA_FSNAME)
            && strcmp(rule->fsname, inode->i_sb->s_type->name))
                return false;
        if ((rule->flags & IMA_FSUUID) &&
            !uuid_equal(&rule->fsuuid, &inode->i_sb->s_uuid))
                return false;
        if ((rule->flags & IMA_UID) && !rule->uid_op(cred->uid, rule->uid))
                return false;
        if (rule->flags & IMA_EUID) {
                /*
                 * A task holding CAP_SETUID matches if any of its
                 * effective, saved or real uid satisfies the rule;
                 * otherwise only the effective uid is considered.
                 */
                if (has_capability_noaudit(current, CAP_SETUID)) {
                        if (!rule->uid_op(cred->euid, rule->uid)
                            && !rule->uid_op(cred->suid, rule->uid)
                            && !rule->uid_op(cred->uid, rule->uid))
                                return false;
                } else if (!rule->uid_op(cred->euid, rule->uid))
                        return false;
        }
        if ((rule->flags & IMA_GID) && !rule->gid_op(cred->gid, rule->gid))
                return false;
        if (rule->flags & IMA_EGID) {
                /* Same scheme as IMA_EUID, for gids and CAP_SETGID. */
                if (has_capability_noaudit(current, CAP_SETGID)) {
                        if (!rule->gid_op(cred->egid, rule->gid)
                            && !rule->gid_op(cred->sgid, rule->gid)
                            && !rule->gid_op(cred->gid, rule->gid))
                                return false;
                } else if (!rule->gid_op(cred->egid, rule->gid))
                        return false;
        }
        if ((rule->flags & IMA_FOWNER) &&
            !rule->fowner_op(i_uid_into_vfsuid(idmap, inode),
                             rule->fowner))
                return false;
        if ((rule->flags & IMA_FGROUP) &&
            !rule->fgroup_op(i_gid_into_vfsgid(idmap, inode),
                             rule->fgroup))
                return false;
        for (i = 0; i < MAX_LSM_RULES; i++) {
                int rc = 0;
                u32 osid;

                /*
                 * An unused slot is skipped.  A slot that was configured
                 * (args_p set) but has no usable LSM rule can never match.
                 */
                if (!lsm_rule->lsm[i].rule) {
                        if (!lsm_rule->lsm[i].args_p)
                                continue;
                        else
                                return false;
                }

retry:
                switch (i) {
                case LSM_OBJ_USER:
                case LSM_OBJ_ROLE:
                case LSM_OBJ_TYPE:
                        security_inode_getsecid(inode, &osid);
                        rc = ima_filter_rule_match(osid, lsm_rule->lsm[i].type,
                                                   Audit_equal,
                                                   lsm_rule->lsm[i].rule);
                        break;
                case LSM_SUBJ_USER:
                case LSM_SUBJ_ROLE:
                case LSM_SUBJ_TYPE:
                        rc = ima_filter_rule_match(secid, lsm_rule->lsm[i].type,
                                                   Audit_equal,
                                                   lsm_rule->lsm[i].rule);
                        break;
                default:
                        break;
                }

                if (rc == -ESTALE && !rule_reinitialized) {
                        lsm_rule = ima_lsm_copy_rule(rule, GFP_ATOMIC);
                        if (lsm_rule) {
                                rule_reinitialized = true;
                                goto retry;
                        }
                        /*
                         * The copy could not be allocated and lsm_rule is
                         * NULL now.  Carrying on would dereference it in
                         * the next iteration, so report "no match".  The
                         * cleanup below is skipped because
                         * rule_reinitialized is still false.
                         */
                        result = false;
                        goto out;
                }
                /*
                 * Zero means the LSM condition did not match.
                 * NOTE(review): any other value, including a negative
                 * error returned after the retry, still counts as a match
                 * here, as it did before this change.
                 */
                if (!rc) {
                        result = false;
                        goto out;
                }
        }
        result = true;

out:
        /* Only the temporary copy's LSM rules are owned by the copy. */
        if (rule_reinitialized) {
                for (i = 0; i < MAX_LSM_RULES; i++)
                        ima_filter_rule_free(lsm_rule->lsm[i].rule);
                kfree(lsm_rule);
        }
        return result;
}

/*
 * Knowing that a file must be appraised is not enough: hook specific
 * rules need to know which hook asked.  Map @func to the matching
 * IMA_*_APPRAISE sub-action.  A rule without a func condition always
 * yields IMA_FILE_APPRAISE; hooks not listed below yield
 * IMA_READ_APPRAISE.
 */
static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func)
{
        int subaction = IMA_READ_APPRAISE;

        if (!(rule->flags & IMA_FUNC))
                return IMA_FILE_APPRAISE;

        switch (func) {
        case MMAP_CHECK:
        case MMAP_CHECK_REQPROT:
                subaction = IMA_MMAP_APPRAISE;
                break;
        case BPRM_CHECK:
                subaction = IMA_BPRM_APPRAISE;
                break;
        case CREDS_CHECK:
                subaction = IMA_CREDS_APPRAISE;
                break;
        case FILE_CHECK:
        case POST_SETATTR:
                subaction = IMA_FILE_APPRAISE;
                break;
        default:
                /* MODULE_CHECK and every later hook: read appraisal. */
                break;
        }

        return subaction;
}

/**
 * ima_match_policy - decision based on LSM and other conditions
 * @idmap: idmap of the mount the inode was found from
 * @inode: pointer to an inode for which the policy decision is being made
 * @cred: pointer to a credentials structure for which the policy decision is
 *        being made
 * @secid: LSM secid of the task to be validated
 * @func: IMA hook identifier
 * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
 * @flags: IMA actions to consider (e.g. IMA_MEASURE | IMA_APPRAISE)
 * @pcr: set the pcr to extend
 * @template_desc: the template that should be used for this rule
 * @func_data: func specific data, may be NULL
 * @allowed_algos: allowlist of hash algorithms for the IMA xattr
 *
 * Measure decision based on func/mask/fsmagic and LSM(subj/obj/type)
 * conditions.
 *
 * Since the IMA policy may be updated multiple times we need to lock the
 * list when walking it.  Reads are many orders of magnitude more numerous
 * than writes so ima_match_policy() is classical RCU candidate.
 */
int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode,
                     const struct cred *cred, u32 secid, enum ima_hooks func,
                     int mask, int flags, int *pcr,
                     struct ima_template_desc **template_desc,
                     const char *func_data, unsigned int *allowed_algos)
{
        struct list_head *rule_list;
        struct ima_rule_entry *rule;
        /*
         * Actions still of interest: the requested bits plus the bits one
         * position above them (presumably the matching dont_* actions).
         */
        int actmask = flags | (flags << 1);
        int action = 0;

        /* Use the current default template when the caller supplied none. */
        if (template_desc && !*template_desc)
                *template_desc = ima_template_desc_current();

        rcu_read_lock();
        rule_list = rcu_dereference(ima_rules);
        list_for_each_entry_rcu(rule, rule_list, list) {
                /*
                 * Ignore rules whose action has already been decided and
                 * rules that do not match this request.
                 */
                if (!(rule->action & actmask) ||
                    !ima_match_rules(rule, idmap, inode, cred, secid,
                                     func, mask, func_data))
                        continue;

                action |= (rule->flags & IMA_NONACTION_FLAGS) |
                          (rule->action & IMA_DO_MASK);

                if (rule->action & IMA_APPRAISE) {
                        action |= get_subaction(rule, func);
                        action &= ~IMA_HASH;

                        if (ima_fail_unverifiable_sigs)
                                action |= IMA_FAIL_UNVERIFIABLE_SIGS;

                        if (allowed_algos &&
                            (rule->flags & IMA_VALIDATE_ALGOS))
                                *allowed_algos = rule->allowed_algos;
                }

                /*
                 * The first matching rule settles its action: drop both the
                 * action bit and its neighbouring counterpart from the mask.
                 */
                if (rule->action & IMA_DO_MASK)
                        actmask &= ~(rule->action | (rule->action << 1));
                else
                        actmask &= ~(rule->action | (rule->action >> 1));

                if (pcr && (rule->flags & IMA_PCR))
                        *pcr = rule->pcr;

                if (template_desc && rule->template)
                        *template_desc = rule->template;

                /* Nothing left to decide. */
                if (!actmask)
                        break;
        }
        rcu_read_unlock();

        return action;
}

/**
 * ima_update_policy_flags() - Update global IMA variables
 *
 * Update ima_policy_flag and ima_setxattr_allowed_hash_algorithms
 * based on the currently loaded policy.
 *
 * With ima_policy_flag, the decision to short circuit out of a function
 * or not call the function in the first place can be made earlier.
 *
 * With ima_setxattr_allowed_hash_algorithms, the policy can restrict the
 * set of hash algorithms accepted when updating the security.ima xattr of
 * a file.
 *
 * Context: called after a policy update and at system initialization.
 */
void ima_update_policy_flags(void)
{
        struct ima_rule_entry *entry;
        int new_policy_flag = 0;
        struct list_head *ima_rules_tmp;

        rcu_read_lock();
        ima_rules_tmp = rcu_dereference(ima_rules);
        list_for_each_entry_rcu(entry, ima_rules_tmp, list) {
                /*
                 * SETXATTR_CHECK rules do not implement a full policy check
                 * because rule checking would probably have an important
                 * performance impact on setxattr(). As a consequence, only one
                 * SETXATTR_CHECK can be active at a given time.
                 * Because we want to preserve that property, we set out to use
                 * atomic_cmpxchg. Either:
                 * - the atomic was non-zero: a setxattr hash policy is
                 *   already enforced, we do nothing
                 * - the atomic was zero: no setxattr policy was set, enable
                 *   the setxattr hash policy
                 */
                if (entry->func == SETXATTR_CHECK) {
                        atomic_cmpxchg(&ima_setxattr_allowed_hash_algorithms,
                                       0, entry->allowed_algos);
                        /* SETXATTR_CHECK doesn't impact ima_policy_flag */
                        continue;
                }

                if (entry->action & IMA_DO_MASK)
                        new_policy_flag |= entry->action;
        }
        rcu_read_unlock();

        ima_appraise |= (build_ima_appraise | temp_ima_appraise);
        if (!ima_appraise)
                new_policy_flag &= ~IMA_APPRAISE;

        ima_policy_flag = new_policy_flag;
}

/*
 * Translate a hook identifier into the IMA_APPRAISE_* flag associated with
 * it.  Hooks without such a flag yield 0.
 */
static int ima_appraise_flag(enum ima_hooks func)
{
        switch (func) {
        case MODULE_CHECK:
                return IMA_APPRAISE_MODULES;
        case FIRMWARE_CHECK:
                return IMA_APPRAISE_FIRMWARE;
        case POLICY_CHECK:
                return IMA_APPRAISE_POLICY;
        case KEXEC_KERNEL_CHECK:
                return IMA_APPRAISE_KEXEC;
        default:
                return 0;
        }
}

/*
 * Append @count rules from the @entries array to the policy list(s)
 * selected by @policy_rule.
 *
 * For IMA_DEFAULT_POLICY the array element itself is linked into
 * ima_default_rules; for IMA_CUSTOM_POLICY a kmemdup()'ed copy is linked
 * into ima_policy_rules.  For APPRAISE rules the matching appraise flag is
 * recorded in build_ima_appraise (when @entries is build_appraise_rules) or
 * in temp_ima_appraise (otherwise).
 */
static void add_rules(struct ima_rule_entry *entries, int count,
                      enum policy_rule_list policy_rule)
{
        struct ima_rule_entry *rule, *copy;
        int idx;

        for (idx = 0; idx < count; idx++) {
                rule = &entries[idx];

                if (policy_rule & IMA_DEFAULT_POLICY)
                        list_add_tail(&rule->list, &ima_default_rules);

                if (policy_rule & IMA_CUSTOM_POLICY) {
                        copy = kmemdup(rule, sizeof(*copy), GFP_KERNEL);
                        /* On allocation failure move on to the next rule. */
                        if (!copy)
                                continue;

                        list_add_tail(&copy->list, &ima_policy_rules);
                }

                if (rule->action != APPRAISE)
                        continue;

                if (entries == build_appraise_rules)
                        build_ima_appraise |= ima_appraise_flag(rule->func);
                else
                        temp_ima_appraise |= ima_appraise_flag(rule->func);
        }
}

static int ima_parse_rule(char *rule, struct ima_rule_entry *entry);

/*
 * Convert the architecture specific policy, supplied by
 * arch_get_ima_policy() as a NULL-terminated array of rule strings, into
 * the arch_policy_entry array of struct ima_rule_entry.
 *
 * Rules that do not fit into the local parse buffer or that fail to parse
 * are skipped with a warning.
 *
 * Return: the number of rules successfully converted; 0 if there is no
 * architecture policy or the array could not be allocated.
 */
static int __init ima_init_arch_policy(void)
{
        const char * const *arch_rules;
        const char * const *rules;
        int arch_entries = 0;
        int i = 0;

        arch_rules = arch_get_ima_policy();
        if (!arch_rules)
                return arch_entries;

        /* Get number of rules */
        for (rules = arch_rules; *rules != NULL; rules++)
                arch_entries++;

        arch_policy_entry = kcalloc(arch_entries + 1,
                                    sizeof(*arch_policy_entry), GFP_KERNEL);
        if (!arch_policy_entry)
                return 0;

        /* Convert each policy rule string to struct ima_rule_entry format */
        for (rules = arch_rules, i = 0; *rules != NULL; rules++) {
                char rule[255];
                int result;

                /*
                 * ima_parse_rule() needs a writable copy.  Never parse a
                 * truncated copy: it could be accepted as a different rule
                 * than the one the architecture asked for.
                 */
                result = strscpy(rule, *rules, sizeof(rule));
                if (result < 0) {
                        pr_warn("Skipping overlong architecture policy rule: %s\n",
                                *rules);
                        continue;
                }

                INIT_LIST_HEAD(&arch_policy_entry[i].list);
                result = ima_parse_rule(rule, &arch_policy_entry[i]);
                if (result) {
                        pr_warn("Skipping unknown architecture policy rule: %s\n",
                                rule);
                        /* Reset the slot so the next rule can reuse it. */
                        memset(&arch_policy_entry[i], 0,
                               sizeof(*arch_policy_entry));
                        continue;
                }
                i++;
        }
        return i;
}

/**
 * ima_init_policy - initialize the default measure rules.
 *
 * ima_rules points to either the ima_default_rules or the new ima_policy_rules.
 */
void __init ima_init_policy(void)
{
        int build_appraise_entries, arch_entries;

        /* Without ima_policy set, no default rules are loaded at all. */
        if (ima_policy)
                add_rules(dont_measure_rules, ARRAY_SIZE(dont_measure_rules),
                          IMA_DEFAULT_POLICY);

        if (ima_policy == ORIGINAL_TCB)
                add_rules(original_measurement_rules,
                          ARRAY_SIZE(original_measurement_rules),
                          IMA_DEFAULT_POLICY);
        else if (ima_policy == DEFAULT_TCB)
                add_rules(default_measurement_rules,
                          ARRAY_SIZE(default_measurement_rules),
                          IMA_DEFAULT_POLICY);

        /*
         * Architecture specific measurement and appraise rules, derived
         * from the runtime secure boot flags, require file signatures for
         * both the initial and the custom policy.  They are inserted ahead
         * of all other appraise rules (highest priority).
         */
        arch_entries = ima_init_arch_policy();
        if (arch_entries)
                add_rules(arch_policy_entry, arch_entries,
                          IMA_DEFAULT_POLICY | IMA_CUSTOM_POLICY);
        else
                pr_info("No architecture policies found\n");

        /*
         * The builtin "secure_boot" rules, which require file signatures,
         * also go in front of the remaining appraise rules.
         */
        if (ima_use_secure_boot)
                add_rules(secure_boot_rules, ARRAY_SIZE(secure_boot_rules),
                          IMA_DEFAULT_POLICY);

        /*
         * Build time appraise rules require file signatures for both the
         * initial and the custom policy and precede other appraise rules.
         * The secure boot rules already contain all build time rules, so
         * with secure boot rules in use the build time rules are only added
         * to the custom policy list, never to both.
         */
        build_appraise_entries = ARRAY_SIZE(build_appraise_rules);
        if (build_appraise_entries)
                add_rules(build_appraise_rules, build_appraise_entries,
                          ima_use_secure_boot ?
                          IMA_CUSTOM_POLICY :
                          (IMA_DEFAULT_POLICY | IMA_CUSTOM_POLICY));

        if (ima_use_appraise_tcb)
                add_rules(default_appraise_rules,
                          ARRAY_SIZE(default_appraise_rules),
                          IMA_DEFAULT_POLICY);

        if (ima_use_critical_data)
                add_rules(critical_data_rules,
                          ARRAY_SIZE(critical_data_rules),
                          IMA_DEFAULT_POLICY);

        /* Start with no setxattr hash algorithm restriction in place. */
        atomic_set(&ima_setxattr_allowed_hash_algorithms, 0);

        ima_update_policy_flags();
}

/* Make sure we have a valid policy, at least containing some rules. */
int ima_check_policy(void)
{
        /* An empty ima_temp_rules list means there is nothing to load. */
        return list_empty(&ima_temp_rules) ? -EINVAL : 0;
}

/**
 * ima_update_policy - update default_rules with new measure rules
 *
 * Called on file .release to update the default rules with a complete new
 * policy.  What we do here is to splice ima_policy_rules and ima_temp_rules so
 * they make a queue.  The policy may be updated multiple times and this is the
 * RCU updater.
 *
 * Policy rules are never deleted so ima_policy_flag gets zeroed only once when
 * we switch from the default policy to user defined.
 */
void ima_update_policy(void)
{
        struct list_head *policy = &ima_policy_rules;

        /* Append the newly parsed rules to the custom policy list. */
        list_splice_tail_init_rcu(&ima_temp_rules, policy, synchronize_rcu);

        /* First custom policy load: switch over from the default rules. */
        if (ima_rules != (struct list_head __rcu *)policy) {
                ima_policy_flag = 0;

                rcu_assign_pointer(ima_rules, policy);
                /*
                 * IMA architecture specific policy rules are specified
                 * as strings and converted to an array of ima_entry_rules
                 * on boot.  After loading a custom policy, free the
                 * architecture specific rules stored as an array.
                 *
                 * The array elements are linked into ima_default_rules,
                 * which RCU readers that fetched the old ima_rules pointer
                 * may still be walking.  Wait for those readers to finish
                 * before releasing the memory.
                 */
                synchronize_rcu();
                kfree(arch_policy_entry);
        }
        ima_update_policy_flags();

        /* Custom IMA policy has been loaded */
        ima_process_queued_keys();
}

/*
 * Identifiers for the keywords accepted in a policy rule.
 *
 * Keep the enumeration in sync with the policy_tokens!
 * Opt_err is the last enumerator and is also passed by ima_log_string()
 * as the "no comparison operator" value.
 */
enum policy_opt {
        /* rule actions */
        Opt_measure, Opt_dont_measure,
        Opt_appraise, Opt_dont_appraise,
        Opt_audit, Opt_hash, Opt_dont_hash,
        /* LSM object/subject conditions */
        Opt_obj_user, Opt_obj_role, Opt_obj_type,
        Opt_subj_user, Opt_subj_role, Opt_subj_type,
        /* hook, access mask and filesystem conditions */
        Opt_func, Opt_mask, Opt_fsmagic, Opt_fsname, Opt_fsuuid,
        /* id conditions using '=' */
        Opt_uid_eq, Opt_euid_eq, Opt_gid_eq, Opt_egid_eq,
        Opt_fowner_eq, Opt_fgroup_eq,
        /* id conditions using '>' */
        Opt_uid_gt, Opt_euid_gt, Opt_gid_gt, Opt_egid_gt,
        Opt_fowner_gt, Opt_fgroup_gt,
        /* id conditions using '<' */
        Opt_uid_lt, Opt_euid_lt, Opt_gid_lt, Opt_egid_lt,
        Opt_fowner_lt, Opt_fgroup_lt,
        /* remaining rule options */
        Opt_digest_type,
        Opt_appraise_type, Opt_appraise_flag, Opt_appraise_algos,
        Opt_permit_directio, Opt_pcr, Opt_template, Opt_keyrings,
        Opt_label, Opt_err
};

/*
 * Table pairing each policy_opt value with the textual pattern of its
 * keyword; ima_parse_rule() hands it to match_token() for every word of a
 * rule.  Entries with a "%s" take an argument (presumably captured by
 * match_token() into its args array - see lib/parser.c to confirm).
 * The {Opt_err, NULL} entry terminates the table.
 */
static const match_table_t policy_tokens = {
        {Opt_measure, "measure"},
        {Opt_dont_measure, "dont_measure"},
        {Opt_appraise, "appraise"},
        {Opt_dont_appraise, "dont_appraise"},
        {Opt_audit, "audit"},
        {Opt_hash, "hash"},
        {Opt_dont_hash, "dont_hash"},
        {Opt_obj_user, "obj_user=%s"},
        {Opt_obj_role, "obj_role=%s"},
        {Opt_obj_type, "obj_type=%s"},
        {Opt_subj_user, "subj_user=%s"},
        {Opt_subj_role, "subj_role=%s"},
        {Opt_subj_type, "subj_type=%s"},
        {Opt_func, "func=%s"},
        {Opt_mask, "mask=%s"},
        {Opt_fsmagic, "fsmagic=%s"},
        {Opt_fsname, "fsname=%s"},
        {Opt_fsuuid, "fsuuid=%s"},
        {Opt_uid_eq, "uid=%s"},
        {Opt_euid_eq, "euid=%s"},
        {Opt_gid_eq, "gid=%s"},
        {Opt_egid_eq, "egid=%s"},
        {Opt_fowner_eq, "fowner=%s"},
        {Opt_fgroup_eq, "fgroup=%s"},
        {Opt_uid_gt, "uid>%s"},
        {Opt_euid_gt, "euid>%s"},
        {Opt_gid_gt, "gid>%s"},
        {Opt_egid_gt, "egid>%s"},
        {Opt_fowner_gt, "fowner>%s"},
        {Opt_fgroup_gt, "fgroup>%s"},
        {Opt_uid_lt, "uid<%s"},
        {Opt_euid_lt, "euid<%s"},
        {Opt_gid_lt, "gid<%s"},
        {Opt_egid_lt, "egid<%s"},
        {Opt_fowner_lt, "fowner<%s"},
        {Opt_fgroup_lt, "fgroup<%s"},
        {Opt_digest_type, "digest_type=%s"},
        {Opt_appraise_type, "appraise_type=%s"},
        {Opt_appraise_flag, "appraise_flag=%s"},
        {Opt_appraise_algos, "appraise_algos=%s"},
        {Opt_permit_directio, "permit_directio"},
        {Opt_pcr, "pcr=%s"},
        {Opt_template, "template=%s"},
        {Opt_keyrings, "keyrings=%s"},
        {Opt_label, "label=%s"},
        {Opt_err, NULL}
};

/*
 * Fill LSM condition slot @lsm_rule of @entry from the matched argument
 * @args, using @audit_type as the LSM field type.
 *
 * Return: -EINVAL if the slot is already populated, -ENOMEM if the argument
 * cannot be duplicated, otherwise the result of ima_filter_rule_init() when
 * it produced a rule.  If no rule was produced, the outcome depends on the
 * active policy: -EINVAL while ima_rules still points at ima_default_rules,
 * 0 otherwise (the argument string is then kept in the entry).
 */
static int ima_lsm_rule_init(struct ima_rule_entry *entry,
                             substring_t *args, int lsm_rule, int audit_type)
{
        int rc;

        /* A given LSM condition may appear only once per rule. */
        if (entry->lsm[lsm_rule].rule)
                return -EINVAL;

        entry->lsm[lsm_rule].args_p = match_strdup(args);
        if (!entry->lsm[lsm_rule].args_p)
                return -ENOMEM;

        entry->lsm[lsm_rule].type = audit_type;
        rc = ima_filter_rule_init(entry->lsm[lsm_rule].type, Audit_equal,
                                  entry->lsm[lsm_rule].args_p,
                                  &entry->lsm[lsm_rule].rule,
                                  GFP_KERNEL);
        if (entry->lsm[lsm_rule].rule)
                return rc;

        pr_warn("rule for LSM \'%s\' is undefined\n",
                entry->lsm[lsm_rule].args_p);

        /* With a custom policy already active the undefined rule is kept. */
        if (ima_rules != (struct list_head __rcu *)(&ima_default_rules))
                return 0;

        kfree(entry->lsm[lsm_rule].args_p);
        entry->lsm[lsm_rule].args_p = NULL;
        return -EINVAL;
}

/*
 * Append "<key><op><value> " to the audit record @ab, where <op> is '<',
 * '>' or '=' depending on @rule_operator.  Does nothing when @ab is NULL.
 */
static void ima_log_string_op(struct audit_buffer *ab, char *key, char *value,
                              enum policy_opt rule_operator)
{
        if (!ab)
                return;

        switch (rule_operator) {
        /* "less than" comparisons */
        case Opt_uid_lt:
        case Opt_euid_lt:
        case Opt_gid_lt:
        case Opt_egid_lt:
        case Opt_fowner_lt:
        case Opt_fgroup_lt:
                audit_log_format(ab, "%s<", key);
                break;
        /* "greater than" comparisons */
        case Opt_uid_gt:
        case Opt_euid_gt:
        case Opt_gid_gt:
        case Opt_egid_gt:
        case Opt_fowner_gt:
        case Opt_fgroup_gt:
                audit_log_format(ab, "%s>", key);
                break;
        /* every other option is logged as an assignment */
        default:
                audit_log_format(ab, "%s=", key);
                break;
        }

        audit_log_format(ab, "%s ", value);
}
/*
 * Log a plain "key=value " pair: Opt_err is not one of the '<' / '>'
 * operators, so ima_log_string_op() falls through to its '=' default.
 */
static void ima_log_string(struct audit_buffer *ab, char *key, char *value)
{
        ima_log_string_op(ab, key, value, Opt_err);
}

/*
 * Validating the appended signature included in the measurement list requires
 * the file hash calculated without the appended signature (i.e., the 'd-modsig'
 * field). Therefore, notify the user if they have the 'modsig' field but not
 * the 'd-modsig' field in the template.
 */
static void check_template_modsig(const struct ima_template_desc *template)
{
        /* The user only needs to be told once. */
        static bool checked;
        bool modsig_seen = false;
        bool dmodsig_seen = false;
        int idx;

        if (checked)
                return;

        for (idx = 0; idx < template->num_fields; idx++) {
                const char *id = template->fields[idx]->field_id;

                if (strcmp(id, "modsig") == 0)
                        modsig_seen = true;
                else if (strcmp(id, "d-modsig") == 0)
                        dmodsig_seen = true;
        }

        if (modsig_seen && !dmodsig_seen)
                pr_notice("template with 'modsig' field also needs 'd-modsig' field\n");

        checked = true;
}

/*
 * Warn if the template does not contain the given field.
 */
static void check_template_field(const struct ima_template_desc *template,
                                 const char *field, const char *msg)
{
        int idx = 0;

        /* Nothing to report as soon as the field is found. */
        while (idx < template->num_fields) {
                if (strcmp(template->fields[idx]->field_id, field) == 0)
                        return;
                idx++;
        }

        /* Field is absent: emit @msg (printed at most once). */
        pr_notice_once("%s", msg);
}

/*
 * Check that a parsed rule is internally consistent.
 *
 * Verifies, in order: an action is set; action-specific flags are only used
 * with their action; IMA_FUNC and entry->func agree; only flags (and, for
 * some hooks, actions) permitted for the hook are present; and dependent
 * flags come with the flags they require.
 *
 * Return: true if @entry passes every check, false otherwise.
 */
static bool ima_validate_rule(struct ima_rule_entry *entry)
{
        /* Ensure that the action is set and is compatible with the flags */
        if (entry->action == UNKNOWN)
                return false;

        /* IMA_PCR is only accepted on MEASURE rules */
        if (entry->action != MEASURE && entry->flags & IMA_PCR)
                return false;

        /* These flags are only accepted on APPRAISE rules */
        if (entry->action != APPRAISE &&
            entry->flags & (IMA_DIGSIG_REQUIRED | IMA_MODSIG_ALLOWED |
                            IMA_CHECK_BLACKLIST | IMA_VALIDATE_ALGOS))
                return false;

        /*
         * The IMA_FUNC bit must be set if and only if there's a valid hook
         * function specified, and vice versa. Enforcing this property allows
         * for the NONE case below to validate a rule without an explicit hook
         * function.
         */
        if (((entry->flags & IMA_FUNC) && entry->func == NONE) ||
            (!(entry->flags & IMA_FUNC) && entry->func != NONE))
                return false;

        /*
         * Ensure that the hook function is compatible with the other
         * components of the rule
         */
        switch (entry->func) {
        case NONE:
        case FILE_CHECK:
        case MMAP_CHECK:
        case MMAP_CHECK_REQPROT:
        case BPRM_CHECK:
        case CREDS_CHECK:
        case POST_SETATTR:
        case FIRMWARE_CHECK:
        case POLICY_CHECK:
                /* Reject any flag outside this group's allowed set */
                if (entry->flags & ~(IMA_FUNC | IMA_MASK | IMA_FSMAGIC |
                                     IMA_UID | IMA_FOWNER | IMA_FSUUID |
                                     IMA_INMASK | IMA_EUID | IMA_PCR |
                                     IMA_FSNAME | IMA_GID | IMA_EGID |
                                     IMA_FGROUP | IMA_DIGSIG_REQUIRED |
                                     IMA_PERMIT_DIRECTIO | IMA_VALIDATE_ALGOS |
                                     IMA_CHECK_BLACKLIST | IMA_VERITY_REQUIRED))
                        return false;

                break;
        case MODULE_CHECK:
        case KEXEC_KERNEL_CHECK:
        case KEXEC_INITRAMFS_CHECK:
                /*
                 * Compared with the group above, this set additionally
                 * allows IMA_MODSIG_ALLOWED and does not allow
                 * IMA_VERITY_REQUIRED.
                 */
                if (entry->flags & ~(IMA_FUNC | IMA_MASK | IMA_FSMAGIC |
                                     IMA_UID | IMA_FOWNER | IMA_FSUUID |
                                     IMA_INMASK | IMA_EUID | IMA_PCR |
                                     IMA_FSNAME | IMA_GID | IMA_EGID |
                                     IMA_FGROUP | IMA_DIGSIG_REQUIRED |
                                     IMA_PERMIT_DIRECTIO | IMA_MODSIG_ALLOWED |
                                     IMA_CHECK_BLACKLIST | IMA_VALIDATE_ALGOS))
                        return false;

                break;
        case KEXEC_CMDLINE:
                /* Only measure/dont_measure actions are accepted */
                if (entry->action & ~(MEASURE | DONT_MEASURE))
                        return false;

                if (entry->flags & ~(IMA_FUNC | IMA_FSMAGIC | IMA_UID |
                                     IMA_FOWNER | IMA_FSUUID | IMA_EUID |
                                     IMA_PCR | IMA_FSNAME | IMA_GID | IMA_EGID |
                                     IMA_FGROUP))
                        return false;

                break;
        case KEY_CHECK:
                /* Only measure/dont_measure actions are accepted */
                if (entry->action & ~(MEASURE | DONT_MEASURE))
                        return false;

                if (entry->flags & ~(IMA_FUNC | IMA_UID | IMA_GID | IMA_PCR |
                                     IMA_KEYRINGS))
                        return false;

                /* LSM conditions are not accepted for this hook */
                if (ima_rule_contains_lsm_cond(entry))
                        return false;

                break;
        case CRITICAL_DATA:
                /* Only measure/dont_measure actions are accepted */
                if (entry->action & ~(MEASURE | DONT_MEASURE))
                        return false;

                if (entry->flags & ~(IMA_FUNC | IMA_UID | IMA_GID | IMA_PCR |
                                     IMA_LABEL))
                        return false;

                /* LSM conditions are not accepted for this hook */
                if (ima_rule_contains_lsm_cond(entry))
                        return false;

                break;
        case SETXATTR_CHECK:
                /* any action other than APPRAISE is unsupported */
                if (entry->action != APPRAISE)
                        return false;

                /* SETXATTR_CHECK requires an appraise_algos parameter */
                if (!(entry->flags & IMA_VALIDATE_ALGOS))
                        return false;

                /*
                 * full policies are not supported, they would have too
                 * much of a performance impact
                 */
                if (entry->flags & ~(IMA_FUNC | IMA_VALIDATE_ALGOS))
                        return false;

                break;
        default:
                /* Hooks not listed above are rejected */
                return false;
        }

        /* Ensure that combinations of flags are compatible with each other */
        if (entry->flags & IMA_CHECK_BLACKLIST &&
            !(entry->flags & IMA_DIGSIG_REQUIRED))
                return false;

        /*
         * Unlike for regular IMA 'appraise' policy rules where security.ima
         * xattr may contain either a file hash or signature, the security.ima
         * xattr for fsverity must contain a file signature (sigv3).  Ensure
         * that 'appraise' rules for fsverity require file signatures by
         * checking the IMA_DIGSIG_REQUIRED flag is set.
         */
        if (entry->action == APPRAISE &&
            (entry->flags & IMA_VERITY_REQUIRED) &&
            !(entry->flags & IMA_DIGSIG_REQUIRED))
                return false;

        return true;
}

/*
 * Parse a comma separated list of hash algorithm names into a bitmask.
 *
 * @arg is modified in place by strsep().  Bit N of the result is set when
 * hash_algo_name[N] appears in the list.
 *
 * Return: the bitmask of allowed algorithms, or 0 if any name is unknown or
 * refers to an algorithm crypto_has_alg() reports as unavailable.
 */
static unsigned int ima_parse_appraise_algos(char *arg)
{
        unsigned int res = 0;
        int idx;
        char *token;

        while ((token = strsep(&arg, ",")) != NULL) {
                idx = match_string(hash_algo_name, HASH_ALGO__LAST, token);

                if (idx < 0) {
                        pr_err("unknown hash algorithm \"%s\"\n",
                               token);
                        return 0;
                }

                if (!crypto_has_alg(hash_algo_name[idx], 0, 0)) {
                        pr_err("unavailable hash algorithm \"%s\", check your kernel configuration\n",
                               token);
                        return 0;
                }

                /* Add the hash algorithm to the 'allowed' bitfield */
                res |= (1U << idx);
        }

        return res;
}

/*
 * ima_parse_rule - parse one textual policy rule into @entry
 * @rule: rule text made of space/tab separated tokens; modified in place
 *        by strsep()
 * @entry: rule entry to fill in
 *
 * The id fields and comparison operators of @entry are initialised here.
 * Other fields (func, mask, fsmagic, fsname, keyrings, label, ...) are only
 * tested for "already set" before being written, so @entry is expected to
 * arrive zeroed (ima_parse_add_rule() allocates it with kzalloc()).
 *
 * Every recognised token is written to the audit record as it is parsed.
 * Parsing stops at the first token that yields a negative result.  A rule
 * that parsed cleanly must additionally pass ima_validate_rule().
 *
 * On failure @entry may be left partially filled; releasing it is up to the
 * caller.
 *
 * Returns 0 on success, a negative errno otherwise.
 */
static int ima_parse_rule(char *rule, struct ima_rule_entry *entry)
{
        struct audit_buffer *ab;
        char *from;
        char *p;
        bool eid_token; /* either euid or egid */
        struct ima_template_desc *template_desc;
        int result = 0;

        ab = integrity_audit_log_start(audit_context(), GFP_KERNEL,
                                       AUDIT_INTEGRITY_POLICY_RULE);

        /* "invalid" ids double as the "not specified yet" marker below */
        entry->uid = INVALID_UID;
        entry->gid = INVALID_GID;
        entry->fowner = INVALID_UID;
        entry->fgroup = INVALID_GID;
        /* equality is the default comparison; '<' / '>' tokens override it */
        entry->uid_op = &uid_eq;
        entry->gid_op = &gid_eq;
        entry->fowner_op = &vfsuid_eq_kuid;
        entry->fgroup_op = &vfsgid_eq_kgid;
        entry->action = UNKNOWN;
        while ((p = strsep(&rule, " \t")) != NULL) {
                substring_t args[MAX_OPT_ARGS];
                int token;
                unsigned long lnum;

                /* the previous token failed: stop parsing */
                if (result < 0)
                        break;
                /* consecutive separators produce empty tokens */
                if ((*p == '\0') || (*p == ' ') || (*p == '\t'))
                        continue;
                token = match_token(p, policy_tokens, args);
                switch (token) {
                /* Actions: at most one action keyword per rule */
                case Opt_measure:
                        ima_log_string(ab, "action", "measure");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = MEASURE;
                        break;
                case Opt_dont_measure:
                        ima_log_string(ab, "action", "dont_measure");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = DONT_MEASURE;
                        break;
                case Opt_appraise:
                        ima_log_string(ab, "action", "appraise");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = APPRAISE;
                        break;
                case Opt_dont_appraise:
                        ima_log_string(ab, "action", "dont_appraise");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = DONT_APPRAISE;
                        break;
                case Opt_audit:
                        ima_log_string(ab, "action", "audit");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = AUDIT;
                        break;
                case Opt_hash:
                        ima_log_string(ab, "action", "hash");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = HASH;
                        break;
                case Opt_dont_hash:
                        ima_log_string(ab, "action", "dont_hash");

                        if (entry->action != UNKNOWN)
                                result = -EINVAL;

                        entry->action = DONT_HASH;
                        break;
                case Opt_func:
                        ima_log_string(ab, "func", args[0].from);

                        /* func= may only be given once */
                        if (entry->func)
                                result = -EINVAL;

                        if (strcmp(args[0].from, "FILE_CHECK") == 0)
                                entry->func = FILE_CHECK;
                        /* PATH_CHECK is for backwards compat */
                        else if (strcmp(args[0].from, "PATH_CHECK") == 0)
                                entry->func = FILE_CHECK;
                        else if (strcmp(args[0].from, "MODULE_CHECK") == 0)
                                entry->func = MODULE_CHECK;
                        else if (strcmp(args[0].from, "FIRMWARE_CHECK") == 0)
                                entry->func = FIRMWARE_CHECK;
                        else if ((strcmp(args[0].from, "FILE_MMAP") == 0)
                                || (strcmp(args[0].from, "MMAP_CHECK") == 0))
                                entry->func = MMAP_CHECK;
                        else if ((strcmp(args[0].from, "MMAP_CHECK_REQPROT") == 0))
                                entry->func = MMAP_CHECK_REQPROT;
                        else if (strcmp(args[0].from, "BPRM_CHECK") == 0)
                                entry->func = BPRM_CHECK;
                        else if (strcmp(args[0].from, "CREDS_CHECK") == 0)
                                entry->func = CREDS_CHECK;
                        else if (strcmp(args[0].from, "KEXEC_KERNEL_CHECK") ==
                                 0)
                                entry->func = KEXEC_KERNEL_CHECK;
                        else if (strcmp(args[0].from, "KEXEC_INITRAMFS_CHECK")
                                 == 0)
                                entry->func = KEXEC_INITRAMFS_CHECK;
                        else if (strcmp(args[0].from, "POLICY_CHECK") == 0)
                                entry->func = POLICY_CHECK;
                        else if (strcmp(args[0].from, "KEXEC_CMDLINE") == 0)
                                entry->func = KEXEC_CMDLINE;
                        else if (IS_ENABLED(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) &&
                                 strcmp(args[0].from, "KEY_CHECK") == 0)
                                entry->func = KEY_CHECK;
                        else if (strcmp(args[0].from, "CRITICAL_DATA") == 0)
                                entry->func = CRITICAL_DATA;
                        else if (strcmp(args[0].from, "SETXATTR_CHECK") == 0)
                                entry->func = SETXATTR_CHECK;
                        else
                                result = -EINVAL;
                        if (!result)
                                entry->flags |= IMA_FUNC;
                        break;
                case Opt_mask:
                        ima_log_string(ab, "mask", args[0].from);

                        /* mask= may only be given once */
                        if (entry->mask)
                                result = -EINVAL;

                        /* a leading '^' selects IMA_INMASK instead of IMA_MASK */
                        from = args[0].from;
                        if (*from == '^')
                                from++;

                        if ((strcmp(from, "MAY_EXEC")) == 0)
                                entry->mask = MAY_EXEC;
                        else if (strcmp(from, "MAY_WRITE") == 0)
                                entry->mask = MAY_WRITE;
                        else if (strcmp(from, "MAY_READ") == 0)
                                entry->mask = MAY_READ;
                        else if (strcmp(from, "MAY_APPEND") == 0)
                                entry->mask = MAY_APPEND;
                        else
                                result = -EINVAL;
                        if (!result)
                                entry->flags |= (*args[0].from == '^')
                                     ? IMA_INMASK : IMA_MASK;
                        break;
                case Opt_fsmagic:
                        ima_log_string(ab, "fsmagic", args[0].from);

                        if (entry->fsmagic) {
                                result = -EINVAL;
                                break;
                        }

                        /* the magic number is given in hexadecimal */
                        result = kstrtoul(args[0].from, 16, &entry->fsmagic);
                        if (!result)
                                entry->flags |= IMA_FSMAGIC;
                        break;
                case Opt_fsname:
                        ima_log_string(ab, "fsname", args[0].from);

                        /*
                         * Reject a repeated fsname= like the other
                         * single-valued options do; overwriting the
                         * pointer would leak the earlier copy.
                         */
                        if (entry->fsname) {
                                result = -EINVAL;
                                break;
                        }

                        entry->fsname = kstrdup(args[0].from, GFP_KERNEL);
                        if (!entry->fsname) {
                                result = -ENOMEM;
                                break;
                        }
                        result = 0;
                        entry->flags |= IMA_FSNAME;
                        break;
                case Opt_keyrings:
                        ima_log_string(ab, "keyrings", args[0].from);

                        if (!IS_ENABLED(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) ||
                            entry->keyrings) {
                                result = -EINVAL;
                                break;
                        }

                        entry->keyrings = ima_alloc_rule_opt_list(args);
                        if (IS_ERR(entry->keyrings)) {
                                result = PTR_ERR(entry->keyrings);
                                /* do not leave an ERR_PTR in the entry */
                                entry->keyrings = NULL;
                                break;
                        }

                        entry->flags |= IMA_KEYRINGS;
                        break;
                case Opt_label:
                        ima_log_string(ab, "label", args[0].from);

                        if (entry->label) {
                                result = -EINVAL;
                                break;
                        }

                        entry->label = ima_alloc_rule_opt_list(args);
                        if (IS_ERR(entry->label)) {
                                result = PTR_ERR(entry->label);
                                /* do not leave an ERR_PTR in the entry */
                                entry->label = NULL;
                                break;
                        }

                        entry->flags |= IMA_LABEL;
                        break;
                case Opt_fsuuid:
                        ima_log_string(ab, "fsuuid", args[0].from);

                        if (!uuid_is_null(&entry->fsuuid)) {
                                result = -EINVAL;
                                break;
                        }

                        result = uuid_parse(args[0].from, &entry->fsuuid);
                        if (!result)
                                entry->flags |= IMA_FSUUID;
                        break;
                /*
                 * uid/euid: the '>' and '<' variants select the comparison
                 * operator and then share the value parsing of the '='
                 * variant through the fallthroughs.
                 */
                case Opt_uid_gt:
                case Opt_euid_gt:
                        entry->uid_op = &uid_gt;
                        fallthrough;
                case Opt_uid_lt:
                case Opt_euid_lt:
                        if ((token == Opt_uid_lt) || (token == Opt_euid_lt))
                                entry->uid_op = &uid_lt;
                        fallthrough;
                case Opt_uid_eq:
                case Opt_euid_eq:
                        eid_token = (token == Opt_euid_eq) ||
                                    (token == Opt_euid_gt) ||
                                    (token == Opt_euid_lt);

                        ima_log_string_op(ab, eid_token ? "euid" : "uid",
                                          args[0].from, token);

                        /* uid and euid share entry->uid: only one per rule */
                        if (uid_valid(entry->uid)) {
                                result = -EINVAL;
                                break;
                        }

                        result = kstrtoul(args[0].from, 10, &lnum);
                        if (!result) {
                                entry->uid = make_kuid(current_user_ns(),
                                                       (uid_t) lnum);
                                /* also reject values truncated by the cast */
                                if (!uid_valid(entry->uid) ||
                                    (uid_t)lnum != lnum)
                                        result = -EINVAL;
                                else
                                        entry->flags |= eid_token
                                            ? IMA_EUID : IMA_UID;
                        }
                        break;
                /* gid/egid: same scheme as uid/euid above */
                case Opt_gid_gt:
                case Opt_egid_gt:
                        entry->gid_op = &gid_gt;
                        fallthrough;
                case Opt_gid_lt:
                case Opt_egid_lt:
                        if ((token == Opt_gid_lt) || (token == Opt_egid_lt))
                                entry->gid_op = &gid_lt;
                        fallthrough;
                case Opt_gid_eq:
                case Opt_egid_eq:
                        eid_token = (token == Opt_egid_eq) ||
                                    (token == Opt_egid_gt) ||
                                    (token == Opt_egid_lt);

                        ima_log_string_op(ab, eid_token ? "egid" : "gid",
                                          args[0].from, token);

                        if (gid_valid(entry->gid)) {
                                result = -EINVAL;
                                break;
                        }

                        result = kstrtoul(args[0].from, 10, &lnum);
                        if (!result) {
                                entry->gid = make_kgid(current_user_ns(),
                                                       (gid_t)lnum);
                                if (!gid_valid(entry->gid) ||
                                    (((gid_t)lnum) != lnum))
                                        result = -EINVAL;
                                else
                                        entry->flags |= eid_token
                                            ? IMA_EGID : IMA_GID;
                        }
                        break;
                case Opt_fowner_gt:
                        entry->fowner_op = &vfsuid_gt_kuid;
                        fallthrough;
                case Opt_fowner_lt:
                        if (token == Opt_fowner_lt)
                                entry->fowner_op = &vfsuid_lt_kuid;
                        fallthrough;
                case Opt_fowner_eq:
                        ima_log_string_op(ab, "fowner", args[0].from, token);

                        if (uid_valid(entry->fowner)) {
                                result = -EINVAL;
                                break;
                        }

                        result = kstrtoul(args[0].from, 10, &lnum);
                        if (!result) {
                                entry->fowner = make_kuid(current_user_ns(),
                                                          (uid_t)lnum);
                                if (!uid_valid(entry->fowner) ||
                                    (((uid_t)lnum) != lnum))
                                        result = -EINVAL;
                                else
                                        entry->flags |= IMA_FOWNER;
                        }
                        break;
                case Opt_fgroup_gt:
                        entry->fgroup_op = &vfsgid_gt_kgid;
                        fallthrough;
                case Opt_fgroup_lt:
                        if (token == Opt_fgroup_lt)
                                entry->fgroup_op = &vfsgid_lt_kgid;
                        fallthrough;
                case Opt_fgroup_eq:
                        ima_log_string_op(ab, "fgroup", args[0].from, token);

                        if (gid_valid(entry->fgroup)) {
                                result = -EINVAL;
                                break;
                        }

                        result = kstrtoul(args[0].from, 10, &lnum);
                        if (!result) {
                                entry->fgroup = make_kgid(current_user_ns(),
                                                          (gid_t)lnum);
                                if (!gid_valid(entry->fgroup) ||
                                    (((gid_t)lnum) != lnum))
                                        result = -EINVAL;
                                else
                                        entry->flags |= IMA_FGROUP;
                        }
                        break;
                /* LSM label conditions are delegated to ima_lsm_rule_init() */
                case Opt_obj_user:
                        ima_log_string(ab, "obj_user", args[0].from);
                        result = ima_lsm_rule_init(entry, args,
                                                   LSM_OBJ_USER,
                                                   AUDIT_OBJ_USER);
                        break;
                case Opt_obj_role:
                        ima_log_string(ab, "obj_role", args[0].from);
                        result = ima_lsm_rule_init(entry, args,
                                                   LSM_OBJ_ROLE,
                                                   AUDIT_OBJ_ROLE);
                        break;
                case Opt_obj_type:
                        ima_log_string(ab, "obj_type", args[0].from);
                        result = ima_lsm_rule_init(entry, args,
                                                   LSM_OBJ_TYPE,
                                                   AUDIT_OBJ_TYPE);
                        break;
                case Opt_subj_user:
                        ima_log_string(ab, "subj_user", args[0].from);
                        result = ima_lsm_rule_init(entry, args,
                                                   LSM_SUBJ_USER,
                                                   AUDIT_SUBJ_USER);
                        break;
                case Opt_subj_role:
                        ima_log_string(ab, "subj_role", args[0].from);
                        result = ima_lsm_rule_init(entry, args,
                                                   LSM_SUBJ_ROLE,
                                                   AUDIT_SUBJ_ROLE);
                        break;
                case Opt_subj_type:
                        ima_log_string(ab, "subj_type", args[0].from);
                        result = ima_lsm_rule_init(entry, args,
                                                   LSM_SUBJ_TYPE,
                                                   AUDIT_SUBJ_TYPE);
                        break;
                case Opt_digest_type:
                        ima_log_string(ab, "digest_type", args[0].from);
                        /* digest_type= has to precede appraise_type= */
                        if (entry->flags & IMA_DIGSIG_REQUIRED)
                                result = -EINVAL;
                        else if ((strcmp(args[0].from, "verity")) == 0)
                                entry->flags |= IMA_VERITY_REQUIRED;
                        else
                                result = -EINVAL;
                        break;
                case Opt_appraise_type:
                        ima_log_string(ab, "appraise_type", args[0].from);

                        if ((strcmp(args[0].from, "imasig")) == 0) {
                                if (entry->flags & IMA_VERITY_REQUIRED)
                                        result = -EINVAL;
                                else
                                        entry->flags |= IMA_DIGSIG_REQUIRED | IMA_CHECK_BLACKLIST;
                        } else if (strcmp(args[0].from, "sigv3") == 0) {
                                /* Only fsverity supports sigv3 for now */
                                if (entry->flags & IMA_VERITY_REQUIRED)
                                        entry->flags |= IMA_DIGSIG_REQUIRED | IMA_CHECK_BLACKLIST;
                                else
                                        result = -EINVAL;
                        } else if (IS_ENABLED(CONFIG_IMA_APPRAISE_MODSIG) &&
                                 strcmp(args[0].from, "imasig|modsig") == 0) {
                                if (entry->flags & IMA_VERITY_REQUIRED)
                                        result = -EINVAL;
                                else
                                        entry->flags |= IMA_DIGSIG_REQUIRED |
                                                IMA_MODSIG_ALLOWED | IMA_CHECK_BLACKLIST;
                        } else {
                                result = -EINVAL;
                        }
                        break;
                case Opt_appraise_flag:
                        /* accepted and logged only; sets nothing in the entry */
                        ima_log_string(ab, "appraise_flag", args[0].from);
                        break;
                case Opt_appraise_algos:
                        ima_log_string(ab, "appraise_algos", args[0].from);

                        if (entry->allowed_algos) {
                                result = -EINVAL;
                                break;
                        }

                        entry->allowed_algos =
                                ima_parse_appraise_algos(args[0].from);
                        /* invalid or empty list of algorithms */
                        if (!entry->allowed_algos) {
                                result = -EINVAL;
                                break;
                        }

                        entry->flags |= IMA_VALIDATE_ALGOS;

                        break;
                case Opt_permit_directio:
                        entry->flags |= IMA_PERMIT_DIRECTIO;
                        break;
                case Opt_pcr:
                        ima_log_string(ab, "pcr", args[0].from);

                        result = kstrtoint(args[0].from, 10, &entry->pcr);
                        if (result || INVALID_PCR(entry->pcr))
                                result = -EINVAL;
                        else
                                entry->flags |= IMA_PCR;

                        break;
                case Opt_template:
                        ima_log_string(ab, "template", args[0].from);
                        /* only valid after a 'measure' action keyword */
                        if (entry->action != MEASURE) {
                                result = -EINVAL;
                                break;
                        }
                        template_desc = lookup_template_desc(args[0].from);
                        if (!template_desc || entry->template) {
                                result = -EINVAL;
                                break;
                        }

                        /*
                         * template_desc_init_fields() does nothing if
                         * the template is already initialised, so
                         * it's safe to do this unconditionally
                         */
                        template_desc_init_fields(template_desc->fmt,
                                                 &(template_desc->fields),
                                                 &(template_desc->num_fields));
                        entry->template = template_desc;
                        break;
                case Opt_err:
                        ima_log_string(ab, "UNKNOWN", p);
                        result = -EINVAL;
                        break;
                }
        }

        /*
         * Only a rule that parsed cleanly and passed validation may
         * contribute to temp_ima_appraise; a rejected rule must not.
         */
        if (!result) {
                if (!ima_validate_rule(entry))
                        result = -EINVAL;
                else if (entry->action == APPRAISE)
                        temp_ima_appraise |= ima_appraise_flag(entry->func);
        }

        if (!result && entry->flags & IMA_MODSIG_ALLOWED) {
                template_desc = entry->template ? entry->template :
                                                  ima_template_desc_current();
                check_template_modsig(template_desc);
        }

        /* d-ngv2 template field recommended for unsigned fs-verity digests */
        if (!result && entry->action == MEASURE &&
            entry->flags & IMA_VERITY_REQUIRED) {
                template_desc = entry->template ? entry->template :
                                                  ima_template_desc_current();
                check_template_field(template_desc, "d-ngv2",
                                     "verity rules should include d-ngv2");
        }

        /* res=1 records success, res=0 failure */
        audit_log_format(ab, "res=%d", !result);
        audit_log_end(ab);
        return result;
}

/**
 * ima_parse_add_rule - parse one policy line and queue it on ima_temp_rules
 * @rule: ima measurement policy rule
 *
 * Only the text up to the first newline is consumed.  Lines that are empty
 * or whose first non-blank character is '#' are skipped without creating a
 * rule entry.
 *
 * Avoid locking by allowing just one writer at a time in ima_write_policy()
 * Returns the length of the rule parsed, an error code on failure
 */
ssize_t ima_parse_add_rule(char *rule)
{
        static const char op[] = "update_policy";
        struct ima_rule_entry *new_rule;
        int audit_info = 0;
        ssize_t consumed, rc;
        char *line;

        /* isolate the first line; the count includes its terminator */
        line = strsep(&rule, "\n");
        consumed = strlen(line) + 1;
        line += strspn(line, " \t");

        /* nothing to parse on blank or comment lines */
        if (*line == '\0' || *line == '#')
                return consumed;

        new_rule = kzalloc(sizeof(*new_rule), GFP_KERNEL);
        if (!new_rule) {
                integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL,
                                    NULL, op, "-ENOMEM", -ENOMEM, audit_info);
                return -ENOMEM;
        }
        INIT_LIST_HEAD(&new_rule->list);

        rc = ima_parse_rule(line, new_rule);
        if (!rc) {
                /* valid rule: park it on the temporary list */
                list_add_tail(&new_rule->list, &ima_temp_rules);
                return consumed;
        }

        /* parse failure: drop the entry and report the error */
        ima_free_rule(new_rule);
        integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL,
                            NULL, op, "invalid-policy", rc,
                            audit_info);
        return rc;
}

/**
 * ima_delete_rules() - called to cleanup invalid in-flight policy.
 *
 * Resets temp_ima_appraise and frees every entry queued on ima_temp_rules.
 *
 * No locking is needed: the temp list is separate from the active one,
 * and there is only one user of ima_delete_rules() at a time.
 */
void ima_delete_rules(void)
{
        struct ima_rule_entry *rule;
        struct ima_rule_entry *next;

        temp_ima_appraise = 0;

        /* _safe variant: each entry is unlinked and freed while iterating */
        list_for_each_entry_safe(rule, next, &ima_temp_rules, list) {
                list_del(&rule->list);
                ima_free_rule(rule);
        }
}

/*
 * Turn one hook entry into an array element: stringify the first argument,
 * ignore the second, and append the separating comma.
 */
#define __ima_hook_stringify(func, str)        (#func),

/*
 * Hook names as strings; policy_func_show() indexes this table with an
 * enum ima_hooks value.
 * NOTE(review): __ima_hooks() is defined outside this chunk; presumably it
 * applies the given macro once per hook in enum order - confirm.
 */
const char *const func_tokens[] = {
        __ima_hooks(__ima_hook_stringify)
};

#ifdef        CONFIG_IMA_READ_POLICY
/* Indices into mask_tokens[] */
enum {
        mask_exec = 0,
        mask_write,
        mask_read,
        mask_append
};

/*
 * Mask names in their '^'-prefixed form; skipping the first character
 * yields the plain name.
 */
static const char *const mask_tokens[] = {
        [mask_exec]   = "^MAY_EXEC",
        [mask_write]  = "^MAY_WRITE",
        [mask_read]   = "^MAY_READ",
        [mask_append] = "^MAY_APPEND"
};

/*
 * ima_policy_start - find the rule at position *pos of the current rule list
 *
 * Walks the list read through rcu_dereference(ima_rules) and returns the
 * entry whose zero-based index equals *pos, or NULL when the list has fewer
 * entries.  The RCU read lock is dropped before returning.
 */
void *ima_policy_start(struct seq_file *m, loff_t *pos)
{
        struct list_head *rules;
        struct ima_rule_entry *rule;
        struct ima_rule_entry *found = NULL;
        loff_t remaining = *pos;

        rcu_read_lock();
        rules = rcu_dereference(ima_rules);
        list_for_each_entry_rcu(rule, rules, list) {
                /* count down until the requested index is reached */
                if (remaining-- == 0) {
                        found = rule;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}

/*
 * ima_policy_next - step from rule @v to its successor and advance *pos
 *
 * Returns the following entry, or NULL once the successor is one of the
 * list heads (ima_default_rules or ima_policy_rules), i.e. the end of the
 * list has been reached.
 */
void *ima_policy_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct ima_rule_entry *cur = v;
        struct ima_rule_entry *next;

        rcu_read_lock();
        next = list_entry_rcu(cur->list.next, struct ima_rule_entry, list);
        rcu_read_unlock();

        (*pos)++;

        /* wrapping back onto a list head means there is no next rule */
        if (&next->list == &ima_default_rules)
                return NULL;
        if (&next->list == &ima_policy_rules)
                return NULL;

        return next;
}

/*
 * ima_policy_stop - counterpart of ima_policy_start()/ima_policy_next()
 *
 * Intentionally empty: both of those drop the RCU read lock before they
 * return, so nothing is left to release here.
 * NOTE(review): presumably wired up as the seq_file ->stop callback -
 * confirm against the seq_operations definition.
 */
void ima_policy_stop(struct seq_file *m, void *v)
{
}

/*
 * pt(): pattern string of @token in policy_tokens[]; ima_policy_show() uses
 * it both as literal text (seq_puts) and as a seq_printf() format.
 */
#define pt(token)        policy_tokens[token].pattern
/* mt(): '^'-prefixed mask name for index @token of mask_tokens[] */
#define mt(token)        mask_tokens[token]

/*
 * policy_func_show - print the "func=" part of a policy rule
 *
 * Hook values strictly between 0 and MAX_CHECK are printed by name from
 * func_tokens[]; anything else is printed as a number.
 */
static void policy_func_show(struct seq_file *m, enum ima_hooks func)
{
        /* outside the named range: fall back to the numeric value */
        if (func <= 0 || func >= MAX_CHECK) {
                seq_printf(m, "func=%d ", func);
                return;
        }

        seq_printf(m, "func=%s ", func_tokens[func]);
}

/*
 * ima_show_rule_opt_list - print the items of @opt_list separated by '|'
 */
static void ima_show_rule_opt_list(struct seq_file *m,
                                   const struct ima_rule_opt_list *opt_list)
{
        const char *sep = "";
        size_t idx = 0;

        while (idx < opt_list->count) {
                seq_printf(m, "%s%s", sep, opt_list->items[idx]);
                /* every item after the first is preceded by the separator */
                sep = "|";
                idx++;
        }
}

/*
 * ima_policy_show_appraise_algos - print the algorithms set in a bitfield
 * @m: seq_file to print to
 * @allowed_hashes: bit N set means hash_algo_name[N] is allowed
 *
 * Names are emitted in index order, separated by commas.
 */
static void ima_policy_show_appraise_algos(struct seq_file *m,
                                           unsigned int allowed_hashes)
{
        int algo;
        int emitted = 0;

        for (algo = 0; algo < HASH_ALGO__LAST; algo++) {
                if (allowed_hashes & (1U << algo)) {
                        /* a comma goes in front of all but the first name */
                        if (emitted)
                                seq_puts(m, ",");
                        emitted = 1;

                        seq_puts(m, hash_algo_name[algo]);
                }
        }
}

/*
 * ima_policy_show - print one policy rule in policy-file syntax
 * @m: seq_file to print to
 * @v: the struct ima_rule_entry to display
 *
 * Emits the action keyword followed by every condition and option whose
 * flag is set in the entry, then a newline.  A rule that has an LSM label
 * string but no initialised LSM rule for it is skipped entirely (nothing
 * is printed, not even the newline).
 *
 * Always returns 0.
 */
int ima_policy_show(struct seq_file *m, void *v)
{
        struct ima_rule_entry *entry = v;
        int i;
        char tbuf[64] = {0,};
        int offset = 0;

        rcu_read_lock();

        /* Do not print rules with inactive LSM labels */
        for (i = 0; i < MAX_LSM_RULES; i++) {
                if (entry->lsm[i].args_p && !entry->lsm[i].rule) {
                        rcu_read_unlock();
                        return 0;
                }
        }

        if (entry->action & MEASURE)
                seq_puts(m, pt(Opt_measure));
        if (entry->action & DONT_MEASURE)
                seq_puts(m, pt(Opt_dont_measure));
        if (entry->action & APPRAISE)
                seq_puts(m, pt(Opt_appraise));
        if (entry->action & DONT_APPRAISE)
                seq_puts(m, pt(Opt_dont_appraise));
        if (entry->action & AUDIT)
                seq_puts(m, pt(Opt_audit));
        if (entry->action & HASH)
                seq_puts(m, pt(Opt_hash));
        if (entry->action & DONT_HASH)
                seq_puts(m, pt(Opt_dont_hash));

        seq_puts(m, " ");

        if (entry->flags & IMA_FUNC)
                policy_func_show(m, entry->func);

        if ((entry->flags & IMA_MASK) || (entry->flags & IMA_INMASK)) {
                /*
                 * mask_tokens[] holds the '^'-prefixed spelling used for
                 * IMA_INMASK; skip that first character for IMA_MASK.
                 */
                if (entry->flags & IMA_MASK)
                        offset = 1;
                if (entry->mask & MAY_EXEC)
                        seq_printf(m, pt(Opt_mask), mt(mask_exec) + offset);
                if (entry->mask & MAY_WRITE)
                        seq_printf(m, pt(Opt_mask), mt(mask_write) + offset);
                if (entry->mask & MAY_READ)
                        seq_printf(m, pt(Opt_mask), mt(mask_read) + offset);
                if (entry->mask & MAY_APPEND)
                        seq_printf(m, pt(Opt_mask), mt(mask_append) + offset);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_FSMAGIC) {
                snprintf(tbuf, sizeof(tbuf), "0x%lx", entry->fsmagic);
                seq_printf(m, pt(Opt_fsmagic), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_FSNAME) {
                /*
                 * Print the name directly: going through tbuf would
                 * silently cut it off at sizeof(tbuf) - 1 characters.
                 */
                seq_printf(m, pt(Opt_fsname), entry->fsname);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_KEYRINGS) {
                seq_puts(m, "keyrings=");
                ima_show_rule_opt_list(m, entry->keyrings);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_LABEL) {
                seq_puts(m, "label=");
                ima_show_rule_opt_list(m, entry->label);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_PCR) {
                snprintf(tbuf, sizeof(tbuf), "%d", entry->pcr);
                seq_printf(m, pt(Opt_pcr), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_FSUUID) {
                seq_printf(m, "fsuuid=%pU", &entry->fsuuid);
                seq_puts(m, " ");
        }

        /*
         * The ids below are printed as unsigned: the parser reads them with
         * kstrtoul() and accepts the whole unsigned id range, so a signed
         * conversion would show large ids as negative numbers that cannot
         * be parsed back.
         */
        if (entry->flags & IMA_UID) {
                snprintf(tbuf, sizeof(tbuf), "%u", __kuid_val(entry->uid));
                if (entry->uid_op == &uid_gt)
                        seq_printf(m, pt(Opt_uid_gt), tbuf);
                else if (entry->uid_op == &uid_lt)
                        seq_printf(m, pt(Opt_uid_lt), tbuf);
                else
                        seq_printf(m, pt(Opt_uid_eq), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_EUID) {
                snprintf(tbuf, sizeof(tbuf), "%u", __kuid_val(entry->uid));
                if (entry->uid_op == &uid_gt)
                        seq_printf(m, pt(Opt_euid_gt), tbuf);
                else if (entry->uid_op == &uid_lt)
                        seq_printf(m, pt(Opt_euid_lt), tbuf);
                else
                        seq_printf(m, pt(Opt_euid_eq), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_GID) {
                snprintf(tbuf, sizeof(tbuf), "%u", __kgid_val(entry->gid));
                if (entry->gid_op == &gid_gt)
                        seq_printf(m, pt(Opt_gid_gt), tbuf);
                else if (entry->gid_op == &gid_lt)
                        seq_printf(m, pt(Opt_gid_lt), tbuf);
                else
                        seq_printf(m, pt(Opt_gid_eq), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_EGID) {
                snprintf(tbuf, sizeof(tbuf), "%u", __kgid_val(entry->gid));
                if (entry->gid_op == &gid_gt)
                        seq_printf(m, pt(Opt_egid_gt), tbuf);
                else if (entry->gid_op == &gid_lt)
                        seq_printf(m, pt(Opt_egid_lt), tbuf);
                else
                        seq_printf(m, pt(Opt_egid_eq), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_FOWNER) {
                snprintf(tbuf, sizeof(tbuf), "%u", __kuid_val(entry->fowner));
                if (entry->fowner_op == &vfsuid_gt_kuid)
                        seq_printf(m, pt(Opt_fowner_gt), tbuf);
                else if (entry->fowner_op == &vfsuid_lt_kuid)
                        seq_printf(m, pt(Opt_fowner_lt), tbuf);
                else
                        seq_printf(m, pt(Opt_fowner_eq), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_FGROUP) {
                snprintf(tbuf, sizeof(tbuf), "%u", __kgid_val(entry->fgroup));
                if (entry->fgroup_op == &vfsgid_gt_kgid)
                        seq_printf(m, pt(Opt_fgroup_gt), tbuf);
                else if (entry->fgroup_op == &vfsgid_lt_kgid)
                        seq_printf(m, pt(Opt_fgroup_lt), tbuf);
                else
                        seq_printf(m, pt(Opt_fgroup_eq), tbuf);
                seq_puts(m, " ");
        }

        if (entry->flags & IMA_VALIDATE_ALGOS) {
                seq_puts(m, "appraise_algos=");
                ima_policy_show_appraise_algos(m, entry->allowed_algos);
                seq_puts(m, " ");
        }

        /* one "<kind>=<label>" item per initialised LSM rule slot */
        for (i = 0; i < MAX_LSM_RULES; i++) {
                if (entry->lsm[i].rule) {
                        switch (i) {
                        case LSM_OBJ_USER:
                                seq_printf(m, pt(Opt_obj_user),
                                           entry->lsm[i].args_p);
                                break;
                        case LSM_OBJ_ROLE:
                                seq_printf(m, pt(Opt_obj_role),
                                           entry->lsm[i].args_p);
                                break;
                        case LSM_OBJ_TYPE:
                                seq_printf(m, pt(Opt_obj_type),
                                           entry->lsm[i].args_p);
                                break;
                        case LSM_SUBJ_USER:
                                seq_printf(m, pt(Opt_subj_user),
                                           entry->lsm[i].args_p);
                                break;
                        case LSM_SUBJ_ROLE:
                                seq_printf(m, pt(Opt_subj_role),
                                           entry->lsm[i].args_p);
                                break;
                        case LSM_SUBJ_TYPE:
                                seq_printf(m, pt(Opt_subj_type),
                                           entry->lsm[i].args_p);
                                break;
                        }
                        seq_puts(m, " ");
                }
        }
        if (entry->template)
                seq_printf(m, "template=%s ", entry->template->name);
        if (entry->flags & IMA_DIGSIG_REQUIRED) {
                /* the signature type is derived from the accompanying flags */
                if (entry->flags & IMA_VERITY_REQUIRED)
                        seq_puts(m, "appraise_type=sigv3 ");
                else if (entry->flags & IMA_MODSIG_ALLOWED)
                        seq_puts(m, "appraise_type=imasig|modsig ");
                else
                        seq_puts(m, "appraise_type=imasig ");
        }
        if (entry->flags & IMA_VERITY_REQUIRED)
                seq_puts(m, "digest_type=verity ");
        if (entry->flags & IMA_PERMIT_DIRECTIO)
                seq_puts(m, "permit_directio ");
        rcu_read_unlock();
        seq_puts(m, "\n");
        return 0;
}
#endif        /* CONFIG_IMA_READ_POLICY */

#if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING)
/*
 * ima_appraise_signature: whether IMA will appraise a given function using
 * an IMA digital signature. This is restricted to cases where the kernel
 * has a set of built-in trusted keys in order to avoid an attacker simply
 * loading additional keys.
 */
bool ima_appraise_signature(enum kernel_read_file_id id)
{
        struct ima_rule_entry *entry;
        bool found = false;
        enum ima_hooks func;
        struct list_head *ima_rules_tmp;

        if (id >= READING_MAX_ID)
                return false;

        if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE)
            && security_locked_down(LOCKDOWN_KEXEC))
                return false;

        func = read_idmap[id] ?: FILE_CHECK;

        rcu_read_lock();
        ima_rules_tmp = rcu_dereference(ima_rules);
        list_for_each_entry_rcu(entry, ima_rules_tmp, list) {
                if (entry->action != APPRAISE)
                        continue;

                /*
                 * A generic entry will match, but otherwise require that it
                 * match the func we're looking for
                 */
                if (entry->func && entry->func != func)
                        continue;

                /*
                 * We require this to be a digital signature, not a raw IMA
                 * hash.
                 */
                if (entry->flags & IMA_DIGSIG_REQUIRED)
                        found = true;

                /*
                 * We've found a rule that matches, so break now even if it
                 * didn't require a digital signature - a later rule that does
                 * won't override it, so would be a false positive.
                 */
                break;
        }

        rcu_read_unlock();
        return found;
}
#endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */










































    9 












































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 1994 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * General FPU state handling cleanups
 *        Gareth Hughes <gareth@valinux.com>, May 2000
 * x86-64 work by Andi Kleen 2002
 */

#ifndef _ASM_X86_FPU_API_H
#define _ASM_X86_FPU_API_H
#include <linux/bottom_half.h>

#include <asm/fpu/types.h>

/*
 * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
 * disables preemption so be careful if you intend to use it for long periods
 * of time.
 * If you intend to use the FPU in irq/softirq you need to check first with
 * irq_fpu_usable() if it is possible.
 */

/*
 * Kernel FPU states to initialize in kernel_fpu_begin_mask(). Each flag is a
 * single bit, so they can be OR-ed together to form the kfpu_mask argument
 * (kernel_fpu_begin() passes KFPU_387 | KFPU_MXCSR on 32-bit builds).
 */
#define KFPU_387        _BITUL(0)        /* 387 state will be initialized */
#define KFPU_MXCSR        _BITUL(1)        /* MXCSR will be initialized */

extern void kernel_fpu_begin_mask(unsigned int kfpu_mask);
extern void kernel_fpu_end(void);
extern bool irq_fpu_usable(void);
extern void fpregs_mark_activate(void);

/*
 * Convenience wrapper for code that is unaware of kernel_fpu_begin_mask():
 * it picks the mask based on the build and forwards to it.
 */
static inline void kernel_fpu_begin(void)
{
        /* MXCSR is initialized on both 32-bit and 64-bit builds. */
        unsigned int kfpu_mask = KFPU_MXCSR;

#ifndef CONFIG_X86_64
        /*
         * 32-bit kernel code may use 387 operations as well as SSE2, etc,
         * as long as it checks that the CPU has the required capability,
         * so the 387 state is initialized as well. Any 64-bit code that
         * uses 387 instructions must explicitly request KFPU_387 instead.
         */
        kfpu_mask |= KFPU_387;
#endif
        kernel_fpu_begin_mask(kfpu_mask);
}

/*
 * Take fpregs_lock() around any edit of the CPU's FPU registers or of
 * fpu->fpstate. A context switch will (and a softirq might) save the CPU's
 * FPU registers to fpu->fpstate.regs and set TIF_NEED_FPU_LOAD, leaving the
 * CPU's FPU registers in a random state.
 *
 * On RT kernels local_bh_disable() is not sufficient: it only serializes
 * soft interrupt related sections via a local lock and stays preemptible.
 * Preemption is disabled there instead; bottom half processing always runs
 * in thread context on RT kernels, so this implicitly prevents bottom half
 * processing as well.
 *
 * On !RT kernels local_bh_disable() protects against both preemption and
 * soft interrupts.
 *
 * Disabling preemption also serializes against kernel_fpu_begin().
 */
static inline void fpregs_lock(void)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                local_bh_disable();
}

/*
 * Counterpart of fpregs_lock(): re-enables preemption on PREEMPT_RT kernels
 * and bottom halves on all other kernels.
 */
static inline void fpregs_unlock(void)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
        else
                local_bh_enable();
}

/*
 * FPU state gets lazily restored before returning to userspace. So when in the
 * kernel, the valid FPU state may be kept in the buffer. This function will force
 * restore all the fpu state to the registers early if needed, and lock them from
 * being automatically saved/restored. Then FPU state can be modified safely in the
 * registers, before unlocking with fpregs_unlock().
 */
void fpregs_lock_and_load(void);

/*
 * Only a real (out-of-line) function when CONFIG_X86_DEBUG_FPU is set;
 * otherwise an empty inline stub is provided so that callers do not need
 * their own #ifdef.
 */
#ifdef CONFIG_X86_DEBUG_FPU
extern void fpregs_assert_state_consistent(void);
#else
static inline void fpregs_assert_state_consistent(void) { }
#endif

/*
 * Load the task FPU state before returning to userspace.
 */
extern void switch_fpu_return(void);

/*
 * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
 *
 * If 'feature_name' is set then put a human-readable description of
 * the feature there as well - this can be used to print error (or success)
 * messages.
 */
extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);

/* Trap handling */
extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);
extern void fpu_sync_fpstate(struct fpu *fpu);
extern void fpu_reset_from_exception_fixup(void);

/* Boot, hotplug and resume */
extern void fpu__init_cpu(void);
extern void fpu__init_system(void);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);

/*
 * Only a real (out-of-line) function when CONFIG_MATH_EMULATION is set;
 * otherwise it is an empty inline stub that ignores @soft.
 */
#ifdef CONFIG_MATH_EMULATION
extern void fpstate_init_soft(struct swregs_state *soft);
#else
static inline void fpstate_init_soft(struct swregs_state *soft) {}
#endif

/* State tracking */
DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);

/*
 * Process cleanup.
 *
 * fpstate_free() is a real function only on 64-bit builds; elsewhere it is
 * an empty inline stub that ignores @fpu.
 */
#ifdef CONFIG_X86_64
extern void fpstate_free(struct fpu *fpu);
#else
static inline void fpstate_free(struct fpu *fpu) { }
#endif

/* fpstate-related functions which are exported to KVM */
extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature);

extern u64 xstate_get_guest_group_perm(void);

extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);


/* KVM specific functions */
extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu);
extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu);
extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest);
extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures);

/*
 * Guest XFD helpers: real functions on 64-bit builds only. Other builds get
 * empty inline stubs that ignore their arguments.
 */
#ifdef CONFIG_X86_64
extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd);
extern void fpu_sync_guest_vmexit_xfd_state(void);
#else
static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { }
static inline void fpu_sync_guest_vmexit_xfd_state(void) { }
#endif

extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf,
                                           unsigned int size, u64 xfeatures, u32 pkru);
extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru);

static inline void fpstate_set_confidential(struct fpu_guest *gfpu)
{
        gfpu->fpstate->is_confidential = true;
}

static inline bool fpstate_is_confidential(struct fpu_guest *gfpu)
{
        return gfpu->fpstate->is_confidential;
}

/* prctl */
extern long fpu_xstate_prctl(int option, unsigned long arg2);

extern void fpu_idle_fpregs(void);

#endif /* _ASM_X86_FPU_API_H */


























    3 
    3 



    3 
    3 






























































































    3 













    3 

    3 

















































































































































    3 



























    3 





    3 




















    3 

    3 




    3 
    3 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2007 Jens Axboe <jens.axboe@oracle.com>
 *
 * Scatterlist handling helpers.
 */
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>
#include <linux/kmemleak.h>
#include <linux/bvec.h>
#include <linux/uio.h>

/**
 * sg_next - return the next scatterlist entry in a list
 * @sg:                The current sg entry
 *
 * Description:
 *   Usually the next entry will be @sg@ + 1, but if this sg element is part
 *   of a chained scatterlist, it could jump to the start of a new
 *   scatterlist array.
 *
 **/
struct scatterlist *sg_next(struct scatterlist *sg)
{
        if (sg_is_last(sg))
                return NULL;

        sg++;
        if (unlikely(sg_is_chain(sg)))
                sg = sg_chain_ptr(sg);

        return sg;
}
EXPORT_SYMBOL(sg_next);

/**
 * sg_nents - return total count of entries in scatterlist
 * @sg:                The scatterlist
 *
 * Description:
 * Walks the list with sg_next(), so chained scatterlists are counted
 * across chain links. A NULL @sg yields 0.
 *
 **/
int sg_nents(struct scatterlist *sg)
{
        int count = 0;

        while (sg) {
                count++;
                sg = sg_next(sg);
        }

        return count;
}
EXPORT_SYMBOL(sg_nents);

/**
 * sg_nents_for_len - return total count of entries in scatterlist
 *                    needed to satisfy the supplied length
 * @sg:                The scatterlist
 * @len:        The total required length
 *
 * Description:
 * Adds up the entry lengths from @sg onwards (following chain links via
 * sg_next()) until the running total reaches @len.
 *
 * Returns:
 *   the number of sg entries needed (0 when @len is 0), or -EINVAL if the
 *   list ends before @len bytes are covered
 *
 **/
int sg_nents_for_len(struct scatterlist *sg, u64 len)
{
        u64 covered = 0;
        int count = 0;

        if (!len)
                return 0;

        while (sg) {
                count++;
                covered += sg->length;
                if (covered >= len)
                        return count;
                sg = sg_next(sg);
        }

        return -EINVAL;
}
EXPORT_SYMBOL(sg_nents_for_len);

/**
 * sg_last - return the last scatterlist entry in a list
 * @sgl:        First entry in the scatterlist
 * @nents:        Number of entries in the scatterlist
 *
 * Description:
 *   Should only be used casually, it (currently) scans the entire list
 *   to get the last entry.
 *
 *   Note that the @sgl@ pointer passed in need not be the first one,
 *   the important bit is that @nents@ denotes the number of entries that
 *   exist from @sgl@.
 *
 **/
struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents)
{
        struct scatterlist *sg, *ret = NULL;
        unsigned int i;

        for_each_sg(sgl, sg, nents, i)
                ret = sg;

        BUG_ON(!sg_is_last(ret));
        return ret;
}
EXPORT_SYMBOL(sg_last);

/**
 * sg_init_table - Initialize SG table
 * @sgl:           The SG table
 * @nents:           Number of entries in table
 *
 * Description:
 *   Zeroes all @nents entries and then calls sg_init_marker() on the table.
 *
 * Notes:
 *   If this is part of a chained sg table, sg_mark_end() should be
 *   used only on the last table part.
 *
 **/
void sg_init_table(struct scatterlist *sgl, unsigned int nents)
{
        memset(sgl, 0, nents * sizeof(struct scatterlist));
        sg_init_marker(sgl, nents);
}
EXPORT_SYMBOL(sg_init_table);

/**
 * sg_init_one - Initialize a single entry sg list
 * @sg:                 SG entry
 * @buf:         Virtual address for IO
 * @buflen:         IO length
 *
 * Description:
 *   Initializes @sg as a one-entry table and then points it at
 *   @buf / @buflen via sg_set_buf().
 *
 **/
void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen)
{
        const unsigned int single_entry = 1;

        sg_init_table(sg, single_entry);
        sg_set_buf(sg, buf, buflen);
}
EXPORT_SYMBOL(sg_init_one);

/*
 * sg_kmalloc()/sg_kfree() are the allocation helpers that sg_alloc_table()
 * uses by default. A request for exactly SG_MAX_SINGLE_ALLOC entries is
 * served from a whole page; every other size goes through kmalloc_array().
 */
static struct scatterlist *sg_kmalloc(unsigned int nents, gfp_t gfp_mask)
{
        void *page;

        if (nents != SG_MAX_SINGLE_ALLOC)
                return kmalloc_array(nents, sizeof(struct scatterlist),
                                     gfp_mask);

        /*
         * Kmemleak doesn't track page allocations as they are not commonly
         * used (in a raw form) for kernel data structures. Since a list of
         * pages is chained together and ends in a normal kmalloc (which
         * kmemleak does track), kmemleak must be told about all the
         * intermediate allocations so that the last one does not become
         * decoupled and reported as a false positive.
         */
        page = (void *) __get_free_page(gfp_mask);
        kmemleak_alloc(page, PAGE_SIZE, 1, gfp_mask);
        return page;
}

/*
 * Release a chunk obtained from sg_kmalloc(). @nents must be the size that
 * was requested, since it selects between the page path and kfree().
 */
static void sg_kfree(struct scatterlist *sg, unsigned int nents)
{
        if (nents != SG_MAX_SINGLE_ALLOC) {
                kfree(sg);
                return;
        }

        /* Page-backed chunk: drop the kmemleak record, then the page. */
        kmemleak_free(sg);
        free_page((unsigned long) sg);
}

/**
 * __sg_free_table - Free a previously mapped sg table
 * @table:        The sg table header to use
 * @max_ents:        The maximum number of entries per single scatterlist
 * @nents_first_chunk: Number of entries in the (preallocated) first
 *         scatterlist chunk, 0 means no such preallocated first chunk
 * @free_fn:        Free function
 * @num_ents:        Number of entries in the table
 *
 *  Description:
 *    Free an sg table previously allocated and setup with
 *    __sg_alloc_table().  The @max_ents value must be identical to
 *    that previously used with __sg_alloc_table(). A preallocated first
 *    chunk is walked over but never handed to @free_fn. On return
 *    @table->sgl is NULL; a table whose sgl is already NULL is left alone.
 *
 **/
void __sg_free_table(struct sg_table *table, unsigned int max_ents,
                     unsigned int nents_first_chunk, sg_free_fn *free_fn,
                     unsigned int num_ents)
{
        struct scatterlist *chunk = table->sgl;
        unsigned int chunk_cap;

        if (unlikely(!chunk))
                return;

        /* Only the first chunk may have a capacity other than max_ents. */
        chunk_cap = nents_first_chunk ? nents_first_chunk : max_ents;

        while (num_ents) {
                struct scatterlist *following = NULL;
                unsigned int chunk_len = num_ents;
                unsigned int payload = num_ents;

                /*
                 * More entries remain than this chunk can hold: the chunk is
                 * full and its final slot is the chain pointer to the next
                 * chunk, so it carries one data entry less than its size.
                 * The link must be read before the chunk is freed below.
                 */
                if (num_ents > chunk_cap) {
                        following = sg_chain_ptr(&chunk[chunk_cap - 1]);
                        chunk_len = chunk_cap;
                        payload = chunk_cap - 1;
                }

                num_ents -= payload;

                /* A preallocated first chunk is not ours to free. */
                if (nents_first_chunk)
                        nents_first_chunk = 0;
                else
                        free_fn(chunk, chunk_len);

                chunk = following;
                chunk_cap = max_ents;
        }

        table->sgl = NULL;
}
EXPORT_SYMBOL(__sg_free_table);

/**
 * sg_free_append_table - Free a previously allocated append sg table.
 * @table:         The mapped sg append table header
 *
 * Description:
 *   Frees the embedded sg table with the default sg_kfree() helper, using
 *   the append table's total_nents as the entry count.
 *
 **/
void sg_free_append_table(struct sg_append_table *table)
{
        unsigned int num_ents = table->total_nents;

        __sg_free_table(&table->sgt, SG_MAX_SINGLE_ALLOC, 0, sg_kfree,
                        num_ents);
}
EXPORT_SYMBOL(sg_free_append_table);


/**
 * sg_free_table - Free a previously allocated sg table
 * @table:        The mapped sg table header
 *
 * Description:
 *   Frees the table with the default sg_kfree() helper, using the table's
 *   orig_nents as the entry count and no preallocated first chunk.
 *
 **/
void sg_free_table(struct sg_table *table)
{
        unsigned int num_ents = table->orig_nents;

        __sg_free_table(table, SG_MAX_SINGLE_ALLOC, 0, sg_kfree, num_ents);
}
EXPORT_SYMBOL(sg_free_table);

/**
 * __sg_alloc_table - Allocate and initialize an sg table with given allocator
 * @table:        The sg table header to use
 * @nents:        Number of entries in sg list
 * @max_ents:        The maximum number of entries the allocator returns per call
 * @first_chunk: first SGL if preallocated (may be %NULL)
 * @nents_first_chunk: Number of entries in the (preallocated) first
 *         scatterlist chunk, 0 means no such preallocated chunk provided by user
 * @gfp_mask:        GFP allocation mask
 * @alloc_fn:        Allocator to use
 *
 * Description:
 *   This function returns a @table @nents long. The allocator is
 *   defined to return scatterlist chunks of maximum size @max_ents.
 *   Thus if @nents is bigger than @max_ents, the scatterlists will be
 *   chained in units of @max_ents.
 *
 * Returns:
 *   0 on success, -EINVAL if @nents is 0 (or, with CONFIG_ARCH_NO_SG_CHAIN,
 *   if @nents exceeds @max_ents), -ENOMEM if a chunk allocation fails.
 *
 * Notes:
 *   If this function returns non-0 (eg failure), the caller must call
 *   __sg_free_table() to cleanup any leftover allocations.
 *
 **/
int __sg_alloc_table(struct sg_table *table, unsigned int nents,
                     unsigned int max_ents, struct scatterlist *first_chunk,
                     unsigned int nents_first_chunk, gfp_t gfp_mask,
                     sg_alloc_fn *alloc_fn)
{
        struct scatterlist *sg, *prv;
        unsigned int left;
        /* Only the first chunk may have a capacity other than max_ents. */
        unsigned curr_max_ents = nents_first_chunk ?: max_ents;
        /* Capacity of the previous chunk; only read once prv is non-NULL. */
        unsigned prv_max_ents;

        /* Start from an empty header so the entry counts below accumulate from 0. */
        memset(table, 0, sizeof(*table));

        if (nents == 0)
                return -EINVAL;
#ifdef CONFIG_ARCH_NO_SG_CHAIN
        /* Without chaining the whole table has to fit into a single chunk. */
        if (WARN_ON_ONCE(nents > max_ents))
                return -EINVAL;
#endif

        left = nents;
        prv = NULL;
        do {
                unsigned int sg_size, alloc_size = left;

                /*
                 * If the remaining entries do not fit into this chunk, fill
                 * the chunk completely and reserve its last slot for the
                 * chain link, so it holds one data entry less than its size.
                 */
                if (alloc_size > curr_max_ents) {
                        alloc_size = curr_max_ents;
                        sg_size = alloc_size - 1;
                } else
                        sg_size = alloc_size;

                left -= sg_size;

                /* The preallocated chunk, if any, is consumed exactly once. */
                if (first_chunk) {
                        sg = first_chunk;
                        first_chunk = NULL;
                } else {
                        sg = alloc_fn(alloc_size, gfp_mask);
                }
                if (unlikely(!sg)) {
                        /*
                         * Adjust entry count to reflect that the last
                         * entry of the previous table won't be used for
                         * linkage.  Without this, sg_kfree() may get
                         * confused.
                         */
                        if (prv)
                                table->nents = ++table->orig_nents;

                        return -ENOMEM;
                }

                sg_init_table(sg, alloc_size);
                /* Both counters track the data entries allocated so far. */
                table->nents = table->orig_nents += sg_size;

                /*
                 * If this is the first mapping, assign the sg table header.
                 * If this is not the first mapping, chain previous part.
                 */
                if (prv)
                        sg_chain(prv, prv_max_ents, sg);
                else
                        table->sgl = sg;

                /*
                 * If no more entries after this one, mark the end
                 */
                if (!left)
                        sg_mark_end(&sg[sg_size - 1]);

                prv = sg;
                prv_max_ents = curr_max_ents;
                /* Every chunk after the first is limited by max_ents. */
                curr_max_ents = max_ents;
        } while (left);

        return 0;
}
EXPORT_SYMBOL(__sg_alloc_table);

/**
 * sg_alloc_table - Allocate and initialize an sg table
 * @table:        The sg table header to use
 * @nents:        Number of entries in sg list
 * @gfp_mask:        GFP allocation mask
 *
 *  Description:
 *    Allocate and initialize an sg table using the default sg_kmalloc()
 *    allocator. If @nents is larger than SG_MAX_SINGLE_ALLOC a chained sg
 *    table will be setup. On failure any partial allocation is released
 *    through sg_free_table() before the error is returned.
 *
 *  Returns:
 *    0 on success, or the non-zero error from __sg_alloc_table()
 *
 **/
int sg_alloc_table(struct sg_table *table, unsigned int nents, gfp_t gfp_mask)
{
        int err = __sg_alloc_table(table, nents, SG_MAX_SINGLE_ALLOC,
                                   NULL, 0, gfp_mask, sg_kmalloc);

        if (unlikely(err))
                sg_free_table(table);

        return err;
}
EXPORT_SYMBOL(sg_alloc_table);

/*
 * Return the entry to fill after @cur in an append table, allocating and
 * linking a new chunk when needed.
 *
 * With a non-NULL @cur, the entry following it is returned directly unless
 * it is the end-marked entry and more than one entry is still needed; in
 * that case it is turned into the chain link to a freshly allocated chunk.
 * With a NULL @cur the new chunk becomes the head of the table.
 *
 * Returns the entry to use, or ERR_PTR(-ENOMEM) if allocation fails.
 */
static struct scatterlist *get_next_sg(struct sg_append_table *table,
                                       struct scatterlist *cur,
                                       unsigned long needed_sges,
                                       gfp_t gfp_mask)
{
        struct scatterlist *fresh, *following;
        unsigned int nalloc;

        if (cur) {
                following = sg_next(cur);
                /* Check whether the last entry must be kept for chaining */
                if (!sg_is_last(following) || needed_sges == 1)
                        return following;
        }

        /* A single chunk never holds more than SG_MAX_SINGLE_ALLOC entries. */
        nalloc = min_t(unsigned long, needed_sges, SG_MAX_SINGLE_ALLOC);
        fresh = sg_kmalloc(nalloc, gfp_mask);
        if (!fresh)
                return ERR_PTR(-ENOMEM);
        sg_init_table(fresh, nalloc);

        if (!cur) {
                table->sgt.sgl = fresh;
                table->total_nents = nalloc;
        } else {
                /* One existing slot is reused as the link to the new chunk. */
                table->total_nents += nalloc - 1;
                __sg_chain(following, fresh);
        }

        return fresh;
}

/*
 * Two pages can share one scatterlist entry when @a's pfn is exactly one
 * above @b's pfn and zone_device_pages_have_same_pgmap() accepts the pair.
 */
static bool pages_are_mergeable(struct page *a, struct page *b)
{
        return page_to_pfn(a) == page_to_pfn(b) + 1 &&
               zone_device_pages_have_same_pgmap(a, b);
}

/**
 * sg_alloc_append_table_from_pages - Allocate and initialize an append sg
 *                                    table from an array of pages
 * @sgt_append:  The sg append table to use
 * @pages:       Pointer to an array of page pointers
 * @n_pages:     Number of pages in the pages array
 * @offset:      Offset from start of the first page to the start of a buffer
 * @size:        Number of valid bytes in the buffer (after offset)
 * @max_segment: Maximum size of a scatterlist element in bytes
 * @left_pages:  Left pages caller have to set after this call
 * @gfp_mask:         GFP allocation mask
 *
 * Description:
 *    In the first call it allocates and initializes an sg table from a list
 *    of pages; in later calls it reuses the scatterlist from sgt_append.
 *    Contiguous ranges of the pages are squashed into a single scatterlist
 *    entry up to the maximum size specified in @max_segment.  A user may
 *    provide an offset at a start and a size of valid data in a buffer
 *    specified by the page array. The returned sg table is released by
 *    sg_free_append_table
 *
 * Returns:
 *   0 on success, negative error on failure
 *
 * Notes:
 *   If this function returns non-0 (eg failure), the caller must call
 *   sg_free_append_table() to cleanup any leftover allocations.
 *
 *   In the first call, sgt_append must be initialized.
 */
int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append,
                struct page **pages, unsigned int n_pages, unsigned int offset,
                unsigned long size, unsigned int max_segment,
                unsigned int left_pages, gfp_t gfp_mask)
{
        unsigned int chunks, cur_page, seg_len, i, prv_len = 0;
        unsigned int added_nents = 0;
        struct scatterlist *s = sgt_append->prv;
        struct page *last_pg;

        /*
         * The algorithm below requires max_segment to be aligned to PAGE_SIZE
         * otherwise it can overshoot.
         */
        max_segment = ALIGN_DOWN(max_segment, PAGE_SIZE);
        if (WARN_ON(max_segment < PAGE_SIZE))
                return -EINVAL;

        if (IS_ENABLED(CONFIG_ARCH_NO_SG_CHAIN) && sgt_append->prv)
                return -EOPNOTSUPP;

        if (sgt_append->prv) {
                unsigned long next_pfn = (page_to_phys(sg_page(sgt_append->prv)) +
                        sgt_append->prv->offset + sgt_append->prv->length) / PAGE_SIZE;

                if (WARN_ON(offset))
                        return -EINVAL;

                /* Merge contiguous pages into the last SG */
                prv_len = sgt_append->prv->length;
                if (page_to_pfn(pages[0]) == next_pfn) {
                        last_pg = pfn_to_page(next_pfn - 1);
                        while (n_pages && pages_are_mergeable(pages[0], last_pg)) {
                                if (sgt_append->prv->length + PAGE_SIZE > max_segment)
                                        break;
                                sgt_append->prv->length += PAGE_SIZE;
                                last_pg = pages[0];
                                pages++;
                                n_pages--;
                        }
                        if (!n_pages)
                                goto out;
                }
        }

        /* compute number of contiguous chunks */
        chunks = 1;
        seg_len = 0;
        for (i = 1; i < n_pages; i++) {
                seg_len += PAGE_SIZE;
                if (seg_len >= max_segment ||
                    !pages_are_mergeable(pages[i], pages[i - 1])) {
                        chunks++;
                        seg_len = 0;
                }
        }

        /* merging chunks and putting them into the scatterlist */
        cur_page = 0;
        for (i = 0; i < chunks; i++) {
                unsigned int j, chunk_size;

                /* look for the end of the current chunk */
                seg_len = 0;
                for (j = cur_page + 1; j < n_pages; j++) {
                        seg_len += PAGE_SIZE;
                        if (seg_len >= max_segment ||
                            !pages_are_mergeable(pages[j], pages[j - 1]))
                                break;
                }

                /* Pass how many chunks might be left */
                s = get_next_sg(sgt_append, s, chunks - i + left_pages,
                                gfp_mask);
                if (IS_ERR(s)) {
                        /*
                         * Adjust entry length to be as before function was
                         * called.
                         */
                        if (sgt_append->prv)
                                sgt_append->prv->length = prv_len;
                        return PTR_ERR(s);
                }
                chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset;
                sg_set_page(s, pages[cur_page],
                            min_t(unsigned long, size, chunk_size), offset);
                added_nents++;
                size -= chunk_size;
                offset = 0;
                cur_page = j;
        }
        sgt_append->sgt.nents += added_nents;
        sgt_append->sgt.orig_nents = sgt_append->sgt.nents;
        sgt_append->prv = s;
out:
        if (!left_pages)
                sg_mark_end(s);
        return 0;
}
EXPORT_SYMBOL(sg_alloc_append_table_from_pages);

/**
 * sg_alloc_table_from_pages_segment - Allocate and initialize an sg table from
 *                                     an array of pages and given maximum
 *                                     segment.
 * @sgt:         The sg table header to use
 * @pages:       Pointer to an array of page pointers
 * @n_pages:     Number of pages in the pages array
 * @offset:      Offset from start of the first page to the start of a buffer
 * @size:        Number of valid bytes in the buffer (after offset)
 * @max_segment: Maximum size of a scatterlist element in bytes
 * @gfp_mask:    GFP allocation mask
 *
 *  Description:
 *    Builds the table in one shot through a temporary append table by
 *    calling sg_alloc_append_table_from_pages() with no pages left over,
 *    then copies the resulting table header into @sgt. Physically
 *    contiguous pages are merged into a single scatterlist node, limited
 *    to @max_segment bytes. @offset and @size describe the valid data
 *    inside the buffer formed by the page array.
 *
 *    The returned sg table is released by sg_free_table.
 *
 *  Returns:
 *   0 on success, negative error on failure
 */
int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages,
                                unsigned int n_pages, unsigned int offset,
                                unsigned long size, unsigned int max_segment,
                                gfp_t gfp_mask)
{
        struct sg_append_table tmp = {};
        int ret;

        ret = sg_alloc_append_table_from_pages(&tmp, pages, n_pages, offset,
                                               size, max_segment, 0, gfp_mask);
        if (!ret) {
                /* Hand the finished table header over to the caller. */
                *sgt = tmp.sgt;
                WARN_ON(tmp.total_nents != sgt->orig_nents);
                return 0;
        }

        /* Drop whatever was allocated before the failure. */
        sg_free_append_table(&tmp);
        return ret;
}
EXPORT_SYMBOL(sg_alloc_table_from_pages_segment);

#ifdef CONFIG_SGL_ALLOC

/**
 * sgl_alloc_order - allocate a scatterlist and its pages
 * @length: Length in bytes of the scatterlist. Must be at least one
 * @order: Second argument for alloc_pages()
 * @chainable: Whether or not to allocate an extra element in the scatterlist
 *        for scatterlist chaining purposes
 * @gfp: Memory allocation flags
 * @nent_p: [out] Number of entries in the scatterlist that have pages
 *
 * Each element covers PAGE_SIZE << @order bytes, except possibly the last
 * one which covers the remainder of @length. On a page allocation failure
 * everything allocated so far is released with sgl_free_order().
 *
 * Returns: A pointer to an initialized scatterlist or %NULL upon failure.
 */
struct scatterlist *sgl_alloc_order(unsigned long long length,
                                    unsigned int order, bool chainable,
                                    gfp_t gfp, unsigned int *nent_p)
{
        struct scatterlist *sgl, *sg;
        struct page *page;
        unsigned int nent, nalloc;
        u32 elem_len;

        nent = round_up(length, PAGE_SIZE << order) >> (PAGE_SHIFT + order);
        /*
         * Check for integer overflow: the element count may have been
         * truncated when stored in an unsigned int, or round_up() may have
         * wrapped. The product is evaluated in 64 bits so that the check
         * itself cannot wrap and reject a length that @nent does cover.
         */
        if (length > ((unsigned long long)nent << (PAGE_SHIFT + order)))
                return NULL;
        nalloc = nent;
        if (chainable) {
                /* Check for integer overflow */
                if (nalloc + 1 < nalloc)
                        return NULL;
                nalloc++;
        }
        /* The scatterlist array itself never needs to come from GFP_DMA. */
        sgl = kmalloc_array(nalloc, sizeof(struct scatterlist),
                            gfp & ~GFP_DMA);
        if (!sgl)
                return NULL;

        sg_init_table(sgl, nalloc);
        sg = sgl;
        while (length) {
                elem_len = min_t(u64, length, PAGE_SIZE << order);
                page = alloc_pages(gfp, order);
                if (!page) {
                        /* Frees the pages set so far and the array itself. */
                        sgl_free_order(sgl, order);
                        return NULL;
                }

                sg_set_page(sg, page, elem_len, 0);
                length -= elem_len;
                sg = sg_next(sg);
        }
        WARN_ONCE(length, "length = %lld\n", length);
        if (nent_p)
                *nent_p = nent;
        return sgl;
}
EXPORT_SYMBOL(sgl_alloc_order);

/**
 * sgl_alloc - allocate a scatterlist and its pages
 * @length: Length in bytes of the scatterlist
 * @gfp: Memory allocation flags
 * @nent_p: [out] Number of entries in the scatterlist
 *
 * Convenience form of sgl_alloc_order() using order-0 pages and no extra
 * element for chaining.
 *
 * Returns: A pointer to an initialized scatterlist or %NULL upon failure.
 */
struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp,
                              unsigned int *nent_p)
{
        const unsigned int order = 0;
        const bool chainable = false;

        return sgl_alloc_order(length, order, chainable, gfp, nent_p);
}
EXPORT_SYMBOL(sgl_alloc);

/**
 * sgl_free_n_order - free a scatterlist and its pages
 * @sgl: Scatterlist with one or more elements
 * @nents: Maximum number of elements to free
 * @order: Second argument for __free_pages()
 *
 * Walks at most @nents elements, stopping early when sg_next() reports the
 * end of the list, releases every page found and finally frees @sgl itself.
 *
 * Notes:
 * - When several scatterlists are chained and each chain element is freed
 *   on its own, @nents has to be exact; otherwise a page could be freed
 *   twice.
 * - Passing a large @nents frees all pages of a chained scatterlist in one
 *   call.
 */
void sgl_free_n_order(struct scatterlist *sgl, int nents, int order)
{
        struct scatterlist *cur = sgl;
        int idx;

        for (idx = 0; idx < nents && cur; idx++, cur = sg_next(cur)) {
                struct page *pg = sg_page(cur);

                /* Elements that never received a page are skipped. */
                if (pg)
                        __free_pages(pg, order);
        }
        kfree(sgl);
}
EXPORT_SYMBOL(sgl_free_n_order);

/**
 * sgl_free_order - free a scatterlist and its pages
 * @sgl: Scatterlist with one or more elements
 * @order: Second argument for __free_pages()
 *
 * Frees the whole list by calling sgl_free_n_order() with an element limit
 * of INT_MAX.
 */
void sgl_free_order(struct scatterlist *sgl, int order)
{
        const int no_limit = INT_MAX;

        sgl_free_n_order(sgl, no_limit, order);
}
EXPORT_SYMBOL(sgl_free_order);

/**
 * sgl_free - free a scatterlist and its pages
 * @sgl: Scatterlist with one or more elements
 */
void sgl_free(struct scatterlist *sgl)
{
        sgl_free_order(sgl, 0);
}
EXPORT_SYMBOL(sgl_free);

#endif /* CONFIG_SGL_ALLOC */

/*
 * Prime a page iterator over @sglist: remember the list, the number of
 * entries to visit and the starting page offset. The advance step starts at
 * zero so that the first __sg_page_iter_next() call does not move past
 * @pgoffset.
 */
void __sg_page_iter_start(struct sg_page_iter *piter,
                          struct scatterlist *sglist, unsigned int nents,
                          unsigned long pgoffset)
{
        piter->sg = sglist;
        piter->sg_pgoffset = pgoffset;

        piter->__nents = nents;
        piter->__pg_advance = 0;
}
EXPORT_SYMBOL(__sg_page_iter_start);

/*
 * Number of pages touched by @sg, counted from the start of its first page
 * up to the page-aligned end of offset + length.
 */
static int sg_page_count(struct scatterlist *sg)
{
        return PAGE_ALIGN(sg->length + sg->offset) >> PAGE_SHIFT;
}

/*
 * Step the page iterator to its next page. Returns false once the entry
 * budget is used up or the list ends, true when @piter points at a page.
 */
bool __sg_page_iter_next(struct sg_page_iter *piter)
{
        if (!(piter->__nents && piter->sg))
                return false;

        /* The first call advances by 0, every later call by one page. */
        piter->sg_pgoffset += piter->__pg_advance;
        piter->__pg_advance = 1;

        for (;;) {
                int pages_in_sg = sg_page_count(piter->sg);

                if (piter->sg_pgoffset < pages_in_sg)
                        return true;

                /* Offset lies beyond this entry: carry it to the next one. */
                piter->sg_pgoffset -= pages_in_sg;
                piter->sg = sg_next(piter->sg);
                piter->__nents--;
                if (!piter->__nents || !piter->sg)
                        return false;
        }
}
EXPORT_SYMBOL(__sg_page_iter_next);

/*
 * Same as sg_page_count() but based on the DMA length of @sg as reported by
 * sg_dma_len().
 */
static int sg_dma_page_count(struct scatterlist *sg)
{
        return PAGE_ALIGN(sg_dma_len(sg) + sg->offset) >> PAGE_SHIFT;
}

/*
 * DMA flavour of __sg_page_iter_next(): identical stepping logic, but the
 * size of each entry is taken from sg_dma_page_count().
 */
bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter)
{
        struct sg_page_iter *it = &dma_iter->base;

        if (!(it->__nents && it->sg))
                return false;

        /* The first call advances by 0, every later call by one page. */
        it->sg_pgoffset += it->__pg_advance;
        it->__pg_advance = 1;

        for (;;) {
                int pages_in_sg = sg_dma_page_count(it->sg);

                if (it->sg_pgoffset < pages_in_sg)
                        return true;

                /* Offset lies beyond this entry: carry it to the next one. */
                it->sg_pgoffset -= pages_in_sg;
                it->sg = sg_next(it->sg);
                it->__nents--;
                if (!it->__nents || !it->sg)
                        return false;
        }
}
EXPORT_SYMBOL(__sg_page_iter_dma_next);

/**
 * sg_miter_start - start mapping iteration over a sg list
 * @miter: sg mapping iter to be started
 * @sgl: sg list to iterate over
 * @nents: number of sg entries
 * @flags: sg iterator flags
 *
 * Description:
 *   Clears @miter, positions its page iterator at the first page of @sgl
 *   and records @flags. A warning is emitted when @flags contains neither
 *   SG_MITER_TO_SG nor SG_MITER_FROM_SG.
 *
 * Context:
 *   Don't care.
 */
void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl,
                    unsigned int nents, unsigned int flags)
{
        const unsigned int dir_mask = SG_MITER_TO_SG | SG_MITER_FROM_SG;

        memset(miter, 0, sizeof(*miter));

        __sg_page_iter_start(&miter->piter, sgl, nents, 0);
        WARN_ON((flags & dir_mask) == 0);
        miter->__flags = flags;
}
EXPORT_SYMBOL(sg_miter_start);

/*
 * Make sure @miter has bytes left to hand out. When the current page is
 * exhausted, move the page iterator on and recompute the in-page offset and
 * the number of bytes remaining in that page. Returns false at the end of
 * the list.
 */
static bool sg_miter_get_next_page(struct sg_mapping_iter *miter)
{
        struct scatterlist *sg;

        /* Still inside the current page: nothing to do. */
        if (miter->__remaining)
                return true;

        if (!__sg_page_iter_next(&miter->piter))
                return false;

        sg = miter->piter.sg;

        /* Only the first page of an entry starts at the entry's offset. */
        if (miter->piter.sg_pgoffset)
                miter->__offset = 0;
        else
                miter->__offset = sg->offset;

        /* Split the offset into whole pages and an in-page remainder. */
        miter->piter.sg_pgoffset += miter->__offset >> PAGE_SHIFT;
        miter->__offset &= PAGE_SIZE - 1;

        /* Bytes left in the entry, capped at the end of the current page. */
        miter->__remaining = sg->offset + sg->length -
                             (miter->piter.sg_pgoffset << PAGE_SHIFT) -
                             miter->__offset;
        miter->__remaining = min_t(unsigned long, miter->__remaining,
                                   PAGE_SIZE - miter->__offset);
        return true;
}

/**
 * sg_miter_skip - reposition mapping iterator
 * @miter: sg mapping iter to be skipped
 * @offset: number of bytes to add to the current location
 *
 * Description:
 *   Moves @miter forward by @offset bytes from its current location. Any
 *   mapping established by a previous sg_miter_next() is released first,
 *   i.e. @miter is stopped.
 *
 * Context:
 *   Don't care.
 *
 * Returns:
 *   true if @miter contains the valid mapping.  false if end of sg
 *   list is reached.
 */
bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset)
{
        off_t step;

        sg_miter_stop(miter);

        for (; offset; offset -= step) {
                if (!sg_miter_get_next_page(miter))
                        return false;

                /* Consume as much of the current page as is still wanted. */
                step = min_t(off_t, offset, miter->__remaining);
                miter->__offset += step;
                miter->__remaining -= step;
        }

        return true;
}
EXPORT_SYMBOL(sg_miter_skip);

/**
 * sg_miter_next - proceed mapping iterator to the next mapping
 * @miter: sg mapping iter to proceed
 *
 * Description:
 *   Releases the previous mapping, if any, and maps the next piece of the
 *   list.  @miter should have been started using sg_miter_start().  On
 *   successful return, @miter->page, @miter->addr and @miter->length point
 *   to the current mapping.
 *
 * Context:
 *   May sleep if !SG_MITER_ATOMIC.
 *
 * Returns:
 *   true if @miter contains the next mapping.  false if end of sg
 *   list is reached.
 */
bool sg_miter_next(struct sg_mapping_iter *miter)
{
        void *base;

        sg_miter_stop(miter);

        /*
         * sg_miter_stop() has already folded the consumed bytes into
         * __offset and __remaining; move to a new page only if needed.
         */
        if (!sg_miter_get_next_page(miter))
                return false;

        miter->page = sg_page_iter_page(&miter->piter);
        miter->length = miter->__remaining;
        miter->consumed = miter->length;

        if (miter->__flags & SG_MITER_ATOMIC)
                base = kmap_atomic(miter->page);
        else
                base = kmap(miter->page);
        miter->addr = base + miter->__offset;

        return true;
}
EXPORT_SYMBOL(sg_miter_next);

/**
 * sg_miter_stop - stop mapping iteration
 * @miter: sg mapping iter to be stopped
 *
 * Description:
 *   Stops mapping iterator @miter.  @miter should have been started
 *   using sg_miter_start().  A stopped iteration can be resumed by
 *   calling sg_miter_next() on it.  This is useful when resources (kmap)
 *   need to be released during iteration.
 *
 * Context:
 *   Don't care otherwise.
 */
void sg_miter_stop(struct sg_mapping_iter *miter)
{
        WARN_ON(miter->consumed > miter->length);

        /* Nothing is mapped, so there is nothing to release. */
        if (!miter->addr)
                return;

        /* Account for what the user consumed from the last mapping. */
        miter->__offset += miter->consumed;
        miter->__remaining -= miter->consumed;

        if (miter->__flags & SG_MITER_TO_SG)
                flush_dcache_page(miter->page);

        if (!(miter->__flags & SG_MITER_ATOMIC)) {
                kunmap(miter->page);
        } else {
                WARN_ON_ONCE(!pagefault_disabled());
                kunmap_atomic(miter->addr);
        }

        miter->page = NULL;
        miter->addr = NULL;
        miter->length = 0;
        miter->consumed = 0;
}
EXPORT_SYMBOL(sg_miter_stop);

/**
 * sg_copy_buffer - Copy data between a linear buffer and an SG list
 * @sgl:                 The SG list
 * @nents:                 Number of SG entries
 * @buf:                 Linear buffer to copy to or from
 * @buflen:                 The number of bytes to copy
 * @skip:                 Number of bytes to skip before copying
 * @to_buffer:                 transfer direction (true == from an sg list to a
 *                         buffer, false == from a buffer to an sg list)
 *
 * Returns the number of copied bytes.
 *
 **/
size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
                      size_t buflen, off_t skip, bool to_buffer)
{
        struct sg_mapping_iter miter;
        unsigned int sg_flags;
        unsigned int offset = 0;

        sg_flags = SG_MITER_ATOMIC |
                   (to_buffer ? SG_MITER_FROM_SG : SG_MITER_TO_SG);

        sg_miter_start(&miter, sgl, nents, sg_flags);

        /* The list ended before @skip bytes: nothing can be copied. */
        if (!sg_miter_skip(&miter, skip))
                return 0;

        while ((offset < buflen) && sg_miter_next(&miter)) {
                void *chunk = buf + offset;
                unsigned int len;

                len = min(miter.length, buflen - offset);

                if (to_buffer)
                        memcpy(chunk, miter.addr, len);
                else
                        memcpy(miter.addr, chunk, len);

                offset += len;
        }

        sg_miter_stop(&miter);

        return offset;
}
EXPORT_SYMBOL(sg_copy_buffer);

/**
 * sg_copy_from_buffer - Copy from a linear buffer to an SG list
 * @sgl:                 The SG list
 * @nents:                 Number of SG entries
 * @buf:                 Where to copy from
 * @buflen:                 The number of bytes to copy
 *
 * Copying starts at the beginning of the SG list (no bytes are skipped).
 *
 * Returns the number of copied bytes.
 *
 **/
size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
                           const void *buf, size_t buflen)
{
        /* sg_copy_buffer() takes a non-const pointer for both directions. */
        void *src = (void *)buf;

        return sg_copy_buffer(sgl, nents, src, buflen, 0, false);
}
EXPORT_SYMBOL(sg_copy_from_buffer);

/**
 * sg_copy_to_buffer - Copy from an SG list to a linear buffer
 * @sgl:                 The SG list
 * @nents:                 Number of SG entries
 * @buf:                 Where to copy to
 * @buflen:                 The number of bytes to copy
 *
 * Copying starts at the beginning of the SG list (no bytes are skipped).
 *
 * Returns the number of copied bytes.
 *
 **/
size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents,
                         void *buf, size_t buflen)
{
        const off_t skip = 0;

        return sg_copy_buffer(sgl, nents, buf, buflen, skip, true);
}
EXPORT_SYMBOL(sg_copy_to_buffer);

/**
 * sg_pcopy_from_buffer - Copy from a linear buffer to an SG list
 * @sgl:                 The SG list
 * @nents:                 Number of SG entries
 * @buf:                 Where to copy from
 * @buflen:                 The number of bytes to copy
 * @skip:                 Number of bytes to skip in the SG list before copying
 *
 * Returns the number of copied bytes.
 *
 **/
size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents,
                            const void *buf, size_t buflen, off_t skip)
{
        /* sg_copy_buffer() takes a non-const pointer for both directions. */
        void *src = (void *)buf;

        return sg_copy_buffer(sgl, nents, src, buflen, skip, false);
}
EXPORT_SYMBOL(sg_pcopy_from_buffer);

/**
 * sg_pcopy_to_buffer - Copy from an SG list to a linear buffer
 * @sgl:                 The SG list
 * @nents:                 Number of SG entries
 * @buf:                 Where to copy to
 * @buflen:                 The number of bytes to copy
 * @skip:                 Number of bytes to skip in the SG list before copying
 *
 * Returns the number of copied bytes.
 *
 **/
size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents,
                          void *buf, size_t buflen, off_t skip)
{
        const bool to_buffer = true;

        return sg_copy_buffer(sgl, nents, buf, buflen, skip, to_buffer);
}
EXPORT_SYMBOL(sg_pcopy_to_buffer);

/**
 * sg_zero_buffer - Zero-out a part of a SG list
 * @sgl:                 The SG list
 * @nents:                 Number of SG entries
 * @buflen:                 The number of bytes to zero out
 * @skip:                 Number of bytes to skip before zeroing
 *
 * Returns the number of bytes zeroed; 0 when the list ends before @skip
 * bytes have been skipped.
 **/
size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents,
                       size_t buflen, off_t skip)
{
        unsigned int offset = 0;
        struct sg_mapping_iter miter;
        unsigned int sg_flags = SG_MITER_ATOMIC | SG_MITER_TO_SG;

        sg_miter_start(&miter, sgl, nents, sg_flags);

        /* This function returns a byte count, so report 0 bytes here. */
        if (!sg_miter_skip(&miter, skip))
                return 0;

        while (offset < buflen && sg_miter_next(&miter)) {
                unsigned int len;

                /* Never zero past the requested length. */
                len = min(miter.length, buflen - offset);
                memset(miter.addr, 0, len);

                offset += len;
        }

        sg_miter_stop(&miter);
        return offset;
}
EXPORT_SYMBOL(sg_zero_buffer);

/*
 * Extract and pin a list of up to sg_max pages from UBUF- or IOVEC-class
 * iterators, and add them to the scatterlist.
 *
 * New entries are written starting at sgtable->sgl[sgtable->nents] and
 * sgtable->nents is bumped for each one. Returns the number of bytes added,
 * or the non-positive result of iov_iter_extract_pages() after unpinning the
 * pages of every entry above sgtable->orig_nents.
 */
static ssize_t extract_user_to_sg(struct iov_iter *iter,
                                  ssize_t maxsize,
                                  struct sg_table *sgtable,
                                  unsigned int sg_max,
                                  iov_iter_extraction_t extraction_flags)
{
        struct scatterlist *sg = sgtable->sgl + sgtable->nents;
        struct page **pages;
        unsigned int npages;
        ssize_t ret = 0, res;
        size_t len, off;

        /*
         * We decant the page list into the tail of the scatterlist: the
         * temporary page pointer array is placed so that it ends where an
         * array of sg_max scatterlist entries starting at sgtable->sgl ends.
         * NOTE(review): presumably the caller guarantees that much room
         * behind sgtable->sgl - confirm against callers.
         */
        pages = (void *)sgtable->sgl +
                array_size(sg_max, sizeof(struct scatterlist));
        pages -= sg_max;

        do {
                res = iov_iter_extract_pages(iter, &pages, maxsize, sg_max,
                                             extraction_flags, &off);
                if (res <= 0)
                        goto failed;

                len = res;
                maxsize -= len;
                ret += len;
                /* Pages covered by this extraction, first one starting at off. */
                npages = DIV_ROUND_UP(off + len, PAGE_SIZE);
                sg_max -= npages;

                for (; npages > 0; npages--) {
                        struct page *page = *pages;
                        size_t seg = min_t(size_t, PAGE_SIZE - off, len);

                        /* Clear the slot once its page pointer has been read. */
                        *pages++ = NULL;
                        sg_set_page(sg, page, seg, off);
                        sgtable->nents++;
                        sg++;
                        len -= seg;
                        /* Only the first page has a non-zero offset. */
                        off = 0;
                }
        } while (maxsize > 0 && sg_max > 0);

        return ret;

failed:
        /* Undo: unpin the page of every entry added above orig_nents. */
        while (sgtable->nents > sgtable->orig_nents)
                unpin_user_page(sg_page(&sgtable->sgl[--sgtable->nents]));
        return res;
}

/*
 * Add up to sg_max segments of a BVEC-type iterator to the scatterlist, one
 * scatterlist entry per bio_vec, limited to maxsize bytes in total.  The
 * pages are not pinned.  The iterator is advanced by the number of bytes
 * added, which is also the return value.
 */
static ssize_t extract_bvec_to_sg(struct iov_iter *iter,
                                  ssize_t maxsize,
                                  struct sg_table *sgtable,
                                  unsigned int sg_max,
                                  iov_iter_extraction_t extraction_flags)
{
        struct scatterlist *slot = sgtable->sgl + sgtable->nents;
        unsigned long skip = iter->iov_offset;
        ssize_t total = 0;
        unsigned int seg;

        for (seg = 0; seg < iter->nr_segs; seg++) {
                const struct bio_vec *bv = &iter->bvec[seg];
                size_t avail = bv->bv_len;
                size_t take, pg_off;

                /* Segment lies entirely before the iterator position. */
                if (skip >= avail) {
                        skip -= avail;
                        continue;
                }

                take = min_t(size_t, maxsize, avail - skip);
                pg_off = bv->bv_offset + skip;

                sg_set_page(slot, bv->bv_page, take, pg_off);
                slot++;
                sgtable->nents++;
                sg_max--;

                total += take;
                maxsize -= take;
                if (maxsize <= 0 || !sg_max)
                        break;

                /* Later segments are taken from their very beginning. */
                skip = 0;
        }

        if (total > 0)
                iov_iter_advance(iter, total);
        return total;
}

/*
 * Extract up to sg_max pages from a KVEC-type iterator and add them to the
 * scatterlist.  This can deal with vmalloc'd buffers as well as kmalloc'd or
 * static buffers.  The pages are not pinned.
 *
 * Each kvec is split at page boundaries and every page-sized piece gets its
 * own scatterlist entry.  The iterator is advanced by the returned number of
 * bytes.
 */
static ssize_t extract_kvec_to_sg(struct iov_iter *iter,
                                  ssize_t maxsize,
                                  struct sg_table *sgtable,
                                  unsigned int sg_max,
                                  iov_iter_extraction_t extraction_flags)
{
        const struct kvec *kv = iter->kvec;
        struct scatterlist *sg = sgtable->sgl + sgtable->nents;
        unsigned long start = iter->iov_offset;
        unsigned int i;
        ssize_t ret = 0;

        for (i = 0; i < iter->nr_segs; i++) {
                struct page *page;
                unsigned long kaddr;
                size_t off, len, seg;

                len = kv[i].iov_len;
                /* Segment lies entirely before the iterator position. */
                if (start >= len) {
                        start -= len;
                        continue;
                }

                /* Split the start address into page base and in-page offset. */
                kaddr = (unsigned long)kv[i].iov_base + start;
                off = kaddr & ~PAGE_MASK;
                len = min_t(size_t, maxsize, len - start);
                kaddr &= PAGE_MASK;

                /*
                 * NOTE(review): the whole of len is accounted for up front;
                 * if sg_max runs out inside the loop below, ret includes
                 * bytes that got no scatterlist entry - confirm whether
                 * callers can hit this.
                 */
                maxsize -= len;
                ret += len;
                do {
                        /* Bytes of this kvec that live in the current page. */
                        seg = min_t(size_t, len, PAGE_SIZE - off);
                        if (is_vmalloc_or_module_addr((void *)kaddr))
                                page = vmalloc_to_page((void *)kaddr);
                        else
                                page = virt_to_page((void *)kaddr);

                        /*
                         * The entry must describe only the part inside this
                         * page (seg), not everything that is still left of
                         * the kvec (len).
                         */
                        sg_set_page(sg, page, seg, off);
                        sgtable->nents++;
                        sg++;
                        sg_max--;

                        len -= seg;
                        kaddr += PAGE_SIZE;
                        off = 0;
                } while (len > 0 && sg_max > 0);

                if (maxsize <= 0 || sg_max == 0)
                        break;
                start = 0;
        }

        if (ret > 0)
                iov_iter_advance(iter, ret);
        return ret;
}

/*
 * Extract up to sg_max folios from an XARRAY-type iterator and add them to
 * the scatterlist.  The pages are not pinned.
 *
 * One scatterlist entry is produced per folio found in the xarray, starting
 * at the iterator's current position.  The walk stops at the first value
 * entry or hugetlb folio (with a warning), or when maxsize bytes or sg_max
 * entries have been added.  The iterator is advanced by the returned number
 * of bytes.
 */
static ssize_t extract_xarray_to_sg(struct iov_iter *iter,
                                    ssize_t maxsize,
                                    struct sg_table *sgtable,
                                    unsigned int sg_max,
                                    iov_iter_extraction_t extraction_flags)
{
        struct scatterlist *sg = sgtable->sgl + sgtable->nents;
        struct xarray *xa = iter->xarray;
        struct folio *folio;
        loff_t start = iter->xarray_start + iter->iov_offset;
        pgoff_t index = start / PAGE_SIZE;
        ssize_t ret = 0;
        size_t offset, len;
        XA_STATE(xas, xa, index);

        rcu_read_lock();

        xas_for_each(&xas, folio, ULONG_MAX) {
                if (xas_retry(&xas, folio))
                        continue;
                if (WARN_ON(xa_is_value(folio)))
                        break;
                if (WARN_ON(folio_test_hugetlb(folio)))
                        break;

                /*
                 * Locate the current position, i.e. the start plus what has
                 * been taken so far, inside this folio.  Using the bare
                 * start here would reapply the first folio's offset to
                 * every following folio.
                 */
                offset = offset_in_folio(folio, start + ret);
                len = min_t(size_t, maxsize, folio_size(folio) - offset);

                sg_set_page(sg, folio_page(folio, 0), len, offset);
                sgtable->nents++;
                sg++;
                sg_max--;

                maxsize -= len;
                ret += len;
                if (maxsize <= 0 || sg_max == 0)
                        break;
        }

        rcu_read_unlock();
        if (ret > 0)
                iov_iter_advance(iter, ret);
        return ret;
}

/**
 * extract_iter_to_sg - Extract pages from an iterator and add to an sglist
 * @iter: The iterator to extract from
 * @maxsize: The amount of iterator to copy
 * @sgtable: The scatterlist table to fill in
 * @sg_max: Maximum number of elements in @sgtable that may be filled
 * @extraction_flags: Flags to qualify the request
 *
 * Dispatches on the iterator type to the matching extraction helper, which
 * adds the page fragments covering up to @maxsize bytes of @iter to
 * @sgtable, using at most @sg_max new elements.
 *
 * The pages referred to by UBUF- and IOVEC-type iterators are extracted and
 * pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE-
 * and DISCARD-type are not supported.
 *
 * No end mark is placed on the scatterlist; that's left to the caller.
 *
 * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
 * be allowed on the pages extracted.
 *
 * If successful, @sgtable->nents is updated to include the number of elements
 * added and the number of bytes added is returned.  @sgtable->orig_nents is
 * left unaltered.  An unsupported iterator type yields -EIO.
 *
 * The iov_iter_extract_mode() function should be used to query how cleanup
 * should be performed.
 */
ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t maxsize,
                           struct sg_table *sgtable, unsigned int sg_max,
                           iov_iter_extraction_t extraction_flags)
{
        ssize_t ret;

        if (!maxsize)
                return 0;

        switch (iov_iter_type(iter)) {
        case ITER_UBUF:
        case ITER_IOVEC:
                ret = extract_user_to_sg(iter, maxsize, sgtable, sg_max,
                                         extraction_flags);
                break;
        case ITER_BVEC:
                ret = extract_bvec_to_sg(iter, maxsize, sgtable, sg_max,
                                         extraction_flags);
                break;
        case ITER_KVEC:
                ret = extract_kvec_to_sg(iter, maxsize, sgtable, sg_max,
                                         extraction_flags);
                break;
        case ITER_XARRAY:
                ret = extract_xarray_to_sg(iter, maxsize, sgtable, sg_max,
                                           extraction_flags);
                break;
        default:
                pr_err("%s(%u) unsupported\n", __func__, iov_iter_type(iter));
                WARN_ON_ONCE(1);
                ret = -EIO;
                break;
        }

        return ret;
}
EXPORT_SYMBOL_GPL(extract_iter_to_sg);















































































































































































































































    7 










    7 
















































    2 














































    5 
    6 

    3 








































































    6 





    6 


    7 

    1 












    7 
    5 

    7 













    6 




























































































    7 
    6 


    7 



















    6 

    6 
















































    6 



    6 


    3 
    5 



    1 

    4 


    4 





































    2 
    3 
    1 


























    2 



























































    6 






    6 




    4 






    7 















    5 

































    6 



    5 


    4 
    4 







    6 















    5 















    1 





    1 









    1 













    7 


    7 











    5 
















































































































































    2 





























    6 











    6 

    4 
    2 





















































    7 


    2 
    1 
































    7 





    7 

    6 






    7 




    7 
    2 












    5 


    6 




















































































































































































































































































































































































    6 
















    6 
    7 


































































































    6 












    6 











    7 













    6 


    3 







    5 








    7 


















    7 
    6 











    7 





























    6 



    3 




















    4 


    6 



    6 





    6 






















































































































































































































































































































































































































    2 





    3 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009                SUSE Linux Products GmbH
 * Copyright (C) 2009                Tejun Heo <tj@kernel.org>
 *
 * Copyright (C) 2017                Facebook Inc.
 * Copyright (C) 2017                Dennis Zhou <dennis@kernel.org>
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  I.e., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * The allocator organizes chunks into lists according to free size and
 * memcg-awareness.  To make a percpu allocation memcg-aware the __GFP_ACCOUNT
 * flag should be passed.  All memcg-aware allocations are sharing one set
 * of chunks and all unaccounted allocations and allocations performed
 * by processes belonging to the root memory cgroup are using the second set.
 *
 * The allocator tries to allocate from the fullest chunk first. Each chunk
 * is managed by a bitmap with metadata blocks.  The allocation map is updated
 * on every allocation and free to reflect the current state while the boundary
 * map is only updated on allocation.  Each metadata block contains
 * information to help mitigate the need to iterate over large portions
 * of the bitmap.  The reverse mapping from page to chunk is stored in
 * the page's index.  Lastly, units are lazily backed and grow in unison.
 *
 * There is a unique conversion that goes on here between bytes and bits.
 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
 * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert between bytes, bits, and blocks.
 * All hints are managed in bits unless explicitly stated.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

/*
 * The slots are sorted by the size of the biggest continuous free area.
 * 1-31 bytes share the same slot.
 *
 * NOTE(review): __pcpu_size_to_slot() computes
 * max(fls(size) - PCPU_SLOT_BASE_SHIFT + 2, 1).  Assuming fls() is the usual
 * 1-based "find last set bit", that puts sizes 1-15 in slot 1 and sizes
 * 16-31 in slot 2 -- confirm whether the "1-31" range above is still accurate.
 */
#define PCPU_SLOT_BASE_SHIFT                5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD        3

/*
 * Low and high watermarks for the number of empty populated pages that the
 * balance work tries to maintain for atomic allocations.
 */
#define PCPU_EMPTY_POP_PAGES_LOW        2
#define PCPU_EMPTY_POP_PAGES_HIGH        4

#ifdef CONFIG_SMP
/*
 * default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary
 *
 * __addr_to_pcpu_ptr() rebases an address from pcpu_base_addr onto
 * __per_cpu_start; __pcpu_ptr_to_addr() applies the opposite rebasing.
 * Each macro is only defined here when the arch has not already provided it.
 */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)                                        \
        (void __percpu *)((unsigned long)(addr) -                        \
                          (unsigned long)pcpu_base_addr        +                \
                          (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)                                                \
        (void __force *)((unsigned long)(ptr) +                                \
                         (unsigned long)pcpu_base_addr -                \
                         (unsigned long)__per_cpu_start)
#endif
#else        /* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)        (void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)                (void __force *)(ptr)
#endif        /* CONFIG_SMP */

/*
 * Allocator geometry and slot layout.
 *
 * pcpu_unit_pages scales a unit index into a page index (see
 * pcpu_page_idx()).  pcpu_unit_size is a size in bytes: pcpu_size_to_slot()
 * maps a size equal to it to pcpu_free_slot.
 *
 * NOTE(review): the remaining variables are not referenced in this part of
 * the file; presumably they are all assigned once during first-chunk setup
 * (they are __ro_after_init) -- confirm against pcpu_setup_first_chunk().
 */
static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static int pcpu_free_slot __ro_after_init;
int pcpu_sidelined_slot __ro_after_init;
int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;

static const int *pcpu_unit_map __ro_after_init;                /* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;        /* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);        /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);        /* chunk create/destroy, [de]pop, map ext */

struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */

/*
 * The number of empty populated pages, protected by pcpu_lock.
 * The reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;

/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per unit per chunk (i.e. when a page gets
 * allocated/deallocated, it is allocated/deallocated in all units of a chunk
 * and increments/decrements this count by 1).
 */
static unsigned long pcpu_nr_populated;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
/* gate for pcpu_schedule_balance_work(): work is only queued while this is true */
static bool pcpu_async_enabled __read_mostly;
/*
 * NOTE(review): not referenced in this part of the file; presumably records
 * that an atomic allocation failed so the balance work can react -- confirm.
 */
static bool pcpu_atomic_alloc_failed;

/* queue the balance work item, but only once async operation is enabled */
static void pcpu_schedule_balance_work(void)
{
        if (!pcpu_async_enabled)
                return;

        schedule_work(&pcpu_balance_work);
}

/**
 * pcpu_addr_in_chunk - test whether an address falls inside a chunk
 * @chunk: chunk to test against, may be NULL
 * @addr: percpu address
 *
 * The served range begins at base_addr + start_offset and stops
 * end_offset bytes short of the end of the chunk's nr_pages pages.
 *
 * RETURNS:
 * True if @addr lies within that range, false otherwise (including when
 * @chunk is NULL).
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
        void *first, *limit;

        if (!chunk)
                return false;

        first = chunk->base_addr + chunk->start_offset;
        if (addr < first)
                return false;

        /* exclusive upper bound of the served range */
        limit = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
                chunk->end_offset;

        return addr < limit;
}

/*
 * Map a size in bytes to a slot index: the fls() of the size, shifted down
 * by PCPU_SLOT_BASE_SHIFT - 2, and never lower than slot 1.
 */
static int __pcpu_size_to_slot(int size)
{
        int slot = fls(size) - PCPU_SLOT_BASE_SHIFT + 2;

        return max(slot, 1);
}

/*
 * Slot for a free area of @size bytes.  A size equal to pcpu_unit_size is
 * special-cased to pcpu_free_slot; everything else goes through
 * __pcpu_size_to_slot().
 */
static int pcpu_size_to_slot(int size)
{
        if (size != pcpu_unit_size)
                return __pcpu_size_to_slot(size);

        return pcpu_free_slot;
}

/*
 * Slot for @chunk, derived from the contig_hint of its chunk-level metadata.
 * A chunk with fewer than PCPU_MIN_ALLOC_SIZE free bytes, or with no contig
 * hint at all, is placed in slot 0.
 */
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
        if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE)
                return 0;

        if (!chunk->chunk_md.contig_hint)
                return 0;

        /* contig_hint is multiplied by PCPU_MIN_ALLOC_SIZE to get a byte size */
        return pcpu_size_to_slot(chunk->chunk_md.contig_hint *
                                 PCPU_MIN_ALLOC_SIZE);
}

/* stash the owning chunk pointer in the index field of a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
        unsigned long cookie = (unsigned long)pcpu;

        page->index = cookie;
}

/* read back the chunk pointer stored in the index field of a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
        unsigned long cookie = page->index;

        return (struct pcpu_chunk *)cookie;
}

/*
 * Flat page index of page @page_idx within @cpu's unit: the unit number
 * from pcpu_unit_map times pcpu_unit_pages, plus @page_idx.
 */
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
        int unit = pcpu_unit_map[cpu];

        return unit * pcpu_unit_pages + page_idx;
}

/* byte offset of page @page_idx of @cpu's unit: unit offset plus page offset */
static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
        unsigned long unit_off = pcpu_unit_offsets[cpu];

        return unit_off + (page_idx << PAGE_SHIFT);
}

/* address of page @page_idx of @cpu's unit, relative to @chunk's base_addr */
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
                                     unsigned int cpu, int page_idx)
{
        unsigned long base = (unsigned long)chunk->base_addr;

        return base + pcpu_unit_page_offset(cpu, page_idx);
}

/*
 * The following are helper functions for accessing the bitmaps and for
 * converting between chunk-wide bit offsets and per-block indexes/offsets.
 */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
        /* alloc_map word at which metadata block @index begins */
        return &chunk->alloc_map[index * PCPU_BITMAP_BLOCK_BITS /
                                 BITS_PER_LONG];
}

/* index of the metadata block that contains chunk bit offset @off */
static unsigned long pcpu_off_to_block_index(int off)
{
        unsigned long block_index = off / PCPU_BITMAP_BLOCK_BITS;

        return block_index;
}

/* @off masked down to its position within a PCPU_BITMAP_BLOCK_BITS block */
static unsigned long pcpu_off_to_block_off(int off)
{
        unsigned long block_off = off & (PCPU_BITMAP_BLOCK_BITS - 1);

        return block_off;
}

/* chunk bit offset of bit @off inside metadata block @index */
static unsigned long pcpu_block_off_to_off(int index, int off)
{
        unsigned long chunk_off = index * PCPU_BITMAP_BLOCK_BITS + off;

        return chunk_off;
}

/**
 * pcpu_check_block_hint - check against the contig hint
 * @block: block of interest
 * @bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Tests whether an allocation of @bits, once its start has been aligned up
 * from contig_hint_start, still fits inside the block's contig hint.  A chunk
 * uses the same hints as a block, so this works on the chunk's contig hint
 * as well.
 *
 * RETURNS:
 * True if the alignment padding plus @bits does not exceed contig_hint.
 */
static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
                                  size_t align)
{
        /* bits lost at the front of the hint to reach an aligned start */
        int pad = ALIGN(block->contig_hint_start, align) -
                block->contig_hint_start;

        return block->contig_hint >= pad + bits;
}

/**
 * pcpu_next_hint - determine which hint to use
 * @block: block of interest
 * @alloc_bits: size of allocation
 *
 * Chooses where a scan of @block should begin.  Normally that is first_free,
 * which gives first-fit behaviour.  When a scan_hint is known and is too
 * small for the request, the scan may instead begin right after it, with the
 * contig_hint further on serving as the fallback.
 *
 * RETURNS:
 * The bit offset within the block at which to begin scanning.
 */
static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
{
        /* no scan hint recorded: nothing can be skipped */
        if (!block->scan_hint)
                return block->first_free;

        /*
         * The contig_hint must lie after the scan_hint (possibly not true
         * iff contig_hint == scan_hint), otherwise skipping could miss it.
         */
        if (block->contig_hint_start <= block->scan_hint_start)
                return block->first_free;

        /* the request fits within the scan_hint, so it must be scanned */
        if (alloc_bits <= block->scan_hint)
                return block->first_free;

        return block->scan_hint_start + block->scan_hint;
}

/**
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset, updated in place
 * @bits: size of free area, updated in place
 *
 * Iteration helper for pcpu_for_each_md_free_region.  Starting from the
 * block that contains *@bit_off, it reports either a block's contig_hint or
 * a free area aggregated across block boundaries (one block's right_free
 * plus the left_free of the blocks after it).  *@bit_off and *@bits are
 * rewritten so the loop can consume them.
 */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
                                     int *bits)
{
        int blk = pcpu_off_to_block_index(*bit_off);
        int min_start = pcpu_off_to_block_off(*bit_off);

        *bits = 0;
        for (; blk < pcpu_chunk_nr_blocks(chunk); blk++) {
                struct pcpu_block_md *md = chunk->md_blocks + blk;

                if (*bits) {
                        /*
                         * An area was carried over from the previous block:
                         * grow it by this block's leading free bits and only
                         * keep going when the whole block is free.
                         */
                        *bits += md->left_free;
                        if (md->left_free != PCPU_BITMAP_BLOCK_BITS)
                                return;
                        continue;
                }

                /*
                 * Report the contig_hint when it exists, has not been seen
                 * yet (it starts at or after min_start) and does not reach
                 * the end of the block.  A hint that does reach the end is
                 * the right free area; it spills into the next block and is
                 * picked up by the carry-over path above.
                 */
                *bits = md->contig_hint;
                if (*bits && md->contig_hint_start >= min_start &&
                    *bits + md->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
                        *bit_off = pcpu_block_off_to_off(blk,
                                        md->contig_hint_start);
                        return;
                }

                /* only the starting block is constrained by the entry offset */
                min_start = 0;

                /* begin a candidate area from this block's trailing free bits */
                *bits = md->right_free;
                *bit_off = (blk + 1) * PCPU_BITMAP_BLOCK_BITS - md->right_free;
        }
}

/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 *
 * On entry *@bit_off is the chunk offset to resume from; the incoming value
 * of *@bits is ignored and reset to 0.  When no candidate is found, *@bit_off
 * is set to pcpu_chunk_map_bits(@chunk), which is the termination condition
 * of pcpu_for_each_fit_region.
 */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
                                 int align, int *bit_off, int *bits)
{
        int i = pcpu_off_to_block_index(*bit_off);
        int block_off = pcpu_off_to_block_off(*bit_off);
        struct pcpu_block_md *block;

        *bits = 0;
        for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
             block++, i++) {
                /*
                 * handles contig area across blocks: *bits carries the
                 * aligned free tail of the previous block.  If adding this
                 * block's left_free is still not enough and the block is not
                 * entirely free, fall through and examine this block's own
                 * contig_hint below.
                 */
                if (*bits) {
                        *bits += block->left_free;
                        if (*bits >= alloc_bits)
                                return;
                        if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
                                continue;
                }

                /* check block->contig_hint */
                /* *bits temporarily holds the padding needed to honor align */
                *bits = ALIGN(block->contig_hint_start, align) -
                        block->contig_hint_start;
                /*
                 * This uses the block offset to determine if this has been
                 * checked in the prior iteration.
                 */
                if (block->contig_hint &&
                    block->contig_hint_start >= block_off &&
                    block->contig_hint >= *bits + alloc_bits) {
                        int start = pcpu_next_hint(block, alloc_bits);

                        /*
                         * Report the span from the chosen scan start up to the
                         * end of the aligned request inside the contig_hint.
                         */
                        *bits += alloc_bits + block->contig_hint_start -
                                 start;
                        *bit_off = pcpu_block_off_to_off(i, start);
                        return;
                }
                /* reset to satisfy the second predicate above */
                block_off = 0;

                /*
                 * Try the free tail of the block: align its start, measure
                 * what is left up to the block end, then convert the start to
                 * a chunk offset.  A too-small tail is kept in *bits so the
                 * next iteration can extend it across the block boundary.
                 */
                *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
                                 align);
                *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
                *bit_off = pcpu_block_off_to_off(i, *bit_off);
                if (*bits >= alloc_bits)
                        return;
        }

        /* no valid offsets were found - fail condition */
        *bit_off = pcpu_chunk_map_bits(chunk);
}

/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.
 *
 * @bit_off and @bits must be int lvalues: their addresses are handed to the
 * helper functions and both are expanded several times.  @bit_off must hold
 * the offset to start from before the loop is entered.  Both loops stop once
 * @bit_off reaches pcpu_chunk_map_bits(@chunk).
 *
 * pcpu_for_each_md_free_region steps one bit past the end of the area just
 * visited before searching again (presumably because the bit directly after
 * a reported area is known not to start a new one - NOTE(review): confirm),
 * while pcpu_for_each_fit_region resumes exactly at the end of the area.
 */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)                \
        for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));        \
             (bit_off) < pcpu_chunk_map_bits((chunk));                        \
             (bit_off) += (bits) + 1,                                        \
             pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))

#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
        for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
                                  &(bits));                                      \
             (bit_off) < pcpu_chunk_map_bits((chunk));                              \
             (bit_off) += (bits),                                              \
             pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
                                  &(bits)))

/**
 * pcpu_mem_zalloc - allocate zeroed memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  Requests of up to PAGE_SIZE bytes are served by
 * kzalloc(); anything larger goes through __vmalloc() with __GFP_ZERO added.
 * This is to facilitate passing through whitelisted flags.  The returned
 * memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure (including the
 * case where slab is not available yet, which also triggers a one-time
 * warning).
 */
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
        if (WARN_ON_ONCE(!slab_is_available()))
                return NULL;

        /* large requests are backed by vmalloc space */
        if (size > PAGE_SIZE)
                return __vmalloc(size, gfp | __GFP_ZERO);

        return kzalloc(size, gfp);
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 *
 * pcpu_mem_zalloc() returns either kzalloc() or __vmalloc() memory depending
 * on the requested size, hence the release through kvfree().
 */
static void pcpu_mem_free(void *ptr)
{
        kvfree(ptr);
}

/*
 * __pcpu_chunk_move - move a chunk onto the list of a given slot
 * @chunk: chunk of interest
 * @slot: index into pcpu_chunk_lists of the destination list
 * @move_front: true to insert at the head of the list, false for the tail
 *
 * The reserved chunk is never moved.
 */
static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
                              bool move_front)
{
        /* the reserved chunk stays off the slot lists */
        if (chunk == pcpu_reserved_chunk)
                return;

        if (!move_front) {
                list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]);
                return;
        }

        list_move(&chunk->list, &pcpu_chunk_lists[slot]);
}

/*
 * pcpu_chunk_move - move a chunk to the front of a slot's list
 * @chunk: chunk of interest
 * @slot: index of the destination slot
 *
 * Convenience wrapper around __pcpu_chunk_move() with move_front set, so
 * the reserved chunk is left where it is.
 */
static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
{
        __pcpu_chunk_move(chunk, slot, true);
}

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the slot the chunk was on before its state changed
 *
 * Called after an allocation or free changed @chunk.  The slot matching the
 * new state is computed and, if it differs from @oslot, @chunk is moved
 * there: to the front of the list when moving to a higher slot, to the tail
 * otherwise.  Isolated chunks are left in place, and the reserved chunk is
 * never put on chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
        int nslot = pcpu_chunk_slot(chunk);

        /* isolated chunks stay put, and so do chunks whose slot is unchanged */
        if (chunk->isolated || oslot == nslot)
                return;

        __pcpu_chunk_move(chunk, nslot, oslot < nslot);
}

/*
 * pcpu_isolate_chunk - park a chunk on the to-depopulate list
 * @chunk: chunk of interest
 *
 * Marks @chunk isolated and removes its empty populated pages from the
 * global pcpu_nr_empty_pop_pages count (only on the transition to isolated),
 * then moves it to the pcpu_to_depopulate_slot list.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
{
        lockdep_assert_held(&pcpu_lock);

        /* drop the chunk's pages from the global count exactly once */
        if (!chunk->isolated) {
                pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
                chunk->isolated = true;
        }

        list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
}

/*
 * pcpu_reintegrate_chunk - return an isolated chunk to the slot lists
 * @chunk: chunk of interest
 *
 * Undoes pcpu_isolate_chunk(): clears the isolated flag, adds the chunk's
 * empty populated pages back to pcpu_nr_empty_pop_pages and relocates the
 * chunk to the slot matching its current state.  Does nothing for a chunk
 * that is not isolated.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
{
        lockdep_assert_held(&pcpu_lock);

        if (!chunk->isolated)
                return;

        /* the flag must be cleared first: relocation skips isolated chunks */
        chunk->isolated = false;
        pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
        pcpu_chunk_relocate(chunk, -1);
}

/*
 * pcpu_update_empty_pages - update empty page counters
 * @chunk: chunk of interest
 * @nr: signed change in the number of empty pages
 *
 * Tracks empty pages on the premise that a md_block covers a page.  The hint
 * update functions recognize when a block becomes full or is broken up and
 * pass the resulting delta here.  The per-chunk counter is always adjusted;
 * the global counter is only adjusted for chunks that are neither the
 * reserved chunk nor isolated.
 */
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
        chunk->nr_empty_pop_pages += nr;

        /* reserved and isolated chunks do not count towards the global total */
        if (chunk == pcpu_reserved_chunk || chunk->isolated)
                return;

        pcpu_nr_empty_pop_pages += nr;
}

/*
 * pcpu_region_overlap - determines if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * Used to decide whether the hint region [a, b) intersects the allocated
 * region [x, y).
 *
 * RETURNS:
 * true when each region starts before the other one ends, false otherwise.
 */
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
        /* the first region starts at or after the end of the second */
        if (a >= y)
                return false;

        /* the second region has to start before the first one ends */
        return x < b;
}

/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.
 *
 * @start and @end are compared directly against the offsets stored in
 * @block (first_free, contig_hint_start, scan_hint_start, nr_bits), so they
 * must be expressed in the same coordinate space as those fields.  The
 * function is also used on a chunk's chunk_md, in which case the offsets are
 * chunk offsets.
 */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
        int contig = end - start;

        /* a free area can only pull first_free towards the front */
        block->first_free = min(block->first_free, start);
        /* the area touches the left edge */
        if (start == 0)
                block->left_free = contig;

        /* the area touches the right edge */
        if (end == block->nr_bits)
                block->right_free = contig;

        if (contig > block->contig_hint) {
                /* promote the old contig_hint to be the new scan_hint */
                if (start > block->contig_hint_start) {
                        if (block->contig_hint > block->scan_hint) {
                                block->scan_hint_start =
                                        block->contig_hint_start;
                                block->scan_hint = block->contig_hint;
                        } else if (start < block->scan_hint_start) {
                                /*
                                 * The old contig_hint == scan_hint.  But, the
                                 * new contig is larger so hold the invariant
                                 * scan_hint_start < contig_hint_start.
                                 */
                                block->scan_hint = 0;
                        }
                } else {
                        /* the new contig_hint sits in front of the old one */
                        block->scan_hint = 0;
                }
                block->contig_hint_start = start;
                block->contig_hint = contig;
        } else if (contig == block->contig_hint) {
                /*
                 * Same size as the contig_hint.  __ffs() gives the index of
                 * the lowest set bit, so a larger value means a more aligned
                 * offset; offset 0 is treated as the best alignment and is
                 * never passed to __ffs().
                 */
                if (block->contig_hint_start &&
                    (!start ||
                     __ffs(start) > __ffs(block->contig_hint_start))) {
                        /* start has a better alignment so use it */
                        block->contig_hint_start = start;
                        if (start < block->scan_hint_start &&
                            block->contig_hint > block->scan_hint)
                                block->scan_hint = 0;
                } else if (start > block->scan_hint_start ||
                           block->contig_hint > block->scan_hint) {
                        /*
                         * Knowing contig == contig_hint, update the scan_hint
                         * if it is farther than or larger than the current
                         * scan_hint.
                         */
                        block->scan_hint_start = start;
                        block->scan_hint = contig;
                }
        } else {
                /*
                 * The region is smaller than the contig_hint.  So only update
                 * the scan_hint if it is larger than or equal and farther than
                 * the current scan_hint.
                 */
                if ((start < block->contig_hint_start &&
                     (contig > block->scan_hint ||
                      (contig == block->scan_hint &&
                       start > block->scan_hint_start)))) {
                        block->scan_hint_start = start;
                        block->scan_hint = contig;
                }
        }
}

/*
 * pcpu_block_update_scan - update a block given a free area from a scan
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * The final allocation spot is found in two steps: pcpu_find_block_fit()
 * picks a block that can hold the allocation and pcpu_alloc_area() then
 * scans for the exact position.  Allocations with specific alignments can
 * leave holes behind that neither the alloc nor the free path would notice.
 *
 * This takes such a hole and feeds it to the block metadata as it may change
 * the scan_hint.  The start of the hole is extended backwards to the previous
 * allocated bit so that free bits skipped for alignment are included.  Holes
 * that extend past the end of their block are ignored.
 */
static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
                                   int bits)
{
        int start = pcpu_off_to_block_off(bit_off);
        int end = start + bits;
        int index, last_set;

        /* only areas contained in a single block are handled */
        if (end > PCPU_BITMAP_BLOCK_BITS)
                return;

        index = pcpu_off_to_block_index(bit_off);

        /*
         * Walk back to the last allocated bit in front of the area.
         * find_last_bit() hands back the size it was given when no bit is
         * set, in which case the block is free from offset 0.
         */
        last_set = find_last_bit(pcpu_index_alloc_map(chunk, index), start);
        if (last_set == start)
                start = 0;
        else
                start = last_set + 1;

        pcpu_block_update(chunk->md_blocks + index, start, end);
}

/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 * @full_scan: if we should scan from the beginning
 *
 * Iterates over the metadata blocks to find the largest contig area.
 * A full scan can be avoided on the allocation path as this is triggered
 * if we broke the contig_hint.  In doing so, the scan_hint will be before
 * the contig_hint or after if the scan_hint == contig_hint.  This cannot
 * be prevented on freeing as we want to find the largest area possibly
 * spanning blocks.
 */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int bit_off, bits;

        /* promote scan_hint to contig_hint */
        if (!full_scan && chunk_md->scan_hint) {
                bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
                chunk_md->contig_hint_start = chunk_md->scan_hint_start;
                chunk_md->contig_hint = chunk_md->scan_hint;
                chunk_md->scan_hint = 0;
        } else {
                bit_off = chunk_md->first_free;
                chunk_md->contig_hint = 0;
        }

        bits = 0;
        pcpu_for_each_md_free_region(chunk, bit_off, bits)
                pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}

/**
 * pcpu_block_refresh_hint
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Scans over the block beginning at first_free and updates the block
 * metadata accordingly.
 */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
        struct pcpu_block_md *block = chunk->md_blocks + index;
        unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
        unsigned int start, end;        /* region start, region end */

        /* promote scan_hint to contig_hint */
        if (block->scan_hint) {
                start = block->scan_hint_start + block->scan_hint;
                block->contig_hint_start = block->scan_hint_start;
                block->contig_hint = block->scan_hint;
                block->scan_hint = 0;
        } else {
                start = block->first_free;
                block->contig_hint = 0;
        }

        block->right_free = 0;

        /* iterate over free areas and update the contig hints */
        for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
                pcpu_block_update(block, start, end);
}

/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
 *
 * The allocated range [@bit_off, @bit_off + @bits) may span several blocks:
 * the first block (s_block), the last block (e_block) and the blocks in
 * between are each handled separately below.  The empty page counters are
 * decreased for every block that was entirely free before the allocation.
 */
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
                                         int bits)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int nr_empty_pages = 0;
        struct pcpu_block_md *s_block, *e_block, *block;
        int s_index, e_index;        /* block indexes of the allocation */
        int s_off, e_off;        /* block offsets of the allocation */

        /*
         * Calculate per block offsets.
         * The calculation uses an inclusive range, but the resulting offsets
         * are [start, end).  e_index always points to the last block in the
         * range.
         */
        s_index = pcpu_off_to_block_index(bit_off);
        e_index = pcpu_off_to_block_index(bit_off + bits - 1);
        s_off = pcpu_off_to_block_off(bit_off);
        e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

        s_block = chunk->md_blocks + s_index;
        e_block = chunk->md_blocks + e_index;

        /*
         * Update s_block.
         */
        /* the block was entirely free, so an empty page is being consumed */
        if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
                nr_empty_pages++;

        /*
         * block->first_free must be updated if the allocation takes its place.
         * If the allocation breaks the contig_hint, a scan is required to
         * restore this hint.
         */
        if (s_off == s_block->first_free)
                s_block->first_free = find_next_zero_bit(
                                        pcpu_index_alloc_map(chunk, s_index),
                                        PCPU_BITMAP_BLOCK_BITS,
                                        s_off + bits);

        /* the allocation lands on the scan_hint area - invalidate it */
        if (pcpu_region_overlap(s_block->scan_hint_start,
                                s_block->scan_hint_start + s_block->scan_hint,
                                s_off,
                                s_off + bits))
                s_block->scan_hint = 0;

        if (pcpu_region_overlap(s_block->contig_hint_start,
                                s_block->contig_hint_start +
                                s_block->contig_hint,
                                s_off,
                                s_off + bits)) {
                /* block contig hint is broken - scan to fix it */
                /* the refresh does not reset left_free, so clear it here */
                if (!s_off)
                        s_block->left_free = 0;
                pcpu_block_refresh_hint(chunk, s_index);
        } else {
                /* update left and right contig manually */
                s_block->left_free = min(s_block->left_free, s_off);
                if (s_index == e_index)
                        s_block->right_free = min_t(int, s_block->right_free,
                                        PCPU_BITMAP_BLOCK_BITS - e_off);
                else
                        s_block->right_free = 0;
        }

        /*
         * Update e_block.
         */
        if (s_index != e_index) {
                if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
                        nr_empty_pages++;

                /*
                 * When the allocation is across blocks, the end is along
                 * the left part of the e_block.
                 */
                e_block->first_free = find_next_zero_bit(
                                pcpu_index_alloc_map(chunk, e_index),
                                PCPU_BITMAP_BLOCK_BITS, e_off);

                if (e_off == PCPU_BITMAP_BLOCK_BITS) {
                        /*
                         * The allocation fills the last block completely:
                         * advance e_block so that the loop below resets this
                         * block together with the in-between ones.
                         */
                        e_block++;
                } else {
                        /* the allocation runs into the scan_hint area */
                        if (e_off > e_block->scan_hint_start)
                                e_block->scan_hint = 0;

                        e_block->left_free = 0;
                        if (e_off > e_block->contig_hint_start) {
                                /* contig hint is broken - scan to fix it */
                                pcpu_block_refresh_hint(chunk, e_index);
                        } else {
                                e_block->right_free =
                                        min_t(int, e_block->right_free,
                                              PCPU_BITMAP_BLOCK_BITS - e_off);
                        }
                }

                /* update in-between md_blocks */
                nr_empty_pages += (e_index - s_index - 1);
                for (block = s_block + 1; block < e_block; block++) {
                        block->scan_hint = 0;
                        block->contig_hint = 0;
                        block->left_free = 0;
                        block->right_free = 0;
                }
        }

        /*
         * If the allocation is not atomic, some blocks may not be
         * populated with pages, while we account it here.  The number
         * of pages will be added back with pcpu_chunk_populated()
         * when populating pages.
         */
        if (nr_empty_pages)
                pcpu_update_empty_pages(chunk, -nr_empty_pages);

        /* chunk level: the allocation lands on the chunk's scan_hint area */
        if (pcpu_region_overlap(chunk_md->scan_hint_start,
                                chunk_md->scan_hint_start +
                                chunk_md->scan_hint,
                                bit_off,
                                bit_off + bits))
                chunk_md->scan_hint = 0;

        /*
         * The only time a full chunk scan is required is if the chunk
         * contig hint is broken.  Otherwise, it means a smaller space
         * was used and therefore the chunk contig hint is still correct.
         */
        if (pcpu_region_overlap(chunk_md->contig_hint_start,
                                chunk_md->contig_hint_start +
                                chunk_md->contig_hint,
                                bit_off,
                                bit_off + bits))
                pcpu_chunk_refresh_hint(chunk, false);
}

/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block
 * refresh by making use of the block contig hints.  If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk_md->contig_hint.
 * chunk_md->contig_hint may be off by up to a page, but it will never be more
 * than the available space.  If the contig hint is contained in one block, it
 * will be accurate.
 */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
                                        int bits)
{
        int nr_empty_pages = 0;
        struct pcpu_block_md *s_block, *e_block, *block;
        int s_index, e_index;        /* block indexes of the freed allocation */
        int s_off, e_off;        /* block offsets of the freed allocation */
        int start, end;                /* start and end of the whole free area */

        /*
         * Calculate per block offsets.
         * The calculation uses an inclusive range, but the resulting offsets
         * are [start, end).  e_index always points to the last block in the
         * range.
         */
        s_index = pcpu_off_to_block_index(bit_off);
        e_index = pcpu_off_to_block_index(bit_off + bits - 1);
        s_off = pcpu_off_to_block_off(bit_off);
        e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

        s_block = chunk->md_blocks + s_index;
        e_block = chunk->md_blocks + e_index;

        /*
         * Check if the freed area aligns with the block->contig_hint.
         * If it does, then the scan to find the beginning/end of the
         * larger free area can be avoided.
         *
         * start and end refer to beginning and end of the free area
         * within each their respective blocks.  This is not necessarily
         * the entire free area as it may span blocks past the beginning
         * or end of the block.
         */
        start = s_off;
        if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
                start = s_block->contig_hint_start;
        } else {
                /*
                 * Scan backwards to find the extent of the free area.
                 * find_last_bit() returns the size it was given (start here)
                 * when no bit is set below it, which means the block is free
                 * from offset 0 up to the freed area.
                 */
                int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
                                          start);
                start = (start == l_bit) ? 0 : l_bit + 1;
        }

        end = e_off;
        if (e_off == e_block->contig_hint_start)
                end = e_block->contig_hint_start + e_block->contig_hint;
        else
                end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
                                    PCPU_BITMAP_BLOCK_BITS, end);

        /* update s_block */
        e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
        if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
                nr_empty_pages++;
        pcpu_block_update(s_block, start, e_off);

        /* the free spans more than one block */
        if (s_index != e_index) {
                /* update e_block */
                if (end == PCPU_BITMAP_BLOCK_BITS)
                        nr_empty_pages++;
                pcpu_block_update(e_block, 0, end);

                /* reset md_blocks in the middle */
                nr_empty_pages += (e_index - s_index - 1);
                for (block = s_block + 1; block < e_block; block++) {
                        block->first_free = 0;
                        block->scan_hint = 0;
                        block->contig_hint_start = 0;
                        block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
                        block->left_free = PCPU_BITMAP_BLOCK_BITS;
                        block->right_free = PCPU_BITMAP_BLOCK_BITS;
                }
        }

        if (nr_empty_pages)
                pcpu_update_empty_pages(chunk, nr_empty_pages);

        /*
         * Refresh chunk metadata when the free makes a block free or spans
         * across blocks.  The contig_hint may be off by up to a page, but if
         * the contig_hint is contained in a block, it will be accurate with
         * the else condition below.
         *
         * In the else branch the free area lies within a single block
         * (s_index == e_index) and start/end are offsets inside that block.
         * chunk_md works on chunk offsets, so both ends are converted;
         * passing a block-relative end would yield a non-positive size for
         * every block but the first and leave the chunk hints stale.
         */
        if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
                pcpu_chunk_refresh_hint(chunk, true);
        else
                pcpu_block_update(&chunk->chunk_md,
                                  pcpu_block_off_to_off(s_index, start),
                                  pcpu_block_off_to_off(s_index, end));
}

/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * true if every page backing the area is populated; @next_off is left
 * untouched in that case.  Otherwise false, with @next_off set to the chunk
 * offset of the next populated page following the first unpopulated one
 * (or of the end of the checked page range if there is none), so that
 * pcpu_find_block_fit() can skip over the unpopulated pages.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
                              int *next_off)
{
        unsigned int page_start, page_end, hole, resume;

        /* page range backing the area */
        page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
        page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);

        /* look for the first unpopulated page in the range */
        hole = find_next_zero_bit(chunk->populated, page_end, page_start);
        if (hole >= page_end)
                return true;

        /* find where populated pages resume and convert back to an offset */
        resume = find_next_bit(chunk->populated, page_end, hole + 1);
        *next_off = resume * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;

        return false;
}

/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset to begin searching
 * for a free region.  The bitmap metadata blocks are walked to find an
 * offset that is guaranteed to fit the requirements.  This is not quite
 * first fit: a block or chunk whose contig hint cannot hold the allocation
 * is skipped.  It errs on the side of caution to prevent excess iteration,
 * so poor alignment can make the allocator skip blocks and chunks that do
 * have valid free areas.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
                               size_t align, bool pop_only)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int bit_off, bits, next_off;

        /*
         * Optimization: if the request does not fit the chunk-wide hint,
         * assume memory pressure and that a new chunk will be created soon
         * rather than scanning.
         */
        if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
                return -1;

        bit_off = pcpu_next_hint(chunk_md, alloc_bits);
        bits = 0;
        pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
                /* any fitting region will do when population is irrelevant */
                if (!pop_only)
                        break;

                if (pcpu_is_populated(chunk, bit_off, bits, &next_off))
                        break;

                /* resume behind the unpopulated pages reported in next_off */
                bit_off = next_off;
                bits = 0;
        }

        /* the iterator parks bit_off at the end of the map on failure */
        return (bit_off == pcpu_chunk_map_bits(chunk)) ? -1 : bit_off;
}

/*
 * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * Works like bitmap_find_next_zero_area_off() but additionally remembers the
 * largest area that had to be skipped.  This is imperfect, but in general is
 * good enough: the remembered region is the largest failed candidate seen,
 * measured from its aligned start, so bits skipped for alignment are not
 * included.  pcpu_block_update_scan() scans backwards to try and recover
 * what was lost to alignment.  While this can cause scanning to miss earlier
 * possible free areas, smaller allocations will eventually fill those holes.
 *
 * RETURNS:
 * The aligned offset of a run of @nr zero bits, or a value greater than
 * @size when no such run fits before the end of the map.
 */
static unsigned long pcpu_find_zero_area(unsigned long *map,
                                         unsigned long size,
                                         unsigned long start,
                                         unsigned long nr,
                                         unsigned long align_mask,
                                         unsigned long *largest_off,
                                         unsigned long *largest_bits)
{
        unsigned long index, end, i, area_bits;

        for (;;) {
                /* next free bit, rounded up to the requested alignment */
                index = find_next_zero_bit(map, size, start);
                index = __ALIGN_MASK(index, align_mask);

                end = index + nr;
                if (end > size)
                        return end;

                /* success when no allocated bit sits inside the candidate */
                i = find_next_bit(map, end, index);
                if (i >= end)
                        return index;

                /*
                 * Remember the largest unused area; on a tie prefer the
                 * better aligned offset (0 counts as best and is never
                 * passed to __ffs()).
                 */
                area_bits = i - index;
                if (area_bits > *largest_bits ||
                    (area_bits == *largest_bits && *largest_off &&
                     (!index || __ffs(index) > __ffs(*largest_off)))) {
                        *largest_off = index;
                        *largest_bits = area_bits;
                }

                /* retry behind the allocated bit that got in the way */
                start = i + 1;
        }
}

/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * Starting at @start, look for a free run of @alloc_bits bits that honours
 * @align.  The allocation map has to be scanned because, when the request
 * fits within the block's contig hint, @start will be block->first_free;
 * this is an attempt to fill the allocation prior to breaking the contig
 * hint.  The scan is limited to @start + @alloc_bits +
 * PCPU_BITMAP_BLOCK_BITS bits, capped at the size of the chunk's map.  Once
 * a free area is confirmed, the allocation map, boundary map, free byte
 * count, first free bit and hints are updated and the chunk is relocated.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
                           size_t align, int start)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        unsigned long skipped_off = 0, skipped_bits = 0;
        size_t align_mask = 0;
        int bit_off, end, oslot;

        lockdep_assert_held(&pcpu_lock);

        /* an @align of zero means no alignment constraint */
        if (align)
                align_mask = align - 1;

        /* record the current slot; it is handed to the relocate call below */
        oslot = pcpu_chunk_slot(chunk);

        /* search a bounded window of the allocation map for a fit */
        end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
                    pcpu_chunk_map_bits(chunk));
        bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
                                      align_mask, &skipped_off, &skipped_bits);
        if (bit_off >= end)
                return -1;

        /* report the largest free area that the search passed over */
        if (skipped_bits)
                pcpu_block_update_scan(chunk, skipped_off, skipped_bits);

        /* mark the area as in use */
        bitmap_set(chunk->alloc_map, bit_off, alloc_bits);

        /* boundary bits: set at both ends of the area, clear in between */
        set_bit(bit_off, chunk->bound_map);
        bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
        set_bit(bit_off + alloc_bits, chunk->bound_map);

        chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;

        /* the first free bit was just consumed, look for the next one */
        if (chunk_md->first_free == bit_off)
                chunk_md->first_free =
                        find_next_zero_bit(chunk->alloc_map,
                                           pcpu_chunk_map_bits(chunk),
                                           bit_off + alloc_bits);

        pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);

        pcpu_chunk_relocate(chunk, oslot);

        /* convert from allocation units back to a byte offset */
        return bit_off * PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * The length of the allocation is not passed in; it is recovered from the
 * boundary bitmap as the distance from the area's first bit to the next set
 * boundary bit.  The matching bits in the allocation map are cleared and the
 * chunk's free byte count, first free bit and hints are updated before the
 * chunk is relocated.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Number of freed bytes.
 */
static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int first_bit, nr_bits, next_bound, oslot, freed;

        lockdep_assert_held(&pcpu_lock);
        pcpu_stats_area_dealloc(chunk);

        /* record the current slot; it is handed to the relocate call below */
        oslot = pcpu_chunk_slot(chunk);

        /* byte offset -> allocation unit index */
        first_bit = off / PCPU_MIN_ALLOC_SIZE;

        /* the next boundary bit after the start marks the end of the area */
        next_bound = find_next_bit(chunk->bound_map,
                                   pcpu_chunk_map_bits(chunk), first_bit + 1);
        nr_bits = next_bound - first_bit;
        bitmap_clear(chunk->alloc_map, first_bit, nr_bits);

        /* give the bytes back to the chunk */
        freed = nr_bits * PCPU_MIN_ALLOC_SIZE;
        chunk->free_bytes += freed;

        /* the freed area may now hold the lowest free bit */
        if (first_bit < chunk_md->first_free)
                chunk_md->first_free = first_bit;

        pcpu_block_update_hint_free(chunk, first_bit, nr_bits);

        pcpu_chunk_relocate(chunk, oslot);

        return freed;
}

/*
 * Set up @block as describing @nr_bits bits that are all free: the contig
 * hint and both edge free counts span the whole block, the first free bit
 * is bit 0 and there is no scan hint.  Fields not listed here are left
 * untouched.
 */
static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
{
        block->nr_bits = nr_bits;
        block->first_free = 0;

        /* everything is free, so every free-run metric equals the size */
        block->contig_hint = nr_bits;
        block->left_free = nr_bits;
        block->right_free = nr_bits;

        block->scan_hint = 0;
}

/*
 * Initialize all block metadata of @chunk: the chunk-wide descriptor is
 * sized to the whole allocation map, and each entry of chunk->md_blocks is
 * sized to PCPU_BITMAP_BLOCK_BITS.
 */
static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
        int nr_blocks, i;

        /* init the chunk's block */
        pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));

        /* then every per-block descriptor */
        nr_blocks = pcpu_chunk_nr_blocks(chunk);
        for (i = 0; i < nr_blocks; i++)
                pcpu_init_md_block(&chunk->md_blocks[i],
                                   PCPU_BITMAP_BLOCK_BITS);
}

/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is page aligned down of @tmp_addr while the region end is page
 * aligned up.  Offsets are kept track of to determine the region served. All
 * this is done to appease the bitmap allocator in avoiding partial blocks.
 *
 * The chunk and its maps are allocated with memblock_alloc(); any allocation
 * failure panics.  The resulting chunk is marked immutable with every page
 * flagged as populated, and the padding in front of and behind the served
 * region is marked as allocated in the bitmaps so it is never handed out.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
                                                         int map_size)
{
        struct pcpu_chunk *chunk;
        unsigned long aligned_addr;
        int start_offset, offset_bits, region_size, region_bits;
        size_t alloc_size;

        /* region calculations */
        aligned_addr = tmp_addr & PAGE_MASK;

        /* bytes between the page-aligned base and the served region */
        start_offset = tmp_addr - aligned_addr;
        /* total size of the page-aligned region covering the served area */
        region_size = ALIGN(start_offset + map_size, PAGE_SIZE);

        /* allocate chunk, including the trailing populated[] page bitmap */
        alloc_size = struct_size(chunk, populated,
                                 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
        chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        INIT_LIST_HEAD(&chunk->list);

        chunk->base_addr = (void *)aligned_addr;
        chunk->start_offset = start_offset;
        /* bytes between the end of the served region and the region end */
        chunk->end_offset = region_size - chunk->start_offset - map_size;

        /* nr_pages must be set before pcpu_chunk_map_bits() is used */
        chunk->nr_pages = region_size >> PAGE_SHIFT;
        region_bits = pcpu_chunk_map_bits(chunk);

        /* allocation map: region_bits bits */
        alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
        chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk->alloc_map)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        /* boundary map: one bit more, bit region_bits is set further down */
        alloc_size =
                BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
        chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk->bound_map)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        /* one metadata descriptor per bitmap block */
        alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
        chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!chunk->md_blocks)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

#ifdef NEED_PCPUOBJ_EXT
        /* first chunk is free to use */
        chunk->obj_exts = NULL;
#endif
        pcpu_init_md_blocks(chunk);

        /* manage populated page bitmap: every page counts as populated */
        chunk->immutable = true;
        bitmap_fill(chunk->populated, chunk->nr_pages);
        chunk->nr_populated = chunk->nr_pages;
        chunk->nr_empty_pop_pages = chunk->nr_pages;

        /* only the served region counts as free, not the padding */
        chunk->free_bytes = map_size;

        if (chunk->start_offset) {
                /* hide the beginning of the bitmap */
                offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
                bitmap_set(chunk->alloc_map, 0, offset_bits);
                set_bit(0, chunk->bound_map);
                set_bit(offset_bits, chunk->bound_map);

                /* the first usable bit comes right after the hidden prefix */
                chunk->chunk_md.first_free = offset_bits;

                pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
        }

        if (chunk->end_offset) {
                /* hide the end of the bitmap */
                offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
                bitmap_set(chunk->alloc_map,
                           pcpu_chunk_map_bits(chunk) - offset_bits,
                           offset_bits);
                /* boundary bits at the start and end of the hidden suffix */
                set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
                        chunk->bound_map);
                set_bit(region_bits, chunk->bound_map);

                pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
                                             - offset_bits, offset_bits);
        }

        return chunk;
}

/*
 * Allocate a chunk descriptor covering pcpu_unit_pages pages together with
 * its allocation map, boundary map and block metadata (plus the obj_exts
 * array when NEED_PCPUOBJ_EXT is defined and need_pcpuobj_ext() says so).
 * All allocations use @gfp.  The block metadata is initialized and the whole
 * chunk is accounted as free.
 *
 * Returns the new chunk, or NULL if any allocation failed, in which case
 * everything allocated up to that point is released again.
 */
static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
        struct pcpu_chunk *chunk;
        int map_bits;

        chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
        if (!chunk)
                return NULL;

        INIT_LIST_HEAD(&chunk->list);
        chunk->nr_pages = pcpu_unit_pages;
        map_bits = pcpu_chunk_map_bits(chunk);

        chunk->alloc_map = pcpu_mem_zalloc(sizeof(chunk->alloc_map[0]) *
                                           BITS_TO_LONGS(map_bits), gfp);
        if (!chunk->alloc_map)
                goto err_free_chunk;

        /* the boundary map holds one bit more than the allocation map */
        chunk->bound_map = pcpu_mem_zalloc(sizeof(chunk->bound_map[0]) *
                                           BITS_TO_LONGS(map_bits + 1), gfp);
        if (!chunk->bound_map)
                goto err_free_alloc_map;

        chunk->md_blocks = pcpu_mem_zalloc(sizeof(chunk->md_blocks[0]) *
                                           pcpu_chunk_nr_blocks(chunk), gfp);
        if (!chunk->md_blocks)
                goto err_free_bound_map;

#ifdef NEED_PCPUOBJ_EXT
        /* one extension slot per allocation unit, only when required */
        if (need_pcpuobj_ext()) {
                chunk->obj_exts =
                        pcpu_mem_zalloc(map_bits * sizeof(struct pcpuobj_ext),
                                        gfp);
                if (!chunk->obj_exts)
                        goto err_free_md_blocks;
        }
#endif

        pcpu_init_md_blocks(chunk);

        /* a fresh chunk is entirely free */
        chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;

        return chunk;

        /* unwind in reverse order of allocation */
#ifdef NEED_PCPUOBJ_EXT
err_free_md_blocks:
        pcpu_mem_free(chunk->md_blocks);
#endif
err_free_bound_map:
        pcpu_mem_free(chunk->bound_map);
err_free_alloc_map:
        pcpu_mem_free(chunk->alloc_map);
err_free_chunk:
        pcpu_mem_free(chunk);

        return NULL;
}

/*
 * Release the metadata arrays hanging off @chunk and then the chunk
 * descriptor itself.  A NULL @chunk is accepted and ignored.
 */
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
        if (chunk) {
#ifdef NEED_PCPUOBJ_EXT
                pcpu_mem_free(chunk->obj_exts);
#endif
                pcpu_mem_free(chunk->md_blocks);
                pcpu_mem_free(chunk->bound_map);
                pcpu_mem_free(chunk->alloc_map);

                /* the descriptor goes last, its members were read above */
                pcpu_mem_free(chunk);
        }
}

/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Record that pages [@page_start,@page_end) of @chunk are now populated:
 * the range is set in the chunk's populated bitmap, and the per-chunk,
 * global and empty-populated page counts grow by the size of the range.
 * Must be called after each successful population.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
                                 int page_end)
{
        int nr_pages = page_end - page_start;

        lockdep_assert_held(&pcpu_lock);

        /* flag the pages in the chunk's bitmap */
        bitmap_set(chunk->populated, page_start, nr_pages);

        /* global and per-chunk totals */
        pcpu_nr_populated += nr_pages;
        chunk->nr_populated += nr_pages;

        pcpu_update_empty_pages(chunk, nr_pages);
}

/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Record that pages [@page_start,@page_end) of @chunk are no longer
 * populated: the range is cleared in the chunk's populated bitmap, and the
 * per-chunk, global and empty-populated page counts shrink by the size of
 * the range.  Must be called after each successful depopulation.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
                                   int page_start, int page_end)
{
        int nr_pages = page_end - page_start;

        lockdep_assert_held(&pcpu_lock);

        /* drop the pages from the chunk's bitmap */
        bitmap_clear(chunk->populated, page_start, nr_pages);

        /* global and per-chunk totals */
        pcpu_nr_populated -= nr_pages;
        chunk->nr_populated -= nr_pages;

        pcpu_update_empty_pages(chunk, -nr_pages);
}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk                - populate the specified range of a chunk
 * pcpu_depopulate_chunk        - depopulate the specified range of a chunk
 * pcpu_post_unmap_tlb_flush        - flush tlb for the specified range of a chunk
 * pcpu_create_chunk                - create a new chunk
 * pcpu_destroy_chunk                - destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page                - translate address to its backing struct page
 * pcpu_verify_alloc_info        - check alloc_info is acceptable during init
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
                               int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
                                  int page_start, int page_end);
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
                                      int page_start, int page_end);
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
#endif

/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * Internal lookup that handles everything except static allocations; static
 * percpu address values should never be passed into the allocator.  The
 * first chunk and the reserved chunk are checked directly, any other
 * address is resolved through the page backing it.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
        void *unit_addr;

        /* dynamic region, served by the first chunk */
        if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
                return pcpu_first_chunk;

        /* reserved region */
        if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
                return pcpu_reserved_chunk;

        /*
         * @addr is relative to unit0, which might be unused and thus
         * unmapped.  Shift it into the unit space of the current processor
         * before looking it up in the vmalloc space.  Any possible cpu id
         * works here, so preemption and cpu hotplug are of no concern.
         */
        unit_addr = addr + pcpu_unit_offsets[raw_smp_processor_id()];

        return pcpu_get_page_chunk(pcpu_addr_to_page(unit_addr));
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * Charge the full per-object size for a @size byte allocation to the
 * current obj_cgroup before the allocation is attempted.
 *
 * Nothing is charged, and true is returned with *@objcgp untouched, when
 * memcg kmem accounting is not online, @gfp lacks __GFP_ACCOUNT, or there
 * is no current obj_cgroup.  Returns false if the charge fails; otherwise
 * stores the charged obj_cgroup in *@objcgp and returns true.
 */
static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
                                      struct obj_cgroup **objcgp)
{
        struct obj_cgroup *cg;

        /* accounting is off or was not requested for this allocation */
        if (!memcg_kmem_online())
                return true;
        if (!(gfp & __GFP_ACCOUNT))
                return true;

        /* no obj_cgroup to charge against */
        cg = current_obj_cgroup();
        if (!cg)
                return true;

        /* a non-zero result from the charge vetoes the allocation */
        if (obj_cgroup_charge(cg, gfp, pcpu_obj_full_size(size)))
                return false;

        *objcgp = cg;
        return true;
}

/*
 * Settle a charge after the allocation attempt.  Does nothing when @objcg
 * is NULL.
 *
 * When @chunk is non-NULL and has an obj_exts array, a reference on @objcg
 * is taken and stored in the slot for @off, and MEMCG_PERCPU_B is increased
 * by the full per-object size.  Otherwise (including the failure case where
 * callers pass a NULL @chunk) the charge is given back.
 */
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
                                       struct pcpu_chunk *chunk, int off,
                                       size_t size)
{
        if (!objcg)
                return;

        /* nowhere to record the owner: return the charge */
        if (!likely(chunk && chunk->obj_exts)) {
                obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
                return;
        }

        /* remember the owner; the stored pointer holds its own reference */
        obj_cgroup_get(objcg);
        chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg;

        rcu_read_lock();
        mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
                        pcpu_obj_full_size(size));
        rcu_read_unlock();
}

/*
 * Undo the memcg accounting of the allocation at @off in @chunk.  Does
 * nothing if the chunk has no obj_exts array or no obj_cgroup is recorded
 * for that offset.  Otherwise the slot is cleared, the charge is returned,
 * MEMCG_PERCPU_B is decreased by the full per-object size and the reference
 * on the obj_cgroup is dropped.
 */
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
        struct obj_cgroup *objcg;
        int idx;

        if (unlikely(!chunk->obj_exts))
                return;

        /* one slot per allocation unit, keyed by the area's first unit */
        idx = off >> PCPU_MIN_ALLOC_SHIFT;

        objcg = chunk->obj_exts[idx].cgroup;
        if (!objcg)
                return;
        chunk->obj_exts[idx].cgroup = NULL;

        obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));

        rcu_read_lock();
        mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
                        -pcpu_obj_full_size(size));
        rcu_read_unlock();

        /* drop the reference that was held through the slot */
        obj_cgroup_put(objcg);
}

#else /* CONFIG_MEMCG_KMEM */
/*
 * Stub used when CONFIG_MEMCG_KMEM is not set: nothing is charged, the
 * allocation is always allowed and *@objcgp is left untouched.
 */
static bool
pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
{
        return true;
}

/* Stub used when CONFIG_MEMCG_KMEM is not set: there is nothing to settle. */
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
                                       struct pcpu_chunk *chunk, int off,
                                       size_t size)
{
}

/* Stub used when CONFIG_MEMCG_KMEM is not set: there is nothing to undo. */
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
}
#endif /* CONFIG_MEMCG_KMEM */

#ifdef CONFIG_MEM_ALLOC_PROFILING
/*
 * Account a @size byte allocation at @off in @chunk to the current task's
 * allocation tag.  Does nothing unless memory allocation profiling is
 * enabled and the chunk has an obj_exts array.
 */
static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
                                      size_t size)
{
        if (!mem_alloc_profiling_enabled())
                return;

        /* chunks without an obj_exts array have no slot for the tag */
        if (!likely(chunk->obj_exts))
                return;

        alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag,
                      current->alloc_tag, size);
}

/*
 * Remove @size bytes from the allocation tag recorded for @off in @chunk.
 * Does nothing unless memory allocation profiling is enabled and the chunk
 * has an obj_exts array.
 */
static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
        if (!mem_alloc_profiling_enabled())
                return;

        /* chunks without an obj_exts array carry no tag to adjust */
        if (!likely(chunk->obj_exts))
                return;

        alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size);
}
#else
/* Stub used when CONFIG_MEM_ALLOC_PROFILING is not set: nothing to account. */
static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
                                      size_t size)
{
}

/* Stub used when CONFIG_MEM_ALLOC_PROFILING is not set: nothing to undo. */
static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
}
#endif

/**
 * pcpu_alloc_noprof - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
 * then no warning will be triggered on invalid or failed allocation
 * requests.
 *
 * @size is rounded up to a multiple of PCPU_MIN_ALLOC_SIZE and @align is
 * raised to at least PCPU_MIN_ALLOC_SIZE.  Non-atomic allocations take
 * pcpu_alloc_mutex, may create a new chunk and populate the pages backing
 * the area; atomic allocations only search for areas that are already
 * populated.  The returned area is zeroed for every possible CPU.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
                                 gfp_t gfp)
{
        gfp_t pcpu_gfp;
        bool is_atomic;
        bool do_warn;
        struct obj_cgroup *objcg = NULL;
        /* number of failure warnings still allowed to be printed */
        static int warn_limit = 10;
        struct pcpu_chunk *chunk, *next;
        const char *err;
        int slot, off, cpu, ret;
        unsigned long flags;
        void __percpu *ptr;
        size_t bits, bit_align;

        gfp = current_gfp_context(gfp);
        /* whitelisted flags that can be passed to the backing allocators */
        pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
        /* anything short of the full GFP_KERNEL mask is treated as atomic */
        is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
        do_warn = !(gfp & __GFP_NOWARN);

        /*
         * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
         * therefore alignment must be a minimum of that many bytes.
         * An allocation may have internal fragmentation from rounding up
         * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
         */
        if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
                align = PCPU_MIN_ALLOC_SIZE;

        /* express size and alignment in allocation units (bitmap bits) */
        size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
        bits = size >> PCPU_MIN_ALLOC_SHIFT;
        bit_align = align >> PCPU_MIN_ALLOC_SHIFT;

        /* reject empty, oversized or badly aligned requests up front */
        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
                     !is_power_of_2(align))) {
                WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
                     size, align);
                return NULL;
        }

        /* charge the memcg first; objcg stays NULL if nothing was charged */
        if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
                return NULL;

        if (!is_atomic) {
                /*
                 * pcpu_balance_workfn() allocates memory under this mutex,
                 * and it may wait for memory reclaim. Allow current task
                 * to become OOM victim, in case of memory pressure.
                 */
                if (gfp & __GFP_NOFAIL) {
                        mutex_lock(&pcpu_alloc_mutex);
                } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
                        /* a NULL chunk makes the hook return the charge */
                        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
                        return NULL;
                }
        }

        spin_lock_irqsave(&pcpu_lock, flags);

        /* serve reserved allocations from the reserved chunk if available */
        if (reserved && pcpu_reserved_chunk) {
                chunk = pcpu_reserved_chunk;

                /* atomic requests only accept already populated areas */
                off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
                if (off < 0) {
                        err = "alloc from reserved chunk failed";
                        goto fail_unlock;
                }

                off = pcpu_alloc_area(chunk, bits, bit_align, off);
                if (off >= 0)
                        goto area_found;

                err = "alloc from reserved chunk failed";
                goto fail_unlock;
        }

restart:
        /* search through normal chunks */
        for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
                list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
                                         list) {
                        off = pcpu_find_block_fit(chunk, bits, bit_align,
                                                  is_atomic);
                        if (off < 0) {
                                /* no fit: chunks in low slots move to slot 0 */
                                if (slot < PCPU_SLOT_FAIL_THRESHOLD)
                                        pcpu_chunk_move(chunk, 0);
                                continue;
                        }

                        off = pcpu_alloc_area(chunk, bits, bit_align, off);
                        if (off >= 0) {
                                pcpu_reintegrate_chunk(chunk);
                                goto area_found;
                        }
                }
        }

        /* nothing found; pcpu_lock is dropped before creating a chunk */
        spin_unlock_irqrestore(&pcpu_lock, flags);

        /* atomic requests give up here instead of creating a chunk */
        if (is_atomic) {
                err = "atomic alloc failed, no space left";
                goto fail;
        }

        /* No space left.  Create a new chunk. */
        if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
                chunk = pcpu_create_chunk(pcpu_gfp);
                if (!chunk) {
                        err = "failed to allocate new chunk";
                        goto fail;
                }

                /* link the new chunk into the slot lists under pcpu_lock */
                spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_chunk_relocate(chunk, -1);
        } else {
                /* the free slot is not empty after all, just search again */
                spin_lock_irqsave(&pcpu_lock, flags);
        }

        goto restart;

area_found:
        pcpu_stats_area_alloc(chunk, size);
        spin_unlock_irqrestore(&pcpu_lock, flags);

        /* populate if not all pages are already there */
        if (!is_atomic) {
                unsigned int page_end, rs, re;

                /* page range covering the area */
                rs = PFN_DOWN(off);
                page_end = PFN_UP(off + size);

                /* visit every unpopulated page range within the area */
                for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) {
                        WARN_ON(chunk->immutable);

                        /* population itself runs without pcpu_lock held */
                        ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);

                        spin_lock_irqsave(&pcpu_lock, flags);
                        if (ret) {
                                /* give the area back, fail with the lock held */
                                pcpu_free_area(chunk, off);
                                err = "failed to populate";
                                goto fail_unlock;
                        }
                        pcpu_chunk_populated(chunk, rs, re);
                        spin_unlock_irqrestore(&pcpu_lock, flags);
                }

                mutex_unlock(&pcpu_alloc_mutex);
        }

        /* running low on empty populated pages: kick the balance work */
        if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
                pcpu_schedule_balance_work();

        /* clear the areas and return address relative to base address */
        for_each_possible_cpu(cpu)
                memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

        ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
        kmemleak_alloc_percpu(ptr, size, gfp);

        trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
                                  chunk->base_addr, off, ptr,
                                  pcpu_obj_full_size(size), gfp);

        /* pass the chunk and offset of the new area to the memcg hook */
        pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);

        pcpu_alloc_tag_alloc_hook(chunk, off, size);

        return ptr;

fail_unlock:
        spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
        trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);

        /* warn only while the static warning budget lasts */
        if (do_warn && warn_limit) {
                pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
                        size, align, is_atomic, err);
                if (!is_atomic)
                        dump_stack();
                if (!--warn_limit)
                        pr_info("limit reached, disable warning\n");
        }

        if (is_atomic) {
                /* see the flag handling in pcpu_balance_workfn() */
                pcpu_atomic_alloc_failed = true;
                pcpu_schedule_balance_work();
        } else {
                /* the mutex was only taken on the non-atomic path */
                mutex_unlock(&pcpu_alloc_mutex);
        }

        /* a NULL chunk makes the hook return the charge */
        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);

        return NULL;
}
EXPORT_SYMBOL_GPL(pcpu_alloc_noprof);

/**
 * pcpu_balance_free - manage the amount of free chunks
 * @empty_only: free chunks only if there are no populated pages
 *
 * If empty_only is %false, reclaim all fully free chunks regardless of the
 * number of populated pages.  Otherwise, only reclaim chunks that have no
 * populated pages.
 *
 * The first chunk on the free slot list is always kept.  Selected chunks are
 * moved to a private list, then depopulated and destroyed with pcpu_lock
 * dropped; the lock is held again when the function returns.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_free(bool empty_only)
{
        /* chunks picked for destruction, private to this call */
        LIST_HEAD(to_free);
        struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
        struct pcpu_chunk *chunk, *next;

        lockdep_assert_held(&pcpu_lock);

        /*
         * There's no reason to keep around multiple unused chunks and VM
         * areas can be scarce.  Destroy all free chunks except for one.
         */
        list_for_each_entry_safe(chunk, next, free_head, list) {
                WARN_ON(chunk->immutable);

                /* spare the first one */
                if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
                        continue;

                /* with @empty_only, chunks with empty populated pages stay */
                if (!empty_only || chunk->nr_empty_pop_pages == 0)
                        list_move(&chunk->list, &to_free);
        }

        /* nothing selected: return with pcpu_lock still held */
        if (list_empty(&to_free))
                return;

        /* depopulate and destroy without pcpu_lock held */
        spin_unlock_irq(&pcpu_lock);
        list_for_each_entry_safe(chunk, next, &to_free, list) {
                unsigned int rs, re;

                /* release every populated page range of the chunk */
                for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
                        pcpu_depopulate_chunk(chunk, rs, re);
                        /* the bookkeeping requires pcpu_lock */
                        spin_lock_irq(&pcpu_lock);
                        pcpu_chunk_depopulated(chunk, rs, re);
                        spin_unlock_irq(&pcpu_lock);
                }
                pcpu_destroy_chunk(chunk);
                cond_resched();
        }
        /* re-take pcpu_lock for the caller */
        spin_lock_irq(&pcpu_lock);
}

/**
 * pcpu_balance_populated - manage the amount of populated pages
 *
 * Maintain a certain amount of populated pages to satisfy atomic allocations.
 * It is possible that this is called when physical memory is scarce causing
 * OOM killer to be triggered.  We should avoid doing so until an actual
 * allocation causes the failure as it is possible that requests can be
 * serviced from already backed regions.
 *
 * Pages are populated with pcpu_lock dropped.  If the existing chunks cannot
 * absorb the whole target, a new chunk is created and the procedure is
 * retried; a population or chunk creation failure ends the attempt.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_populated(void)
{
        /* gfp flags passed to underlying allocators */
        const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
        struct pcpu_chunk *chunk;
        int slot, nr_to_pop, ret;

        lockdep_assert_held(&pcpu_lock);

        /*
         * Ensure there are certain number of free populated pages for
         * atomic allocs.  Fill up from the most packed so that atomic
         * allocs don't increase fragmentation.  If atomic allocation
         * failed previously, always populate the maximum amount.  This
         * should prevent atomic allocs larger than PAGE_SIZE from keeping
         * failing indefinitely; however, large atomic allocs are not
         * something we support properly and can be highly unreliable and
         * inefficient.
         */
retry_pop:
        if (pcpu_atomic_alloc_failed) {
                nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
                /* best effort anyway, don't worry about synchronization */
                pcpu_atomic_alloc_failed = false;
        } else {
                /* top up to the high mark, never a negative amount */
                nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
                                  pcpu_nr_empty_pop_pages,
                                  0, PCPU_EMPTY_POP_PAGES_HIGH);
        }

        for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
                unsigned int nr_unpop = 0, rs, re;

                if (!nr_to_pop)
                        break;

                /* pick the first chunk in this slot with unpopulated pages */
                list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
                        nr_unpop = chunk->nr_pages - chunk->nr_populated;
                        if (nr_unpop)
                                break;
                }

                /* every chunk in this slot is fully populated */
                if (!nr_unpop)
                        continue;

                /* @chunk can't go away while pcpu_alloc_mutex is held */
                for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
                        /* populate at most what is still needed */
                        int nr = min_t(int, re - rs, nr_to_pop);

                        /* population runs without pcpu_lock held */
                        spin_unlock_irq(&pcpu_lock);
                        ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
                        cond_resched();
                        spin_lock_irq(&pcpu_lock);
                        if (!ret) {
                                nr_to_pop -= nr;
                                pcpu_chunk_populated(chunk, rs, rs + nr);
                        } else {
                                /* failure: stop, and skip chunk creation below */
                                nr_to_pop = 0;
                        }

                        if (!nr_to_pop)
                                break;
                }
        }

        if (nr_to_pop) {
                /* ran out of chunks to populate, create a new one and retry */
                spin_unlock_irq(&pcpu_lock);
                chunk = pcpu_create_chunk(gfp);
                cond_resched();
                spin_lock_irq(&pcpu_lock);
                /* if creation failed, return with the target unmet */
                if (chunk) {
                        pcpu_chunk_relocate(chunk, -1);
                        goto retry_pop;
                }
        }
}

/**
 * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
 *
 * Scan over chunks in the depopulate list and try to release unused populated
 * pages back to the system.  Depopulated chunks are sidelined to prevent
 * repopulating these pages unless required.  Fully free chunks are reintegrated
 * and freed accordingly (1 is kept around).  If we drop below the empty
 * populated pages threshold, reintegrate the chunk if it has empty free pages.
 * Each chunk is scanned in the reverse order to keep populated pages close to
 * the beginning of the chunk.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 *
 */
static void pcpu_reclaim_populated(void)
{
        struct pcpu_chunk *chunk;
        struct pcpu_block_md *block;
        /* union [start, end) of every page range depopulated in the chunk */
        int freed_page_start, freed_page_end;
        /* @i is the scan cursor; @end is the last page of the active range */
        int i, end;
        bool reintegrate;

        lockdep_assert_held(&pcpu_lock);

        /*
         * Once a chunk is isolated to the to_depopulate list, the chunk is no
         * longer discoverable to allocations, which may populate pages.  The
         * only other accessor is the free path, which only returns area back
         * to the allocator without touching the populated bitmap.
         *
         * Each pass has to take @chunk off this list for the loop to
         * terminate: the else branch at the bottom moves it to the sidelined
         * slot; pcpu_reintegrate_chunk() is assumed to relocate it as well
         * (TODO confirm).
         */
        while ((chunk = list_first_entry_or_null(
                        &pcpu_chunk_lists[pcpu_to_depopulate_slot],
                        struct pcpu_chunk, list))) {
                WARN_ON(chunk->immutable);

                /*
                 * Scan chunk's pages in the reverse order to keep populated
                 * pages close to the beginning of the chunk.
                 *
                 * freed_page_start > freed_page_end encodes "nothing freed
                 * yet"; the tlb flush below is skipped in that case.
                 */
                freed_page_start = chunk->nr_pages;
                freed_page_end = 0;
                reintegrate = false;
                /* end == -1 means there is no active (i, end] range */
                for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
                        /* no more work to do */
                        if (chunk->nr_empty_pop_pages == 0)
                                break;

                        /* reintegrate chunk to prevent atomic alloc failures */
                        if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
                                reintegrate = true;
                                break;
                        }

                        /*
                         * If the page is empty and populated, start or
                         * extend the (i, end) range.  If i == 0, decrease
                         * i and perform the depopulation to cover the last
                         * (first) page in the chunk.
                         */
                        block = chunk->md_blocks + i;
                        if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
                            test_bit(i, chunk->populated)) {
                                if (end == -1)
                                        end = i;
                                if (i > 0)
                                        continue;
                                /*
                                 * i becomes -1 so that i + 1 == 0 below, which
                                 * includes page 0 in the depopulated range.
                                 */
                                i--;
                        }

                        /* depopulate if there is an active range */
                        if (end == -1)
                                continue;

                        /*
                         * pcpu_lock is dropped around the depopulation and
                         * retaken before the bookkeeping is updated.
                         */
                        spin_unlock_irq(&pcpu_lock);
                        pcpu_depopulate_chunk(chunk, i + 1, end + 1);
                        cond_resched();
                        spin_lock_irq(&pcpu_lock);

                        pcpu_chunk_depopulated(chunk, i + 1, end + 1);
                        freed_page_start = min(freed_page_start, i + 1);
                        freed_page_end = max(freed_page_end, end + 1);

                        /* reset the range and continue */
                        end = -1;
                }

                /* batch tlb flush per chunk to amortize cost */
                if (freed_page_start < freed_page_end) {
                        spin_unlock_irq(&pcpu_lock);
                        pcpu_post_unmap_tlb_flush(chunk,
                                                  freed_page_start,
                                                  freed_page_end);
                        cond_resched();
                        spin_lock_irq(&pcpu_lock);
                }

                /*
                 * Chunks that are needed again (reintegrate) or that became
                 * fully free are handed to pcpu_reintegrate_chunk(); all
                 * others are parked on the sidelined list.
                 */
                if (reintegrate || chunk->free_bytes == pcpu_unit_size)
                        pcpu_reintegrate_chunk(chunk);
                else
                        list_move_tail(&chunk->list,
                                       &pcpu_chunk_lists[pcpu_sidelined_slot]);
        }
}

/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * For each chunk type, manage the number of fully free chunks and the number of
 * populated pages.  An important thing to consider is when pages are freed and
 * how they contribute to the global counts.
 */
static void pcpu_balance_workfn(struct work_struct *work)
{
        /*
         * pcpu_balance_free() is called twice because the first time we may
         * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
         * to grow other chunks.  This then gives pcpu_reclaim_populated() time
         * to move fully free chunks to the active list to be freed if
         * appropriate.
         *
         * Lock order: pcpu_alloc_mutex is taken first, then pcpu_lock with
         * interrupts disabled; both are released in the reverse order.
         */
        mutex_lock(&pcpu_alloc_mutex);
        spin_lock_irq(&pcpu_lock);

        /* the order of the four steps below is significant, see above */
        pcpu_balance_free(false);
        pcpu_reclaim_populated();
        pcpu_balance_populated();
        pcpu_balance_free(true);

        spin_unlock_irq(&pcpu_lock);
        mutex_unlock(&pcpu_alloc_mutex);
}

/**
 * pcpu_alloc_size - the size of the dynamic percpu area
 * @ptr: pointer to the dynamic percpu area
 *
 * Returns the size of the @ptr allocation.  This is undefined for statically
 * defined percpu variables as there is no corresponding chunk->bound_map.
 *
 * RETURNS:
 * The size of the dynamic percpu area.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
size_t pcpu_alloc_size(void __percpu *ptr)
{
        struct pcpu_chunk *chunk;
        unsigned long first_bit, boundary;
        void *addr;

        /* a NULL percpu pointer has no backing area */
        if (!ptr)
                return 0;

        addr = __pcpu_ptr_to_addr(ptr);

        /* No pcpu_lock here: ptr has not been freed, so chunk is still alive */
        chunk = pcpu_chunk_addr_search(addr);

        /* offset of the area inside the chunk, in PCPU_MIN_ALLOC_SIZE units */
        first_bit = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE;

        /* the next bit set in bound_map past the start closes the area */
        boundary = find_next_bit(chunk->bound_map,
                                 pcpu_chunk_map_bits(chunk),
                                 first_bit + 1);

        return (boundary - first_bit) * PCPU_MIN_ALLOC_SIZE;
}

/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Free percpu area @ptr.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
void free_percpu(void __percpu *ptr)
{
        struct pcpu_chunk *chunk, *other;
        unsigned long irq_flags;
        bool kick_balance = false;
        void *addr;
        int off, freed;

        /* freeing a NULL percpu pointer is a no-op */
        if (!ptr)
                return;

        kmemleak_free_percpu(ptr);

        /* locate the owning chunk and the area's byte offset inside it */
        addr = __pcpu_ptr_to_addr(ptr);
        chunk = pcpu_chunk_addr_search(addr);
        off = addr - chunk->base_addr;

        spin_lock_irqsave(&pcpu_lock, irq_flags);

        freed = pcpu_free_area(chunk, off);
        pcpu_alloc_tag_free_hook(chunk, off, freed);
        pcpu_memcg_free_hook(chunk, off, freed);

        /*
         * A non-isolated chunk that just became fully free only warrants a
         * balance run when some other chunk already sits in the free slot.
         * An isolated chunk may be in the middle of reclaim, so its cleanup
         * is left to the reclaim path.
         */
        if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
                list_for_each_entry(other, &pcpu_chunk_lists[pcpu_free_slot],
                                    list) {
                        if (other == chunk)
                                continue;
                        kick_balance = true;
                        break;
                }
        } else if (pcpu_should_reclaim_chunk(chunk)) {
                pcpu_isolate_chunk(chunk);
                kick_balance = true;
        }

        trace_percpu_free_percpu(chunk->base_addr, off, ptr);

        spin_unlock_irqrestore(&pcpu_lock, irq_flags);

        /* the balance work is scheduled only after pcpu_lock is released */
        if (kick_balance)
                pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);

/*
 * Test whether @addr lies inside the static percpu area of any possible cpu.
 * On a match, and if @can_addr is non-NULL, *@can_addr receives the address
 * of the same offset within the boot cpu's unit.
 */
bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        void *va = (void *)addr;
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                void *unit_start = per_cpu_ptr(base, cpu);

                /* outside this cpu's static region, try the next one */
                if (va < unit_start || va >= unit_start + static_size)
                        continue;

                if (can_addr) {
                        unsigned long boot_start = (unsigned long)
                                per_cpu_ptr(base, get_boot_cpu_id());

                        *can_addr = (unsigned long)(va - unit_start) +
                                    boot_start;
                }
                return true;
        }
#endif
        /* on UP, can't distinguish from other static vars, always false */
        return false;
}

/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Test whether @addr belongs to in-kernel static percpu area.  Module
 * static percpu areas are not considered.  For those, use
 * is_module_percpu_address().
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
bool is_kernel_percpu_address(unsigned long addr)
{
        /* NULL can_addr: only the membership test is wanted here */
        return __is_kernel_percpu_address(addr, NULL);
}

/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * Given @addr which is dereferenceable address obtained via one of
 * percpu access macros, this function translates it into its physical
 * address.  The caller is responsible for ensuring @addr stays valid
 * until this function finishes.
 *
 * percpu allocator has special setup for the first chunk, which currently
 * supports either embedding in linear address space or vmalloc mapping,
 * and, from the second one, the backing allocator (currently either vm or
 * km) provides translation.
 *
 * The addr can be translated simply without checking if it falls into the
 * first chunk. But the current code reflects better how percpu allocator
 * actually works, and the verification can discover both bugs in percpu
 * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
 * code.
 *
 * RETURNS:
 * The physical address for @addr.
 */
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        unsigned long uaddr = (unsigned long)addr;
        unsigned long first_low, first_high;
        bool in_first_chunk = false;
        unsigned int cpu;

        /*
         * Coarse pre-filter: only walk the possible cpus when @addr falls
         * between the lowest and the highest unit of the first chunk.  It is
         * not strictly required but speeds up lookups of addresses that
         * belong to other chunks.
         *
         * The bounds use full unit sizes.  pcpu_base_addr points to the
         * beginning of the first chunk including the static region.  Good
         * intent is assumed as the first chunk may not be full (ie.
         * < pcpu_unit_pages in size).
         */
        first_low = (unsigned long)pcpu_base_addr +
                    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
        first_high = (unsigned long)pcpu_base_addr +
                     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);

        if (uaddr >= first_low && uaddr < first_high) {
                for_each_possible_cpu(cpu) {
                        void *unit_start = per_cpu_ptr(base, cpu);

                        if (addr < unit_start ||
                            addr >= unit_start + pcpu_unit_size)
                                continue;

                        in_first_chunk = true;
                        break;
                }
        }

        /* every chunk but the first is translated by the backing allocator */
        if (!in_first_chunk)
                return page_to_phys(pcpu_addr_to_page(addr)) +
                       offset_in_page(addr);

        /* the first chunk is either vmalloc mapped or in the linear map */
        if (is_vmalloc_addr(addr))
                return page_to_phys(vmalloc_to_page(addr)) +
                       offset_in_page(addr);

        return __pa(addr);
}

/**
 * pcpu_alloc_alloc_info - allocate percpu allocation info
 * @nr_groups: the number of groups
 * @nr_units: the number of units
 *
 * Allocate ai which is large enough for @nr_groups groups containing
 * @nr_units units.  The returned ai's groups[0].cpu_map points to the
 * cpu_map array which is long enough for @nr_units and filled with
 * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
 * pointer of other groups.
 *
 * RETURNS:
 * Pointer to the allocated pcpu_alloc_info on success, NULL on
 * failure.
 */
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
                                                      int nr_units)
{
        struct pcpu_alloc_info *ai;
        size_t hdr_size, total_size;
        void *mem;
        int i;

        /* header plus groups[], padded so the cpu_map array is aligned */
        hdr_size = ALIGN(struct_size(ai, groups, nr_groups),
                         __alignof__(ai->groups[0].cpu_map[0]));
        /* a single cpu_map array of @nr_units entries trails the header */
        total_size = hdr_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

        mem = memblock_alloc(PFN_ALIGN(total_size), PAGE_SIZE);
        if (!mem)
                return NULL;

        ai = mem;
        ai->nr_groups = nr_groups;
        /* remembered so that pcpu_free_alloc_info() can free the same size */
        ai->__ai_size = PFN_ALIGN(total_size);

        /* only group 0 gets its cpu_map wired up; NR_CPUS marks "no cpu" */
        ai->groups[0].cpu_map = mem + hdr_size;
        for (i = 0; i < nr_units; i++)
                ai->groups[0].cpu_map[i] = NR_CPUS;

        return ai;
}

/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Free @ai which was allocated by pcpu_alloc_alloc_info().
 */
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
        /*
         * __ai_size is the page-aligned size recorded at allocation time.
         * @ai is dereferenced unconditionally, so it must not be NULL.
         */
        memblock_free(ai, ai->__ai_size);
}

/**
 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
 * @lvl: loglevel
 * @ai: allocation info to dump
 *
 * Print out information about @ai using loglevel @lvl.
 */
static void pcpu_dump_alloc_info(const char *lvl,
                                 const struct pcpu_alloc_info *ai)
{
        char empty_str[] = "--------";
        int group_width = 1, cpu_width = 1;
        int units_per_alloc, allocs_per_line, entry_width;
        int alloc = 0, alloc_end = 0;
        int group, n;

        /* number of decimal digits in nr_groups */
        n = ai->nr_groups;
        for (n /= 10; n; n /= 10)
                group_width++;

        /* number of decimal digits in the possible cpu count */
        n = num_possible_cpus();
        for (n /= 10; n; n /= 10)
                cpu_width++;

        /* trim the placeholder for empty units to the cpu column width */
        empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';

        units_per_alloc = ai->alloc_size / ai->unit_size;
        entry_width = units_per_alloc * (cpu_width + 1) + group_width + 3;
        /* aim at roughly 60 columns per line, always at least one alloc */
        allocs_per_line = rounddown_pow_of_two(max(60 / entry_width, 1));

        printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
               lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
               ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);

        /* @alloc keeps counting across groups so line breaks stay regular */
        for (group = 0; group < ai->nr_groups; group++) {
                const struct pcpu_group_info *gi = &ai->groups[group];
                int unit = 0;

                BUG_ON(gi->nr_units % units_per_alloc);

                alloc_end += gi->nr_units / units_per_alloc;
                while (alloc < alloc_end) {
                        int unit_end = unit + units_per_alloc;

                        if (alloc % allocs_per_line == 0) {
                                pr_cont("\n");
                                printk("%spcpu-alloc: ", lvl);
                        }
                        pr_cont("[%0*d] ", group_width, group);

                        for (; unit < unit_end; unit++) {
                                if (gi->cpu_map[unit] == NR_CPUS)
                                        pr_cont("%s ", empty_str);
                                else
                                        pr_cont("%0*d ",
                                                cpu_width, gi->cpu_map[unit]);
                        }
                        alloc++;
                }
        }
        pr_cont("\n");
}

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.
 *
 * @ai contains all information necessary to initialize the first
 * chunk and prime the dynamic percpu allocator.
 *
 * @ai->static_size is the size of static percpu area.
 *
 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations to guarantee module
 * percpu symbols fall inside the relocatable range.
 *
 * @ai->dyn_size determines the number of bytes available for dynamic
 * allocation in the first chunk.  The area between @ai->static_size +
 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
 *
 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
 * and equal to or larger than @ai->static_size + @ai->reserved_size +
 * @ai->dyn_size.
 *
 * @ai->atom_size is the allocation atom size and used as alignment
 * for vm areas.
 *
 * @ai->alloc_size is the allocation size and always multiple of
 * @ai->atom_size.  This is larger than @ai->atom_size if
 * @ai->unit_size is larger than @ai->atom_size.
 *
 * @ai->nr_groups and @ai->groups describe virtual memory layout of
 * percpu areas.  Units which should be colocated are put into the
 * same group.  Dynamic VM areas will be allocated according to these
 * groupings.  If @ai->nr_groups is zero, a single group containing
 * all units is assumed.
 *
 * The caller should have mapped the first chunk at @base_addr and
 * copied static data to each unit.
 *
 * The first chunk will always contain a static and a dynamic region.
 * However, the static region is not managed by any chunk.  If the first
 * chunk also contains a reserved region, it is served by two chunks -
 * one for the reserved region and one for the dynamic region.  They
 * share the same vm, but use offset regions in the area allocation map.
 * The chunk serving the dynamic region is circulated in the chunk slots
 * and available for dynamic allocation like any other chunk.
 */
void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
                                   void *base_addr)
{
        size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        size_t static_size, dyn_size;
        unsigned long *group_offsets;
        size_t *group_sizes;
        unsigned long *unit_off;
        unsigned int cpu;
        int *unit_map;
        int group, unit, i;
        unsigned long tmp_addr;
        size_t alloc_size;

/*
 * Local sanity-check helper: on failure, log the failed condition, the
 * possible cpu mask and the whole @ai, then BUG().  It is undefined again
 * once the input has been parsed.
 */
#define PCPU_SETUP_BUG_ON(cond)        do {                                        \
        if (unlikely(cond)) {                                                \
                pr_emerg("failed to initialize, %s\n", #cond);                \
                pr_emerg("cpu_possible_mask=%*pb\n",                        \
                         cpumask_pr_args(cpu_possible_mask));                \
                pcpu_dump_alloc_info(KERN_EMERG, ai);                        \
                BUG();                                                        \
        }                                                                \
} while (0)

        /* sanity checks */
        PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
        PCPU_SETUP_BUG_ON(!ai->static_size);
        PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
#endif
        PCPU_SETUP_BUG_ON(!base_addr);
        PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
        PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
        PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
        PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
        PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
        PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
        PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
        PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
                            IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
        PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

        /*
         * process group information and build config tables accordingly
         *
         * The four tables below are never freed in this function; they are
         * published through the pcpu_* globals further down.
         */
        alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
        group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!group_offsets)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
        group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!group_sizes)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
        unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!unit_map)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
        unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
        if (!unit_off)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      alloc_size);

        /* UINT_MAX marks a cpu that has not been assigned a unit yet */
        for (cpu = 0; cpu < nr_cpu_ids; cpu++)
                unit_map[cpu] = UINT_MAX;

        /* NR_CPUS stands for "not determined yet" in the scan below */
        pcpu_low_unit_cpu = NR_CPUS;
        pcpu_high_unit_cpu = NR_CPUS;

        /*
         * @unit is the global index of the group's first unit and @i the
         * index within the group; when the inner loop finishes, @i equals
         * gi->nr_units, which is what the "unit += i" step relies on.
         */
        for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
                const struct pcpu_group_info *gi = &ai->groups[group];

                group_offsets[group] = gi->base_offset;
                group_sizes[group] = gi->nr_units * ai->unit_size;

                for (i = 0; i < gi->nr_units; i++) {
                        cpu = gi->cpu_map[i];
                        /* unit slot without a cpu mapped to it */
                        if (cpu == NR_CPUS)
                                continue;

                        PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
                        PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
                        /* a cpu must not appear in more than one unit */
                        PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);

                        unit_map[cpu] = unit + i;
                        unit_off[cpu] = gi->base_offset + i * ai->unit_size;

                        /* determine low/high unit_cpu */
                        if (pcpu_low_unit_cpu == NR_CPUS ||
                            unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
                                pcpu_low_unit_cpu = cpu;
                        if (pcpu_high_unit_cpu == NR_CPUS ||
                            unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
                                pcpu_high_unit_cpu = cpu;
                }
        }
        pcpu_nr_units = unit;

        /* every possible cpu must have ended up with a unit */
        for_each_possible_cpu(cpu)
                PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);

        /* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
        pcpu_dump_alloc_info(KERN_DEBUG, ai);

        pcpu_nr_groups = ai->nr_groups;
        pcpu_group_offsets = group_offsets;
        pcpu_group_sizes = group_sizes;
        pcpu_unit_map = unit_map;
        pcpu_unit_offsets = unit_off;

        /* determine basic parameters */
        pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
        pcpu_atom_size = ai->atom_size;
        /* chunk header plus a populated bitmap with one bit per unit page */
        pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
                                             BITS_TO_LONGS(pcpu_unit_pages));

        pcpu_stats_save_ai(ai);

        /*
         * Allocate chunk slots.  The slots after the active slots are:
         *   sidelined_slot - isolated, depopulated chunks
         *   free_slot - fully free chunks
         *   to_depopulate_slot - isolated, chunks to depopulate
         */
        pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
        pcpu_free_slot = pcpu_sidelined_slot + 1;
        pcpu_to_depopulate_slot = pcpu_free_slot + 1;
        pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
        pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
                                          sizeof(pcpu_chunk_lists[0]),
                                          SMP_CACHE_BYTES);
        if (!pcpu_chunk_lists)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));

        for (i = 0; i < pcpu_nr_slots; i++)
                INIT_LIST_HEAD(&pcpu_chunk_lists[i]);

        /*
         * The end of the static region needs to be aligned with the
         * minimum allocation size as this offsets the reserved and
         * dynamic region.  The first chunk ends page aligned by
         * expanding the dynamic region, therefore the dynamic region
         * can be shrunk to compensate while still staying above the
         * configured sizes.
         */
        static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
        dyn_size = ai->dyn_size - (static_size - ai->static_size);

        /*
         * Initialize first chunk:
         * This chunk is broken up into 3 parts:
         *                < static | [reserved] | dynamic >
         * - static - there is no backing chunk because these allocations can
         *   never be freed.
         * - reserved (pcpu_reserved_chunk) - exists primarily to serve
         *   allocations from module load.
         * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
         *   chunk.
         */
        tmp_addr = (unsigned long)base_addr + static_size;
        if (ai->reserved_size)
                pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
                                                ai->reserved_size);
        tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
        pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);

        /* only the dynamic chunk is accounted and placed in the slot lists */
        pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
        pcpu_chunk_relocate(pcpu_first_chunk, -1);

        /* include all regions of the first chunk */
        pcpu_nr_populated += PFN_DOWN(size_sum);

        pcpu_stats_chunk_alloc();
        trace_percpu_create_chunk(base_addr);

        /* we're done */
        pcpu_base_addr = base_addr;
}

#ifdef CONFIG_SMP

/* printable name of each first chunk allocator, indexed by enum pcpu_fc */
const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
        [PCPU_FC_AUTO]        = "auto",
        [PCPU_FC_EMBED]        = "embed",
        [PCPU_FC_PAGE]        = "page",
};

/*
 * First chunk allocator selection; stays at PCPU_FC_AUTO unless the
 * "percpu_alloc" early parameter picks another one.
 */
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

/*
 * Parse the "percpu_alloc" early parameter and record the requested first
 * chunk allocator in pcpu_chosen_fc.  Returns -EINVAL when no value was
 * given and 0 otherwise; an unrecognized value is only warned about.
 */
static int __init percpu_alloc_setup(char *str)
{
        if (!str)
                return -EINVAL;

        /*
         * The empty "if (0)" gives the conditionally compiled "else if"
         * branches below something to attach to, whichever subset of them
         * is built.
         */
        if (0)
                /* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
        else if (!strcmp(str, "embed"))
                pcpu_chosen_fc = PCPU_FC_EMBED;
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
        else if (!strcmp(str, "page"))
                pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
        else
                /* also reached for an allocator that is not compiled in */
                pr_warn("unknown allocator %s specified\n", str);

        return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);

/*
 * pcpu_embed_first_chunk() is used by the generic percpu setup.
 * Build it if needed by the arch config or the generic setup is going
 * to be used.
 */
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
        !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
#define BUILD_EMBED_FIRST_CHUNK
#endif

/* build pcpu_page_first_chunk() iff needed by the arch config */
#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#define BUILD_PAGE_FIRST_CHUNK
#endif

/* pcpu_build_alloc_info() is used by both embed and page first chunk */
#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes on different groups and >=75% usage
 * of allocated virtual address space.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */
static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
                                size_t reserved_size, size_t dyn_size,
                                size_t atom_size,
                                pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
        /*
         * group_map[cpu] is the group a cpu was assigned to and
         * group_cnt[group] the number of cpus in that group.
         * NOTE(review): presumably static to keep NR_CPUS-sized arrays off
         * the stack - confirm.
         */
        static int group_map[NR_CPUS] __initdata;
        static int group_cnt[NR_CPUS] __initdata;
        static struct cpumask mask __initdata;
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        int nr_groups = 1, nr_units = 0;
        size_t size_sum, min_unit_size, alloc_size;
        int upa, max_upa, best_upa;        /* units_per_alloc */
        int last_allocs, group, unit;
        unsigned int cpu, tcpu;
        struct pcpu_alloc_info *ai;
        unsigned int *cpu_map;

        /* this function may be called multiple times */
        memset(group_map, 0, sizeof(group_map));
        memset(group_cnt, 0, sizeof(group_cnt));
        cpumask_clear(&mask);

        /* calculate size_sum and ensure dyn_size is enough for early alloc */
        size_sum = PFN_ALIGN(static_size + reserved_size +
                            max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
        /* dyn_size absorbs whatever the page alignment above added */
        dyn_size = size_sum - static_size - reserved_size;

        /*
         * Determine min_unit_size, alloc_size and max_upa such that
         * alloc_size is multiple of atom_size and is the smallest
         * which can accommodate page aligned segments which are equal to
         * or larger than min_unit_size.
         */
        min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

        /* determine the maximum # of units that can fit in an allocation */
        alloc_size = roundup(min_unit_size, atom_size);
        upa = alloc_size / min_unit_size;
        /* shrink upa until it divides alloc_size into page aligned units */
        while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                upa--;
        max_upa = upa;

        cpumask_copy(&mask, cpu_possible_mask);

        /* group cpus according to their proximity */
        for (group = 0; !cpumask_empty(&mask); group++) {
                /* pop the group's first cpu */
                cpu = cpumask_first(&mask);
                group_map[cpu] = group;
                group_cnt[group]++;
                cpumask_clear_cpu(cpu, &mask);

                /*
                 * Pull in every remaining cpu that is LOCAL_DISTANCE from
                 * @cpu in both directions; without a distance callback all
                 * cpus land in the first group.
                 */
                for_each_cpu(tcpu, &mask) {
                        if (!cpu_distance_fn ||
                            (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
                             cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
                                group_map[tcpu] = group;
                                group_cnt[group]++;
                                cpumask_clear_cpu(tcpu, &mask);
                        }
                }
        }
        nr_groups = group;

        /*
         * Wasted space is caused by a ratio imbalance of upa to group_cnt.
         * Expand the unit_size until we use >= 75% of the units allocated.
         * Related to atom_size, which could be much larger than the unit_size.
         */
        last_allocs = INT_MAX;
        best_upa = 0;
        for (upa = max_upa; upa; upa--) {
                int allocs = 0, wasted = 0;

                /* only consider upa values that yield page aligned units */
                if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                        continue;

                for (group = 0; group < nr_groups; group++) {
                        int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
                        allocs += this_allocs;
                        wasted += this_allocs * upa - group_cnt[group];
                }

                /*
                 * Don't accept if wastage is over 1/3.  The
                 * greater-than comparison ensures upa==1 always
                 * passes the following check.
                 *
                 * With wasted <= cpus / 3, the used share is
                 * cpus / (cpus + wasted) >= 3/4, i.e. the 75% above.
                 */
                if (wasted > num_possible_cpus() / 3)
                        continue;

                /* and then don't consume more memory */
                if (allocs > last_allocs)
                        break;
                last_allocs = allocs;
                best_upa = upa;
        }
        BUG_ON(!best_upa);
        upa = best_upa;

        /* allocate and fill alloc_info */
        for (group = 0; group < nr_groups; group++)
                nr_units += roundup(group_cnt[group], upa);

        ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
        if (!ai)
                return ERR_PTR(-ENOMEM);
        cpu_map = ai->groups[0].cpu_map;

        /*
         * Carve the single cpu_map array into consecutive per-group slices,
         * each rounded up to a multiple of upa units.
         */
        for (group = 0; group < nr_groups; group++) {
                ai->groups[group].cpu_map = cpu_map;
                cpu_map += roundup(group_cnt[group], upa);
        }

        ai->static_size = static_size;
        ai->reserved_size = reserved_size;
        ai->dyn_size = dyn_size;
        ai->unit_size = alloc_size / upa;
        ai->atom_size = atom_size;
        ai->alloc_size = alloc_size;

        for (group = 0, unit = 0; group < nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];

                /*
                 * Initialize base_offset as if all groups are located
                 * back-to-back.  The caller should update this to
                 * reflect actual allocation.
                 */
                gi->base_offset = unit * ai->unit_size;

                /*
                 * Relies on gi->nr_units starting out as zero (presumably
                 * the allocation is zeroed - TODO confirm).  Slots left
                 * over after rounding up are not written here.
                 */
                for_each_possible_cpu(cpu)
                        if (group_map[cpu] == group)
                                gi->cpu_map[gi->nr_units++] = cpu;
                gi->nr_units = roundup(gi->nr_units, upa);
                unit += gi->nr_units;
        }
        BUG_ON(unit != nr_units);

        return ai;
}

/*
 * pcpu_fc_alloc - allocate boot memory backing part of the first chunk
 * @cpu: cpu the memory is meant for
 * @size: number of bytes to allocate
 * @align: alignment of the allocation
 * @cpu_to_nd_fn: callback translating @cpu to its node, may be NULL
 *
 * With CONFIG_NUMA the node reported by @cpu_to_nd_fn is used when it is
 * online and has node data; in every other case the memory comes from
 * memblock_alloc_from() with no node preference.  All requests pass
 * __pa(MAX_DMA_ADDRESS) as the allocation goal.
 *
 * RETURNS:
 * The pointer handed back by the memblock allocator, unchecked; callers
 * test it for NULL.
 */
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align,
                                   pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NUMA
        int node = NUMA_NO_NODE;
        void *area;

        if (cpu_to_nd_fn)
                node = cpu_to_nd_fn(cpu);

        /* usable node: try to place the memory node-locally */
        if (node != NUMA_NO_NODE && node_online(node) && NODE_DATA(node)) {
                area = memblock_alloc_try_nid(size, align, goal,
                                              MEMBLOCK_ALLOC_ACCESSIBLE,
                                              node);

                pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
                         cpu, size, node, (u64)__pa(area));
                return area;
        }

        /* no usable node for this cpu, allocate without node affinity */
        area = memblock_alloc_from(size, align, goal);
        pr_info("cpu %d has no node %d or node-local memory\n",
                cpu, node);
        pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
                 cpu, size, (u64)__pa(area));
        return area;
#else
        return memblock_alloc_from(size, align, goal);
#endif
}

/*
 * Give @size bytes starting at @ptr back to memblock.  Used by the first
 * chunk helpers both to drop unused parts of an allocation and to undo
 * allocations on their error paths.
 */
static void __init pcpu_fc_free(void *ptr, size_t size)
{
        memblock_free(ptr, size);
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @cpu_to_nd_fn: callback to convert cpu to its node, optional
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * by calling pcpu_fc_alloc and used as-is without being mapped into
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page size.  Please note that this
 * can result in very sparse cpu->unit mapping on NUMA machines thus
 * requiring large vmalloc address space.  Don't use this allocator if
 * vmalloc space is not orders of magnitude larger than distances
 * between node memory addresses (ie. 32bit NUMA machines).
 *
 * @dyn_size specifies the minimum dynamic area size.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned using pcpu_fc_free.
 *
 * RETURNS:
 * 0 on success, -errno on failure: the error from pcpu_build_alloc_info(),
 * -ENOMEM if the areas array or a group allocation fails, or -EINVAL if
 * the groups are spread too far apart and
 * CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK is set.
 */
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
                                  size_t atom_size,
                                  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
                                  pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        /* start at the top so that min() below settles on the lowest area */
        void *base = (void *)ULONG_MAX;
        void **areas = NULL;
        struct pcpu_alloc_info *ai;
        size_t size_sum, areas_size;
        unsigned long max_distance;
        int group, i, highest_group, rc = 0;

        ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
                                   cpu_distance_fn);
        if (IS_ERR(ai))
                return PTR_ERR(ai);

        /* bytes of each unit that are actually used */
        size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        /* one pointer per group, size rounded up to a page multiple */
        areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

        areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
        if (!areas) {
                rc = -ENOMEM;
                goto out_free;
        }

        /* allocate, copy and determine base address & max_distance */
        highest_group = 0;
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                unsigned int cpu = NR_CPUS;
                void *ptr;

                /*
                 * Find the first unit of the group that maps to a real
                 * cpu (unused units hold NR_CPUS); that cpu selects where
                 * the group's memory is allocated.
                 */
                for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
                        cpu = gi->cpu_map[i];
                BUG_ON(cpu == NR_CPUS);

                /* allocate space for the whole group */
                ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn);
                if (!ptr) {
                        rc = -ENOMEM;
                        goto out_free_areas;
                }
                /* kmemleak tracks the percpu allocations separately */
                kmemleak_ignore_phys(__pa(ptr));
                areas[group] = ptr;

                /* track the lowest area (base) and the group placed highest */
                base = min(ptr, base);
                if (ptr > areas[highest_group])
                        highest_group = group;
        }
        /* span from the lowest area to the end of the highest group */
        max_distance = areas[highest_group] - base;
        max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

        /* warn if maximum distance is further than 75% of vmalloc space */
        if (max_distance > VMALLOC_TOTAL * 3 / 4) {
                pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
                                max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
                /* and fail if we have fallback */
                rc = -EINVAL;
                goto out_free_areas;
#endif
        }

        /*
         * Copy data and free unused parts.  This should happen after all
         * allocations are complete; otherwise, we may end up with
         * overlapping groups.
         */
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                void *ptr = areas[group];

                for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
                        if (gi->cpu_map[i] == NR_CPUS) {
                                /* unused unit, free whole */
                                pcpu_fc_free(ptr, ai->unit_size);
                                continue;
                        }
                        /* copy and return the unused part */
                        memcpy(ptr, __per_cpu_load, ai->static_size);
                        pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
                }
        }

        /* base address is now known, determine group base offsets */
        for (group = 0; group < ai->nr_groups; group++) {
                ai->groups[group].base_offset = areas[group] - base;
        }

        pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
                PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
                ai->dyn_size, ai->unit_size);

        pcpu_setup_first_chunk(ai, base);
        /* success: skip the area rollback but still release ai and areas */
        goto out_free;

out_free_areas:
        /*
         * Roll back every group allocated so far.  Slots that were never
         * filled are expected to be NULL here.
         * NOTE(review): presumably relies on memblock_alloc() returning
         * zeroed memory - confirm.
         */
        for (group = 0; group < ai->nr_groups; group++)
                if (areas[group])
                        pcpu_fc_free(areas[group],
                                ai->groups[group].nr_units * ai->unit_size);
out_free:
        /* common exit: the alloc info and the areas array are temporary */
        pcpu_free_alloc_info(ai);
        if (areas)
                memblock_free(areas, areas_size);
        return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
#include <asm/pgalloc.h>

/*
 * Sizes of the page table pages allocated by pcpu_populate_pte().  Each
 * level falls back to PAGE_SIZE unless a definition is already visible
 * at this point.
 */
#ifndef P4D_TABLE_SIZE
#define P4D_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PUD_TABLE_SIZE
#define PUD_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PMD_TABLE_SIZE
#define PMD_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PTE_TABLE_SIZE
#define PTE_TABLE_SIZE PAGE_SIZE
#endif
/*
 * pcpu_populate_pte - make sure kernel page tables cover @addr
 * @addr: kernel virtual address that is about to be mapped
 *
 * Walks init_mm's page tables from the pgd downwards and, for every level
 * that is still empty, allocates a table from memblock and hooks it in.
 * Panics if any of the allocations fails.  Declared __weak, so another
 * definition may replace it.
 */
void __init __weak pcpu_populate_pte(unsigned long addr)
{
        pgd_t *pgd = pgd_offset_k(addr);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        /* pgd entry empty: install a p4d table */
        if (pgd_none(*pgd)) {
                p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
                if (p4d == NULL)
                        goto err_alloc;
                pgd_populate(&init_mm, pgd, p4d);
        }
        p4d = p4d_offset(pgd, addr);

        /* p4d entry empty: install a pud table */
        if (p4d_none(*p4d)) {
                pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
                if (pud == NULL)
                        goto err_alloc;
                p4d_populate(&init_mm, p4d, pud);
        }
        pud = pud_offset(p4d, addr);

        /* pud entry empty: install a pmd table */
        if (pud_none(*pud)) {
                pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
                if (pmd == NULL)
                        goto err_alloc;
                pud_populate(&init_mm, pud, pmd);
        }
        pmd = pmd_offset(pud, addr);

        /* a pte table is already present, nothing left to do */
        if (pmd_present(*pmd))
                return;

        pte = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
        if (pte == NULL)
                goto err_alloc;
        pmd_populate_kernel(&init_mm, pmd, pte);
        return;

err_alloc:
        panic("%s: Failed to allocate memory\n", __func__);
}

/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @cpu_to_nd_fn: callback to convert cpu to its node, optional
 *
 * This is a helper to ease setting up page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * Failing to allocate the page pointer array or to map a unit panics
 * instead of returning.
 *
 * RETURNS:
 * 0 on success, -errno on failure: the error from pcpu_build_alloc_info(),
 * -EINVAL if group 0 does not have the expected number of units, or
 * -ENOMEM if a backing page cannot be allocated.
 */
int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        static struct vm_struct vm;
        struct pcpu_alloc_info *ai;
        char psize_str[16];
        int unit_pages;
        size_t pages_size;
        struct page **pages;
        int unit, i, j, rc = 0;
        int upa;
        int nr_g0_units;

        /* page size in KiB, only used in the log messages below */
        snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

        ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
        if (IS_ERR(ai))
                return PTR_ERR(ai);
        BUG_ON(ai->nr_groups != 1);
        /* units per allocation; group 0 must hold all cpus rounded up to it */
        upa = ai->alloc_size/ai->unit_size;
        nr_g0_units = roundup(num_possible_cpus(), upa);
        if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
                pcpu_free_alloc_info(ai);
                return -EINVAL;
        }

        unit_pages = ai->unit_size >> PAGE_SHIFT;

        /* unaligned allocations can't be freed, round up to page size */
        pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
                               sizeof(pages[0]));
        pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
        if (!pages)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
                      pages_size);

        /* allocate pages */
        /* j runs over pages[] unit by unit: unit_pages entries per unit */
        j = 0;
        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned int cpu = ai->groups[0].cpu_map[unit];
                for (i = 0; i < unit_pages; i++) {
                        void *ptr;

                        ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn);
                        if (!ptr) {
                                pr_warn("failed to allocate %s page for cpu%u\n",
                                                psize_str, cpu);
                                goto enomem;
                        }
                        /* kmemleak tracks the percpu allocations separately */
                        kmemleak_ignore_phys(__pa(ptr));
                        pages[j++] = virt_to_page(ptr);
                }
        }

        /* allocate vm area, map the pages and copy static data */
        vm.flags = VM_ALLOC;
        vm.size = num_possible_cpus() * ai->unit_size;
        vm_area_register_early(&vm, PAGE_SIZE);

        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned long unit_addr =
                        (unsigned long)vm.addr + unit * ai->unit_size;

                /* make sure page tables exist for every page of the unit */
                for (i = 0; i < unit_pages; i++)
                        pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT));

                /* pte already populated, the following shouldn't fail */
                rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
                                      unit_pages);
                if (rc < 0)
                        panic("failed to map percpu area, err=%d\n", rc);

                flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);

                /* copy static data */
                memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
        }

        /* we're ready, commit */
        pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
                unit_pages, psize_str, ai->static_size,
                ai->reserved_size, ai->dyn_size);

        pcpu_setup_first_chunk(ai, vm.addr);
        /* success: keep the backing pages, release only the temporaries */
        goto out_free_ar;

enomem:
        /* give back the j pages allocated so far, last one first */
        while (--j >= 0)
                pcpu_fc_free(page_address(pages[j]), PAGE_SIZE);
        rc = -ENOMEM;
out_free_ar:
        /* common exit: the pages[] array and the alloc info are temporary */
        memblock_free(pages, pages_size);
        pcpu_free_alloc_info(ai);
        return rc;
}
#endif /* BUILD_PAGE_FIRST_CHUNK */

#ifndef        CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because percpu area can piggy back
 * on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * Set up the first chunk with the embedding helper and fill in
 * __per_cpu_offset[] for every possible cpu.  Panics if the first chunk
 * cannot be set up.
 */
void __init setup_per_cpu_areas(void)
{
        unsigned long base_delta;
        unsigned int cpu;

        /*
         * The module reserve is requested unconditionally, which matches
         * what the legacy allocator used to do.
         */
        if (pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE,
                                   PAGE_SIZE, NULL, NULL) < 0)
                panic("Failed to initialize percpu areas.");

        /* distance from the percpu section start to the chunk base */
        base_delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;

        /* each cpu's offset is that distance plus its unit offset */
        for_each_possible_cpu(cpu)
                __per_cpu_offset[cpu] = base_delta + pcpu_unit_offsets[cpu];
}
#endif        /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else        /* CONFIG_SMP */

/*
 * UP percpu area setup.
 *
 * UP always uses km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from the usual static
 * variables and don't require any special preparation.
 */
void __init setup_per_cpu_areas(void)
{
        /* one unit, sized to a power of two covering the dynamic reserve */
        const size_t unit_size =
                roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
                                         PERCPU_DYNAMIC_RESERVE));
        struct pcpu_alloc_info *info = pcpu_alloc_alloc_info(1, 1);
        void *chunk = memblock_alloc_from(unit_size, PAGE_SIZE,
                                          __pa(MAX_DMA_ADDRESS));
        struct pcpu_group_info *gi;

        if (!(info && chunk))
                panic("Failed to allocate memory for percpu areas.");
        /* kmemleak tracks the percpu allocations separately */
        kmemleak_ignore_phys(__pa(chunk));

        /* the whole unit is dynamic area; every size is the unit size */
        info->alloc_size = unit_size;
        info->atom_size = unit_size;
        info->unit_size = unit_size;
        info->dyn_size = unit_size;

        /* a single group holding a single unit that maps to cpu 0 */
        gi = &info->groups[0];
        gi->nr_units = 1;
        gi->cpu_map[0] = 0;

        pcpu_setup_first_chunk(info, chunk);
        pcpu_free_alloc_info(info);
}

#endif        /* CONFIG_SMP */

/*
 * pcpu_nr_pages - report how many backing pages are currently populated
 *
 * The result is the populated page count multiplied by the number of
 * units.  Only pages backing chunks are counted; metadata is left out of
 * the number exposed in meminfo because the backing pages scale with the
 * number of cpus and can quickly outweigh the memory used for metadata.
 * Leaving it out also keeps the calculation simple.
 *
 * RETURNS:
 * Total number of populated backing pages in use by the allocator.
 */
unsigned long pcpu_nr_pages(void)
{
        unsigned long nr_backing_pages = pcpu_nr_populated * pcpu_nr_units;

        return nr_backing_pages;
}

/*
 * Percpu allocator is initialized early during boot when neither slab nor
 * workqueue is available.  Plug async management until everything is up
 * and running.
 *
 * This initcall sets pcpu_async_enabled to true and is registered at
 * subsys_initcall level.  It always returns 0.
 */
static int __init percpu_enable_async(void)
{
        pcpu_async_enabled = true;
        return 0;
}
subsys_initcall(percpu_enable_async);
















































































































































































































































































































































































































































































































































































































































































































































    1 












    1 







    1 
    1 














    1 




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 










    1 


















































































































    2 














    2 
    2 


    1 









    1 





    1 

































    1 











    1 





























































































    1 












    1 




    1 






    1 



    1 









    1 


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/proc/base.c
 *
 *  Copyright (C) 1991, 1992 Linus Torvalds
 *
 *  proc base directory handling functions
 *
 *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
 *  Instead of using magical inumbers to determine the kind of object
 *  we allocate and fill in-core inodes upon lookup. They don't even
 *  go into icache. We cache the reference to task_struct upon lookup too.
 *  Eventually it should become a filesystem in its own. We don't use the
 *  rest of procfs anymore.
 *
 *
 *  Changelog:
 *  17-Jan-2005
 *  Allan Bezerra
 *  Bruna Moreira <bruna.moreira@indt.org.br>
 *  Edjard Mota <edjard.mota@indt.org.br>
 *  Ilias Biris <ilias.biris@indt.org.br>
 *  Mauricio Lin <mauricio.lin@indt.org.br>
 *
 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
 *
 *  A new process specific entry (smaps) included in /proc. It shows the
 *  size of rss for each memory area. The maps entry lacks information
 *  about physical memory size (rss) for each mapped file, i.e.,
 *  rss information for executables and library files.
 *  This additional information is useful for any tools that need to know
 *  about physical memory consumption for a process specific library.
 *
 *  Changelog:
 *  21-Feb-2005
 *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
 *  Pud inclusion in the page table walking.
 *
 *  ChangeLog:
 *  10-Mar-2005
 *  10LE Instituto Nokia de Tecnologia - INdT:
 *  A better way to walk through the page table as suggested by Hugh Dickins.
 *
 *  Simo Piiroinen <simo.piiroinen@nokia.com>:
 *  Smaps information related to shared, private, clean and dirty pages.
 *
 *  Paul Mundt <paul.mundt@nokia.com>:
 *  Overall revision about smaps.
 */

#include <linux/uaccess.h>

#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/namei.h>
#include <linux/mnt_namespace.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/printk.h>
#include <linux/cache.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/audit.h>
#include <linux/poll.h>
#include <linux/nsproxy.h>
#include <linux/oom.h>
#include <linux/elf.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/debug.h>
#include <linux/sched/stat.h>
#include <linux/posix-timers.h>
#include <linux/time_namespace.h>
#include <linux/resctrl.h>
#include <linux/cn_proc.h>
#include <linux/ksm.h>
#include <uapi/linux/lsm.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"

#include "../../lib/kstrtox.h"

/* NOTE:
 *        Implementing inode permission operations in /proc is almost
 *        certainly an error.  Permission checks need to happen during
 *        each system call not at open time.  The reason is that most of
 *        what we wish to check for permissions in /proc varies at runtime.
 *
 *        The classic example of a problem is opening file descriptors
 *        in /proc for a task before it execs a suid executable.
 */

/*
 * Cached link counts, written during init only (__ro_after_init).
 * NOTE(review): presumably these hold the pid_entry_nlink() results for the
 * per-thread (tid) and per-process (tgid) entry tables -- the assignment is
 * not visible in this part of the file, confirm against the init code.
 */
static u8 nlink_tid __ro_after_init;
static u8 nlink_tgid __ro_after_init;

/*
 * One named entry of a pid_entry table.  Instances are normally built with
 * the NOD() macro and its DIR/LNK/REG/ONE/ATTR wrappers.
 */
struct pid_entry {
        /* Entry name. */
        const char *name;
        /* Length of @name; NOD() sets it to sizeof(NAME) - 1. */
        unsigned int len;
        /* File type (S_IFDIR, S_IFLNK, S_IFREG) combined with permission bits. */
        umode_t mode;
        /* Inode operations; the REG/ONE/ATTR wrappers pass NULL here. */
        const struct inode_operations *iop;
        /* File operations; the LNK wrapper passes NULL here. */
        const struct file_operations *fop;
        /* Per-entry payload, e.g. .proc_get_link, .proc_show or .lsmid. */
        union proc_op op;
};

/*
 * NOD() builds a struct pid_entry initializer.  NAME must be a string
 * literal (or array) because its length is taken with sizeof(NAME) - 1.
 */
#define NOD(NAME, MODE, IOP, FOP, OP) {                        \
        .name = (NAME),                                        \
        .len  = sizeof(NAME) - 1,                        \
        .mode = MODE,                                        \
        .iop  = IOP,                                        \
        .fop  = FOP,                                        \
        .op   = OP,                                        \
}

/* DIR: directory entry with the given inode and file operations. */
#define DIR(NAME, MODE, iops, fops)        \
        NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
/* LNK: symlink entry (mode S_IRWXUGO) resolved through get_link. */
#define LNK(NAME, get_link)                                        \
        NOD(NAME, (S_IFLNK|S_IRWXUGO),                                \
                &proc_pid_link_inode_operations, NULL,                \
                { .proc_get_link = get_link } )
/* REG: regular file entry served by the given file operations. */
#define REG(NAME, MODE, fops)                                \
        NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
/* ONE: regular file entry whose content comes from a single show callback. */
#define ONE(NAME, MODE, show)                                \
        NOD(NAME, (S_IFREG|(MODE)),                        \
                NULL, &proc_single_file_operations,        \
                { .proc_show = show } )
/* ATTR: regular file entry served by proc_pid_attr_operations, tagged with LSMID. */
#define ATTR(LSMID, NAME, MODE)                                \
        NOD(NAME, (S_IFREG|(MODE)),                        \
                NULL, &proc_pid_attr_operations,        \
                { .lsmid = LSMID })

/*
 * Compute a link count for a pid_entry table: start from 2 and add one for
 * every entry in @entries (of length @n) whose mode marks it as a directory.
 * NOTE(review): the base of 2 presumably stands for the "." and ".." links,
 * so the returned value includes them.
 */
static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
        unsigned int n)
{
        unsigned int nlink = 2;
        unsigned int idx;

        for (idx = 0; idx < n; idx++)
                nlink += S_ISDIR(entries[idx].mode) ? 1 : 0;

        return nlink;
}

/*
 * Store the root of @task's fs_struct in @root (via get_fs_root()).
 * Returns 0 on success or -ENOENT when the task has no fs_struct.
 * task->fs is inspected under task_lock().
 */
static int get_task_root(struct task_struct *task, struct path *root)
{
        int err = 0;

        task_lock(task);
        if (!task->fs)
                err = -ENOENT;
        else
                get_fs_root(task->fs, root);
        task_unlock(task);

        return err;
}

/*
 * Resolve the "cwd" link: store the working directory of the task behind
 * @dentry in @path (via get_fs_pwd()).
 * Returns 0 on success, -ENOENT if the task is gone or has no fs_struct.
 */
static int proc_cwd_link(struct dentry *dentry, struct path *path)
{
        struct task_struct *tsk = get_proc_task(d_inode(dentry));
        int err = -ENOENT;

        if (!tsk)
                return err;

        /* task->fs may only be looked at under task_lock() */
        task_lock(tsk);
        if (tsk->fs) {
                get_fs_pwd(tsk->fs, path);
                err = 0;
        }
        task_unlock(tsk);
        put_task_struct(tsk);

        return err;
}

/*
 * Resolve the "root" link: store the root directory of the task behind
 * @dentry in @path.  Returns 0 on success, -ENOENT if the task is gone or
 * get_task_root() finds no fs_struct.
 */
static int proc_root_link(struct dentry *dentry, struct path *path)
{
        struct task_struct *tsk = get_proc_task(d_inode(dentry));
        int err;

        if (!tsk)
                return -ENOENT;

        err = get_task_root(tsk, path);
        put_task_struct(tsk);

        return err;
}

/*
 * If the user used setproctitle(), we just get the string from
 * user space at arg_start, and limit it to a maximum of one page.
 *
 * @mm:        address space the title is read from
 * @buf:       user buffer that receives the bytes
 * @count:     maximum number of bytes to copy to @buf
 * @pos:       byte offset into the title at which copying starts
 * @arg_start: address in @mm where the title string begins
 *
 * Returns the number of bytes copied to @buf; 0 when @pos is at or beyond
 * the available data or nothing could be read from @mm; -ENOMEM when the
 * bounce page cannot be allocated; -EFAULT when no byte reached @buf.
 */
static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
                                size_t count, unsigned long pos,
                                unsigned long arg_start)
{
        char *page;
        int ret, got;

        /* Only the first page of the title is ever exposed */
        if (pos >= PAGE_SIZE)
                return 0;

        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        ret = 0;
        /* Pull up to one page starting at arg_start into the bounce buffer */
        got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
        if (got > 0) {
                int len = strnlen(page, got);

                /* Include the NUL character if it was found */
                if (len < got)
                        len++;

                if (len > pos) {
                        /* Clamp to what remains after pos and to the caller's count */
                        len -= pos;
                        if (len > count)
                                len = count;
                        /* copy_to_user() returns the number of bytes NOT copied */
                        len -= copy_to_user(buf, page+pos, len);
                        if (!len)
                                len = -EFAULT;
                        ret = len;
                }
        }
        free_page((unsigned long)page);
        return ret;
}

/*
 * Copy part of the command line stored in @mm to user space.
 *
 * @mm:    address space whose argument area is read
 * @buf:   user buffer that receives the bytes
 * @count: maximum number of bytes to copy
 * @ppos:  file position; it is only read here, not advanced
 *
 * Returns the number of bytes copied; 0 at end of data or when no command
 * line is available; -ENOMEM when the bounce page cannot be allocated;
 * -EFAULT when the very first copy to @buf fails.  In the setproctitle()
 * case the result of get_mm_proctitle() is returned instead.
 */
static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
                              size_t count, loff_t *ppos)
{
        unsigned long arg_start, arg_end, env_start, env_end;
        unsigned long pos, len;
        char *page, c;

        /* Check if process spawned far enough to have cmdline. */
        if (!mm->env_end)
                return 0;

        /* Take a consistent snapshot of the arg/env bounds under arg_lock */
        spin_lock(&mm->arg_lock);
        arg_start = mm->arg_start;
        arg_end = mm->arg_end;
        env_start = mm->env_start;
        env_end = mm->env_end;
        spin_unlock(&mm->arg_lock);

        if (arg_start >= arg_end)
                return 0;

        /*
         * We allow setproctitle() to overwrite the argument
         * strings, and overflow past the original end. But
         * only when it overflows into the environment area.
         */
        if (env_start != arg_end || env_end < env_start)
                env_start = env_end = arg_end;
        len = env_end - arg_start;

        /* We're not going to care if "*ppos" has high bits set */
        pos = *ppos;
        if (pos >= len)
                return 0;
        if (count > len - pos)
                count = len - pos;
        if (!count)
                return 0;

        /*
         * Magical special case: if the argv[] end byte is not
         * zero, the user has overwritten it with setproctitle(3).
         *
         * Possible future enhancement: do this only once when
         * pos is 0, and set a flag in the 'struct file'.
         */
        if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
                return get_mm_proctitle(mm, buf, count, pos, arg_start);

        /*
         * For the non-setproctitle() case we limit things strictly
         * to the [arg_start, arg_end[ range.
         */
        /* From here on pos is an address in @mm, not a file offset */
        pos += arg_start;
        if (pos < arg_start || pos >= arg_end)
                return 0;
        if (count > arg_end - pos)
                count = arg_end - pos;

        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        /* len is reused as the running total of bytes copied to @buf */
        len = 0;
        while (count) {
                int got;
                size_t size = min_t(size_t, PAGE_SIZE, count);

                got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
                if (got <= 0)
                        break;
                /* copy_to_user() returns the number of bytes NOT copied */
                got -= copy_to_user(buf, page, got);
                if (unlikely(!got)) {
                        /* Report -EFAULT only if nothing was copied so far */
                        if (!len)
                                len = -EFAULT;
                        break;
                }
                pos += got;
                buf += got;
                len += got;
                count -= got;
        }

        free_page((unsigned long)page);
        return len;
}

/*
 * Read the command line of @tsk into the user buffer @buf.
 * Takes a temporary reference on the task's mm via get_task_mm() and drops
 * it with mmput().  Returns whatever get_mm_cmdline() returns, or 0 when
 * the task has no mm.
 */
static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
                                size_t count, loff_t *pos)
{
        struct mm_struct *mm = get_task_mm(tsk);
        ssize_t nread = 0;

        if (mm) {
                nread = get_mm_cmdline(mm, buf, count, pos);
                mmput(mm);
        }

        return nread;
}

/*
 * ->read handler for the cmdline file.
 * Returns the number of bytes read (and advances *@pos by that amount),
 * 0 at end of data, -ESRCH if the task no longer exists, or a negative
 * error from get_task_cmdline().
 */
static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
                                     size_t count, loff_t *pos)
{
        struct task_struct *task;
        ssize_t nread;

        BUG_ON(*pos < 0);

        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;

        nread = get_task_cmdline(task, buf, count, pos);
        put_task_struct(task);

        /* Only a successful, non-empty read moves the file position */
        if (nread <= 0)
                return nread;

        *pos += nread;
        return nread;
}

/* File operations for the cmdline file: read-only, seekable. */
static const struct file_operations proc_pid_cmdline_ops = {
        .read        = proc_pid_cmdline_read,
        .llseek        = generic_file_llseek,
};

#ifdef CONFIG_KALLSYMS
/*
 * Show handler for the wchan file (one value per file).
 * Prints the symbol name resolved from get_wchan() when the reader passes
 * the PTRACE_MODE_READ_FSCREDS check and the lookup succeeds; in every
 * other case prints a single '0'.  Always returns 0.
 */
static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
                          struct pid *pid, struct task_struct *task)
{
        char symname[KSYM_NAME_LEN];
        unsigned long wchan;

        if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
                wchan = get_wchan(task);
                if (wchan && !lookup_symbol_name(wchan, symname)) {
                        seq_puts(m, symname);
                        return 0;
                }
        }

        /* No permission, no wait channel, or no symbol for it */
        seq_putc(m, '0');
        return 0;
}
#endif /* CONFIG_KALLSYMS */

/*
 * Take @task's exec_update_lock for reading (killable) and verify that the
 * caller passes the PTRACE_MODE_ATTACH_FSCREDS check.
 * Returns 0 with the lock held; otherwise the error from
 * down_read_killable(), or -EPERM with the lock already released.
 * A successful call must be paired with unlock_trace().
 */
static int lock_trace(struct task_struct *task)
{
        int ret = down_read_killable(&task->signal->exec_update_lock);

        if (ret)
                return ret;

        if (ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS))
                return 0;

        up_read(&task->signal->exec_update_lock);
        return -EPERM;
}

/* Release the read side of @task's exec_update_lock taken by lock_trace(). */
static void unlock_trace(struct task_struct *task)
{
        up_read(&task->signal->exec_update_lock);
}

#ifdef CONFIG_STACKTRACE

/* Maximum number of stack entries captured and printed per read */
#define MAX_STACK_TRACE_DEPTH        64

/*
 * Show handler for the stack file: prints one "[<0>] symbol" line per saved
 * kernel stack entry of @task.
 * Returns 0 on success, -EACCES if the opener lacks CAP_SYS_ADMIN in the
 * initial user namespace, -ENOMEM if the entry buffer cannot be allocated,
 * or the error returned by lock_trace().
 */
static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
                          struct pid *pid, struct task_struct *task)
{
        unsigned long *trace;
        unsigned int depth, idx;
        int ret;

        /*
         * Running the kernel stack unwinder racily against a live task and
         * exposing its output is risky: useful for debugging, but it can
         * leak kernel stack contents to an attacker.  Making it race-free
         * would mean keeping the remote task off the CPU, and the unwinder
         * would still be local attack surface.  Hence: root only.
         */
        if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
                return -EACCES;

        trace = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*trace),
                              GFP_KERNEL);
        if (!trace)
                return -ENOMEM;

        ret = lock_trace(task);
        if (ret)
                goto out_free;

        depth = stack_trace_save_tsk(task, trace, MAX_STACK_TRACE_DEPTH, 0);
        for (idx = 0; idx < depth; idx++)
                seq_printf(m, "[<0>] %pB\n", (void *)trace[idx]);

        unlock_trace(task);

out_free:
        kfree(trace);
        return ret;
}
#endif

#ifdef CONFIG_SCHED_INFO
/*
 * Show handler for /proc/PID/schedstat.
 * Prints three numbers: sum_exec_runtime, run_delay and pcount of @task,
 * or "0 0 0" when sched_info_on() reports that the data is unavailable.
 * Always returns 0.
 */
static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
                              struct pid *pid, struct task_struct *task)
{
        if (unlikely(!sched_info_on())) {
                seq_puts(m, "0 0 0\n");
                return 0;
        }

        seq_printf(m, "%llu %llu %lu\n",
                   (unsigned long long)task->se.sum_exec_runtime,
                   (unsigned long long)task->sched_info.run_delay,
                   task->sched_info.pcount);
        return 0;
}
#endif

#ifdef CONFIG_LATENCYTOP
/*
 * seq_file show handler for the per-task latency statistics.
 * Prints a version header, then one line per used latency record:
 * "count time max" followed by the symbols of its backtrace.
 * Returns 0, or -ESRCH if the task no longer exists.
 */
static int lstats_show_proc(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;
        struct task_struct *task = get_proc_task(inode);
        int slot;

        if (!task)
                return -ESRCH;

        seq_puts(m, "Latency Top version : v0.1\n");
        for (slot = 0; slot < LT_SAVECOUNT; slot++) {
                struct latency_record *rec = &task->latency_record[slot];
                int depth;

                /* A record without a first backtrace entry is unused */
                if (!rec->backtrace[0])
                        continue;

                seq_printf(m, "%i %li %li",
                           rec->count, rec->time, rec->max);
                /* The backtrace ends at the first zero entry */
                for (depth = 0; depth < LT_BACKTRACEDEPTH; depth++) {
                        unsigned long addr = rec->backtrace[depth];

                        if (!addr)
                                break;
                        seq_printf(m, " %ps", (void *)addr);
                }
                seq_putc(m, '\n');
        }

        put_task_struct(task);
        return 0;
}

/*
 * ->open handler: set up a single-record seq_file that calls
 * lstats_show_proc() with @inode as its private data.
 */
static int lstats_open(struct inode *inode, struct file *file)
{
        return single_open(file, lstats_show_proc, inode);
}

/*
 * ->write handler: any write clears the latency tracing data of the task.
 * The written bytes themselves are never looked at.
 * Returns @count, or -ESRCH if the task no longer exists.
 */
static ssize_t lstats_write(struct file *file, const char __user *buf,
                            size_t count, loff_t *offs)
{
        struct task_struct *tsk;

        tsk = get_proc_task(file_inode(file));
        if (tsk) {
                clear_tsk_latency_tracing(tsk);
                put_task_struct(tsk);
                return count;
        }

        return -ESRCH;
}

/*
 * File operations for the latency statistics file: reads go through the
 * seq_file machinery, writes reset the statistics (see lstats_write()).
 */
static const struct file_operations proc_lstats_operations = {
        .open                = lstats_open,
        .read                = seq_read,
        .write                = lstats_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};

#endif

/*
 * Show handler that prints the OOM score of @task as one decimal number.
 * A badness of LONG_MIN is reported as 0; every other value is scaled by
 * the formula below.  Always returns 0.
 */
static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
                          struct pid *pid, struct task_struct *task)
{
        unsigned long totalpages = totalram_pages() + total_swap_pages;
        long badness = oom_badness(task, totalpages);
        unsigned long points;

        /*
         * LONG_MIN is the special case (OOM_SCORE_ADJ_MIN).  For all other
         * values keep the long-exported [0, 2000] scaling that userspace
         * may depend on.
         */
        if (badness == LONG_MIN)
                points = 0;
        else
                points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;

        seq_printf(m, "%lu\n", points);

        return 0;
}

struct limit_names {
        const char *name;
        const char *unit;
};

/* Display name and unit of every resource limit, indexed by RLIMIT_* value. */
static const struct limit_names lnames[RLIM_NLIMITS] = {
        [RLIMIT_CPU]        = { .name = "Max cpu time",          .unit = "seconds" },
        [RLIMIT_FSIZE]      = { .name = "Max file size",         .unit = "bytes" },
        [RLIMIT_DATA]       = { .name = "Max data size",         .unit = "bytes" },
        [RLIMIT_STACK]      = { .name = "Max stack size",        .unit = "bytes" },
        [RLIMIT_CORE]       = { .name = "Max core file size",    .unit = "bytes" },
        [RLIMIT_RSS]        = { .name = "Max resident set",      .unit = "bytes" },
        [RLIMIT_NPROC]      = { .name = "Max processes",         .unit = "processes" },
        [RLIMIT_NOFILE]     = { .name = "Max open files",        .unit = "files" },
        [RLIMIT_MEMLOCK]    = { .name = "Max locked memory",     .unit = "bytes" },
        [RLIMIT_AS]         = { .name = "Max address space",     .unit = "bytes" },
        [RLIMIT_LOCKS]      = { .name = "Max file locks",        .unit = "locks" },
        [RLIMIT_SIGPENDING] = { .name = "Max pending signals",   .unit = "signals" },
        [RLIMIT_MSGQUEUE]   = { .name = "Max msgqueue size",     .unit = "bytes" },
        [RLIMIT_NICE]       = { .name = "Max nice priority",     .unit = NULL },
        [RLIMIT_RTPRIO]     = { .name = "Max realtime priority", .unit = NULL },
        [RLIMIT_RTTIME]     = { .name = "Max realtime timeout",  .unit = "us" },
};

/* Display limits for a process */
static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
                           struct pid *pid, struct task_struct *task)
{
        unsigned int i;
        unsigned long flags;

        struct rlimit rlim[RLIM_NLIMITS];

        if (!lock_task_sighand(task, &flags))
                return 0;
        memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
        unlock_task_sighand(task, &flags);

        /*
         * print the file header
         */
        seq_puts(m, "Limit                     "
                "Soft Limit           "
                "Hard Limit           "
                "Units     \n");

        for (i = 0; i < RLIM_NLIMITS; i++) {
                if (rlim[i].rlim_cur == RLIM_INFINITY)
                        seq_printf(m, "%-25s %-20s ",
                                   lnames[i].name, "unlimited");
                else
                        seq_printf(m, "%-25s %-20lu ",
                                   lnames[i].name, rlim[i].rlim_cur);

                if (rlim[i].rlim_max == RLIM_INFINITY)
                        seq_printf(m, "%-20s ", "unlimited");
                else
                        seq_printf(m, "%-20lu ", rlim[i].rlim_max);

                if (lnames[i].unit)
                        seq_printf(m, "%-10s\n", lnames[i].unit);
                else
                        seq_putc(m, '\n');
        }

        return 0;
}

#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
/*
 * Show handler for the syscall file.
 * Prints "running" when task_current_syscall() fails; the number, stack
 * pointer and instruction pointer when the reported number is negative;
 * otherwise the number, six arguments, stack pointer and instruction
 * pointer.  Returns 0, or the error from lock_trace().
 */
static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
                            struct pid *pid, struct task_struct *task)
{
        struct syscall_info info;
        u64 *argv = &info.data.args[0];
        int err;

        err = lock_trace(task);
        if (err)
                return err;

        if (task_current_syscall(task, &info)) {
                seq_puts(m, "running\n");
        } else if (info.data.nr >= 0) {
                seq_printf(m,
                       "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
                       info.data.nr,
                       argv[0], argv[1], argv[2], argv[3], argv[4], argv[5],
                       info.sp, info.data.instruction_pointer);
        } else {
                seq_printf(m, "%d 0x%llx 0x%llx\n",
                           info.data.nr, info.sp, info.data.instruction_pointer);
        }
        unlock_trace(task);

        return 0;
}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */

/************************************************************************/
/*                       Here the fs part begins                        */
/************************************************************************/

/* permission checks */
/*
 * May the caller look at the file descriptors of the task behind @inode?
 * True when the task still exists and the caller passes the
 * PTRACE_MODE_READ_FSCREDS check against it; false otherwise.
 */
static bool proc_fd_access_allowed(struct inode *inode)
{
        struct task_struct *tsk = get_proc_task(inode);
        bool ok;

        if (!tsk)
                return false;

        ok = ptrace_may_access(tsk, PTRACE_MODE_READ_FSCREDS);
        put_task_struct(tsk);

        return ok;
}

/*
 * ->setattr for proc inodes.
 * Changing the mode is refused with -EPERM.  Any other attribute change is
 * validated with setattr_prepare() and then applied with setattr_copy();
 * both are called with nop_mnt_idmap, @idmap itself is not used.
 * Returns 0 on success or a negative error.
 */
int proc_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                 struct iattr *attr)
{
        int err;

        if (attr->ia_valid & ATTR_MODE)
                return -EPERM;

        err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
        if (!err)
                setattr_copy(&nop_mnt_idmap, d_inode(dentry), attr);

        return err;
}

/*
 * May the current process learn task's sched/cmdline info (for
 * hide_pid_min=1) or euid/egid (for hide_pid_min=2)?
 */
static bool has_pid_permissions(struct proc_fs_info *fs_info,
                                 struct task_struct *task,
                                 enum proc_hidepid hide_pid_min)
{
        /*
         * With hide_pid set to HIDEPID_NOT_PTRACEABLE the shortcuts below
         * are skipped and the ptrace check is always enforced.
         */
        if (fs_info->hide_pid != HIDEPID_NOT_PTRACEABLE) {
                if (fs_info->hide_pid < hide_pid_min)
                        return true;
                if (in_group_p(fs_info->pid_gid))
                        return true;
        }

        /*
         * PTRACE_MODE_READ_FSCREDS tells the ptrace check that it is being
         * made on behalf of a filesystem syscall.
         */
        return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
}


/*
 * ->permission for pid inodes.
 * Returns -ESRCH if the task is gone.  If has_pid_permissions() denies
 * access, returns -ENOENT when hide_pid is HIDEPID_INVISIBLE and -EPERM
 * otherwise.  In all remaining cases defers to generic_permission().
 */
static int proc_pid_permission(struct mnt_idmap *idmap,
                               struct inode *inode, int mask)
{
        struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
        struct task_struct *task = get_proc_task(inode);
        bool allowed;

        if (!task)
                return -ESRCH;

        allowed = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
        put_task_struct(task);

        if (allowed)
                return generic_permission(&nop_mnt_idmap, inode, mask);

        /*
         * Keep getdents(), stat() and open() consistent with each other:
         * a process that may not stat() a file should not see it in
         * procfs at all, hence -ENOENT in the invisible mode.
         */
        return fs_info->hide_pid == HIDEPID_INVISIBLE ? -ENOENT : -EPERM;
}



/* Inode operations that only provide proc_setattr(). */
static const struct inode_operations proc_def_inode_operations = {
        .setattr        = proc_setattr,
};

/*
 * seq_file show handler shared by all ONE() entries.
 * Looks up the task for the inode stored in m->private and forwards to the
 * entry's own proc_show callback.  Returns that callback's result, or
 * -ESRCH if the task no longer exists.
 */
static int proc_single_show(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;
        struct pid *pid = proc_pid(inode);
        struct task_struct *task = get_pid_task(pid, PIDTYPE_PID);
        int ret;

        if (!task)
                return -ESRCH;

        ret = PROC_I(inode)->op.proc_show(m, proc_pid_ns(inode->i_sb),
                                          pid, task);
        put_task_struct(task);

        return ret;
}

/*
 * ->open handler: set up a single-record seq_file that calls
 * proc_single_show() with @inode as its private data.
 */
static int proc_single_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, proc_single_show, inode);
}

/*
 * seq_file-based file operations whose ->open binds proc_single_show()
 * through single_open().
 */
static const struct file_operations proc_single_file_operations = {
        .open                = proc_single_open,
        .read                = seq_read,
        .llseek                = seq_lseek,
        .release        = single_release,
};


/*
 * Obtain the mm of the task behind @inode, checked by mm_access() with
 * @mode | PTRACE_MODE_FSCREDS.
 *
 * Returns ERR_PTR(-ESRCH) when the task is gone, otherwise whatever
 * mm_access() returned (which may be NULL or an ERR_PTR).  For a valid mm,
 * mmgrab() is taken and mmput() is then called, so the caller is left
 * holding only the mmgrab() reference.
 */
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
        struct task_struct *tsk = get_proc_task(inode);
        struct mm_struct *mm;

        if (!tsk)
                return ERR_PTR(-ESRCH);

        mm = mm_access(tsk, mode | PTRACE_MODE_FSCREDS);
        put_task_struct(tsk);

        if (IS_ERR_OR_NULL(mm))
                return mm;

        /* make sure the mm_struct itself cannot be freed ... */
        mmgrab(mm);
        /* ... but do not keep its memory pinned */
        mmput(mm);

        return mm;
}

/*
 * Shared open helper: stash the mm returned by proc_mem_open() in
 * file->private_data.  A NULL mm is stored as-is and counts as success.
 *
 * Returns 0 on success or the error encoded in the returned pointer.
 */
static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
        struct mm_struct *mm = proc_mem_open(inode, mode);

        if (!IS_ERR(mm)) {
                file->private_data = mm;
                return 0;
        }

        return PTR_ERR(mm);
}

/*
 * Open handler requesting PTRACE_MODE_ATTACH access.  FMODE_UNSIGNED_OFFSET
 * is set on the file regardless of whether __mem_open() succeeded.
 */
static int mem_open(struct inode *inode, struct file *file)
{
        int err;

        err = __mem_open(inode, file, PTRACE_MODE_ATTACH);

        /* Negative loff_t values may be passed; out-of-range is caught later */
        file->f_mode |= FMODE_UNSIGNED_OFFSET;

        return err;
}

/*
 * Common worker for mem_read() and mem_write(): move up to @count bytes
 * between the user buffer @buf and the mm stored in file->private_data,
 * starting at remote address *ppos, in chunks of at most PAGE_SIZE that are
 * staged through a kernel page.
 *
 * @write selects the direction: non-zero copies from @buf into the remote
 * mm, zero copies from the remote mm into @buf.
 *
 * Returns the number of bytes transferred; 0 if there is no mm or
 * mmget_not_zero() fails; -ENOMEM if the staging page cannot be allocated;
 * -EIO if access_remote_vm() transfers nothing on the first chunk; -EFAULT
 * if a user copy faults (even after earlier chunks succeeded).  Once the
 * loop has been entered, *ppos is updated to the address reached.
 */
static ssize_t mem_rw(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos, int write)
{
        struct mm_struct *mm = file->private_data;
        unsigned long addr = *ppos;
        ssize_t copied;
        char *page;
        unsigned int flags;

        /* __mem_open() may have stored a NULL mm; treat that as EOF. */
        if (!mm)
                return 0;

        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        copied = 0;
        if (!mmget_not_zero(mm))
                goto free;

        flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);

        while (count > 0) {
                size_t this_len = min_t(size_t, count, PAGE_SIZE);

                /* For writes, stage the user data before touching the mm. */
                if (write && copy_from_user(page, buf, this_len)) {
                        copied = -EFAULT;
                        break;
                }

                this_len = access_remote_vm(mm, addr, page, this_len, flags);
                if (!this_len) {
                        /* Only an error if nothing was transferred so far. */
                        if (!copied)
                                copied = -EIO;
                        break;
                }

                if (!write && copy_to_user(buf, page, this_len)) {
                        copied = -EFAULT;
                        break;
                }

                buf += this_len;
                addr += this_len;
                copied += this_len;
                count -= this_len;
        }
        *ppos = addr;

        mmput(mm);
free:
        free_page((unsigned long) page);
        return copied;
}

/* Read handler: mem_rw() in the read direction. */
static ssize_t mem_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        const int write = 0;

        return mem_rw(file, buf, count, ppos, write);
}

/*
 * Write handler: mem_rw() in the write direction.  The const qualifier is
 * cast away only because mem_rw() shares one buffer parameter for both
 * directions.
 */
static ssize_t mem_write(struct file *file, const char __user *buf,
                         size_t count, loff_t *ppos)
{
        char __user *ubuf = (char __user *)buf;

        return mem_rw(file, ubuf, count, ppos, 1);
}

/*
 * Seek handler that accepts only two whence values: 0 sets the position to
 * @offset, 1 adds @offset to the current position.  Anything else yields
 * -EINVAL.
 *
 * force_successful_syscall_return() is called before the new position is
 * returned.
 */
loff_t mem_lseek(struct file *file, loff_t offset, int orig)
{
        if (orig == 0)
                file->f_pos = offset;
        else if (orig == 1)
                file->f_pos += offset;
        else
                return -EINVAL;

        force_successful_syscall_return();
        return file->f_pos;
}

/* Release handler: mmdrop() the mm stored in file->private_data, if any. */
static int mem_release(struct inode *inode, struct file *file)
{
        struct mm_struct *mm = file->private_data;

        if (!mm)
                return 0;

        mmdrop(mm);
        return 0;
}

/*
 * File operations giving seekable read/write access to a task's address
 * space through the mem_*() handlers.
 */
static const struct file_operations proc_mem_operations = {
        .llseek                = mem_lseek,
        .read                = mem_read,
        .write                = mem_write,
        .open                = mem_open,
        .release        = mem_release,
};

/* Open handler: grab the task's mm with PTRACE_MODE_READ access. */
static int environ_open(struct inode *inode, struct file *file)
{
        const unsigned int mode = PTRACE_MODE_READ;

        return __mem_open(inode, file, mode);
}

/*
 * Read handler for the environment area of the mm saved at open time.
 *
 * *ppos is an offset into [env_start, env_end) of that mm.  Data is copied
 * out in chunks of at most PAGE_SIZE, staged through a kernel page and
 * fetched with access_remote_vm() using FOLL_ANON.
 *
 * Returns the number of bytes copied to @buf; 0 at the end of the area, when
 * there is no mm, when env_end is still unset, or when mmget_not_zero()
 * fails; -ENOMEM if the staging page cannot be allocated; -EFAULT if the
 * user buffer faults.
 *
 * If access_remote_vm() fails after earlier chunks were already delivered,
 * the byte count of those chunks is returned instead of being discarded:
 * they are in the user buffer and *ppos has moved past them.
 */
static ssize_t environ_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        char *page;
        unsigned long src = *ppos;
        int ret = 0;
        struct mm_struct *mm = file->private_data;
        unsigned long env_start, env_end;

        /* Ensure the process spawned far enough to have an environment. */
        if (!mm || !mm->env_end)
                return 0;

        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        if (!mmget_not_zero(mm))
                goto free;

        /* Read both bounds under arg_lock so they belong together. */
        spin_lock(&mm->arg_lock);
        env_start = mm->env_start;
        env_end = mm->env_end;
        spin_unlock(&mm->arg_lock);

        while (count > 0) {
                size_t this_len, max_len;
                int retval;

                if (src >= (env_end - env_start))
                        break;

                this_len = env_end - (env_start + src);

                max_len = min_t(size_t, PAGE_SIZE, count);
                this_len = min(max_len, this_len);

                retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);

                if (retval <= 0) {
                        /*
                         * Keep the count of bytes already handed to the
                         * caller; only report this result when nothing
                         * has been read yet.
                         */
                        if (!ret)
                                ret = retval;
                        break;
                }

                if (copy_to_user(buf, page, retval)) {
                        ret = -EFAULT;
                        break;
                }

                ret += retval;
                src += retval;
                buf += retval;
                count -= retval;
        }
        *ppos = src;
        mmput(mm);

free:
        free_page((unsigned long) page);
        return ret;
}

/*
 * Read-only file operations for a task's environment area.  ->release is
 * shared with the mem file: mem_release() drops the mm reference that
 * proc_mem_open() took at open time.
 */
static const struct file_operations proc_environ_operations = {
        .open                = environ_open,
        .read                = environ_read,
        .llseek                = generic_file_llseek,
        .release        = mem_release,
};

/* Open handler: grab the task's mm with PTRACE_MODE_READ_FSCREDS access. */
static int auxv_open(struct inode *inode, struct file *file)
{
        const unsigned int mode = PTRACE_MODE_READ_FSCREDS;

        return __mem_open(inode, file, mode);
}

/*
 * Read handler exposing mm->saved_auxv.  The vector is walked two words at
 * a time until a zero (AT_NULL) is found in the first word of a pair; that
 * terminating pair is included in the readable length.
 *
 * Returns 0 when no mm was stored at open time.
 */
static ssize_t auxv_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
{
        struct mm_struct *mm = file->private_data;
        unsigned int nwords;

        if (!mm)
                return 0;

        /* Stop on the pair whose first word is AT_NULL ... */
        for (nwords = 0; mm->saved_auxv[nwords] != 0; nwords += 2)
                ;
        /* ... and count that pair as well. */
        nwords += 2;

        return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
                                       nwords * sizeof(mm->saved_auxv[0]));
}

/*
 * Read-only file operations exposing mm->saved_auxv; mem_release() drops the
 * mm reference taken at open time.
 */
static const struct file_operations proc_auxv_operations = {
        .open                = auxv_open,
        .read                = auxv_read,
        .llseek                = generic_file_llseek,
        .release        = mem_release,
};

/*
 * Read handler for the legacy oom_adj value.  The task's oom_score_adj is
 * scaled back linearly onto the oom_adj range, with OOM_SCORE_ADJ_MAX
 * mapping exactly to OOM_ADJUST_MAX, and the result is capped at
 * OOM_ADJUST_MAX.
 *
 * Returns -ESRCH if the task is gone.
 */
static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
{
        struct task_struct *tsk = get_proc_task(file_inode(file));
        char text[PROC_NUMBUF];
        int oom_adj;
        size_t n;

        if (!tsk)
                return -ESRCH;

        oom_adj = (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) ?
                        OOM_ADJUST_MAX :
                        (tsk->signal->oom_score_adj * -OOM_DISABLE) /
                                OOM_SCORE_ADJ_MAX;
        put_task_struct(tsk);

        if (oom_adj > OOM_ADJUST_MAX)
                oom_adj = OOM_ADJUST_MAX;

        n = snprintf(text, sizeof(text), "%d\n", oom_adj);
        return simple_read_from_buffer(buf, count, ppos, text, n);
}

/*
 * Apply a new oom_score_adj value to the task behind @file.
 *
 * @oom_adj is already on the oom_score_adj scale.  @legacy is true when the
 * request arrived through /proc/pid/oom_adj; it selects which privilege
 * check is used, triggers a one-time deprecation warning, and prevents
 * oom_score_adj_min from being updated.
 *
 * Without CAP_SYS_RESOURCE the value may not go below the task's current
 * oom_score_adj (legacy) or below its oom_score_adj_min (non-legacy).
 *
 * When the task is not in a vfork and its mm has MMF_MULTIPROCESS set, the
 * same value is also applied to every other process sharing that mm, except
 * kernel threads, the global init, and processes with vfork_done set.
 *
 * The whole update runs under oom_adj_mutex.
 *
 * Returns 0 on success, -ESRCH if the task is gone, or -EACCES if the
 * capability check fails.
 */
static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
{
        struct mm_struct *mm = NULL;
        struct task_struct *task;
        int err = 0;

        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;

        mutex_lock(&oom_adj_mutex);
        if (legacy) {
                if (oom_adj < task->signal->oom_score_adj &&
                                !capable(CAP_SYS_RESOURCE)) {
                        err = -EACCES;
                        goto err_unlock;
                }
                /*
                 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
                 * /proc/pid/oom_score_adj instead.
                 */
                pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
                          current->comm, task_pid_nr(current), task_pid_nr(task),
                          task_pid_nr(task));
        } else {
                if ((short)oom_adj < task->signal->oom_score_adj_min &&
                                !capable(CAP_SYS_RESOURCE)) {
                        err = -EACCES;
                        goto err_unlock;
                }
        }

        /*
         * Make sure we will check other processes sharing the mm if this is
         * not vfork which wants its own oom_score_adj.
         * pin the mm so it doesn't go away and get reused after task_unlock
         */
        if (!task->vfork_done) {
                struct task_struct *p = find_lock_task_mm(task);

                if (p) {
                        if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
                                mm = p->mm;
                                mmgrab(mm);
                        }
                        task_unlock(p);
                }
        }

        task->signal->oom_score_adj = oom_adj;
        /* Only a privileged, non-legacy writer moves the lower bound. */
        if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
                task->signal->oom_score_adj_min = (short)oom_adj;
        trace_oom_score_adj_update(task);

        /* mm is only set when other processes may share it (see above). */
        if (mm) {
                struct task_struct *p;

                rcu_read_lock();
                for_each_process(p) {
                        /* The target's own thread group was updated above. */
                        if (same_thread_group(task, p))
                                continue;

                        /* do not touch kernel threads or the global init */
                        if (p->flags & PF_KTHREAD || is_global_init(p))
                                continue;

                        task_lock(p);
                        if (!p->vfork_done && process_shares_mm(p, mm)) {
                                p->signal->oom_score_adj = oom_adj;
                                if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
                                        p->signal->oom_score_adj_min = (short)oom_adj;
                        }
                        task_unlock(p);
                }
                rcu_read_unlock();
                mmdrop(mm);
        }
err_unlock:
        mutex_unlock(&oom_adj_mutex);
        put_task_struct(task);
        return err;
}

/*
 * /proc/pid/oom_adj is kept only for backwards compatibility with previous
 * kernels; it cannot be removed because existing userspace binaries use it.
 * The effective policy is defined by oom_score_adj, which uses a different
 * scale: oom_adj grew exponentially while oom_score_adj grows linearly.
 * A value written here is simply mapped linearly onto oom_score_adj, and a
 * process that becomes oom disabled via oom_adj is still oom disabled with
 * this implementation.
 *
 * Returns the number of bytes consumed (capped at the buffer size) or a
 * negative errno.
 */
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
{
        char text[PROC_NUMBUF] = {};
        int oom_adj;
        int err;

        /* Leave room for the terminating NUL already in the zeroed buffer. */
        if (count >= sizeof(text))
                count = sizeof(text) - 1;

        if (copy_from_user(text, buf, count)) {
                err = -EFAULT;
                goto done;
        }

        err = kstrtoint(strstrip(text), 0, &oom_adj);
        if (err)
                goto done;

        /* OOM_DISABLE is accepted even though it lies outside the range. */
        if (oom_adj != OOM_DISABLE &&
            (oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX)) {
                err = -EINVAL;
                goto done;
        }

        /*
         * Scale onto the oom_score_adj range, making sure that the maximum
         * value is always attainable.
         */
        oom_adj = (oom_adj == OOM_ADJUST_MAX) ?
                        OOM_SCORE_ADJ_MAX :
                        (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;

        err = __set_oom_adj(file, oom_adj, true);
done:
        return err < 0 ? err : count;
}

/* File operations for the legacy oom_adj interface (see oom_adj_write()). */
static const struct file_operations proc_oom_adj_operations = {
        .read                = oom_adj_read,
        .write                = oom_adj_write,
        .llseek                = generic_file_llseek,
};

/*
 * Read handler: print the task's current oom_score_adj followed by a
 * newline.  Returns -ESRCH if the task is gone.
 */
static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
                                        size_t count, loff_t *ppos)
{
        struct task_struct *tsk = get_proc_task(file_inode(file));
        char text[PROC_NUMBUF];
        short value;
        size_t n;

        if (!tsk)
                return -ESRCH;

        value = tsk->signal->oom_score_adj;
        put_task_struct(tsk);

        n = snprintf(text, sizeof(text), "%hd\n", value);
        return simple_read_from_buffer(buf, count, ppos, text, n);
}

/*
 * Write handler: parse an integer in [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX]
 * and apply it through __set_oom_adj() in non-legacy mode.
 *
 * Returns the number of bytes consumed (capped at the buffer size) or a
 * negative errno.
 */
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                                        size_t count, loff_t *ppos)
{
        char text[PROC_NUMBUF] = {};
        int oom_score_adj;
        int err;

        /* Leave room for the terminating NUL already in the zeroed buffer. */
        if (count >= sizeof(text))
                count = sizeof(text) - 1;

        if (copy_from_user(text, buf, count)) {
                err = -EFAULT;
                goto done;
        }

        err = kstrtoint(strstrip(text), 0, &oom_score_adj);
        if (err)
                goto done;

        if (oom_score_adj > OOM_SCORE_ADJ_MAX ||
            oom_score_adj < OOM_SCORE_ADJ_MIN) {
                err = -EINVAL;
                goto done;
        }

        err = __set_oom_adj(file, oom_score_adj, false);
done:
        return err < 0 ? err : count;
}

/* File operations for reading and writing a task's oom_score_adj. */
static const struct file_operations proc_oom_score_adj_operations = {
        .read                = oom_score_adj_read,
        .write                = oom_score_adj_write,
        .llseek                = default_llseek,
};

#ifdef CONFIG_AUDIT
#define TMPBUFLEN 11
/*
 * Read handler: print the task's audit loginuid, translated into the user
 * namespace of the opener's credentials.  Returns -ESRCH if the task is
 * gone.
 */
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
                                  size_t count, loff_t *ppos)
{
        struct task_struct *tsk = get_proc_task(file_inode(file));
        char text[TMPBUFLEN];
        ssize_t n;

        if (!tsk)
                return -ESRCH;

        n = scnprintf(text, TMPBUFLEN, "%u",
                      from_kuid(file->f_cred->user_ns,
                                audit_get_loginuid(tsk)));
        put_task_struct(tsk);

        return simple_read_from_buffer(buf, count, ppos, text, n);
}

/*
 * Write handler: set the caller's own audit loginuid.
 *
 * Only the task that the inode refers to may write, kernel threads are
 * refused, and the write must start at offset 0.  The value is parsed as a
 * decimal u32; AUDIT_UID_UNSET explicitly clears the loginuid, any other
 * value must map to a valid kuid in the opener's user namespace.
 *
 * Returns @count on success or a negative errno.
 */
static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
                                   size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(file);
        kuid_t kloginuid;
        uid_t loginuid;
        bool is_self;
        int rv;

        /* Kernel threads may not write their own loginuid */
        if (current->flags & PF_KTHREAD)
                return -EPERM;

        rcu_read_lock();
        is_self = (pid_task(proc_pid(inode), PIDTYPE_PID) == current);
        rcu_read_unlock();
        if (!is_self)
                return -EPERM;

        /* Partial writes are not supported. */
        if (*ppos != 0)
                return -EINVAL;

        rv = kstrtou32_from_user(buf, count, 10, &loginuid);
        if (rv < 0)
                return rv;

        if (loginuid != AUDIT_UID_UNSET) {
                kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
                if (!uid_valid(kloginuid))
                        return -EINVAL;
        } else {
                /* userspace is explicitly asking to UNSET the loginuid */
                kloginuid = INVALID_UID;
        }

        rv = audit_set_loginuid(kloginuid);
        return rv < 0 ? rv : count;
}

/* File operations for reading and setting the audit loginuid. */
static const struct file_operations proc_loginuid_operations = {
        .read                = proc_loginuid_read,
        .write                = proc_loginuid_write,
        .llseek                = generic_file_llseek,
};

/*
 * Read handler: print the task's audit session id as an unsigned decimal.
 * Returns -ESRCH if the task is gone.
 */
static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
                                  size_t count, loff_t *ppos)
{
        struct task_struct *tsk = get_proc_task(file_inode(file));
        char text[TMPBUFLEN];
        ssize_t n;

        if (!tsk)
                return -ESRCH;

        n = scnprintf(text, TMPBUFLEN, "%u", audit_get_sessionid(tsk));
        put_task_struct(tsk);

        return simple_read_from_buffer(buf, count, ppos, text, n);
}

/* Read-only file operations for the audit session id. */
static const struct file_operations proc_sessionid_operations = {
        .read                = proc_sessionid_read,
        .llseek                = generic_file_llseek,
};
#endif

#ifdef CONFIG_FAULT_INJECTION
/*
 * Read handler: print the task's make_it_fail value followed by a newline.
 * Returns -ESRCH if the task is gone.
 */
static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
                                      size_t count, loff_t *ppos)
{
        struct task_struct *tsk = get_proc_task(file_inode(file));
        char text[PROC_NUMBUF];
        int value;
        size_t n;

        if (!tsk)
                return -ESRCH;

        value = tsk->make_it_fail;
        put_task_struct(tsk);

        n = snprintf(text, sizeof(text), "%i\n", value);
        return simple_read_from_buffer(buf, count, ppos, text, n);
}

/*
 * Write handler: set the task's make_it_fail to 0 or 1.  Requires
 * CAP_SYS_RESOURCE.
 *
 * Returns the number of bytes consumed (capped at the buffer size), -EPERM
 * without the capability, -EFAULT on a bad user buffer, the kstrtoint()
 * error on unparsable input, -EINVAL for any value other than 0 or 1, or
 * -ESRCH if the task is gone.
 */
static ssize_t proc_fault_inject_write(struct file * file,
                        const char __user * buf, size_t count, loff_t *ppos)
{
        char text[PROC_NUMBUF] = {};
        struct task_struct *tsk;
        int value;
        int rv;

        if (!capable(CAP_SYS_RESOURCE))
                return -EPERM;

        /* Leave room for the terminating NUL already in the zeroed buffer. */
        if (count >= sizeof(text))
                count = sizeof(text) - 1;
        if (copy_from_user(text, buf, count))
                return -EFAULT;

        rv = kstrtoint(strstrip(text), 0, &value);
        if (rv < 0)
                return rv;
        if (value != 0 && value != 1)
                return -EINVAL;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        tsk->make_it_fail = value;
        put_task_struct(tsk);

        return count;
}

/* File operations for reading and writing task->make_it_fail. */
static const struct file_operations proc_fault_inject_operations = {
        .read                = proc_fault_inject_read,
        .write                = proc_fault_inject_write,
        .llseek                = generic_file_llseek,
};

/*
 * Write handler: parse an unsigned integer (base auto-detected) and store it
 * in the task's fail_nth.
 *
 * Returns @count on success, the parse error from kstrtouint_from_user(),
 * or -ESRCH if the task is gone.
 */
static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
                                   size_t count, loff_t *ppos)
{
        struct task_struct *tsk;
        unsigned int nth;
        int rc;

        rc = kstrtouint_from_user(buf, count, 0, &nth);
        if (rc)
                return rc;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        tsk->fail_nth = nth;
        put_task_struct(tsk);

        return count;
}

/*
 * Read handler: print the task's fail_nth followed by a newline.
 * Returns -ESRCH if the task is gone.
 */
static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
                                  size_t count, loff_t *ppos)
{
        struct task_struct *tsk = get_proc_task(file_inode(file));
        char text[PROC_NUMBUF];
        ssize_t n;

        if (!tsk)
                return -ESRCH;

        n = snprintf(text, sizeof(text), "%u\n", tsk->fail_nth);
        put_task_struct(tsk);

        return simple_read_from_buffer(buf, count, ppos, text, n);
}

/*
 * File operations for reading and writing task->fail_nth.  No ->llseek is
 * set here.
 */
static const struct file_operations proc_fail_nth_operations = {
        .read                = proc_fail_nth_read,
        .write                = proc_fail_nth_write,
};
#endif


#ifdef CONFIG_SCHED_DEBUG
/*
 * seq_file show callback that prints the per-task scheduling fields via
 * proc_sched_show_task().  Returns -ESRCH if the task is gone.
 */
static int sched_show(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;
        struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
        struct task_struct *tsk = get_proc_task(inode);

        if (!tsk)
                return -ESRCH;

        proc_sched_show_task(tsk, ns, m);
        put_task_struct(tsk);

        return 0;
}

/*
 * Write handler: any write calls proc_sched_set_task() on the task; the
 * written data itself is never examined.
 *
 * Returns @count, or -ESRCH if the task is gone.
 */
static ssize_t
sched_write(struct file *file, const char __user *buf,
            size_t count, loff_t *offset)
{
        struct task_struct *tsk = get_proc_task(file_inode(file));

        if (!tsk)
                return -ESRCH;

        proc_sched_set_task(tsk);
        put_task_struct(tsk);

        return count;
}

/* Open handler: attach sched_show() to @filp with @inode as its data. */
static int sched_open(struct inode *inode, struct file *filp)
{
        int rc = single_open(filp, sched_show, inode);

        return rc;
}

/*
 * seq_file-based file operations: reads are produced by sched_show(), writes
 * are handled by sched_write().
 */
static const struct file_operations proc_pid_sched_operations = {
        .open                = sched_open,
        .read                = seq_read,
        .write                = sched_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};

#endif

#ifdef CONFIG_SCHED_AUTOGROUP
/*
 * seq_file show callback that prints autogroup information via
 * proc_sched_autogroup_show_task().  Returns -ESRCH if the task is gone.
 */
static int sched_autogroup_show(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;
        struct task_struct *tsk = get_proc_task(inode);

        if (!tsk)
                return -ESRCH;

        proc_sched_autogroup_show_task(tsk, m);
        put_task_struct(tsk);

        return 0;
}

/*
 * Write handler: parse an integer nice value and apply it with
 * proc_sched_autogroup_set_nice().
 *
 * Returns the number of bytes consumed (capped at the buffer size), -EFAULT
 * on a bad user buffer, the kstrtoint() error on unparsable input, -ESRCH
 * if the task is gone, or the non-zero result of
 * proc_sched_autogroup_set_nice().
 */
static ssize_t
sched_autogroup_write(struct file *file, const char __user *buf,
            size_t count, loff_t *offset)
{
        char text[PROC_NUMBUF] = {};
        struct task_struct *tsk;
        int nice;
        int err;

        /* Leave room for the terminating NUL already in the zeroed buffer. */
        if (count >= sizeof(text))
                count = sizeof(text) - 1;
        if (copy_from_user(text, buf, count))
                return -EFAULT;

        err = kstrtoint(strstrip(text), 0, &nice);
        if (err < 0)
                return err;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        err = proc_sched_autogroup_set_nice(tsk, nice);
        put_task_struct(tsk);

        if (err)
                return err;
        return count;
}

/*
 * Open handler: attach sched_autogroup_show() to @filp.  On success
 * single_open() stores @inode in the seq_file's private pointer, which is
 * where sched_autogroup_show() reads it from.
 */
static int sched_autogroup_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, sched_autogroup_show, inode);
}

/*
 * seq_file-based file operations: reads are produced by
 * sched_autogroup_show(), writes are handled by sched_autogroup_write().
 */
static const struct file_operations proc_pid_sched_autogroup_operations = {
        .open                = sched_autogroup_open,
        .read                = seq_read,
        .write                = sched_autogroup_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};

#endif /* CONFIG_SCHED_AUTOGROUP */

#ifdef CONFIG_TIME_NS
/*
 * seq_file show callback that prints the task's time namespace offsets via
 * proc_timens_show_offsets().  The task is looked up from the inode of
 * m->file.  Returns -ESRCH if the task is gone.
 */
static int timens_offsets_show(struct seq_file *m, void *v)
{
        struct task_struct *tsk = get_proc_task(file_inode(m->file));

        if (!tsk)
                return -ESRCH;

        proc_timens_show_offsets(tsk, m);
        put_task_struct(tsk);

        return 0;
}

/*
 * Write handler: parse up to ARRAY_SIZE(offsets) newline-separated entries
 * of the form "<clock> <seconds> <nanoseconds>" and pass them to
 * proc_timens_set_offset() for the task behind @file.
 *
 * <clock> is "monotonic" or "boottime", or the stringified value of
 * CLOCK_MONOTONIC or CLOCK_BOOTTIME.  <nanoseconds> must be below
 * NSEC_PER_SEC.  The write must start at offset 0 and be shorter than
 * PAGE_SIZE.
 *
 * Returns the number of bytes consumed, -EINVAL for a bad offset/size or
 * malformed input, the error from memdup_user_nul(), -ESRCH if the task is
 * gone, or the non-zero result of proc_timens_set_offset().
 */
static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
                                    size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(file);
        struct proc_timens_offset offsets[2];
        char *kbuf = NULL, *pos, *next_line;
        struct task_struct *p;
        int ret, noffsets;

        /* Only allow < page size writes at the beginning of the file */
        if ((*ppos != 0) || (count >= PAGE_SIZE))
                return -EINVAL;

        /* Slurp in the user data */
        kbuf = memdup_user_nul(buf, count);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        /* Parse the user data */
        ret = -EINVAL;
        noffsets = 0;
        for (pos = kbuf; pos; pos = next_line) {
                struct proc_timens_offset *off = &offsets[noffsets];
                char clock[10];
                int err;

                /* Find the end of line and ensure we don't look past it */
                next_line = strchr(pos, '\n');
                if (next_line) {
                        *next_line = '\0';
                        next_line++;
                        /* A trailing newline does not start another entry. */
                        if (*next_line == '\0')
                                next_line = NULL;
                }

                err = sscanf(pos, "%9s %lld %lu", clock,
                                &off->val.tv_sec, &off->val.tv_nsec);
                if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
                        goto out;

                clock[sizeof(clock) - 1] = 0;
                if (strcmp(clock, "monotonic") == 0 ||
                    strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
                        off->clockid = CLOCK_MONOTONIC;
                else if (strcmp(clock, "boottime") == 0 ||
                         strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
                        off->clockid = CLOCK_BOOTTIME;
                else
                        goto out;

                noffsets++;
                /*
                 * The array is full: stop here and, if input remains,
                 * report only the bytes parsed so far as consumed.
                 */
                if (noffsets == ARRAY_SIZE(offsets)) {
                        if (next_line)
                                count = next_line - kbuf;
                        break;
                }
        }

        ret = -ESRCH;
        p = get_proc_task(inode);
        if (!p)
                goto out;
        ret = proc_timens_set_offset(file, p, offsets, noffsets);
        put_task_struct(p);
        if (ret)
                goto out;

        ret = count;
out:
        kfree(kbuf);
        return ret;
}

/* Open handler: attach timens_offsets_show() to @filp with @inode as its data. */
static int timens_offsets_open(struct inode *inode, struct file *filp)
{
        int rc = single_open(filp, timens_offsets_show, inode);

        return rc;
}

/*
 * seq_file-based file operations: reads are produced by
 * timens_offsets_show(), writes are handled by timens_offsets_write().
 */
static const struct file_operations proc_timens_offsets_operations = {
        .open                = timens_offsets_open,
        .read                = seq_read,
        .write                = timens_offsets_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};
#endif /* CONFIG_TIME_NS */

/*
 * Write handler: set the task's comm from the user buffer.  At most
 * TASK_COMM_LEN - 1 bytes are taken; anything beyond that is silently
 * ignored.  Only a task in the writer's own thread group may be renamed.
 *
 * Returns @count on success, -EFAULT on a bad user buffer, -ESRCH if the
 * task is gone, or -EINVAL if the task is in a different thread group.
 */
static ssize_t comm_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *offset)
{
        char name[TASK_COMM_LEN] = {};
        const size_t maxlen = sizeof(name) - 1;
        size_t copy_len = count > maxlen ? maxlen : count;
        struct task_struct *tsk;

        if (copy_from_user(name, buf, copy_len))
                return -EFAULT;

        tsk = get_proc_task(file_inode(file));
        if (!tsk)
                return -ESRCH;

        if (!same_thread_group(current, tsk)) {
                put_task_struct(tsk);
                return -EINVAL;
        }

        set_task_comm(tsk, name);
        proc_comm_connector(tsk);
        put_task_struct(tsk);

        return count;
}

/*
 * seq_file show callback: print the task's name via proc_task_name()
 * followed by a newline.  Returns -ESRCH if the task is gone.
 */
static int comm_show(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;
        struct task_struct *tsk = get_proc_task(inode);

        if (!tsk)
                return -ESRCH;

        proc_task_name(m, tsk, false);
        seq_putc(m, '\n');
        put_task_struct(tsk);

        return 0;
}

/* Open handler: attach comm_show() to @filp with @inode as its data. */
static int comm_open(struct inode *inode, struct file *filp)
{
        int rc = single_open(filp, comm_show, inode);

        return rc;
}

/*
 * seq_file-based file operations: reads are produced by comm_show(), writes
 * are handled by comm_write().
 */
static const struct file_operations proc_pid_set_comm_operations = {
        .open                = comm_open,
        .read                = seq_read,
        .write                = comm_write,
        .llseek                = seq_lseek,
        .release        = single_release,
};

/*
 * Resolve the path of the task's executable file into @exe_path.
 *
 * On success the path carries its own reference (path_get()) and 0 is
 * returned.  Returns -ENOENT if the task is gone or has no exe file.
 */
static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
{
        struct task_struct *tsk = get_proc_task(d_inode(dentry));
        struct file *exe;

        if (!tsk)
                return -ENOENT;

        exe = get_task_exe_file(tsk);
        put_task_struct(tsk);
        if (!exe)
                return -ENOENT;

        /* Take a path reference before the file reference is dropped. */
        *exe_path = exe->f_path;
        path_get(&exe->f_path);
        fput(exe);

        return 0;
}

/*
 * get_link handler: resolve the target through the inode's proc_get_link()
 * callback and jump to it with nd_jump_link().
 *
 * Always returns an ERR_PTR()-encoded status: -ECHILD when called without a
 * dentry, -EACCES when proc_fd_access_allowed() refuses, the callback's
 * error, or the result of nd_jump_link().
 */
static const char *proc_pid_get_link(struct dentry *dentry,
                                     struct inode *inode,
                                     struct delayed_call *done)
{
        struct path path;
        int error;

        if (!dentry)
                return ERR_PTR(-ECHILD);

        /* Is the caller allowed to snoop on the task's file descriptors? */
        if (!proc_fd_access_allowed(inode))
                return ERR_PTR(-EACCES);

        error = PROC_I(inode)->op.proc_get_link(dentry, &path);
        if (error)
                return ERR_PTR(error);

        error = nd_jump_link(&path);
        return ERR_PTR(error);
}

/*
 * Render @path with d_path() into a temporary PATH_MAX buffer and copy at
 * most @buflen bytes of it to the user buffer.  No terminating NUL is
 * copied.
 *
 * Returns the number of bytes copied, -ENOMEM if the temporary buffer
 * cannot be allocated, the d_path() error, or -EFAULT if the user copy
 * faults.
 */
static int do_proc_readlink(const struct path *path, char __user *buffer, int buflen)
{
        char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
        char *pathname;
        int len;

        if (!tmp)
                return -ENOMEM;

        pathname = d_path(path, tmp, PATH_MAX);
        if (IS_ERR(pathname)) {
                len = PTR_ERR(pathname);
        } else {
                /* The string ends at the last byte of tmp; skip its NUL. */
                len = tmp + PATH_MAX - 1 - pathname;
                if (len > buflen)
                        len = buflen;
                if (copy_to_user(buffer, pathname, len))
                        len = -EFAULT;
        }

        kfree(tmp);
        return len;
}

/*
 * readlink handler: resolve the target through the inode's proc_get_link()
 * callback and copy its textual path to the user buffer.
 *
 * Returns the byte count from do_proc_readlink(), -EACCES when
 * proc_fd_access_allowed() refuses, or the callback's error.
 */
static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
{
        struct inode *inode = d_inode(dentry);
        struct path path;
        int error;

        /* Is the caller allowed to snoop on the task's file descriptors? */
        if (!proc_fd_access_allowed(inode))
                return -EACCES;

        error = PROC_I(inode)->op.proc_get_link(dentry, &path);
        if (error)
                return error;

        error = do_proc_readlink(&path, buffer, buflen);
        path_put(&path);

        return error;
}

/*
 * Inode operations for procfs links whose target is supplied by the
 * per-inode proc_get_link() callback.
 */
const struct inode_operations proc_pid_link_inode_operations = {
        .readlink        = proc_pid_readlink,
        .get_link        = proc_pid_get_link,
        .setattr        = proc_setattr,
};


/* building an inode */

/*
 * Compute the uid/gid that should own a procfs file of @task and store them
 * in *@ruid and *@rgid.
 *
 * Kernel threads are always owned by global root.  Other tasks default to
 * their effective uid/gid.  Unless @mode is exactly
 * S_IFDIR|S_IRUGO|S_IXUGO, that default is replaced by root of the mm's
 * user namespace (falling back to global root) when the mm is not
 * SUID_DUMP_USER, and by global root when the task has no mm.
 */
void task_dump_owner(struct task_struct *task, umode_t mode,
                     kuid_t *ruid, kgid_t *rgid)
{
        /* Depending on the state of dumpable compute who should own a
         * proc file for a task.
         */
        const struct cred *cred;
        kuid_t uid;
        kgid_t gid;

        if (unlikely(task->flags & PF_KTHREAD)) {
                *ruid = GLOBAL_ROOT_UID;
                *rgid = GLOBAL_ROOT_GID;
                return;
        }

        /* Default to the tasks effective ownership */
        rcu_read_lock();
        cred = __task_cred(task);
        uid = cred->euid;
        gid = cred->egid;
        rcu_read_unlock();

        /*
         * Before the /proc/pid/status file was created the only way to read
         * the effective uid of a /process was to stat /proc/pid.  Reading
         * /proc/pid/status is slow enough that procps and other packages
         * kept stating /proc/pid.  To keep the rules in /proc simple I have
         * made this apply to all per process world readable and executable
         * directories.
         */
        if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
                struct mm_struct *mm;
                /* task->mm is read under task_lock(). */
                task_lock(task);
                mm = task->mm;
                /* Make non-dumpable tasks owned by some root */
                if (mm) {
                        if (get_dumpable(mm) != SUID_DUMP_USER) {
                                struct user_namespace *user_ns = mm->user_ns;

                                /* uid/gid 0 may have no mapping in user_ns. */
                                uid = make_kuid(user_ns, 0);
                                if (!uid_valid(uid))
                                        uid = GLOBAL_ROOT_UID;

                                gid = make_kgid(user_ns, 0);
                                if (!gid_valid(gid))
                                        gid = GLOBAL_ROOT_GID;
                        }
                } else {
                        uid = GLOBAL_ROOT_UID;
                        gid = GLOBAL_ROOT_GID;
                }
                task_unlock(task);
        }
        *ruid = uid;
        *rgid = gid;
}

/*
 * Unlink a proc inode from its pid's inode list.
 *
 * Only directory inodes are unlinked; for any other inode type this is
 * a no-op.  The removal is done under pid->lock.
 */
void proc_pid_evict_inode(struct proc_inode *ei)
{
        struct pid *owner;

        if (!S_ISDIR(ei->vfs_inode.i_mode))
                return;

        owner = ei->pid;
        spin_lock(&owner->lock);
        hlist_del_init_rcu(&ei->sibling_inodes);
        spin_unlock(&owner->lock);
}

/*
 * Allocate a proc inode on @sb that represents @task.
 *
 * The inode gets @mode, a fresh inode number, initialised timestamps and
 * the default proc inode operations.  A reference to the task's struct pid
 * is taken and stored in the proc inode; ownership and the security label
 * are then derived from the task.
 *
 * Returns the new inode, or NULL if either the inode allocation or the
 * pid lookup fails (in the latter case the inode is dropped again).
 */
struct inode *proc_pid_make_inode(struct super_block *sb,
                                  struct task_struct *task, umode_t mode)
{
        struct inode *inode = new_inode(sb);
        struct proc_inode *ei;
        struct pid *pid;

        if (!inode)
                return NULL;

        /* Fields common to every per-task inode */
        ei = PROC_I(inode);
        inode->i_mode = mode;
        inode->i_ino = get_next_ino();
        simple_inode_init_ts(inode);
        inode->i_op = &proc_def_inode_operations;

        /* Pin the task's pid; without it the inode is useless */
        pid = get_task_pid(task, PIDTYPE_PID);
        if (!pid) {
                iput(inode);
                return NULL;
        }
        ei->pid = pid;

        task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
        security_task_to_inode(task, inode);

        return inode;
}

/*
 * Create an inode and add it to @pid->inodes, so that the task can
 * invalidate the inode's dentry before it is released.
 *
 * This helper is used for creating dir-type entries under '/proc' and
 * '/proc/<tgid>/task'. Other entries (e.g. fd, stat) under '/proc/<tgid>'
 * can be released by invalidating the '/proc/<tgid>' dentry.
 * In theory, dentries under '/proc/<tgid>/task' could also be released by
 * invalidating the '/proc/<tgid>' dentry; they are tracked separately to
 * handle a single thread exiting: each thread should invalidate its own
 * '/proc/<tgid>/task/<pid>' dentry before it is released.
 *
 * Returns the new inode, or NULL if proc_pid_make_inode() fails.
 */
static struct inode *proc_pid_make_base_inode(struct super_block *sb,
                                struct task_struct *task, umode_t mode)
{
        struct inode *inode = proc_pid_make_inode(sb, task, mode);
        struct proc_inode *ei;

        if (inode) {
                /* Link the inode on its pid's list, under pid->lock */
                ei = PROC_I(inode);
                spin_lock(&ei->pid->lock);
                hlist_add_head_rcu(&ei->sibling_inodes, &ei->pid->inodes);
                spin_unlock(&ei->pid->lock);
        }

        return inode;
}

/*
 * getattr for per-task proc inodes.
 *
 * Fills @stat from the inode, then reports root as owner unless the task
 * still exists, in which case the owner comes from task_dump_owner().
 *
 * Returns 0, or -ENOENT when the task exists but has_pid_permissions()
 * denies HIDEPID_INVISIBLE access to it.
 */
int pid_getattr(struct mnt_idmap *idmap, const struct path *path,
                struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
        struct task_struct *task;
        int err = 0;

        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
        stat->uid = GLOBAL_ROOT_UID;
        stat->gid = GLOBAL_ROOT_GID;

        rcu_read_lock();
        task = pid_task(proc_pid(inode), PIDTYPE_PID);
        if (!task)
                goto unlock;

        if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) {
                /*
                 * This doesn't prevent learning whether PID exists,
                 * it only makes getattr() consistent with readdir().
                 */
                err = -ENOENT;
                goto unlock;
        }

        task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
unlock:
        rcu_read_unlock();
        return err;
}

/* dentry stuff */

/*
 * Refresh the ownership of a <pid>/... inode from @task (it can change
 * due to setuid(), etc.), strip the setuid/setgid mode bits and let the
 * security module relabel the inode.
 */
void pid_update_inode(struct task_struct *task, struct inode *inode)
{
        /* Ownership is computed from the mode as it is before the strip */
        task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);

        inode->i_mode = inode->i_mode & ~(S_ISUID | S_ISGID);

        security_task_to_inode(task, inode);
}

/*
 * Revalidate a per-task dentry.
 *
 * The inode's ownership is rewritten here because the owning task may
 * have performed a setuid(), etc.
 *
 * Returns 1 if the dentry has an inode whose task still exists, 0
 * otherwise.  The whole check runs under rcu_read_lock().
 */
static int pid_revalidate(struct dentry *dentry, unsigned int flags)
{
        struct task_struct *task;
        struct inode *inode;
        int valid = 0;

        rcu_read_lock();
        inode = d_inode_rcu(dentry);
        if (inode) {
                task = pid_task(proc_pid(inode), PIDTYPE_PID);
                if (task) {
                        pid_update_inode(task, inode);
                        valid = 1;
                }
        }
        rcu_read_unlock();

        return valid;
}

static inline bool proc_inode_is_dead(struct inode *inode)
{
        return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
}

/*
 * d_delete callback: is the task we represent dead?
 *
 * A non-zero return means the dentry should not be put on the lru list
 * but killed immediately.
 */
int pid_delete_dentry(const struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        return proc_inode_is_dead(inode);
}

/* Dentry callbacks for per-task proc entries. */
const struct dentry_operations pid_dentry_operations = {
        .d_delete = pid_delete_dentry,
        .d_revalidate = pid_revalidate,
};

/* Lookups */

/*
 * Fill a directory entry.
 *
 * If possible, create the dcache entry and derive our inode number and
 * file type from the dcache entry.
 *
 * Since all of the proc inode numbers are dynamically generated, the inode
 * numbers do not exist until the inode is cached.  This means creating
 * the dcache entry in readdir is necessary to keep the inode numbers
 * reported by readdir in sync with the inode numbers reported
 * by stat.
 *
 * @instantiate is called with (@task, @ptr) to populate a newly allocated
 * child dentry.  If the child cannot be allocated or instantiated, the
 * entry is still emitted, with inode number 1 and type DT_UNKNOWN.
 *
 * Returns the result of dir_emit().
 */
bool proc_fill_cache(struct file *file, struct dir_context *ctx,
        const char *name, unsigned int len,
        instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
        struct dentry *child, *dir = file->f_path.dentry;
        struct qstr qname = QSTR_INIT(name, len);
        struct inode *inode;
        /* Fallback values used when no dentry/inode can be obtained */
        unsigned type = DT_UNKNOWN;
        ino_t ino = 1;

        child = d_hash_and_lookup(dir, &qname);
        if (!child) {
                /* Not found by d_hash_and_lookup(): allocate a dentry */
                DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
                child = d_alloc_parallel(dir, &qname, &wq);
                if (IS_ERR(child))
                        goto end_instantiate;
                /* Only instantiate while the dentry is still in-lookup */
                if (d_in_lookup(child)) {
                        struct dentry *res;
                        res = instantiate(child, task, ptr);
                        d_lookup_done(child);
                        if (unlikely(res)) {
                                /*
                                 * instantiate() handed back a different
                                 * dentry or an ERR_PTR: drop ours and
                                 * continue with what it returned.
                                 */
                                dput(child);
                                child = res;
                                if (IS_ERR(child))
                                        goto end_instantiate;
                        }
                }
        }
        inode = d_inode(child);
        ino = inode->i_ino;
        /* The mode bits above bit 11 are reported as the entry type */
        type = inode->i_mode >> 12;
        dput(child);
end_instantiate:
        return dir_emit(ctx, name, len, ino, type);
}

/*
 * dname_to_vma_addr - parse a dentry name of the form "<start>-<end>"
 * (two base-16 numbers) into the vma start and end addresses.
 *
 * A number may begin with '0' only when it is exactly "0".  Both values
 * must fit in an unsigned long and nothing may follow <end>.
 *
 * Returns 0 and stores the addresses in @start/@end on success,
 * -EINVAL otherwise (in which case @start/@end are not written).
 */
static int dname_to_vma_addr(struct dentry *dentry,
                             unsigned long *start, unsigned long *end)
{
        const char *cursor = dentry->d_name.name;
        unsigned long long first, last;
        unsigned int consumed;

        /* <start> */
        if (cursor[0] == '0' && cursor[1] != '-')
                return -EINVAL;
        consumed = _parse_integer(cursor, 16, &first);
        if ((consumed & KSTRTOX_OVERFLOW) || first != (unsigned long)first)
                return -EINVAL;
        cursor += consumed;

        /* separator */
        if (*cursor != '-')
                return -EINVAL;
        cursor++;

        /* <end> */
        if (cursor[0] == '0' && cursor[1] != '\0')
                return -EINVAL;
        consumed = _parse_integer(cursor, 16, &last);
        if ((consumed & KSTRTOX_OVERFLOW) || last != (unsigned long)last)
                return -EINVAL;
        cursor += consumed;

        /* no trailing characters allowed */
        if (*cursor != '\0')
                return -EINVAL;

        *start = first;
        *end = last;

        return 0;
}

static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
{
        unsigned long vm_start, vm_end;
        bool exact_vma_exists = false;
        struct mm_struct *mm = NULL;
        struct task_struct *task;
        struct inode *inode;
        int status = 0;

        if (flags & LOOKUP_RCU)
                return -ECHILD;

        inode = d_inode(dentry);
        task = get_proc_task(inode);
        if (!task)
                goto out_notask;

        mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
        if (IS_ERR_OR_NULL(mm))
                goto out;

        if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
                status = mmap_read_lock_killable(mm);
                if (!status) {
                        exact_vma_exists = !!find_exact_vma(mm, vm_start,
                                                            vm_end);
                        mmap_read_unlock(mm);
                }
        }

        mmput(mm);

        if (exact_vma_exists) {
                task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);

                security_task_to_inode(task, inode);
                status = 1;
        }

out:
        put_task_struct(task);

out_notask:
        return status;
}

/* Dentry callbacks installed on map_files symlink dentries. */
static const struct dentry_operations tid_map_files_dentry_operations = {
        .d_delete = pid_delete_dentry,
        .d_revalidate = map_files_d_revalidate,
};

/*
 * Resolve a map_files entry to the path of the file backing the vma whose
 * exact range is encoded in the dentry name.
 *
 * On success a reference is taken on *@path and 0 is returned.  Returns
 * -ENOENT if the task or its mm is gone or no file-backed vma matches,
 * or the error from dname_to_vma_addr() / mmap_read_lock_killable().
 */
static int map_files_get_link(struct dentry *dentry, struct path *path)
{
        unsigned long vm_start, vm_end;
        struct vm_area_struct *vma;
        struct task_struct *task;
        struct mm_struct *mm;
        int rc;

        task = get_proc_task(d_inode(dentry));
        if (!task)
                return -ENOENT;

        /* Only the mm is needed from here on */
        mm = get_task_mm(task);
        put_task_struct(task);
        if (!mm)
                return -ENOENT;

        rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
        if (!rc)
                rc = mmap_read_lock_killable(mm);
        if (rc)
                goto out_mmput;

        vma = find_exact_vma(mm, vm_start, vm_end);
        if (vma && vma->vm_file) {
                *path = *file_user_path(vma->vm_file);
                path_get(path);
        } else {
                rc = -ENOENT;
        }
        mmap_read_unlock(mm);

out_mmput:
        mmput(mm);
        return rc;
}

/* One file-backed mapping recorded while walking a task's vmas. */
struct map_files_info {
        unsigned long start;    /* copied from vma->vm_start */
        unsigned long end;      /* copied from vma->vm_end */
        fmode_t mode;           /* copied from vma->vm_file->f_mode */
};

/*
 * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links,
 * due to concerns about how the symlinks may be used to bypass permissions
 * on ancestor directories in the path to the file in question.
 *
 * Returns ERR_PTR(-EPERM) when the capability check against init_user_ns
 * fails; otherwise defers to proc_pid_get_link().
 */
static const char *
proc_map_files_get_link(struct dentry *dentry,
                        struct inode *inode,
                        struct delayed_call *done)
{
        if (checkpoint_restore_ns_capable(&init_user_ns))
                return proc_pid_get_link(dentry, inode, done);

        return ERR_PTR(-EPERM);
}

/*
 * Inode operations for map_files symlinks: identical to
 * proc_pid_link_inode_operations except for get_link().
 */
static const struct inode_operations proc_map_files_link_inode_operations = {
        .get_link = proc_map_files_get_link,
        .readlink = proc_pid_readlink,
        .setattr = proc_setattr,
};

static struct dentry *
proc_map_files_instantiate(struct dentry *dentry,
                           struct task_struct *task, const void *ptr)
{
        fmode_t mode = (fmode_t)(unsigned long)ptr;
        struct proc_inode *ei;
        struct inode *inode;

        inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
                                    ((mode & FMODE_READ ) ? S_IRUSR : 0) |
                                    ((mode & FMODE_WRITE) ? S_IWUSR : 0));
        if (!inode)
                return ERR_PTR(-ENOENT);

        ei = PROC_I(inode);
        ei->op.proc_get_link = map_files_get_link;

        inode->i_op = &proc_map_files_link_inode_operations;
        inode->i_size = 64;

        d_set_d_op(dentry, &tid_map_files_dentry_operations);
        return d_splice_alias(inode, dentry);
}

/*
 * Lookup in a task's map_files directory.
 *
 * Returns the instantiated dentry when the name parses as a vma range
 * that exactly matches a file-backed vma of the task.  Otherwise returns
 * an ERR_PTR: -EACCES if ptrace_may_access() refuses
 * PTRACE_MODE_READ_FSCREDS, -EINTR if the mmap lock could not be taken,
 * and -ENOENT in every other failure case.
 */
static struct dentry *proc_map_files_lookup(struct inode *dir,
                struct dentry *dentry, unsigned int flags)
{
        struct dentry *result = ERR_PTR(-ENOENT);
        unsigned long vm_start, vm_end;
        struct vm_area_struct *vma;
        struct task_struct *task;
        struct mm_struct *mm;

        task = get_proc_task(dir);
        if (!task)
                return result;

        if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
                result = ERR_PTR(-EACCES);
                goto out_put_task;
        }

        /* From here on, -ENOENT unless stated otherwise */
        if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
                goto out_put_task;

        mm = get_task_mm(task);
        if (!mm)
                goto out_put_task;

        if (mmap_read_lock_killable(mm)) {
                result = ERR_PTR(-EINTR);
                goto out_put_mm;
        }

        vma = find_exact_vma(mm, vm_start, vm_end);
        if (vma && vma->vm_file)
                result = proc_map_files_instantiate(dentry, task,
                                (void *)(unsigned long)vma->vm_file->f_mode);

        mmap_read_unlock(mm);
out_put_mm:
        mmput(mm);
out_put_task:
        put_task_struct(task);
        return result;
}

/* Inode operations of the map_files directory itself. */
static const struct inode_operations proc_map_files_inode_operations = {
        .setattr = proc_setattr,
        .permission = proc_fd_permission,
        .lookup = proc_map_files_lookup,
};

/*
 * readdir for a task's map_files directory: emit "." and "..", then one
 * "<start>-<end>" entry (both in hex) for every file-backed vma of the
 * task.
 *
 * Returns 0 on success (this includes the cases where the dots could not
 * be emitted, the task has no mm, or the output buffer fills up),
 * -ENOENT if the task is gone, -EACCES if ptrace_may_access() refuses
 * PTRACE_MODE_READ_FSCREDS, -ENOMEM if the temporary vma list cannot be
 * extended, or the error from mmap_read_lock_killable().
 */
static int
proc_map_files_readdir(struct file *file, struct dir_context *ctx)
{
        struct vm_area_struct *vma;
        struct task_struct *task;
        struct mm_struct *mm;
        unsigned long nr_files, pos, i;
        /* Temporary list of the mappings collected in pass 1 */
        GENRADIX(struct map_files_info) fa;
        struct map_files_info *p;
        int ret;
        struct vma_iterator vmi;

        genradix_init(&fa);

        ret = -ENOENT;
        task = get_proc_task(file_inode(file));
        if (!task)
                goto out;

        ret = -EACCES;
        if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
                goto out_put_task;

        ret = 0;
        if (!dir_emit_dots(file, ctx))
                goto out_put_task;

        mm = get_task_mm(task);
        if (!mm)
                goto out_put_task;

        ret = mmap_read_lock_killable(mm);
        if (ret) {
                mmput(mm);
                goto out_put_task;
        }

        nr_files = 0;

        /*
         * We need two passes here:
         *
         *  1) Collect vmas of mapped files with mmap_lock taken
         *  2) Release mmap_lock and instantiate entries
         *
         * otherwise lockdep complains, since the filldir()
         * routine might require mmap_lock taken in might_fault().
         */

        /* Positions 0 and 1 are the dot entries; vma entries start at 2 */
        pos = 2;
        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (!vma->vm_file)
                        continue;
                /* Skip entries that lie before the current directory position */
                if (++pos <= ctx->pos)
                        continue;

                p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
                if (!p) {
                        ret = -ENOMEM;
                        mmap_read_unlock(mm);
                        mmput(mm);
                        goto out_put_task;
                }

                p->start = vma->vm_start;
                p->end = vma->vm_end;
                p->mode = vma->vm_file->f_mode;
        }
        mmap_read_unlock(mm);
        mmput(mm);

        /* Pass 2: emit the collected entries without mmap_lock held */
        for (i = 0; i < nr_files; i++) {
                char buf[4 * sizeof(long) + 2];        /* max: %lx-%lx\0 */
                unsigned int len;

                p = genradix_ptr(&fa, i);
                len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
                /* Stop as soon as an entry cannot be emitted */
                if (!proc_fill_cache(file, ctx,
                                      buf, len,
                                      proc_map_files_instantiate,
                                      task,
                                      (void *)(unsigned long)p->mode))
                        break;
                ctx->pos++;
        }

out_put_task:
        put_task_struct(task);
out:
        genradix_free(&fa);
        return ret;
}

/* File operations of the map_files directory. */
static const struct file_operations proc_map_files_operations = {
        .llseek = generic_file_llseek,
        .read = generic_read_dir,
        .iterate_shared = proc_map_files_readdir,
};

#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
/* Per-open iteration state of the timers seq_file. */
struct timers_private {
        struct pid *pid;                /* set at open from proc_pid(inode) */
        struct task_struct *task;       /* referenced from ->start until ->stop */
        struct sighand_struct *sighand; /* non-NULL while the sighand lock is held */
        struct pid_namespace *ns;       /* set at open from proc_pid_ns() */
        unsigned long flags;            /* flags word for lock/unlock_task_sighand() */
};

/*
 * seq_file ->start: take a reference on the task, lock its sighand and
 * position the iterator at *@pos in the task's posix_timers list.
 *
 * Returns ERR_PTR(-ESRCH) if the task is gone or its sighand cannot be
 * locked.  Whatever was acquired is released by timers_stop().
 */
static void *timers_start(struct seq_file *m, loff_t *pos)
{
        struct timers_private *tp = m->private;

        tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
        if (tp->task)
                tp->sighand = lock_task_sighand(tp->task, &tp->flags);

        if (!tp->task || !tp->sighand)
                return ERR_PTR(-ESRCH);

        return seq_list_start(&tp->task->signal->posix_timers, *pos);
}

/* seq_file ->next: advance to the next entry of the task's posix_timers list. */
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct timers_private *tp = m->private;
        struct list_head *head = &tp->task->signal->posix_timers;

        return seq_list_next(v, head, pos);
}

/*
 * seq_file ->stop: release whatever timers_start() acquired, i.e. the
 * sighand lock (if held) and then the task reference (if any), clearing
 * both fields so that a repeated call is harmless.
 */
static void timers_stop(struct seq_file *m, void *v)
{
        struct timers_private *tp = m->private;

        if (tp->sighand) {
                unlock_task_sighand(tp->task, &tp->flags);
                tp->sighand = NULL;
        }

        if (!tp->task)
                return;

        put_task_struct(tp->task);
        tp->task = NULL;
}

/*
 * seq_file ->show: print one posix timer.
 *
 * Emits the timer id, the signal number and value pointer, the
 * notification method with the target pid as seen from the reader's pid
 * namespace, and the clock id.  Always returns 0.
 */
static int show_timer(struct seq_file *m, void *v)
{
        /* Notification names, indexed by the method without SIGEV_THREAD_ID */
        static const char * const nstr[] = {
                [SIGEV_SIGNAL] = "signal",
                [SIGEV_NONE] = "none",
                [SIGEV_THREAD] = "thread",
        };
        struct timers_private *tp = m->private;
        struct k_itimer *timer = list_entry((struct list_head *)v,
                                            struct k_itimer, list);
        int notify = timer->it_sigev_notify;
        const char *target = (notify & SIGEV_THREAD_ID) ? "tid" : "pid";

        seq_printf(m, "ID: %d\n", timer->it_id);
        seq_printf(m, "signal: %d/%px\n",
                   timer->sigq->info.si_signo,
                   timer->sigq->info.si_value.sival_ptr);
        seq_printf(m, "notify: %s/%s.%d\n",
                   nstr[notify & ~SIGEV_THREAD_ID],
                   target,
                   pid_nr_ns(timer->it_pid, tp->ns));
        seq_printf(m, "ClockID: %d\n", timer->it_clock);

        return 0;
}

/* seq_file iterator over a task's posix timers. */
static const struct seq_operations proc_timers_seq_ops = {
        .show = show_timer,
        .start = timers_start,
        .next = timers_next,
        .stop = timers_stop,
};

/*
 * Open the timers file: allocate the seq_file private state and record
 * the inode's pid and the pid namespace of the proc superblock.
 *
 * Returns 0, or -ENOMEM if the private state cannot be allocated.
 */
static int proc_timers_open(struct inode *inode, struct file *file)
{
        struct timers_private *tp = __seq_open_private(file,
                                                       &proc_timers_seq_ops,
                                                       sizeof(*tp));

        if (!tp)
                return -ENOMEM;

        tp->ns = proc_pid_ns(inode->i_sb);
        tp->pid = proc_pid(inode);

        return 0;
}

/* File operations of the timers file. */
static const struct file_operations proc_timers_operations = {
        .open = proc_timers_open,
        .release = seq_release_private,
        .read = seq_read,
        .llseek = seq_lseek,
};
#endif

/*
 * Write handler for timerslack_ns: parse a decimal u64 and store it as
 * the task's timer slack; writing 0 restores the task's default slack.
 *
 * Writing to another task requires CAP_SYS_NICE in that task's user
 * namespace and approval from security_task_setscheduler().
 *
 * Returns @count on success, the parse error, -ESRCH if the task is
 * gone, -EPERM on a failed capability check, or the security hook's
 * error.
 */
static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
                                        size_t count, loff_t *offset)
{
        struct inode *inode = file_inode(file);
        struct task_struct *p;
        ssize_t ret = count;
        u64 slack_ns;
        int err;

        err = kstrtoull_from_user(buf, count, 10, &slack_ns);
        if (err < 0)
                return err;

        p = get_proc_task(inode);
        if (!p)
                return -ESRCH;

        if (p != current) {
                int capable;

                rcu_read_lock();
                capable = ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE);
                rcu_read_unlock();
                if (!capable) {
                        ret = -EPERM;
                        goto out;
                }

                err = security_task_setscheduler(p);
                if (err) {
                        ret = err;
                        goto out;
                }
        }

        task_lock(p);
        p->timer_slack_ns = slack_ns ? slack_ns : p->default_timer_slack_ns;
        task_unlock(p);

out:
        put_task_struct(p);

        return ret;
}

/*
 * Show handler for timerslack_ns: print the task's timer slack followed
 * by a newline.
 *
 * Reading another task's value requires CAP_SYS_NICE in that task's user
 * namespace and approval from security_task_getscheduler().
 *
 * Returns 0, -ESRCH if the task is gone, -EPERM on a failed capability
 * check, or the security hook's error.
 */
static int timerslack_ns_show(struct seq_file *m, void *v)
{
        struct inode *inode = m->private;
        struct task_struct *p = get_proc_task(inode);
        int err = 0;

        if (!p)
                return -ESRCH;

        if (p != current) {
                int capable;

                rcu_read_lock();
                capable = ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE);
                rcu_read_unlock();
                if (!capable) {
                        err = -EPERM;
                        goto out;
                }

                err = security_task_getscheduler(p);
                if (err)
                        goto out;
        }

        task_lock(p);
        seq_printf(m, "%llu\n", p->timer_slack_ns);
        task_unlock(p);

out:
        put_task_struct(p);

        return err;
}

/* Open timerslack_ns as a single-record seq_file whose private data is the inode. */
static int timerslack_ns_open(struct inode *inode, struct file *filp)
{
        void *data = inode;

        return single_open(filp, timerslack_ns_show, data);
}

/* File operations of the timerslack_ns file. */
static const struct file_operations proc_pid_set_timerslack_ns_operations = {
        .open = timerslack_ns_open,
        .release = single_release,
        .read = seq_read,
        .write = timerslack_ns_write,
        .llseek = seq_lseek,
};

static struct dentry *proc_pident_instantiate(struct dentry *dentry,
        struct task_struct *task, const void *ptr)
{
        const struct pid_entry *p = ptr;
        struct inode *inode;
        struct proc_inode *ei;

        inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
        if (!inode)
                return ERR_PTR(-ENOENT);

        ei = PROC_I(inode);
        if (S_ISDIR(inode->i_mode))
                set_nlink(inode, 2);        /* Use getattr to fix if necessary */
        if (p->iop)
                inode->i_op = p->iop;
        if (p->fop)
                inode->i_fop = p->fop;
        ei->op = p->op;
        pid_update_inode(task, inode);
        d_set_d_op(dentry, &pid_dentry_operations);
        return d_splice_alias(inode, dentry);
}

/*
 * Look @dentry's name up in the pid_entry table [@p, @end) and
 * instantiate the first entry whose name matches exactly.
 *
 * Returns the result of proc_pident_instantiate(), or ERR_PTR(-ENOENT)
 * if the task is gone or no entry matches.
 */
static struct dentry *proc_pident_lookup(struct inode *dir,
                                         struct dentry *dentry,
                                         const struct pid_entry *p,
                                         const struct pid_entry *end)
{
        struct task_struct *task = get_proc_task(dir);
        struct dentry *res = ERR_PTR(-ENOENT);
        const struct pid_entry *ent;

        if (!task)
                return res;

        /*
         * Yes, it does not scale. And it should not. Don't add
         * new entries into /proc/<tgid>/ without very good reasons.
         */
        for (ent = p; ent < end; ent++) {
                if (ent->len == dentry->d_name.len &&
                    !memcmp(dentry->d_name.name, ent->name, ent->len)) {
                        res = proc_pident_instantiate(dentry, task, ent);
                        break;
                }
        }

        put_task_struct(task);
        return res;
}

/*
 * readdir over a pid_entry table: emit "." and "..", then the table
 * entries starting at index ctx->pos - 2, until the table is exhausted
 * or an entry cannot be emitted.
 *
 * Returns 0, or -ENOENT if the task is gone.
 */
static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
                const struct pid_entry *ents, unsigned int nents)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        const struct pid_entry *last = ents + nents;
        const struct pid_entry *p;

        if (!task)
                return -ENOENT;

        if (dir_emit_dots(file, ctx) && ctx->pos < nents + 2) {
                for (p = ents + (ctx->pos - 2); p < last; p++) {
                        if (!proc_fill_cache(file, ctx, p->name, p->len,
                                        proc_pident_instantiate, task, p))
                                break;
                        ctx->pos++;
                }
        }

        put_task_struct(task);
        return 0;
}

#ifdef CONFIG_SECURITY
/*
 * Open an attr file.
 *
 * private_data is cleared and __mem_open() is then called with
 * PTRACE_MODE_READ_FSCREDS; its return value is not propagated, so the
 * open itself always succeeds.
 * NOTE(review): presumably __mem_open() stores the opener's mm in
 * private_data on success, which proc_pid_attr_write() later compares
 * against current->mm - confirm against __mem_open().
 */
static int proc_pid_attr_open(struct inode *inode, struct file *file)
{
        file->private_data = NULL;
        (void)__mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);

        return 0;
}

/*
 * Read a security attribute of the task.
 *
 * The attribute named after the file is fetched through
 * security_getprocattr() and, if its length is positive, copied to
 * userspace honouring @ppos and @count.  The buffer handed back by the
 * hook is freed in every case.
 *
 * Returns the number of bytes copied, the hook's non-positive result,
 * or -ESRCH if the task is gone.
 */
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
                                  size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(file);
        struct task_struct *task = get_proc_task(inode);
        char *value = NULL;
        ssize_t length;

        if (!task)
                return -ESRCH;

        length = security_getprocattr(task, PROC_I(inode)->op.lsmid,
                                      file->f_path.dentry->d_name.name,
                                      &value);
        put_task_struct(task);

        if (length > 0)
                length = simple_read_from_buffer(buf, count, ppos,
                                                 value, length);
        kfree(value);

        return length;
}

/*
 * Write a security attribute of the current task.
 *
 * Preconditions, checked in this order:
 *  - the writer's mm matches the one recorded in private_data (-EPERM),
 *  - the target task still exists (-ESRCH),
 *  - the target is the writer itself (-EACCES),
 *  - the writer is not running with overridden credentials (-EBUSY),
 *  - the write starts at offset 0 (-EINVAL).
 *
 * At most PAGE_SIZE bytes are taken from @buf.  The attribute is set
 * through security_setprocattr() with cred_guard_mutex held.
 *
 * Returns the hook's result, or a negative errno from the checks, the
 * user copy or the interrupted mutex acquisition.
 */
static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
                                   size_t count, loff_t *ppos)
{
        struct inode *inode = file_inode(file);
        struct task_struct *task;
        void *page;
        int rv;

        /* A task may only write when it was the opener. */
        if (file->private_data != current->mm)
                return -EPERM;

        rcu_read_lock();
        task = pid_task(proc_pid(inode), PIDTYPE_PID);
        if (!task)
                rv = -ESRCH;
        else if (current != task)
                /* A task may only write its own attributes. */
                rv = -EACCES;
        else if (current_cred() != current_real_cred())
                /* Prevent changes to overridden credentials. */
                rv = -EBUSY;
        else
                rv = 0;
        rcu_read_unlock();
        if (rv)
                return rv;

        if (count > PAGE_SIZE)
                count = PAGE_SIZE;

        /* No partial writes. */
        if (*ppos != 0)
                return -EINVAL;

        page = memdup_user(buf, count);
        if (IS_ERR(page)) {
                rv = PTR_ERR(page);
                return rv;
        }

        /* Guard against adverse ptrace interaction */
        rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
        if (rv >= 0) {
                rv = security_setprocattr(PROC_I(inode)->op.lsmid,
                                          file->f_path.dentry->d_name.name,
                                          page, count);
                mutex_unlock(&current->signal->cred_guard_mutex);
        }

        kfree(page);
        return rv;
}

/* File operations of the attr files. */
static const struct file_operations proc_pid_attr_operations = {
        .open = proc_pid_attr_open,
        .release = mem_release,
        .read = proc_pid_attr_read,
        .write = proc_pid_attr_write,
        .llseek = generic_file_llseek,
};

/*
 * LSM_DIR_OPS(LSM) - generate the directory plumbing for one LSM's attr
 * subdirectory.
 *
 * Expands to four definitions, all driven by the LSM##_attr_dir_stuff
 * pid_entry table, which must already be defined:
 *   proc_<LSM>_attr_dir_iterate()    - readdir via proc_pident_readdir()
 *   proc_<LSM>_attr_dir_ops          - file_operations using it
 *   proc_<LSM>_attr_dir_lookup()     - lookup via proc_pident_lookup()
 *   proc_<LSM>_attr_dir_inode_ops    - inode_operations using it
 *
 * The expansion ends without a semicolon; the invocation supplies it.
 */
#define LSM_DIR_OPS(LSM) \
static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
                             struct dir_context *ctx) \
{ \
        return proc_pident_readdir(filp, ctx, \
                                   LSM##_attr_dir_stuff, \
                                   ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct file_operations proc_##LSM##_attr_dir_ops = { \
        .read                = generic_read_dir, \
        .iterate_shared        = proc_##LSM##_attr_dir_iterate, \
        .llseek                = default_llseek, \
}; \
\
static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
                                struct dentry *dentry, unsigned int flags) \
{ \
        return proc_pident_lookup(dir, dentry, \
                                  LSM##_attr_dir_stuff, \
                                  LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
        .lookup                = proc_##LSM##_attr_dir_lookup, \
        .getattr        = pid_getattr, \
        .setattr        = proc_setattr, \
}

#ifdef CONFIG_SECURITY_SMACK
/*
 * Entries of the "smack" attr subdirectory.  LSM_DIR_OPS(smack) generates
 * the readdir/lookup handlers and operation tables that serve this table.
 */
static const struct pid_entry smack_attr_dir_stuff[] = {
        ATTR(LSM_ID_SMACK, "current",        0666),
};
LSM_DIR_OPS(smack);
#endif

#ifdef CONFIG_SECURITY_APPARMOR
/*
 * Entries of the "apparmor" attr subdirectory.  LSM_DIR_OPS(apparmor)
 * generates the readdir/lookup handlers and operation tables that serve
 * this table.
 */
static const struct pid_entry apparmor_attr_dir_stuff[] = {
        ATTR(LSM_ID_APPARMOR, "current",        0666),
        ATTR(LSM_ID_APPARMOR, "prev",                0444),
        ATTR(LSM_ID_APPARMOR, "exec",                0666),
};
LSM_DIR_OPS(apparmor);
#endif

/*
 * Entries of the attr directory, served by proc_attr_dir_readdir() and
 * proc_attr_dir_lookup().  The per-LSM subdirectories are present only
 * when the corresponding LSM is configured.
 */
static const struct pid_entry attr_dir_stuff[] = {
        ATTR(LSM_ID_UNDEF, "current",        0666),
        ATTR(LSM_ID_UNDEF, "prev",                0444),
        ATTR(LSM_ID_UNDEF, "exec",                0666),
        ATTR(LSM_ID_UNDEF, "fscreate",        0666),
        ATTR(LSM_ID_UNDEF, "keycreate",        0666),
        ATTR(LSM_ID_UNDEF, "sockcreate",        0666),
#ifdef CONFIG_SECURITY_SMACK
        /* Subdirectory backed by the operations LSM_DIR_OPS(smack) generates */
        DIR("smack",                        0555,
            proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
#endif
#ifdef CONFIG_SECURITY_APPARMOR
        /* Subdirectory backed by the operations LSM_DIR_OPS(apparmor) generates */
        DIR("apparmor",                        0555,
            proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops),
#endif
};

/* readdir for the attr directory: iterate over the attr_dir_stuff table. */
static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
{
        unsigned int nents = ARRAY_SIZE(attr_dir_stuff);

        return proc_pident_readdir(file, ctx, attr_dir_stuff, nents);
}

/* File operations of the attr directory. */
static const struct file_operations proc_attr_dir_operations = {
        .llseek = generic_file_llseek,
        .read = generic_read_dir,
        .iterate_shared = proc_attr_dir_readdir,
};

/* Lookup in the attr directory: search the attr_dir_stuff table. */
static struct dentry *proc_attr_dir_lookup(struct inode *dir,
                                struct dentry *dentry, unsigned int flags)
{
        const struct pid_entry *first = attr_dir_stuff;
        const struct pid_entry *end = attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff);

        return proc_pident_lookup(dir, dentry, first, end);
}

/* Inode operations of the attr directory. */
static const struct inode_operations proc_attr_dir_inode_operations = {
        .setattr = proc_setattr,
        .getattr = pid_getattr,
        .lookup = proc_attr_dir_lookup,
};

#endif

#ifdef CONFIG_ELF_CORE
/*
 * Read the task's coredump filter.
 *
 * The dump-filter bits of mm->flags are shifted down and formatted as
 * "%08lx\n", then copied to userspace honouring @ppos and @count.
 *
 * Returns the result of simple_read_from_buffer(), 0 if the task has no
 * mm, or -ESRCH if the task is gone.
 */
static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
                                         size_t count, loff_t *ppos)
{
        struct task_struct *task = get_proc_task(file_inode(file));
        struct mm_struct *mm;
        char buffer[PROC_NUMBUF];
        size_t len;
        int ret = 0;

        if (!task)
                return -ESRCH;

        mm = get_task_mm(task);
        if (!mm)
                goto put_task;

        len = snprintf(buffer, sizeof(buffer), "%08lx\n",
                       ((mm->flags & MMF_DUMP_FILTER_MASK) >>
                        MMF_DUMP_FILTER_SHIFT));
        mmput(mm);
        ret = simple_read_from_buffer(buf, count, ppos, buffer, len);

put_task:
        put_task_struct(task);

        return ret;
}

/*
 * Write handler for the coredump_filter file.
 *
 * Parses an unsigned integer from user space and mirrors its low
 * MMF_DUMP_FILTER_BITS bits into mm->flags, starting at bit
 * MMF_DUMP_FILTER_SHIFT.
 *
 * Returns @count on success, the kstrtouint_from_user() error on a parse
 * failure, or -ESRCH when the task or its mm cannot be obtained.
 */
static ssize_t proc_coredump_filter_write(struct file *file,
                                          const char __user *buf,
                                          size_t count,
                                          loff_t *ppos)
{
        struct task_struct *task;
        struct mm_struct *mm;
        unsigned int val;
        int bit;
        int ret;

        ret = kstrtouint_from_user(buf, count, 0, &val);
        if (ret < 0)
                return ret;

        task = get_proc_task(file_inode(file));
        if (!task)
                return -ESRCH;

        mm = get_task_mm(task);
        if (!mm) {
                put_task_struct(task);
                return -ESRCH;
        }

        /* Set or clear each filter bit according to the parsed value. */
        for (bit = 0; bit < MMF_DUMP_FILTER_BITS; bit++) {
                if (val & (1UL << bit))
                        set_bit(bit + MMF_DUMP_FILTER_SHIFT, &mm->flags);
                else
                        clear_bit(bit + MMF_DUMP_FILTER_SHIFT, &mm->flags);
        }

        mmput(mm);
        put_task_struct(task);
        return count;
}

/* File operations backing the coredump_filter entry. */
static const struct file_operations proc_coredump_filter_operations = {
        .llseek = generic_file_llseek,
        .read   = proc_coredump_filter_read,
        .write  = proc_coredump_filter_write,
};
#endif

#ifdef CONFIG_TASK_IO_ACCOUNTING
/*
 * Print the I/O accounting counters of @task into @m.
 *
 * When @whole is non-zero the reported values are the signal_struct's
 * accumulated ioac plus the ioac of every thread in the group; otherwise
 * only @task's own ioac is reported.
 *
 * Returns 0 on success, the error from down_read_killable() if
 * exec_update_lock cannot be taken, or -EACCES if ptrace_may_access()
 * denies PTRACE_MODE_READ_FSCREDS access.
 */
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
        struct task_io_accounting acct;
        int result;

        /* Held for read across the permission check and the counter snapshot. */
        result = down_read_killable(&task->signal->exec_update_lock);
        if (result)
                return result;

        if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
                result = -EACCES;
                goto out_unlock;
        }

        if (whole) {
                struct signal_struct *sig = task->signal;
                struct task_struct *t;
                unsigned int seq = 1;
                unsigned long flags;

                rcu_read_lock();
                /*
                 * Sum the group totals against stats_lock; the loop repeats
                 * while need_seqretry() reports that a retry is needed.
                 */
                do {
                        seq++; /* 2 on the 1st/lockless path, otherwise odd */
                        flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);

                        /* Start from the signal-wide totals, then add each thread. */
                        acct = sig->ioac;
                        __for_each_thread(sig, t)
                                task_io_accounting_add(&acct, &t->ioac);

                } while (need_seqretry(&sig->stats_lock, seq));
                done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
                rcu_read_unlock();
        } else {
                acct = task->ioac;
        }

        seq_printf(m,
                   "rchar: %llu\n"
                   "wchar: %llu\n"
                   "syscr: %llu\n"
                   "syscw: %llu\n"
                   "read_bytes: %llu\n"
                   "write_bytes: %llu\n"
                   "cancelled_write_bytes: %llu\n",
                   (unsigned long long)acct.rchar,
                   (unsigned long long)acct.wchar,
                   (unsigned long long)acct.syscr,
                   (unsigned long long)acct.syscw,
                   (unsigned long long)acct.read_bytes,
                   (unsigned long long)acct.write_bytes,
                   (unsigned long long)acct.cancelled_write_bytes);
        result = 0;

out_unlock:
        up_read(&task->signal->exec_update_lock);
        return result;
}

/* "io" show routine for a single thread: reports only @task's own counters. */
static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
                                  struct pid *pid, struct task_struct *task)
{
        const int whole_group = 0;

        return do_io_accounting(task, m, whole_group);
}

/* "io" show routine for a thread group: reports counters summed over the group. */
static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
                                   struct pid *pid, struct task_struct *task)
{
        const int whole_group = 1;

        return do_io_accounting(task, m, whole_group);
}
#endif /* CONFIG_TASK_IO_ACCOUNTING */

#ifdef CONFIG_USER_NS
/*
 * Common open routine for the uid/gid/projid map files.
 *
 * Takes a reference on the user namespace of the task behind @inode, opens
 * a seq_file with @seq_ops and stores the namespace in seq->private.  The
 * reference is dropped again if seq_open() fails.
 *
 * Returns 0 on success, -EINVAL when no task or namespace is available, or
 * the seq_open() error.
 */
static int proc_id_map_open(struct inode *inode, struct file *file,
        const struct seq_operations *seq_ops)
{
        struct user_namespace *ns;
        struct task_struct *task;
        struct seq_file *seq;
        int ret;

        task = get_proc_task(inode);
        if (!task)
                return -EINVAL;

        rcu_read_lock();
        ns = get_user_ns(task_cred_xxx(task, user_ns));
        rcu_read_unlock();
        put_task_struct(task);
        if (!ns)
                return -EINVAL;

        ret = seq_open(file, seq_ops);
        if (ret) {
                put_user_ns(ns);
                return ret;
        }

        seq = file->private_data;
        seq->private = ns;
        return 0;
}

/* Release for the id map files: drops the namespace reference stored in seq->private. */
static int proc_id_map_release(struct inode *inode, struct file *file)
{
        struct seq_file *seq = file->private_data;

        put_user_ns(seq->private);
        return seq_release(inode, file);
}

/* Open uid_map using the uid seq_operations. */
static int proc_uid_map_open(struct inode *inode, struct file *file)
{
        const struct seq_operations *ops = &proc_uid_seq_operations;

        return proc_id_map_open(inode, file, ops);
}

/* Open gid_map using the gid seq_operations. */
static int proc_gid_map_open(struct inode *inode, struct file *file)
{
        const struct seq_operations *ops = &proc_gid_seq_operations;

        return proc_id_map_open(inode, file, ops);
}

/* Open projid_map using the projid seq_operations. */
static int proc_projid_map_open(struct inode *inode, struct file *file)
{
        const struct seq_operations *ops = &proc_projid_seq_operations;

        return proc_id_map_open(inode, file, ops);
}

/* File operations backing the uid_map entry. */
static const struct file_operations proc_uid_map_operations = {
        .open    = proc_uid_map_open,
        .release = proc_id_map_release,
        .read    = seq_read,
        .write   = proc_uid_map_write,
        .llseek  = seq_lseek,
};

/* File operations backing the gid_map entry. */
static const struct file_operations proc_gid_map_operations = {
        .open    = proc_gid_map_open,
        .release = proc_id_map_release,
        .read    = seq_read,
        .write   = proc_gid_map_write,
        .llseek  = seq_lseek,
};

/* File operations backing the projid_map entry. */
static const struct file_operations proc_projid_map_operations = {
        .open    = proc_projid_map_open,
        .release = proc_id_map_release,
        .read    = seq_read,
        .write   = proc_projid_map_write,
        .llseek  = seq_lseek,
};

/*
 * Open routine for the setgroups file.
 *
 * Takes a reference on the user namespace of the task behind @inode and
 * hands it to single_open() as the seq_file private data.  Opening for
 * write additionally requires CAP_SYS_ADMIN in that namespace.
 *
 * Returns 0 on success, -ESRCH when no task or namespace is available,
 * -EACCES when the capability check fails, or the single_open() error.
 * The namespace reference is dropped on every failure path.
 */
static int proc_setgroups_open(struct inode *inode, struct file *file)
{
        struct user_namespace *ns;
        struct task_struct *task;
        int ret;

        task = get_proc_task(inode);
        if (!task)
                return -ESRCH;

        rcu_read_lock();
        ns = get_user_ns(task_cred_xxx(task, user_ns));
        rcu_read_unlock();
        put_task_struct(task);
        if (!ns)
                return -ESRCH;

        if ((file->f_mode & FMODE_WRITE) && !ns_capable(ns, CAP_SYS_ADMIN)) {
                ret = -EACCES;
                goto err_put_ns;
        }

        ret = single_open(file, &proc_setgroups_show, ns);
        if (!ret)
                return 0;

err_put_ns:
        put_user_ns(ns);
        return ret;
}

/* Release for the setgroups file: closes the seq_file, then drops the namespace reference. */
static int proc_setgroups_release(struct inode *inode, struct file *file)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *ns = seq->private;
        int ret;

        /*
         * ns is fetched before single_release() is called; presumably seq
         * must not be touched afterwards -- confirm against single_release().
         */
        ret = single_release(inode, file);
        put_user_ns(ns);

        return ret;
}

/* File operations backing the setgroups entry. */
static const struct file_operations proc_setgroups_operations = {
        .open    = proc_setgroups_open,
        .release = proc_setgroups_release,
        .read    = seq_read,
        .write   = proc_setgroups_write,
        .llseek  = seq_lseek,
};
#endif /* CONFIG_USER_NS */

/*
 * Show routine for "personality": prints task->personality as eight hex
 * digits while holding the lock taken by lock_trace().  Returns the
 * lock_trace() error if that fails, 0 otherwise.
 */
static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        int err = lock_trace(task);

        if (err)
                return err;

        seq_printf(m, "%08x\n", task->personality);
        unlock_trace(task);
        return 0;
}

#ifdef CONFIG_LIVEPATCH
/* Show routine for "patch_state": prints task->patch_state as a decimal integer. */
static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        const int state = task->patch_state;

        seq_printf(m, "%d\n", state);
        return 0;
}
#endif /* CONFIG_LIVEPATCH */

#ifdef CONFIG_KSM
/*
 * Show routine for "ksm_merging_pages": prints mm->ksm_merging_pages.
 * Prints nothing when the task has no mm.  Always returns 0.
 */
static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);

        if (!mm)
                return 0;

        seq_printf(m, "%lu\n", mm->ksm_merging_pages);
        mmput(mm);
        return 0;
}
/*
 * Show routine for "ksm_stat": prints the per-mm KSM counters, one
 * "name value" pair per line.  Prints nothing when the task has no mm.
 * Always returns 0.
 */
static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);

        if (!mm)
                return 0;

        seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
        seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm));
        seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
        seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
        mmput(mm);
        return 0;
}
#endif /* CONFIG_KSM */

#ifdef CONFIG_STACKLEAK_METRICS
/*
 * Show routine for "stack_depth".
 *
 * Each depth is THREAD_SIZE minus the offset of the recorded lowest stack
 * address within its THREAD_SIZE-sized region; both the previous and the
 * current value are printed.  Always returns 0.
 */
static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
                                struct pid *pid, struct task_struct *task)
{
        const unsigned long offset_mask = THREAD_SIZE - 1;
        unsigned long prev_depth, depth;

        prev_depth = THREAD_SIZE - (task->prev_lowest_stack & offset_mask);
        depth = THREAD_SIZE - (task->lowest_stack & offset_mask);

        seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
                   prev_depth, depth);
        return 0;
}
#endif /* CONFIG_STACKLEAK_METRICS */

/*
 * Thread groups
 */
/* Forward declarations: both are referenced by the "task" entry of tgid_base_stuff below. */
static const struct file_operations proc_task_operations;
static const struct inode_operations proc_task_inode_operations;

/*
 * Entry table for a thread-group directory.  It is consumed by
 * proc_tgid_base_readdir() and proc_tgid_base_lookup().  Entries wrapped in
 * #ifdef are only present when the corresponding config option is enabled.
 */
static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
        DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
        DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
        DIR("fdinfo",     S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
        DIR("ns",          S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
        DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
        REG("environ",    S_IRUSR, proc_environ_operations),
        REG("auxv",       S_IRUSR, proc_auxv_operations),
        ONE("status",     S_IRUGO, proc_pid_status),
        ONE("personality", S_IRUSR, proc_pid_personality),
        ONE("limits",          S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
        REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
        REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
#ifdef CONFIG_TIME_NS
        REG("timens_offsets",  S_IRUGO|S_IWUSR, proc_timens_offsets_operations),
#endif
        REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        ONE("syscall",    S_IRUSR, proc_pid_syscall),
#endif
        REG("cmdline",    S_IRUGO, proc_pid_cmdline_ops),
        ONE("stat",       S_IRUGO, proc_tgid_stat),
        ONE("statm",      S_IRUGO, proc_pid_statm),
        REG("maps",       S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
        REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
#endif
        REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
        LNK("cwd",        proc_cwd_link),
        LNK("root",       proc_root_link),
        LNK("exe",        proc_exe_link),
        REG("mounts",     S_IRUGO, proc_mounts_operations),
        REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
        REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
        REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
        REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
        REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
        REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
        DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
        ONE("wchan",      S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
        ONE("stack",      S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
        ONE("schedstat",  S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
        REG("latency",  S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
        ONE("cpuset",     S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
        ONE("cgroup",  S_IRUGO, proc_cgroup_show),
#endif
#ifdef CONFIG_PROC_CPU_RESCTRL
        ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
#endif
        ONE("oom_score",  S_IRUGO, proc_oom_score),
        REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDIT
        REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
        REG("sessionid",  S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
        REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
        REG("fail-nth", 0644, proc_fail_nth_operations),
#endif
#ifdef CONFIG_ELF_CORE
        REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
        /* Thread-group variant: counters summed over the whole group. */
        ONE("io",        S_IRUSR, proc_tgid_io_accounting),
#endif
#ifdef CONFIG_USER_NS
        REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
        REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
        REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
        REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
        REG("timers",          S_IRUGO, proc_timers_operations),
#endif
        /* NOTE(review): the only entry here writable by group/other (S_IWUGO); confirm the write handler does its own access check. */
        REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
#ifdef CONFIG_LIVEPATCH
        ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
#endif
#ifdef CONFIG_STACKLEAK_METRICS
        ONE("stack_depth", S_IRUGO, proc_stack_depth),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
        ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
        ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
#ifdef CONFIG_KSM
        ONE("ksm_merging_pages",  S_IRUSR, proc_pid_ksm_merging_pages),
        ONE("ksm_stat",  S_IRUSR, proc_pid_ksm_stat),
#endif
};

/* readdir for a thread-group directory: emits the entries listed in tgid_base_stuff. */
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
{
        const unsigned int nr_entries = ARRAY_SIZE(tgid_base_stuff);

        return proc_pident_readdir(file, ctx, tgid_base_stuff, nr_entries);
}

/*
 * File operations of a thread-group directory.  tgid_pidfd_to_pid() compares
 * a file's f_op against the address of this table.
 */
static const struct file_operations proc_tgid_base_operations = {
        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .iterate_shared = proc_tgid_base_readdir,
};

/*
 * Map an open thread-group directory file to its struct pid.
 *
 * A file counts as a thread-group directory when its f_op is
 * proc_tgid_base_operations; any other file yields ERR_PTR(-EBADF).
 */
struct pid *tgid_pidfd_to_pid(const struct file *file)
{
        if (file->f_op == &proc_tgid_base_operations)
                return proc_pid(file_inode(file));

        return ERR_PTR(-EBADF);
}

/* Lookup in a thread-group directory: resolves @dentry against the tgid_base_stuff table. */
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        const struct pid_entry *first = tgid_base_stuff;
        const struct pid_entry *end = first + ARRAY_SIZE(tgid_base_stuff);

        return proc_pident_lookup(dir, dentry, first, end);
}

/* Inode operations of a thread-group directory. */
static const struct inode_operations proc_tgid_base_inode_operations = {
        .getattr    = pid_getattr,
        .setattr    = proc_setattr,
        .permission = proc_pid_permission,
        .lookup     = proc_tgid_base_lookup,
};

/**
 * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
 * @pid: pid that should be flushed.
 *
 * This function walks the list of inodes (belonging to any proc
 * filesystem) that are attached to the pid and flushes them from
 * the dentry cache.
 *
 * It is safe and reasonable to cache /proc entries for a task until
 * that task exits.  After that they just clog up the dcache with
 * useless entries, possibly causing useful dcache entries to be
 * flushed instead.  This routine is provided to flush those useless
 * dcache entries when a process is reaped.
 *
 * NOTE: This routine is just an optimization, so it does not guarantee
 *       that no dcache entries will exist after a process is reaped;
 *       it just makes it very unlikely that any will persist.
 */

void proc_flush_pid(struct pid *pid)
{
        /* The inode list lives in pid->inodes; pid->lock is passed along with it. */
        proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
}

/*
 * Create the directory inode for a thread group and attach it to @dentry.
 *
 * Returns the result of d_splice_alias(), or ERR_PTR(-ENOENT) when no inode
 * could be made.  @ptr is unused.
 */
static struct dentry *proc_pid_instantiate(struct dentry *dentry,
                                   struct task_struct *task, const void *ptr)
{
        struct inode *inode = proc_pid_make_base_inode(dentry->d_sb, task,
                                                       S_IFDIR | S_IRUGO | S_IXUGO);

        if (!inode)
                return ERR_PTR(-ENOENT);

        /* Wire up the thread-group directory operations. */
        inode->i_fop = &proc_tgid_base_operations;
        inode->i_op = &proc_tgid_base_inode_operations;
        inode->i_flags |= S_IMMUTABLE;

        set_nlink(inode, nlink_tgid);
        pid_update_inode(task, inode);

        d_set_d_op(dentry, &pid_dentry_operations);
        return d_splice_alias(inode, dentry);
}

/*
 * Resolve a numeric name to a thread-group directory.
 *
 * The name is parsed with name_to_int() and looked up in the pid namespace
 * of the proc instance.  With hide_pid == HIDEPID_NOT_PTRACEABLE the task
 * must additionally pass has_pid_permissions(HIDEPID_NO_ACCESS).
 *
 * Returns the instantiated dentry, or ERR_PTR(-ENOENT) when the name is not
 * a number, no task matches, or access is refused.
 */
struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
{
        struct proc_fs_info *fs_info;
        struct pid_namespace *ns;
        struct task_struct *task;
        struct dentry *result;
        unsigned int tgid;

        tgid = name_to_int(&dentry->d_name);
        if (tgid == ~0U)
                return ERR_PTR(-ENOENT);

        fs_info = proc_sb_info(dentry->d_sb);
        ns = fs_info->pid_ns;

        /* Pin the task before leaving the RCU read-side section. */
        rcu_read_lock();
        task = find_task_by_pid_ns(tgid, ns);
        if (task)
                get_task_struct(task);
        rcu_read_unlock();
        if (!task)
                return ERR_PTR(-ENOENT);

        /* Limit procfs to only ptraceable tasks */
        if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE &&
            !has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
                result = ERR_PTR(-ENOENT);
        else
                result = proc_pid_instantiate(dentry, task, NULL);

        put_task_struct(task);
        return result;
}

/*
 * Cursor for walking thread groups in pid order: @tgid is the search
 * position, @task the referenced task found at that position (or NULL).
 */
struct tgid_iter {
        unsigned int tgid;
        struct task_struct *task;
};

/*
 * Find the first task with tgid >= iter.tgid in @ns.
 *
 * Any reference held in iter.task is dropped first.  Pids without a
 * PIDTYPE_TGID task are skipped.  On return iter.task is either NULL (no
 * further pid) or a task on which a reference has been taken, and
 * iter.tgid is that task's number in @ns.
 */
static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
{
        struct pid *pid;

        if (iter.task)
                put_task_struct(iter.task);

        rcu_read_lock();
        for (;;) {
                iter.task = NULL;
                pid = find_ge_pid(iter.tgid, ns);
                if (!pid)
                        break;

                iter.tgid = pid_nr_ns(pid, ns);
                iter.task = pid_task(pid, PIDTYPE_TGID);
                if (iter.task) {
                        get_task_struct(iter.task);
                        break;
                }

                /* Not a thread-group leader: try the next number. */
                iter.tgid += 1;
        }
        rcu_read_unlock();

        return iter;
}

/*
 * Directory position of tgid 0: the two positions immediately before it are
 * used for the "self" and "thread-self" entries emitted below.
 */
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)

/*
 * for the /proc/ directory itself, after non-process stuff has been done
 *
 * Emits "self" and "thread-self" at their fixed positions, then one entry
 * per thread group found by next_tgid().  ctx->pos encodes the tgid as
 * tgid + TGID_OFFSET so that a later call resumes at the same task.
 * Always returns 0.
 */
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
        struct tgid_iter iter;
        struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
        struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb);
        loff_t pos = ctx->pos;

        /* Past the last possible tgid position: nothing left to emit. */
        if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
                return 0;

        if (pos == TGID_OFFSET - 2) {
                struct inode *inode = d_inode(fs_info->proc_self);
                if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
                        return 0;
                ctx->pos = pos = pos + 1;
        }
        if (pos == TGID_OFFSET - 1) {
                struct inode *inode = d_inode(fs_info->proc_thread_self);
                if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
                        return 0;
                ctx->pos = pos = pos + 1;
        }
        iter.tgid = pos - TGID_OFFSET;
        iter.task = NULL;
        /*
         * next_tgid() drops the reference on the previous task and takes one
         * on the task it returns, so only the early-return path below has to
         * put the task itself.
         */
        for (iter = next_tgid(ns, iter);
             iter.task;
             iter.tgid += 1, iter = next_tgid(ns, iter)) {
                char name[10 + 1];
                unsigned int len;

                cond_resched();
                /* Skip tasks the caller is not allowed to see. */
                if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
                        continue;

                len = snprintf(name, sizeof(name), "%u", iter.tgid);
                ctx->pos = iter.tgid + TGID_OFFSET;
                if (!proc_fill_cache(file, ctx, name, len,
                                     proc_pid_instantiate, iter.task, NULL)) {
                        put_task_struct(iter.task);
                        return 0;
                }
        }
        /* All thread groups emitted: park the position at the end marker. */
        ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
        return 0;
}

/*
 * proc_tid_comm_permission is the permission hook used only for
 * /proc/<pid>/task/<tid>/comm.
 *
 * When the caller belongs to the same thread group as the target task, the
 * generic permission check is skipped for anything but MAY_EXEC.  The
 * reason is that glibc and bionic use this node for cross-thread naming
 * (pthread_set/getname_np(!self)); once PR_SET_DUMPABLE is set to 0 the
 * node, among others, becomes uid=0 gid=0, which would otherwise lock out
 * that implementation.  Members of the same thread group therefore always
 * keep access.
 *
 * Returns -ESRCH when the task is gone, 0 for the same-thread-group case,
 * otherwise the result of generic_permission().
 */
static int proc_tid_comm_permission(struct mnt_idmap *idmap,
                                    struct inode *inode, int mask)
{
        struct task_struct *task = get_proc_task(inode);
        bool same_group;

        if (!task)
                return -ESRCH;

        same_group = same_thread_group(current, task);
        put_task_struct(task);

        /* Outsiders, and any exec request, take the generic check. */
        if (unlikely(!same_group || (mask & MAY_EXEC)))
                return generic_permission(&nop_mnt_idmap, inode, mask);

        /*
         * This file (/proc/<pid>/task/<tid>/comm) can always be read or
         * written by the members of the corresponding thread group.
         */
        return 0;
}

/* Inode operations of the per-thread "comm" node; see proc_tid_comm_permission(). */
static const struct inode_operations proc_tid_comm_inode_operations = {
        .permission = proc_tid_comm_permission,
        .setattr    = proc_setattr,
};

/*
 * Tasks
 */
/*
 * Entry table for a per-thread directory.  It is consumed by
 * proc_tid_base_readdir() and proc_tid_base_lookup().  Entries wrapped in
 * #ifdef are only present when the corresponding config option is enabled.
 */
static const struct pid_entry tid_base_stuff[] = {
        DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
        DIR("fdinfo",    S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
        DIR("ns",         S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
        DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
        REG("environ",   S_IRUSR, proc_environ_operations),
        REG("auxv",      S_IRUSR, proc_auxv_operations),
        ONE("status",    S_IRUGO, proc_pid_status),
        ONE("personality", S_IRUSR, proc_pid_personality),
        ONE("limits",         S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
        REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
        /* "comm" gets its own inode operations so same-thread-group access is always allowed. */
        NOD("comm",      S_IFREG|S_IRUGO|S_IWUSR,
                         &proc_tid_comm_inode_operations,
                         &proc_pid_set_comm_operations, {}),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        ONE("syscall",   S_IRUSR, proc_pid_syscall),
#endif
        REG("cmdline",   S_IRUGO, proc_pid_cmdline_ops),
        ONE("stat",      S_IRUGO, proc_tid_stat),
        ONE("statm",     S_IRUGO, proc_pid_statm),
        REG("maps",      S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_PROC_CHILDREN
        REG("children",  S_IRUGO, proc_tid_children_operations),
#endif
#ifdef CONFIG_NUMA
        REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
        REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
        LNK("cwd",       proc_cwd_link),
        LNK("root",      proc_root_link),
        LNK("exe",       proc_exe_link),
        REG("mounts",    S_IRUGO, proc_mounts_operations),
        REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
        REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
        REG("smaps",     S_IRUGO, proc_pid_smaps_operations),
        REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
        REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
        DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
        ONE("wchan",     S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
        ONE("stack",      S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
        ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
        REG("latency",  S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
        ONE("cpuset",    S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
        ONE("cgroup",  S_IRUGO, proc_cgroup_show),
#endif
#ifdef CONFIG_PROC_CPU_RESCTRL
        ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
#endif
        ONE("oom_score", S_IRUGO, proc_oom_score),
        REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDIT
        REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
        REG("sessionid",  S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
        REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
        REG("fail-nth", 0644, proc_fail_nth_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
        /* Per-thread variant: only this thread's own counters. */
        ONE("io",        S_IRUSR, proc_tid_io_accounting),
#endif
#ifdef CONFIG_USER_NS
        REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
        REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
        REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
        REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#ifdef CONFIG_LIVEPATCH
        ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
        ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
        ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
#ifdef CONFIG_KSM
        ONE("ksm_merging_pages",  S_IRUSR, proc_pid_ksm_merging_pages),
        ONE("ksm_stat",  S_IRUSR, proc_pid_ksm_stat),
#endif
};

/* readdir for a per-thread directory: emits the entries listed in tid_base_stuff. */
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
{
        const unsigned int nr_entries = ARRAY_SIZE(tid_base_stuff);

        return proc_pident_readdir(file, ctx, tid_base_stuff, nr_entries);
}

/* Lookup in a per-thread directory: resolves @dentry against the tid_base_stuff table. */
static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        const struct pid_entry *first = tid_base_stuff;
        const struct pid_entry *end = first + ARRAY_SIZE(tid_base_stuff);

        return proc_pident_lookup(dir, dentry, first, end);
}

/* File operations of a per-thread directory. */
static const struct file_operations proc_tid_base_operations = {
        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .iterate_shared = proc_tid_base_readdir,
};

/* Inode operations of a per-thread directory. */
static const struct inode_operations proc_tid_base_inode_operations = {
        .getattr = pid_getattr,
        .setattr = proc_setattr,
        .lookup  = proc_tid_base_lookup,
};

/*
 * Create the directory inode for a single thread and attach it to @dentry.
 *
 * Returns the result of d_splice_alias(), or ERR_PTR(-ENOENT) when no inode
 * could be made.  @ptr is unused.
 */
static struct dentry *proc_task_instantiate(struct dentry *dentry,
        struct task_struct *task, const void *ptr)
{
        struct inode *inode = proc_pid_make_base_inode(dentry->d_sb, task,
                                                       S_IFDIR | S_IRUGO | S_IXUGO);

        if (!inode)
                return ERR_PTR(-ENOENT);

        /* Wire up the per-thread directory operations. */
        inode->i_fop = &proc_tid_base_operations;
        inode->i_op = &proc_tid_base_inode_operations;
        inode->i_flags |= S_IMMUTABLE;

        set_nlink(inode, nlink_tid);
        pid_update_inode(task, inode);

        d_set_d_op(dentry, &pid_dentry_operations);
        return d_splice_alias(inode, dentry);
}

/*
 * Resolve a numeric name inside a task/ directory to a per-thread directory.
 *
 * The name must parse as a number, name a task in the pid namespace of the
 * proc instance, and that task must be in the same thread group as the
 * task that owns @dir.  Any failure yields ERR_PTR(-ENOENT).
 */
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
        struct task_struct *leader = get_proc_task(dir);
        struct dentry *result = ERR_PTR(-ENOENT);
        struct proc_fs_info *fs_info;
        struct pid_namespace *ns;
        struct task_struct *task;
        unsigned int tid;

        if (!leader)
                return result;

        tid = name_to_int(&dentry->d_name);
        if (tid == ~0U)
                goto out_put_leader;

        fs_info = proc_sb_info(dentry->d_sb);
        ns = fs_info->pid_ns;

        /* Pin the task before leaving the RCU read-side section. */
        rcu_read_lock();
        task = find_task_by_pid_ns(tid, ns);
        if (task)
                get_task_struct(task);
        rcu_read_unlock();
        if (!task)
                goto out_put_leader;

        if (same_thread_group(leader, task))
                result = proc_task_instantiate(dentry, task, NULL);

        put_task_struct(task);
out_put_leader:
        put_task_struct(leader);
        return result;
}

/*
 * Find the first tid of a thread group to return to user space.
 *
 * Usually this is just the thread group leader, but if the user's
 * buffer was too small or there was a seek into the middle of the
 * directory we have more work to do.
 *
 * In the case of a short read we start with find_task_by_pid_ns().
 *
 * In the case of a seek we start with the leader and walk nr
 * threads past it.
 *
 * Returns the task with a reference taken on it, or NULL if there is
 * nothing to return.
 */
static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
                                        struct pid_namespace *ns)
{
        struct task_struct *pos, *task;
        unsigned long nr = f_pos;

        if (nr != f_pos)        /* 32bit overflow? */
                return NULL;

        rcu_read_lock();
        task = pid_task(pid, PIDTYPE_PID);
        if (!task)
                goto fail;

        /* Attempt to start with the tid of a thread */
        if (tid && nr) {
                pos = find_task_by_pid_ns(tid, ns);
                /* Only usable if that tid is in this thread group. */
                if (pos && same_thread_group(pos, task))
                        goto found;
        }

        /* If nr exceeds the number of threads there is nothing to do */
        if (nr >= get_nr_threads(task))
                goto fail;

        /* If we haven't found our starting place yet start
         * with the leader and walk nr threads forward.
         */
        for_each_thread(task, pos) {
                if (!nr--)
                        goto found;
        }
fail:
        pos = NULL;
        goto out;
found:
        /* Take the reference while still inside the RCU read-side section. */
        get_task_struct(pos);
out:
        rcu_read_unlock();
        return pos;
}

/*
 * Find the next thread in the thread list.
 * Return NULL if there is an error or no next thread.
 *
 * The reference to the input task_struct is released.
 */
static struct task_struct *next_tid(struct task_struct *start)
{
        struct task_struct *next = NULL;

        rcu_read_lock();
        /* A task that is no longer alive has no successor to report. */
        if (!pid_alive(start))
                goto unlock;

        next = __next_thread(start);
        if (next)
                get_task_struct(next);  /* reference handed to the caller */
unlock:
        rcu_read_unlock();

        /* The caller's reference on the input task is always consumed. */
        put_task_struct(start);
        return next;
}

/* for the /proc/TGID/task/ directories */
static int proc_task_readdir(struct file *file, struct dir_context *ctx)
{
        struct inode *inode = file_inode(file);
        struct task_struct *task;
        struct pid_namespace *ns;
        int tid;

        if (proc_inode_is_dead(inode))
                return -ENOENT;

        /* Emit "." and ".."; stop quietly if the caller's buffer is full. */
        if (!dir_emit_dots(file, ctx))
                return 0;

        /* f_version caches the tid value that the last readdir call couldn't
         * return. lseek aka telldir automagically resets f_version to 0.
         */
        ns = proc_pid_ns(inode->i_sb);
        tid = (int)file->f_version;
        file->f_version = 0;
        /*
         * ctx->pos - 2 is the thread index (the first two positions are the
         * dot entries).  The loop increment both advances ctx->pos and lets
         * next_tid() trade the reference on the current task for one on the
         * next, so a 'continue' still drops the current reference.
         */
        for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
             task;
             task = next_tid(task), ctx->pos++) {
                char name[10 + 1];
                unsigned int len;

                tid = task_pid_nr_ns(task, ns);
                if (!tid)
                        continue;        /* The task has just exited. */
                len = snprintf(name, sizeof(name), "%u", tid);
                if (!proc_fill_cache(file, ctx, name, len,
                                proc_task_instantiate, task, NULL)) {
                        /* returning this tid failed, save it as the first
                         * tid for the next readdir call */
                        file->f_version = (u64)tid;
                        /*
                         * 'break' skips the loop increment, so the reference
                         * next_tid() would have released is dropped here.
                         */
                        put_task_struct(task);
                        break;
                }
        }

        return 0;
}

static int proc_task_getattr(struct mnt_idmap *idmap,
                             const struct path *path, struct kstat *stat,
                             u32 request_mask, unsigned int query_flags)
{
        struct inode *dir = d_inode(path->dentry);
        struct task_struct *tsk = get_proc_task(dir);

        /* Start from the generic attributes of the directory inode. */
        generic_fillattr(&nop_mnt_idmap, request_mask, dir, stat);

        /* Without a task there is nothing to add to the link count. */
        if (!tsk)
                return 0;

        /* Add the thread count of the task to the reported link count. */
        stat->nlink += get_nr_threads(tsk);
        put_task_struct(tsk);

        return 0;
}

/* Inode operations installed on the /proc/TGID/task directory. */
static const struct inode_operations proc_task_inode_operations = {
        .lookup     = proc_task_lookup,
        .getattr    = proc_task_getattr,
        .setattr    = proc_setattr,
        .permission = proc_pid_permission,
};

/* File operations for the /proc/TGID/task directory (readdir + seek). */
static const struct file_operations proc_task_operations = {
        .read           = generic_read_dir,
        .iterate_shared = proc_task_readdir,
        .llseek         = generic_file_llseek,
};

/*
 * Compute nlink_tid and nlink_tgid once at init time from the
 * tid_base_stuff and tgid_base_stuff entry tables.
 */
void __init set_proc_pid_nlink(void)
{
        nlink_tid = pid_entry_nlink(tid_base_stuff,
                                    ARRAY_SIZE(tid_base_stuff));
        nlink_tgid = pid_entry_nlink(tgid_base_stuff,
                                     ARRAY_SIZE(tgid_base_stuff));
}























































































































































































































































































































































































































































































































































































































    3 
    3 














































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
// SPDX-License-Identifier: GPL-2.0+
/*
 *  Universal/legacy driver for 8250/16550-type serial ports
 *
 *  Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
 *
 *  Copyright (C) 2001 Russell King.
 *
 *  Supports: ISA-compatible 8250/16550 ports
 *              PNP 8250/16550 ports
 *              early_serial_setup() ports
 *              userspace-configurable "phantom" ports
 *              "serial8250" platform devices
 *              serial8250_register_8250_port() ports
 */

#include <linux/acpi.h>
#include <linux/cleanup.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/sysrq.h>
#include <linux/delay.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
#include <linux/tty.h>
#include <linux/ratelimit.h>
#include <linux/tty_flip.h>
#include <linux/serial.h>
#include <linux/serial_8250.h>
#include <linux/nmi.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/string_helpers.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#ifdef CONFIG_SPARC
#include <linux/sunserialcore.h>
#endif

#include <asm/irq.h>

#include "../serial_base.h"        /* For serial_base_add_isa_preferred_console() */

#include "8250.h"

/*
 * Configuration:
 *   share_irqs - whether we pass IRQF_SHARED to request_irq().  This option
 *                is unsafe when used on edge-triggered interrupts.
 */
static unsigned int share_irqs = SERIAL8250_SHARE_IRQS;

static unsigned int nr_uarts = CONFIG_SERIAL_8250_RUNTIME_UARTS;

static struct uart_driver serial8250_reg;

static unsigned int skip_txen_test; /* force skip of txen test at init time */

#define PASS_LIMIT        512

#include <asm/serial.h>
/*
 * SERIAL_PORT_DFNS tells us about built-in ports that have no
 * standard enumeration mechanism.   Platforms that can find all
 * serial ports via mechanisms like ACPI or PCI need not supply it.
 */
#ifndef SERIAL_PORT_DFNS
#define SERIAL_PORT_DFNS
#endif

static const struct old_serial_port old_serial_port[] = {
        SERIAL_PORT_DFNS /* defined in asm/serial.h */
};

#define UART_NR        CONFIG_SERIAL_8250_NR_UARTS

#ifdef CONFIG_SERIAL_8250_RSA

#define PORT_RSA_MAX 4
static unsigned long probe_rsa[PORT_RSA_MAX];
static unsigned int probe_rsa_count;
#endif /* CONFIG_SERIAL_8250_RSA  */

/*
 * Per-IRQ bookkeeping: one instance exists for each IRQ number that has at
 * least one port linked to it.  Instances live in the irq_lists hash.
 */
struct irq_info {
        struct                        hlist_node node;        /* Link in the irq_lists hash bucket */
        int                        irq;        /* IRQ number this entry describes */
        spinlock_t                lock;        /* Protects list not the hash */
        struct list_head        *head;        /* A port's list node in the ring of ports on this IRQ; NULL when none */
};

#define NR_IRQ_HASH                32        /* Can be adjusted later */
static struct hlist_head irq_lists[NR_IRQ_HASH];
static DEFINE_MUTEX(hash_mutex);        /* Used to walk the hash */

/*
 * This is the serial driver's interrupt routine.
 *
 * Arjan thinks the old way was overly complex, so it got simplified.
 * Alan disagrees, saying that we need the complexity to handle the weird
 * nature of ISA shared interrupts.  (This is a special exception.)
 *
 * In order to handle ISA shared interrupts properly, we need to check
 * that all ports have been serviced, and therefore the ISA interrupt
 * line has been de-asserted.
 *
 * This means we need to loop through all ports, checking that they
 * don't have an interrupt pending.
 */
static irqreturn_t serial8250_interrupt(int irq, void *dev_id)
{
        /* dev_id is the irq_info that was passed to request_irq(). */
        struct irq_info *i = dev_id;
        struct list_head *l, *end = NULL;
        int pass_counter = 0, handled = 0;

        pr_debug("%s(%d): start\n", __func__, irq);

        spin_lock(&i->lock);

        /*
         * Walk the ring of ports on this IRQ.  'end' marks the first port of
         * the current run of ports that reported nothing to do; the walk
         * stops once it comes back around to that port, i.e. after a full
         * lap in which no port handled anything.
         */
        l = i->head;
        do {
                struct uart_8250_port *up;
                struct uart_port *port;

                up = list_entry(l, struct uart_8250_port, list);
                port = &up->port;

                if (port->handle_irq(port)) {
                        /* Work was done: restart the idle run. */
                        handled = 1;
                        end = NULL;
                } else if (end == NULL)
                        end = l;

                l = l->next;

                /*
                 * Safety valve: each time the walk wraps to the head the
                 * pass counter is bumped, and the loop is abandoned once it
                 * has exceeded PASS_LIMIT.
                 */
                if (l == i->head && pass_counter++ > PASS_LIMIT)
                        break;
        } while (l != end);

        spin_unlock(&i->lock);

        pr_debug("%s(%d): end\n", __func__, irq);

        return IRQ_RETVAL(handled);
}

/*
 * To support ISA shared interrupts, we need to have one interrupt
 * handler that ensures that the IRQ line has been deasserted
 * before returning.  Failing to do this will result in the IRQ
 * line being stuck active, and, since ISA irqs are edge triggered,
 * no more IRQs will be seen.
 */
/*
 * Remove @up from the ring of ports attached to @i and free @i once no
 * port is left on it.  serial_unlink_irq_chain() calls this with hash_mutex
 * held, which covers the hlist_del() below.
 */
static void serial_do_unlink(struct irq_info *i, struct uart_8250_port *up)
{
        spin_lock_irq(&i->lock);

        if (!list_empty(i->head)) {
                /* Other ports remain: keep i->head pointing at one of them. */
                if (i->head == &up->list)
                        i->head = i->head->next;
                list_del(&up->list);
        } else {
                /* Single-entry ring: the only port on it must be @up. */
                BUG_ON(i->head != &up->list);
                i->head = NULL;
        }
        spin_unlock_irq(&i->lock);
        /* List empty so throw away the hash node */
        if (i->head == NULL) {
                hlist_del(&i->node);
                kfree(i);
        }
}

/*
 * Attach @up to the irq_info for its IRQ, creating the irq_info and
 * requesting the interrupt when @up is the first port on that IRQ.
 *
 * hash_mutex is held for the whole operation.  Dropping it right after the
 * hash lookup would let a concurrent serial_unlink_irq_chain() free the
 * irq_info that was just found, and the request_irq() failure path calls
 * serial_do_unlink(), which removes the entry from the hash and frees it;
 * both need the hash to be stable.  serial_unlink_irq_chain() likewise calls
 * free_irq() and serial_do_unlink() under hash_mutex.
 *
 * Returns 0 on success, -ENOMEM if the irq_info cannot be allocated, or the
 * negative error returned by request_irq().
 */
static int serial_link_irq_chain(struct uart_8250_port *up)
{
        struct hlist_head *h;
        struct irq_info *i;
        int ret = 0;

        mutex_lock(&hash_mutex);

        h = &irq_lists[up->port.irq % NR_IRQ_HASH];

        /* The iterator is left NULL when no entry matches this IRQ. */
        hlist_for_each_entry(i, h, node)
                if (i->irq == up->port.irq)
                        break;

        if (i == NULL) {
                i = kzalloc(sizeof(*i), GFP_KERNEL);
                if (i == NULL) {
                        ret = -ENOMEM;
                        goto out_unlock;
                }
                spin_lock_init(&i->lock);
                i->irq = up->port.irq;
                hlist_add_head(&i->node, h);
        }

        spin_lock_irq(&i->lock);

        if (i->head) {
                /* IRQ already requested by an earlier port: just join. */
                list_add(&up->list, i->head);
                spin_unlock_irq(&i->lock);
                goto out_unlock;
        }

        /* First port on this IRQ: start the ring, then request the IRQ. */
        INIT_LIST_HEAD(&up->list);
        i->head = &up->list;
        spin_unlock_irq(&i->lock);

        ret = request_irq(up->port.irq, serial8250_interrupt,
                          up->port.irqflags, up->port.name, i);
        if (ret < 0)
                serial_do_unlink(i, up);

out_unlock:
        mutex_unlock(&hash_mutex);
        return ret;
}

static void serial_unlink_irq_chain(struct uart_8250_port *up)
{
        struct irq_info *i;
        struct hlist_head *h;

        mutex_lock(&hash_mutex);

        h = &irq_lists[up->port.irq % NR_IRQ_HASH];

        hlist_for_each_entry(i, h, node)
                if (i->irq == up->port.irq)
                        break;

        BUG_ON(i == NULL);
        BUG_ON(i->head == NULL);

        if (list_empty(i->head))
                free_irq(up->port.irq, i);

        serial_do_unlink(i, up);
        mutex_unlock(&hash_mutex);
}

/*
 * This function is used to handle ports that do not have an
 * interrupt.  This doesn't work very well for 16450's, but gives
 * barely passable results for a 16550A.  (Although at the expense
 * of much CPU overhead).
 */
static void serial8250_timeout(struct timer_list *t)
{
        struct uart_8250_port *up = from_timer(up, t, timer);

        up->port.handle_irq(&up->port);
        mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port));
}

/*
 * Timer callback installed by univ8250_setup_timer() for ports flagged
 * UART_BUG_THRE: checks whether a transmit interrupt appears to have been
 * missed and, if so, pushes the transmitter along by hand.
 */
static void serial8250_backup_timeout(struct timer_list *t)
{
        struct uart_8250_port *up = from_timer(up, t, timer);
        unsigned int iir, ier = 0, lsr;
        unsigned long flags;

        uart_port_lock_irqsave(&up->port, &flags);

        /*
         * Must disable interrupts or else we risk racing with the interrupt
         * based handler.
         */
        if (up->port.irq) {
                /* Remember the current IER so it can be restored below. */
                ier = serial_in(up, UART_IER);
                serial_out(up, UART_IER, 0);
        }

        iir = serial_in(up, UART_IIR);

        /*
         * This should be a safe test for anyone who doesn't trust the
         * IIR bits on their UART, but it's specifically designed for
         * the "Diva" UART used on the management processor on many HP
         * ia64 and parisc boxes.
         */
        lsr = serial_lsr_in(up);
        /*
         * IIR claims nothing is pending, yet THRI is enabled in up->ier,
         * there is data (or an x_char) to send and LSR reports THRE:
         * rewrite the local iir copy so it reads as a THRI interrupt.
         */
        if ((iir & UART_IIR_NO_INT) && (up->ier & UART_IER_THRI) &&
            (!kfifo_is_empty(&up->port.state->port.xmit_fifo) ||
             up->port.x_char) &&
            (lsr & UART_LSR_THRE)) {
                iir &= ~(UART_IIR_ID | UART_IIR_NO_INT);
                iir |= UART_IIR_THRI;
        }

        if (!(iir & UART_IIR_NO_INT))
                serial8250_tx_chars(up);

        /* Put back the IER value saved above. */
        if (up->port.irq)
                serial_out(up, UART_IER, ier);

        uart_port_unlock_irqrestore(&up->port, flags);

        /* Standard timer interval plus 0.2s to keep the port running */
        mod_timer(&up->timer,
                jiffies + uart_poll_timeout(&up->port) + HZ / 5);
}

/*
 * Arm the per-port timer where one is needed: the backup timer for ports
 * flagged UART_BUG_THRE, and the plain polling timer for ports without an
 * IRQ.  Installed as the .setup_timer hook of univ8250_driver_ops.
 */
static void univ8250_setup_timer(struct uart_8250_port *up)
{
        struct uart_port *port = &up->port;

        /*
         * The above check will only give an accurate result the first time
         * the port is opened so this value needs to be preserved.
         *
         * NOTE(review): the check referred to is not in this function;
         * presumably it is wherever UART_BUG_THRE gets set - confirm.
         */
        if (up->bugs & UART_BUG_THRE) {
                pr_debug("%s - using backup timer\n", port->name);

                /* Switch the callback; univ8250_release_irq() resets it. */
                up->timer.function = serial8250_backup_timeout;
                mod_timer(&up->timer, jiffies +
                          uart_poll_timeout(port) + HZ / 5);
        }

        /*
         * If the "interrupt" for this port doesn't correspond with any
         * hardware interrupt, we use a timer-based system.  The original
         * driver used to do this with IRQ0.
         */
        if (!port->irq)
                mod_timer(&up->timer, jiffies + uart_poll_timeout(port));
}

/*
 * .setup_irq hook: link the port into its IRQ chain.  Ports without an IRQ
 * have nothing to link and report success.
 */
static int univ8250_setup_irq(struct uart_8250_port *up)
{
        if (!up->port.irq)
                return 0;

        return serial_link_irq_chain(up);
}

/*
 * .release_irq hook: stop the port timer, restore the default timer
 * callback and unlink the port from its IRQ chain if it has an IRQ.
 */
static void univ8250_release_irq(struct uart_8250_port *up)
{
        del_timer_sync(&up->timer);

        /* Undo a possible switch to the backup timer callback. */
        up->timer.function = serial8250_timeout;

        if (!up->port.irq)
                return;

        serial_unlink_irq_chain(up);
}

#ifdef CONFIG_SERIAL_8250_RSA
/*
 * Claim the I/O region of the RSA register block for @up.
 *
 * Returns 0 on success, -EBUSY if the region is already taken and -EINVAL
 * for any iotype other than UPIO_HUB6 or UPIO_PORT.
 */
static int serial8250_request_rsa_resource(struct uart_8250_port *up)
{
        unsigned long start = UART_RSA_BASE << up->port.regshift;
        unsigned int size = 8 << up->port.regshift;
        struct uart_port *port = &up->port;

        if (port->iotype != UPIO_HUB6 && port->iotype != UPIO_PORT)
                return -EINVAL;

        start += port->iobase;
        if (!request_region(start, size, "serial-rsa"))
                return -EBUSY;

        return 0;
}

/*
 * Release the I/O region claimed by serial8250_request_rsa_resource().
 * Does nothing for iotypes other than UPIO_HUB6 and UPIO_PORT.
 */
static void serial8250_release_rsa_resource(struct uart_8250_port *up)
{
        unsigned long offset = UART_RSA_BASE << up->port.regshift;
        unsigned int size = 8 << up->port.regshift;
        struct uart_port *port = &up->port;

        if (port->iotype == UPIO_HUB6 || port->iotype == UPIO_PORT)
                release_region(port->iobase + offset, size);
}
#endif

static const struct uart_ops *base_ops;
static struct uart_ops univ8250_port_ops;

/* IRQ and timer hooks assigned to every port by serial8250_setup_port(). */
static const struct uart_8250_ops univ8250_driver_ops = {
        .setup_irq   = univ8250_setup_irq,
        .release_irq = univ8250_release_irq,
        .setup_timer = univ8250_setup_timer,
};

static struct uart_8250_port serial8250_ports[UART_NR];

/**
 * serial8250_get_port - retrieve struct uart_8250_port
 * @line: serial line number
 *
 * This function retrieves struct uart_8250_port for the specific line.
 * This struct *must* *not* be used to perform a 8250 or serial core operation
 * which is not accessible otherwise. Its only purpose is to make the struct
 * accessible to the runtime-pm callbacks for context suspend/restore.
 * The lock assumption made here is none because runtime-pm suspend/resume
 * callbacks should not be invoked if there is any operation performed on the
 * port.
 */
struct uart_8250_port *serial8250_get_port(int line)
{
        /* No range check is done here: @line indexes the array as-is. */
        return serial8250_ports + line;
}
EXPORT_SYMBOL_GPL(serial8250_get_port);

static void (*serial8250_isa_config)(int port, struct uart_port *up,
        u32 *capabilities);

/*
 * Install the callback that serial8250_isa_init_ports() invokes for each
 * legacy port it fills in.
 */
void serial8250_set_isa_configurator(void (*v)(int port, struct uart_port *up,
                                               u32 *capabilities))
{
        serial8250_isa_config = v;
}
EXPORT_SYMBOL(serial8250_set_isa_configurator);

#ifdef CONFIG_SERIAL_8250_RSA

static void univ8250_config_port(struct uart_port *port, int flags)
{
        struct uart_8250_port *up = up_to_u8250p(port);

        up->probe &= ~UART_PROBE_RSA;
        if (port->type == PORT_RSA) {
                if (serial8250_request_rsa_resource(up) == 0)
                        up->probe |= UART_PROBE_RSA;
        } else if (flags & UART_CONFIG_TYPE) {
                int i;

                for (i = 0; i < probe_rsa_count; i++) {
                        if (probe_rsa[i] == up->port.iobase) {
                                if (serial8250_request_rsa_resource(up) == 0)
                                        up->probe |= UART_PROBE_RSA;
                                break;
                        }
                }
        }

        base_ops->config_port(port, flags);

        if (port->type != PORT_RSA && up->probe & UART_PROBE_RSA)
                serial8250_release_rsa_resource(up);
}

/*
 * request_port wrapper: after the base request succeeds, PORT_RSA ports
 * additionally claim the RSA region; the base request is rolled back if
 * that fails.
 */
static int univ8250_request_port(struct uart_port *port)
{
        struct uart_8250_port *up = up_to_u8250p(port);
        int err;

        err = base_ops->request_port(port);
        if (err != 0 || port->type != PORT_RSA)
                return err;

        err = serial8250_request_rsa_resource(up);
        if (err < 0)
                base_ops->release_port(port);

        return err;
}

/*
 * release_port wrapper: PORT_RSA ports give back the RSA region before the
 * base release runs.
 */
static void univ8250_release_port(struct uart_port *port)
{
        if (port->type == PORT_RSA)
                serial8250_release_rsa_resource(up_to_u8250p(port));

        base_ops->release_port(port);
}

/* Point the port-resource callbacks of @ops at the RSA-aware wrappers. */
static void univ8250_rsa_support(struct uart_ops *ops)
{
        ops->config_port = univ8250_config_port;
        ops->request_port = univ8250_request_port;
        ops->release_port = univ8250_release_port;
}

#else
#define univ8250_rsa_support(x)                do { } while (0)
#endif /* CONFIG_SERIAL_8250_RSA */

/* Set UPQ_NO_TXEN_TEST on the port when skip_txen_test is non-zero. */
static inline void serial8250_apply_quirks(struct uart_8250_port *up)
{
        if (skip_txen_test)
                up->port.quirks |= UPQ_NO_TXEN_TEST;
}

/*
 * Initialise slot @index of serial8250_ports[] and return it, or NULL when
 * @index is not below UART_NR.
 */
static struct uart_8250_port *serial8250_setup_port(int index)
{
        struct uart_8250_port *up;

        if (index >= UART_NR)
                return NULL;

        up = &serial8250_ports[index];
        up->port.line = index;
        up->port.port_id = index;

        serial8250_init_port(up);
        /*
         * Remember the ops the port had after serial8250_init_port() the
         * first time through, then point the port at univ8250_port_ops.
         */
        if (!base_ops)
                base_ops = up->port.ops;
        up->port.ops = &univ8250_port_ops;

        /* Default timer callback; univ8250_setup_timer() may replace it. */
        timer_setup(&up->timer, serial8250_timeout, 0);

        up->ops = &univ8250_driver_ops;

        serial8250_set_defaults(up);

        return up;
}

/*
 * One-time initialisation of the static port array: sets up the first
 * nr_uarts slots, builds univ8250_port_ops from the base ops, and fills in
 * the ports described by the old_serial_port[] table.
 */
static void __init serial8250_isa_init_ports(void)
{
        struct uart_8250_port *up;
        /* Guard so that repeated calls after the first are no-ops. */
        static int first = 1;
        int i, irqflag = 0;

        if (!first)
                return;
        first = 0;

        /* Never set up more ports than the array can hold. */
        if (nr_uarts > UART_NR)
                nr_uarts = UART_NR;

        /*
         * Set up initial isa ports based on nr_uarts module param, or else
         * default to CONFIG_SERIAL_8250_RUNTIME_UARTS. Note that we do not
         * need to increase nr_uarts when setting up the initial isa ports.
         */
        for (i = 0; i < nr_uarts; i++)
                serial8250_setup_port(i);

        /*
         * chain base port ops to support Remote Supervisor Adapter
         *
         * base_ops is set by serial8250_setup_port(); the callers visible
         * in this file return early when nr_uarts is 0, so the loop above
         * has run at least once before this dereference.
         */
        univ8250_port_ops = *base_ops;
        univ8250_rsa_support(&univ8250_port_ops);

        if (share_irqs)
                irqflag = IRQF_SHARED;

        /* Copy each legacy table entry into the matching port slot. */
        for (i = 0, up = serial8250_ports;
             i < ARRAY_SIZE(old_serial_port) && i < nr_uarts;
             i++, up++) {
                struct uart_port *port = &up->port;

                port->iobase   = old_serial_port[i].port;
                port->irq      = irq_canonicalize(old_serial_port[i].irq);
                port->irqflags = 0;
                port->uartclk  = old_serial_port[i].baud_base * 16;
                port->flags    = old_serial_port[i].flags;
                port->hub6     = 0;
                port->membase  = old_serial_port[i].iomem_base;
                port->iotype   = old_serial_port[i].io_type;
                port->regshift = old_serial_port[i].iomem_reg_shift;

                port->irqflags |= irqflag;
                /* Optional hook set via serial8250_set_isa_configurator(). */
                if (serial8250_isa_config != NULL)
                        serial8250_isa_config(i, &up->port, &up->capabilities);

                serial_base_add_isa_preferred_console(serial8250_reg.dev_name, i);
        }
}

/*
 * Register every set-up port that is not yet bound to a device with @drv,
 * using @dev as its parent.  PORT_8250_CIR ports are left alone.
 */
static void __init
serial8250_register_ports(struct uart_driver *drv, struct device *dev)
{
        struct uart_8250_port *up;
        struct uart_port *port;
        int i;

        for (i = 0; i < nr_uarts; i++) {
                up = &serial8250_ports[i];
                port = &up->port;

                /* Skip CIR ports and ports that already have a device. */
                if (port->type == PORT_8250_CIR || port->dev)
                        continue;

                port->dev = dev;

                if (uart_console_registered(port))
                        pm_runtime_get_sync(port->dev);

                serial8250_apply_quirks(up);
                uart_add_one_port(drv, port);
        }
}

#ifdef CONFIG_SERIAL_8250_CONSOLE

static void univ8250_console_write(struct console *co, const char *s,
                                   unsigned int count)
{
        struct uart_8250_port *up = &serial8250_ports[co->index];

        serial8250_console_write(up, s, count);
}

static int univ8250_console_setup(struct console *co, char *options)
{
        struct uart_8250_port *up;
        struct uart_port *port;
        int retval, i;

        /*
         * Check whether an invalid uart number has been specified, and
         * if so, search for the first available port that does have
         * console support.
         */
        if (co->index < 0 || co->index >= UART_NR)
                co->index = 0;

        /*
         * If the console is past the initial isa ports, init more ports up to
         * co->index as needed and increment nr_uarts accordingly.
         */
        for (i = nr_uarts; i <= co->index; i++) {
                up = serial8250_setup_port(i);
                if (!up)
                        return -ENODEV;
                nr_uarts++;
        }

        port = &serial8250_ports[co->index].port;
        /* link port to console */
        port->cons = co;

        retval = serial8250_console_setup(port, options, false);
        if (retval != 0)
                port->cons = NULL;
        return retval;
}

/* Console .exit hook: forward to the port selected by co->index. */
static int univ8250_console_exit(struct console *co)
{
        return serial8250_console_exit(&serial8250_ports[co->index].port);
}

/**
 *        univ8250_console_match - non-standard console matching
 *        @co:          registering console
 *        @name:          name from console command line
 *        @idx:          index from console command line
 *        @options: ptr to option string from console command line
 *
 *        Only attempts to match console command lines of the form:
 *            console=uart[8250],io|mmio|mmio16|mmio32,<addr>[,<options>]
 *            console=uart[8250],0x<addr>[,<options>]
 *        This form is used to register an initial earlycon boot console and
 *        replace it with the serial8250_console at 8250 driver init.
 *
 *        Performs console setup for a match (as required by interface)
 *        If no <options> are specified, then assume the h/w is already setup.
 *
 *        Returns 0 if console matches; otherwise non-zero to use default matching
 */
static int univ8250_console_match(struct console *co, char *name, int idx,
                                  char *options)
{
        unsigned char iotype;
        resource_size_t addr;
        int n;

        /* Only names starting with the 8250 earlycon name "uart" qualify. */
        if (strncmp(name, "uart", 4) != 0)
                return -ENODEV;

        if (uart_parse_earlycon(options, &iotype, &addr, &options))
                return -ENODEV;

        /* try to match the port specified on the command line */
        for (n = 0; n < nr_uarts; n++) {
                struct uart_port *port = &serial8250_ports[n].port;

                if (port->iotype != iotype)
                        continue;

                /* Memory-mapped types compare mapbase, port I/O iobase. */
                switch (iotype) {
                case UPIO_MEM:
                case UPIO_MEM16:
                case UPIO_MEM32:
                case UPIO_MEM32BE:
                        if (port->mapbase != addr)
                                continue;
                        break;
                case UPIO_PORT:
                        if (port->iobase != addr)
                                continue;
                        break;
                default:
                        break;
                }

                co->index = n;
                port->cons = co;
                return serial8250_console_setup(port, options, true);
        }

        return -ENODEV;
}

/* The "ttyS" console; .index starts at -1 and is set by setup/match. */
static struct console univ8250_console = {
        .name   = "ttyS",
        .write  = univ8250_console_write,
        .device = uart_console_device,
        .setup  = univ8250_console_setup,
        .exit   = univ8250_console_exit,
        .match  = univ8250_console_match,
        .flags  = CON_PRINTBUFFER | CON_ANYTIME,
        .index  = -1,
        .data   = &serial8250_reg,
};

/*
 * Console initcall: initialise the ISA port table and register the 8250
 * console.  Returns -ENODEV when nr_uarts is zero, 0 otherwise.
 */
static int __init univ8250_console_init(void)
{
        if (!nr_uarts)
                return -ENODEV;

        /* Initialise the port table first, then register the console. */
        serial8250_isa_init_ports();
        register_console(&univ8250_console);

        return 0;
}
console_initcall(univ8250_console_init);

#define SERIAL8250_CONSOLE        (&univ8250_console)
#else
#define SERIAL8250_CONSOLE        NULL
#endif

/*
 * The uart_driver instance shared by every port in this file.  It is the
 * first argument to all uart_add_one_port() / uart_remove_one_port() /
 * uart_suspend_port() / uart_resume_port() calls below and is registered
 * in serial8250_init().
 */
static struct uart_driver serial8250_reg = {
        .owner                        = THIS_MODULE,
        .driver_name                = "serial",
        .dev_name                = "ttyS",
        .major                        = TTY_MAJOR,
        .minor                        = 64,
        .cons                        = SERIAL8250_CONSOLE,        /* NULL when the console code is compiled out */
};

/*
 * early_serial_setup - early registration for 8250 ports
 * @port: template for the port; @port->line selects the table slot
 *
 * Copy the template into the matching serial8250_ports[] entry prior to
 * console initialisation.  Use after console initialisation will cause
 * undefined behaviour.
 *
 * Returns 0 on success, or -ENODEV when no UARTs are configured or the
 * requested line lies outside the port table.
 */
int __init early_serial_setup(struct uart_port *port)
{
        struct uart_port *slot;

        if (nr_uarts == 0)
                return -ENODEV;
        if (port->line >= ARRAY_SIZE(serial8250_ports))
                return -ENODEV;

        serial8250_isa_init_ports();
        slot = &serial8250_ports[port->line].port;

        /* identity */
        slot->line         = port->line;
        slot->type         = port->type;
        slot->flags        = port->flags;
        slot->private_data = port->private_data;

        /* register access */
        slot->iotype       = port->iotype;
        slot->iobase       = port->iobase;
        slot->membase      = port->membase;
        slot->mapbase      = port->mapbase;
        slot->mapsize      = port->mapsize;
        slot->regshift     = port->regshift;

        /* interrupt, clock and FIFO */
        slot->irq          = port->irq;
        slot->irqflags     = port->irqflags;
        slot->uartclk      = port->uartclk;
        slot->fifosize     = port->fifosize;

        serial8250_set_defaults(up_to_u8250p(slot));

        /*
         * Accessors supplied by the caller are applied after the
         * defaults so that they take precedence.
         */
        if (port->serial_in)
                slot->serial_in = port->serial_in;
        if (port->serial_out)
                slot->serial_out = port->serial_out;
        if (port->handle_irq)
                slot->handle_irq = port->handle_irq;

        return 0;
}

/**
 *        serial8250_suspend_port - suspend one serial port
 *        @line:  serial line number
 *
 *        Suspend the port stored in slot @line of the port table.
 *
 *        When console suspend is disabled and the port is the console
 *        (and its type is not PORT_8250), a canary byte is written to
 *        the scratch register first and remembered in the port if it
 *        reads back unchanged.
 */
void serial8250_suspend_port(int line)
{
        struct uart_8250_port *uart = &serial8250_ports[line];
        struct uart_port *uport = &uart->port;
        const unsigned char canary = 0xa5;

        if (!console_suspend_enabled && uart_console(uport) &&
            uport->type != PORT_8250) {
                serial_out(uart, UART_SCR, canary);

                /* Only record the canary if the scratch register held it. */
                if (serial_in(uart, UART_SCR) == canary)
                        uart->canary = canary;
        }

        uart_suspend_port(&serial8250_reg, uport);
}
EXPORT_SYMBOL(serial8250_suspend_port);

/**
 *        serial8250_resume_port - resume one serial port
 *        @line:  serial line number
 *
 *        Resume the port stored in slot @line of the port table.  Ports
 *        with the UART_NATSEMI capability are put back into high speed
 *        mode before the resume is handed to the serial core.
 */
void serial8250_resume_port(int line)
{
        struct uart_8250_port *uart = &serial8250_ports[line];
        struct uart_port *uport = &uart->port;

        /* Forget any canary recorded by serial8250_suspend_port(). */
        uart->canary = 0;

        if (uart->capabilities & UART_NATSEMI) {
                /* Ensure it's still in high speed mode */
                serial_port_out(uport, UART_LCR, 0xE0);
                ns16550a_goto_highspeed(uart);
                serial_port_out(uport, UART_LCR, 0);

                uport->uartclk = 921600*16;
        }

        uart_resume_port(&serial8250_reg, uport);
}
EXPORT_SYMBOL(serial8250_resume_port);

/*
 * Register a set of serial devices attached to a platform device.  The
 * platform data is a list of plat_serial8250_port entries terminated with
 * a zero flags entry, which means we expect all entries to have at least
 * UPF_BOOT_AUTOCONF set.
 *
 * A port that fails to register is reported with dev_err() and skipped;
 * the probe itself always returns 0.
 */
static int serial8250_probe(struct platform_device *dev)
{
        struct plat_serial8250_port *p = dev_get_platdata(&dev->dev);
        const int irqflag = share_irqs ? IRQF_SHARED : 0;
        struct uart_8250_port uart;
        int ret, i;

        memset(&uart, 0, sizeof(uart));

        for (i = 0; p && p->flags; p++, i++) {
                /* Generic uart_port part of the template. */
                uart.port.dev                = &dev->dev;
                uart.port.type                = p->type;
                uart.port.flags                = p->flags;
                uart.port.iotype        = p->iotype;
                uart.port.iobase        = p->iobase;
                uart.port.membase        = p->membase;
                uart.port.mapbase        = p->mapbase;
                uart.port.mapsize        = p->mapsize;
                uart.port.regshift        = p->regshift;
                uart.port.hub6                = p->hub6;
                uart.port.irq                = p->irq;
                /* IRQF_SHARED is added on top when share_irqs is set */
                uart.port.irqflags        = p->irqflags | irqflag;
                uart.port.uartclk        = p->uartclk;
                uart.port.has_sysrq        = p->has_sysrq;
                uart.port.private_data        = p->private_data;

                /* Optional callbacks supplied by the platform. */
                uart.port.serial_in        = p->serial_in;
                uart.port.serial_out        = p->serial_out;
                uart.port.handle_irq        = p->handle_irq;
                uart.port.handle_break        = p->handle_break;
                uart.port.set_termios        = p->set_termios;
                uart.port.set_ldisc        = p->set_ldisc;
                uart.port.get_mctrl        = p->get_mctrl;
                uart.port.pm                = p->pm;

                /* 8250-specific part of the template. */
                uart.bugs                = p->bugs;
                uart.dl_read                = p->dl_read;
                uart.dl_write                = p->dl_write;

                ret = serial8250_register_8250_port(&uart);
                if (ret >= 0)
                        continue;

                dev_err(&dev->dev, "unable to register port at index %d "
                        "(IO%lx MEM%llx IRQ%d): %d\n", i,
                        p->iobase, (unsigned long long)p->mapbase,
                        p->irq, ret);
        }

        return 0;
}

/*
 * Remove serial ports registered against a platform device.
 */
static void serial8250_remove(struct platform_device *dev)
{
        int i;

        for (i = 0; i < nr_uarts; i++) {
                struct uart_8250_port *up = &serial8250_ports[i];

                if (up->port.dev == &dev->dev)
                        serial8250_unregister_port(i);
        }
}

/*
 * Platform suspend hook: suspend every known port (type other than
 * PORT_UNKNOWN) that belongs to @dev.  @state is not used.  Always
 * returns 0.
 */
static int serial8250_suspend(struct platform_device *dev, pm_message_t state)
{
        struct uart_8250_port *up;
        int line;

        for (line = 0; line < UART_NR; line++) {
                up = &serial8250_ports[line];

                if (up->port.type == PORT_UNKNOWN || up->port.dev != &dev->dev)
                        continue;

                uart_suspend_port(&serial8250_reg, &up->port);
        }

        return 0;
}

/*
 * Platform resume hook: resume every known port (type other than
 * PORT_UNKNOWN) that belongs to @dev via serial8250_resume_port().
 * Always returns 0.
 */
static int serial8250_resume(struct platform_device *dev)
{
        struct uart_8250_port *up;
        int line;

        for (line = 0; line < UART_NR; line++) {
                up = &serial8250_ports[line];

                if (up->port.type == PORT_UNKNOWN || up->port.dev != &dev->dev)
                        continue;

                serial8250_resume_port(line);
        }

        return 0;
}

/*
 * Platform driver for "serial8250" platform devices.  It is registered in
 * serial8250_init() and unregistered in serial8250_exit().
 */
static struct platform_driver serial8250_isa_driver = {
        .probe                = serial8250_probe,
        .remove_new        = serial8250_remove,
        .suspend        = serial8250_suspend,
        .resume                = serial8250_resume,
        .driver                = {
                /* same name that serial8250_init() passes to platform_device_alloc() */
                .name        = "serial8250",
        },
};

/*
 * This "device" covers _all_ ISA 8250-compatible serial devices listed
 * in the table in include/asm/serial.h
 *
 * It is allocated in serial8250_init().  serial8250_exit() sets it to NULL
 * before tearing down, which serial8250_unregister_port() takes as the
 * signal not to re-register a removed port against it.
 */
static struct platform_device *serial8250_isa_devs;

/*
 * serial8250_register_8250_port and serial8250_unregister_port allows for
 * 16x50 serial ports to be configured at run-time, to support PCMCIA
 * modems and PCI multiport cards.
 *
 * Both functions hold serial_mutex for the whole time they modify a
 * serial8250_ports[] slot.
 */
static DEFINE_MUTEX(serial_mutex);

static struct uart_8250_port *serial8250_find_match_or_unused(const struct uart_port *port)
{
        int i;

        /*
         * First, find a port entry which matches.
         */
        for (i = 0; i < nr_uarts; i++)
                if (uart_match_port(&serial8250_ports[i].port, port))
                        return &serial8250_ports[i];

        /* try line number first if still available */
        i = port->line;
        if (i < nr_uarts && serial8250_ports[i].port.type == PORT_UNKNOWN &&
                        serial8250_ports[i].port.iobase == 0)
                return &serial8250_ports[i];
        /*
         * We didn't find a matching entry, so look for the first
         * free entry.  We look for one which hasn't been previously
         * used (indicated by zero iobase).
         */
        for (i = 0; i < nr_uarts; i++)
                if (serial8250_ports[i].port.type == PORT_UNKNOWN &&
                    serial8250_ports[i].port.iobase == 0)
                        return &serial8250_ports[i];

        /*
         * That also failed.  Last resort is to find any entry which
         * doesn't have a real port associated with it.
         */
        for (i = 0; i < nr_uarts; i++)
                if (serial8250_ports[i].port.type == PORT_UNKNOWN)
                        return &serial8250_ports[i];

        return NULL;
}

static void serial_8250_overrun_backoff_work(struct work_struct *work)
{
        struct uart_8250_port *up =
            container_of(to_delayed_work(work), struct uart_8250_port,
                         overrun_backoff);
        struct uart_port *port = &up->port;
        unsigned long flags;

        uart_port_lock_irqsave(port, &flags);
        up->ier |= UART_IER_RLSI | UART_IER_RDI;
        up->port.read_status_mask |= UART_LSR_DR;
        serial_out(up, UART_IER, up->ier);
        uart_port_unlock_irqrestore(port, flags);
}

/**
 *        serial8250_register_8250_port - register a serial port
 *        @up: serial port template
 *
 *        Configure the serial port specified by the request. If the
 *        port exists and is in use, it is hung up and unregistered
 *        first.
 *
 *        The port is then probed and if necessary the IRQ is autodetected.
 *        If this fails an error is returned.
 *
 *        On success the port is ready to use and the line number is returned.
 *
 *        Return: the line number on success, or 0 when the port turns out
 *        to be a CIR port and is skipped.  -EINVAL if @up has a zero
 *        uartclk, -ENOSPC if no slot could be found or set up, otherwise
 *        the error reported by uart_get_rs485_mode(), mctrl_gpio_init() or
 *        uart_add_one_port().
 *
 *        NOTE(review): when the slot that is found already has type
 *        PORT_8250_CIR the whole configuration is skipped and the initial
 *        -ENOSPC is returned - confirm that this is the intended code.
 */
int serial8250_register_8250_port(const struct uart_8250_port *up)
{
        struct uart_8250_port *uart;
        int ret = -ENOSPC;

        if (up->port.uartclk == 0)
                return -EINVAL;

        /* The slot table is only modified with serial_mutex held. */
        mutex_lock(&serial_mutex);

        uart = serial8250_find_match_or_unused(&up->port);
        if (!uart) {
                /*
                 * If the port is past the initial isa ports, initialize a new
                 * port and increment nr_uarts accordingly.
                 */
                uart = serial8250_setup_port(nr_uarts);
                if (!uart)
                        goto unlock;
                nr_uarts++;
        }

        if (uart->port.type != PORT_8250_CIR) {
                struct mctrl_gpios *gpios;

                /* A slot that is bound to a device is removed before being reused. */
                if (uart->port.dev)
                        uart_remove_one_port(&serial8250_reg, &uart->port);

                /* Copy the plain data members of the template into the slot. */
                uart->port.ctrl_id        = up->port.ctrl_id;
                uart->port.port_id        = up->port.port_id;
                uart->port.iobase       = up->port.iobase;
                uart->port.membase      = up->port.membase;
                uart->port.irq          = up->port.irq;
                uart->port.irqflags     = up->port.irqflags;
                uart->port.uartclk      = up->port.uartclk;
                uart->port.fifosize     = up->port.fifosize;
                uart->port.regshift     = up->port.regshift;
                uart->port.iotype       = up->port.iotype;
                uart->port.flags        = up->port.flags | UPF_BOOT_AUTOCONF;
                uart->bugs                = up->bugs;
                uart->port.mapbase      = up->port.mapbase;
                uart->port.mapsize      = up->port.mapsize;
                uart->port.private_data = up->port.private_data;
                uart->tx_loadsz                = up->tx_loadsz;
                uart->capabilities        = up->capabilities;
                uart->port.throttle        = up->port.throttle;
                uart->port.unthrottle        = up->port.unthrottle;
                uart->port.rs485_config        = up->port.rs485_config;
                uart->port.rs485_supported = up->port.rs485_supported;
                uart->port.rs485        = up->port.rs485;
                uart->rs485_start_tx        = up->rs485_start_tx;
                uart->rs485_stop_tx        = up->rs485_stop_tx;
                uart->lsr_save_mask        = up->lsr_save_mask;
                uart->dma                = up->dma;

                /* Take tx_loadsz from fifosize if it wasn't set separately */
                if (uart->port.fifosize && !uart->tx_loadsz)
                        uart->tx_loadsz = uart->port.fifosize;

                /* The RS485 mode is only queried when the template names a device. */
                if (up->port.dev) {
                        uart->port.dev = up->port.dev;
                        ret = uart_get_rs485_mode(&uart->port);
                        if (ret)
                                goto err;
                }

                if (up->port.flags & UPF_FIXED_TYPE)
                        uart->port.type = up->port.type;

                /*
                 * Only call mctrl_gpio_init(), if the device has no ACPI
                 * companion device
                 *
                 * NOTE(review): uart->port.dev may still be unset here if
                 * the template carried no device - confirm that
                 * has_acpi_companion() copes with that.
                 */
                if (!has_acpi_companion(uart->port.dev)) {
                        gpios = mctrl_gpio_init(&uart->port, 0);
                        if (IS_ERR(gpios)) {
                                ret = PTR_ERR(gpios);
                                goto err;
                        } else {
                                uart->gpios = gpios;
                        }
                }

                serial8250_set_defaults(uart);

                /*
                 * Callbacks set in the template are applied after the
                 * defaults; callbacks left NULL keep whatever the slot has.
                 */
                /* Possibly override default I/O functions.  */
                if (up->port.serial_in)
                        uart->port.serial_in = up->port.serial_in;
                if (up->port.serial_out)
                        uart->port.serial_out = up->port.serial_out;
                if (up->port.handle_irq)
                        uart->port.handle_irq = up->port.handle_irq;
                /*  Possibly override set_termios call */
                if (up->port.set_termios)
                        uart->port.set_termios = up->port.set_termios;
                if (up->port.set_ldisc)
                        uart->port.set_ldisc = up->port.set_ldisc;
                if (up->port.get_mctrl)
                        uart->port.get_mctrl = up->port.get_mctrl;
                if (up->port.set_mctrl)
                        uart->port.set_mctrl = up->port.set_mctrl;
                if (up->port.get_divisor)
                        uart->port.get_divisor = up->port.get_divisor;
                if (up->port.set_divisor)
                        uart->port.set_divisor = up->port.set_divisor;
                if (up->port.startup)
                        uart->port.startup = up->port.startup;
                if (up->port.shutdown)
                        uart->port.shutdown = up->port.shutdown;
                if (up->port.pm)
                        uart->port.pm = up->port.pm;
                if (up->port.handle_break)
                        uart->port.handle_break = up->port.handle_break;
                if (up->dl_read)
                        uart->dl_read = up->dl_read;
                if (up->dl_write)
                        uart->dl_write = up->dl_write;

                /*
                 * The type is tested a second time: it can have been
                 * replaced by the UPF_FIXED_TYPE copy above.
                 */
                if (uart->port.type != PORT_8250_CIR) {
                        if (serial8250_isa_config != NULL)
                                serial8250_isa_config(0, &uart->port,
                                                &uart->capabilities);

                        serial8250_apply_quirks(uart);
                        ret = uart_add_one_port(&serial8250_reg,
                                                &uart->port);
                        if (ret)
                                goto err;

                        ret = uart->port.line;
                } else {
                        dev_info(uart->port.dev,
                                "skipping CIR port at 0x%lx / 0x%llx, IRQ %d\n",
                                uart->port.iobase,
                                (unsigned long long)uart->port.mapbase,
                                uart->port.irq);

                        ret = 0;
                }

                if (!uart->lsr_save_mask)
                        uart->lsr_save_mask = LSR_SAVE_FLAGS;        /* Use default LSR mask */

                /* Initialise interrupt backoff work if required */
                if (up->overrun_backoff_time_ms > 0) {
                        uart->overrun_backoff_time_ms =
                                up->overrun_backoff_time_ms;
                        INIT_DELAYED_WORK(&uart->overrun_backoff,
                                        serial_8250_overrun_backoff_work);
                } else {
                        uart->overrun_backoff_time_ms = 0;
                }
        }

unlock:
        mutex_unlock(&serial_mutex);

        return ret;

err:
        /* Leave the slot unbound so that it does not look like a live port. */
        uart->port.dev = NULL;
        mutex_unlock(&serial_mutex);
        return ret;
}
EXPORT_SYMBOL(serial8250_register_8250_port);

/**
 *        serial8250_unregister_port - remove a 16x50 serial port at runtime
 *        @line: serial line number
 *
 *        Remove one serial port.  This may not be called from interrupt
 *        context.  We hand the port back to the our control.
 */
void serial8250_unregister_port(int line)
{
        struct uart_8250_port *uart = &serial8250_ports[line];

        mutex_lock(&serial_mutex);

        if (uart->em485) {
                unsigned long flags;

                uart_port_lock_irqsave(&uart->port, &flags);
                serial8250_em485_destroy(uart);
                uart_port_unlock_irqrestore(&uart->port, flags);
        }

        uart_remove_one_port(&serial8250_reg, &uart->port);
        if (serial8250_isa_devs) {
                uart->port.flags &= ~UPF_BOOT_AUTOCONF;
                uart->port.type = PORT_UNKNOWN;
                uart->port.dev = &serial8250_isa_devs->dev;
                uart->port.port_id = line;
                uart->capabilities = 0;
                serial8250_init_port(uart);
                serial8250_apply_quirks(uart);
                uart_add_one_port(&serial8250_reg, &uart->port);
        } else {
                uart->port.dev = NULL;
        }
        mutex_unlock(&serial_mutex);
}
EXPORT_SYMBOL(serial8250_unregister_port);

/*
 * Module initialisation.  In order: initialise the ISA port table,
 * register the uart driver (or the sunserial minors on SPARC), initialise
 * PNP support, create and add the legacy "serial8250" platform device,
 * register the ports against it and finally register the platform driver.
 *
 * Returns 0 on success, -ENODEV when nr_uarts is zero, -ENOMEM when the
 * platform device cannot be allocated, otherwise the error of the step
 * that failed.  The labels at the end undo the completed steps in reverse
 * order.
 */
static int __init serial8250_init(void)
{
        int ret;

        if (nr_uarts == 0)
                return -ENODEV;

        serial8250_isa_init_ports();

        pr_info("Serial: 8250/16550 driver, %d ports, IRQ sharing %s\n",
                nr_uarts, str_enabled_disabled(share_irqs));

#ifdef CONFIG_SPARC
        ret = sunserial_register_minors(&serial8250_reg, UART_NR);
#else
        serial8250_reg.nr = UART_NR;
        ret = uart_register_driver(&serial8250_reg);
#endif
        if (ret)
                goto out;

        ret = serial8250_pnp_init();
        if (ret)
                goto unreg_uart_drv;

        serial8250_isa_devs = platform_device_alloc("serial8250",
                                                    PLAT8250_DEV_LEGACY);
        if (!serial8250_isa_devs) {
                ret = -ENOMEM;
                goto unreg_pnp;
        }

        ret = platform_device_add(serial8250_isa_devs);
        if (ret)
                goto put_dev;

        serial8250_register_ports(&serial8250_reg, &serial8250_isa_devs->dev);

        /* Success leaves through "out" with ret == 0. */
        ret = platform_driver_register(&serial8250_isa_driver);
        if (ret == 0)
                goto out;

        /* Error unwinding: falls through the labels below. */
        platform_device_del(serial8250_isa_devs);
put_dev:
        platform_device_put(serial8250_isa_devs);
unreg_pnp:
        serial8250_pnp_exit();
unreg_uart_drv:
#ifdef CONFIG_SPARC
        sunserial_unregister_minors(&serial8250_reg, UART_NR);
#else
        uart_unregister_driver(&serial8250_reg);
#endif
out:
        return ret;
}

/*
 * Module exit: unregister the platform driver and the legacy platform
 * device, shut down PNP support and unregister the uart driver (or the
 * sunserial minors on SPARC).
 */
static void __exit serial8250_exit(void)
{
        struct platform_device *legacy_dev = serial8250_isa_devs;

        /*
         * Clearing the global first tells serial8250_unregister_port()
         * not to re-register the ports (thereby making
         * serial8250_isa_driver permanently in use.)
         */
        serial8250_isa_devs = NULL;

        platform_driver_unregister(&serial8250_isa_driver);
        platform_device_unregister(legacy_dev);

        serial8250_pnp_exit();

#ifdef CONFIG_SPARC
        sunserial_unregister_minors(&serial8250_reg, UART_NR);
#else
        uart_unregister_driver(&serial8250_reg);
#endif
}

/* Module entry and exit points. */
module_init(serial8250_init);
module_exit(serial8250_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Generic 8250/16x50 serial driver");

/* Module parameters; each one is described by the MODULE_PARM_DESC next to it. */
module_param_hw(share_irqs, uint, other, 0644);
MODULE_PARM_DESC(share_irqs, "Share IRQs with other non-8250/16x50 devices (unsafe)");

module_param(nr_uarts, uint, 0644);
MODULE_PARM_DESC(nr_uarts, "Maximum number of UARTs supported. (1-" __MODULE_STRING(CONFIG_SERIAL_8250_NR_UARTS) ")");

module_param(skip_txen_test, uint, 0644);
MODULE_PARM_DESC(skip_txen_test, "Skip checking for the TXEN bug at init time");

/* The RSA probe list only exists when RSA support is configured. */
#ifdef CONFIG_SERIAL_8250_RSA
module_param_hw_array(probe_rsa, ulong, ioport, &probe_rsa_count, 0444);
MODULE_PARM_DESC(probe_rsa, "Probe I/O ports for RSA");
#endif
MODULE_ALIAS_CHARDEV_MAJOR(TTY_MAJOR);

#ifdef CONFIG_SERIAL_8250_DEPRECATED_OPTIONS
#ifndef MODULE
/* This module was renamed to 8250_core in 3.7.  Keep the old "8250" name
 * working as well for the module options so we don't break people.  We
 * need to keep the names identical and the convenient macros will happily
 * refuse to let us do that by failing the build with redefinition errors
 * of global variables.  So we stick them inside a dummy function to avoid
 * those conflicts.  The options still get parsed, and the redefined
 * MODULE_PARAM_PREFIX lets us keep the "8250." syntax alive.
 *
 * This is hacky.  I'm sorry.
 */
/*
 * Dummy container for a second set of parameter declarations: the same
 * variables (share_irqs, nr_uarts, skip_txen_test and, with RSA support,
 * probe_rsa) are declared once more under the prefix defined below.
 * NOTE(review): the comment above talks about keeping the "8250." syntax
 * alive, while the prefix defined here is "8250_core." - confirm which of
 * the two is out of date.
 */
static void __used s8250_options(void)
{
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "8250_core."

        module_param_cb(share_irqs, &param_ops_uint, &share_irqs, 0644);
        module_param_cb(nr_uarts, &param_ops_uint, &nr_uarts, 0644);
        module_param_cb(skip_txen_test, &param_ops_uint, &skip_txen_test, 0644);
#ifdef CONFIG_SERIAL_8250_RSA
        /* Array parameter: spelled out by hand, reusing the existing __param_arr_probe_rsa. */
        __module_param_call(MODULE_PARAM_PREFIX, probe_rsa,
                &param_array_ops, .arr = &__param_arr_probe_rsa,
                0444, -1, 0);
#endif
}
#else
MODULE_ALIAS("8250_core");
#endif
#endif






































































































































































































































































































































































































































































    1 






















































































    5 



    5 


















































































































































































































































    5 








    6 

    6 












    6 





























    6 



    7 


































































































































































































































    2 







































    2 





































    7 



    6 
    6 


    7 



























    1 













    1 






    1 



    1 




    1 




























































    1 





























    1 










    1 












    1 





    1 








    1 




    1 





    1 




































    1 







    1 










    1 


    1 




    1 








    1 



    1 
    1 
    1 







































    1 


















    1 

    1 












    1 





















    1 












    1 




    1 


















    1 











    6 

























    6 




















    6 




































    6 













    6 



























    6 




























    6 

    5 

    6 




    5 




    5 

    5 

    5 













    3 







    3 













    2 



    1 





    2 












    5 


    1 




    5 







    2 
















































    1 
    2 
    2 
    1 
    1 




    2 












    2 





















    2 
    2 

    2 



























    1 
    1 


    1 

    2 








    1 
    2 


    2 
    1 



    1 
    2 









    5 






    5 



    1 








    5 



    5 

    5 
    2 




    2 
    1 




    2 



    5 















    5 


















    5 
    3 




    5 

    3 

    4 



    5 


    5 


    5 
    5 




    5 








    5 
    5 





    5 






    5 











    5 










    5 






























    4 
    2 









    4 
    2 











    4 




    4 








    1 


    4 
    4 







































    4 














    4 








    4 









    1 

    4 







    4 





    1 










    1 






    1 











    6 









    6 








    2 







    2 
    1 












    1 



    1 























    1 








    1 
    1 


















    1 

    1 







    1 














    4 












    4 

















    1 










    4 
























    4 





    4 





















































    5 

    6 




    1 










    4 




    2 










    1 





    4 



    1 


































    6 

    6 
    1 










    6 

    1 

    6 





    5 













    1 
    1 

    1 







    6 





    6 

    1 










    6 





    6 











    6 




    1 
    1 
























    6 



    6 
    3 
    6 




    6 









    5 













    4 
    1 





    1 



















    5 






    5 

    2 













    6 










    2 
    2 














    6 
    2 


    2 
    4 








    6 
    2 









    5 














    1 

    4 









    4 









    6 


    5 


    2 


    6 






    1 




















    5 









    1 
    6 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    6 












    6 





















    5 



    6 
    6 






    6 









    5 



    3 








    6 

    1 
    5 
    5 



    4 

    3 









    6 





    6 










    5 




















    5 
    5 
























    5 









    1 

    5 







































































































    1 




    1 



    1 





































    2 





    1 




    1 













































    1 







    1 




















    1 
    1 


    1 

















    1 












    5 









    4 












    2 







    2 
    2 
    2 




    2 









    1 

    1 

    1 



    1 





















    2 
















    1 
    2 


    2 











    2 

    2 































    2 



    2 








    1 
    2 

    2 













    6 























    6 

    2 













































    2 









    2 







    2 


    2 




































































    2 

























    5 









    4 











    3 





















    2 
























































    2 





    2 










    2 








































    2 











    2 




    1 
    2 
    2 










    2 

    2 














    1 



















    1 








































    5 























    2 









    1 


























    1 
    1 






    1 














    2 























    2 














    2 




    1 


























    1 




    1 







    1 



    2 









    2 












    2 


    2 






























































    2 













    1 











    1 
    1 


    1 










    1 
    1 




    1 
















































    1 












    1 





























































































    2 










    1 







    2 





    2 



















    1 


    1 
























    2 




















    1 
    1 






    1 
    1 











    6 





    5 




    5 






    4 































































































    4 







    3 

    3 


    3 
    1 

































    6 

















    6 
    6 


















    6 


















































































































































    5 
    2 






















    2 

    3 
    5 
    3 
    5 
    5 
    2 
    6 









    1 

    1 


    1 






    1 









    1 




    1 




    1 
    1 





















































































    6 













    6 




    6 


    2 




    1 

















    5 
    5 






    5 
















    2 



    6 

















    3 

    4 







    5 



    1 










    5 





    6 


    6 
    1 
    5 
    1 





    6 













    1 







    1 














    2 

















    1 








    2 



    1 

    1 















    2 






    1 






    1 













































    3 




















    3 






    3 
    3 







    3 














    3 












    2 
    1 













    2 

















    1 

    1 












    3 
















    3 














    3 






    3 













    3 








    3 

    3 














    3 















    3 











    2 



    1 


    1 




    3 











































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 */


/*
 * mballoc.c contains the multiblocks allocation routines
 */

#include "ext4_jbd2.h"
#include "mballoc.h"
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>
#include <trace/events/ext4.h>
#include <kunit/static_stub.h>

/*
 * MUSTDO:
 *   - test ext4_ext_search_left() and ext4_ext_search_right()
 *   - search for metadata in few groups
 *
 * TODO v4:
 *   - normalization should take into account whether file is still open
 *   - discard preallocations if no free space left (policy?)
 *   - don't normalize tails
 *   - quota
 *   - reservation for superuser
 *
 * TODO v3:
 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
 *   - track min/max extents in each group for better group selection
 *   - mb_mark_used() may allocate chunk right after splitting buddy
 *   - tree of groups sorted by number of free blocks
 *   - error handling
 */

/*
 * An allocation request asks for multiple blocks near the specified
 * goal (block) value.
 *
 * During initialization phase of the allocator we decide to use the
 * group preallocation or inode preallocation depending on the size of
 * the file. The size of the file could be the resulting file size we
 * would have after allocation, or the current file size, whichever
 * is larger. If the size is less than sbi->s_mb_stream_request we
 * select to use the group preallocation. The default value of
 * s_mb_stream_request is 16 blocks. This can also be tuned via
 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
 * terms of number of blocks.
 *
 * The main motivation for having small file use group preallocation is to
 * ensure that we have small files closer together on the disk.
 *
 * First stage the allocator looks at the inode prealloc list,
 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
 * spaces for this particular inode. The inode prealloc space is
 * represented as:
 *
 * pa_lstart -> the logical start block for this prealloc space
 * pa_pstart -> the physical start block for this prealloc space
 * pa_len    -> length for this prealloc space (in clusters)
 * pa_free   ->  free space available in this prealloc space (in clusters)
 *
 * The inode preallocation space is used looking at the _logical_ start
 * block. If only the logical file block falls within the range of prealloc
 * space we will consume the particular prealloc space. This makes sure that
 * we have contiguous physical blocks representing the file blocks
 *
 * The important thing to be noted in case of inode prealloc space is that
 * we don't modify the values associated to inode prealloc space except
 * pa_free.
 *
 * If we are not able to find blocks in the inode prealloc space and if we
 * have the group allocation flag set then we look at the locality group
 * prealloc space. These are per CPU prealloc list represented as
 *
 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 *
 * The reason for having a per cpu locality group is to reduce the contention
 * between CPUs. It is possible to get scheduled at this point.
 *
 * The locality group prealloc space is used looking at whether we have
 * enough free space (pa_free) within the prealloc space.
 *
 * If we can't allocate blocks via inode prealloc or/and locality group
 * prealloc then we look at the buddy cache. The buddy cache is represented
 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
 * mapped to the buddy and bitmap information regarding different
 * groups. The buddy information is attached to buddy cache inode so that
 * we can access them through the page cache. The information regarding
 * each group is loaded via ext4_mb_load_buddy.  The information involves
 * block bitmap and buddy information. The information is stored in the
 * inode as:
 *
 *  {                        page                        }
 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.  So for each group we
 * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
 * blocksize) blocks.  So it can have information regarding groups_per_page
 * which is blocks_per_page/2
 *
 * The buddy cache inode is not stored on disk. The inode is thrown
 * away when the filesystem is unmounted.
 *
 * We look for count number of blocks in the buddy cache. If we were able
 * to locate that many free blocks we return with additional information
 * regarding rest of the contiguous physical block available
 *
 * Before allocating blocks via buddy cache we normalize the request
 * blocks. This ensures we ask for more blocks than we needed. The extra
 * blocks that we get after allocation are added to the respective prealloc
 * list. In case of inode preallocation we follow a list of heuristics
 * based on file size. This can be found in ext4_mb_normalize_request. If
 * we are doing a group prealloc we try to normalize the request to
 * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
 * dependent on the cluster size; for non-bigalloc file systems, it is
 * 512 blocks. This can be tuned via
 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
 * terms of number of blocks. If we have mounted the file system with -O
 * stripe=<value> option the group prealloc request is normalized to the
 * smallest multiple of the stripe value (sbi->s_stripe) which is
 * greater than the default mb_group_prealloc.
 *
 * If "mb_optimize_scan" mount option is set, we maintain in memory group info
 * structures in two data structures:
 *
 * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
 *
 *    Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
 *
 *    This is an array of lists where the index in the array represents the
 *    largest free order in the buddy bitmap of the participating group infos of
 *    that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total
 *    number of buddy bitmap orders possible) number of lists. Group-infos are
 *    placed in appropriate lists.
 *
 * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
 *
 *    Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
 *
 *    This is an array of lists where in the i-th list there are groups with
 *    average fragment size >= 2^i and < 2^(i+1). The average fragment size
 *    is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
 *    Note that we don't bother with a special list for completely empty groups
 *    so we only have MB_NUM_ORDERS(sb) lists.
 *
 * When "mb_optimize_scan" mount option is set, mballoc consults the above data
 * structures to decide the order in which groups are to be traversed for
 * fulfilling an allocation request.
 *
 * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order
 * >= the order of the request. We directly look at the largest free order list
 * in the data structure (1) above where largest_free_order = order of the
 * request. If that list is empty, we look at remaining list in the increasing
 * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED
 * lookup in O(1) time.
 *
 * At CR_GOAL_LEN_FAST, we only consider groups where
 * average fragment size > request size. So, we lookup a group which has average
 * fragment size just above or equal to request size using our average fragment
 * size group lists (data structure 2) in O(1) time.
 *
 * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied
 * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in
 * CR_GOAL_LEN_FAST suggests that there is no BG that has avg
 * fragment size > goal length. So before falling to the slower
 * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and
 * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big
 * enough average fragment size. This increases the chances of finding a
 * suitable block group in O(1) time and results in faster allocation at the
 * cost of reduced size of allocation.
 *
 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
 * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and
 * CR_GOAL_LEN_FAST phase.
 *
 * The regular allocator (using the buddy cache) supports a few tunables.
 *
 * /sys/fs/ext4/<partition>/mb_min_to_scan
 * /sys/fs/ext4/<partition>/mb_max_to_scan
 * /sys/fs/ext4/<partition>/mb_order2_req
 * /sys/fs/ext4/<partition>/mb_linear_limit
 *
 * The regular allocator uses buddy scan only if the request len is power of
 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 * value of s_mb_order2_reqs can be tuned via
 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
 * stripe size (sbi->s_stripe), we try to search for contiguous block in
 * stripe size. This should result in better allocation on RAID setups. If
 * not, we search in the specific group using bitmap for best extents. The
 * tunable min_to_scan and max_to_scan control the behaviour here.
 * min_to_scan indicate how long the mballoc __must__ look for a best
 * extent and max_to_scan indicates how long the mballoc __can__ look for a
 * best extent in the found extents. Searching for the blocks starts with
 * the group specified as the goal value in allocation context via
 * ac_g_ex. Each group is first checked based on the criteria whether it
 * can be used for allocation. ext4_mb_good_group explains how the groups are
 * checked.
 *
 * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
 * get traversed linearly. That may result in subsequent allocations being not
 * close to each other. And so, the underlying device may get filled up in a
 * non-linear fashion. While that may not matter on non-rotational devices, for
 * rotational devices that may result in higher seek times. "mb_linear_limit"
 * tells mballoc how many groups mballoc should search linearly before
 * consulting the above data structures for more efficient lookups. For
 * non rotational devices, this value defaults to 0 and for rotational devices
 * this is set to MB_DEFAULT_LINEAR_LIMIT.
 *
 * Both the prealloc space are getting populated as above. So for the first
 * request we will hit the buddy cache which will result in this prealloc
 * space getting filled. The prealloc space is then later used for the
 * subsequent request.
 */

/*
 * mballoc operates on the following data:
 *  - on-disk bitmap
 *  - in-core buddy (actually includes buddy and bitmap)
 *  - preallocation descriptors (PAs)
 *
 * there are two types of preallocations:
 *  - inode
 *    assigned to specific inode and can be used for this inode only.
 *    it describes part of inode's space preallocated to specific
 *    physical blocks. any block from that preallocated can be used
 *    independent. the descriptor just tracks number of blocks left
 *    unused. so, before taking some block from descriptor, one must
 *    make sure corresponded logical block isn't allocated yet. this
 *    also means that freeing any block within descriptor's range
 *    must discard all preallocated blocks.
 *  - locality group
 *    assigned to specific locality group which does not translate to
 *    permanent set of inodes: inode can join and leave group. space
 *    from this type of preallocation can be used for any inode. thus
 *    it's consumed from the beginning to the end.
 *
 * relation between them can be expressed as:
 *    in-core buddy = on-disk bitmap + preallocation descriptors
 *
 * this means the blocks mballoc considers used are:
 *  - allocated blocks (persistent)
 *  - preallocated blocks (non-persistent)
 *
 * consistency in mballoc world means that at any time a block is either
 * free or used in ALL structures. notice: "any time" should not be read
 * literally -- time is discrete and delimited by locks.
 *
 *  to keep it simple, we don't use block numbers, instead we count number of
 *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
 *
 * all operations can be expressed as:
 *  - init buddy:                        buddy = on-disk + PAs
 *  - new PA:                                buddy += N; PA = N
 *  - use inode PA:                        on-disk += N; PA -= N
 *  - discard inode PA                        buddy -= on-disk - PA; PA = 0
 *  - use locality group PA                on-disk += N; PA -= N
 *  - discard locality group PA                buddy -= PA; PA = 0
 *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
 *        is used in real operation because we can't know actual used
 *        bits from PA, only from on-disk bitmap
 *
 * if we follow this strict logic, then all operations above should be atomic.
 * given some of them can block, we'd have to use something like semaphores
 * killing performance on high-end SMP hardware. let's try to relax it using
 * the following knowledge:
 *  1) if buddy is referenced, it's already initialized
 *  2) while block is used in buddy and the buddy is referenced,
 *     nobody can re-allocate that block
 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
 *     bit set and PA claims same block, it's OK. IOW, one can set bit in
 *     on-disk bitmap if buddy has same bit set or/and PA covers corresponded
 *     block
 *
 * so, now we're building a concurrency table:
 *  - init buddy vs.
 *    - new PA
 *      blocks for PA are allocated in the buddy, buddy must be referenced
 *      until PA is linked to allocation group to avoid concurrent buddy init
 *    - use inode PA
 *      we need to make sure that either on-disk bitmap or PA has uptodate data
 *      given (3) we care that PA-=N operation doesn't interfere with init
 *    - discard inode PA
 *      the simplest way would be to have buddy initialized by the discard
 *    - use locality group PA
 *      again PA-=N must be serialized with init
 *    - discard locality group PA
 *      the simplest way would be to have buddy initialized by the discard
 *  - new PA vs.
 *    - use inode PA
 *      i_data_sem serializes them
 *    - discard inode PA
 *      discard process must wait until PA isn't used by another process
 *    - use locality group PA
 *      some mutex should serialize them
 *    - discard locality group PA
 *      discard process must wait until PA isn't used by another process
 *  - use inode PA
 *    - use inode PA
 *      i_data_sem or another mutex should serialize them
 *    - discard inode PA
 *      discard process must wait until PA isn't used by another process
 *    - use locality group PA
 *      nothing wrong here -- they're different PAs covering different blocks
 *    - discard locality group PA
 *      discard process must wait until PA isn't used by another process
 *
 * now we're ready to draw a few conclusions:
 *  - PA is referenced and while it is no discard is possible
 *  - PA is referenced until block isn't marked in on-disk bitmap
 *  - PA changes only after on-disk bitmap
 *  - discard must not compete with init. either init is done before
 *    any discard or they're serialized somehow
 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
 *
 * a special case when we've used PA to emptiness. no need to modify buddy
 * in this case, but we should care about concurrent init
 *
 */

 /*
 * Logic in few words:
 *
 *  - allocation:
 *    load group
 *    find blocks
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - use preallocation:
 *    find proper PA (per-inode or group)
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *    release PA
 *
 *  - free:
 *    load group
 *    mark bits in on-disk bitmap
 *    release group
 *
 *  - discard preallocations in group:
 *    mark PAs deleted
 *    move them onto local list
 *    load on-disk bitmap
 *    load group
 *    remove PA from object (inode or locality group)
 *    mark free blocks in-core
 *
 *  - discard inode's preallocations:
 */

/*
 * Locking rules
 *
 * Locks:
 *  - bitlock on a group        (group)
 *  - object (inode/locality)        (object)
 *  - per-pa lock                (pa)
 *  - cr_power2_aligned lists lock        (cr_power2_aligned)
 *  - cr_goal_len_fast lists lock        (cr_goal_len_fast)
 *
 * Paths:
 *  - new pa
 *    object
 *    group
 *
 *  - find and use pa:
 *    pa
 *
 *  - release consumed pa:
 *    pa
 *    group
 *    object
 *
 *  - generate in-core bitmap:
 *    group
 *        pa
 *
 *  - discard all for given object (inode, locality group):
 *    object
 *        pa
 *    group
 *
 *  - discard all for given group:
 *    group
 *        pa
 *    group
 *        object
 *
 *  - allocation path (ext4_mb_regular_allocator)
 *    group
 *    cr_power2_aligned/cr_goal_len_fast
 */
/*
 * Slab caches private to this file.  Where they are created and which
 * objects they hold is not visible in this part of the file; going by the
 * names they presumably back preallocation spaces, allocation contexts and
 * free-data records -- TODO confirm against the init code.
 */
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
static struct kmem_cache *ext4_free_data_cachep;

/* We create slab caches for groupinfo data structures based on the
 * superblock block size.  There will be one per mounted filesystem for
 * each unique s_blocksize_bits */
#define NR_GRPINFO_CACHES 8
static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];

/*
 * Names for the groupinfo slab caches.  The table has the same
 * NR_GRPINFO_CACHES length as ext4_groupinfo_caches[]; presumably entry i
 * names cache i -- verify against the code that creates the caches.
 */
static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
        "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
        "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
        "ext4_groupinfo_64k", "ext4_groupinfo_128k"
};

/*
 * Forward declarations of static helpers that are referenced before their
 * definitions.  The definitions are not in this part of the file.
 */
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);

static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
                               ext4_group_t group, enum criteria cr);

static int ext4_try_to_trim_range(struct super_block *sb,
                struct ext4_buddy *e4b, ext4_grpblk_t start,
                ext4_grpblk_t max, ext4_grpblk_t minblocks);

/*
 * The algorithm using this percpu seq counter goes below:
 * 1. We sample the percpu discard_pa_seq counter before trying for block
 *    allocation in ext4_mb_new_blocks().
 * 2. We increment this percpu discard_pa_seq counter when we either allocate
 *    or free these blocks i.e. while marking those blocks as used/free in
 *    mb_mark_used()/mb_free_blocks().
 * 3. We also increment this percpu seq counter when we successfully identify
 *    that the bb_prealloc_list is not empty and hence proceed for discarding
 *    of those PAs inside ext4_mb_discard_group_preallocations().
 *
 * Now to make sure that the regular fast path of block allocation is not
 * affected, as a small optimization we only sample the percpu seq counter
 * on that cpu. Only when the block allocation fails and when freed blocks
 * found were 0, that is when we sample percpu seq counter for all cpus using
 * below function ext4_get_discard_pa_seq_sum(). This happens after making
 * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
 */
static DEFINE_PER_CPU(u64, discard_pa_seq);
static inline u64 ext4_get_discard_pa_seq_sum(void)
{
        int __cpu;
        u64 __seq = 0;

        for_each_possible_cpu(__cpu)
                __seq += per_cpu(discard_pa_seq, __cpu);
        return __seq;
}

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
#if BITS_PER_LONG == 64
        *bit += ((unsigned long) addr & 7UL) << 3;
        addr = (void *) ((unsigned long) addr & ~7UL);
#elif BITS_PER_LONG == 32
        *bit += ((unsigned long) addr & 3UL) << 3;
        addr = (void *) ((unsigned long) addr & ~3UL);
#else
#error "how many bits you are?!"
#endif
        return addr;
}

/*
 * Test bit @bit of the bitmap at @addr, which need not be unsigned-long
 * aligned.  Returns whatever ext4_test_bit() returns for the aligned
 * address/bit pair.
 */
static inline int mb_test_bit(int bit, void *addr)
{
        /*
         * On architectures such as powerpc, ext4_test_bit() needs an
         * unsigned-long aligned address, so align before calling it.
         */
        void *aligned = mb_correct_addr_and_bit(&bit, addr);

        return ext4_test_bit(bit, aligned);
}

/*
 * Set bit @bit of the bitmap at @addr; the address is aligned with
 * mb_correct_addr_and_bit() before ext4_set_bit() is called.
 */
static inline void mb_set_bit(int bit, void *addr)
{
        void *aligned = mb_correct_addr_and_bit(&bit, addr);

        ext4_set_bit(bit, aligned);
}

/*
 * Clear bit @bit of the bitmap at @addr; the address is aligned with
 * mb_correct_addr_and_bit() before ext4_clear_bit() is called.
 */
static inline void mb_clear_bit(int bit, void *addr)
{
        void *aligned = mb_correct_addr_and_bit(&bit, addr);

        ext4_clear_bit(bit, aligned);
}

/*
 * Clear bit @bit of the bitmap at @addr and return the value reported by
 * ext4_test_and_clear_bit(); the address is aligned first with
 * mb_correct_addr_and_bit().
 */
static inline int mb_test_and_clear_bit(int bit, void *addr)
{
        void *aligned = mb_correct_addr_and_bit(&bit, addr);

        return ext4_test_and_clear_bit(bit, aligned);
}

/*
 * Wrapper around ext4_find_next_zero_bit() for a bitmap whose address may
 * not be unsigned-long aligned.  The address is aligned, @max and @start
 * are shifted by the same number of bits, and the result is shifted back.
 *
 * Returns the index relative to @addr; a result above @max is clamped to
 * @max.
 */
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
        int shift = 0;
        int found;
        void *aligned = mb_correct_addr_and_bit(&shift, addr);

        found = ext4_find_next_zero_bit(aligned, max + shift,
                                        start + shift) - shift;

        return found > max ? max : found;
}

/*
 * Wrapper around ext4_find_next_bit() for a bitmap whose address may not
 * be unsigned-long aligned.  The address is aligned, @max and @start are
 * shifted by the same number of bits, and the result is shifted back.
 *
 * Returns the index relative to @addr; a result above @max is clamped to
 * @max.
 */
static inline int mb_find_next_bit(void *addr, int max, int start)
{
        int shift = 0;
        int found;
        void *aligned = mb_correct_addr_and_bit(&shift, addr);

        found = ext4_find_next_bit(aligned, max + shift,
                                   start + shift) - shift;

        return found > max ? max : found;
}

/*
 * Return the bitmap that describes @order for the group loaded in @e4b and
 * store the number of bits it holds in *@max.
 *
 *  - @order above bd_blkbits + 1: returns NULL with *@max = 0
 *  - @order 0: returns bd_bitmap, one bit per block
 *  - otherwise: returns bd_buddy plus the per-order offset from
 *    s_mb_offsets[], with *@max taken from s_mb_maxs[]
 *
 * @max must not be NULL, and bd_bitmap must differ from bd_buddy.
 */
static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
        void *map;

        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
        BUG_ON(max == NULL);

        if (order > e4b->bd_blkbits + 1) {
                /* order out of range: report an empty map */
                map = NULL;
                *max = 0;
        } else if (order == 0) {
                /* at order 0 we see each particular block */
                map = e4b->bd_bitmap;
                *max = 1 << (e4b->bd_blkbits + 3);
        } else {
                map = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
                *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
        }

        return map;
}

#ifdef DOUBLE_CHECK
/*
 * DOUBLE_CHECK helper: clear @count bits starting at @first in the group's
 * shadow bitmap (bd_info->bb_bitmap).  A bit that is already clear is
 * reported as a double free: the group's block bitmap is marked corrupted
 * and an error is logged for the affected block.  The bit is cleared in
 * either case.
 *
 * Does nothing if the shadow bitmap is NULL.  The group lock must be
 * held (asserted below).  @inode may be NULL; inode number 0 is then
 * reported.
 */
static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
                           int first, int count)
{
        struct super_block *sb = e4b->bd_sb;
        int n;

        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));

        for (n = 0; n < count; n++) {
                int bit = first + n;

                if (!mb_test_bit(bit, e4b->bd_info->bb_bitmap)) {
                        ext4_fsblk_t blocknr;

                        /* absolute block number, for the error report */
                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
                        blocknr += EXT4_C2B(EXT4_SB(sb), bit);
                        ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                        ext4_grp_locked_error(sb, e4b->bd_group,
                                              inode ? inode->i_ino : 0,
                                              blocknr,
                                              "freeing block already freed "
                                              "(bit %u)",
                                              bit);
                }
                mb_clear_bit(bit, e4b->bd_info->bb_bitmap);
        }
}

/*
 * DOUBLE_CHECK helper: set @count bits starting at @first in the group's
 * shadow bitmap (bd_info->bb_bitmap).  BUGs if any of those bits is
 * already set.
 *
 * Does nothing if the shadow bitmap is NULL.  The group lock must be
 * held (asserted below).
 */
static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
{
        int n;

        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));

        for (n = 0; n < count; n++) {
                int bit = first + n;

                BUG_ON(mb_test_bit(bit, e4b->bd_info->bb_bitmap));
                mb_set_bit(bit, e4b->bd_info->bb_bitmap);
        }
}

/*
 * DOUBLE_CHECK helper: compare the group's shadow bitmap
 * (bd_info->bb_bitmap) with @bitmap over one filesystem block
 * (s_blocksize bytes).  On the first differing byte, log both values and
 * BUG.
 *
 * Does nothing if the shadow bitmap is NULL.
 */
static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
        unsigned char *copy, *other;
        int i;

        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
        /* fast path: the two bitmaps are identical */
        if (!memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize))
                return;

        copy = (unsigned char *) e4b->bd_info->bb_bitmap;
        other = (unsigned char *) bitmap;
        for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
                if (copy[i] == other[i])
                        continue;
                ext4_msg(e4b->bd_sb, KERN_ERR,
                         "corruption in group %u "
                         "at byte %u(%u): %x in copy != %x "
                         "on disk/prealloc",
                         e4b->bd_group, i, i * 8, copy[i], other[i]);
                BUG();
        }
}

/*
 * DOUBLE_CHECK helper: allocate grp->bb_bitmap (s_blocksize bytes,
 * GFP_NOFS) and fill it with a copy of the block bitmap read for @group.
 *
 * Nothing is returned.  If the allocation or the bitmap read fails,
 * grp->bb_bitmap ends up NULL.
 */
static void mb_group_bb_bitmap_alloc(struct super_block *sb,
                        struct ext4_group_info *grp, ext4_group_t group)
{
        struct buffer_head *bh;

        grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
        if (!grp->bb_bitmap)
                return;

        bh = ext4_read_block_bitmap(sb, group);
        if (!IS_ERR_OR_NULL(bh)) {
                memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
                put_bh(bh);
                return;
        }

        /* could not read the bitmap: drop the shadow copy */
        kfree(grp->bb_bitmap);
        grp->bb_bitmap = NULL;
}

/*
 * DOUBLE_CHECK helper: free the shadow bitmap attached to @grp.  The
 * bb_bitmap pointer itself is not reset here.
 */
static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
{
        kfree(grp->bb_bitmap);
}

#else
/* No-op variant used when DOUBLE_CHECK is not defined. */
static inline void mb_free_blocks_double(struct inode *inode,
                                struct ext4_buddy *e4b, int first, int count)
{
}
/* No-op variant used when DOUBLE_CHECK is not defined. */
static inline void mb_mark_used_double(struct ext4_buddy *e4b,
                                                int first, int count)
{
}
/* No-op variant used when DOUBLE_CHECK is not defined. */
static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
}

/* No-op variant used when DOUBLE_CHECK is not defined. */
static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
                        struct ext4_group_info *grp, ext4_group_t group)
{
}

/* No-op variant used when DOUBLE_CHECK is not defined. */
static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
{
}
#endif

#ifdef AGGRESSIVE_CHECK

/*
 * Assertion used by __mb_check_buddy(): if @assert evaluates to false,
 * print the failing expression together with the location and then BUG().
 *
 * The macro does not take the location as arguments; it expects the
 * identifiers `function`, `file` and `line` to be in scope where it is
 * expanded.
 */
#define MB_CHECK_ASSERT(assert)                                                \
do {                                                                        \
        if (!(assert)) {                                                \
                printk(KERN_EMERG                                        \
                        "Assertion failure in %s() at %s:%d: \"%s\"\n",        \
                        function, file, line, # assert);                \
                BUG();                                                        \
        }                                                                \
} while (0)

/*
 * AGGRESSIVE_CHECK consistency checker for the buddy data loaded in @e4b.
 * Any violated invariant goes through MB_CHECK_ASSERT(), i.e. it is printed
 * with @function/@file/@line (the caller's location) and then BUG()s.
 *
 * The full check is only run when the pre-increment value of
 * bb_check_counter is a multiple of 10; other calls return immediately.
 *
 * Checks performed:
 *  - for each order from bd_blkbits + 1 down to 2, the order map against
 *    the map one order below it, the block bitmap, and bb_counters[order];
 *  - the order-0 map against bb_first_free, bb_fragments and the
 *    higher-order maps;
 *  - every preallocation on the group's bb_prealloc_list lies in this group
 *    and has all of its bits set in the order-0 map.
 */
static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
                                const char *function, int line)
{
        struct super_block *sb = e4b->bd_sb;
        int order = e4b->bd_blkbits + 1;
        int max;
        int max2;
        int i;
        int j;
        int k;
        int count;
        struct ext4_group_info *grp;
        int fragments = 0;
        int fstart;
        struct list_head *cur;
        void *buddy;
        void *buddy2;

        /* rate-limit: skip unless the counter was a multiple of 10 */
        if (e4b->bd_info->bb_check_counter++ % 10)
                return;

        /* pass 1: compare each order map with the map one order below */
        while (order > 1) {
                buddy = mb_find_buddy(e4b, order, &max);
                MB_CHECK_ASSERT(buddy);
                buddy2 = mb_find_buddy(e4b, order - 1, &max2);
                MB_CHECK_ASSERT(buddy2);
                MB_CHECK_ASSERT(buddy != buddy2);
                /* the lower order must hold exactly twice as many bits */
                MB_CHECK_ASSERT(max * 2 == max2);

                /* count = number of clear bits found at this order */
                count = 0;
                for (i = 0; i < max; i++) {

                        if (mb_test_bit(i, buddy)) {
                                /* only single bit in buddy2 may be 0 */
                                if (!mb_test_bit(i << 1, buddy2)) {
                                        MB_CHECK_ASSERT(
                                                mb_test_bit((i<<1)+1, buddy2));
                                }
                                continue;
                        }

                        /* both bits in buddy2 must be 1 */
                        MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
                        MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));

                        /*
                         * bit i is clear at this order, so every one of the
                         * 1 << order bits it covers in the block bitmap
                         * must be clear as well
                         */
                        for (j = 0; j < (1 << order); j++) {
                                k = (i * (1 << order)) + j;
                                MB_CHECK_ASSERT(
                                        !mb_test_bit(k, e4b->bd_bitmap));
                        }
                        count++;
                }
                MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
                order--;
        }

        /*
         * pass 2: walk the order-0 map.  fstart is the start of the current
         * run of clear bits (-1 when not inside a run); each new run counts
         * as one fragment.
         */
        fstart = -1;
        buddy = mb_find_buddy(e4b, 0, &max);
        for (i = 0; i < max; i++) {
                if (!mb_test_bit(i, buddy)) {
                        /* no clear bit may precede bb_first_free */
                        MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
                        if (fstart == -1) {
                                fragments++;
                                fstart = i;
                        }
                        continue;
                }
                fstart = -1;
                /* check used bits only */
                for (j = 0; j < e4b->bd_blkbits + 1; j++) {
                        buddy2 = mb_find_buddy(e4b, j, &max2);
                        k = i >> j;
                        MB_CHECK_ASSERT(k < max2);
                        MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
                }
        }
        MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
        MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);

        /*
         * pass 3: every preallocation linked to the group must start in
         * this group and have all pa_len bits set in the order-0 map
         * (buddy still points at it from pass 2)
         */
        grp = ext4_get_group_info(sb, e4b->bd_group);
        if (!grp)
                return;
        list_for_each(cur, &grp->bb_prealloc_list) {
                ext4_group_t groupnr;
                struct ext4_prealloc_space *pa;
                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
                MB_CHECK_ASSERT(groupnr == e4b->bd_group);
                for (i = 0; i < pa->pa_len; i++)
                        MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
        }
}
#undef MB_CHECK_ASSERT
#define mb_check_buddy(e4b) __mb_check_buddy(e4b,        \
                                        __FILE__, __func__, __LINE__)
#else
#define mb_check_buddy(e4b)
#endif

/*
 * Split the free range that starts at @first and is @len long into
 * power-of-two sized chunks, each aligned to its own size.
 * For every chunk bb_counters[] of the chunk's order is incremented and,
 * for chunks larger than one block, the chunk's bit is cleared in the
 * buddy bitmap of that order.
 */
static void ext4_mb_mark_free_simple(struct super_block *sb,
                                void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
                                        struct ext4_group_info *grp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_grpblk_t align_order;
        ext4_grpblk_t order;
        ext4_grpblk_t span;
        unsigned int border;

        BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));

        /* extra bit OR-ed into @first so ffs() below has an upper bound */
        border = 2 << sb->s_blocksize_bits;

        for (; len > 0; len -= span, first += span) {
                /* largest order the alignment of the current position allows */
                align_order = ffs(first | border) - 1;

                /* largest order that still fits into the remaining length */
                order = fls(len) - 1;
                if (order > align_order)
                        order = align_order;
                span = 1 << order;

                grp->bb_counters[order]++;
                /* only multi-block chunks are marked in the buddy bitmap */
                if (order > 0)
                        mb_clear_bit(first >> order,
                                     buddy + sbi->s_mb_offsets[order]);
        }
}

/*
 * Map an average fragment length @len to the index of the average fragment
 * size list it belongs to. The result is always below MB_NUM_ORDERS(sb).
 */
static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len)
{
        int order = fls(len) - 2;

        /*
         * Lengths of 0 and 1 get no list of their own; they are folded
         * into order 0.
         */
        if (order < 0)
                return 0;

        /* one past the last list: clamp to the last list */
        if (order == MB_NUM_ORDERS(sb))
                return order - 1;

        /* anything larger is unexpected: warn once and clamp */
        if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb)))
                return MB_NUM_ORDERS(sb) - 1;

        return order;
}

/*
 * Move group to appropriate avg_fragment_size list.
 *
 * The target list is derived from bb_free / bb_fragments. A group without
 * any free fragment (bb_fragments == 0) belongs to no list at all: it is
 * unlinked from the list it was on and its cached order is set to -1, so
 * that scans of the average fragment size lists do not keep visiting
 * groups that have nothing left to allocate.
 *
 * Does nothing when the MB_OPTIMIZE_SCAN mount option is not set.
 */
static void
mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int old_order, new_order;

        if (!test_opt2(sb, MB_OPTIMIZE_SCAN))
                return;

        old_order = grp->bb_avg_fragment_size_order;
        /* bb_fragments == 0 also guards the division below */
        if (grp->bb_fragments == 0)
                new_order = -1;
        else
                new_order = mb_avg_fragment_size_order(sb,
                                        grp->bb_free / grp->bb_fragments);
        if (new_order == old_order)
                return;

        /* -1 means "not on any list", so there is nothing to unlink */
        if (old_order >= 0) {
                write_lock(&sbi->s_mb_avg_fragment_size_locks[old_order]);
                list_del(&grp->bb_avg_fragment_size_node);
                write_unlock(&sbi->s_mb_avg_fragment_size_locks[old_order]);
        }

        grp->bb_avg_fragment_size_order = new_order;
        if (new_order >= 0) {
                write_lock(&sbi->s_mb_avg_fragment_size_locks[new_order]);
                list_add_tail(&grp->bb_avg_fragment_size_node,
                              &sbi->s_mb_avg_fragment_size[new_order]);
                write_unlock(&sbi->s_mb_avg_fragment_size_locks[new_order]);
        }
}

/*
 * Pick the next group from the largest_free_order lists, starting at the
 * order of the request (ac->ac_2order) and moving up. On success *group is
 * set; otherwise *new_cr is advanced to CR_GOAL_LEN_FAST.
 */
static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
                        enum criteria *new_cr, ext4_group_t *group)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_group_info *cand;
        int order;

        if (ac->ac_status == AC_STATUS_FOUND)
                return;

        /* the flag is still set, so the previous suggestion did not work out */
        if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED))
                atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions);

        for (order = ac->ac_2order; order < MB_NUM_ORDERS(ac->ac_sb); order++) {
                struct list_head *head = &sbi->s_mb_largest_free_orders[order];
                rwlock_t *lock = &sbi->s_mb_largest_free_orders_locks[order];

                /* unlocked peek: do not take the lock for an empty list */
                if (list_empty(head))
                        continue;

                /*
                 * If the list drained before the lock was taken, the walk
                 * below simply visits nothing.
                 */
                read_lock(lock);
                list_for_each_entry(cand, head, bb_largest_free_order_node) {
                        if (sbi->s_mb_stats)
                                atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]);
                        if (unlikely(!ext4_mb_good_group(ac, cand->bb_group, CR_POWER2_ALIGNED)))
                                continue;

                        *group = cand->bb_group;
                        ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
                        read_unlock(lock);
                        return;
                }
                read_unlock(lock);
        }

        /* Nothing suitable on any list: move on to the next criteria */
        *new_cr = CR_GOAL_LEN_FAST;
}

/*
 * Walk the average fragment size list of the given @order and return the
 * first group that ext4_mb_good_group() accepts for the current criteria,
 * or NULL if there is none.
 */
static struct ext4_group_info *
ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct list_head *head = &sbi->s_mb_avg_fragment_size[order];
        rwlock_t *lock = &sbi->s_mb_avg_fragment_size_locks[order];
        struct ext4_group_info *cand, *found = NULL;
        enum criteria cr = ac->ac_criteria;

        /* unlocked peek: do not take the lock for an empty list */
        if (list_empty(head))
                return NULL;

        /*
         * If the list drained before the lock was taken, the walk below
         * visits nothing and NULL is returned.
         */
        read_lock(lock);
        list_for_each_entry(cand, head, bb_avg_fragment_size_node) {
                if (sbi->s_mb_stats)
                        atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
                if (likely(ext4_mb_good_group(ac, cand->bb_group, cr))) {
                        found = cand;
                        break;
                }
        }
        read_unlock(lock);

        return found;
}

/*
 * Pick the next group from the average fragment size lists, starting with
 * the list that matches the goal length and moving towards larger orders.
 * On success *group is set; otherwise *new_cr is advanced.
 */
static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
                enum criteria *new_cr, ext4_group_t *group)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_group_info *found;
        int order;

        /* the flag is still set, so the previous suggestion did not work out */
        if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED) &&
            sbi->s_mb_stats)
                atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions);

        order = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
        while (order < MB_NUM_ORDERS(ac->ac_sb)) {
                found = ext4_mb_find_good_group_avg_frag_lists(ac, order++);
                if (!found)
                        continue;

                *group = found->bb_group;
                ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
                return;
        }

        /*
         * CR_BEST_AVAIL_LEN relies on a normalized goal length that is
         * larger than the original request and can therefore be trimmed
         * while still satisfying it. Requests for non-regular files are
         * never normalized (see ext4_mb_normalize_request() and
         * EXT4_MB_HINT_DATA), so those go straight to CR_GOAL_LEN_SLOW.
         */
        *new_cr = (ac->ac_flags & EXT4_MB_HINT_DATA) ?
                        CR_BEST_AVAIL_LEN : CR_GOAL_LEN_SLOW;
}

/*
 * CR_GOAL_LEN_FAST found nothing, so shrink the goal length order by order
 * and look for a group in the matching average fragment size list after
 * each step.
 *
 * This trades somewhat smaller preallocations for a faster search. The
 * goal is never trimmed below a lower bound; if that bound is reached
 * without a hit, the original goal length is restored and *new_cr is set
 * to CR_GOAL_LEN_SLOW.
 */
static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
                enum criteria *new_cr, ext4_group_t *group)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_group_info *found;
        int cur, top_order, floor_order;
        unsigned long num_stripe_clusters = 0;

        /* the flag is still set, so the previous suggestion did not work out */
        if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED) &&
            sbi->s_mb_stats)
                atomic_inc(&sbi->s_bal_best_avail_bad_suggestions);

        /*
         * The order returned by mb_avg_fragment_size_order() cannot be
         * turned back into a length with (1 << order), and the real length
         * is needed to rewrite the goal, so derive the order with fls().
         */
        top_order = fls(ac->ac_g_ex.fe_len) - 1;
        if (WARN_ON_ONCE(top_order - 1 > MB_NUM_ORDERS(ac->ac_sb)))
                top_order = MB_NUM_ORDERS(ac->ac_sb);

        /* never trim more than s_mb_best_avail_max_trim_order orders */
        floor_order = top_order - sbi->s_mb_best_avail_max_trim_order;
        if (floor_order < 0)
                floor_order = 0;

        if (sbi->s_stripe > 0) {
                /*
                 * The stripe size is assumed to be a multiple of the
                 * cluster ratio, otherwise __ext4_fill_super exits early.
                 */
                num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
                /*
                 * One order less is enough here because the goal length
                 * is rounded up to num_stripe_clusters further down.
                 */
                if (1 << floor_order < num_stripe_clusters)
                        floor_order = fls(num_stripe_clusters) - 1;
        }

        /* the trimmed goal must still cover the original request */
        if (1 << floor_order < ac->ac_o_ex.fe_len)
                floor_order = fls(ac->ac_o_ex.fe_len);

        for (cur = top_order; cur >= floor_order; cur--) {
                /* shrink the goal (i.e. the preallocation) to this order */
                ac->ac_g_ex.fe_len = 1 << cur;

                /* prefer a multiple of the stripe size, in cluster units */
                if (num_stripe_clusters > 0)
                        ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len,
                                                     num_stripe_clusters);

                found = ext4_mb_find_good_group_avg_frag_lists(ac,
                                mb_avg_fragment_size_order(ac->ac_sb,
                                                           ac->ac_g_ex.fe_len));
                if (!found)
                        continue;

                *group = found->bb_group;
                ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
                return;
        }

        /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
        ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
        *new_cr = CR_GOAL_LEN_SLOW;
}

/*
 * Return 1 when the optimized group scan applies to this allocation:
 * the MB_OPTIMIZE_SCAN option is set, the current criteria is below
 * CR_GOAL_LEN_SLOW and the inode has EXT4_INODE_EXTENTS set. Otherwise
 * return 0.
 */
static inline int should_optimize_scan(struct ext4_allocation_context *ac)
{
        return likely(test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)) &&
               ac->ac_criteria < CR_GOAL_LEN_SLOW &&
               ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS);
}

/*
 * Return the group that follows @group in linear order, wrapping around
 * to group 0 at @ngroups.
 */
static ext4_group_t
next_linear_group(ext4_group_t group, ext4_group_t ngroups)
{
        ext4_group_t next = group + 1;

        /*
         * ngroups may be artificially restricted for non-extent files,
         * so on the first loop @group can already lie beyond it; hence
         * ">=" rather than "==".
         */
        if (next >= ngroups)
                return 0;
        return next;
}

/*
 * ext4_mb_choose_next_group: choose next group for allocation.
 *
 * @ac        Allocation Context
 * @new_cr    Output parameter. Initialised to the current criteria; if no
 *            good group is available at that level it is updated to the
 *            criteria that should be used next.
 * @group     Input / output parameter. On input, the group the allocator
 *            looked at last; on output, the group that should be tried
 *            next.
 * @ngroups   Total number of groups
 */
static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
                enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
        *new_cr = ac->ac_criteria;

        /* plain linear walk when the optimized scan does not apply */
        if (!should_optimize_scan(ac)) {
                *group = next_linear_group(*group, ngroups);
                return;
        }

        /*
         * The optimized scan may hand out non-adjacent groups, which costs
         * seeks on rotational disks. Use up the budget of linear groups
         * first.
         */
        if (ac->ac_groups_linear_remaining) {
                ac->ac_groups_linear_remaining--;
                *group = next_linear_group(*group, ngroups);
                return;
        }

        switch (*new_cr) {
        case CR_POWER2_ALIGNED:
                ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group);
                break;
        case CR_GOAL_LEN_FAST:
                ext4_mb_choose_next_group_goal_fast(ac, new_cr, group);
                break;
        case CR_BEST_AVAIL_LEN:
                ext4_mb_choose_next_group_best_avail(ac, new_cr, group);
                break;
        default:
                /*
                 * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an
                 * rb tree sorted by bb_free. But until that happens, we should
                 * never come here.
                 */
                WARN_ON(1);
                break;
        }
}

/*
 * Cache the order of the largest free extent we have available in this block
 * group, and keep the group on the matching s_mb_largest_free_orders list.
 *
 * The list node is kept consistent with bb_largest_free_order even while
 * MB_OPTIMIZE_SCAN is disabled: whenever the order changes, a linked node
 * is removed under the lock of the order it was linked at. Otherwise the
 * cached order could drift away from the list the node sits on, and a
 * later removal (after the option is enabled again) would manipulate the
 * list under the wrong lock.
 *
 * The group is only (re)linked when MB_OPTIMIZE_SCAN is set, a free extent
 * exists (order >= 0) and bb_free is non-zero.
 *
 * NOTE(review): relies on bb_largest_free_order_node being initialised as
 * an empty list head when the group info is set up, as the existing
 * list_del_init() usage suggests - confirm against the group info
 * initialisation code.
 */
static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int old_order = grp->bb_largest_free_order;
        int new_order;

        /* highest order with at least one free chunk; -1 if there is none */
        for (new_order = MB_NUM_ORDERS(sb) - 1; new_order >= 0; new_order--)
                if (grp->bb_counters[new_order] > 0)
                        break;

        /* No need to move between order lists? */
        if (new_order == old_order)
                return;

        /* unlink from the list the node is really on, under that list's lock */
        if (old_order >= 0 && !list_empty(&grp->bb_largest_free_order_node)) {
                write_lock(&sbi->s_mb_largest_free_orders_locks[old_order]);
                list_del_init(&grp->bb_largest_free_order_node);
                write_unlock(&sbi->s_mb_largest_free_orders_locks[old_order]);
        }

        grp->bb_largest_free_order = new_order;
        if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new_order >= 0 && grp->bb_free) {
                write_lock(&sbi->s_mb_largest_free_orders_locks[new_order]);
                list_add_tail(&grp->bb_largest_free_order_node,
                              &sbi->s_mb_largest_free_orders[new_order]);
                write_unlock(&sbi->s_mb_largest_free_orders_locks[new_order]);
        }
}

/*
 * Build the buddy data of @group in @buddy from @bitmap and refresh the
 * per-group statistics in @grp (bb_first_free, bb_fragments, bb_counters[],
 * largest free order, average fragment size list). If the number of free
 * clusters found in the bitmap disagrees with grp->bb_free, an error is
 * reported, bb_free is overwritten with the bitmap value and the block
 * bitmap is marked corrupted.
 */
static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
                            void *buddy, void *bitmap, ext4_group_t group,
                            struct ext4_group_info *grp)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_grpblk_t nr_clusters = EXT4_CLUSTERS_PER_GROUP(sb);
        ext4_grpblk_t pos;
        ext4_grpblk_t start;
        ext4_grpblk_t run;
        unsigned free_total = 0;
        unsigned nr_frags = 0;
        unsigned long long cycles = get_cycles();

        /*
         * The bitmap passed in is the aggregation of the on-disk bitmap
         * and the preallocations; walk it one free run at a time.
         */
        pos = mb_find_next_zero_bit(bitmap, nr_clusters, 0);
        grp->bb_first_free = pos;
        while (pos < nr_clusters) {
                /* [start, pos) becomes the current run of free clusters */
                start = pos;
                pos = mb_find_next_bit(bitmap, nr_clusters, start);
                run = pos - start;

                nr_frags++;
                free_total += run;
                if (run > 1)
                        ext4_mb_mark_free_simple(sb, buddy, start, run, grp);
                else
                        grp->bb_counters[0]++;

                /* skip the used clusters up to the next free run */
                if (pos < nr_clusters)
                        pos = mb_find_next_zero_bit(bitmap, nr_clusters, pos);
        }
        grp->bb_fragments = nr_frags;

        if (free_total != grp->bb_free) {
                ext4_grp_locked_error(sb, group, 0, 0,
                                      "block bitmap and bg descriptor "
                                      "inconsistent: %u vs %u free clusters",
                                      free_total, grp->bb_free);
                /*
                 * When carrying on, treat the group descriptor as the
                 * corrupt side and trust the count taken from the bitmap.
                 */
                grp->bb_free = free_total;
                ext4_mark_group_bitmap_corrupted(sb, group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
        }
        mb_set_largest_free_order(sb, grp);
        mb_update_avg_fragment_size(sb, grp);

        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));

        /* account the time spent generating this buddy */
        cycles = get_cycles() - cycles;
        atomic_inc(&sbi->s_mb_buddies_generated);
        atomic64_add(cycles, &sbi->s_mb_generation_time);
}

static void mb_regenerate_buddy(struct ext4_buddy *e4b)
{
        int count;
        int order = 1;
        void *buddy;

        while ((buddy = mb_find_buddy(e4b, order++, &count)))
                mb_set_bits(buddy, 0, count);

        e4b->bd_info->bb_fragments = 0;
        memset(e4b->bd_info->bb_counters, 0,
                sizeof(*e4b->bd_info->bb_counters) *
                (e4b->bd_sb->s_blocksize_bits + 2));

        ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
                e4b->bd_bitmap, e4b->bd_group, e4b->bd_info);
}

/* The buddy information is attached the buddy cache inode
 * for convenience. The information regarding each group
 * is loaded via ext4_mb_load_buddy. The information involve
 * block bitmap and buddy information. The information are
 * stored in the inode as
 *
 * {                        page                        }
 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 *
 *
 * one block each for bitmap and buddy information.
 * So for each group we take up 2 blocks. A page can
 * contain blocks_per_page (PAGE_SIZE / blocksize)  blocks.
 * So it can have information regarding groups_per_page which
 * is blocks_per_page/2
 *
 * @folio:  buddy cache folio to fill
 * @incore: in-core bitmap to build the first buddy block from when the
 *          folio starts with a buddy block; NULL when it starts with a
 *          bitmap block
 * @gfp:    allocation flags for the temporary buffer_head array
 *
 * Returns 0 on success or a negative errno: -ENOMEM if the buffer_head
 * array cannot be allocated, the error of a failed bitmap read or wait,
 * or -EFSCORRUPTED if a group's info is missing while filling the folio.
 * A read/wait error is cleared once at least one verified bitmap has been
 * processed.
 *
 * Locking note:  This routine takes the block group lock of all groups
 * for this page; do not hold this lock when calling this routine!
 */

static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
{
        ext4_group_t ngroups;
        unsigned int blocksize;
        int blocks_per_page;
        int groups_per_page;
        int err = 0;
        int i;
        ext4_group_t first_group, group;
        int first_block;
        struct super_block *sb;
        /*
         * Single-slot stand-in for the bh[] array when groups_per_page is 1.
         * Must start out NULL: the read loop below can break or continue
         * without ever storing to bh[0], and the following loops as well as
         * brelse() would otherwise look at an indeterminate pointer.
         */
        struct buffer_head *bhs = NULL;
        struct buffer_head **bh = NULL;
        struct inode *inode;
        char *data;
        char *bitmap;
        struct ext4_group_info *grinfo;

        inode = folio->mapping->host;
        sb = inode->i_sb;
        ngroups = ext4_get_groups_count(sb);
        blocksize = i_blocksize(inode);
        blocks_per_page = PAGE_SIZE / blocksize;

        mb_debug(sb, "init folio %lu\n", folio->index);

        groups_per_page = blocks_per_page >> 1;
        if (groups_per_page == 0)
                groups_per_page = 1;

        /* allocate buffer_heads to read bitmaps */
        if (groups_per_page > 1) {
                i = sizeof(struct buffer_head *) * groups_per_page;
                bh = kzalloc(i, gfp);
                if (bh == NULL)
                        return -ENOMEM;
        } else
                bh = &bhs;

        first_group = folio->index * blocks_per_page / 2;

        /* read all groups the folio covers into the cache */
        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
                if (group >= ngroups)
                        break;

                grinfo = ext4_get_group_info(sb, group);
                if (!grinfo)
                        continue;
                /*
                 * If page is uptodate then we came here after online resize
                 * which added some new uninitialized group info structs, so
                 * we must skip all initialized uptodate buddies on the folio,
                 * which may be currently in use by an allocating task.
                 */
                if (folio_test_uptodate(folio) &&
                                !EXT4_MB_GRP_NEED_INIT(grinfo)) {
                        bh[i] = NULL;
                        continue;
                }
                bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
                if (IS_ERR(bh[i])) {
                        err = PTR_ERR(bh[i]);
                        /* keep the array brelse()-safe for the out path */
                        bh[i] = NULL;
                        goto out;
                }
                mb_debug(sb, "read bitmap for group %u\n", group);
        }

        /* wait for I/O completion */
        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
                int err2;

                if (!bh[i])
                        continue;
                err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
                /* remember only the first failure */
                if (!err)
                        err = err2;
        }

        first_block = folio->index * blocks_per_page;
        for (i = 0; i < blocks_per_page; i++) {
                /* two blocks per group: even is the bitmap, odd is the buddy */
                group = (first_block + i) >> 1;
                if (group >= ngroups)
                        break;

                if (!bh[group - first_group])
                        /* skip initialized uptodate buddy */
                        continue;

                if (!buffer_verified(bh[group - first_group]))
                        /* Skip faulty bitmaps */
                        continue;
                /* at least one usable bitmap: drop any earlier wait error */
                err = 0;

                /*
                 * data carry information regarding this
                 * particular group in the format specified
                 * above
                 *
                 */
                data = folio_address(folio) + (i * blocksize);
                bitmap = bh[group - first_group]->b_data;

                /*
                 * We place the buddy block and bitmap block
                 * close together
                 */
                grinfo = ext4_get_group_info(sb, group);
                if (!grinfo) {
                        err = -EFSCORRUPTED;
                        goto out;
                }
                if ((first_block + i) & 1) {
                        /* this is block of buddy */
                        BUG_ON(incore == NULL);
                        mb_debug(sb, "put buddy for group %u in folio %lu/%x\n",
                                group, folio->index, i * blocksize);
                        trace_ext4_mb_buddy_bitmap_load(sb, group);
                        grinfo->bb_fragments = 0;
                        memset(grinfo->bb_counters, 0,
                               sizeof(*grinfo->bb_counters) *
                               (MB_NUM_ORDERS(sb)));
                        /*
                         * incore was set to the group's in-core bitmap,
                         * either by the bitmap branch below on the previous
                         * iteration or by the caller
                         */
                        ext4_lock_group(sb, group);
                        /* init the buddy */
                        memset(data, 0xff, blocksize);
                        ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
                        ext4_unlock_group(sb, group);
                        incore = NULL;
                } else {
                        /* this is block of bitmap */
                        BUG_ON(incore != NULL);
                        mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n",
                                group, folio->index, i * blocksize);
                        trace_ext4_mb_bitmap_load(sb, group);

                        /* see comments in ext4_mb_put_pa() */
                        ext4_lock_group(sb, group);
                        memcpy(data, bitmap, blocksize);

                        /* mark all preallocated blks used in in-core bitmap */
                        ext4_mb_generate_from_pa(sb, data, group);
                        WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root));
                        ext4_unlock_group(sb, group);

                        /* set incore so that the buddy information can be
                         * generated using this
                         */
                        incore = data;
                }
        }
        folio_mark_uptodate(folio);

out:
        if (bh) {
                for (i = 0; i < groups_per_page; i++)
                        brelse(bh[i]);
                if (bh != &bhs)
                        kfree(bh);
        }
        return err;
}

/*
 * Get and lock the buddy cache folio(s) that hold the bitmap and buddy of
 * @group, so that no parallel init_group can work on the same buddy page
 * while the lock is held.
 * The locked folios are returned in @e4b. When bitmap and buddy share one
 * page, e4b->bd_buddy_folio stays NULL. Returns 0 on success or the error
 * from __filemap_get_folio(); on error, a folio already stored in @e4b is
 * left there.
 */
static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
                ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
{
        struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
        int blocks_per_page = PAGE_SIZE / sb->s_blocksize;
        /*
         * The buddy cache inode keeps the block bitmap and the buddy of a
         * group in two consecutive blocks, bitmap first.
         */
        int bitmap_blk = group * 2;
        int index = bitmap_blk / blocks_per_page;
        int offset = bitmap_blk % blocks_per_page;
        struct folio *folio;

        e4b->bd_buddy_folio = NULL;
        e4b->bd_bitmap_folio = NULL;

        folio = __filemap_get_folio(inode->i_mapping, index,
                        FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
        if (IS_ERR(folio))
                return PTR_ERR(folio);
        BUG_ON(folio->mapping != inode->i_mapping);
        e4b->bd_bitmap_folio = folio;
        e4b->bd_bitmap = folio_address(folio) + (offset * sb->s_blocksize);

        /* with two or more blocks per page the buddy shares the bitmap's page */
        if (blocks_per_page >= 2)
                return 0;

        /* blocks_per_page == 1: the buddy sits alone on the following page */
        folio = __filemap_get_folio(inode->i_mapping, bitmap_blk + 1,
                        FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
        if (IS_ERR(folio))
                return PTR_ERR(folio);
        BUG_ON(folio->mapping != inode->i_mapping);
        e4b->bd_buddy_folio = folio;

        return 0;
}

/*
 * Unlock and release the folios stored in @e4b by
 * ext4_mb_get_buddy_page_lock(). Either folio pointer may be NULL.
 */
static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
        struct folio *bitmap_folio = e4b->bd_bitmap_folio;
        struct folio *buddy_folio = e4b->bd_buddy_folio;

        if (bitmap_folio) {
                folio_unlock(bitmap_folio);
                folio_put(bitmap_folio);
        }
        if (buddy_folio) {
                folio_unlock(buddy_folio);
                folio_put(buddy_folio);
        }
}

/*
 * Initialise the buddy cache folio(s) of @group if the group still needs
 * it. Returns 0 on success (including when nothing had to be done),
 * -EFSCORRUPTED if the group info is missing, -EIO if a folio is not
 * uptodate after initialisation, or the error of a helper.
 *
 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
 */
static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
{
        struct ext4_group_info *this_grp;
        struct ext4_buddy e4b;
        int ret;

        might_sleep();
        mb_debug(sb, "init group %u\n", group);
        this_grp = ext4_get_group_info(sb, group);
        if (!this_grp)
                return -EFSCORRUPTED;

        /*
         * Taking the folio lock keeps us from re-initialising a buddy cache
         * page of a group that is being allocated from: a user of the buddy
         * cache holds a reference obtained through ext4_mb_load_buddy, which
         * pins the buddy page in the page cache.
         * ext4_mb_get_buddy_page_lock also marks the page accessed.
         */
        ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
        if (ret)
                goto out;

        /* somebody else initialized the group meanwhile: nothing to do */
        if (!EXT4_MB_GRP_NEED_INIT(this_grp))
                goto out;

        ret = ext4_mb_init_cache(e4b.bd_bitmap_folio, NULL, gfp);
        if (!ret && !folio_test_uptodate(e4b.bd_bitmap_folio))
                ret = -EIO;
        if (ret)
                goto out;

        /*
         * Bitmap and buddy share one page: the call above covered the
         * buddy too, no second pass needed.
         */
        if (!e4b.bd_buddy_folio)
                goto out;

        /* init buddy cache */
        ret = ext4_mb_init_cache(e4b.bd_buddy_folio, e4b.bd_bitmap, gfp);
        if (!ret && !folio_test_uptodate(e4b.bd_buddy_folio))
                ret = -EIO;
out:
        ext4_mb_put_buddy_page_lock(&e4b);
        return ret;
}

/*
 * ext4_mb_load_buddy_gfp() - pin the bitmap and buddy folios of a group
 * @sb:    super block of the filesystem
 * @group: block group whose buddy data is wanted
 * @e4b:   buddy descriptor filled in by this function
 * @gfp:   allocation flags passed on when a folio has to be created or
 *         initialised
 *
 * The buddy cache inode keeps two consecutive blocks per group: block
 * 2 * group holds the block bitmap and block 2 * group + 1 the buddy data.
 * Both folios are looked up (and created and initialised if needed), and
 * on success @e4b holds one reference on each of them together with
 * pointers into their data.  ext4_mb_unload_buddy() drops those references.
 *
 * Return: 0 on success.  -EFSCORRUPTED if there is no group info for
 * @group, the error from ext4_mb_init_group() or ext4_mb_init_cache(), the
 * error encoded in the folio pointer when the lookup fails, -EINVAL if a
 * folio does not belong to the buddy cache mapping, or -EIO if a folio is
 * still not uptodate.  When a failure goes through the err label no
 * reference is left held and the bitmap folio, bitmap and buddy pointers
 * in @e4b are reset to NULL.
 *
 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
 * block group lock of all groups for this page; do not hold the BG lock when
 * calling this routine!
 */
static noinline_for_stack int
ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
                       struct ext4_buddy *e4b, gfp_t gfp)
{
        int blocks_per_page;
        int block;
        int pnum;
        int poff;
        struct folio *folio;
        int ret;
        struct ext4_group_info *grp;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct inode *inode = sbi->s_buddy_cache;

        might_sleep();
        mb_debug(sb, "load group %u\n", group);

        blocks_per_page = PAGE_SIZE / sb->s_blocksize;
        grp = ext4_get_group_info(sb, group);
        if (!grp)
                return -EFSCORRUPTED;

        e4b->bd_blkbits = sb->s_blocksize_bits;
        e4b->bd_info = grp;
        e4b->bd_sb = sb;
        e4b->bd_group = group;
        e4b->bd_buddy_folio = NULL;
        e4b->bd_bitmap_folio = NULL;

        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                /*
                 * we need full data about the group
                 * to make a good selection
                 */
                ret = ext4_mb_init_group(sb, group, gfp);
                if (ret)
                        return ret;
        }

        /*
         * the buddy cache inode stores the block bitmap
         * and buddy information in consecutive blocks.
         * So for each group we need two blocks.
         */
        block = group * 2;
        pnum = block / blocks_per_page;
        poff = block % blocks_per_page;

        /* Avoid locking the folio in the fast path ... */
        folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
        if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
                if (!IS_ERR(folio))
                        /*
                         * drop the folio reference and try
                         * to get the folio with lock. If we
                         * are not uptodate that implies
                         * somebody just created the folio but
                         * is yet to initialize it. So
                         * wait for it to initialize.
                         */
                        folio_put(folio);
                folio = __filemap_get_folio(inode->i_mapping, pnum,
                                FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
                if (!IS_ERR(folio)) {
                        if (WARN_RATELIMIT(folio->mapping != inode->i_mapping,
        "ext4: bitmap's mapping != inode->i_mapping\n")) {
                                /* should never happen */
                                folio_unlock(folio);
                                ret = -EINVAL;
                                goto err;
                        }
                        if (!folio_test_uptodate(folio)) {
                                ret = ext4_mb_init_cache(folio, NULL, gfp);
                                if (ret) {
                                        folio_unlock(folio);
                                        goto err;
                                }
                                mb_cmp_bitmaps(e4b, folio_address(folio) +
                                               (poff * sb->s_blocksize));
                        }
                        folio_unlock(folio);
                }
        }
        if (IS_ERR(folio)) {
                ret = PTR_ERR(folio);
                goto err;
        }
        if (!folio_test_uptodate(folio)) {
                ret = -EIO;
                goto err;
        }

        /* Folios marked accessed already */
        e4b->bd_bitmap_folio = folio;
        e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize);

        /* the buddy data lives in the block right after the bitmap */
        block++;
        pnum = block / blocks_per_page;
        poff = block % blocks_per_page;

        folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
        if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
                if (!IS_ERR(folio))
                        folio_put(folio);
                folio = __filemap_get_folio(inode->i_mapping, pnum,
                                FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
                if (!IS_ERR(folio)) {
                        if (WARN_RATELIMIT(folio->mapping != inode->i_mapping,
        "ext4: buddy bitmap's mapping != inode->i_mapping\n")) {
                                /* should never happen */
                                folio_unlock(folio);
                                ret = -EINVAL;
                                goto err;
                        }
                        if (!folio_test_uptodate(folio)) {
                                ret = ext4_mb_init_cache(folio, e4b->bd_bitmap,
                                                         gfp);
                                if (ret) {
                                        folio_unlock(folio);
                                        goto err;
                                }
                        }
                        folio_unlock(folio);
                }
        }
        if (IS_ERR(folio)) {
                ret = PTR_ERR(folio);
                goto err;
        }
        if (!folio_test_uptodate(folio)) {
                ret = -EIO;
                goto err;
        }

        /* Folios marked accessed already */
        e4b->bd_buddy_folio = folio;
        e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize);

        return 0;

err:
        /*
         * 'folio' is the folio that was being worked on when the failure
         * happened (or an error pointer); the bitmap folio, if already
         * stored in e4b, carries its own reference.
         */
        if (!IS_ERR_OR_NULL(folio))
                folio_put(folio);
        if (e4b->bd_bitmap_folio)
                folio_put(e4b->bd_bitmap_folio);

        /*
         * Do not leave a pointer to a folio we no longer hold a reference
         * on: ext4_mb_unload_buddy() puts whatever bd_bitmap_folio points
         * to, so a stale value would turn into a double put.
         */
        e4b->bd_bitmap_folio = NULL;
        e4b->bd_buddy = NULL;
        e4b->bd_bitmap = NULL;
        return ret;
}

/*
 * Convenience wrapper around ext4_mb_load_buddy_gfp() that always uses
 * GFP_NOFS. Returns whatever ext4_mb_load_buddy_gfp() returns.
 */
static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
                              struct ext4_buddy *e4b)
{
        int err;

        err = ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
        return err;
}

/*
 * Drop the folio references recorded in @e4b. Either pointer may be NULL,
 * in which case it is skipped. The pointers themselves are left untouched.
 */
static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
        struct folio *bitmap_folio = e4b->bd_bitmap_folio;
        struct folio *buddy_folio = e4b->bd_buddy_folio;

        if (bitmap_folio)
                folio_put(bitmap_folio);

        if (buddy_folio)
                folio_put(buddy_folio);
}


/*
 * Find the buddy order that @block currently belongs to.
 *
 * Orders 1 .. bd_blkbits + 1 are probed from the smallest up; the first
 * order whose buddy bitmap has the bit for @block clear is returned.
 * Returns 0 if no such order exists.
 */
static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
{
        int level;
        int nbits;

        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
        BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));

        for (level = 1; level <= e4b->bd_blkbits + 1; level++) {
                void *map = mb_find_buddy(e4b, level, &nbits);

                /* a clear bit means the block is covered by this order */
                if (!mb_test_bit(block >> level, map))
                        return level;
        }

        return 0;
}

/*
 * Clear @len bits of bitmap @bm starting at bit @cur.
 *
 * Whenever the position is 32-bit aligned and at least 32 bits remain, a
 * whole word is zeroed with one store; the rest is cleared bit by bit.
 */
static void mb_clear_bits(void *bm, int cur, int len)
{
        const int end = cur + len;
        int bit = cur;

        while (bit < end) {
                if (!(bit & 31) && end - bit >= 32) {
                        __u32 *word = bm + (bit >> 3);

                        *word = 0;
                        bit += 32;
                } else {
                        mb_clear_bit(bit, bm);
                        bit++;
                }
        }
}

/*
 * Clear @len bits of bitmap @bm starting at bit @cur and report whether
 * any of them was already clear.
 *
 * Returns the index of the first bit in the range that was found to be
 * zero before clearing, or -1 if every bit in the range was set.
 */
static int mb_test_and_clear_bits(void *bm, int cur, int len)
{
        const int end = cur + len;
        int first_zero = -1;
        int bit = cur;

        while (bit < end) {
                if (!(bit & 31) && end - bit >= 32) {
                        /* aligned with a full word left: handle 32 bits at once */
                        __u32 *word = bm + (bit >> 3);

                        if (first_zero == -1 && *word != (__u32)(-1))
                                first_zero = bit + mb_find_next_zero_bit(word, 32, 0);
                        *word = 0;
                        bit += 32;
                        continue;
                }

                /* the bit must be cleared even once a zero has been recorded */
                if (!mb_test_and_clear_bit(bit, bm) && first_zero == -1)
                        first_zero = bit;
                bit++;
        }

        return first_zero;
}

/*
 * Set @len bits of bitmap @bm starting at bit @cur.
 *
 * Unaligned positions and tails shorter than 32 bits are set one bit at a
 * time; everything else is written a full 32-bit word at once.
 */
void mb_set_bits(void *bm, int cur, int len)
{
        const int end = cur + len;
        __u32 *word;
        int bit;

        for (bit = cur; bit < end; ) {
                if ((bit & 31) || end - bit < 32) {
                        /* slow path: single bit */
                        mb_set_bit(bit, bm);
                        bit++;
                        continue;
                }

                word = bm + (bit >> 3);
                *word = 0xffffffff;
                bit += 32;
        }
}

/*
 * Fix up one border of a range being freed in a buddy bitmap.
 *
 * @bit:    in/out index of the border bit
 * @bitmap: buddy bitmap of the current order
 * @side:   -1 for the left border, +1 for the right border
 *
 * If the neighbour on @side is set, the border bit is cleared and the
 * border moves one step away from that neighbour; 1 is returned.
 * Otherwise the border moves onto the neighbour, which is then set, and
 * -1 is returned. mb_buddy_mark_free() adds the return value to the
 * counter of the current order.
 */
static inline int mb_buddy_adjust_border(int *bit, void *bitmap, int side)
{
        if (!mb_test_bit(*bit + side, bitmap)) {
                *bit += side;
                mb_set_bit(*bit, bitmap);
                return -1;
        }

        mb_clear_bit(*bit, bitmap);
        *bit -= side;
        return 1;
}

/*
 * Propagate a freed range through the buddy bitmaps of order 1 and above.
 *
 * @e4b:   buddy descriptor of the group
 * @first: first index of the freed range, expressed in order-1 units
 * @last:  last index of the freed range (inclusive), in order-1 units
 *
 * Starting at order 1 the range borders are adjusted with
 * mb_buddy_adjust_border(), bb_counters[] is updated accordingly, and the
 * range is halved before moving to the next order. The walk stops when the
 * range becomes empty or when there is no buddy bitmap for the next order.
 */
static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
{
        int max;
        int order = 1;
        void *buddy = mb_find_buddy(e4b, order, &max);

        while (buddy) {
                void *buddy2;

                /* Bits in range [first; last] are known to be set since
                 * corresponding blocks were allocated. Bits in range
                 * (first; last) will stay set because they form buddies on
                 * upper layer. We just deal with borders if they don't
                 * align with upper layer and then go up.
                 * Releasing entire group is all about clearing
                 * single bit of highest order buddy.
                 */

                /* Example:
                 * ---------------------------------
                 * |   1   |   1   |   1   |   1   |
                 * ---------------------------------
                 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
                 * ---------------------------------
                 *   0   1   2   3   4   5   6   7
                 *      \_____________________/
                 *
                 * Neither [1] nor [6] is aligned to above layer.
                 * Left neighbour [0] is free, so mark it busy,
                 * decrease bb_counters and extend range to
                 * [0; 6]
                 * Right neighbour [7] is busy. It can't be coalesced with [6], so
                 * mark [6] free, increase bb_counters and shrink range to
                 * [0; 5].
                 * Then shift range to [0; 2], go up and do the same.
                 */


                /* an odd 'first' / even 'last' is not aligned to the next order */
                if (first & 1)
                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
                if (!(last & 1))
                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
                /* nothing left to hand over to the next order */
                if (first > last)
                        break;
                order++;

                buddy2 = mb_find_buddy(e4b, order, &max);
                if (!buddy2) {
                        /*
                         * No bitmap for the next order: the remaining range
                         * is recorded as free at the current (highest) order.
                         */
                        mb_clear_bits(buddy, first, last - first + 1);
                        e4b->bd_info->bb_counters[order - 1] += last - first + 1;
                        break;
                }
                /* express the range in units of the next order */
                first >>= 1;
                last >>= 1;
                buddy = buddy2;
        }
}

/*
 * Mark a range of bits free in the in-memory block bitmap of a group and
 * update the buddy data and group statistics to match.
 *
 * @inode: passed to mb_free_blocks_double(); if non-NULL its inode number
 *         is included in the error report. May be NULL.
 * @e4b:   loaded buddy descriptor of the group
 * @first: first bit of the range (group relative)
 * @count: number of bits to free; 0 triggers a WARN and nothing is done
 *
 * The group lock must be held (asserted below). Nothing is done when the
 * group's block bitmap is already flagged corrupt.
 *
 * If any bit in the range was already clear, the buddy is regenerated when
 * EXT4_FC_REPLAY is set in s_mount_state; otherwise the group bitmap is
 * marked corrupted, an error is reported and the function returns without
 * updating bb_free, the fragment counter or the buddy bitmaps (the bits in
 * the block bitmap have been cleared by then).
 */
static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                           int first, int count)
{
        int left_is_free = 0;
        int right_is_free = 0;
        int block;
        int last = first + count - 1;
        struct super_block *sb = e4b->bd_sb;

        if (WARN_ON(count == 0))
                return;
        BUG_ON(last >= (sb->s_blocksize << 3));
        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
        /* Don't bother if the block group is corrupt. */
        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
                return;

        mb_check_buddy(e4b);
        mb_free_blocks_double(inode, e4b, first, count);

        /* access memory sequentially: check left neighbour,
         * clear range and then check right neighbour
         */
        if (first != 0)
                left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
        /* 'block' is the first bit found already clear, or -1 if none was */
        block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
        if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
                right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);

        if (unlikely(block != -1)) {
                struct ext4_sb_info *sbi = EXT4_SB(sb);
                ext4_fsblk_t blocknr;

                /*
                 * Fastcommit replay can free already freed blocks which
                 * corrupts allocation info. Regenerate it.
                 */
                if (sbi->s_mount_state & EXT4_FC_REPLAY) {
                        mb_regenerate_buddy(e4b);
                        goto check;
                }

                /* translate the offending bit into a filesystem block number */
                blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
                blocknr += EXT4_C2B(sbi, block);
                ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
                                EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                ext4_grp_locked_error(sb, e4b->bd_group,
                                      inode ? inode->i_ino : 0, blocknr,
                                      "freeing already freed block (bit %u); block bitmap corrupt.",
                                      block);
                return;
        }

        this_cpu_inc(discard_pa_seq);
        e4b->bd_info->bb_free += count;
        if (first < e4b->bd_info->bb_first_free)
                e4b->bd_info->bb_first_free = first;

        /* let's maintain fragments counter */
        /* both neighbours free: two fragments merge; none free: a new one appears */
        if (left_is_free && right_is_free)
                e4b->bd_info->bb_fragments--;
        else if (!left_is_free && !right_is_free)
                e4b->bd_info->bb_fragments++;

        /* buddy[0] == bd_bitmap is a special case, so handle
         * it right away and let mb_buddy_mark_free stay free of
         * zero order checks.
         * Check if neighbours are to be coalesced,
         * adjust bitmap bb_counters and borders appropriately.
         */
        if (first & 1) {
                first += !left_is_free;
                e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
        }
        if (!(last & 1)) {
                last -= !right_is_free;
                e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
        }

        /* hand the remaining range to the higher orders, in order-1 units */
        if (first <= last)
                mb_buddy_mark_free(e4b, first >> 1, last >> 1);

        mb_set_largest_free_order(sb, e4b->bd_info);
        mb_update_avg_fragment_size(sb, e4b->bd_info);
check:
        mb_check_buddy(e4b);
}

/*
 * Describe the free extent that starts at @block, trying to make it at
 * least @needed long.
 *
 * @e4b:    loaded buddy descriptor of the group
 * @block:  group-relative bit to start from; must be below the size of the
 *          order-0 bitmap (BUG otherwise)
 * @needed: length the caller would like to get
 * @ex:     filled with the resulting extent (fe_start, fe_len, fe_group);
 *          fe_logical is not written here
 *
 * The group lock must be held (asserted below).
 *
 * Returns the length of the extent found. If @block is in use, or if the
 * computed extent would run past EXT4_CLUSTERS_PER_GROUP(), @ex is zeroed
 * and 0 is returned.
 */
static int mb_find_extent(struct ext4_buddy *e4b, int block,
                                int needed, struct ext4_free_extent *ex)
{
        int max, order, next;
        void *buddy;

        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
        BUG_ON(ex == NULL);

        buddy = mb_find_buddy(e4b, 0, &max);
        BUG_ON(buddy == NULL);
        BUG_ON(block >= max);
        /* the starting block itself is in use: there is no extent here */
        if (mb_test_bit(block, buddy)) {
                ex->fe_len = 0;
                ex->fe_start = 0;
                ex->fe_group = 0;
                return 0;
        }

        /* find actual order */
        order = mb_find_order_for_block(e4b, block);

        /* from 'block' up to the end of the 2^order chunk that contains it */
        ex->fe_len = (1 << order) - (block & ((1 << order) - 1));
        ex->fe_start = block;
        ex->fe_group = e4b->bd_group;

        /* from here on 'block' is an index within the buddy of 'order' */
        block = block >> order;

        /* keep appending the following free chunk while more is needed */
        while (needed > ex->fe_len &&
               mb_find_buddy(e4b, order, &max)) {

                if (block + 1 >= max)
                        break;

                /* first bit right after the current chunk */
                next = (block + 1) * (1 << order);
                if (mb_test_bit(next, e4b->bd_bitmap))
                        break;

                order = mb_find_order_for_block(e4b, next);

                block = next >> order;
                ex->fe_len += 1 << order;
        }

        if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
                /* Should never happen! (but apparently sometimes does?!?) */
                WARN_ON(1);
                ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
                        "corruption or bug in mb_find_extent "
                        "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
                        block, order, needed, ex->fe_group, ex->fe_start,
                        ex->fe_len, ex->fe_logical);
                ex->fe_len = 0;
                ex->fe_start = 0;
                ex->fe_group = 0;
        }
        return ex->fe_len;
}

/*
 * Mark the extent @ex as allocated in the block bitmap and buddy data of
 * the group described by @e4b, and update the group statistics.
 *
 * @e4b: loaded buddy descriptor; its group must equal ex->fe_group
 * @ex:  extent to allocate (fe_start, fe_len are read)
 *
 * The group lock must be held (asserted below).
 *
 * Returns 0 if the extent was consumed entirely as whole, aligned buddy
 * chunks. Otherwise returns the value recorded at the first chunk that had
 * to be split: the length still to allocate at that point OR'ed with the
 * order of that chunk shifted left by 16.
 */
static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
{
        int ord;
        int mlen = 0;
        int max = 0;
        int start = ex->fe_start;
        int len = ex->fe_len;
        unsigned ret = 0;
        int len0 = len;
        void *buddy;
        int ord_start, ord_end;

        BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
        BUG_ON(e4b->bd_group != ex->fe_group);
        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
        mb_check_buddy(e4b);
        mb_mark_used_double(e4b, start, len);

        this_cpu_inc(discard_pa_seq);
        e4b->bd_info->bb_free -= len;
        if (e4b->bd_info->bb_first_free == start)
                e4b->bd_info->bb_first_free += len;

        /* let's maintain fragments counter */
        /*
         * mlen and max are used here as "left neighbour is free" and
         * "right neighbour is free" flags; they are reused with their
         * real meaning in the loop below.
         */
        if (start != 0)
                mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
        if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
                max = !mb_test_bit(start + len, e4b->bd_bitmap);
        /* free on both sides: a fragment is split; on neither: one disappears */
        if (mlen && max)
                e4b->bd_info->bb_fragments++;
        else if (!mlen && !max)
                e4b->bd_info->bb_fragments--;

        /* let's maintain buddy itself */
        while (len) {
                ord = mb_find_order_for_block(e4b, start);

                /* start is aligned to 2^ord and the chunk fits into the rest */
                if (((start >> ord) << ord) == start && len >= (1 << ord)) {
                        /* the whole chunk may be allocated at once! */
                        mlen = 1 << ord;
                        buddy = mb_find_buddy(e4b, ord, &max);
                        BUG_ON((start >> ord) >= max);
                        mb_set_bit(start >> ord, buddy);
                        e4b->bd_info->bb_counters[ord]--;
                        start += mlen;
                        len -= mlen;
                        BUG_ON(len < 0);
                        continue;
                }

                /* store for history */
                if (ret == 0)
                        ret = len | (ord << 16);

                /* the chunk is only partly used: take it and give back the rest */
                BUG_ON(ord <= 0);
                buddy = mb_find_buddy(e4b, ord, &max);
                mb_set_bit(start >> ord, buddy);
                e4b->bd_info->bb_counters[ord]--;

                ord_start = (start >> ord) << ord;
                ord_end = ord_start + (1 << ord);
                /* first chunk */
                if (start > ord_start)
                        ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy,
                                                 ord_start, start - ord_start,
                                                 e4b->bd_info);

                /* last chunk */
                if (start + len < ord_end) {
                        ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy,
                                                 start + len,
                                                 ord_end - (start + len),
                                                 e4b->bd_info);
                        break;
                }
                /* the extent continues past this chunk: go on with what is left */
                len = start + len - ord_end;
                start = ord_end;
        }
        mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);

        mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
        /* finally set the bits of the whole original extent in the block bitmap */
        mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
        mb_check_buddy(e4b);

        return ret;
}

/*
 * Must be called under group lock!
 */
static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        int ret;

        BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
        BUG_ON(ac->ac_status == AC_STATUS_FOUND);

        ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
        ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
        ret = mb_mark_used(e4b, &ac->ac_b_ex);

        /* preallocation can change ac_b_ex, thus we store actually
         * allocated blocks for history */
        ac->ac_f_ex = ac->ac_b_ex;

        ac->ac_status = AC_STATUS_FOUND;
        ac->ac_tail = ret & 0xffff;
        ac->ac_buddy = ret >> 16;

        /*
         * take the page reference. We want the page to be pinned
         * so that we don't get a ext4_mb_init_cache_call for this
         * group until we update the bitmap. That would mean we
         * double allocate blocks. The reference is dropped
         * in ext4_mb_release_context
         */
        ac->ac_bitmap_folio = e4b->bd_bitmap_folio;
        folio_get(ac->ac_bitmap_folio);
        ac->ac_buddy_folio = e4b->bd_buddy_folio;
        folio_get(ac->ac_buddy_folio);
        /* store last allocated for subsequent stream allocation */
        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
                spin_lock(&sbi->s_md_lock);
                sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
                sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
                spin_unlock(&sbi->s_md_lock);
        }
        /*
         * As we've just preallocated more space than
         * user requested originally, we store allocated
         * space in a special descriptor.
         */
        if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
                ext4_mb_new_preallocation(ac);

}

/*
 * Decide whether scanning should go on, be aborted, or end by taking the
 * best extent found so far.
 *
 * @finish_group: non-zero makes a best extent that already satisfies the
 *                goal length get used without waiting for s_mb_min_to_scan
 *                extents to be examined
 */
static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b,
                                        int finish_group)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);

        /* an extent has been taken already, nothing to decide */
        if (ac->ac_status == AC_STATUS_FOUND)
                return;

        /* scan budget used up (does not apply with EXT4_MB_HINT_FIRST) */
        if (!(ac->ac_flags & EXT4_MB_HINT_FIRST) &&
            ac->ac_found > sbi->s_mb_max_to_scan) {
                ac->ac_status = AC_STATUS_BREAK;
                return;
        }

        /*
         * Take the best extent only if it covers the goal length and
         * either the group is finished or enough extents were examined.
         */
        if (ac->ac_b_ex.fe_len >= ac->ac_g_ex.fe_len &&
            (finish_group || ac->ac_found > sbi->s_mb_min_to_scan))
                ext4_mb_use_best_found(ac, e4b);
}

/*
 * Judge a newly found free extent against the goal and against the best
 * extent remembered so far.
 *
 * If the extent is good enough it is marked used right away and the
 * context status stops the scan. Otherwise it replaces the remembered
 * best extent when it is the better of the two; the remembered extent is
 * what gets used later if nothing good enough turns up.
 *
 * Roughly:
 *
 * * An extent exactly as long as the goal is used immediately.
 *
 * * While the best extent is shorter than the goal, scanning continues
 *   for up to sbi->s_mb_max_to_scan extents (default 200), after which
 *   the scan is stopped and whatever was found is used.
 *
 * * Once the best extent is at least as long as the goal, scanning
 *   continues for up to sbi->s_mb_min_to_scan extents (default 10) before
 *   the best extent is used.
 *
 * FIXME: real allocation policy is to be designed yet!
 */
static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
                                        struct ext4_free_extent *ex,
                                        struct ext4_buddy *e4b)
{
        struct ext4_free_extent *best = &ac->ac_b_ex;
        struct ext4_free_extent *goal = &ac->ac_g_ex;
        int replace;

        BUG_ON(ex->fe_len <= 0);
        BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
        BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
        BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);

        ac->ac_found++;
        ac->ac_cX_found[ac->ac_criteria]++;

        /*
         * Take the extent at once when the caller wants the first thing
         * found, or when it matches the goal length exactly.
         */
        if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST) ||
            ex->fe_len == goal->fe_len) {
                *best = *ex;
                ext4_mb_use_best_found(ac, e4b);
                return;
        }

        /* first extent seen: just remember it */
        if (best->fe_len == 0) {
                *best = *ex;
                return;
        }

        if (best->fe_len < goal->fe_len) {
                /* goal not reached yet: anything longer is an improvement */
                replace = ex->fe_len > best->fe_len;
        } else {
                /*
                 * goal already covered: prefer an extent that still covers
                 * it but is shorter than the current best
                 */
                replace = ex->fe_len > goal->fe_len &&
                          ex->fe_len < best->fe_len;
        }
        if (replace)
                *best = *ex;

        ext4_mb_check_limits(ac, e4b, 0);
}

/*
 * Re-check the best extent remembered in @ac and use it if (part of) it is
 * still free.
 *
 * The buddy of the extent's group is loaded and the group locked; the
 * extent is looked up again with mb_find_extent() and, if a non-empty
 * extent comes back, it replaces ac_b_ex and is used. Nothing happens if
 * loading the buddy fails or the group's block bitmap is flagged corrupt.
 */
static noinline_for_stack
void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
{
        struct super_block *sb = ac->ac_sb;
        struct ext4_free_extent ex = ac->ac_b_ex;
        ext4_group_t group = ex.fe_group;

        BUG_ON(ex.fe_len <= 0);
        if (ext4_mb_load_buddy(sb, group, e4b))
                return;

        ext4_lock_group(sb, group);
        if (!unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) &&
            mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex) > 0) {
                ac->ac_b_ex = ex;
                ext4_mb_use_best_found(ac, e4b);
        }
        ext4_unlock_group(sb, group);

        ext4_mb_unload_buddy(e4b);
}

static noinline_for_stack
int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
                                struct ext4_buddy *e4b)
{
        ext4_group_t group = ac->ac_g_ex.fe_group;
        int max;
        int err;
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
        struct ext4_free_extent ex;

        if (!grp)
                return -EFSCORRUPTED;
        if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
                return 0;
        if (grp->bb_free == 0)
                return 0;

        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
        if (err)
                return err;

        ext4_lock_group(ac->ac_sb, group);
        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
                goto out;

        max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
                             ac->ac_g_ex.fe_len, &ex);
        ex.fe_logical = 0xDEADFA11; /* debug value */

        if (max >= ac->ac_g_ex.fe_len &&
            ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) {
                ext4_fsblk_t start;

                start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
                /* use do_div to get remainder (would be 64-bit modulo) */
                if (do_div(start, sbi->s_stripe) == 0) {
                        ac->ac_found++;
                        ac->ac_b_ex = ex;
                        ext4_mb_use_best_found(ac, e4b);
                }
        } else if (max >= ac->ac_g_ex.fe_len) {
                BUG_ON(ex.fe_len <= 0);
                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
                ac->ac_found++;
                ac->ac_b_ex = ex;
                ext4_mb_use_best_found(ac, e4b);
        } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
                /* Sometimes, caller may want to merge even small
                 * number of blocks to an existing extent */
                BUG_ON(ex.fe_len <= 0);
                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
                ac->ac_found++;
                ac->ac_b_ex = ex;
                ext4_mb_use_best_found(ac, e4b);
        }
out:
        ext4_unlock_group(ac->ac_sb, group);
        ext4_mb_unload_buddy(e4b);

        return 0;
}

/*
 * Scan the buddy bitmaps (not the block bitmap!) from order ac->ac_2order
 * upwards and use the first free chunk found. A chunk of order n is
 * recorded as an extent of 1 << n clusters.
 *
 * If bb_counters[] claims free chunks at an order but its buddy bitmap has
 * none, the group bitmap is marked corrupted and the scan is abandoned.
 */
static noinline_for_stack
void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
{
        struct super_block *sb = ac->ac_sb;
        struct ext4_group_info *grp = e4b->bd_info;
        int order;

        BUG_ON(ac->ac_2order <= 0);
        for (order = ac->ac_2order; order < MB_NUM_ORDERS(sb); order++) {
                void *buddy;
                int nbits;
                int bit;

                /* nothing free at this order, try the next larger one */
                if (!grp->bb_counters[order])
                        continue;

                buddy = mb_find_buddy(e4b, order, &nbits);
                if (WARN_RATELIMIT(buddy == NULL,
                         "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", order))
                        continue;

                bit = mb_find_next_zero_bit(buddy, nbits, 0);
                if (bit >= nbits) {
                        /* the counter and the bitmap disagree */
                        ext4_mark_group_bitmap_corrupted(sb,
                                        e4b->bd_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
                                "%d free clusters of order %d. But found 0",
                                grp->bb_counters[order], order);
                        break;
                }

                ac->ac_found++;
                ac->ac_cX_found[ac->ac_criteria]++;

                /* turn the buddy bit into a cluster extent of this group */
                ac->ac_b_ex.fe_group = e4b->bd_group;
                ac->ac_b_ex.fe_start = bit << order;
                ac->ac_b_ex.fe_len = 1 << order;

                ext4_mb_use_best_found(ac, e4b);

                BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);

                if (EXT4_SB(sb)->s_mb_stats)
                        atomic_inc(&EXT4_SB(sb)->s_bal_2orders);

                break;
        }
}

/*
 * Scan the group's block bitmap and pass every free extent found to
 * ext4_mb_measure_extent().  The free cluster count recorded in the
 * group info (bb_free) is used as an upper limit: the scan stops once
 * that many clusters have been accounted for, or as soon as the
 * allocation status leaves AC_STATUS_CONTINUE.
 */
static noinline_for_stack
void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                                        struct ext4_buddy *e4b)
{
        struct super_block *sb = ac->ac_sb;
        void *bitmap = e4b->bd_bitmap;
        struct ext4_free_extent ex;
        int pos, next_used, run;
        int free = e4b->bd_info->bb_free;

        if (WARN_ON(free <= 0))
                return;

        pos = e4b->bd_info->bb_first_free;

        while (free && ac->ac_status == AC_STATUS_CONTINUE) {
                pos = mb_find_next_zero_bit(bitmap,
                                            EXT4_CLUSTERS_PER_GROUP(sb), pos);
                if (pos >= EXT4_CLUSTERS_PER_GROUP(sb)) {
                        /*
                         * The group info still claims free clusters but
                         * the bitmap has no zero bit left, so the bitmap
                         * must be corrupt.
                         */
                        ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
                                        "%d free clusters as per group info. But bitmap says 0",
                                        free);
                        break;
                }

                if (!ext4_mb_cr_expensive(ac->ac_criteria)) {
                        /*
                         * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN we
                         * expect this group to hold a large enough
                         * contiguous free extent, so runs shorter than
                         * the goal are skipped without measuring them.
                         */
                        next_used = mb_find_next_bit(bitmap,
                                            EXT4_CLUSTERS_PER_GROUP(sb), pos);
                        run = next_used - pos;

                        if (run < ac->ac_g_ex.fe_len) {
                                free -= run;
                                pos = next_used;
                                continue;
                        }
                }

                mb_find_extent(e4b, pos, ac->ac_g_ex.fe_len, &ex);
                if (WARN_ON(ex.fe_len <= 0))
                        break;

                if (ex.fe_len > free) {
                        /*
                         * The extent is longer than the free count that
                         * is left.  This mostly indicates a corrupt
                         * bitmap, so exit without claiming the space.
                         */
                        ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
                                        EXT4_GROUP_INFO_BBITMAP_CORRUPT);
                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
                                        "%d free clusters as per group info. But got %d blocks",
                                        free, ex.fe_len);
                        break;
                }

                ex.fe_logical = 0xDEADC0DE; /* debug value */
                ext4_mb_measure_extent(ac, &ex, e4b);

                /* Continue right behind the extent just measured. */
                pos += ex.fe_len;
                free -= ex.fe_len;
        }

        ext4_mb_check_limits(ac, e4b, 1);
}

/*
 * Special case for striped storage such as raid5: probe only the
 * stripe-aligned positions of the group and take the first one where
 * at least a whole stripe is free.  Meant for requests whose length is
 * a multiple of the stripe size.
 */
static noinline_for_stack
void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
                                 struct ext4_buddy *e4b)
{
        struct super_block *sb = ac->ac_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        void *bitmap = e4b->bd_bitmap;
        struct ext4_free_extent ex;
        ext4_fsblk_t group_first, aligned;
        ext4_grpblk_t pos, stripe;
        int found_len;

        BUG_ON(sbi->s_stripe == 0);

        /* Round the group's first block up to the next stripe boundary. */
        group_first = ext4_group_first_block_no(sb, e4b->bd_group);
        aligned = group_first + sbi->s_stripe - 1;
        do_div(aligned, sbi->s_stripe);
        pos = (aligned * sbi->s_stripe) - group_first;

        /*
         * Both the step and the start offset go through EXT4_B2C()
         * (presumably a block-to-cluster conversion) before being
         * compared with the per-group cluster count.
         */
        stripe = EXT4_B2C(sbi, sbi->s_stripe);
        pos = EXT4_B2C(sbi, pos);

        for (; pos < EXT4_CLUSTERS_PER_GROUP(sb); pos += stripe) {
                /* Bit set: this aligned position is already in use. */
                if (mb_test_bit(pos, bitmap))
                        continue;

                found_len = mb_find_extent(e4b, pos, stripe, &ex);
                if (found_len < stripe)
                        continue;

                ac->ac_found++;
                ac->ac_cX_found[ac->ac_criteria]++;
                ex.fe_logical = 0xDEADF00D; /* debug value */
                ac->ac_b_ex = ex;
                ext4_mb_use_best_found(ac, e4b);
                break;
        }
}

/*
 * Decide from the in-memory group info whether @group is worth scanning
 * for the allocation described by @ac under criterion @cr.  This is also
 * called BEFORE we load the buddy bitmap.
 *
 * Returns true if the group is suitable for the allocation, false if not.
 */
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
                                ext4_group_t group, enum criteria cr)
{
        int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
        ext4_grpblk_t free, fragments;

        BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS);

        if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
                return false;

        /* No free clusters or no free fragments: nothing to hand out. */
        free = grp->bb_free;
        fragments = grp->bb_fragments;
        if (free == 0 || fragments == 0)
                return false;

        switch (cr) {
        case CR_POWER2_ALIGNED:
                BUG_ON(ac->ac_2order == 0);

                /* Avoid using the first bg of a flexgroup for data files */
                if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
                    flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME &&
                    (group % flex_size) == 0)
                        return false;

                if (free < ac->ac_g_ex.fe_len)
                        return false;

                /* Order is outside the tracked range: accept the group. */
                if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
                        return true;

                return grp->bb_largest_free_order >= ac->ac_2order;

        case CR_GOAL_LEN_FAST:
        case CR_BEST_AVAIL_LEN:
                /* The average free fragment must cover the goal length. */
                return (free / fragments) >= ac->ac_g_ex.fe_len;

        case CR_GOAL_LEN_SLOW:
                /* Total free space must cover the goal length. */
                return free >= ac->ac_g_ex.fe_len;

        case CR_ANY_FREE:
                return true;

        default:
                BUG();
        }

        return false;
}

/*
 * Check whether @group is worth considering for @ac under criterion @cr,
 * for callers that do not hold the group lock.  If the group info has
 * never been initialized this may call ext4_mb_init_group().
 *
 * Returns a positive value if the group looks suitable, 0 if it should
 * be skipped, or a negative error code: -EFSCORRUPTED when the group
 * info or the group descriptor cannot be found, or whatever
 * ext4_mb_init_group() failed with.  This should not be called with
 * ext4_lock_group() held.
 *
 * Note: because we are conditionally operating with the group lock in
 * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
 * function using __acquire and __release.  This means we need to be
 * super careful before messing with the error path handling via "goto
 * out"!
 */
static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
                                     ext4_group_t group, enum criteria cr)
{
        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
        struct super_block *sb = ac->ac_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
        ext4_grpblk_t free;
        int ret = 0;

        if (!grp)
                return -EFSCORRUPTED;
        if (sbi->s_mb_stats)
                atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
        if (should_lock) {
                ext4_lock_group(sb, group);
                __release(ext4_group_lock_ptr(sb, group));
        }
        free = grp->bb_free;
        if (free == 0)
                goto out;
        /*
         * For all criteria except CR_ANY_FREE we try to avoid groups that
         * can't possibly satisfy the full goal request due to insufficient
         * free blocks.
         */
        if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len)
                goto out;
        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
                goto out;
        if (should_lock) {
                __acquire(ext4_group_lock_ptr(sb, group));
                ext4_unlock_group(sb, group);
        }

        /*
         * From here until the lock is retaken below the group lock is not
         * held, so the direct returns in this section need no unlock.
         */

        /* We only do this if the grp has never been initialized */
        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                struct ext4_group_desc *gdp =
                        ext4_get_group_desc(sb, group, NULL);

                /*
                 * The descriptor is dereferenced below; treat a missing
                 * one like missing group info instead of oopsing.
                 */
                if (!gdp)
                        return -EFSCORRUPTED;

                /*
                 * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
                 * search to find large good chunks almost for free. If buddy
                 * data is not ready, then this optimization makes no sense. But
                 * we never skip the first block group in a flex_bg, since this
                 * gets used for metadata block allocation, and we want to make
                 * sure we locate metadata blocks in the first block group in
                 * the flex_bg if possible.
                 */
                if (!ext4_mb_cr_expensive(cr) &&
                    (!sbi->s_log_groups_per_flex ||
                     ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
                    !(ext4_has_group_desc_csum(sb) &&
                      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
                        return 0;
                /* Reuse the function-level ret; no shadowing local. */
                ret = ext4_mb_init_group(sb, group, GFP_NOFS);
                if (ret)
                        return ret;
        }

        if (should_lock) {
                ext4_lock_group(sb, group);
                __release(ext4_group_lock_ptr(sb, group));
        }
        ret = ext4_mb_good_group(ac, group, cr);
out:
        if (should_lock) {
                __acquire(ext4_group_lock_ptr(sb, group));
                ext4_unlock_group(sb, group);
        }
        return ret;
}

/*
 * Start prefetching @nr block bitmaps starting at @group, wrapping to
 * group 0 after the last group.  If @cnt is non-NULL it is incremented
 * for every bitmap buffer that was not yet up to date when it came back.
 * Return the next group which needs to be prefetched.
 */
ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
                              unsigned int nr, int *cnt)
{
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct blk_plug plug;

        blk_start_plug(&plug);
        for (; nr > 0; nr--) {
                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
                                                                  NULL);
                struct ext4_group_info *grp = ext4_get_group_info(sb, group);
                struct buffer_head *bh;

                /*
                 * Prefetch block groups with free blocks; but don't
                 * bother if it is marked uninitialized on disk, since
                 * it won't require I/O to read.  Also only try to
                 * prefetch once, so we avoid getblk() call, which can
                 * be expensive.
                 */
                if (gdp && grp &&
                    !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
                    EXT4_MB_GRP_NEED_INIT(grp) &&
                    ext4_free_group_clusters(sb, gdp) > 0) {
                        bh = ext4_read_block_bitmap_nowait(sb, group, true);
                        if (bh && !IS_ERR(bh)) {
                                if (cnt && !buffer_uptodate(bh))
                                        (*cnt)++;
                                brelse(bh);
                        }
                }

                /* Advance to the next group, wrapping at the end. */
                group++;
                if (group >= ngroups)
                        group = 0;
        }
        blk_finish_plug(&plug);

        return group;
}

/*
 * Prefetching reads the block bitmap into the buffer cache; but we
 * need to make sure that the buddy bitmap in the page cache has been
 * initialized.  Note that ext4_mb_init_group() will block if the I/O
 * is not yet completed, or indeed if ext4_mb_prefetch() did not start
 * the I/O.
 *
 * Walks backwards over the @nr groups that precede @group (wrapping
 * from group 0 to the last group) and stops at the first group whose
 * initialization fails.
 *
 * TODO: We should actually kick off the buddy bitmap setup in a work
 * queue when the buffer I/O is completed, so that we don't block
 * waiting for the block allocation bitmap read to finish when
 * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
 */
void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
                           unsigned int nr)
{
        for (; nr > 0; nr--) {
                struct ext4_group_desc *gdp;
                struct ext4_group_info *grp;

                /* Step back one group, wrapping from 0 to the last one. */
                if (group == 0)
                        group = ext4_get_groups_count(sb);
                group--;

                gdp = ext4_get_group_desc(sb, group, NULL);
                grp = ext4_get_group_info(sb, group);

                /* Only uninitialized groups with free clusters matter. */
                if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
                    ext4_free_group_clusters(sb, gdp) > 0 &&
                    ext4_mb_init_group(sb, group, GFP_NOFS))
                        return;
        }
}

/*
 * Main group-scanning loop of the allocator.
 *
 * Tries the goal first via ext4_mb_find_by_goal(); if that does not
 * settle the request, walks the block groups once per criterion,
 * starting at CR_POWER2_ALIGNED (when ac_2order is set) or
 * CR_GOAL_LEN_FAST and moving towards CR_ANY_FREE, until ac->ac_status
 * leaves AC_STATUS_CONTINUE or all criteria are exhausted.
 *
 * Returns 0, or a negative error from ext4_mb_find_by_goal() or
 * ext4_mb_load_buddy().  If nothing was found and no such error
 * occurred, the first error reported by ext4_mb_good_group_nolock()
 * (if any) is returned instead.
 */
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
        ext4_group_t prefetch_grp = 0, ngroups, group, i;
        enum criteria new_cr, cr = CR_GOAL_LEN_FAST;
        int err = 0, first_err = 0;
        unsigned int nr = 0, prefetch_ios = 0;
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        struct ext4_buddy e4b;
        int lost;

        sb = ac->ac_sb;
        sbi = EXT4_SB(sb);
        ngroups = ext4_get_groups_count(sb);
        /* non-extent files are limited to low blocks/groups */
        if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
                ngroups = sbi->s_blockfile_groups;

        BUG_ON(ac->ac_status == AC_STATUS_FOUND);

        /* first, try the goal */
        err = ext4_mb_find_by_goal(ac, &e4b);
        if (err || ac->ac_status == AC_STATUS_FOUND)
                goto out;

        /* Caller accepts the goal only: do not scan any other group. */
        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
                goto out;

        /*
         * ac->ac_2order is set only if the fe_len is a power of 2
         * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
         * so that we try exact allocation using buddy.
         */
        i = fls(ac->ac_g_ex.fe_len);
        ac->ac_2order = 0;
        /*
         * We search using buddy data only if the order of the request
         * is greater than or equal to sbi->s_mb_order2_reqs.
         * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
         * We also support searching for power-of-two requests only for
         * requests up to the maximum buddy size we have constructed.
         */
        if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
                if (is_power_of_2(ac->ac_g_ex.fe_len))
                        ac->ac_2order = array_index_nospec(i - 1,
                                                           MB_NUM_ORDERS(sb));
        }

        /* if stream allocation is enabled, use global goal */
        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
                /* TBD: may be hot point */
                spin_lock(&sbi->s_md_lock);
                ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
                ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
                spin_unlock(&sbi->s_md_lock);
        }

        /*
         * Let's just scan groups to find more-less suitable blocks We
         * start with CR_GOAL_LEN_FAST, unless it is power of 2
         * aligned, in which case let's do that faster approach first.
         */
        if (ac->ac_2order)
                cr = CR_POWER2_ALIGNED;
repeat:
        /*
         * Entered from above and re-entered via "goto repeat" with cr
         * already set; the for statement has no initializer on purpose.
         */
        for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
                ac->ac_criteria = cr;
                /*
                 * searching for the right group start
                 * from the goal value specified
                 */
                group = ac->ac_g_ex.fe_group;
                ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
                prefetch_grp = group;
                nr = 0;

                /*
                 * i only counts the groups visited; the group actually
                 * examined is picked by ext4_mb_choose_next_group(), which
                 * may also change new_cr.
                 */
                for (i = 0, new_cr = cr; i < ngroups; i++,
                     ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
                        int ret = 0;

                        cond_resched();
                        /* Criterion was changed: restart the scan with it. */
                        if (new_cr != cr) {
                                cr = new_cr;
                                goto repeat;
                        }

                        /*
                         * Batch reads of the block allocation bitmaps
                         * to get multiple READs in flight; limit
                         * prefetching at inexpensive CR, otherwise mballoc
                         * can spend a lot of time loading imperfect groups
                         */
                        if ((prefetch_grp == group) &&
                            (ext4_mb_cr_expensive(cr) ||
                             prefetch_ios < sbi->s_mb_prefetch_limit)) {
                                nr = sbi->s_mb_prefetch;
                                if (ext4_has_feature_flex_bg(sb)) {
                                        /*
                                         * Prefetch at most up to the end of
                                         * the current flex group.
                                         */
                                        nr = 1 << sbi->s_log_groups_per_flex;
                                        nr -= group & (nr - 1);
                                        nr = min(nr, sbi->s_mb_prefetch);
                                }
                                prefetch_grp = ext4_mb_prefetch(sb, group,
                                                        nr, &prefetch_ios);
                        }

                        /* This now checks without needing the buddy page */
                        ret = ext4_mb_good_group_nolock(ac, group, cr);
                        if (ret <= 0) {
                                /*
                                 * Group skipped (0) or check failed (< 0):
                                 * keep the first error and move on.
                                 */
                                if (!first_err)
                                        first_err = ret;
                                continue;
                        }

                        err = ext4_mb_load_buddy(sb, group, &e4b);
                        if (err)
                                goto out;

                        ext4_lock_group(sb, group);

                        /*
                         * We need to check again after locking the
                         * block group
                         */
                        ret = ext4_mb_good_group(ac, group, cr);
                        if (ret == 0) {
                                ext4_unlock_group(sb, group);
                                ext4_mb_unload_buddy(&e4b);
                                continue;
                        }

                        ac->ac_groups_scanned++;
                        if (cr == CR_POWER2_ALIGNED)
                                ext4_mb_simple_scan_group(ac, &e4b);
                        else {
                                /*
                                 * A stripe is configured and the goal length
                                 * is a multiple of it (in clusters).
                                 */
                                bool is_stripe_aligned = sbi->s_stripe &&
                                        !(ac->ac_g_ex.fe_len %
                                          EXT4_B2C(sbi, sbi->s_stripe));

                                if ((cr == CR_GOAL_LEN_FAST ||
                                     cr == CR_BEST_AVAIL_LEN) &&
                                    is_stripe_aligned)
                                        ext4_mb_scan_aligned(ac, &e4b);

                                /* Aligned scan found nothing: full scan. */
                                if (ac->ac_status == AC_STATUS_CONTINUE)
                                        ext4_mb_complex_scan_group(ac, &e4b);
                        }

                        ext4_unlock_group(sb, group);
                        ext4_mb_unload_buddy(&e4b);

                        if (ac->ac_status != AC_STATUS_CONTINUE)
                                break;
                }
                /* Processed all groups and haven't found blocks */
                if (sbi->s_mb_stats && i == ngroups)
                        atomic64_inc(&sbi->s_bal_cX_failed[cr]);

                if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN)
                        /* Reset goal length to original goal length before
                         * falling into CR_GOAL_LEN_SLOW */
                        ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
        }

        if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
            !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
                /*
                 * We've been searching too long. Let's try to allocate
                 * the best chunk we've found so far
                 */
                ext4_mb_try_best_found(ac, &e4b);
                if (ac->ac_status != AC_STATUS_FOUND) {
                        /*
                         * Someone more lucky has already allocated it.
                         * The only thing we can do is just take first
                         * found block(s)
                         */
                        lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
                        mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
                                 ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
                                 ac->ac_b_ex.fe_len, lost);

                        /*
                         * Forget the lost extent and rescan with the most
                         * relaxed criterion; EXT4_MB_HINT_FIRST keeps this
                         * retry from being taken a second time.
                         */
                        ac->ac_b_ex.fe_group = 0;
                        ac->ac_b_ex.fe_start = 0;
                        ac->ac_b_ex.fe_len = 0;
                        ac->ac_status = AC_STATUS_CONTINUE;
                        ac->ac_flags |= EXT4_MB_HINT_FIRST;
                        cr = CR_ANY_FREE;
                        goto repeat;
                }
        }

        if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
                atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
out:
        /*
         * Nothing found and no hard error: report the first error seen
         * while checking groups, if there was one.
         */
        if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
                err = first_err;

        mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
                 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
                 ac->ac_flags, cr, err);

        /*
         * Pass the size of the last prefetch batch and the group it ended
         * at to ext4_mb_prefetch_fini().
         */
        if (nr)
                ext4_mb_prefetch_fini(sb, prefetch_grp, nr);

        return err;
}

/*
 * seq_file ->start callback for the per-group listing.  The iterator
 * token is the group number plus one, so that group 0 does not end up
 * as a NULL pointer; NULL means *pos is out of range.
 */
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
        struct super_block *sb = pde_data(file_inode(seq->file));
        ext4_group_t token;

        if (*pos >= 0 && *pos < ext4_get_groups_count(sb)) {
                token = *pos + 1;
                return (void *)(unsigned long)token;
        }

        return NULL;
}

/*
 * seq_file ->next callback for the per-group listing: advance *pos and
 * return the token (group number plus one) for the new position, or
 * NULL once *pos is out of range.
 */
static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct super_block *sb = pde_data(file_inode(seq->file));
        ext4_group_t token;

        *pos += 1;
        if (*pos >= 0 && *pos < ext4_get_groups_count(sb)) {
                token = *pos + 1;
                return (void *)(unsigned long)token;
        }

        return NULL;
}

/*
 * seq_file ->show callback: print one line of free-space statistics for
 * the group encoded in @v (group number plus one).  A column header is
 * printed before group 0.  Always returns 0; a buddy load failure is
 * reported as text in the output instead.
 */
static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
{
        struct super_block *sb = pde_data(file_inode(seq->file));
        ext4_group_t group = (ext4_group_t) ((unsigned long) v);
        unsigned char blocksize_bits = min_t(unsigned char,
                                             sb->s_blocksize_bits,
                                             EXT4_MAX_BLOCK_LOG_SIZE);
        /*
         * Local copy of the group info plus room for the counters that
         * follow it (presumably bb_counters[] is the trailing array of
         * struct ext4_group_info).
         */
        struct sg {
                struct ext4_group_info info;
                ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
        } snap;
        struct ext4_group_info *grinfo;
        struct ext4_buddy e4b;
        char nbuf[16];
        int order, err, len;

        /* Undo the +1 applied by the start/next callbacks. */
        group--;
        if (group == 0)
                seq_puts(seq, "#group: free  frags first ["
                              " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
                              " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");

        grinfo = ext4_get_group_info(sb, group);
        if (!grinfo)
                return 0;

        /* Load the group info in memory only if not already loaded. */
        if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
                err = ext4_mb_load_buddy(sb, group, &e4b);
                if (err) {
                        seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf));
                        return 0;
                }
                ext4_mb_unload_buddy(&e4b);
        }

        /*
         * We care only about free space counters in the group info and
         * these are safe to access even after the buddy has been unloaded
         */
        len = (blocksize_bits + 2) * sizeof(snap.info.bb_counters[0]) +
                sizeof(struct ext4_group_info);
        memcpy(&snap, grinfo, len);

        seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, snap.info.bb_free,
                        snap.info.bb_fragments, snap.info.bb_first_free);
        /* Always 14 columns; orders that were not copied print as 0. */
        for (order = 0; order < 14; order++)
                seq_printf(seq, " %-5u", order < blocksize_bits + 2 ?
                                snap.info.bb_counters[order] : 0);
        seq_puts(seq, " ]");
        if (EXT4_MB_GRP_BBITMAP_CORRUPT(&snap.info))
                seq_puts(seq, " Block bitmap corrupted!");
        seq_puts(seq, "\n");

        return 0;
}

/*
 * seq_file ->stop callback for the per-group listing.  Intentionally
 * empty: the start/next callbacks only encode a group number in the
 * iterator pointer, so there is nothing to release here.
 */
static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
{
}

/*
 * seq_file operations for the per-group free-space listing, wiring up
 * the ext4_mb_seq_groups_{start,next,stop,show}() callbacks.
 */
const struct seq_operations ext4_mb_seq_groups_ops = {
        .start  = ext4_mb_seq_groups_start,
        .next   = ext4_mb_seq_groups_next,
        .stop   = ext4_mb_seq_groups_stop,
        .show   = ext4_mb_seq_groups_show,
};

int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
{
        struct super_block *sb = seq->private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        seq_puts(seq, "mballoc:\n");
        if (!sbi->s_mb_stats) {
                seq_puts(seq, "\tmb stats collection turned off.\n");
                seq_puts(
                        seq,
                        "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
                return 0;
        }
        seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
        seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));

        seq_printf(seq, "\tgroups_scanned: %u\n",
                   atomic_read(&sbi->s_bal_groups_scanned));

        /* CR_POWER2_ALIGNED stats */
        seq_puts(seq, "\tcr_p2_aligned_stats:\n");
        seq_printf(seq, "\t\thits: %llu\n",
                   atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED]));
        seq_printf(
                seq, "\t\tgroups_considered: %llu\n",
                atomic64_read(
                        &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]));
        seq_printf(seq, "\t\textents_scanned: %u\n",
                   atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED]));
        seq_printf(seq, "\t\tuseless_loops: %llu\n",
                   atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED]));
        seq_printf(seq, "\t\tbad_suggestions: %u\n",
                   atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions));

        /* CR_GOAL_LEN_FAST stats */
        seq_puts(seq, "\tcr_goal_fast_stats:\n");
        seq_printf(seq, "\t\thits: %llu\n",
                   atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST]));
        seq_printf(seq, "\t\tgroups_considered: %llu\n",
                   atomic64_read(
                           &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST]));
        seq_printf(seq, "\t\textents_scanned: %u\n",
                   atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST]));
        seq_printf(seq, "\t\tuseless_loops: %llu\n",
                   atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST]));
        seq_printf(seq, "\t\tbad_suggestions: %u\n",
                   atomic_read(&sbi->s_bal_goal_fast_bad_suggestions));

        /* CR_BEST_AVAIL_LEN stats */
        seq_puts(seq, "\tcr_best_avail_stats:\n");
        seq_printf(seq, "\t\thits: %llu\n",
                   atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN]));
        seq_printf(
                seq, "\t\tgroups_considered: %llu\n",
                atomic64_read(
                        &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN]));
        seq_printf(seq, "\t\textents_scanned: %u\n",
                   atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN]));
        seq_printf(seq, "\t\tuseless_loops: %llu\n",
                   atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN]));
        seq_printf(seq, "\t\tbad_suggestions: %u\n",
                   atomic_read(&sbi->s_bal_best_avail_bad_suggestions));

        /* CR_GOAL_LEN_SLOW stats */
        seq_puts(seq, "\tcr_goal_slow_stats:\n");
        seq_printf(seq, "\t\thits: %llu\n",
                   atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW]));
        seq_printf(seq, "\t\tgroups_considered: %llu\n",
                   atomic64_read(
                           &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW]));
        seq_printf(seq, "\t\textents_scanned: %u\n",
                   atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW]));
        seq_printf(seq, "\t\tuseless_loops: %llu\n",
                   atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW]));

        /* CR_ANY_FREE stats */
        seq_puts(seq, "\tcr_any_free_stats:\n");
        seq_printf(seq, "\t\thits: %llu\n",
                   atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE]));
        seq_printf(
                seq, "\t\tgroups_considered: %llu\n",
                atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE]));
        seq_printf(seq, "\t\textents_scanned: %u\n",
                   atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE]));
        seq_printf(seq, "\t\tuseless_loops: %llu\n",
                   atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE]));

        /* Aggregates */
        seq_printf(seq, "\textents_scanned: %u\n",
                   atomic_read(&sbi->s_bal_ex_scanned));
        seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
        seq_printf(seq, "\t\tlen_goal_hits: %u\n",
                   atomic_read(&sbi->s_bal_len_goals));
        seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
        seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
        seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
        seq_printf(seq, "\tbuddies_generated: %u/%u\n",
                   atomic_read(&sbi->s_mb_buddies_generated),
                   ext4_get_groups_count(sb));
        seq_printf(seq, "\tbuddies_time_used: %llu\n",
                   atomic64_read(&sbi->s_mb_generation_time));
        seq_printf(seq, "\tpreallocated: %u\n",
                   atomic_read(&sbi->s_mb_preallocated));
        seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded));
        return 0;
}

/*
 * seq_file ->start callback for the structure summary.  Valid positions
 * are 0 .. 2 * MB_NUM_ORDERS(sb) - 1; the token returned is the position
 * plus one so that position 0 does not end up as a NULL pointer.
 */
static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
{
        struct super_block *sb = pde_data(file_inode(seq->file));
        unsigned long token;

        if (*pos >= 0 && *pos < 2*MB_NUM_ORDERS(sb)) {
                token = *pos + 1;
                return (void *)token;
        }

        return NULL;
}

/*
 * seq_file ->next callback for the structure summary: advance *pos and
 * return the token (position plus one) for it, or NULL once *pos is
 * outside 0 .. 2 * MB_NUM_ORDERS(sb) - 1.
 */
static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct super_block *sb = pde_data(file_inode(seq->file));
        unsigned long token;

        *pos += 1;
        if (*pos >= 0 && *pos < 2*MB_NUM_ORDERS(sb)) {
                token = *pos + 1;
                return (void *)token;
        }

        return NULL;
}

static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
{
        struct super_block *sb = pde_data(file_inode(seq->file));
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned long position = ((unsigned long) v);
        struct ext4_group_info *grp;
        unsigned int count;

        position--;
        if (position >= MB_NUM_ORDERS(sb)) {
                position -= MB_NUM_ORDERS(sb);
                if (position == 0)
                        seq_puts(seq, "avg_fragment_size_lists:\n");

                count = 0;
                read_lock(&sbi->s_mb_avg_fragment_size_locks[position]);
                list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position],
                                    bb_avg_fragment_size_node)
                        count++;
                read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]);
                seq_printf(seq, "\tlist_order_%u_groups: %u\n",
                                        (unsigned int)position, count);
                return 0;
        }

        if (position == 0) {
                seq_printf(seq, "optimize_scan: %d\n",
                           test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0);
                seq_puts(seq, "max_free_order_lists:\n");
        }
        count = 0;
        read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
        list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
                            bb_largest_free_order_node)
                count++;
        read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
        seq_printf(seq, "\tlist_order_%u_groups: %u\n",
                   (unsigned int)position, count);

        return 0;
}

/*
 * seq_file ->stop callback.  ext4_mb_seq_structs_summary_start() takes no
 * locks and acquires no references, so there is nothing to undo here.
 */
static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
{
}

/*
 * seq_file iterator for the mballoc structures summary.  The callbacks
 * find their super_block through pde_data(file_inode(seq->file)).
 */
const struct seq_operations ext4_mb_seq_structs_summary_ops = {
        .start  = ext4_mb_seq_structs_summary_start,
        .next   = ext4_mb_seq_structs_summary_next,
        .stop   = ext4_mb_seq_structs_summary_stop,
        .show   = ext4_mb_seq_structs_summary_show,
};

/*
 * Return the ext4_group_info slab cache for a block size of
 * 2^blocksize_bits bytes.  BUGs if no cache is registered in that slot.
 */
static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
{
        struct kmem_cache *cache;

        cache = ext4_groupinfo_caches[blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE];
        BUG_ON(!cache);

        return cache;
}

/*
 * Allocate the top-level s_group_info array for the specified number
 * of groups
 *
 * If the current array already has enough slots nothing is done.
 * Otherwise a larger zeroed array is allocated, the existing slots are
 * copied into it and it is published in sbi->s_group_info.
 *
 * Returns 0 on success (including the "already large enough" case) or
 * -ENOMEM if the new array cannot be allocated.
 */
int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned size;
        struct ext4_group_info ***old_groupinfo, ***new_groupinfo;

        /* Slots needed: one per EXT4_DESC_PER_BLOCK(sb) groups, rounded up */
        size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
                EXT4_DESC_PER_BLOCK_BITS(sb);
        if (size <= sbi->s_group_info_size)
                return 0;

        /* From here on 'size' is a byte count, rounded up to a power of two */
        size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
        new_groupinfo = kvzalloc(size, GFP_KERNEL);
        if (!new_groupinfo) {
                ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
                return -ENOMEM;
        }
        rcu_read_lock();
        old_groupinfo = rcu_dereference(sbi->s_group_info);
        if (old_groupinfo)
                memcpy(new_groupinfo, old_groupinfo,
                       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
        rcu_read_unlock();
        /* Publish the new array, then record its capacity in slots */
        rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
        sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
        /*
         * NOTE(review): presumably ext4_kvfree_array_rcu() defers the free
         * until concurrent RCU readers are done with the old array - confirm.
         */
        if (old_groupinfo)
                ext4_kvfree_array_rcu(old_groupinfo);
        ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
                   sbi->s_group_info_size);
        return 0;
}

/*
 * Create and initialize ext4_group_info data for the given group.
 *
 * @sb:    filesystem the group belongs to
 * @group: block group number
 * @desc:  group descriptor of @group, used to seed bb_free
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                          struct ext4_group_desc *desc)
{
        int i;
        int metalen = 0;
        /* idx selects the second-level pointer table inside s_group_info */
        int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_info **meta_group_info;
        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);

        /*
         * First check if this group is the first of a reserved block.
         * If it's true, we have to allocate a new table of pointers
         * to ext4_group_info structures
         */
        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
                metalen = sizeof(*meta_group_info) <<
                        EXT4_DESC_PER_BLOCK_BITS(sb);
                meta_group_info = kmalloc(metalen, GFP_NOFS);
                if (meta_group_info == NULL) {
                        ext4_msg(sb, KERN_ERR, "can't allocate mem "
                                 "for a buddy group");
                        return -ENOMEM;
                }
                rcu_read_lock();
                rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
                rcu_read_unlock();
        }

        meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
        /* i is the slot of this group within its pointer table */
        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);

        meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
        if (meta_group_info[i] == NULL) {
                ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
                goto exit_group_info;
        }
        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
                &(meta_group_info[i]->bb_state));

        /*
         * initialize bb_free to be able to skip
         * empty groups without initialization
         */
        if (ext4_has_group_desc_csum(sb) &&
            (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
                meta_group_info[i]->bb_free =
                        ext4_free_clusters_after_init(sb, group, desc);
        } else {
                meta_group_info[i]->bb_free =
                        ext4_free_group_clusters(sb, desc);
        }

        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
        init_rwsem(&meta_group_info[i]->alloc_sem);
        meta_group_info[i]->bb_free_root = RB_ROOT;
        INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
        INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
        meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
        meta_group_info[i]->bb_avg_fragment_size_order = -1;  /* uninit */
        meta_group_info[i]->bb_group = group;

        mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
        return 0;

exit_group_info:
        /* If a meta_group_info table has been allocated, release it now */
        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
                struct ext4_group_info ***group_info;

                rcu_read_lock();
                group_info = rcu_dereference(sbi->s_group_info);
                kfree(group_info[idx]);
                /* Clear the slot so that it no longer points at freed memory */
                group_info[idx] = NULL;
                rcu_read_unlock();
        }
        return -ENOMEM;
} /* ext4_mb_add_groupinfo */

/*
 * Build the per-group allocator metadata for @sb.
 *
 * Sizes the top-level s_group_info array, creates the buddy cache inode,
 * attaches an ext4_group_info to every block group and finally derives the
 * prefetch tunables s_mb_prefetch and s_mb_prefetch_limit.
 *
 * Returns 0 on success.  A failure of ext4_mb_alloc_groupinfo() is returned
 * as is; every later failure releases what was set up here and returns
 * -ENOMEM.
 */
static int ext4_mb_init_backend(struct super_block *sb)
{
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        ext4_group_t i;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int err;
        struct ext4_group_desc *desc;
        struct ext4_group_info ***group_info;
        struct kmem_cache *cachep;

        err = ext4_mb_alloc_groupinfo(sb, ngroups);
        if (err)
                return err;

        sbi->s_buddy_cache = new_inode(sb);
        if (sbi->s_buddy_cache == NULL) {
                ext4_msg(sb, KERN_ERR, "can't get new inode");
                goto err_freesgi;
        }
        /* To avoid potentially colliding with a valid on-disk inode number,
         * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
         * not in the inode hash, so it should never be found by iget(), but
         * this will avoid confusion if it ever shows up during debugging. */
        sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
        for (i = 0; i < ngroups; i++) {
                cond_resched();
                desc = ext4_get_group_desc(sb, i, NULL);
                if (desc == NULL) {
                        ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
                        goto err_freebuddy;
                }
                if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
                        goto err_freebuddy;
        }

        if (ext4_has_feature_flex_bg(sb)) {
                /* A single flex group is supposed to be read by a single IO.
                 * s_mb_prefetch is an unsigned int, so a shift count of 32
                 * or more cannot be represented and is rejected.
                 */
                if (sbi->s_es->s_log_groups_per_flex >= 32) {
                        ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
                        goto err_freebuddy;
                }
                /*
                 * Shift an unsigned 1: a count of 31 passes the check above
                 * and would shift into the sign bit of a signed int.
                 */
                sbi->s_mb_prefetch = min_t(uint, 1U << sbi->s_es->s_log_groups_per_flex,
                        BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
                sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
        } else {
                sbi->s_mb_prefetch = 32;
        }
        if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
                sbi->s_mb_prefetch = ext4_get_groups_count(sb);
        /*
         * How many real IOs to prefetch within a single allocation at
         * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is a CPU-related
         * optimization we shouldn't try to load too many groups, at some point
         * we should start to use what we've got in memory.
         * with an average random access time 5ms, it'd take a second to get
         * 200 groups (* N with flex_bg), so let's make this limit 4
         */
        sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
        if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
                sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);

        return 0;

err_freebuddy:
        /* Free the group infos of groups [0, i) that were set up so far */
        cachep = get_groupinfo_cache(sb->s_blocksize_bits);
        while (i-- > 0) {
                struct ext4_group_info *grp = ext4_get_group_info(sb, i);

                if (grp)
                        kmem_cache_free(cachep, grp);
        }
        /* Then every second-level pointer table; unused slots are NULL */
        i = sbi->s_group_info_size;
        rcu_read_lock();
        group_info = rcu_dereference(sbi->s_group_info);
        while (i-- > 0)
                kfree(group_info[i]);
        rcu_read_unlock();
        iput(sbi->s_buddy_cache);
err_freesgi:
        rcu_read_lock();
        kvfree(rcu_dereference(sbi->s_group_info));
        rcu_read_unlock();
        return -ENOMEM;
}

/*
 * Destroy every groupinfo slab cache and clear its slot in
 * ext4_groupinfo_caches[].
 */
static void ext4_groupinfo_destroy_slabs(void)
{
        int idx = 0;

        while (idx < NR_GRPINFO_CACHES) {
                kmem_cache_destroy(ext4_groupinfo_caches[idx]);
                ext4_groupinfo_caches[idx] = NULL;
                idx++;
        }
}

/*
 * Make sure the groupinfo slab cache for block size @size exists.
 *
 * Creation is serialized by a function-local mutex, so a cache slot that
 * is already populated is left untouched.
 *
 * Returns 0 if the cache exists or was created, -EINVAL if @size maps
 * beyond the last cache slot, or -ENOMEM if the cache cannot be created.
 */
static int ext4_groupinfo_create_slab(size_t size)
{
        static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
        int blocksize_bits = order_base_2(size);
        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
        struct kmem_cache *cachep;
        int ret = 0;

        if (cache_index >= NR_GRPINFO_CACHES)
                return -EINVAL;

        /* Sizes below the minimum share the first slot. */
        if (unlikely(cache_index < 0))
                cache_index = 0;

        mutex_lock(&ext4_grpinfo_slab_create_mutex);
        if (!ext4_groupinfo_caches[cache_index]) {
                /* Object size covers bb_counters[0 .. blocksize_bits + 1]. */
                cachep = kmem_cache_create(
                                ext4_groupinfo_slab_names[cache_index],
                                offsetof(struct ext4_group_info,
                                         bb_counters[blocksize_bits + 2]),
                                0, SLAB_RECLAIM_ACCOUNT, NULL);
                ext4_groupinfo_caches[cache_index] = cachep;
                if (!cachep)
                        ret = -ENOMEM;
        }
        mutex_unlock(&ext4_grpinfo_slab_create_mutex);

        if (ret)
                printk(KERN_EMERG
                       "EXT4-fs: no memory for groupinfo slab cache\n");

        return ret;
}

/*
 * Work function behind sbi->s_discard_work.
 *
 * Takes over everything queued on sbi->s_discard_list and, for each
 * ext4_free_data entry, tries to trim the cluster range it describes
 * before freeing the entry.  Every entry is freed, trimmed or not.
 */
static void ext4_discard_work(struct work_struct *work)
{
        struct ext4_sb_info *sbi = container_of(work,
                        struct ext4_sb_info, s_discard_work);
        struct super_block *sb = sbi->s_sb;
        struct ext4_free_data *fd, *nfd;
        struct ext4_buddy e4b;
        LIST_HEAD(discard_list);
        ext4_group_t grp, load_grp;
        int err = 0;

        /* Move the pending entries to a private list under s_md_lock */
        spin_lock(&sbi->s_md_lock);
        list_splice_init(&sbi->s_discard_list, &discard_list);
        spin_unlock(&sbi->s_md_lock);

        /* load_grp == UINT_MAX means "no buddy currently loaded in e4b" */
        load_grp = UINT_MAX;
        list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) {
                /*
                 * If filesystem is umounting or no memory or suffering
                 * from no space, give up the discard
                 */
                if ((sb->s_flags & SB_ACTIVE) && !err &&
                    !atomic_read(&sbi->s_retry_alloc_pending)) {
                        grp = fd->efd_group;
                        /* Reuse the loaded buddy while the group stays the same */
                        if (grp != load_grp) {
                                if (load_grp != UINT_MAX)
                                        ext4_mb_unload_buddy(&e4b);

                                err = ext4_mb_load_buddy(sb, grp, &e4b);
                                if (err) {
                                        /*
                                         * err stays set, so the remaining
                                         * entries are freed without trimming.
                                         */
                                        kmem_cache_free(ext4_free_data_cachep, fd);
                                        load_grp = UINT_MAX;
                                        continue;
                                } else {
                                        load_grp = grp;
                                }
                        }

                        ext4_lock_group(sb, grp);
                        ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
                                                fd->efd_start_cluster + fd->efd_count - 1, 1);
                        ext4_unlock_group(sb, grp);
                }
                kmem_cache_free(ext4_free_data_cachep, fd);
        }

        /* Drop the buddy that is still loaded, if any */
        if (load_grp != UINT_MAX)
                ext4_mb_unload_buddy(&e4b);
}

/*
 * Initialize the multiblock allocator state of @sb.
 *
 * Allocates the per-order offset/max tables and list/lock arrays, sets the
 * default tunables, sets up the per-CPU locality groups and finally builds
 * the per-group data through ext4_mb_init_backend().
 *
 * Returns 0 on success or a negative errno; on failure the arrays
 * allocated here are freed again.
 */
int ext4_mb_init(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned i, j;
        unsigned offset, offset_incr;
        unsigned max;
        int ret;

        i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);

        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_offsets == NULL) {
                ret = -ENOMEM;
                goto out;
        }

        i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_maxs == NULL) {
                ret = -ENOMEM;
                goto out;
        }

        ret = ext4_groupinfo_create_slab(sb->s_blocksize);
        if (ret < 0)
                goto out;

        /* order 0 is regular bitmap */
        sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
        sbi->s_mb_offsets[0] = 0;

        /*
         * For each further order the offset advances by the previous
         * increment while both the increment and the max are halved.
         */
        i = 1;
        offset = 0;
        offset_incr = 1 << (sb->s_blocksize_bits - 1);
        max = sb->s_blocksize << 2;
        do {
                sbi->s_mb_offsets[i] = offset;
                sbi->s_mb_maxs[i] = max;
                offset += offset_incr;
                offset_incr = offset_incr >> 1;
                max = max >> 1;
                i++;
        } while (i < MB_NUM_ORDERS(sb));

        /* One list head and one rwlock per order, for both list families */
        sbi->s_mb_avg_fragment_size =
                kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
                        GFP_KERNEL);
        if (!sbi->s_mb_avg_fragment_size) {
                ret = -ENOMEM;
                goto out;
        }
        sbi->s_mb_avg_fragment_size_locks =
                kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
                        GFP_KERNEL);
        if (!sbi->s_mb_avg_fragment_size_locks) {
                ret = -ENOMEM;
                goto out;
        }
        for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
                INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
                rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
        }
        sbi->s_mb_largest_free_orders =
                kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
                        GFP_KERNEL);
        if (!sbi->s_mb_largest_free_orders) {
                ret = -ENOMEM;
                goto out;
        }
        sbi->s_mb_largest_free_orders_locks =
                kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
                        GFP_KERNEL);
        if (!sbi->s_mb_largest_free_orders_locks) {
                ret = -ENOMEM;
                goto out;
        }
        for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
                INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
                rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
        }

        spin_lock_init(&sbi->s_md_lock);
        sbi->s_mb_free_pending = 0;
        INIT_LIST_HEAD(&sbi->s_freed_data_list[0]);
        INIT_LIST_HEAD(&sbi->s_freed_data_list[1]);
        INIT_LIST_HEAD(&sbi->s_discard_list);
        INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
        atomic_set(&sbi->s_retry_alloc_pending, 0);

        /* Default tunables */
        sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
        sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
        sbi->s_mb_stats = MB_DEFAULT_STATS;
        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
        sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER;

        /*
         * The default group preallocation is 512, which for 4k block
         * sizes translates to 2 megabytes.  However for bigalloc file
         * systems, this is probably too big (i.e, if the cluster size
         * is 1 megabyte, then group preallocation size becomes half a
         * gigabyte!).  As a default, we will keep a two megabyte
         * group prealloc size for cluster sizes up to 64k, and after
         * that, we will force a minimum group preallocation size of
         * 32 clusters.  This translates to 8 megs when the cluster
         * size is 256k, and 32 megs when the cluster size is 1 meg,
         * which seems reasonable as a default.
         */
        sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
                                       sbi->s_cluster_bits, 32);
        /*
         * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
         * to the lowest multiple of s_stripe which is bigger than
         * the s_mb_group_prealloc as determined above. We want
         * the preallocation size to be an exact multiple of the
         * RAID stripe size so that preallocations don't fragment
         * the stripes.
         */
        if (sbi->s_stripe > 1) {
                sbi->s_mb_group_prealloc = roundup(
                        sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe));
        }

        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
        if (sbi->s_locality_groups == NULL) {
                ret = -ENOMEM;
                goto out;
        }
        for_each_possible_cpu(i) {
                struct ext4_locality_group *lg;
                lg = per_cpu_ptr(sbi->s_locality_groups, i);
                mutex_init(&lg->lg_mutex);
                for (j = 0; j < PREALLOC_TB_SIZE; j++)
                        INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
                spin_lock_init(&lg->lg_prealloc_lock);
        }

        /* Non-rotational devices get a linear-groups limit of 0 */
        if (bdev_nonrot(sb->s_bdev))
                sbi->s_mb_max_linear_groups = 0;
        else
                sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
        /* init file for buddy data */
        ret = ext4_mb_init_backend(sb);
        if (ret != 0)
                goto out_free_locality_groups;

        return 0;

out_free_locality_groups:
        free_percpu(sbi->s_locality_groups);
        sbi->s_locality_groups = NULL;
out:
        /*
         * NOTE(review): this label is reached before some of these arrays
         * are allocated; that assumes the sbi fields start out NULL so the
         * kfree() calls are no-ops - confirm against the sbi allocation.
         */
        kfree(sbi->s_mb_avg_fragment_size);
        kfree(sbi->s_mb_avg_fragment_size_locks);
        kfree(sbi->s_mb_largest_free_orders);
        kfree(sbi->s_mb_largest_free_orders_locks);
        kfree(sbi->s_mb_offsets);
        sbi->s_mb_offsets = NULL;
        kfree(sbi->s_mb_maxs);
        sbi->s_mb_maxs = NULL;
        return ret;
}

/* need to called with the ext4 group lock held */
static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
{
        struct ext4_prealloc_space *pa;
        struct list_head *cur, *tmp;
        int count = 0;

        list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
                list_del(&pa->pa_group_list);
                count++;
                kmem_cache_free(ext4_pspace_cachep, pa);
        }
        return count;
}

/*
 * Tear down the multiblock allocator state of @sb: per-group infos and
 * their leftover preallocations, the s_group_info tables, the per-order
 * arrays, the buddy cache inode and the per-CPU locality groups.  If
 * s_mb_stats is set, the allocator statistics are logged on the way out.
 */
void ext4_mb_release(struct super_block *sb)
{
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        ext4_group_t i;
        int num_meta_group_infos;
        struct ext4_group_info *grinfo, ***group_info;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
        int count;

        if (test_opt(sb, DISCARD)) {
                /*
                 * wait the discard work to drain all of ext4_free_data
                 */
                flush_work(&sbi->s_discard_work);
                WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
        }

        if (sbi->s_group_info) {
                /* Free each group's info together with any leftover PAs */
                for (i = 0; i < ngroups; i++) {
                        cond_resched();
                        grinfo = ext4_get_group_info(sb, i);
                        if (!grinfo)
                                continue;
                        mb_group_bb_bitmap_free(grinfo);
                        ext4_lock_group(sb, i);
                        count = ext4_mb_cleanup_pa(grinfo);
                        if (count)
                                mb_debug(sb, "mballoc: %d PAs left\n",
                                         count);
                        ext4_unlock_group(sb, i);
                        kmem_cache_free(cachep, grinfo);
                }
                /* Then the second-level tables and the top-level array */
                num_meta_group_infos = (ngroups +
                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
                        EXT4_DESC_PER_BLOCK_BITS(sb);
                rcu_read_lock();
                group_info = rcu_dereference(sbi->s_group_info);
                for (i = 0; i < num_meta_group_infos; i++)
                        kfree(group_info[i]);
                kvfree(group_info);
                rcu_read_unlock();
        }
        kfree(sbi->s_mb_avg_fragment_size);
        kfree(sbi->s_mb_avg_fragment_size_locks);
        kfree(sbi->s_mb_largest_free_orders);
        kfree(sbi->s_mb_largest_free_orders_locks);
        kfree(sbi->s_mb_offsets);
        kfree(sbi->s_mb_maxs);
        iput(sbi->s_buddy_cache);
        /* Log the allocator statistics if they were being collected */
        if (sbi->s_mb_stats) {
                ext4_msg(sb, KERN_INFO,
                       "mballoc: %u blocks %u reqs (%u success)",
                                atomic_read(&sbi->s_bal_allocated),
                                atomic_read(&sbi->s_bal_reqs),
                                atomic_read(&sbi->s_bal_success));
                ext4_msg(sb, KERN_INFO,
                      "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
                                "%u 2^N hits, %u breaks, %u lost",
                                atomic_read(&sbi->s_bal_ex_scanned),
                                atomic_read(&sbi->s_bal_groups_scanned),
                                atomic_read(&sbi->s_bal_goals),
                                atomic_read(&sbi->s_bal_2orders),
                                atomic_read(&sbi->s_bal_breaks),
                                atomic_read(&sbi->s_mb_lost_chunks));
                ext4_msg(sb, KERN_INFO,
                       "mballoc: %u generated and it took %llu",
                                atomic_read(&sbi->s_mb_buddies_generated),
                                atomic64_read(&sbi->s_mb_generation_time));
                ext4_msg(sb, KERN_INFO,
                       "mballoc: %u preallocated, %u discarded",
                                atomic_read(&sbi->s_mb_preallocated),
                                atomic_read(&sbi->s_mb_discarded));
        }

        free_percpu(sbi->s_locality_groups);
}

/*
 * Issue a discard for @count clusters starting at cluster @cluster of
 * block group @block_group.  The range is converted from clusters to
 * filesystem blocks before it is traced and passed to sb_issue_discard(),
 * whose return value is handed back to the caller.
 */
static inline int ext4_issue_discard(struct super_block *sb,
                ext4_group_t block_group, ext4_grpblk_t cluster, int count)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_fsblk_t first_block;

        first_block = EXT4_C2B(sbi, cluster) +
                      ext4_group_first_block_no(sb, block_group);
        count = EXT4_C2B(sbi, count);

        trace_ext4_discard_blocks(sb, (unsigned long long) first_block,
                                  count);

        return sb_issue_discard(sb, first_block, count, GFP_NOFS, 0);
}

/*
 * Return the clusters described by @entry to the buddy data of their group.
 *
 * Subtracts the entry from s_mb_free_pending, removes it from the group's
 * bb_free_root rb tree and hands the range to mb_free_blocks().  @entry
 * itself is not freed here.
 */
static void ext4_free_data_in_buddy(struct super_block *sb,
                                    struct ext4_free_data *entry)
{
        struct ext4_buddy e4b;
        struct ext4_group_info *db;
        int err, count = 0;

        mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
                 entry->efd_count, entry->efd_group, entry);

        err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
        /* we expect to find existing buddy because it's pinned */
        BUG_ON(err != 0);

        spin_lock(&EXT4_SB(sb)->s_md_lock);
        EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
        spin_unlock(&EXT4_SB(sb)->s_md_lock);

        db = e4b.bd_info;
        /* there are blocks to put in buddy to make them really free */
        count += entry->efd_count;
        ext4_lock_group(sb, entry->efd_group);
        /* Take it out of per group rb tree */
        rb_erase(&entry->efd_node, &(db->bb_free_root));
        mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);

        /*
         * Clear the trimmed flag for the group so that the next
         * ext4_trim_fs can trim it.
         * If the volume is mounted with -o discard, online discard
         * is supported and the free blocks will be trimmed online.
         */
        if (!test_opt(sb, DISCARD))
                EXT4_MB_GRP_CLEAR_TRIMMED(db);

        if (!db->bb_free_root.rb_node) {
                /* No more items in the per group rb tree
                 * balance refcounts from ext4_mb_free_metadata()
                 */
                folio_put(e4b.bd_buddy_folio);
                folio_put(e4b.bd_bitmap_folio);
        }
        ext4_unlock_group(sb, entry->efd_group);
        ext4_mb_unload_buddy(&e4b);

        /* count only feeds this debug message */
        mb_debug(sb, "freed %d blocks in 1 structures\n", count);
}

/*
 * Called by the jbd2 layer once a commit has finished, at which point the
 * blocks that were released with that commit can really be freed.
 *
 * The entries queued for @commit_tid are returned to the buddy data.
 * With the DISCARD mount option they are then handed to the discard
 * worker; otherwise they are freed right here.
 */
void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct list_head *committed = &sbi->s_freed_data_list[commit_tid & 1];
        struct ext4_free_data *efd, *next;
        LIST_HEAD(freed_data_list);
        bool kick_worker;

        /* Detach everything queued for this commit onto a private list. */
        list_replace_init(committed, &freed_data_list);

        list_for_each_entry(efd, &freed_data_list, efd_list)
                ext4_free_data_in_buddy(sb, efd);

        if (!test_opt(sb, DISCARD)) {
                list_for_each_entry_safe(efd, next, &freed_data_list, efd_list)
                        kmem_cache_free(ext4_free_data_cachep, efd);
                return;
        }

        /* Queue the work only if the discard list was empty beforehand. */
        spin_lock(&sbi->s_md_lock);
        kick_worker = list_empty(&sbi->s_discard_list);
        list_splice_tail(&freed_data_list, &sbi->s_discard_list);
        spin_unlock(&sbi->s_md_lock);

        if (kick_worker)
                queue_work(system_unbound_wq, &sbi->s_discard_work);
}

/*
 * Create the slab caches for preallocation spaces, allocation contexts
 * and free-data entries.
 *
 * Returns 0 on success.  If any cache cannot be created, the caches
 * created before it are destroyed again and -ENOMEM is returned.
 */
int __init ext4_init_mballoc(void)
{
        ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
                                        SLAB_RECLAIM_ACCOUNT);
        if (!ext4_pspace_cachep)
                return -ENOMEM;

        ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
                                    SLAB_RECLAIM_ACCOUNT);
        if (!ext4_ac_cachep)
                goto destroy_pspace;

        ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
                                           SLAB_RECLAIM_ACCOUNT);
        if (!ext4_free_data_cachep)
                goto destroy_ac;

        return 0;

destroy_ac:
        kmem_cache_destroy(ext4_ac_cachep);
destroy_pspace:
        kmem_cache_destroy(ext4_pspace_cachep);
        return -ENOMEM;
}

/*
 * Destroy the three slab caches created by ext4_init_mballoc() as well as
 * the groupinfo slab caches.
 */
void ext4_exit_mballoc(void)
{
        /*
         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
         * before destroying the slab cache.
         */
        rcu_barrier();
        kmem_cache_destroy(ext4_pspace_cachep);
        kmem_cache_destroy(ext4_ac_cachep);
        kmem_cache_destroy(ext4_free_data_cachep);
        ext4_groupinfo_destroy_slabs();
}

/*
 * Flags for ext4_mb_mark_context():
 * EXT4_MB_BITMAP_MARKED_CHECK - count the bits that are already in the
 *      requested state, so that only bits that really change are accounted.
 * EXT4_MB_SYNC_UPDATE - call sync_dirty_buffer() on the bitmap and group
 *      descriptor buffers after they have been marked dirty.
 */
#define EXT4_MB_BITMAP_MARKED_CHECK 0x0001
#define EXT4_MB_SYNC_UPDATE 0x0002
/*
 * Set (@state true) or clear (@state false) @len bits starting at @blkoff
 * in the block bitmap of @group and adjust the free cluster counters to
 * match.
 *
 * @handle:      journal handle; when NULL no journal write access is
 *               requested for the two buffers
 * @flags:       combination of the EXT4_MB_* flags defined above
 * @ret_changed: if not NULL, set to 0 on entry and to the number of
 *               accounted clusters once the bitmap has been updated
 *
 * Returns 0 on success or a negative errno.
 */
static int
ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
                     ext4_group_t group, ext4_grpblk_t blkoff,
                     ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_group_desc *gdp;
        struct buffer_head *gdp_bh;
        int err;
        /* Without the MARKED_CHECK flag all @len bits count as changed */
        unsigned int i, already, changed = len;

        KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context,
                                   handle, sb, state, group, blkoff, len,
                                   flags, ret_changed);

        if (ret_changed)
                *ret_changed = 0;
        bitmap_bh = ext4_read_block_bitmap(sb, group);
        if (IS_ERR(bitmap_bh))
                return PTR_ERR(bitmap_bh);

        if (handle) {
                BUFFER_TRACE(bitmap_bh, "getting write access");
                err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
                                                    EXT4_JTR_NONE);
                if (err)
                        goto out_err;
        }

        /* Preset the error returned if the group descriptor is missing */
        err = -EIO;
        gdp = ext4_get_group_desc(sb, group, &gdp_bh);
        if (!gdp)
                goto out_err;

        if (handle) {
                BUFFER_TRACE(gdp_bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, sb, gdp_bh,
                                                    EXT4_JTR_NONE);
                if (err)
                        goto out_err;
        }

        ext4_lock_group(sb, group);
        /*
         * A group still flagged BLOCK_UNINIT gets the flag cleared and its
         * free cluster count seeded before the bitmap is touched.
         */
        if (ext4_has_group_desc_csum(sb) &&
            (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                ext4_free_group_clusters_set(sb, gdp,
                        ext4_free_clusters_after_init(sb, group, gdp));
        }

        if (flags & EXT4_MB_BITMAP_MARKED_CHECK) {
                /* Bits already in the target state must not be counted */
                already = 0;
                for (i = 0; i < len; i++)
                        if (mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
                                        state)
                                already++;
                changed = len - already;
        }

        if (state) {
                mb_set_bits(bitmap_bh->b_data, blkoff, len);
                ext4_free_group_clusters_set(sb, gdp,
                        ext4_free_group_clusters(sb, gdp) - changed);
        } else {
                mb_clear_bits(bitmap_bh->b_data, blkoff, len);
                ext4_free_group_clusters_set(sb, gdp,
                        ext4_free_group_clusters(sb, gdp) + changed);
        }

        /* Refresh both checksums while the group lock is still held */
        ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
        ext4_group_desc_csum_set(sb, group, gdp);
        ext4_unlock_group(sb, group);
        if (ret_changed)
                *ret_changed = changed;

        /* Mirror the change in the flex group's free cluster counter */
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, group);
                struct flex_groups *fg = sbi_array_rcu_deref(sbi,
                                           s_flex_groups, flex_group);

                if (state)
                        atomic64_sub(changed, &fg->free_clusters);
                else
                        atomic64_add(changed, &fg->free_clusters);
        }

        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
        if (err)
                goto out_err;
        err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
        if (err)
                goto out_err;

        /* The return values of sync_dirty_buffer() are not checked */
        if (flags & EXT4_MB_SYNC_UPDATE) {
                sync_dirty_buffer(bitmap_bh);
                sync_dirty_buffer(gdp_bh);
        }

out_err:
        /* Shared exit for success and failure; err is 0 on success */
        brelse(bitmap_bh);
        return err;
}

/*
 * Commit the extent chosen by the allocator (ac->ac_b_ex) to the block
 * bitmap / group descriptor and settle the superblock cluster counters.
 *
 * @ac:            allocation context; must be in AC_STATUS_FOUND state and
 *                 carry a non-empty best extent
 * @handle:        journal handle handed through to ext4_mb_mark_context()
 * @reserv_clstrs: number of clusters dropped from s_dirtyclusters_counter
 *                 when the request is not EXT4_MB_DELALLOC_RESERVED
 *
 * Returns 0 on success or a negative error code: -EIO when the group
 * descriptor cannot be obtained, -EFSCORRUPTED (or the marking error) when
 * the chosen range fails ext4_inode_block_valid(), otherwise whatever
 * ext4_mb_mark_context() reported.
 */
static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                                handle_t *handle, unsigned int reserv_clstrs)
{
        struct ext4_free_extent *bex = &ac->ac_b_ex;
        struct ext4_group_desc *desc;
        struct super_block *sb;
        struct ext4_sb_info *sbi;
        ext4_grpblk_t nr_changed;
        ext4_fsblk_t pblk;
        int mark_flags = 0;
        int nr_blocks;
        int ret;

        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
        BUG_ON(bex->fe_len <= 0);

        sb = ac->ac_sb;
        sbi = EXT4_SB(sb);

        desc = ext4_get_group_desc(sb, bex->fe_group, NULL);
        if (!desc)
                return -EIO;
        ext4_debug("using block group %u(%d)\n", bex->fe_group,
                        ext4_free_group_clusters(sb, desc));

        pblk = ext4_grp_offs_to_block(sb, bex);
        nr_blocks = EXT4_C2B(sbi, bex->fe_len);
        if (!ext4_inode_block_valid(ac->ac_inode, pblk, nr_blocks)) {
                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
                           "fs metadata", pblk, pblk+nr_blocks);
                /*
                 * The filesystem is mounted so that errors do not panic.
                 * Still mark the range as in use so it is not handed out
                 * again, and report corruption; the blocks involved are
                 * leaked.
                 */
                ret = ext4_mb_mark_context(handle, sb, true,
                                           bex->fe_group,
                                           bex->fe_start,
                                           bex->fe_len,
                                           0, NULL);
                return ret ? ret : -EFSCORRUPTED;
        }

#ifdef AGGRESSIVE_CHECK
        mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK;
#endif
        ret = ext4_mb_mark_context(handle, sb, true, bex->fe_group,
                                   bex->fe_start, bex->fe_len,
                                   mark_flags, &nr_changed);

        /* a failure that changed nothing needs no counter adjustment */
        if (ret && nr_changed == 0)
                return ret;

#ifdef AGGRESSIVE_CHECK
        BUG_ON(nr_changed != bex->fe_len);
#endif
        percpu_counter_sub(&sbi->s_freeclusters_counter, bex->fe_len);

        /*
         * Non-delalloc requests also give back their whole reservation
         * from the dirty cluster count. It should not go negative.
         */
        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
                percpu_counter_sub(&sbi->s_dirtyclusters_counter,
                                   reserv_clstrs);

        return ret;
}

/*
 * Idempotent helper for the Ext4 fast commit replay path: put the blocks
 * [block, block + len) into @state in the bitmaps and update the counters.
 *
 * The range is processed one block group at a time. Each piece is marked
 * with EXT4_MB_BITMAP_MARKED_CHECK | EXT4_MB_SYNC_UPDATE and without a
 * journal handle. Processing stops silently at the first piece that is
 * rejected by ext4_sb_block_valid() or that fails to be marked.
 */
void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
                     int len, bool state)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        while (len > 0) {
                unsigned int chunk, nr_clusters;
                ext4_grpblk_t off;
                ext4_group_t grp;
                int ret;

                ext4_get_group_no_and_offset(sb, block, &grp, &off);

                /*
                 * (block, len) may extend past the end of this group
                 * (e.g. with flex_bg). Clamp the piece handled in this
                 * iteration to the group boundary; the remainder is picked
                 * up, with the next group's metadata, on the following
                 * pass of the loop.
                 */
                chunk = min_t(unsigned int, (unsigned int)len,
                        EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, off));
                nr_clusters = EXT4_NUM_B2C(sbi, chunk);

                if (!ext4_sb_block_valid(sb, NULL, block, chunk)) {
                        ext4_error(sb, "Marking blocks in system zone - "
                                   "Block = %llu, len = %u",
                                   block, chunk);
                        break;
                }

                ret = ext4_mb_mark_context(NULL, sb, state,
                                           grp, off, nr_clusters,
                                           EXT4_MB_BITMAP_MARKED_CHECK |
                                           EXT4_MB_SYNC_UPDATE,
                                           NULL);
                if (ret)
                        break;

                block += chunk;
                len -= chunk;
                BUG_ON(len < 0);
        }
}

/*
 * here we normalize request for locality group
 * Group request are normalized to s_mb_group_prealloc, which goes to
 * s_strip if we set the same via mount option.
 * s_mb_group_prealloc can be configured via
 * /sys/fs/ext4/<partition>/mb_group_prealloc
 *
 * XXX: should we try to preallocate more than the group has now?
 */
static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
{
        struct super_block *sb = ac->ac_sb;
        struct ext4_locality_group *lg = ac->ac_lg;

        BUG_ON(lg == NULL);
        ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
        mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
}

/*
 * Pick the child to descend into while walking the inode PA rbtree.
 * The caller is assumed to hold the inode PA rbtree lock
 * (ei->i_prealloc_lock).
 *
 * new_start        logical start we are searching for
 * cur_start        logical start stored in the node being examined
 * node             the rb_tree node being examined
 *
 * Returns the left child when new_start sorts before cur_start, otherwise
 * the right child; either may be NULL.
 */
static inline struct rb_node*
ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node)
{
        return (new_start < cur_start) ? node->rb_left : node->rb_right;
}

/*
 * Sanity check: descend the inode PA rbtree towards @start and BUG if any
 * non-deleted preallocation visited on the way intersects [start, end).
 * Takes ei->i_prealloc_lock for reading and each visited pa's pa_lock.
 */
static inline void
ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
                          ext4_lblk_t start, loff_t end)
{
        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct rb_node *node;

        read_lock(&ei->i_prealloc_lock);

        node = ei->i_prealloc_node.rb_node;
        while (node) {
                struct ext4_prealloc_space *pa;
                ext4_lblk_t pa_start;
                loff_t pa_end;

                pa = rb_entry(node, struct ext4_prealloc_space,
                              pa_node.inode_node);
                pa_start = pa->pa_lstart;
                pa_end = pa_logical_end(sbi, pa);

                /* deleted PAs are allowed to overlap the range */
                spin_lock(&pa->pa_lock);
                if (pa->pa_deleted == 0)
                        BUG_ON(start < pa_end && end > pa_start);
                spin_unlock(&pa->pa_lock);

                node = ext4_mb_pa_rb_next_iter(start, pa_start, node);
        }

        read_unlock(&ei->i_prealloc_lock);
}

/*
 * Given an allocation context "ac" and a range "start", "end", check
 * and adjust boundaries if the range overlaps with any of the existing
 * preallocations stored in the corresponding inode of the allocation context.
 *
 * The range is only ever shrunk: *start may be raised to the logical end of
 * the closest non-deleted PA on the left of ac->ac_o_ex.fe_logical, and *end
 * may be lowered to the logical start of the closest non-deleted PA on its
 * right.
 *
 * Parameters:
 *        ac        allocation context
 *        start     in/out: start of the new range
 *        end       in/out: end of the new range
 */
static inline void
ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
                          ext4_lblk_t *start, loff_t *end)
{
        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL;
        struct rb_node *iter;
        ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1;
        loff_t new_end, tmp_pa_end, left_pa_end = -1;

        new_start = *start;
        new_end = *end;

        /*
         * Adjust the normalized range so that it doesn't overlap with any
         * existing preallocated blocks(PAs). Make sure to hold the rbtree lock
         * so it doesn't change underneath us.
         */
        read_lock(&ei->i_prealloc_lock);

        /*
         * Step 1: find any one immediate neighboring PA of the normalized range
         *
         * This is a plain descent towards ac_o_ex.fe_logical: tmp_pa_start,
         * used by the loop's advance expression, is refreshed in the body on
         * every iteration. When the loop ends tmp_pa is the last node
         * visited, or still NULL if the tree is empty.
         */
        for (iter = ei->i_prealloc_node.rb_node; iter;
             iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
                                            tmp_pa_start, iter)) {
                tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
                                  pa_node.inode_node);
                tmp_pa_start = tmp_pa->pa_lstart;
                tmp_pa_end = pa_logical_end(sbi, tmp_pa);

                /* PA must not overlap original request */
                spin_lock(&tmp_pa->pa_lock);
                if (tmp_pa->pa_deleted == 0)
                        BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end ||
                                 ac->ac_o_ex.fe_logical < tmp_pa_start));
                spin_unlock(&tmp_pa->pa_lock);
        }

        /*
         * Step 2: check if the found PA is left or right neighbor and
         * get the other neighbor
         */
        if (tmp_pa) {
                if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) {
                        struct rb_node *tmp;

                        left_pa = tmp_pa;
                        tmp = rb_next(&left_pa->pa_node.inode_node);
                        if (tmp) {
                                right_pa = rb_entry(tmp,
                                                    struct ext4_prealloc_space,
                                                    pa_node.inode_node);
                        }
                } else {
                        struct rb_node *tmp;

                        right_pa = tmp_pa;
                        tmp = rb_prev(&right_pa->pa_node.inode_node);
                        if (tmp) {
                                left_pa = rb_entry(tmp,
                                                   struct ext4_prealloc_space,
                                                   pa_node.inode_node);
                        }
                }
        }

        /*
         * Step 3: get the non deleted neighbors
         *
         * Walk further left (resp. right) past PAs marked deleted. The
         * neighbor becomes NULL if the walk runs off the tree.
         */
        if (left_pa) {
                for (iter = &left_pa->pa_node.inode_node;;
                     iter = rb_prev(iter)) {
                        if (!iter) {
                                left_pa = NULL;
                                break;
                        }

                        tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
                                          pa_node.inode_node);
                        left_pa = tmp_pa;
                        spin_lock(&tmp_pa->pa_lock);
                        if (tmp_pa->pa_deleted == 0) {
                                spin_unlock(&tmp_pa->pa_lock);
                                break;
                        }
                        spin_unlock(&tmp_pa->pa_lock);
                }
        }

        if (right_pa) {
                for (iter = &right_pa->pa_node.inode_node;;
                     iter = rb_next(iter)) {
                        if (!iter) {
                                right_pa = NULL;
                                break;
                        }

                        tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
                                          pa_node.inode_node);
                        right_pa = tmp_pa;
                        spin_lock(&tmp_pa->pa_lock);
                        if (tmp_pa->pa_deleted == 0) {
                                spin_unlock(&tmp_pa->pa_lock);
                                break;
                        }
                        spin_unlock(&tmp_pa->pa_lock);
                }
        }

        /* the surviving neighbors must lie strictly around the request start */
        if (left_pa) {
                left_pa_end = pa_logical_end(sbi, left_pa);
                BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical);
        }

        if (right_pa) {
                right_pa_start = right_pa->pa_lstart;
                BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical);
        }

        /* Step 4: trim our normalized range to not overlap with the neighbors */
        if (left_pa) {
                if (left_pa_end > new_start)
                        new_start = left_pa_end;
        }

        if (right_pa) {
                if (right_pa_start < new_end)
                        new_end = right_pa_start;
        }
        read_unlock(&ei->i_prealloc_lock);

        /*
         * XXX: extra loop to check we really don't overlap preallocations
         * (the helper takes i_prealloc_lock itself, hence the unlock above)
         */
        ext4_mb_pa_assert_overlap(ac, new_start, new_end);

        *start = new_start;
        *end = new_end;
}

/*
 * Normalization means making request better in terms of
 * size and alignment
 *
 * @ac: allocation context; on return ac_g_ex.fe_logical, ac_g_ex.fe_len and
 *      ac_orig_goal_len hold the normalized goal. ac_g_ex.fe_group/fe_start
 *      and EXT4_MB_HINT_TRY_GOAL are additionally set when the goal can be
 *      placed physically adjacent to a neighbouring extent described by @ar.
 * @ar: the caller's allocation request; only the neighbour hints
 *      (lleft/pleft, lright/pright) are read here.
 *
 * Nothing is changed for non-data requests, for EXT4_MB_HINT_GOAL_ONLY and
 * for EXT4_MB_HINT_NOPREALLOC. EXT4_MB_HINT_GROUP_ALLOC requests are handed
 * to ext4_mb_normalize_group_request() instead.
 */
static noinline_for_stack void
ext4_mb_normalize_request(struct ext4_allocation_context *ac,
                                struct ext4_allocation_request *ar)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_super_block *es = sbi->s_es;
        int bsbits, max;
        loff_t size, start_off, end;
        loff_t orig_size __maybe_unused;
        ext4_lblk_t start;

        /* do normalize only data requests, metadata requests
           do not need preallocation */
        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
                return;

        /* sometime caller may want exact blocks */
        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
                return;

        /* caller may indicate that preallocation isn't
         * required (it's a tail, for example) */
        if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
                return;

        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
                ext4_mb_normalize_group_request(ac);
                return ;
        }

        bsbits = ac->ac_sb->s_blocksize_bits;

        /* first, let's learn actual file size
         * given current request is allocated */
        size = extent_logical_end(sbi, &ac->ac_o_ex);
        size = size << bsbits;
        if (size < i_size_read(ac->ac_inode))
                size = i_size_read(ac->ac_inode);
        /* orig_size is only consumed by the mb_debug() at the end */
        orig_size = size;

        /* max size of free chunks */
        max = 2 << bsbits;

/* true when req does not exceed size, or when max does not exceed chunk_size */
#define NRL_CHECK_SIZE(req, size, max, chunk_size)        \
                (req <= (size) || max <= (chunk_size))

        /* first, try to predict filesize */
        /* XXX: should this table be tunable? */
        /*
         * From here until the shifts below, "size" and "start_off" are in
         * bytes. Up to 1MB the size is rounded up to a fixed bucket starting
         * at offset 0; the 2MB/4MB/8MB buckets additionally align start_off
         * down to the bucket size.
         */
        start_off = 0;
        if (size <= 16 * 1024) {
                size = 16 * 1024;
        } else if (size <= 32 * 1024) {
                size = 32 * 1024;
        } else if (size <= 64 * 1024) {
                size = 64 * 1024;
        } else if (size <= 128 * 1024) {
                size = 128 * 1024;
        } else if (size <= 256 * 1024) {
                size = 256 * 1024;
        } else if (size <= 512 * 1024) {
                size = 512 * 1024;
        } else if (size <= 1024 * 1024) {
                size = 1024 * 1024;
        } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
                                                (21 - bsbits)) << 21;
                size = 2 * 1024 * 1024;
        } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
                                                        (22 - bsbits)) << 22;
                size = 4 * 1024 * 1024;
        } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
                                        (8<<20)>>bsbits, max, 8 * 1024)) {
                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
                                                        (23 - bsbits)) << 23;
                size = 8 * 1024 * 1024;
        } else {
                /* no bucket applies: keep the original request as is */
                start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
                size          = (loff_t) EXT4_C2B(sbi,
                                              ac->ac_o_ex.fe_len) << bsbits;
        }
        /* convert from bytes back to filesystem blocks */
        size = size >> bsbits;
        start = start_off >> bsbits;

        /*
         * For tiny groups (smaller than 8MB) the chosen allocation
         * alignment may be larger than group size. Make sure the
         * alignment does not move allocation to a different group which
         * makes mballoc fail assertions later.
         */
        start = max(start, rounddown(ac->ac_o_ex.fe_logical,
                        (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));

        /* avoid unnecessary preallocation that may trigger assertions */
        if (start + size > EXT_MAX_BLOCKS)
                size = EXT_MAX_BLOCKS - start;

        /* don't cover already allocated blocks in selected range */
        if (ar->pleft && start <= ar->lleft) {
                size -= ar->lleft + 1 - start;
                start = ar->lleft + 1;
        }
        if (ar->pright && start + size - 1 >= ar->lright)
                size -= start + size - ar->lright;

        /*
         * Trim allocation request for filesystems with artificially small
         * groups.
         */
        if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
                size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);

        end = start + size;

        /* shrink [start, end) so it does not run into existing inode PAs */
        ext4_mb_pa_adjust_overlap(ac, &start, &end);

        size = end - start;

        /*
         * In this function "start" and "size" are normalized for better
         * alignment and length such that we could preallocate more blocks.
         * This normalization is done such that original request of
         * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and
         * "size" boundaries.
         * (Note fe_len can be relaxed since FS block allocation API does not
         * provide guarantee on number of contiguous blocks allocation since that
         * depends upon free space left, etc).
         * In case of inode pa, later we use the allocated blocks
         * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated
         * range of goal/best blocks [start, size] to put it at the
         * ac_o_ex.fe_logical extent of this inode.
         * (See ext4_mb_use_inode_pa() for more details)
         */
        if (start + size <= ac->ac_o_ex.fe_logical ||
                        start > ac->ac_o_ex.fe_logical) {
                ext4_msg(ac->ac_sb, KERN_ERR,
                         "start %lu, size %lu, fe_logical %lu",
                         (unsigned long) start, (unsigned long) size,
                         (unsigned long) ac->ac_o_ex.fe_logical);
                BUG();
        }
        BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));

        /* now prepare goal request */

        /* XXX: is it better to align blocks WRT to logical
         * placement or satisfy big request as is */
        ac->ac_g_ex.fe_logical = start;
        ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
        ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;

        /* define goal start in order to merge */
        if (ar->pright && (ar->lright == (start + size)) &&
            ar->pright >= size &&
            ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
                /* merge to the right */
                ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
                                                &ac->ac_g_ex.fe_group,
                                                &ac->ac_g_ex.fe_start);
                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
        }
        /*
         * Evaluated independently of the block above: when both conditions
         * hold, the left-merge goal below overwrites the right-merge one.
         */
        if (ar->pleft && (ar->lleft + 1 == start) &&
            ar->pleft + 1 < ext4_blocks_count(es)) {
                /* merge to the left */
                ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
                                                &ac->ac_g_ex.fe_group,
                                                &ac->ac_g_ex.fe_start);
                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
        }

        mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
                 orig_size, start);
}

/*
 * Fold the outcome of one allocation into the per-superblock mballoc
 * statistics (only when s_mb_stats is enabled and the goal length is at
 * least 1), then emit the matching tracepoint. The tracepoint fires
 * regardless of whether statistics were collected.
 */
static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        int cr;

        if (!sbi->s_mb_stats || ac->ac_g_ex.fe_len < 1)
                goto trace;

        atomic_inc(&sbi->s_bal_reqs);
        atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
        if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
                atomic_inc(&sbi->s_bal_success);

        atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
        for (cr = 0; cr < EXT4_MB_NUM_CRS; cr++)
                atomic_add(ac->ac_cX_found[cr],
                           &sbi->s_bal_cX_ex_scanned[cr]);

        atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);

        /* best extent landed exactly on the goal position */
        if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
            ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
                atomic_inc(&sbi->s_bal_goals);

        /* did we allocate as much as normalizer originally wanted? */
        if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len)
                atomic_inc(&sbi->s_bal_len_goals);

        if (ac->ac_found > sbi->s_mb_max_to_scan)
                atomic_inc(&sbi->s_bal_breaks);

trace:
        if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
                trace_ext4_mballoc_alloc(ac);
        else
                trace_ext4_mballoc_prealloc(ac);
}

/*
 * Failure path: give back whatever this context had taken.
 *
 * - With an inode PA attached, the blocks go back into pa_free.
 * - With a group PA attached nothing is done: pa_free of a MB_GROUP_PA is
 *   only changed in ext4_mb_release_context(), and on failure
 *   ac->ac_b_ex.fe_len has already been zeroed, so group_pa->pa_free is
 *   not changed.
 * - With no PA attached, the extent recorded in ac->ac_f_ex (if any) is
 *   returned to the buddy of its group.
 */
static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
{
        struct ext4_prealloc_space *pa = ac->ac_pa;
        struct ext4_buddy e4b;
        int err;

        if (pa) {
                if (pa->pa_type != MB_INODE_PA)
                        return;

                spin_lock(&pa->pa_lock);
                pa->pa_free += ac->ac_b_ex.fe_len;
                spin_unlock(&pa->pa_lock);
                return;
        }

        /* nothing was taken from the buddy */
        if (!ac->ac_f_ex.fe_len)
                return;

        err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
        /*
         * This should never happen since we pin the pages in the
         * ext4_allocation_context so ext4_mb_load_buddy() should
         * never fail.
         */
        if (WARN_RATELIMIT(err,
                           "ext4: mb_load_buddy failed (%d)", err))
                return;

        ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
        mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
                       ac->ac_f_ex.fe_len);
        ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
        ext4_mb_unload_buddy(&e4b);
}

/*
 * Satisfy the request from an inode preallocation.
 *
 * The physical start is the PA's physical start shifted by the distance
 * between the requested logical block and the PA's logical start. The
 * length is the requested length, cut short at the end of the PA. The
 * result is stored in ac->ac_b_ex, the context is switched to
 * AC_STATUS_FOUND with ac->ac_pa set, and pa->pa_free is reduced by the
 * number of clusters taken.
 */
static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
                                struct ext4_prealloc_space *pa)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        ext4_fsblk_t pa_pend = pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len);
        ext4_fsblk_t first, last;
        int nr;

        /* found preallocated blocks, use them */
        first = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
        last = min(pa_pend, first + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
        nr = EXT4_NUM_B2C(sbi, last - first);

        ext4_get_group_no_and_offset(ac->ac_sb, first, &ac->ac_b_ex.fe_group,
                                        &ac->ac_b_ex.fe_start);
        ac->ac_b_ex.fe_len = nr;
        ac->ac_status = AC_STATUS_FOUND;
        ac->ac_pa = pa;

        BUG_ON(first < pa->pa_pstart);
        BUG_ON(last > pa_pend);
        BUG_ON(pa->pa_free < nr);
        BUG_ON(ac->ac_b_ex.fe_len <= 0);
        pa->pa_free -= nr;

        mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", first, nr, pa);
}

/*
 * Satisfy the request from a locality group preallocation: the best extent
 * starts at the PA's physical start and has the originally requested length.
 */
static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
                                struct ext4_prealloc_space *pa)
{
        unsigned int nr = ac->ac_o_ex.fe_len;

        ac->ac_pa = pa;
        ac->ac_status = AC_STATUS_FOUND;
        ac->ac_b_ex.fe_len = nr;
        ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
                                        &ac->ac_b_ex.fe_group,
                                        &ac->ac_b_ex.fe_start);

        /*
         * pa_pstart and pa_len are deliberately left alone here to avoid
         * a possible race when the group is being loaded concurrently.
         * The pa is corrected later, after blocks are marked in the
         * on-disk bitmap -- see ext4_mb_release_context().
         * Other CPUs are prevented from allocating from this pa by lg_mutex
         */
        mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
                 pa->pa_lstart, nr, pa);
}

/*
 * Choose between the current best prealloc space @cpa (may be NULL) and a
 * new candidate @pa, preferring the one whose physical start is closer to
 * @goal_block; @cpa wins ties.
 *
 * The returned pa carries a pa_count reference: one is taken on @pa when it
 * is selected, and the one previously held on @cpa is dropped when @cpa is
 * replaced.
 */
static struct ext4_prealloc_space *
ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
                        struct ext4_prealloc_space *pa,
                        struct ext4_prealloc_space *cpa)
{
        ext4_fsblk_t best_dist, cand_dist;

        if (cpa) {
                best_dist = abs(goal_block - cpa->pa_pstart);
                cand_dist = abs(goal_block - pa->pa_pstart);

                if (best_dist <= cand_dist)
                        return cpa;

                /* drop the previous reference */
                atomic_dec(&cpa->pa_count);
        }

        atomic_inc(&pa->pa_count);
        return pa;
}

/*
 * check if found pa meets EXT4_MB_HINT_GOAL_ONLY
 */
static bool
ext4_mb_pa_goal_check(struct ext4_allocation_context *ac,
                      struct ext4_prealloc_space *pa)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        ext4_fsblk_t start;

        if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)))
                return true;

        /*
         * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted
         * in ext4_mb_normalize_request and will keep same with ac_o_ex
         * from ext4_mb_initialize_context. Choose ac_g_ex here to keep
         * consistent with ext4_mb_find_by_goal.
         */
        start = pa->pa_pstart +
                (ac->ac_g_ex.fe_logical - pa->pa_lstart);
        if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start)
                return false;

        if (ac->ac_g_ex.fe_len > pa->pa_len -
            EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart))
                return false;

        return true;
}

/*
 * search goal blocks in preallocated space
 *
 * The inode's PA rbtree is tried first, then (only for
 * EXT4_MB_HINT_GROUP_ALLOC requests that have a locality group) the
 * locality group's PA lists.
 *
 * Returns true if a preallocation was picked, in which case it has been
 * handed to ext4_mb_use_inode_pa() or ext4_mb_use_group_pa() and a pa_count
 * reference is held on it. Returns false if nothing suitable was found or
 * the request is not a data request.
 */
static noinline_for_stack bool
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        int order, i;
        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
        struct ext4_locality_group *lg;
        struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
        struct rb_node *iter;
        ext4_fsblk_t goal_block;

        /* only data can be preallocated */
        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
                return false;

        /*
         * first, try per-file preallocation by searching the inode pa rbtree.
         *
         * Here, we can't do a direct traversal of the tree because
         * ext4_mb_discard_group_preallocation() can in parallel mark the pa
         * deleted and that can cause direct traversal to skip some entries.
         */
        read_lock(&ei->i_prealloc_lock);

        if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) {
                goto try_group_pa;
        }

        /*
         * Step 1: Find a pa with logical start immediately adjacent to the
         * original logical start. This could be on the left or right.
         *
         * (tmp_pa->pa_lstart never changes so we can skip locking for it).
         *
         * The tree is known to be non-empty here, so the body runs at least
         * once and tmp_pa is non-NULL after the loop.
         */
        for (iter = ei->i_prealloc_node.rb_node; iter;
             iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
                                            tmp_pa->pa_lstart, iter)) {
                tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
                                  pa_node.inode_node);
        }

        /*
         * Step 2: The adjacent pa might be to the right of logical start, find
         * the left adjacent pa. After this step we'd have a valid tmp_pa whose
         * logical start is towards the left of original request's logical start
         */
        if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) {
                struct rb_node *tmp;
                tmp = rb_prev(&tmp_pa->pa_node.inode_node);

                if (tmp) {
                        tmp_pa = rb_entry(tmp, struct ext4_prealloc_space,
                                            pa_node.inode_node);
                } else {
                        /*
                         * If there is no adjacent pa to the left then finding
                         * an overlapping pa is not possible hence stop searching
                         * inode pa tree
                         */
                        goto try_group_pa;
                }
        }

        BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));

        /*
         * Step 3: If the left adjacent pa is deleted, keep moving left to find
         * the first non deleted adjacent pa. After this step we should have a
         * valid tmp_pa which is guaranteed to be non deleted.
         *
         * NOTE: the loop is left via "break" with tmp_pa->pa_lock still held.
         * Every path below must drop it before leaving the inode pa section.
         */
        for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) {
                if (!iter) {
                        /*
                         * no non deleted left adjacent pa, so stop searching
                         * inode pa tree
                         */
                        goto try_group_pa;
                }
                tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
                                  pa_node.inode_node);
                spin_lock(&tmp_pa->pa_lock);
                if (tmp_pa->pa_deleted == 0) {
                        /*
                         * We will keep holding the pa_lock from
                         * this point on because we don't want group discard
                         * to delete this pa underneath us. Since group
                         * discard is anyways an ENOSPC operation it
                         * should be okay for it to wait a few more cycles.
                         */
                        break;
                } else {
                        spin_unlock(&tmp_pa->pa_lock);
                }
        }

        BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical));
        BUG_ON(tmp_pa->pa_deleted == 1);

        /*
         * Step 4: We now have the non deleted left adjacent pa. Only this
         * pa can possibly satisfy the request hence check if it overlaps
         * original logical start and stop searching if it doesn't.
         */
        if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) {
                spin_unlock(&tmp_pa->pa_lock);
                goto try_group_pa;
        }

        /* non-extent files can't have physical blocks past 2^32 */
        if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
            (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
             EXT4_MAX_BLOCK_FILE_PHYS)) {
                /*
                 * Since PAs don't overlap, we won't find any other PA to
                 * satisfy this.
                 */
                spin_unlock(&tmp_pa->pa_lock);
                goto try_group_pa;
        }

        if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
                /* take the reference that goes with ac->ac_pa */
                atomic_inc(&tmp_pa->pa_count);
                ext4_mb_use_inode_pa(ac, tmp_pa);
                spin_unlock(&tmp_pa->pa_lock);
                read_unlock(&ei->i_prealloc_lock);
                return true;
        } else {
                /*
                 * This branch is reached either because the pa had no free
                 * blocks or because ext4_mb_pa_goal_check() rejected it;
                 * only the former triggers the warning below.
                 *
                 * We found a valid overlapping pa but couldn't use it because
                 * it had no free blocks. This should ideally never happen
                 * because:
                 *
                 * 1. When a new inode pa is added to rbtree it must have
                 *    pa_free > 0 since otherwise we won't actually need
                 *    preallocation.
                 *
                 * 2. An inode pa that is in the rbtree can only have it's
                 *    pa_free become zero when another thread calls:
                 *      ext4_mb_new_blocks
                 *       ext4_mb_use_preallocated
                 *        ext4_mb_use_inode_pa
                 *
                 * 3. Further, after the above calls make pa_free == 0, we will
                 *    immediately remove it from the rbtree in:
                 *      ext4_mb_new_blocks
                 *       ext4_mb_release_context
                 *        ext4_mb_put_pa
                 *
                 * 4. Since the pa_free becoming 0 and pa_free getting removed
                 * from tree both happen in ext4_mb_new_blocks, which is always
                 * called with i_data_sem held for data allocations, we can be
                 * sure that another process will never see a pa in rbtree with
                 * pa_free == 0.
                 */
                WARN_ON_ONCE(tmp_pa->pa_free == 0);
        }
        spin_unlock(&tmp_pa->pa_lock);
try_group_pa:
        /* every jump to this label arrives with i_prealloc_lock read-held */
        read_unlock(&ei->i_prealloc_lock);

        /* can we use group allocation? */
        if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
                return false;

        /* inode may have no locality group for some reason */
        lg = ac->ac_lg;
        if (lg == NULL)
                return false;
        /* index of the first lg_prealloc_list[] bucket to scan */
        order  = fls(ac->ac_o_ex.fe_len) - 1;
        if (order > PREALLOC_TB_SIZE - 1)
                /* The max size of hash table is PREALLOC_TB_SIZE */
                order = PREALLOC_TB_SIZE - 1;

        goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
        /*
         * search for the prealloc space that is having
         * minimal distance from the goal block.
         */
        for (i = order; i < PREALLOC_TB_SIZE; i++) {
                rcu_read_lock();
                list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i],
                                        pa_node.lg_list) {
                        spin_lock(&tmp_pa->pa_lock);
                        if (tmp_pa->pa_deleted == 0 &&
                                        tmp_pa->pa_free >= ac->ac_o_ex.fe_len) {

                                /*
                                 * keeps a pa_count reference on whichever
                                 * pa is returned as the current best
                                 */
                                cpa = ext4_mb_check_group_pa(goal_block,
                                                                tmp_pa, cpa);
                        }
                        spin_unlock(&tmp_pa->pa_lock);
                }
                rcu_read_unlock();
        }
        if (cpa) {
                ext4_mb_use_group_pa(ac, cpa);
                return true;
        }
        return false;
}

/*
 * Mark every preallocation that belongs to @group as used in the in-core
 * @bitmap, so that a buddy generated from this bitmap does not offer
 * preallocated blocks as free.
 * Needs to be called with the ext4 group lock held.
 */
static noinline_for_stack
void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group)
{
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_prealloc_space *pa;
        int total = 0;

        if (!grp)
                return;

        /*
         * All forms of preallocation discard load the group first, so the
         * only competing code is preallocation use; pa_lock is taken just
         * around reading the pa's start and length.
         * Preallocations with pa_deleted set are deliberately NOT skipped,
         * otherwise used blocks could be left available for allocation in
         * the buddy while a concurrent ext4_mb_put_pa() is dropping the
         * preallocation.
         */
        list_for_each_entry(pa, &grp->bb_prealloc_list, pa_group_list) {
                ext4_group_t pa_group;
                ext4_grpblk_t first;
                int nr;

                spin_lock(&pa->pa_lock);
                ext4_get_group_no_and_offset(sb, pa->pa_pstart,
                                             &pa_group, &first);
                nr = pa->pa_len;
                spin_unlock(&pa->pa_lock);

                if (unlikely(nr == 0))
                        continue;

                BUG_ON(pa_group != group);
                mb_set_bits(bitmap, first, nr);
                total += nr;
        }
        mb_debug(sb, "preallocated %d for group %u\n", total, group);
}

/*
 * Flags @pa as deleted.  If it is already flagged, a warning is logged and
 * nothing else happens.  For an inode preallocation the owning inode's
 * i_prealloc_active counter is decremented as well.
 */
static void ext4_mb_mark_pa_deleted(struct super_block *sb,
                                    struct ext4_prealloc_space *pa)
{
        if (pa->pa_deleted) {
                ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
                             pa->pa_type, pa->pa_pstart, pa->pa_lstart,
                             pa->pa_len);
                return;
        }

        pa->pa_deleted = 1;

        /* only inode preallocations are counted in i_prealloc_active */
        if (pa->pa_type != MB_INODE_PA)
                return;

        atomic_dec(&EXT4_I(pa->pa_inode)->i_prealloc_active);
}

static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
{
        BUG_ON(!pa);
        BUG_ON(atomic_read(&pa->pa_count));
        BUG_ON(pa->pa_deleted == 0);
        kmem_cache_free(ext4_pspace_cachep, pa);
}

/*
 * RCU callback: recovers the preallocation descriptor that embeds @head
 * (as u.pa_rcu) and frees it.
 */
static void ext4_mb_pa_callback(struct rcu_head *head)
{
        ext4_mb_pa_free(container_of(head, struct ext4_prealloc_space,
                                     u.pa_rcu));
}

/*
 * Drops a reference to the preallocated space descriptor @pa.
 *
 * If this was the last reference and the space is fully consumed
 * (pa_free == 0) and nobody else has marked the pa deleted yet, the pa is
 * marked deleted, unlinked from its group's list and from the owning inode
 * tree or locality-group list, and then freed.  In every other case only the
 * reference count is decremented.
 *
 * @ac: allocation context; only ac->ac_inode is read here, to locate the
 *      inode's preallocation rb-tree
 * @sb: super block @pa belongs to
 * @pa: preallocation descriptor whose reference is dropped
 */
static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
                        struct super_block *sb, struct ext4_prealloc_space *pa)
{
        ext4_group_t grp;
        ext4_fsblk_t grp_blk;
        /* only used on the MB_INODE_PA path below */
        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);

        /* in this short window concurrent discard can set pa_deleted */
        spin_lock(&pa->pa_lock);
        /* still referenced, or still holding free blocks: keep the pa */
        if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
                spin_unlock(&pa->pa_lock);
                return;
        }

        /* somebody else already marked it deleted; leave the teardown to them */
        if (pa->pa_deleted == 1) {
                spin_unlock(&pa->pa_lock);
                return;
        }

        ext4_mb_mark_pa_deleted(sb, pa);
        spin_unlock(&pa->pa_lock);

        grp_blk = pa->pa_pstart;
        /*
         * If doing group-based preallocation, pa_pstart may be in the
         * next group when pa is used up
         */
        if (pa->pa_type == MB_GROUP_PA)
                grp_blk--;

        grp = ext4_get_group_number(sb, grp_blk);

        /*
         * possible race:
         *
         *  P1 (buddy init)                        P2 (regular allocation)
         *                                        find block B in PA
         *  copy on-disk bitmap to buddy
         *                                          mark B in on-disk bitmap
         *                                        drop PA from group
         *  mark all PAs in buddy
         *
         * thus, P1 initializes buddy with B available. to prevent this
         * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
         * against that pair
         */
        ext4_lock_group(sb, grp);
        list_del(&pa->pa_group_list);
        ext4_unlock_group(sb, grp);

        if (pa->pa_type == MB_INODE_PA) {
                /* inode pa: unlink from the inode's rb-tree and free at once */
                write_lock(pa->pa_node_lock.inode_lock);
                rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
                write_unlock(pa->pa_node_lock.inode_lock);
                ext4_mb_pa_free(pa);
        } else {
                /*
                 * group pa: unlink from the RCU-traversed locality-group
                 * list and defer the free through call_rcu()
                 */
                spin_lock(pa->pa_node_lock.lg_lock);
                list_del_rcu(&pa->pa_node.lg_list);
                spin_unlock(pa->pa_node_lock.lg_lock);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
}

/*
 * Inserts the preallocation whose rb_node is @new into the tree @root,
 * ordered by pa_lstart.  A node whose start is not smaller than the one it
 * is compared against goes to the right.
 */
static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new)
{
        struct ext4_prealloc_space *new_pa =
                rb_entry(new, struct ext4_prealloc_space, pa_node.inode_node);
        ext4_lblk_t new_start = new_pa->pa_lstart;
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;

        /* descend to the empty slot where the new node belongs */
        while (*link) {
                struct ext4_prealloc_space *cur_pa =
                        rb_entry(*link, struct ext4_prealloc_space,
                                 pa_node.inode_node);

                parent = *link;
                if (new_start < cur_pa->pa_lstart)
                        link = &parent->rb_left;
                else
                        link = &parent->rb_right;
        }

        rb_link_node(new, parent, link);
        rb_insert_color(new, root);
}

/*
 * Creates a new preallocated space for the inode of the given allocation
 * context.
 *
 * The descriptor is taken from ac->ac_pa (it must already be allocated) and
 * is filled in from the best found extent ac->ac_b_ex.  When the best extent
 * is shorter than the original goal length, its logical start is adjusted
 * first so that it still covers the original request.  The new pa is then
 * used for the current allocation, accounted in s_mb_preallocated, and linked
 * into the group's bb_prealloc_list and the inode's preallocation rb-tree.
 *
 * @ac: allocation context; must be in AC_STATUS_FOUND for a regular file,
 *      with a best extent longer than the original request
 */
static noinline_for_stack void
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
        struct super_block *sb = ac->ac_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_prealloc_space *pa;
        struct ext4_group_info *grp;
        struct ext4_inode_info *ei;

        /* preallocate only when found space is larger than requested */
        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
        BUG_ON(ac->ac_pa == NULL);

        pa = ac->ac_pa;

        if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) {
                /* scratch extent: first the original goal, later the candidate */
                struct ext4_free_extent ex = {
                        .fe_logical = ac->ac_g_ex.fe_logical,
                        .fe_len = ac->ac_orig_goal_len,
                };
                loff_t orig_goal_end = extent_logical_end(sbi, &ex);
                loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex);

                /*
                 * We can't allocate as much as normalizer wants, so we try
                 * to get proper lstart to cover the original request, except
                 * when the goal doesn't cover the original request as below:
                 *
                 * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048
                 * best_ex:0/200(200) -> adjusted: 1848/2048(200)
                 */
                BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
                BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);

                /*
                 * Use the below logic for adjusting best extent as it keeps
                 * fragmentation in check while ensuring logical range of best
                 * extent doesn't overflow out of goal extent:
                 *
                 * 1. Check if best ex can be kept at end of goal (before
                 *    cr_best_avail trimmed it) and still cover original start
                 * 2. Else, check if best ex can be kept at start of goal and
                 *    still cover original end
                 * 3. Else, keep the best ex at start of original request.
                 */
                ex.fe_len = ac->ac_b_ex.fe_len;

                /* case 1: best extent placed at the end of the original goal */
                ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len);
                if (ac->ac_o_ex.fe_logical >= ex.fe_logical)
                        goto adjust_bex;

                /* case 2: best extent placed at the start of the goal */
                ex.fe_logical = ac->ac_g_ex.fe_logical;
                if (o_ex_end <= extent_logical_end(sbi, &ex))
                        goto adjust_bex;

                /* case 3: best extent placed at the start of the request */
                ex.fe_logical = ac->ac_o_ex.fe_logical;
adjust_bex:
                ac->ac_b_ex.fe_logical = ex.fe_logical;

                /* the adjusted extent must cover the start and stay in the goal */
                BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
                BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end);
        }

        /* fill in the descriptor before it becomes reachable from any list */
        pa->pa_lstart = ac->ac_b_ex.fe_logical;
        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
        pa->pa_len = ac->ac_b_ex.fe_len;
        pa->pa_free = pa->pa_len;
        spin_lock_init(&pa->pa_lock);
        INIT_LIST_HEAD(&pa->pa_group_list);
        pa->pa_deleted = 0;
        pa->pa_type = MB_INODE_PA;

        mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
                 pa->pa_len, pa->pa_lstart);
        trace_ext4_mb_new_inode_pa(ac, pa);

        atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
        ext4_mb_use_inode_pa(ac, pa);

        ei = EXT4_I(ac->ac_inode);
        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
        /*
         * NOTE(review): on this early return the pa has been initialised and
         * used for the allocation but is not linked into the group list or
         * the inode tree - confirm that the caller handles this case.
         */
        if (!grp)
                return;

        pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock;
        pa->pa_inode = ac->ac_inode;

        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);

        write_lock(pa->pa_node_lock.inode_lock);
        ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node);
        write_unlock(pa->pa_node_lock.inode_lock);
        atomic_inc(&ei->i_prealloc_active);
}

/*
 * Creates a new preallocated space for the locality group of the given
 * allocation context.
 *
 * The descriptor comes from ac->ac_pa and is filled in from the best found
 * extent.  It is used for the current allocation, accounted in
 * s_mb_preallocated and linked into the group's bb_prealloc_list.  It is NOT
 * put on a locality-group list here.
 */
static noinline_for_stack void
ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
{
        struct super_block *sb = ac->ac_sb;
        struct ext4_prealloc_space *pa;
        struct ext4_group_info *grp;
        struct ext4_locality_group *lg;

        /* preallocate only when the found space is larger than requested */
        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
        BUG_ON(!ac->ac_pa);

        pa = ac->ac_pa;

        /* a group pa has no logical offset of its own: lstart mirrors pstart */
        pa->pa_type = MB_GROUP_PA;
        pa->pa_deleted = 0;
        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
        pa->pa_lstart = pa->pa_pstart;
        pa->pa_len = ac->ac_b_ex.fe_len;
        pa->pa_free = pa->pa_len;
        spin_lock_init(&pa->pa_lock);
        INIT_LIST_HEAD(&pa->pa_group_list);
        INIT_LIST_HEAD(&pa->pa_node.lg_list);

        mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
                 pa->pa_len, pa->pa_lstart);
        trace_ext4_mb_new_group_pa(ac, pa);

        ext4_mb_use_group_pa(ac, pa);
        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);

        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
        if (!grp)
                return;

        lg = ac->ac_lg;
        BUG_ON(!lg);

        pa->pa_inode = NULL;
        pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock;

        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);

        /*
         * The new pa is added to the right locality-group bucket later,
         * after pa_free has been updated in ext4_mb_release_context.
         */
}

/*
 * Creates a new preallocation for @ac: a locality-group one when
 * EXT4_MB_HINT_GROUP_ALLOC is set in ac->ac_flags, a per-inode one otherwise.
 */
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
{
        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
                ext4_mb_new_group_pa(ac);
                return;
        }

        ext4_mb_new_inode_pa(ac);
}

/*
 * Finds all unused blocks of @pa in the on-disk bitmap and frees them in the
 * in-core bitmap and buddy.
 *
 * @pa must be unlinked from inode and group lists, so that nobody else can
 * find/use it, and must already be marked deleted.
 * The caller MUST hold group/inode locks.
 * TODO: optimize the case when there are no in-core structures yet
 *
 * @e4b:       loaded buddy of the group @pa lives in
 * @bitmap_bh: buffer holding the on-disk block bitmap of that group
 * @pa:        inode preallocation being released
 *
 * The number of blocks freed is added to s_mb_discarded.  A mismatch between
 * that number and pa->pa_free is reported as a group error, and the value
 * derived from the bitmap is the one that is used.
 */
static noinline_for_stack void
ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
                        struct ext4_prealloc_space *pa)
{
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned int end;
        unsigned int next;
        ext4_group_t group;
        ext4_grpblk_t bit;
        unsigned long long grp_blk_start;
        int free = 0;

        BUG_ON(pa->pa_deleted == 0);
        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
        /* physical block number of the first block of the group */
        grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
        end = bit + pa->pa_len;

        /* walk every run of zero bits inside [bit, end) and free it */
        while (bit < end) {
                bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
                if (bit >= end)
                        break;
                next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
                mb_debug(sb, "free preallocated %u/%u in group %u\n",
                         (unsigned) ext4_group_first_block_no(sb, group) + bit,
                         (unsigned) next - bit, (unsigned) group);
                free += next - bit;

                trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
                trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
                                                    EXT4_C2B(sbi, bit)),
                                               next - bit);
                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                /* bit 'next' is known to be set (or is 'end'), skip past it */
                bit = next + 1;
        }
        if (free != pa->pa_free) {
                /*
                 * pa_pstart is a 64-bit physical block number; print it in
                 * full (%llu) so it is not truncated where long is 32 bits.
                 */
                ext4_msg(e4b->bd_sb, KERN_CRIT,
                         "pa %p: logic %lu, phys. %llu, len %d",
                         pa, (unsigned long) pa->pa_lstart,
                         (unsigned long long) pa->pa_pstart,
                         pa->pa_len);
                ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
                                        free, pa->pa_free);
                /*
                 * pa is already deleted so we use the value obtained
                 * from the bitmap and continue.
                 */
        }
        atomic_add(free, &sbi->s_mb_discarded);
}

/*
 * Releases the blocks of the (already deleted) locality-group preallocation
 * @pa back to the buddy @e4b and adds them to s_mb_discarded.  If @pa is
 * non-empty but does not start in the buddy's group, a warning is logged and
 * nothing is freed.
 */
static noinline_for_stack void
ext4_mb_release_group_pa(struct ext4_buddy *e4b,
                                struct ext4_prealloc_space *pa)
{
        struct super_block *sb = e4b->bd_sb;
        ext4_group_t pa_group;
        ext4_grpblk_t first_bit;

        trace_ext4_mb_release_group_pa(sb, pa);
        BUG_ON(!pa->pa_deleted);

        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &pa_group, &first_bit);
        if (unlikely(pa_group != e4b->bd_group && pa->pa_len != 0)) {
                ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
                             e4b->bd_group, pa_group, pa->pa_pstart);
                return;
        }

        mb_free_blocks(pa->pa_inode, e4b, first_bit, pa->pa_len);
        atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
        trace_ext4_mballoc_discard(sb, NULL, pa_group, first_bit, pa->pa_len);
}

/*
 * Releases all unreferenced preallocations in the given group.
 *
 * first, we need to decide discard policy:
 * - when do we discard
 *   1) ENOSPC
 * - how many do we discard
 *   1) how many requested
 *
 * @sb:    super block
 * @group: group whose bb_prealloc_list is scanned
 * @busy:  set to 1 when at least one preallocation was skipped because it
 *         still has a non-zero reference count; otherwise left untouched
 *
 * Returns the sum of pa_free over the preallocations that were discarded,
 * or 0 when there is no group info, the list is empty, or the block bitmap
 * or buddy could not be loaded.
 */
static noinline_for_stack int
ext4_mb_discard_group_preallocations(struct super_block *sb,
                                     ext4_group_t group, int *busy)
{
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_prealloc_space *pa, *tmp;
        /* private list of the preallocations selected for release */
        LIST_HEAD(list);
        struct ext4_buddy e4b;
        struct ext4_inode_info *ei;
        int err;
        int free = 0;

        if (!grp)
                return 0;
        mb_debug(sb, "discard preallocation for group %u\n", group);
        if (list_empty(&grp->bb_prealloc_list))
                goto out_dbg;

        bitmap_bh = ext4_read_block_bitmap(sb, group);
        if (IS_ERR(bitmap_bh)) {
                err = PTR_ERR(bitmap_bh);
                ext4_error_err(sb, -err,
                               "Error %d reading block bitmap for %u",
                               err, group);
                goto out_dbg;
        }

        err = ext4_mb_load_buddy(sb, group, &e4b);
        if (err) {
                ext4_warning(sb, "Error %d loading buddy information for %u",
                             err, group);
                put_bh(bitmap_bh);
                goto out_dbg;
        }

        /* phase 1: under the group lock, pick every pa that can be freed */
        ext4_lock_group(sb, group);
        list_for_each_entry_safe(pa, tmp,
                                &grp->bb_prealloc_list, pa_group_list) {
                spin_lock(&pa->pa_lock);
                /* still in use by someone: skip it and tell the caller */
                if (atomic_read(&pa->pa_count)) {
                        spin_unlock(&pa->pa_lock);
                        *busy = 1;
                        continue;
                }
                /* already being torn down elsewhere */
                if (pa->pa_deleted) {
                        spin_unlock(&pa->pa_lock);
                        continue;
                }

                /* seems this one can be freed ... */
                ext4_mb_mark_pa_deleted(sb, pa);

                /* bump the per-cpu discard sequence once, on the first pa */
                if (!free)
                        this_cpu_inc(discard_pa_seq);

                /* we can trust pa_free ... */
                free += pa->pa_free;

                spin_unlock(&pa->pa_lock);

                /* move the pa from the group list onto the private list */
                list_del(&pa->pa_group_list);
                list_add(&pa->u.pa_tmp_list, &list);
        }

        /* now free all selected PAs */
        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {

                /* remove from object (inode or locality group) */
                if (pa->pa_type == MB_GROUP_PA) {
                        spin_lock(pa->pa_node_lock.lg_lock);
                        list_del_rcu(&pa->pa_node.lg_list);
                        spin_unlock(pa->pa_node_lock.lg_lock);
                } else {
                        write_lock(pa->pa_node_lock.inode_lock);
                        ei = EXT4_I(pa->pa_inode);
                        rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
                        write_unlock(pa->pa_node_lock.inode_lock);
                }

                list_del(&pa->u.pa_tmp_list);

                /*
                 * give the blocks back to the buddy; group PAs are freed via
                 * call_rcu(), inode PAs are freed immediately
                 */
                if (pa->pa_type == MB_GROUP_PA) {
                        ext4_mb_release_group_pa(&e4b, pa);
                        call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
                } else {
                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
                        ext4_mb_pa_free(pa);
                }
        }

        /* the group lock was held across both phases */
        ext4_unlock_group(sb, group);
        ext4_mb_unload_buddy(&e4b);
        put_bh(bitmap_bh);
out_dbg:
        mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
                 free, group, grp->bb_free);
        return free;
}

/*
 * Releases all non-used preallocated blocks for the given inode.
 *
 * It's important to discard preallocations under i_data_sem
 * We don't want another block to be served from the prealloc
 * space when we are discarding the inode prealloc space.
 *
 * FIXME!! Make sure it is valid at all the call sites
 *
 * Does nothing for non-regular files or while EXT4_FC_REPLAY is set in the
 * mount state.  Otherwise every preallocation in the inode's rb-tree is
 * marked deleted and moved to a private list under i_prealloc_lock, then
 * each one is released to its group's buddy and freed.
 */
void ext4_discard_preallocations(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct super_block *sb = inode->i_sb;
        struct buffer_head *bitmap_bh = NULL;
        struct ext4_prealloc_space *pa, *tmp;
        ext4_group_t group = 0;
        /* private list of the preallocations collected for release */
        LIST_HEAD(list);
        struct ext4_buddy e4b;
        struct rb_node *iter;
        int err;

        if (!S_ISREG(inode->i_mode))
                return;

        if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        mb_debug(sb, "discard preallocation for inode %lu\n",
                 inode->i_ino);
        trace_ext4_discard_preallocations(inode,
                        atomic_read(&ei->i_prealloc_active));

repeat:
        /* first, collect all pa's in the inode */
        write_lock(&ei->i_prealloc_lock);
        /*
         * NOTE(review): the loop advances with rb_next() on a node that was
         * just rb_erase()d.  Each pass starts at rb_first() and every visited
         * node is either erased or causes a restart, so the erased node is
         * always the leftmost one - confirm that this is what keeps the
         * traversal valid.
         */
        for (iter = rb_first(&ei->i_prealloc_node); iter;
             iter = rb_next(iter)) {
                pa = rb_entry(iter, struct ext4_prealloc_space,
                              pa_node.inode_node);
                BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock);

                spin_lock(&pa->pa_lock);
                if (atomic_read(&pa->pa_count)) {
                        /* this shouldn't happen often - nobody should
                         * use preallocation while we're discarding it */
                        spin_unlock(&pa->pa_lock);
                        write_unlock(&ei->i_prealloc_lock);
                        ext4_msg(sb, KERN_ERR,
                                 "uh-oh! used pa while discarding");
                        WARN_ON(1);
                        /* sleep for HZ jiffies, then rescan from the start */
                        schedule_timeout_uninterruptible(HZ);
                        goto repeat;

                }
                if (pa->pa_deleted == 0) {
                        /* ours to release: unlink and queue on the private list */
                        ext4_mb_mark_pa_deleted(sb, pa);
                        spin_unlock(&pa->pa_lock);
                        rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
                        list_add(&pa->u.pa_tmp_list, &list);
                        continue;
                }

                /* someone is deleting pa right now */
                spin_unlock(&pa->pa_lock);
                write_unlock(&ei->i_prealloc_lock);

                /* we have to wait here because pa_deleted
                 * doesn't mean pa is already unlinked from
                 * the list. as we might be called from
                 * ->clear_inode() the inode will get freed
                 * and concurrent thread which is unlinking
                 * pa from inode's list may access already
                 * freed memory, bad-bad-bad */

                /* XXX: if this happens too often, we can
                 * add a flag to force wait only in case
                 * of ->clear_inode(), but not in case of
                 * regular truncate */
                schedule_timeout_uninterruptible(HZ);
                goto repeat;
        }
        write_unlock(&ei->i_prealloc_lock);

        /* second, release each collected pa to its group and free it */
        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
                BUG_ON(pa->pa_type != MB_INODE_PA);
                group = ext4_get_group_number(sb, pa->pa_pstart);

                err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
                                             GFP_NOFS|__GFP_NOFAIL);
                /*
                 * NOTE(review): on this and the next error path the pa stays
                 * on the local list and on the group list and is never
                 * freed - confirm this is the intended best-effort behaviour.
                 */
                if (err) {
                        ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
                                       err, group);
                        continue;
                }

                bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (IS_ERR(bitmap_bh)) {
                        err = PTR_ERR(bitmap_bh);
                        ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
                                       err, group);
                        ext4_mb_unload_buddy(&e4b);
                        continue;
                }

                /* unlink from the group list and free the blocks under the group lock */
                ext4_lock_group(sb, group);
                list_del(&pa->pa_group_list);
                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
                ext4_unlock_group(sb, group);

                ext4_mb_unload_buddy(&e4b);
                put_bh(bitmap_bh);

                list_del(&pa->u.pa_tmp_list);
                ext4_mb_pa_free(pa);
        }
}

/*
 * Allocates a zeroed preallocation descriptor from ext4_pspace_cachep,
 * gives it an initial reference count of 1 and stores it in ac->ac_pa.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails, in which case
 * ac->ac_pa is left untouched.
 */
static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
{
        struct ext4_prealloc_space *new_pa;

        BUG_ON(!ext4_pspace_cachep);

        new_pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
        if (new_pa == NULL)
                return -ENOMEM;

        atomic_set(&new_pa->pa_count, 1);
        ac->ac_pa = new_pa;
        return 0;
}

/*
 * Detaches the preallocation descriptor from @ac, drops its reference
 * (warning if that was not the last one), marks it deleted and frees it.
 */
static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac)
{
        struct ext4_prealloc_space *unused_pa = ac->ac_pa;

        BUG_ON(unused_pa == NULL);
        ac->ac_pa = NULL;
        WARN_ON(!atomic_dec_and_test(&unused_pa->pa_count));

        /*
         * This function is only called on an error, or when the length of
         * the found blocks is smaller than the length requested, so the PA
         * has never been added to grp->bb_prealloc_list and no locking is
         * needed here.
         */
        unused_pa->pa_deleted = 1;
        ext4_mb_pa_free(unused_pa);
}

#ifdef CONFIG_EXT4_DEBUG
static inline void ext4_mb_show_pa(struct super_block *sb)
{
        ext4_group_t i, ngroups;

        if (ext4_forced_shutdown(sb))
                return;

        ngroups = ext4_get_groups_count(sb);
        mb_debug(sb, "groups: ");
        for (i = 0; i < ngroups; i++) {
                struct ext4_group_info *grp = ext4_get_group_info(sb, i);
                struct ext4_prealloc_space *pa;
                ext4_grpblk_t start;
                struct list_head *cur;

                if (!grp)
                        continue;
                ext4_lock_group(sb, i);
                list_for_each(cur, &grp->bb_prealloc_list) {
                        pa = list_entry(cur, struct ext4_prealloc_space,
                                        pa_group_list);
                        spin_lock(&pa->pa_lock);
                        ext4_get_group_no_and_offset(sb, pa->pa_pstart,
                                                     NULL, &start);
                        spin_unlock(&pa->pa_lock);
                        mb_debug(sb, "PA:%u:%d:%d\n", i, start,
                                 pa->pa_len);
                }
                ext4_unlock_group(sb, i);
                mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
                         grp->bb_fragments);
        }
}

static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
        struct super_block *sb = ac->ac_sb;

        if (ext4_forced_shutdown(sb))
                return;

        mb_debug(sb, "Can't allocate:"
                        " Allocation context details:");
        mb_debug(sb, "status %u flags 0x%x",
                        ac->ac_status, ac->ac_flags);
        mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
                        "goal %lu/%lu/%lu@%lu, "
                        "best %lu/%lu/%lu@%lu cr %d",
                        (unsigned long)ac->ac_o_ex.fe_group,
                        (unsigned long)ac->ac_o_ex.fe_start,
                        (unsigned long)ac->ac_o_ex.fe_len,
                        (unsigned long)ac->ac_o_ex.fe_logical,
                        (unsigned long)ac->ac_g_ex.fe_group,
                        (unsigned long)ac->ac_g_ex.fe_start,
                        (unsigned long)ac->ac_g_ex.fe_len,
                        (unsigned long)ac->ac_g_ex.fe_logical,
                        (unsigned long)ac->ac_b_ex.fe_group,
                        (unsigned long)ac->ac_b_ex.fe_start,
                        (unsigned long)ac->ac_b_ex.fe_len,
                        (unsigned long)ac->ac_b_ex.fe_logical,
                        (int)ac->ac_criteria);
        mb_debug(sb, "%u found", ac->ac_found);
        mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no");
        if (ac->ac_pa)
                mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ?
                         "group pa" : "inode pa");
        ext4_mb_show_pa(sb);
}
#else
/* Non-debug build: dumping preallocations is a no-op. */
static inline void ext4_mb_show_pa(struct super_block *sb)
{
}
/* Non-debug build: only forwards to the (empty) ext4_mb_show_pa() stub. */
static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
        ext4_mb_show_pa(ac->ac_sb);
}
#endif

/*
 * We use locality group preallocation for small size file. The size of the
 * file is determined by the current size or the resulting size after
 * allocation which ever is larger
 *
 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
 */
static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        int bsbits = ac->ac_sb->s_blocksize_bits;
        loff_t size, isize;
        bool inode_pa_eligible, group_pa_eligible;

        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
                return;

        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
                return;

        group_pa_eligible = sbi->s_mb_group_prealloc > 0;
        inode_pa_eligible = true;
        size = extent_logical_end(sbi, &ac->ac_o_ex);
        isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
                >> bsbits;

        /* No point in using inode preallocation for closed files */
        if ((size == isize) && !ext4_fs_is_busy(sbi) &&
            !inode_is_open_for_write(ac->ac_inode))
                inode_pa_eligible = false;

        size = max(size, isize);
        /* Don't use group allocation for large files */
        if (size > sbi->s_mb_stream_request)
                group_pa_eligible = false;

        if (!group_pa_eligible) {
                if (inode_pa_eligible)
                        ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
                else
                        ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
                return;
        }

        BUG_ON(ac->ac_lg != NULL);
        /*
         * locality group prealloc space are per cpu. The reason for having
         * per cpu locality group is to reduce the contention between block
         * request from multiple CPUs.
         */
        ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);

        /* we're going to use group allocation */
        ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;

        /* serialize all allocations in the group */
        mutex_lock(&ac->ac_lg->lg_mutex);
}

/*
 * Fills the allocation context @ac from the allocation request @ar.
 *
 * The request length is capped at the number of clusters per group, a goal
 * outside [s_first_data_block, blocks count) is replaced by the first data
 * block, and the original, goal and best extents are seeded from the result.
 * Finally the group-vs-file preallocation policy is chosen.
 */
static noinline_for_stack void
ext4_mb_initialize_context(struct ext4_allocation_context *ac,
                                struct ext4_allocation_request *ar)
{
        struct super_block *sb = ar->inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        ext4_fsblk_t goal = ar->goal;
        unsigned int len = ar->len;
        ext4_group_t goal_group;
        ext4_grpblk_t goal_offset;

        /* we can't allocate > group size: filter too big requests */
        if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
                len = EXT4_CLUSTERS_PER_GROUP(sb);

        /* start searching from the goal, if it is a valid block */
        if (goal < le32_to_cpu(es->s_first_data_block) ||
                        goal >= ext4_blocks_count(es))
                goal = le32_to_cpu(es->s_first_data_block);
        ext4_get_group_no_and_offset(sb, goal, &goal_group, &goal_offset);

        /* general context state */
        ac->ac_sb = sb;
        ac->ac_inode = ar->inode;
        ac->ac_status = AC_STATUS_CONTINUE;
        ac->ac_flags = ar->flags;

        /* original request; goal and best extents start out from it */
        ac->ac_o_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
        ac->ac_o_ex.fe_group = goal_group;
        ac->ac_o_ex.fe_start = goal_offset;
        ac->ac_o_ex.fe_len = len;
        ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical;
        ac->ac_g_ex = ac->ac_o_ex;
        ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;

        /* we have to define context: we'll work with a file or
         * locality group. this is a policy, actually */
        ext4_mb_group_or_file(ac);

        mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
                        "left: %u/%u, right %u/%u to %swritable\n",
                        (unsigned) ar->len, (unsigned) ar->logical,
                        (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
                        (unsigned) ar->lleft, (unsigned) ar->pleft,
                        (unsigned) ar->lright, (unsigned) ar->pright,
                        inode_is_open_for_write(ar->inode) ? "" : "non-");
}

/*
 * Trim one bucket of a locality group's preallocation list.
 *
 * @sb:            super block the locality group belongs to
 * @lg:            locality group whose lg_prealloc_list[@order] is trimmed
 * @order:         index of the lg_prealloc_list bucket to walk
 * @total_entries: number of entries the caller counted in that bucket
 *
 * Works in two phases.  First, under lg->lg_prealloc_lock, every entry that
 * is neither in use (pa_count != 0) nor already deleted is marked deleted,
 * unlinked from the bucket and collected on a private list; collection stops
 * once @total_entries has been counted down to 5.  Second, with the list
 * lock dropped, each collected pa is released from its block group under
 * the group lock and freed through call_rcu().
 */
static noinline_for_stack void
ext4_mb_discard_lg_preallocations(struct super_block *sb,
                                        struct ext4_locality_group *lg,
                                        int order, int total_entries)
{
        ext4_group_t group = 0;
        struct ext4_buddy e4b;
        LIST_HEAD(discard_list);
        struct ext4_prealloc_space *pa, *tmp;

        mb_debug(sb, "discard locality group preallocation\n");

        /* Phase 1: pick victims while holding the bucket list lock. */
        spin_lock(&lg->lg_prealloc_lock);
        list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
                                pa_node.lg_list,
                                lockdep_is_held(&lg->lg_prealloc_lock)) {
                spin_lock(&pa->pa_lock);
                if (atomic_read(&pa->pa_count)) {
                        /*
                         * This is the pa that we just used
                         * for block allocation. So don't
                         * free that
                         */
                        spin_unlock(&pa->pa_lock);
                        continue;
                }
                if (pa->pa_deleted) {
                        /* somebody else is already tearing this one down */
                        spin_unlock(&pa->pa_lock);
                        continue;
                }
                /* only lg prealloc space */
                BUG_ON(pa->pa_type != MB_GROUP_PA);

                /* seems this one can be freed ... */
                ext4_mb_mark_pa_deleted(sb, pa);
                spin_unlock(&pa->pa_lock);

                /* move from the lg bucket onto the private discard list */
                list_del_rcu(&pa->pa_node.lg_list);
                list_add(&pa->u.pa_tmp_list, &discard_list);

                total_entries--;
                if (total_entries <= 5) {
                        /*
                         * we want to keep only 5 entries
                         * allowing it to grow to 8. This
                         * makes sure we don't call discard
                         * soon for this list.
                         */
                        break;
                }
        }
        spin_unlock(&lg->lg_prealloc_lock);

        /* Phase 2: release and free the collected entries. */
        list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
                int err;

                group = ext4_get_group_number(sb, pa->pa_pstart);
                err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
                                             GFP_NOFS|__GFP_NOFAIL);
                if (err) {
                        /*
                         * NOTE(review): on this path the pa has already been
                         * marked deleted and unlinked from the lg bucket, but
                         * it is neither released from the group nor passed to
                         * call_rcu(), and it stays linked to the on-stack
                         * discard_list.  Confirm whether that is acceptable.
                         */
                        ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
                                       err, group);
                        continue;
                }
                ext4_lock_group(sb, group);
                list_del(&pa->pa_group_list);
                ext4_mb_release_group_pa(&e4b, pa);
                ext4_unlock_group(sb, group);

                ext4_mb_unload_buddy(&e4b);
                list_del(&pa->u.pa_tmp_list);
                /* the pa itself is freed from the RCU callback */
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
}

/*
 * Put ac->ac_pa back on its locality group's bucket list and trim the
 * bucket when it has grown past 8 entries.
 *
 * The caller has incremented pa_count, so the pa cannot be freed at this
 * point.  The caller also holds lg_mutex, so no parallel allocation from
 * this lg is possible and pa_free cannot be updated underneath us.
 *
 * A parallel ext4_mb_discard_group_preallocations() is still possible and
 * can cause the lg_prealloc_list to be updated.
 */

static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
{
        struct ext4_prealloc_space *new_pa = ac->ac_pa;
        struct ext4_locality_group *lg = ac->ac_lg;
        struct super_block *sb = ac->ac_sb;
        struct ext4_prealloc_space *cur;
        int nr_entries = 1;        /* the pa being added counts as one */
        int inserted = 0;
        int bucket;

        /* Bucket index is the position of the highest set bit of pa_free. */
        bucket = fls(new_pa->pa_free) - 1;
        /* The max size of hash table is PREALLOC_TB_SIZE */
        if (bucket >= PREALLOC_TB_SIZE)
                bucket = PREALLOC_TB_SIZE - 1;

        spin_lock(&lg->lg_prealloc_lock);
        list_for_each_entry_rcu(cur, &lg->lg_prealloc_list[bucket],
                                pa_node.lg_list,
                                lockdep_is_held(&lg->lg_prealloc_lock)) {
                spin_lock(&cur->pa_lock);
                if (!cur->pa_deleted) {
                        /*
                         * Link the new pa in front of the first live entry
                         * that has more free space; keep walking afterwards
                         * so that every live entry gets counted.
                         */
                        if (!inserted && new_pa->pa_free < cur->pa_free) {
                                list_add_tail_rcu(&new_pa->pa_node.lg_list,
                                                  &cur->pa_node.lg_list);
                                inserted = 1;
                        }
                        nr_entries++;
                }
                spin_unlock(&cur->pa_lock);
        }
        /* No live entry with more free space: append at the bucket's tail. */
        if (!inserted)
                list_add_tail_rcu(&new_pa->pa_node.lg_list,
                                  &lg->lg_prealloc_list[bucket]);
        spin_unlock(&lg->lg_prealloc_lock);

        /* Now trim the list to be not more than 8 elements */
        if (nr_entries > 8)
                ext4_mb_discard_lg_preallocations(sb, lg, bucket, nr_entries);
}

/*
 * Release every resource the allocation context picked up: the
 * preallocation reference, the bitmap/buddy folio references and, for
 * group allocations, the locality group mutex.  Statistics are collected
 * last.
 */
static void ext4_mb_release_context(struct ext4_allocation_context *ac)
{
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_prealloc_space *pa = ac->ac_pa;

        if (!pa)
                goto release_refs;
        if (pa->pa_type != MB_GROUP_PA)
                goto put_pa;

        /*
         * Advance the group pa past the extent that was just handed out;
         * see comment in ext4_mb_use_group_pa().
         */
        spin_lock(&pa->pa_lock);
        pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
        pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
        pa->pa_free -= ac->ac_b_ex.fe_len;
        pa->pa_len -= ac->ac_b_ex.fe_len;
        spin_unlock(&pa->pa_lock);

        /*
         * The remaining free space decides which bucket the pa belongs
         * to, so unlink it and let ext4_mb_add_n_trim() re-insert it,
         * which also keeps the target list from growing too big.
         */
        if (likely(pa->pa_free)) {
                spin_lock(pa->pa_node_lock.lg_lock);
                list_del_rcu(&pa->pa_node.lg_list);
                spin_unlock(pa->pa_node_lock.lg_lock);
                ext4_mb_add_n_trim(ac);
        }

put_pa:
        ext4_mb_put_pa(ac, ac->ac_sb, pa);

release_refs:
        if (ac->ac_bitmap_folio)
                folio_put(ac->ac_bitmap_folio);
        if (ac->ac_buddy_folio)
                folio_put(ac->ac_buddy_folio);
        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
                mutex_unlock(&ac->ac_lg->lg_mutex);
        ext4_mb_collect_stats(ac);
}

/*
 * Discard preallocations group by group until at least @needed units have
 * been freed or every group has been visited.
 *
 * @sb:     super block to work on
 * @needed: amount to free; 0 is replaced by EXT4_CLUSTERS_PER_GROUP(sb) + 1
 *
 * If the target was not reached and some group reported busy
 * preallocations, the whole scan is repeated, for at most three passes in
 * total.
 *
 * Returns the sum of the values returned by
 * ext4_mb_discard_group_preallocations().
 */
static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
{
        ext4_group_t ngroups = ext4_get_groups_count(sb);
        ext4_group_t grp;
        int total_freed = 0;
        int passes = 0;
        int busy;

        trace_ext4_mb_discard_preallocations(sb, needed);

        if (!needed)
                needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;

        do {
                /* start every pass with a clean busy indication */
                busy = 0;
                for (grp = 0; grp < ngroups && needed > 0; grp++) {
                        int nr;

                        nr = ext4_mb_discard_group_preallocations(sb, grp,
                                                                  &busy);
                        total_freed += nr;
                        needed -= nr;
                        cond_resched();
                }
        } while (needed > 0 && busy && ++passes < 3);

        return total_freed;
}

/*
 * Decide whether a failed allocation is worth retrying.
 *
 * @sb:  super block
 * @ac:  allocation context; EXT4_MB_STRICT_CHECK may be set in ac_flags
 * @seq: in/out discard sequence snapshot kept by the caller
 *
 * Tries to discard preallocations for the originally requested length.  If
 * that freed anything, a retry is warranted.  Otherwise a retry is only
 * requested when strict checking was not yet enabled or the discard
 * sequence sum moved since *@seq was taken; in that case strict checking is
 * switched on and *@seq is refreshed.
 */
static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
                        struct ext4_allocation_context *ac, u64 *seq)
{
        bool again = true;
        int freed;

        freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
        if (!freed) {
                u64 cur_seq = ext4_get_discard_pa_seq_sum();

                if ((ac->ac_flags & EXT4_MB_STRICT_CHECK) && cur_seq == *seq) {
                        /* already strict and nothing changed: give up */
                        again = false;
                } else {
                        ac->ac_flags |= EXT4_MB_STRICT_CHECK;
                        *seq = cur_seq;
                }
        }

        mb_debug(sb, "freed %d, retry ? %s\n", freed, again ? "yes" : "no");
        return again;
}

/*
 * Simple allocator for the Ext4 fast commit replay path.
 *
 * Scans the block bitmaps linearly, beginning at the goal block and
 * wrapping around after the last group, for a clear bit whose block is not
 * excluded by ext4_fc_replay_check_excluded() (i.e. is not going to be in
 * use after fast commit replay).
 *
 * @ar:   allocation request; ar->goal is the starting point, ar->len is set
 *        to 1 on success and to 0 otherwise
 * @errp: receives 0 on success, -ENOSPC when no usable bit was found, or the
 *        error from reading a block bitmap
 *
 * Returns the allocated block number, or 0 on failure.
 */
static ext4_fsblk_t
ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
{
        struct super_block *sb = ar->inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
        ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
        ext4_grpblk_t i = 0;
        ext4_grpblk_t blkoff;
        ext4_group_t group, nr;
        ext4_fsblk_t goal, block;
        struct buffer_head *bitmap_bh;

        /* An out-of-range goal falls back to the first data block. */
        goal = ar->goal;
        if (goal < le32_to_cpu(es->s_first_data_block) ||
            goal >= ext4_blocks_count(es))
                goal = le32_to_cpu(es->s_first_data_block);

        ar->len = 0;
        ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);

        /* Visit each group at most once, starting with the goal's group. */
        for (nr = ext4_get_groups_count(sb); nr > 0; nr--) {
                bitmap_bh = ext4_read_block_bitmap(sb, group);
                if (IS_ERR(bitmap_bh)) {
                        *errp = PTR_ERR(bitmap_bh);
                        pr_warn("Failed to read block bitmap\n");
                        return 0;
                }

                /* Skip over clear bits that replay has excluded. */
                for (;;) {
                        i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
                                                  blkoff);
                        if (i >= max)
                                break;
                        if (!ext4_fc_replay_check_excluded(sb,
                                ext4_group_first_block_no(sb, group) +
                                EXT4_C2B(sbi, i)))
                                break;
                        blkoff = i + 1;
                }
                brelse(bitmap_bh);

                /* Found a usable bit in this group. */
                if (i < max)
                        break;

                /* Move on to the next group, wrapping to group 0. */
                group++;
                if (group >= ext4_get_groups_count(sb))
                        group = 0;
                blkoff = 0;
        }

        if (i >= max) {
                *errp = -ENOSPC;
                return 0;
        }

        block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i);
        ext4_mb_mark_bb(sb, block, 1, true);
        ar->len = 1;

        *errp = 0;
        return block;
}

/*
 * Main entry point into mballoc to allocate blocks
 * it tries to use preallocation first, then falls back
 * to usual allocation
 *
 * @handle: journal handle passed on to ext4_mb_mark_diskspace_used()
 * @ar:     allocation request.  ar->len is updated in place: it may be
 *          shrunk while claiming free clusters or quota, is set to the
 *          length of the extent actually allocated on success, and is 0
 *          on failure.  ar->flags may gain EXT4_MB_USE_ROOT_BLOCKS (quota
 *          files) and EXT4_MB_HINT_NOPREALLOC (quota pressure).
 * @errp:   receives 0 or a negative errno; -ENOSPC, -EDQUOT and -ENOMEM
 *          are set directly here, others come from the helpers called.
 *
 * While EXT4_FC_REPLAY is set in s_mount_state the request is handed to
 * ext4_mb_new_blocks_simple() instead.
 *
 * Returns the first allocated physical block, or 0 on failure.
 */
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
                                struct ext4_allocation_request *ar, int *errp)
{
        struct ext4_allocation_context *ac = NULL;
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        ext4_fsblk_t block = 0;
        unsigned int inquota = 0;
        unsigned int reserv_clstrs = 0;
        int retries = 0;
        u64 seq;

        might_sleep();
        sb = ar->inode->i_sb;
        sbi = EXT4_SB(sb);

        trace_ext4_request_blocks(ar);
        if (sbi->s_mount_state & EXT4_FC_REPLAY)
                return ext4_mb_new_blocks_simple(ar, errp);

        /* Allow to use superuser reservation for quota file */
        if (ext4_is_quota_file(ar->inode))
                ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;

        if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
                /* Without delayed allocation we need to verify
                 * there is enough free blocks to do block allocation
                 * and verify allocation doesn't exceed the quota limits.
                 */
                /* halve the request until the free-cluster claim succeeds */
                while (ar->len &&
                        ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {

                        /* let others to free the space */
                        cond_resched();
                        ar->len = ar->len >> 1;
                }
                if (!ar->len) {
                        ext4_mb_show_pa(sb);
                        *errp = -ENOSPC;
                        return 0;
                }
                /* remember what was claimed so it can be given back at out: */
                reserv_clstrs = ar->len;
                if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
                        dquot_alloc_block_nofail(ar->inode,
                                                 EXT4_C2B(sbi, ar->len));
                } else {
                        /* shrink by one until the quota charge succeeds */
                        while (ar->len &&
                                dquot_alloc_block(ar->inode,
                                                  EXT4_C2B(sbi, ar->len))) {

                                ar->flags |= EXT4_MB_HINT_NOPREALLOC;
                                ar->len--;
                        }
                }
                /* amount charged to quota; the unused part is refunded at out: */
                inquota = ar->len;
                if (ar->len == 0) {
                        *errp = -EDQUOT;
                        goto out;
                }
        }

        ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
        if (!ac) {
                ar->len = 0;
                *errp = -ENOMEM;
                goto out;
        }

        ext4_mb_initialize_context(ac, ar);

        ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
        /* snapshot handed to ext4_mb_discard_preallocations_should_retry() */
        seq = this_cpu_read(discard_pa_seq);
        if (!ext4_mb_use_preallocated(ac)) {
                ac->ac_op = EXT4_MB_HISTORY_ALLOC;
                ext4_mb_normalize_request(ac, ar);

                *errp = ext4_mb_pa_alloc(ac);
                if (*errp)
                        goto errout;
repeat:
                /* allocate space in core */
                *errp = ext4_mb_regular_allocator(ac);
                /*
                 * pa allocated above is added to grp->bb_prealloc_list only
                 * when we were able to allocate some block i.e. when
                 * ac->ac_status == AC_STATUS_FOUND.
                 * And error from above mean ac->ac_status != AC_STATUS_FOUND
                 * So we have to free this pa here itself.
                 */
                if (*errp) {
                        ext4_mb_pa_put_free(ac);
                        ext4_discard_allocated_blocks(ac);
                        goto errout;
                }
                /*
                 * The extent found does not exceed the original request, so
                 * the pa allocated above is dropped here.
                 * NOTE(review): presumably because nothing is left over to
                 * preallocate -- confirm against ext4_mb_regular_allocator().
                 */
                if (ac->ac_status == AC_STATUS_FOUND &&
                        ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
                        ext4_mb_pa_put_free(ac);
        }
        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
                if (*errp) {
                        ext4_discard_allocated_blocks(ac);
                        goto errout;
                } else {
                        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
                        ar->len = ac->ac_b_ex.fe_len;
                }
        } else {
                /* at most two retries, each gated on the discard helper */
                if (++retries < 3 &&
                    ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
                        goto repeat;
                /*
                 * If block allocation fails then the pa allocated above
                 * needs to be freed here itself.
                 */
                ext4_mb_pa_put_free(ac);
                *errp = -ENOSPC;
        }

        if (*errp) {
errout:
                /* report nothing allocated, both in the context and to the caller */
                ac->ac_b_ex.fe_len = 0;
                ar->len = 0;
                ext4_mb_show_ac(ac);
        }
        ext4_mb_release_context(ac);
        kmem_cache_free(ext4_ac_cachep, ac);
out:
        /* refund the quota charged for clusters that were not allocated */
        if (inquota && ar->len < inquota)
                dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
        if (!ar->len) {
                if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
                        /* release all the reserved blocks if non delalloc */
                        percpu_counter_sub(&sbi->s_dirtyclusters_counter,
                                                reserv_clstrs);
        }

        trace_ext4_allocate_blocks(ar, (unsigned long long)block);

        return block;
}

/*
 * We can merge two free data extents only if the physical blocks
 * are contiguous, AND the extents were freed by the same transaction,
 * AND the blocks are associated with the same group.
 */
static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
                                        struct ext4_free_data *entry,
                                        struct ext4_free_data *new_entry,
                                        struct rb_root *entry_rb_root)
{
        if ((entry->efd_tid != new_entry->efd_tid) ||
            (entry->efd_group != new_entry->efd_group))
                return;
        if (entry->efd_start_cluster + entry->efd_count ==
            new_entry->efd_start_cluster) {
                new_entry->efd_start_cluster = entry->efd_start_cluster;
                new_entry->efd_count += entry->efd_count;
        } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
                   entry->efd_start_cluster) {
                new_entry->efd_count += entry->efd_count;
        } else
                return;
        spin_lock(&sbi->s_md_lock);
        list_del(&entry->efd_list);
        spin_unlock(&sbi->s_md_lock);
        rb_erase(&entry->efd_node, entry_rb_root);
        kmem_cache_free(ext4_free_data_cachep, entry);
}

/*
 * Record a freed cluster range in the group's tree of freed extents.
 *
 * @handle:    transaction handle; must be valid (BUG otherwise)
 * @e4b:       loaded buddy of the group; both its bitmap and buddy folios
 *             must be present (BUG otherwise)
 * @new_entry: caller-allocated descriptor carrying start cluster, count,
 *             group and tid.  It is linked into the tree, or freed here if
 *             its start cluster already lies inside an existing entry.
 *
 * The entry is inserted into bd_info->bb_free_root ordered by start
 * cluster, merged with its immediate neighbours where
 * ext4_try_merge_freed_extent() allows it, appended to
 * sbi->s_freed_data_list[tid & 1] and accounted in s_mb_free_pending.
 *
 * The caller visible in this file, ext4_mb_clear_bb(), takes the group lock
 * before calling this function.
 */
static noinline_for_stack void
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
                      struct ext4_free_data *new_entry)
{
        ext4_group_t group = e4b->bd_group;
        ext4_grpblk_t cluster;
        ext4_grpblk_t clusters = new_entry->efd_count;
        struct ext4_free_data *entry;
        struct ext4_group_info *db = e4b->bd_info;
        struct super_block *sb = e4b->bd_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct rb_node **n = &db->bb_free_root.rb_node, *node;
        struct rb_node *parent = NULL, *new_node;

        BUG_ON(!ext4_handle_valid(handle));
        BUG_ON(e4b->bd_bitmap_folio == NULL);
        BUG_ON(e4b->bd_buddy_folio == NULL);

        new_node = &new_entry->efd_node;
        cluster = new_entry->efd_start_cluster;

        if (!*n) {
                /* first free block extent. We need to
                   protect buddy cache from being freed,
                 * otherwise we'll refresh it from
                 * on-disk bitmap and lose not-yet-available
                 * blocks */
                folio_get(e4b->bd_buddy_folio);
                folio_get(e4b->bd_bitmap_folio);
        }
        /* descend to the insertion point, keyed by start cluster */
        while (*n) {
                parent = *n;
                entry = rb_entry(parent, struct ext4_free_data, efd_node);
                if (cluster < entry->efd_start_cluster)
                        n = &(*n)->rb_left;
                else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
                        n = &(*n)->rb_right;
                else {
                        /* start cluster falls inside an existing entry */
                        ext4_grp_locked_error(sb, group, 0,
                                ext4_group_first_block_no(sb, group) +
                                EXT4_C2B(sbi, cluster),
                                "Block already on to-be-freed list");
                        kmem_cache_free(ext4_free_data_cachep, new_entry);
                        return;
                }
        }

        rb_link_node(new_node, parent, n);
        rb_insert_color(new_node, &db->bb_free_root);

        /* Now try to see the extent can be merged to left and right */
        node = rb_prev(new_node);
        if (node) {
                entry = rb_entry(node, struct ext4_free_data, efd_node);
                ext4_try_merge_freed_extent(sbi, entry, new_entry,
                                            &(db->bb_free_root));
        }

        node = rb_next(new_node);
        if (node) {
                entry = rb_entry(node, struct ext4_free_data, efd_node);
                ext4_try_merge_freed_extent(sbi, entry, new_entry,
                                            &(db->bb_free_root));
        }

        /*
         * Queue the entry on one of the two per-superblock lists, selected
         * by the low bit of the tid.  The pending counter grows by the
         * count the entry had on entry to this function ("clusters"), not
         * by any neighbours merged into it above.
         */
        spin_lock(&sbi->s_md_lock);
        list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]);
        sbi->s_mb_free_pending += clusters;
        spin_unlock(&sbi->s_md_lock);
}

/*
 * Minimal free path used by ext4_free_blocks() while EXT4_FC_REPLAY is set.
 *
 * Translates @block into a group and an offset within it and hands the
 * range to ext4_mb_mark_context() without a journal handle, asking for the
 * marked-bit check and a synchronous update.  The caller in this file
 * passes @count already converted to clusters.
 */
static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
                                        unsigned long count)
{
        int mark_flags = EXT4_MB_BITMAP_MARKED_CHECK | EXT4_MB_SYNC_UPDATE;
        struct super_block *sb = inode->i_sb;
        ext4_grpblk_t offset;
        ext4_group_t grp;

        ext4_get_group_no_and_offset(sb, block, &grp, &offset);
        ext4_mb_mark_context(NULL, sb, false, grp, offset, count,
                             mark_flags, NULL);
}

/**
 * ext4_mb_clear_bb() -- helper function for freeing blocks.
 *                        Used by ext4_free_blocks()
 * @handle:                handle for this transaction
 * @inode:                inode
 * @block:                starting physical block to be freed
 * @count:                number of blocks to be freed
 * @flags:                flags used by ext4_free_blocks
 *
 * Frees the range one block group at a time: a range that crosses a group
 * boundary is cut at the boundary and the remainder is handled by jumping
 * back to do_more.  For each piece the on-disk bitmap is cleared through
 * ext4_mb_mark_context(), then the buddy is updated either immediately
 * (mb_free_blocks()) or deferred through ext4_mb_free_metadata().
 * Unless EXT4_FREE_BLOCKS_RERESERVE_CLUSTER is set, quota and
 * s_freeclusters_counter are updated here as well.
 *
 * Errors are reported through ext4_error()/ext4_std_error(); nothing is
 * returned to the caller.
 */
static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
                               ext4_fsblk_t block, unsigned long count,
                               int flags)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_group_info *grp;
        unsigned int overflow;
        ext4_grpblk_t bit;
        ext4_group_t block_group;
        struct ext4_sb_info *sbi;
        struct ext4_buddy e4b;
        unsigned int count_clusters;
        int err = 0;
        int mark_flags = 0;
        ext4_grpblk_t changed;

        sbi = EXT4_SB(sb);

        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
            !ext4_inode_block_valid(inode, block, count)) {
                ext4_error(sb, "Freeing blocks in system zone - "
                           "Block = %llu, count = %lu", block, count);
                /* err = 0. ext4_std_error should be a no op */
                goto error_out;
        }
        flags |= EXT4_FREE_BLOCKS_VALIDATED;

do_more:
        overflow = 0;
        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);

        /*
         * No buddy is loaded at this point (first pass, or unloaded before
         * jumping back here), so a plain return leaves nothing to clean up.
         */
        grp = ext4_get_group_info(sb, block_group);
        if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
                return;

        /*
         * Check to see if we are freeing blocks across a group
         * boundary.
         */
        if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
                /* keep only the part inside this group; the rest is "overflow" */
                overflow = EXT4_C2B(sbi, bit) + count -
                        EXT4_BLOCKS_PER_GROUP(sb);
                count -= overflow;
                /* The range changed so it's no longer validated */
                flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
        }
        count_clusters = EXT4_NUM_B2C(sbi, count);
        trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);

        /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
        err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
                                     GFP_NOFS|__GFP_NOFAIL);
        if (err)
                goto error_out;

        /* re-validate if the range was trimmed above or on a previous pass */
        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
            !ext4_inode_block_valid(inode, block, count)) {
                ext4_error(sb, "Freeing blocks in system zone - "
                           "Block = %llu, count = %lu", block, count);
                /* err = 0. ext4_std_error should be a no op */
                goto error_clean;
        }

#ifdef AGGRESSIVE_CHECK
        mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK;
#endif
        err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
                                   count_clusters, mark_flags, &changed);


        /*
         * NOTE(review): this reads "changed" even when err is set, so it
         * relies on ext4_mb_mark_context() always storing to it -- confirm.
         */
        if (err && changed == 0)
                goto error_clean;

#ifdef AGGRESSIVE_CHECK
        BUG_ON(changed != count_clusters);
#endif

        /*
         * We need to make sure we don't reuse the freed block until after the
         * transaction is committed. We make an exception if the inode is to be
         * written in writeback mode since writeback mode has weak data
         * consistency guarantees.
         */
        if (ext4_handle_valid(handle) &&
            ((flags & EXT4_FREE_BLOCKS_METADATA) ||
             !ext4_should_writeback_data(inode))) {
                struct ext4_free_data *new_entry;
                /*
                 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
                 * to fail.
                 */
                new_entry = kmem_cache_alloc(ext4_free_data_cachep,
                                GFP_NOFS|__GFP_NOFAIL);
                new_entry->efd_start_cluster = bit;
                new_entry->efd_group = block_group;
                new_entry->efd_count = count_clusters;
                new_entry->efd_tid = handle->h_transaction->t_tid;

                /* group lock is dropped by the common unlock below */
                ext4_lock_group(sb, block_group);
                ext4_mb_free_metadata(handle, &e4b, new_entry);
        } else {
                if (test_opt(sb, DISCARD)) {
                        err = ext4_issue_discard(sb, block_group, bit,
                                                 count_clusters);
                        /*
                         * Ignore EOPNOTSUPP error. This is consistent with
                         * what happens when using journal.
                         */
                        if (err == -EOPNOTSUPP)
                                err = 0;
                        if (err)
                                ext4_msg(sb, KERN_WARNING, "discard request in"
                                         " group:%u block:%d count:%lu failed"
                                         " with %d", block_group, bit, count,
                                         err);
                } else
                        EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);

                /* group lock is dropped by the common unlock below */
                ext4_lock_group(sb, block_group);
                mb_free_blocks(inode, &e4b, bit, count_clusters);
        }

        ext4_unlock_group(sb, block_group);

        /*
         * on a bigalloc file system, defer the s_freeclusters_counter
         * update to the caller (ext4_remove_space and friends) so they
         * can determine if a cluster freed here should be rereserved
         */
        if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
                if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
                        dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
                percpu_counter_add(&sbi->s_freeclusters_counter,
                                   count_clusters);
        }

        /*
         * Continue with the part beyond the group boundary.  A pending
         * error (e.g. a failed discard) stops the loop instead.
         */
        if (overflow && !err) {
                block += count;
                count = overflow;
                ext4_mb_unload_buddy(&e4b);
                /* The range changed so it's no longer validated */
                flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
                goto do_more;
        }

error_clean:
        ext4_mb_unload_buddy(&e4b);
error_out:
        ext4_std_error(sb, err);
}

/**
 * ext4_free_blocks() -- Free given blocks and update quota
 * @handle:                handle for this transaction
 * @inode:                inode
 * @bh:                        optional buffer of the block to be freed
 * @block:                starting physical block to be freed
 * @count:                number of blocks to be freed
 * @flags:                flags used by ext4_free_blocks
 *
 * If @bh is given, @block must either be 0 (it is then taken from
 * @bh->b_blocknr) or match @bh->b_blocknr.  While EXT4_FC_REPLAY is set the
 * request is handed to ext4_free_blocks_simple() and nothing else is done.
 *
 * Otherwise the range is validated (unless EXT4_FREE_BLOCKS_VALIDATED is
 * set), widened or narrowed to cluster boundaries according to the
 * EXT4_FREE_BLOCKS_NOFREE_{FIRST,LAST}_CLUSTER flags, optionally passed
 * block by block to ext4_forget(), and finally freed by ext4_mb_clear_bb().
 * The function returns early without freeing anything if validation fails
 * or if trimming a partial cluster consumes the whole range.
 */
void ext4_free_blocks(handle_t *handle, struct inode *inode,
                      struct buffer_head *bh, ext4_fsblk_t block,
                      unsigned long count, int flags)
{
        struct super_block *sb = inode->i_sb;
        unsigned int overflow;
        struct ext4_sb_info *sbi;

        sbi = EXT4_SB(sb);

        if (bh) {
                if (block)
                        BUG_ON(block != bh->b_blocknr);
                else
                        block = bh->b_blocknr;
        }

        if (sbi->s_mount_state & EXT4_FC_REPLAY) {
                /* the simple path takes its count in clusters */
                ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
                return;
        }

        might_sleep();

        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
            !ext4_inode_block_valid(inode, block, count)) {
                ext4_error(sb, "Freeing blocks not in datazone - "
                           "block = %llu, count = %lu", block, count);
                return;
        }
        flags |= EXT4_FREE_BLOCKS_VALIDATED;

        ext4_debug("freeing block %llu\n", block);
        trace_ext4_free_blocks(inode, block, count, flags);

        if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
                /* a single buffer can only stand for a single block */
                BUG_ON(count > 1);

                ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
                            inode, bh, block);
        }

        /*
         * If the extent to be freed does not begin on a cluster
         * boundary, we need to deal with partial clusters at the
         * beginning and end of the extent.  Normally we will free
         * blocks at the beginning or the end unless we are explicitly
         * requested to avoid doing so.
         */
        overflow = EXT4_PBLK_COFF(sbi, block);
        if (overflow) {
                if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
                        /* skip forward to the start of the next cluster */
                        overflow = sbi->s_cluster_ratio - overflow;
                        block += overflow;
                        if (count > overflow)
                                count -= overflow;
                        else
                                return;
                } else {
                        /* extend backwards to the start of this cluster */
                        block -= overflow;
                        count += overflow;
                }
                /* The range changed so it's no longer validated */
                flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
        }
        overflow = EXT4_LBLK_COFF(sbi, count);
        if (overflow) {
                if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
                        /* drop the trailing partial cluster */
                        if (count > overflow)
                                count -= overflow;
                        else
                                return;
                } else
                        /* round the length up to a whole cluster */
                        count += sbi->s_cluster_ratio - overflow;
                /* The range changed so it's no longer validated */
                flags &= ~EXT4_FREE_BLOCKS_VALIDATED;
        }

        if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
                /*
                 * The index has the same type as @count so the loop bound
                 * comparison is not mixed-sign and the index cannot
                 * overflow before reaching @count.
                 */
                unsigned long i;
                int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;

                for (i = 0; i < count; i++) {
                        cond_resched();
                        /* only metadata blocks are looked up in the cache */
                        if (is_metadata)
                                bh = sb_find_get_block(inode->i_sb, block + i);
                        ext4_forget(handle, is_metadata, inode, bh, block + i);
                }
        }

        ext4_mb_clear_bb(handle, inode, block, count, flags);
}

/**
 * ext4_group_add_blocks() -- Add given blocks to an existing group
 * @handle:                        handle to this transaction
 * @sb:                                super block
 * @block:                        start physical block to add to the block group
 * @count:                        number of blocks to add (they end up marked free)
 *
 * This marks the blocks as free in the bitmap and buddy.
 *
 * The block range is converted to a cluster range. That range must not
 * extend past the end of the group containing @block and must pass
 * ext4_sb_block_valid(). The free clusters counter is increased by the
 * number of clusters ext4_mb_mark_context() reports as changed.
 *
 * Return: 0 when the cluster count is zero or on success, -EINVAL when the
 * range crosses the group boundary or fails validation, otherwise the error
 * returned by ext4_mb_load_buddy() or ext4_mb_mark_context().
 */
int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
                         ext4_fsblk_t block, unsigned long count)
{
        ext4_group_t block_group;
        ext4_grpblk_t bit;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_buddy e4b;
        int err = 0;
        /* Clusters holding the first and the last block of the range. */
        ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
        ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
        unsigned long cluster_count = last_cluster - first_cluster + 1;
        /* Filled in by ext4_mb_mark_context() below. */
        ext4_grpblk_t changed;

        ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);

        if (cluster_count == 0)
                return 0;

        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
        /*
         * Check to see if we are freeing blocks across a group
         * boundary.
         */
        if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
                ext4_warning(sb, "too many blocks added to group %u",
                             block_group);
                err = -EINVAL;
                goto error_out;
        }

        err = ext4_mb_load_buddy(sb, block_group, &e4b);
        if (err)
                goto error_out;

        /* From here on the buddy is loaded: failures must go to error_clean. */
        if (!ext4_sb_block_valid(sb, NULL, block, count)) {
                ext4_error(sb, "Adding blocks in system zones - "
                           "Block = %llu, count = %lu",
                           block, count);
                err = -EINVAL;
                goto error_clean;
        }

        err = ext4_mb_mark_context(handle, sb, false, block_group, bit,
                                   cluster_count, EXT4_MB_BITMAP_MARKED_CHECK,
                                   &changed);
        /*
         * Only bail out when nothing was changed; if an error is reported
         * together with a non-zero change count the buddy and the counter
         * are still updated below and err is returned at the end.
         */
        if (err && changed == 0)
                goto error_clean;

        if (changed != cluster_count)
                ext4_error(sb, "bit already cleared in group %u", block_group);

        /* The buddy update is done under the group lock. */
        ext4_lock_group(sb, block_group);
        mb_free_blocks(NULL, &e4b, bit, cluster_count);
        ext4_unlock_group(sb, block_group);
        /* Account only the clusters that were reported as changed. */
        percpu_counter_add(&sbi->s_freeclusters_counter,
                           changed);

error_clean:
        ext4_mb_unload_buddy(&e4b);
error_out:
        /* Reached on the success path as well, with whatever err holds. */
        ext4_std_error(sb, err);
        return err;
}

/**
 * ext4_trim_extent -- function to TRIM one single free extent in the group
 * @sb:                super block for the file system
 * @start:        starting block of the free extent in the alloc. group
 * @count:        number of blocks to TRIM
 * @e4b:        ext4 buddy for the group
 *
 * Trim "count" blocks starting at "start" in the "group". To ensure that no
 * one will allocate those blocks, mark them as used in the buddy bitmap while
 * the discard is in flight. This must be called under the group lock; the
 * lock is dropped around the discard and held again on return.
 *
 * Return: the value returned by ext4_issue_discard().
 */
static int ext4_trim_extent(struct super_block *sb,
                int start, int count, struct ext4_buddy *e4b)
__releases(bitlock)
__acquires(bitlock)
{
        struct ext4_free_extent ex;
        ext4_group_t group = e4b->bd_group;
        int ret = 0;

        trace_ext4_trim_extent(sb, group, start, count);

        assert_spin_locked(ext4_group_lock_ptr(sb, group));

        ex.fe_start = start;
        ex.fe_group = group;
        ex.fe_len = count;

        /*
         * Mark blocks used, so no one can reuse them while
         * being trimmed.
         */
        mb_mark_used(e4b, &ex);
        /* The discard is issued without the group lock held. */
        ext4_unlock_group(sb, group);
        ret = ext4_issue_discard(sb, group, start, count);
        ext4_lock_group(sb, group);
        /* Give the extent back to the buddy whether or not the discard worked. */
        mb_free_blocks(NULL, e4b, start, ex.fe_len);
        return ret;
}

/*
 * Return the index of the last cluster in group @grp.
 *
 * Every group before the final one holds EXT4_CLUSTERS_PER_GROUP(sb)
 * clusters; the final group's size is derived from the filesystem block
 * count and the group's first block number.
 */
static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
                                           ext4_group_t grp)
{
        unsigned long clusters;

        if (grp >= (ext4_get_groups_count(sb) - 1)) {
                /* Final group: count the blocks up to the end of the fs. */
                clusters = (ext4_blocks_count(EXT4_SB(sb)->s_es) -
                            ext4_group_first_block_no(sb, grp))
                           >> EXT4_CLUSTER_BITS(sb);
                return clusters - 1;
        }

        clusters = EXT4_CLUSTERS_PER_GROUP(sb);
        return clusters - 1;
}

static bool ext4_trim_interrupted(void)
{
        return fatal_signal_pending(current) || freezing(current);
}

/*
 * Scan the block bitmap of e4b's group between @start and @max (group
 * relative, inclusive) and pass every free extent of at least @minblocks
 * to ext4_trim_extent().
 *
 * Called with the group lock held. The lock is dropped and re-taken inside
 * ext4_trim_extent() and around cond_resched(), and is held again on return.
 *
 * Returns the number of trimmed units accumulated so far. A discard failure
 * other than -EOPNOTSUPP or an interruption ends the scan early, but the
 * error code itself is not propagated. Returns 0 right away when the
 * group's block bitmap is flagged corrupt.
 */
static int ext4_try_to_trim_range(struct super_block *sb,
                struct ext4_buddy *e4b, ext4_grpblk_t start,
                ext4_grpblk_t max, ext4_grpblk_t minblocks)
__acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
        ext4_grpblk_t next, count, free_count, last, origin_start;
        bool set_trimmed = false;
        void *bitmap;

        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
                return 0;

        last = ext4_last_grp_cluster(sb, e4b->bd_group);
        bitmap = e4b->bd_bitmap;
        /* The request covers the whole group: it may be marked as trimmed. */
        if (start == 0 && max >= last)
                set_trimmed = true;
        /* Remember the requested start; start itself is advanced below. */
        origin_start = start;
        /* Nothing is free before bb_first_free, so skip ahead to it. */
        start = max(e4b->bd_info->bb_first_free, start);
        count = 0;
        free_count = 0;

        while (start <= max) {
                /* Find the beginning of the next free extent within [start, max]. */
                start = mb_find_next_zero_bit(bitmap, max + 1, start);
                if (start > max)
                        break;

                /*
                 * The end of the extent is searched up to the end of the
                 * group, not just up to max.
                 */
                next = mb_find_next_bit(bitmap, last + 1, start);
                if (origin_start == 0 && next >= last)
                        set_trimmed = true;

                if ((next - start) >= minblocks) {
                        int ret = ext4_trim_extent(sb, start, next - start, e4b);

                        /* -EOPNOTSUPP is tolerated and the extent still counted. */
                        if (ret && ret != -EOPNOTSUPP)
                                return count;
                        count += next - start;
                }
                /* Free space seen so far, whether or not it was trimmed. */
                free_count += next - start;
                start = next + 1;

                /* Early return: the group is not marked as trimmed. */
                if (ext4_trim_interrupted())
                        return count;

                if (need_resched()) {
                        ext4_unlock_group(sb, e4b->bd_group);
                        cond_resched();
                        ext4_lock_group(sb, e4b->bd_group);
                }

                /* The remaining free space cannot form an extent of minblocks. */
                if ((e4b->bd_info->bb_free - free_count) < minblocks)
                        break;
        }

        if (set_trimmed)
                EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);

        return count;
}

/**
 * ext4_trim_all_free -- trim the free space of one allocation group
 * @sb:                        super block for file system
 * @group:                group to be trimmed
 * @start:                first group block to examine
 * @max:                last group block to examine
 * @minblocks:                minimum extent block count
 *
 * Loads the buddy for @group, takes the group lock and lets
 * ext4_try_to_trim_range() walk the block bitmap, trimming every free
 * extent it finds. The walk is skipped when the group is already flagged
 * as trimmed and @minblocks is not below the minimum used by the last
 * recorded trim.
 *
 * Return: the value returned by ext4_try_to_trim_range(), 0 when the walk
 * was skipped, or the error from ext4_mb_load_buddy().
 */
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
                   ext4_grpblk_t start, ext4_grpblk_t max,
                   ext4_grpblk_t minblocks)
{
        struct ext4_buddy buddy;
        int err;
        int trimmed = 0;

        trace_ext4_trim_all_free(sb, group, start, max);

        err = ext4_mb_load_buddy(sb, group, &buddy);
        if (err) {
                ext4_warning(sb, "Error %d loading buddy information for %u",
                             err, group);
                return err;
        }

        ext4_lock_group(sb, group);

        /* Only rescan a group that needs it; otherwise trimmed stays 0. */
        if (!EXT4_MB_GRP_WAS_TRIMMED(buddy.bd_info) ||
            minblocks < EXT4_SB(sb)->s_last_trim_minblks)
                trimmed = ext4_try_to_trim_range(sb, &buddy, start, max,
                                                 minblocks);

        ext4_unlock_group(sb, group);
        ext4_mb_unload_buddy(&buddy);

        ext4_debug("trimmed %d blocks in the group %d\n",
                trimmed, group);

        return trimmed;
}

/**
 * ext4_trim_fs() -- trim ioctl handle function
 * @sb:                        superblock for filesystem
 * @range:                fstrim_range structure
 *
 * The fields of @range are interpreted as follows:
 * start:        First Byte to trim
 * len:                number of Bytes to trim from start
 * minlen:        minimum extent length in Bytes
 *
 * ext4_trim_fs goes through all allocation groups containing Bytes from
 * start to start+len. For each such group ext4_trim_all_free function
 * is invoked to trim all free space.
 *
 * On every path except the -EINVAL one, range->len is overwritten with the
 * number of bytes that were trimmed.
 *
 * Return: 0 on success (including when there was nothing to do), -EINVAL
 * for an unusable range, or the error from ext4_mb_init_group() or
 * ext4_trim_all_free().
 */
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
        unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev);
        struct ext4_group_info *grp;
        ext4_group_t group, first_group, last_group;
        ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
        uint64_t start, end, minlen, trimmed = 0;
        ext4_fsblk_t first_data_blk =
                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
        ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
        int ret = 0;

        /* Convert the byte based request: start/end in blocks, minlen in clusters. */
        start = range->start >> sb->s_blocksize_bits;
        end = start + (range->len >> sb->s_blocksize_bits) - 1;
        minlen = EXT4_NUM_B2C(EXT4_SB(sb),
                              range->minlen >> sb->s_blocksize_bits);

        /* Rejected requests return without touching range->len. */
        if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
            start >= max_blks ||
            range->len < sb->s_blocksize)
                return -EINVAL;
        /* No point to try to trim less than discard granularity */
        if (range->minlen < discard_granularity) {
                minlen = EXT4_NUM_B2C(EXT4_SB(sb),
                                discard_granularity >> sb->s_blocksize_bits);
                if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
                        goto out;
        }
        /* Clamp the block range to [first_data_blk, max_blks - 1]. */
        if (end >= max_blks - 1)
                end = max_blks - 1;
        if (end <= first_data_blk)
                goto out;
        if (start < first_data_blk)
                start = first_data_blk;

        /* Determine first and last group to examine based on start and end */
        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
                                     &first_group, &first_cluster);
        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
                                     &last_group, &last_cluster);

        /* end now represents the last cluster to discard in this group */
        end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;

        for (group = first_group; group <= last_group; group++) {
                /* Stops with ret unchanged, so an interrupted run is not an error. */
                if (ext4_trim_interrupted())
                        break;
                grp = ext4_get_group_info(sb, group);
                if (!grp)
                        continue;
                /* We only do this if the grp has never been initialized */
                if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                        ret = ext4_mb_init_group(sb, group, GFP_NOFS);
                        if (ret)
                                break;
                }

                /*
                 * For all the groups except the last one, last cluster will
                 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
                 * change it for the last group, note that last_cluster is
                 * already computed earlier by ext4_get_group_no_and_offset()
                 */
                if (group == last_group)
                        end = last_cluster;
                /* Skip groups whose total free count is already below minlen. */
                if (grp->bb_free >= minlen) {
                        cnt = ext4_trim_all_free(sb, group, first_cluster,
                                                 end, minlen);
                        /* A negative count is an error code. */
                        if (cnt < 0) {
                                ret = cnt;
                                break;
                        }
                        trimmed += cnt;
                }

                /*
                 * For every group except the first one, we are sure
                 * that the first cluster to discard will be cluster #0.
                 */
                first_cluster = 0;
        }

        /* Record the minimum length used, for the check in ext4_trim_all_free(). */
        if (!ret)
                EXT4_SB(sb)->s_last_trim_minblks = minlen;

out:
        /* Report the trimmed amount back to the caller in bytes. */
        range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
        return ret;
}

/*
 * Report every free extent of @group that starts within [start, end] to
 * @formatter, passing @priv through unchanged.
 *
 * The group lock is held while the bitmap is scanned and released around
 * each @formatter call. The walk stops at the first non-zero value returned
 * by @formatter, and that value is returned; a failure to load the buddy is
 * returned as is. Otherwise the result is 0.
 */
int
ext4_mballoc_query_range(
        struct super_block                *sb,
        ext4_group_t                        group,
        ext4_grpblk_t                        start,
        ext4_grpblk_t                        end,
        ext4_mballoc_query_range_fn        formatter,
        void                                *priv)
{
        struct ext4_buddy                e4b;
        ext4_grpblk_t                        extent_end;
        void                                *bitmap;
        int                                error;

        error = ext4_mb_load_buddy(sb, group, &e4b);
        if (error)
                return error;
        bitmap = e4b.bd_bitmap;

        ext4_lock_group(sb, group);

        /* Clamp the window: no free bit before bb_first_free, none past the group. */
        start = max(e4b.bd_info->bb_first_free, start);
        if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
                end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;

        for (; start <= end; start = extent_end + 1) {
                start = mb_find_next_zero_bit(bitmap, end + 1, start);
                if (start > end)
                        break;
                extent_end = mb_find_next_bit(bitmap, end + 1, start);

                /* The callback runs without the group lock. */
                ext4_unlock_group(sb, group);
                error = formatter(sb, group, start, extent_end - start, priv);
                if (error)
                        goto out_unload;
                ext4_lock_group(sb, group);
        }

        ext4_unlock_group(sb, group);
out_unload:
        ext4_mb_unload_buddy(&e4b);

        return error;
}

#ifdef CONFIG_EXT4_KUNIT_TESTS
#include "mballoc-test.c"
#endif



































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _ASM_X86_NOSPEC_BRANCH_H_
#define _ASM_X86_NOSPEC_BRANCH_H_

#include <linux/static_key.h>
#include <linux/objtool.h>
#include <linux/linkage.h>

#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
#include <asm/unwind_hints.h>
#include <asm/percpu.h>
#include <asm/current.h>

/*
 * Call depth tracking for Intel SKL CPUs to address the RSB underflow
 * issue in software.
 *
 * The tracking does not use a counter. It uses arithmetic shift
 * right on call entry and logical shift left on return.
 *
 * The depth tracking variable is initialized to 0x8000.... when the call
 * depth is zero. The arithmetic shift right sign extends the MSB and
 * saturates after the 12th call. The shift count is 5 for both directions
 * so the tracking covers 12 nested calls.
 *
 *  Call
 *  0: 0x8000000000000000        0x0000000000000000
 *  1: 0xfc00000000000000        0xf000000000000000
 * ...
 * 11: 0xfffffffffffffff8        0xfffffffffffffc00
 * 12: 0xffffffffffffffff        0xffffffffffffffe0
 *
 * After a return buffer fill the depth is credited 12 calls before the
 * next stuffing has to take place.
 *
 * There is an inaccuracy for situations like this:
 *
 *  10 calls
 *   5 returns
 *   3 calls
 *   4 returns
 *   3 calls
 *   ....
 *
 * The shift count might cause this to be off by one in either direction,
 * but there is still a cushion vs. the RSB depth. The algorithm does not
 * claim to be perfect and it can be speculated around by the CPU, but it
 * is considered that it obfuscates the problem enough to make exploitation
 * extremely difficult.
 */
/* Shift count applied to the depth tracking variable on call and return. */
#define RET_DEPTH_SHIFT                        5
/* NOTE(review): presumably the RSB stuffing loop count; not used in this part of the header - confirm. */
#define RSB_RET_STUFF_LOOPS                16
/* Value of the tracking variable at call depth zero: only the MSB is set. */
#define RET_DEPTH_INIT                        0x8000000000000000ULL
/* RET_DEPTH_INIT after one arithmetic shift right by RET_DEPTH_SHIFT, i.e. one call deep. */
#define RET_DEPTH_INIT_FROM_CALL        0xfc00000000000000ULL
/* Fully saturated value (all bits set), written after a return buffer fill. */
#define RET_DEPTH_CREDIT                0xffffffffffffffffULL

/*
 * Debug statistics for the call thunks. With CONFIG_CALL_THUNKS_DEBUG each
 * helper is a single instruction incrementing one per-CPU counter (calls,
 * returns, stuffs, context switches); without it the helpers expand to
 * nothing, so they can be used unconditionally in the macros below.
 */
#ifdef CONFIG_CALL_THUNKS_DEBUG
# define CALL_THUNKS_DEBUG_INC_CALLS                                \
        incq        PER_CPU_VAR(__x86_call_count);
# define CALL_THUNKS_DEBUG_INC_RETS                                \
        incq        PER_CPU_VAR(__x86_ret_count);
# define CALL_THUNKS_DEBUG_INC_STUFFS                                \
        incq        PER_CPU_VAR(__x86_stuffs_count);
# define CALL_THUNKS_DEBUG_INC_CTXSW                                \
        incq        PER_CPU_VAR(__x86_ctxsw_count);
#else
# define CALL_THUNKS_DEBUG_INC_CALLS
# define CALL_THUNKS_DEBUG_INC_RETS
# define CALL_THUNKS_DEBUG_INC_STUFFS
# define CALL_THUNKS_DEBUG_INC_CTXSW
#endif

/*
 * Assembly snippets operating on the per-CPU call depth variable
 * (pcpu_hot + X86_call_depth). They are only real instructions when call
 * depth tracking is configured and asm-offsets are available; otherwise
 * they expand to nothing.
 */
#if defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)

#include <asm/asm-offsets.h>

/* Store -1 (all bits set, the RET_DEPTH_CREDIT pattern) into the call depth. */
#define CREDIT_CALL_DEPTH                                        \
        movq        $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);

/*
 * Store the RET_DEPTH_INIT pattern (only bit 63 set) into the call depth.
 * The value is built in %rax, which is overwritten.
 */
#define RESET_CALL_DEPTH                                        \
        xor        %eax, %eax;                                        \
        bts        $63, %rax;                                        \
        movq        %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);

/*
 * Store the RET_DEPTH_INIT_FROM_CALL pattern (0xfc in the top byte) into
 * the call depth and count one call. The value is built in %rax, which is
 * overwritten.
 */
#define RESET_CALL_DEPTH_FROM_CALL                                \
        movb        $0xfc, %al;                                        \
        shl        $56, %rax;                                        \
        movq        %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);        \
        CALL_THUNKS_DEBUG_INC_CALLS

/*
 * Account one call: arithmetic shift right of the call depth in memory.
 * The literal 5 is the same value as RET_DEPTH_SHIFT.
 */
#define INCREMENT_CALL_DEPTH                                        \
        sarq        $5, PER_CPU_VAR(pcpu_hot + X86_call_depth);        \
        CALL_THUNKS_DEBUG_INC_CALLS

#else
#define CREDIT_CALL_DEPTH
#define RESET_CALL_DEPTH
#define RESET_CALL_DEPTH_FROM_CALL
#define INCREMENT_CALL_DEPTH
#endif

/*
 * Fill the CPU return stack buffer.
 *
 * Each entry in the RSB, if used for a speculative 'ret', contains an
 * infinite 'pause; lfence; jmp' loop to capture speculative execution.
 *
 * This is required in various cases for retpoline and IBRS-based
 * mitigations for the Spectre variant 2 vulnerability. Sometimes to
 * eliminate potentially bogus entries from the RSB, and sometimes
 * purely to ensure that it doesn't get empty, which on some CPUs would
 * allow predictions from other (unwanted!) sources to be used.
 *
 * We define a CPP macro such that it can be used from both .S files and
 * inline assembly. It's possible to do a .macro and then include that
 * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
 */

/* Size in bytes of one retpoline thunk; also the size of retpoline_thunk_t. */
#define RETPOLINE_THUNK_SIZE        32
#define RSB_CLEAR_LOOPS                32        /* To forcibly overwrite all entries */

/*
 * Common helper for __FILL_RETURN_BUFFER and __FILL_ONE_RETURN.
 *
 * Emits a call to the local label 772 placed right behind an int3, so the
 * return address created by the call points at that int3. The return
 * address stays on the stack; the users of this helper adjust the stack
 * pointer afterwards.
 */
#define __FILL_RETURN_SLOT                        \
        ANNOTATE_INTRA_FUNCTION_CALL;                \
        call        772f;                                \
        int3;                                        \
772:

/*
 * Stuff the entire RSB.
 *
 * Google experimented with loop-unrolling and this turned out to be
 * the optimal version - two calls, each with their own speculation
 * trap should their return address end up getting used, in a loop.
 *
 * @reg is a scratch register used as the loop counter and @nr is the
 * number of slots to fill. The 64-bit version runs nr/2 iterations of two
 * slots each and drops the two return addresses from the stack in every
 * iteration; after the loop it credits the call depth and bumps the
 * context switch debug counter (both may expand to nothing).
 */
#ifdef CONFIG_X86_64
#define __FILL_RETURN_BUFFER(reg, nr)                        \
        mov        $(nr/2), reg;                                \
771:                                                        \
        __FILL_RETURN_SLOT                                \
        __FILL_RETURN_SLOT                                \
        add        $(BITS_PER_LONG/8) * 2, %_ASM_SP;        \
        dec        reg;                                        \
        jnz        771b;                                        \
        /* barrier for jnz misprediction */                \
        lfence;                                                \
        CREDIT_CALL_DEPTH                                \
        CALL_THUNKS_DEBUG_INC_CTXSW
#else
/*
 * i386 doesn't unconditionally have LFENCE, as such it can't
 * do a loop.
 *
 * The slots are emitted unrolled nr times with .rept and the stack pointer
 * is corrected once at the end; reg is not used by this variant.
 */
#define __FILL_RETURN_BUFFER(reg, nr)                        \
        .rept nr;                                        \
        __FILL_RETURN_SLOT;                                \
        .endr;                                                \
        add        $(BITS_PER_LONG/8) * nr, %_ASM_SP;
#endif

/*
 * Stuff a single RSB slot.
 *
 * To mitigate Post-Barrier RSB speculation, one CALL instruction must be
 * forced to retire before letting a RET instruction execute.
 *
 * On PBRSB-vulnerable CPUs, it is not safe for a RET to be executed
 * before this point.
 *
 * The sequence is one __FILL_RETURN_SLOT, a stack pointer adjustment that
 * drops the return address again, and an lfence.
 */
#define __FILL_ONE_RETURN                                \
        __FILL_RETURN_SLOT                                \
        add        $(BITS_PER_LONG/8), %_ASM_SP;                \
        lfence;

#ifdef __ASSEMBLY__

/*
 * This should be used immediately before an indirect jump/call. It tells
 * objtool the subsequent indirect jump/call is vouched safe for retpoline
 * builds.
 *
 * The macro emits no instruction: it places a local label at the current
 * position and records the address of that label in the
 * .discard.retpoline_safe section.
 */
.macro ANNOTATE_RETPOLINE_SAFE
.Lhere_\@:
        .pushsection .discard.retpoline_safe
        .long .Lhere_\@
        .popsection
.endm

/*
 * (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions
 * vs RETBleed validation.
 */
#define ANNOTATE_UNRET_SAFE ANNOTATE_RETPOLINE_SAFE

/*
 * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should
 * eventually turn into its own annotation.
 *
 * Expands to nothing unless CONFIG_NOINSTR_VALIDATION is set together with
 * CONFIG_MITIGATION_UNRET_ENTRY or CONFIG_MITIGATION_SRSO.
 */
.macro VALIDATE_UNRET_END
#if defined(CONFIG_NOINSTR_VALIDATION) && \
        (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO))
        ANNOTATE_RETPOLINE_SAFE
        nop
#endif
.endm

/*
 * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call
 * to the retpoline thunk with a CS prefix when the register requires
 * a REX prefix byte to encode. Also see apply_retpolines().
 *
 * The macro compares \reg against r8..r15 and emits the byte 0x2e (the CS
 * segment override prefix) on a match; for any other register it emits
 * nothing.
 */
.macro __CS_PREFIX reg:req
        .irp rs,r8,r9,r10,r11,r12,r13,r14,r15
        .ifc \reg,\rs
        .byte 0x2e
        .endif
        .endr
.endm

/*
 * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
 * indirect jmp/call which may be susceptible to the Spectre variant 2
 * attack.
 *
 * NOTE: these do not take kCFI into account and are thus not comparable to C
 * indirect calls, take care when using. The target of these should be an ENDBR
 * instruction irrespective of kCFI.
 *
 * \reg is the bare register name without the '%' sign. With
 * CONFIG_MITIGATION_RETPOLINE the branch goes through
 * __x86_indirect_thunk_\reg (preceded by __CS_PREFIX); without it a plain
 * indirect branch through the register is emitted.
 */
.macro JMP_NOSPEC reg:req
#ifdef CONFIG_MITIGATION_RETPOLINE
        __CS_PREFIX \reg
        jmp        __x86_indirect_thunk_\reg
#else
        jmp        *%\reg
        int3
#endif
.endm

/* Call counterpart of JMP_NOSPEC; the plain variant has no trailing int3. */
.macro CALL_NOSPEC reg:req
#ifdef CONFIG_MITIGATION_RETPOLINE
        __CS_PREFIX \reg
        call        __x86_indirect_thunk_\reg
#else
        call        *%\reg
#endif
.endm

 /*
  * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
  * monstrosity above, manually.
  *
  * The unpatched code is a jump to the end of the sequence. The alternative
  * tied to \ftr is the full __FILL_RETURN_BUFFER(\reg, \nr); the one tied to
  * \ftr2 is two nops followed by __FILL_ONE_RETURN. \ftr2 is optional and
  * defaults to ALT_NOT(X86_FEATURE_ALWAYS).
  * NOTE(review): which alternative wins when both features are set depends
  * on ALTERNATIVE_2, which is defined in asm/alternative.h - confirm there.
  */
.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS)
        ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \
                __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \
                __stringify(nop;nop;__FILL_ONE_RETURN), \ftr2

.Lskip_rsb_\@:
.endm

/*
 * The CALL to srso_alias_untrain_ret() must be patched in directly at
 * the spot where untraining must be done, ie., srso_alias_untrain_ret()
 * must be the target of a CALL instruction instead of indirectly
 * jumping to a wrapper which then calls it. Therefore, this macro is
 * called outside of __UNTRAIN_RET below, for the time being, before the
 * kernel can support nested alternatives with arbitrary nesting.
 *
 * The unpatched code is empty. X86_FEATURE_UNRET selects a call to
 * entry_untrain_ret and X86_FEATURE_SRSO_ALIAS a call to
 * srso_alias_untrain_ret. The macro expands to nothing when neither
 * CONFIG_MITIGATION_UNRET_ENTRY nor CONFIG_MITIGATION_SRSO is enabled.
 */
.macro CALL_UNTRAIN_RET
#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)
        ALTERNATIVE_2 "", "call entry_untrain_ret", X86_FEATURE_UNRET, \
                          "call srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS
#endif
.endm

/*
 * Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the
 * return thunk isn't mapped into the userspace tables (then again, AMD
 * typically has NO_MELTDOWN).
 *
 * While retbleed_untrain_ret() doesn't clobber anything but requires stack,
 * entry_ibpb() will clobber AX, CX, DX.
 *
 * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point
 * where we have a stack but before any RET instruction.
 *
 * \ibpb_feature is the feature bit that selects the call to entry_ibpb;
 * \call_depth_insns is the instruction sequence patched in for
 * X86_FEATURE_CALL_DEPTH. The whole macro expands to nothing unless
 * CONFIG_MITIGATION_RETHUNK or CONFIG_MITIGATION_IBPB_ENTRY is enabled.
 */
.macro __UNTRAIN_RET ibpb_feature, call_depth_insns
#if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY)
        VALIDATE_UNRET_END
        CALL_UNTRAIN_RET
        ALTERNATIVE_2 "",                                                \
                      "call entry_ibpb", \ibpb_feature,                        \
                     __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH
#endif
.endm

/*
 * Front ends for __UNTRAIN_RET. They differ in the IBPB feature bit and in
 * the call depth sequence that is passed down.
 */

/* IBPB on X86_FEATURE_ENTRY_IBPB, call depth reset with RESET_CALL_DEPTH. */
#define UNTRAIN_RET \
        __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH)

/* IBPB on X86_FEATURE_IBPB_ON_VMEXIT, call depth reset with RESET_CALL_DEPTH. */
#define UNTRAIN_RET_VM \
        __UNTRAIN_RET X86_FEATURE_IBPB_ON_VMEXIT, __stringify(RESET_CALL_DEPTH)

/* IBPB on X86_FEATURE_ENTRY_IBPB, call depth reset with RESET_CALL_DEPTH_FROM_CALL. */
#define UNTRAIN_RET_FROM_CALL \
        __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH_FROM_CALL)


/*
 * Account one call in the call depth variable: patches in
 * INCREMENT_CALL_DEPTH when X86_FEATURE_CALL_DEPTH is set. Expands to
 * nothing without CONFIG_MITIGATION_CALL_DEPTH_TRACKING.
 */
.macro CALL_DEPTH_ACCOUNT
#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
        ALTERNATIVE "",                                                        \
                    __stringify(INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
#endif
.endm

/*
 * Macro to execute the VERW instruction, which mitigates transient data
 * sampling attacks such as MDS. On affected systems a microcode update
 * overloaded the VERW instruction to also clear the CPU buffers. VERW
 * clobbers EFLAGS.ZF.
 *
 * Note: Only the memory operand variant of VERW clears the CPU buffers.
 *
 * The instruction is only patched in when X86_FEATURE_CLEAR_CPU_BUF is
 * set; its memory operand is mds_verw_sel.
 */
.macro CLEAR_CPU_BUFFERS
        ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF
.endm

/*
 * Branch history clearing. On 64-bit both macros patch in a call to
 * clear_bhb_loop, selected by X86_FEATURE_CLEAR_BHB_LOOP and
 * X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT respectively; on 32-bit they are
 * defined empty.
 */
#ifdef CONFIG_X86_64
.macro CLEAR_BRANCH_HISTORY
        ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP
.endm

.macro CLEAR_BRANCH_HISTORY_VMEXIT
        ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT
.endm
#else
#define CLEAR_BRANCH_HISTORY
#define CLEAR_BRANCH_HISTORY_VMEXIT
#endif

#else /* __ASSEMBLY__ */

/*
 * Inline asm flavour of ANNOTATE_RETPOLINE_SAFE: a string fragment that
 * places the local label 999 at the current position and records its
 * address in the .discard.retpoline_safe section. It is meant to be pasted
 * directly in front of the indirect branch inside an asm template.
 */
#define ANNOTATE_RETPOLINE_SAFE                                        \
        "999:\n\t"                                                \
        ".pushsection .discard.retpoline_safe\n\t"                \
        ".long 999b\n\t"                                        \
        ".popsection\n\t"

/* One retpoline thunk, seen from C as RETPOLINE_THUNK_SIZE opaque bytes. */
typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
/* Thunk arrays defined outside this header (generic, call and jump variants). */
extern retpoline_thunk_t __x86_indirect_thunk_array[];
extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];

/*
 * Return thunk entry points. When the corresponding mitigation is not
 * configured an empty static inline stub is provided instead of the extern
 * declaration, so C code can reference the symbol either way.
 */
#ifdef CONFIG_MITIGATION_RETHUNK
extern void __x86_return_thunk(void);
#else
static inline void __x86_return_thunk(void) {}
#endif

#ifdef CONFIG_MITIGATION_UNRET_ENTRY
extern void retbleed_return_thunk(void);
#else
static inline void retbleed_return_thunk(void) {}
#endif

extern void srso_alias_untrain_ret(void);

#ifdef CONFIG_MITIGATION_SRSO
extern void srso_return_thunk(void);
extern void srso_alias_return_thunk(void);
#else
static inline void srso_return_thunk(void) {}
static inline void srso_alias_return_thunk(void) {}
#endif

/* NOTE(review): these three repeat declarations already made by the conditionals above. */
extern void retbleed_return_thunk(void);
extern void srso_return_thunk(void);
extern void srso_alias_return_thunk(void);

/* Targets of the calls patched in by CALL_UNTRAIN_RET and __UNTRAIN_RET. */
extern void entry_untrain_ret(void);
extern void entry_ibpb(void);

#ifdef CONFIG_X86_64
/* Target of the call patched in by CLEAR_BRANCH_HISTORY{,_VMEXIT}. */
extern void clear_bhb_loop(void);
#endif

/* Function pointer to the return thunk; assigned outside this header. */
extern void (*x86_return_thunk)(void);

extern void __warn_thunk(void);

/*
 * C side of call depth tracking. With the option enabled,
 * CALL_DEPTH_ACCOUNT is an ALTERNATIVE() that patches in
 * INCREMENT_CALL_DEPTH when X86_FEATURE_CALL_DEPTH is set, and the debug
 * counters are declared when CONFIG_CALL_THUNKS_DEBUG is on. Without it,
 * call_depth_return_thunk() is an empty stub and CALL_DEPTH_ACCOUNT is an
 * empty string.
 */
#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
extern void call_depth_return_thunk(void);

#define CALL_DEPTH_ACCOUNT                                        \
        ALTERNATIVE("",                                                \
                    __stringify(INCREMENT_CALL_DEPTH),                \
                    X86_FEATURE_CALL_DEPTH)

#ifdef CONFIG_CALL_THUNKS_DEBUG
/* Per-CPU counters incremented by the CALL_THUNKS_DEBUG_INC_* helpers. */
DECLARE_PER_CPU(u64, __x86_call_count);
DECLARE_PER_CPU(u64, __x86_ret_count);
DECLARE_PER_CPU(u64, __x86_stuffs_count);
DECLARE_PER_CPU(u64, __x86_ctxsw_count);
#endif
#else /* !CONFIG_MITIGATION_CALL_DEPTH_TRACKING */

static inline void call_depth_return_thunk(void) {}
#define CALL_DEPTH_ACCOUNT ""

#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */

#ifdef CONFIG_MITIGATION_RETPOLINE

#define GEN(reg) \
        extern retpoline_thunk_t __x86_indirect_thunk_ ## reg;
#include <asm/GEN-for-each-reg.h>
#undef GEN

#define GEN(reg)                                                \
        extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg;
#include <asm/GEN-for-each-reg.h>
#undef GEN

#define GEN(reg)                                                \
        extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg;
#include <asm/GEN-for-each-reg.h>
#undef GEN

#ifdef CONFIG_X86_64

/*
 * Inline asm uses the %V modifier which is only in newer GCC
 * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined.
 */
# define CALL_NOSPEC                                                \
        ALTERNATIVE_2(                                                \
        ANNOTATE_RETPOLINE_SAFE                                        \
        "call *%[thunk_target]\n",                                \
        "call __x86_indirect_thunk_%V[thunk_target]\n",                \
        X86_FEATURE_RETPOLINE,                                        \
        "lfence;\n"                                                \
        ANNOTATE_RETPOLINE_SAFE                                        \
        "call *%[thunk_target]\n",                                \
        X86_FEATURE_RETPOLINE_LFENCE)

# define THUNK_TARGET(addr) [thunk_target] "r" (addr)

#else /* CONFIG_X86_32 */
/*
 * For i386 we use the original ret-equivalent retpoline, because
 * otherwise we'll run out of registers. We don't care about CET
 * here, anyway.
 *
 * Same three-way alternative as the 64-bit variant, except that the
 * X86_FEATURE_RETPOLINE replacement is open-coded instead of calling a
 * thunk:
 *   - "jmp 904f" skips to the final "call 901b", which pushes the real
 *     return address;
 *   - 901 does "call 903f";
 *   - 903 discards the return address pushed by that call
 *     ("lea 4(%esp), %esp"), pushes the target and executes "ret", so
 *     control reaches the target through a return;
 *   - 902 is the instruction following "call 903f": a pause/lfence loop
 *     that jumps back to itself.
 *
 * THUNK_TARGET(addr) uses the "rm" constraint, so the target may be a
 * register or a memory operand.
 */
# define CALL_NOSPEC                                                \
        ALTERNATIVE_2(                                                \
        ANNOTATE_RETPOLINE_SAFE                                        \
        "call *%[thunk_target]\n",                                \
        "       jmp    904f;\n"                                        \
        "       .align 16\n"                                        \
        "901:        call   903f;\n"                                        \
        "902:        pause;\n"                                        \
        "            lfence;\n"                                        \
        "       jmp    902b;\n"                                        \
        "       .align 16\n"                                        \
        "903:        lea    4(%%esp), %%esp;\n"                        \
        "       pushl  %[thunk_target];\n"                        \
        "       ret;\n"                                                \
        "       .align 16\n"                                        \
        "904:        call   901b;\n",                                \
        X86_FEATURE_RETPOLINE,                                        \
        "lfence;\n"                                                \
        ANNOTATE_RETPOLINE_SAFE                                        \
        "call *%[thunk_target]\n",                                \
        X86_FEATURE_RETPOLINE_LFENCE)

# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
#endif
#else /* No retpoline for C / inline asm */
/*
 * Without CONFIG_MITIGATION_RETPOLINE, CALL_NOSPEC is a plain indirect
 * call with no alternatives; the target may be in a register or memory.
 */
# define CALL_NOSPEC "call *%[thunk_target]\n"
# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
#endif

/*
 * The Spectre V2 mitigation variants.
 *
 * SPECTRE_V2_NONE is the first enumerator and therefore has the value 0.
 * NOTE(review): going by the names, the EIBRS_* entries combine enhanced
 * IBRS with the retpoline or LFENCE variant - confirm against the code
 * that selects the mitigation.
 */
enum spectre_v2_mitigation {
        SPECTRE_V2_NONE,
        SPECTRE_V2_RETPOLINE,
        SPECTRE_V2_LFENCE,
        SPECTRE_V2_EIBRS,
        SPECTRE_V2_EIBRS_RETPOLINE,
        SPECTRE_V2_EIBRS_LFENCE,
        SPECTRE_V2_IBRS,
};

/*
 * The indirect branch speculation control variants.
 *
 * SPECTRE_V2_USER_NONE is the first enumerator and therefore has the
 * value 0.
 * NOTE(review): the PRCTL and SECCOMP names suggest per-task opt-in via
 * prctl() and seccomp respectively - confirm against the users.
 */
enum spectre_v2_user_mitigation {
        SPECTRE_V2_USER_NONE,
        SPECTRE_V2_USER_STRICT,
        SPECTRE_V2_USER_STRICT_PREFERRED,
        SPECTRE_V2_USER_PRCTL,
        SPECTRE_V2_USER_SECCOMP,
};

/*
 * The Speculative Store Bypass disable variants.
 *
 * SPEC_STORE_BYPASS_NONE is the first enumerator and therefore has the
 * value 0.
 */
enum ssb_mitigation {
        SPEC_STORE_BYPASS_NONE,
        SPEC_STORE_BYPASS_DISABLE,
        SPEC_STORE_BYPASS_PRCTL,
        SPEC_STORE_BYPASS_SECCOMP,
};

/*
 * alternative_msr_write - WRMSR that is only present when @feature is set
 * @msr:     MSR index, passed in ECX ("c" constraint)
 * @val:     64-bit value; the low half goes to EAX, the high half to EDX
 * @feature: feature bit selecting the "wrmsr" alternative; it is used as
 *           an immediate ("i" constraint), so it must be a compile-time
 *           constant at every call site
 *
 * The default alternative is empty, so nothing is executed unless the
 * "wrmsr" replacement is selected for @feature. The "memory" clobber
 * keeps the compiler from moving memory accesses across the write.
 */
static __always_inline
void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
{
        asm volatile(ALTERNATIVE("", "wrmsr", %c[feature])
                : : "c" (msr),
                    "a" ((u32)val),
                    "d" ((u32)(val >> 32)),
                    [feature] "i" (feature)
                : "memory");
}

extern u64 x86_pred_cmd;

static inline void indirect_branch_prediction_barrier(void)
{
        alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB);
}

/* The Intel SPEC CTRL MSR base value cache */
extern u64 x86_spec_ctrl_base;
DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
extern void update_spec_ctrl_cond(u64 val);
extern u64 spec_ctrl_current(void);

/*
 * With retpoline, we must use IBRS to restrict branch prediction
 * before calling into firmware.
 *
 * (Implemented as CPP macros due to header hell.)
 *
 * firmware_restrict_branch_speculation_start() disables preemption, then
 * issues two conditional MSR writes:
 *   - MSR_IA32_SPEC_CTRL <- spec_ctrl_current() | SPEC_CTRL_IBRS, only
 *     with X86_FEATURE_USE_IBRS_FW;
 *   - MSR_IA32_PRED_CMD <- PRED_CMD_IBPB, only with X86_FEATURE_USE_IBPB_FW.
 *
 * Preemption stays disabled on return; every call must be paired with
 * firmware_restrict_branch_speculation_end(), which re-enables it.
 */
#define firmware_restrict_branch_speculation_start()                        \
do {                                                                        \
        preempt_disable();                                                \
        alternative_msr_write(MSR_IA32_SPEC_CTRL,                        \
                              spec_ctrl_current() | SPEC_CTRL_IBRS,        \
                              X86_FEATURE_USE_IBRS_FW);                        \
        alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB,                \
                              X86_FEATURE_USE_IBPB_FW);                        \
} while (0)

/*
 * Counterpart of firmware_restrict_branch_speculation_start(): writes
 * spec_ctrl_current() (without the extra SPEC_CTRL_IBRS bit) back to
 * MSR_IA32_SPEC_CTRL when X86_FEATURE_USE_IBRS_FW is set, then re-enables
 * preemption.
 */
#define firmware_restrict_branch_speculation_end()                        \
do {                                                                        \
        alternative_msr_write(MSR_IA32_SPEC_CTRL,                        \
                              spec_ctrl_current(),                        \
                              X86_FEATURE_USE_IBRS_FW);                        \
        preempt_enable();                                                \
} while (0)

DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);

DECLARE_STATIC_KEY_FALSE(mds_idle_clear);

DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);

DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear);

extern u16 mds_verw_sel;

#include <asm/segment.h>

/**
 * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
 *
 * This uses the otherwise unused and obsolete VERW instruction in
 * combination with microcode which triggers a CPU buffer flush when the
 * instruction is executed.
 *
 * The function is unconditional: it always executes VERW. Callers decide
 * whether the flush is wanted.
 */
static __always_inline void mds_clear_cpu_buffers(void)
{
        /* Selector operand for VERW; kept in memory, see below. */
        static const u16 ds = __KERNEL_DS;

        /*
         * Has to be the memory-operand variant because only that
         * guarantees the CPU buffer flush functionality according to
         * documentation. The register-operand variant does not.
         * Works with any segment selector, but a valid writable
         * data segment is the fastest variant.
         *
         * "cc" clobber is required because VERW modifies ZF.
         */
        asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
}

/**
 * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
 *
 * Flush the CPU buffers via mds_clear_cpu_buffers(), but only when the
 * mds_idle_clear static key is enabled; otherwise do nothing.
 */
static __always_inline void mds_idle_clear_cpu_buffers(void)
{
        if (!static_branch_likely(&mds_idle_clear))
                return;

        mds_clear_cpu_buffers();
}

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */





































































    1 














    1 











    1 


























































































































































































































































    1 














    1 





































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 */

#include <linux/fs.h>
#include <linux/nls.h>
#include <linux/ctype.h>
#include <linux/posix_acl.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"

/*
 * fill_name_de - Format NTFS_DE in @buf.
 *
 * @sbi:  superblock info, passed to ntfs_nls_to_utf16() for conversion.
 * @buf:  output buffer; receives a struct NTFS_DE header immediately
 *        followed by a struct ATTR_FILE_NAME. The caller must provide
 *        enough room for both plus the name.
 * @name: name in the mount's character set; only used when @uni is NULL.
 * @uni:  optional name already in CPU-endian UTF-16; when given it is
 *        copied and @name is ignored.
 *
 * Only the name, name length, name type and the entry size fields are
 * filled in here (plus the high reference parts on 32-bit cluster
 * builds); other ATTR_FILE_NAME fields are left untouched.
 *
 * Return: 0 on success, or the negative error from ntfs_nls_to_utf16().
 */
int fill_name_de(struct ntfs_sb_info *sbi, void *buf, const struct qstr *name,
                 const struct cpu_str *uni)
{
        int err;
        struct NTFS_DE *e = buf;
        u16 data_size;
        /* The file name attribute starts right behind the entry header. */
        struct ATTR_FILE_NAME *fname = (struct ATTR_FILE_NAME *)(e + 1);

#ifndef CONFIG_NTFS3_64BIT_CLUSTER
        e->ref.high = fname->home.high = 0;
#endif
        if (uni) {
#ifdef __BIG_ENDIAN
                /* On-disk names are little endian: convert per character. */
                int ulen = uni->len;
                __le16 *uname = fname->name;
                const u16 *name_cpu = uni->name;

                while (ulen--)
                        *uname++ = cpu_to_le16(*name_cpu++);
#else
                memcpy(fname->name, uni->name, uni->len * sizeof(u16));
#endif
                fname->name_len = uni->len;

        } else {
                /*
                 * Convert input string to unicode.
                 * The destination is the attribute itself, viewed as a
                 * struct cpu_str starting at name_len.
                 * NOTE(review): this relies on the length/name layout of
                 * struct cpu_str matching ATTR_FILE_NAME from name_len on -
                 * confirm against the struct definitions.
                 */
                err = ntfs_nls_to_utf16(sbi, name->name, name->len,
                                        (struct cpu_str *)&fname->name_len,
                                        NTFS_NAME_LEN, UTF16_LITTLE_ENDIAN);
                if (err < 0)
                        return err;
        }

        fname->type = FILE_NAME_POSIX;
        data_size = fname_full_size(fname);

        /* Entry size is the header plus the key rounded up to 8 bytes. */
        e->size = cpu_to_le16(ALIGN(data_size, 8) + sizeof(struct NTFS_DE));
        e->key_size = cpu_to_le16(data_size);
        e->flags = 0;
        e->res = 0;

        return 0;
}

/*
 * ntfs_lookup - inode_operations::lookup
 *
 * Converts the dentry name to CPU-endian UTF-16, searches @dir for it with
 * the directory's ni lock held and splices the result into the dcache.
 * Allocation and conversion failures are passed to d_splice_alias() as an
 * ERR_PTR(). @flags is unused.
 */
static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry,
                                  u32 flags)
{
        struct ntfs_inode *dir_ni = ntfs_i(dir);
        struct cpu_str *uname = __getname();
        struct inode *found;
        int res;

        if (!uname) {
                found = ERR_PTR(-ENOMEM);
                goto splice;
        }

        res = ntfs_nls_to_utf16(dir_ni->mi.sbi, dentry->d_name.name,
                                dentry->d_name.len, uname, NTFS_NAME_LEN,
                                UTF16_HOST_ENDIAN);
        if (res < 0) {
                found = ERR_PTR(res);
        } else {
                ni_lock(dir_ni);
                found = dir_search_u(dir, uname, NULL);
                ni_unlock(dir_ni);
        }
        __putname(uname);

splice:
        /*
         * If the MFT record of the ntfs inode is not a base record,
         * inode->i_op can be NULL, which would make d_splice_alias()
         * dereference a null pointer. Drop such an inode and report
         * -EINVAL instead.
         */
        if (!IS_ERR_OR_NULL(found) && !found->i_op) {
                iput(found);
                found = ERR_PTR(-EINVAL);
        }

        return d_splice_alias(found, dentry);
}

/*
 * ntfs_create - inode_operations::create
 *
 * Creates a regular file: forwards to ntfs_create_inode() with S_IFREG
 * or'ed into @mode and no device, symlink target or other extras.
 * @excl is unused.
 */
static int ntfs_create(struct mnt_idmap *idmap, struct inode *dir,
                       struct dentry *dentry, umode_t mode, bool excl)
{
        const umode_t file_mode = S_IFREG | mode;

        return ntfs_create_inode(idmap, dir, dentry, NULL, file_mode, 0, NULL,
                                 0, NULL);
}

/*
 * ntfs_mknod - inode_operations::mknod
 *
 * Forwards @mode and @rdev unchanged to ntfs_create_inode(); no symlink
 * target or other extras are supplied.
 */
static int ntfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
                      struct dentry *dentry, umode_t mode, dev_t rdev)
{
        int err;

        err = ntfs_create_inode(idmap, dir, dentry, NULL, mode, rdev, NULL, 0,
                                NULL);
        return err;
}

/*
 * ntfs_link - inode_operations::link
 */
static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de)
{
        int err;
        struct inode *inode = d_inode(ode);
        struct ntfs_inode *ni = ntfs_i(inode);

        if (S_ISDIR(inode->i_mode))
                return -EPERM;

        if (inode->i_nlink >= NTFS_LINK_MAX)
                return -EMLINK;

        ni_lock_dir(ntfs_i(dir));
        if (inode != dir)
                ni_lock(ni);

        inc_nlink(inode);
        ihold(inode);

        err = ntfs_link_inode(inode, de);

        if (!err) {
                inode_set_ctime_current(inode);
                inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
                mark_inode_dirty(inode);
                mark_inode_dirty(dir);
                d_instantiate(de, inode);
        } else {
                drop_nlink(inode);
                iput(inode);
        }

        if (inode != dir)
                ni_unlock(ni);
        ni_unlock(ntfs_i(dir));

        return err;
}

/*
 * ntfs_unlink - inode_operations::unlink
 *
 * Removes @dentry from @dir via ntfs_unlink_inode() with the directory
 * lock held. Fails with -EIO after a forced shutdown.
 */
static int ntfs_unlink(struct inode *dir, struct dentry *dentry)
{
        struct ntfs_inode *dir_ni = ntfs_i(dir);
        int ret = -EIO;

        if (likely(!ntfs3_forced_shutdown(dir->i_sb))) {
                ni_lock_dir(dir_ni);
                ret = ntfs_unlink_inode(dir, dentry);
                ni_unlock(dir_ni);
        }

        return ret;
}

/*
 * ntfs_symlink - inode_operations::symlink
 */
static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
                        struct dentry *dentry, const char *symname)
{
        u32 size = strlen(symname);

        if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
                return -EIO;

        return ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0,
                                 symname, size, NULL);
}

/*
 * ntfs_mkdir - inode_operations::mkdir
 *
 * Creates a directory: forwards to ntfs_create_inode() with S_IFDIR
 * or'ed into @mode and no device, symlink target or other extras.
 */
static int ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                      struct dentry *dentry, umode_t mode)
{
        const umode_t dir_mode = S_IFDIR | mode;

        return ntfs_create_inode(idmap, dir, dentry, NULL, dir_mode, 0, NULL,
                                 0, NULL);
}

/*
 * ntfs_rmdir - inode_operations::rmdir
 *
 * Same sequence as ntfs_unlink(): refuse with -EIO after a forced
 * shutdown, otherwise call ntfs_unlink_inode() with the parent directory
 * locked.
 */
static int ntfs_rmdir(struct inode *dir, struct dentry *dentry)
{
        struct ntfs_inode *parent_ni = ntfs_i(dir);
        int rc;

        if (unlikely(ntfs3_forced_shutdown(dir->i_sb))) {
                rc = -EIO;
                goto done;
        }

        ni_lock_dir(parent_ni);
        rc = ntfs_unlink_inode(dir, dentry);
        ni_unlock(parent_ni);

done:
        return rc;
}

/*
 * ntfs_rename - inode_operations::rename
 *
 * Renames @dentry in @dir to @new_dentry in @new_dir.
 *
 * Only RENAME_NOREPLACE is accepted in @flags; any other bit yields
 * -EINVAL. Renaming to the byte-identical name in the same directory is a
 * successful no-op. Meta files are refused with -EINVAL. An existing
 * target is unlinked before the rename itself is attempted.
 *
 * Return: 0 on success, -EIO after a forced shutdown, -EINVAL, -ENOMEM,
 * or the error from ntfs_unlink_inode(), fill_name_de() or ni_rename().
 */
static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir,
                       struct dentry *dentry, struct inode *new_dir,
                       struct dentry *new_dentry, u32 flags)
{
        int err;
        struct super_block *sb = dir->i_sb;
        struct ntfs_sb_info *sbi = sb->s_fs_info;
        struct ntfs_inode *dir_ni = ntfs_i(dir);
        struct ntfs_inode *new_dir_ni = ntfs_i(new_dir);
        struct inode *inode = d_inode(dentry);
        struct ntfs_inode *ni = ntfs_i(inode);
        struct inode *new_inode = d_inode(new_dentry);
        struct NTFS_DE *de, *new_de;
        bool is_same, is_bad;
        /*
         * de                - memory of PATH_MAX bytes:
         * [0-1024)        - original name (dentry->d_name)
         * [1024-2048)        - paired to original name, usually DOS variant of dentry->d_name
         * [2048-3072)        - new name (new_dentry->d_name)
         */
        static_assert(SIZEOF_ATTRIBUTE_FILENAME_MAX + SIZEOF_RESIDENT < 1024);
        static_assert(SIZEOF_ATTRIBUTE_FILENAME_MAX + sizeof(struct NTFS_DE) <
                      1024);
        static_assert(PATH_MAX >= 4 * 1024);

        if (unlikely(ntfs3_forced_shutdown(sb)))
                return -EIO;

        if (flags & ~RENAME_NOREPLACE)
                return -EINVAL;

        /* Byte-wise comparison of the two names; no case folding here. */
        is_same = dentry->d_name.len == new_dentry->d_name.len &&
                  !memcmp(dentry->d_name.name, new_dentry->d_name.name,
                          dentry->d_name.len);

        if (is_same && dir == new_dir) {
                /* Nothing to do. */
                return 0;
        }

        if (ntfs_is_meta_file(sbi, inode->i_ino)) {
                /* Should we print an error? */
                return -EINVAL;
        }

        if (new_inode) {
                /* Target name exists. Unlink it. */
                dget(new_dentry);
                ni_lock_dir(new_dir_ni);
                err = ntfs_unlink_inode(new_dir, new_dentry);
                ni_unlock(new_dir_ni);
                dput(new_dentry);
                if (err)
                        return err;
        }

        /* Allocate PATH_MAX bytes. */
        de = __getname();
        if (!de)
                return -ENOMEM;

        /* Translate dentry->d_name into unicode form. */
        err = fill_name_de(sbi, de, &dentry->d_name, NULL);
        if (err < 0)
                goto out;

        if (is_same) {
                /* Reuse 'de'. */
                new_de = de;
        } else {
                /* Translate new_dentry->d_name into unicode form. */
                new_de = Add2Ptr(de, 2048);
                err = fill_name_de(sbi, new_de, &new_dentry->d_name, NULL);
                if (err < 0)
                        goto out;
        }

        /*
         * Lock order: source directory, the inode being moved, then the
         * target directory when it differs. Unlocking below is in the
         * reverse order.
         */
        ni_lock_dir(dir_ni);
        ni_lock(ni);
        if (dir_ni != new_dir_ni)
                ni_lock_dir2(new_dir_ni);

        is_bad = false;
        err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad);
        if (is_bad) {
                /* The rename failed and restoring the old state failed too. */
                _ntfs_bad_inode(inode);
        } else if (!err) {
                simple_rename_timestamp(dir, dentry, new_dir, new_dentry);
                mark_inode_dirty(inode);
                mark_inode_dirty(dir);
                if (dir != new_dir)
                        mark_inode_dirty(new_dir);

                if (IS_DIRSYNC(dir))
                        ntfs_sync_inode(dir);

                /*
                 * NOTE(review): the condition tests new_dir but the moved
                 * inode is what gets synced; new_dir itself is never
                 * synced here. Confirm this is intended.
                 */
                if (IS_DIRSYNC(new_dir))
                        ntfs_sync_inode(inode);
        }

        if (dir_ni != new_dir_ni)
                ni_unlock(new_dir_ni);
        ni_unlock(ni);
        ni_unlock(dir_ni);
out:
        __putname(de);
        return err;
}

/*
 * ntfs3_get_parent - Return a dentry for the parent directory of @child.
 *
 * Walks the ATTR_NAME attributes of the child's inode and uses the 'home'
 * reference of the first one whose resident data is large enough to hold
 * a file name attribute. The inode returned by ntfs_iget5() (or its error)
 * is handed to d_obtain_alias().
 *
 * Return: the parent dentry, the error from d_obtain_alias(), or
 * ERR_PTR(-ENOENT) when no usable name attribute exists.
 */
struct dentry *ntfs3_get_parent(struct dentry *child)
{
        struct inode *inode = d_inode(child);
        struct ntfs_inode *ni = ntfs_i(inode);
        struct ATTR_LIST_ENTRY *le = NULL;
        struct ATTRIB *attr = NULL;

        for (;;) {
                struct ATTR_FILE_NAME *fname;
                struct inode *parent;

                attr = ni_find_attr(ni, attr, &le, ATTR_NAME, NULL, 0, NULL,
                                    NULL);
                if (!attr)
                        break;

                fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
                if (fname) {
                        parent = ntfs_iget5(inode->i_sb, &fname->home, NULL);
                        return d_obtain_alias(parent);
                }
        }

        return ERR_PTR(-ENOENT);
}

/*
 * dentry_operations::d_hash
 *
 * Computes a case-insensitive hash of @name and stores it in name->hash.
 * Names made only of bytes below 0x80 are hashed directly with toupper();
 * as soon as a byte >= 0x80 shows up the whole name is converted to
 * UTF-16 and hashed through the volume's upcase table instead.
 *
 * Return: 0 on success, -ENOMEM, -EINVAL when the conversion yields an
 * empty string, or the negative error from ntfs_nls_to_utf16().
 */
static int ntfs_d_hash(const struct dentry *dentry, struct qstr *name)
{
        const char *cur = name->name;
        unsigned int left = name->len;
        unsigned long hash = init_name_hash(dentry);
        struct ntfs_sb_info *sbi;
        struct cpu_str *uni;
        int err;

        /* Fast path: consume bytes until the end or the first one >= 0x80. */
        while (left) {
                unsigned int c = *cur;

                if (c >= 0x80)
                        break;

                hash = partial_name_hash(toupper(c), hash);
                cur++;
                left--;
        }

        if (!left) {
                name->hash = end_name_hash(hash);
                return 0;
        }

        /* Slow path: start over and hash with the current upcase table. */
        uni = __getname();
        if (!uni)
                return -ENOMEM;

        sbi = dentry->d_sb->s_fs_info;

        err = ntfs_nls_to_utf16(sbi, name->name, name->len, uni, NTFS_NAME_LEN,
                                UTF16_HOST_ENDIAN);
        if (err > 0) {
                hash = ntfs_names_hash(uni->name, uni->len, sbi->upcase,
                                       init_name_hash(dentry));
                name->hash = end_name_hash(hash);
                err = 0;
        } else if (!err) {
                /* An empty conversion result is treated as invalid. */
                err = -EINVAL;
        }

        __putname(uni);
        return err;
}

/*
 * dentry_operations::d_compare
 *
 * Case-insensitive comparison of @str (length @len1) with @name.
 * Bytes below 0x80 are compared directly via toupper(). When a mismatching
 * pair contains a byte >= 0x80, both names are converted to UTF-16 and
 * compared through the volume's upcase table.
 *
 * Return: 0 when the names match, 1 when they differ, or a negative errno
 * (-ENOMEM, -EINVAL for an empty conversion result, or the error from
 * ntfs_nls_to_utf16()).
 */
static int ntfs_d_compare(const struct dentry *dentry, unsigned int len1,
                          const char *str, const struct qstr *name)
{
        unsigned int len2 = name->len;
        unsigned int common = min(len1, len2);
        struct ntfs_sb_info *sbi;
        struct cpu_str *uni1;
        struct le_str *uni2;
        unsigned int i;
        int ret;

        /* Fast path over the common prefix length. */
        for (i = 0; i < common; i++) {
                const unsigned char a = str[i];
                const unsigned char b = name->name[i];

                if (a == b)
                        continue;

                if (a >= 0x80 || b >= 0x80)
                        break;

                if (toupper(a) != toupper(b))
                        return 1;
        }

        /* Whole prefix matched: equal only if the lengths agree as well. */
        if (i == common)
                return len1 != len2;

        /* Slow path: compare with the current upcase table. */
        sbi = dentry->d_sb->s_fs_info;
        uni1 = __getname();
        if (!uni1)
                return -ENOMEM;

        ret = ntfs_nls_to_utf16(sbi, str, len1, uni1, NTFS_NAME_LEN,
                                UTF16_HOST_ENDIAN);
        if (ret < 0)
                goto out;
        if (ret == 0)
                goto out_inval;

        /* The second name lives in the upper half of the same buffer. */
        uni2 = Add2Ptr(uni1, 2048);

        ret = ntfs_nls_to_utf16(sbi, name->name, name->len,
                                (struct cpu_str *)uni2, NTFS_NAME_LEN,
                                UTF16_LITTLE_ENDIAN);
        if (ret < 0)
                goto out;
        if (ret == 0)
                goto out_inval;

        ret = ntfs_cmp_names_cpu(uni1, uni2, sbi->upcase, false) ? 1 : 0;
        goto out;

out_inval:
        ret = -EINVAL;
out:
        __putname(uni1);
        return ret;
}

// clang-format off
/*
 * Inode operations table for directories. The name-related callbacks
 * (lookup, create, link, unlink, symlink, mkdir, rmdir, mknod, rename) are
 * defined in this file; the remaining entries refer to functions defined
 * elsewhere.
 */
const struct inode_operations ntfs_dir_inode_operations = {
        .lookup                = ntfs_lookup,
        .create                = ntfs_create,
        .link                = ntfs_link,
        .unlink                = ntfs_unlink,
        .symlink        = ntfs_symlink,
        .mkdir                = ntfs_mkdir,
        .rmdir                = ntfs_rmdir,
        .mknod                = ntfs_mknod,
        .rename                = ntfs_rename,
        .get_acl        = ntfs_get_acl,
        .set_acl        = ntfs_set_acl,
        .setattr        = ntfs3_setattr,
        .getattr        = ntfs_getattr,
        .listxattr        = ntfs_listxattr,
        .fiemap                = ntfs_fiemap,
};

/*
 * Reduced inode operations table: attribute, xattr-list and ACL callbacks
 * only, with no name or directory operations.
 * NOTE(review): presumably installed for special files (devices, FIFOs,
 * sockets) - confirm where it is assigned.
 */
const struct inode_operations ntfs_special_inode_operations = {
        .setattr        = ntfs3_setattr,
        .getattr        = ntfs_getattr,
        .listxattr        = ntfs_listxattr,
        .get_acl        = ntfs_get_acl,
        .set_acl        = ntfs_set_acl,
};

/*
 * Dentry operations providing case-insensitive hashing and comparison of
 * names (see ntfs_d_hash() and ntfs_d_compare()).
 */
const struct dentry_operations ntfs_dentry_ops = {
        .d_hash                = ntfs_d_hash,
        .d_compare        = ntfs_d_compare,
};

// clang-format on








































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_BLOCK_GROUP_H
#define BTRFS_BLOCK_GROUP_H

#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/wait.h>
#include <linux/sizes.h>
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <uapi/linux/btrfs_tree.h>
#include "free-space-cache.h"

struct btrfs_chunk_map;
struct btrfs_fs_info;
struct btrfs_inode;
struct btrfs_trans_handle;

/*
 * State values for a block group's on-disk cache.
 * NOTE(review): presumably the values stored in
 * btrfs_block_group::disk_cache_state (declared as int) - confirm
 * against the users.
 */
enum btrfs_disk_cache_state {
        BTRFS_DC_WRITTEN,
        BTRFS_DC_ERROR,
        BTRFS_DC_CLEAR,
        BTRFS_DC_SETUP,
};

/*
 * Size class of a block group, see btrfs_block_group::size_class. The
 * per-value comments give the size range each class stands for.
 */
enum btrfs_block_group_size_class {
        /* Unset */
        BTRFS_BG_SZ_NONE,
        /* 0 < size <= 128K */
        BTRFS_BG_SZ_SMALL,
        /* 128K < size <= 8M */
        BTRFS_BG_SZ_MEDIUM,
        /* 8M < size < BG_LENGTH */
        BTRFS_BG_SZ_LARGE,
};

/*
 * This describes the state of the block_group for async discard.  This is due
 * to the two pass nature of it where extent discarding is prioritized over
 * bitmap discarding.  BTRFS_DISCARD_RESET_CURSOR is set when we are resetting
 * between lists to prevent contention for discard state variables
 * (eg. discard_cursor).
 *
 * Stored in btrfs_block_group::discard_state.
 */
enum btrfs_discard_state {
        BTRFS_DISCARD_EXTENTS,
        BTRFS_DISCARD_BITMAPS,
        BTRFS_DISCARD_RESET_CURSOR,
};

/*
 * Control flags for do_chunk_alloc's force field.
 *
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one if we have very few
 * chunks already allocated.  This is used as part of the clustering code to
 * help make sure we have a good pool of storage to cluster in, without filling
 * the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one.
 *
 * CHUNK_ALLOC_FORCE_FOR_EXTENT is like CHUNK_ALLOC_FORCE but called from
 * find_free_extent(), and also activates the zone.
 */
enum btrfs_chunk_alloc_enum {
        CHUNK_ALLOC_NO_FORCE,
        CHUNK_ALLOC_LIMITED,
        CHUNK_ALLOC_FORCE,
        CHUNK_ALLOC_FORCE_FOR_EXTENT,
};

/*
 * Block group flags set at runtime.
 *
 * NOTE(review): presumably bit numbers within
 * btrfs_block_group::runtime_flags - confirm against the users.
 */
enum btrfs_block_group_flags {
        BLOCK_GROUP_FLAG_IREF,
        BLOCK_GROUP_FLAG_REMOVED,
        BLOCK_GROUP_FLAG_TO_COPY,
        BLOCK_GROUP_FLAG_RELOCATING_REPAIR,
        BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
        BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
        BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
        /* Does the block group need to be added to the free space tree? */
        BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
        /* Indicate that the block group is placed on a sequential zone */
        BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
        /*
         * Indicate that block group is in the list of new block groups of a
         * transaction.
         */
        BLOCK_GROUP_FLAG_NEW,
};

/*
 * Caching progress of a block group.
 * NOTE(review): presumably the values stored in
 * btrfs_block_group::cached (declared as int) - confirm against the users.
 */
enum btrfs_caching_type {
        BTRFS_CACHE_NO,
        BTRFS_CACHE_STARTED,
        BTRFS_CACHE_FINISHED,
        BTRFS_CACHE_ERROR,
};

/*
 * Control structure for caching one block group. It is reference counted
 * via @count and points back at its block group; btrfs_block_group has a
 * caching_ctl pointer to this type.
 */
struct btrfs_caching_control {
        struct list_head list;
        struct mutex mutex;
        /* NOTE(review): presumably where waiters for caching progress sleep. */
        wait_queue_head_t wait;
        struct btrfs_work work;
        struct btrfs_block_group *block_group;
        /* Track progress of caching during allocation. */
        atomic_t progress;
        refcount_t count;
};

/*
 * Once caching_thread() finds this much free space (2 MiB), it will wake
 * up waiters.
 */
#define CACHING_CTL_WAKE_UP SZ_2M

/*
 * State of one block group. The group covers @length bytes starting at
 * @start; btrfs_block_group_end() returns @start + @length.
 */
struct btrfs_block_group {
        struct btrfs_fs_info *fs_info;
        struct inode *inode;
        /*
         * btrfs_is_block_group_used() asserts this lock is held while it
         * reads @used, @reserved and @pinned. Also protects @swap_extents
         * (see below).
         */
        spinlock_t lock;
        u64 start;
        u64 length;
        u64 pinned;
        u64 reserved;
        u64 used;
        u64 delalloc_bytes;
        u64 bytes_super;
        /*
         * Tested against BTRFS_BLOCK_GROUP_DATA and
         * BTRFS_BLOCK_GROUP_METADATA by btrfs_is_block_group_data_only().
         */
        u64 flags;
        u64 cache_generation;
        u64 global_root_id;

        /*
         * The last committed used bytes of this block group, if the above @used
         * is still the same as @commit_used, we don't need to update block
         * group item of this block group.
         */
        u64 commit_used;
        /*
         * If the free space extent count exceeds this number, convert the block
         * group to bitmaps.
         */
        u32 bitmap_high_thresh;

        /*
         * If the free space extent count drops below this number, convert the
         * block group back to extents.
         */
        u32 bitmap_low_thresh;

        /*
         * It is just used for the delayed data space allocation because
         * only the data space allocation and the relative metadata update
         * can be done cross the transaction.
         */
        struct rw_semaphore data_rwsem;

        /* For raid56, this is a full stripe, without parity */
        unsigned long full_stripe_len;
        /*
         * NOTE(review): presumably a bitmap indexed by
         * enum btrfs_block_group_flags - confirm.
         */
        unsigned long runtime_flags;

        unsigned int ro;

        /* NOTE(review): presumably an enum btrfs_disk_cache_state - confirm. */
        int disk_cache_state;

        /* Cache tracking stuff */
        /* NOTE(review): presumably an enum btrfs_caching_type - confirm. */
        int cached;
        struct btrfs_caching_control *caching_ctl;

        struct btrfs_space_info *space_info;

        /* Free space cache stuff */
        struct btrfs_free_space_ctl *free_space_ctl;

        /* Block group cache stuff */
        struct rb_node cache_node;

        /* For block groups in the same raid type */
        struct list_head list;

        refcount_t refs;

        /*
         * List of struct btrfs_free_clusters for this block group.
         * Today it will only have one thing on it, but that may change
         */
        struct list_head cluster_list;

        /*
         * Used for several lists:
         *
         * 1) struct btrfs_fs_info::unused_bgs
         * 2) struct btrfs_fs_info::reclaim_bgs
         * 3) struct btrfs_transaction::deleted_bgs
         * 4) struct btrfs_trans_handle::new_bgs
         */
        struct list_head bg_list;

        /* For read-only block groups */
        struct list_head ro_list;

        /*
         * When non-zero it means the block group's logical address and its
         * device extents can not be reused for future block group allocations
         * until the counter goes down to 0. This is to prevent them from being
         * reused while some task is still using the block group after it was
         * deleted - we want to make sure they can only be reused for new block
         * groups after that task is done with the deleted block group.
         */
        atomic_t frozen;

        /* For discard operations */
        struct list_head discard_list;
        int discard_index;
        u64 discard_eligible_time;
        u64 discard_cursor;
        enum btrfs_discard_state discard_state;

        /* For dirty block groups */
        struct list_head dirty_list;
        struct list_head io_list;

        struct btrfs_io_ctl io_ctl;

        /*
         * Incremented when doing extent allocations and holding a read lock
         * on the space_info's groups_sem semaphore.
         * Decremented when an ordered extent that represents an IO against this
         * block group's range is created (after it's added to its inode's
         * root's list of ordered extents) or immediately after the allocation
         * if it's a metadata extent or fallocate extent (for these cases we
         * don't create ordered extents).
         */
        atomic_t reservations;

        /*
         * Incremented while holding the spinlock *lock* by a task checking if
         * it can perform a nocow write (incremented if the value for the *ro*
         * field is 0). Decremented by such tasks once they create an ordered
         * extent or before that if some error happens before reaching that step.
         * This is to prevent races between block group relocation and nocow
         * writes through direct IO.
         */
        atomic_t nocow_writers;

        /* Lock for free space tree operations. */
        struct mutex free_space_lock;

        /*
         * Number of extents in this block group used for swap files.
         * All accesses protected by the spinlock 'lock'.
         */
        int swap_extents;

        /*
         * Allocation offset for the block group to implement sequential
         * allocation. This is used only on a zoned filesystem.
         */
        u64 alloc_offset;
        u64 zone_unusable;
        u64 zone_capacity;
        u64 meta_write_pointer;
        struct btrfs_chunk_map *physical_map;
        struct list_head active_bg_list;
        struct work_struct zone_finish_work;
        struct extent_buffer *last_eb;
        /* Size class of this block group, see enum btrfs_block_group_size_class. */
        enum btrfs_block_group_size_class size_class;
};

/* Return the offset one past the block group's range: its start plus its length. */
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
{
        u64 end = block_group->start + block_group->length;

        return end;
}

static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
{
        lockdep_assert_held(&bg->lock);

        return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
}

static inline bool btrfs_is_block_group_data_only(
                                        struct btrfs_block_group *block_group)
{
        /*
         * In mixed mode the fragmentation is expected to be high, lowering the
         * efficiency, so only proper data block groups are considered.
         */
        return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
               !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA);
}

#ifdef CONFIG_BTRFS_DEBUG
int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group);
#endif

struct btrfs_block_group *btrfs_lookup_first_block_group(
                struct btrfs_fs_info *info, u64 bytenr);
struct btrfs_block_group *btrfs_lookup_block_group(
                struct btrfs_fs_info *info, u64 bytenr);
struct btrfs_block_group *btrfs_next_block_group(
                struct btrfs_block_group *cache);
void btrfs_get_block_group(struct btrfs_block_group *cache);
void btrfs_put_block_group(struct btrfs_block_group *cache);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
                                        const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
                                                  u64 bytenr);
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
                                           u64 num_bytes);
int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait);
struct btrfs_caching_control *btrfs_get_caching_control(
                struct btrfs_block_group *cache);
int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
                             u64 start, u64 end, u64 *total_added_ret);
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
                                struct btrfs_fs_info *fs_info,
                                const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_chunk_map *map);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
void btrfs_reclaim_bgs_work(struct work_struct *work);
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
                                                 u64 type,
                                                 u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
                             bool do_chunk_alloc);
void btrfs_dec_block_group_ro(struct btrfs_block_group *cache);
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
                             u64 bytenr, u64 num_bytes, bool alloc);
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
                             u64 ram_bytes, u64 num_bytes, int delalloc,
                             bool force_wrong_size_class);
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
                               u64 num_bytes, int delalloc);
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
                      enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
                                  bool is_item_insertion);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
                     u64 physical, u64 **logical, int *naddrs, int *stripe_len);

/* Shorthand for btrfs_get_alloc_profile() with the DATA block group flag. */
static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
{
        const u64 type = BTRFS_BLOCK_GROUP_DATA;

        return btrfs_get_alloc_profile(fs_info, type);
}

/* Shorthand for btrfs_get_alloc_profile() with the METADATA block group flag. */
static inline u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
{
        const u64 type = BTRFS_BLOCK_GROUP_METADATA;

        return btrfs_get_alloc_profile(fs_info, type);
}

/* Shorthand for btrfs_get_alloc_profile() with the SYSTEM block group flag. */
static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
{
        const u64 type = BTRFS_BLOCK_GROUP_SYSTEM;

        return btrfs_get_alloc_profile(fs_info, type);
}

/*
 * Return non-zero when the caching state of @cache is either
 * BTRFS_CACHE_FINISHED or BTRFS_CACHE_ERROR, zero otherwise.
 */
static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
{
        /* A full memory barrier is issued before the state is examined. */
        smp_mb();

        if (cache->cached == BTRFS_CACHE_FINISHED)
                return 1;
        return cache->cached == BTRFS_CACHE_ERROR;
}

void btrfs_freeze_block_group(struct btrfs_block_group *cache);
void btrfs_unfreeze_block_group(struct btrfs_block_group *cache);

bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg);
void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount);

enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size);
int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
                                     enum btrfs_block_group_size_class size_class,
                                     bool force_wrong_size_class);
bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg);

#endif /* BTRFS_BLOCK_GROUP_H */































































































































































































































































































































    4 











    4 















    4 








    4 















    4 



    1 












    1 

























    1 






    1 

























    1 




    1 



    1 


    1 





































































    3 

    8 





























































































































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 
    1 




































    1 






    1 


    1 





























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
 * Copyright (C) 2017-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005
 * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All rights reserved.
 *
 * This driver produces cryptographically secure pseudorandom data. It is divided
 * into roughly six sections, each with a section header:
 *
 *   - Initialization and readiness waiting.
 *   - Fast key erasure RNG, the "crng".
 *   - Entropy accumulation and extraction routines.
 *   - Entropy collection routines.
 *   - Userspace reader/writer interfaces.
 *   - Sysctl interface.
 *
 * The high level overview is that there is one input pool, into which
 * various pieces of data are hashed. Prior to initialization, some of that
 * data is then "credited" as having a certain number of bits of entropy.
 * When enough bits of entropy are available, the hash is finalized and
 * handed as a key to a stream cipher that expands it indefinitely for
 * various consumers. This key is periodically refreshed as the various
 * entropy collectors, described below, add data to the input pool.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/utsname.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/workqueue.h>
#include <linux/irq.h>
#include <linux/ratelimit.h>
#include <linux/syscalls.h>
#include <linux/completion.h>
#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include <linux/siphash.h>
#include <linux/sched/isolation.h>
#include <crypto/chacha.h>
#include <crypto/blake2s.h>
#include <asm/archrandom.h>
#include <asm/processor.h>
#include <asm/irq.h>
#include <asm/irq_regs.h>
#include <asm/io.h>

/*********************************************************************
 *
 * Initialization and readiness waiting.
 *
 * Much of the RNG infrastructure is devoted to various dependencies
 * being able to wait until the RNG has collected enough entropy and
 * is ready for safe consumption.
 *
 *********************************************************************/

/*
 * crng_init is protected by base_crng.lock, and only increases
 * its value (from empty->early->ready).
 */
/* Current initialization stage of the crng; starts out as CRNG_EMPTY. */
static enum {
        CRNG_EMPTY = 0, /* Little to no entropy collected */
        CRNG_EARLY = 1, /* At least POOL_EARLY_BITS collected */
        CRNG_READY = 2  /* Fully initialized with POOL_READY_BITS collected */
} crng_init __read_mostly = CRNG_EMPTY;
/* Static key, initially false; switched on by crng_set_ready(). */
static DEFINE_STATIC_KEY_FALSE(crng_is_ready);
/*
 * True once the static key has been enabled, or, before that, as soon as
 * crng_init has reached CRNG_READY.
 */
#define crng_ready() (static_branch_likely(&crng_is_ready) || crng_init >= CRNG_READY)
/* Various types of waiters for crng_init->CRNG_READY transition. */
static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);
static struct fasync_struct *fasync;
static ATOMIC_NOTIFIER_HEAD(random_ready_notifier);

/* Control how we warn userspace. */
/* Ratelimit state initialised with an interval of HZ and a burst of 3. */
static struct ratelimit_state urandom_warning =
        RATELIMIT_STATE_INIT_FLAGS("urandom_warning", HZ, 3, RATELIMIT_MSG_ON_RELEASE);
/*
 * Initial value follows CONFIG_WARN_ALL_UNSEEDED_RANDOM; also exposed as the
 * "ratelimit_disable" module parameter with mode 0644.
 */
static int ratelimit_disable __read_mostly =
        IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM);
module_param_named(ratelimit_disable, ratelimit_disable, int, 0644);
MODULE_PARM_DESC(ratelimit_disable, "Disable random ratelimit suppression");

/*
 * Returns whether or not the input pool has been seeded and thus guaranteed
 * to supply cryptographically secure random numbers. This applies to: the
 * /dev/urandom device, the get_random_bytes function, and the get_random_{u8,
 * u16,u32,u64,long} family of functions.
 *
 * Returns: true if the input pool has been seeded.
 *          false if the input pool has not been seeded.
 */
bool rng_is_initialized(void)
{
        return crng_ready();
}
EXPORT_SYMBOL(rng_is_initialized);

/*
 * Enable the crng_is_ready static key, so that the static-branch half of
 * crng_ready() becomes true. The work_struct argument is not used in the
 * body.
 */
static void __cold crng_set_ready(struct work_struct *work)
{
        static_branch_enable(&crng_is_ready);
}

/* Used by wait_for_random_bytes(), and considered an entropy collector, below. */
static void try_to_generate_entropy(void);

/*
 * Wait for the input pool to be seeded and thus guaranteed to supply
 * cryptographically secure random numbers. This applies to: the /dev/urandom
 * device, the get_random_bytes function, and the get_random_{u8,u16,u32,u64,
 * long} family of functions. Using any of these functions without first
 * calling this function forfeits the guarantee of security.
 *
 * Returns: 0 if the input pool has been seeded.
 *          -ERESTARTSYS if the function was interrupted by a signal.
 */
int wait_for_random_bytes(void)
{
        int ret;

        for (;;) {
                if (crng_ready())
                        return 0;

                /*
                 * Attempt to gather entropy ourselves, then sleep on
                 * crng_init_wait for at most HZ jiffies.
                 */
                try_to_generate_entropy();
                ret = wait_event_interruptible_timeout(crng_init_wait, crng_ready(), HZ);

                /* Positive: the crng became ready before the timeout. */
                if (ret > 0)
                        return 0;
                /* Negative: the wait failed; hand the error back. */
                if (ret < 0)
                        return ret;
                /* Zero: timed out without becoming ready; go around again. */
        }
}
EXPORT_SYMBOL(wait_for_random_bytes);

/*
 * Arrange for a callback to run once the crng is initialised. If the crng
 * is already initialised, the callback is invoked immediately instead of
 * being registered. Only use this if you are absolutely sure it is required.
 * Most users should instead be able to test `rng_is_initialized()` on
 * demand, or make use of `get_random_bytes_wait()`.
 *
 * Returns 0 when the callback was invoked directly, otherwise the result of
 * registering it on the random_ready_notifier chain.
 */
int __cold execute_with_initialized_rng(struct notifier_block *nb)
{
        unsigned long irq_flags;
        int err;

        spin_lock_irqsave(&random_ready_notifier.lock, irq_flags);
        if (!crng_ready()) {
                err = raw_notifier_chain_register((struct raw_notifier_head *)&random_ready_notifier.head, nb);
        } else {
                /* Already ready: call back right away, still under the lock. */
                nb->notifier_call(nb, 0, NULL);
                err = 0;
        }
        spin_unlock_irqrestore(&random_ready_notifier.lock, irq_flags);

        return err;
}

/*
 * Emit a deferred notice naming the calling function, its return address
 * and the current crng_init value when random data is requested before the
 * crng is ready. Active only when CONFIG_WARN_ALL_UNSEEDED_RANDOM is enabled.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and cannot capture an `else` that follows its use.
 */
#define warn_unseeded_randomness() \
        do { \
                if (IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM) && !crng_ready()) \
                        printk_deferred(KERN_NOTICE "random: %s called from %pS with crng_init=%d\n", \
                                        __func__, (void *)_RET_IP_, crng_init); \
        } while (0)


/*********************************************************************
 *
 * Fast key erasure RNG, the "crng".
 *
 * These functions expand entropy from the entropy extractor into
 * long streams for external consumption using the "fast key erasure"
 * RNG described at <https://blog.cr.yp.to/20170723-random.html>.
 *
 * There are a few exported interfaces for use by other drivers:
 *
 *        void get_random_bytes(void *buf, size_t len)
 *        u8 get_random_u8()
 *        u16 get_random_u16()
 *        u32 get_random_u32()
 *        u32 get_random_u32_below(u32 ceil)
 *        u32 get_random_u32_above(u32 floor)
 *        u32 get_random_u32_inclusive(u32 floor, u32 ceil)
 *        u64 get_random_u64()
 *        unsigned long get_random_long()
 *
 * These interfaces will return the requested number of random bytes
 * into the given buffer or as a return value. This is equivalent to
 * a read from /dev/urandom. The u8, u16, u32, u64, long family of
 * functions may be higher performance for one-off random integers,
 * because they do a bit of buffering and do not invoke reseeding
 * until the buffer is emptied.
 *
 *********************************************************************/

/* Reseed timing constants, expressed in jiffies (multiples of HZ). */
enum {
        /* Lower bound that crng_reseed_interval() applies during early boot. */
        CRNG_RESEED_START_INTERVAL = HZ,
        /* Interval returned by crng_reseed_interval() once early boot is over. */
        CRNG_RESEED_INTERVAL = 60 * HZ
};

/* The single base crng from which the per-cpu crngs take their keys. */
static struct {
        /* ChaCha key; overwritten by crng_reseed() and by fast key erasure. */
        u8 key[CHACHA_KEY_SIZE] __aligned(__alignof__(long));
        /* Incremented by crng_reseed(), which steps over ULONG_MAX. */
        unsigned long generation;
        /* Held while key and generation are updated. */
        spinlock_t lock;
} base_crng = {
        .lock = __SPIN_LOCK_UNLOCKED(base_crng.lock)
};

/* State of one per-cpu crng; see the `crngs` per-cpu variable. */
struct crng {
        /* ChaCha key used by crng_make_state() for this CPU's output. */
        u8 key[CHACHA_KEY_SIZE];
        /* Value of base_crng.generation when key was last taken from base_crng. */
        unsigned long generation;
        /* Taken with local_lock_irqsave() around use of this structure. */
        local_lock_t lock;
};

/*
 * Per-cpu crng instances. generation starts at ULONG_MAX, a value that
 * crng_reseed() never stores in base_crng.generation, so the generation
 * comparison in crng_make_state() cannot match before the first re-key.
 */
static DEFINE_PER_CPU(struct crng, crngs) = {
        .generation = ULONG_MAX,
        .lock = INIT_LOCAL_LOCK(crngs.lock),
};

/*
 * Return the interval until the next reseeding, which is normally
 * CRNG_RESEED_INTERVAL, but during early boot, it is at an interval
 * proportional to the uptime.
 */
static unsigned int crng_reseed_interval(void)
{
        static bool early_boot = true;
        time64_t uptime;

        /* Steady state: the flag was cleared on an earlier call. */
        if (likely(!READ_ONCE(early_boot)))
                return CRNG_RESEED_INTERVAL;

        uptime = ktime_get_seconds();
        if (uptime < CRNG_RESEED_INTERVAL / HZ * 2) {
                /* Half the uptime in jiffies, but never below the start interval. */
                return max_t(unsigned int, CRNG_RESEED_START_INTERVAL,
                             (unsigned int)uptime / 2 * HZ);
        }

        /* Uptime has reached twice the regular interval: leave early boot. */
        WRITE_ONCE(early_boot, false);
        return CRNG_RESEED_INTERVAL;
}

/* Used by crng_reseed() and crng_make_state() to extract a new seed from the input pool. */
static void extract_entropy(void *buf, size_t len);

/* This extracts a new crng key from the input pool. */
static void crng_reseed(struct work_struct *work)
{
        static DECLARE_DELAYED_WORK(next_reseed, crng_reseed);
        u8 seed[CHACHA_KEY_SIZE];
        unsigned long irq_flags;
        unsigned long gen;

        /*
         * Queue the following reseed before doing anything else, so that it
         * fires sooner rather than later. Skipped while system_unbound_wq is
         * still NULL.
         */
        if (likely(system_unbound_wq))
                queue_delayed_work(system_unbound_wq, &next_reseed, crng_reseed_interval());

        extract_entropy(seed, sizeof(seed));

        spin_lock_irqsave(&base_crng.lock, irq_flags);

        /* Install the fresh key over the old one. */
        memcpy(base_crng.key, seed, sizeof(base_crng.key));

        /*
         * Advance the generation, stepping over ULONG_MAX. The per-cpu crngs
         * start out with generation ULONG_MAX, so never publishing that value
         * forces CPUs that come online to always initialize.
         */
        gen = base_crng.generation + 1;
        if (gen == ULONG_MAX)
                gen++;
        WRITE_ONCE(base_crng.generation, gen);

        if (!static_branch_likely(&crng_is_ready))
                crng_init = CRNG_READY;

        spin_unlock_irqrestore(&base_crng.lock, irq_flags);

        /* Do not leave a copy of the key on the stack. */
        memzero_explicit(seed, sizeof(seed));
}

/*
 * This generates a ChaCha block using the provided key, and then
 * immediately overwrites that key with half the block. It returns
 * the resultant ChaCha state to the user, along with the second
 * half of the block containing 32 bytes of random data that may
 * be used; random_data_len may not be greater than 32.
 *
 * The returned ChaCha state contains within it a copy of the old
 * key value, at index 4, so the state should always be zeroed out
 * immediately after using in order to maintain forward secrecy.
 * If the state cannot be erased in a timely manner, then it is
 * safer to set the random_data parameter to &chacha_state[4] so
 * that this function overwrites it before returning.
 */
static void crng_fast_key_erasure(u8 key[CHACHA_KEY_SIZE],
                                  u32 chacha_state[CHACHA_STATE_WORDS],
                                  u8 *random_data, size_t random_data_len)
{
        u8 block[CHACHA_BLOCK_SIZE];
        const u8 *next_key = block;
        const u8 *output = block + CHACHA_KEY_SIZE;

        BUG_ON(random_data_len > 32);

        /*
         * Build the state: chacha_init_consts() first, then the key copied
         * in starting at word 4, then words 12..15 cleared.
         */
        chacha_init_consts(chacha_state);
        memcpy(&chacha_state[4], key, CHACHA_KEY_SIZE);
        memset(&chacha_state[12], 0, sizeof(u32) * 4);
        chacha20_block(chacha_state, block);

        /*
         * The leading CHACHA_KEY_SIZE bytes of the block replace the key;
         * the bytes after them are handed to the caller. The scratch block
         * is wiped last.
         */
        memcpy(key, next_key, CHACHA_KEY_SIZE);
        memcpy(random_data, output, random_data_len);
        memzero_explicit(block, sizeof(block));
}

/*
 * This function returns a ChaCha state that you may use for generating
 * random data. It also returns up to 32 bytes on its own of random data
 * that may be used; random_data_len may not be greater than 32.
 */
static void crng_make_state(u32 chacha_state[CHACHA_STATE_WORDS],
                            u8 *random_data, size_t random_data_len)
{
        unsigned long flags;
        struct crng *crng;

        /* At most 32 bytes of random data can accompany the state. */
        BUG_ON(random_data_len > 32);

        /*
         * For the fast path, we check whether we're ready, unlocked first, and
         * then re-check once locked later. In the case where we're really not
         * ready, we do fast key erasure with the base_crng directly, extracting
         * when crng_init is CRNG_EMPTY.
         */
        if (!crng_ready()) {
                bool ready;

                spin_lock_irqsave(&base_crng.lock, flags);
                ready = crng_ready();
                if (!ready) {
                        if (crng_init == CRNG_EMPTY)
                                extract_entropy(base_crng.key, sizeof(base_crng.key));
                        crng_fast_key_erasure(base_crng.key, chacha_state,
                                              random_data, random_data_len);
                }
                spin_unlock_irqrestore(&base_crng.lock, flags);
                /*
                 * Still not ready: the output above came straight from
                 * base_crng, so the per-cpu path below is skipped.
                 */
                if (!ready)
                        return;
        }

        /* From here on, work on this CPU's crng under its local lock. */
        local_lock_irqsave(&crngs.lock, flags);
        crng = raw_cpu_ptr(&crngs);

        /*
         * If our per-cpu crng is older than the base_crng, then it means
         * somebody reseeded the base_crng. In that case, we do fast key
         * erasure on the base_crng, and use its output as the new key
         * for our per-cpu crng. This brings us up to date with base_crng.
         */
        if (unlikely(crng->generation != READ_ONCE(base_crng.generation))) {
                spin_lock(&base_crng.lock);
                crng_fast_key_erasure(base_crng.key, chacha_state,
                                      crng->key, sizeof(crng->key));
                /* Plain read: base_crng.lock is held at this point. */
                crng->generation = base_crng.generation;
                spin_unlock(&base_crng.lock);
        }

        /*
         * Finally, when we've made it this far, our per-cpu crng has an up
         * to date key, and we can do fast key erasure with it to produce
         * some random data and a ChaCha state for the caller. All other
         * branches of this function are "unlikely", so most of the time we
         * should wind up here immediately.
         */
        crng_fast_key_erasure(crng->key, chacha_state, random_data, random_data_len);
        local_unlock_irqrestore(&crngs.lock, flags);
}

/*
 * Fill @buf with @len random bytes. The first (up to) 32 bytes come out of
 * crng_make_state() itself; the remainder is produced block by block from
 * the ChaCha state it returns.
 */
static void _get_random_bytes(void *buf, size_t len)
{
        u32 state[CHACHA_STATE_WORDS];
        u8 last[CHACHA_BLOCK_SIZE];
        size_t head;

        if (!len)
                return;

        head = min_t(size_t, 32, len);
        crng_make_state(state, buf, head);
        buf += head;
        len -= head;

        /* Whole blocks are generated directly into the caller's buffer. */
        for (; len >= CHACHA_BLOCK_SIZE;
             len -= CHACHA_BLOCK_SIZE, buf += CHACHA_BLOCK_SIZE) {
                chacha20_block(state, buf);
                if (unlikely(state[12] == 0))
                        ++state[13];
        }

        /* A trailing partial block goes through a scratch buffer that is wiped. */
        if (len) {
                chacha20_block(state, last);
                memcpy(buf, last, len);
                memzero_explicit(last, sizeof(last));
        }

        memzero_explicit(state, sizeof(state));
}

/*
 * This returns random bytes in arbitrary quantities. The quality of the
 * random bytes is as good as /dev/urandom. In order to ensure that the
 * randomness provided by this function is okay, the function
 * wait_for_random_bytes() should be called and return 0 at least once
 * at any point prior.
 */
void get_random_bytes(void *buf, size_t len)
{
        /* Optionally log a notice if the crng is not ready yet, then fill buf. */
        warn_unseeded_randomness();
        _get_random_bytes(buf, len);
}
EXPORT_SYMBOL(get_random_bytes);

/*
 * Copy random bytes into the destination described by @iter.
 *
 * Returns 0 if @iter has no room, the number of bytes actually copied
 * otherwise, or -EFAULT if nothing at all could be copied.
 *
 * For longer requests the loop stops early when a copy comes up short or,
 * checked once per PAGE_SIZE bytes of output, when a signal is pending.
 */
static ssize_t get_random_bytes_user(struct iov_iter *iter)
{
        u32 chacha_state[CHACHA_STATE_WORDS];
        u8 block[CHACHA_BLOCK_SIZE];
        size_t ret = 0, copied;

        if (unlikely(!iov_iter_count(iter)))
                return 0;

        /*
         * Immediately overwrite the ChaCha key at index 4 with random
         * bytes, in case userspace causes copy_to_iter() below to sleep
         * forever, so that we still retain forward secrecy in that case.
         */
        crng_make_state(chacha_state, (u8 *)&chacha_state[4], CHACHA_KEY_SIZE);
        /*
         * However, if we're doing a read of len <= 32, we don't need to
         * use chacha_state after, so we can simply return those bytes to
         * the user directly.
         */
        if (iov_iter_count(iter) <= CHACHA_KEY_SIZE) {
                ret = copy_to_iter(&chacha_state[4], CHACHA_KEY_SIZE, iter);
                goto out_zero_chacha;
        }

        for (;;) {
                chacha20_block(chacha_state, block);
                /* Carry a wrap of counter word 12 into word 13. */
                if (unlikely(chacha_state[12] == 0))
                        ++chacha_state[13];

                copied = copy_to_iter(block, sizeof(block), iter);
                ret += copied;
                /* Done when the destination is full or the copy fell short. */
                if (!iov_iter_count(iter) || copied != sizeof(block))
                        break;

                /* The page-boundary test below relies on blocks dividing a page evenly. */
                BUILD_BUG_ON(PAGE_SIZE % sizeof(block) != 0);
                if (ret % PAGE_SIZE == 0) {
                        if (signal_pending(current))
                                break;
                        cond_resched();
                }
        }

        /* Wipe generated output and key material from the stack. */
        memzero_explicit(block, sizeof(block));
out_zero_chacha:
        memzero_explicit(chacha_state, sizeof(chacha_state));
        return ret ? ret : -EFAULT;
}

/*
 * Batched entropy returns random integers, served from a per-CPU buffer
 * that is refilled in bulk. The quality of the random number is as good as
 * /dev/urandom. In order to ensure that the randomness provided by this
 * function is okay, the function wait_for_random_bytes() should be called
 * and return 0 at least once at any point prior.
 */

#define DEFINE_BATCHED_ENTROPY(type)                                            \
struct batch_ ##type {                                                          \
        /*                                                                      \
         * Sized at 1.5x a ChaCha block: the 32 bytes left over from            \
         * fast key erasure plus one full block from the detached               \
         * ChaCha state. The size may grow later as long as it keeps            \
         * the form (integer_blocks + 0.5) * CHACHA_BLOCK_SIZE.                 \
         */                                                                     \
        type entropy[CHACHA_BLOCK_SIZE * 3 / (2 * sizeof(type))];               \
        local_lock_t lock;                                                      \
        unsigned long generation;                                               \
        unsigned int position;                                                  \
};                                                                              \
                                                                                \
static DEFINE_PER_CPU(struct batch_ ##type, batched_entropy_ ##type) = {        \
        .lock = INIT_LOCAL_LOCK(batched_entropy_ ##type.lock),                  \
        .position = UINT_MAX                                                    \
};                                                                              \
                                                                                \
type get_random_ ##type(void)                                                   \
{                                                                               \
        struct batch_ ##type *cache;                                            \
        unsigned long irqflags;                                                 \
        unsigned long gen;                                                      \
        bool refill;                                                            \
        type value;                                                             \
                                                                                \
        warn_unseeded_randomness();                                             \
                                                                                \
        /* Before the crng is ready, bypass the batch entirely. */              \
        if (!crng_ready()) {                                                    \
                _get_random_bytes(&value, sizeof(value));                       \
                return value;                                                   \
        }                                                                       \
                                                                                \
        local_lock_irqsave(&batched_entropy_ ##type.lock, irqflags);            \
        cache = raw_cpu_ptr(&batched_entropy_##type);                           \
                                                                                \
        /* Refill when the batch is used up or the base generation moved. */    \
        gen = READ_ONCE(base_crng.generation);                                  \
        refill = cache->position >= ARRAY_SIZE(cache->entropy) ||               \
                 gen != cache->generation;                                      \
        if (refill) {                                                           \
                _get_random_bytes(cache->entropy, sizeof(cache->entropy));      \
                cache->position = 0;                                            \
                cache->generation = gen;                                        \
        }                                                                       \
                                                                                \
        /* Hand out one slot and zero it so it cannot be served again. */       \
        value = cache->entropy[cache->position];                                \
        cache->entropy[cache->position] = 0;                                    \
        cache->position++;                                                      \
        local_unlock_irqrestore(&batched_entropy_ ##type.lock, irqflags);       \
        return value;                                                           \
}                                                                               \
EXPORT_SYMBOL(get_random_ ##type);

/*
 * Instantiate the batched generators. Each line defines struct batch_<type>,
 * the per-CPU batched_entropy_<type> variable, and the exported
 * get_random_<type>() function for that integer width.
 */
DEFINE_BATCHED_ENTROPY(u8)
DEFINE_BATCHED_ENTROPY(u16)
DEFINE_BATCHED_ENTROPY(u32)
DEFINE_BATCHED_ENTROPY(u64)

/*
 * Return a random u32 in [0, ceil), or a plain get_random_u32() value when
 * @ceil is 0.
 */
u32 __get_random_u32_below(u32 ceil)
{
        /*
         * Slow path for a ceil that is not a compile-time constant. It is
         * still fast most of the time: do the usual reciprocal multiply and
         * opportunistically compare the low half of the product against ceil
         * itself. Only when that fails is the exact rejection bound computed,
         * after which samples whose low half lands in the range that is not
         * evenly divisible by ceil are thrown away. `-ceil % ceil` plays the
         * role of `2^32 % ceil` while staying computable in 32 bits.
         */
        u32 sample = get_random_u32();
        u64 product;

        /*
         * Strictly speaking ceil == 0 is undefined, and the constant version
         * in the header build-bugs on it. For the non-constant case it is
         * handy to let it degrade to a plain get_random_u32(), so that
         * get_random_u32_inclusive() can cover its whole range without
         * undefined behavior.
         */
        if (unlikely(!ceil))
                return sample;

        product = (u64)ceil * sample;
        if (unlikely((u32)product < ceil)) {
                const u32 reject_below = -ceil % ceil;

                while (unlikely((u32)product < reject_below))
                        product = (u64)ceil * get_random_u32();
        }

        /* The high half of the product is the scaled result. */
        return product >> 32;
}
EXPORT_SYMBOL(__get_random_u32_below);

#ifdef CONFIG_SMP
/*
 * This function is called when the CPU is coming up, with entry
 * CPUHP_RANDOM_PREPARE, which comes before CPUHP_WORKQUEUE_PREP.
 *
 * @cpu: the CPU whose per-cpu RNG state is to be invalidated.
 *
 * Always returns 0.
 */
int __cold random_prepare_cpu(unsigned int cpu)
{
        /*
         * When the cpu comes back online, immediately invalidate both
         * the per-cpu crng and all batches, so that we serve fresh
         * randomness.
         */
        per_cpu_ptr(&crngs, cpu)->generation = ULONG_MAX;
        /*
         * UINT_MAX is past the end of every batch array, so the next
         * get_random_<type>() on this CPU takes the refill branch.
         */
        per_cpu_ptr(&batched_entropy_u8, cpu)->position = UINT_MAX;
        per_cpu_ptr(&batched_entropy_u16, cpu)->position = UINT_MAX;
        per_cpu_ptr(&batched_entropy_u32, cpu)->position = UINT_MAX;
        per_cpu_ptr(&batched_entropy_u64, cpu)->position = UINT_MAX;
        return 0;
}
#endif


/**********************************************************************
 *
 * Entropy accumulation and extraction routines.
 *
 * Callers may add entropy via:
 *
 *     static void mix_pool_bytes(const void *buf, size_t len)
 *
 * After which, if added entropy should be credited:
 *
 *     static void credit_init_bits(size_t bits)
 *
 * Finally, extract entropy via:
 *
 *     static void extract_entropy(void *buf, size_t len)
 *
 **********************************************************************/

/*
 * Thresholds, in bits, for the input pool's init_bits counter. POOL_BITS is
 * the width of one BLAKE2s digest and is also the cap that
 * _credit_init_bits() saturates at.
 */
enum {
        POOL_BITS = BLAKE2S_HASH_SIZE * 8,
        POOL_READY_BITS = POOL_BITS, /* When crng_init->CRNG_READY */
        POOL_EARLY_BITS = POOL_READY_BITS / 2 /* When crng_init->CRNG_EARLY */
};

/*
 * The single global input pool: a running BLAKE2s state that entropy is
 * hashed into, the spinlock that guards it, and the count of bits credited
 * so far (see _credit_init_bits()).
 */
static struct {
        struct blake2s_state hash;
        spinlock_t lock;
        unsigned int init_bits;
} input_pool = {
        /* Statically set up the hash state with a BLAKE2S_HASH_SIZE output length. */
        .hash.h = { BLAKE2S_IV0 ^ (0x01010000 | BLAKE2S_HASH_SIZE),
                    BLAKE2S_IV1, BLAKE2S_IV2, BLAKE2S_IV3, BLAKE2S_IV4,
                    BLAKE2S_IV5, BLAKE2S_IV6, BLAKE2S_IV7 },
        .hash.outlen = BLAKE2S_HASH_SIZE,
        .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
};

/*
 * Hash @len bytes from @buf into the input pool. This takes no lock itself;
 * callers in this file either hold input_pool.lock around it or run during
 * early init.
 */
static void _mix_pool_bytes(const void *buf, size_t len)
{
        blake2s_update(&input_pool.hash, buf, len);
}

/*
 * Locked variant of _mix_pool_bytes(): hashes @len bytes from @buf into the
 * input pool while holding input_pool.lock with interrupts disabled. The
 * initialization bit counter is left untouched; callers that want the data
 * credited must call credit_init_bits() themselves.
 */
static void mix_pool_bytes(const void *buf, size_t len)
{
        unsigned long irqflags;

        spin_lock_irqsave(&input_pool.lock, irqflags);
        _mix_pool_bytes(buf, len);
        spin_unlock_irqrestore(&input_pool.lock, irqflags);
}

/*
 * This is an HKDF-like construction for using the hashed collected entropy
 * as a PRF key, that's then expanded block-by-block.
 *
 * @buf: output buffer.
 * @len: number of bytes to write into @buf.
 *
 * The input pool is finalized into a seed and immediately re-keyed under
 * input_pool.lock; the output itself is generated after the lock is dropped.
 */
static void extract_entropy(void *buf, size_t len)
{
        unsigned long flags;
        u8 seed[BLAKE2S_HASH_SIZE], next_key[BLAKE2S_HASH_SIZE];
        struct {
                unsigned long rdseed[32 / sizeof(long)];
                size_t counter;
        } block;
        size_t i, longs;

        /*
         * Fill block.rdseed, preferring arch_get_random_seed_longs(), then
         * arch_get_random_longs(), and finally falling back to one
         * random_get_entropy() sample per remaining slot.
         */
        for (i = 0; i < ARRAY_SIZE(block.rdseed);) {
                longs = arch_get_random_seed_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i);
                if (longs) {
                        i += longs;
                        continue;
                }
                longs = arch_get_random_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i);
                if (longs) {
                        i += longs;
                        continue;
                }
                block.rdseed[i++] = random_get_entropy();
        }

        spin_lock_irqsave(&input_pool.lock, flags);

        /* seed = HASHPRF(last_key, entropy_input) */
        blake2s_final(&input_pool.hash, seed);

        /* next_key = HASHPRF(seed, RDSEED || 0) */
        block.counter = 0;
        blake2s(next_key, (u8 *)&block, seed, sizeof(next_key), sizeof(block), sizeof(seed));
        blake2s_init_key(&input_pool.hash, BLAKE2S_HASH_SIZE, next_key, sizeof(next_key));

        spin_unlock_irqrestore(&input_pool.lock, flags);
        memzero_explicit(next_key, sizeof(next_key));

        /* Emit the output in chunks of at most BLAKE2S_HASH_SIZE bytes. */
        while (len) {
                i = min_t(size_t, len, BLAKE2S_HASH_SIZE);
                /* output = HASHPRF(seed, RDSEED || ++counter) */
                ++block.counter;
                blake2s(buf, (u8 *)&block, seed, i, sizeof(block), sizeof(seed));
                len -= i;
                buf += i;
        }

        /* Wipe the seed and the RDSEED/counter block from the stack. */
        memzero_explicit(seed, sizeof(seed));
        memzero_explicit(&block, sizeof(block));
}

/*
 * Credit @bits toward initialization, but only while the crng is not yet
 * ready. The body is wrapped in do { } while (0) so the macro behaves as a
 * single statement: an `else` written after a guarded use binds to the
 * caller's `if`, not to the `if` hidden inside the macro.
 */
#define credit_init_bits(bits)                                  \
        do {                                                    \
                if (!crng_ready())                              \
                        _credit_init_bits(bits);                \
        } while (0)

/*
 * Add @bits to input_pool.init_bits, saturating at POOL_BITS, and act when
 * the total crosses a threshold:
 *
 *  - crossing POOL_READY_BITS reseeds the crng and notifies and wakes
 *    everything waiting on readiness;
 *  - crossing POOL_EARLY_BITS (without reaching READY) seeds base_crng.key
 *    once and moves crng_init to CRNG_EARLY.
 *
 * A @bits of zero is a no-op.
 */
static void __cold _credit_init_bits(size_t bits)
{
        static DECLARE_WORK(set_ready, crng_set_ready);
        unsigned int new, orig, add;
        unsigned long flags;

        if (!bits)
                return;

        add = min_t(size_t, bits, POOL_BITS);

        /* Lock-free saturating add; retried if init_bits changed underneath us. */
        orig = READ_ONCE(input_pool.init_bits);
        do {
                new = min_t(unsigned int, POOL_BITS, orig + add);
        } while (!try_cmpxchg(&input_pool.init_bits, &orig, new));

        if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) {
                crng_reseed(NULL); /* Sets crng_init to CRNG_READY under base_crng.lock. */
                /* Only queue the work once static keys and the workqueue exist. */
                if (static_key_initialized && system_unbound_wq)
                        queue_work(system_unbound_wq, &set_ready);
                atomic_notifier_call_chain(&random_ready_notifier, 0, NULL);
                wake_up_interruptible(&crng_init_wait);
                kill_fasync(&fasync, SIGIO, POLL_IN);
                pr_notice("crng init done\n");
                if (urandom_warning.missed)
                        pr_notice("%d urandom warning(s) missed due to ratelimiting\n",
                                  urandom_warning.missed);
        } else if (orig < POOL_EARLY_BITS && new >= POOL_EARLY_BITS) {
                spin_lock_irqsave(&base_crng.lock, flags);
                /* Check if crng_init is CRNG_EMPTY, to avoid race with crng_reseed(). */
                if (crng_init == CRNG_EMPTY) {
                        extract_entropy(base_crng.key, sizeof(base_crng.key));
                        crng_init = CRNG_EARLY;
                }
                spin_unlock_irqrestore(&base_crng.lock, flags);
        }
}


/**********************************************************************
 *
 * Entropy collection routines.
 *
 * The following exported functions are used for pushing entropy into
 * the above entropy accumulation routines:
 *
 *        void add_device_randomness(const void *buf, size_t len);
 *        void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after);
 *        void add_bootloader_randomness(const void *buf, size_t len);
 *        void add_vmfork_randomness(const void *unique_vm_id, size_t len);
 *        void add_interrupt_randomness(int irq);
 *        void add_input_randomness(unsigned int type, unsigned int code, unsigned int value);
 *        void add_disk_randomness(struct gendisk *disk);
 *
 * add_device_randomness() adds data to the input pool that
 * is likely to differ between two devices (or possibly even per boot).
 * This would be things like MAC addresses or serial numbers, or the
 * read-out of the RTC. This does *not* credit any actual entropy to
 * the pool, but it initializes the pool to different values for devices
 * that might otherwise be identical and have very little entropy
 * available to them (particularly common in the embedded world).
 *
 * add_hwgenerator_randomness() is for true hardware RNGs, and will credit
 * entropy as specified by the caller. If the entropy pool is full it will
 * block until more entropy is needed.
 *
 * add_bootloader_randomness() is called by bootloader drivers, such as EFI
 * and device tree, and credits its input depending on whether or not the
 * command line option 'random.trust_bootloader' is set.
 *
 * add_vmfork_randomness() adds a unique (but not necessarily secret) ID
 * representing the current instance of a VM to the pool, without crediting,
 * and then force-reseeds the crng so that it takes effect immediately.
 *
 * add_interrupt_randomness() uses the interrupt timing as random
 * inputs to the entropy pool. Using the cycle counters and the irq source
 * as inputs, it feeds the input pool roughly once a second or after 64
 * interrupts, crediting 1 bit of entropy for whichever comes first.
 *
 * add_input_randomness() uses the input layer interrupt timing, as well
 * as the event type information from the hardware.
 *
 * add_disk_randomness() uses what amounts to the seek time of block
 * layer request events, on a per-disk_devt basis, as input to the
 * entropy pool. Note that high-speed solid state drives with very low
 * seek times do not make for good sources of entropy, as their seek
 * times are usually fairly consistent.
 *
 * The last two routines try to estimate how many bits of entropy
 * to credit. They do this by keeping track of the first and second
 * order deltas of the event timings.
 *
 **********************************************************************/

/*
 * Boot-time policy switches, both defaulting to true:
 *
 *  - trust_cpu decides whether random_init_early() credits the bits
 *    obtained from the architecture RNG;
 *  - trust_bootloader decides whether add_bootloader_randomness() credits
 *    the seed it is handed.
 *
 * Each can be overridden with the matching early parameter, whose argument
 * is parsed by kstrtobool(); the parsers return kstrtobool()'s result.
 */
static bool trust_cpu __initdata = true;
static bool trust_bootloader __initdata = true;
static int __init parse_trust_cpu(char *arg)
{
        return kstrtobool(arg, &trust_cpu);
}
static int __init parse_trust_bootloader(char *arg)
{
        return kstrtobool(arg, &trust_bootloader);
}
early_param("random.trust_cpu", parse_trust_cpu);
early_param("random.trust_bootloader", parse_trust_bootloader);

/*
 * Power-management notifier callback. Every notification stirs the action
 * code, three clock readings and a cycle-counter sample into the input pool.
 * When the crng is ready, it is additionally reseeded on PM_RESTORE_PREPARE,
 * and on PM_POST_SUSPEND unless one of the autosleep options is enabled.
 *
 * Always returns 0.
 */
static int random_pm_notification(struct notifier_block *nb, unsigned long action, void *data)
{
        unsigned long irqflags, cycles = random_get_entropy();

        /*
         * The three clocks together encode how long the system was
         * suspended, in a way that differs from earlier suspends.
         */
        ktime_t stamps[] = { ktime_get(), ktime_get_boottime(), ktime_get_real() };

        spin_lock_irqsave(&input_pool.lock, irqflags);
        _mix_pool_bytes(&action, sizeof(action));
        _mix_pool_bytes(stamps, sizeof(stamps));
        _mix_pool_bytes(&cycles, sizeof(cycles));
        spin_unlock_irqrestore(&input_pool.lock, irqflags);

        if (!crng_ready())
                return 0;

        switch (action) {
        case PM_RESTORE_PREPARE:
                break;
        case PM_POST_SUSPEND:
                if (IS_ENABLED(CONFIG_PM_AUTOSLEEP) ||
                    IS_ENABLED(CONFIG_PM_USERSPACE_AUTOSLEEP))
                        return 0;
                break;
        default:
                return 0;
        }

        crng_reseed(NULL);
        pr_notice("crng reseeded on system resumption\n");
        return 0;
}

/* Notifier block handed to register_pm_notifier() by random_init(). */
static struct notifier_block pm_notifier = { .notifier_call = random_pm_notification };

/*
 * This is called extremely early, before time keeping functionality is
 * available, but arch randomness is. Interrupts are not yet enabled.
 *
 * @command_line: the kernel command line; its bytes are mixed into the pool.
 */
void __init random_init_early(const char *command_line)
{
        unsigned long entropy[BLAKE2S_BLOCK_SIZE / sizeof(long)];
        size_t i, longs, arch_bits;

#if defined(LATENT_ENTROPY_PLUGIN)
        static const u8 compiletime_seed[BLAKE2S_BLOCK_SIZE] __initconst __latent_entropy;
        _mix_pool_bytes(compiletime_seed, sizeof(compiletime_seed));
#endif

        /*
         * Pull up to ARRAY_SIZE(entropy) longs from the architecture RNG.
         * Each batch is written to the start of the scratch buffer and mixed
         * in right away, so only the running count i needs to advance.
         * arch_bits starts at the full amount and loses one long's worth of
         * bits for every slot neither arch source could supply.
         */
        for (i = 0, arch_bits = sizeof(entropy) * 8; i < ARRAY_SIZE(entropy);) {
                longs = arch_get_random_seed_longs(entropy, ARRAY_SIZE(entropy) - i);
                if (longs) {
                        _mix_pool_bytes(entropy, sizeof(*entropy) * longs);
                        i += longs;
                        continue;
                }
                longs = arch_get_random_longs(entropy, ARRAY_SIZE(entropy) - i);
                if (longs) {
                        _mix_pool_bytes(entropy, sizeof(*entropy) * longs);
                        i += longs;
                        continue;
                }
                arch_bits -= sizeof(*entropy) * 8;
                ++i;
        }

        /* Mixed in without crediting any entropy. */
        _mix_pool_bytes(init_utsname(), sizeof(*(init_utsname())));
        _mix_pool_bytes(command_line, strlen(command_line));

        /* Reseed if already seeded by earlier phases. */
        if (crng_ready())
                crng_reseed(NULL);
        else if (trust_cpu)
                _credit_init_bits(arch_bits);
}

/*
 * This is called a little bit after the prior function, and now there is
 * access to timestamps counters. Interrupts are not yet enabled.
 *
 * Mixes the wall-clock time and a cycle-counter sample into the pool,
 * finishes any readiness bookkeeping that could not run earlier, and
 * registers the power-management notifier.
 */
void __init random_init(void)
{
        unsigned long entropy = random_get_entropy();
        ktime_t now = ktime_get_real();

        _mix_pool_bytes(&now, sizeof(now));
        _mix_pool_bytes(&entropy, sizeof(entropy));
        add_latent_entropy();

        /*
         * If we were initialized by the cpu or bootloader before jump labels
         * or workqueues are initialized, then we should enable the static
         * branch here, where it's guaranteed that these have been initialized.
         */
        if (!static_branch_likely(&crng_is_ready) && crng_init >= CRNG_READY)
                crng_set_ready(NULL);

        /* Reseed if already seeded by earlier phases. */
        if (crng_ready())
                crng_reseed(NULL);

        WARN_ON(register_pm_notifier(&pm_notifier));

        /* A zero sample here is taken to mean there is no usable counter. */
        WARN(!entropy, "Missing cycle counter and fallback timer; RNG "
                       "entropy collection will consequently suffer.");
}

/*
 * Mix device- or boot-specific data into the input pool so that otherwise
 * identical devices do not start out with the same pool contents.
 *
 * @buf: the data to mix in.
 * @len: length of @buf in bytes.
 *
 * No entropy is credited for this. A cycle-counter sample taken on entry is
 * mixed in ahead of @buf.
 */
void add_device_randomness(const void *buf, size_t len)
{
        unsigned long irqflags;
        unsigned long stamp;

        stamp = random_get_entropy();

        spin_lock_irqsave(&input_pool.lock, irqflags);
        _mix_pool_bytes(&stamp, sizeof(stamp));
        _mix_pool_bytes(buf, len);
        spin_unlock_irqrestore(&input_pool.lock, irqflags);
}
EXPORT_SYMBOL(add_device_randomness);

/*
 * Entry point for in-kernel drivers of true hardware RNGs.
 *
 * @buf:         bytes produced by the device.
 * @len:         length of @buf in bytes.
 * @entropy:     number of bits to credit for @buf.
 * @sleep_after: whether this call may sleep afterwards to throttle a device
 *               that could otherwise feed bits endlessly.
 */
void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after)
{
        mix_pool_bytes(buf, len);
        credit_init_bits(entropy);

        if (!sleep_after || kthread_should_stop())
                return;

        /*
         * Throttle writing to once every reseed interval, unless we're not
         * yet initialized or no entropy is credited.
         */
        if (crng_ready() || !entropy)
                schedule_timeout_interruptible(crng_reseed_interval());
}
EXPORT_SYMBOL_GPL(add_hwgenerator_randomness);

/*
 * Take a random seed handed over by the bootloader. The seed is always mixed
 * into the pool; it is credited at 8 bits per byte only when
 * trust_bootloader (command line option 'random.trust_bootloader') is true.
 */
void __init add_bootloader_randomness(const void *buf, size_t len)
{
        mix_pool_bytes(buf, len);

        if (!trust_bootloader)
                return;

        credit_init_bits(len * 8);
}

#if IS_ENABLED(CONFIG_VMGENID)
/* Chain of callbacks invoked at the end of add_vmfork_randomness(). */
static BLOCKING_NOTIFIER_HEAD(vmfork_chain);

/*
 * Handle a new unique VM ID, which is unique, not secret, so we
 * don't credit it, but we do immediately force a reseed after so
 * that it's used by the crng posthaste.
 *
 * @unique_vm_id: the new ID bytes.
 * @len:          length of @unique_vm_id in bytes.
 *
 * The reseed only happens once the crng is ready; the vmfork notifier chain
 * is called either way.
 */
void __cold add_vmfork_randomness(const void *unique_vm_id, size_t len)
{
        add_device_randomness(unique_vm_id, len);
        if (crng_ready()) {
                crng_reseed(NULL);
                pr_notice("crng reseeded due to virtual machine fork\n");
        }
        blocking_notifier_call_chain(&vmfork_chain, 0, NULL);
}
#if IS_MODULE(CONFIG_VMGENID)
EXPORT_SYMBOL_GPL(add_vmfork_randomness);
#endif

/*
 * Register @nb on the vmfork notifier chain, which add_vmfork_randomness()
 * calls. Returns the result of blocking_notifier_chain_register().
 */
int __cold register_random_vmfork_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&vmfork_chain, nb);
}
EXPORT_SYMBOL_GPL(register_random_vmfork_notifier);

/*
 * Remove @nb from the vmfork notifier chain. Returns the result of
 * blocking_notifier_chain_unregister().
 */
int __cold unregister_random_vmfork_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&vmfork_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_random_vmfork_notifier);
#endif

/* Per-CPU accumulator for interrupt timing samples (see irq_randomness). */
struct fast_pool {
        /* Four-word state stirred by fast_mix(). */
        unsigned long pool[4];
        /* jiffies value recorded by mix_interrupt_randomness() on its last flush. */
        unsigned long last;
        /*
         * Event counter, reset to 0 on flush. Bit 31 doubles as the
         * MIX_INFLIGHT flag set by add_interrupt_randomness().
         */
        unsigned int count;
        /* Timer whose callback is mix_interrupt_randomness(). */
        struct timer_list mix;
};

/* Forward declaration: needed by the timer initializer just below. */
static void mix_interrupt_randomness(struct timer_list *work);

/*
 * One fast pool per CPU. The initial pool words are the SipHash constants on
 * 64-bit builds and the HalfSipHash constants otherwise; FASTMIX_PERM is
 * bound to the matching permutation for use by fast_mix().
 */
static DEFINE_PER_CPU(struct fast_pool, irq_randomness) = {
#ifdef CONFIG_64BIT
#define FASTMIX_PERM SIPHASH_PERMUTATION
        .pool = { SIPHASH_CONST_0, SIPHASH_CONST_1, SIPHASH_CONST_2, SIPHASH_CONST_3 },
#else
#define FASTMIX_PERM HSIPHASH_PERMUTATION
        .pool = { HSIPHASH_CONST_0, HSIPHASH_CONST_1, HSIPHASH_CONST_2, HSIPHASH_CONST_3 },
#endif
        .mix = __TIMER_INITIALIZER(mix_interrupt_randomness, 0)
};

/*
 * [Half]SipHash-1-x starting from an empty key. With a fixed key this offers
 * no security by itself and assumes its inputs are non-malicious.
 *
 * @s:  the four-word SipHash state, updated in place.
 * @v1: first input word.
 * @v2: second input word.
 *
 * Each input word is absorbed the same way: xor into s[3], run one
 * permutation, then xor into s[0].
 */
static void fast_mix(unsigned long s[4], unsigned long v1, unsigned long v2)
{
        const unsigned long words[2] = { v1, v2 };
        unsigned int i;

        for (i = 0; i < 2; ++i) {
                s[3] ^= words[i];
                FASTMIX_PERM(s[0], s[1], s[2], s[3]);
                s[0] ^= words[i];
        }
}

#ifdef CONFIG_SMP
/*
 * This function is called when the CPU has just come online, with
 * entry CPUHP_AP_RANDOM_ONLINE, just after CPUHP_AP_WORKQUEUE_ONLINE.
 *
 * @cpu: the CPU whose fast pool counter is reset.
 *
 * Always returns 0.
 */
int __cold random_online_cpu(unsigned int cpu)
{
        /*
         * During CPU shutdown and before CPU onlining, add_interrupt_
         * randomness() may schedule mix_interrupt_randomness(), and
         * set the MIX_INFLIGHT flag. However, because the worker can
         * be scheduled on a different CPU during this period, that
         * flag will never be cleared. For that reason, we zero out
         * the flag here, which runs just after workqueues are onlined
         * for the CPU again. This also has the effect of setting the
         * irq randomness count to zero so that new accumulated irqs
         * are fresh.
         */
        per_cpu_ptr(&irq_randomness, cpu)->count = 0;
        return 0;
}
#endif

/*
 * Timer callback that flushes a CPU's fast pool into the input pool.
 *
 * @work: the timer embedded in the struct fast_pool being flushed.
 *
 * Half of the fast pool is copied out with interrupts disabled, the event
 * counter is reset, and the copy is then mixed in and credited.
 */
static void mix_interrupt_randomness(struct timer_list *work)
{
        struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
        /*
         * The size of the copied stack pool is explicitly 2 longs so that we
         * only ever ingest half of the siphash output each time, retaining
         * the other half as the next "key" that carries over. The entropy is
         * supposed to be sufficiently dispersed between bits so on average
         * we don't wind up "losing" some.
         */
        unsigned long pool[2];
        unsigned int count;

        /* Check to see if we're running on the wrong CPU due to hotplug. */
        local_irq_disable();
        if (fast_pool != this_cpu_ptr(&irq_randomness)) {
                local_irq_enable();
                return;
        }

        /*
         * Copy the pool to the stack so that the mixer always has a
         * consistent view, before we reenable irqs again.
         */
        memcpy(pool, fast_pool->pool, sizeof(pool));
        count = fast_pool->count;
        /* Zeroing count also clears the MIX_INFLIGHT bit kept in it. */
        fast_pool->count = 0;
        fast_pool->last = jiffies;
        local_irq_enable();

        mix_pool_bytes(pool, sizeof(pool));
        /*
         * Credit one bit per 64 counted events (low 16 bits of count only),
         * at least 1 and at most the number of bits actually mixed in.
         */
        credit_init_bits(clamp_t(unsigned int, (count & U16_MAX) / 64, 1, sizeof(pool) * 8));

        memzero_explicit(pool, sizeof(pool));
}

/*
 * Stir one interrupt's timing into the current CPU's fast pool and, once
 * enough has accumulated, arm the timer that flushes it to the input pool.
 *
 * @irq: the interrupt number; a byte-swapped copy is folded into the input.
 */
void add_interrupt_randomness(int irq)
{
        /* Top bit of fast_pool->count: a flush has been requested and not yet run. */
        enum { MIX_INFLIGHT = 1U << 31 };
        unsigned long entropy = random_get_entropy();
        struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
        struct pt_regs *regs = get_irq_regs();
        unsigned int new_count;

        /* Use the interrupted instruction pointer if known, else our return address. */
        fast_mix(fast_pool->pool, entropy,
                 (regs ? instruction_pointer(regs) : _RET_IP_) ^ swab(irq));
        new_count = ++fast_pool->count;

        /* A flush is already pending; nothing more to do. */
        if (new_count & MIX_INFLIGHT)
                return;

        /* Wait for 1024 events, or for more than HZ jiffies since the last flush. */
        if (new_count < 1024 && !time_is_before_jiffies(fast_pool->last + HZ))
                return;

        fast_pool->count |= MIX_INFLIGHT;
        if (!timer_pending(&fast_pool->mix)) {
                fast_pool->mix.expires = jiffies;
                add_timer_on(&fast_pool->mix, raw_smp_processor_id());
        }
}
EXPORT_SYMBOL_GPL(add_interrupt_randomness);

/*
 * There is one of these per entropy source. It holds the history that
 * add_timer_randomness() needs for its delta-based entropy estimate.
 */
struct timer_rand_state {
        /* jiffies value seen at the previous event. */
        unsigned long last_time;
        /* Previous first-order and second-order deltas, respectively. */
        long last_delta, last_delta2;
};

/*
 * This function adds entropy to the entropy "pool" by using timing
 * delays. It uses the timer_rand_state structure to make an estimate
 * of how many bits of entropy this call has added to the pool. The
 * value "num" is also added to the pool; it should somehow describe
 * the type of event that just happened.
 *
 * @state: per-source history used for the estimate; updated in place.
 * @num:   value describing the event, mixed in alongside the timestamp.
 *
 * Once the crng is ready the data is still mixed in, but the estimate and
 * crediting are skipped.
 */
static void add_timer_randomness(struct timer_rand_state *state, unsigned int num)
{
        unsigned long entropy = random_get_entropy(), now = jiffies, flags;
        long delta, delta2, delta3;
        unsigned int bits;

        /*
         * If we're in a hard IRQ, add_interrupt_randomness() will be called
         * sometime after, so mix into the fast pool.
         */
        if (in_hardirq()) {
                fast_mix(this_cpu_ptr(&irq_randomness)->pool, entropy, num);
        } else {
                spin_lock_irqsave(&input_pool.lock, flags);
                _mix_pool_bytes(&entropy, sizeof(entropy));
                _mix_pool_bytes(&num, sizeof(num));
                spin_unlock_irqrestore(&input_pool.lock, flags);
        }

        if (crng_ready())
                return;

        /*
         * Calculate number of bits of randomness we probably added.
         * We take into account the first, second and third-order deltas
         * in order to make our estimate.
         */
        delta = now - READ_ONCE(state->last_time);
        WRITE_ONCE(state->last_time, now);

        delta2 = delta - READ_ONCE(state->last_delta);
        WRITE_ONCE(state->last_delta, delta);

        delta3 = delta2 - READ_ONCE(state->last_delta2);
        WRITE_ONCE(state->last_delta2, delta2);

        /* Take absolute values, then keep the smallest of the three in delta. */
        if (delta < 0)
                delta = -delta;
        if (delta2 < 0)
                delta2 = -delta2;
        if (delta3 < 0)
                delta3 = -delta3;
        if (delta > delta2)
                delta = delta2;
        if (delta > delta3)
                delta = delta3;

        /*
         * delta is now minimum absolute delta. Round down by 1 bit
         * on general principles, and limit entropy estimate to 11 bits.
         */
        bits = min(fls(delta >> 1), 11);

        /*
         * As mentioned above, if we're in a hard IRQ, add_interrupt_randomness()
         * will run after this, which uses a different crediting scheme of 1 bit
         * per every 64 interrupts. In order to let that function do accounting
         * close to the one in this function, we credit a full 64/64 bit per bit,
         * and then subtract one to account for the extra one added.
         */
        if (in_hardirq())
                this_cpu_ptr(&irq_randomness)->count += max(1u, bits * 64) - 1;
        else
                _credit_init_bits(bits);
}

/*
 * Feed an input-layer event into the entropy machinery.
 *
 * @type:  event type.
 * @code:  event code.
 * @value: event value.
 *
 * An event whose @value equals the previous event's value is dropped.
 * Otherwise @type, @code and @value are folded into a single word and passed
 * to add_timer_randomness() together with the shared input timer state.
 */
void add_input_randomness(unsigned int type, unsigned int code, unsigned int value)
{
        /*
         * Stored at the full width of @value. A narrower type would truncate
         * it on assignment, so the repeat check below would miss repeats of
         * large values and mistake values differing only in their upper bits
         * for repeats.
         */
        static unsigned int last_value;
        static struct timer_rand_state input_timer_state = { INITIAL_JIFFIES };

        /* Ignore autorepeat and the like. */
        if (value == last_value)
                return;

        last_value = value;
        add_timer_randomness(&input_timer_state,
                             (type << 4) ^ code ^ (code >> 4) ^ value);
}
EXPORT_SYMBOL_GPL(add_input_randomness);

#ifdef CONFIG_BLOCK
/*
 * Feed a block-layer event for @disk into the entropy machinery. Does
 * nothing when @disk is NULL or has no timer state attached (disk->random).
 */
void add_disk_randomness(struct gendisk *disk)
{
        if (!disk)
                return;
        if (!disk->random)
                return;

        /* First major is 1, so we get >= 0x200 here. */
        add_timer_randomness(disk->random, 0x100 + disk_devt(disk));
}
EXPORT_SYMBOL_GPL(add_disk_randomness);

/*
 * Allocate and attach the timer state that add_disk_randomness() uses for
 * @disk. On allocation failure disk->random is not assigned.
 */
void __cold rand_initialize_disk(struct gendisk *disk)
{
        struct timer_rand_state *trs = kzalloc(sizeof(*trs), GFP_KERNEL);

        /*
         * A failed allocation is tolerated: that disk simply won't be
         * used as an entropy source.
         */
        if (!trs)
                return;

        trs->last_time = INITIAL_JIFFIES;
        disk->random = trs;
}
#endif

/*
 * Working state shared between try_to_generate_entropy() and the
 * entropy_timer() callback.
 */
struct entropy_timer_state {
        /* Most recent random_get_entropy() reading taken by the entropy loop. */
        unsigned long entropy;
        /* Timer whose callback is entropy_timer(). */
        struct timer_list timer;
        /* Number of timer callbacks that have run; incremented atomically. */
        atomic_t samples;
        /* One bit is credited each time samples reaches a multiple of this. */
        unsigned int samples_per_bit;
};

/*
 * Each time the timer fires, we expect that we got an unpredictable jump in
 * the cycle counter. Even if the timer is running on another CPU, the timer
 * activity will be touching the stack of the CPU that is generating entropy.
 *
 * Note that we don't re-arm the timer in the timer itself - we are happy to be
 * scheduled away, since that just makes the load more complex, but we do not
 * want the timer to keep ticking unless the entropy loop is running.
 *
 * So the re-arming always happens in the entropy loop itself.
 */
static void __cold entropy_timer(struct timer_list *timer)
{
        struct entropy_timer_state *state;
        unsigned long sample;

        state = container_of(timer, struct entropy_timer_state, timer);

        /* Every callback contributes a fresh counter reading to the pool. */
        sample = random_get_entropy();
        mix_pool_bytes(&sample, sizeof(sample));

        /* Only every samples_per_bit-th callback earns a credited bit. */
        if (atomic_inc_return(&state->samples) % state->samples_per_bit)
                return;
        credit_init_bits(1);
}

/*
 * If we have an actual cycle counter, see if we can generate enough entropy
 * with timing noise.
 *
 * The function first samples random_get_entropy() repeatedly to decide how
 * many timer callbacks are needed per credited bit, and bails out if that
 * number is too large. Otherwise it loops until crng_ready() is true or a
 * signal is pending for the current task, mixing counter readings into the
 * pool and keeping the entropy_timer() callback queued on another CPU.
 */
static void __cold try_to_generate_entropy(void)
{
        enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 };
        /* Oversized so the state can be aligned to SMP_CACHE_BYTES inside it. */
        u8 stack_bytes[sizeof(struct entropy_timer_state) + SMP_CACHE_BYTES - 1];
        struct entropy_timer_state *stack = PTR_ALIGN((void *)stack_bytes, SMP_CACHE_BYTES);
        unsigned int i, num_different = 0;
        unsigned long last = random_get_entropy();
        /* Last CPU the timer was queued on; -1 makes the first search start at the beginning. */
        int cpu = -1;

        /* Count how often two consecutive counter readings differ. */
        for (i = 0; i < NUM_TRIAL_SAMPLES - 1; ++i) {
                stack->entropy = random_get_entropy();
                if (stack->entropy != last)
                        ++num_different;
                last = stack->entropy;
        }
        /*
         * The fewer readings changed, the more timer samples are demanded per
         * credited bit. Give up if that demand exceeds MAX_SAMPLES_PER_BIT.
         */
        stack->samples_per_bit = DIV_ROUND_UP(NUM_TRIAL_SAMPLES, num_different + 1);
        if (stack->samples_per_bit > MAX_SAMPLES_PER_BIT)
                return;

        atomic_set(&stack->samples, 0);
        timer_setup_on_stack(&stack->timer, entropy_timer, 0);
        while (!crng_ready() && !signal_pending(current)) {
                /*
                 * Check !timer_pending() and then ensure that any previous callback has finished
                 * executing by checking try_to_del_timer_sync(), before queueing the next one.
                 */
                if (!timer_pending(&stack->timer) && try_to_del_timer_sync(&stack->timer) >= 0) {
                        struct cpumask timer_cpus;
                        unsigned int num_cpus;

                        /*
                         * Preemption must be disabled here, both to read the current CPU number
                         * and to avoid scheduling a timer on a dead CPU.
                         */
                        preempt_disable();

                        /* Only schedule callbacks on timer CPUs that are online. */
                        cpumask_and(&timer_cpus, housekeeping_cpumask(HK_TYPE_TIMER), cpu_online_mask);
                        num_cpus = cpumask_weight(&timer_cpus);
                        /* In very bizarre case of misconfiguration, fallback to all online. */
                        if (unlikely(num_cpus == 0)) {
                                timer_cpus = *cpu_online_mask;
                                num_cpus = cpumask_weight(&timer_cpus);
                        }

                        /* Basic CPU round-robin, which avoids the current CPU. */
                        do {
                                cpu = cpumask_next(cpu, &timer_cpus);
                                /* Ran off the end of the mask: wrap to its first CPU. */
                                if (cpu >= nr_cpu_ids)
                                        cpu = cpumask_first(&timer_cpus);
                        } while (cpu == smp_processor_id() && num_cpus > 1);

                        /* Expiring the timer at `jiffies` means it's the next tick. */
                        stack->timer.expires = jiffies;

                        add_timer_on(&stack->timer, cpu);

                        preempt_enable();
                }
                /* Mix the previous reading, yield the CPU, then take a new reading. */
                mix_pool_bytes(&stack->entropy, sizeof(stack->entropy));
                schedule();
                stack->entropy = random_get_entropy();
        }
        /* Mix in the reading taken on the final loop iteration. */
        mix_pool_bytes(&stack->entropy, sizeof(stack->entropy));

        /* The timer lives on this stack frame, so tear it down before returning. */
        del_timer_sync(&stack->timer);
        destroy_timer_on_stack(&stack->timer);
}


/**********************************************************************
 *
 * Userspace reader/writer interfaces.
 *
 * getrandom(2) is the primary modern interface into the RNG and should
 * be used in preference to anything else.
 *
 * Reading from /dev/random has the same functionality as calling
 * getrandom(2) with flags=0. In earlier versions, however, it had
 * vastly different semantics and should therefore be avoided, to
 * prevent backwards compatibility issues.
 *
 * Reading from /dev/urandom has the same functionality as calling
 * getrandom(2) with flags=GRND_INSECURE. Because it does not block
 * waiting for the RNG to be ready, it should not be used.
 *
 * Writing to either /dev/random or /dev/urandom adds entropy to
 * the input pool but does not credit it.
 *
 * Polling on /dev/random indicates when the RNG is initialized, on
 * the read side, and when it wants new entropy, on the write side.
 *
 * Both /dev/random and /dev/urandom have the same set of ioctls for
 * adding entropy, getting the entropy count, zeroing the count, and
 * reseeding the crng.
 *
 **********************************************************************/

/*
 * getrandom(2): fill the user buffer @ubuf with @len random bytes.
 *
 * Returns -EINVAL for unknown flags or for GRND_INSECURE combined with
 * GRND_RANDOM, -EAGAIN when the RNG is not ready and GRND_NONBLOCK was
 * given, any error from wait_for_random_bytes() or import_ubuf(), and
 * otherwise the result of get_random_bytes_user().
 */
SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags)
{
        const unsigned int known_flags = GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE;
        const unsigned int exclusive_flags = GRND_INSECURE | GRND_RANDOM;
        struct iov_iter iter;
        int ret;

        if (flags & ~known_flags)
                return -EINVAL;

        /*
         * Requesting insecure and blocking randomness at the same time makes
         * no sense.
         */
        if ((flags & exclusive_flags) == exclusive_flags)
                return -EINVAL;

        /* Unless the caller opted out with GRND_INSECURE, insist on a ready RNG. */
        if (!crng_ready() && !(flags & GRND_INSECURE)) {
                if (flags & GRND_NONBLOCK)
                        return -EAGAIN;

                ret = wait_for_random_bytes();
                if (unlikely(ret))
                        return ret;
        }

        ret = import_ubuf(ITER_DEST, ubuf, len, &iter);
        if (unlikely(ret))
                return ret;

        return get_random_bytes_user(&iter);
}

/*
 * Poll handler: registers on crng_init_wait and reports readable once the
 * RNG is ready, writable until then.
 */
static __poll_t random_poll(struct file *file, poll_table *wait)
{
        poll_wait(file, &crng_init_wait, wait);

        if (crng_ready())
                return EPOLLIN | EPOLLRDNORM;

        return EPOLLOUT | EPOLLWRNORM;
}

/*
 * Copy the contents of @iter into the input pool, one
 * BLAKE2S_BLOCK_SIZE-sized chunk at a time.
 *
 * Returns the number of bytes mixed in, 0 if @iter was empty to begin
 * with, or -EFAULT if nothing at all could be copied.
 */
static ssize_t write_pool_user(struct iov_iter *iter)
{
        u8 chunk[BLAKE2S_BLOCK_SIZE];
        ssize_t total = 0;
        size_t got;

        /* The per-page check below needs a page to be a whole number of chunks. */
        BUILD_BUG_ON(PAGE_SIZE % sizeof(chunk) != 0);

        if (unlikely(!iov_iter_count(iter)))
                return 0;

        while (1) {
                got = copy_from_iter(chunk, sizeof(chunk), iter);
                total += got;
                mix_pool_bytes(chunk, got);

                /* Stop when the iterator is drained or a copy came up short. */
                if (!iov_iter_count(iter) || got != sizeof(chunk))
                        break;

                /* Between pages only: honour pending signals, then yield. */
                if (total % PAGE_SIZE)
                        continue;
                if (signal_pending(current))
                        break;
                cond_resched();
        }

        /* Do not leave caller-supplied seed material on the stack. */
        memzero_explicit(chunk, sizeof(chunk));

        if (!total)
                return -EFAULT;
        return total;
}

/*
 * write_iter handler shared by random_fops and urandom_fops: forwards the
 * data to write_pool_user() and returns its result. @kiocb is unused.
 */
static ssize_t random_write_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        return write_pool_user(iter);
}

/*
 * read_iter handler for urandom_fops. Never waits for the RNG to become
 * ready; when it is not ready, a notice is logged, limited by the maxwarn
 * budget and the urandom_warning ratelimit unless ratelimit_disable is set.
 */
static ssize_t urandom_read_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        static int maxwarn = 10;

        /*
         * Opportunistically attempt to initialize the RNG on platforms that
         * have fast cycle counters, but don't (for now) require it to succeed.
         */
        if (!crng_ready())
                try_to_generate_entropy();

        if (crng_ready())
                return get_random_bytes_user(iter);

        if (!ratelimit_disable && maxwarn <= 0) {
                /* Warning budget exhausted: only count the suppressed message. */
                ++urandom_warning.missed;
        } else if (ratelimit_disable || __ratelimit(&urandom_warning)) {
                --maxwarn;
                pr_notice("%s: uninitialized urandom read (%zu bytes read)\n",
                          current->comm, iov_iter_count(iter));
        }

        return get_random_bytes_user(iter);
}

/*
 * read_iter handler for random_fops. Waits for the RNG to be ready, except
 * that a non-blocking request made before then fails with -EAGAIN.
 */
static ssize_t random_read_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        int err;

        if (!crng_ready()) {
                /* Either the iocb or the file itself may request non-blocking I/O. */
                if (kiocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
                        return -EAGAIN;
                if (kiocb->ki_filp->f_flags & O_NONBLOCK)
                        return -EAGAIN;
        }

        err = wait_for_random_bytes();
        if (err)
                return err;

        return get_random_bytes_user(iter);
}

static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;
        int ent_count;

        switch (cmd) {
        case RNDGETENTCNT:
                /* Inherently racy, no point locking. */
                if (put_user(input_pool.init_bits, p))
                        return -EFAULT;
                return 0;
        case RNDADDTOENTCNT:
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (get_user(ent_count, p))
                        return -EFAULT;
                if (ent_count < 0)
                        return -EINVAL;
                credit_init_bits(ent_count);
                return 0;
        case RNDADDENTROPY: {
                struct iov_iter iter;
                ssize_t ret;
                int len;

                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (get_user(ent_count, p++))
                        return -EFAULT;
                if (ent_count < 0)
                        return -EINVAL;
                if (get_user(len, p++))
                        return -EFAULT;
                ret = import_ubuf(ITER_SOURCE, p, len, &iter);
                if (unlikely(ret))
                        return ret;
                ret = write_pool_user(&iter);
                if (unlikely(ret < 0))
                        return ret;
                /* Since we're crediting, enforce that it was all written into the pool. */
                if (unlikely(ret != len))
                        return -EFAULT;
                credit_init_bits(ent_count);
                return 0;
        }
        case RNDZAPENTCNT:
        case RNDCLEARPOOL:
                /* No longer has any effect. */
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                return 0;
        case RNDRESEEDCRNG:
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (!crng_ready())
                        return -ENODATA;
                crng_reseed(NULL);
                return 0;
        default:
                return -EINVAL;
        }
}

/*
 * fasync handler shared by random_fops and urandom_fops: hands the request
 * to fasync_helper() with the file-level `fasync` list and returns its result.
 */
static int random_fasync(int fd, struct file *filp, int on)
{
        return fasync_helper(fd, filp, on, &fasync);
}

/*
 * File operations whose read side (random_read_iter) waits for the RNG to
 * be ready and which supports poll (random_poll). Write, ioctl and fasync
 * are the same handlers urandom_fops uses.
 */
const struct file_operations random_fops = {
        .read_iter = random_read_iter,
        .write_iter = random_write_iter,
        .poll = random_poll,
        .unlocked_ioctl = random_ioctl,
        .compat_ioctl = compat_ptr_ioctl,
        .fasync = random_fasync,
        .llseek = noop_llseek,
        .splice_read = copy_splice_read,
        .splice_write = iter_file_splice_write,
};

/*
 * File operations whose read side (urandom_read_iter) does not wait for
 * the RNG to be ready. Unlike random_fops there is no .poll handler; the
 * remaining handlers are the ones random_fops uses.
 */
const struct file_operations urandom_fops = {
        .read_iter = urandom_read_iter,
        .write_iter = random_write_iter,
        .unlocked_ioctl = random_ioctl,
        .compat_ioctl = compat_ptr_ioctl,
        .fasync = random_fasync,
        .llseek = noop_llseek,
        .splice_read = copy_splice_read,
        .splice_write = iter_file_splice_write,
};


/********************************************************************
 *
 * Sysctl interface.
 *
 * These are partly unused legacy knobs with dummy values to not break
 * userspace and partly still useful things. They are usually accessible
 * in /proc/sys/kernel/random/ and are as follows:
 *
 * - boot_id - a UUID representing the current boot.
 *
 * - uuid - a random UUID, different each time the file is read.
 *
 * - poolsize - the number of bits of entropy that the input pool can
 *   hold, tied to the POOL_BITS constant.
 *
 * - entropy_avail - the number of bits of entropy currently in the
 *   input pool. Always <= poolsize.
 *
 * - write_wakeup_threshold - the amount of entropy in the input pool
 *   below which write polls to /dev/random will unblock, requesting
 *   more entropy, tied to the POOL_READY_BITS constant. It is writable
 *   to avoid breaking old userspaces, but writing to it does not
 *   change any behavior of the RNG.
 *
 * - urandom_min_reseed_secs - fixed to the value CRNG_RESEED_INTERVAL.
 *   It is writable to avoid breaking old userspaces, but writing
 *   to it does not change any behavior of the RNG.
 *
 ********************************************************************/

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Backing value of "urandom_min_reseed_secs": CRNG_RESEED_INTERVAL divided by HZ. */
static int sysctl_random_min_urandom_seed = CRNG_RESEED_INTERVAL / HZ;
/* Backing value of "write_wakeup_threshold", initialised to POOL_READY_BITS. */
static int sysctl_random_write_wakeup_bits = POOL_READY_BITS;
/* Backing value of "poolsize", initialised to POOL_BITS. */
static int sysctl_poolsize = POOL_BITS;
/* Storage for the "boot_id" UUID; filled in by proc_do_uuid() on demand. */
static u8 sysctl_bootid[UUID_SIZE];

/*
 * This function is used to return both the bootid UUID, and random
 * UUID. The difference is in whether table->data is NULL; if it is,
 * then a new UUID is generated and returned to the user.
 */
static int proc_do_uuid(struct ctl_table *table, int write, void *buf,
                        size_t *lenp, loff_t *ppos)
{
        static DEFINE_SPINLOCK(bootid_spinlock);
        char text[UUID_STRING_LEN + 1];
        struct ctl_table string_table = {
                .data = text,
                .maxlen = UUID_STRING_LEN
        };
        u8 fresh[UUID_SIZE];
        u8 *uuid;

        /* Both UUID files are read-only. */
        if (write)
                return -EPERM;

        uuid = table->data;
        if (uuid) {
                /*
                 * Stored UUID: a zero byte at index 8 is treated as "not
                 * generated yet", so fill it in under the lock on first use.
                 */
                spin_lock(&bootid_spinlock);
                if (!uuid[8])
                        generate_random_uuid(uuid);
                spin_unlock(&bootid_spinlock);
        } else {
                /* No storage attached: hand out a new UUID on every read. */
                uuid = fresh;
                generate_random_uuid(uuid);
        }

        /* Render as text and let proc_dostring() do the copy-out. */
        snprintf(text, sizeof(text), "%pU", uuid);
        return proc_dostring(&string_table, 0, buf, lenp, ppos);
}

/* The same as proc_dointvec, but writes don't change anything. */
static int proc_do_rointvec(struct ctl_table *table, int write, void *buf,
                            size_t *lenp, loff_t *ppos)
{
        /* Writes are accepted but deliberately discarded. */
        if (write)
                return 0;

        return proc_dointvec(table, 0, buf, lenp, ppos);
}

/*
 * Sysctl entries registered under "kernel/random" by random_sysctls_init().
 */
static struct ctl_table random_table[] = {
        /* Read-only integer backed by sysctl_poolsize. */
        {
                .procname        = "poolsize",
                .data                = &sysctl_poolsize,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler        = proc_dointvec,
        },
        /* Read-only view of input_pool.init_bits. */
        {
                .procname        = "entropy_avail",
                .data                = &input_pool.init_bits,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler        = proc_dointvec,
        },
        /* Mode allows writes, but proc_do_rointvec() ignores them. */
        {
                .procname        = "write_wakeup_threshold",
                .data                = &sysctl_random_write_wakeup_bits,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_do_rointvec,
        },
        /* Mode allows writes, but proc_do_rointvec() ignores them. */
        {
                .procname        = "urandom_min_reseed_secs",
                .data                = &sysctl_random_min_urandom_seed,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_do_rointvec,
        },
        /* .data is set, so proc_do_uuid() generates the UUID once and reuses it. */
        {
                .procname        = "boot_id",
                .data                = &sysctl_bootid,
                .mode                = 0444,
                .proc_handler        = proc_do_uuid,
        },
        /* .data is left NULL, so proc_do_uuid() generates a new UUID per read. */
        {
                .procname        = "uuid",
                .mode                = 0444,
                .proc_handler        = proc_do_uuid,
        },
};

/*
 * random_init() is called before sysctl_init(),
 * so we cannot call register_sysctl_init() in random_init()
 */
static int __init random_sysctls_init(void)
{
        /* Register random_table under the "kernel/random" sysctl path. */
        register_sysctl_init("kernel/random", random_table);
        /* Always reports success. */
        return 0;
}
device_initcall(random_sysctls_init);
#endif


















































































































































































   39 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CPUFEATURE_H
#define _ASM_X86_CPUFEATURE_H

#include <asm/processor.h>

#if defined(__KERNEL__) && !defined(__ASSEMBLY__)

#include <asm/asm.h>
#include <linux/bitops.h>
#include <asm/alternative.h>

/*
 * Symbolic word indices, numbered consecutively from 0. With the 22 entries
 * listed here, NR_CPUID_WORDS evaluates to 22.
 *
 * NOTE(review): the names suggest each entry identifies a CPUID leaf and
 * output register (or a Linux-defined word for the CPUID_LNX_* entries)
 * backing one 32-bit word of x86_capability; confirm against the users of
 * this enum.
 */
enum cpuid_leafs
{
        CPUID_1_EDX                = 0,
        CPUID_8000_0001_EDX,
        CPUID_8086_0001_EDX,
        CPUID_LNX_1,
        CPUID_1_ECX,
        CPUID_C000_0001_EDX,
        CPUID_8000_0001_ECX,
        CPUID_LNX_2,
        CPUID_LNX_3,
        CPUID_7_0_EBX,
        CPUID_D_1_EAX,
        CPUID_LNX_4,
        CPUID_7_1_EAX,
        CPUID_8000_0008_EBX,
        CPUID_6_EAX,
        CPUID_8000_000A_EDX,
        CPUID_7_ECX,
        CPUID_8000_0007_EBX,
        CPUID_7_EDX,
        CPUID_8000_001F_EAX,
        CPUID_8000_0021_EAX,
        CPUID_LNX_5,
        /* Not an index itself: the count of the entries above. */
        NR_CPUID_WORDS,
};

/*
 * Format string and matching argument pair for printing a flag numerically
 * as "word:bit". x86_cap_flag_num() expands to TWO comma-separated
 * arguments: the word index (flag >> 5) and the bit within it (flag & 31).
 */
#define X86_CAP_FMT_NUM "%d:%d"
#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31)

/* Name tables; x86_cap_flags has one slot per bit of every capability word. */
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
/* Format string and argument for printing a flag by its x86_cap_flags name. */
#define X86_CAP_FMT "%s"
#define x86_cap_flag(flag) x86_cap_flags[flag]

/*
 * In order to save room, we index into this array by doing
 * X86_BUG_<name> - NCAPINTS*32.
 */
extern const char * const x86_bug_flags[NBUGINTS*32];

/*
 * Test feature bit @bit in the x86_capability array of @c (a pointer to a
 * structure with that member), viewing the array as an unsigned long bitmap.
 */
#define test_cpu_cap(c, bit)                                                \
         arch_test_bit(bit, (unsigned long *)((c)->x86_capability))

/*
 * There are 32 bits/features in each mask word.  The high bits
 * (selected with (bit)>>5) give us the word number and the low 5
 * bits give us the bit/feature number inside the word.
 * 1UL<<((bit)&31) gives us a mask for the feature bit so we can
 * see if it is set in the mask word.
 *
 * The mask word itself is the identifier formed by pasting maskname and
 * word together, so word must be a literal number.
 */
#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit)        \
        (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word ))

/*
 * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the
 * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all
 * header macros which use NCAPINTS need to be changed. The duplicated macro
 * use causes the compiler to issue errors for all headers so that all usage
 * sites can be corrected.
 *
 * REQUIRED_MASK_BIT_SET(feature_bit) is non-zero when feature_bit is set in
 * whichever of REQUIRED_MASK0..REQUIRED_MASK21 covers its word. One
 * CHECK_BIT_IN_MASK_WORD() term is spelled out per word, hence the
 * hard-coded check that NCAPINTS is 22.
 */
#define REQUIRED_MASK_BIT_SET(feature_bit)                \
         ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  0, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  1, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  2, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  3, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  4, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  5, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  6, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  7, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  8, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK,  9, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) ||        \
           REQUIRED_MASK_CHECK                                          ||        \
           BUILD_BUG_ON_ZERO(NCAPINTS != 22))

/*
 * DISABLED_MASK_BIT_SET(feature_bit) is non-zero when feature_bit is set in
 * whichever of DISABLED_MASK0..DISABLED_MASK21 covers its word. One
 * CHECK_BIT_IN_MASK_WORD() term is spelled out per word, so the macro also
 * breaks the build when NCAPINTS is no longer 22.
 */
#define DISABLED_MASK_BIT_SET(feature_bit)                                \
         ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  0, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  1, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  2, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  3, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  4, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  5, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  6, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  7, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  8, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK,  9, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) ||        \
           CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) ||        \
           DISABLED_MASK_CHECK                                          ||        \
           BUILD_BUG_ON_ZERO(NCAPINTS != 22))

/*
 * Does @c have feature @bit? Evaluates to the constant 1 when @bit is a
 * compile-time constant that is set in the REQUIRED masks; otherwise the
 * capability bitmap of @c is tested at run time.
 */
#define cpu_has(c, bit)                                                        \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :        \
         test_cpu_cap(c, bit))

/*
 * Same short-circuit as cpu_has(), but the run-time test goes through
 * x86_this_cpu_test_bit() on cpu_info.x86_capability.
 */
#define this_cpu_has(bit)                                                \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :        \
         x86_this_cpu_test_bit(bit, cpu_info.x86_capability))

/*
 * This macro is for detection of features which need kernel
 * infrastructure to be used.  It may *not* directly test the CPU
 * itself.  Use the cpu_has() family if you want true runtime
 * testing of CPU features, like in hypervisor code where you are
 * supporting a possible guest feature where host support for it
 * is not relevant.
 *
 * Evaluates to the constant 0 when @bit is a compile-time constant that is
 * set in the DISABLED masks, and to static_cpu_has(bit) otherwise.
 */
#define cpu_feature_enabled(bit)        \
        (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))

/* cpu_has() applied to boot_cpu_data. */
#define boot_cpu_has(bit)        cpu_has(&boot_cpu_data, bit)

/* Set feature @bit in the x86_capability bitmap of @c. */
#define set_cpu_cap(c, bit)        set_bit(bit, (unsigned long *)((c)->x86_capability))

/* Defined elsewhere; NOTE(review): presumably clear @bit globally / for @c — confirm at the definitions. */
extern void setup_clear_cpu_cap(unsigned int bit);
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);

/*
 * Force feature @bit on: set it in boot_cpu_data and record it in
 * cpu_caps_set. If the bit was not already set, WARN_ON() is evaluated on
 * alternatives_patched first, i.e. a warning fires when the bit is newly
 * forced while alternatives_patched is non-zero.
 */
#define setup_force_cpu_cap(bit) do {                        \
                                                        \
        if (!boot_cpu_has(bit))                                \
                WARN_ON(alternatives_patched);                \
                                                        \
        set_cpu_cap(&boot_cpu_data, bit);                \
        set_bit(bit, (unsigned long *)cpu_caps_set);        \
} while (0)

/* Bug bits are forced exactly like feature bits. */
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)

/*
 * Static testing of CPU features. Used the same as boot_cpu_has(). It
 * statically patches the target code for additional performance. Use
 * static_cpu_has() only in fast paths, where every cycle counts. Which
 * means that the boot_cpu_has() variant is already fast enough for the
 * majority of cases and you should stick to using it as it is generally
 * only two instructions: a RIP-relative MOV and a TEST.
 *
 * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know
 * that this is only used on a fallback path and will sometimes cause
 * it to manifest the address of boot_cpu_data in a register, fouling
 * the mainline (post-initialization) code.
 *
 * Returns true via the t_yes label and false via the t_no label. The
 * out-of-line code at local label 6 (placed in .altinstr_aux) tests bit
 * (bit & 7) of byte (bit >> 3) of boot_cpu_data.x86_capability and jumps to
 * t_yes if it is set, t_no otherwise.
 * NOTE(review): which of the three ALTERNATIVE_TERNARY sequences ends up
 * in the instruction stream is decided by that macro and the alternatives
 * patching code, neither of which is visible here.
 */
static __always_inline bool _static_cpu_has(u16 bit)
{
        asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
                ".pushsection .altinstr_aux,\"ax\"\n"
                "6:\n"
                " testb %[bitnum], %a[cap_byte]\n"
                " jnz %l[t_yes]\n"
                " jmp %l[t_no]\n"
                ".popsection\n"
                 : : [feature]  "i" (bit),
                     [bitnum]   "i" (1 << (bit & 7)),
                     [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3])
                 : : t_yes, t_no);
t_yes:
        return true;
t_no:
        return false;
}

/*
 * If boot_cpu_has(bit) folds to a compile-time constant, use that constant
 * directly; otherwise fall back to the patched _static_cpu_has() test.
 */
#define static_cpu_has(bit)                                        \
(                                                                \
        __builtin_constant_p(boot_cpu_has(bit)) ?                \
                boot_cpu_has(bit) :                                \
                _static_cpu_has(bit)                                \
)

/* Bug bits are tested, set and cleared with the same helpers as feature bits. */
#define cpu_has_bug(c, bit)                cpu_has(c, (bit))
#define set_cpu_bug(c, bit)                set_cpu_cap(c, (bit))
#define clear_cpu_bug(c, bit)                clear_cpu_cap(c, (bit))

/* The same, applied to the boot CPU. */
#define static_cpu_has_bug(bit)                static_cpu_has((bit))
#define boot_cpu_has_bug(bit)                cpu_has_bug(&boot_cpu_data, (bit))
#define boot_cpu_set_bug(bit)                set_cpu_cap(&boot_cpu_data, (bit))

/* Total number of feature bits: 32 per capability word. */
#define MAX_CPU_FEATURES                (NCAPINTS * 32)
#define cpu_have_feature                boot_cpu_has

/*
 * Format string and matching arguments built from the boot CPU's vendor,
 * family (x86) and model fields, each printed as four hex digits.
 */
#define CPU_FEATURE_TYPEFMT                "x86,ven%04Xfam%04Xmod%04X"
#define CPU_FEATURE_TYPEVAL                boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
                                        boot_cpu_data.x86_model

#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
#endif /* _ASM_X86_CPUFEATURE_H */























































































































    2 






    2 





    2 















































































































































































































































































































































































































































































































































































































    2 
    2 


    2 




















































    3 



    1 































    2 







    2 

    2 





    2 













    1 












    1 














    1 





    1 
























    1 






    1 


    1 




































































    1 
















    1 









































































































































    1 





    1 



















































    1 

























































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
 */
#include <linux/sched.h>                /* test_thread_flag(), ...        */
#include <linux/sched/task_stack.h>        /* task_stack_*(), ...                */
#include <linux/kdebug.h>                /* oops_begin/end, ...                */
#include <linux/extable.h>                /* search_exception_tables        */
#include <linux/memblock.h>                /* max_low_pfn                        */
#include <linux/kfence.h>                /* kfence_handle_page_fault        */
#include <linux/kprobes.h>                /* NOKPROBE_SYMBOL, ...                */
#include <linux/mmiotrace.h>                /* kmmio_handler, ...                */
#include <linux/perf_event.h>                /* perf_sw_event                */
#include <linux/hugetlb.h>                /* hstate_index_to_shift        */
#include <linux/prefetch.h>                /* prefetchw                        */
#include <linux/context_tracking.h>        /* exception_enter(), ...        */
#include <linux/uaccess.h>                /* faulthandler_disabled()        */
#include <linux/efi.h>                        /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <linux/mm.h>                        /* find_and_lock_vma() */
#include <linux/vmalloc.h>

#include <asm/cpufeature.h>                /* boot_cpu_has, ...                */
#include <asm/traps.h>                        /* dotraplinkage, ...                */
#include <asm/fixmap.h>                        /* VSYSCALL_ADDR                */
#include <asm/vsyscall.h>                /* emulate_vsyscall                */
#include <asm/vm86.h>                        /* struct vm86                        */
#include <asm/mmu_context.h>                /* vma_pkey()                        */
#include <asm/efi.h>                        /* efi_crash_gracefully_on_page_fault()*/
#include <asm/desc.h>                        /* store_idt(), ...                */
#include <asm/cpu_entry_area.h>                /* exception stack                */
#include <asm/pgtable_areas.h>                /* VMALLOC_START, ...                */
#include <asm/kvm_para.h>                /* kvm_handle_async_pf                */
#include <asm/vdso.h>                        /* fixup_vdso_exception()        */
#include <asm/irq_stack.h>
#include <asm/fred.h>
#include <asm/sev.h>                        /* snp_dump_hva_rmpentry()        */

#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>

/*
 * Offer the fault to mmiotrace.
 *
 * Returns 0 when mmiotrace is not active, or when kmmio_handler() did not
 * report the fault as handled (anything other than 1).  Returns -1 when
 * kmmio_handler() returned 1.
 */
static nokprobe_inline int
kmmio_fault(struct pt_regs *regs, unsigned long addr)
{
        /* Expected case: mmiotrace is inactive, so there is nobody to ask. */
        if (!unlikely(is_kmmio_active()))
                return 0;

        return (kmmio_handler(regs, addr) == 1) ? -1 : 0;
}

/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.  This is AMD erratum #91.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 */
/*
 * Classify one opcode byte while scanning for a PREFETCH instruction.
 *
 * @regs:     register state of the faulting context
 * @instr:    address of the byte following @opcode
 * @opcode:   the byte just fetched
 * @prefetch: written only when @opcode is 0x0_ and the following byte
 *            could be read; set to 1 for 0x0F 0x0D / 0x0F 0x18, else 0
 *
 * Returns non-zero when @opcode is an instruction prefix, i.e. the caller
 * should go on to the next byte, and 0 when scanning must stop.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
                      unsigned char opcode, int *prefetch)
{
        const unsigned char hi_nibble = opcode & 0xf0;
        const unsigned char lo_nibble = opcode & 0x0f;
        unsigned char second_byte;
        int keep_scanning = 0;

        switch (hi_nibble) {
        case 0x00:
                /*
                 * Prefetch instruction is 0x0F0D or 0x0F18.
                 * NOTE(review): the second byte is read with
                 * get_kernel_nofault() regardless of user_mode(regs) --
                 * confirm this is intended for user-mode faults.
                 */
                if (get_kernel_nofault(second_byte, instr))
                        break;

                *prefetch = (lo_nibble == 0xF) &&
                        (second_byte == 0x0D || second_byte == 0x18);
                break;
        case 0x20:
        case 0x30:
                /*
                 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
                 * In X86_64 long mode, the CPU will signal invalid
                 * opcode if some of these prefixes are present so
                 * X86_64 will never get here anyway
                 */
                keep_scanning = ((lo_nibble & 7) == 0x6);
                break;
#ifdef CONFIG_X86_64
        case 0x40:
                /* In 64-bit mode 0x40..0x4F are valid REX prefixes */
                keep_scanning = (!user_mode(regs) || user_64bit_mode(regs));
                break;
#endif
        case 0x60:
                /* 0x64 thru 0x67 are valid prefixes in all modes. */
                keep_scanning = ((lo_nibble & 0xC) == 0x4);
                break;
        case 0xF0:
                /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
                keep_scanning = (lo_nibble == 0 || (lo_nibble >> 1) == 1);
                break;
        default:
                break;
        }

        return keep_scanning;
}

/*
 * True only when CONFIG_CPU_SUP_AMD is enabled and the boot CPU is an AMD
 * family 0xf part with a model number below 0x40.
 */
static bool is_amd_k8_pre_npt(void)
{
        struct cpuinfo_x86 *cpu = &boot_cpu_data;
        bool affected;

        affected = IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
                   cpu->x86_vendor == X86_VENDOR_AMD &&
                   cpu->x86 == 0xf &&
                   cpu->x86_model < 0x40;

        return unlikely(affected);
}

/*
 * Check whether the instruction at the faulting IP is a PREFETCH, in which
 * case the fault may be ignored (AMD erratum #91).
 *
 * Returns the flag computed by check_prefetch_opcode() (non-zero for a
 * prefetch), or 0 when the CPU is not affected, the fault was an
 * instruction fetch, or the instruction bytes could not be read.
 * @addr is not used.
 */
static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
        unsigned char *cursor, *limit;
        int found = 0;

        /* Erratum #91 affects AMD K8, pre-NPT CPUs */
        if (!is_amd_k8_pre_npt())
                return 0;

        /*
         * If it was an exec (instruction fetch) fault on NX page, then
         * do not ignore the fault:
         */
        if (error_code & X86_PF_INSTR)
                return 0;

        /* Examine at most 15 bytes starting at the faulting IP. */
        cursor = (void *)convert_ip_to_linear(current, regs);
        limit = cursor + 15;

        /*
         * This code has historically always bailed out if IP points to a
         * not-present page (e.g. due to a race).  No one has ever
         * complained about this.
         */
        pagefault_disable();

        while (cursor < limit) {
                unsigned char byte;
                long fault;

                if (user_mode(regs))
                        fault = get_user(byte, (unsigned char __user *) cursor);
                else
                        fault = get_kernel_nofault(byte, cursor);

                if (fault)
                        break;

                /* The checker is handed the address of the *next* byte. */
                cursor++;

                if (!check_prefetch_opcode(regs, cursor, byte, &found))
                        break;
        }

        pagefault_enable();
        return found;
}

/*
 * pgd_list links pages through page->lru.  On 32-bit,
 * arch_sync_kernel_mappings() walks it with pgd_lock held and treats every
 * entry as a PGD page to bring in sync with the reference page table.
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

#ifdef CONFIG_X86_32
/*
 * Bring the PMD entry covering @address in the page table rooted at @pgd
 * in line with the reference page table (init_mm.pgd).
 *
 * Both tables are walked in parallel.  NULL is returned as soon as the
 * reference table has a non-present PGD, P4D or PUD entry.  At the PMD
 * level the reference entry is copied into @pgd's table whenever the two
 * disagree on presence (in either direction); after that NULL is returned
 * if the reference PMD is not present.  Otherwise both entries must point
 * at the same pfn (BUG_ON) and a pointer to the *reference* PMD entry is
 * returned.
 */
static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
        unsigned index = pgd_index(address);
        pgd_t *pgd_k;
        p4d_t *p4d, *p4d_k;
        pud_t *pud, *pud_k;
        pmd_t *pmd, *pmd_k;

        /* Step both tables to the top-level slot for @address. */
        pgd += index;
        pgd_k = init_mm.pgd + index;

        if (!pgd_present(*pgd_k))
                return NULL;

        /*
         * set_pgd(pgd, *pgd_k); here would be useless on PAE
         * and redundant with the set_pmd() on non-PAE. As would
         * set_p4d/set_pud.
         */
        p4d = p4d_offset(pgd, address);
        p4d_k = p4d_offset(pgd_k, address);
        if (!p4d_present(*p4d_k))
                return NULL;

        pud = pud_offset(p4d, address);
        pud_k = pud_offset(p4d_k, address);
        if (!pud_present(*pud_k))
                return NULL;

        pmd = pmd_offset(pud, address);
        pmd_k = pmd_offset(pud_k, address);

        /* Copy the reference entry whenever presence differs. */
        if (pmd_present(*pmd) != pmd_present(*pmd_k))
                set_pmd(pmd, *pmd_k);

        if (!pmd_present(*pmd_k))
                return NULL;
        else
                BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));

        return pmd_k;
}

/*
 *   Handle a fault on the vmalloc or module mapping area
 *
 *   This is needed because there is a race condition between the time
 *   when the vmalloc mapping code updates the PMD to the point in time
 *   where it synchronizes this update with the other page-tables in the
 *   system.
 *
 *   In this race window another thread/CPU can map an area on the same
 *   PMD, finds it already present and does not synchronize it with the
 *   rest of the system yet. As a result v[mz]alloc might return areas
 *   which are not mapped in every page-table in the system, causing an
 *   unhandled page-fault when they are accessed.
 */
static noinline int vmalloc_fault(unsigned long address)
{
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
        pte_t *pte_k;

        /* Make sure we are in vmalloc area: */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
         *
         * Do _not_ use "current" here. We might be inside
         * an interrupt in the middle of a task switch..
         */
        pgd_paddr = read_cr3_pa();
        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
        if (!pmd_k)
                return -1;

        if (pmd_leaf(*pmd_k))
                return 0;

        pte_k = pte_offset_kernel(pmd_k, address);
        if (!pte_present(*pte_k))
                return -1;

        return 0;
}
NOKPROBE_SYMBOL(vmalloc_fault);

/*
 * Propagate kernel PMD entries from the reference page table into every
 * PGD page on pgd_list.
 *
 * The walk starts at @start rounded down to a PMD boundary and advances one
 * PMD at a time while the address stays within
 * [TASK_SIZE_MAX, VMALLOC_END).  @end is not consulted.
 */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
        unsigned long va = start & PMD_MASK;

        while (va >= TASK_SIZE_MAX && va < VMALLOC_END) {
                struct page *pgd_page;

                spin_lock(&pgd_lock);
                list_for_each_entry(pgd_page, &pgd_list, lru) {
                        /* the pgt_lock only for Xen */
                        spinlock_t *mm_ptl =
                                &pgd_page_get_mm(pgd_page)->page_table_lock;

                        spin_lock(mm_ptl);
                        vmalloc_sync_one(page_address(pgd_page), va);
                        spin_unlock(mm_ptl);
                }
                spin_unlock(&pgd_lock);

                va += PMD_SIZE;
        }
}

/* True when @pfn lies below max_low_pfn. */
static bool low_pfn(unsigned long pfn)
{
        return max_low_pfn > pfn;
}

/*
 * Print the page-table entries that translate @address, using the page
 * table whose physical address is currently in CR3 (32-bit variant).
 *
 * Output is a single log line: "*pdpt" (PAE only), "*pde" and, when the
 * PMD is present, not a leaf and its page table is in low memory, "*pte".
 */
static void dump_pagetable(unsigned long address)
{
        pgd_t *base = __va(read_cr3_pa());
        pgd_t *pgd = &base[pgd_index(address)];
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

#ifdef CONFIG_X86_PAE
        pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
        if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
                goto out;
/* Under PAE the "*pde" text continues the line started by "*pdpt". */
#define pr_pde pr_cont
#else
/* Without PAE the "*pde" text is the start of the line. */
#define pr_pde pr_info
#endif
        p4d = p4d_offset(pgd, address);
        pud = pud_offset(p4d, address);
        pmd = pmd_offset(pud, address);
        /* Field width is two hex digits per byte of the entry. */
        pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde

        /*
         * We must not directly access the pte in the highpte
         * case if the page table is located in highmem.
         * And let's rather not kmap-atomic the pte, just in case
         * it's allocated already:
         */
        if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd))
                goto out;

        pte = pte_offset_kernel(pmd, address);
        pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
        /* Terminate the line on every path. */
        pr_cont("\n");
}

#else /* CONFIG_X86_64: */

#ifdef CONFIG_CPU_SUP_AMD
/*
 * Message emitted with printk_once() by is_errata93() when the K8 erratum
 * #93 workaround is applied.  Only the first line carries the KERN_ERR
 * prefix; the remaining lines are part of the same string.
 */
static const char errata93_warning[] =
KERN_ERR 
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * Probe whether an unsigned long can be read at @p without faulting.
 * Returns the result of get_kernel_nofault(); callers treat non-zero as
 * "cannot be read".  The value read is discarded.
 */
static int bad_address(void *p)
{
        unsigned long scratch;

        return get_kernel_nofault(scratch, (unsigned long *)p);
}

/*
 * Print the page-table entries that translate @address, using the page
 * table whose physical address is currently in CR3 (64-bit variant).
 *
 * The entries are printed level by level (PGD, P4D, PUD, PMD, PTE) on one
 * log line.  The walk stops early at the first entry that is not present
 * or, below the PGD, is a leaf.  Every entry pointer is probed with
 * bad_address() before it is dereferenced; if a probe fails, "BAD" is
 * printed instead of continuing.
 */
static void dump_pagetable(unsigned long address)
{
        pgd_t *base = __va(read_cr3_pa());
        pgd_t *pgd = base + pgd_index(address);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (bad_address(pgd))
                goto bad;

        /* pr_info() starts the line; the deeper levels append with pr_cont(). */
        pr_info("PGD %lx ", pgd_val(*pgd));

        if (!pgd_present(*pgd))
                goto out;

        p4d = p4d_offset(pgd, address);
        if (bad_address(p4d))
                goto bad;

        pr_cont("P4D %lx ", p4d_val(*p4d));
        if (!p4d_present(*p4d) || p4d_leaf(*p4d))
                goto out;

        pud = pud_offset(p4d, address);
        if (bad_address(pud))
                goto bad;

        pr_cont("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud) || pud_leaf(*pud))
                goto out;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd))
                goto bad;

        pr_cont("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd) || pmd_leaf(*pmd))
                goto out;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte))
                goto bad;

        pr_cont("PTE %lx", pte_val(*pte));
out:
        /* Normal exit: terminate the line built so far. */
        pr_cont("\n");
        return;
bad:
        /* An entry pointer could not be read. */
        pr_info("BAD\n");
}

#endif /* CONFIG_X86_64 */

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround
 * to avoid corruption of the 64bit RIP register on C stepping K8.
 *
 * A lot of BIOS that didn't get tested properly miss this.
 *
 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 * Try to work around it here.
 *
 * Note we only handle faults in kernel here.
 * Does nothing on 32-bit.
 */
/*
 * Returns 1 after repairing regs->ip when the fault matches K8 erratum #93,
 * 0 otherwise.  Compiles to "return 0" unless both CONFIG_X86_64 and
 * CONFIG_CPU_SUP_AMD are set.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
        int fixed = 0;

#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
        unsigned long patched;

        /* Only AMD family 0xf is considered. */
        if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
            boot_cpu_data.x86 != 0xf)
                return 0;

        /*
         * The pattern is a kernel-mode fault on the instruction pointer
         * itself, with the upper 32 bits of the address cleared.
         */
        if (user_mode(regs) || address != regs->ip || (address >> 32) != 0)
                return 0;

        /* Restore the upper 32 bits and see if that lands in kernel code. */
        patched = address | (0xffffffffUL << 32);
        if ((patched >= (u64)_stext && patched <= (u64)_etext) ||
            (patched >= MODULES_VADDR && patched <= MODULES_END)) {
                printk_once(errata93_warning);
                regs->ip = patched;
                fixed = 1;
        }
#endif
        return fixed;
}

/*
 * Work around K8 erratum #100: in compat mode, K8 occasionally jumps
 * to illegal addresses >4GB.
 *
 * We catch this in the page fault handler because these addresses
 * are not reachable. Just detect this case and return.  Any code
 * segment in LDT is compatibility mode.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
        /* Addresses that fit in 32 bits are never the erratum. */
        if (!(address >> 32))
                return 0;

        /* __USER32_CS, or any selector with bit 2 set, counts as compat code. */
        if (regs->cs == __USER32_CS || (regs->cs & (1<<2)))
                return 1;
#endif
        return 0;
}

/* Pentium F0 0F C7 C8 bug workaround: */
/*
 * Returns 1 after calling handle_invalid_op() when the CPU has
 * X86_BUG_F00F, the error code lacks X86_PF_USER and
 * idt_is_f00f_address() accepts @address.  Returns 0 otherwise, and
 * always when CONFIG_X86_F00F_BUG is not set.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
                       unsigned long address)
{
        int handled = 0;

#ifdef CONFIG_X86_F00F_BUG
        handled = boot_cpu_has_bug(X86_BUG_F00F) &&
                  !(error_code & X86_PF_USER) &&
                  idt_is_f00f_address(address);

        if (handled)
                handle_invalid_op(regs);
#endif
        return handled;
}

/*
 * Print base and limit of the LDT/TSS descriptor that selector @index
 * refers to in the table described by @gdt, labelled with @name.
 *
 * A zero selector, an entry that does not pass the bounds check against
 * gdt->size, or an unreadable entry each produce a one-line diagnostic.
 */
static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
{
        /* The selector's low three bits are dropped to get the entry index. */
        const u32 entry_off = (index >> 3) * sizeof(struct desc_struct);
        struct ldttss_desc entry;
        unsigned long base;

        if (!index) {
                pr_alert("%s: NULL\n", name);
                return;
        }

        if (entry_off + sizeof(entry) >= gdt->size) {
                pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
                return;
        }

        /* The table may itself be unmapped, so copy it out fault-safely. */
        if (copy_from_kernel_nofault(&entry,
                                     (void *)(gdt->address + entry_off),
                                     sizeof(entry))) {
                pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
                         name, index);
                return;
        }

        /* Reassemble the base address from its split fields. */
        base = entry.base0 | (entry.base1 << 16);
        base |= (unsigned long)entry.base2 << 24;
#ifdef CONFIG_X86_64
        base |= ((u64)entry.base3 << 32);
#endif
        pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
                 name, index, base, (entry.limit0 | (entry.limit1 << 16)));
}

/*
 * For a faulting instruction fetch at @address, look the address up in the
 * page table found via CR3 and print a hint: either the page is not
 * executable (NX), or it is an executable user page while CR4.SMEP is set.
 */
static void pf_show_exec_fault_reason(unsigned long address)
{
        unsigned int level;
        bool nx, rw;
        pgd_t *pgd;
        pte_t *pte;

        pgd = __va(read_cr3_pa());
        pgd += pgd_index(address);

        pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw);
        if (!pte || !pte_present(*pte))
                return;

        if (!pte_exec(*pte) || nx) {
                pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
                        from_kuid(&init_user_ns, current_uid()));
                return;
        }

        /* From here on the mapping is executable. */
        if ((pgd_flags(*pgd) & _PAGE_USER) && (__read_cr4() & X86_CR4_SMEP))
                pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
                        from_kuid(&init_user_ns, current_uid()));
}

/* Print the IDT and GDT registers plus the descriptors behind LDTR and TR. */
static void pf_show_desc_tables(void)
{
        struct desc_ptr idt, gdt;
        u16 ldtr, tr;

        store_idt(&idt);

        /* Usable even on Xen PV -- it's just slow. */
        native_store_gdt(&gdt);

        pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
                 idt.address, idt.size, gdt.address, gdt.size);

        store_ldt(ldtr);
        show_ldttss(&gdt, "LDTR", ldtr);

        store_tr(tr);
        show_ldttss(&gdt, "TR", tr);
}

/*
 * Print the human-readable description of a fatal page fault: the faulting
 * address, a decoded error code, the descriptor tables for implicit
 * supervisor accesses made from user mode, and the page-table entries.
 * Does nothing when oops_may_print() says no.
 */
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
        const char *origin, *access, *mode, *cause;

        if (!oops_may_print())
                return;

        if (error_code & X86_PF_INSTR)
                pf_show_exec_fault_reason(address);

        if (address < PAGE_SIZE && !user_mode(regs))
                pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
                        (void *)address);
        else
                pr_alert("BUG: unable to handle page fault for address: %px\n",
                        (void *)address);

        /* Who accessed what, and in which CPU mode. */
        origin = (error_code & X86_PF_USER) ? "user" : "supervisor";

        if (error_code & X86_PF_INSTR)
                access = "instruction fetch";
        else if (error_code & X86_PF_WRITE)
                access = "write access";
        else
                access = "read access";

        mode = user_mode(regs) ? "user" : "kernel";

        pr_alert("#PF: %s %s in %s mode\n", origin, access, mode);

        /* First matching cause wins, in this order. */
        if (!(error_code & X86_PF_PROT))
                cause = "not-present page";
        else if (error_code & X86_PF_RSVD)
                cause = "reserved bit violation";
        else if (error_code & X86_PF_PK)
                cause = "protection keys violation";
        else if (error_code & X86_PF_RMP)
                cause = "RMP violation";
        else
                cause = "permissions violation";

        pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, cause);

        /*
         * This can happen for quite a few reasons.  The more obvious
         * ones are faults accessing the GDT, or LDT.  Perhaps
         * surprisingly, if the CPU tries to deliver a benign or
         * contributory exception from user code and gets a page fault
         * during delivery, the page fault can be delivered as though
         * it originated directly from user code.  This could happen
         * due to wrong permissions on the IDT, GDT, LDT, TSS, or
         * kernel or IST stack.
         */
        if (!(error_code & X86_PF_USER) && user_mode(regs))
                pf_show_desc_tables();

        dump_pagetable(address);

        if (error_code & X86_PF_RMP)
                snp_dump_hva_rmpentry(address);
}

/*
 * Oops path for a corrupted page table: log the task name and address,
 * dump the page-table entries and die with "Bad pagetable".
 */
static noinline void
pgtable_bad(struct pt_regs *regs, unsigned long error_code,
            unsigned long address)
{
        unsigned long oops_flags = oops_begin();
        int sig = SIGKILL;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               current->comm, address);
        dump_pagetable(address);

        /* When __die() returns non-zero, oops_end() is handed 0, not SIGKILL. */
        if (__die("Bad pagetable", regs, error_code))
                sig = 0;

        oops_end(oops_flags, regs, sig);
}

/*
 * Set X86_PF_PROT in *@error_code when @address is at or above
 * TASK_SIZE_MAX; leave *@error_code untouched otherwise.
 */
static void sanitize_error_code(unsigned long address,
                                unsigned long *error_code)
{
        if (address < TASK_SIZE_MAX)
                return;

        /*
         * To avoid leaking information about the kernel page
         * table layout, pretend that user-mode accesses to
         * kernel addresses are always protection faults.
         *
         * NB: This means that failed vsyscalls with vsyscall=none
         * will have the PROT bit.  This doesn't leak any
         * information and does not appear to cause any problems.
         */
        *error_code |= X86_PF_PROT;
}

/*
 * Record the fault in current->thread: the faulting address in cr2,
 * X86_TRAP_PF as the trap number, and @error_code with X86_PF_USER
 * always set.
 */
static void set_signal_archinfo(unsigned long address,
                                unsigned long error_code)
{
        current->thread.cr2 = address;
        current->thread.error_code = error_code | X86_PF_USER;
        current->thread.trap_nr = X86_TRAP_PF;
}

/*
 * Report a page fault that could not be recovered from.
 *
 * For faults whose register state is user mode, the function goes straight
 * to the oops.  Otherwise it first tries, in this order:
 *   - CONFIG_VMAP_STACK: if @address is a vmalloc address that
 *     get_stack_guard_info() recognizes, handle_stack_overflow() is called
 *     on a different stack and control does not come back;
 *   - CONFIG_EFI: efi_crash_gracefully_on_page_fault();
 *   - KFENCE, for not-present faults only; if it handles the fault the
 *     function returns without an oops.
 *
 * The oops itself prints the fault description, a stack-corruption
 * warning if applicable, calls __die("Oops") and prints a final CR2 line.
 * oops_end() is given SIGKILL, or 0 when __die() returned non-zero.
 */
static noinline void
page_fault_oops(struct pt_regs *regs, unsigned long error_code,
                unsigned long address)
{
#ifdef CONFIG_VMAP_STACK
        struct stack_info info;
#endif
        unsigned long flags;
        int sig;

        if (user_mode(regs)) {
                /*
                 * Implicit kernel access from user mode?  Skip the stack
                 * overflow and EFI special cases.
                 * (The KFENCE check below is skipped by this jump as well.)
                 */
                goto oops;
        }

#ifdef CONFIG_VMAP_STACK
        /*
         * Stack overflow?  During boot, we can fault near the initial
         * stack in the direct map, but that's not an overflow -- check
         * that we're in vmalloc space to avoid this.
         */
        if (is_vmalloc_addr((void *)address) &&
            get_stack_guard_info((void *)address, &info)) {
                /*
                 * We're likely to be running with very little stack space
                 * left.  It's plausible that we'd hit this condition but
                 * double-fault even before we get this far, in which case
                 * we're fine: the double-fault handler will deal with it.
                 *
                 * We don't want to make it all the way into the oops code
                 * and then double-fault, though, because we're likely to
                 * break the console driver and lose most of the stack dump.
                 */
                /* Run the handler on the DF IST stack, just below its top. */
                call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*),
                              handle_stack_overflow,
                              ASM_CALL_ARG3,
                              , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));

                unreachable();
        }
#endif

        /*
         * Buggy firmware could access regions which might page fault.  If
         * this happens, EFI has a special OOPS path that will try to
         * avoid hanging the system.
         */
        if (IS_ENABLED(CONFIG_EFI))
                efi_crash_gracefully_on_page_fault(address);

        /* Only not-present faults should be handled by KFENCE. */
        if (!(error_code & X86_PF_PROT) &&
            kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
                return;

oops:
        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice:
         */
        flags = oops_begin();

        show_fault_oops(regs, error_code, address);

        if (task_stack_end_corrupted(current))
                printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

        /* A non-zero return from __die() replaces SIGKILL with 0. */
        sig = SIGKILL;
        if (__die("Oops", regs, error_code))
                sig = 0;

        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_DEFAULT "CR2: %016lx\n", address);

        oops_end(flags, regs, sig);
}

/*
 * Handle a fault taken in kernel mode: try the exception fixup, then the
 * spurious-prefetch check, and oops if neither applies.  Warns once if
 * called with user-mode registers.  @signal, @si_code and @pkey are not
 * used here.
 */
static noinline void
kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
                         unsigned long address, int signal, int si_code,
                         u32 pkey)
{
        bool recovered;

        WARN_ON_ONCE(user_mode(regs));

        /*
         * Are we prepared to handle this kernel fault?  If not, AMD erratum
         * #91 manifests as a spurious page fault on a PREFETCH instruction,
         * so that is checked second.
         */
        recovered = fixup_exception(regs, X86_TRAP_PF, error_code, address) ||
                    is_prefetch(regs, error_code, address);

        if (!recovered)
                page_fault_oops(regs, error_code, address);
}

/*
 * Print out info about fatal segfaults, if the show_unhandled_signals
 * sysctl is set:
 */
static inline void
show_signal_msg(struct pt_regs *regs, unsigned long error_code,
                unsigned long address, struct task_struct *tsk)
{
        /* This is a racy snapshot, but it's better than nothing. */
        int cpu = raw_smp_processor_id();
        const char *loglvl;

        /* Stay quiet if the signal is handled or the rate limit is hit. */
        if (!unhandled_signal(tsk, SIGSEGV) || !printk_ratelimit())
                return;

        /* Tasks with pid <= 1 are logged at KERN_EMERG, the rest at KERN_INFO. */
        if (task_pid_nr(tsk) > 1)
                loglvl = KERN_INFO;
        else
                loglvl = KERN_EMERG;

        printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
                loglvl, tsk->comm, task_pid_nr(tsk), address,
                (void *)regs->ip, (void *)regs->sp, error_code);

        print_vma_addr(KERN_CONT " in ", regs->ip);

        /*
         * Dump the likely CPU where the fatal segfault happened.
         * This can help identify faulty hardware.
         */
        printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu,
               topology_core_id(cpu), topology_physical_package_id(cpu));

        printk(KERN_CONT "\n");

        show_opcodes(regs, loglvl);
}

/*
 * Deal with an access to a bad address once no mm lock is held.
 *
 * Kernel-mode faults go to kernelmode_fixup_or_oops().  Faults with
 * user-mode registers but without X86_PF_USER in the error code oops.
 * Everything else is a genuine user access: unless one of the
 * erratum/vDSO checks claims it, the task gets SIGSEGV with @si_code
 * (and @pkey for SEGV_PKUERR).
 */
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                       unsigned long address, u32 pkey, int si_code)
{
        void __user *fault_addr = (void __user *)address;

        if (!user_mode(regs)) {
                kernelmode_fixup_or_oops(regs, error_code, address,
                                         SIGSEGV, si_code, pkey);
                return;
        }

        /* Implicit user access to kernel memory -- just oops */
        if (!(error_code & X86_PF_USER)) {
                page_fault_oops(regs, error_code, address);
                return;
        }

        /*
         * User mode accesses just cause a SIGSEGV.
         * It's possible to have interrupts off here:
         */
        local_irq_enable();

        /*
         * Valid to do another page fault here because this one came
         * from user space:
         */
        if (is_prefetch(regs, error_code, address) ||
            is_errata100(regs, address))
                return;

        sanitize_error_code(address, &error_code);

        if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
                return;

        if (likely(show_unhandled_signals))
                show_signal_msg(regs, error_code, address, current);

        set_signal_archinfo(address, error_code);

        /* Protection-key faults carry the key; all others are plain faults. */
        if (si_code != SEGV_PKUERR)
                force_sig_fault(SIGSEGV, si_code, fault_addr);
        else
                force_sig_pkuerr(fault_addr, pkey);

        local_irq_disable();
}

/* Bad-address fault with no mapping involved: pkey 0, SEGV_MAPERR. */
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
                     unsigned long address)
{
        __bad_area_nosemaphore(regs, error_code, address,
                               0 /* pkey */, SEGV_MAPERR);
}

/*
 * Release whichever lock covered the VMA lookup, then report the bad access.
 *
 * A non-NULL @mm has its mmap_lock read-unlocked; with a NULL @mm the read
 * lock on @vma is ended instead.  @pkey and @si_code are passed through
 * unchanged to __bad_area_nosemaphore().
 */
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
           unsigned long address, struct mm_struct *mm,
           struct vm_area_struct *vma, u32 pkey, int si_code)
{
        /*
         * The access hit something that is not in our memory map.  Drop
         * the lock first; the kernel-vs-user decision is made by the callee.
         */
        if (!mm)
                vma_end_read(vma);
        else
                mmap_read_unlock(mm);

        __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}

/*
 * Tell whether an access error should be attributed to protection keys.
 *
 * Returns false when OSPKE is not enabled.  Otherwise returns true if the
 * error code has X86_PF_PK set, or if the VMA-level permission check
 * rejects the access.
 */
static inline bool bad_area_access_from_pkeys(unsigned long error_code,
                struct vm_area_struct *vma)
{
        if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
                return false;

        if (error_code & X86_PF_PK)
                return true;

        /*
         * Check the permission keys on the VMA.  This is always called on
         * the current mm, hence the access is never "foreign".
         */
        return !arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
                                          (error_code & X86_PF_INSTR), false);
}

/*
 * Report an access that a VMA exists for but does not permit.
 *
 * The fault is reported as SEGV_PKUERR (with the VMA's pkey) when
 * bad_area_access_from_pkeys() attributes it to protection keys, and as
 * SEGV_ACCERR with pkey 0 otherwise.  The lock identified by @mm/@vma is
 * released by __bad_area().
 */
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
                      unsigned long address, struct mm_struct *mm,
                      struct vm_area_struct *vma)
{
        int si_code = SEGV_ACCERR;
        u32 pkey = 0;

        /*
         * The OSPKE check inside the helper is not strictly needed at
         * runtime, but it lets the compiler drop this branch when pkeys
         * are compiled out.
         */
        if (bad_area_access_from_pkeys(error_code, vma)) {
                /*
                 * A protection key fault means the PKRU value denied access
                 * to some PTE.  Userspace can recover PKRU from the XSAVE
                 * state; the pkey captured from the VMA here tells it which
                 * key was set on the PTE.
                 *
                 * Reaching this point means the hardware signaled
                 * X86_PF_PK and a VMA existed when the handler ran.  It
                 * does *not* guarantee this is the VMA that was faulted on:
                 *
                 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
                 * 2. T1   : set PKRU to deny access to pkey=4, touches page
                 * 3. T1   : faults...
                 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
                 * 5. T1   : enters fault handler, takes mmap_lock, etc...
                 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when the
                 *             fault really hit a pte with pkey=4.
                 */
                pkey = vma_pkey(vma);
                si_code = SEGV_PKUERR;
        }

        __bad_area(regs, error_code, address, mm, vma, pkey, si_code);
}

/*
 * Deliver SIGBUS for a fault reported through @fault (VM_FAULT_* bits).
 *
 * Kernel-mode faults are handed to kernelmode_fixup_or_oops().  For user
 * mode, the prefetch and vDSO fixups get a chance first.  With
 * CONFIG_MEMORY_FAILURE, a hardware-poison result forces a BUS_MCEERR_AR
 * signal; everything else gets a plain BUS_ADRERR.
 */
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
          vm_fault_t fault)
{
        /* Kernel mode? Handle exceptions or die: */
        if (!user_mode(regs)) {
                kernelmode_fixup_or_oops(regs, error_code, address,
                                         SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY);
                return;
        }

        /* User-space => ok to do another page fault: */
        if (is_prefetch(regs, error_code, address))
                return;

        sanitize_error_code(address, &error_code);

        if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
                return;

        set_signal_archinfo(address, error_code);

#ifdef CONFIG_MEMORY_FAILURE
        if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
                struct task_struct *tsk = current;
                unsigned lsb = 0;

                pr_err(
        "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
                        tsk->comm, tsk->pid, address);
                /*
                 * lsb is the third argument of force_sig_mceerr().  The
                 * huge-page shift is used for _LARGE; a set
                 * VM_FAULT_HWPOISON overrides it with PAGE_SHIFT.
                 */
                if (fault & VM_FAULT_HWPOISON_LARGE)
                        lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
                if (fault & VM_FAULT_HWPOISON)
                        lsb = PAGE_SHIFT;
                force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
                return;
        }
#endif
        force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}

/*
 * Check whether the page-table entry at @pte grants the access described by
 * @error_code.  Returns 1 if it does (so the fault may be spurious) and 0 if
 * a write hit a non-writable entry or an instruction fetch hit a
 * non-executable one.
 */
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
        const bool wants_write = error_code & X86_PF_WRITE;
        const bool wants_exec = error_code & X86_PF_INSTR;

        if (wants_write && !pte_write(*pte))
                return 0;

        return (wants_exec && !pte_exec(*pte)) ? 0 : 1;
}

/*
 * Handle a spurious fault caused by a stale TLB entry.
 *
 * This allows us to lazily refresh the TLB when increasing the
 * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
 * eagerly is very expensive since that implies doing a full
 * cross-processor TLB flush, even if no stale TLB entries exist
 * on other processors.
 *
 * Spurious faults may only occur if the TLB contains an entry with
 * fewer permissions than the page table entry.  Non-present (P = 0)
 * and reserved bit (R = 1) faults are never spurious.
 *
 * There are no security implications to leaving a stale TLB when
 * increasing the permissions on a page.
 *
 * The walk below starts at init_mm.pgd and stops at the first leaf
 * entry it finds, checking that entry's permissions against
 * @error_code.
 *
 * Returns non-zero if a spurious fault was handled, zero otherwise.
 *
 * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
 * (Optional Invalidation).
 */
static noinline int
spurious_kernel_fault(unsigned long error_code, unsigned long address)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        int ret;

        /*
         * Only writes to RO or instruction fetches from NX may cause
         * spurious faults.
         *
         * These could be from user or supervisor accesses but the TLB
         * is only lazily flushed after a kernel mapping protection
         * change, so user accesses are not expected to cause spurious
         * faults.
         */
        if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
            error_code != (X86_PF_INSTR | X86_PF_PROT))
                return 0;

        pgd = init_mm.pgd + pgd_index(address);
        if (!pgd_present(*pgd))
                return 0;

        p4d = p4d_offset(pgd, address);
        if (!p4d_present(*p4d))
                return 0;

        /* Leaf entries at any level are checked through a pte_t view: */
        if (p4d_leaf(*p4d))
                return spurious_kernel_fault_check(error_code, (pte_t *) p4d);

        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                return 0;

        if (pud_leaf(*pud))
                return spurious_kernel_fault_check(error_code, (pte_t *) pud);

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return 0;

        if (pmd_leaf(*pmd))
                return spurious_kernel_fault_check(error_code, (pte_t *) pmd);

        pte = pte_offset_kernel(pmd, address);
        if (!pte_present(*pte))
                return 0;

        ret = spurious_kernel_fault_check(error_code, pte);
        if (!ret)
                return 0;

        /*
         * Make sure we have permissions in PMD.
         * If not, then there's a bug in the page tables:
         */
        ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
        WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

        return ret;
}
NOKPROBE_SYMBOL(spurious_kernel_fault);

/*
 * When non-zero, __bad_area_nosemaphore() calls show_signal_msg() before
 * forcing SIGSEGV on a user-mode fault.  Enabled by default.
 */
int show_unhandled_signals = 1;

/*
 * Decide whether the access described by @error_code is forbidden on @vma.
 *
 * Returns 1 when the fault must be reported as an access error and 0 when
 * it may be handed on for normal fault handling.
 */
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
        /*
         * A read or write blocked by protection keys is unconditionally
         * an error; no follow-up action (such as COW) can resolve it.
         */
        if (error_code & X86_PF_PK)
                return 1;

        /*
         * SGX hardware blocked the access.  This usually happens when the
         * enclave memory contents have been destroyed, e.g. after a
         * suspend/resume cycle.  The kernel cannot fix the cause, so treat
         * it as an access error even if no real violation occurred; the
         * resulting signal lets userspace rebuild the enclave.
         */
        if (unlikely(error_code & X86_PF_SGX))
                return 1;

        /*
         * Check the VMA as well, so that we do not service a fault only to
         * hit X86_PF_PK as soon as the page is filled in.  This is only
         * called for the current mm, so the access is never "foreign".
         */
        if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
                                       (error_code & X86_PF_INSTR), false))
                return 1;

        /*
         * Shadow stack accesses (PF_SHSTK=1) are only permitted to
         * writable shadow stack VMAs.  Anything else is an error.
         */
        if (error_code & X86_PF_SHSTK) {
                if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK)))
                        return 1;
                return unlikely(!(vma->vm_flags & VM_WRITE)) ? 1 : 0;
        }

        /* write, present and write, not present: */
        if (error_code & X86_PF_WRITE) {
                if (unlikely(vma->vm_flags & VM_SHADOW_STACK))
                        return 1;
                return unlikely(!(vma->vm_flags & VM_WRITE)) ? 1 : 0;
        }

        /* read, present: */
        if (unlikely(error_code & X86_PF_PROT))
                return 1;

        /* read, not present: */
        return unlikely(!vma_is_accessible(vma)) ? 1 : 0;
}

/*
 * Return true when @address belongs to the kernel part of the address
 * space, i.e. it is at or above TASK_SIZE_MAX and is not the vsyscall page.
 */
bool fault_in_kernel_space(unsigned long address)
{
        /*
         * On 64-bit the vsyscall page sits above TASK_SIZE_MAX, yet it is
         * not treated as kernel address space.
         */
        const bool in_vsyscall_page = IS_ENABLED(CONFIG_X86_64) &&
                                      is_vsyscall_vaddr(address);

        return !in_vsyscall_page && address >= TASK_SIZE_MAX;
}

/*
 * Called for all faults where 'address' is part of the kernel address
 * space.  Might get called for faults that originate from *code* that
 * ran in userspace or the kernel.
 *
 * @regs:          register state at the time of the fault
 * @hw_error_code: page-fault error code
 * @address:       faulting address
 *
 * The fault is tried, in order, as a 32-bit vmalloc fault, the F00F
 * erratum, a spurious fault and a kprobe fault.  Anything left over
 * goes to bad_area_nosemaphore().
 */
static void
do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
                   unsigned long address)
{
        /*
         * Protection keys exceptions only happen on user pages.  We
         * have no user pages in the kernel portion of the address
         * space, so do not expect them here.
         */
        WARN_ON_ONCE(hw_error_code & X86_PF_PK);

#ifdef CONFIG_X86_32
        /*
         * We can fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * Before doing this on-demand faulting, ensure that the
         * fault is not any of the following:
         * 1. A fault on a PTE with a reserved bit set.
         * 2. A fault caused by a user-mode access.  (Do not demand-
         *    fault kernel memory due to user-mode accesses).
         * 3. A fault caused by a page-level protection violation.
         *    (A demand fault would be on a non-present page which
         *     would have X86_PF_PROT==0).
         *
         * This is only needed to close a race condition on x86-32 in
         * the vmalloc mapping/unmapping code. See the comment above
         * vmalloc_fault() for details. On x86-64 the race does not
         * exist as the vmalloc mappings don't need to be synchronized
         * there.
         */
        if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
                if (vmalloc_fault(address) >= 0)
                        return;
        }
#endif

        if (is_f00f_bug(regs, hw_error_code, address))
                return;

        /* Was the fault spurious, caused by lazy TLB invalidation? */
        if (spurious_kernel_fault(hw_error_code, address))
                return;

        /* kprobes don't want to hook the spurious faults: */
        if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
                return;

        /*
         * Note, despite being a "bad area", there are quite a few
         * acceptable reasons to get here, such as erratum fixups
         * and handling kernel code that can fault, like get_user().
         *
         * Don't take the mm semaphore here. If we fixup a prefetch
         * fault we could otherwise deadlock:
         */
        bad_area_nosemaphore(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(do_kern_addr_fault);

/*
 * Handle faults in the user portion of the address space.  Nothing in here
 * should check X86_PF_USER without a specific justification: for almost
 * all purposes, we should treat a normal kernel access to user memory
 * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
 * The one exception is AC flag handling, which is, per the x86
 * architecture, special for WRUSS.
 *
 * @regs:       register state at the time of the fault
 * @error_code: page-fault error code
 * @address:    faulting address
 *
 * Faults from user mode first try the per-VMA lock; on failure or retry
 * they fall back to the mmap_lock path, which all other faults use
 * directly.  This function may return with interrupts enabled.
 */
static inline
void do_user_addr_fault(struct pt_regs *regs,
                        unsigned long error_code,
                        unsigned long address)
{
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        struct mm_struct *mm;
        vm_fault_t fault;
        unsigned int flags = FAULT_FLAG_DEFAULT;

        tsk = current;
        mm = tsk->mm;

        if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
                /*
                 * Whoops, this is kernel mode code trying to execute from
                 * user memory.  Unless this is AMD erratum #93, which
                 * corrupts RIP such that it looks like a user address,
                 * this is unrecoverable.  Don't even try to look up the
                 * VMA or look for extable entries.
                 */
                if (is_errata93(regs, address))
                        return;

                page_fault_oops(regs, error_code, address);
                return;
        }

        /* kprobes don't want to hook the spurious faults: */
        if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
                return;

        /*
         * Reserved bits are never expected to be set on
         * entries in the user portion of the page tables.
         */
        if (unlikely(error_code & X86_PF_RSVD))
                pgtable_bad(regs, error_code, address);

        /*
         * If SMAP is on, check for invalid kernel (supervisor) access to user
         * pages in the user address space.  The odd case here is WRUSS,
         * which, according to the preliminary documentation, does not respect
         * SMAP and will have the USER bit set so, in all cases, SMAP
         * enforcement appears to be consistent with the USER bit.
         */
        if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
                     !(error_code & X86_PF_USER) &&
                     !(regs->flags & X86_EFLAGS_AC))) {
                /*
                 * No extable entry here.  This was a kernel access to an
                 * invalid pointer.  get_kernel_nofault() will not get here.
                 */
                page_fault_oops(regs, error_code, address);
                return;
        }

        /*
         * If we're in an interrupt, have no user context or are running
         * in a region with pagefaults disabled then we must not take the fault
         */
        if (unlikely(faulthandler_disabled() || !mm)) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /* Legacy check - remove this after verifying that it doesn't trigger */
        if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        local_irq_enable();

        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

        /*
         * Read-only permissions can not be expressed in shadow stack PTEs.
         * Treat all shadow stack accesses as WRITE faults. This ensures
         * that the MM will prepare everything (e.g., break COW) such that
         * maybe_mkwrite() can create a proper shadow stack PTE.
         */
        if (error_code & X86_PF_SHSTK)
                flags |= FAULT_FLAG_WRITE;
        if (error_code & X86_PF_WRITE)
                flags |= FAULT_FLAG_WRITE;
        if (error_code & X86_PF_INSTR)
                flags |= FAULT_FLAG_INSTRUCTION;

        /*
         * We set FAULT_FLAG_USER based on the register state, not
         * based on X86_PF_USER. User space accesses that cause
         * system page faults are still user accesses.
         */
        if (user_mode(regs))
                flags |= FAULT_FLAG_USER;

#ifdef CONFIG_X86_64
        /*
         * Faults in the vsyscall page might need emulation.  The
         * vsyscall page is at a high address (>PAGE_OFFSET), but is
         * considered to be part of the user address space.
         *
         * The vsyscall page does not have a "real" VMA, so do this
         * emulation before we go searching for VMAs.
         *
         * PKRU never rejects instruction fetches, so we don't need
         * to consider the PF_PK bit.
         */
        if (is_vsyscall_vaddr(address)) {
                if (emulate_vsyscall(error_code, regs, address))
                        return;
        }
#endif

        /*
         * Only faults taken from user mode try the per-VMA lock; all
         * others go straight to the mmap_lock path.
         */
        if (!(flags & FAULT_FLAG_USER))
                goto lock_mmap;

        vma = lock_vma_under_rcu(mm, address);
        if (!vma)
                goto lock_mmap;

        if (unlikely(access_error(error_code, vma))) {
                /*
                 * Passing a NULL mm makes __bad_area() end the VMA read
                 * lock instead of unlocking mmap_lock.
                 */
                bad_area_access_error(regs, error_code, address, NULL, vma);
                count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
                return;
        }
        fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
        /*
         * The VMA read lock is ended here unless the result is RETRY or
         * COMPLETED (presumably the core mm code has dropped it by then).
         */
        if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
                vma_end_read(vma);

        if (!(fault & VM_FAULT_RETRY)) {
                count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
                goto done;
        }
        count_vm_vma_lock_event(VMA_LOCK_RETRY);
        if (fault & VM_FAULT_MAJOR)
                flags |= FAULT_FLAG_TRIED;

        /* Quick path to respond to signals */
        if (fault_signal_pending(fault, regs)) {
                if (!user_mode(regs))
                        kernelmode_fixup_or_oops(regs, error_code, address,
                                                 SIGBUS, BUS_ADRERR,
                                                 ARCH_DEFAULT_PKEY);
                return;
        }
lock_mmap:

retry:
        vma = lock_mm_and_find_vma(mm, address, regs);
        if (unlikely(!vma)) {
                bad_area_nosemaphore(regs, error_code, address);
                return;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
        if (unlikely(access_error(error_code, vma))) {
                bad_area_access_error(regs, error_code, address, mm, vma);
                return;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
         * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
         *
         * Note that handle_userfault() may also release and reacquire mmap_lock
         * (and not return with VM_FAULT_RETRY), when returning to userland to
         * repeat the page fault later with a VM_FAULT_NOPAGE retval
         * (potentially after handling any pending signal during the return to
         * userland). The return to userland is identified whenever
         * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
         */
        fault = handle_mm_fault(vma, address, flags, regs);

        if (fault_signal_pending(fault, regs)) {
                /*
                 * Quick path to respond to signals.  The core mm code
                 * has unlocked the mm for us if we get here.
                 */
                if (!user_mode(regs))
                        kernelmode_fixup_or_oops(regs, error_code, address,
                                                 SIGBUS, BUS_ADRERR,
                                                 ARCH_DEFAULT_PKEY);
                return;
        }

        /* The fault is fully completed (including releasing mmap lock) */
        if (fault & VM_FAULT_COMPLETED)
                return;

        /*
         * If we need to retry the mmap_lock has already been released,
         * and if there is a fatal signal pending there is no guarantee
         * that we made any progress. Handle this case first.
         */
        if (unlikely(fault & VM_FAULT_RETRY)) {
                flags |= FAULT_FLAG_TRIED;
                goto retry;
        }

        mmap_read_unlock(mm);
done:
        /*
         * Common exit for the VMA-lock and mmap_lock paths: only
         * VM_FAULT_ERROR results need further handling.
         */
        if (likely(!(fault & VM_FAULT_ERROR)))
                return;

        if (fatal_signal_pending(current) && !user_mode(regs)) {
                kernelmode_fixup_or_oops(regs, error_code, address,
                                         0, 0, ARCH_DEFAULT_PKEY);
                return;
        }

        if (fault & VM_FAULT_OOM) {
                /* Kernel mode? Handle exceptions or die: */
                if (!user_mode(regs)) {
                        kernelmode_fixup_or_oops(regs, error_code, address,
                                                 SIGSEGV, SEGV_MAPERR,
                                                 ARCH_DEFAULT_PKEY);
                        return;
                }

                /*
                 * We ran out of memory, call the OOM killer, and return to
                 * userspace (which will retry the fault, or kill us if we got
                 * oom-killed):
                 */
                pagefault_out_of_memory();
        } else {
                if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
                             VM_FAULT_HWPOISON_LARGE))
                        do_sigbus(regs, error_code, address, fault);
                else if (fault & VM_FAULT_SIGSEGV)
                        bad_area_nosemaphore(regs, error_code, address);
                else
                        BUG();
        }
}
NOKPROBE_SYMBOL(do_user_addr_fault);

/*
 * Emit the page-fault tracepoint matching the mode the fault came from.
 * Does nothing unless page-fault tracing is enabled.
 */
static __always_inline void
trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
                         unsigned long address)
{
        if (!trace_pagefault_enabled())
                return;

        if (!user_mode(regs)) {
                trace_page_fault_kernel(address, regs, error_code);
                return;
        }

        trace_page_fault_user(address, regs, error_code);
}

/*
 * Trace the fault, give kmmio a chance to claim it, then dispatch on
 * whether @address lies in the kernel or the user part of the address
 * space.
 */
static __always_inline void
handle_page_fault(struct pt_regs *regs, unsigned long error_code,
                              unsigned long address)
{
        trace_page_fault_entries(regs, error_code, address);

        if (unlikely(kmmio_fault(regs, address)))
                return;

        /* Faults on the kernel-controlled part of the address space: */
        if (unlikely(fault_in_kernel_space(address))) {
                do_kern_addr_fault(regs, error_code, address);
                return;
        }

        do_user_addr_fault(regs, error_code, address);

        /*
         * Handling a user address fault may have reenabled interrupts.
         * Fixing up every exit point of do_user_addr_fault() and its leaf
         * functions is not doable without making an unholy mess or turning
         * the code upside down, so interrupts are disabled once, here.
         */
        local_irq_disable();
}

/*
 * #PF exception entry point.
 *
 * Fetches the faulting address (FRED event data when X86_FEATURE_FRED is
 * enabled, CR2 otherwise), lets the KVM async page fault handler consume
 * the event first, and otherwise runs handle_page_fault() between
 * irqentry_enter() and irqentry_exit().
 */
DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
{
        irqentry_state_t state;
        unsigned long address;

        address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2();

        /*
         * Prefetch mmap_lock for write -- presumably because the user
         * address path is about to take it.
         */
        prefetchw(&current->mm->mmap_lock);

        /*
         * KVM uses #PF vector to deliver 'page not present' events to guests
         * (asynchronous page fault mechanism). The event happens when a
         * userspace task is trying to access some valid (from guest's point of
         * view) memory which is not currently mapped by the host (e.g. the
         * memory is swapped out). Note, the corresponding "page ready" event
         * which is injected when the memory becomes available, is delivered via
         * an interrupt mechanism and not a #PF exception
         * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
         *
         * We are relying on the interrupted context being sane (valid RSP,
         * relevant locks not held, etc.), which is fine as long as the
         * interrupted context had IF=1.  We are also relying on the KVM
         * async pf type field and CR2 being read consistently instead of
         * getting values from real and async page faults mixed up.
         *
         * Fingers crossed.
         *
         * The async #PF handling code takes care of idtentry handling
         * itself.
         */
        if (kvm_handle_async_pf(regs, (u32)address))
                return;

        /*
         * Entry handling for valid #PF from kernel mode is slightly
         * different: RCU is already watching and ct_irq_enter() must not
         * be invoked because a kernel fault on a user space address might
         * sleep.
         *
         * In case the fault hit a RCU idle region the conditional entry
         * code reenabled RCU to avoid subsequent wreckage which helps
         * debuggability.
         */
        state = irqentry_enter(regs);

        instrumentation_begin();
        handle_page_fault(regs, error_code, address);
        instrumentation_end();

        irqentry_exit(regs, state);
}








































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright (C) 2020 Google LLC.
 */

#include <linux/filter.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/binfmts.h>
#include <linux/lsm_hooks.h>
#include <linux/bpf_lsm.h>
#include <linux/kallsyms.h>
#include <linux/bpf_verifier.h>
#include <net/bpf_sk_storage.h>
#include <linux/bpf_local_storage.h>
#include <linux/btf_ids.h>
#include <linux/ima.h>
#include <linux/bpf-cgroup.h>

/* For every LSM hook that allows attachment of BPF programs, declare a nop
 * function where a BPF program can be attached.
 *
 * Each expansion defines a noinline bpf_lsm_<NAME>() that only returns the
 * hook's DEFAULT value.
 */
#define LSM_HOOK(RET, DEFAULT, NAME, ...)        \
noinline RET bpf_lsm_##NAME(__VA_ARGS__)        \
{                                                \
        return DEFAULT;                                \
}

#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK

/* Expand the same hook list a second time, now collecting the BTF ID of
 * every bpf_lsm_<NAME>() function into the bpf_lsm_hooks set.
 */
#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
BTF_SET_START(bpf_lsm_hooks)
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
BTF_SET_END(bpf_lsm_hooks)

/* List of LSM hooks that should operate on 'current' cgroup regardless
 * of function signature.
 *
 * Checked by bpf_lsm_find_cgroup_shim() before it looks at the hook's
 * first argument.
 */
BTF_SET_START(bpf_lsm_current_hooks)
/* operate on freshly allocated sk without any cgroup association */
#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_sk_alloc_security)
BTF_ID(func, bpf_lsm_sk_free_security)
#endif
BTF_SET_END(bpf_lsm_current_hooks)

/* List of LSM hooks that trigger while the socket is properly locked.
 * The set is empty unless CONFIG_SECURITY_NETWORK is enabled.
 */
BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_sock_graft)
BTF_ID(func, bpf_lsm_inet_csk_clone)
BTF_ID(func, bpf_lsm_inet_conn_established)
#endif
BTF_SET_END(bpf_lsm_locked_sockopt_hooks)

/* List of LSM hooks that trigger while the socket is _not_ locked,
 * but it's ok to call bpf_{g,s}etsockopt because the socket is still
 * in the early init phase.
 * The set is empty unless CONFIG_SECURITY_NETWORK is enabled.
 */
BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks)
#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_socket_post_create)
BTF_ID(func, bpf_lsm_socket_socketpair)
#endif
BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks)

#ifdef CONFIG_CGROUP_BPF
/*
 * Select the cgroup shim used to run @prog and store it in *@bpf_func.
 *
 * Hooks whose prototype has no parameters (vlen < 1), or that are listed
 * in bpf_lsm_current_hooks, get __cgroup_bpf_run_lsm_current.  Otherwise,
 * with CONFIG_NET, the BTF type of the first argument selects the socket
 * or sock variant; every other case falls back to
 * __cgroup_bpf_run_lsm_current.
 */
void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
                             bpf_func_t *bpf_func)
{
        const struct btf_param *args __maybe_unused;

        if (btf_type_vlen(prog->aux->attach_func_proto) < 1 ||
            btf_id_set_contains(&bpf_lsm_current_hooks,
                                prog->aux->attach_btf_id)) {
                *bpf_func = __cgroup_bpf_run_lsm_current;
                return;
        }

#ifdef CONFIG_NET
        args = btf_params(prog->aux->attach_func_proto);

        if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET])
                *bpf_func = __cgroup_bpf_run_lsm_socket;
        else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK])
                *bpf_func = __cgroup_bpf_run_lsm_sock;
        else
#endif
                /*
                 * With CONFIG_NET this is the body of the trailing else
                 * above; without it, this is the unconditional fallback.
                 */
                *bpf_func = __cgroup_bpf_run_lsm_current;
}
#endif

/*
 * Validate an LSM program at load time.
 *
 * The program must carry a GPL-compatible license and its attach_btf_id
 * must be a member of bpf_lsm_hooks.  Returns 0 on success; otherwise logs
 * the reason to @vlog and returns -EINVAL.
 */
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
                        const struct bpf_prog *prog)
{
        if (!prog->gpl_compatible) {
                bpf_log(vlog,
                        "LSM programs must have a GPL compatible license\n");
                return -EINVAL;
        }

        if (btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id))
                return 0;

        bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
                prog->aux->attach_btf_id, prog->aux->attach_func_name);
        return -EINVAL;
}

/* Mask for all the currently supported BPRM option flags */
#define BPF_F_BRPM_OPTS_MASK        BPF_F_BPRM_SECUREEXEC

/*
 * BPF helper: set options on @bprm from @flags.
 *
 * Returns -EINVAL if @flags has any bit outside BPF_F_BRPM_OPTS_MASK.
 * Otherwise bprm->secureexec is assigned from the BPF_F_BPRM_SECUREEXEC bit
 * of @flags and 0 is returned.
 */
BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags)
{
        if (flags & ~BPF_F_BRPM_OPTS_MASK)
                return -EINVAL;

        bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC);
        return 0;
}

/* BTF id of struct linux_binprm, used to type-check argument 1 below. */
BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm)

/*
 * Helper prototype for bpf_bprm_opts_set(): integer return, argument 1 is
 * a BTF pointer to struct linux_binprm, argument 2 (the flags) is
 * ARG_ANYTHING.  The helper is not marked GPL-only.
 */
static const struct bpf_func_proto bpf_bprm_opts_set_proto = {
        .func                = bpf_bprm_opts_set,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_bprm_opts_set_btf_ids[0],
        .arg2_type        = ARG_ANYTHING,
};

/*
 * bpf_ima_inode_hash() helper: forwards @inode, @dst and @size unchanged
 * to ima_inode_hash() and returns whatever that call returns.
 */
BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size)
{
        return ima_inode_hash(inode, dst, size);
}

/*
 * .allowed callback for the IMA hash helpers: true only when the hook
 * @prog attaches to (prog->aux->attach_btf_id) is a sleepable LSM hook
 * according to bpf_lsm_is_sleepable_hook().
 */
static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog)
{
        return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
}

/* BTF id of struct inode, used to type-check argument 1 below. */
BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)

/*
 * Helper prototype for bpf_ima_inode_hash(): may sleep, integer return.
 * Argument 1 is a BTF pointer to struct inode, arguments 2 and 3 are an
 * uninitialized output buffer and its size.  Availability is gated by
 * bpf_ima_inode_hash_allowed().
 */
static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
        .func                = bpf_ima_inode_hash,
        .gpl_only        = false,
        .might_sleep        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_ima_inode_hash_btf_ids[0],
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .allowed        = bpf_ima_inode_hash_allowed,
};

/*
 * bpf_ima_file_hash() helper: forwards @file, @dst and @size unchanged to
 * ima_file_hash() and returns whatever that call returns.
 */
BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size)
{
        return ima_file_hash(file, dst, size);
}

/* BTF id of struct file, used to type-check argument 1 below. */
BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)

/*
 * Helper prototype for bpf_ima_file_hash(): may sleep, integer return.
 * Argument 1 is a BTF pointer to struct file, arguments 2 and 3 are an
 * uninitialized output buffer and its size.  It shares its .allowed
 * callback, bpf_ima_inode_hash_allowed(), with the inode hash helper.
 */
static const struct bpf_func_proto bpf_ima_file_hash_proto = {
        .func                = bpf_ima_file_hash,
        .gpl_only        = false,
        .might_sleep        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_ima_file_hash_btf_ids[0],
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .allowed        = bpf_ima_inode_hash_allowed,
};

/*
 * bpf_get_attach_cookie() helper: returns the bpf_cookie stored in the
 * struct bpf_trace_run_ctx that embeds current->bpf_ctx as its run_ctx
 * member.  The @ctx argument itself is not used.
 */
BPF_CALL_1(bpf_get_attach_cookie, void *, ctx)
{
        return container_of(current->bpf_ctx, struct bpf_trace_run_ctx,
                            run_ctx)->bpf_cookie;
}

/*
 * Helper prototype for bpf_get_attach_cookie(): integer return, single
 * argument is the program context pointer.  bpf_lsm_func_proto() only
 * returns it for programs for which bpf_prog_has_trampoline() is true.
 */
static const struct bpf_func_proto bpf_get_attach_cookie_proto = {
        .func                = bpf_get_attach_cookie,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *func_proto;

        if (prog->expected_attach_type == BPF_LSM_CGROUP) {
                func_proto = cgroup_common_func_proto(func_id, prog);
                if (func_proto)
                        return func_proto;
        }

        switch (func_id) {
        case BPF_FUNC_inode_storage_get:
                return &bpf_inode_storage_get_proto;
        case BPF_FUNC_inode_storage_delete:
                return &bpf_inode_storage_delete_proto;
#ifdef CONFIG_NET
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
#endif /* CONFIG_NET */
        case BPF_FUNC_spin_lock:
                return &bpf_spin_lock_proto;
        case BPF_FUNC_spin_unlock:
                return &bpf_spin_unlock_proto;
        case BPF_FUNC_bprm_opts_set:
                return &bpf_bprm_opts_set_proto;
        case BPF_FUNC_ima_inode_hash:
                return &bpf_ima_inode_hash_proto;
        case BPF_FUNC_ima_file_hash:
                return &bpf_ima_file_hash_proto;
        case BPF_FUNC_get_attach_cookie:
                return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
#ifdef CONFIG_NET
        case BPF_FUNC_setsockopt:
                if (prog->expected_attach_type != BPF_LSM_CGROUP)
                        return NULL;
                if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
                                        prog->aux->attach_btf_id))
                        return &bpf_sk_setsockopt_proto;
                if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
                                        prog->aux->attach_btf_id))
                        return &bpf_unlocked_sk_setsockopt_proto;
                return NULL;
        case BPF_FUNC_getsockopt:
                if (prog->expected_attach_type != BPF_LSM_CGROUP)
                        return NULL;
                if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
                                        prog->aux->attach_btf_id))
                        return &bpf_sk_getsockopt_proto;
                if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
                                        prog->aux->attach_btf_id))
                        return &bpf_unlocked_sk_getsockopt_proto;
                return NULL;
#endif
        default:
                return tracing_prog_func_proto(func_id, prog);
        }
}

/* The set of hooks which are called without pagefaults disabled and are allowed
 * to "sleep" and thus can be used for sleepable BPF programs.
 *
 * Membership is queried through bpf_lsm_is_sleepable_hook().  The path_*,
 * key_* and socket/inet entries are only present when the matching
 * CONFIG_SECURITY_PATH, CONFIG_KEYS or CONFIG_SECURITY_NETWORK option is
 * enabled.
 */
BTF_SET_START(sleepable_lsm_hooks)
BTF_ID(func, bpf_lsm_bpf)
BTF_ID(func, bpf_lsm_bpf_map)
BTF_ID(func, bpf_lsm_bpf_map_create)
BTF_ID(func, bpf_lsm_bpf_map_free)
BTF_ID(func, bpf_lsm_bpf_prog)
BTF_ID(func, bpf_lsm_bpf_prog_load)
BTF_ID(func, bpf_lsm_bpf_prog_free)
BTF_ID(func, bpf_lsm_bpf_token_create)
BTF_ID(func, bpf_lsm_bpf_token_free)
BTF_ID(func, bpf_lsm_bpf_token_cmd)
BTF_ID(func, bpf_lsm_bpf_token_capable)
BTF_ID(func, bpf_lsm_bprm_check_security)
BTF_ID(func, bpf_lsm_bprm_committed_creds)
BTF_ID(func, bpf_lsm_bprm_committing_creds)
BTF_ID(func, bpf_lsm_bprm_creds_for_exec)
BTF_ID(func, bpf_lsm_bprm_creds_from_file)
BTF_ID(func, bpf_lsm_capget)
BTF_ID(func, bpf_lsm_capset)
BTF_ID(func, bpf_lsm_cred_prepare)
BTF_ID(func, bpf_lsm_file_ioctl)
BTF_ID(func, bpf_lsm_file_lock)
BTF_ID(func, bpf_lsm_file_open)
BTF_ID(func, bpf_lsm_file_receive)

BTF_ID(func, bpf_lsm_inode_create)
BTF_ID(func, bpf_lsm_inode_free_security)
BTF_ID(func, bpf_lsm_inode_getattr)
BTF_ID(func, bpf_lsm_inode_getxattr)
BTF_ID(func, bpf_lsm_inode_mknod)
BTF_ID(func, bpf_lsm_inode_need_killpriv)
BTF_ID(func, bpf_lsm_inode_post_setxattr)
BTF_ID(func, bpf_lsm_inode_readlink)
BTF_ID(func, bpf_lsm_inode_rename)
BTF_ID(func, bpf_lsm_inode_rmdir)
BTF_ID(func, bpf_lsm_inode_setattr)
BTF_ID(func, bpf_lsm_inode_setxattr)
BTF_ID(func, bpf_lsm_inode_symlink)
BTF_ID(func, bpf_lsm_inode_unlink)
BTF_ID(func, bpf_lsm_kernel_module_request)
BTF_ID(func, bpf_lsm_kernel_read_file)
BTF_ID(func, bpf_lsm_kernfs_init_security)

#ifdef CONFIG_SECURITY_PATH
BTF_ID(func, bpf_lsm_path_unlink)
BTF_ID(func, bpf_lsm_path_mkdir)
BTF_ID(func, bpf_lsm_path_rmdir)
BTF_ID(func, bpf_lsm_path_truncate)
BTF_ID(func, bpf_lsm_path_symlink)
BTF_ID(func, bpf_lsm_path_link)
BTF_ID(func, bpf_lsm_path_rename)
BTF_ID(func, bpf_lsm_path_chmod)
BTF_ID(func, bpf_lsm_path_chown)
#endif /* CONFIG_SECURITY_PATH */

#ifdef CONFIG_KEYS
BTF_ID(func, bpf_lsm_key_free)
#endif /* CONFIG_KEYS */

BTF_ID(func, bpf_lsm_mmap_file)
BTF_ID(func, bpf_lsm_netlink_send)
BTF_ID(func, bpf_lsm_path_notify)
BTF_ID(func, bpf_lsm_release_secctx)
BTF_ID(func, bpf_lsm_sb_alloc_security)
BTF_ID(func, bpf_lsm_sb_eat_lsm_opts)
BTF_ID(func, bpf_lsm_sb_kern_mount)
BTF_ID(func, bpf_lsm_sb_mount)
BTF_ID(func, bpf_lsm_sb_remount)
BTF_ID(func, bpf_lsm_sb_set_mnt_opts)
BTF_ID(func, bpf_lsm_sb_show_options)
BTF_ID(func, bpf_lsm_sb_statfs)
BTF_ID(func, bpf_lsm_sb_umount)
BTF_ID(func, bpf_lsm_settime)

#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_inet_conn_established)

BTF_ID(func, bpf_lsm_socket_accept)
BTF_ID(func, bpf_lsm_socket_bind)
BTF_ID(func, bpf_lsm_socket_connect)
BTF_ID(func, bpf_lsm_socket_create)
BTF_ID(func, bpf_lsm_socket_getpeername)
BTF_ID(func, bpf_lsm_socket_getpeersec_dgram)
BTF_ID(func, bpf_lsm_socket_getsockname)
BTF_ID(func, bpf_lsm_socket_getsockopt)
BTF_ID(func, bpf_lsm_socket_listen)
BTF_ID(func, bpf_lsm_socket_post_create)
BTF_ID(func, bpf_lsm_socket_recvmsg)
BTF_ID(func, bpf_lsm_socket_sendmsg)
BTF_ID(func, bpf_lsm_socket_shutdown)
BTF_ID(func, bpf_lsm_socket_socketpair)
#endif /* CONFIG_SECURITY_NETWORK */

BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
BTF_ID(func, bpf_lsm_current_getsecid_subj)
BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
BTF_ID(func, bpf_lsm_task_to_inode)
BTF_ID(func, bpf_lsm_userns_create)
BTF_SET_END(sleepable_lsm_hooks)

/*
 * Hooks for which bpf_lsm_is_trusted() returns false.  The entries are
 * allocation/free style hooks; the sk_* ones exist only with
 * CONFIG_SECURITY_NETWORK.
 */
BTF_SET_START(untrusted_lsm_hooks)
BTF_ID(func, bpf_lsm_bpf_map_free)
BTF_ID(func, bpf_lsm_bpf_prog_free)
BTF_ID(func, bpf_lsm_file_alloc_security)
BTF_ID(func, bpf_lsm_file_free_security)
#ifdef CONFIG_SECURITY_NETWORK
BTF_ID(func, bpf_lsm_sk_alloc_security)
BTF_ID(func, bpf_lsm_sk_free_security)
#endif /* CONFIG_SECURITY_NETWORK */
BTF_ID(func, bpf_lsm_task_free)
BTF_SET_END(untrusted_lsm_hooks)

/* Returns true when @btf_id is a member of the sleepable_lsm_hooks set. */
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
{
        return btf_id_set_contains(&sleepable_lsm_hooks, btf_id);
}

/*
 * Returns true unless the hook @prog attaches to
 * (prog->aux->attach_btf_id) is listed in untrusted_lsm_hooks.
 */
bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
{
        return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id);
}

/* Program ops for LSM programs; no callbacks are set. */
const struct bpf_prog_ops lsm_prog_ops = {
};

/*
 * Verifier ops for LSM programs: helper lookup goes through
 * bpf_lsm_func_proto(), context access checks through btf_ctx_access().
 */
const struct bpf_verifier_ops lsm_verifier_ops = {
        .get_func_proto = bpf_lsm_func_proto,
        .is_valid_access = btf_ctx_access,
};





















    1 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _BLOCK_BLK_PM_H_
#define _BLOCK_BLK_PM_H_

#include <linux/pm_runtime.h>

#ifdef CONFIG_PM
/*
 * Decide whether a request may proceed on @q with respect to runtime PM.
 *
 * Returns 1 when there is nothing to do (no q->dev, or
 * blk_queue_pm_only() is false) or when @pm is set and the queue is not
 * RPM_SUSPENDED.  Otherwise a resume of q->dev is requested with
 * pm_request_resume() and 0 is returned.
 */
static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
{
        const bool proceed = !q->dev || !blk_queue_pm_only(q) ||
                             (pm && q->rpm_status != RPM_SUSPENDED);

        if (!proceed)
                pm_request_resume(q->dev);

        return proceed;
}

/*
 * Update the runtime-PM last-busy mark of the queue's device for @rq.
 * Nothing is done when the queue has no device or when @rq carries RQF_PM.
 */
static inline void blk_pm_mark_last_busy(struct request *rq)
{
        struct device *dev = rq->q->dev;

        if (!dev || (rq->rq_flags & RQF_PM))
                return;

        pm_runtime_mark_last_busy(dev);
}
#else
/* !CONFIG_PM stub: there is nothing to resume, so always return 1. */
static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
{
        return 1;
}

/* !CONFIG_PM stub: no-op. */
static inline void blk_pm_mark_last_busy(struct request *rq)
{
}
#endif

#endif /* _BLOCK_BLK_PM_H_ */




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * kernel/workqueue_internal.h
 *
 * Workqueue internal header file.  Only to be included by workqueue and
 * core kernel subsystems.
 */
#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
#define _KERNEL_WORKQUEUE_INTERNAL_H

#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/preempt.h>

struct worker_pool;

/*
 * The poor guys doing the actual heavy lifting.  All on-duty workers are
 * either serving the manager role, on idle list or on busy hash.  For
 * details on the locking annotation (L, I, X...), refer to workqueue.c.
 *
 * The structure groups: list linkage (idle list / busy hash), the state
 * of the work item currently being processed, the backing task and pool,
 * bookkeeping (last_active, flags, id), a debug description string and
 * the rescuer-only target workqueue.
 *
 * Only to be used in workqueue and async.
 */
struct worker {
        /* on idle list while idle, on busy hash table while busy */
        union {
                struct list_head        entry;        /* L: while idle */
                struct hlist_node        hentry;        /* L: while busy */
        };

        struct work_struct        *current_work;        /* K: work being processed and its */
        work_func_t                current_func;        /* K: function */
        struct pool_workqueue        *current_pwq;        /* K: pwq */
        u64                        current_at;        /* K: runtime at start or last wakeup */
        unsigned int                current_color;        /* K: color */

        int                        sleeping;        /* S: is worker sleeping? */

        /* used by the scheduler to determine a worker's last known identity */
        work_func_t                last_func;        /* K: last work's fn */

        struct list_head        scheduled;        /* L: scheduled works */

        struct task_struct        *task;                /* I: worker task */
        struct worker_pool        *pool;                /* A: the associated pool */
                                                /* L: for rescuers */
        struct list_head        node;                /* A: anchored at pool->workers */
                                                /* A: runs through worker->node */

        unsigned long                last_active;        /* K: last active timestamp */
        unsigned int                flags;                /* L: flags */
        int                        id;                /* I: worker id */

        /*
         * Opaque string set with work_set_desc().  Printed out with task
         * dump for debugging - WARN, BUG, panic or sysrq.
         */
        char                        desc[WORKER_DESC_LEN];

        /* used only by rescuers to point to the target workqueue */
        struct workqueue_struct        *rescue_wq;        /* I: the workqueue to rescue */
};

/**
 * current_wq_worker - return struct worker if %current is a workqueue worker
 *
 * Return: the kthread data of %current when running in task context with
 * %PF_WQ_WORKER set in current->flags, %NULL otherwise.
 */
static inline struct worker *current_wq_worker(void)
{
        if (!in_task() || !(current->flags & PF_WQ_WORKER))
                return NULL;

        return kthread_data(current);
}

/*
 * Scheduler hooks for concurrency managed workqueue.  Only to be used from
 * sched/ and workqueue.c.
 */
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);

#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */





























































































































































































    3 



    3 




















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
// SPDX-License-Identifier: GPL-2.0
/*
 * blk-integrity.c - Block layer data integrity extensions
 *
 * Copyright (C) 2007, 2008 Oracle Corporation
 * Written by: Martin K. Petersen <martin.petersen@oracle.com>
 */

#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/mempool.h>
#include <linux/bio.h>
#include <linux/scatterlist.h>
#include <linux/export.h>
#include <linux/slab.h>

#include "blk.h"

/**
 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
 * @q:                request queue
 * @bio:        bio with integrity metadata attached
 *
 * Description: Returns the number of elements required in a
 * scatterlist corresponding to the integrity metadata in a bio.
 */
int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
{
        struct bio_vec iv, ivprv = { NULL };
        unsigned int segments = 0;
        unsigned int seg_size = 0;
        struct bvec_iter iter;
        int prev = 0;

        bio_for_each_integrity_vec(iv, bio, iter) {

                if (prev) {
                        if (!biovec_phys_mergeable(q, &ivprv, &iv))
                                goto new_segment;
                        if (seg_size + iv.bv_len > queue_max_segment_size(q))
                                goto new_segment;

                        seg_size += iv.bv_len;
                } else {
new_segment:
                        segments++;
                        seg_size = iv.bv_len;
                }

                prev = 1;
                ivprv = iv;
        }

        return segments;
}
EXPORT_SYMBOL(blk_rq_count_integrity_sg);

/**
 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
 * @q:                request queue
 * @bio:        bio with integrity metadata attached
 * @sglist:        target scatterlist
 *
 * Description: Map the integrity vectors in request into a
 * scatterlist.  The scatterlist must be big enough to hold all
 * elements.  I.e. sized using blk_rq_count_integrity_sg().
 */
int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
                            struct scatterlist *sglist)
{
        struct bio_vec iv, ivprv = { NULL };
        struct scatterlist *sg = NULL;
        unsigned int segments = 0;
        struct bvec_iter iter;
        int prev = 0;

        bio_for_each_integrity_vec(iv, bio, iter) {

                if (prev) {
                        if (!biovec_phys_mergeable(q, &ivprv, &iv))
                                goto new_segment;
                        if (sg->length + iv.bv_len > queue_max_segment_size(q))
                                goto new_segment;

                        sg->length += iv.bv_len;
                } else {
new_segment:
                        if (!sg)
                                sg = sglist;
                        else {
                                sg_unmark_end(sg);
                                sg = sg_next(sg);
                        }

                        sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset);
                        segments++;
                }

                prev = 1;
                ivprv = iv;
        }

        if (sg)
                sg_mark_end(sg);

        return segments;
}
EXPORT_SYMBOL(blk_rq_map_integrity_sg);

/**
 * blk_integrity_compare - Compare integrity profile of two disks
 * @gd1:        Disk to compare
 * @gd2:        Disk to compare
 *
 * Description: Meta-devices like DM and MD need to verify that all
 * sub-devices use the same integrity format before advertising to
 * upper layers that they can send/receive integrity metadata.  This
 * function can be used to check whether two gendisk devices have
 * compatible integrity formats.
 *
 * Return: 0 when neither disk has a profile or when protection interval,
 * tuple size, tag size (compared only if both are non-zero) and profile
 * all match; -1 otherwise.  Each mismatch other than "only one disk has a
 * profile" is reported with pr_err().
 */
int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
{
        struct blk_integrity *b1 = &gd1->queue->integrity;
        struct blk_integrity *b2 = &gd2->queue->integrity;

        if (!b1->profile && !b2->profile)
                return 0;

        if (!b1->profile || !b2->profile)
                return -1;

        if (b1->interval_exp != b2->interval_exp) {
                /* 1U keeps the shift unsigned so it matches the %u format */
                pr_err("%s: %s/%s protection interval %u != %u\n",
                       __func__, gd1->disk_name, gd2->disk_name,
                       1U << b1->interval_exp, 1U << b2->interval_exp);
                return -1;
        }

        if (b1->tuple_size != b2->tuple_size) {
                pr_err("%s: %s/%s tuple sz %u != %u\n", __func__,
                       gd1->disk_name, gd2->disk_name,
                       b1->tuple_size, b2->tuple_size);
                return -1;
        }

        /* a zero tag size on either side is treated as compatible */
        if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) {
                pr_err("%s: %s/%s tag sz %u != %u\n", __func__,
                       gd1->disk_name, gd2->disk_name,
                       b1->tag_size, b2->tag_size);
                return -1;
        }

        if (b1->profile != b2->profile) {
                pr_err("%s: %s/%s type %s != %s\n", __func__,
                       gd1->disk_name, gd2->disk_name,
                       b1->profile->name, b2->profile->name);
                return -1;
        }

        return 0;
}
EXPORT_SYMBOL(blk_integrity_compare);

/*
 * Check whether @next may be merged into @req as far as integrity
 * metadata is concerned.
 *
 * Two requests without integrity data can always merge; a mix of with and
 * without cannot.  When both carry integrity data the merge is refused if
 * the bip_flags of their first bios differ, if the combined integrity
 * segment count exceeds q->limits.max_integrity_segments, or if
 * integrity_req_gap_back_merge() reports a gap.
 */
bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
                            struct request *next)
{
        const bool req_has_pi = blk_integrity_rq(req) != 0;
        const bool next_has_pi = blk_integrity_rq(next) != 0;

        if (!req_has_pi && !next_has_pi)
                return true;

        if (!req_has_pi || !next_has_pi)
                return false;

        if (bio_integrity(req->bio)->bip_flags !=
            bio_integrity(next->bio)->bip_flags)
                return false;

        if (req->nr_integrity_segments + next->nr_integrity_segments >
            q->limits.max_integrity_segments)
                return false;

        return !integrity_req_gap_back_merge(req, next->bio);
}

/*
 * Check whether @bio may be merged into @req as far as integrity metadata
 * is concerned and, if so, account for its integrity segments.
 *
 * Returns true when neither side has integrity data.  Returns false when
 * only one side has it, when the bip_flags differ, or when adding the
 * bio's integrity segments would exceed
 * q->limits.max_integrity_segments.  On success with integrity data,
 * req->nr_integrity_segments is increased by the bio's segment count.
 */
bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
                             struct bio *bio)
{
        const bool req_has_pi = blk_integrity_rq(req) != 0;
        const bool bio_has_pi = bio_integrity(bio) != NULL;
        struct bio *saved_next;
        int nr_segs;

        if (!req_has_pi && !bio_has_pi)
                return true;

        if (!req_has_pi || !bio_has_pi)
                return false;

        if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags)
                return false;

        /* detach @bio from its chain while counting, then restore the link */
        saved_next = bio->bi_next;
        bio->bi_next = NULL;
        nr_segs = blk_rq_count_integrity_sg(q, bio);
        bio->bi_next = saved_next;

        if (req->nr_integrity_segments + nr_segs >
            q->limits.max_integrity_segments)
                return false;

        req->nr_integrity_segments += nr_segs;

        return true;
}

/*
 * Map a disk's struct device to the blk_integrity embedded in that disk's
 * request queue.
 */
static inline struct blk_integrity *dev_to_bi(struct device *dev)
{
        return &dev_to_disk(dev)->queue->integrity;
}

static ssize_t format_show(struct device *dev, struct device_attribute *attr,
                           char *page)
{
        struct blk_integrity *bi = dev_to_bi(dev);

        if (bi->profile && bi->profile->name)
                return sysfs_emit(page, "%s\n", bi->profile->name);
        return sysfs_emit(page, "none\n");
}

/* sysfs show handler for integrity/tag_size: emits bi->tag_size. */
static ssize_t tag_size_show(struct device *dev, struct device_attribute *attr,
                             char *page)
{
        struct blk_integrity *bi = dev_to_bi(dev);

        return sysfs_emit(page, "%u\n", bi->tag_size);
}

/*
 * sysfs show handler for integrity/protection_interval_bytes: emits
 * 2^interval_exp, or 0 when interval_exp is 0.
 */
static ssize_t protection_interval_bytes_show(struct device *dev,
                                              struct device_attribute *attr,
                                              char *page)
{
        struct blk_integrity *bi = dev_to_bi(dev);

        /* 1U keeps the shift unsigned so the value matches the %u format */
        return sysfs_emit(page, "%u\n",
                          bi->interval_exp ? 1U << bi->interval_exp : 0);
}

/*
 * sysfs store handler for integrity/read_verify.
 *
 * Parses @page as a base-10 unsigned integer.  A non-zero value sets
 * BLK_INTEGRITY_VERIFY in the disk's integrity flags, zero clears it.
 * Returns @count on success, or the negative errno from kstrtoul() when
 * @page does not hold a valid number.
 */
static ssize_t read_verify_store(struct device *dev,
                                 struct device_attribute *attr,
                                 const char *page, size_t count)
{
        struct blk_integrity *bi = dev_to_bi(dev);
        unsigned long val;
        int err;

        /*
         * kstrtoul() replaces the deprecated simple_strtoul(): no cast that
         * drops const from @page is needed, and malformed or overflowing
         * input is reported instead of being silently treated as 0.
         */
        err = kstrtoul(page, 10, &val);
        if (err)
                return err;

        if (val)
                bi->flags |= BLK_INTEGRITY_VERIFY;
        else
                bi->flags &= ~BLK_INTEGRITY_VERIFY;

        return count;
}

/*
 * sysfs show handler for integrity/read_verify: emits 1 when
 * BLK_INTEGRITY_VERIFY is set in bi->flags, 0 otherwise.
 */
static ssize_t read_verify_show(struct device *dev,
                                struct device_attribute *attr, char *page)
{
        struct blk_integrity *bi = dev_to_bi(dev);

        return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_VERIFY));
}

/*
 * sysfs store handler for integrity/write_generate.
 *
 * Parses @page as a base-10 unsigned integer.  A non-zero value sets
 * BLK_INTEGRITY_GENERATE in the disk's integrity flags, zero clears it.
 * Returns @count on success, or the negative errno from kstrtoul() when
 * @page does not hold a valid number.
 */
static ssize_t write_generate_store(struct device *dev,
                                    struct device_attribute *attr,
                                    const char *page, size_t count)
{
        struct blk_integrity *bi = dev_to_bi(dev);
        unsigned long val;
        int err;

        /*
         * kstrtoul() replaces the deprecated simple_strtoul(): no cast that
         * drops const from @page is needed, and malformed or overflowing
         * input is reported instead of being silently treated as 0.
         */
        err = kstrtoul(page, 10, &val);
        if (err)
                return err;

        if (val)
                bi->flags |= BLK_INTEGRITY_GENERATE;
        else
                bi->flags &= ~BLK_INTEGRITY_GENERATE;

        return count;
}

/*
 * sysfs show handler for integrity/write_generate: emits 1 when
 * BLK_INTEGRITY_GENERATE is set in bi->flags, 0 otherwise.
 */
static ssize_t write_generate_show(struct device *dev,
                                   struct device_attribute *attr, char *page)
{
        struct blk_integrity *bi = dev_to_bi(dev);

        return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_GENERATE));
}

/*
 * sysfs show handler for integrity/device_is_integrity_capable: emits 1
 * when BLK_INTEGRITY_DEVICE_CAPABLE is set in bi->flags, 0 otherwise.
 */
static ssize_t device_is_integrity_capable_show(struct device *dev,
                                                struct device_attribute *attr,
                                                char *page)
{
        struct blk_integrity *bi = dev_to_bi(dev);

        return sysfs_emit(page, "%u\n",
                          !!(bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE));
}

/*
 * Device attributes backed by the *_show/*_store handlers above.
 * read_verify and write_generate are read-write, the rest are read-only.
 */
static DEVICE_ATTR_RO(format);
static DEVICE_ATTR_RO(tag_size);
static DEVICE_ATTR_RO(protection_interval_bytes);
static DEVICE_ATTR_RW(read_verify);
static DEVICE_ATTR_RW(write_generate);
static DEVICE_ATTR_RO(device_is_integrity_capable);

/* NULL-terminated attribute list for the group below. */
static struct attribute *integrity_attrs[] = {
        &dev_attr_format.attr,
        &dev_attr_tag_size.attr,
        &dev_attr_protection_interval_bytes.attr,
        &dev_attr_read_verify.attr,
        &dev_attr_write_generate.attr,
        &dev_attr_device_is_integrity_capable.attr,
        NULL
};

/* Attribute group named "integrity" collecting the attributes above. */
const struct attribute_group blk_integrity_attr_group = {
        .name = "integrity",
        .attrs = integrity_attrs,
};

/* No-op generate/verify callback: always reports BLK_STS_OK. */
static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter)
{
        return BLK_STS_OK;
}

/* No-op prepare callback. */
static void blk_integrity_nop_prepare(struct request *rq)
{
}

/* No-op completion callback. */
static void blk_integrity_nop_complete(struct request *rq,
                unsigned int nr_bytes)
{
}

/*
 * Fallback profile made of the no-op callbacks above.
 * blk_integrity_register() installs it when the template has no profile.
 */
static const struct blk_integrity_profile nop_profile = {
        .name = "nop",
        .generate_fn = blk_integrity_nop_fn,
        .verify_fn = blk_integrity_nop_fn,
        .prepare_fn = blk_integrity_nop_prepare,
        .complete_fn = blk_integrity_nop_complete,
};

/**
 * blk_integrity_register - Register a gendisk as being integrity-capable
 * @disk:        struct gendisk pointer to make integrity-aware
 * @template:        block integrity profile to register
 *
 * Description: When a device needs to advertise itself as being able to
 * send/receive integrity metadata it must use this function to register
 * the capability with the block layer. The template is a blk_integrity
 * struct with values appropriate for the underlying hardware. See
 * Documentation/block/data-integrity.rst.
 */
void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
{
        struct blk_integrity *bi = &disk->queue->integrity;

        bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE |
                template->flags;
        bi->interval_exp = template->interval_exp ? :
                ilog2(queue_logical_block_size(disk->queue));
        bi->profile = template->profile ? template->profile : &nop_profile;
        bi->tuple_size = template->tuple_size;
        bi->tag_size = template->tag_size;
        bi->pi_offset = template->pi_offset;

        blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        if (disk->queue->crypto_profile) {
                pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
                disk->queue->crypto_profile = NULL;
        }
#endif
}
EXPORT_SYMBOL(blk_integrity_register);

/**
 * blk_integrity_unregister - Unregister block integrity profile
 * @disk:        disk whose integrity profile to unregister
 *
 * Description: This function unregisters the integrity capability from
 * a block device.
 */
void blk_integrity_unregister(struct gendisk *disk)
{
        struct blk_integrity *bi = &disk->queue->integrity;

        if (!bi->profile)
                return;

        /* ensure all bios are off the integrity workqueue */
        blk_flush_integrity();
        blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue);
        memset(bi, 0, sizeof(*bi));
}
EXPORT_SYMBOL(blk_integrity_unregister);






















    8 

    6 




    6 









    6 

    6 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>

/*
 * This is an implementation of the notion of "decrement a
 * reference count, and return locked if it decremented to zero".
 *
 * NOTE NOTE NOTE! This is _not_ equivalent to
 *
 *        if (atomic_dec_and_test(&atomic)) {
 *                spin_lock(&lock);
 *                return 1;
 *        }
 *        return 0;
 *
 * because the spin-lock and the decrement must be
 * "atomic".
 *
 * Returns 1 with @lock held when the counter dropped to zero, and 0 with
 * @lock not held otherwise.
 */
int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
{
        /*
         * Fast path: decrement without the lock unless the counter is 1,
         * i.e. unless this decrement would take it to 0.
         */
        if (!atomic_add_unless(atomic, -1, 1)) {
                /* Slow path: do the final decrement under the lock */
                spin_lock(lock);
                if (atomic_dec_and_test(atomic))
                        return 1;
                spin_unlock(lock);
        }

        return 0;
}

EXPORT_SYMBOL(_atomic_dec_and_lock);

/*
 * Variant of _atomic_dec_and_lock() that takes @lock with
 * spin_lock_irqsave(), storing the saved interrupt state in *@flags.
 *
 * Returns 1 with @lock held when the counter dropped to zero, and 0 with
 * @lock not held otherwise.
 */
int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
                                 unsigned long *flags)
{
        /*
         * Fast path: decrement without the lock unless the counter is 1,
         * i.e. unless this decrement would take it to 0.
         */
        if (!atomic_add_unless(atomic, -1, 1)) {
                /* Slow path: do the final decrement under the lock */
                spin_lock_irqsave(lock, *flags);
                if (atomic_dec_and_test(atomic))
                        return 1;
                spin_unlock_irqrestore(lock, *flags);
        }

        return 0;
}
EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave);

/*
 * raw_spinlock_t counterpart of _atomic_dec_and_lock().
 *
 * Returns 1 with @lock held when the counter dropped to zero, and 0 with
 * @lock not held otherwise.
 */
int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock)
{
        /*
         * Fast path: decrement without the lock unless the counter is 1,
         * i.e. unless this decrement would take it to 0.
         */
        if (!atomic_add_unless(atomic, -1, 1)) {
                /* Slow path: do the final decrement under the lock */
                raw_spin_lock(lock);
                if (atomic_dec_and_test(atomic))
                        return 1;
                raw_spin_unlock(lock);
        }

        return 0;
}
EXPORT_SYMBOL(_atomic_dec_and_raw_lock);

int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock,
                                     unsigned long *flags)
{
        int reached_zero;

        /*
         * Fast path: as long as the counter is not 1 the decrement cannot
         * bring it to zero, so it is done without touching the lock.
         */
        if (atomic_add_unless(atomic, -1, 1))
                return 0;

        /*
         * Slow path: the counter was 1. Take the raw lock, saving the
         * interrupt state into *flags, and only then decrement.
         */
        raw_spin_lock_irqsave(lock, *flags);
        reached_zero = atomic_dec_and_test(atomic);
        if (!reached_zero)
                raw_spin_unlock_irqrestore(lock, *flags);

        /* 1: counter hit zero and the lock is still held; 0: lock not held */
        return reached_zero;
}
EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave);


































   31 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * RCU-based infrastructure for lightweight reader-writer locking
 *
 * Copyright (c) 2015, Red Hat, Inc.
 *
 * Author: Oleg Nesterov <oleg@redhat.com>
 */

#ifndef _LINUX_RCU_SYNC_H_
#define _LINUX_RCU_SYNC_H_

#include <linux/wait.h>
#include <linux/rcupdate.h>

/*
 * Structure to mediate between updaters and fastpath-using readers.
 */
struct rcu_sync {
        /* Read locklessly by rcu_sync_is_idle(); zero (GP_IDLE) means idle. */
        int                        gp_state;
        /* NOTE(review): presumably a count of active updaters -- confirm in the implementation. */
        int                        gp_count;
        /* NOTE(review): presumably where updaters sleep on gp_state changes -- confirm. */
        wait_queue_head_t        gp_wait;

        /* RCU callback head; NOTE(review): how it gets queued is not visible in this header. */
        struct rcu_head                cb_head;
};

/**
 * rcu_sync_is_idle() - May readers take their fastpaths right now?
 * @rsp: rcu_sync structure the readers synchronize against
 *
 * Must be called from within some flavor of RCU read-side critical
 * section; a lockdep warning is raised otherwise.
 *
 * Return: true when @rsp->gp_state is zero (GP_IDLE), meaning readers are
 * permitted to use their fastpaths; false otherwise.
 */
static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
{
        int state;

        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
                         "suspicious rcu_sync_is_idle() usage");

        /* Single lockless sample of the state word. */
        state = READ_ONCE(rsp->gp_state);

        return state == 0; /* GP_IDLE */
}

/* Out-of-line rcu_sync operations; this header only declares them. */
void rcu_sync_init(struct rcu_sync *rsp);
void rcu_sync_enter(struct rcu_sync *rsp);
void rcu_sync_exit(struct rcu_sync *rsp);
void rcu_sync_dtor(struct rcu_sync *rsp);

/*
 * Static initializer for a struct rcu_sync called @name: gp_state and
 * gp_count start at zero (zero gp_state is what rcu_sync_is_idle() reports
 * as idle) and gp_wait is set up with the wait-queue-head initializer.
 * cb_head is not named and is therefore zero-initialized.
 */
#define __RCU_SYNC_INITIALIZER(name) {                                        \
                .gp_state = 0,                                                \
                .gp_count = 0,                                                \
                .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),        \
        }

/*
 * Define a struct rcu_sync variable called @name, statically initialized
 * with __RCU_SYNC_INITIALIZER(). Storage class is left to the user of the
 * macro (prefix it with "static" where needed).
 */
#define        DEFINE_RCU_SYNC(name)        \
        struct rcu_sync name = __RCU_SYNC_INITIALIZER(name)

#endif /* _LINUX_RCU_SYNC_H_ */




























































































































































































































































































    1 











































































































    2 























































    2 

























    2 



































































































































































































    2 





    1 

    1 










    1 


    1 
    1 
    1 

    1 

















    2 

    2 






    2 








    2 


























    2 


    1 
    1 








    1 












































































































































































    1 









    1 






    1 
    1 
    1 








    1 
    1 























































    2 







    1 





























































































    2 




















    1 



























































































































































    1 









    1 




























    1 

    1 







    1 

























    1 





    1 


































































    1 


































































































































    1 




    1 


    1 









    1 


    1 

    1 








    1 






    1 


    1 


















    1 


    1 







    1 






    1 
    1 






    1 


    1 




    1 


















    1 















    1 



    1 
    1 


    1 








    1 






























































































    1 





    1 





    1 















    1 
    1 





    1 






    1 


























    1 
    1 

















    1 



    1 















    1 






    1 




























    3 

    1 
    2 


















    2 


    3 





    1 
    1 



    2 


    2 











































    2 















    2 









    1 
    1 























































































    2 












    2 

































    1 




    1 




    1 





    1 













    1 









    2 














    2 









    1 















    1 




























    1 



    1 
























    1 





    1 





























    1 





    1 

    2 










    2 




    2 












    2 
































    3 


    3 



    1 





    1 









    2 












    2 




    1 






    1 



    1 









    1 



































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  fs/eventpoll.c (Efficient event retrieval implementation)
 *  Copyright (C) 2001,...,2009         Davide Libenzi
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
#include <linux/capability.h>
#include <net/busy_poll.h>

/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epnested_mutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (rwlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a rwlock (ep->lock) because we manipulate objects
 * from inside the poll callback, that might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * The epnested_mutex is acquired when inserting an epoll fd onto another
 * epoll fd. We do this so that we walk the epoll tree and ensure that this
 * insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock. We need a global mutex to prevent two
 * simultaneous inserts (A into B and B into A) from racing and
 * constructing a cycle without either insert observing that it is
 * going to.
 * It is necessary to acquire multiple "ep->mtx"es at once in the
 * case when one epoll fd is added to another. In this case, we
 * always acquire the locks in the order of nesting (i.e. after
 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
 * before e2->mtx). Since we disallow cycles of epoll file
 * descriptors, this ensures that the mutexes are well-ordered. In
 * order to communicate this nesting to lockdep, when walking a tree
 * of epoll file descriptors, we use the current recursion depth as
 * the lockdep subkey.
 * It is possible to drop the "ep->mtx" and to use the global
 * mutex "epnested_mutex" (together with "ep->lock") to have it working,
 * but having "ep->mtx" will make the interface more scalable.
 * Events that require holding "epnested_mutex" are very rare, while for
 * normal operations the epoll private "ep->mtx" will guarantee
 * a better scalability.
 */

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)

#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
                                EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)

/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

#define EP_UNACTIVE_PTR ((void *) -1L)

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

/*
 * (file, fd) pair describing a monitored descriptor. ep_set_ffd() fills it
 * in and ep_cmp_ffd() orders two of them: file pointer first, then fd.
 */
struct epoll_filefd {
        struct file *file;
        int fd;
} __packed;

/* Wait structure used by the poll hooks */
struct eppoll_entry {
        /*
         * Link used to chain this structure to the "struct epitem".
         * NOTE(review): presumably the chain starts at epitem->pwqlist --
         * confirm where entries are queued.
         */
        struct eppoll_entry *next;

        /*
         * The "base" pointer is set to the container "struct epitem";
         * ep_item_from_wait() returns it.
         */
        struct epitem *base;

        /*
         * Wait queue item that will be linked to the target file wait
         * queue head. ep_pwq_from_wait() maps it back to this structure.
         */
        wait_queue_entry_t wait;

        /* The wait queue head that linked the "wait" wait queue item */
        wait_queue_head_t *whead;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct: there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
        union {
                /* RB tree node that links this structure to the eventpoll RB tree */
                struct rb_node rbn;
                /* Used to free the struct epitem */
                struct rcu_head rcu;
        };

        /*
         * List header used to link this structure to the eventpoll ready
         * list; ep_is_linked() reports whether it is currently linked.
         */
        struct list_head rdllink;

        /*
         * Works together with "struct eventpoll"->ovflist in keeping the
         * singly linked chain of items.
         */
        struct epitem *next;

        /* The file descriptor information this item refers to */
        struct epoll_filefd ffd;

        /*
         * Protected by file->f_lock; true for a to-be-released epitem already
         * removed from the "struct file" items list. Together with
         * eventpoll->refcount it orchestrates "struct eventpoll" disposal.
         */
        bool dying;

        /* List containing poll wait queues */
        struct eppoll_entry *pwqlist;

        /* The "container" of this item */
        struct eventpoll *ep;

        /* List header used to link this item to the "struct file" items list */
        struct hlist_node fllink;

        /* wakeup_source used when EPOLLWAKEUP is set */
        struct wakeup_source __rcu *ws;

        /* The structure that describes the interested events and the source fd */
        struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
        /*
         * This mutex is used to ensure that files are not removed
         * while epoll is using them. This is held during the event
         * collection loop, the file cleanup path, the epoll file exit
         * code and the ctl operations.
         */
        struct mutex mtx;

        /* Wait queue used by sys_epoll_wait() */
        wait_queue_head_t wq;

        /* Wait queue used by file->poll() */
        wait_queue_head_t poll_wait;

        /* List of ready file descriptors */
        struct list_head rdllist;

        /* Lock which protects rdllist and ovflist */
        rwlock_t lock;

        /* RB tree root used to store monitored fd structs */
        struct rb_root_cached rbr;

        /*
         * This is a singly linked list that chains all the "struct epitem"
         * whose events happened while transferring ready events to
         * userspace without holding ->lock. ep_events_available() treats
         * any value other than EP_UNACTIVE_PTR as events possibly pending.
         */
        struct epitem *ovflist;

        /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
        struct wakeup_source *ws;

        /* The user that created the eventpoll descriptor */
        struct user_struct *user;

        /* NOTE(review): presumably the epoll file backing this instance -- confirm. */
        struct file *file;

        /* used to optimize loop detection check */
        u64 gen;
        struct hlist_head refs;

        /*
         * usage count, used together with epitem->dying to
         * orchestrate the disposal of this struct
         */
        refcount_t refcount;

#ifdef CONFIG_NET_RX_BUSY_POLL
        /* used to track busy poll napi_id */
        unsigned int napi_id;
        /* busy poll timeout; zero falls back to the system-wide setting */
        u32 busy_poll_usecs;
        /* busy poll packet budget; zero selects BUSY_POLL_BUDGET */
        u16 busy_poll_budget;
        /* handed to napi_busy_loop() by ep_busy_loop() */
        bool prefer_busy_poll;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* tracks wakeup nests for lockdep validation */
        u8 nests;
#endif
};

/*
 * Wrapper struct used by poll queueing: bundles a poll_table with a
 * pointer to a "struct epitem".
 */
struct ep_pqueue {
        poll_table pt;
        struct epitem *epi;
};

/*
 * Tunables exposed under /proc/sys/fs/epoll/
 */
/* Per-user cap on the number of descriptors watched through epoll */
static long max_user_watches __read_mostly;

/* Global mutex used for cycle detection (level 1 in the LOCKING notes) */
static DEFINE_MUTEX(epnested_mutex);

/* Zero-initialized like every static; pairs with "struct eventpoll"->gen */
static u64 loop_check_gen;

/* Consulted when checking for epoll file descriptor inclusion loops */
static struct eventpoll *inserting_into;

/* Slab cache that "struct epitem" objects are allocated from */
static struct kmem_cache *epi_cache __ro_after_init;

/* Slab cache that "struct eppoll_entry" objects are allocated from */
static struct kmem_cache *pwq_cache __ro_after_init;

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epnested_mutex.
 */
struct epitems_head {
        /* list_file() maps file->f_ep back to this structure through this member */
        struct hlist_head epitems;
        /* NULL while the head is not linked on tfile_check_list (see list_file()) */
        struct epitems_head *next;
};
/* Head of the check list; EP_UNACTIVE_PTR, not NULL, is its initial value */
static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;

/* Slab cache that free_ephead() returns "struct epitems_head" objects to */
static struct kmem_cache *ephead_cache __ro_after_init;

/* Return @head to ephead_cache; a NULL @head is silently ignored. */
static inline void free_ephead(struct epitems_head *head)
{
        if (!head)
                return;

        kmem_cache_free(ephead_cache, head);
}

/*
 * Push the epitems head that @file->f_ep points into onto tfile_check_list,
 * unless it is already there.
 */
static void list_file(struct file *file)
{
        struct epitems_head *head =
                container_of(file->f_ep, struct epitems_head, epitems);

        /*
         * A non-NULL ->next marks a head that is already linked: the list
         * starts out as EP_UNACTIVE_PTR, so even its last element has a
         * non-NULL ->next.
         */
        if (head->next)
                return;

        head->next = tfile_check_list;
        tfile_check_list = head;
}

/*
 * Reset @head's check-list marker and free the head when it no longer
 * tracks any epitem.
 *
 * If the head has no first entry it is freed right away. Otherwise the
 * f_lock of the first epitem's file is taken, emptiness is re-checked under
 * that lock (the head is kept if entries remain) and ->next is set back to
 * NULL, the state list_file() tests for before linking a head.
 *
 * NOTE(review): rcu_dereference() suggests the caller runs inside an RCU
 * read-side critical section -- confirm against the call sites.
 */
static void unlist_file(struct epitems_head *head)
{
        struct epitems_head *to_free = head;
        struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
        if (p) {
                struct epitem *epi= container_of(p, struct epitem, fllink);
                spin_lock(&epi->ffd.file->f_lock);
                /* Entries still hang off this head: it must stay allocated. */
                if (!hlist_empty(&head->epitems))
                        to_free = NULL;
                head->next = NULL;
                spin_unlock(&epi->ffd.file->f_lock);
        }
        /* free_ephead() ignores NULL, so no check is needed here. */
        free_ephead(to_free);
}

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Bounds handed to the sysctl handler below: [0, LONG_MAX]. */
static long long_zero;
static long long_max = LONG_MAX;

/*
 * sysctl table for epoll. It holds a single entry, "max_user_watches",
 * backed by the max_user_watches variable with mode 0644.
 */
static struct ctl_table epoll_table[] = {
        {
                .procname        = "max_user_watches",
                .data                = &max_user_watches,
                .maxlen                = sizeof(max_user_watches),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
                .extra1                = &long_zero,
                .extra2                = &long_max,
        },
};

/*
 * Register epoll_table under the "fs/epoll" sysctl path. The value returned
 * by register_sysctl() is not checked.
 */
static void __init epoll_sysctls_init(void)
{
        register_sysctl("fs/epoll", epoll_table);
}
#else
#define epoll_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

static const struct file_operations eventpoll_fops;

/* Non-zero when @f is an epoll file, i.e. its f_op is eventpoll_fops. */
static inline int is_file_epoll(struct file *f)
{
        return f->f_op == &eventpoll_fops;
}

/* Fill in @ffd, the key used for the RB tree, from a (file, fd) pair */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
                              struct file *file, int fd)
{
        /* The two stores are independent of each other. */
        ffd->fd = fd;
        ffd->file = file;
}

/*
 * Compare two RB tree keys: order by file pointer first and, for the same
 * file, by fd. Returns a positive value, a negative value or zero.
 */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
                             struct epoll_filefd *p2)
{
        if (p1->file > p2->file)
                return +1;
        if (p1->file < p2->file)
                return -1;

        /* Same file: the fd difference decides. */
        return p1->fd - p2->fd;
}

/*
 * Tells us if the item is currently linked, i.e. whether its rdllink list
 * node is non-empty. Returns non-zero when linked.
 */
static inline int ep_is_linked(struct epitem *epi)
{
        return !list_empty(&epi->rdllink);
}

/* Get the "struct eppoll_entry" that embeds the wait queue entry @p */
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
        return container_of(p, struct eppoll_entry, wait);
}

/* Map a wait queue entry back to the "struct epitem" it was queued for */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
        struct eppoll_entry *pwq = ep_pwq_from_wait(p);

        return pwq->base;
}

/**
 * ep_events_available - Checks if ready events might be available.
 *
 * @ep: Pointer to the eventpoll context.
 *
 * Lockless check: looks at the ready list first and, if that is empty, at
 * whether the overflow list holds anything other than %EP_UNACTIVE_PTR.
 *
 * Return: a value different than %zero if ready events are available,
 *          or %zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
        if (!list_empty_careful(&ep->rdllist))
                return 1;

        return READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}

#ifdef CONFIG_NET_RX_BUSY_POLL
/**
 * busy_loop_ep_timeout - check if busy poll has timed out.
 *
 * @start_time: The start time used to compute the remaining time until timeout.
 * @ep: Pointer to the eventpoll context.
 *
 * A non-zero timeout stored in the epoll instance @ep takes precedence;
 * when none is set the system-wide global is used via busy_loop_timeout().
 *
 * Return: true if the timeout has expired, false otherwise.
 */
static bool busy_loop_ep_timeout(unsigned long start_time,
                                 struct eventpoll *ep)
{
        unsigned long ep_usecs = READ_ONCE(ep->busy_poll_usecs);
        unsigned long deadline;
        unsigned long now;

        /* No per-instance timeout configured: defer to the global one. */
        if (!ep_usecs)
                return busy_loop_timeout(start_time);

        deadline = start_time + ep_usecs;
        now = busy_loop_current_time();

        return time_after(now, deadline);
}

/*
 * Tell whether busy polling applies to this epoll instance: either the
 * instance carries its own non-zero busy poll timeout, or
 * net_busy_loop_on() reports busy polling as enabled.
 *
 * ->busy_poll_usecs is read with READ_ONCE(), the same way
 * busy_loop_ep_timeout() reads it: no lock is taken around this lockless
 * read, so the access is marked to keep the compiler from tearing or
 * re-reading it.
 */
static bool ep_busy_loop_on(struct eventpoll *ep)
{
        return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on();
}

static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
        struct eventpoll *ep = p;

        return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);
}

/*
 * Busy poll when it is enabled for this instance, a usable NAPI ID has been
 * recorded and no events are pending; the busy loop returns on
 * need_resched or once ep_events_available().
 *
 * Busy polling must be done with irqs enabled.
 *
 * Returns true when events became available while polling, false otherwise.
 */
static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
        unsigned int napi_id = READ_ONCE(ep->napi_id);
        u16 budget = READ_ONCE(ep->busy_poll_budget);
        bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);

        /* An unset budget means "use the default". */
        if (!budget)
                budget = BUSY_POLL_BUDGET;

        /* Nothing to poll without a real NAPI ID or with busy poll off. */
        if (napi_id < MIN_NAPI_ID || !ep_busy_loop_on(ep))
                return false;

        /* A non-blocking caller gets no loop-end callback. */
        napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
                       ep, prefer_busy_poll, budget);

        if (ep_events_available(ep))
                return true;

        /*
         * Busy poll timed out.  Drop NAPI ID for now, we can add
         * it back in when we have moved a socket with a valid NAPI
         * ID onto the ready list.
         */
        ep->napi_id = 0;

        return false;
}

/*
 * Set epoll busy poll NAPI ID from sk.
 */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
        struct eventpoll *ep = epi->ep;
        unsigned int napi_id;
        struct socket *sock;
        struct sock *sk;

        if (!ep_busy_loop_on(ep))
                return;

        sock = sock_from_file(epi->ffd.file);
        if (!sock)
                return;

        sk = sock->sk;
        if (!sk)
                return;

        napi_id = READ_ONCE(sk->sk_napi_id);

        /* Non-NAPI IDs can be rejected
         *        or
         * Nothing to do if we already have this ID
         */
        if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
                return;

        /* record NAPI ID for use in next busy poll */
        ep->napi_id = napi_id;
}

static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
                                  unsigned long arg)
{
        struct eventpoll *ep = file->private_data;
        void __user *uarg = (void __user *)arg;
        struct epoll_params epoll_params;

        switch (cmd) {
        case EPIOCSPARAMS:
                if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))
                        return -EFAULT;

                /* pad byte must be zero */
                if (epoll_params.__pad)
                        return -EINVAL;

                if (epoll_params.busy_poll_usecs > S32_MAX)
                        return -EINVAL;

                if (epoll_params.prefer_busy_poll > 1)
                        return -EINVAL;

                if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&
                    !capable(CAP_NET_ADMIN))
                        return -EPERM;

                WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);
                WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);
                WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);
                return 0;
        case EPIOCGPARAMS:
                memset(&epoll_params, 0, sizeof(epoll_params));
                epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
                epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
                epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
                if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
                        return -EFAULT;
                return 0;
        default:
                return -ENOIOCTLCMD;
        }
}

#else

/* Stub for !CONFIG_NET_RX_BUSY_POLL: never busy polls, always returns false */
static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
{
        return false;
}

/* Stub for !CONFIG_NET_RX_BUSY_POLL: there is no NAPI ID to record */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}

/*
 * Stub for !CONFIG_NET_RX_BUSY_POLL: the busy poll parameter ioctls are
 * rejected with -EOPNOTSUPP.
 */
static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
                                  unsigned long arg)
{
        return -EOPNOTSUPP;
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

/*
 * As described in commit 0ccf831cb lockdep: annotate epoll
 * the use of wait queues used by epoll is done in a very controlled
 * manner. Wake ups can nest inside each other, but are never done
 * with the same locking. For example:
 *
 *   dfd = socket(...);
 *   efd1 = epoll_create();
 *   efd2 = epoll_create();
 *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
 *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
 *
 * When a packet arrives to the device underneath "dfd", the net code will
 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
 * callback wakeup entry on that queue, and the wake_up() performed by the
 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
 * (efd1) notices that it may have some event ready, so it needs to wake up
 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
 * that ends up in another wake_up(), after having checked the
 * recursion constraints, that is, no more than EP_MAX_NESTS, to avoid
 * stack blasting.
 *
 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
 * this special case of epoll.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC

static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
                             unsigned pollflags)
{
        struct eventpoll *ep_src;
        unsigned long flags;
        u8 nests = 0;

        /*
         * To set the subclass or nesting level for spin_lock_irqsave_nested()
         * it might be natural to create a per-cpu nest count. However, since
         * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
         * schedule() in the -rt kernel, the per-cpu variable are no longer
         * protected. Thus, we are introducing a per eventpoll nest field.
         * If we are not being call from ep_poll_callback(), epi is NULL and
         * we are at the first level of nesting, 0. Otherwise, we are being
         * called from ep_poll_callback() and if a previous wakeup source is
         * not an epoll file itself, we are at depth 1 since the wakeup source
         * is depth 0. If the wakeup source is a previous epoll file in the
         * wakeup chain then we use its nests value and record ours as
         * nests + 1. The previous epoll file nests value is stable since its
         * already holding its own poll_wait.lock.
         */
        if (epi) {
                if ((is_file_epoll(epi->ffd.file))) {
                        ep_src = epi->ffd.file->private_data;
                        nests = ep_src->nests;
                } else {
                        nests = 1;
                }
        }
        spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
        ep->nests = nests + 1;
        wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
        ep->nests = 0;
        spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}

#else

/*
 * Variant used without CONFIG_DEBUG_LOCK_ALLOC: wake the waiters on
 * ep->poll_wait with EPOLLIN plus @pollflags as the key. @epi is unused here.
 */
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
                             __poll_t pollflags)
{
        wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
}

#endif

/*
 * Detach one eppoll_entry from the wait queue head it was added to, unless
 * ->whead has already been cleared (the POLLFREE path of ep_poll_callback()
 * stores NULL there).
 */
static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
        wait_queue_head_t *whead;

        rcu_read_lock();
        /*
         * If it is cleared by POLLFREE, it should be rcu-safe.
         * If we read NULL we need a barrier paired with
         * smp_store_release() in ep_poll_callback(), otherwise
         * we rely on whead->lock.
         */
        whead = smp_load_acquire(&pwq->whead);
        if (whead)
                remove_wait_queue(whead, &pwq->wait);
        rcu_read_unlock();
}

/*
 * Tears down every poll callback registered for @epi: each eppoll_entry is
 * popped from the head of epi->pwqlist, removed from its wait queue and
 * returned to pwq_cache.  Must be called with "mtx" held.
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
        struct eppoll_entry *entry;

        while ((entry = epi->pwqlist) != NULL) {
                /* Unlink first, the entry is freed below */
                epi->pwqlist = entry->next;
                ep_remove_wait_queue(entry);
                kmem_cache_free(pwq_cache, entry);
        }
}

/*
 * Fetch the wakeup source attached to @epi (may be NULL).
 * Call only when ep->mtx is held.
 */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
        struct wakeup_source *ws;

        ws = rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));

        return ws;
}

/*
 * Mark the item's wakeup source, if it has one, as active.
 * Call only when ep->mtx is held.
 */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
        struct wakeup_source *ws = ep_wakeup_source(epi);

        if (!ws)
                return;

        __pm_stay_awake(ws);
}

static inline bool ep_has_wakeup_source(struct epitem *epi)
{
        return rcu_access_pointer(epi->ws) ? true : false;
}

/*
 * call when ep->mtx cannot be held (ep_poll_callback)
 *
 * Same effect as ep_pm_stay_awake(), but the wakeup source pointer is
 * looked up inside an RCU read-side critical section instead of relying
 * on ep->mtx.
 */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
        struct wakeup_source *ws;

        rcu_read_lock();
        ws = rcu_dereference(epi->ws);
        if (ws)
                __pm_stay_awake(ws);
        rcu_read_unlock();
}


/*
 * Begin a scan of the ready list: move everything currently on
 * ep->rdllist over to @txlist and switch the poll callback to the
 * overflow list.  Pairs with ep_done_scan().
 *
 * ep->mtx needs to be held because we could be hit by
 * eventpoll_release_file() and epoll_ctl().
 */
static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
{
        /*
         * Steal the ready list, and re-init the original one to the
         * empty list. Also, set ep->ovflist to NULL so that events
         * happening while looping w/out locks, are not lost. We cannot
         * have the poll callback to queue directly on ep->rdllist,
         * because we want the "sproc" callback to be able to do it
         * in a lockless way.
         */
        lockdep_assert_irqs_enabled();
        write_lock_irq(&ep->lock);
        list_splice_init(&ep->rdllist, txlist);
        WRITE_ONCE(ep->ovflist, NULL);
        write_unlock_irq(&ep->lock);
}

/*
 * Finish a scan started with ep_start_scan(): move the items that were
 * chained on ep->ovflist during the scan onto ep->rdllist, re-enable direct
 * queueing on ep->rdllist, put back whatever is left on @txlist and wake up
 * the waiters on ep->wq if the ready list is not empty.
 */
static void ep_done_scan(struct eventpoll *ep,
                         struct list_head *txlist)
{
        struct epitem *epi, *nepi;

        write_lock_irq(&ep->lock);
        /*
         * During the time we spent inside the "sproc" callback, some
         * other events might have been queued by the poll callback.
         * We re-insert them inside the main ready-list here.
         */
        for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
             nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
                /*
                 * We need to check if the item is already in the list.
                 * During the "sproc" callback execution time, items are
                 * queued into ->ovflist but the "txlist" might already
                 * contain them, and the list_splice() below takes care of them.
                 */
                if (!ep_is_linked(epi)) {
                        /*
                         * ->ovflist is LIFO, so we have to reverse it in order
                         * to keep in FIFO.
                         */
                        list_add(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);
                }
        }
        /*
         * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
         * releasing the lock, events will be queued in the normal way inside
         * ep->rdllist.
         */
        WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

        /*
         * Quickly re-inject items left on "txlist".
         */
        list_splice(txlist, &ep->rdllist);
        __pm_relax(ep->ws);

        /* Something is (still) ready: let the waiters on ep->wq know */
        if (!list_empty(&ep->rdllist)) {
                if (waitqueue_active(&ep->wq))
                        wake_up(&ep->wq);
        }

        write_unlock_irq(&ep->lock);
}

/* Take an additional reference on the eventpoll structure */
static void ep_get(struct eventpoll *ep)
{
        refcount_inc(&ep->refcount);
}

/*
 * Drops one reference on @ep.
 * Returns true if the event poll can be disposed, i.e. this was the last
 * reference; in that case the RB tree is expected to be empty already.
 */
static bool ep_refcount_dec_and_test(struct eventpoll *ep)
{
        bool last = refcount_dec_and_test(&ep->refcount);

        if (last)
                WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));

        return last;
}

/*
 * Release what an eventpoll structure owns and free it: destroy the mutex,
 * drop the user reference obtained in ep_alloc(), unregister the wakeup
 * source and finally free the structure itself.
 */
static void ep_free(struct eventpoll *ep)
{
        mutex_destroy(&ep->mtx);
        free_uid(ep->user);
        wakeup_source_unregister(ep->ws);
        kfree(ep);
}

/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 * If the dying flag is set, do the removal only if force is true.
 * This prevents ep_clear_and_put() from dropping all the ep references
 * while running concurrently with eventpoll_release_file().
 * Returns true if the eventpoll can be disposed.
 *
 * Note that the poll wait queue hooks are removed even when the function
 * bails out early because of the dying flag.
 */
static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
{
        struct file *file = epi->ffd.file;
        struct epitems_head *to_free;
        struct hlist_head *head;

        lockdep_assert_irqs_enabled();

        /*
         * Removes poll wait queue hooks.
         */
        ep_unregister_pollwait(ep, epi);

        /* Remove the current item from the list of epoll hooks */
        spin_lock(&file->f_lock);
        if (epi->dying && !force) {
                spin_unlock(&file->f_lock);
                return false;
        }

        to_free = NULL;
        head = file->f_ep;
        /* epi is the only entry on the file's list: the list goes away */
        if (head->first == &epi->fllink && !epi->fllink.next) {
                file->f_ep = NULL;
                if (!is_file_epoll(file)) {
                        struct epitems_head *v;
                        v = container_of(head, struct epitems_head, epitems);
                        if (!smp_load_acquire(&v->next))
                                to_free = v;
                }
        }
        hlist_del_rcu(&epi->fllink);
        spin_unlock(&file->f_lock);
        free_ephead(to_free);

        rb_erase_cached(&epi->rbn, &ep->rbr);

        /* Take the item off the ready list, if it is queued there */
        write_lock_irq(&ep->lock);
        if (ep_is_linked(epi))
                list_del_init(&epi->rdllink);
        write_unlock_irq(&ep->lock);

        wakeup_source_unregister(ep_wakeup_source(epi));
        /*
         * At this point it is safe to free the eventpoll item. Use the union
         * field epi->rcu, since we are trying to minimize the size of
         * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
         * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
         * use of the rbn field.
         */
        kfree_rcu(epi, rcu);

        percpu_counter_dec(&ep->user->epoll_watches);
        return ep_refcount_dec_and_test(ep);
}

/*
 * ep_remove variant for callers owning an additional reference to the ep.
 * Because of that extra reference the removal must never report the
 * eventpoll as disposable; warn if it does.
 */
static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
{
        WARN_ON_ONCE(__ep_remove(ep, epi, false));
}

/*
 * Empty an eventpoll of all its items, drop the caller's reference and free
 * the structure when that was the last reference.
 */
static void ep_clear_and_put(struct eventpoll *ep)
{
        struct rb_node *node, *following;
        bool last_ref;

        /* We need to release all tasks waiting for these file */
        if (waitqueue_active(&ep->poll_wait))
                ep_poll_safewake(ep, NULL, 0);

        mutex_lock(&ep->mtx);

        /*
         * First pass: walk the whole tree and unregister the poll callbacks
         * of every item.
         */
        node = rb_first_cached(&ep->rbr);
        while (node) {
                ep_unregister_pollwait(ep, rb_entry(node, struct epitem, rbn));
                cond_resched();
                node = rb_next(node);
        }

        /*
         * Second pass: walk the whole tree again and try to free each
         * "struct epitem".  ep_remove_safe() will not remove the epitem in
         * case of a racing eventpoll_release_file(); the latter will do the
         * removal.  At this point we are sure no poll callbacks will be
         * lingering around.  Since we still own a reference to the eventpoll
         * struct, the loop can't dispose it.
         */
        node = rb_first_cached(&ep->rbr);
        while (node) {
                /* Fetch the successor before the current item is removed */
                following = rb_next(node);
                ep_remove_safe(ep, rb_entry(node, struct epitem, rbn));
                cond_resched();
                node = following;
        }

        last_ref = ep_refcount_dec_and_test(ep);
        mutex_unlock(&ep->mtx);

        if (last_ref)
                ep_free(ep);
}

/*
 * ioctl entry point of epoll files.  Only the busy poll parameter commands
 * (EPIOCSPARAMS / EPIOCGPARAMS) are forwarded; a non-epoll file or any other
 * command yields -EINVAL.
 */
static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
                               unsigned long arg)
{
        if (!is_file_epoll(file))
                return -EINVAL;

        if (cmd == EPIOCSPARAMS || cmd == EPIOCGPARAMS)
                return ep_eventpoll_bp_ioctl(file, cmd, arg);

        return -EINVAL;
}

/*
 * ->release() of epoll files: clear the eventpoll stored in private_data,
 * when there is one, and drop its reference.  Always returns 0.
 */
static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
        struct eventpoll *ep = file->private_data;

        if (!ep)
                return 0;

        ep_clear_and_put(ep);
        return 0;
}

static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);

/*
 * Poll an epoll file.
 *
 * @file: the epoll file being polled.
 * @wait: poll table of the caller, registered on ep->poll_wait.
 * @depth: nesting level, used as the mutex_lock_nested() subclass.
 *
 * Returns EPOLLIN | EPOLLRDNORM as soon as one item of the ready list
 * reports a wanted event, 0 otherwise.
 */
static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
{
        struct eventpoll *ep = file->private_data;
        struct epitem *item, *next_item;
        __poll_t revents = 0;
        poll_table pt;
        LIST_HEAD(txlist);

        init_poll_funcptr(&pt, NULL);

        /* Insert inside our poll wait queue */
        poll_wait(file, &ep->poll_wait, wait);

        /*
         * Proceed to find out if wanted events are really available inside
         * the ready list.
         */
        mutex_lock_nested(&ep->mtx, depth);
        ep_start_scan(ep, &txlist);
        list_for_each_entry_safe(item, next_item, &txlist, rdllink) {
                if (ep_item_poll(item, &pt, depth + 1)) {
                        revents = EPOLLIN | EPOLLRDNORM;
                        break;
                }

                /*
                 * Item has been dropped into the ready list by the poll
                 * callback, but it's not actually ready, as far as
                 * caller requested events goes. We can remove it here.
                 */
                __pm_relax(ep_wakeup_source(item));
                list_del_init(&item->rdllink);
        }
        ep_done_scan(ep, &txlist);
        mutex_unlock(&ep->mtx);

        return revents;
}

/*
 * The ffd.file pointer may be in the process of being torn down due to
 * being closed, but we may not have finished eventpoll_release() yet.
 *
 * Normally, even with the atomic_long_inc_not_zero, the file may have
 * been free'd and then gotten re-allocated to something else (since
 * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
 *
 * But for epoll, users hold the ep->mtx mutex, and as such any file in
 * the process of being free'd will block in eventpoll_release_file()
 * and thus the underlying file allocation will not be free'd, and the
 * file re-use cannot happen.
 *
 * For the same reason we can avoid a rcu_read_lock() around the
 * operation - 'ffd.file' cannot go away even if the refcount has
 * reached zero (but we must still not call out to ->poll() functions
 * etc).
 */
static struct file *epi_fget(const struct epitem *epi)
{
        struct file *file = epi->ffd.file;

        /* Only hand the file out if its count could be raised from non-zero */
        return atomic_long_inc_not_zero(&file->f_count) ? file : NULL;
}

/*
 * Poll the file behind @epi and return the reported events masked with the
 * events the item asked for.
 *
 * Differs from ep_eventpoll_poll() in that internal callers already have
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
 * is correctly annotated.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
                                 int depth)
{
        struct file *target = epi_fget(epi);
        __poll_t mask;

        /*
         * We could return EPOLLERR | EPOLLHUP or something, but let's
         * treat this more as "file doesn't exist, poll didn't happen".
         */
        if (!target)
                return 0;

        pt->_key = epi->event.events;

        /* Nested epoll files are polled one level deeper, others via vfs */
        if (is_file_epoll(target))
                mask = __ep_eventpoll_poll(target, pt, depth);
        else
                mask = vfs_poll(target, pt);

        fput(target);

        return mask & epi->event.events;
}

/* ->poll() of epoll files: poll the eventpoll starting at nesting depth 0 */
static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
{
        return __ep_eventpoll_poll(file, wait, 0);
}

#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() of epoll files: print one line per registered item (target
 * fd, event mask, user data, file position, inode number and device of the
 * target file).  Stops early once the seq_file buffer has overflowed.
 */
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct eventpoll *ep = f->private_data;
        struct rb_node *node;

        mutex_lock(&ep->mtx);
        node = rb_first_cached(&ep->rbr);
        while (node) {
                struct epitem *epi = rb_entry(node, struct epitem, rbn);
                struct file *tfile = epi->ffd.file;
                struct inode *inode = file_inode(tfile);

                seq_printf(m, "tfd: %8d events: %8x data: %16llx "
                           " pos:%lli ino:%lx sdev:%x\n",
                           epi->ffd.fd, epi->event.events,
                           (long long)epi->event.data,
                           (long long)tfile->f_pos,
                           inode->i_ino, inode->i_sb->s_dev);
                if (seq_has_overflowed(m))
                        break;

                node = rb_next(node);
        }
        mutex_unlock(&ep->mtx);
}
#endif

/*
 * File callbacks that implement the eventpoll file behaviour.
 * ->show_fdinfo is only provided when CONFIG_PROC_FS is enabled.
 */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo        = ep_show_fdinfo,
#endif
        .release        = ep_eventpoll_release,
        .poll                = ep_eventpoll_poll,
        .llseek                = noop_llseek,
        .unlocked_ioctl        = ep_eventpoll_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
};

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need to have this facility to cleanup correctly files that are
 * closed without being removed from the eventpoll interface.
 *
 * Every epitem still hooked on @file is removed, one per iteration, and an
 * eventpoll whose last reference goes away in the process is freed.
 */
void eventpoll_release_file(struct file *file)
{
        struct eventpoll *ep;
        struct epitem *epi;
        bool dispose;

        /*
         * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
         * touching the epitems list before eventpoll_release_file() can access
         * the ep->mtx.
         */
again:
        spin_lock(&file->f_lock);
        if (file->f_ep && file->f_ep->first) {
                epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
                epi->dying = true;
                /* f_lock is dropped before ep->mtx is taken below */
                spin_unlock(&file->f_lock);

                /*
                 * ep access is safe as we still own a reference to the ep
                 * struct
                 */
                ep = epi->ep;
                mutex_lock(&ep->mtx);
                dispose = __ep_remove(ep, epi, true);
                mutex_unlock(&ep->mtx);

                if (dispose)
                        ep_free(ep);
                /* Look for the next item still hooked on the file */
                goto again;
        }
        spin_unlock(&file->f_lock);
}

/*
 * Allocate and initialize an empty eventpoll structure holding a single
 * reference.  On success the new structure is stored in *@pep and 0 is
 * returned; -ENOMEM is returned when the allocation fails, in which case
 * *@pep is left untouched.
 */
static int ep_alloc(struct eventpoll **pep)
{
        struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);

        if (unlikely(!ep))
                return -ENOMEM;

        /* Locks and wait queues */
        mutex_init(&ep->mtx);
        rwlock_init(&ep->lock);
        init_waitqueue_head(&ep->wq);
        init_waitqueue_head(&ep->poll_wait);

        /* Empty ready list, item tree and overflow list */
        INIT_LIST_HEAD(&ep->rdllist);
        ep->rbr = RB_ROOT_CACHED;
        ep->ovflist = EP_UNACTIVE_PTR;

        /* Owner and initial reference */
        ep->user = get_current_user();
        refcount_set(&ep->refcount, 1);

        *pep = ep;
        return 0;
}

/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
        int kcmp;
        struct rb_node *rbp;
        struct epitem *epi, *epir = NULL;
        struct epoll_filefd ffd;

        ep_set_ffd(&ffd, file, fd);
        for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
                epi = rb_entry(rbp, struct epitem, rbn);
                kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
                if (kcmp > 0)
                        rbp = rbp->rb_right;
                else if (kcmp < 0)
                        rbp = rbp->rb_left;
                else {
                        epir = epi;
                        break;
                }
        }

        return epir;
}

#ifdef CONFIG_KCMP
/*
 * Walk the items in tree order and return the one whose target fd equals
 * @tfd after skipping the first @toff matches, or NULL if there are not
 * enough matches.
 */
static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
{
        struct rb_node *node = rb_first_cached(&ep->rbr);

        while (node) {
                struct epitem *cur = rb_entry(node, struct epitem, rbn);

                if (cur->ffd.fd == tfd) {
                        if (!toff)
                                return cur;
                        toff--;
                }
                cond_resched();
                node = rb_next(node);
        }

        return NULL;
}

/*
 * Return the raw file pointer of the item of epoll file @file that watches
 * target fd @tfd, skipping the first @toff matches.  No reference is taken
 * on the returned file here.
 *
 * Returns ERR_PTR(-EINVAL) when @file is not an epoll file and
 * ERR_PTR(-ENOENT) when no such item exists.
 */
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
                                     unsigned long toff)
{
        struct eventpoll *ep;
        struct epitem *found;
        struct file *raw;

        if (!is_file_epoll(file))
                return ERR_PTR(-EINVAL);

        ep = file->private_data;

        mutex_lock(&ep->mtx);
        found = ep_find_tfd(ep, tfd, toff);
        raw = found ? found->ffd.file : ERR_PTR(-ENOENT);
        mutex_unlock(&ep->mtx);

        return raw;
}
#endif /* CONFIG_KCMP */

/*
 * Adds a new entry to the tail of the list in a lockless way, i.e.
 * multiple CPUs are allowed to call this function concurrently.
 *
 * @new: entry to add; it must be in the "points to itself" state for the
 *       addition to take place.
 * @head: head of the list the entry is appended to.
 *
 * Beware: it is necessary to prevent any other modifications of the
 *         existing list until all changes are completed, in other words
 *         concurrent list_add_tail_lockless() calls should be protected
 *         with a read lock, where write lock acts as a barrier which
 *         makes sure all list_add_tail_lockless() calls are fully
 *         completed.
 *
 *        Also an element can be locklessly added to the list only in one
 *        direction i.e. either to the tail or to the head, otherwise
 *        concurrent access will corrupt the list.
 *
 * Return: %false if element has been already added to the list, %true
 * otherwise.
 */
static inline bool list_add_tail_lockless(struct list_head *new,
                                          struct list_head *head)
{
        struct list_head *prev;

        /*
         * This is simple 'new->next = head' operation, but cmpxchg()
         * is used in order to detect that same element has been just
         * added to the list from another CPU: the winner observes
         * new->next == new.
         */
        if (!try_cmpxchg(&new->next, &new, head))
                return false;

        /*
         * Initially ->next of a new element must be updated with the head
         * (we are inserting to the tail) and only then pointers are atomically
         * exchanged.  XCHG guarantees memory ordering, thus ->next should be
         * updated before pointers are actually swapped and pointers are
         * swapped before prev->next is updated.
         */

        prev = xchg(&head->prev, new);

        /*
         * It is safe to modify prev->next and new->prev, because a new element
         * is added only to the tail and new->next is updated before XCHG.
         */

        prev->next = new;
        new->prev = prev;

        return true;
}

/*
 * Chains a new epi entry onto ep->ovflist in a lockless way, i.e. multiple
 * CPUs are allowed to call this function concurrently.  The entry becomes
 * the new ep->ovflist and its ->next points to the previous one.
 *
 * @epi: item to chain; it is only chained if its ->next is still
 *       EP_UNACTIVE_PTR.
 *
 * Return: %false if epi element has been already chained, %true otherwise.
 */
static inline bool chain_epi_lockless(struct epitem *epi)
{
        struct eventpoll *ep = epi->ep;

        /* Fast preliminary check */
        if (epi->next != EP_UNACTIVE_PTR)
                return false;

        /* Check that the same epi has not been just chained from another CPU */
        if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
                return false;

        /* Atomically install epi as ep->ovflist, linking it to the old value */
        epi->next = xchg(&ep->ovflist, epi);

        return true;
}

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 *
 * This callback takes a read lock in order not to contend with concurrent
 * events from another file descriptor, thus all modifications to ->rdllist
 * or ->ovflist are lockless.  Read lock is paired with the write lock from
 * ep_start/done_scan(), which stops all list modifications and guarantees
 * that lists state is seen correctly.
 *
 * Another thing worth to mention is that ep_poll_callback() can be called
 * concurrently for the same @epi from different CPUs if poll table was inited
 * with several wait queues entries.  Plural wakeup from different CPUs of a
 * single wait queue is serialized by wq.lock, but the case when multiple wait
 * queues are used should be detected accordingly.  This is detected using
 * cmpxchg() operation.
 *
 * @wait: wait queue entry embedded in the eppoll_entry of the item.
 * @mode, @sync: not used by this function.
 * @key: event mask of the wakeup, decoded with key_to_poll(); a zero mask
 *       skips the event match test.
 *
 * Return: 1 when the item is not EPOLLEXCLUSIVE.  For an EPOLLEXCLUSIVE item,
 * 1 only if the item was not filtered out, a waiter was active on ep->wq, the
 * wakeup is not POLLFREE and the EPOLLIN/EPOLLOUT part of the mask is either
 * empty or exactly one of the two bits and requested by the item; 0 otherwise.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        int pwake = 0;
        struct epitem *epi = ep_item_from_wait(wait);
        struct eventpoll *ep = epi->ep;
        __poll_t pollflags = key_to_poll(key);
        unsigned long flags;
        int ewake = 0;

        read_lock_irqsave(&ep->lock, flags);

        ep_set_busy_poll_napi_id(epi);

        /*
         * If the event mask does not contain any poll(2) event, we consider the
         * descriptor to be disabled. This condition is likely the effect of the
         * EPOLLONESHOT bit that disables the descriptor when an event is received,
         * until the next EPOLL_CTL_MOD will be issued.
         */
        if (!(epi->event.events & ~EP_PRIVATE_BITS))
                goto out_unlock;

        /*
         * Check the events coming with the callback. At this stage, not
         * every device reports the events in the "key" parameter of the
         * callback. We need to be able to handle both cases here, hence the
         * test for "key" != NULL before the event match test.
         */
        if (pollflags && !(pollflags & epi->event.events))
                goto out_unlock;

        /*
         * If we are transferring events to userspace, we can hold no locks
         * (because we're accessing user memory, and because of linux f_op->poll()
         * semantics). All the events that happen during that period of time are
         * chained in ep->ovflist and requeued later on.
         */
        if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
                if (chain_epi_lockless(epi))
                        ep_pm_stay_awake_rcu(epi);
        } else if (!ep_is_linked(epi)) {
                /* In the usual case, add event to ready list. */
                if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
                        ep_pm_stay_awake_rcu(epi);
        }

        /*
         * Wake up ( if active ) both the eventpoll wait list and the ->poll()
         * wait list.
         */
        if (waitqueue_active(&ep->wq)) {
                if ((epi->event.events & EPOLLEXCLUSIVE) &&
                                        !(pollflags & POLLFREE)) {
                        /*
                         * A mask with both EPOLLIN and EPOLLOUT set matches
                         * no case below and leaves ewake at 0.
                         */
                        switch (pollflags & EPOLLINOUT_BITS) {
                        case EPOLLIN:
                                if (epi->event.events & EPOLLIN)
                                        ewake = 1;
                                break;
                        case EPOLLOUT:
                                if (epi->event.events & EPOLLOUT)
                                        ewake = 1;
                                break;
                        case 0:
                                ewake = 1;
                                break;
                        }
                }
                wake_up(&ep->wq);
        }
        /* The ->poll() waiters are woken after the lock has been dropped */
        if (waitqueue_active(&ep->poll_wait))
                pwake++;

out_unlock:
        read_unlock_irqrestore(&ep->lock, flags);

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);

        if (!(epi->event.events & EPOLLEXCLUSIVE))
                ewake = 1;

        if (pollflags & POLLFREE) {
                /*
                 * If we race with ep_remove_wait_queue() it can miss
                 * ->whead = NULL and do another remove_wait_queue() after
                 * us, so we can't use __remove_wait_queue().
                 */
                list_del_init(&wait->entry);
                /*
                 * ->whead != NULL protects us from the race with
                 * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
                 * takes whead->lock held by the caller. Once we nullify it,
                 * nothing protects ep/epi or even wait.
                 */
                smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
        }

        return ewake;
}

/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 *
 * A new eppoll_entry is allocated, hooked on @whead (exclusively when the
 * item has EPOLLEXCLUSIVE set) and pushed on the item's pwqlist.  When the
 * allocation fails, epq->epi is cleared so that the failure is visible to
 * the code that set up the ep_pqueue and later invocations do nothing.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
        struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
        struct epitem *epi = epq->epi;
        struct eppoll_entry *entry;

        /* An earlier allocation has failed */
        if (unlikely(!epi))
                return;

        entry = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
        if (unlikely(!entry)) {
                epq->epi = NULL;
                return;
        }

        init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
        entry->whead = whead;
        entry->base = epi;

        if (!(epi->event.events & EPOLLEXCLUSIVE))
                add_wait_queue(whead, &entry->wait);
        else
                add_wait_queue_exclusive(whead, &entry->wait);

        /* Push the entry on the head of the item's list of poll hooks */
        entry->next = epi->pwqlist;
        epi->pwqlist = entry;
}

/*
 * Link @epi into the cached rbtree of @ep, ordered by ep_cmp_ffd() on the
 * items' ->ffd keys. Items comparing greater go to the right; everything
 * else goes to the left.
 */
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
        struct rb_node **link = &ep->rbr.rb_root.rb_node;
        struct rb_node *parent = NULL;
        bool leftmost = true;

        while (*link) {
                struct epitem *cur;

                parent = *link;
                cur = rb_entry(parent, struct epitem, rbn);
                if (ep_cmp_ffd(&epi->ffd, &cur->ffd) > 0) {
                        /* Any step to the right means we are not leftmost. */
                        link = &parent->rb_right;
                        leftmost = false;
                } else {
                        link = &parent->rb_left;
                }
        }

        rb_link_node(&epi->rbn, parent, link);
        rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
}



#define PATH_ARR_SIZE 5
/*
 * Per-depth caps on the number of wakeup paths that may emanate from a
 * single file of interest, for path lengths 1 to 5. Limiting the potential
 * wakeup paths avoids massive uncontrolled wakeup storms. The common use
 * case is a single ep connected to n file sources, where every file source
 * has exactly one path of length 1, so the numbers below should be more
 * than sufficient.
 *
 * Note that path_count_inc() never enforces the first entry: the shallowest
 * depth (nests == 0) is allowed an arbitrary number of paths.
 *
 * The limits are enforced during an EPOLL_CTL_ADD operation, since a modify
 * and delete can't add additional paths. Protected by the epnested_mutex.
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];

/*
 * Account one more path at nesting level @nests.
 *
 * Returns 0 while the level is within its limit and -1 once the limit has
 * been exceeded. Level 0 is neither counted nor limited.
 */
static int path_count_inc(int nests)
{
        if (nests != 0) {
                path_count[nests] += 1;
                if (path_count[nests] > path_limits[nests])
                        return -1;
        }

        return 0;
}

/* Reset every per-depth path counter back to zero. */
static void path_count_init(void)
{
        int *slot = path_count;
        int *end = path_count + PATH_ARR_SIZE;

        while (slot < end)
                *slot++ = 0;
}

/*
 * Walk upwards from the epitems hanging off @refs. An item whose owning
 * eventpoll has an empty ->refs list ends a path, which is accounted at
 * @depth; otherwise the walk recurses into that list one level deeper.
 *
 * Returns 0 on success, -1 when nesting exceeds EP_MAX_NESTS or a path
 * limit has been hit.
 */
static int reverse_path_check_proc(struct hlist_head *refs, int depth)
{
        struct epitem *epi;
        int error;

        /* too deep nesting */
        if (depth > EP_MAX_NESTS)
                return -1;

        /* CTL_DEL can remove links here, but that can't increase our count */
        hlist_for_each_entry_rcu(epi, refs, fllink) {
                struct hlist_head *upper = &epi->ep->refs;

                error = hlist_empty(upper) ?
                        path_count_inc(depth) :
                        reverse_path_check_proc(upper, depth + 1);
                if (error != 0)
                        return error;
        }

        return 0;
}

/**
 * reverse_path_check - The tfile_check_list is a list of epitems_head, which
 *                      have links that are proposed to be newly added. We
 *                      need to make sure that those added links don't add
 *                      too many paths such that we will spend all our time
 *                      waking up eventpoll objects.
 *
 * The path counters are reset before each list entry is examined, and each
 * walk runs inside an RCU read-side critical section.
 *
 * Return: %zero if the proposed links don't create too many paths,
 *            %-1 otherwise.
 */
static int reverse_path_check(void)
{
        struct epitems_head *p = tfile_check_list;
        int error = 0;

        while (p != EP_UNACTIVE_PTR) {
                path_count_init();

                rcu_read_lock();
                error = reverse_path_check_proc(&p->epitems, 0);
                rcu_read_unlock();

                if (error)
                        break;
                p = p->next;
        }

        return error;
}

static int ep_create_wakeup_source(struct epitem *epi)
{
        struct name_snapshot n;
        struct wakeup_source *ws;

        if (!epi->ep->ws) {
                epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
                if (!epi->ep->ws)
                        return -ENOMEM;
        }

        take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
        ws = wakeup_source_register(NULL, n.name.name);
        release_dentry_name_snapshot(&n);

        if (!ws)
                return -ENOMEM;
        rcu_assign_pointer(epi->ws, ws);

        return 0;
}

/*
 * Detach and unregister the wakeup source of @epi.
 *
 * rare code path, only used when EPOLL_CTL_MOD removes a wakeup source
 */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
        /* Grab the pointer before it is cleared so it can be unregistered. */
        struct wakeup_source *ws = ep_wakeup_source(epi);

        RCU_INIT_POINTER(epi->ws, NULL);

        /*
         * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
         * used internally by wakeup_source_remove, too (called by
         * wakeup_source_unregister), so we cannot use call_rcu
         */
        synchronize_rcu();
        wakeup_source_unregister(ws);
}

/*
 * Link @epi into the list of epitems attached to @file (file->f_ep),
 * installing a list head first if @file does not have one yet.
 *
 * For an epoll file the head is the ->refs list of its eventpoll; for any
 * other file an epitems_head is allocated from ephead_cache.
 *
 * Returns 0 on success or -ENOMEM if the head allocation fails.
 */
static int attach_epitem(struct file *file, struct epitem *epi)
{
        struct epitems_head *to_free = NULL;
        struct hlist_head *head = NULL;
        struct eventpoll *ep = NULL;

        if (is_file_epoll(file))
                ep = file->private_data;

        if (ep) {
                head = &ep->refs;
        } else if (!READ_ONCE(file->f_ep)) {
                /*
                 * Unlocked peek: no head seems to be installed, so allocate
                 * one before taking f_lock. Also the re-entry point when the
                 * locked check below finds that a head is needed after all.
                 */
allocate:
                to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
                if (!to_free)
                        return -ENOMEM;
                head = &to_free->epitems;
        }
        spin_lock(&file->f_lock);
        if (!file->f_ep) {
                if (unlikely(!head)) {
                        /*
                         * f_ep was set at the unlocked peek but is NULL now;
                         * drop the lock and go allocate a head.
                         */
                        spin_unlock(&file->f_lock);
                        goto allocate;
                }
                file->f_ep = head;
                /* Our head is now in use, it must not be freed below. */
                to_free = NULL;
        }
        hlist_add_head_rcu(&epi->fllink, file->f_ep);
        spin_unlock(&file->f_lock);
        /* Non-NULL only if we allocated a head that ended up unused. */
        free_ephead(to_free);
        return 0;
}

/*
 * Add a new item watching @tfile / @fd with the interest mask from @event
 * to @ep.
 *
 * Must be called with "mtx" held.
 *
 * Returns 0 on success, -ENOSPC when the user's epoll_watches counter has
 * reached max_user_watches, -ENOMEM on allocation failure, -EINVAL when
 * @full_check is set and reverse_path_check() rejects the new links, or
 * the error returned by ep_create_wakeup_source().
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
                     struct file *tfile, int fd, int full_check)
{
        int error, pwake = 0;
        __poll_t revents;
        struct epitem *epi;
        struct ep_pqueue epq;
        struct eventpoll *tep = NULL;

        /* tep stays NULL unless the target file is itself an epoll file. */
        if (is_file_epoll(tfile))
                tep = tfile->private_data;

        lockdep_assert_irqs_enabled();

        /* Account the new watch up front; undone explicitly on early failure. */
        if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
                                            max_user_watches) >= 0))
                return -ENOSPC;
        percpu_counter_inc(&ep->user->epoll_watches);

        if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
                percpu_counter_dec(&ep->user->epoll_watches);
                return -ENOMEM;
        }

        /* Item initialization follow here ... */
        INIT_LIST_HEAD(&epi->rdllink);
        epi->ep = ep;
        ep_set_ffd(&epi->ffd, tfile, fd);
        epi->event = *event;
        epi->next = EP_UNACTIVE_PTR;

        /* The target epoll's mtx is held across attach and rbtree insert. */
        if (tep)
                mutex_lock_nested(&tep->mtx, 1);
        /* Add the current item to the list of active epoll hook for this file */
        if (unlikely(attach_epitem(tfile, epi) < 0)) {
                if (tep)
                        mutex_unlock(&tep->mtx);
                kmem_cache_free(epi_cache, epi);
                percpu_counter_dec(&ep->user->epoll_watches);
                return -ENOMEM;
        }

        if (full_check && !tep)
                list_file(tfile);

        /*
         * Add the current item to the RB tree. All RB tree operations are
         * protected by "mtx", and ep_insert() is called with "mtx" held.
         */
        ep_rbtree_insert(ep, epi);
        if (tep)
                mutex_unlock(&tep->mtx);

        /*
         * ep_remove_safe() calls in the later error paths can't lead to
         * ep_free() as the ep file itself still holds an ep reference.
         */
        ep_get(ep);

        /* now check if we've created too many backpaths */
        if (unlikely(full_check && reverse_path_check())) {
                ep_remove_safe(ep, epi);
                return -EINVAL;
        }

        if (epi->event.events & EPOLLWAKEUP) {
                error = ep_create_wakeup_source(epi);
                if (error) {
                        ep_remove_safe(ep, epi);
                        return error;
                }
        }

        /* Initialize the poll table using the queue callback */
        epq.epi = epi;
        init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

        /*
         * Attach the item to the poll hooks and get current event bits.
         * We can safely use the file* here because its usage count has
         * been increased by the caller of this function. Note that after
         * this operation completes, the poll callback can start hitting
         * the new item.
         */
        revents = ep_item_poll(epi, &epq.pt, 1);

        /*
         * We have to check if something went wrong during the poll wait queue
         * install process. Namely an allocation for a wait queue failed due
         * high memory pressure.
         * (ep_ptable_queue_proc() signals this by clearing epq.epi.)
         */
        if (unlikely(!epq.epi)) {
                ep_remove_safe(ep, epi);
                return -ENOMEM;
        }

        /* We have to drop the new item inside our item list to keep track of it */
        write_lock_irq(&ep->lock);

        /* record NAPI ID of new item if present */
        ep_set_busy_poll_napi_id(epi);

        /* If the file is already "ready" we drop it inside the ready list */
        if (revents && !ep_is_linked(epi)) {
                list_add_tail(&epi->rdllink, &ep->rdllist);
                ep_pm_stay_awake(epi);

                /* Notify waiting tasks that events are available */
                if (waitqueue_active(&ep->wq))
                        wake_up(&ep->wq);
                if (waitqueue_active(&ep->poll_wait))
                        pwake++;
        }

        write_unlock_irq(&ep->lock);

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(ep, NULL, 0);

        return 0;
}

/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 *
 * The item's event mask and data are replaced with those of @event, its
 * wakeup source is created or destroyed to match EPOLLWAKEUP, and the item
 * is queued on the ready list if the file already reports events.
 *
 * Always returns 0.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
                     const struct epoll_event *event)
{
        int pwake = 0;
        poll_table pt;

        lockdep_assert_irqs_enabled();

        /* NULL queue callback: poll for status only, register nothing new. */
        init_poll_funcptr(&pt, NULL);

        /*
         * Set the new event interest mask before calling f_op->poll();
         * otherwise we might miss an event that happens between the
         * f_op->poll() call and the new event set registering.
         */
        epi->event.events = event->events; /* need barrier below */
        epi->event.data = event->data; /* protected by mtx */
        if (epi->event.events & EPOLLWAKEUP) {
                /*
                 * NOTE(review): the return value of ep_create_wakeup_source()
                 * is ignored here, so an -ENOMEM leaves EPOLLWAKEUP set in the
                 * mask without a wakeup source - confirm this is intended.
                 */
                if (!ep_has_wakeup_source(epi))
                        ep_create_wakeup_source(epi);
        } else if (ep_has_wakeup_source(epi)) {
                ep_destroy_wakeup_source(epi);
        }

        /*
         * The following barrier has two effects:
         *
         * 1) Flush epi changes above to other CPUs.  This ensures
         *    we do not miss events from ep_poll_callback if an
         *    event occurs immediately after we call f_op->poll().
         *    We need this because we did not take ep->lock while
         *    changing epi above (but ep_poll_callback does take
         *    ep->lock).
         *
         * 2) We also need to ensure we do not miss _past_ events
         *    when calling f_op->poll().  This barrier also
         *    pairs with the barrier in wq_has_sleeper (see
         *    comments for wq_has_sleeper).
         *
         * This barrier will now guarantee ep_poll_callback or f_op->poll
         * (or both) will notice the readiness of an item.
         */
        smp_mb();

        /*
         * Get current event bits. We can safely use the file* here because
         * its usage count has been increased by the caller of this function.
         * If the item is "hot" and it is not registered inside the ready
         * list, push it inside.
         */
        if (ep_item_poll(epi, &pt, 1)) {
                write_lock_irq(&ep->lock);
                if (!ep_is_linked(epi)) {
                        list_add_tail(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);

                        /* Notify waiting tasks that events are available */
                        if (waitqueue_active(&ep->wq))
                                wake_up(&ep->wq);
                        if (waitqueue_active(&ep->poll_wait))
                                pwake++;
                }
                write_unlock_irq(&ep->lock);
        }

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(ep, NULL, 0);

        return 0;
}

/*
 * Copy up to @maxevents ready events from @ep into the userspace array
 * @events.
 *
 * Takes ep->mtx for the duration of the scan. Each candidate item is
 * re-polled; items that report nothing are dropped from the list.
 *
 * Returns the number of events delivered, -EINTR if a fatal signal is
 * pending, or -EFAULT if writing to userspace failed before any event was
 * delivered (if some were already delivered, their count is returned).
 */
static int ep_send_events(struct eventpoll *ep,
                          struct epoll_event __user *events, int maxevents)
{
        struct epitem *epi, *tmp;
        LIST_HEAD(txlist);
        poll_table pt;
        int res = 0;

        /*
         * Always short-circuit for fatal signals to allow threads to make a
         * timely exit without the chance of finding more events available and
         * fetching repeatedly.
         */
        if (fatal_signal_pending(current))
                return -EINTR;

        /* NULL queue callback: poll for status only. */
        init_poll_funcptr(&pt, NULL);

        mutex_lock(&ep->mtx);
        ep_start_scan(ep, &txlist);

        /*
         * We can loop without lock because we are passed a task private list.
         * Items cannot vanish during the loop we are holding ep->mtx.
         */
        list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
                struct wakeup_source *ws;
                __poll_t revents;

                /* User buffer is full; the remaining items stay on txlist. */
                if (res >= maxevents)
                        break;

                /*
                 * Activate ep->ws before deactivating epi->ws to prevent
                 * triggering auto-suspend here (in case we reactive epi->ws
                 * below).
                 *
                 * This could be rearranged to delay the deactivation of epi->ws
                 * instead, but then epi->ws would temporarily be out of sync
                 * with ep_is_linked().
                 */
                ws = ep_wakeup_source(epi);
                if (ws) {
                        if (ws->active)
                                __pm_stay_awake(ep->ws);
                        __pm_relax(ws);
                }

                list_del_init(&epi->rdllink);

                /*
                 * If the event mask intersect the caller-requested one,
                 * deliver the event to userspace. Again, we are holding ep->mtx,
                 * so no operations coming from userspace can change the item.
                 */
                revents = ep_item_poll(epi, &pt, 1);
                if (!revents)
                        continue;

                events = epoll_put_uevent(revents, epi->event.data, events);
                if (!events) {
                        /*
                         * The copy to userspace failed: put the item back at
                         * the head of txlist so the event is not lost, and
                         * report -EFAULT only if nothing was delivered yet.
                         */
                        list_add(&epi->rdllink, &txlist);
                        ep_pm_stay_awake(epi);
                        if (!res)
                                res = -EFAULT;
                        break;
                }
                res++;
                if (epi->event.events & EPOLLONESHOT)
                        epi->event.events &= EP_PRIVATE_BITS;
                else if (!(epi->event.events & EPOLLET)) {
                        /*
                         * If this file has been added with Level
                         * Trigger mode, we need to insert back inside
                         * the ready list, so that the next call to
                         * epoll_wait() will check again the events
                         * availability. At this point, no one can insert
                         * into ep->rdllist besides us. The epoll_ctl()
                         * callers are locked out by
                         * ep_send_events() holding "mtx" and the
                         * poll callback will queue them in ep->ovflist.
                         */
                        list_add_tail(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);
                }
        }
        ep_done_scan(ep, &txlist);
        mutex_unlock(&ep->mtx);

        return res;
}

/*
 * Convert a relative timeout of @ms milliseconds into the timespec @to.
 *
 * A negative @ms yields NULL and leaves @to untouched. Zero yields an
 * all-zero timespec. Any positive value yields the current ktime_get_ts64()
 * time plus the timeout, combined with timespec64_add_safe().
 *
 * Returns @to, or NULL for a negative @ms.
 */
static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
{
        if (ms < 0)
                return NULL;

        /* For ms == 0 both fields come out as zero. */
        to->tv_sec = ms / MSEC_PER_SEC;
        to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);

        if (ms != 0) {
                struct timespec64 now;

                ktime_get_ts64(&now);
                *to = timespec64_add_safe(now, *to);
        }

        return to;
}

/*
 * autoremove_wake_function, but remove even on failure to wake up, because we
 * know that default_wake_function/ttwu will only fail if the thread is already
 * woken, and in that case the ep_poll loop will remove the entry anyways, not
 * try to reuse it.
 *
 * Returns whatever default_wake_function() returned.
 */
static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
                                       unsigned int mode, int sync, void *key)
{
        int ret = default_wake_function(wq_entry, mode, sync, key);

        /*
         * Pairs with list_empty_careful in ep_poll, and ensures future loop
         * iterations see the cause of this wakeup.
         */
        list_del_init_careful(&wq_entry->entry);
        return ret;
}

/**
 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           timespec. If the timeout is zero, the function will not block,
 *           while if the @timeout ptr is NULL, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Return: the number of ready events which have been fetched, or an
 *          error code, in case of error. Zero is returned when the timeout
 *          expires without events, and -EINTR when a signal is pending
 *          before the task goes to sleep.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, struct timespec64 *timeout)
{
        int res, eavail, timed_out = 0;
        u64 slack = 0;
        wait_queue_entry_t wait;
        ktime_t expires, *to = NULL;

        lockdep_assert_irqs_enabled();

        /*
         * Non-zero timeout: sleep with an absolute expiry (the sleep below
         * uses HRTIMER_MODE_ABS). A NULL @timeout leaves "to" NULL.
         */
        if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
                slack = select_estimate_accuracy(timeout);
                to = &expires;
                *to = timespec64_to_ktime(*timeout);
        } else if (timeout) {
                /*
                 * Avoid the unnecessary trip to the wait queue loop, if the
                 * caller specified a non blocking operation.
                 */
                timed_out = 1;
        }

        /*
         * This call is racy: We may or may not see events that are being added
         * to the ready list under the lock (e.g., in IRQ callbacks). For cases
         * with a non-zero timeout, this thread will check the ready list under
         * lock and will add to the wait queue.  For cases with a zero
         * timeout, the user by definition should not care and will have to
         * recheck again.
         */
        eavail = ep_events_available(ep);

        while (1) {
                if (eavail) {
                        /*
                         * Try to transfer events to user space. In case we get
                         * 0 events and there's still timeout left over, we go
                         * trying again in search of more luck.
                         */
                        res = ep_send_events(ep, events, maxevents);
                        if (res)
                                return res;
                }

                if (timed_out)
                        return 0;

                /* timed_out is always 0 here, we returned above otherwise. */
                eavail = ep_busy_loop(ep, timed_out);
                if (eavail)
                        continue;

                if (signal_pending(current))
                        return -EINTR;

                /*
                 * Internally init_wait() uses autoremove_wake_function(),
                 * thus wait entry is removed from the wait queue on each
                 * wakeup. Why it is important? In case of several waiters
                 * each new wakeup will hit the next waiter, giving it the
                 * chance to harvest new event. Otherwise wakeup can be
                 * lost. This is also good performance-wise, because on
                 * normal wakeup path no need to call __remove_wait_queue()
                 * explicitly, thus ep->lock is not taken, which halts the
                 * event delivery.
                 *
                 * In fact, we now use an even more aggressive function that
                 * unconditionally removes, because we don't reuse the wait
                 * entry between loop iterations. This lets us also avoid the
                 * performance issue if a process is killed, causing all of its
                 * threads to wake up without being removed normally.
                 */
                init_wait(&wait);
                wait.func = ep_autoremove_wake_function;

                write_lock_irq(&ep->lock);
                /*
                 * Barrierless variant, waitqueue_active() is called under
                 * the same lock on wakeup ep_poll_callback() side, so it
                 * is safe to avoid an explicit barrier.
                 */
                __set_current_state(TASK_INTERRUPTIBLE);

                /*
                 * Do the final check under the lock. ep_start/done_scan()
                 * plays with two lists (->rdllist and ->ovflist) and there
                 * is always a race when both lists are empty for short
                 * period of time although events are pending, so lock is
                 * important.
                 */
                eavail = ep_events_available(ep);
                if (!eavail)
                        __add_wait_queue_exclusive(&ep->wq, &wait);

                write_unlock_irq(&ep->lock);

                /* Sleep only if nothing showed up during the locked check. */
                if (!eavail)
                        timed_out = !schedule_hrtimeout_range(to, slack,
                                                              HRTIMER_MODE_ABS);
                __set_current_state(TASK_RUNNING);

                /*
                 * We were woken up, thus go and try to harvest some events.
                 * If timed out and still on the wait queue, recheck eavail
                 * carefully under lock, below.
                 */
                eavail = 1;

                if (!list_empty_careful(&wait.entry)) {
                        write_lock_irq(&ep->lock);
                        /*
                         * If the thread timed out and is not on the wait queue,
                         * it means that the thread was woken up after its
                         * timeout expired before it could reacquire the lock.
                         * Thus, when wait.entry is empty, it needs to harvest
                         * events.
                         */
                        if (timed_out)
                                eavail = list_empty(&wait.entry);
                        __remove_wait_queue(&ep->wq, &wait);
                        write_unlock_irq(&ep->lock);
                }
        }
}

/**
 * ep_loop_check_proc - verify that adding an epoll file inside another
 *                      epoll structure does not violate the constraints, in
 *                      terms of closed loops, or too deep chains (which can
 *                      result in excessive stack usage).
 *
 * @ep: the &struct eventpoll to be currently checked.
 * @depth: Current depth of the path being checked.
 *
 * Walks every item of @ep with @ep->mtx held, recursing into nested epoll
 * files and passing non-epoll files to list_file().
 *
 * Return: %zero if adding the epoll @file inside current epoll
 *          structure @ep does not violate the constraints, or %-1 otherwise.
 */
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
        int error = 0;
        struct rb_node *rbp;
        struct epitem *epi;

        /* The lock subclass grows with the recursion depth. */
        mutex_lock_nested(&ep->mtx, depth + 1);
        /* Stamp this eventpoll as visited for the current check generation. */
        ep->gen = loop_check_gen;
        for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
                epi = rb_entry(rbp, struct epitem, rbn);
                if (unlikely(is_file_epoll(epi->ffd.file))) {
                        struct eventpoll *ep_tovisit;
                        ep_tovisit = epi->ffd.file->private_data;
                        /* Already visited during this generation, skip it. */
                        if (ep_tovisit->gen == loop_check_gen)
                                continue;
                        /*
                         * Reaching the eventpoll we are inserting into closes
                         * a loop; exceeding EP_MAX_NESTS is too deep a chain.
                         */
                        if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
                                error = -1;
                        else
                                error = ep_loop_check_proc(ep_tovisit, depth + 1);
                        if (error != 0)
                                break;
                } else {
                        /*
                         * If we've reached a file that is not associated with
                         * an ep, then we need to check if the newly added
                         * links are going to add too many wakeup paths. We do
                         * this by adding it to the tfile_check_list, if it's
                         * not already there, and calling reverse_path_check()
                         * during ep_insert().
                         */
                        list_file(epi->ffd.file);
                }
        }
        mutex_unlock(&ep->mtx);

        return error;
}

/**
 * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
 *                 into another epoll file (represented by @ep) does not create
 *                 closed loops or too deep chains.
 *
 * @ep: Pointer to the epoll we are inserting into.
 * @to: Pointer to the epoll to be inserted.
 *
 * Records @ep in inserting_into and then walks @to starting at depth 0.
 *
 * Return: %zero if adding the epoll @to inside the epoll @ep
 * does not violate the constraints, or %-1 otherwise.
 */
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
        inserting_into = ep;
        return ep_loop_check_proc(to, 0);
}

/*
 * Empty tfile_check_list: pop entries off the front one at a time and pass
 * each to unlist_file(), until the EP_UNACTIVE_PTR terminator is reached.
 * The whole walk runs inside an RCU read-side critical section.
 */
static void clear_tfile_check_list(void)
{
        struct epitems_head *head;

        rcu_read_lock();
        for (head = tfile_check_list; head != EP_UNACTIVE_PTR;
             head = tfile_check_list) {
                /* Unhook the entry from the list before handing it over. */
                tfile_check_list = head->next;
                unlist_file(head);
        }
        rcu_read_unlock();
}

/*
 * Open an eventpoll file descriptor.
 *
 * @flags may only contain EPOLL_CLOEXEC. Returns the new file descriptor
 * on success, -EINVAL for unsupported flags, or the error from ep_alloc(),
 * get_unused_fd_flags() or anon_inode_getfile().
 */
static int do_epoll_create(int flags)
{
        int error, fd;
        struct eventpoll *ep = NULL;
        struct file *file;

        /* Check the EPOLL_* constant for consistency.  */
        BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

        if (flags & ~EPOLL_CLOEXEC)
                return -EINVAL;
        /*
         * Create the internal data structure ("struct eventpoll").
         */
        error = ep_alloc(&ep);
        if (error < 0)
                return error;
        /*
         * Creates all the items needed to setup an eventpoll file. That is,
         * a file structure and a free file descriptor.
         */
        fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
        if (fd < 0) {
                error = fd;
                goto out_free_ep;
        }
        file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
                                 O_RDWR | (flags & O_CLOEXEC));
        if (IS_ERR(file)) {
                error = PTR_ERR(file);
                goto out_free_fd;
        }
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* Busy-poll parameters start out zeroed / disabled. */
        ep->busy_poll_usecs = 0;
        ep->busy_poll_budget = 0;
        ep->prefer_busy_poll = false;
#endif
        ep->file = file;
        /* Nothing below can fail, so the fd can be published now. */
        fd_install(fd, file);
        return fd;

        /* Error unwinding, in reverse order of acquisition. */
out_free_fd:
        put_unused_fd(fd);
out_free_ep:
        ep_clear_and_put(ep);
        return error;
}

/*
 * epoll_create1 system call: thin wrapper that hands @flags straight to
 * do_epoll_create(), which validates them.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
        return do_epoll_create(flags);
}

/*
 * epoll_create system call: @size only has to be positive and is otherwise
 * unused; the instance is created with no flags. Returns -EINVAL for a
 * non-positive @size.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
        if (size > 0)
                return do_epoll_create(0);

        return -EINVAL;
}

/*
 * Strip EPOLLWAKEUP from the event mask in @epev when it cannot be honoured:
 * with CONFIG_PM_SLEEP the flag is kept only if the caller is capable of
 * CAP_BLOCK_SUSPEND, without CONFIG_PM_SLEEP it is always removed.
 */
#ifdef CONFIG_PM_SLEEP
static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
{
        /* Nothing to check unless the flag was actually requested. */
        if (!(epev->events & EPOLLWAKEUP))
                return;

        if (!capable(CAP_BLOCK_SUSPEND))
                epev->events &= ~EPOLLWAKEUP;
}
#else
static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
{
        epev->events = epev->events & ~EPOLLWAKEUP;
}
#endif

/*
 * Acquire @mutex, either blocking or not.
 *
 * When @nonblock is false the mutex is taken with mutex_lock_nested() using
 * @depth as the subclass. When @nonblock is true only a trylock is attempted.
 *
 * Returns 0 once the mutex is held, -EAGAIN if the trylock failed.
 */
static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
                                   bool nonblock)
{
        if (nonblock)
                return mutex_trylock(mutex) ? 0 : -EAGAIN;

        mutex_lock_nested(mutex, depth);
        return 0;
}

/*
 * Core of epoll_ctl(2): apply @op (EPOLL_CTL_ADD, EPOLL_CTL_DEL or
 * EPOLL_CTL_MOD) for the target descriptor @fd on the epoll instance
 * referred to by @epfd.
 *
 * @epds is only read for operations for which ep_op_has_event() is true. In
 * that case its event mask may be modified in place: EPOLLWAKEUP may be
 * stripped, and EPOLLERR | EPOLLHUP are ORed in before an insert or modify.
 *
 * When @nonblock is true every mutex is taken with mutex_trylock() and
 * -EAGAIN is returned on contention instead of sleeping.
 *
 * Returns the result of ep_insert()/ep_modify(), 0 for a successful
 * EPOLL_CTL_DEL, or one of:
 *   -EBADF   @epfd or @fd does not resolve to a file
 *   -EPERM   the target file cannot be polled
 *   -EINVAL  @epfd is not an epoll file, @epfd and @fd are the same file,
 *            EPOLLEXCLUSIVE was used in a disallowed way, @op is not one of
 *            the three handled operations, or EPOLL_CTL_MOD targets an item
 *            that was added with EPOLLEXCLUSIVE
 *   -ELOOP   ep_loop_check() rejected nesting the target epoll file
 *   -EEXIST  EPOLL_CTL_ADD for a file that is already registered
 *   -ENOENT  EPOLL_CTL_DEL / EPOLL_CTL_MOD for a file that is not registered
 *   -EAGAIN  see @nonblock above
 */
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                 bool nonblock)
{
        int error;
        int full_check = 0;
        struct fd f, tf;
        struct eventpoll *ep;
        struct epitem *epi;
        struct eventpoll *tep = NULL;

        error = -EBADF;
        f = fdget(epfd);
        if (!f.file)
                goto error_return;

        /* Get the "struct file *" for the target file */
        tf = fdget(fd);
        if (!tf.file)
                goto error_fput;

        /* The target file descriptor must support poll */
        error = -EPERM;
        if (!file_can_poll(tf.file))
                goto error_tgt_fput;

        /* Check if EPOLLWAKEUP is allowed */
        if (ep_op_has_event(op))
                ep_take_care_of_epollwakeup(epds);

        /*
         * We have to check that the file structure underneath the file descriptor
         * the user passed to us _is_ an eventpoll file. And also we do not permit
         * adding an epoll file descriptor inside itself.
         */
        error = -EINVAL;
        if (f.file == tf.file || !is_file_epoll(f.file))
                goto error_tgt_fput;

        /*
         * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
         * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
         * Also, we do not currently support nested exclusive wakeups.
         */
        if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
                if (op == EPOLL_CTL_MOD)
                        goto error_tgt_fput;
                if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
                                (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
                        goto error_tgt_fput;
        }

        /*
         * At this point it is safe to assume that the "private_data" contains
         * our own data structure.
         */
        ep = f.file->private_data;

        /*
         * When we insert an epoll file descriptor inside another epoll file
         * descriptor, there is the chance of creating closed loops, which are
         * better be handled here, than in more critical paths. While we are
         * checking for loops we also determine the list of files reachable
         * and hang them on the tfile_check_list, so we can check that we
         * haven't created too many possible wakeup paths.
         *
         * We do not need to take the global 'epnested_mutex' on EPOLL_CTL_ADD
         * when the epoll file descriptor is attaching directly to a wakeup
         * source, unless the epoll file descriptor is nested. The purpose of
         * taking the 'epnested_mutex' on add is to prevent complex topologies
         * such as loops and deep wakeup paths from forming in parallel through
         * multiple EPOLL_CTL_ADD operations.
         */
        error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
        if (error)
                goto error_tgt_fput;
        if (op == EPOLL_CTL_ADD) {
                if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
                    is_file_epoll(tf.file)) {
                        /*
                         * Drop ep->mtx before taking epnested_mutex, then
                         * re-take ep->mtx once the loop check has passed.
                         * From here on full_check makes the exit path
                         * release epnested_mutex.
                         */
                        mutex_unlock(&ep->mtx);
                        error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
                        if (error)
                                goto error_tgt_fput;
                        loop_check_gen++;
                        full_check = 1;
                        if (is_file_epoll(tf.file)) {
                                tep = tf.file->private_data;
                                error = -ELOOP;
                                if (ep_loop_check(ep, tep) != 0)
                                        goto error_tgt_fput;
                        }
                        error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
                        if (error)
                                goto error_tgt_fput;
                }
        }

        /*
         * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
         * above, we can be sure to be able to use the item looked up by
         * ep_find() till we release the mutex.
         */
        epi = ep_find(ep, tf.file, fd);

        /* Default result for an unknown @op or a refused EPOLL_CTL_MOD */
        error = -EINVAL;
        switch (op) {
        case EPOLL_CTL_ADD:
                if (!epi) {
                        epds->events |= EPOLLERR | EPOLLHUP;
                        error = ep_insert(ep, epds, tf.file, fd, full_check);
                } else
                        error = -EEXIST;
                break;
        case EPOLL_CTL_DEL:
                if (epi) {
                        /*
                         * The eventpoll itself is still alive: the refcount
                         * can't go to zero here.
                         */
                        ep_remove_safe(ep, epi);
                        error = 0;
                } else {
                        error = -ENOENT;
                }
                break;
        case EPOLL_CTL_MOD:
                if (epi) {
                        /*
                         * Items registered with EPOLLEXCLUSIVE are not
                         * modified; error keeps its -EINVAL default.
                         */
                        if (!(epi->event.events & EPOLLEXCLUSIVE)) {
                                epds->events |= EPOLLERR | EPOLLHUP;
                                error = ep_modify(ep, epi, epds);
                        }
                } else
                        error = -ENOENT;
                break;
        }
        mutex_unlock(&ep->mtx);

error_tgt_fput:
        /* Only set once epnested_mutex has actually been acquired */
        if (full_check) {
                clear_tfile_check_list();
                loop_check_gen++;
                mutex_unlock(&epnested_mutex);
        }

        fdput(tf);
error_fput:
        fdput(f);
error_return:

        return error;
}

/*
 * epoll_ctl(2) entry point: the controller interface used to insert, remove
 * or change file descriptors in the interest set of an eventpoll file.
 *
 * The user supplied event is copied in only for operations that carry one
 * (ep_op_has_event()); a failed copy yields -EFAULT. The request is then
 * executed by do_epoll_ctl() in blocking mode.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                struct epoll_event __user *, event)
{
        struct epoll_event epds;

        if (ep_op_has_event(op)) {
                if (copy_from_user(&epds, event, sizeof(epds)))
                        return -EFAULT;
        }

        return do_epoll_ctl(epfd, op, fd, &epds, false);
}

/*
 * Kernel side of epoll_wait(2): validate the arguments, resolve @epfd to
 * its eventpoll instance and let ep_poll() collect the events.
 *
 * Returns the result of ep_poll(), or -EINVAL for an out-of-range
 * @maxevents or a non-epoll @epfd, -EFAULT if the user buffer fails
 * access_ok(), and -EBADF if @epfd does not resolve to a file.
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
                         int maxevents, struct timespec64 *to)
{
        struct fd f;
        int ret = -EINVAL;

        /* The event count has to lie within 1..EP_MAX_EVENTS */
        if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
                return -EINVAL;

        /* Reject an inaccessible user buffer before doing any work */
        if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
                return -EFAULT;

        f = fdget(epfd);
        if (!f.file)
                return -EBADF;

        /*
         * Only a real eventpoll file carries our own structure in
         * "private_data"; anything else keeps the -EINVAL default.
         */
        if (is_file_epoll(f.file)) {
                struct eventpoll *ep = f.file->private_data;

                /* Time to fish for events ... */
                ret = ep_poll(ep, events, maxevents, to);
        }

        fdput(f);
        return ret;
}

/*
 * epoll_wait(2) entry point: the integer @timeout is converted by
 * ep_timeout_to_timespec() and the wait itself is done by do_epoll_wait().
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
                int, maxevents, int, timeout)
{
        struct timespec64 ts;
        struct timespec64 *deadline = ep_timeout_to_timespec(&ts, timeout);

        return do_epoll_wait(epfd, events, maxevents, deadline);
}

/*
 * Kernel side of epoll_pwait(2): do_epoll_wait() bracketed by signal mask
 * handling.
 *
 * The caller supplied mask is installed first; if that fails its error is
 * returned and no wait takes place. After the wait the saved mask is
 * restored, unless the wait ended with -EINTR.
 */
static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
                          int maxevents, struct timespec64 *to,
                          const sigset_t __user *sigmask, size_t sigsetsize)
{
        int ret = set_user_sigmask(sigmask, sigsetsize);

        if (ret)
                return ret;

        ret = do_epoll_wait(epfd, events, maxevents, to);
        restore_saved_sigmask_unless(ret == -EINTR);

        return ret;
}

/*
 * epoll_pwait(2) entry point: same as epoll_wait(2) but with a signal mask
 * applied for the duration of the wait. The integer @timeout is converted
 * by ep_timeout_to_timespec().
 */
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
                int, maxevents, int, timeout, const sigset_t __user *, sigmask,
                size_t, sigsetsize)
{
        struct timespec64 ts;
        struct timespec64 *deadline = ep_timeout_to_timespec(&ts, timeout);

        return do_epoll_pwait(epfd, events, maxevents, deadline,
                              sigmask, sigsetsize);
}

/*
 * epoll_pwait2(2) entry point: like epoll_pwait(2), but the timeout is a
 * user space __kernel_timespec. A NULL @timeout is passed on as a NULL
 * timespec; otherwise -EFAULT is returned if it cannot be read and -EINVAL
 * if poll_select_set_timeout() rejects it.
 */
SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
                int, maxevents, const struct __kernel_timespec __user *, timeout,
                const sigset_t __user *, sigmask, size_t, sigsetsize)
{
        struct timespec64 ts;
        struct timespec64 *to = NULL;

        if (timeout) {
                if (get_timespec64(&ts, timeout))
                        return -EFAULT;
                if (poll_select_set_timeout(&ts, ts.tv_sec, ts.tv_nsec))
                        return -EINVAL;
                to = &ts;
        }

        return do_epoll_pwait(epfd, events, maxevents, to,
                              sigmask, sigsetsize);
}

#ifdef CONFIG_COMPAT
/*
 * Compat flavour of do_epoll_pwait(): the signal mask is a compat_sigset_t
 * and is installed with set_compat_user_sigmask(). A failure to install it
 * is returned without waiting; afterwards the saved mask is restored unless
 * the wait ended with -EINTR.
 */
static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
                                 int maxevents, struct timespec64 *timeout,
                                 const compat_sigset_t __user *sigmask,
                                 compat_size_t sigsetsize)
{
        int ret = set_compat_user_sigmask(sigmask, sigsetsize);

        if (ret)
                return ret;

        ret = do_epoll_wait(epfd, events, maxevents, timeout);
        restore_saved_sigmask_unless(ret == -EINTR);

        return ret;
}

/*
 * Compat epoll_pwait(2) entry point: the integer @timeout is converted by
 * ep_timeout_to_timespec() and the wait is done by do_compat_epoll_pwait().
 */
COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
                       struct epoll_event __user *, events,
                       int, maxevents, int, timeout,
                       const compat_sigset_t __user *, sigmask,
                       compat_size_t, sigsetsize)
{
        struct timespec64 ts;
        struct timespec64 *deadline = ep_timeout_to_timespec(&ts, timeout);

        return do_compat_epoll_pwait(epfd, events, maxevents, deadline,
                                     sigmask, sigsetsize);
}

/*
 * Compat epoll_pwait2(2) entry point. A NULL @timeout is passed on as a
 * NULL timespec; otherwise -EFAULT is returned if it cannot be read and
 * -EINVAL if poll_select_set_timeout() rejects it.
 */
COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
                       struct epoll_event __user *, events,
                       int, maxevents,
                       const struct __kernel_timespec __user *, timeout,
                       const compat_sigset_t __user *, sigmask,
                       compat_size_t, sigsetsize)
{
        struct timespec64 ts;
        struct timespec64 *to = NULL;

        if (timeout) {
                if (get_timespec64(&ts, timeout))
                        return -EFAULT;
                if (poll_select_set_timeout(&ts, ts.tv_sec, ts.tv_nsec))
                        return -EINVAL;
                to = &ts;
        }

        return do_compat_epoll_pwait(epfd, events, maxevents, to,
                                     sigmask, sigsetsize);
}

#endif

/*
 * Boot-time setup for epoll, registered through fs_initcall().
 *
 * Derives max_user_watches from the amount of low memory reported by
 * si_meminfo(), creates the slab caches for struct epitem, struct
 * eppoll_entry and struct epitems_head, and calls epoll_sysctls_init().
 * The caches are created with SLAB_PANIC and their results are not checked
 * here. Always returns 0.
 */
static int __init eventpoll_init(void)
{
        struct sysinfo si;

        si_meminfo(&si);
        /*
         * Allows top 4% of low memory to be allocated for epoll watches
         * (per user): 1/25 of the non-highmem pages, converted to bytes
         * and divided by the cost of one watch (EP_ITEM_COST).
         */
        max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
                EP_ITEM_COST;
        BUG_ON(max_user_watches < 0);

        /*
         * We can have many thousands of epitems, so prevent this from
         * using an extra cache line on 64-bit (and smaller) CPUs
         */
        BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

        /* Allocates slab cache used to allocate "struct epitem" items */
        epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

        /* Allocates slab cache used to allocate "struct eppoll_entry" */
        pwq_cache = kmem_cache_create("eventpoll_pwq",
                sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
        epoll_sysctls_init();

        /* Allocates slab cache used to allocate "struct epitems_head" */
        ephead_cache = kmem_cache_create("ep_head",
                sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

        return 0;
}
fs_initcall(eventpoll_init);























































































    9 



    1 







    4 
    4 




















   11 


   10 

   11 



    9 









    1 


















    1 





    1 




















    1 
    1 


    1 




    1 


















    1 

    1 
















































   12 



   11 




















   10 














    2 


    2 





    2 















   13 

   14 













   15 





































































































































































































































































































































































































    5 
    5 



































































    8 






    5 

























































































    6 










    5 










































    1 



























    1 






    1 
    1 





    1 





    1 



    1 


    1 



    1 






























    1 

    1 














































































































































































































    1 


























    3 






    3 











    4 






    5 
    4 

































    1 

















    1 






































































































































    3 

    3 

















    4 











    4 

    4 














































































































    3 





    3 

    3 





















    3 
































    3 











    3 




    3 














    3 







    3 

    3 










    3 







    3 























    2 


    1 


































































































































































































































































































































































































































































































































































    1 


    1 


    1 
    1 














    1 





    1 

    1 
    1 

    1 


















































































   23 




   24 







   21 


    1 












   22 
    8 

   14 
   22 










   11 








   13 






   13 









    4 
   12 
















    1 
   15 


    4 











   11 












   10 







   10 

    1 




    4 
    2 





















    1 












    1 

    1 








    1 










    1 




















    1 












    1 



    1 





    1 











    1 



































    1 





























    1 

















    1 

    1 







    1 








































    3 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002        Andrew Morton
 *                Split out of fs/inode.c
 *                Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size, expressed in pages: 4096 is the size in
 * KiB, and shifting right by (PAGE_SHIFT - 10) converts KiB into pages.
 */
#define MIN_WRITEBACK_PAGES        (4096UL >> (PAGE_SHIFT - 10))

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control.
 *
 * A work item is queued through @list. @done is only set when the issuer
 * waits for completion, and @auto_free marks items that are freed once
 * they complete.
 */
struct wb_writeback_work {
        long nr_pages;
        struct super_block *sb;
        enum writeback_sync_modes sync_mode;
        unsigned int tagged_writepages:1;
        unsigned int for_kupdate:1;
        unsigned int range_cyclic:1;
        unsigned int for_background:1;
        unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
        unsigned int auto_free:1;        /* free on completion */
        enum wb_reason reason;                /* why was writeback initiated? */

        struct list_head list;                /* pending work list */
        struct wb_completion *done;        /* set if the caller waits */
};

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
unsigned int dirtytime_expire_interval = 12 * 60 * 60;

/*
 * Map a list node back to the struct inode that embeds it as i_io_list.
 * @head must therefore point at an inode's i_io_list member.
 */
static inline struct inode *wb_inode(struct list_head *head)
{
        return list_entry(head, struct inode, i_io_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

/*
 * Mark @wb as having dirty IO and add its average write bandwidth to the
 * bdi-wide total.  Returns %true if this call performed the transition,
 * %false if @wb was already flagged as having dirty IO.
 */
static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
        /* already accounted for, nothing to do */
        if (wb_has_dirty_io(wb))
                return false;

        set_bit(WB_has_dirty_io, &wb->state);
        WARN_ON_ONCE(!wb->avg_write_bandwidth);
        atomic_long_add(wb->avg_write_bandwidth,
                        &wb->bdi->tot_write_bandwidth);
        return true;
}

/*
 * Counterpart of wb_io_lists_populated(): once ->b_dirty, ->b_io and
 * ->b_more_io are all empty, clear WB_has_dirty_io and take @wb's average
 * write bandwidth back out of the bdi-wide total.
 */
static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
        long remaining;

        if (!wb_has_dirty_io(wb))
                return;

        /* still something queued on one of the dirty IO lists */
        if (!list_empty(&wb->b_dirty) || !list_empty(&wb->b_io) ||
            !list_empty(&wb->b_more_io))
                return;

        clear_bit(WB_has_dirty_io, &wb->state);
        remaining = atomic_long_sub_return(wb->avg_write_bandwidth,
                                           &wb->bdi->tot_write_bandwidth);
        WARN_ON_ONCE(remaining < 0);
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and update %WB_has_dirty_io
 * accordingly.  Both @wb->list_lock and @inode->i_lock must be held.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
                                      struct bdi_writeback *wb,
                                      struct list_head *head)
{
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
        WARN_ON_ONCE(inode->i_state & I_FREEING);

        list_move(&inode->i_io_list, head);

        /* dirty_time doesn't count as dirty_io until expiration */
        if (head == &wb->b_dirty_time) {
                wb_io_lists_depopulated(wb);
                return false;
        }

        return wb_io_lists_populated(wb);
}

/*
 * Kick @wb's delayed work so that it runs without delay.  Nothing is
 * queued unless @wb is still registered; the check is made under
 * ->work_lock.
 */
static void wb_wakeup(struct bdi_writeback *wb)
{
        spin_lock_irq(&wb->work_lock);
        if (!test_bit(WB_registered, &wb->state))
                goto out_unlock;

        mod_delayed_work(bdi_wq, &wb->dwork, 0);
out_unlock:
        spin_unlock_irq(&wb->work_lock);
}

/*
 * This function is used when the first inode for this wb is marked dirty.
 * It arranges for the corresponding bdi thread to be woken up so that it
 * can take care of the periodic background write-out of dirty inodes.
 * Since the write-out would start only 'dirty_writeback_interval'
 * centisecs from now anyway, we just set up a timer which wakes the bdi
 * thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on
 * the fast-path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 *
 * We have to be careful not to postpone flush work if it is scheduled for
 * earlier. Thus we use queue_delayed_work().
 */
static void wb_wakeup_delayed(struct bdi_writeback *wb)
{
        /* dirty_writeback_interval is in centisecs, hence the * 10 */
        unsigned long delay = msecs_to_jiffies(dirty_writeback_interval * 10);

        spin_lock_irq(&wb->work_lock);
        if (test_bit(WB_registered, &wb->state)) {
                queue_delayed_work(bdi_wq, &wb->dwork, delay);
        }
        spin_unlock_irq(&wb->work_lock);
}

/*
 * Retire @work: free it if it is an auto_free item and, if somebody is
 * waiting on its completion, drop the completion count and wake the
 * waiters once it reaches zero.
 */
static void finish_writeback_work(struct wb_writeback_work *work)
{
        /* @work may be freed right below, so fetch ->done beforehand */
        struct wb_completion *done = work->done;
        wait_queue_head_t *waitq;

        if (work->auto_free)
                kfree(work);

        if (!done)
                return;

        /* @done can't be accessed after the following dec */
        waitq = done->waitq;
        if (atomic_dec_and_test(&done->cnt))
                wake_up_all(waitq);
}

static void wb_queue_work(struct bdi_writeback *wb,
                          struct wb_writeback_work *work)
{
        trace_writeback_queue(wb, work);

        if (work->done)
                atomic_inc(&work->done->cnt);

        spin_lock_irq(&wb->work_lock);

        if (test_bit(WB_registered, &wb->state)) {
                list_add_tail(&work->list, &wb->work_list);
                mod_delayed_work(bdi_wq, &wb->dwork, 0);
        } else
                finish_writeback_work(work);

        spin_unlock_irq(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items queued with their ->done field set to
 * @done, which should have been initialized with DEFINE_WB_COMPLETION().
 * This function returns after all such work items are completed, i.e.
 * once @done->cnt has dropped to zero.  Work items which are waited upon
 * aren't freed automatically on completion.
 */
void wb_wait_for_completion(struct wb_completion *done)
{
        atomic_dec(&done->cnt);                /* put down the initial count */
        wait_event(*done->waitq, !atomic_read(&done->cnt));
}

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Parameters for foreign inode detection, see wbc_detach_inode() to see
 * how they're used.
 *
 * These parameters are inherently heuristic as the detection target
 * itself is fuzzy.  All we want to do is detach an inode from its
 * current owner if it's being written to by some other cgroups too much.
 *
 * The current cgroup writeback is built on the assumption that multiple
 * cgroups writing to the same inode concurrently is very rare and a mode
 * of operation which isn't well supported.  As such, the goal is not
 * taking too long when a different cgroup takes over an inode while
 * avoiding too aggressive flip-flops from occasional foreign writes.
 *
 * We record, very roughly, 2s worth of IO time history and if more than
 * half of that is foreign, trigger the switch.  The recording is quantized
 * to 16 slots.  To avoid tiny writes from swinging the decision too much,
 * writes smaller than 1/8 of avg size are ignored.
 */
#define WB_FRN_TIME_SHIFT        13        /* 1s = 2^13, upto 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT        3        /* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV        8        /* ignore rounds < avg / 8 */
#define WB_FRN_TIME_PERIOD        (2 * (1 << WB_FRN_TIME_SHIFT))        /* 2s */

#define WB_FRN_HIST_SLOTS        16        /* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT        (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
                                        /* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS        (WB_FRN_HIST_SLOTS / 2)
                                        /* if foreign slots > 8, switch */
#define WB_FRN_HIST_MAX_SLOTS        (WB_FRN_HIST_THR_SLOTS / 2 + 1)
                                        /* one round can affect upto 5 slots */
#define WB_FRN_MAX_IN_FLIGHT        1024        /* don't queue too many concurrently */

/*
 * Maximum inodes per isw.  A specific value has been chosen to make
 * struct inode_switch_wbs_context fit into 1024 bytes kmalloc.  The count
 * includes the slot used for the NULL terminator of the inode array.
 */
#define WB_MAX_INODES_PER_ISW  ((1024UL - sizeof(struct inode_switch_wbs_context)) \
                                / sizeof(struct inode *))

/*
 * Number of switch contexts that have been allocated but not yet retired:
 * raised by inode_switch_wbs() and cleanup_offline_cgwb(), dropped when the
 * context is freed.
 */
static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
/* workqueue on which the switch work items are queued via queue_rcu_work() */
static struct workqueue_struct *isw_wq;

/*
 * Associate @inode with a bdi_writeback.
 *
 * @inode: inode to attach
 * @folio: folio whose memcg selects the wb, or %NULL to use the memory
 *         cgroup of the current task
 *
 * When cgroup writeback is not enabled for @inode, or no wb could be
 * obtained, the bdi's embedded wb is used.  If another caller attached a
 * wb first, the reference obtained here is dropped again.
 */
void __inode_attach_wb(struct inode *inode, struct folio *folio)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct bdi_writeback *wb = NULL;

        if (inode_cgwb_enabled(inode)) {
                struct cgroup_subsys_state *css;

                if (!folio) {
                        /* must pin memcg_css, see wb_get_create() */
                        css = task_get_css(current, memory_cgrp_id);
                        wb = wb_get_create(bdi, css, GFP_ATOMIC);
                        css_put(css);
                } else {
                        css = mem_cgroup_css_from_folio(folio);
                        wb = wb_get_create(bdi, css, GFP_ATOMIC);
                }
        }

        /* fall back to the wb embedded in the bdi */
        if (wb == NULL)
                wb = &bdi->wb;

        /*
         * Several instances of this function may race to update the same
         * inode; cmpxchg() decides the winner and the losers drop their wb.
         */
        if (unlikely(cmpxchg(&inode->i_wb, NULL, wb) != NULL))
                wb_put(wb);
}
EXPORT_SYMBOL_GPL(__inode_attach_wb);

/**
 * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
 * @inode: inode of interest with i_lock held
 * @wb: target bdi_writeback
 *
 * Remove the inode from wb's io lists and, if necessary, put it onto the
 * b_attached list.  Only inodes attached to cgwb's are kept on this list;
 * for the bdi's own wb the inode is simply unlinked.
 */
static void inode_cgwb_move_to_attached(struct inode *inode,
                                        struct bdi_writeback *wb)
{
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
        WARN_ON_ONCE(inode->i_state & I_FREEING);

        inode->i_state &= ~I_SYNC_QUEUED;

        if (wb == &wb->bdi->wb) {
                /* the bdi's embedded wb keeps no b_attached list entries */
                list_del_init(&inode->i_io_list);
        } else {
                list_move(&inode->i_io_list, &wb->b_attached);
        }

        wb_io_lists_depopulated(wb);
}

/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
        __releases(&inode->i_lock)
        __acquires(&wb->list_lock)
{
        for (;;) {
                struct bdi_writeback *wb = inode_to_wb(inode);

                /*
                 * The inode_to_wb() association is protected by both
                 * @inode->i_lock and @wb->list_lock, but list_lock nests
                 * outside i_lock.  Pin the wb, drop i_lock, take list_lock
                 * and then re-check that the association is unchanged.
                 */
                wb_get(wb);
                spin_unlock(&inode->i_lock);
                spin_lock(&wb->list_lock);

                /* i_wb may have changed inbetween, can't use inode_to_wb() */
                if (unlikely(wb != inode->i_wb)) {
                        /* lost the race: undo everything and try again */
                        spin_unlock(&wb->list_lock);
                        wb_put(wb);
                        cpu_relax();
                        spin_lock(&inode->i_lock);
                        continue;
                }

                wb_put(wb);        /* @inode already has ref */
                return wb;
        }
}

/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
 * on entry.  The i_lock is taken here and released by
 * locked_inode_to_wb_and_lock_list() before it returns.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
        __acquires(&wb->list_lock)
{
        spin_lock(&inode->i_lock);
        return locked_inode_to_wb_and_lock_list(inode);
}

/*
 * Context of one asynchronous wb switch.  Allocated zeroed by
 * inode_switch_wbs() or cleanup_offline_cgwb() and freed at the end of
 * inode_switch_wbs_work_fn().
 */
struct inode_switch_wbs_context {
        /* queued on isw_wq with queue_rcu_work() */
        struct rcu_work                work;

        /*
         * Multiple inodes can be switched at once.  The switching procedure
         * consists of two parts, separated by a RCU grace period.  To make
         * sure that the second part is executed for each inode gone through
         * the first part, all inode pointers are placed into a NULL-terminated
         * array embedded into struct inode_switch_wbs_context.  Otherwise
         * an inode could be left in a non-consistent state.
         */
        struct bdi_writeback        *new_wb;        /* wb the inodes are switched to */
        struct inode                *inodes[];        /* NULL-terminated */
};

/*
 * Take @bdi->wb_switch_rwsem for writing.  inode_switch_wbs_work_fn()
 * takes the same rwsem for reading around the actual switch.
 */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
        down_write(&bdi->wb_switch_rwsem);
}

/* Release @bdi->wb_switch_rwsem taken by bdi_down_write_wb_switch_rwsem(). */
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
        up_write(&bdi->wb_switch_rwsem);
}

/*
 * Switch a single inode from @old_wb to @new_wb.
 *
 * Takes @inode->i_lock and the i_pages lock itself.  The caller in this
 * file, inode_switch_wbs_work_fn(), additionally holds the list_lock of
 * both @old_wb and @new_wb around the call.
 *
 * The per-wb WB_RECLAIMABLE / WB_WRITEBACK stats and the writeback_inodes
 * count are transferred, @inode->i_wb is repointed with a new reference on
 * @new_wb, the inode is moved to the matching IO list of @new_wb and the
 * foreign detection state is reset.  I_WB_SWITCH is cleared whether or not
 * the switch took place.
 *
 * Returns %true if the inode was switched, %false if it was skipped
 * because I_FREEING or I_WILL_FREE is set.
 */
static bool inode_do_switch_wbs(struct inode *inode,
                                struct bdi_writeback *old_wb,
                                struct bdi_writeback *new_wb)
{
        struct address_space *mapping = inode->i_mapping;
        XA_STATE(xas, &mapping->i_pages, 0);
        struct folio *folio;
        bool switched = false;

        spin_lock(&inode->i_lock);
        xa_lock_irq(&mapping->i_pages);

        /*
         * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction
         * path owns the inode and we shouldn't modify ->i_io_list.
         */
        if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
                goto skip_switch;

        trace_inode_switch_wbs(inode, old_wb, new_wb);

        /*
         * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
         * to possibly dirty folios while PAGECACHE_TAG_WRITEBACK points to
         * folios actually under writeback.
         */
        xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
                if (folio_test_dirty(folio)) {
                        long nr = folio_nr_pages(folio);
                        wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr);
                        wb_stat_mod(new_wb, WB_RECLAIMABLE, nr);
                }
        }

        /* rewind the cursor for the second walk over the mapping */
        xas_set(&xas, 0);
        xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
                long nr = folio_nr_pages(folio);
                WARN_ON_ONCE(!folio_test_writeback(folio));
                wb_stat_mod(old_wb, WB_WRITEBACK, -nr);
                wb_stat_mod(new_wb, WB_WRITEBACK, nr);
        }

        if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
                atomic_dec(&old_wb->writeback_inodes);
                atomic_inc(&new_wb->writeback_inodes);
        }

        /*
         * Reference for the new @inode->i_wb pointer.  The reference on
         * @old_wb is dropped by the caller, see wb_put_many() in
         * inode_switch_wbs_work_fn().
         */
        wb_get(new_wb);

        /*
         * Transfer to @new_wb's IO list if necessary.  If the @inode is dirty,
         * the specific list @inode was on is ignored and the @inode is put on
         * ->b_dirty which is always correct including from ->b_dirty_time.
         * The transfer preserves @inode->dirtied_when ordering.  If the @inode
         * was clean, it means it was on the b_attached list, so move it onto
         * the b_attached list of @new_wb.
         */
        if (!list_empty(&inode->i_io_list)) {
                inode->i_wb = new_wb;

                if (inode->i_state & I_DIRTY_ALL) {
                        struct inode *pos;

                        /*
                         * Stop at the first entry that was dirtied no later
                         * than @inode and link @inode right in front of it.
                         */
                        list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
                                if (time_after_eq(inode->dirtied_when,
                                                  pos->dirtied_when))
                                        break;
                        inode_io_list_move_locked(inode, new_wb,
                                                  pos->i_io_list.prev);
                } else {
                        inode_cgwb_move_to_attached(inode, new_wb);
                }
        } else {
                inode->i_wb = new_wb;
        }

        /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
        switched = true;
skip_switch:
        /*
         * Paired with load_acquire in unlocked_inode_to_wb_begin() and
         * ensures that the new wb is visible if they see !I_WB_SWITCH.
         */
        smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);

        xa_unlock_irq(&mapping->i_pages);
        spin_unlock(&inode->i_lock);

        return switched;
}

/*
 * Second half of a wb switch, run as an rcu_work item.
 *
 * Switches every inode of the NULL-terminated isw->inodes[] array from its
 * current wb to isw->new_wb.  All inodes of one context are expected to
 * share the same old wb, which is read from the first array entry.
 * Afterwards the inode references and the reference on new_wb held by the
 * context are dropped, the context is freed and isw_nr_in_flight is
 * decremented.
 */
static void inode_switch_wbs_work_fn(struct work_struct *work)
{
        struct inode_switch_wbs_context *isw =
                container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
        struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
        struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
        struct bdi_writeback *new_wb = isw->new_wb;
        unsigned long nr_switched = 0;
        struct inode **inodep;

        /*
         * If @inode switches cgwb membership while sync_inodes_sb() is
         * being issued, sync_inodes_sb() might miss it.  Synchronize.
         */
        down_read(&bdi->wb_switch_rwsem);

        /*
         * By the time control reaches here, RCU grace period has passed
         * since I_WB_SWITCH assertion and all wb stat update transactions
         * between unlocked_inode_to_wb_begin/end() are guaranteed to be
         * synchronizing against the i_pages lock.
         *
         * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
         * gives us exclusion against all wb related operations on @inode
         * including IO list manipulations and stat updates.
         *
         * The two list_locks are taken lower address first, presumably to
         * keep a consistent lock order between concurrent switches.
         */
        if (old_wb < new_wb) {
                spin_lock(&old_wb->list_lock);
                spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
        } else {
                spin_lock(&new_wb->list_lock);
                spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
        }

        for (inodep = isw->inodes; *inodep; inodep++) {
                WARN_ON_ONCE((*inodep)->i_wb != old_wb);
                if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
                        nr_switched++;
        }

        spin_unlock(&new_wb->list_lock);
        spin_unlock(&old_wb->list_lock);

        up_read(&bdi->wb_switch_rwsem);

        if (nr_switched) {
                wb_wakeup(new_wb);
                /* one old_wb reference per inode that actually moved */
                wb_put_many(old_wb, nr_switched);
        }

        /* drop the inode references taken in inode_prepare_wbs_switch() */
        for (inodep = isw->inodes; *inodep; inodep++)
                iput(*inodep);
        wb_put(new_wb);
        kfree(isw);
        atomic_dec(&isw_nr_in_flight);
}

/*
 * First half of a wb switch for one inode: mark @inode with I_WB_SWITCH
 * and take a reference on it.
 *
 * Returns %true if @inode has been claimed for switching to @new_wb.
 * Returns %false, leaving @inode untouched, if it is a DAX inode, its
 * superblock is not active, it is already being switched or freed, or it
 * is already associated with @new_wb.
 */
static bool inode_prepare_wbs_switch(struct inode *inode,
                                     struct bdi_writeback *new_wb)
{
        bool claimed = false;

        /*
         * Paired with smp_mb() in cgroup_writeback_umount().
         * isw_nr_in_flight must be increased before checking SB_ACTIVE and
         * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0
         * in cgroup_writeback_umount() and the isw_wq will be not flushed.
         */
        smp_mb();

        if (IS_DAX(inode))
                return false;

        /* while holding I_WB_SWITCH, no one else can update the association */
        spin_lock(&inode->i_lock);
        if ((inode->i_sb->s_flags & SB_ACTIVE) &&
            !(inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE)) &&
            inode_to_wb(inode) != new_wb) {
                inode->i_state |= I_WB_SWITCH;
                __iget(inode);
                claimed = true;
        }
        spin_unlock(&inode->i_lock);

        return claimed;
}

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb; looked up as a memory cgroup css ID
 *
 * Switch @inode's wb association to the wb identified by @new_wb_id.  The
 * switching is performed asynchronously and may fail silently: nothing is
 * queued if a switch already seems to be in progress, too many switches
 * are in flight, an allocation fails, the css or wb can't be obtained, or
 * inode_prepare_wbs_switch() rejects @inode.
 */
static void inode_switch_wbs(struct inode *inode, int new_wb_id)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct cgroup_subsys_state *memcg_css;
        struct inode_switch_wbs_context *isw;

        /* noop if seems to be already in progress */
        if (inode->i_state & I_WB_SWITCH)
                return;

        /* avoid queueing a new switch if too many are already in flight */
        if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
                return;

        /* two slots: the inode itself plus the NULL terminator */
        isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC);
        if (!isw)
                return;

        /* must precede inode_prepare_wbs_switch(), see the smp_mb() there */
        atomic_inc(&isw_nr_in_flight);

        /* find and pin the new wb */
        rcu_read_lock();
        memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
        if (memcg_css && !css_tryget(memcg_css))
                memcg_css = NULL;
        rcu_read_unlock();
        if (!memcg_css)
                goto out_free;

        isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
        css_put(memcg_css);
        if (!isw->new_wb)
                goto out_free;

        if (!inode_prepare_wbs_switch(inode, isw->new_wb))
                goto out_free;

        isw->inodes[0] = inode;

        /*
         * In addition to synchronizing among switchers, I_WB_SWITCH tells
         * the RCU protected stat update paths to grab the i_page
         * lock so that stat transfer can synchronize against them.
         * Let's continue after I_WB_SWITCH is guaranteed to be visible.
         */
        INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
        queue_rcu_work(isw_wq, &isw->work);
        return;

out_free:
        /* nothing was queued: undo the in-flight count and drop new_wb */
        atomic_dec(&isw_nr_in_flight);
        if (isw->new_wb)
                wb_put(isw->new_wb);
        kfree(isw);
}

/*
 * Walk @list and claim every inode that inode_prepare_wbs_switch() accepts
 * for switching to @isw->new_wb, storing it in @isw->inodes[] at index
 * *@nr and advancing *@nr.
 *
 * Returns %true as soon as the array is full (one slot is kept free for
 * the NULL terminator), %false if the whole list has been walked.
 */
static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
                                   struct list_head *list, int *nr)
{
        struct inode *inode;

        list_for_each_entry(inode, list, i_io_list) {
                if (inode_prepare_wbs_switch(inode, isw->new_wb)) {
                        isw->inodes[(*nr)++] = inode;

                        if (*nr >= WB_MAX_INODES_PER_ISW - 1)
                                return true;
                }
        }

        return false;
}

/**
 * cleanup_offline_cgwb - detach associated inodes
 * @wb: target wb
 *
 * Switch all inodes attached to @wb to a nearest living ancestor's wb in order
 * to eventually release the dying @wb.  Returns %true if not all inodes were
 * switched and the function has to be restarted.
 */
bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
        struct cgroup_subsys_state *memcg_css;
        struct inode_switch_wbs_context *isw;
        int nr;
        bool restart = false;

        isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW),
                      GFP_KERNEL);
        if (!isw)
                return restart;

        atomic_inc(&isw_nr_in_flight);

        for (memcg_css = wb->memcg_css->parent; memcg_css;
             memcg_css = memcg_css->parent) {
                isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
                if (isw->new_wb)
                        break;
        }
        if (unlikely(!isw->new_wb))
                isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */

        nr = 0;
        spin_lock(&wb->list_lock);
        /*
         * In addition to the inodes that have completed writeback, also switch
         * cgwbs for those inodes only with dirty timestamps. Otherwise, those
         * inodes won't be written back for a long time when lazytime is
         * enabled, and thus pinning the dying cgwbs. It won't break the
         * bandwidth restrictions, as writeback of inode metadata is not
         * accounted for.
         */
        restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
        if (!restart)
                restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
        spin_unlock(&wb->list_lock);

        /* no attached inodes? bail out */
        if (nr == 0) {
                atomic_dec(&isw_nr_in_flight);
                wb_put(isw->new_wb);
                kfree(isw);
                return restart;
        }

        /*
         * In addition to synchronizing among switchers, I_WB_SWITCH tells
         * the RCU protected stat update paths to grab the i_page
         * lock so that stat transfer can synchronize against them.
         * Let's continue after I_WB_SWITCH is guaranteed to be visible.
         */
        INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
        queue_rcu_work(isw_wq, &isw->work);

        return restart;
}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * @inode is locked and about to be written back under the control of @wbc.
 * Record @inode's writeback context into @wbc and unlock the i_lock.  On
 * writeback completion, wbc_detach_inode() should be called.  This is used
 * to track the cgroup writeback context.
 */
void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                 struct inode *inode)
{
        if (!inode_cgwb_enabled(inode)) {
                spin_unlock(&inode->i_lock);
                return;
        }

        wbc->wb = inode_to_wb(inode);
        wbc->inode = inode;

        wbc->wb_id = wbc->wb->memcg_css->id;
        wbc->wb_lcand_id = inode->i_wb_frn_winner;
        wbc->wb_tcand_id = 0;
        wbc->wb_bytes = 0;
        wbc->wb_lcand_bytes = 0;
        wbc->wb_tcand_bytes = 0;

        wb_get(wbc->wb);
        spin_unlock(&inode->i_lock);

        /*
         * A dying wb indicates that either the blkcg associated with the
         * memcg changed or the associated memcg is dying.  In the first
         * case, a replacement wb should already be available and we should
         * refresh the wb immediately.  In the second case, trying to
         * refresh will keep failing.
         */
        if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
                inode_switch_wbs(inode, wbc->wb_id);
}
EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (single foreign page can lead to gigabytes of writeback to be
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.  To avoid unnecessary
 * oscillation, the detection mechanism keeps track of history and gives
 * out the switch verdict only if the foreign usage pattern is stable over
 * a certain amount of time and/or writeback attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb (max of last round's current
 * wb, the winner from two rounds ago, and the last round's majority
 * candidate).  Keeping track of the historical winner helps the algorithm
 * to semi-reliably detect the most active writer even when it's not the
 * absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 * over a certain threshold, the switch verdict is given.
 */
void wbc_detach_inode(struct writeback_control *wbc)
{
        struct bdi_writeback *wb = wbc->wb;
        struct inode *inode = wbc->inode;
        unsigned long avg_time, max_bytes, max_time;
        u16 history;
        int max_id;

        /* @wbc was never attached (or has already been detached) */
        if (!wb)
                return;

        history = inode->i_wb_frn_history;
        avg_time = inode->i_wb_frn_avg_time;

        /* pick the winner of this round; ties go to the current wb first */
        if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
            wbc->wb_bytes >= wbc->wb_tcand_bytes) {
                max_id = wbc->wb_id;
                max_bytes = wbc->wb_bytes;
        } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
                max_id = wbc->wb_lcand_id;
                max_bytes = wbc->wb_lcand_bytes;
        } else {
                max_id = wbc->wb_tcand_id;
                max_bytes = wbc->wb_tcand_bytes;
        }

        /*
         * Calculate the amount of IO time the winner consumed and fold it
         * into the running average kept per inode.  If the consumed IO
         * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
         * deciding whether to switch or not.  This is to prevent one-off
         * small dirtiers from skewing the verdict.
         */
        max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
                                wb->avg_write_bandwidth);
        if (avg_time)
                avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
                            (avg_time >> WB_FRN_TIME_AVG_SHIFT);
        else
                avg_time = max_time;        /* immediate catch up on first run */

        if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
                int slots;

                /*
                 * The switch verdict is reached if foreign wb's consume
                 * more than a certain proportion of IO time in a
                 * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
                 * history mask where each bit represents one sixteenth of
                 * the period.  Determine the number of slots to shift into
                 * history from @max_time.
                 */
                slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
                            (unsigned long)WB_FRN_HIST_MAX_SLOTS);
                history <<= slots;
                /* a foreign winner fills the new slots with ones */
                if (wbc->wb_id != max_id)
                        history |= (1U << slots) - 1;

                if (history)
                        trace_inode_foreign_history(inode, wbc, history);

                /*
                 * Switch if the current wb isn't the consistent winner.
                 * If there are multiple closely competing dirtiers, the
                 * inode may switch across them repeatedly over time, which
                 * is okay.  The main goal is avoiding keeping an inode on
                 * the wrong wb for an extended period of time.
                 *
                 * Note the strict comparison: more than
                 * WB_FRN_HIST_THR_SLOTS foreign slots are required.
                 */
                if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
                        inode_switch_wbs(inode, max_id);
        }

        /*
         * Multiple instances of this function may race to update the
         * following fields but we don't mind occasional inaccuracies.
         */
        inode->i_wb_frn_winner = max_id;
        inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
        inode->i_wb_frn_history = history;

        /* drop the reference taken in wbc_attach_and_unlock_inode() */
        wb_put(wbc->wb);
        wbc->wb = NULL;
}
EXPORT_SYMBOL_GPL(wbc_detach_inode);

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @page are about to be written out during the writeback
 * controlled by @wbc.  Keep the book for foreign inode detection.  See
 * wbc_detach_inode().
 */
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
                              size_t bytes)
{
        struct cgroup_subsys_state *memcg_css;
        int owner_id;

        /*
         * pageout() path doesn't attach @wbc to the inode being written
         * out.  This is intentional as we don't want the function to block
         * behind a slow cgroup.  Ultimately, we want pageout() to kick off
         * regular writeback instead of writing things out itself.
         */
        if (!wbc->wb || wbc->no_cgroup_owner)
                return;

        memcg_css = mem_cgroup_css_from_folio(page_folio(page));
        /* dead cgroups shouldn't contribute to inode ownership arbitration */
        if (!(memcg_css->flags & CSS_ONLINE))
                return;

        owner_id = memcg_css->id;

        /* written by the wb's own cgroup: nothing foreign to track */
        if (owner_id == wbc->wb_id) {
                wbc->wb_bytes += bytes;
                return;
        }

        if (owner_id == wbc->wb_lcand_id)
                wbc->wb_lcand_bytes += bytes;

        /* Boyer-Moore majority vote algorithm */
        if (!wbc->wb_tcand_bytes)
                wbc->wb_tcand_id = owner_id;
        if (owner_id != wbc->wb_tcand_id)
                wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
        else
                wbc->wb_tcand_bytes += bytes;
}
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);

/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
 * relation to the total write bandwidth of all wb's w/ dirty inodes on
 * @wb->bdi.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
        unsigned long wb_bw = wb->avg_write_bandwidth;
        unsigned long bdi_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);

        /* LONG_MAX is passed through untouched, never scaled */
        if (nr_pages == LONG_MAX)
                return LONG_MAX;

        /*
         * Scale only when the total is non-zero and @wb's bandwidth is a
         * strict fraction of it; the result is rounded up.  In every other
         * case (this may be called on clean wb's, where a proportional
         * split may not make sense) hand back the original @nr_pages, as
         * we prefer to err on the side of writing more.
         */
        if (bdi_bw && wb_bw < bdi_bw)
                return DIV_ROUND_UP_ULL((u64)nr_pages * wb_bw, bdi_bw);

        return nr_pages;
}

/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
{
        /* wb pinned by the most recent fallback pass, put once we move on */
        struct bdi_writeback *last_wb = NULL;
        /*
         * Iteration cursor.  It starts out as a pseudo-entry computed from
         * the list head itself, so the "continue" iterator below begins at
         * the first real entry of @bdi->wb_list.
         */
        struct bdi_writeback *wb = list_entry(&bdi->wb_list,
                                              struct bdi_writeback, bdi_node);

        /* the allocation-failure path below waits for a completion */
        might_sleep();
restart:
        rcu_read_lock();
        list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
                DEFINE_WB_COMPLETION(fallback_work_done, bdi);
                struct wb_writeback_work fallback_work;
                struct wb_writeback_work *work;
                long nr_pages;

                /*
                 * The cursor has advanced past the wb that was pinned for
                 * the previous fallback pass, so that pin can go now.
                 */
                if (last_wb) {
                        wb_put(last_wb);
                        last_wb = NULL;
                }

                /* SYNC_ALL writes out I_DIRTY_TIME too */
                if (!wb_has_dirty_io(wb) &&
                    (base_work->sync_mode == WB_SYNC_NONE ||
                     list_empty(&wb->b_dirty_time)))
                        continue;
                if (skip_if_busy && writeback_in_progress(wb))
                        continue;

                /* this wb's share of the requested page count */
                nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);

                /*
                 * Preferred path: a heap copy of @base_work, marked
                 * auto_free, is queued and we move straight on to the
                 * next wb.  GFP_ATOMIC because we are inside an RCU
                 * read-side section.
                 */
                work = kmalloc(sizeof(*work), GFP_ATOMIC);
                if (work) {
                        *work = *base_work;
                        work->nr_pages = nr_pages;
                        work->auto_free = 1;
                        wb_queue_work(wb, work);
                        continue;
                }

                /*
                 * If wb_tryget fails, the wb has been shutdown, skip it.
                 *
                 * Pin @wb so that it stays on @bdi->wb_list.  This allows
                 * continuing iteration from @wb after dropping and
                 * regrabbing rcu read lock.
                 */
                if (!wb_tryget(wb))
                        continue;

                /* alloc failed, execute synchronously using on-stack fallback */
                work = &fallback_work;
                *work = *base_work;
                work->nr_pages = nr_pages;
                work->auto_free = 0;
                work->done = &fallback_work_done;

                wb_queue_work(wb, work);
                last_wb = wb;

                /*
                 * Leave the RCU read-side section before sleeping, wait for
                 * the on-stack work to complete, then resume the walk from
                 * the pinned @wb.
                 */
                rcu_read_unlock();
                wb_wait_for_completion(&fallback_work_done);
                goto restart;
        }
        rcu_read_unlock();

        /* the walk ended right after a fallback pass: drop the final pin */
        if (last_wb)
                wb_put(last_wb);
}

/**
 * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
 * @bdi_id: target bdi id
 * @memcg_id: target memcg css id
 * @reason: reason why some writeback work initiated
 * @done: target wb_completion
 *
 * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
 * with the specified parameters.
 */
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
                           enum wb_reason reason, struct wb_completion *done)
{
        struct backing_dev_info *bdi;
        struct cgroup_subsys_state *memcg_css;
        struct bdi_writeback *wb;
        struct wb_writeback_work *work;
        unsigned long nr_dirty;
        int ret = -ENOENT;      /* every lookup failure below reports this */

        /* resolve the bdi; a reference is held until out_bdi_put */
        bdi = bdi_get_by_id(bdi_id);
        if (!bdi)
                return ret;

        /* resolve the memcg css under RCU and try to take a reference */
        rcu_read_lock();
        memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys);
        if (memcg_css && !css_tryget(memcg_css))
                memcg_css = NULL;
        rcu_read_unlock();
        if (!memcg_css)
                goto out_bdi_put;

        /*
         * Look the wb up without creating it: when no wb exists for this
         * bdi/memcg pair there is nothing to flush.
         */
        wb = wb_get_lookup(bdi, memcg_css);
        if (!wb)
                goto out_css_put;

        /*
         * The caller wants most of the currently dirty pages written out.
         * Take the memcg's dirty page count and inflate it by 25% (x10/8):
         * large enough to cover most dirty pages, yet bounded so that
         * concurrent dirtiers cannot livelock us.
         *
         * memcg stats are flushed periodically and this is a best-effort
         * estimate, so some error is acceptable.
         */
        nr_dirty = memcg_page_state(mem_cgroup_from_css(memcg_css),
                                    NR_FILE_DIRTY);
        nr_dirty = nr_dirty * 10 / 8;

        /* build and queue the work item; the allocation must not block */
        work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
        if (!work) {
                ret = -ENOMEM;
                goto out_wb_put;
        }

        work->nr_pages = nr_dirty;
        work->sync_mode = WB_SYNC_NONE;
        work->range_cyclic = 1;
        work->reason = reason;
        work->done = done;
        work->auto_free = 1;
        wb_queue_work(wb, work);
        ret = 0;

out_wb_put:
        wb_put(wb);
out_css_put:
        css_put(memcg_css);
out_bdi_put:
        bdi_put(bdi);
        return ret;
}

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 *
 * This function is called when a super_block is about to be destroyed and
 * flushes in-flight inode wb switches.  An inode wb switch goes through
 * RCU and then workqueue, so the two need to be flushed in order to ensure
 * that all previously scheduled switches are finished.  As wb switches are
 * rare occurrences and synchronize_rcu() can take a while, perform
 * flushing iff wb switches are in flight.
 */
void cgroup_writeback_umount(void)
{
        /*
         * The clearing of SB_ACTIVE must be reliably ordered before the
         * isw_nr_in_flight check below, see generic_shutdown_super().
         */
        smp_mb();

        /* common case: no switch in flight, skip the expensive flushing */
        if (!atomic_read(&isw_nr_in_flight))
                return;

        /*
         * rcu_barrier() waits for all pending RCU callbacks, which ensures
         * that every in-flight wb switch has reached the workqueue; the
         * workqueue is then flushed.
         */
        rcu_barrier();
        flush_workqueue(isw_wq);
}

/*
 * Allocate the "inode_switch_wbs" workqueue and store it in isw_wq.
 * Returns 0 on success, -ENOMEM when the allocation fails.
 */
static int __init cgroup_writeback_init(void)
{
        isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);

        return isw_wq ? 0 : -ENOMEM;
}
fs_initcall(cgroup_writeback_init);

#else        /* CONFIG_CGROUP_WRITEBACK */

/* Intentionally empty in builds without CONFIG_CGROUP_WRITEBACK. */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
}

/* Intentionally empty in builds without CONFIG_CGROUP_WRITEBACK. */
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: clear I_SYNC_QUEUED and unlink @inode
 * from whatever writeback I/O list it is on (i_io_list is left
 * re-initialised), then notify @wb that an inode left its lists.
 */
static void inode_cgwb_move_to_attached(struct inode *inode,
                                        struct bdi_writeback *wb)
{
        /* caller must hold both wb->list_lock and inode->i_lock */
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
        /* warn (once) if this is invoked on an inode that is being freed */
        WARN_ON_ONCE(inode->i_state & I_FREEING);

        inode->i_state &= ~I_SYNC_QUEUED;
        list_del_init(&inode->i_io_list);
        /*
         * NOTE(review): presumably updates @wb's "has dirty io" state when
         * its lists became empty; the helper is defined outside this chunk.
         */
        wb_io_lists_depopulated(wb);
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant.  Entered with inode->i_lock held: looks
 * up the inode's wb, releases i_lock, then acquires wb->list_lock.
 *
 * Returns the wb with its list_lock held and i_lock no longer held.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
        __releases(&inode->i_lock)
        __acquires(&wb->list_lock)
{
        struct bdi_writeback *wb = inode_to_wb(inode);

        /* i_lock is dropped before list_lock is taken; they never nest here */
        spin_unlock(&inode->i_lock);
        spin_lock(&wb->list_lock);
        return wb;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: look up the inode's wb and acquire its
 * list_lock.  Returns the wb with list_lock held; the caller unlocks it.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
        __acquires(&wb->list_lock)
{
        struct bdi_writeback *wb = inode_to_wb(inode);

        spin_lock(&wb->list_lock);
        return wb;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: no splitting is performed, @wb is not
 * consulted and the caller gets @nr_pages back unchanged.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
        long share = nr_pages;  /* the whole request */

        return share;
}

/*
 * !CONFIG_CGROUP_WRITEBACK variant: only the bdi's embedded wb is used, so
 * @base_work itself is queued on it (nothing is copied or split).  With
 * @skip_if_busy set, nothing is queued while that wb already has writeback
 * in progress.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
{
        might_sleep();

        if (skip_if_busy && writeback_in_progress(&bdi->wb))
                return;

        /* @base_work is handed over as-is; clear its auto_free flag first */
        base_work->auto_free = 0;
        wb_queue_work(&bdi->wb, base_work);
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
        /* globally dirty file pages ... */
        unsigned long nr_pages = global_node_page_state(NR_FILE_DIRTY);
        /* ... plus one unit for every dirty inode */
        unsigned long nr_inodes = get_nr_dirty_inodes();

        return nr_pages + nr_inodes;
}

/*
 * Ask @wb's flusher to start writing out everything that is dirty,
 * recording @reason.  At most one such request is pending at a time.
 */
static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
{
        /* nothing dirty, nothing to start */
        if (!wb_has_dirty_io(wb))
                return;

        /*
         * Every caller wants all dirty pages written.  Some of them (vmscan,
         * for one) can call in at a very high rate, which would otherwise
         * mean piles of pointless work item allocations and a flusher kept
         * busy just retrieving them.  WB_start_all therefore acts as a
         * single-slot latch: a plain read first, and the atomic
         * test-and-set only when the bit looked clear.
         */
        if (test_bit(WB_start_all, &wb->state))
                return;
        if (test_and_set_bit(WB_start_all, &wb->state))
                return;

        /* we set the bit: record why and wake the flusher */
        wb->start_all_reason = reason;
        wb_wakeup(wb);
}

/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writeback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens. When
 *   this function returns, it is only guaranteed that for given wb
 *   some IO is happening if we are over background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
void wb_start_background_writeback(struct bdi_writeback *wb)
{
        /*
         * We just wake up the flusher thread. It will perform background
         * writeback as soon as there is no other work to do.
         *
         * No work item is allocated or queued here: the function only
         * emits a tracepoint and calls wb_wakeup().
         */
        trace_writeback_wake_background(wb);
        wb_wakeup(wb);
}

/*
 * Remove the inode from the writeback list it is on.
 */
void inode_io_list_del(struct inode *inode)
{
        struct bdi_writeback *wb;

        /* lock order: wb->list_lock first, inode->i_lock nested inside it */
        wb = inode_to_wb_and_lock_list(inode);
        spin_lock(&inode->i_lock);

        /* clear the queued-for-sync marker and unlink from the I/O list */
        inode->i_state &= ~I_SYNC_QUEUED;
        list_del_init(&inode->i_io_list);
        wb_io_lists_depopulated(wb);

        /* release in reverse order of acquisition */
        spin_unlock(&inode->i_lock);
        spin_unlock(&wb->list_lock);
}
EXPORT_SYMBOL(inode_io_list_del);

/*
 * mark an inode as under writeback on the sb
 */
void sb_mark_inode_writeback(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned long flags;

        /* unlocked fast path: the inode is already on a wb list */
        if (!list_empty(&inode->i_wb_list))
                return;

        spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
        /* recheck under the lock before actually adding */
        if (list_empty(&inode->i_wb_list)) {
                list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
                trace_sb_mark_inode_writeback(inode);
        }
        spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
}

/*
 * clear an inode as under writeback on the sb
 */
void sb_clear_inode_writeback(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned long flags;

        /* unlocked fast path: the inode is not on any wb list */
        if (list_empty(&inode->i_wb_list))
                return;

        spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
        /* recheck under the lock before actually removing */
        if (!list_empty(&inode->i_wb_list)) {
                list_del_init(&inode->i_wb_list);
                trace_sb_clear_inode_writeback(inode);
        }
        spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
        struct inode *newest;

        assert_spin_locked(&inode->i_lock);

        inode->i_state &= ~I_SYNC_QUEUED;

        /*
         * An inode that is being freed gets no dirty list tracking at all:
         * the flush worker ignores it anyway, and moving it would trip the
         * assertions in inode_io_list_move_locked().  Just unlink it.
         */
        if (inode->i_state & I_FREEING) {
                list_del_init(&inode->i_io_list);
                wb_io_lists_depopulated(wb);
                return;
        }

        /*
         * Compare against the inode at the front of b_dirty and refresh
         * ->dirtied_when only when ours is older than that one's.
         */
        if (!list_empty(&wb->b_dirty)) {
                newest = wb_inode(wb->b_dirty.next);
                if (time_before(inode->dirtied_when, newest->dirtied_when))
                        inode->dirtied_when = jiffies;
        }

        inode_io_list_move_locked(inode, wb, &wb->b_dirty);
}

/*
 * Convenience wrapper: take inode->i_lock around redirty_tail_locked().
 * NOTE(review): callers presumably already hold wb->list_lock, since the
 * locked helper edits @wb's lists - confirm at the call sites.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
        spin_lock(&inode->i_lock);
        redirty_tail_locked(inode, wb);
        spin_unlock(&inode->i_lock);
}

/*
 * requeue inode for re-scanning after wb->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
        /* park the inode on @wb's b_more_io list */
        inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}

/*
 * Finish a writeback pass on @inode: clear I_SYNC, offer the inode to the
 * LRU and wake anybody waiting on the __I_SYNC bit.
 * NOTE(review): i_state is modified without taking a lock here, so callers
 * are presumably expected to hold inode->i_lock - confirm.
 */
static void inode_sync_complete(struct inode *inode)
{
        inode->i_state &= ~I_SYNC;
        /* If inode is clean and unused, put it into LRU now... */
        inode_add_lru(inode);
        /* Waiters must see I_SYNC cleared before being woken up */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
        bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
        /*
         * For inodes being constantly redirtied, dirtied_when can get stuck.
         * It _appears_ to be in the future, but is actually in distant past.
         * This test is necessary to prevent such wrapped-around relative times
         * from permanently stopping the whole bdi writeback.
         */
        ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
        return ret;
}

/*
 * Move expired (dirtied before dirtied_before) dirty inodes from
 * @delaying_queue to @dispatch_queue.
 */
static int move_expired_inodes(struct list_head *delaying_queue,
                               struct list_head *dispatch_queue,
                               unsigned long dirtied_before)
{
        LIST_HEAD(tmp);
        struct list_head *pos, *node;
        struct super_block *sb = NULL;
        struct inode *inode;
        int do_sb_sort = 0;
        int moved = 0;

        while (!list_empty(delaying_queue)) {
                inode = wb_inode(delaying_queue->prev);
                if (inode_dirtied_after(inode, dirtied_before))
                        break;
                spin_lock(&inode->i_lock);
                list_move(&inode->i_io_list, &tmp);
                moved++;
                inode->i_state |= I_SYNC_QUEUED;
                spin_unlock(&inode->i_lock);
                if (sb_is_blkdev_sb(inode->i_sb))
                        continue;
                if (sb && sb != inode->i_sb)
                        do_sb_sort = 1;
                sb = inode->i_sb;
        }

        /* just one sb in list, splice to dispatch_queue and we're done */
        if (!do_sb_sort) {
                list_splice(&tmp, dispatch_queue);
                goto out;
        }

        /*
         * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue',
         * we don't take inode->i_lock here because it is just a pointless overhead.
         * Inode is already marked as I_SYNC_QUEUED so writeback list handling is
         * fully under our control.
         */
        while (!list_empty(&tmp)) {
                sb = wb_inode(tmp.prev)->i_sb;
                list_for_each_prev_safe(pos, node, &tmp) {
                        inode = wb_inode(pos);
                        if (inode->i_sb == sb)
                                list_move(&inode->i_io_list, dispatch_queue);
                }
        }
out:
        return moved;
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
                     unsigned long dirtied_before)
{
        unsigned long time_cutoff;
        int nr_moved;

        assert_spin_locked(&wb->list_lock);

        /* previously deferred inodes rejoin b_io first */
        list_splice_init(&wb->b_more_io, &wb->b_io);
        nr_moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);

        /*
         * b_dirty_time gets its own cutoff: for sync the same one as
         * b_dirty, otherwise "now" minus the dirtytime expiry interval.
         */
        if (work->for_sync)
                time_cutoff = dirtied_before;
        else
                time_cutoff = jiffies - dirtytime_expire_interval * HZ;

        nr_moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
                                        time_cutoff);

        if (nr_moved)
                wb_io_lists_populated(wb);
        trace_writeback_queue_io(wb, work, dirtied_before, nr_moved);
}

/*
 * Call the superblock's ->write_inode() for @inode, bracketed by the
 * start/finish tracepoints.  Returns 0 without calling anything when the
 * filesystem has no ->write_inode() or @inode is a bad inode; otherwise
 * returns whatever ->write_inode() returned.
 */
static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
        int err;

        if (!inode->i_sb->s_op->write_inode || is_bad_inode(inode))
                return 0;

        trace_writeback_write_inode_start(inode, wbc);
        err = inode->i_sb->s_op->write_inode(inode, wbc);
        trace_writeback_write_inode(inode, wbc);

        return err;
}

/*
 * Wait for writeback on an inode to complete. Called with i_lock held.
 * Caller must make sure inode cannot go away when we drop i_lock.
 */
static void __inode_wait_for_writeback(struct inode *inode)
        __releases(inode->i_lock)
        __acquires(inode->i_lock)
{
        DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
        wait_queue_head_t *wqh;

        wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
        /*
         * I_SYNC is tested with i_lock held.  The lock is dropped only
         * around the sleep and re-taken before the flag is tested again,
         * so we return with i_lock held and I_SYNC observed clear.
         */
        while (inode->i_state & I_SYNC) {
                spin_unlock(&inode->i_lock);
                __wait_on_bit(wqh, &wq, bit_wait,
                              TASK_UNINTERRUPTIBLE);
                spin_lock(&inode->i_lock);
        }
}

/*
 * Wait for writeback on an inode to complete. Caller must have inode pinned.
 */
void inode_wait_for_writeback(struct inode *inode)
{
        /* the helper expects i_lock held on entry and returns with it held */
        spin_lock(&inode->i_lock);
        __inode_wait_for_writeback(inode);
        spin_unlock(&inode->i_lock);
}

/*
 * Sleep until I_SYNC is cleared. This function must be called with i_lock
 * held and drops it. It is aimed for callers not holding any inode reference
 * so once i_lock is dropped, inode can go away.
 */
static void inode_sleep_on_writeback(struct inode *inode)
        __releases(inode->i_lock)
{
        DEFINE_WAIT(wait);
        wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
        int sleep;

        /*
         * Get onto the wait queue before sampling I_SYNC and dropping
         * i_lock - presumably so that a wakeup issued right after the
         * unlock cannot be missed.
         */
        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
        sleep = inode->i_state & I_SYNC;
        spin_unlock(&inode->i_lock);
        /* sleeps at most once; I_SYNC is not rechecked and @inode is not touched again */
        if (sleep)
                schedule();
        finish_wait(wqh, &wait);
}

/*
 * Find proper writeback list for the inode depending on its current state and
 * possibly also change of its state while we were doing writeback.  Here we
 * handle things such as livelock prevention or fairness of writeback among
 * inodes. This function can be called only by flusher thread - no one else
 * processes all inodes in writeback lists and requeueing inodes behind flusher
 * thread's back can have unexpected consequences.
 */
static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
                          struct writeback_control *wbc,
                          unsigned long dirtied_before)
{
        /* inodes on their way out are left alone */
        if (inode->i_state & I_FREEING)
                return;

        /*
         * Sync livelock prevention: every inode is tagged and synced in a
         * single shot.  One that is still dirty ends up redirtied below, so
         * refresh its dirty time to keep it from being enqueued and synced
         * all over again.
         */
        if ((inode->i_state & I_DIRTY) &&
            (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
                inode->dirtied_when = jiffies;

        /*
         * Pages were skipped, i.e. writeback made no progress because of
         * locked buffers: back off from this inode for now.  Skipped pages
         * on a clean inode are odd but do happen with some filesystems, so
         * handle that case gracefully as well.
         */
        if (wbc->pages_skipped) {
                if (inode->i_state & I_DIRTY_ALL)
                        redirty_tail_locked(inode, wb);
                else
                        inode_cgwb_move_to_attached(inode, wb);
                return;
        }

        /*
         * Dirty pages remain (nfs_writepages(), for one, sometimes bails
         * out without doing anything).
         */
        if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
                if (wbc->nr_to_write <= 0 &&
                    !inode_dirtied_after(inode, dirtied_before)) {
                        /* the slice is used up: queue for the next turn */
                        requeue_io(inode, wb);
                } else {
                        /*
                         * Blocked by something other than congestion.
                         * Delay the inode for a while instead of spinning
                         * on the CPU (100% iowait) retrying writeback that
                         * cannot be carried out right now.
                         */
                        redirty_tail_locked(inode, wb);
                }
                return;
        }

        /*
         * No dirty pages, but the inode itself is dirty: filesystems may
         * dirty it during writeback, e.g. delayed allocation at submission
         * time or metadata updates after data IO completion.
         */
        if (inode->i_state & I_DIRTY) {
                redirty_tail_locked(inode, wb);
                return;
        }

        /* only timestamps are dirty: back onto b_dirty_time, freshly stamped */
        if (inode->i_state & I_DIRTY_TIME) {
                inode->dirtied_when = jiffies;
                inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
                inode->i_state &= ~I_SYNC_QUEUED;
                return;
        }

        /* fully clean: take it off the writeback lists */
        inode_cgwb_move_to_attached(inode, wb);
}

/*
 * Write out an inode and its dirty pages (or some of its dirty pages, depending
 * on @wbc->nr_to_write), and clear the relevant dirty flags from i_state.
 *
 * This doesn't remove the inode from the writeback list it is on, except
 * potentially to move it from b_dirty_time to b_dirty due to timestamp
 * expiration.  The caller is otherwise responsible for writeback list handling.
 *
 * The caller is also responsible for setting the I_SYNC flag beforehand and
 * calling inode_sync_complete() to clear it afterwards.
 */
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
        struct address_space *mapping = inode->i_mapping;
        /* snapshot of the initial page budget, reported by both tracepoints */
        long nr_to_write = wbc->nr_to_write;
        /* I_DIRTY bits found set (and cleared) under i_lock further down */
        unsigned dirty;
        int ret;

        /* the caller must have set I_SYNC before calling in */
        WARN_ON(!(inode->i_state & I_SYNC));

        trace_writeback_single_inode_start(inode, wbc, nr_to_write);

        ret = do_writepages(mapping, wbc);

        /*
         * Make sure to wait on the data before writing out the metadata.
         * This is important for filesystems that modify metadata on data
         * I/O completion. We don't do it for sync(2) writeback because it has a
         * separate, external IO completion path and ->sync_fs for guaranteeing
         * inode metadata is written back correctly.
         */
        if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
                int err = filemap_fdatawait(mapping);
                /* the first error wins; a later one never overwrites it */
                if (ret == 0)
                        ret = err;
        }

        /*
         * If the inode has dirty timestamps and we need to write them, call
         * mark_inode_dirty_sync() to notify the filesystem about it and to
         * change I_DIRTY_TIME into I_DIRTY_SYNC.
         */
        if ((inode->i_state & I_DIRTY_TIME) &&
            (wbc->sync_mode == WB_SYNC_ALL ||
             time_after(jiffies, inode->dirtied_time_when +
                        dirtytime_expire_interval * HZ))) {
                trace_writeback_lazytime(inode);
                mark_inode_dirty_sync(inode);
        }

        /*
         * Get and clear the dirty flags from i_state.  This needs to be done
         * after calling writepages because some filesystems may redirty the
         * inode during writepages due to delalloc.  It also needs to be done
         * after handling timestamp expiration, as that may dirty the inode too.
         */
        spin_lock(&inode->i_lock);
        dirty = inode->i_state & I_DIRTY;
        inode->i_state &= ~dirty;

        /*
         * Paired with smp_mb() in __mark_inode_dirty().  This allows
         * __mark_inode_dirty() to test i_state without grabbing i_lock -
         * either they see the I_DIRTY bits cleared or we see the dirtied
         * inode.
         *
         * I_DIRTY_PAGES is always cleared together above even if @mapping
         * still has dirty pages.  The flag is reinstated after smp_mb() if
         * necessary.  This guarantees that either __mark_inode_dirty()
         * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
         */
        smp_mb();

        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                inode->i_state |= I_DIRTY_PAGES;
        else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) {
                /*
                 * No dirty pages are tagged and the inode carries the netfs
                 * pinning flag: unless I_DIRTY_PAGES got set again in the
                 * meantime, drop the flag, tell the filesystem through
                 * wbc->unpinned_netfs_wb, and force the write_inode() call
                 * below.
                 */
                if (!(inode->i_state & I_DIRTY_PAGES)) {
                        inode->i_state &= ~I_PINNING_NETFS_WB;
                        wbc->unpinned_netfs_wb = true;
                        dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */
                }
        }

        spin_unlock(&inode->i_lock);

        /* Don't write the inode if only I_DIRTY_PAGES was set */
        if (dirty & ~I_DIRTY_PAGES) {
                int err = write_inode(inode, wbc);
                /* again, keep the first error */
                if (ret == 0)
                        ret = err;
        }
        /* reset unconditionally so the flag does not outlive this call */
        wbc->unpinned_netfs_wb = false;
        trace_writeback_single_inode(inode, wbc, nr_to_write);
        return ret;
}

/*
 * Write out an inode's dirty data and metadata on-demand, i.e. separately from
 * the regular batched writeback done by the flusher threads in
 * writeback_sb_inodes().  @wbc controls various aspects of the write, such as
 * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE).
 *
 * To prevent the inode from going away, either the caller must have a reference
 * to the inode, or the inode must have I_WILL_FREE or I_FREEING set.
 */
static int writeback_single_inode(struct inode *inode,
                                  struct writeback_control *wbc)
{
        struct bdi_writeback *wb;
        int ret = 0;

        spin_lock(&inode->i_lock);
        /*
         * Sanity checks: an inode without references must be on its way
         * out, and a referenced inode must not be marked I_WILL_FREE.
         */
        if (!atomic_read(&inode->i_count))
                WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
        else
                WARN_ON(inode->i_state & I_WILL_FREE);

        if (inode->i_state & I_SYNC) {
                /*
                 * Writeback is already running on the inode.  For WB_SYNC_NONE,
                 * that's enough and we can just return.  For WB_SYNC_ALL, we
                 * must wait for the existing writeback to complete, then do
                 * writeback again if there's anything left.
                 */
                if (wbc->sync_mode != WB_SYNC_ALL)
                        goto out;
                /* drops and re-takes i_lock while sleeping */
                __inode_wait_for_writeback(inode);
        }
        WARN_ON(inode->i_state & I_SYNC);
        /*
         * If the inode is already fully clean, then there's nothing to do.
         *
         * For data-integrity syncs we also need to check whether any pages are
         * still under writeback, e.g. due to prior WB_SYNC_NONE writeback.  If
         * there are any such pages, we'll need to wait for them.
         */
        if (!(inode->i_state & I_DIRTY_ALL) &&
            (wbc->sync_mode != WB_SYNC_ALL ||
             !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
                goto out;
        /* claim the inode for writeback; cleared by inode_sync_complete() */
        inode->i_state |= I_SYNC;
        /* going by its name this also drops i_lock, which is re-taken below */
        wbc_attach_and_unlock_inode(wbc, inode);

        ret = __writeback_single_inode(inode, wbc);

        wbc_detach_inode(wbc);

        /* lock order: wb->list_lock first, then inode->i_lock */
        wb = inode_to_wb_and_lock_list(inode);
        spin_lock(&inode->i_lock);
        /*
         * If the inode is freeing, its i_io_list shouldn't be updated
         * as it can be finally deleted at this moment.
         */
        if (!(inode->i_state & I_FREEING)) {
                /*
                 * If the inode is now fully clean, then it can be safely
                 * removed from its writeback list (if any). Otherwise the
                 * flusher threads are responsible for the writeback lists.
                 */
                if (!(inode->i_state & I_DIRTY_ALL))
                        inode_cgwb_move_to_attached(inode, wb);
                else if (!(inode->i_state & I_SYNC_QUEUED)) {
                        /*
                         * Still dirty and not queued for sync: place it on
                         * the list matching its remaining dirty state.
                         */
                        if ((inode->i_state & I_DIRTY))
                                redirty_tail_locked(inode, wb);
                        else if (inode->i_state & I_DIRTY_TIME) {
                                inode->dirtied_when = jiffies;
                                inode_io_list_move_locked(inode,
                                                          wb,
                                                          &wb->b_dirty_time);
                        }
                }
        }

        /* list_lock goes first; I_SYNC is cleared with i_lock still held */
        spin_unlock(&wb->list_lock);
        inode_sync_complete(inode);
out:
        spin_unlock(&inode->i_lock);
        return ret;
}

/*
 * Pick how many pages to hand out for one inode in a single writeback
 * slice of @work on @wb.
 */
static long writeback_chunk_size(struct bdi_writeback *wb,
                                 struct wb_writeback_work *work)
{
        long pages;

        /*
         * WB_SYNC_ALL avoids livelock by syncing dirty inodes/pages in one
         * big loop, so no limit is applied there (nor for tagged
         * writepages); LONG_MAX keeps writeback_inodes_wb() from being
         * entered more than once.
         *
         * Intended call sequence for WB_SYNC_ALL writeback:
         *
         *      wb_writeback()
         *          writeback_sb_inodes()       <== called only once
         *              write_cache_pages()     <== called once for each inode
         *                   (quickly) tag currently dirty pages
         *                   (maybe slowly) sync all tagged pages
         */
        if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
                return LONG_MAX;

        /*
         * Otherwise: the smaller of half the wb's average write bandwidth
         * and the global dirty limit scaled down by DIRTY_SCOPE, capped by
         * the pages still requested, then bumped by MIN_WRITEBACK_PAGES and
         * rounded down to a multiple of it.
         */
        pages = min(wb->avg_write_bandwidth / 2,
                    global_wb_domain.dirty_limit / DIRTY_SCOPE);
        pages = min(pages, work->nr_pages);

        return round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES);
}

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * @sb:   superblock whose inodes are to be written
 * @wb:   writeback context whose b_io list is consumed from its tail
 * @work: the request being served; work->nr_pages is decremented by the
 *        amount of nr_to_write budget each inode used up
 *
 * Return the number of pages and/or inodes written: for every processed
 * inode the pages written (budget used minus pages skipped, clamped at zero)
 * plus one if the inode has no I_DIRTY_ALL bit left after writeback.
 *
 * The walk ends when b_io is empty, when an inode of another superblock is
 * found while work->sb is not set, or - once something has been written -
 * when more than HZ/10 jiffies have passed or work->nr_pages is exhausted.
 *
 * NOTE! This is called with wb->list_lock held, and will
 * unlock and relock that for each inode it ends up doing
 * IO for.
 */
static long writeback_sb_inodes(struct super_block *sb,
                                struct bdi_writeback *wb,
                                struct wb_writeback_work *work)
{
        /* Per-call control block; nr_to_write/pages_skipped are reset per inode. */
        struct writeback_control wbc = {
                .sync_mode                = work->sync_mode,
                .tagged_writepages        = work->tagged_writepages,
                .for_kupdate                = work->for_kupdate,
                .for_background                = work->for_background,
                .for_sync                = work->for_sync,
                .range_cyclic                = work->range_cyclic,
                .range_start                = 0,
                .range_end                = LLONG_MAX,
        };
        unsigned long start_time = jiffies;
        long write_chunk;
        long total_wrote = 0;  /* count both pages and inodes */
        unsigned long dirtied_before = jiffies;

        /*
         * For kupdate work the cut-off passed to requeue_inode() is moved
         * into the past by dirty_expire_interval * 10 milliseconds.
         */
        if (work->for_kupdate)
                dirtied_before = jiffies -
                        msecs_to_jiffies(dirty_expire_interval * 10);

        while (!list_empty(&wb->b_io)) {
                /* Always take the inode at the tail of b_io. */
                struct inode *inode = wb_inode(wb->b_io.prev);
                struct bdi_writeback *tmp_wb;
                long wrote;

                if (inode->i_sb != sb) {
                        if (work->sb) {
                                /*
                                 * We only want to write back data for this
                                 * superblock, move all inodes not belonging
                                 * to it back onto the dirty list.
                                 */
                                redirty_tail(inode, wb);
                                continue;
                        }

                        /*
                         * The inode belongs to a different superblock.
                         * Bounce back to the caller to unpin this and
                         * pin the next superblock.
                         */
                        break;
                }

                /*
                 * Don't bother with new inodes or inodes being freed, first
                 * kind does not need periodic writeout yet, and for the latter
                 * kind writeout is handled by the freer.
                 */
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        redirty_tail_locked(inode, wb);
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
                        /*
                         * If this inode is locked for writeback and we are not
                         * doing writeback-for-data-integrity, move it to
                         * b_more_io so that writeback can proceed with the
                         * other inodes on b_io.
                         *
                         * We'll have another go at writing back this inode
                         * when we completed a full scan of b_io.
                         */
                        requeue_io(inode, wb);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_sb_inodes_requeue(inode);
                        continue;
                }
                /* From here on only i_lock is held; list_lock is retaken below. */
                spin_unlock(&wb->list_lock);

                /*
                 * We already requeued the inode if it had I_SYNC set and we
                 * are doing WB_SYNC_NONE writeback. So this catches only the
                 * WB_SYNC_ALL case.
                 */
                if (inode->i_state & I_SYNC) {
                        /* Wait for I_SYNC. This function drops i_lock... */
                        inode_sleep_on_writeback(inode);
                        /* Inode may be gone, start again */
                        spin_lock(&wb->list_lock);
                        continue;
                }
                inode->i_state |= I_SYNC;
                /*
                 * NOTE(review): per its name this helper releases i_lock;
                 * i_lock is explicitly taken again after the writeback below.
                 */
                wbc_attach_and_unlock_inode(&wbc, inode);

                write_chunk = writeback_chunk_size(wb, work);
                wbc.nr_to_write = write_chunk;
                wbc.pages_skipped = 0;

                /*
                 * We use I_SYNC to pin the inode in memory. While it is set
                 * evict_inode() will wait so the inode cannot be freed.
                 */
                __writeback_single_inode(inode, &wbc);

                wbc_detach_inode(&wbc);
                /* The request is charged for the budget used, skipped or not. */
                work->nr_pages -= write_chunk - wbc.nr_to_write;
                /* Progress only counts pages that were not skipped. */
                wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
                wrote = wrote < 0 ? 0 : wrote;
                total_wrote += wrote;

                if (need_resched()) {
                        /*
                         * We're trying to balance between building up a nice
                         * long list of IOs to improve our merge rate, and
                         * getting those IOs out quickly for anyone throttling
                         * in balance_dirty_pages().  cond_resched() doesn't
                         * unplug, so get our IOs out the door before we
                         * give up the CPU.
                         */
                        blk_flush_plug(current->plug, false);
                        cond_resched();
                }

                /*
                 * Requeue @inode if still dirty.  Be careful as @inode may
                 * have been switched to another wb in the meantime.
                 */
                tmp_wb = inode_to_wb_and_lock_list(inode);
                spin_lock(&inode->i_lock);
                /* An inode that came out fully clean counts as one unit of progress. */
                if (!(inode->i_state & I_DIRTY_ALL))
                        total_wrote++;
                requeue_inode(inode, tmp_wb, &wbc, dirtied_before);
                inode_sync_complete(inode);
                spin_unlock(&inode->i_lock);

                /*
                 * If the inode moved to a different wb we hold that wb's
                 * list_lock; swap it for the lock the caller expects.
                 */
                if (unlikely(tmp_wb != wb)) {
                        spin_unlock(&tmp_wb->list_lock);
                        spin_lock(&wb->list_lock);
                }

                /*
                 * bail out to wb_writeback() often enough to check
                 * background threshold and other termination conditions.
                 */
                if (total_wrote) {
                        if (time_is_before_jiffies(start_time + HZ / 10UL))
                                break;
                        if (work->nr_pages <= 0)
                                break;
                }
        }
        return total_wrote;
}

/*
 * Drain wb->b_io one superblock at a time.
 *
 * @wb:   writeback context whose b_io list is processed
 * @work: the request being served
 *
 * For the inode at the tail of b_io, try to take its superblock's s_umount
 * for reading and hand the superblock to writeback_sb_inodes().  Returns the
 * accumulated progress reported by writeback_sb_inodes().  Inodes that were
 * not written are left on b_io.
 */
static long __writeback_inodes_wb(struct bdi_writeback *wb,
                                  struct wb_writeback_work *work)
{
        unsigned long deadline = jiffies + HZ / 10UL;
        long total = 0;

        while (!list_empty(&wb->b_io)) {
                struct inode *tail = wb_inode(wb->b_io.prev);
                struct super_block *sb = tail->i_sb;

                if (!super_trylock_shared(sb)) {
                        /*
                         * super_trylock_shared() may fail consistently due to
                         * s_umount being grabbed by someone else. Don't use
                         * requeue_io() to avoid busy retrying the inode/sb.
                         */
                        redirty_tail(tail, wb);
                        continue;
                }

                total += writeback_sb_inodes(sb, wb, work);
                up_read(&sb->s_umount);

                /* Same bail-out tests as at the end of writeback_sb_inodes(). */
                if (!total)
                        continue;
                if (time_is_before_jiffies(deadline) || work->nr_pages <= 0)
                        break;
        }

        return total;
}

static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
                                enum wb_reason reason)
{
        struct wb_writeback_work work = {
                .nr_pages        = nr_pages,
                .sync_mode        = WB_SYNC_NONE,
                .range_cyclic        = 1,
                .reason                = reason,
        };
        struct blk_plug plug;

        blk_start_plug(&plug);
        spin_lock(&wb->list_lock);
        if (list_empty(&wb->b_io))
                queue_io(wb, &work, jiffies);
        __writeback_inodes_wb(wb, &work);
        spin_unlock(&wb->list_lock);
        blk_finish_plug(&plug);

        return nr_pages - work.nr_pages;
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * dirtied_before takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 *
 * @wb:   writeback context to operate on
 * @work: the request; work->nr_pages is consumed by the helpers called here
 *
 * Returns the part of the initial work->nr_pages budget that was consumed
 * (initial value minus the value left when the loop ends).
 */
static long wb_writeback(struct bdi_writeback *wb,
                         struct wb_writeback_work *work)
{
        long nr_pages = work->nr_pages;
        unsigned long dirtied_before = jiffies;
        struct inode *inode;
        long progress;
        struct blk_plug plug;
        /* Becomes true once queue_io() has been called during this invocation. */
        bool queued = false;

        blk_start_plug(&plug);
        for (;;) {
                /*
                 * Stop writeback when nr_pages has been consumed
                 */
                if (work->nr_pages <= 0)
                        break;

                /*
                 * Background writeout and kupdate-style writeback may
                 * run forever. Stop them if there is other work to do
                 * so that e.g. sync can proceed. They'll be restarted
                 * after the other works are all done.
                 */
                if ((work->for_background || work->for_kupdate) &&
                    !list_empty(&wb->work_list))
                        break;

                /*
                 * For background writeout, stop when we are below the
                 * background dirty threshold
                 */
                if (work->for_background && !wb_over_bg_thresh(wb))
                        break;


                spin_lock(&wb->list_lock);

                trace_writeback_start(wb, work);
                if (list_empty(&wb->b_io)) {
                        /*
                         * Kupdate and background works are special and we want
                         * to include all inodes that need writing. Livelock
                         * avoidance is handled by these works yielding to any
                         * other work so we are safe.
                         */
                        if (work->for_kupdate) {
                                dirtied_before = jiffies -
                                        msecs_to_jiffies(dirty_expire_interval *
                                                         10);
                        } else if (work->for_background)
                                dirtied_before = jiffies;

                        /*
                         * For all other work dirtied_before keeps the value
                         * sampled on entry to this function.
                         */
                        queue_io(wb, work, dirtied_before);
                        queued = true;
                }
                /* A superblock-targeted request only walks that superblock. */
                if (work->sb)
                        progress = writeback_sb_inodes(work->sb, wb, work);
                else
                        progress = __writeback_inodes_wb(wb, work);
                trace_writeback_written(wb, work);

                /*
                 * Did we write something? Try for more
                 *
                 * Dirty inodes are moved to b_io for writeback in batches.
                 * The completion of the current batch does not necessarily
                 * mean the overall work is done. So we keep looping as long
                 * as made some progress on cleaning pages or inodes.
                 *
                 * We also loop again if nothing has been queued by this
                 * invocation yet, i.e. b_io was already populated whenever
                 * it was checked above.
                 */
                if (progress || !queued) {
                        spin_unlock(&wb->list_lock);
                        continue;
                }

                /*
                 * No more inodes for IO, bail
                 */
                if (list_empty(&wb->b_more_io)) {
                        spin_unlock(&wb->list_lock);
                        break;
                }

                /*
                 * Nothing written. Wait for some inode to
                 * become available for writeback. Otherwise
                 * we'll just busyloop.
                 */
                trace_writeback_wait(wb, work);
                inode = wb_inode(wb->b_more_io.prev);
                /* Take i_lock before dropping list_lock, then sleep on the inode. */
                spin_lock(&inode->i_lock);
                spin_unlock(&wb->list_lock);
                /* This function drops i_lock... */
                inode_sleep_on_writeback(inode);
        }
        blk_finish_plug(&plug);

        return nr_pages - work->nr_pages;
}

/*
 * Detach and return the first pending wb_writeback_work on @wb->work_list,
 * or NULL when the list is empty.  The list is accessed under wb->work_lock
 * with interrupts disabled.
 */
static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
{
        struct wb_writeback_work *next;

        spin_lock_irq(&wb->work_lock);
        next = list_first_entry_or_null(&wb->work_list,
                                        struct wb_writeback_work, list);
        if (next)
                list_del_init(&next->list);
        spin_unlock_irq(&wb->work_lock);

        return next;
}

/*
 * Start background writeback on @wb if it is over the background threshold.
 *
 * Returns whatever wb_writeback() reports for the background work, or 0 when
 * wb_over_bg_thresh() says no background flush is needed.
 */
static long wb_check_background_flush(struct bdi_writeback *wb)
{
        struct wb_writeback_work work = {
                .reason                = WB_REASON_BACKGROUND,
                .sync_mode        = WB_SYNC_NONE,
                .for_background        = 1,
                .range_cyclic        = 1,
                .nr_pages        = LONG_MAX,
        };

        if (!wb_over_bg_thresh(wb))
                return 0;

        return wb_writeback(wb, &work);
}

/*
 * Periodic (kupdate-style) flush of old data on @wb.
 *
 * Does nothing when dirty_writeback_interval is zero or when the interval
 * has not yet elapsed since wb->last_old_flush.  Otherwise records the new
 * flush time and, if there are dirty pages, runs a WB_REASON_PERIODIC
 * writeback sized by get_nr_dirty_pages().
 *
 * Returns the result of wb_writeback(), or 0 if no writeback was started.
 */
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
        struct wb_writeback_work work = {
                .reason                = WB_REASON_PERIODIC,
                .sync_mode        = WB_SYNC_NONE,
                .for_kupdate        = 1,
                .range_cyclic        = 1,
        };
        unsigned long next_flush;

        /* When set to zero, periodic writeback is disabled. */
        if (!dirty_writeback_interval)
                return 0;

        next_flush = wb->last_old_flush +
                        msecs_to_jiffies(dirty_writeback_interval * 10);
        if (time_before(jiffies, next_flush))
                return 0;

        wb->last_old_flush = jiffies;

        work.nr_pages = get_nr_dirty_pages();
        if (!work.nr_pages)
                return 0;

        return wb_writeback(wb, &work);
}

/*
 * Serve a pending "start all" request on @wb.
 *
 * If WB_start_all is set in wb->state, write back this wb's share (as
 * computed by wb_split_bdi_pages()) of the currently dirty pages, then clear
 * the bit.  Returns the result of wb_writeback(), or 0 if the bit was not
 * set or there were no dirty pages.
 */
static long wb_check_start_all(struct bdi_writeback *wb)
{
        long written = 0;
        long dirty;

        if (!test_bit(WB_start_all, &wb->state))
                return 0;

        dirty = get_nr_dirty_pages();
        if (dirty) {
                struct wb_writeback_work work = {
                        .reason                = wb->start_all_reason,
                        .sync_mode        = WB_SYNC_NONE,
                        .range_cyclic        = 1,
                        .nr_pages        = wb_split_bdi_pages(wb, dirty),
                };

                written = wb_writeback(wb, &work);
        }

        /* The request is considered served even if nothing was dirty. */
        clear_bit(WB_start_all, &wb->state);

        return written;
}


/*
 * Retrieve work items and do the writeback they describe.
 *
 * WB_writeback_running is set in wb->state for the duration of the call.
 * After the queued work items have been drained, the flush-everything,
 * periodic and background checks are run in that order.  Returns the sum of
 * what all of them reported as written.
 */
static long wb_do_writeback(struct bdi_writeback *wb)
{
        struct wb_writeback_work *item;
        long total = 0;

        set_bit(WB_writeback_running, &wb->state);

        for (item = get_next_work_item(wb); item;
             item = get_next_work_item(wb)) {
                trace_writeback_exec(wb, item);
                total += wb_writeback(wb, item);
                finish_writeback_work(item);
        }

        /* Check for a flush-everything request. */
        total += wb_check_start_all(wb);

        /* Check for periodic writeback, kupdated() style. */
        total += wb_check_old_data_flush(wb);

        /* Finally, see whether background writeback is called for. */
        total += wb_check_background_flush(wb);

        clear_bit(WB_writeback_running, &wb->state);

        return total;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * reschedules periodically and does kupdated style flushing.
 *
 * @work: the work_struct embedded in the delayed work bdi_writeback::dwork
 */
void wb_workfn(struct work_struct *work)
{
        struct bdi_writeback *wb = container_of(to_delayed_work(work),
                                                struct bdi_writeback, dwork);
        long pages_written;

        set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));

        if (unlikely(current_is_workqueue_rescuer() &&
                     test_bit(WB_registered, &wb->state))) {
                /*
                 * bdi_wq can't get enough workers and we're running off
                 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
                 * enough for efficient IO.
                 */
                pages_written = writeback_inodes_wb(wb, 1024,
                                                    WB_REASON_FORKER_THREAD);
                trace_writeback_pages_written(pages_written);
        } else {
                /*
                 * The normal path.  Keep writing back @wb until its
                 * work_list is empty.  This path is also taken when @wb is
                 * shutting down, even if we are running off the rescuer,
                 * because work_list needs to be drained.
                 */
                do {
                        pages_written = wb_do_writeback(wb);
                        trace_writeback_pages_written(pages_written);
                } while (!list_empty(&wb->work_list));
        }

        /* More work arrived: run again right away. */
        if (!list_empty(&wb->work_list)) {
                wb_wakeup(wb);
                return;
        }

        /* Otherwise re-arm the delayed work if dirty IO remains. */
        if (wb_has_dirty_io(wb) && dirty_writeback_interval)
                wb_wakeup_delayed(wb);
}

/*
 * Start writeback of all dirty pages on this bdi.
 */
static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
                                         enum wb_reason reason)
{
        struct bdi_writeback *wb;

        if (!bdi_has_dirty_io(bdi))
                return;

        list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
                wb_start_writeback(wb, reason);
}

/*
 * Start writeback of all dirty pages on @bdi.
 *
 * Wrapper around __wakeup_flusher_threads_bdi() that provides the RCU
 * read-side section needed for walking the bdi's wb list.
 *
 * @bdi:    backing device whose writeback contexts are kicked
 * @reason: passed through to the writeback start request
 */
void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
                                enum wb_reason reason)
{
        rcu_read_lock();
        __wakeup_flusher_threads_bdi(bdi, reason);
        rcu_read_unlock();
}

/*
 * Wakeup the flusher threads to start writeback of all currently dirty pages
 */
void wakeup_flusher_threads(enum wb_reason reason)
{
        struct backing_dev_info *bdi;

        /*
         * If we are expecting writeback progress we must submit plugged IO.
         */
        blk_flush_plug(current->plug, true);

        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
                __wakeup_flusher_threads_bdi(bdi, reason);
        rcu_read_unlock();
}

/*
 * Wake up bdi's periodically to make sure dirtytime inodes get written
 * back periodically.  We deliberately do *not* check the b_dirty_time
 * list in wb_has_dirty_io(), since this would cause the kernel to be
 * constantly waking up once there are any dirtytime inodes on the
 * system.  So instead we define a separate delayed work function which
 * gets called much more rarely.  (By default, only once every 12 hours.)
 *
 * If there is any other write activity going on in the file system,
 * this function won't be necessary.  But if the only thing that has
 * happened on the file system is a dirtytime inode caused by an atime
 * update, we need this infrastructure below to make sure that inode
 * eventually gets pushed out to disk.
 */
static void wakeup_dirtytime_writeback(struct work_struct *w);
static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);

/*
 * Wake every wb that has something on its b_dirty_time list, then re-arm
 * dirtytime_work to fire again after dirtytime_expire_interval * HZ jiffies.
 */
static void wakeup_dirtytime_writeback(struct work_struct *w)
{
        struct backing_dev_info *bdi;
        struct bdi_writeback *wb;

        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
                        if (list_empty(&wb->b_dirty_time))
                                continue;
                        wb_wakeup(wb);
                }
        }
        rcu_read_unlock();

        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
}

/*
 * Init-time hook: arm the first run of dirtytime_work, to fire after
 * dirtytime_expire_interval * HZ jiffies.  Always returns 0.
 */
static int __init start_dirtytime_writeback(void)
{
        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
        return 0;
}
__initcall(start_dirtytime_writeback);

/*
 * ctl_table handler wrapping proc_dointvec_minmax().
 *
 * After a successful write, dirtytime_work is rescheduled on system_wq with
 * a zero delay so that it runs right away.  Returns the result of
 * proc_dointvec_minmax().
 */
int dirtytime_interval_handler(struct ctl_table *table, int write,
                               void *buffer, size_t *lenp, loff_t *ppos)
{
        int err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        /* Reads and failed writes leave the pending work untouched. */
        if (err || !write)
                return err;

        mod_delayed_work(system_wq, &dirtytime_work, 0);
        return err;
}

/**
 * __mark_inode_dirty -        internal function to mark an inode dirty
 *
 * @inode: inode to mark
 * @flags: what kind of dirty, e.g. I_DIRTY_SYNC.  This can be a combination of
 *           multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined
 *           with I_DIRTY_PAGES.
 *
 * Mark an inode as dirty.  We notify the filesystem, then update the inode's
 * dirty flags.  Then, if needed we add the inode to the appropriate dirty list.
 *
 * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync()
 * instead of calling this directly.
 *
 * CAREFUL!  We only add the inode to the dirty list if it is hashed or if it
 * refers to a blockdev.  Unhashed inodes will never be added to the dirty list
 * even if they are later hashed, as they will have been marked dirty already.
 *
 * In short, ensure you hash any inodes _before_ you start marking them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
        struct super_block *sb = inode->i_sb;
        /* Non-zero only for a pure I_DIRTY_TIME request (set in the else branch). */
        int dirtytime = 0;
        /* Set only when the wb list lock is taken below, i.e. if !was_dirty. */
        struct bdi_writeback *wb = NULL;

        trace_writeback_mark_inode_dirty(inode, flags);

        if (flags & I_DIRTY_INODE) {
                /*
                 * Inode timestamp update will piggyback on this dirtying.
                 * We tell ->dirty_inode callback that timestamps need to
                 * be updated by setting I_DIRTY_TIME in flags.
                 */
                if (inode->i_state & I_DIRTY_TIME) {
                        spin_lock(&inode->i_lock);
                        /* Re-check under i_lock; the test above was lockless. */
                        if (inode->i_state & I_DIRTY_TIME) {
                                inode->i_state &= ~I_DIRTY_TIME;
                                flags |= I_DIRTY_TIME;
                        }
                        spin_unlock(&inode->i_lock);
                }

                /*
                 * Notify the filesystem about the inode being dirtied, so that
                 * (if needed) it can update on-disk fields and journal the
                 * inode.  This is only needed when the inode itself is being
                 * dirtied now.  I.e. it's only needed for I_DIRTY_INODE, not
                 * for just I_DIRTY_PAGES or I_DIRTY_TIME.
                 */
                trace_writeback_dirty_inode_start(inode, flags);
                if (sb->s_op->dirty_inode)
                        sb->s_op->dirty_inode(inode,
                                flags & (I_DIRTY_INODE | I_DIRTY_TIME));
                trace_writeback_dirty_inode(inode, flags);

                /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
                flags &= ~I_DIRTY_TIME;
        } else {
                /*
                 * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing.
                 * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME
                 * in one call to __mark_inode_dirty().)
                 */
                dirtytime = flags & I_DIRTY_TIME;
                WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME);
        }

        /*
         * Paired with smp_mb() in __writeback_single_inode() for the
         * following lockless i_state test.  See there for details.
         */
        smp_mb();

        /* Lockless fast path: every requested bit is already set. */
        if ((inode->i_state & flags) == flags)
                return;

        spin_lock(&inode->i_lock);
        /* Re-check under i_lock before modifying i_state. */
        if ((inode->i_state & flags) != flags) {
                const int was_dirty = inode->i_state & I_DIRTY;

                inode_attach_wb(inode, NULL);

                inode->i_state |= flags;

                /*
                 * Grab inode's wb early because it requires dropping i_lock and we
                 * need to make sure following checks happen atomically with dirty
                 * list handling so that we don't move inodes under flush worker's
                 * hands.
                 */
                if (!was_dirty) {
                        wb = locked_inode_to_wb_and_lock_list(inode);
                        spin_lock(&inode->i_lock);
                }

                /*
                 * If the inode is queued for writeback by flush worker, just
                 * update its dirty state. Once the flush worker is done with
                 * the inode it will place it on the appropriate superblock
                 * list, based upon its state.
                 */
                if (inode->i_state & I_SYNC_QUEUED)
                        goto out_unlock;

                /*
                 * Only add valid (hashed) inodes to the superblock's
                 * dirty list.  Add blockdev inodes as well.
                 */
                if (!S_ISBLK(inode->i_mode)) {
                        if (inode_unhashed(inode))
                                goto out_unlock;
                }
                if (inode->i_state & I_FREEING)
                        goto out_unlock;

                /*
                 * If the inode was already on b_dirty/b_io/b_more_io, don't
                 * reposition it (that would break b_dirty time-ordering).
                 */
                if (!was_dirty) {
                        struct list_head *dirty_list;
                        bool wakeup_bdi = false;

                        inode->dirtied_when = jiffies;
                        if (dirtytime)
                                inode->dirtied_time_when = jiffies;

                        /*
                         * Any I_DIRTY bit puts the inode on b_dirty; with
                         * none of them set it goes to b_dirty_time.
                         */
                        if (inode->i_state & I_DIRTY)
                                dirty_list = &wb->b_dirty;
                        else
                                dirty_list = &wb->b_dirty_time;

                        wakeup_bdi = inode_io_list_move_locked(inode, wb,
                                                               dirty_list);

                        spin_unlock(&wb->list_lock);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_dirty_inode_enqueue(inode);

                        /*
                         * If this is the first dirty inode for this bdi,
                         * we have to wake-up the corresponding bdi thread
                         * to make sure background write-back happens
                         * later.
                         */
                        if (wakeup_bdi &&
                            (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
                                wb_wakeup_delayed(wb);
                        return;
                }
        }
out_unlock:
        /* wb is non-NULL exactly when its list_lock was taken above. */
        if (wb)
                spin_unlock(&wb->list_lock);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);

/*
 * Wait for writeback to finish on the inodes of @sb that are currently on
 * its s_inodes_wb list.
 *
 * The @s_sync_lock is used to serialise concurrent sync operations
 * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
 * Concurrent callers will block on the s_sync_lock rather than doing contending
 * walks. The queueing maintains sync(2) required behaviour as all the IO that
 * has been issued up to the time this function is entered is guaranteed to be
 * completed by the time we have gained the lock and waited for all IO that is
 * in progress regardless of the order callers are granted the lock.
 */
static void wait_sb_inodes(struct super_block *sb)
{
        LIST_HEAD(sync_list);

        /*
         * We need to be protected against the filesystem going from
         * r/o to r/w or vice versa.
         */
        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        mutex_lock(&sb->s_sync_lock);

        /*
         * Splice the writeback list onto a temporary list to avoid waiting on
         * inodes that have started writeback after this point.
         *
         * Use rcu_read_lock() to keep the inodes around until we have a
         * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as
         * the local list because inodes can be dropped from either by writeback
         * completion.
         */
        rcu_read_lock();
        spin_lock_irq(&sb->s_inode_wblist_lock);
        list_splice_init(&sb->s_inodes_wb, &sync_list);

        /*
         * Data integrity sync. Must wait for all pages under writeback, because
         * there may have been pages dirtied before our sync call, but which had
         * writeout started before we write it out.  In which case, the inode
         * may not be on the dirty list, but we still have to wait for that
         * writeout.
         */
        while (!list_empty(&sync_list)) {
                struct inode *inode = list_first_entry(&sync_list, struct inode,
                                                       i_wb_list);
                struct address_space *mapping = inode->i_mapping;

                /*
                 * Move each inode back to the wb list before we drop the lock
                 * to preserve consistency between i_wb_list and the mapping
                 * writeback tag. Writeback completion is responsible to remove
                 * the inode from either list once the writeback tag is cleared.
                 */
                list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);

                /*
                 * The mapping can appear untagged while still on-list since we
                 * do not have the mapping lock. Skip it here, wb completion
                 * will remove it.
                 */
                if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
                        continue;

                spin_unlock_irq(&sb->s_inode_wblist_lock);

                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
                        spin_unlock(&inode->i_lock);

                        /*
                         * Skip this inode. The RCU read lock was never
                         * dropped on this path, so only the list lock
                         * needs to be retaken before the next iteration.
                         */
                        spin_lock_irq(&sb->s_inode_wblist_lock);
                        continue;
                }
                /* Take a reference so the inode stays around once RCU is dropped. */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                rcu_read_unlock();

                /*
                 * We keep the error status of individual mapping so that
                 * applications can catch the writeback error using fsync(2).
                 * See filemap_fdatawait_keep_errors() for details.
                 */
                filemap_fdatawait_keep_errors(mapping);

                cond_resched();

                iput(inode);

                /* Re-establish the locking state expected at the loop head. */
                rcu_read_lock();
                spin_lock_irq(&sb->s_inode_wblist_lock);
        }
        spin_unlock_irq(&sb->s_inode_wblist_lock);
        rcu_read_unlock();
        mutex_unlock(&sb->s_sync_lock);
}

/*
 * Shared worker behind writeback_inodes_sb_nr() and
 * try_to_writeback_inodes_sb().
 *
 * Describes a WB_SYNC_NONE, tagged-writepages request for @nr pages of @sb,
 * passes it to bdi_split_work_to_wbs() together with @skip_if_busy and then
 * blocks in wb_wait_for_completion() until that request is finished.
 * Nothing is queued when the bdi reports no dirty IO or when it is the
 * no-op bdi.  A warning is emitted if sb->s_umount is not locked.
 */
static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
                                     enum wb_reason reason, bool skip_if_busy)
{
        struct backing_dev_info *bdi = sb->s_bdi;
        DEFINE_WB_COMPLETION(completion, bdi);
        struct wb_writeback_work wb_work = {
                .reason = reason,
                .nr_pages = nr,
                .done = &completion,
                .tagged_writepages = 1,
                .sync_mode = WB_SYNC_NONE,
                .sb = sb,
        };

        /* Nothing dirty on this bdi: no work to hand out. */
        if (!bdi_has_dirty_io(bdi))
                return;
        /* The no-op bdi never gets writeback work. */
        if (bdi == &noop_backing_dev_info)
                return;

        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        bdi_split_work_to_wbs(sb->s_bdi, &wb_work, skip_if_busy);
        wb_wait_for_completion(&completion);
}

/**
 * writeback_inodes_sb_nr -        writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 *
 * This is a thin wrapper around __writeback_inodes_sb_nr() that always passes
 * skip_if_busy as false.
 */
void writeback_inodes_sb_nr(struct super_block *sb,
                            unsigned long nr,
                            enum wb_reason reason)
{
        /* Last argument is skip_if_busy. */
        __writeback_inodes_sb_nr(sb, nr, reason, false);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);

/**
 * writeback_inodes_sb        -        writeback dirty inodes from given super_block
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 *
 * Equivalent to writeback_inodes_sb_nr() with the page count taken from
 * get_nr_dirty_pages() at call time.
 */
void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
        writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * try_to_writeback_inodes_sb - try to start writeback if none underway
 * @sb: the superblock
 * @reason: reason why some writeback work was initiated
 *
 * Attempts to take sb->s_umount for reading without blocking.  If the lock
 * cannot be taken the function returns without doing anything; otherwise it
 * calls __writeback_inodes_sb_nr() for get_nr_dirty_pages() pages with
 * skip_if_busy set and releases the lock again.
 */
void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
        if (down_read_trylock(&sb->s_umount)) {
                __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason,
                                         true);
                up_read(&sb->s_umount);
        }
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb);

/**
 * sync_inodes_sb        -        sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
        struct backing_dev_info *bdi = sb->s_bdi;
        DEFINE_WB_COMPLETION(done, bdi);
        struct wb_writeback_work work = {
                .sb                = sb,
                .sync_mode        = WB_SYNC_ALL,
                .nr_pages        = LONG_MAX,
                .range_cyclic        = 0,
                .done                = &done,
                .reason                = WB_REASON_SYNC,
                .for_sync        = 1,
        };

        /*
         * Can't skip on !bdi_has_dirty() because we should wait for !dirty
         * inodes under writeback and I_DIRTY_TIME inodes ignored by
         * bdi_has_dirty() need to be written out too.
         */
        if (bdi == &noop_backing_dev_info)
                return;
        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        /* protect against inode wb switch, see inode_switch_wbs_work_fn() */
        bdi_down_write_wb_switch_rwsem(bdi);
        bdi_split_work_to_wbs(bdi, &work, false);
        wb_wait_for_completion(&done);
        bdi_up_write_wb_switch_rwsem(bdi);

        wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now        -        write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * Commits a dirty inode to disk right away; knfsd is the primary user.
 * The whole file range is covered.  When the inode's mapping cannot do
 * writeback the page budget is forced to zero before
 * writeback_single_inode() is called.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 *
 * Returns whatever writeback_single_inode() returns.
 */
int write_inode_now(struct inode *inode, int sync)
{
        struct writeback_control wbc = {
                .range_start = 0,
                .range_end = LLONG_MAX,
                .nr_to_write = LONG_MAX,
        };

        if (sync)
                wbc.sync_mode = WB_SYNC_ALL;
        else
                wbc.sync_mode = WB_SYNC_NONE;

        /* No page budget for a mapping that cannot be written back. */
        if (!mapping_can_writeback(inode->i_mapping))
                wbc.nr_to_write = 0;

        might_sleep();

        return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Writes the inode itself to disk and adjusts its dirty state once that is
 * done.  The page budget is zero, so only the actual inode is written: no
 * associated data or other metadata.
 *
 * Returns whatever writeback_single_inode() returns.
 */
int sync_inode_metadata(struct inode *inode, int wait)
{
        struct writeback_control wbc = {
                .nr_to_write = 0, /* metadata-only */
        };

        if (wait)
                wbc.sync_mode = WB_SYNC_ALL;
        else
                wbc.sync_mode = WB_SYNC_NONE;

        return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);




























    6 




    6 



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Lock-less NULL terminated single linked list
 *
 * The basic atomic operation of this list is cmpxchg on long.  On
 * architectures that don't have NMI-safe cmpxchg implementation, the
 * list can NOT be used in NMI handlers.  So code that uses the list in
 * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
 *
 * Copyright 2010,2011 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 */
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/llist.h>


/**
 * llist_add_batch - add several linked entries in batch
 * @new_first:        first entry in batch to be added
 * @new_last:        last entry in batch to be added
 * @head:        the head for your lock-less list
 *
 * Only @new_last->next is written here, so the batch is expected to be
 * linked from @new_first through to @new_last already.
 *
 * Return whether list is empty before adding.
 */
bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
                     struct llist_head *head)
{
        /* Snapshot of the current first entry, taken without any lock. */
        struct llist_node *first = READ_ONCE(head->first);

        do {
                /* Hang the existing list behind the batch. */
                new_last->next = first;
                /*
                 * Install @new_first only if the head still equals @first.
                 * The retry relies on try_cmpxchg() leaving the current head
                 * value in @first on failure, so the link above is redone.
                 */
        } while (!try_cmpxchg(&head->first, &first, new_first));

        /* @first is the head value the batch replaced; NULL means empty. */
        return !first;
}
EXPORT_SYMBOL_GPL(llist_add_batch);

/**
 * llist_del_first - delete the first entry of lock-less list
 * @head:        the head for your lock-less list
 *
 * If list is empty, return NULL, otherwise, return the first entry
 * deleted, this is the newest added one.
 *
 * Only one llist_del_first user can be used simultaneously with
 * multiple llist_add users without lock.  Because otherwise
 * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add,
 * llist_add) sequence in another user may change @head->first->next,
 * but keep @head->first.  If multiple consumers are needed, please
 * use llist_del_all or use lock between consumers.
 */
struct llist_node *llist_del_first(struct llist_head *head)
{
        struct llist_node *entry, *next;

        /* Load the current first entry with acquire ordering. */
        entry = smp_load_acquire(&head->first);
        do {
                /* Empty on entry, or emptied before a retry. */
                if (entry == NULL)
                        return NULL;
                next = READ_ONCE(entry->next);
                /*
                 * Move the head on to @next only if it still equals @entry.
                 * The retry relies on try_cmpxchg() leaving the current head
                 * value in @entry on failure.
                 */
        } while (!try_cmpxchg(&head->first, &entry, next));

        return entry;
}
EXPORT_SYMBOL_GPL(llist_del_first);

/**
 * llist_del_first_this - delete given entry of lock-less list if it is first
 * @head:        the head for your lock-less list
 * @this:        a list entry.
 *
 * If head of the list is given entry, delete and return %true else
 * return %false.
 *
 * Multiple callers can safely call this concurrently with multiple
 * llist_add() callers, providing all the callers offer a different @this.
 */
bool llist_del_first_this(struct llist_head *head,
                          struct llist_node *this)
{
        struct llist_node *entry, *next;

        /* acquire ensures ordering wrt try_cmpxchg() in llist_del_first() */
        entry = smp_load_acquire(&head->first);
        do {
                /* Give up as soon as the first entry is not @this. */
                if (entry != this)
                        return false;
                next = READ_ONCE(entry->next);
                /*
                 * Unlink @this only if it is still the first entry.  The
                 * retry relies on try_cmpxchg() leaving the current head
                 * value in @entry on failure.
                 */
        } while (!try_cmpxchg(&head->first, &entry, next));

        return true;
}
EXPORT_SYMBOL_GPL(llist_del_first_this);

/**
 * llist_reverse_order - reverse order of a llist chain
 * @head:        first item of the list to be reversed
 *
 * Walks the NULL terminated chain once and points every entry's ->next at
 * the entry visited just before it.  Returns the new first entry, which is
 * the old last one, or NULL when @head is NULL.
 */
struct llist_node *llist_reverse_order(struct llist_node *head)
{
        struct llist_node *reversed = NULL;
        struct llist_node *rest;

        for (; head; head = rest) {
                /* Remember the unvisited remainder before relinking. */
                rest = head->next;
                head->next = reversed;
                reversed = head;
        }

        return reversed;
}
EXPORT_SYMBOL_GPL(llist_reverse_order);














































































































    1 




    1 

    1 

































    1 





    1 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 



    1 



















    1 





    1 



    1 





    1 




























































































    1 










    1 















    1 






    1 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
// SPDX-License-Identifier: GPL-2.0-only
/*
 * super.c
 *
 * PURPOSE
 *  Super block routines for the OSTA-UDF(tm) filesystem.
 *
 * DESCRIPTION
 *  OSTA-UDF(tm) = Optical Storage Technology Association
 *  Universal Disk Format.
 *
 *  This code is based on version 2.00 of the UDF specification,
 *  and revision 3 of the ECMA 167 standard [equivalent to ISO 13346].
 *    http://www.osta.org/
 *    https://www.ecma.ch/
 *    https://www.iso.org/
 *
 * COPYRIGHT
 *  (C) 1998 Dave Boynton
 *  (C) 1998-2004 Ben Fennema
 *  (C) 2000 Stelias Computing Inc
 *
 * HISTORY
 *
 *  09/24/98 dgb  changed to allow compiling outside of kernel, and
 *                added some debugging.
 *  10/01/98 dgb  updated to allow (some) possibility of compiling w/2.0.34
 *  10/16/98      attempting some multi-session support
 *  10/17/98      added freespace count for "df"
 *  11/11/98 gr   added novrs option
 *  11/26/98 dgb  added fileset,anchor mount options
 *  12/06/98 blf  really hosed things royally. vat/sparing support. sequenced
 *                vol descs. rewrote option handling based on isofs
 *  12/20/98      find the free space bitmap (if it exists)
 */

#include "udfdecl.h"

#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stat.h>
#include <linux/cdrom.h>
#include <linux/nls.h>
#include <linux/vfs.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>
#include <linux/seq_file.h>
#include <linux/bitmap.h>
#include <linux/crc-itu-t.h>
#include <linux/log2.h>
#include <asm/byteorder.h>
#include <linux/iversion.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>

#include "udf_sb.h"
#include "udf_i.h"

#include <linux/init.h>
#include <linux/uaccess.h>

/*
 * One slot per kind of volume descriptor named below (primary, unallocated
 * space, logical volume, implementation use).  VDS_POS_LENGTH comes last and
 * therefore equals the number of slots.
 */
enum {
        VDS_POS_PRIMARY_VOL_DESC,
        VDS_POS_UNALLOC_SPACE_DESC,
        VDS_POS_LOGICAL_VOL_DESC,
        VDS_POS_IMP_USE_VOL_DESC,
        VDS_POS_LENGTH
};

/*
 * NOTE(review): presumably the first and the largest offset at which a
 * Volume Structure Descriptor is looked for (32768 = 32 KiB,
 * 0x800000 = 8 MiB) - confirm the unit against the users of these macros.
 */
#define VSD_FIRST_SECTOR_OFFSET                32768
#define VSD_MAX_SECTOR_OFFSET                0x800000

/*
 * Maximum number of Terminating Descriptor / Logical Volume Integrity
 * Descriptor redirections. The chosen numbers are arbitrary - just that we
 * hopefully don't limit any real use of rewritten inode on write-once media
 * but avoid looping for too long on corrupted media.
 */
#define UDF_MAX_TD_NESTING 64
#define UDF_MAX_LVID_NESTING 1000

/* Link count ceiling; 0xffff is the largest value a 16-bit field can hold. */
enum { UDF_MAX_LINKS = 0xffff };
/*
 * We limit filesize to 4TB. This is arbitrary as the on-disk format supports
 * more but because the file space is described by a linked list of extents,
 * each of which can have at most 1GB, the creation and handling of extents
 * gets unusably slow beyond certain point...
 */
#define UDF_MAX_FILESIZE (1ULL << 42)

/* These are the "meat" - everything else is stuffing */
static int udf_fill_super(struct super_block *sb, struct fs_context *fc);
static void udf_put_super(struct super_block *);
static int udf_sync_fs(struct super_block *, int);
static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
static void udf_open_lvid(struct super_block *);
static void udf_close_lvid(struct super_block *);
static unsigned int udf_count_free(struct super_block *);
static int udf_statfs(struct dentry *, struct kstatfs *);
static int udf_show_options(struct seq_file *, struct dentry *);
static int udf_init_fs_context(struct fs_context *fc);
static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param);
static int udf_reconfigure(struct fs_context *fc);
static void udf_free_fc(struct fs_context *fc);
static const struct fs_parameter_spec udf_param_spec[];

struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
{
        struct logicalVolIntegrityDesc *lvid;
        unsigned int partnum;
        unsigned int offset;

        if (!UDF_SB(sb)->s_lvid_bh)
                return NULL;
        lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data;
        partnum = le32_to_cpu(lvid->numOfPartitions);
        /* The offset is to skip freeSpaceTable and sizeTable arrays */
        offset = partnum * 2 * sizeof(uint32_t);
        return (struct logicalVolIntegrityDescImpUse *)
                                        (((uint8_t *)(lvid + 1)) + offset);
}

/* UDF filesystem type */

/*
 * ->get_tree callback of udf_context_ops: delegates to get_tree_bdev() with
 * udf_fill_super() as the superblock fill routine and returns its result.
 */
static int udf_get_tree(struct fs_context *fc)
{
        return get_tree_bdev(fc, udf_fill_super);
}

/*
 * fs_context callbacks for UDF; udf_init_fs_context() stores a pointer to
 * this table in fc->ops.
 */
static const struct fs_context_operations udf_context_ops = {
        .parse_param        = udf_parse_param,
        .get_tree        = udf_get_tree,
        .reconfigure        = udf_reconfigure,
        .free                = udf_free_fc,
};

/*
 * The "udf" filesystem type.  init_udf_fs() registers it and exit_udf_fs()
 * unregisters it; mount parameters are described by udf_param_spec.
 */
static struct file_system_type udf_fstype = {
        .owner                = THIS_MODULE,
        .name                = "udf",
        .kill_sb        = kill_block_super,
        .fs_flags        = FS_REQUIRES_DEV,
        .init_fs_context = udf_init_fs_context,
        .parameters        = udf_param_spec,
};
MODULE_ALIAS_FS("udf");

static struct kmem_cache *udf_inode_cachep;

/*
 * ->alloc_inode callback of udf_sb_ops.
 *
 * Takes a udf_inode_info from udf_inode_cachep, resets the UDF specific
 * fields to their initial values and hands back the embedded VFS inode.
 * Returns NULL when the allocation fails.
 */
static struct inode *udf_alloc_inode(struct super_block *sb)
{
        struct udf_inode_info *iinfo;

        iinfo = alloc_inode_sb(sb, udf_inode_cachep, GFP_KERNEL);
        if (iinfo == NULL)
                return NULL;

        /* Locks protecting the inode data and the extent cache. */
        init_rwsem(&iinfo->i_data_sem);
        spin_lock_init(&iinfo->i_extent_cache_lock);

        /* Extent cache starts out with lstart set to -1. */
        iinfo->cached_extent.lstart = -1;

        /* Counters and allocation hints start at zero. */
        iinfo->i_unique = 0;
        iinfo->i_lenExtents = 0;
        iinfo->i_lenStreams = 0;
        iinfo->i_next_alloc_block = 0;
        iinfo->i_next_alloc_goal = 0;

        /* Per-inode flags start cleared. */
        iinfo->i_strat4096 = 0;
        iinfo->i_streamdir = 0;
        iinfo->i_hidden = 0;

        inode_set_iversion(&iinfo->vfs_inode, 1);

        return &iinfo->vfs_inode;
}

/*
 * ->free_inode callback of udf_sb_ops: gives the udf_inode_info obtained via
 * UDF_I(@inode) back to udf_inode_cachep.
 */
static void udf_free_in_core_inode(struct inode *inode)
{
        kmem_cache_free(udf_inode_cachep, UDF_I(inode));
}

/*
 * Callback handed to kmem_cache_create() for udf_inode_cachep in
 * init_inodecache().  @foo points at a struct udf_inode_info: its i_data
 * pointer is cleared and inode_init_once() is run on the embedded VFS inode.
 */
static void init_once(void *foo)
{
        struct udf_inode_info *ei = foo;

        ei->i_data = NULL;
        inode_init_once(&ei->vfs_inode);
}

/*
 * Create the slab cache that backs struct udf_inode_info objects, with
 * init_once() as its per-object callback.
 *
 * Returns 0 on success or -ENOMEM when the cache cannot be created.
 */
static int __init init_inodecache(void)
{
        udf_inode_cachep = kmem_cache_create("udf_inode_cache",
                                             sizeof(struct udf_inode_info),
                                             0,
                                             SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
                                             init_once);

        return udf_inode_cachep ? 0 : -ENOMEM;
}

/*
 * Tear down udf_inode_cachep, the cache created by init_inodecache().
 */
static void destroy_inodecache(void)
{
        /*
         * Make sure all delayed rcu free inodes are flushed before we
         * destroy cache.
         */
        rcu_barrier();
        kmem_cache_destroy(udf_inode_cachep);
}

/*
 * Superblock operations.
 *
 * NOTE(review): presumably installed as sb->s_op by udf_fill_super(), which
 * is outside this part of the file - confirm there.
 */
static const struct super_operations udf_sb_ops = {
        .alloc_inode        = udf_alloc_inode,
        .free_inode        = udf_free_in_core_inode,
        .write_inode        = udf_write_inode,
        .evict_inode        = udf_evict_inode,
        .put_super        = udf_put_super,
        .sync_fs        = udf_sync_fs,
        .statfs                = udf_statfs,
        .show_options        = udf_show_options,
};

/*
 * Option values collected for a mount or remount.  An instance is allocated
 * zeroed by udf_init_fs_context(), seeded by udf_init_options() and kept in
 * fc->fs_private until udf_free_fc() releases it.
 */
struct udf_options {
        unsigned int blocksize;        /* not touched by udf_init_options() */
        unsigned int session;        /* 0xFFFFFFFF for a fresh mount */
        unsigned int lastblock;        /* not touched by udf_init_options() */
        unsigned int anchor;        /* not touched by udf_init_options() */
        unsigned int flags;        /* bit mask built from (1 << UDF_FLAG_*) */
        umode_t umask;                /* 0 for a fresh mount */
        kgid_t gid;                /* overflowgid for a fresh mount */
        kuid_t uid;                /* overflowuid for a fresh mount */
        umode_t fmode;                /* UDF_INVALID_MODE for a fresh mount */
        umode_t dmode;                /* UDF_INVALID_MODE for a fresh mount */
        struct nls_table *nls_map;        /* dropped with unload_nls() in udf_free_fc() */
};

/*
 * Seed @uopt before any mount parameter is parsed.
 *
 * UDF has historically kept the previous mount options across a remount, so
 * for a reconfigure the current values are copied from the live superblock;
 * for any other purpose the initial mount defaults are filled in.
 */
static void udf_init_options(struct fs_context *fc, struct udf_options *uopt)
{
        struct udf_sb_info *sbi;

        if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) {
                uopt->flags = (1 << UDF_FLAG_USE_AD_IN_ICB) |
                              (1 << UDF_FLAG_STRICT);
                /*
                 * overflow[ug]id is what gets used by default when the
                 * UDF inode [ug]id == -1
                 */
                uopt->uid = make_kuid(current_user_ns(), overflowuid);
                uopt->gid = make_kgid(current_user_ns(), overflowgid);
                uopt->umask = 0;
                uopt->fmode = UDF_INVALID_MODE;
                uopt->dmode = UDF_INVALID_MODE;
                uopt->nls_map = NULL;
                uopt->session = 0xFFFFFFFF;
                return;
        }

        /* Remount: start from what the mounted superblock uses now. */
        sbi = UDF_SB(fc->root->d_sb);

        uopt->flags = sbi->s_flags;
        uopt->uid = sbi->s_uid;
        uopt->gid = sbi->s_gid;
        uopt->umask = sbi->s_umask;
        uopt->fmode = sbi->s_fmode;
        uopt->dmode = sbi->s_dmode;
        uopt->nls_map = NULL;
}

static int udf_init_fs_context(struct fs_context *fc)
{
        struct udf_options *uopt;

        uopt = kzalloc(sizeof(*uopt), GFP_KERNEL);
        if (!uopt)
                return -ENOMEM;

        udf_init_options(fc, uopt);

        fc->fs_private = uopt;
        fc->ops = &udf_context_ops;

        return 0;
}

/* Release the option block attached to @fc, including any loaded NLS table. */
static void udf_free_fc(struct fs_context *fc)
{
        struct udf_options *opts = fc->fs_private;

        unload_nls(opts->nls_map);
        kfree(opts);
}

/*
 * Module init: create the inode cache, then register the filesystem type.
 * The inode cache is torn down again if registration fails.
 */
static int __init init_udf_fs(void)
{
        int ret = init_inodecache();

        if (ret)
                return ret;

        ret = register_filesystem(&udf_fstype);
        if (ret)
                destroy_inodecache();

        return ret;
}

/* Module exit: undo init_udf_fs() in reverse order. */
static void __exit exit_udf_fs(void)
{
        unregister_filesystem(&udf_fstype);
        destroy_inodecache();
}

/*
 * Allocate a zeroed array of @count partition maps for @sb and record the
 * count. On allocation failure the count is reset to 0 and -ENOMEM is
 * returned.
 */
static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count)
{
        struct udf_sb_info *sbi = UDF_SB(sb);

        sbi->s_partmaps = kcalloc(count, sizeof(*sbi->s_partmaps), GFP_KERNEL);
        if (sbi->s_partmaps) {
                sbi->s_partitions = count;
                return 0;
        }

        sbi->s_partitions = 0;
        return -ENOMEM;
}

/* Drop every buffer head held by @bitmap, then free the bitmap itself. */
static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
{
        const int total = bitmap->s_nr_groups;
        int group = 0;

        while (group < total) {
                brelse(bitmap->s_block_bitmap[group]);
                group++;
        }

        kvfree(bitmap);
}

/*
 * Release everything a single partition map holds: the unallocated-space
 * table inode or bitmap (as indicated by the partition flags), the sparing
 * table buffers of a sparable partition, or the file entry inodes of a
 * metadata partition.
 */
static void udf_free_partition(struct udf_part_map *map)
{
        if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
                iput(map->s_uspace.s_table);
        if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
                udf_sb_free_bitmap(map->s_uspace.s_bitmap);

        if (map->s_partition_type == UDF_SPARABLE_MAP15) {
                int slot;

                for (slot = 0; slot < 4; slot++)
                        brelse(map->s_type_specific.s_sparing.s_spar_map[slot]);
        } else if (map->s_partition_type == UDF_METADATA_MAP25) {
                struct udf_meta_data *md = &map->s_type_specific.s_metadata;

                /* Clear each pointer once its inode reference is dropped. */
                iput(md->s_metadata_fe);
                md->s_metadata_fe = NULL;

                iput(md->s_mirror_fe);
                md->s_mirror_fe = NULL;

                iput(md->s_bitmap_fe);
                md->s_bitmap_fe = NULL;
        }
}

/*
 * Free all partition maps of @sb along with the array holding them.
 * Does nothing when no array has been allocated.
 */
static void udf_sb_free_partitions(struct super_block *sb)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        int idx;

        if (sbi->s_partmaps) {
                for (idx = 0; idx < sbi->s_partitions; idx++)
                        udf_free_partition(&sbi->s_partmaps[idx]);

                kfree(sbi->s_partmaps);
                sbi->s_partmaps = NULL;
        }
}

/*
 * Write the currently active UDF mount options of @root's superblock to
 * @seq as a comma-prefixed list. Always returns 0.
 */
static int udf_show_options(struct seq_file *seq, struct dentry *root)
{
        struct super_block *sb = root->d_sb;
        struct udf_sb_info *info = UDF_SB(sb);

        /* Boolean-style options */
        if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT))
                seq_puts(seq, ",nostrict");
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_BLOCKSIZE_SET))
                seq_printf(seq, ",bs=%lu", sb->s_blocksize);
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
                seq_puts(seq, ",unhide");
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE))
                seq_puts(seq, ",undelete");
        if (!UDF_QUERY_FLAG(sb, UDF_FLAG_USE_AD_IN_ICB))
                seq_puts(seq, ",noadinicb");
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_USE_SHORT_AD))
                seq_puts(seq, ",shortad");

        /* Ownership */
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_FORGET))
                seq_puts(seq, ",uid=forget");
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_FORGET))
                seq_puts(seq, ",gid=forget");
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET))
                seq_printf(seq, ",uid=%u",
                           from_kuid(&init_user_ns, info->s_uid));
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET))
                seq_printf(seq, ",gid=%u",
                           from_kgid(&init_user_ns, info->s_gid));

        /* Permission bits, printed in octal */
        if (info->s_umask != 0)
                seq_printf(seq, ",umask=%ho", info->s_umask);
        if (info->s_fmode != UDF_INVALID_MODE)
                seq_printf(seq, ",mode=%ho", info->s_fmode);
        if (info->s_dmode != UDF_INVALID_MODE)
                seq_printf(seq, ",dmode=%ho", info->s_dmode);

        /* Layout overrides */
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_SESSION_SET))
                seq_printf(seq, ",session=%d", info->s_session);
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_LASTBLOCK_SET))
                seq_printf(seq, ",lastblock=%u", info->s_last_block);
        if (info->s_anchor != 0)
                seq_printf(seq, ",anchor=%u", info->s_anchor);

        /* Character set: no NLS table loaded is reported as utf8 */
        if (info->s_nls_map)
                seq_printf(seq, ",iocharset=%s", info->s_nls_map->charset);
        else
                seq_puts(seq, ",iocharset=utf8");

        return 0;
}

/*
 * udf_parse_param
 *
 * PURPOSE
 *        Parse mount options.
 *
 * DESCRIPTION
 *        The following mount options are supported:
 *
 *        gid=                Set the default group.
 *        umask=                Set the default umask.
 *        mode=                Set the default file permissions.
 *        dmode=                Set the default directory permissions.
 *        uid=                Set the default user.
 *        bs=                Set the block size.
 *        unhide                Show otherwise hidden files.
 *        undelete        Show deleted files in lists.
 *        adinicb                Embed data in the inode (default)
 *        noadinicb        Don't embed data in the inode
 *        shortad                Use short ad's
 *        longad                Use long ad's (default)
 *        nostrict        Unset strict conformance
 *        iocharset=        Set the NLS character set
 *
 *        The remaining are for debugging and disaster recovery:
 *
 *        novrs                Skip volume sequence recognition
 *
 *        The following expect an offset from 0.
 *
 *        session=        Set the CDROM session (default= last session)
 *        anchor=                Override standard anchor location. (default= 256)
 *        volume=                Override the VolumeDesc location. (unused)
 *        partition=        Override the PartitionDesc location. (unused)
 *        lastblock=        Set the last block of the filesystem.
 *
 *        The following expect an offset from the partition root.
 *
 *        fileset=        Override the fileset block location. (unused)
 *        rootdir=        Override the root directory location. (unused)
 *                WARNING: overriding the rootdir to a non-directory may
 *                yield highly unpredictable results.
 *
 * PRE-CONDITIONS
 *        fc                fs_context with pointer to mount options variable.
 *        param                Pointer to fs_parameter being parsed.
 *
 * POST-CONDITIONS
 *        <return>        0        Mount options parsed okay.
 *        <return>        errno        Error parsing mount options.
 *
 * HISTORY
 *        July 1, 1997 - Andrew E. Mileski
 *        Written, tested, and released.
 */

/*
 * Tokens identifying each mount parameter. The numeric values follow
 * declaration order, so the order below must not change.
 */
enum {
        Opt_novrs,
        Opt_nostrict,
        Opt_bs,
        Opt_unhide,
        Opt_undelete,
        Opt_noadinicb,
        Opt_adinicb,
        Opt_shortad,
        Opt_longad,
        Opt_gid,
        Opt_uid,
        Opt_umask,
        Opt_session,
        Opt_lastblock,
        Opt_anchor,
        Opt_volume,
        Opt_partition,
        Opt_fileset,
        Opt_rootdir,
        Opt_utf8,
        Opt_iocharset,
        Opt_err,
        Opt_fmode,
        Opt_dmode
};

/*
 * Parameter table consumed by fs_parse().
 *
 * umask, mode and dmode are permission bits: udf_show_options() prints
 * them in octal without a prefix ("%ho"), so they are parsed as octal here
 * as well. Parsing them as a generic u32 would read "umask=22" as decimal
 * 22 and make the displayed options not round-trip.
 */
static const struct fs_parameter_spec udf_param_spec[] = {
        fsparam_flag        ("novrs",                Opt_novrs),
        fsparam_flag        ("nostrict",                Opt_nostrict),
        fsparam_u32        ("bs",                        Opt_bs),
        fsparam_flag        ("unhide",                Opt_unhide),
        fsparam_flag        ("undelete",                Opt_undelete),
        fsparam_flag_no        ("adinicb",                Opt_adinicb),
        fsparam_flag        ("shortad",                Opt_shortad),
        fsparam_flag        ("longad",                Opt_longad),
        fsparam_string        ("gid",                        Opt_gid),
        fsparam_string        ("uid",                        Opt_uid),
        fsparam_u32oct        ("umask",                Opt_umask),
        fsparam_u32        ("session",                Opt_session),
        fsparam_u32        ("lastblock",                Opt_lastblock),
        fsparam_u32        ("anchor",                Opt_anchor),
        fsparam_u32        ("volume",                Opt_volume),
        fsparam_u32        ("partition",                Opt_partition),
        fsparam_u32        ("fileset",                Opt_fileset),
        fsparam_u32        ("rootdir",                Opt_rootdir),
        fsparam_flag        ("utf8",                Opt_utf8),
        fsparam_string        ("iocharset",                Opt_iocharset),
        fsparam_u32oct        ("mode",                Opt_fmode),
        fsparam_u32oct        ("dmode",                Opt_dmode),
        {}
 };

static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        unsigned int uv;
        unsigned int n;
        struct udf_options *uopt = fc->fs_private;
        struct fs_parse_result result;
        int token;
        bool remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE);

        token = fs_parse(fc, udf_param_spec, param, &result);
        if (token < 0)
                return token;

        switch (token) {
        case Opt_novrs:
                uopt->flags |= (1 << UDF_FLAG_NOVRS);
                break;
        case Opt_bs:
                n = result.uint_32;
                if (n != 512 && n != 1024 && n != 2048 && n != 4096)
                        return -EINVAL;
                uopt->blocksize = n;
                uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
                break;
        case Opt_unhide:
                uopt->flags |= (1 << UDF_FLAG_UNHIDE);
                break;
        case Opt_undelete:
                uopt->flags |= (1 << UDF_FLAG_UNDELETE);
                break;
        case Opt_adinicb:
                if (result.negated)
                        uopt->flags &= ~(1 << UDF_FLAG_USE_AD_IN_ICB);
                else
                        uopt->flags |= (1 << UDF_FLAG_USE_AD_IN_ICB);
                break;
        case Opt_shortad:
                uopt->flags |= (1 << UDF_FLAG_USE_SHORT_AD);
                break;
        case Opt_longad:
                uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD);
                break;
        case Opt_gid:
                if (kstrtoint(param->string, 10, &uv) == 0) {
                        kgid_t gid = make_kgid(current_user_ns(), uv);
                        if (!gid_valid(gid))
                                return -EINVAL;
                        uopt->gid = gid;
                        uopt->flags |= (1 << UDF_FLAG_GID_SET);
                } else if (!strcmp(param->string, "forget")) {
                        uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
                } else if (!strcmp(param->string, "ignore")) {
                        /* this option is superseded by gid=<number> */
                        ;
                } else {
                        return -EINVAL;
                }
                break;
        case Opt_uid:
                if (kstrtoint(param->string, 10, &uv) == 0) {
                        kuid_t uid = make_kuid(current_user_ns(), uv);
                        if (!uid_valid(uid))
                                return -EINVAL;
                        uopt->uid = uid;
                        uopt->flags |= (1 << UDF_FLAG_UID_SET);
                } else if (!strcmp(param->string, "forget")) {
                        uopt->flags |= (1 << UDF_FLAG_UID_FORGET);
                } else if (!strcmp(param->string, "ignore")) {
                        /* this option is superseded by uid=<number> */
                        ;
                } else {
                        return -EINVAL;
                }
                break;
        case Opt_umask:
                uopt->umask = result.uint_32;
                break;
        case Opt_nostrict:
                uopt->flags &= ~(1 << UDF_FLAG_STRICT);
                break;
        case Opt_session:
                uopt->session = result.uint_32;
                if (!remount)
                        uopt->flags |= (1 << UDF_FLAG_SESSION_SET);
                break;
        case Opt_lastblock:
                uopt->lastblock = result.uint_32;
                if (!remount)
                        uopt->flags |= (1 << UDF_FLAG_LASTBLOCK_SET);
                break;
        case Opt_anchor:
                uopt->anchor = result.uint_32;
                break;
        case Opt_volume:
        case Opt_partition:
        case Opt_fileset:
        case Opt_rootdir:
                /* Ignored (never implemented properly) */
                break;
        case Opt_utf8:
                if (!remount) {
                        unload_nls(uopt->nls_map);
                        uopt->nls_map = NULL;
                }
                break;
        case Opt_iocharset:
                if (!remount) {
                        unload_nls(uopt->nls_map);
                        uopt->nls_map = NULL;
                }
                /* When nls_map is not loaded then UTF-8 is used */
                if (!remount && strcmp(param->string, "utf8") != 0) {
                        uopt->nls_map = load_nls(param->string);
                        if (!uopt->nls_map) {
                                errorf(fc, "iocharset %s not found",
                                        param->string);
                                return -EINVAL;
                        }
                }
                break;
        case Opt_fmode:
                uopt->fmode = result.uint_32 & 0777;
                break;
        case Opt_dmode:
                uopt->dmode = result.uint_32 & 0777;
                break;
        default:
                return -EINVAL;
        }
        return 0;
}

/*
 * Apply a remount: refuse a read-write request on a filesystem flagged
 * UDF_FLAG_RW_INCOMPAT, sync, copy the parsed options into the superblock
 * info under s_cred_lock, and close or open the LVID when the read-only
 * state changes. Returns 0 or -EACCES.
 */
static int udf_reconfigure(struct fs_context *fc)
{
        struct udf_options *opts = fc->fs_private;
        struct super_block *sb = fc->root->d_sb;
        struct udf_sb_info *sbi = UDF_SB(sb);
        int want_ro = fc->sb_flags & SB_RDONLY;

        if (!want_ro && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
                return -EACCES;

        sync_filesystem(sb);

        write_lock(&sbi->s_cred_lock);
        sbi->s_flags = opts->flags;
        sbi->s_uid = opts->uid;
        sbi->s_gid = opts->gid;
        sbi->s_umask = opts->umask;
        sbi->s_fmode = opts->fmode;
        sbi->s_dmode = opts->dmode;
        write_unlock(&sbi->s_cred_lock);

        /* Nothing more to do unless the read-only state actually flips. */
        if (want_ro == sb_rdonly(sb))
                return 0;

        if (want_ro)
                udf_close_lvid(sb);
        else
                udf_open_lvid(sb);

        return 0;
}

/*
 * Classify one volume structure descriptor by its standard identifier.
 *
 * Returns 1 for an NSR02/NSR03 descriptor, 0 for a recognised descriptor
 * that is of no further interest (CD001, BEA01, BOOT2, CDW02), and -1 for
 * anything else (TEA01 or an invalid id), which marks the end of the
 * volume recognition area.
 */
static int identify_vsd(const struct volStructDesc *vsd)
{
        if (memcmp(vsd->stdIdent, VSD_STD_ID_CD001, VSD_STD_ID_LEN) == 0) {
                /* ISO9660 descriptor: only report which kind was seen. */
                switch (vsd->structType) {
                case 0:
                        udf_debug("ISO9660 Boot Record found\n");
                        break;
                case 1:
                        udf_debug("ISO9660 Primary Volume Descriptor found\n");
                        break;
                case 2:
                        udf_debug("ISO9660 Supplementary Volume Descriptor found\n");
                        break;
                case 3:
                        udf_debug("ISO9660 Volume Partition Descriptor found\n");
                        break;
                case 255:
                        udf_debug("ISO9660 Volume Descriptor Set Terminator found\n");
                        break;
                default:
                        udf_debug("ISO9660 VRS (%u) found\n", vsd->structType);
                        break;
                }
                return 0;
        }

        if (memcmp(vsd->stdIdent, VSD_STD_ID_BEA01, VSD_STD_ID_LEN) == 0)
                return 0;

        if (memcmp(vsd->stdIdent, VSD_STD_ID_NSR02, VSD_STD_ID_LEN) == 0 ||
            memcmp(vsd->stdIdent, VSD_STD_ID_NSR03, VSD_STD_ID_LEN) == 0)
                return 1;

        if (memcmp(vsd->stdIdent, VSD_STD_ID_BOOT2, VSD_STD_ID_LEN) == 0 ||
            memcmp(vsd->stdIdent, VSD_STD_ID_CDW02, VSD_STD_ID_LEN) == 0)
                return 0;

        /* TEA01 or invalid id : end of volume recognition area */
        return -1;
}

/*
 * Check Volume Structure Descriptors (ECMA 167 2/9.1)
 * We also check any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1)
 *
 * The scan starts at VSD_FIRST_SECTOR_OFFSET bytes past the start of the
 * session (sbi->s_session blocks) and advances one descriptor at a time.
 *
 * @return   1 if NSR02 or NSR03 found,
 *            -1 if first sector read error, 0 otherwise
 */
static int udf_check_vsd(struct super_block *sb)
{
        struct volStructDesc *vsd = NULL;
        loff_t sector = VSD_FIRST_SECTOR_OFFSET;
        int sectorsize;
        struct buffer_head *bh = NULL;
        int nsr = 0;
        struct udf_sb_info *sbi;
        loff_t session_offset;

        sbi = UDF_SB(sb);
        /* Step by at least one descriptor, even when blocks are smaller. */
        if (sb->s_blocksize < sizeof(struct volStructDesc))
                sectorsize = sizeof(struct volStructDesc);
        else
                sectorsize = sb->s_blocksize;

        /* "sector" is a byte offset; convert the session start to bytes. */
        session_offset = (loff_t)sbi->s_session << sb->s_blocksize_bits;
        sector += session_offset;

        udf_debug("Starting at sector %u (%lu byte sectors)\n",
                  (unsigned int)(sector >> sb->s_blocksize_bits),
                  sb->s_blocksize);
        /*
         * Process the sequence (if applicable). The hard limit on the sector
         * offset is arbitrary, hopefully large enough that all valid UDF
         * filesystems will be recognised. The standard does not mention an
         * upper bound on the size of the volume recognition area.
         * The limit prevents the code from reading all the sectors of a
         * specially crafted image (like a bluray disc full of CD001 sectors),
         * potentially causing minutes or even hours of uninterruptible I/O
         * activity. This actually happened with uninitialised SSD partitions
         * (all 0xFF) before the check for the limit and all valid IDs were
         * added.
         */
        for (; !nsr && sector < VSD_MAX_SECTOR_OFFSET; sector += sectorsize) {
                /* Read a block */
                bh = sb_bread(sb, sector >> sb->s_blocksize_bits);
                if (!bh)
                        break;

                /* Locate the descriptor at its byte offset inside the block. */
                vsd = (struct volStructDesc *)(bh->b_data +
                                              (sector & (sb->s_blocksize - 1)));
                nsr = identify_vsd(vsd);
                /* Found NSR or end? */
                if (nsr) {
                        brelse(bh);
                        break;
                }
                /*
                 * Special handling for improperly formatted VRS (e.g., Win10)
                 * where components are separated by 2048 bytes even though
                 * sectors are 4K
                 */
                if (sb->s_blocksize == 4096) {
                        nsr = identify_vsd(vsd + 1);
                        /* Ignore unknown IDs... */
                        if (nsr < 0)
                                nsr = 0;
                }
                brelse(bh);
        }

        /*
         * bh is only compared against NULL below: a NULL bh with sector
         * still at its starting value means the very first read failed.
         */
        if (nsr > 0)
                return 1;
        else if (!bh && sector - session_offset == VSD_FIRST_SECTOR_OFFSET)
                return -1;
        else
                return 0;
}

/*
 * Check that the domain identifier @ident of the descriptor named @dname
 * allows writing. If it is not OSTA UDF compliant, is marked dirty, or
 * carries a write-protect flag, the filesystem can only be used read-only:
 * a read-write mount fails with -EACCES, a read-only mount is flagged
 * UDF_FLAG_RW_INCOMPAT. Returns 0 otherwise.
 */
static int udf_verify_domain_identifier(struct super_block *sb,
                                        struct regid *ident, char *dname)
{
        struct domainIdentSuffix *suffix;

        if (memcmp(ident->ident, UDF_ID_COMPLIANT, strlen(UDF_ID_COMPLIANT))) {
                udf_warn(sb, "Not OSTA UDF compliant %s descriptor.\n", dname);
        } else if (ident->flags & ENTITYID_FLAGS_DIRTY) {
                udf_warn(sb, "Possibly not OSTA UDF compliant %s descriptor.\n",
                         dname);
        } else {
                suffix = (struct domainIdentSuffix *)ident->identSuffix;
                if (!(suffix->domainFlags & (DOMAIN_FLAGS_HARD_WRITE_PROTECT |
                                             DOMAIN_FLAGS_SOFT_WRITE_PROTECT)))
                        return 0;

                if (!sb_rdonly(sb)) {
                        udf_warn(sb, "Descriptor for %s marked write protected."
                                 " Forcing read only mount.\n", dname);
                }
        }

        /* Writing is not possible: only a read-only mount may proceed. */
        if (!sb_rdonly(sb))
                return -EACCES;

        UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
        return 0;
}

/*
 * Validate the domain identifier of file set descriptor @fset, then store
 * the root directory location in @root and the tag serial number in the
 * superblock info. Returns 0 or the negative error from the domain check.
 */
static int udf_load_fileset(struct super_block *sb, struct fileSetDesc *fset,
                            struct kernel_lb_addr *root)
{
        int err = udf_verify_domain_identifier(sb, &fset->domainIdent,
                                               "file set");

        if (err < 0)
                return err;

        UDF_SB(sb)->s_serial_number = le16_to_cpu(fset->descTag.tagSerialNum);
        *root = lelb_to_cpu(fset->rootDirectoryICB.extLocation);

        udf_debug("Rootdir at block=%u, partition=%u\n",
                  root->logicalBlockNum, root->partitionReferenceNum);
        return 0;
}

/*
 * Read the file set descriptor at @fileset and load it, filling in @root.
 *
 * Returns -EINVAL when @fileset holds the all-ones "unset" address or the
 * block read is not a file set descriptor, -EIO when the block cannot be
 * read, otherwise the result of udf_load_fileset().
 */
static int udf_find_fileset(struct super_block *sb,
                            struct kernel_lb_addr *fileset,
                            struct kernel_lb_addr *root)
{
        struct buffer_head *bh;
        uint16_t ident;
        int ret = -EINVAL;

        if (fileset->logicalBlockNum == 0xFFFFFFFF &&
            fileset->partitionReferenceNum == 0xFFFF)
                return -EINVAL;

        bh = udf_read_ptagged(sb, fileset, 0, &ident);
        if (!bh)
                return -EIO;

        if (ident == TAG_IDENT_FSD) {
                udf_debug("Fileset at block=%u, partition=%u\n",
                          fileset->logicalBlockNum,
                          fileset->partitionReferenceNum);

                UDF_SB(sb)->s_partition = fileset->partitionReferenceNum;
                ret = udf_load_fileset(sb, (struct fileSetDesc *)bh->b_data,
                                       root);
        }

        brelse(bh);
        return ret;
}

/*
 * Load the Primary Volume Descriptor found at @block: record its
 * recording time and volume identifier in the superblock info.
 *
 * Returns 0 on success and <0 on error. -EAGAIN (the descriptor block
 * could not be read) has the special meaning that the next sequence
 * should be tried; a block with the wrong tag yields -EIO.
 */
static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
{
        struct primaryVolDesc *pvd;
        struct buffer_head *bh;
        struct timestamp *stamp;
        uint8_t *namebuf;
        uint16_t ident;
        int len;
        int ret;

        namebuf = kzalloc(128, GFP_KERNEL);
        if (!namebuf)
                return -ENOMEM;

        bh = udf_read_tagged(sb, block, block, &ident);
        if (!bh) {
                ret = -EAGAIN;
                goto free_name;
        }

        if (ident != TAG_IDENT_PVD) {
                ret = -EIO;
                goto release_bh;
        }

        pvd = (struct primaryVolDesc *)bh->b_data;

        udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
                              pvd->recordingDateAndTime);
        stamp = &pvd->recordingDateAndTime;
        udf_debug("recording time %04u/%02u/%02u %02u:%02u (%x)\n",
                  le16_to_cpu(stamp->year), stamp->month, stamp->day,
                  stamp->hour, stamp->minute,
                  le16_to_cpu(stamp->typeAndTimezone));

        /* Volume identifier; fall back to a fixed name if it is invalid. */
        len = udf_dstrCS0toChar(sb, namebuf, 31, pvd->volIdent, 32);
        if (len >= 0) {
                strscpy_pad(UDF_SB(sb)->s_volume_ident, namebuf);
        } else {
                strscpy_pad(UDF_SB(sb)->s_volume_ident, "InvalidName");
                pr_warn("incorrect volume identification, setting to "
                        "'InvalidName'\n");
        }
        udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);

        /* The volume set identifier is only logged; failure is not fatal. */
        len = udf_dstrCS0toChar(sb, namebuf, 127, pvd->volSetIdent, 128);
        if (len >= 0) {
                namebuf[len] = 0;
                udf_debug("volSetIdent[] = '%s'\n", namebuf);
        }
        ret = 0;

release_bh:
        brelse(bh);
free_name:
        kfree(namebuf);
        return ret;
}

/*
 * Look up the metadata file inode at block @meta_file_loc of partition
 * @partition_ref. Returns the inode, the ERR_PTR from the lookup, or
 * ERR_PTR(-EIO) when the inode does not use short allocation descriptors.
 */
struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
                                        u32 meta_file_loc, u32 partition_ref)
{
        struct kernel_lb_addr loc = {
                .logicalBlockNum = meta_file_loc,
                .partitionReferenceNum = partition_ref,
        };
        struct inode *inode = udf_iget_special(sb, &loc);

        if (IS_ERR(inode)) {
                udf_warn(sb, "metadata inode efe not found\n");
                return inode;
        }

        if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
                return inode;

        udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n");
        iput(inode);
        return ERR_PTR(-EIO);
}

/*
 * Load the file entry inodes of metadata partition @partition, which is
 * backed by physical partition @type1_index.
 *
 * The main metadata file is tried first and the mirror only if that fails.
 * Failing both is an error. The bitmap file is loaded only when its
 * location is set, and its absence is tolerated on a read-only mount.
 * Returns 0 or a negative errno.
 */
static int udf_load_metadata_files(struct super_block *sb, int partition,
                                   int type1_index)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_part_map *map = &sbi->s_partmaps[partition];
        struct udf_meta_data *mdata = &map->s_type_specific.s_metadata;
        struct kernel_lb_addr addr;
        struct inode *inode;

        mdata->s_phys_partition_ref = type1_index;

        /* metadata address */
        udf_debug("Metadata file location: block = %u part = %u\n",
                  mdata->s_meta_file_loc, mdata->s_phys_partition_ref);

        inode = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc,
                                            mdata->s_phys_partition_ref);
        if (!IS_ERR(inode)) {
                mdata->s_metadata_fe = inode;
        } else {
                /* mirror file entry */
                udf_debug("Mirror metadata file location: block = %u part = %u\n",
                          mdata->s_mirror_file_loc, mdata->s_phys_partition_ref);

                inode = udf_find_metadata_inode_efe(sb,
                                                    mdata->s_mirror_file_loc,
                                                    mdata->s_phys_partition_ref);
                if (IS_ERR(inode)) {
                        udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
                        return PTR_ERR(inode);
                }
                mdata->s_mirror_fe = inode;
        }

        /*
         * bitmap file entry
         * Note:
         * Load only if bitmap file location differs from 0xFFFFFFFF (DCN-5102)
         */
        if (mdata->s_bitmap_file_loc != 0xFFFFFFFF) {
                addr.logicalBlockNum = mdata->s_bitmap_file_loc;
                addr.partitionReferenceNum = mdata->s_phys_partition_ref;

                udf_debug("Bitmap file location: block = %u part = %u\n",
                          addr.logicalBlockNum, addr.partitionReferenceNum);

                inode = udf_iget_special(sb, &addr);
                if (!IS_ERR(inode)) {
                        mdata->s_bitmap_fe = inode;
                } else if (sb_rdonly(sb)) {
                        udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
                } else {
                        udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
                        return PTR_ERR(inode);
                }
        }

        udf_debug("udf_load_metadata_files Ok\n");
        return 0;
}

/*
 * Number of block-sized groups needed to hold one bit per block of
 * partition @partition plus the space bitmap descriptor header.
 */
int udf_compute_nr_groups(struct super_block *sb, u32 partition)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_part_map *map = &sbi->s_partmaps[partition];

        return DIV_ROUND_UP(map->s_partition_len +
                            (sizeof(struct spaceBitmapDesc) * 8),
                            sb->s_blocksize << 3);
}

/*
 * Allocate a zeroed bitmap with one buffer slot per group of partition
 * @index. Returns NULL on allocation failure.
 */
static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
{
        int groups = udf_compute_nr_groups(sb, index);
        struct udf_bitmap *bm;

        bm = kvzalloc(struct_size(bm, s_block_bitmap, groups), GFP_KERNEL);
        if (bm)
                bm->s_nr_groups = groups;

        return bm;
}

static int check_partition_desc(struct super_block *sb,
                                struct partitionDesc *p,
                                struct udf_part_map *map)
{
        bool umap, utable, fmap, ftable;
        struct partitionHeaderDesc *phd;

        switch (le32_to_cpu(p->accessType)) {
        case PD_ACCESS_TYPE_READ_ONLY:
        case PD_ACCESS_TYPE_WRITE_ONCE:
        case PD_ACCESS_TYPE_NONE:
                goto force_ro;
        }

        /* No Partition Header Descriptor? */
        if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
            strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
                goto force_ro;

        phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
        utable = phd->unallocSpaceTable.extLength;
        umap = phd->unallocSpaceBitmap.extLength;
        ftable = phd->freedSpaceTable.extLength;
        fmap = phd->freedSpaceBitmap.extLength;

        /* No allocation info? */
        if (!utable && !umap && !ftable && !fmap)
                goto force_ro;

        /* We don't support blocks that require erasing before overwrite */
        if (ftable || fmap)
                goto force_ro;
        /* UDF 2.60: 2.3.3 - no mixing of tables & bitmaps, no VAT. */
        if (utable && umap)
                goto force_ro;

        if (map->s_partition_type == UDF_VIRTUAL_MAP15 ||
            map->s_partition_type == UDF_VIRTUAL_MAP20 ||
            map->s_partition_type == UDF_METADATA_MAP25)
                goto force_ro;

        return 0;
force_ro:
        if (!sb_rdonly(sb))
                return -EACCES;
        UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
        return 0;
}

static int udf_fill_partdesc_info(struct super_block *sb,
                struct partitionDesc *p, int p_index)
{
        struct udf_part_map *map;
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct partitionHeaderDesc *phd;
        int err;

        map = &sbi->s_partmaps[p_index];

        map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */
        map->s_partition_root = le32_to_cpu(p->partitionStartingLocation);

        if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY))
                map->s_partition_flags |= UDF_PART_FLAG_READ_ONLY;
        if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE))
                map->s_partition_flags |= UDF_PART_FLAG_WRITE_ONCE;
        if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE))
                map->s_partition_flags |= UDF_PART_FLAG_REWRITABLE;
        if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
                map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE;

        udf_debug("Partition (%d type %x) starts at physical %u, block length %u\n",
                  p_index, map->s_partition_type,
                  map->s_partition_root, map->s_partition_len);

        err = check_partition_desc(sb, p, map);
        if (err)
                return err;

        /*
         * Skip loading allocation info it we cannot ever write to the fs.
         * This is a correctness thing as we may have decided to force ro mount
         * to avoid allocation info we don't support.
         */
        if (UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
                return 0;

        phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
        if (phd->unallocSpaceTable.extLength) {
                struct kernel_lb_addr loc = {
                        .logicalBlockNum = le32_to_cpu(
                                phd->unallocSpaceTable.extPosition),
                        .partitionReferenceNum = p_index,
                };
                struct inode *inode;

                inode = udf_iget_special(sb, &loc);
                if (IS_ERR(inode)) {
                        udf_debug("cannot load unallocSpaceTable (part %d)\n",
                                  p_index);
                        return PTR_ERR(inode);
                }
                map->s_uspace.s_table = inode;
                map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
                udf_debug("unallocSpaceTable (part %d) @ %lu\n",
                          p_index, map->s_uspace.s_table->i_ino);
        }

        if (phd->unallocSpaceBitmap.extLength) {
                struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
                if (!bitmap)
                        return -ENOMEM;
                map->s_uspace.s_bitmap = bitmap;
                bitmap->s_extPosition = le32_to_cpu(
                                phd->unallocSpaceBitmap.extPosition);
                map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
                udf_debug("unallocSpaceBitmap (part %d) @ %u\n",
                          p_index, bitmap->s_extPosition);
        }

        return 0;
}

/*
 * Look for the VAT inode at @start_block and, failing that, in up to three
 * blocks immediately preceding it, never going below the first block of
 * partition @p_index.
 *
 * @type1_index is used as the partition reference of the probed location.
 * On success the inode is stored in sbi->s_vat_inode; when nothing is found
 * sbi->s_vat_inode is left untouched.
 */
static void udf_find_vat_block(struct super_block *sb, int p_index,
                               int type1_index, sector_t start_block)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_part_map *map = &sbi->s_partmaps[p_index];
        sector_t vat_block = start_block;
        struct kernel_lb_addr ino;
        struct inode *inode;
        int tries;

        /*
         * VAT file entry is in the last recorded block. Some broken disks have
         * it a few blocks before so try a bit harder...
         *
         * The attempts are counted explicitly instead of comparing against
         * "start_block - 3": sector_t is unsigned, so that bound wraps for
         * start_block < 3, and decrementing vat_block below zero would wrap
         * as well and keep the loop running.
         */
        ino.partitionReferenceNum = type1_index;
        for (tries = 0; tries < 4; tries++) {
                if (vat_block < map->s_partition_root)
                        break;
                ino.logicalBlockNum = vat_block - map->s_partition_root;
                inode = udf_iget_special(sb, &ino);
                if (!IS_ERR(inode)) {
                        sbi->s_vat_inode = inode;
                        break;
                }
                /* Block 0 was the last candidate; do not wrap around */
                if (!vat_block)
                        break;
                vat_block--;
        }
}

/*
 * Find the VAT inode of virtual partition @p_index and derive the start
 * offset and number of entries of the table from it. @type1_index is passed
 * on to udf_find_vat_block() as the partition to look in.
 *
 * Returns 0 on success, -EIO when no VAT inode could be loaded, or the error
 * reported while reading the first block of the VAT file.
 */
static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_part_map *map = &sbi->s_partmaps[p_index];
        sector_t blocks = sb_bdev_nr_blocks(sb);
        struct virtualAllocationTable20 *vat20;
        struct buffer_head *bh = NULL;
        struct udf_inode_info *vati;

        /* First guess: the last recorded block */
        udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);

        /* Second guess: the last block of the device, if that is different */
        if (!sbi->s_vat_inode && sbi->s_last_block != blocks - 1) {
                pr_notice("Failed to read VAT inode from the last recorded block (%lu), retrying with the last block of the device (%lu).\n",
                          (unsigned long)sbi->s_last_block,
                          (unsigned long)blocks - 1);
                udf_find_vat_block(sb, p_index, type1_index, blocks - 1);
        }
        if (!sbi->s_vat_inode)
                return -EIO;

        if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
                /*
                 * Entries start at offset 0; 36 bytes of the file size are
                 * not counted as 4-byte table entries.
                 */
                map->s_type_specific.s_virtual.s_start_offset = 0;
                map->s_type_specific.s_virtual.s_num_entries =
                        (sbi->s_vat_inode->i_size - 36) >> 2;
                return 0;
        }

        if (map->s_partition_type != UDF_VIRTUAL_MAP20)
                return 0;

        /* The header is either embedded in the inode or in its first block */
        vati = UDF_I(sbi->s_vat_inode);
        if (vati->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                vat20 = (struct virtualAllocationTable20 *)vati->i_data;
        } else {
                int err = 0;

                bh = udf_bread(sbi->s_vat_inode, 0, 0, &err);
                if (!bh)
                        return err ? err : -EFSCORRUPTED;
                vat20 = (struct virtualAllocationTable20 *)bh->b_data;
        }

        /* Entries follow the header whose length is recorded in the header */
        map->s_type_specific.s_virtual.s_start_offset =
                le16_to_cpu(vat20->lengthHeader);
        map->s_type_specific.s_virtual.s_num_entries =
                (sbi->s_vat_inode->i_size -
                 map->s_type_specific.s_virtual.s_start_offset) >> 2;

        /* bh is still NULL for the in-ICB case */
        brelse(bh);
        return 0;
}

/*
 * Load the partition descriptor stored in @block and attach it to the
 * matching entries of the partition map table.
 *
 * Returns <0 on error, 0 on success (including the case where the block is
 * not a partition descriptor or no map refers to it). -EAGAIN is special -
 * the block could not be read, try the next descriptor sequence.
 */
static int udf_load_partdesc(struct super_block *sb, sector_t block)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct udf_part_map *map = NULL; /* suppress 'maybe used uninitialized' warning */
        struct partitionDesc *p;
        struct buffer_head *bh;
        uint16_t partitionNumber;
        uint16_t ident;
        int type1_idx;
        int ret = 0;
        int i;

        bh = udf_read_tagged(sb, block, block, &ident);
        if (!bh)
                return -EAGAIN;
        /* Some other descriptor: nothing to do, report success */
        if (ident != TAG_IDENT_PD)
                goto out_bh;

        p = (struct partitionDesc *)bh->b_data;
        partitionNumber = le16_to_cpu(p->partitionNumber);

        /* First scan for TYPE1 and SPARABLE partitions */
        for (i = 0; i < sbi->s_partitions; i++) {
                map = &sbi->s_partmaps[i];
                udf_debug("Searching map: (%u == %u)\n",
                          map->s_partition_num, partitionNumber);
                if (map->s_partition_num != partitionNumber)
                        continue;
                if (map->s_partition_type == UDF_TYPE1_MAP15 ||
                    map->s_partition_type == UDF_SPARABLE_MAP15)
                        break;
        }

        if (i >= sbi->s_partitions) {
                udf_debug("Partition (%u) not found in partition map\n",
                          partitionNumber);
                /* ret is still 0 here */
                goto out_bh;
        }

        ret = udf_fill_partdesc_info(sb, p, i);
        if (ret < 0)
                goto out_bh;

        /*
         * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and
         * PHYSICAL partitions are already set up
         */
        type1_idx = i;
        for (i = 0; i < sbi->s_partitions; i++) {
                map = &sbi->s_partmaps[i];

                if (map->s_partition_num != partitionNumber)
                        continue;
                if (map->s_partition_type == UDF_VIRTUAL_MAP15 ||
                    map->s_partition_type == UDF_VIRTUAL_MAP20 ||
                    map->s_partition_type == UDF_METADATA_MAP25)
                        break;
        }

        /* No virtual / metadata map on top of this partition */
        if (i >= sbi->s_partitions) {
                ret = 0;
                goto out_bh;
        }

        ret = udf_fill_partdesc_info(sb, p, i);
        if (ret < 0)
                goto out_bh;

        if (map->s_partition_type == UDF_METADATA_MAP25) {
                ret = udf_load_metadata_files(sb, i, type1_idx);
                if (ret < 0) {
                        udf_err(sb, "error loading MetaData partition map %d\n",
                                i);
                        goto out_bh;
                }
        } else {
                /*
                 * If we have a partition with virtual map, we don't handle
                 * writing to it (we overwrite blocks instead of relocating
                 * them).
                 */
                if (!sb_rdonly(sb)) {
                        ret = -EACCES;
                        goto out_bh;
                }
                UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
                ret = udf_load_vat(sb, i, type1_idx);
                if (ret < 0)
                        goto out_bh;
        }
        ret = 0;
out_bh:
        /* In case loading failed, we handle cleanup in udf_fill_super */
        brelse(bh);
        return ret;
}

/*
 * Initialise @map as a sparable partition from the on-disk map @spm and
 * read the sparing tables it points to.
 *
 * A sparing table that cannot be read or does not pass the checks below is
 * skipped; its slot in s_spar_map is simply not filled in.
 *
 * Returns 0 on success, -EIO when the packet length is not a power of two,
 * more than 4 sparing tables are announced, or the announced table size
 * exceeds the block size.
 */
static int udf_load_sparable_map(struct super_block *sb,
                                 struct udf_part_map *map,
                                 struct sparablePartitionMap *spm)
{
        struct udf_sparing_data *sdata = &map->s_type_specific.s_sparing;
        struct sparingTable *st;
        struct buffer_head *bh;
        uint32_t tbl_block;
        uint16_t ident;
        int idx;

        map->s_partition_type = UDF_SPARABLE_MAP15;
        sdata->s_packet_len = le16_to_cpu(spm->packetLength);

        if (!is_power_of_2(sdata->s_packet_len)) {
                udf_err(sb, "error loading logical volume descriptor: "
                        "Invalid packet length %u\n",
                        (unsigned)sdata->s_packet_len);
                return -EIO;
        }

        if (spm->numSparingTables > 4) {
                udf_err(sb, "error loading logical volume descriptor: "
                        "Too many sparing tables (%d)\n",
                        (int)spm->numSparingTables);
                return -EIO;
        }

        if (le32_to_cpu(spm->sizeSparingTable) > sb->s_blocksize) {
                udf_err(sb, "error loading logical volume descriptor: "
                        "Too big sparing table size (%u)\n",
                        le32_to_cpu(spm->sizeSparingTable));
                return -EIO;
        }

        for (idx = 0; idx < spm->numSparingTables; idx++) {
                tbl_block = le32_to_cpu(spm->locSparingTable[idx]);
                bh = udf_read_tagged(sb, tbl_block, tbl_block, &ident);
                if (!bh)
                        continue;

                /*
                 * Keep the buffer only when the tag identifier is 0, the
                 * table carries the sparing identifier and the reallocation
                 * table fits into the block.
                 */
                st = (struct sparingTable *)bh->b_data;
                if (ident == 0 &&
                    !strncmp(st->sparingIdent.ident, UDF_ID_SPARING,
                             strlen(UDF_ID_SPARING)) &&
                    sizeof(*st) + le16_to_cpu(st->reallocationTableLen) <=
                                                        sb->s_blocksize) {
                        sdata->s_spar_map[idx] = bh;
                        continue;
                }

                brelse(bh);
        }

        map->s_partition_func = udf_get_pblock_spar15;
        return 0;
}

/*
 * Parse the Logical Volume Descriptor stored in @block.
 *
 * Validates the partition map table, allocates the in-memory partition maps
 * and fills in type, translation function, partition number and volume
 * sequence number for every map found in the table. When @fileset is not
 * NULL it receives the location recorded in logicalVolContentsUse. Finally
 * the Logical Volume Integrity Descriptor is loaded if the descriptor
 * points to one.
 *
 * Returns 0 on success, -EAGAIN when the block cannot be read, -EIO when
 * the partition map table is too long or announces more maps than can fit
 * into it, -EACCES when no valid LVID is available and the mount is not
 * read-only, or the error returned by one of the helpers.
 */
static int udf_load_logicalvol(struct super_block *sb, sector_t block,
                               struct kernel_lb_addr *fileset)
{
        struct logicalVolDesc *lvd;
        int i, offset;
        uint8_t type;
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct genericPartitionMap *gpm;
        uint16_t ident;
        struct buffer_head *bh;
        unsigned int table_len;
        unsigned int part_map_count;
        int ret;

        bh = udf_read_tagged(sb, block, block, &ident);
        if (!bh)
                return -EAGAIN;
        BUG_ON(ident != TAG_IDENT_LVD);
        lvd = (struct logicalVolDesc *)bh->b_data;

        /* The whole descriptor including the map table lives in one block */
        table_len = le32_to_cpu(lvd->mapTableLength);
        if (table_len > sb->s_blocksize - sizeof(*lvd)) {
                udf_err(sb, "error loading logical volume descriptor: "
                        "Partition table too long (%u > %lu)\n", table_len,
                        sb->s_blocksize - sizeof(*lvd));
                ret = -EIO;
                goto out_bh;
        }

        /*
         * The number of maps comes straight from the disk and is used as an
         * allocation size below. Every map has to fit into the table, so
         * reject counts that cannot possibly be backed by table_len bytes
         * (type 1 maps are the smallest ones handled here).
         */
        part_map_count = le32_to_cpu(lvd->numPartitionMaps);
        if (part_map_count > table_len / sizeof(struct genericPartitionMap1)) {
                udf_err(sb, "error loading logical volume descriptor: "
                        "Too many partition maps (%u > %u)\n", part_map_count,
                        table_len / (unsigned)sizeof(struct genericPartitionMap1));
                ret = -EIO;
                goto out_bh;
        }

        ret = udf_verify_domain_identifier(sb, &lvd->domainIdent,
                                           "logical volume");
        if (ret)
                goto out_bh;
        ret = udf_sb_alloc_partition_maps(sb, part_map_count);
        if (ret)
                goto out_bh;

        /* Walk the variable-length maps; each one records its own length */
        for (i = 0, offset = 0;
             i < sbi->s_partitions && offset < table_len;
             i++, offset += gpm->partitionMapLength) {
                struct udf_part_map *map = &sbi->s_partmaps[i];
                gpm = (struct genericPartitionMap *)
                                &(lvd->partitionMaps[offset]);
                type = gpm->partitionMapType;
                if (type == 1) {
                        struct genericPartitionMap1 *gpm1 =
                                (struct genericPartitionMap1 *)gpm;
                        map->s_partition_type = UDF_TYPE1_MAP15;
                        map->s_volumeseqnum = le16_to_cpu(gpm1->volSeqNum);
                        map->s_partition_num = le16_to_cpu(gpm1->partitionNum);
                        map->s_partition_func = NULL;
                } else if (type == 2) {
                        struct udfPartitionMap2 *upm2 =
                                                (struct udfPartitionMap2 *)gpm;
                        if (!strncmp(upm2->partIdent.ident, UDF_ID_VIRTUAL,
                                                strlen(UDF_ID_VIRTUAL))) {
                                /* Identifier suffix selects the VAT flavour */
                                u16 suf =
                                        le16_to_cpu(((__le16 *)upm2->partIdent.
                                                        identSuffix)[0]);
                                if (suf < 0x0200) {
                                        map->s_partition_type =
                                                        UDF_VIRTUAL_MAP15;
                                        map->s_partition_func =
                                                        udf_get_pblock_virt15;
                                } else {
                                        map->s_partition_type =
                                                        UDF_VIRTUAL_MAP20;
                                        map->s_partition_func =
                                                        udf_get_pblock_virt20;
                                }
                        } else if (!strncmp(upm2->partIdent.ident,
                                                UDF_ID_SPARABLE,
                                                strlen(UDF_ID_SPARABLE))) {
                                ret = udf_load_sparable_map(sb, map,
                                        (struct sparablePartitionMap *)gpm);
                                if (ret < 0)
                                        goto out_bh;
                        } else if (!strncmp(upm2->partIdent.ident,
                                                UDF_ID_METADATA,
                                                strlen(UDF_ID_METADATA))) {
                                struct udf_meta_data *mdata =
                                        &map->s_type_specific.s_metadata;
                                struct metadataPartitionMap *mdm =
                                                (struct metadataPartitionMap *)
                                                &(lvd->partitionMaps[offset]);
                                udf_debug("Parsing Logical vol part %d type %u  id=%s\n",
                                          i, type, UDF_ID_METADATA);

                                map->s_partition_type = UDF_METADATA_MAP25;
                                map->s_partition_func = udf_get_pblock_meta25;

                                mdata->s_meta_file_loc   =
                                        le32_to_cpu(mdm->metadataFileLoc);
                                mdata->s_mirror_file_loc =
                                        le32_to_cpu(mdm->metadataMirrorFileLoc);
                                mdata->s_bitmap_file_loc =
                                        le32_to_cpu(mdm->metadataBitmapFileLoc);
                                mdata->s_alloc_unit_size =
                                        le32_to_cpu(mdm->allocUnitSize);
                                mdata->s_align_unit_size =
                                        le16_to_cpu(mdm->alignUnitSize);
                                if (mdm->flags & 0x01)
                                        mdata->s_flags |= MF_DUPLICATE_MD;

                                udf_debug("Metadata Ident suffix=0x%x\n",
                                          le16_to_cpu(*(__le16 *)
                                                      mdm->partIdent.identSuffix));
                                udf_debug("Metadata part num=%u\n",
                                          le16_to_cpu(mdm->partitionNum));
                                udf_debug("Metadata part alloc unit size=%u\n",
                                          le32_to_cpu(mdm->allocUnitSize));
                                udf_debug("Metadata file loc=%u\n",
                                          le32_to_cpu(mdm->metadataFileLoc));
                                udf_debug("Mirror file loc=%u\n",
                                          le32_to_cpu(mdm->metadataMirrorFileLoc));
                                udf_debug("Bitmap file loc=%u\n",
                                          le32_to_cpu(mdm->metadataBitmapFileLoc));
                                udf_debug("Flags: %d %u\n",
                                          mdata->s_flags, mdm->flags);
                        } else {
                                /* Unknown type 2 map: leave the slot as is */
                                udf_debug("Unknown ident: %s\n",
                                          upm2->partIdent.ident);
                                continue;
                        }
                        map->s_volumeseqnum = le16_to_cpu(upm2->volSeqNum);
                        map->s_partition_num = le16_to_cpu(upm2->partitionNum);
                }
                udf_debug("Partition (%d:%u) type %u on volume %u\n",
                          i, map->s_partition_num, type, map->s_volumeseqnum);
        }

        if (fileset) {
                struct long_ad *la = (struct long_ad *)&(lvd->logicalVolContentsUse[0]);

                *fileset = lelb_to_cpu(la->extLocation);
                udf_debug("FileSet found in LogicalVolDesc at block=%u, partition=%u\n",
                          fileset->logicalBlockNum,
                          fileset->partitionReferenceNum);
        }
        if (lvd->integritySeqExt.extLength)
                udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
        ret = 0;

        if (!sbi->s_lvid_bh) {
                /* We can't generate unique IDs without a valid LVID */
                if (sb_rdonly(sb)) {
                        UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
                } else {
                        udf_warn(sb, "Damaged or missing LVID, forcing "
                                     "readonly mount\n");
                        ret = -EACCES;
                }
        }
out_bh:
        brelse(bh);
        return ret;
}

/*
 * Sanity-check the variable-size parts of a Logical Volume Integrity
 * Descriptor: the fixed part of the descriptor, lengthOfImpUse bytes and two
 * u32 values per partition must all fit into one block.
 *
 * Returns true when the descriptor passes the check.
 */
static bool udf_lvid_valid(struct super_block *sb,
                           struct logicalVolIntegrityDesc *lvid)
{
        u32 parts = le32_to_cpu(lvid->numOfPartitions);
        u32 impuselen = le32_to_cpu(lvid->lengthOfImpUse);

        /* Reject absurd values first so the sum below cannot overflow */
        if (parts >= sb->s_blocksize)
                return false;
        if (impuselen >= sb->s_blocksize)
                return false;

        return sizeof(struct logicalVolIntegrityDesc) + impuselen +
               2 * parts * sizeof(u32) <= sb->s_blocksize;
}

/*
 * Find the prevailing Logical Volume Integrity Descriptor.
 */
static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ad loc)
{
        struct buffer_head *bh, *final_bh;
        uint16_t ident;
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct logicalVolIntegrityDesc *lvid;
        int indirections = 0;

        while (++indirections <= UDF_MAX_LVID_NESTING) {
                final_bh = NULL;
                while (loc.extLength > 0 &&
                        (bh = udf_read_tagged(sb, loc.extLocation,
                                        loc.extLocation, &ident))) {
                        if (ident != TAG_IDENT_LVID) {
                                brelse(bh);
                                break;
                        }

                        brelse(final_bh);
                        final_bh = bh;

                        loc.extLength -= sb->s_blocksize;
                        loc.extLocation++;
                }

                if (!final_bh)
                        return;

                lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data;
                if (udf_lvid_valid(sb, lvid)) {
                        brelse(sbi->s_lvid_bh);
                        sbi->s_lvid_bh = final_bh;
                } else {
                        udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), "
                                 "ignoring.\n",
                                 le32_to_cpu(lvid->numOfPartitions),
                                 le32_to_cpu(lvid->lengthOfImpUse));
                }

                if (lvid->nextIntegrityExt.extLength == 0)
                        return;

                loc = leea_to_cpu(lvid->nextIntegrityExt);
        }

        udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n",
                UDF_MAX_LVID_NESTING);
        brelse(sbi->s_lvid_bh);
        sbi->s_lvid_bh = NULL;
}

/*
 * Step for reallocation of table of partition descriptor sequence numbers.
 * Must be power of 2 (it is used as the alignment argument of ALIGN()).
 */
#define PART_DESC_ALLOC_STEP 32

/* One slot of the partition descriptor table built while scanning a sequence */
struct part_desc_seq_scan_data {
        struct udf_vds_record rec;      /* record handed back to the scanner */
        u32 partnum;                    /* value the descriptor's partition number is matched against */
};

/* State collected by udf_process_sequence() during one scan of a sequence */
struct desc_seq_scan_data {
        struct udf_vds_record vds[VDS_POS_LENGTH];      /* one record per VDS_POS_* slot */
        unsigned int size_part_descs;   /* number of slots allocated in part_descs_loc */
        unsigned int num_part_descs;    /* number of slots of part_descs_loc in use */
        struct part_desc_seq_scan_data *part_descs_loc; /* kcalloc()ed table of partition descriptor slots */
};

/*
 * Return the scan record belonging to the partition descriptor in @bh.
 *
 * Descriptors are keyed by partition number: when a slot for the same
 * partition number already exists in @data, its record is returned so that
 * the caller can decide which copy prevails. Otherwise a new slot is
 * appended, growing the table in steps of PART_DESC_ALLOC_STEP if needed.
 *
 * Returns a pointer to the record, or ERR_PTR(-ENOMEM) when the table cannot
 * be grown.
 */
static struct udf_vds_record *handle_partition_descriptor(
                                struct buffer_head *bh,
                                struct desc_seq_scan_data *data)
{
        struct partitionDesc *desc = (struct partitionDesc *)bh->b_data;
        int partnum;
        int i;

        partnum = le16_to_cpu(desc->partitionNumber);
        for (i = 0; i < data->num_part_descs; i++)
                if (partnum == data->part_descs_loc[i].partnum)
                        return &(data->part_descs_loc[i].rec);
        if (data->num_part_descs >= data->size_part_descs) {
                struct part_desc_seq_scan_data *new_loc;
                unsigned int new_size;

                /*
                 * Slots are indexed by the number of entries, not by the
                 * partition number, so the capacity has to be derived from
                 * the entry count. Sizing it from partnum could yield a
                 * table that is no larger than the current one.
                 */
                new_size = ALIGN(data->num_part_descs + 1,
                                 PART_DESC_ALLOC_STEP);

                new_loc = kcalloc(new_size, sizeof(*new_loc), GFP_KERNEL);
                if (!new_loc)
                        return ERR_PTR(-ENOMEM);
                memcpy(new_loc, data->part_descs_loc,
                       data->size_part_descs * sizeof(*new_loc));
                kfree(data->part_descs_loc);
                data->part_descs_loc = new_loc;
                data->size_part_descs = new_size;
        }
        /* Record the key, otherwise the lookup above can never match it */
        data->part_descs_loc[data->num_part_descs].partnum = partnum;
        return &(data->part_descs_loc[data->num_part_descs++].rec);
}


/*
 * Map a descriptor tag identifier to the scan record that tracks it.
 *
 * Partition descriptors are delegated to handle_partition_descriptor(), so
 * the result may be an ERR_PTR(). NULL is returned for identifiers that are
 * not tracked.
 */
static struct udf_vds_record *get_volume_descriptor_record(uint16_t ident,
                struct buffer_head *bh, struct desc_seq_scan_data *data)
{
        int pos;

        /* ISO 13346 3/10.5 - kept in a separate, growable table */
        if (ident == TAG_IDENT_PD)
                return handle_partition_descriptor(bh, data);

        switch (ident) {
        case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */
                pos = VDS_POS_PRIMARY_VOL_DESC;
                break;
        case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */
                pos = VDS_POS_IMP_USE_VOL_DESC;
                break;
        case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */
                pos = VDS_POS_LOGICAL_VOL_DESC;
                break;
        case TAG_IDENT_USD: /* ISO 13346 3/10.8 */
                pos = VDS_POS_UNALLOC_SPACE_DESC;
                break;
        default:
                return NULL;
        }

        return &data->vds[pos];
}

/*
 * Process a main/reserve volume descriptor sequence.
 *   @sb        Superblock being mounted.
 *   @block     First block of first extent of the sequence.
 *   @lastblock Last block of first extent of the sequence.
 *   @fileset   There we store extent containing root fileset
 *
 * The sequence is scanned once to record, per descriptor kind, the block of
 * the copy with the highest volume descriptor sequence number; the recorded
 * blocks are then loaded in a fixed order (primary volume descriptor,
 * logical volume descriptor, partition descriptors).
 *
 * Returns <0 on error, 0 on success. -EAGAIN is special - try next descriptor
 * sequence
 */
static noinline int udf_process_sequence(
                struct super_block *sb,
                sector_t block, sector_t lastblock,
                struct kernel_lb_addr *fileset)
{
        struct buffer_head *bh = NULL;
        struct udf_vds_record *curr;
        struct generic_desc *gd;
        struct volDescPtr *vdp;
        bool done = false;
        uint32_t vdsn;
        uint16_t ident;
        int ret;
        unsigned int indirections = 0;
        struct desc_seq_scan_data data;
        unsigned int i;

        /* Start with empty records and an empty partition descriptor table */
        memset(data.vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
        data.size_part_descs = PART_DESC_ALLOC_STEP;
        data.num_part_descs = 0;
        data.part_descs_loc = kcalloc(data.size_part_descs,
                                      sizeof(*data.part_descs_loc),
                                      GFP_KERNEL);
        if (!data.part_descs_loc)
                return -ENOMEM;

        /*
         * Read the main descriptor sequence and find which descriptors
         * are in it.
         */
        for (; (!done && block <= lastblock); block++) {
                bh = udf_read_tagged(sb, block, block, &ident);
                /* An unreadable block ends the scan, it is not an error here */
                if (!bh)
                        break;

                /* Process each descriptor (ISO 13346 3/8.3-8.4) */
                gd = (struct generic_desc *)bh->b_data;
                vdsn = le32_to_cpu(gd->volDescSeqNum);
                switch (ident) {
                case TAG_IDENT_VDP: /* ISO 13346 3/10.3 */
                        /* Bound the number of pointer hops followed */
                        if (++indirections > UDF_MAX_TD_NESTING) {
                                udf_err(sb, "too many Volume Descriptor "
                                        "Pointers (max %u supported)\n",
                                        UDF_MAX_TD_NESTING);
                                brelse(bh);
                                ret = -EIO;
                                goto out;
                        }

                        /* Continue the scan in the extent the pointer names */
                        vdp = (struct volDescPtr *)bh->b_data;
                        block = le32_to_cpu(vdp->nextVolDescSeqExt.extLocation);
                        lastblock = le32_to_cpu(
                                vdp->nextVolDescSeqExt.extLength) >>
                                sb->s_blocksize_bits;
                        lastblock += block - 1;
                        /* For loop is going to increment 'block' again */
                        block--;
                        break;
                case TAG_IDENT_PVD: /* ISO 13346 3/10.1 */
                case TAG_IDENT_IUVD: /* ISO 13346 3/10.4 */
                case TAG_IDENT_LVD: /* ISO 13346 3/10.6 */
                case TAG_IDENT_USD: /* ISO 13346 3/10.8 */
                case TAG_IDENT_PD: /* ISO 13346 3/10.5 */
                        curr = get_volume_descriptor_record(ident, bh, &data);
                        if (IS_ERR(curr)) {
                                brelse(bh);
                                ret = PTR_ERR(curr);
                                goto out;
                        }
                        /* Descriptor we don't care about? */
                        if (!curr)
                                break;
                        /* The copy with the highest sequence number wins, later copy on a tie */
                        if (vdsn >= curr->volDescSeqNum) {
                                curr->volDescSeqNum = vdsn;
                                curr->block = block;
                        }
                        break;
                case TAG_IDENT_TD: /* ISO 13346 3/10.9 */
                        /* Terminating descriptor: stop after this block */
                        done = true;
                        break;
                }
                brelse(bh);
        }
        /*
         * Now read interesting descriptors again and process them
         * in a suitable order
         */
        if (!data.vds[VDS_POS_PRIMARY_VOL_DESC].block) {
                udf_err(sb, "Primary Volume Descriptor not found!\n");
                ret = -EAGAIN;
                goto out;
        }
        ret = udf_load_pvoldesc(sb, data.vds[VDS_POS_PRIMARY_VOL_DESC].block);
        if (ret < 0)
                goto out;

        /* A missing logical volume descriptor is tolerated at this point */
        if (data.vds[VDS_POS_LOGICAL_VOL_DESC].block) {
                ret = udf_load_logicalvol(sb,
                                data.vds[VDS_POS_LOGICAL_VOL_DESC].block,
                                fileset);
                if (ret < 0)
                        goto out;
        }

        /* Now handle prevailing Partition Descriptors */
        for (i = 0; i < data.num_part_descs; i++) {
                ret = udf_load_partdesc(sb, data.part_descs_loc[i].rec.block);
                if (ret < 0)
                        goto out;
        }
        ret = 0;
out:
        /* The table is only needed during the scan */
        kfree(data.part_descs_loc);
        return ret;
}

/*
 * Load Volume Descriptor Sequence described by anchor in bh
 *
 * The main sequence is tried first; the reserve sequence is used only when
 * the main one asks for a retry (-EAGAIN).
 *
 * Returns <0 on error, 0 on success
 */
static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
                             struct kernel_lb_addr *fileset)
{
        struct anchorVolDescPtr *anchor = (struct anchorVolDescPtr *)bh->b_data;
        sector_t main_s, main_e, reserve_s, reserve_e;
        int ret;

        /* Main sequence: first block and last block of its extent */
        main_s = le32_to_cpu(anchor->mainVolDescSeqExt.extLocation);
        main_e = le32_to_cpu(anchor->mainVolDescSeqExt.extLength) >>
                 sb->s_blocksize_bits;
        main_e += main_s - 1;

        /* Reserve sequence: same computation on the second extent */
        reserve_s = le32_to_cpu(anchor->reserveVolDescSeqExt.extLocation);
        reserve_e = le32_to_cpu(anchor->reserveVolDescSeqExt.extLength) >>
                    sb->s_blocksize_bits;
        reserve_e += reserve_s - 1;

        /* Process the main & reserve sequences */
        /* responsible for finding the PartitionDesc(s) */
        ret = udf_process_sequence(sb, main_s, main_e, fileset);
        if (ret != -EAGAIN)
                return ret;

        /* Drop whatever the main sequence set up before trying the reserve */
        udf_sb_free_partitions(sb);
        ret = udf_process_sequence(sb, reserve_s, reserve_e, fileset);
        if (ret >= 0)
                return ret;

        udf_sb_free_partitions(sb);
        /* No sequence was OK, return -EIO */
        return ret == -EAGAIN ? -EIO : ret;
}

/*
 * Check whether there is an anchor block in the given block and
 * load Volume Descriptor Sequence if so.
 *
 * Returns <0 on error, 0 on success, -EAGAIN is special - try next anchor
 * block
 */
static int udf_check_anchor_block(struct super_block *sb, sector_t block,
                                  struct kernel_lb_addr *fileset)
{
        struct buffer_head *bh;
        uint16_t ident;
        int ret;

        bh = udf_read_tagged(sb, block, block, &ident);
        if (!bh)
                return -EAGAIN;

        /* Only a block tagged as anchor descriptor is worth following */
        if (ident == TAG_IDENT_AVDP)
                ret = udf_load_sequence(sb, bh, fileset);
        else
                ret = -EAGAIN;

        brelse(bh);
        return ret;
}

/*
 * Search for an anchor volume descriptor pointer.
 *
 * On success with one of the guessed last blocks, *lastblock is updated to
 * the guess that worked.
 *
 * Returns < 0 on error, 0 on success. -EAGAIN is special - try next set
 * of anchors.
 */
static int udf_scan_anchors(struct super_block *sb, udf_pblk_t *lastblock,
                            struct kernel_lb_addr *fileset)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        udf_pblk_t candidates[6];
        int ncand = 0;
        int idx, probe;
        int ret;

        /* First try user provided anchor */
        if (sbi->s_anchor) {
                ret = udf_check_anchor_block(sb, sbi->s_anchor, fileset);
                if (ret != -EAGAIN)
                        return ret;
        }

        /*
         * according to spec, anchor is in either:
         *     block 256
         *     lastblock-256
         *     lastblock
         *  however, if the disc isn't closed, it could be 512.
         */
        ret = udf_check_anchor_block(sb, sbi->s_session + 256, fileset);
        if (ret != -EAGAIN)
                return ret;

        /*
         * The trouble is which block is the last one. Drives often misreport
         * this so we try various possibilities.
         */
        candidates[ncand++] = *lastblock;
        if (*lastblock >= 1)
                candidates[ncand++] = *lastblock - 1;
        candidates[ncand++] = *lastblock + 1;
        if (*lastblock >= 2)
                candidates[ncand++] = *lastblock - 2;
        if (*lastblock >= 150)
                candidates[ncand++] = *lastblock - 150;
        if (*lastblock >= 152)
                candidates[ncand++] = *lastblock - 152;

        for (idx = 0; idx < ncand; idx++) {
                udf_pblk_t cand = candidates[idx];

                /* Guesses beyond the end of the device are pointless */
                if (cand >= sb_bdev_nr_blocks(sb))
                        continue;

                /* Probe the guess itself, then 256 blocks before it */
                for (probe = 0; probe < 2; probe++) {
                        if (probe == 1 && cand < 256)
                                break;
                        ret = udf_check_anchor_block(sb,
                                        probe ? cand - 256 : cand, fileset);
                        if (ret == -EAGAIN)
                                continue;
                        if (!ret)
                                *lastblock = cand;
                        return ret;
                }
        }

        /* Finally try block 512 in case media is open */
        return udf_check_anchor_block(sb, sbi->s_session + 512, fileset);
}

/*
 * Check Volume Structure Descriptor, find Anchor block and load Volume
 * Descriptor Sequence.
 *
 * Returns < 0 on error, 0 on success. -EAGAIN is special meaning anchor
 * block was not found.
 */
static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
                        int silent, struct kernel_lb_addr *fileset)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        int nsr = 0;
        int ret;

        if (!sb_set_blocksize(sb, uopt->blocksize)) {
                if (!silent)
                        udf_warn(sb, "Bad block size\n");
                return -EINVAL;
        }

        sbi->s_last_block = uopt->lastblock;

        if (UDF_QUERY_FLAG(sb, UDF_FLAG_NOVRS)) {
                udf_debug("Validity check skipped because of novrs option\n");
        } else {
                /* Check that it is NSR02 compliant */
                nsr = udf_check_vsd(sb);
                if (nsr == 0) {
                        if (!silent)
                                udf_warn(sb, "No VRS found\n");
                        return -EINVAL;
                }
                if (nsr == -1)
                        udf_debug("Failed to read sector at offset %d. "
                                  "Assuming open disc. Skipping validity "
                                  "check\n", VSD_FIRST_SECTOR_OFFSET);
                /* No last block given by the user: determine it ourselves */
                if (!sbi->s_last_block)
                        sbi->s_last_block = udf_get_last_block(sb);
        }

        /* Look for anchor block and load Volume Descriptor Sequence */
        sbi->s_anchor = uopt->anchor;
        ret = udf_scan_anchors(sb, &sbi->s_last_block, fileset);
        if (ret >= 0)
                return 0;

        if (ret == -EAGAIN && !silent)
                udf_warn(sb, "No anchor found\n");
        return ret;
}

static void udf_finalize_lvid(struct logicalVolIntegrityDesc *lvid)
{
        struct timespec64 ts;

        ktime_get_real_ts64(&ts);
        udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
        lvid->descTag.descCRC = cpu_to_le16(
                crc_itu_t(0, (char *)lvid + sizeof(struct tag),
                        le16_to_cpu(lvid->descTag.descCRCLength)));
        lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
}

static void udf_open_lvid(struct super_block *sb)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct buffer_head *bh = sbi->s_lvid_bh;
        struct logicalVolIntegrityDesc *lvid;
        struct logicalVolIntegrityDescImpUse *lvidiu;

        if (!bh)
                return;
        lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
        lvidiu = udf_sb_lvidiu(sb);
        if (!lvidiu)
                return;

        mutex_lock(&sbi->s_alloc_mutex);
        lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
        lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
        if (le32_to_cpu(lvid->integrityType) == LVID_INTEGRITY_TYPE_CLOSE)
                lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
        else
                UDF_SET_FLAG(sb, UDF_FLAG_INCONSISTENT);

        udf_finalize_lvid(lvid);
        mark_buffer_dirty(bh);
        sbi->s_lvid_dirty = 0;
        mutex_unlock(&sbi->s_alloc_mutex);
        /* Make opening of filesystem visible on the media immediately */
        sync_dirty_buffer(bh);
}

static void udf_close_lvid(struct super_block *sb)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct buffer_head *bh = sbi->s_lvid_bh;
        struct logicalVolIntegrityDesc *lvid;
        struct logicalVolIntegrityDescImpUse *lvidiu;

        if (!bh)
                return;
        lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
        lvidiu = udf_sb_lvidiu(sb);
        if (!lvidiu)
                return;

        mutex_lock(&sbi->s_alloc_mutex);
        lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
        lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
        if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
                lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
        if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
                lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev);
        if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev))
                lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev);
        if (!UDF_QUERY_FLAG(sb, UDF_FLAG_INCONSISTENT))
                lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);

        /*
         * We set buffer uptodate unconditionally here to avoid spurious
         * warnings from mark_buffer_dirty() when previous EIO has marked
         * the buffer as !uptodate
         */
        set_buffer_uptodate(bh);
        udf_finalize_lvid(lvid);
        mark_buffer_dirty(bh);
        sbi->s_lvid_dirty = 0;
        mutex_unlock(&sbi->s_alloc_mutex);
        /* Make closing of filesystem visible on the media immediately */
        sync_dirty_buffer(bh);
}

u64 lvid_get_unique_id(struct super_block *sb)
{
        struct buffer_head *bh;
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct logicalVolIntegrityDesc *lvid;
        struct logicalVolHeaderDesc *lvhd;
        u64 uniqueID;
        u64 ret;

        bh = sbi->s_lvid_bh;
        if (!bh)
                return 0;

        lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
        lvhd = (struct logicalVolHeaderDesc *)lvid->logicalVolContentsUse;

        mutex_lock(&sbi->s_alloc_mutex);
        ret = uniqueID = le64_to_cpu(lvhd->uniqueID);
        if (!(++uniqueID & 0xFFFFFFFF))
                uniqueID += 16;
        lvhd->uniqueID = cpu_to_le64(uniqueID);
        udf_updated_lvid(sb);
        mutex_unlock(&sbi->s_alloc_mutex);

        return ret;
}

static int udf_fill_super(struct super_block *sb, struct fs_context *fc)
{
        int ret = -EINVAL;
        struct inode *inode = NULL;
        struct udf_options *uopt = fc->fs_private;
        struct kernel_lb_addr rootdir, fileset;
        struct udf_sb_info *sbi;
        bool lvid_open = false;
        int silent = fc->sb_flags & SB_SILENT;

        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;

        sb->s_fs_info = sbi;

        mutex_init(&sbi->s_alloc_mutex);

        fileset.logicalBlockNum = 0xFFFFFFFF;
        fileset.partitionReferenceNum = 0xFFFF;

        sbi->s_flags = uopt->flags;
        sbi->s_uid = uopt->uid;
        sbi->s_gid = uopt->gid;
        sbi->s_umask = uopt->umask;
        sbi->s_fmode = uopt->fmode;
        sbi->s_dmode = uopt->dmode;
        sbi->s_nls_map = uopt->nls_map;
        uopt->nls_map = NULL;
        rwlock_init(&sbi->s_cred_lock);

        if (uopt->session == 0xFFFFFFFF)
                sbi->s_session = udf_get_last_session(sb);
        else
                sbi->s_session = uopt->session;

        udf_debug("Multi-session=%d\n", sbi->s_session);

        /* Fill in the rest of the superblock */
        sb->s_op = &udf_sb_ops;
        sb->s_export_op = &udf_export_ops;

        sb->s_magic = UDF_SUPER_MAGIC;
        sb->s_time_gran = 1000;

        if (uopt->flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
                ret = udf_load_vrs(sb, uopt, silent, &fileset);
        } else {
                uopt->blocksize = bdev_logical_block_size(sb->s_bdev);
                while (uopt->blocksize <= 4096) {
                        ret = udf_load_vrs(sb, uopt, silent, &fileset);
                        if (ret < 0) {
                                if (!silent && ret != -EACCES) {
                                        pr_notice("Scanning with blocksize %u failed\n",
                                                  uopt->blocksize);
                                }
                                brelse(sbi->s_lvid_bh);
                                sbi->s_lvid_bh = NULL;
                                /*
                                 * EACCES is special - we want to propagate to
                                 * upper layers that we cannot handle RW mount.
                                 */
                                if (ret == -EACCES)
                                        break;
                        } else
                                break;

                        uopt->blocksize <<= 1;
                }
        }
        if (ret < 0) {
                if (ret == -EAGAIN) {
                        udf_warn(sb, "No partition found (1)\n");
                        ret = -EINVAL;
                }
                goto error_out;
        }

        udf_debug("Lastblock=%u\n", sbi->s_last_block);

        if (sbi->s_lvid_bh) {
                struct logicalVolIntegrityDescImpUse *lvidiu =
                                                        udf_sb_lvidiu(sb);
                uint16_t minUDFReadRev;
                uint16_t minUDFWriteRev;

                if (!lvidiu) {
                        ret = -EINVAL;
                        goto error_out;
                }
                minUDFReadRev = le16_to_cpu(lvidiu->minUDFReadRev);
                minUDFWriteRev = le16_to_cpu(lvidiu->minUDFWriteRev);
                if (minUDFReadRev > UDF_MAX_READ_VERSION) {
                        udf_err(sb, "minUDFReadRev=%x (max is %x)\n",
                                minUDFReadRev,
                                UDF_MAX_READ_VERSION);
                        ret = -EINVAL;
                        goto error_out;
                } else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) {
                        if (!sb_rdonly(sb)) {
                                ret = -EACCES;
                                goto error_out;
                        }
                        UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
                }

                sbi->s_udfrev = minUDFWriteRev;

                if (minUDFReadRev >= UDF_VERS_USE_EXTENDED_FE)
                        UDF_SET_FLAG(sb, UDF_FLAG_USE_EXTENDED_FE);
                if (minUDFReadRev >= UDF_VERS_USE_STREAMS)
                        UDF_SET_FLAG(sb, UDF_FLAG_USE_STREAMS);
        }

        if (!sbi->s_partitions) {
                udf_warn(sb, "No partition found (2)\n");
                ret = -EINVAL;
                goto error_out;
        }

        if (sbi->s_partmaps[sbi->s_partition].s_partition_flags &
                        UDF_PART_FLAG_READ_ONLY) {
                if (!sb_rdonly(sb)) {
                        ret = -EACCES;
                        goto error_out;
                }
                UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
        }

        ret = udf_find_fileset(sb, &fileset, &rootdir);
        if (ret < 0) {
                udf_warn(sb, "No fileset found\n");
                goto error_out;
        }

        if (!silent) {
                struct timestamp ts;
                udf_time_to_disk_stamp(&ts, sbi->s_record_time);
                udf_info("Mounting volume '%s', timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
                         sbi->s_volume_ident,
                         le16_to_cpu(ts.year), ts.month, ts.day,
                         ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone));
        }
        if (!sb_rdonly(sb)) {
                udf_open_lvid(sb);
                lvid_open = true;
        }

        /* Assign the root inode */
        /* assign inodes by physical block number */
        /* perhaps it's not extensible enough, but for now ... */
        inode = udf_iget(sb, &rootdir);
        if (IS_ERR(inode)) {
                udf_err(sb, "Error in udf_iget, block=%u, partition=%u\n",
                       rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
                ret = PTR_ERR(inode);
                goto error_out;
        }

        /* Allocate a dentry for the root inode */
        sb->s_root = d_make_root(inode);
        if (!sb->s_root) {
                udf_err(sb, "Couldn't allocate root dentry\n");
                ret = -ENOMEM;
                goto error_out;
        }
        sb->s_maxbytes = UDF_MAX_FILESIZE;
        sb->s_max_links = UDF_MAX_LINKS;
        return 0;

error_out:
        iput(sbi->s_vat_inode);
        unload_nls(uopt->nls_map);
        if (lvid_open)
                udf_close_lvid(sb);
        brelse(sbi->s_lvid_bh);
        udf_sb_free_partitions(sb);
        kfree(sbi);
        sb->s_fs_info = NULL;

        return ret;
}

void _udf_err(struct super_block *sb, const char *function,
              const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);

        vaf.fmt = fmt;
        vaf.va = &args;

        pr_err("error (device %s): %s: %pV", sb->s_id, function, &vaf);

        va_end(args);
}

void _udf_warn(struct super_block *sb, const char *function,
               const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);

        vaf.fmt = fmt;
        vaf.va = &args;

        pr_warn("warning (device %s): %s: %pV", sb->s_id, function, &vaf);

        va_end(args);
}

/*
 * Release everything held by the UDF per-superblock info when the
 * superblock goes away.
 *
 * For a mount that is not read-only the LVID is closed (and written out by
 * udf_close_lvid()) before its buffer is released.
 */
static void udf_put_super(struct super_block *sb)
{
        struct udf_sb_info *sbi;

        sbi = UDF_SB(sb);

        iput(sbi->s_vat_inode);
        unload_nls(sbi->s_nls_map);
        /* Must happen before brelse() below: it still uses s_lvid_bh. */
        if (!sb_rdonly(sb))
                udf_close_lvid(sb);
        brelse(sbi->s_lvid_bh);
        udf_sb_free_partitions(sb);
        mutex_destroy(&sbi->s_alloc_mutex);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
}

/*
 * Sync callback: if the in-memory LVID has pending changes, finalize it
 * (timestamp, CRC, checksum) and mark its buffer dirty.
 *
 * @wait is not used. Always returns 0.
 */
static int udf_sync_fs(struct super_block *sb, int wait)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct buffer_head *lvid_bh;

        mutex_lock(&sbi->s_alloc_mutex);
        if (!sbi->s_lvid_dirty)
                goto unlock;

        lvid_bh = sbi->s_lvid_bh;
        udf_finalize_lvid((struct logicalVolIntegrityDesc *)lvid_bh->b_data);

        /*
         * Blockdevice will be synced later so we don't have to submit
         * the buffer for IO
         */
        mark_buffer_dirty(lvid_bh);
        sbi->s_lvid_dirty = 0;
unlock:
        mutex_unlock(&sbi->s_alloc_mutex);

        return 0;
}

/*
 * Fill @buf with filesystem statistics for the superblock of @dentry.
 *
 * The block count is the length of the current partition and the free/avail
 * counts come from udf_count_free(). The file count is numFiles + numDirs
 * from the LVID implementation-use area (0 when that area is unavailable)
 * plus the number of free blocks. Always returns 0.
 */
static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct super_block *sb = dentry->d_sb;
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
        u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
        unsigned int recorded = 0;

        if (lvidiu != NULL)
                recorded = le32_to_cpu(lvidiu->numFiles) +
                           le32_to_cpu(lvidiu->numDirs);

        buf->f_type = UDF_SUPER_MAGIC;
        buf->f_bsize = sb->s_blocksize;
        buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len;
        buf->f_bfree = udf_count_free(sb);
        buf->f_bavail = buf->f_bfree;
        /*
         * Let's pretend each free block is also a free 'inode' since UDF does
         * not have separate preallocated table of inodes.
         */
        buf->f_files = recorded + buf->f_bfree;
        buf->f_ffree = buf->f_bfree;
        buf->f_namelen = UDF_NAME_LEN;
        buf->f_fsid = u64_to_fsid(id);

        return 0;
}

/*
 * Count the set bits of an unallocated-space bitmap; each set bit is counted
 * as one free block.
 *
 * The bitmap starts at bitmap->s_extPosition in the current partition. Its
 * first block holds a space bitmap descriptor (tag TAG_IDENT_SBD) followed by
 * bitmap data; every further block is bitmap data only.
 *
 * Returns the number of set bits. If the descriptor cannot be read or has the
 * wrong tag, 0 is returned; if a later block cannot be read, the count
 * accumulated so far is returned.
 */
static unsigned int udf_count_free_bitmap(struct super_block *sb,
                                          struct udf_bitmap *bitmap)
{
        struct buffer_head *bh = NULL;
        unsigned int accum = 0;
        int index;
        udf_pblk_t block = 0, newblock;
        struct kernel_lb_addr loc;
        uint32_t bytes;
        uint8_t *ptr;
        uint16_t ident;
        struct spaceBitmapDesc *bm;

        loc.logicalBlockNum = bitmap->s_extPosition;
        loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
        bh = udf_read_ptagged(sb, &loc, 0, &ident);

        if (!bh) {
                udf_err(sb, "udf_count_free failed\n");
                goto out;
        } else if (ident != TAG_IDENT_SBD) {
                brelse(bh);
                udf_err(sb, "udf_count_free failed\n");
                goto out;
        }

        bm = (struct spaceBitmapDesc *)bh->b_data;
        /* Total number of bitmap bytes, as recorded in the descriptor. */
        bytes = le32_to_cpu(bm->numOfBytes);
        index = sizeof(struct spaceBitmapDesc); /* offset in first block only */
        ptr = (uint8_t *)bh->b_data;

        while (bytes > 0) {
                /* Bitmap bytes available in the current block. */
                u32 cur_bytes = min_t(u32, bytes, sb->s_blocksize - index);
                accum += bitmap_weight((const unsigned long *)(ptr + index),
                                        cur_bytes * 8);
                bytes -= cur_bytes;
                if (bytes) {
                        /* More data follows: move on to the next block. */
                        brelse(bh);
                        newblock = udf_get_lb_pblock(sb, &loc, ++block);
                        bh = sb_bread(sb, newblock);
                        if (!bh) {
                                udf_debug("read failed\n");
                                goto out;
                        }
                        index = 0;
                        ptr = (uint8_t *)bh->b_data;
                }
        }
        brelse(bh);
out:
        return accum;
}

static unsigned int udf_count_free_table(struct super_block *sb,
                                         struct inode *table)
{
        unsigned int accum = 0;
        uint32_t elen;
        struct kernel_lb_addr eloc;
        struct extent_position epos;

        mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
        epos.block = UDF_I(table)->i_location;
        epos.offset = sizeof(struct unallocSpaceEntry);
        epos.bh = NULL;

        while (udf_next_aext(table, &epos, &eloc, &elen, 1) != -1)
                accum += (elen >> table->i_sb->s_blocksize_bits);

        brelse(epos.bh);
        mutex_unlock(&UDF_SB(sb)->s_alloc_mutex);

        return accum;
}

/*
 * Return the number of free blocks of the current partition.
 *
 * Sources are tried in this order, and the first non-zero result wins:
 *   1. the free space table in the LVID (a value of 0xFFFFFFFF counts as 0),
 *   2. the unallocated-space bitmap, if the partition has one,
 *   3. the unallocated-space table, if the partition has one.
 *
 * For a metadata partition the underlying physical partition is examined.
 * For virtual (VAT) partitions 0 is always reported.
 */
static unsigned int udf_count_free(struct super_block *sb)
{
        struct udf_sb_info *sbi = UDF_SB(sb);
        unsigned int part = sbi->s_partition;
        int ptype = sbi->s_partmaps[part].s_partition_type;
        struct udf_part_map *map;
        unsigned int free_blocks = 0;

        /*
         * Filesystems with VAT are append-only and we cannot write to
         * them. Let's just report 0 here.
         */
        if (ptype == UDF_VIRTUAL_MAP15 || ptype == UDF_VIRTUAL_MAP20)
                return 0;

        if (ptype == UDF_METADATA_MAP25)
                part = sbi->s_partmaps[part].s_type_specific.s_metadata.
                                                        s_phys_partition_ref;

        if (sbi->s_lvid_bh) {
                struct logicalVolIntegrityDesc *lvid =
                        (struct logicalVolIntegrityDesc *)
                        sbi->s_lvid_bh->b_data;

                if (part < le32_to_cpu(lvid->numOfPartitions)) {
                        free_blocks = le32_to_cpu(lvid->freeSpaceTable[part]);
                        if (free_blocks == 0xFFFFFFFF)
                                free_blocks = 0;
                }
        }
        if (free_blocks != 0)
                return free_blocks;

        map = &sbi->s_partmaps[part];

        if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
                free_blocks += udf_count_free_bitmap(sb,
                                                     map->s_uspace.s_bitmap);
        if (free_blocks != 0)
                return free_blocks;

        if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
                free_blocks += udf_count_free_table(sb,
                                                    map->s_uspace.s_table);
        return free_blocks;
}

/* Module metadata, followed by registration of the module init/exit hooks. */
MODULE_AUTHOR("Ben Fennema");
MODULE_DESCRIPTION("Universal Disk Format Filesystem");
MODULE_LICENSE("GPL");
module_init(init_udf_fs)
module_exit(exit_udf_fs)




































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 *  Driver for 8250/16550-type serial ports
 *
 *  Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o.
 *
 *  Copyright (C) 2001 Russell King.
 */

#include <linux/bits.h>
#include <linux/serial_8250.h>
#include <linux/serial_reg.h>
#include <linux/dmaengine.h>

#include "../serial_mctrl_gpio.h"

/*
 * DMA state attached to an 8250 port: optional driver hooks, the filter
 * function with its parameters, and per-direction (rx/tx) channel
 * configuration, channels, addresses, cookies, sizes and status flags.
 */
struct uart_8250_dma {
        /* Hooks taking the port; presumably start a tx/rx transfer - TODO confirm */
        int (*tx_dma)(struct uart_8250_port *p);
        int (*rx_dma)(struct uart_8250_port *p);
        /* Optional hooks; called only when non-NULL (see serial8250_do_prepare_*_dma) */
        void (*prepare_tx_dma)(struct uart_8250_port *p);
        void (*prepare_rx_dma)(struct uart_8250_port *p);

        /* Filter function */
        dma_filter_fn                fn;
        /* Parameter to the filter function */
        void                        *rx_param;
        void                        *tx_param;

        struct dma_slave_config        rxconf;
        struct dma_slave_config        txconf;

        struct dma_chan                *rxchan;
        struct dma_chan                *txchan;

        /* Device address base for DMA operations */
        phys_addr_t                rx_dma_addr;
        phys_addr_t                tx_dma_addr;

        /* DMA address of the buffer in memory */
        dma_addr_t                rx_addr;
        dma_addr_t                tx_addr;

        dma_cookie_t                rx_cookie;
        dma_cookie_t                tx_cookie;

        void                        *rx_buf;

        size_t                        rx_size;
        size_t                        tx_size;

        /* Non-zero tx_running is reported by serial8250_tx_dma_running() */
        unsigned char                tx_running;
        unsigned char                tx_err;
        unsigned char                rx_running;
};

/*
 * Static description of a serial port: UART type, base baud rate, I/O port
 * or memory base, IRQ, flags and register access parameters.
 * NOTE(review): presumably used for legacy, statically configured ports -
 * confirm against the users of this struct.
 */
struct old_serial_port {
        unsigned int uart;
        unsigned int baud_base;
        unsigned int port;
        unsigned int irq;
        upf_t        flags;
        unsigned char io_type;
        unsigned char __iomem *iomem_base;
        unsigned short iomem_reg_shift;
};

/*
 * Per-UART-type configuration: a name, FIFO size, transmit load size, an FCR
 * value, a table of receive trigger levels in bytes (one entry per FCR
 * trigger state) and flags.
 * NOTE(review): flags presumably holds UART_CAP_* bits - confirm.
 */
struct serial8250_config {
        const char        *name;
        unsigned short        fifo_size;
        unsigned short        tx_loadsz;
        unsigned char        fcr;
        unsigned char        rxtrig_bytes[UART_FCR_R_TRIG_MAX_STATE];
        unsigned int        flags;
};

/* UART capability flags; these use bits 8 through 18. */
#define UART_CAP_FIFO        BIT(8)        /* UART has FIFO */
#define UART_CAP_EFR        BIT(9)        /* UART has EFR */
#define UART_CAP_SLEEP        BIT(10)        /* UART has IER sleep */
#define UART_CAP_AFE        BIT(11)        /* MCR-based hw flow control */
#define UART_CAP_UUE        BIT(12)        /* UART needs IER bit 6 set (Xscale) */
#define UART_CAP_RTOIE        BIT(13)        /* UART needs IER bit 4 set (Xscale, Tegra) */
#define UART_CAP_HFIFO        BIT(14)        /* UART has a "hidden" FIFO */
#define UART_CAP_RPM        BIT(15)        /* Runtime PM is active while idle */
#define UART_CAP_IRDA        BIT(16)        /* UART supports IrDA line discipline */
#define UART_CAP_MINI        BIT(17)        /* Mini UART on BCM283X family lacks:
                                         * STOP PARITY EPAR SPAR WLEN5 WLEN6
                                         */
#define UART_CAP_NOTEMT        BIT(18)        /* UART without interrupt on TEMT available */

/* UART bug/quirk flags; bit 4 is not assigned here. */
#define UART_BUG_QUOT        BIT(0)        /* UART has buggy quot LSB */
#define UART_BUG_TXEN        BIT(1)        /* UART has buggy TX IIR status */
#define UART_BUG_NOMSR        BIT(2)        /* UART has buggy MSR status bits (Au1x00) */
#define UART_BUG_THRE        BIT(3)        /* UART has buggy THRE reassertion */
#define UART_BUG_TXRACE        BIT(5)        /* UART Tx fails to set remote DR */


/* 1 when CONFIG_SERIAL_8250_SHARE_IRQ is enabled, 0 otherwise. */
#ifdef CONFIG_SERIAL_8250_SHARE_IRQ
#define SERIAL8250_SHARE_IRQS 1
#else
#define SERIAL8250_SHARE_IRQS 0
#endif

/*
 * Designated initializer for an I/O-port (UPIO_PORT) based port description
 * with a 1843200 Hz UART clock. UPF_BOOT_AUTOCONF is always set; _flags is
 * OR-ed on top of it.
 */
#define SERIAL8250_PORT_FLAGS(_base, _irq, _flags)                \
        {                                                        \
                .iobase                = _base,                        \
                .irq                = _irq,                                \
                .uartclk        = 1843200,                        \
                .iotype                = UPIO_PORT,                        \
                .flags                = UPF_BOOT_AUTOCONF | (_flags),        \
        }

/* Same as SERIAL8250_PORT_FLAGS() with no additional flags. */
#define SERIAL8250_PORT(_base, _irq) SERIAL8250_PORT_FLAGS(_base, _irq, 0)


static inline int serial_in(struct uart_8250_port *up, int offset)
{
        return up->port.serial_in(&up->port, offset);
}

static inline void serial_out(struct uart_8250_port *up, int offset, int value)
{
        up->port.serial_out(&up->port, offset, value);
}

/**
 *        serial_lsr_in - Read LSR register and preserve flags across reads
 *        @up:        uart 8250 port
 *
 *        Reads the LSR register and merges in the flags saved by earlier
 *        reads. The bits of the merged value selected by up->lsr_save_mask
 *        are stored back into up->lsr_saved_flags for the next read.
 *
 *        Returns LSR value or'ed with the preserved flags (if any).
 */
static inline u16 serial_lsr_in(struct uart_8250_port *up)
{
        const u16 carried = up->lsr_saved_flags;
        const u16 lsr = carried | serial_in(up, UART_LSR);

        up->lsr_saved_flags = lsr & up->lsr_save_mask;

        return lsr;
}

/*
 * For the 16C950
 *
 * Write @value to an indexed register: @offset is first written to UART_SCR
 * to select the register, then @value is written to UART_ICR.
 */
static void serial_icr_write(struct uart_8250_port *up, int offset, int value)
{
        serial_out(up, UART_SCR, offset);
        serial_out(up, UART_ICR, value);
}

/*
 * Read the indexed register selected by @offset.
 *
 * UART_ACR_ICRRD is set in ACR (on top of the cached up->acr) for the
 * duration of the read, and ACR is restored to up->acr afterwards.
 * Returns the value read from UART_ICR.
 */
static unsigned int __maybe_unused serial_icr_read(struct uart_8250_port *up,
                                                   int offset)
{
        unsigned int value;

        serial_icr_write(up, UART_ACR, up->acr | UART_ACR_ICRRD);
        serial_out(up, UART_SCR, offset);
        value = serial_in(up, UART_ICR);
        serial_icr_write(up, UART_ACR, up->acr);

        return value;
}

void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p);

/* Return the value produced by the port's dl_read() callback. */
static inline u32 serial_dl_read(struct uart_8250_port *up)
{
        return up->dl_read(up);
}

/* Pass @value to the port's dl_write() callback. */
static inline void serial_dl_write(struct uart_8250_port *up, u32 value)
{
        up->dl_write(up, value);
}

/*
 * Set UART_IER_THRI in the cached IER value and write it to the IER register.
 *
 * Returns true if the bit was newly set (and the register written), false if
 * it was already set, in which case nothing is written.
 */
static inline bool serial8250_set_THRI(struct uart_8250_port *up)
{
        bool changed;

        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&up->port.lock);

        changed = !(up->ier & UART_IER_THRI);
        if (changed) {
                up->ier |= UART_IER_THRI;
                serial_out(up, UART_IER, up->ier);
        }
        return changed;
}

/*
 * Clear UART_IER_THRI in the cached IER value and write it to the IER
 * register.
 *
 * Returns true if the bit was set before (and the register written), false
 * if it was already clear, in which case nothing is written.
 */
static inline bool serial8250_clear_THRI(struct uart_8250_port *up)
{
        bool changed;

        /* Port locked to synchronize UART_IER access against the console. */
        lockdep_assert_held_once(&up->port.lock);

        changed = (up->ier & UART_IER_THRI) != 0;
        if (changed) {
                up->ier &= ~UART_IER_THRI;
                serial_out(up, UART_IER, up->ier);
        }
        return changed;
}

struct uart_8250_port *serial8250_get_port(int line);

void serial8250_rpm_get(struct uart_8250_port *p);
void serial8250_rpm_put(struct uart_8250_port *p);

void serial8250_rpm_get_tx(struct uart_8250_port *p);
void serial8250_rpm_put_tx(struct uart_8250_port *p);

int serial8250_em485_config(struct uart_port *port, struct ktermios *termios,
                            struct serial_rs485 *rs485);
void serial8250_em485_start_tx(struct uart_8250_port *p);
void serial8250_em485_stop_tx(struct uart_8250_port *p);
void serial8250_em485_destroy(struct uart_8250_port *p);
extern struct serial_rs485 serial8250_em485_supported;

/* MCR <-> TIOCM conversion */

/*
 * Translate the TIOCM_RTS/DTR/OUT1/OUT2/LOOP bits of @tiocm into the
 * corresponding UART_MCR_* bits. All other bits of @tiocm are ignored.
 */
static inline int serial8250_TIOCM_to_MCR(int tiocm)
{
        return ((tiocm & TIOCM_RTS)  ? UART_MCR_RTS  : 0) |
               ((tiocm & TIOCM_DTR)  ? UART_MCR_DTR  : 0) |
               ((tiocm & TIOCM_OUT1) ? UART_MCR_OUT1 : 0) |
               ((tiocm & TIOCM_OUT2) ? UART_MCR_OUT2 : 0) |
               ((tiocm & TIOCM_LOOP) ? UART_MCR_LOOP : 0);
}

/*
 * Translate the UART_MCR_RTS/DTR/OUT1/OUT2/LOOP bits of @mcr into the
 * corresponding TIOCM_* bits. All other bits of @mcr are ignored.
 */
static inline int serial8250_MCR_to_TIOCM(int mcr)
{
        return ((mcr & UART_MCR_RTS)  ? TIOCM_RTS  : 0) |
               ((mcr & UART_MCR_DTR)  ? TIOCM_DTR  : 0) |
               ((mcr & UART_MCR_OUT1) ? TIOCM_OUT1 : 0) |
               ((mcr & UART_MCR_OUT2) ? TIOCM_OUT2 : 0) |
               ((mcr & UART_MCR_LOOP) ? TIOCM_LOOP : 0);
}

/* MSR <-> TIOCM conversion */

/*
 * Translate the UART_MSR_DCD/RI/DSR/CTS bits of @msr into TIOCM_CAR,
 * TIOCM_RNG, TIOCM_DSR and TIOCM_CTS respectively. Other bits are ignored.
 */
static inline int serial8250_MSR_to_TIOCM(int msr)
{
        return ((msr & UART_MSR_DCD) ? TIOCM_CAR : 0) |
               ((msr & UART_MSR_RI)  ? TIOCM_RNG : 0) |
               ((msr & UART_MSR_DSR) ? TIOCM_DSR : 0) |
               ((msr & UART_MSR_CTS) ? TIOCM_CTS : 0);
}

/*
 * Write @value to the MCR register and, when the port has modem-control
 * GPIOs, also apply the equivalent TIOCM_* state to them.
 */
static inline void serial8250_out_MCR(struct uart_8250_port *up, int value)
{
        serial_out(up, UART_MCR, value);

        if (!up->gpios)
                return;

        mctrl_gpio_set(up->gpios, serial8250_MCR_to_TIOCM(value));
}

/*
 * Read the MCR register. When the port has modem-control GPIOs, the output
 * state reported by mctrl_gpio_get_outputs() is converted to MCR bits and
 * OR-ed into the result.
 */
static inline int serial8250_in_MCR(struct uart_8250_port *up)
{
        int mcr = serial_in(up, UART_MCR);
        unsigned int gpio_tiocm = 0;

        if (!up->gpios)
                return mcr;

        gpio_tiocm = mctrl_gpio_get_outputs(up->gpios, &gpio_tiocm);

        return mcr | serial8250_TIOCM_to_MCR(gpio_tiocm);
}

/*
 * PNP init/exit entry points. Without CONFIG_SERIAL_8250_PNP they are
 * replaced by stubs: init returns 0 and exit does nothing.
 */
#ifdef CONFIG_SERIAL_8250_PNP
int serial8250_pnp_init(void);
void serial8250_pnp_exit(void);
#else
static inline int serial8250_pnp_init(void) { return 0; }
static inline void serial8250_pnp_exit(void) { }
#endif

/*
 * Fintek probe entry point. Without CONFIG_SERIAL_8250_FINTEK it is a stub
 * that returns 0.
 */
#ifdef CONFIG_SERIAL_8250_FINTEK
int fintek_8250_probe(struct uart_8250_port *uart);
#else
static inline int fintek_8250_probe(struct uart_8250_port *uart) { return 0; }
#endif

#ifdef CONFIG_ARCH_OMAP1
#include <linux/soc/ti/omap1-soc.h>
/*
 * Return 1 when the port's mapbase equals one of the three OMAP1 UART base
 * addresses, 0 otherwise.
 */
static inline int is_omap1_8250(struct uart_8250_port *pt)
{
        switch (pt->port.mapbase) {
        case OMAP1_UART1_BASE:
        case OMAP1_UART2_BASE:
        case OMAP1_UART3_BASE:
                return 1;
        default:
                return 0;
        }
}

/* Like is_omap1_8250(), but always 0 unless cpu_is_omap1510() is true. */
static inline int is_omap1510_8250(struct uart_8250_port *pt)
{
        return cpu_is_omap1510() ? is_omap1_8250(pt) : 0;
}
#else
/* Without CONFIG_ARCH_OMAP1 no port is an OMAP1 port. */
static inline int is_omap1_8250(struct uart_8250_port *pt)
{
        return 0;
}
static inline int is_omap1510_8250(struct uart_8250_port *pt)
{
        return 0;
}
#endif

#ifdef CONFIG_SERIAL_8250_DMA
/* DMA entry points; their definitions live outside this header. */
int serial8250_tx_dma(struct uart_8250_port *p);
int serial8250_rx_dma(struct uart_8250_port *p);
void serial8250_rx_dma_flush(struct uart_8250_port *p);
int serial8250_request_dma(struct uart_8250_port *p);
void serial8250_release_dma(struct uart_8250_port *p);

/*
 * Call the optional prepare_tx_dma hook of the port's DMA state.
 * p->dma is dereferenced unconditionally, so it must be non-NULL here.
 */
static inline void serial8250_do_prepare_tx_dma(struct uart_8250_port *p)
{
        struct uart_8250_dma *tx = p->dma;

        if (!tx->prepare_tx_dma)
                return;

        tx->prepare_tx_dma(p);
}

/*
 * Call the optional prepare_rx_dma hook of the port's DMA state.
 * p->dma is dereferenced unconditionally, so it must be non-NULL here.
 */
static inline void serial8250_do_prepare_rx_dma(struct uart_8250_port *p)
{
        struct uart_8250_dma *rx = p->dma;

        if (!rx->prepare_rx_dma)
                return;

        rx->prepare_rx_dma(p);
}

/*
 * Report whether the port has DMA state attached and that state's
 * tx_running field is non-zero.  A NULL p->dma yields false.
 */
static inline bool serial8250_tx_dma_running(struct uart_8250_port *p)
{
        struct uart_8250_dma *state = p->dma;

        if (!state)
                return false;

        return state->tx_running;
}
#else
/*
 * Built without CONFIG_SERIAL_8250_DMA: the tx/rx/request stubs return -1,
 * flush/release do nothing, and TX DMA is never reported as running.
 */
static inline int serial8250_tx_dma(struct uart_8250_port *p)
{
        return -1;
}

static inline int serial8250_rx_dma(struct uart_8250_port *p)
{
        return -1;
}

static inline void serial8250_rx_dma_flush(struct uart_8250_port *p)
{
}

static inline int serial8250_request_dma(struct uart_8250_port *p)
{
        return -1;
}

static inline void serial8250_release_dma(struct uart_8250_port *p)
{
}

static inline bool serial8250_tx_dma_running(struct uart_8250_port *p)
{
        return false;
}
#endif

/*
 * Put the port into high speed mode through the register at offset 0x04
 * (EXCR2).
 *
 * Returns 0 when the PRESL field already reads 0x10 (nothing is written),
 * or 1 after the register has been rewritten with PRESL set to 0x10.
 */
static inline int ns16550a_goto_highspeed(struct uart_8250_port *up)
{
        unsigned char excr2;

        excr2 = serial_in(up, 0x04); /* EXCR2 */
#define PRESL(x) ((x) & 0x30)
        /* already in high speed mode */
        if (PRESL(excr2) == 0x10)
                return 0;

        excr2 &= ~0xB0; /* Disable LOCK, mask out PRESL[01] */
        excr2 |= 0x10;  /* 1.625 divisor for baud_base --> 921600 */
        serial_out(up, 0x04, excr2);

        return 1;
}

/*
 * Convert the port's minor number into an index by subtracting 64.
 * NOTE(review): 64 is presumably the first minor of this driver's range;
 * confirm against the driver registration.
 */
static inline int serial_index(struct uart_port *port)
{
        enum { minor_base = 64 };

        return port->minor - minor_base;
}




















































































































































































































































































































































































































































































































































































    1 
    1 


















































































































































































   12 





































































































































































































































































































    1 

    9 

















































    6 


























































































































































































































































































































    5 

















    6 
    3 































    5 
    5 
    1 
















    6 
    3 












































































































































































    1 
   28 




   27 




    8 




































































    7 












   23 
   27 











    1 










    2 
    3 






















   20 
   25 




    7 























    1 

















    3 



    1 
    1 










































































































































































































































































































































































































































   14 

   13 







   10 
















































































































































































    1 
    6 



































































































































































































































































































































    2 
    1 









    4 

























































    2 
    1 

    6 
















    2 
    1 

    6 





















































































    3 




































    1 



















    3 
















































































    1 
    3 


















































































































    1 
































































































































































    1 





































    2 






    1 








    1 




















































































































    5 


    1 




    1 
    3 




























    3 




    3 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_H
#define _LINUX_FS_H

#include <linux/linkage.h>
#include <linux/wait_bit.h>
#include <linux/kdev_t.h>
#include <linux/dcache.h>
#include <linux/path.h>
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/list.h>
#include <linux/list_lru.h>
#include <linux/llist.h>
#include <linux/radix-tree.h>
#include <linux/xarray.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/pid.h>
#include <linux/bug.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/mm_types.h>
#include <linux/capability.h>
#include <linux/semaphore.h>
#include <linux/fcntl.h>
#include <linux/rculist_bl.h>
#include <linux/atomic.h>
#include <linux/shrinker.h>
#include <linux/migrate_mode.h>
#include <linux/uidgid.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
#include <linux/delayed_call.h>
#include <linux/uuid.h>
#include <linux/errseq.h>
#include <linux/ioprio.h>
#include <linux/fs_types.h>
#include <linux/build_bug.h>
#include <linux/stddef.h>
#include <linux/mount.h>
#include <linux/cred.h>
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/maple_tree.h>
#include <linux/rw_hint.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>

struct backing_dev_info;
struct bdi_writeback;
struct bio;
struct io_comp_batch;
struct export_operations;
struct fiemap_extent_info;
struct hd_geometry;
struct iovec;
struct kiocb;
struct kobject;
struct pipe_inode_info;
struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
struct cred;
struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
struct fscrypt_inode_info;
struct fscrypt_operations;
struct fsverity_info;
struct fsverity_operations;
struct fsnotify_mark_connector;
struct fsnotify_sb_info;
struct fs_context;
struct fs_parameter_spec;
struct fileattr;
struct iomap_ops;

extern void __init inode_init(void);
extern void __init inode_init_early(void);
extern void __init files_init(void);
extern void __init files_maxfiles_init(void);

extern unsigned long get_max_files(void);
extern unsigned int sysctl_nr_open;

typedef __kernel_rwf_t rwf_t;

struct buffer_head;
/*
 * Function type for callbacks that take an inode, a block index (@iblock),
 * a result buffer_head (@bh_result) and a @create flag, and return an int.
 * NOTE(review): the exact contract (meaning of @create and of the return
 * value) is set by the users of this type and is not visible here.
 */
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create);
/*
 * Function type for callbacks that receive a kiocb, a file offset, a byte
 * count and an opaque @private pointer, and return an int.
 * NOTE(review): presumably a direct-I/O completion hook, judging by the
 * name; confirm against the callers.
 */
typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                        ssize_t bytes, void *private);

/*
 * MAY_* flags.  Each one is a distinct single bit, so several of them can
 * be OR-ed together into one mask.
 */
#define MAY_EXEC                0x00000001
#define MAY_WRITE                0x00000002
#define MAY_READ                0x00000004
#define MAY_APPEND                0x00000008
#define MAY_ACCESS                0x00000010
#define MAY_OPEN                0x00000020
#define MAY_CHDIR                0x00000040
/* called from RCU mode, don't block */
#define MAY_NOT_BLOCK                0x00000080

/*
 * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
 * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open()
 */

/* file is open for reading */
#define FMODE_READ                ((__force fmode_t)(1 << 0))
/* file is open for writing */
#define FMODE_WRITE                ((__force fmode_t)(1 << 1))
/* file is seekable */
#define FMODE_LSEEK                ((__force fmode_t)(1 << 2))
/* file can be accessed using pread */
#define FMODE_PREAD                ((__force fmode_t)(1 << 3))
/* file can be accessed using pwrite */
#define FMODE_PWRITE                ((__force fmode_t)(1 << 4))
/* File is opened for execution with sys_execve / sys_uselib */
#define FMODE_EXEC                ((__force fmode_t)(1 << 5))
/* File writes are restricted (block device specific) */
#define FMODE_WRITE_RESTRICTED        ((__force fmode_t)(1 << 6))

/* FMODE_* bits 7 to 8 */

/* 32bit hashes as llseek() offset (for directories) */
#define FMODE_32BITHASH         ((__force fmode_t)(1 << 9))
/* 64bit hashes as llseek() offset (for directories) */
#define FMODE_64BITHASH         ((__force fmode_t)(1 << 10))

/*
 * Don't update ctime and mtime.
 *
 * Currently a special hack for the XFS open_by_handle ioctl, but we'll
 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
 */
#define FMODE_NOCMTIME                ((__force fmode_t)(1 << 11))

/* Expect random access pattern */
#define FMODE_RANDOM                ((__force fmode_t)(1 << 12))

/* File is huge (eg. /dev/mem): treat loff_t as unsigned */
#define FMODE_UNSIGNED_OFFSET        ((__force fmode_t)(1 << 13))

/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH                ((__force fmode_t)(1 << 14))

/* File needs atomic accesses to f_pos */
#define FMODE_ATOMIC_POS        ((__force fmode_t)(1 << 15))
/* Write access to underlying fs */
#define FMODE_WRITER                ((__force fmode_t)(1 << 16))
/* Has read method(s) */
#define FMODE_CAN_READ          ((__force fmode_t)(1 << 17))
/* Has write method(s) */
#define FMODE_CAN_WRITE         ((__force fmode_t)(1 << 18))

#define FMODE_OPENED                ((__force fmode_t)(1 << 19))
#define FMODE_CREATED                ((__force fmode_t)(1 << 20))

/* File is stream-like */
#define FMODE_STREAM                ((__force fmode_t)(1 << 21))

/* File supports DIRECT IO */
#define        FMODE_CAN_ODIRECT        ((__force fmode_t)(1 << 22))

#define        FMODE_NOREUSE                ((__force fmode_t)(1 << 23))

/* FMODE_* bit 24 */

/* File is embedded in backing_file object */
#define FMODE_BACKING                ((__force fmode_t)(1 << 25))

/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY                ((__force fmode_t)(1 << 26))

/* File is capable of returning -EAGAIN if I/O will block */
#define FMODE_NOWAIT                ((__force fmode_t)(1 << 27))

/* File represents mount that needs unmounting */
#define FMODE_NEED_UNMOUNT        ((__force fmode_t)(1 << 28))

/* File does not contribute to nr_files count */
#define FMODE_NOACCOUNT                ((__force fmode_t)(1 << 29))

/*
 * Attribute flags.  These should be or-ed together to figure out what
 * has been changed!
 */
#define ATTR_MODE        (1 << 0)
#define ATTR_UID        (1 << 1)
#define ATTR_GID        (1 << 2)
#define ATTR_SIZE        (1 << 3)
#define ATTR_ATIME        (1 << 4)
#define ATTR_MTIME        (1 << 5)
#define ATTR_CTIME        (1 << 6)
#define ATTR_ATIME_SET        (1 << 7)
#define ATTR_MTIME_SET        (1 << 8)
#define ATTR_FORCE        (1 << 9) /* Not a change, but a change it */
#define ATTR_KILL_SUID        (1 << 11)
#define ATTR_KILL_SGID        (1 << 12)
#define ATTR_FILE        (1 << 13)
#define ATTR_KILL_PRIV        (1 << 14)
#define ATTR_OPEN        (1 << 15) /* Truncating from open(O_TRUNC) */
#define ATTR_TIMES_SET        (1 << 16)
#define ATTR_TOUCH        (1 << 17)

/*
 * Whiteout is represented by a char device.  The following constants define the
 * mode and device number to use.
 */
#define WHITEOUT_MODE 0
#define WHITEOUT_DEV 0

/*
 * This is the Inode Attributes structure, used for notify_change().  It
 * uses the above definitions as flags, to know which values have changed.
 * Also, in this manner, a Filesystem can look at only the values it cares
 * about.  Basically, these are the attributes that the VFS layer can
 * request to change from the FS layer.
 *
 * Derek Atkins <warlord@MIT.EDU> 94-10-20
 */
struct iattr {
        /* ATTR_* flags telling which of the values below have changed. */
        unsigned int        ia_valid;
        /* Requested mode (NOTE(review): presumably paired with ATTR_MODE). */
        umode_t                ia_mode;
        /*
         * The two anonymous unions wrap structures with the same member.
         *
         * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which
         * are a dedicated type requiring the filesystem to use the dedicated
         * helpers. Other filesystems can continue to use ia_{g,u}id until they
         * have been ported.
         *
         * They always contain the same value. In other words FS_ALLOW_IDMAP
         * pass down the same value on idmapped mounts as they would on regular
         * mounts.
         */
        union {
                kuid_t                ia_uid;
                vfsuid_t        ia_vfsuid;
        };
        union {
                kgid_t                ia_gid;
                vfsgid_t        ia_vfsgid;
        };
        /* Requested size and timestamps; see ia_valid for which ones apply. */
        loff_t                ia_size;
        struct timespec64 ia_atime;
        struct timespec64 ia_mtime;
        struct timespec64 ia_ctime;

        /*
         * Not an attribute, but an auxiliary info for filesystems wanting to
         * implement an ftruncate() like method.  NOTE: filesystem should
         * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
         */
        struct file        *ia_file;
};

/*
 * Includes for diskquotas.
 */
#include <linux/quota.h>

/*
 * Maximum number of layers in a filesystem stack.  The depth has to be
 * bounded to prevent kernel stack overflow.
 */
#define FILESYSTEM_MAX_STACK_DEPTH 2

/**
 * enum positive_aop_returns - aop return codes with specific semantics
 *
 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
 *                             completed, that the page is still locked, and
 *                             should be considered active.  The VM uses this hint
 *                             to return the page to the active list -- it won't
 *                             be a candidate for writeback again in the near
 *                             future.  Other callers must be careful to unlock
 *                             the page if they get this return.  Returned by
 *                             writepage().
 *
 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
 *                          unlocked it and the page might have been truncated.
 *                          The caller should back up to acquiring a new page and
 *                          trying again.  The aop will be taking reasonable
 *                          precautions not to livelock.  If the caller held a page
 *                          reference, it should drop it before retrying.  Returned
 *                          by read_folio().
 *
 * address_space_operation functions return these large constants to indicate
 * special semantics to the caller.  These are much larger than the bytes in a
 * page to allow for functions that return the number of bytes operated on in a
 * given page.
 */

enum positive_aop_returns {
        AOP_WRITEPAGE_ACTIVATE        = 0x80000,
        AOP_TRUNCATED_PAGE        = 0x80001,
};

/*
 * oh the beauties of C type declarations.
 *
 * Forward declarations only: the method tables below take pointers to
 * these types without needing their full definitions.
 */
struct page;
struct address_space;
struct writeback_control;
struct readahead_control;

/*
 * Match RWF_* bits to IOCB bits.
 *
 * Every expansion is wrapped in parentheses so that the cast always stays
 * attached to its RWF_* operand, whatever operator surrounds the macro at
 * the use site (sizeof, postfix operators, ...).
 */
#define IOCB_HIPRI                ((__force int) RWF_HIPRI)
#define IOCB_DSYNC                ((__force int) RWF_DSYNC)
#define IOCB_SYNC                ((__force int) RWF_SYNC)
#define IOCB_NOWAIT                ((__force int) RWF_NOWAIT)
#define IOCB_APPEND                ((__force int) RWF_APPEND)

/*
 * non-RWF related bits - start at 16
 * (NOTE(review): presumably to stay clear of the RWF_*-derived bits - confirm)
 */
#define IOCB_EVENTFD                (1 << 16)
#define IOCB_DIRECT                (1 << 17)
#define IOCB_WRITE                (1 << 18)
/* iocb->ki_waitq is valid */
#define IOCB_WAITQ                (1 << 19)
#define IOCB_NOIO                (1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE        (1 << 21)
/*
 * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
 * iocb completion can be passed back to the owner for execution from a safe
 * context rather than needing to be punted through a workqueue. If this
 * flag is set, the bio completion handling may set iocb->dio_complete to a
 * handler function and iocb->private to context information for that handler.
 * The issuer should call the handler with that context information from task
 * context to complete the processing of the iocb. Note that while this
 * provides a task context for the dio_complete() callback, it should only be
 * used on the completion side for non-IO generating completions. It's fine to
 * call blocking functions from this callback, but they should not wait for
 * unrelated IO (like cache flushing, new IO generation, etc).
 */
#define IOCB_DIO_CALLER_COMP        (1 << 22)
/* kiocb is a read or write operation submitted by fs/aio.c. */
#define IOCB_AIO_RW                (1 << 23)

/*
 * for use in trace events
 *
 * Flag-to-name pairs, one per IOCB_* bit defined above.  Keep this table in
 * sync when a new IOCB_* flag is added; IOCB_AIO_RW is included so the flag
 * is decoded by name like every other bit.
 */
#define TRACE_IOCB_STRINGS \
        { IOCB_HIPRI,                "HIPRI" }, \
        { IOCB_DSYNC,                "DSYNC" }, \
        { IOCB_SYNC,                "SYNC" }, \
        { IOCB_NOWAIT,                "NOWAIT" }, \
        { IOCB_APPEND,                "APPEND" }, \
        { IOCB_EVENTFD,                "EVENTFD"}, \
        { IOCB_DIRECT,                "DIRECT" }, \
        { IOCB_WRITE,                "WRITE" }, \
        { IOCB_WAITQ,                "WAITQ" }, \
        { IOCB_NOIO,                "NOIO" }, \
        { IOCB_ALLOC_CACHE,        "ALLOC_CACHE" }, \
        { IOCB_DIO_CALLER_COMP,        "CALLER_COMP" }, \
        { IOCB_AIO_RW,                "AIO_RW" }

struct kiocb {
        struct file                *ki_filp;
        loff_t                        ki_pos;
        /* Completion callback; NULL marks a synchronous iocb (is_sync_kiocb()). */
        void (*ki_complete)(struct kiocb *iocb, long ret);
        void                        *private;
        /* NOTE(review): presumably holds the IOCB_* bits - confirm */
        int                        ki_flags;
        u16                        ki_ioprio; /* See linux/ioprio.h */
        /* The two members below share storage; only one is meaningful at a time. */
        union {
                /*
                 * Only used for async buffered reads, where it denotes the
                 * page waitqueue associated with completing the read. Valid
                 * IFF IOCB_WAITQ is set.
                 */
                struct wait_page_queue        *ki_waitq;
                /*
                 * Can be used for O_DIRECT IO, where the completion handling
                 * is punted back to the issuer of the IO. May only be set
                 * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
                 * must then check for presence of this handler when ki_complete
                 * is invoked. The data passed in to this handler must be
                 * assigned to ->private when dio_complete is assigned.
                 */
                ssize_t (*dio_complete)(void *data);
        };
};

/* A kiocb with no completion callback installed is a synchronous one. */
static inline bool is_sync_kiocb(struct kiocb *kiocb)
{
        if (kiocb->ki_complete)
                return false;
        return true;
}

/*
 * Method table for an address_space; struct address_space refers to one of
 * these through its a_ops pointer.
 */
struct address_space_operations {
        int (*writepage)(struct page *page, struct writeback_control *wbc);
        int (*read_folio)(struct file *, struct folio *);

        /* Write back some dirty pages from this mapping. */
        int (*writepages)(struct address_space *, struct writeback_control *);

        /* Mark a folio dirty.  Return true if this dirtied it */
        bool (*dirty_folio)(struct address_space *, struct folio *);

        void (*readahead)(struct readahead_control *);

        int (*write_begin)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len,
                                struct page **pagep, void **fsdata);
        int (*write_end)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata);

        /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
        sector_t (*bmap)(struct address_space *, sector_t);
        void (*invalidate_folio) (struct folio *, size_t offset, size_t len);
        bool (*release_folio)(struct folio *, gfp_t);
        void (*free_folio)(struct folio *folio);
        ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
        /*
         * migrate the contents of a folio to the specified target. If
         * migrate_mode is MIGRATE_ASYNC, it must not block.
         */
        int (*migrate_folio)(struct address_space *, struct folio *dst,
                        struct folio *src, enum migrate_mode);
        int (*launder_folio)(struct folio *);
        bool (*is_partially_uptodate) (struct folio *, size_t from,
                        size_t count);
        void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb);
        int (*error_remove_folio)(struct address_space *, struct folio *);

        /* swapfile support */
        int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
                                sector_t *span);
        void (*swap_deactivate)(struct file *file);
        int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
};

/* Declared here, defined elsewhere. */
extern const struct address_space_operations empty_aops;

/**
 * struct address_space - Contents of a cacheable, mappable object.
 * @host: Owner, either the inode or the block_device.
 * @i_pages: Cached pages.
 * @invalidate_lock: Guards coherency between page cache contents and
 *   file offset->disk block mappings in the filesystem during invalidates.
 *   It is also used to block modification of page cache contents through
 *   memory mappings.
 * @gfp_mask: Memory allocation flags to use for allocating pages.
 * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
 * @nr_thps: Number of THPs in the pagecache (non-shmem only).
 * @i_mmap: Tree of private and shared mappings.
 * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
 * @nrpages: Number of page entries, protected by the i_pages lock.
 * @writeback_index: Writeback starts here.
 * @a_ops: Methods.
 * @flags: Error bits and flags (AS_*).
 * @wb_err: The most recent error which has occurred.
 * @i_private_lock: For use by the owner of the address_space.
 * @i_private_list: For use by the owner of the address_space.
 * @i_private_data: For use by the owner of the address_space.
 */
struct address_space {
        struct inode                *host;
        struct xarray                i_pages;
        struct rw_semaphore        invalidate_lock;
        gfp_t                        gfp_mask;
        atomic_t                i_mmap_writable;
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        /* number of thp, only for non-shmem files */
        atomic_t                nr_thps;
#endif
        struct rb_root_cached        i_mmap;
        unsigned long                nrpages;
        pgoff_t                        writeback_index;
        const struct address_space_operations *a_ops;
        unsigned long                flags;
        errseq_t                wb_err;
        spinlock_t                i_private_lock;
        struct list_head        i_private_list;
        struct rw_semaphore        i_mmap_rwsem;
        void *                        i_private_data;
        /* The alignment attribute below is explained in the comment after the struct. */
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
        /*
         * On most architectures that alignment is already the case; but
         * must be enforced here for CRIS, to let the least significant bit
         * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
         */

/*
 * XArray tags, for tagging dirty and writeback pages in the pagecache.
 * Each tag maps onto one XArray mark; mapping_tagged() tests them on i_pages.
 */
#define PAGECACHE_TAG_DIRTY        XA_MARK_0
#define PAGECACHE_TAG_WRITEBACK        XA_MARK_1
#define PAGECACHE_TAG_TOWRITE        XA_MARK_2

/*
 * Report whether at least one entry in @mapping's page cache carries @tag.
 */
static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
{
        struct xarray *pages = &mapping->i_pages;

        return xa_marked(pages, tag);
}

/* Take @mapping's i_mmap_rwsem for writing. */
static inline void i_mmap_lock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        down_write(sem);
}

/*
 * Try to take @mapping's i_mmap_rwsem for writing without sleeping.
 * Returns whatever down_write_trylock() reports.
 */
static inline int i_mmap_trylock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        return down_write_trylock(sem);
}

/* Drop a write hold on @mapping's i_mmap_rwsem. */
static inline void i_mmap_unlock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        up_write(sem);
}

/*
 * Try to take @mapping's i_mmap_rwsem for reading without sleeping.
 * Returns whatever down_read_trylock() reports.
 */
static inline int i_mmap_trylock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        return down_read_trylock(sem);
}

/* Take @mapping's i_mmap_rwsem for reading. */
static inline void i_mmap_lock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        down_read(sem);
}

/* Drop a read hold on @mapping's i_mmap_rwsem. */
static inline void i_mmap_unlock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        up_read(sem);
}

/* Lockdep assertion: @mapping's i_mmap_rwsem must be held by the caller. */
static inline void i_mmap_assert_locked(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        lockdep_assert_held(sem);
}

/* Lockdep assertion: @mapping's i_mmap_rwsem must be held for writing. */
static inline void i_mmap_assert_write_locked(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        lockdep_assert_held_write(sem);
}

/*
 * Could pages of this file currently be mapped into userspace?
 * True exactly when the i_mmap tree of @mapping is not empty.
 */
static inline int mapping_mapped(struct address_space *mapping)
{
        struct rb_root *tree = &mapping->i_mmap.rb_root;

        return !RB_EMPTY_ROOT(tree);
}

/*
 * Could pages of this file have been modified through a userspace mapping?
 *
 * i_mmap_writable counts every VM_SHARED, VM_MAYWRITE vma: do_mmap marks a
 * vma VM_SHARED when it is shared and the file was opened for writing, i.e.
 * the vma may be mprotected writable even if it is read-only right now.
 *
 * A negative i_mmap_writable means new writable mappings are refused.
 * Writable mappings can only be denied while none exists.
 */
static inline int mapping_writably_mapped(struct address_space *mapping)
{
        int writable = atomic_read(&mapping->i_mmap_writable);

        return writable > 0;
}

/*
 * Account one more writable mapping on @mapping.
 * Returns 0 on success, or -EPERM when the counter is negative and the
 * increment was therefore refused.
 */
static inline int mapping_map_writable(struct address_space *mapping)
{
        if (!atomic_inc_unless_negative(&mapping->i_mmap_writable))
                return -EPERM;

        return 0;
}

static inline void mapping_unmap_writable(struct address_space *mapping)
{
        atomic_dec(&mapping->i_mmap_writable);
}

/*
 * Forbid new writable mappings of @mapping by decrementing i_mmap_writable.
 * Returns 0 on success, or -EBUSY when the counter is positive and the
 * decrement was therefore refused.
 */
static inline int mapping_deny_writable(struct address_space *mapping)
{
        if (!atomic_dec_unless_positive(&mapping->i_mmap_writable))
                return -EBUSY;

        return 0;
}

static inline void mapping_allow_writable(struct address_space *mapping)
{
        atomic_inc(&mapping->i_mmap_writable);
}

/*
 * Use sequence counter to get consistent i_size on 32-bit processors.
 *
 * i_size_ordered_init() initialises that counter where it exists and expands
 * to an empty statement everywhere else.  The macro argument is parenthesized
 * so that any pointer expression may be passed.
 */
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __NEED_I_SIZE_ORDERED
#define i_size_ordered_init(inode) seqcount_init(&(inode)->i_size_seqcount)
#else
#define i_size_ordered_init(inode) do { } while (0)
#endif

struct posix_acl;
/* Sentinel pointer value (-1), not a real posix_acl. */
#define ACL_NOT_CACHED ((void *)(-1))
/*
 * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to
 * cache the ACL.  This also means that ->get_inode_acl() can be called in RCU
 * mode with the LOOKUP_RCU flag.
 */
#define ACL_DONT_CACHE ((void *)(-3))

/*
 * Build a sentinel posix_acl pointer from @task: the task's address plus
 * one byte.  It is never meant to be dereferenced; the extra byte sets the
 * low bit that is_uncached_acl() checks (for an even @task address).
 */
static inline struct posix_acl *
uncached_acl_sentinel(struct task_struct *task)
{
        char *base = (char *)task;

        return (struct posix_acl *)(base + 1);
}

/*
 * True when the least significant bit of @acl is set, i.e. when @acl is a
 * tagged sentinel value and not an ordinary pointer.
 */
static inline bool
is_uncached_acl(struct posix_acl *acl)
{
        long bits = (long)acl;

        return (bits & 1) != 0;
}

/*
 * IOP_* bit flags; each value is a distinct single bit.
 * NOTE(review): presumably stored in inode->i_opflags - confirm.
 */
#define IOP_FASTPERM        0x0001
#define IOP_LOOKUP        0x0002
#define IOP_NOFOLLOW        0x0004
#define IOP_XATTR        0x0008
#define IOP_DEFAULT_READLINK        0x0010

/*
 * Keep mostly read-only and often accessed (especially for
 * the RCU path lookup and 'stat' data) fields at the beginning
 * of the 'struct inode'
 */
struct inode {
        umode_t                        i_mode;
        unsigned short                i_opflags;
        kuid_t                        i_uid;
        kgid_t                        i_gid;
        unsigned int                i_flags;

#ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl        *i_acl;
        struct posix_acl        *i_default_acl;
#endif

        const struct inode_operations        *i_op;
        struct super_block        *i_sb;
        struct address_space        *i_mapping;

#ifdef CONFIG_SECURITY
        void                        *i_security;
#endif

        /* Stat data, not accessed from path walking */
        unsigned long                i_ino;
        /*
         * Filesystems may only read i_nlink directly.  They shall use the
         * following functions for modification:
         *
         *    (set|clear|inc|drop)_nlink
         *    inode_(inc|dec)_link_count
         */
        /* Same storage, two names: const i_nlink for readers, __i_nlink for writers. */
        union {
                const unsigned int i_nlink;
                unsigned int __i_nlink;
        };
        dev_t                        i_rdev;
        loff_t                        i_size;
        struct timespec64        __i_atime;
        struct timespec64        __i_mtime;
        struct timespec64        __i_ctime; /* use inode_*_ctime accessors! */
        spinlock_t                i_lock;        /* i_blocks, i_bytes, maybe i_size */
        unsigned short          i_bytes;
        u8                        i_blkbits;
        enum rw_hint                i_write_hint;
        blkcnt_t                i_blocks;

#ifdef __NEED_I_SIZE_ORDERED
        /* Only present when i_size needs a sequence counter (32-bit SMP). */
        seqcount_t                i_size_seqcount;
#endif

        /* Misc */
        unsigned long                i_state;
        struct rw_semaphore        i_rwsem;

        unsigned long                dirtied_when;        /* jiffies of first dirtying */
        unsigned long                dirtied_time_when;

        struct hlist_node        i_hash;
        struct list_head        i_io_list;        /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
        struct bdi_writeback        *i_wb;                /* the associated cgroup wb */

        /* foreign inode detection, see wbc_detach_inode() */
        int                        i_wb_frn_winner;
        u16                        i_wb_frn_avg_time;
        u16                        i_wb_frn_history;
#endif
        struct list_head        i_lru;                /* inode LRU list */
        struct list_head        i_sb_list;
        struct list_head        i_wb_list;        /* backing dev writeback list */
        union {
                struct hlist_head        i_dentry;
                struct rcu_head                i_rcu;
        };
        atomic64_t                i_version;
        atomic64_t                i_sequence; /* see futex */
        atomic_t                i_count;
        atomic_t                i_dio_count;
        atomic_t                i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
        atomic_t                i_readcount; /* struct files open RO */
#endif
        union {
                const struct file_operations        *i_fop;        /* former ->i_op->default_file_ops */
                void (*free_inode)(struct inode *);
        };
        struct file_lock_context        *i_flctx;
        struct address_space        i_data;
        struct list_head        i_devices;
        /* Overlapping members: only one of these is meaningful for a given inode. */
        union {
                struct pipe_inode_info        *i_pipe;
                struct cdev                *i_cdev;
                char                        *i_link;
                unsigned                i_dir_seq;
        };

        __u32                        i_generation;

#ifdef CONFIG_FSNOTIFY
        __u32                        i_fsnotify_mask; /* all events this inode cares about */
        struct fsnotify_mark_connector __rcu        *i_fsnotify_marks;
#endif

#ifdef CONFIG_FS_ENCRYPTION
        struct fscrypt_inode_info        *i_crypt_info;
#endif

#ifdef CONFIG_FS_VERITY
        struct fsverity_info        *i_verity_info;
#endif

        void                        *i_private; /* fs or device private pointer */
} __randomize_layout;

/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably truncates @t to the timestamp granularity that
 * @inode's filesystem supports - confirm against the definition.
 */
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);

/*
 * Return 2^i_blkbits for @node.
 *
 * The shift is done on an unsigned constant so the result type matches the
 * return type and a shift count of 31 does not overflow a signed int.
 */
static inline unsigned int i_blocksize(const struct inode *node)
{
        return 1U << node->i_blkbits;
}

/* Returns hlist_unhashed() for @inode's i_hash node. */
static inline int inode_unhashed(struct inode *inode)
{
        struct hlist_node *node = &inode->i_hash;

        return hlist_unhashed(node);
}

/*
 * __mark_inode_dirty expects inodes to be hashed.  Special inodes are not
 * wanted in the fileset inode space, so they are made to look hashed
 * without being put on any list.  hlist_del() still works on them and
 * needs no locking.
 */
static inline void inode_fake_hash(struct inode *inode)
{
        struct hlist_node *node = &inode->i_hash;

        hlist_add_fake(node);
}

/*
 * inode->i_mutex nesting subclasses for the lock validator:
 *
 * 0: the object of the current VFS operation
 * 1: parent
 * 2: child/target
 * 3: xattr
 * 4: second non-directory
 * 5: second parent (when locking independent directories in rename)
 *
 * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two
 * non-directories at once.
 *
 * The locking order between these classes is
 * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory
 */
enum inode_i_mutex_lock_class
{
        I_MUTEX_NORMAL,                /* 0: object of the current VFS operation */
        I_MUTEX_PARENT,                /* 1: parent */
        I_MUTEX_CHILD,                /* 2: child/target */
        I_MUTEX_XATTR,                /* 3: xattr */
        I_MUTEX_NONDIR2,        /* 4: second non-directory */
        I_MUTEX_PARENT2,        /* 5: second parent */
};

/* Take @inode's i_rwsem for writing (exclusive). */
static inline void inode_lock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_write(sem);
}

/* Release a write hold on @inode's i_rwsem. */
static inline void inode_unlock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        up_write(sem);
}

/* Take @inode's i_rwsem for reading (shared). */
static inline void inode_lock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_read(sem);
}

/* Release a read hold on @inode's i_rwsem. */
static inline void inode_unlock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        up_read(sem);
}

/*
 * Try to take @inode's i_rwsem for writing without sleeping.
 * Returns whatever down_write_trylock() reports.
 */
static inline int inode_trylock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return down_write_trylock(sem);
}

/*
 * Try to take @inode's i_rwsem for reading without sleeping.
 * Returns whatever down_read_trylock() reports.
 */
static inline int inode_trylock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return down_read_trylock(sem);
}

/* Returns rwsem_is_locked() for @inode's i_rwsem. */
static inline int inode_is_locked(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return rwsem_is_locked(sem);
}

/*
 * Take @inode's i_rwsem for writing, passing @subclass to
 * down_write_nested() as the lock validator nesting subclass.
 */
static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_write_nested(sem, subclass);
}

/*
 * Take @inode's i_rwsem for reading, passing @subclass to
 * down_read_nested() as the lock validator nesting subclass.
 */
static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_read_nested(sem, subclass);
}

/* Take @mapping's invalidate_lock for writing (exclusive). */
static inline void filemap_invalidate_lock(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        down_write(sem);
}

/* Release a write hold on @mapping's invalidate_lock. */
static inline void filemap_invalidate_unlock(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        up_write(sem);
}

/* Take @mapping's invalidate_lock for reading (shared). */
static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        down_read(sem);
}

/*
 * Try to take @mapping's invalidate_lock for reading without sleeping.
 * Returns whatever down_read_trylock() reports.
 */
static inline int filemap_invalidate_trylock_shared(
                                        struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        return down_read_trylock(sem);
}

/* Release a read hold on @mapping's invalidate_lock. */
static inline void filemap_invalidate_unlock_shared(
                                        struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        up_read(sem);
}

/*
 * Helpers that operate on a pair of objects at once; declared here,
 * defined elsewhere.
 * NOTE(review): the locking order they use is not visible from this
 * header - check the definitions before relying on it.
 */
void lock_two_nondirectories(struct inode *, struct inode*);
void unlock_two_nondirectories(struct inode *, struct inode*);

void filemap_invalidate_lock_two(struct address_space *mapping1,
                                 struct address_space *mapping2);
void filemap_invalidate_unlock_two(struct address_space *mapping1,
                                   struct address_space *mapping2);


/*
 * NOTE: in a 32bit arch with a preemptable kernel and
 * an UP compile the i_size_read/write must be atomic
 * with respect to the local cpu (unlike with preempt disabled),
 * but they don't need to be atomic with respect to other cpus like in
 * true SMP (so they need to either locally disable irq around
 * the read or for example on x86 they can be still implemented as a
 * cmpxchg8b without the need of the lock prefix). For SMP compiles
 * and 64bit archs it makes no difference if preempt is enabled or not.
 *
 * Returns the current value of inode->i_size.
 */
static inline loff_t i_size_read(const struct inode *inode)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        loff_t i_size;
        unsigned int seq;

        /* Re-read until i_size_seqcount shows no writer raced with the read. */
        do {
                seq = read_seqcount_begin(&inode->i_size_seqcount);
                i_size = inode->i_size;
        } while (read_seqcount_retry(&inode->i_size_seqcount, seq));
        return i_size;
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        loff_t i_size;

        /* Keep the read from being preempted on the local cpu. */
        preempt_disable();
        i_size = inode->i_size;
        preempt_enable();
        return i_size;
#else
        /* Pairs with smp_store_release() in i_size_write() */
        return smp_load_acquire(&inode->i_size);
#endif
}

/*
 * NOTE: unlike i_size_read(), i_size_write() does need locking around it
 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount
 * can be lost, resulting in subsequent i_size_read() calls spinning forever.
 *
 * Stores @i_size into inode->i_size.
 */
static inline void i_size_write(struct inode *inode, loff_t i_size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        /* The store is bracketed by the seqcount write section, with preemption off. */
        preempt_disable();
        write_seqcount_begin(&inode->i_size_seqcount);
        inode->i_size = i_size;
        write_seqcount_end(&inode->i_size_seqcount);
        preempt_enable();
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        /* Keep the store from being preempted on the local cpu. */
        preempt_disable();
        inode->i_size = i_size;
        preempt_enable();
#else
        /*
         * Pairs with smp_load_acquire() in i_size_read() to ensure
         * changes related to inode size (such as page contents) are
         * visible before we see the changed inode size.
         */
        smp_store_release(&inode->i_size, i_size);
#endif
}

static inline unsigned iminor(const struct inode *inode)
{
        return MINOR(inode->i_rdev);
}

static inline unsigned imajor(const struct inode *inode)
{
        return MAJOR(inode->i_rdev);
}

/* Owner information for signal delivery (SIGIO) on a file; embedded in struct file as f_owner. */
struct fown_struct {
        rwlock_t lock;          /* protects pid, uid, euid fields */
        struct pid *pid;        /* pid or -pgrp where SIGIO should be sent */
        enum pid_type pid_type;        /* Kind of process group SIGIO should be sent to */
        kuid_t uid, euid;        /* uid/euid of process setting the owner */
        int signum;                /* posix.1b rt signal to be delivered on IO */
};

/**
 * struct file_ra_state - Track a file's readahead state.
 * @start: Where the most recent readahead started.
 * @size: Number of pages read in the most recent readahead.
 * @async_size: Number of pages that were/are not needed immediately
 *      and so were/are genuinely "ahead".  Start next readahead when
 *      the first of these pages is accessed.
 * @ra_pages: Maximum size of a readahead request, copied from the bdi.
 * @mmap_miss: How many mmap accesses missed in the page cache.
 * @prev_pos: The last byte in the most recent read request.
 *
 * When this structure is passed to ->readahead(), the "most recent"
 * readahead means the current readahead.
 */
struct file_ra_state {
        pgoff_t start;
        unsigned int size;
        unsigned int async_size;
        unsigned int ra_pages;
        unsigned int mmap_miss;
        loff_t prev_pos;
};

/*
 * Does @index lie inside the readahead window [start, start + size) of @ra?
 * Returns 1 if so, 0 otherwise.
 */
static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
{
        if (index < ra->start)
                return 0;

        return index < ra->start + ra->size;
}

/*
 * f_{lock,count,pos_lock} members can be highly contended and share
 * the same cacheline. f_{lock,mode} are very frequently used together
 * and so share the same cacheline as well. The read-mostly
 * f_{path,inode,op} are kept on a separate cacheline.
 */
struct file {
        /* These three members overlap in storage. */
        union {
                /* fput() uses task work when closing and freeing file (default). */
                struct callback_head         f_task_work;
                /* fput() must use workqueue (most kernel threads). */
                struct llist_node        f_llist;
                unsigned int                 f_iocb_flags;
        };

        /*
         * Protects f_ep, f_flags.
         * Must not be taken from IRQ context.
         */
        spinlock_t                f_lock;
        fmode_t                        f_mode;
        /* Reference count; see get_file() and file_count(). */
        atomic_long_t                f_count;
        struct mutex                f_pos_lock;
        loff_t                        f_pos;
        unsigned int                f_flags;
        struct fown_struct        f_owner;
        const struct cred        *f_cred;
        struct file_ra_state        f_ra;
        struct path                f_path;
        struct inode                *f_inode;        /* cached value */
        const struct file_operations        *f_op;

        u64                        f_version;
#ifdef CONFIG_SECURITY
        void                        *f_security;
#endif
        /* needed for tty driver, and maybe others */
        void                        *private_data;

#ifdef CONFIG_EPOLL
        /* Used by fs/eventpoll.c to link all the hooks to this file */
        struct hlist_head        *f_ep;
#endif /* #ifdef CONFIG_EPOLL */
        struct address_space        *f_mapping;
        errseq_t                f_wb_err;
        errseq_t                f_sb_err; /* for syncfs */
} __randomize_layout
  __attribute__((aligned(4)));        /* lest something weird decides that 2 is OK */

/* Variable-length file handle: a fixed header followed by handle_bytes of identifier data. */
struct file_handle {
        __u32 handle_bytes;        /* number of bytes in f_handle[] */
        int handle_type;
        /* file identifier */
        unsigned char f_handle[] __counted_by(handle_bytes);
};

/*
 * Take one more reference on @f by incrementing f_count, and return @f.
 * Warns once if the count was zero before the increment, since that means
 * the file was already on its way to being freed.
 */
static inline struct file *get_file(struct file *f)
{
        long old_count;

        old_count = atomic_long_fetch_inc_relaxed(&f->f_count);
        WARN_ONCE(old_count == 0, "struct file::f_count incremented from zero; use-after-free condition present!\n");

        return f;
}

/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably reference-taking variants of get_file() that work
 * from a pointer-to-pointer - confirm against their definitions.
 */
struct file *get_file_rcu(struct file __rcu **f);
struct file *get_file_active(struct file **f);

/* Current value of the reference count of file @x. */
#define file_count(x)        atomic_long_read(&(x)->f_count)

/* 2^31 - 1 */
#define        MAX_NON_LFS        ((1UL<<31) - 1)

/*
 * Page cache limit. The filesystems should put that into their s_maxbytes
 * limits, otherwise bad things can happen in VM.
 */
#if BITS_PER_LONG==32
#define MAX_LFS_FILESIZE        ((loff_t)ULONG_MAX << PAGE_SHIFT)
#elif BITS_PER_LONG==64
#define MAX_LFS_FILESIZE         ((loff_t)LLONG_MAX)
#endif

/* legacy typedef, should eventually be removed */
typedef void *fl_owner_t;

/* Forward declarations; the full definitions are not in this part of the file. */
struct file_lock;
struct file_lease;

/*
 * The following constant reflects the upper bound of the file/locking space.
 * Both values are only defined here when OFFSET_MAX is not already defined.
 */
#ifndef OFFSET_MAX
#define OFFSET_MAX        type_max(loff_t)
#define OFFT_OFFSET_MAX        type_max(off_t)
#endif

/* Declared here, defined elsewhere. */
extern void send_sigio(struct fown_struct *fown, int fd, int band);

/* Return the inode cached in @f (its f_inode member). */
static inline struct inode *file_inode(const struct file *f)
{
        struct inode *cached = f->f_inode;

        return cached;
}

/*
 * file_dentry() dates from the time overlayfs used files with a "fake"
 * path, i.e. f_path on overlayfs and f_inode on the underlying fs.  Back
 * then it was needed to find the underlying fs dentry matching f_inode.
 * Such files should no longer exist, so the helper now simply returns
 * f_path.dentry and asserts (once) that its inode matches file_inode(),
 * to make sure it is not papering over filesystem bugs.
 */
static inline struct dentry *file_dentry(const struct file *file)
{
        struct dentry *d = file->f_path.dentry;
        struct inode *path_inode = d_inode(d);

        WARN_ON_ONCE(path_inode != file_inode(file));

        return d;
}

/* One entry of a singly linked fasync list, tying a file descriptor to its file. */
struct fasync_struct {
        rwlock_t                fa_lock;
        int                        magic;        /* NOTE(review): presumably FASYNC_MAGIC - confirm */
        int                        fa_fd;
        struct fasync_struct        *fa_next; /* singly linked list */
        struct file                *fa_file;
        struct rcu_head                fa_rcu;
};

#define FASYNC_MAGIC 0x4601

/* SMP safe fasync helpers (declared here, defined elsewhere): */
extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *);
extern int fasync_remove_entry(struct file *, struct fasync_struct **);
extern struct fasync_struct *fasync_alloc(void);
extern void fasync_free(struct fasync_struct *);

/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);

/* File owner (struct fown_struct) management and signal delivery. */
extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, int who, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct fown_struct *fown);

/*
 * sb->s_flags.  Note that these mirror the equivalent MS_* flags where
 * represented in both.
 */
#define SB_RDONLY       BIT(0)        /* Mount read-only */
#define SB_NOSUID       BIT(1)        /* Ignore suid and sgid bits */
#define SB_NODEV        BIT(2)        /* Disallow access to device special files */
#define SB_NOEXEC       BIT(3)        /* Disallow program execution */
#define SB_SYNCHRONOUS  BIT(4)        /* Writes are synced at once */
#define SB_MANDLOCK     BIT(6)        /* Allow mandatory locks on an FS */
#define SB_DIRSYNC      BIT(7)        /* Directory modifications are synchronous */
#define SB_NOATIME      BIT(10)        /* Do not update access times. */
#define SB_NODIRATIME   BIT(11)        /* Do not update directory access times */
#define SB_SILENT       BIT(15)
#define SB_POSIXACL     BIT(16)        /* Supports POSIX ACLs */
#define SB_INLINECRYPT  BIT(17)        /* Use blk-crypto for encrypted files */
#define SB_KERNMOUNT    BIT(22)        /* this is a kern_mount call */
#define SB_I_VERSION    BIT(23)        /* Update inode I_version field */
#define SB_LAZYTIME     BIT(25)        /* Update the on-disk [acm]times lazily */

/*
 * These sb flags are internal to the kernel.  They are interleaved in the
 * same bit numbering as the flags above (bits 21, 24 and 26-31).
 */
#define SB_DEAD         BIT(21)
#define SB_DYING        BIT(24)
#define SB_SUBMOUNT     BIT(26)
#define SB_FORCE        BIT(27)
#define SB_NOSEC        BIT(28)
#define SB_BORN         BIT(29)
#define SB_ACTIVE       BIT(30)
#define SB_NOUSER       BIT(31)

/* These flags relate to encoding and casefolding */
#define SB_ENC_STRICT_MODE_FL        (1 << 0)

/*
 * sb_has_strict_encoding - test SB_ENC_STRICT_MODE_FL in s_encoding_flags
 * @sb: pointer expression whose target has an s_encoding_flags member
 *      (struct super_block only has that member when CONFIG_UNICODE is
 *      enabled)
 *
 * Evaluates to non-zero when the flag is set and to zero otherwise.
 *
 * The argument is parenthesised so that any pointer-valued expression
 * (e.g. "&sb" or "cond ? a : b") is dereferenced as a whole instead of
 * being re-associated by operator precedence.
 */
#define sb_has_strict_encoding(sb) \
        ((sb)->s_encoding_flags & SB_ENC_STRICT_MODE_FL)

/*
 *        Umount options
 */

#define MNT_FORCE        0x00000001        /* Attempt to forcibly umount */
#define MNT_DETACH        0x00000002        /* Just detach from the tree */
#define MNT_EXPIRE        0x00000004        /* Mark for expiry */
#define UMOUNT_NOFOLLOW        0x00000008        /* Don't follow symlink on umount */
#define UMOUNT_UNUSED        0x80000000        /* Flag guaranteed to be unused */

/*
 * sb->s_iflags
 *
 * Each value below is a distinct single bit.  Every bit from 0x00000001
 * through 0x00001000 is allocated by this list.
 */
#define SB_I_CGROUPWB        0x00000001        /* cgroup-aware writeback enabled */
#define SB_I_NOEXEC        0x00000002        /* Ignore executables on this fs */
#define SB_I_NODEV        0x00000004        /* Ignore devices on this fs */
#define SB_I_STABLE_WRITES 0x00000008        /* don't modify blks until WB is done */

/* sb->s_iflags to limit user namespace mounts */
#define SB_I_USERNS_VISIBLE                0x00000010 /* fstype already mounted */
#define SB_I_IMA_UNVERIFIABLE_SIGNATURE        0x00000020
#define SB_I_UNTRUSTED_MOUNTER                0x00000040
#define SB_I_EVM_HMAC_UNSUPPORTED        0x00000080

#define SB_I_SKIP_SYNC        0x00000100        /* Skip superblock at global sync */
#define SB_I_PERSB_BDI        0x00000200        /* has a per-sb bdi */
#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
#define SB_I_RETIRED        0x00000800        /* superblock shouldn't be reused */
#define SB_I_NOUMASK        0x00001000        /* VFS does not apply umask */

/*
 * Values the 'frozen' field can take.  The three SB_FREEZE_* levels
 * below SB_FREEZE_COMPLETE (WRITE, PAGEFAULT, FS) are the ones counted
 * by SB_FREEZE_LEVELS.
 */
enum {
        /* FS is unfrozen */
        SB_UNFROZEN             = 0,
        /* Writes, dir ops, ioctls frozen */
        SB_FREEZE_WRITE         = 1,
        /* Page faults stopped as well */
        SB_FREEZE_PAGEFAULT     = 2,
        /* For internal FS use (e.g. to stop internal threads if needed) */
        SB_FREEZE_FS            = 3,
        /* ->freeze_fs finished successfully */
        SB_FREEZE_COMPLETE      = 4,
};

/* Number of freeze levels below SB_FREEZE_COMPLETE. */
#define SB_FREEZE_LEVELS        (SB_FREEZE_COMPLETE - 1)

/*
 * Freeze bookkeeping for a superblock: the current freeze state, the
 * number of outstanding freeze requests, and one percpu rw-semaphore per
 * freeze level.
 */
struct sb_writers {
        /* Is sb frozen? */
        unsigned short                  frozen;
        /* How many kernel freeze requests? */
        int                             freeze_kcount;
        /* How many userspace freeze requests? */
        int                             freeze_ucount;
        /* One semaphore per freeze level; level N uses rw_sem[N - 1]. */
        struct percpu_rw_semaphore      rw_sem[SB_FREEZE_LEVELS];
};

/*
 * Per-superblock state.  Several members exist only when the matching
 * CONFIG_* option is enabled; see the #ifdef blocks below.
 */
struct super_block {
        struct list_head        s_list;                /* Keep this first */
        dev_t                        s_dev;                /* search index; _not_ kdev_t */
        unsigned char                s_blocksize_bits;
        unsigned long                s_blocksize;
        loff_t                        s_maxbytes;        /* Max file size */
        struct file_system_type        *s_type;
        const struct super_operations        *s_op;
        const struct dquot_operations        *dq_op;
        const struct quotactl_ops        *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long                s_flags;
        unsigned long                s_iflags;        /* internal SB_I_* flags */
        unsigned long                s_magic;
        struct dentry                *s_root;
        struct rw_semaphore        s_umount;
        int                        s_count;
        atomic_t                s_active;
#ifdef CONFIG_SECURITY
        void                    *s_security;
#endif
        const struct xattr_handler * const *s_xattr;
#ifdef CONFIG_FS_ENCRYPTION
        const struct fscrypt_operations        *s_cop;
        struct fscrypt_keyring        *s_master_keys; /* master crypto keys in use */
#endif
#ifdef CONFIG_FS_VERITY
        const struct fsverity_operations *s_vop;
#endif
#if IS_ENABLED(CONFIG_UNICODE)
        struct unicode_map *s_encoding;
        __u16 s_encoding_flags;
#endif
        struct hlist_bl_head        s_roots;        /* alternate root dentries for NFS */
        struct list_head        s_mounts;        /* list of mounts; _not_ for fs use */
        struct block_device        *s_bdev;        /* can go away once we use an accessor for @s_bdev_file */
        struct file                *s_bdev_file;
        struct backing_dev_info *s_bdi;
        struct mtd_info                *s_mtd;
        struct hlist_node        s_instances;
        unsigned int                s_quota_types;        /* Bitmask of supported quota types */
        struct quota_info        s_dquot;        /* Diskquota specific options */

        struct sb_writers        s_writers;

        /*
         * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
         * s_fsnotify_info together for cache efficiency. They are frequently
         * accessed and rarely modified.
         */
        void                        *s_fs_info;        /* Filesystem private info */

        /* Granularity of c/m/atime in ns (cannot be worse than a second) */
        u32                        s_time_gran;
        /* Time limits for c/m/atime in seconds */
        time64_t                   s_time_min;
        time64_t                   s_time_max;
#ifdef CONFIG_FSNOTIFY
        __u32                        s_fsnotify_mask;
        struct fsnotify_sb_info        *s_fsnotify_info;
#endif

        /*
         * Q: Why are s_id and s_sysfs_name not the same? Both are
         * human-readable strings that identify the filesystem.
         * A: s_id is allowed to change at runtime. It is used in log
         * messages, and we want to be able to change it when a filesystem
         * starts out on a single device (s_id is the device name) but then
         * a device is hot added and we have to switch to identifying it by
         * UUID.
         * s_sysfs_name, on the other hand, is a handle for programmatic
         * access and can't change at runtime.
         */
        char                        s_id[32];        /* Informational name */
        uuid_t                        s_uuid;                /* UUID */
        u8                        s_uuid_len;        /* Default 16, possibly smaller for weird filesystems */

        /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */
        char                        s_sysfs_name[UUID_STRING_LEN + 1];

        unsigned int                s_max_links;

        /*
         * The next field is for VFS *only*. No filesystems have any business
         * even looking at it. You had been warned.
         */
        struct mutex s_vfs_rename_mutex;        /* Kludge */

        /*
         * Filesystem subtype.  If non-empty the filesystem type field
         * in /proc/mounts will be "type.subtype"
         */
        const char *s_subtype;

        const struct dentry_operations *s_d_op; /* default d_op for dentries */

        struct shrinker *s_shrink;        /* per-sb shrinker handle */

        /* Number of inodes with nlink == 0 but still referenced */
        atomic_long_t s_remove_count;

        /* Read-only state of the superblock is being changed */
        int s_readonly_remount;

        /* per-sb errseq_t for reporting writeback errors via syncfs */
        errseq_t s_wb_err;

        /* AIO completions deferred from interrupt context */
        struct workqueue_struct *s_dio_done_wq;
        struct hlist_head s_pins;

        /*
         * Owning user namespace and default context in which to
         * interpret filesystem uids, gids, quotas, device nodes,
         * xattrs and security labels.
         */
        struct user_namespace *s_user_ns;

        /*
         * The list_lru structure is essentially just a pointer to a table
         * of per-node lru lists, each of which has its own spinlock.
         * There is no need to put them into separate cachelines.
         */
        struct list_lru                s_dentry_lru;
        struct list_lru                s_inode_lru;
        struct rcu_head                rcu;
        struct work_struct        destroy_work;

        struct mutex                s_sync_lock;        /* sync serialisation lock */

        /*
         * Indicates how deep in a filesystem stack this SB is
         */
        int s_stack_depth;

        /* s_inode_list_lock protects s_inodes */
        spinlock_t                s_inode_list_lock ____cacheline_aligned_in_smp;
        struct list_head        s_inodes;        /* all inodes */

        spinlock_t                s_inode_wblist_lock;
        struct list_head        s_inodes_wb;        /* writeback inodes */
} __randomize_layout;

static inline struct user_namespace *i_user_ns(const struct inode *inode)
{
        return inode->i_sb->s_user_ns;
}

/*
 * Helpers that let filesystems work with the raw numeric uid/gid values
 * stored on disk, so that in most cases they do not have to handle
 * kuid_t and kgid_t directly.
 */

/* Read @inode's i_uid as a raw uid_t in the superblock's user namespace. */
static inline uid_t i_uid_read(const struct inode *inode)
{
        struct user_namespace *ns = i_user_ns(inode);

        return from_kuid(ns, inode->i_uid);
}

/* Read @inode's i_gid as a raw gid_t in the superblock's user namespace. */
static inline gid_t i_gid_read(const struct inode *inode)
{
        struct user_namespace *ns = i_user_ns(inode);

        return from_kgid(ns, inode->i_gid);
}

/* Store raw @uid into @inode's i_uid, converted via the sb's user namespace. */
static inline void i_uid_write(struct inode *inode, uid_t uid)
{
        struct user_namespace *ns = i_user_ns(inode);

        inode->i_uid = make_kuid(ns, uid);
}

/* Store raw @gid into @inode's i_gid, converted via the sb's user namespace. */
static inline void i_gid_write(struct inode *inode, gid_t gid)
{
        struct user_namespace *ns = i_user_ns(inode);

        inode->i_gid = make_kgid(ns, gid);
}

/**
 * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode to map
 *
 * Return: the inode's i_uid mapped down according to @idmap.
 * If the inode's i_uid has no mapping INVALID_VFSUID is returned.
 */
static inline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap,
                                         const struct inode *inode)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        return make_vfsuid(idmap, fs_userns, inode->i_uid);
}

/**
 * i_uid_needs_update - check whether inode's i_uid needs to be updated
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Check whether the $inode's i_uid field needs to be updated taking idmapped
 * mounts into account if the filesystem supports it.
 *
 * Return: true if @inode's i_uid field needs to be updated, false if not.
 */
static inline bool i_uid_needs_update(struct mnt_idmap *idmap,
                                      const struct iattr *attr,
                                      const struct inode *inode)
{
        return ((attr->ia_valid & ATTR_UID) &&
                !vfsuid_eq(attr->ia_vfsuid,
                           i_uid_into_vfsuid(idmap, inode)));
}

/**
 * i_uid_update - update @inode's i_uid field
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Safely update @inode's i_uid field, translating the vfsuid of any idmapped
 * mount into the filesystem kuid.  Does nothing unless ATTR_UID is set in
 * @attr->ia_valid.
 */
static inline void i_uid_update(struct mnt_idmap *idmap,
                                const struct iattr *attr,
                                struct inode *inode)
{
        if (!(attr->ia_valid & ATTR_UID))
                return;

        inode->i_uid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid);
}

/**
 * i_gid_into_vfsgid - map an inode's i_gid down according to an idmapping
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode to map
 *
 * Return: the inode's i_gid mapped down according to @idmap.
 * If the inode's i_gid has no mapping INVALID_VFSGID is returned.
 */
static inline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap,
                                         const struct inode *inode)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        return make_vfsgid(idmap, fs_userns, inode->i_gid);
}

/**
 * i_gid_needs_update - check whether inode's i_gid needs to be updated
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Check whether the $inode's i_gid field needs to be updated taking idmapped
 * mounts into account if the filesystem supports it.
 *
 * Return: true if @inode's i_gid field needs to be updated, false if not.
 */
static inline bool i_gid_needs_update(struct mnt_idmap *idmap,
                                      const struct iattr *attr,
                                      const struct inode *inode)
{
        return ((attr->ia_valid & ATTR_GID) &&
                !vfsgid_eq(attr->ia_vfsgid,
                           i_gid_into_vfsgid(idmap, inode)));
}

/**
 * i_gid_update - update @inode's i_gid field
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Safely update @inode's i_gid field, translating the vfsgid of any idmapped
 * mount into the filesystem kgid.  Does nothing unless ATTR_GID is set in
 * @attr->ia_valid.
 */
static inline void i_gid_update(struct mnt_idmap *idmap,
                                const struct iattr *attr,
                                struct inode *inode)
{
        if (!(attr->ia_valid & ATTR_GID))
                return;

        inode->i_gid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid);
}

/**
 * inode_fsuid_set - initialize inode's i_uid field with callers fsuid
 * @inode: inode to initialize
 * @idmap: idmap of the mount the inode was found from
 *
 * Initialize the i_uid field of @inode. If the inode was found/created via
 * an idmapped mount, the caller's fsuid is mapped according to @idmap.
 */
static inline void inode_fsuid_set(struct inode *inode,
                                   struct mnt_idmap *idmap)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        inode->i_uid = mapped_fsuid(idmap, fs_userns);
}

/**
 * inode_fsgid_set - initialize inode's i_gid field with callers fsgid
 * @inode: inode to initialize
 * @idmap: idmap of the mount the inode was found from
 *
 * Initialize the i_gid field of @inode. If the inode was found/created via
 * an idmapped mount, the caller's fsgid is mapped according to @idmap.
 */
static inline void inode_fsgid_set(struct inode *inode,
                                   struct mnt_idmap *idmap)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        inode->i_gid = mapped_fsgid(idmap, fs_userns);
}

/**
 * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped
 * @sb: the superblock we want a mapping in
 * @idmap: idmap of the relevant mount
 *
 * Check whether the caller's fsuid and fsgid have a valid mapping in the
 * s_user_ns of the superblock @sb. If the caller is on an idmapped mount,
 * the caller's fsuid and fsgid are mapped according to @idmap first.
 *
 * Return: true if fsuid and fsgid is mapped, false if not.
 */
static inline bool fsuidgid_has_mapping(struct super_block *sb,
                                        struct mnt_idmap *idmap)
{
        struct user_namespace *fs_userns = sb->s_user_ns;
        kuid_t kuid = mapped_fsuid(idmap, fs_userns);
        kgid_t kgid;

        if (!uid_valid(kuid))
                return false;

        /* The gid is only mapped once the uid is known to be valid. */
        kgid = mapped_fsgid(idmap, fs_userns);
        if (!gid_valid(kgid))
                return false;

        if (!kuid_has_mapping(fs_userns, kuid))
                return false;

        return kgid_has_mapping(fs_userns, kgid);
}

/* Declarations only; both functions are defined outside this header. */
struct timespec64 current_time(struct inode *inode);
struct timespec64 inode_set_ctime_current(struct inode *inode);

static inline time64_t inode_get_atime_sec(const struct inode *inode)
{
        return inode->__i_atime.tv_sec;
}

static inline long inode_get_atime_nsec(const struct inode *inode)
{
        return inode->__i_atime.tv_nsec;
}

static inline struct timespec64 inode_get_atime(const struct inode *inode)
{
        return inode->__i_atime;
}

static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode,
                                                      struct timespec64 ts)
{
        inode->__i_atime = ts;
        return ts;
}

/**
 * inode_set_atime - set the atime in the inode
 * @inode: inode in which to set the atime
 * @sec: tv_sec value to set
 * @nsec: tv_nsec value to set
 *
 * Set the atime in @inode to { @sec, @nsec }
 */
static inline struct timespec64 inode_set_atime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_atime_to_ts(inode, (struct timespec64){
                                                .tv_sec  = sec,
                                                .tv_nsec = nsec,
                                            });
}

static inline time64_t inode_get_mtime_sec(const struct inode *inode)
{
        return inode->__i_mtime.tv_sec;
}

static inline long inode_get_mtime_nsec(const struct inode *inode)
{
        return inode->__i_mtime.tv_nsec;
}

static inline struct timespec64 inode_get_mtime(const struct inode *inode)
{
        return inode->__i_mtime;
}

static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode,
                                                      struct timespec64 ts)
{
        inode->__i_mtime = ts;
        return ts;
}

/**
 * inode_set_mtime - set the mtime in the inode
 * @inode: inode in which to set the mtime
 * @sec: tv_sec value to set
 * @nsec: tv_nsec value to set
 *
 * Set the mtime in @inode to { @sec, @nsec }
 */
static inline struct timespec64 inode_set_mtime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_mtime_to_ts(inode, (struct timespec64){
                                                .tv_sec  = sec,
                                                .tv_nsec = nsec,
                                            });
}

static inline time64_t inode_get_ctime_sec(const struct inode *inode)
{
        return inode->__i_ctime.tv_sec;
}

static inline long inode_get_ctime_nsec(const struct inode *inode)
{
        return inode->__i_ctime.tv_nsec;
}

static inline struct timespec64 inode_get_ctime(const struct inode *inode)
{
        return inode->__i_ctime;
}

static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
                                                      struct timespec64 ts)
{
        inode->__i_ctime = ts;
        return ts;
}

/**
 * inode_set_ctime - set the ctime in the inode
 * @inode: inode in which to set the ctime
 * @sec: tv_sec value to set
 * @nsec: tv_nsec value to set
 *
 * Set the ctime in @inode to { @sec, @nsec }
 *
 * Return: the timespec64 that was stored.
 */
static inline struct timespec64 inode_set_ctime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_ctime_to_ts(inode, (struct timespec64){
                                                .tv_sec  = sec,
                                                .tv_nsec = nsec,
                                            });
}

/* Declaration only; defined outside this header. */
struct timespec64 simple_inode_init_ts(struct inode *inode);

/*
 * Snapshotting support.
 */

/*
 * These are internal functions, please use sb_start_{write,pagefault,intwrite}
 * instead.
 */
/* Drop the read side of the freeze semaphore for @level (rw_sem[level - 1]). */
static inline void __sb_end_write(struct super_block *sb, int level)
{
        percpu_up_read(&sb->s_writers.rw_sem[level - 1]);
}

/* Take the read side of the freeze semaphore for @level (rw_sem[level - 1]). */
static inline void __sb_start_write(struct super_block *sb, int level)
{
        percpu_down_read(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * Try to take the read side of the freeze semaphore for @level
 * (rw_sem[level - 1]) and return the result of the trylock.
 */
static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
{
        return percpu_down_read_trylock(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * Forward to percpu_rwsem_acquire()/percpu_rwsem_release() for the freeze
 * semaphore of level @lev (rw_sem[lev - 1]) of @sb, passing 1 as the second
 * argument and _THIS_IP_ as the third.
 */
#define __sb_writers_acquired(sb, lev)        \
        percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
#define __sb_writers_release(sb, lev)        \
        percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)

/**
 * __sb_write_started - check if sb freeze level is held
 * @sb: the super we write to
 * @level: the freeze level
 *
 * Return:
 * * > 0 - sb freeze level is held
 * *   0 - sb freeze level is not held
 * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
 */
static inline int __sb_write_started(const struct super_block *sb, int level)
{
        return lockdep_is_held_type(&sb->s_writers.rw_sem[level - 1], 1);
}

/**
 * sb_write_started - check if SB_FREEZE_WRITE is held
 * @sb: the super we write to
 *
 * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN, because
 * any non-zero result of __sb_write_started() is reported as true.
 */
static inline bool sb_write_started(const struct super_block *sb)
{
        int held = __sb_write_started(sb, SB_FREEZE_WRITE);

        return held != 0;
}

/**
 * sb_write_not_started - check if SB_FREEZE_WRITE is not held
 * @sb: the super we write to
 *
 * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN, because
 * a negative result of __sb_write_started() is also reported as true.
 */
static inline bool sb_write_not_started(const struct super_block *sb)
{
        int held = __sb_write_started(sb, SB_FREEZE_WRITE);

        return held <= 0;
}

/**
 * file_write_started - check if SB_FREEZE_WRITE is held
 * @file: the file we write to
 *
 * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 * May be false positive with !S_ISREG, because file_start_write() has
 * no effect on !S_ISREG.
 */
static inline bool file_write_started(const struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return true;
        return sb_write_started(file_inode(file)->i_sb);
}

/**
 * file_write_not_started - check if SB_FREEZE_WRITE is not held
 * @file: the file we write to
 *
 * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 * May be false positive with !S_ISREG, because file_start_write() has
 * no effect on !S_ISREG.
 */
static inline bool file_write_not_started(const struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return true;
        return sb_write_not_started(file_inode(file)->i_sb);
}

/**
 * sb_end_write - drop write access to a superblock
 * @sb: the super we wrote to
 *
 * Decrement the number of writers to the filesystem and wake up possible
 * waiters wanting to freeze the filesystem.
 */
static inline void sb_end_write(struct super_block *sb)
{
        const int level = SB_FREEZE_WRITE;

        __sb_end_write(sb, level);
}

/**
 * sb_end_pagefault - drop write access to a superblock from a page fault
 * @sb: the super we wrote to
 *
 * Decrement the number of processes handling a write page fault to the
 * filesystem and wake up possible waiters wanting to freeze the filesystem.
 */
static inline void sb_end_pagefault(struct super_block *sb)
{
        const int level = SB_FREEZE_PAGEFAULT;

        __sb_end_write(sb, level);
}

/**
 * sb_end_intwrite - drop write access to a superblock for internal fs purposes
 * @sb: the super we wrote to
 *
 * Decrement the fs-internal number of writers to the filesystem and wake up
 * possible waiters wanting to freeze the filesystem.
 */
static inline void sb_end_intwrite(struct super_block *sb)
{
        const int level = SB_FREEZE_FS;

        __sb_end_write(sb, level);
}

/**
 * sb_start_write - get write access to a superblock
 * @sb: the super we write to
 *
 * A process that wants to write data or metadata to a file system (i.e.
 * dirty a page or an inode) should wrap the operation in a
 * sb_start_write() - sb_end_write() pair to get exclusion against file
 * system freezing. This function increments the number of writers, which
 * prevents freezing. If the file system is already frozen, the function
 * waits until the file system is thawed.
 *
 * Freeze protection behaves as a lock, so users have to preserve the
 * ordering of freeze protection and other filesystem locks. Generally,
 * freeze protection should be the outermost lock. In particular, we have:
 *
 * sb_start_write
 *   -> i_mutex                        (write path, truncate, directory ops, ...)
 *   -> s_umount                (freeze_super, thaw_super)
 */
static inline void sb_start_write(struct super_block *sb)
{
        const int level = SB_FREEZE_WRITE;

        __sb_start_write(sb, level);
}

/*
 * Non-blocking variant of sb_start_write(): returns the result of the
 * trylock on the SB_FREEZE_WRITE level.
 */
static inline bool sb_start_write_trylock(struct super_block *sb)
{
        bool acquired = __sb_start_write_trylock(sb, SB_FREEZE_WRITE);

        return acquired;
}

/**
 * sb_start_pagefault - get write access to a superblock from a page fault
 * @sb: the super we write to
 *
 * A process that starts handling a write page fault should wrap the
 * operation in a sb_start_pagefault() - sb_end_pagefault() pair to get
 * exclusion against file system freezing. This is needed since the page
 * fault is going to dirty a page. This function increments the number of
 * running page faults, which prevents freezing. If the file system is
 * already frozen, the function waits until the file system is thawed.
 *
 * Page fault freeze protection behaves as a lock, so users have to preserve
 * the ordering of freeze protection and other filesystem locks. It is
 * advised to put sb_start_pagefault() close to mmap_lock in lock ordering.
 * Page fault handling code implies lock dependency:
 *
 * mmap_lock
 *   -> sb_start_pagefault
 */
static inline void sb_start_pagefault(struct super_block *sb)
{
        const int level = SB_FREEZE_PAGEFAULT;

        __sb_start_write(sb, level);
}

/**
 * sb_start_intwrite - get write access to a superblock for internal fs purposes
 * @sb: the super we write to
 *
 * This is the third level of protection against filesystem freezing. It is
 * free for use by a filesystem. The only requirement is that it must rank
 * below sb_start_pagefault.
 *
 * For example, a filesystem can call sb_start_intwrite() when starting a
 * transaction, which somewhat eases handling of freezing for internal
 * sources of filesystem changes (internal fs threads, discarding
 * preallocation on file close, etc.).
 */
static inline void sb_start_intwrite(struct super_block *sb)
{
        const int level = SB_FREEZE_FS;

        __sb_start_write(sb, level);
}

/*
 * Non-blocking variant of sb_start_intwrite(): returns the result of the
 * trylock on the SB_FREEZE_FS level.
 */
static inline bool sb_start_intwrite_trylock(struct super_block *sb)
{
        bool acquired = __sb_start_write_trylock(sb, SB_FREEZE_FS);

        return acquired;
}

/* Declaration only; defined outside this header. */
bool inode_owner_or_capable(struct mnt_idmap *idmap,
                            const struct inode *inode);

/*
 * VFS helper functions..
 *
 * Declarations only.  Note that vfs_link() takes a dentry as its first
 * argument, ahead of the idmap, unlike the other helpers declared here,
 * which all take the idmap first.
 */
int vfs_create(struct mnt_idmap *, struct inode *,
               struct dentry *, umode_t, bool);
int vfs_mkdir(struct mnt_idmap *, struct inode *,
              struct dentry *, umode_t);
int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
              umode_t, dev_t);
int vfs_symlink(struct mnt_idmap *, struct inode *,
                struct dentry *, const char *);
int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *,
             struct dentry *, struct inode **);
int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *);
int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *,
               struct inode **);

/**
 * struct renamedata - contains all information required for renaming
 * @old_mnt_idmap: idmap of the old mount the inode was found from
 * @old_dir: parent of source
 * @old_dentry: source
 * @new_mnt_idmap: idmap of the new mount the inode was found from
 * @new_dir: parent of destination
 * @new_dentry: destination
 * @delegated_inode: returns an inode needing a delegation break
 * @flags: rename flags
 */
struct renamedata {
        struct mnt_idmap        *old_mnt_idmap;
        struct inode            *old_dir;
        struct dentry           *old_dentry;
        struct mnt_idmap        *new_mnt_idmap;
        struct inode            *new_dir;
        struct dentry           *new_dentry;
        struct inode            **delegated_inode;
        unsigned int            flags;
} __randomize_layout;

/* Takes all of its arguments bundled in a struct renamedata. */
int vfs_rename(struct renamedata *);

static inline int vfs_whiteout(struct mnt_idmap *idmap,
                               struct inode *dir, struct dentry *dentry)
{
        return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE,
                         WHITEOUT_DEV);
}

/* Declarations only; these helpers are defined outside this header. */
struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
                                 const struct path *parentpath,
                                 umode_t mode, int open_flag,
                                 const struct cred *cred);
struct file *kernel_file_open(const struct path *path, int flags,
                              const struct cred *cred);

/* @f is a callback taking (dentry, mode, opaque pointer); the last argument is a void pointer. */
int vfs_mkobj(struct dentry *, umode_t,
                int (*f)(struct dentry *, umode_t, void *),
                void *);

int vfs_fchown(struct file *file, uid_t user, gid_t group);
int vfs_fchmod(struct file *file, umode_t mode);
int vfs_utimes(const struct path *path, struct timespec64 *times);

extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);

/*
 * compat_ptr_ioctl is a real function only when CONFIG_COMPAT is enabled.
 * Otherwise the name expands to NULL, so it can still be used as a
 * pointer value without an #ifdef at the use site.
 */
#ifdef CONFIG_COMPAT
extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
                                        unsigned long arg);
#else
#define compat_ptr_ioctl NULL
#endif

/*
 * VFS file helper functions (declarations only).
 */
void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode,
                      const struct inode *dir, umode_t mode);
bool may_open_dev(const struct path *path);
umode_t mode_strip_sgid(struct mnt_idmap *idmap,
                        const struct inode *dir, umode_t mode);

/*
 * This is the "filldir" function type, used by readdir() to let
 * the kernel specify what kind of dirent layout it wants to have.
 * This allows the kernel to read directories into kernel space or
 * to have different dirent layouts depending on the binary type.
 * Return 'true' to keep going and 'false' if there are no more entries.
 */
struct dir_context;
typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
                         unsigned);

/*
 * Context handed to a filldir_t callback as its first argument.
 */
struct dir_context {
        filldir_t actor;        /* the filldir callback */
        loff_t pos;             /* presumably the current directory position; confirm against users */
};

/*
 * These flags let !MMU mmap() govern direct device mapping vs immediate
 * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
 *
 * NOMMU_MAP_COPY:        Copy can be mapped (MAP_PRIVATE)
 * NOMMU_MAP_DIRECT:        Can be mapped directly (MAP_SHARED)
 * NOMMU_MAP_READ:        Can be mapped for reading
 * NOMMU_MAP_WRITE:        Can be mapped for writing
 * NOMMU_MAP_EXEC:        Can be mapped for execution
 *
 * The READ/WRITE/EXEC values are aliases of the corresponding VM_MAY*
 * flags, while COPY and DIRECT have their own literal values.
 */
#define NOMMU_MAP_COPY                0x00000001
#define NOMMU_MAP_DIRECT        0x00000008
#define NOMMU_MAP_READ                VM_MAYREAD
#define NOMMU_MAP_WRITE                VM_MAYWRITE
#define NOMMU_MAP_EXEC                VM_MAYEXEC

/* Mask of the three NOMMU_MAP_* flags that alias VM_MAY* bits. */
#define NOMMU_VMFLAGS \
        (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)

/*
 * Flags controlling the behavior of the remap_file_range function pointer.
 * Calling it with len == 0 means "remap to end of source file".
 * See Documentation/filesystems/vfs.rst for more details about this call.
 *
 * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
 * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
 */
#define REMAP_FILE_DEDUP        (1 << 0)
#define REMAP_FILE_CAN_SHORTEN  (1 << 1)

/*
 * Flags in this mask signal that the caller is ok with altering various
 * aspects of the behavior of the remap operation.  The changes must be made
 * by the implementation; the vfs remap helper functions can take advantage
 * of them.  Flags in this category exist to preserve the quirky behavior of
 * the hoisted btrfs clone/dedupe ioctls.
 */
#define REMAP_FILE_ADVISORY     (REMAP_FILE_CAN_SHORTEN)

/*
 * Flags controlling the behavior of vfs_copy_file_range().
 * They are not available to the user via syscall.
 *
 * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops
 */
#define COPY_FILE_SPLICE        (1 << 0)

/* Forward declarations for types only used through pointers below. */
struct iov_iter;
struct io_uring_cmd;
struct offset_ctx;

/* Type of file_operations.fop_flags; values are the FOP_* flags. */
typedef unsigned int __bitwise fop_flags_t;

/*
 * Table of per-file operation callbacks, plus the owning module and a
 * set of FOP_* flags.  The mmap_capabilities member exists only on
 * !CONFIG_MMU builds.
 */
struct file_operations {
        struct module *owner;
        fop_flags_t fop_flags;
        loff_t (*llseek) (struct file *, loff_t, int);
        ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
        ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
        int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *,
                        unsigned int flags);
        int (*iterate_shared) (struct file *, struct dir_context *);
        __poll_t (*poll) (struct file *, struct poll_table_struct *);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
        int (*mmap) (struct file *, struct vm_area_struct *);
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *, fl_owner_t id);
        int (*release) (struct inode *, struct file *);
        int (*fsync) (struct file *, loff_t, loff_t, int datasync);
        int (*fasync) (int, struct file *, int);
        int (*lock) (struct file *, int, struct file_lock *);
        unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
        int (*check_flags)(int);
        int (*flock) (struct file *, int, struct file_lock *);
        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
        ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
        void (*splice_eof)(struct file *file);
        int (*setlease)(struct file *, int, struct file_lease **, void **);
        long (*fallocate)(struct file *file, int mode, loff_t offset,
                          loff_t len);
        void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
        unsigned (*mmap_capabilities)(struct file *);
#endif
        ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
                        loff_t, size_t, unsigned int);
        loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
        int (*fadvise)(struct file *, loff_t, loff_t, int);
        int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
        int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
                                unsigned int poll_flags);
} __randomize_layout;

/*
 * Capability bits of type fop_flags_t (the type of
 * file_operations::fop_flags).  Each constant is a distinct single bit.
 */
/* Supports async buffered reads */
#define FOP_BUFFER_RASYNC        ((__force fop_flags_t)(1 << 0))
/* Supports async buffered writes */
#define FOP_BUFFER_WASYNC        ((__force fop_flags_t)(1 << 1))
/* Supports synchronous page faults for mappings */
#define FOP_MMAP_SYNC                ((__force fop_flags_t)(1 << 2))
/* Supports non-exclusive O_DIRECT writes from multiple threads */
#define FOP_DIO_PARALLEL_WRITE        ((__force fop_flags_t)(1 << 3))
/* Contains huge pages */
#define FOP_HUGE_PAGES                ((__force fop_flags_t)(1 << 4))

/*
 * Wrap a directory iterator that needs exclusive inode access.  The helper
 * receives the file, the dir_context and the iterator being wrapped.
 */
int wrap_directory_iterator(struct file *file, struct dir_context *ctx,
                            int (*iter)(struct file *, struct dir_context *));

/*
 * WRAP_DIR_ITER(x) - define a static function shared_x() taking
 * (struct file *, struct dir_context *) that forwards both arguments, plus
 * @x as the wrapped iterator, to wrap_directory_iterator().
 */
#define WRAP_DIR_ITER(x) \
        static int shared_##x(struct file *file, struct dir_context *ctx) \
        { \
                return wrap_directory_iterator(file, ctx, x); \
        }

/*
 * Table of callbacks operating on inodes and the dentries that name them.
 * Most of the object-creating / attribute-changing entries take a
 * struct mnt_idmap as their first argument.  Which entries are mandatory is
 * not visible from this definition.
 */
struct inode_operations {
        struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
        const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
        int (*permission) (struct mnt_idmap *, struct inode *, int);
        struct posix_acl * (*get_inode_acl)(struct inode *, int, bool);

        int (*readlink) (struct dentry *, char __user *,int);

        /* Namespace operations: create / link / remove / rename entries */
        int (*create) (struct mnt_idmap *, struct inode *,struct dentry *,
                       umode_t, bool);
        int (*link) (struct dentry *,struct inode *,struct dentry *);
        int (*unlink) (struct inode *,struct dentry *);
        int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,
                        const char *);
        int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,
                      umode_t);
        int (*rmdir) (struct inode *,struct dentry *);
        int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,
                      umode_t,dev_t);
        int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *,
                        struct inode *, struct dentry *, unsigned int);
        /* Attribute access */
        int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *);
        int (*getattr) (struct mnt_idmap *, const struct path *,
                        struct kstat *, u32, unsigned int);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
                      u64 len);
        int (*update_time)(struct inode *, int);
        int (*atomic_open)(struct inode *, struct dentry *,
                           struct file *, unsigned open_flag,
                           umode_t create_mode);
        int (*tmpfile) (struct mnt_idmap *, struct inode *,
                        struct file *, umode_t);
        /* POSIX ACL access through a dentry (see also ->get_inode_acl above) */
        struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *,
                                     int);
        int (*set_acl)(struct mnt_idmap *, struct dentry *,
                       struct posix_acl *, int);
        int (*fileattr_set)(struct mnt_idmap *idmap,
                            struct dentry *dentry, struct fileattr *fa);
        int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
        struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
} ____cacheline_aligned;

static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
        return file->f_op->mmap(file, vma);
}

/*
 * Read/write, copy and range-remap (clone / dedupe) entry points.
 * Declarations only; the implementations are not part of this header.
 */
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                   loff_t, size_t, unsigned int);
int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write);
/* Same parameters as generic_remap_file_range_prep() plus @dax_read_ops */
int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                    struct file *file_out, loff_t pos_out,
                                    loff_t *len, unsigned int remap_flags,
                                    const struct iomap_ops *dax_read_ops);
int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                  struct file *file_out, loff_t pos_out,
                                  loff_t *count, unsigned int remap_flags);
extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
extern int vfs_dedupe_file_range(struct file *file,
                                 struct file_dedupe_range *same);
extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
                                        struct file *dst_file, loff_t dst_pos,
                                        loff_t len, unsigned int remap_flags);

/**
 * enum freeze_holder - holder of the freeze
 * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
 * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
 * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
 *
 * Identifies the owner of a freeze or thaw request and whether that request
 * must be exclusive or may nest.  Unless @FREEZE_MAY_NEST is given, the same
 * holder may not issue multiple freeze and thaw requests.  One
 * @FREEZE_HOLDER_USERSPACE freeze and one @FREEZE_HOLDER_KERNEL freeze may
 * nevertheless be held at the same time; some filesystems rely on this
 * during online repair or similar.
 *
 * The values are distinct single bits so they can be combined in one word.
 */
enum freeze_holder {
        FREEZE_HOLDER_KERNEL    = 0x1U,        /* bit 0 */
        FREEZE_HOLDER_USERSPACE = 0x2U,        /* bit 1 */
        FREEZE_MAY_NEST         = 0x4U,        /* bit 2 */
};

/*
 * Table of callbacks operating on a super_block and on the inodes that
 * belong to it.  Which entries are mandatory is not visible from this
 * definition.
 */
struct super_operations {
        /* Inode allocation and teardown */
           struct inode *(*alloc_inode)(struct super_block *sb);
        void (*destroy_inode)(struct inode *);
        void (*free_inode)(struct inode *);

           void (*dirty_inode) (struct inode *, int flags);
        int (*write_inode) (struct inode *, struct writeback_control *wbc);
        int (*drop_inode) (struct inode *);
        void (*evict_inode) (struct inode *);
        void (*put_super) (struct super_block *);
        int (*sync_fs)(struct super_block *sb, int wait);
        /* Freeze/thaw: the *_super variants also receive the freeze holder */
        int (*freeze_super) (struct super_block *, enum freeze_holder who);
        int (*freeze_fs) (struct super_block *);
        int (*thaw_super) (struct super_block *, enum freeze_holder who);
        int (*unfreeze_fs) (struct super_block *);
        int (*statfs) (struct dentry *, struct kstatfs *);
        int (*remount_fs) (struct super_block *, int *, char *);
        void (*umount_begin) (struct super_block *);

        /* seq_file based reporting hooks */
        int (*show_options)(struct seq_file *, struct dentry *);
        int (*show_devname)(struct seq_file *, struct dentry *);
        int (*show_path)(struct seq_file *, struct dentry *);
        int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
        /* Only present when the kernel is built with CONFIG_QUOTA */
        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
        struct dquot __rcu **(*get_dquots)(struct inode *);
#endif
        /* Both take a struct shrink_control describing the request */
        long (*nr_cached_objects)(struct super_block *,
                                  struct shrink_control *);
        long (*free_cached_objects)(struct super_block *,
                                    struct shrink_control *);
        void (*shutdown)(struct super_block *sb);
};

/*
 * Inode flags, kept in inode->i_flags and tested by the IS_*() helpers.
 * They have no relation to superblock flags now.  Each value is a single
 * bit; the bit number is given in the trailing comment.
 */
#define S_SYNC                0x00001        /* bit 0:  Writes are synced at once */
#define S_NOATIME        0x00002        /* bit 1:  Do not update access times */
#define S_APPEND        0x00004        /* bit 2:  Append-only file */
#define S_IMMUTABLE        0x00008        /* bit 3:  Immutable file */
#define S_DEAD                0x00010        /* bit 4:  removed, but still open directory */
#define S_NOQUOTA        0x00020        /* bit 5:  Inode is not counted to quota */
#define S_DIRSYNC        0x00040        /* bit 6:  Directory modifications are synchronous */
#define S_NOCMTIME        0x00080        /* bit 7:  Do not update file c/mtime */
#define S_SWAPFILE        0x00100        /* bit 8:  Do not truncate: swapon got its bmaps */
#define S_PRIVATE        0x00200        /* bit 9:  Inode is fs-internal */
#define S_IMA                0x00400        /* bit 10: Inode has an associated IMA struct */
#define S_AUTOMOUNT        0x00800        /* bit 11: Automount/referral quasi-directory */
#define S_NOSEC                0x01000        /* bit 12: no suid or xattr security attributes */
#ifdef CONFIG_FS_DAX
#define S_DAX                0x02000        /* bit 13: Direct Access, avoiding the page cache */
#else
#define S_DAX                0        /* Make all the DAX code disappear */
#endif
#define S_ENCRYPTED        0x04000        /* bit 14: Encrypted file (using fs/crypto/) */
#define S_CASEFOLD        0x08000        /* bit 15: Casefolded file */
#define S_VERITY        0x10000        /* bit 16: Verity file (using fs/verity/) */
#define S_KERNEL_FILE        0x20000        /* bit 17: File is in use by the kernel (eg. fs/cachefiles) */

/*
 * Note that nosuid etc flags are inode-specific: setting some file-system
 * flags just means all the inodes inherit those flags by default. It might be
 * possible to override it selectively if you really wanted to with some
 * ioctl() that is not currently implemented.
 *
 * Exception: SB_RDONLY is always applied to the entire file system.
 *
 * Unfortunately, it is possible to change a filesystem's flags while it is
 * mounted with files in use.  This means that the inodes will not all have
 * their i_flags updated.  Hence, i_flags no longer inherit the superblock
 * mount flags, so these have to be checked separately. -- rmk@arm.uk.linux.org
 *
 * __IS_FLG(inode, flg) evaluates to the bits of @flg that are set in the
 * s_flags of the superblock @inode belongs to (non-zero if any is set).
 */
#define __IS_FLG(inode, flg)        ((inode)->i_sb->s_flags & (flg))

/* Report whether SB_RDONLY is set in the superblock's s_flags. */
static inline bool sb_rdonly(const struct super_block *sb)
{
        return (sb->s_flags & SB_RDONLY) != 0;
}
/*
 * Predicates that consult the superblock flags (via sb_rdonly()/__IS_FLG),
 * the inode's own i_flags, or both.  Apart from IS_RDONLY(), which yields a
 * bool, they evaluate to the masked flag bits, i.e. "non-zero" means true.
 */
#define IS_RDONLY(inode)        sb_rdonly((inode)->i_sb)
/* Superblock-wide SB_SYNCHRONOUS or per-inode S_SYNC */
#define IS_SYNC(inode)                (__IS_FLG(inode, SB_SYNCHRONOUS) || \
                                        ((inode)->i_flags & S_SYNC))
/* Any of the superblock or inode sync / dirsync flags */
#define IS_DIRSYNC(inode)        (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \
                                        ((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
#define IS_MANDLOCK(inode)        __IS_FLG(inode, SB_MANDLOCK)
/* Superblock flags only: true for SB_RDONLY as well as SB_NOATIME */
#define IS_NOATIME(inode)        __IS_FLG(inode, SB_RDONLY|SB_NOATIME)
#define IS_I_VERSION(inode)        __IS_FLG(inode, SB_I_VERSION)

/* The following test inode->i_flags only */
#define IS_NOQUOTA(inode)        ((inode)->i_flags & S_NOQUOTA)
#define IS_APPEND(inode)        ((inode)->i_flags & S_APPEND)
#define IS_IMMUTABLE(inode)        ((inode)->i_flags & S_IMMUTABLE)

/* Constant 0 when CONFIG_FS_POSIX_ACL is not set */
#ifdef CONFIG_FS_POSIX_ACL
#define IS_POSIXACL(inode)        __IS_FLG(inode, SB_POSIXACL)
#else
#define IS_POSIXACL(inode)        0
#endif

#define IS_DEADDIR(inode)        ((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode)        ((inode)->i_flags & S_NOCMTIME)

/* Without CONFIG_SWAP the argument is still evaluated, then 0U is yielded */
#ifdef CONFIG_SWAP
#define IS_SWAPFILE(inode)        ((inode)->i_flags & S_SWAPFILE)
#else
#define IS_SWAPFILE(inode)        ((void)(inode), 0U)
#endif

#define IS_PRIVATE(inode)        ((inode)->i_flags & S_PRIVATE)
#define IS_IMA(inode)                ((inode)->i_flags & S_IMA)
#define IS_AUTOMOUNT(inode)        ((inode)->i_flags & S_AUTOMOUNT)
#define IS_NOSEC(inode)                ((inode)->i_flags & S_NOSEC)
/* S_DAX is 0 without CONFIG_FS_DAX, so this is then always 0 */
#define IS_DAX(inode)                ((inode)->i_flags & S_DAX)
#define IS_ENCRYPTED(inode)        ((inode)->i_flags & S_ENCRYPTED)
#define IS_CASEFOLDED(inode)        ((inode)->i_flags & S_CASEFOLD)
#define IS_VERITY(inode)        ((inode)->i_flags & S_VERITY)

/*
 * IS_WHITEOUT(inode) - true when @inode is a character device whose i_rdev
 * equals WHITEOUT_DEV.
 *
 * @inode is parenthesized at every use so that any pointer-valued expression
 * (e.g. "p + 1" or "&obj") expands correctly.  The argument is evaluated
 * more than once, so it must not have side effects.
 */
#define IS_WHITEOUT(inode)        (S_ISCHR((inode)->i_mode) && \
                                 (inode)->i_rdev == WHITEOUT_DEV)

/*
 * Return true when either the owner uid or the owner gid of @inode, after
 * translation through @idmap, is not a valid vfsuid/vfsgid.  The gid is only
 * looked at when the uid turned out to be valid.
 */
static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap,
                                   struct inode *inode)
{
        if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)))
                return true;

        return !vfsgid_valid(i_gid_into_vfsgid(idmap, inode));
}

static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
        *kiocb = (struct kiocb) {
                .ki_filp = filp,
                .ki_flags = filp->f_iocb_flags,
                .ki_ioprio = get_current_ioprio(),
        };
}

static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
                               struct file *filp)
{
        *kiocb = (struct kiocb) {
                .ki_filp = filp,
                .ki_flags = kiocb_src->ki_flags,
                .ki_ioprio = kiocb_src->ki_ioprio,
                .ki_pos = kiocb_src->ki_pos,
        };
}

/*
 * Inode state bits.  Protected by inode->i_lock
 *
 * Four bits determine the dirty state of the inode: I_DIRTY_SYNC,
 * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME.
 *
 * Four bits define the lifetime of an inode.  Initially, inodes are I_NEW,
 * until that flag is cleared.  I_WILL_FREE, I_FREEING and I_CLEAR are set at
 * various stages of removing an inode.
 *
 * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
 *
 * I_DIRTY_SYNC                Inode is dirty, but doesn't have to be written on
 *                        fdatasync() (unless I_DIRTY_DATASYNC is also set).
 *                        Timestamp updates are the usual cause.
 * I_DIRTY_DATASYNC        Data-related inode changes pending.  We keep track of
 *                        these changes separately from I_DIRTY_SYNC so that we
 *                        don't have to write inode on fdatasync() when only
 *                        e.g. the timestamps have changed.
 * I_DIRTY_PAGES        Inode has dirty pages.  Inode itself may be clean.
 * I_DIRTY_TIME                The inode itself has dirty timestamps, and the
 *                        lazytime mount option is enabled.  We keep track of this
 *                        separately from I_DIRTY_SYNC in order to implement
 *                        lazytime.  This gets cleared if I_DIRTY_INODE
 *                        (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But
 *                        I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already
 *                        in place because writeback might already be in progress
 *                        and we don't want to lose the time update
 * I_NEW                Serves as both a mutex and completion notification.
 *                        New inodes set I_NEW.  If two processes both create
 *                        the same inode, one of them will release its inode and
 *                        wait for I_NEW to be released before returning.
 *                        Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
 *                        also cause waiting on I_NEW, without I_NEW actually
 *                        being set.  find_inode() uses this to prevent returning
 *                        nearly-dead inodes.
 * I_WILL_FREE                Must be set when calling write_inode_now() if i_count
 *                        is zero.  I_FREEING must be set when I_WILL_FREE is
 *                        cleared.
 * I_FREEING                Set when inode is about to be freed but still has dirty
 *                        pages or buffers attached or the inode itself is still
 *                        dirty.
 * I_CLEAR                Added by clear_inode().  In this state the inode is
 *                        clean and can be destroyed.  Inode keeps I_FREEING.
 *
 *                        Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
 *                        prohibited for many purposes.  iget() must wait for
 *                        the inode to be completely released, then create it
 *                        anew.  Other functions will just ignore such inodes,
 *                        if appropriate.  I_NEW is used for waiting.
 *
 * I_SYNC                Writeback of inode is running. The bit is set during
 *                        data writeback, and cleared with a wakeup on the bit
 *                        address once it is done. The bit is also used to pin
 *                        the inode in memory for flusher thread.
 *
 * I_REFERENCED                Marks the inode as recently referenced on the LRU list.
 *
 * I_DIO_WAKEUP                Never set.  Only used as a key for wait_on_bit().
 *
 * I_WB_SWITCH                Cgroup bdi_writeback switching in progress.  Used to
 *                        synchronize competing switching instances and to tell
 *                        wb stat updates to grab the i_pages lock.  See
 *                        inode_switch_wbs_work_fn() for details.
 *
 * I_OVL_INUSE                Used by overlayfs to get exclusive ownership on upper
 *                        and work dirs among overlayfs mounts.
 *
 * I_CREATING                New object's inode in the middle of setting up.
 *
 * I_DONTCACHE                Evict inode as soon as it is not used anymore.
 *
 * I_SYNC_QUEUED        Inode is queued in b_io or b_more_io writeback lists.
 *                        Used to detect that mark_inode_dirty() should not move
 *                         inode between dirty lists.
 *
 * I_PINNING_NETFS_WB        Inode is pinning an fscache object for writeback.
 *
 * Q: What is the difference between I_WILL_FREE and I_FREEING?
 */
/*
 * inode->i_state bit values.  Names with a leading double underscore are bit
 * numbers; the matching name without it is the corresponding mask.
 */
#define I_DIRTY_SYNC                (1 << 0)
#define I_DIRTY_DATASYNC        (1 << 1)
#define I_DIRTY_PAGES                (1 << 2)
#define __I_NEW                        3        /* bit number of I_NEW */
#define I_NEW                        (1 << __I_NEW)
#define I_WILL_FREE                (1 << 4)
#define I_FREEING                (1 << 5)
#define I_CLEAR                        (1 << 6)
#define __I_SYNC                7        /* bit number of I_SYNC */
#define I_SYNC                        (1 << __I_SYNC)
#define I_REFERENCED                (1 << 8)
#define __I_DIO_WAKEUP                9        /* bit number of I_DIO_WAKEUP */
#define I_DIO_WAKEUP                (1 << __I_DIO_WAKEUP)
#define I_LINKABLE                (1 << 10)        /* NOTE(review): has no entry in the state-bit description comment */
#define I_DIRTY_TIME                (1 << 11)
/* bit 12 is not assigned in this list */
#define I_WB_SWITCH                (1 << 13)
#define I_OVL_INUSE                (1 << 14)
#define I_CREATING                (1 << 15)
#define I_DONTCACHE                (1 << 16)
#define I_SYNC_QUEUED                (1 << 17)
#define I_PINNING_NETFS_WB        (1 << 18)

/* Composite masks built from the dirty bits above */
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)

extern void __mark_inode_dirty(struct inode *, int);

/*
 * Mark @inode dirty with the full I_DIRTY mask
 * (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES).
 */
static inline void mark_inode_dirty(struct inode *inode)
{
        const int dirty_bits = I_DIRTY;

        __mark_inode_dirty(inode, dirty_bits);
}

/* Mark @inode dirty with I_DIRTY_SYNC only. */
static inline void mark_inode_dirty_sync(struct inode *inode)
{
        const int dirty_bits = I_DIRTY_SYNC;

        __mark_inode_dirty(inode, dirty_bits);
}

/*
 * Returns true if the given inode itself only has dirty timestamps (its pages
 * may still be dirty) and isn't currently being allocated or freed.
 *
 * A filesystem that is writing an inode with lazytime enabled can call this
 * to decide whether to opportunistically write the timestamps of other
 * inodes located very nearby on-disk, e.g. in the same inode block; a true
 * result means the given inode is in need of such an update.
 *
 * Requires i_lock, or at least later re-checking under i_lock.
 */
static inline bool inode_is_dirtytime_only(struct inode *inode)
{
        /* Of these bits, I_DIRTY_TIME must be the only one that is set */
        const int watched = I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE;

        return (inode->i_state & watched) == I_DIRTY_TIME;
}

/*
 * Link-count helpers.  Declarations only; inc_nlink() and drop_nlink() are
 * used by inode_inc_link_count() / inode_dec_link_count() in this header.
 */
extern void inc_nlink(struct inode *inode);
extern void drop_nlink(struct inode *inode);
extern void clear_nlink(struct inode *inode);
extern void set_nlink(struct inode *inode, unsigned int nlink);

/*
 * Bump the link count of @inode through inc_nlink() and then mark the inode
 * dirty so the change gets written back.
 */
static inline void inode_inc_link_count(struct inode *inode)
{
        struct inode *target = inode;

        inc_nlink(target);
        mark_inode_dirty(target);
}

/*
 * Lower the link count of @inode through drop_nlink() and then mark the
 * inode dirty so the change gets written back.
 */
static inline void inode_dec_link_count(struct inode *inode)
{
        struct inode *target = inode;

        drop_nlink(target);
        mark_inode_dirty(target);
}

/*
 * Bit flags naming inode timestamps / the version counter; each constant is
 * a distinct single bit so several can be OR-ed into one int.
 */
enum file_time_flags {
        S_ATIME   = 1 << 0,
        S_MTIME   = 1 << 1,
        S_CTIME   = 1 << 2,
        S_VERSION = 1 << 3,
};

extern bool atime_needs_update(const struct path *, struct inode *);
extern void touch_atime(const struct path *);
int inode_update_time(struct inode *inode, int flags);

/*
 * Record an access to @file: touch_atime() is called on the file's path
 * unless the file was opened with O_NOATIME.
 */
static inline void file_accessed(struct file *file)
{
        if (file->f_flags & O_NOATIME)
                return;

        touch_atime(&file->f_path);
}

/* Declarations only; the implementations are not part of this header. */
extern int file_modified(struct file *file);
int kiocb_modified(struct kiocb *iocb);

int sync_inode_metadata(struct inode *inode, int wait);

/*
 * Description of one filesystem type, passed to register_filesystem() /
 * unregister_filesystem().  The FS_* macros defined inside the struct body
 * are the bit values for ->fs_flags.
 */
struct file_system_type {
        const char *name;
        int fs_flags;                /* OR of the FS_* bits below */
#define FS_REQUIRES_DEV                1 
#define FS_BINARY_MOUNTDATA        2
#define FS_HAS_SUBTYPE                4
#define FS_USERNS_MOUNT                8        /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM        16        /* Disable fanotify permission events */
#define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
#define FS_RENAME_DOES_D_MOVE        32768        /* FS will handle d_move() during rename() internally. */
        int (*init_fs_context)(struct fs_context *);
        const struct fs_parameter_spec *parameters;
        struct dentry *(*mount) (struct file_system_type *, int,
                       const char *, void *);
        void (*kill_sb) (struct super_block *);
        struct module *owner;
        struct file_system_type * next;
        struct hlist_head fs_supers;

        /* Lock class keys; names suggest superblock-level locks (TODO confirm) */
        struct lock_class_key s_lock_key;
        struct lock_class_key s_umount_key;
        struct lock_class_key s_vfs_rename_key;
        struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];

        /* Lock class keys; names suggest inode-level locks (TODO confirm) */
        struct lock_class_key i_lock_key;
        struct lock_class_key i_mutex_key;
        struct lock_class_key invalidate_lock_key;
        struct lock_class_key i_mutex_dir_key;
};

/* Module alias of the form "fs-<NAME>"; @NAME must be a string literal */
#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)

/*
 * Mount helpers.  Each takes a fill_super callback with the signature
 * int (*)(struct super_block *, void *, int) and returns a dentry.
 */
extern struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_single(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_nodev(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
/* Superblock shutdown / deactivation helpers (declarations only) */
void retire_super(struct super_block *sb);
void generic_shutdown_super(struct super_block *sb);
void kill_block_super(struct super_block *sb);
void kill_anon_super(struct super_block *sb);
void kill_litter_super(struct super_block *sb);
void deactivate_super(struct super_block *sb);
void deactivate_locked_super(struct super_block *sb);
int set_anon_super(struct super_block *s, void *data);
int set_anon_super_fc(struct super_block *s, struct fs_context *fc);
int get_anon_bdev(dev_t *);
void free_anon_bdev(dev_t);
/*
 * Superblock getters: sget_fc() and sget() take caller-supplied test() and
 * set() callbacks; sget_dev() takes a dev_t instead.
 */
struct super_block *sget_fc(struct fs_context *fc,
                            int (*test)(struct super_block *, struct fs_context *),
                            int (*set)(struct super_block *, struct fs_context *));
struct super_block *sget(struct file_system_type *type,
                        int (*test)(struct super_block *,void *),
                        int (*set)(struct super_block *,void *),
                        int flags, void *data);
struct super_block *sget_dev(struct fs_context *fc, dev_t dev);

/* Alas, no aliases. Too much hassle with bringing module.h everywhere */
/*
 * fops_get(fops) - evaluates to @fops when it is non-NULL and
 * try_module_get() on its ->owner succeeds, otherwise to NULL.
 * @fops is evaluated more than once; pass an expression without side effects.
 */
#define fops_get(fops) \
        (((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
/*
 * fops_put(fops) - module_put() the ->owner of @fops; does nothing when
 * @fops is NULL.  @fops is evaluated more than once.
 */
#define fops_put(fops) \
        do { if (fops) module_put((fops)->owner); } while(0)
/*
 * This one is to be used *ONLY* from ->open() instances.
 * fops must be non-NULL, pinned down *and* module dependencies
 * should be sufficient to pin the caller down as well.
 *
 * @f is evaluated once (captured in __file).  The old f_op is released with
 * fops_put() before the new one is stored; a NULL @fops triggers BUG_ON().
 */
#define replace_fops(f, fops) \
        do {        \
                struct file *__file = (f); \
                fops_put(__file->f_op); \
                BUG_ON(!(__file->f_op = (fops))); \
        } while(0)

/*
 * Filesystem registration, statfs, freeze/thaw and bdi setup entry points.
 * Declarations only; the implementations are not part of this header.
 */
extern int register_filesystem(struct file_system_type *);
extern int unregister_filesystem(struct file_system_type *);
extern int vfs_statfs(const struct path *, struct kstatfs *);
extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
int freeze_super(struct super_block *super, enum freeze_holder who);
int thaw_super(struct super_block *super, enum freeze_holder who);
/* printf-style: @fmt is argument 2, the variadic arguments start at 3 */
extern __printf(2, 3)
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
extern int super_setup_bdi(struct super_block *sb);

/*
 * Copy @len bytes of @uuid into sb->s_uuid and record the length in
 * sb->s_uuid_len.  A @len larger than the s_uuid field triggers a warning
 * and is clamped to the field size.
 */
static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len)
{
        if (WARN_ON(len > sizeof(sb->s_uuid)))
                len = sizeof(sb->s_uuid);

        /* Both stores use the (possibly clamped) length */
        memcpy(&sb->s_uuid, uuid, len);
        sb->s_uuid_len = len;
}

/*
 * Set the sb sysfs name from sb->s_bdev: the device is formatted with "%pg"
 * into sb->s_sysfs_name, bounded by the size of that buffer.
 */
static inline void super_set_sysfs_name_bdev(struct super_block *sb)
{
        const size_t room = sizeof(sb->s_sysfs_name);

        snprintf(sb->s_sysfs_name, room, "%pg", sb->s_bdev);
}

/*
 * Set the sb sysfs name from sb->s_uuid: the UUID bytes are formatted with
 * "%pU" into sb->s_sysfs_name.  Warns when the recorded UUID length is not
 * the full size of the s_uuid field; the name is written either way.
 */
static inline void super_set_sysfs_name_uuid(struct super_block *sb)
{
        const size_t room = sizeof(sb->s_sysfs_name);

        WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid));
        snprintf(sb->s_sysfs_name, room, "%pU", sb->s_uuid.b);
}

/*
 * Set the sb sysfs name from sb->s_id: the id string is copied into
 * sb->s_sysfs_name with strscpy(), bounded by the size of that buffer.
 */
static inline void super_set_sysfs_name_id(struct super_block *sb)
{
        const size_t room = sizeof(sb->s_sysfs_name);

        strscpy(sb->s_sysfs_name, sb->s_id, room);
}

/*
 * Set the sb sysfs name from an arbitrary printf-style format.  Try to use
 * one of the standard super_set_sysfs_name_*() helpers before resorting to
 * this one.  Output is bounded by the size of sb->s_sysfs_name.
 */
__printf(2, 3)
static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...)
{
        const size_t room = sizeof(sb->s_sysfs_name);
        va_list ap;

        va_start(ap, fmt);
        vsnprintf(sb->s_sysfs_name, room, fmt, ap);
        va_end(ap);
}

extern int current_umask(void);

/* Inode reference and timestamp helpers (declarations only) */
extern void ihold(struct inode * inode);
extern void iput(struct inode *);
int inode_update_timestamps(struct inode *inode, int flags);
int generic_update_time(struct inode *, int);

/* /sys/fs */
extern struct kobject *fs_kobj;

/*
 * INT_MAX masked with PAGE_MASK.
 * NOTE(review): presumably INT_MAX rounded down to a page multiple and used
 * as an upper bound on a single read/write size - confirm against callers.
 */
#define MAX_RW_COUNT (INT_MAX & PAGE_MASK)

/* fs/open.c */
struct audit_names;
/*
 * A pathname together with the userland pointer it came from.  The struct
 * ends in a flexible array member (iname), so instances are allocated with
 * extra trailing storage.
 */
struct filename {
        const char                *name;        /* pointer to actual string */
        const __user char        *uptr;        /* original userland pointer */
        atomic_t                refcnt;                /* reference count */
        struct audit_names        *aname;
        const char                iname[];        /* trailing storage; NOTE(review): presumably where ->name points for short names */
};
/* iname must start on a long-aligned offset within the struct */
static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);

/* Return the idmap of the mount that @file was opened through (f_path.mnt). */
static inline struct mnt_idmap *file_mnt_idmap(const struct file *file)
{
        const struct vfsmount *mnt = file->f_path.mnt;

        return mnt_idmap(mnt);
}

/**
 * is_idmapped_mnt - check whether a mount is mapped
 * @mnt: the mount to check
 *
 * If @mnt has an non @nop_mnt_idmap attached to it then @mnt is mapped.
 *
 * Return: true if mount is mapped, false if not.
 */
static inline bool is_idmapped_mnt(const struct vfsmount *mnt)
{
        return mnt_idmap(mnt) != &nop_mnt_idmap;
}

/*
 * Truncate, fallocate and open entry points.  Declarations only; the
 * implementations are not part of this header.
 */
extern long vfs_truncate(const struct path *, loff_t);
int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start,
                unsigned int time_attrs, struct file *filp);
extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
                        loff_t len);
extern long do_sys_open(int dfd, const char __user *filename, int flags,
                        umode_t mode);
extern struct file *file_open_name(struct filename *, int, umode_t);
extern struct file *filp_open(const char *, int, umode_t);
/* Used by file_open_root_mnt() in this header with a mount's root as the path */
extern struct file *file_open_root(const struct path *,
                                   const char *, int, umode_t);
/*
 * Open @name via file_open_root(), using the root of @mnt
 * (mnt / mnt->mnt_root) as the starting path.  @flags and @mode are passed
 * through unchanged.
 */
static inline struct file *file_open_root_mnt(struct vfsmount *mnt,
                                   const char *name, int flags, umode_t mode)
{
        const struct path root = {
                .mnt        = mnt,
                .dentry        = mnt->mnt_root,
        };

        return file_open_root(&root, name, flags, mode);
}
/* Declarations only; the implementations are not part of this header. */
struct file *dentry_open(const struct path *path, int flags,
                         const struct cred *creds);
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
                           const struct cred *cred);
/* Used by file_user_path() / file_user_inode() for FMODE_BACKING files */
struct path *backing_file_user_path(struct file *f);

/*
 * When a file on a stackable filesystem (e.g., overlayfs) is mmapped, the
 * file stored in ->vm_file is a backing file whose f_inode lives on the
 * underlying filesystem.  Whenever the mapped file's path or inode number is
 * shown to the user (e.g. via /proc/<pid>/maps), the helpers below should be
 * used: they yield the path of the fd the user asked to map and the inode
 * whose number fstat() on that same fd would report.
 */
/* Get the path to display in /proc/<pid>/maps */
static inline const struct path *file_user_path(struct file *f)
{
        /* Backing files carry the user-visible path separately */
        return unlikely(f->f_mode & FMODE_BACKING) ?
                        backing_file_user_path(f) : &f->f_path;
}
/* Get the inode whose inode number to display in /proc/<pid>/maps */
static inline const struct inode *file_user_inode(struct file *f)
{
        if (unlikely(f->f_mode & FMODE_BACKING))
                return d_inode(backing_file_user_path(f)->dentry);
        return file_inode(f);
}

/*
 * Open a new struct file via dentry_open() using the path, open flags and
 * credentials of the existing @file.
 */
static inline struct file *file_clone_open(struct file *file)
{
        const struct path *where = &file->f_path;

        return dentry_open(where, file->f_flags, file->f_cred);
}
extern int filp_close(struct file *, fl_owner_t id);

/* struct filename producers and the matching release function */
extern struct filename *getname_flags(const char __user *, int, int *);
extern struct filename *getname_uflags(const char __user *, int);
extern struct filename *getname(const char __user *);
extern struct filename *getname_kernel(const char *);
extern void putname(struct filename *name);

/* finish_open() takes an optional ->open style callback (see finish_open_simple()) */
extern int finish_open(struct file *file, struct dentry *dentry,
                        int (*open)(struct inode *, struct file *));
extern int finish_no_open(struct file *file, struct dentry *dentry);

/*
 * Helper for the simple case when the original dentry is used: a non-zero
 * @error is handed straight back, otherwise finish_open() is called with the
 * file's own dentry and no open callback.
 */
static inline int finish_open_simple(struct file *file, int error)
{
        if (!error)
                return finish_open(file, file->f_path.dentry, NULL);

        return error;
}

/* fs/dcache.c */
extern void __init vfs_caches_init_early(void);
extern void __init vfs_caches_init(void);

/* Slab cache backing __getname()/__putname() */
extern struct kmem_cache *names_cachep;

/* Allocate one object from names_cachep with GFP_KERNEL; may return NULL */
#define __getname()                kmem_cache_alloc(names_cachep, GFP_KERNEL)
/* Return an object obtained from __getname() to names_cachep */
#define __putname(name)                kmem_cache_free(names_cachep, (void *)(name))

extern struct super_block *blockdev_superblock;
/*
 * Return true when @sb is blockdev_superblock.  Always false when
 * CONFIG_BLOCK is not enabled.
 * NOTE(review): the IS_ENABLED() test comes first, presumably so that the
 * reference to blockdev_superblock can be folded away in !CONFIG_BLOCK
 * builds - keep the operand order.
 */
static inline bool sb_is_blkdev_sb(struct super_block *sb)
{
        return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock;
}

/* Declarations only; the definitions are not part of this header. */
void emergency_thaw_all(void);
extern int sync_filesystem(struct super_block *);
/* File operations tables; names suggest block / character device defaults (TODO confirm) */
extern const struct file_operations def_blk_fops;
extern const struct file_operations def_chr_fops;

/* fs/char_dev.c */
/* NOTE(review): presumably an exclusive upper bound on char major numbers - confirm */
#define CHRDEV_MAJOR_MAX 512
/* Marks the bottom of the first segment of free char majors */
#define CHRDEV_MAJOR_DYN_END 234
/* Marks the top and bottom of the second segment of free char majors */
#define CHRDEV_MAJOR_DYN_EXT_START 511
#define CHRDEV_MAJOR_DYN_EXT_END 384

/* Character device region management (declarations only) */
extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
extern int register_chrdev_region(dev_t, unsigned, const char *);
/* Range-based variants wrapped by register_chrdev() / unregister_chrdev() */
extern int __register_chrdev(unsigned int major, unsigned int baseminor,
                             unsigned int count, const char *name,
                             const struct file_operations *fops);
extern void __unregister_chrdev(unsigned int major, unsigned int baseminor,
                                unsigned int count, const char *name);
extern void unregister_chrdev_region(dev_t, unsigned);
extern void chrdev_show(struct seq_file *,off_t);

/*
 * Shorthand for __register_chrdev() with baseminor 0 and a count of 256
 * for @major, using @name and @fops.  Returns what __register_chrdev()
 * returns.
 */
static inline int register_chrdev(unsigned int major, const char *name,
                                  const struct file_operations *fops)
{
        return __register_chrdev(major, 0, 256, name, fops);
}

/*
 * Counterpart of register_chrdev(): calls __unregister_chrdev() with the
 * same baseminor (0) and count (256) for @major and @name.
 */
static inline void unregister_chrdev(unsigned int major, const char *name)
{
        __unregister_chrdev(major, 0, 256, name);
}

extern void init_special_inode(struct inode *, umode_t, dev_t);

/* Invalid inode operations -- fs/bad_inode.c */
extern void make_bad_inode(struct inode *);
extern bool is_bad_inode(struct inode *);

extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
                                                loff_t lend);
extern int __must_check file_check_and_advance_wb_err(struct file *file);
extern int __must_check file_write_and_wait_range(struct file *file,
                                                loff_t start, loff_t end);

/*
 * Whole-file form of file_write_and_wait_range(): covers the byte range
 * 0..LLONG_MAX of @file and returns that function's result.
 */
static inline int file_write_and_wait(struct file *file)
{
        return file_write_and_wait_range(file, 0, LLONG_MAX);
}

extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
                           int datasync);
extern int vfs_fsync(struct file *file, int datasync);

extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
                                unsigned int flags);

/*
 * A kiocb needs sync-on-write handling when it carries IOCB_DSYNC itself
 * or when the inode backing its file's mapping satisfies IS_SYNC().
 */
static inline bool iocb_is_dsync(const struct kiocb *iocb)
{
        if (iocb->ki_flags & IOCB_DSYNC)
                return true;

        return IS_SYNC(iocb->ki_filp->f_mapping->host);
}

/*
 * Flush the bytes just written when the write was a synchronous one.
 * ki_pos is expected to have been advanced past the write already, so the
 * range synced is [ki_pos - count, ki_pos - 1].  Returns @count, or the
 * error from vfs_fsync_range() if the sync failed.
 */
static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
{
        int datasync, err;

        if (!iocb_is_dsync(iocb))
                return count;

        /* a data-only sync is enough unless IOCB_SYNC was requested */
        datasync = !(iocb->ki_flags & IOCB_SYNC);
        err = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count,
                              iocb->ki_pos - 1, datasync);

        return err ? err : count;
}

extern void emergency_sync(void);
extern void emergency_remount(void);

/*
 * bmap() is only provided out of line when CONFIG_BLOCK is set; otherwise
 * this stub makes every call fail with -EINVAL.
 */
#ifdef CONFIG_BLOCK
extern int bmap(struct inode *inode, sector_t *block);
#else
static inline int bmap(struct inode *inode,  sector_t *block)
{
        return -EINVAL;
}
#endif

int notify_change(struct mnt_idmap *, struct dentry *,
                  struct iattr *, struct inode **);
int inode_permission(struct mnt_idmap *, struct inode *, int);
int generic_permission(struct mnt_idmap *, struct inode *, int);
/*
 * inode_permission() for an open file: checks @mask against the file's
 * inode using the idmap returned by file_mnt_idmap().
 */
static inline int file_permission(struct file *file, int mask)
{
        return inode_permission(file_mnt_idmap(file),
                                file_inode(file), mask);
}
/*
 * inode_permission() for a path: checks @mask against the inode of
 * path->dentry using the idmap of path->mnt.
 */
static inline int path_permission(const struct path *path, int mask)
{
        return inode_permission(mnt_idmap(path->mnt),
                                d_inode(path->dentry), mask);
}
int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
                   struct inode *inode);

/*
 * True for directories, and for any other inode that has at least one of
 * the S_IXUGO execute bits set in i_mode.
 */
static inline bool execute_ok(struct inode *inode)
{
        if (S_ISDIR(inode->i_mode))
                return true;

        return (inode->i_mode & S_IXUGO) != 0;
}

/*
 * True when the S_IFMT (file type) bits of @inode's i_mode differ from
 * those of @mode.
 */
static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
{
        return (inode->i_mode & S_IFMT) != (mode & S_IFMT);
}

/**
 * file_start_write - get write access to a superblock for regular file io
 * @file: the file we want to write to
 *
 * This is a variant of sb_start_write() which is a noop on non-regualr file.
 * Should be matched with a call to file_end_write().
 */
static inline void file_start_write(struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return;
        sb_start_write(file_inode(file)->i_sb);
}

static inline bool file_start_write_trylock(struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return true;
        return sb_start_write_trylock(file_inode(file)->i_sb);
}

/**
 * file_end_write - drop write access to a superblock of a regular file
 * @file: the file we wrote to
 *
 * Should be matched with a call to file_start_write().
 */
static inline void file_end_write(struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return;
        sb_end_write(file_inode(file)->i_sb);
}

/**
 * kiocb_start_write - get write access to a superblock for async file io
 * @iocb: the io context we want to submit the write with
 *
 * This is a variant of sb_start_write() for async io submission.
 * Should be matched with a call to kiocb_end_write().
 */
static inline void kiocb_start_write(struct kiocb *iocb)
{
        struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;

        sb_start_write(sb);
        /*
         * Tell lockdep the lock was released, so that it does not complain
         * about a held lock when we return to userspace.
         */
        __sb_writers_release(sb, SB_FREEZE_WRITE);
}

/**
 * kiocb_end_write - drop write access to a superblock after async file io
 * @iocb: the io context we submitted the write with
 *
 * Should be matched with a call to kiocb_start_write().
 */
static inline void kiocb_end_write(struct kiocb *iocb)
{
        struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;

        /*
         * Let lockdep know that freeze protection was inherited from the
         * submission thread before dropping it.
         */
        __sb_writers_acquired(sb, SB_FREEZE_WRITE);
        sb_end_write(sb);
}

/*
 * This is used for regular files where some users -- especially the
 * currently executed binary in a process, previously handled via
 * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap
 * read-write shared) accesses.
 *
 * get_write_access() gets write permission for a file.
 * put_write_access() releases this write permission.
 * deny_write_access() denies write access to a file.
 * allow_write_access() re-enables write access to a file.
 *
 * The i_writecount field of an inode can have the following values:
 * 0: no write access, no denied write access
 * < 0: (-i_writecount) users that denied write access to the file.
 * > 0: (i_writecount) users that have write access to the file.
 *
 * Normally we operate on that counter with atomic_{inc,dec} and it's safe
 * except for the cases where we don't hold i_writecount yet. Then we need to
 * use {get,deny}_write_access() - these functions check the sign and refuse
 * to do the change if sign is wrong.
 */
/*
 * Take one write reference on @inode by incrementing i_writecount, unless
 * the counter is negative.  Returns 0 on success, -ETXTBSY when the
 * increment was refused.
 */
static inline int get_write_access(struct inode *inode)
{
        if (!atomic_inc_unless_negative(&inode->i_writecount))
                return -ETXTBSY;

        return 0;
}
/*
 * Register one more "deny write" user on @file's inode by decrementing
 * i_writecount, unless the counter is positive.  Returns 0 on success,
 * -ETXTBSY when the decrement was refused.
 */
static inline int deny_write_access(struct file *file)
{
        struct inode *inode = file_inode(file);

        if (!atomic_dec_unless_positive(&inode->i_writecount))
                return -ETXTBSY;

        return 0;
}
/*
 * Release a write reference: unconditionally decrements @inode's
 * i_writecount (get_write_access() is the one that increments it).
 */
static inline void put_write_access(struct inode * inode)
{
        atomic_dec(&inode->i_writecount);
}
/*
 * Undo one deny_write_access(): increments i_writecount of @file's inode.
 * A NULL @file is accepted and ignored.
 */
static inline void allow_write_access(struct file *file)
{
        if (!file)
                return;

        atomic_inc(&file_inode(file)->i_writecount);
}
/*
 * True when i_writecount is positive, i.e. at least one user currently
 * holds write access to @inode.
 */
static inline bool inode_is_open_for_write(const struct inode *inode)
{
        return atomic_read(&inode->i_writecount) > 0;
}

/*
 * i_readcount helpers.  The counter is only touched when CONFIG_IMA or
 * CONFIG_FILE_LOCKING is set; otherwise both helpers are empty stubs.
 */
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
/* Drop one reader; the counter going below zero is a fatal bug. */
static inline void i_readcount_dec(struct inode *inode)
{
        int remaining = atomic_dec_return(&inode->i_readcount);

        BUG_ON(remaining < 0);
}

/* Account one more reader on @inode. */
static inline void i_readcount_inc(struct inode *inode)
{
        atomic_inc(&inode->i_readcount);
}
#else
static inline void i_readcount_dec(struct inode *inode)
{
}

static inline void i_readcount_inc(struct inode *inode)
{
}
#endif
extern int do_pipe_flags(int *, int);

extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
extern struct file * open_exec(const char *);
 
/* fs/dcache.c -- generic fs support functions */
extern bool is_subdir(struct dentry *, struct dentry *);
extern bool path_is_under(const struct path *, const struct path *);

extern char *file_path(struct file *, char *, int);

/**
 * is_dot_dotdot - returns true only if @name is "." or ".."
 * @name: file name to check
 * @len: length of file name, in bytes
 *
 * @name does not need to be NUL-terminated; only the first @len bytes
 * (at most two) are looked at.
 */
static inline bool is_dot_dotdot(const char *name, size_t len)
{
        if (!len)
                return false;
        if (!unlikely(name[0] == '.'))
                return false;
        if (len == 1)
                return true;

        return len == 2 && name[1] == '.';
}

#include <linux/err.h>

/* needed for stackable file system support */
extern loff_t default_llseek(struct file *file, loff_t offset, int whence);

extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);

extern int inode_init_always(struct super_block *, struct inode *);
extern void inode_init_once(struct inode *);
extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
extern ino_t iunique(struct super_block *, ino_t);
extern int inode_needs_sync(struct inode *inode);
extern int generic_delete_inode(struct inode *inode);
/*
 * Returns 1 when @inode has no links left (i_nlink == 0) or is unhashed,
 * 0 otherwise.
 */
static inline int generic_drop_inode(struct inode *inode)
{
        if (!inode->i_nlink)
                return 1;

        return inode_unhashed(inode) != 0;
}
extern void d_mark_dontcache(struct inode *inode);

extern struct inode *ilookup5_nowait(struct super_block *sb,
                unsigned long hashval, int (*test)(struct inode *, void *),
                void *data);
extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data);
extern struct inode *ilookup(struct super_block *sb, unsigned long ino);

extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *),
                void *data);
extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
extern struct inode * iget_locked(struct super_block *, unsigned long);
extern struct inode *find_inode_nowait(struct super_block *,
                                       unsigned long,
                                       int (*match)(struct inode *,
                                                    unsigned long, void *),
                                       void *data);
extern struct inode *find_inode_rcu(struct super_block *, unsigned long,
                                    int (*)(struct inode *, void *), void *);
extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long);
extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
extern int insert_inode_locked(struct inode *);
/*
 * lockdep_annotate_inode_mutex_key() only has a real implementation when
 * CONFIG_DEBUG_LOCK_ALLOC is set; otherwise it is an empty stub.
 *
 * The stub body is not followed by a ';' - a stray semicolon after a
 * function definition is an empty file-scope declaration, which ISO C
 * does not allow.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern void lockdep_annotate_inode_mutex_key(struct inode *inode);
#else
static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }
#endif
extern void unlock_new_inode(struct inode *);
extern void discard_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb);
void dump_mapping(const struct address_space *);

/*
 * Userspace may rely on inode numbers being non-zero; glibc, for example,
 * simply ignores files with a zero i_ino in unlink() and other places.
 *
 * To complicate matters, userspace built with _FILE_OFFSET_BITS=32 on a
 * 64-bit kernel only reads out the lower 32 bits, so those are what is
 * tested here.  With _FILE_OFFSET_BITS=64 this may cause some harmless
 * false-negatives, but better safe than sorry.
 */
static inline bool is_zero_ino(ino_t ino)
{
        return !(u32)ino;
}

extern void __iget(struct inode * inode);
extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);
extern struct inode *new_inode_pseudo(struct super_block *sb);
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *);
extern int file_remove_privs_flags(struct file *file, unsigned int flags);
extern int file_remove_privs(struct file *);
int setattr_should_drop_sgid(struct mnt_idmap *idmap,
                             const struct inode *inode);

/*
 * This must be used for allocating filesystem-specific inodes so that the
 * inode reclaim context is set up correctly: the object is allocated from
 * @_cache against the s_inode_lru list of @_sb, with allocation flags @_gfp.
 *
 * Every argument is parenthesized in the expansion so that arbitrary
 * expressions (e.g. "sb + 1" or a ?: expression) bind as the caller
 * intended.
 */
#define alloc_inode_sb(_sb, _cache, _gfp) \
        kmem_cache_alloc_lru((_cache), &(_sb)->s_inode_lru, (_gfp))

extern void __insert_inode_hash(struct inode *, unsigned long hashval);
/*
 * Hash @inode via __insert_inode_hash(), using its own inode number
 * (i_ino) as the hash value.
 */
static inline void insert_inode_hash(struct inode *inode)
{
        __insert_inode_hash(inode, inode->i_ino);
}

extern void __remove_inode_hash(struct inode *);
/*
 * Call __remove_inode_hash() on @inode, but only when the inode is
 * currently hashed and its i_hash node is not a "fake" one.
 */
static inline void remove_inode_hash(struct inode *inode)
{
        if (inode_unhashed(inode))
                return;
        if (hlist_fake(&inode->i_hash))
                return;

        __remove_inode_hash(inode);
}

extern void inode_sb_list_add(struct inode *inode);
extern void inode_add_lru(struct inode *inode);

extern int sb_set_blocksize(struct super_block *, int);
extern int sb_min_blocksize(struct super_block *, int);

extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
int generic_write_checks_count(struct kiocb *iocb, loff_t *count);
extern int generic_write_check_limits(struct file *file, loff_t pos,
                loff_t *count);
extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to,
                ssize_t already_read);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
ssize_t generic_perform_write(struct kiocb *, struct iov_iter *);
ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
                ssize_t direct_written, ssize_t buffered_written);

ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags);
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags);
ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
                           struct iov_iter *iter);
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                            struct iov_iter *iter);

/* fs/splice.c */
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
                            struct pipe_inode_info *pipe,
                            size_t len, unsigned int flags);
ssize_t copy_splice_read(struct file *in, loff_t *ppos,
                         struct pipe_inode_info *pipe,
                         size_t len, unsigned int flags);
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
                struct file *, loff_t *, size_t, unsigned int);


extern void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
#define no_llseek NULL
extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
                int whence, loff_t maxsize, loff_t eof);
extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
                int whence, loff_t size);
extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
extern loff_t no_seek_end_llseek(struct file *, loff_t, int);
int rw_verify_area(int, struct file *, const loff_t *, size_t);
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
extern int stream_open(struct inode * inode, struct file * filp);

#ifdef CONFIG_BLOCK
typedef void (dio_submit_t)(struct bio *bio, struct inode *inode,
                            loff_t file_offset);

/*
 * Bit flags for the @flags argument of __blockdev_direct_IO();
 * blockdev_direct_IO() passes both of them.
 */
enum {
        /* need locking between buffered and direct access */
        DIO_LOCKING        = 0x01,

        /* filesystem does not support filling holes */
        DIO_SKIP_HOLES        = 0x02,
};

ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
                             struct block_device *bdev, struct iov_iter *iter,
                             get_block_t get_block,
                             dio_iodone_t end_io,
                             int flags);

/*
 * Common-case front end for __blockdev_direct_IO(): targets the block
 * device of @inode's superblock, installs no end_io callback and asks for
 * both DIO_LOCKING and DIO_SKIP_HOLES.
 */
static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
                                         struct inode *inode,
                                         struct iov_iter *iter,
                                         get_block_t get_block)
{
        struct block_device *bdev = inode->i_sb->s_bdev;

        return __blockdev_direct_IO(iocb, inode, bdev, iter, get_block, NULL,
                                    DIO_LOCKING | DIO_SKIP_HOLES);
}
#endif

void inode_dio_wait(struct inode *inode);

/**
 * inode_dio_begin - signal start of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * Accounts one more in-flight direct I/O request by incrementing
 * @inode's i_dio_count.  inode_dio_end() performs the matching decrement.
 */
static inline void inode_dio_begin(struct inode *inode)
{
        atomic_inc(&inode->i_dio_count);
}

/**
 * inode_dio_end - signal finish of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * This is called once we've finished processing a direct I/O request,
 * and is used to wake up callers waiting for direct I/O to be quiesced.
 */
static inline void inode_dio_end(struct inode *inode)
{
        if (!atomic_dec_and_test(&inode->i_dio_count))
                return;

        /* i_dio_count just dropped to zero: wake whoever waits on the bit */
        wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}

extern void inode_set_flags(struct inode *inode, unsigned int flags,
                            unsigned int mask);

extern const struct file_operations generic_ro_fops;

/*
 * True when mode @m describes a character device, block device, FIFO or
 * socket.  Note that @m is evaluated more than once.
 */
#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))

extern int readlink_copy(char __user *, int, const char *);
extern int page_readlink(struct dentry *, char __user *, int);
extern const char *page_get_link(struct dentry *, struct inode *,
                                 struct delayed_call *);
extern void page_put_link(void *);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
void __inode_add_bytes(struct inode *inode, loff_t bytes);
void inode_add_bytes(struct inode *inode, loff_t bytes);
void __inode_sub_bytes(struct inode *inode, loff_t bytes);
void inode_sub_bytes(struct inode *inode, loff_t bytes);
static inline loff_t __inode_get_bytes(struct inode *inode)
{
        return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes;
}
loff_t inode_get_bytes(struct inode *inode);
void inode_set_bytes(struct inode *inode, loff_t bytes);
const char *simple_get_link(struct dentry *, struct inode *,
                            struct delayed_call *);
extern const struct inode_operations simple_symlink_inode_operations;

extern int iterate_dir(struct file *, struct dir_context *);

int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
                int flags);
int vfs_fstat(int fd, struct kstat *stat);

/*
 * vfs_fstatat() on @filename relative to AT_FDCWD with no flags set.
 */
static inline int vfs_stat(const char __user *filename, struct kstat *stat)
{
        return vfs_fstatat(AT_FDCWD, filename, stat, 0);
}
/*
 * Same as vfs_stat() but passes AT_SYMLINK_NOFOLLOW to vfs_fstatat().
 */
static inline int vfs_lstat(const char __user *name, struct kstat *stat)
{
        return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW);
}

extern const char *vfs_get_link(struct dentry *, struct delayed_call *);
extern int vfs_readlink(struct dentry *, char __user *, int);

extern struct file_system_type *get_filesystem(struct file_system_type *fs);
extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
extern void drop_super(struct super_block *sb);
extern void drop_super_exclusive(struct super_block *sb);
extern void iterate_supers(void (*)(struct super_block *, void *), void *);
extern void iterate_supers_type(struct file_system_type *,
                                void (*)(struct super_block *, void *), void *);

extern int dcache_dir_open(struct inode *, struct file *);
extern int dcache_dir_close(struct inode *, struct file *);
extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
extern int dcache_readdir(struct file *, struct dir_context *);
extern int simple_setattr(struct mnt_idmap *, struct dentry *,
                          struct iattr *);
extern int simple_getattr(struct mnt_idmap *, const struct path *,
                          struct kstat *, u32, unsigned int);
extern int simple_statfs(struct dentry *, struct kstatfs *);
extern int simple_open(struct inode *inode, struct file *file);
extern int simple_link(struct dentry *, struct inode *, struct dentry *);
extern int simple_unlink(struct inode *, struct dentry *);
extern int simple_rmdir(struct inode *, struct dentry *);
void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry);
extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
                                  struct inode *new_dir, struct dentry *new_dentry);
extern int simple_rename(struct mnt_idmap *, struct inode *,
                         struct dentry *, struct inode *, struct dentry *,
                         unsigned int);
extern void simple_recursive_removal(struct dentry *,
                              void (*callback)(struct dentry *));
extern int noop_fsync(struct file *, loff_t, loff_t, int);
extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
extern int simple_empty(struct dentry *);
extern int simple_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct page **pagep, void **fsdata);
extern const struct address_space_operations ram_aops;
extern int always_delete_dentry(const struct dentry *);
extern struct inode *alloc_anon_inode(struct super_block *);
extern int simple_nosetlease(struct file *, int, struct file_lease **, void **);
extern const struct dentry_operations simple_dentry_operations;

extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
extern const struct file_operations simple_dir_operations;
extern const struct inode_operations simple_dir_inode_operations;
extern void make_empty_dir_inode(struct inode *inode);
extern bool is_empty_dir_inode(struct inode *inode);
/*
 * One entry of the table handed to simple_fill_super(): a name, the
 * file_operations to use for it and its mode.
 */
struct tree_descr {
        const char *name;
        const struct file_operations *ops;
        int mode;
};
struct dentry *d_alloc_name(struct dentry *, const char *);
extern int simple_fill_super(struct super_block *, unsigned long,
                             const struct tree_descr *);
extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
extern void simple_release_fs(struct vfsmount **mount, int *count);

extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
                        loff_t *ppos, const void *from, size_t available);
extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                const void __user *from, size_t count);

/*
 * State operated on by the simple_offset_*() helpers declared below.
 */
struct offset_ctx {
        struct maple_tree        mt;        /* NOTE(review): presumably maps offsets to dentries - confirm in fs/libfs.c */
        unsigned long                next_offset;        /* NOTE(review): presumably the next offset to hand out - confirm */
};

void simple_offset_init(struct offset_ctx *octx);
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
int simple_offset_empty(struct dentry *dentry);
int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry);
int simple_offset_rename_exchange(struct inode *old_dir,
                                  struct dentry *old_dentry,
                                  struct inode *new_dir,
                                  struct dentry *new_dentry);
void simple_offset_destroy(struct offset_ctx *octx);

extern const struct file_operations simple_offset_dir_operations;

extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
extern int generic_file_fsync(struct file *, loff_t, loff_t, int);

extern int generic_check_addressable(unsigned, u64);

extern void generic_set_sb_d_ops(struct super_block *sb);

/*
 * True when @sb has a non-NULL s_encoding.  Without CONFIG_UNICODE the
 * field is not looked at and the answer is always false.
 */
static inline bool sb_has_encoding(const struct super_block *sb)
{
#if IS_ENABLED(CONFIG_UNICODE)
        return !!sb->s_encoding;
#else
        return false;
#endif
}

int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
                unsigned int ia_valid);
int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *);
extern int inode_newsize_ok(const struct inode *, loff_t offset);
void setattr_copy(struct mnt_idmap *, struct inode *inode,
                  const struct iattr *attr);

extern int file_update_time(struct file *file);

/*
 * True when @vma is file-backed and the inode hosting that file's mapping
 * satisfies IS_DAX().
 */
static inline bool vma_is_dax(const struct vm_area_struct *vma)
{
        if (!vma->vm_file)
                return false;

        return IS_DAX(vma->vm_file->f_mapping->host);
}

/*
 * True when @vma is a DAX mapping of something other than a character
 * device.  Always false without CONFIG_FS_DAX or for anonymous vmas.
 */
static inline bool vma_is_fsdax(struct vm_area_struct *vma)
{
        if (!IS_ENABLED(CONFIG_FS_DAX))
                return false;
        if (!vma->vm_file || !vma_is_dax(vma))
                return false;

        /* a character device inode here means device-dax, not fs-dax */
        return !S_ISCHR(file_inode(vma->vm_file)->i_mode);
}

/*
 * Translate the open flags of @file into IOCB_* flags:
 * O_APPEND -> IOCB_APPEND, O_DIRECT -> IOCB_DIRECT,
 * O_DSYNC -> IOCB_DSYNC and __O_SYNC -> IOCB_SYNC.
 */
static inline int iocb_flags(struct file *file)
{
        int res = 0;

        res |= (file->f_flags & O_APPEND) ? IOCB_APPEND : 0;
        res |= (file->f_flags & O_DIRECT) ? IOCB_DIRECT : 0;
        res |= (file->f_flags & O_DSYNC) ? IOCB_DSYNC : 0;
        res |= (file->f_flags & __O_SYNC) ? IOCB_SYNC : 0;

        return res;
}

/*
 * Fold per-call RWF_* flags into @ki->ki_flags.
 *
 * Returns 0 on success (including when @flags is 0, in which case @ki is
 * left untouched), or:
 *   -EOPNOTSUPP  @flags has bits outside RWF_SUPPORTED, or RWF_NOWAIT was
 *                requested on a file without FMODE_NOWAIT;
 *   -EINVAL      both RWF_APPEND and RWF_NOAPPEND were given;
 *   -EPERM       RWF_NOAPPEND was given for an IOCB_APPEND kiocb whose
 *                inode satisfies IS_APPEND().
 * @ki->ki_flags is only modified after all checks that can fail.
 */
static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
{
        int kiocb_flags = 0;

        /* make sure there's no overlap between RWF and private IOCB flags */
        BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD);

        if (!flags)
                return 0;
        if (unlikely(flags & ~RWF_SUPPORTED))
                return -EOPNOTSUPP;
        if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND)))
                return -EINVAL;

        if (flags & RWF_NOWAIT) {
                if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
                        return -EOPNOTSUPP;
                kiocb_flags |= IOCB_NOIO;
        }
        /* the supported RWF_* bits are carried over into the kiocb as-is */
        kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
        if (flags & RWF_SYNC)
                kiocb_flags |= IOCB_DSYNC;

        /* RWF_NOAPPEND cancels an inherited IOCB_APPEND for this call */
        if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) {
                if (IS_APPEND(file_inode(ki->ki_filp)))
                        return -EPERM;
                ki->ki_flags &= ~IOCB_APPEND;
        }

        ki->ki_flags |= kiocb_flags;
        return 0;
}

/*
 * Return the inode number of @dentry's parent directory, read with
 * @dentry's d_lock held.
 */
static inline ino_t parent_ino(struct dentry *dentry)
{
        ino_t res;

        /*
         * Don't strictly need d_lock here? If the parent ino could change
         * then surely we'd have a deeper race in the caller?
         */
        spin_lock(&dentry->d_lock);
        res = dentry->d_parent->d_inode->i_ino;
        spin_unlock(&dentry->d_lock);
        return res;
}

/* Transaction based IO helpers */

/*
 * An argresp is stored in an allocated page and holds the
 * size of the argument or response, along with its content.
 * SIMPLE_TRANSACTION_LIMIT is the room left for the content: PAGE_SIZE
 * minus the size of this header.
 */
struct simple_transaction_argresp {
        ssize_t size;        /* size of the argument or response */
        char data[];        /* the content itself (flexible array member) */
};

#define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp))

char *simple_transaction_get(struct file *file, const char __user *buf,
                                size_t size);
ssize_t simple_transaction_read(struct file *file, char __user *buf,
                                size_t size, loff_t *pos);
int simple_transaction_release(struct inode *inode, struct file *file);

void simple_transaction_set(struct file *file, size_t n);

/*
 * Simple attribute files
 *
 * These attributes behave much like the ones in sysfs:
 *
 * A write to an attribute sets the value immediately, and an open file
 * may be written to any number of times.
 *
 * A read from an attribute builds a buffer from the value, which may be
 * consumed over several read calls.  Once the attribute has been read
 * completely, no further reads are possible until the file is opened
 * again.
 *
 * Every attribute holds a text representation of a numeric value that is
 * accessed through the get() and set() functions.
 *
 * DEFINE_SIMPLE_ATTRIBUTE_XSIGNED() emits an ->open handler named
 * <__fops>_open plus a file_operations object named <__fops>;
 * @__is_signed picks simple_attr_write_signed over simple_attr_write.
 */
#define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \
static int __fops ## _open(struct inode *inode, struct file *file)        \
{                                                                         \
        /* compile-time check of __fmt against a 64-bit argument */      \
        __simple_attr_check_format(__fmt, 0ull);                          \
        return simple_attr_open(inode, file, __get, __set, __fmt);        \
}                                                                         \
static const struct file_operations __fops = {                            \
        .owner   = THIS_MODULE,                                           \
        .llseek  = generic_file_llseek,                                   \
        .read    = simple_attr_read,                                      \
        .write   = (__is_signed) ? simple_attr_write_signed               \
                                 : simple_attr_write,                     \
        .open    = __fops ## _open,                                       \
        .release = simple_attr_release,                                   \
}

/* Unsigned flavour: writes are handled by simple_attr_write(). */
#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt)                \
        DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false)

/* Signed flavour: writes are handled by simple_attr_write_signed(). */
#define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt)        \
        DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true)

/*
 * Empty printf-annotated function: calling it with a format string and
 * sample arguments makes the compiler type-check them against each other.
 */
static inline __printf(1, 2)
void __simple_attr_check_format(const char *fmt, ...)
{
        /* don't do anything, just let the compiler check the arguments; */
}

/*
 * Helpers wired into the file_operations generated by
 * DEFINE_SIMPLE_ATTRIBUTE*().  Only the prototypes are visible here; the
 * implementations live elsewhere.
 */

/* Called from the generated __fops##_open() with the get/set callbacks and format. */
int simple_attr_open(struct inode *inode, struct file *file,
                     int (*get)(void *, u64 *), int (*set)(void *, u64),
                     const char *fmt);
/* Installed as .release. */
int simple_attr_release(struct inode *inode, struct file *file);
/* Installed as .read. */
ssize_t simple_attr_read(struct file *file, char __user *buf,
                         size_t len, loff_t *ppos);
/* Installed as .write for the unsigned variant. */
ssize_t simple_attr_write(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos);
/* Installed as .write for the signed variant. */
ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
                                 size_t len, loff_t *ppos);

/* Forward declaration only; the structure is defined elsewhere. */
struct ctl_table;
/* __init helper taking a buffer and its size; implementation not visible here. */
int __init list_bdev_fs_names(char *buf, size_t size);

/* FMODE_EXEC / FMODE_NONOTIFY force-cast to int. */
#define __FMODE_EXEC                ((__force int) FMODE_EXEC)
#define __FMODE_NONOTIFY        ((__force int) FMODE_NONOTIFY)

/*
 * Look up the byte at index (x & O_ACCMODE) in the four-entry table
 * { 4, 2, 6, 6 }.
 */
#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])

/*
 * Build an fmode_t from open flags: ((flag + 1) & O_ACCMODE) OR-ed with the
 * __FMODE_NONOTIFY bit of flag.
 *
 * @flag is parenthesized at each use so that low-precedence argument
 * expressions (e.g. "cond ? O_RDWR : O_RDONLY") are evaluated as a whole
 * before the "+ 1" and "&" are applied.  Note that @flag is still expanded
 * twice, so it must not have side effects.
 */
#define OPEN_FMODE(flag) ((__force fmode_t)((((flag) + 1) & O_ACCMODE) | \
                                            ((flag) & __FMODE_NONOTIFY)))

/* True when @mode has the set-user-ID or set-group-ID bit (or both) set. */
static inline bool is_sxid(umode_t mode)
{
        const umode_t sxid_bits = S_ISUID | S_ISGID;

        return (mode & sxid_bits) != 0;
}

/*
 * Sticky-bit check for @inode inside directory @dir.
 *
 * Returns 0 straight away when @dir does not have S_ISVTX set; otherwise
 * returns whatever __check_sticky() decides.
 */
static inline int check_sticky(struct mnt_idmap *idmap,
                               struct inode *dir, struct inode *inode)
{
        if (dir->i_mode & S_ISVTX)
                return __check_sticky(idmap, dir, inode);

        return 0;
}

/*
 * Set S_NOSEC in @inode->i_flags, but only if the mode has neither setuid
 * nor setgid and the superblock has SB_NOSEC set.
 */
static inline void inode_has_no_xattr(struct inode *inode)
{
        if (is_sxid(inode->i_mode))
                return;
        if (!(inode->i_sb->s_flags & SB_NOSEC))
                return;

        inode->i_flags |= S_NOSEC;
}

/* True when @inode is the inode behind its superblock's root dentry. */
static inline bool is_root_inode(struct inode *inode)
{
        struct inode *root_inode = inode->i_sb->s_root->d_inode;

        return root_inode == inode;
}

/*
 * Pass one directory entry to the context's actor callback at the current
 * ctx->pos and return the actor's result.
 */
static inline bool dir_emit(struct dir_context *ctx,
                            const char *name, int namelen,
                            u64 ino, unsigned type)
{
        bool result = ctx->actor(ctx, name, namelen, ctx->pos, ino, type);

        return result;
}
static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx)
{
        return ctx->actor(ctx, ".", 1, ctx->pos,
                          file->f_path.dentry->d_inode->i_ino, DT_DIR);
}
static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx)
{
        return ctx->actor(ctx, "..", 2, ctx->pos,
                          parent_ino(file->f_path.dentry), DT_DIR);
}
/*
 * Emit "." and ".." as needed, driven by ctx->pos:
 *   pos == 0: emit ".", then advance pos to 1;
 *   pos == 1: emit "..", then advance pos to 2.
 * Returns false as soon as one of the emits fails (pos is then left
 * unchanged for that step), true otherwise.
 */
static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
{
        if (ctx->pos == 0) {
                if (!dir_emit_dot(file, ctx))
                        return false;
                ctx->pos = 1;
        }

        /* Anything other than position 1 means ".." is not due now. */
        if (ctx->pos != 1)
                return true;

        if (dir_emit_dotdot(file, ctx)) {
                ctx->pos = 2;
                return true;
        }
        return false;
}
/*
 * Drop and immediately re-take the inode lock, then report whether the
 * directory is still usable (false if IS_DEADDIR() is now set).
 */
static inline bool dir_relax(struct inode *inode)
{
        inode_unlock(inode);
        inode_lock(inode);

        if (IS_DEADDIR(inode))
                return false;
        return true;
}

/*
 * Shared-lock variant of dir_relax(): drop and re-take the shared inode
 * lock, then return false if IS_DEADDIR() is set, true otherwise.
 */
static inline bool dir_relax_shared(struct inode *inode)
{
        inode_unlock_shared(inode);
        inode_lock_shared(inode);

        if (IS_DEADDIR(inode))
                return false;
        return true;
}

/* Declarations only; the implementations are not part of this header. */
extern bool path_noexec(const struct path *path);
extern void inode_nohighmem(struct inode *inode);

/* mm/fadvise.c */
/* Both take a file, a byte range (offset, len) and an advice value. */
extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
                       int advice);
extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
                           int advice);

#endif /* _LINUX_FS_H */
































































































    1 












































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PARAVIRT_H
#define _ASM_X86_PARAVIRT_H
/* Various instructions on x86 need to be replaced for
 * para-virtualization: those hooks are defined here. */

#include <asm/paravirt_types.h>

#ifndef __ASSEMBLY__
struct mm_struct;
#endif

#ifdef CONFIG_PARAVIRT
#include <asm/pgtable_types.h>
#include <asm/asm.h>
#include <asm/nospec-branch.h>

#ifndef __ASSEMBLY__
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/cpumask.h>
#include <linux/static_call_types.h>
#include <asm/frame.h>

/*
 * Functions whose signatures define the pv_steal_clock / pv_sched_clock
 * static calls declared below.
 */
u64 dummy_steal_clock(int cpu);
u64 dummy_sched_clock(void);

/* Static calls used by paravirt_steal_clock() and paravirt_sched_clock(). */
DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock);

/* Takes a u64 (*)(void) callback; presumably retargets pv_sched_clock — confirm in the definition. */
void paravirt_set_sched_clock(u64 (*func)(void));

/* Return the value produced by the pv_sched_clock static call. */
static __always_inline u64 paravirt_sched_clock(void)
{
        return static_call(pv_sched_clock)();
}

/* Static keys exported for steal-time users; defined elsewhere. */
struct static_key;
extern struct static_key paravirt_steal_enabled;
extern struct static_key paravirt_steal_rq_enabled;

/*
 * Native lock helpers and predicates.  The pv_is_native_*() functions return
 * bool; judging by the names they report whether the corresponding pv op
 * still points at the native implementation — confirm in the definitions.
 */
__visible void __native_queued_spin_unlock(struct qspinlock *lock);
bool pv_is_native_spin_unlock(void);
__visible bool __native_vcpu_is_preempted(long cpu);
bool pv_is_native_vcpu_is_preempted(void);

/* Return the value produced by the pv_steal_clock static call for @cpu. */
static inline u64 paravirt_steal_clock(int cpu)
{
        return static_call(pv_steal_clock)(cpu);
}

/*
 * Real paravirt_set_cap() exists only with CONFIG_PARAVIRT_SPINLOCKS; an
 * empty inline stub is provided at the end of this header otherwise.
 */
#ifdef CONFIG_PARAVIRT_SPINLOCKS
void __init paravirt_set_cap(void);
#endif

/*
 * The paravirtualized I/O functions
 *
 * slow_down_io() invokes the cpu.io_delay pv op once, or four times in total
 * when REALLY_SLOW_IO is defined.
 */
static inline void slow_down_io(void)
{
        PVOP_VCALL0(cpu.io_delay);
#ifdef REALLY_SLOW_IO
        PVOP_VCALL0(cpu.io_delay);
        PVOP_VCALL0(cpu.io_delay);
        PVOP_VCALL0(cpu.io_delay);
#endif
}

/*
 * Native TLB flush routines (declarations only).  Their signatures mirror
 * the __flush_tlb_*() pv wrappers that follow.
 */
void native_flush_tlb_local(void);
void native_flush_tlb_global(void);
void native_flush_tlb_one_user(unsigned long addr);
void native_flush_tlb_multi(const struct cpumask *cpumask,
                             const struct flush_tlb_info *info);

/* Invoke the mmu.flush_tlb_user pv op (note: op name differs from the wrapper name). */
static inline void __flush_tlb_local(void)
{
        PVOP_VCALL0(mmu.flush_tlb_user);
}

/* Invoke the mmu.flush_tlb_kernel pv op (note: op name differs from the wrapper name). */
static inline void __flush_tlb_global(void)
{
        PVOP_VCALL0(mmu.flush_tlb_kernel);
}

/* Invoke the mmu.flush_tlb_one_user pv op for @addr. */
static inline void __flush_tlb_one_user(unsigned long addr)
{
        PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
}

/* Invoke the mmu.flush_tlb_multi pv op with @cpumask and @info. */
static inline void __flush_tlb_multi(const struct cpumask *cpumask,
                                      const struct flush_tlb_info *info)
{
        PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
}

/* Invoke the mmu.tlb_remove_table pv op with @tlb and @table. */
static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        PVOP_VCALL2(mmu.tlb_remove_table, tlb, table);
}

/* Invoke the mmu.exit_mmap pv op for @mm. */
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
        PVOP_VCALL1(mmu.exit_mmap, mm);
}

/* Invoke the mmu.notify_page_enc_status_changed pv op with (@pfn, @npages, @enc). */
static inline void notify_page_enc_status_changed(unsigned long pfn,
                                                  int npages, bool enc)
{
        PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc);
}

#ifdef CONFIG_PARAVIRT_XXL
/* Invoke the cpu.load_sp0 pv op with @sp0. */
static inline void load_sp0(unsigned long sp0)
{
        PVOP_VCALL1(cpu.load_sp0, sp0);
}

/*
 * The paravirtualized CPUID instruction.
 *
 * All four register values are passed by pointer to the cpu.cpuid pv op.
 */
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
                           unsigned int *ecx, unsigned int *edx)
{
        PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx);
}

/*
 * These special macros can be used to get or set a debugging register
 */
/* Return the result of the cpu.get_debugreg pv op for debug register @reg. */
static __always_inline unsigned long paravirt_get_debugreg(int reg)
{
        return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg);
}
/*
 * Store the value of debug register @reg into the lvalue @var.  The
 * assignment is fully parenthesized (like store_tr() and rdpmcl() in this
 * file) so the macro composes correctly inside a larger expression.
 */
#define get_debugreg(var, reg) ((var) = paravirt_get_debugreg(reg))
/*
 * Invoke the cpu.set_debugreg pv op.  Note the argument order: this wrapper
 * takes (@val, @reg) but the pv op is called with (reg, val).
 */
static __always_inline void set_debugreg(unsigned long val, int reg)
{
        PVOP_VCALL2(cpu.set_debugreg, reg, val);
}

/*
 * Control-register accessors.  Each one forwards to the named pv op.  The
 * PVOP_ALT_* forms additionally pass an instruction string and the
 * ALT_NOT_XEN condition; presumably that instruction is patched in place of
 * the call when the condition holds — see <asm/paravirt_types.h> to confirm.
 */

/* Return the result of the cpu.read_cr0 pv op. */
static inline unsigned long read_cr0(void)
{
        return PVOP_CALL0(unsigned long, cpu.read_cr0);
}

/* Invoke the cpu.write_cr0 pv op with @x. */
static inline void write_cr0(unsigned long x)
{
        PVOP_VCALL1(cpu.write_cr0, x);
}

/* Return the result of the mmu.read_cr2 pv op (alternative: "mov %cr2, %rax"). */
static __always_inline unsigned long read_cr2(void)
{
        return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
                                "mov %%cr2, %%rax;", ALT_NOT_XEN);
}

/* Invoke the mmu.write_cr2 pv op with @x. */
static __always_inline void write_cr2(unsigned long x)
{
        PVOP_VCALL1(mmu.write_cr2, x);
}

/* Return the result of the mmu.read_cr3 pv op (alternative: "mov %cr3, %rax"). */
static inline unsigned long __read_cr3(void)
{
        return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
                              "mov %%cr3, %%rax;", ALT_NOT_XEN);
}

/* Invoke the mmu.write_cr3 pv op with @x (alternative: "mov %rdi, %cr3"). */
static inline void write_cr3(unsigned long x)
{
        PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN);
}

/* Invoke the cpu.write_cr4 pv op with @x. */
static inline void __write_cr4(unsigned long x)
{
        PVOP_VCALL1(cpu.write_cr4, x);
}

/* Invoke the irq.safe_halt pv op. */
static __always_inline void arch_safe_halt(void)
{
        PVOP_VCALL0(irq.safe_halt);
}

/* Invoke the irq.halt pv op. */
static inline void halt(void)
{
        PVOP_VCALL0(irq.halt);
}

/* Declared noinstr; defined elsewhere. */
extern noinstr void pv_native_wbinvd(void);

/* Invoke the cpu.wbinvd pv op (alternative under ALT_NOT_XEN: "wbinvd"). */
static __always_inline void wbinvd(void)
{
        PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);
}

/* Return the 64-bit result of the cpu.read_msr pv op for @msr. */
static inline u64 paravirt_read_msr(unsigned msr)
{
        return PVOP_CALL1(u64, cpu.read_msr, msr);
}

/* Invoke the cpu.write_msr pv op; the value is supplied as @low and @high halves. */
static inline void paravirt_write_msr(unsigned msr,
                                      unsigned low, unsigned high)
{
        PVOP_VCALL3(cpu.write_msr, msr, low, high);
}

/* Like paravirt_read_msr(), via cpu.read_msr_safe; @err is passed through to the pv op. */
static inline u64 paravirt_read_msr_safe(unsigned msr, int *err)
{
        return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err);
}

/* Like paravirt_write_msr(), via cpu.write_msr_safe; returns the pv op's int result. */
static inline int paravirt_write_msr_safe(unsigned msr,
                                          unsigned low, unsigned high)
{
        return PVOP_CALL3(int, cpu.write_msr_safe, msr, low, high);
}

/*
 * Read @msr and split the 64-bit result: @val1 receives the low 32 bits,
 * @val2 the high 32 bits.  @val1 and @val2 are lvalues, not pointers.
 */
#define rdmsr(msr, val1, val2)                        \
do {                                                \
        u64 _l = paravirt_read_msr(msr);        \
        val1 = (u32)_l;                                \
        val2 = _l >> 32;                        \
} while (0)

/* Write @msr from two halves by forwarding to paravirt_write_msr(). */
#define wrmsr(msr, val1, val2)                        \
do {                                                \
        paravirt_write_msr(msr, val1, val2);        \
} while (0)

/* Read @msr and assign the full 64-bit value to the lvalue @val. */
#define rdmsrl(msr, val)                        \
do {                                                \
        val = paravirt_read_msr(msr);                \
} while (0)

/* Write the 64-bit @val to @msr by handing its two 32-bit halves to wrmsr(). */
static inline void wrmsrl(unsigned msr, u64 val)
{
        const u32 lo = (u32)val;
        const u32 hi = (u32)(val >> 32);

        wrmsr(msr, lo, hi);
}

/* Alias for paravirt_write_msr_safe(); evaluates to its int result. */
#define wrmsr_safe(msr, a, b)        paravirt_write_msr_safe(msr, a, b)

/*
 * rdmsr with exception handling
 *
 * Reads @msr through paravirt_read_msr_safe(), stores the low 32 bits through
 * pointer @a and the high 32 bits through pointer @b, and evaluates to the
 * error code reported by paravirt_read_msr_safe().
 *
 * @a and @b are parenthesized before being dereferenced so that pointer
 * expressions such as "buf + 1" work as arguments.
 */
#define rdmsr_safe(msr, a, b)                                \
({                                                        \
        int _err;                                        \
        u64 _l = paravirt_read_msr_safe(msr, &_err);        \
        *(a) = (u32)_l;                                        \
        *(b) = _l >> 32;                                \
        _err;                                                \
})

static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
{
        int err;

        *p = paravirt_read_msr_safe(msr, &err);
        return err;
}

/* Return the 64-bit result of the cpu.read_pmc pv op for @counter. */
static inline unsigned long long paravirt_read_pmc(int counter)
{
        return PVOP_CALL1(u64, cpu.read_pmc, counter);
}

/* Read @counter and split it: @low gets the low 32 bits, @high the upper 32. */
#define rdpmc(counter, low, high)                \
do {                                                \
        u64 _l = paravirt_read_pmc(counter);        \
        low = (u32)_l;                                \
        high = _l >> 32;                        \
} while (0)

/* Read @counter and assign the full value to the lvalue @val. */
#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))

/* Invoke the cpu.alloc_ldt pv op with (@ldt, @entries). */
static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
        PVOP_VCALL2(cpu.alloc_ldt, ldt, entries);
}

/* Invoke the cpu.free_ldt pv op with (@ldt, @entries). */
static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
{
        PVOP_VCALL2(cpu.free_ldt, ldt, entries);
}

/* Invoke the cpu.load_tr_desc pv op. */
static inline void load_TR_desc(void)
{
        PVOP_VCALL0(cpu.load_tr_desc);
}
/* Invoke the cpu.load_gdt pv op with @dtr. */
static inline void load_gdt(const struct desc_ptr *dtr)
{
        PVOP_VCALL1(cpu.load_gdt, dtr);
}
/* Invoke the cpu.load_idt pv op with @dtr. */
static inline void load_idt(const struct desc_ptr *dtr)
{
        PVOP_VCALL1(cpu.load_idt, dtr);
}
/* Invoke the cpu.set_ldt pv op with (@addr, @entries). */
static inline void set_ldt(const void *addr, unsigned entries)
{
        PVOP_VCALL2(cpu.set_ldt, addr, entries);
}
/* Return the result of the cpu.store_tr pv op. */
static inline unsigned long paravirt_store_tr(void)
{
        return PVOP_CALL0(unsigned long, cpu.store_tr);
}

/* Assign paravirt_store_tr()'s result to the lvalue @tr. */
#define store_tr(tr)        ((tr) = paravirt_store_tr())
/* Invoke the cpu.load_tls pv op with (@t, @cpu). */
static inline void load_TLS(struct thread_struct *t, unsigned cpu)
{
        PVOP_VCALL2(cpu.load_tls, t, cpu);
}

/* Invoke the cpu.load_gs_index pv op with @gs. */
static inline void load_gs_index(unsigned int gs)
{
        PVOP_VCALL1(cpu.load_gs_index, gs);
}

/* Invoke the cpu.write_ldt_entry pv op with (@dt, @entry, @desc). */
static inline void write_ldt_entry(struct desc_struct *dt, int entry,
                                   const void *desc)
{
        PVOP_VCALL3(cpu.write_ldt_entry, dt, entry, desc);
}

/* Invoke the cpu.write_gdt_entry pv op with (@dt, @entry, @desc, @type). */
static inline void write_gdt_entry(struct desc_struct *dt, int entry,
                                   void *desc, int type)
{
        PVOP_VCALL4(cpu.write_gdt_entry, dt, entry, desc, type);
}

/* Invoke the cpu.write_idt_entry pv op with (@dt, @entry, @g). */
static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
{
        PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g);
}

/* I/O bitmap hooks; only built when CONFIG_X86_IOPL_IOPERM is set. */
#ifdef CONFIG_X86_IOPL_IOPERM
/* Invoke the cpu.invalidate_io_bitmap pv op. */
static inline void tss_invalidate_io_bitmap(void)
{
        PVOP_VCALL0(cpu.invalidate_io_bitmap);
}

/* Invoke the cpu.update_io_bitmap pv op. */
static inline void tss_update_io_bitmap(void)
{
        PVOP_VCALL0(cpu.update_io_bitmap);
}
#endif

/* Invoke the mmu.enter_mmap pv op for @next. */
static inline void paravirt_enter_mmap(struct mm_struct *next)
{
        PVOP_VCALL1(mmu.enter_mmap, next);
}

/* Return the int result of the mmu.pgd_alloc pv op for @mm. */
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
{
        return PVOP_CALL1(int, mmu.pgd_alloc, mm);
}

/* Invoke the mmu.pgd_free pv op with (@mm, @pgd). */
static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        PVOP_VCALL2(mmu.pgd_free, mm, pgd);
}

/*
 * Page-table page alloc/release hooks.  Each alloc wrapper passes (@mm, @pfn)
 * to mmu.alloc_<level>; each release wrapper passes @pfn to
 * mmu.release_<level>.
 */
static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
        PVOP_VCALL2(mmu.alloc_pte, mm, pfn);
}
static inline void paravirt_release_pte(unsigned long pfn)
{
        PVOP_VCALL1(mmu.release_pte, pfn);
}

static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
        PVOP_VCALL2(mmu.alloc_pmd, mm, pfn);
}

static inline void paravirt_release_pmd(unsigned long pfn)
{
        PVOP_VCALL1(mmu.release_pmd, pfn);
}

static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
        PVOP_VCALL2(mmu.alloc_pud, mm, pfn);
}
static inline void paravirt_release_pud(unsigned long pfn)
{
        PVOP_VCALL1(mmu.release_pud, pfn);
}

static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
{
        PVOP_VCALL2(mmu.alloc_p4d, mm, pfn);
}

static inline void paravirt_release_p4d(unsigned long pfn)
{
        PVOP_VCALL1(mmu.release_p4d, pfn);
}

/*
 * PTE/PGD value conversions.  Each forwards the raw value to the named pv op
 * and supplies "mov %rdi, %rax" as the ALT_NOT_XEN alternative, i.e. the
 * alternative copies the first argument register to the return register.
 */

/* Build a pte_t from the result of mmu.make_pte(@val). */
static inline pte_t __pte(pteval_t val)
{
        return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val,
                                          "mov %%rdi, %%rax", ALT_NOT_XEN) };
}

/* Return the result of mmu.pte_val(@pte.pte). */
static inline pteval_t pte_val(pte_t pte)
{
        return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte,
                                "mov %%rdi, %%rax", ALT_NOT_XEN);
}

/* Build a pgd_t from the result of mmu.make_pgd(@val). */
static inline pgd_t __pgd(pgdval_t val)
{
        return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val,
                                          "mov %%rdi, %%rax", ALT_NOT_XEN) };
}

/* Return the result of mmu.pgd_val(@pgd.pgd). */
static inline pgdval_t pgd_val(pgd_t pgd)
{
        return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd,
                                "mov %%rdi, %%rax", ALT_NOT_XEN);
}

#define  __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
/*
 * Invoke the mmu.ptep_modify_prot_start pv op with (@vma, @addr, @ptep) and
 * wrap the raw pteval_t it returns in a pte_t.
 */
static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
                                           pte_t *ptep)
{
        return (pte_t) {
                .pte = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start,
                                  vma, addr, ptep)
        };
}

/*
 * Invoke the mmu.ptep_modify_prot_commit pv op with (@vma, @addr, @ptep,
 * @pte.pte).  @old_pte is accepted but not used by this wrapper.
 */
static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
                                           pte_t *ptep, pte_t old_pte, pte_t pte)
{

        PVOP_VCALL4(mmu.ptep_modify_prot_commit, vma, addr, ptep, pte.pte);
}

/* Invoke the mmu.set_pte pv op with @ptep and the raw value @pte.pte. */
static inline void set_pte(pte_t *ptep, pte_t pte)
{
        PVOP_VCALL2(mmu.set_pte, ptep, pte.pte);
}

/* Invoke the mmu.set_pmd pv op with @pmdp and native_pmd_val(@pmd). */
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
        PVOP_VCALL2(mmu.set_pmd, pmdp, native_pmd_val(pmd));
}

/* Build a pmd_t from the result of mmu.make_pmd(@val). */
static inline pmd_t __pmd(pmdval_t val)
{
        return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val,
                                          "mov %%rdi, %%rax", ALT_NOT_XEN) };
}

/* Return the result of mmu.pmd_val(@pmd.pmd). */
static inline pmdval_t pmd_val(pmd_t pmd)
{
        return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd,
                                "mov %%rdi, %%rax", ALT_NOT_XEN);
}

/* Invoke the mmu.set_pud pv op with @pudp and native_pud_val(@pud). */
static inline void set_pud(pud_t *pudp, pud_t pud)
{
        PVOP_VCALL2(mmu.set_pud, pudp, native_pud_val(pud));
}

/* Build a pud_t from the result of mmu.make_pud(@val). */
static inline pud_t __pud(pudval_t val)
{
        return (pud_t) { PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val,
                                          "mov %%rdi, %%rax", ALT_NOT_XEN) };
}

/* Return the result of mmu.pud_val(@pud.pud). */
static inline pudval_t pud_val(pud_t pud)
{
        return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud,
                                "mov %%rdi, %%rax", ALT_NOT_XEN);
}

/* Clear *@pudp by storing native_make_pud(0) through set_pud(). */
static inline void pud_clear(pud_t *pudp)
{
        const pud_t empty = native_make_pud(0);

        set_pud(pudp, empty);
}

/* Invoke the mmu.set_p4d pv op with @p4dp and native_p4d_val(@p4d). */
static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
{
        PVOP_VCALL2(mmu.set_p4d, p4dp, native_p4d_val(p4d));
}

#if CONFIG_PGTABLE_LEVELS >= 5

/* Build a p4d_t from the result of mmu.make_p4d(@val). */
static inline p4d_t __p4d(p4dval_t val)
{
        return (p4d_t) { PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val,
                                          "mov %%rdi, %%rax", ALT_NOT_XEN) };
}

/* Return the result of mmu.p4d_val(@p4d.p4d). */
static inline p4dval_t p4d_val(p4d_t p4d)
{
        return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d,
                                "mov %%rdi, %%rax", ALT_NOT_XEN);
}

/* Invoke the mmu.set_pgd pv op with @pgdp and native_pgd_val(@pgd). */
static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
{
        PVOP_VCALL2(mmu.set_pgd, pgdp, native_pgd_val(pgd));
}

/*
 * Store @pgdval at @pgdp.  With 5-level paging enabled this goes through
 * __set_pgd(); otherwise @pgdp is reinterpreted as a p4d_t pointer and the
 * value is stored via set_p4d().
 */
#define set_pgd(pgdp, pgdval) do {                                        \
        if (pgtable_l5_enabled())                                                \
                __set_pgd(pgdp, pgdval);                                \
        else                                                                \
                set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd });        \
} while (0)

/*
 * Clear the entry at @pgdp by storing native_make_pgd(0); does nothing
 * unless 5-level paging is enabled.
 */
#define pgd_clear(pgdp) do {                                                \
        if (pgtable_l5_enabled())                                        \
                set_pgd(pgdp, native_make_pgd(0));                        \
} while (0)

#endif  /* CONFIG_PGTABLE_LEVELS >= 5 */

/* Clear *@p4dp by storing native_make_p4d(0) through set_p4d(). */
static inline void p4d_clear(p4d_t *p4dp)
{
        const p4d_t empty = native_make_p4d(0);

        set_p4d(p4dp, empty);
}

/* Same as set_pte(): simply forwards (@ptep, @pte) to it. */
static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
{
        set_pte(ptep, pte);
}

/*
 * Clear *@ptep by storing native_make_pte(0) through set_pte().
 * @mm and @addr are part of the signature but are not used here.
 */
static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
                             pte_t *ptep)
{
        const pte_t empty = native_make_pte(0);

        set_pte(ptep, empty);
}

/* Clear *@pmdp by storing native_make_pmd(0) through set_pmd(). */
static inline void pmd_clear(pmd_t *pmdp)
{
        const pmd_t empty = native_make_pmd(0);

        set_pmd(pmdp, empty);
}

/* Announces that this header provides arch_start_context_switch(). */
#define  __HAVE_ARCH_START_CONTEXT_SWITCH
/* Invoke the cpu.start_context_switch pv op with @prev. */
static inline void arch_start_context_switch(struct task_struct *prev)
{
        PVOP_VCALL1(cpu.start_context_switch, prev);
}

/* Invoke the cpu.end_context_switch pv op with @next. */
static inline void arch_end_context_switch(struct task_struct *next)
{
        PVOP_VCALL1(cpu.end_context_switch, next);
}

/* Announces that this header provides the arch_*_lazy_mmu_mode() hooks. */
#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
/* Invoke the mmu.lazy_mode.enter pv op. */
static inline void arch_enter_lazy_mmu_mode(void)
{
        PVOP_VCALL0(mmu.lazy_mode.enter);
}

/* Invoke the mmu.lazy_mode.leave pv op. */
static inline void arch_leave_lazy_mmu_mode(void)
{
        PVOP_VCALL0(mmu.lazy_mode.leave);
}

/* Invoke the mmu.lazy_mode.flush pv op. */
static inline void arch_flush_lazy_mmu_mode(void)
{
        PVOP_VCALL0(mmu.lazy_mode.flush);
}

/*
 * Call pv_ops.mmu.set_fixmap(@idx, @phys, @flags).  Unlike the other
 * wrappers in this header this is a plain indirect call through pv_ops, not
 * a PVOP_* macro.
 */
static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
                                phys_addr_t phys, pgprot_t flags)
{
        pv_ops.mmu.set_fixmap(idx, phys, flags);
}
#endif

#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)

/* Invoke the lock.queued_spin_lock_slowpath pv op with (@lock, @val). */
static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
                                                        u32 val)
{
        PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val);
}

/*
 * Invoke the lock.queued_spin_unlock pv op.  The alternative supplied under
 * ALT_NOT(X86_FEATURE_PVUNLOCK) is a single byte store of 0 to the address
 * in the first argument register.
 */
static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
{
        PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock,
                          "movb $0, (%%" _ASM_ARG1 ");",
                          ALT_NOT(X86_FEATURE_PVUNLOCK));
}

/* Invoke the lock.wait pv op with (@ptr, @val). */
static __always_inline void pv_wait(u8 *ptr, u8 val)
{
        PVOP_VCALL2(lock.wait, ptr, val);
}

/* Invoke the lock.kick pv op for @cpu. */
static __always_inline void pv_kick(int cpu)
{
        PVOP_VCALL1(lock.kick, cpu);
}

/*
 * Return the result of the lock.vcpu_is_preempted pv op for @cpu.  The
 * alternative supplied under ALT_NOT(X86_FEATURE_VCPUPREEMPT) zeroes the
 * return register, i.e. yields false.
 */
static __always_inline bool pv_vcpu_is_preempted(long cpu)
{
        return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu,
                                "xor %%" _ASM_AX ", %%" _ASM_AX ";",
                                ALT_NOT(X86_FEATURE_VCPUPREEMPT));
}

/*
 * Callee-save thunk symbols for the native helpers; the names follow the
 * "__raw_callee_save_" prefix produced by PV_THUNK_NAME().
 */
void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
bool __raw_callee_save___native_vcpu_is_preempted(long cpu);

#endif /* SMP && PARAVIRT_SPINLOCKS */

#ifdef CONFIG_X86_32
/*
 * save and restore all caller-save registers, except return value
 *
 * 32-bit: only %ecx is pushed/popped.
 */
#define PV_SAVE_ALL_CALLER_REGS                "pushl %ecx;"
#define PV_RESTORE_ALL_CALLER_REGS        "popl  %ecx;"
#else
/*
 * save and restore all caller-save registers, except return value
 *
 * 64-bit: %rcx, %rdx, %rsi, %rdi and %r8-%r11 are pushed, then popped in
 * the exact reverse order.
 */
#define PV_SAVE_ALL_CALLER_REGS                                                \
        "push %rcx;"                                                        \
        "push %rdx;"                                                        \
        "push %rsi;"                                                        \
        "push %rdi;"                                                        \
        "push %r8;"                                                        \
        "push %r9;"                                                        \
        "push %r10;"                                                        \
        "push %r11;"
#define PV_RESTORE_ALL_CALLER_REGS                                        \
        "pop %r11;"                                                        \
        "pop %r10;"                                                        \
        "pop %r9;"                                                        \
        "pop %r8;"                                                        \
        "pop %rdi;"                                                        \
        "pop %rsi;"                                                        \
        "pop %rdx;"                                                        \
        "pop %rcx;"
#endif

/*
 * Generate a thunk around a function which saves all caller-save
 * registers except for the return value.  This allows C functions to
 * be called from assembler code where fewer than normal registers are
 * available.  It may also help code generation around calls from C
 * code if the common case doesn't use many registers.
 *
 * When a callee is wrapped in a thunk, the caller can assume that all
 * arg regs and all scratch registers are preserved across the
 * call. The return value in rax/eax will not be saved, even for void
 * functions.
 */
#define PV_THUNK_NAME(func) "__raw_callee_save_" #func
#define __PV_CALLEE_SAVE_REGS_THUNK(func, section)                        \
        extern typeof(func) __raw_callee_save_##func;                        \
                                                                        \
        asm(".pushsection " section ", \"ax\";"                                \
            ".globl " PV_THUNK_NAME(func) ";"                                \
            ".type " PV_THUNK_NAME(func) ", @function;"                        \
            ASM_FUNC_ALIGN                                                \
            PV_THUNK_NAME(func) ":"                                        \
            ASM_ENDBR                                                        \
            FRAME_BEGIN                                                        \
            PV_SAVE_ALL_CALLER_REGS                                        \
            "call " #func ";"                                                \
            PV_RESTORE_ALL_CALLER_REGS                                        \
            FRAME_END                                                        \
            ASM_RET                                                        \
            ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";"        \
            ".popsection")

#define PV_CALLEE_SAVE_REGS_THUNK(func)                        \
        __PV_CALLEE_SAVE_REGS_THUNK(func, ".text")

/* Get a reference to a callee-save function */
#define PV_CALLEE_SAVE(func)                                                \
        ((struct paravirt_callee_save) { __raw_callee_save_##func })

/* Promise that "func" already uses the right calling convention */
#define __PV_IS_CALLEE_SAVE(func)                        \
        ((struct paravirt_callee_save) { func })

#ifdef CONFIG_PARAVIRT_XXL
/* Return the result of the irq.save_fl pv op (ALT_NOT_XEN alternative: "pushf; pop %rax"). */
static __always_inline unsigned long arch_local_save_flags(void)
{
        return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;",
                                ALT_NOT_XEN);
}

/* Invoke the irq.irq_disable pv op (ALT_NOT_XEN alternative: "cli"). */
static __always_inline void arch_local_irq_disable(void)
{
        PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN);
}

/* Invoke the irq.irq_enable pv op (ALT_NOT_XEN alternative: "sti"). */
static __always_inline void arch_local_irq_enable(void)
{
        PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN);
}

/*
 * Capture the current flags with arch_local_save_flags(), then disable
 * interrupts with arch_local_irq_disable(); returns the captured flags.
 */
static __always_inline unsigned long arch_local_irq_save(void)
{
        const unsigned long flags = arch_local_save_flags();

        arch_local_irq_disable();
        return flags;
}
#endif


/*
 * Make sure as little as possible of this mess escapes.
 *
 * NOTE(review): only the plain PVOP_CALLn/PVOP_VCALLn helpers are undefined
 * here; the PVOP_ALT_* and *CALLEE* variants used above are not in the list —
 * confirm whether that is intentional.
 */
#undef PARAVIRT_CALL
#undef __PVOP_CALL
#undef __PVOP_VCALL
#undef PVOP_VCALL0
#undef PVOP_CALL0
#undef PVOP_VCALL1
#undef PVOP_CALL1
#undef PVOP_VCALL2
#undef PVOP_CALL2
#undef PVOP_VCALL3
#undef PVOP_CALL3
#undef PVOP_VCALL4
#undef PVOP_CALL4

/* CONFIG_PARAVIRT declarations; the !CONFIG_PARAVIRT branch below provides replacements. */
extern void default_banner(void);
void native_pv_lock_init(void) __init;

#else  /* __ASSEMBLY__ */

#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
#ifdef CONFIG_DEBUG_ENTRY

/* RIP-relative indirect operand for a pv_ops slot. */
#define PARA_INDIRECT(addr)        *addr(%rip)

/* Indirect call through the pv_ops slot at offset PV_IRQ_save_fl, annotated retpoline-safe. */
.macro PARA_IRQ_save_fl
        ANNOTATE_RETPOLINE_SAFE;
        call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);
.endm

/*
 * Assembly-side flags read: the default is the indirect pv call above, with
 * two alternatives — "ALT_CALL_INSTR" under ALT_CALL_ALWAYS and
 * "pushf; pop %rax" under ALT_NOT_XEN.
 */
#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;",                        \
                                 "ALT_CALL_INSTR;", ALT_CALL_ALWAYS,        \
                                 "pushf; pop %rax;", ALT_NOT_XEN
#endif
#endif /* CONFIG_PARAVIRT_XXL */
#endif        /* CONFIG_X86_64 */

#endif /* __ASSEMBLY__ */
#else  /* CONFIG_PARAVIRT */
/* Without CONFIG_PARAVIRT, default_banner is an alias for x86_init_noop. */
# define default_banner x86_init_noop

#ifndef __ASSEMBLY__
/* Without CONFIG_PARAVIRT there is nothing to initialise: empty stub. */
static inline void native_pv_lock_init(void)
{
}
#endif
#endif /* !CONFIG_PARAVIRT */

#ifndef __ASSEMBLY__
/* Empty fallback when CONFIG_PARAVIRT_XXL does not supply the real hook. */
#ifndef CONFIG_PARAVIRT_XXL
static inline void paravirt_enter_mmap(struct mm_struct *mm)
{
}
#endif

/* Empty fallback when CONFIG_PARAVIRT does not supply the real hook. */
#ifndef CONFIG_PARAVIRT
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
}
#endif

/* Empty fallback when CONFIG_PARAVIRT_SPINLOCKS does not supply the real function. */
#ifndef CONFIG_PARAVIRT_SPINLOCKS
static inline void paravirt_set_cap(void)
{
}
#endif
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PARAVIRT_H */























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 

































































































































































































    9 







    9 









































































































































































    3 





























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Portions Copyright (C) 1992 Drew Eckhardt
 */
#ifndef _LINUX_BLKDEV_H
#define _LINUX_BLKDEV_H

#include <linux/types.h>
#include <linux/blk_types.h>
#include <linux/device.h>
#include <linux/list.h>
#include <linux/llist.h>
#include <linux/minmax.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/gfp.h>
#include <linux/kdev_t.h>
#include <linux/rcupdate.h>
#include <linux/percpu-refcount.h>
#include <linux/blkzoned.h>
#include <linux/sched.h>
#include <linux/sbitmap.h>
#include <linux/uuid.h>
#include <linux/xarray.h>
#include <linux/file.h>

struct module;
struct request_queue;
struct elevator_queue;
struct blk_trace;
struct request;
struct sg_io_hdr;
struct blkcg_gq;
struct blk_flush_queue;
struct kiocb;
struct pr_ops;
struct rq_qos;
struct blk_queue_stats;
struct blk_stat_callback;
struct blk_crypto_profile;

extern const struct device_type disk_type;
extern const struct device_type part_type;
extern const struct class block_class;

/*
 * Maximum number of blkcg policies allowed to be registered concurrently.
 * Defined here to simplify include dependency.
 */
#define BLKCG_MAX_POLS                6

#define DISK_MAX_PARTS                        256
/* Size in bytes of the disk_name buffer in struct gendisk. */
#define DISK_NAME_LEN                        32

/* Size in bytes of the volname buffer in struct partition_meta_info. */
#define PARTITION_META_INFO_VOLNAMELTH        64
/*
 * Enough for the string representation of any kind of UUID plus the
 * terminating NUL byte.
 * EFI UUID is 36 characters. MSDOS UUID is 11 characters.
 */
#define PARTITION_META_INFO_UUIDLTH        (UUID_STRING_LEN + 1)

/*
 * Partition metadata: a UUID string buffer and a volume name buffer, sized by
 * PARTITION_META_INFO_UUIDLTH and PARTITION_META_INFO_VOLNAMELTH respectively.
 */
struct partition_meta_info {
        char uuid[PARTITION_META_INFO_UUIDLTH];
        u8 volname[PARTITION_META_INFO_VOLNAMELTH];
};

/**
 * DOC: genhd capability flags
 *
 * Each flag occupies its own bit; the flags are tested as masks against the
 * ``flags`` member of struct gendisk (see disk_has_partscan()).
 *
 * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to
 * removable media.  When set, the device remains present even when media is not
 * inserted.  Shall not be set for devices which are removed entirely when the
 * media is removed.
 *
 * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events,
 * doesn't appear in sysfs, and can't be opened from userspace or using
 * blkdev_get*. Used for the underlying components of multipath devices.
 *
 * ``GENHD_FL_NO_PART``: partition support is disabled.  The kernel will not
 * scan for partitions from add_disk, and users can't add partitions manually.
 */
enum {
        GENHD_FL_REMOVABLE                        = 1 << 0,
        GENHD_FL_HIDDEN                                = 1 << 1,
        GENHD_FL_NO_PART                        = 1 << 2,
};

/*
 * Disk event types, one bit per event.
 * NOTE(review): presumably the bits kept in gendisk::events ("supported
 * events") - confirm against the users of that field.
 */
enum {
        DISK_EVENT_MEDIA_CHANGE                        = 1 << 0, /* media changed */
        DISK_EVENT_EJECT_REQUEST                = 1 << 1, /* eject requested */
};

/*
 * Flags that tune how disk events are processed, one bit per flag.
 * NOTE(review): presumably the bits kept in gendisk::event_flags - confirm
 * against the users of that field.
 */
enum {
        /* Poll even if events_poll_msecs is unset */
        DISK_EVENT_FLAG_POLL                        = 1 << 0,
        /* Forward events to udev */
        DISK_EVENT_FLAG_UEVENT                        = 1 << 1,
        /* Block event polling when open for exclusive write */
        DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE        = 1 << 2,
};

struct disk_events;
struct badblocks;

/*
 * Integrity settings.  An instance is embedded in struct request_queue as
 * ->integrity when CONFIG_BLK_DEV_INTEGRITY is enabled.  The meaning of the
 * individual fields is not documented in this header.
 */
struct blk_integrity {
        const struct blk_integrity_profile        *profile;
        unsigned char                                flags;
        unsigned char                                tuple_size;
        unsigned char                                pi_offset;
        unsigned char                                interval_exp;
        unsigned char                                tag_size;
};

/*
 * Open mode flags for block devices.  blk_mode_t is a __bitwise type, so each
 * BLK_OPEN_* constant below is produced with a __force cast of its bit value.
 */
typedef unsigned int __bitwise blk_mode_t;

/* open for reading */
#define BLK_OPEN_READ                ((__force blk_mode_t)(1 << 0))
/* open for writing */
#define BLK_OPEN_WRITE                ((__force blk_mode_t)(1 << 1))
/* open exclusively (vs other exclusive openers) */
#define BLK_OPEN_EXCL                ((__force blk_mode_t)(1 << 2))
/* opened with O_NDELAY */
#define BLK_OPEN_NDELAY                ((__force blk_mode_t)(1 << 3))
/* open for "writes" only for ioctls (special hack for floppy.c) */
#define BLK_OPEN_WRITE_IOCTL        ((__force blk_mode_t)(1 << 4))
/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
#define BLK_OPEN_RESTRICT_WRITES        ((__force blk_mode_t)(1 << 5))
/* return partition scanning errors */
#define BLK_OPEN_STRICT_SCAN        ((__force blk_mode_t)(1 << 6))

/*
 * struct gendisk - per-disk state of the block layer.
 *
 * Members that the inline helpers in this header rely on:
 *  - part0: the block_device the gendisk is refcounted by; disk_openers(),
 *    get_capacity(), get_disk_ro() and disk_to_dev() all go through it.
 *  - queue: the request queue of the disk, consulted by the zone helpers.
 *  - flags: GENHD_FL_* masks (see disk_has_partscan()).
 *  - state: a bitmap indexed by the GD_* bit numbers defined inside the
 *    struct and accessed with test_bit().
 */
struct gendisk {
        /*
         * major/first_minor/minors should not be set by any new driver, the
         * block core will take care of allocating them automatically.
         */
        int major;
        int first_minor;
        int minors;

        char disk_name[DISK_NAME_LEN];        /* name of major driver */

        unsigned short events;                /* supported events */
        unsigned short event_flags;        /* flags related to event processing */

        struct xarray part_tbl;
        struct block_device *part0;

        const struct block_device_operations *fops;
        struct request_queue *queue;
        void *private_data;

        struct bio_set bio_split;

        /* GENHD_FL_* masks */
        int flags;
        /* bit numbers for ->state; these are indices, not masks */
        unsigned long state;
#define GD_NEED_PART_SCAN                0
#define GD_READ_ONLY                        1
#define GD_DEAD                                2
#define GD_NATIVE_CAPACITY                3
#define GD_ADDED                        4
#define GD_SUPPRESS_PART_SCAN                5
#define GD_OWNS_QUEUE                        6

        struct mutex open_mutex;        /* open/close mutex */
        unsigned open_partitions;        /* number of open partitions */

        struct backing_dev_info        *bdi;
        struct kobject queue_kobj;        /* the queue/ directory */
        struct kobject *slave_dir;
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
        struct list_head slave_bdevs;
#endif
        struct timer_rand_state *random;
        atomic_t sync_io;                /* RAID */
        struct disk_events *ev;

#ifdef CONFIG_BLK_DEV_ZONED
        /*
         * Zoned block device information. Reads of this information must be
         * protected with blk_queue_enter() / blk_queue_exit(). Modifying this
         * information is only allowed while no requests are being processed.
         * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue().
         */
        unsigned int                nr_zones;
        unsigned int                zone_capacity;
        unsigned int                last_zone_capacity;
        unsigned long                *conv_zones_bitmap;
        unsigned int            zone_wplugs_hash_bits;
        spinlock_t              zone_wplugs_lock;
        struct mempool_s        *zone_wplugs_pool;
        struct hlist_head       *zone_wplugs_hash;
        struct list_head        zone_wplugs_err_list;
        struct work_struct        zone_wplugs_work;
        struct workqueue_struct *zone_wplugs_wq;
#endif /* CONFIG_BLK_DEV_ZONED */

#if IS_ENABLED(CONFIG_CDROM)
        /* only reachable through disk_to_cdi() */
        struct cdrom_device_info *cdi;
#endif
        int node_id;
        struct badblocks *bb;
        struct lockdep_map lockdep_map;
        u64 diskseq;
        blk_mode_t open_mode;

        /*
         * Independent sector access ranges. This is always NULL for
         * devices that do not have multiple independent access ranges.
         */
        struct blk_independent_access_ranges *ia_ranges;
};

/**
 * disk_openers - returns how many openers are there for a disk
 * @disk: disk to check
 *
 * This returns the number of openers for a disk.  Note that this value is only
 * stable if disk->open_mutex is held.
 *
 * Note: Due to a quirk in the block layer open code, each open partition is
 * only counted once even if there are multiple openers.
 */
static inline unsigned int disk_openers(struct gendisk *disk)
{
        return atomic_read(&disk->part0->bd_openers);
}

/**
 * disk_has_partscan - return %true if partition scanning is enabled on a disk
 * @disk: disk to check
 *
 * Returns %true if partitions scanning is enabled for @disk, or %false if
 * partition scanning is disabled either permanently or temporarily.
 */
static inline bool disk_has_partscan(struct gendisk *disk)
{
        return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) &&
                !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
}

/*
 * The gendisk is refcounted by the part0 block_device, and the bd_device
 * therein is also used for device model presentation in sysfs.
 *
 * dev_to_disk() maps such a struct device to its gendisk through
 * dev_to_bdev(); disk_to_dev() yields the address of the bd_device embedded in
 * the part0 block_device of a gendisk.
 */
#define dev_to_disk(device) \
        (dev_to_bdev(device)->bd_disk)
#define disk_to_dev(disk) \
        (&((disk)->part0->bd_device))

/*
 * Access the cdi member of a gendisk.  Evaluates to NULL (without touching
 * its argument) when CONFIG_CDROM is not reachable from the including code.
 */
#if IS_REACHABLE(CONFIG_CDROM)
#define disk_to_cdi(disk)        ((disk)->cdi)
#else
#define disk_to_cdi(disk)        NULL
#endif

/* Build the dev_t of @disk from its major and first minor number. */
static inline dev_t disk_devt(struct gendisk *disk)
{
        const int major_nr = disk->major;
        const int minor_nr = disk->first_minor;

        return MKDEV(major_nr, minor_nr);
}

static inline int blk_validate_block_size(unsigned long bsize)
{
        if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
                return -EINVAL;

        return 0;
}

/*
 * Returns true when the operation bits of @op (everything selected by
 * REQ_OP_MASK) are REQ_OP_DRV_IN or REQ_OP_DRV_OUT.
 */
static inline bool blk_op_is_passthrough(blk_opf_t op)
{
        const blk_opf_t opcode = op & REQ_OP_MASK;

        if (opcode == REQ_OP_DRV_IN)
                return true;

        return opcode == REQ_OP_DRV_OUT;
}

/*
 * Bounce policy, stored in the bounce member of struct queue_limits.
 *
 * BLK_BOUNCE_NONE:        never bounce (default)
 * BLK_BOUNCE_HIGH:        bounce all highmem pages
 *
 * BLK_BOUNCE_NONE is the first enumerator and therefore has the value 0.
 */
enum blk_bounce {
        BLK_BOUNCE_NONE,
        BLK_BOUNCE_HIGH,
};

/*
 * struct queue_limits - limits of a request queue.
 *
 * An instance is embedded in struct request_queue as ->limits.  Helpers in
 * this header read the following members:
 *  - zoned: tested by blk_queue_is_zoned().
 *  - chunk_sectors: disk_zone_no() shifts a sector right by
 *    ilog2(chunk_sectors).  NOTE(review): this suggests that for zoned devices
 *    it holds a power-of-two zone size in sectors - confirm.
 *  - max_open_zones / max_active_zones: accessed by the
 *    disk_set_max_*_zones() and bdev_max_*_zones() helpers.
 */
struct queue_limits {
        enum blk_bounce                bounce;
        unsigned long                seg_boundary_mask;
        unsigned long                virt_boundary_mask;

        unsigned int                max_hw_sectors;
        unsigned int                max_dev_sectors;
        unsigned int                chunk_sectors;
        unsigned int                max_sectors;
        unsigned int                max_user_sectors;
        unsigned int                max_segment_size;
        unsigned int                physical_block_size;
        unsigned int                logical_block_size;
        unsigned int                alignment_offset;
        unsigned int                io_min;
        unsigned int                io_opt;
        unsigned int                max_discard_sectors;
        unsigned int                max_hw_discard_sectors;
        unsigned int                max_user_discard_sectors;
        unsigned int                max_secure_erase_sectors;
        unsigned int                max_write_zeroes_sectors;
        unsigned int                max_zone_append_sectors;
        unsigned int                discard_granularity;
        unsigned int                discard_alignment;
        unsigned int                zone_write_granularity;

        unsigned short                max_segments;
        unsigned short                max_integrity_segments;
        unsigned short                max_discard_segments;

        unsigned char                misaligned;
        unsigned char                discard_misaligned;
        unsigned char                raid_partial_stripes_expensive;
        bool                        zoned;
        unsigned int                max_open_zones;
        unsigned int                max_active_zones;

        /*
         * Drivers that set dma_alignment to less than 511 must be prepared to
         * handle individual bvec's that are not a multiple of a SECTOR_SIZE
         * due to possible offsets.
         */
        unsigned int                dma_alignment;
};

typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
                               void *data);

void disk_set_zoned(struct gendisk *disk);

#define BLK_ALL_ZONES  ((unsigned int)-1)
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
                unsigned int nr_zones, report_zones_cb cb, void *data);
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
                sector_t sectors, sector_t nr_sectors);
int blk_revalidate_disk_zones(struct gendisk *disk);

/*
 * Independent access ranges: struct blk_independent_access_range describes
 * a range of contiguous sectors that can be accessed using device command
 * execution resources that are independent from the resources used for
 * other access ranges. This is typically found with single-LUN multi-actuator
 * HDDs where each access range is served by a different set of heads.
 * The set of independent ranges supported by the device is defined using
 * struct blk_independent_access_ranges. The independent ranges must not overlap
 * and must include all sectors within the disk capacity (no sector holes
 * allowed).
 * For a device with multiple ranges, requests targeting sectors in different
 * ranges can be executed in parallel. A request can straddle an access range
 * boundary.
 */
struct blk_independent_access_range {
        struct kobject                kobj;
        /* NOTE(review): presumably the first sector of the range - confirm */
        sector_t                sector;
        /* NOTE(review): presumably the length of the range in sectors - confirm */
        sector_t                nr_sectors;
};

/*
 * The set of independent access ranges of a device, referenced from
 * gendisk::ia_ranges.  The ranges themselves are stored in the trailing
 * flexible array member.
 */
struct blk_independent_access_ranges {
        struct kobject                                kobj;
        bool                                        sysfs_registered;
        /* NOTE(review): presumably the number of entries in ia_range[] - confirm */
        unsigned int                                nr_ia_ranges;
        struct blk_independent_access_range        ia_range[];
};

/*
 * struct request_queue - per-queue state of the block layer.
 *
 * Members that the helpers in this header rely on:
 *  - mq_ops: queue_is_mq() reports true when this pointer is non-NULL.
 *  - queue_flags: bitmap indexed by the QUEUE_FLAG_* bit numbers and tested
 *    by the blk_queue_*() macros.
 *  - queue_depth / nr_requests: blk_queue_depth() returns queue_depth when it
 *    is non-zero and nr_requests otherwise.
 *  - limits: embedded struct queue_limits.
 *  - pm_only: counter read by blk_queue_pm_only().
 *  - rpm_status (CONFIG_PM only): returned by queue_rpm_status().
 */
struct request_queue {
        /*
         * The queue owner gets to use this for whatever they like.
         * ll_rw_blk doesn't touch it.
         */
        void                        *queuedata;

        struct elevator_queue        *elevator;

        const struct blk_mq_ops        *mq_ops;

        /* sw queues */
        struct blk_mq_ctx __percpu        *queue_ctx;

        /*
         * various queue flags, see QUEUE_* below
         */
        unsigned long                queue_flags;

        unsigned int                rq_timeout;

        unsigned int                queue_depth;

        refcount_t                refs;

        /* hw dispatch queues */
        unsigned int                nr_hw_queues;
        struct xarray                hctx_table;

        struct percpu_ref        q_usage_counter;

        struct request                *last_merge;

        spinlock_t                queue_lock;

        int                        quiesce_depth;

        struct gendisk                *disk;

        /*
         * mq queue kobject
         */
        struct kobject *mq_kobj;

        struct queue_limits        limits;

#ifdef  CONFIG_BLK_DEV_INTEGRITY
        struct blk_integrity integrity;
#endif        /* CONFIG_BLK_DEV_INTEGRITY */

#ifdef CONFIG_PM
        struct device                *dev;
        enum rpm_status                rpm_status;
#endif

        /*
         * Number of contexts that have called blk_set_pm_only(). If this
         * counter is above zero then only RQF_PM requests are processed.
         */
        atomic_t                pm_only;

        struct blk_queue_stats        *stats;
        struct rq_qos                *rq_qos;
        struct mutex                rq_qos_mutex;

        /*
         * ida allocated id for this queue.  Used to index queues from
         * ioctx.
         */
        int                        id;

        unsigned int                dma_pad_mask;

        /*
         * queue settings
         */
        unsigned long                nr_requests;        /* Max # of requests */

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        struct blk_crypto_profile *crypto_profile;
        struct kobject *crypto_kobject;
#endif

        struct timer_list        timeout;
        struct work_struct        timeout_work;

        atomic_t                nr_active_requests_shared_tags;

        struct blk_mq_tags        *sched_shared_tags;

        struct list_head        icq_list;
#ifdef CONFIG_BLK_CGROUP
        /* one bit per blkcg policy, BLKCG_MAX_POLS bits in total */
        DECLARE_BITMAP                (blkcg_pols, BLKCG_MAX_POLS);
        struct blkcg_gq                *root_blkg;
        struct list_head        blkg_list;
        struct mutex                blkcg_mutex;
#endif

        int                        node;

        spinlock_t                requeue_lock;
        struct list_head        requeue_list;
        struct delayed_work        requeue_work;

#ifdef CONFIG_BLK_DEV_IO_TRACE
        struct blk_trace __rcu        *blk_trace;
#endif
        /*
         * for flush operations
         */
        struct blk_flush_queue        *fq;
        struct list_head        flush_list;

        struct mutex                sysfs_lock;
        struct mutex                sysfs_dir_lock;
        struct mutex                limits_lock;

        /*
         * for reusing dead hctx instance in case of updating
         * nr_hw_queues
         */
        struct list_head        unused_hctx_list;
        spinlock_t                unused_hctx_lock;

        int                        mq_freeze_depth;

#ifdef CONFIG_BLK_DEV_THROTTLING
        /* Throttle data */
        struct throtl_data *td;
#endif
        struct rcu_head                rcu_head;
        wait_queue_head_t        mq_freeze_wq;
        /*
         * Protect concurrent access to q_usage_counter by
         * percpu_ref_kill() and percpu_ref_reinit().
         */
        struct mutex                mq_freeze_lock;

        struct blk_mq_tag_set        *tag_set;
        struct list_head        tag_set_list;

        struct dentry                *debugfs_dir;
        struct dentry                *sched_debugfs_dir;
        struct dentry                *rqos_debugfs_dir;
        /*
         * Serializes all debugfs metadata operations using the above dentries.
         */
        struct mutex                debugfs_mutex;

        bool                        mq_sysfs_init_done;
};

/*
 * Bit numbers (not masks) within request_queue::queue_flags; they are tested
 * with test_bit() by the blk_queue_*() macros.  The numbers 2, 8, 21 and 23
 * are not assigned here.
 *
 * Keep blk_queue_flag_name[] in sync with the definitions below
 */
#define QUEUE_FLAG_STOPPED        0        /* queue is stopped */
#define QUEUE_FLAG_DYING        1        /* queue being torn down */
#define QUEUE_FLAG_NOMERGES     3        /* disable merge attempts */
#define QUEUE_FLAG_SAME_COMP        4        /* complete on same CPU-group */
#define QUEUE_FLAG_FAIL_IO        5        /* fake timeout */
#define QUEUE_FLAG_NONROT        6        /* non-rotational device (SSD) */
#define QUEUE_FLAG_VIRT                QUEUE_FLAG_NONROT /* paravirt device */
#define QUEUE_FLAG_IO_STAT        7        /* do disk/partitions IO accounting */
#define QUEUE_FLAG_NOXMERGES        9        /* No extended merges */
#define QUEUE_FLAG_ADD_RANDOM        10        /* Contributes to random pool */
#define QUEUE_FLAG_SYNCHRONOUS        11        /* always completes in submit context */
#define QUEUE_FLAG_SAME_FORCE        12        /* force complete on same CPU */
#define QUEUE_FLAG_HW_WC        13        /* Write back caching supported */
#define QUEUE_FLAG_INIT_DONE        14        /* queue is initialized */
#define QUEUE_FLAG_STABLE_WRITES 15        /* don't modify blks until WB is done */
#define QUEUE_FLAG_POLL                16        /* IO polling enabled if set */
#define QUEUE_FLAG_WC                17        /* Write back caching */
#define QUEUE_FLAG_FUA                18        /* device supports FUA writes */
#define QUEUE_FLAG_DAX                19        /* device supports DAX */
#define QUEUE_FLAG_STATS        20        /* track IO start and completion times */
#define QUEUE_FLAG_REGISTERED        22        /* queue has been registered to a disk */
#define QUEUE_FLAG_QUIESCED        24        /* queue has been quiesced */
#define QUEUE_FLAG_PCI_P2PDMA        25        /* device supports PCI p2p requests */
#define QUEUE_FLAG_ZONE_RESETALL 26        /* supports Zone Reset All */
#define QUEUE_FLAG_RQ_ALLOC_TIME 27        /* record rq->alloc_time_ns */
#define QUEUE_FLAG_HCTX_ACTIVE        28        /* at least one blk-mq hctx is active */
#define QUEUE_FLAG_NOWAIT       29        /* device supports NOWAIT */
#define QUEUE_FLAG_SQ_SCHED     30        /* single queue style io dispatch */
#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE        31 /* quiesce_tagset skips the queue */

/*
 * A mask (unlike the bit numbers above) with the IO_STAT, SAME_COMP and
 * NOWAIT bits set.
 * NOTE(review): presumably the initial queue_flags of a blk-mq queue - confirm.
 */
#define QUEUE_FLAG_MQ_DEFAULT        ((1UL << QUEUE_FLAG_IO_STAT) |                \
                                 (1UL << QUEUE_FLAG_SAME_COMP) |        \
                                 (1UL << QUEUE_FLAG_NOWAIT))

void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);

/*
 * Predicates on a request queue.  Unless noted otherwise each one tests a
 * single QUEUE_FLAG_* bit of (q)->queue_flags with test_bit().
 */
#define blk_queue_stopped(q)        test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_dying(q)        test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
#define blk_queue_init_done(q)        test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
#define blk_queue_nomerges(q)        test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
#define blk_queue_noxmerges(q)        \
        test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
#define blk_queue_nonrot(q)        test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
#define blk_queue_stable_writes(q) \
        test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags)
#define blk_queue_io_stat(q)        test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
#define blk_queue_add_random(q)        test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
#define blk_queue_zone_resetall(q)        \
        test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
#define blk_queue_dax(q)        test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
#define blk_queue_pci_p2pdma(q)        \
        test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
/* Constant false (the argument is not evaluated) without CONFIG_BLK_RQ_ALLOC_TIME. */
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
#define blk_queue_rq_alloc_time(q)        \
        test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags)
#else
#define blk_queue_rq_alloc_time(q)        false
#endif

/* Non-zero when any of the REQ_FAILFAST_* bits is set in (rq)->cmd_flags. */
#define blk_noretry_request(rq) \
        ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
                             REQ_FAILFAST_DRIVER))
#define blk_queue_quiesced(q)        test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags)
/* Current value of the pm_only counter rather than a flag bit. */
#define blk_queue_pm_only(q)        atomic_read(&(q)->pm_only)
#define blk_queue_registered(q)        test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
#define blk_queue_sq_sched(q)        test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
#define blk_queue_skip_tagset_quiesce(q) \
        test_bit(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, &(q)->queue_flags)

extern void blk_set_pm_only(struct request_queue *q);
extern void blk_clear_pm_only(struct request_queue *q);

/* Map a list_head that is the queuelist member of a struct request to that request. */
#define list_entry_rq(ptr)        list_entry((ptr), struct request, queuelist)

/*
 * Call dma_map_page_attrs() on the page, offset and length taken from the
 * bio_vec @bv.  Note that @bv is evaluated three times.
 */
#define dma_map_bvec(dev, bv, dir, attrs) \
        dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \
        (dir), (attrs))

static inline bool queue_is_mq(struct request_queue *q)
{
        return q->mq_ops;
}

/*
 * Runtime PM status of @q.  Without CONFIG_PM the queue has no rpm_status
 * member, so the queue is always reported as RPM_ACTIVE.
 */
static inline enum rpm_status queue_rpm_status(struct request_queue *q)
{
#ifdef CONFIG_PM
        return q->rpm_status;
#else
        return RPM_ACTIVE;
#endif
}

static inline bool blk_queue_is_zoned(struct request_queue *q)
{
        return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && q->limits.zoned;
}

#ifdef CONFIG_BLK_DEV_ZONED
unsigned int bdev_nr_zones(struct block_device *bdev);

/* Number of zones of @disk, or 0 when its queue is not zoned. */
static inline unsigned int disk_nr_zones(struct gendisk *disk)
{
        if (!blk_queue_is_zoned(disk->queue))
                return 0;

        return disk->nr_zones;
}

/*
 * Zone number of @sector on @disk, or 0 when the queue of @disk is not zoned.
 * The number is @sector shifted right by ilog2() of the chunk_sectors limit
 * of the queue.
 */
static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
{
        struct request_queue *q = disk->queue;

        if (blk_queue_is_zoned(q))
                return sector >> ilog2(q->limits.chunk_sectors);

        return 0;
}

static inline void disk_set_max_open_zones(struct gendisk *disk,
                unsigned int max_open_zones)
{
        disk->queue->limits.max_open_zones = max_open_zones;
}

static inline void disk_set_max_active_zones(struct gendisk *disk,
                unsigned int max_active_zones)
{
        disk->queue->limits.max_active_zones = max_active_zones;
}

static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
{
        return bdev->bd_disk->queue->limits.max_open_zones;
}

static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
{
        return bdev->bd_disk->queue->limits.max_active_zones;
}

bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs);
#else /* CONFIG_BLK_DEV_ZONED */
/*
 * Stubs for builds without CONFIG_BLK_DEV_ZONED: every query returns 0 and
 * blk_zone_plug_bio() returns false, without looking at the arguments.
 */
static inline unsigned int bdev_nr_zones(struct block_device *bdev)
{
        return 0;
}

static inline unsigned int disk_nr_zones(struct gendisk *disk)
{
        return 0;
}
static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
{
        return 0;
}
static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
{
        return 0;
}

static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
{
        return 0;
}
static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
        return false;
}
#endif /* CONFIG_BLK_DEV_ZONED */

/*
 * Depth of @q: the queue_depth member when it is non-zero, otherwise
 * nr_requests.
 */
static inline unsigned int blk_queue_depth(struct request_queue *q)
{
        return q->queue_depth ? q->queue_depth : q->nr_requests;
}

/*
 * default timeout for SG_IO if none specified
 *
 * Both values are multiples of HZ, i.e. 60 and 7 seconds worth of ticks.
 */
#define BLK_DEFAULT_SG_TIMEOUT        (60 * HZ)
#define BLK_MIN_SG_TIMEOUT        (7 * HZ)

/*
 * This should not be used directly - use rq_for_each_segment
 *
 * Walks the bi_next chain starting at the current value of @_bio.  The loop
 * has no initialiser and advances @_bio itself, so the caller's variable is
 * modified and is NULL once the loop has run to completion.
 */
#define for_each_bio(_bio)                \
        for (; _bio; _bio = _bio->bi_next)

int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
                                 const struct attribute_group **groups);
/*
 * Add @disk through device_add_disk() with neither a parent device nor
 * attribute groups.  Returns whatever device_add_disk() returns.
 */
static inline int __must_check add_disk(struct gendisk *disk)
{
        struct device *no_parent = NULL;
        const struct attribute_group **no_groups = NULL;

        return device_add_disk(no_parent, disk, no_groups);
}
void del_gendisk(struct gendisk *gp);
void invalidate_disk(struct gendisk *disk);
void set_disk_ro(struct gendisk *disk, bool read_only);
void disk_uevent(struct gendisk *disk, enum kobject_action action);

/* Extract the BD_PARTNO bits from the atomically read __bd_flags word. */
static inline u8 bdev_partno(const struct block_device *bdev)
{
        u8 partno = atomic_read(&bdev->__bd_flags) & BD_PARTNO;

        return partno;
}

/* Return true if any bit of @flag is set in @bdev's __bd_flags word. */
static inline bool bdev_test_flag(const struct block_device *bdev, unsigned flag)
{
        unsigned int set = atomic_read(&bdev->__bd_flags) & flag;

        return set != 0;
}

/* Atomically OR the bits of @flag into @bdev's __bd_flags word. */
static inline void bdev_set_flag(struct block_device *bdev, unsigned flag)
{
        atomic_or(flag, &bdev->__bd_flags);
}

/* Atomically clear the bits of @flag in @bdev's __bd_flags word. */
static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag)
{
        atomic_andnot(flag, &bdev->__bd_flags);
}

/*
 * Return 1 if @disk is read-only, 0 otherwise.  A disk counts as read-only
 * when BD_READ_ONLY is set on its part0 block device or GD_READ_ONLY is set
 * in the disk state.
 */
static inline int get_disk_ro(struct gendisk *disk)
{
        if (bdev_test_flag(disk->part0, BD_READ_ONLY))
                return 1;

        return test_bit(GD_READ_ONLY, &disk->state) ? 1 : 0;
}

/*
 * Return 1 if @bdev is read-only, 0 otherwise: either BD_READ_ONLY is set on
 * the block device itself or its disk is read-only per get_disk_ro().
 */
static inline int bdev_read_only(struct block_device *bdev)
{
        if (bdev_test_flag(bdev, BD_READ_ONLY))
                return 1;

        return get_disk_ro(bdev->bd_disk) ? 1 : 0;
}

bool set_capacity_and_notify(struct gendisk *disk, sector_t size);
void disk_force_media_change(struct gendisk *disk);
void bdev_mark_dead(struct block_device *bdev, bool surprise);

void add_disk_randomness(struct gendisk *disk) __latent_entropy;
void rand_initialize_disk(struct gendisk *disk);

/* Return the bd_start_sect field of @bdev. */
static inline sector_t get_start_sect(struct block_device *bdev)
{
        sector_t start = bdev->bd_start_sect;

        return start;
}

/* Return the bd_nr_sectors field of @bdev. */
static inline sector_t bdev_nr_sectors(struct block_device *bdev)
{
        sector_t nr_sectors = bdev->bd_nr_sectors;

        return nr_sectors;
}

static inline loff_t bdev_nr_bytes(struct block_device *bdev)
{
        return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT;
}

/* Capacity of @disk in sectors, i.e. the sector count of its part0 device. */
static inline sector_t get_capacity(struct gendisk *disk)
{
        struct block_device *whole = disk->part0;

        return bdev_nr_sectors(whole);
}

static inline u64 sb_bdev_nr_blocks(struct super_block *sb)
{
        return bdev_nr_sectors(sb->s_bdev) >>
                (sb->s_blocksize_bits - SECTOR_SHIFT);
}

int bdev_disk_changed(struct gendisk *disk, bool invalidate);

void put_disk(struct gendisk *disk);
struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
                struct lock_class_key *lkclass);

/**
 * blk_alloc_disk - allocate a gendisk structure
 * @lim: queue limits to be used for this disk.
 * @node_id: numa node to allocate on
 *
 * Allocate and pre-initialize a gendisk structure for use with BIO based
 * drivers.
 *
 * This is a statement-expression macro wrapping __blk_alloc_disk(): every
 * expansion site declares its own static struct lock_class_key and passes
 * its address as the @lkclass argument.
 *
 * Returns an ERR_PTR on error, else the allocated disk.
 *
 * Context: can sleep
 */
#define blk_alloc_disk(lim, node_id)                                        \
({                                                                        \
        static struct lock_class_key __key;                                \
                                                                        \
        __blk_alloc_disk(lim, node_id, &__key);                                \
})

int __register_blkdev(unsigned int major, const char *name,
                void (*probe)(dev_t devt));
/* register_blkdev() is __register_blkdev() with a NULL probe callback. */
#define register_blkdev(major, name) \
        __register_blkdev(major, name, NULL)
void unregister_blkdev(unsigned int major, const char *name);

bool disk_check_media_change(struct gendisk *disk);
void set_capacity(struct gendisk *disk, sector_t size);

#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk);
#else
/* Stub without CONFIG_BLOCK_HOLDER_DEPRECATED: does nothing and returns 0. */
static inline int bd_link_disk_holder(struct block_device *bdev,
                                      struct gendisk *disk)
{
        return 0;
}
/* Stub without CONFIG_BLOCK_HOLDER_DEPRECATED: intentionally empty. */
static inline void bd_unlink_disk_holder(struct block_device *bdev,
                                         struct gendisk *disk)
{
}
#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */

dev_t part_devt(struct gendisk *disk, u8 partno);
void inc_diskseq(struct gendisk *disk);
void blk_request_module(dev_t devt);

extern int blk_register_queue(struct gendisk *disk);
extern void blk_unregister_queue(struct gendisk *disk);
void submit_bio_noacct(struct bio *bio);
struct bio *bio_split_to_limits(struct bio *bio);

extern int blk_lld_busy(struct request_queue *q);
extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
extern void blk_queue_exit(struct request_queue *q);
extern void blk_sync_queue(struct request_queue *q);

/* Helper to convert REQ_OP_XXX to its string format XXX */
extern const char *blk_op_str(enum req_op op);

int blk_status_to_errno(blk_status_t status);
blk_status_t errno_to_blk_status(int errno);
const char *blk_status_to_str(blk_status_t status);

/* only poll the hardware once, don't continue until a completion was found */
#define BLK_POLL_ONESHOT                (1 << 0)
int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
                        unsigned int flags);

/* Return the request queue of @bdev (its bd_queue field). */
static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
{
        struct request_queue *q = bdev->bd_queue;

        /* this is never NULL */
        return q;
}

/* Helper to convert BLK_ZONE_COND_XXX to its string format XXX */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond);

/* Zone number containing the start sector (bi_iter.bi_sector) of @bio. */
static inline unsigned int bio_zone_no(struct bio *bio)
{
        struct gendisk *disk = bio->bi_bdev->bd_disk;

        return disk_zone_no(disk, bio->bi_iter.bi_sector);
}

static inline bool bio_straddles_zones(struct bio *bio)
{
        return bio_sectors(bio) &&
                bio_zone_no(bio) !=
                disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1);
}

/*
 * Return how many sectors remain in the chunk that contains @offset, for
 * chunks of @chunk_sectors sectors.  Power-of-two chunk sizes use a mask;
 * any other size uses the remainder returned by sector_div().
 */
static inline unsigned int blk_chunk_sectors_left(sector_t offset,
                unsigned int chunk_sectors)
{
        unsigned int used;

        if (unlikely(!is_power_of_2(chunk_sectors)))
                used = sector_div(offset, chunk_sectors);
        else
                used = offset & (chunk_sectors - 1);

        return chunk_sectors - used;
}

/**
 * queue_limits_start_update - start an atomic update of queue limits
 * @q:                queue to update
 *
 * This function starts an atomic update of the queue limits.  It takes
 * q->limits_lock to prevent other updates and returns a snapshot (a by-value
 * copy) of the current limits that the caller can modify.  The caller must
 * call queue_limits_commit_update() to finish the update, or
 * queue_limits_cancel_update() to abandon it.
 *
 * Context: process context.  The caller must have frozen the queue or ensured
 * that there is no outstanding I/O by other means.
 */
static inline struct queue_limits
queue_limits_start_update(struct request_queue *q)
        __acquires(q->limits_lock)
{
        mutex_lock(&q->limits_lock);
        return q->limits;
}
int queue_limits_commit_update(struct request_queue *q,
                struct queue_limits *lim);
int queue_limits_set(struct request_queue *q, struct queue_limits *lim);

/**
 * queue_limits_cancel_update - cancel an atomic update of queue limits
 * @q:                queue to update
 *
 * This function cancels an atomic update of the queue limits started by
 * queue_limits_start_update() and should be used when an error occurs after
 * starting the update.  It only unlocks q->limits_lock; the limits stored in
 * the queue are not touched.
 */
static inline void queue_limits_cancel_update(struct request_queue *q)
{
        mutex_unlock(&q->limits_lock);
}

/*
 * Access functions for manipulating queue properties
 */
extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int);
void blk_queue_max_secure_erase_sectors(struct request_queue *q,
                unsigned int max_sectors);
extern void blk_queue_max_discard_sectors(struct request_queue *q,
                unsigned int max_discard_sectors);
/* Parameter is named after the limit this setter is called for (write zeroes). */
extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
                unsigned int max_write_zeroes_sectors);
extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
                unsigned int max_zone_append_sectors);
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
void blk_queue_zone_write_granularity(struct request_queue *q,
                                      unsigned int size);
extern void blk_queue_alignment_offset(struct request_queue *q,
                                       unsigned int alignment);
void disk_update_readahead(struct gendisk *disk);
extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
extern void blk_set_stacking_limits(struct queue_limits *lim);
extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
                            sector_t offset);
void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
                sector_t offset, const char *pfx);
extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);

struct blk_independent_access_ranges *
disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges);
void disk_set_independent_access_ranges(struct gendisk *disk,
                                struct blk_independent_access_ranges *iars);

bool __must_check blk_get_queue(struct request_queue *);
extern void blk_put_queue(struct request_queue *);

void blk_mark_disk_dead(struct gendisk *disk);

#ifdef CONFIG_BLOCK
/*
 * blk_plug permits building a queue of related requests by holding the I/O
 * fragments for a short period. This allows merging of sequential requests
 * into single larger request. As the requests are moved from a per-task list to
 * the device's request_queue in a batch, this results in improved scalability
 * as the lock contention for request_queue lock is reduced.
 *
 * It is ok not to disable preemption when adding the request to the plug list
 * or when attempting a merge. For details, please see schedule() where
 * blk_flush_plug() is called.
 */
struct blk_plug {
        struct request *mq_list; /* blk-mq requests */

        /* if nr_ios is > 1, we can batch tag/rq allocations */
        struct request *cached_rq;
        /* reset to 0 by blk_plug_invalidate_ts() */
        u64 cur_ktime;
        unsigned short nr_ios;

        unsigned short rq_count;

        bool multiple_queues;
        bool has_elevator;

        struct list_head cb_list; /* md requires an unplug callback */
};

struct blk_plug_cb;
/*
 * Callback type stored in struct blk_plug_cb.  It receives the blk_plug_cb
 * itself plus a bool (presumably the "from_schedule" value passed to
 * __blk_flush_plug(); confirm against the caller).
 */
typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool);
struct blk_plug_cb {
        struct list_head list;
        blk_plug_cb_fn callback;
        void *data;
};
extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug,
                                             void *data, int size);
extern void blk_start_plug(struct blk_plug *);
extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short);
extern void blk_finish_plug(struct blk_plug *);

void __blk_flush_plug(struct blk_plug *plug, bool from_schedule);
/* Flush @plug via __blk_flush_plug(); a NULL @plug is silently ignored. */
static inline void blk_flush_plug(struct blk_plug *plug, bool async)
{
        if (!plug)
                return;

        __blk_flush_plug(plug, async);
}

/*
 * Clear PF_BLOCK_TS in current->flags and, if the task has a plug, reset
 * the plug's cur_ktime to 0.
 *
 * tsk == current here
 */
static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
{
        struct blk_plug *plug = tsk->plug;

        current->flags &= ~PF_BLOCK_TS;
        if (!plug)
                return;

        plug->cur_ktime = 0;
}

int blkdev_issue_flush(struct block_device *bdev);
long nr_blockdev_pages(void);
#else /* CONFIG_BLOCK */
/* Empty placeholder so struct blk_plug still exists without CONFIG_BLOCK. */
struct blk_plug {
};

/* Stub for builds without CONFIG_BLOCK: intentionally empty. */
static inline void blk_start_plug_nr_ios(struct blk_plug *plug,
                                         unsigned short nr_ios)
{
}

/* Stub for builds without CONFIG_BLOCK: intentionally empty. */
static inline void blk_start_plug(struct blk_plug *plug)
{
}

/* Stub for builds without CONFIG_BLOCK: intentionally empty. */
static inline void blk_finish_plug(struct blk_plug *plug)
{
}

/* Stub for builds without CONFIG_BLOCK: intentionally empty. */
static inline void blk_flush_plug(struct blk_plug *plug, bool async)
{
}

/* Stub for builds without CONFIG_BLOCK: intentionally empty. */
static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
{
}

/* Stub for builds without CONFIG_BLOCK: does nothing and returns 0. */
static inline int blkdev_issue_flush(struct block_device *bdev)
{
        return 0;
}

/* Stub for builds without CONFIG_BLOCK: always returns 0. */
static inline long nr_blockdev_pages(void)
{
        return 0;
}
#endif /* CONFIG_BLOCK */

extern void blk_io_schedule(void);

int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask);
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, struct bio **biop);
int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp);

#define BLKDEV_ZERO_NOUNMAP        (1 << 0)  /* do not free blocks */
#define BLKDEV_ZERO_NOFALLBACK        (1 << 1)  /* don't write explicit zeroes */

extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
                unsigned flags);
extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
                sector_t nr_sects, gfp_t gfp_mask, unsigned flags);

/*
 * Discard @nr_blocks filesystem blocks starting at @block on the device
 * backing @sb.  Block numbers are converted to sectors by shifting left by
 * (s_blocksize_bits - SECTOR_SHIFT) before calling blkdev_issue_discard().
 * @flags is accepted but not used by this helper.
 */
static inline int sb_issue_discard(struct super_block *sb, sector_t block,
                sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
{
        const int sect_shift = sb->s_blocksize_bits - SECTOR_SHIFT;
        sector_t first_sector = block << sect_shift;
        sector_t nr_sects = nr_blocks << sect_shift;

        return blkdev_issue_discard(sb->s_bdev, first_sector, nr_sects,
                                    gfp_mask);
}
/*
 * Zero out @nr_blocks filesystem blocks starting at @block on the device
 * backing @sb.  Block numbers are converted to sectors by shifting left by
 * (s_blocksize_bits - SECTOR_SHIFT); blkdev_issue_zeroout() is called with
 * flags == 0.
 */
static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
                sector_t nr_blocks, gfp_t gfp_mask)
{
        const int sect_shift = sb->s_blocksize_bits - SECTOR_SHIFT;
        sector_t first_sector = block << sect_shift;
        sector_t nr_sects = nr_blocks << sect_shift;

        return blkdev_issue_zeroout(sb->s_bdev, first_sector, nr_sects,
                                    gfp_mask, 0);
}

static inline bool bdev_is_partition(struct block_device *bdev)
{
        return bdev_partno(bdev) != 0;
}

/*
 * Default values for queue limits.  NOTE(review): units are not visible
 * here (presumably a segment count, sectors, bytes and an address mask
 * respectively); confirm against the code that applies these defaults.
 */
enum blk_default_limits {
        BLK_MAX_SEGMENTS        = 128,
        BLK_SAFE_MAX_SECTORS        = 255,
        BLK_MAX_SEGMENT_SIZE        = 65536,
        BLK_SEG_BOUNDARY_MASK        = 0xFFFFFFFFUL,
};

/*
 * Default upper limit for the software max_sectors limit used for
 * regular file system I/O.  This can be increased through sysfs.
 *
 * Not to be confused with the max_hw_sectors limit that is entirely
 * controlled by the driver, usually based on hardware limits.
 */
#define BLK_DEF_MAX_SECTORS_CAP        2560u

/* Return the seg_boundary_mask limit of @q. */
static inline unsigned long queue_segment_boundary(const struct request_queue *q)
{
        unsigned long mask = q->limits.seg_boundary_mask;

        return mask;
}

/* Return the virt_boundary_mask limit of @q. */
static inline unsigned long queue_virt_boundary(const struct request_queue *q)
{
        unsigned long mask = q->limits.virt_boundary_mask;

        return mask;
}

/* Return the max_sectors limit of @q. */
static inline unsigned int queue_max_sectors(const struct request_queue *q)
{
        unsigned int max_sectors = q->limits.max_sectors;

        return max_sectors;
}

/*
 * max_sectors of @q expressed in bytes.  The sector count is clamped to
 * INT_MAX >> 9 before being shifted left by 9.
 */
static inline unsigned int queue_max_bytes(struct request_queue *q)
{
        unsigned int sectors = min_t(unsigned int, queue_max_sectors(q),
                                     INT_MAX >> 9);

        return sectors << 9;
}

/* Return the max_hw_sectors limit of @q. */
static inline unsigned int queue_max_hw_sectors(const struct request_queue *q)
{
        unsigned int max_hw_sectors = q->limits.max_hw_sectors;

        return max_hw_sectors;
}

/* Return the max_segments limit of @q. */
static inline unsigned short queue_max_segments(const struct request_queue *q)
{
        unsigned short nr_segs = q->limits.max_segments;

        return nr_segs;
}

/* Return the max_discard_segments limit of @q. */
static inline unsigned short queue_max_discard_segments(const struct request_queue *q)
{
        unsigned short nr_segs = q->limits.max_discard_segments;

        return nr_segs;
}

/* Return the max_segment_size limit of @q. */
static inline unsigned int queue_max_segment_size(const struct request_queue *q)
{
        unsigned int seg_size = q->limits.max_segment_size;

        return seg_size;
}

/*
 * Effective zone append limit for @l: the smaller of chunk_sectors and
 * max_hw_sectors, combined with max_zone_append_sectors via min_not_zero().
 */
static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l)
{
        unsigned int hw_cap;

        hw_cap = min(l->chunk_sectors, l->max_hw_sectors);

        return min_not_zero(l->max_zone_append_sectors, hw_cap);
}

/*
 * Zone append limit of @q: 0 for a queue that is not zoned, otherwise the
 * value computed by queue_limits_max_zone_append_sectors().
 */
static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
{
        if (blk_queue_is_zoned(q))
                return queue_limits_max_zone_append_sectors(&q->limits);

        return 0;
}

static inline bool queue_emulates_zone_append(struct request_queue *q)
{
        return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
}

/* queue_emulates_zone_append() applied to the queue of @bdev. */
static inline bool bdev_emulates_zone_append(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_emulates_zone_append(q);
}

/* queue_max_zone_append_sectors() applied to the queue of @bdev. */
static inline unsigned int
bdev_max_zone_append_sectors(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_max_zone_append_sectors(q);
}

/* queue_max_segments() applied to the queue of @bdev. */
static inline unsigned int bdev_max_segments(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_max_segments(q);
}

/*
 * Logical block size of @q.  Falls back to 512 when @q is NULL or when its
 * logical_block_size limit is zero.
 *
 * The value is returned directly instead of going through a signed "int"
 * temporary, so the unsigned limit is never narrowed to a signed type.
 */
static inline unsigned queue_logical_block_size(const struct request_queue *q)
{
        if (q && q->limits.logical_block_size)
                return q->limits.logical_block_size;

        return 512;
}

/* queue_logical_block_size() applied to the queue of @bdev. */
static inline unsigned int bdev_logical_block_size(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_logical_block_size(q);
}

/* Return the physical_block_size limit of @q. */
static inline unsigned int queue_physical_block_size(const struct request_queue *q)
{
        unsigned int size = q->limits.physical_block_size;

        return size;
}

/* queue_physical_block_size() applied to the queue of @bdev. */
static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_physical_block_size(q);
}

/* Return the io_min limit of @q. */
static inline unsigned int queue_io_min(const struct request_queue *q)
{
        unsigned int io_min = q->limits.io_min;

        return io_min;
}

/*
 * queue_io_min() applied to the queue of @bdev.  Note that the unsigned
 * queue value is returned as an int.
 */
static inline int bdev_io_min(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_io_min(q);
}

/* Return the io_opt limit of @q. */
static inline unsigned int queue_io_opt(const struct request_queue *q)
{
        unsigned int io_opt = q->limits.io_opt;

        return io_opt;
}

/*
 * queue_io_opt() applied to the queue of @bdev.  Note that the unsigned
 * queue value is returned as an int.
 */
static inline int bdev_io_opt(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_io_opt(q);
}

/* Return the zone_write_granularity limit of @q. */
static inline unsigned int
queue_zone_write_granularity(const struct request_queue *q)
{
        unsigned int granularity = q->limits.zone_write_granularity;

        return granularity;
}

/* queue_zone_write_granularity() applied to the queue of @bdev. */
static inline unsigned int
bdev_zone_write_granularity(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_zone_write_granularity(q);
}

int bdev_alignment_offset(struct block_device *bdev);
unsigned int bdev_discard_alignment(struct block_device *bdev);

static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev)
{
        return bdev_get_queue(bdev)->limits.max_discard_sectors;
}

static inline unsigned int bdev_discard_granularity(struct block_device *bdev)
{
        return bdev_get_queue(bdev)->limits.discard_granularity;
}

static inline unsigned int
bdev_max_secure_erase_sectors(struct block_device *bdev)
{
        return bdev_get_queue(bdev)->limits.max_secure_erase_sectors;
}

/*
 * Return the max_write_zeroes_sectors limit of @bdev's queue, or 0 if the
 * queue pointer is NULL.
 */
static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return q ? q->limits.max_write_zeroes_sectors : 0;
}

/* blk_queue_nonrot() applied to the queue of @bdev. */
static inline bool bdev_nonrot(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return blk_queue_nonrot(q);
}

static inline bool bdev_synchronous(struct block_device *bdev)
{
        return test_bit(QUEUE_FLAG_SYNCHRONOUS,
                        &bdev_get_queue(bdev)->queue_flags);
}

static inline bool bdev_stable_writes(struct block_device *bdev)
{
        return test_bit(QUEUE_FLAG_STABLE_WRITES,
                        &bdev_get_queue(bdev)->queue_flags);
}

static inline bool bdev_write_cache(struct block_device *bdev)
{
        return test_bit(QUEUE_FLAG_WC, &bdev_get_queue(bdev)->queue_flags);
}

static inline bool bdev_fua(struct block_device *bdev)
{
        return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags);
}

static inline bool bdev_nowait(struct block_device *bdev)
{
        return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
}

/* blk_queue_is_zoned() applied to the queue of @bdev. */
static inline bool bdev_is_zoned(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return blk_queue_is_zoned(q);
}

/* Zone number of sector @sec on the disk that owns @bdev. */
static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec)
{
        struct gendisk *disk = bdev->bd_disk;

        return disk_zone_no(disk, sec);
}

/*
 * Zone size of @bdev in sectors: the chunk_sectors limit of its queue when
 * the queue is zoned, 0 otherwise.
 */
static inline sector_t bdev_zone_sectors(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
}

/*
 * Offset of @sector from the start of its zone, computed by masking with
 * (zone sectors - 1).  The mask form only yields a zone offset when the
 * zone size is a power of two; when bdev_zone_sectors() returns 0 the mask
 * is all ones and @sector is returned unchanged.
 */
static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev,
                                                   sector_t sector)
{
        sector_t zone_mask = bdev_zone_sectors(bdev) - 1;

        return sector & zone_mask;
}

/* Offset of @bio's start sector from the start of its zone. */
static inline sector_t bio_offset_from_zone_start(struct bio *bio)
{
        sector_t start = bio->bi_iter.bi_sector;

        return bdev_offset_from_zone_start(bio->bi_bdev, start);
}

/* True when @sector has a zero offset from the start of its zone. */
static inline bool bdev_is_zone_start(struct block_device *bdev,
                                      sector_t sector)
{
        return !bdev_offset_from_zone_start(bdev, sector);
}

/* dma_alignment limit of @q, or 511 when @q is NULL. */
static inline int queue_dma_alignment(const struct request_queue *q)
{
        if (!q)
                return 511;

        return q->limits.dma_alignment;
}

/* queue_dma_alignment() applied to the queue of @bdev. */
static inline unsigned int bdev_dma_alignment(struct block_device *bdev)
{
        struct request_queue *q = bdev_get_queue(bdev);

        return queue_dma_alignment(q);
}

/*
 * Check @iter with iov_iter_is_aligned(), passing the DMA alignment of
 * @bdev and (logical block size - 1) as the two mask arguments.
 */
static inline bool bdev_iter_is_aligned(struct block_device *bdev,
                                        struct iov_iter *iter)
{
        unsigned int addr_mask = bdev_dma_alignment(bdev);
        unsigned int len_mask = bdev_logical_block_size(bdev) - 1;

        return iov_iter_is_aligned(iter, addr_mask, len_mask);
}

/*
 * Return 1 when neither @addr nor @len has any bit set in the combined
 * mask (queue DMA alignment OR'ed with q->dma_pad_mask), 0 otherwise.
 */
static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
                                 unsigned int len)
{
        unsigned int mask = queue_dma_alignment(q) | q->dma_pad_mask;

        /* one test covers both values: a stray bit in either one fails */
        return ((addr | len) & mask) == 0;
}

/*
 * order_base_2() of @size expressed in sectors, plus SECTOR_SHIFT.
 * assumes size > 256
 */
static inline unsigned int blksize_bits(unsigned int size)
{
        unsigned int sectors = size >> SECTOR_SHIFT;

        return order_base_2(sectors) + SECTOR_SHIFT;
}

int kblockd_schedule_work(struct work_struct *work);
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);

/*
 * Module aliases of the form "block-major-<major>-<minor>" and, for a whole
 * major, "block-major-<major>-*".
 */
#define MODULE_ALIAS_BLOCKDEV(major,minor) \
        MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
        MODULE_ALIAS("block-major-" __stringify(major) "-*")

#ifdef CONFIG_BLK_INLINE_ENCRYPTION

bool blk_crypto_register(struct blk_crypto_profile *profile,
                         struct request_queue *q);

#else /* CONFIG_BLK_INLINE_ENCRYPTION */

/* Stub without CONFIG_BLK_INLINE_ENCRYPTION: does nothing, returns true. */
static inline bool blk_crypto_register(struct blk_crypto_profile *profile,
                                       struct request_queue *q)
{
        return true;
}

#endif /* CONFIG_BLK_INLINE_ENCRYPTION */

/* Identifier types accepted by the ->get_unique_id() block device operation. */
enum blk_unique_id {
        /* these match the Designator Types specified in SPC */
        BLK_UID_T10        = 1,
        BLK_UID_EUI64        = 2,
        BLK_UID_NAA        = 3,
};

/*
 * Table of callbacks a block driver supplies for its disks, plus the owning
 * module and persistent-reservation ops.  NOTE(review): which callbacks are
 * optional is not visible in this header; check the callers.
 */
struct block_device_operations {
        void (*submit_bio)(struct bio *bio);
        int (*poll_bio)(struct bio *bio, struct io_comp_batch *iob,
                        unsigned int flags);
        int (*open)(struct gendisk *disk, blk_mode_t mode);
        void (*release)(struct gendisk *disk);
        int (*ioctl)(struct block_device *bdev, blk_mode_t mode,
                        unsigned cmd, unsigned long arg);
        int (*compat_ioctl)(struct block_device *bdev, blk_mode_t mode,
                        unsigned cmd, unsigned long arg);
        unsigned int (*check_events) (struct gendisk *disk,
                                      unsigned int clearing);
        void (*unlock_native_capacity) (struct gendisk *);
        int (*getgeo)(struct block_device *, struct hd_geometry *);
        int (*set_read_only)(struct block_device *bdev, bool ro);
        void (*free_disk)(struct gendisk *disk);
        /* this callback is with swap_lock and sometimes page table lock held */
        void (*swap_slot_free_notify) (struct block_device *, unsigned long);
        int (*report_zones)(struct gendisk *, sector_t sector,
                        unsigned int nr_zones, report_zones_cb cb, void *data);
        char *(*devnode)(struct gendisk *disk, umode_t *mode);
        /* returns the length of the identifier or a negative errno: */
        int (*get_unique_id)(struct gendisk *disk, u8 id[16],
                        enum blk_unique_id id_type);
        struct module *owner;
        const struct pr_ops *pr_ops;

        /*
         * Special callback for probing GPT entry at a given sector.
         * Needed by Android devices, used by GPT scanner and MMC blk
         * driver.
         */
        int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector);
};

/*
 * With CONFIG_COMPAT this is a real function; without it the name expands
 * to NULL.
 */
#ifdef CONFIG_COMPAT
extern int blkdev_compat_ptr_ioctl(struct block_device *, blk_mode_t,
                                      unsigned int, unsigned long);
#else
#define blkdev_compat_ptr_ioctl NULL
#endif

/*
 * Wake the task waiting for I/O.  Another task is woken with
 * wake_up_process(); the current task only has its state set to
 * TASK_RUNNING.
 */
static inline void blk_wake_io_task(struct task_struct *waiter)
{
        if (waiter != current) {
                wake_up_process(waiter);
                return;
        }

        /*
         * If we're polling, the task itself is doing the completions. For
         * that case, we don't need to signal a wakeup, it's enough to just
         * mark us as RUNNING.
         */
        __set_current_state(TASK_RUNNING);
}

unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
                                 unsigned long start_time);
void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
                      unsigned int sectors, unsigned long start_time);

unsigned long bio_start_io_acct(struct bio *bio);
void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
                struct block_device *orig_bdev);

/**
 * bio_end_io_acct - end I/O accounting for bio based drivers
 * @bio:        bio to end account for
 * @start_time:        start time returned by bio_start_io_acct()
 *
 * Thin wrapper around bio_end_io_acct_remapped() that passes the bio's own
 * bi_bdev as the original block device.
 */
static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
{
        /* void function: call for effect instead of "return <void expr>" */
        bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev);
}

int bdev_read_only(struct block_device *bdev);
int set_blocksize(struct file *file, int size);

int lookup_bdev(const char *pathname, dev_t *dev);

void blkdev_show(struct seq_file *seqf, off_t offset);

#define BDEVNAME_SIZE        32        /* Largest string for a blockdev identifier */
#define BDEVT_SIZE        10        /* Largest string for MAJ:MIN for blkdev */
/* BLKDEV_MAJOR_MAX is 512 with CONFIG_BLOCK and 0 without it. */
#ifdef CONFIG_BLOCK
#define BLKDEV_MAJOR_MAX        512
#else
#define BLKDEV_MAJOR_MAX        0
#endif

/*
 * Callbacks supplied by the holder of a block device; passed as the @hops
 * argument to the bdev_file_open_by_*() and bd_prepare_to_claim() helpers.
 */
struct blk_holder_ops {
        /*
         * NOTE(review): presumably called when the device is marked dead
         * (see bdev_mark_dead()), with @surprise passed through; confirm.
         */
        void (*mark_dead)(struct block_device *bdev, bool surprise);

        /*
         * Sync the file system mounted on the block device.
         */
        void (*sync)(struct block_device *bdev);

        /*
         * Freeze the file system mounted on the block device.
         */
        int (*freeze)(struct block_device *bdev);

        /*
         * Thaw the file system mounted on the block device.
         */
        int (*thaw)(struct block_device *bdev);
};

/*
 * For filesystems using @fs_holder_ops, the @holder argument passed to
 * helpers used to open and claim block devices via
 * bd_prepare_to_claim() must point to a superblock.
 */
extern const struct blk_holder_ops fs_holder_ops;

/*
 * Return the correct open flags for bdev_file_open_by_* for super block
 * flags as stored in sb->s_flags: always BLK_OPEN_READ and
 * BLK_OPEN_RESTRICT_WRITES, plus BLK_OPEN_WRITE unless SB_RDONLY is set.
 */
#define sb_open_mode(flags) \
        (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \
         (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE))

struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
                const struct blk_holder_ops *hops);
struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
                void *holder, const struct blk_holder_ops *hops);
int bd_prepare_to_claim(struct block_device *bdev, void *holder,
                const struct blk_holder_ops *hops);
void bd_abort_claiming(struct block_device *bdev, void *holder);

/* just for blk-cgroup, don't use elsewhere */
struct block_device *blkdev_get_no_open(dev_t dev);
void blkdev_put_no_open(struct block_device *bdev);

struct block_device *I_BDEV(struct inode *inode);
struct block_device *file_bdev(struct file *bdev_file);
bool disk_live(struct gendisk *disk);
unsigned int block_size(struct block_device *bdev);

#ifdef CONFIG_BLOCK
void invalidate_bdev(struct block_device *bdev);
int sync_blockdev(struct block_device *bdev);
int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend);
int sync_blockdev_nowait(struct block_device *bdev);
void sync_bdevs(bool wait);
void bdev_statx_dioalign(struct inode *inode, struct kstat *stat);
void printk_all_partitions(void);
int __init early_lookup_bdev(const char *pathname, dev_t *dev);
#else
/* Stub for builds without CONFIG_BLOCK: intentionally empty. */
static inline void invalidate_bdev(struct block_device *bdev)
{
}
/* Stub for builds without CONFIG_BLOCK: does nothing and returns 0. */
static inline int sync_blockdev(struct block_device *bdev)
{
        return 0;
}
/* Stub for builds without CONFIG_BLOCK: does nothing and returns 0. */
static inline int sync_blockdev_nowait(struct block_device *bdev)
{
        return 0;
}
/* Stub for builds without CONFIG_BLOCK: intentionally empty. */
static inline void sync_bdevs(bool wait)
{
}
/* Stub for builds without CONFIG_BLOCK: leaves @stat untouched. */
static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
{
}
/* Stub for builds without CONFIG_BLOCK: intentionally empty. */
static inline void printk_all_partitions(void)
{
}
/* Stub for builds without CONFIG_BLOCK: always fails with -EINVAL. */
static inline int early_lookup_bdev(const char *pathname, dev_t *dev)
{
        return -EINVAL;
}
#endif /* CONFIG_BLOCK */

int bdev_freeze(struct block_device *bdev);
int bdev_thaw(struct block_device *bdev);
void bdev_fput(struct file *bdev_file);

/*
 * State for a batch of I/O completions: a request list head, a need_ts
 * flag and a completion callback that receives the batch itself.
 * NOTE(review): who sets and invokes ->complete is not visible here.
 */
struct io_comp_batch {
        struct request *req_list;
        bool need_ts;
        void (*complete)(struct io_comp_batch *);
};

/* Define a struct io_comp_batch variable @name with an empty initializer. */
#define DEFINE_IO_COMP_BATCH(name)        struct io_comp_batch name = { }

#endif /* _LINUX_BLKDEV_H */
















































































































































































































































































































































































































































































































































   28 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   28 















    2 


   30 









   29 

























































































































































































































































   28 
   30 




























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/common.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/string_helpers.h>
#include "common.h"

/*
 * String table for operation mode.
 *
 * Indexed by the TOMOYO_CONFIG_* mode constant; each entry is the textual
 * name of that mode.
 */
const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE] = {
        [TOMOYO_CONFIG_DISABLED]   = "disabled",
        [TOMOYO_CONFIG_LEARNING]   = "learning",
        [TOMOYO_CONFIG_PERMISSIVE] = "permissive",
        [TOMOYO_CONFIG_ENFORCING]  = "enforcing"
};

/*
 * String table for /sys/kernel/security/tomoyo/profile
 *
 * Per-operation names are stored at their TOMOYO_MAC_* index. Category names
 * are stored after them, at TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_*,
 * which is why the array is sized as the sum of both maxima.
 * Any slot without a designated initializer is NULL.
 */
const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX
                                       + TOMOYO_MAX_MAC_CATEGORY_INDEX] = {
        /* CONFIG::file group */
        [TOMOYO_MAC_FILE_EXECUTE]    = "execute",
        [TOMOYO_MAC_FILE_OPEN]       = "open",
        [TOMOYO_MAC_FILE_CREATE]     = "create",
        [TOMOYO_MAC_FILE_UNLINK]     = "unlink",
        [TOMOYO_MAC_FILE_GETATTR]    = "getattr",
        [TOMOYO_MAC_FILE_MKDIR]      = "mkdir",
        [TOMOYO_MAC_FILE_RMDIR]      = "rmdir",
        [TOMOYO_MAC_FILE_MKFIFO]     = "mkfifo",
        [TOMOYO_MAC_FILE_MKSOCK]     = "mksock",
        [TOMOYO_MAC_FILE_TRUNCATE]   = "truncate",
        [TOMOYO_MAC_FILE_SYMLINK]    = "symlink",
        [TOMOYO_MAC_FILE_MKBLOCK]    = "mkblock",
        [TOMOYO_MAC_FILE_MKCHAR]     = "mkchar",
        [TOMOYO_MAC_FILE_LINK]       = "link",
        [TOMOYO_MAC_FILE_RENAME]     = "rename",
        [TOMOYO_MAC_FILE_CHMOD]      = "chmod",
        [TOMOYO_MAC_FILE_CHOWN]      = "chown",
        [TOMOYO_MAC_FILE_CHGRP]      = "chgrp",
        [TOMOYO_MAC_FILE_IOCTL]      = "ioctl",
        [TOMOYO_MAC_FILE_CHROOT]     = "chroot",
        [TOMOYO_MAC_FILE_MOUNT]      = "mount",
        /* Note: the keyword is spelled "unmount", not "umount". */
        [TOMOYO_MAC_FILE_UMOUNT]     = "unmount",
        [TOMOYO_MAC_FILE_PIVOT_ROOT] = "pivot_root",
        /* CONFIG::network group */
        [TOMOYO_MAC_NETWORK_INET_STREAM_BIND]       = "inet_stream_bind",
        [TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN]     = "inet_stream_listen",
        [TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT]    = "inet_stream_connect",
        [TOMOYO_MAC_NETWORK_INET_DGRAM_BIND]        = "inet_dgram_bind",
        [TOMOYO_MAC_NETWORK_INET_DGRAM_SEND]        = "inet_dgram_send",
        [TOMOYO_MAC_NETWORK_INET_RAW_BIND]          = "inet_raw_bind",
        [TOMOYO_MAC_NETWORK_INET_RAW_SEND]          = "inet_raw_send",
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND]       = "unix_stream_bind",
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN]     = "unix_stream_listen",
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT]    = "unix_stream_connect",
        [TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND]        = "unix_dgram_bind",
        [TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND]        = "unix_dgram_send",
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND]    = "unix_seqpacket_bind",
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN]  = "unix_seqpacket_listen",
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT] = "unix_seqpacket_connect",
        /* CONFIG::misc group */
        [TOMOYO_MAC_ENVIRON] = "env",
        /* CONFIG group */
        [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_FILE] = "file",
        [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_NETWORK] = "network",
        [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_MISC] = "misc",
};

/*
 * String table for conditions.
 *
 * Indexed by the TOMOYO_* condition constant; each entry is the textual
 * spelling of that condition keyword. Any slot below
 * TOMOYO_MAX_CONDITION_KEYWORD without a designated initializer is NULL.
 */
const char * const tomoyo_condition_keyword[TOMOYO_MAX_CONDITION_KEYWORD] = {
        /* TOMOYO_TASK_* keywords */
        [TOMOYO_TASK_UID]             = "task.uid",
        [TOMOYO_TASK_EUID]            = "task.euid",
        [TOMOYO_TASK_SUID]            = "task.suid",
        [TOMOYO_TASK_FSUID]           = "task.fsuid",
        [TOMOYO_TASK_GID]             = "task.gid",
        [TOMOYO_TASK_EGID]            = "task.egid",
        [TOMOYO_TASK_SGID]            = "task.sgid",
        [TOMOYO_TASK_FSGID]           = "task.fsgid",
        [TOMOYO_TASK_PID]             = "task.pid",
        [TOMOYO_TASK_PPID]            = "task.ppid",
        /* TOMOYO_EXEC_* counters */
        [TOMOYO_EXEC_ARGC]            = "exec.argc",
        [TOMOYO_EXEC_ENVC]            = "exec.envc",
        /* TOMOYO_TYPE_IS_* keywords */
        [TOMOYO_TYPE_IS_SOCKET]       = "socket",
        [TOMOYO_TYPE_IS_SYMLINK]      = "symlink",
        [TOMOYO_TYPE_IS_FILE]         = "file",
        [TOMOYO_TYPE_IS_BLOCK_DEV]    = "block",
        [TOMOYO_TYPE_IS_DIRECTORY]    = "directory",
        [TOMOYO_TYPE_IS_CHAR_DEV]     = "char",
        [TOMOYO_TYPE_IS_FIFO]         = "fifo",
        /* TOMOYO_MODE_* keywords */
        [TOMOYO_MODE_SETUID]          = "setuid",
        [TOMOYO_MODE_SETGID]          = "setgid",
        [TOMOYO_MODE_STICKY]          = "sticky",
        [TOMOYO_MODE_OWNER_READ]      = "owner_read",
        [TOMOYO_MODE_OWNER_WRITE]     = "owner_write",
        [TOMOYO_MODE_OWNER_EXECUTE]   = "owner_execute",
        [TOMOYO_MODE_GROUP_READ]      = "group_read",
        [TOMOYO_MODE_GROUP_WRITE]     = "group_write",
        [TOMOYO_MODE_GROUP_EXECUTE]   = "group_execute",
        [TOMOYO_MODE_OTHERS_READ]     = "others_read",
        [TOMOYO_MODE_OTHERS_WRITE]    = "others_write",
        [TOMOYO_MODE_OTHERS_EXECUTE]  = "others_execute",
        /* String-valued keywords */
        [TOMOYO_EXEC_REALPATH]        = "exec.realpath",
        [TOMOYO_SYMLINK_TARGET]       = "symlink.target",
        /* TOMOYO_PATH1_* keywords */
        [TOMOYO_PATH1_UID]            = "path1.uid",
        [TOMOYO_PATH1_GID]            = "path1.gid",
        [TOMOYO_PATH1_INO]            = "path1.ino",
        [TOMOYO_PATH1_MAJOR]          = "path1.major",
        [TOMOYO_PATH1_MINOR]          = "path1.minor",
        [TOMOYO_PATH1_PERM]           = "path1.perm",
        [TOMOYO_PATH1_TYPE]           = "path1.type",
        [TOMOYO_PATH1_DEV_MAJOR]      = "path1.dev_major",
        [TOMOYO_PATH1_DEV_MINOR]      = "path1.dev_minor",
        /* TOMOYO_PATH2_* keywords */
        [TOMOYO_PATH2_UID]            = "path2.uid",
        [TOMOYO_PATH2_GID]            = "path2.gid",
        [TOMOYO_PATH2_INO]            = "path2.ino",
        [TOMOYO_PATH2_MAJOR]          = "path2.major",
        [TOMOYO_PATH2_MINOR]          = "path2.minor",
        [TOMOYO_PATH2_PERM]           = "path2.perm",
        [TOMOYO_PATH2_TYPE]           = "path2.type",
        [TOMOYO_PATH2_DEV_MAJOR]      = "path2.dev_major",
        [TOMOYO_PATH2_DEV_MINOR]      = "path2.dev_minor",
        /* TOMOYO_PATH1_PARENT_* and TOMOYO_PATH2_PARENT_* keywords */
        [TOMOYO_PATH1_PARENT_UID]     = "path1.parent.uid",
        [TOMOYO_PATH1_PARENT_GID]     = "path1.parent.gid",
        [TOMOYO_PATH1_PARENT_INO]     = "path1.parent.ino",
        [TOMOYO_PATH1_PARENT_PERM]    = "path1.parent.perm",
        [TOMOYO_PATH2_PARENT_UID]     = "path2.parent.uid",
        [TOMOYO_PATH2_PARENT_GID]     = "path2.parent.gid",
        [TOMOYO_PATH2_PARENT_INO]     = "path2.parent.ino",
        [TOMOYO_PATH2_PARENT_PERM]    = "path2.parent.perm",
};

/*
 * String table for PREFERENCE keyword.
 *
 * Indexed by the TOMOYO_PREF_* constant, i.e. the same index that selects
 * the matching slot of a profile's pref[] array in tomoyo_assign_profile().
 */
static const char * const tomoyo_pref_keywords[TOMOYO_MAX_PREF] = {
        [TOMOYO_PREF_MAX_AUDIT_LOG]      = "max_audit_log",
        [TOMOYO_PREF_MAX_LEARNING_ENTRY] = "max_learning_entry",
};

/*
 * String table for path operation.
 *
 * Indexed by the TOMOYO_TYPE_* path operation constant. Any slot below
 * TOMOYO_MAX_PATH_OPERATION without a designated initializer is NULL.
 */
const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION] = {
        [TOMOYO_TYPE_EXECUTE]    = "execute",
        [TOMOYO_TYPE_READ]       = "read",
        [TOMOYO_TYPE_WRITE]      = "write",
        [TOMOYO_TYPE_APPEND]     = "append",
        [TOMOYO_TYPE_UNLINK]     = "unlink",
        [TOMOYO_TYPE_GETATTR]    = "getattr",
        [TOMOYO_TYPE_RMDIR]      = "rmdir",
        [TOMOYO_TYPE_TRUNCATE]   = "truncate",
        [TOMOYO_TYPE_SYMLINK]    = "symlink",
        [TOMOYO_TYPE_CHROOT]     = "chroot",
        /* Note: the keyword is spelled "unmount", not "umount". */
        [TOMOYO_TYPE_UMOUNT]     = "unmount",
};

/*
 * String table for socket's operation.
 *
 * Indexed by the TOMOYO_NETWORK_* operation constant.
 */
const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION] = {
        [TOMOYO_NETWORK_BIND]    = "bind",
        [TOMOYO_NETWORK_LISTEN]  = "listen",
        [TOMOYO_NETWORK_CONNECT] = "connect",
        [TOMOYO_NETWORK_SEND]    = "send",
};

/*
 * String table for categories.
 *
 * Indexed by the TOMOYO_MAC_CATEGORY_* constant. The same three names also
 * appear at the tail of tomoyo_mac_keywords[].
 */
static const char * const tomoyo_category_keywords
[TOMOYO_MAX_MAC_CATEGORY_INDEX] = {
        [TOMOYO_MAC_CATEGORY_FILE]    = "file",
        [TOMOYO_MAC_CATEGORY_NETWORK] = "network",
        [TOMOYO_MAC_CATEGORY_MISC]    = "misc",
};

/*
 * Permit policy management by non-root user?
 *
 * File-scope flag; having no initializer, it starts out false.
 */
static bool tomoyo_manage_by_non_root;

/* Utility functions. */

/**
 * tomoyo_addprintf - Append formatted text to a string, strncat()-style.
 *
 * @buffer: Destination string. Must already be '\0'-terminated.
 * @len:    Total size of @buffer in bytes.
 * @fmt:    The printf()'s format string, followed by parameters.
 *
 * The formatted text is written starting at the current terminator of
 * @buffer. Output that does not fit in the remaining space is truncated.
 *
 * Returns nothing.
 */
__printf(3, 4)
static void tomoyo_addprintf(char *buffer, int len, const char *fmt, ...)
{
        const int used = strlen(buffer);
        char *tail = buffer + used;
        va_list ap;

        va_start(ap, fmt);
        /* One byte beyond the terminator written by vsnprintf() stays free. */
        vsnprintf(tail, len - used - 1, fmt, ap);
        va_end(ap);
}

/**
 * tomoyo_flush - Flush queued string to userspace's buffer.
 *
 * @head:   Pointer to "struct tomoyo_io_buffer".
 *
 * Drains the queue of string pointers in @head->r.w[] (@head->r.w_pos
 * entries, oldest at index 0) into @head->read_user_buf, consuming
 * @head->read_user_buf_avail as it goes. A string that only partly fits is
 * copied as far as possible and its queue slot is advanced past the copied
 * part, so a later call resumes where this one stopped.
 *
 * Returns true if all data was flushed, false otherwise (user buffer
 * exhausted or copy_to_user() failed).
 */
static bool tomoyo_flush(struct tomoyo_io_buffer *head)
{
        while (head->r.w_pos) {
                const char *w = head->r.w[0];
                size_t len = strlen(w);

                if (len) {
                        /* Copy no more than the user buffer can still take. */
                        if (len > head->read_user_buf_avail)
                                len = head->read_user_buf_avail;
                        if (!len)
                                return false;
                        if (copy_to_user(head->read_user_buf, w, len))
                                return false;
                        head->read_user_buf_avail -= len;
                        head->read_user_buf += len;
                        w += len;
                }
                /* Remember how far this string has been copied. */
                head->r.w[0] = w;
                /* Part of the string is still pending: stop here. */
                if (*w)
                        return false;
                /* Add '\0' for audit logs and query. */
                if (head->poll) {
                        if (!head->read_user_buf_avail ||
                            copy_to_user(head->read_user_buf, "", 1))
                                return false;
                        head->read_user_buf_avail--;
                        head->read_user_buf++;
                }
                /* Drop the finished entry and shift the rest down by one. */
                head->r.w_pos--;
                for (len = 0; len < head->r.w_pos; len++)
                        head->r.w[len] = head->r.w[len + 1];
        }
        /* Queue is empty: reset the fill offset used by tomoyo_io_printf(). */
        head->r.avail = 0;
        return true;
}

/**
 * tomoyo_set_string - Queue string to "struct tomoyo_io_buffer" structure.
 *
 * @head:   Pointer to "struct tomoyo_io_buffer".
 * @string: String to print.
 *
 * Appends @string to the read queue of @head and immediately tries to flush
 * the queue. If the queue already holds TOMOYO_MAX_IO_READ_QUEUE entries,
 * @string is dropped and a warning is raised.
 *
 * Note that @string has to be kept valid until @head is kfree()d.
 * This means that char[] allocated on stack memory cannot be passed to
 * this function. Use tomoyo_io_printf() for char[] allocated on stack memory.
 */
static void tomoyo_set_string(struct tomoyo_io_buffer *head, const char *string)
{
        /* No free slot left in the queue. */
        if (head->r.w_pos >= TOMOYO_MAX_IO_READ_QUEUE) {
                WARN_ON(1);
                return;
        }
        head->r.w[head->r.w_pos] = string;
        head->r.w_pos++;
        tomoyo_flush(head);
}

static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt,
                             ...) __printf(2, 3);

/**
 * tomoyo_io_printf - printf() to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @fmt:  The printf()'s format string, followed by parameters.
 */
static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt,
                             ...)
{
        va_list args;
        size_t len;
        size_t pos = head->r.avail;
        int size = head->readbuf_size - pos;

        if (size <= 0)
                return;
        va_start(args, fmt);
        len = vsnprintf(head->read_buf + pos, size, fmt, args) + 1;
        va_end(args);
        if (pos + len >= head->readbuf_size) {
                WARN_ON(1);
                return;
        }
        head->r.avail += len;
        tomoyo_set_string(head, head->read_buf + pos);
}

/**
 * tomoyo_set_space - Put a space to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Queues the string literal " " through tomoyo_set_string().
 *
 * Returns nothing.
 */
static void tomoyo_set_space(struct tomoyo_io_buffer *head)
{
        tomoyo_set_string(head, " ");
}

/**
 * tomoyo_set_lf - Put a line feed to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Queues "\n" through tomoyo_set_string(), which also attempts a flush.
 *
 * Returns true if no string is left queued in @head afterwards
 * (@head->r.w_pos is zero), false otherwise.
 */
static bool tomoyo_set_lf(struct tomoyo_io_buffer *head)
{
        tomoyo_set_string(head, "\n");
        return !head->r.w_pos;
}

/**
 * tomoyo_set_slash - Put a slash to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Queues the string literal "/" through tomoyo_set_string().
 *
 * Returns nothing.
 */
static void tomoyo_set_slash(struct tomoyo_io_buffer *head)
{
        tomoyo_set_string(head, "/");
}

/*
 * List of namespaces.
 * tomoyo_init_policy_namespace() appends every namespace it initializes.
 */
LIST_HEAD(tomoyo_namespace_list);
/*
 * True if namespace other than tomoyo_kernel_namespace is defined.
 * Set by tomoyo_init_policy_namespace(); while false,
 * tomoyo_print_namespace() prints nothing.
 */
static bool tomoyo_namespace_enabled;

/**
 * tomoyo_init_policy_namespace - Initialize namespace.
 *
 * @ns: Pointer to "struct tomoyo_policy_namespace".
 *
 * Initializes every list head embedded in @ns, stamps the profile version
 * and finally links @ns onto tomoyo_namespace_list. Namespace printing is
 * enabled as soon as the list was already non-empty before @ns was added.
 *
 * Returns nothing.
 */
void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns)
{
        unsigned int i;

        ns->profile_version = 20150505;

        i = 0;
        while (i < TOMOYO_MAX_ACL_GROUPS) {
                INIT_LIST_HEAD(&ns->acl_group[i]);
                i++;
        }
        i = 0;
        while (i < TOMOYO_MAX_GROUP) {
                INIT_LIST_HEAD(&ns->group_list[i]);
                i++;
        }
        i = 0;
        while (i < TOMOYO_MAX_POLICY) {
                INIT_LIST_HEAD(&ns->policy_list[i]);
                i++;
        }

        /* Must be evaluated before @ns itself joins the list. */
        tomoyo_namespace_enabled = !list_empty(&tomoyo_namespace_list);
        /* Link @ns only after it is fully initialized. */
        list_add_tail_rcu(&ns->namespace_list, &tomoyo_namespace_list);
}

/**
 * tomoyo_print_namespace - Print namespace header.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns nothing.
 */
static void tomoyo_print_namespace(struct tomoyo_io_buffer *head)
{
        if (!tomoyo_namespace_enabled)
                return;
        tomoyo_set_string(head,
                          container_of(head->r.ns,
                                       struct tomoyo_policy_namespace,
                                       namespace_list)->name);
        tomoyo_set_space(head);
}

/**
 * tomoyo_print_name_union - Print a tomoyo_name_union.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr:  Pointer to "struct tomoyo_name_union".
 *
 * Queues a leading space, then either "@" followed by the group name (when
 * @ptr refers to a group) or the plain filename.
 *
 * Returns nothing.
 */
static void tomoyo_print_name_union(struct tomoyo_io_buffer *head,
                                    const struct tomoyo_name_union *ptr)
{
        tomoyo_set_space(head);
        if (!ptr->group) {
                tomoyo_set_string(head, ptr->filename->name);
                return;
        }
        tomoyo_set_string(head, "@");
        tomoyo_set_string(head, ptr->group->group_name->name);
}

/**
 * tomoyo_print_name_union_quoted - Print a tomoyo_name_union with a quote.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr:  Pointer to "struct tomoyo_name_union".
 *
 * A group reference is queued as "@" followed by the group name, without
 * quotes. A plain filename is queued enclosed in double quotes. No leading
 * space is emitted.
 *
 * Returns nothing.
 */
static void tomoyo_print_name_union_quoted(struct tomoyo_io_buffer *head,
                                           const struct tomoyo_name_union *ptr)
{
        if (!ptr->group) {
                tomoyo_set_string(head, "\"");
                tomoyo_set_string(head, ptr->filename->name);
                tomoyo_set_string(head, "\"");
                return;
        }
        tomoyo_set_string(head, "@");
        tomoyo_set_string(head, ptr->group->group_name->name);
}

/**
 * tomoyo_print_number_union_nospace - Print a tomoyo_number_union without a space.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr:  Pointer to "struct tomoyo_number_union".
 *
 * A group reference is queued as "@" followed by the group name. Otherwise
 * the number is formatted according to its value type (hexadecimal with a
 * "0x" prefix, octal with a "0" prefix, decimal for anything else). When the
 * two ends differ in value or in value type, both are printed as "min-max".
 *
 * Returns nothing.
 */
static void tomoyo_print_number_union_nospace
(struct tomoyo_io_buffer *head, const struct tomoyo_number_union *ptr)
{
        char buffer[128];
        unsigned long value[2];
        u8 type[2];
        int last;
        int idx;

        if (ptr->group) {
                tomoyo_set_string(head, "@");
                tomoyo_set_string(head, ptr->group->group_name->name);
                return;
        }
        value[0] = ptr->values[0];
        value[1] = ptr->values[1];
        type[0] = ptr->value_type[0];
        type[1] = ptr->value_type[1];
        /* Only the first end is printed when both ends look the same. */
        last = (value[0] == value[1] && type[0] == type[1]) ? 0 : 1;

        buffer[0] = '\0';
        for (idx = 0; idx <= last; idx++) {
                if (idx)
                        tomoyo_addprintf(buffer, sizeof(buffer), "-");
                switch (type[idx]) {
                case TOMOYO_VALUE_TYPE_HEXADECIMAL:
                        tomoyo_addprintf(buffer, sizeof(buffer),
                                         "0x%lX", value[idx]);
                        break;
                case TOMOYO_VALUE_TYPE_OCTAL:
                        tomoyo_addprintf(buffer, sizeof(buffer),
                                         "0%lo", value[idx]);
                        break;
                default:
                        tomoyo_addprintf(buffer, sizeof(buffer), "%lu",
                                         value[idx]);
                        break;
                }
        }
        /* buffer is on the stack, so it must go through tomoyo_io_printf(). */
        tomoyo_io_printf(head, "%s", buffer);
}

/**
 * tomoyo_print_number_union - Print a tomoyo_number_union.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr:  Pointer to "struct tomoyo_number_union".
 *
 * Queues a leading space and then prints @ptr exactly as
 * tomoyo_print_number_union_nospace() does.
 *
 * Returns nothing.
 */
static void tomoyo_print_number_union(struct tomoyo_io_buffer *head,
                                      const struct tomoyo_number_union *ptr)
{
        tomoyo_set_space(head);
        tomoyo_print_number_union_nospace(head, ptr);
}

/**
 * tomoyo_assign_profile - Create a new profile.
 *
 * @ns:      Pointer to "struct tomoyo_policy_namespace".
 * @profile: Profile number to create.
 *
 * If the slot for @profile is already populated, that profile is returned.
 * Otherwise a zeroed profile is allocated before tomoyo_policy_lock is
 * taken, the slot is re-checked under the lock, and the new profile is
 * filled with defaults and published only if the slot is still empty and
 * tomoyo_memory_ok() accepts the allocation. An allocation that ends up
 * unused is freed.
 *
 * Returns pointer to "struct tomoyo_profile" on success, NULL otherwise
 * (@profile out of range, lock wait interrupted, or allocation rejected).
 */
static struct tomoyo_profile *tomoyo_assign_profile
(struct tomoyo_policy_namespace *ns, const unsigned int profile)
{
        struct tomoyo_profile *found;
        struct tomoyo_profile *fresh;

        if (profile >= TOMOYO_MAX_PROFILES)
                return NULL;
        found = ns->profile_ptr[profile];
        if (found)
                return found;

        fresh = kzalloc(sizeof(*fresh), GFP_NOFS | __GFP_NOWARN);
        if (mutex_lock_interruptible(&tomoyo_policy_lock)) {
                /* Interrupted while waiting: found is still NULL here. */
                kfree(fresh);
                return found;
        }
        /* Re-read under the lock; the slot may have been filled meanwhile. */
        found = ns->profile_ptr[profile];
        if (!found && tomoyo_memory_ok(fresh)) {
                fresh->default_config = TOMOYO_CONFIG_DISABLED |
                        TOMOYO_CONFIG_WANT_GRANT_LOG |
                        TOMOYO_CONFIG_WANT_REJECT_LOG;
                memset(fresh->config, TOMOYO_CONFIG_USE_DEFAULT,
                       sizeof(fresh->config));
                fresh->pref[TOMOYO_PREF_MAX_AUDIT_LOG] =
                        CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG;
                fresh->pref[TOMOYO_PREF_MAX_LEARNING_ENTRY] =
                        CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY;
                mb(); /* Avoid out-of-order execution. */
                ns->profile_ptr[profile] = fresh;
                found = fresh;
                /* Ownership moved to @ns; keep kfree() below a no-op. */
                fresh = NULL;
        }
        mutex_unlock(&tomoyo_policy_lock);
        kfree(fresh);
        return found;
}

/**
 * tomoyo_profile - Find a profile.
 *
 * @ns:      Pointer to "struct tomoyo_policy_namespace".
 * @profile: Profile number to find.
 *
 * When the requested slot in @ns->profile_ptr[] is empty, a function-local
 * static (zero-initialized) profile is returned instead, so the result is
 * never NULL.
 *
 * Returns pointer to "struct tomoyo_profile".
 */
struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns,
                                      const u8 profile)
{
        static struct tomoyo_profile tomoyo_null_profile;
        struct tomoyo_profile *slot = ns->profile_ptr[profile];

        return slot ? slot : &tomoyo_null_profile;
}

/**
 * tomoyo_find_yesno - Find values for specified keyword.
 *
 * @string: String to check.
 * @find:   Name of keyword.
 *
 * Only the first occurrence of @find in @string is examined; the text
 * immediately following it must start with "=yes" or "=no".
 *
 * Returns 1 if "@find=yes" was found, 0 if "@find=no" was found, -1 otherwise.
 */
static s8 tomoyo_find_yesno(const char *string, const char *find)
{
        const char *tail = strstr(string, find);

        if (!tail)
                return -1;

        /* Step over the keyword so that tail points at the "=value" part. */
        tail += strlen(find);
        if (strncmp(tail, "=yes", 4) == 0)
                return 1;
        if (strncmp(tail, "=no", 3) == 0)
                return 0;
        return -1;
}

/**
 * tomoyo_set_uint - Set value for specified preference.
 *
 * @i:      Pointer to "unsigned int".
 * @string: String to check.
 * @find:   Name of keyword.
 *
 * Looks for the first occurrence of @find in @string and, if it is followed
 * by "=<unsigned decimal>", stores that number in @i.  @i is left untouched
 * when the keyword is absent or not followed by such a value.
 *
 * Returns nothing.
 */
static void tomoyo_set_uint(unsigned int *i, const char *string,
                            const char *find)
{
        const char *match = strstr(string, find);

        if (!match)
                return;

        /* Parse the text right after the keyword; failure leaves *i as is. */
        match += strlen(find);
        sscanf(match, "=%u", i);
}

/**
 * tomoyo_set_mode - Set mode for specified profile.
 *
 * @name:    Name of functionality.
 * @value:   Mode for @name.
 * @profile: Pointer to "struct tomoyo_profile".
 *
 * @name is either exactly "CONFIG", which selects @profile->default_config,
 * or starts with "CONFIG::", which selects one entry of @profile->config[]
 * by matching the remainder against "<category>::<keyword>" (indexes below
 * TOMOYO_MAX_MAC_INDEX) or a bare "<keyword>" (the remaining indexes).
 * @value is scanned for "use_default", for the mode names in tomoyo_mode[]
 * and for "grant_log=yes|no" / "reject_log=yes|no".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_set_mode(char *name, const char *value,
                           struct tomoyo_profile *profile)
{
        u8 i;
        u8 config;

        if (!strcmp(name, "CONFIG")) {
                /*
                 * One-past-the-end index is used as a marker meaning
                 * "operate on default_config" (see the final store below).
                 */
                i = TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX;
                config = profile->default_config;
        } else if (tomoyo_str_starts(&name, "CONFIG::")) {
                config = 0;
                for (i = 0; i < TOMOYO_MAX_MAC_INDEX
                             + TOMOYO_MAX_MAC_CATEGORY_INDEX; i++) {
                        int len = 0;

                        if (i < TOMOYO_MAX_MAC_INDEX) {
                                const u8 c = tomoyo_index2category[i];
                                const char *category =
                                        tomoyo_category_keywords[c];

                                len = strlen(category);
                                /*
                                 * Require "<category>::"; each len++ steps
                                 * over one ':' so that name + len ends up at
                                 * the keyword part.
                                 */
                                if (strncmp(name, category, len) ||
                                    name[len++] != ':' || name[len++] != ':')
                                        continue;
                        }
                        if (strcmp(name + len, tomoyo_mac_keywords[i]))
                                continue;
                        /* Start from the current setting of this entry. */
                        config = profile->config[i];
                        break;
                }
                /* Loop ran to completion: no entry matched @name. */
                if (i == TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX)
                        return -EINVAL;
        } else {
                return -EINVAL;
        }
        if (strstr(value, "use_default")) {
                config = TOMOYO_CONFIG_USE_DEFAULT;
        } else {
                u8 mode;

                /*
                 * Every mode name is tested and there is no break, so when
                 * several names occur in @value the highest index wins.
                 */
                for (mode = 0; mode < 4; mode++)
                        if (strstr(value, tomoyo_mode[mode]))
                                /*
                                 * Update lower 3 bits in order to distinguish
                                 * 'config' from 'TOMOYO_CONFIG_USE_DEFAULT'.
                                 */
                                config = (config & ~7) | mode;
                /* Log flags are only adjusted on a concrete (non-default) value. */
                if (config != TOMOYO_CONFIG_USE_DEFAULT) {
                        /* A result of -1 (keyword absent) leaves the bit unchanged. */
                        switch (tomoyo_find_yesno(value, "grant_log")) {
                        case 1:
                                config |= TOMOYO_CONFIG_WANT_GRANT_LOG;
                                break;
                        case 0:
                                config &= ~TOMOYO_CONFIG_WANT_GRANT_LOG;
                                break;
                        }
                        switch (tomoyo_find_yesno(value, "reject_log")) {
                        case 1:
                                config |= TOMOYO_CONFIG_WANT_REJECT_LOG;
                                break;
                        case 0:
                                config &= ~TOMOYO_CONFIG_WANT_REJECT_LOG;
                                break;
                        }
                }
        }
        /*
         * Per-entry settings are always stored; the profile-wide default is
         * never overwritten with TOMOYO_CONFIG_USE_DEFAULT.
         */
        if (i < TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX)
                profile->config[i] = config;
        else if (config != TOMOYO_CONFIG_USE_DEFAULT)
                profile->default_config = config;
        return 0;
}

/**
 * tomoyo_write_profile - Write profile table.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_write_profile(struct tomoyo_io_buffer *head)
{
        char *data = head->write_buf;
        unsigned int i;
        char *cp;
        struct tomoyo_profile *profile;

        if (sscanf(data, "PROFILE_VERSION=%u", &head->w.ns->profile_version)
            == 1)
                return 0;
        i = simple_strtoul(data, &cp, 10);
        if (*cp != '-')
                return -EINVAL;
        data = cp + 1;
        profile = tomoyo_assign_profile(head->w.ns, i);
        if (!profile)
                return -EINVAL;
        cp = strchr(data, '=');
        if (!cp)
                return -EINVAL;
        *cp++ = '\0';
        if (!strcmp(data, "COMMENT")) {
                static DEFINE_SPINLOCK(lock);
                const struct tomoyo_path_info *new_comment
                        = tomoyo_get_name(cp);
                const struct tomoyo_path_info *old_comment;

                if (!new_comment)
                        return -ENOMEM;
                spin_lock(&lock);
                old_comment = profile->comment;
                profile->comment = new_comment;
                spin_unlock(&lock);
                tomoyo_put_name(old_comment);
                return 0;
        }
        if (!strcmp(data, "PREFERENCE")) {
                for (i = 0; i < TOMOYO_MAX_PREF; i++)
                        tomoyo_set_uint(&profile->pref[i], cp,
                                        tomoyo_pref_keywords[i]);
                return 0;
        }
        return tomoyo_set_mode(data, cp, profile);
}

/**
 * tomoyo_print_config - Print mode for specified functionality.
 *
 * @head:   Pointer to "struct tomoyo_io_buffer".
 * @config: Mode for that functionality.
 *
 * The low two bits of @config index tomoyo_mode[]; the grant/reject log
 * flags are rendered through str_yes_no().
 *
 * Returns nothing.
 *
 * Caller prints functionality's name.
 */
static void tomoyo_print_config(struct tomoyo_io_buffer *head, const u8 config)
{
        const char *mode = tomoyo_mode[config & 3];
        const char *grant_log =
                str_yes_no(config & TOMOYO_CONFIG_WANT_GRANT_LOG);
        const char *reject_log =
                str_yes_no(config & TOMOYO_CONFIG_WANT_REJECT_LOG);

        tomoyo_io_printf(head, "={ mode=%s grant_log=%s reject_log=%s }\n",
                         mode, grant_log, reject_log);
}

/**
 * tomoyo_read_profile - Read profile table.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Resumable printer driven by @head->r.step, @head->r.index (profile number)
 * and @head->r.bit (config entry).  One step is executed per pass, and the
 * function keeps going for as long as tomoyo_flush() returns true.
 *
 * Returns nothing.
 */
static void tomoyo_read_profile(struct tomoyo_io_buffer *head)
{
        u8 index;
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        const struct tomoyo_profile *profile;

        if (head->r.eof)
                return;
 next:
        index = head->r.index;
        /*
         * May be NULL here; it is only dereferenced in steps 2-4, which are
         * entered after step 1 has stopped on a non-NULL slot.
         */
        profile = ns->profile_ptr[index];
        switch (head->r.step) {
        case 0:
                /* Header line, printed once. */
                tomoyo_print_namespace(head);
                tomoyo_io_printf(head, "PROFILE_VERSION=%u\n",
                                 ns->profile_version);
                head->r.step++;
                break;
        case 1:
                /* Advance to the next existing profile, or finish. */
                for ( ; head->r.index < TOMOYO_MAX_PROFILES;
                      head->r.index++)
                        if (ns->profile_ptr[head->r.index])
                                break;
                if (head->r.index == TOMOYO_MAX_PROFILES) {
                        head->r.eof = true;
                        return;
                }
                head->r.step++;
                break;
        case 2:
                /* "<n>-COMMENT=" and "<n>-PREFERENCE={ ... }" lines. */
                {
                        u8 i;
                        const struct tomoyo_path_info *comment =
                                profile->comment;

                        tomoyo_print_namespace(head);
                        tomoyo_io_printf(head, "%u-COMMENT=", index);
                        tomoyo_set_string(head, comment ? comment->name : "");
                        tomoyo_set_lf(head);
                        tomoyo_print_namespace(head);
                        tomoyo_io_printf(head, "%u-PREFERENCE={ ", index);
                        for (i = 0; i < TOMOYO_MAX_PREF; i++)
                                tomoyo_io_printf(head, "%s=%u ",
                                                 tomoyo_pref_keywords[i],
                                                 profile->pref[i]);
                        tomoyo_set_string(head, "}\n");
                        head->r.step++;
                }
                break;
        case 3:
                /* Profile-wide default, then restart the per-entry cursor. */
                {
                        tomoyo_print_namespace(head);
                        tomoyo_io_printf(head, "%u-%s", index, "CONFIG");
                        tomoyo_print_config(head, profile->default_config);
                        head->r.bit = 0;
                        head->r.step++;
                }
                break;
        case 4:
                /*
                 * Print at most one per-entry line per pass; entries equal to
                 * TOMOYO_CONFIG_USE_DEFAULT are skipped.
                 */
                for ( ; head->r.bit < TOMOYO_MAX_MAC_INDEX
                              + TOMOYO_MAX_MAC_CATEGORY_INDEX; head->r.bit++) {
                        const u8 i = head->r.bit;
                        const u8 config = profile->config[i];

                        if (config == TOMOYO_CONFIG_USE_DEFAULT)
                                continue;
                        tomoyo_print_namespace(head);
                        if (i < TOMOYO_MAX_MAC_INDEX)
                                tomoyo_io_printf(head, "%u-CONFIG::%s::%s",
                                                 index,
                                                 tomoyo_category_keywords
                                                 [tomoyo_index2category[i]],
                                                 tomoyo_mac_keywords[i]);
                        else
                                tomoyo_io_printf(head, "%u-CONFIG::%s", index,
                                                 tomoyo_mac_keywords[i]);
                        tomoyo_print_config(head, config);
                        /* The break below skips the loop's own increment. */
                        head->r.bit++;
                        break;
                }
                /* All entries done: move on to the next profile via step 1. */
                if (head->r.bit == TOMOYO_MAX_MAC_INDEX
                    + TOMOYO_MAX_MAC_CATEGORY_INDEX) {
                        head->r.index++;
                        head->r.step = 1;
                }
                break;
        }
        if (tomoyo_flush(head))
                goto next;
}

/**
 * tomoyo_same_manager - Check for duplicated "struct tomoyo_manager" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_head".
 * @b: Pointer to "struct tomoyo_acl_head".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_manager(const struct tomoyo_acl_head *a,
                                const struct tomoyo_acl_head *b)
{
        return container_of(a, struct tomoyo_manager, head)->manager ==
                container_of(b, struct tomoyo_manager, head)->manager;
}

/**
 * tomoyo_update_manager_entry - Add or delete a manager entry.
 *
 * @manager:   The path to manager or the domainname.
 * @is_delete: True if it is a delete request.
 *
 * @manager must satisfy tomoyo_correct_domain() or tomoyo_correct_word().
 * The entry is applied to the TOMOYO_ID_MANAGER list of
 * tomoyo_kernel_namespace.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_manager_entry(const char *manager,
                                       const bool is_delete)
{
        struct tomoyo_manager e = { };
        struct tomoyo_acl_param param = {
                /* .ns = &tomoyo_kernel_namespace, */
                .is_delete = is_delete,
                .list = &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER],
        };
        int error;

        if (!tomoyo_correct_domain(manager) &&
            !tomoyo_correct_word(manager))
                return -EINVAL;

        e.manager = tomoyo_get_name(manager);
        if (!e.manager)
                return is_delete ? -ENOENT : -ENOMEM;

        error = tomoyo_update_policy(&e.head, sizeof(e), &param,
                                     tomoyo_same_manager);
        /* Drop the reference obtained by tomoyo_get_name() above. */
        tomoyo_put_name(e.manager);
        return error;
}

/**
 * tomoyo_write_manager - Write manager policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * The literal line "manage_by_non_root" toggles tomoyo_manage_by_non_root
 * (set on add, cleared on delete); any other line is treated as a manager
 * entry.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_manager(struct tomoyo_io_buffer *head)
{
        char *line = head->write_buf;

        if (strcmp(line, "manage_by_non_root") != 0)
                return tomoyo_update_manager_entry(line, head->w.is_delete);

        tomoyo_manage_by_non_root = !head->w.is_delete;
        return 0;
}

/**
 * tomoyo_read_manager - Read manager policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Walks the TOMOYO_ID_MANAGER list of tomoyo_kernel_namespace using
 * @head->r.acl as the resume cookie and prints one manager name per line.
 * Returns early, without setting eof, when tomoyo_flush() fails.
 *
 * Returns nothing.
 *
 * Caller holds tomoyo_read_lock().
 */
static void tomoyo_read_manager(struct tomoyo_io_buffer *head)
{
        struct list_head *managers =
                &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER];

        if (head->r.eof)
                return;
        list_for_each_cookie(head->r.acl, managers) {
                struct tomoyo_manager *entry =
                        list_entry(head->r.acl, typeof(*entry), head.list);

                if (entry->head.is_deleted)
                        continue;
                /* Stop here; head->r.acl remembers where to resume. */
                if (!tomoyo_flush(head))
                        return;
                tomoyo_set_string(head, entry->manager->name);
                tomoyo_set_lf(head);
        }
        head->r.eof = true;
}

/**
 * tomoyo_manager - Check whether the current process is a policy manager.
 *
 * Before the policy is loaded everybody is accepted.  Afterwards, unless
 * tomoyo_manage_by_non_root is set, both uid and euid must be root, and the
 * current domainname or executable must match a non-deleted manager entry
 * (or CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING must be enabled).
 *
 * Returns true if the current process is permitted to modify policy
 * via /sys/kernel/security/tomoyo/ interface.
 *
 * Caller holds tomoyo_read_lock().
 */
static bool tomoyo_manager(void)
{
        struct list_head *managers =
                &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER];
        struct tomoyo_manager *entry;
        const char *exe;
        const struct task_struct *task = current;
        const struct tomoyo_path_info *domainname = tomoyo_domain()->domainname;
        bool found = IS_ENABLED(CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING);

        if (!tomoyo_policy_loaded)
                return true;
        if (!tomoyo_manage_by_non_root) {
                const bool is_root =
                        uid_eq(task->cred->uid,  GLOBAL_ROOT_UID) &&
                        uid_eq(task->cred->euid, GLOBAL_ROOT_UID);

                if (!is_root)
                        return false;
        }
        exe = tomoyo_get_exe();
        if (!exe)
                return false;
        list_for_each_entry_rcu(entry, managers, head.list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                if (entry->head.is_deleted)
                        continue;
                /* Neither the domainname nor the executable matches. */
                if (tomoyo_pathcmp(domainname, entry->manager) &&
                    strcmp(exe, entry->manager->name))
                        continue;
                found = true;
                break;
        }
        if (!found) { /* Reduce error messages. */
                static pid_t last_pid;
                const pid_t pid = current->pid;

                /* Warn only when the pid differs from the last one reported. */
                if (last_pid != pid) {
                        pr_warn("%s ( %s ) is not permitted to update policies.\n",
                                domainname->name, exe);
                        last_pid = pid;
                }
        }
        kfree(exe);
        return found;
}

/*
 * Forward declaration: tomoyo_select_domain() below calls this for the
 * "select Q=<serial>" form before the definition is visible.
 */
static struct tomoyo_domain_info *tomoyo_find_domain_by_qid
(unsigned int serial);

/**
 * tomoyo_select_domain - Parse select command.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @data: String to parse.
 *
 * Recognized forms after the "select " prefix are "pid=<n>",
 * "global-pid=<n>", "domain=<name>" and "Q=<n>".  The resolved domain (which
 * may be NULL) is stored in @head->w.domain, and, when a read buffer exists,
 * the read state is reset so that only that domain is printed.
 *
 * Returns true on success, false otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static bool tomoyo_select_domain(struct tomoyo_io_buffer *head,
                                 const char *data)
{
        unsigned int pid;
        struct tomoyo_domain_info *domain = NULL;
        bool global_pid = false;

        if (strncmp(data, "select ", 7))
                return false;
        data += 7;
        /*
         * The comma expression runs only when the "pid=" scan failed; it
         * flips global_pid before trying the "global-pid=" form.  If that
         * scan fails too, global_pid stays true but is not used again.
         */
        if (sscanf(data, "pid=%u", &pid) == 1 ||
            (global_pid = true, sscanf(data, "global-pid=%u", &pid) == 1)) {
                struct task_struct *p;

                rcu_read_lock();
                if (global_pid)
                        p = find_task_by_pid_ns(pid, &init_pid_ns);
                else
                        p = find_task_by_vpid(pid);
                if (p)
                        domain = tomoyo_task(p)->domain_info;
                rcu_read_unlock();
        } else if (!strncmp(data, "domain=", 7)) {
                if (tomoyo_domain_def(data + 7))
                        domain = tomoyo_find_domain(data + 7);
        } else if (sscanf(data, "Q=%u", &pid) == 1) {
                /* "pid" is reused here to carry the serial number. */
                domain = tomoyo_find_domain_by_qid(pid);
        } else
                return false;
        head->w.domain = domain;
        /* Accessing read_buf is safe because head->io_sem is held. */
        if (!head->read_buf)
                return true; /* Do nothing if open(O_WRONLY). */
        /* Restart reading from scratch, restricted to the selected domain. */
        memset(&head->r, 0, sizeof(head->r));
        head->r.print_this_domain_only = true;
        if (domain)
                head->r.domain = &domain->list;
        else
                head->r.eof = true;
        tomoyo_io_printf(head, "# select %s\n", data);
        if (domain && domain->is_deleted)
                tomoyo_io_printf(head, "# This is a deleted domain.\n");
        return true;
}

/**
 * tomoyo_same_task_acl - Check for duplicated "struct tomoyo_task_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_task_acl(const struct tomoyo_acl_info *a,
                                 const struct tomoyo_acl_info *b)
{
        const struct tomoyo_task_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_task_acl *p2 = container_of(b, typeof(*p2), head);

        return p1->domainname == p2->domainname;
}

/**
 * tomoyo_write_task - Update task related list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Only the "manual_domain_transition <domainname>" directive is handled;
 * anything else, or a domainname that cannot be obtained, yields -EINVAL.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_task(struct tomoyo_acl_param *param)
{
        struct tomoyo_task_acl e = {
                .head.type = TOMOYO_TYPE_MANUAL_TASK_ACL,
        };
        int error = -EINVAL;

        if (!tomoyo_str_starts(&param->data, "manual_domain_transition "))
                return error;

        e.domainname = tomoyo_get_domainname(param);
        if (e.domainname)
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_task_acl,
                                             NULL);
        /* Called unconditionally, as before, even when domainname is NULL. */
        tomoyo_put_name(e.domainname);
        return error;
}

/**
 * tomoyo_delete_domain - Delete a domain.
 *
 * @domainname: The name of domain.
 *
 * Marks the first live domain whose name equals @domainname as deleted.
 * tomoyo_kernel_domain is never marked.  Not finding a match is not an
 * error.
 *
 * Returns 0 on success, negative value otherwise (-EINTR when waiting for
 * tomoyo_policy_lock was interrupted).
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_delete_domain(char *domainname)
{
        struct tomoyo_domain_info *cur;
        struct tomoyo_path_info name;

        name.name = domainname;
        tomoyo_fill_path_info(&name);
        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                return -EINTR;
        /* Is there an active domain? */
        list_for_each_entry_rcu(cur, &tomoyo_domain_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                /* Never delete tomoyo_kernel_domain */
                if (cur != &tomoyo_kernel_domain &&
                    !cur->is_deleted &&
                    !tomoyo_pathcmp(cur->domainname, &name)) {
                        cur->is_deleted = true;
                        break;
                }
        }
        mutex_unlock(&tomoyo_policy_lock);
        return 0;
}

/**
 * tomoyo_write_domain2 - Write domain policy.
 *
 * @ns:        Pointer to "struct tomoyo_policy_namespace".
 * @list:      Pointer to "struct list_head".
 * @data:      Policy to be interpreted.
 * @is_delete: True if it is a delete request.
 *
 * Dispatches on the leading keyword of @data ("file ", "network inet ",
 * "network unix ", "misc " or "task ") to the matching writer.  The first
 * keyword that matches wins.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_domain2(struct tomoyo_policy_namespace *ns,
                                struct list_head *list, char *data,
                                const bool is_delete)
{
        static const struct {
                const char *keyword;
                int (*write)(struct tomoyo_acl_param *param);
        } handlers[5] = {
                { "file ", tomoyo_write_file },
                { "network inet ", tomoyo_write_inet_network },
                { "network unix ", tomoyo_write_unix_network },
                { "misc ", tomoyo_write_misc },
                { "task ", tomoyo_write_task },
        };
        struct tomoyo_acl_param param = {
                .ns = ns,
                .list = list,
                .data = data,
                .is_delete = is_delete,
        };
        u8 n;

        for (n = 0; n < ARRAY_SIZE(handlers); n++)
                if (tomoyo_str_starts(&param.data, handlers[n].keyword))
                        return handlers[n].write(&param);
        return -EINVAL;
}

/*
 * String table for domain flags, indexed by TOMOYO_DIF_*.
 *
 * Every entry carries a trailing "\n"; tomoyo_write_domain() compares
 * against strlen() - 1 bytes, i.e. without that newline.
 */
const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS] = {
        [TOMOYO_DIF_QUOTA_WARNED]      = "quota_exceeded\n",
        [TOMOYO_DIF_TRANSITION_FAILED] = "transition_failed\n",
};

/**
 * tomoyo_write_domain - Write domain policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_domain(struct tomoyo_io_buffer *head)
{
        char *data = head->write_buf;
        struct tomoyo_policy_namespace *ns;
        struct tomoyo_domain_info *domain = head->w.domain;
        const bool is_delete = head->w.is_delete;
        bool is_select = !is_delete && tomoyo_str_starts(&data, "select ");
        unsigned int idx;

        if (*data == '<') {
                int ret = 0;

                domain = NULL;
                if (is_delete)
                        ret = tomoyo_delete_domain(data);
                else if (is_select)
                        domain = tomoyo_find_domain(data);
                else
                        domain = tomoyo_assign_domain(data, false);
                head->w.domain = domain;
                return ret;
        }
        if (!domain)
                return -EINVAL;
        ns = domain->ns;
        if (sscanf(data, "use_profile %u", &idx) == 1
            && idx < TOMOYO_MAX_PROFILES) {
                if (!tomoyo_policy_loaded || ns->profile_ptr[idx])
                        if (!is_delete)
                                domain->profile = (u8) idx;
                return 0;
        }
        if (sscanf(data, "use_group %u\n", &idx) == 1
            && idx < TOMOYO_MAX_ACL_GROUPS) {
                if (!is_delete)
                        set_bit(idx, domain->group);
                else
                        clear_bit(idx, domain->group);
                return 0;
        }
        for (idx = 0; idx < TOMOYO_MAX_DOMAIN_INFO_FLAGS; idx++) {
                const char *cp = tomoyo_dif[idx];

                if (strncmp(data, cp, strlen(cp) - 1))
                        continue;
                domain->flags[idx] = !is_delete;
                return 0;
        }
        return tomoyo_write_domain2(ns, &domain->acl_info_list, data,
                                    is_delete);
}

/**
 * tomoyo_print_condition - Print condition part.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @cond: Pointer to "struct tomoyo_condition".
 *
 * Resumable printer: progress is kept in @head->r.cond_step and
 * @head->r.cond_index, so a call that stopped because tomoyo_flush() failed
 * can be repeated and continues from the element it stopped at.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_print_condition(struct tomoyo_io_buffer *head,
                                   const struct tomoyo_condition *cond)
{
        switch (head->r.cond_step) {
        case 0:
                /* First entry: reset the element cursor, print the transit name. */
                head->r.cond_index = 0;
                head->r.cond_step++;
                if (cond->transit) {
                        tomoyo_set_space(head);
                        tomoyo_set_string(head, cond->transit->name);
                }
                fallthrough;
        case 1:
                {
                        /*
                         * The variable-length arrays are addressed directly
                         * behind *cond, in this order: condition elements,
                         * number unions, name unions, argv entries, envp
                         * entries.
                         */
                        const u16 condc = cond->condc;
                        const struct tomoyo_condition_element *condp =
                                (typeof(condp)) (cond + 1);
                        const struct tomoyo_number_union *numbers_p =
                                (typeof(numbers_p)) (condp + condc);
                        const struct tomoyo_name_union *names_p =
                                (typeof(names_p))
                                (numbers_p + cond->numbers_count);
                        const struct tomoyo_argv *argv =
                                (typeof(argv)) (names_p + cond->names_count);
                        const struct tomoyo_envp *envp =
                                (typeof(envp)) (argv + cond->argc);
                        u16 skip;

                        /*
                         * Resume support: replay the elements that were
                         * already printed, only to advance the per-type
                         * cursors to where the previous call stopped.
                         */
                        for (skip = 0; skip < head->r.cond_index; skip++) {
                                const u8 left = condp->left;
                                const u8 right = condp->right;

                                condp++;
                                switch (left) {
                                case TOMOYO_ARGV_ENTRY:
                                        argv++;
                                        /* No right-hand operand to account for. */
                                        continue;
                                case TOMOYO_ENVP_ENTRY:
                                        envp++;
                                        continue;
                                case TOMOYO_NUMBER_UNION:
                                        numbers_p++;
                                        break;
                                }
                                switch (right) {
                                case TOMOYO_NAME_UNION:
                                        names_p++;
                                        break;
                                case TOMOYO_NUMBER_UNION:
                                        numbers_p++;
                                        break;
                                }
                        }
                        /* Print the remaining elements, one per iteration. */
                        while (head->r.cond_index < condc) {
                                const u8 match = condp->equals;
                                const u8 left = condp->left;
                                const u8 right = condp->right;

                                /* Bail out before consuming the element. */
                                if (!tomoyo_flush(head))
                                        return false;
                                condp++;
                                head->r.cond_index++;
                                tomoyo_set_space(head);
                                switch (left) {
                                case TOMOYO_ARGV_ENTRY:
                                        tomoyo_io_printf(head,
                                                         "exec.argv[%lu]%s=\"",
                                                         argv->index, argv->is_not ? "!" : "");
                                        tomoyo_set_string(head,
                                                          argv->value->name);
                                        tomoyo_set_string(head, "\"");
                                        argv++;
                                        continue;
                                case TOMOYO_ENVP_ENTRY:
                                        tomoyo_set_string(head,
                                                          "exec.envp[\"");
                                        tomoyo_set_string(head,
                                                          envp->name->name);
                                        tomoyo_io_printf(head, "\"]%s=", envp->is_not ? "!" : "");
                                        /* A missing value is printed as the bare word NULL. */
                                        if (envp->value) {
                                                tomoyo_set_string(head, "\"");
                                                tomoyo_set_string(head, envp->value->name);
                                                tomoyo_set_string(head, "\"");
                                        } else {
                                                tomoyo_set_string(head,
                                                                  "NULL");
                                        }
                                        envp++;
                                        continue;
                                case TOMOYO_NUMBER_UNION:
                                        tomoyo_print_number_union_nospace
                                                (head, numbers_p++);
                                        break;
                                default:
                                        /* Any other left operand is a keyword index. */
                                        tomoyo_set_string(head,
                                               tomoyo_condition_keyword[left]);
                                        break;
                                }
                                tomoyo_set_string(head, match ? "=" : "!=");
                                switch (right) {
                                case TOMOYO_NAME_UNION:
                                        tomoyo_print_name_union_quoted
                                                (head, names_p++);
                                        break;
                                case TOMOYO_NUMBER_UNION:
                                        tomoyo_print_number_union_nospace
                                                (head, numbers_p++);
                                        break;
                                default:
                                        tomoyo_set_string(head,
                                          tomoyo_condition_keyword[right]);
                                        break;
                                }
                        }
                }
                head->r.cond_step++;
                fallthrough;
        case 2:
                /* Flush once more before the trailer; retry later on failure. */
                if (!tomoyo_flush(head))
                        break;
                head->r.cond_step++;
                fallthrough;
        case 3:
                /* Optional " grant_log=yes|no" suffix, then end of line. */
                if (cond->grant_log != TOMOYO_GRANTLOG_AUTO)
                        tomoyo_io_printf(head, " grant_log=%s",
                                         str_yes_no(cond->grant_log ==
                                                    TOMOYO_GRANTLOG_YES));
                tomoyo_set_lf(head);
                return true;
        }
        return false;
}

/**
 * tomoyo_set_group - Print "acl_group " header keyword and category name.
 *
 * @head:     Pointer to "struct tomoyo_io_buffer".
 * @category: Category name.
 *
 * Returns nothing.
 */
static void tomoyo_set_group(struct tomoyo_io_buffer *head,
                             const char *category)
{
        if (head->type == TOMOYO_EXCEPTIONPOLICY) {
                tomoyo_print_namespace(head);
                tomoyo_io_printf(head, "acl_group %u ",
                                 head->r.acl_group_index);
        }
        tomoyo_set_string(head, category);
}

/**
 * tomoyo_print_entry - Print an ACL entry.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @acl:  Pointer to an ACL entry.
 *
 * The entry is printed in two parts: the ACL body (keywords and operands,
 * selected by @acl->type) and, if @acl->cond is set, the condition part.
 * @head->r.print_cond_part records that the body has already been emitted,
 * so that a later call for the same entry jumps straight to the condition
 * part instead of printing the body again.
 *
 * Deleted entries, entries whose permission bits select nothing, and (when
 * @head->r.print_transition_related_only is set) entries other than path
 * ACLs with the execute bit and manual task ACLs produce no output and are
 * reported as success.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
                               struct tomoyo_acl_info *acl)
{
        const u8 acl_type = acl->type;
        /* True until the category keyword has been emitted for this entry. */
        bool first = true;
        u8 bit;

        /* Resume an entry whose body was already printed by an earlier call. */
        if (head->r.print_cond_part)
                goto print_cond_part;
        if (acl->is_deleted)
                return true;
        /* Do not start a new entry while tomoyo_flush() reports failure. */
        if (!tomoyo_flush(head))
                return false;
        else if (acl_type == TOMOYO_TYPE_PATH_ACL) {
                struct tomoyo_path_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u16 perm = ptr->perm;

                /* Print every granted operation, joined by tomoyo_set_slash(). */
                for (bit = 0; bit < TOMOYO_MAX_PATH_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        /* In transition-only mode only the execute bit is shown. */
                        if (head->r.print_transition_related_only &&
                            bit != TOMOYO_TYPE_EXECUTE)
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "file ");
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_path_keyword[bit]);
                }
                /* No operation selected: nothing was printed for this entry. */
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name);
        } else if (acl_type == TOMOYO_TYPE_MANUAL_TASK_ACL) {
                struct tomoyo_task_acl *ptr =
                        container_of(acl, typeof(*ptr), head);

                tomoyo_set_group(head, "task ");
                tomoyo_set_string(head, "manual_domain_transition ");
                tomoyo_set_string(head, ptr->domainname->name);
        } else if (head->r.print_transition_related_only) {
                /* All remaining ACL types are skipped in transition-only mode. */
                return true;
        } else if (acl_type == TOMOYO_TYPE_PATH2_ACL) {
                struct tomoyo_path2_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_PATH2_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "file ");
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_mac_keywords
                                          [tomoyo_pp2mac[bit]]);
                }
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name1);
                tomoyo_print_name_union(head, &ptr->name2);
        } else if (acl_type == TOMOYO_TYPE_PATH_NUMBER_ACL) {
                struct tomoyo_path_number_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_PATH_NUMBER_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "file ");
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_mac_keywords
                                          [tomoyo_pn2mac[bit]]);
                }
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name);
                tomoyo_print_number_union(head, &ptr->number);
        } else if (acl_type == TOMOYO_TYPE_MKDEV_ACL) {
                struct tomoyo_mkdev_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_MKDEV_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "file ");
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_mac_keywords
                                          [tomoyo_pnnn2mac[bit]]);
                }
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name);
                tomoyo_print_number_union(head, &ptr->mode);
                tomoyo_print_number_union(head, &ptr->major);
                tomoyo_print_number_union(head, &ptr->minor);
        } else if (acl_type == TOMOYO_TYPE_INET_ACL) {
                struct tomoyo_inet_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                /* Header is "network inet <protocol> ". */
                                tomoyo_set_group(head, "network inet ");
                                tomoyo_set_string(head, tomoyo_proto_keyword
                                                  [ptr->protocol]);
                                tomoyo_set_space(head);
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_socket_keyword[bit]);
                }
                if (first)
                        return true;
                tomoyo_set_space(head);
                /* The address is either "@<group name>" or a formatted IP. */
                if (ptr->address.group) {
                        tomoyo_set_string(head, "@");
                        tomoyo_set_string(head, ptr->address.group->group_name
                                          ->name);
                } else {
                        char buf[128];

                        tomoyo_print_ip(buf, sizeof(buf), &ptr->address);
                        tomoyo_io_printf(head, "%s", buf);
                }
                tomoyo_print_number_union(head, &ptr->port);
        } else if (acl_type == TOMOYO_TYPE_UNIX_ACL) {
                struct tomoyo_unix_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                /* Header is "network unix <protocol> ". */
                                tomoyo_set_group(head, "network unix ");
                                tomoyo_set_string(head, tomoyo_proto_keyword
                                                  [ptr->protocol]);
                                tomoyo_set_space(head);
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_socket_keyword[bit]);
                }
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name);
        } else if (acl_type == TOMOYO_TYPE_MOUNT_ACL) {
                struct tomoyo_mount_acl *ptr =
                        container_of(acl, typeof(*ptr), head);

                tomoyo_set_group(head, "file mount");
                tomoyo_print_name_union(head, &ptr->dev_name);
                tomoyo_print_name_union(head, &ptr->dir_name);
                tomoyo_print_name_union(head, &ptr->fs_type);
                tomoyo_print_number_union(head, &ptr->flags);
        } else if (acl_type == TOMOYO_TYPE_ENV_ACL) {
                struct tomoyo_env_acl *ptr =
                        container_of(acl, typeof(*ptr), head);

                tomoyo_set_group(head, "misc env ");
                tomoyo_set_string(head, ptr->env->name);
        }
        if (acl->cond) {
                /*
                 * Mark the body as done before flushing, so that a failed
                 * flush resumes at print_cond_part on the next call.
                 */
                head->r.print_cond_part = true;
                head->r.cond_step = 0;
                if (!tomoyo_flush(head))
                        return false;
print_cond_part:
                if (!tomoyo_print_condition(head, acl->cond))
                        return false;
                head->r.print_cond_part = false;
        } else {
                /* No condition: terminate the line here. */
                tomoyo_set_lf(head);
        }
        return true;
}

/**
 * tomoyo_read_domain2 - Print every ACL entry on a list.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @list: Pointer to "struct list_head" holding "struct tomoyo_acl_info".
 *
 * The current position is kept in @head->r.acl, so a call that returned
 * false continues from the same entry when called again. The cursor is
 * cleared once the whole list has been printed.
 *
 * Caller holds tomoyo_read_lock().
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_read_domain2(struct tomoyo_io_buffer *head,
                                struct list_head *list)
{
        list_for_each_cookie(head->r.acl, list) {
                struct tomoyo_acl_info *entry =
                        list_entry(head->r.acl, typeof(*entry), list);

                if (tomoyo_print_entry(head, entry))
                        continue;
                /* Could not print this entry now; keep the cursor on it. */
                return false;
        }
        head->r.acl = NULL;
        return true;
}

/**
 * tomoyo_read_domain - Read domain policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Walks tomoyo_domain_list using @head->r.domain as the cursor. For each
 * domain, @head->r.step selects the stage to run, so that a call which had
 * to return early resumes at the same domain and stage:
 *
 *   0: domainname, "use_profile" line and flag keywords,
 *   1: one "use_group" line per bit set in domain->group,
 *   2: ACL entries of the domain (tomoyo_read_domain2()),
 *   3: reset to stage 0 for the next domain.
 *
 * @head->r.eof is set once the walk is complete, or after one domain if
 * @head->r.print_this_domain_only is set.
 *
 * Caller holds tomoyo_read_lock().
 */
static void tomoyo_read_domain(struct tomoyo_io_buffer *head)
{
        if (head->r.eof)
                return;
        list_for_each_cookie(head->r.domain, &tomoyo_domain_list) {
                struct tomoyo_domain_info *domain =
                        list_entry(head->r.domain, typeof(*domain), list);
                u8 i;

                switch (head->r.step) {
                case 0:
                        /* Deleted domains are shown only on explicit request. */
                        if (domain->is_deleted &&
                            !head->r.print_this_domain_only)
                                continue;
                        /* Print domainname and flags. */
                        tomoyo_set_string(head, domain->domainname->name);
                        tomoyo_set_lf(head);
                        tomoyo_io_printf(head, "use_profile %u\n",
                                         domain->profile);
                        for (i = 0; i < TOMOYO_MAX_DOMAIN_INFO_FLAGS; i++)
                                if (domain->flags[i])
                                        tomoyo_set_string(head, tomoyo_dif[i]);
                        head->r.index = 0;
                        head->r.step++;
                        fallthrough;
                case 1:
                        /*
                         * r.index is advanced before the flush, so a failed
                         * flush resumes with the next group on re-entry.
                         */
                        while (head->r.index < TOMOYO_MAX_ACL_GROUPS) {
                                i = head->r.index++;
                                if (!test_bit(i, domain->group))
                                        continue;
                                tomoyo_io_printf(head, "use_group %u\n", i);
                                if (!tomoyo_flush(head))
                                        return;
                        }
                        head->r.index = 0;
                        head->r.step++;
                        tomoyo_set_lf(head);
                        fallthrough;
                case 2:
                        /* Print the domain's ACL entries; resumable via r.acl. */
                        if (!tomoyo_read_domain2(head, &domain->acl_info_list))
                                return;
                        head->r.step++;
                        if (!tomoyo_set_lf(head))
                                return;
                        fallthrough;
                case 3:
                        /* This domain is finished; restart at stage 0. */
                        head->r.step = 0;
                        if (head->r.print_this_domain_only)
                                goto done;
                }
        }
 done:
        head->r.eof = true;
}

/**
 * tomoyo_write_pid - Specify PID to obtain domainname.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * This function does not parse the written text itself; it only clears
 * @head->r.eof. tomoyo_read_pid() returns early while that flag is set, and
 * it is tomoyo_read_pid() that parses the PID from @head->write_buf.
 *
 * Returns 0.
 */
static int tomoyo_write_pid(struct tomoyo_io_buffer *head)
{
        /* Re-arm the read side for the newly written request. */
        head->r.eof = false;
        return 0;
}

/**
 * tomoyo_read_pid - Get domainname of the specified PID.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Emits "<pid> <profile> <domainname>" for the task whose PID was left in
 * @head->write_buf, and emits nothing if the text is not a valid number or
 * no such task (or domain) is found. A "global-pid " prefix makes the lookup
 * use init_pid_ns rather than the caller's PID namespace.
 *
 * The PID is specified by tomoyo_write_pid() so that the user can obtain
 * using read()/write() interface rather than sysctl() interface.
 */
static void tomoyo_read_pid(struct tomoyo_io_buffer *head)
{
        struct tomoyo_domain_info *domain = NULL;
        struct task_struct *task;
        char *arg = head->write_buf;
        unsigned int pid;
        bool use_global_ns;

        /* Accessing write_buf is safe because head->io_sem is held. */
        if (!arg) {
                /* Do nothing if open(O_RDONLY). */
                head->r.eof = true;
                return;
        }
        if (head->r.eof || head->r.w_pos)
                return;
        /* Each written request is answered at most once. */
        head->r.eof = true;
        use_global_ns = tomoyo_str_starts(&arg, "global-pid ");
        if (kstrtouint(arg, 10, &pid) != 0)
                return;
        rcu_read_lock();
        task = use_global_ns ? find_task_by_pid_ns(pid, &init_pid_ns) :
                               find_task_by_vpid(pid);
        if (task)
                domain = tomoyo_task(task)->domain_info;
        rcu_read_unlock();
        if (domain) {
                tomoyo_io_printf(head, "%u %u ", pid, domain->profile);
                tomoyo_set_string(head, domain->domainname->name);
        }
}

/*
 * String table for domain transition control keywords.
 *
 * Indexed by the TOMOYO_TRANSITION_CONTROL_* value. Every keyword carries a
 * trailing space because it is used both as a prefix to match when parsing
 * and as the text printed in front of the program name. The pointers are
 * const as well as the strings: the table is never modified at runtime.
 */
static const char * const tomoyo_transition_type[TOMOYO_MAX_TRANSITION_TYPE] = {
        [TOMOYO_TRANSITION_CONTROL_NO_RESET]      = "no_reset_domain ",
        [TOMOYO_TRANSITION_CONTROL_RESET]         = "reset_domain ",
        [TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE] = "no_initialize_domain ",
        [TOMOYO_TRANSITION_CONTROL_INITIALIZE]    = "initialize_domain ",
        [TOMOYO_TRANSITION_CONTROL_NO_KEEP]       = "no_keep_domain ",
        [TOMOYO_TRANSITION_CONTROL_KEEP]          = "keep_domain ",
};

/*
 * String table for grouping keywords.
 *
 * Indexed by TOMOYO_PATH_GROUP / TOMOYO_NUMBER_GROUP / TOMOYO_ADDRESS_GROUP.
 * Every keyword carries a trailing space because it is used both as a prefix
 * to match when parsing and as the text printed in front of the group name.
 * The pointers are const as well as the strings: the table is never modified
 * at runtime.
 */
static const char * const tomoyo_group_name[TOMOYO_MAX_GROUP] = {
        [TOMOYO_PATH_GROUP]    = "path_group ",
        [TOMOYO_NUMBER_GROUP]  = "number_group ",
        [TOMOYO_ADDRESS_GROUP] = "address_group ",
};

/**
 * tomoyo_write_exception - Write exception policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Dispatches the line in @head->write_buf on its leading keyword:
 * "aggregator ", one of the transition control keywords, one of the group
 * keywords, or "acl_group <index> " followed by an ACL.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_exception(struct tomoyo_io_buffer *head)
{
        const bool is_delete = head->w.is_delete;
        struct tomoyo_acl_param param = {
                .ns = head->w.ns,
                .is_delete = is_delete,
                .data = head->write_buf,
        };
        unsigned int group;
        char *rest;
        u8 type;

        if (tomoyo_str_starts(&param.data, "aggregator "))
                return tomoyo_write_aggregator(&param);

        for (type = 0; type < TOMOYO_MAX_TRANSITION_TYPE; type++) {
                if (!tomoyo_str_starts(&param.data,
                                       tomoyo_transition_type[type]))
                        continue;
                return tomoyo_write_transition_control(&param, type);
        }

        for (type = 0; type < TOMOYO_MAX_GROUP; type++) {
                if (!tomoyo_str_starts(&param.data, tomoyo_group_name[type]))
                        continue;
                return tomoyo_write_group(&param, type);
        }

        if (!tomoyo_str_starts(&param.data, "acl_group "))
                return -EINVAL;

        /* Expect "<index> <acl>" with the index inside the valid range. */
        group = simple_strtoul(param.data, &rest, 10);
        if (group >= TOMOYO_MAX_ACL_GROUPS || *rest != ' ')
                return -EINVAL;
        return tomoyo_write_domain2(head->w.ns,
                                    &head->w.ns->acl_group[group],
                                    rest + 1, is_delete);
}

/**
 * tomoyo_read_group - Read "struct tomoyo_path_group"/"struct tomoyo_number_group"/"struct tomoyo_address_group" list.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @idx:  Index number.
 *
 * Emits one line per non-deleted member of every group on
 * ns->group_list[@idx]. @head->r.group and @head->r.acl hold the position in
 * the outer and inner list, so a call that returned false continues from the
 * same member when called again.
 *
 * Returns true on success, false otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static bool tomoyo_read_group(struct tomoyo_io_buffer *head, const int idx)
{
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        struct list_head *groups = &ns->group_list[idx];

        list_for_each_cookie(head->r.group, groups) {
                struct tomoyo_group *group =
                        list_entry(head->r.group, typeof(*group), head.list);

                list_for_each_cookie(head->r.acl, &group->member_list) {
                        struct tomoyo_acl_head *member =
                                list_entry(head->r.acl, typeof(*member), list);

                        if (member->is_deleted)
                                continue;
                        if (!tomoyo_flush(head))
                                return false;
                        /* Common prefix: namespace, keyword, group name. */
                        tomoyo_print_namespace(head);
                        tomoyo_set_string(head, tomoyo_group_name[idx]);
                        tomoyo_set_string(head, group->group_name->name);
                        /* Member value, formatted according to group kind. */
                        switch (idx) {
                        case TOMOYO_PATH_GROUP: {
                                struct tomoyo_path_group *path =
                                        container_of(member, typeof(*path),
                                                     head);

                                tomoyo_set_space(head);
                                tomoyo_set_string(head,
                                                  path->member_name->name);
                                break;
                        }
                        case TOMOYO_NUMBER_GROUP: {
                                struct tomoyo_number_group *num =
                                        container_of(member, typeof(*num),
                                                     head);

                                tomoyo_print_number_union(head, &num->number);
                                break;
                        }
                        case TOMOYO_ADDRESS_GROUP: {
                                struct tomoyo_address_group *addr =
                                        container_of(member, typeof(*addr),
                                                     head);
                                char text[128];

                                tomoyo_print_ip(text, sizeof(text),
                                                &addr->address);
                                tomoyo_io_printf(head, " %s", text);
                                break;
                        }
                        default:
                                break;
                        }
                        tomoyo_set_lf(head);
                }
                head->r.acl = NULL;
        }
        head->r.group = NULL;
        return true;
}

/**
 * tomoyo_read_policy - Read "struct tomoyo_..._entry" list.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @idx:  Index number.
 *
 * Emits one line per non-deleted entry on ns->policy_list[@idx] for the
 * transition control and aggregator lists; entries of any other list are
 * skipped. @head->r.acl holds the position, so a call that returned false
 * continues from the same entry when called again.
 *
 * Returns true on success, false otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static bool tomoyo_read_policy(struct tomoyo_io_buffer *head, const int idx)
{
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        struct list_head *entries = &ns->policy_list[idx];

        list_for_each_cookie(head->r.acl, entries) {
                struct tomoyo_acl_head *entry =
                        container_of(head->r.acl, typeof(*entry), list);

                if (entry->is_deleted)
                        continue;
                if (!tomoyo_flush(head))
                        return false;
                if (idx == TOMOYO_ID_TRANSITION_CONTROL) {
                        struct tomoyo_transition_control *tc =
                                container_of(entry, typeof(*tc), head);
                        /* A missing program or domain is shown as "any". */
                        const char *program =
                                tc->program ? tc->program->name : "any";
                        const char *domain =
                                tc->domainname ? tc->domainname->name : "any";

                        tomoyo_print_namespace(head);
                        tomoyo_set_string(head,
                                          tomoyo_transition_type[tc->type]);
                        tomoyo_set_string(head, program);
                        tomoyo_set_string(head, " from ");
                        tomoyo_set_string(head, domain);
                } else if (idx == TOMOYO_ID_AGGREGATOR) {
                        struct tomoyo_aggregator *agg =
                                container_of(entry, typeof(*agg), head);

                        tomoyo_print_namespace(head);
                        tomoyo_set_string(head, "aggregator ");
                        tomoyo_set_string(head, agg->original_name->name);
                        tomoyo_set_space(head);
                        tomoyo_set_string(head, agg->aggregated_name->name);
                } else {
                        continue;
                }
                tomoyo_set_lf(head);
        }
        head->r.acl = NULL;
        return true;
}

/**
 * tomoyo_read_exception - Read exception policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * @head->r.step counts through three consecutive ranges: the policy lists
 * (TOMOYO_MAX_POLICY of them), the group lists (TOMOYO_MAX_GROUP) and the
 * ACL groups (TOMOYO_MAX_ACL_GROUPS). The function returns as soon as one
 * list cannot be fully printed, leaving @head->r.step on that list so the
 * next call continues there. @head->r.eof is set after the last ACL group.
 *
 * Caller holds tomoyo_read_lock().
 */
static void tomoyo_read_exception(struct tomoyo_io_buffer *head)
{
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);

        if (head->r.eof)
                return;

        /* Policy lists. */
        for (; head->r.step < TOMOYO_MAX_POLICY; head->r.step++) {
                if (!tomoyo_read_policy(head, head->r.step))
                        return;
        }

        /* Group lists. */
        for (; head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP;
             head->r.step++) {
                if (!tomoyo_read_group(head,
                                       head->r.step - TOMOYO_MAX_POLICY))
                        return;
        }

        /* ACL groups. */
        for (; head->r.step < TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP
                     + TOMOYO_MAX_ACL_GROUPS; head->r.step++) {
                head->r.acl_group_index = head->r.step - TOMOYO_MAX_POLICY
                        - TOMOYO_MAX_GROUP;
                if (!tomoyo_read_domain2(head, &ns->acl_group
                                         [head->r.acl_group_index]))
                        return;
        }
        head->r.eof = true;
}

/* Wait queue for kernel -> userspace notification. */
static DECLARE_WAIT_QUEUE_HEAD(tomoyo_query_wait);
/* Wait queue for userspace -> kernel notification. */
static DECLARE_WAIT_QUEUE_HEAD(tomoyo_answer_wait);

/*
 * Structure for query.
 *
 * One instance is created per pending request by tomoyo_supervisor() and is
 * linked on tomoyo_query_list while that function waits for an answer.
 */
struct tomoyo_query {
        struct list_head list;             /* Link in tomoyo_query_list. */
        struct tomoyo_domain_info *domain; /* Domain of the request. */
        char *query;                       /* Message text of the request. */
        size_t query_len;                  /* strlen(query) + 1. */
        unsigned int serial;               /* Query ID of this request. */
        u8 timer;                          /* Wait timeouts elapsed so far. */
        u8 answer;                         /* 1 = grant, 3 = retry, else deny. */
        u8 retry;                          /* Retry count of the request. */
};

/* The list for "struct tomoyo_query". */
static LIST_HEAD(tomoyo_query_list);

/* Lock for manipulating tomoyo_query_list. */
static DEFINE_SPINLOCK(tomoyo_query_list_lock);

/*
 * Number of "struct file" referring /sys/kernel/security/tomoyo/query
 * interface.
 */
static atomic_t tomoyo_query_observers = ATOMIC_INIT(0);

/**
 * tomoyo_truncate - Truncate a line.
 *
 * @str: String to truncate.
 *
 * Cuts @str in place at the first byte whose value is not greater than ' '
 * (a space, a control character or the terminating NUL).
 *
 * Returns length of truncated @str, counting the terminating NUL.
 */
static int tomoyo_truncate(char *str)
{
        char *end;

        /* Compare as unsigned so that bytes >= 0x80 count as printable. */
        for (end = str; (unsigned char) *end > (unsigned char) ' '; end++)
                ;
        *end = '\0';
        return (int) (end - str + 1);
}

/**
 * tomoyo_add_entry - Add an ACL to current thread's domain. Used by learning mode.
 *
 * @domain: Pointer to "struct tomoyo_domain_info".
 * @header: Lines containing ACL.
 *
 * @header is modified in place. Everything after its second newline is
 * taken as the ACL text; the part before it is searched for exec/symlink
 * details, which are appended to the ACL as conditions when the ACL text
 * starts with 'f'. The resulting line is handed to tomoyo_write_domain2()
 * for @domain. Nothing is done if @header has fewer than two newlines or if
 * the allocation fails.
 *
 * Returns nothing.
 */
static void tomoyo_add_entry(struct tomoyo_domain_info *domain, char *header)
{
        char *buffer;
        char *realpath = NULL;
        char *argv0 = NULL;
        char *symlink = NULL;
        char *cp = strchr(header, '\n');
        int len;

        if (!cp)
                return;
        cp = strchr(cp + 1, '\n');
        if (!cp)
                return;
        /* Split: @header keeps the first two lines, cp is the ACL text. */
        *cp++ = '\0';
        len = strlen(cp) + 1;
        /* strstr() will return NULL if ordering is wrong. */
        if (*cp == 'f') {
                argv0 = strstr(header, " argv[]={ \"");
                if (argv0) {
                        /* Skip to the opening quote of the first argument. */
                        argv0 += 10;
                        /* 14 == strlen(" exec.argv[0]="). */
                        len += tomoyo_truncate(argv0) + 14;
                }
                realpath = strstr(header, " exec={ realpath=\"");
                if (realpath) {
                        /* Skip " exec={ " so that it points at realpath="...". */
                        realpath += 8;
                        /* 6 == strlen(" exec."). */
                        len += tomoyo_truncate(realpath) + 6;
                }
                symlink = strstr(header, " symlink.target=\"");
                if (symlink)
                        /*
                         * Truncate past the leading space (tomoyo_truncate()
                         * stops at a space); the extra 1 accounts for it.
                         */
                        len += tomoyo_truncate(symlink + 1) + 1;
        }
        buffer = kmalloc(len, GFP_NOFS);
        if (!buffer)
                return;
        /*
         * The size passed here is len - 1, not len. When nothing was added
         * to len above, this copies all of cp except its last character.
         * NOTE(review): presumably that character is a trailing newline -
         * confirm against the producer of @header.
         */
        snprintf(buffer, len - 1, "%s", cp);
        if (realpath)
                tomoyo_addprintf(buffer, len, " exec.%s", realpath);
        if (argv0)
                tomoyo_addprintf(buffer, len, " exec.argv[0]=%s", argv0);
        if (symlink)
                tomoyo_addprintf(buffer, len, "%s", symlink);
        tomoyo_normalize_line(buffer);
        /* A zero return is counted as a policy update. */
        if (!tomoyo_write_domain2(domain->ns, &domain->acl_info_list, buffer,
                                  false))
                tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES);
        kfree(buffer);
}

/**
 * tomoyo_supervisor - Ask for the supervisor's decision.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @fmt: The printf()'s format string, followed by parameters.
 *
 * The request is always passed to tomoyo_write_log2() first. For a request
 * that was not granted, the behaviour then depends on @r->mode:
 *
 *   - enforcing: if somebody has the query interface open, the request is
 *     queued on tomoyo_query_list and this function sleeps for up to ten
 *     one-second waits until an answer arrives or the last observer goes
 *     away; otherwise the request is denied immediately.
 *   - learning: while the domain's quota permits, the request is added to
 *     the domain's policy via tomoyo_add_entry().
 *   - any other mode: nothing more is done.
 *
 * Returns 0 if the supervisor decided to permit the access request which
 * violated the policy in enforcing mode, TOMOYO_RETRY_REQUEST if the
 * supervisor decided to retry the access request which violated the policy in
 * enforcing mode, 0 if it is not in enforcing mode, -EPERM otherwise.
 */
int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...)
{
        va_list args;
        int error;
        int len;
        static unsigned int tomoyo_serial;
        /* Lives on this stack frame; must be unlinked before returning. */
        struct tomoyo_query entry = { };
        bool quota_exceeded = false;

        /* Measure the formatted message (plus its terminating NUL). */
        va_start(args, fmt);
        len = vsnprintf(NULL, 0, fmt, args) + 1;
        va_end(args);
        /* Write /sys/kernel/security/tomoyo/audit. */
        va_start(args, fmt);
        tomoyo_write_log2(r, len, fmt, args);
        va_end(args);
        /* Nothing more to do if granted. */
        if (r->granted)
                return 0;
        if (r->mode)
                tomoyo_update_stat(r->mode);
        switch (r->mode) {
        case TOMOYO_CONFIG_ENFORCING:
                error = -EPERM;
                /* Only ask when somebody is listening for queries. */
                if (atomic_read(&tomoyo_query_observers))
                        break;
                goto out;
        case TOMOYO_CONFIG_LEARNING:
                error = 0;
                /* Check max_learning_entry parameter. */
                if (tomoyo_domain_quota_is_ok(r))
                        break;
                fallthrough;
        default:
                return 0;
        }
        /* Get message. */
        va_start(args, fmt);
        entry.query = tomoyo_init_log(r, len, fmt, args);
        va_end(args);
        if (!entry.query)
                goto out;
        entry.query_len = strlen(entry.query) + 1;
        /* error == 0 here means learning mode: record the ACL and finish. */
        if (!error) {
                tomoyo_add_entry(r->domain, entry.query);
                goto out;
        }
        /* len is reused from here on as the size charged to the quota. */
        len = kmalloc_size_roundup(entry.query_len);
        entry.domain = r->domain;
        spin_lock(&tomoyo_query_list_lock);
        if (tomoyo_memory_quota[TOMOYO_MEMORY_QUERY] &&
            tomoyo_memory_used[TOMOYO_MEMORY_QUERY] + len
            >= tomoyo_memory_quota[TOMOYO_MEMORY_QUERY]) {
                quota_exceeded = true;
        } else {
                entry.serial = tomoyo_serial++;
                entry.retry = r->retry;
                tomoyo_memory_used[TOMOYO_MEMORY_QUERY] += len;
                list_add_tail(&entry.list, &tomoyo_query_list);
        }
        spin_unlock(&tomoyo_query_list_lock);
        /* Not queued: fall out with the -EPERM set above. */
        if (quota_exceeded)
                goto out;
        /* Give 10 seconds for supervisor's opinion. */
        while (entry.timer < 10) {
                wake_up_all(&tomoyo_query_wait);
                /* A nonzero result ends the wait; zero is a plain timeout. */
                if (wait_event_interruptible_timeout
                    (tomoyo_answer_wait, entry.answer ||
                     !atomic_read(&tomoyo_query_observers), HZ))
                        break;
                entry.timer++;
        }
        /* Unlink the on-stack entry and give back its quota charge. */
        spin_lock(&tomoyo_query_list_lock);
        list_del(&entry.list);
        tomoyo_memory_used[TOMOYO_MEMORY_QUERY] -= len;
        spin_unlock(&tomoyo_query_list_lock);
        switch (entry.answer) {
        case 3: /* Asked to retry by administrator. */
                error = TOMOYO_RETRY_REQUEST;
                r->retry++;
                break;
        case 1:
                /* Granted by administrator. */
                error = 0;
                break;
        default:
                /* Timed out or rejected by administrator. */
                break;
        }
out:
        kfree(entry.query);
        return error;
}

/**
 * tomoyo_find_domain_by_qid - Get domain by query id.
 *
 * @serial: Query ID assigned by tomoyo_supervisor().
 *
 * Searches tomoyo_query_list under tomoyo_query_list_lock for the first
 * pending query whose serial equals @serial.
 *
 * Returns pointer to "struct tomoyo_domain_info" if found, NULL otherwise.
 */
static struct tomoyo_domain_info *tomoyo_find_domain_by_qid
(unsigned int serial)
{
        struct tomoyo_domain_info *found = NULL;
        struct tomoyo_query *query;

        spin_lock(&tomoyo_query_list_lock);
        list_for_each_entry(query, &tomoyo_query_list, list) {
                if (query->serial == serial) {
                        found = query->domain;
                        break;
                }
        }
        spin_unlock(&tomoyo_query_list_lock);
        return found;
}

/**
 * tomoyo_poll_query - poll() for /sys/kernel/security/tomoyo/query.
 *
 * @file: Pointer to "struct file".
 * @wait: Pointer to "poll_table".
 *
 * Returns EPOLLIN | EPOLLRDNORM when ready to read, 0 otherwise.
 *
 * Waits for access requests which violated policy in enforcing mode.
 */
static __poll_t tomoyo_poll_query(struct file *file, poll_table *wait)
{
        if (list_empty(&tomoyo_query_list)) {
                /*
                 * Nothing pending yet: register on the wait queue, then look
                 * again in case a query was queued in the meantime.
                 */
                poll_wait(file, &tomoyo_query_wait, wait);
                if (list_empty(&tomoyo_query_list))
                        return 0;
        }
        return EPOLLIN | EPOLLRDNORM;
}

/**
 * tomoyo_read_query - Read access requests which violated policy in enforcing mode.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 */
static void tomoyo_read_query(struct tomoyo_io_buffer *head)
{
        struct list_head *tmp;
        unsigned int pos = 0;
        size_t len = 0;
        char *buf;

        if (head->r.w_pos)
                return;
        kfree(head->read_buf);
        head->read_buf = NULL;
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);

                if (pos++ != head->r.query_index)
                        continue;
                len = ptr->query_len;
                break;
        }
        spin_unlock(&tomoyo_query_list_lock);
        if (!len) {
                head->r.query_index = 0;
                return;
        }
        buf = kzalloc(len + 32, GFP_NOFS);
        if (!buf)
                return;
        pos = 0;
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);

                if (pos++ != head->r.query_index)
                        continue;
                /*
                 * Some query can be skipped because tomoyo_query_list
                 * can change, but I don't care.
                 */
                if (len == ptr->query_len)
                        snprintf(buf, len + 31, "Q%u-%hu\n%s", ptr->serial,
                                 ptr->retry, ptr->query);
                break;
        }
        spin_unlock(&tomoyo_query_list_lock);
        if (buf[0]) {
                head->read_buf = buf;
                head->r.w[head->r.w_pos++] = buf;
                head->r.query_index++;
        } else {
                kfree(buf);
        }
}

/**
 * tomoyo_write_answer - Write the supervisor's decision.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns 0 on success, -EINVAL otherwise.
 */
static int tomoyo_write_answer(struct tomoyo_io_buffer *head)
{
        char *data = head->write_buf;
        struct list_head *tmp;
        unsigned int serial;
        unsigned int answer;

        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);

                ptr->timer = 0;
        }
        spin_unlock(&tomoyo_query_list_lock);
        if (sscanf(data, "A%u=%u", &serial, &answer) != 2)
                return -EINVAL;
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);

                if (ptr->serial != serial)
                        continue;
                ptr->answer = answer;
                /* Remove from tomoyo_query_list. */
                if (ptr->answer)
                        list_del_init(&ptr->list);
                break;
        }
        spin_unlock(&tomoyo_query_list_lock);
        return 0;
}

/**
 * tomoyo_read_version - Get version.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Prints the version string "2.6.0" into @head once, then marks @head as
 * having reached EOF so that later calls print nothing.
 *
 * Returns nothing.
 */
static void tomoyo_read_version(struct tomoyo_io_buffer *head)
{
        if (head->r.eof)
                return;
        tomoyo_io_printf(head, "2.6.0");
        head->r.eof = true;
}

/*
 * String table for /sys/kernel/security/tomoyo/stat interface.
 *
 * Labels for the per-policy-type counters, indexed by TOMOYO_STAT_POLICY_*.
 * tomoyo_read_stat() prints each one after the "Policy " prefix.
 */
static const char * const tomoyo_policy_headers[TOMOYO_MAX_POLICY_STAT] = {
        [TOMOYO_STAT_POLICY_UPDATES]    = "update:",
        [TOMOYO_STAT_POLICY_LEARNING]   = "violation in learning mode:",
        [TOMOYO_STAT_POLICY_PERMISSIVE] = "violation in permissive mode:",
        [TOMOYO_STAT_POLICY_ENFORCING]  = "violation in enforcing mode:",
};

/*
 * String table for /sys/kernel/security/tomoyo/stat interface.
 *
 * Labels for the memory usage categories, indexed by TOMOYO_MEMORY_*.
 * tomoyo_read_stat() prints each one after the "Memory used by " prefix and
 * tomoyo_write_stat() matches written lines against the same strings.
 */
static const char * const tomoyo_memory_headers[TOMOYO_MAX_MEMORY_STAT] = {
        [TOMOYO_MEMORY_POLICY] = "policy:",
        [TOMOYO_MEMORY_AUDIT]  = "audit log:",
        [TOMOYO_MEMORY_QUERY]  = "query message:",
};

/* Counter for number of updates; incremented by tomoyo_update_stat(). */
static atomic_t tomoyo_stat_updated[TOMOYO_MAX_POLICY_STAT];
/*
 * Timestamp counter for last updated; set by tomoyo_update_stat() from
 * ktime_get_real_seconds(). Zero means "never updated" to tomoyo_read_stat().
 */
static time64_t tomoyo_stat_modified[TOMOYO_MAX_POLICY_STAT];

/**
 * tomoyo_update_stat - Update statistic counters.
 *
 * @index: Index for policy type.
 *
 * Returns nothing.
 */
void tomoyo_update_stat(const u8 index)
{
        atomic_inc(&tomoyo_stat_updated[index]);
        tomoyo_stat_modified[index] = ktime_get_real_seconds();
}

/**
 * tomoyo_read_stat - Read statistic data.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Prints one line per policy statistic (counter plus last-update time, if
 * any), one line per memory category (bytes used plus quota, if non-zero),
 * and a final total of the memory figures. Output is produced only once per
 * open; @head->r.eof is set afterwards.
 *
 * Returns nothing.
 */
static void tomoyo_read_stat(struct tomoyo_io_buffer *head)
{
        unsigned int total = 0;
        u8 idx;

        if (head->r.eof)
                return;

        /* Policy update / violation counters. */
        for (idx = 0; idx < TOMOYO_MAX_POLICY_STAT; idx++) {
                struct tomoyo_time stamp;

                tomoyo_io_printf(head, "Policy %-30s %10u",
                                 tomoyo_policy_headers[idx],
                                 atomic_read(&tomoyo_stat_updated[idx]));
                /* A zero timestamp means this counter was never touched. */
                if (tomoyo_stat_modified[idx]) {
                        tomoyo_convert_time(tomoyo_stat_modified[idx], &stamp);
                        tomoyo_io_printf(head, " (Last: %04u/%02u/%02u %02u:%02u:%02u)",
                                         stamp.year, stamp.month, stamp.day,
                                         stamp.hour, stamp.min, stamp.sec);
                }
                tomoyo_set_lf(head);
        }

        /* Memory usage per category, with the quota when one is set. */
        for (idx = 0; idx < TOMOYO_MAX_MEMORY_STAT; idx++) {
                const unsigned int used = tomoyo_memory_used[idx];
                unsigned int quota;

                total += used;
                tomoyo_io_printf(head, "Memory used by %-22s %10u",
                                 tomoyo_memory_headers[idx], used);
                quota = tomoyo_memory_quota[idx];
                if (quota)
                        tomoyo_io_printf(head, " (Quota: %10u)", quota);
                tomoyo_set_lf(head);
        }

        tomoyo_io_printf(head, "Total memory used:                    %10u\n",
                         total);
        head->r.eof = true;
}

/**
 * tomoyo_write_stat - Set memory quota.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Accepts lines that begin with "Memory used by " followed by one of the
 * tomoyo_memory_headers[] labels, and parses the text after the label as an
 * unsigned quota for that category. Lines that do not match, or whose number
 * cannot be parsed, are ignored without error.
 *
 * Returns 0.
 */
static int tomoyo_write_stat(struct tomoyo_io_buffer *head)
{
        char *cursor = head->write_buf;
        u8 idx;

        if (!tomoyo_str_starts(&cursor, "Memory used by "))
                return 0;
        for (idx = 0; idx < TOMOYO_MAX_MEMORY_STAT; idx++) {
                if (!tomoyo_str_starts(&cursor, tomoyo_memory_headers[idx]))
                        continue;
                sscanf(cursor, "%u", &tomoyo_memory_quota[idx]);
        }
        return 0;
}

/**
 * tomoyo_open_control - open() for /sys/kernel/security/tomoyo/ interface.
 *
 * @type: Type of interface.
 * @file: Pointer to "struct file".
 *
 * Allocates a "struct tomoyo_io_buffer", wires up the read/write/poll
 * handlers that belong to @type, allocates the read and write buffers that
 * the open mode actually needs, and stores the result in
 * @file->private_data.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_open_control(const u8 type, struct file *file)
{
        struct tomoyo_io_buffer *head = kzalloc(sizeof(*head), GFP_NOFS);

        if (!head)
                return -ENOMEM;
        mutex_init(&head->io_sem);
        head->type = type;
        /*
         * Select handlers per interface. A @type not listed here leaves all
         * handlers NULL (head was zero-allocated).
         */
        switch (type) {
        case TOMOYO_DOMAINPOLICY:
                /* /sys/kernel/security/tomoyo/domain_policy */
                head->write = tomoyo_write_domain;
                head->read = tomoyo_read_domain;
                break;
        case TOMOYO_EXCEPTIONPOLICY:
                /* /sys/kernel/security/tomoyo/exception_policy */
                head->write = tomoyo_write_exception;
                head->read = tomoyo_read_exception;
                break;
        case TOMOYO_AUDIT:
                /* /sys/kernel/security/tomoyo/audit */
                head->poll = tomoyo_poll_log;
                head->read = tomoyo_read_log;
                break;
        case TOMOYO_PROCESS_STATUS:
                /* /sys/kernel/security/tomoyo/.process_status */
                head->write = tomoyo_write_pid;
                head->read = tomoyo_read_pid;
                break;
        case TOMOYO_VERSION:
                /* /sys/kernel/security/tomoyo/version */
                head->read = tomoyo_read_version;
                head->readbuf_size = 128;
                break;
        case TOMOYO_STAT:
                /* /sys/kernel/security/tomoyo/stat */
                head->write = tomoyo_write_stat;
                head->read = tomoyo_read_stat;
                head->readbuf_size = 1024;
                break;
        case TOMOYO_PROFILE:
                /* /sys/kernel/security/tomoyo/profile */
                head->write = tomoyo_write_profile;
                head->read = tomoyo_read_profile;
                break;
        case TOMOYO_QUERY: /* /sys/kernel/security/tomoyo/query */
                head->poll = tomoyo_poll_query;
                head->write = tomoyo_write_answer;
                head->read = tomoyo_read_query;
                break;
        case TOMOYO_MANAGER:
                /* /sys/kernel/security/tomoyo/manager */
                head->write = tomoyo_write_manager;
                head->read = tomoyo_read_manager;
                break;
        }
        if (!(file->f_mode & FMODE_READ)) {
                /*
                 * No need to allocate read_buf since it is not opened
                 * for reading.
                 */
                head->read = NULL;
                head->poll = NULL;
        } else if (!head->poll) {
                /* Don't allocate read_buf for poll() access. */
                /* Interfaces that did not pick a size above get 8192 bytes. */
                if (!head->readbuf_size)
                        head->readbuf_size = 4096 * 2;
                head->read_buf = kzalloc(head->readbuf_size, GFP_NOFS);
                if (!head->read_buf) {
                        kfree(head);
                        return -ENOMEM;
                }
        }
        if (!(file->f_mode & FMODE_WRITE)) {
                /*
                 * No need to allocate write_buf since it is not opened
                 * for writing.
                 */
                head->write = NULL;
        } else if (head->write) {
                head->writebuf_size = 4096 * 2;
                head->write_buf = kzalloc(head->writebuf_size, GFP_NOFS);
                if (!head->write_buf) {
                        /* read_buf may be NULL here; kfree() accepts that. */
                        kfree(head->read_buf);
                        kfree(head);
                        return -ENOMEM;
                }
        }
        /*
         * If the file is /sys/kernel/security/tomoyo/query , increment the
         * observer counter.
         * The observer counter is used by tomoyo_supervisor() to see if
         * there is some process monitoring /sys/kernel/security/tomoyo/query.
         */
        if (type == TOMOYO_QUERY)
                atomic_inc(&tomoyo_query_observers);
        file->private_data = head;
        /*
         * NOTE(review): presumably registers @head with the garbage
         * collector as in use; tomoyo_close_control() makes the matching
         * call with false. Confirm against tomoyo_notify_gc().
         */
        tomoyo_notify_gc(head, true);
        return 0;
}

/**
 * tomoyo_poll_control - poll() for /sys/kernel/security/tomoyo/ interface.
 *
 * @file: Pointer to "struct file".
 * @wait: Pointer to "poll_table". Maybe NULL.
 *
 * The interface is always reported as writable. Readability comes from the
 * interface-specific poll handler when one is installed; interfaces without
 * a poll handler are always reported as readable.
 *
 * Returns EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM if ready to read/write,
 * EPOLLOUT | EPOLLWRNORM otherwise.
 */
__poll_t tomoyo_poll_control(struct file *file, poll_table *wait)
{
        struct tomoyo_io_buffer *head = file->private_data;
        __poll_t mask = EPOLLOUT | EPOLLWRNORM;

        if (head->poll)
                mask |= head->poll(file, wait);
        else
                mask |= EPOLLIN | EPOLLRDNORM;
        return mask;
}

/**
 * tomoyo_set_namespace_cursor - Set namespace to read.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Only the exception policy and profile interfaces iterate over namespaces;
 * for every other interface this is a no-op. The cursor moves to the first
 * namespace on the first read, and to the following namespace once the
 * current one has been read to EOF and another one exists. Moving the
 * cursor resets all of @head->r.
 *
 * Returns nothing.
 */
static inline void tomoyo_set_namespace_cursor(struct tomoyo_io_buffer *head)
{
        struct list_head *cur;
        struct list_head *next;

        switch (head->type) {
        case TOMOYO_EXCEPTIONPOLICY:
        case TOMOYO_PROFILE:
                break;
        default:
                return;
        }

        cur = head->r.ns;
        if (!cur)
                /* First read: start at the head of the namespace list. */
                next = tomoyo_namespace_list.next;
        else if (head->r.eof && cur->next != &tomoyo_namespace_list)
                /* Current namespace is done and another one follows. */
                next = cur->next;
        else
                return;

        /* Clearing is OK because tomoyo_flush() returned true. */
        memset(&head->r, 0, sizeof(head->r));
        head->r.ns = next;
}

/**
 * tomoyo_has_more_namespace - Check for unread namespaces.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Applies only to the exception policy and profile interfaces, and only
 * once the current namespace has been read to EOF.
 *
 * Returns true if we have more entries to print, false otherwise.
 */
static inline bool tomoyo_has_more_namespace(struct tomoyo_io_buffer *head)
{
        if (head->type != TOMOYO_EXCEPTIONPOLICY &&
            head->type != TOMOYO_PROFILE)
                return false;
        if (!head->r.eof)
                return false;
        /* More to read unless the cursor sits on the last list entry. */
        return head->r.ns->next != &tomoyo_namespace_list;
}

/**
 * tomoyo_read_control - read() for /sys/kernel/security/tomoyo/ interface.
 *
 * @head:       Pointer to "struct tomoyo_io_buffer".
 * @buffer:     Pointer to buffer to write to.
 * @buffer_len: Size of @buffer.
 *
 * Serialized per @head by @head->io_sem. Pending output is flushed first;
 * only when tomoyo_flush() reports success is the interface's read handler
 * invoked, repeatedly while the flush keeps succeeding and further
 * namespaces remain.
 *
 * Returns bytes read on success, negative value otherwise
 * (-EINVAL without a read handler, -EINTR if interrupted while locking).
 */
ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer,
                            const int buffer_len)
{
        int copied;
        int idx;

        if (!head->read)
                return -EINVAL;
        if (mutex_lock_interruptible(&head->io_sem))
                return -EINTR;

        head->read_user_buf = buffer;
        head->read_user_buf_avail = buffer_len;

        idx = tomoyo_read_lock();
        if (tomoyo_flush(head)) {
                for (;;) {
                        /* Call the policy handler. */
                        tomoyo_set_namespace_cursor(head);
                        head->read(head);
                        if (!tomoyo_flush(head) ||
                            !tomoyo_has_more_namespace(head))
                                break;
                }
        }
        tomoyo_read_unlock(idx);

        /* read_user_buf has advanced past whatever was handed to userspace. */
        copied = head->read_user_buf - buffer;
        mutex_unlock(&head->io_sem);
        return copied;
}

/**
 * tomoyo_parse_policy - Parse a policy line.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @line: Line to parse.
 *
 * Strips a leading "delete " keyword (recording it in @head->w.is_delete)
 * and, for the exception policy and profile interfaces, a leading
 * "<namespace> " prefix (recording the namespace in @head->w.ns). @line is
 * edited in place, then handed to the interface's write handler.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_parse_policy(struct tomoyo_io_buffer *head, char *line)
{
        /* Delete request? */
        head->w.is_delete = !strncmp(line, "delete ", 7);
        if (head->w.is_delete) {
                const char *rest = line + 7;

                memmove(line, rest, strlen(rest) + 1);
        }

        /* Selecting namespace to update. */
        if (head->type == TOMOYO_EXCEPTIONPOLICY ||
            head->type == TOMOYO_PROFILE) {
                char *sep;

                if (*line != '<') {
                        head->w.ns = &tomoyo_kernel_namespace;
                } else {
                        sep = strchr(line, ' ');
                        if (!sep) {
                                /* A namespace name with nothing after it. */
                                head->w.ns = NULL;
                        } else {
                                *sep++ = '\0';
                                head->w.ns = tomoyo_assign_namespace(line);
                                memmove(line, sep, strlen(sep) + 1);
                        }
                }
                /* Don't allow updating if namespace is invalid. */
                if (!head->w.ns)
                        return -ENOENT;
        }

        /* Do the update. */
        return head->write(head);
}

/**
 * tomoyo_write_control - write() for /sys/kernel/security/tomoyo/ interface.
 *
 * @head:       Pointer to "struct tomoyo_io_buffer".
 * @buffer:     Pointer to buffer to read from.
 * @buffer_len: Size of @buffer.
 *
 * Copies @buffer byte by byte into @head->write_buf, growing that buffer as
 * needed. Each time a '\n' completes a line, the line is normalized and
 * dispatched to the policy handler. A trailing partial line stays buffered
 * in @head (tracked by @head->w.avail) for the next write().
 *
 * Returns @buffer_len on success, negative value otherwise
 * (-EINVAL without a write handler, -EINTR, -ENOMEM, -EFAULT or -EPERM).
 */
ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
                             const char __user *buffer, const int buffer_len)
{
        int error = buffer_len;
        size_t avail_len = buffer_len;
        char *cp0;
        int idx;

        if (!head->write)
                return -EINVAL;
        if (mutex_lock_interruptible(&head->io_sem))
                return -EINTR;
        cp0 = head->write_buf;
        head->read_user_buf_avail = 0;
        idx = tomoyo_read_lock();
        /* Read a line and dispatch it to the policy handler. */
        while (avail_len > 0) {
                char c;

                /* Line buffer is full: double it, keeping the partial line. */
                if (head->w.avail >= head->writebuf_size - 1) {
                        const int len = head->writebuf_size * 2;
                        char *cp = kzalloc(len, GFP_NOFS);

                        if (!cp) {
                                error = -ENOMEM;
                                break;
                        }
                        memmove(cp, cp0, head->w.avail);
                        kfree(cp0);
                        head->write_buf = cp;
                        cp0 = cp;
                        head->writebuf_size = len;
                }
                if (get_user(c, buffer)) {
                        error = -EFAULT;
                        break;
                }
                buffer++;
                avail_len--;
                cp0[head->w.avail++] = c;
                /* Keep accumulating until the line is complete. */
                if (c != '\n')
                        continue;
                /* Replace the newline with a terminator and reset the fill level. */
                cp0[head->w.avail - 1] = '\0';
                head->w.avail = 0;
                tomoyo_normalize_line(cp0);
                /* "reset" restores the default namespace/domain and read state. */
                if (!strcmp(cp0, "reset")) {
                        head->w.ns = &tomoyo_kernel_namespace;
                        head->w.domain = NULL;
                        memset(&head->r, 0, sizeof(head->r));
                        continue;
                }
                /* Don't allow updating policies by non manager programs. */
                switch (head->type) {
                case TOMOYO_PROCESS_STATUS:
                        /* This does not write anything. */
                        break;
                case TOMOYO_DOMAINPOLICY:
                        /* Line consumed by tomoyo_select_domain(); skip the manager check. */
                        if (tomoyo_select_domain(head, cp0))
                                continue;
                        fallthrough;
                case TOMOYO_EXCEPTIONPOLICY:
                        /* This line only changes what later reads print. */
                        if (!strcmp(cp0, "select transition_only")) {
                                head->r.print_transition_related_only = true;
                                continue;
                        }
                        fallthrough;
                default:
                        if (!tomoyo_manager()) {
                                error = -EPERM;
                                goto out;
                        }
                }
                /*
                 * Only -EPERM aborts the write. Any other error from the
                 * handler is dropped and processing moves to the next line.
                 */
                switch (tomoyo_parse_policy(head, cp0)) {
                case -EPERM:
                        error = -EPERM;
                        goto out;
                case 0:
                        /* Count successful updates of policy-carrying interfaces. */
                        switch (head->type) {
                        case TOMOYO_DOMAINPOLICY:
                        case TOMOYO_EXCEPTIONPOLICY:
                        case TOMOYO_STAT:
                        case TOMOYO_PROFILE:
                        case TOMOYO_MANAGER:
                                tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES);
                                break;
                        default:
                                break;
                        }
                        break;
                }
        }
out:
        tomoyo_read_unlock(idx);
        mutex_unlock(&head->io_sem);
        return error;
}

/**
 * tomoyo_close_control - close() for /sys/kernel/security/tomoyo/ interface.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns nothing.
 */
void tomoyo_close_control(struct tomoyo_io_buffer *head)
{
        /*
         * Closing /sys/kernel/security/tomoyo/query drops one observer.
         * When the last observer goes away, everybody sleeping on
         * tomoyo_answer_wait is woken up.
         */
        if (head->type == TOMOYO_QUERY) {
                if (atomic_dec_and_test(&tomoyo_query_observers))
                        wake_up_all(&tomoyo_answer_wait);
        }
        tomoyo_notify_gc(head, false);
}

/**
 * tomoyo_check_profile - Check all profiles currently assigned to domains are defined.
 *
 * Marks the policy as loaded, then walks every domain. A namespace still at
 * profile version 20110903 is upgraded to 20150505 in place. Any domain
 * whose namespace has another unsupported version, or whose profile is not
 * defined in its namespace, makes the kernel panic.
 *
 * Returns nothing.
 */
void tomoyo_check_profile(void)
{
        struct tomoyo_domain_info *domain;
        const int idx = tomoyo_read_lock();

        tomoyo_policy_loaded = true;
        pr_info("TOMOYO: 2.6.0\n");
        list_for_each_entry_rcu(domain, &tomoyo_domain_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                const u8 profile = domain->profile;
                struct tomoyo_policy_namespace *ns = domain->ns;

                if (ns->profile_version == 20110903) {
                        pr_info_once("Converting profile version from %u to %u.\n",
                                     20110903, 20150505);
                        ns->profile_version = 20150505;
                }

                if (ns->profile_version == 20150505) {
                        /* Supported version and defined profile: all good. */
                        if (ns->profile_ptr[profile])
                                continue;
                        pr_err("Profile %u (used by '%s') is not defined.\n",
                               profile, domain->domainname->name);
                } else {
                        pr_err("Profile version %u is not supported.\n",
                               ns->profile_version);
                }

                /* Reached only for a broken domain; refuse to continue. */
                pr_err("Userland tools for TOMOYO 2.6 must be installed and policy must be initialized.\n");
                pr_err("Please see https://tomoyo.sourceforge.net/2.6/ for more information.\n");
                panic("STOP!");
        }
        tomoyo_read_unlock(idx);
        pr_info("Mandatory Access Control activated.\n");
}

/**
 * tomoyo_load_builtin_policy - Load built-in policy.
 *
 * Feeds five built-in policy texts (profile, exception policy, domain
 * policy, manager, stat), in that order, line by line through
 * tomoyo_parse_policy() using the write handler that belongs to each kind.
 * The texts are modified in place: every '\n' is overwritten with '\0'.
 *
 * Returns nothing.
 */
void __init tomoyo_load_builtin_policy(void)
{
#ifdef CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING
        /* Minimal built-in defaults: profile 0 in learning mode. */
        static char tomoyo_builtin_profile[] __initdata =
                "PROFILE_VERSION=20150505\n"
                "0-CONFIG={ mode=learning grant_log=no reject_log=yes }\n";
        static char tomoyo_builtin_exception_policy[] __initdata =
                "aggregator proc:/self/exe /proc/self/exe\n";
        static char tomoyo_builtin_domain_policy[] __initdata = "";
        static char tomoyo_builtin_manager[] __initdata = "";
        static char tomoyo_builtin_stat[] __initdata = "";
#else
        /*
         * This include file is manually created and contains built-in policy
         * named "tomoyo_builtin_profile", "tomoyo_builtin_exception_policy",
         * "tomoyo_builtin_domain_policy", "tomoyo_builtin_manager",
         * "tomoyo_builtin_stat" in the form of "static char [] __initdata".
         */
#include "builtin-policy.h"
#endif
        u8 i;
        const int idx = tomoyo_read_lock();

        for (i = 0; i < 5; i++) {
                /* Fresh, zeroed I/O buffer for each kind of policy. */
                struct tomoyo_io_buffer head = { };
                char *start = "";

                /* Pick the text and the matching interface type/handler. */
                switch (i) {
                case 0:
                        start = tomoyo_builtin_profile;
                        head.type = TOMOYO_PROFILE;
                        head.write = tomoyo_write_profile;
                        break;
                case 1:
                        start = tomoyo_builtin_exception_policy;
                        head.type = TOMOYO_EXCEPTIONPOLICY;
                        head.write = tomoyo_write_exception;
                        break;
                case 2:
                        start = tomoyo_builtin_domain_policy;
                        head.type = TOMOYO_DOMAINPOLICY;
                        head.write = tomoyo_write_domain;
                        break;
                case 3:
                        start = tomoyo_builtin_manager;
                        head.type = TOMOYO_MANAGER;
                        head.write = tomoyo_write_manager;
                        break;
                case 4:
                        start = tomoyo_builtin_stat;
                        head.type = TOMOYO_STAT;
                        head.write = tomoyo_write_stat;
                        break;
                }
                /*
                 * Split at each '\n' and process one line at a time. Text
                 * after the last '\n' (an unterminated final line) is not
                 * processed.
                 */
                while (1) {
                        char *end = strchr(start, '\n');

                        if (!end)
                                break;
                        *end = '\0';
                        tomoyo_normalize_line(start);
                        head.write_buf = start;
                        /* Errors from individual lines are ignored. */
                        tomoyo_parse_policy(&head, start);
                        start = end + 1;
                }
        }
        tomoyo_read_unlock(idx);
#ifdef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
        /* No userspace loader will run, so validate the profiles right away. */
        tomoyo_check_profile();
#endif
}














































































    1 






    1 





































    2 

    1 


    1 

    1 




    1 




















































    1 



    1 
































    1 



    1 



























































    1 

















    1 




    1 












    1 



























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/pnode.c
 *
 * (C) Copyright IBM Corporation 2005.
 *        Author : Ram Pai (linuxram@us.ibm.com)
 */
#include <linux/mnt_namespace.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/nsproxy.h>
#include <uapi/linux/mount.h>
#include "internal.h"
#include "pnode.h"

/* return the next shared peer mount of @p */
static inline struct mount *next_peer(struct mount *p)
{
        return list_entry(p->mnt_share.next, struct mount, mnt_share);
}

/* return the first entry on @p's mnt_slave_list; the list must not be empty */
static inline struct mount *first_slave(struct mount *p)
{
        return list_first_entry(&p->mnt_slave_list, struct mount, mnt_slave);
}

/* return the last entry on @p's mnt_slave_list; the list must not be empty */
static inline struct mount *last_slave(struct mount *p)
{
        return list_last_entry(&p->mnt_slave_list, struct mount, mnt_slave);
}

static inline struct mount *next_slave(struct mount *p)
{
        return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
}

/*
 * Walk the peer ring of @mnt (starting with @mnt itself) and return the
 * first peer that lives in namespace @ns and whose root is reachable from
 * @root. Returns NULL if the walk comes back around to @mnt without a match.
 */
static struct mount *get_peer_under_root(struct mount *mnt,
                                         struct mnt_namespace *ns,
                                         const struct path *root)
{
        struct mount *peer = mnt;

        for (;;) {
                /* Check the namespace first for optimization */
                if (peer->mnt_ns == ns &&
                    is_path_reachable(peer, peer->mnt.mnt_root, root))
                        return peer;

                peer = next_peer(peer);
                if (peer == mnt)
                        return NULL;
        }
}

/*
 * Get ID of closest dominating peer group having a representative
 * under the given root.
 *
 * Climbs the chain of masters starting at @mnt's master and returns the
 * group id of the first master whose peer group has a member in @mnt's
 * namespace that is reachable from @root. Returns 0 if there is none.
 *
 * Caller must hold namespace_sem
 */
int get_dominating_id(struct mount *mnt, const struct path *root)
{
        struct mount *master = mnt->mnt_master;

        while (master) {
                struct mount *peer;

                peer = get_peer_under_root(master, mnt->mnt_ns, root);
                if (peer)
                        return peer->mnt_group_id;
                master = master->mnt_master;
        }

        return 0;
}

/*
 * Remove @mnt from its peer group and make it a slave of a suitable master:
 * one of its former peers if it had any, otherwise its existing master.
 * Slaves of @mnt are handed over to that same master. If @mnt has neither
 * peers nor a master, its slaves are simply detached.
 *
 * Always returns 0.
 */
static int do_make_slave(struct mount *mnt)
{
        struct mount *master, *slave_mnt;

        if (list_empty(&mnt->mnt_share)) {
                /* @mnt is alone in its peer group (or not shared at all). */
                if (IS_MNT_SHARED(mnt)) {
                        mnt_release_group_id(mnt);
                        CLEAR_MNT_SHARED(mnt);
                }
                master = mnt->mnt_master;
                if (!master) {
                        /*
                         * No peer and no master to inherit the slaves:
                         * unlink each slave and clear its master pointer.
                         */
                        struct list_head *p = &mnt->mnt_slave_list;
                        while (!list_empty(p)) {
                                slave_mnt = list_first_entry(p,
                                                struct mount, mnt_slave);
                                list_del_init(&slave_mnt->mnt_slave);
                                slave_mnt->mnt_master = NULL;
                        }
                        return 0;
                }
        } else {
                struct mount *m;
                /*
                 * slave 'mnt' to a peer mount that has the
                 * same root dentry. If none is available then
                 * slave it to anything that is available.
                 */
                /* master defaults to the next peer if no root matches. */
                for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) {
                        if (m->mnt.mnt_root == mnt->mnt.mnt_root) {
                                master = m;
                                break;
                        }
                }
                /* Leave the peer group. */
                list_del_init(&mnt->mnt_share);
                mnt->mnt_group_id = 0;
                CLEAR_MNT_SHARED(mnt);
        }
        /* Re-parent every slave of @mnt to the chosen master. */
        list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
                slave_mnt->mnt_master = master;
        /* Put @mnt at the front of master's slave list ... */
        list_move(&mnt->mnt_slave, &master->mnt_slave_list);
        /* ... and append @mnt's former slaves after master's current last slave. */
        list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
        INIT_LIST_HEAD(&mnt->mnt_slave_list);
        mnt->mnt_master = master;
        return 0;
}

/*
 * Switch @mnt to propagation type @type.
 *
 * MS_SHARED only marks the mount shared. Every other type first turns the
 * mount into a slave; anything other than MS_SLAVE then also cuts the link
 * to the master and sets or clears MNT_UNBINDABLE according to @type.
 *
 * vfsmount lock must be held for write
 */
void change_mnt_propagation(struct mount *mnt, int type)
{
        if (type == MS_SHARED) {
                set_mnt_shared(mnt);
                return;
        }

        do_make_slave(mnt);
        if (type == MS_SLAVE)
                return;

        /* Private or unbindable: no master at all. */
        list_del_init(&mnt->mnt_slave);
        mnt->mnt_master = NULL;
        if (type == MS_UNBINDABLE)
                mnt->mnt.mnt_flags |= MNT_UNBINDABLE;
        else
                mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE;
}

/*
 * get the next mount in the propagation tree.
 * @m: the mount seen last
 * @origin: the original mount from where the tree walk initiated
 *
 * Returns NULL once the walk arrives back at @origin.
 *
 * Note that peer groups form contiguous segments of slave lists.
 * We rely on that in get_source() to be able to find out if
 * vfsmount found while iterating with propagation_next() is
 * a peer of one we'd found earlier.
 */
static struct mount *propagation_next(struct mount *m,
                                         struct mount *origin)
{
        struct mount *master;

        /* are there any slaves of this mount? */
        if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
                return first_slave(m);

        /* no slaves below: move sideways, climbing to the master as needed */
        for (;; m = master) {
                master = m->mnt_master;

                if (master == origin->mnt_master) {
                        /* same level as origin: continue around the peers */
                        struct mount *peer = next_peer(m);

                        if (peer == origin)
                                return NULL;
                        return peer;
                }
                if (m->mnt_slave.next != &master->mnt_slave_list)
                        return next_slave(m);

                /* back at master */
        }
}

/*
 * Return the position from which propagation_next() will no longer
 * hand out the slaves of 'm': the last slave of 'm' when it has any
 * (and 'm' is not new), otherwise 'm' itself.  'origin' is not consulted.
 */
static struct mount *skip_propagation_subtree(struct mount *m,
                                                struct mount *origin)
{
        if (IS_MNT_NEW(m) || list_empty(&m->mnt_slave_list))
                return m;

        return last_slave(m);
}

/*
 * Find the next group of mounts to visit while walking the propagation
 * tree that starts at @origin; propagate_mnt() uses this to step through
 * the slave groups of its destination mount.
 * @m: the mount the previous step ended on
 * @origin: the mount the walk was started from
 *
 * Returns a mount of the next group, or NULL when the walk has come
 * back to @origin.
 */
static struct mount *next_group(struct mount *m, struct mount *origin)
{
        while (1) {
                /*
                 * Walk along the peers of m.  The first one that has
                 * slaves of its own sends us down to its first slave.
                 * Outside origin's group the walk stops as soon as the
                 * next peer is not also the next entry on the slave list.
                 */
                while (1) {
                        struct mount *next;
                        if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
                                return first_slave(m);
                        next = next_peer(m);
                        if (m->mnt_group_id == origin->mnt_group_id) {
                                /* wrapped around origin's own group: done */
                                if (next == origin)
                                        return NULL;
                        } else if (m->mnt_slave.next != &next->mnt_slave)
                                break;
                        m = next;
                }
                /* m is the last peer */
                while (1) {
                        struct mount *master = m->mnt_master;
                        /* another entry after m on the master's slave list? */
                        if (m->mnt_slave.next != &master->mnt_slave_list)
                                return next_slave(m);
                        /* m was the last slave: continue on the master's level */
                        m = next_peer(master);
                        if (master->mnt_group_id == origin->mnt_group_id)
                                break;
                        if (master->mnt_slave.next == &m->mnt_slave)
                                break;
                        m = master;
                }
                /* the climb ended on origin: the walk is complete */
                if (m == origin)
                        return NULL;
        }
}

/*
 * Scratch state shared between propagate_mnt() and propagate_one().
 * propagate_mnt() seeds these from its arguments (last_dest = dest_mnt,
 * first_source = last_source = source_mnt, list = tree_list,
 * dest_master = dest_mnt->mnt_master); propagate_one() then advances
 * last_dest to the mount it just handled and last_source to the copy
 * it just made, and adds each copy to 'list'.
 *
 * all accesses are serialized by namespace_sem
 */
static struct mount *last_dest, *first_source, *last_source, *dest_master;
static struct hlist_head *list;

static inline bool peers(const struct mount *m1, const struct mount *m2)
{
        return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
}

/*
 * Propagate the mount being attached to a single destination mount.
 * @m: mount that should receive a copy
 * @dest_mp: mountpoint the copy is attached at
 *
 * Works on the file-scope state (last_dest, first_source, last_source,
 * dest_master, list) that propagate_mnt() sets up before calling here.
 *
 * Returns 0 when @m is skipped, the error from copy_tree() when copying
 * fails, and otherwise whatever count_mounts() returns for the new copy.
 */
static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
{
        struct mount *child;
        int type;
        /* skip ones added by this propagate_mnt() */
        if (IS_MNT_NEW(m))
                return 0;
        /* skip if mountpoint isn't covered by it */
        if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
                return 0;
        if (peers(m, last_dest)) {
                /* same peer group as the previous destination */
                type = CL_MAKE_SHARED;
        } else {
                struct mount *n, *p;
                bool done;
                /*
                 * Climb the masters of m until reaching dest_master or a
                 * master that is marked; p is that master and n is the
                 * mount directly below it on the way up.
                 */
                for (n = m; ; n = p) {
                        p = n->mnt_master;
                        if (p == dest_master || IS_MNT_MARKED(p))
                                break;
                }
                /*
                 * Move last_source up its own chain of masters until it is
                 * a peer of first_source or the master of its parent is p.
                 * In the latter case one more step is taken unless that
                 * parent is a peer of n.
                 */
                do {
                        struct mount *parent = last_source->mnt_parent;
                        if (peers(last_source, first_source))
                                break;
                        done = parent->mnt_master == p;
                        if (done && peers(n, parent))
                                break;
                        last_source = last_source->mnt_master;
                } while (!done);

                type = CL_SLAVE;
                /* beginning of peer group among the slaves? */
                if (IS_MNT_SHARED(m))
                        type |= CL_MAKE_SHARED;
        }
                
        child = copy_tree(last_source, last_source->mnt.mnt_root, type);
        if (IS_ERR(child))
                return PTR_ERR(child);
        read_seqlock_excl(&mount_lock);
        mnt_set_mountpoint(m, dest_mp, child);
        /* mark the master of m; propagate_mnt() clears the mark again on exit */
        if (m->mnt_master != dest_master)
                SET_MNT_MARK(m->mnt_master);
        read_sequnlock_excl(&mount_lock);
        /* the next call measures against this destination and this copy */
        last_dest = m;
        last_source = child;
        hlist_add_head(&child->mnt_hash, list);
        return count_mounts(m->mnt_ns, child);
}

/*
 * mount 'source_mnt' under the destination 'dest_mnt' at the
 * mountpoint 'dest_mp', and propagate that mount to all the
 * peer and slave mounts of 'dest_mnt'.
 * Link all the new mounts into a propagation tree headed at
 * source_mnt. Also link all the new mounts using ->mnt_list
 * headed at source_mnt's ->mnt_list
 *
 * @dest_mnt: destination mount.
 * @dest_mp: destination mountpoint.
 * @source_mnt: source mount.
 * @tree_list : list of heads of trees to be attached.
 *
 * Returns 0, or the first non-zero value handed back by propagate_one().
 */
int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
                    struct mount *source_mnt, struct hlist_head *tree_list)
{
        struct mount *peer, *group, *member, *copy;
        int err = 0;

        /*
         * propagate_one() reads its working state from file-scope
         * variables instead of taking a long argument list; that is
         * fine because everything is serialized by namespace_sem.
         */
        dest_master = dest_mnt->mnt_master;
        list = tree_list;
        first_source = source_mnt;
        last_source = source_mnt;
        last_dest = dest_mnt;

        /* every peer of dest_mnt, dest_mnt itself excluded */
        for (peer = next_peer(dest_mnt); peer != dest_mnt;
             peer = next_peer(peer)) {
                err = propagate_one(peer, dest_mp);
                if (err)
                        goto unmark;
        }

        /* then one slave group after another */
        group = next_group(dest_mnt, dest_mnt);
        while (group) {
                /* once around all members of this group */
                member = group;
                do {
                        err = propagate_one(member, dest_mp);
                        if (err)
                                goto unmark;
                        member = next_peer(member);
                } while (member != group);

                group = next_group(group, dest_mnt);
        }

unmark:
        /* drop the marks propagate_one() left on masters */
        read_seqlock_excl(&mount_lock);
        hlist_for_each_entry(copy, tree_list, mnt_hash) {
                struct mount *parent = copy->mnt_parent;

                if (parent->mnt_master != dest_mnt->mnt_master)
                        CLEAR_MNT_MARK(parent->mnt_master);
        }
        read_sequnlock_excl(&mount_lock);

        return err;
}

/*
 * If there is exactly one mount covering mnt completely return it,
 * otherwise return NULL.
 */
static struct mount *find_topper(struct mount *mnt)
{
        struct mount *only;

        if (list_is_singular(&mnt->mnt_mounts)) {
                only = list_first_entry(&mnt->mnt_mounts, struct mount,
                                        mnt_child);
                /* it only qualifies when mounted on mnt's own root */
                if (only->mnt_mountpoint == mnt->mnt.mnt_root)
                        return only;
        }

        return NULL;
}

/*
 * Tell whether 'mnt' holds more references than 'count':
 * non-zero when mnt_get_count() exceeds 'count', zero otherwise.
 */
static inline int do_refcount_check(struct mount *mnt, int count)
{
        return count < mnt_get_count(mnt);
}

/**
 * propagation_would_overmount - check whether propagation from @from
 *                               would overmount @to
 * @from: shared mount
 * @to:   mount to check
 * @mp:   future mountpoint of @to on @from
 *
 * For @from to propagate mounts to @to, the two must be peers, or one
 * of the masters found by following ->mnt_master upwards from @to must
 * be a peer of @from.
 *
 * When in addition the root of @to is the dentry of its future
 * mountpoint @mp on @from, whatever gets propagated to @to will
 * overmount it.
 *
 * Context: This function expects namespace_lock() to be held and that
 *          @mp is stable.
 * Return: If @from overmounts @to, true is returned, false if not.
 */
bool propagation_would_overmount(const struct mount *from,
                                 const struct mount *to,
                                 const struct mountpoint *mp)
{
        const struct mount *cursor;

        /* an unshared @from or a new @to rules the case out right away */
        if (!IS_MNT_SHARED(from) || IS_MNT_NEW(to))
                return false;

        /* only a mount rooted at @mp is of interest */
        if (to->mnt.mnt_root != mp->m_dentry)
                return false;

        /* @to itself, or some master above it, has to be a peer of @from */
        cursor = to;
        while (cursor) {
                if (peers(from, cursor))
                        return true;
                cursor = cursor->mnt_master;
        }

        return false;
}

/*
 * check if the mount 'mnt' can be unmounted successfully.
 * @mnt: the mount to be checked for unmount
 * @refcnt: number of references 'mnt' is allowed to hold
 *
 * NOTE: unmounting 'mnt' would naturally propagate to all
 * other mounts its parent propagates to.
 * Check if any of these mounts that **do not have submounts**
 * (a single mount sitting on their root aside) have more
 * references than expected. If so return busy (1), otherwise 0.
 *
 * vfsmount lock must be held for write
 */
int propagate_mount_busy(struct mount *mnt, int refcnt)
{
        struct mount *parent = mnt->mnt_parent;
        struct mount *m;

        /* a mount that is its own parent is judged on its refcount alone */
        if (parent == mnt)
                return do_refcount_check(mnt, refcnt);

        /*
         * Cheap test on the mount itself first; when it already fails
         * there is no point in walking the propagation tree.
         */
        if (!list_empty(&mnt->mnt_mounts))
                return 1;
        if (do_refcount_check(mnt, refcnt))
                return 1;

        for (m = propagation_next(parent, parent); m;
             m = propagation_next(m, parent)) {
                struct mount *twin = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
                int allowed;

                if (!twin)
                        continue;

                /*
                 * A single mount covering the twin completely contributes
                 * one reference that is to be ignored; a twin with any
                 * other children is not examined.
                 */
                if (find_topper(twin))
                        allowed = 2;
                else if (list_empty(&twin->mnt_mounts))
                        allowed = 1;
                else
                        continue;

                if (do_refcount_check(twin, allowed))
                        return 1;
        }

        return 0;
}

/*
 * Clear MNT_LOCKED when it can be shown to be safe.
 *
 * Walks every mount the parent of 'mnt' propagates to and drops the
 * flag from whatever is mounted there at the mountpoint of 'mnt'.
 *
 * mount_lock lock must be held for write
 */
void propagate_mount_unlock(struct mount *mnt)
{
        struct mount *parent = mnt->mnt_parent;
        struct mount *m;

        BUG_ON(parent == mnt);

        m = propagation_next(parent, parent);
        while (m) {
                struct mount *twin = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);

                if (twin)
                        twin->mnt.mnt_flags &= ~MNT_LOCKED;

                m = propagation_next(m, parent);
        }
}

/*
 * Take a single mount out of service during propagated umount.
 * @mnt: the mount to retire
 * @to_umount: list handed on to move_from_ns() together with @mnt
 *
 * Clears the mark on @mnt, flags it MNT_UMOUNT and unlinks it from the
 * mnt_child and mnt_umounting lists it is on.
 */
static void umount_one(struct mount *mnt, struct list_head *to_umount)
{
        CLEAR_MNT_MARK(mnt);
        mnt->mnt.mnt_flags |= MNT_UMOUNT;
        /* no longer a child of its parent, no longer a pending candidate */
        list_del_init(&mnt->mnt_child);
        list_del_init(&mnt->mnt_umounting);
        /* NOTE(review): presumably queues mnt on to_umount - confirm in move_from_ns() */
        move_from_ns(mnt, to_umount);
}

/*
 * Decide what to do with one candidate found by propagate_umount().
 * @mnt: the candidate mount
 * @to_umount: passed to umount_one() when @mnt is unmounted right away
 * @to_restore: list @mnt is moved to when it is kept
 *
 * Returns true only when @mnt got marked by this call, false when it
 * was already unmounted or marked, or when it still has a mounted child.
 *
 * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
 * parent propagates to.
 */
static bool __propagate_umount(struct mount *mnt,
                               struct list_head *to_umount,
                               struct list_head *to_restore)
{
        bool progress = false;
        struct mount *child;

        /*
         * The state of the parent won't change if this mount is
         * already unmounted or marked as without children.
         */
        if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
                goto out;

        /* Verify topper is the only grandchild that has not been
         * speculatively unmounted.
         */
        list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
                /* a child mounted on mnt's own root does not count */
                if (child->mnt_mountpoint == mnt->mnt.mnt_root)
                        continue;
                /* neither does a child that is queued and already marked */
                if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
                        continue;
                /* Found a mounted child */
                goto children;
        }

        /* Mark mounts that can be unmounted if not locked */
        SET_MNT_MARK(mnt);
        progress = true;

        /* If a mount is without children and not locked umount it. */
        if (!IS_MNT_LOCKED(mnt)) {
                umount_one(mnt, to_umount);
        } else {
                /*
                 * Reached either because mnt is locked or, via the goto
                 * above, because it still has a mounted child; in the
                 * latter case progress stays false.
                 */
children:
                list_move_tail(&mnt->mnt_umounting, to_restore);
        }
out:
        return progress;
}

/*
 * Go over the children of every mount on 'to_umount'.  A child mounted
 * on the root of its parent (a topper) is moved to 'to_restore'; every
 * other child is given to umount_one() with the same 'to_umount' list.
 */
static void umount_list(struct list_head *to_umount,
                        struct list_head *to_restore)
{
        struct mount *victim;

        list_for_each_entry(victim, to_umount, mnt_list) {
                struct mount *kid, *next;

                list_for_each_entry_safe(kid, next, &victim->mnt_mounts,
                                         mnt_child) {
                        bool is_topper =
                                kid->mnt_mountpoint == victim->mnt.mnt_root;

                        if (!is_topper) {
                                umount_one(kid, to_umount);
                                continue;
                        }
                        list_move_tail(&kid->mnt_umounting, to_restore);
                }
        }
}

/*
 * Empty 'to_restore': unmark each mount, take it off the list and, when
 * its parent has been flagged MNT_UMOUNT, reattach it to the nearest
 * ancestor that has not.
 */
static void restore_mounts(struct list_head *to_restore)
{
        struct mount *mnt, *anc;
        struct mountpoint *mp;

        /* Restore mounts to a clean working state */
        while (!list_empty(to_restore)) {
                mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
                CLEAR_MNT_MARK(mnt);
                list_del_init(&mnt->mnt_umounting);

                /*
                 * Should this mount be reparented?  Climb past every
                 * ancestor flagged MNT_UMOUNT, remembering the mountpoint
                 * of the last one passed.
                 */
                mp = mnt->mnt_mp;
                for (anc = mnt->mnt_parent; anc->mnt.mnt_flags & MNT_UMOUNT;
                     anc = anc->mnt_parent)
                        mp = anc->mnt_mp;

                if (anc == mnt->mnt_parent)
                        continue;

                mnt_change_mountpoint(anc, mp, mnt);
        }
}

static void cleanup_umount_visitations(struct list_head *visited)
{
        while (!list_empty(visited)) {
                struct mount *mnt =
                        list_first_entry(visited, struct mount, mnt_umounting);
                list_del_init(&mnt->mnt_umounting);
        }
}

/*
 * collect all mounts that receive propagation from the mounts in @list,
 * and return these additional mounts in the same list.
 * @list: the list of mounts to be unmounted.
 *
 * Always returns 0.
 *
 * vfsmount lock must be held for write
 */
int propagate_umount(struct list_head *list)
{
        struct mount *mnt;
        LIST_HEAD(to_restore);
        LIST_HEAD(to_umount);
        LIST_HEAD(visited);

        /* Find candidates for unmounting */
        list_for_each_entry_reverse(mnt, list, mnt_list) {
                struct mount *parent = mnt->mnt_parent;
                struct mount *m;

                /*
                 * If this mount has already been visited it is known that its
                 * entire peer group and all of their slaves in the propagation
                 * tree for the mountpoint have already been visited and there
                 * is no need to visit them again.
                 */
                if (!list_empty(&mnt->mnt_umounting))
                        continue;

                list_add_tail(&mnt->mnt_umounting, &visited);
                for (m = propagation_next(parent, parent); m;
                     m = propagation_next(m, parent)) {
                        struct mount *child = __lookup_mnt(&m->mnt,
                                                           mnt->mnt_mountpoint);
                        if (!child)
                                continue;

                        if (!list_empty(&child->mnt_umounting)) {
                                /*
                                 * If the child has already been visited it is
                                 * known that its entire peer group and all of
                                 * their slaves in the propagation tree for the
                                 * mountpoint have already been visited and
                                 * there is no need to visit this subtree again.
                                 */
                                m = skip_propagation_subtree(m, parent);
                                continue;
                        } else if (child->mnt.mnt_flags & MNT_UMOUNT) {
                                /*
                                 * We have come across a partially unmounted
                                 * mount in list that has not been visited yet.
                                 * Remember it has been visited and continue
                                 * about our merry way.
                                 */
                                list_add_tail(&child->mnt_umounting, &visited);
                                continue;
                        }

                        /* Check the child and parents while progress is made */
                        while (__propagate_umount(child,
                                                  &to_umount, &to_restore)) {
                                /* Is the parent a umount candidate? */
                                child = child->mnt_parent;
                                if (list_empty(&child->mnt_umounting))
                                        break;
                        }
                }
        }

        /*
         * Finish up: process the children of everything queued for
         * umount, put the kept mounts back into a clean state, empty
         * the visited list and append the collected mounts to @list.
         */
        umount_list(&to_umount, &to_restore);
        restore_mounts(&to_restore);
        cleanup_umount_visitations(&visited);
        list_splice_tail(&to_umount, list);

        return 0;
}



























    1 









































    1 






































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM io_uring

#if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IO_URING_H

#include <linux/tracepoint.h>
#include <uapi/linux/io_uring.h>
#include <linux/io_uring_types.h>
#include <linux/io_uring.h>

struct io_wq_work;

/**
 * io_uring_create - called after a new io_uring context was prepared
 *
 * @fd:                corresponding file descriptor
 * @ctx:               pointer to a ring context structure
 * @sq_entries:        actual SQ size
 * @cq_entries:        actual CQ size
 * @flags:             SQ ring flags, provided to io_uring_setup(2)
 *
 * Allows tracing io_uring creation and provides a pointer to the context that
 * can be used later to find correlated events.
 *
 * The ring sizes are u32 and are therefore printed with %u.
 */
TRACE_EVENT(io_uring_create,

        TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags),

        TP_ARGS(fd, ctx, sq_entries, cq_entries, flags),

        TP_STRUCT__entry (
                __field(  int,                fd                )
                __field(  void *,        ctx                )
                __field(  u32,                sq_entries        )
                __field(  u32,                cq_entries        )
                __field(  u32,                flags                )
        ),

        TP_fast_assign(
                __entry->fd                = fd;
                __entry->ctx                = ctx;
                __entry->sq_entries        = sq_entries;
                __entry->cq_entries        = cq_entries;
                __entry->flags                = flags;
        ),

        TP_printk("ring %p, fd %d sq size %u, cq size %u, flags 0x%x",
                          __entry->ctx, __entry->fd, __entry->sq_entries,
                          __entry->cq_entries, __entry->flags)
);

/**
 * io_uring_register - called after a buffer/file/eventfd was successfully
 *                     registered for a ring
 *
 * @ctx:        pointer to a ring context structure
 * @opcode:     describes which operation to perform
 * @nr_files:   number of registered files
 * @nr_bufs:    number of registered buffers
 * @ret:        return code
 *
 * Allows tracing fixed files/buffers that could be registered to avoid the
 * overhead of getting references to them for every operation. This event,
 * together with io_uring_file_get, can provide a full picture of how much
 * overhead one can reduce via fixing.
 *
 * @opcode, @nr_files and @nr_bufs are unsigned and are printed with %u.
 */
TRACE_EVENT(io_uring_register,

        TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files,
                         unsigned nr_bufs, long ret),

        TP_ARGS(ctx, opcode, nr_files, nr_bufs, ret),

        TP_STRUCT__entry (
                __field(  void *,        ctx        )
                __field(  unsigned,        opcode        )
                __field(  unsigned,        nr_files)
                __field(  unsigned,        nr_bufs        )
                __field(  long,                ret        )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->opcode                = opcode;
                __entry->nr_files        = nr_files;
                __entry->nr_bufs        = nr_bufs;
                __entry->ret                = ret;
        ),

        TP_printk("ring %p, opcode %u, nr_user_files %u, nr_user_bufs %u, "
                          "ret %ld",
                          __entry->ctx, __entry->opcode, __entry->nr_files,
                          __entry->nr_bufs, __entry->ret)
);

/**
 * io_uring_file_get - called before getting references to an SQE file
 *
 * @req:        pointer to a submitted request
 * @fd:         SQE file descriptor
 *
 * Allows tracing how often an SQE file reference is obtained, which can help
 * figure out whether it makes sense to use fixed files, or check that fixed
 * files are used correctly.
 *
 * The recorded ring pointer and user_data are read from @req (req->ctx and
 * req->cqe.user_data).
 */
TRACE_EVENT(io_uring_file_get,

        TP_PROTO(struct io_kiocb *req, int fd),

        TP_ARGS(req, fd),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  u64,                user_data        )
                __field(  int,                fd                )
        ),

        TP_fast_assign(
                __entry->ctx                = req->ctx;
                __entry->req                = req;
                __entry->user_data        = req->cqe.user_data;
                __entry->fd                = fd;
        ),

        TP_printk("ring %p, req %p, user_data 0x%llx, fd %d",
                __entry->ctx, __entry->req, __entry->user_data, __entry->fd)
);

/**
 * io_uring_queue_async_work - called before submitting a new async work
 *
 * @req:        pointer to a submitted request
 * @rw:         type of workqueue; non-zero is printed as "hashed", zero as
 *              "normal"
 *
 * Allows tracing asynchronous work submission.
 *
 * Besides the values passed in, the event records the address of the work
 * item embedded in @req (&req->work), the request flags, and the opcode both
 * as a number and as the string returned by io_uring_get_opcode().
 */
TRACE_EVENT(io_uring_queue_async_work,

        TP_PROTO(struct io_kiocb *req, int rw),

        TP_ARGS(req, rw),

        TP_STRUCT__entry (
                __field(  void *,                        ctx                )
                __field(  void *,                        req                )
                __field(  u64,                                user_data        )
                __field(  u8,                                opcode                )
                __field(  unsigned long long,                flags                )
                __field(  struct io_wq_work *,                work                )
                __field(  int,                                rw                )

                __string( op_str, io_uring_get_opcode(req->opcode)        )
        ),

        TP_fast_assign(
                __entry->ctx                = req->ctx;
                __entry->req                = req;
                __entry->user_data        = req->cqe.user_data;
                __entry->flags                = (__force unsigned long long) req->flags;
                __entry->opcode                = req->opcode;
                __entry->work                = &req->work;
                __entry->rw                = rw;

                __assign_str(op_str);
        ),

        TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
                __entry->ctx, __entry->req, __entry->user_data,
                __get_str(op_str), __entry->flags,
                __entry->rw ? "hashed" : "normal", __entry->work)
);

/**
 * io_uring_defer - called when an io_uring request is deferred
 *
 * @req:        pointer to a deferred request
 *
 * Allows tracking deferred requests, to get an insight into which requests
 * are not started immediately.
 *
 * The ring pointer, user_data and opcode are all read from @req; the opcode
 * is printed as the string returned by io_uring_get_opcode().
 */
TRACE_EVENT(io_uring_defer,

        TP_PROTO(struct io_kiocb *req),

        TP_ARGS(req),

        TP_STRUCT__entry (
                __field(  void *,                ctx        )
                __field(  void *,                req        )
                __field(  unsigned long long,        data        )
                __field(  u8,                        opcode        )

                __string( op_str, io_uring_get_opcode(req->opcode) )
        ),

        TP_fast_assign(
                __entry->ctx        = req->ctx;
                __entry->req        = req;
                __entry->data        = req->cqe.user_data;
                __entry->opcode        = req->opcode;

                __assign_str(op_str);
        ),

        TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
                __entry->ctx, __entry->req, __entry->data,
                __get_str(op_str))
);

/**
 * io_uring_link - called before the io_uring request is added into the
 *                 link_list of another request
 *
 * @req:                pointer to a linked request
 * @target_req:         pointer to a previous request, that would contain @req
 *
 * Allows tracking linked requests, to understand dependencies between
 * requests and how they influence the execution flow.
 *
 * The ring pointer recorded in the event is taken from @req (req->ctx).
 */
TRACE_EVENT(io_uring_link,

        TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req),

        TP_ARGS(req, target_req),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  void *,        target_req        )
        ),

        TP_fast_assign(
                __entry->ctx                = req->ctx;
                __entry->req                = req;
                __entry->target_req        = target_req;
        ),

        TP_printk("ring %p, request %p linked after %p",
                          __entry->ctx, __entry->req, __entry->target_req)
);

/**
 * io_uring_cqring_wait - called before starting to wait for an available CQE
 *
 * @ctx:                pointer to a ring context structure
 * @min_events:         minimal number of events to wait for
 *
 * Allows tracking waits for CQEs, so that we can e.g. troubleshoot situations
 * where an application waits for an event that never comes.
 */
TRACE_EVENT(io_uring_cqring_wait,

        TP_PROTO(void *ctx, int min_events),

        TP_ARGS(ctx, min_events),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  int,                min_events        )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->min_events        = min_events;
        ),

        TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events)
);

/**
 * io_uring_fail_link - called before failing a linked request
 *
 * @req:        request whose links were cancelled
 * @link:       cancelled link
 *
 * Allows tracking cancellation of linked requests, to see not only that some
 * work was cancelled, but also which request was the reason.
 *
 * The ring pointer, user_data and opcode are read from @req, not from @link.
 */
TRACE_EVENT(io_uring_fail_link,

        TP_PROTO(struct io_kiocb *req, struct io_kiocb *link),

        TP_ARGS(req, link),

        TP_STRUCT__entry (
                __field(  void *,                ctx                )
                __field(  void *,                req                )
                __field(  unsigned long long,        user_data        )
                __field(  u8,                        opcode                )
                __field(  void *,                link                )

                __string( op_str, io_uring_get_opcode(req->opcode) )
        ),

        TP_fast_assign(
                __entry->ctx                = req->ctx;
                __entry->req                = req;
                __entry->user_data        = req->cqe.user_data;
                __entry->opcode                = req->opcode;
                __entry->link                = link;

                __assign_str(op_str);
        ),

        TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
                __entry->ctx, __entry->req, __entry->user_data,
                __get_str(op_str), __entry->link)
);

/**
 * io_uring_complete - called when completing an SQE
 *
 * @ctx:                pointer to a ring context structure
 * @req:                pointer to a submitted request
 * @user_data:          user data associated with the request
 * @res:                result of the request
 * @cflags:             completion flags
 * @extra1:             extra 64-bit data for CQE32
 * @extra2:             extra 64-bit data for CQE32
 *
 * All values are recorded exactly as passed in; @extra1 and @extra2 are
 * printed in decimal, @user_data and @cflags in hexadecimal.
 */
TRACE_EVENT(io_uring_complete,

        TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags,
                 u64 extra1, u64 extra2),

        TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2),

        TP_STRUCT__entry (
                __field(  void *,        ctx                )
                __field(  void *,        req                )
                __field(  u64,                user_data        )
                __field(  int,                res                )
                __field(  unsigned,        cflags                )
                __field(  u64,                extra1                )
                __field(  u64,                extra2                )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->req                = req;
                __entry->user_data        = user_data;
                __entry->res                = res;
                __entry->cflags                = cflags;
                __entry->extra1                = extra1;
                __entry->extra2                = extra2;
        ),

        TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
                  "extra1 %llu extra2 %llu ",
                __entry->ctx, __entry->req,
                __entry->user_data,
                __entry->res, __entry->cflags,
                (unsigned long long) __entry->extra1,
                (unsigned long long) __entry->extra2)
);

/**
 * io_uring_submit_req - called before submitting a request
 *
 * @req:                pointer to a submitted request
 *
 * Allows tracking SQE submission, to understand what the source of it was:
 * the SQ thread or an io_uring_enter call.
 *
 * The sq_thread field records whether IORING_SETUP_SQPOLL is set in the flags
 * of the ring that owns @req (req->ctx->flags).
 */
TRACE_EVENT(io_uring_submit_req,

        TP_PROTO(struct io_kiocb *req),

        TP_ARGS(req),

        TP_STRUCT__entry (
                __field(  void *,                ctx                )
                __field(  void *,                req                )
                __field(  unsigned long long,        user_data        )
                __field(  u8,                        opcode                )
                __field(  unsigned long long,        flags                )
                __field(  bool,                        sq_thread        )

                __string( op_str, io_uring_get_opcode(req->opcode) )
        ),

        TP_fast_assign(
                __entry->ctx                = req->ctx;
                __entry->req                = req;
                __entry->user_data        = req->cqe.user_data;
                __entry->opcode                = req->opcode;
                __entry->flags                = (__force unsigned long long) req->flags;
                __entry->sq_thread        = req->ctx->flags & IORING_SETUP_SQPOLL;

                __assign_str(op_str);
        ),

        TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%llx, "
                  "sq_thread %d", __entry->ctx, __entry->req,
                  __entry->user_data, __get_str(op_str), __entry->flags,
                  __entry->sq_thread)
);

/**
 * io_uring_poll_arm - called after arming a poll wait if successful
 *
 * @req:                pointer to the armed request
 * @mask:               request poll events mask
 * @events:             registered events of interest
 *
 * Allows tracking which requests are waiting on a poll and what the events of
 * interest are.
 *
 * The ring pointer, user_data and opcode are read from @req; @mask and
 * @events are printed in hexadecimal.
 */
TRACE_EVENT(io_uring_poll_arm,

        TP_PROTO(struct io_kiocb *req, int mask, int events),

        TP_ARGS(req, mask, events),

        TP_STRUCT__entry (
                __field(  void *,                ctx                )
                __field(  void *,                req                )
                __field(  unsigned long long,        user_data        )
                __field(  u8,                        opcode                )
                __field(  int,                        mask                )
                __field(  int,                        events                )

                __string( op_str, io_uring_get_opcode(req->opcode) )
        ),

        TP_fast_assign(
                __entry->ctx                = req->ctx;
                __entry->req                = req;
                __entry->user_data        = req->cqe.user_data;
                __entry->opcode                = req->opcode;
                __entry->mask                = mask;
                __entry->events                = events;

                __assign_str(op_str);
        ),

        TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
                  __entry->ctx, __entry->req, __entry->user_data,
                  __get_str(op_str),
                  __entry->mask, __entry->events)
);

/**
 * io_uring_task_add - called after adding a task
 *
 * @req:                pointer to request
 * @mask:               request poll events mask
 *
 * The ring pointer, user_data and opcode are read from @req. @mask is printed
 * in hexadecimal without a "0x" prefix.
 */
TRACE_EVENT(io_uring_task_add,

        TP_PROTO(struct io_kiocb *req, int mask),

        TP_ARGS(req, mask),

        TP_STRUCT__entry (
                __field(  void *,                ctx                )
                __field(  void *,                req                )
                __field(  unsigned long long,        user_data        )
                __field(  u8,                        opcode                )
                __field(  int,                        mask                )

                __string( op_str, io_uring_get_opcode(req->opcode) )
        ),

        TP_fast_assign(
                __entry->ctx                = req->ctx;
                __entry->req                = req;
                __entry->user_data        = req->cqe.user_data;
                __entry->opcode                = req->opcode;
                __entry->mask                = mask;

                __assign_str(op_str);
        ),

        TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
                __entry->ctx, __entry->req, __entry->user_data,
                __get_str(op_str),
                __entry->mask)
);

/**
 * io_uring_req_failed - called when an sqe is errored during submission
 *
 * @sqe:                pointer to the io_uring_sqe that failed
 * @req:                pointer to request
 * @error:              error it failed with
 *
 * Allows easier diagnosing of malformed requests in production systems.
 *
 * Only the ring pointer is read from @req; user_data, opcode and every other
 * recorded field are copied from @sqe. The op_flags field is filled from
 * sqe->poll32_events and shows up as "rw_flags" in the output, and pad1 is
 * filled from sqe->__pad2[0].
 */
TRACE_EVENT(io_uring_req_failed,

        TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error),

        TP_ARGS(sqe, req, error),

        TP_STRUCT__entry (
                __field(  void *,                ctx                )
                __field(  void *,                req                )
                __field(  unsigned long long,        user_data        )
                __field(  u8,                        opcode                )
                __field(  u8,                        flags                )
                __field(  u8,                        ioprio                )
                __field( u64,                        off                )
                __field( u64,                        addr                )
                __field( u32,                        len                )
                __field( u32,                        op_flags        )
                __field( u16,                        buf_index        )
                __field( u16,                        personality        )
                __field( u32,                        file_index        )
                __field( u64,                        pad1                )
                __field( u64,                        addr3                )
                __field( int,                        error                )

                __string( op_str, io_uring_get_opcode(sqe->opcode) )
        ),

        TP_fast_assign(
                __entry->ctx                = req->ctx;
                __entry->req                = req;
                __entry->user_data        = sqe->user_data;
                __entry->opcode                = sqe->opcode;
                __entry->flags                = sqe->flags;
                __entry->ioprio                = sqe->ioprio;
                __entry->off                = sqe->off;
                __entry->addr                = sqe->addr;
                __entry->len                = sqe->len;
                __entry->op_flags        = sqe->poll32_events;
                __entry->buf_index        = sqe->buf_index;
                __entry->personality        = sqe->personality;
                __entry->file_index        = sqe->file_index;
                __entry->pad1                = sqe->__pad2[0];
                __entry->addr3                = sqe->addr3;
                __entry->error                = error;

                __assign_str(op_str);
        ),

        TP_printk("ring %p, req %p, user_data 0x%llx, "
                  "opcode %s, flags 0x%x, prio=%d, off=%llu, addr=%llu, "
                  "len=%u, rw_flags=0x%x, buf_index=%d, "
                  "personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, "
                  "error=%d",
                  __entry->ctx, __entry->req, __entry->user_data,
                  __get_str(op_str),
                  __entry->flags, __entry->ioprio,
                  (unsigned long long)__entry->off,
                  (unsigned long long) __entry->addr, __entry->len,
                  __entry->op_flags,
                  __entry->buf_index, __entry->personality, __entry->file_index,
                  (unsigned long long) __entry->pad1,
                  (unsigned long long) __entry->addr3, __entry->error)
);


/**
 * io_uring_cqe_overflow - a CQE overflowed
 *
 * @ctx:                pointer to a ring context structure
 * @user_data:          user data associated with the request
 * @res:                CQE result
 * @cflags:             CQE flags
 * @ocqe:               pointer to the overflow cqe (if available)
 *
 * All values are recorded exactly as passed in.
 */
TRACE_EVENT(io_uring_cqe_overflow,

        TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags,
                 void *ocqe),

        TP_ARGS(ctx, user_data, res, cflags, ocqe),

        TP_STRUCT__entry (
                __field(  void *,                ctx                )
                __field(  unsigned long long,        user_data        )
                __field(  s32,                        res                )
                __field(  u32,                        cflags                )
                __field(  void *,                ocqe                )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->user_data        = user_data;
                __entry->res                = res;
                __entry->cflags                = cflags;
                __entry->ocqe                = ocqe;
        ),

        TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, "
                  "overflow_cqe %p",
                  __entry->ctx, __entry->user_data, __entry->res,
                  __entry->cflags, __entry->ocqe)
);

/**
 * io_uring_task_work_run - ran task work
 *
 * @tctx:               pointer to a io_uring_task
 * @count:              how many functions it ran
 *
 * Both values are recorded exactly as passed in.
 */
TRACE_EVENT(io_uring_task_work_run,

        TP_PROTO(void *tctx, unsigned int count),

        TP_ARGS(tctx, count),

        TP_STRUCT__entry (
                __field(  void *,                tctx                )
                __field(  unsigned int,                count                )
        ),

        TP_fast_assign(
                __entry->tctx                = tctx;
                __entry->count                = count;
        ),

        TP_printk("tctx %p, count %u", __entry->tctx, __entry->count)
);

/**
 * io_uring_short_write - records a write that transferred less than requested
 *
 * @ctx:                pointer to a ring context structure
 * @fpos:               file position associated with the write
 *                      (NOTE(review): presumably the offset the write started
 *                      at; confirm against the caller)
 * @wanted:             amount the write was expected to transfer
 * @got:                amount the write actually transferred
 *
 * All three counters are u64 and are printed as unsigned decimals.
 */
TRACE_EVENT(io_uring_short_write,

        TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got),

        TP_ARGS(ctx, fpos, wanted, got),

        TP_STRUCT__entry(
                __field(void *,        ctx)
                __field(u64,        fpos)
                __field(u64,        wanted)
                __field(u64,        got)
        ),

        TP_fast_assign(
                __entry->ctx        = ctx;
                __entry->fpos        = fpos;
                __entry->wanted        = wanted;
                __entry->got        = got;
        ),

        /* u64 values: use the unsigned conversion so large values don't print negative */
        TP_printk("ring %p, fpos %llu, wanted %llu, got %llu",
                          __entry->ctx, __entry->fpos,
                          __entry->wanted, __entry->got)
);

/**
 * io_uring_local_work_run - ran ring local task work
 *
 * @ctx:                pointer to an io_uring_ctx
 * @count:              how many functions it ran
 * @loops:              how many loops it ran
 *
 * All values are recorded exactly as passed in.
 */
TRACE_EVENT(io_uring_local_work_run,

        TP_PROTO(void *ctx, int count, unsigned int loops),

        TP_ARGS(ctx, count, loops),

        TP_STRUCT__entry (
                __field(void *,                ctx        )
                __field(int,                count        )
                __field(unsigned int,        loops        )
        ),

        TP_fast_assign(
                __entry->ctx                = ctx;
                __entry->count                = count;
                __entry->loops                = loops;
        ),

        TP_printk("ring %p, count %d, loops %u", __entry->ctx, __entry->count, __entry->loops)
);

#endif /* _TRACE_IO_URING_H */

/* This part must be outside protection */
#include <trace/define_trace.h>





































    4 








    3 
    4 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
#ifndef _LINUX_RSEQ_H
#define _LINUX_RSEQ_H

#ifdef CONFIG_RSEQ

#include <linux/preempt.h>
#include <linux/sched.h>

/*
 * Bit numbers of the rseq events tracked per task.
 *
 * Each event bit is defined as the matching bit of the user-space ABI
 * enum rseq_cs_flags (RSEQ_CS_FLAG_NO_RESTART_ON_*_BIT), so that the event
 * mask can be checked directly against those flags.
 */
enum rseq_event_mask_bits {
        RSEQ_EVENT_PREEMPT_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
        RSEQ_EVENT_SIGNAL_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
        RSEQ_EVENT_MIGRATE_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
};

/*
 * Mask form of enum rseq_event_mask_bits: each value is a single bit,
 * 1U shifted left by the corresponding *_BIT number.
 */
enum rseq_event_mask {
        RSEQ_EVENT_PREEMPT        = (1U << RSEQ_EVENT_PREEMPT_BIT),
        RSEQ_EVENT_SIGNAL        = (1U << RSEQ_EVENT_SIGNAL_BIT),
        RSEQ_EVENT_MIGRATE        = (1U << RSEQ_EVENT_MIGRATE_BIT),
};

/*
 * Set TIF_NOTIFY_RESUME on @t, but only when @t has an rseq area registered
 * (t->rseq is non-NULL). Tasks without one are left untouched.
 */
static inline void rseq_set_notify_resume(struct task_struct *t)
{
        if (!t->rseq)
                return;

        set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}

/*
 * Out-of-line notify-resume handler, defined outside this header. It is
 * called by rseq_handle_notify_resume() once that wrapper has checked that
 * current has an rseq area registered.
 */
void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);

/*
 * Inline fast path for notify-resume handling: do nothing unless the current
 * task has an rseq area registered, otherwise hand @ksig and @regs over to
 * __rseq_handle_notify_resume().
 */
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
                                             struct pt_regs *regs)
{
        if (!current->rseq)
                return;

        __rseq_handle_notify_resume(ksig, regs);
}

/*
 * Record a signal event for the current task and run notify-resume handling.
 *
 * RSEQ_EVENT_SIGNAL_BIT is set in current->rseq_event_mask inside a
 * preempt_disable()/preempt_enable() pair, then @ksig and @regs are passed
 * to rseq_handle_notify_resume(), which only acts if current has an rseq
 * area registered.
 */
static inline void rseq_signal_deliver(struct ksignal *ksig,
                                       struct pt_regs *regs)
{
        /*
         * NOTE(review): preemption is presumably disabled because __set_bit()
         * is the non-atomic variant and rseq_preempt() updates the same mask.
         */
        preempt_disable();
        __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
        preempt_enable();
        rseq_handle_notify_resume(ksig, regs);
}

/*
 * Record a preemption event for @t: set RSEQ_EVENT_PREEMPT_BIT in
 * t->rseq_event_mask, then request notify-resume processing through
 * rseq_set_notify_resume() (which only acts if @t has an rseq area).
 *
 * rseq_preempt() requires preemption to be disabled.
 */
static inline void rseq_preempt(struct task_struct *t)
{
        __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
        rseq_set_notify_resume(t);
}

/*
 * Record a migration event for @t: set RSEQ_EVENT_MIGRATE_BIT in
 * t->rseq_event_mask, then request notify-resume processing through
 * rseq_set_notify_resume() (which only acts if @t has an rseq area).
 *
 * rseq_migrate() requires preemption to be disabled.
 */
static inline void rseq_migrate(struct task_struct *t)
{
        __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
        rseq_set_notify_resume(t);
}

/*
 * Set up the rseq state of a freshly created task @t.
 *
 * Without CLONE_VM in @clone_flags the child copies the rseq pointer, length,
 * signature and pending event mask from current. With CLONE_VM the child
 * starts with no rseq registration at all.
 */
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
        if (!(clone_flags & CLONE_VM)) {
                /* Separate address space: inherit the parent's registration. */
                t->rseq = current->rseq;
                t->rseq_len = current->rseq_len;
                t->rseq_sig = current->rseq_sig;
                t->rseq_event_mask = current->rseq_event_mask;
                return;
        }

        /* CLONE_VM: start unregistered. */
        t->rseq = NULL;
        t->rseq_len = 0;
        t->rseq_sig = 0;
        t->rseq_event_mask = 0;
}

/*
 * Drop any rseq registration of @t: the area pointer, its length, the
 * signature and the pending event mask are all reset to NULL/zero.
 */
static inline void rseq_execve(struct task_struct *t)
{
        t->rseq = NULL;
        t->rseq_len = 0;
        t->rseq_sig = 0;
        t->rseq_event_mask = 0;
}

#else

/*
 * CONFIG_RSEQ is not set: every rseq hook below is an empty inline function
 * with the same signature as its real counterpart, so callers need no
 * #ifdefs of their own.
 */
static inline void rseq_set_notify_resume(struct task_struct *t)
{
}
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
                                             struct pt_regs *regs)
{
}
static inline void rseq_signal_deliver(struct ksignal *ksig,
                                       struct pt_regs *regs)
{
}
static inline void rseq_preempt(struct task_struct *t)
{
}
static inline void rseq_migrate(struct task_struct *t)
{
}
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
}
static inline void rseq_execve(struct task_struct *t)
{
}

#endif

/*
 * rseq_syscall() only has a real, out-of-line implementation when
 * CONFIG_DEBUG_RSEQ is set; otherwise it is an empty inline function.
 */
#ifdef CONFIG_DEBUG_RSEQ

/* Defined outside this header. */
void rseq_syscall(struct pt_regs *regs);

#else

/* Debug checking disabled: nothing to do. */
static inline void rseq_syscall(struct pt_regs *regs)
{
}

#endif

#endif /* _LINUX_RSEQ_H */
















    8 





    8 
    8 












    1 


    1 


    1 


   11 


   11 


   11 











   12 


   11 


   11 


   11 


   12 


   12 


    3 

    3 



    3 


    1 

    1 



    1 





















































































































































    7 













    8 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
// SPDX-License-Identifier: GPL-2.0

#include "blk-rq-qos.h"

/*
 * Atomically bump 'v' by one as long as its current value is still below
 * 'below'. Returns true once the increment has been committed, false as soon
 * as the observed value has reached 'below' (i.e. 'v' + 1 would exceed it).
 */
static bool atomic_inc_below(atomic_t *v, unsigned int below)
{
        unsigned int seen = atomic_read(v);

        /*
         * A failed cmpxchg refreshes 'seen' with the value currently stored
         * in 'v', so the bound is re-checked before every retry.
         */
        while (seen < below) {
                if (atomic_try_cmpxchg(v, &seen, seen + 1))
                        return true;
        }

        return false;
}

/*
 * Try to account one more in-flight request on @rq_wait.
 *
 * Increments rq_wait->inflight only while it is below @limit; returns true if
 * the counter was incremented and false if the limit was already reached
 * (see atomic_inc_below()).
 */
bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
{
        return atomic_inc_below(&rq_wait->inflight, limit);
}

void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio)
{
        do {
                if (rqos->ops->cleanup)
                        rqos->ops->cleanup(rqos, bio);
                rqos = rqos->next;
        } while (rqos);
}

void __rq_qos_done(struct rq_qos *rqos, struct request *rq)
{
        do {
                if (rqos->ops->done)
                        rqos->ops->done(rqos, rq);
                rqos = rqos->next;
        } while (rqos);
}

void __rq_qos_issue(struct rq_qos *rqos, struct request *rq)
{
        do {
                if (rqos->ops->issue)
                        rqos->ops->issue(rqos, rq);
                rqos = rqos->next;
        } while (rqos);
}

void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq)
{
        do {
                if (rqos->ops->requeue)
                        rqos->ops->requeue(rqos, rq);
                rqos = rqos->next;
        } while (rqos);
}

void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio)
{
        do {
                if (rqos->ops->throttle)
                        rqos->ops->throttle(rqos, bio);
                rqos = rqos->next;
        } while (rqos);
}

void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
        do {
                if (rqos->ops->track)
                        rqos->ops->track(rqos, rq, bio);
                rqos = rqos->next;
        } while (rqos);
}

void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
        do {
                if (rqos->ops->merge)
                        rqos->ops->merge(rqos, rq, bio);
                rqos = rqos->next;
        } while (rqos);
}

void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
{
        do {
                if (rqos->ops->done_bio)
                        rqos->ops->done_bio(rqos, bio);
                rqos = rqos->next;
        } while (rqos);
}

void __rq_qos_queue_depth_changed(struct rq_qos *rqos)
{
        do {
                if (rqos->ops->queue_depth_changed)
                        rqos->ops->queue_depth_changed(rqos);
                rqos = rqos->next;
        } while (rqos);
}

/*
 * Recompute rqd->max_depth from queue_depth, default_depth and scale_step.
 *
 * Returns true when the depth cannot be raised any further by scaling up,
 * false otherwise.
 */
bool rq_depth_calc_max_depth(struct rq_depth *rqd)
{
        unsigned int base, limit;

        /*
         * QD=1 devices are special. They should always have one request
         * ready when another completes, so the depth is forced to 2 unless
         * we are scaling down, in which case it stays at 1. The device
         * itself cannot have more than one request in flight anyway.
         */
        if (rqd->queue_depth == 1) {
                if (rqd->scale_step > 0) {
                        rqd->max_depth = 1;
                        return false;
                }
                rqd->max_depth = 2;
                return true;
        }

        /*
         * scale_step == 0 is the default state. A positive step (latency
         * spikes were seen) shrinks the allowed depth; a negative step
         * (write-only workload) temporarily allows a larger depth.
         */
        base = min_t(unsigned int, rqd->default_depth, rqd->queue_depth);

        if (rqd->scale_step > 0) {
                /* Halve (depth - 1) once per step; shift count capped at 31. */
                rqd->max_depth = 1 + ((base - 1) >> min(31, rqd->scale_step));
                return false;
        }

        if (rqd->scale_step == 0) {
                rqd->max_depth = base;
                return false;
        }

        /* Scaling up: double (depth - 1) per step, capped at 3/4 queue depth. */
        limit = 3 * rqd->queue_depth / 4;
        base = 1 + ((base - 1) << -rqd->scale_step);
        if (base > limit) {
                rqd->max_depth = limit;
                return true;
        }

        rqd->max_depth = base;
        return false;
}

/*
 * Take one scaling step towards a larger depth.
 *
 * Returns true if a step was taken, false if the previous round already
 * hit the maximum (rqd->scaled_max) and nothing was changed.
 */
bool rq_depth_scale_up(struct rq_depth *rqd)
{
        if (!rqd->scaled_max) {
                rqd->scale_step--;

                /* Remember whether this step reached the ceiling. */
                rqd->scaled_max = rq_depth_calc_max_depth(rqd);
                return true;
        }

        return false;
}

/*
 * Take one scaling step towards a smaller depth. With 'hard_throttle' set
 * (a latency violation happened) any scale-up credit is dropped at once.
 *
 * Returns true if the depth was rescaled, false if max_depth is already 1
 * and no further scaling down is possible.
 */
bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
{
        /*
         * Refusing to go below a depth of 1 also keeps ->scale_step from
         * running away when the device simply cannot keep up.
         */
        if (rqd->max_depth == 1)
                return false;

        if (hard_throttle && rqd->scale_step < 0) {
                /* Jump straight back to the default state. */
                rqd->scale_step = 0;
        } else {
                rqd->scale_step++;
        }

        rqd->scaled_max = false;
        rq_depth_calc_max_depth(rqd);

        return true;
}

/*
 * Per-waiter state for rq_qos_wait(). One instance is declared as a local
 * variable in rq_qos_wait() and is reached from rq_qos_wake_function()
 * through the embedded wait queue entry.
 */
struct rq_qos_wait_data {
        /* Entry queued on rqw->wait; its ->func is rq_qos_wake_function(). */
        struct wait_queue_entry wq;
        /* The sleeping task (set to 'current'); woken once a token is taken. */
        struct task_struct *task;
        /* The rq_wait being throttled on; passed to ->cb. */
        struct rq_wait *rqw;
        /* Tries to acquire an inflight slot; a true return means success. */
        acquire_inflight_cb_t *cb;
        /* Opaque caller data handed to ->cb unchanged. */
        void *private_data;
        /* Set by the wake function after ->cb succeeded on our behalf. */
        bool got_token;
};

/*
 * Wake callback for waiters queued by rq_qos_wait().
 *
 * Tries to acquire an inflight token on behalf of the waiter that owns
 * @curr. On success the token is handed over (got_token is set), the waiter
 * is woken and its entry is removed from the wait queue.
 *
 * Returns 1 when a waiter was woken, or -1 when no token could be acquired.
 */
static int rq_qos_wake_function(struct wait_queue_entry *curr,
                                unsigned int mode, int wake_flags, void *key)
{
        struct rq_qos_wait_data *data = container_of(curr,
                                                     struct rq_qos_wait_data,
                                                     wq);

        /*
         * If we fail to get a budget, return -1 to interrupt the wake up loop
         * in __wake_up_common.
         */
        if (!data->cb(data->rqw, data->private_data))
                return -1;

        data->got_token = true;
        /* Make got_token visible before the waiter can be woken. */
        smp_wmb();
        wake_up_process(data->task);

        /*
         * The order of operations matters: @data lives on the stack of the
         * task sleeping in rq_qos_wait(), and finish_wait() tests whether
         * @curr has been unlinked without taking the wait queue lock. Once
         * the entry is unlinked the waiter may return and its stack frame
         * may be reused, so unlinking must be the last access to @data.
         * Doing it before wake_up_process() would let data->task be read
         * from a dead stack frame. The release semantics of
         * list_del_init_careful() also guarantee that a waiter which sees
         * the entry unlinked sees got_token as well.
         */
        list_del_init_careful(&curr->entry);
        return 1;
}

/**
 * rq_qos_wait - throttle on a rqw if we need to
 * @rqw: rqw to throttle on
 * @private_data: caller provided specific data
 * @acquire_inflight_cb: inc the rqw->inflight counter if we can
 * @cleanup_cb: the callback to cleanup in case we race with a waker
 *
 * This provides a uniform place for the rq_qos users to do their throttling.
 * Since you can end up with a lot of things sleeping at once, this manages the
 * waking up based on the resources available.  The acquire_inflight_cb should
 * increment rqw->inflight if it is able to do so, or return false if not, in
 * which case we sleep until room becomes available.
 *
 * cleanup_cb is called when we race with a waker and end up holding two
 * tokens, so that the extra inflight count can be given back.
 *
 * The function returns only once a token has been obtained, either directly
 * through acquire_inflight_cb or handed over by rq_qos_wake_function().
 */
void rq_qos_wait(struct rq_wait *rqw, void *private_data,
                 acquire_inflight_cb_t *acquire_inflight_cb,
                 cleanup_cb_t *cleanup_cb)
{
        /*
         * Waiter state lives on this stack frame; got_token is not named in
         * the initializer and therefore starts out false.
         */
        struct rq_qos_wait_data data = {
                .wq = {
                        .func        = rq_qos_wake_function,
                        .entry        = LIST_HEAD_INIT(data.wq.entry),
                },
                .task = current,
                .rqw = rqw,
                .cb = acquire_inflight_cb,
                .private_data = private_data,
        };
        bool has_sleeper;

        /* Fast path: nobody is queued ahead of us, so try to take a slot. */
        has_sleeper = wq_has_sleeper(&rqw->wait);
        if (!has_sleeper && acquire_inflight_cb(rqw, private_data))
                return;

        /*
         * Queue ourselves as an exclusive waiter. The return value is
         * inverted into has_sleeper; presumably it reports whether the queue
         * was empty before we were added - confirm against
         * prepare_to_wait_exclusive().
         */
        has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq,
                                                 TASK_UNINTERRUPTIBLE);
        do {
                /*
                 * The memory barrier in set_current_state() (at the bottom
                 * of this loop) saves us here.
                 */
                if (data.got_token)
                        break;
                /* Only try for a slot ourselves if no one was queued first. */
                if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
                        finish_wait(&rqw->wait, &data.wq);

                        /*
                         * We raced with rq_qos_wake_function() getting a token,
                         * which means we now have two. Put our local token
                         * and wake anyone else potentially waiting for one.
                         */
                        smp_rmb();
                        if (data.got_token)
                                cleanup_cb(rqw, private_data);
                        break;
                }
                io_schedule();
                /* After sleeping once, rely on the waker to hand us a token. */
                has_sleeper = true;
                set_current_state(TASK_UNINTERRUPTIBLE);
        } while (1);
        /*
         * NOTE(review): on the race path above finish_wait() has already been
         * called once; this second call looks redundant there - confirm.
         */
        finish_wait(&rqw->wait, &data.wq);
}

void rq_qos_exit(struct request_queue *q)
{
        mutex_lock(&q->rq_qos_mutex);
        while (q->rq_qos) {
                struct rq_qos *rqos = q->rq_qos;
                q->rq_qos = rqos->next;
                rqos->ops->exit(rqos);
        }
        mutex_unlock(&q->rq_qos_mutex);
}

int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
                const struct rq_qos_ops *ops)
{
        struct request_queue *q = disk->queue;

        lockdep_assert_held(&q->rq_qos_mutex);

        rqos->disk = disk;
        rqos->id = id;
        rqos->ops = ops;

        /*
         * No IO can be in-flight when adding rqos, so freeze queue, which
         * is fine since we only support rq_qos for blk-mq queue.
         */
        blk_mq_freeze_queue(q);

        if (rq_qos_id(q, rqos->id))
                goto ebusy;
        rqos->next = q->rq_qos;
        q->rq_qos = rqos;

        blk_mq_unfreeze_queue(q);

        if (rqos->ops->debugfs_attrs) {
                mutex_lock(&q->debugfs_mutex);
                blk_mq_debugfs_register_rqos(rqos);
                mutex_unlock(&q->debugfs_mutex);
        }

        return 0;
ebusy:
        blk_mq_unfreeze_queue(q);
        return -EBUSY;
}

/*
 * Detach 'rqos' from its queue's policy chain (if it is on it) and
 * unregister its debugfs entries. The caller must hold q->rq_qos_mutex.
 */
void rq_qos_del(struct rq_qos *rqos)
{
        struct request_queue *q = rqos->disk->queue;
        struct rq_qos **link = &q->rq_qos;

        lockdep_assert_held(&q->rq_qos_mutex);

        blk_mq_freeze_queue(q);
        /* 'link' always points at the pointer that leads to the current node. */
        while (*link) {
                if (*link == rqos) {
                        *link = rqos->next;
                        break;
                }
                link = &(*link)->next;
        }
        blk_mq_unfreeze_queue(q);

        mutex_lock(&q->debugfs_mutex);
        blk_mq_debugfs_unregister_rqos(rqos);
        mutex_unlock(&q->debugfs_mutex);
}

































    1 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Supervisor Mode Access Prevention support
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: H. Peter Anvin <hpa@linux.intel.com>
 */

#ifndef _ASM_X86_SMAP_H
#define _ASM_X86_SMAP_H

#include <asm/nops.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

/* "Raw" instruction opcodes */
#define __ASM_CLAC        ".byte 0x0f,0x01,0xca"
#define __ASM_STAC        ".byte 0x0f,0x01,0xcb"

#ifdef __ASSEMBLY__

#define ASM_CLAC \
        ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP

#define ASM_STAC \
        ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP

#else /* __ASSEMBLY__ */

/*
 * Emit the CLAC instruction (raw opcode bytes from __ASM_CLAC; per the
 * Intel SDM it clears EFLAGS.AC). It is patched in through alternative()
 * only when the CPU has X86_FEATURE_SMAP; otherwise nothing is emitted.
 */
static __always_inline void clac(void)
{
        /* Note: a barrier is implicit in alternative() */
        alternative("", __ASM_CLAC, X86_FEATURE_SMAP);
}

/*
 * Emit the STAC instruction (raw opcode bytes from __ASM_STAC; per the
 * Intel SDM it sets EFLAGS.AC). It is patched in through alternative()
 * only when the CPU has X86_FEATURE_SMAP; otherwise nothing is emitted.
 */
static __always_inline void stac(void)
{
        /* Note: a barrier is implicit in alternative() */
        alternative("", __ASM_STAC, X86_FEATURE_SMAP);
}

/*
 * Save the current flags register and then execute CLAC.
 *
 * With X86_FEATURE_SMAP the patched-in sequence is "pushf; pop <flags>;
 * CLAC", and the saved flags are returned for a later smap_restore().
 * Without the feature the alternative is empty, so nothing in the asm
 * writes 'flags'; the returned value is then only useful as an opaque
 * token for smap_restore(), which likewise emits nothing in that case.
 */
static __always_inline unsigned long smap_save(void)
{
        unsigned long flags;

        /* "memory" and "cc" clobbers: acts as a compiler barrier and changes flags. */
        asm volatile ("# smap_save\n\t"
                      ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t",
                                  X86_FEATURE_SMAP)
                      : "=rm" (flags) : : "memory", "cc");

        return flags;
}

/*
 * Restore the flags register from 'flags', a value previously returned by
 * smap_save(). With X86_FEATURE_SMAP the patched-in sequence is
 * "push <flags>; popf"; without the feature nothing is emitted and 'flags'
 * is ignored.
 */
static __always_inline void smap_restore(unsigned long flags)
{
        /* "memory" and "cc" clobbers: acts as a compiler barrier and changes flags. */
        asm volatile ("# smap_restore\n\t"
                      ALTERNATIVE("", "push %0; popf\n\t",
                                  X86_FEATURE_SMAP)
                      : : "g" (flags) : "memory", "cc");
}

/* These macros can be used in asm() statements */
#define ASM_CLAC \
        ALTERNATIVE("", __ASM_CLAC, X86_FEATURE_SMAP)
#define ASM_STAC \
        ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP)

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_SMAP_H */


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 





































































































































































































































































































































   11 


































































    4 










































































































































































   19 





































    1 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   16 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 












































    3 





















    3 






















































































































































































































































    2 















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-instrumented.sh
// DO NOT MODIFY THIS FILE DIRECTLY

/*
 * This file provides atomic operations with explicit instrumentation (e.g.
 * KASAN, KCSAN), which should be used unless it is necessary to avoid
 * instrumentation. Where it is necessary to avoid instrumentation, the
 * raw_atomic*() operations should be used.
 */
#ifndef _LINUX_ATOMIC_INSTRUMENTED_H
#define _LINUX_ATOMIC_INSTRUMENTED_H

#include <linux/build_bug.h>
#include <linux/compiler.h>
#include <linux/instrumented.h>

/**
 * atomic_read() - atomic load with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_read() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline int atomic_read(const atomic_t *v)
{
        int val;

        /* Instrument the load first, then perform the uninstrumented read. */
        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic_read(v);

        return val;
}

/**
 * atomic_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_read_acquire() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline int atomic_read_acquire(const atomic_t *v)
{
        int val;

        /* Instrument the load first, then perform the raw acquire read. */
        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic_read_acquire(v);

        return val;
}

/**
 * atomic_set() - atomic set with relaxed ordering
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_set() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_set(atomic_t *v, int i)
{
        /* Instrument the store first, then perform the uninstrumented set. */
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic_set(v, i);
}

/**
 * atomic_set_release() - atomic set with release ordering
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_set_release() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_set_release(atomic_t *v, int i)
{
        /* Release ordering: kcsan_release() comes before the access hook. */
        kcsan_release();
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic_set_release(v, i);
}

/**
 * atomic_add() - atomic add with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_add(int i, atomic_t *v)
{
        /* Instrument the read-modify-write, then do the raw add. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_add(i, v);
}

/**
 * atomic_add_return() - atomic add with full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_add_return(int i, atomic_t *v)
{
        int updated;

        /* Full ordering: kcsan_mb() comes before the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_add_return(i, v);

        return updated;
}

/**
 * atomic_add_return_acquire() - atomic add with acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_add_return_acquire(int i, atomic_t *v)
{
        int updated;

        /* Instrument the read-modify-write, then do the raw acquire add. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_add_return_acquire(i, v);

        return updated;
}

/**
 * atomic_add_return_release() - atomic add with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_add_return_release(int i, atomic_t *v)
{
        int updated;

        /* Release ordering: kcsan_release() comes before the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_add_return_release(i, v);

        return updated;
}

/**
 * atomic_add_return_relaxed() - atomic add with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_add_return_relaxed(int i, atomic_t *v)
{
        int updated;

        /* Instrument the read-modify-write, then do the raw relaxed add. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_add_return_relaxed(i, v);

        return updated;
}

/**
 * atomic_fetch_add() - atomic add with full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_add() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_add(int i, atomic_t *v)
{
        int old;

        /* Full ordering: kcsan_mb() comes before the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_add(i, v);

        return old;
}

/**
 * atomic_fetch_add_acquire() - atomic add with acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_add_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_add_acquire(int i, atomic_t *v)
{
        int old;

        /* Instrument the read-modify-write, then do the raw acquire add. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_add_acquire(i, v);

        return old;
}

/**
 * atomic_fetch_add_release() - atomic add with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_add_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_add_release(int i, atomic_t *v)
{
        int old;

        /* Release ordering: kcsan_release() comes before the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_add_release(i, v);

        return old;
}

/**
 * atomic_fetch_add_relaxed() - atomic add with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_add_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_add_relaxed(int i, atomic_t *v)
{
        int old;

        /* Instrument the read-modify-write, then do the raw relaxed add. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_add_relaxed(i, v);

        return old;
}

/**
 * atomic_sub() - atomic subtract with relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_sub() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_sub(int i, atomic_t *v)
{
        /* Instrument the read-modify-write, then do the raw subtract. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_sub(i, v);
}

/**
 * atomic_sub_return() - atomic subtract with full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_sub_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_sub_return(int i, atomic_t *v)
{
        int updated;

        /* Full ordering: kcsan_mb() comes before the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_sub_return(i, v);

        return updated;
}

/**
 * atomic_sub_return_acquire() - atomic subtract with acquire ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_sub_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_sub_return_acquire(int i, atomic_t *v)
{
        int updated;

        /* Instrument the read-modify-write, then do the raw acquire subtract. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_sub_return_acquire(i, v);

        return updated;
}

/**
 * atomic_sub_return_release() - atomic subtract with release ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_sub_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_sub_return_release(int i, atomic_t *v)
{
        int updated;

        /* Release ordering: kcsan_release() comes before the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_sub_return_release(i, v);

        return updated;
}

/**
 * atomic_sub_return_relaxed() - atomic subtract with relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_sub_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_sub_return_relaxed(int i, atomic_t *v)
{
        int updated;

        /* Instrument the read-modify-write, then do the raw relaxed subtract. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_sub_return_relaxed(i, v);

        return updated;
}

/**
 * atomic_fetch_sub() - atomic subtract with full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_sub() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_sub(int i, atomic_t *v)
{
        int old;

        /* Full ordering: kcsan_mb() comes before the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_sub(i, v);

        return old;
}

/**
 * atomic_fetch_sub_acquire() - atomic subtract with acquire ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_sub_acquire(int i, atomic_t *v)
{
        int old;

        /* Instrument the read-modify-write, then do the raw acquire subtract. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_sub_acquire(i, v);

        return old;
}

/**
 * atomic_fetch_sub_release() - atomic subtract with release ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_sub_release(int i, atomic_t *v)
{
        int old;

        /* Release ordering: kcsan_release() comes before the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_sub_release(i, v);

        return old;
}

/**
 * atomic_fetch_sub_relaxed() - atomic subtract with relaxed ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_sub_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_sub_relaxed(int i, atomic_t *v)
{
        int old;

        /* Instrument the read-modify-write, then do the raw relaxed subtract. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_sub_relaxed(i, v);

        return old;
}

/**
 * atomic_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_inc(atomic_t *v)
{
        /* Instrument the read-modify-write, then do the raw increment. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_inc(v);
}

/**
 * atomic_inc_return() - atomic increment with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_inc_return(atomic_t *v)
{
        int updated;

        /* Full ordering: kcsan_mb() comes before the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_return(v);

        return updated;
}

/**
 * atomic_inc_return_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_inc_return_acquire(atomic_t *v)
{
        int updated;

        /* Instrument the read-modify-write, then do the raw acquire increment. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_return_acquire(v);

        return updated;
}

/**
 * atomic_inc_return_release() - atomic increment with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_inc_return_release(atomic_t *v)
{
        int updated;

        /* Release ordering: kcsan_release() comes before the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_return_release(v);

        return updated;
}

/**
 * atomic_inc_return_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_inc_return_relaxed(atomic_t *v)
{
        int updated;

        /* Instrument the read-modify-write, then do the raw relaxed increment. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_return_relaxed(v);

        return updated;
}

/**
 * atomic_fetch_inc() - atomic increment with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_inc() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_inc(atomic_t *v)
{
        int old;

        /* Full ordering: kcsan_mb() comes before the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_inc(v);

        return old;
}

/**
 * atomic_fetch_inc_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_inc_acquire(atomic_t *v)
{
        int old;

        /* Instrument the read-modify-write, then do the raw acquire increment. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_inc_acquire(v);

        return old;
}

/**
 * atomic_fetch_inc_release() - atomic increment with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_inc_release(atomic_t *v)
{
        int old;

        /* Release ordering: kcsan_release() comes before the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_inc_release(v);

        return old;
}

/**
 * atomic_fetch_inc_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_inc_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_inc_relaxed(atomic_t *v)
{
        int old;

        /* Instrument the read-modify-write, then do the raw relaxed increment. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_inc_relaxed(v);

        return old;
}

/**
 * atomic_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_dec(atomic_t *v)
{
        /* Instrument the read-modify-write, then do the raw decrement. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_dec(v);
}

/**
 * atomic_dec_return() - atomic decrement with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_dec_return(atomic_t *v)
{
        int updated;

        /* Full ordering: kcsan_mb() comes before the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_dec_return(v);

        return updated;
}

/**
 * atomic_dec_return_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_dec_return_acquire(atomic_t *v)
{
        int updated;

        /* Instrument the read-modify-write, then do the raw acquire decrement. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_dec_return_acquire(v);

        return updated;
}

/**
 * atomic_dec_return_release() - atomic decrement with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_dec_return_release(atomic_t *v)
{
        int updated;

        /* Release ordering: kcsan_release() comes before the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_dec_return_release(v);

        return updated;
}

/**
 * atomic_dec_return_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_dec_return_relaxed(atomic_t *v)
{
        int updated;

        /* Instrument the read-modify-write, then do the raw relaxed decrement. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_dec_return_relaxed(v);

        return updated;
}

/**
 * atomic_fetch_dec() - atomic decrement with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_dec() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_dec(atomic_t *v)
{
        int old;

        /* Full ordering: kcsan_mb() comes before the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_dec(v);

        return old;
}

/**
 * atomic_fetch_dec_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - 1) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_dec_acquire(atomic_t *v)
{
        int old;

        /* Instrument the read-modify-write, then do the raw acquire decrement. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_dec_acquire(v);

        return old;
}

/**
 * atomic_fetch_dec_release() - atomic decrement with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - 1) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_dec_release(atomic_t *v)
{
        int old;

        /* Release ordering: kcsan_release() comes before the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_dec_release(v);

        return old;
}

/**
 * atomic_fetch_dec_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_dec_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_dec_relaxed(atomic_t *v)
{
        int old;

        /* Instrument the read-modify-write, then do the raw relaxed decrement. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_dec_relaxed(v);

        return old;
}

/**
 * atomic_and() - atomic bitwise AND with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_and() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_and(int i, atomic_t *v)
{
        /* Instrument the read-modify-write, then do the raw AND. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_and(i, v);
}

/**
 * atomic_fetch_and() - atomic bitwise AND with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_and() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_and(int i, atomic_t *v)
{
        int old;

        /* Full ordering: kcsan_mb() comes before the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_and(i, v);

        return old;
}

/**
 * atomic_fetch_and_acquire() - atomic bitwise AND with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_and_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_and_acquire(int i, atomic_t *v)
{
        int old;

        /* Instrument the read-modify-write, then do the raw acquire AND. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_and_acquire(i, v);

        return old;
}

/**
 * atomic_fetch_and_release() - atomic bitwise AND with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_and_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_and_release(int i, atomic_t *v)
{
        int old;

        /* Release ordering: kcsan_release() comes before the access hook. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_and_release(i, v);

        return old;
}

/**
 * atomic_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_and_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_and_relaxed(int i, atomic_t *v)
{
        int old;

        /* Instrument the read-modify-write, then do the raw relaxed AND. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_and_relaxed(i, v);

        return old;
}

/**
 * atomic_andnot() - atomic bitwise AND NOT with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & ~@i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_andnot() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_andnot(int i, atomic_t *v)
{
        /* Instrument the read-modify-write, then do the raw AND NOT. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_andnot(i, v);
}

/**
 * atomic_fetch_andnot() - atomic bitwise AND NOT with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & ~@i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_andnot(int i, atomic_t *v)
{
        int old;

        /* Full ordering: kcsan_mb() comes before the access hook. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_andnot(i, v);

        return old;
}

/**
 * atomic_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & ~@i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v)
{
        int orig;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_fetch_andnot_acquire(i, v);

        return orig;
}

/**
 * atomic_fetch_andnot_release() - atomic bitwise AND NOT with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & ~@i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v)
{
        int orig;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_fetch_andnot_release(i, v);

        return orig;
}

/**
 * atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & ~@i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_andnot_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_andnot_relaxed(int i, atomic_t *v)
{
        int orig;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_fetch_andnot_relaxed(i, v);

        return orig;
}

/**
 * atomic_or() - atomic bitwise OR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_or() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_or(int i, atomic_t *v)
{
        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_or(i, v);
}

/**
 * atomic_fetch_or() - atomic bitwise OR with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v | @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_or() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_or(int i, atomic_t *v)
{
        int orig;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_fetch_or(i, v);

        return orig;
}

/**
 * atomic_fetch_or_acquire() - atomic bitwise OR with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v | @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_or_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_or_acquire(int i, atomic_t *v)
{
        int orig;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_fetch_or_acquire(i, v);

        return orig;
}

/**
 * atomic_fetch_or_release() - atomic bitwise OR with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v | @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_or_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_or_release(int i, atomic_t *v)
{
        int orig;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_fetch_or_release(i, v);

        return orig;
}

/**
 * atomic_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_or_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_or_relaxed(int i, atomic_t *v)
{
        int orig;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_fetch_or_relaxed(i, v);

        return orig;
}

/**
 * atomic_xor() - atomic bitwise XOR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_xor() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_xor(int i, atomic_t *v)
{
        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_xor(i, v);
}

/**
 * atomic_fetch_xor() - atomic bitwise XOR with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_xor() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_xor(int i, atomic_t *v)
{
        int orig;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_fetch_xor(i, v);

        return orig;
}

/**
 * atomic_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_xor_acquire(int i, atomic_t *v)
{
        int orig;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_fetch_xor_acquire(i, v);

        return orig;
}

/**
 * atomic_fetch_xor_release() - atomic bitwise XOR with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_xor_release(int i, atomic_t *v)
{
        int orig;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_fetch_xor_release(i, v);

        return orig;
}

/**
 * atomic_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_xor_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_xor_relaxed(int i, atomic_t *v)
{
        int orig;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_fetch_xor_relaxed(i, v);

        return orig;
}

/**
 * atomic_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically updates @v to @new with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_xchg() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_xchg(atomic_t *v, int new)
{
        int prev;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg(v, new);

        return prev;
}

/**
 * atomic_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically updates @v to @new with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_xchg_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_xchg_acquire(atomic_t *v, int new)
{
        int prev;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg_acquire(v, new);

        return prev;
}

/**
 * atomic_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically updates @v to @new with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_xchg_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_xchg_release(atomic_t *v, int new)
{
        int prev;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg_release(v, new);

        return prev;
}

/**
 * atomic_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically updates @v to @new with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_xchg_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_xchg_relaxed(atomic_t *v, int new)
{
        int prev;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg_relaxed(v, new);

        return prev;
}

/**
 * atomic_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_cmpxchg() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
{
        int prev;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_cmpxchg(v, old, new);

        return prev;
}

/**
 * atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
{
        int prev;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_cmpxchg_acquire(v, old, new);

        return prev;
}

/**
 * atomic_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_cmpxchg_release(atomic_t *v, int old, int new)
{
        int prev;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_cmpxchg_release(v, old, new);

        return prev;
}

/**
 * atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_cmpxchg_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
{
        int prev;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_cmpxchg_relaxed(v, old, new);

        return prev;
}

/**
 * atomic_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        bool swapped;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented accesses. */
        kcsan_mb();
        /* Both @v and the caller's @old are instrumented, in that order. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg(v, old, new);

        return swapped;
}

/**
 * atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_acquire() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
        bool swapped;

        /* Both @v and the caller's @old are instrumented, in that order. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg_acquire(v, old, new);

        return swapped;
}

/**
 * atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_release() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
        bool swapped;

        /* Release variant: kcsan_release() precedes the instrumented accesses. */
        kcsan_release();
        /* Both @v and the caller's @old are instrumented, in that order. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg_release(v, old, new);

        return swapped;
}

/**
 * atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_relaxed() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
        bool swapped;

        /* Both @v and the caller's @old are instrumented, in that order. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}

/**
 * atomic_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_sub_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool atomic_sub_and_test(int i, atomic_t *v)
{
        bool is_zero;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_sub_and_test(i, v);

        return is_zero;
}

/**
 * atomic_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool atomic_dec_and_test(atomic_t *v)
{
        bool is_zero;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_dec_and_test(v);

        return is_zero;
}

/**
 * atomic_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool atomic_inc_and_test(atomic_t *v)
{
        bool is_zero;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_inc_and_test(v);

        return is_zero;
}

/**
 * atomic_add_negative() - atomic add and test if negative with full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_negative() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool atomic_add_negative(int i, atomic_t *v)
{
        bool is_negative;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_add_negative(i, v);

        return is_negative;
}

/**
 * atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_negative_acquire() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool atomic_add_negative_acquire(int i, atomic_t *v)
{
        bool is_negative;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_add_negative_acquire(i, v);

        return is_negative;
}

/**
 * atomic_add_negative_release() - atomic add and test if negative with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_negative_release() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool atomic_add_negative_release(int i, atomic_t *v)
{
        bool is_negative;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_add_negative_release(i, v);

        return is_negative;
}

/**
 * atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_negative_relaxed() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool atomic_add_negative_relaxed(int i, atomic_t *v)
{
        bool is_negative;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_add_negative_relaxed(i, v);

        return is_negative;
}

/**
 * atomic_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_fetch_add_unless() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
        int orig;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic_fetch_add_unless(v, a, u);

        return orig;
}

/**
 * atomic_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_add_unless() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool atomic_add_unless(atomic_t *v, int a, int u)
{
        bool updated;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_add_unless(v, a, u);

        return updated;
}

/**
 * atomic_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_not_zero() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool atomic_inc_not_zero(atomic_t *v)
{
        bool updated;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_not_zero(v);

        return updated;
}

/**
 * atomic_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_inc_unless_negative() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool atomic_inc_unless_negative(atomic_t *v)
{
        bool updated;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_inc_unless_negative(v);

        return updated;
}

/**
 * atomic_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_unless_positive() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool atomic_dec_unless_positive(atomic_t *v)
{
        bool updated;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_dec_unless_positive(v);

        return updated;
}

/**
 * atomic_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic_t
 *
 * If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_dec_if_positive() there.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline int atomic_dec_if_positive(atomic_t *v)
{
        int ret;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_dec_if_positive(v);

        return ret;
}

/**
 * atomic64_read() - atomic load with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_read() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64 atomic64_read(const atomic64_t *v)
{
        s64 val;

        /* Instrument the read of @v before running the raw load. */
        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic64_read(v);

        return val;
}

/**
 * atomic64_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_read_acquire() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64 atomic64_read_acquire(const atomic64_t *v)
{
        s64 val;

        /* Instrument the read of @v before running the raw load. */
        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic64_read_acquire(v);

        return val;
}

/**
 * atomic64_set() - atomic set with relaxed ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_set() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic64_set(atomic64_t *v, s64 i)
{
        /* Instrument the write to @v before running the raw store. */
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic64_set(v, i);
}

/**
 * atomic64_set_release() - atomic set with release ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_set_release() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic64_set_release(atomic64_t *v, s64 i)
{
        /* Release variant: kcsan_release() precedes the instrumented write. */
        kcsan_release();
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic64_set_release(v, i);
}

/**
 * atomic64_add() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic64_add(s64 i, atomic64_t *v)
{
        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_add(i, v);
}

/**
 * atomic64_add_return() - atomic add with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_add_return(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_return(i, v);

        return updated;
}

/**
 * atomic64_add_return_acquire() - atomic add with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_add_return_acquire(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_return_acquire(i, v);

        return updated;
}

/**
 * atomic64_add_return_release() - atomic add with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_add_return_release(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_return_release(i, v);

        return updated;
}

/**
 * atomic64_add_return_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_add_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_add_return_relaxed(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_return_relaxed(i, v);

        return updated;
}

/**
 * atomic64_fetch_add() - atomic add with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_add(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_add(i, v);

        return orig;
}

/**
 * atomic64_fetch_add_acquire() - atomic add with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_add_acquire(i, v);

        return orig;
}

/**
 * atomic64_fetch_add_release() - atomic add with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_add_release(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_add_release(i, v);

        return orig;
}

/**
 * atomic64_fetch_add_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_add_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_add_relaxed(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_add_relaxed(i, v);

        return orig;
}

/**
 * atomic64_sub() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic64_sub(s64 i, atomic64_t *v)
{
        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_sub(i, v);
}

/**
 * atomic64_sub_return() - atomic subtract with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_sub_return(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_sub_return(i, v);

        return updated;
}

/**
 * atomic64_sub_return_acquire() - atomic subtract with acquire ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_sub_return_acquire(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_sub_return_acquire(i, v);

        return updated;
}

/**
 * atomic64_sub_return_release() - atomic subtract with release ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_sub_return_release(i, v);

        return updated;
}

/**
 * atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_sub_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64 atomic64_sub_return_relaxed(s64 i, atomic64_t *v)
{
        s64 updated;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_sub_return_relaxed(i, v);

        return updated;
}

/**
 * atomic64_fetch_sub() - atomic subtract with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_sub(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_sub(i, v);

        return orig;
}

/**
 * atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_sub_acquire(i, v);

        return orig;
}

/**
 * atomic64_fetch_sub_release() - atomic subtract with release ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_sub_release(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_sub_release(i, v);

        return orig;
}

/**
 * atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic64_fetch_sub_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline s64 atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v)
{
        s64 orig;

        /* Instrument the access to @v before running the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        orig = raw_atomic64_fetch_sub_relaxed(i, v);

        return orig;
}

/**
 * atomic64_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * This is the instrumented wrapper: it passes @v to
 * instrument_atomic_read_write() and then performs the update through
 * raw_atomic64_inc().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_inc() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_inc(atomic64_t *v)
{
        /* Instrument the access to @v first, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_inc(v);
}

/**
 * atomic64_inc_return() - instrumented atomic increment, full ordering
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_inc_return() atomically set @v to (@v + 1) with full ordering.
 *
 * Not for noinstr code; call raw_atomic64_inc_return() there instead.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_inc_return(atomic64_t *v)
{
        s64 updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_return(v);

        return updated;
}

/**
 * atomic64_inc_return_acquire() - instrumented atomic increment, acquire ordering
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_inc_return_acquire() atomically set @v to (@v + 1) with
 * acquire ordering.
 *
 * Not for noinstr code; call raw_atomic64_inc_return_acquire() there instead.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_inc_return_acquire(atomic64_t *v)
{
        s64 updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_return_acquire(v);

        return updated;
}

/**
 * atomic64_inc_return_release() - instrumented atomic increment, release ordering
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_release() and instruments @v as a read-write access, then lets
 * raw_atomic64_inc_return_release() atomically set @v to (@v + 1) with
 * release ordering.
 *
 * Not for noinstr code; call raw_atomic64_inc_return_release() there instead.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_inc_return_release(atomic64_t *v)
{
        s64 updated;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_return_release(v);

        return updated;
}

/**
 * atomic64_inc_return_relaxed() - instrumented atomic increment, relaxed ordering
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_inc_return_relaxed() atomically set @v to (@v + 1) with
 * relaxed ordering.
 *
 * Not for noinstr code; call raw_atomic64_inc_return_relaxed() there instead.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_inc_return_relaxed(atomic64_t *v)
{
        s64 updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_return_relaxed(v);

        return updated;
}

/**
 * atomic64_fetch_inc() - instrumented atomic increment, full ordering
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_inc() atomically set @v to (@v + 1) with full ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_inc() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_inc(atomic64_t *v)
{
        s64 old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_inc(v);

        return old;
}

/**
 * atomic64_fetch_inc_acquire() - instrumented atomic increment, acquire ordering
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_inc_acquire() atomically set @v to (@v + 1) with
 * acquire ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_inc_acquire() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_inc_acquire(atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_inc_acquire(v);

        return old;
}

/**
 * atomic64_fetch_inc_release() - instrumented atomic increment, release ordering
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_release() and instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_inc_release() atomically set @v to (@v + 1) with
 * release ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_inc_release() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_inc_release(atomic64_t *v)
{
        s64 old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_inc_release(v);

        return old;
}

/**
 * atomic64_fetch_inc_relaxed() - instrumented atomic increment, relaxed ordering
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_inc_relaxed() atomically set @v to (@v + 1) with
 * relaxed ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_inc_relaxed() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_inc_relaxed(atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_inc_relaxed(v);

        return old;
}

/**
 * atomic64_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * This is the instrumented wrapper: it passes @v to
 * instrument_atomic_read_write() and then performs the update through
 * raw_atomic64_dec().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_dec() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_dec(atomic64_t *v)
{
        /* Instrument the access to @v first, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_dec(v);
}

/**
 * atomic64_dec_return() - instrumented atomic decrement, full ordering
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_dec_return() atomically set @v to (@v - 1) with full ordering.
 *
 * Not for noinstr code; call raw_atomic64_dec_return() there instead.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_dec_return(atomic64_t *v)
{
        s64 updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_dec_return(v);

        return updated;
}

/**
 * atomic64_dec_return_acquire() - instrumented atomic decrement, acquire ordering
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_dec_return_acquire() atomically set @v to (@v - 1) with
 * acquire ordering.
 *
 * Not for noinstr code; call raw_atomic64_dec_return_acquire() there instead.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_dec_return_acquire(atomic64_t *v)
{
        s64 updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_dec_return_acquire(v);

        return updated;
}

/**
 * atomic64_dec_return_release() - instrumented atomic decrement, release ordering
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_release() and instruments @v as a read-write access, then lets
 * raw_atomic64_dec_return_release() atomically set @v to (@v - 1) with
 * release ordering.
 *
 * Not for noinstr code; call raw_atomic64_dec_return_release() there instead.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_dec_return_release(atomic64_t *v)
{
        s64 updated;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_dec_return_release(v);

        return updated;
}

/**
 * atomic64_dec_return_relaxed() - instrumented atomic decrement, relaxed ordering
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_dec_return_relaxed() atomically set @v to (@v - 1) with
 * relaxed ordering.
 *
 * Not for noinstr code; call raw_atomic64_dec_return_relaxed() there instead.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_dec_return_relaxed(atomic64_t *v)
{
        s64 updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_dec_return_relaxed(v);

        return updated;
}

/**
 * atomic64_fetch_dec() - instrumented atomic decrement, full ordering
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_dec() atomically set @v to (@v - 1) with full ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_dec() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_dec(atomic64_t *v)
{
        s64 old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_dec(v);

        return old;
}

/**
 * atomic64_fetch_dec_acquire() - instrumented atomic decrement, acquire ordering
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_dec_acquire() atomically set @v to (@v - 1) with
 * acquire ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_dec_acquire() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_dec_acquire(atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_dec_acquire(v);

        return old;
}

/**
 * atomic64_fetch_dec_release() - instrumented atomic decrement, release ordering
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_release() and instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_dec_release() atomically set @v to (@v - 1) with
 * release ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_dec_release() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_dec_release(atomic64_t *v)
{
        s64 old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_dec_release(v);

        return old;
}

/**
 * atomic64_fetch_dec_relaxed() - instrumented atomic decrement, relaxed ordering
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_dec_relaxed() atomically set @v to (@v - 1) with
 * relaxed ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_dec_relaxed() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_dec_relaxed(atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_dec_relaxed(v);

        return old;
}

/**
 * atomic64_and() - atomic bitwise AND with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * This is the instrumented wrapper: it passes @v to
 * instrument_atomic_read_write() and then performs the update through
 * raw_atomic64_and().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_and() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_and(s64 i, atomic64_t *v)
{
        /* Instrument the access to @v first, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_and(i, v);
}

/**
 * atomic64_fetch_and() - instrumented atomic bitwise AND, full ordering
 * @i: s64 mask ANDed into @v
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_and() atomically set @v to (@v & @i) with full ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_and() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_and(s64 i, atomic64_t *v)
{
        s64 old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_and(i, v);

        return old;
}

/**
 * atomic64_fetch_and_acquire() - instrumented atomic bitwise AND, acquire ordering
 * @i: s64 mask ANDed into @v
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_and_acquire() atomically set @v to (@v & @i) with
 * acquire ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_and_acquire() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_and_acquire(i, v);

        return old;
}

/**
 * atomic64_fetch_and_release() - instrumented atomic bitwise AND, release ordering
 * @i: s64 mask ANDed into @v
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_release() and instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_and_release() atomically set @v to (@v & @i) with
 * release ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_and_release() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_and_release(s64 i, atomic64_t *v)
{
        s64 old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_and_release(i, v);

        return old;
}

/**
 * atomic64_fetch_and_relaxed() - instrumented atomic bitwise AND, relaxed ordering
 * @i: s64 mask ANDed into @v
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_and_relaxed() atomically set @v to (@v & @i) with
 * relaxed ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_and_relaxed() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_and_relaxed(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_and_relaxed(i, v);

        return old;
}

/**
 * atomic64_andnot() - atomic bitwise AND NOT with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & ~@i) with relaxed ordering.
 *
 * This is the instrumented wrapper: it passes @v to
 * instrument_atomic_read_write() and then performs the update through
 * raw_atomic64_andnot().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_andnot() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_andnot(s64 i, atomic64_t *v)
{
        /* Instrument the access to @v first, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_andnot(i, v);
}

/**
 * atomic64_fetch_andnot() - instrumented atomic bitwise AND NOT, full ordering
 * @i: s64 mask whose complement is ANDed into @v
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_andnot() atomically set @v to (@v & ~@i) with full
 * ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_andnot() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_andnot(s64 i, atomic64_t *v)
{
        s64 old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_andnot(i, v);

        return old;
}

/**
 * atomic64_fetch_andnot_acquire() - instrumented atomic bitwise AND NOT, acquire ordering
 * @i: s64 mask whose complement is ANDed into @v
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_andnot_acquire() atomically set @v to (@v & ~@i) with
 * acquire ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_andnot_acquire() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_andnot_acquire(i, v);

        return old;
}

/**
 * atomic64_fetch_andnot_release() - instrumented atomic bitwise AND NOT, release ordering
 * @i: s64 mask whose complement is ANDed into @v
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_release() and instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_andnot_release() atomically set @v to (@v & ~@i) with
 * release ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_andnot_release() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
{
        s64 old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_andnot_release(i, v);

        return old;
}

/**
 * atomic64_fetch_andnot_relaxed() - instrumented atomic bitwise AND NOT, relaxed ordering
 * @i: s64 mask whose complement is ANDed into @v
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_andnot_relaxed() atomically set @v to (@v & ~@i) with
 * relaxed ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_andnot_relaxed() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_andnot_relaxed(i, v);

        return old;
}

/**
 * atomic64_or() - atomic bitwise OR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * This is the instrumented wrapper: it passes @v to
 * instrument_atomic_read_write() and then performs the update through
 * raw_atomic64_or().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_or() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_or(s64 i, atomic64_t *v)
{
        /* Instrument the access to @v first, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_or(i, v);
}

/**
 * atomic64_fetch_or() - instrumented atomic bitwise OR, full ordering
 * @i: s64 bits ORed into @v
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_or() atomically set @v to (@v | @i) with full ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_or() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_or(s64 i, atomic64_t *v)
{
        s64 old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_or(i, v);

        return old;
}

/**
 * atomic64_fetch_or_acquire() - instrumented atomic bitwise OR, acquire ordering
 * @i: s64 bits ORed into @v
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_or_acquire() atomically set @v to (@v | @i) with
 * acquire ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_or_acquire() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_or_acquire(i, v);

        return old;
}

/**
 * atomic64_fetch_or_release() - instrumented atomic bitwise OR, release ordering
 * @i: s64 bits ORed into @v
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_release() and instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_or_release() atomically set @v to (@v | @i) with
 * release ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_or_release() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_or_release(s64 i, atomic64_t *v)
{
        s64 old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_or_release(i, v);

        return old;
}

/**
 * atomic64_fetch_or_relaxed() - instrumented atomic bitwise OR, relaxed ordering
 * @i: s64 bits ORed into @v
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_or_relaxed() atomically set @v to (@v | @i) with
 * relaxed ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_or_relaxed() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_or_relaxed(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_or_relaxed(i, v);

        return old;
}

/**
 * atomic64_xor() - atomic bitwise XOR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * This is the instrumented wrapper: it passes @v to
 * instrument_atomic_read_write() and then performs the update through
 * raw_atomic64_xor().
 *
 * Unsafe to use in noinstr code; use raw_atomic64_xor() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_xor(s64 i, atomic64_t *v)
{
        /* Instrument the access to @v first, then run the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_xor(i, v);
}

/**
 * atomic64_fetch_xor() - instrumented atomic bitwise XOR, full ordering
 * @i: s64 bits XORed into @v
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_xor() atomically set @v to (@v ^ @i) with full ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_xor() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_xor(s64 i, atomic64_t *v)
{
        s64 old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_xor(i, v);

        return old;
}

/**
 * atomic64_fetch_xor_acquire() - instrumented atomic bitwise XOR, acquire ordering
 * @i: s64 bits XORed into @v
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_xor_acquire() atomically set @v to (@v ^ @i) with
 * acquire ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_xor_acquire() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_xor_acquire(i, v);

        return old;
}

/**
 * atomic64_fetch_xor_release() - instrumented atomic bitwise XOR, release ordering
 * @i: s64 bits XORed into @v
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_release() and instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_xor_release() atomically set @v to (@v ^ @i) with
 * release ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_xor_release() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_xor_release(s64 i, atomic64_t *v)
{
        s64 old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_xor_release(i, v);

        return old;
}

/**
 * atomic64_fetch_xor_relaxed() - instrumented atomic bitwise XOR, relaxed ordering
 * @i: s64 bits XORed into @v
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_fetch_xor_relaxed() atomically set @v to (@v ^ @i) with
 * relaxed ordering.
 *
 * Not for noinstr code; call raw_atomic64_fetch_xor_relaxed() there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline s64
atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_xor_relaxed(i, v);

        return old;
}

/**
 * atomic64_xchg() - instrumented atomic exchange, full ordering
 * @v: the atomic64_t being updated
 * @new: s64 value stored into @v
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_xchg() atomically set @v to @new with full ordering.
 *
 * Not for noinstr code; call raw_atomic64_xchg() there instead.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
atomic64_xchg(atomic64_t *v, s64 new)
{
        s64 prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_xchg(v, new);

        return prev;
}

/**
 * atomic64_xchg_acquire() - instrumented atomic exchange, acquire ordering
 * @v: the atomic64_t being updated
 * @new: s64 value stored into @v
 *
 * Instruments @v as a read-write access, then lets raw_atomic64_xchg_acquire()
 * atomically set @v to @new with acquire ordering.
 *
 * Not for noinstr code; call raw_atomic64_xchg_acquire() there instead.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
atomic64_xchg_acquire(atomic64_t *v, s64 new)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_xchg_acquire(v, new);

        return prev;
}

/**
 * atomic64_xchg_release() - instrumented atomic exchange, release ordering
 * @v: the atomic64_t being updated
 * @new: s64 value stored into @v
 *
 * Issues kcsan_release() and instruments @v as a read-write access, then lets
 * raw_atomic64_xchg_release() atomically set @v to @new with release ordering.
 *
 * Not for noinstr code; call raw_atomic64_xchg_release() there instead.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
atomic64_xchg_release(atomic64_t *v, s64 new)
{
        s64 prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_xchg_release(v, new);

        return prev;
}

/**
 * atomic64_xchg_relaxed() - instrumented atomic exchange, relaxed ordering
 * @v: the atomic64_t being updated
 * @new: s64 value stored into @v
 *
 * Instruments @v as a read-write access, then lets raw_atomic64_xchg_relaxed()
 * atomically set @v to @new with relaxed ordering.
 *
 * Not for noinstr code; call raw_atomic64_xchg_relaxed() there instead.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
atomic64_xchg_relaxed(atomic64_t *v, s64 new)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_xchg_relaxed(v, new);

        return prev;
}

/**
 * atomic64_cmpxchg() - instrumented atomic compare and exchange, full ordering
 * @v: the atomic64_t being updated
 * @old: s64 value @v is expected to hold
 * @new: s64 value stored into @v on a match
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then delegates
 * to raw_atomic64_cmpxchg(). When (@v == @old), @v is atomically set to @new
 * with full ordering. Otherwise @v is left unmodified and only relaxed
 * ordering is provided.
 *
 * Not for noinstr code; call raw_atomic64_cmpxchg() there instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_cmpxchg(v, old, new);

        return prev;
}

/**
 * atomic64_cmpxchg_acquire() - instrumented atomic compare and exchange, acquire ordering
 * @v: the atomic64_t being updated
 * @old: s64 value @v is expected to hold
 * @new: s64 value stored into @v on a match
 *
 * Instruments @v as a read-write access, then delegates to
 * raw_atomic64_cmpxchg_acquire(). When (@v == @old), @v is atomically set to
 * @new with acquire ordering. Otherwise @v is left unmodified and only relaxed
 * ordering is provided.
 *
 * Not for noinstr code; call raw_atomic64_cmpxchg_acquire() there instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_cmpxchg_acquire(v, old, new);

        return prev;
}

/**
 * atomic64_cmpxchg_release() - instrumented atomic compare and exchange, release ordering
 * @v: the atomic64_t being updated
 * @old: s64 value @v is expected to hold
 * @new: s64 value stored into @v on a match
 *
 * Issues kcsan_release() and instruments @v as a read-write access, then
 * delegates to raw_atomic64_cmpxchg_release(). When (@v == @old), @v is
 * atomically set to @new with release ordering. Otherwise @v is left
 * unmodified and only relaxed ordering is provided.
 *
 * Not for noinstr code; call raw_atomic64_cmpxchg_release() there instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_cmpxchg_release(v, old, new);

        return prev;
}

/**
 * atomic64_cmpxchg_relaxed() - instrumented atomic compare and exchange, relaxed ordering
 * @v: the atomic64_t being updated
 * @old: s64 value @v is expected to hold
 * @new: s64 value stored into @v on a match
 *
 * Instruments @v as a read-write access, then delegates to
 * raw_atomic64_cmpxchg_relaxed(). When (@v == @old), @v is atomically set to
 * @new with relaxed ordering. Otherwise @v is left unmodified and relaxed
 * ordering is provided.
 *
 * Not for noinstr code; call raw_atomic64_cmpxchg_relaxed() there instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_cmpxchg_relaxed(v, old, new);

        return prev;
}

/**
 * atomic64_try_cmpxchg() - instrumented atomic compare and exchange, full ordering
 * @v: the atomic64_t being updated
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value stored into @v on a match
 *
 * Issues kcsan_mb() and instruments both @v and @old as read-write accesses,
 * then delegates to raw_atomic64_try_cmpxchg(). When (@v == @old), @v is
 * atomically set to @new with full ordering. Otherwise @v is left unmodified,
 * @old is overwritten with the current value of @v, and only relaxed ordering
 * is provided.
 *
 * Not for noinstr code; call raw_atomic64_try_cmpxchg() there instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg(v, old, new);

        return swapped;
}

/**
 * atomic64_try_cmpxchg_acquire() - instrumented atomic compare and exchange, acquire ordering
 * @v: the atomic64_t being updated
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value stored into @v on a match
 *
 * Instruments both @v and @old as read-write accesses, then delegates to
 * raw_atomic64_try_cmpxchg_acquire(). When (@v == @old), @v is atomically set
 * to @new with acquire ordering. Otherwise @v is left unmodified, @old is
 * overwritten with the current value of @v, and only relaxed ordering is
 * provided.
 *
 * Not for noinstr code; call raw_atomic64_try_cmpxchg_acquire() there instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg_acquire(v, old, new);

        return swapped;
}

/**
 * atomic64_try_cmpxchg_release() - instrumented atomic compare and exchange, release ordering
 * @v: the atomic64_t being updated
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value stored into @v on a match
 *
 * Issues kcsan_release() and instruments both @v and @old as read-write
 * accesses, then delegates to raw_atomic64_try_cmpxchg_release(). When
 * (@v == @old), @v is atomically set to @new with release ordering. Otherwise
 * @v is left unmodified, @old is overwritten with the current value of @v, and
 * only relaxed ordering is provided.
 *
 * Not for noinstr code; call raw_atomic64_try_cmpxchg_release() there instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg_release(v, old, new);

        return swapped;
}

/**
 * atomic64_try_cmpxchg_relaxed() - instrumented atomic compare and exchange, relaxed ordering
 * @v: the atomic64_t being updated
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value stored into @v on a match
 *
 * Instruments both @v and @old as read-write accesses, then delegates to
 * raw_atomic64_try_cmpxchg_relaxed(). When (@v == @old), @v is atomically set
 * to @new with relaxed ordering. Otherwise @v is left unmodified, @old is
 * overwritten with the current value of @v, and relaxed ordering is provided.
 *
 * Not for noinstr code; call raw_atomic64_try_cmpxchg_relaxed() there instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}

/**
 * atomic64_sub_and_test() - instrumented atomic subtract and zero test, full ordering
 * @i: s64 amount to subtract from @v
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_sub_and_test() atomically set @v to (@v - @i) with full
 * ordering.
 *
 * Not for noinstr code; call raw_atomic64_sub_and_test() there instead.
 *
 * Return: @true if @v is zero after the update, @false otherwise.
 */
static __always_inline bool
atomic64_sub_and_test(s64 i, atomic64_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic64_sub_and_test(i, v);

        return is_zero;
}

/**
 * atomic64_dec_and_test() - instrumented atomic decrement and zero test, full ordering
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_dec_and_test() atomically set @v to (@v - 1) with full
 * ordering.
 *
 * Not for noinstr code; call raw_atomic64_dec_and_test() there instead.
 *
 * Return: @true if @v is zero after the update, @false otherwise.
 */
static __always_inline bool
atomic64_dec_and_test(atomic64_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic64_dec_and_test(v);

        return is_zero;
}

/**
 * atomic64_inc_and_test() - instrumented atomic increment and zero test, full ordering
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_inc_and_test() atomically set @v to (@v + 1) with full
 * ordering.
 *
 * Not for noinstr code; call raw_atomic64_inc_and_test() there instead.
 *
 * Return: @true if @v is zero after the update, @false otherwise.
 */
static __always_inline bool
atomic64_inc_and_test(atomic64_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic64_inc_and_test(v);

        return is_zero;
}

/**
 * atomic64_add_negative() - instrumented atomic add and sign test, full ordering
 * @i: s64 amount to add to @v
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then lets
 * raw_atomic64_add_negative() atomically set @v to (@v + @i) with full
 * ordering.
 *
 * Not for noinstr code; call raw_atomic64_add_negative() there instead.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative(s64 i, atomic64_t *v)
{
        bool negative;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic64_add_negative(i, v);

        return negative;
}

/**
 * atomic64_add_negative_acquire() - instrumented atomic add and sign test, acquire ordering
 * @i: s64 amount to add to @v
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_add_negative_acquire() atomically set @v to (@v + @i) with
 * acquire ordering.
 *
 * Not for noinstr code; call raw_atomic64_add_negative_acquire() there instead.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative_acquire(s64 i, atomic64_t *v)
{
        bool negative;

        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic64_add_negative_acquire(i, v);

        return negative;
}

/**
 * atomic64_add_negative_release() - instrumented atomic add and sign test, release ordering
 * @i: s64 amount to add to @v
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_release() and instruments @v as a read-write access, then lets
 * raw_atomic64_add_negative_release() atomically set @v to (@v + @i) with
 * release ordering.
 *
 * Not for noinstr code; call raw_atomic64_add_negative_release() there instead.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative_release(s64 i, atomic64_t *v)
{
        bool negative;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic64_add_negative_release(i, v);

        return negative;
}

/**
 * atomic64_add_negative_relaxed() - instrumented atomic add and sign test, relaxed ordering
 * @i: s64 amount to add to @v
 * @v: the atomic64_t being updated
 *
 * Instruments @v as a read-write access, then lets
 * raw_atomic64_add_negative_relaxed() atomically set @v to (@v + @i) with
 * relaxed ordering.
 *
 * Not for noinstr code; call raw_atomic64_add_negative_relaxed() there instead.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
{
        bool negative;

        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic64_add_negative_relaxed(i, v);

        return negative;
}

/**
 * atomic64_fetch_add_unless() - instrumented atomic conditional add, full ordering
 * @v: the atomic64_t being updated
 * @a: s64 amount to add to @v
 * @u: s64 value that blocks the add when @v equals it
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then delegates
 * to raw_atomic64_fetch_add_unless(). When (@v != @u), @v is atomically set to
 * (@v + @a) with full ordering. Otherwise @v is left unmodified and only
 * relaxed ordering is provided.
 *
 * Not for noinstr code; call raw_atomic64_fetch_add_unless() there instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
{
        s64 prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_add_unless(v, a, u);

        return prev;
}

/**
 * atomic64_add_unless() - instrumented atomic conditional add, full ordering
 * @v: the atomic64_t being updated
 * @a: s64 amount to add to @v
 * @u: s64 value that blocks the add when @v equals it
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then delegates
 * to raw_atomic64_add_unless(). When (@v != @u), @v is atomically set to
 * (@v + @a) with full ordering. Otherwise @v is left unmodified and only
 * relaxed ordering is provided.
 *
 * Not for noinstr code; call raw_atomic64_add_unless() there instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_unless(v, a, u);

        return updated;
}

/**
 * atomic64_inc_not_zero() - instrumented atomic increment unless zero, full ordering
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then delegates
 * to raw_atomic64_inc_not_zero(). When (@v != 0), @v is atomically set to
 * (@v + 1) with full ordering. Otherwise @v is left unmodified and only
 * relaxed ordering is provided.
 *
 * Not for noinstr code; call raw_atomic64_inc_not_zero() there instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_inc_not_zero(atomic64_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_not_zero(v);

        return updated;
}

/**
 * atomic64_inc_unless_negative() - instrumented atomic increment unless negative, full ordering
 * @v: the atomic64_t being updated
 *
 * Issues kcsan_mb() and instruments @v as a read-write access, then delegates
 * to raw_atomic64_inc_unless_negative(). When (@v >= 0), @v is atomically set
 * to (@v + 1) with full ordering. Otherwise @v is left unmodified and only
 * relaxed ordering is provided.
 *
 * Not for noinstr code; call raw_atomic64_inc_unless_negative() there instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_inc_unless_negative(atomic64_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_unless_negative(v);

        return updated;
}

/**
 * atomic64_dec_unless_positive() - fully ordered atomic decrement, skipped when @v is positive
 * @v: pointer to atomic64_t
 *
 * When (@v <= 0), @v is atomically replaced by (@v - 1) with full ordering.
 * When (@v > 0), @v is left untouched and only relaxed ordering is provided.
 *
 * Not for use in noinstr code; call raw_atomic64_dec_unless_positive() there
 * instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_dec_unless_positive(atomic64_t *v)
{
        bool updated;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_dec_unless_positive(v);

        return updated;
}

/**
 * atomic64_dec_if_positive() - fully ordered atomic decrement, performed only when @v is positive
 * @v: pointer to atomic64_t
 *
 * When (@v > 0), @v is atomically replaced by (@v - 1) with full ordering.
 * When (@v <= 0), @v is left untouched and only relaxed ordering is provided.
 *
 * Not for use in noinstr code; call raw_atomic64_dec_if_positive() there
 * instead.
 *
 * Return: The value of (@v - 1) computed from the original @v, whether or not
 * @v was actually updated.
 */
static __always_inline s64
atomic64_dec_if_positive(atomic64_t *v)
{
        s64 dec;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        dec = raw_atomic64_dec_if_positive(v);

        return dec;
}

/**
 * atomic_long_read() - instrumented atomic load (relaxed ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically reads the current value of @v; the load has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_read() there instead.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
atomic_long_read(const atomic_long_t *v)
{
        long val;

        /* Instrument the read of *v before delegating to the raw op. */
        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic_long_read(v);

        return val;
}

/**
 * atomic_long_read_acquire() - instrumented atomic load (acquire ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically reads the current value of @v; the load has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_read_acquire() there
 * instead.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
atomic_long_read_acquire(const atomic_long_t *v)
{
        long val;

        /* Instrument the read of *v before delegating to the raw op. */
        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic_long_read_acquire(v);

        return val;
}

/**
 * atomic_long_set() - instrumented atomic store (relaxed ordering)
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically stores @i into @v; the store has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_set() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_set(atomic_long_t *v, long i)
{
        /* Instrument the write to *v before delegating to the raw op. */
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic_long_set(v, i);
}

/**
 * atomic_long_set_release() - instrumented atomic store (release ordering)
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically stores @i into @v; the store has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_set_release() there
 * instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_set_release(atomic_long_t *v, long i)
{
        /* Release variant: kcsan_release() precedes the instrumented write. */
        kcsan_release();
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic_long_set_release(v, i);
}

/**
 * atomic_long_add() - instrumented atomic add (relaxed ordering)
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_add() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_add(long i, atomic_long_t *v)
{
        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_add(i, v);
}

/**
 * atomic_long_add_return() - instrumented atomic add returning the new value (full ordering)
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i); the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_add_return() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return(long i, atomic_long_t *v)
{
        long updated;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_add_return(i, v);

        return updated;
}

/**
 * atomic_long_add_return_acquire() - instrumented atomic add returning the new value (acquire ordering)
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i); the update has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_add_return_acquire() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return_acquire(long i, atomic_long_t *v)
{
        long updated;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_add_return_acquire(i, v);

        return updated;
}

/**
 * atomic_long_add_return_release() - instrumented atomic add returning the new value (release ordering)
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i); the update has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_add_return_release() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return_release(long i, atomic_long_t *v)
{
        long updated;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_add_return_release(i, v);

        return updated;
}

/**
 * atomic_long_add_return_relaxed() - instrumented atomic add returning the new value (relaxed ordering)
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_add_return_relaxed() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return_relaxed(long i, atomic_long_t *v)
{
        long updated;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_add_return_relaxed(i, v);

        return updated;
}

/**
 * atomic_long_fetch_add() - instrumented atomic add returning the old value (full ordering)
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i); the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_add() there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add(long i, atomic_long_t *v)
{
        long old;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_add(i, v);

        return old;
}

/**
 * atomic_long_fetch_add_acquire() - instrumented atomic add returning the old value (acquire ordering)
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i); the update has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_add_acquire() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_acquire(long i, atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_add_acquire(i, v);

        return old;
}

/**
 * atomic_long_fetch_add_release() - instrumented atomic add returning the old value (release ordering)
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i); the update has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_add_release() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_release(long i, atomic_long_t *v)
{
        long old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_add_release(i, v);

        return old;
}

/**
 * atomic_long_fetch_add_relaxed() - instrumented atomic add returning the old value (relaxed ordering)
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_add_relaxed() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_relaxed(long i, atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_add_relaxed(i, v);

        return old;
}

/**
 * atomic_long_sub() - instrumented atomic subtract (relaxed ordering)
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_sub() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_sub(long i, atomic_long_t *v)
{
        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_sub(i, v);
}

/**
 * atomic_long_sub_return() - instrumented atomic subtract returning the new value (full ordering)
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i); the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_sub_return() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return(long i, atomic_long_t *v)
{
        long updated;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_sub_return(i, v);

        return updated;
}

/**
 * atomic_long_sub_return_acquire() - instrumented atomic subtract returning the new value (acquire ordering)
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i); the update has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_sub_return_acquire() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return_acquire(long i, atomic_long_t *v)
{
        long updated;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_sub_return_acquire(i, v);

        return updated;
}

/**
 * atomic_long_sub_return_release() - instrumented atomic subtract returning the new value (release ordering)
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i); the update has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_sub_return_release() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return_release(long i, atomic_long_t *v)
{
        long updated;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_sub_return_release(i, v);

        return updated;
}

/**
 * atomic_long_sub_return_relaxed() - instrumented atomic subtract returning the new value (relaxed ordering)
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_sub_return_relaxed() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return_relaxed(long i, atomic_long_t *v)
{
        long updated;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_sub_return_relaxed(i, v);

        return updated;
}

/**
 * atomic_long_fetch_sub() - instrumented atomic subtract returning the old value (full ordering)
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i); the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_sub() there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub(long i, atomic_long_t *v)
{
        long old;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_sub(i, v);

        return old;
}

/**
 * atomic_long_fetch_sub_acquire() - instrumented atomic subtract returning the old value (acquire ordering)
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i); the update has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_sub_acquire() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub_acquire(long i, atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_sub_acquire(i, v);

        return old;
}

/**
 * atomic_long_fetch_sub_release() - instrumented atomic subtract returning the old value (release ordering)
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i); the update has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_sub_release() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub_release(long i, atomic_long_t *v)
{
        long old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_sub_release(i, v);

        return old;
}

/**
 * atomic_long_fetch_sub_relaxed() - instrumented atomic subtract returning the old value (relaxed ordering)
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_sub_relaxed() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_sub_relaxed(i, v);

        return old;
}

/**
 * atomic_long_inc() - instrumented atomic increment (relaxed ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_inc() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_inc(atomic_long_t *v)
{
        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_inc(v);
}

/**
 * atomic_long_inc_return() - instrumented atomic increment returning the new value (full ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1); the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_inc_return() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return(atomic_long_t *v)
{
        long updated;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_inc_return(v);

        return updated;
}

/**
 * atomic_long_inc_return_acquire() - instrumented atomic increment returning the new value (acquire ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1); the update has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_inc_return_acquire() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return_acquire(atomic_long_t *v)
{
        long updated;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_inc_return_acquire(v);

        return updated;
}

/**
 * atomic_long_inc_return_release() - instrumented atomic increment returning the new value (release ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1); the update has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_inc_return_release() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return_release(atomic_long_t *v)
{
        long updated;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_inc_return_release(v);

        return updated;
}

/**
 * atomic_long_inc_return_relaxed() - instrumented atomic increment returning the new value (relaxed ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_inc_return_relaxed() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return_relaxed(atomic_long_t *v)
{
        long updated;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_inc_return_relaxed(v);

        return updated;
}

/**
 * atomic_long_fetch_inc() - instrumented atomic increment returning the old value (full ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1); the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_inc() there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc(atomic_long_t *v)
{
        long old;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_inc(v);

        return old;
}

/**
 * atomic_long_fetch_inc_acquire() - instrumented atomic increment returning the old value (acquire ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1); the update has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_inc_acquire() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc_acquire(atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_inc_acquire(v);

        return old;
}

/**
 * atomic_long_fetch_inc_release() - instrumented atomic increment returning the old value (release ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1); the update has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_inc_release() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc_release(atomic_long_t *v)
{
        long old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_inc_release(v);

        return old;
}

/**
 * atomic_long_fetch_inc_relaxed() - instrumented atomic increment returning the old value (relaxed ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_inc_relaxed() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc_relaxed(atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_inc_relaxed(v);

        return old;
}

/**
 * atomic_long_dec() - instrumented atomic decrement (relaxed ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_dec() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_dec(atomic_long_t *v)
{
        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_dec(v);
}

/**
 * atomic_long_dec_return() - instrumented atomic decrement returning the new value (full ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1); the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_dec_return() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return(atomic_long_t *v)
{
        long updated;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_dec_return(v);

        return updated;
}

/**
 * atomic_long_dec_return_acquire() - instrumented atomic decrement returning the new value (acquire ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1); the update has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_dec_return_acquire() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return_acquire(atomic_long_t *v)
{
        long updated;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_dec_return_acquire(v);

        return updated;
}

/**
 * atomic_long_dec_return_release() - instrumented atomic decrement returning the new value (release ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1); the update has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_dec_return_release() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return_release(atomic_long_t *v)
{
        long updated;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_dec_return_release(v);

        return updated;
}

/**
 * atomic_long_dec_return_relaxed() - instrumented atomic decrement returning the new value (relaxed ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_dec_return_relaxed() there
 * instead.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return_relaxed(atomic_long_t *v)
{
        long updated;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_dec_return_relaxed(v);

        return updated;
}

/**
 * atomic_long_fetch_dec() - instrumented atomic decrement returning the old value (full ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1); the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_dec() there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec(atomic_long_t *v)
{
        long old;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_dec(v);

        return old;
}

/**
 * atomic_long_fetch_dec_acquire() - instrumented atomic decrement returning the old value (acquire ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1); the update has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_dec_acquire() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec_acquire(atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_dec_acquire(v);

        return old;
}

/**
 * atomic_long_fetch_dec_release() - instrumented atomic decrement returning the old value (release ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1); the update has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_dec_release() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec_release(atomic_long_t *v)
{
        long old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_dec_release(v);

        return old;
}

/**
 * atomic_long_fetch_dec_relaxed() - instrumented atomic decrement returning the old value (relaxed ordering)
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_dec_relaxed() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec_relaxed(atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_dec_relaxed(v);

        return old;
}

/**
 * atomic_long_and() - instrumented atomic bitwise AND (relaxed ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & @i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_and() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_and(long i, atomic_long_t *v)
{
        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_and(i, v);
}

/**
 * atomic_long_fetch_and() - instrumented atomic bitwise AND returning the old value (full ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & @i); the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_and() there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and(long i, atomic_long_t *v)
{
        long old;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_and(i, v);

        return old;
}

/**
 * atomic_long_fetch_and_acquire() - instrumented atomic bitwise AND returning the old value (acquire ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & @i); the update has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_and_acquire() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and_acquire(long i, atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_and_acquire(i, v);

        return old;
}

/**
 * atomic_long_fetch_and_release() - instrumented atomic bitwise AND returning the old value (release ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & @i); the update has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_and_release() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and_release(long i, atomic_long_t *v)
{
        long old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_and_release(i, v);

        return old;
}

/**
 * atomic_long_fetch_and_relaxed() - instrumented atomic bitwise AND returning the old value (relaxed ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & @i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_and_relaxed() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and_relaxed(long i, atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_and_relaxed(i, v);

        return old;
}

/**
 * atomic_long_andnot() - instrumented atomic bitwise AND NOT (relaxed ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_andnot() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_andnot(long i, atomic_long_t *v)
{
        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_andnot(i, v);
}

/**
 * atomic_long_fetch_andnot() - instrumented atomic bitwise AND NOT returning the old value (full ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i); the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_andnot() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_andnot(long i, atomic_long_t *v)
{
        long old;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_andnot(i, v);

        return old;
}

/**
 * atomic_long_fetch_andnot_acquire() - instrumented atomic bitwise AND NOT returning the old value (acquire ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i); the update has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_andnot_acquire()
 * there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_andnot_acquire(i, v);

        return old;
}

/**
 * atomic_long_fetch_andnot_release() - instrumented atomic bitwise AND NOT returning the old value (release ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i); the update has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_andnot_release()
 * there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_andnot_release(long i, atomic_long_t *v)
{
        long old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_andnot_release(i, v);

        return old;
}

/**
 * atomic_long_fetch_andnot_relaxed() - instrumented atomic bitwise AND NOT returning the old value (relaxed ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_andnot_relaxed()
 * there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_andnot_relaxed(i, v);

        return old;
}

/**
 * atomic_long_or() - instrumented atomic bitwise OR (relaxed ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_or() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_or(long i, atomic_long_t *v)
{
        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_or(i, v);
}

/**
 * atomic_long_fetch_or() - instrumented atomic bitwise OR returning the old value (full ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i); the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_or() there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_or(long i, atomic_long_t *v)
{
        long old;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_or(i, v);

        return old;
}

/**
 * atomic_long_fetch_or_acquire() - instrumented atomic bitwise OR returning the old value (acquire ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i); the update has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_or_acquire() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_or_acquire(long i, atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_or_acquire(i, v);

        return old;
}

/**
 * atomic_long_fetch_or_release() - instrumented atomic bitwise OR returning the old value (release ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i); the update has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_or_release() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_or_release(long i, atomic_long_t *v)
{
        long old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_or_release(i, v);

        return old;
}

/**
 * atomic_long_fetch_or_relaxed() - instrumented atomic bitwise OR returning the old value (relaxed ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_or_relaxed() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_or_relaxed(long i, atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_or_relaxed(i, v);

        return old;
}

/**
 * atomic_long_xor() - instrumented atomic bitwise XOR (relaxed ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_xor() there instead.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_xor(long i, atomic_long_t *v)
{
        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_xor(i, v);
}

/**
 * atomic_long_fetch_xor() - instrumented atomic bitwise XOR returning the old value (full ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i); the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_xor() there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_xor(long i, atomic_long_t *v)
{
        long old;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_xor(i, v);

        return old;
}

/**
 * atomic_long_fetch_xor_acquire() - instrumented atomic bitwise XOR returning the old value (acquire ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i); the update has acquire ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_xor_acquire() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_xor_acquire(long i, atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_xor_acquire(i, v);

        return old;
}

/**
 * atomic_long_fetch_xor_release() - instrumented atomic bitwise XOR returning the old value (release ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i); the update has release ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_xor_release() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_xor_release(long i, atomic_long_t *v)
{
        long old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_xor_release(i, v);

        return old;
}

/**
 * atomic_long_fetch_xor_relaxed() - instrumented atomic bitwise XOR returning the old value (relaxed ordering)
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i); the update has relaxed ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_fetch_xor_relaxed() there
 * instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v)
{
        long old;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_fetch_xor_relaxed(i, v);

        return old;
}

/**
 * atomic_long_xchg() - instrumented atomic exchange (full ordering)
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically replaces @v with @new; the update has full ordering.
 *
 * Not for use in noinstr code; call raw_atomic_long_xchg() there instead.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_xchg(atomic_long_t *v, long new)
{
        long old;

        /* kcsan_mb() and the instrumentation hook both precede the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_long_xchg(v, new);

        return old;
}

/**
 * atomic_long_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Stores @new into @v in one atomic step and provides acquire ordering.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_xchg_acquire() from there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
atomic_long_xchg_acquire(atomic_long_t *v, long new)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg_acquire(v, new);

        return prev;
}

/**
 * atomic_long_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Stores @new into @v in one atomic step and provides release ordering.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_xchg_release() from there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
atomic_long_xchg_release(atomic_long_t *v, long new)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg_release(v, new);

        return prev;
}

/**
 * atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Stores @new into @v in one atomic step and provides relaxed ordering.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_xchg_relaxed() from there instead.
 *
 * Return: The value @v held before the update.
 */
static __always_inline long
atomic_long_xchg_relaxed(atomic_long_t *v, long new)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg_relaxed(v, new);

        return prev;
}

/**
 * atomic_long_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with full ordering.
 * When the comparison fails, @v is left alone and only relaxed ordering is
 * provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_cmpxchg() from there instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
atomic_long_cmpxchg(atomic_long_t *v, long old, long new)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_cmpxchg(v, old, new);

        return prev;
}

/**
 * atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with acquire ordering.
 * When the comparison fails, @v is left alone and only relaxed ordering is
 * provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_cmpxchg_acquire() from there instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_cmpxchg_acquire(v, old, new);

        return prev;
}

/**
 * atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with release ordering.
 * When the comparison fails, @v is left alone and only relaxed ordering is
 * provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_cmpxchg_release() from there instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_cmpxchg_release(v, old, new);

        return prev;
}

/**
 * atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with relaxed ordering.
 * When the comparison fails, @v is left alone and only relaxed ordering is
 * provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_cmpxchg_relaxed() from there instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_cmpxchg_relaxed(v, old, new);

        return prev;
}

/**
 * atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with full ordering.
 * When the comparison fails, @v is left alone, the current value of @v is
 * written back through @old, and only relaxed ordering is provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_try_cmpxchg() from there instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        kcsan_mb();
        /* Both the atomic variable and the caller's @old slot are reported. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg(v, old, new);

        return swapped;
}

/**
 * atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with acquire ordering.
 * When the comparison fails, @v is left alone, the current value of @v is
 * written back through @old, and only relaxed ordering is provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_try_cmpxchg_acquire() from there instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* Both the atomic variable and the caller's @old slot are reported. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg_acquire(v, old, new);

        return swapped;
}

/**
 * atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with release ordering.
 * When the comparison fails, @v is left alone, the current value of @v is
 * written back through @old, and only relaxed ordering is provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_try_cmpxchg_release() from there instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        kcsan_release();
        /* Both the atomic variable and the caller's @old slot are reported. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg_release(v, old, new);

        return swapped;
}

/**
 * atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * When (@v == @old), @v is atomically replaced by @new with relaxed ordering.
 * When the comparison fails, @v is left alone, the current value of @v is
 * written back through @old, and only relaxed ordering is provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_try_cmpxchg_relaxed() from there instead.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* Both the atomic variable and the caller's @old slot are reported. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}

/**
 * atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Subtracts @i from @v in one atomic step, so that @v becomes (@v - @i), and
 * provides full ordering.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_sub_and_test() from there instead.
 *
 * Return: @true if @v is zero after the subtraction, @false otherwise.
 */
static __always_inline bool
atomic_long_sub_and_test(long i, atomic_long_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_long_sub_and_test(i, v);

        return is_zero;
}

/**
 * atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * Decrements @v in one atomic step, so that @v becomes (@v - 1), and provides
 * full ordering.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_dec_and_test() from there instead.
 *
 * Return: @true if @v is zero after the decrement, @false otherwise.
 */
static __always_inline bool
atomic_long_dec_and_test(atomic_long_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_long_dec_and_test(v);

        return is_zero;
}

/**
 * atomic_long_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * Increments @v in one atomic step, so that @v becomes (@v + 1), and provides
 * full ordering.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_inc_and_test() from there instead.
 *
 * Return: @true if @v is zero after the increment, @false otherwise.
 */
static __always_inline bool
atomic_long_inc_and_test(atomic_long_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_long_inc_and_test(v);

        return is_zero;
}

/**
 * atomic_long_add_negative() - atomic add and test if negative with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Adds @i to @v in one atomic step, so that @v becomes (@v + @i), and
 * provides full ordering.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_add_negative() from there instead.
 *
 * Return: @true if @v is negative after the addition, @false otherwise.
 */
static __always_inline bool
atomic_long_add_negative(long i, atomic_long_t *v)
{
        bool is_negative;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_long_add_negative(i, v);

        return is_negative;
}

/**
 * atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Adds @i to @v in one atomic step, so that @v becomes (@v + @i), and
 * provides acquire ordering.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_add_negative_acquire() from there instead.
 *
 * Return: @true if @v is negative after the addition, @false otherwise.
 */
static __always_inline bool
atomic_long_add_negative_acquire(long i, atomic_long_t *v)
{
        bool is_negative;

        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_long_add_negative_acquire(i, v);

        return is_negative;
}

/**
 * atomic_long_add_negative_release() - atomic add and test if negative with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Adds @i to @v in one atomic step, so that @v becomes (@v + @i), and
 * provides release ordering.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_add_negative_release() from there instead.
 *
 * Return: @true if @v is negative after the addition, @false otherwise.
 */
static __always_inline bool
atomic_long_add_negative_release(long i, atomic_long_t *v)
{
        bool is_negative;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_long_add_negative_release(i, v);

        return is_negative;
}

/**
 * atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Adds @i to @v in one atomic step, so that @v becomes (@v + @i), and
 * provides relaxed ordering.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_add_negative_relaxed() from there instead.
 *
 * Return: @true if @v is negative after the addition, @false otherwise.
 */
static __always_inline bool
atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
{
        bool is_negative;

        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_long_add_negative_relaxed(i, v);

        return is_negative;
}

/**
 * atomic_long_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_long_t
 * @a: long value to add
 * @u: long value to compare with
 *
 * When (@v != @u), @a is atomically added to @v, so that @v becomes
 * (@v + @a), with full ordering. When @v equals @u, @v is left alone and
 * only relaxed ordering is provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_fetch_add_unless() from there instead.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_add_unless(v, a, u);

        return prev;
}

/**
 * atomic_long_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_long_t
 * @a: long value to add
 * @u: long value to compare with
 *
 * When (@v != @u), @a is atomically added to @v, so that @v becomes
 * (@v + @a), with full ordering. When @v equals @u, @v is left alone and
 * only relaxed ordering is provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_add_unless() from there instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_long_add_unless(atomic_long_t *v, long a, long u)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_add_unless(v, a, u);

        return updated;
}

/**
 * atomic_long_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * When (@v != 0), @v is atomically incremented to (@v + 1) with full
 * ordering. When @v is zero, it is left alone and only relaxed ordering is
 * provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_inc_not_zero() from there instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_long_inc_not_zero(atomic_long_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_inc_not_zero(v);

        return updated;
}

/**
 * atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic_long_t
 *
 * When (@v >= 0), @v is atomically incremented to (@v + 1) with full
 * ordering. When @v is negative, it is left alone and only relaxed ordering
 * is provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_inc_unless_negative() from there instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_long_inc_unless_negative(atomic_long_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_inc_unless_negative(v);

        return updated;
}

/**
 * atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic_long_t
 *
 * When (@v <= 0), @v is atomically decremented to (@v - 1) with full
 * ordering. When @v is positive, it is left alone and only relaxed ordering
 * is provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_dec_unless_positive() from there instead.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_long_dec_unless_positive(atomic_long_t *v)
{
        bool updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_dec_unless_positive(v);

        return updated;
}

/**
 * atomic_long_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic_long_t
 *
 * When (@v > 0), @v is atomically decremented to (@v - 1) with full
 * ordering. When @v is zero or negative, it is left alone and only relaxed
 * ordering is provided.
 *
 * This is the instrumented variant and must not be used in noinstr code;
 * call raw_atomic_long_dec_if_positive() from there instead.
 *
 * Return: The old value of @v minus one, irrespective of whether the
 * decrement was actually applied to @v.
 */
static __always_inline long
atomic_long_dec_if_positive(atomic_long_t *v)
{
        long decremented;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        decremented = raw_atomic_long_dec_if_positive(v);

        return decremented;
}

/*
 * xchg() - instrumented front end for raw_xchg().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). kcsan_mb() is
 * called, a read-write atomic access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), and the remaining arguments are handed
 * unchanged to raw_xchg(), whose value becomes the value of the macro.
 */
#define xchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg(__ai_ptr, __VA_ARGS__); \
})

/*
 * xchg_acquire() - instrumented front end for raw_xchg_acquire().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). A read-write atomic
 * access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), then the remaining arguments are handed
 * unchanged to raw_xchg_acquire(), whose value becomes the value of the
 * macro. No kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define xchg_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * xchg_release() - instrumented front end for raw_xchg_release().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). kcsan_release() is
 * called, a read-write atomic access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), and the remaining arguments are handed
 * unchanged to raw_xchg_release(), whose value becomes the value of the
 * macro.
 */
#define xchg_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * xchg_relaxed() - instrumented front end for raw_xchg_relaxed().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). A read-write atomic
 * access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), then the remaining arguments are handed
 * unchanged to raw_xchg_relaxed(), whose value becomes the value of the
 * macro. No kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define xchg_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg() - instrumented front end for raw_cmpxchg().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). kcsan_mb() is
 * called, a read-write atomic access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), and the remaining arguments are handed
 * unchanged to raw_cmpxchg(), whose value becomes the value of the macro.
 */
#define cmpxchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg_acquire() - instrumented front end for raw_cmpxchg_acquire().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). A read-write atomic
 * access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), then the remaining arguments are handed
 * unchanged to raw_cmpxchg_acquire(), whose value becomes the value of the
 * macro. No kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define cmpxchg_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg_release() - instrumented front end for raw_cmpxchg_release().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). kcsan_release() is
 * called, a read-write atomic access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), and the remaining arguments are handed
 * unchanged to raw_cmpxchg_release(), whose value becomes the value of the
 * macro.
 */
#define cmpxchg_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg_relaxed() - instrumented front end for raw_cmpxchg_relaxed().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). A read-write atomic
 * access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), then the remaining arguments are handed
 * unchanged to raw_cmpxchg_relaxed(), whose value becomes the value of the
 * macro. No kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define cmpxchg_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64() - instrumented front end for raw_cmpxchg64().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). kcsan_mb() is
 * called, a read-write atomic access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), and the remaining arguments are handed
 * unchanged to raw_cmpxchg64(), whose value becomes the value of the macro.
 */
#define cmpxchg64(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64_acquire() - instrumented front end for raw_cmpxchg64_acquire().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). A read-write atomic
 * access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), then the remaining arguments are handed
 * unchanged to raw_cmpxchg64_acquire(), whose value becomes the value of the
 * macro. No kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define cmpxchg64_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64_release() - instrumented front end for raw_cmpxchg64_release().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). kcsan_release() is
 * called, a read-write atomic access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), and the remaining arguments are handed
 * unchanged to raw_cmpxchg64_release(), whose value becomes the value of the
 * macro.
 */
#define cmpxchg64_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64_relaxed() - instrumented front end for raw_cmpxchg64_relaxed().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). A read-write atomic
 * access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), then the remaining arguments are handed
 * unchanged to raw_cmpxchg64_relaxed(), whose value becomes the value of the
 * macro. No kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define cmpxchg64_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128() - instrumented front end for raw_cmpxchg128().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). kcsan_mb() is
 * called, a read-write atomic access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), and the remaining arguments are handed
 * unchanged to raw_cmpxchg128(), whose value becomes the value of the macro.
 */
#define cmpxchg128(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128_acquire() - instrumented front end for raw_cmpxchg128_acquire().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). A read-write atomic
 * access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), then the remaining arguments are handed
 * unchanged to raw_cmpxchg128_acquire(), whose value becomes the value of
 * the macro. No kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define cmpxchg128_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128_release() - instrumented front end for raw_cmpxchg128_release().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). kcsan_release() is
 * called, a read-write atomic access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), and the remaining arguments are handed
 * unchanged to raw_cmpxchg128_release(), whose value becomes the value of
 * the macro.
 */
#define cmpxchg128_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128_relaxed() - instrumented front end for raw_cmpxchg128_relaxed().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). A read-write atomic
 * access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), then the remaining arguments are handed
 * unchanged to raw_cmpxchg128_relaxed(), whose value becomes the value of
 * the macro. No kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define cmpxchg128_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * try_cmpxchg() - instrumented front end for raw_try_cmpxchg().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). kcsan_mb() is called, then two accesses are reported: a
 * read-write atomic access of sizeof(*ptr) bytes at @ptr via
 * instrument_atomic_read_write(), and a read-write access of sizeof(*oldp)
 * bytes at @oldp via instrument_read_write() (not the atomic variant).
 * Finally raw_try_cmpxchg() is invoked with both pointers and the remaining
 * arguments; its value becomes the value of the macro.
 */
#define try_cmpxchg(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg_acquire() - instrumented front end for
 * raw_try_cmpxchg_acquire().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). Two accesses are reported: a read-write atomic access of
 * sizeof(*ptr) bytes at @ptr via instrument_atomic_read_write(), and a
 * read-write access of sizeof(*oldp) bytes at @oldp via
 * instrument_read_write() (not the atomic variant). Finally
 * raw_try_cmpxchg_acquire() is invoked with both pointers and the remaining
 * arguments; its value becomes the value of the macro. No
 * kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define try_cmpxchg_acquire(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg_release() - instrumented front end for
 * raw_try_cmpxchg_release().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). kcsan_release() is called, then two accesses are reported: a
 * read-write atomic access of sizeof(*ptr) bytes at @ptr via
 * instrument_atomic_read_write(), and a read-write access of sizeof(*oldp)
 * bytes at @oldp via instrument_read_write() (not the atomic variant).
 * Finally raw_try_cmpxchg_release() is invoked with both pointers and the
 * remaining arguments; its value becomes the value of the macro.
 */
#define try_cmpxchg_release(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg_relaxed() - instrumented front end for
 * raw_try_cmpxchg_relaxed().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). Two accesses are reported: a read-write atomic access of
 * sizeof(*ptr) bytes at @ptr via instrument_atomic_read_write(), and a
 * read-write access of sizeof(*oldp) bytes at @oldp via
 * instrument_read_write() (not the atomic variant). Finally
 * raw_try_cmpxchg_relaxed() is invoked with both pointers and the remaining
 * arguments; its value becomes the value of the macro. No
 * kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define try_cmpxchg_relaxed(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64() - instrumented front end for raw_try_cmpxchg64().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). kcsan_mb() is called, then two accesses are reported: a
 * read-write atomic access of sizeof(*ptr) bytes at @ptr via
 * instrument_atomic_read_write(), and a read-write access of sizeof(*oldp)
 * bytes at @oldp via instrument_read_write() (not the atomic variant).
 * Finally raw_try_cmpxchg64() is invoked with both pointers and the
 * remaining arguments; its value becomes the value of the macro.
 */
#define try_cmpxchg64(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64_acquire() - instrumented front end for
 * raw_try_cmpxchg64_acquire().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). Two accesses are reported: a read-write atomic access of
 * sizeof(*ptr) bytes at @ptr via instrument_atomic_read_write(), and a
 * read-write access of sizeof(*oldp) bytes at @oldp via
 * instrument_read_write() (not the atomic variant). Finally
 * raw_try_cmpxchg64_acquire() is invoked with both pointers and the
 * remaining arguments; its value becomes the value of the macro. No
 * kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define try_cmpxchg64_acquire(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64_release() - instrumented front end for
 * raw_try_cmpxchg64_release().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). kcsan_release() is called, then two accesses are reported: a
 * read-write atomic access of sizeof(*ptr) bytes at @ptr via
 * instrument_atomic_read_write(), and a read-write access of sizeof(*oldp)
 * bytes at @oldp via instrument_read_write() (not the atomic variant).
 * Finally raw_try_cmpxchg64_release() is invoked with both pointers and the
 * remaining arguments; its value becomes the value of the macro.
 */
#define try_cmpxchg64_release(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64_relaxed() - instrumented front end for
 * raw_try_cmpxchg64_relaxed().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). Two accesses are reported: a read-write atomic access of
 * sizeof(*ptr) bytes at @ptr via instrument_atomic_read_write(), and a
 * read-write access of sizeof(*oldp) bytes at @oldp via
 * instrument_read_write() (not the atomic variant). Finally
 * raw_try_cmpxchg64_relaxed() is invoked with both pointers and the
 * remaining arguments; its value becomes the value of the macro. No
 * kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define try_cmpxchg64_relaxed(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128() - instrumented front end for raw_try_cmpxchg128().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). kcsan_mb() is called, then two accesses are reported: a
 * read-write atomic access of sizeof(*ptr) bytes at @ptr via
 * instrument_atomic_read_write(), and a read-write access of sizeof(*oldp)
 * bytes at @oldp via instrument_read_write() (not the atomic variant).
 * Finally raw_try_cmpxchg128() is invoked with both pointers and the
 * remaining arguments; its value becomes the value of the macro.
 */
#define try_cmpxchg128(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128_acquire() - instrumented front end for
 * raw_try_cmpxchg128_acquire().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). Two accesses are reported: a read-write atomic access of
 * sizeof(*ptr) bytes at @ptr via instrument_atomic_read_write(), and a
 * read-write access of sizeof(*oldp) bytes at @oldp via
 * instrument_read_write() (not the atomic variant). Finally
 * raw_try_cmpxchg128_acquire() is invoked with both pointers and the
 * remaining arguments; its value becomes the value of the macro. No
 * kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define try_cmpxchg128_acquire(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128_release() - instrumented front end for
 * raw_try_cmpxchg128_release().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). kcsan_release() is called, then two accesses are reported: a
 * read-write atomic access of sizeof(*ptr) bytes at @ptr via
 * instrument_atomic_read_write(), and a read-write access of sizeof(*oldp)
 * bytes at @oldp via instrument_read_write() (not the atomic variant).
 * Finally raw_try_cmpxchg128_release() is invoked with both pointers and the
 * remaining arguments; its value becomes the value of the macro.
 */
#define try_cmpxchg128_release(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128_relaxed() - instrumented front end for
 * raw_try_cmpxchg128_relaxed().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). Two accesses are reported: a read-write atomic access of
 * sizeof(*ptr) bytes at @ptr via instrument_atomic_read_write(), and a
 * read-write access of sizeof(*oldp) bytes at @oldp via
 * instrument_read_write() (not the atomic variant). Finally
 * raw_try_cmpxchg128_relaxed() is invoked with both pointers and the
 * remaining arguments; its value becomes the value of the macro. No
 * kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define try_cmpxchg128_relaxed(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * cmpxchg_local() - instrumented front end for raw_cmpxchg_local().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). A read-write atomic
 * access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), then the remaining arguments are handed
 * unchanged to raw_cmpxchg_local(), whose value becomes the value of the
 * macro. No kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define cmpxchg_local(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_local(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64_local() - instrumented front end for raw_cmpxchg64_local().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). A read-write atomic
 * access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), then the remaining arguments are handed
 * unchanged to raw_cmpxchg64_local(), whose value becomes the value of the
 * macro. No kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define cmpxchg64_local(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128_local() - instrumented front end for raw_cmpxchg128_local().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). A read-write atomic
 * access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), then the remaining arguments are handed
 * unchanged to raw_cmpxchg128_local(), whose value becomes the value of the
 * macro. No kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define cmpxchg128_local(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_local(__ai_ptr, __VA_ARGS__); \
})

/*
 * sync_cmpxchg() - instrumented front end for raw_sync_cmpxchg().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). kcsan_mb() is
 * called, a read-write atomic access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), and the remaining arguments are handed
 * unchanged to raw_sync_cmpxchg(), whose value becomes the value of the
 * macro.
 */
#define sync_cmpxchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \
})

/*
 * try_cmpxchg_local() - instrumented front end for raw_try_cmpxchg_local().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). Two accesses are reported: a read-write atomic access of
 * sizeof(*ptr) bytes at @ptr via instrument_atomic_read_write(), and a
 * read-write access of sizeof(*oldp) bytes at @oldp via
 * instrument_read_write() (not the atomic variant). Finally
 * raw_try_cmpxchg_local() is invoked with both pointers and the remaining
 * arguments; its value becomes the value of the macro. No
 * kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define try_cmpxchg_local(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64_local() - instrumented front end for
 * raw_try_cmpxchg64_local().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). Two accesses are reported: a read-write atomic access of
 * sizeof(*ptr) bytes at @ptr via instrument_atomic_read_write(), and a
 * read-write access of sizeof(*oldp) bytes at @oldp via
 * instrument_read_write() (not the atomic variant). Finally
 * raw_try_cmpxchg64_local() is invoked with both pointers and the remaining
 * arguments; its value becomes the value of the macro. No
 * kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define try_cmpxchg64_local(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128_local() - instrumented front end for
 * raw_try_cmpxchg128_local().
 *
 * @ptr and @oldp are each evaluated exactly once (captured in __ai_ptr and
 * __ai_oldp). Two accesses are reported: a read-write atomic access of
 * sizeof(*ptr) bytes at @ptr via instrument_atomic_read_write(), and a
 * read-write access of sizeof(*oldp) bytes at @oldp via
 * instrument_read_write() (not the atomic variant). Finally
 * raw_try_cmpxchg128_local() is invoked with both pointers and the remaining
 * arguments; its value becomes the value of the macro. No
 * kcsan_mb()/kcsan_release() annotation is issued here.
 */
#define try_cmpxchg128_local(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * sync_try_cmpxchg() - instrumented front end for raw_sync_try_cmpxchg().
 *
 * @ptr is evaluated exactly once (captured in __ai_ptr). kcsan_mb() is
 * called, a read-write atomic access of sizeof(*ptr) bytes is reported via
 * instrument_atomic_read_write(), and the remaining arguments are handed
 * unchanged to raw_sync_try_cmpxchg(), whose value becomes the value of the
 * macro.
 *
 * NOTE(review): unlike the try_cmpxchg*() wrappers, this macro has no
 * separate @oldp parameter, so whatever old-value pointer travels in
 * __VA_ARGS__ is not reported through instrument_read_write() here. Confirm
 * that this is intentional.
 */
#define sync_try_cmpxchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \
})


#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
// 8829b337928e9508259079d32581775ececd415b


















   22 


   28 




































































































    6 











    1 











    1 












    1 
















    2 















    2 
















    2 

















    4 















    4 






















    2 






















    4 



    4 




















    1 
    1 













    6 




    8 














    1 












    3 














    4 


    1 


    2 

    3 











    1 
















    2 



























    1 





















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/tomoyo.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include <linux/lsm_hooks.h>
#include <uapi/linux/lsm.h>
#include "common.h"

/**
 * tomoyo_domain - Get "struct tomoyo_domain_info" for current thread.
 *
 * Returns pointer to "struct tomoyo_domain_info" for current thread.
 */
struct tomoyo_domain_info *tomoyo_domain(void)
{
        struct tomoyo_task *s = tomoyo_task(current);

        if (s->old_domain_info && !current->in_execve) {
                atomic_dec(&s->old_domain_info->users);
                s->old_domain_info = NULL;
        }
        return s->domain_info;
}

/**
 * tomoyo_cred_prepare - Target for security_prepare_creds().
 *
 * @new: Pointer to "struct cred" (not used here).
 * @old: Pointer to "struct cred" (not used here).
 * @gfp: Memory allocation flags (not used here).
 *
 * If a previous execve() request left a saved domain behind and no execve()
 * is in progress any more, the current thread is switched back to that saved
 * domain and the user count of the abandoned domain is dropped.
 *
 * Returns 0.
 */
static int tomoyo_cred_prepare(struct cred *new, const struct cred *old,
                               gfp_t gfp)
{
        struct tomoyo_task *task = tomoyo_task(current);

        /* Nothing saved, or execve() still running: leave the task alone. */
        if (!task->old_domain_info || current->in_execve)
                return 0;

        /* Restore old_domain_info saved by previous execve() request. */
        atomic_dec(&task->domain_info->users);
        task->domain_info = task->old_domain_info;
        task->old_domain_info = NULL;
        return 0;
}

/**
 * tomoyo_bprm_committed_creds - Target for security_bprm_committed_creds().
 *
 * @bprm: Pointer to "struct linux_binprm".
 */
static void tomoyo_bprm_committed_creds(const struct linux_binprm *bprm)
{
        /* Clear old_domain_info saved by execve() request. */
        struct tomoyo_task *s = tomoyo_task(current);

        atomic_dec(&s->old_domain_info->users);
        s->old_domain_info = NULL;
}

#ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
/**
 * tomoyo_bprm_creds_for_exec - Target for security_bprm_creds_for_exec().
 *
 * @bprm: Pointer to "struct linux_binprm".
 *
 * While the policy has not been loaded yet, asks tomoyo_load_policy() to
 * load it, passing the filename being executed.
 *
 * Returns 0.
 */
static int tomoyo_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        if (tomoyo_policy_loaded)
                return 0;

        /*
         * Load policy if /sbin/tomoyo-init exists and /sbin/init is requested
         * for the first time.
         */
        tomoyo_load_policy(bprm->filename);
        return 0;
}
#endif

/**
 * tomoyo_bprm_check_security - Target for security_bprm_check().
 *
 * @bprm: Pointer to "struct linux_binprm".
 *
 * When no saved domain exists yet, the next domain is determined under the
 * TOMOYO read lock. Otherwise an O_RDONLY open check on @bprm->file is made
 * against the task's current domain_info.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_bprm_check_security(struct linux_binprm *bprm)
{
        struct tomoyo_task *task = tomoyo_task(current);
        int idx;
        int err;

        /*
         * Read permission is checked against interpreters using next domain.
         */
        if (task->old_domain_info)
                return tomoyo_check_open_permission(task->domain_info,
                                                    &bprm->file->f_path,
                                                    O_RDONLY);

        /*
         * Execute permission is checked against pathname passed to execve()
         * using current domain.
         */
        idx = tomoyo_read_lock();
        err = tomoyo_find_next_domain(bprm);
        tomoyo_read_unlock(idx);

        return err;
}

/**
 * tomoyo_inode_getattr - Target for security_inode_getattr().
 *
 * @path: Pointer to "struct path".
 *
 * Runs a TOMOYO_TYPE_GETATTR permission check on @path.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_inode_getattr(const struct path *path)
{
        const int err = tomoyo_path_perm(TOMOYO_TYPE_GETATTR, path, NULL);

        return err;
}

/**
 * tomoyo_path_truncate - Target for security_path_truncate().
 *
 * @path: Pointer to "struct path".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_truncate(const struct path *path)
{
        return tomoyo_path_perm(TOMOYO_TYPE_TRUNCATE, path, NULL);
}

/**
 * tomoyo_file_truncate - Target for security_file_truncate().
 *
 * @file: Pointer to "struct file".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_file_truncate(struct file *file)
{
        return tomoyo_path_truncate(&file->f_path);
}

/**
 * tomoyo_path_unlink - Target for security_path_unlink().
 *
 * @parent: Pointer to "struct path".
 * @dentry: Pointer to "struct dentry".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_unlink(const struct path *parent, struct dentry *dentry)
{
        struct path path = { .mnt = parent->mnt, .dentry = dentry };

        return tomoyo_path_perm(TOMOYO_TYPE_UNLINK, &path, NULL);
}

/**
 * tomoyo_path_mkdir - Target for security_path_mkdir().
 *
 * @parent: Pointer to "struct path".
 * @dentry: Pointer to "struct dentry".
 * @mode:   DAC permission mode.
 *
 * Builds a path from the mount of @parent and @dentry, then requests a
 * TOMOYO_TYPE_MKDIR check with @mode masked by S_IALLUGO.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_mkdir(const struct path *parent, struct dentry *dentry,
                             umode_t mode)
{
        struct path target = { .dentry = dentry, .mnt = parent->mnt };
        int error;

        error = tomoyo_path_number_perm(TOMOYO_TYPE_MKDIR, &target,
                                        mode & S_IALLUGO);
        return error;
}

/**
 * tomoyo_path_rmdir - Target for security_path_rmdir().
 *
 * @parent: Pointer to "struct path".
 * @dentry: Pointer to "struct dentry".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_rmdir(const struct path *parent, struct dentry *dentry)
{
        struct path path = { .mnt = parent->mnt, .dentry = dentry };

        return tomoyo_path_perm(TOMOYO_TYPE_RMDIR, &path, NULL);
}

/**
 * tomoyo_path_symlink - Target for security_path_symlink().
 *
 * @parent:   Pointer to "struct path".
 * @dentry:   Pointer to "struct dentry".
 * @old_name: Symlink's content.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_symlink(const struct path *parent, struct dentry *dentry,
                               const char *old_name)
{
        struct path path = { .mnt = parent->mnt, .dentry = dentry };

        return tomoyo_path_perm(TOMOYO_TYPE_SYMLINK, &path, old_name);
}

/**
 * tomoyo_path_mknod - Target for security_path_mknod().
 *
 * @parent: Pointer to "struct path".
 * @dentry: Pointer to "struct dentry".
 * @mode:   DAC permission mode.
 * @dev:    Device attributes.
 *
 * The file type bits of @mode select the operation to check:
 * character and block devices go through tomoyo_mkdev_perm() together
 * with @dev; FIFOs, sockets and everything else (treated as
 * TOMOYO_TYPE_CREATE) go through tomoyo_path_number_perm().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_mknod(const struct path *parent, struct dentry *dentry,
                             umode_t mode, unsigned int dev)
{
        struct path target = { .dentry = dentry, .mnt = parent->mnt };
        const unsigned int perm = mode & S_IALLUGO;

        switch (mode & S_IFMT) {
        case S_IFCHR:
                return tomoyo_mkdev_perm(TOMOYO_TYPE_MKCHAR, &target, perm,
                                         dev);
        case S_IFBLK:
                return tomoyo_mkdev_perm(TOMOYO_TYPE_MKBLOCK, &target, perm,
                                         dev);
        case S_IFIFO:
                return tomoyo_path_number_perm(TOMOYO_TYPE_MKFIFO, &target,
                                               perm);
        case S_IFSOCK:
                return tomoyo_path_number_perm(TOMOYO_TYPE_MKSOCK, &target,
                                               perm);
        default:
                /* Any other file type is checked as a plain create. */
                return tomoyo_path_number_perm(TOMOYO_TYPE_CREATE, &target,
                                               perm);
        }
}

/**
 * tomoyo_path_link - Target for security_path_link().
 *
 * @old_dentry: Pointer to "struct dentry".
 * @new_dir:    Pointer to "struct path".
 * @new_dentry: Pointer to "struct dentry".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_link(struct dentry *old_dentry, const struct path *new_dir,
                            struct dentry *new_dentry)
{
        struct path path1 = { .mnt = new_dir->mnt, .dentry = old_dentry };
        struct path path2 = { .mnt = new_dir->mnt, .dentry = new_dentry };

        return tomoyo_path2_perm(TOMOYO_TYPE_LINK, &path1, &path2);
}

/**
 * tomoyo_path_rename - Target for security_path_rename().
 *
 * @old_parent: Pointer to "struct path".
 * @old_dentry: Pointer to "struct dentry".
 * @new_parent: Pointer to "struct path".
 * @new_dentry: Pointer to "struct dentry".
 * @flags: Rename options.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_rename(const struct path *old_parent,
                              struct dentry *old_dentry,
                              const struct path *new_parent,
                              struct dentry *new_dentry,
                              const unsigned int flags)
{
        struct path path1 = { .mnt = old_parent->mnt, .dentry = old_dentry };
        struct path path2 = { .mnt = new_parent->mnt, .dentry = new_dentry };

        if (flags & RENAME_EXCHANGE) {
                const int err = tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path2,
                                &path1);

                if (err)
                        return err;
        }
        return tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path1, &path2);
}

/**
 * tomoyo_file_fcntl - Target for security_file_fcntl().
 *
 * @file: Pointer to "struct file".
 * @cmd:  Command for fcntl().
 * @arg:  Argument for @cmd.
 *
 * Only an F_SETFL request whose @arg differs from @file->f_flags in the
 * O_APPEND bit is checked; every other request is allowed without a check.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_file_fcntl(struct file *file, unsigned int cmd,
                             unsigned long arg)
{
        /* Not F_SETFL, or O_APPEND is left unchanged: nothing to check. */
        if (cmd != F_SETFL || !((arg ^ file->f_flags) & O_APPEND))
                return 0;

        /* Check as a write open carrying the requested O_APPEND state. */
        return tomoyo_check_open_permission(tomoyo_domain(), &file->f_path,
                                            O_WRONLY | (arg & O_APPEND));
}

/**
 * tomoyo_file_open - Target for security_file_open().
 *
 * @f: Pointer to "struct file".
 *
 * Checks open permission for @f with its f_flags, unless __FMODE_EXEC is
 * set in those flags, in which case no check is done here.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_file_open(struct file *f)
{
        /*
         * Illogically, FMODE_EXEC is in f_flags, not f_mode, so that is
         * where the execve() marker is tested.
         */
        if (!(f->f_flags & __FMODE_EXEC))
                return tomoyo_check_open_permission(tomoyo_domain(),
                                                    &f->f_path, f->f_flags);

        /* Don't check read permission here if called from execve(). */
        return 0;
}

/**
 * tomoyo_file_ioctl - Target for security_file_ioctl().
 *
 * @file: Pointer to "struct file".
 * @cmd:  Command for ioctl().
 * @arg:  Argument for @cmd.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_file_ioctl(struct file *file, unsigned int cmd,
                             unsigned long arg)
{
        return tomoyo_path_number_perm(TOMOYO_TYPE_IOCTL, &file->f_path, cmd);
}

/**
 * tomoyo_path_chmod - Target for security_path_chmod().
 *
 * @path: Pointer to "struct path".
 * @mode: DAC permission mode.
 *
 * Requests a TOMOYO_TYPE_CHMOD check on @path with @mode masked by
 * S_IALLUGO.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_chmod(const struct path *path, umode_t mode)
{
        int error;

        error = tomoyo_path_number_perm(TOMOYO_TYPE_CHMOD, path,
                                        mode & S_IALLUGO);
        return error;
}

/**
 * tomoyo_path_chown - Target for security_path_chown().
 *
 * @path: Pointer to "struct path".
 * @uid:  Owner ID.
 * @gid:  Group ID.
 *
 * A valid @uid is checked as TOMOYO_TYPE_CHOWN and a valid @gid as
 * TOMOYO_TYPE_CHGRP, each translated through init_user_ns. The group check
 * is skipped when the owner check has already failed.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
{
        if (uid_valid(uid)) {
                const int error =
                        tomoyo_path_number_perm(TOMOYO_TYPE_CHOWN, path,
                                                from_kuid(&init_user_ns, uid));

                if (error)
                        return error;
        }
        /* Without a valid group there is nothing further to check. */
        if (!gid_valid(gid))
                return 0;
        return tomoyo_path_number_perm(TOMOYO_TYPE_CHGRP, path,
                                       from_kgid(&init_user_ns, gid));
}

/**
 * tomoyo_path_chroot - Target for security_path_chroot().
 *
 * @path: Pointer to "struct path".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_chroot(const struct path *path)
{
        return tomoyo_path_perm(TOMOYO_TYPE_CHROOT, path, NULL);
}

/**
 * tomoyo_sb_mount - Target for security_sb_mount().
 *
 * @dev_name: Name of device file. Maybe NULL.
 * @path:     Pointer to "struct path".
 * @type:     Name of filesystem type. Maybe NULL.
 * @flags:    Mount options.
 * @data:     Optional data. Maybe NULL.
 *
 * Forwards all arguments unchanged to tomoyo_mount_permission().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_sb_mount(const char *dev_name, const struct path *path,
                           const char *type, unsigned long flags, void *data)
{
        int error;

        error = tomoyo_mount_permission(dev_name, path, type, flags, data);
        return error;
}

/**
 * tomoyo_sb_umount - Target for security_sb_umount().
 *
 * @mnt:   Pointer to "struct vfsmount".
 * @flags: Unmount options.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_sb_umount(struct vfsmount *mnt, int flags)
{
        struct path path = { .mnt = mnt, .dentry = mnt->mnt_root };

        return tomoyo_path_perm(TOMOYO_TYPE_UMOUNT, &path, NULL);
}

/**
 * tomoyo_sb_pivotroot - Target for security_sb_pivotroot().
 *
 * @old_path: Pointer to "struct path".
 * @new_path: Pointer to "struct path".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_sb_pivotroot(const struct path *old_path, const struct path *new_path)
{
        return tomoyo_path2_perm(TOMOYO_TYPE_PIVOT_ROOT, new_path, old_path);
}

/**
 * tomoyo_socket_listen - Check permission for listen().
 *
 * @sock:    Pointer to "struct socket".
 * @backlog: Backlog parameter (not used here).
 *
 * Delegates the decision to tomoyo_socket_listen_permission().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_socket_listen(struct socket *sock, int backlog)
{
        const int error = tomoyo_socket_listen_permission(sock);

        return error;
}

/**
 * tomoyo_socket_connect - Check permission for connect().
 *
 * @sock:     Pointer to "struct socket".
 * @addr:     Pointer to "struct sockaddr".
 * @addr_len: Size of @addr.
 *
 * Delegates the decision to tomoyo_socket_connect_permission().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_socket_connect(struct socket *sock, struct sockaddr *addr,
                                 int addr_len)
{
        const int error = tomoyo_socket_connect_permission(sock, addr,
                                                           addr_len);

        return error;
}

/**
 * tomoyo_socket_bind - Check permission for bind().
 *
 * @sock:     Pointer to "struct socket".
 * @addr:     Pointer to "struct sockaddr".
 * @addr_len: Size of @addr.
 *
 * Delegates the decision to tomoyo_socket_bind_permission().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_socket_bind(struct socket *sock, struct sockaddr *addr,
                              int addr_len)
{
        const int error = tomoyo_socket_bind_permission(sock, addr, addr_len);

        return error;
}

/**
 * tomoyo_socket_sendmsg - Check permission for sendmsg().
 *
 * @sock: Pointer to "struct socket".
 * @msg:  Pointer to "struct msghdr".
 * @size: Size of message.
 *
 * Delegates the decision to tomoyo_socket_sendmsg_permission().
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg,
                                 int size)
{
        const int error = tomoyo_socket_sendmsg_permission(sock, msg, size);

        return error;
}

/*
 * Blob sizes referenced by the DEFINE_LSM(tomoyo) descriptor: the only
 * entry set is the per-task one, sized to hold one "struct tomoyo_task".
 * NOTE(review): presumably tomoyo_task() returns a pointer into this
 * per-task blob - confirm against its definition.
 */
struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = {
        .lbs_task = sizeof(struct tomoyo_task),
};

/**
 * tomoyo_task_alloc - Target for security_task_alloc().
 *
 * @task:        Pointer to "struct task_struct".
 * @clone_flags: clone() flags (not used here).
 *
 * Gives @task the same domain_info as the current task, takes a reference
 * on that domain and starts @task with no old_domain_info.
 *
 * Returns 0.
 */
static int tomoyo_task_alloc(struct task_struct *task,
                             unsigned long clone_flags)
{
        struct tomoyo_task *parent = tomoyo_task(current);
        struct tomoyo_task *child = tomoyo_task(task);

        /* The new task shares the domain, so count it as another user. */
        child->domain_info = parent->domain_info;
        atomic_inc(&child->domain_info->users);
        child->old_domain_info = NULL;
        return 0;
}

/**
 * tomoyo_task_free - Target for security_task_free().
 *
 * @task: Pointer to "struct task_struct".
 *
 * Drops the references @task holds on its domain_info and old_domain_info,
 * whichever of them are set, and clears both pointers.
 */
static void tomoyo_task_free(struct task_struct *task)
{
        struct tomoyo_task *info = tomoyo_task(task);

        /* Current domain, if any. */
        if (info->domain_info) {
                atomic_dec(&info->domain_info->users);
                info->domain_info = NULL;
        }
        /* Domain saved by a pending execve(), if any. */
        if (info->old_domain_info) {
                atomic_dec(&info->old_domain_info->users);
                info->old_domain_info = NULL;
        }
}

/*
 * Identity of this LSM (name "tomoyo", id LSM_ID_TOMOYO); passed to
 * security_add_hooks() by tomoyo_init() together with tomoyo_hooks.
 */
static const struct lsm_id tomoyo_lsmid = {
        .name = "tomoyo",
        .id = LSM_ID_TOMOYO,
};

/*
 * tomoyo_hooks is the table of "struct security_hook_list" entries which is
 * used for registering TOMOYO; tomoyo_init() passes it to
 * security_add_hooks().
 *
 * Each entry binds one LSM hook to its tomoyo_*() handler defined in this
 * file. The bprm_creds_for_exec hook is only present when
 * CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not defined, and both
 * file_ioctl and file_ioctl_compat are served by tomoyo_file_ioctl().
 */
static struct security_hook_list tomoyo_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare),
        LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds),
        LSM_HOOK_INIT(task_alloc, tomoyo_task_alloc),
        LSM_HOOK_INIT(task_free, tomoyo_task_free),
#ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
        LSM_HOOK_INIT(bprm_creds_for_exec, tomoyo_bprm_creds_for_exec),
#endif
        LSM_HOOK_INIT(bprm_check_security, tomoyo_bprm_check_security),
        LSM_HOOK_INIT(file_fcntl, tomoyo_file_fcntl),
        LSM_HOOK_INIT(file_open, tomoyo_file_open),
        LSM_HOOK_INIT(file_truncate, tomoyo_file_truncate),
        LSM_HOOK_INIT(path_truncate, tomoyo_path_truncate),
        LSM_HOOK_INIT(path_unlink, tomoyo_path_unlink),
        LSM_HOOK_INIT(path_mkdir, tomoyo_path_mkdir),
        LSM_HOOK_INIT(path_rmdir, tomoyo_path_rmdir),
        LSM_HOOK_INIT(path_symlink, tomoyo_path_symlink),
        LSM_HOOK_INIT(path_mknod, tomoyo_path_mknod),
        LSM_HOOK_INIT(path_link, tomoyo_path_link),
        LSM_HOOK_INIT(path_rename, tomoyo_path_rename),
        LSM_HOOK_INIT(inode_getattr, tomoyo_inode_getattr),
        LSM_HOOK_INIT(file_ioctl, tomoyo_file_ioctl),
        LSM_HOOK_INIT(file_ioctl_compat, tomoyo_file_ioctl),
        LSM_HOOK_INIT(path_chmod, tomoyo_path_chmod),
        LSM_HOOK_INIT(path_chown, tomoyo_path_chown),
        LSM_HOOK_INIT(path_chroot, tomoyo_path_chroot),
        LSM_HOOK_INIT(sb_mount, tomoyo_sb_mount),
        LSM_HOOK_INIT(sb_umount, tomoyo_sb_umount),
        LSM_HOOK_INIT(sb_pivotroot, tomoyo_sb_pivotroot),
        LSM_HOOK_INIT(socket_bind, tomoyo_socket_bind),
        LSM_HOOK_INIT(socket_connect, tomoyo_socket_connect),
        LSM_HOOK_INIT(socket_listen, tomoyo_socket_listen),
        LSM_HOOK_INIT(socket_sendmsg, tomoyo_socket_sendmsg),
};

/*
 * Lock for GC (garbage collector): an SRCU domain defined here with
 * external linkage.
 * NOTE(review): presumably tomoyo_read_lock()/tomoyo_read_unlock() enter
 * and leave this SRCU domain - confirm against their definitions.
 */
DEFINE_SRCU(tomoyo_ss);

/*
 * Enable flag for TOMOYO, on (1) by default; the DEFINE_LSM(tomoyo)
 * descriptor points its .enabled member at this variable.
 */
int tomoyo_enabled __ro_after_init = 1;

/**
 * tomoyo_init - Register TOMOYO Linux as a LSM module.
 *
 * Registers tomoyo_hooks under tomoyo_lsmid, puts the current task into
 * tomoyo_kernel_domain (taking a reference on that domain) with no
 * old_domain_info, and finally calls tomoyo_mm_init().
 *
 * Returns 0.
 */
static int __init tomoyo_init(void)
{
        struct tomoyo_task *s = tomoyo_task(current);

        /* register ourselves with the security framework */
        security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks),
                           &tomoyo_lsmid);
        pr_info("TOMOYO Linux initialized\n");
        /* The current task starts out in the kernel domain. */
        s->domain_info = &tomoyo_kernel_domain;
        atomic_inc(&tomoyo_kernel_domain.users);
        s->old_domain_info = NULL;
        tomoyo_mm_init();

        return 0;
}

/*
 * LSM descriptor for TOMOYO: names the module "tomoyo", ties its enablement
 * to tomoyo_enabled, flags it as LSM_FLAG_LEGACY_MAJOR, publishes
 * tomoyo_blob_sizes and names tomoyo_init() as the init callback.
 */
DEFINE_LSM(tomoyo) = {
        .name = "tomoyo",
        .enabled = &tomoyo_enabled,
        .flags = LSM_FLAG_LEGACY_MAJOR,
        .blobs = &tomoyo_blob_sizes,
        .init = tomoyo_init,
};











































































































    1 








    1 
















    1 




















































    1 






    1 
    1 



    1 


    1 
    1 





    1 








    1 




    1 
    1 

    1 


    1 























    1 








    1 



    1 







    1 
    1 



    1 






    1 





    1 

    1 






























    1 





    1 



















    1 













    1 





    1 




































    1 









    1 


































































































































































































































































































































    1 








    1 
    1 










    1 



























































    1 
    1 


















    1 





    1 
    1 






    1 
    1 




    1 












    1 






























    1 


    1 




    1 










    1 


    1 
    1 






















    1 



    1 




    1 











































































































































































































































































    1 







    1 


























    1 




    1 





























    1 













    1 

































    1 


















    1 






















    1 



































    1 











    1 







    1 























    1 








    1 

    1 









    1 


    1 













    1 


    1 









    1 



















    1 



















    1 




























































































    5 


    5 

    5 
    1 
    5 


    1 





















































    1 
    1 





    1 



    1 


    1 


    1 
    1 



















    1 

    1 
    1 









    1 










































































































    1 



    1 






    1 







































    1 











    1 






















    1 







    1 

    1 

    1 




    1 
    1 

    1 














    1 











































    1 


    1 

    1 







































































    1 




    1 




    1 


    1 




    1 
    1 

    1 







    1 









    1 





    1 





    1 





    1 











    1 

    1 




















    1 








































































    1 




    1 






    1 
    1 
    1 
    1 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 








    1 

    1 













    1 

    1 
    1 

    1 


    1 



























    1 


    1 


    1 





    1 

    1 














    1 















    1 























































































    5 

























    3 




    1 





    4 



    5 
    5 




    4 



    1 














    5 



    5 






























































































    1 






    1 































    1 


    1 











































    1 
    1 


    1 
















    1 











    1 





































    1 















    1 




    1 



































    1 
    1 







    1 













    1 



    1 







    1 

    1 

















    1 
    1 


















































    5 
    4 

    4 








    1 

















    5 





    5 







    3 


    3 




    3 
    4 












    1 






    1 

    1 
































    1 





    1 











    1 




    1 






















    1 







    1 
    1 
    1 
    1 

    1 













    1 

    1 
    1 

    1 




































    1 

    1 


    1 
    1 






    1 


































































































































































    5 




    5 











    5 






























































    1 





    1 






    1 

























































    1 





    1 










    1 

























    1 





    1 




    1 

















    1 


    1 




    1 





    1 
    1 


    1 
























































































































































































    1 











    1 
























    1 








    1 




    1 




















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/fsverity.h>
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "bio.h"
#include "locking.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"
#include "compression.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
#include "file.h"
#include "dev-replace.h"
#include "super.h"
#include "transaction.h"

/*
 * Slab cache sized for struct extent_buffer objects.  It is created by
 * extent_buffer_init_cachep() and destroyed by extent_buffer_free_cachep().
 */
static struct kmem_cache *extent_buffer_cache;

#ifdef CONFIG_BTRFS_DEBUG
/*
 * Debug helper: link @eb into the allocated_ebs list of the filesystem it
 * belongs to.  The list is only touched with eb_leak_lock held (irqsave
 * variant).
 */
static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
{
        struct btrfs_fs_info *owner = eb->fs_info;
        unsigned long irq_state;

        spin_lock_irqsave(&owner->eb_leak_lock, irq_state);
        list_add(&eb->leak_list, &owner->allocated_ebs);
        spin_unlock_irqrestore(&owner->eb_leak_lock, irq_state);
}

/*
 * Debug helper: unlink @eb from the allocated_ebs list of the filesystem it
 * belongs to.  The list is only touched with eb_leak_lock held (irqsave
 * variant).
 */
static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
{
        struct btrfs_fs_info *owner = eb->fs_info;
        unsigned long irq_state;

        spin_lock_irqsave(&owner->eb_leak_lock, irq_state);
        list_del(&eb->leak_list);
        spin_unlock_irqrestore(&owner->eb_leak_lock, irq_state);
}

/*
 * Report and free every extent buffer still linked on
 * @fs_info->allocated_ebs.
 *
 * A non-empty list triggers a warning; each remaining buffer is then logged,
 * unlinked and returned to extent_buffer_cache while eb_leak_lock is held.
 */
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
        unsigned long irq_state;

        /*
         * allocated_ebs is only initialized once open_ctree has been
         * entered; a NULL ->next means that never happened, so there is
         * nothing to inspect.
         */
        if (!fs_info->allocated_ebs.next)
                return;

        WARN_ON(!list_empty(&fs_info->allocated_ebs));
        spin_lock_irqsave(&fs_info->eb_leak_lock, irq_state);
        for (;;) {
                struct extent_buffer *leaked;

                if (list_empty(&fs_info->allocated_ebs))
                        break;

                leaked = list_first_entry(&fs_info->allocated_ebs,
                                          struct extent_buffer, leak_list);
                pr_err(
        "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n",
                       leaked->start, leaked->len, atomic_read(&leaked->refs),
                       leaked->bflags, btrfs_header_owner(leaked));
                list_del(&leaked->leak_list);
                WARN_ON_ONCE(1);
                kmem_cache_free(extent_buffer_cache, leaked);
        }
        spin_unlock_irqrestore(&fs_info->eb_leak_lock, irq_state);
}
#else
/* Without CONFIG_BTRFS_DEBUG the leak tracking helpers expand to no-ops. */
#define btrfs_leak_debug_add_eb(eb)                        do {} while (0)
#define btrfs_leak_debug_del_eb(eb)                        do {} while (0)
#endif

/*
 * Structure to record info about the bio being assembled, and other info like
 * how many bytes are there before stripe/ordered extent boundary.
 */
struct btrfs_bio_ctrl {
        /*
         * The bio currently being assembled.  Reset to NULL by
         * submit_one_bio()/submit_write_bio() once the bio has been handed
         * off.
         */
        struct btrfs_bio *bbio;
        /*
         * For reads, any value other than BTRFS_COMPRESS_NONE makes
         * submit_one_bio() use btrfs_submit_compressed_read().
         */
        enum btrfs_compression_type compress_type;
        /* Bytes remaining before the stripe/ordered extent boundary. */
        u32 len_to_oe_boundary;
        /* NOTE(review): presumably the op/flags used for new bios -- confirm. */
        blk_opf_t opf;
        /* NOTE(review): presumably the completion callback for new bios -- confirm. */
        btrfs_bio_end_io_t end_io_func;
        /* NOTE(review): presumably the writeback control of the caller, if any -- confirm. */
        struct writeback_control *wbc;
};

/*
 * Submit the bio held in @bio_ctrl, if there is one.
 *
 * Reads with a compression type other than BTRFS_COMPRESS_NONE go through
 * btrfs_submit_compressed_read(); everything else goes through
 * btrfs_submit_bio().  On return @bio_ctrl no longer references the bio.
 */
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
{
        struct btrfs_bio *pending = bio_ctrl->bbio;

        /* Nothing assembled yet */
        if (!pending)
                return;

        /* An empty bio must never get here; callers add a range first */
        ASSERT(pending->bio.bi_iter.bi_size);

        if (btrfs_op(&pending->bio) != BTRFS_MAP_READ ||
            bio_ctrl->compress_type == BTRFS_COMPRESS_NONE)
                btrfs_submit_bio(pending, 0);
        else
                btrfs_submit_compressed_read(pending);

        /* Ownership has moved to the end_io handler */
        bio_ctrl->bbio = NULL;
}

/*
 * Submit the bio held in @bio_ctrl, or fail it when @ret is non-zero.
 *
 * @ret:  0 to submit the bio normally, otherwise a negative errno which is
 *        converted to a block status and used to end the bio.
 */
static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret)
{
        struct btrfs_bio *pending = bio_ctrl->bbio;

        if (!pending)
                return;

        if (!ret) {
                submit_one_bio(bio_ctrl);
                return;
        }

        ASSERT(ret < 0);
        btrfs_bio_end_io(pending, errno_to_blk_status(ret));
        /* Ownership has moved to the end_io handler */
        bio_ctrl->bbio = NULL;
}

/*
 * Create the slab cache used for struct extent_buffer.
 *
 * Return 0 on success, -ENOMEM if the cache could not be created.
 */
int __init extent_buffer_init_cachep(void)
{
        extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
                                                sizeof(struct extent_buffer), 0, 0,
                                                NULL);

        return extent_buffer_cache ? 0 : -ENOMEM;
}

/* Destroy the slab cache created by extent_buffer_init_cachep(). */
void __cold extent_buffer_free_cachep(void)
{
        /*
         * Wait for outstanding RCU callbacks first, so that any delayed
         * frees have completed before the cache is destroyed.
         */
        rcu_barrier();

        kmem_cache_destroy(extent_buffer_cache);
}

/*
 * Call clear_page_dirty_for_io() on every page cache page of @inode that
 * covers the byte range [@start, @end] (@end inclusive).
 *
 * Every page in the range must be present in the page cache; a missing page
 * is treated as a fatal bug.
 */
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
        const unsigned long last = end >> PAGE_SHIFT;
        unsigned long cur;

        for (cur = start >> PAGE_SHIFT; cur <= last; cur++) {
                struct page *page = find_get_page(inode->i_mapping, cur);

                BUG_ON(!page); /* Pages should be in the extent_io_tree */
                clear_page_dirty_for_io(page);
                /* Drop the reference taken by find_get_page() */
                put_page(page);
        }
}

/*
 * Apply the operations selected in @page_ops to the folio containing @page,
 * for the byte range [@start, @end] (@end inclusive).
 *
 * The range must be non-empty and shorter than U32_MAX bytes.  PAGE_UNLOCK
 * is skipped for @locked_page.
 */
static void process_one_page(struct btrfs_fs_info *fs_info,
                             struct page *page, struct page *locked_page,
                             unsigned long page_ops, u64 start, u64 end)
{
        struct folio *folio = page_folio(page);
        const u64 range_bytes = end + 1 - start;
        u32 len;

        ASSERT(range_bytes != 0 && range_bytes < U32_MAX);
        len = range_bytes;

        if (page_ops & PAGE_SET_ORDERED)
                btrfs_folio_clamp_set_ordered(fs_info, folio, start, len);

        if (page_ops & PAGE_START_WRITEBACK) {
                /* Dirty is cleared before writeback is set */
                btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
                btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
        }

        if (page_ops & PAGE_END_WRITEBACK)
                btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);

        if ((page_ops & PAGE_UNLOCK) && page != locked_page)
                btrfs_folio_end_writer_lock(fs_info, folio, start, len);
}

/*
 * Run process_one_page() with @page_ops on every folio that
 * filemap_get_folios_contig() returns for the byte range [@start, @end] of
 * @mapping.
 *
 * NOTE(review): the loop only advances through the index updated by
 * filemap_get_folios_contig(); confirm progress is guaranteed when a lookup
 * returns no folios.
 */
static void __process_pages_contig(struct address_space *mapping,
                                   struct page *locked_page, u64 start, u64 end,
                                   unsigned long page_ops)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
        const pgoff_t last = end >> PAGE_SHIFT;
        pgoff_t next = start >> PAGE_SHIFT;
        struct folio_batch fbatch;

        folio_batch_init(&fbatch);
        while (next <= last) {
                int nr_found;
                int i = 0;

                nr_found = filemap_get_folios_contig(mapping, &next, last,
                                                     &fbatch);
                while (i < nr_found) {
                        struct folio *folio = fbatch.folios[i];

                        process_one_page(fs_info, &folio->page, locked_page,
                                         page_ops, start, end);
                        i++;
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }
}

/*
 * Unlock the pages covering [@start, @end] of @inode, leaving @locked_page
 * locked.
 *
 * If the range sits entirely on @locked_page there is nothing to do.
 */
static noinline void __unlock_for_delalloc(struct inode *inode,
                                           struct page *locked_page,
                                           u64 start, u64 end)
{
        const unsigned long first = start >> PAGE_SHIFT;
        const unsigned long last = end >> PAGE_SHIFT;

        ASSERT(locked_page);

        /* Single-page range that is @locked_page itself */
        if (first == locked_page->index && last == first)
                return;

        __process_pages_contig(inode->i_mapping, locked_page, start, end,
                               PAGE_UNLOCK);
}

/*
 * Take the writer lock on every page covering [@start, @end] of @inode,
 * except @locked_page.
 *
 * Each page must still be dirty and still belong to the inode's mapping once
 * locked.
 *
 * Return 0 if all pages were locked (or the range lies entirely on
 * @locked_page).  Return -EAGAIN if a lookup found no folio, a writer lock
 * could not be started, or a page failed the dirty/mapping check; in that
 * case the pages locked so far have been unlocked again.
 */
static noinline int lock_delalloc_pages(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start,
                                        u64 end)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct address_space *mapping = inode->i_mapping;
        pgoff_t start_index = start >> PAGE_SHIFT;
        pgoff_t end_index = end >> PAGE_SHIFT;
        pgoff_t index = start_index;
        /* End of the last page locked here; stays at @start until one is locked */
        u64 processed_end = start;
        struct folio_batch fbatch;

        /* Single-page range that is @locked_page itself: nothing to lock */
        if (index == locked_page->index && index == end_index)
                return 0;

        folio_batch_init(&fbatch);
        while (index <= end_index) {
                unsigned int found_folios, i;

                found_folios = filemap_get_folios_contig(mapping, &index,
                                end_index, &fbatch);
                /* Nothing found at @index: give up on the whole range */
                if (found_folios == 0)
                        goto out;

                for (i = 0; i < found_folios; i++) {
                        struct folio *folio = fbatch.folios[i];
                        struct page *page = folio_page(folio, 0);
                        /* Length of the whole [@start, @end] range, not of this page */
                        u32 len = end + 1 - start;

                        /* Skipped entirely, so processed_end is not advanced for it */
                        if (page == locked_page)
                                continue;

                        if (btrfs_folio_start_writer_lock(fs_info, folio, start,
                                                          len))
                                goto out;

                        /* Drop the lock just taken if the page is no longer usable */
                        if (!PageDirty(page) || page->mapping != mapping) {
                                btrfs_folio_end_writer_lock(fs_info, folio, start,
                                                            len);
                                goto out;
                        }

                        processed_end = page_offset(page) + PAGE_SIZE - 1;
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }

        return 0;
out:
        folio_batch_release(&fbatch);
        /* Undo the locks taken so far, if any page was locked */
        if (processed_end > start)
                __unlock_for_delalloc(inode, locked_page, start, processed_end);
        return -EAGAIN;
}

/*
 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
 * more than the filesystem's max_extent_size (BTRFS_MAX_EXTENT_SIZE when
 * there is no fs_info).
 *
 * @inode:        The inode to search.
 * @locked_page:  A page the search range must at least partially cover; it is
 *                skipped when the pages of the range are locked/unlocked here.
 * @start:        The original start bytenr to search.
 *                Will store the extent range start bytenr.
 * @end:          The original end bytenr of the search range.
 *                Will store the extent range end bytenr.
 *
 * Return true if we find a delalloc range which starts inside the original
 * range, and @start/@end will store the delalloc range start/end.  The pages
 * of that range (other than @locked_page) have been locked by
 * lock_delalloc_pages(); the extent range itself is unlocked again before
 * returning.
 *
 * Return false if we can't find any delalloc range which starts inside the
 * original range, and @start/@end will be the non-delalloc range start/end.
 * Also return false, leaving @start/@end untouched, if locking the pages
 * failed twice in a row.
 */
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
                                    struct page *locked_page, u64 *start,
                                    u64 *end)
{
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
        const u64 orig_start = *start;
        const u64 orig_end = *end;
        /* The sanity tests may not set a valid fs_info. */
        u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
        u64 delalloc_start;
        u64 delalloc_end;
        bool found;
        struct extent_state *cached_state = NULL;
        int ret;
        /* Set once the page locking has already been retried with a small range */
        int loops = 0;

        /* Caller should pass a valid @end to indicate the search range end */
        ASSERT(orig_end > orig_start);

        /* The range should at least cover part of the page */
        ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
                 orig_end <= page_offset(locked_page)));
again:
        /* Step one, find a bunch of delalloc bytes starting at start */
        delalloc_start = *start;
        delalloc_end = 0;
        found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
                                          max_bytes, &cached_state);
        if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
                *start = delalloc_start;

                /* @delalloc_end can be -1, never go beyond @orig_end */
                *end = min(delalloc_end, orig_end);
                free_extent_state(cached_state);
                return false;
        }

        /*
         * start comes from the offset of locked_page.  We have to lock
         * pages in order, so we can't process delalloc bytes before
         * locked_page
         */
        if (delalloc_start < *start)
                delalloc_start = *start;

        /*
         * Make sure to limit the number of pages we try to lock down
         */
        if (delalloc_end + 1 - delalloc_start > max_bytes)
                delalloc_end = delalloc_start + max_bytes - 1;

        /* Step two, lock all the pages after the page that has start */
        ret = lock_delalloc_pages(inode, locked_page,
                                  delalloc_start, delalloc_end);
        ASSERT(!ret || ret == -EAGAIN);
        if (ret == -EAGAIN) {
                /*
                 * Some of the pages are gone, let's avoid looping by
                 * shortening the size of the delalloc range we're searching:
                 * retry once with a single page, then give up.
                 */
                free_extent_state(cached_state);
                cached_state = NULL;
                if (!loops) {
                        max_bytes = PAGE_SIZE;
                        loops = 1;
                        goto again;
                } else {
                        found = false;
                        goto out_failed;
                }
        }

        /* Step three, lock the state bits for the whole range */
        lock_extent(tree, delalloc_start, delalloc_end, &cached_state);

        /* Then test to make sure it is all still delalloc */
        ret = test_range_bit(tree, delalloc_start, delalloc_end,
                             EXTENT_DELALLOC, cached_state);

        unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
        if (!ret) {
                /* No longer all delalloc: drop the page locks and search again */
                __unlock_for_delalloc(inode, locked_page,
                              delalloc_start, delalloc_end);
                cond_resched();
                goto again;
        }
        *start = delalloc_start;
        *end = delalloc_end;
out_failed:
        return found;
}

/*
 * Clear @clear_bits on [@start, @end] in the io tree of @inode, then apply
 * @page_ops to the pages of that range.
 *
 * @locked_page is passed through to the page processing, which does not
 * unlock it.  @cached is handed to clear_extent_bit() as the cached extent
 * state.
 */
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                                  struct page *locked_page,
                                  struct extent_state **cached,
                                  u32 clear_bits, unsigned long page_ops)
{
        struct extent_io_tree *io_tree = &inode->io_tree;

        /* Extent bits first, page state second */
        clear_extent_bit(io_tree, start, end, clear_bits, cached);
        __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
                               start, end, page_ops);
}

/*
 * Decide whether @page passes fs-verity checking.
 *
 * Verification is skipped (returning true) when fs-verity is not active on
 * the inode, when the page is already uptodate, or when @start is at or
 * beyond i_size.  Otherwise the result of fsverity_verify_page() is
 * returned.
 */
static bool btrfs_verify_page(struct page *page, u64 start)
{
        struct inode *host = page->mapping->host;

        if (!fsverity_active(host))
                return true;
        if (PageUptodate(page))
                return true;
        if (start >= i_size_read(host))
                return true;

        return fsverity_verify_page(page);
}

/*
 * Finish a read of [@start, @start + @len) within @page.
 *
 * The range is marked uptodate only if @uptodate is set and the page passes
 * btrfs_verify_page(); otherwise uptodate is cleared for it.  The page is
 * then unlocked, or for subpage mappings the reader accounting for the range
 * is ended.
 */
static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
{
        struct btrfs_fs_info *fs_info = page_to_fs_info(page);
        struct folio *folio = page_folio(page);

        /* The range must not cross the page boundaries */
        ASSERT(page_offset(page) <= start &&
               start + len <= page_offset(page) + PAGE_SIZE);

        if (!uptodate || !btrfs_verify_page(page, start))
                btrfs_folio_clear_uptodate(fs_info, folio, start, len);
        else
                btrfs_folio_set_uptodate(fs_info, folio, start, len);

        if (btrfs_is_subpage(fs_info, page->mapping))
                btrfs_subpage_end_reader(fs_info, folio, start, len);
        else
                unlock_page(page);
}

/*
 * After a write IO is done, we need to:
 *
 * - clear the uptodate bits on error
 * - clear the writeback bits in the extent tree for the range
 * - filio_end_writeback()  if there is no more pending io for the folio
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bbio_data_write(struct btrfs_bio *bbio)
{
        struct btrfs_fs_info *fs_info = bbio->fs_info;
        struct bio *bio = &bbio->bio;
        int error = blk_status_to_errno(bio->bi_status);
        struct folio_iter fi;
        const u32 sectorsize = fs_info->sectorsize;

        ASSERT(!bio_flagged(bio, BIO_CLONED));
        bio_for_each_folio_all(fi, bio) {
                struct folio *folio = fi.folio;
                u64 start = folio_pos(folio) + fi.offset;
                u32 len = fi.length;

                /* Only order 0 (single page) folios are allowed for data. */
                ASSERT(folio_order(folio) == 0);

                /* Our read/write should always be sector aligned. */
                if (!IS_ALIGNED(fi.offset, sectorsize))
                        btrfs_err(fs_info,
                "partial page write in btrfs with offset %zu and length %zu",
                                  fi.offset, fi.length);
                else if (!IS_ALIGNED(fi.length, sectorsize))
                        btrfs_info(fs_info,
                "incomplete page write with offset %zu and length %zu",
                                   fi.offset, fi.length);

                btrfs_finish_ordered_extent(bbio->ordered,
                                folio_page(folio, 0), start, len, !error);
                if (error)
                        mapping_set_error(folio->mapping, error);
                btrfs_folio_clear_writeback(fs_info, folio, start, len);
        }

        bio_put(bio);
}

/*
 * Record previously processed extent range
 *
 * For endio_readpage_release_extent() to handle a full extent range, reducing
 * the extent io operations.
 */
struct processed_extent {
        /* Inode the recorded range belongs to; NULL until a first range is recorded */
        struct btrfs_inode *inode;
        /* Start of the range in @inode */
        u64 start;
        /* End of the range in @inode */
        u64 end;
        /* Uptodate status of the recorded range; a new range must match it to be merged */
        bool uptodate;
};

/*
 * Try to release the processed extent range.
 *
 * The range recorded in @processed is only unlocked once the new range can no
 * longer be merged into it, i.e. when any of @inode, @uptodate or the file
 * offsets stop lining up with the recorded range.  Otherwise the recorded
 * range is simply extended and nothing is unlocked yet.
 *
 * Passing @inode == NULL never matches a recorded inode, so it forces the
 * recorded range (if any) to be released.
 */
static void endio_readpage_release_extent(struct processed_extent *processed,
                              struct btrfs_inode *inode, u64 start, u64 end,
                              bool uptodate)
{
        struct extent_state *cached_state = NULL;

        /* Nothing recorded yet means there is nothing to merge or release. */
        if (processed->inode) {
                /*
                 * A range can be merged when it continues the recorded one:
                 *
                 * - A bio can be merged as long as the on-disk bytenr is
                 *   contiguous, so it may carry pages of other inodes; the
                 *   inode therefore has to be compared.
                 * - A multi-page bvec can cover more than the current page,
                 *   hence the "processed->end + 1 >= start" overlap check
                 *   rather than a strict equality.
                 */
                if (processed->inode == inode &&
                    processed->uptodate == uptodate &&
                    processed->end + 1 >= start &&
                    end >= processed->end) {
                        /* Contiguous: just move the recorded end forward. */
                        processed->end = end;
                        return;
                }

                /* Not contiguous: release the recorded range now. */
                unlock_extent(&processed->inode->io_tree, processed->start,
                              processed->end, &cached_state);
        }

        /* Start recording the current range. */
        processed->inode = inode;
        processed->start = start;
        processed->end = end;
        processed->uptodate = uptodate;
}

/*
 * Prepare a locked page for a read.
 *
 * For the subpage case this registers a reader covering the whole page on
 * the folio; for the regular case there is nothing to do.
 */
static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{
        struct folio *folio = page_folio(page);

        ASSERT(folio_test_locked(folio));

        if (btrfs_is_subpage(fs_info, folio->mapping)) {
                /* Subpage folios are expected to have private data attached. */
                ASSERT(folio_test_private(folio));
                btrfs_subpage_start_reader(fs_info, folio, page_offset(page),
                                           PAGE_SIZE);
        }
}

/*
 * End io handler for data reads.
 *
 * After a data read IO is done, we need to:
 *
 * - clear the uptodate bits on error
 * - set the uptodate bits if things worked
 * - set the folio up to date if all extents in the tree are uptodate
 * - clear the lock bit in the extent tree
 * - unlock the folio if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 *
 * The function walks every folio of the bio, zeroes any part of a successful
 * read that lies beyond i_size, finishes the page read and batches the extent
 * unlocking through endio_readpage_release_extent().  The bio reference is
 * dropped at the end.
 */
static void end_bbio_data_read(struct btrfs_bio *bbio)
{
        struct btrfs_fs_info *fs_info = bbio->fs_info;
        struct bio *bio = &bbio->bio;
        /* Accumulates contiguous ranges so they can be unlocked in one go. */
        struct processed_extent processed = { 0 };
        struct folio_iter fi;
        const u32 sectorsize = fs_info->sectorsize;

        ASSERT(!bio_flagged(bio, BIO_CLONED));
        bio_for_each_folio_all(fi, &bbio->bio) {
                /* The whole bio shares one status, so it applies to every folio. */
                bool uptodate = !bio->bi_status;
                struct folio *folio = fi.folio;
                struct inode *inode = folio->mapping->host;
                u64 start;
                u64 end;
                u32 len;

                /* For now only order 0 folios are supported for data. */
                ASSERT(folio_order(folio) == 0);
                btrfs_debug(fs_info,
                        "%s: bi_sector=%llu, err=%d, mirror=%u",
                        __func__, bio->bi_iter.bi_sector, bio->bi_status,
                        bbio->mirror_num);

                /*
                 * We always issue full-sector reads, but if some block in a
                 * folio fails to read, blk_update_request() will advance
                 * bv_offset and adjust bv_len to compensate.  Log an error
                 * for unaligned offsets, and an informational message if
                 * offset + length does not end on a sector boundary.
                 */
                if (!IS_ALIGNED(fi.offset, sectorsize))
                        btrfs_err(fs_info,
                "partial page read in btrfs with offset %zu and length %zu",
                                  fi.offset, fi.length);
                else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize))
                        btrfs_info(fs_info,
                "incomplete page read with offset %zu and length %zu",
                                   fi.offset, fi.length);

                /* File range [start, end] (inclusive) covered by this segment. */
                start = folio_pos(folio) + fi.offset;
                end = start + fi.length - 1;
                len = fi.length;

                if (likely(uptodate)) {
                        loff_t i_size = i_size_read(inode);
                        /* Index of the folio that contains i_size. */
                        pgoff_t end_index = i_size >> folio_shift(folio);

                        /*
                         * Zero out the remaining part if this range straddles
                         * i_size.
                         *
                         * Here we should only zero the range inside the folio,
                         * not touch anything else.
                         *
                         * NOTE: i_size is exclusive while end is inclusive.
                         */
                        if (folio_index(folio) == end_index && i_size <= end) {
                                /* Start at i_size or at the range start, whichever is later. */
                                u32 zero_start = max(offset_in_folio(folio, i_size),
                                                     offset_in_folio(folio, start));
                                u32 zero_len = offset_in_folio(folio, end) + 1 -
                                               zero_start;

                                folio_zero_range(folio, zero_start, zero_len);
                        }
                }

                /* Update page status and unlock. */
                end_page_read(folio_page(folio, 0), uptodate, start, len);
                endio_readpage_release_extent(&processed, BTRFS_I(inode),
                                              start, end, uptodate);
        }
        /* Release the last extent: a NULL inode never merges, forcing the unlock. */
        endio_readpage_release_extent(&processed, NULL, 0, 0, false);
        bio_put(bio);
}

/*
 * Populate every free slot in a provided array with folios.
 *
 * @nr_folios:   number of slots in @folio_array
 * @folio_array: the array to fill with folios; any existing non-NULL entries
 *               in the array will be skipped
 * @extra_gfp:   the extra GFP flags for the allocation (ORed with GFP_NOFS)
 *
 * Return: 0        if all folios were able to be allocated;
 *         -ENOMEM  otherwise; every non-NULL entry of the array (including
 *                  entries that were already populated on entry) has its
 *                  reference dropped and its slot set to NULL
 */
int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
                            gfp_t extra_gfp)
{
        for (unsigned int i = 0; i < nr_folios; i++) {
                if (folio_array[i])
                        continue;
                /* Order 0: each slot gets a single-page folio. */
                folio_array[i] = folio_alloc(GFP_NOFS | extra_gfp, 0);
                if (!folio_array[i])
                        goto error;
        }
        return 0;
error:
        for (unsigned int i = 0; i < nr_folios; i++) {
                if (!folio_array[i])
                        continue;
                folio_put(folio_array[i]);
                /*
                 * Clear the slot so the caller is not left holding a dangling
                 * pointer that could be put or dereferenced a second time.
                 */
                folio_array[i] = NULL;
        }
        return -ENOMEM;
}

/*
 * Populate every free slot in a provided array with pages.
 *
 * @nr_pages:   number of pages to allocate
 * @page_array: the array to fill with pages; any existing non-NULL entries in
 *              the array will be skipped
 * @extra_gfp:  the extra GFP flags for the allocation (ORed with GFP_NOFS)
 *
 * The bulk allocator is called repeatedly until it reports @nr_pages
 * populated entries.  A call that makes no progress is treated as failure.
 *
 * Return: 0        if all pages were able to be allocated;
 *         -ENOMEM  otherwise; the entries reported as populated so far are
 *                  freed and their array slots set to NULL
 */
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
                           gfp_t extra_gfp)
{
        const gfp_t gfp = GFP_NOFS | extra_gfp;
        unsigned int allocated = 0;

        while (allocated < nr_pages) {
                const unsigned int prev = allocated;

                allocated = alloc_pages_bulk_array(gfp, nr_pages, page_array);
                if (likely(allocated != prev))
                        continue;

                /* No progress since the previous round: undo and give up. */
                for (int idx = 0; idx < allocated; idx++) {
                        __free_page(page_array[idx]);
                        page_array[idx] = NULL;
                }
                return -ENOMEM;
        }

        return 0;
}

/*
 * Allocate the folios backing an extent buffer.
 *
 * Pages are allocated through btrfs_alloc_page_array() and then recorded in
 * @eb->folios.  For now every folio is order 0 (a single page), which is why
 * the folio size and shift are set to the page size and shift.
 *
 * Return: 0 on success, or the negative error from the page allocation.
 */
static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp)
{
        struct page *pages[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
        int nr = num_extent_pages(eb);
        int err;
        int i;

        err = btrfs_alloc_page_array(nr, pages, extra_gfp);
        if (err < 0)
                return err;

        /* Hand each freshly allocated page over to the extent buffer. */
        i = 0;
        while (i < nr) {
                eb->folios[i] = page_folio(pages[i]);
                i++;
        }

        eb->folio_size = PAGE_SIZE;
        eb->folio_shift = PAGE_SHIFT;

        return 0;
}

/*
 * Check whether a new range can be appended to the bio in @bio_ctrl.
 *
 * @bio_ctrl:    control structure holding the bio under construction
 * @page:        page that is about to be added
 * @disk_bytenr: logical bytenr of the new range
 * @pg_offset:   offset of the new range inside @page
 *
 * Return: true if the range continues the current bio, false otherwise.
 */
static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
                                struct page *page, u64 disk_bytenr,
                                unsigned int pg_offset)
{
        struct bio *bio = &bio_ctrl->bbio->bio;
        struct bio_vec *last_bvec = bio_last_bvec_all(bio);
        const sector_t sector = disk_bytenr >> SECTOR_SHIFT;

        /*
         * For compression, all IO should have its logical bytenr set to the
         * starting bytenr of the compressed extent, so only the start sector
         * of the bio is compared.
         */
        if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
                return bio->bi_iter.bi_sector == sector;

        /*
         * For the uncompressed case three conditions must hold:
         *
         * 1) The pages belong to the same inode.
         *    This is implied by the call chain.
         *
         * 2) The range has an adjacent logical bytenr (checked here).
         *
         * 3) The range has an adjacent file offset (checked below).
         *    This is required for the usage of btrfs_bio->file_offset.
         */
        if (bio_end_sector(bio) != sector)
                return false;

        return page_offset(last_bvec->bv_page) + last_bvec->bv_offset +
               last_bvec->bv_len == page_offset(page) + pg_offset;
}

/*
 * Allocate a new btrfs_bio and install it as @bio_ctrl->bbio.
 *
 * @inode:       inode the IO belongs to
 * @bio_ctrl:    control structure that receives the new bio; its opf and
 *               end_io_func are used to set the bio up
 * @disk_bytenr: logical bytenr the bio starts at
 * @file_offset: file offset the bio starts at
 *
 * For writeback (@bio_ctrl->wbc set) the bio is additionally limited to the
 * ordered extent found at @file_offset, if any, and is bound to a block
 * device so that cgroup writeback accounting can be initialized.
 */
static void alloc_new_bio(struct btrfs_inode *inode,
                          struct btrfs_bio_ctrl *bio_ctrl,
                          u64 disk_bytenr, u64 file_offset)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_bio *bbio;

        bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info,
                               bio_ctrl->end_io_func, NULL);
        bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
        bbio->inode = inode;
        bbio->file_offset = file_offset;
        bio_ctrl->bbio = bbio;
        /* U32_MAX means "no ordered extent boundary is being tracked". */
        bio_ctrl->len_to_oe_boundary = U32_MAX;

        /* Limit data write bios to the ordered boundary. */
        if (bio_ctrl->wbc) {
                struct btrfs_ordered_extent *ordered;

                ordered = btrfs_lookup_ordered_extent(inode, file_offset);
                if (ordered) {
                        /*
                         * Clamp in 64 bits.  The remaining length is a u64;
                         * comparing it as u32 would truncate it before the
                         * comparison, turning the clamp into a no-op and
                         * silently wrapping large values.
                         */
                        bio_ctrl->len_to_oe_boundary = min_t(u64, U32_MAX,
                                        ordered->file_offset +
                                        ordered->disk_num_bytes - file_offset);
                        /*
                         * NOTE(review): the lookup presumably returns a
                         * reference that is now owned by the bbio - confirm.
                         */
                        bbio->ordered = ordered;
                }

                /*
                 * Pick the last added device to support cgroup writeback.  For
                 * multi-device file systems this means blk-cgroup policies have
                 * to always be set on the last added/replaced device.
                 * This is a bit odd but has been like that for a long time.
                 */
                bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
                wbc_init_bio(bio_ctrl->wbc, &bbio->bio);
        }
}

/*
 * Add (part of) a page to the bio being assembled in @bio_ctrl.
 *
 * @bio_ctrl:    control structure holding the bio under construction
 * @disk_bytenr: logical bytenr where the IO will be
 * @page:        page to add to the bio
 * @size:        portion of page that we want to read or write
 * @pg_offset:   offset of the range inside @page; also used to check whether
 *               we are adding a page contiguous to the previous one
 *
 * This will either add the page into the existing @bio_ctrl->bbio, or allocate
 * a new one in @bio_ctrl->bbio.  If the range is not contiguous with the
 * current bio, the current bio is submitted first.
 * The mirror number for this IO should already be initialized in
 * @bio_ctrl->mirror_num.
 */
static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
                               u64 disk_bytenr, struct page *page,
                               size_t size, unsigned long pg_offset)
{
        struct btrfs_inode *inode = page_to_inode(page);

        ASSERT(pg_offset + size <= PAGE_SIZE);
        ASSERT(bio_ctrl->end_io_func);

        /* A non-contiguous range cannot share the current bio: flush it. */
        if (bio_ctrl->bbio &&
            !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset))
                submit_one_bio(bio_ctrl);

        /* Each iteration adds as much of the remaining range as one bio takes. */
        do {
                u32 len = size;

                /* Allocate new bio if needed */
                if (!bio_ctrl->bbio) {
                        alloc_new_bio(inode, bio_ctrl, disk_bytenr,
                                      page_offset(page) + pg_offset);
                }

                /* Cap to the current ordered extent boundary if there is one. */
                if (len > bio_ctrl->len_to_oe_boundary) {
                        ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE);
                        ASSERT(is_data_inode(&inode->vfs_inode));
                        len = bio_ctrl->len_to_oe_boundary;
                }

                if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) {
                        /*
                         * bio full: move on to a new one.  Nothing was
                         * consumed, so the loop retries the same range with
                         * a freshly allocated bio.
                         */
                        submit_one_bio(bio_ctrl);
                        continue;
                }

                if (bio_ctrl->wbc)
                        wbc_account_cgroup_owner(bio_ctrl->wbc, page, len);

                /* Advance past the part that was just added. */
                size -= len;
                pg_offset += len;
                disk_bytenr += len;

                /*
                 * len_to_oe_boundary defaults to U32_MAX, which isn't page or
                 * sector aligned.  alloc_new_bio() then sets it to the end of
                 * our ordered extent for writes into zoned devices.
                 *
                 * When len_to_oe_boundary is tracking an ordered extent, we
                 * trust the ordered extent code to align things properly, and
                 * the check above to cap our write to the ordered extent
                 * boundary is correct.
                 *
                 * When len_to_oe_boundary is U32_MAX, the cap above would
                 * result in a 4095 byte IO for the last page right before
                 * we hit the bio limit of UINT_MAX.  bio_add_page() has all
                 * the checks required to make sure we don't overflow the bio,
                 * and we should just ignore len_to_oe_boundary completely
                 * unless we're using it to track an ordered extent.
                 *
                 * It's pretty hard to make a bio sized U32_MAX, but it can
                 * happen when the page cache is able to feed us contiguous
                 * pages for large extents.
                 */
                if (bio_ctrl->len_to_oe_boundary != U32_MAX)
                        bio_ctrl->len_to_oe_boundary -= len;

                /* Ordered extent boundary: move on to a new bio. */
                if (bio_ctrl->len_to_oe_boundary == 0)
                        submit_one_bio(bio_ctrl);
        } while (size);
}

/*
 * Attach private data for an extent buffer to a folio.
 *
 * @eb:       extent buffer the folio belongs to
 * @folio:    folio to attach to
 * @prealloc: preallocated subpage structure, may be NULL; only used when
 *            nodesize is smaller than the page size
 *
 * Return: 0 on success, or the error from btrfs_attach_subpage().
 */
static int attach_extent_buffer_folio(struct extent_buffer *eb,
                                      struct folio *folio,
                                      struct btrfs_subpage *prealloc)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;

        /*
         * If the page is mapped to btree inode, we should hold the private
         * lock to prevent race.
         * For cloned or dummy extent buffers, their pages are not mapped and
         * will not race with any other ebs.
         */
        if (folio->mapping)
                lockdep_assert_held(&folio->mapping->i_private_lock);

        if (fs_info->nodesize < PAGE_SIZE) {
                /* Already mapped, the preallocated structure is not needed. */
                if (folio_test_private(folio)) {
                        btrfs_free_subpage(prealloc);
                        return 0;
                }

                /* No preallocated memory: do a new allocation to attach. */
                if (!prealloc)
                        return btrfs_attach_subpage(fs_info, folio,
                                                    BTRFS_SUBPAGE_METADATA);

                /* Use the preallocated memory for subpage. */
                folio_attach_private(folio, prealloc);
                return 0;
        }

        /* Regular case: the folio's private data is the extent buffer itself. */
        if (folio_test_private(folio))
                WARN_ON(folio_get_private(folio) != eb);
        else
                folio_attach_private(folio, eb);

        return 0;
}

/* Page based wrapper around set_folio_extent_mapped(). */
int set_page_extent_mapped(struct page *page)
{
        struct folio *folio = page_folio(page);

        return set_folio_extent_mapped(folio);
}

/*
 * Make sure a mapped folio has btrfs private data attached.
 *
 * Nothing is done if the folio already has private data.  Otherwise a subpage
 * structure is attached for the subpage case, or the EXTENT_FOLIO_PRIVATE
 * marker for the regular case.
 *
 * Return: 0 on success, or the error from btrfs_attach_subpage().
 */
int set_folio_extent_mapped(struct folio *folio)
{
        struct btrfs_fs_info *fs_info;

        ASSERT(folio->mapping);

        if (!folio_test_private(folio)) {
                fs_info = folio_to_fs_info(folio);

                if (btrfs_is_subpage(fs_info, folio->mapping))
                        return btrfs_attach_subpage(fs_info, folio,
                                                    BTRFS_SUBPAGE_DATA);

                folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
        }

        return 0;
}

/*
 * Detach the btrfs private data from the folio of a mapped page.
 *
 * Nothing is done if the folio has no private data attached.
 */
void clear_page_extent_mapped(struct page *page)
{
        struct folio *folio = page_folio(page);
        struct btrfs_fs_info *fs_info;

        ASSERT(page->mapping);

        if (!folio_test_private(folio))
                return;

        fs_info = page_to_fs_info(page);

        /* Subpage folios carry a subpage structure, others a plain marker. */
        if (btrfs_is_subpage(fs_info, page->mapping))
                btrfs_detach_subpage(fs_info, folio);
        else
                folio_detach_private(folio);
}

/*
 * Get the extent map covering @start, using @em_cached as a one entry cache.
 *
 * If the cached map is still in the tree and covers @start it is returned
 * with an extra reference.  Otherwise the cached map is dropped, a new map is
 * looked up and, on success, stored in @em_cached with its own reference.
 *
 * Return: the extent map (with a reference for the caller), or an ERR_PTR
 *         from btrfs_get_extent().
 */
static struct extent_map *__get_extent_map(struct inode *inode, struct page *page,
                 u64 start, u64 len, struct extent_map **em_cached)
{
        struct extent_map *cached;
        struct extent_map *em;

        ASSERT(em_cached);

        cached = *em_cached;
        if (cached) {
                /* Cache hit: hand out another reference to the cached map. */
                if (extent_map_in_tree(cached) && start >= cached->start &&
                    start < extent_map_end(cached)) {
                        refcount_inc(&cached->refs);
                        return cached;
                }

                /* Stale or non-covering entry: drop it from the cache. */
                free_extent_map(cached);
                *em_cached = NULL;
        }

        em = btrfs_get_extent(BTRFS_I(inode), page, start, len);
        if (IS_ERR(em))
                return em;

        /* The cache slot must be empty at this point. */
        BUG_ON(*em_cached);
        /* One reference for the cache, the original one for the caller. */
        refcount_inc(&em->refs);
        *em_cached = em;

        return em;
}
/*
 * Basic readpage implementation.
 *
 * Walks the page block by block, looks up the extent map for each range and
 * either zeroes it (holes, preallocated extents, beyond i_size), leaves it as
 * is (inline extents, already copied by the get_extent function) or adds it
 * to the read bio in @bio_ctrl.
 *
 * Locked extent state structs are inserted into the tree that are removed
 * when the IO is done (by the end_io handlers).  Ranges that need no IO are
 * unlocked directly here.
 * XXX JDM: This needs looking at to ensure proper page locking
 *
 * @page:          locked page to read
 * @em_cached:     one entry extent map cache shared between calls
 * @bio_ctrl:      control structure holding the bio under construction
 * @prev_em_start: start of the extent map used for the previous range, may be
 *                 NULL; used to force separate bios for ranges sharing one
 *                 compressed extent
 *
 * Return 0 on success, otherwise return error.
 */
static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
                      struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
{
        struct inode *inode = page->mapping->host;
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        u64 start = page_offset(page);
        const u64 end = start + PAGE_SIZE - 1;
        u64 cur = start;
        u64 extent_offset;
        u64 last_byte = i_size_read(inode);
        u64 block_start;
        struct extent_map *em;
        int ret = 0;
        size_t pg_offset = 0;
        size_t iosize;
        size_t blocksize = fs_info->sectorsize;
        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;

        ret = set_page_extent_mapped(page);
        if (ret < 0) {
                /* Nothing was submitted: release the whole page range here. */
                unlock_extent(tree, start, end, NULL);
                unlock_page(page);
                return ret;
        }

        /* The page contains i_size: zero the tail of the page beyond it. */
        if (page->index == last_byte >> PAGE_SHIFT) {
                size_t zero_offset = offset_in_page(last_byte);

                if (zero_offset) {
                        iosize = PAGE_SIZE - zero_offset;
                        memzero_page(page, zero_offset, iosize);
                }
        }
        bio_ctrl->end_io_func = end_bbio_data_read;
        begin_page_read(fs_info, page);
        while (cur <= end) {
                enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
                bool force_bio_submit = false;
                u64 disk_bytenr;

                ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
                /* Everything from here on is beyond i_size: zero it and stop. */
                if (cur >= last_byte) {
                        iosize = PAGE_SIZE - pg_offset;
                        memzero_page(page, pg_offset, iosize);
                        unlock_extent(tree, cur, cur + iosize - 1, NULL);
                        end_page_read(page, true, cur, iosize);
                        break;
                }
                em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached);
                if (IS_ERR(em)) {
                        /* Fail the rest of the page and release its range. */
                        unlock_extent(tree, cur, end, NULL);
                        end_page_read(page, false, cur, end + 1 - cur);
                        return PTR_ERR(em);
                }
                extent_offset = cur - em->start;
                BUG_ON(extent_map_end(em) <= cur);
                BUG_ON(end < cur);

                compress_type = extent_map_compression(em);

                /* Handle up to the end of the extent map or of the page. */
                iosize = min(extent_map_end(em) - cur, end - cur + 1);
                iosize = ALIGN(iosize, blocksize);
                /* Compressed reads always start at the beginning of the extent. */
                if (compress_type != BTRFS_COMPRESS_NONE)
                        disk_bytenr = em->block_start;
                else
                        disk_bytenr = em->block_start + extent_offset;
                block_start = em->block_start;
                /* Preallocated extents are handled like holes. */
                if (em->flags & EXTENT_FLAG_PREALLOC)
                        block_start = EXTENT_MAP_HOLE;

                /*
                 * If we have a file range that points to a compressed extent
                 * and it's followed by a consecutive file range that points
                 * to the same compressed extent (possibly with a different
                 * offset and/or length, so it either points to the whole extent
                 * or only part of it), we must make sure we do not submit a
                 * single bio to populate the pages for the 2 ranges because
                 * this makes the compressed extent read zero out the pages
                 * belonging to the 2nd range. Imagine the following scenario:
                 *
                 *  File layout
                 *  [0 - 8K]                     [8K - 24K]
                 *    |                               |
                 *    |                               |
                 * points to extent X,         points to extent X,
                 * offset 4K, length of 8K     offset 0, length 16K
                 *
                 * [extent X, compressed length = 4K uncompressed length = 16K]
                 *
                 * If the bio to read the compressed extent covers both ranges,
                 * it will decompress extent X into the pages belonging to the
                 * first range and then it will stop, zeroing out the remaining
                 * pages that belong to the other range that points to extent X.
                 * So here we make sure we submit 2 bios, one for the first
                 * range and another one for the second range. Both will target
                 * the same physical extent from disk, but we can't currently
                 * make the compressed bio endio callback populate the pages
                 * for both ranges because each compressed bio is tightly
                 * coupled with a single extent map, and each range can have
                 * an extent map with a different offset value relative to the
                 * uncompressed data of our extent and different lengths. This
                 * is a corner case so we prioritize correctness over
                 * non-optimal behavior (submitting 2 bios for the same extent).
                 */
                if (compress_type != BTRFS_COMPRESS_NONE &&
                    prev_em_start && *prev_em_start != (u64)-1 &&
                    *prev_em_start != em->start)
                        force_bio_submit = true;

                if (prev_em_start)
                        *prev_em_start = em->start;

                /* Everything needed from the map has been copied to locals. */
                free_extent_map(em);
                em = NULL;

                /* we've found a hole, just zero and go on */
                if (block_start == EXTENT_MAP_HOLE) {
                        memzero_page(page, pg_offset, iosize);

                        unlock_extent(tree, cur, cur + iosize - 1, NULL);
                        end_page_read(page, true, cur, iosize);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
                }
                /* the get_extent function already copied into the page */
                if (block_start == EXTENT_MAP_INLINE) {
                        unlock_extent(tree, cur, cur + iosize - 1, NULL);
                        end_page_read(page, true, cur, iosize);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
                }

                /* A bio only carries one compression type: flush on change. */
                if (bio_ctrl->compress_type != compress_type) {
                        submit_one_bio(bio_ctrl);
                        bio_ctrl->compress_type = compress_type;
                }

                if (force_bio_submit)
                        submit_one_bio(bio_ctrl);
                submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
                                   pg_offset);
                cur = cur + iosize;
                pg_offset += iosize;
        }

        return 0;
}

/*
 * Read a single folio.
 *
 * Locks the page range (flushing ordered extents in it), builds the read bio
 * through btrfs_do_readpage() and submits it.
 *
 * Return: the result of btrfs_do_readpage().
 */
int btrfs_read_folio(struct file *file, struct folio *folio)
{
        struct page *page = &folio->page;
        struct btrfs_inode *inode = page_to_inode(page);
        const u64 range_start = page_offset(page);
        const u64 range_end = range_start + PAGE_SIZE - 1;
        struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
        struct extent_map *cached_em = NULL;
        int err;

        btrfs_lock_and_flush_ordered_range(inode, range_start, range_end, NULL);

        err = btrfs_do_readpage(page, &cached_em, &bio_ctrl, NULL);
        free_extent_map(cached_em);

        /*
         * Submit unconditionally: even if btrfs_do_readpage() failed, the
         * bio assembled so far has to be submitted to do the cleanup.
         */
        submit_one_bio(&bio_ctrl);

        return err;
}

/*
 * Read a batch of pages covering the file range [@start, @end].
 *
 * The range is locked once for the whole batch, then every page is passed to
 * btrfs_do_readpage() and has one page reference dropped afterwards.  The
 * result of btrfs_do_readpage() is not checked here.
 */
static inline void contiguous_readpages(struct page *pages[], int nr_pages,
                                        u64 start, u64 end,
                                        struct extent_map **em_cached,
                                        struct btrfs_bio_ctrl *bio_ctrl,
                                        u64 *prev_em_start)
{
        struct btrfs_inode *inode = page_to_inode(pages[0]);

        ASSERT(em_cached);

        btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);

        for (int i = 0; i < nr_pages; i++) {
                struct page *page = pages[i];

                btrfs_do_readpage(page, em_cached, bio_ctrl, prev_em_start);
                put_page(page);
        }
}

/*
 * Helper for __extent_writepage, doing all of the delayed allocation setup.
 *
 * Every delalloc range found inside the page is locked and handed to
 * btrfs_run_delalloc_range(); afterwards @wbc->nr_to_write is adjusted for
 * the number of pages covered.
 *
 * This returns 1 if btrfs_run_delalloc_range function did all the work required
 * to write the page (copy into inline extent).  In this case the IO has
 * been started and the page is already unlocked.
 *
 * This returns 0 if all went well (page still locked)
 * This returns < 0 if there were errors (page still locked)
 */
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
                struct page *page, struct writeback_control *wbc)
{
        const u64 page_start = page_offset(page);
        const u64 page_end = page_start + PAGE_SIZE - 1;
        u64 delalloc_start = page_start;
        u64 delalloc_end = page_end;
        u64 delalloc_to_write = 0;
        int ret = 0;

        while (delalloc_start < page_end) {
                /* Reset the search limit; the lookup updates both bounds. */
                delalloc_end = page_end;
                if (!find_lock_delalloc_range(&inode->vfs_inode, page,
                                              &delalloc_start, &delalloc_end)) {
                        /* Nothing found: continue after the reported end. */
                        delalloc_start = delalloc_end + 1;
                        continue;
                }

                ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
                                               delalloc_end, wbc);
                if (ret < 0)
                        return ret;

                delalloc_start = delalloc_end + 1;
        }

        /*
         * delalloc_end is already one less than the total length, so
         * we don't subtract one from PAGE_SIZE
         */
        delalloc_to_write +=
                DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);

        /*
         * If btrfs_run_delalloc_range() already started I/O and unlocked
         * the pages, we just need to account for them here.
         */
        if (ret == 1) {
                wbc->nr_to_write -= delalloc_to_write;
                return 1;
        }

        /*
         * Raise nr_to_write when it is smaller than the delalloc page count.
         * The new value is delalloc_to_write when that is below twice the
         * 8192 threshold, and the threshold itself otherwise.
         */
        if (wbc->nr_to_write < delalloc_to_write) {
                int thresh = 8192;

                if (delalloc_to_write < thresh * 2)
                        thresh = delalloc_to_write;
                wbc->nr_to_write = min_t(u64, delalloc_to_write,
                                         thresh);
        }

        return 0;
}

/*
 * Find the first byte we need to write.
 *
 * For subpage, one page can contain several sectors, and
 * __extent_writepage_io() will just grab all extent maps in the page
 * range and try to submit all non-inline/non-compressed extents.
 *
 * This is a big problem for subpage, we shouldn't re-submit already written
 * data at all.
 * This function will lookup subpage dirty bit to find which range we really
 * need to submit.
 *
 * @fs_info: filesystem info, provides the sector size and subpage layout
 * @page:    page to search
 * @start:   in: file offset to start searching from;
 *           out: start of the next dirty range
 * @end:     out: end (exclusive) of the next dirty range
 *
 * Return the next dirty range in [@start, @end).
 * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
 */
static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
                                 struct page *page, u64 *start, u64 *end)
{
        struct folio *folio = page_folio(page);
        struct btrfs_subpage *subpage = folio_get_private(folio);
        struct btrfs_subpage_info *spi = fs_info->subpage_info;
        u64 orig_start = *start;
        /* Saved interrupt state for spin_lock_irqsave() below */
        unsigned long flags;
        int range_start_bit;
        int range_end_bit;

        /*
         * For regular sector size == page size case, since one page only
         * contains one sector, we return the page offset directly.
         */
        if (!btrfs_is_subpage(fs_info, page->mapping)) {
                *start = page_offset(page);
                *end = page_offset(page) + PAGE_SIZE;
                return;
        }

        /* Bit of the sector containing @start inside the dirty bitmap. */
        range_start_bit = spi->dirty_offset +
                          (offset_in_page(orig_start) >> fs_info->sectorsize_bits);

        /* We should have the page locked, but just in case */
        spin_lock_irqsave(&subpage->lock, flags);
        bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
                               spi->dirty_offset + spi->bitmap_nr_bits);
        spin_unlock_irqrestore(&subpage->lock, flags);

        /* Make the bit numbers relative to the start of the dirty bitmap. */
        range_start_bit -= spi->dirty_offset;
        range_end_bit -= spi->dirty_offset;

        /* Convert sector indexes back into file offsets. */
        *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
        *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
}

/*
 * helper for __extent_writepage.  This calls the writepage start hooks,
 * and does the loop to map the page into extents and bios.
 *
 * We return 1 if the IO is started and the page is unlocked,
 * 0 if all went well (page still locked)
 * < 0 if there were errors (page still locked)
 */
static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
                                 struct page *page,
                                 struct btrfs_bio_ctrl *bio_ctrl,
                                 loff_t i_size,
                                 int *nr_ret)
{
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        /* @cur walks the page, @end is the last byte of the page (inclusive) */
        u64 cur = page_offset(page);
        u64 end = cur + PAGE_SIZE - 1;
        u64 extent_offset;
        u64 block_start;
        struct extent_map *em;
        int ret = 0;
        /* Number of ranges handed to submit_extent_page(), reported via @nr_ret */
        int nr = 0;

        ret = btrfs_writepage_cow_fixup(page);
        if (ret) {
                /*
                 * Fixup worker will requeue.  The page is redirtied and
                 * unlocked here, and @nr_ret is left untouched on this path.
                 */
                redirty_page_for_writepage(bio_ctrl->wbc, page);
                unlock_page(page);
                return 1;
        }

        bio_ctrl->end_io_func = end_bbio_data_write;
        while (cur <= end) {
                /* Bytes left in the page starting at @cur */
                u32 len = end - cur + 1;
                u64 disk_bytenr;
                u64 em_end;
                u64 dirty_range_start = cur;
                u64 dirty_range_end;
                u32 iosize;

                if (cur >= i_size) {
                        btrfs_mark_ordered_io_finished(inode, page, cur, len,
                                                       true);
                        /*
                         * This range is beyond i_size, thus we don't need to
                         * bother writing back.
                         * But we still need to clear the dirty subpage bit, or
                         * the next time the page gets dirtied, we will try to
                         * writeback the sectors with subpage dirty bits,
                         * causing writeback without ordered extent.
                         */
                        btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len);
                        break;
                }

                find_next_dirty_byte(fs_info, page, &dirty_range_start,
                                     &dirty_range_end);
                /*
                 * Nothing dirty at @cur: jump to the start of the next dirty
                 * range.  If there is none, the reported start lies past @end
                 * and the loop condition terminates the walk.
                 */
                if (cur < dirty_range_start) {
                        cur = dirty_range_start;
                        continue;
                }

                em = btrfs_get_extent(inode, NULL, cur, len);
                if (IS_ERR(em)) {
                        /* IS_ERR() holds, so this yields the error code */
                        ret = PTR_ERR_OR_ZERO(em);
                        goto out_error;
                }

                extent_offset = cur - em->start;
                em_end = extent_map_end(em);
                ASSERT(cur <= em_end);
                ASSERT(cur < end);
                ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
                ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));

                block_start = em->block_start;
                disk_bytenr = em->block_start + extent_offset;

                /* Only plain, already allocated extents are expected here */
                ASSERT(!extent_map_is_compressed(em));
                ASSERT(block_start != EXTENT_MAP_HOLE);
                ASSERT(block_start != EXTENT_MAP_INLINE);

                /*
                 * Note that em_end from extent_map_end() and dirty_range_end from
                 * find_next_dirty_byte() are all exclusive
                 */
                iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
                free_extent_map(em);
                em = NULL;

                btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
                /* The page is expected to be under writeback now, log if not */
                if (!PageWriteback(page)) {
                        btrfs_err(inode->root->fs_info,
                                   "page %lu not writeback, cur %llu end %llu",
                               page->index, cur, end);
                }

                /*
                 * Although the PageDirty bit is cleared before entering this
                 * function, subpage dirty bit is not cleared.
                 * So clear subpage dirty bit here so next time we won't submit
                 * page for range already written to disk.
                 */
                btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize);

                submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
                                   cur - page_offset(page));
                cur += iosize;
                nr++;
        }

        /*
         * If we finish without problem, we should not only clear page dirty,
         * but also empty subpage dirty bits
         */
        btrfs_folio_assert_not_dirty(fs_info, page_folio(page));
        *nr_ret = nr;
        return 0;

out_error:
        /*
         * Error exit: report the number of ranges submitted before the
         * failure together with the error code.  The page stays locked.
         */
        *nr_ret = nr;
        return ret;
}

/*
 * the writepage semantics are similar to regular writepage.  extent
 * records are inserted to lock ranges in the tree, and as dirty areas
 * are found, they are marked writeback.  Then the lock bits are removed
 * and the end_io handler clears the writeback ranges
 *
 * Return 0 if everything goes well.
 * Return <0 for error.
 */
static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl)
{
        struct folio *folio = page_folio(page);
        struct inode *inode = page->mapping->host;
        const u64 page_start = page_offset(page);
        int ret;
        int nr = 0;
        size_t pg_offset;
        loff_t i_size = i_size_read(inode);
        unsigned long end_index = i_size >> PAGE_SHIFT;

        trace___extent_writepage(page, inode, bio_ctrl->wbc);

        WARN_ON(!PageLocked(page));

        pg_offset = offset_in_page(i_size);
        if (page->index > end_index ||
           (page->index == end_index && !pg_offset)) {
                folio_invalidate(folio, 0, folio_size(folio));
                folio_unlock(folio);
                return 0;
        }

        if (page->index == end_index)
                memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);

        ret = set_page_extent_mapped(page);
        if (ret < 0)
                goto done;

        ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc);
        if (ret == 1)
                return 0;
        if (ret)
                goto done;

        ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr);
        if (ret == 1)
                return 0;

        bio_ctrl->wbc->nr_to_write--;

done:
        if (nr == 0) {
                /* make sure the mapping tag for page dirty gets cleared */
                set_page_writeback(page);
                end_page_writeback(page);
        }
        if (ret) {
                btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start,
                                               PAGE_SIZE, !ret);
                mapping_set_error(page->mapping, ret);
        }
        unlock_page(page);
        ASSERT(ret <= 0);
        return ret;
}

/*
 * Wait, in TASK_UNINTERRUPTIBLE state, on the EXTENT_BUFFER_WRITEBACK bit of
 * @eb->bflags.
 */
void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
{
        unsigned long *flag_word = &eb->bflags;

        wait_on_bit_io(flag_word, EXTENT_BUFFER_WRITEBACK, TASK_UNINTERRUPTIBLE);
}

/*
 * Lock extent buffer status and pages for writeback.
 *
 * Return %false if the extent buffer doesn't need to be submitted (e.g. the
 * extent buffer is not dirty)
 * Return %true if the extent buffer is submitted to bio.
 */
static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb,
                          struct writeback_control *wbc)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        bool ret = false;

        /*
         * Writeback already in flight is only waited for with WB_SYNC_ALL,
         * any other sync mode gives up on this buffer.  The tree lock is
         * dropped before waiting and taken again afterwards.
         */
        btrfs_tree_lock(eb);
        while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
                btrfs_tree_unlock(eb);
                if (wbc->sync_mode != WB_SYNC_ALL)
                        return false;
                wait_on_extent_buffer_writeback(eb);
                btrfs_tree_lock(eb);
        }

        /*
         * We need to do this to prevent races in people who check if the eb is
         * under IO since we can end up having no IO bits set for a short period
         * of time.
         */
        spin_lock(&eb->refs_lock);
        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
                set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
                spin_unlock(&eb->refs_lock);
                btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
                /*
                 * Convert the length to a signed 64-bit value before
                 * negating it.  Negating eb->len in its own type only gives
                 * a negative delta if that type is as wide as the 64-bit
                 * amount taken by the counter; a narrower unsigned length
                 * would wrap and be added as a large positive value.
                 */
                percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
                                         -(long long)eb->len,
                                         fs_info->dirty_metadata_batch);
                ret = true;
        } else {
                /* The buffer was not dirty, nothing to submit */
                spin_unlock(&eb->refs_lock);
        }
        btrfs_tree_unlock(eb);
        return ret;
}

/*
 * Record a failed metadata write of @eb: flag the buffer, drop its uptodate
 * bit, set -EIO on the btree inode mapping and raise the bit in
 * fs_info->flags that matches @eb->log_index.
 */
static void set_btree_ioerr(struct extent_buffer *eb)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        const int log_index = eb->log_index;

        set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);

        /*
         * A later read may find this buffer, it must get an error and learn
         * that the write failed.
         */
        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);

        /*
         * The error also has to reach the mapping: a write error flips the
         * file system read-only, and syncfs() would then return 0 for a
         * read-only fs unless the err seq for the superblock is modified.
         */
        mapping_set_error(fs_info->btree_inode->i_mapping, -EIO);

        /*
         * Besides the mapping error, the failure is remembered in
         * fs_info->flags, separately for btree extents that do not belong to
         * a log tree (log_index == -1) and for each of the two log indexes.
         *
         * Why the mapping error alone is not enough:
         *
         * - While a transaction is running and before it commits (when
         *   filemap_fdata[write|wait]_range is called against the btree
         *   inode), the VM may call btree_inode->i_mapping->a_ops->writepages().
         *   If that returns an error, or an error happens during writeback,
         *   and the extent buffer is not modified again before the commit,
         *   the pages are neither dirty nor marked for writeback anymore, so
         *   filemap_fdata[write|wait]_range cannot find the pages with errors
         *   at commit time.  In that case the transaction must be aborted,
         *   otherwise a super block gets committed with btree roots pointing
         *   to nodes/leafs whose on-disk content is invalid: either garbage,
         *   or a node/leaf from a past generation that got cowed or deleted.
         *
         * - Setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping cannot
         *   distinguish log tree extents from non-log tree extents, and the
         *   next filemap_fdatawait_range() call catches and clears such
         *   errors; that call might come from a log sync and not from a
         *   transaction commit.
         *
         * - Checking EXTENT_BUFFER_WRITE_ERR at transaction commit time is
         *   not done and would not be reliable: it is a runtime flag, not
         *   persisted on disk, so it is gone once the eb has been released
         *   from memory and read back again.
         *
         * The flags below also achieve the goal of AS_EIO/AS_ENOSPC for the
         * case where writepages() returned success and started writeback for
         * all dirty pages, but that writeback had already finished with
         * errors before filemap_fdatawait_range() was called.  Without
         * AS_EIO/AS_ENOSPC that call would return success, since the pages
         * were no longer tagged for writeback.
         */
        if (log_index == -1)
                set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
        else if (log_index == 0)
                set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
        else if (log_index == 1)
                set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
        else
                BUG(); /* unexpected, logic error */
}

/*
 * The endio specific version which won't touch any unsafe spinlock in endio
 * context.
 */
static struct extent_buffer *find_extent_buffer_nolock(
                struct btrfs_fs_info *fs_info, u64 start)
{
        struct extent_buffer *found;

        rcu_read_lock();
        /* The radix tree is indexed by the start offset in sector units */
        found = radix_tree_lookup(&fs_info->buffer_radix,
                                  start >> fs_info->sectorsize_bits);
        /* A buffer whose refs already reached zero is treated as not found */
        if (found && !atomic_inc_not_zero(&found->refs))
                found = NULL;
        rcu_read_unlock();

        return found;
}

/*
 * Completion handler for the metadata write bio built by write_one_eb().
 *
 * Records a write error on the extent buffer if the bio failed, clears the
 * writeback state of every folio range in the bio, clears
 * EXTENT_BUFFER_WRITEBACK, wakes up waiters on that bit and drops the bio.
 */
static void end_bbio_meta_write(struct btrfs_bio *bbio)
{
        struct extent_buffer *eb = bbio->private;
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct folio_iter fi;
        /* Bytes of the extent buffer covered by the folios visited so far */
        u32 covered = 0;

        if (bbio->bio.bi_status)
                set_btree_ioerr(eb);

        bio_for_each_folio_all(fi, &bbio->bio) {
                u32 chunk = fi.length;

                btrfs_folio_clear_writeback(fs_info, fi.folio,
                                            eb->start + covered, chunk);
                covered += chunk;
        }

        /* Clear the bit first, then the barrier, then wake up the waiters */
        clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
        smp_mb__after_atomic();
        wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);

        bio_put(&bbio->bio);
}

/*
 * Get @eb ready for writing: clear a previous write error flag and zero the
 * unused part of the block so no stale content goes out.
 */
static void prepare_eb_write(struct extent_buffer *eb)
{
        u32 nritems;
        unsigned long zero_from;
        unsigned long zero_to;

        clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);

        nritems = btrfs_header_nritems(eb);

        if (btrfs_header_level(eb) > 0) {
                /* Node: everything after the last key pointer is unused */
                zero_from = btrfs_node_key_ptr_offset(eb, nritems);
                memzero_extent_buffer(eb, zero_from, eb->len - zero_from);
                return;
        }

        /*
         * Leaf:
         * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
         *
         * The unused gap starts after the last item header.  It ends where
         * the data of the last item begins, or at the end of the leaf data
         * area when the leaf has no items.
         */
        zero_from = btrfs_item_nr_offset(eb, nritems);
        zero_to = btrfs_item_nr_offset(eb, 0);
        if (nritems)
                zero_to += btrfs_item_offset(eb, nritems - 1);
        else
                zero_to += BTRFS_LEAF_DATA_SIZE(eb->fs_info);
        memzero_extent_buffer(eb, zero_from, zero_to - zero_from);
}

/*
 * Build and submit the write bio for one extent buffer.
 *
 * Every folio of @eb is switched from dirty to writeback, added to the bio
 * and accounted against @wbc.  Completion is handled by
 * end_bbio_meta_write().
 */
static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
                                            struct writeback_control *wbc)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct btrfs_bio *bbio;

        prepare_eb_write(eb);

        bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
                               REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
                               fs_info, end_bbio_meta_write, eb);
        bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
        bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
        wbc_init_bio(wbc, &bbio->bio);
        bbio->inode = BTRFS_I(fs_info->btree_inode);
        bbio->file_offset = eb->start;

        if (fs_info->nodesize >= PAGE_SIZE) {
                /* The extent buffer owns its folios, write each one in full */
                int num_folios = num_extent_folios(eb);

                for (int i = 0; i < num_folios; i++) {
                        struct folio *folio = eb->folios[i];
                        bool added;

                        folio_lock(folio);
                        folio_clear_dirty_for_io(folio);
                        folio_start_writeback(folio);
                        added = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
                        ASSERT(added);
                        wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
                                                 eb->folio_size);
                        wbc->nr_to_write -= folio_nr_pages(folio);
                        folio_unlock(folio);
                }
        } else {
                /*
                 * Subpage: only the range of this extent buffer inside the
                 * first folio is written.  The folio dirty flag is cleared
                 * only when btrfs_subpage_clear_and_test_dirty() returns
                 * true for that range.
                 */
                struct folio *folio = eb->folios[0];
                bool added;

                folio_lock(folio);
                btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len);
                if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start,
                                                       eb->len)) {
                        folio_clear_dirty_for_io(folio);
                        wbc->nr_to_write--;
                }
                added = bio_add_folio(&bbio->bio, folio, eb->len,
                                      eb->start - folio_pos(folio));
                ASSERT(added);
                wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
                folio_unlock(folio);
        }

        btrfs_submit_bio(bbio, 0);
}

/*
 * Submit one subpage btree page.
 *
 * The main difference to submit_eb_page() is:
 * - Page locking
 *   For subpage, we don't rely on page locking at all.
 *
 * - Flush write bio
 *   We only flush bio if we may be unable to fit current extent buffers into
 *   current bio.
 *
 * Return >=0 for the number of submitted extent buffers.
 * Return <0 for fatal error.
 */
static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
{
        struct btrfs_fs_info *fs_info = page_to_fs_info(page);
        struct folio *folio = page_folio(page);
        /* Number of extent buffers handed to write_one_eb(), the return value */
        int submitted = 0;
        u64 page_start = page_offset(page);
        /* Sector index inside the page that is examined next */
        int bit_start = 0;
        int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;

        /* Lock and write each dirty extent buffers in the range */
        while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
                /*
                 * NOTE(review): the private pointer is fetched before
                 * i_private_lock is taken; it is only dereferenced after
                 * folio_test_private() has been re-checked under that lock.
                 */
                struct btrfs_subpage *subpage = folio_get_private(folio);
                struct extent_buffer *eb;
                unsigned long flags;
                u64 start;

                /*
                 * Take private lock to ensure the subpage won't be detached
                 * in the meantime.
                 */
                spin_lock(&page->mapping->i_private_lock);
                if (!folio_test_private(folio)) {
                        spin_unlock(&page->mapping->i_private_lock);
                        break;
                }
                spin_lock_irqsave(&subpage->lock, flags);
                /* Sector not dirty: move on to the next sector */
                if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
                              subpage->bitmaps)) {
                        spin_unlock_irqrestore(&subpage->lock, flags);
                        spin_unlock(&page->mapping->i_private_lock);
                        bit_start++;
                        continue;
                }

                /*
                 * A dirty sector is taken as the start of an extent buffer,
                 * so the scan resumes one whole node further on.
                 */
                start = page_start + bit_start * fs_info->sectorsize;
                bit_start += sectors_per_node;

                /*
                 * Here we just want to grab the eb without touching extra
                 * spin locks, so call find_extent_buffer_nolock().
                 */
                eb = find_extent_buffer_nolock(fs_info, start);
                spin_unlock_irqrestore(&subpage->lock, flags);
                spin_unlock(&page->mapping->i_private_lock);

                /*
                 * No eb was returned: either none is registered at @start or
                 * its refs have already reached 0, in which case
                 * find_extent_buffer_nolock() doesn't return it. We don't
                 * need to write back such eb anyway.
                 */
                if (!eb)
                        continue;

                if (lock_extent_buffer_for_io(eb, wbc)) {
                        write_one_eb(eb, wbc);
                        submitted++;
                }
                /* Drop the reference taken by find_extent_buffer_nolock() */
                free_extent_buffer(eb);
        }
        return submitted;
}

/*
 * Submit all page(s) of one extent buffer.
 *
 * @page:        the page of one extent buffer
 * @ctx:         write context; @ctx->eb is used to determine if we need to
 *               submit this page: if the current page belongs to that eb, we
 *               don't need to submit
 *
 * The caller should pass each page in their bytenr order, and here we use
 * @ctx->eb to determine if we have submitted pages of one extent buffer.
 *
 * If we have, we just skip until we hit a new page that doesn't belong to
 * current @ctx->eb.
 *
 * If not, we submit all the page(s) of the extent buffer.
 *
 * Return >0 if we have submitted the extent buffer successfully.
 * Return 0 if we don't need to submit the page, as it's already submitted by
 * previous call.
 * Return <0 for fatal error.
 */
static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx)
{
        struct writeback_control *wbc = ctx->wbc;
        struct address_space *mapping = page->mapping;
        struct folio *folio = page_folio(page);
        struct extent_buffer *eb;
        int ret;

        if (!folio_test_private(folio))
                return 0;

        if (page_to_fs_info(page)->nodesize < PAGE_SIZE)
                return submit_eb_subpage(page, wbc);

        spin_lock(&mapping->i_private_lock);
        if (!folio_test_private(folio)) {
                spin_unlock(&mapping->i_private_lock);
                return 0;
        }

        eb = folio_get_private(folio);

        /*
         * Shouldn't happen and normally this would be a BUG_ON but no point
         * crashing the machine for something we can survive anyway.
         */
        if (WARN_ON(!eb)) {
                spin_unlock(&mapping->i_private_lock);
                return 0;
        }

        if (eb == ctx->eb) {
                spin_unlock(&mapping->i_private_lock);
                return 0;
        }
        ret = atomic_inc_not_zero(&eb->refs);
        spin_unlock(&mapping->i_private_lock);
        if (!ret)
                return 0;

        ctx->eb = eb;

        ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx);
        if (ret) {
                if (ret == -EBUSY)
                        ret = 0;
                free_extent_buffer(eb);
                return ret;
        }

        if (!lock_extent_buffer_for_io(eb, wbc)) {
                free_extent_buffer(eb);
                return 0;
        }
        /* Implies write in zoned mode. */
        if (ctx->zoned_bg) {
                /* Mark the last eb in the block group. */
                btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb);
                ctx->zoned_bg->meta_write_pointer += eb->len;
        }
        write_one_eb(eb, wbc);
        free_extent_buffer(eb);
        return 1;
}

int btree_write_cache_pages(struct address_space *mapping,
                                   struct writeback_control *wbc)
{
        struct btrfs_eb_write_context ctx = { .wbc = wbc };
        struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
        int ret = 0;
        int done = 0;
        int nr_to_write_done = 0;
        struct folio_batch fbatch;
        unsigned int nr_folios;
        pgoff_t index;
        pgoff_t end;                /* Inclusive */
        int scanned = 0;
        xa_mark_t tag;

        folio_batch_init(&fbatch);
        if (wbc->range_cyclic) {
                index = mapping->writeback_index; /* Start from prev offset */
                end = -1;
                /*
                 * Start from the beginning does not need to cycle over the
                 * range, mark it as scanned.
                 */
                scanned = (index == 0);
        } else {
                index = wbc->range_start >> PAGE_SHIFT;
                end = wbc->range_end >> PAGE_SHIFT;
                scanned = 1;
        }
        if (wbc->sync_mode == WB_SYNC_ALL)
                tag = PAGECACHE_TAG_TOWRITE;
        else
                tag = PAGECACHE_TAG_DIRTY;
        btrfs_zoned_meta_io_lock(fs_info);
retry:
        if (wbc->sync_mode == WB_SYNC_ALL)
                tag_pages_for_writeback(mapping, index, end);
        while (!done && !nr_to_write_done && (index <= end) &&
               (nr_folios = filemap_get_folios_tag(mapping, &index, end,
                                            tag, &fbatch))) {
                unsigned i;

                for (i = 0; i < nr_folios; i++) {
                        struct folio *folio = fbatch.folios[i];

                        ret = submit_eb_page(&folio->page, &ctx);
                        if (ret == 0)
                                continue;
                        if (ret < 0) {
                                done = 1;
                                break;
                        }

                        /*
                         * the filesystem may choose to bump up nr_to_write.
                         * We have to make sure to honor the new nr_to_write
                         * at any time
                         */
                        nr_to_write_done = wbc->nr_to_write <= 0;
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }
        if (!scanned && !done) {
                /*
                 * We hit the last page and there is more work to be done: wrap
                 * back to the start of the file
                 */
                scanned = 1;
                index = 0;
                goto retry;
        }
        /*
         * If something went wrong, don't allow any metadata write bio to be
         * submitted.
         *
         * This would prevent use-after-free if we had dirty pages not
         * cleaned up, which can still happen by fuzzed images.
         *
         * - Bad extent tree
         *   Allowing existing tree block to be allocated for other trees.
         *
         * - Log tree operations
         *   Exiting tree blocks get allocated to log tree, bumps its
         *   generation, then get cleaned in tree re-balance.
         *   Such tree block will not be written back, since it's clean,
         *   thus no WRITTEN flag set.
         *   And after log writes back, this tree block is not traced by
         *   any dirty extent_io_tree.
         *
         * - Offending tree block gets re-dirtied from its original owner
         *   Since it has bumped generation, no WRITTEN flag, it can be
         *   reused without COWing. This tree block will not be traced
         *   by btrfs_transaction::dirty_pages.
         *
         *   Now such dirty tree block will not be cleaned by any dirty
         *   extent io tree. Thus we don't want to submit such wild eb
         *   if the fs already has error.
         *
         * We can get ret > 0 from submit_extent_page() indicating how many ebs
         * were submitted. Reset it to 0 to avoid false alerts for the caller.
         */
        if (ret > 0)
                ret = 0;
        if (!ret && BTRFS_FS_ERROR(fs_info))
                ret = -EROFS;

        if (ctx.zoned_bg)
                btrfs_put_block_group(ctx.zoned_bg);
        btrfs_zoned_meta_io_unlock(fs_info);
        return ret;
}

/*
 * Walk the list of dirty pages of the given address space and write all of them.
 *
 * @mapping:   address space structure to write
 * @bio_ctrl:  holds context for the write, namely the bio and the
 *             writeback_control (@bio_ctrl->wbc); the number of written
 *             pages is subtracted from @bio_ctrl->wbc->nr_to_write
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 */
static int extent_write_cache_pages(struct address_space *mapping,
                             struct btrfs_bio_ctrl *bio_ctrl)
{
        struct writeback_control *wbc = bio_ctrl->wbc;
        struct inode *inode = mapping->host;
        int ret = 0;
        /* Set when __extent_writepage() fails, stops all further scanning */
        int done = 0;
        int nr_to_write_done = 0;
        struct folio_batch fbatch;
        unsigned int nr_folios;
        pgoff_t index;
        pgoff_t end;                /* Inclusive */
        /* Index following the last folio looked at, saved as writeback_index */
        pgoff_t done_index;
        int range_whole = 0;
        /* Non-zero once no wrap around to index 0 is needed anymore */
        int scanned = 0;
        xa_mark_t tag;

        /*
         * We have to hold onto the inode so that ordered extents can do their
         * work when the IO finishes.  The alternative to this is failing to add
         * an ordered extent if the igrab() fails there and that is a huge pain
         * to deal with, so instead just hold onto the inode throughout the
         * writepages operation.  If it fails here we are freeing up the inode
         * anyway and we'd rather not waste our time writing out stuff that is
         * going to be truncated anyway.
         */
        if (!igrab(inode))
                return 0;

        folio_batch_init(&fbatch);
        if (wbc->range_cyclic) {
                index = mapping->writeback_index; /* Start from prev offset */
                end = -1;
                /*
                 * Start from the beginning does not need to cycle over the
                 * range, mark it as scanned.
                 */
                scanned = (index == 0);
        } else {
                index = wbc->range_start >> PAGE_SHIFT;
                end = wbc->range_end >> PAGE_SHIFT;
                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                        range_whole = 1;
                scanned = 1;
        }

        /*
         * We do the tagged writepage as long as the snapshot flush bit is set
         * and we are the first one who do the filemap_flush() on this inode.
         *
         * The nr_to_write == LONG_MAX is needed to make sure other flushers do
         * not race in and drop the bit.
         */
        if (range_whole && wbc->nr_to_write == LONG_MAX &&
            test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
                               &BTRFS_I(inode)->runtime_flags))
                wbc->tagged_writepages = 1;

        /* Integrity and tagged writeback walk TOWRITE, the rest walks DIRTY */
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag = PAGECACHE_TAG_TOWRITE;
        else
                tag = PAGECACHE_TAG_DIRTY;
retry:
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, index, end);
        done_index = index;
        while (!done && !nr_to_write_done && (index <= end) &&
                        (nr_folios = filemap_get_folios_tag(mapping, &index,
                                                        end, tag, &fbatch))) {
                unsigned i;

                for (i = 0; i < nr_folios; i++) {
                        struct folio *folio = fbatch.folios[i];

                        done_index = folio_next_index(folio);
                        /*
                         * At this point we hold neither the i_pages lock nor
                         * the page lock: the page may be truncated or
                         * invalidated (changing page->mapping to NULL),
                         * or even swizzled back from swapper_space to
                         * tmpfs file mapping
                         */
                        if (!folio_trylock(folio)) {
                                /*
                                 * Submit the bio built so far before sleeping
                                 * on the folio lock - presumably so that the
                                 * lock holder cannot be stuck waiting on
                                 * writeback queued in our own bio (compare
                                 * the wrap-around case below).
                                 */
                                submit_write_bio(bio_ctrl, 0);
                                folio_lock(folio);
                        }

                        /* The folio no longer belongs to this mapping */
                        if (unlikely(folio->mapping != mapping)) {
                                folio_unlock(folio);
                                continue;
                        }

                        if (!folio_test_dirty(folio)) {
                                /* Someone wrote it for us. */
                                folio_unlock(folio);
                                continue;
                        }

                        /*
                         * Anything but WB_SYNC_NONE waits for writeback that
                         * is already in flight.  Our pending bio is submitted
                         * first when the folio is under writeback.
                         */
                        if (wbc->sync_mode != WB_SYNC_NONE) {
                                if (folio_test_writeback(folio))
                                        submit_write_bio(bio_ctrl, 0);
                                folio_wait_writeback(folio);
                        }

                        /*
                         * Skip the folio if it is still under writeback or if
                         * folio_clear_dirty_for_io() reports there was
                         * nothing to clear.
                         */
                        if (folio_test_writeback(folio) ||
                            !folio_clear_dirty_for_io(folio)) {
                                folio_unlock(folio);
                                continue;
                        }

                        ret = __extent_writepage(&folio->page, bio_ctrl);
                        if (ret < 0) {
                                done = 1;
                                break;
                        }

                        /*
                         * The filesystem may choose to bump up nr_to_write.
                         * We have to make sure to honor the new nr_to_write
                         * at any time.
                         */
                        nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE &&
                                            wbc->nr_to_write <= 0);
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }
        if (!scanned && !done) {
                /*
                 * We hit the last page and there is more work to be done: wrap
                 * back to the start of the file
                 */
                scanned = 1;
                index = 0;

                /*
                 * If we're looping we could run into a page that is locked by a
                 * writer and that writer could be waiting on writeback for a
                 * page in our current bio, and thus deadlock, so flush the
                 * write bio here.
                 */
                submit_write_bio(bio_ctrl, 0);
                goto retry;
        }

        /* Remember where to resume the next cyclic or whole-range writeback */
        if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
                mapping->writeback_index = done_index;

        /* Pairs with the igrab() at the top of the function */
        btrfs_add_delayed_iput(BTRFS_I(inode));
        return ret;
}

/*
 * Submit the pages in the range to a write bio, for call sites where the
 * delalloc range has already been run (aka, ordered extent inserted) and all
 * pages are still locked.
 *
 * @inode:        inode owning the pages
 * @locked_page:  page that is skipped by the dirty flag handling done when
 *                @pages_dirty is set
 * @start:        first byte of the range, must be sector aligned
 * @end:          last byte of the range (inclusive), @end + 1 must be sector
 *                aligned
 * @wbc:          writeback control, used to build the write bio flags
 * @pages_dirty:  if true, every page other than @locked_page is expected to be
 *                dirty and gets its dirty flag cleared for IO
 *
 * Nothing is returned to the caller. A failure on a page is recorded in the
 * page's mapping with mapping_set_error() and the corresponding ordered range
 * is marked as finished. The first negative error found is also handed to
 * submit_write_bio() for the bio that is still pending at the end.
 */
void extent_write_locked_range(struct inode *inode, struct page *locked_page,
                               u64 start, u64 end, struct writeback_control *wbc,
                               bool pages_dirty)
{
        /*
         * First negative error returned for any page of the range, 0 if there
         * was none. We can not use the value of @ret after the loop, because
         * it only reflects the last page processed, which may have succeeded
         * (0) or been skipped (1) even if an earlier page failed.
         */
        int first_error = 0;
        int ret = 0;
        struct address_space *mapping = inode->i_mapping;
        struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
        const u32 sectorsize = fs_info->sectorsize;
        loff_t i_size = i_size_read(inode);
        u64 cur = start;
        struct btrfs_bio_ctrl bio_ctrl = {
                .wbc = wbc,
                .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
        };

        if (wbc->no_cgroup_owner)
                bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT;

        ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));

        while (cur <= end) {
                /* Process at most up to the end of the page containing @cur. */
                u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
                u32 cur_len = cur_end + 1 - cur;
                struct page *page;
                int nr = 0;

                page = find_get_page(mapping, cur >> PAGE_SHIFT);
                ASSERT(PageLocked(page));
                if (pages_dirty && page != locked_page) {
                        ASSERT(PageDirty(page));
                        clear_page_dirty_for_io(page);
                }

                ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl,
                                            i_size, &nr);
                /* Nothing else to do for this page, only drop our reference. */
                if (ret == 1)
                        goto next_page;

                /* Make sure the mapping tag for page dirty gets cleared. */
                if (nr == 0) {
                        set_page_writeback(page);
                        end_page_writeback(page);
                }
                if (ret) {
                        /* We got an error, so the range is not uptodate. */
                        btrfs_mark_ordered_io_finished(BTRFS_I(inode), page,
                                                       cur, cur_len, false);
                        mapping_set_error(page->mapping, ret);
                }
                btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len);
                if (ret < 0 && first_error == 0)
                        first_error = ret;
next_page:
                put_page(page);
                cur = cur_end + 1;
        }

        submit_write_bio(&bio_ctrl, first_error);
}

int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
        struct inode *inode = mapping->host;
        int ret = 0;
        struct btrfs_bio_ctrl bio_ctrl = {
                .wbc = wbc,
                .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc),
        };

        /*
         * Allow only a single thread to do the reloc work in zoned mode to
         * protect the write pointer updates.
         */
        btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
        ret = extent_write_cache_pages(mapping, &bio_ctrl);
        submit_write_bio(&bio_ctrl, ret);
        btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
        return ret;
}

void btrfs_readahead(struct readahead_control *rac)
{
        struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
        struct page *pagepool[16];
        struct extent_map *em_cached = NULL;
        u64 prev_em_start = (u64)-1;
        int nr;

        while ((nr = readahead_page_batch(rac, pagepool))) {
                u64 contig_start = readahead_pos(rac);
                u64 contig_end = contig_start + readahead_batch_length(rac) - 1;

                contiguous_readpages(pagepool, nr, contig_start, contig_end,
                                &em_cached, &bio_ctrl, &prev_em_start);
        }

        if (em_cached)
                free_extent_map(em_cached);
        submit_one_bio(&bio_ctrl);
}

/*
 * Basic invalidate_folio code for the io tree of the btree inode.
 *
 * The range that goes from @offset (rounded up to the sector size) to the end
 * of @folio is locked in @tree, we wait for writeback of the folio and then
 * the range is unlocked again, which frees any existing extent state.
 *
 * Always returns 0.
 */
int extent_invalidate_folio(struct extent_io_tree *tree,
                          struct folio *folio, size_t offset)
{
        const size_t blocksize = folio_to_fs_info(folio)->sectorsize;
        const u64 folio_start = folio_pos(folio);
        const u64 range_end = folio_start + folio_size(folio) - 1;
        const u64 range_start = folio_start + ALIGN(offset, blocksize);
        struct extent_state *cached_state = NULL;

        /* This function is only called for the btree inode */
        ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);

        /* Nothing left inside the folio after rounding up the offset. */
        if (range_start > range_end)
                return 0;

        lock_extent(tree, range_start, range_end, &cached_state);
        folio_wait_writeback(folio);

        /*
         * The btree io tree currently only makes use of EXTENT_LOCKED, so
         * unlocking the range is all that is needed to get rid of any existing
         * extent state.
         */
        unlock_extent(tree, range_start, range_end, &cached_state);

        return 0;
}

/*
 * A helper for release_folio. If no part of the range of @page is locked in
 * @tree, drop the state bits of that range that are safe to drop.
 *
 * Returns true if the page can be released, false if the range has locked
 * areas or if clearing the bits failed. @mask is currently unused.
 */
static bool try_release_extent_state(struct extent_io_tree *tree,
                                    struct page *page, gfp_t mask)
{
        const u64 first = page_offset(page);
        const u64 last = first + PAGE_SIZE - 1;
        /*
         * Bits that must survive. The delalloc new bit is cleared later by
         * ordered extent completion.
         */
        const u32 keep_bits = EXTENT_LOCKED | EXTENT_NODATASUM |
                              EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
                              EXTENT_QGROUP_RESERVED;

        if (test_range_bit_exists(tree, first, last, EXTENT_LOCKED))
                return false;

        /*
         * At this point everything except the bits above can be cleared
         * safely. If clearing fails (negative return value) we can't allow the
         * release to continue.
         */
        return __clear_extent_bit(tree, first, last, ~keep_bits, NULL, NULL) >= 0;
}

/*
 * A helper for release_folio.  As long as there are no locked extents
 * in the range corresponding to the page, both state records and extent
 * map records are removed.
 *
 * Extent maps are walked starting at the page's offset. The walk stops at the
 * first offset with no extent map, at a pinned extent map or at an extent map
 * that does not start exactly at the current offset. Extent maps that are
 * locked in the io tree, or that may still be needed by a fast fsync, are
 * skipped and left in the tree.
 *
 * Returns the result of try_release_extent_state() for the page.
 */
bool try_release_extent_mapping(struct page *page, gfp_t mask)
{
        u64 start = page_offset(page);
        u64 end = start + PAGE_SIZE - 1;
        struct btrfs_inode *inode = page_to_inode(page);
        struct extent_io_tree *io_tree = &inode->io_tree;

        while (start <= end) {
                const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info);
                const u64 len = end - start + 1;
                struct extent_map_tree *extent_tree = &inode->extent_tree;
                struct extent_map *em;

                write_lock(&extent_tree->lock);
                em = lookup_extent_mapping(extent_tree, start, len);
                if (!em) {
                        write_unlock(&extent_tree->lock);
                        break;
                }
                if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) {
                        write_unlock(&extent_tree->lock);
                        free_extent_map(em);
                        break;
                }
                /* Leave extent maps with a locked range in the io tree alone. */
                if (test_range_bit_exists(io_tree, em->start,
                                          extent_map_end(em) - 1, EXTENT_LOCKED))
                        goto next;
                /*
                 * If it's not in the list of modified extents, used by a fast
                 * fsync, we can remove it. If it's being logged we can safely
                 * remove it since fsync took an extra reference on the em.
                 */
                if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING))
                        goto remove_em;
                /*
                 * If it's in the list of modified extents, remove it only if
                 * its generation is older than the current one, in which case
                 * we don't need it for a fast fsync. Otherwise don't remove it,
                 * we could be racing with an ongoing fast fsync that could miss
                 * the new extent.
                 */
                if (em->generation >= cur_gen)
                        goto next;
remove_em:
                /*
                 * We only remove extent maps that are not in the list of
                 * modified extents or that are in the list but with a
                 * generation lower than the current generation, so there is no
                 * need to set the full fsync flag on the inode (it hurts the
                 * fsync performance for workloads with a data size that exceeds
                 * or is close to the system's memory).
                 */
                remove_extent_mapping(inode, em);
                /* Once for the inode's extent map tree. */
                free_extent_map(em);
next:
                /*
                 * We still hold the reference from lookup_extent_mapping(), so
                 * it's safe to read the extent map here.
                 */
                start = extent_map_end(em);
                write_unlock(&extent_tree->lock);

                /* Once for us, for the lookup_extent_mapping() reference. */
                free_extent_map(em);

                if (need_resched()) {
                        /*
                         * If we need to resched but we can't block just exit
                         * and leave any remaining extent maps.
                         */
                        if (!gfpflags_allow_blocking(mask))
                                break;

                        cond_resched();
                }
        }
        return try_release_extent_state(io_tree, page, mask);
}

/*
 * A fiemap extent that is ready to be written to the fiemap buffer. The fields
 * are passed as they are to fiemap_fill_next_extent() by flush_fiemap_cache().
 */
struct btrfs_fiemap_entry {
        /* File offset of the extent. */
        u64 offset;
        /* Physical address reported for the extent. */
        u64 phys;
        /* Length of the extent. */
        u64 len;
        /* FIEMAP_EXTENT_* flags reported for the extent. */
        u32 flags;
};

/*
 * Indicates to the caller of emit_fiemap_extent() that it needs to unlock the
 * file range from the inode's io tree, release the subvolume tree search path,
 * flush the fiemap cache, and then lock the file range again and search the
 * subvolume tree again.
 * The value here is something negative that can't be confused with a valid
 * errno value and different from 1 because that's also a return value from
 * fiemap_fill_next_extent() and also it's often used to mean some btree search
 * did not find a key, so make it some distinct negative value.
 */
#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1))

/*
 * Used to:
 *
 * - Cache the next entry to be emitted to the fiemap buffer, so that we can
 *   merge extents that are contiguous and can be grouped as a single one;
 *
 * - Store extents ready to be written to the fiemap buffer in an intermediary
 *   buffer. This intermediary buffer is to ensure that in case the fiemap
 *   buffer is memory mapped to the fiemap target file, we don't deadlock
 *   during btrfs_page_mkwrite(). This is because during fiemap we are locking
 *   an extent range in order to prevent races with delalloc flushing and
 *   ordered extent completion, which is needed in order to reliably detect
 *   delalloc in holes and prealloc extents. And this can lead to a deadlock
 *   if the fiemap buffer is memory mapped to the file we are running fiemap
 *   against (a silly, useless in practice scenario, but possible) because
 *   btrfs_page_mkwrite() will try to lock the same extent range.
 */
struct fiemap_cache {
        /* An array of ready fiemap entries. */
        struct btrfs_fiemap_entry *entries;
        /* Number of entries in the entries array. */
        int entries_size;
        /* Index of the next entry in the entries array to write to. */
        int entries_pos;
        /*
         * Once the entries array is full, this indicates what's the offset for
         * the next file extent item we must search for in the inode's subvolume
         * tree after unlocking the extent range in the inode's io tree and
         * releasing the search path.
         */
        u64 next_search_offset;
        /*
         * This matches struct fiemap_extent_info::fi_mapped_extents, we use it
         * to count ourselves emitted extents and stop instead of relying on
         * fiemap_fill_next_extent() because we buffer ready fiemap entries at
         * the @entries array, and we want to stop as soon as we hit the max
         * amount of extents to map, not just to save time but also to make the
         * logic at extent_fiemap() simpler.
         */
        unsigned int extents_mapped;
        /*
         * Fields for the cached extent (unsubmitted, not ready, extent). They
         * are only meaningful while @cached is true.
         */
        /* File offset of the cached extent. */
        u64 offset;
        /* Physical address of the cached extent. */
        u64 phys;
        /* Length of the cached extent. */
        u64 len;
        /* FIEMAP_EXTENT_* flags of the cached extent. */
        u32 flags;
        /* True if there is a cached extent that was not yet emitted. */
        bool cached;
};

static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo,
                              struct fiemap_cache *cache)
{
        for (int i = 0; i < cache->entries_pos; i++) {
                struct btrfs_fiemap_entry *entry = &cache->entries[i];
                int ret;

                ret = fiemap_fill_next_extent(fieinfo, entry->offset,
                                              entry->phys, entry->len,
                                              entry->flags);
                /*
                 * Ignore 1 (reached max entries) because we keep track of that
                 * ourselves in emit_fiemap_extent().
                 */
                if (ret < 0)
                        return ret;
        }
        cache->entries_pos = 0;

        return 0;
}

/*
 * Helper to submit fiemap extent.
 *
 * Will try to merge the current fiemap extent specified by @offset, @phys,
 * @len and @flags with the cached one.
 * Only when we fail to merge, the cached one is stored in the entries array of
 * @cache (which is written to the fiemap buffer by flush_fiemap_cache()) and
 * the current extent becomes the new cached one.
 *
 * Returns:
 *
 * 0 if the extent was merged, cached or ignored;
 *
 * 1 if the number of extents stored reached fieinfo->fi_extents_max;
 *
 * BTRFS_FIEMAP_FLUSH_CACHE if the entries array is full. In this case nothing
 * is stored, the cached extent is dropped and cache->next_search_offset is set
 * to the offset from where the caller must search again after flushing.
 */
static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
                                struct fiemap_cache *cache,
                                u64 offset, u64 phys, u64 len, u32 flags)
{
        struct btrfs_fiemap_entry *entry;
        u64 cache_end;

        /* Set at the end of extent_fiemap(). */
        ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);

        if (!cache->cached)
                goto assign;

        /*
         * When iterating the extents of the inode, at extent_fiemap(), we may
         * find an extent that starts at an offset behind the end offset of the
         * previous extent we processed. This happens if fiemap is called
         * without FIEMAP_FLAG_SYNC and there are ordered extents completing
         * after we had to unlock the file range, release the search path, emit
         * the fiemap extents stored in the buffer (cache->entries array) and
         * then lock the remainder of the range and re-search the btree.
         *
         * For example we are in leaf X processing its last item, which is the
         * file extent item for file range [512K, 1M[, and after
         * btrfs_next_leaf() releases the path, there's an ordered extent that
         * completes for the file range [768K, 2M[, and that results in trimming
         * the file extent item so that it now corresponds to the file range
         * [512K, 768K[ and a new file extent item is inserted for the file
         * range [768K, 2M[, which may end up as the last item of leaf X or as
         * the first item of the next leaf - in either case btrfs_next_leaf()
         * will leave us with a path pointing to the new extent item, for the
         * file range [768K, 2M[, since that's the first key that follows the
         * last one we processed. So in order not to report overlapping extents
         * to user space, we trim the length of the previously cached extent and
         * emit it.
         *
         * Upon calling btrfs_next_leaf() we may also find an extent with an
         * offset smaller than or equal to cache->offset, and this happens
         * when we had a hole or prealloc extent with several delalloc ranges in
         * it, but after btrfs_next_leaf() released the path, delalloc was
         * flushed and the resulting ordered extents were completed, so we can
         * now have found a file extent item for an offset that is smaller than
         * or equal to what we have in cache->offset. We deal with this as
         * described below.
         */
        cache_end = cache->offset + cache->len;
        if (cache_end > offset) {
                if (offset == cache->offset) {
                        /*
                         * We cached a delalloc range (found in the io tree) for
                         * a hole or prealloc extent and we have now found a
                         * file extent item for the same offset. What we have
                         * now is more recent and up to date, so discard what
                         * we had in the cache and use what we have just found.
                         */
                        goto assign;
                } else if (offset > cache->offset) {
                        /*
                         * The extent range we previously found ends after the
                         * offset of the file extent item we found and that
                         * offset falls somewhere in the middle of that previous
                         * extent range. So adjust the range we previously found
                         * to end at the offset of the file extent item we have
                         * just found, since this extent is more up to date.
                         * Emit that adjusted range and cache the file extent
                         * item we have just found. This corresponds to the case
                         * where a previously found file extent item was split
                         * due to an ordered extent completing.
                         */
                        cache->len = offset - cache->offset;
                        goto emit;
                } else {
                        const u64 range_end = offset + len;

                        /*
                         * The offset of the file extent item we have just found
                         * is behind the cached offset. This means we were
                         * processing a hole or prealloc extent for which we
                         * have found delalloc ranges (in the io tree), so what
                         * we have in the cache is the last delalloc range we
                         * found while the file extent item we found can be
                         * either for a whole delalloc range we previously
                         * emitted or only a part of that range.
                         *
                         * We have two cases here:
                         *
                         * 1) The file extent item's range ends at or behind the
                         *    cached extent's end. In this case just ignore the
                         *    current file extent item because we don't want to
                         *    overlap with previous ranges that may have been
                         *    emitted already;
                         *
                         * 2) The file extent item starts behind the currently
                         *    cached extent but its end offset goes beyond the
                         *    end offset of the cached extent. We don't want to
                         *    overlap with a previous range that may have been
                         *    emitted already, so we emit the currently cached
                         *    extent and then partially store the current file
                         *    extent item's range in the cache, for the subrange
                         *    going from the cached extent's end to the end of
                         *    the file extent item.
                         */
                        if (range_end <= cache_end)
                                return 0;

                        /*
                         * Advance the physical address by the amount of bytes
                         * we are skipping, except for encoded and delalloc
                         * extents.
                         */
                        if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
                                phys += cache_end - offset;

                        offset = cache_end;
                        len = range_end - cache_end;
                        goto emit;
                }
        }

        /*
         * Only merges fiemap extents if
         * 1) Their logical addresses are continuous
         *
         * 2) Their physical addresses are continuous
         *    So truly compressed (physical size smaller than logical size)
         *    extents won't get merged with each other
         *
         * 3) Share same flags
         */
        if (cache->offset + cache->len  == offset &&
            cache->phys + cache->len == phys  &&
            cache->flags == flags) {
                cache->len += len;
                return 0;
        }

emit:
        /* Not mergeable, need to submit cached one */

        if (cache->entries_pos == cache->entries_size) {
                /*
                 * We will need to search again from the end offset of the last
                 * stored extent and not from the current offset, because after
                 * unlocking the range and releasing the path, if there's a hole
                 * between that end offset and this current offset, a new extent
                 * may have been inserted due to a new write, so we don't want
                 * to miss it.
                 */
                entry = &cache->entries[cache->entries_size - 1];
                cache->next_search_offset = entry->offset + entry->len;
                cache->cached = false;

                return BTRFS_FIEMAP_FLUSH_CACHE;
        }

        /* Move the cached extent into the next free slot of the array. */
        entry = &cache->entries[cache->entries_pos];
        entry->offset = cache->offset;
        entry->phys = cache->phys;
        entry->len = cache->len;
        entry->flags = cache->flags;
        cache->entries_pos++;
        cache->extents_mapped++;

        /* We stored as many extents as requested, the current one is dropped. */
        if (cache->extents_mapped == fieinfo->fi_extents_max) {
                cache->cached = false;
                return 1;
        }
assign:
        /* The current extent becomes the cached one. */
        cache->cached = true;
        cache->offset = offset;
        cache->phys = phys;
        cache->len = len;
        cache->flags = flags;

        return 0;
}

/*
 * Emit the extent that is still cached, if any.
 *
 * An extent may still be cached when we are done, for example:
 *
 * 0                      4k                    8k
 * |<- Fiemap range ->|
 * |<------------  First extent ----------->|
 *
 * Here the first extent is cached but never emitted, so it has to be emitted
 * before extent_fiemap() ends.
 *
 * Returns 0 on success (including when nothing was cached) or the negative
 * value returned by fiemap_fill_next_extent().
 */
static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
                                  struct fiemap_cache *cache)
{
        int err = 0;

        if (cache->cached) {
                err = fiemap_fill_next_extent(fieinfo, cache->offset,
                                              cache->phys, cache->len,
                                              cache->flags);
                cache->cached = false;
                /* A positive return value is not an error for us. */
                if (err > 0)
                        err = 0;
        }

        return err;
}

/*
 * Move @path to the next item. @path->nodes[0] is expected to be a cloned
 * (unmapped) leaf.
 *
 * If the next slot is still inside the current leaf nothing else is done.
 * Otherwise move to the next leaf of the inode's root, copy its content into
 * the clone we already have and make @path point to the clone again, at the
 * same slot.
 *
 * Returns 0 on success, < 0 on error and > 0 if there are no more file extent
 * items for the inode (or btrfs_next_leaf() returned a value > 0).
 */
static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
        struct extent_buffer *clone = path->nodes[0];
        struct btrfs_key key;
        int slot;
        int ret;

        path->slots[0]++;
        if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
                return 0;

        /*
         * Add a temporary extra ref to an already cloned extent buffer to
         * prevent btrfs_next_leaf() freeing it, we want to reuse it to avoid
         * the cost of allocating a new one.
         */
        ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
        atomic_inc(&clone->refs);

        ret = btrfs_next_leaf(inode->root, path);
        if (ret != 0)
                goto out;

        /*
         * Don't bother with cloning if there are no more file extent items for
         * our inode.
         */
        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
        if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) {
                ret = 1;
                goto out;
        }

        /*
         * Important to preserve the start field, for the optimizations when
         * checking if extents are shared (see extent_fiemap()).
         *
         * We must set ->start before calling copy_extent_buffer_full().  If we
         * are on sub-pagesize blocksize, we use ->start to determine the offset
         * into the folio where our eb exists, and if we update ->start after
         * the fact then any subsequent reads of the eb may read from a
         * different offset in the folio than where we originally copied into.
         */
        clone->start = path->nodes[0]->start;
        /* See the comment at fiemap_search_slot() about why we clone. */
        copy_extent_buffer_full(clone, path->nodes[0]);

        /* Replace the leaf in the path with our clone, keeping the slot. */
        slot = path->slots[0];
        btrfs_release_path(path);
        path->nodes[0] = clone;
        path->slots[0] = slot;
out:
        /* The clone is not going to be used, drop the extra ref we took above. */
        if (ret)
                free_extent_buffer(clone);

        return ret;
}

/*
 * Find the first file extent item of @inode that starts at @file_offset or,
 * if there is none, the one that starts immediately before that offset.
 *
 * On success the leaf in @path is replaced with a private clone of it and the
 * slot is preserved.
 *
 * Returns: 0 on success, < 0 on error, 1 if not found.
 */
static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
                              u64 file_offset)
{
        const u64 ino = btrfs_ino(inode);
        struct btrfs_root *root = inode->root;
        struct btrfs_key key = {
                .objectid = ino,
                .type = BTRFS_EXTENT_DATA_KEY,
                .offset = file_offset,
        };
        struct extent_buffer *leaf_copy;
        int saved_slot;
        int ret;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                return ret;

        /*
         * No exact match: if the previous item is a file extent item of our
         * inode, that's the one we want.
         */
        if (ret > 0 && path->slots[0] > 0) {
                struct btrfs_key prev_key;

                btrfs_item_key_to_cpu(path->nodes[0], &prev_key,
                                      path->slots[0] - 1);
                if (prev_key.objectid == ino &&
                    prev_key.type == BTRFS_EXTENT_DATA_KEY)
                        path->slots[0]--;
        }

        /* We are past the last item of the leaf, move to the next leaf. */
        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                struct btrfs_key next_key;

                ret = btrfs_next_leaf(root, path);
                if (ret != 0)
                        return ret;

                btrfs_item_key_to_cpu(path->nodes[0], &next_key, path->slots[0]);
                if (next_key.objectid != ino ||
                    next_key.type != BTRFS_EXTENT_DATA_KEY)
                        return 1;
        }

        /*
         * Fiemap works on a clone of the leaf. While using the leaf we do
         * expensive things, like checking if an extent is shared, which can
         * take a long time, and we don't want to block other tasks for too
         * long. The file range is locked in the inode's io tree, so none of
         * our file extent items can change. With a clone we don't block tasks
         * that want to insert items for other inodes in the same leaf, nor
         * b+tree rebalance operations (triggered for example when someone
         * tries to push items into this leaf while inserting an item in a
         * neighbour leaf).
         * The private clone is also needed because holding a read lock on an
         * extent buffer of the subvolume's b+tree makes lockdep unhappy when
         * we check if extents are shared, since backref walking may need to
         * lock the same leaf we are processing.
         */
        leaf_copy = btrfs_clone_extent_buffer(path->nodes[0]);
        if (!leaf_copy)
                return -ENOMEM;

        saved_slot = path->slots[0];
        btrfs_release_path(path);
        path->nodes[0] = leaf_copy;
        path->slots[0] = saved_slot;

        return 0;
}

/*
 * Process a range which is a hole or a prealloc extent in the inode's subvolume
 * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
 * extent. The end offset (@end) is inclusive.
 *
 * Delalloc ranges found inside the range are emitted as delalloc extents. For
 * a prealloc extent, the sections of it that have no delalloc are emitted as
 * unwritten extents.
 *
 * @inode:                  inode being processed
 * @fieinfo:                fiemap request, only its fi_extents_max field is
 *                          read here, the rest is up to emit_fiemap_extent()
 * @cache:                  fiemap cache passed to emit_fiemap_extent()
 * @delalloc_cached_state:  passed to btrfs_find_delalloc_in_range()
 * @backref_ctx:            passed to btrfs_is_data_extent_shared()
 * @disk_bytenr:            0 for a hole, otherwise the disk bytenr of the
 *                          prealloc extent
 * @extent_offset:          added to @disk_bytenr to compute the physical
 *                          address reported for @start
 * @extent_gen:             passed to btrfs_is_data_extent_shared()
 * @start:                  start offset of the range
 * @end:                    end offset of the range (inclusive)
 *
 * Returns 0 on success, otherwise the non-zero value returned by
 * emit_fiemap_extent() or the negative value returned by
 * btrfs_is_data_extent_shared().
 */
static int fiemap_process_hole(struct btrfs_inode *inode,
                               struct fiemap_extent_info *fieinfo,
                               struct fiemap_cache *cache,
                               struct extent_state **delalloc_cached_state,
                               struct btrfs_backref_share_check_ctx *backref_ctx,
                               u64 disk_bytenr, u64 extent_offset,
                               u64 extent_gen,
                               u64 start, u64 end)
{
        const u64 i_size = i_size_read(&inode->vfs_inode);
        u64 cur_offset = start;
        /* End offset (inclusive) of the last delalloc range found, 0 if none. */
        u64 last_delalloc_end = 0;
        u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
        bool checked_extent_shared = false;
        int ret;

        /*
         * There can be no delalloc past i_size, so don't waste time looking for
         * it beyond i_size.
         */
        while (cur_offset < end && cur_offset < i_size) {
                u64 delalloc_start;
                u64 delalloc_end;
                u64 prealloc_start;
                u64 prealloc_len = 0;
                bool delalloc;

                delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
                                                        delalloc_cached_state,
                                                        &delalloc_start,
                                                        &delalloc_end);
                if (!delalloc)
                        break;

                /*
                 * If this is a prealloc extent we have to report every section
                 * of it that has no delalloc.
                 */
                if (disk_bytenr != 0) {
                        if (last_delalloc_end == 0) {
                                prealloc_start = start;
                                prealloc_len = delalloc_start - start;
                        } else {
                                prealloc_start = last_delalloc_end + 1;
                                prealloc_len = delalloc_start - prealloc_start;
                        }
                }

                if (prealloc_len > 0) {
                        /*
                         * Check if the extent is shared only once, and only if
                         * fi_extents_max is not 0.
                         */
                        if (!checked_extent_shared && fieinfo->fi_extents_max) {
                                ret = btrfs_is_data_extent_shared(inode,
                                                                  disk_bytenr,
                                                                  extent_gen,
                                                                  backref_ctx);
                                if (ret < 0)
                                        return ret;
                                else if (ret > 0)
                                        prealloc_flags |= FIEMAP_EXTENT_SHARED;

                                checked_extent_shared = true;
                        }
                        ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
                                                 disk_bytenr + extent_offset,
                                                 prealloc_len, prealloc_flags);
                        if (ret)
                                return ret;
                        extent_offset += prealloc_len;
                }

                ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
                                         delalloc_end + 1 - delalloc_start,
                                         FIEMAP_EXTENT_DELALLOC |
                                         FIEMAP_EXTENT_UNKNOWN);
                if (ret)
                        return ret;

                last_delalloc_end = delalloc_end;
                cur_offset = delalloc_end + 1;
                /* Skip the delalloc range in the extent too. */
                extent_offset += cur_offset - delalloc_start;
                cond_resched();
        }

        /*
         * Either we found no delalloc for the whole prealloc extent or we have
         * a prealloc extent that spans i_size or starts at or after i_size.
         */
        if (disk_bytenr != 0 && last_delalloc_end < end) {
                u64 prealloc_start;
                u64 prealloc_len;

                if (last_delalloc_end == 0) {
                        prealloc_start = start;
                        prealloc_len = end + 1 - start;
                } else {
                        prealloc_start = last_delalloc_end + 1;
                        prealloc_len = end + 1 - prealloc_start;
                }

                if (!checked_extent_shared && fieinfo->fi_extents_max) {
                        ret = btrfs_is_data_extent_shared(inode,
                                                          disk_bytenr,
                                                          extent_gen,
                                                          backref_ctx);
                        if (ret < 0)
                                return ret;
                        else if (ret > 0)
                                prealloc_flags |= FIEMAP_EXTENT_SHARED;
                }
                ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
                                         disk_bytenr + extent_offset,
                                         prealloc_len, prealloc_flags);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * Find where the last non-hole file extent item of @inode ends.
 *
 * @inode:               inode being mapped
 * @path:                search path; left pointing into the tree on return
 * @last_extent_end_ret: set to the end offset of the last extent item that is
 *                       not a hole, or to 0 if the inode has no such item
 *
 * Returns 0 on success, a negative errno if a tree lookup failed.
 */
static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
                                          struct btrfs_path *path,
                                          u64 *last_extent_end_ret)
{
        struct btrfs_root *root = inode->root;
        const u64 ino = btrfs_ino(inode);
        struct btrfs_file_extent_item *fi;
        struct extent_buffer *eb;
        struct btrfs_key found_key;
        int ret;

        /*
         * Search for the highest possible file offset rather than i_size,
         * since preallocated extents may exist beyond i_size.
         */
        ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
        /* No file extent item can exist at offset (u64)-1. */
        ASSERT(ret != 0);
        if (ret < 0)
                return ret;

        /*
         * A search for a key that does not exist always ends at a slot > 0
         * unless the btree is empty, and it can not be empty: it holds at
         * least this inode's item and the items of the root inode 256.
         */
        ASSERT(path->slots[0] > 0);
        path->slots[0]--;
        eb = path->nodes[0];
        btrfs_item_key_to_cpu(eb, &found_key, path->slots[0]);
        /* The previous item is not a file extent item of this inode. */
        if (found_key.objectid != ino || found_key.type != BTRFS_EXTENT_DATA_KEY)
                goto no_extent;

        /*
         * Inline extents must be recognized first: for them the disk_bytenr
         * field is where the inline data begins, so it can not be used to
         * detect a hole (disk_bytenr == 0).
         */
        fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item);
        if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE)
                goto found;

        /*
         * Walk backwards past hole items (present when NO_HOLES is not
         * enabled). Because hole items are merged, this takes at most two
         * steps: one hole item at slot 0 of a leaf and another one as the
         * last item of the previous leaf.
         */
        while (btrfs_file_extent_disk_bytenr(eb, fi) == 0) {
                ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
                if (ret < 0)
                        return ret;
                /* Every file extent item of this inode is a hole. */
                if (ret > 0)
                        goto no_extent;

                eb = path->nodes[0];
                fi = btrfs_item_ptr(eb, path->slots[0],
                                    struct btrfs_file_extent_item);
        }

found:
        *last_extent_end_ret = btrfs_file_extent_end(path);
        return 0;

no_extent:
        *last_extent_end_ret = 0;
        return 0;
}

/*
 * Report the extent mapping of @inode for the byte range starting at @start
 * and spanning @len bytes.
 *
 * @inode:   inode to map
 * @fieinfo: fiemap request/result descriptor the extents are emitted into
 * @start:   start offset of the requested range (rounded down to sectorsize)
 * @len:     length of the requested range (end is rounded up to sectorsize)
 *
 * The rounded range is locked in the inode's io tree while the file extent
 * items are walked. Regular and inline extents are emitted directly, while
 * holes and prealloc extents go through fiemap_process_hole(). Extents are
 * staged in a local fiemap_cache and only flushed to @fieinfo after the range
 * has been unlocked.
 *
 * Returns -ENOMEM if a working allocation fails, -EINTR if a fatal signal is
 * pending, any negative errno returned by the helpers, and otherwise the
 * result of emit_last_fiemap_cache().
 */
int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
                  u64 start, u64 len)
{
        const u64 ino = btrfs_ino(inode);
        struct extent_state *cached_state = NULL;
        struct extent_state *delalloc_cached_state = NULL;
        struct btrfs_path *path;
        struct fiemap_cache cache = { 0 };
        struct btrfs_backref_share_check_ctx *backref_ctx;
        u64 last_extent_end;
        u64 prev_extent_end;
        u64 range_start;
        u64 range_end;
        const u64 sectorsize = inode->root->fs_info->sectorsize;
        bool stopped = false;
        int ret;

        /* Size the staging array so that it occupies one page. */
        cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry);
        cache.entries = kmalloc_array(cache.entries_size,
                                      sizeof(struct btrfs_fiemap_entry),
                                      GFP_KERNEL);
        backref_ctx = btrfs_alloc_backref_share_check_ctx();
        path = btrfs_alloc_path();
        /* All three are checked together; the out label frees whatever was allocated. */
        if (!cache.entries || !backref_ctx || !path) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * We come back here, with start/len advanced, each time the staging
         * cache had to be flushed (BTRFS_FIEMAP_FLUSH_CACHE) in the middle of
         * the walk.
         */
restart:
        range_start = round_down(start, sectorsize);
        range_end = round_up(start + len, sectorsize);
        prev_extent_end = range_start;

        lock_extent(&inode->io_tree, range_start, range_end, &cached_state);

        ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
        if (ret < 0)
                goto out_unlock;
        btrfs_release_path(path);

        path->reada = READA_FORWARD;
        ret = fiemap_search_slot(inode, path, range_start);
        if (ret < 0) {
                goto out_unlock;
        } else if (ret > 0) {
                /*
                 * No file extent item found, but we may have delalloc between
                 * the current offset and i_size. So check for that.
                 */
                ret = 0;
                goto check_eof_delalloc;
        }

        while (prev_extent_end < range_end) {
                struct extent_buffer *leaf = path->nodes[0];
                struct btrfs_file_extent_item *ei;
                struct btrfs_key key;
                u64 extent_end;
                u64 extent_len;
                u64 extent_offset = 0;
                u64 extent_gen;
                u64 disk_bytenr = 0;
                u64 flags = 0;
                int extent_type;
                u8 compression;

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                /* Walked past the last file extent item of this inode. */
                if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
                        break;

                extent_end = btrfs_file_extent_end(path);

                /*
                 * The first iteration can leave us at an extent item that ends
                 * before our range's start. Move to the next item.
                 */
                if (extent_end <= range_start)
                        goto next_item;

                backref_ctx->curr_leaf_bytenr = leaf->start;

                /* We have an implicit hole (NO_HOLES feature enabled). */
                if (prev_extent_end < key.offset) {
                        const u64 hole_end = min(key.offset, range_end) - 1;

                        ret = fiemap_process_hole(inode, fieinfo, &cache,
                                                  &delalloc_cached_state,
                                                  backref_ctx, 0, 0, 0,
                                                  prev_extent_end, hole_end);
                        if (ret < 0) {
                                goto out_unlock;
                        } else if (ret > 0) {
                                /* fiemap_fill_next_extent() told us to stop. */
                                stopped = true;
                                break;
                        }

                        /* We've reached the end of the fiemap range, stop. */
                        if (key.offset >= range_end) {
                                stopped = true;
                                break;
                        }
                }

                extent_len = extent_end - key.offset;
                ei = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                compression = btrfs_file_extent_compression(leaf, ei);
                extent_type = btrfs_file_extent_type(leaf, ei);
                extent_gen = btrfs_file_extent_generation(leaf, ei);

                /*
                 * disk_bytenr and extent_offset keep their 0 defaults for
                 * inline extents, and extent_offset also stays 0 for
                 * compressed extents.
                 */
                if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
                        if (compression == BTRFS_COMPRESS_NONE)
                                extent_offset = btrfs_file_extent_offset(leaf, ei);
                }

                if (compression != BTRFS_COMPRESS_NONE)
                        flags |= FIEMAP_EXTENT_ENCODED;

                if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                        flags |= FIEMAP_EXTENT_DATA_INLINE;
                        flags |= FIEMAP_EXTENT_NOT_ALIGNED;
                        ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
                                                 extent_len, flags);
                } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
                        ret = fiemap_process_hole(inode, fieinfo, &cache,
                                                  &delalloc_cached_state,
                                                  backref_ctx,
                                                  disk_bytenr, extent_offset,
                                                  extent_gen, key.offset,
                                                  extent_end - 1);
                } else if (disk_bytenr == 0) {
                        /* We have an explicit hole. */
                        ret = fiemap_process_hole(inode, fieinfo, &cache,
                                                  &delalloc_cached_state,
                                                  backref_ctx, 0, 0, 0,
                                                  key.offset, extent_end - 1);
                } else {
                        /* We have a regular extent. */
                        /* The shared check is skipped when fi_extents_max is 0. */
                        if (fieinfo->fi_extents_max) {
                                ret = btrfs_is_data_extent_shared(inode,
                                                                  disk_bytenr,
                                                                  extent_gen,
                                                                  backref_ctx);
                                if (ret < 0)
                                        goto out_unlock;
                                else if (ret > 0)
                                        flags |= FIEMAP_EXTENT_SHARED;
                        }

                        ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
                                                 disk_bytenr + extent_offset,
                                                 extent_len, flags);
                }

                if (ret < 0) {
                        goto out_unlock;
                } else if (ret > 0) {
                        /* emit_fiemap_extent() told us to stop. */
                        stopped = true;
                        break;
                }

                prev_extent_end = extent_end;
next_item:
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        goto out_unlock;
                }

                ret = fiemap_next_leaf_item(inode, path);
                if (ret < 0) {
                        goto out_unlock;
                } else if (ret > 0) {
                        /* No more file extent items for this inode. */
                        break;
                }
                cond_resched();
        }

check_eof_delalloc:
        /*
         * The walk ended before covering the whole range without being told
         * to stop: treat the remainder as a hole so any delalloc in it is
         * still reported.
         */
        if (!stopped && prev_extent_end < range_end) {
                ret = fiemap_process_hole(inode, fieinfo, &cache,
                                          &delalloc_cached_state, backref_ctx,
                                          0, 0, 0, prev_extent_end, range_end - 1);
                if (ret < 0)
                        goto out_unlock;
                prev_extent_end = range_end;
        }

        /*
         * The cached extent reaches the end of the last file extent item, so
         * flag it as the last one unless delalloc exists between the end of
         * what we processed and i_size.
         */
        if (cache.cached && cache.offset + cache.len >= last_extent_end) {
                const u64 i_size = i_size_read(&inode->vfs_inode);

                if (prev_extent_end < i_size) {
                        u64 delalloc_start;
                        u64 delalloc_end;
                        bool delalloc;

                        delalloc = btrfs_find_delalloc_in_range(inode,
                                                                prev_extent_end,
                                                                i_size - 1,
                                                                &delalloc_cached_state,
                                                                &delalloc_start,
                                                                &delalloc_end);
                        if (!delalloc)
                                cache.flags |= FIEMAP_EXTENT_LAST;
                } else {
                        cache.flags |= FIEMAP_EXTENT_LAST;
                }
        }

out_unlock:
        unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);

        if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
                /*
                 * The staging cache must be flushed before we can continue:
                 * drop the path, flush, then resume the walk from where the
                 * cache says the next search should start.
                 */
                btrfs_release_path(path);
                ret = flush_fiemap_cache(fieinfo, &cache);
                if (ret)
                        goto out;
                len -= cache.next_search_offset - start;
                start = cache.next_search_offset;
                goto restart;
        } else if (ret < 0) {
                goto out;
        }

        /*
         * Must free the path before emitting to the fiemap buffer because we
         * may have a non-cloned leaf and if the fiemap buffer is memory mapped
         * to a file, a write into it (through btrfs_page_mkwrite()) may trigger
         * waiting for an ordered extent that in order to complete needs to
         * modify that leaf, therefore leading to a deadlock.
         */
        btrfs_free_path(path);
        /* Cleared so the out label below does not free the path a second time. */
        path = NULL;

        ret = flush_fiemap_cache(fieinfo, &cache);
        if (ret)
                goto out;

        ret = emit_last_fiemap_cache(fieinfo, &cache);
out:
        free_extent_state(delalloc_cached_state);
        kfree(cache.entries);
        btrfs_free_backref_share_ctx(backref_ctx);
        btrfs_free_path(path);
        return ret;
}

/*
 * Hand the extent_buffer structure itself back to extent_buffer_cache.
 *
 * Only the structure is freed here; nothing is done about the folios attached
 * to @eb. See btrfs_release_extent_buffer() for the complete teardown.
 */
static void __free_extent_buffer(struct extent_buffer *eb)
{
        kmem_cache_free(extent_buffer_cache, eb);
}

/*
 * Tell whether @eb has its EXTENT_BUFFER_WRITEBACK or EXTENT_BUFFER_DIRTY
 * flag set.
 *
 * Returns 1 if at least one of the two flags is set, 0 otherwise.
 */
static int extent_buffer_under_io(const struct extent_buffer *eb)
{
        if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
                return 1;
        if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
                return 1;
        return 0;
}

/*
 * Check whether the subpage private data of @folio is still in use.
 *
 * Must be called with folio->mapping->i_private_lock held.
 *
 * Returns true if the folio has private data whose eb_refs or readers counter
 * is non-zero, false otherwise (including when there is no private data).
 */
static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio)
{
        struct btrfs_subpage *sp;

        lockdep_assert_held(&folio->mapping->i_private_lock);

        if (!folio_test_private(folio))
                return false;

        sp = folio_get_private(folio);
        /*
         * Even with no eb refs left, an end_page_read() call may still rely
         * on page::private, so pending readers also keep the data in use.
         */
        return atomic_read(&sp->eb_refs) || atomic_read(&sp->readers);
}

/*
 * Break the association between @eb and the private data of @folio.
 *
 * For a mapped eb (EXTENT_BUFFER_UNMAPPED not set) the folio private is
 * changed under folio->mapping->i_private_lock. For an unmapped eb no lock is
 * taken.
 */
static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio)
{
        const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
        struct btrfs_fs_info *fs_info = eb->fs_info;

        /* Changing the folio private of a mapped eb requires i_private_lock. */
        if (mapped)
                spin_lock(&folio->mapping->i_private_lock);

        /* Nothing attached, nothing to detach. */
        if (!folio_test_private(folio))
                goto unlock;

        if (fs_info->nodesize >= PAGE_SIZE) {
                /*
                 * The pages are removed after the eb has been removed from
                 * the radix tree, so this folio may already have been
                 * attached to a new eb. Only clear the private if it still
                 * points at this eb.
                 */
                if (folio_test_private(folio) && folio_get_private(folio) == eb) {
                        BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
                        BUG_ON(folio_test_dirty(folio));
                        BUG_ON(folio_test_writeback(folio));
                        /* Still ours, not attached to a new eb: detach it. */
                        folio_detach_private(folio);
                }
                goto unlock;
        }

        /*
         * Subpage case with a dummy eb: its folio is attached to that single
         * dummy eb only, so the private can be detached directly. No lock
         * was taken above, hence the plain return.
         */
        if (!mapped) {
                btrfs_detach_subpage(fs_info, folio);
                return;
        }

        btrfs_folio_dec_eb_refs(fs_info, folio);

        /*
         * The folio private can only go away once no other eb lives in the
         * page range and no IO is unfinished.
         */
        if (!folio_range_has_eb(fs_info, folio))
                btrfs_detach_subpage(fs_info, folio);

unlock:
        if (mapped)
                spin_unlock(&folio->mapping->i_private_lock);
}

/*
 * Detach and drop every folio attached to @eb.
 *
 * The eb must not have its dirty or writeback flag set. All
 * INLINE_EXTENT_BUFFER_PAGES slots are visited and empty ones are ignored.
 */
static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
{
        ASSERT(!extent_buffer_under_io(eb));

        for (int slot = 0; slot < INLINE_EXTENT_BUFFER_PAGES; slot++) {
                struct folio *folio = eb->folios[slot];

                if (folio) {
                        detach_extent_buffer_folio(eb, folio);
                        /* Drop the reference taken when the folio was allocated. */
                        folio_put(folio);
                }
        }
}

/*
 * Helper for releasing the extent buffer.
 *
 * Performs the teardown in three steps: release the folios attached to @eb,
 * remove @eb from the leak debugging bookkeeping (btrfs_leak_debug_del_eb())
 * and finally free the extent_buffer structure itself. @eb must not be used
 * after this call.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
        btrfs_release_extent_buffer_pages(eb);
        btrfs_leak_debug_del_eb(eb);
        __free_extent_buffer(eb);
}

/*
 * Allocate a zeroed extent_buffer and set up its basic fields.
 *
 * @fs_info: filesystem the buffer belongs to
 * @start:   logical start of the buffer
 * @len:     length of the buffer, at most BTRFS_MAX_METADATA_BLOCKSIZE
 *
 * The allocation uses __GFP_NOFAIL. The returned buffer has a reference count
 * of 1, is registered with the leak debugging code and has no folios
 * attached yet.
 */
static struct extent_buffer *
__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
                      unsigned long len)
{
        struct extent_buffer *eb;

        eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS | __GFP_NOFAIL);

        eb->fs_info = fs_info;
        eb->start = start;
        eb->len = len;
        init_rwsem(&eb->lock);

        btrfs_leak_debug_add_eb(eb);

        /* The caller owns the initial reference. */
        spin_lock_init(&eb->refs_lock);
        atomic_set(&eb->refs, 1);

        ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);

        return eb;
}

/*
 * Create an unmapped copy of @src.
 *
 * The clone has the same fs_info, start and len as @src, gets its own folios,
 * receives a full copy of the contents of @src and is marked uptodate.
 *
 * Returns the new extent buffer, or NULL if allocating or attaching its
 * folios failed.
 */
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
        const int num_folios = num_extent_folios(src);
        struct extent_buffer *clone;

        clone = __alloc_extent_buffer(src->fs_info, src->start, src->len);
        if (!clone)
                return NULL;

        /*
         * EXTENT_BUFFER_UNMAPPED has to be set before any call to
         * btrfs_release_extent_buffer(), which behaves differently for an
         * unmapped subpage extent buffer.
         */
        set_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags);

        if (alloc_eb_folio_array(clone, 0))
                goto fail;

        for (int i = 0; i < num_folios; i++) {
                struct folio *folio = clone->folios[i];

                if (attach_extent_buffer_folio(clone, folio, NULL) < 0)
                        goto fail;
                WARN_ON(folio_test_dirty(folio));
        }

        copy_extent_buffer_full(clone, src);
        set_extent_buffer_uptodate(clone);

        return clone;

fail:
        btrfs_release_extent_buffer(clone);
        return NULL;
}

/*
 * Allocate a dummy (unmapped) extent buffer.
 *
 * @fs_info: filesystem the buffer belongs to
 * @start:   logical start of the buffer
 * @len:     length of the buffer
 *
 * The buffer gets its own folios, none of which is added to a mapping here. It
 * is marked uptodate, its header item count is set to 0 and
 * EXTENT_BUFFER_UNMAPPED is set in its flags.
 *
 * Returns the new extent buffer, or NULL if allocating or attaching its
 * folios failed.
 */
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
                                                  u64 start, unsigned long len)
{
        struct extent_buffer *eb;
        int num_folios;
        int ret;

        eb = __alloc_extent_buffer(fs_info, start, len);
        if (!eb)
                return NULL;

        /*
         * Set UNMAPPED before anything can fail, as
         * btrfs_clone_extent_buffer() does. The error path below goes through
         * detach_extent_buffer_folio(), which for a buffer without this flag
         * takes folio->mapping->i_private_lock, and no folio is added to a
         * mapping in this function.
         */
        set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);

        ret = alloc_eb_folio_array(eb, 0);
        if (ret)
                goto err;

        num_folios = num_extent_folios(eb);
        for (int i = 0; i < num_folios; i++) {
                ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
                if (ret < 0)
                        goto err;
        }

        set_extent_buffer_uptodate(eb);
        btrfs_set_header_nritems(eb, 0);

        return eb;
err:
        /*
         * Use the common teardown: it detaches every folio that got attached,
         * drops the allocation reference with folio_put(), removes the buffer
         * from the leak debugging bookkeeping it was registered with by
         * __alloc_extent_buffer() and frees the structure.
         */
        btrfs_release_extent_buffer(eb);
        return NULL;
}

/*
 * Allocate a dummy extent buffer of fs_info->nodesize bytes at @start.
 *
 * Thin wrapper around __alloc_dummy_extent_buffer(); returns whatever that
 * function returns, i.e. the new extent buffer or NULL on failure.
 */
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
                                                u64 start)
{
        return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
}

/*
 * Make sure @eb holds its tree reference: if EXTENT_BUFFER_TREE_REF is not
 * set, set it and take the reference that goes with it.
 */
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
        /*
         * EXTENT_BUFFER_TREE_REF is first set when the extent_buffer is added
         * to the radix tree, and it is set again, if found unset, when
         * find_extent_buffer creates a new reference.
         *
         * There are only two places that clear it: dropping the last
         * non-tree reference of an extent_buffer whose STALE bit is set, and
         * release_folio being called while the tree reference is the only
         * one left.
         *
         * Both make sure that the pages of the extent_buffer are not under
         * io. But release_folio may run concurrently with the creation of
         * new references, so the calls to check_buffer_tree_ref in those
         * codepaths can race with try_release_extent_buffer clearing
         * TREE_REF.
         *
         * The refcount adequately protects the lifetime of the extent_buffer
         * in the radix tree, but not the TREE_REF bit and the reference that
         * belongs to it. To guard against this class of races,
         * check_buffer_tree_ref is called from the codepaths that trigger
         * io. Once io has been initiated TREE_REF can not be cleared anymore,
         * which makes that the best moment to fix any such race.
         */

        /* Fast path: more than one reference and the tree ref is in place. */
        if (atomic_read(&eb->refs) >= 2 &&
            test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
                return;

        spin_lock(&eb->refs_lock);
        if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
                atomic_inc(&eb->refs);
        spin_unlock(&eb->refs_lock);
}

/*
 * Ensure @eb holds its tree reference and mark each of its folios as
 * accessed.
 */
static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
        const int nr_folios = num_extent_folios(eb);
        int idx;

        check_buffer_tree_ref(eb);

        for (idx = 0; idx < nr_folios; idx++) {
                struct folio *folio = eb->folios[idx];

                folio_mark_accessed(folio);
        }
}

/*
 * Look up the extent buffer starting at @start.
 *
 * @fs_info: filesystem to search
 * @start:   logical start of the wanted extent buffer
 *
 * Returns the extent buffer found by find_extent_buffer_nolock(), after
 * making sure it has its tree reference and marking its folios accessed, or
 * NULL if there is none.
 */
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
                                         u64 start)
{
        struct extent_buffer *eb;

        eb = find_extent_buffer_nolock(fs_info, start);
        if (!eb)
                return NULL;
        /*
         * Lock our eb's refs_lock to avoid races with free_extent_buffer().
         * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
         * another task running free_extent_buffer() might have seen that flag
         * set, eb->refs == 2, that the buffer isn't under IO (dirty and
         * writeback flags not set) and it's still in the tree (flag
         * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
         * decrementing the extent buffer's reference count twice.  So here we
         * could race and increment the eb's reference count, clear its stale
         * flag, mark it as dirty and drop our reference before the other task
         * finishes executing free_extent_buffer, which would later result in
         * an attempt to free an extent buffer that is dirty.
         */
        if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
                /*
                 * The empty critical section is intentional: taking and
                 * releasing the lock is only used to wait for a task that
                 * currently holds it.
                 */
                spin_lock(&eb->refs_lock);
                spin_unlock(&eb->refs_lock);
        }
        mark_extent_buffer_accessed(eb);
        return eb;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Get the extent buffer at @start for the sanity tests, creating a dummy one
 * and inserting it into fs_info->buffer_radix if none exists yet.
 *
 * Returns the extent buffer that is in the radix tree for @start (either a
 * pre-existing one or the newly inserted one), ERR_PTR(-ENOMEM) if the dummy
 * buffer could not be allocated, or the error from radix_tree_preload() as an
 * ERR_PTR.
 */
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
                                        u64 start)
{
        struct extent_buffer *eb;
        struct extent_buffer *exists;
        int ret;

        eb = find_extent_buffer(fs_info, start);
        if (eb)
                return eb;

        eb = alloc_dummy_extent_buffer(fs_info, start);
        if (!eb)
                return ERR_PTR(-ENOMEM);
        eb->fs_info = fs_info;

        for (;;) {
                ret = radix_tree_preload(GFP_NOFS);
                if (ret) {
                        exists = ERR_PTR(ret);
                        goto free_eb;
                }

                spin_lock(&fs_info->buffer_lock);
                ret = radix_tree_insert(&fs_info->buffer_radix,
                                        start >> fs_info->sectorsize_bits, eb);
                spin_unlock(&fs_info->buffer_lock);
                radix_tree_preload_end();

                if (ret != -EEXIST)
                        break;

                /*
                 * Somebody else inserted a buffer for this start first. Use
                 * theirs if it can still be found, otherwise try to insert
                 * ours again.
                 */
                exists = find_extent_buffer(fs_info, start);
                if (exists)
                        goto free_eb;
        }

        check_buffer_tree_ref(eb);
        set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

        return eb;

free_eb:
        btrfs_release_extent_buffer(eb);
        return exists;
}
#endif

/*
 * Try to take a reference on the extent buffer already attached to @page.
 *
 * Must be called with page->mapping->i_private_lock held.
 *
 * Returns the attached extent buffer with its reference count raised, or NULL
 * when there is nothing to reuse. In the latter case a stale private pointer,
 * if any, has been detached from the folio.
 */
static struct extent_buffer *grab_extent_buffer(
                struct btrfs_fs_info *fs_info, struct page *page)
{
        struct folio *folio = page_folio(page);
        struct extent_buffer *attached;

        lockdep_assert_held(&page->mapping->i_private_lock);

        /*
         * In the subpage case the radix tree alone guarantees that no two ebs
         * are inserted for the same bytenr, so there is never anything to
         * grab here.
         */
        if (fs_info->nodesize < PAGE_SIZE)
                return NULL;

        /* No extent buffer has been attached to this page yet. */
        if (!folio_test_private(folio))
                return NULL;

        /*
         * An eb may already have been allocated and attached to this page.
         * If a reference can still be taken on it the eb is good and gets
         * returned, otherwise the folio private can simply be overwritten.
         */
        attached = folio_get_private(folio);
        if (!atomic_inc_not_zero(&attached->refs)) {
                WARN_ON(PageDirty(page));
                folio_detach_private(folio);
                return NULL;
        }

        return attached;
}

/*
 * Validate the start offset of a tree block.
 *
 * @fs_info: filesystem the tree block belongs to
 * @start:   logical start of the tree block
 *
 * Returns -EINVAL, after logging an error, if @start is not sectorsize
 * aligned, if a tree block smaller than a page would cross a page boundary,
 * or if a tree block of at least page size is not page aligned. Returns 0
 * otherwise. A start that is not nodesize aligned is accepted, but a warning
 * is logged the first time BTRFS_FS_UNALIGNED_TREE_BLOCK gets set.
 */
static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
{
        if (!IS_ALIGNED(start, fs_info->sectorsize)) {
                btrfs_err(fs_info, "bad tree block start %llu", start);
                return -EINVAL;
        }

        if (fs_info->nodesize < PAGE_SIZE) {
                if (offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
                        btrfs_err(fs_info,
                "tree block crosses page boundary, start %llu nodesize %u",
                                  start, fs_info->nodesize);
                        return -EINVAL;
                }
        } else if (!PAGE_ALIGNED(start)) {
                btrfs_err(fs_info,
                "tree block is not page aligned, start %llu nodesize %u",
                          start, fs_info->nodesize);
                return -EINVAL;
        }

        if (IS_ALIGNED(start, fs_info->nodesize))
                return 0;

        /* Not fatal: only warn, and only once per filesystem flag. */
        if (!test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags))
                btrfs_warn(fs_info,
"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance",
                              start, fs_info->nodesize);

        return 0;
}


/*
 * Insert eb->folios[i] into the btree inode's page cache and attach it to
 * @eb, or reuse a folio that is already in the page cache at that index.
 *
 * @eb:           extent buffer under construction
 * @i:            index into eb->folios[] of the folio to attach
 * @prealloc:     preallocated subpage structure handed on to
 *                attach_extent_buffer_folio()
 * @found_eb_ret: set to the already existing extent buffer when >0 is returned
 *
 * Return 0 if eb->folios[i] is attached to btree inode successfully.
 * Return >0 if there is already another extent buffer for the range,
 * and @found_eb_ret would be updated.
 * Return -EAGAIN if the filemap has an existing folio but with different size
 * than @eb.
 * The caller needs to free the existing folios and retry using the same order.
 *
 * NOTE(review): when an existing folio is reused (return 0), it is not
 * unlocked here after filemap_lock_folio(); presumably the caller unlocks it,
 * confirm against the caller.
 */
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
                                      struct btrfs_subpage *prealloc,
                                      struct extent_buffer **found_eb_ret)
{

        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct address_space *mapping = fs_info->btree_inode->i_mapping;
        const unsigned long index = eb->start >> PAGE_SHIFT;
        struct folio *existing_folio = NULL;
        int ret;

        ASSERT(found_eb_ret);

        /* Caller should ensure the folio exists. */
        ASSERT(eb->folios[i]);

retry:
        ret = filemap_add_folio(mapping, eb->folios[i], index + i,
                                GFP_NOFS | __GFP_NOFAIL);
        if (!ret)
                goto finish;

        /* Adding failed, so look for a folio already cached at this index. */
        existing_folio = filemap_lock_folio(mapping, index + i);
        /*
         * The folio that made the insertion fail could not be looked up, so
         * it was only in the page cache for a very short time, just retry.
         */
        if (IS_ERR(existing_folio)) {
                existing_folio = NULL;
                goto retry;
        }

        /* For now, we should only have single-page folios for btree inode. */
        ASSERT(folio_nr_pages(existing_folio) == 1);

        if (folio_size(existing_folio) != eb->folio_size) {
                folio_unlock(existing_folio);
                folio_put(existing_folio);
                return -EAGAIN;
        }

finish:
        spin_lock(&mapping->i_private_lock);
        if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
                /* We're going to reuse the existing page, can drop our folio now. */
                __free_page(folio_page(eb->folios[i], 0));
                eb->folios[i] = existing_folio;
        } else if (existing_folio) {
                struct extent_buffer *existing_eb;

                existing_eb = grab_extent_buffer(fs_info,
                                                 folio_page(existing_folio, 0));
                if (existing_eb) {
                        /* The extent buffer still exists, we can use it directly. */
                        *found_eb_ret = existing_eb;
                        spin_unlock(&mapping->i_private_lock);
                        folio_unlock(existing_folio);
                        folio_put(existing_folio);
                        return 1;
                }
                /* The extent buffer no longer exists, we can reuse the folio. */
                __free_page(folio_page(eb->folios[i], 0));
                eb->folios[i] = existing_folio;
        }
        /* Record the geometry of the folio that ended up in the slot. */
        eb->folio_size = folio_size(eb->folios[i]);
        eb->folio_shift = folio_shift(eb->folios[i]);
        /* Should not fail, as we have preallocated the memory. */
        ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc);
        ASSERT(!ret);
        /*
         * To inform we have an extra eb under allocation, so that
         * detach_extent_buffer_folio() won't release the folio private when
         * the eb hasn't been inserted into radix tree yet.
         *
         * The ref will be decreased when the eb releases the folio, in
         * detach_extent_buffer_folio().  Thus needs no special handling in the
         * error path.
         */
        btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]);
        spin_unlock(&mapping->i_private_lock);
        return 0;
}

/*
 * Look up or create the extent buffer for the tree block at @start.
 *
 * @fs_info:    filesystem the buffer belongs to; its nodesize is used as the
 *              length of the buffer
 * @start:      logical address of the tree block, validated with
 *              check_eb_alignment()
 * @owner_root: objectid of the owning root, only used here to choose the
 *              lockdep class (reloc trees are treated like fs trees)
 * @level:      tree level, passed on to btrfs_set_buffer_lockdep_class()
 *
 * Returns an extent buffer on success: either one that already existed (found
 * up front, reported by attach_eb_folio_to_filemap(), or found after losing
 * the radix tree insertion race) or the newly inserted one.  On failure an
 * ERR_PTR() is returned: -EINVAL when check_eb_alignment() rejects @start,
 * -EOVERFLOW on 32bit when @start is beyond the page cache limit, -ENOMEM
 * when the buffer itself cannot be allocated, or the error of the failing
 * preallocation/preload step.
 */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
                                          u64 start, u64 owner_root, int level)
{
        unsigned long len = fs_info->nodesize;
        int num_folios;
        /* Number of folios already attached, i.e. what the error path must detach. */
        int attached = 0;
        struct extent_buffer *eb;
        struct extent_buffer *existing_eb = NULL;
        struct btrfs_subpage *prealloc = NULL;
        u64 lockdep_owner = owner_root;
        bool page_contig = true;
        int uptodate = 1;
        int ret;

        if (check_eb_alignment(fs_info, start))
                return ERR_PTR(-EINVAL);

#if BITS_PER_LONG == 32
        if (start >= MAX_LFS_FILESIZE) {
                btrfs_err_rl(fs_info,
                "extent buffer %llu is beyond 32bit page cache limit", start);
                btrfs_err_32bit_limit(fs_info);
                return ERR_PTR(-EOVERFLOW);
        }
        if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
                btrfs_warn_32bit_limit(fs_info);
#endif

        /* Nothing to build if a buffer for @start can already be found. */
        eb = find_extent_buffer(fs_info, start);
        if (eb)
                return eb;

        eb = __alloc_extent_buffer(fs_info, start, len);
        if (!eb)
                return ERR_PTR(-ENOMEM);

        /*
         * The reloc trees are just snapshots, so we need them to appear to be
         * just like any other fs tree WRT lockdep.
         */
        if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
                lockdep_owner = BTRFS_FS_TREE_OBJECTID;

        btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);

        /*
         * Preallocate folio private for subpage case, so that we won't
         * allocate memory with i_private_lock nor page lock held.
         *
         * The memory will be freed by attach_extent_buffer_page() or freed
         * manually if we exit earlier.
         */
        if (fs_info->nodesize < PAGE_SIZE) {
                prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
                if (IS_ERR(prealloc)) {
                        ret = PTR_ERR(prealloc);
                        goto out;
                }
        }

reallocate:
        /* Allocate all pages first. */
        ret = alloc_eb_folio_array(eb, __GFP_NOFAIL);
        if (ret < 0) {
                btrfs_free_subpage(prealloc);
                goto out;
        }

        num_folios = num_extent_folios(eb);
        /* Attach all pages to the filemap. */
        for (int i = 0; i < num_folios; i++) {
                struct folio *folio;

                ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
                /* A positive return means a live eb already owns this range. */
                if (ret > 0) {
                        ASSERT(existing_eb);
                        goto out;
                }

                /*
                 * TODO: Special handling for a corner case where the order of
                 * folios mismatch between the new eb and filemap.
                 *
                 * This happens when:
                 *
                 * - the new eb is using higher order folio
                 *
                 * - the filemap is still using 0-order folios for the range
                 *   This can happen at the previous eb allocation, and we don't
                 *   have higher order folio for the call.
                 *
                 * - the existing eb has already been freed
                 *
                 * In this case, we have to free the existing folios first, and
                 * re-allocate using the same order.
                 * Thankfully this is not going to happen yet, as we're still
                 * using 0-order folios.
                 */
                if (unlikely(ret == -EAGAIN)) {
                        ASSERT(0);
                        goto reallocate;
                }
                attached++;

                /*
                 * Only after attach_eb_folio_to_filemap(), eb->folios[] is
                 * reliable, as we may choose to reuse the existing page cache
                 * and free the allocated page.
                 */
                folio = eb->folios[i];
                WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));

                /*
                 * Check if the current page is physically contiguous with previous eb
                 * page.
                 * At this stage, either we allocated a large folio, thus @i
                 * would only be 0, or we fall back to per-page allocation.
                 */
                if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
                        page_contig = false;

                if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len))
                        uptodate = 0;

                /*
                 * We can't unlock the pages just yet since the extent buffer
                 * hasn't been properly inserted in the radix tree, this
                 * opens a race with btree_release_folio which can free a page
                 * while we are still filling in all pages for the buffer and
                 * we could crash.
                 */
        }
        /* The eb is only uptodate if every folio range backing it is. */
        if (uptodate)
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
        /* All pages are physically contiguous, can skip cross page handling. */
        if (page_contig)
                eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
again:
        ret = radix_tree_preload(GFP_NOFS);
        if (ret)
                goto out;

        spin_lock(&fs_info->buffer_lock);
        ret = radix_tree_insert(&fs_info->buffer_radix,
                                start >> fs_info->sectorsize_bits, eb);
        spin_unlock(&fs_info->buffer_lock);
        radix_tree_preload_end();
        if (ret == -EEXIST) {
                /*
                 * Another eb occupies the slot.  Use it if it can still be
                 * found, otherwise retry the insertion.
                 */
                ret = 0;
                existing_eb = find_extent_buffer(fs_info, start);
                if (existing_eb)
                        goto out;
                else
                        goto again;
        }
        /* add one reference for the tree */
        check_buffer_tree_ref(eb);
        set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

        /*
         * Now it's safe to unlock the pages because any calls to
         * btree_release_folio will correctly detect that a page belongs to a
         * live buffer and won't free them prematurely.
         */
        for (int i = 0; i < num_folios; i++)
                unlock_page(folio_page(eb->folios[i], 0));
        return eb;

out:
        /* The new eb is expected to hold exactly one reference at this point. */
        WARN_ON(!atomic_dec_and_test(&eb->refs));

        /*
         * Any attached folios need to be detached before we unlock them.  This
         * is because when we're inserting our new folios into the mapping, and
         * then attaching our eb to that folio.  If we fail to insert our folio
         * we'll lookup the folio for that index, and grab that EB.  We do not
         * want that to grab this eb, as we're getting ready to free it.  So we
         * have to detach it first and then unlock it.
         *
         * We have to drop our reference and NULL it out here because in the
         * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb.
         * Below when we call btrfs_release_extent_buffer() we will call
         * detach_extent_buffer_folio() on our remaining pages in the !subpage
         * case.  If we left eb->folios[i] populated in the subpage case we'd
         * double put our reference and be super sad.
         */
        for (int i = 0; i < attached; i++) {
                ASSERT(eb->folios[i]);
                detach_extent_buffer_folio(eb, eb->folios[i]);
                unlock_page(folio_page(eb->folios[i], 0));
                folio_put(eb->folios[i]);
                eb->folios[i] = NULL;
        }
        /*
         * Now all pages of that extent buffer are unmapped, set UNMAPPED flag,
         * so it can be cleaned up without utilizing page->mapping.
         */
        set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);

        btrfs_release_extent_buffer(eb);
        if (ret < 0)
                return ERR_PTR(ret);
        /* Not an error, so we must be handing back somebody else's eb. */
        ASSERT(existing_eb);
        return existing_eb;
}

/*
 * RCU callback used by release_extent_buffer(): free the extent buffer that
 * embeds @head as its rcu_head member.
 */
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
        __free_extent_buffer(container_of(head, struct extent_buffer, rcu_head));
}

/*
 * Drop one reference on @eb and free the buffer if it was the last one.
 *
 * Must be called with eb->refs_lock held; the lock is always released before
 * returning.
 *
 * Returns 1 when the last reference was dropped and the buffer was freed (or
 * handed to RCU for freeing), 0 when other references remain.
 */
static int release_extent_buffer(struct extent_buffer *eb)
        __releases(&eb->refs_lock)
{
        bool was_in_tree;

        lockdep_assert_held(&eb->refs_lock);

        WARN_ON(atomic_read(&eb->refs) == 0);

        /* Somebody else still holds a reference: just drop the lock. */
        if (!atomic_dec_and_test(&eb->refs)) {
                spin_unlock(&eb->refs_lock);
                return 0;
        }

        /* Claim the IN_TREE flag while still under refs_lock. */
        was_in_tree = test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
        spin_unlock(&eb->refs_lock);

        if (was_in_tree) {
                struct btrfs_fs_info *fs_info = eb->fs_info;

                spin_lock(&fs_info->buffer_lock);
                radix_tree_delete(&fs_info->buffer_radix,
                                  eb->start >> fs_info->sectorsize_bits);
                spin_unlock(&fs_info->buffer_lock);
        }

        btrfs_leak_debug_del_eb(eb);
        /* No references are left, so the pages can be released now. */
        btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
        /* Unmapped buffers are freed right away instead of through RCU. */
        if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
                __free_extent_buffer(eb);
                return 1;
        }
#endif
        call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
        return 1;
}

/*
 * Drop the caller's reference on @eb.  A NULL @eb is ignored.
 *
 * While the reference count is high enough the reference is dropped locklessly
 * with a compare-and-exchange.  Once the count is down to 3 or less for a
 * buffer without EXTENT_BUFFER_UNMAPPED, or down to 1 for a buffer with it,
 * the drop is done under eb->refs_lock through release_extent_buffer(), which
 * also releases the lock.
 */
void free_extent_buffer(struct extent_buffer *eb)
{
        int refs;
        if (!eb)
                return;

        refs = atomic_read(&eb->refs);
        while (1) {
                if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
                    || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
                        refs == 1))
                        break;
                /* On failure @refs is refreshed with the current count and we retry. */
                if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
                        return;
        }

        spin_lock(&eb->refs_lock);
        /*
         * With exactly two references left on a stale buffer that is not
         * under IO, also give up the reference tracked by the TREE_REF flag
         * (if the flag is still set).
         */
        if (atomic_read(&eb->refs) == 2 &&
            test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
            !extent_buffer_under_io(eb) &&
            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
                atomic_dec(&eb->refs);

        /*
         * I know this is terrible, but it's temporary until we stop tracking
         * the uptodate bits and such for the extent buffers.
         */
        release_extent_buffer(eb);
}

/*
 * Mark @eb as stale and drop the caller's reference.  A NULL @eb is ignored.
 *
 * If exactly two references are left, the buffer is not under IO and the
 * TREE_REF flag is still set, the reference tracked by that flag is dropped
 * as well.  eb->refs_lock is taken here and released by
 * release_extent_buffer().
 */
void free_extent_buffer_stale(struct extent_buffer *eb)
{
        if (!eb)
                return;

        spin_lock(&eb->refs_lock);
        set_bit(EXTENT_BUFFER_STALE, &eb->bflags);

        if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb)) {
                if (test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
                        atomic_dec(&eb->refs);
        }

        release_extent_buffer(eb);
}

static void btree_clear_folio_dirty(struct folio *folio)
{
        ASSERT(folio_test_dirty(folio));
        ASSERT(folio_test_locked(folio));
        folio_clear_dirty_for_io(folio);
        xa_lock_irq(&folio->mapping->i_pages);
        if (!folio_test_dirty(folio))
                __xa_clear_mark(&folio->mapping->i_pages,
                                folio_index(folio), PAGECACHE_TAG_DIRTY);
        xa_unlock_irq(&folio->mapping->i_pages);
}

/*
 * Subpage variant of clearing an extent buffer's dirty state.
 *
 * Clears the dirty state of the eb's range inside its first folio and, when
 * btrfs_subpage_clear_and_test_dirty() reports true (presumably meaning no
 * dirty range is left in the folio), clears the folio level dirty state too.
 */
static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
{
        struct folio *folio = eb->folios[0];

        /* btree_clear_folio_dirty() requires the folio to be locked. */
        folio_lock(folio);
        if (btrfs_subpage_clear_and_test_dirty(eb->fs_info, folio,
                                               eb->start, eb->len))
                btree_clear_folio_dirty(folio);
        folio_unlock(folio);

        WARN_ON(atomic_read(&eb->refs) == 0);
}

/*
 * Clear the dirty state of an extent buffer and of the folios backing it.
 *
 * @trans: optional transaction handle; when given, the buffer is left
 *         untouched unless its header generation equals trans->transid
 * @eb:    the extent buffer, which must be tree write locked
 *
 * On zoned filesystems a dirty buffer is not cleaned but flagged with
 * EXTENT_BUFFER_ZONED_ZEROOUT instead.  Otherwise, if the buffer was dirty,
 * its length is subtracted from fs_info->dirty_metadata_bytes and the dirty
 * state of its folios is cleared.
 */
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
                              struct extent_buffer *eb)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        int num_folios;

        btrfs_assert_tree_write_locked(eb);

        if (trans && btrfs_header_generation(eb) != trans->transid)
                return;

        /*
         * Instead of clearing the dirty flag off of the buffer, mark it as
         * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve
         * write-ordering in zoned mode, without the need to later re-dirty
         * the extent_buffer.
         *
         * The actual zeroout of the buffer will happen later in
         * btree_csum_one_bio.
         */
        if (btrfs_is_zoned(fs_info) && test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
                set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags);
                return;
        }

        if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
                return;

        /*
         * eb->len is an unsigned field (it is printed with %u elsewhere in
         * this file), so negating it directly would wrap around in its own
         * type before being widened to the signed 64bit amount.  Convert to a
         * signed 64bit value first so that the counter is really decremented.
         */
        percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
                                 -(long long)eb->len,
                                 fs_info->dirty_metadata_batch);

        /* Subpage buffers share their folio with others, handle separately. */
        if (fs_info->nodesize < PAGE_SIZE) {
                clear_subpage_extent_buffer_dirty(eb);
                return;
        }

        num_folios = num_extent_folios(eb);
        for (int i = 0; i < num_folios; i++) {
                struct folio *folio = eb->folios[i];

                if (!folio_test_dirty(folio))
                        continue;
                /* btree_clear_folio_dirty() needs the folio locked. */
                folio_lock(folio);
                btree_clear_folio_dirty(folio);
                folio_unlock(folio);
        }
        WARN_ON(atomic_read(&eb->refs) == 0);
}

/*
 * Mark @eb dirty.
 *
 * If the buffer was not dirty before, the eb's range is marked dirty in every
 * folio backing it and the buffer length is added to
 * fs_info->dirty_metadata_bytes.
 */
void set_extent_buffer_dirty(struct extent_buffer *eb)
{
        bool already_dirty;
        int nr_folios;

        check_buffer_tree_ref(eb);

        already_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
        nr_folios = num_extent_folios(eb);

        WARN_ON(atomic_read(&eb->refs) == 0);
        WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
        WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));

        if (!already_dirty) {
                const bool is_subpage = eb->fs_info->nodesize < PAGE_SIZE;
                int idx = 0;

                /*
                 * In the subpage case other extent buffers can live in the
                 * same page, and clear_subpage_extent_buffer_dirty() clears
                 * the page dirty state without holding a subpage lock.  That
                 * could wipe the dirty state we are about to set.
                 *
                 * clear_subpage_extent_buffer_dirty() does hold the page lock
                 * though, so taking it here closes the race.
                 */
                if (is_subpage)
                        lock_page(folio_page(eb->folios[0], 0));

                while (idx < nr_folios) {
                        btrfs_folio_set_dirty(eb->fs_info, eb->folios[idx],
                                              eb->start, eb->len);
                        idx++;
                }

                if (is_subpage)
                        unlock_page(folio_page(eb->folios[0], 0));

                percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
                                         eb->len,
                                         eb->fs_info->dirty_metadata_batch);
        }
#ifdef CONFIG_BTRFS_DEBUG
        /* Whatever path was taken, every folio has to be dirty by now. */
        for (int idx = 0; idx < nr_folios; idx++)
                ASSERT(folio_test_dirty(eb->folios[idx]));
#endif
}

void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        int num_folios = num_extent_folios(eb);

        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
        for (int i = 0; i < num_folios; i++) {
                struct folio *folio = eb->folios[i];

                if (!folio)
                        continue;

                /*
                 * This is special handling for metadata subpage, as regular
                 * btrfs_is_subpage() can not handle cloned/dummy metadata.
                 */
                if (fs_info->nodesize >= PAGE_SIZE)
                        folio_clear_uptodate(folio);
                else
                        btrfs_subpage_clear_uptodate(fs_info, folio,
                                                     eb->start, eb->len);
        }
}

void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        int num_folios = num_extent_folios(eb);

        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
        for (int i = 0; i < num_folios; i++) {
                struct folio *folio = eb->folios[i];

                /*
                 * This is special handling for metadata subpage, as regular
                 * btrfs_is_subpage() can not handle cloned/dummy metadata.
                 */
                if (fs_info->nodesize >= PAGE_SIZE)
                        folio_mark_uptodate(folio);
                else
                        btrfs_subpage_set_uptodate(fs_info, folio,
                                                   eb->start, eb->len);
        }
}

/*
 * Clear EXTENT_BUFFER_READING on @eb and wake up anybody waiting on that bit
 * (read_extent_buffer_pages() waits for it with wait_on_bit_io()).
 */
static void clear_extent_buffer_reading(struct extent_buffer *eb)
{
        clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
        /* Order the bit clearing before the wake up. */
        smp_mb__after_atomic();
        wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
}

/*
 * End io handler for the metadata read submitted by
 * read_extent_buffer_pages().
 *
 * The extent buffer is taken from bbio->private.  The read counts as
 * successful only if the bio carries no error status and
 * btrfs_validate_extent_buffer() accepts the buffer.  Afterwards the READING
 * bit is cleared, the reference taken at submission time is dropped and the
 * bio is released.
 */
static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
        struct extent_buffer *eb = bbio->private;
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct folio_iter fi;
        u32 bio_offset = 0;
        bool uptodate;

        /*
         * An eb that is already UPTODATE before its read completed would let
         * other read_extent_buffer_pages() callers return early without
         * waiting for this read, causing data races.
         */
        WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags));

        eb->read_mirror = bbio->mirror_num;

        /* Validation is only attempted when the transfer itself succeeded. */
        uptodate = !bbio->bio.bi_status &&
                   btrfs_validate_extent_buffer(eb, &bbio->parent_check) >= 0;

        if (!uptodate) {
                clear_extent_buffer_uptodate(eb);
                set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
        } else {
                set_extent_buffer_uptodate(eb);
        }

        /* Propagate the result to every folio range covered by the bio. */
        bio_for_each_folio_all(fi, &bbio->bio) {
                const u32 len = fi.length;

                if (uptodate)
                        btrfs_folio_set_uptodate(fs_info, fi.folio,
                                                 eb->start + bio_offset, len);
                else
                        btrfs_folio_clear_uptodate(fs_info, fi.folio,
                                                   eb->start + bio_offset, len);

                bio_offset += len;
        }

        clear_extent_buffer_reading(eb);
        free_extent_buffer(eb);

        bio_put(&bbio->bio);
}

/*
 * Read the contents of @eb from disk unless it is already uptodate.
 *
 * @eb:         the extent buffer to read
 * @wait:       with WAIT_COMPLETE, wait for the read (ours or one already in
 *              flight) to finish and report its result
 * @mirror_num: mirror number handed to btrfs_submit_bio()
 * @check:      verification parameters, copied into the bio's parent_check;
 *              it is dereferenced whenever a read is submitted
 *
 * Returns 0 if the buffer is uptodate, or if the read was started (or is
 * already running) and @wait is not WAIT_COMPLETE.  Returns -EIO if the
 * buffer has EXTENT_BUFFER_WRITE_ERR set, or if it is still not uptodate
 * after waiting for the read.
 */
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
                             struct btrfs_tree_parent_check *check)
{
        struct btrfs_bio *bbio;
        bool ret;

        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                return 0;

        /*
         * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
         * operation, which could potentially still be in flight.  In this case
         * we simply want to return an error.
         */
        if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
                return -EIO;

        /* Someone else is already reading the buffer, just wait for it. */
        if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags))
                goto done;

        /*
         * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above
         * test_and_set_bit(EXTENT_BUFFER_READING), someone else could have
         * started and finished reading the same eb.  In this case, UPTODATE
         * will now be set, and we shouldn't read it in again.
         */
        if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) {
                clear_extent_buffer_reading(eb);
                return 0;
        }

        clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
        eb->read_mirror = 0;
        check_buffer_tree_ref(eb);
        /* This reference is dropped by end_bbio_meta_read(). */
        atomic_inc(&eb->refs);

        bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
                               REQ_OP_READ | REQ_META, eb->fs_info,
                               end_bbio_meta_read, eb);
        bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
        bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
        bbio->file_offset = eb->start;
        memcpy(&bbio->parent_check, check, sizeof(*check));
        if (eb->fs_info->nodesize < PAGE_SIZE) {
                /* Subpage: the eb is only a part of its first folio. */
                ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len,
                                    eb->start - folio_pos(eb->folios[0]));
                ASSERT(ret);
        } else {
                int num_folios = num_extent_folios(eb);

                /* Otherwise every folio is added in full. */
                for (int i = 0; i < num_folios; i++) {
                        struct folio *folio = eb->folios[i];

                        ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
                        ASSERT(ret);
                }
        }
        btrfs_submit_bio(bbio, mirror_num);

done:
        if (wait == WAIT_COMPLETE) {
                /* Sleep until EXTENT_BUFFER_READING gets cleared. */
                wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);
                if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                        return -EIO;
        }

        return 0;
}

/*
 * Report an out of range access to @eb.
 *
 * @start and @len describe the offending range as offsets inside the eb.
 * Besides the warning message this triggers a WARN_ON() when
 * CONFIG_BTRFS_DEBUG is enabled.
 *
 * Always returns true, so that check_eb_range() can return the result
 * directly.
 */
static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
                            unsigned long len)
{
        btrfs_warn(eb->fs_info,
                "access to eb bytenr %llu len %u out of range start %lu len %lu",
                eb->start, eb->len, start, len);
        WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));

        return true;
}

/*
 * Check if the [start, start + len) range is valid before reading/writing
 * the eb.
 * NOTE: @start and @len are offset inside the eb, not logical address.
 *
 * Caller should not touch the dst/src memory if this function returns error.
 */
static inline int check_eb_range(const struct extent_buffer *eb,
                                 unsigned long start, unsigned long len)
{
        unsigned long offset;

        /* start, start + len should not go beyond eb->len nor overflow */
        if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
                return report_eb_range(eb, start, len);

        return false;
}

/*
 * Copy @len bytes starting at offset @start of @eb into @dstv.
 *
 * @start is an offset inside the eb.  If the range is not valid for the eb,
 * the destination is zeroed instead so that the caller does not end up with
 * uninitialized memory.
 */
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
                        unsigned long start, unsigned long len)
{
        const int unit_size = eb->folio_size;
        unsigned long folio_idx = get_eb_folio_index(eb, start);
        char *out = dstv;
        size_t in_folio;

        if (check_eb_range(eb, start, len)) {
                memset(dstv, 0, len);
                return;
        }

        /* The eb has a directly usable address: one copy is enough. */
        if (eb->addr) {
                memcpy(dstv, eb->addr + start, len);
                return;
        }

        /* Otherwise copy folio by folio; only the first one starts mid-folio. */
        for (in_folio = get_eb_offset_in_folio(eb, start); len > 0;
             in_folio = 0, folio_idx++) {
                const size_t chunk = min(len, unit_size - in_folio);
                const char *kaddr = folio_address(eb->folios[folio_idx]);

                memcpy(out, kaddr + in_folio, chunk);

                out += chunk;
                len -= chunk;
        }
}

/*
 * Copy @len bytes starting at offset @start of @eb to the user space buffer
 * @dstv, using copy_to_user_nofault().
 *
 * @start and @len are offsets inside the eb, not logical addresses.  A range
 * reaching outside of the eb only triggers a warning here, the copy is still
 * attempted.
 *
 * Returns 0 on success, -EFAULT as soon as a copy fails.
 */
int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
                                       void __user *dstv,
                                       unsigned long start, unsigned long len)
{
        const int unit_size = eb->folio_size;
        size_t cur;
        size_t offset;
        char __user *dst = (char __user *)dstv;
        unsigned long i = get_eb_folio_index(eb, start);
        int ret = 0;

        /*
         * Both values are relative to the beginning of the eb, so the end of
         * the range has to be compared against eb->len, not against the
         * logical end address eb->start + eb->len.
         */
        WARN_ON(start > eb->len);
        WARN_ON(start + len > eb->len);

        /* The eb has a directly usable address: one copy is enough. */
        if (eb->addr) {
                if (copy_to_user_nofault(dstv, eb->addr + start, len))
                        ret = -EFAULT;
                return ret;
        }

        offset = get_eb_offset_in_folio(eb, start);

        /* Copy folio by folio; only the first one can start mid-folio. */
        while (len > 0) {
                char *kaddr;

                cur = min(len, unit_size - offset);
                kaddr = folio_address(eb->folios[i]);
                if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
                        ret = -EFAULT;
                        break;
                }

                dst += cur;
                len -= cur;
                offset = 0;
                i++;
        }

        return ret;
}

/*
 * Compare @len bytes at @ptrv with the bytes at offset @start of @eb.
 *
 * Returns -EINVAL if the range is not valid for the eb, otherwise the result
 * of memcmp() with @ptrv as the first argument: the first non-zero chunk
 * result, or 0 if everything matches.
 */
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
                         unsigned long start, unsigned long len)
{
        const int unit_size = eb->folio_size;
        unsigned long folio_idx = get_eb_folio_index(eb, start);
        const char *mem = ptrv;
        size_t in_folio;

        if (check_eb_range(eb, start, len))
                return -EINVAL;

        /* The eb has a directly usable address: compare in one go. */
        if (eb->addr)
                return memcmp(ptrv, eb->addr + start, len);

        /* Otherwise compare folio by folio and stop at the first difference. */
        for (in_folio = get_eb_offset_in_folio(eb, start); len > 0;
             in_folio = 0, folio_idx++) {
                const size_t chunk = min(len, unit_size - in_folio);
                const char *kaddr = folio_address(eb->folios[folio_idx]);
                const int diff = memcmp(mem, kaddr + in_folio, chunk);

                if (diff)
                        return diff;

                mem += chunk;
                len -= chunk;
        }
        return 0;
}

/*
 * Warn if folio @i of the extent buffer is not uptodate.
 *
 * When nodesize >= PAGE_SIZE the uptodate flag of the folio itself is checked.
 * In the subpage case the uptodate state of the range covered by the eb
 * inside its first folio is checked, and the subpage bitmap is dumped on
 * failure.
 */
static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
{
        struct btrfs_fs_info *fs_info = eb->fs_info;
        struct folio *folio = eb->folios[i];
        struct folio *first_folio;

        ASSERT(folio);

        /*
         * When using the commit root, a page may get its Uptodate cleared
         * while we are still using an extent buffer we looked up earlier.
         * That is nothing to complain about: the page was valid, it just did
         * not get written out.  What we want to catch is a block that was
         * never read properly, i.e. !PageUptodate without
         * EXTENT_BUFFER_WRITE_ERR.
         */
        if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
                return;

        if (fs_info->nodesize >= PAGE_SIZE) {
                WARN_ON(!folio_test_uptodate(folio));
                return;
        }

        /* Subpage: the whole eb lives in its first folio. */
        first_folio = eb->folios[0];
        ASSERT(i == 0);
        if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, first_folio,
                                                 eb->start, eb->len)))
                btrfs_subpage_dump_bitmap(fs_info, first_folio, eb->start, eb->len);
}

/*
 * Copy @len bytes from @srcv into @eb at offset @start.
 *
 * @use_memmove selects memmove() over memcpy() for the copy.  Nothing is
 * written if the range is not valid for the eb.
 */
static void __write_extent_buffer(const struct extent_buffer *eb,
                                  const void *srcv, unsigned long start,
                                  unsigned long len, bool use_memmove)
{
        const int unit_size = eb->folio_size;
        const char *from = srcv;
        unsigned long folio_idx = get_eb_folio_index(eb, start);
        /* Unmapped (dummy) ebs do not get their uptodate status checked. */
        const bool is_unmapped = test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
        size_t in_folio;

        if (check_eb_range(eb, start, len))
                return;

        /* The eb has a directly usable address: one copy is enough. */
        if (eb->addr) {
                void *to = eb->addr + start;

                if (use_memmove)
                        memmove(to, srcv, len);
                else
                        memcpy(to, srcv, len);
                return;
        }

        /* Otherwise copy folio by folio; only the first one starts mid-folio. */
        for (in_folio = get_eb_offset_in_folio(eb, start); len > 0;
             in_folio = 0, folio_idx++) {
                size_t chunk;
                char *kaddr;

                if (!is_unmapped)
                        assert_eb_folio_uptodate(eb, folio_idx);

                chunk = min(len, unit_size - in_folio);
                kaddr = folio_address(eb->folios[folio_idx]);
                if (use_memmove)
                        memmove(kaddr + in_folio, from, chunk);
                else
                        memcpy(kaddr + in_folio, from, chunk);

                from += chunk;
                len -= chunk;
        }
}

/*
 * Copy @len bytes from @srcv into @eb at offset @start (an offset inside the
 * eb), using memcpy() semantics.  See __write_extent_buffer().
 */
void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
                         unsigned long start, unsigned long len)
{
        /* A void function must not return an expression, so call it plainly. */
        __write_extent_buffer(eb, srcv, start, len, false);
}

/*
 * Fill @len bytes of @eb, starting at offset @start, with the byte @c.
 *
 * The range is not validated here; memzero_extent_buffer() checks it before
 * calling.
 */
static void memset_extent_buffer(const struct extent_buffer *eb, int c,
                                 unsigned long start, unsigned long len)
{
        const int unit_size = eb->folio_size;
        unsigned long pos;

        /* The eb has a directly usable address: one memset is enough. */
        if (eb->addr) {
                memset(eb->addr + start, c, len);
                return;
        }

        /* Otherwise fill folio by folio. */
        for (pos = start; pos < start + len; ) {
                const unsigned long folio_idx = get_eb_folio_index(eb, pos);
                const unsigned int in_folio = get_eb_offset_in_folio(eb, pos);
                const unsigned int chunk = min(start + len - pos,
                                               unit_size - in_folio);

                assert_eb_folio_uptodate(eb, folio_idx);
                memset(folio_address(eb->folios[folio_idx]) + in_folio, c, chunk);

                pos += chunk;
        }
}

/*
 * Zero @len bytes of @eb starting at offset @start (an offset inside the eb).
 * Nothing is written if the range is not valid for the eb.
 */
void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
                           unsigned long len)
{
        if (check_eb_range(eb, start, len))
                return;

        /* A void function must not return an expression, so call it plainly. */
        memset_extent_buffer(eb, 0, start, len);
}

/*
 * Copy the whole contents of @src into @dst.
 *
 * Both buffers are expected to have the same length.  The source is walked
 * folio by folio and each piece is stored with write_extent_buffer(), which
 * deals with the layout of @dst.
 */
void copy_extent_buffer_full(const struct extent_buffer *dst,
                             const struct extent_buffer *src)
{
        const int unit_size = src->folio_size;
        unsigned long cur = 0;

        ASSERT(dst->len == src->len);

        while (cur < src->len) {
                unsigned long index = get_eb_folio_index(src, cur);
                unsigned long offset = get_eb_offset_in_folio(src, cur);
                /*
                 * Never copy more than what is left of the buffer, nor more
                 * than what is left of the current source folio.
                 */
                unsigned long cur_len = min(src->len - cur, unit_size - offset);
                void *addr = folio_address(src->folios[index]) + offset;

                write_extent_buffer(dst, addr, cur, cur_len);

                cur += cur_len;
        }
}

/*
 * Copy @len bytes from offset @src_offset of @src to offset @dst_offset of
 * @dst.
 *
 * Both offsets are relative to the start of their extent buffer.  Nothing is
 * copied if the range is not valid for either buffer.  The destination is
 * walked folio by folio and filled with read_extent_buffer().
 */
void copy_extent_buffer(const struct extent_buffer *dst,
                        const struct extent_buffer *src,
                        unsigned long dst_offset, unsigned long src_offset,
                        unsigned long len)
{
        const int unit_size = dst->folio_size;
        unsigned long folio_idx = get_eb_folio_index(dst, dst_offset);
        size_t in_folio;

        if (check_eb_range(dst, dst_offset, len) ||
            check_eb_range(src, src_offset, len))
                return;

        WARN_ON(src->len != dst->len);

        /* Only the first destination folio can be entered mid-folio. */
        for (in_folio = get_eb_offset_in_folio(dst, dst_offset); len > 0;
             in_folio = 0, folio_idx++) {
                size_t chunk;
                char *kaddr;

                assert_eb_folio_uptodate(dst, folio_idx);

                chunk = min(len, (unsigned long)(unit_size - in_folio));

                kaddr = folio_address(dst->folios[folio_idx]);
                read_extent_buffer(src, kaddr + in_folio, src_offset, chunk);

                src_offset += chunk;
                len -= chunk;
        }
}

/*
 * Locate the byte that holds a given bit of a bitmap item.
 *
 * @eb:           the extent buffer
 * @start:        offset of the bitmap item in the extent buffer
 * @nr:           bit number
 * @folio_index:  set to the index of the folio, within the extent buffer,
 *                that contains the bit
 * @folio_offset: set to the offset of the byte inside that folio
 *
 * Keeps the arithmetic needed to find a bit's byte inside an extent buffer in
 * one place.
 */
static inline void eb_bitmap_offset(const struct extent_buffer *eb,
                                    unsigned long start, unsigned long nr,
                                    unsigned long *folio_index,
                                    size_t *folio_offset)
{
        /*
         * Position of the byte: where the extent buffer begins in its folio,
         * plus where the bitmap item begins in the extent buffer, plus the
         * byte of the bitmap that holds the bit.
         */
        const size_t byte_pos = offset_in_eb_folio(eb, eb->start) + start +
                                BIT_BYTE(nr);

        *folio_index = byte_pos >> eb->folio_shift;
        *folio_offset = offset_in_eb_folio(eb, byte_pos);
}

/*
 * Test a single bit of a bitmap item.
 *
 * @eb:     the extent buffer
 * @start:  offset of the bitmap item in the extent buffer
 * @nr:     bit number to test
 *
 * Return 1 if the bit is set and 0 if it is clear.
 */
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
                           unsigned long nr)
{
        unsigned long folio_idx;
        size_t byte_off;
        const u8 *folio_base;

        eb_bitmap_offset(eb, start, nr, &folio_idx, &byte_off);
        assert_eb_folio_uptodate(eb, folio_idx);
        folio_base = folio_address(eb->folios[folio_idx]);

        /* Shift the wanted bit down to position 0 and isolate it. */
        return (folio_base[byte_off] >> (nr % BITS_PER_BYTE)) & 1U;
}

/*
 * Return a pointer to the byte at offset @bytenr of @eb, or NULL when
 * check_eb_range() rejects that one-byte range.
 */
static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr)
{
        const unsigned long folio_idx = get_eb_folio_index(eb, bytenr);
        u8 *folio_base;

        if (check_eb_range(eb, bytenr, 1))
                return NULL;

        folio_base = folio_address(eb->folios[folio_idx]);
        return folio_base + get_eb_offset_in_folio(eb, bytenr);
}

/*
 * Set an area of a bitmap to 1.
 *
 * @eb:     the extent buffer
 * @start:  offset of the bitmap item in the extent buffer
 * @pos:    bit number of the first bit
 * @len:    number of bits to set
 *
 * The first and last bytes of the area are updated through masks; whole bytes
 * in between are filled with memset_extent_buffer().  Both boundary bytes are
 * looked up before anything is written: extent_buffer_get_byte() returns NULL
 * when its range check fails, and in that case the bitmap is left untouched
 * instead of dereferencing the NULL pointer.
 *
 * NOTE(review): @len == 0 is not special-cased (the last byte is derived from
 * pos + len - 1); callers presumably always pass len > 0 -- confirm.
 */
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
                              unsigned long pos, unsigned long len)
{
        unsigned int first_byte = start + BIT_BYTE(pos);
        unsigned int last_byte = start + BIT_BYTE(pos + len - 1);
        const bool same_byte = (first_byte == last_byte);
        u8 mask = BITMAP_FIRST_BYTE_MASK(pos);
        u8 *first;
        u8 *last;

        first = extent_buffer_get_byte(eb, first_byte);
        if (!first)
                return;

        /* The whole area lives in one byte: combine both masks. */
        if (same_byte) {
                mask &= BITMAP_LAST_BYTE_MASK(pos + len);
                *first |= mask;
                return;
        }

        /* Validate the last byte before modifying anything. */
        last = extent_buffer_get_byte(eb, last_byte);
        if (!last)
                return;

        /* Handle the first byte. */
        *first |= mask;

        /* Handle the byte aligned part. */
        ASSERT(first_byte + 1 <= last_byte);
        memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1);

        /* Handle the last byte. */
        *last |= BITMAP_LAST_BYTE_MASK(pos + len);
}


/*
 * Clear an area of a bitmap.
 *
 * @eb:     the extent buffer
 * @start:  offset of the bitmap item in the extent buffer
 * @pos:    bit number of the first bit
 * @len:    number of bits to clear
 *
 * The first and last bytes of the area are updated through masks; whole bytes
 * in between are zeroed with memset_extent_buffer().  Both boundary bytes are
 * looked up before anything is written: extent_buffer_get_byte() returns NULL
 * when its range check fails, and in that case the bitmap is left untouched
 * instead of dereferencing the NULL pointer.
 *
 * NOTE(review): @len == 0 is not special-cased (the last byte is derived from
 * pos + len - 1); callers presumably always pass len > 0 -- confirm.
 */
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
                                unsigned long start, unsigned long pos,
                                unsigned long len)
{
        unsigned int first_byte = start + BIT_BYTE(pos);
        unsigned int last_byte = start + BIT_BYTE(pos + len - 1);
        const bool same_byte = (first_byte == last_byte);
        u8 mask = BITMAP_FIRST_BYTE_MASK(pos);
        u8 *first;
        u8 *last;

        first = extent_buffer_get_byte(eb, first_byte);
        if (!first)
                return;

        /* The whole area lives in one byte: combine both masks. */
        if (same_byte) {
                mask &= BITMAP_LAST_BYTE_MASK(pos + len);
                *first &= ~mask;
                return;
        }

        /* Validate the last byte before modifying anything. */
        last = extent_buffer_get_byte(eb, last_byte);
        if (!last)
                return;

        /* Handle the first byte. */
        *first &= ~mask;

        /* Handle the byte aligned part. */
        ASSERT(first_byte + 1 <= last_byte);
        memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1);

        /* Handle the last byte. */
        *last &= ~BITMAP_LAST_BYTE_MASK(pos + len);
}

/*
 * Tell whether two ranges of @len bytes starting at @src and @dst overlap,
 * i.e. whether their start offsets are less than @len apart.
 */
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
        unsigned long gap;

        if (src > dst)
                gap = src - dst;
        else
                gap = dst - src;

        return gap < len;
}

/*
 * Copy @len bytes inside @dst from @src_offset to @dst_offset.
 *
 * Both ranges are validated with check_eb_range() and nothing is copied if
 * either check fails.  For every copied piece, areas_overlap() decides
 * whether the overlap-safe variant (memmove, or the use_memmove argument of
 * __write_extent_buffer()) is requested.
 */
void memcpy_extent_buffer(const struct extent_buffer *dst,
                          unsigned long dst_offset, unsigned long src_offset,
                          unsigned long len)
{
        const int unit_size = dst->folio_size;
        unsigned long done = 0;

        if (check_eb_range(dst, dst_offset, len))
                return;
        if (check_eb_range(dst, src_offset, len))
                return;

        /* dst->addr, when set, is used as a base pointer for the whole range. */
        if (dst->addr) {
                if (areas_overlap(src_offset, dst_offset, len))
                        memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
                else
                        memcpy(dst->addr + dst_offset, dst->addr + src_offset, len);
                return;
        }

        /* Otherwise walk the source range without crossing a folio boundary. */
        while (done < len) {
                const unsigned long src_pos = done + src_offset;
                const unsigned long dst_pos = dst_offset + done;
                const unsigned long src_folio = get_eb_folio_index(dst, src_pos);
                const unsigned long off_in_folio = get_eb_offset_in_folio(dst, src_pos);
                const unsigned long chunk = min(len - done, unit_size - off_in_folio);
                void *from = folio_address(dst->folios[src_folio]) + off_in_folio;

                __write_extent_buffer(dst, from, dst_pos, chunk,
                                      areas_overlap(src_pos, dst_pos, chunk));
                done += chunk;
        }
}

/*
 * Move @len bytes inside @dst from @src_offset to @dst_offset.
 *
 * Both ranges are validated with check_eb_range() and nothing is moved if
 * either check fails.  A move towards lower offsets is handed to
 * memcpy_extent_buffer(); otherwise the data is moved from the end of the
 * ranges towards their start.
 */
void memmove_extent_buffer(const struct extent_buffer *dst,
                           unsigned long dst_offset, unsigned long src_offset,
                           unsigned long len)
{
        /* Offsets of the last byte still to be moved in each range. */
        unsigned long dst_last = dst_offset + len - 1;
        unsigned long src_last = src_offset + len - 1;

        if (check_eb_range(dst, dst_offset, len))
                return;
        if (check_eb_range(dst, src_offset, len))
                return;

        if (dst_offset < src_offset) {
                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
                return;
        }

        /* dst->addr, when set, is used as a base pointer for the whole range. */
        if (dst->addr) {
                memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
                return;
        }

        while (len > 0) {
                const unsigned long src_folio = get_eb_folio_index(dst, src_last);
                const size_t dst_off = get_eb_offset_in_folio(dst, dst_last);
                const size_t src_off = get_eb_offset_in_folio(dst, src_last);
                unsigned long src_first;
                unsigned long dst_first;
                size_t chunk;
                void *from;

                /*
                 * Going backwards, a piece may reach at most down to the
                 * start of the folio on either the source or the destination
                 * side.
                 */
                chunk = min_t(unsigned long, len, src_off + 1);
                chunk = min(chunk, dst_off + 1);

                src_first = src_last - chunk + 1;
                dst_first = dst_last - chunk + 1;
                from = folio_address(dst->folios[src_folio]) + src_off - chunk + 1;

                __write_extent_buffer(dst, from, dst_first, chunk,
                                      areas_overlap(src_first, dst_first, chunk));

                dst_last -= chunk;
                src_last -= chunk;
                len -= chunk;
        }
}

#define GANG_LOOKUP_SIZE        16
/*
 * Find the first extent buffer in fs_info->buffer_radix that starts at or
 * after @bytenr and before the end of @page.
 *
 * @bytenr must lie inside @page and the caller must hold
 * fs_info->buffer_lock (both are asserted).
 *
 * Return the extent buffer, or NULL if the lookup finds none in that range.
 */
static struct extent_buffer *get_next_extent_buffer(
                struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
        struct extent_buffer *gang[GANG_LOOKUP_SIZE];
        const u64 page_start = page_offset(page);
        const u64 page_end = page_start + PAGE_SIZE;
        u64 cur = page_start;

        ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
        lockdep_assert_held(&fs_info->buffer_lock);

        while (cur < page_end) {
                struct extent_buffer *last;
                int nr_found;
                int i;

                nr_found = radix_tree_gang_lookup(&fs_info->buffer_radix,
                                (void **)gang, cur >> fs_info->sectorsize_bits,
                                min_t(unsigned int, GANG_LOOKUP_SIZE,
                                      PAGE_SIZE / fs_info->nodesize));
                if (nr_found == 0)
                        return NULL;

                for (i = 0; i < nr_found; i++) {
                        struct extent_buffer *eb = gang[i];

                        /* Already beyond page end */
                        if (eb->start >= page_end)
                                return NULL;
                        /* Found one */
                        if (eb->start >= bytenr)
                                return eb;
                }

                /* Continue the lookup right after the last buffer returned. */
                last = gang[nr_found - 1];
                cur = last->start + last->len;
        }

        return NULL;
}

/*
 * Subpage variant of try_release_extent_buffer(): walk the extent buffers
 * that get_next_extent_buffer() finds for @page and try to release each one.
 *
 * The walk stops at the first buffer that cannot be released (its ref count
 * is not 1, it is under IO, or EXTENT_BUFFER_TREE_REF was not set).
 *
 * Return 1 if the folio private flag is clear afterwards, 0 otherwise.
 */
static int try_release_subpage_extent_buffer(struct page *page)
{
        struct btrfs_fs_info *fs_info = page_to_fs_info(page);
        u64 cur = page_offset(page);
        const u64 end = page_offset(page) + PAGE_SIZE;
        int ret;

        while (cur < end) {
                struct extent_buffer *eb = NULL;

                /*
                 * Unlike try_release_extent_buffer() which uses folio private
                 * to grab buffer, for subpage case we rely on radix tree, thus
                 * we need to ensure radix tree consistency.
                 *
                 * We also want an atomic snapshot of the radix tree, thus go
                 * with spinlock rather than RCU.
                 */
                spin_lock(&fs_info->buffer_lock);
                eb = get_next_extent_buffer(fs_info, page, cur);
                if (!eb) {
                        /* No more eb in the page range after or at cur */
                        spin_unlock(&fs_info->buffer_lock);
                        break;
                }
                /* The next iteration searches from the end of this eb. */
                cur = eb->start + eb->len;

                /*
                 * The same as try_release_extent_buffer(), to ensure the eb
                 * won't disappear out from under us.
                 */
                spin_lock(&eb->refs_lock);
                if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
                        spin_unlock(&eb->refs_lock);
                        spin_unlock(&fs_info->buffer_lock);
                        break;
                }
                /* refs_lock stays held; only the radix tree lock is dropped. */
                spin_unlock(&fs_info->buffer_lock);

                /*
                 * If tree ref isn't set then we know the ref on this eb is a
                 * real ref, so just return, this eb will likely be freed soon
                 * anyway.
                 */
                if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
                        spin_unlock(&eb->refs_lock);
                        break;
                }

                /*
                 * Here we don't care about the return value, we will always
                 * check the folio private at the end.  And
                 * release_extent_buffer() will release the refs_lock.
                 */
                release_extent_buffer(eb);
        }
        /*
         * Finally to check if we have cleared folio private, as if we have
         * released all ebs in the page, the folio private should be cleared now.
         */
        spin_lock(&page->mapping->i_private_lock);
        if (!folio_test_private(page_folio(page)))
                ret = 1;
        else
                ret = 0;
        spin_unlock(&page->mapping->i_private_lock);
        return ret;

}

/*
 * Try to release the extent buffer attached to @page.
 *
 * When nodesize is smaller than PAGE_SIZE the work is delegated to
 * try_release_subpage_extent_buffer().  Otherwise the extent buffer is taken
 * from the folio private pointer.
 *
 * Return 1 if the folio has no private data, 0 if the extent buffer cannot be
 * released (its ref count is not 1, it is under IO, or
 * EXTENT_BUFFER_TREE_REF was not set), and otherwise the result of
 * release_extent_buffer().
 */
int try_release_extent_buffer(struct page *page)
{
        struct folio *folio = page_folio(page);
        struct extent_buffer *eb;

        if (page_to_fs_info(page)->nodesize < PAGE_SIZE)
                return try_release_subpage_extent_buffer(page);

        /*
         * We need to make sure nobody is changing folio private, as we rely on
         * folio private as the pointer to extent buffer.
         */
        spin_lock(&page->mapping->i_private_lock);
        if (!folio_test_private(folio)) {
                spin_unlock(&page->mapping->i_private_lock);
                return 1;
        }

        eb = folio_get_private(folio);
        BUG_ON(!eb);

        /*
         * This is a little awful but should be ok, we need to make sure that
         * the eb doesn't disappear out from under us while we're looking at
         * this page.
         */
        spin_lock(&eb->refs_lock);
        if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
                spin_unlock(&eb->refs_lock);
                spin_unlock(&page->mapping->i_private_lock);
                return 0;
        }
        /* refs_lock stays held; only the folio private lock is dropped. */
        spin_unlock(&page->mapping->i_private_lock);

        /*
         * If tree ref isn't set then we know the ref on this eb is a real ref,
         * so just return, this page will likely be freed soon anyway.
         */
        if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
                spin_unlock(&eb->refs_lock);
                return 0;
        }

        /*
         * Called with refs_lock still held.  NOTE(review): the subpage variant
         * documents that release_extent_buffer() drops refs_lock -- confirm.
         */
        return release_extent_buffer(eb);
}

/*
 * Attempt to readahead a child block.
 *
 * @fs_info:    the fs_info
 * @bytenr:     bytenr to read
 * @owner_root: objectid of the root that owns this eb
 * @gen:        generation for the uptodate check, can be 0
 * @level:      level for the eb
 *
 * Attempt to readahead a tree block at @bytenr.  If @gen is 0 then we do a
 * normal uptodate check of the eb, without checking the generation.  If the
 * block has to be read, the read is issued with WAIT_NONE, so we will not
 * block on anything.
 */
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
                                u64 bytenr, u64 owner_root, u64 gen, int level)
{
        struct btrfs_tree_parent_check check = {
                .transid = gen,
                .level = level,
                .has_first_key = 0,
        };
        struct extent_buffer *eb;

        eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
        if (IS_ERR(eb))
                return;

        /* A read is only issued when the buffer is not already uptodate. */
        if (!btrfs_buffer_uptodate(eb, gen, 1) &&
            read_extent_buffer_pages(eb, WAIT_NONE, 0, &check) < 0) {
                /* The read attempt failed: drop our reference as stale. */
                free_extent_buffer_stale(eb);
                return;
        }

        free_extent_buffer(eb);
}

/*
 * Readahead a node's child block.
 *
 * @node:        parent node we're reading from
 * @slot:        slot in the parent node for the child we want to read
 *
 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
 * the slot in the node provided.
 */
void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
{
        btrfs_readahead_tree_block(node->fs_info,
                                   btrfs_node_blockptr(node, slot),
                                   btrfs_header_owner(node),
                                   btrfs_node_ptr_generation(node, slot),
                                   btrfs_header_level(node) - 1);
}
























    3 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/* SPDX-License-Identifier: GPL-2.0-only */

#ifndef LINUX_RESUME_USER_MODE_H
#define LINUX_RESUME_USER_MODE_H

#include <linux/sched.h>
#include <linux/task_work.h>
#include <linux/memcontrol.h>
#include <linux/rseq.h>
#include <linux/blk-cgroup.h>

/**
 * set_notify_resume - cause resume_user_mode_work() to be called
 * @task:                task that will call resume_user_mode_work()
 *
 * Calling this arranges that @task will call resume_user_mode_work()
 * before returning to user mode.  If it's already running in user mode,
 * it will enter the kernel and call resume_user_mode_work() soon.
 * If it's blocked, it will not be woken.
 *
 * The task is only kicked when %TIF_NOTIFY_RESUME was not already set.
 */
static inline void set_notify_resume(struct task_struct *task)
{
        /* Flag was already set: no additional kick is issued. */
        if (test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
                return;

        kick_process(task);
}


/**
 * resume_user_mode_work - Perform work before returning to user mode
 * @regs:                user-mode registers of @current task
 *
 * This is called when %TIF_NOTIFY_RESUME has been set.  Now we are
 * about to return to user mode, and the user state in @regs can be
 * inspected or adjusted.  %TIF_NOTIFY_RESUME is cleared at the start of
 * this function, before any pending work is looked at.  If the flag gets
 * set again asynchronously, this will be called again before we return to
 * user mode.
 *
 * Called without locks.
 */
static inline void resume_user_mode_work(struct pt_regs *regs)
{
        clear_thread_flag(TIF_NOTIFY_RESUME);
        /*
         * This barrier pairs with task_work_add()->set_notify_resume() after
         * hlist_add_head(task->task_works);
         */
        smp_mb__after_atomic();
        if (unlikely(task_work_pending(current)))
                task_work_run();

#ifdef CONFIG_KEYS_REQUEST_CACHE
        /* Drop the cached key, if there is one, and forget the pointer. */
        if (unlikely(current->cached_requested_key)) {
                key_put(current->cached_requested_key);
                current->cached_requested_key = NULL;
        }
#endif

        /*
         * NOTE(review): presumably memcg and blkcg throttling hooks for the
         * task about to return to user mode -- confirm against their
         * definitions.
         */
        mem_cgroup_handle_over_high(GFP_KERNEL);
        blkcg_maybe_throttle_current();

        /* rseq notification; the first argument is passed as NULL here. */
        rseq_handle_notify_resume(NULL, regs);
}

#endif /* LINUX_RESUME_USER_MODE_H */






































































































































   30 

























































































































   16 












   16 



   14 



   15 
















































































































   15 








   14 

   17 
































































































































































































































































































































































































































    3 




    4 


    4 







    4 




   12 





    9 


   11 





   12 





















































































































    2 





    2 


    2 








    2 













    2 


    2 















   27 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   27 




   30 

   30 

   30 














































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel timekeeping code and accessor functions. Based on code from
 *  timer.c, moved in commit 8524070b7982.
 */
#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/clock.h>
#include <linux/syscore_ops.h>
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <linux/tick.h>
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
#include <linux/compiler.h>
#include <linux/audit.h>
#include <linux/random.h>

#include "tick-internal.h"
#include "ntp_internal.h"
#include "timekeeping_internal.h"

#define TK_CLEAR_NTP                (1 << 0)
#define TK_MIRROR                (1 << 1)
#define TK_CLOCK_WAS_SET        (1 << 2)

/* Selects what triggered an advance of the timekeeper. */
enum timekeeping_adv_mode {
        /* Update timekeeper when a tick has passed */
        TK_ADV_TICK,

        /* Update timekeeper on a direct frequency change */
        TK_ADV_FREQ
};

/*
 * Lock associated with tk_core.seq (see the initializer below).
 * timekeeping_update() must be called with this lock held.
 */
DEFINE_RAW_SPINLOCK(timekeeper_lock);

/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 *
 * Readers sample @seq with read_seqcount_begin()/read_seqcount_retry()
 * around their accesses to @timekeeper.
 */
static struct {
        seqcount_raw_spinlock_t        seq;
        struct timekeeper        timekeeper;
} tk_core ____cacheline_aligned = {
        .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
};

/*
 * Copy of tk_core.timekeeper; refreshed by timekeeping_update() when
 * TK_MIRROR is passed.
 */
static struct timekeeper shadow_timekeeper;

/* Flag: non-zero while timekeeping is suspended */
int __read_mostly timekeeping_suspended;

/**
 * struct tk_fast - NMI safe timekeeper
 * @seq:        Sequence counter for protecting updates. The lowest bit
 *                is the index for the tk_read_base array
 * @base:        tk_read_base array. Access is indexed by the lowest bit of
 *                @seq.
 *
 * Two copies of the readout base are kept so that a reader always has one
 * copy that is not currently being rewritten.
 *
 * See update_fast_timekeeper() below.
 */
struct tk_fast {
        seqcount_latch_t        seq;
        struct tk_read_base        base[2];
};

/* Cycle count captured at suspend; served by the halted fast timekeeper. */
static u64 cycles_at_suspend;

/*
 * Read callback of the placeholder clocksource.
 *
 * While timekeeping is suspended the frozen cycles_at_suspend value is
 * reported; otherwise the value of local_clock() is returned. @cs is unused.
 */
static u64 dummy_clock_read(struct clocksource *cs)
{
        return timekeeping_suspended ? cycles_at_suspend : local_clock();
}

/* Placeholder clocksource whose only populated member is the read hook. */
static struct clocksource dummy_clock = {
        .read = dummy_clock_read,
};

/*
 * Boot time initialization which allows local_clock() to be utilized
 * during early boot when clocksources are not available. local_clock()
 * returns nanoseconds already so no conversion is required, hence mult=1
 * and shift=0. When the first proper clocksource is installed then
 * the fast time keepers are updated with the correct values.
 */
#define FAST_TK_INIT                                                \
        {                                                        \
                .clock                = &dummy_clock,                        \
                .mask                = CLOCKSOURCE_MASK(64),                \
                .mult                = 1,                                \
                .shift                = 0,                                \
        }

/*
 * Fast timekeeper read by ktime_get_mono_fast_ns() and the realtime fast
 * accessors. Both latch copies start out on the dummy clock.
 */
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
        .seq     = SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
        .base[0] = FAST_TK_INIT,
        .base[1] = FAST_TK_INIT,
};

/*
 * Fast timekeeper read by ktime_get_raw_fast_ns(). Both latch copies start
 * out on the dummy clock.
 */
static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
        .seq     = SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
        .base[0] = FAST_TK_INIT,
        .base[1] = FAST_TK_INIT,
};

static inline void tk_normalize_xtime(struct timekeeper *tk)
{
        while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
                tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
                tk->xtime_sec++;
        }
        while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
                tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
                tk->raw_sec++;
        }
}

static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
{
        struct timespec64 ts;

        ts.tv_sec = tk->xtime_sec;
        ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        return ts;
}

static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
        tk->xtime_sec = ts->tv_sec;
        tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
}

static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
        tk->xtime_sec += ts->tv_sec;
        tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
        tk_normalize_xtime(tk);
}

static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
{
        struct timespec64 tmp;

        /*
         * Verify consistency of: offset_real = -wall_to_monotonic
         * before modifying anything
         */
        set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
                                        -tk->wall_to_monotonic.tv_nsec);
        WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
        tk->wall_to_monotonic = wtm;
        set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
        tk->offs_real = timespec64_to_ktime(tmp);
        tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
}

/*
 * tk_update_sleep_time - account additional sleep time in the boot offset
 * @tk:    timekeeper to modify
 * @delta: amount to add to offs_boot
 */
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
        ktime_t boot_offset = ktime_add(tk->offs_boot, delta);

        tk->offs_boot = boot_offset;

        /*
         * Keep a timespec copy for the VDSO update so that no 64bit
         * division is needed on every update.
         */
        tk->monotonic_to_boot = ktime_to_timespec64(boot_offset);
}

/*
 * tk_clock_read - atomic clocksource read() helper
 * @tkr: readout base whose clocksource is read
 *
 * Read paths must use this helper. The seqcount only guarantees that a bad
 * value is never returned while the structures are being updated; it does
 * not prevent a crash. The tkr's clocksource can change between fetching
 * the read() pointer and fetching the clock argument handed to it, and
 * calling one clocksource's read() with another clocksource can crash.
 * Loading the pointer once and using it for both avoids that.
 *
 * Not needed while holding timekeeper_lock, nor when reading the
 * fast-timekeeper tkrs (those are protected by their own locking and
 * update logic).
 */
static inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
        struct clocksource *cs = READ_ONCE(tkr->clock);

        return cs->read(cs);
}

#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */

/*
 * timekeeping_check_update - sanity check an update of the timekeeper
 * @tk:     timekeeper being updated
 * @offset: cycle offset about to be accounted
 *
 * Reports, via printk_deferred():
 *  - an @offset larger than the clocksource's max_cycles, or larger than
 *    half of it (50% safety margin);
 *  - underflows/overflows previously flagged in tk->underflow_seen and
 *    tk->overflow_seen. These reports are rate limited to one per
 *    WARNING_FREQ jiffies and the flags are cleared afterwards.
 */
static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
        u64 max_cycles = tk->tkr_mono.clock->max_cycles;
        const char *name = tk->tkr_mono.clock->name;

        /* Cycle counts are u64, so they are printed with %llu, not %lld. */
        if (offset > max_cycles) {
                printk_deferred("WARNING: timekeeping: Cycle offset (%llu) is larger than allowed by the '%s' clock's max_cycles value (%llu): time overflow danger\n",
                                offset, name, max_cycles);
                printk_deferred("         timekeeping: Your kernel is sick, but tries to cope by capping time updates\n");
        } else if (offset > (max_cycles >> 1)) {
                printk_deferred("INFO: timekeeping: Cycle offset (%llu) is larger than the '%s' clock's 50%% safety margin (%llu)\n",
                                offset, name, max_cycles >> 1);
                printk_deferred("      timekeeping: Your kernel is still fine, but is feeling a bit nervous\n");
        }

        if (tk->underflow_seen) {
                if (jiffies - tk->last_warning > WARNING_FREQ) {
                        printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
                        printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
                        printk_deferred("         Your kernel is probably still fine.\n");
                        tk->last_warning = jiffies;
                }
                /* Clear the flag even when the report was rate limited */
                tk->underflow_seen = 0;
        }

        if (tk->overflow_seen) {
                if (jiffies - tk->last_warning > WARNING_FREQ) {
                        printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
                        printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
                        printk_deferred("         Your kernel is probably still fine.\n");
                        tk->last_warning = jiffies;
                }
                /* Clear the flag even when the report was rate limited */
                tk->overflow_seen = 0;
        }
}

/* Defined further down; needed by timekeeping_debug_get_ns() */
static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles);

/*
 * timekeeping_debug_get_ns - checking variant of __timekeeping_get_ns()
 * @tkr: readout base to convert from
 *
 * Reads the clocksource, flags suspected underflows/overflows of the cycle
 * delta in tk_core.timekeeper (underflow_seen / overflow_seen) and returns
 * timekeeping_cycles_to_ns() for the cycle value that was read.
 */
static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 now, last, mask, max, delta;
        unsigned int seq;

        /*
         * Since we're called holding a seqcount, the data may shift
         * under us while we're doing the calculation. This can cause
         * false positives, since we'd note a problem but throw the
         * results away. So nest another seqcount here to atomically
         * grab the points we are checking with.
         */
        do {
                seq = read_seqcount_begin(&tk_core.seq);
                now = tk_clock_read(tkr);
                last = tkr->cycle_last;
                mask = tkr->mask;
                max = tkr->clock->max_cycles;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        delta = clocksource_delta(now, last, mask);

        /*
         * Try to catch underflows by checking if we are seeing small
         * mask-relative negative values.
         */
        if (unlikely((~delta & mask) < (mask >> 3)))
                tk->underflow_seen = 1;

        /* Check for multiplication overflows */
        if (unlikely(delta > max))
                tk->overflow_seen = 1;

        /* timekeeping_cycles_to_ns() handles both under and overflow */
        return timekeeping_cycles_to_ns(tkr, now);
}
#else
/* !CONFIG_DEBUG_TIMEKEEPING: no update sanity checks are performed */
static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{
}
/*
 * !CONFIG_DEBUG_TIMEKEEPING: timekeeping_get_ns() only calls this behind
 * IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING), so reaching it here is a bug.
 */
static inline u64 timekeeping_debug_get_ns(const struct tk_read_base *tkr)
{
        BUG();
}
#endif

/**
 * tk_setup_internals - Set up internals to use clocksource clock.
 *
 * @tk:                The target timekeeper to setup.
 * @clock:                Pointer to clocksource.
 *
 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
 * pair and interval request.
 *
 * Both readout bases (tkr_mono and tkr_raw) are switched to @clock, the
 * accumulated xtime_nsec values are rescaled to the new shift and the NTP
 * error state is reset.
 *
 * Unless you're the timekeeping code, you should not be using this!
 */
static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
        u64 interval;
        u64 tmp, ntpinterval;
        struct clocksource *old_clock;

        ++tk->cs_was_changed_seq;
        /* Keep the previous clocksource: its shift is needed further down */
        old_clock = tk->tkr_mono.clock;
        tk->tkr_mono.clock = clock;
        tk->tkr_mono.mask = clock->mask;
        /* Reads the new clocksource, since tkr_mono.clock was just replaced */
        tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);

        tk->tkr_raw.clock = clock;
        tk->tkr_raw.mask = clock->mask;
        tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;

        /* Do the ns -> cycle conversion first, using original mult */
        tmp = NTP_INTERVAL_LENGTH;
        tmp <<= clock->shift;
        ntpinterval = tmp;
        /* Adding mult/2 before dividing rounds to the nearest cycle */
        tmp += clock->mult/2;
        do_div(tmp, clock->mult);
        /* Never use a zero-cycle interval */
        if (tmp == 0)
                tmp = 1;

        interval = (u64) tmp;
        tk->cycle_interval = interval;

        /* Go back from cycles -> shifted ns */
        tk->xtime_interval = interval * clock->mult;
        tk->xtime_remainder = ntpinterval - tk->xtime_interval;
        tk->raw_interval = interval * clock->mult;

        /* if changing clocks, convert xtime_nsec shift units */
        if (old_clock) {
                int shift_change = clock->shift - old_clock->shift;
                if (shift_change < 0) {
                        tk->tkr_mono.xtime_nsec >>= -shift_change;
                        tk->tkr_raw.xtime_nsec >>= -shift_change;
                } else {
                        tk->tkr_mono.xtime_nsec <<= shift_change;
                        tk->tkr_raw.xtime_nsec <<= shift_change;
                }
        }

        tk->tkr_mono.shift = clock->shift;
        tk->tkr_raw.shift = clock->shift;

        tk->ntp_error = 0;
        tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
        tk->ntp_tick = ntpinterval << tk->ntp_error_shift;

        /*
         * The timekeeper keeps its own mult values for the currently
         * active clocksource. These values will be adjusted via NTP
         * to counteract clock drifting.
         */
        tk->tkr_mono.mult = clock->mult;
        tk->tkr_raw.mult = clock->mult;
        tk->ntp_err_mult = 0;
        tk->skip_second_overflow = 0;
}

/* Timekeeper helper functions. */
/*
 * Out-of-line slow path of timekeeping_cycles_to_ns(), used when @delta
 * exceeds the clocksource's max_cycles. Hands delta, mult, xtime_nsec and
 * shift to mul_u64_u32_add_u64_shr().
 * NOTE(review): presumably that helper uses a wider intermediate so the
 * multiplication cannot overflow - confirm against its definition.
 */
static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
{
        return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
}

/*
 * timekeeping_cycles_to_ns - convert a clocksource readout to nanoseconds
 * @tkr:    readout base providing cycle_last, mask, mult, shift, xtime_nsec
 * @cycles: clocksource value to convert
 *
 * Common case: ((cycles - cycle_last) & mask) * mult + xtime_nsec, shifted
 * right by shift. Deltas above max_cycles take one of the two slow paths
 * below.
 */
static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
        /* Calculate the delta since the last update_wall_time() */
        u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;

        /*
         * This detects both negative motion and the case where the delta
         * overflows the multiplication with tkr->mult.
         */
        if (unlikely(delta > tkr->clock->max_cycles)) {
                /*
                 * Handle clocksource inconsistency between CPUs to prevent
                 * time from going backwards by checking for the MSB of the
                 * mask being set in the delta.
                 */
                if (delta & ~(mask >> 1))
                        return tkr->xtime_nsec >> tkr->shift;

                /* Large forward delta: use the out-of-line conversion */
                return delta_to_ns_safe(tkr, delta);
        }

        return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
}

/*
 * Read the clocksource of @tkr via tk_clock_read() and convert the result
 * with timekeeping_cycles_to_ns().
 */
static __always_inline u64 __timekeeping_get_ns(const struct tk_read_base *tkr)
{
        return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr));
}

/*
 * timekeeping_get_ns - nanoseconds for the current clocksource readout
 * @tkr: readout base to use
 *
 * Uses the checking variant when CONFIG_DEBUG_TIMEKEEPING is enabled and
 * the plain conversion otherwise.
 */
static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
        if (!IS_ENABLED(CONFIG_DEBUG_TIMEKEEPING))
                return __timekeeping_get_ns(tkr);

        return timekeeping_debug_get_ns(tkr);
}

/**
 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
 * @tkr: Timekeeping readout base from which we take the update
 * @tkf: Pointer to NMI safe timekeeper
 *
 * We want to use this from any context including NMI and tracing /
 * instrumenting the timekeeping code itself.
 *
 * Employ the latch technique; see @raw_write_seqcount_latch.
 *
 * So if a NMI hits the update of base[0] then it will use base[1]
 * which is still consistent. In the worst case this can result in a
 * slightly wrong timestamp (a few nanoseconds). See
 * @ktime_get_mono_fast_ns.
 */
static void update_fast_timekeeper(const struct tk_read_base *tkr,
                                   struct tk_fast *tkf)
{
        struct tk_read_base *base = tkf->base;

        /* Force readers off to base[1] */
        raw_write_seqcount_latch(&tkf->seq);

        /* Update base[0] */
        memcpy(base, tkr, sizeof(*base));

        /* Force readers back to base[0] */
        raw_write_seqcount_latch(&tkf->seq);

        /* Update base[1] */
        memcpy(base + 1, base, sizeof(*base));
}

/*
 * __ktime_get_fast_ns - latch based readout of a fast timekeeper
 * @tkf: fast timekeeper to read
 *
 * Picks the copy of the readout base selected by the lowest bit of the
 * latch sequence count, adds its base time to the nanoseconds derived from
 * the clocksource and retries if the sequence count changed meanwhile.
 *
 * Return: the resulting time in nanoseconds.
 */
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
        struct tk_read_base *tkr;
        unsigned int seq;
        u64 now;

        do {
                seq = raw_read_seqcount_latch(&tkf->seq);
                /* Lowest bit of the sequence selects base[0] or base[1] */
                tkr = tkf->base + (seq & 0x01);
                now = ktime_to_ns(tkr->base);
                now += __timekeeping_get_ns(tkr);
        } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));

        return now;
}

/**
 * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
 *
 * This timestamp is not guaranteed to be monotonic across an update.
 * The timestamp is calculated by:
 *
 *        now = base_mono + clock_delta * slope
 *
 * So if the update lowers the slope, readers who are forced to the
 * not yet updated second array are still using the old steeper slope.
 *
 * tmono
 * ^
 * |    o  n
 * |   o n
 * |  u
 * | o
 * |o
 * |12345678---> reader order
 *
 * o = old slope
 * u = update
 * n = new slope
 *
 * So reader 6 will observe time going backwards versus reader 5.
 *
 * While other CPUs are likely to be able to observe that, the only way
 * for a CPU local observation is when an NMI hits in the middle of
 * the update. Timestamps taken from that NMI context might be ahead
 * of the following timestamps. Callers need to be aware of that and
 * deal with it.
 *
 * Return: the tk_fast_mono readout in nanoseconds.
 */
u64 notrace ktime_get_mono_fast_ns(void)
{
        return __ktime_get_fast_ns(&tk_fast_mono);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);

/**
 * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw
 *
 * Contrary to ktime_get_mono_fast_ns() this is always correct because the
 * conversion factor is not affected by NTP/PTP correction.
 *
 * Return: the tk_fast_raw readout in nanoseconds.
 */
u64 notrace ktime_get_raw_fast_ns(void)
{
        return __ktime_get_fast_ns(&tk_fast_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);

/**
 * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
 *
 * To keep it NMI safe since we're accessing from tracing, we're not using a
 * separate timekeeper with updates to monotonic clock and boot offset
 * protected with seqcounts. This has the following minor side effects:
 *
 * (1) It's possible that a timestamp is taken after the boot offset is updated
 * but before the timekeeper is updated. If this happens, the new boot offset
 * is added to the old timekeeping making the clock appear to update slightly
 * earlier:
 *    CPU 0                                        CPU 1
 *    timekeeping_inject_sleeptime64()
 *    __timekeeping_inject_sleeptime(tk, delta);
 *                                                 timestamp();
 *    timekeeping_update(tk, TK_CLEAR_NTP...);
 *
 * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
 * partially updated.  Since the tk->offs_boot update is a rare event, this
 * should be a rare occurrence which postprocessing should be able to handle.
 *
 * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns()
 * apply as well.
 *
 * Return: ktime_get_mono_fast_ns() plus the current boot offset, in
 * nanoseconds.
 */
u64 notrace ktime_get_boot_fast_ns(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        /* offs_boot is read without the seqcount; see (1) and (2) above */
        return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot)));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);

/**
 * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock.
 *
 * The same limitations as described for ktime_get_boot_fast_ns() apply. The
 * mono time and the TAI offset are not read atomically which may yield wrong
 * readouts. However, an update of the TAI offset is a rare event, e.g. caused
 * by settime or adjtimex with an offset. The user of this function has to deal
 * with the possibility of wrong timestamps in post processing.
 *
 * Return: ktime_get_mono_fast_ns() plus tk->offs_tai, in nanoseconds.
 */
u64 notrace ktime_get_tai_fast_ns(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        /* offs_tai is read without the seqcount, as described above */
        return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai)));
}
EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);

/*
 * __ktime_get_real_fast - latch based readout of realtime (and monotonic)
 * @tkf:  fast timekeeper to read
 * @mono: optional; when non-NULL receives the matching monotonic time
 *
 * Both results are derived from the same clocksource delta, taken from the
 * readout base copy selected by the lowest bit of the latch sequence count.
 *
 * Return: base_real plus the clocksource delta, in nanoseconds.
 */
static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
{
        struct tk_read_base *tkr;
        u64 basem, baser, delta;
        unsigned int seq;

        do {
                seq = raw_read_seqcount_latch(&tkf->seq);
                /* Lowest bit of the sequence selects base[0] or base[1] */
                tkr = tkf->base + (seq & 0x01);
                basem = ktime_to_ns(tkr->base);
                baser = ktime_to_ns(tkr->base_real);
                delta = __timekeeping_get_ns(tkr);
        } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));

        if (mono)
                *mono = basem + delta;
        return baser + delta;
}

/**
 * ktime_get_real_fast_ns - NMI safe and fast access to clock realtime.
 *
 * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering.
 *
 * Return: the realtime readout of tk_fast_mono in nanoseconds.
 */
u64 ktime_get_real_fast_ns(void)
{
        return __ktime_get_real_fast(&tk_fast_mono, NULL);
}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);

/**
 * ktime_get_fast_timestamps - NMI safe timestamps
 * @snapshot:        Pointer to timestamp storage
 *
 * Stores clock monotonic, boottime and realtime timestamps.
 *
 * Boot time is a racy access on 32bit systems if the sleep time injection
 * happens late during resume and not in timekeeping_resume(). That could
 * be avoided by expanding struct tk_read_base with boot offset for 32bit
 * and adding more overhead to the update. As this is a hard to observe
 * once per resume event which can be filtered with reasonable effort using
 * the accurate mono/real timestamps, it's probably not worth the trouble.
 *
 * Aside of that it might be possible on 32 and 64 bit to observe the
 * following when the sleep time injection happens late:
 *
 * CPU 0                                CPU 1
 * timekeeping_resume()
 * ktime_get_fast_timestamps()
 *        mono, real = __ktime_get_real_fast()
 *                                        inject_sleep_time()
 *                                           update boot offset
 *        boot = mono + bootoffset;
 *
 * That means that boot time already has the sleep time adjustment, but
 * real time does not. On the next readout both are in sync again.
 *
 * Preventing this for 64bit is not really feasible without destroying the
 * careful cache layout of the timekeeper because the sequence count and
 * struct tk_read_base would then need two cache lines instead of one.
 *
 * Access to the time keeper clock source is disabled across the innermost
 * steps of suspend/resume. The accessors still work, but the timestamps
 * are frozen until time keeping is resumed which happens very early.
 *
 * For regular suspend/resume there is no observable difference vs. sched
 * clock, but it might affect some of the nasty low level debug printks.
 *
 * OTOH, access to sched clock is not guaranteed across suspend/resume on
 * all systems either so it depends on the hardware in use.
 *
 * If that turns out to be a real problem then this could be mitigated by
 * using sched clock in a similar way as during early boot. But it's not as
 * trivial as on early boot because it needs some careful protection
 * against the clock monotonic timestamp jumping backwards on resume.
 */
void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        /* real and mono are derived from the same clocksource readout */
        snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono);
        /* The boot offset is read separately, outside the latch loop */
        snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot));
}

/**
 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
 * @tk: Timekeeper to snapshot.
 *
 * It generally is unsafe to access the clocksource after timekeeping has been
 * suspended, so take a snapshot of the readout base of @tk and use it as the
 * fast timekeeper's readout base while suspended.  It will return the same
 * number of cycles every time until timekeeping is resumed at which time the
 * proper readout base for the fast timekeeper will be restored automatically.
 */
static void halt_fast_timekeeper(const struct timekeeper *tk)
{
        /* Scratch copy, reused for the mono and then the raw readout base */
        static struct tk_read_base tkr_dummy;
        const struct tk_read_base *tkr = &tk->tkr_mono;

        memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
        /* Cycle value that dummy_clock_read() reports while suspended */
        cycles_at_suspend = tk_clock_read(tkr);
        tkr_dummy.clock = &dummy_clock;
        tkr_dummy.base_real = tkr->base + tk->offs_real;
        update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);

        /* Same for the raw fast timekeeper; base_real is not recomputed here */
        tkr = &tk->tkr_raw;
        memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
        tkr_dummy.clock = &dummy_clock;
        update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}

/* Notifier chain of pvclock timedata update listeners */
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);

/*
 * Call every listener on pvclock_gtod_chain. @was_set is passed as the
 * notifier action value and @tk as the notifier data pointer.
 */
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
{
        raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
}

/**
 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
 * @nb: Pointer to the notifier block to register
 *
 * The chain is invoked once right after registration, with "was set"
 * reported as true, while timekeeper_lock is still held.
 *
 * Return: the result of raw_notifier_chain_register().
 */
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
        struct timekeeper *keeper = &tk_core.timekeeper;
        unsigned long irqflags;
        int err;

        raw_spin_lock_irqsave(&timekeeper_lock, irqflags);

        err = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
        update_pvclock_gtod(keeper, true);

        raw_spin_unlock_irqrestore(&timekeeper_lock, irqflags);

        return err;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);

/**
 * pvclock_gtod_unregister_notifier - unregister a pvclock
 * timedata update listener
 * @nb: Pointer to the notifier block to unregister
 *
 * The removal is done with timekeeper_lock held.
 *
 * Return: the result of raw_notifier_chain_unregister().
 */
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
        unsigned long irqflags;
        int err;

        raw_spin_lock_irqsave(&timekeeper_lock, irqflags);
        err = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
        raw_spin_unlock_irqrestore(&timekeeper_lock, irqflags);

        return err;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);

/*
 * tk_update_leap_state - refresh tk->next_leap_ktime from NTP
 *
 * KTIME_MAX is stored unchanged. Any other value returned by
 * ntp_get_next_leap() is converted to monotonic time by subtracting
 * offs_real.
 */
static inline void tk_update_leap_state(struct timekeeper *tk)
{
        tk->next_leap_ktime = ntp_get_next_leap();
        if (tk->next_leap_ktime == KTIME_MAX)
                return;

        /* Convert to monotonic time */
        tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
}

/*
 * Update the ktime_t based scalar nsec members of the timekeeper
 *
 * Recomputes tkr_mono.base, ktime_sec and tkr_raw.base from xtime_sec,
 * wall_to_monotonic, the mono xtime_nsec and raw_sec.
 */
static inline void tk_update_ktime_data(struct timekeeper *tk)
{
        u64 seconds;
        u32 nsec;

        /*
         * The xtime based monotonic readout is:
         *        nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
         * The ktime based monotonic readout is:
         *        nsec = base_mono + now();
         * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
         */
        seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
        nsec = (u32) tk->wall_to_monotonic.tv_nsec;
        tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);

        /*
         * The sum of the nanoseconds portions of xtime and
         * wall_to_monotonic can be greater/equal one second. Take
         * this into account before updating tk->ktime_sec.
         */
        nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        if (nsec >= NSEC_PER_SEC)
                seconds++;
        tk->ktime_sec = seconds;

        /* Update the monotonic raw base */
        tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}

/*
 * timekeeping_update - propagate a modified timekeeper to its consumers
 * @tk:     timekeeper that was modified
 * @action: combination of TK_CLEAR_NTP, TK_MIRROR and TK_CLOCK_WAS_SET
 *
 * Refreshes the derived leap/ktime data, then updates the vsyscall data,
 * the pvclock listeners and both fast timekeepers.
 *
 * must hold timekeeper_lock
 */
static void timekeeping_update(struct timekeeper *tk, unsigned int action)
{
        if (action & TK_CLEAR_NTP) {
                tk->ntp_error = 0;
                ntp_clear();
        }

        tk_update_leap_state(tk);
        tk_update_ktime_data(tk);

        update_vsyscall(tk);
        update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);

        /* base_real must be current before the mono readout base is copied */
        tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
        update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
        update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);

        if (action & TK_CLOCK_WAS_SET)
                tk->clock_was_set_seq++;
        /*
         * The mirroring of the data to the shadow-timekeeper needs
         * to happen last here to ensure we don't over-write the
         * timekeeper structure on the next update with stale data
         */
        if (action & TK_MIRROR)
                memcpy(&shadow_timekeeper, &tk_core.timekeeper,
                       sizeof(tk_core.timekeeper));
}

/**
 * timekeeping_forward_now - update clock to the current time
 * @tk:                Pointer to the timekeeper to update
 *
 * Accounts everything that elapsed on the clocksource since the last call
 * to update_wall_time(), so that callers making significant clock changes
 * do not have to handle that offset themselves.
 */
static void timekeeping_forward_now(struct timekeeper *tk)
{
        u64 now, remaining;

        now = tk_clock_read(&tk->tkr_mono);
        remaining = clocksource_delta(now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
        tk->tkr_mono.cycle_last = now;
        tk->tkr_raw.cycle_last  = now;

        /* Accumulate in chunks of at most max_cycles */
        while (remaining > 0) {
                u64 chunk = tk->tkr_mono.clock->max_cycles;

                if (remaining < chunk)
                        chunk = remaining;

                tk->tkr_mono.xtime_nsec += chunk * tk->tkr_mono.mult;
                tk->tkr_raw.xtime_nsec += chunk * tk->tkr_raw.mult;
                tk_normalize_xtime(tk);

                remaining -= chunk;
        }
}

/**
 * ktime_get_real_ts64 - Returns the time of day in a timespec64.
 * @ts:                pointer to the timespec to be set
 *
 * Takes a consistent snapshot of xtime_sec and of the nanoseconds since the
 * last update under the timekeeper sequence count, then folds the
 * nanoseconds into @ts. WARNs if timekeeping is suspended.
 */
void ktime_get_real_ts64(struct timespec64 *ts)
{
        struct timekeeper *keeper = &tk_core.timekeeper;
        unsigned int seqno;
        u64 ns;

        WARN_ON(timekeeping_suspended);

        do {
                seqno = read_seqcount_begin(&tk_core.seq);

                ts->tv_sec = keeper->xtime_sec;
                ns = timekeeping_get_ns(&keeper->tkr_mono);
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        /* Start from zero nanoseconds and let the helper normalize */
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, ns);
}
EXPORT_SYMBOL(ktime_get_real_ts64);

/**
 * ktime_get - get the monotonic clock in ktime_t format
 *
 * Reads the monotonic base and the nanoseconds since the last update under
 * the timekeeper sequence count. WARNs if timekeeping is suspended.
 *
 * Return: monotonic base plus the nanoseconds since the last update.
 */
ktime_t ktime_get(void)
{
        struct timekeeper *keeper = &tk_core.timekeeper;
        unsigned int seqno;
        ktime_t mono_base;
        u64 ns;

        WARN_ON(timekeeping_suspended);

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                mono_base = keeper->tkr_mono.base;
                ns = timekeeping_get_ns(&keeper->tkr_mono);
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return ktime_add_ns(mono_base, ns);
}
EXPORT_SYMBOL_GPL(ktime_get);

/**
 * ktime_get_resolution_ns - mult >> shift of the monotonic readout base
 *
 * Both values are sampled under the timekeeper sequence count. WARNs if
 * timekeeping is suspended.
 *
 * Return: tkr_mono.mult shifted right by tkr_mono.shift.
 */
u32 ktime_get_resolution_ns(void)
{
        struct timekeeper *keeper = &tk_core.timekeeper;
        unsigned int seqno;
        u32 resolution;

        WARN_ON(timekeeping_suspended);

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                resolution = keeper->tkr_mono.mult >> keeper->tkr_mono.shift;
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return resolution;
}
EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);

/*
 * Maps enum tk_offsets to the matching offset member of tk_core.timekeeper.
 * The pointers are dereferenced by the readers under tk_core.seq.
 */
static ktime_t *offsets[TK_OFFS_MAX] = {
        [TK_OFFS_REAL]        = &tk_core.timekeeper.offs_real,
        [TK_OFFS_BOOT]        = &tk_core.timekeeper.offs_boot,
        [TK_OFFS_TAI]        = &tk_core.timekeeper.offs_tai,
};

ktime_t ktime_get_with_offset(enum tk_offsets offs)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t base, *offset = offsets[offs];
        u64 nsecs;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                base = ktime_add(tk->tkr_mono.base, *offset);
                nsecs = timekeeping_get_ns(&tk->tkr_mono);

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return ktime_add_ns(base, nsecs);

}
EXPORT_SYMBOL_GPL(ktime_get_with_offset);

ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t base, *offset = offsets[offs];
        u64 nsecs;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                base = ktime_add(tk->tkr_mono.base, *offset);
                nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);

/**
 * ktime_mono_to_any() - convert monotonic time to any other time
 * @tmono:        time to convert.
 * @offs:        which offset to use
 *
 * The offset is sampled under the timekeeper sequence count.
 *
 * Return: @tmono plus the offset selected by @offs.
 */
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
        ktime_t *offp = offsets[offs];
        unsigned int seqno;
        ktime_t converted;

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                converted = ktime_add(tmono, *offp);
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return converted;
}
EXPORT_SYMBOL_GPL(ktime_mono_to_any);

/**
 * ktime_get_raw - Returns the raw monotonic time in ktime_t format
 *
 * Same readout scheme as ktime_get(), but based on the raw readout base
 * (tkr_raw).
 *
 * Return: raw base plus the nanoseconds since the last update.
 */
ktime_t ktime_get_raw(void)
{
        struct timekeeper *keeper = &tk_core.timekeeper;
        unsigned int seqno;
        ktime_t raw_base;
        u64 ns;

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                raw_base = keeper->tkr_raw.base;
                ns = timekeeping_get_ns(&keeper->tkr_raw);
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return ktime_add_ns(raw_base, ns);
}
EXPORT_SYMBOL_GPL(ktime_get_raw);

/**
 * ktime_get_ts64 - get the monotonic clock in timespec64 format
 * @ts:                pointer to timespec variable
 *
 * The function calculates the monotonic clock from the realtime
 * clock and the wall_to_monotonic offset and stores the result
 * in normalized timespec64 format in the variable pointed to by @ts.
 */
void ktime_get_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 tomono;
        unsigned int seq;
        u64 nsec;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                ts->tv_sec = tk->xtime_sec;
                nsec = timekeeping_get_ns(&tk->tkr_mono);
                tomono = tk->wall_to_monotonic;

        } while (read_seqcount_retry(&tk_core.seq, seq));

        ts->tv_sec += tomono.tv_sec;
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, nsec + tomono.tv_nsec);
}
EXPORT_SYMBOL_GPL(ktime_get_ts64);

/**
 * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
 *
 * The value is fetched with one plain, non serialized read of
 * tk->ktime_sec. That field is an 'unsigned long', so the single read
 * works on both 32 and 64 bit systems. On 32 bit systems the readout
 * covers ~136 years of uptime, which should be enough to prevent
 * premature wrap arounds.
 *
 * Return: the seconds portion of CLOCK_MONOTONIC.
 */
time64_t ktime_get_seconds(void)
{
        WARN_ON(timekeeping_suspended);

        return tk_core.timekeeper.ktime_sec;
}
EXPORT_SYMBOL_GPL(ktime_get_seconds);

/**
 * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
 *
 * On 64bit systems tk->xtime_sec is returned directly. On 32bit systems
 * the 64bit tk->xtime_sec is read under the sequence counter so that the
 * access is "atomic".
 *
 * Return: the wall clock seconds since 1970.
 */
time64_t ktime_get_real_seconds(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqno;
        time64_t secs;

        /* Fast path: plain read, no sequence counter involved */
        if (IS_ENABLED(CONFIG_64BIT))
                return tk->xtime_sec;

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                secs = tk->xtime_sec;
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return secs;
}
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);

/**
 * __ktime_get_real_seconds - Unprotected variant of ktime_get_real_seconds
 *
 * Reads tk->xtime_sec without the sequence counter protection. This
 * internal function is meant to be called only when the timekeeping lock
 * is already held.
 *
 * Return: the current value of tk->xtime_sec.
 */
noinstr time64_t __ktime_get_real_seconds(void)
{
        return tk_core.timekeeper.xtime_sec;
}

/**
 * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
 * @systime_snapshot:        pointer to struct receiving the system time snapshot
 *
 * Fills in the clocksource id, the change sequence numbers, the counter
 * value that was read and the realtime and raw timestamps computed from
 * that same counter value.
 */
void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        ktime_t real_base, raw_base;
        u64 real_ns, raw_ns;
        u64 cycles_now;
        unsigned int seqno;

        WARN_ON_ONCE(timekeeping_suspended);

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                cycles_now = tk_clock_read(&tk->tkr_mono);
                systime_snapshot->cs_id = tk->tkr_mono.clock->id;
                systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
                systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
                real_base = ktime_add(tk->tkr_mono.base, tk->offs_real);
                raw_base = tk->tkr_raw.base;
                /* Both clocks are converted from the one counter readout */
                real_ns = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles_now);
                raw_ns = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles_now);
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        systime_snapshot->cycles = cycles_now;
        systime_snapshot->real = ktime_add_ns(real_base, real_ns);
        systime_snapshot->raw = ktime_add_ns(raw_base, raw_ns);
}
EXPORT_SYMBOL_GPL(ktime_get_snapshot);

/*
 * Scale *base by mult/div, checking for overflow.
 *
 * *base is split into quotient and remainder of the division by div and
 * each part is scaled separately. Returns -EOVERFLOW, leaving *base
 * untouched, when the bit widths indicate that multiplying either part
 * by mult could exceed 64 bits; returns 0 with *base updated otherwise.
 */
static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
{
        /* Number of bits left in a u64 above the highest set bit of mult */
        const int headroom = (int)sizeof(u64) * 8 - fls64(mult);
        u64 quot, rem;

        quot = div64_u64_rem(*base, div, &rem);

        if (headroom < fls64(quot) || headroom < fls64(rem))
                return -EOVERFLOW;

        *base = quot * mult + div64_u64(rem * mult, div);
        return 0;
}

/**
 * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
 * @history:                        Snapshot representing start of history
 * @partial_history_cycles:        Cycle offset into history (fractional part)
 * @total_history_cycles:        Total history length in cycles
 * @discontinuity:                True indicates clock was set on history period
 * @ts:                                Cross timestamp that should be adjusted using
 *        partial/total ratio
 *
 * Helper for get_device_system_crosststamp(). It corrects the
 * crosstimestamp that corresponds to the start of the current interval
 * back to the system counter value (timestamp point) provided by the
 * driver. The total_history_* quantities describe the history starting at
 * the provided reference point and ending at the start of the current
 * interval; partial_history_cycles is the cycle count between the driver
 * timestamp point and the start of the current interval.
 *
 * Return: 0 on success (including when there is nothing to adjust), or
 * the error returned by scale64_check_overflow().
 */
static int adjust_historical_crosststamp(struct system_time_snapshot *history,
                                         u64 partial_history_cycles,
                                         u64 total_history_cycles,
                                         bool discontinuity,
                                         struct system_device_crosststamp *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 raw_corr, real_corr;
        bool from_start;
        int err;

        /* No history or no offset into it: leave @ts as it is */
        if (!total_history_cycles || !partial_history_cycles)
                return 0;

        /* Interpolate shortest distance from beginning or end of history */
        from_start = partial_history_cycles > total_history_cycles / 2;
        if (from_start)
                partial_history_cycles =
                        total_history_cycles - partial_history_cycles;

        /*
         * Scale the monotonic raw time delta by:
         *        partial_history_cycles / total_history_cycles
         */
        raw_corr = (u64)ktime_to_ns(ktime_sub(ts->sys_monoraw, history->raw));
        err = scale64_check_overflow(partial_history_cycles,
                                     total_history_cycles, &raw_corr);
        if (err)
                return err;

        /*
         * Without a discontinuity the realtime correction is computed the
         * same way as the monotonic raw one. With a discontinuity in the
         * history it is derived from the raw correction instead, scaled by:
         *        mult(real)/mult(raw)
         */
        if (!discontinuity) {
                real_corr = (u64)ktime_to_ns(
                        ktime_sub(ts->sys_realtime, history->real));
                err = scale64_check_overflow(partial_history_cycles,
                                             total_history_cycles, &real_corr);
                if (err)
                        return err;
        } else {
                real_corr = mul_u64_u32_div(raw_corr, tk->tkr_mono.mult,
                                            tk->tkr_raw.mult);
        }

        /* Fixup monotonic raw and real time values */
        if (from_start) {
                ts->sys_monoraw = ktime_add_ns(history->raw, raw_corr);
                ts->sys_realtime = ktime_add_ns(history->real, real_corr);
        } else {
                ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, raw_corr);
                ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, real_corr);
        }

        return 0;
}

/*
 * timestamp_in_interval - true if ts is chronologically in [start, end]
 *
 * True if ts occurs chronologically at or after start, and before or at
 * end. An interval with start > end is treated as wrapping around the
 * top of the u64 range.
 */
static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
{
        /* Wrapped interval: ts lies in [start, max] or in [0, end] */
        if (start > end)
                return ts >= start || ts <= end;

        return ts >= start && ts <= end;
}

/**
 * get_device_system_crosststamp - Synchronously capture system/device timestamp
 * @get_time_fn:        Callback to get simultaneous device time and
 *        system counter from the device driver
 * @ctx:                Context passed to get_time_fn()
 * @history_begin:        Historical reference point used to interpolate system
 *        time when counter provided by the driver is before the current interval
 * @xtstamp:                Receives simultaneously captured system and device time
 *
 * Reads a timestamp from a device and correlates it to system time
 *
 * Return: 0 on success; the non-zero value returned by @get_time_fn;
 * -ENODEV if the captured counter's clocksource ID is CSID_GENERIC or
 * differs from the current timekeeper clocksource; -EINVAL if
 * interpolation is required but @history_begin is NULL, the counter value
 * lies outside the history, or the clocksource changed during the history;
 * or the error returned by adjust_historical_crosststamp().
 */
int get_device_system_crosststamp(int (*get_time_fn)
                                  (ktime_t *device_time,
                                   struct system_counterval_t *sys_counterval,
                                   void *ctx),
                                  void *ctx,
                                  struct system_time_snapshot *history_begin,
                                  struct system_device_crosststamp *xtstamp)
{
        struct system_counterval_t system_counterval;
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 cycles, now, interval_start;
        unsigned int clock_was_set_seq = 0;
        ktime_t base_real, base_raw;
        u64 nsec_real, nsec_raw;
        /* Only assigned, and only read, when do_interp is true */
        u8 cs_was_changed_seq;
        unsigned int seq;
        bool do_interp;
        int ret;

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                /*
                 * Try to synchronously capture device time and a system
                 * counter value calling back into the device driver
                 */
                ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
                if (ret)
                        return ret;

                /*
                 * Verify that the clocksource ID associated with the captured
                 * system counter value is the same as for the currently
                 * installed timekeeper clocksource
                 */
                if (system_counterval.cs_id == CSID_GENERIC ||
                    tk->tkr_mono.clock->id != system_counterval.cs_id)
                        return -ENODEV;
                cycles = system_counterval.cycles;

                /*
                 * Check whether the system counter value provided by the
                 * device driver is on the current timekeeping interval.
                 */
                now = tk_clock_read(&tk->tkr_mono);
                interval_start = tk->tkr_mono.cycle_last;
                if (!timestamp_in_interval(interval_start, now, cycles)) {
                        /*
                         * Outside the current interval: compute the times
                         * for the interval start instead and remember the
                         * sequence numbers for the history checks below.
                         */
                        clock_was_set_seq = tk->clock_was_set_seq;
                        cs_was_changed_seq = tk->cs_was_changed_seq;
                        cycles = interval_start;
                        do_interp = true;
                } else {
                        do_interp = false;
                }

                base_real = ktime_add(tk->tkr_mono.base,
                                      tk_core.timekeeper.offs_real);
                base_raw = tk->tkr_raw.base;

                nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
                nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
        } while (read_seqcount_retry(&tk_core.seq, seq));

        xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
        xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);

        /*
         * Interpolate if necessary, adjusting back from the start of the
         * current interval
         */
        if (do_interp) {
                u64 partial_history_cycles, total_history_cycles;
                bool discontinuity;

                /*
                 * Check that the counter value is not before the provided
                 * history reference and that the history doesn't cross a
                 * clocksource change
                 */
                if (!history_begin ||
                    !timestamp_in_interval(history_begin->cycles,
                                           cycles, system_counterval.cycles) ||
                    history_begin->cs_was_changed_seq != cs_was_changed_seq)
                        return -EINVAL;
                /* Here cycles holds interval_start, the end of the history */
                partial_history_cycles = cycles - system_counterval.cycles;
                total_history_cycles = cycles - history_begin->cycles;
                /* A changed clock_was_set_seq marks a discontinuity */
                discontinuity =
                        history_begin->clock_was_set_seq != clock_was_set_seq;

                ret = adjust_historical_crosststamp(history_begin,
                                                    partial_history_cycles,
                                                    total_history_cycles,
                                                    discontinuity, xtstamp);
                if (ret)
                        return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(get_device_system_crosststamp);

/**
 * do_settimeofday64 - Sets the time of day.
 * @ts:     pointer to the timespec64 variable containing the new time
 *
 * Sets the time of day to the new time and update NTP and notify hrtimers
 */
int do_settimeofday64(const struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 ts_delta, xt;
        unsigned long flags;
        int ret = 0;

        if (!timespec64_valid_settod(ts))
                return -EINVAL;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        timekeeping_forward_now(tk);

        xt = tk_xtime(tk);
        ts_delta = timespec64_sub(*ts, xt);

        if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
                ret = -EINVAL;
                goto out;
        }

        tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));

        tk_set_xtime(tk, ts);
out:
        timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        /* Signal hrtimers about time change */
        clock_was_set(CLOCK_SET_WALL);

        if (!ret) {
                audit_tk_injoffset(ts_delta);
                add_device_randomness(ts, sizeof(*ts));
        }

        return ret;
}
EXPORT_SYMBOL(do_settimeofday64);

/**
 * timekeeping_inject_offset - Adds or subtracts from the current time.
 * @ts:                Pointer to the timespec variable containing the offset
 *
 * Adds or subtracts an offset value from the current time.
 */
static int timekeeping_inject_offset(const struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned long flags;
        struct timespec64 tmp;
        int ret = 0;

        if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        timekeeping_forward_now(tk);

        /* Make sure the proposed value is valid */
        tmp = timespec64_add(tk_xtime(tk), *ts);
        if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
            !timespec64_valid_settod(&tmp)) {
                ret = -EINVAL;
                goto error;
        }

        tk_xtime_add(tk, ts);
        tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts));

error: /* even if we error out, we forwarded the time, so call update */
        timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        /* Signal hrtimers about time change */
        clock_was_set(CLOCK_SET_WALL);

        return ret;
}

/*
 * Indicates if there is an offset between the system clock and the hardware
 * clock/persistent clock/rtc.
 *
 * Set to 1 by timekeeping_warp_clock() when sys_tz.tz_minuteswest is
 * non-zero.
 */
int persistent_clock_is_local;

/*
 * Adjust the time obtained from the CMOS to be UTC time instead of
 * local time.
 *
 * When sys_tz.tz_minuteswest is non-zero, the persistent clock is marked
 * as local and an offset of tz_minuteswest minutes is injected into the
 * current time. Nothing happens otherwise.
 *
 * This is ugly, but preferable to the alternatives.  Otherwise we
 * would either need to write a program to do it in /etc/rc (and risk
 * confusion if the program gets run more than once; it would also be
 * hard to make the program warp the clock precisely n hours)  or
 * compile in the timezone information into the kernel.  Bad, bad....
 *
 *                                                - TYT, 1992-01-01
 *
 * The best thing to do is to keep the CMOS clock in universal time (UTC)
 * as real UNIX machines always do it. This avoids all headaches about
 * daylight saving times and warping kernel clocks.
 */
void timekeeping_warp_clock(void)
{
        struct timespec64 adjust;

        if (sys_tz.tz_minuteswest == 0)
                return;

        persistent_clock_is_local = 1;

        /* Whole-second offset: minutes west converted to seconds */
        adjust.tv_nsec = 0;
        adjust.tv_sec = sys_tz.tz_minuteswest * 60;
        timekeeping_inject_offset(&adjust);
}

/*
 * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
 *
 * Stores @tai_offset (in seconds) in @tk and recomputes tk->offs_tai as
 * tk->offs_real plus that offset.
 */
static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
{
        ktime_t tai_delta = ktime_set(tai_offset, 0);

        tk->tai_offset = tai_offset;
        tk->offs_tai = ktime_add(tk->offs_real, tai_delta);
}

/*
 * change_clocksource - Swaps clocksources if a new one is available
 *
 * @data is the candidate clocksource. The current time interval is always
 * accumulated; the timekeeper is switched over only if a module reference
 * on the candidate could be taken and its enable() callback, if any,
 * succeeded. The previous clocksource is then disabled and its module
 * reference dropped. Always returns 0.
 */
static int change_clocksource(void *data)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct clocksource *new = data;
        struct clocksource *old = NULL;
        unsigned long irq_flags;
        bool change = false;

        /*
         * If the cs is in module, get a module reference. Succeeds
         * for built-in code (owner == NULL) as well.
         */
        if (try_module_get(new->owner)) {
                if (new->enable && new->enable(new) != 0)
                        module_put(new->owner);
                else
                        change = true;
        }

        raw_spin_lock_irqsave(&timekeeper_lock, irq_flags);
        write_seqcount_begin(&tk_core.seq);

        timekeeping_forward_now(tk);

        if (change) {
                old = tk->tkr_mono.clock;
                tk_setup_internals(tk, new);
        }

        timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, irq_flags);

        /* No switch happened, so there is nothing to release */
        if (!old)
                return 0;

        if (old->disable)
                old->disable(old);
        module_put(old->owner);

        return 0;
}

/**
 * timekeeping_notify - Install a new clock source
 * @clock:                pointer to the clock source
 *
 * This function is called from clocksource.c after a new, better clock
 * source has been registered. The caller holds the clocksource_mutex.
 *
 * Return: 0 if @clock is the timekeeper clocksource on return (it either
 * already was, or the switch succeeded), -1 otherwise.
 */
int timekeeping_notify(struct clocksource *clock)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        /* Already installed: nothing to do */
        if (tk->tkr_mono.clock == clock)
                return 0;

        stop_machine(change_clocksource, clock, NULL);
        tick_clock_notify();

        if (tk->tkr_mono.clock != clock)
                return -1;

        return 0;
}

/**
 * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
 * @ts:                pointer to the timespec64 to be set
 *
 * Stores the raw monotonic time (completely un-modified by ntp) in @ts.
 */
void ktime_get_raw_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqno;
        u64 raw_ns;

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                ts->tv_sec = tk->raw_sec;
                raw_ns = timekeeping_get_ns(&tk->tkr_raw);
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        /* Let timespec64_add_ns() carry the nanoseconds into tv_sec */
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, raw_ns);
}
EXPORT_SYMBOL(ktime_get_raw_ts64);


/**
 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
 *
 * Return: non-zero if the current clocksource has the
 * CLOCK_SOURCE_VALID_FOR_HRES flag set, 0 otherwise.
 */
int timekeeping_valid_for_hres(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqno;
        int hres_flag;

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                hres_flag = tk->tkr_mono.clock->flags &
                            CLOCK_SOURCE_VALID_FOR_HRES;
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return hres_flag;
}

/**
 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
 *
 * Return: the max_idle_ns value of the current clocksource.
 */
u64 timekeeping_max_deferment(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqno;
        u64 max_idle;

        do {
                seqno = read_seqcount_begin(&tk_core.seq);
                max_idle = tk->tkr_mono.clock->max_idle_ns;
        } while (read_seqcount_retry(&tk_core.seq, seqno));

        return max_idle;
}

/**
 * read_persistent_clock64 -  Return time from the persistent clock.
 * @ts: Pointer to the storage for the readout value
 *
 * Weak dummy function for arches that do not yet support it. A real
 * implementation reads the time from the battery backed persistent clock;
 * this default reports tv_sec=0 and tv_nsec=0 to signal "unsupported".
 *
 *  XXX - Do be sure to remove it once all arches implement it.
 */
void __weak read_persistent_clock64(struct timespec64 *ts)
{
        ts->tv_nsec = 0;
        ts->tv_sec = 0;
}

/**
 * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
 *                                        from the boot.
 * @wall_time:          current time as returned by persistent clock
 * @boot_offset:  offset that is defined as wall_time - boot_time
 *
 * Weak dummy function for arches that do not yet support it.
 *
 * This default derives the offset from the current value of local_clock(),
 * so architectures that support sched_clock() but lack a dedicated boot
 * time clock still provide the best estimate of the boot time.
 */
void __weak __init
read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
                                     struct timespec64 *boot_offset)
{
        u64 local_ns;

        read_persistent_clock64(wall_time);

        local_ns = local_clock();
        *boot_offset = ns_to_timespec64(local_ns);
}

/*
 * Flag reflecting whether timekeeping_resume() has injected sleeptime.
 *
 * The flag starts off false and is only set when a suspend reaches
 * timekeeping_suspend(). timekeeping_resume() sets it to false when the
 * timekeeper clocksource is not stopping across suspend and has been
 * used to update sleep time. If the timekeeper clocksource has stopped
 * then the flag stays true and is used by the RTC resume code to decide
 * whether sleeptime must be injected; if so, the flag is cleared at that
 * point.
 *
 * If a suspend fails before reaching timekeeping_resume() then the flag
 * stays false and prevents erroneous sleeptime injection.
 */
static bool suspend_timing_needed;

/*
 * Flag indicating whether this platform has a persistent clock. Set in
 * timekeeping_init() when the persistent clock returns a valid, positive
 * wall time, and in timekeeping_suspend() when it returns a non-zero value.
 */
static bool persistent_clock_exists;

/*
 * timekeeping_init - Initializes the clocksource and common timekeeping values
 *
 * Reads the persistent clock and the boot offset, sanitizes both, and
 * then, under the timekeeper lock, initializes NTP, installs the default
 * clocksource and sets up xtime, raw_sec and wall_to_monotonic.
 */
void __init timekeeping_init(void)
{
        struct timespec64 wall_time, boot_offset, wall_to_mono;
        struct timekeeper *tk = &tk_core.timekeeper;
        struct clocksource *clock;
        unsigned long flags;

        read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
        if (timespec64_valid_settod(&wall_time) &&
            timespec64_to_ns(&wall_time) > 0) {
                persistent_clock_exists = true;
        } else if (timespec64_to_ns(&wall_time) != 0) {
                /*
                 * A zero readout means "no persistent clock" and is not
                 * reported; anything else that fails validation is.
                 */
                pr_warn("Persistent clock returned invalid value\n");
                wall_time = (struct timespec64){0};
        }

        /* A boot offset larger than the wall time cannot be used */
        if (timespec64_compare(&wall_time, &boot_offset) < 0)
                boot_offset = (struct timespec64){0};

        /*
         * We want set wall_to_mono, so the following is true:
         * wall time + wall_to_mono = boot time
         */
        wall_to_mono = timespec64_sub(boot_offset, wall_time);

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);
        ntp_init();

        clock = clocksource_default_clock();
        if (clock->enable)
                clock->enable(clock);
        tk_setup_internals(tk, clock);

        tk_set_xtime(tk, &wall_time);
        tk->raw_sec = 0;

        tk_set_wall_to_mono(tk, wall_to_mono);

        timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}

/*
 * Persistent clock readout taken when suspend began. Filled in by
 * timekeeping_suspend() and compared against the readout taken in
 * timekeeping_resume().
 */
static struct timespec64 timekeeping_suspend_time;

/**
 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
 * @tk:                Pointer to the timekeeper to be updated
 * @delta:        Pointer to the delta value in timespec64 format
 *
 * Takes a timespec offset measuring a suspend interval and adds the sleep
 * offset to the timekeeping variables. An invalid @delta is rejected with
 * a warning and leaves @tk unchanged.
 */
static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
                                           const struct timespec64 *delta)
{
        if (timespec64_valid_strict(delta)) {
                tk_xtime_add(tk, delta);
                tk_set_wall_to_mono(tk,
                                    timespec64_sub(tk->wall_to_monotonic,
                                                   *delta));
                tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
                tk_debug_account_sleep_time(delta);
                return;
        }

        printk_deferred(KERN_WARNING
                        "__timekeeping_inject_sleeptime: Invalid "
                        "sleep delta value!\n");
}

#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
/*
 * We have three kinds of time sources to use for sleep time
 * injection, the preference order is:
 * 1) non-stop clocksource
 * 2) persistent clock (ie: RTC accessible when irqs are off)
 * 3) RTC
 *
 * 1) and 2) are used by timekeeping, 3) by RTC subsystem.
 * If system has neither 1) nor 2), 3) will be used finally.
 *
 *
 * If timekeeping has injected sleeptime via either 1) or 2),
 * 3) becomes needless, so in this case we don't need to call
 * rtc_resume(), and this is what timekeeping_rtc_skipresume()
 * means.
 */
bool timekeeping_rtc_skipresume(void)
{
        return !suspend_timing_needed;
}

/*
 * Whether 1) (the non-stop clocksource) can be used is only known in
 * timekeeping_resume(), which is invoked after rtc_suspend(), so
 * rtc_suspend() cannot be skipped just because the system has 1).
 *
 * But if the system has 2) (a persistent clock), 2) will definitely be
 * used, so in this case rtc_suspend() does not need to be called. That
 * is what timekeeping_rtc_skipsuspend() reports: it returns the
 * persistent_clock_exists flag.
 */
bool timekeeping_rtc_skipsuspend(void)
{
        return persistent_clock_exists;
}

/**
 * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values
 * @delta: pointer to a timespec64 delta value
 *
 * This hook is for architectures that cannot support
 * read_persistent_clock64 because their RTC/persistent clock is only
 * accessible when irqs are enabled, and that also don't have an effective
 * nonstop clocksource.
 *
 * This function should only be called by rtc_resume(), and allows
 * a suspend offset to be injected into the timekeeping values.
 */
void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned long irq_flags;

        raw_spin_lock_irqsave(&timekeeper_lock, irq_flags);
        write_seqcount_begin(&tk_core.seq);

        /* Sleep time is being injected now; no further source is needed */
        suspend_timing_needed = false;

        timekeeping_forward_now(tk);
        __timekeeping_inject_sleeptime(tk, delta);
        timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, irq_flags);

        /* Signal hrtimers about time change */
        clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT);
}
#endif

/**
 * timekeeping_resume - Resumes the generic timekeeping subsystem.
 */
void timekeeping_resume(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct clocksource *clock = tk->tkr_mono.clock;
        unsigned long flags;
        struct timespec64 ts_new, ts_delta;
        u64 cycle_now, nsec;
        bool inject_sleeptime = false;

        read_persistent_clock64(&ts_new);

        clockevents_resume();
        clocksource_resume();

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        /*
         * After system resumes, we need to calculate the suspended time and
         * compensate it for the OS time. There are 3 sources that could be
         * used: Nonstop clocksource during suspend, persistent clock and rtc
         * device.
         *
         * One specific platform may have 1 or 2 or all of them, and the
         * preference will be:
         *        suspend-nonstop clocksource -> persistent clock -> rtc
         * The less preferred source will only be tried if there is no better
         * usable source. The rtc part is handled separately in rtc core code.
         */
        cycle_now = tk_clock_read(&tk->tkr_mono);
        nsec = clocksource_stop_suspend_timing(clock, cycle_now);
        if (nsec > 0) {
                ts_delta = ns_to_timespec64(nsec);
                inject_sleeptime = true;
        } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
                ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
                inject_sleeptime = true;
        }

        if (inject_sleeptime) {
                suspend_timing_needed = false;
                __timekeeping_inject_sleeptime(tk, &ts_delta);
        }

        /* Re-base the last cycle value */
        tk->tkr_mono.cycle_last = cycle_now;
        tk->tkr_raw.cycle_last  = cycle_now;

        tk->ntp_error = 0;
        timekeeping_suspended = 0;
        timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        touch_softlockup_watchdog();

        /* Resume the clockevent device(s) and hrtimers */
        tick_resume();
        /* Notify timerfd as resume is equivalent to clock_was_set() */
        timerfd_resume();
}

/*
 * timekeeping_suspend - Suspends the generic timekeeping subsystem.
 *
 * Records the persistent clock at suspend time, forwards and halts the
 * timekeeper, starts suspend timing on the current clocksource and then
 * suspends tick, clocksources and clockevents. Always returns 0.
 */
int timekeeping_suspend(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned long flags;
        struct timespec64                delta, delta_delta;
        /* Difference between system time and persistent clock, kept across suspends */
        static struct timespec64        old_delta;
        struct clocksource *curr_clock;
        u64 cycle_now;

        read_persistent_clock64(&timekeeping_suspend_time);

        /*
         * On some systems the persistent_clock can not be detected at
         * timekeeping_init by its return value, so if we see a valid
         * value returned, update the persistent_clock_exists flag.
         */
        if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
                persistent_clock_exists = true;

        /* Cleared again once some source has injected the sleep time */
        suspend_timing_needed = true;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);
        timekeeping_forward_now(tk);
        timekeeping_suspended = 1;

        /*
         * Since we've called forward_now, cycle_last stores the value
         * just read from the current clocksource. Save this to potentially
         * use in suspend timing.
         */
        curr_clock = tk->tkr_mono.clock;
        cycle_now = tk->tkr_mono.cycle_last;
        clocksource_start_suspend_timing(curr_clock, cycle_now);

        if (persistent_clock_exists) {
                /*
                 * To avoid drift caused by repeated suspend/resumes,
                 * which each can add ~1 second drift error,
                 * try to compensate so the difference in system time
                 * and persistent_clock time stays close to constant.
                 */
                delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time);
                delta_delta = timespec64_sub(delta, old_delta);
                if (abs(delta_delta.tv_sec) >= 2) {
                        /*
                         * if delta_delta is too large, assume time correction
                         * has occurred and set old_delta to the current delta.
                         */
                        old_delta = delta;
                } else {
                        /*
                         * Otherwise fold the small difference into
                         * timekeeping_suspend_time to compensate
                         */
                        timekeeping_suspend_time =
                                timespec64_add(timekeeping_suspend_time, delta_delta);
                }
        }

        timekeeping_update(tk, TK_MIRROR);
        halt_fast_timekeeper(tk);
        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        tick_suspend();
        clocksource_suspend();
        clockevents_suspend();

        return 0;
}

/* syscore resume/suspend callbacks for timekeeping */
static struct syscore_ops timekeeping_syscore_ops = {
        .resume                = timekeeping_resume,
        .suspend        = timekeeping_suspend,
};

/*
 * Register the timekeeping suspend/resume callbacks with the syscore
 * framework. Hooked up as a device_initcall; always returns 0.
 */
static int __init timekeeping_init_ops(void)
{
        register_syscore_ops(&timekeeping_syscore_ops);
        return 0;
}
device_initcall(timekeeping_init_ops);

/*
 * Apply a multiplier adjustment to the timekeeper
 *
 * @tk:       timekeeper whose tkr_mono multiplier is changed
 * @offset:   the non-accumulated cycles at the time of the adjustment
 * @mult_adj: signed amount added to tk->tkr_mono.mult; 0 is a no-op
 *
 * The math is easiest to follow for mult_adj == 1. For any other value
 * the interval and offset are scaled by mult_adj up front, after which
 * exactly the same reasoning applies.
 *
 * Bumping the multiplier by one grows xtime_interval by one
 * cycle_interval, because:
 *        xtime_interval = cycle_interval * mult
 * and therefore with mult incremented:
 *        xtime_interval = cycle_interval * (mult + 1)
 *                       = (cycle_interval * mult) + cycle_interval
 * i.e.:
 *        xtime_interval += cycle_interval
 *
 * offset holds the non-accumulated cycles, so the current time (in
 * shifted nanoseconds) is:
 *        now = (offset * adj) + xtime_nsec
 * Changing the frequency must not make time jump backwards, and it
 * should not jump forwards either. For one and the same offset the time
 * has to be identical before and after the adjustment:
 *        (offset * adj_1) + xtime_nsec_1 = (offset * adj_2) + xtime_nsec_2
 * With adj_2 = adj_1 + 1:
 *        (offset * adj_1) + xtime_nsec_1 =
 *                (offset * adj_1) + offset + xtime_nsec_2
 * Cancelling the common term gives:
 *        xtime_nsec_2 = xtime_nsec_1 - offset
 * i.e.:
 *        xtime_nsec -= offset
 */
static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
                                                         s64 offset,
                                                         s32 mult_adj)
{
        s64 interval_delta = tk->cycle_interval;

        switch (mult_adj) {
        case 0:
                /* Nothing to adjust */
                return;
        case 1:
                /* interval and offset are already scaled correctly */
                break;
        case -1:
                interval_delta = -interval_delta;
                offset = -offset;
                break;
        default:
                interval_delta *= mult_adj;
                offset *= mult_adj;
                break;
        }

        if (mult_adj > 0 && tk->tkr_mono.mult + mult_adj < mult_adj) {
                /* NTP adjustment caused clocksource mult overflow */
                WARN_ON_ONCE(1);
                return;
        }

        tk->tkr_mono.mult += mult_adj;
        tk->xtime_interval += interval_delta;
        tk->tkr_mono.xtime_nsec -= offset;
}

/*
 * Adjust the timekeeper's multiplier to the correct frequency
 * and also to reduce the accumulated error value.
 *
 * @tk:     timekeeper to adjust
 * @offset: handed on unchanged to timekeeping_apply_adjustment();
 *          timekeeping_advance() passes the cycles left over after
 *          accumulation here
 */
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
        u32 mult;

        /*
         * Determine the multiplier from the current NTP tick length.
         * Avoid expensive division when the tick length doesn't change.
         */
        if (likely(tk->ntp_tick == ntp_tick_length())) {
                /* Strip the +1 error correction applied on the last pass */
                mult = tk->tkr_mono.mult - tk->ntp_err_mult;
        } else {
                tk->ntp_tick = ntp_tick_length();
                mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
                                 tk->xtime_remainder, tk->cycle_interval);
        }

        /*
         * If the clock is behind the NTP time, increase the multiplier by 1
         * to catch up with it. If it's ahead and there was a remainder in the
         * tick division, the clock will slow down. Otherwise it will stay
         * ahead until the tick length changes to a non-divisible value.
         */
        tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0;
        mult += tk->ntp_err_mult;

        /* Only the difference to the currently used multiplier is applied */
        timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult);

        /* Warn once if the multiplier drifted further than clock->maxadj */
        if (unlikely(tk->tkr_mono.clock->maxadj &&
                (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
                        > tk->tkr_mono.clock->maxadj))) {
                printk_once(KERN_WARNING
                        "Adjusting %s more than 11%% (%ld vs %ld)\n",
                        tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
                        (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
        }

        /*
         * It may be possible that when we entered this function, xtime_nsec
         * was very small.  Further, if we're slightly speeding the clocksource
         * in the code above, it's possible the required corrective factor to
         * xtime_nsec could cause it to underflow.
         *
         * Now, since we have already accumulated the second and the NTP
         * subsystem has been notified via second_overflow(), we need to skip
         * the next update.
         */
        if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
                /* Borrow one (shifted) second back from xtime_sec */
                tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC <<
                                                        tk->tkr_mono.shift;
                tk->xtime_sec--;
                tk->skip_second_overflow = 1;
        }
}

/*
 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
 *
 * Helper function that accumulates the nsecs greater than a second
 * from the xtime_nsec field to the xtime_secs field.
 * It also calls into the NTP code to handle leapsecond processing.
 */
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
        u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
        unsigned int clock_set = 0;

        while (tk->tkr_mono.xtime_nsec >= nsecps) {
                int leap;

                tk->tkr_mono.xtime_nsec -= nsecps;
                tk->xtime_sec++;

                /*
                 * Skip NTP update if this second was accumulated before,
                 * i.e. xtime_nsec underflowed in timekeeping_adjust()
                 */
                if (unlikely(tk->skip_second_overflow)) {
                        tk->skip_second_overflow = 0;
                        continue;
                }

                /* Figure out if its a leap sec and apply if needed */
                leap = second_overflow(tk->xtime_sec);
                if (unlikely(leap)) {
                        struct timespec64 ts;

                        tk->xtime_sec += leap;

                        ts.tv_sec = leap;
                        ts.tv_nsec = 0;
                        tk_set_wall_to_mono(tk,
                                timespec64_sub(tk->wall_to_monotonic, ts));

                        __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);

                        clock_set = TK_CLOCK_WAS_SET;
                }
        }
        return clock_set;
}

/*
 * logarithmic_accumulation - shifted accumulation of cycles
 * @tk:        timekeeper to accumulate into
 * @offset:    cycles that have not been accumulated yet
 * @shift:     log2 of the number of cycle_intervals consumed in one go
 * @clock_set: flags returned by accumulate_nsecs_to_secs() are OR-ed in
 *
 * This function accumulates a shifted interval of cycles into
 * a shifted interval nanoseconds. Allows for O(log) accumulation
 * loop.
 *
 * Returns the unconsumed cycles.
 */
static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
                                    u32 shift, unsigned int *clock_set)
{
        u64 interval = tk->cycle_interval << shift;
        u64 snsec_per_sec;

        /* If the offset is smaller than a shifted interval, do nothing */
        if (offset < interval)
                return offset;

        /* Accumulate one shifted interval */
        offset -= interval;
        tk->tkr_mono.cycle_last += interval;
        tk->tkr_raw.cycle_last  += interval;

        /* Advance the mono clock and fold full seconds into xtime_sec */
        tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
        *clock_set |= accumulate_nsecs_to_secs(tk);

        /* Accumulate raw time */
        tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
        snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
        while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
                tk->tkr_raw.xtime_nsec -= snsec_per_sec;
                tk->raw_sec++;
        }

        /* Accumulate error between NTP and clock interval */
        tk->ntp_error += tk->ntp_tick << shift;
        tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
                                                (tk->ntp_error_shift + shift);

        return offset;
}

/*
 * timekeeping_advance - Updates the timekeeper to the current time and
 * current NTP tick length
 * @mode: with TK_ADV_TICK the update is skipped when less than one
 *        cycle_interval has elapsed; any other mode always runs it
 *
 * All work is done on the shadow timekeeper under timekeeper_lock; the
 * result is copied over the real timekeeper inside the tk_core.seq write
 * section.
 *
 * Returns true if any clock_set flag was raised while accumulating,
 * false otherwise (including when the update was skipped).
 */
static bool timekeeping_advance(enum timekeeping_adv_mode mode)
{
        struct timekeeper *real_tk = &tk_core.timekeeper;
        struct timekeeper *tk = &shadow_timekeeper;
        u64 offset;
        int shift = 0, maxshift;
        unsigned int clock_set = 0;
        unsigned long flags;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);

        /* Make sure we're fully resumed: */
        if (unlikely(timekeeping_suspended))
                goto out;

        /* Cycles elapsed since the last accumulation */
        offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
                                   tk->tkr_mono.cycle_last, tk->tkr_mono.mask);

        /* Check if there's really nothing to do */
        if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
                goto out;

        /* Do some additional sanity checking */
        timekeeping_check_update(tk, offset);

        /*
         * With NO_HZ we may have to accumulate many cycle_intervals
         * (think "ticks") worth of time at once. To do this efficiently,
         * we calculate the largest doubling multiple of cycle_intervals
         * that is smaller than the offset.  We then accumulate that
         * chunk in one go, and then try to consume the next smaller
         * doubled multiple.
         */
        shift = ilog2(offset) - ilog2(tk->cycle_interval);
        shift = max(0, shift);
        /* Bound shift to one less than what overflows tick_length */
        maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
        shift = min(shift, maxshift);
        while (offset >= tk->cycle_interval) {
                offset = logarithmic_accumulation(tk, offset, shift,
                                                        &clock_set);
                /* Step down to the next smaller chunk once this one no longer fits */
                if (offset < tk->cycle_interval<<shift)
                        shift--;
        }

        /* Adjust the multiplier to correct NTP error */
        timekeeping_adjust(tk, offset);

        /*
         * Finally, make sure that after the rounding
         * xtime_nsec isn't larger than NSEC_PER_SEC
         */
        clock_set |= accumulate_nsecs_to_secs(tk);

        write_seqcount_begin(&tk_core.seq);
        /*
         * Update the real timekeeper.
         *
         * We could avoid this memcpy by switching pointers, but that
         * requires changes to all other timekeeper usage sites as
         * well, i.e. move the timekeeper pointer getter into the
         * spinlocked/seqcount protected sections. And we trade this
         * memcpy under the tk_core.seq against one before we start
         * updating.
         */
        timekeeping_update(tk, clock_set);
        memcpy(real_tk, tk, sizeof(*tk));
        /* The memcpy must come last. Do not put anything here! */
        write_seqcount_end(&tk_core.seq);
out:
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        return !!clock_set;
}

/**
 * update_wall_time - Uses the current clocksource to increment the wall time
 *
 */
void update_wall_time(void)
{
        if (timekeeping_advance(TK_ADV_TICK))
                clock_was_set_delayed();
}

/**
 * getboottime64 - Return the real time of system boot.
 * @ts:                pointer to the timespec64 to be set
 *
 * Stores the wall-time of boot, computed as offs_real minus offs_boot of
 * the core timekeeper, in @ts.
 *
 * Since this is derived from the wall_to_monotonic offset and the total
 * suspend time, calls to settimeofday will affect the value returned
 * (which basically means that however wrong your real time clock is at
 * boot time, you get the right time here).
 */
void getboottime64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        *ts = ktime_to_timespec64(ktime_sub(tk->offs_real, tk->offs_boot));
}
EXPORT_SYMBOL_GPL(getboottime64);

/*
 * Store the timekeeper's current xtime in @ts without reading the
 * clocksource. The read is repeated until it is consistent with respect
 * to tk_core.seq.
 */
void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
        struct timekeeper *core_tk = &tk_core.timekeeper;
        unsigned int seq_start;

        do {
                seq_start = read_seqcount_begin(&tk_core.seq);
                *ts = tk_xtime(core_tk);
        } while (read_seqcount_retry(&tk_core.seq, seq_start));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);

void ktime_get_coarse_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 now, mono;
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                now = tk_xtime(tk);
                mono = tk->wall_to_monotonic;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
                                now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);

/*
 * Advance jiffies_64 by @ticks and then call calc_global_load().
 *
 * Must hold jiffies_lock
 */
void do_timer(unsigned long ticks)
{
        jiffies_64 += ticks;
        calc_global_load();
}

/**
 * ktime_get_update_offsets_now - hrtimer helper
 * @cwsseq:        pointer to check and store the clock was set sequence number
 * @offs_real:        pointer to storage for monotonic -> realtime offset
 * @offs_boot:        pointer to storage for monotonic -> boottime offset
 * @offs_tai:        pointer to storage for monotonic -> clock tai offset
 *
 * Returns current monotonic time and updates the offsets if the
 * sequence number in @cwsseq and timekeeper.clock_was_set_seq are
 * different.
 *
 * Independent of the sequence number, @offs_real is overwritten with
 * offs_real minus one second once the current monotonic time has reached
 * timekeeper.next_leap_ktime.
 *
 * Called from hrtimer_interrupt() or retrigger_next_event()
 */
ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
                                     ktime_t *offs_boot, ktime_t *offs_tai)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t base;
        u64 nsecs;

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                /* Current monotonic time: base plus ns since last update */
                base = tk->tkr_mono.base;
                nsecs = timekeeping_get_ns(&tk->tkr_mono);
                base = ktime_add_ns(base, nsecs);

                /* Refresh the caller's offsets only if the clock was set */
                if (*cwsseq != tk->clock_was_set_seq) {
                        *cwsseq = tk->clock_was_set_seq;
                        *offs_real = tk->offs_real;
                        *offs_boot = tk->offs_boot;
                        *offs_tai = tk->offs_tai;
                }

                /* Handle leapsecond insertion adjustments */
                if (unlikely(base >= tk->next_leap_ktime))
                        *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return base;
}

/*
 * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
 * @txc: timex request to check
 *
 * Returns 0 if the request is acceptable, -EPERM if the caller lacks
 * CAP_SYS_TIME for what is asked, or -EINVAL if a field is out of range.
 */
static int timekeeping_validate_timex(const struct __kernel_timex *txc)
{
        if (!(txc->modes & ADJ_ADJTIME)) {
                /* In order to modify anything, you gotta be super-user! */
                if (txc->modes && !capable(CAP_SYS_TIME))
                        return -EPERM;

                /*
                 * if the quartz is off by more than 10% then
                 * something is VERY wrong!
                 */
                if ((txc->modes & ADJ_TICK) &&
                    (txc->tick <  900000/USER_HZ ||
                     txc->tick > 1100000/USER_HZ))
                        return -EINVAL;
        } else {
                /* ADJ_ADJTIME is only accepted together with singleshot bits */
                if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
                        return -EINVAL;

                /* Anything but a read-only query needs CAP_SYS_TIME */
                if (!(txc->modes & ADJ_OFFSET_READONLY) &&
                    !capable(CAP_SYS_TIME))
                        return -EPERM;
        }

        if (txc->modes & ADJ_SETOFFSET) {
                /*
                 * Upper bound for the sub-second field: it carries
                 * nanoseconds with ADJ_NANO, microseconds otherwise.
                 */
                const long long frac_limit = (txc->modes & ADJ_NANO) ?
                                             NSEC_PER_SEC : USEC_PER_SEC;

                /* In order to inject time, you gotta be super-user! */
                if (!capable(CAP_SYS_TIME))
                        return -EPERM;

                /*
                 * Offsets can be positive or negative, so tv_sec is not
                 * checked. The value of the timeval/timespec is the sum of
                 * its fields, but *NOTE*: tv_usec/tv_nsec must always be
                 * non-negative and must stay below one second.
                 */
                if (txc->time.tv_usec < 0 || txc->time.tv_usec >= frac_limit)
                        return -EINVAL;
        }

        /*
         * Check for potential multiplication overflows that can
         * only happen on 64-bit systems:
         */
        if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
                if (txc->freq < LLONG_MIN / PPM_SCALE ||
                    txc->freq > LLONG_MAX / PPM_SCALE)
                        return -EINVAL;
        }

        return 0;
}

/**
 * random_get_entropy_fallback - Returns the raw clock source value,
 * used by random.c for platforms with no valid random_get_entropy().
 *
 * Returns 0 while timekeeping is suspended or when no clocksource is
 * installed in the core timekeeper.
 */
unsigned long random_get_entropy_fallback(void)
{
        struct clocksource *cs = READ_ONCE(tk_core.timekeeper.tkr_mono.clock);

        if (likely(!timekeeping_suspended && cs))
                return cs->read(cs);

        return 0;
}
EXPORT_SYMBOL_GPL(random_get_entropy_fallback);

/**
 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
 * @txc: timex request; validated first, then handed to __do_adjtimex()
 *
 * Return: the error from timekeeping_validate_timex() or
 * timekeeping_inject_offset() if either fails, otherwise the value
 * returned by __do_adjtimex().
 */
int do_adjtimex(struct __kernel_timex *txc)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct audit_ntp_data ad;
        bool clock_set = false;
        struct timespec64 ts;
        unsigned long flags;
        s32 orig_tai, tai;
        int ret;

        /* Validate the data before disabling interrupts */
        ret = timekeeping_validate_timex(txc);
        if (ret)
                return ret;
        add_device_randomness(txc, sizeof(*txc));

        if (txc->modes & ADJ_SETOFFSET) {
                struct timespec64 delta;
                delta.tv_sec  = txc->time.tv_sec;
                delta.tv_nsec = txc->time.tv_usec;
                /* Without ADJ_NANO the field holds microseconds */
                if (!(txc->modes & ADJ_NANO))
                        delta.tv_nsec *= 1000;
                ret = timekeeping_inject_offset(&delta);
                if (ret)
                        return ret;

                audit_tk_injoffset(delta);
        }

        audit_ntp_init(&ad);

        ktime_get_real_ts64(&ts);
        add_device_randomness(&ts, sizeof(ts));

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        orig_tai = tai = tk->tai_offset;
        ret = __do_adjtimex(txc, &ts, &tai, &ad);

        /* Propagate a TAI offset change made by __do_adjtimex() */
        if (tai != orig_tai) {
                __timekeeping_set_tai_offset(tk, tai);
                timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
                clock_set = true;
        }
        tk_update_leap_state(tk);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

        /* Emit the audit record only after the lock has been dropped */
        audit_ntp_log(&ad);

        /* Update the multiplier immediately if frequency was set directly */
        if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
                clock_set |= timekeeping_advance(TK_ADV_FREQ);

        if (clock_set)
                clock_was_set(CLOCK_REALTIME);

        ntp_notify_cmos_timer();

        return ret;
}

#ifdef CONFIG_NTP_PPS
/**
 * hardpps() - Accessor function to NTP __hardpps function
 * @phase_ts: passed through unchanged to __hardpps()
 * @raw_ts:   passed through unchanged to __hardpps()
 *
 * Calls __hardpps() with timekeeper_lock held, interrupts disabled and
 * inside a tk_core.seq write section.
 */
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&timekeeper_lock, flags);
        write_seqcount_begin(&tk_core.seq);

        __hardpps(phase_ts, raw_ts);

        write_seqcount_end(&tk_core.seq);
        raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */










    3 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM printk

#if !defined(_TRACE_PRINTK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PRINTK_H

#include <linux/tracepoint.h>

/*
 * console trace event: records a copy of a text buffer.
 *
 * @text: buffer to record (need not be NUL-terminated; @len bytes are read)
 * @len:  number of bytes in @text
 *
 * The entry reserves len + 1 bytes so the copy can be NUL-terminated. A
 * single trailing '\n' is dropped before copying.
 */
TRACE_EVENT(console,
        TP_PROTO(const char *text, size_t len),

        TP_ARGS(text, len),

        TP_STRUCT__entry(
                __dynamic_array(char, msg, len + 1)
        ),

        TP_fast_assign(
                /*
                 * Each trace entry is printed in a new line.
                 * If the msg finishes with '\n', cut it off
                 * to avoid blank lines in the trace.
                 */
                if ((len > 0) && (text[len-1] == '\n'))
                        len -= 1;

                memcpy(__get_str(msg), text, len);
                __get_str(msg)[len] = 0;
        ),

        TP_printk("%s", __get_str(msg))
);
#endif /* _TRACE_PRINTK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>




















































































    2 
































































    3 

























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_H
#define _LINUX_PID_H

#include <linux/pid_types.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/wait.h>

/*
 * What is struct pid?
 *
 * A struct pid is the kernel's internal notion of a process identifier.
 * It refers to individual tasks, process groups, and sessions.  While
 * there are processes attached to it the struct pid lives in a hash
 * table, so it and then the processes that it refers to can be found
 * quickly from the numeric pid value.  The attached processes may be
 * quickly accessed by following pointers from struct pid.
 *
 * Storing pid_t values in the kernel and referring to them later has a
 * problem.  The process originally with that pid may have exited and the
 * pid allocator wrapped, and another process could have come along
 * and been assigned that pid.
 *
 * Referring to user space processes by holding a reference to struct
 * task_struct has a problem.  When the user space process exits
 * the now useless task_struct is still kept.  A task_struct plus a
 * stack consumes around 10K of low kernel memory.  More precisely
 * this is THREAD_SIZE + sizeof(struct task_struct).  By comparison
 * a struct pid is about 64 bytes.
 *
 * Holding a reference to struct pid solves both of these problems.
 * It is small so holding a reference does not consume a lot of
 * resources, and since a new struct pid is allocated when the numeric pid
 * value is reused (when pids wrap around) we don't mistakenly refer to new
 * processes.
 */


/*
 * struct upid is used to get the id of the struct pid, as it is
 * seen in particular namespace. Later the struct pid is found with
 * find_pid_ns() using the int nr and struct pid_namespace *ns.
 */

#define RESERVED_PIDS 300

/* One (number, namespace) pair under which a struct pid is visible. */
struct upid {
        int nr;                         /* numeric id as seen in @ns */
        struct pid_namespace *ns;       /* namespace @nr belongs to */
};

/*
 * The kernel's internal notion of a process identifier; see the
 * "What is struct pid?" comment at the top of this file.
 */
struct pid
{
        /* reference count; get_pid() increments it, put_pid() is its counterpart */
        refcount_t count;
        /* index of the numbers[] entry used by ns_of_pid() and is_child_reaper() */
        unsigned int level;
        spinlock_t lock;
        struct dentry *stashed;
        u64 ino;
        /* lists of tasks that use this pid */
        struct hlist_head tasks[PIDTYPE_MAX];
        struct hlist_head inodes;
        /* wait queue for pidfd notifications */
        wait_queue_head_t wait_pidfd;
        struct rcu_head rcu;
        /* ids per namespace; numbers[0].nr is what pid_nr() reports */
        struct upid numbers[];
};

extern struct pid init_struct_pid;

struct file;

struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
void do_notify_pidfd(struct task_struct *task);

/*
 * Take a reference on @pid and return it. A NULL @pid is passed
 * through untouched.
 */
static inline struct pid *get_pid(struct pid *pid)
{
        if (!pid)
                return NULL;

        refcount_inc(&pid->count);
        return pid;
}

extern void put_pid(struct pid *pid);
extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
/* True if at least one task is linked on @pid's list for @type. */
static inline bool pid_has_task(struct pid *pid, enum pid_type type)
{
        const struct hlist_head *head = &pid->tasks[type];

        return !hlist_empty(head);
}
extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);

extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);

/*
 * these helpers must be called with the tasklist_lock write-held.
 */
extern void attach_pid(struct task_struct *task, enum pid_type);
extern void detach_pid(struct task_struct *task, enum pid_type);
extern void change_pid(struct task_struct *task, enum pid_type,
                        struct pid *pid);
extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
                         enum pid_type);

extern int pid_max;
extern int pid_max_min, pid_max_max;

/*
 * look up a PID in the hash table. Must be called with the tasklist_lock
 * or rcu_read_lock() held.
 *
 * find_pid_ns() finds the pid in the namespace specified
 * find_vpid() finds the pid by its virtual id, i.e. in the current namespace
 *
 * see also find_task_by_vpid() set in include/linux/sched.h
 */
extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns);
extern struct pid *find_vpid(int nr);

/*
 * Look up a PID in the hash table, and return with its count elevated.
 */
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);

extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                             size_t set_tid_size);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);

/*
 * ns_of_pid() returns the pid namespace in which the specified pid was
 * allocated, i.e. the namespace stored in numbers[pid->level].
 *
 * NOTE:
 *         ns_of_pid() is meant to be used on a process (task) that has an
 *         attached 'struct pid' (see attach_pid(), detach_pid()), so @pid
 *         is expected to be non-NULL. For a NULL @pid the result is a NULL
 *         pid-ns, which the caller has to handle.
 */
static inline struct pid_namespace *ns_of_pid(struct pid *pid)
{
        return pid ? pid->numbers[pid->level].ns : NULL;
}

/*
 * is_child_reaper returns true if the pid is the init process
 * of the current namespace. As this one could be checked before
 * pid_ns->child_reaper is assigned in copy_process, we check
 * with the pid number.
 */
static inline bool is_child_reaper(struct pid *pid)
{
        return pid->numbers[pid->level].nr == 1;
}

/*
 * the helpers to get the pid's id seen from different namespaces
 *
 * pid_nr()    : global id, i.e. the id seen from the init namespace;
 * pid_vnr()   : virtual id, i.e. the id seen from the pid namespace of
 *               current.
 * pid_nr_ns() : id seen from the ns specified.
 *
 * see also task_xid_nr() etc in include/linux/sched.h
 */

/* Global id of @pid (numbers[0].nr); 0 when @pid is NULL. */
static inline pid_t pid_nr(struct pid *pid)
{
        if (!pid)
                return 0;

        return pid->numbers[0].nr;
}

pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
pid_t pid_vnr(struct pid *pid);

/*
 * do_each_pid_task() / while_each_pid_task() bracket a loop body that runs
 * for the tasks on @pid's list for @type, with @task as the cursor. The
 * opening macro leaves a do { and the loop's { open; they are closed by
 * while_each_pid_task(), so the two must always be used as a pair.
 * Nothing is iterated when @pid is NULL.
 */
#define do_each_pid_task(pid, type, task)                                \
        do {                                                                \
                if ((pid) != NULL)                                        \
                        hlist_for_each_entry_rcu((task),                \
                                &(pid)->tasks[type], pid_links[type]) {

                        /*
                         * Both old and new leaders may be attached to
                         * the same pid in the middle of de_thread().
                         * For PIDTYPE_PID the loop therefore stops after
                         * the first entry.
                         */
#define while_each_pid_task(pid, type, task)                                \
                                if (type == PIDTYPE_PID)                \
                                        break;                                \
                        }                                                \
        } while (0)

/*
 * do_each_pid_thread() / while_each_pid_thread() nest a for_each_thread()
 * walk inside do_each_pid_task(). The outer cursor is saved in tg___ and
 * restored into @task by the closing macro before the outer loop advances.
 */
#define do_each_pid_thread(pid, type, task)                                \
        do_each_pid_task(pid, type, task) {                                \
                struct task_struct *tg___ = task;                        \
                for_each_thread(tg___, task) {

#define while_each_pid_thread(pid, type, task)                                \
                }                                                        \
                task = tg___;                                                \
        } while_each_pid_task(pid, type, task)

/* Return @task's thread_pid pointer; no reference is taken. */
static inline struct pid *task_pid(struct task_struct *task)
{
        return task->thread_pid;
}

/*
 * the helpers to get the task's different pids as they are seen
 * from various namespaces
 *
 * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
 * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
 *                     current.
 * task_xid_nr_ns()  : id seen from the ns specified;
 *
 * see also pid_nr() etc in include/linux/pid.h
 */
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);

/* Global pid of @tsk, read directly from tsk->pid. */
static inline pid_t task_pid_nr(struct task_struct *tsk)
{
        return tsk->pid;
}

/* PIDTYPE_PID id of @tsk as reported by __task_pid_nr_ns() for @ns. */
static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t nr;

        nr = __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
        return nr;
}

/*
 * Virtual PIDTYPE_PID id of @tsk: asks __task_pid_nr_ns() with a NULL
 * namespace instead of an explicit one.
 */
static inline pid_t task_pid_vnr(struct task_struct *tsk)
{
        pid_t vnr;

        vnr = __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
        return vnr;
}


/* Return the thread-group id stored directly in @tsk (its ->tgid field). */
static inline pid_t task_tgid_nr(struct task_struct *tsk)
{
        const pid_t tgid = tsk->tgid;

        return tgid;
}

/**
 * pid_alive - check that a task structure is not stale
 * @p: Task structure to be checked.
 *
 * A task counts as alive (at most a zombie) while its thread_pid pointer is
 * still set.  Once this check fails, pointers inside the task structure may
 * be stale and must not be dereferenced.
 *
 * Return: 1 if the process is alive. 0 otherwise.
 */
static inline int pid_alive(const struct task_struct *p)
{
        if (p->thread_pid == NULL)
                return 0;

        return 1;
}

/* Process-group (PIDTYPE_PGID) id of @tsk as reported for namespace @ns. */
static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t pgrp;

        pgrp = __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
        return pgrp;
}

/* Virtual process-group id of @tsk: PIDTYPE_PGID lookup with a NULL namespace. */
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
{
        pid_t pgrp;

        pgrp = __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
        return pgrp;
}


/* Session (PIDTYPE_SID) id of @tsk as reported for namespace @ns. */
static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t sid;

        sid = __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
        return sid;
}

/* Virtual session id of @tsk: PIDTYPE_SID lookup with a NULL namespace. */
static inline pid_t task_session_vnr(struct task_struct *tsk)
{
        pid_t sid;

        sid = __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
        return sid;
}

/* Thread-group (PIDTYPE_TGID) id of @tsk as reported for namespace @ns. */
static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t tgid;

        tgid = __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
        return tgid;
}

/* Virtual thread-group id of @tsk: PIDTYPE_TGID lookup with a NULL namespace. */
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
        pid_t tgid;

        tgid = __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
        return tgid;
}

/*
 * Thread-group id of @tsk's real_parent as seen from @ns, or 0 when
 * pid_alive() says @tsk is no longer alive.  The parent pointer is read
 * inside an RCU read-side critical section.
 */
static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t ppid;

        rcu_read_lock();
        ppid = pid_alive(tsk) ?
                task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns) : 0;
        rcu_read_unlock();

        return ppid;
}

/* Parent's thread-group id of @tsk, evaluated against init_pid_ns. */
static inline pid_t task_ppid_nr(const struct task_struct *tsk)
{
        pid_t ppid;

        ppid = task_ppid_nr_ns(tsk, &init_pid_ns);
        return ppid;
}

/*
 * Obsolete, do not use.
 * Process-group id of @tsk evaluated against init_pid_ns.
 */
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
{
        pid_t pgrp;

        pgrp = task_pgrp_nr_ns(tsk, &init_pid_ns);
        return pgrp;
}

/**
 * is_global_init - check if a task structure is init. Since init
 * is free to have sub-threads we need to check tgid.
 * @tsk: Task structure to be checked.
 *
 * Check if a task structure is the first user space task the kernel created.
 *
 * Return: 1 if the task structure is init. 0 otherwise.
 */
static inline int is_global_init(struct task_struct *tsk)
{
        return task_tgid_nr(tsk) == 1;
}

#endif /* _LINUX_PID_H */



























































































































































































    3 






    3 
    3 









    3 
    3 








































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2019 Google LLC
 */

/*
 * Refer to Documentation/block/inline-encryption.rst for detailed explanation.
 */

#define pr_fmt(fmt) "blk-crypto: " fmt

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-crypto-profile.h>
#include <linux/module.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>

#include "blk-crypto-internal.h"

/*
 * Per-mode parameters, indexed by BLK_ENCRYPTION_MODE_* value.
 *
 * keysize and ivsize are byte counts: blk_crypto_init_key() copies keysize
 * bytes of raw key material and rejects a dun_bytes larger than ivsize.
 * Slots without an explicit initializer stay zero-filled, and
 * blk_crypto_init_key() refuses them because their keysize is 0.
 * cipher_str is presumably the crypto API algorithm name used by the
 * fallback; its consumer is not in this file.
 */
const struct blk_crypto_mode blk_crypto_modes[] = {
        [BLK_ENCRYPTION_MODE_AES_256_XTS] = {
                .name = "AES-256-XTS",
                .cipher_str = "xts(aes)",
                .keysize = 64,
                .ivsize = 16,
        },
        [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = {
                .name = "AES-128-CBC-ESSIV",
                .cipher_str = "essiv(cbc(aes),sha256)",
                .keysize = 16,
                .ivsize = 16,
        },
        [BLK_ENCRYPTION_MODE_ADIANTUM] = {
                .name = "Adiantum",
                .cipher_str = "adiantum(xchacha12,aes)",
                .keysize = 32,
                .ivsize = 32,
        },
        [BLK_ENCRYPTION_MODE_SM4_XTS] = {
                .name = "SM4-XTS",
                .cipher_str = "xts(sm4)",
                .keysize = 32,
                .ivsize = 16,
        },
};

/*
 * This number needs to be at least (the number of threads doing IO
 * concurrently) * (maximum recursive depth of a bio), so that we don't
 * deadlock on crypt_ctx allocations. The default is chosen to be the same
 * as the default number of post read contexts in both EXT4 and F2FS.
 */
static int num_prealloc_crypt_ctxs = 128;

/* Registered as a module parameter with permission bits 0444. */
module_param(num_prealloc_crypt_ctxs, int, 0444);
MODULE_PARM_DESC(num_prealloc_crypt_ctxs,
                "Number of bio crypto contexts to preallocate");

/* Slab cache for struct bio_crypt_ctx; created in bio_crypt_ctx_init(). */
static struct kmem_cache *bio_crypt_ctx_cache;
/*
 * Mempool layered on bio_crypt_ctx_cache and sized by
 * num_prealloc_crypt_ctxs.  Every crypt context in this file is allocated
 * from, and freed back to, this pool.
 */
static mempool_t *bio_crypt_ctx_pool;

/*
 * Create the bio_crypt_ctx slab cache and the mempool on top of it, then
 * sanity-check the blk_crypto_modes[] table.  Panics if either allocation
 * fails.  Registered as a subsys_initcall.
 */
static int __init bio_crypt_ctx_init(void)
{
        size_t mode_num = 0;

        bio_crypt_ctx_cache = KMEM_CACHE(bio_crypt_ctx, 0);
        if (!bio_crypt_ctx_cache)
                panic("Failed to allocate mem for bio crypt ctxs\n");

        bio_crypt_ctx_pool = mempool_create_slab_pool(num_prealloc_crypt_ctxs,
                                                      bio_crypt_ctx_cache);
        if (!bio_crypt_ctx_pool)
                panic("Failed to allocate mem for bio crypt ctxs\n");

        /* Various places assume the invalid mode is numerically zero. */
        BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);

        /* No mode may exceed the global key / IV size limits. */
        while (mode_num < BLK_ENCRYPTION_MODE_MAX) {
                BUG_ON(blk_crypto_modes[mode_num].keysize >
                       BLK_CRYPTO_MAX_KEY_SIZE);
                BUG_ON(blk_crypto_modes[mode_num].ivsize >
                       BLK_CRYPTO_MAX_IV_SIZE);
                mode_num++;
        }

        return 0;
}
subsys_initcall(bio_crypt_ctx_init);

/*
 * Allocate a crypt context from the pool, fill it with @key and a copy of
 * @dun, and attach it to @bio.
 */
void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key,
                       const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask)
{
        struct bio_crypt_ctx *ctx;

        /*
         * mempool_alloc() is only guaranteed to succeed when the caller's
         * gfp_mask includes __GFP_DIRECT_RECLAIM; warn once if it doesn't.
         */
        WARN_ON_ONCE(!(gfp_mask & __GFP_DIRECT_RECLAIM));

        ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);

        memcpy(ctx->bc_dun, dun, sizeof(ctx->bc_dun));
        ctx->bc_key = key;

        bio->bi_crypt_context = ctx;
}

void __bio_crypt_free_ctx(struct bio *bio)
{
        mempool_free(bio->bi_crypt_context, bio_crypt_ctx_pool);
        bio->bi_crypt_context = NULL;
}

/*
 * Give @dst its own pool-allocated copy of @src's crypt context.
 * Returns 0 on success or -ENOMEM if the pool allocation fails.
 */
int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask)
{
        dst->bi_crypt_context = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);
        if (dst->bi_crypt_context) {
                *dst->bi_crypt_context = *src->bi_crypt_context;
                return 0;
        }

        return -ENOMEM;
}

/*
 * Add @inc to @dun, where @dun is a multi-limb integer with limb 0 the least
 * significant.  Stops as soon as there is nothing left to carry.
 */
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
                             unsigned int inc)
{
        int limb = 0;

        while (inc && limb < BLK_CRYPTO_DUN_ARRAY_SIZE) {
                dun[limb] += inc;
                /*
                 * A wrapped sum is smaller than the addend; in that case
                 * carry 1 into the next limb, otherwise carry nothing.
                 */
                inc = (dun[limb] < inc) ? 1 : 0;
                limb++;
        }
}

/*
 * Advance @bio's DUN by @bytes converted to data units (a right shift by
 * the key's data_unit_size_bits).
 */
void __bio_crypt_advance(struct bio *bio, unsigned int bytes)
{
        struct bio_crypt_ctx *ctx = bio->bi_crypt_context;
        unsigned int data_units = bytes >> ctx->bc_key->data_unit_size_bits;

        bio_crypt_dun_increment(ctx->bc_dun, data_units);
}

/*
 * Multi-limb comparison: returns true when @bc->bc_dun advanced by @bytes
 * (converted to data units) equals @next_dun exactly.
 */
bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc,
                                 unsigned int bytes,
                                 const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE])
{
        unsigned int carry = bytes >> bc->bc_key->data_unit_size_bits;
        int limb;

        for (limb = 0; limb < BLK_CRYPTO_DUN_ARRAY_SIZE; limb++) {
                const u64 sum = bc->bc_dun[limb] + carry;

                if (sum != next_dun[limb])
                        return false;
                /*
                 * A wrapped sum is smaller than what was added; propagate a
                 * carry of 1 into the next limb in that case, else 0.
                 */
                carry = (sum < carry) ? 1 : 0;
        }

        /* A DUN that wrapped all the way through 0 is not contiguous. */
        return !carry;
}

/*
 * Two crypt contexts are compatible when both are absent, or both are
 * present and share the same key.  DUN continuity is not checked here.
 */
static bool bio_crypt_ctx_compatible(struct bio_crypt_ctx *bc1,
                                     struct bio_crypt_ctx *bc2)
{
        if (!bc1 || !bc2)
                return !bc1 && !bc2;

        return bc1->bc_key == bc2->bc_key;
}

/* Compatibility check between a request's crypt context and a bio's. */
bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio)
{
        struct bio_crypt_ctx *rq_ctx = rq->crypt_ctx;
        struct bio_crypt_ctx *bio_ctx = bio->bi_crypt_context;

        return bio_crypt_ctx_compatible(rq_ctx, bio_ctx);
}

/*
 * Returns true when @bc1 followed by @bc2 can be merged: the contexts must
 * be compatible and, if they exist, @bc2's DUN must directly follow @bc1's
 * DUN advanced by @bc1_bytes.
 */
bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes,
                             struct bio_crypt_ctx *bc2)
{
        if (!bio_crypt_ctx_compatible(bc1, bc2))
                return false;

        /* Compatible with @bc1 absent means neither side is encrypted. */
        if (!bc1)
                return true;

        return bio_crypt_dun_is_contiguous(bc1, bc1_bytes, bc2->bc_dun);
}

/*
 * Returns true only if every segment of @bio has both its length and its
 * offset aligned to the key's data unit size.
 */
static bool bio_crypt_check_alignment(struct bio *bio)
{
        const unsigned int data_unit_size =
                bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size;
        struct bvec_iter iter;
        struct bio_vec bv;
        bool aligned = true;

        bio_for_each_segment(bv, bio, iter) {
                if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size)) {
                        aligned = false;
                        break;
                }
        }

        return aligned;
}

/*
 * Ask the queue's crypto profile for a keyslot for @rq's key; the slot is
 * stored in rq->crypt_keyslot.  Returns blk_crypto_get_keyslot()'s status.
 */
blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq)
{
        blk_status_t status;

        status = blk_crypto_get_keyslot(rq->q->crypto_profile,
                                        rq->crypt_ctx->bc_key,
                                        &rq->crypt_keyslot);
        return status;
}

/* Release @rq's keyslot and clear the request's reference to it. */
void __blk_crypto_rq_put_keyslot(struct request *rq)
{
        struct blk_crypto_keyslot *slot = rq->crypt_keyslot;

        blk_crypto_put_keyslot(slot);
        rq->crypt_keyslot = NULL;
}

void __blk_crypto_free_request(struct request *rq)
{
        /* The keyslot, if one was needed, should have been released earlier. */
        if (WARN_ON_ONCE(rq->crypt_keyslot))
                __blk_crypto_rq_put_keyslot(rq);

        mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
        rq->crypt_ctx = NULL;
}

/**
 * __blk_crypto_bio_prep - Prepare bio for inline encryption
 *
 * @bio_ptr: pointer to original bio pointer
 *
 * Nothing is done when the underlying device's inline encryption hardware
 * supports the bio's crypt context.
 *
 * Otherwise en/decryption is attempted through the kernel crypto API
 * fallback.  When the fallback encrypts, blk-crypto may split the bio in
 * two: the first part continues to be processed and the second part is
 * resubmitted via submit_bio_noacct.  A bounce bio is allocated to hold the
 * encrypted contents of the first part, and *bio_ptr is updated to point at
 * that bounce bio.
 *
 * Caller must ensure bio has bio_crypt_ctx.
 *
 * Return: true on success; false on error (and bio->bi_status will be set
 *           appropriately, and bio_endio() will have been called so bio
 *           submission should abort).
 */
bool __blk_crypto_bio_prep(struct bio **bio_ptr)
{
        struct bio *bio = *bio_ptr;
        const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;

        /* A bio without data, or with misaligned segments, is an I/O error. */
        if (WARN_ON_ONCE(!bio_has_data(bio)) ||
            !bio_crypt_check_alignment(bio)) {
                bio->bi_status = BLK_STS_IOERR;
                goto fail;
        }

        /*
         * Done if the device handles this crypto config itself, or if the
         * crypto API fallback managed to take over.
         */
        if (blk_crypto_config_supported_natively(bio->bi_bdev,
                                                 &bc_key->crypto_cfg) ||
            blk_crypto_fallback_bio_prep(bio_ptr))
                return true;
fail:
        bio_endio(*bio_ptr);
        return false;
}

/*
 * Copy @bio's crypt context into @rq, allocating the request's context from
 * the pool first if it has none.  Returns 0, or -ENOMEM if that allocation
 * fails.
 */
int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
                             gfp_t gfp_mask)
{
        if (!rq->crypt_ctx)
                rq->crypt_ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);
        if (!rq->crypt_ctx)
                return -ENOMEM;

        *rq->crypt_ctx = *bio->bi_crypt_context;
        return 0;
}

/**
 * blk_crypto_init_key() - Prepare a key for use with blk-crypto
 * @blk_key: Pointer to the blk_crypto_key to initialize.
 * @raw_key: Pointer to the raw key. Must be the correct length for the chosen
 *             @crypto_mode; see blk_crypto_modes[].
 * @crypto_mode: identifier for the encryption algorithm to use
 * @dun_bytes: number of bytes that will be used to specify the DUN when this
 *               key is used
 * @data_unit_size: the data unit size to use for en/decryption
 *
 * @blk_key is zeroed before any validation, so it is left cleared when
 * -EINVAL is returned.
 *
 * Return: 0 on success, -errno on failure.  The caller is responsible for
 *           zeroizing both blk_key and raw_key when done with them.
 */
int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
                        enum blk_crypto_mode_num crypto_mode,
                        unsigned int dun_bytes,
                        unsigned int data_unit_size)
{
        const struct blk_crypto_mode *mode;

        memset(blk_key, 0, sizeof(*blk_key));

        /* The mode number must index a slot of the table... */
        if (crypto_mode >= ARRAY_SIZE(blk_crypto_modes))
                return -EINVAL;

        /*
         * ...that slot must be populated, the DUN width must be non-zero and
         * fit in the mode's IV, and the data unit size must be a power of 2.
         */
        mode = &blk_crypto_modes[crypto_mode];
        if (mode->keysize == 0 ||
            dun_bytes == 0 || dun_bytes > mode->ivsize ||
            !is_power_of_2(data_unit_size))
                return -EINVAL;

        blk_key->size = mode->keysize;
        memcpy(blk_key->raw, raw_key, mode->keysize);
        blk_key->data_unit_size_bits = ilog2(data_unit_size);
        blk_key->crypto_cfg.data_unit_size = data_unit_size;
        blk_key->crypto_cfg.dun_bytes = dun_bytes;
        blk_key->crypto_cfg.crypto_mode = crypto_mode;

        return 0;
}

bool blk_crypto_config_supported_natively(struct block_device *bdev,
                                          const struct blk_crypto_config *cfg)
{
        return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile,
                                          cfg);
}

/*
 * Report whether bios using @cfg can be en/decrypted by blk-crypto on @bdev.
 * With CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK enabled this is always true
 * and the device is not consulted; otherwise the answer is whatever the
 * device's native support check says.
 */
bool blk_crypto_config_supported(struct block_device *bdev,
                                 const struct blk_crypto_config *cfg)
{
        if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK))
                return true;

        return blk_crypto_config_supported_natively(bdev, cfg);
}

/**
 * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device
 * @bdev: block device to operate on
 * @key: A key to use on the device
 *
 * Upper layers must call this function to ensure that either the hardware
 * supports the key's crypto settings, or the crypto API fallback has
 * transforms for the needed mode allocated and ready to go.  Because this
 * may allocate an skcipher, it *should not* be called from the data path:
 * doing so might cause a deadlock.
 *
 * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and
 *           blk-crypto-fallback is either disabled or the needed algorithm
 *           is disabled in the crypto API; or another -errno code.
 */
int blk_crypto_start_using_key(struct block_device *bdev,
                               const struct blk_crypto_key *key)
{
        /* Without native support, the fallback has to set up the mode. */
        if (!blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
                return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);

        return 0;
}

/**
 * blk_crypto_evict_key() - Evict a blk_crypto_key from a block_device
 * @bdev: a block_device on which I/O using the key may have been done
 * @key: the key to evict
 *
 * For a given block_device, this function removes the given blk_crypto_key from
 * the keyslot management structures and evicts it from any underlying hardware
 * keyslot(s) or blk-crypto-fallback keyslot it may have been programmed into.
 *
 * Upper layers must call this before freeing the blk_crypto_key.  It must be
 * called for every block_device the key may have been used on.  The key must no
 * longer be in use by any I/O when this function is called.
 *
 * Context: May sleep.
 */
void blk_crypto_evict_key(struct block_device *bdev,
                          const struct blk_crypto_key *key)
{
        struct request_queue *q = bdev_get_queue(bdev);
        const bool native =
                blk_crypto_config_supported_natively(bdev, &key->crypto_cfg);
        int err;

        err = native ? __blk_crypto_evict_key(q->crypto_profile, key)
                     : blk_crypto_fallback_evict_key(key);
        if (!err)
                return;

        /*
         * Eviction only fails when a keyslot could not be cleared (hardware
         * or driver issue) or the key is allegedly still in use by I/O
         * (kernel bug).  Either way the key has already been unlinked from
         * the keyslot management structures and the caller is expected to
         * free it right away, so there is nothing for callers to handle:
         * log the error and return void.
         */
        pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err);
}
EXPORT_SYMBOL_GPL(blk_crypto_evict_key);



















































































    1 

    1 











































































































































































































    1 
















    5 
    1 










































    5 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Berkeley style UIO structures        -        Alan Cox 1994.
 */
#ifndef __LINUX_UIO_H
#define __LINUX_UIO_H

#include <linux/kernel.h>
#include <linux/thread_info.h>
#include <linux/mm_types.h>
#include <uapi/linux/uio.h>

struct page;

typedef unsigned int __bitwise iov_iter_extraction_t;

/*
 * One (base, length) segment of non-userland memory.  struct iov_iter
 * carries a pointer to these in its kvec member.
 */
struct kvec {
        void *iov_base; /* and that should *never* hold a userland pointer */
        size_t iov_len; /* length paired with iov_base */
};

/*
 * Discriminator stored in iov_iter.iter_type and returned by
 * iov_iter_type().  Each value has a matching predicate (iter_is_ubuf(),
 * iter_is_iovec(), iov_iter_is_kvec(), ...); ITER_UBUF and ITER_IOVEC are the
 * two types user_backed_iter() treats as user-backed.
 */
enum iter_type {
        /* iter types */
        ITER_UBUF,
        ITER_IOVEC,
        ITER_BVEC,
        ITER_KVEC,
        ITER_XARRAY,
        ITER_DISCARD,
};

#define ITER_SOURCE        1        // == WRITE
#define ITER_DEST        0        // == READ

/*
 * Snapshot of the iov_iter fields that iov_iter_save_state() records:
 * iov_offset, count and nr_segs.
 */
struct iov_iter_state {
        size_t iov_offset;
        size_t count;
        unsigned long nr_segs;
};

/*
 * Iterator over a buffer whose backing store is selected by iter_type.
 * Field layout matters: the unions below deliberately overlay each other.
 */
struct iov_iter {
        u8 iter_type;           /* an enum iter_type value; see iov_iter_type() */
        bool nofault;
        bool data_source;       /* iov_iter_rw(): true -> WRITE, false -> READ */
        size_t iov_offset;      /* offset applied by iter_iov_addr()/iter_iov_len() */
        /*
         * Hack alert: overlay ubuf_iovec with iovec + count, so
         * that the members resolve correctly regardless of the type
         * of iterator used. This means that you can use:
         *
         * &iter->__ubuf_iovec or iter->__iov
         *
         * interchangeably for the user_backed cases, hence simplifying
         * some of the cases that need to deal with both.
         */
        union {
                /*
                 * This really should be a const, but we cannot do that without
                 * also modifying any of the zero-filling iter init functions.
                 * Leave it non-const for now, but it should be treated as such.
                 */
                struct iovec __ubuf_iovec;
                struct {
                        union {
                                /* use iter_iov() to get the current vec */
                                const struct iovec *__iov;
                                const struct kvec *kvec;
                                const struct bio_vec *bvec;
                                struct xarray *xarray;
                                void __user *ubuf;
                        };
                        /* amount of data in the iterator; see iov_iter_count() */
                        size_t count;
                };
        };
        union {
                unsigned long nr_segs;
                loff_t xarray_start;
        };
};

/*
 * Return the iovec to use for @iter: the embedded __ubuf_iovec for an
 * ITER_UBUF iterator, otherwise the __iov pointer.
 */
static inline const struct iovec *iter_iov(const struct iov_iter *iter)
{
        return iter->iter_type == ITER_UBUF ?
                (const struct iovec *) &iter->__ubuf_iovec : iter->__iov;
}

/*
 * Base address of, and length remaining in, the iovec returned by
 * iter_iov(), each adjusted by the iterator's iov_offset.  Both macros
 * evaluate @iter twice.
 */
#define iter_iov_addr(iter)        (iter_iov(iter)->iov_base + (iter)->iov_offset)
#define iter_iov_len(iter)        (iter_iov(iter)->iov_len - (iter)->iov_offset)

static inline enum iter_type iov_iter_type(const struct iov_iter *i)
{
        return i->iter_type;
}

/* Record @iter's nr_segs, count and iov_offset into @state. */
static inline void iov_iter_save_state(struct iov_iter *iter,
                                       struct iov_iter_state *state)
{
        state->nr_segs = iter->nr_segs;
        state->count = iter->count;
        state->iov_offset = iter->iov_offset;
}

/* True when @i is an ITER_UBUF iterator. */
static inline bool iter_is_ubuf(const struct iov_iter *i)
{
        if (iov_iter_type(i) != ITER_UBUF)
                return false;

        return true;
}

/* True when @i is an ITER_IOVEC iterator. */
static inline bool iter_is_iovec(const struct iov_iter *i)
{
        if (iov_iter_type(i) != ITER_IOVEC)
                return false;

        return true;
}

/* True when @i is an ITER_KVEC iterator. */
static inline bool iov_iter_is_kvec(const struct iov_iter *i)
{
        if (iov_iter_type(i) != ITER_KVEC)
                return false;

        return true;
}

/* True when @i is an ITER_BVEC iterator. */
static inline bool iov_iter_is_bvec(const struct iov_iter *i)
{
        if (iov_iter_type(i) != ITER_BVEC)
                return false;

        return true;
}

/* True when @i is an ITER_DISCARD iterator. */
static inline bool iov_iter_is_discard(const struct iov_iter *i)
{
        if (iov_iter_type(i) != ITER_DISCARD)
                return false;

        return true;
}

/* True when @i is an ITER_XARRAY iterator. */
static inline bool iov_iter_is_xarray(const struct iov_iter *i)
{
        if (iov_iter_type(i) != ITER_XARRAY)
                return false;

        return true;
}

/* Map @i's data_source flag to a direction: WRITE when set, READ otherwise. */
static inline unsigned char iov_iter_rw(const struct iov_iter *i)
{
        if (i->data_source)
                return WRITE;

        return READ;
}

/* True for the two user-backed iterator types, ITER_UBUF and ITER_IOVEC. */
static inline bool user_backed_iter(const struct iov_iter *i)
{
        if (iter_is_ubuf(i))
                return true;

        return iter_is_iovec(i);
}

/*
 * Sum the iov_len of the first @nr_segs entries of @iov.
 *
 * NOTE: do not call this before every segment length has been validated;
 * the individual lengths can overflow a size_t when added together.
 */
static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
{
        size_t total = 0;

        while (nr_segs--)
                total += (iov++)->iov_len;

        return total;
}

size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
                                  size_t bytes, struct iov_iter *i);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
size_t iov_iter_single_seg_count(const struct iov_iter *i);
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);

size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);

/* Folio wrapper: forwards to copy_page_to_iter() using the folio's ->page. */
static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
                size_t bytes, struct iov_iter *i)
{
        struct page *page = &folio->page;

        return copy_page_to_iter(page, offset, bytes, i);
}

/*
 * Folio wrapper: forwards to copy_page_from_iter_atomic() using the folio's
 * ->page.
 */
static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
                size_t offset, size_t bytes, struct iov_iter *i)
{
        struct page *page = &folio->page;

        return copy_page_from_iter_atomic(page, offset, bytes, i);
}

size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
                                 size_t bytes, struct iov_iter *i);

/*
 * Copy @bytes from @addr into @i via _copy_to_iter(), but only after
 * check_copy_size() accepts the request; returns 0 if it is rejected.
 */
static __always_inline __must_check
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        if (!check_copy_size(addr, bytes, true))
                return 0;

        return _copy_to_iter(addr, bytes, i);
}

/*
 * Copy @bytes from @i into @addr via _copy_from_iter(), but only after
 * check_copy_size() accepts the request; returns 0 if it is rejected.
 */
static __always_inline __must_check
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        if (!check_copy_size(addr, bytes, false))
                return 0;

        return _copy_from_iter(addr, bytes, i);
}

/*
 * All-or-nothing copy_to_iter(): returns true only when all @bytes were
 * copied.  After a short copy the iterator is reverted by the amount that
 * was copied and false is returned.
 */
static __always_inline __must_check
bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i)
{
        const size_t done = copy_to_iter(addr, bytes, i);
        const bool complete = likely(done == bytes);

        if (!complete)
                iov_iter_revert(i, done);

        return complete;
}

/*
 * All-or-nothing copy_from_iter(): returns true only when all @bytes were
 * copied.  After a short copy the iterator is reverted by the amount that
 * was copied and false is returned.
 */
static __always_inline __must_check
bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
{
        const size_t done = copy_from_iter(addr, bytes, i);
        const bool complete = likely(done == bytes);

        if (!complete)
                iov_iter_revert(i, done);

        return complete;
}

/*
 * Same gatekeeping as copy_from_iter(), but the copy itself is done by
 * _copy_from_iter_nocache(); returns 0 if check_copy_size() rejects it.
 */
static __always_inline __must_check
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        if (!check_copy_size(addr, bytes, false))
                return 0;

        return _copy_from_iter_nocache(addr, bytes, i);
}

/*
 * All-or-nothing copy_from_iter_nocache(): returns true only when all
 * @bytes were copied.  After a short copy the iterator is reverted by the
 * amount that was copied and false is returned.
 */
static __always_inline __must_check
bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        const size_t done = copy_from_iter_nocache(addr, bytes, i);
        const bool complete = likely(done == bytes);

        if (!complete)
                iov_iter_revert(i, done);

        return complete;
}

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/*
 * Note, users like pmem that depend on the stricter semantics of
 * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for
 * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the
 * destination is flushed from the cache on return.
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
#else
#define _copy_from_iter_flushcache _copy_from_iter_nocache
#endif

#ifdef CONFIG_ARCH_HAS_COPY_MC
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
#else
#define _copy_mc_to_iter _copy_to_iter
#endif

size_t iov_iter_zero(size_t bytes, struct iov_iter *);
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
                        unsigned len_mask);
unsigned long iov_iter_alignment(const struct iov_iter *i);
unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov,
                        unsigned long nr_segs, size_t count);
void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec,
                        unsigned long nr_segs, size_t count);
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
                        unsigned long nr_segs, size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
                     loff_t start, size_t count);
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
                        size_t maxsize, unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
                        size_t maxsize, size_t *start);
int iov_iter_npages(const struct iov_iter *i, int maxpages);
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);

/* Return the iterator's count field. */
static inline size_t iov_iter_count(const struct iov_iter *i)
{
        const size_t remaining = i->count;

        return remaining;
}

/*
 * Cap the iov_iter at the given limit.  Note that @count is an upper bound,
 * *not* the new size: a value larger than the amount of data already in the
 * iov_iter is fine and simply leaves it unchanged.
 */
static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
{
        /*
         * @count need not fit in size_t: the comparison widens both operands
         * to u64, and any value that the assignment below would truncate is
         * by definition larger than every size_t value, the old i->count
         * included, so the assignment is never reached for it.
         */
        if (i->count <= count)
                return;

        i->count = count;
}

/*
 * Reexpand a previously truncated iterator.  @count is stored directly as
 * the new i->count (an absolute value, not an increment); it must be no more
 * than how much we had before shrinking it.  No validation is done here.
 */
static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
{
        i->count = count;
}

/*
 * Like iov_iter_npages(), but only considers the first @max_bytes of the
 * iterator.  The iterator is temporarily truncated for the page count and
 * then re-expanded by exactly the amount that was cut off, so its count is
 * the same on return as on entry.
 */
static inline int
iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
{
        const size_t total = iov_iter_count(i);
        size_t excess = 0;
        int nr;

        if (total > max_bytes) {
                excess = total - max_bytes;
                iov_iter_truncate(i, max_bytes);
        }

        nr = iov_iter_npages(i, maxpages);

        /* Give back whatever was trimmed above. */
        if (excess)
                iov_iter_reexpand(i, iov_iter_count(i) + excess);

        return nr;
}

/*
 * Helpers that build iterators from user-supplied buffers; implemented out
 * of line.  import_iovec() and __import_iovec() share a signature apart from
 * the trailing @compat flag.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvector,
                unsigned long nr_segs, unsigned long fast_segs,
                struct iovec *fast_iov, bool compat);
ssize_t import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i);
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i, bool compat);
int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);

/*
 * Initialise @i as an ITER_UBUF iterator over the single user buffer @buf
 * with @count as its count.  @direction is recorded as the data source and
 * may only contain READ and/or WRITE bits; anything else trips a WARN_ON().
 * Every field not named below is zeroed by the whole-struct assignment.
 */
static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
                        void __user *buf, size_t count)
{
        struct iov_iter fresh = {
                .iter_type = ITER_UBUF,
                .data_source = direction,
                .ubuf = buf,
                .count = count,
                .nr_segs = 1,
        };

        WARN_ON(direction & ~(READ | WRITE));
        *i = fresh;
}
/* Flags for iov_iter_get/extract_pages*() */
/* Allow P2PDMA on the extracted pages */
#define ITER_ALLOW_P2PDMA        ((__force iov_iter_extraction_t)0x01)

/*
 * Out-of-line page extraction entry point; @extraction_flags takes the
 * iov_iter_extraction_t flags defined above.
 */
ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
                               size_t maxsize, unsigned int maxpages,
                               iov_iter_extraction_t extraction_flags,
                               size_t *offset0);

/**
 * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
 * @iter: The iterator
 *
 * Examine the iterator and indicate by returning true or false as to how, if
 * at all, pages extracted from the iterator will be retained by the extraction
 * function.
 *
 * %true indicates that the pages will have a pin placed in them that the
 * caller must unpin.  This must be done for DMA/async DIO to force fork()
 * to forcibly copy a page for the child (the parent must retain the original
 * page).
 *
 * %false indicates that no measures are taken and that it's up to the caller
 * to retain the pages.
 *
 * Return: the result of user_backed_iter() for @iter.
 */
static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter)
{
        return user_backed_iter(iter);
}

/* Forward declaration: only a pointer to the table is needed below. */
struct sg_table;
/* Out-of-line helper; takes the same extraction flags as iov_iter_extract_pages(). */
ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len,
                           struct sg_table *sgtable, unsigned int sg_max,
                           iov_iter_extraction_t extraction_flags);

#endif






























































































    1 
    1 














    1 













    1 







    1 






    1 





























































































































































































































































































































    1 




    1 
    1 






    1 














































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Red Hat, Inc.
 */

#include <linux/cred.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
#include <linux/security.h>
#include <linux/fs.h>
#include <linux/backing-file.h>
#include "overlayfs.h"

/*
 * One-character tag describing @realinode relative to overlay @inode, used
 * in debug output:
 *   'l' - @realinode is not the overlay inode's upper inode
 *   'u' - it is the upper inode and ovl_has_upperdata() is true
 *   'm' - it is the upper inode but ovl_has_upperdata() is false
 */
static char ovl_whatisit(struct inode *inode, struct inode *realinode)
{
        if (realinode != ovl_inode_upper(inode))
                return 'l';

        return ovl_has_upperdata(inode) ? 'u' : 'm';
}

/*
 * Open the real file at @realpath on behalf of the overlay @file.
 *
 * The permission check and the open itself run between
 * ovl_override_creds() and revert_creds().  O_NOATIME is stripped from the
 * open flags when inode_owner_or_capable() fails for the real inode.
 *
 * Returns the opened real file, or an ERR_PTR() carrying the
 * inode_permission() / backing_file_open() failure.
 */
static struct file *ovl_open_realfile(const struct file *file,
                                      const struct path *realpath)
{
        struct inode *inode = file_inode(file);
        struct inode *realinode = d_inode(realpath->dentry);
        int open_flags = file->f_flags | OVL_OPEN_FLAGS;
        int acc_mode = ACC_MODE(open_flags);
        const struct cred *saved_cred;
        struct mnt_idmap *idmap;
        struct file *realfile;
        int err;

        if (open_flags & O_APPEND)
                acc_mode |= MAY_APPEND;

        saved_cred = ovl_override_creds(inode->i_sb);
        idmap = mnt_idmap(realpath->mnt);

        err = inode_permission(idmap, realinode, MAY_OPEN | acc_mode);
        if (err) {
                realfile = ERR_PTR(err);
                goto out_revert;
        }

        if (!inode_owner_or_capable(idmap, realinode))
                open_flags &= ~O_NOATIME;

        realfile = backing_file_open(&file->f_path, open_flags, realpath,
                                     current_cred());

out_revert:
        revert_creds(saved_cred);

        pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
                 file, file, ovl_whatisit(inode, realinode), file->f_flags,
                 realfile, IS_ERR(realfile) ? 0 : realfile->f_flags);

        return realfile;
}

/* The only f_flags bits that ovl_change_flags() will modify on a file. */
#define OVL_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT)

/*
 * Apply the OVL_SETFL_MASK subset of @flags to @file's f_flags.
 *
 * Returns -EPERM when the change would toggle O_APPEND on an append-only
 * inode, -EINVAL when O_DIRECT is requested but the file lacks
 * FMODE_CAN_ODIRECT, the error from the file's ->check_flags() hook if it
 * has one and it rejects the flags, and 0 on success.  The update of
 * f_flags and f_iocb_flags is done under file->f_lock.
 */
static int ovl_change_flags(struct file *file, unsigned int flags)
{
        struct inode *inode = file_inode(file);
        const unsigned int wanted = flags & OVL_SETFL_MASK;
        int err = 0;

        if (((wanted ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
                return -EPERM;

        if ((wanted & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT))
                return -EINVAL;

        if (file->f_op->check_flags)
                err = file->f_op->check_flags(wanted);
        if (err)
                return err;

        spin_lock(&file->f_lock);
        file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | wanted;
        file->f_iocb_flags = iocb_flags(file);
        spin_unlock(&file->f_lock);

        return 0;
}

/*
 * Fill @real with the real file to use for an operation on overlay @file.
 *
 * With @allow_meta the real path comes from ovl_path_real(); without it the
 * lower data is verified first and ovl_path_realdata() is used.
 *
 * The file cached in file->private_data is handed out with real->flags == 0
 * as long as its inode still matches the current real path.  If it does not
 * (the file has been copied up since open), a fresh real file is opened and
 * real->flags is set to FDPUT_FPUT.  If only the flags drifted, they are
 * re-synchronised onto the cached real file.
 *
 * Returns 0 on success or a negative errno (-EIO when no real dentry).
 */
static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
                               bool allow_meta)
{
        struct dentry *dentry = file_dentry(file);
        struct path realpath;
        int err;

        real->flags = 0;
        real->file = file->private_data;

        if (!allow_meta) {
                /* lazy lookup and verify of lowerdata */
                err = ovl_verify_lowerdata(dentry);
                if (err)
                        return err;

                ovl_path_realdata(dentry, &realpath);
        } else {
                ovl_path_real(dentry, &realpath);
        }

        if (!realpath.dentry)
                return -EIO;

        /* Copied up since we opened it?  Then open the new real file. */
        if (unlikely(file_inode(real->file) != d_inode(realpath.dentry))) {
                real->flags = FDPUT_FPUT;
                real->file = ovl_open_realfile(file, &realpath);

                return PTR_ERR_OR_ZERO(real->file);
        }

        /* Same real file; nothing more to do unless the flags changed. */
        if (likely(!((file->f_flags ^ real->file->f_flags) & ~OVL_OPEN_FLAGS)))
                return 0;

        return ovl_change_flags(real->file, file->f_flags);
}

/*
 * Get the real file for @file into @real.  Directories are resolved through
 * ovl_dir_real_file(); everything else goes through ovl_real_fdget_meta()
 * with allow_meta == false.  Returns 0 or a negative errno.
 */
static int ovl_real_fdget(const struct file *file, struct fd *real)
{
        if (!d_is_dir(file_dentry(file)))
                return ovl_real_fdget_meta(file, real, false);

        real->flags = 0;
        real->file = ovl_dir_real_file(file, false);

        return PTR_ERR_OR_ZERO(real->file);
}

/*
 * ->open() for overlay files: verify lower data, copy up if the open flags
 * require it, then open the real data file and stash it in
 * file->private_data.  Returns 0 or a negative errno.
 */
static int ovl_open(struct inode *inode, struct file *file)
{
        struct dentry *dentry = file_dentry(file);
        struct path realpath;
        struct file *realfile;
        int err;

        /* lazy lookup and verify lowerdata */
        err = ovl_verify_lowerdata(dentry);
        if (!err)
                err = ovl_maybe_copy_up(dentry, file->f_flags);
        if (err)
                return err;

        /* These flags are spent now; keep them away from the underlying fs */
        file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);

        ovl_path_realdata(dentry, &realpath);
        if (!realpath.dentry)
                return -EIO;

        realfile = ovl_open_realfile(file, &realpath);
        if (IS_ERR(realfile))
                return PTR_ERR(realfile);

        file->private_data = realfile;

        return 0;
}

/*
 * ->release(): drop the reference on the real file that ovl_open() stored
 * in file->private_data.  Always returns 0.
 */
static int ovl_release(struct inode *inode, struct file *file)
{
        fput(file->private_data);

        return 0;
}

/*
 * ->llseek(): perform the seek on the real file and mirror the resulting
 * position back into the overlay file.  Returns the new position or a
 * negative errno.
 */
static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file_inode(file);
        const struct cred *saved_cred;
        struct fd real;
        loff_t pos;
        int err;

        /*
         * Two special cases never need the real fs, so handle them here
         * and keep concurrent callers off the inode lock.
         */
        if (offset == 0) {
                switch (whence) {
                case SEEK_CUR:
                        return file->f_pos;
                case SEEK_SET:
                        return vfs_setpos(file, 0, 0);
                default:
                        break;
                }
        }

        err = ovl_real_fdget(file, &real);
        if (err)
                return err;

        /*
         * The overlay file's f_pos is the master copy: it survives copy up
         * and is updated on read/write.  Only the real fs knows how to do
         * SEEK_HOLE/SEEK_DATA though, and it may impose limits stricter
         * than ->s_maxbytes for specific files, so the seek itself is done
         * on the real file.
         */
        ovl_inode_lock(inode);
        real.file->f_pos = file->f_pos;

        saved_cred = ovl_override_creds(inode->i_sb);
        pos = vfs_llseek(real.file, offset, whence);
        revert_creds(saved_cred);

        file->f_pos = real.file->f_pos;
        ovl_inode_unlock(inode);

        fdput(real);

        return pos;
}

/*
 * Refresh the overlay inode's attributes via ovl_copyattr() after the file
 * has been modified.  Used as the ->end_write callback of the backing-file
 * contexts in this file and called directly after fallocate/copy operations.
 */
static void ovl_file_modified(struct file *file)
{
        /* Update size/mtime */
        ovl_copyattr(file_inode(file));
}

static void ovl_file_accessed(struct file *file)
{
        struct inode *inode, *upperinode;
        struct timespec64 ctime, uctime;
        struct timespec64 mtime, umtime;

        if (file->f_flags & O_NOATIME)
                return;

        inode = file_inode(file);
        upperinode = ovl_inode_upper(inode);

        if (!upperinode)
                return;

        ctime = inode_get_ctime(inode);
        uctime = inode_get_ctime(upperinode);
        mtime = inode_get_mtime(inode);
        umtime = inode_get_mtime(upperinode);
        if ((!timespec64_equal(&mtime, &umtime)) ||
             !timespec64_equal(&ctime, &uctime)) {
                inode_set_mtime_to_ts(inode, inode_get_mtime(upperinode));
                inode_set_ctime_to_ts(inode, uctime);
        }

        touch_atime(&file->f_path);
}

/*
 * ->read_iter(): forward the read to the real file through
 * backing_file_read_iter().  An empty iterator returns 0 without touching
 * the real file.
 */
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct file *file = iocb->ki_filp;
        struct backing_file_ctx ctx = {
                .cred = ovl_creds(file_inode(file)->i_sb),
                .user_file = file,
                .accessed = ovl_file_accessed,
        };
        struct fd real;
        ssize_t nread;
        int err;

        if (!iov_iter_count(iter))
                return 0;

        err = ovl_real_fdget(file, &real);
        if (err)
                return err;

        nread = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags,
                                       &ctx);
        fdput(real);

        return nread;
}

/*
 * ->write_iter(): forward the write to the real file through
 * backing_file_write_iter() while holding the overlay inode lock.  An empty
 * iterator returns 0 without taking the lock.
 */
static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        int kflags = iocb->ki_flags;
        struct backing_file_ctx ctx = {
                .cred = ovl_creds(inode->i_sb),
                .user_file = file,
                .end_write = ovl_file_modified,
        };
        struct fd real;
        ssize_t ret;

        if (!iov_iter_count(iter))
                return 0;

        inode_lock(inode);
        /* Update mode */
        ovl_copyattr(inode);

        ret = ovl_real_fdget(file, &real);
        if (ret)
                goto out_unlock;

        /* Drop the sync flags when ovl_should_sync() says not to sync. */
        if (!ovl_should_sync(OVL_FS(inode->i_sb)))
                kflags &= ~(IOCB_DSYNC | IOCB_SYNC);

        /*
         * Overlayfs doesn't support deferred completions, so never pass
         * this property on even if the issuer set it.
         */
        kflags &= ~IOCB_DIO_CALLER_COMP;

        ret = backing_file_write_iter(real.file, iter, iocb, kflags, &ctx);
        fdput(real);

out_unlock:
        inode_unlock(inode);

        return ret;
}

/*
 * ->splice_read(): splice from the real file into @pipe through
 * backing_file_splice_read().
 */
static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
                               struct pipe_inode_info *pipe, size_t len,
                               unsigned int flags)
{
        struct backing_file_ctx ctx = {
                .cred = ovl_creds(file_inode(in)->i_sb),
                .user_file = in,
                .accessed = ovl_file_accessed,
        };
        struct fd real;
        ssize_t spliced;
        int err;

        err = ovl_real_fdget(in, &real);
        if (err)
                return err;

        spliced = backing_file_splice_read(real.file, ppos, pipe, len, flags,
                                           &ctx);
        fdput(real);

        return spliced;
}

/*
 * ->splice_write() for overlay files.
 *
 * iter_file_splice_write() must not be called directly from overlay's f_op:
 * it takes pipe->mutex, while ovl_write_iter() does
 * file_start_write(real.file), and the resulting lock order inversion may
 * deadlock.
 *
 * Instead this does everything ovl_write_iter() does and lets the splice
 * write run against the real file.
 */
static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
                                loff_t *ppos, size_t len, unsigned int flags)
{
        struct inode *inode = file_inode(out);
        struct backing_file_ctx ctx = {
                .cred = ovl_creds(inode->i_sb),
                .user_file = out,
                .end_write = ovl_file_modified,
        };
        struct fd real;
        ssize_t ret;

        inode_lock(inode);
        /* Update mode */
        ovl_copyattr(inode);

        ret = ovl_real_fdget(out, &real);
        if (!ret) {
                ret = backing_file_splice_write(pipe, real.file, ppos, len,
                                                flags, &ctx);
                fdput(real);
        }

        inode_unlock(inode);

        return ret;
}

/*
 * ->fsync(): sync the real file, but only when it is the upper file.
 *
 * A non-positive ovl_sync_status() is returned as is.  For a non-datasync
 * request the real file is looked up with allow_meta set.
 */
static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
        struct inode *inode = file_inode(file);
        const struct cred *saved_cred;
        struct fd real;
        int ret;

        ret = ovl_sync_status(OVL_FS(inode->i_sb));
        if (ret <= 0)
                return ret;

        ret = ovl_real_fdget_meta(file, &real, !datasync);
        if (ret)
                return ret;

        /* Don't sync lower file for fear of receiving EROFS error */
        if (file_inode(real.file) != ovl_inode_upper(inode))
                goto out_put;

        saved_cred = ovl_override_creds(inode->i_sb);
        ret = vfs_fsync_range(real.file, start, end, datasync);
        revert_creds(saved_cred);

out_put:
        fdput(real);

        return ret;
}

/*
 * ->mmap(): hand the mapping request to backing_file_mmap() using the real
 * file cached in file->private_data (no ovl_real_fdget() lookup is done
 * here).
 */
static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct file *realfile = file->private_data;
        struct backing_file_ctx ctx = {
                .cred = ovl_creds(file_inode(file)->i_sb),
                .user_file = file,
                .accessed = ovl_file_accessed,
        };

        return backing_file_mmap(realfile, vma, &ctx);
}

/*
 * ->fallocate(): strip privileges from the overlay file, then run
 * vfs_fallocate() on the real file under the overlay's override
 * credentials, all while holding the overlay inode lock.  The overlay
 * inode's attributes are refreshed afterwards regardless of the
 * vfs_fallocate() result.
 */
static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        const struct cred *saved_cred;
        struct fd real;
        int ret;

        inode_lock(inode);
        /* Update mode */
        ovl_copyattr(inode);

        ret = file_remove_privs(file);
        if (!ret)
                ret = ovl_real_fdget(file, &real);
        if (ret)
                goto out_unlock;

        saved_cred = ovl_override_creds(inode->i_sb);
        ret = vfs_fallocate(real.file, mode, offset, len);
        revert_creds(saved_cred);

        /* Update size */
        ovl_file_modified(file);

        fdput(real);

out_unlock:
        inode_unlock(inode);

        return ret;
}

/*
 * ->fadvise(): pass the advice on to the real file with vfs_fadvise(),
 * under the overlay's override credentials.
 */
static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
        const struct cred *saved_cred;
        struct fd real;
        int err;

        err = ovl_real_fdget(file, &real);
        if (err)
                return err;

        saved_cred = ovl_override_creds(file_inode(file)->i_sb);
        err = vfs_fadvise(real.file, offset, len, advice);
        revert_creds(saved_cred);

        fdput(real);

        return err;
}

/* Which VFS range operation ovl_copyfile() should perform. */
enum ovl_copyop {
        OVL_COPY   = 0,         /* vfs_copy_file_range() */
        OVL_CLONE  = 1,         /* vfs_clone_file_range() */
        OVL_DEDUPE = 2,         /* vfs_dedupe_file_range_one() */
};

/*
 * Common implementation behind ovl_copy_file_range() and
 * ovl_remap_file_range().
 *
 * Holds the output overlay inode's lock for the whole operation.  For
 * everything except dedupe, the output inode's attributes are refreshed and
 * privileges are stripped from the output file first.  The real files for
 * both ends are then looked up and the VFS operation selected by @op is run
 * on them under the overlay's override credentials.
 *
 * Returns whatever the selected VFS helper returns, or a negative errno from
 * one of the preparation steps.
 */
static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
                            struct file *file_out, loff_t pos_out,
                            loff_t len, unsigned int flags, enum ovl_copyop op)
{
        struct inode *inode_out = file_inode(file_out);
        struct fd real_in, real_out;
        const struct cred *old_cred;
        loff_t ret;

        inode_lock(inode_out);
        if (op != OVL_DEDUPE) {
                /* Update mode */
                ovl_copyattr(inode_out);
                ret = file_remove_privs(file_out);
                if (ret)
                        goto out_unlock;
        }

        ret = ovl_real_fdget(file_out, &real_out);
        if (ret)
                goto out_unlock;

        ret = ovl_real_fdget(file_in, &real_in);
        if (ret) {
                /* real_out was obtained successfully above; release it */
                fdput(real_out);
                goto out_unlock;
        }

        /* Credentials are taken from the output file's superblock */
        old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
        switch (op) {
        case OVL_COPY:
                ret = vfs_copy_file_range(real_in.file, pos_in,
                                          real_out.file, pos_out, len, flags);
                break;

        case OVL_CLONE:
                ret = vfs_clone_file_range(real_in.file, pos_in,
                                           real_out.file, pos_out, len, flags);
                break;

        case OVL_DEDUPE:
                ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
                                                real_out.file, pos_out, len,
                                                flags);
                break;
        }
        revert_creds(old_cred);

        /* Update size (done whether or not the operation above succeeded) */
        ovl_file_modified(file_out);

        fdput(real_in);
        fdput(real_out);

out_unlock:
        inode_unlock(inode_out);

        return ret;
}

/* ->copy_file_range(): thin wrapper that runs ovl_copyfile() with OVL_COPY. */
static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   size_t len, unsigned int flags)
{
        return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags,
                            OVL_COPY);
}

/*
 * ->remap_file_range(): map the request to a clone or dedupe operation and
 * run it through ovl_copyfile().
 *
 * Returns -EINVAL for any flag other than REMAP_FILE_DEDUP and
 * REMAP_FILE_ADVISORY, and -EPERM for a dedupe request where either file has
 * no upper inode.
 */
static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags)
{
        enum ovl_copyop op;

        if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
                return -EINVAL;

        op = (remap_flags & REMAP_FILE_DEDUP) ? OVL_DEDUPE : OVL_CLONE;

        /*
         * A dedupe request must never trigger copy up: most of the time
         * that would duplicate data rather than deduplicate it.
         */
        if (op == OVL_DEDUPE) {
                if (!ovl_inode_upper(file_inode(file_in)))
                        return -EPERM;
                if (!ovl_inode_upper(file_inode(file_out)))
                        return -EPERM;
        }

        return ovl_copyfile(file_in, pos_in, file_out, pos_out, len,
                            remap_flags, op);
}

/*
 * ->flush(): call the real file's ->flush() hook, if it has one, under the
 * overlay's override credentials.  Returns 0 when the real file has no
 * ->flush().
 */
static int ovl_flush(struct file *file, fl_owner_t id)
{
        const struct cred *saved_cred;
        struct fd real;
        int err;

        err = ovl_real_fdget(file, &real);
        if (err)
                return err;

        if (!real.file->f_op->flush)
                goto out_put;

        saved_cred = ovl_override_creds(file_inode(file)->i_sb);
        err = real.file->f_op->flush(real.file, id);
        revert_creds(saved_cred);

out_put:
        fdput(real);

        return err;
}

/* File operations table wiring up the ovl_*() handlers defined in this file. */
const struct file_operations ovl_file_operations = {
        /* open/close */
        .open                   = ovl_open,
        .release                = ovl_release,
        .flush                  = ovl_flush,

        /* positioning and data transfer */
        .llseek                 = ovl_llseek,
        .read_iter              = ovl_read_iter,
        .write_iter             = ovl_write_iter,
        .splice_read            = ovl_splice_read,
        .splice_write           = ovl_splice_write,
        .mmap                   = ovl_mmap,

        /* persistence, allocation and hints */
        .fsync                  = ovl_fsync,
        .fallocate              = ovl_fallocate,
        .fadvise                = ovl_fadvise,

        /* file-to-file range operations */
        .copy_file_range        = ovl_copy_file_range,
        .remap_file_range       = ovl_remap_file_range,
};





















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H

#include <linux/refcount.h>
#include <linux/io_uring_types.h>

struct io_wq;

/*
 * Bits used in io_wq_work->flags (see io_wq_is_hashed()), plus the shift at
 * which the hash key is stored.
 */
enum {
        IO_WQ_WORK_CANCEL       = 1 << 0,
        IO_WQ_WORK_HASHED       = 1 << 1,
        IO_WQ_WORK_UNBOUND      = 1 << 2,
        IO_WQ_WORK_CONCURRENT   = 1 << 4,

        /* upper 8 bits are used for hash key */
        IO_WQ_HASH_SHIFT        = 24,
};

/* Result of a cancellation attempt, as returned by io_wq_cancel_cb(). */
enum io_wq_cancel {
        /* cancelled before started */
        IO_WQ_CANCEL_OK         = 0,
        /* found, running, and attempted cancelled */
        IO_WQ_CANCEL_RUNNING    = 1,
        /* work not found */
        IO_WQ_CANCEL_NOTFOUND   = 2,
};

/*
 * Callback function types stored (as pointers) in struct io_wq_data:
 * free_work_fn takes a work item and returns a work item pointer;
 * io_wq_work_fn takes a work item and returns nothing.
 */
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);

/*
 * Reference-counted hash state; released with io_wq_put_hash(), which frees
 * it once the last reference is dropped.
 */
struct io_wq_hash {
        refcount_t refs;                /* lifetime refcount, see io_wq_put_hash() */
        unsigned long map;              /* NOTE(review): presumably a bitmap of busy hash buckets - confirm */
        struct wait_queue_head wait;    /* NOTE(review): presumably waited on for a hash slot - confirm */
};

/* Drop one reference on @hash and kfree() it when that was the last one. */
static inline void io_wq_put_hash(struct io_wq_hash *hash)
{
        if (!refcount_dec_and_test(&hash->refs))
                return;

        kfree(hash);
}

/* Parameters handed to io_wq_create(). */
struct io_wq_data {
        struct io_wq_hash *hash;        /* hash state, see struct io_wq_hash */
        struct task_struct *task;       /* NOTE(review): presumably the task owning the wq - confirm */
        io_wq_work_fn *do_work;         /* callback invoked with a work item */
        free_work_fn *free_work;        /* callback that takes a work item and returns one */
};

/* io_wq lifetime management; implemented out of line. */
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
void io_wq_exit_start(struct io_wq *wq);
void io_wq_put_and_exit(struct io_wq *wq);

/* Work submission and hashing. */
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);

/* Worker configuration and state queries. */
int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
bool io_wq_worker_stopped(void);

/* True when @work has the IO_WQ_WORK_HASHED flag set. */
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
        return (work->flags & IO_WQ_WORK_HASHED) != 0;
}

/*
 * Callback type passed to io_wq_cancel_cb(): receives a work item and an
 * opaque pointer, returns bool.
 * NOTE(review): presumably returns true for work that should be cancelled - confirm.
 */
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);

enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
                                        void *data, bool cancel_all);

/*
 * Worker sleep/run notifications.  With CONFIG_IO_WQ they are provided out
 * of line; without it they compile down to empty inline stubs.
 */
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
#else
static inline void io_wq_worker_sleeping(struct task_struct *tsk)
{
}
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
#endif

static inline bool io_wq_current_is_worker(void)
{
        return in_task() && (current->flags & PF_IO_WORKER) &&
                current->worker_private;
}
#endif





















































































































    1 












































    2 















    2 





    1 






    2 
    1 


    2 

    2 










    2 


























































    2 




    2 

    1 










    1 



























































    1 

















    2 















    2 
    1 



















    1 
    1 


    1 
    1 









    1 
    1 

    2 























    1 






    1 


    1 






    1 

    1 




    1 







    1 




    1 





























































































    1 





    1 


    1 



    1 










    1 






























    1 












    1 























    1 



    1 






    1 








    1 
    1 




    1 




    1 






























































    1 



    1 









    1 














































































    2 


    1 







    2 
































































































































    1 











    1 
    1 




    1 
    1 





    1 





    1 

































    2 












    2 


    1 

    1 













    2 

    1 







    2 
    1 




























    1 
    1 





    1 














































































    1 















    1 


































    2 








    1 
    2 







    1 






































































































































































































































































































































































    1 












    1 
































    1 

    1 
    1 



    1 









    1 





    1 















































    1 






    1 


















    1 




































    1 






    1 

























    1 






    1 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include <trace/events/btrfs.h>
#include "messages.h"
#include "ctree.h"
#include "extent-io-tree.h"
#include "btrfs_inode.h"

/* Slab cache that every extent_state in this file is allocated from and freed to. */
static struct kmem_cache *extent_state_cache;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
        return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
/* Every allocated extent_state, linked through its leak_list member (debug builds only). */
static LIST_HEAD(states);
/* Taken by the add/del helpers below around updates of the states list. */
static DEFINE_SPINLOCK(leak_lock);

/* Start tracking @state on the global leak list. */
static inline void btrfs_leak_debug_add_state(struct extent_state *state)
{
        unsigned long irq_flags;

        /* The list is protected by leak_lock with interrupts disabled. */
        spin_lock_irqsave(&leak_lock, irq_flags);
        list_add(&state->leak_list, &states);
        spin_unlock_irqrestore(&leak_lock, irq_flags);
}

/* Stop tracking @state: unlink it from the global leak list. */
static inline void btrfs_leak_debug_del_state(struct extent_state *state)
{
        unsigned long irq_flags;

        /* The list is protected by leak_lock with interrupts disabled. */
        spin_lock_irqsave(&leak_lock, irq_flags);
        list_del(&state->leak_list);
        spin_unlock_irqrestore(&leak_lock, irq_flags);
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
        struct extent_state *state;

        while (!list_empty(&states)) {
                state = list_entry(states.next, struct extent_state, leak_list);
                pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
                       state->start, state->end, state->state,
                       extent_state_in_tree(state),
                       refcount_read(&state->refs));
                list_del(&state->leak_list);
                WARN_ON_ONCE(1);
                kmem_cache_free(extent_state_cache, state);
        }
}

/*
 * Debug aid for inode io trees: emit a rate limited debug message when the
 * inclusive range end looks suspicious, that is when @end is at least
 * PAGE_SIZE, is an even number and is not the last byte of the file
 * (i_size - 1). Trees with any other owner are ignored.
 *
 * The wrapper macro records the name of the calling function.
 */
#define btrfs_debug_check_extent_io_range(tree, start, end)                \
        __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
                                                       struct extent_io_tree *tree,
                                                       u64 start, u64 end)
{
        const struct btrfs_inode *inode;
        u64 isize;

        if (tree->owner != IO_TREE_INODE_IO)
                return;

        inode = extent_io_tree_to_inode_const(tree);
        isize = i_size_read(&inode->vfs_inode);

        /* Nothing to report unless all three conditions hold. */
        if (end < PAGE_SIZE)
                return;
        if ((end % 2) != 0)
                return;
        if (end == isize - 1)
                return;

        btrfs_debug_rl(inode->root->fs_info,
            "%s: ino %llu isize %llu odd range [%llu,%llu]",
                caller, btrfs_ino(inode), isize, start, end);
}
#else
/* Without CONFIG_BTRFS_DEBUG the leak tracking and range check helpers are no-ops. */
#define btrfs_leak_debug_add_state(state)                do {} while (0)
#define btrfs_leak_debug_del_state(state)                do {} while (0)
#define btrfs_extent_state_leak_debug_check()                do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)        do {} while (0)
#endif


/*
 * Tell whether @tree is owned by an inode (IO_TREE_INODE_IO). This is the
 * only kind of tree that is allowed to set the inode.
 */
static bool is_inode_io_tree(const struct extent_io_tree *tree)
{
        const bool owned_by_inode = (tree->owner == IO_TREE_INODE_IO);

        return owned_by_inode;
}

/* Return the inode of an IO_TREE_INODE_IO tree, NULL for any other owner. */
struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree)
{
        return (tree->owner == IO_TREE_INODE_IO) ? tree->inode : NULL;
}

/*
 * Const variant of extent_io_tree_to_inode(): return the inode of an
 * IO_TREE_INODE_IO tree for read-only use, NULL for any other owner.
 */
const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree)
{
        return (tree->owner == IO_TREE_INODE_IO) ? tree->inode : NULL;
}

/*
 * Return the fs_info of @tree for read-only use. Inode io trees reach it
 * through their inode's root, all other trees use the fs_info member.
 */
const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
{
        if (tree->owner != IO_TREE_INODE_IO)
                return tree->fs_info;

        return tree->inode->root->fs_info;
}

/*
 * Initialize @tree as an empty io tree.
 *
 * @fs_info: filesystem the tree belongs to, stored in the tree
 * @tree:    the tree to initialize
 * @owner:   identifier of the tree's owner (e.g. IO_TREE_INODE_IO)
 */
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
                         struct extent_io_tree *tree, unsigned int owner)
{
        tree->owner = owner;
        tree->fs_info = fs_info;
        tree->state = RB_ROOT;
        spin_lock_init(&tree->lock);
}

/*
 * Empty an io tree, removing and freeing every extent state record from the
 * tree. This should be called once we are sure no other task can access the
 * tree anymore, so no tree updates happen after we empty the tree and there
 * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never
 * set on any extent state when calling this function).
 */
void extent_io_tree_release(struct extent_io_tree *tree)
{
        struct rb_root root;
        struct extent_state *state;
        struct extent_state *tmp;

        spin_lock(&tree->lock);
        /*
         * Move the whole rbtree to a local root and leave the tree itself
         * empty, the records are then torn down from the private copy.
         */
        root = tree->state;
        tree->state = RB_ROOT;
        /*
         * Nodes are released one by one without calling rb_erase(), the local
         * root is simply discarded once the walk is done.
         */
        rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) {
                /*
                 * Clear node to keep free_extent_state() happy, it warns if
                 * the state still looks like it is linked into a tree.
                 */
                RB_CLEAR_NODE(&state->rb_node);
                ASSERT(!(state->state & EXTENT_LOCKED));
                /*
                 * No need for a memory barrier here, as we are holding the tree
                 * lock and we only change the waitqueue while holding that lock
                 * (see wait_extent_bit()).
                 */
                ASSERT(!waitqueue_active(&state->wq));
                free_extent_state(state);
                /* Let other tasks run if needed, the tree lock may be dropped and retaken. */
                cond_resched_lock(&tree->lock);
        }
        /*
         * Should still be empty even after a reschedule, no other task should
         * be accessing the tree anymore.
         */
        ASSERT(RB_EMPTY_ROOT(&tree->state));
        spin_unlock(&tree->lock);
}

/*
 * Allocate a new extent_state from the slab cache.
 *
 * The returned state has no bits set, is not linked into any tree, holds a
 * single reference and has an initialized waitqueue. The start and end
 * members are not initialized here.
 *
 * Return the new state, or NULL if the allocation failed.
 */
static struct extent_state *alloc_extent_state(gfp_t mask)
{
        struct extent_state *new_state;

        /*
         * The caller's mask may carry bits that are not appropriate for the
         * slab allocator, strip the unsupported ones.
         */
        mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);

        new_state = kmem_cache_alloc(extent_state_cache, mask);
        if (new_state == NULL)
                return NULL;

        new_state->state = 0;
        RB_CLEAR_NODE(&new_state->rb_node);
        btrfs_leak_debug_add_state(new_state);
        refcount_set(&new_state->refs, 1);
        init_waitqueue_head(&new_state->wq);
        trace_alloc_extent_state(new_state, mask, _RET_IP_);

        return new_state;
}

/*
 * Return @prealloc if the caller already has a preallocated state, otherwise
 * try a GFP_ATOMIC allocation. May return NULL if that allocation fails.
 */
static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
{
        if (prealloc)
                return prealloc;

        return alloc_extent_state(GFP_ATOMIC);
}

/*
 * Drop one reference on @state and release it back to the slab cache when it
 * was the last one. A NULL @state is accepted and ignored.
 *
 * A state must no longer be linked into a tree when its last reference goes
 * away, otherwise a warning is triggered.
 */
void free_extent_state(struct extent_state *state)
{
        if (!state)
                return;

        /* Other references remain, nothing more to do. */
        if (!refcount_dec_and_test(&state->refs))
                return;

        WARN_ON(extent_state_in_tree(state));
        btrfs_leak_debug_del_state(state);
        trace_free_extent_state(state, _RET_IP_);
        kmem_cache_free(extent_state_cache, state);
}

/*
 * Record the range of @state in @changeset if setting (@set != 0) or clearing
 * (@set == 0) @bits is going to change the state.
 *
 * Nothing is recorded when @changeset is NULL, when all of @bits are already
 * set for a set operation, or when none of @bits are set for a clear
 * operation.
 *
 * Return 0 if nothing was recorded, otherwise the result of ulist_add().
 */
static int add_extent_changeset(struct extent_state *state, u32 bits,
                                 struct extent_changeset *changeset,
                                 int set)
{
        u32 present;

        if (!changeset)
                return 0;

        present = state->state & bits;
        if (set ? (present == bits) : (present == 0))
                return 0;

        changeset->bytes_changed += state->end - state->start + 1;

        return ulist_add(&changeset->range_changed, state->start, state->end,
                         GFP_ATOMIC);
}

/* Return the in-order successor of @state in its rbtree, NULL if there is none. */
static inline struct extent_state *next_state(struct extent_state *state)
{
        struct rb_node *successor = rb_next(&state->rb_node);

        return successor ? rb_entry(successor, struct extent_state, rb_node) : NULL;
}

/* Return the in-order predecessor of @state in its rbtree, NULL if there is none. */
static inline struct extent_state *prev_state(struct extent_state *state)
{
        struct rb_node *predecessor = rb_prev(&state->rb_node);

        return predecessor ? rb_entry(predecessor, struct extent_state, rb_node) : NULL;
}

/*
 * Search @tree for an entry that contains @offset. Such an entry has
 * entry->start <= offset && entry->end >= offset.
 *
 * @tree:       the tree to search
 * @offset:     offset that should fall within an entry in @tree
 * @node_ret:   optional, set to the link where a new node covering @offset
 *              should be anchored (used when inserting an entry in the tree)
 * @parent_ret: optional, set to the node that would be the parent of such a
 *              new node
 *
 * If an entry contains @offset, return it and leave @node_ret and @parent_ret
 * untouched.
 *
 * Otherwise fill @node_ret and @parent_ret (when given) and return the first
 * entry that ends at or after @offset, which is the first entry located after
 * @offset. Return NULL if there is no such entry, e.g. for an empty tree.
 */
static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
                                                          u64 offset,
                                                          struct rb_node ***node_ret,
                                                          struct rb_node **parent_ret)
{
        struct rb_node **link = &tree->state.rb_node;
        struct rb_node *parent = NULL;
        struct extent_state *cur = NULL;

        while (*link) {
                parent = *link;
                cur = rb_entry(parent, struct extent_state, rb_node);

                if (offset >= cur->start && offset <= cur->end)
                        return cur;

                if (offset < cur->start)
                        link = &parent->rb_left;
                else
                        link = &parent->rb_right;
        }

        if (node_ret)
                *node_ret = link;
        if (parent_ret)
                *parent_ret = parent;

        /*
         * No exact hit. Starting from the last node visited, walk forward
         * until an entry that does not end before @offset is found.
         */
        for (; cur; cur = next_state(cur)) {
                if (offset <= cur->end)
                        break;
        }

        return cur;
}

/*
 * Search @offset in the tree or report the neighbouring entries.
 *
 * @tree:      the tree to search
 * @offset:    offset that should fall within an entry in @tree
 * @prev_ret:  set to the closest entry that starts at or before @offset, or
 *             NULL if there is none (only written when nothing is found)
 * @next_ret:  set to the closest entry that ends at or after @offset, or NULL
 *             if there is none (only written when nothing is found)
 *
 * Return the entry that contains @offset, in which case @prev_ret and
 * @next_ret are left untouched. If no entry contains @offset, return NULL
 * and fill both @prev_ret and @next_ret. Both pointers must be non-NULL.
 */
static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
                                                  u64 offset,
                                                  struct extent_state **prev_ret,
                                                  struct extent_state **next_ret)
{
        struct rb_node *cursor = tree->state.rb_node;
        struct extent_state *last = NULL;
        struct extent_state *walk;

        ASSERT(prev_ret);
        ASSERT(next_ret);

        while (cursor) {
                last = rb_entry(cursor, struct extent_state, rb_node);

                if (offset < last->start)
                        cursor = cursor->rb_left;
                else if (offset > last->end)
                        cursor = cursor->rb_right;
                else
                        return last;
        }

        /* Walk forward from the last visited node to find the next entry. */
        walk = last;
        while (walk && offset > walk->end)
                walk = next_state(walk);
        *next_ret = walk;

        /* Walk backward from the same node to find the previous entry. */
        walk = last;
        while (walk && offset < walk->start)
                walk = prev_state(walk);
        *prev_ret = walk;

        return NULL;
}

/*
 * Inexact rb-tree search: return the entry containing @offset, or the first
 * entry located after @offset if none contains it. May return NULL.
 */
static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
{
        struct extent_state *found;

        found = tree_search_for_insert(tree, offset, NULL, NULL);

        return found;
}

/*
 * Report an unrecoverable io tree error through btrfs_panic().
 *
 * @tree:   the tree the failing operation ran on
 * @state:  the extent state involved, its range is included in the message
 * @opname: short name of the failed operation
 * @err:    the error code to report
 */
static void extent_io_tree_panic(const struct extent_io_tree *tree,
                                 const struct extent_state *state,
                                 const char *opname,
                                 int err)
{
        const struct btrfs_fs_info *fs_info = extent_io_tree_to_fs_info(tree);

        btrfs_panic(fs_info, err,
                    "extent io tree error on %s state start %llu end %llu",
                    opname, state->start, state->end);
}

/*
 * Merge the entry preceding @state into @state if it ends right before
 * @state starts and carries exactly the same bits. The merged entry is
 * removed from the tree and its reference dropped.
 */
static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
{
        struct extent_state *before = prev_state(state);

        if (!before)
                return;
        /* Only directly adjacent entries with identical bits can be merged. */
        if (before->end != state->start - 1 || before->state != state->state)
                return;

        if (is_inode_io_tree(tree))
                btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
                                            state, before);

        state->start = before->start;
        rb_erase(&before->rb_node, &tree->state);
        RB_CLEAR_NODE(&before->rb_node);
        free_extent_state(before);
}

/*
 * Merge the entry following @state into @state if it starts right after
 * @state ends and carries exactly the same bits. The merged entry is
 * removed from the tree and its reference dropped.
 */
static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state)
{
        struct extent_state *after = next_state(state);

        if (!after)
                return;
        /* Only directly adjacent entries with identical bits can be merged. */
        if (after->start != state->end + 1 || after->state != state->state)
                return;

        if (is_inode_io_tree(tree))
                btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
                                            state, after);

        state->end = after->end;
        rb_erase(&after->rb_node, &tree->state);
        RB_CLEAR_NODE(&after->rb_node);
        free_extent_state(after);
}

/*
 * Utility function to look for merge candidates inside a given range.  Any
 * extents with matching state are merged together into a single extent in the
 * tree.  Extents with EXTENT_IO in their state field are not merged because
 * the end_io handlers need to be able to do operations on them without
 * sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
{
        if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
                return;

        merge_prev_state(tree, state);
        merge_next_state(tree, state);
}

/*
 * Set @bits, minus the control bits in EXTENT_CTLBITS, on @state.
 *
 * For inode io trees the inode is notified first, with the unfiltered @bits.
 * The range is recorded in @changeset when one is given. A failure to record
 * it is fatal (BUG_ON).
 */
static void set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state,
                           u32 bits, struct extent_changeset *changeset)
{
        const u32 state_bits = bits & ~EXTENT_CTLBITS;
        int err;

        if (is_inode_io_tree(tree))
                btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits);

        /* Must run before the bits are applied, it compares against the old state. */
        err = add_extent_changeset(state, state_bits, changeset, 1);
        BUG_ON(err < 0);

        state->state |= state_bits;
}

/*
 * Insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * Returns a pointer to the struct extent_state record containing the range
 * requested for insertion, which may be the same as the given struct or it
 * may be an existing record in the tree that was expanded to accommodate the
 * requested range. In case of an extent_state different from the one that was
 * given, the latter can be freed or reused by the caller.
 *
 * On error it returns an error pointer.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static struct extent_state *insert_state(struct extent_io_tree *tree,
                                         struct extent_state *state,
                                         u32 bits,
                                         struct extent_changeset *changeset)
{
        struct rb_node **node;
        struct rb_node *parent = NULL;
        /*
         * Offsets right before and right after the new range, compared against
         * existing entries to detect the ones directly adjacent to it.
         */
        const u64 start = state->start - 1;
        const u64 end = state->end + 1;
        /* No merging is attempted when EXTENT_LOCKED or EXTENT_BOUNDARY is requested. */
        const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY));

        set_state_bits(tree, state, bits, changeset);

        /* Descend from the root looking for the insertion point. */
        node = &tree->state.rb_node;
        while (*node) {
                struct extent_state *entry;

                parent = *node;
                entry = rb_entry(parent, struct extent_state, rb_node);

                if (state->end < entry->start) {
                        /*
                         * The new range sits right before this entry and has
                         * the same bits: grow the entry backwards instead of
                         * inserting a new node.
                         */
                        if (try_merge && end == entry->start &&
                            state->state == entry->state) {
                                if (is_inode_io_tree(tree))
                                        btrfs_merge_delalloc_extent(
                                                        extent_io_tree_to_inode(tree),
                                                        state, entry);
                                entry->start = state->start;
                                /* The grown entry may now touch its own predecessor. */
                                merge_prev_state(tree, entry);
                                /* @state was not inserted, reset it so the caller can free or reuse it. */
                                state->state = 0;
                                return entry;
                        }
                        node = &(*node)->rb_left;
                } else if (state->end > entry->end) {
                        /*
                         * The new range sits right after this entry and has
                         * the same bits: grow the entry forwards instead of
                         * inserting a new node.
                         */
                        if (try_merge && entry->end == start &&
                            state->state == entry->state) {
                                if (is_inode_io_tree(tree))
                                        btrfs_merge_delalloc_extent(
                                                        extent_io_tree_to_inode(tree),
                                                        state, entry);
                                entry->end = state->end;
                                /* The grown entry may now touch its own successor. */
                                merge_next_state(tree, entry);
                                /* @state was not inserted, reset it so the caller can free or reuse it. */
                                state->state = 0;
                                return entry;
                        }
                        node = &(*node)->rb_right;
                } else {
                        /* The end of the new range falls inside an existing entry. */
                        return ERR_PTR(-EEXIST);
                }
        }

        /* Reached an empty link, insert @state there as a new node. */
        rb_link_node(&state->rb_node, parent, node);
        rb_insert_color(&state->rb_node, &tree->state);

        return state;
}

/*
 * Insert state to @tree to the location given by @node and @parent.
 */
static void insert_state_fast(struct extent_io_tree *tree,
                              struct extent_state *state, struct rb_node **node,
                              struct rb_node *parent, unsigned bits,
                              struct extent_changeset *changeset)
{
        /* Apply the bits first so that the merge below compares the final state. */
        set_state_bits(tree, state, bits, changeset);
        /* No tree descent here, the caller provides the link and parent to use. */
        rb_link_node(&state->rb_node, parent, node);
        rb_insert_color(&state->rb_node, &tree->state);
        /* Try to merge the new entry with its neighbours. */
        merge_state(tree, state);
}

/*
 * Split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
                       struct extent_state *prealloc, u64 split)
{
        struct rb_node *parent = NULL;
        struct rb_node **node;

        if (is_inode_io_tree(tree))
                btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig,
                                            split);

        /* @prealloc takes over the first half and inherits the bits of @orig. */
        prealloc->start = orig->start;
        prealloc->end = split - 1;
        prealloc->state = orig->state;
        /* @orig stays in place in the tree and shrinks to the second half. */
        orig->start = split;

        /*
         * Start the descent at @orig's own node rather than at the tree root,
         * the first entry examined by the loop below is @orig itself.
         */
        parent = &orig->rb_node;
        node = &parent;
        while (*node) {
                struct extent_state *entry;

                parent = *node;
                entry = rb_entry(parent, struct extent_state, rb_node);

                if (prealloc->end < entry->start) {
                        node = &(*node)->rb_left;
                } else if (prealloc->end > entry->end) {
                        node = &(*node)->rb_right;
                } else {
                        /*
                         * The end of the first half falls inside an existing
                         * entry. @prealloc is released here, the caller must
                         * not use it again.
                         */
                        free_extent_state(prealloc);
                        return -EEXIST;
                }
        }

        rb_link_node(&prealloc->rb_node, parent, node);
        rb_insert_color(&prealloc->rb_node, &tree->state);

        return 0;
}

/*
 * Utility function to clear some bits in an extent state struct.  It will
 * optionally wake up anyone waiting on this state (wake == 1).
 *
 * The control bits in EXTENT_CTLBITS are filtered out of @bits before they
 * are applied to the state. The range is recorded in @changeset when one is
 * given, a failure to record it is fatal (BUG_ON).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree. Otherwise a merge with its
 * neighbours is attempted.
 *
 * Returns the entry that follows @state in the tree once the operation is
 * done, or NULL if there is none.
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                                            struct extent_state *state,
                                            u32 bits, int wake,
                                            struct extent_changeset *changeset)
{
        struct extent_state *next;
        u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
        int ret;

        /* The inode is notified with the unfiltered bits. */
        if (is_inode_io_tree(tree))
                btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state,
                                            bits);

        /* Must run before the bits are cleared, it compares against the old state. */
        ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
        BUG_ON(ret < 0);
        state->state &= ~bits_to_clear;
        if (wake)
                wake_up(&state->wq);
        if (state->state == 0) {
                /* Look up the successor before @state is erased from the tree. */
                next = next_state(state);
                if (extent_state_in_tree(state)) {
                        rb_erase(&state->rb_node, &tree->state);
                        RB_CLEAR_NODE(&state->rb_node);
                        free_extent_state(state);
                } else {
                        /* A state handed to this function is expected to be in the tree. */
                        WARN_ON(1);
                }
        } else {
                /* Merging may extend @state, so look up the successor afterwards. */
                merge_state(tree, state);
                next = next_state(state);
        }
        return next;
}

/*
 * Detect if extent bits request NOWAIT semantics and set the gfp mask
 * accordingly: GFP_NOWAIT if EXTENT_NOWAIT is set in @bits, GFP_NOFS otherwise.
 *
 * On return only the bits below EXTENT_NOWAIT are kept in @bits, so
 * EXTENT_NOWAIT itself and anything above it is dropped.
 */
static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
{
        if (*bits & EXTENT_NOWAIT)
                *mask = GFP_NOWAIT;
        else
                *mask = GFP_NOFS;

        *bits &= EXTENT_NOWAIT - 1;
}

/*
 * Clear some bits on a range in the tree.  This may require splitting or
 * inserting elements in the tree, so a gfp mask derived from @bits (see
 * set_gfp_mask_from_bits()) is used to indicate which allocations or sleeping
 * are allowed.
 *
 * @tree:         the tree to operate on
 * @start:        first byte of the range
 * @end:          last byte of the range, [start, end] is inclusive
 * @bits:         the bits to clear. EXTENT_LOCKED additionally kicks any
 *                sleepers, EXTENT_CLEAR_ALL_BITS clears every non-control bit
 *                of the range regardless of state (ie for truncate), and
 *                EXTENT_DELALLOC implies EXTENT_NORESERVE.
 * @cached_state: optional. If it points to a state that is in the tree and
 *                covers @start, the operation begins there without searching
 *                the tree. When EXTENT_LOCKED or EXTENT_BOUNDARY is among
 *                @bits, *cached_state is reset to NULL and its reference is
 *                dropped.
 * @changeset:    optional, handed down to clear_state_bit() to record the
 *                ranges that changed
 *
 * This takes the tree lock. Every return path yields 0, a failed split is
 * reported through extent_io_tree_panic() instead of an error code.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                       u32 bits, struct extent_state **cached_state,
                       struct extent_changeset *changeset)
{
        struct extent_state *state;
        struct extent_state *cached;
        struct extent_state *prealloc = NULL;
        u64 last_end;
        int err;
        int clear = 0;
        int wake;
        int delete = (bits & EXTENT_CLEAR_ALL_BITS);
        gfp_t mask;

        set_gfp_mask_from_bits(&bits, &mask);
        btrfs_debug_check_extent_io_range(tree, start, end);
        trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

        /* Deleting the range means clearing every bit that is not a control bit. */
        if (delete)
                bits |= ~EXTENT_CTLBITS;

        if (bits & EXTENT_DELALLOC)
                bits |= EXTENT_NORESERVE;

        /* Waiters are only woken up when the locked bit is being cleared. */
        wake = (bits & EXTENT_LOCKED) ? 1 : 0;
        /* In this case the caller's cached state is dropped as well. */
        if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
                clear = 1;
again:
        if (!prealloc) {
                /*
                 * Don't care for allocation failure here because we might end
                 * up not needing the pre-allocated extent state at all, which
                 * is the case if the extent states in the tree that cover our
                 * input range don't cover any other range as well.
                 * If we end up needing a new extent state we allocate it later.
                 */
                prealloc = alloc_extent_state(mask);
        }

        spin_lock(&tree->lock);
        if (cached_state) {
                cached = *cached_state;

                /*
                 * Reset the caller's pointer now, cached_state itself is set
                 * to NULL so that this block is skipped on a later retry.
                 */
                if (clear) {
                        *cached_state = NULL;
                        cached_state = NULL;
                }

                if (cached && extent_state_in_tree(cached) &&
                    cached->start <= start && cached->end > start) {
                        /*
                         * Give up the reference taken for the cached pointer.
                         * A plain decrement is used, presumably because the
                         * state being in the tree means this cannot be the
                         * last reference - TODO confirm.
                         */
                        if (clear)
                                refcount_dec(&cached->refs);
                        state = cached;
                        goto hit_next;
                }
                /* The cached state is of no use, only drop its reference. */
                if (clear)
                        free_extent_state(cached);
        }

        /* This search will find the extents that end after our range starts. */
        state = tree_search(tree, start);
        if (!state)
                goto out;
hit_next:
        if (state->start > end)
                goto out;
        WARN_ON(state->end < start);
        last_end = state->end;

        /* The state doesn't have the wanted bits, go ahead. */
        if (!(state->state & bits)) {
                state = next_state(state);
                goto next;
        }

        /*
         *     | ---- desired range ---- |
         *  | state | or
         *  | ------------- state -------------- |
         *
         * We need to split the extent we found, and may flip bits on second
         * half.
         *
         * If the extent we found extends past our range, we just split and
         * search again.  It'll get split again the next time though.
         *
         * If the extent we found is inside our range, we clear the desired bit
         * on it.
         */

        if (state->start < start) {
                prealloc = alloc_extent_state_atomic(prealloc);
                /* No memory right now, retry after dropping the lock. */
                if (!prealloc)
                        goto search_again;
                err = split_state(tree, state, prealloc, start);
                if (err)
                        extent_io_tree_panic(tree, state, "split", err);

                /*
                 * prealloc is either linked into the tree now or was freed by
                 * split_state() on failure, it must not be used again.
                 */
                prealloc = NULL;
                if (err)
                        goto out;
                if (state->end <= end) {
                        state = clear_state_bit(tree, state, bits, wake, changeset);
                        goto next;
                }
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *                        | state |
         * We need to split the extent, and clear the bit on the first half.
         */
        if (state->start <= end && state->end > end) {
                prealloc = alloc_extent_state_atomic(prealloc);
                /* No memory right now, retry after dropping the lock. */
                if (!prealloc)
                        goto search_again;
                err = split_state(tree, state, prealloc, end + 1);
                if (err)
                        extent_io_tree_panic(tree, state, "split", err);

                if (wake)
                        wake_up(&state->wq);

                /* After the split prealloc is the first half, the part inside our range. */
                clear_state_bit(tree, prealloc, bits, wake, changeset);

                prealloc = NULL;
                goto out;
        }

        /* The state lies entirely within our range. */
        state = clear_state_bit(tree, state, bits, wake, changeset);
next:
        /* The range reached the maximum offset, last_end + 1 would wrap around. */
        if (last_end == (u64)-1)
                goto out;
        start = last_end + 1;
        /* Keep going with the next state while holding the lock if we can. */
        if (start <= end && state && !need_resched())
                goto hit_next;

search_again:
        if (start > end)
                goto out;
        /* Drop the lock and start over with a fresh search from the new start. */
        spin_unlock(&tree->lock);
        if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;

out:
        spin_unlock(&tree->lock);
        /* The preallocated state ended up not being needed. */
        if (prealloc)
                free_extent_state(prealloc);

        return 0;

}

/*
 * Wait for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function.
 *
 * @tree:         the io tree to wait on
 * @start:        first byte of the range
 * @end:          last byte of the range (inclusive)
 * @bits:         we sleep for as long as a state intersecting the range has
 *                any of these bits set
 * @cached_state: optional; if it points to a state, that state is tried as
 *                the starting point of the walk, and before returning the
 *                reference held through it is dropped and the pointer is set
 *                to NULL
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                            u32 bits, struct extent_state **cached_state)
{
        struct extent_state *state;

        btrfs_debug_check_extent_io_range(tree, start, end);

        spin_lock(&tree->lock);
again:
        /*
         * Maintain cached_state, as we may not remove it from the tree if there
         * are more bits than the bits we're waiting on set on this state.
         */
        if (cached_state && *cached_state) {
                state = *cached_state;
                /*
                 * Only trust the cached state if it is still linked in the
                 * tree and @start lies inside it (strictly before its end);
                 * otherwise fall back to a regular tree search below.
                 */
                if (extent_state_in_tree(state) &&
                    state->start <= start && start < state->end)
                        goto process_node;
        }
        while (1) {
                /*
                 * This search will find all the extents that end after our
                 * range starts.
                 */
                state = tree_search(tree, start);
process_node:
                /* Nothing left at or after @start: we are done. */
                if (!state)
                        break;
                /* The next state begins past the range: we are done. */
                if (state->start > end)
                        goto out;

                if (state->state & bits) {
                        DEFINE_WAIT(wait);

                        /* After waking up, re-check starting at this state. */
                        start = state->start;
                        /*
                         * Hold a reference across the sleep: the tree lock is
                         * dropped while we wait on the state's wait queue, and
                         * the reference is released once the lock is retaken.
                         */
                        refcount_inc(&state->refs);
                        prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
                        spin_unlock(&tree->lock);
                        schedule();
                        spin_lock(&tree->lock);
                        finish_wait(&state->wq, &wait);
                        free_extent_state(state);
                        goto again;
                }
                /* This state is clean, continue right after it. */
                start = state->end + 1;

                if (start > end)
                        break;

                /*
                 * A zero return lets us keep walking from the current state;
                 * otherwise we loop and search the tree again from @start.
                 * NOTE(review): presumably non-zero means the lock was
                 * dropped and the tree may have changed - confirm against
                 * cond_resched_lock().
                 */
                if (!cond_resched_lock(&tree->lock)) {
                        state = next_state(state);
                        goto process_node;
                }
        }
out:
        /* This state is no longer useful, clear it and free it up. */
        if (cached_state && *cached_state) {
                state = *cached_state;
                *cached_state = NULL;
                free_extent_state(state);
        }
        spin_unlock(&tree->lock);
}

/*
 * Remember @state in *@cached_ptr and take a reference on it.
 *
 * Nothing happens when @cached_ptr is NULL, when the slot already holds a
 * state, or when @flags is non-zero and @state has none of those flags set.
 */
static void cache_state_if_flags(struct extent_state *state,
                                 struct extent_state **cached_ptr,
                                 unsigned flags)
{
        /* No slot to fill, or the slot is already occupied. */
        if (!cached_ptr || *cached_ptr)
                return;

        /* A filter was given and the state matches none of its bits. */
        if (flags && !(state->state & flags))
                return;

        *cached_ptr = state;
        refcount_inc(&state->refs);
}

/*
 * Cache @state in *@cached_ptr, but only if the state has EXTENT_LOCKED or
 * EXTENT_BOUNDARY set.  The slot rules are those of cache_state_if_flags().
 */
static void cache_state(struct extent_state *state,
                        struct extent_state **cached_ptr)
{
        /*
         * Plain call, no 'return': a return statement with an expression is
         * not allowed in a function returning void.
         */
        cache_state_if_flags(state, cached_ptr,
                             EXTENT_LOCKED | EXTENT_BOUNDARY);
}

/*
 * Return the first extent state that ends at or after @start and has at least
 * one of @bits set.  NULL is returned if no such state exists.
 *
 * tree->lock must be held by the caller.
 */
static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
                                                        u64 start, u32 bits)
{
        struct extent_state *cur;

        /*
         * The search positions us on the extents that end after our range
         * starts; from there we simply walk forward.
         */
        for (cur = tree_search(tree, start); cur; cur = next_state(cur)) {
                if (cur->end < start)
                        continue;
                if (cur->state & bits)
                        return cur;
        }

        return NULL;
}

/*
 * Find the first offset in the io tree with one or more @bits set.
 *
 * Note: If there are multiple bits set in @bits, any of them will match.
 *
 * If @cached_state points to a state that is still in the tree and ends
 * exactly at @start - 1, the walk resumes right after it instead of searching
 * the tree.  Whatever was cached on entry is released; on success the state
 * that was found is cached in its place.
 *
 * Return true if we find something, and update @start_ret and @end_ret.
 * Return false if we found nothing.
 */
bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
                           u64 *start_ret, u64 *end_ret, u32 bits,
                           struct extent_state **cached_state)
{
        struct extent_state *cached;
        struct extent_state *match = NULL;
        bool resumed_from_cache = false;
        bool ret = false;

        spin_lock(&tree->lock);

        cached = cached_state ? *cached_state : NULL;
        if (cached) {
                if (cached->end == start - 1 && extent_state_in_tree(cached)) {
                        /* Continue from the state right after the cached one. */
                        resumed_from_cache = true;
                        for (match = next_state(cached); match;
                             match = next_state(match)) {
                                if (match->state & bits)
                                        break;
                        }
                }
                /*
                 * The cached state is dropped in every case: either we found
                 * a later state that will be cached below, or it turned out
                 * to be useless for this lookup.
                 */
                free_extent_state(cached);
                *cached_state = NULL;
        }

        if (!resumed_from_cache)
                match = find_first_extent_bit_state(tree, start, bits);

        if (match) {
                cache_state_if_flags(match, cached_state, 0);
                *start_ret = match->start;
                *end_ret = match->end;
                ret = true;
        }

        spin_unlock(&tree->lock);
        return ret;
}

/*
 * Find a contiguous area of bits
 *
 * @tree:      io tree to check
 * @start:     offset to start the search from
 * @start_ret: the first offset we found with the bits set
 * @end_ret:   the final contiguous range of the bits that were set
 * @bits:      bits to look for
 *
 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
 * to set bits appropriately, and then merge them again.  During this time it
 * will drop the tree->lock, so use this helper if you want to find the actual
 * contiguous area for given bits.  We search for the first state with the
 * bits set and then keep extending the end for as long as the following state
 * starts no later than one byte past the current end.  Only adjacency is
 * checked for the following states, their bits are not looked at again.
 *
 * Returns 0 if an area was found, 1 if no state with @bits exists at or
 * after @start.
 */
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
                               u64 *start_ret, u64 *end_ret, u32 bits)
{
        struct extent_state *cur;
        int ret;

        ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES));

        spin_lock(&tree->lock);

        cur = find_first_extent_bit_state(tree, start, bits);
        if (!cur) {
                ret = 1;
                goto unlock;
        }

        *start_ret = cur->start;
        *end_ret = cur->end;

        /* Grow the area until the first gap. */
        for (cur = next_state(cur); cur; cur = next_state(cur)) {
                if (cur->start > (*end_ret + 1))
                        break;
                *end_ret = cur->end;
        }
        ret = 0;

unlock:
        spin_unlock(&tree->lock);
        return ret;
}

/*
 * Find a contiguous range of bytes in the file marked as delalloc, not more
 * than 'max_bytes'.  @start and @end are used to return the range.
 *
 * On success the first state of the range is stored in *@cached_state with an
 * extra reference taken on it.
 *
 * When nothing is found, *@end is still written: (u64)-1 if the tree has no
 * state ending after *@start, otherwise the end of the first (non-delalloc)
 * state that was looked at.
 *
 * True is returned if we find something, false if nothing was in the tree.
 */
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
                               u64 *end, u64 max_bytes,
                               struct extent_state **cached_state)
{
        struct extent_state *cur;
        u64 expected_start = *start;
        u64 accumulated = 0;
        bool found = false;

        spin_lock(&tree->lock);

        /* Locate the extents that end after our range starts. */
        cur = tree_search(tree, expected_start);
        if (!cur) {
                *end = (u64)-1;
                goto unlock;
        }

        for (; cur; cur = next_state(cur)) {
                if (found) {
                        /* A gap or a boundary terminates the range. */
                        if (cur->start != expected_start ||
                            (cur->state & EXTENT_BOUNDARY))
                                break;
                        /* So does a state that is not delalloc. */
                        if (!(cur->state & EXTENT_DELALLOC))
                                break;
                } else {
                        if (!(cur->state & EXTENT_DELALLOC)) {
                                *end = cur->end;
                                break;
                        }
                        /* First delalloc state: the range begins here. */
                        *start = cur->start;
                        *cached_state = cur;
                        refcount_inc(&cur->refs);
                        found = true;
                }

                *end = cur->end;
                expected_start = cur->end + 1;
                accumulated += cur->end - cur->start + 1;
                if (accumulated >= max_bytes)
                        break;
        }

unlock:
        spin_unlock(&tree->lock);
        return found;
}

/*
 * Set some bits on a range in the tree.  This may require allocations or
 * sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for
 * GFP_NOWAIT.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The extent_state of the
 * existing range is returned in failed_state in this case, and the start of the
 * existing range is returned in failed_start.  failed_state is used as an
 * optimization for wait_extent_bit, failed_start must be used as the source of
 * truth as failed_state may have changed since we returned.
 *
 * @tree:         the io tree to modify
 * @start:        first byte of the range
 * @end:          last byte of the range (inclusive)
 * @bits:         bits to set; EXTENT_LOCKED is the exclusive bit
 * @failed_start: must be non-NULL when EXTENT_LOCKED is requested and NULL
 *                otherwise (asserted)
 * @failed_state: optional with EXTENT_LOCKED, must be NULL otherwise
 * @cached_state: optional cache slot, used as a search hint and filled via
 *                cache_state()
 * @changeset:    passed through to the helpers that set bits and insert states
 *
 * [start, end] is inclusive. This takes the tree lock.
 *
 * Returns 0 on success, -EEXIST as described above, or the error reported by
 * split_state() / insert_state() if the tree could not be updated.
 */
static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                            u32 bits, u64 *failed_start,
                            struct extent_state **failed_state,
                            struct extent_state **cached_state,
                            struct extent_changeset *changeset)
{
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
        struct rb_node **p = NULL;
        struct rb_node *parent = NULL;
        int ret = 0;
        u64 last_start;
        u64 last_end;
        u32 exclusive_bits = (bits & EXTENT_LOCKED);
        gfp_t mask;

        set_gfp_mask_from_bits(&bits, &mask);
        btrfs_debug_check_extent_io_range(tree, start, end);
        trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

        if (exclusive_bits)
                ASSERT(failed_start);
        else
                ASSERT(failed_start == NULL && failed_state == NULL);
again:
        if (!prealloc) {
                /*
                 * Don't care for allocation failure here because we might end
                 * up not needing the pre-allocated extent state at all, which
                 * is the case if we only have in the tree extent states that
                 * cover our input range and don't cover too any other range.
                 * If we end up needing a new extent state we allocate it later.
                 */
                prealloc = alloc_extent_state(mask);
        }

        spin_lock(&tree->lock);
        if (cached_state && *cached_state) {
                state = *cached_state;
                /* Use the hint only if it is in the tree and covers @start. */
                if (state->start <= start && state->end > start &&
                    extent_state_in_tree(state))
                        goto hit_next;
        }
        /*
         * This search will find all the extents that end after our range
         * starts.
         */
        state = tree_search_for_insert(tree, start, &p, &parent);
        if (!state) {
                /* Nothing at or after @start: insert the whole range at once. */
                prealloc = alloc_extent_state_atomic(prealloc);
                if (!prealloc)
                        goto search_again;
                prealloc->start = start;
                prealloc->end = end;
                insert_state_fast(tree, prealloc, p, parent, bits, changeset);
                cache_state(prealloc, cached_state);
                prealloc = NULL;
                goto out;
        }
hit_next:
        last_start = state->start;
        last_end = state->end;

        /*
         * | ---- desired range ---- |
         * | state |
         *
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
                if (state->state & exclusive_bits) {
                        *failed_start = state->start;
                        cache_state(state, failed_state);
                        ret = -EEXIST;
                        goto out;
                }

                set_state_bits(tree, state, bits, changeset);
                cache_state(state, cached_state);
                merge_state(tree, state);
                /* Avoid wrapping @start around to 0. */
                if (last_end == (u64)-1)
                        goto out;
                start = last_end + 1;
                state = next_state(state);
                if (start < end && state && state->start == start &&
                    !need_resched())
                        goto hit_next;
                goto search_again;
        }

        /*
         *     | ---- desired range ---- |
         * | state |
         *   or
         * | ------------- state -------------- |
         *
         * We need to split the extent we found, and may flip bits on second
         * half.
         *
         * If the extent we found extends past our range, we just split and
         * search again.  It'll get split again the next time though.
         *
         * If the extent we found is inside our range, we set the desired bit
         * on it.
         */
        if (state->start < start) {
                if (state->state & exclusive_bits) {
                        *failed_start = start;
                        cache_state(state, failed_state);
                        ret = -EEXIST;
                        goto out;
                }

                /*
                 * If this extent already has all the bits we want set, then
                 * skip it, not necessary to split it or do anything with it.
                 */
                if ((state->state & bits) == bits) {
                        start = state->end + 1;
                        cache_state(state, cached_state);
                        goto search_again;
                }

                prealloc = alloc_extent_state_atomic(prealloc);
                if (!prealloc)
                        goto search_again;
                ret = split_state(tree, state, prealloc, start);
                if (ret)
                        extent_io_tree_panic(tree, state, "split", ret);

                prealloc = NULL;
                if (ret)
                        goto out;
                if (state->end <= end) {
                        set_state_bits(tree, state, bits, changeset);
                        cache_state(state, cached_state);
                        merge_state(tree, state);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
                        state = next_state(state);
                        if (start < end && state && state->start == start &&
                            !need_resched())
                                goto hit_next;
                }
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *     | state | or               | state |
         *
         * There's a hole, we need to insert something in it and ignore the
         * extent we found.
         */
        if (state->start > start) {
                u64 this_end;
                struct extent_state *inserted_state;

                if (end < last_start)
                        this_end = end;
                else
                        this_end = last_start - 1;

                prealloc = alloc_extent_state_atomic(prealloc);
                if (!prealloc)
                        goto search_again;

                /*
                 * Avoid to free 'prealloc' if it can be merged with the later
                 * extent.
                 */
                prealloc->start = start;
                prealloc->end = this_end;
                inserted_state = insert_state(tree, prealloc, bits, changeset);
                if (IS_ERR(inserted_state)) {
                        ret = PTR_ERR(inserted_state);
                        extent_io_tree_panic(tree, prealloc, "insert", ret);
                        /*
                         * Never hand an error pointer to cache_state(), which
                         * would dereference it.  'prealloc' was not inserted
                         * and is released at 'out'.
                         */
                        goto out;
                }

                cache_state(inserted_state, cached_state);
                if (inserted_state == prealloc)
                        prealloc = NULL;
                start = this_end + 1;
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *                        | state |
         *
         * We need to split the extent, and set the bit on the first half
         */
        if (state->start <= end && state->end > end) {
                if (state->state & exclusive_bits) {
                        *failed_start = start;
                        cache_state(state, failed_state);
                        ret = -EEXIST;
                        goto out;
                }

                prealloc = alloc_extent_state_atomic(prealloc);
                if (!prealloc)
                        goto search_again;
                ret = split_state(tree, state, prealloc, end + 1);
                if (ret) {
                        extent_io_tree_panic(tree, state, "split", ret);
                        /*
                         * Same handling as the failed split above: do not
                         * touch 'prealloc' any more and report the error.
                         */
                        prealloc = NULL;
                        goto out;
                }

                set_state_bits(tree, prealloc, bits, changeset);
                cache_state(prealloc, cached_state);
                merge_state(tree, prealloc);
                prealloc = NULL;
                goto out;
        }

search_again:
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
        if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;

out:
        spin_unlock(&tree->lock);
        if (prealloc)
                free_extent_state(prealloc);

        return ret;
}

/*
 * Set @bits on the inclusive range [@start, @end] of @tree.
 *
 * Thin wrapper around __set_extent_bit() that passes NULL for failed_start,
 * failed_state and changeset.  Returns whatever __set_extent_bit() returns.
 */
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   u32 bits, struct extent_state **cached_state)
{
        int ret;

        ret = __set_extent_bit(tree, start, end, bits, NULL, NULL,
                               cached_state, NULL);
        return ret;
}

/*
 * Convert all bits in a given range from one bit to another
 *
 * @tree:         the io tree to search
 * @start:        the start offset in bytes
 * @end:          the end offset in bytes (inclusive)
 * @bits:         the bits to set in this range
 * @clear_bits:   the bits to clear in this range
 * @cached_state: state that we're going to cache
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie.
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 *
 * Returns 0 on success, -ENOMEM if an extent state could not be allocated, or
 * the error reported by split_state() / insert_state() if the tree could not
 * be updated.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                       u32 bits, u32 clear_bits,
                       struct extent_state **cached_state)
{
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
        struct rb_node **p = NULL;
        struct rb_node *parent = NULL;
        int ret = 0;
        u64 last_start;
        u64 last_end;
        bool first_iteration = true;

        btrfs_debug_check_extent_io_range(tree, start, end);
        trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
                                       clear_bits);

again:
        if (!prealloc) {
                /*
                 * Best effort, don't worry if extent state allocation fails
                 * here for the first iteration. We might have a cached state
                 * that matches exactly the target range, in which case no
                 * extent state allocations are needed. We'll only know this
                 * after locking the tree.
                 */
                prealloc = alloc_extent_state(GFP_NOFS);
                if (!prealloc && !first_iteration)
                        return -ENOMEM;
        }

        spin_lock(&tree->lock);
        if (cached_state && *cached_state) {
                state = *cached_state;
                /* Use the hint only if it is in the tree and covers @start. */
                if (state->start <= start && state->end > start &&
                    extent_state_in_tree(state))
                        goto hit_next;
        }

        /*
         * This search will find all the extents that end after our range
         * starts.
         */
        state = tree_search_for_insert(tree, start, &p, &parent);
        if (!state) {
                /* Nothing at or after @start: insert the whole range at once. */
                prealloc = alloc_extent_state_atomic(prealloc);
                if (!prealloc) {
                        ret = -ENOMEM;
                        goto out;
                }
                prealloc->start = start;
                prealloc->end = end;
                insert_state_fast(tree, prealloc, p, parent, bits, NULL);
                cache_state(prealloc, cached_state);
                prealloc = NULL;
                goto out;
        }
hit_next:
        last_start = state->start;
        last_end = state->end;

        /*
         * | ---- desired range ---- |
         * | state |
         *
         * Just lock what we found and keep going.
         */
        if (state->start == start && state->end <= end) {
                set_state_bits(tree, state, bits, NULL);
                cache_state(state, cached_state);
                state = clear_state_bit(tree, state, clear_bits, 0, NULL);
                /* Avoid wrapping @start around to 0. */
                if (last_end == (u64)-1)
                        goto out;
                start = last_end + 1;
                if (start < end && state && state->start == start &&
                    !need_resched())
                        goto hit_next;
                goto search_again;
        }

        /*
         *     | ---- desired range ---- |
         * | state |
         *   or
         * | ------------- state -------------- |
         *
         * We need to split the extent we found, and may flip bits on second
         * half.
         *
         * If the extent we found extends past our range, we just split and
         * search again.  It'll get split again the next time though.
         *
         * If the extent we found is inside our range, we set the desired bit
         * on it.
         */
        if (state->start < start) {
                prealloc = alloc_extent_state_atomic(prealloc);
                if (!prealloc) {
                        ret = -ENOMEM;
                        goto out;
                }
                ret = split_state(tree, state, prealloc, start);
                if (ret)
                        extent_io_tree_panic(tree, state, "split", ret);
                prealloc = NULL;
                if (ret)
                        goto out;
                if (state->end <= end) {
                        set_state_bits(tree, state, bits, NULL);
                        cache_state(state, cached_state);
                        state = clear_state_bit(tree, state, clear_bits, 0, NULL);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
                        if (start < end && state && state->start == start &&
                            !need_resched())
                                goto hit_next;
                }
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *     | state | or               | state |
         *
         * There's a hole, we need to insert something in it and ignore the
         * extent we found.
         */
        if (state->start > start) {
                u64 this_end;
                struct extent_state *inserted_state;

                if (end < last_start)
                        this_end = end;
                else
                        this_end = last_start - 1;

                prealloc = alloc_extent_state_atomic(prealloc);
                if (!prealloc) {
                        ret = -ENOMEM;
                        goto out;
                }

                /*
                 * Avoid to free 'prealloc' if it can be merged with the later
                 * extent.
                 */
                prealloc->start = start;
                prealloc->end = this_end;
                inserted_state = insert_state(tree, prealloc, bits, NULL);
                if (IS_ERR(inserted_state)) {
                        ret = PTR_ERR(inserted_state);
                        extent_io_tree_panic(tree, prealloc, "insert", ret);
                        /*
                         * Never hand an error pointer to cache_state(), which
                         * would dereference it.  'prealloc' was not inserted
                         * and is released at 'out'.
                         */
                        goto out;
                }
                cache_state(inserted_state, cached_state);
                if (inserted_state == prealloc)
                        prealloc = NULL;
                start = this_end + 1;
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *                        | state |
         *
         * We need to split the extent, and set the bit on the first half.
         */
        if (state->start <= end && state->end > end) {
                prealloc = alloc_extent_state_atomic(prealloc);
                if (!prealloc) {
                        ret = -ENOMEM;
                        goto out;
                }

                ret = split_state(tree, state, prealloc, end + 1);
                if (ret) {
                        extent_io_tree_panic(tree, state, "split", ret);
                        /*
                         * Same handling as the failed split above: do not
                         * touch 'prealloc' any more and report the error.
                         */
                        prealloc = NULL;
                        goto out;
                }

                set_state_bits(tree, prealloc, bits, NULL);
                cache_state(prealloc, cached_state);
                clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
                prealloc = NULL;
                goto out;
        }

search_again:
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
        cond_resched();
        first_iteration = false;
        goto again;

out:
        spin_unlock(&tree->lock);
        if (prealloc)
                free_extent_state(prealloc);

        return ret;
}

/*
 * Find the first range that has @bits not set. This range could start before
 * @start.
 *
 * @tree:      the tree to search
 * @start:     offset at/after which the found extent should start
 * @start_ret: records the beginning of the range
 * @end_ret:   records the end of the range (inclusive)
 * @bits:      the set of bits which must be unset
 *
 * Since unallocated range is also considered one which doesn't have the bits
 * set it's possible that @end_ret contains -1, this happens in case the range
 * spans (last_range_end, end of device]. In this case it's up to the caller to
 * trim @end_ret to the appropriate size.
 *
 * The function works in two phases under tree->lock: the first loop settles
 * *@start_ret, the second loop walks forward to settle *@end_ret.
 */
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
                                 u64 *start_ret, u64 *end_ret, u32 bits)
{
        struct extent_state *state;
        struct extent_state *prev = NULL, *next = NULL;

        spin_lock(&tree->lock);

        /* Find first extent with bits cleared */
        while (1) {
                state = tree_search_prev_next(tree, start, &prev, &next);
                if (!state && !next && !prev) {
                        /*
                         * Tree is completely empty, send full range and let
                         * caller deal with it
                         */
                        *start_ret = 0;
                        *end_ret = -1;
                        goto out;
                } else if (!state && !next) {
                        /*
                         * We are past the last allocated chunk, set start at
                         * the end of the last extent.
                         */
                        *start_ret = prev->end + 1;
                        *end_ret = -1;
                        goto out;
                } else if (!state) {
                        /* No state found for @start, use the one after it. */
                        state = next;
                }

                /*
                 * At this point 'state' either contains 'start' or start is
                 * before 'state'
                 */
                if (in_range(start, state->start, state->end - state->start + 1)) {
                        if (state->state & bits) {
                                /*
                                 * |--range with bits sets--|
                                 *    |
                                 *    start
                                 *
                                 * Skip past this state and search again from
                                 * the new start.
                                 */
                                start = state->end + 1;
                        } else {
                                /*
                                 * 'start' falls within a range that doesn't
                                 * have the bits set, so take its start as the
                                 * beginning of the desired range
                                 *
                                 * |--range with bits cleared----|
                                 *      |
                                 *      start
                                 */
                                *start_ret = state->start;
                                break;
                        }
                } else {
                        /*
                         * |---prev range---|---hole/unset---|---node range---|
                         *                          |
                         *                        start
                         *
                         *                        or
                         *
                         * |---hole/unset--||--first node--|
                         * 0   |
                         *    start
                         */
                        if (prev)
                                *start_ret = prev->end + 1;
                        else
                                *start_ret = 0;
                        break;
                }
        }

        /*
         * Find the longest stretch from start until an entry which has the
         * bits set.
         *
         * The walk begins at the state the first loop stopped on.  If it runs
         * off the end of the tree without meeting a state with the bits set,
         * *end_ret keeps the end of the last state visited.
         */
        while (state) {
                if (state->end >= start && !(state->state & bits)) {
                        *end_ret = state->end;
                } else {
                        /* The clear range stops right before this state. */
                        *end_ret = state->start - 1;
                        break;
                }
                state = next_state(state);
        }
out:
        spin_unlock(&tree->lock);
}

/*
 * Count the number of bytes in the tree that have a given bit(s) set for a
 * given range.
 *
 * @tree:         The io tree to search.
 * @start:        The start offset of the range. This value is updated to the
 *                offset of the first byte found with the given bit(s), so it
 *                can end up being bigger than the initial value.
 * @search_end:   The end offset (inclusive value) of the search range.
 * @max_bytes:    The maximum byte count we are interested. The search stops
 *                once it reaches this count.
 * @bits:         The bits the range must have in order to be accounted for.
 *                If multiple bits are set, then only subranges that have all
 *                the bits set are accounted for.
 * @contig:       Indicate if we should ignore holes in the range or not. If
 *                this is true, then stop once we find a hole.
 * @cached_state: A cached state to be used across multiple calls to this
 *                function in order to speedup searches. Use NULL if this is
 *                called only once or if each call does not start where the
 *                previous one ended. On return it references the state the
 *                walk stopped on, or NULL if the walk ran off the tree.
 *
 * Returns the total number of bytes found within the given range that have
 * all given bits set.  Warns and returns 0 if @search_end is before @start.
 *
 * NOTE(review): @start is written when the first matching state is accounted,
 * but not when that very first match already reaches @max_bytes, because the
 * @max_bytes check comes first.
 */
u64 count_range_bits(struct extent_io_tree *tree,
                     u64 *start, u64 search_end, u64 max_bytes,
                     u32 bits, int contig,
                     struct extent_state **cached_state)
{
        struct extent_state *cur = NULL;
        struct extent_state *hint;
        u64 range_start = *start;
        u64 counted = 0;
        u64 last_end = 0;
        bool found = false;

        if (WARN_ON(search_end < range_start))
                return 0;

        spin_lock(&tree->lock);

        hint = cached_state ? *cached_state : NULL;
        if (hint && extent_state_in_tree(hint)) {
                if (hint->start > range_start) {
                        struct extent_state *before;

                        /*
                         * The cached state starts after our search range's
                         * start. Check if the previous state record starts at
                         * or before the range we are looking for, and if so,
                         * use it - this is a common case when there are holes
                         * between records in the tree. If there is no previous
                         * state record, we can start from our cached state.
                         */
                        before = prev_state(hint);
                        if (!before)
                                cur = hint;
                        else if (before->start <= range_start &&
                                 range_start <= before->end)
                                cur = before;
                } else if (range_start <= hint->end) {
                        /* The cached state covers the start of the range. */
                        cur = hint;
                }
        }

        /*
         * Without a usable hint, search for the extents that end after our
         * range starts.
         */
        if (!cur)
                cur = tree_search(tree, range_start);

        for (; cur; cur = next_state(cur)) {
                if (cur->start > search_end)
                        break;
                /* A hole after the last accounted state ends a contig count. */
                if (contig && found && cur->start > last_end + 1)
                        break;

                if (cur->end < range_start || (cur->state & bits) != bits) {
                        if (contig && found)
                                break;
                        continue;
                }

                counted += min(search_end, cur->end) + 1 -
                           max(range_start, cur->start);
                if (counted >= max_bytes)
                        break;
                if (!found) {
                        *start = max(range_start, cur->start);
                        found = true;
                }
                last_end = cur->end;
        }

        /* Swap the cached state for the one we stopped on (possibly NULL). */
        if (cached_state) {
                free_extent_state(*cached_state);
                *cached_state = cur;
                if (cur)
                        refcount_inc(&cur->refs);
        }

        spin_unlock(&tree->lock);

        return counted;
}

/*
 * Check if the single @bit exists in the given range.
 *
 * Returns true as soon as a state that starts at or before @end and has @bit
 * set is met while walking forward from @start, false otherwise.  @bit must
 * be a power of two (asserted).
 */
bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
{
        struct extent_state *cur;
        bool found = false;

        ASSERT(is_power_of_2(bit));

        spin_lock(&tree->lock);
        for (cur = tree_search(tree, start); cur && start <= end;
             cur = next_state(cur)) {
                /* Everything from here on lies past the range. */
                if (cur->start > end)
                        break;

                if (cur->state & bit) {
                        found = true;
                        break;
                }

                /* If cur->end is (u64)-1, start will overflow to 0 */
                start = cur->end + 1;
                if (start == 0 || start > end)
                        break;
        }
        spin_unlock(&tree->lock);

        return found;
}

/*
 * Check if the whole range [@start, @end] (both ends inclusive) has the
 * single @bit set.
 *
 * If @cached is still in the tree and covers @start it is used as the first
 * record, otherwise the first record is looked up with tree_search().
 *
 * Returns false if a record without @bit is found, if there is a hole in
 * front of a record, or if the records run out while still inside the range.
 */
bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
                    struct extent_state *cached)
{
        struct extent_state *cur;
        bool all_set = true;

        ASSERT(is_power_of_2(bit));

        spin_lock(&tree->lock);

        /* Fall back to a tree lookup when the cached record is unusable. */
        cur = cached;
        if (!cur || !extent_state_in_tree(cur) || cur->start > start ||
            cur->end <= start)
                cur = tree_search(tree, start);

        for (; cur && start <= end; cur = next_state(cur)) {
                /* Hole in front of this record, the range is not covered. */
                if (cur->start > start) {
                        all_set = false;
                        break;
                }

                if (cur->start > end)
                        break;

                if (!(cur->state & bit)) {
                        all_set = false;
                        break;
                }

                /* No record can follow one that ends at (u64)-1. */
                if (cur->end == (u64)-1)
                        break;

                /*
                 * Continue right after this record, unless that is already
                 * past the range (or the addition wrapped around to 0).
                 */
                start = cur->end + 1;
                if (start == 0 || start > end)
                        break;
        }

        /* The records ran out while we were still inside the range. */
        if (!cur)
                all_set = false;
        spin_unlock(&tree->lock);

        return all_set;
}

/* Wrappers around set/clear extent bit */
/*
 * Set @bits on the range [@start, @end] and record the change in @changeset.
 *
 * Returns whatever __set_extent_bit() returns.
 */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                           u32 bits, struct extent_changeset *changeset)
{
        int ret;

        /*
         * EXTENT_LOCKED is not supported here yet: the changeset records
         * any bits changed, so with EXTENT_LOCKED the call would either
         * fail with -EEXIST or the changeset would record the whole range.
         */
        ASSERT((bits & EXTENT_LOCKED) == 0);

        ret = __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL,
                               changeset);
        return ret;
}

/*
 * Clear @bits on the range [@start, @end] and record the change in
 * @changeset.
 *
 * Returns whatever __clear_extent_bit() returns.
 */
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                             u32 bits, struct extent_changeset *changeset)
{
        int ret;

        /*
         * EXTENT_LOCKED is not supported, for the same reason as in
         * set_record_extent_bits().
         */
        ASSERT((bits & EXTENT_LOCKED) == 0);

        ret = __clear_extent_bit(tree, start, end, bits, NULL, changeset);
        return ret;
}

/*
 * Try to set EXTENT_LOCKED on the range [@start, @end] without waiting.
 *
 * Returns 1 unless __set_extent_bit() reports -EEXIST, in which case 0 is
 * returned after EXTENT_LOCKED has been cleared again on
 * [@start, failed start - 1].
 */
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
                    struct extent_state **cached)
{
        u64 conflict_start;
        int ret;

        ret = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &conflict_start,
                               NULL, cached, NULL);
        if (ret != -EEXIST)
                return 1;

        /*
         * NOTE(review): presumably [start, conflict_start - 1] is the part
         * that got locked before the conflict was hit - confirm against
         * __set_extent_bit().
         */
        if (conflict_start > start)
                clear_extent_bit(tree, start, conflict_start - 1,
                                 EXTENT_LOCKED, cached);
        return 0;
}

/*
 * Set EXTENT_LOCKED on the range [@start, @end], waiting as needed.
 *
 * Whenever __set_extent_bit() fails with -EEXIST, EXTENT_LOCKED is cleared
 * again on [@start, failed start - 1], then we wait for EXTENT_LOCKED on
 * [failed start, @end] and retry the whole range.
 *
 * Returns the first result of __set_extent_bit() that is not -EEXIST.
 */
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
                struct extent_state **cached_state)
{
        struct extent_state *failed_state = NULL;
        u64 failed_start;
        int err;

        for (;;) {
                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
                                       &failed_start, &failed_state,
                                       cached_state, NULL);
                if (err != -EEXIST)
                        break;

                /* Undo the lock bits on the range in front of the conflict. */
                if (failed_start != start)
                        clear_extent_bit(tree, start, failed_start - 1,
                                         EXTENT_LOCKED, cached_state);

                wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
                                &failed_state);
        }

        return err;
}

/*
 * Destroy the slab cache that extent_state_init_cachep() created.
 *
 * NOTE(review): btrfs_extent_state_leak_debug_check() presumably reports
 * extent states that are still allocated - confirm against its definition.
 */
void __cold extent_state_free_cachep(void)
{
        /* The debug check runs first, while the cache still exists. */
        btrfs_extent_state_leak_debug_check();
        kmem_cache_destroy(extent_state_cache);
}

/*
 * Create the "btrfs_extent_state" slab cache used for struct extent_state.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
int __init extent_state_init_cachep(void)
{
        extent_state_cache = kmem_cache_create("btrfs_extent_state",
                                               sizeof(struct extent_state),
                                               0, 0, NULL);

        return extent_state_cache ? 0 : -ENOMEM;
}





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 

















    3 


    3 


    3 

























    3 







    3 













    3 




















































































    2 



    3 































































































    3 




























    3 










    3 










































    3 


















    3 


































































































































    3 
























    3 

    3 













































    3 






    3 



    3 






    3 













    3 
































































    3 

    3 




























    3 














    3 








    3 





    3 






    3 






    3 



    3 
    3 








    3 
    3 
    3 






    3 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1999 Eric Youngdale
 * Copyright (C) 2014 Christoph Hellwig
 *
 *  SCSI queueing library.
 *      Initial versions: Eric Youngdale (eric@andante.org).
 *                        Based upon conversations with large numbers
 *                        of people at Linux Expo.
 */

#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/completion.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/delay.h>
#include <linux/hardirq.h>
#include <linux/scatterlist.h>
#include <linux/blk-mq.h>
#include <linux/blk-integrity.h>
#include <linux/ratelimit.h>
#include <asm/unaligned.h>

#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_dbg.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_driver.h>
#include <scsi/scsi_eh.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_transport.h> /* scsi_init_limits() */
#include <scsi/scsi_dh.h>

#include <trace/events/scsi.h>

#include "scsi_debugfs.h"
#include "scsi_priv.h"
#include "scsi_logging.h"

/*
 * Inline scatterlist entry counts: SCSI_INLINE_SG_CNT and, for integrity
 * metadata, SCSI_INLINE_PROT_SG_CNT.
 *
 * Size of integrity metadata is usually small, 1 inline sg should
 * cover normal cases.
 *
 * Both counts are 0 when CONFIG_ARCH_NO_SG_CHAIN is defined.
 * NOTE(review): presumably because inline entries rely on sg chaining to
 * grow beyond the inline part - confirm.
 */
#ifdef CONFIG_ARCH_NO_SG_CHAIN
#define  SCSI_INLINE_PROT_SG_CNT  0
#define  SCSI_INLINE_SG_CNT  0
#else
#define  SCSI_INLINE_PROT_SG_CNT  1
#define  SCSI_INLINE_SG_CNT  2
#endif

/*
 * Slab cache of SCSI_SENSE_BUFFERSIZE byte objects, created by the first
 * call to scsi_init_sense_cache(); the mutex serializes that creation.
 */
static struct kmem_cache *scsi_sense_cache;
static DEFINE_MUTEX(scsi_sense_cache_mutex);

/* Forward declaration, needed by scsi_mq_requeue_cmd() below. */
static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd);

/*
 * Create the global sense buffer cache if it does not exist yet.
 *
 * @shost is not used by this function. Creation is serialized by
 * scsi_sense_cache_mutex, so only the first caller creates the cache.
 *
 * Returns 0 if the cache exists on return, -ENOMEM if creating it failed.
 */
int scsi_init_sense_cache(struct Scsi_Host *shost)
{
        int err = 0;

        mutex_lock(&scsi_sense_cache_mutex);

        /* An earlier caller already created the cache. */
        if (scsi_sense_cache)
                goto unlock;

        scsi_sense_cache = kmem_cache_create_usercopy("scsi_sense_cache",
                                SCSI_SENSE_BUFFERSIZE, 0, SLAB_HWCACHE_ALIGN,
                                0, SCSI_SENSE_BUFFERSIZE, NULL);
        if (!scsi_sense_cache)
                err = -ENOMEM;

unlock:
        mutex_unlock(&scsi_sense_cache_mutex);
        return err;
}

/*
 * Set the appropriate busy counter for the host, device or target of @cmd,
 * depending on the requeue @reason.
 *
 * If the host/device isn't busy, assume that something actually
 * completed, and that we should be able to queue a command now.
 *
 * Note that the prior mid-layer assumption that any host could
 * always queue at least one command is now broken.  The mid-layer
 * will implement a user specifiable stall (see
 * scsi_host.max_host_blocked and scsi_device.max_device_blocked)
 * if a command is requeued with no other commands outstanding
 * either for the device or for the host.
 *
 * Any other @reason leaves all three counters untouched.
 */
static void
scsi_set_blocked(struct scsi_cmnd *cmd, int reason)
{
        struct scsi_device *sdev = cmd->device;
        struct Scsi_Host *shost = sdev->host;
        struct scsi_target *starget = scsi_target(sdev);

        switch (reason) {
        case SCSI_MLQUEUE_TARGET_BUSY:
                atomic_set(&starget->target_blocked,
                           starget->max_target_blocked);
                break;
        case SCSI_MLQUEUE_EH_RETRY:
        case SCSI_MLQUEUE_DEVICE_BUSY:
                atomic_set(&sdev->device_blocked, sdev->max_device_blocked);
                break;
        case SCSI_MLQUEUE_HOST_BUSY:
                atomic_set(&shost->host_blocked, shost->max_host_blocked);
                break;
        default:
                break;
        }
}

/*
 * Put the request of @cmd back on the requeue list.
 *
 * A request marked RQF_DONTPREP has the flag cleared and its command
 * uninitialized first; a request without the flag triggers a one-time
 * warning. Unless the host is in recovery, the requeue list is kicked
 * with a delay of @msecs.
 */
static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd, unsigned long msecs)
{
        struct request *req = scsi_cmd_to_rq(cmd);

        if (!(req->rq_flags & RQF_DONTPREP)) {
                WARN_ON_ONCE(true);
        } else {
                req->rq_flags &= ~RQF_DONTPREP;
                scsi_mq_uninit_cmd(cmd);
        }

        blk_mq_requeue_request(req, false);

        if (scsi_host_in_recovery(cmd->device->host))
                return;

        blk_mq_delay_kick_requeue_list(req->q, msecs);
}

/**
 * __scsi_queue_insert - private queue insertion
 * @cmd: The SCSI command being requeued
 * @reason:  The reason for the requeue
 * @unbusy: Whether the queue should be unbusied
 *
 * The public interface scsi_queue_insert() always passes @unbusy as true
 * because it's always called before the completion.  This private variant
 * is for a requeue after completion, which should only occur in this
 * file.
 */
static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy)
{
        struct scsi_device *sdev = cmd->device;
        struct request *rq;
        bool kick_list;

        SCSI_LOG_MLQUEUE(1, scmd_printk(KERN_INFO, cmd,
                "Inserting command %p into mlqueue\n", cmd));

        scsi_set_blocked(cmd, reason);

        /*
         * The command is no longer active on the host/device, so drop it
         * from the busy counters when the caller asks for that.
         */
        if (unbusy)
                scsi_device_unbusy(sdev, cmd);

        /*
         * Requeue the command with a cleared result.  The requeue list is
         * only kicked right away when the host is not in recovery.
         */
        cmd->result = 0;

        rq = scsi_cmd_to_rq(cmd);
        kick_list = !scsi_host_in_recovery(sdev->host);
        blk_mq_requeue_request(rq, kick_list);
}

/**
 * scsi_queue_insert - Reinsert a command in the queue.
 * @cmd:    command that we are adding to queue.
 * @reason: why we are inserting command to queue.
 *
 * We do this for one of two cases. Either the host is busy and it cannot accept
 * any more commands for the time being, or the device returned QUEUE_FULL and
 * can accept no more commands.
 *
 * Context: This could be called either from an interrupt context or a normal
 * process context.
 */
void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
{
        __scsi_queue_insert(cmd, reason, true);
}

/*
 * Reset the total retry counter of @failures and the retry counter of every
 * entry in its failure_definitions array.
 *
 * The array is walked until the first entry whose result is 0.
 */
void scsi_failures_reset_retries(struct scsi_failures *failures)
{
        struct scsi_failure *entry = failures->failure_definitions;

        failures->total_retries = 0;

        while (entry->result) {
                entry->retries = 0;
                entry++;
        }
}
EXPORT_SYMBOL_GPL(scsi_failures_reset_retries);

/**
 * scsi_check_passthrough - Determine if passthrough scsi_cmnd needs a retry.
 * @scmd: scsi_cmnd to check.
 * @failures: scsi_failures struct that lists failures to check for.
 *
 * The entries of @failures->failure_definitions are walked until the first
 * entry with a zero result. The first entry that matches @scmd->result (and,
 * for CHECK CONDITION, the sense data) decides whether a retry is allowed,
 * based on its own retry budget or, if it has none, on the list-wide budget.
 *
 * A command that completed with a zero result never needs a retry, not even
 * when @failures contains a SCMD_FAILURE_RESULT_ANY entry.
 *
 * Returns -EAGAIN if the caller should retry else 0.
 */
static int scsi_check_passthrough(struct scsi_cmnd *scmd,
                                  struct scsi_failures *failures)
{
        struct scsi_failure *failure;
        struct scsi_sense_hdr sshdr;
        enum sam_status status;

        /*
         * Nothing failed, so there is nothing to retry. Without this check
         * a SCMD_FAILURE_RESULT_ANY entry would also match a successful
         * command and cause it to be resubmitted.
         */
        if (!scmd->result)
                return 0;

        if (!failures)
                return 0;

        for (failure = failures->failure_definitions; failure->result;
             failure++) {
                /* Wildcard: any non-zero result matches. */
                if (failure->result == SCMD_FAILURE_RESULT_ANY)
                        goto maybe_retry;

                /* Match on a non-zero host byte. */
                if (host_byte(scmd->result) &&
                    host_byte(scmd->result) == host_byte(failure->result))
                        goto maybe_retry;

                /* Everything below matches on the SCSI status byte. */
                status = status_byte(scmd->result);
                if (!status)
                        continue;

                if (failure->result == SCMD_FAILURE_STAT_ANY &&
                    !scsi_status_is_good(scmd->result))
                        goto maybe_retry;

                if (status != status_byte(failure->result))
                        continue;

                /*
                 * Only CHECK CONDITION is refined further by sense data;
                 * any other matching status is a match as-is.
                 */
                if (status_byte(failure->result) != SAM_STAT_CHECK_CONDITION ||
                    failure->sense == SCMD_FAILURE_SENSE_ANY)
                        goto maybe_retry;

                /* Sense data cannot be decoded: give up on retrying. */
                if (!scsi_command_normalize_sense(scmd, &sshdr))
                        return 0;

                if (failure->sense != sshdr.sense_key)
                        continue;

                if (failure->asc == SCMD_FAILURE_ASC_ANY)
                        goto maybe_retry;

                if (failure->asc != sshdr.asc)
                        continue;

                if (failure->ascq == SCMD_FAILURE_ASCQ_ANY ||
                    failure->ascq == sshdr.ascq)
                        goto maybe_retry;
        }

        return 0;

maybe_retry:
        /*
         * An entry with its own budget (allowed != 0) is counted per entry;
         * otherwise the retry is charged to the list-wide budget.
         */
        if (failure->allowed) {
                if (failure->allowed == SCMD_FAILURE_NO_LIMIT ||
                    ++failure->retries <= failure->allowed)
                        return -EAGAIN;
        } else {
                if (failures->total_allowed == SCMD_FAILURE_NO_LIMIT ||
                    ++failures->total_retries <= failures->total_allowed)
                        return -EAGAIN;
        }

        return 0;
}

/**
 * scsi_execute_cmd - insert request and wait for the result
 * @sdev:        scsi_device
 * @cmd:        scsi command
 * @opf:        block layer request cmd_flags
 * @buffer:        data buffer
 * @bufflen:        len of buffer
 * @timeout:        request timeout in HZ (stored unchanged in req->timeout)
 * @ml_retries:        number of times SCSI midlayer will retry request
 * @args:        Optional args. See struct definition for field descriptions
 *
 * The command is resubmitted on a freshly allocated request for as long as
 * scsi_check_passthrough() asks for a retry based on @args->failures.
 * If @args->sense is set, @args->sense_len must be SCSI_SENSE_BUFFERSIZE,
 * otherwise -EINVAL is returned.
 *
 * Returns the scsi_cmnd result field if a command was executed, or a negative
 * Linux error code if we didn't get that far.
 */
int scsi_execute_cmd(struct scsi_device *sdev, const unsigned char *cmd,
                     blk_opf_t opf, void *buffer, unsigned int bufflen,
                     int timeout, int ml_retries,
                     const struct scsi_exec_args *args)
{
        /* Zero-initialized defaults used when the caller passes no args. */
        static const struct scsi_exec_args default_args;
        struct request *req;
        struct scsi_cmnd *scmd;
        int ret;

        if (!args)
                args = &default_args;
        /* The sense copy below always copies SCSI_SENSE_BUFFERSIZE bytes. */
        else if (WARN_ON_ONCE(args->sense &&
                              args->sense_len != SCSI_SENSE_BUFFERSIZE))
                return -EINVAL;

        /* Every attempt, including retries, uses a new request. */
retry:
        req = scsi_alloc_request(sdev->request_queue, opf, args->req_flags);
        if (IS_ERR(req))
                return PTR_ERR(req);

        if (bufflen) {
                ret = blk_rq_map_kern(sdev->request_queue, req,
                                      buffer, bufflen, GFP_NOIO);
                /* On mapping failure the request is freed at "out". */
                if (ret)
                        goto out;
        }
        scmd = blk_mq_rq_to_pdu(req);
        scmd->cmd_len = COMMAND_SIZE(cmd[0]);
        memcpy(scmd->cmnd, cmd, scmd->cmd_len);
        scmd->allowed = ml_retries;
        scmd->flags |= args->scmd_flags;
        req->timeout = timeout;
        req->rq_flags |= RQF_QUIET;

        /*
         * head injection *required* here otherwise quiesce won't work
         */
        blk_execute_rq(req, true);

        /* Release this attempt's request before starting the next one. */
        if (scsi_check_passthrough(scmd, args->failures) == -EAGAIN) {
                blk_mq_free_request(req);
                goto retry;
        }

        /*
         * Some devices (USB mass-storage in particular) may transfer
         * garbage data together with a residue indicating that the data
         * is invalid.  Prevent the garbage from being misinterpreted
         * and prevent security leaks by zeroing out the excess data.
         */
        if (unlikely(scmd->resid_len > 0 && scmd->resid_len <= bufflen))
                memset(buffer + bufflen - scmd->resid_len, 0, scmd->resid_len);

        /* Copy out the optional results before the request is freed. */
        if (args->resid)
                *args->resid = scmd->resid_len;
        if (args->sense)
                memcpy(args->sense, scmd->sense_buffer, SCSI_SENSE_BUFFERSIZE);
        if (args->sshdr)
                scsi_normalize_sense(scmd->sense_buffer, scmd->sense_len,
                                     args->sshdr);

        ret = scmd->result;
 out:
        blk_mq_free_request(req);

        return ret;
}
EXPORT_SYMBOL(scsi_execute_cmd);

/*
 * Clear the in-flight state of @cmd and wake up the error handler if
 * necessary.
 *
 * The error handler must not be left sleeping when the number of in-flight
 * requests on the host equals shost->host_failed. To guarantee that,
 * scsi_eh_scmd_add() uses call_rcu() and this function holds the RCU read
 * lock, so that this function in its entirety either finishes before
 * scsi_eh_scmd_add() increases the host_failed counter or notices the shost
 * state change made by scsi_eh_scmd_add().
 */
static void scsi_dec_host_busy(struct Scsi_Host *shost, struct scsi_cmnd *cmd)
{
        unsigned long flags;

        rcu_read_lock();
        __clear_bit(SCMD_STATE_INFLIGHT, &cmd->state);
        if (unlikely(scsi_host_in_recovery(shost))) {
                /* The busy count is sampled before host_lock is taken. */
                unsigned int busy = scsi_host_busy(shost);

                spin_lock_irqsave(shost->host_lock, flags);
                /* Wake the handler only if failures or an EH run are pending. */
                if (shost->host_failed || shost->host_eh_scheduled)
                        scsi_eh_wakeup(shost, busy);
                spin_unlock_irqrestore(shost->host_lock, flags);
        }
        rcu_read_unlock();
}

/*
 * Release the accounting held by @cmd on @sdev: the host in-flight state,
 * the target busy count (only when the target has a queue limit) and the
 * device budget token.
 */
void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd)
{
        struct scsi_target *tgt = scsi_target(sdev);

        scsi_dec_host_busy(sdev->host, cmd);

        if (tgt->can_queue > 0)
                atomic_dec(&tgt->target_busy);

        /* Give the budget token back and mark the command as holding none. */
        sbitmap_put(&sdev->budget_map, cmd->budget_token);
        cmd->budget_token = -1;
}

/*
 * Per-device callback: run the hardware queues of @sdev asynchronously,
 * unless @sdev is the device passed in @data. Called with interrupts
 * disabled.
 */
static void scsi_kick_sdev_queue(struct scsi_device *sdev, void *data)
{
        const struct scsi_device *skip = data;

        if (sdev == skip)
                return;

        blk_mq_run_hw_queues(sdev->request_queue, true);
}

/*
 * Called for single_lun devices on IO completion. Clear starget_sdev_user,
 * and run the hardware queues of all the scsi_devices on the target -
 * starting with current_sdev.
 *
 * Called with *no* scsi locks held.
 */
static void scsi_single_lun_run(struct scsi_device *current_sdev)
{
        struct Scsi_Host *shost = current_sdev->host;
        struct scsi_target *starget = scsi_target(current_sdev);
        unsigned long flags;

        /* starget_sdev_user is only changed under host_lock here. */
        spin_lock_irqsave(shost->host_lock, flags);
        starget->starget_sdev_user = NULL;
        spin_unlock_irqrestore(shost->host_lock, flags);

        /*
         * Run the queues for all LUNs on the target, starting with
         * current_sdev. We race with others (to set starget_sdev_user),
         * but in most cases, we will be first. Ideally, each LU on the
         * target would get some limited time or requests on the target.
         */
        blk_mq_run_hw_queues(current_sdev->request_queue,
                             shost->queuecommand_may_block);

        /*
         * If nobody claimed the target in the meantime, kick the other
         * devices; scsi_kick_sdev_queue() skips current_sdev.
         */
        spin_lock_irqsave(shost->host_lock, flags);
        if (!starget->starget_sdev_user)
                __starget_for_each_device(starget, current_sdev,
                                          scsi_kick_sdev_queue);
        spin_unlock_irqrestore(shost->host_lock, flags);
}

/*
 * A device is busy when it has reached its queue depth or is currently
 * blocked.
 */
static inline bool scsi_device_is_busy(struct scsi_device *sdev)
{
        return scsi_device_busy(sdev) >= sdev->queue_depth ||
               atomic_read(&sdev->device_blocked) > 0;
}

/*
 * A target without a queue limit (can_queue not above zero) is never busy.
 * Otherwise it is busy when the limit is reached or the target is blocked.
 */
static inline bool scsi_target_is_busy(struct scsi_target *starget)
{
        return starget->can_queue > 0 &&
               (atomic_read(&starget->target_busy) >= starget->can_queue ||
                atomic_read(&starget->target_blocked) > 0);
}

/* A host is busy while it is blocked or has blocked itself. */
static inline bool scsi_host_is_busy(struct Scsi_Host *shost)
{
        return atomic_read(&shost->host_blocked) > 0 ||
               shost->host_self_blocked;
}

/*
 * Run the queues of the devices on @shost's starved list. The list is
 * spliced onto a private list under host_lock, and the lock is dropped
 * around each queue run. Processing stops as soon as the host reports busy;
 * devices whose target is busy are moved back onto shost->starved_list, and
 * whatever is left unprocessed is spliced back at the end.
 */
static void scsi_starved_list_run(struct Scsi_Host *shost)
{
        LIST_HEAD(starved_list);
        struct scsi_device *sdev;
        unsigned long flags;

        spin_lock_irqsave(shost->host_lock, flags);
        list_splice_init(&shost->starved_list, &starved_list);

        while (!list_empty(&starved_list)) {
                struct request_queue *slq;

                /*
                 * As long as shost is accepting commands and we have
                 * starved queues, call blk_run_queue. scsi_request_fn
                 * drops the queue_lock and can add us back to the
                 * starved_list.
                 *
                 * host_lock protects the starved_list and starved_entry.
                 * scsi_request_fn must get the host_lock before checking
                 * or modifying starved_list or starved_entry.
                 *
                 * NOTE(review): blk_run_queue and scsi_request_fn are named
                 * here, but this function runs the queue through
                 * blk_mq_run_hw_queues(); the names look historical -
                 * confirm.
                 */
                if (scsi_host_is_busy(shost))
                        break;

                sdev = list_entry(starved_list.next,
                                  struct scsi_device, starved_entry);
                list_del_init(&sdev->starved_entry);
                /* Target busy: park the device on the host list again. */
                if (scsi_target_is_busy(scsi_target(sdev))) {
                        list_move_tail(&sdev->starved_entry,
                                       &shost->starved_list);
                        continue;
                }

                /*
                 * Once we drop the host lock, a racing scsi_remove_device()
                 * call may remove the sdev from the starved list and destroy
                 * it and the queue.  Mitigate by taking a reference to the
                 * queue and never touching the sdev again after we drop the
                 * host lock.  Note: if __scsi_remove_device() invokes
                 * blk_mq_destroy_queue() before the queue is run from this
                 * function then blk_run_queue() will return immediately since
                 * blk_mq_destroy_queue() marks the queue with QUEUE_FLAG_DYING.
                 */
                slq = sdev->request_queue;
                /* No queue reference obtained: skip this device. */
                if (!blk_get_queue(slq))
                        continue;
                spin_unlock_irqrestore(shost->host_lock, flags);

                blk_mq_run_hw_queues(slq, false);
                blk_put_queue(slq);

                spin_lock_irqsave(shost->host_lock, flags);
        }
        /* put any unprocessed entries back */
        list_splice(&starved_list, &shost->starved_list);
        spin_unlock_irqrestore(shost->host_lock, flags);
}

/**
 * scsi_run_queue - Select a proper request queue to serve next.
 * @q:  last request's queue
 *
 * The previous command was completely finished, start a new one if possible.
 */
static void scsi_run_queue(struct request_queue *q)
{
        struct scsi_device *sdev = q->queuedata;

        if (scsi_target(sdev)->single_lun)
                scsi_single_lun_run(sdev);
        if (!list_empty(&sdev->host->starved_list))
                scsi_starved_list_run(sdev->host);

        /* Note: blk_mq_kick_requeue_list() runs the queue asynchronously. */
        blk_mq_kick_requeue_list(q);
}

/*
 * Work handler for scsi_device.requeue_work: runs the queue of the device
 * that embeds @work.
 */
void scsi_requeue_run_queue(struct work_struct *work)
{
        struct scsi_device *sdev =
                container_of(work, struct scsi_device, requeue_work);

        scsi_run_queue(sdev->request_queue);
}

void scsi_run_host_queues(struct Scsi_Host *shost)
{
        struct scsi_device *sdev;

        shost_for_each_device(sdev, shost)
                scsi_run_queue(sdev->request_queue);
}

/*
 * Give the upper-level driver of @cmd a chance to undo its per-command
 * setup. Passthrough requests are skipped.
 */
static void scsi_uninit_cmd(struct scsi_cmnd *cmd)
{
        struct scsi_driver *drv;

        if (blk_rq_is_passthrough(scsi_cmd_to_rq(cmd)))
                return;

        drv = scsi_cmd_to_driver(cmd);
        if (drv->uninit_command)
                drv->uninit_command(cmd);
}

void scsi_free_sgtables(struct scsi_cmnd *cmd)
{
        if (cmd->sdb.table.nents)
                sg_free_table_chained(&cmd->sdb.table,
                                SCSI_INLINE_SG_CNT);
        if (scsi_prot_sg_count(cmd))
                sg_free_table_chained(&cmd->prot_sdb->table,
                                SCSI_INLINE_PROT_SG_CNT);
}
EXPORT_SYMBOL_GPL(scsi_free_sgtables);

/*
 * Tear down the per-command state of @cmd: scatterlist tables first, then
 * the upper-level driver's own cleanup.
 */
static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd)
{
        scsi_free_sgtables(cmd);

        scsi_uninit_cmd(cmd);
}

static void scsi_run_queue_async(struct scsi_device *sdev)
{
        if (scsi_host_in_recovery(sdev->host))
                return;

        if (scsi_target(sdev)->single_lun ||
            !list_empty(&sdev->host->starved_list)) {
                kblockd_schedule_work(&sdev->requeue_work);
        } else {
                /*
                 * smp_mb() present in sbitmap_queue_clear() or implied in
                 * .end_io is for ordering writing .device_busy in
                 * scsi_device_unbusy() and reading sdev->restarts.
                 */
                int old = atomic_read(&sdev->restarts);

                /*
                 * ->restarts has to be kept as non-zero if new budget
                 *  contention occurs.
                 *
                 *  No need to run queue when either another re-run
                 *  queue wins in updating ->restarts or a new budget
                 *  contention occurs.
                 */
                if (old && atomic_cmpxchg(&sdev->restarts, old, 0) == old)
                        blk_mq_run_hw_queues(sdev->request_queue, true);
        }
}

/*
 * Complete @bytes of @req with status @error. If the request is finished,
 * tear down the command, end the request and get the device queue running
 * again.
 *
 * Returns false when no more bytes to process, true if there are more
 */
static bool scsi_end_request(struct request *req, blk_status_t error,
                unsigned int bytes)
{
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
        struct scsi_device *sdev = cmd->device;
        struct request_queue *q = sdev->request_queue;

        /* Bytes left over: the caller decides what to do with the rest. */
        if (blk_update_request(req, error, bytes))
                return true;

        /*
         * NOTE(review): the flag is tested on sdev->request_queue while the
         * disk is taken from req->q - presumably the same queue; confirm.
         */
        if (blk_queue_add_random(q))
                add_disk_randomness(req->q->disk);

        WARN_ON_ONCE(!blk_rq_is_passthrough(req) &&
                     !(cmd->flags & SCMD_INITIALIZED));
        cmd->flags = 0;

        /*
         * Calling rcu_barrier() is not necessary here because the
         * SCSI error handler guarantees that the function called by
         * call_rcu() has been called before scsi_end_request() is
         * called.
         */
        destroy_rcu_head(&cmd->rcu);

        /*
         * In the MQ case the command gets freed by __blk_mq_end_request,
         * so we have to do all cleanup that depends on it earlier.
         *
         * We also can't kick the queues from irq context, so we
         * will have to defer it to a workqueue.
         */
        scsi_mq_uninit_cmd(cmd);

        /*
         * queue is still alive, so grab the ref for preventing it
         * from being cleaned up during running queue.
         */
        percpu_ref_get(&q->q_usage_counter);

        __blk_mq_end_request(req, error);

        /* Only sdev and q, saved above, are used past this point. */
        scsi_run_queue_async(sdev);

        percpu_ref_put(&q->q_usage_counter);
        return false;
}

/**
 * scsi_result_to_blk_status - map a SCSI result code to a blk_status_t
 * @result:        scsi error code
 *
 * The scsi-ml byte takes precedence; only when it is SCSIML_STAT_OK (or
 * not one of the values handled here) are the host and status bytes
 * consulted.
 */
static blk_status_t scsi_result_to_blk_status(int result)
{
        /*
         * The scsi-ml byte goes first because a host or status byte may
         * have been converted into it.
         */
        switch (scsi_ml_byte(result)) {
        case SCSIML_STAT_DL_TIMEOUT:
                return BLK_STS_DURATION_LIMIT;
        case SCSIML_STAT_TGT_FAILURE:
                return BLK_STS_TARGET;
        case SCSIML_STAT_MED_ERROR:
                return BLK_STS_MEDIUM;
        case SCSIML_STAT_NOSPC:
                return BLK_STS_NOSPC;
        case SCSIML_STAT_RESV_CONFLICT:
                return BLK_STS_RESV_CONFLICT;
        case SCSIML_STAT_OK:
                break;
        }

        switch (host_byte(result)) {
        case DID_TRANSPORT_FAILFAST:
        case DID_TRANSPORT_MARGINAL:
                return BLK_STS_TRANSPORT;
        case DID_OK:
                /* Host is fine: success only with a good status byte. */
                if (scsi_status_is_good(result))
                        return BLK_STS_OK;
                break;
        default:
                break;
        }

        return BLK_STS_IOERR;
}

/**
 * scsi_rq_err_bytes - number of bytes up to the next failure boundary
 * @rq: request to examine
 *
 * Description:
 *     A request may be a merge of IOs that need different failure
 *     handling.  This helper works out how many bytes can be failed from
 *     the start of the request without reaching into a part that still
 *     has to be retried.
 *
 * Return:
 *     The number of bytes to fail.
 */
static unsigned int scsi_rq_err_bytes(const struct request *rq)
{
        const blk_opf_t failfast = rq->cmd_flags & REQ_FAILFAST_MASK;
        unsigned int nbytes = 0;
        struct bio *cur = rq->bio;

        /* Not a mixed merge: the whole request fails as one. */
        if (!(rq->rq_flags & RQF_MIXED_MERGE))
                return blk_rq_bytes(rq);

        /*
         * The only mixing that can currently happen is between different
         * failfast types.  Leading bios carrying every failfast bit of the
         * request - i.e. at least as eager to fail as the first one - can
         * be failed safely; stop at the first bio that is not.
         */
        while (cur && (cur->bi_opf & failfast) == failfast) {
                nbytes += cur->bi_iter.bi_size;
                cur = cur->bi_next;
        }

        /* Failing zero bytes of a non-empty request could loop forever. */
        BUG_ON(blk_rq_bytes(rq) && !nbytes);
        return nbytes;
}

/*
 * Report whether @cmd has been around for longer than (allowed + 1) request
 * timeouts since it was allocated. Commands with an unlimited retry count
 * never exceed their runtime. A message is logged when the limit is hit.
 */
static bool scsi_cmd_runtime_exceeced(struct scsi_cmnd *cmd)
{
        struct request *rq;
        unsigned long budget;

        if (cmd->allowed == SCSI_CMD_RETRIES_NO_LIMIT)
                return false;

        rq = scsi_cmd_to_rq(cmd);
        budget = (cmd->allowed + 1) * rq->timeout;
        if (!time_before(cmd->jiffies_at_alloc + budget, jiffies))
                return false;

        scmd_printk(KERN_ERR, cmd, "timing out command, waited %lus\n",
                    budget/HZ);
        return true;
}

/*
 * When ALUA transition state is returned, reprep the cmd to
 * use the ALUA handler's transition timeout. Delay the reprep
 * 1 sec to avoid aggressive retries of the target in that
 * state.
 *
 * The value is passed as the second argument of scsi_mq_requeue_cmd().
 * NOTE(review): 1000 standing for "1 sec" suggests the unit is
 * milliseconds - confirm against scsi_mq_requeue_cmd().
 */
#define ALUA_TRANSITION_REPREP_DELAY        1000

/*
 * Helper for scsi_io_completion() when special action required.
 *
 * Picks an action (fail, re-prepare, retry, each optionally delayed) from
 * the host byte of @result and the sense data of @cmd, downgrades it to
 * "fail" when the command has exceeded its allowed runtime, and then
 * carries the action out.
 */
static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result)
{
        struct request *req = scsi_cmd_to_rq(cmd);
        int level = 0;
        enum {ACTION_FAIL, ACTION_REPREP, ACTION_DELAYED_REPREP,
              ACTION_RETRY, ACTION_DELAYED_RETRY} action;
        struct scsi_sense_hdr sshdr;
        bool sense_valid;
        bool sense_current = true;      /* false implies "deferred sense" */
        blk_status_t blk_stat;

        sense_valid = scsi_command_normalize_sense(cmd, &sshdr);
        if (sense_valid)
                sense_current = !scsi_sense_is_deferred(&sshdr);

        /* Default status; some sense codes below override it. */
        blk_stat = scsi_result_to_blk_status(result);

        if (host_byte(result) == DID_RESET) {
                /* Third party bus reset or reset for error recovery
                 * reasons.  Just retry the command and see what
                 * happens.
                 */
                action = ACTION_RETRY;
        } else if (sense_valid && sense_current) {
                switch (sshdr.sense_key) {
                case UNIT_ATTENTION:
                        if (cmd->device->removable) {
                                /* Detected disc change.  Set a bit
                                 * and quietly refuse further access.
                                 */
                                cmd->device->changed = 1;
                                action = ACTION_FAIL;
                        } else {
                                /* Must have been a power glitch, or a
                                 * bus reset.  Could not have been a
                                 * media change, so we just retry the
                                 * command and see what happens.
                                 */
                                action = ACTION_RETRY;
                        }
                        break;
                case ILLEGAL_REQUEST:
                        /* If we had an ILLEGAL REQUEST returned, then
                         * we may have performed an unsupported
                         * command.  The only thing this should be
                         * would be a ten byte read where only a six
                         * byte read was supported.  Also, on a system
                         * where READ CAPACITY failed, we may have
                         * read past the end of the disk.
                         */
                        if ((cmd->device->use_10_for_rw &&
                            sshdr.asc == 0x20 && sshdr.ascq == 0x00) &&
                            (cmd->cmnd[0] == READ_10 ||
                             cmd->cmnd[0] == WRITE_10)) {
                                /* This will issue a new 6-byte command. */
                                cmd->device->use_10_for_rw = 0;
                                action = ACTION_REPREP;
                        } else if (sshdr.asc == 0x10) /* DIX */ {
                                action = ACTION_FAIL;
                                blk_stat = BLK_STS_PROTECTION;
                        /* INVALID COMMAND OPCODE or INVALID FIELD IN CDB */
                        } else if (sshdr.asc == 0x20 || sshdr.asc == 0x24) {
                                action = ACTION_FAIL;
                                blk_stat = BLK_STS_TARGET;
                        } else
                                action = ACTION_FAIL;
                        break;
                case ABORTED_COMMAND:
                        action = ACTION_FAIL;
                        if (sshdr.asc == 0x10) /* DIF */
                                blk_stat = BLK_STS_PROTECTION;
                        break;
                case NOT_READY:
                        /* If the device is in the process of becoming
                         * ready, or has a temporary blockage, retry.
                         */
                        if (sshdr.asc == 0x04) {
                                switch (sshdr.ascq) {
                                case 0x01: /* becoming ready */
                                case 0x04: /* format in progress */
                                case 0x05: /* rebuild in progress */
                                case 0x06: /* recalculation in progress */
                                case 0x07: /* operation in progress */
                                case 0x08: /* Long write in progress */
                                case 0x09: /* self test in progress */
                                case 0x11: /* notify (enable spinup) required */
                                case 0x14: /* space allocation in progress */
                                case 0x1a: /* start stop unit in progress */
                                case 0x1b: /* sanitize in progress */
                                case 0x1d: /* configuration in progress */
                                case 0x24: /* depopulation in progress */
                                case 0x25: /* depopulation restore in progress */
                                        action = ACTION_DELAYED_RETRY;
                                        break;
                                case 0x0a: /* ALUA state transition */
                                        action = ACTION_DELAYED_REPREP;
                                        break;
                                default:
                                        action = ACTION_FAIL;
                                        break;
                                }
                        } else
                                action = ACTION_FAIL;
                        break;
                case VOLUME_OVERFLOW:
                        /* See SSC3rXX or current. */
                        action = ACTION_FAIL;
                        break;
                case DATA_PROTECT:
                        action = ACTION_FAIL;
                        if ((sshdr.asc == 0x0C && sshdr.ascq == 0x12) ||
                            (sshdr.asc == 0x55 &&
                             (sshdr.ascq == 0x0E || sshdr.ascq == 0x0F))) {
                                /* Insufficient zone resources */
                                blk_stat = BLK_STS_ZONE_OPEN_RESOURCE;
                        }
                        break;
                case COMPLETED:
                        fallthrough;
                default:
                        action = ACTION_FAIL;
                        break;
                }
        } else
                /* No usable current sense data and no reset: give up. */
                action = ACTION_FAIL;

        /* Never retry or reprep a command that has run out of time. */
        if (action != ACTION_FAIL && scsi_cmd_runtime_exceeced(cmd))
                action = ACTION_FAIL;

        switch (action) {
        case ACTION_FAIL:
                /* Give up and fail the remainder of the request */
                if (!(req->rq_flags & RQF_QUIET)) {
                        static DEFINE_RATELIMIT_STATE(_rs,
                                        DEFAULT_RATELIMIT_INTERVAL,
                                        DEFAULT_RATELIMIT_BURST);

                        if (unlikely(scsi_logging_level))
                                level =
                                     SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT,
                                                    SCSI_LOG_MLCOMPLETE_BITS);

                        /*
                         * if logging is enabled the failure will be printed
                         * in scsi_log_completion(), so avoid duplicate messages
                         */
                        if (!level && __ratelimit(&_rs)) {
                                scsi_print_result(cmd, NULL, FAILED);
                                if (sense_valid)
                                        scsi_print_sense(cmd);
                                scsi_print_command(cmd);
                        }
                }
                /*
                 * scsi_end_request() returns true when bytes remain; in
                 * that case fall through and requeue the rest.
                 */
                if (!scsi_end_request(req, blk_stat, scsi_rq_err_bytes(req)))
                        return;
                fallthrough;
        case ACTION_REPREP:
                scsi_mq_requeue_cmd(cmd, 0);
                break;
        case ACTION_DELAYED_REPREP:
                scsi_mq_requeue_cmd(cmd, ALUA_TRANSITION_REPREP_DELAY);
                break;
        case ACTION_RETRY:
                /* Retry the same command immediately */
                __scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY, false);
                break;
        case ACTION_DELAYED_RETRY:
                /* Retry the same command after a delay */
                __scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY, false);
                break;
        }
}

/*
 * Helper for scsi_io_completion() for the case that cmd->result is
 * non-zero. The returned result may be zeroed to suppress further error
 * checking, and *blk_statp is updated in some cases.
 */
static int scsi_io_completion_nz_result(struct scsi_cmnd *cmd, int result,
                                        blk_status_t *blk_statp)
{
        struct request *rq = scsi_cmd_to_rq(cmd);
        struct scsi_sense_hdr hdr;
        bool have_sense;
        bool current_sense;        /* false implies "deferred sense" */
        bool passthrough;

        have_sense = scsi_command_normalize_sense(cmd, &hdr);
        /* Without valid sense data the error counts as current. */
        current_sense = !have_sense || !scsi_sense_is_deferred(&hdr);

        passthrough = blk_rq_is_passthrough(rq);

        /* SG_IO wants current and deferred errors */
        if (passthrough && have_sense)
                cmd->sense_len = min(8 + cmd->sense_buffer[7],
                                     SCSI_SENSE_BUFFERSIZE);

        /*
         * Passthrough requests get their status from the result directly.
         * So do zero-length requests such as flushes: they transfer no
         * data, so good_bytes != blk_rq_bytes(req) cannot signal an error
         * for them and *blk_statp has to be set explicitly.
         */
        if (current_sense && (passthrough || blk_rq_bytes(rq) == 0))
                *blk_statp = scsi_result_to_blk_status(result);

        /*
         * Recovered errors are reported but always count as success, so
         * the result code is adjusted here.
         */
        if (have_sense && hdr.sense_key == RECOVERED_ERROR) {
                /*
                 * ATA PASS-THROUGH INFORMATION AVAILABLE [0x0, 0x1d] is not
                 * printed since the caller wants the ATA registers. It only
                 * occurs on SCSI ATA PASS_THROUGH commands when CK_COND=1.
                 */
                bool ata_info = hdr.asc == 0x0 && hdr.ascq == 0x1d;

                if (!ata_info && !(rq->rq_flags & RQF_QUIET))
                        scsi_print_sense(cmd);
                result = 0;
                /* for passthrough, *blk_statp may be set */
                *blk_statp = BLK_STS_OK;
        }

        /*
         * One more corner case: a SCSI status byte that is non-zero yet
         * 'good'. Example: PRE-FETCH returns SAM_STAT_CONDITION_MET when
         * the nominated LBs fit in its cache (and SAM_STAT_GOOD when they
         * do not). SAM_STAT_CONDITION_MET and the related intermediate
         * statuses (both obsolete in SAM-4) are treated as good.
         */
        if ((result & 0xff) && scsi_status_is_good(result)) {
                result = 0;
                *blk_statp = BLK_STS_OK;
        }

        return result;
}

/**
 * scsi_io_completion - Completion processing for SCSI commands.
 * @cmd:        command that is finished.
 * @good_bytes:        number of processed bytes.
 *
 * We will finish off the specified number of sectors. If we are done, the
 * command block will be released and the queue function will be goosed. If we
 * are not done then we have to figure out what to do next:
 *
 *   a) We can call scsi_mq_requeue_cmd().  The request will be
 *        unprepared and put back on the queue.  Then a new command will
 *        be created for it.  This should be used if we made forward
 *        progress, or if we want to switch from READ(10) to READ(6) for
 *        example.
 *
 *   b) We can call scsi_io_completion_action().  The request will be
 *        put back on the queue and retried using the same command as
 *        before, possibly after a delay.
 *
 *   c) We can call scsi_end_request() with blk_stat other than
 *        BLK_STS_OK, to fail the remainder of the request.
 */
void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
{
        struct request *rq = scsi_cmd_to_rq(cmd);
        blk_status_t status = BLK_STS_OK;
        int res = cmd->result;

        /*
         * A non-zero result is not necessarily an error: the helper may
         * rewrite both the result and the block status.
         */
        if (unlikely(res))
                res = scsi_io_completion_nz_result(cmd, res, &status);

        /* Log how much of the request has been processed. */
        SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, cmd,
                "%u sectors total, %d bytes done.\n",
                blk_rq_sectors(rq), good_bytes));

        /*
         * Fast path: finish off the good bytes and leave if nothing is left.
         * A failed request with zero length skips this step and always
         * drops down to the retry code below.
         */
        if (likely(blk_rq_bytes(rq) > 0 || status == BLK_STS_OK) &&
            likely(!scsi_end_request(rq, status, good_bytes)))
                return; /* no bytes remaining */

        /* Failed and not retryable: fail everything that is left. */
        if (unlikely(status && scsi_noretry_cmd(cmd))) {
                if (scsi_end_request(rq, status, blk_rq_bytes(rq)))
                        WARN_ONCE(true,
                            "Bytes remaining after failed, no-retry command");
                return;
        }

        /* An error remains: let the completion-action code decide. */
        if (unlikely(res != 0)) {
                scsi_io_completion_action(cmd, res);
                return;
        }

        /* No error, but bytes are left over: queue the command up again. */
        scsi_mq_requeue_cmd(cmd, 0);
}

static inline bool scsi_cmd_needs_dma_drain(struct scsi_device *sdev,
                struct request *rq)
{
        return sdev->dma_drain_len && blk_rq_is_passthrough(rq) &&
               !op_is_write(req_op(rq)) &&
               sdev->host->hostt->dma_need_drain(rq);
}

/**
 * scsi_alloc_sgtables - Allocate and initialize data and integrity scatterlists
 * @cmd: SCSI command data structure to initialize.
 *
 * Initializes @cmd->sdb and also @cmd->prot_sdb if data integrity is enabled
 * for @cmd.
 *
 * Returns:
 * * BLK_STS_OK       - on success
 * * BLK_STS_RESOURCE - if the failure is retryable
 * * BLK_STS_IOERR    - if the failure is fatal
 */
blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd)
{
        struct scsi_device *sdev = cmd->device;
        struct request *rq = scsi_cmd_to_rq(cmd);
        unsigned short nr_segs = blk_rq_nr_phys_segments(rq);
        struct scatterlist *last_sg = NULL;
        blk_status_t ret;
        bool need_drain = scsi_cmd_needs_dma_drain(sdev, rq);
        int count;

        /* A request without any physical segment cannot be mapped. */
        if (WARN_ON_ONCE(!nr_segs))
                return BLK_STS_IOERR;

        /*
         * Make sure there is space for the drain.  The driver must adjust
         * max_hw_segments to be prepared for this.
         */
        if (need_drain)
                nr_segs++;

        /*
         * If sg table allocation fails, requeue request later.
         */
        if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, nr_segs,
                        cmd->sdb.table.sgl, SCSI_INLINE_SG_CNT)))
                return BLK_STS_RESOURCE;

        /*
         * Next, walk the list, and fill in the addresses and sizes of
         * each segment.
         */
        count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg);

        /*
         * If the request length has any of the dma_pad_mask bits set, grow
         * the last segment so that those low bits become zero.  The padding
         * is accounted in cmd->extra_len.
         * NOTE(review): the "+ 1" arithmetic presumably relies on
         * dma_pad_mask being of the form 2^k - 1 -- confirm.
         */
        if (blk_rq_bytes(rq) & rq->q->dma_pad_mask) {
                unsigned int pad_len =
                        (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;

                last_sg->length += pad_len;
                cmd->extra_len += pad_len;
        }

        /*
         * Append the device's drain buffer as one extra, final segment
         * (the slot was reserved by the nr_segs++ above).
         */
        if (need_drain) {
                sg_unmark_end(last_sg);
                last_sg = sg_next(last_sg);
                sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len);
                sg_mark_end(last_sg);

                cmd->extra_len += sdev->dma_drain_len;
                count++;
        }

        /* The mapping must never use more entries than were allocated. */
        BUG_ON(count > cmd->sdb.table.nents);
        cmd->sdb.table.nents = count;
        cmd->sdb.length = blk_rq_payload_bytes(rq);

        /* Requests carrying integrity data also need a protection sg table. */
        if (blk_integrity_rq(rq)) {
                struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
                int ivecs;

                if (WARN_ON_ONCE(!prot_sdb)) {
                        /*
                         * This can happen if someone (e.g. multipath)
                         * queues a command to a device on an adapter
                         * that does not support DIX.
                         */
                        ret = BLK_STS_IOERR;
                        goto out_free_sgtables;
                }

                ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);

                if (sg_alloc_table_chained(&prot_sdb->table, ivecs,
                                prot_sdb->table.sgl,
                                SCSI_INLINE_PROT_SG_CNT)) {
                        ret = BLK_STS_RESOURCE;
                        goto out_free_sgtables;
                }

                count = blk_rq_map_integrity_sg(rq->q, rq->bio,
                                                prot_sdb->table.sgl);
                BUG_ON(count > ivecs);
                BUG_ON(count > queue_max_integrity_segments(rq->q));

                /*
                 * prot_sdb was loaded from cmd->prot_sdb above, so this
                 * assignment stores back the same pointer.
                 */
                cmd->prot_sdb = prot_sdb;
                cmd->prot_sdb->table.nents = count;
        }

        return BLK_STS_OK;
out_free_sgtables:
        /* Error after the data table was set up: release the sg tables. */
        scsi_free_sgtables(cmd);
        return ret;
}
EXPORT_SYMBOL(scsi_alloc_sgtables);

/**
 * scsi_initialize_rq - initialize struct scsi_cmnd partially
 * @rq: Request associated with the SCSI command to be initialized.
 *
 * This function initializes the members of struct scsi_cmnd that must be
 * initialized before request processing starts and that won't be
 * reinitialized if a SCSI command is requeued.
 */
static void scsi_initialize_rq(struct request *rq)
{
        struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);

        /* Zeroed CDB buffer, with cmd_len preset to MAX_COMMAND_SIZE. */
        scmd->cmd_len = MAX_COMMAND_SIZE;
        memset(scmd->cmnd, 0, sizeof(scmd->cmnd));

        /* No sense data and no retries recorded yet. */
        scmd->sense_len = 0;
        scmd->retries = 0;

        /* Remember when the command was set up and prepare its RCU head. */
        scmd->jiffies_at_alloc = jiffies;
        init_rcu_head(&scmd->rcu);
}

/*
 * Allocate a request from @q with blk_mq_alloc_request() and, on success,
 * run scsi_initialize_rq() on it.
 *
 * Returns whatever blk_mq_alloc_request() returned; an error pointer is
 * handed back to the caller untouched.
 */
struct request *scsi_alloc_request(struct request_queue *q, blk_opf_t opf,
                                   blk_mq_req_flags_t flags)
{
        struct request *req = blk_mq_alloc_request(q, opf, flags);

        if (IS_ERR(req))
                return req;

        scsi_initialize_rq(req);
        return req;
}
EXPORT_SYMBOL_GPL(scsi_alloc_request);

/*
 * Only called when the request isn't completed by SCSI, and not freed by
 * SCSI
 */
static void scsi_cleanup_rq(struct request *rq)
{
        /* Nothing to undo unless the request was marked as prepared. */
        if (!(rq->rq_flags & RQF_DONTPREP))
                return;

        scsi_mq_uninit_cmd(blk_mq_rq_to_pdu(rq));
        rq->rq_flags &= ~RQF_DONTPREP;
}

/* Called before a request is prepared. See also scsi_mq_prep_fn(). */
void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
{
        struct request *rq = scsi_cmd_to_rq(cmd);

        if (!blk_rq_is_passthrough(rq) && !(cmd->flags & SCMD_INITIALIZED)) {
                cmd->flags |= SCMD_INITIALIZED;
                scsi_initialize_rq(rq);
        }

        cmd->device = dev;
        INIT_LIST_HEAD(&cmd->eh_entry);
        INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler);
}

/*
 * Set up the data buffer of a passthrough command.
 *
 * Returns BLK_STS_OK on success, or the status reported by
 * scsi_alloc_sgtables() when mapping the request's data fails.
 */
static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev,
                struct request *req)
{
        struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);

        /*
         * Passthrough requests may transfer data, in which case they must
         * have a bio attached to them.  Or they might contain a SCSI command
         * that does not transfer data, in which case they may be submitted
         * without an attached bio.
         */
        if (!req->bio) {
                /* No bio, so the request must not claim any data bytes. */
                BUG_ON(blk_rq_bytes(req));

                memset(&scmd->sdb, 0, sizeof(scmd->sdb));
        } else {
                blk_status_t sts = scsi_alloc_sgtables(scmd);

                if (unlikely(sts != BLK_STS_OK))
                        return sts;
        }

        scmd->transfersize = blk_rq_bytes(req);
        return BLK_STS_OK;
}

/*
 * Map the current state of @sdev to the block status a request for it
 * should get.  @req may be NULL; the RQF_PM checks are then skipped.
 *
 * Returns BLK_STS_OK if the command may proceed, otherwise the status to
 * report for the request.
 */
static blk_status_t
scsi_device_state_check(struct scsi_device *sdev, struct request *req)
{
        blk_status_t sts = BLK_STS_OK;

        switch (sdev->sdev_state) {
        case SDEV_CREATED:
                break;
        case SDEV_OFFLINE:
        case SDEV_TRANSPORT_OFFLINE:
                /*
                 * An offline device gets no commands at all; it has to be
                 * brought online before any recovery command is tried.
                 * The message is printed only once per offline period.
                 */
                if (!sdev->offline_already) {
                        sdev->offline_already = true;
                        sdev_printk(KERN_ERR, sdev,
                                    "rejecting I/O to offline device\n");
                }
                sts = BLK_STS_IOERR;
                break;
        case SDEV_DEL:
                /* A fully deleted device gets no commands either. */
                sdev_printk(KERN_ERR, sdev,
                            "rejecting I/O to dead device\n");
                sts = BLK_STS_IOERR;
                break;
        case SDEV_BLOCK:
        case SDEV_CREATED_BLOCK:
                sts = BLK_STS_RESOURCE;
                break;
        case SDEV_QUIESCE:
                /*
                 * A quiesced device only accepts power management
                 * commands; anything else triggers a one-time warning.
                 */
                if (req && WARN_ON_ONCE(!(req->rq_flags & RQF_PM)))
                        sts = BLK_STS_RESOURCE;
                break;
        default:
                /*
                 * Any other state that is not fully online only lets
                 * power management commands through.
                 */
                if (req && !(req->rq_flags & RQF_PM))
                        sts = BLK_STS_OFFLINE;
                break;
        }

        return sts;
}

/*
 * scsi_dev_queue_ready: if we can send requests to sdev, assign one token
 * and return the token else return -1.
 */
static inline int scsi_dev_queue_ready(struct request_queue *q,
                                  struct scsi_device *sdev)
{
        int budget = sbitmap_get(&sdev->budget_map);

        if (budget < 0)
                return -1;

        if (atomic_read(&sdev->device_blocked)) {
                /*
                 * The device is blocked.  Only unblock when no other
                 * command is pending and the device_blocked counter has
                 * just dropped to zero; otherwise hand the token back.
                 */
                if (scsi_device_busy(sdev) > 1 ||
                    atomic_dec_return(&sdev->device_blocked) > 0) {
                        sbitmap_put(&sdev->budget_map, budget);
                        return -1;
                }

                SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev,
                                 "unblocking device at zero depth\n"));
        }

        return budget;
}

/*
 * scsi_target_queue_ready: checks if we can send commands to target
 * @sdev: scsi device on starget to check.
 */
static inline int scsi_target_queue_ready(struct Scsi_Host *shost,
                                           struct scsi_device *sdev)
{
        struct scsi_target *starget = scsi_target(sdev);
        unsigned int busy;

        /*
         * Single-LUN target: under the host lock, refuse if a different
         * device is currently recorded as the target's user; otherwise
         * record @sdev as the user.
         */
        if (starget->single_lun) {
                spin_lock_irq(shost->host_lock);
                if (starget->starget_sdev_user &&
                    starget->starget_sdev_user != sdev) {
                        spin_unlock_irq(shost->host_lock);
                        return 0;
                }
                starget->starget_sdev_user = sdev;
                spin_unlock_irq(shost->host_lock);
        }

        /* No positive can_queue: target_busy is not touched at all. */
        if (starget->can_queue <= 0)
                return 1;

        /* 'busy' is the value of target_busy before our own increment. */
        busy = atomic_inc_return(&starget->target_busy) - 1;
        if (atomic_read(&starget->target_blocked) > 0) {
                if (busy)
                        goto starved;

                /*
                 * unblock after target_blocked iterates to zero
                 */
                if (atomic_dec_return(&starget->target_blocked) > 0)
                        goto out_dec;

                SCSI_LOG_MLQUEUE(3, starget_printk(KERN_INFO, starget,
                                 "unblocking target at zero depth\n"));
        }

        if (busy >= starget->can_queue)
                goto starved;

        return 1;

starved:
        /* Move the device to the tail of the host's starved list. */
        spin_lock_irq(shost->host_lock);
        list_move_tail(&sdev->starved_entry, &shost->starved_list);
        spin_unlock_irq(shost->host_lock);
out_dec:
        /* Undo the target_busy increment taken above. */
        if (starget->can_queue > 0)
                atomic_dec(&starget->target_busy);
        return 0;
}

/*
 * scsi_host_queue_ready: if we can send requests to shost, return 1 else
 * return 0. We must end up running the queue again whenever 0 is
 * returned, else IO can hang.
 */
static inline int scsi_host_queue_ready(struct request_queue *q,
                                   struct Scsi_Host *shost,
                                   struct scsi_device *sdev,
                                   struct scsi_cmnd *cmd)
{
        /* Note: @q is not referenced in this function body. */
        if (atomic_read(&shost->host_blocked) > 0) {
                /* Still blocked with commands outstanding: wait for them. */
                if (scsi_host_busy(shost) > 0)
                        goto starved;

                /*
                 * unblock after host_blocked iterates to zero
                 */
                if (atomic_dec_return(&shost->host_blocked) > 0)
                        goto out_dec;

                SCSI_LOG_MLQUEUE(3,
                        shost_printk(KERN_INFO, shost,
                                     "unblocking host at zero depth\n"));
        }

        if (shost->host_self_blocked)
                goto starved;

        /* We're OK to process the command, so we can't be starved */
        if (!list_empty(&sdev->starved_entry)) {
                /*
                 * The first check is done without the lock; re-check under
                 * host_lock before actually unlinking the entry.
                 */
                spin_lock_irq(shost->host_lock);
                if (!list_empty(&sdev->starved_entry))
                        list_del_init(&sdev->starved_entry);
                spin_unlock_irq(shost->host_lock);
        }

        /* Flag the command as in flight (non-atomic bit set). */
        __set_bit(SCMD_STATE_INFLIGHT, &cmd->state);

        return 1;

starved:
        /* Put the device on the host's starved list unless already queued. */
        spin_lock_irq(shost->host_lock);
        if (list_empty(&sdev->starved_entry))
                list_add_tail(&sdev->starved_entry, &shost->starved_list);
        spin_unlock_irq(shost->host_lock);
out_dec:
        /*
         * NOTE(review): scsi_dec_host_busy() is defined outside this chunk;
         * presumably it drops the in-flight accounting for @cmd -- confirm.
         */
        scsi_dec_host_busy(shost, cmd);
        return 0;
}

/*
 * Busy state exporting function for request stacking drivers.
 *
 * For efficiency, no lock is taken to check the busy state of
 * shost/starget/sdev, since the returned value is not guaranteed and
 * may be changed after request stacking drivers call the function,
 * regardless of taking lock or not.
 *
 * When scsi can't dispatch I/Os anymore and needs to kill I/Os scsi
 * needs to return 'not busy'. Otherwise, request stacking drivers
 * may hold requests forever.
 */
static bool scsi_mq_lld_busy(struct request_queue *q)
{
        struct scsi_device *sdev = q->queuedata;

        /* A dying queue must report 'not busy'. */
        if (blk_queue_dying(q))
                return false;

        /*
         * Only host recovery and the device's own busy state count here.
         * Host/starget congestion is deliberately ignored: the block layer
         * has no notion of fairness across multiple queues, so that has to
         * be handled inside the SCSI layer.
         */
        return scsi_host_in_recovery(sdev->host) || scsi_device_is_busy(sdev);
}

/*
 * Block layer request completion callback. May be called from interrupt
 * context.
 */
static void scsi_complete(struct request *rq)
{
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
        enum scsi_disposition disposition;

        INIT_LIST_HEAD(&cmd->eh_entry);

        atomic_inc(&cmd->device->iodone_cnt);
        if (cmd->result)
                atomic_inc(&cmd->device->ioerr_cnt);

        disposition = scsi_decide_disposition(cmd);
        if (disposition != SUCCESS && scsi_cmd_runtime_exceeced(cmd))
                disposition = SUCCESS;

        scsi_log_completion(cmd, disposition);

        switch (disposition) {
        case SUCCESS:
                scsi_finish_command(cmd);
                break;
        case NEEDS_RETRY:
                scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY);
                break;
        case ADD_TO_MLQUEUE:
                scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
                break;
        default:
                scsi_eh_scmd_add(cmd);
                break;
        }
}

/**
 * scsi_dispatch_cmd - Dispatch a command to the low-level driver.
 * @cmd: command block we are dispatching.
 *
 * Return: nonzero if the request was rejected and the device's queue needs
 * to be plugged.
 */
static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
{
        struct Scsi_Host *host = cmd->device->host;
        int rtn = 0;

        /*
         * Count the request up front; the counter is decremented again on
         * the paths below that return a busy code.
         */
        atomic_inc(&cmd->device->iorequest_cnt);

        /* check if the device is still usable */
        if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
                /* in SDEV_DEL we error all commands. DID_NO_CONNECT
                 * returns an immediate error upwards, and signals
                 * that the device is no longer present */
                cmd->result = DID_NO_CONNECT << 16;
                goto done;
        }

        /* Check to see if the scsi lld made this device blocked. */
        if (unlikely(scsi_device_blocked(cmd->device))) {
                /*
                 * in blocked state, the command is just put back on
                 * the device queue.  The suspend state has already
                 * blocked the queue so future requests should not
                 * occur until the device transitions out of the
                 * suspend state.
                 */
                SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
                        "queuecommand : device blocked\n"));
                atomic_dec(&cmd->device->iorequest_cnt);
                return SCSI_MLQUEUE_DEVICE_BUSY;
        }

        /*
         * Store the LUN value in cmnd, if needed: the low three bits of
         * the LUN go into the top three bits of CDB byte 1, the lower five
         * bits of that byte are preserved.
         */
        if (cmd->device->lun_in_cdb)
                cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |
                               (cmd->device->lun << 5 & 0xe0);

        scsi_log_send(cmd);

        /*
         * Before we queue this command, check if the command
         * length exceeds what the host adapter can handle.
         */
        if (cmd->cmd_len > cmd->device->host->max_cmd_len) {
                SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
                               "queuecommand : command too long. "
                               "cdb_size=%d host->max_cmd_len=%d\n",
                               cmd->cmd_len, cmd->device->host->max_cmd_len));
                cmd->result = (DID_ABORT << 16);
                goto done;
        }

        /* A host in SHOST_DEL state gets the same treatment as SDEV_DEL. */
        if (unlikely(host->shost_state == SHOST_DEL)) {
                cmd->result = (DID_NO_CONNECT << 16);
                goto done;

        }

        trace_scsi_dispatch_cmd_start(cmd);
        rtn = host->hostt->queuecommand(host, cmd);
        if (rtn) {
                atomic_dec(&cmd->device->iorequest_cnt);
                trace_scsi_dispatch_cmd_error(cmd, rtn);
                /*
                 * Normalize the LLD's return code: anything that is not a
                 * device-busy or target-busy code is reported as host busy.
                 */
                if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
                    rtn != SCSI_MLQUEUE_TARGET_BUSY)
                        rtn = SCSI_MLQUEUE_HOST_BUSY;

                SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
                        "queuecommand : request rejected\n"));
        }

        return rtn;
 done:
        /*
         * The command was failed locally with cmd->result already set:
         * complete it and return 0, since nothing is left to requeue.
         */
        scsi_done(cmd);
        return 0;
}

/* Size in bytes of the sg-list stored in the scsi-mq command-private data. */
static unsigned int scsi_mq_inline_sgl_size(struct Scsi_Host *shost)
{
        /* The inline list never holds more than SCSI_INLINE_SG_CNT entries. */
        unsigned int nr_inline = min_t(unsigned int, shost->sg_tablesize,
                                       SCSI_INLINE_SG_CNT);

        return nr_inline * sizeof(struct scatterlist);
}

/*
 * Reset the per-dispatch fields of the SCSI command embedded in @req and
 * build the command itself.
 *
 * Passthrough requests are set up by scsi_setup_scsi_cmnd().  All other
 * requests first go through the device handler's prep_fn(), if there is
 * one, and then through the driver's init_command().
 *
 * Returns BLK_STS_OK on success, otherwise the status reported by whichever
 * of those steps failed.
 */
static blk_status_t scsi_prepare_cmd(struct request *req)
{
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
        struct scsi_device *sdev = req->q->queuedata;
        struct Scsi_Host *shost = sdev->host;
        /* Sampled now because cmd->state is cleared further down. */
        bool in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state);
        struct scatterlist *sg;

        scsi_init_command(sdev, cmd);

        cmd->eh_eflags = 0;
        cmd->prot_type = 0;
        cmd->prot_flags = 0;
        cmd->submitter = 0;
        memset(&cmd->sdb, 0, sizeof(cmd->sdb));
        cmd->underflow = 0;
        cmd->transfersize = 0;
        cmd->host_scribble = NULL;
        cmd->result = 0;
        cmd->extra_len = 0;
        cmd->state = 0;
        /* Restore the in-flight bit that the reset above just wiped. */
        if (in_flight)
                __set_bit(SCMD_STATE_INFLIGHT, &cmd->state);

        /*
         * Only clear the driver-private command data if the LLD does not supply
         * a function to initialize that data.
         */
        if (!shost->hostt->init_cmd_priv)
                memset(cmd + 1, 0, shost->hostt->cmd_size);

        cmd->prot_op = SCSI_PROT_NORMAL;
        /* A request without data bytes has no DMA direction. */
        if (blk_rq_bytes(req))
                cmd->sc_data_direction = rq_dma_dir(req);
        else
                cmd->sc_data_direction = DMA_NONE;

        /*
         * The inline scatterlist sits right behind struct scsi_cmnd and the
         * cmd_size bytes of driver-private data.
         */
        sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
        cmd->sdb.table.sgl = sg;

        if (scsi_host_get_prot(shost)) {
                memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));

                /* The protection sg list directly follows its descriptor. */
                cmd->prot_sdb->table.sgl =
                        (struct scatterlist *)(cmd->prot_sdb + 1);
        }

        /*
         * Special handling for passthrough commands, which don't go to the ULP
         * at all:
         */
        if (blk_rq_is_passthrough(req))
                return scsi_setup_scsi_cmnd(sdev, req);

        /* Give an attached device handler the chance to veto or prepare. */
        if (sdev->handler && sdev->handler->prep_fn) {
                blk_status_t ret = sdev->handler->prep_fn(sdev, req);

                if (ret != BLK_STS_OK)
                        return ret;
        }

        /* Usually overridden by the ULP */
        cmd->allowed = 0;
        memset(cmd->cmnd, 0, sizeof(cmd->cmnd));
        return scsi_cmd_to_driver(cmd)->init_command(cmd);
}

/*
 * Common body of scsi_done() and scsi_done_direct().
 *
 * Commands submitted by the SCSI error handler are passed to scsi_eh_done(),
 * commands submitted by the reset ioctl are ignored.  For block layer
 * commands the request is completed at most once (guarded by
 * SCMD_STATE_COMPLETE), either directly through scsi_complete() when
 * @complete_directly is set, or through blk_mq_complete_request().
 */
static void scsi_done_internal(struct scsi_cmnd *cmd, bool complete_directly)
{
        struct request *rq = scsi_cmd_to_rq(cmd);

        switch (cmd->submitter) {
        case SUBMITTED_BY_SCSI_ERROR_HANDLER:
                scsi_eh_done(cmd);
                return;
        case SUBMITTED_BY_SCSI_RESET_IOCTL:
                return;
        case SUBMITTED_BY_BLOCK_LAYER:
                break;
        }

        /* Fault injection may ask us to drop the completion. */
        if (unlikely(blk_should_fake_timeout(rq->q)))
                return;

        /* Whoever sets the COMPLETE bit first owns the completion. */
        if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state)))
                return;

        trace_scsi_dispatch_cmd_done(cmd);

        if (!complete_directly) {
                blk_mq_complete_request(rq);
                return;
        }

        blk_mq_complete_request_direct(rq, scsi_complete);
}

/*
 * Report completion of @cmd.  Calls scsi_done_internal() with
 * complete_directly == false, so a block layer request is completed
 * through blk_mq_complete_request().
 */
void scsi_done(struct scsi_cmnd *cmd)
{
        scsi_done_internal(cmd, false);
}
EXPORT_SYMBOL(scsi_done);

/*
 * Report completion of @cmd.  Calls scsi_done_internal() with
 * complete_directly == true, so a block layer request is completed
 * through blk_mq_complete_request_direct() with scsi_complete() as the
 * completion function.
 */
void scsi_done_direct(struct scsi_cmnd *cmd)
{
        scsi_done_internal(cmd, true);
}
EXPORT_SYMBOL(scsi_done_direct);

/*
 * Give @budget_token back to the budget map of the SCSI device stored in
 * q->queuedata.  Installed as the .put_budget callback of the blk_mq_ops
 * tables in this file.
 */
static void scsi_mq_put_budget(struct request_queue *q, int budget_token)
{
        struct scsi_device *sdev = q->queuedata;

        sbitmap_put(&sdev->budget_map, budget_token);
}

/*
 * When to reinvoke queueing after a resource shortage. It's 3 msecs to
 * not change behaviour from the previous unplug mechanism, experimentation
 * may prove this needs changing.
 */
#define SCSI_QUEUE_DELAY 3

/*
 * Try to get a budget token for the SCSI device stored in q->queuedata.
 *
 * Returns the token (>= 0) obtained from scsi_dev_queue_ready().  When no
 * token is available, sdev->restarts is incremented, a delayed queue run
 * is scheduled if the device is neither busy nor blocked, and -1 is
 * returned.
 */
static int scsi_mq_get_budget(struct request_queue *q)
{
        struct scsi_device *sdev = q->queuedata;
        int token = scsi_dev_queue_ready(q, sdev);

        if (token >= 0)
                return token;

        /* Record that a dispatch attempt failed for lack of budget. */
        atomic_inc(&sdev->restarts);

        /*
         * Orders atomic_inc(&sdev->restarts) and atomic_read(&sdev->device_busy).
         * .restarts must be incremented before .device_busy is read because the
         * code in scsi_run_queue_async() depends on the order of these operations.
         */
        smp_mb__after_atomic();

        /*
         * If all in-flight requests originated from this LUN are completed
         * before reading .device_busy, sdev->device_busy will be observed as
         * zero, then blk_mq_delay_run_hw_queues() will dispatch this request
         * soon. Otherwise, completion of one of these requests will observe
         * the .restarts flag, and the request queue will be run for handling
         * this request, see scsi_end_request().
         */
        if (unlikely(scsi_device_busy(sdev) == 0 &&
                                !scsi_device_blocked(sdev)))
                blk_mq_delay_run_hw_queues(sdev->request_queue, SCSI_QUEUE_DELAY);
        return -1;
}

/*
 * Store budget @token in the SCSI command embedded in @req.  Installed as
 * the .set_rq_budget_token callback of the blk_mq_ops tables in this file.
 */
static void scsi_mq_set_rq_budget_token(struct request *req, int token)
{
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);

        cmd->budget_token = token;
}

/*
 * Return the budget token stored in the SCSI command embedded in @req.
 * Installed as the .get_rq_budget_token callback of the blk_mq_ops tables
 * in this file.
 */
static int scsi_mq_get_rq_budget_token(struct request *req)
{
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);

        return cmd->budget_token;
}

/*
 * .queue_rq callback of the blk_mq_ops tables in this file: check device,
 * target and host readiness, prepare the command if needed and hand it to
 * scsi_dispatch_cmd().
 *
 * Returns BLK_STS_OK once the command has been dispatched.  On any failure
 * the resources taken so far are released in reverse order (host busy
 * accounting, target_busy count, budget token) and the resulting block
 * status is returned.
 */
static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
                         const struct blk_mq_queue_data *bd)
{
        struct request *req = bd->rq;
        struct request_queue *q = req->q;
        struct scsi_device *sdev = q->queuedata;
        struct Scsi_Host *shost = sdev->host;
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
        blk_status_t ret;
        int reason;

        /* The command is expected to arrive holding a valid budget token. */
        WARN_ON_ONCE(cmd->budget_token < 0);

        /*
         * If the device is not in running state we will reject some or all
         * commands.
         */
        if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
                ret = scsi_device_state_check(sdev, req);
                if (ret != BLK_STS_OK)
                        goto out_put_budget;
        }

        /* Default status for the "not ready" exits below. */
        ret = BLK_STS_RESOURCE;
        if (!scsi_target_queue_ready(shost, sdev))
                goto out_put_budget;
        if (unlikely(scsi_host_in_recovery(shost))) {
                /* Commands flagged this way fail instead of waiting. */
                if (cmd->flags & SCMD_FAIL_IF_RECOVERING)
                        ret = BLK_STS_OFFLINE;
                goto out_dec_target_busy;
        }
        if (!scsi_host_queue_ready(q, shost, sdev, cmd))
                goto out_dec_target_busy;

        /*
         * RQF_DONTPREP marks a request whose command has already been
         * prepared; such a request only needs its COMPLETE bit cleared.
         */
        if (!(req->rq_flags & RQF_DONTPREP)) {
                ret = scsi_prepare_cmd(req);
                if (ret != BLK_STS_OK)
                        goto out_dec_host_busy;
                req->rq_flags |= RQF_DONTPREP;
        } else {
                clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
        }

        /* Keep only the preserved flags, then set the per-dispatch ones. */
        cmd->flags &= SCMD_PRESERVED_FLAGS;
        if (sdev->simple_tags)
                cmd->flags |= SCMD_TAGGED;
        if (bd->last)
                cmd->flags |= SCMD_LAST;

        scsi_set_resid(cmd, 0);
        memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
        cmd->submitter = SUBMITTED_BY_BLOCK_LAYER;

        blk_mq_start_request(req);
        reason = scsi_dispatch_cmd(cmd);
        if (reason) {
                /* Dispatch was rejected: record why and retry later. */
                scsi_set_blocked(cmd, reason);
                ret = BLK_STS_RESOURCE;
                goto out_dec_host_busy;
        }

        return BLK_STS_OK;

out_dec_host_busy:
        scsi_dec_host_busy(shost, cmd);
out_dec_target_busy:
        if (scsi_target(sdev)->can_queue > 0)
                atomic_dec(&scsi_target(sdev)->target_busy);
out_put_budget:
        scsi_mq_put_budget(q, cmd->budget_token);
        cmd->budget_token = -1;
        /* Translate the status and clean up according to how we failed. */
        switch (ret) {
        case BLK_STS_OK:
                break;
        case BLK_STS_RESOURCE:
                /* A blocked device is reported as a device resource issue. */
                if (scsi_device_blocked(sdev))
                        ret = BLK_STS_DEV_RESOURCE;
                break;
        case BLK_STS_AGAIN:
                cmd->result = DID_BUS_BUSY << 16;
                if (req->rq_flags & RQF_DONTPREP)
                        scsi_mq_uninit_cmd(cmd);
                break;
        default:
                if (unlikely(!scsi_device_online(sdev)))
                        cmd->result = DID_NO_CONNECT << 16;
                else
                        cmd->result = DID_ERROR << 16;
                /*
                 * Make sure to release all allocated resources when
                 * we hit an error, as we will never see this command
                 * again.
                 */
                if (req->rq_flags & RQF_DONTPREP)
                        scsi_mq_uninit_cmd(cmd);
                scsi_run_queue_async(sdev);
                break;
        }
        return ret;
}

/*
 * .init_request callback: allocate the sense buffer of the SCSI command
 * embedded in @rq, locate its protection data buffer when the host has
 * protection capabilities, and let the LLD initialize its private data.
 *
 * Returns 0 on success, -ENOMEM if the sense buffer cannot be allocated,
 * or the value returned by the LLD's init_cmd_priv().  The sense buffer is
 * freed again when that value is negative.
 */
static int scsi_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
                                unsigned int hctx_idx, unsigned int numa_node)
{
        struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
        struct Scsi_Host *shost = set->driver_data;
        int err;

        scmd->sense_buffer =
                kmem_cache_alloc_node(scsi_sense_cache, GFP_KERNEL, numa_node);
        if (!scmd->sense_buffer)
                return -ENOMEM;

        if (scsi_host_get_prot(shost)) {
                /*
                 * The inline sg list follows the command and the LLD's
                 * private data; the protection buffer comes after it.
                 */
                void *inline_sgl = (void *)scmd + sizeof(struct scsi_cmnd) +
                        shost->hostt->cmd_size;

                scmd->prot_sdb = inline_sgl + scsi_mq_inline_sgl_size(shost);
        }

        if (!shost->hostt->init_cmd_priv)
                return 0;

        err = shost->hostt->init_cmd_priv(shost, scmd);
        if (err < 0)
                kmem_cache_free(scsi_sense_cache, scmd->sense_buffer);

        return err;
}

/*
 * .exit_request callback: counterpart of scsi_mq_init_request().  Lets the
 * LLD tear down its private command data, if it provides exit_cmd_priv(),
 * and then frees the command's sense buffer.
 */
static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq,
                                 unsigned int hctx_idx)
{
        struct Scsi_Host *shost = set->driver_data;
        struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);

        if (shost->hostt->exit_cmd_priv)
                shost->hostt->exit_cmd_priv(shost, cmd);
        kmem_cache_free(scsi_sense_cache, cmd->sense_buffer);
}


/*
 * .poll callback: forward the poll to the host template's mq_poll() for
 * this hardware queue.  Returns 0 if the LLD provides no mq_poll().
 */
static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
        struct Scsi_Host *shost = hctx->driver_data;

        if (!shost->hostt->mq_poll)
                return 0;

        return shost->hostt->mq_poll(shost, hctx->queue_num);
}

/*
 * .init_hctx callback: store @data, taken to be the Scsi_Host, as the
 * hardware context's driver_data.  Always returns 0.
 */
static int scsi_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
                          unsigned int hctx_idx)
{
        struct Scsi_Host *shost = data;

        hctx->driver_data = shost;
        return 0;
}

/*
 * .map_queues callback: use the LLD's own map_queues() when it has one,
 * otherwise fall back to blk_mq_map_queues() on the default queue map.
 */
static void scsi_map_queues(struct blk_mq_tag_set *set)
{
        struct Scsi_Host *shost = container_of(set, struct Scsi_Host, tag_set);

        if (shost->hostt->map_queues)
                shost->hostt->map_queues(shost);
        else
                blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}

/*
 * Fill @lim with the queue limits implied by @shost and apply the segment
 * boundary and maximum segment size to the host's DMA device.
 *
 * @lim is zeroed first, so any previous content is discarded.  As a side
 * effect shost->sg_prot_tablesize is clamped when the host does DMA of
 * protection information.
 */
void scsi_init_limits(struct Scsi_Host *shost, struct queue_limits *lim)
{
        struct device *dma_dev = shost->dma_dev;

        memset(lim, 0, sizeof(*lim));

        /* Data segments: never more than SG_MAX_SEGMENTS. */
        lim->max_segments =
                min_t(unsigned short, shost->sg_tablesize, SG_MAX_SEGMENTS);

        /* Integrity segments, only for hosts that DMA protection data. */
        if (scsi_host_prot_dma(shost)) {
                shost->sg_prot_tablesize =
                        min_not_zero(shost->sg_prot_tablesize,
                                     (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS);
                BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
                lim->max_integrity_segments = shost->sg_prot_tablesize;
        }

        /* Transfer size and segment geometry are copied from the host. */
        lim->max_hw_sectors = shost->max_sectors;
        lim->max_segment_size = shost->max_segment_size;
        lim->seg_boundary_mask = shost->dma_boundary;
        lim->virt_boundary_mask = shost->virt_boundary_mask;

        /* Use the stricter of the host's and the cache-line alignment. */
        lim->dma_alignment = max_t(unsigned int,
                shost->dma_alignment, dma_get_cache_alignment() - 1);

        if (shost->no_highmem)
                lim->bounce = BLK_BOUNCE_HIGH;

        /* Mirror the segment constraints into the DMA device. */
        dma_set_seg_boundary(dma_dev, shost->dma_boundary);
        dma_set_max_seg_size(dma_dev, shost->max_segment_size);
}
EXPORT_SYMBOL_GPL(scsi_init_limits);

/*
 * blk-mq operations without a .commit_rqs callback.  Apart from that
 * missing callback the table is the same as scsi_mq_ops.
 */
static const struct blk_mq_ops scsi_mq_ops_no_commit = {
        /* Per-device budget handling. */
        .get_budget = scsi_mq_get_budget,
        .put_budget = scsi_mq_put_budget,
        .set_rq_budget_token = scsi_mq_set_rq_budget_token,
        .get_rq_budget_token = scsi_mq_get_rq_budget_token,

        /* Submission, completion and timeout handling. */
        .queue_rq = scsi_queue_rq,
        .complete = scsi_complete,
        .timeout = scsi_timeout,
        .poll = scsi_mq_poll,
        .busy = scsi_mq_lld_busy,

        /* Request lifetime. */
        .init_request = scsi_mq_init_request,
        .exit_request = scsi_mq_exit_request,
        .cleanup_rq = scsi_cleanup_rq,

        /* Hardware queue setup. */
        .init_hctx = scsi_init_hctx,
        .map_queues = scsi_map_queues,

#ifdef CONFIG_BLK_DEBUG_FS
        .show_rq = scsi_show_rq,
#endif
};


/*
 * blk-mq commit_rqs callback: hand the hardware queue number to the host
 * template's ->commit_rqs() hook.  The hook is called unconditionally, so
 * this must only be installed for templates that set it (scsi_mq_ops is
 * selected on exactly that condition in scsi_mq_setup_tags()).
 */
static void scsi_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
        struct Scsi_Host *host = hctx->driver_data;

        host->hostt->commit_rqs(host, hctx->queue_num);
}

/*
 * blk-mq operations used when the host template provides a ->commit_rqs()
 * hook (see scsi_mq_setup_tags()).  Same callbacks as scsi_mq_ops_no_commit,
 * plus .commit_rqs.
 */
static const struct blk_mq_ops scsi_mq_ops = {
        /* dispatch, completion and polling */
        .queue_rq               = scsi_queue_rq,
        .commit_rqs             = scsi_commit_rqs,
        .complete               = scsi_complete,
        .timeout                = scsi_timeout,
        .poll                   = scsi_mq_poll,
        .busy                   = scsi_mq_lld_busy,

        /* budget handling */
        .get_budget             = scsi_mq_get_budget,
        .put_budget             = scsi_mq_put_budget,
        .set_rq_budget_token    = scsi_mq_set_rq_budget_token,
        .get_rq_budget_token    = scsi_mq_get_rq_budget_token,

        /* hctx / request setup and teardown */
        .init_hctx              = scsi_init_hctx,
        .map_queues             = scsi_map_queues,
        .init_request           = scsi_mq_init_request,
        .exit_request           = scsi_mq_exit_request,
        .cleanup_rq             = scsi_cleanup_rq,

#ifdef CONFIG_BLK_DEBUG_FS
        .show_rq                = scsi_show_rq,
#endif
};

/*
 * scsi_mq_setup_tags - initialise and allocate a host's blk-mq tag set
 * @shost: host whose embedded tag_set is set up
 *
 * The per-request payload size is the scsi_cmnd, plus the host template's
 * cmd_size, plus inline scatterlist space (at least one scatterlist entry).
 * If scsi_host_get_prot() is non-zero, room for a scsi_data_buffer and
 * SCSI_INLINE_PROT_SG_CNT scatterlist entries is added on top.
 *
 * Returns the value of blk_mq_alloc_tag_set().
 */
int scsi_mq_setup_tags(struct Scsi_Host *shost)
{
        struct blk_mq_tag_set *set = &shost->tag_set;
        unsigned int sgl_size, cmd_size;

        sgl_size = max_t(unsigned int, sizeof(struct scatterlist),
                         scsi_mq_inline_sgl_size(shost));
        cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size + sgl_size;
        if (scsi_host_get_prot(shost))
                cmd_size += sizeof(struct scsi_data_buffer) +
                        sizeof(struct scatterlist) * SCSI_INLINE_PROT_SG_CNT;

        memset(set, 0, sizeof(*set));

        /* Only advertise ->commit_rqs to blk-mq if the LLD implements it. */
        set->ops = shost->hostt->commit_rqs ? &scsi_mq_ops :
                                              &scsi_mq_ops_no_commit;

        /* Unset (zero) queue and map counts default to one. */
        set->nr_hw_queues = shost->nr_hw_queues ? shost->nr_hw_queues : 1;
        set->nr_maps = shost->nr_maps ? shost->nr_maps : 1;

        set->queue_depth = shost->can_queue;
        set->cmd_size = cmd_size;
        set->numa_node = dev_to_node(shost->dma_dev);
        set->driver_data = shost;

        set->flags = BLK_MQ_F_SHOULD_MERGE |
                BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy);
        if (shost->queuecommand_may_block)
                set->flags |= BLK_MQ_F_BLOCKING;
        if (shost->host_tagset)
                set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;

        return blk_mq_alloc_tag_set(set);
}

void scsi_mq_free_tags(struct kref *kref)
{
        struct Scsi_Host *shost = container_of(kref, typeof(*shost),
                                               tagset_refcnt);

        blk_mq_free_tag_set(&shost->tag_set);
        complete(&shost->tagset_freed);
}

/**
 * scsi_device_from_queue - return sdev associated with a request_queue
 * @q: The request queue to return the sdev from
 *
 * Return the sdev associated with a request queue, with a reference taken
 * on its sdev_gendev via get_device(), or NULL if the request_queue does not
 * use one of the SCSI blk-mq operation tables, has no queuedata, or the
 * reference could not be taken.
 */
struct scsi_device *scsi_device_from_queue(struct request_queue *q)
{
        struct scsi_device *sdev;

        /* Only queues driven by one of the two SCSI op tables qualify. */
        if (q->mq_ops != &scsi_mq_ops_no_commit && q->mq_ops != &scsi_mq_ops)
                return NULL;

        sdev = q->queuedata;
        if (!sdev)
                return NULL;
        if (!get_device(&sdev->sdev_gendev))
                return NULL;

        return sdev;
}
/*
 * pktcdvd should have been integrated into the SCSI layers, but for historical
 * reasons like the old IDE driver it isn't.  This export allows it to safely
 * probe if a given device is a SCSI one and only attach to that.
 */
#ifdef CONFIG_CDROM_PKTCDVD_MODULE
EXPORT_SYMBOL_GPL(scsi_device_from_queue);
#endif

/**
 * scsi_block_requests - Utility function used by low-level drivers to prevent
 * further commands from being queued to the device.
 * @shost:  host in question
 *
 * Sets @shost->host_self_blocked; nothing else is done here.
 *
 * There is no timer nor any other means by which the requests get unblocked
 * other than the low-level driver calling scsi_unblock_requests().
 */
void scsi_block_requests(struct Scsi_Host *shost)
{
        shost->host_self_blocked = 1;
}
EXPORT_SYMBOL(scsi_block_requests);

/**
 * scsi_unblock_requests - Utility function used by low-level drivers to allow
 * further commands to be queued to the device.
 * @shost:  host in question
 *
 * Clears @shost->host_self_blocked and then runs the host's queues so that
 * held-back work can be dispatched again.
 *
 * There is no timer nor any other means by which the requests get unblocked
 * other than the low-level driver calling scsi_unblock_requests(). This is done
 * as an API function so that changes to the internals of the scsi mid-layer
 * won't require wholesale changes to drivers that use this feature.
 */
void scsi_unblock_requests(struct Scsi_Host *shost)
{
        /* Clear the flag before kicking the queues. */
        shost->host_self_blocked = 0;
        scsi_run_host_queues(shost);
}
EXPORT_SYMBOL(scsi_unblock_requests);

/* Tear-down counterpart for this file: destroys the scsi_sense_cache slab. */
void scsi_exit_queue(void)
{
        kmem_cache_destroy(scsi_sense_cache);
}

/**
 *        scsi_mode_select - issue a mode select
 *        @sdev:        SCSI device to be queried
 *        @pf:        Page format bit (1 == standard, 0 == vendor specific)
 *        @sp:        Save page bit (0 == don't save, 1 == save)
 *        @buffer: request buffer (may not be smaller than eight bytes)
 *        @len:        length of request buffer.
 *        @timeout: command timeout
 *        @retries: number of retries before failing
 *        @data: returns a structure abstracting the mode header data
 *        @sshdr: place to put sense data (or NULL if no sense to be collected).
 *                must be SCSI_SENSE_BUFFERSIZE big.
 *
 *        A temporary buffer holding the mode parameter header followed by a
 *        copy of @buffer is allocated, sent to the device and freed again.
 *
 *        Returns zero if successful; negative error number or scsi
 *        status on error
 *
 */
int scsi_mode_select(struct scsi_device *sdev, int pf, int sp,
                     unsigned char *buffer, int len, int timeout, int retries,
                     struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr)
{
        const struct scsi_exec_args exec_args = {
                .sshdr = sshdr,
        };
        unsigned char cmd[10] = { 0 };
        unsigned char *real_buffer;
        int header_len;
        int ret;

        cmd[1] = (pf ? 0x10 : 0) | (sp ? 0x01 : 0);

        /*
         * Use MODE SELECT(10) if the device asked for it or if the mode page
         * and the mode select header cannot fit within the maximum 255 bytes
         * of the MODE SELECT(6) command.
         */
        if (sdev->use_10_for_ms || len + 4 > 255 ||
            data->block_descriptor_length > 255) {
                if (len > 65535 - 8)
                        return -EINVAL;
                cmd[0] = MODE_SELECT_10;
                header_len = 8;
        } else {
                /* The 6-byte header has no room for the LONGLBA bit. */
                if (data->longlba)
                        return -EINVAL;
                cmd[0] = MODE_SELECT;
                header_len = 4;
        }

        /* Build "header + caller data" in a scratch buffer. */
        real_buffer = kmalloc(header_len + len, GFP_KERNEL);
        if (!real_buffer)
                return -ENOMEM;
        memcpy(real_buffer + header_len, buffer, len);
        len += header_len;

        if (cmd[0] == MODE_SELECT_10) {
                real_buffer[0] = 0;
                real_buffer[1] = 0;
                real_buffer[2] = data->medium_type;
                real_buffer[3] = data->device_specific;
                real_buffer[4] = data->longlba ? 0x01 : 0;
                real_buffer[5] = 0;
                put_unaligned_be16(data->block_descriptor_length,
                                   &real_buffer[6]);

                put_unaligned_be16(len, &cmd[7]);
        } else {
                real_buffer[0] = 0;
                real_buffer[1] = data->medium_type;
                real_buffer[2] = data->device_specific;
                real_buffer[3] = data->block_descriptor_length;

                cmd[4] = len;
        }

        ret = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_OUT, real_buffer, len,
                               timeout, retries, &exec_args);
        kfree(real_buffer);

        return ret;
}
EXPORT_SYMBOL_GPL(scsi_mode_select);

/**
 *        scsi_mode_sense - issue a mode sense, falling back from 10 to six bytes if necessary.
 *        @sdev:        SCSI device to be queried
 *        @dbd:        set to prevent mode sense from returning block descriptors
 *        @modepage: mode page being requested
 *        @subpage: sub-page of the mode page being requested
 *        @buffer: request buffer (may not be smaller than eight bytes)
 *        @len:        length of request buffer.
 *        @timeout: command timeout
 *        @retries: number of retries before failing
 *        @data: returns a structure abstracting the mode header data
 *        @sshdr: place to put sense data (or NULL if no sense to be collected).
 *                must be SCSI_SENSE_BUFFERSIZE big.
 *
 *        If a MODE SENSE(10) is rejected with ILLEGAL REQUEST, ASC 0x20,
 *        ASCQ 0 and @len fits in one byte, @sdev->use_10_for_ms is cleared
 *        and the request is reissued as MODE SENSE(6).
 *
 *        Returns zero if successful, or a negative error number on failure
 */
int
scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage, int subpage,
                  unsigned char *buffer, int len, int timeout, int retries,
                  struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr)
{
        struct scsi_sense_hdr my_sshdr;
        struct scsi_failure failure_defs[] = {
                {
                        .sense = UNIT_ATTENTION,
                        .asc = SCMD_FAILURE_ASC_ANY,
                        .ascq = SCMD_FAILURE_ASCQ_ANY,
                        .allowed = retries,
                        .result = SAM_STAT_CHECK_CONDITION,
                },
                {}
        };
        struct scsi_failures failures = {
                .failure_definitions = failure_defs,
        };
        const struct scsi_exec_args exec_args = {
                /* caller might not be interested in sense, but we need it */
                .sshdr = sshdr ? : &my_sshdr,
                .failures = &failures,
        };
        unsigned char cmd[12] = { 0 };
        int use_10_for_ms;
        int header_length;
        int result;

        memset(data, 0, sizeof(*data));

        if (sdev->set_dbd_for_ms)
                dbd = 8;
        cmd[1] = dbd & 0x18;        /* allows DBD and LLBA bits */
        cmd[2] = modepage;
        cmd[3] = subpage;

        /* From here on always look at the sense header actually in use. */
        sshdr = exec_args.sshdr;

        for (;;) {
                use_10_for_ms = sdev->use_10_for_ms || len > 255;

                if (use_10_for_ms) {
                        if (len < 8 || len > 65535)
                                return -EINVAL;

                        cmd[0] = MODE_SENSE_10;
                        put_unaligned_be16(len, &cmd[7]);
                        header_length = 8;
                } else {
                        if (len < 4)
                                return -EINVAL;

                        cmd[0] = MODE_SENSE;
                        cmd[4] = len;
                        header_length = 4;
                }

                memset(buffer, 0, len);

                result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer,
                                          len, timeout, retries, &exec_args);
                if (result < 0)
                        return result;

                if (scsi_status_is_good(result))
                        break;

                /*
                 * MODE_SENSE commands can return ILLEGAL REQUEST if the code
                 * page isn't supported, so only an "invalid command operation
                 * code" sense (ASC 0x20, ASCQ 0) is taken to mean that the
                 * command itself was the problem.  Anything else fails.
                 */
                if (!scsi_sense_valid(sshdr) ||
                    sshdr->sense_key != ILLEGAL_REQUEST ||
                    sshdr->asc != 0x20 || sshdr->ascq != 0)
                        return -EIO;

                /*
                 * Retry using MODE SENSE(6) if this was a MODE SENSE(10)
                 * request, except if the request mode page is too large for
                 * the MODE SENSE(6) single byte allocation length field.
                 */
                if (!use_10_for_ms || len > 255)
                        return -EIO;

                sdev->use_10_for_ms = 0;
        }

        if (unlikely(buffer[0] == 0x86 && buffer[1] == 0x0b &&
                     (modepage == 6 || modepage == 8))) {
                /* Initio breakage? */
                header_length = 0;
                data->length = 13;
                data->medium_type = 0;
                data->device_specific = 0;
                data->longlba = 0;
                data->block_descriptor_length = 0;
        } else if (use_10_for_ms) {
                /* 8-byte MODE SENSE(10) header */
                data->length = get_unaligned_be16(&buffer[0]) + 2;
                data->medium_type = buffer[2];
                data->device_specific = buffer[3];
                data->longlba = buffer[4] & 0x01;
                data->block_descriptor_length = get_unaligned_be16(&buffer[6]);
        } else {
                /* 4-byte MODE SENSE(6) header; longlba stays zero */
                data->length = buffer[0] + 1;
                data->medium_type = buffer[1];
                data->device_specific = buffer[2];
                data->block_descriptor_length = buffer[3];
        }
        data->header_length = header_length;

        return 0;
}
EXPORT_SYMBOL(scsi_mode_sense);

/**
 *        scsi_test_unit_ready - test if unit is ready
 *        @sdev:        scsi device to change the state of.
 *        @timeout: command timeout
 *        @retries: number of retries before failing
 *        @sshdr: output pointer for decoded sense information.
 *
 *        Returns the result of the last TEST UNIT READY attempt: zero if
 *        successful or an error if TUR failed.  For removable media,
 *        UNIT_ATTENTION sets ->changed flag.
 **/
int
scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries,
                     struct scsi_sense_hdr *sshdr)
{
        const struct scsi_exec_args exec_args = {
                .sshdr = sshdr,
        };
        char cmd[] = {
                TEST_UNIT_READY, 0, 0, 0, 0, 0,
        };
        int result;

        /* try to eat the UNIT_ATTENTION if there are enough retries */
        for (;;) {
                result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, NULL, 0,
                                          timeout, 1, &exec_args);

                /* Anything other than a UNIT ATTENTION ends the loop. */
                if (result <= 0 || !scsi_sense_valid(sshdr) ||
                    sshdr->sense_key != UNIT_ATTENTION)
                        break;

                if (sdev->removable)
                        sdev->changed = 1;

                if (!--retries)
                        break;
        }

        return result;
}
EXPORT_SYMBOL(scsi_test_unit_ready);

/**
 *        scsi_device_set_state - Take the given device through the device state model.
 *        @sdev:        scsi device to change the state of.
 *        @state:        state to change to.
 *
 *        Requesting the state the device is already in succeeds without
 *        touching the device.  On a successful change ->offline_already is
 *        cleared as well.
 *
 *        Returns zero if successful or an error if the requested
 *        transition is illegal.
 */
int
scsi_device_set_state(struct scsi_device *sdev, enum scsi_device_state state)
{
        enum scsi_device_state oldstate = sdev->sdev_state;

        /* Nothing to do, and nothing is reset, if the state is unchanged. */
        if (state == oldstate)
                return 0;

        /*
         * Outer switch: the requested state.  Inner switch: the set of
         * current states from which that transition is permitted; any other
         * current state is rejected.
         */
        switch (state) {
        case SDEV_CREATED:
                switch (oldstate) {
                case SDEV_CREATED_BLOCK:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_RUNNING:
                switch (oldstate) {
                case SDEV_CREATED:
                case SDEV_OFFLINE:
                case SDEV_TRANSPORT_OFFLINE:
                case SDEV_QUIESCE:
                case SDEV_BLOCK:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_QUIESCE:
                switch (oldstate) {
                case SDEV_RUNNING:
                case SDEV_OFFLINE:
                case SDEV_TRANSPORT_OFFLINE:
                        break;
                default:
                        goto illegal;
                }
                break;

        /* Both offline flavours accept the same set of previous states. */
        case SDEV_OFFLINE:
        case SDEV_TRANSPORT_OFFLINE:
                switch (oldstate) {
                case SDEV_CREATED:
                case SDEV_RUNNING:
                case SDEV_QUIESCE:
                case SDEV_BLOCK:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_BLOCK:
                switch (oldstate) {
                case SDEV_RUNNING:
                case SDEV_CREATED_BLOCK:
                case SDEV_QUIESCE:
                case SDEV_OFFLINE:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_CREATED_BLOCK:
                switch (oldstate) {
                case SDEV_CREATED:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_CANCEL:
                switch (oldstate) {
                case SDEV_CREATED:
                case SDEV_RUNNING:
                case SDEV_QUIESCE:
                case SDEV_OFFLINE:
                case SDEV_TRANSPORT_OFFLINE:
                        break;
                default:
                        goto illegal;
                }
                break;

        case SDEV_DEL:
                switch (oldstate) {
                case SDEV_CREATED:
                case SDEV_RUNNING:
                case SDEV_OFFLINE:
                case SDEV_TRANSPORT_OFFLINE:
                case SDEV_CANCEL:
                case SDEV_BLOCK:
                case SDEV_CREATED_BLOCK:
                        break;
                default:
                        goto illegal;
                }
                break;

        }
        /* Legal transition: commit the new state. */
        sdev->offline_already = false;
        sdev->sdev_state = state;
        return 0;

 illegal:
        /* The device state is left unchanged; only a diagnostic is logged. */
        SCSI_LOG_ERROR_RECOVERY(1,
                                sdev_printk(KERN_ERR, sdev,
                                            "Illegal state transition %s->%s",
                                            scsi_device_state_name(oldstate),
                                            scsi_device_state_name(state))
                                );
        return -EINVAL;
}
EXPORT_SYMBOL(scsi_device_set_state);

/**
 *        scsi_evt_emit - emit a single SCSI device uevent
 *        @sdev: associated SCSI device
 *        @evt: event to emit
 *
 *        Send a single KOBJ_CHANGE uevent (scsi_event) to the associated
 *        scsi_device.  Known event types add one environment variable;
 *        unknown types are sent with an empty environment.  An inquiry
 *        change additionally triggers a rescan of the device first.
 */
static void scsi_evt_emit(struct scsi_device *sdev, struct scsi_event *evt)
{
        char *env_var = NULL;
        char *envp[2];

        switch (evt->evt_type) {
        case SDEV_EVT_MEDIA_CHANGE:
                env_var = "SDEV_MEDIA_CHANGE=1";
                break;
        case SDEV_EVT_INQUIRY_CHANGE_REPORTED:
                scsi_rescan_device(sdev);
                env_var = "SDEV_UA=INQUIRY_DATA_HAS_CHANGED";
                break;
        case SDEV_EVT_CAPACITY_CHANGE_REPORTED:
                env_var = "SDEV_UA=CAPACITY_DATA_HAS_CHANGED";
                break;
        case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED:
                env_var = "SDEV_UA=THIN_PROVISIONING_SOFT_THRESHOLD_REACHED";
                break;
        case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED:
                env_var = "SDEV_UA=MODE_PARAMETERS_CHANGED";
                break;
        case SDEV_EVT_LUN_CHANGE_REPORTED:
                env_var = "SDEV_UA=REPORTED_LUNS_DATA_HAS_CHANGED";
                break;
        case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED:
                env_var = "SDEV_UA=ASYMMETRIC_ACCESS_STATE_CHANGED";
                break;
        case SDEV_EVT_POWER_ON_RESET_OCCURRED:
                env_var = "SDEV_UA=POWER_ON_RESET_OCCURRED";
                break;
        default:
                /* do nothing */
                break;
        }

        /* NULL-terminated; envp[0] is itself NULL for unknown event types. */
        envp[0] = env_var;
        envp[1] = NULL;

        kobject_uevent_env(&sdev->sdev_gendev.kobj, KOBJ_CHANGE, envp);
}

/**
 *        scsi_evt_thread - send a uevent for each scsi event
 *        @work: work struct for scsi_device
 *
 *        Dispatch queued events to their associated scsi_device kobjects
 *        as uevents.  Event types flagged in ->pending_events are first
 *        turned into queued events; the event list is then drained until
 *        it is found empty.
 */
void scsi_evt_thread(struct work_struct *work)
{
        struct scsi_device *sdev =
                container_of(work, struct scsi_device, event_work);
        enum scsi_device_event evt_type;
        LIST_HEAD(event_list);

        for (evt_type = SDEV_EVT_FIRST; evt_type <= SDEV_EVT_LAST; evt_type++) {
                if (!test_and_clear_bit(evt_type, sdev->pending_events))
                        continue;
                sdev_evt_send_simple(sdev, evt_type, GFP_KERNEL);
        }

        for (;;) {
                struct list_head *pos, *next;
                unsigned long flags;

                /* Grab everything queued so far; emit without the lock. */
                spin_lock_irqsave(&sdev->list_lock, flags);
                list_splice_init(&sdev->event_list, &event_list);
                spin_unlock_irqrestore(&sdev->list_lock, flags);

                if (list_empty(&event_list))
                        return;

                list_for_each_safe(pos, next, &event_list) {
                        struct scsi_event *evt =
                                list_entry(pos, struct scsi_event, node);

                        list_del(&evt->node);
                        scsi_evt_emit(sdev, evt);
                        kfree(evt);
                }
        }
}

/**
 *         sdev_evt_send - send asserted event to uevent thread
 *        @sdev: scsi_device event occurred on
 *        @evt: event to send
 *
 *        Assert scsi device event asynchronously: @evt is appended to
 *        @sdev->event_list under ->list_lock and ->event_work is scheduled.
 *        scsi_evt_thread() later emits the event and frees it with kfree().
 */
void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt)
{
        unsigned long flags;

#if 0
        /* FIXME: currently this check eliminates all media change events
         * for polled devices.  Need to update to discriminate between AN
         * and polled events */
        if (!test_bit(evt->evt_type, sdev->supported_events)) {
                kfree(evt);
                return;
        }
#endif

        /* irqsave variant: interrupt state is saved and restored here. */
        spin_lock_irqsave(&sdev->list_lock, flags);
        list_add_tail(&evt->node, &sdev->event_list);
        schedule_work(&sdev->event_work);
        spin_unlock_irqrestore(&sdev->list_lock, flags);
}
EXPORT_SYMBOL_GPL(sdev_evt_send);

/**
 *         sdev_evt_alloc - allocate a new scsi event
 *        @evt_type: type of event to allocate
 *        @gfpflags: GFP flags for allocation
 *
 *        Allocates and returns a new, zeroed scsi_event with its type set
 *        and its list node initialised, or NULL if the allocation failed.
 */
struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type,
                                  gfp_t gfpflags)
{
        struct scsi_event *evt;

        evt = kzalloc(sizeof(*evt), gfpflags);
        if (evt) {
                evt->evt_type = evt_type;
                INIT_LIST_HEAD(&evt->node);

                /* No event type needs any type-specific initialization. */
        }

        return evt;
}
EXPORT_SYMBOL_GPL(sdev_evt_alloc);

/**
 *         sdev_evt_send_simple - send asserted event to uevent thread
 *        @sdev: scsi_device event occurred on
 *        @evt_type: type of event to send
 *        @gfpflags: GFP flags for allocation
 *
 *        Assert scsi device event asynchronously, given an event type.
 *        If no event can be allocated the event is dropped and an error
 *        is logged.
 */
void sdev_evt_send_simple(struct scsi_device *sdev,
                          enum scsi_device_event evt_type, gfp_t gfpflags)
{
        struct scsi_event *evt;

        evt = sdev_evt_alloc(evt_type, gfpflags);
        if (evt) {
                sdev_evt_send(sdev, evt);
                return;
        }

        sdev_printk(KERN_ERR, sdev, "event %d eaten due to OOM\n",
                    evt_type);
}
EXPORT_SYMBOL_GPL(sdev_evt_send_simple);

/**
 *        scsi_device_quiesce - Block all commands except power management.
 *        @sdev:        scsi device to quiesce.
 *
 *        This works by trying to transition to the SDEV_QUIESCE state
 *        (which must be a legal transition).  When the device is in this
 *        state, only power management requests will be accepted, all others will
 *        be deferred.
 *
 *        Must be called with user context, may sleep.
 *
 *        Returns zero if successful or an error if not.
 */
int
scsi_device_quiesce(struct scsi_device *sdev)
{
        struct request_queue *q = sdev->request_queue;
        int err;

        /*
         * It is allowed to call scsi_device_quiesce() multiple times from
         * the same context but concurrent scsi_device_quiesce() calls are
         * not allowed.
         */
        WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current);

        /* Already quiesced by this very task: nothing more to do. */
        if (sdev->quiesced_by == current)
                return 0;

        blk_set_pm_only(q);

        blk_mq_freeze_queue(q);
        /*
         * Ensure that the effect of blk_set_pm_only() will be visible
         * for percpu_ref_tryget() callers that occur after the queue
         * unfreeze even if the queue was already frozen before this function
         * was called. See also https://lwn.net/Articles/573497/.
         */
        synchronize_rcu();
        blk_mq_unfreeze_queue(q);

        mutex_lock(&sdev->state_mutex);
        err = scsi_device_set_state(sdev, SDEV_QUIESCE);
        if (err) {
                /* Transition refused: undo the pm-only marking from above. */
                blk_clear_pm_only(q);
        } else {
                sdev->quiesced_by = current;
        }
        mutex_unlock(&sdev->state_mutex);

        return err;
}
EXPORT_SYMBOL(scsi_device_quiesce);

/**
 *        scsi_device_resume - Restart user issued commands to a quiesced device.
 *        @sdev:        scsi device to resume.
 *
 *        Moves the device from quiesced back to running and restarts the
 *        queues.
 *
 *        Must be called with user context, may sleep.
 */
void scsi_device_resume(struct scsi_device *sdev)
{
        mutex_lock(&sdev->state_mutex);

        /*
         * Only leave SDEV_QUIESCE ourselves.  If the device state was
         * mutated prior to resume, assume the state is being managed
         * elsewhere (for example device deleted during suspend).
         */
        if (sdev->sdev_state == SDEV_QUIESCE)
                scsi_device_set_state(sdev, SDEV_RUNNING);

        /* Drop the quiesce ownership and the matching pm-only marking. */
        if (sdev->quiesced_by != NULL) {
                sdev->quiesced_by = NULL;
                blk_clear_pm_only(sdev->request_queue);
        }

        mutex_unlock(&sdev->state_mutex);
}
EXPORT_SYMBOL(scsi_device_resume);

/*
 * starget_for_each_device() callback used by scsi_target_quiesce(): quiesce
 * one device.  @data is unused, and the return value of
 * scsi_device_quiesce() is discarded because the callback returns void.
 */
static void
device_quiesce_fn(struct scsi_device *sdev, void *data)
{
        scsi_device_quiesce(sdev);
}

/*
 * scsi_target_quiesce - call scsi_device_quiesce() on every device that
 * starget_for_each_device() visits for @starget.  Per-device failures are
 * not reported to the caller.
 */
void
scsi_target_quiesce(struct scsi_target *starget)
{
        starget_for_each_device(starget, NULL, device_quiesce_fn);
}
EXPORT_SYMBOL(scsi_target_quiesce);

/*
 * starget_for_each_device() callback used by scsi_target_resume(): resume
 * one device.  @data is unused.
 */
static void
device_resume_fn(struct scsi_device *sdev, void *data)
{
        scsi_device_resume(sdev);
}

/*
 * scsi_target_resume - call scsi_device_resume() on every device that
 * starget_for_each_device() visits for @starget.
 */
void
scsi_target_resume(struct scsi_target *starget)
{
        starget_for_each_device(starget, NULL, device_resume_fn);
}
EXPORT_SYMBOL(scsi_target_resume);

/*
 * Try to move @sdev to SDEV_BLOCK; if that transition is refused, try
 * SDEV_CREATED_BLOCK instead.  Only the device state is changed here, the
 * request queue is not touched.
 *
 * Returns 0 if either transition succeeded, otherwise the error from the
 * SDEV_CREATED_BLOCK attempt.
 */
static int __scsi_internal_device_block_nowait(struct scsi_device *sdev)
{
        int err;

        err = scsi_device_set_state(sdev, SDEV_BLOCK);
        if (err)
                err = scsi_device_set_state(sdev, SDEV_CREATED_BLOCK);

        return err;
}

/*
 * Counterpart of scsi_stop_queue(): atomically flips ->queue_stopped from
 * 1 to 0 and unquiesces the request queue, but only if the previous value
 * was non-zero, so that quiesce and unquiesce calls stay balanced.
 */
void scsi_start_queue(struct scsi_device *sdev)
{
        if (!cmpxchg(&sdev->queue_stopped, 1, 0))
                return;

        blk_mq_unquiesce_queue(sdev->request_queue);
}

/*
 * Start quiescing the request queue of @sdev unless it is already marked
 * as stopped.
 *
 * The atomic update of ->queue_stopped guarantees that every
 * blk_mq_quiesce_queue* call is balanced by exactly one
 * blk_mq_unquiesce_queue() (see scsi_start_queue()).
 *
 * Only the _nowait variant is used here: the caller needs to wait until
 * quiesce is done.
 */
static void scsi_stop_queue(struct scsi_device *sdev)
{
        if (cmpxchg(&sdev->queue_stopped, 0, 1))
                return;

        blk_mq_quiesce_queue_nowait(sdev->request_queue);
}

/**
 * scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state
 * @sdev: device to block
 *
 * Pause SCSI command processing on the specified device. Does not sleep.
 *
 * Returns zero if successful or a negative error code upon failure.
 *
 * Notes:
 * This routine transitions the device to the SDEV_BLOCK state (which must be
 * a legal transition). When the device is in this state, command processing
 * is paused until the device leaves the SDEV_BLOCK state. See also
 * scsi_internal_device_unblock_nowait().
 */
int scsi_internal_device_block_nowait(struct scsi_device *sdev)
{
        int ret;

        ret = __scsi_internal_device_block_nowait(sdev);
        if (ret)
                return ret;

        /*
         * The device has transitioned to SDEV_BLOCK.  Stop the
         * block layer from calling the midlayer with this device's
         * request queue.
         */
        scsi_stop_queue(sdev);

        return 0;
}
EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait);

/**
 * scsi_device_block - try to transition to the SDEV_BLOCK state
 * @sdev: device to block
 * @data: dummy argument, ignored
 *
 * Pause SCSI command processing on the specified device. Callers must wait
 * until all ongoing scsi_queue_rq() calls have finished after this function
 * returns.
 *
 * A failed transition is reported with a one-time warning rather than a
 * return value.
 *
 * Note:
 * This routine transitions the device to the SDEV_BLOCK state (which must be
 * a legal transition). When the device is in this state, command processing
 * is paused until the device leaves the SDEV_BLOCK state. See also
 * scsi_internal_device_unblock().
 */
static void scsi_device_block(struct scsi_device *sdev, void *data)
{
        enum scsi_device_state state;
        int err;

        mutex_lock(&sdev->state_mutex);

        err = __scsi_internal_device_block_nowait(sdev);
        /* Snapshot the state under the mutex for the warning below. */
        state = sdev->sdev_state;
        if (!err) {
                /*
                 * scsi_stop_queue() must be called with the state_mutex
                 * held. Otherwise a simultaneous scsi_start_queue() call
                 * might unquiesce the queue before we quiesce it.
                 */
                scsi_stop_queue(sdev);
        }

        mutex_unlock(&sdev->state_mutex);

        WARN_ONCE(err, "%s: failed to block %s in state %d\n",
                  __func__, dev_name(&sdev->sdev_gendev), state);
}

/**
 * scsi_internal_device_unblock_nowait - resume a device after a block request
 * @sdev:        device to resume
 * @new_state:        state to set the device to after unblocking; must be
 *                SDEV_RUNNING or SDEV_TRANSPORT_OFFLINE
 *
 * Restart the device queue for a previously suspended SCSI device. Does not
 * sleep.
 *
 * Returns zero if successful or a negative error code upon failure.
 *
 * Notes:
 * This routine transitions the device to the SDEV_RUNNING state or to one of
 * the offline states (which must be a legal transition) allowing the midlayer
 * to goose the queue for this device.
 */
int scsi_internal_device_unblock_nowait(struct scsi_device *sdev,
                                        enum scsi_device_state new_state)
{
        if (new_state != SDEV_RUNNING && new_state != SDEV_TRANSPORT_OFFLINE)
                return -EINVAL;

        /*
         * Try to transition the scsi device to SDEV_RUNNING or one of the
         * offlined states and goose the device queue if successful.
         */
        switch (sdev->sdev_state) {
        case SDEV_BLOCK:
        case SDEV_TRANSPORT_OFFLINE:
                sdev->sdev_state = new_state;
                break;
        case SDEV_CREATED_BLOCK:
                /* An offline target state is kept; otherwise go to CREATED. */
                sdev->sdev_state = (new_state == SDEV_TRANSPORT_OFFLINE ||
                                    new_state == SDEV_OFFLINE) ?
                                        new_state : SDEV_CREATED;
                break;
        case SDEV_CANCEL:
        case SDEV_OFFLINE:
                /* State stays as it is, but the queue is still restarted. */
                break;
        default:
                return -EINVAL;
        }

        scsi_start_queue(sdev);

        return 0;
}
EXPORT_SYMBOL_GPL(scsi_internal_device_unblock_nowait);

/**
 * scsi_internal_device_unblock - resume a device after a block request
 * @sdev:        device to resume
 * @new_state:        state to set the device to after unblocking
 *
 * Locked variant of scsi_internal_device_unblock_nowait(): the state change
 * and queue restart are performed with @sdev->state_mutex held. May sleep.
 *
 * Returns whatever scsi_internal_device_unblock_nowait() returns, i.e. zero
 * if successful or a negative error code upon failure.
 */
static int scsi_internal_device_unblock(struct scsi_device *sdev,
                                        enum scsi_device_state new_state)
{
        int rc;

        mutex_lock(&sdev->state_mutex);
        rc = scsi_internal_device_unblock_nowait(sdev, new_state);
        mutex_unlock(&sdev->state_mutex);

        return rc;
}

/*
 * device_for_each_child() callback: block every scsi_device below @dev when
 * @dev is a scsi_target; other children are ignored. Always returns 0.
 */
static int
target_block(struct device *dev, void *data)
{
        if (!scsi_is_target_device(dev))
                return 0;

        starget_for_each_device(to_scsi_target(dev), NULL, scsi_device_block);
        return 0;
}

/**
 * scsi_block_targets - transition all SCSI child devices to SDEV_BLOCK state
 * @shost: the Scsi_Host to which this device belongs
 * @dev: a parent device of one or more scsi_target devices
 *
 * Iterate over all children of @dev, which should be scsi_target devices,
 * and switch all subordinate scsi devices to SDEV_BLOCK state. Then wait,
 * via the tag set of @shost, for ongoing scsi_queue_rq() calls to finish.
 * May sleep.
 *
 * Note:
 * @dev must not itself be a scsi_target device; a one-time warning is
 * emitted if it is.
 */
void
scsi_block_targets(struct Scsi_Host *shost, struct device *dev)
{
        WARN_ON_ONCE(scsi_is_target_device(dev));
        device_for_each_child(dev, NULL, target_block);
        /* Blocking only stops the queues; wait here until that has taken effect */
        blk_mq_wait_quiesce_done(&shost->tag_set);
}
EXPORT_SYMBOL_GPL(scsi_block_targets);

/*
 * starget_for_each_device() callback: unblock @sdev. @data points to the
 * enum scsi_device_state the device should be put into.
 */
static void
device_unblock(struct scsi_device *sdev, void *data)
{
        enum scsi_device_state *wanted = data;

        scsi_internal_device_unblock(sdev, *wanted);
}

/*
 * device_for_each_child() callback: unblock every scsi_device below @dev when
 * @dev is a scsi_target; other children are ignored. @data is handed through
 * to device_unblock() unchanged. Always returns 0.
 */
static int
target_unblock(struct device *dev, void *data)
{
        if (!scsi_is_target_device(dev))
                return 0;

        starget_for_each_device(to_scsi_target(dev), data, device_unblock);
        return 0;
}

/*
 * Unblock the SCSI devices reachable from @dev and put them into @new_state.
 * @dev may be a scsi_target itself, or a parent whose children are
 * scsi_target devices.
 */
void
scsi_target_unblock(struct device *dev, enum scsi_device_state new_state)
{
        if (!scsi_is_target_device(dev)) {
                /* Parent device: visit each child target in turn */
                device_for_each_child(dev, &new_state, target_unblock);
                return;
        }

        starget_for_each_device(to_scsi_target(dev), &new_state,
                                device_unblock);
}
EXPORT_SYMBOL_GPL(scsi_target_unblock);

/**
 * scsi_host_block - Try to transition all logical units to the SDEV_BLOCK state
 * @shost: device to block
 *
 * Pauses SCSI command processing for every logical unit associated with
 * @shost, then waits until pending scsi_queue_rq() calls have finished.
 *
 * Returns zero if successful. If blocking one of the devices fails, the
 * iteration stops at that device and its negative error code is returned;
 * devices blocked before that point stay blocked.
 */
int
scsi_host_block(struct Scsi_Host *shost)
{
        struct scsi_device *sdev;
        int ret;

        /*
         * Use scsi_internal_device_block_nowait() for each LUN so that
         * synchronize_rcu() does not have to be called once per LUN; the
         * single wait happens after the loop.
         */
        shost_for_each_device(sdev, shost) {
                mutex_lock(&sdev->state_mutex);
                ret = scsi_internal_device_block_nowait(sdev);
                mutex_unlock(&sdev->state_mutex);

                if (!ret)
                        continue;

                /* Leaving the iterator early: release @sdev ourselves */
                scsi_device_put(sdev);
                return ret;
        }

        /* Wait for ongoing scsi_queue_rq() calls to finish. */
        blk_mq_wait_quiesce_done(&shost->tag_set);

        return 0;
}
EXPORT_SYMBOL_GPL(scsi_host_block);

/*
 * Unblock every SCSI device of @shost and put it into @new_state.
 *
 * Returns zero if all devices were unblocked. Otherwise the iteration stops
 * at the first device that fails and its error code is returned.
 */
int
scsi_host_unblock(struct Scsi_Host *shost, int new_state)
{
        struct scsi_device *sdev;

        shost_for_each_device(sdev, shost) {
                int rc = scsi_internal_device_unblock(sdev, new_state);

                if (rc) {
                        /* Leaving the iterator early: release @sdev ourselves */
                        scsi_device_put(sdev);
                        return rc;
                }
        }

        return 0;
}
EXPORT_SYMBOL_GPL(scsi_host_unblock);

/**
 * scsi_kmap_atomic_sg - find and atomically map an sg-element
 * @sgl:        scatter-gather list
 * @sg_count:        number of segments in sg
 * @offset:        offset in bytes into sg, on return offset into the mapped area
 * @len:        bytes to map, on return number of bytes mapped
 *
 * Locates the page that holds byte *@offset of the list and maps it. The
 * mapping never extends past the end of that page, so *@len is reduced when
 * the request would cross it. Warns if interrupts are not disabled.
 *
 * Returns virtual address of the start of the mapped page, or NULL (after
 * logging an error and warning) when *@offset lies beyond the data described
 * by the first @sg_count entries.
 */
void *scsi_kmap_atomic_sg(struct scatterlist *sgl, int sg_count,
                          size_t *offset, size_t *len)
{
        struct scatterlist *sg;
        struct page *page;
        size_t bytes_seen = 0;          /* bytes in all entries visited so far */
        size_t entry_start = 0;         /* bytes in entries before the current one */
        size_t in_entry, room;
        int idx;

        WARN_ON(!irqs_disabled());

        /* Walk the list until the entry containing byte *offset is reached */
        for_each_sg(sgl, sg, sg_count, idx) {
                entry_start = bytes_seen;
                bytes_seen += sg->length;
                if (*offset < bytes_seen)
                        break;
        }

        /* Ran off the end of the list: the offset is not covered by it */
        if (unlikely(idx == sg_count)) {
                printk(KERN_ERR "%s: Bytes in sg: %zu, requested offset %zu, elements %d\n",
                       __func__, bytes_seen, *offset, sg_count);
                WARN_ON(1);
                return NULL;
        }

        /* Offset counted from the start of the first page of this sg-entry */
        in_entry = *offset - entry_start + sg->offset;

        /* Assumption: contiguous pages can be accessed as "page + i" */
        page = nth_page(sg_page(sg), (in_entry >> PAGE_SHIFT));
        *offset = in_entry & ~PAGE_MASK;

        /* Clamp the length to what is left of the page after *offset */
        room = PAGE_SIZE - *offset;
        if (*len > room)
                *len = room;

        return kmap_atomic(page);
}
EXPORT_SYMBOL(scsi_kmap_atomic_sg);

/**
 * scsi_kunmap_atomic_sg - atomically unmap a virtual address, previously mapped with scsi_kmap_atomic_sg
 * @virt:        virtual address to be unmapped
 *
 * Thin wrapper that hands @virt to kunmap_atomic().
 */
void scsi_kunmap_atomic_sg(void *virt)
{
        kunmap_atomic(virt);
}
EXPORT_SYMBOL(scsi_kunmap_atomic_sg);

/*
 * Atomically increment @sdev's disk_events_disable_depth counter.
 * sdev_enable_disk_events() decrements the same counter, so calls are
 * expected to be paired. How a non-zero depth is acted upon is not visible
 * here -- presumably disk event handling consults it; confirm at the users
 * of the counter.
 */
void sdev_disable_disk_events(struct scsi_device *sdev)
{
        atomic_inc(&sdev->disk_events_disable_depth);
}
EXPORT_SYMBOL(sdev_disable_disk_events);

/*
 * Atomically decrement @sdev's disk_events_disable_depth counter, undoing one
 * sdev_disable_disk_events() call. If the counter is already zero or negative
 * the call is unbalanced: a one-time warning is emitted and the counter is
 * left untouched.
 */
void sdev_enable_disk_events(struct scsi_device *sdev)
{
        if (!WARN_ON_ONCE(atomic_read(&sdev->disk_events_disable_depth) <= 0))
                atomic_dec(&sdev->disk_events_disable_depth);
}
EXPORT_SYMBOL(sdev_enable_disk_events);

/*
 * designator_prio - rank one designation descriptor
 * @d: first byte of the descriptor; bytes 1 and 3 are always examined and
 *     byte 4 additionally for NAA designators
 *
 * Returns 0 when the descriptor must not be used as a LUN identifier and a
 * positive preference otherwise; a larger value means a better identifier.
 */
static unsigned char designator_prio(const unsigned char *d)
{
        unsigned char type;

        /* not associated with LUN */
        if ((d[1] & 0x30) != 0)
                return 0;

        /* invalid length */
        if (!d[3])
                return 0;

        /*
         * Order of preference for lun descriptor:
         * - SCSI name string
         * - NAA IEEE Registered Extended
         * - EUI-64 based 16-byte
         * - EUI-64 based 12-byte
         * - NAA IEEE Registered
         * - NAA IEEE Extended
         * - EUI-64 based 8-byte
         * - SCSI name string (truncated)
         * - T10 Vendor ID
         * as longer descriptors reduce the likelihood
         * of identification clashes.
         */
        type = d[1] & 0xf;

        /* SCSI name string, variable-length UTF-8 */
        if (type == 8)
                return 9;

        /* T10 vendor ID */
        if (type == 1)
                return 1;

        if (type == 3) {
                /* NAA: the variant is the high nibble of the first payload byte */
                switch (d[4] >> 4) {
                case 6:         /* NAA registered extended */
                        return 8;
                case 5:         /* NAA registered */
                        return 5;
                case 4:         /* NAA extended */
                        return 4;
                case 3:         /* NAA locally assigned */
                        return 1;
                default:
                        return 0;
                }
        }

        if (type == 2) {
                /* EUI-64 based: ranked by designator length */
                switch (d[3]) {
                case 16:
                        return 7;
                case 12:
                        return 6;
                case 8:
                        return 3;
                default:
                        return 0;
                }
        }

        return 0;
}

/**
 * scsi_vpd_lun_id - return a unique device identification
 * @sdev: SCSI device
 * @id:   buffer for the identification
 * @id_len:  length of the buffer
 *
 * Copies a unique device identification into @id based
 * on the information in the VPD page 0x83 of the device.
 * The string will be formatted as a SCSI name string.
 *
 * The designators of the page are scanned in order and the one with the
 * highest designator_prio() ranking wins. Scanning stops at the first
 * designator whose header or payload is not fully contained in the cached
 * page, so a truncated or malformed page is never read beyond its end.
 *
 * Returns the length of the identification or error on failure:
 * -ENXIO if the device has no cached VPD page 0x83, -EINVAL if @id_len is
 * smaller than 21 or no usable designator was found.
 * If the identifier is longer than the supplied buffer the actual
 * identifier length is returned and the buffer is not zero-padded.
 */
int scsi_vpd_lun_id(struct scsi_device *sdev, char *id, size_t id_len)
{
        u8 cur_id_prio = 0;
        u8 cur_id_size = 0;
        const unsigned char *d, *end, *cur_id_str;
        const struct scsi_vpd *vpd_pg83;
        int id_size = -EINVAL;

        rcu_read_lock();
        vpd_pg83 = rcu_dereference(sdev->vpd_pg83);
        if (!vpd_pg83) {
                rcu_read_unlock();
                return -ENXIO;
        }

        /* The id string must be at least 20 bytes + terminating NULL byte */
        if (id_len < 21) {
                rcu_read_unlock();
                return -EINVAL;
        }

        memset(id, 0, id_len);
        end = vpd_pg83->data + vpd_pg83->len;
        for (d = vpd_pg83->data + 4; d < end; d += d[3] + 4) {
                u8 prio;

                /*
                 * Each designator is a 4-byte header followed by d[3]
                 * payload bytes. Stop as soon as either part would extend
                 * past the end of the page instead of reading beyond the
                 * buffer.
                 */
                if (end - d < 4 || end - d < d[3] + 4)
                        break;

                prio = designator_prio(d);

                if (prio == 0 || cur_id_prio > prio)
                        continue;

                switch (d[1] & 0xf) {
                case 0x1:
                        /* T10 Vendor ID */
                        if (cur_id_size > d[3])
                                break;
                        cur_id_prio = prio;
                        cur_id_size = d[3];
                        /* Leave room for the "t10." prefix */
                        if (cur_id_size + 4 > id_len)
                                cur_id_size = id_len - 4;
                        cur_id_str = d + 4;
                        id_size = snprintf(id, id_len, "t10.%*pE",
                                           cur_id_size, cur_id_str);
                        break;
                case 0x2:
                        /* EUI-64 */
                        cur_id_prio = prio;
                        cur_id_size = d[3];
                        cur_id_str = d + 4;
                        switch (cur_id_size) {
                        case 8:
                                id_size = snprintf(id, id_len,
                                                   "eui.%8phN",
                                                   cur_id_str);
                                break;
                        case 12:
                                id_size = snprintf(id, id_len,
                                                   "eui.%12phN",
                                                   cur_id_str);
                                break;
                        case 16:
                                id_size = snprintf(id, id_len,
                                                   "eui.%16phN",
                                                   cur_id_str);
                                break;
                        default:
                                break;
                        }
                        break;
                case 0x3:
                        /* NAA */
                        cur_id_prio = prio;
                        cur_id_size = d[3];
                        cur_id_str = d + 4;
                        switch (cur_id_size) {
                        case 8:
                                id_size = snprintf(id, id_len,
                                                   "naa.%8phN",
                                                   cur_id_str);
                                break;
                        case 16:
                                id_size = snprintf(id, id_len,
                                                   "naa.%16phN",
                                                   cur_id_str);
                                break;
                        default:
                                break;
                        }
                        break;
                case 0x8:
                        /* SCSI name string */
                        if (cur_id_size > d[3])
                                break;
                        /* Prefer others for truncated descriptor */
                        if (d[3] > id_len) {
                                prio = 2;
                                if (cur_id_prio > prio)
                                        break;
                        }
                        cur_id_prio = prio;
                        /* Report the full length even if the copy is cut */
                        cur_id_size = id_size = d[3];
                        cur_id_str = d + 4;
                        if (cur_id_size >= id_len)
                                cur_id_size = id_len - 1;
                        memcpy(id, cur_id_str, cur_id_size);
                        break;
                default:
                        break;
                }
        }
        rcu_read_unlock();

        return id_size;
}
EXPORT_SYMBOL(scsi_vpd_lun_id);

/**
 * scsi_vpd_tpg_id - return a target port group identifier
 * @sdev: SCSI device
 * @rel_id: if not NULL, receives the relative target port identifier when
 *          both a target port group and a relative target port designator
 *          were found; left untouched otherwise
 *
 * Returns the Target Port Group identifier from the information
 * from VPD page 0x83 of the device.
 *
 * Scanning stops at the first designator whose header or payload is not
 * fully contained in the cached page, and designators too short to hold the
 * 16-bit identifier are ignored, so a truncated or malformed page is never
 * read beyond its end.
 *
 * Returns the identifier or error on failure: -ENXIO if the device has no
 * cached VPD page 0x83, -EAGAIN if the page holds no target port group
 * designator.
 */
int scsi_vpd_tpg_id(struct scsi_device *sdev, int *rel_id)
{
        const unsigned char *d, *end;
        const struct scsi_vpd *vpd_pg83;
        int group_id = -EAGAIN, rel_port = -1;

        rcu_read_lock();
        vpd_pg83 = rcu_dereference(sdev->vpd_pg83);
        if (!vpd_pg83) {
                rcu_read_unlock();
                return -ENXIO;
        }

        end = vpd_pg83->data + vpd_pg83->len;
        for (d = vpd_pg83->data + 4; d < end; d += d[3] + 4) {
                /*
                 * Each designator is a 4-byte header followed by d[3]
                 * payload bytes. Stop as soon as either part would extend
                 * past the end of the page.
                 */
                if (end - d < 4 || end - d < d[3] + 4)
                        break;

                /* The identifier sits in payload bytes 2-3, i.e. d[6..7] */
                if (d[3] < 4)
                        continue;

                switch (d[1] & 0xf) {
                case 0x4:
                        /* Relative target port */
                        rel_port = get_unaligned_be16(&d[6]);
                        break;
                case 0x5:
                        /* Target port group */
                        group_id = get_unaligned_be16(&d[6]);
                        break;
                default:
                        break;
                }
        }
        rcu_read_unlock();

        if (group_id >= 0 && rel_id && rel_port != -1)
                *rel_id = rel_port;

        return group_id;
}
EXPORT_SYMBOL(scsi_vpd_tpg_id);

/**
 * scsi_build_sense - build sense data for a command
 * @scmd:        scsi command for which the sense should be formatted
 * @desc:        Sense format (non-zero == descriptor format,
 *              0 == fixed format)
 * @key:        Sense key
 * @asc:        Additional sense code
 * @ascq:        Additional sense code qualifier
 *
 * Formats the sense data into the sense buffer of @scmd and sets the result
 * of @scmd to SAM_STAT_CHECK_CONDITION.
 **/
void scsi_build_sense(struct scsi_cmnd *scmd, int desc, u8 key, u8 asc, u8 ascq)
{
        unsigned char *sense = scmd->sense_buffer;

        scsi_build_sense_buffer(desc, sense, key, asc, ascq);
        scmd->result = SAM_STAT_CHECK_CONDITION;
}
EXPORT_SYMBOL_GPL(scsi_build_sense);

#ifdef CONFIG_SCSI_LIB_KUNIT_TEST
#include "scsi_lib_test.c"
#endif


r0 = openat$nullb(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
dup(r0)
r1 = open(&(0x7f0000000000)='./bus\x00', 0x60142, 0x0)
r2 = creat(&(0x7f0000000040)='./bus\x00', 0x0)
syz_mount_image$vfat(&(0x7f00000002c0), &(0x7f0000000280)='./file0\x00', 0x400, &(0x7f0000002e00)=ANY=[@ANYBLOB="756e695f786c6174653d312c73686f72746e616d653d77696e39352c756e695f786c6174653d312c646f733178666c6f7070792c757466383d312c757466383d302c73686f72746e616d653d77696e6e742c73686f72746e616d653d6c6f7765722c756e695f786c6174653d312c007cdba2387840af9753687ae00b5f3e526fc15834b0046f2490f31d14f518c2d8e433c09c39f50d9aa6dc30cddebb136edb4708a3bc3eb96cd4b7ff5bf01e31e5211daff25f43b8000000c4c30113f9e8fe7f54eaa3c6ebabc996a37f48c781843ff765415f45cea3c19a8161392a020ad113e28486773fecdf8a9e828203bf823a48bb925a495a40cc44d9f26833a1eae4121d85eab4bcfee04e6eaab54b29"], 0x0, 0x237, &(0x7f00000009c0)="$eJzs3EGLG2UYB/DHbW23W9rsQQQF8UUvehm68RMEaUEMKGsj6kGYuhMNGZMlE1YiYnvz6ucoHr0J6hfYizfv3vYieOlBjDTJtkmNiOA62vx+EN4nvPnDM2QSnglkTt758uN+t8q6+Ti2tlNsRdyJexG796uFJxbr1qy+EMvuxMuXf/nxubfefe/1Vrt9fT+lG62brzRTSlef//aTz7564fvx5be/vvrNxTjeff/k5+ZPx08fP3Py282PelXqVWkwHKc83RoOx/mtskgHvaqfpfRmWeRVkXqDqhit7HfL4eHhJOWDgys7h6OiqlI+mKR+MUnjYRqPJin/MO8NUpZl6cpO8Fc6d/f381bdXXC2RqNWfi4iLv1hp3O3loYAgFqZ/zeZ+X8T3J//dxaf31XmfwAAAAAAAAAAAAAA+D+4N502ptNp43Q9fVyMiO2IOH1ed5+cDe//Zlv64952RPnFUeeoM1/n+61u9KKMIq5FI36dnQ8L8/rGa+3r19LMbnxX3l7kbx91zq3m96IRu+vze/N8Ws0/GTvL+WY04qn1+eba/IV46cWlfBaN+OGDGEYZB7Pz+mH+872UXn2j/Uj+0ux1AAAA8DjI0gNrr9+z7M/25/m/8fvAI9fX5+PZ8/UeOwAAAGyKavJpPy/LYqRQKBQPirq/mQAAgH/aw6G/7k4AAAAAAAAAAAAAAAAAAABgc/0btxOr+xgBAAAAAAAAAAAAAAAAAAAAAADgv+L3AAAA///u3zAA")
r3 = openat(0xffffffffffffff9c, &(0x7f0000004280)='./file0\x00', 0x0, 0x0)
syz_mount_image$ext4(&(0x7f00000002c0)='ext4\x00', &(0x7f0000000180)='./bus\x00', 0xe, &(0x7f00000005c0)={[{@barrier_val={'barrier', 0x3d, 0x101}}, {@errors_remount}]}, 0x3, 0x445, &(0x7f0000000b00)="$eJzs28+PE1UcAPDvTLeLCLgr4g9+qKto3PhjlwVUDh7UaOIBExM96HGzuxCksIZdEyFEwRg8GWPi3Xj0X/CkF2M8mXjVuyEhhgvgqWbaGbYtbdktLUX6+SQD78282fe+nXnte/PaAEbWVPZPErE1Iv6MiIl6trnAVP2/q5fPLly7fHYhiWr13X+SWrkrl88uFEWL87bkmek0Iv0iid1t6l05feb4fKWydCrPz66e+Gh25fSZF46dmD+6dHTp5P5Dhw4emHv5pf0v9iXOrE1Xdn26vGfnWx988/bhr5rib4mjT6a6HXy6Wu1zdcO1rSGdjA2xIWxIKSKyy1Wu9f+JKMXaxZuINz8fauOAgapWq9UtnQ+fqwJ3sSSa87o8jIrigz6b/xZb6yDg1cENP4bu0mv1CVAW99V8qx8ZizQvU26Z3/bTVES8f+7f77ItBvMcAgCgyU/Z+Of5duO/NB5qKHdfvjY0GRH3R8T2iHggInZExIMRtbIPR8QjG6y/dZHkxvFPerGnwNYpG/+9kq9tNY//itFfTJby3LZa/OXkyLHK0r78NZmO8qYsP9eljp/f+OPrTscax3/ZltVfjAXzdlwc29R8zuL86vytxNzo0vmIXWPt4k+urwQkEbEzInb1WMexZ3/Y0+nYzePvog/rTNXvI56pX/9z0RJ/Iem+Pjl7T1SW9s0Wd8WNfvv9wjud6r+l+Psgu/73tr3/r8c/mTSu165svI4Lf33ZcU7T6/0/nrxXS4/n+z6ZX109NRcxnhyuN7px//61c4t8UT6Lf3pv+/6/PdZeid0Rkd3Ej0bEYxHxeN72JyLiyYjY2yX+X19/6sPe4x+sLP7FDV3/tcR4tO5pnygd/+XHpkonb4j/Wvfrf7CWms73rOf9bz3t6u1uBgAAgP+fNCK2RpLOXE+n6cxM/fvyOyLSyvLK6nNHlj8+uVj/jcBklNPiSddEw/PQuXxaX8+fj4j6VwuK4wfy58bfljbX8jMLy5XFYQcPI25Lh/6f+bs07NYBA+f3WjC69H8YXfo/jC79H0ZXm/6/eRjtAG6/dp//nw2hHcDt19L/LfvBCDH/h9Gl/8Po0v9hJK1sjpv/SL5rovhLPZ5+1yaifEc0Y2CJSO+IZkgMKDHc9yUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIB++S8AAP///fHg0g==")
r4 = openat(0xffffffffffffff9c, &(0x7f0000000000)='.\x00', 0x0, 0x0)
open_by_handle_at(r4, &(0x7f00000000c0)=ANY=[@ANYBLOB="18000000020000000b"], 0x0)
getdents64(r3, 0xfffffffffffffffe, 0x29)
readv(0xffffffffffffffff, &(0x7f00000002c0)=[{&(0x7f0000000040)=""/129, 0x81}, {0x0}], 0x2)
pwrite64(r2, &(0x7f0000000280)='+', 0x1, 0x0)
r5 = open(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
copy_file_range(r5, 0x0, r1, &(0x7f00000000c0)=0x10000, 0x6, 0x0)
r6 = open(&(0x7f00000005c0)='./bus\x00', 0x64842, 0x0)
pwritev2(r6, &(0x7f0000000240)=[{&(0x7f0000000000)="85", 0x64000}], 0x1, 0x7c00, 0x0, 0x3)


syz_mount_image$ext4(&(0x7f0000000180)='ext4\x00', &(0x7f00000000c0)='./file1\x00', 0x204410, &(0x7f0000000740), 0xfe, 0x4a1, &(0x7f00000001c0)="$eJzs3M1vVFUbAPDn3mnLN+3Li6h8SBWNjR8tLags3Gh0p4mJLnBjUttCKgM1tCRCiFZjcGlI3BvdGKJ/gSvdGHVl4lb3hoQoMQFdmDF35t4yU2ZKW6YdcH6/5JZz5p7pOc+ce+499x6mAXStwexHErE1In6JiP5atrHAYO2f61fPT/x19fxEEpXKq78n1XLXrp6fKIoW79uSZ4bSiPTDJK+k0ezZcyfGy+Wp03l+ZO7k2yOzZ889OX1y/PjU8alTY0eOHD40+szTY0+1Jc4srmu7353Zm/S8fvHliaMX3/zhqzQidu2r7a+P47akWxaSg1ngf1SqFhd7pC2V3Tm21aWTng42hBUpRUTWXb3V8d8fpbjRef3x4gcdbRywprJr04bWu+crwH9YEp1uAdAZxYU+u/8ttnWaetwRrjxXuwHK4r6eb7U9PZHmZXrXsP77IuLo/N+fZlvk/fDP1jWsEADoet9k858nms3/0thVV257voYyEBH/i4gdEfH/iNgZEfdEVMvem89nVqK2NFRayN88/0wvrzq4Zcjmf8/ma1uN879i9hcDpTy3rRp/b3Jsujx1MP9MhqJ3Q5YfXaKOb1/4+eNW+wbr5n/ZltVfzAXzdlzuWfSAbnJ8brxdk9Ir70fs7mkWf7KwEpBExP0RsXtlv3p7kZh+7NLeVoVuHf8S2rDOVPks4tFa/8/HovgLydLrkyMbozx1cKQ4Km72408XXmle+8bbi78Nsv7f3Hj8LyrR/2dSv147u/I6Lvz6Uct7ytUe/33Ja9Ux2Ze/9s743Nzp0Yi+5KVqvuH1sRvvLfJF+Sz+oQPNx/+O/D1Z/HsiIjuI90XEAxGxP2/7gxHxUEQcWCL+759/+K0VxT+9vv0/2fT8t3D8DzT2/8oTpRPffd2q/jz+4mTbov8PV1ND+SvV898ttG5OlKciKpVVH80AAABw98luvLdGkg4vpNN0eLj2f/h3xua0PDM79/ixmTOnJmvfERiI3rR40tWfPw/N7rZHk/n8N9aej47lz4qL56WH8ufGn5Q2VfPDEzPlyQ7HDt1uS4vxn/mt1OnWAWvO97Wgey0e/2mH2gGsP9d/6F7GP3Qv4x+6V934//LMhT3VxHvVn/sXdjRdC1jiL4cAd4dF1/9Ln3eqIcC6M/+H7mX8Q/cy/qEr3c73+juT2JS3/FaF+zrf1NUlvuidLWXx1e1KeiI637DGRKRLlXkjmu8ajIg1aljcER9LuxPJMg715SaOHc+HznIKd/KsBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA0D7/BgAA//9ajd4t")
mkdir(&(0x7f0000000300)='./bus\x00', 0x0)
creat(&(0x7f0000000e00)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0)
mknod$loop(&(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, 0x1)
link(&(0x7f0000001240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000bc0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
rename(&(0x7f00000003c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000000f40)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')


r0 = open(&(0x7f0000000180)='./bus\x00', 0x14927e, 0x0)
mmap(&(0x7f0000000000/0x600000)=nil, 0x600000, 0x27fffff, 0x4002011, r0, 0x0)
fallocate(r0, 0x0, 0x0, 0x1001f0)
r1 = open(&(0x7f0000000100)='./bus\x00', 0x141042, 0x0)
fallocate(r1, 0x3, 0x1800, 0x10000)
r2 = syz_open_procfs(0x0, &(0x7f0000000000)='ns\x00')
readlinkat(r2, &(0x7f0000000180)='./cgroup\x00', &(0x7f0000002780)=""/4112, 0x1010)


syz_mount_image$erofs(&(0x7f0000000040), &(0x7f00000001c0)='./file0\x00', 0x0, &(0x7f0000000240)=ANY=[], 0x0, 0x17d, &(0x7f0000001ac0)="$eJzsmLFP+kAUx7/vyg/yMy6uLg4SxcHSFjUuxLA5mogaNwlUghYx0EGYdPH/cHZwdvOPMM7qYFwY3Uxqej3oQQR10MT4PsPj+7h313evyXcoGIb5szw+vNyvFe+EAWASaaTU/89GXCO0+tfb83Jraj1/OfeUv041robPIwBB8PnnJwDcFAz4Kg+Cwd1p9VuE6OstCCwovQOCqfQeBLaVdkHYVfpA042w3jT3a55rlhteJRRWGOwwOGHIDffXPSNUtP5IW2+1O4clz3Ob3yg+ml+3IJDX+tPfV282ljY/GwK20jkQNpVeRao3m2gk2v2nE/H5xg/fnwULFr9NxP4UXBDmNX9KaP6R9evH2Va7s1irl6pu1T1ynNyKtWRZy05WGlEUx/jff+lPE9r5/0bUJimJk5LvN+0o9nMniu85rpD+J5CZjfLQ+5Mju4nWSe0jqTLGmHKGYRiGYRiGYRiGYRiGYZgvMAOSX0EldIo4GcDZkNVvAQAA///an3MA")
mkdirat(0xffffffffffffff9c, &(0x7f0000000340)='./file1\x00', 0x0)
mkdir(&(0x7f0000000300)='./bus\x00', 0x0)
mount$overlay(0x0, &(0x7f00000000c0)='./bus\x00', &(0x7f0000000080), 0x0, &(0x7f0000000900)={[{@upperdir={'upperdir', 0x3d, './file1'}}, {@lowerdir={'lowerdir', 0x3d, './file0'}}, {@workdir={'workdir', 0x3d, './bus'}}]})
chdir(&(0x7f00000003c0)='./bus\x00')
r0 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f00000002c0)='freezer.parent_freezing\x00', 0x275a, 0x0)
readahead(r0, 0x0, 0x0)


syz_mount_image$btrfs(&(0x7f0000005100), &(0x7f0000005140)='mnt\x00', 0x0, &(0x7f0000000040), 0x1, 0x50e7, &(0x7f000000a2c0)="$eJzs3U+IVVUcB/DzZpxxUpl5gcbUbGwrgeIiSDEHI2jC4JWrCnR0EYSQgxTUQhBdSLRoQAl0pYRCITE7Ny6kwBBCaRdUECFCiCC1kP4sYt6958595/rue45jY/r5xMy95/7uOfe8x13M9+W5LwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAI4dxTe07V1bdMbVh3fmrnjRO7r945d+zylRAa7eONvL5nxytvvLNrz2sjscP069m22ew2ZNb1l6wx3HFwvl/nz94QwlAywGC+fXmwMmp592B1wFpH1p6cGL+14+KZibNrth9qHKi+dOaNLPcElkt+X11fuJcm278HkjOKdunWa3Tcoln/9Ib7T14EAHBPNrXam+LP0fxP3KJ9OK0n7cmkPZu0418Is+XGYmTjDneb5/q0vkzznMyiwsqu80zq+ftftFtp/6SdRI17mGfnqXmkGek2z5mkvlzzBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHiYjG597om6+papDevOT+28cWL31Tvnjl2+EkKzfbyRlRvbhv9565mPXlr76apr+xtv35gZzPvF7YrSyeGHuPPCW
Aj7SpXrcdjfRkNodRbazXCyWni3vTMVCwAAADxKnm7/HijaWRwc6mg32mmy0f4vysLikbUnJ8Zv7bh4ZuLsmu2HGgcWP16ry3iTdx2vaDcXfhqlYBzjbzreQj2eerAyTr10xDTPN78c/6mufyX/N+vzf3zn5H8AAADuh/yfjlOvV/7/869fh+v6V/L/+o5LVvJ/nHHM/wNhcfkfAAAAHmYPOv9PVsap1yv//33zwvG6/pX8v6m//L+iPO148Ls44f1jIWzqNXUAAACgi/j/3Rc+Woh5PfvkIM3r33z9wXTdeJX8P9lf/h9a0lcFAAAA3I/dM5+fqqtX8n+rv/y/8oHOGgAAALgXH+/9/fm6eiX/T/eX/1fl23zlQ9bpcvxXCMfHQhiZ35nJCt+G2e1FAQAAAFgiMad/uObVXXXnVfL/TP3z/+OTDuL6/47n/1XW/5cK2VP/tnowAAAAAI+j6nr++Hj87JsLun3/fr/r/7/fuO3FuutX8v/h/vL/YHm7lN//BwAAAIvwf/v+vzcr49Tr9fz/m/uOvl/Xv5L/Z/vL/3G7uvzyLsX35+hYCOPzO/nTBL+Il9ufFOaGSoW2VtJjV+yRF+ZWlgptM0mPzWMhPDu/czgpPBkLs0nh9mheOJ0UrsVCfj8Uha+SwqV4p302mk83LVyIhXyBxVxcQbG6WBKR9PijW4/5wl17/FhcHAAA4LESw3OeZYc6myGNsnONXies6nXCQK8TBnudsCI5IT2x2/Ew3VmIx9/b/PPGUKOS/0/3l//jWzGcbbqt/w9x/X/+vYbF+v/pWGgmhblYaKVPDGjFa2Rh95N4jWYr73F7vCgAAADAIy1+LjC4zPMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAf9m7Gxi7qjoB4OfN1+tMpzODYsBKdJRIqUmn01bRuBimdncVNeuwYbNEora0U5ztYGtbEkvMZqBmGwNEiE3W3WxiiaurQaWRbJCNG7oklpAlQiBrdDcQiYoxWbbLuoFtMMvmvXvPm/vOnfdROlM67O+XdN55738+7/voO/fedy4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPD/w/v2vlRtF3//1e9603eu/thzRz/145e+eeTkoyFM1x+vZOHKBwZ+94m1h7Zd+OWhx/dUrn9u30BeLo+H1bU/PfmdL8Zaf7kmhAcqIfSlgQ3DWaA/vz8c67tkOIQLwkKgUWJmKCuRNhx+NBjCsbAQaFT14GAIw4XAtU8+/NCdtcTRwRDeGUKopm08Xc3aGEwDlw1kgaE0sLcvC7z0SqYR+EFPFoCzFt8MjRf98enmDGOLl2vx+utfso69ttLh9cbEWOt8z29d5k4VDKQPTJ/V01aqjmVRenuc8G5bAe+20na+y9NW/CKVf0N5ZSFUDT27ZnbvuHnuYHykJ0xM9LaqaZme55+98IWdZ5JeMa/D2IGxJXkdzk4cPrXt3q333zF13f77KmuvOttu/rSwSYvp5VYN+WtuxTyP0ZTPkxXw9it9Sxr3pSuE8NX5Z15oFy/N/8faz//jyzne9jTljrW+PJLNzeMjwzFxaiSbmwMAAMCKsRL2mt598kN/0a6+0vx/vLvj//GQfz6Zz0Z7IoSpeuLwaAgX1x/PAt+Ozd0wGsLb66np5sDWJHAihDfXE+sbVSUlVsUS40ng1yN5YCoJnIyB6STwjRi4Kwl8MQaOJ4GdMXAiCXwwBsJs8zjeNZKPo+vAYAxszzbi8XgWwm9HYmvJtvq3RlUAAABLJJ8d9jffLZzrcLYZ4vTy+GCnDPEM7JYZqkkN6Qy2Ma1qWUNfpxp6OtXQGPd8++GXaq50qrl0GkalOcPOS+54PrRRmv9Ptp//VxfpSKV0/D+Ea+p/Y+6ePDLXiG+fbsoAAAAAnIW/Wr/7jnbx0vx/qrvz/+M+kd5C5vBY3A2xZzSEyeZAVu3vlQPZUe/VeQAAAABWgsbx+Max8Nn8NjtFO51Pl/NPn2H+eOB/atH8/3P5Hz7Trr+l+f90d+f/DzXfZp04GXvxldEQVhUCj8Re1gJ14zHw86uaA/n4T8YNcHusKj8xoVHV7bHE9hiYTALHWpV4olHi4uZA/mQ1Gj/cGMdsXqIQAAAAgHMu7g6Ix+Xj+f/fHZ3703blSvP/7Wd2/n99Hlw6vX9udQgb+0LoTX8Y8NhQtjBgDAxX8sQ/DmV19aZV3ToUwpW1gaVVPZuv/9+XrjH45GBWVQxc/I5vvXBZLfH1wRA2FgM/uf6e99QSB5NAo/E/GQzhbbXRpo3//aqs8f608b9cFcJbC4FGVTesCqHW2EBa1cPV/DoGaVX3VUN4YyHQqOqKagiHAgArVPyvdFfxwQOHbtmzY25uZv8yJuI+/MGwe3ZuZmLn3rld1RZ92pX0uWkZo1vLY+r2yjdxiaKnjpxe10268TvByWJb+X780omD+f34Xai/Ps7N/U13t6RDvvzSchOh8E2q1ZB7lnnIQ8VKFp7EUv0x/0BYHVbdfGBm/8Tndxw8uH9T9rfb7Juzv/EwU7atNqXbamixvnXx8uh2Ve1Xu63WFSvZePCmfRsPHLplw+xNO26cuXHms5ve/b7Nk5u3bHnvFRtro5rM/nYY6rrFqk6G+so95SEs98tibV+hknPxqSEhIbHSEnf/4umH2n38lOb/+9rP/+OnTvzkz9dnaHX8fywe5s8eXzjMvz0GjnV7/
H+s1dH8xokB40lgPgbmHeYHAADg9SHujox7M+Pux4v+6ePXtStXmv/Pd/f7/yVa/7+xdP1HWi3zvz6WmGy1/n+6zH9j/f/5Vuv/p8v8N9b/P/YarP9/cyOQbJLfWv8fAAB4PTh36/93XN4/vUBAKUPH5f3TCwSUMnRcxr/bCwSc8fr/lw59PS4osKjS/P+u7ub/Fu4HAACA88f83955b7t4af5/rLv5/7lf/y+0Ov9/vFVgutXCgNb/AwAAYIVqtf7ff+z9z2fblSvN/493N/+Pp130NOWOtb48kq1pF9I17U6NNH4yAAAAACtDT5iY6O8yb9PKqFtffZtxKdB26aIrP333yXb1leb/J7qb/zf9LmN24vCpbfduvf/lO6au239fZe1VC8f/AQAAgOXT7X4JAAAAAAAAAAAAAADgtff2B7/wtXbx0u//wzX1x1v9/j9e9y/+vuDCptyx1s7r/+X3r/3o9w7Vlyx8bCSES4uBPbftuSDk1+ZfVww89Mn1F9USt6UlfvjMB39VS3w6DXx4wxterCWuTALb4yKJb04D8aqKL65JAnF5xafSQNwex9PAQB740ppsHJV0W/1mONtWlXRb/etwCKOFQGNbPTCctVFJB3g0CTQG+Lk0EAf4R3mgJ+3V91ZnvYqB4Vj0a6uzXgEAcN6K3wL7w+7ZuZnJ+BU+3q7ta76NmpYsu7V1tZ3EpcmeOnJ6XTfp3vS76MK1xvtDtTaETaWvq8Uslfool6aWDpvuwhZD7rTa23JtuoHWIxrMRjSxc+/crv6OA9/SOcvmvo5ZNpUmO8UsPfVN2kUtXfSlixF1uW266HK83xMmJnqTXO+PwbHQpNMrotvf6y+25l+rV0TNXcOHrm5XX2n+P9bd/L9aHNeL+cUA5uOV9b48apl/AAAAWF5f2nr6q/Hfd/du+Uy7vKX5/3h38/+4Bys/FJzt7TgRr/9/eDSE+qX1x7LAt2NzN4yG8PZ6ajqWyC6o/5FYYjILfDvuMFkfS2yfbq5qVQwcTwK/HskDJ5LAyRjI91J8K+S7cu4eCeE99dQ1zSX2xRJjSeDjMTCeBCZiYDIJrImBqSTw72vywHQS+OcYCLPN2+r+Nfm2AgAAOBP5PKu/+W5I53nH+zplqHTKMNQpQ0+nDNVOGVqNIt7/fszQXzwen2eID/WntQ4mtZQyxIvhn3G/ShnCE80504KlpuP5B43zDSrNGf7gif99MLRRmv9Pdjf/H2q+zVo/Gef/C9f/ywKPxO59JZ46Ph4DP7+qOZDvGDgZJ7u3N6qazkvkk/bbY4mpGBhPAvtiYCoJbL8mDxy7qDmQz7QbjR9uND6blygEAAAA4JyLOwjibprG8flNj2xrV640/5/qbv4f21tdbOyLsdZfrgnhgcpCbxqBDcNZIO7HGI4/j79kOIQLCjs4GiVmhrISA0nD4UeD2S/UB9KqHhzMfnwQ71/75MMP3VlLHB0M4Z2FvS+NNp6uZm0MpoHLBrLAUBrY25cF4p6fRuAHPVkAzlpjr2B8QeWnujSMLV6uxevv9XJN0HR4pX2gi+Rb7DdXy6WaPpDvU204s6etVB3LovT2OOHdthLfbWPebcUvUvk3lFcWQtXQs2tm946b5w7GR4q/ZC1Zpud5sV+ytksvwetw/tX3trNq2oHJ5ONjcvFyi78OK7G62YnDp7bdu/X+O6au239fZe1VXXejhbhJZzcfufynhc273Kohf82tuM+TaZ8nK/G/gXFPWwjhvWtOn9n5/9Pdzf/7ktu603FjHhgN4fLCxn0sbv5to9nnYCGQfUq+sRzIDrn/YqTlJycAAAAstcbujsb+gtn8NjshPJ0nl/NPn2H+uL9iatH83fb7ry95203t4qX5//b28/9VSTcd/3f8n2Xi+P+izvdd0avSB+bPald0qTqWheP/izrf322O/y/K8X/H/xfj+H8Hjv8v6nx/2krfkvb50hVCuPhfPrSrXbw0/9/X3fzf+n+LL9rXWP9ve6v1/
/a1Wv9v3vp/AADAsmqx0Fw6zyut3lfKkK7eV8rQcYHAjksMWv/vjNf/e8t/rfv90EZp/j/f3fw/vhxWF1tfKev/jV/Toqq7YmCfhQEBAAA4H7XaQQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMBr6zc3XHFRu/j7r37Xm75z9ceeO/qpH7/0zSMnHw1htv54JQtXPjDwu0+sPbTtwi8PPb6ncv1z+6p5uf789i1NuWOtL4+EcKzwyHBMnBqp3VkIXPvR7x3qqyUeGwnh0mJgz217LqglvjESwrpi4KFPrq+P6La0xA+f+eCvaolPp4EPb3jDi7XElXmgknb3b9Zk3a2k3b1zTQijhUCju3+2prmqRhsfygM9aRt/N5y1EQPDsehXh7M2YmAulphdFcLGvhB606oerWZV9aZV/UM1q6o3rerPqyFcGULoS6t6ZiCrqi8d+eMDWVUxcPE7vvXCZbXEsYEQNhYDP7n+nvfUEp9LAo3G/3gghLfVXjJp49/vzxrvTxs/2h/CW0MIA2mJ/+7LSgykJZ7tC+GNhUCj8c/0hXAo8LoQP3x2FR88cOiWPTvm5mb2L2NiIG9rMOyenZuZ2Ll3blc16VMrlUL6lVtf/dh/9sIXdtZunzpyel036b68XH+9y5v7m+5uOd97H/s1VKxk4fko1R/zD4TVYdXNB2b2T3x+x8GD+zdlf7vNvjn725tHs221aam2VW+H8tGr3VbripVsPHjTvo0HDt2yYfamHTfO3Djz2U3vft/myc1btrz3io21UU1mf5diqPeU4z3LPNS1fYVKzsUHgISExEpL9DR9uk0u1Qd5ZZk+3Upf9Bc62h+q9Q/o0rSimKVSH+VSDHprlyNcxBl/T+k4ok2liUMpy+bOWbaUJhMLWQazLPXvdaXJYbGmnvomjfd7wsREy//Ux5rvFjfv80uwebtNAwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD8HztwIAAAAAAA5P/aCFVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVXYgQMBAAAAACD/10aoqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqwg4cCwAAAAAI87cOo2cDAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALgUAAD//2McwRk=")
chdir(&(0x7f0000000c40)='./file0\x00')
ioctl$BTRFS_IOC_LOGICAL_INO(0xffffffffffffffff, 0x40089413, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='./file0\x00', 0x0, 0x0)
ioctl$FS_IOC_ENABLE_VERITY(r0, 0x40806685, &(0x7f0000000a80)={0x1, 0x2, 0x1000, 0x0, 0x0, 0x0, 0x0, 0x0})
stat(&(0x7f0000000180)='./file0\x00', &(0x7f00000001c0))


mkdirat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x0)
mount$tmpfs(0x0, &(0x7f0000000080)='./file0\x00', &(0x7f00000000c0), 0x0, 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f00000001c0)='./file0/file0\x00', 0x0)
pivot_root(&(0x7f0000000500)='./file0\x00', &(0x7f0000000540)='./file0/file0\x00')


syz_mount_image$vfat(&(0x7f0000000180), &(0x7f0000000100)='./file0\x00', 0xfb36b7c5efaff487, 0x0, 0x1, 0x0, &(0x7f0000000000))
mount$tmpfs(0x0, &(0x7f00000003c0)='./file0\x00', &(0x7f0000000400), 0x0, 0x0)
chdir(&(0x7f0000000300)='./file0\x00')
mknod(&(0x7f0000000040)='./file0\x00', 0x8001420, 0x0)
r0 = open$dir(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
r1 = open$dir(&(0x7f0000000140)='./file0\x00', 0x1, 0x0)
r2 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000280)='blkio.bfq.io_merged_recursive\x00', 0x275a, 0x0)
ftruncate(r2, 0x2000009)
sendfile(r1, r2, 0x0, 0x7ffff000)
readv(r0, &(0x7f0000000200)=[{&(0x7f0000000f40)=""/4096, 0x1000}], 0x1)


symlink(&(0x7f00000003c0)='./file0\x00', &(0x7f0000000000)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00')
readlinkat(0xffffffffffffff9c, &(0x7f0000000100)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0xffffffffffffffff, 0xb4)
readlink(&(0x7f0000000240)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', &(0x7f0000001200)=""/4096, 0x1000)


mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x3, 0x32, 0xffffffffffffffff, 0x0)
r0 = openat$urandom(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
read(r0, &(0x7f0000000000), 0x2000)
close(r0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000080)='./file1\x00', 0x105042, 0x1ff)
read(r1, &(0x7f0000000000), 0x2000)
close(r1)
munmap(&(0x7f0000000000/0x2000)=nil, 0x2000)


syz_mount_image$ntfs3(&(0x7f0000000100), &(0x7f00000000c0)='./file0\x00', 0x804018, &(0x7f0000000380)=ANY=[], 0x1, 0x1f211, &(0x7f000001f5c0)="$eJzs3QmYTeUfB/D37Pu+XLvBWEO2RJJ9zb6lki1kJ1uUComoZAvJlmRLCJVKIolI9iUhSZIklZDE/3HnzjTL9a9p1/v9PI8555577vuee773jPmd7X7VZGyDZjWbJiQkJBCbIUnOkzSGkCHkcuw5PjbtcmzIxP4NbLOwwl7zw85Xppm5V987eFHeNf20FivMNyWyxW791elSR7eEW7J+dalZ5y59E7r0TejZq19Cu4T2vXr1a9e+e8eEDl36diue0Kh7x3Z9OyZ06dm3Y580T3fq3qt370EJ7Xp2MNTefTr27ZvQrueghG4dByX065XQr8+ghHb3tOvSM6F48eIJhkrgN2q+8J9eAgAAAAAAAAAAAAAAAAAA+HNcvpxyaB8AAAAAAAAAAAAAAAAAAACuUTXr1KtRgigpjxnCkNqEIfMZQoj9y3zJ1/1zV2nnyqxto2NZoj9rJ4+d6Vay+4V9zNWGYrzG4pyQUIIQ0jmlfZbUjY4xRIhOE361H7I01mhsmNwvzyWS+qQmaRZ7PCS27AypnGZBbokNKydPOM3FHTqVktbU0jTtZFxrlVOvOEKITNIOHYaNDi9fvnw53ir6c1wtTaAD8qcb8qcb8qcb8qcb8qcb8r/WpKsrK6d/PnN1JXfV+r9zuvqfi1XD7FWW6++o/3untM+SJpmt/yunXUHJ/cop9X990oX0IX1i06+2H4BLv54rxx/m4S+nWs//VldLE+iA/OmG/OmG/OmG/OmG/OmG/OnGZqj/2f9T/7PXcv2fcgZD0jB1/V+P9CL3kJqkC+lOOsamX63+rxQb/rLjhY87zFOZi74I9T/8eyF/uiF/uiF/uiF/uiF/uiF/umWs/7lY/X86Xf0vxvYB8FdpyY7Vy8n1f4nfWf+nPc+fIY0yXeenldy+xCWSFqQX6U76kx6kY7TdISn9sKRDSo/8kCvvI/l6AD/6bJnYluKTmYxLmKReBDf2+ui0pBmEBEJIAkvSzJP+ORLbV1IipX+euNGx1v2ybpqie4u7kXake3RvRPL5CL0JIYVS5heInrKmY7nF3vmQlOlZUs5WyHLV/RBXSxPogPzphvzphvzphvzphvzphvzpxmeo//lY/T9Ly3j9P5+yx6hZhpb+zPr/Dx3nT/72/9gw9XH+KqQf6Uf6kOqkI+kUm552PwD3m/cDjCIZ9wNEp2VyP0C0XldJSq9XlvvKWCJpSpqRKqQBqU6qkCakOmlD6pAGpCZpSJqQ+qQKaUbqkIakQWbizuDq+/9SRR8916FEbLx2bJgYXYJmpAmpQ6qS5qQZqUHakHqkTnS5/3oJqcaHpBq/HJNIapI6pF50qRqQKqQ+qfE3LNUvSqQar0oIqZ48HlvliaQhqUrqkhqkGmkWzbb637p8SeevsKnGY5jk5WsaXbbm0YSbkdtJG1Kd1CBNSbXolEakWfST+FdplGo8fr4tSENSjzSPJvv3Z9w21Xjl1FtSyvpLu3x//rb7//VOt3x6bDx5mBj9vcKSKn/hMvw/Q64y/Zd860R/99UgLUkb0oQ0JA3/lt8rycamGq/8q8tXhdQj9UhDUu1vyfaKWanG428fVaPb7ZVPW6OrtvLXHf9Z+qvL14TUII2i/7c1jW4hjUjD6Dr9e1JeE3/5Uv4gTiQ1SJV/YLtNti3VeOU413EmLd8f9fvzP3zVZ5J+ASZGt4dapBapEf3bpXl03dVL+b+kafRvhxrR39p/iVRHgoZc7Yl/sb/nvFEc/6Ub8qdbxuP/QrT+54jNZjz+L0Qr5BJxW/q1+r/g2SKlUg+TpxdISFNqRl/3y3UETLQq+oPH/6PtM1zx6ONGac7r/+Xzn/zClL/1YgVmUTvt0GGrRYdX
/j5seWWET9pvUC72kit/F5YmJaLvg4mtmORTKQrF/qVeyCzp7sIxK7aMDF85tsbTqp16hccZJvfvMEm10NLk8wb4pPMM5DTLyaUsy7/7OgX4a+D+L3RD/nRD/nRD/nRD/nRD/nTLeP8/MXb8v22c6//Ff/j6/9T3/8/0dQGx91I55X0mkbhEUpV0If1ID9KO9L7qdf/J0t9fMf3t/xymZnR4bdTT2P9HN+RPN+RPN+RPN+RPN+RPN+RPt4zH/6VY/b8xzvf/Sb9S///51/+zpF5m6/zkE8Njw+T2hWid34v0Iv2ij/8N5/3H289wtfMQ0g+dWDt/bD8Dtn+6IX+6IX+6IX+6IX+6IX+6IX+6Zaz/5Vj9/1ic+l/+19z/n/sd9/9PK/V9AaqSdqQDqRa9N2BfklRPp72PHpsyNiTN+fm/NHspNppyfv7p3GmHMcm9sZeTXvDPnieA7Z9uyJ9uyJ9uyJ9uyJ9uyJ9uyJ9uGet/JVr/K8Tm0tb/QvS5K5V3nbgt/YH6PyF9W8yvnv//p3wvQLRfiUskTUlHcjfpT/qQjunq+1+2j+Tj8b9cf590q6DeTNrhldaqk6bEqVIr+nhs8r1yYvcJWJNy/f2V+ZJac0na+wQsTXX/pCzRMxCuvP+kO/80TlweXT/Jw+R72wwmhNQhdTLMP+vgoaos88sweZdFh+j8XIb5T7vc+SvzJA9JuvZT583G3t9G8sv9Beanen/J89vp3t+U2D8SPaMiqf9Cybdeukrf6ee72jpIP9/V3nv693FtXK/xZ8P1X3RD/nRD/nRD/nRD/nRD/nTLeP2/Gjv+Pz/O8X/1X3P9P/c76v/kJU8apr7+vznpTaqRdqRvrP6Pd15+8lH8X+4FzcYd5kl4LDpMbocISecPJMTuw5dI6pCepBPpFXtV8s4PvUv2bvc9dGB/+vf919al2P9HN+RPN+RPN+RPN+RPN+RPN+RPt4zH/7XY/f/Hshnv/6f9nz1Gf6D+t9O39Tcd/4/2e6X+r0EGkn6kI+lJOkSPZw9kko9ns6Ql8+v39a/NJP27ImvSOyBtY98SpCX/zOTyFYjtGhC5RNKQtCddSZ2UexT8ee1z0fYbk/7R+yK0I4Rki7XfOfbdbpltf0i65Ze5RNKEdCS9STvSJ7qHJePnht7j7/807P+lG/KnG/KnG/KnG/KnG/KnW8bj/3q0dmdIiTj3/9f/hvv/pb0vH/ub78s3k8l4X77otN/zffwp/TMpY//N+hT7/+iG/OmG/OmG/OmG/OmG/OmG/OmW8fi/8X/qfwP1/38Mtn+6If//hN8dI/KnG/KnG/KnwpCrPYH86Yb86Zax/jf/T/1vxv3E/FLBo/6/1mD7pxvypxvypxvypxvypxvypxvyp4uQ7nHG+t/6P/W/heP//zHY/umG/OmG/OmG/OmG/OmG/OmG/OmWsf5PruOnkLT1P5Pyivj+rPr/v1ln/1th+6cb8qcb8qcb8qcb8qcb8qcb8qdbxvrfQf1PEWz/dEP+dEP+dEP+dEP+dEP+dEP+dMtY/7uo/ymC7Z9Ov7Y1Ax2QP92QP92QP92QP92QP90y1v/eH63/U91iAPX/vx22f7ohf7ohf7ohf7ohf7ohf7ohf7plrP99HP+nCLZ/uiF/uiF/uiF/uiF/uiF/uiF/umWs/wPU/xTB9v9fwfyuVyF/uiF/uiF/uiF/uiF/uiF/umWs/0PU/xTB9k835E+x0U//00sA/zBs/3RD/nRD/nRD/nTLWP9HUP9TBNs/3ZA/3ZA/3ZA/3ZA/3ZA/3ZA/3TLW/wxhSG1ikCFs2vqfj/1jSfz6/MqsbaNjSfV/7d9Y/xcgJCF9W0yck5lLEEI6p7TPkAbRMYZo0Wnar/aTvr3kfkUukTQm/Ukv0o+0I4SMJYQMjK6aK/1wpCVJmjGRNEx5rRN7w0yq/R61Y88lRNcSIYViK49J2sSEK20ksEk7UhLSrTc21u/plH5Zsi2l38Zp5k3d75rYPxK9b0OJ
2HrnYzkmtZvmjcfWxeUUbIb9LT6pHB22jLVzpYFfbydJSn7p33Oc94v9PP8G+P1PN+RPN+RPN+RPN+RPN+RPt6vX/52ZtPU/RwjJ+ivH/6/V+r8haU+6kjqkQ6wOb0R+qcNLxKn/PZK2Dk8gGd/EtVHnYvunG/KnG/KnG/L/j4r3J1QcyJ9uyJ9uyJ9uV6//x8ap/7PFhvH8HfV/75T2GdLkT6r/ZS6RNCEdSW/SjvQhfUnH2PNjk/uL7QdolLIfoElKG+n3A1SO/bsiR2yYQPToUCdkVs7YGk6/3v7E/QRXi+fPmR3+Y5A/3ZD/f0tm7wKK/OmG/OmG/OmG/OnGRev/Iunq/8qEI1PSnf8vkAKke5wWfqn7kyTX/SN7FS51pe6+2jAhIXXrSdj/e9w/aenqxep+IbZUv9ZP+vau9Fss+spOpAvpTjrGzpu/Uu/bV+bmk+r90yn1fh1SmpSIzsWQtPX+4dg/klLvM6RtdAkJyR57/KvLF1sL2WKn21/pM97y5fy97afHp26/AOlISmbI80r021LOaUi6DkKOjV157c2x990y5Xk+Oo0ZkvT9kUTkSWK9ls3rVE9qM9W0WnGm1W9YnXxdJdb/v/t8if+aAv/0AsA/CvnTDfnTDfnTDfnTDfnTrcBV6n+GbGPS1//547bwu+v/OG1lpv7/s+vrIbFlZ6JH8DOeF1M59RtO1V7yMA8zLLr0v6duzkJ+f908NLZYV62bK8WdGhM/U6AF8qcb8qcb8qcb8qcb8qcb8v+n/bP1Yv6r1v+90x3/F0n+5DvCpfF76/94jf0l9X96fPr6P+n495CUKxeypNT/SqqMknBkZWwOPfbFBQ1Ii+jPK1qRfqQH6U1akb5kELmfFCNdSA/SjtxDOkb/9YyeSVCelCc3kFKkJClPbiTlSatUeyLSjlcilUirP7nVtJ+3/L/yeRPif956jN8TSflg/N/P2+Xn0nze4J+SO+5U/P6nG/KnG/KnG/KnG/KnG/Kn29XrfznD+f+/7/h/wbNF0gz/rOP/6c//v1o/ye9BjbVwpd/br1L/pzn+n+7SmPTH/5PbSx7mYR75v8f/r/TZL1Zfj015no9O+6P7c6LXHmgi6d+3Y5/iA9v169enJIkN4jxXisQG0fxRl9MMv//phvzphvzphvzphvzphvz/af++4/9stP4vF+f4fzxXq//T1+Pph5mv/5OOR6ev/3+tn9jp/aRQvqRhxvP/S6frh8Tdz/DH+4kW3Fe9ziB5rHLqN56qveRhHm74n3idQVKmf8p1BnANwu9/uiF/uiF/uiF/uiF/uiH/f9q/r/5POv6/7w9e//9n1v+9Uy1ds8zW5bH30CHWQnJdrqTU5cXJ3aQX6R6bI7P3AUhuN3mYhxn577wPQFzY/umG/OmG/OmG/OmG/OmG/OmG/P9p/7b6n8Rq3Cs1cNo79Etxlz8T9Wa82wfAPyp+pkAL5E835E835E835E835E835E83KRP1vxy3BZwPfi2LnynQAvnTDfnTDfnTDfnTDfnTDfnTTc5E/a/EbQH1/7UsfqZAC+RPN+RPN+RPN+RPN+RPN+RPNyUT9b8atwXU/9ey+JkCLZA/3ZA/3ZA/3ZA/3ZA/3ZA/3dRM1P9a3BZQ/1/L4mcKtED+dEP+dEP+dEP+dEP+dEP+dNMyUf/rcVtA/X8ti58p0AL50w350w350w350w350w35003PRP1vxG0B9f+1LH6mQAvkTzfkTzfkTzfkTzfkTzfkTzcjE/W/GbcF1P/XsviZwn9K+as/hfzphvwpwmachPzphvzphvzphvzpZmai/rfitnD1+j/OnxuEkCG/d1HhLxA/U6AF8qcb8qcb8qcb8qcb8qcb8qeblYn6347bAo7/X8viZwq0QP50Q/50Q/50Q/50Q/50Q/50szNR/ztxW0D9fy2LnynQAvnTDfnTDfnTDfnTDfnTDfnTzclE/e/GbQH1/7UsfqZAC+RPN+RPN+RPq0+jP5E/3ZA/
3ZA/3dxM1P9e3BZQ/1/L4mcKtED+dEP+dEP+dEP+dEP+dEP+dPMyUf/7cVtA/X8ti58p0AL50w350w350w350w350w35083PRP0fxG0B9f+1LH6mQAvkf827fJV7+Ob4LS9G/nRD/nRD/nRD/nRD/nQLMlH/h3FbQP1/LYufKdAC+dMN+dMN+dMN+dMN+dMN+dMtzET9H4nbAur/a1n8TIEWyJ9uyJ9uyJ9uyJ9uyJ9uyJ9ukUzU/1nitoD6/1oWP1OgBfKnG/KnG/KnG/KnG/KnG/KnW5ZM1P9Z47aA+v9aFj9ToAXypxvypxvypxvypxvypxvyp1vWTNT/2eK2gPr/WhY/U6AF8qcb8qcb8qcb8qcb8qcb8qdbtkzU/9njtoD6/1oWP1OgBfKnG/KnG/KnG/KnG/KnG/KnW/ZM1P/xv1AK9f+17Dd9SRj8ZyF/uiF/uiF/uiH//6ZKym+bD/nTDfnTLUcm6v+ccVtA/X8ti58p0AL50w350w350w350w350w350y1nJur/XHFbQP1/LYufKdAC+dMN+dMN+dMN+dMN+dMN+dMtVybq/9xxW0D9fy2Lnyn85y1JGiB/uiF/uiF/uiF/uiF/uiF/uuXORP2fELcF1P/XsviZAi2QP92Q/3+JmOlXIH+6IX+6IX+6IX+6JWSi/s8TtwXU/9ey+JkCLZA/3ZA/3ZA/3ZA/3ZA/3ZA/3fJkov7PG7cF1P/XsviZAi2QP92QP92QP92QP92QP92QP93yZqL+zxe3BdT/17L4mQItkD/dkD/dkD/dkD/dkD/dkD/d8mWi/k+M20Lq+l/4qxYT/iLxMwVaIH+6IX+6If//sN/wHfDIn27In27In26JhCH8EN8mJCFW6/uEEJYQW44+ykIuM7kJw0YfCAmEkIQr41qWK4/tDNOJnzQ/kzTdvfJT85PmTT2N+ORtxkuZT0uZjzmcZhrJQhYQN03/eX7pf1aG6QAAAAAAAADwm6Wu1dPW7QAAAAAAAABwLarToHrLQnFOFStECFkpEnLcSnpskH1MvNfzsZ+dSZPomJD880y3kt0v7GOuOlRjDcSGyd9dJHOJpArpR/qRPqQ66Ug6RacyJOl8BDnz/aSTup+qpB3pQKqR7qQ/6RudKsWeF0lb0ihz/cTOn6icrh8p2k8X0o/0IO1IbyLF+mlL6mWu/eSv6khI274Qbb8X6UX6kS6kO0k6b0L7jctvkP+znuzk5a9BBpJ+pCPpSTqQpNMs2N+RA5/mE5M6h3qkF7mH1Iwuf8dY3iQ2bEvqZq6fpbHXxobJ/fBcIqlPapJmKS0n/cz0+0gOuHLGz1NS+/VJF9KH9Ek6jya2ZSmZ/zxlzCMhOY+mpCO5m/QnfUjH2Oaj/o722ehWTWKJpv68Nie9STXSjvQlHQkXncr9Gcuf0n4L0iu6zfUgHVPaa0mq/9H2o59XhisefZyQkm+zNO0WPFuk1JXXX22YvB13iLV7pZ1i0QQ7xT6fxcnd0eVPmiN7yuc07fY8slfhaHtXG6Zf/oQEYheLvjK5nxLR6bl+pf30y5/Sfux9qKnex+1p2i8ZzT/3Vdr/1fWUtHikUL606+mX9ktFp/917ZdOs/7slE8y/HbGP70A8I9C/nRD/nRD/nQzyLnLcfwyw5DkSVJswsW0DQz5WxcXAAAAAAAAAH6X0y53nqQ6GjU4erSGH1KbENIyNi3pngBlYsdXfDKKuOmuC8iSNC3d/QB+7fEVjROXR9sakon+ZzIZ+49O+x39zzp4qCrL/HLIuUOs/xKpjuUn9Z8jaRYuzT0Ooj9/Sz8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADXktMud54wvzweTAhhCD+kNiGkZWyaTwhhSRnCxh6NIi5hkl4juNFBlqRpSTMICYSQBJb86uMrGicuj7Y1JBP9z2Qy9h+d9jv6n3XwUFWWiXYa1SHWfwlCSOU0/edImoXzyWUmd3L/0Z+/pR8AAAAAAACAfxZDWMIRnghEJBKRiUJUohGdGMQkFrGJQ1ziEZ8EJCQRkoVkJdlIdpKD5CS5SG6SQPKQvCQfSST5SQFSkBQihUkRch0pSoqR4uR6UoKUJKVIaVKG3EDKkhtJOVKe3EQqkJtJRXILqUQqkyqkKqlGqpMapCapRWqTOqQuuZXUI/VJA9KQNCKNSRPSlDQjzUkLchtpSW4nd5A7SStyF2lN2pC2v+v1D5DB5EHyEHmYDCFDyTDyCBlOHiUjyEjyGBlFRpPHyRPkSTKGPEXGknFkPJlAJpKnySQymUwhz5Cp5FkyjUwnM8hMMos8R2aT58kc8gKZS+aR+WQBWUheJIvIS2QxWUKWkpfJMrKcrCCvkFfJa2QleZ28Qd4kq8hbZDV5m6wha8k7ZB15l6wn75ENZCN5n2wim8kHZAv5kGwl28h2soPsJLvIbrKH7CX7yEdkP/mYHCAHySHySSZffy7d6wcyhCEMy7AMz/CMyIiMzMiMyqiMzuiMyZiMzdiMy7iMz/hMyIRMFiYLk43JxuRgcjC5mFxMApPA5GXyMolMIlOAKcAUYgoxRZgiTFGmKFOcKc6UYEoypZhSTBmmDFOWKcuUY8oz5ZkKTAWmIlORqcRUYqowVZhqTDWmBlODqcXUYuowdZm6TD2mHtOAacA0YhoxTZgmTDOmGdOCacG0ZFoydzB3MK2YVkxrpjXTlmnLtGfaMx2YDkwnphPTmenMdGW6Mt2Z7kxPpifTm+nN9GH6MP2YfswAZgAzkBnEDGIeYB5gHmQeZB5mqrFDmWHMMGY4M5wZwYxkRjKjmNHM48zjzJPMGOYpZiwzjhnHTGAmMmeZScxkZgozhanIPstMY6YzCexMZhYzi5nNzGbmMHOYucw8Zh6zgFnIvMgsYhYxi5klzBLmZWYZs5xZzrzCvMK8xqxkVjJvMG8yq5hVzGrmHLOGWcu8w6xj3mXWM+8yG5iNzAZmE7OZ2cRsYbYwW5mtzHZmO7OT2cnsZnYze5m9zEfMR8zHzMfMGOYQc4g5zBxmjjBHmKPMUeYYc4w5zhxnTjAnmJPMSeYUc4o5zXzLfMd8y5xhzjBnmXPMeeY8c4G5wFxkLjKXmEtXNn72Cp7lWZEVWZmVWZVVWZ3VWZM1WZu1WZd1WZ/12ZAN2SxsFjYbm43NweZgc7G52AQ2D5uXzcsmsolsAbYAW4gtxBZhi7BF2aJscbY4W4ItwZZiS7Fl2BvYsuyNbDm2PHsTW4GtwFZkb2ErsZXZKmwVthpbna3B1mRrsrXZ2mxdti5bj63HNmAbsL2K9ijahB3KjGCas1eSack+xdzBjmVasXexrdk27ASmHduench0YDuyndh72MnMJKYr275od7YH25Od
xvRmexadzvRl+7EzmQHsfexAdhB7P/sAO5jtUPQh9mF2LjOUHcYuYIazj7Ij2JHsYqY6eyWxGuyT7Bj2KXYsO459jZnATmSfZiexk9kp7DPsVPZZdho7nZ3BzmRnsc+xs9nn2TnsC+xcdh47n13ALmRfZBexL7GL2SXsUvZldhm7nF3BvsK+yr7GrmRfZ99g32RXsW+xq9m32TXsWvYddh37LruefY/dwG5k32c3sZvZD9gt7IfsVnYbu53dwe5kd7G72T3sXnYf+xG7n/2YPcAeZA+xn7CH2U/ZI+xn7FH2c/YY+wV7nP2SPcF+xZ5kv2ZPsd+wp9lv2e/Y79kz7A/sWfYce579kb3A/sReZH9mL7GXWcIxHMtxHM8JnMhJnMwpnMppnM4ZnMlZnM05nMt5nM8FXMhFuCxcVi4bl53LweXkcnG5uQQuD5eXy8clcvm5AlxBrhBXmCvCXccV5YpxxbnruRJcSa4UV5orw93AleVu5Mpx5bmbuArczVxF7hauEleZq8JV5apx1bkaXE2uFlebq8PV5W7l6nH1uQZcQ64R15hrwjXlmnHNuRbcbVxL7nbuDu5OrhV3F9eaa8O15dpx7bm7uQ5cR64Tdw/XmevCdeW6cd25HlxPrhfXm+vN9eH6cDzXnxvADeAGcoO4+7mfuUvcZe4h7mFuCDeUG8Y9wg3nHuVGcCO5x7hR3Gjuce4J7kluDPcUN5Ybx43nJnATuae5Sdxkbgr3DDeVe5abxk3nZnAzuVncc9xs7nluDvcCN5ebx83nFnALuRe5EbGWlv6G178T5/VPRHvfym3jtnM7uJ3cLm43t4fbyu3j9nH7uf3cAe4Ad4g7xB3mDnNHuCPcUe4od4w7xh3njnMnuBPcSe4kd4o7xZ3mvuV+5L7nznA/cGe5c9w57kfuAneBuxhbB4RneJbneJ4XeJGXeJlXeJXXeJ03eJO3eJt3eJf3eJ8P+JCP8Fn4rHw2Pjufg8/J5+Jz8wl8Hj4vn49P5PPzBfiCfCG+MF+Ev44vyhfji/PX/+HX/9ryteXb8u359nwHvgPfie/Ed+Y78135rnx3vjvfk+/J9+Z78334Pnw/vh8/gB/AD+QH8vfz9/OD+cH8Q/xD/BB+CD+Mf4Qfzj/Kj+BH8o/xo/jR/Gj+Cf4Jfgw/hh/Lj+XH8+P5ifxEfhI/iZ/CT+Gn8lP5afw0fgY/g5/Fz+Jn87P5Ofwcfi4/l5/Pz+cX8gv5RfwifjG/mF/KL+WX8cv4FfwK/lX+VX4lv5J/g3+DX8Wv4lfzq/k1/Fp+Lb+OX8ev59fzG/gN/Pv8+/xmfjO/hd/Cr+G38dv4HfwOfhe/i9/D7+H38fv4/fx+/gB/gD/EH+IP84f5I/wR/ih/lD/GH+OP88f5E/wJ/iR/kj/Fn+JP86f57/jv+DP8Gf4sf5Y/z5/nLwy9wF/kL/KX+EtX/uwTWIEVeIEXREEUZEEWVEEVdEEXTMEUbMEWXMEVfMEXQiEUsghZhGxCNiGHkEPIJeQSEoQEIa+QV0gU8gsFhIJCIaGwUES4TigqFBOKC9cLJYSSQimhtFBGuEEoK9wolBPKCzcJFYSbhYrCLUIlobJQRagqVBOqCzWEmkItobZQR6gr3CrUE+oLDYSGQiOhsdBEaCo0E5oLLYTbhJbC7cIdwp1CK+EuobXQRmj7p7Y/UnhMGCWMFh4XnhCeFMYITwljhXHCeGGCMFF4WpgkTBamCM8IU4VnhWnCdGGGMFOYJTwnzBaeF+YILwhzhXnCfGGBsFB4UVgkvCQsFpYIS4WXhWXCcmGF8IrwqvCasFJ4XSDCm8Iq4S1htfC2sEZYK7wjrBPeFdYL7wkbhI3C+8ImYbPwgbBF+FDYKmwTtgs7hJ3CLmG3sEfYK+wTPhL2Cx8LB4SDwiHhE+Gw8KlwRPhMOCp8LhwTvhCOC18KJ4SvhJPC18Ip4RvhtPCt8J3wvXBG+EE4K5wTzgs/CheEn4SLws/CJeGyQERGZEVO
5EVBFEVJlEVFVEVN1EVDNEVLtEVHdEVP9MVADMWImEXMKmYTs4s5xJxiLjG3mCDmEfOK+cREMb9YQCwoFhILi0XE68SiYjGxuHi9WEIsKZYSS4tlxBvEsuKNYjmxvHiTWEG8Wawo3iJWEiuLVcSqYjWxulhDrCnWEmuLdcS64q1iPbG+2EBsKDYSG4tNxKZiM7G52EK8TWwp3i7eId4pthLvEluLbcS2YjuxvXi32EHsKHYS7xE7i13ErmI3sbvYQ+wp9hJ7i/eKfcS+Yj+xvzhAvE8cKA4S7xcfEAeLD4oPiQ+LQ8Sh4jDxEXG4+Kg4QhwpPiaOEkeLj4tPiE+KY8SnxLHiOHG8OEGcKD4tThIni1PEZ8Sp4rPiNHG6OEOcKc4SnxNni8+Lc8QXxLniPHG+uEBcKL4oLhJfEheLS8Sl4sviMnG5uEJ8RXxVfE1cKb4uviG+Ka4S3xJXi2+La8S14jviOvFdcb34nrhB3Ci+L24SN4sfiFvED8Wt4jZxu7hD3CnuEneLe8S94j7xI3G/+LF4QDwoHhI/EQ+Ln4pHxM/Eo+Ln4jHxC/G4+KV4QvxKPCl+LZ4SvxFPi9+K34nfi2fEH8Sz4jnxvPijeEH8Sbwo/ixeEi+LRGIkVuIkXhIkUZIkWVIkVdIkXTIkU7IkW3IkV/IkXwqkUIpIWaSsUjYpu5RDyinlknJLCVIeKa+UT0qU8ksFpIJSIamwVES6TioqFZOKS9dLJaSSUimptFRGukEqK90olZPKSzdJFaSbpYrSLVIlqbJURaoqVZOqSzWkmlItqbZUR6or3SrVk+pLDaSGUiOpsdREaio1k5pLLaTbpJbS7dId0p1SK+kuqbXURmortZPaS3dLHaSOUifpHqmz1EXqKnWTuks9pJ5SL6m3dK/UR+or9ZP6SwOk+6SB0iDpfukBabD0oPSQ9LA0RBoqDZMekYZLj0ojpJHSY9IoabT0uPSE9KQ0RnpKGiuNk8ZLE6SJ0tPSJGmyNEV6RpoqPStNk6ZLM6SZ0izpOWm29Lw0R3pBmivNk+ZLC6SF0ovSIuklabG0RFoqvSwtk5ZLK6RXpFel16SV0uvSG9Kb0irpLWm19La0RlorvSOtk96V1kvvSRukjdL70iZps/SBtEX6UNoqbZO2SzukndIuabe0R9or7ZM+kvZLH0sHpIPSIekT6bD0qXRE+kw6Kn0uHZO+kI5LX0onpK+kk9LX0inpG+m09K30nfS9dEb6QTornZPOSz9KF6SfpIvSz9Il6bJEZEZmZU7mZUEWZUmWZUVWZU3WZUM2ZUu2ZUd2ZU/25UAO5YicRc4qZ5OzyznknHIuObecIOeR88r55EQ5v1xALigXkgvLReTr5KJyMbm4fL1cQi4pl5JLy2XkG+Sy8o1yObm8fJNcQb5ZrijfIleSK8tV5KpyNbm6XEOuKdeSa8t15LryrXI9ub7cQG4oN5Iby03kpnIzubncQr5NbinfLt8h3ym3ku+SW8tt5LZyO7m9fLfcQe4od5LvkTvLXeSucje5u9xD7in3knvL98p95L5yP7m/PEC+Tx4oD5Lvlx+QB8sPyg/JD8tD5KHyMPkRebj8qDxCHik/Jo+SR8uPy0/IT8pj5KfksfI4ebw8QZ4oPy1PkifLU+Rn5Knys/I0ebo8Q54pz5Kfk2fLz8tz5BfkufI8eb68QF4ovygvkl+SF8tL5KXyy/Iyebm8Qn5FflV+TV4pvy6/Ib8pr5LfklfLb8tr5LXyO/I6+V15vfyevEHeKL8vb5I3yx/IW+QP5a3yNnm7vEPeKe+Sd8t75L3yPvkjeb/8sXxAPigfkj+RD8ufykfkz+Sj8ufyMfkL+bj8pXxC/ko+KX8tn5K/kU/L38rfyd/LZ+Qf5LPyOfm8/KN8Qf5Jvij/LF+SL8tEYRRW4RReERRRkRRZURRV0RRdMRRTsRRbcRRX8RRfCZRQiShZlKxKNiW7kkPJqeRScisJSh4lr5JPSVTyKwWUgkoh
pbBSRLlOKaoUU4or1ysllJJKKaW0Uka5QSmr3KiUU8orNykVlJuVisotSiWlslJFqapUU6orNZSaSi2ltlJHqavcqtRT6isNlIZKI6Wx0kRpqjRTmistlNuUlsrtyh3KnUor5S6ltdJGaau0U9ordysdlI5KJ+UepbPSRemqdFO6Kz2Unkovpbdyr9JH6av0U/orA5T7lIHKIOV+5QFlsPKg8pDysDJEGaoMUx5RhiuPKiOUkcpjyihltPK48oTypDJGeUoZq4xTxisTlInK08okZbIyRXlGmao8q0xTpiszlJnKLOU5ZbbyvDJHeUGZq8xT5isLlIXKi8oi5SVlsbJEWaq8rCxTlisrlFeUV5XXlJXK68obypvKKuUtZbXytrJGWau8o6xT3lXWK+8pG5SNyvvKJmWz8oGyRflQ2apsU7YrO5Sdyi5lt7JH2avsUz5S9isfKweUg8oh5RPlsPKpckT5TDmqfK4cU75QjitfKieUr5STytfKKeUb5bTyrfKd8r1yRvlBOaucU84rPyoXlJ+Ui8rPyiXlskJURmVVTuVVQRVVSZVVRVVVTdVVQzVVS7VVR3VVT/XVQA3ViJpFzapmU7OrOdScai41t5qg5lHzqvnURDW/WkAtqBZSC6tF1OvUomoxtbh6vVpCLamWUkurZdQb1LLqjWo5tbx6k1pBvVmtqN6iVlIrq1XUqmo1tbpaQ62p1lJrq3XUuuqtaj21vtpAbag2UhurTdSmajO1udpCvU1tqd6u3qHeqbZS71Jbq23Utmo7tb16t9pB7ah2Uu9RO6td1K5qN7W72kPtqfZSe6v3qn3Uvmo/tb86QL1PHagOUu9XH1AHqw+qD6kPq0PUoeow9RF1uPqoOkIdqT6mjlJHq4+rT6hPqmPUp9Sx6jh1vDpBnag+rU5SJ6tT1GfUqeqz6jR1ujpDnanOUp9TZ6vPq3PUF9S56jx1vrpAXai+qC5SX1IXq0vUperL6jJ1ubpCfUV9VX1NXam+rr6hvqmuUt9SV6tvq2vUteo76jr1XXW9+p66Qd2ovq9uUjerH6hb1A/Vreo2dbu6Q92p7lJ3q3vUveo+9SN1v/qxekA9qB5SP1EPq5+qR9TP1KPq5+ox9Qv1uPqlekL9Sj2pfq2eUr9RT6vfqt+p36tn1B/Us+o59bz6o3pB/Um9qP6sXlIvq0RjNFbjNF4TNFGTNFlTNFXTNF0zNFOzNFtzNFfzNF8LtFCLaFm0rFo2LbuWQ8up5dJyawlaHi2vlk9L1PJrBbSCWiGtsFZEu04rqhXTimvXayW0kloprbRWRrtBK6vdqJXTyms3aRW0m7WK2i1aJa2yVkWrqlXTqms1tJpaLa22Vkerq92q1dPqaw20hlojrbHWRGuqNdOaay2027SW2u3aHdqdWivtLq211kZrq7XT2mt3ax20jlon7R6ts9ZF66p107prPbSeWi+tt3av1kfrq/XT+msDtPu0gdog7X7tAW2w9qD2kPawNkQbqg3THtGGa49qI7SR2mPaKG209rj2hPakNkZ7ShurjdPGaxO0idrT2iRtsjZFe0abqj2rTdOmazO0mdos7Tlttva8Nkd7QZurzdPmawu0hdqL2iLtJW2xtkRbqr2sLdOWayu0V7RXtde0ldrr2hvam9oq7S1ttfa2tkZbq72jrdPe1dZr72kbtI3a+9ombbP2gbZF+1Dbqm3Ttms7tJ3aLm23tkfbq+3TPtL2ax9rB7SD2iHtE+2w9ql2RPtMO6p9rh3TvtCOa19qJ7SvtJPa19op7RvttPat9p32vXZG+0E7q53Tzms/ahe0n7SL2s/aJe2yRnRGZ3VO53VBF3VJl3VFV3VN13VDN3VLt3VHd3VP9/VAD/WInkXPqmfTs+s59Jx6Lj23nqDn0fPq+fREPb9eQC+oF9IL60X06/SiejG9uH69XkIvqZfSS+tl9Bv0svqNejm9vH6TXkG/Wa+o36JX0ivrVfSqejW9ul5Dr6nX0mvr
dfS6+q16Pb2+3kBvqDfSG+tN9KZ6M7253kK/TW+p367fod+pt9Lv0lvrbfS2eju9vX633kHvqHfS79E76130rno3vbveQ++p99J76/fqffS+ej+9vz5Av08fqA/S79cf0AfrD+oP6Q/rQ/Sh+jD9EX24/qg+Qh+pP6aP0kfrj+tP6E/qY/Sn9LH6OH28PkGfqD+tT9In61P0Z/Sp+rP6NH26PkOfqc/Sn9Nn68/rc/QX9Ln6PH2+vkBfqL+oL9Jf0hfrS/Sl+sv6Mn25vkJ/RX9Vf01fqb+uv6G/qa/S39JX62/ra/S1+jv6Ov1dfb3+nr5B36i/r2/SN+sf6Fv0D/Wt+jZ9u75D36nv0nfre/S9+j79I32//rF+QD+oH9I/0Q/rn+pH9M/0o/rn+jH9C/24/qV+Qv9KP6l/rZ/Sv9FP69/q3+nf62f0H/Sz+jn9vP6jfkH/Sb+o/6xf0i/rxGAM1uAM3hAM0ZAM2VAM1dAM3TAM07AM23AM1/AM3wiM0IgYWYysRjYju5HDyGnkMnIbCUYeI6+Rz0g08hsFjIJGIaOwUcS4zihqFDOKG9cbJYySRimjtFHGuMEoa9xolDPKGzcZFYybjYrGLUYlo7JRxahqVDOqGzWMmkYto7ZRx6hr3GrUM+obDYyGRiOjsdHEaGo0M5obLYzbjJbG7cYdxp1GK+Muo7XRxmhrtDPaG3cbHYyORifjHqOz0cXoanQzuhs9jJ5GL6O3ca/Rx+hr9DP6GwOM+4yBxiDjfuMBY7DxoPGQ8bAxxBhqDDMeMYYbjxojjJHGY8YoY7TxuPGE8aQxxnjKGGuMM8YbE4yJxtPGJGOyMcV4xphqPGtMM6YbM4yZxizjOWO28bwxx3jBmGvMM+YbC4yFxovGIuMlY7GxxFhqvGwsM5YbK4xXjFeN14yVxuvGG8abxirjLWO18baxxlhrvGOsM9411hvvGRuMjcb7xiZjs/GBscX40NhqbDO2GzuMncYuY7exx9hr7DM+MvYbHxsHjIPGIeMT47DxqXHE+Mw4anxuHDO+MI4bXxonjK+Mk8bXxinjG+O08a3xnfG9ccb4wThrnDPOGz8aF4yfjIvGz8Yl47JBTMZkTc7kTcEUTcmUTcVUTc3UTcM0Tcu0Tcd0Tc/0zcAMzYiZxcxqZjOzmznMnGYuM7eZYOYx85r5zEQzv1nALGgWMgubRczrzKJmMbO4eb1ZwixpljJLm2XMG8yy5o1mObO8eZNZwbzZrGjeYlYyK5tVzKpmNbO6WcOsadYya5t1zLrmrWY9s77ZwGxoNjIbm03MpmYzs7nZwrzNbGnebt5h3mm2Mu8yW5ttzLZmO7O9ebfZwexodjLvMTubXcyuZjezu9nD7Gn2Mnub95p9zL5mP7O/OcC8zxxoDjLvNx8wB5sPmg+ZD5tDzKHmMPMRc7j5qDnCHGk+Zo4yR5uPm0+YT5pjzKfMseY4c7w5wZxoPm1OMiebU8xnzKnms+Y0c7o5w5xpzjKfM2ebz5tzzBfMueY8c765wFxovmguMl8yF5tLzKXmy+Yyc7m5wnzFfNV8zVxpvm6+Yb5prjLfMlebb5trzLXmO+Y6811zvfmeucHcaL5vbjI3mx+YW8wPza3mNnO7ucPcae4yd5t7zL3mPvMjc7/5sXnAPGgeMj8xD5ufmkfMz8yj5ufmMfML87j5pXnC/Mo8aX5tnjK/MU+b35rfmd+bZ8wfzLPmOfO8+aN5wfzJvGj+bF4yL5vEYizW4izeEizRkizZUizV0izdMizTsizbcizX8izfCqzQilhZLN7KZmW3clg5rVxWbivBymPltfJZiVZ+q4BV0CpkFbaKWNdZRa1iVnHrequEVdIqZZW2ylg3WGWtG61yVnnrJquCdbNV0brFqmRVtqpYVa1qVnWrhlXTqmXVtupYda1brXpWfauB1dBqZDW2mlhNrWZWc6uFdZvV0rrdusO602pl3WW1ttpYba12VnvrbquD1dHqZN1jdba6
WF2tblZ3q4fV0+pl9bbutfpYfa1+Vn9rgHWfNdAaZN1vPWANth60HrIetoZYQ61h1iPWcOtRa4Q10nrMGmWNth63nrCetMZYT1ljrXHWeGuCNdF62ppkTbamWM9YU61nrWnWdGuGNdOaZT1nzbaet+ZYL1hzrXnWfGuBtdB60VpkvWQttpZYS62XrWXWcmuF9Yr1qvWatdJ63XrDetNaZb1lrbbettZYa613rHXWu9Z66z1rg7XRet/aZG22PrC2WB9aW61t1nZrh7XT2mXttvZYe6191kfWfutj64B10DpkfWIdtj61jlifWUetz61j1hfWcetL64T1lXXS+to6ZX1jnba+tb6zvrfOWD9YZ61z1nnrR+uC9ZN10frZumRdtojN2KzN2bwt2KIt2bKt2Kqt2bpt2KZt2bbt2K7t2b4d2KEdsbPYWe1sdnY7h53TzmXnthPsPHZeO5+daOe3C9gF7UJ2YbuIfZ1d1C5mF7evt0vYJe1Sdmm7jH2DXda+0S5nl7dvsivYN9sV7VvsSnZlu4pd1a5mV7dr2DXtWnZtu45d177VrmfXtxvYDe1GdmO7id3UbmY3t1vYt9kt7dvtO+w77Vb2XXZru43d1m5nt7fvtjvYHe1O9j12Z7uL3dXuZne3e9g97V52b/teu4/d1+5n97cH2PfZA+1B9v32A/Zg+0H7Ifthe4g91B5mP2IPtx+1R9gj7cfsUfZo+3H7CftJe4z9lD3WHmePtyfYE+2n7Un2ZHuK/Yw91X7WnmZPt2fYM+1Z9nP2bPt5e479gj3XnmfPtxfYC+0X7UX2S/Zie4m91H7ZXmYvt1fYr9iv2q/ZK+3X7TfsN+1V9lv2avtte4291n7HXme/a6+337M32Bvt9+1N9mb7A3uL/aG91d5mb7d32DvtXfZue4+9195nf2Tvtz+2D9gH7UP2J/Zh+1P7iP2ZfdTuxSedz/GlfcL+yj5pf22fsr+xT9vf2t/Z39tn7B/ss/Y5+7z9o33B/sm+aP9sX7Iv28RhHNbhHN4RHNGRHNlRHNXRHN0xHNOxHNtxHNfxHN8JnNCJOFmcrE42J7uTw8np5HJyOwlOHievk89JdPI7BZyCTiGnsFPEuc4p6hRzijvXOyWckk4pp7RTxrnBKevc6JRzyjs3ORWcm52Kzi1OJaeyU8Wp6lRzqjs1nJpOLae2U8ep69zq1HPqOw2chk4jp7HTxGnqNHOaOy2c25yWzu3OHc6dTivnLqe108Zp67Rz2jt3Ox2cjk4n5x6ns9PF6ep0c7o7PZyeTi+nt3Ov08fp6/Rz+jsDnPucgc4g537nAWew86DzkPOwM8QZ6gxzHnGGO486I5yRzmPOKGe087jzhPOkM8Z5yhnrjHPGOxOcic7TziRnsjPFecaZ6jzrTHOmOzOcmc4s5zlntvO8M8d5wZnrzHPmOwuchc6LziLnJWexs8RZ6rzsLHOWOyucV5xXndeclc7rzhvOm84q5y1ntfO2s8ZZ67zjrHPeddY77zkbnI3O+84mZ7PzgbPF+dDZ6mxztjs7nJ3OLme3s8fZ6+xzPnL2Ox87B5yDziHnE+ew86lzxPnMOep87hxzvnCOO186J5yvnJPO184p5xvntPOt853zvXPG+cE565xzzjs/Ohecn5yLzs/OJeeyQ1zGZV3O5V3BFV3JlV3FVV3N1V3DNV3LtV3HdV3P9d3ADd2Im8XN6mZzs7s53JxuLje3m+DmcfO6+dxEN79bwC3oFnILu0Xc69yibjG3uHu9W8It6ZZyS7tl3Bvcsu6Nbjm3vHuTW8G92a3o3uJWciu7VdyqbjW3ulvDrenWcmu7ddy67q1uPbe+28Bt6DZyG7tN3KZuM7e528K9zW3p3u7e4d7ptnLvclu7bdy2bju3vXu328Ht6HZy73E7u13crm43t7vbw+3p9nJ7u/e6fdy+bj+3vzvAvc8d6A5y73cfcAe7D7oPuQ+7Q9yh7jD3EXe4+6g7wh3pPuaOcke7
j7tPuE+6Y9yn3LHuOHe8O8Gd6D7tTnInu1PcZ9yp7rPuNHe6O8Od6c5yn3Nnu8+7c9wX3LnuPHe+u8Bd6L7oLnJfche7S9yl7svuMne5u8J9xX3Vfc1d6b7uvuG+6a5y33JXu2+7a9y17jvuOvddd737nrvB3ei+725yN7sfuFtcmWx1t7nb3R3uTneXu9vd4+5197kfufvdj90D7kH3kPuJe9j91D3ifuYedT93j7lfuMfdL90T7lfuSfdr95T7jXva/db9zv3ePeP+4J51z7nn3R/dC+5P7kX3Z/eSe9klHuOxHufxnuCJnuTJnuKpnubpnuGZnuXZnuO5nuf5XuCFXsTL4mX1snnZvRxeTi+Xl9tL8PJ4eb18XqKX3yvgFfQKeYW9It51XlGvmFfcu94r4ZX0SnmlvTLeDV5Z70avnFfeu8mr4N3sVfRu8Sp5lb0qXlWvmlfdq+HV9Gp5tb06Xl3vVq+eV99r4DX0GnmNvSZeU6+Z19xr4d3mtfRu9+7w7vRaeXd5rb02Xluvndfeu9vr4HX0Onn3eJ29Ll5Xr5vX3evh9fR6eb29e70+Xl+vn9ffG+Dd5w30Bnn3ew94g70HvYe8h70h3lBvmPeIN9x71BvhjfQe80Z5o73HvSe8J70x3lPeWG+cN96b4E30nvYmeZO9Kd4z3lTvWW+aN92b4c30ZnnPebO957053gveXG+eN99b4C30XvQWeS95i70l3lLvZW+Zt9xb4b3iveq95q30Xvfe8N70Vnlveau9t7013lrvHW+d96633nvP2+Bt9N73NnmbvQ+8Ld6H3lZvm7fd2+Ht9HZ5u7093l5vn/eRt9/72DvgHfQOeZ94h71PvSPeZ95R73PvmPeFd9z70jvhfeWd9L72TnnfeKe9b73vvO+9M94P3lnvnHfe+9G74P3kXfR+9i55lz3iMz7rcz7vC77oS77sK77qa77uG77pW77tO77re77vB37oR/wsflY/m5/dz+Hn9HP5uf0EP4+f18/nJ/r5/QJ+Qb+QX9gv4l/nF/WL+cX96/0Sfkm/lF/aL+Pf4Jf1b/TL+eX9m/wK/s1+Rf8Wv5Jf2a/iV/Wr+dX9Gn5Nv5Zf26/j1/Vv9ev59f0GfkO/kd/Yb+I39Zv5zf0W/m1+S/92/w7/Tr+Vf5ff2m/jt/Xb+e39u/0Ofke/k3+P39nv4nf1u/nd/R5+T7+X39u/1+/j9/X7+f39Af59/kB/kH+//4A/2H/Qf8h/2B/iD/WH+Y/4w/1H/RH+SP8xf5Q/2n/cf8J/0h/jP+WP9cf54/0J/kT/aX+SP9mf4j/jT/Wf9af50/0Z/kx/lv+cP9t/3p/jv+DP9ef58/0F/kL/RX+R/5K/2F/iL/Vf9pf5y/0V/iv+q/5r/kr/df8N/01/lf+Wv9p/21/jr/Xf8df57/rr/ff8Df5G/31/k7/Z/8Df4n/ob/W3+dv9Hf5Of5e/29/j7/X3+R/5+/2P/QP+Qf+Q/4l/2P/UP5L7M/+o/7l/zP/CP+5/6Z/wv/JP+l/7p/xv/NP+t/53/vf+Gf8H/6x/zj/v/+hf8H/yL/o/+5f8yz4JmIANuIAPhEAMpEAOlEANtEAPjMAMrMAOnMANvMAPgiAMIkGWIGuQLcge5AhyBrmC3EFCkCfIG+QLEoP8QYGgYFAoKBwUCa4LigbFguLB9UGJoGRQKigdlAluCMoGNwblgvLBTUGF4OagYnBLUCmoHFQJqgbVgupBjaBmUCuoHdQJ6ga3BvWC+kGDoGHQKGgcNAmaBs2C5kGL4LagZXB7cEdwZ9AquCtoHbQJ2gbtgvbB3UGHoGPQKbgn6Bx0CboG3YLuQY+gZ9Ar6B3cG/QJ+gb9gv7BgOC+YGAwKLg/eCAYHDwYPBQ8HAwJhgbDgkeC4cGjwYhgZPBYMCoYHTwePBE8GYwJngrGBuOC8cGEYGLwdDApmBxMCZ4JpgbPBtOC6cGMYGYwK3gumB08H8wJXgjmBvOC+cGCYGHw
YrAoeClYHCwJlgYvB8uC5cGK4JXg1eC1YGXwevBG8GawKngrWB28HawJ1gbvBOuCd4P1wXvBhmBj8H6wKdgcfBBsCT4Mtgbbgu3BjmBnsCvYHewJ9gb7go+C/cHHwYHgYHAo+CQ4HHwaHAk+C44GnwfHgi+C48GXwYngq+Bk8HVwKvgmOB18G3wXfB+cCX4IzgbngvPBj8GF4KfgYvBzcCm4HJCQCdmQC/lQCMVQCuVQCdVQC/XQCM3QCu3QCd3QC/0wCMMwEmYJs4bZwuxhjjBnmCvMHSaEecK8Yb4wMcwfFggLhoXCwmGR8LqwaFgsLB5eH5YIS4alwtJhmfCGsGx4Y1guLB/eFFYIbw4rhreElcLKYZWwalgtrB7WCGuGtcLaYZ2wbnhrWC+sHzYIG4aNwsZhk7Bp2CxsHrYIbwtbhreHd4R3hq3Cu8LWYZuwbdgubB/eHXYIO4adwnvCzmGXsGvYLewe9gh7hr3C3uG9YZ+wb9gv7B8OCO8LB4aDwvvDB8LB4YPhQ+HD4ZBwaDgsfCQcHj4ajghHho+Fo8LR4ePhE+GT4ZjwqXBsOC4cH04IJ4ZPh5PCyeGU8JlwavhsOC2cHs4IZ4azwufC2eHz4ZzwhXBuOC+cHy4IF4YvhovCl8LF4ZJwafhyuCxcHq4IXwlfDV8LV4avh2+Eb4arwrfC1eHb4ZpwbfhOuC58N1wfvhduCDeG74ebws3hB+GW8MNwa7gt3B7uCHeGu8Ld4Z5wb7gv/CjcH34cHggPhofCT8LD4afhkfCz8Gj4eXgs/CI8Hn4Zngi/Ck+GX4enwm/C0+G34Xfh9+GZ8IfwbHguPB/+GF4Ifwovhj+Hl8LLIYkwETbCRfiIEBEjUkSOKBE1okX0iBExI1bEjjgRN+JF/EgQCSORSJZI1ki2SPZIjkjOSK5I7khCJE8kbyRfJDGSP1IgUjBSKFI4UiRyXaRopFikeOT6SIlIyUipSOlImcgNkbKRGyPlIuUjN0UqRG6OVIzcEqkUqRypEql64XKkeqRGpGakVqR2pE6kbuTWSL1I/cj/aLmrGD9i9l/sG2aGYWbPjIdnfmFmZmZmZmZmZmZm2jAnG2Zm3jAnm6R6/zqtdPpeVKpa331k2Xq+lm984aciUgmpjFRBqiLVkOpIDaQmUgupjdRB6iL1kPpIA6Qh0ghpjDRBmiLNkOZIC6Ql0gppjbRB2iLtkPZIB6Qj0gnpjHRBuiLdkO5ID6Qn0gvpjfRB+iL9kP7IAGQgMggZjAxBhiLDkOHICGQkMgoZjYxBxiLjkPHIBGQiMgmZjExBpiLTkOnIDGQmMguZjcxB5iLzkPnIAmQhsghZjCxBliLLkOXICmQlsgpZjaxB1iLrkPXIBmQjsgnZjGxBtiLbkO3IDmQnsgvZjexB9iL7kHhkP3IAOYgcQg4jR5CjyDHkOHICOYmcQk4jZ5CzSAJyDjmPXEAuIpeQy8gV5CpyDbmO3EBuIreQ28gd5C5yD7mPPEAeIo+Qx8gT5CnyDHmOvEBeIq+Q18gbJBF5i7xD3iMfkI/IJ+Qz8gX5inxDviM/kJ/IL+Q3koT8Qf4i/5A4NBmaHE2BpkRToanRNGhaNB2aHs2AZkQzoZnRLGhWNBuaHc2B5kRzoblRBEVRDMVRAiVRCqVRBmVRDuVRARVRCZVRBVVRDQWojhqoiULUQm3UQV3UQ300QEM0QmNoHjQvmg/NjxZAC6KF0MJoEbQoWgwtjpZAS6Kl0NJoGbQsWg4tj1ZAK6KV0MpoFbQqWg2tjtZAa6K10NpoHbQuWg+tjzZAG6KN0MZoE7Qp2gxtjrZAW6Kt0NZoG7Qt2g5tj3ZAO6Kd0M5oF7Qr2g3tjvZAe6K90N5oH7Qv2g/tjw5AB6KD0MHoEHQoOgwdjo5AR6Kj0NHoGHQsOg4dj05AJ6KT0MnoFHQqOg2djs5AZ6Kz0NnoHHQuOg+djy5AF6KL0MXoEnQpugxdjq5AV6Kr0NXoGnQtug5dj25A
N6Kb0M3oFnQrug3dju5Ad6K70N3oHnQvug+NR/ejB9CD6CH0MHoEPYoeQ4+jJ9CT6Cn0NHoGPYsmoOfQ8+gF9CJ6Cb2MXkGvotfQ6+gN9CZ6C72N3kHvovfQ++gD9CH6CH2MPkGfos/Q5+gL9CX6Cn2NvkET0bfoO/Q9+gH9iH5CP6Nf0K/oN/Q7+gP9if5Cf6NJ6B/0L/oPjcOSYcmxFFhKLBWWGkuDpcXSYemxDFhGLBOWGcuCZcWyYdmxHFhOLBeWG0MwFMMwHCMwEqMwGmMwFuMwHhMwEZMwGVMwFdMwgOmYgZkYxCzMxhzMxTzMxwIsxCIshuXB8mL5sPxYAawgVggrjBXBimLFsOJYCawkVgorjZXBymLlsPJYBawiVgmrjFXBqmLVsOpYDawmVgurjdXB6mL1sPpYA6wh1ghrjDXBmmLNsOZYC6wl1gprjbXB2mLtsPZYB6wj1gnrjHXBumLdsO5YD6wn1gvrjfXB+mL9sP7YAGwgNggbjA3BhmLDsOHYCGwkNgobjY3BxmLjsPHYBGwiNgmbjE3BpmLTsOnYDGwmNgubjc3B5mLzsPnYAmwhtghbjC3BlmLLsOXYCmwltgpbja3B1mLrsPXYBmwjtgnbjG3BtmLbsO3YDmwntgvbje3B9mL7sHhsP3YAO4gdwg5jR7Cj2DHsOHYCO4mdwk5jZ7CzWAJ2DjuPXcAuYpewy9gV7Cp2DbuO3cBuYrew29gd7C52D7uPPcAeYo+wx9gT7Cn2DHuOvcBeYq+w19gbLBF7i73D3mMfsI/YJ+wz9gX7in3DvmM/sJ/YL+w3loT9wf5i/7A4PBmeHE+Bp8RT4anxNHhaPB2eHs+AZ8Qz4ZnxLHhWPBueHc+B58Rz4blxBEdxDMdxAidxCqdxBmdxDudxARdxCZdxBVdxDQe4jhu4iUPcwm3cwV3cw308wEM8wmN4Hjwvng/PjxfAC+KF8MJ4EbwoXgwvjpfAS+Kl8NJ4GbwsXg4vj1fAK+KV8Mp4FbwqXg2vjtfAa+K18Np4HbwuXg+vjzfAG+KN8MZ4E7wp3gxvjrfAW+Kt8NZ4G7wt3g5vj3fAO+Kd8M54F7wr3g3vjvfAe+K98N54H7wv3g/vjw/AB+KD8MH4EHwoPgwfjo/AR+Kj8NH4GHwsPg4fj0/AJ+KT8Mn4FHwqPg2fjs/AZ+Kz8Nn4HHwuPg+fjy/AF+KL8MX4Enwpvgxfjq/AV+Kr8NX4Gnwtvg5fj2/AN+Kb8M34Fnwrvg3fju/Ad+K78N34Hnwvvg+Px/fjB/CD+CH8MH4EP4ofw4/jJ/CT+Cn8NH4GP4sn4Ofw8/gF/CJ+Cb+MX8Gv4tfw6/gN/CZ+C7+N38Hv4vfw+/gD/CH+CH+MP8Gf4s/w5/gL/CX+Cn+Nv8ET8bf4O/w9/gH/iH/CP+Nf8K/4N/w7/gP/if/Cf+NJ+B/8L/4PjyOSEcmJFERKIhWRmkhDpCXSEemJDERGIhORmchCZCWyEdmJHEROIheRm0AIlMAInCAIkqAImmAIluAInhAIkZAImVAIldAIQOiEQZgEJCzCJhzCJTzCJwIiJCIiRuQh8hL5iPxEAaIgUYgoTBQhihLFiOJECaIkUYooTZQhyhLliPJEBaIiUYmoTFQhqhLViOpEDaImUYuoTdQh6hL1iPpEA6Ih0YhoTDQhmhLNiOZEC6Il0YpoTbQh2hLtiPZEB6Ij0YnoTHQhuhLdiO5ED6In0YvoTfQh+hL9iP7EAGIgMYgYTAwhhhLDiOHECGIkMYoYTYwhxhLjiPHEBGIiMYmYTEwhphLTiOnEDGImMYuYTcwh5hLziPnEAmIhsYhYTCwhlhLLiOXECmIlsYpYTawh1hLriPXEBmIjsYnYTGwhthLbiO3EDmInsYvYTewh9hL7iHhiP3GAOEgcIg4TR4ijxDHiOHGCOEmcIk4TZ4izRAJxjjhPXCAuEpeIy8QV4ipxjbhO3CBuEreI28Qd4i5xj7hPPCAeEo+Ix8QT
4inxjHhOvCBeEq+I18QbIpF4S7wj3hMfiI/EJ+Iz8YX4SnwjvhM/iJ/EL+I3kUT8If4S/4g4MhmZnExBpiRTkanJNGRaMh2ZnsxAZiQzkZnJLGRWMhuZncxB5iRzkblJhERJjMRJgiRJiqRJhmRJjuRJgRRJiZRJhVRJjQSkThqkSULSIm3SIV3SI30yIEMyImNkHjIvmY/MTxYgC5KFyMJkEbIoWYwsTpYgS5KlyNJkGbIsWY4sT1YgK5KVyMpkFbIqWY2sTtYga5K1yNpkHbIuWY+sTzYgG5KNyMZkE7Ip2YxsTrYgW5KtyNZkG7It2Y5sT3YgO5KdyM5kF7Ir2Y3sTvYge5K9yN5kH7Iv2Y/sTw4gB5KDyMHkEHIoOYwcTo4gR5KjyNHkGHIsOY4cT04gJ5KTyMnkFHIqOY2cTs4gZ5KzyNnkHHIuOY+cTy4gF5KLyMXkEnIpuYxcTq4gV5KryNXkGnItuY5cT24gN5KbyM3kFnIruY3cTu4gd5K7yN3kHnIvuY+MJ/eTB8iD5CHyMHmEPEoeI4+TJ8iT5CnyNHmGPEsmkOfI8+QF8iJ5ibxMXiGvktfI6+QN8iZ5i7xN3iHvkvfI++QD8iH5iHxMPiGfks/I5+QL8iX5inxNviETybfkO/I9+YH8SH4iP5NfyK/kN/I7+YP8Sf4if5NJ5B/yL/mPjKOSUcmpFFRKKhWVmkpDpaXSUempDFRGKhOVmcpCZaWyUdmpHFROKheVm0IolMIonCIokqIommIoluIonhIokZIomVIoldIoQOmUQZkUpCzKphzKpTzKpwIqpCIqRuWh8lL5qPxUAaogVYgqTBWhilLFqOJUCaokVYoqTZWhylLlqPJUBaoiVYmqTFWhqlLVqOpUDaomVYuqTdWh6lL1qPpUA6oh1YhqTDWhmlLNqOZUC6ol1YpqTbWh2lLtqPZUB6oj1YnqTHWhulLdqO5UD6on1YvqTQ1Y1ZfqR/WnmhkDqUHUYGoINZQaRg2nRlAjqVHUaGoMNZYaR42nJlATqUnUZGoKNZWaRk2nZlAzqVnUbGoONZeaR82nFlALqUXUYmoJtZRaRi2nVlArqVXUamoNtZZaR62nNlAbqU3UZmoLtZXaRm2ndlA7qV3UbmoPtZfaR8VT+6kD1EHqEHWYOkIdpY5Rx6kT1EnqFHWaOkMxVAJ1jjpPXaAuUpeoy9QV6ip1jbpO3aBuUreo29Qd6i51j7pPPaAeUo+ox9QT6in1jHpOvaBeUq+o19QbKpF6S72j3lMfqI/UJ+oz9YX6Sn2jvlM/qJ/UL+o3lUT9of5S/6g4Oo5OTienU9Ip6dR0ajotnZZOT6enM9IZ6cx0ZjornZXOTmenc9I56dx0bhqlURqncZqkSfo/g6VZmqd5WqRFWqZlWqVVGtCANmiDhjSkbdqmXdqlfdqnQzqkY3SMzkvnpfPT+emCdEG6MF2YLkoXpYvTxemSdEm6NF2aLkuXpcvT5emKdEW6Ml2ZrkpXpavT1emadE26Nl2brkvXpevT9emGdEO6Md2Ybko3pZvTzemWdEu6Nd2abku3pdvT7emOdEe6M92Z7kp3pbvT3emedE+6N92b7kv3pfvT/emB9EB6MD2YHkoPpYfTw+mR9Eh6ND2aHkuPpcfT4+mJ9ER6Mj2ZnkpPpafT0+mZ9Cx6Nj2HnkvPo+fTC+iF9CJ6Mb2YXkovpZfTy+mV9Ep6Nb2aXkuvpdfT6+mN9EZ6M72Z3kpvpbfT2+md9E56N72b3kvvpePpePoAfYA+RB+ij9BH6GP0MfoEfYI+RZ+iz9Bn6AQ6gT5Pn6cv0hfpy/Rl+ip9lb5OX6dv0jfp2/Rt+i59l75P36cf0g/px/Rj+in9lH5OP6df0i/p1/RrOpFOpN/R7+gP9Af6E/2J/kJ/ob/R3+gf9A/6F/2LTqKT6L/0Xzotk45Jz2RgMjKZmMxMFub/bpTBGJwhGJKhmNwM8r+ZZhhGZTQGMDpjMCYDGeu/HGPy
MHmZfEx+pgBTkCn0Xy7NlGHKMuWY8kwFpiRT6n9zRaYSU5mpw1Rl6jHVmQZMTaYRU5upw9Rl6jH1mQZMQ6YR05ppw7Rl2jHtmQ5MR6bTf3kvs485xZxmzjBnmdvMHeYH85N5zbxhfjG/mYHMIGY0M4YZy4xjxjMTmInMpP/ybGYOM5eZx8xnFjALmUX/5dXMGmYts45Zz2xgNjKb/su7mT3MViae2c7sYHYyu/7H/6kpntnPHGAOMoeYw8wRJoE5xhxnTjAn/69aE5hzzHnmAnOTucVcZq4wV5lrzHXmxv/4PznuMveY+8wD5iXzinnMPGGeMonMc+bF//g/+RKZt8w75j3zgfnIfGKSmC/MV+Yb8/1/8v8nexLzh/nL/GPi2GRscjYFm5JNxaZm07Bp2XRsejYDm5HNxGZms7BZ2WxsdjYHm5PNxeZmERZlMRZnCZZkKZZm/8XFxXEszwqsyEqszCqsymosYHXWYE0WshZrsw7rsh7rswEbshEbY/Owedl8bH62AFuQLcQWZouwRdlibHG2BFuSLcWWZsuwZdlybHm2AluRrcRWZquwVdlqbHW2BluTrcXWZuuwddl6bH22AduQbcQ2ZpuwTdlmbHO2BduSbcW2Ztuwbdl2bHu2A9uR7cR2ZruwXdlubHe2B9uT7cX2Zvuwfdl+bH92ADuQHcQOZoewQ9lh7HB2BDuSHcWOZsewY9lx7Hh2AjuRncROZqewU9lp7HR2BjuTncXOZuewc9l57Hx2AbuQXcQuZpewS9ll7HJ2BbuSXcWuZtewa9l17Hp2A7uR3cRuZrewW9lt7HZ2B7uT3cXuZvewe9l9bDy7nz3AHmQPsYfZI+xR9hh7nD3BnmRPsafZM+xZNoE9x55nL7AX2UvsZfYKe5W9xl5nb7A32VvsbfYOe5e9x95nH7AP2UfsY/YJ+5R9xj5nX7Av2Vfsa/YNm8i+Zd+x79kP7Ef2E/uZ/cJ+Zb+x39kf7E/2F/ubTWL/sH/Zf2wcl4xLzqXgUnKpuNRcGi4tl45Lz2XgMnKZuMxcFi4rl43LzuXgcnK5uNwcwqEcxuEcwZEcxdEcw7Ecx/GcwImcxMmcwqmcxgFO5wzO5CBncTbncC7ncT4XcCEXcTEuD5eXy8fl5wpwBblCXGGuCFeUK8YV50pwJblSXGmuDFeWK8eV5ypwLleJS/2/ulRU52pwNblaXG2uDleXq8fV5xpwDblGXGOuCdeUa8Y151pwLblWXGuuDdeWa8e15zpwHblO/4/zg7kh3FBuGDeMG8GN5EZxo7kx3FhuHDeem8BN5CZxk7kp3FRuGjedm8HN5GZxs7k53FxuHjefW8At5BZxi7kl3FJuGbecW8Gt5FZxq7k13FpuHbee28Bt5DZxm7kt3FZuG7ed28Ht5HZxu7k93F5uHxfP7ecOcAe5Q9xh7gh3lDvGHedOcCe5U9xp7gx3lkvgznHnuQvcRe4Sd5m7wl3lrnHXuRvcTe4Wd5u7w93l7nH3uQfcQ+4R95h7wj3lnnHPuRfcS+4V95p7wyVyb7l33HvuA/eR+8R95r5wX7lv3HfuB/eT+8X95pK4P9xf7h8Xxyfjk/Mp+JR8Kj41n4ZPy6fj0/MZ+Ix8Jj4zn4XPymfjs/M5+Jx8Lj43j/Aoj/E4T/AkT/E0z/Asz/E8L/AiL/Eyr/Aqr/GA13mDN3nIW7zNO7zLe7zPB3zIR3yMz8Pn5fPx+fkCfEG+EF+YL8IX5YvxxfkSfEm+FF+aL8OX5cvx5fkKfEW+El+Zr8JX5avx1fkafE2+Fl+br8PX5evx9fkGfEO+Ed+Yb8I35ZvxzfkWfEu+Fd+ab8O35dvx7fkOfEe+E9+Z78J35bvx3fkefE++F9+b78P35fvx/VPF8QP5Qfxgfgg/lB/GD+dH8CP5Ufxofgw/lh/Hj+cn8BP5Sfxkfgo/lZ/GT+dn8DP5Wfxsfg4/l5/Hz+cX8Av5Rfxifgm/lF/GL+dX8Cv5Vfxqfg2/ll/H
r+c38Bv5Tfxmfgu/ld/Gb+d38Dv5Xfxufg+/l9/Hx/P7+QP8Qf4Qf5g/wh/lj/HH+RP8Sf4Uf5o/w5/lE/hz/Hn+An+Rv8Rf5q/wV/lr/HX+Bn+Tv8Xf5u/wd/l7/H3+Af+Qf8Q/5p/wT/ln/HP+Bf+Sf8W/5t/wifxb/h3/nv/Af+Q/8Z/5L/xX/hv/nf/B/+R/8b/5JP4P/5f/x8cJyYTkQgohpZBKSC2kEdIK6YT0QgYho5BJyCxkEbIK2YTsQg4hp5BLyC0gAipgAi4QAilQAi0wAitwAi8IgihIgiwogipoAhB0wRBMAQqWYAuO4Aqe4AuBEAqREBPyCHmFfEJ+oYBQUCgkFBaKCEWFYkJxoYRQUigllBbK/J/daIWKQiWhslBFqCpUE6oLNYSaQi2htlBHqCvUE+oLDYSGQiOhsdBEaCo0E5oLLYSWQiuhtdBGaCu0E9oLHYSOQiehs9BF6Cp0E7oLPYSeQi+ht9BH6Cv0E/oLA4SBwiBhsDBEGCoME4YLI4SRwihhtDBGGCuME8YLE4SJwiRhsjBFmCpME6YLM4SZwixhtjBHmCvME+YLC4SFwiJhsbBEWCosE5YLK4SVwiphtbBGWCusE9YLG4SNwiZhs7BF2CpsE7YLO4Sdwi5ht7BH2CvsE+KF/cIB4aBwSDgsHBGOCseE48IJ4aRwSjgtnBHOCgnCOeG8cEG4KFwSLgtXhKvCNeG6cEO4KdwSbgt3hLvCPeG+8EB4KDwSHgtPhKfCM+G58EJ4KbwSXgtvhEThrfBOeC98ED4Kn4TPwhfhq/BN+C78EH4Kv4TfQpLwR/gr/BPixGRicjGFmFJMJaYW04hpxXRiejGDmFHMJGYWs4hZxWxidjGHmFPMJeYWEREVMREXCZEUKZEWGZEVOZEXBVEUJVEWFVEVNRGIumiIpghFS7RFR3RFT/TFQAzFSIyJecS8Yj4xv1hALCgWEguLRcSiYjGxuFhCLCmWEkuLZcSyYjmxvFhBrChWEiuLVcSqYjWxulhDrCnWEmuLdcS6Yj2xvthAbCg2EhuLTcSmYjOxudgieUuxldhabCO2FduJ7cUOYkexk9hZ7CJ2FbuJ3cUeYk+xl9hb7CP2FfuJ/cUB4kBxkDhYHCIOFYeJw8UR4khxlDhaHCOOFceJ48UJ4kRxkjhZnCJOFaeJ08UZ4kxxljhbnCPOFeeJ88UF4kJxkbhYXCIuFZeJy8UV4krmf91ccZ24XtwgbhQ3iZvFLeJWcZu4Xdwh7hR3ibvFPeJecZ8YL+4XD4gHxUPiYfGIeFQ8Jh4XT4gnxVPiafGMeFZMEM+J58UL/6/WvBbfiIniW/Gd+F78IH4UP4mfxS/iV/Gb+F38If4Uf4m/xSTxj/hX/CfGScmk5FIKKaWUSkotpZHSSumk9FIGKaOUScosZZGyStmk7FIOKaeUS8otIRIqYRIuERIpURItMRIrcRIvCZIoSZIsKZIqaRKQdMmQTAlKlmRLjuRKnuRLgRRKkRST8kh5pXxSfqmAVFAqJBWWikhFU/yvc5NKSaWlMlJZqZxUXqogVZQqSZWlKlJVqZpUXaoh1ZRqSbWlOlJdqZ5UX2ogNZQaSY2lJlJTqZnUXGohtZRaSa2lNlJbqZ3UXuogdZQ6SZ2lLlJXqZvUXeoh9ZR6Sb2lPlJfqZ/UXxogDZQGSYOlIdJQaZg0XBohjZRGSaOlMdJYaZw0XpogTZQmSZOlKdJUaZo0XZohzZRmSbOlOdJcaZ40X1ogLZQWSYulJdJSaZm0XFohrZRWSaulNdJaaZ20XtogbZQ2SZulLdJWaZu0Xdoh7ZR2SbulPdJeaZ8UL+2XDkgHpUPSYemIdFQ6Jh2XTkgnpVPSaemMdFZKkM5J56UL0kXpknRZuiJdla5J16Ub0k3plnRbuiPdle5J96UH0kPpkfRYeiI9lZ5Jz6UX0kvplfRaeiMlSm+ld9J76YP0UfokfZa+SF+lb9J36Yf0U/ol/ZaSpD/SX+mf
FCcnk5PLKeSUcio5tZxGTiunk9PLGeSMciY5s5xFzipnk7PLOeScci45t4zIqIzJuEzIpEzJtMzIrMzJvCzIoizJsqzIqqzJQNZlQzZlKFuyLTuyK3uyLwdyKEdyTM4j55XzyfnlAnJBuZBcWC4iF5WLycXlEnJJuZRcWi4jl5XLyeXlCnJFuZJcWa4iV5WrydXlGnJNuZZcW64j15XryfXlBnJDuZHcWG4iN5Wbyc3lFnJLuZXcWm4jt5Xbye3lDnJHuZPcWe4id5W7yd3lHnJPuZfcW+4j95X7yf3lAfJAeZA8WB4iD5WHycPlEfJIeZQ8Wh4jj5XHyePlCfJEeZI8WZ4iT5WnydPlGfJMeZY8W54jz5XnyfPlBfJCeZG8WF4iL5WXycvlFfJKeZW8Wl4jr5XXyevlDfJGeZO8Wd4ib5W3ydvlHfJOeZe8W94j75X3yfHyfvmAfFA+JB+Wj8hH5WPycfmEfFI+JZ+Wz8hn5QT5nHxeviBflC/Jl+Ur8lX5mnxdviHflG/Jt+U78l35nnxffiA/lB/Jj+Un8lP5mfxcfiG/lF/Jr+U3cqL8Vn4nv5c/yB/lT/Jn+Yv8Vf4mf5d/yD/lX/JvOUn+I/+V/8lxSjIluZJCSamkUlIraZS0SjolvZJByahkUjIrWZSsSjYlu5JDyankUnIriIIqmIIrhEIqlEIrjMIqnMIrgiIqkiIriqIqmgIUXTEUU4GKpdiKo7iKp/hKoIRKpMSUPEpeJZ+SXymgFFQKKYWVIkpRpZhSXCmhlFRKKaWVMkpZpZxSXqmgVFQqKZWVKkpVpZpSXamh1FRqKbWVOkpdpZ5SX2mgNFQaKY2VJkpTpZnSXGmhtFRaKa2VNkpbpZ3SXumgdFQ6KZ2VLkpXpZvSXemh9FR6Kb2VPkpfpZ/SXxmgDFQGKYOVIcpQZZgyXBmhjFRGKaOVMcpYZZwyXpmgTFQmKZOVKcpUZZoyXZmhzFRmKbOVOcpcZZ4yX1mgLFQWKYuVJcpSZZmyXFmhrFRWKauVNcpaZZ2yXtmgbFQ2KZuVLcpWZZuyXdmh7FR2KbuVPcpeZZ8Sr+xXDigHlUPKYeWIclQ5phxXTignlVPKaeWMclZJUM4p55ULykXlknJZuaJcVa4p15Ubyk3llnJbuaPcVe4p95UHykPlkfJYeaI8VZ4pz5UXykvllfJaeaMkKm+Vd8p75YPyUfmkfFa+KF+Vb8p35YfyU/ml/FaSlD/KX+WfEqcmU5OrKdSUaio1tZpGTaumU9OrGdSMaiY1s5pFzapmU7OrOdScai41t4qoqIqpuEqopEqptMqorMqpvCqooiqpsqqoqqqpQNVVQzVVqFqqrTqqq3qqrwZqqEZqTM2j5lXzqfnVAmpBtZBaWC2iFlWLqcXVEmpJtZRaWi2jllXLqeXVCmpFtZJaWa2iVlWrqdXVGmpNtZZaW62j1lXrqfXVBmpDtZHaWG2iNlWbqc3VFmpLtZXaWm2jtlXbqe3VDmpHtZPaWe2idlW7qd3VHmpPtZfaW+2j9lX7qf3VAepAdZA6WB2iDlWHqcPVEepIdZQ6Wh2jjlXHqePVCepEdZI6WZ2iTlWnqdPVGepMdZY6W52jzlXnqfPVBepCdZG6WF2iLlWXqcvVFepKdZW6Wl2jrlXXqevVDepGdZO6Wd2iblW3qdvVHepOdZe6W92j7lX3qfHqfvWAelA9pB5Wj6hH1WPqcfWEelI9pZ5Wz6hn1QT1nHpevaBeVC+pl9Ur6lX1mnpdvaHeVG+pt9U76l31nnpffaA+VB+pj9Un6lP1mfpcfaG+VF+pr9U3aqL6Vn2nvlc/qB/VT+pn9Yv6Vf2mfld/qD/VX+pvNUn9o/5V/6lxWjItuZZCS6ml0lJrabS0WjotvZZBy6hl0jJrWbSsWjYtu5ZDy6nl0nJriIZqmIZrhEZqlEZrjMZqnMZrgiZqkiZriqZqmgY0XTM0U4Oapdmao7map/laoIVapMW0PFpeLZ+W
XyugFdQKaYW1IlpRrZhWXCuhldRKaaW1MlpZrZxWXqugVdQqaZW1KlpVrZpWXauh1dRqabW1OlpdrZ5WX2ugNdQaaY21JlpTrZnWXGuhtdRaaa21NlpbrZ3WXuugddQ6aZ21LlpXrZvWXeuh9dR6ab21PlpfrZ/WXxugDdQGaYO1IdpQbZg2XBuhjdRGaaO1MdpYbZw2XpugTdQmaZO1KdpUbZo2XZuhzdRmabO1OdpcbZ42X1ugLdQWaYu1JdpSbZm2XFuhrdRWaau1NdpabZ22XtugbdQ2aZu1LdpWbZu2Xduh7dR2abu1PdpebZ8Wr+3XDmgHtUPaYe2IdlQ7ph3XTmgntVPaae2MdlZL0M5p57UL2kXtknZZu6Jd1a5p17Ub2k3tlnZbu6Pd1e5p97UH2kPtkfZYe6I91Z5pz7UX2kvtlfZae6Mlam+1d9p77YP2Ufukfda+aF+1b9p37Yf2U/ul/daStD/aX+2fFgeSgeQgBUgJUoHUIA1IC9KB9CADyAgygcwgC8gKsoHsIAfICXKB3AABKMAADghAAgrQgAEs4AAPBCACCchAASrQAAA6MIAJILCADRzgAg/4IAAhiEAM5AF5QT6QHxQABUEhUBgUAUVBMVAclAAlQSlQGpQBZUE5UB5UABVBJVAZVAFVQTVQHdQANUEtUBvUAXVBPVAfNAANQSPQGDQBTUEz0By0AC1BK9AatAFtQTvQHnQAHUEn0Bl0AV1BN9Ad9AA9QS/QG/QBfUE/0B8MAAPBIDAYDAFDwTAwHIwAI8EoMBqMAWPBODAeTAATwSQwGUwBU8E0MB3MADPBLDAbzAFzwTwwHywAC8EisBgsAUvBMrAcrAArwSqwGqQFa8E6sB5sABvBJrAZbAFbwTawHewAO8EusBvsAXvBPhAP9oMD4CA4BA6DI+AoOAaOgxPgJDgFToMz4CxIAOfAeXABXASXwGVwBVwF18B1cAPcBLfAbXAH3AX3wH3wADwEj8Bj8AQ8Bc/Ac/ACvASvwGvwBiSCt+AdeA8+gI/gE/gMvoCv4Bv4Dn6An+AX+A2SwB/wF/wDcXoyPbmeQk+pp9JT62n0tHo6Pb2eQc+oZ9Iz61n0rHo2PbueQ8+p59Jz64iO6piO64RO6pRO64zO6pzO64Iu6pIu64qu6poOdF03dFOHuqXbuqO7uqf7eqCHeqTH9Dx6Xj2fnl8voBfUC+mF9SJ6Ub2YXlwvoZfUS+ml9TJ6Wb2cXl6voFfUK+mV9Sp6Vb2aXl2vodfUa+m19Tp6Xb2eXl9voDfUG+mN9SZ6U72Z3lxvobfUW+mt9TZ6W72d3l7voHfUO+md9S56V72b3l3voffUe+m99T56X72f3l8foA/UB+mD9SH6UH2YPlwfoY/UR+mj9TH6WH2cPl6foE/UJ+mT9Sn6VH2aPl2foc/UZ+mz9Tn6XH2ePl9foC/UF+mL9SX6Un2Zvlxfoa/UV+mr9TX6Wn2dvl7foG/UN+mb9S0p4/Rt+nZ9h75T36Xv1vfoe/V9ery+Xz+gH9QP6Yf1I/pR/Zh+XD+hn9RP6af1M/pZPUE/p5/XL+gX9Uv6Zf2KflW/pl/Xb+g39Vv6bf2Ofle/p9/XH+gP9Uf6Y/2J/lR/pj/XX+gv9Vf6a/2Nnqi/1d/p7/UP+kf9k/5Z/6J/1b/p3/Uf+k/9l/5bT9L/6H/1f3qckcxIbqQwUhqpjNRGGiOtkc5Ib2QwMhqZjMxGFiOrkc3IbuQwchq5jNwGYqAGZuAGYZAGZdAGY7AGZ/CGYIiGZMiGYqiGZgBDNwzDNKBhGf//7t/IaGw0NpoazYzmRt7kBZO3MloZbYw2RjvjP0/KjkYno7PRxehqdDW6Gz2MHkYvo7fRx+hr9DP6GwOMgcYgY7Ax2BhqDDWGG8ONkcZIY7Qx2hhrjDXGG+ONicZEY7Ix2ZhqTDWmG9ONmcZMY7Yx25hrzDXmG/ONhcZCY7Gx2FhqLDWWG8uNlcZKY7Wx2lhrrDXWG+uN
jcZGY7Ox2dhqbDW2G9uNncZOY7ex29hr7DXijXjjgHHAOGQcMo4YR4xjxjHjhHHCOGWcMs4YZ4wEI8E4b5w3LhoXjcvGZeOqcdW4blw3bho3jdvGbeOucde4b9w3HhoPjcfGY+Op8cx4brwwXhqvjNfGGyPReGu8M94bH4yPxifjs/HF+Gp8M74bP4yfxi/jt5Fk/DH+Gv+M/y//L8qmYqqmZgJTNw3TNKFpmbbpmK7pmb4ZmKEZmTEzj5nXzGfmNwuYBc1CZmGziFnULGYWN0uYJc1SZmmzjFnWLGeWNyuYFc1KZmWzilnVrGZWN2uYNc1aZm2zjlnXrGfWNxuYDc1GZmOzidnUbGY2N1uYLc1WZmuzjdnWbGe2NzuYHc1OZuffXcyuZjezu9nD7Gn2Mnubfcy+Zj+zvznAHGgOMgebQ8yh5jBzuDnCHGmOMkebY8yx5jhzvDnBnGhOMiebU8yp5jRzujnDnGnOMmebc8y55jxzvrnAXGguMhebS8yl5jJzubnCXGmuMleba8y15jpzvbnB3GhuMjebW8yt5jZzu7nD3GnuMnebe8y95j4z3txvHjAPmofMw+YR86h5zDxunjBPmqfM0+YZ86yZYJ4zz5sXzIvmJfOyecW8al4zr5s3zJvmLfO2ece8a94z75sPzIfmI/Ox+cR8aj4zn5svzDRxcXGvzTdmovnWfGe+Nz+YH81P5mfzi/nV/GZ+N3+YP81f5m8zyfxj/jX/mXEwGUwOU8CUMBVMDdPAtDAdTA8zwIwwE8wMs8CsMBvMDnPAnDAXzA0RiEIM4pCAJKQgDRnIQg7yUIAilKAMFahCDQKoQwOaEEIL2tCBLvSgDwMYwgjGYB6YF+aD+WEBWBAWgoVhEVgUFoPFYQlYEpaCpWEZWBaWg+VhBVgRVoKVYRVYFVaD1WENWBPWgrVhHVgX1oP1YQPYEDaCjWET2BQ2g81hC9gStoKtYRvYFraD7WEH2BF2gp1hF9gVdoPdYQ/YE/aCvWEf2Bf2g/3hADgQDoKD4RA4FA6Dw+EIOBKOgqPhGDgWjoPj4QQ4EU6Ck+EUOBVOg9PhDDgTzoKz4Rw4F86D8+ECuBAugovhErgULoPL4Qq4Eq6Cq+EauBaug+vhBrgRboKb4Ra4FW6D2+EOuBPugrvhHrgX7oPxcD88AA/CQ/AwPAKPwmPwODwBT8JT8DQ8A8/CBHgOnocX4EV4CV6GV+BVeA1ehzfgTXgL3oZ34F14D96HD+BD+Ag+hk/gU/gMPocv4Ev4Cr6Gb2AifAvfwffwA/wIP8HP8Av8Cr/B7/AH/Al/wd8wCf6Bf+E/GGcls5JbKayUViortZXGSmuls9JbGayMViYrs5XFympls7JbOaycVi4rt4VYqIVZuEVYpEVZtMVYrMVZvCVYoiVZsqVYqqVZwNItwzItaFmWbTmWa3mWbwVWaEVWzMpj5bXyWfmtAlZBq5BV2CpiFbWKWcWtElZJq5RV2ipjlbXKWeWtClZFq5JV2apiVbWqWdWtGlZNq5ZV26pj1bXqWfWtBlZDq5HV2GpiNbWaWc2tFlZLq5XV2mpjtbXaWe2tDlZHq5PV2epidbW6Wd2tHlZPq5fV2+pj9bX6Wf2tAdZAa5A12BpiDbWGWcOtEdZIa5Q12hpjjbXGWeOtCdZEa5I12ZpiTbWmWdOtGdZMa5Y125pjzbXmWfOtBdZCa5G12FpiLbWWWcutFdZKa5W12lpjrbXWWeutDdZGa5O12dpibbW2WdutHdZOa5e129pj7bX2WfHWfuuAddA6ZB22jlhHrWPWceuEddI6ZZ22zlhnrQTrnHXeumBdtC5Zl60r1lXrmnXdumHdtG5Zt6071l3rnnXfemA9tB5Zj60n1lPrmfXcemG9tF5Zr603VqL11npnvbc+WB+tT9Zn64v11fpmfbd+WD+tX9ZvK8n6Y/21/llxdjI7uZ3CTmmnslPbaey0djo7vZ3BzmhnsjPbWeysdjY7u53Dzmnn
snPbiI3amI3bhE3alE3bjM3anM3bgi3aki3biq3amg1s3TZs04a2Zdu2Y7u2Z/t2YId2ZMfsPHZeO5+d3y5gF7QL2YXtInZRu5hd3C5hl7RL2aXtMnZZu5xd3q5gV7Qr2ZXtKnZVu5pd3a5h17Rr2bXtOnZdu55d325gN7Qb2Y3tJnZTu5nd3G5ht7Rb2a3tNnZbu53d3u5gd7Q72Z3tLnZXu5vd3e5h97R72b3tPnZfu5/d3x5gD7QH2YPtIfZQe5g93B5hj7RH2aPtMfZYe5w93p5gT7Qn2ZPtKfZUe5o93Z5hz7Rn2bPtOfZce549315gL7QX2YvtJfZSe5m93F5hr7RX2avtNfZae5293t5gb7Q32ZvtLfZWe5u93d5h77R32bvtPfZee58db++3D9gH7UP2YfuIfdQ+Zh+3T9gn7VP2afuMfdZOsM/Z5+0L9kX7kn3ZvmJfta/Z1+0b9k37ln3bvmPfte/Z9+0H9kP7kf3YfmI/tZ/Zz+0X9kv7lf3afmMn2m/td/Z7+4P90f5kf7a/2F/tb/Z3+4f90/5l/7aT7D/2X/ufHeckc5I7KZyUTiontZPGSeukc9I7GZyMTiYns5PFyepkc7I7OZycTi4nt4M4qIM5uEM4pEM5tMM4rMM5vCM4oiM5sqM4qqM5wNEdwzEd6FiO7TiO63iO7wRO6EROzMnj5HXyOfmdAk5Bp5BT2CniFHWKOcWdEk5Jp5RT2injlHXKOeWdCk5Fp5JT2aniVHWqOdWdGk5Np5ZT26nj1HXqOfWdBk5Dp5HT2GniNHWaOc2dFk5Lp5XT2mnjtHXaOe2dDk5Hp5PT2enidHW6Od2dHk5Pp5fT2+nj9HX6Of2dAc5AZ5Az2BniDHWGOcOdEc5IZ5Qz2hnjjHXGOeOdCc5EZ5Iz2ZniTHWmOdOdGc5MZ5Yz25njzHXmOfOdBc5CZ5Gz2FniLHWWOcudFc5KZ5Wz2lnjrHXWOeudDc5GZ5Oz2dnibHW2OdudHc5OZ5ez29nj7HX2OfHOfueAc9A55Bx2jjhHnWPOceeEc9I55Zx2zjhnnQTnnHPeueBcdC45l50rzlXnmnPdueHcdG45t507zl3nnnPfeeA8dB45j50nzlPnmfPceeG8dF45r503TqLz1nnnvHc+OB+dT85n54vz1fnmfHd+OD+dX85vJ8n54/x1/jlxbjI3uZvCTemmclO7ady0bjo3vZvBzehmcjO7WdysbjY3u5vDzenmcnO7iIu6mIu7hEu6lEu7jMu6nMu7giu6kiu7iqu6mgtc3TVc04Wu5dqu47qu5/pu4IZu5MbcPG5eN5+b3y3gFnQLuYXdIm5Rt5hb3C3hlnRLuaXdMm5Zt5xb3q3gVnQruZXdKm5Vt5pb3a3h1nRrubXdOm5dt55b323gNnQbuY3dJm5Tt5nb3G3htnRbua3dNm5bt53b3u3gdnQ7uZ3dLm5Xt5vb3e3h9nR7ub3dPm5ft5/b3x3gDnQHuYPdIe5Qd5g73B3hjnRHuaPdMe5Yd5w73p3gTnQnuZPdKe5Ud5o73Z3hznRnubPdOe5cd547313gLnQXuYvdJe5Sd5m73F3hrnRXuavdNe5ad5273t3gbnQ3uZvdLe5Wd5u73d3h7nR3ubvdPe5ed58b7+53D7gH3UPuYfeIe9Q95h53T7gn3VPuafeMe9ZNcM+5590L7kX3knvZveJeda+5190b7k33lnvbvePede+5990H7kP3kfvYfeI+dZ+5z90X7kv3lfvafeMmum/dd+5794P70f3kfna/uF/db+5394f70/3l/naT3D/uX/efG+cl85J7KbyUXiovtZfGS+ul89J7GbyMXiYvs5fFy+pl87J7ObycXi4vt4d4qId5uEd4pEd5tMd4rMd5vCd4oid5sqd4qqd5wNM9wzM96Fme7Tme63me7wVe6EVezMvj5fXyefm9Al5Br5BX2CviFfWKecW9El5Jr5RX2ivjlfXKeeW9Cl5Fr5JX2avi
VfWqedW9Gl5Nr5ZX26vj1fXqefW9Bl5Dr5HX2GviNfWaec29Fl5Lr5XX2mvjtfXaee29Dl5Hr5PX2evidfW6ed29Hl5Pr5fX2+vj9fX6ef29Ad5Ab5A32BviDfWGecO9Ed5Ib5Q32hvjjfXGeeO9Cd5Eb5I32ZviTfWmedO9Gd5Mb5Y325vjzfXmefO9Bd5Cb5G32FviLfWWecu9Fd5Kb5W32lvjrfXWeeu9Dd5Gb5O32dvibfW2edu9Hd5Ob5e329vj7fX2efHefu+Ad9A75B32jnhHvWPece+Ed9I75Z32znhnvQTvnHfeu+Bd9C55l70r3lXvmnfdu+Hd9G55t7073l3vnnffe+A99B55j70n3lPvmffce+G99F55r703XqL31nvnvfc+eB+9T95n74v31fvmffd+eD+9X95vL8n74/31/nlxfjI/uZ/CT+mn8lP7afy0fjo/vZ/Bz+hn8jP7WfysfjY/u5/Dz+nn8nP7iI/6mI/7hE/6lE/7jM/6nM/7gi/6ki/7iq/6mg983Td804e+5du+47u+5/t+4Id+5Mf8PH5eP5+f3y/gF/QL+YX9In5Rv5hf3C/hl/RL+aX9Mn5Zv5xf3q/gV/Qr+ZX9Kn5Vv5pf3a/h1/Rr+bX9On5dv55f32/gN/Qb+Y39Jn5Tv5nf3G/ht/Rb+a39Nn5bv53f3u/gd/Q7+Z39Ln5Xv5vf3e/h9/R7+b39Pn5fv5/f3x/gD/QH+YP9If5Qf5g/3B/hj/RH+aP9Mf5Yf5w/3p/gT/Qn+ZP9Kf5Uf5o/3Z/hz/Rn+bP9Of5cf54/31/gL/QX+Yv9Jf5Sf5m/3F/hr/RX+av9Nf5af52/3t/gb/Q3+Zv9Lf5Wf5u/3d/h7/R3+bv9Pf5ef58f7+/3D/gH/UP+Yf+If9Q/5h/3T/gn/VP+af+Mf9ZP8M/55/0L/kX/kn/Zv+Jf9a/51/0b/k3/ln/bv+Pf9e/59/0H/kP/kf/Yf+I/9Z/5z/0X/kv/lf/af+Mn+m/9d/57/4P/0f/kf/a/+F/9b/53/4f/0//l//aT/D/+X/+fHxckC5IHKYKUQaogdZAmSBukC9IHGYKMQaYgc5AlyBpkC7IHOYKcQa4gd4AEaIAFeEAEZEAFdMAEbMAFfCAEYiAFcqAEaqAFINADIzADGFiBHTiBG3iBHwRBGERBLMgT5A3yBfmDAkHBoFBQOCgSFA2KBcWDEkHJoFRQOigTlA3KBeWDCkHFoFJQOagSVA2qBdWDGkHNoFZQO6gT1A3qBfWDBkHDoFHQOGgSNA2aBc2DFkHLoFXQOmgTtA3aBe2DDkHHoFPQOegSdA26Bd2DHkHPoFfQO+gT9A36Bf2DAcHAYFAwOBgSDA2GBcODEcHIYFQwOhgTjA3GBeODCcHEYFIwOZgSTA2mBdODGcHMYFYwO5gTzA3mBfODBcHCYFGwOFgSLA2WBcuDFcHKYFWwOlgTrA3WBeuDDcHGYFOwOdgSbA22BduDHcHOYFewO9gT7A32BfHB/uBAcDA4FBwOjgRHg2PB8eBEcDI4FZwOzgRng4TgXHA+uBBcDC4Fl4MrwdXgWnA9uBHcDG4Ft4M7wd3gXnA/eBA8DB4Fj4MnwdPgWfA8eBG8DF4Fr4M3QWLwNngXvA8+BB+DT8Hn4EvwNfgWfA9+BD+DX8HvICn4E/wN/gVxYbIweZgiTBmmClOHacK0YbowfZghzBhmCjOHWcKsYbYwe5gjzBnmCnOHSIiGWIiHREiGVEiHTMiGXMiHQiiGUiiHSqiGWghCPTRCM4ShFdqhE7qhF/phEIZhFMbCPGHeMF+YPywQFgwLhYXDImHRsFhYPCwRlgxLhaXDMmHZsFxYPqwQVgwrhZXDKmHVsFpYPawR1gxrhbXDOmHdsF5YP2wQNgwbhY3DJmHTsFnYPGwRtgxbha3DNmHbsF3YPuwQdgw7hZ3DLmHXsFvYPewR9gx7hb3DPmHfsF/YPxwQDgwHhYPDIeHQcFg4PBwRjgxH
haPDMeHYcFw4PpwQTgwnhZPDKeHUcFo4PZwRzgxnhbPDOeHccF44P1wQLgwXhYvDJeHScFm4PFwRrgxXhavDNeHacF24PtwQbgw3hZvDLeHWcFu4PdwR7gx3hbvDPeHecF8YH+4PD4QHw0Ph4fBIeDQ8Fh4PT4Qnw1Ph6fBMeDZMCM+F58ML4cXwUng5vBJeDa+F18Mb4c3wVng7vBPeDe+F98MH4cPwUfg4fBI+DZ+Fz8MX4cvwVfg6fBMmhm/Dd+H78EP4MfwUfg6/hF/Db+H38Ef4M/wV/g6Twj/h3/BfGBcli5JHKaKUUaoodZQmShuli9JHGaKMUaYoc5Qlyhpli7JHOaKcUa4od4REaIRFeEREZERFdMREbMRFfCREYiRFcqREaqRFINIjIzIjGFmRHTmRG3mRHwVRGEVRLMoT5Y3yRfmjAlHBqFBUOCoSFY2KRcWjElHJqFRUOioTlY3KReWjClHFqFJUOaoSVY2qRdWjGlHNqFZUO6oT1Y3qRfWjBlHDqFHUOGoSNY2aRc2jFlHLqFXUOmoTtY3aRe2jDlHHqFPUOeoSdY26Rd2jHlHPqFfUO+oT9Y36Rf2jAdHAaFA0OBoSDY2GRcOjEdHIaFQ0OhoTjY3GReOjCdHEaFI0OZoSTY2mRdOjGdHMaFY0O5oTzY3mRfOjBdHCaFG0OFoSLY2WRcujFdHKaFW0OloTrY3WReujDdHGaFO0OdoSbY22RdujHdHOaFe0O9oT7Y32RfHR/uhAdDA6FB2OjkRHo2PR8ehEdDI6FZ2OzkRno4ToXHQ+uhBdjC5Fl6Mr0dXoWnQ9uhHdjG5Ft6M70d3oXnQ/ehA9jB5Fj6Mn0dPoWfQ8ehG9jF5Fr6M3UWL0NnoXvY8+RB+jT9Hn6Ev0NfoWfY9+RD+jX9HvKCn6E/2N/kVxsWSx5LEUsZSxVLHUsTSxtLF0sfSxDLGMsUyxzLEssayxbLHssRyxnLFcsdwxJIbGsBgeI2JkjIrRMSbGxrgYHxNiYkyKyTElpsa0GIjpMSNmxmDMitmx/4OGe2zPo2sUAJratm17sMc29p6Zu7ZS27atp7Zt20htN21qm+fLef/HulY1rDpWA6uJ1cJqY3UwDMMxAiMxgFEYjTEYi3EYjwmYiEmYjCmYimmYjhmYiVmYjTmYi3mYj0EMYQEWYhEWw+pi9bD6WAOsIdYIa4w1wZpizbDmWAusJdYKa421wdpi8Vg7rD3WAeuIdcI6Y12wrlg3rDvWA+uJ9cJ6Y32wvlg/rD82ABuIDcIGY0OwodgwbDg2AhuJjcJGY2Owsdg4bDw2AZuITcImY1Owqdg0bDo2A5uJzcJmY3NSxMXFYfOw+dgCbCG2CFuMLcGWYsuw5dgKbCW2CluNrcHWYuuw9dgGbCO2CduMbcG2Ytuw7dgObCe2C9uN7cH2Yvuw/dgB7CB2CDuMHcGOYsew49gJ7CR2CjuNncHOYuew89gFLAG7iF3CLmNXsKvYNew6dgO7id3CbmN3sLvYPew+9gB7iD3CHmNPsETsKfYMS8KeYy+wl9gr7DX2BnuLvcPeYx+wj9gn7DP2BfuKfcO+Yz+wn9gv7Df2B/uL/cPi8GR4cjwFnhJPhafG0+Bp8XR4ejwDnhHPhGfGs+BZ8Wx4djwHnhPPhefG8+B58Xx4frwAXhAvhBfGi+BF8WJ4cbwEXhIvhZfGy+Bl8XJ4ebwCXhGvhFfGq+BV8Wp4dbwGXhOvhdfG6+AYjuMETuIAp3AaZ3AW53AeF3ARl3AZV3AV13AdN3ATt3Abd3AX93AfhzjCAzzEIzyG18Xr4fXxBnhDvBHeGG+CN8Wb4c3xFnhLvBXeGm+Dt8Xj8XZ4e7wD3hHvhHfGu+Bd8W54d7wH3hPvhffG++B98X54f3wAPhAfhA/Gh+BD8WH4cHwEPhIfhY/Gx+Bj8XH4eHwCPhGfhE/Gp+BT8Wn4dHwGPhOfhc/G5+D/4XPxefh8fAG+EF+EL8aX4EvxZfhyfAW+
El+Fr8bX4Gvxdfh6fAO+Ed+Eb8a34Fvxbfh2fAe+E9+F78b34Hvxffh+/AB+ED+EH8aP4EfxY/hx/AR+Ej+Fn8bP4Gfxc/h5/AKegF/EL+GX8Sv4Vfwafh2/gd/Eb+G38Tv4Xfwefh9/gD/EH+GP8Sd4Iv4Uf4Yn4c/xF/hL/BX+Gn+Dv8Xf4e/xD/hH/BP+Gf+Cf8W/4d/xH/hP/Bf+G/+D/8X/4XFEMiI5kYJISaQiUhNpiLREOiI9kYHISGQiMhNZiKxENiI7kYPISeQichN5iLxEPiI/UYAoSBQiChNFiKJEMaI4UYIoSZQiShNliLJEOaI8UYGoSFQiKhNViKpENaI6UYOoSdQiahN1CIzACYIgCUBQBE0wBEtwBE8IhEhIhEwohEpohE4YhElYhE04hEt4hE9AAhEBERIRESPqEvWI+kQDoiHRiGhMNCGaEs2I5kQLoiXRimhNtCHaEvFEO6I90YHoSHQiOhNdiK5EN6I70YPoSfQiehN9iL5EP6I/MYAYSAwiBhNDiKHEMGI4MYIYSYwiRhNjiLHEOGI8MYGYSEwiJhNTiKnENGI6MYOYScwiZhNziP+IucQ8Yj6xgFhILCIWE0uIpcQyYjmxglhJrCJWE2uItcQ6Yj2xgdhIbCI2E1uIrcQ2Yjuxg9hJ7CJ2E3uIvcQ+Yj9xgDhIHCIOE0eIo8Qx4jhxgjhJnCJOE2eIs8Q54jxxgUggLhKXiMvEFeIqcY24TtwgbhK3iNvEHeIucY+4TzwgHhKPiMfEEyKReEo8I5KI58QL4iXxinhNvCHeEu+I98QH4iPxifhMfCG+Et+I78QP4ifxi/hN/CH+Ev+IODIZmZxMQaYkU5GpyTRkWjIdmZ7MQGYkM5GZySxkVjIbmZ3MQeYkc5G5yTxkXjIfmZ8sQBYkC5GFySJkUbIYWZwsQZYkS5GlyTJkWbIcWZ6sQFYkK5GVySpkVbIaWZ2sQdYka5G1yTokRuIkQZIkICmSJhmSJTmSJwVSJCVSJhVSJTVSJw3SJC3SJh3SJT3SJyGJyIAMyYiMkXXJemR9sgHZkGxENiabkE3JZmRzsgXZkmxFtibbkG3JeLId2Z7sQHYkO5GdyS5kV7Ib2Z3sQfYke5G9yT5kX7If2Z8cQA4kB5GDySHkUHIYOZwcQY4kR5GjyTHkWHIcOZ6cQE4kJ5GTySnkVHIaOZ2cQc4kZ5GzyTnkf+Rcch45n1xALiQXkYvJJeRSchm5nFxBriRXkavJNeRach25ntxAbiQ3kZvJLeRWchu5ndxB7iR3kbvJPeRech+5nzxAHiQPkYfJI+RR8hh5nDxBniRPkafJM+RZ8hx5nrxAJpAXyUvkZfIKeZW8Rl4nb5A3yVvkbfIOeZe8R94nH5APyUfkY/IJmUg+JZ+RSeRz8gX5knxFvibfkG/Jd+R78gP5kfxEfia/kF/Jb+R38gf5k/xF/ib/kH/Jf2QcSAaSgxQgJUgFUoM0IC1IB9KDDCAjyAQygywgK8gGsoMcICfIBXKDPCAvyAfygwKgICgECoMioCgoBoqDEqAkKAVKgzKgLCgHyoMKoCKoBCqDKqAqqAaqgxqgJqgFaoM6AAM4IAAJAKAADRjAAg7wQAAikIAMFKACDejAACawgA0c4AIP+AACBAIQggjEQF1QD9QHDUBD0Ag0Bk1AU9AMNActQEvQCrQGbUBbEA/agfagA+gIOoHOoAvoCrqB7qAH6Al6gd6gD+gL+oH+YAAYCAaBwWAIGAqGgeFgBBgJRoHRYAwYC8aB8WACmAgmgclgCpgKpoHpYAaYCWaB2WAO+A/MBfPAfLAALASLwGKwBCwFy8BysAKsBKvAarAGrAXrwHqwAWwEm8BmsAVsBdvAdrAD7AS7wG6wB+wF+8B+cAAcBIfAYXAEHAXHwHFwApwEp8BpcAacBefAeXABJICL4BK4DK6Aq+AauA5ugJvgFrgN7oC74B64Dx6Ah+AReAyegETwFDwDSeA5eAFeglfg
NXgD3oJ34D34AD6CT+Az+AK+gm/gO/gBfoJf4Df4A/6CfyCOSkYlp1JQKalUVGoqDZWWSkelpzJQGalMVGYqC5WVykZlp3JQOalcVG4qD5WXykflpwpQBalCVGGqCFWUKkYVp0pQJalSVGmqDFWWKkeVpypQFalKVGWqClWVqkZVp2pQNalaVG2qDoVROEVQJAUoiqIphmIpjuIpgRIpiZIphVIpjdIpgzIpi7Iph3Ipj/IpSCEqoEIqomJUXaoeVZ9qQDWkGlGNqSZUU6oZ1ZxqQbWkWlGtqTZUWyqeake1pzpQHalOVGeqC9WV6kZ1p3pQPaleVG+qD9WX6kf1pwZQA6lB1GBqCDWUGkYNp0ZQI6lR1GhqDDWWGkeNpyZQE6lJ1GRqCjWVmkZNp2ZQM6lZ1GxqDvUfNZeaR82nFlALqUXUYmoJtZRaRi2nVlArqVXUamoNtZZaR62nNlAbqU3UZmoLtZXaRm2ndlA7qV3UbmoPtZfaR+2nDlAHqUPUYeoIdZQ6Rh2nTlAnqVPUaeoMdZY6R52nLlAJ1EXqEnWZukJdpa5R16kb1E3qFnWbukPdpe5R96kH1EPqEfWYekIlUk+pZ1QS9Zx6Qb2kXlGvqTfUW+od9Z76QH2kPlGfqS/UV+ob9Z36Qf2kflG/qT/UX+ofFUcno5PTKeiUdCo6NZ2GTkuno9PTGeiMdCY6M52Fzkpno7PTOeicdC46N52Hzkvno/PTBeiCdCG6MF2ELkoXo4vTJeiSdCm6NF2GLkuXo8vTFeiKdCW6Ml2FrkpXo6vTNeiadC26Nl2HxmicJmiSBjRF0zRDszRH87RAi7REy7RCq7RG67RBm7RF27RDu7RH+zSkER3QIR3RMbouXY+uTzegG9KN6MZ0E7op3YxuTregW9Kt6NZ0G7otHU+3o9vTHeiOdCe6M92F7kp3o7vTPeiedC+6N92H7kv3o/vTA+iB9CB6MD2EHkoPo4fTI+iR9Ch6ND2GHkuPo8fTE+iJ9CR6Mj2FnkpPo6fTM+iZ9Cx6Nj2H/o+eS8+j59ML6IX0InoxvYReSi+jl9Mr6JX0Kno1vYZeS6+j19Mb6I30JnozvYXeSm+jt9M76J30Lno3vYfeS++j99MH6IP0IfowfYQ+Sh+jj9Mn6JP0Kfo0fYY+S5+jz9MX6AT6In2Jvkxfoa/S1+jr9A36Jn2Lvk3foe/S9+j79AP6If2Ifkw/oRPpp/QzOol+Tr+gX9Kv6Nf0G/ot/Y5+T3+gP9Kf6M/0F/or/Y3+Tv+gf9K/6N/0H/ov/Y+OY5IxyZkUTEomFZOaScOkZdIx6ZkMTEYmE5OZycJkZbIx2ZkcTE4mF5ObycPkZfIx+ZkCTEGmEFOYKcIUZYoxxZkSTEmmFFOaKcOUZcox5ZkKTEWmElOZqcJUZaox1ZkaTE2mFlObqcNgDM4QDMkAhmJohmFYhmN4RmBERmJkRmFURmN0xmBMxmJsxmFcxmN8BjKICZiQiZgYU5epx9RnGjANmUZMY6YJ05RpxjRnWjAtmVZMa6YN05aJZ9ox7ZkOTEemE9OZ6cJ0Zbox3ZkeTE+mF9Ob6cP0Zfox/ZkBzEBmEDOYGcIMZYYxw5kRzEhmFDOaGcOMZcYx45kJzERmEjOZmcJMZaYx05kZzExmFjObmcP8x8xl5jHzmQXMQmYRs5hZwixlljHLmRXMSmYVs5pZw6xl1jHrmQ3MRmYTs5nZwmxltjHbmR3MTmYXs5vZw+xl9jH7mQPMQeYQc5g5whxljjHHmRPMSeYUc5o5w5xlzjHnmQtMAnORucRcZq4wV5lrzHXmBnOTucXcZu4wd5l7zH3mAfOQecQ8Zp4wicxT5hmTxDxnXjAvmVfMa+YN85Z5x7xnPjAfmU/MZ+YL85X5xnxnfjA/mV/Mb+YP85f5x8SxydjkbAo2JZuKTc2mYdOy6dj0bAY2I5uJzcxmYbOy2djsbA42J5uLzc3mYfOy+dj8bAG2IFuILcwWYYuyxdji
bAm2JFuKLc2WYcuy5djybAW2IluJrcxWYauy1djqbA22JluLrc3WYTEWZwmWZAFLsTTLsCzLsTwrsCIrsTKrsCqrsTprsCZrsTbrsC7rsT4LWcQGbMhGbIyty9Zj67MN2IZsI7Yx24RtyjZjm7Mt2JZsK7Y124Zty8az7dj2bAe2I9uJ7cx2Ybuy3djubA+2J9uL7c32Yfuy/dj+7AB2IDuIHcwOYYeyw9jh7Ah2JDuKHc2OYcey49jx7AR2IjuJncxOYaey09jp7Ax2JjuLnc3OYf9j57Lz2PnsAnYhu4hdzC5hl7LL2OXsCnYlu4pdza5h17Lr2PXsBnYju4ndzG5ht7Lb2O3sDnYnu4vdze5h97L72P3sAfYge4g9zB5hj7LH2OPsCfYke4o9zZ5hz7Ln2PPsBTaBvcheYi+zV9ir7DX2OnuDvcneYm+zd9i77D32PvuAfcg+Yh+zT9hE9in7jE1in7Mv2JfsK/Y1+4Z9y75j37Mf2I/sJ/Yz+4X9yn5jv7M/2J/sL/Y3+4f9y/5j47hkXHIuBZeSS8Wl5tJwabl0XHouA5eRy8Rl5rJwWblsXHYuB5eTy8Xl5vJwebl8XH6uAFeQK8QV5opwRbliXHGuBFeSK8WV5spwZblyXHmuAleRq8RV5qpwVblqXHWuBleTq8XV5upwGIdzBEdygKM4mmM4luM4nhM4kZM4mVM4ldM4nTM4k7M4m3M4l/M4n4Mc4gIu5CIuxtXl6nH1uQZcQ64R15hrwjXlmnHNuRZcS64V15prw7Xl4rl2XHuuA9eR68R15rpwXbluXHeuB9eT68X15vpwfbl+XH9uADeQG8QN5oZwQ7lh3HBuBDeSG8WN5sZwY7lx3HhuAjeRm8RN5qZwU7lp3HRuBjeTm8XN5uZw/3FzuXncfG4Bt5BbxC3mlnBLuWXccm4Ft5Jbxa3m1nBruXXcem4Dt5HbxG3mtnBbuW3cdm4Ht5Pbxe3m9nB7uX3cfu4Ad5A7xB3mjnBHuWPcce4Ed5I7xZ3mznBnuXPcee4Cl8Bd5C5xl7kr3FXuGnedu8Hd5G5xt7k73F3uHnefe8A95B5xj7knXCL3lHvGJXHPuRfcS+4V95p7w73l3nHvuQ/cR+4T95n7wn3lvnHfuR/cT+4X95v7w/3l/nFxfDI+OZ+CT8mn4lPzafi0fDo+PZ+Bz8hn4jPzWfisfDY+O5+Dz8nn4nPzefi8fD4+P1+AL8gX4gvzRfiifDG+OF+CL8mX4kvzZfiyfDm+PF+Br8hX4ivzVfiqfDW+Ol+Dr8nX4mvzdXiMx3mCJ3nAUzzNMzzLczzPC7zIS7zMK7zKa7zOG7zJW7zNO7zLe7zPQx7xAR/yER/j6/L1+Pp8A74h34hvzDfhm/LN+OZ8C74l34pvzbfh2/LxfDu+Pd+B78h34jvzXfiufDe+O9+D78n34nvzffi+fD++Pz+AH8gP4gfzQ/ih/DB+OD+CH8mP4kfzY/ix/Dh+PD+Bn8hP4ifzU/ip/DR+Oj+Dn8nP4mfzc/j/+Ln8PH4+v4BfyC/iF/NL+KX8Mn45v4Jfya/iV/Nr+LX8On49v4HfyG/iN/Nb+K38Nn47v4Pfye/id/N7+L38Pn4/f4A/yB/iD/NH+KP8Mf44f4I/yZ/iT/Nn+LP8Of48f4FP4C/yl/jL/BX+Kn+Nv87f4G/yt/jb/B3+Ln+Pv88/4B/yj/jH/BM+kX/KP+OT+Of8C/4l/4p/zb/h3/Lv+Pf8B/4j/4n/zH/hv/Lf+O/8D/4n/4v/zf/h//L/+DghmZBcSCGkFFIJqYU0QlohnZBeyCBkFDIJmYUsQlYhm5BdyCHkFHIJuYU8Ql4hn5BfKCAUFAoJhYUiQlGhmFBcKCGUFEoJpYUyQlmhnFBeqCBUFCoJlYUqQlWhmlBdqCHUFGoJtYU6AibgAiGQAhAogRYYgRU4gRcEQRQkQRYUQRU0QRcMwRQswRYcwRU8wReggIRACIVIiAl1hXpCfaGB0FBo
JDQWmghNhWZCc6GF0FJoJbQW2ghthXihndBe6CB0FDoJnYUuQlehm9Bd6CH0FHoJvYU+Ql+hn9BfGCAMFAYJg4UhwlBhmDBcGCGMFEYJo4UxwlhhnDBemCBMFCYJk4UpwlRhmjBdmCHMFGYJs4U5wn/CXGGeMF9YICwUFgmLhSXCUmGZsFxYIawUVgmrhTXCWmGdsF7YIGwUNgmbhS3CVmGbsF3YIewUdgm7hT3CXmGfsF84IBwUDgmHhSPCUeGYcFw4IZwUTgmnhTPCWeGccF64ICQIF4VLwmXhinBVuCZcF24IN4Vbwm3hjnBXuCfcFx4ID4VHwmPhiZAoPBWeCUnCc+GF8FJ4JbwW3ghvhXfCe+GD8FH4JHwWvghfhW/Cd+GH8FP4JfwW/gh/hX9CnJhMTC6mEFOKqcTUYhoxrZhOTC9mEDOKmcTMYhYxq5hNzC7mEHOKucTcYh4xr5hPzC8WEAuKhcTCYhGxqFhMLC6WEEuKpcTSYhmxrFhOLC9WECuKlcTKYhWxqlhNrC7WEGuKtcTaYh0RE3GREEkRiJRIi4zIipzIi4IoipIoi4qoipqoi4ZoipZoi47oip7oi1BEYiDGxcXFxcS6Yj2xvthAbCg2EhuLTcSmYjOxudhCbCm2EluLbcS2YrzYTmwvdhA7ip3EzmIXsavYTewu9hB7ir3E3mIfsa/YT+wvDhAHioPEweIQcag4TBwujhBHiqPE0eIYcaw4ThwvThAnipPEyeIUcao4TZwuzhBnirPE2eL/eor54gJxobhIXCwuEZeKy8Tl4gpxpbhKXC2uEdeK68T14gZxo7hJ3CxuEbeK28Tt4g5xp7hL3C3uEfeK+8T94gHxoHhIPCweEY+Kx8Tj4gnxpHhKPC2eEc+K58Tz4gUxQbwoXhIvi1fEq+I18bp4Q7wp3hJvi3fEu+I98b74QHwoPhIfi0/ERPGp+ExMEp+LL8SX4ivxtfhGfCu+E9+LH8SP4ifxs/hF/Cp+E7+LP8Sf4i/xt/hH/Cv+E+OkZFJyKYWUUkolpZbSSGmldFJ6KYOUUcokZZaySFmlbFJ2KYeUU8ol5ZbySHmlfFJ+qYBUUCokFZaKSEWlYlJxqYRUUiollZbKSGWlclJ5qYJUUaokVZaqSFWlalJ1qYZUU6ol1ZbqSJiES4RESkCiJFpiJFbiJF4SJFGSJFlSJFXSJF0yJFOyJFtyJFfyJF+CEpICKZQiKSbVlepJ9aUGUkOpkdRYaiI1lZpJzaUWUkupldRaaiO1leKldlJ7qYPUUeokdZa6SF2lblJ3qYfUU+ol9Zb6SH2lflJ/aYA0UBokDZaGSEOlYdJwaYQ0UholjZbGSGOlcdJ4aYI0UZokTZamSFOladJ0aYY0U5olzZbmSP9Jc6V50nxpgbRQWiQtlpZIS6Vl0nJphbRSWiWtltZIa6V10nppg7RR2iRtlrZIW6Vt0nZph7RT2iXtlvZIe6V90n7pgHRQOiQdlo5IR6Vj0nHphHRSOiWdls5IZ6Vz0nnpgpQgXZQuSZelK9JV6Zp0Xboh3ZRuSbelO9Jd6Z50X3ogPZQeSY+lJ1Ki9FR6JiVJz6UX0kvplfRaeiO9ld5J76UP0kfpk/RZ+iJ9lb5J36Uf0k/pl/Rb+iP9lf5JcXIyObmcQk4pp5JTy2nktHI6Ob2cQc4oZ5Izy1nkrHI2ObucQ84p55Jzy3nkvHI+Ob9cQC4oF5ILy0XkonIxubhcIiExLk4uLZeRy8rl5PJyBbmiXEmuLFeRq8rV5OpyDbmmXEuuLdeRMRmXCZmUgUzJtMzIrMzJvCzIoizJsqzIqqzJumzIpmzJtuzIruzJvgxlJAdyKEdyTK4r15Pryw3khnIjubHcRG4qN5Obyy3klnIrubXcRm4rx8vt5PZyB7mj3EnuLHeRu8rd5O5yD7mn3EvuLfeR+8r95P7yAHmgPEgeLA+Rh8rD5OHyCHmkPEoeLY+Rx8rj5PHyBHmiPEmeLE+Rp8rT5Ony
DHmmPEueLc+R/5PnyvPk+fICeaG8SF4sL5GXysvk5fIKeaW8Sl4tr5HXyuvk9fIGeaO8Sd4sb5G3ytvk7fIOeae8S94t75H3yvvk/fIB+aB8SD4sH5GPysfk4/IJ+aR8Sj4tn5HPyufk8/IFOUG+KF+SL8tX5KvyNfm6fEO+Kd+Sb8t35LvyPfm+/EB+KD+SH8tP5ET5qfxMTpKfyy/kl/Ir+bX8Rn4rv5Pfyx/kj/In+bP8Rf4qf5O/yz/kn/Iv+bf8R/4r/5PjlGRKciWFklJJpaRW0ihplXRKeiWDklHJpGRWsihZlWxKdiWHklPJpeRW8ih5lXxKfqWAUlAppBRWiihFlWJKcaWEUlIppZRWyihllXJKeaWCUlGppFRWqihVlWpKdaWGUlOppdRW6iiYgiuEQipAoRRaYRRW4RReERRRkRRZURRV0RRdMRRTsRRbcRRX8RRfgQpSAiVUIiWm1FXqKfWVBkpDpZHSWGmiNFWaKc2VFkpLpZXSWmmjtFXilXZKe6WD0lHppHRWuihdlW5Kd6WH0lPppfRW+ih9lX5Kf2WAMlAZpAxWhihDlWHKcGWEMlIZpYxWxihjlXHKeGWCMlGZpExWpihTlWnKdGWGMlOZpcxW5ij/KXOVecp8ZYGyUFmkLFaWKEuVZcpyZYWyUlmlrFbWKGuVdcp6ZYOyUdmkbFa2KFuVbcp2ZYeyU9ml7Fb2KHuVfcp+5YByUDmkHFaOKEeVY8px5YRyUjmlnFbOKGeVc8p55YKSoFxULimXlSvKVeWacl25odxUbim3lTvKXeWecl95oDxUHimPlSdKovJUeaYkKc+VF8pL5ZXyWnmjvFXeKe+VD8pH5ZPyWfmifFW+Kd+VH8pP5ZfyW/mj/FX+KXFqMjW5mkJNqaZSU6tp1LRqOjW9mkHNqGZSM6tZ1KxqNjW7mkPNqeZSc6t51LxqPjW/WkAtqBZSC6tF1KJqMbW4WkItqZZSS6tl1LJqObW8WkGtqFZSK6tV1KpqNbW6WkOtqdZSa6t1VEzFVUIlVaBSKq0yKqtyKq8KqqhKqqwqqqpqqq4aqqlaqq06qqt6qq9CFamBGqqRGlPrqvXU+moDtaHaSG2sNlGbqs3U5moLtaXaSm2ttlHbqvFqO7W92kHtqHZSO6td1K5qN7W72kPtqfZSe6t91L5qP7W/OkAdqA5SB6tD1KHqMHW4OkIdqY5SR6tj1LHqOHW8OkGdqE5SJ6tT1KnqNHW6OkOdqc5SZ6tz1P/Uueo8db66QF2oLlIXq0vUpeoydbm6Ql2prlJXq2vUteo6db26Qd2oblI3q1vUreo2dbu6Q92p7lJ3q3vUveo+db96QD2oHlIPq0fUo+ox9bh6Qj2pnlJPq2fUs+o59bx6QU1QL6qX1MvqFfWqek29rt5Qb6q31NvqHfWuek+9rz5QH6qP1MfqEzVRfao+U5PU5+oL9aX6Sn2tvlHfqu/U9+oH9aP6Sf2sflG/qt/U7+oP9af6S/2t/lH/qv/UOC2ZllxLoaXUUmmptTRaWi2dll7LoGXUMmmZtSxaVi2bll3LoeXUcmm5tTxaXi2fll8roBXUCmmFtSJaUa2YVlwroZXUSmmltTJaWa2cVl6roFXUKmmVtSpaVa2aVl2rodXUamm1tToapuEaoZEa0CiN1hiN1TiN1wRN1CRN1hRN1TRN1wzN1CzN1hzN1TzN16CGtEALtUiLaXW1elp9rYHWUGukNdaaaE21ZlpzrYXWUmultdbaaG21eK2d1l7roHXUOmmdtS5aV62b1l3rofXUemm9tT5aX62f1l8boA3UBmmDtSHaUG2YNlwboY3URmmjtTHaWG2cNl6boE3UJmmTtSnaVG2aNl2boc3UZmmztTnaf9pcbZ42X1ugLdQWaYu1JdpSbZm2XFuhrdRWaau1NdpabZ22XtugbdQ2aZu1LdpWbZu2Xduh7dR2abu1PdpebZ+2XzugHdQOaYe1I9pR7Zh2XDuhndROaae1
M9pZ7Zx2XrugJWgXtUvaZe2KdlW7pl3Xbmg3tVvabe2Odle7p93XHmgPtUfaY+2Jlqg91Z5pSdpz7YX2UnulvdbeaG+1d9p77YP2Ufukfda+aF+1b9p37Yf2U/ul/db+aH+1f1qcnkxPrqfQU+qp9NR6Gj2tnk5Pr2fQM+qZ9Mx6Fj2rnk3PrufQc+q59Nx6Hj2vnk/PrxfQC+qF9MJ6Eb2oXkwvrpfQS+ql9NJ6Gb2sXk4vr1fQK+qV9Mp6Fb2qXk2vrtfQa+q19Np6HR3TcZ3QSR3olE7rjM7qnM7rgi7qki7riq7qmq7rhm7qlm7rju7qnu7rUEd6oId6pMf0uno9vb7eQG+oN9Ib6030pnozvbneQm+pt9Jb6230tnq83k5vr3fQO+qd9M56F72r3k3vrvfQe+q99N56H72v3k/vrw/QB+qD9MH6EH2oPkwfro/QR+qj9NH6GH2sPk4fr0/QJ+qT9Mn6FH2qPk2frs/QZ+qz9Nn6HP0/fa4+T5+vL9AX6ov0xfoSfam+TF+ur9BX6qv01foafa2+Tl+vb9A36pv0zfoWfau+Td+u79B36rv03foefa++T9+vH9AP6of0w/oR/ah+TD+un9BP6qf00/oZ/ax+Tj+vX9AT9Iv6Jf2yfkW/ql/Tr+s39Jv6Lf22fke/q9/T7+sP9If6I/2x/kRP1J/qz/Qk/bn+Qn+pv9Jf62/0t/o7/b3+Qf+of9I/61/0r/o3/bv+Q/+p/9J/63/0v/o/Pc5IZiQ3UhgpjVRGaiONkdZIZ6Q3MhgZjUxGZiOLkdXIZmQ3chg5jVxGbiOPkdfIZ+Q3ChgFjUJGYaOIUdQoZhQ3ShgljVJGaaOMUdYoZ5Q3KhgVjUpGZaOKUdWoZlQ3ahg1jVpGbaOOgRm4QRikAQzKoA3GYA3O4A3BEA3JkA3FUA3N0A3DMA3LsA3HcA3P8A1oICMwQiMyYkZdo55R32hgNDQaGY2NJkZTo5nR3GhhtDRaGa2NNkZbI95oZ7Q3OhgdjU5GZ6OL0dXoZnQ3ehg9jV5Gb6OP0dfoZ/Q3BhgDjUHGYGOIMdQYZgw3RhgjjVHGaGOMMdYYZ4w3JhgTjUnGZGOKMdWYZkw3ZhgzjVnGbGOO8Z8x15hnzDcWGAuNRcZiY4mx1FhmLDdWGCuNVcZqY42x1lhnrDc2GBuNTcZmY4ux1dhmbDd2GDuNXcZuY4+x19hn7DcOGAeNQ8Zh44hx1DhmHDdOGCeNU8Zp44xx1jhnnDcuGAnGReOScdm4Ylw1rhnXjRvGTeOWcdu4Y9w17hn3jQfGQ+OR8dh4YiQaT41nRpLx3HhhvDReGa+NN8Zb453x3vhgfDQ+GZ+NL8ZX45vx3fhh/DR+Gb+NP8Zf458RZyYzk5spzJRmKjO1mcZMa6Yz05sZzIxmJjOzmcXMamYzs5s5zJxmLjO3mcfMa+Yz85sFzIJmIbOwWcQsahYzi5slzJJmKbO0WcYsa5Yzy5sVzIpmJbOyWcWsalYzq5s1zJpmLbO2WcfETNwkTNIEJmXSJmOyJmfypmCKpmTKpmKqpmbqpmGapmXapmO6pmf6JjSRGZihGZkxs65Zz6xvNjAbmo3MxmYTs6nZzGxutjBbmq3M1mYbs60Zb7Yz25sdzI5mJ7Oz2cXsanYzu5s9zJ5mL7O32cfsa/Yz+5sDzIHmIHOwOcQcag4zh5sjzJHmKHO0OcYca44zx5sTzInmJHOyOcWcak4zp5szzJnmLHO2Ocf8z5xrzjPnmwvMheYic7G5xFxqLjOXmyvMleYqc7W5xlxrrjPXmxvMjeYmc7O5xdxqbjO3mzvMneYuc7e5x9xr7jP3mwfMg+Yh87B5xDxqHjOPmyfMk+Yp87R5xjxrnjPPmxfMBPOiecm8bF4xr5rXzOvmDfOmecu8bd4x75r3zPvmA/Oh+ch8bD4xE82n5jMzyXxuvjBfmq/M1+Yb8635znxvfjA/mp/Mz+YX86v5zfxu/jB/mr/M3+Yf86/5
z4yzklnJrRRWSiuVldpKY6W10lnprQxWRiuTldnKYmW1slnZrRxWTiuXldvKY+W18ln5rQJWQauQVdgqYhW1ilnFrRJWSauUVdoqY5W1ylnlrQpWRauSVdmqYlW1qlnVrRpWTauWVduqY2EWbhEWaQGLsmiLsViLs3hLsERLsmRLsVRLs3TLsEzLsmzLsVzLs3wLWsgKrNCKrJhV16pn1bcaWA2tRlZjq4nV1GpmNbdaWC2tVlZrq43V1oq32lntrQ5WR6uT1dnqYnW1ulndrR5WT6uX1dvqY/W1+ln9rQHWQGuQNdgaYg21hlnDrRHWSGuUNdoaY421xlnjrQnWRGuSNdmaYk21plnTrRnWTGuWNduaY/1nzbXmWfOtBdZCa5G12FpiLbWWWcutFdZKa5W12lpjrbXWWeutDdZGa5O12dpibbW2WdutHdZOa5e129pj7bX2WfutA9ZB65B12DpiHbWOWcetE9ZJ65R12jpjnbXOWeetC1aCddG6ZF22rlhXrWvWdeuGddO6Zd227lh3rXvWfeuB9dB6ZD22nliJ1lPrmZVkPbdeWC+tV9Zr64311npnvbc+WB+tT9Zn64v11fpmfbd+WD+tX9Zv64/11/pnxdnJ7OR2CjulncpObaex09rp7PR2BjujncnObGexs9rZ7Ox2DjunncvObeex89r57Px2AbugXcgubBexi9rF7OJ2CbukXcoubZexy9rl7PJ2BbuiXcmubFexq9rV7Op2DbumXcuubdexMRu3CZu0gU3ZtM3YrM3ZvC3Yoi3Zsq3Yqq3Zum3Ypm3Ztu3Yru3Zvg1tZAd2aEd2zK5r17Pr2w3shnYju7HdxG5qN7Ob2y3slnYru7Xdxm5rx9vt7PZ2B7uj3cnubHexu9rd7O52D7un3cvubfex+9r97P72AHugPcgebA+xh9rD7OH2CHukPcoebY+xx9rj7PH2BHuiPcmebE+xp9rT7On2DHumPcuebc+x/7Pn2vPs+fYCe6G9yF5sL7GX2svs5fYKe6W9yl5tr7HX2uvs9fYGe6O9yd5sb7G32tvs7fYOe6e9y95t77H32vvs/fYB+6B9yD5sH7GP2sfs4/YJ+6R9yj5tn7HP2ufs8/YFO8G+aF+yL9tX7Kv2Nfu6fcO+ad+yb9t37Lv2Pfu+/cB+aD+yH9tP7ET7qf3MTrKf2y/sl/Yr+7X9xn5rv7Pf2x/sj/Yn+7P9xf5qf7O/2z/sn/Yv+7f9x/5r/7PjnGROcieFk9JJ5aR20jhpnXROeieDk9HJ5GR2sjhZnWxOdieHk9PJ5eR28jh5nXxOfqeAU9Ap5BR2ijhFnWJOcaeEU9Ip5ZR2yjhlnXJOeaeCU9Gp5FR2qjhVnWpOdaeGU9Op5dR26jiYgzuEQzrAoRzaYRzW4RzeERzRkRzZURzV0RzdMRzTsRzbcRzX8RzfgQ5yAid0Iifm1HXqOfWdBk5Dp5HT2GniNHWaOc2dFk5Lp5XT2mnjtHXinXZOe6eD09Hp5HR2ujhdnW5Od6eH09Pp5fR2+jh9nX5Of2eAM9AZ5Ax2hjhDnWHOcGeEM9IZ5Yx2xjhjnXHOeGeCM9GZ5Ex2pjhTnWnOdGeGM9OZ5cx25jj/OXOdec58Z4Gz0FnkLHaWOEudZc5yZ4Wz0lnlrHbWOGuddc56Z4Oz0dnkbHa2OFudbc52Z4ez09nl7Hb2OHudfc5+54Bz0DnkHHaOOEedY85x54Rz0jnlnHbOOGedc85554KT4Fx0LjmXnSvOVeeac9254dx0bjm3nTvOXeeec9954Dx0HjmPnSdOovPUeeYkOc+dF85L55Xz2nnjvHXeOe+dD85H55Pz2fnifHW+Od+dH85P55fz2/nj/HX+OXFuMje5m8JN6aZyU7tp3LRuOje9m8HN6GZyM7tZ3KxuNje7m8PN6eZyc7t53LxuPje/W8At6BZyC7tF3KJuMbe4W8It6ZZyS7tl3LJuObe8W8Gt6FZyK7tV3KpuNbe6W8Ot
6dZya7t1XMzFXcIlXeBSLu0yLutyLu8KruhKruwqrupqru4arularu06rut6ru9CF7mBG7qRG3PruvXc+m4Dt6HbyG3sNnGbus3c5m4Lt6Xbym3ttnHbuvFuO7e928Ht6HZyO7td3K5uN7e728Pt6fZye7t93L5uP7e/O8Ad6A5yB7tD3KHuMHe4O8Id6Y5yR7tj3LHuOHe8O8Gd6E5yJ7tT3KnuNHe6O8Od6c5yZ7tz3P/cue48d767wF3oLnIXu0vcpe4yd7m7wl3prnJXu2vcte46d727wd3obnI3u1vcre42d7u7w93p7nJ3u3vcve4+d797wD3oHnIPu0fco+4x97h7wj3pnnJPu2fcs+4597x7wU1wL7qX3MvuFfeqe8297t5wb7q33NvuHfeue8+97z5wH7qP3MfuEzfRfeo+c5Pc5+4L96X7yn3tvnHfuu/c9+4H96P7yf3sfnG/ut/c7+4P96f7y/3t/nH/uv/cOC+Zl9xL4aX0UnmpvTReWi+dl97L4GX0MnmZvSxeVi+bl93L4eX0cnm5vTxeXi+fl98r4BX0CnmFvSJeUa+YV9wr4ZX0SnmlvTJeWa+cV96r4FX0KnmVvSpeVa+aV92r4dX0anm1vToe5uEe4ZEe8CiP9hiP9TiP9wRP9CRP9hRP9TRP9wzP9CzP9hzP9TzP96CHvMALvciLeXW9el59r4HX0GvkNfaaeE29Zl5zr4XX0mvltfbaeG29eK+d197r4HX0OnmdvS5eV6+b193r4fX0enm9vT5eX6+f198b4A30BnmDvSHeUG+YN9wb4Y30RnmjvTHeWG+cN96b4E30JnmTvSneVG+aN92b4c30ZnmzvTnef95cb54331vgLfQWeYu9Jd5Sb5m33FvhrfRWeau9Nd5ab5233tvgbfQ2eZu9Ld5Wb5u33dvh7fR2ebu9Pd5eb5+33zvgHfQOeYe9I95R75h33DvhnfROeae9M95Z75x33rvgJXgXvUve5TRx3lXvmnfdu+Hd9G55t7073l3vnnffe+A99B55j70nXqL31HvmJXnPvRfeS++V99p747313nnvvQ/eR++T99n74n31vnnfvR/eT++X99v74/31/nlxfjI/uZ/CT+mn8lP7afy0fjo/vZ/Bz+hn8jP7WfysfjY/u5/Dz+nn8nP7efy8fj4/v1/AL+gX8gv7RfyifjG/uF/CL+mX8kv7Zfyyfjm/vF/Br+hX8iv7VfyqfjW/ul/Dr+nX8mv7dXzMx33CJ33gUz7tMz7rcz7vC77oS77sK77qa77uG77pW77tO77re77vQx/5gR/6kR/z6/r1/Pp+A7+h38hv7Dfxm/rN/OZ+C7+l38pv7bfx2/rxfju/vd/B7+h38jv7Xfyufje/u9/D7+n38nv7ffy+fj+/vz/AH+gP8gf7Q/yh/jB/uD/CH+mP8kf7Y/yx/jh/vD/Bn+hP8if7U/yp/jR/uj/Dn+nP8mf7c/z//Ln+PH++v8Bf6C/yF/tL/KX+Mn+5v8Jf6a/yV/tr/LX+On+9v8Hf6G/yN/tb/K3+Nn+7v8Pf6e/yd/t7/L3+Pn+/f8A/6B/yD/tH/KP+Mf+4f8I/6Z/yT/tn/LP+Of+8f8FP8C/6l/zL/hX/qn/Nv+7f8G/6t/zb/h3/rn/Pv+8/8B/6j/zH/hM/0X/qP/OT/Of+C/+l/8p/7b/x3/rv/Pf+B/+j/8n/7H/xv/rf/O/+D/+n/8v/7f/x//r//DiYDCaHKWBKmAqmhmlgWpgOpocZYEaYCWaGWWBWmA1mhzlgTpgL5oZ5YF6YD+aHBWBBWAgWhkVgUVgMFoclYElYCpaGZWBZWA6WhxVgRVgJVoZVYFVYDVaHNWBNWAvWhnUgBnFIQBICSEEaMpCFHOShAEUoQRkqUIUa1KEBTWhBGzrQhR70IYQIBjCEEYzBurAerA8bwIawEWwMm8CmsBlsDlvAlrAVbA3bwLYwHraD7WEH2BF2gp1hF9gVdoPdYQ/YE/aC
vWEf2Bf2g/3hADgQDoKD4RA4FA6Dw+EIOBKOgqPhGDgWjoPj4QQ4EU6Ck+EUOBVOg9PhDDgTzoKz4Rz4H5wL58H5cAFcCBfBxXAJXAqXweVwBVwJV8HVcA1cC9fB9XAD3Ag3wc1wC9wKt8HtcAfcCXfB3XAP3Av3wf3wADwID8HD8Ag8Co/B4/AEPAlPwdPwDDwLz8Hz8AJMgBfhJXgZXoFX4TV4Hd6AN+EteBvegXfhPXgfPoAP4SP4GD6BifApfAaT4HP4Ar6Er+Br+Aa+he/ge/gBfoSf4Gf4BX6F3+B3+AP+hL/gb/gH/oX/YBxKhpKjFCglSoVSozQoLUqH0qMMKCPKhDKjLCgryoayoxwoJ8qFcqM8KC/Kh/KjAqggKoQKoyKoKCqGiqMSqCQqhUqjMqgsKofKowqoIqqEKqMqqCqqhqqjGqgmqoVqozoIQzgiEIkAohCNGMQiDvFIQCKSkIwUpCIN6chAJrKQjRzkIg/5CCKEAhSiCMVQXVQP1UcNUEPUCDVGTVBT1Aw1Ry1QS9QKtUZtUFsUj9qh9qgD6og6oc6oC+qKuqHuqAfqiXqh3qgP6ov6of5oABqIBqHBaAgaioah4WgEGolGodFoDBqLxqHxaAKaiCahyWgKmoqmoeloBpqJZqHZaA76D81F89B8tAAtRIvQYrQELUXL0HK0Aq1Eq9BqtAatRevQerQBbUSb0Ga0BW1F29B2tAPtRLvQbrQH7UX70H50AB1Eh9BhdAQdRcfQcXQCnUSn0Gl0Bp1F59B5dAEloIvoErqMrqCr6Bq6jm6gm+gWuo3uoLvoHrqPHqCH6BF6jJ6gRPQUPUNJ6Dl6gV6iV+g1eoPeonfoPfqAPqJP6DP6gr6ib+g7+oF+ol/oN/qD/qJ/KC5IFiQPUgQpg1RB6iBNkDZIF6QPMgQZg0xB5iBLkDXIFmQPcgQ5g1xB7iBPkDfIF+QPClz5fwQRFA2KBcWDEkHJoFRQOigTlA3KBeWDCkHFoFJQOagSVA2qBdWDGkHNoFZQO6gTYAEeEAEZgIAK6IAJ2IAL+EAIxEAK5EAJ1EAL9MAIzMAK7MAJ3MAL/AAGKAiCMIiCWFA3qBfUDxoEDYNGQeOgSdA0aBY0D1oELYNWQeugTdA2iA/aBe2DDkHHoFPQOegSdA26Bd2DHkHPoFfQO+gT9A36Bf2DAcHAYFAwOBgSDA2GBcODEcHIYFQwOhgTjA3GBeODCcHEYFIwOZgSTA2mBdODGcHMYFYwO5gT/BfMDeYF84MFwcJgUbA4WBIsDZYFy4MVwcpgVbA6WBOsDdYF64MNwcZgU7A52BJsDbYF24Mdwc5gV7A72BPsDfYF+4MDwcHgUHA4OBIcDY4Fx4MTwcngVHA6OBOcDc4F54MLQUJwMbgUXA6uBFeDa8H14EZwM7gV3A7uBHeDe8H94EHwMHgUPA6eBInB0+BZkBQ8D14EL4NXwevgTfA2eBe8Dz4EH4NPwefgS/A1+BZ8D34EP4Nfwe/gT/A3+BfEhcnC5GGKMGWYKkwdpgnThunC9GGGMGOYKcwcZgmzhtnC7GGOMGeYK8wd5gnzhvnC/GGBsGBYKCwcFgmLhsXC4mGJsGRYKiwdlgnLhuXC8mGFsGJYKawcVgmrhtXC6mGNsGZYK6wd1gmxEA+JkAxBSIV0yIRsyIV8KIRiKIVyqIRqqIV6aIRmaIV26IRu6IV+CEMUBmEYRmEsrBvWC+uHDcKGYaOwcdgkbBo2C5uHLcKWYauwddgmbBvGh+3C9mGHsGPYKewcdgm7ht3C7mGPsGfYK+wd9gn7hv3C/uGAcGA4KBwcDgmHhsPC4eGIcGQ4KhwdjgnHhuPC8eGEcGI4KZwcTgmnhtPC6eGMcGY4K5wdzgn/C+eG88L54YJwYbgoXBwuCZeGy8Ll4YpwZbgqXB2uCdeG68L14YZwY7gp3BxuCbeG28Lt4Y5wZ7gr3B3uCfeG+8L94YHwYHgoPBweCY+Gx8Lj4YnwZHgq
PB2eCc+G58Lz4YUwIbwYXgovh1fCq+G18Hp4I7wZ3gpvh3fCu+G98H74IHwYPgofh0/CxPBp+CxMCp+HL8KX4avwdfgmfBu+C9+HH8KP4afwc/gl/Bp+C7+HP8Kf4a/wd/gn/Bv+C+OiZFHyKEWUMkoVpY7SRGmjdFH6KEOUMcoUZY6yRFmjbFH2KEeUM8oV5Y7yRHmjfFH+qEBUMCoUFY6KREWjYlHxqERUMioVlY7KRGWjclH5qEJUMaoUVY6qRFWjalH1qEZUM6oV1Y7qRFiER0RERiCiIjpiIjbiIj4SIjGSIjlSIjXSIj0yIjOyIjtyIjfyIj+CEYqCKIyiKBbVjepF9aMGUcOoUdQ4ahI1jZpFzaMWUcuoVdQ6ahO1jeKjdlH7qEPUMeoUdY66RF2jblH3qEfUM+oV9Y76RH1BkhwfDYgGRoOiwdGQaGg0LBoejYhGRqOi0dGYaGw0LhofTYgmRpOiydGUaGo0LZoezYhmRrOi2dGc6L9objQvmh8tiBZGi6LF0ZJoabQsWh6tiFZGq6LV0ZpobbQuWh9tiDZGm6LN0ZZoa7Qt2h7tiHZGu6Ld0Z5ob7Qv2h8diA5Gh6LD0ZHoaHQsOh6diE5Gp6LT0ZnobHQuOh9diBKii9Gl6HJ0JboaXYuuRzeim9Gt6HZ0J7ob3YvuRw+ih9Gj6HH0JEqMnkbPoqToefQiehm9il5Hb6K30bvoffQh+hh9ij5HX6Kv0bfoe/Qj+hn9in5Hf6K/0b8oLpYsljyWIpYyliqWOpYmljaWLpY+liGWMZYpljmWJZY1li2WPZYjljOWK5Y7lieWN5Yvlj9WIFYwVihWOFYkVjRWLFY8ViJWMlYqVjpWJlY2Vi5WPlYhVjFWKVY5ViVWNVYtVj1WI1YzVitWO1bn/2i6p/a8mgUAoElt27Zt2+7mbNuaeWvbtpHatm3bblOb5+I8379YdwurhdXG6mB1sXpYfawB1hBrhDXGmmBNsWZYc6wF1hJrhbXG2mBtsXZYe6wD1hHrhHXGumBdsW5Yd6wH1hPDMBwjMBKjMBoDGIOxGIfxmICJmITJmIKpmIbpmIGZmIXZmIO5mIf5WICFWIRBDGExrBfWG+uD9cX6Yf2xAdhAbBA2GBuCDcWGYcOxEdhIbBQ2GhuDjcXGYeOxCdhEbBI2GZuCTcWmYdOxGdhMbBY2G5uDzcXmYfOxBdhCbBG2GFuCLcWWYcuxBGwFthJbha3G1mBrsXXYemwDthHbhG3GtmBbsW3YdmwHthPbhe3G9mB7sX3YfuwAdhA7hB3GjmBHsWPYcewEdhI7hZ3GzmBnsXPYeewCdhG7hF3GrmBXsWvYdewGdhO7hd3G7mB3sXvYfewB9hB7hD3GnmBPsWfYc+wF9hJ7hb3G3mCJ2FvsHfYe+4B9xD5hn7Ev2FfsG/Yd+4H9xH5hv7E/2F/sHxaHx+NJ8KR4Mjw5ngJPiafCU+Np8LR4Ojw9ngHPiGfCM+NZ8Kx4Njw7ngPPiefCc+N58Lx4Pjw/XgAviBfCC+NF8KJ4Mbw4XgIviZfCS+Nl8LJ4Obw8XgGviFfCK+NV8Kp4Nbw6XgOvidfCa+N18Lp4Pbw+3gBviDfCG+NN8KZ4M7w53gJvibfCW+Nt8LZ4O7w93qFbXFwc3hnvgnfFu+Hd8R54TxzDcZzASZzCaRzgDM7iHM7jAi7iEi7jCq7iGq7jBm7iFm7jDu7iHu7jAR7iEQ5xhMfwXnhvvA/eF++H98cH4APxQfhgfAg+FB+GD8dH4CPxUfhofAw+Fh+Hj8cn4BPxSfhkfAo+FZ+GT8dn4DPxWfhsfA4+F5+Hz8cX4AvxRfhifAm+FF+GL8cT8BX4SnwVvhpfg6/F1+Hr8Q34RnwTvhnfgm/Ft+Hb8R34TnwXvhvfg+/F9+H78QP4QfwQfhg/gh/Fj+HH8RP4SfwUfho/g5/Fz+Hn8Qv4RfwSfhm/gl/Fr+HX8Rv4TfwWfhu/g9/F7+H38Qf4Q/wR/hh/
gj/Fn+HP8Rf4S/wV/hp/gyfib/F3+Hv8A/4R/4R/xr/gX/Fv+Hf8B/4T/4X/xv/gf/F/eBwRTyQhkhLJiORECiIlkYpITaQh0hLpiPREBiIjkYnITGQhshLZiOxEDiInkYvITeQh8hL5iPxEAaIgUYgoTBQhihLFiOJECaIkUYooTZQhyhLliPJEBaIiUYmoTFQhqhLViOpEDaImUYuoTdQh6hL1iPpEA6Ih0YhoTDQhmhLNiOZEC6Il0YpoTbQh2hLtiPZEB6Ij0YnoTHQhuhKJ4P/gxAicIAiSoAiaAARDsARH8IRAiIREyIRCqIRG6IRBmIRF2IRDuIRH+ERAhEREQAIRMaIX0ZvoQ/Ql+hH9iQHEQGIQMZgYQgwlhhHDiRHESGIUMZoYQ4wlxhHjiQnERGISMZmYQkwlphHTiRnETGIWMZuYQ8wl5hHziQXEQmIRsZhYQiwllhHLiQRiBbGSWEWsJtYQa4l1xHpiA7GR2ERsJrYQW4ltxHZiB7GT2EXsJvYQe4l9xH7iAHGQOEQcJo4QR4ljxHHiBHGSOEWcJs4QZ4lzxHniAnGRuERcJq4QV4lrxHXiBnGTuEXcJu4Qd4l7xH3iAfGQeEQ8Jp4QT4lnxHPiBfGSeEW8Jt4QicRb4h3xnvhAfCQ+EZ+JL8RX4hvxnfhB/CR+Eb+JP8Rf4h8RR8aTScikZDIyOZmCTEmmIlOTaci0ZDoyPZmBzEhmIjOTWcisZDYyO5mDzEnmInOTeci8ZD4yP1mALEgWIguTRciiZDGyOFmCLEmWIkuTZciyZDmyPFmBrEhWIiuTVciqZDWyOlmDrEnWImuTdci6ZD2yPtmAbEg2IhuTTcimZDOyOdmCbEm2IluTbci2ZDuyPdmB7Eh2IjuTXciuZDeyO9mD7EliJE4SJElSJE0CkiFZkiN5UiBFUiJlUiFVUiN10iBN0iJt0iFd0iN9MiBDMiIhicgY2YvsTfYh+5L9yP7kAHIgOYgcTA4hh5LDyOHkCHIkOYocTY4hx5LjyPHkBHIiOYmcTE4hp5LTyOnkDHImOYucTc4h55LzyPnkAnIhuYhcTC4hl5LLyOVkArmCXEmuIleTa8i15DpyPbmB3EhuIjeTW8it5DZyO7mD3EnuIneTe8i95D5yP3mAPEgeIg+TR8ij5DHyOHmCPEmeIk+TZ8iz5DnyPHmBvEheIi+TV8ir5DXyOnmDvEneIm+Td8i75D3yPvmAfEg+Ih+TT8in5DPyOfmCfEm+Il+Tb8hE8i35jnxPfiA/kp/Iz+QX8iv5jfxO/iB/kr/I3+Qf8i/5j4yj4qkkVFIqGZWcSkGlpFJRqak0VFoqHZWeykBlpDJRmaksVFYqG5WdykHlpHJRuak8VF4qH5WfKkAVpApRhakiVFGqGFWcKkGVpEpRpakyVFmqHFWeqkBVpCpRlakqVFWqGlWdqkHVpGpRtak6VF2qHlWfakA1pBpRjakmVFOqGdWcakG1pFpRrak2VFuqHdWe6kB1pDpRnakuVFeqG9Wd6kH1pDAKpwiKpCiKpgDFUCzFUTwlUCIlUTKlUCqlUTplUCZlUTblUC7lUT4VUCEVUZBCVIzqRfWm+lB9qX5Uf2oANZAaRA2mhlBDqWHUcGoENZIaRY2mxlBjqXHUeGoCNZGaRE2mplBTqWnU9CozqJnULGo2NYeaS82j5lMLqIXUImoxtYRaSi2jllMJ1ApqJbWKWk2todZS66j11AZqI7WJ2kxtobZS26jt1A5qJ7WL2k3tofZS+6j91AHqIHWIOkwdoY5Sx6jj1AnqJHWKOk2doc5S56jz1AXqInWJukxdoa5S16jr1A3qJnWLuk3doe5S96j71APqIfWIekw9oZ5Sz6jn1AvqJfWKek29oRKpt9Q76j31gfpIfaI+U1+or9Q36jv1g/pJ/aJ+U3+ov9Q/Ko6Op5PQSelkdHI6BZ2STkWnptPQael0dHo6A52RzkRnprPQWelsdHY6B52TzkXnpvPQ
eel8dH66AF2QLkQXpovQRelidHG6BF2SLkWXpsvQZelydHm6Al2RrkRXpqvQVelqdHW6Bl2TrkXXpuvQdel6dH26Ad2QbkQ3ppvQTelmdHO6Bd2SbkW3ptvQbel2dHu6A92R7kR3prvQXeludHe6B92TxmicJmiSpmiaBjRDszRH87RAi7REy7RCq7RG67RBm7RF27RDu7RH+3RAh3REQxrRMboX3ZvuQ/el+9H96QH0QHoQPZgeQg+lh9HD6RH0SHoUPZoeQ4+lx9Hj6Qn0RHoSPZmeQk+lp9HT6Rn0THoWPZueQ8+l59Hz6QX0QnoRvZheQi+ll9HL6QR6Bb2SXkWvptfQa+l19Hp6A72R3kRvprfQW+lt9HZ6B72T3kXvpvfQe+l99H76AH2QPkQfpo/QR+lj9HH6BH2SPkWfps/QZ+lz9Hn6An2RvkRfpq/QV+lr9HX6Bn2TvkXfpu/Qd+l79H36Af2QfkQ/pp/QT+ln9HP6Bf2SfkW/pt/QifRb+h39nv5Af6Q/0Z/pL/RX+hv9nf5B/6R/0b/pP/Rf+h8dB+JBEpAUJAPJQQqQEqQCqUEakBakA+lBBpARZAKZQRaQFWQD2UEOkBPkArlBHpAX5AP5QQFQEBQChUERUBQUA8VBCVASlAKlQRlQFpQD5UEFUBFUApVBFVAVVAPVQQ1QE9QCtUEdUBfUA/VBA9AQNAKNQRPQFDQDzUEL0BK0Aq1BG9AWtAPtQQfQEXQCnUEX0BV0A91BD9ATYAAHBCABBWgAAANYwAEeCEAEEpCBAlSgAR0YwAQWsIEDXOABHwQgBBGAAIEY6AV6gz6gL+gH+oMBYCAYBAaDIWAoGAaGgxFgJBgFRoMxYCwYB8aDCWAimAQmgylgKpgGpoMZYCaYBWaDOWAumAfmgwVgIVgEFoMlYClYBpaDBLACrASrwGqwBqwF68B6sAFsBJvAZrAFbAXbwHawA+wEu8BusAfsBfvAfnAAHASHwGFwBBwFx8BxcAKcBKfAaXAGnAXnwHlwAVwEl8BlcAVcBdfAdXAD3AS3wG1wB9wF98B98AA8BI/AY/AEPAXPwHPwArwEr8Br8AYkgrfgHXgPPoCP4BP4DL6Ar+Ab+A5+gJ/gF/gN/oC/4B+IY+KZJExSJhmTnEnBpGRSMamZNExaJh2TnsnAZGQyMZmZLExWJhuTncnB5GRyMbmZPExeJh+TnynAFGQKMYWZIkxRphhTnCnBlGRKMaWZMkxZphxTnqnAVGQqMZWZKkxVphpTnanB1GRqMbWZOkxdph5Tn2nANGQaMY2ZJkxTphnTnGnBtGRaMa2ZNkxbph3TnunAdGQ6MZ2ZLkxXphvTnenB9GQwBmcIhmQohmYAwzAswzE8IzAiIzEyozAqozE6YzAmYzE24zAu4zE+EzAhEzGQQUyM6cX0ZvowfZl+TH9mADOQGcQMZoYwQ5lhzHBmBDOSGcWMZsYwY5lxzHhmAjORmcRMZqYwU5lpzHRmBjOTmcXMZuYwc5l5zHxmAbOQWcQsZpYwS5llzHImgVnBrGRWMauZNcxaZh2zntnAbGQ2MZuZLcxWZhuzndnB7GR2MbuZPcxeZh+znznAHGQOMYeZI8xR5hhznDnBnGROMaeZM8xZ5hxznrnAXGQuMZeZK8xV5hpznbnB3GRuMbeZO8xd5h5zn3nAPGQeMY+ZJ8xT5hnznHnBvGReMa+ZN0wi85Z5x7xnPjAfmU/MZ+YL85X5xnxnfjA/mV/Mb+YP85f5x8Sx8WwSNimbjE3OpmBTsqnY1GwaNi2bjk3PZmAzspnYzGwWNiubjc3O5mBzsrnY3GweNi+bj83PFmALsoXYwmwRtihbjC3OlmBLsqXY0mwZtixbji3PVmArspXYymwVtipbja3O1mBrsrXY2mwdti5bj63PNmAbso3YxmwTtinbjG3OtmBbsq3Y1mwbti3bjm3PdmA7sp3YzmwXtivbje3O9mB7shiLswRLshRLs4BlWJblWJ4V
WJGVWJlVWJXVWJ01WJO1WJt1WJf1WJ8N2JCNWMgiNsb2Ynuzfdi+bD+2PzuAHcgOYgezQ9ih7DB2ODuCHcmOYkezY9ix7Dh2PDuBnchOYiezU9ip7DR2OjuDncnOYmezc9i57Dx2PruAXcguYhezS9il7DJ2OZvArmBXsqvY1ewadi27jl3PbmA3spvYzewWdiu7jd3O7mB3srvY3ewedi+7j93PHmAPsofYw+wR9ih7jD3OnmBPsqfY0+wZ9ix7jj3PXmAvspfYy+wV9ip7jb3O3mBvsrfY2+wd9i57j73PPmAfso/Yx+wT9in7jH3OvmBfsq/Y1+wbNpF9y75j37Mf2I/sJ/Yz+4X9yn5jv7M/2J/sL/Y3+4f9y/5j47h4LgmXlEvGJedScCm5VFxqLg2XlkvHpecycBm5TFxmLguXlcvGZedycDm5XFxuLg+Xl8vH5ecKcAW5Qlxh7r+jvwRXkivFlebKcGW5clx5rgJXkavEVeaqcFW5alx1rgZXk6vF1ebqcHW5elx9rgHXkGvENeaacE25ZlxzrgXXkmvFtebacG25dlx7rgPXkevEdea6cF25blx3rgfXk8M4nCM4kqM4mgMcw7Ecx/GcwImcxMmcwqmcxumcwZmcxdmcw7mcx/lcwIVcxEEOcTGuF9eb68P15fpx/bkB3EBuEDeYG8IN5YZxw7kR3EhuFDeaG8ON5cZx47kJ3ERuEjeZm8JN5aZx07kZ3ExuFjebm8PN5eZx87kF3EJuEbeYW8It5ZZxy7kEbgW3klvFrebWcGu5ddx6bgO3kdvEbea2cFu5bdx2bge3k9vF7eb2cHu5fdx+7gB3kDvEHeaOcEe5Y9xx7gR3kjvFnebOcGe5c9x57gJ3kbvEXeaucFe5a9x17gZ3k7vF3ebucHe5e9x97gH3kHvEPeaecE+5Z9xz7gX3knvFvebecIncW+4d9577wH3kPnGfuS/cV+4b9537wf3kfnG/uT/cX+4fF8fH80n4pHwyPjmfgk/Jp+JT82n4tHw6Pj2fgc/IZ+Iz81n4rHw2Pjufg8/J5+Jz83n4vHw+Pj9fgC/IF+IL80X4onwxvjhfgi/Jl+JL82X4snw5vjxfga/IV+Ir81X4qnw1vjpfg6/J1+Jr83X4unw9vj7fgG/IN+Ib8034pnwzvjnfgm/Jt+Jb8234tnw7vj3fge/Id+I78134rnw3vjvfg+/JYzzOEzzJUzzNA57hWZ7jeV7gRV7iZV7hVV7jdd7gTd7ibd7hXd7jfT7gQz7iIY/4GN+L78334fvy/fj+/AB+ID+IH8wP4Yfyw/jh/Ah+JD+KH82P4cfy4/jx/AR+Ij+Jn8xP4afy0/jp/Ax+Jj+Ln83P4efy8/j5/AJ+Ib+IX8wv4Zfyy/jlfAK/gl/Jr+JX82v4tfw6fj2/gd/Ib+I381v4rfw2fju/g9/J7+J383v4vfw+fj9/gD/IH+IP80f4o/wx/jh/gj/Jn+JP82f4s/w5/jx/gb/IX+Iv81f4q/w1/jp/g7/J3+Jv83f4u/w9/j7/gH/IP+If80/4p/wz/jn/gn/Jv+Jf82/4RP4t/45/z3/gP/Kf+M/8F/4r/43/zv/gf/K/+N/8H/4v/4+PE+KFJEJSIZmQXEghpBRSCamFNEJaIZ2QXsggZBQyCZmFLEJWIZuQXcgh5BRyCbmFPEJeIZ+QXyggFBQKCYWFIkJRoZhQXCghlBRKCaWFMkJZoZxQXqggVBQqCZWFKkJVoZpQXagh1BRqCbWFOkJdoZ5QX2ggNBQaCY2FJkJToZnQXGghtBRaCa2FNkJboZ3QXuggdBQ6CZ2FLkJXoZvQXegh9BQwARcIgRQogRaAwAiswAm8IAiiIAmyoAiqoAm6YAimYAm24Aiu4Am+EAihEAlQQEJM6CX0FvoIfYV+Qn9hgDBQGCQMFoYIQ4VhwnBhhDBSGCWMFsYIY4VxwnhhgjBRmCRMFqYIU4VpwnRhhjBTmCXMFuYIc4V5wnxh
gbBQWCQsFpYIS4VlwnIhQVghrBRWCauFNcJaYZ2wXtggbBQ2CZuFLcJWYZuwXdgh7BR2CbuFPcJeYZ+wXzggHBQOCYeFI8JR4ZhwXDghnBROCaeFM8JZ4ZxwXrggXBQuCZeFK8JV4ZpwXbgh3BRuCbeFO8Jd4Z5wX3ggPBQeCY+FJ8JT4ZnwXHghvBReCa+FN0Ki8FZ4J7wXPggfhU/CZ+GL8FX4JnwXfgg/hV/Cb+GP8Ff4J8SJ8WISMamYTEwuphBTiqnE1GIaMa2YTkwvZhAzipnEzGIWMauYTcwu5hBzirnE3GIeMa+YT8wvFhALioXEwmIRsahYTCwulhBLiqXE0mIZsaxYTiwvVhAripXEymIVsapYTawu1hBrirXE2mIdsa5YT6wvNhAbio3ExmITsanYTGwuthBbiq0SiottxLZiO7G92EHsKHYSO4tdxK5iN7G72EPsKWIiLhIiKVIiLQKREVmRE3lREEVREmVREVVRE3XREE3REm3REV3RE30xEEMxEqGIxJjYS+wt9hH7iv3E/uIAcaA4SBwsDhGHisPE4eIIcaQ4ShwtjhHHiuPE8eIEcaI4SZwsThGnitPE6eIMcaY4S5wtzhHnivPE+eICcaG4SFwsLhGXisvE5WKCuEJcKa4SV4trxLXiOnG9uEHcKG4SN4tbxK3iNnG7uEPcKe4Sd4t7xL3iPnG/eEA8KB4SD4tHxKPiMfG4eEI8KZ4S48Uz4lnxnHhevCBeFC+Jl8Ur4lXxmnhdvCHeFG+Jt8U74l3xnnhffCA+FB+Jj8Un4lPxmfhcfCG+FF+Jr8U3YqL4Vnwnvhc/iB/FT+Jn8Yv4Vfwmfhd/iD/FX+Jv8Y/4V/wnxknxUhIpqZRMSi6lkFJKqaTUUhoprZROSi9lkDJKmaTMUhYpq5RNyi7lkHJKuaTcUh4pr5RPyi8VkApKhaTCUhGpqFRMKi6VkEpKpaTSUhmprFROKi9VkCpKlaTKUhWpqlRNqi7VkGpKtaTaUh2prlRPqi81kBpKjaTGUhOpqdRMai61kFpKraTWUhuprdROai91kDpKnaTOUhepq9RN6i71kHpKmIRLhERKlERLQGIkVuIkXhIkUZIkWVIkVdIkXTIkU7IkW3IkV/IkXwqkUIokKCEpJvWSekt9pL5SP6m/NEAaKA2SBktDpKHSMGm4NEIaKY2SRktjpLHSOGm8NEGaKE2SJktTpKnSNGm6NEOaKc2SZktzpLnSPGm+tEBaKC2SFktLpKXSMmm5lCCtkFZKq6TV0hpprbROWi9tkDZKm6TN0hZpq7RN2i7tkHZKu6Td0h5pr7RP2i8dkA5Kh6TD0hHpqHRMOi6dkE5Kp6TT0hnprHROOi9dkC5Kl6TL0hXpqnRNui7dkG5Kt6Tb0h3prnRPui89kB5Kj6TH0hPpqfRMei69kF5Kr6TX0hspUXorvZPeSx+kj9In6bP0RfoqfZO+Sz+kn9Iv6bf0R/or/ZPi5Hg5iZxUTiYnl1PIKeVUcmo5jZxWTienlzPIGeVMcmY5i5xVziZnl3PIOeVccm45j5xXzifnlwvIBeVCcmG5iFxULiYXl0vIJeVScmm5jFxWLieXlyvIFeVKcmW5ilxVriZXl2vINeVacm25jlxXrifXlxvIDeVGcmO5idxUbiY3l1vILeVWcmu5jdxWbie3lzvIHeVOcme5i9xV7iZ3l3vIPWVMxmVCJmVKpmUgMzIrczIvC7IoS7IsK7Iqa7IuG7IpW7ItO7Ire7IvB3IoRzKUkRyTe8m95T5yX7mf3F8eIA+UB8mD5SHyUHmYPFweIY+UR8mj5THyWHmcPF6eIE+UJ8mT5SnyVHmaPF2eIc+UZ8mz5TnyXHmePF9eIC+UF8mL5SXyUnmZvFxOkFfIK+VV8mp5jbxWXievlzfIG+VN8mZ5i7xV3iZvl3fIO+Vd8m55j7xX3ifvlw/IB+VD8mH5iHxUPiYfl0/IJ+VT8mn5jHxWPiefly/IF+VL8mX5
inxVviZfl2/IN+Vb8m35jnxXvifflx/ID+VH8mP5ifxUfiY/l1/IL+VX8mv5jZwov5Xfye/lD/JH+ZP8Wf4if5W/yd/lH/JP+Zf8W/4j/5X/yXFKvJJESaokU5IrKZSUSioltZJGSaukU9IrGZSMSiYls5JFyapkU7IrOZScSi4lt5JHyavkU/IrBZSCSiGlsFJEKaoUU4orJZSSSimltFJGKauUU8orFZSKSiWlslJFqapUU6orNZSaSi2ltlJHqavUU+orDZSGSiOlsdJEaao0U5orLZSWSiultdJGaau0U9orHZSOSiels9JF6ap0U7orPZSeCqbgCqGQCqXQClAYhVU4hVcERVQkRVYURVU0RVcMxVQsxVYcxVU8xVcCJVQiBSpIiSm9lN5KH6Wv0k/prwxQBiqDlMHKEGWoMkwZroxQRiqjlNHKGGWsMk4Zr0xQJiqTlMnKFGWqMk2ZrsxQZiqzlNnKHGWuMk+ZryxQFiqLlMXKEmWpskxZriQoK5SVyipltbJGWausU9YrG5SNyiZls7JF2apsU7YrO5Sdyi5lt7JH2avsU/YrB5SDyiHlsHJEOaocU44rJ5STyinltHJGOaucU84rF5SLyiXlsnJFuapcU64rN5Sbyi3ltnJHuavcU+4rD5SHyiPlsfJEeao8U54rL5SXyivltfJGSVTeKu+U98oH5aPySfmsfFG+Kt+U78oP5afyS/mt/FH+Kv+UODVeTaImVZOpydUUako1lZpaTaOmVdOp6dUMakY1k5pZzaJmVbOp2dUcak41l5pbzaPmVfOp+dUCakG1kFpYLaIWVYupxdUSakm1lFpaLaOWVcup5dUKakW1klpZraJWVaup1dUaak21llpbraPWVeup9dUGakO1kdpYbaI2VZupzdUWaku1ldpabaO2Vdup7dUOake1k9pZ7aJ2Vbup3dUeak8VU3GVUEmVUmkVqIzKqpzKq4IqqpIqq4qqqpqqq4ZqqpZqq47qqp7qq4EaqpEKVaTG1F5qb7WP2lftp/ZXB6gD1UHqYHWIOlQdpg5XR6gj1VHqaHWMOlYdp45XJ6gT1UnqZHWKOlWdpk5XZ6gz1VnqbHWOOledp85XF6gL1UXqYnWJulRdpi5XE9QV6kp1lbpaXaOuVdep69UN6kZ1k7pZ3aJuVbep29Ud6k51l7pb3aPuVfep+9UD6kH1kHpYPaIeVY+px9UT6kn1lHpaPaOeVc+p59UL6kX1knpZvaJeVa+p19Ub6k31lnpbvaPeVe+p99UH6kP1kfpYfaI+VZ+pz9UX6kv1lfpafaMmqm/Vd+p79YP6Uf2kfla/qF/Vb+p39Yf6U/2l/lb/qH/Vf2qcFq8l0ZJqybTkWgotpZZKS62l0dJq6bT0WgYto5ZJy6xl0bJq2bTsWg4tp5ZLy63l0fJq+bT8WgGtoFZIK6wV0YpqxbTiWgmtpFZKK62V0cpq5bTyWgWtolZJq6xV0apq1bTqWg2tplZLq63V0epq9bT6WgOtodZIa6w10ZpqzbTmWgutpdZKa6210dpq7bT2Wgeto9ZJ66x10bpq3bTuWg+tp4ZpuEZopEZptAY0RmM1TuM1QRM1SZM1RVM1TdM1QzM1S7M1R3M1T/O1QAu1SIMa0mJaL6231kfrq/XT+msDtIHaIG2wNkQbqg3ThmsjtJHaKG20NkYbq43TxmsTtInaJG2yNkWbqk3TpmsztJnaLG22Nkebq83T5msLtIXaIm2xtkRbqi3TlmsJ2gptpbZKW62t0dZq67T12gZto7ZJ26xt0bZq27Tt2g5tp7ZL263t0fZq+7T92gHtoHZIO6wd0Y5qx7Tj2gntpHZKO62d0c5q57Tz2gXtonZJu6xd0a5q17Tr2g3tpnZLu63d0e5q97T72gPtofZIe6w90Z5qz7Tn2gvtpfZKe6290RK1t9o77b32QfuofdI+a1+0r9o37bv2Q/up/dJ+a3+0v9o/LU6P15PoSfVkenI9hZ5ST6Wn
1tPoafV0eno9g55Rz6Rn1rPoWfVsenY9h55Tz6Xn1vPoefV8en69gF5QL6QX1ovoRfVienG9hF5SL6WX1svoZfVyenm9gl5Rr6RX1qvoVfVqenW9hl5Tr6XX1uvodfV6en29gd5Qb6Q31pvoTfVmenO9hd5Sb6W31tvobfV2enu9g95R76R31rvoXfVuene9h95Tx3RcJ3RSp3RaBzqjszqn87qgi7qky7qiq7qm67qhm7ql27qju7qn+3qgh3qkQx3pMb2X3lvvo/fV++n99QH6QH2QPlgfog/Vh+nD9RH6SH2UPlofo4/Vx+nj9Qn6RH2SPlmfok/Vp+nT9Rn6TH2WPlufo8/V5+nz9QX6Qn2Rvlhfoi/Vl+nL9QR9hb5SX6Wv1tfoa/V1+np9g75R36Rv1rfoW/Vt+nZ9h75T36Xv1vfoe/V9+n79gH5QP6Qf1o/oR/Vj+nH9hH5SP6Wf1s/oZ/Vz+nn9gn5Rv6Rf1q/oV/Vr+nX9hn5Tv6Xf1u/od/V7+n39gf5Qf6Q/1p/oT/Vn+nP9hf5Sf6W/1t/oifpb/Z3+Xv+gf9Q/6Z/1L/pX/Zv+Xf+h/9R/6b/1P/pf/Z8eZ8QbSYykRjIjuZHCSGmkMlIbaYy0RjojvZHByGhkMjIbWYysRjYju5HDyGnkMnIbeYy8Rj4jv1HAKGgUMgobRYyiRjGjuFHCKGmUMkobZYyyRjmjvFHBqGhUMiobVYyqRjWjulHDqGnUMmobdYy6Rj2jvtHAaGg0MhobTYymRjOjudHCaGm0MlobbYy2RjujvdHB6Gh0MjobXYyuRjeju9HD6GlgBm4QBmlQBm0AgzFYgzN4QzBEQzJkQzFUQzN0wzBMwzJswzFcwzN8IzBCIzKggYyY0cvobfQx+hr9jP7GAGOgMcgYbAwxhhrDjOHGCGOkMcoYbYwxxhrjko83JhgTjUnGZGOKMdWYZkw3ZhgzjVnGbGOOMdeYZ8w3FhgLjUXGYmOJsdRYZiw3EowVxkpjlbHaWGOsNdYZ640NxkZjk7HZ2GJsNbYZ240dxk5jl7Hb2GPsNfYZ+40DxkHjkHHYOGIcNY4Zx40TxknjlHHaOGOcNc4Z540LxkXjknHZuGJcNa4Z140bxk3jlnHbuGPcNe4Z940HxkPjkfHYeGI8NZ4Zz40XxkvjlfHaeGMkGm+Nd8Z744Px0fhkfDa+GF+Nb8Z344fx0/hl/Db+GH+Nf0acGW8mMZOayczkZgozpZnKTG2mMdOa6cz0ZgYzo5nJzGxmMbOa2czsZg4zp5nLzG3mMfOa+cz8ZgGzoFnILGwWMYuaxcziZgmzpFnKLG2WMcua5czyZgWzolnJrGxWMaua1czqZg2zplnLrG3WMeua9cz6ZgOzodnIbGw2MZuazczmZguzpdnKbG22Mdua7cz2Zgezo9nJ7Gx2Mbua3czuZg+zp4mZuEmYpEmZtAlMxmRNzuRNwRRNyZRNxVRNzdRNwzRNy7RNx3RNz/TNwAzNyIQmMmNmL7O32cfsa/Yz+5sDzIHmIHOwOcQcag4zh5sjzJHmKHO0OcYca44zx5sTzInmJHOyOcWcak4zp5szzJnmLHO2Oceca84z55sLzIXmInOxucRcai4zl5sJ5gpzpbnKXG2uMdea68z15gZzo7nJ3GxuMbea28zt5g5zp7nL3G3uMfea+8z95gHzoHnIPGweMY+ax8zj5gnzpHnKPG2eMc+a58zz5gXzonnJvGxeMa+a18zr5g3zpnnLvG3eMe+a98z75gPzofnIfGw+MZ+az8zn5gvzpfnKfG2+MRPNt+Y78735wfxofjI/m1/Mr+Y387v5w/xp/jJ/m3/Mv+Y/M86Kt5JYSa1kVnIrhZXSSmWlttJYaa10Vnorg5XRymRltrJYWa1sVnYrh5XTymXltvJYea18Vn6rgFXQKmQVtopYRa1iVnGrhFXSKmWVtspYZa1yVnmrglXRqmRVtqpYVa1qVnWrhlXTqmXVtupYda16Vn2r
gdXQamQ1tppYTa1mVnOrhdXSamW1ttpYba12Vnurg9XR6mR1trpYXa1uVnerh9XTwizcIizSoizaAhZjsRZn8ZZgiZZkyZZiqZZm6ZZhmZZl2ZZjuZZn+VZghVZkQQtZMauX1dvqY/W1+ln9rQHWQGuQNdgaYg21hlnDrRHWSGuUNdoaY421xlnjrQnWRGuSNdmaYk21plnTrRnWTGuWNduaY8215lnzrQXWQmuRtdhaYi21llnLrQRrhbXSWmWtttZYa6111nprg7XR2mRttrZYW61t1nZrh7XT2mXttvZYe6191n7rgHXQOmQdto5YR61j1nHrhHXSOmWdts5YZ61z1nnrgnXRumRdtq5YV61r1nXrhnXTumXdtu5Yd6171n3rgfXQemQ9tp5YT61n1nPrhfXSemW9tt5YidZb65313vpgfbQ+WZ+tL9ZX65v13fph/bR+Wb+tP9Zf658VZ8fbSeykdjI7uZ3CTmmnslPbaey0djo7vZ3BzmhnsjPbWeysdjY7u53DzmnnsnPbeey8dj47v13ALmgXsgvbReyidjG7uF3CLmmXskvbZeyydjm7vF3BrmhXsivbVeyqdjW7ul3DrmnXsmvbdey6dj27vt3Abmg3shvbTeymdjO7ud3Cbmm3slvbbey2dju7vd3B7mh3sjvbXeyudje7u93D7mljNm4TNmlTNm0Dm7FZm7N5W7BFW7JlW7FVW7N127BN27Jt27Fd27N9O7BDO7KhjeyY3cvubfex+9r97P72AHugPcgebA+xh9rD7OH2CHukPcoebY+xx9rj7PH2BHuiPcmebE+xp9rT7On2DHumPcuebc+x59rz7Pn2AnuhvchebC+xl9rL7OV2gr3CXmmvslfba+y19jp7vb3B3mhvsjfbW+yt9jZ7u73D3mnvsnfbe+y99j57v33APmgfsg/bR+yj9jH7uH3CPmmfsk/bZ+yz9jn7vH3Bvmhfsi/bV+yr9jX7un3Dvmnfsm/bd+y79j37vv3Afmg/sh/bT+yn9jP7uf3Cfmm/sl/bb+xE+639zn5vf7A/2p/sz/YX+6v9zf5u/7B/2r/s3/Yf+6/9z45z4p0kTlInmZPcSeGkdFI5qZ00TlonnZPeyeBkdDI5mZ0sTlYnm5PdyeHkdHI5uZ08Tl4nn5PfKeAUdAo5hZ0iTlGnmFPcKeGUdEo5pZ0yTlmnnFPeqeBUdCo5lZ0qTlWnmlPdqeHUdGo5tZ06Tl2nnlPfaeA0jI+Li3OaOE2dZk5zp4XT0mnltHbaOG2ddk57p4PT0enkdHa6OF2dbk53p4fT08Ec3CEc0qEc2gEO47AO5/CO4IiO5MiO4qiO5uiO4ZiO5diO47iO5/hO4IRO5EAHOTGnl9Pb6eP0dfo5/Z0BzkBnkDPYGeIMdYY5w50RzkhnlDPaGeOMdcY5450JzkRnkjPZmeJMdaY5050ZzkxnljPbmePMdeY5850FzkJnkbPYWeIsdZY5y50EZ4Wz0lnlrHbWOGuddc56Z4Oz0dnkbHa2OFudbc52Z4ez09nl7Hb2OHudfc5+54Bz0DnkHHaOOEedY85x54Rz0jnlnHbOOGedc85554Jz0bnkXHauOFeda85154Zz07nl3HbuOHede85954Hz0HnkPHaeOE+dZ85z54Xz0nnlvHbeOInOW+ed89754Hx0PjmfnS/OV+eb89354fx0fjm/nT/OX+efE+fGu0ncpG4yN7mbwk3ppnJTu2nctG46N72bwc3oZnIzu1ncrG42N7ubw83p5nJzu3ncvG4+N79bwC3oFnILu0Xcom4xt7hbwi3plnJLu2Xcsm45t7xbwa3oVnIru1Xcqm41t7pbw63p1nJru3Xcum49t77bwG3oNnIbu03cpm4zt7nbwm3ptnJbu23ctm47t73bwe3odnI7u13crm43t7vbw+3pYi7uEi7pUi7tApdxWZdzeVdwRVdyZVdxVVdzdddwTddybddxXddzfTdwQzdyoYvcmNvL
7e32cfu6/dz+7gB3oDvIHewOcYe6w9zh7gh3pDvKHe2Occe649zx7gR3ojvJnexOcae609zp7gx3pjvLne3Ocee689z57gJ3obvIXewucZe6y9zlboK7wl3prnJXu2vcte46d727wd3obnI3u1vcre42d7u7w93p7nJ3u3vcve4+d797wD3oHnIPu0fco+4x97h7wj3pnnJPu2fcs+4597x7wb3oXnIvu1fcq+4197p7w73p3nJvu3fcu+499777wH3oPnIfu0/cp+4z97n7wn3pvnJfu2/cRPet+859735wP7qf3M/uF/er+8397v5wf7q/3N/uH/ev+8+N8+K9JF5SL5mX3EvhpfRSeam9NF5aL52X3svgZfQyeZm9LF5WL5uX3cvh5fRyebm9PF5eL5+X3yvgFfQKeYW9Il5Rr5hX3CvhlfRKeaW9Ml5Zr5xX3qvgVfQqeZW9Kl5Vr5pX3avh1fRqebW9Ol5dr55X32vgNfQaeY29Jl5Tr5nX3GvhtfRaea29Nl5br53X3uvgdfQ6eZ29Ll5Xr5vX3evh9fQwD/cIj/Qoj/aAx3isx3m8J3iiJ3myp3iqp3m6Z3imZ3m253iu53m+F3ihF3nQQ17M6+X19vp4fb1+Xn9vgDfQG+QN9oZ4Q71h3nBvhDfSG+WN9sZ4Y71x3nhvgjfRm+RN9qZ4U71p3nRvhjfTm+XN9uZ4c7153nxvgbfQW+Qt9pZ4S71l3nIvwVvhrfRWeau9Nd5ab5233tvgbfQ2eZu9Ld5Wb5u33dvh7fR2ebu9Pd5eb5+33zvgHfQOeYe9I95R75h33DvhnfROeae9M95Z75x33rvgXfQueZe9K95V75p33bvh3fRuebe9O95d755333vgPfQeeY+9J95T75n33HvhvfReea+9N16i99Z75733PngfvU/eZ++L99X75n33fng/vV/eb++P99f758X58X4SP6mfzE/up/BT+qn81H4aP62fzk/vZ/Az+pn8zH4WP6ufzc/u5/Bz+rn83H4eP6+fz8/vF/AL+oX8wn4Rv6hfzC/ul/BL+qX80n4Zv6xfzi/vV/Ar+pX8yn4Vv6pfza/u1/Br+rX82n4dv65fz6/vN/Ab+o38xn4Tv6nfzG/ut/Bb+q381n4bv63fzm/vd/A7+p38zn4Xv6vfze/u9/B7+piP+4RP+pRP+8BnfNbnfN4XfNGXfNlXfNXXfN03fNO3fNt3fNf3fN8P/NCPfOgjP+b38nv7ffy+fj+/vz/AH+gP8gf7Q/yh/jB/uD/CH+mP8kf7Y/yx/jh/vD/Bn+hP8if7U/yp/jR/uj/Dn+nP8mf7c/y5/jx/vr/AX+gv8hf7S/yl/jJ/uZ/gr/BX+qv81f4af62/zl/vb/A3+pv8zf4Wf6u/zd/u7/B3+rv83f4ef6+/z9/vH/AP+of8w/4R/6h/zD/un/BP+qf80/4Z/6x/zj/vX/Av+pf8y/4V/6p/zb/u3/Bv+rdS3Pbv+Hf9e/59/4H/0H/kP/af+E/9Z/5z/4X/0n/lv/bf+In+W/+d/97/4H/0P/mf/S/+V/+b/93/4f/0f/m//T/+X/+fHxfEB0mCpEGyIHmQIkgZpApSB2mCtEG6IH2QIcgYZAoyB1mCrEG2IHuQI8gZ5ApyB3mCvEG+IH9QICgYFAoKB0WCokGxoHhQIigZlApKB2WCskG5oHxQIagYVAoqB1WCqkG1oHpQI6gZ1ApqB3WCukG9oH7QIGgYNAoaB02CpkGzoHnQImgZtApaB22CtkG7oH3QIegYdAo6B12CrkG3oHvQI+gZYAEeEAEZUAEdgIAJ2IAL+EAIxEAK5EAJ1EAL9MAIzMAK7MAJ3MAL/CAIwiAKYICCWNAr6B30CfoG/YL+wYBgYDAoGBwMCYYGw4LhwYhgZDAqGB2MCcYG44LxwYRgYjApmBxMCaYG04LpwYxgZjArmB3MCeYG84L5wYJgYbAoWBwsCZYGy4LlQUKwIlgZrApWB2uCtcG6
YH2wIdgYbAo2B1uCrcG2YHuwI9gZ7Ap2B3uCvcG+YH9wIDgYHAoOB0eCo8Gx4HhwIjgZnApOB2eCs8G54HxwIbgYXAouB1eCq8G14HpwI7gZ3ApuB3eCu8G94H7wIHgYPAoeB0+Cp8Gz4HnwIngZvApeB2+CxOBt8C54H3wIPgafgs/Bl+Br8C34HvwIfga/gt/Bn+Bv8C+IC+PDJGHSMFmYPEwRpgxThanDNGHaMF2YPswQZgwzhZnDLGHWMFuYPcwR5gxzhbnDPGHeMF+YPywQFgwLhYXDImHRsFhYPCwRlgxLhaXDMmHZsFxYPqwQVgwrhZXDKmHVsFpYPawR1gxrhbXDOmHdsF5YP2wQNgwbhY3DJmHTsFnYPGwRtgxbha3DNmHbsF3YPuwQdgw7hZ3DLmHXsFvYPewR9gyxEA+JkAypkA5ByIRsyIV8KIRiKIVyqIRqqIV6aIRmaIV26IRu6IV+GIRhGIUwRGEs7BX2DvuEfcN+Yf9wQDgwHBQODoeEQ8Nh4fBwRDgyHBWODseEY8Nx4fhwQjgxnBRODqeEU8Np4fRwRjgznBXODueEc8N54fxwQbgwXBQuDpeES8Nl4fIwIVwRrgxXhavDNeHacF24PtwQbgw3hZvDLeHWcFu4PdwR7gx3hbvDPeHecF+4PzwQHgwPhYfDI+HR8Fh4PDwRngxPhafDM+HZ8Fx4PrwQXgwvhZfDK+HV8Fp4PbwR3gxvhbfDO+Hd8F54P3wQPgwfhY/DJ+HT8Fn4PHwRvgxfha/DN2Fi+DZ8F74PP4Qfw0/h5/BL+DX8Fn4Pf4Q/w1/h7/BP+Df8F8ZF8VGSKGmULEoepYhSRqmi1FGaKG2ULkofZYgyRpmizFGWKGuULcoe5YhyRrmi3FGeKG+UL8ofFYgKRoWiwlGRqGhULCoelYhKRqWi0lGZqGxULiofVYgqRpWiylGVqGpULaoe1YhqRrWi2lGdqG5UL6ofNYgaRo2ixlGTqGnULGoetYhaRq2i1lGbqG3ULmofdYg6Rp2izlGXqGvULeoe9Yh6RliER0RERlRERyBiIjbiIj4SIjGSIjlSIjXSIj0yIjOyIjtyIjfyIj8KojCKIhihKBb1inpHfaK+Ub+ofzQgGhgNigZHQ6Kh0bBoeDQiGhmNikZHY6Kx0bhofDQhmhhNiiZHU6Kp0bRoejQjmhnNimZHc6K50bxofrQgWhgtihZHS6Kl0bJoeZQQrYhWRqui1dGaaG20LlofbYg2RpuizdGWaGu0Ldoe7Yh2Rrui3dGeaG+0L9ofHYgORoeiw9GR6Gh0LDoenYhORqei09GZ6Gx0LjofXYguRpeiy9GV6Gp0Lboe3YhuRrei29Gd6G50L7ofPYgeRo+ix9GT6Gn0LHoevYheRq+i19GbKDF6G72L3kcfoo/Rp+hz9CX6Gn2Lvkc/op/Rr+h39Cf6G/2L4mA8TAKTwmQwOUwBU8JUMDVMA9PCdDA9zAAzwkwwM8wCs8JsMDvMAXPCXDA3zAPzwnwwPywAC8JCsDAsAovCYrA4LAFLwlKwNCwDy8JysDysACvCSrAyrAKrwmqwOqwBa8JasDasA+vCerA+bAAbwkawMWwCm8JmsDlsAVvCVrA1bAPbwnawPewAO8JOsDPsArvCbrA77AF7QgzikIAkpCANAWQgCznIQwGKUIIyVKAKNahDA5rQgjZ0oAs96MMAhjCCECIYg71gb9gH9oX9YH84AA6Eg+BgOAQOhcPgcDgCjoSj4Gg4Bo6F4+B4OAFOhJPgZDgFToXT4HQ4A86Es+BsOAfOhfPgfLgALoSL4GK4BC6Fy+BymABXwJVwFVwN18C1cB1cDzfAjXAT3Ay3wK1wG9wOd8CdcBfcDffAvXAf3A8PwIPwEDwMj8Cj8Bg8Dk/Ak/AUPA3PwLPwHDwPL8CL8BK8DK/Aq/AavA5vwJvwFrwN78C78B68Dx/Ah/ARfAyfwKfwGXwOX8CX8BV8Dd/ARPgWvoPv4Qf4EX6C
n+EX+BV+g9/hD/gT/oK/4R/4F/6DcSgeJUFJUTKUHKVAKVEqlBqlQWlROpQeZUAZUSaUGWVBWVE2lB3lQDlRLpQb5UF5UT6UHxVABVEhVBgVQUVRMVQclUAlUSlUGpVBZVE5VB5VQBVRJVQZVUFVUTVUHdVANVEtVBvVQXVRPVQfNUANUSPUGDVBTVEz1By1QC1RK9QatUFtUTvUHnVAHVEn1Bl1QV1RN9Qd9UA9EYZwRCASUYhGADGIRRzikYBEJCEZKUhFGtKRgUxkIRs5yEUe8lGAQhQhiBCKoV6oN+qD+qJ+qD8agAaiQWgwGoKGomFoOBqBRqJRaDQag8aicWg8moAmokloMpqCpqJpaDqagWaiWWg2moPmonloPlqAFqJFaDFagpaiZWg5SkAr0Eq0Cq1Ga9BatA6tRxvQRrQJbUZb0Fa0DW1HO9BOtAvtRnvQXrQP7UcH0EF0CB1GR9BRdAwdRyfQSXQKnUZn0Fl0Dp1HF9BFdAldRlfQVXQNXUc30E10C91Gd9BddA/dRw/QQ/QIPUZP0FP0DD1HL9BL9Aq9Rm9QInqL3qH36AP6iD6hz+gL+oq+oe/oB/qJfqHf6A/6i/6huFh8LEksaSxZLHksRSxlLFUsdSxNLG0sXSx9LEMsYyxTLHMsSyxrLFsseyxHLGcsVyx3LE8sbyxfLH+sQKxgrFCscKxIrGisWKx4rESsZKxUrHSsTKxsrFysfKxCrGKsUqxyrEqsaqxarHqsRqxmrFasdqxOrG6sXqz+/0i4x8W8mgQAwClS205tW19t27aN45k5tvHWtm3btm0bafbH3siDNcAaYo2wxlgTrCnWDGuOtcBaYq2w1lgbrC3WDmuPdcA6Yp2wzlgXrCvWDeuO9cB6Yr2w3lgfrC/WD+uPDcAGYoOwwdgQbCg2DBuOjcBGYqOw0dgYbCw2DhuPTcAmYpOwydgUbCo2DZuOzcBmYhiGYwRGYhRGYwwGMIghjMU4jMcETMQkTMYUTMU0TMcMzMQszMYczMU8zMcCLMQiLIbNwmZjc7C52DxsPrYAW4gtwhZjS7Cl2DJsObYCW4mtwlZja7C12DpsPbYB24htwjZjW7Ct2DZsO7YD24ntwnZje7C92D5sP3YAO4gdwg5jR7Cj2DHsOHYCO4mdwk5jZ7Cz2DnsPHYBu4hdwi5jV7Cr2DXsOnYDu4ndwm5jd7C72D3sPvYAe4g9wh5jT7Cn2DPsOfYCe4m9wl5jb7C32DvsPfYB+4h9wj5jX7Cv2DfsO/YD+4n9wn5jf7C/WCL2D0vC4lAyPDmeAk+Jx+Op8NR4Gjwtng5Pj2fAM+KZ8Mx4Fjwrng3PjufAc+K58Nx4Hjwvng/PjxfAC+KF8AS8MF4EL4oXw4vjJfCSeCm8NF4GL4uXw8vjFfCKeCW8Ml4Fr4pXw6vjNfCaeC28Nl4Hr4vXw+vj/+EN8IZ4I7wx3gRvijfDm+Mt8JZ4K7w13gZvi7fD2+Md8I54J7wz3gXvinfDu+M98J54L7w33gfvi/fD++MD8IH4IHwwPgQfig/Dh+Mj8JH4KHw0PgYfi4/Dx+MT8In4JHwyPgWfik/Dp+Mz8Jk4huM4gZM4hdM4gwMc4ghncQ7ncQEXcQmXcQVXcQ3XcQM3cQu3cQd3cQ/38QAP8QiP4bPw2fgcfC4+D5+PL8AX4ovwxfgSfCm+DF+Or8BX4qvw1fgafC2+Dl+Pb8A34pvwzfgWfCu+Dd+O78B34rvw3fgefC++D9+PH8AP4ofww/gR/Ch+DD+On8BP4qfw0/gZ/Cx+Dj+PX8Av4pfwy/gV/Cp+Db+O38Bv4rfw2/gd/C5+D7+PP8Af4o/wx/gT/Cn+DH+Ov8Bf4q/w1/gb/C3+Dn+Pf8A/4p/wz/gX/Cv+Df+O/8B/4r/w3/gf/C+eiP/Dk/A4IhmRnEhBpCTiiVREaiINkZZIR6QnMhAZiUxEZiILkZXIRmQnchA5iVxEbiIPkZfIR+QnChAFiUJEAlGYKEIU
JYoRxYkSREmiFFGaKEOUJcoR5YkKREWiElGZqEJUJaoR1YkaRE2iFlGbqEPUJeoR9Yn/iAZEQ6IR0ZhoQjQlmhHNiRZES6IV0ZpoQ7Ql2hHtiQ5ER6IT0ZnoQnQluhHdiR5ET6IX0ZvoQ/Ql+hH9iQHEQGIQMZgYQgwlhhHDiRHESGIUMZoYQ4wlxhHjiQnERGISMZmYQkwlphHTiRnETAIjcIIgSIIiaIIhAAEJRLAER/CEQIiERMiEQqiERuiEQZiERdiEQ7iER/hEQIRERMSIWcRsYg4xl5hHzCcWEAuJRcRiYgmxlFhGLCdWECuJVcRqYg2xllhHrCc2EBuJTcRmYguxldhGbCd2EDuJXcRuYg+xl9hH7CcOEAeJQ8Rh4ghxlDhGHCdOECeJU8Rp4gxxljhHnCcuEBeJS8Rl4gpxlbhGXCduEDeJW8Rt4g5xl7hH3CceEA+JR8Rj4gnxlHhGPCdeEC+JV8Rr4g3xlnhHvCc+EB+JT8Rn4gvxlfhGfCd+ED+JX8Rv4g/xl0gk/hFJRByZjExOpiBTkvFkKjI1mYZMS6Yj05MZyIxkJjIzmYXMSmYjs5M5yJxkLjI3mYfMS+Yj85MFyIJkITKBLEwWIYuSxcjiZAmyJFmKLE2WIcuS5cjyZAWyIlmJrExWIauS1cjqZA2yJlmLrE3WIeuS9cj65H9kA7Ih2YhsTDYhm5LNyOZkC7Il2YpsTbYh25LtyPZkB7Ij2YnsTHYhu5LdyO5kD7In2YvsTfYh+5L9yP7kAHIgOYgcTA4hh5LDyOHkCHIkOYocTY4hx5LjyPHkBHIiOYmcTE4hp5LTyOnkDHImiZE4SZBxkCJpkiEBCUlEsiRH8qRAiqREyqRCqqRG6qRBmqRF2qRDuqRH+mRAhmRExshZ5GxyDjmXnEfOJxeQC8lF5GJyCbmUXEYuJ1eQK8lV5GpyDbmWXEeuJzeQG8lN5GZyC7mV3EZuJ3eQO8ld5G5yD7mX3EfuJw+QB8lD5GHyCHmUPEYeJ0+QJ8lT5GnyDHmWPEeeJy+QF8lL5GXyCnmVvEZeJ2+QN8lb5G3yDnmXvEfeJx+QD8lH5GPyCfmUfEY+J1+QL8lX5GvyDfmWfEe+Jz+QH8lP5GfyC/mV/EZ+J3+QP8lf5G/yD/mXTCT/kUlkHJWMSk6loFJS8VQqKjWVhkpLpaPSUxmojFQmKjOVhcpKZaOyUzmonFQuKjeVh8pL5aPyUwWoglQhKoEqTBWhilLFqOJUCaokVYoqTZWhylLlqPJUBaoiVYmqTFWhqlLVqOpUDaomVYuqTdWh6lL1qPrUf1QDqiHViGpMNaGaUs2o5lQLqiXVimpNtaHaUu2o9lQHqiPViepMdaG6Ut2o7lQPqifVi+pN9aH6Uv2o/tQAaiA1iBpMDaGGUsOo4dQIaiQ1ihpNjaHGUuOo8dQEaiI1iZpMTaGmUtOo6dQMaiaFUThFUCRFUTTFUICCFKJYiqN4SqBESqJkSqFUSqN0yqBMyqJsyqFcyqN8KqBCKqJi1CxqNjWHmkvNo+ZTC6iF1CJqMbWEWkoto5ZTK6iV1CpqNbWGWkuto9ZTG6iN1CZqM7WF2kpto7ZTO6id1C5qN7WH2kvto/ZTB6iD1CHqMHWEOkodo45TJ6iT1CnqNHWGOkudo85TF6iL1CXqMnWFukpdo65TN6ib1C3qNnWHukvdo+5TD6iH1CPqMfWEeko9o55TL6iX1CvqNfWGeku9o95TH6iP1CfqM/WF+kp9o75TP6if1C/qN/WH+kslUv+oJCqOTkYnp1PQKel4OhWdmk5Dp6XT0enpDHRGOhOdmc5CZ6Wz0dnpHHROOhedm85D56Xz0fnpAnRBuhCdQBemi9BF6WJ0cboEXZIuRZemy9Bl6XJ0eboCXZGuRFemq9BV6Wp0dboGXZOuRdem69B16Xp0ffo/ugHdkG5EN6ab0E3pZnRzugXdkm5Ft6bb0G3pdnR7ugPdke5Ed6a70F3pbnR3ugfd
k+5F96b70H3pfnR/egA9kB5ED6aH0EPpYfRwegQ9kh5Fj6bH0GPpcfR4egI9kZ5ET6an0FPpafR0egY9k8ZonCZokqZommZoQEMa0SzN0Twt0CIt0TKt0Cqt0Tpt0CZt0Tbt0C7t0T4d0CEd0TF6Fj2bnkPPpefR8+kF9EJ6Eb2YXkIvpZfRy+kV9Ep6Fb2aXkOvpdfR6+kN9EZ6E72Z3kJvpbfR2+kd9E56F72b3kPvpffR++kD9EH6EH2YPkIfpY/Rx+kT9En6FH2aPkOfpc/R5+kL9EX6En2ZvkJfpa/R1+kb9E36Fn2bvkPfpe/R9+kH9EP6Ef2YfkI/pZ/Rz+kX9Ev6Ff2afkO/pd/R7+kP9Ef6E/2Z/kJ/pb/R3+kf9E/6F/2b/kP/pRPpf3QSHcckY5IzKZiUTDyTiknNpGHSMumY9EwGJiOTicnMZGGyMtmY7EwOJieTi8nN5GHyMvmY/EwBpiBTiElgCjNFmKJMMaY4U4IpyZRiSjNlmLJMOaY8U4GpyFRiKjNVmKpMNaY6U4OpydRiajN1mLpMPaY+8x/TgGnINGIaM02YpkwzpjnTgmnJtGJaM22Ytkw7pj3TgenIdGI6M12Yrkw3pjvTg+nJ9GJ6M32Yvkw/pj8zgBnIDGIGM0OYocwwZjgzghnJjGJGM2OYscw4ZjwzgZnITGImM1OYqcw0Zjozg5nJYAzOEAzJUAzNMAxgIIMYluEYnhEYkZEYmVEYldEYnTEYk7EYm3EYl/EYnwmYkImYGDOLmc3MYeYy85j5zAJmIbOIWcwsYZYyy5jlzApmJbOKWc2sYdYy65j1zAZmI7OJ2cxsYbYy25jtzA5mJ7OL2c3sYfYy+5j9zAHmIHOIOcwcYY4yx5jjzAnmJHOKOc2cYc4y55jzzAXmInOJucxcYa4y15jrzA3mJnOLuc3cYe4y95j7zAPmIfOIecw8YZ4yz5jnzAvmJfOKec28Yd4y75j3zAfmI/OJ+cx8Yb4y35jvzA/mJ/OL+c38Yf4yicw/JomJA8lAcpACpATxIBVIDdKAtCAdSA8ygIwgE8gMsoCsIBvIDnKAnCAXyA3ygLwgH8gPCoCCoBBIAIVBEVAUFAPFQQlQEpQCpUEZUBaUA+VBBVARVAKVQRVQFVQD1UENUBPUArVBHVAX1AP1wX+gAWgIGoHGoAloCpqB5qAFaAlagdagDWgL2oH2oAPoCDqBzqAL6Aq6ge6gB+gJeoHeoA/oC/qB/mAAGAgGgcFgCBgKhoHhYAQYCUaB0WAMGAvGgfFgApgIJoHJYAqYCqaB6WAGmAkwgAMCkIACNGAAABAgwAIO8EAAIpCADBSgAg3owAAmsIANHOACD/ggACGIQAzMArPBHDAXzAPzwQKwECwCi8ESsBQsA8vBCrASrAKrwRqwFqwD68EGsBFsApvBFrAVbAPbwQ6wE+wCu8EesBfsA/vBAXAQHAKHwRFwFBwDx8EJcBKcAqfBGXAWnAPnwQVwEVwCl8EVcBVcA9fBDXAT3AK3wR1wF9wD98ED8BA8Ao/BE/AUPAPPwQvwErwCr8Eb8Ba8A+/BB/ARfAKfwRfwFXwD38EP8BP8Ar/BH/AXJIJ/IAnEwWQwOUwBU8J4mAqmhmlgWpgOpocZYEaYCWaGWWBWmA1mhzlgTpgL5oZ5YF6YD+aHBWBBWAgmwMKwCCwKi8HisAQsCUvB0rAMLAvLwfKwAqwIK8HKsAqsCqvB6rAGrAlrwdqwDqwL68H68D/YADaEjWBj2AQ2hc1gc9gCtoStYGvYBraF7WB72AF2hJ1gZ9gFdoXdYHfYA/aEvWBv2Af2hf1gfzgADoSD4GA4BA6Fw+BwOAKOhKPgaDgGjoXj4Hg4AU6Ek+BkOAVOhdPgdDgDzoQYxCEBSUhBGjIQQAgRZCEHeShAEUpQhgpUoQZ1aEATWtCGDnShB30YwBBGMAZnwdlwDpwL58H5cAFcCBfBxXAJXAqXweVwBVwJV8HVcA1cC9fB9XAD
3Ag3wc1wC9wKt8HtcAfcCXfB3XAP3Av3wf3wADwID8HD8Ag8Co/B4/AEPAlPwdPwDDwLz8Hz8AK8CC/By/AKvAqvwevwBrwJb8Hb8A68C+/B+/ABfAgfwcfwCXwKn8Hn8AV8CV/B1/ANfAvfwffwA/wIP8HP8Av8Cr/B7/AH/Al/wd/wD/wLE+E/mATjUDKUHKVAKVE8SoVSozQoLUqH0qMMKCPKhDKjLCgryoayoxwoJ8qFcqM8KC/Kh/KjAqggKoQSUGFUBBVFxVBxVAKVRKVQaVQGlUXlUHlUAVVElVBlVAVVRdVQdVQD1US1UG1UB9VF9VB99B9qgBqiRqgxaoKaomaoOWqBWqJWqDVqg9qidqg96oA6ok6oM+qCuqJuqDvqgXqiXqg36oP6on6oPxqABqJBaDAagoaiYWg4GoFGolFoNBqDxqJxaDyagCaiSWgymoKmomloOpqBZiIM4YhAJKIQjRgEEEQIsYhDPBKQiCQkIwWpSEM6MpCJLGQjB7nIQz4KUIgiFEOz0Gw0B81F89B8tAAtRIvQYrQELUXL0HK0Aq1Eq9BqtAatRevQerQBbUSb0Ga0BW1F29B2tAPtRLvQbrQH7UX70H50AB1Eh9BhdAQdRcfQcXQCnUSn0Gl0Bp1F59B5dAFdRJfQZXQFXUXX0HV0A91Et9BtdAfdRffQffQAPUSP0GP0BD1Fz9Bz9AK9RK/Qa/QGvUXv0Hv0AX1En9Bn9AV9Rd/Qd/QD/US/0G/0B/1FiegfSkJxbDI2OZuCTcnGs6nY1GwaNi2bjk3PZmAzspnYzGwWNiubjc3O5mBzsrnY3GweNi+bj83PFmALsoXYBLYwW4QtyhZji7Ml2JJsKbY0W4Yty5Zjy7MV2IpsJbYyW4WtylZjq7M12JpsLbY2W4ety9Zj67P/sQ3YhmwjtjHbhG3KNmObsy3YlmwrtjXbhm3LtmPbsx3YjmwntjPbhe3KdmO7sz3Ynmwvtjfbh+3L9mP7swPYgewgdjA7hB3KDmOHsyPYkewodjQ7hh3LjmPHsxPYiewkdjI7hZ3KTmOnszPYmSzG4izBkizF0izDAhayiGVZjuVZgRVZiZVZhVVZjdVZgzVZi7VZh3VZj/XZgA3ZiI2xs9jZ7Bx2LjuPnc8uYBeyi9jF7BJ2KbuMXc6uYFeyq9jV7Bp2LbuOXc9uYDeym9jN7BZ2K7uN3c7uYHeyu9jd7B52L7uP3c8eYA+yh9jD7BH2KHuMPc6eYE+yp9jT7Bn2LHuOPc9eYC+yl9jL7BX2KnuNvc7eYBOTkpJus3fYu+w99j77gH3IPmIfs0/Yp+wz9jn7gn3JvmJfs2/Yt+w79j37gf3IfmI/s1/Yr+w39jv7g/3J/mJ/s3/Yv2wi+49NYuO4ZFxyLgWXkovnUnGpuTRcWi4dl57LwGXkMnGZuSxcVi4bl53LweXkcnG5uTxcXi4fl58rwBXkCnEJXGGuCFeUK8YV50pwJblSXGmuDFeWK8eV5ypwFblKXGWuCleVq8ZV52pwNblaXG2uDleXq8fV5/7jGnANuUZcY64J15RrxjXnWnAtuVZca64N15Zrx7XnOnAduU5cZ64L15XrxnXnenA9uV5cb64P15frx/XnBnADuUHcYG4IN5Qbxg3nRnAjuVHcaG4MN5Ybx43nJnATuUncZG4KN5Wbxk3nZnAzOYzDOYIjOYqjOYYDHOQQx3Icx3MCJ3ISJ3MKp3Iap3MGZ3IWZ3MO53Ie53MBF3IRF+NmcbO5Odxcbh43n1vALeQWcYu5JdxSbhm3nFvBreRWcau5Ndxabh23ntvAbeQ2cZu5LdxWbhu3ndvB7eR2cbu5Pdxebh+3nzvAHeQOcYe5I9xR7hh3nDvBneROcae5M9xZ7hx3nrvAXeQucZe5K9xV7hp3nbvB3eRucbe5O9xd7h53n3vAPeQecY+5J9xT7hn3nHvBveReca+5N9xb7h33nvvAfeQ+cZ+5L9xX7hv3nfvB/eR+cb+5
P9xfLpH7xyVxcXwyPjmfgk/Jx/Op+NR8Gj4tn45Pz2fgM/KZ+Mx8Fj4rn43Pzufgc/K5+Nx8Hj4vn4/PzxfgC/KF+AS+MF+EL8oX44vzJfiSfCm+NF+GL8uX48vzFfiKfCW+Ml+Fr8pX46vzNfiafC2+Nl+Hr8vX4+vz//EN+IZ8I74x34Rvyjfjm/Mt+JZ8K74134Zvy7fj2/Md+I58J74z34Xvynfju/M9+J58L74334fvy/fj+/MD+IH8IH4wP4Qfyg/jh/Mj+JH8KH40P4Yfy4/jx/MT+In8JH4yP4Wfyk/jp/Mz+Jk8xuM8wZM8xdM8wwMe8ohneY7neYEXeYmXeYVXeY3XeYM3eYu3eYd3eY/3+YAP+YiP8bP42fwcfi4/j5/PL+AX8ov4xfwSfim/jF/Or+BX8qv41fwafi2/jl/Pb+A38pv4zfwWfiu/jd/O7+B38rv43fwefi+/j9/PH+AP8of4w/wR/ih/jD/On+BP8qf40/wZ/ix/jj/PX+Av8pf4y/wV/ip/jb/O3+Bv8rf42/wd/i5/j7/PP+Af8o/4x/wT/in/jH/Ov+Bf8q/41/wb/i3/jn/Pf+A/8p/4z/wX/iv/jf/O/+B/8r/43/wf/i+fyP/jk/g4IZmQXEghpBTihVRCaiGNkFZIJ6QXMggZhUxCZiGLkFXIJmQXcgg5hVxCbiGPkFfIJ+QXCggFhUJCglBYKCIUFYoJxYUSQkmhlFBaKCOUFcoJ5YUKQkWhklBZqCJUFaoJ1YUaQk2hllBbqCPUFeoJ9YX/hAZCQ6GR0FhoIjQVmgnNhRZCS6GV0FpoI7QV2gnthQ5CR6GT0FnoInQVugndhR5CT6GX0FvoI/QV+gn9hQHCQGGQMFgYIgwVhgnDhRHCSGGUMDpZXFycME4YL0wQJgqThMnCFGGqME2YLswQZgqYgAuEQAqUQAuMAAQoIIEVOIEXBEEUJEEWFEEVNEEXDMEULMEWHMEVPMEXAiEUIiEmzBJmC3OEucI8Yb6wQFgoLBIWC0uEpcIyYbmwQlgprBJWC2uEtcI6Yb2wQdgobBI2C1uErcI2YbuwQ9gp7BJ2C3uEvcI+Yb9wQDgoHBIOC0eEo8Ix4bhwQjgpnBJOC2eEs8I54bxwQbgoXBIuC1eEq8I14bpwQ7gp3BJuC3eEu8I94b7wQHgoPBIeC0+Ep8Iz4bnwQngpvBJeC2+Et8I74b3wQfgofBI+C1+Er8I34bvwQ/gp/BJ+C3+Ev0Ki8E9IEuLEZGJyMYWYUowXU4mpxTRiWjGdmF7MIGYUM4mZxSxiVjGbmF3MIeYUc4m5xTxiXjGfmF8sIBYUC4kJYmGxiFhULCYWF0uIJcVSYmmxjFhWLCeWFyuIFcVKYmWxilhVrCZWF2uINcVaYm2xjlhXrCfWF/8TG4gNxUZiY7GJ2FRsJjYXW4gtxVZia7GN2FZsJ7YXO4gdxU5iZ7GL2FXsJnYXe4g9xV5ib7GP2FfsJ/YXB4gDxUHiYHGIOFQcJg4XR4gjxVHiaHGMOFYcJ44XJ4gTxUniZHGKOFWcJk4XZ4gzRUzERUIkRUqkRUYEIhSRyIqcyIuCKIqSKIuKqIqaqIuGaIqWaIuO6Iqe6IuBGIqRGBNnibPFOeJccZ44X1wgLhQXiYvFJeJScZm4XFwhrhRXiavFNeJacZ24XtwgbhQ3iZvFLeJWcZu4Xdwh7hR3ibvFPeJecZ+4XzwgHhQPiYfFI+JR8Zh4XDwhnhRPiafFM+JZ8Zx4XrwgXhQviZfFK+JV8Zp4Xbwh3hRvibfFO+Jd8Z54X3wgPhQfiY/FJ+JT8Zn4XHwhvhRfia/FN+Jb8Z34XvwgfhQ/iZ/FL+JX8Zv4Xfwh/hR/ib/FP+JfMVH8JyaJcVIyKbmUQkopxUuppNRSGimtlE5KL2WQMkqZpMxSFimrlE3KLuWQckq5pNxSHimvlE/KLxWQCkqFpASpsFREKioVk4pLJaSSUimptFRGKiuVk8pLFaSKUiWp
slRFqipVk6pLNaSaUi2ptlRHqivVk+pL/0kNpIZSI6mx1ERqKjWTmkstpJZSK6m11EZqK7WT2ksdpI5SJ6mz1EXqKnWTuks9pJ5SL6m31EfqK/WT+ksDpIHSIGmwNEQaKg2ThksjpJHSKGm0NEYaK42TxksTpInSJGmyNEWaKk2TpkszpJkSJuESIZESJdESIwEJSkhiJU7iJUESJUmSJUVSJU3SJUMyJUuyJUdyJU/ypUAKpUiKSbOk2dIcaa40T5ovLZAWSoukxdISaam0TFourZBWSquk1dIaaa20TlovbZA2SpukzdIWaau0Tdou7ZB2Sruk3dIeaa+0T9ovHZAOSoekw9IR6ah0TDounZBOSqek09IZ6ax0TjovXZAuSpeky9IV6ap0Tbou3ZBuSrek29Id6a50T7ovPZAeSo+kx9IT6an0THouvZBeSq+k19Ib6a30TnovfZA+Sp+kz9IX6av0Tfou/ZB+Sr+k39If6a+UKP2TkqQ4OZmcXE4hp5Tj5VRyajmNnFZOJ6eXM8gZ5UxyZjmLnFXOJmeXc8g55VxybjmPnFfOJ+eXC8gF5UJyglxYLiIXlYvJxeUSckm5lFxaLiOXlcvJ5eUKckW5klxZriJXlavJ1eUack25llxbriPXlevJ9eX/5AZyQ7mR3FhuIjeVm8nN5RZyS7mV3FpuI7eV28nt5Q5yR7mT3FnuIneVu8nd5R5yT7mX3FvuI/eV+8n95QHyQHmQPFgeIg+Vh8nD5RHySHmUPFoeI4+Vx8nj5QnyRHmSPFmeIk+Vp8nT5RnyTBmTcZmQSZmSaZmRgQxlJLMyJ/OyIIuyJMuyIquyJuuyIZuyJduyI7uyJ/tyIIdyJMfkWfJseY48V54nz5cXyAvlRfJieYm8VF4mL5dXyCvlVfJqeY28Vl4nr5c3yBvlTfJmeYu8Vd4mb5d3yDvlXfJueY+8V94n75cPyAflQ/Jh+Yh8VD4mH5dPyCflU/Jp+Yx8Vj4nn5cvyBflS/Jl+Yp8Vb4mX5dvyDflW/Jt+Y58V74n35cfyA/lR/Jj+Yn8VH4mP5dfyC/lV/Jr+Y38Vn4nv5c/yB/lT/Jn+Yv8Vf4mf5d/yD/lX/Jv+Y/8V06U/8lJcpySTEmupFBSKvFKKiW1kkZJq6RT0isZlIxKJiWzkkXJqmRTsis5lJxKLiW3kkfJq+RT8isFlIJKISVBKawUUYoqxZTiSgmlpFJKKa2UUcoq5ZTySgWlolJJqaxUUaoq1ZTqSg2lplJLqa3UUeoq9ZT6yn9KA6Wh0khprDRRmirNlOZKC6Wl0kpprbRR2irtlPZKB6Wj0knprHRRuirdlO5KD6Wn0kvprfRR+ir9lP7KAGWgMkgZrAxRhirDlOHKCGWkMkoZrYxRxirjlPHKBGWiMkmZrExRpirTlOnKDGWmgim4QiikQim0wihAgQpSWIVTeEVQREVSZEVRVEVTdMVQTMVSbMVRXMVTfCVQQiVSYsosZbYyR5mrzFPmKwuUhcoiZbGyRFmqLFOWKyuUlcoqZbWyRlmrrFPWKxuUjcomZbOyRdmqbFO2KzuUncouZbeyR9mr7FP2KweUg8oh5bByRDmqHFOOKyeUk8op5bRyRjmrnFPOKxeUi8ol5bJyRbmqXFOuKzeUm8ot5bZyR7mr3FPuKw+Uh8oj5bHyRHmqPFOeKy+Ul8or5bXyRnmrvFPeKx+Uj8on5bPyRfmqfFO+Kz+Un8ov5bfyR/mrJCr/lCQlTk2mJldTqCnVeDWVmlpNo6ZV06np1QxqRjWTmlnNomZVs6nZ1RxqTjWXmlvNo+ZV86n51QJqQbWQmqAWVouoRdVianG1hFpSLaWWVsuoZdVyanm1glpRraRWVquoVdVqanW1hlpTraXWVuuoddV6an31P7WB2lBtpDZWm6hN1WZqc7WF2lJtpbZW26ht1XZqe7WD2lHtpHZWu6hd1W5qd7WH2lPtpfZW+6h91X5qf3WAOlAdpA5Wh6hD1WHq
cHWEOlIdpY5Wx6hj1XHqeHWCOlGdpE5Wp6hT1WnqdHWGOlPFVFwlVFKlVFplVKBCFamsyqm8KqiiKqmyqqiqqqm6aqimaqm26qiu6qm+GqihGqkxdZY6W52jzlXnqfPVBepCdZG6WF2iLlWXqcvVFepKdZW6Wl2jrlXXqevVDepGdZO6Wd2iblW3qdvVHepOdZe6W92j7lX3qfvVA+pB9ZB6WD2iHlWPqcfVE+pJ9ZR6Wj2jnlXPqefVC+pF9ZJ6Wb2iXlWvqdfVG+pN9ZZ6W72j3lXvqffVB+pD9ZH6WH2iPlWfqc/VF+pL9ZX6Wn2jvlXfqe/VD+pH9ZP6Wf2iflW/qd/VH+pP9Zf6W/2j/lUT1X9qkhqnJdOSaym0lFq8lkpLraXR0mrptPRaBi2jlknLrGXRsmrZtOxaDi2nlkvLreXR8mr5tPxaAa2gVkhL0AprRbSiWjGtuFZCK6mV0kprZbSyWjmtvFZBq6hV0iprVbSqWjWtulZDq6nV0mprdbS6Wj2tvvaf1kBrqDXSGmtNtKZaM6251kJrqbXSWmtttLZaO6291kHrqHXSOmtdtK5aN6271kPrqfXSemt9tL5aP62/NkAbqA3SBmtDtKHaMG24NkIbqY3SRmtjtLHaOG28NkGbqE3SJmtTtKnaNG26NkObqWEarhEaqVEarTEa0KCGNFbjNF4TNFGTNFlTNFXTNF0zNFOzNFtzNFfzNF8LtFCLtJg2S5utzdHmavO0+doCbaG2SFusLdGWasu05doKbaW2SlutrdHWauu09doGbaO2SdusbdG2atu07doObae2S9ut7dH2avu0/doB7aB2SDusHdGOase049oJ7aR2SjutndHOaue089oF7aJ2SbusXdGuate069oN7aZ2S7ut3dHuave0+9oD7aH2SHusPdGeas+059oL7aX2SnutvdHeau+099oH7aP2SfusfdG+at+079oP7af2S/ut/dH+aonaPy1Ji9OT6cn1FHpKPV5PpafW0+hp9XR6ej2DnlHPpGfWs+hZ9Wx6dj2HnlPPpefW8+h59Xx6fr2AXlAvpCfohfUielG9mF5cL6GX1EvppfUyelm9nF5er6BX1CvplfUqelW9ml5dr6HX1GvptfU6el29nl5f/09voDfUG+mN9SZ6U72Z3lxvobfUW+mt9TZ6W72d3l7voHfUO+md9S56V72b3l3voffUe+m99T56X72f3l8foA/UB+mD9SH6UH2YPlwfoY/UR+mj9TH6WH2cPl6foE/UJ+mT9Sn6VH2aPl2foc/UMR3XCZ3UKZ3WGR3oUEc6q3M6rwu6qEu6rCu6qmu6rhu6qVu6rTu6q3u6rwd6qEd6TJ+lz9bn6HP1efp8fYG+UF+kL9aX6Ev1ZfpyfYW+Ul+lr9bX6Gv1dfp6fYO+Ud+kb9a36Fv1bfp2fYe+U9+l79b36Hv1ffp+/YB+UD+kH9aP6Ef1Y/px/YR+Uj+ln9bP6Gf1c/p5/YJ+Ub+kX9av6Ff1a/p1/YZ+U7+l39bv6Hf1e/p9/YH+UH+kP9af6E/1Z/pz/YX+Un+lv9bf6G/1d/p7/YP+Uf+kf9a/6F/1b/p3/Yf+U/+l/9b/6H/1RP2fnqTHGcmM5EYKI6URb6QyUhtpjLRGOiO9kcHIaGQyMhtZjKxGNiO7kcPIaeQycht5jLxGPiO/UcAoaBQyEozCRhGjqFHMKG6UMEoapYzSRhmjrFHOKG9UMCoalYzKRhWjqlHNqG7UMGoatYzaRh2jrlHPqG/8ZzQwGhqNjMZGE6Op0cxobrQwWhqtjNZGG6Ot0c5ob3QwOhqdjM5GF6Or0c3obvQwehq9jN5GH6Ov0c/obwwwBhqDjMHGEGOoMcwYbowwRhqjjNHGGGOsMc4Yb0wwJhqTjMnGFGOqMc2YbswwZhqYgRuEQRqUQRuMAQxoIIM1OIM3BEM0JEM2FEM1NEM3DMM0LMM2HMM1PMM3AiM0IiNmzDJmG3OMucY8Y76xwFho
LDIWG0uMpcYyY7mxwlhprDJWG2uMtcY6Y72xwdhobDI2G1uMrcY2Y7uxw9hp7DJ2G3uMvcY+Y79xwDhoHDIOG0eMo8Yx47hxwjhpnDJOG2eMs8Y547xxwbhoXDIuG1eMq8Y147pxw7hp3DJuG3eMu8Y9477xwHhoPDIeG0+Mp8Yz47nxwnhpvDJeG2+Mt8Y7473xwfhofDI+G1+Mr8Y347vxw/hp/DJ+G3+Mv0ai8c9IMuLMZGZyM4WZ0ow3U5mpzTRmWjOdmd7MYGY0M5mZzSxmVjObmd3MYeY0c5m5zTxmXjOfmd8sYBY0C5kJZmGziFnULGYWN0uYJc1SZmmzjFnWLGeWNyuYFc1KZmWzilnVrGZWN2uYNc1aZm2zjlnXrGfWN/8zG5gNzUZmY7OJ2dRsZjY3W5gtzVZma7ON2dZsZ7Y3O5gdzU5mZ7OL2dXsZnY3e5g9zV5mb7OP2dfsZ/Y3B5gDzUHmYHOIOdQcZg43R5gjzVHmaHOMOdYcZ443J5gTzUnmZHOKOdWcZk43Z5gzTczETcIkTcqkTcYEJjSRyZqcyZuCKZqSKZuKqZqaqZuGaZqWaZuO6Zqe6ZuBGZqRGTNnmbPNOeZcc54531xgLjQXmYvNJeZSc5m53FxhrjRXmavNNeZac5253txgbjQ3mZvNLeZWc5u53dxh7jR3mbvNPeZec5+53zxgHjQPmYfNI+ZR85h53DxhnjRPmafNM+ZZ85x53rxgXjQvmZfNK+ZV85p53bxh3jRvmbfNO+Zd855533xgPjQfmY/NJ+ZT85n53HxhvjRfma/NN+Zb85353vxgfjQ/mZ/NL+ZX85v53fxh/jR/mb/NP+ZfM9H8ZyaZcVYyK7mVwkppxVuprNRWGiutlc5Kb2WwMlqZrMxWFiurlc3KbuWwclq5rNxWHiuvlc/KbxWwClqFrASrsFXEKmoVs4pbJaySVimrtFXGKmuVs8pbFayKViWrslXFqmpVs6pbNayaVi2rtlXHqmvVs+ongf+j4VZjq4nV1GpmNbdaWC2tVlZrq43V1mpntbc6WB2tTlZnq4vV1epmdbd6WD2tXlZvq4/V1+pn9bcGWAOtQdZga4g11BpmDbdGWCOtUdZoa4w11hpnjbcmWBOtSdZka4o11ZpmTbdmWDMtzMItwiItyqItxgIWtJDFWpzFW4IlWpIlW4qlWpqlW4ZlWpZlW47lWp7lW4EVWpEVs2ZZs6051lxrnjXfWmAttBZZi60l1lJrmbXcWmGttFZZq6011lprnbXe2mBttDZZm60t1lZrm7Xd2mHttHZZu6091l5rn7XfOmAdtA5Zh60j1lHrmHXcOmGdtE5Zp60z1lnrnHXeumBdtC5Zl60r1lXrmnXdumHdtG5Zt6071l3rnnXfemA9tB5Zj60n1lPrmfXcemG9tF5Zr6031lvrnfXe+mB9tD5Zn60v1lfrm/Xd+mH9tH5Zv60/1l8r0fpnJVlxdjI7uZ3CTmnH26ns1HYaO62dzk5vZ7Az2pnszHYWO6udzc5u57Bz2rns3HYeO6+dz85vF7AL2oXsBLuwXcQuahezi9sl7JJ2Kbu0XcYua5ezy9sV7Ip2JbuyXcWualezq9s17Jp2Lbu2Xceua9ez69v/2Q3shnYju7HdxG5qN7Ob2y3slnYru7Xdxm5rt7Pb2x3sjnYnu7Pdxe5qd7O72z3snnYvu7fdx+5r97P72wPsgfYge7A9xB5qD7OH2yPskfYoe7Q9xh5rj7PH2xPsifYke7I9xZ5qT7On2zPsmTZm4zZhkzZl0zZjAxvayGZtzuZtwRZtyZZtxVZtzdZtwzZty7Ztx3Ztz/btwA7tyI7Zs+zZ9hx7rj3Pnm8vsBfai+zF9hJ7qb3MXm6vsFfaq+zV9hp7rb3OXm9vsDfam+zN9hZ7q73N3m7vsHfau+zd9h57r73P3m8fsA/ah+zD9hH7qH3MPm6fsE/ap+zT9hn7rH3OPm9fsC/al+zL9hX7qn3Nvm7f
sG/at+zb9h37rn3Pvm8/sB/aj+zH9hP7qf3Mfm6/sF/ar+zX9hv7rf3Ofm9/sD/an+zP9hf7q/3N/m7/sH/av+zf9h/7r51o/7OT7DgnmZPcSeGkdOKdVE5qJ42T1knnpHcyOBmdTE5mJ4uT1cnmZHdyODmdXE5uJ4+T18nn5HcKOAWdQk6CU9gp4hR1ijnFnRJOSaeUU9op45R1yjnlnQpORaeSU9mp4lR1qjnVnRpOTaeWU9up49R16jn1nf+cBk5Dp5HT2GniNHWaOc2dFk7L+Li4OKeN09Zp57R3OjgdnU5OZ6eL09Xp5nR3ejg9nV5Ob6eP09fp5/R3BjgDnUHOYGeIM9QZ5gx3RjgjnVHOaGeMM9YZ54x3JjgTnUnOZGeKM9WZ5kx3ZjgzHczBHcIhHcqhHcYBDnSQwzqcwzuCIzqSIzuKozqaozuGYzqWYzuO4zqe4zuBEzqRE3NmObOdOc5cZ54z31ngLHQWOYudJc5SZ5mz3FnhrHRWOaudNc5aZ52z3tngbHQ2OZudLc5WZ5uz3dnh7HR2ObudPc5eZ5+z3zngHHQOOYedI85R55hz3DnhnHROOaedM85Z55xz3rngXHQuOZedK85V55pz3bnh3HRuObedO85d555z33ngPHQeOY+dJ85T55nz3HnhvHReOa+dN85b553z3vngfHQ+OZ+dL85X55vz3fnh/HR+Ob+dP85fJ9H55yQ5cW4yN7mbwk3pxrup3NRuGjetm85N72ZwM7qZ3MxuFjerm83N7uZwc7q53NxuHjevm8/N7xZwC7qF3AS3sFvELeoWc4u7JdySbim3tFvGLeuWc8u7FdyKbiW3slvFrepWc6u7Ndyabi23tlvHrevWc+u7/7kN3IZuI7ex28Rt6jZzm7st3JZuK7e128Zt67Zz27sd3I5uJ7ez28Xt6nZzu7s93J5uL7e328ft6/Zz+7sD3IHuIHewO8Qd6g5zh7sj3JHuKHe0O8Yd645zx7sT3InuJHeyO8Wd6k5zp7sz3Jku5uIu4ZIu5dIu4wIXushlXc7lXcEVXcmVXcVVXc3VXcM1Xcu1Xcd1Xc/13cAN3ciNubPc2e4cd647z53vLnAXuovcxe4Sd6m7zF3urnBXuqvc1e4ad627zl3vbnA3upvcze4Wd6u7zd3u7nB3urvc3e4ed6+7z93vHnAPuofcw+4R96h7zD3unnBPuqfc0+4Z96x7zj3vXnAvupfcy+4V96p7zb3u3nBvurfc2+4d9657z73vPnAfuo/cx+4T96n7zH3uvnBfuq/c1+4b9637zn3vfnA/up/cz+4X96v7zf3u/nB/ur/c3+4f96+b6P5zk9w4L5mX3EvhpfTivVReai+Nl9ZL56X3MngZvUxeZi+Ll9XL5mX3cng5vVxebi+Pl9fL5+X3CngFvUJeglfYK+IV9Yp5xb0SXkmvlFfaK+OV9cp55b0KXkWvklfZq+JV9ap51b0aXk2vllfbq+PV9ep59b3/vAZeQ6+R19hr4jX1mnnNvRZeS6+V19pr47X12nntvQ5eR6+T19nr4nX1unndvR5eT6+X19vr4/X1+nn9vQHeQG+QN9gb4g31hnnDvRHeSG+UN9ob4431xnnjvQneRG+SN9mb4k31pnnTvRneTA/zcI/wSI/yaI/xgAc95LEe5/Ge4Ime5Mme4qme5ume4Zme5dme47me5/le4IVe5MW8Wd5sb44315vnzfcWeAu9Rd5ib4m31FvmLfdWeCu9Vd5qb4231lvnrfc2eBu9Td5mb4u31dvmbfd2eDu9Xd5ub4+319vn7fcOeAe9Q95h74h31DvmHfdOeCe9U95p74x31jvnnfcueBe9S95l74p31bvmXfdueDe9W95t745317vn3fceeA+9R95j74n31HvmPfdeeC+9V95r74331nvnvfc+eB+9T95n74v31fvmffd+eD+9X95v74/310v0/nlJXpyfzE/up/BT+vF+Kj+1n8ZP66fz0/sZ
/Ix+Jj+zn8XP6mfzs/s5/Jx+Lj+3n8fP6+fz8/sF/IJ+IT/BL+wX8Yv6xfzifgm/pF/KL+2X8cv65fzyfgW/ol/Jr+xX8av61fzqfg2/pl/Lr+3X8ev69fz6/n9+A7+h38hv7Dfxm/rN/OZ+C7+l38pv7bfx2/rt/PZ+B7+j38nv7Hfxu/rd/O5+D7+n38vv7ffx+/r9/P7+AH+gP8gf7A/xh/rD/OH+CH+kP8of7Y/xx/rj/PH+BH+iP8mf7E/xp/rT/On+DH+mj/m4T/ikT/m0z/jAhz7yWZ/zeV/wRV/yZV/xVV/zdd/wTd/ybd/xXd/zfT/wQz/yY/4sf7Y/x5/rz/Pn+wv8hf4if7G/xF/qL/OX+yv8lf4qf7W/xl/rr/PX+xv8jf4mf7O/xd/qb/O3+zsSdvq7/N3+Hn+vv8/f7x/wD/qH/MP+Ef+of8w/7p/wT/qn/NP+Gf+sf84/71/wL/qX/Mv+Ff+qf82/7t/wb/q3/Nv+Hf+uf8+/7z/wH/qP/Mf+E/+p/8x/7r/wX/qv/Nf+G/+t/85/73/wP/qf/M/+F/+r/83/7v/wf/q//N/+H/+vn+j/85P8uCBZkDxIEaQM4oNUQeogTZA2SBekDzIEGYNMQeYgS5A1yBZkD3IEOYNcQe4gT5A3yBfkDwoEBYNCQUJQOCgSFA2KBcWDEkHJoFRQOigTlA3KBeWDCkHFoFJQOagSVA2qBdWDGkHNoFZQO6gT1A3qBfWD/4IGQcOgUdA4aBI0DZoFzYMWQcugVdA6aBO0DdoF7YMOQcegU9A56BJ0DboF3YMeQc+gV9A76BP0DfoF/YMBwcBgUDA4GBIMDYYFw4MRwchgVDA6GBOMDcYF44MJwcRgUjA5mBJMDaYF04MZwcwAC/CACMiACuiACUAAAxSwARfwgRCIgRTIgRKogRbogRGYgRXYgRO4gRf4QRCEQRTEglnB7GBOMDeYF8wPFgQLg0XB4mBJsDRYFiwPVgQrg1XB6mBNsDZYF6wPNgQbg03B5mBLsDXYFmwPdgQ7g13B7mBPsDfYF+wPDgQHg0PB4eBIcDQ4FhwPTgQng1PB6eBMcDY4F5wPLgQXg0vB5eBKcDW4FlwPbgQ3g1vB7eBOcDe4F9wPHgQPg0fB4+BJ8DR4FjwPXgQvg1fB6+BN8DZ4F7wPPgQfg0/B5+BL8DX4FnwPfgQ/g1/B7+BP8DdIDP4FSUFcmCxMHqYIU4bxYaowdZgmTBumC9OHGcKMYaYwc5glzBpmC7OHOcKcYa4wd5gnzBvmC/OHBcKCYaEwISwcFgmLhsXC4mGJsGRYKiwdlgnLhuXC8mGFsGJYKawcVgmrhtXC6mGNsGZYK6wd1gnrhvXC+uF/YYOwYdgobBw2CZuGzcLmYYuwZdgqbB22CduG7cL2YYewY9gp7Bx2CbuG3cLuYY+wZ9gr7B32CfuG/cL+4YBwYDgoHBwOCYeGw8Lh4YhwZDgqHB2OCceG48Lx4YRwYjgpnBxOCaeG08Lp4YxwZoiFeEiEZEiFdMiEIIQhCtmQC/lQCMVQCuVQCdVQC/XQCM3QCu3QCd3QC/0wCMMwCmPhrHB2OCecG84L54cLwoXhonBxuCRcGi4Ll4crwpXhqnB1uCZcG64L14cbwo3hpnBzuCXcGm4Lt4c7wp3hrnB3uCfcG+4L94cHwoPhofBweCQ8Gh4Lj4cnwpPhqfB0eCY8G54Lz4cXwovhpfByeCW8Gl4Lr4c3wpvhrfB2eCe8G94L74cPwofho/Bx+CR8Gj4Ln4cvwpfhq/B1+CZ8G74L34cfwo/hp/Bz+CX8Gn4Lv4c/wp/hr/B3+Cf8GyaG/8KkMC5KFiWPUkQpo/goVZQ6ShOljdJF6aMMUcYoU5Q5yhJljbJF2aMcUc4oV5Q7yhPljfJF+aMCUcGoUJQQFY6KREWjYlHxqERUMioVlY7KRGWjclH5qEJUMaoUVY6qRFWjalH1qEZUM6oV1Y7qRHWjelH96L+oQdQwahQ1
jppETaNmUfOoRdQyahW1jtpEbaN2UfuoQ9Qx6hR1jrpEXaNuUfeoR9Qz6hX1jvpEfaN+Uf9oQDQwGhQNjoZEQ6Nh0fBoRDQyGhWNjsZEY6Nx0fhoQjQxmhRNjqZEU6Np0fRoRjQzwiI8IiIyoiI6YiIQwQhFbMRFfCREYiRFcqREaqRFemREZmRFduREbuRFfhREYRRFsWhWNDuaE82N5kXzowXRwmhRtDhaEi2NlkXLoxXRymhVtDpaE62N1kXrow3RxmhTtDnaEm2NtkXbox3RzmhXtDvaE+2N9kX7owPRwehQdDg6Eh2NjkXHoxPRyehUdDo6E52NzkXnowvRxehSdDm6El2NrkXXoxvRzehWdDu6E92N7kX3owfRw+hR9Dh6Ej2NnkXPoxfRy+hV9Dp6E72N3kXvow/Rx+hT9Dn6En2NvkXfox/Rz+hX9Dv6E/2NEqN/UVIUF0sWSx5LEUsZi4+liqWOpYmljaWLpY9liGWMZYpljmWJZY1li2WP5YjljOWK5Y7lieWN5YvljxWIFYwViiXECseKxIrGisWKx0rESsZKxUrHysTKxsrFyscqxCrGKsUqx6rEqsaqxarHasRqxmrFasfqxOrG6sXqx/6LNYg1jP2PZHvszrNZACic2rZt205TvrVt225qW7dta2ae2rZt20bOh/Mz9lr7aoo1w5pjLbB4rCWWgLXCWmNtsLZYO6w99h/WAeuIdcI6Y12wrlg3rDvWA+uJ9cJ6Y32wvlg/rD82ABuIDcIGY0OwodgwbDg2AhuJjcJGY2Owsdg4bDw2AZuITcImY1Owqdg0bDo2A5uJzcJmY3Owudg8bD62AFuIJWKLsMXYEmwptgxbjq3AVmKrsNXYGmwttg5bj23ANmKbsM3YFmwrtg3bju3AMAzHCIzEKIzGGIzFOIzHBEzEJEzGFEzFNEzHDMzELMzGHMzFPMzHAizEIgxgEENYDNuJ7cJ2Y3uwvdg+bD92ADuIHcIOY0ewo9gx7Dh2AjuJncJOY2ews9g57Dx2AbuIXcIuY1ewq9g17Dp2A7uJ3cJuY3ewu9g97D72AHuIPcIeY0+wp9gz7Dn2AnuJvcJeY2+wt9g77D32AfuIfcI+Y1+wr9g37Dv2A/uJ/cJ+Y3+wv9g/LAmLw5PhyfEUeEo8FZ4aT4OnxdPh6fEMeEY8E54Zz4JnxbPh2fEceE48F54bz4PnxfPh+fECeEG8EF4YL4IXxYvhxfESeEm8FF4aL4OXxcvh5fEKeEW8El4Zr4JXxavh1fEaeE28Fl4br4PXxevh9fEGeEO8Ed4Yb4I3xZvhzfEWeDzeEk/AW+Gt8TZ4W7wd3h7/D++Ad8Q74Z3xLnhXvBveHe+B98R74b3xPnhfvB/eHx+AD8QH4YPxIfhQfBg+HB+Bj8RH4aPxMfhYfBw+Hp+AT8Qn4ZPxKfhUfBo+HZ+Bz8Rn4bPxOfhcfB4+H1+AL8QT8UX4YnwJvhRfhi/HV+Ar8VX4anwNvhZfh6/HN+Ab8U34ZnwLvhXfhm/Hd+AYjuMETuIUTuMMzuIczuMCLuISLuMKruIaruMGbuIWbuMO7uIe7uMBHuIRDnCIIzyG78R34bvxPfhefB++Hz+AH8QP4YfxI/hR/Bh+HD+Bn8RP4afxM/hZ/Bx+Hr+AX8Qv4ZfxK/hV/Bp+Hb+B38Rv4bfxO/hd/B5+H3+AP8Qf4Y/xJ/hT/Bn+HH+Bv8Rf4a/xN/hb/B3+Hv+Af8Q/4Z/xL/hX/Bv+Hf+B/8R/4b/xP/hf/B+ehMcRyYjkRAoiJZGKSE2kIdIS6Yj0RAYiI5GJyExkIbIS2YjsRA4iJ5GLyE3kIfIS+Yj8RAGiIFGIKEwUIYoSxYjiRAmiJFGKKE2UIcoS5YjyRAWiIlGJqExUIaoS1YjqRA2iJlGLqE3UIeoS9Yj6RAOiIdGIaEw0IZoSzYjmRAsinmhJJBCtiNZEG6It0Y5oT/xHdCA6Ep2IzkQXoivRjehO9CB6Er2I3kQfoi/R
j+hPDCAGEoOIwcQQYigxjBhOjCBGEqOI0cQYYiwxjhhPTCAmEpOIycQUYioxjZhOzCBmErOI2cQcYi4xj5hPLCAWEonEImIxsYRYSiwjlhMriJXEKmI1sYZYS6wj1hMbiI3EJmIzsYXYSmwjthM7CIzACYIgCYqgCYZgCY7gCYEQCYmQCYVQCY3QCYMwCYuwCYdwCY/wiYAIiYgABCQQESN2EruI3cQeYi+xj9hPHCAOEoeIw8QR4ihxjDhOnCBOEqeI08QZ4ixxjjhPXCAuEpeIy8QV4ipxjbhO3CBuEreI28Qd4i5xj7hPPCAeEo+Ix8QT4inxjHhOvCBeEq+IuLRviLfEO+I98YH4SHwiPhNfiK/EN+I78YP4SfwifhN/iL/EPyKJiCOTkcnJFGRKMhWZmkxDpiXTkenJDGRGMhOZmcxCZiWzkdnJHGROMheZm8xD5iXzkfnJAmRBshBZmCxCFiWLkcXJEmRJshRZmixDliXLkeXJCmRFshJZmaxCViWrkdXJGmRNshZZm6xD1iXrkfXJBmRDshHZmGxCNiWbkc3JFmQ82ZJMIFuRrck2ZFuyHdme/I/sQHYkO5GdyS5kV7Ib2Z3sQfYke5G9yT5kX7If2Z8cQA4kB5GDySHkUHIYOZwcQY4kR5GjyTHkWHIcOZ6cQE4kJ5GTySnkVHIaOZ2cQc4kZ5GzyTnkXHIeOZ9cQC4kE8lF5GJyCbmUXEYuJ1eQK8lV5GpyDbmWXEeuJzeQG8lN5GZyC7mV3EZuJ3eQGImTBEmSFEmTDMmSHMmTAimSEimTCqmSGqmTBmmSFmmTDumSHumTARmSEQlISCIyRu4kd5G7yT3kXnIfuZ88QB4kD5GHySPkUfIYeZw8QZ4kT5GnyTPkWfIceZ68QF4kL5GXySvkVfIaeZ28Qd4kb5G3yTvkXfIeeZ98QD4kH5GPySfkU/IZ+Zx8Qb4kX5GvyTfkW/Id+Z78QH4kP5GfyS/kV/Ib+Z38Qf4kf5G/yT/kX/IfmUTGUcmo5FQKKiWVikpNpaHSUumo9FQGKiOVicpMZaGyUtmo7FQOKieVi8pN5aHyUvmo/FQBqiBViCpMFaGKUsWo4lQJqiRViipNlaHKUuWo8lQFqiJViapMVaGqUtWo6lQNqiZVi6pN1aHqUvWo+lQDqiHViGpMNaGaUs2o5lQLKp5qSSVQrajWVBuqLdWOak/9R3WgOlKdqM5UF6or1Y3qTvWgelK9qN5UH6ov1Y/qTw2gBlKDqMHUEGooNYwaTo2gRlKjqNHUGGosNY4aT02gJlKTqMnUFGoqNY2aTs2gZlKzqNnUHGouNY+aTy2gFlKJ1CJqMbWEWkoto5ZTK6iV1CpqNbWGWkuto9ZTG6iN1CZqM7WF2kpto7ZTOyiMwimCIimKoimGYimO4imBEimJkimFUimN0imDMimLsimHcimP8qmACqmIAhSkEBWjdlK7qN3UHmovtY/aTx2gDlKHqMPUEeoodYw6Tp2gTlKnqNPUGeosdY46T12gLlKXqMvUFeoqdY26Tt2gblK3qNvUHeoudY+6Tz2gHlKPqMfUE+op9Yx6Tr2gXlKvqNfUG+ot9Y56T32gPlKfqM/UF+or9Y36Tv2gflK/qN/UH+ov9Y9KouLoZHRyOgWdkk5Fp6bT0GnpdHR6OgOdkc5EZ6az0FnpbHR2Ogedk85F56bz0HnpfHR+ugBdkC5EF6aL0EXpYnRxugRdki5Fl6bL0GXpcnR5ugJdka5EV6ar0FXpanR1ugZdk65F16br0HXpenR9ugHdkG5EN6ab0E3pZnRzugUdT7ekE+hWdGu6Dd2Wbke3p/+jO9Ad6U50Z7oL3ZXuRnene9A96V50b7oP3ZfuR/enB9AD6UH0YHoIPZQeRg+nR9Aj6VH0aHoMPZYeR4+nJ9AT6Un0ZHoKPZWeRk+nZ9Az6Vn0bHoOPZeeR8+nF9AL6UR6Eb2YXkIvpZfRy+kV9Ep6Fb2aXkOvpdfR6+kN9EZ6
E72Z3kJvpbfR2+kdNEbjNEGTNEXTNEOzNEfztECLtETLtEKrtEbrtEGbtEXbtEO7tEf7dECHdEQDGtKIjtE76V30bnoPvZfeR++nD9AH6UP0YfoIfZQ+Rh+nT9An6VP0afoMfZY+R5+nL9AX6Uv0ZfoKfZW+Rl+nb9A36Vv0bfoOfZe+R9+nH9AP6Uf0Y/oJ/ZR+Rj+nX9Av6Vf0a/oN/ZZ+R7+nP9Af6U/0Z/oL/ZX+Rn+nf9A/6V/0b/oP/Zf+RyfRcUwyJjmTgknJpGJSM2mYtEw6Jj2TgcnIZGIyM1mYrEw2JjuTg8nJ5GJyM3mYvEw+Jj9TgCnIFGIKM0WYokwxpjhTginJlGJKM2WYskw5pjxTganIVGIqM1WYqkw1pjpTg6nJ1GJqM3WYukw9pj7TgGnINGIaM02YpkwzpjnTgolnWjIJTCumNdOGacu0Y9oz/zEdmI5MJ6Yz04XpynRjujM9mJ5ML6Y304fpy/Rj+jMDmIHMIGYwM4QZygxjhjMjmJHMKGY0M4YZy4xjxjMTmInMJGYyM4WZykxjpjMzmJnMLGY2M4eZy8xj5jMLmIVMIrOIWcwsYZYyy5jlzApmJbOKWc2sYdYy65j1zAZmI7OJ2cxsYbYy25jtzA4GY3CGYEiGYmiGYViGY3hGYERGYmRGYVRGY3TGYEzGYmzGYVzGY3wmYEImYgADGcTEmJ3MLmY3s4fZy+xj9jMHmIPMIeYwc4Q5yhxjjjMnmJPMKeY0c4Y5y5xjzjMXmIvMJeYyc4W5ylxjrjM3mJvMLeY2c4e5y9xj7jMPmIfMI+Yx84R5yjxjnjMvmJfMK+Y184Z5y7xj3jMfmI/MJ+Yz84X5ynxjvjM/mJ/ML+Y384f5y/xjkpg4NhmbnE3BpmRTsanZNGxaNh2bns3AZmQzsZnZLGxWNhubnc3B5mRzsbnZPGxeNh+bny3AFmQLsYXZImxRthhbnC3BlmRLsaXZMmxZthxbnq3AVmQrsZXZKmxVthpbna3B1mRrsbXZOmxdth5bn23ANmQbsY3ZJmxTthnbnG3BxrMt2QS2FduabcO2Zdux7dn/2A5sR7YT25ntwnZlu7Hd2R5sT7YX25vtw/Zl+7H92QHsQHYQO5gdwg5lh7HD2RHsSHYUO5odw45lx7Hj2QnsRHYSO5mdwk5lp7HT2RnsTHYWO5udw85l57Hz2QXsQjaRXcQuZpewS9ll7HJ2BbuSXcWuZtewa9l17Hp2A7uR3cRuZrewW9lt7HZ2B4uxOEuwJEuxNMuwLMuxPCuwIiuxMquwKquxOmuwJmuxNuuwLuuxPhuwIRuxgIUsYmPsTnYXu5vdw+5l97H72QPsQfYQe5g9wh5lj7HH2RPsSfYUe5o9w55lz7Hn2QvsRfYSe5m9wl5lr7HX2RvsTfYWe5u9w95l77H32QfsQ/YR+5h9wj5ln7HP2RfsS/YV+5p9w75l37Hv2Q/sR/YT+5n9wn5lv7Hf2R/sT/YX+5v9w/5l/7FJbByXjEvOpeBScqm41FwaLi2XjkvPZeAycpm4zFwWLiuXjcvO5eBycrm43FweLi+Xj8vPFeAKcoW4wlwRrihXjCvOleBKcqW40lwZrixXjivPVeAqcpW4ylwVripXjavO1eBqcrW42lwdri5Xj6vPNeAaco24xlwTrinXjGvOteDiuZZcAteKa8214dpy7bj23H9cB64j14nrzHXhunLduO5cD64n14vrzfXh+nL9uP7cAG4gN4gbzA3hhnLDuOHcCG4kN4obzY3hxnLjuPHcBG4iN4mbzE3hpnLTuOncDG4mN4ubzc3h5nLzuPncAm4hl8gt4hZzS7il3DJuObeCW8mt4lZza7i13DpuPbeB28ht4jZzW7it3DZuO7eDwzicIziSoziaYziW4zieEziRkziZUziV0zidMziTszibcziX8zifC7iQizjAQQ5xMW4nt4vbze3h9nL7uP3cAe4gd4g7zB3hjnLHuOPcCe4kd4o7zZ3h
znLnuPPcBe4id4m7zF3hrnLXuOvcDe4md4u7zd3h7nL3uPvcA+4h94h7zD3hnnLPuOfcC+4l94p7zb3h3nLvuPfcB+4j94n7zH3hvnLfuO/cD+4n94v7zf3h/nL/uCQujk/GJ+dT8Cn5VHxqPg2flk/Hp+cz8Bn5THxmPguflc/GZ+dz8Dn5XHxuPg+fl8/H5+cL8AX5QnxhvghflC/GF+dL8CX5Unxpvgxfli/Hl+cr8BX5Snxlvgpfla/GV+dr8DX5Wnxtvg5fl6/H1+cb8A35RnxjvgnflG/GN+db8PF8Sz6Bb8W35tvwbfl2fHv+P74D35HvxHfmu/Bd+W58d74H35Pvxffm+/B9+X58f34AP5AfxA/mh/BD+WH8cH4EP5IfxY/mx/Bj+XH8eH4CP5GfxE/mp/BT+Wn8dH4GP5Ofxc/m5/Bz+Xn8fH4Bv5BP5Bfxi/kl/FJ+Gb+cX8Gv5Ffxq/k1/Fp+Hb+e38Bv5Dfxm/kt/FZ+G7+d38FjPM4TPMlTPM0zPMtzPM8LvMhLvMwrvMprvM4bvMlbvM07vMt7vM8HfMhHPOAhj/gYv5Pfxe/m9/B7+X38fv4Af5A/xB/mj/BH+WP8cf4Ef5I/xZ/mz/Bn+XP8ef4Cf5G/xF/mr/BX+Wv8df4Gf5O/xd/m7/B3+Xv8ff4B/5B/xD/mn/BP+Wf8c/4F/5J/xb/m3/Bv+Xf8e/4D/5H/xH/mv/Bf+W/8d/4H/5P/xf/m//B/+X98Eh8nJBOSCymElEIqIbWQRkgrpBPSCxmEjEImIbOQRcgqZBOyCzmEnEIuIbeQR8gr5BPyCwWEgkIhobBQRCgqFBOKCyWEkkIpobRQRigrlBPKCxWEikIlobJQRagqVBOqCzWEmkItobZQR6gr1BPqCw2EhkIjobHQRGgqNBOaCy2EeKGlkCC0EloLbYS2QjuhvfCf0EHoKHQSOgtdhK5CN6G70EPoKfQSegt9hL5CP6G/MEAYKAwSBgtDhKHCMGG4MEIYKYwSRgtjhLHCOGG8MEGYKEwSJgtThKnCNGG6MEOYKcwSZgtzhLnCPGG+sEBYKCQKi4TFwhJhqbBMWC6sEFYKq4TVwhphrbBOWC9sEDYKm4TNwhZhq7BN2C7sEDABFwiBFCiBFhiBFTiBFwRBFCRBFhRBFTRBFwzBFCzBFhzBFTzBFwIhFCIBCFBAQkzYKewSdgt7hL3CPmG/cEA4KBwSDgtHhKPCMeG4cEI4KZwSTgtnhLPCOeG8cEG4KFwSLgtXhKvCNeG6cEO4KdwSbgt3hLvCPeG+8EB4KDwSHgtPhKfCM+G58EJ4KbwSXgtvhLfCO+G98EH4KHwSPgtfhK/CN+G78EP4KfwSfgt/hL/CPyFJiBOTicnFFGJKMZWYWkwjphXTienFDGJGMZOYWcwiZhWzidnFHGJOMZeYW8wj5hXzifnFAmJBsZBYWCwiFhWLicXFEmJJsZRYWiwjlhXLieXFCmJFsZJYWawiVhWridXFGmJNsZZYW6wj1hXrifXFBmJDsZHYWGwiNhWbic3FFmK82FJMEFuJrcU2Yluxndg+WbL//4FiZ7GL2FXsJnYXe4g9xV5ib7GP2FfsJ/YXB4gDxUHiYHGIOFQcJg4XR4gjxVHiaHGMOFYcJ44XJ4gTxUniZHGKOFWcJk4XZ4gzxVnibHGOOFecJ84XF4gLxURxkbhYXCIuFZeJy8UV4kpxlbhaXCOuFdeJ68UN4kZxk7hZ3CJuFbeJ28UdIibiIiGSIiXSIiOyIifyoiCKoiTKoiKqoibqoiGaoiXaoiO6oif6YiCGYiQCEYpIjIk7xV3ibnGPuFfcJ+4XD4gHxUPiYfGIeFQ8Jh4XT4gnxVPiafGMeFY8J54XL4gXxUviZfGKeFW8Jl4Xb4g3xVvibfGOeFe8J94XH4gPxUfiY/GJ+FR8Jj4XX4gvxVfia/GN+FZ8J74XP4gfxU/iZ/GL+FX8Jn4Xf4g/xV/ib/GP+Ff8JyaJcVIyKbmU
QkoppZJSS2mktFI6Kb2UQcooZZIyS1mkrFI2KbuUQ8op5ZJyS3mkvFI+Kb9UQCooFZIKS0WkolIxqbhUQioplZJKS2WkslI5qbxUQaooVZIqS1WkqlI1qbpUQ6op1ZJqS3WkulI9qb7UQGooNZIaS02kplIzqbnUQoqXWkoJUiuptdRGaiu1k9pL/0kdpI5SJ6mz1EXqKnWTuks9pJ5SL6m31EfqK/WT+ksDpIHSIGmwNEQaKg2ThksjpJHSKGm0NEYaK42TxksTpInSJGmyNEWaKk2TpkszpJnSLGm2NEeaK82T5ksLpIVSorRIWiwtkZZKy6Tl0gpppbRKWi2tkdZK66T10gZpo7RJ2ixtkbZK26Tt0g4Jk3CJkEiJkmiJkViJk3hJkERJkmRJkVRJk3TJkEzJkmzJkVzJk3wpkEIpkoAEJSTFpJ3SLmm3tEfaK+2T9ksHpIPSIemwdEQ6Kh2TjksnpJPSKem0dEY6K52TzksXpIvSJemydEW6Kl2Trks3pJvSLem2dEe6K92T7ksPpIfSI+mx9ER6Kj2TnksvpJfSK+m19EZ6K72T3ksfpI/SJ+mz9EX6Kn2Tvks/pJ/SL+m39Ef6K/2TkqQ4OZmcXE4hp5RTyanlNHJaOZ2cXs4gZ5QzyZnlLHJWOZucXc4h55RzybnlPHJeOZ+cXy4gF5QLyYXlInJRuZhcXC4hl5RLyaXlMnJZuZxcXq4gV5QryZXlKnJVuZpcXa4h15RrybXlOnJduZ5cX24gN5QbyY3lJnJTuZncXG4hx8st5QS5ldxabiO3ldvJ7eX/5A5yR7mT3FnuIneVu8nd5R5yT7mX3FvuI/eV+8n95QHyQHmQPFgeIg+Vh8nD5RHySHmUPFoeI4+Vx8nj5QnyRHmSPFmeIk+Vp8nT5RnyTHmWPFueI8+V58nz5QXyQjlRXiQvlpfIS+Vl8nJ5hbxSXiWvltfIa+V18np5g7xR3iRvlrfIW+Vt8nZ5h4zJuEzIpEzJtMzIrMzJvCzIoizJsqzIqqzJumzIpmzJtuzIruzJvhzIoRzJQIYykmPyTnmXvFveI++V98n75QPyQfmQfFg+Ih+Vj8nH5RPySfmUfFo+I5+Vz8nn5QvyRfmSfFm+Il+Vr8nX5RvyTfmWfFu+I9+V78n35QfyQ/mR/Fh+Ij+Vn8nP5RfyS/mV/Fp+I7+V38nv5Q/yR/mT/Fn+In+Vv8nf5R/yT/mX/Fv+I/+V/8lJcpySTEmupFBSKqmU1EoaJa2STkmvZFAyKpmUzEoWJauSTcmu5FByKrmU3EoeJa+ST8mvFFAKKoWUwkoRpahSTCmulFBKKqWU0koZpaxSTimvVFAqKpWUykoVpapSTamu1FBqKrWU2kodpa5ST6mvNFAaKo2UxkoTpanSVGmuNFfilXglQUlQWiutlbZKW6W90l7poHRQOimdlC5KF6Wb0k3pofRQeim9lD5KH6Wf0k8ZoAxQBimDlSHKEGWYMkwZoYxQRimjlDHKGGWcMk6ZoExUJimTlSnKVGWaMl2ZocxUZimzlTnKXGWeMl9ZoCxUEpVEZbGyWFmqLFWWK8uVlcpKZbWyWlmrrFXWK+uVjcpGZbOyWdmqbFW2K9sVTMEVQiEVSqEVRmEVTuEVQREVSZEVRVEVTdEVQzEVS7EVR3EVT/GVQAmVSAEKVJASU3Yqu5Tdyh5lr7JP2a8cUA4qh5TDyhHlqHJMOa6cUE4qp5TTyhnlrHJOOa9cUC4ql5TLyhXlqnJNua7cUG4qt5Tbyh3lrnJPua88UB4qj5THyhPlqfJMea68UF4qr5TXyhvlrfJOea98UD4qn5TPyhflq/JN+a78UH4qv5Tfyh/lr/JPSVLi1GRqcjWFmlJNpaZW06hp1XRqejWDmlHNpGZWs6hZ1WxqdjWHmlPNpeZW86h51XxqfrWAWlAtpBZWi6hF1WJqcbW4WlItqZZWS6tl1bJqebW8WlGtqFZWK6tV1WopqqvV1ZpqTbW2Wlut
q9ZV66v11YZqQ7Wx2lhtqjZVm6vN1Xg1Xk1QE9TWamu1rdpWba+2VzuoHdROaie1i9pF7aZ2U3uoPdReai+1j9pH7af2UweoA9RB6iB1iDpEHaYOU0eoI9RR6ih1jDpGHaeOUyeoE9RJ6mR1ijpVnaZOV2eoM9VZ6mx1tjpXnavOVxeoC9RENVFdrC5Wl6pL1eXqcnWlukpdra5R16jr1PXqBnWjukndrG5Rt6rb1O3qDhVTcZVQSZVSaZVRWZVTeVVQRVVSZVVRVVVTddVQTdVSbdVRXdVTfTVQQzVSgQpVpMbUneoudbe6R92r7lP3qwfUg+oh9bB6RD2qHlOPqyfUk+op9bR6Rj2rnlPPqxfUi+ol9bJ6Rb2qXlOvqzfUm+ot9bZ6R72r3lPvqw/Uh+oj9bH6RH2qPlOfqy/Ul+or9bX6Rn2rvlPfqx/Uj+on9bP6Rf2qflO/qz/Un+ov9bf6R/2r/lOT1DgtmZZcS6Gl1FJpqbU0WlotnZaU9H+ckVnLomXVsmnZtRxaTi2XllvLo+XV8mn5tQJaQa2QVlgrohXVimnFtRJaSa2UVloro5XVymnltQpaRa2SVlmrolXVqmnVtRpaTa2WVluro9XV6mn1tQZaQ62R1lhrojXVmmnNtRZavNZSS9Baaa21NlpbrZ3WXvtP66B11DppnbUuWletm9Zd66H11HppvbU+Wl+tn9ZfG6AN1AZpg7Uh2lBtmDZcG6GN1EZpo7Ux2lhtnDZem6BN1CZpk7Up2lRtmjZdm6HN1GZps7U52lxtnjZfW6At1BK1RdpibYm2VFumLddWaCu1VdpqbY22Vlunrdc2aBu1TdpmbYu2Vdumbdd2aJiGa4RGapRGa4zGapzGa4ImapIma4qmapqma4ZmapZma47map7ma4EWapEGNKghLabt1HZpu7U92l5tn7ZfO6Ad1A5ph7Uj2lHtmHZcO6Gd1E5pp7Uz2lntnHZeu6Bd1C5pl7Ur2lXtmnZdu6Hd1G5pt7U72l3tnnZfe6A91B5pj7Un2lPtmfZce6G91F5pr7U32lvtnfZe+6B91D5pn7Uv2lftm/Zd+6H91H5pv7U/2l/tn5akxenJ9OR6Cj2lnkpPrafR0+rp9PR6Bj2jnknPrGfRs+rZ9Ox6Dj2nnkvPrefR8+r59Px6Ab2gXkgvrBfRi+rF9OJ6Cb2kXkovrZfRy+rl9PJ6Bb2iXkmvrFfRq+rV9Op6Db2mXkuvrdfR6+r19Pp6A72h3khvrDfRm+rN9OZ6Cz1eb6kn6K301nobva3eTm+v/6d30DvqnfTOehe9q95N76730HvqvfTeeh+9r95P768P0Afqg/TB+hB9qD5MH66P0Efqo/TR+hh9rD5OH69P0Cfqk/TJ+hR9qj5Nn67P0Gfqs/TZ+hx9rj5Pn68v0BfqifoifbG+RF+qL9OX6yv0lfoqfbW+Rl+rr9PX6xv0jfomfbO+Rd+qb9O36zt0TMd1Qid1Sqd1Rmd1Tud1QRd1SZd1RVd1Tdd1Qzd1S7d1R3d1T/f1QA/1SAc61JEe03fqu/Td+h59r75P368f0A/qh/TD+hH9qH5MP66f0E/qp/TT+hn9rH5OP69f0C/ql/TL+hX9qn5Nv67f0G/qt/Tb+h39rn5Pv68/0B/qj/TH+hP9qf5Mf66/0F/qr/TX+hv9rf5Of69/0D/qn/TP+hf9q/5N/67/0H/qv/Tf+h/9r/5PT9LjjGRGciOFkdJIZaQ20hhpjXRGeiODkdHIZGQ2shhZjWxGdiOHkdPIZeQ28hh5jXxGfqOAUdAoZBQ2ihhFjWJGcaOEUdIoZZQ2yhhljXJGeaOCUdGoZFQ2qhhVjWpGdaOGUdOoZdQ26hh1jXpGfaOB0dBoZDQ2mhhNjWZGc6OFEW+0NBKMVkZro43R1mhntDf+MzoYHY1ORmeji9HV6GZ0N3oYPY1eRm+jj9HX6Gf0NwYYA41BxmBjiDHUGGYMN0YYI41RxmhjjDHWGGeMNyYY
E41JxmRjijHVmGZMN2YYM41ZxmxjjjHXmGfMNxYYC41EY5Gx2FhiLDWWGcuNFcZKY5Wx2lhjrDXWGeuNDUaq/1do3FZjm7Hd2GFgBm4QBmlQBm0wBmtwBm8IhmhIhmwohmpohm4YhmlYhm04hmt4hm8ERmhEBjCggYyYsdPYZew29hh7jX3GfuOAcdBIShMXd8Q4ahwzjhsnjJPGKeO0ccY4a5wzzhsXjIvGJeOyccW4alwzrhs3jJvGLeO2cce4a9wz7hsPjIfGI+Ox8cR4ajwznhsvjJfGK+O18cZ4a7wz3hsfjI/GJ+Oz8cX4anwzvhs/jJ/GL+O38cf4a/wzkow4M5mZ3ExhpjRTmanNNGZaM52Z3sxgZjQzmZnNLGZWM5uZ3cxh5jRzmbnNPGZeM5+Z3yxgFjQLmYXNImZRs5hZ3CxhljRLmaXNMmZZs5xZ3qxgVjQrmZXNKmZVs5pZ3axh1jRrmbXNOmZds55Z32xgNjQbmY3NJmZTs5nZ3GxhxpstzQSzldnabGO2NduZ7c3/zA5mR7OT2dnsYnY1u5ndzR5mT7OX2dvsY/Y1+5n9zQHmQHOQOdgcYg41h5nDzRHmSHOUOdocY441x5njzQnmRHOSOdmcYk41p5nTzRnmTHOWOducY84155nzzQXmQjPRXGQuNpeYS81l5nJzhbnSXGWuNteYa8115npzg7nR3GRuNreYW81t5nZzh4mZuEmYpEmZtMmYrMmZvCmYoimZsqmYqqmZummYpmmZtumYrumZvhmYoRmZwIQmMmPmTnOXudvcY+4195n7zQPmQfOQedg8Yh41j5nHzRPmSfOUedo8Y541z5nnzQvmRfOSedm8Yl41r5nXzRvmTfOWedu8Y94175n3zQfmQ/OR+dh8Yj41n5nPzRfmS/OV+dp8Y74135nvzQ/mR/OT+dn8Yn41v5nfzR/mT/OX+dv8Y/41/5lJZpyVzEpupbBSWqms1FYaK62VzkpvZbAyWpmszFYWK6uVzcpu5bByWrms3FYeK6+Vz8pvFbAKWoWswlYRq6hVzCpulbBKWqWs0lYZq6xVzipvVbAqWpWsylYVq6pVzapu1bBqWrWs2lYdq65Vz6pvNbAaWo2sxlYTq6nVzGputbDirZZWgtXKam21sdpa7az21n9WB6uj1cnqbHWxulrdrO5WD6un1cvqbfWx+lr9rP7WAGugNcgabA2xhlrDrOHWCGukNcoabY2xxlrjrPHWBGuiNcmabE2xplrTrOnWDGumNcuabc2x5lrzrPnWAmuhlWgtshZbS6yl1jJrubXCWmmtslZba6y11jprvbXB2mhtsjZbW6yt1jZru7XDwizcIizSoizaYizW4izeEizRkizZUizV0izdMizTsizbcizX8izfCqzQiixgQQtZMWuntcvabe2x9lr7rP3WAeugdcg6bB1JcdQ6Zh23TlgnrVPWaeuMddY6Z523LlgXrUvWZeuKddW6Zl23blg3rVvWbeuOdde6Z923HlgPrUfWY+uJ9dR6Zj23XlgvrVfWa+uN9dZ6Z723PlgfrU/WZ+uL9dX6Zn23flg/rV/Wb+uP9df6ZyVZcXYyO7mdwk5pp7JT22nstHY6O72dwc5oZ7Iz21nsrHY2O7udw85p57Jz23nsvHY+O79dwC5oF7IL20XsonYxu7hdwi5pl7JL22XssnY5u7xdwa5oV7Ir21XsqnY1u7pdw65p17Jr23XsunY9u77dwG5oN7Ib203spnYzu7ndwo63W9oJdiu7td3Gbmu3s9vb/9kd7I52J7uz3cXuanezu9s97J52L7u33cfua/ez+9sD7IH2IHuwPcQeag+zh9sj7JH2KHu0PcYea4+zx9sT7In2JHuyPcWeak+zp9sz7Jn2LHu2Pceea8+z56eOi4uzE+1F9mJ7ib3UXmYvt1fYK+1V9mp7jb3WXmevtzfYG+1N9mZ7i73V3mZvt3fYmI3bhE3alE3bjM3anM3bgi3aki3biq3a
mq3bhm3alm3bju3anu3bgR3akQ1saCM7Zu+0d9m77T32Xnufvd8+YB+0D9mH7SP2UfuYfdw+YZ+0T9mn7TP2Wfucfd6+YF+0L9mX7Sv2Vfuafd2+Yd+0b9m37Tv2Xfuefd9+YD+0H9mP7Sf2U/uZ/dx+Yb+0X9mv7Tf2W/ud/d7+YH+0P9mf7S/2V/ub/d3+Yf+0f9m/7T/2X/ufnWTHOcmc5E4KJ6WTykntpHHSOumc9E4GJ6OTycnsZHGyOtmc7E4OJ6eTy8nt5HHyOvmc/E4Bp6BTyCnsFHGKOsWc4k4Jp6RTyintlHHKOuWc8k4Fp6JTyansVHGqOtWc6k4Np6ZTy6nt1HHqOvWc+k4Dp6HTyGnsNHGaOs2c5k4LJ95p6SQ4rZzWThunrdPOae/853RwOjqdnM5OF6er083p7vRwejq9nN5OH6ev08/p7wxwBjqDnMHOEGeoM8wZ7oxwRjqjnNHOGGesM84Z70xwJjqTnMnOFGeqM82Z7sxwZjqznNnOHGeuM8+Z7yxwFjqJziJnsbPEWeosc5Y7K5yVzipntbPGWeusc9Y7G5yNziZns7PF2epsc7Y7OxzMwR3CIR3KoR3GYR3O4R3BER3JkR3FUR3N0R3DMR3LsR3HcR3P8Z3ACZ3IAQ50kBNzEuPi4nY7e5y9zj5nv3PAOegccg47R5yjzjHnuHPCOemcck47Z5yzzjnnvHPBuehcci47V5yrzjXnunPDuenccm47d5y7zj3nvvPAeeg8ch47T5ynzjPnufPCeem8cl47b5y3zjvnvfPB+eh8cj47X5yvzjfnu/PD+en8cn47f5y/zj8nyYlzk7nJ3RRuSjeVm9pN46Z107np3QxuRjeTm9nN4mZ1s7nZ3RxuTjeXm9vN4+Z187n53QJuQbeQW9gt4hZ1i7nF3RJuSbeUW9ot45Z1y7nl3QpuRbeSW9mt4lZ1q7nV3RpuTbeWW9ut49Z167n13QZuQ7eR29ht4jZ1m7nN3RZuvNvSTXBbua3dNm5bt53b3v3P7eB2dDu5nd0uble3m9vd7eH2dHu5vd0+bl+3n9vfHeAOdAe5g90h7lB3mDvcHeGOdEe5o90x7lh3nDveneBOdCe5k90p7lR3mjvdneHOdGe5s9057lx3njvfXeAudBPdRe5id4m71F3mLndXuCvdVe5qd4271l3nrnc3uBvdTe5md4u71d3mbnd3uJiLu4RLupRLu4zLupzLu4IrupIru4qrupqru4ZrupZru47rup7ru4EbupELXOgiN+budHe5u9097l53n7vfPeAedA+5h90j7lH3mHvcPeGedE+5p90z7ln3nHveveBedC+5l90r7lX3mnvdveHedG+5t9077l33nnvffeA+dB+5j90n7lP3mfvcfeG+dF+5r9037lv3nfve/eB+dD+5n90v7lf3m/vd/eH+dH+5v90/7l/3n5vkxnnJvOReCi+ll8pL7aXx0nrpvPReBi+jl8nL7GXxsnrZvOxeDi+nl8vL7eXx8nr5vPxeAa+gV8gr7BXxinrFvOJeCa+kV8or7ZXxynrlvPJeBa+iV8mr7FXxqnrVvOpeDa+mV8ur7dXx6nr1vPpeA6+h18hr7DXxmnrNvOZeCy/ea+kleK281l4br63Xzmvv/ed18Dp6nbzOXhevq9fN6+718Hp6vbzeXh+vr9fP6+8N8AZ6g7zB3hBvqDfMG+6N8EZ6o7zR3hhvrDfOG+9N8CZ6k7zJ3hRvqjfNm+7N8GZ6s7zZ3hxvrjfPm+8t8BZ6id4ib7G3xFvqLfOWeyu8ld4qb7W3xlvrrfPWexu8jd4mb7O3xdvqbfO2ezs8zMM9wiM9yqM9xmM9zuM9wRM9yZM9xVM9zdM9wzM9y7M9x3M9z/O9wAu9yAMe9JAX83Z6u7zd3h5vr7fP2+8d8A56h7zD3hHvqHfMO+6d8E56p7zT3hnvrHfOO+9d8C56l7zL3hXvqnfNu+7d8G56t7zb3h3vrnfPu+89
8B56j7zH3hPvqffMe+698F56r7zX3hvvrffOe+998D56n7zP3hfvq/fN++798H56v7zf3h/vr/fPS/Li/GR+cj+Fn9JP5af20/hp/XR+ej+Dn9HP5Gf2s/hZ/Wx+dj+Hn9PP5ef28/h5/Xx+fr+AX9Av5Bf2i/hF/WJ+cb+EX9Iv5Zf2y/hl/XJ+eb+CX9Gv5Ff2q/hV/Wp+db+GX9Ov5df26/h1/Xp+fb+B39Bv5Df2m/hN/WZ+c7+FH++39BP8Vn5rv43f1m/nt/f/8zv4Hf1Ofme/i9/V7+Z393v4Pf1efm+/j9/X7+f39wf4A/1B/mB/iD/UH+YP90f4I/1R/mh/jD/WH+eP9yf4E/1J/mR/ij/Vn+ZP92f4M/1Z/mx/jj/Xn+fP9xf4C/1Ef5G/2F/iL/WX+cv9Ff5Kf5W/2l/jr/XX+ev9Df5Gf5O/2d/ib/W3+dv9HT7m4z7hkz7l0z7jsz7n877gi77ky77iq77m677hm77l277ju77n+37gh37kAx/6yI/5O/1d/m5/j7/X3+fv9w/4B/1D/mH/iH/UP+Yf90/4J/1T/mn/jH/WP+ef9y/4F/1L/mX/in/Vv+Zf92/4N/1b/m3/jn/Xv+ff9x/4D/1H/mP/if/Uf+Y/91/4L/1X/mv/jf/Wf+e/9z/4H/1P/mf/i//V/+Z/93/4P/1f/m//j//X/+cn+XFBsiB5kCJIGaQKUgdpgrRBuiB9kCHIGGQKMgdZgqxBtiB7kCPIGeQKcgd5grxBviB/UCAoGBQKCgdFgqJBsaB4UCIoGZQKSgdlgrJBuaB8UCGoGFQKKgdVgqpBtaB6UCOoGdQKagd1grpBvaB+0CBoGDQKGgdNgqZBs6B50CKID1oGCUGroHXQJmgbtAvaB/8FHYKOQaegc9Al6Bp0C7oHPYKeQa+gd9An6Bv0C/oHA4KBwaBgcDAkGBoMC4YHI4KRwahgdDAmGBuMC8YHE4KJwaRgcjAlmBpMC6YHM4KZwaxgdjAnmBvMC+YHC4KFQWKwKFgcLAmWBsuC5cGKYGWwKlgdrAnWBuuC9cGGYGOwKdgcbAm2BtuC7cGOAAvwgAjIgArogAnYgAv4QAjEQArkQAnUQAv0wAiSBVZgB07gBl7gB0EQBlEAAhigIBbsDHYFu4M9wd5gX7A/OBAcDA4Fh4MjwdHgWHA8OBGcDE4Fp4MzwdngXHA+uBBcDC4Fl4MrwdXgWnA9uBHcDG4Ft4M7wd3gXnA/eBA8DB4Fj4MnwdPgWfA8eBG8DF4Fr4M3wdvgXfA++BB8DD4Fn4MvwdfgW/A9+BH8DH4Fv4M/wd/gX5AUxIXJwuRhijBlmCpMHaYJ04bpwvRhhjBjmCnMHGYJs4bZwuxhjjBnmCvMHeYJ84b5wvxhgbBgWCgsHBYJi4bFwuJhibBkWCosHZYJy4blwvJhhbBiWCmsHFYJq4bVwuphjbBmWCusHdYJ64b1wvphg7Bh2ChsHDYJm4bNwuZhizA+bBkmhK3C1mGbsG3YLmwf/hd2CDuGncLOYZewa9gt7B72CHuGvcLeYZ+wb9gv7B8OCAeGg8LB4ZBwaDgsHB6OCEeGo8LR4ZhwbDguHB9OCCeGk8LJ4ZRwajgtnB7OCGeGs8LZ4ZxwbjgvnB8uCBeGieGicHG4JFwaLguXhyvCleGqcHW4JlwbrgvXhxvCjeGmcHO4Jdwabgu3hztCLMRDIiRDKqRDJmRDLuRDIRRDKZRDJVRDLdRDIzRDK7RDJ3RDL/TDIAzDKAQhDFEYC3eGu8Ld4Z5wb7gv3B8eCA+Gh8LD4ZHwaHgsPB6eCE+Gp8LT4ZnwbHguPB9eCC+Gl8LL4ZXwangtvB7eCG+Gt8Lb4Z3wbngvvB8+CB+Gj8LH4ZPwafgsfB6+CF+Gr8LX4ZvwbfgufB9+CD+Gn8LP4Zfwa/gt/B7+CH+Gv8Lf4Z/wb/gvTArjomRR8ihFlDJKFaWO0kRpo3RR+ihDlDHKFGWOskRZo2xR9ihHlDPK
FeWO8kR5o3xR/qhAVDAqFBWOikRFo2JR8ahEVDIqFZWOykRlo3JR+ahCVDGqFFWOqkRVo2pR9ahGVDOqFdWO6kR1o3pR/ahB1DBqFDWOmkRNo2ZR86hFFB+1jBKiVlHrqE3UNmoXtY/+izpEHaNOUeeoS9Q16hZ1j3pEPaNeUe+oT9Q36hf1jwZEA6NB0eBoSDQ0GhYNj0ZEI6NR0ehoTDQ2GheNjyZEE6NJ0eRoSjQ1mhZNj2ZEM6NZ0exoTjQ3mhfNjxZEC6PEaFG0OFoSLY2WRcujFdHKaFW0OloTrY3WReujDdHGaFO0OdoSbY22RdujHREW4RERkREV0RETsREX8ZEQiZEUyZESqZEW6ZERmZEV2ZETuZEX+VEQhVEUgQhGKIpFO6Nd0e5oT7Q32hftjw5EB6ND0eHoSHQ0OhYdj05EJ6NT0enoTHQ2Ohedjy5EF6NL0eXoSnQ1uhZdj25EN6Nb0e3oTnQ3uhfdjx5ED6NH0ePoSfQ0ehY9j15EL6NX0evoTfQ2ehe9jz5EH6NP0efoS/Q1+hZ9j35EP6Nf0e/oT/Q3+hclRXEgGUgOUoCUIBVIDdKAtCAdSA8ygIwgE8gMsoCsIBvIDnKAnCAXyA3ygLwgH8gPCoCCoBAoDIqAoqAYKA5KgJKgFCgNyoCyoBwoDyqAiqASqAyqgKqgGqgOaoCaoBaoDeqAuqAeqA8agIagEWgMmoCmoBloDlqAeNASJIBWoDVoA9qCdqA9+A90AB1BJ9AZdAFdQTfQHfQAPUEv0Bv0AX1BP9AfDAADwSAwGAwBQ8EwMByMACPBKDAajAFjwTgwHkwAE8EkMBlMAVPBNDAdzAAzwSwwG8wBc8E8MB8sAAtBIlgEFoMlYClYBpaDFWAlWAVWgzVgLVgH1oMNYCPYBDaDLWAr2Aa2gx0AAzggAAkoQAMGsIADPBCACCQgAwWoQAM6MIAJLGADB7jAAz4IQAgiAAAECMTATrAL7AZ7wF6wD+wHB8BBcAgcBkfAUXAMHAcnwElwCpwGZ8BZcA6cBxfARXAJXAZXwFVwDVwHN8BNcAvcBnfAXXAP3AcPwEPwCDwGT8BT8Aw8By/AS/AKvAZvwFvwDrwHH8BH8Al8Bl/AV/ANfAc/wE/wC/wGf8Bf8A8kgTiYDCaHKWBKmAqmhmlgWpgOpocZYEaYCWaGWWBWmA1mhzlgTpgL5oZ5YF6YD+aHBWBBWAgWhkVgUVgMFoclYElYCpaGZWBZWA6WhxVgRVgJVoZVYFVYDVaHNWBNWAvWhnVgXVgP1ocNYEPYCDaGTWBT2Aw2hy1gPGwJE2Ar2Bq2gW1hO9ge/gc7wI6wE+wMu8CusBvsDnvAnrAX7A37wL6wH+wPB8CBcBAcDIfAoXAYHA5HwJFwFBwNx8CxcBwcDyfAiXASnAynwKlwGpwOZ8CZcBacDefAuXAenA8XwIUwES6Ci+ESuBQug8vhCrgSroKr4Rq4Fq6D6+EGuBFugpvhFrgVboPb4Q6IQRwSkIQUpCEDWchBHgpQhBKUoQJVqEEdGtCEFrShA13oQR8GMIQRBBBCBGNwJ9wFd8M9cC/cB/fDA/AgPAQPwyPwKDwGj8MT8CQ8BU/DM/AsPAfPwwvwIrwEL8Mr8Cq8Bq/DG/AmvAVvwzvwLrwH78MH8CF8BB/DJ/ApfAafwxfwJXwFX8M38C18B9/DD/Aj/AQ/wy/wK/wGv8Mf8Cf8BX/DP/Av/AeTYBxKhpKjFCglSoVSozQoLUqH0qMMKCPKhDKjLCgryoayoxwoJ8qFcqM8KC/Kh/KjAqggKoQKoyKoKCqGiqMSqCQqhUqjMqgsKofKowqoIqqEKqMqqCqqhqqjGqgmqoVqozqoLqqH6qMGqCFqhBqjJqgpaoaaoxYoHrVECagVao3aoLaoHWqP/kMdUEfUCXVGXVBX1A11Rz1QT9QL9UZ9UF/UD/VHA9BANAgNRkPQUDQMDUcj0Eg0Co1GY9BYNA6NRxPQRDQJTUZT0FQ0
DU1HM9BMNAvNRnPQXDQPzUcL0EKUiBahxWgJWoqWoeVoBVqJVqHVaA1ai9ah9WgD2og2oc1oC9qKtqHtaAfCEI4IRCIK0YhBLOIQjwQkIgnJSEEq0pCODGQiC9nIQS7ykI8CFKIIAQQRQjG0E+1Cu9EetBftQ/vRAXQQHUKH0RF0FB1Dx9EJdBKdQqfRGXQWnUPn0QV0EV1Cl9EVdBVdQ9fRDXQT3UK30R10F91D99ED9BA9Qo/RE/QUPUPP0Qv0Er1Cr9Eb9Ba9Q+/RB/QRfUKf0Rf0FX1D39EP9BP9Qr/RH/QX/UNJKC6WLJY8liKWMpYqljqWJpY2li6WPpYhljGWKZY5liWWNZYtlj2WI5YzliuWO5YnljeWL5Y/ViBWMFYoVjhWJFY0VixWPFYiVjJWKlY6ViZWNlYuVj5WIVYxVilWOVYlVjVWLVY9ViNWM1YrVjtWJ1Y3Vi9WP9Yg1jDWKNY41iTWNNYs1jzWIhYfaxlLiLX6H4XzoJ7nkgAAOEl1atu2bdu2bVupbdu2P9vWzPy1bbvZZ+/gxVphrbE2WFusHdYe64B1xDphnbEuWFesG9Yd64H1xHphvbE+WF+sH9YfG4ANxAZhg7Eh2FBsGDYcG4GNxEZho7Ex2FhsHDYem4BNxCZhk7Ep2FRsGjYdm4HNxGZhs7E52FxsHjYfW4AtxBKxRdhibAm2FFuGLcdWYCuxVdhqbA22FluHrcc2YBuxTdhmbAu2FduGbcd2YDuxXdhubA+2F9uH7ccOYAexQ9hh7Ah2FDuGHcdOYCexU9hp7Ax2FjuHnccuYBexS9hl7Ap2FbuGXcduYBiGYwRGYhRGYwzGYhzGYwImYhImYwqmYhqmYwZmYhZmYw7mYh7mYwEWYhEGMIghLIbdxG5ht7E72F3sHnYfe4A9xB5hj7En2FPsGfYce4G9xF5hr7E32FvsHfYe+4B9xD5hn7Ev2FfsG/Yd+4H9xH5hv7E/2F/sH5aExeHxeAKeDE+Op8BT4qnw//DUeJpUafF0eHo8A54Rz4RnxrPgWfFseHY8B54Tz4XnxvPgefF8eH68AF4QL4QXxovgRfFieHG8BF4SL4WXxsvgZfFyeHm8Al4Rr4RXxqvgVfFqeHW8Bl4Tr4XXxuvgdfF6eH28Ad4Qb4Q3xpvgTfFmeHO8Bd4Sb4W3xtvgbfF2eHu8A94R74R3xrvgXfFueHe8B94T74X3xvvgffF+eH98AD4QH4QPxofgQ/Fh+HB8BD4SH4WPxsfgY/Fx+Hh8Aj4Rn4RPxqfgU/Fp+HR8Bj4Tn4XPxufgc/F5+Hx8Ab4QT8QX4YvxJfhSfBm+HF+Br8RX4avxNfhafB2+Ht+Ab8Q34ZvxLfhWPC5xO74D34nvwnfje/C9+D58P34AP4gfwg/jR/Cj+DH8OH4CP4mfwk/jZ/Cz+Dn8PH4Bv4hfwi/jV/Cr+DX8On4Dx3AcJ3ASp3AaZ3AW53AeF3ARl3AZV3AV13AdN3ATt3Abd3AX93AfD/AQj3CAQxzhMfwmfgu/jd/B7+L38Pv4A/wh/gh/jD/Bn+LP8Of4C/wl/gp/jb/B3+Lv8Pf4B/wj/gn/jH/Bv+Lf8O/4D/wn/gv/jf/B/+L/8CQ8jognEohkRHIiBZGSSEX8R6Qm0hBpiXREeiIDkZHIRGQmshBZiWxEdiIHkZPIReQm8hB5iXxEfqIAUZAoRBQmihBFiWJEcaIEUZIoRZQmyhBliXJEeaICUZGoRFQmqhBViWpEdaIGUZOoRdQm6hB1iXpEfaIB0ZBoRDQmmhBNiWZEc6IF0ZJoRbQm2hBtiXZEe6ID0ZHoRHQmuhBdiW5Ed6IH0ZPoRfQm+hB9iX5Ef2IAMZAYRAwmhhBDiWHEcGIEMZIYRYwmxhBjiXHEeGICMZGYREwmphBTiWnEdGIGMZOYRcwm5hBziXnEfGIBsZBIJBYRi4klxFJiGbGcWEGsJFYRq4k1xFpiHbGe2EBsJDYRm4ktxFZiG7Gd2EHsJHYR
u4k9xF5iH7GfOEAcJA4Rh4kjxFHiGHGcOEGcJE4Rp4kzxFniHHGeuEBcJC4Rl4krxFXiGnGduEFgBE4QBElQBE0wBEtwBE8IhEhIhEwohEpohE4YhElYhE04hEt4hE8EREhEBCAggYgYcZO4Rdwm7hB3iXvEfeIB8ZB4RDwmnhBPiWfEc+IF8ZJ4Rbwm3hBviXfEe+ID8ZH4RHwmvhBfiW/Ed+IH8ZP4Rfwm/hB/iX9EEhFHxpMJZDIyOZmCTEmmIv8jU5NpyLRkOjI9mYHMSGYiM5NZyKxkNjI7mYPMSeYic5N5yLxkPjI/WYAsSBYiC5NFyKJkMbI4WYIsSZYiS5NlyLJkObI8WYGsSFYiK5NVyKpkNbI6WYOsSdYia5N1yLpkPbI+2YBsSDYiG5NNyKZkM7I52YJsSbYiW5NtyLZkO7I92YHsSHYiO5NdyK5kN7I72YPsSfYie5N9yL5kP7I/OYAcSA4iB5NDyKHkMHI4OYIcSY4iR5NjyLHkOHI8OYGcSE4iJ5NTyKnkNHI6OYOcSc4iZ5NzyLnkPHI+uYBcSCaSi8jF5BJyKbmMXE6uIFeSq8jV5BpyLbmOXE9uIDeSm8jN5BZyK7mN3E7uIHfGx5G7yT3kXnIfuZ88QB4kD5GHySPkUfIYeZw8QZ4kT5GnyTPkWfIceZ68QF4kL5GXySvkVfIaeZ28QWIkThIkSVIkTTIkS3IkTwqkSEqkTCqkSmqkThqkSVqkTTqkS3qkTwZkSEYkICGJyBh5k7xF3ibvkHfJe+R98gH5kHxEPiafkE/JZ+Rz8gX5knxFvibfkG/Jd+R78gP5kfxEfia/kF/Jb+R38gf5k/xF/ib/kH/Jf2QSGUfFUwlUMio5lYJKSaWi/qNSU2motFQ6Kj2VgcpIZaIyU1morFQ2KjuVg8pJ5aJyU3movFQ+Kj9VgCpIFaIKLypCFaWKUcWpElRJqhRVmipDlaXKUeWpClRFqhJVmapCVaWqUdWpGlRNqhZVm6pD1aXqUfWpBlRDqhHVmGpCNaWaUc2pFlRLqhXVmmpDtaXaUe2pDlRHqhPVmepCdaW6Ud2pHlRPqhfVm+pD9aX6Uf2pAdRAahA1mBpCDaWGUcOpEdRIahQ1mhpDjaXGUeOpCdREahI1mZpCTaWmUdOpGdRMahY1m5pDzaXmUfOpBdRCKpFaRC2mllBLqWXUcmoFtZJaRa2m1lBrqXXUemoDtZHaRG2mtlBbqW3UdmoHtZPaRe2m9lB7qX3UfuoAdZA6RB2mjlBHqWPUceoEdZI6RZ2mzlBnqXPUeeoCdZG6RF2mrlBXqWvUdeoGhVE4RVAkRVE0xVAsxVE8JVAiJVEypVAqpVE6ZVAmZVE25VAu5VE+FVAhFVGAghSiYtRN6hZ1m7pD3aXuUfepB9RD6hH1mHpCPaWeUc+pF9RL6hX1mnpDvaXeUe+pD9RH6hP1mfpCfaW+Ud+pH9RP6hf1m/pD/aX+UUlUHB1PJ9DJ6OR0CjolnYr+j05Np6HT0uno9HQGOiOdic5MZ6Gz0tno7HQOOiedi85N56Hz0vno/HQBuiBdiC5MF6GL0sXo4nQJuiRdii5Nl6HL0uXo8nQFuiJdia5MV6Gr0tXo6nQNuiZdi65N16Hr0vXo+nQDuiHdiG5MN6Gb0s3o5nQLuiXdim5Nt6Hb0u3o9nQHuiPdie5Md6G70t3o7nQPuifdi+5N96H70v3o/vQAeiA9iB5MD6GH0sPo4fQIeiQ9ih5Nj6HH0uPo8fQEeiI9iZ5MT6Gn0tPo6fQMeiY9i55Nz6Hn0vPo+fQCeiGdSC+iF9NL6KX0Mno5vYJeSa+iV9Nr6LX0Ono9vYHeSG+iN9Nb6K30Nno7vYPeSe+id9N76L30Pno/fYA+SB+iD9NH6KP0Mfo4fYI+SZ+iT9Nn6LP0Ofo8fYG+SF+iL9NX6Kv0Nfo6fYPGaJwmaJKmaJpmaJbmaJ4WaJGWaJlWaJXWaJ02aJO2aJt2aJf2aJ8O6JCO
aEBDGtEx+iZ9i75N36Hv0vfo+/QD+iH9iH5MP6Gf0s/o5/QL+iX9in5Nv6Hf0u/o9/QH+iP9if5Mf6G/0t/o7/QP+if9i/5N/6H/0v/oJDqOiWcSmGRMciYFk5JJxfzHpGbSMGmZdEx6JgOTkcnEZGayMFmZbEx2JgeTk8nF5GbyMHmZfEx+pgBTkCnEFGaKMEWZYkxxpgRTkinFlGbKMGWZckx5pgJTkanEVGaqMFWZakx1pgZTk6nF1GbqMHWZekx9pgHTkGnENGaaME2ZZkxzpgXTkmnFtGbaMG2Zdkx7pgPTkenEdGa6MF2Zbkx3pgfTk+nF9Gb6MH2Zfkx/ZgAzkBnEDGaGMEOZYcxwZgQzkhnFjGbGMGOZccx4ZgIzkZnETGamMFOZacx0ZgYzk5nFzGbmMHOZecx8ZgGzkElkFjGLmSXMUmYZs5xZwaxkVjGrmTXMWmYds57ZwGxkNjGbmS3MVmYbs53ZwexkdjG7mT3MXmYfs585wBxkDjGHmSPMUeYYc5w5wZxkTjGnmTPMWeYcc565wFxkLjGXmSvMVeYac525wWAMzhAMyVAMzTAMy3AMzwiMyEiMzCiMymiMzhiMyViMzTiMy3iMzwRMyEQMYCCDmBhzk7nF3GbuMHeZe8x95gHzkHnEPGaeME+ZZ8xz5gXzknnFvGbeMG+Zd8x75gPzkfnEfGa+MF+Zb8x35gfzk/nF/Gb+MH+Zf0wSE8fGswlsMjY5m4JNyaZi/2NTs2nYtGw6Nj2bgc3IZmIzs1nYrGw2Njubg83J5mJzs3nYvGw+Nj9bgC3IFmILs0XYomwxtjhbgi3JlmJLs2XYsmw5tjxbga3IVmIrs1XYqmw1tjpbg63J1mJrs3XYumw9tj7bgG3INmIbs03Ypmwztjnbgm3JtmJbs23Ytmw7tj3bge3IdmI7s13Yrmw3tjvbg+3J9mJ7s33Yvmw/tj87gB3IDmIHs0PYoewwdjg7gh3JjmJHs2PYsew4djw7gZ3ITmIns1PYqew0djo7g53JzmJns3PYuew8dj67gF3IJrKL2MXsEnYpu4xdzq5gV7Kr2NXsGnYtu45dz25gN7Kb2M3sFnYru43dzu5gd7K72N3sHnYvu4/dzx5gD7KH2MPsEfYoe4w9zp5gT7Kn2NPsGfYse449z15gL7KX2MvsFfYqe429zt5gMRZnCZZkKZZmGZZlOZZnBVZkJVZmFVZlNVZnDdZkLdZmHdZlPdZnAzZkIxawkEVsjL3J3mJvs3fYu+w99j77gH3IPmIfs0/Yp+wz9jn7gn3JvmJfs2/Yt+w79j37gf3IfmI/s1/Yr+w39jubEPeT/cX+Zv+wf9l/bBIbx8VzCVwyLjmXgkvJFYz7j0vNpeHScum49FwGLiOXicvMZeGyctm47FwOLieXi8vN5eHycvm4/FwBriBXiCvMFeGKcsW44lwJriRXiivNleHKcuW48lwFriJXiavMVeGqctW46lwNriZXi6vN1eHqcvW4+lwDriHXiGvMNeGacs245lwLriXXimvNteHacu249lwHriPXievMdeG6ct247lwPrifXi+vN9eH6cv24/twAbiA3iBvMDeGGcsO44dwIbiQ3ihvNjeHGcuO48dwEbiI3iZvMTeGmctO46dwMbiY3i5vNzeHmcvO4+dwCbiGXyC3iFnNLuKXcMm45t4Jbya3iVnNruLXcOm49t4HbyG3iNnNbuK3cNm47t4Pbye3idnN7uL3cPm4/d4A7yB3iDnNHuKPcMe44d4I7yZ3iTnNnuLPcOe48d4G7yF3iLnNXuKvcNe46d4PDOJwjOJKjOJpjOJbjOJ4TOJGTOJlTOJXTOJ0zOJOzOJtzOJfzOJ8LuJCLOMBBDnEx7iZ3i7vN3eHucve4+9wD7iH3iHvMPeGecs+459wL7iX3invNveHecu+499wH7iP3ifvMfeG+ct+479wP7if3i/vN/eH+cv+4JC6Oj+cT+GR8cj4Fn5JPxf/Hp+bT
8Gn5dHx6PgOfkc/EZ+az8Fn5bHx2Pgefk8/F5+bz8Hn5fHx+vgBfkC/EF+aL8EX5YnxxvgRfki/Fl+bL8GX5cnx5vgJfka/EV+ar8FX5anx1vgZfk6/F1+br8HX5enx9vgHfkG/EN+ab8E35ZnxzvgXfkm/Ft+bb8G35dnx7vgPfke/Ed+a78F35bnx3vgffk+/F9+b78H35fnx/fgA/kB/ED+aH8EP5YfxwfgQ/kh/Fj+bH8GP5cfx4fgI/kZ/ET+an8FP5afx0fgY/k5/Fz+bn8HP5efx8fgG/kE/kF/GL+SX8Un4Zv5xfwa/kV/Gr+TX8Wn4dv57fwG/kN/Gb+S38Vn4bv53fwe/kd/G7+T38Xn4fv58/wB/kD/GH+SP8Uf4Yf5w/wZ/kT/Gn+TP8Wf4cf56/wF/kL/GX+Sv8Vf4af52/wWM8zhM8yVM8zTM8y3M8zwu8yEu8zCu8ymu8zhu8yVu8zTu8y3u8zwd8yEc84CGP+Bh/k7/F3+bv8Hf5e/x9/gH/kH/EP+af8E/5Z/xz/gX/kn/Fv+bf8G/5d/x7/gP/kf/Ef+a/8F/5b/x3/gf/k//F/+b/8H/5f3wSHyfECwlCMiG5kEJIKaQS/hNSC2mEtEI6Ib2QQcgoZBIyC1mErEI2IbuQQ8gp5BJyC3mEvEI+Ib9QQCgoFBIKC0WEokIxobhQQigplBJKC2WEskI5obxQQagoVBIqC1WEqkI1obpQQ6gp1BJqC3WEukI9ob7QQGgoNBIaC02EpkIzobnQQmgptBJaC22EtkI7ob3QQegodBI6C12ErkI3obvQQ+gp9BJ6C32EvkI/ob8wQBgoDBIGC0OEocIwYbgwQhgpjBJGC2OEscI4YbwwQZgoTBImC1OEqcI0YbowQ5gpzBJmC3OEucI8Yb6wQFgoJAqLhMXCEmGpsExYLqwQVgqrhNXCGmGtsE5YL2wQNgqbhM3CFmGrsE3YLuwQdgq7hN3CHmGvsE/YLxwQDgqHhMPCEeGocEw4LpwQTgqnhNPCGeGscE44L1wQLgqXhMvCFeGqcE24LtwQMAEXCIEUKIEWGIEVOIEXBEEUJEEWFEEVNEEXDMEULMEWHMEVPMEXAiEUIgEIUEBCTLgp3BJuC3eEu8I94b7wQHgoPBIeC0+Ep8Iz4bnwQngpvBJeC2+Et8I74b3wQfgofBI+C1+Er8I34bvwQ/gp/BJ+C3+Ev8I/IUmIE+PFBDGZmFxMIaYUU4n/ianFNGJaMZ2YXswgZhQziZnFLGJWMZuYXcwh5hRzibnFPGJeMZ+YXywgFhQLiYXFImJRsZhYXCwhlhRLiaXFMmJZsZxYXqwgVhQriZXFKmJVsZpYXawh1hRribXFOmJdsZ5YX2wgNhQbiY3FJmJTsZnYXGwhthRbia3FNmJbsZ3YXuwgdhQ7iZ3FLmJXsZvYXewh9hR7ib3FPmJfsZ/YXxwgDhQHiYPFIeJQcZg4XBwhjhRHiaPFMeJYcZw4XpwgThQniZPFKeJUcZo4XZwhzhRnibPFOeJccZ44X1wgLhQTxUXiYnGJuFRcJi4XV4grxVXianGNuFZcJ64XN4gbxU3iZnGLuFXcJm4Xd4g7xV3ibnGPuFfcJ+4XD4gHxUPiYfGIeFQ8Jh4XT4gnxVPiafGMeFY8J54XL4gXxUviZfGKeFW8Jl4Xb4iYiIuESIqUSIuMyIqcyIuCKIqSKIuKqIqaqIuGaMbFi7boiK7oib4YiKEYiUCEIhJj4k3xlnhbvCPeFe+J98UH4kPxkfhYfCI+FZ+Jz8UX4kvxlfhafCO+Fd+J78UP4kfxk/hZ/CJ+Fb+J38Uf4k/xl/hb/CP+Ff+JSWKcFC8lSMmk5FIKKaWUSvpPSi2lkdJK6aT0UgYpo5RJyixlkbJK2aTsUg4pp5RLyi3lkfJK+aT8UgGpoFRIKiwVkYpKxaTiUgmppFRKKi2VkcpK5aTyUgWpolRJqixVkapK1aTqUg2pplRLqi3VkepK9aT6UgOp
odRIaiw1kZpKzaTmUguppdRKai21kdpK7aT2Ugepo9RJ6ix1kbpK3aTuUg+pp9RL6i31kfpK/aT+0gBpoDRIGiwNkYZKw6Th0ghppDRKGi2NkcZK46Tx0gRpojRJmixNkaZK06Tp0gxppjRLmi3NkeZK86T50gJpoZQoLZIWS0ukpdIyabm0QloprZJWS2uktdI6ab20QdoobZI2S1ukrdI2abu0Q9op7ZJ2S3ukvdI+ab90QDooHZIOS0eko9Ix6bh0QjqZ6pR0WjojnZXOSeelC9JF6ZJ0WboiXZWuSdelGxIm4RIhkRIl0RIjsRIn8ZIgiZIkyZIiqZIm6ZIhmZIl2ZIjuZIn+VIghVIkAQlKSIpJN6Vb0m3pjnRXuifdlx5ID6VH0mPpifRUeiY9l15IL6VX0mvpjfRWeie9lz5IH6VP0mfpi/RV+iZ9l35IP6Vf0m/pj/RX+iclSXFyvJwgJ5OTyynklHIq+T85tZxGTiunk9PLGeSMciY5s5xFzipnk7PLOeScci45t5xHzivnk/PLBeSCciG5sFxELioXk4vLJeSScim5tFxGLiuXk8vLFeSKciW5slxFripXk6vLNeSaci25tlxHrivXk+vLDeSGciO5sdxEbio3k5vLLeSWciu5tdxGbiu3k9vLHeSOcie5s9xF7ip3k7vLPeSeci+5t9xH7iv3k/vLA+SB8iB5sDxEHioPk4fLI+SR8ih5tDxGHiuPk8fLE+SJ8iR5sjxFnipPk6fLM+SZ8ix5tjxHnivPk+fLC+SFcqK8SF4sL5GXysvk5fIKeaW8Sl4tr5HXyuvk9fIGeaO8Sd4sb5G3ytvk7fIOeae8S94t75H3yvvk/fIB+aB8SD4sH5GPysfk4/IJ+aR8Sj4tn5HPyufk8/IF+aJ8Sb4sX5Gvytfk6/INGZNxmZBJmZJpmZFZmZN5WZBFWZJlWZFVWZN12ZBN2ZJt2ZFd2ZN9OZBDOZKBDGUkx+Sb8i35tnxHvivfk+/LD+SH8iP5sfxEfio/k5/LL+SX8iv5tfxGfiu/k9/LH+SP8if5s/xF/ip/k7/LP+Sf8i/5t/xH/iv/k5PkOCVeSVCSKcmVFEpKJZXyn5JaSaOkVdIp6ZUMSkYlk5JZyaJkVbIp2ZUcSk4ll5JbyaPkVfIp+ZUCSkGlkFJYKaIUVYopxZUSSkmllFJaKaOUVcop5ZUKSkWlklJZqaJUVaop1ZUaSk2lllJbqaPUVeop9ZUGSkOlkdJYaaI0VZopzZUWSkulldJaaaO0Vdop7ZUOSkelk9JZ6aJ0Vbop3ZUeSk+ll9Jb6aP0Vfop/ZUBykBlkDJYGaIMVYYpw5URykhllDJaGaOMVcYp45UJykRlkjJZmaJMVaYp05UZykxlljJbmaPMVeYp85UFykIlUVmkLFaWKEuVZcpyZYWyUlmlrFbWKGuVdcp6ZYOyUdmkbFa2KFuVbcp2ZYeyU9ml7Fb2KHuVfcp+5YByUDmkHFaOKEeVY8px5YRyUjmlnFbOKGeVc8p55YJyUbmkXFauKFeVa8p15YaCKbhCKKRCKbTCKKzCKbwiKKIiKbKiKKqiKbpiKKZiKbbiKK7iKb4SKKESKUCBClJiyk3llnJbuaPcVe4p95UHykPlkfJYeaI8VZ4pz5UXykvllfJaeaO8Vd4p75UPykflk/JZ+aJ8Vb4p35Ufyk/ll/Jb+aP8Vf4pSUqcGq8mqMnU5GoKNaWaSv1PTa2mUdOq6dT0agY1o5pJzaxmUbOq2dTsag41p5pLza3mUfOq+dT8agG1oFpILawWUYuqxdTiagm1pFpKLa2WUcuq5dTyagW1olpJraxWUauq1dTqag21plpLra3WUeuq9dT6agO1odpIbaw2UZuqzdTmagu1pdpKba22Uduq7dT2age1o9pJ7ax2Ubuq3dTuag+1p9pL7a32Ufuq/dT+6gB1oDpIHawOUYeqw9Th6o+kkeoodbQ6Rh2rjlPHqxPUieokdbI6RZ2q
TlOnqzPUmeosdbY6R52rzlPnqwvUhWqiukhdrC5Rl6rL1OXqCnWlukpdra5R16rr1PXqBnWjukndrG5Rt6rb1O3qDnWnukvdre5R96r71P3qAfWgekg9rB5Rj6rH1OPqCfWkeko9rZ5Rz6rn1PPqBfWiekm9rF5Rr6rX1OvqDRVTcZVQSZVSaZVRWZVTeVVQRVVSZVVRVVVTddVQTdVSbdVRXdVTfTVQQzVSgQpVpMbUm+ot9bZ6R72r3lPvqw/Uh+oj9bH6RH2qPlOfqy/Ul+or9bX6Rn2rvlPfqx/Uj+on9bP6Rf2qflO/qz/Un+ov9bf6R/2r/lOT1DgtXkvQkmnJtRRaSi1VQlxcnJZGS6ul09JrGbSMWiYts5ZFy6pl07JrObScWi4tt5ZHy6vl0/JrBbSCWiGtsFZEK6oV04prJbSSWimttFZGK6uV08prFbSKWiWtslZFq6pV06prNbSaWi2ttlZHq6vV0+prDbSGWiOtsdZEa6o105prLbSWWiuttdZGa6u109prHbSOWiets9ZF66p107prPbSeWi+tt9ZH66v10/prA7SB2v/BQ7Sh2jBtuDZCG6mN0kZrY7Sx2jhtvDZBm6hN0iZrU7Sp2jRtujZDm6nN0mZrc7S52jxtvrZAW6glaou0xdoSbam2TFuurdBWaqu01doaba22TluvbdA2apu0zdoWbau2Tduu7dB2aru03doeba+2T9uvHdAOaoe0w9oR7ah2TDuundBOaqe009oZ7ax2TjuvXdAuape0y9oV7ap2Tbuu3dAwDdcIjdQojdYYjdU4jdcETdQkTdYUTdU0TdcMzdQszdYczdU8zdcCLdQiDWhQQ1pMu6nd0m5rd7S72j3tvvZAe6g90h5rT7Sn2jPtufZCe6m90l5rb7S32jvtvfZB+6h90j5rX7Sv2jftu/ZD+6n90n5rf7S/2j8tSYvT4/UEPZmeXE+hp9RT6f/pqfU0elo9nZ5ez6Bn1DPpmfUselY9m55dz6Hn1HPpufU8el49n55fL6AX1AvphfUielG9mF5cL6GX1EvppfUyelm9nF5er6BX1CvplfUqelW9ml5dr6HX1GvptfU6el29nl5fb6A31BvpjfUmelO9md5cb6G31FvprfU2elu9nd5e76B31DvpnfUuele9m95d76H31HvpvfU+el+9n95fH6AP1Afpg/Uh+lB9mD5cH6GP1Efpo/Ux+lh9nD5en6BP1Cfpk/Up+lR9mj5dn6HP1Gfps/U5+lx9nj5fX6Av1BP1RfpifYm+VF+mL9dX6Cv1VfpqfY2+Vl+nr9c36Bv1TfpmfYu+Vd+mb9d36Dv1XfpufY++V9+n79cP6Af1Q/ph/Yh+VD+mH9dP6Cf1U/pp/Yx+Vj+nn9cv6Bf1S/pl/Yp+Vb+mX9dv6JiO64RO6pRO64zO6pzO64Iu6pIu64qu6pqu64Zu6pZu647u6p7u64Ee6pEOdKgjPabf1G/pt/U7+l39nn5ff6A/1B/pj/Un+lP9mf5cf6G/1F/pr/U3+lv9nf5e/6B/1D/pn/Uv+lf9m/5d/6H/1H/pv/U/+l/9n56kxxnxRoKRzEhupDBSGqmM/4zURhojrZHOSG9kMDIamYzMRhYjq5HNyG7kMHIauYzcRh4jr5HPyG8UMAoahYzCRhGjqFHMKG6UMEoapYzSRhmjrFHOKG9UMCoalYzKRhWjqlHNqG7UMGoatYzaRh2jrlHPqG80MBoajYzGRhOjqdHMaG60MFoarYzWRhujrdHOaG90MDoanYzORhejq9HN6G70MHoavYzeRh+jr9HP6G8MMAYag4zBxhBjqDHMGG6MMEYao4zRxhhjrDHOGG9MMCYak4zJxhRjqjHNmG7MMGYas4zZxhxjrjHPmG8sMBYaicYiY7GxxFhqLDOWGyuMlcYqY7WxxlhrrDPWGxuMjcYmY7OxxdhqbDO2GzuMncYuY7exx9hr7DP2GweMg8Yh47BxxDhqHDOOGyeMk8Yp
47RxxjhrnDPOGxeMi8Yl47JxxbhqXDOuGzcMzMANwiANyqANxmANzuANwRANyZANxVANzdANwzANy7ANx3ANz/CNwAiNyAAGNJARM24at4zbxh3jrnHPuG88MB4mPDIeG0+Mp8Yz47nxwnhpvDJeG2+Mt8Y7473xwfhofDI+G1+Mr8Y347vxw/hp/DJ+G3+Mv8Y/I8mIM+PNBDOZmdxMYaY0U5n/manNNGZaM52Z3sxgZjQzmZnNLGZWM5uZ3cxh5jRzmbnNPGZeM5+Z3yxgFjQLmYXNImZRs5hZ3CxhljRLmaXNMmZZs5xZ3qxgVjQrmZXNKmZVs5pZ3axh1jRrmbXNOmZds55Z32xgNjQbmY3NJmZTs5nZ3GxhtjRbma3NNmZbs53Z3uxgdjQ7mZ3NLmZXs5vZ3exh9jR7mb3NPmZfs5/Z3xxgDjQHmYPNIeZQc5g53BxhjjRHmaPNMeZYc5w53pxgTjQnmZPNKeZUc5o53ZxhzjRnmbPNOeZcc54531xgLjQTzUXmYnOJudRcZi43V5grzVXmanONudZcZ643N5gbzU3mZnOLudXcZm43d5g7zV3mbnOPudfcZ+43D5gHzUPmYfOIedQ8Zh43T5gnzVPmafOMedY8Z543L5gXzUvmZfOKedW8Zl43b5iYiZuESZqUSZuMyZqcyZuCKZqSKZuKqZqaqZuGaZqWaZuO6Zqe6ZuBGZqRCUxoIjNm3jRvmbfNO+Zd855533xgPjQfmY/NJ+ZT85n53HxhvjRfma/NN+Zb85353vxgfjQ/mZ/NL+ZX85v53fxh/jR/mb/NP+Zf85+ZZMZZ8VaClcxKbqWwUlqprP+s1FYaK62VzkpvZbAyWpmszFYWK6uVzcpu5bByWrms3FYeK6+Vz8pvFbAKWoWswlYRq6hVzCpulbBKWqWs0lYZq6xVzipvVbAqWpWsylYVq6pVzapu1bBqWrWs2lYdq65Vz6pvNbAaWo2sxlYTq6nVzGputbBaWq2s1lYbq63VzmpvdbA6Wp2szlYXq6vVzepu9bB6Wr2s3lYfq6/Vz+pvDbAGWoOswdYQa6g1zBpujbBGWqOs0dYYa6w1zhpvTbAmWpOsydYUa6o1zZpuzbBmWrOs2dYca641z5pvLbAWWonWImuxtcRaai2zllsrrJXWKmu1tcZaa62z1lsbrI3WJmuztcXaam2ztls7rJ3WLmu3tcfaa+2z9lsHrIPWIeuwdcQ6ah2zjlsnrJPWKeu0dcY6a52zzlsXrIvWJeuydcW6al2zrls3LMzCLcIiLcqiLcZiLc7iLcESLcmSLcVSLc3SLcMyLcuyLcdyLc/yrcAKrcgCFrSQFbNuWres29Yd6651z7pvPbAeWo+sx9YT66n1zHpuvbBeWq+s19Yb6631znpvfbA+Wp+sz9YX66v1zfpu/bB+Wr+s39Yf66/1z0qy4ux4O8FOZie3U9gp7VT2f3ZqO42d1k5np7cz2BntTHZmO4ud1c5mZ7dz2DntXHZuO4+d185n57cL2AXtQnZhu4hd1C5mF7dL2CXtUnZpu4xd1i5nl7cr2BXtSnZlu4pd1a5mV7dr2DXtWnZtu45d165n17cb2A3tRnZju4nd1G5mN7db2C3tVnZru43d1m5nt7c72B3tTnZnu4vd1e5md7d72D3tXnZvu4/d1+5n97cH2APtQfZge4g91B5mD7dH2CPtUfZoe4w91h5nj7cn2BPtSfZke4o91Z5mT7dn2DPtWfZse449155nz7cX2AvtRHuRvdheYi+1l9nL7RX2SnuVvdpeY6+119nr7Q32RnuTvdneYm+1t9nb7R32TnuXvdveY++199n77QP2QfuQfdg+Yh+1j9nH7RP2SfuUfdo+Y5+1z9nn7Qv2RfuSfdm+Yl+1r9nX7Rs2ZuM2YZM2ZdM2Y7M2Z/O2YIu2ZMu2Yqu2Zuu2YZu2Zdu2Y7u2Z/t2YId2ZAMb2siO2TftW/Zt+459175n37cf2A/tR/Zj+4n9
1H5mP7df2C/tV/Zr+4391n5nv7c/2B/tT/Zn+4v91f5mf7d/2D/tX/Zv+4/91/5nJ9lxTryT4CRzkjspnJROKuc/J7WTxknrpHPSOxmcjE4mJ7OTxcnqZHOyOzmcnE4uJ7eTx8nr5HPyOwWcgk4hp7BTxCnqFHOKOyWckk4pp7RTxinrlHPKOxWcik4lp7JTxanqVHOqOzWcmk4tp7ZTx6nr1HPqOw2chk4jp7HTxGnqNHOaOy2clk4rp7XTxmnrtHPaOx2cjk4np7PTxenqdHO6Oz2cnk4vp7fTx+nr9HP6OwOcgc4gZ7AzxBnqDHOGOyOckc4oZ7QzxhnrjHPGOxOcic4kZ7IzxZnqTHOmOzOcmc4sZ7Yzx5nrzHPmOwuchU6is8hZ7CxxljrLnOXOCmels8pZ7axx1jrrnPXOBmejs8lJdLY4W51tznZnh7PT2eXsdvY4e519zn7ngHPQOeQcdo44R51jznHnhHPSOeWcds44Z51zznnngnPRueRcdq44V51rznXnhoM5uEM4pEM5tMM4rMM5vCM4oiM5sqM4qqM5umM4pmM5tuM4ruM5vhM4oRM5wIEOcmLOTeeWc9u549x17jn3nQfOQ+eR89h54jx1njnPnRfOS+eV89p547x13jnvnQ/OR+eT89n54nx1vjnfnR/OT+eX89v54/x1/jlJTpwb7ya4ydzkbgo3pZvK/c9N7aZx07rp3PRuBjejm8nN7GZxs7rZ3OxuDjenm8vN7eZx87r53PxuAbegW8gt7BZxi7rF3OJuCbekW8ot7ZZxy7rl3PJuBbeiW8mt7FZxq7rV3OpuDbemW8ut7dZx67r13PpuA7eh28ht7DZxm7rN3OZuC7el28pt7bZx27rt3PZuB7ej28nt7HZxu7rd3O5uD7en28vt7fZx+7r93P7uAHegO8gd7A5xh7rD3OHuCHekO8od7Y5xx7rj3PHuBHeiO8md7E5xp7rT3OnuDHemO8ud7c5x57rz3PnuAnehm+guche7S9yl7jJ3ubvCXemucle7a9y17jp3vbvB3ehucje7W9yt7jZ3u7vD3enucne7e9y97j53v3vAPegecg+7R9yj7jH3uHvCPemeck+7Z9yz7jn3vHvBveheci+7V9yr7jX3unvDxVzcJVzSpVzaZVzW5VzeFVzRlVzZVVzV1VzdNVzTtVzbdVzX9VzfDdzQjVzgQhe5Mfeme8u97d5x77r33PvuA/eh+8h97D5xn7rP3OfuC/el+8p97b5x37rv3PfuB/ej+8n97H5xv7rf3O/uD/en+8v97f5x/7r/3CQ3zov3ErxkXnIvhZfSS+X956X20nhpvXReei+Dl9HL5GX2snhZvWxedi+Hl9PL5eX28nh5vXxefq+AV9Ar5BX2inhFvWJeca+EV9Ir5ZX2ynhlvXJeea+CV9Gr5FX2qnhVvWpeda+GV9Or5dX26nh1vXpefa+B19Br5DX2mnhNvWZec6+F19Jr5bX22nhtvXZee6+D19Hr5HX2unhdvW5ed6+H19Pr5fX2+nh9vX5ef2+AN9Ab5A32hnhDvWHecG+EN9Ib5Y32xnhjvXHeeG+CN9Gb5E32pnhTvWnedG+GN9Ob5c325nhzvXnefG+Bt9BL9BZ5i70l3lJvmbfcW+Gt9FZ5q7013lpvnbfe2+Bt9DZ5m70t3lZvm7fd2+Ht9HZ5u7093l5vn7ffO+Ad9A55h70j3lHvmHfcO+Gd9E55p70z3lnvnHfeu+Bd9C55l70r3lXvmnfdu+FhHu4RHulRHu0xHutxHu8JnuhJnuwpnuppnu4ZnulZnu05nut5nu8FXuhFHvCgh7yYd9O75d327nh3vXvefe+B99B75D32nnhPvWfec++F99J75b323nhvvXfee++D99H75H32vnhfvW/ed++H99P75f32/nh/vX9ekhfnx/sJfjI/uZ/CT+mn8v/zU/tp/LR+Oj+9n8HP6GfyM/tZ/Kx+Nj+7n8PP6efyc/t5
/Lx+Pj+/X8Av6BfyC/tF/KJ+Mb+4X8Iv6ZfyS/tl/LJ+Ob+8X8Gv6FfyK/tV/Kp+Nb+6X8Ov6dfya/t1/Lp+Pb++38Bv6DfyG/tN/KZ+M7+538Jv6bfyW/tt/LZ+O7+938Hv6HfyO/td/K5+N7+738Pv6ffye/t9/L5+P7+/P8Af6A/yB/tD/KH+MH+4P8If6Y/yR/tj/LH+OH+8P8Gf6E/yJ/tT/Kn+NH+6P8Of6c/yZ/tz/Ln+PH++v8Bf6Cf6i/zF/hJ/qb/MX+6v8Ff6q/zV/hp/rb/OX+9v8Df6m/zN/hZ/q7/N3+7v8Hf6u/zd/h5/r7/P3+8f8A/6h/zD/hH/qH/MP+6f8E/6p/zT/hn/rH/OP+9f8C/6l/zL/hX/qn/Nv+7f8DEf9wmf9Cmf9hmf9Tmf9wVf9CVf9hVf9TVf9w3f9C3f9h3f9T3f9wM/9CMf+NBHfsy/6d/yb/t3/Lv+Pf++/8B/6D/yH/tP/Kf+M/+5/8J/6b/yX/tv/Lf+O/+9/8H/6H/yP/tf/K/+N/+7/8P/6f/yf/t//L/+Pz/Jjwvig4QgWZA8SBGkDFIF/wWpgzRB2iBdkD7IEGQMMgWZgyxB1iBbkD3IEeQMcgW5gzxB3iBfkD8oEBQMCgWFgyJB0aBYUDwoEZQMSgWlgzJB2aBcUD6oEFQMKgWVgypB1aBaUD2oEdQMagW1gzpB3aBeUD9oEDQMGgWNgyZB06BZ0DxoEbQMWgWtgzZB26Bd0D7oEHQMOgWdgy5B16Bb0D3oEfQMegW9gz5B36Bf0D8YEAwMBgWDgyHB0GBYMDwYEYwMRgWjgzHB2GBcMD6YEEwMJgWTgynB1GBaMD2YEcwMZgWzgznB3GBeMD9YECwMEoNFweJgSbA0WBYsD1YEK4NVwepgTbA2WBesDzYEG4NNweZgS7A12BZsD3YEO4Ndwe5gT7A32BfsDw4EB4NDweHgSHA0OBYcD04EJ4NTwengTHA2OBecDy4EF4NLweXgSnA1uBZcD24EWIAHREAGVEAHTMAGXMAHQiAGUiAHSqAGWqAHRmAGVmAHTuAGXuAHQRAGUQACGKAgFtwMbgW3gzvB3eBecD94EDwMHgWPgyfB0+BZ8Dx4EbwMXgWvgzfB2+Bd8D74EHwMPgWfgy/B1+Bb8D34EfwMfgW/gz/B3+BfkBTEhfFhQpgsTB6mCFOGqcL/wtRhmjBtmC5MH2YIM4aZwsxhljBrmC3MHuYIc4a5wtxhnjBvmC/MHxYIC4aFwsJhkbBoWCwsHpYIS4alwtJhmbBsWC4sH1YIK4aVwsphlbBqWC2sHtYIa4a1wtphnbBuWC+sHzYIG4aNwsZhk7Bp2CxsHrYIW4atwtZhm7Bt2C5sH3YIO4adws5hl7Br2C3sHvYIe4a9wt5hn7Bv2C/sHw4IB4aDwsHhkHBoOCwcHo4IR4ajwtHhmHBsOC4cH04IJ4aTwsnhlHBqOC2cHs4IZ4azwtnhnHBuOC+cHy4IF4aJ4aJwcbgkXBouC5eHK8KV4apwdbgmXBuuC9eHG8KN4aZwc7gl3BpuC7eHO8Kd4a5wd7gn3BvuC/eHB8KD4aHwcHgkPBoeC4+HJ8KT4anwdHgmPBueC8+HF8KL4aXwcnglvBpeC6+HN0IsxEMiJEMqpEMmZEMu5EMhFEMplEMlVEMt1EMjNEMrtEMndEMv9MMgDMMoBCEMURgLb4a3wtvhnfBueC+8Hz4IH4aPwsfhk/Bp+Cx8Hr4IX4avwtfhm/Bt+C58H34IP4afws/hl/Br+C38Hv4If4a/wt/hn/Bv+C9MCuOi+CghShYlj1JEKaNU0X9R6ihNlDZKF6WPMkQZo0xR5ihLlDXKFmWPckQ5o1xR7ihPlDfKF+WPCkQFo0JR4ahIVDQqFhWPSkQlo1JR6ahMVDYqF5WPKkQVo0pR5ahKVDWqFlWPakQ1o1pR7ahOVDeqF9WPGkQNo0ZR46hJ1DRqFjWPWkQto1ZR66hN1DZqF7WPOkQd
o05R56hL1DXqFnWPekQ9o15R76hP1DfqF/WPBkQDo0HR4GhINDQaFg2PRkQjo1HR6GhMNDYaF42PJkQTo0nR5GhKNDWaFk2PZkQzo1nR7GhONDeaF82PFkQLo8RoUbQ4WhItjZZFy6MV0cpoVbQ6WhOtjdZF66MN0cZoU7Q52hJtjbZF26Md0c5oV7Q72hPtjfZF+6MD0cHoUHQ4OhIdjY5Fx6MT0cnoVHQ6OhOdjc5F56ML0cXoUnQ5uhJdja5F16MbERbhERGRERXREROxERfxkRCJkRTJkRKpkRbpkRGZkRXZkRO5kRf5URCFURSBCEYoikU3o1vR7ehOdDe6F92PHkQPo0fR4+hJ9DR6Fj2PXkQvo1fR6+hN9DZ6F72PPkQfo0/R5+hL9DX6Fn2PfkQ/o1/R7+hP9Df6FyVFcSAeJIBkIDlIAVKCVOA/kBqkAWlBOpAeZAAZQSaQGWQBWUE2kB3kADlBLpAb5AF5QT6QHxQABUEhUBgUAUVBMVAclAAlQSlQGpQBZUE5UB5UABVBJVAZVAFVQTVQHdQANUEtUBvUAXVBPVAfNAANQSPQGDQBTUEz0By0AC1BK9AatAFtQTvQHnQAHUEn0Bl0AV1BN9Ad9AA9QS/QG/QBfUE/0B8MAAPBIDAYDAFDwTAwHIwAI8EoMBqMAWPBODAeTAATwSQwGUwBU8E0MB3MADPBLDAbzAFzwTwwHywAC0EiWAQWgyVgKVgGloMVYCVYBVaDNWAtWAfWgw1gI9gENoMtYCvYBraDHWAn2AV2gz1gL9gH9oMD4CA4BA6DI+AoOAaOgxPgJDgFToMz4Cw4B86DC+AiuAQugyvgKrgGroMbAAM4IAAJKEADBrCAAzwQgAgkIAMFqEADOjCACSxgAwe4wAM+CEAIIgAABAjEwE1wC9wGd8BdcA/cBw/AQ/AIPAZPwFPwDDwHL8BL8Aq8Bm/AW/AOvAcfwEfwCXwGX8BX8A18Bz/AT/AL/AZ/wF/wDySBOBgPE2AymBymgClhKvgfTA3TwLQwHUwPM8CMMBPMDLPArDAbzA5zwJwwF8wN88C8MB/MDwvAgrAQLAyLwKKwGCwOS8CSsBQsDcvAsrAcLA8rwIqwEqwMq8CqsBqsDmvAmrAWrA3rwLqwHqwPG8CGsBFsDJvAprAZbA5bwJawFWwN28C2sB1sDzvAjrAT7Ay7wK6wG+wOe8CesBfsDfvAvrAf7A8HwIFwEBwMh8ChcBgcDkfAkXAUHA3HwLFwHBwPJ8CJcBKcDKfAqXAanA5nwJlwFpwN58C5cB6cDxfAhTARLoKL4RK4FC6Dy+EKuBKugqvhGrgWroPr4Qa4EW6Cm+EWuBVug9vhDrgT7oK74R64F+6D++EBeBAegofhEXgUHoPH4Ql4Ep6Cp+EZeBaeg+fhBXgRXoKX4RV4FV6D1+ENiEEcEpCEFKQhA1nIQR4KUIQSlKECVahBHRrQhBa0oQNd6EEfBjCEEQQQQgRj8Ca8BW/DO/AuvAfvwwfwIXwEH8Mn8Cl8Bp/DF/AlfAVfwzfwLXwH38MP8CP8BD/DL/Ar/Aa/wx/wJ/wFf8M/8C/8B5NgHIpHCSgZSo5SoJQoFfoPpUZpUFqUDqVHGVBGlAllRllQVpQNZUc5UE6UC+VGeVBelA/lRwVQQVQIFUZFUFFUDBVHJVBJVAqVRmVQWVQOlUcVUEVUCVVGVVBVVA1VRzVQTVQL1UZ1UF1UD9VHDVBD1Ag1Rk1QU9QMNUctUEvUCrVGbVBb1A61Rx1QR9QJdUZdUFfUDXVHPVBP1Av1Rn1QX9QP9UcD0EA0CA1GQ9BQNAwNRyPQSDQKjUZj0Fg0Do1HE9BENAlNRlPQVDQNTUcz0Ew0C81Gc9BcNA/NRwvQQpSIFqHFaAlaipah5WgFWolWodVoDVqL1qH1aAPaiDahzWgL2oq2oe1oB9qJdqHdaA/ai/ah/egAOogOocPoCDqKjqHj6AQ6iU6h0+gMOovOofPoArqILqHL
6Aq6iq6h6+gGwhCOCEQiCtGIQSziEI8EJCIJyUhBKtKQjgxkIgvZyEEu8pCPAhSiCAEEEUIxdBPdQrfRHXQX3UP30QP0ED1Cj9ET9BQ9Q8/RC/QSvUKv0Rv0Fr1D79EH9BF9Qp/RF/QVfUPf0Q/0E/1Cv9Ef9Bf9Q0koLhYfS4gliyWPpYiljKWK/RdLHUsTSxtLF0sfyxDLGMsUyxzLEssayxbLHssRyxnLFcsdyxPLG8sXyx8rECsYKxQrHCsSKxorFiseKxErGSsVKx0rEysbKxcrH6sQqxirFKscqxKrGqsWqx6rEasZqxWrHasTqxurF6sfaxBrGGsUaxxrEmsaaxZrHmsRaxlrFfsfS/fYoDezAAB0625t27Zt2263tm3btm3bthHbziSZecq9H+57/shpjbXB2mLtsPZYB6wj1gnrjHXBumLdsO5YD6wn1gvrjfXB+mL9sP7YAGwgNghLwAZjQ7Ch2DBsODYCG4mNwkZjY7Cx2DhsPDYBm4hNwiZjU7Cp2DRsOjYDm4nNwmZjc7C52DxsPrYAW4gtwhZjS7Cl2DJsObYCW4mtwlZja7C12DpsPbYB24htwjZjW7Ct2DZsO7YD24ntwnZje7C92D5sP3YAO4gdwg5jR7Cj2DHsOHYCO4mdwk5jZ7Cz2DnsPHYBu4hdwi5jV7Cr2DXsOnYDu4ndwm5jd7C72D3sPvYAe4g9wh5jT7Cn2DPsOfYCe4m9wl5jb7C32DvsPfYB+4h9wj5jX7Cv2DfsO/YDwzAcIzASozAaYzAW4zAeEzARkzAZUzAV0zAdMzATszAbczAX8zAfA1iAhViEQQxhMewn9gv7jf3B/mL/sEQsDk+CJ8WT4cnxFHhKPBWeGo/H0+Bp8XR4ejwDnhHPhGfGs+BZ8Wx4djwHnhPPhefG8+B58Xx4frwAXhAvhBfGi+BF8WJ4cbwEXhIvhZfGy+Bl8XJ4fFxcXEW8El4Zr4JXxavh1fEaeE28Fl4br4PXxevh9fEGeEO8Ed4Yb4I3xZvhzfEWeEu8Fd4ab4O3xdvh7fEOeEe8E94Z74J3xbvh3fEeeE+8F94b74P3xfvh/fEB+EB8EJ6AD8aH4EPxYfhwfAQ+Eh+Fj8bH4GPxcfh4fAI+EZ+ET8an4FPxafh0fAY+E5+Fz8bn4HPxefh8fAG+EF+EL8aX4EvxZfhyfAW+El+Fr8bX4Gvxdfh6fAO+Ed+Eb8a34Fvxbfh2fAe+E9+F78b34Hvxffh+/AB+ED+EH8aP4EfxY/hx/AR+Ej+Fn8bP4Gfxc/h5/AJ+Eb+EX8av4Ffxa/h1/AZ+E7+F38bv4Hfxe/h9/AH+EH+EP8af4E/xZ/hz/AX+En+Fv8bf4G/xd/h7/AP+Ef+Ef8a/4F/xb/h3/AeO4ThO4CRO4TTO4CzO4Twu4CIu4TKu4Cqu4Tpu4CZu4Tbu4C7u4T4O8AAP8QiHOMJj+E/8F/4b/4P/xf/hiXgckYRISiQjkhMpiJREKiI1EU+kIdIS6Yj0RAYiI5GJyExkIbIS2YjsRA4iJ5GLyE3kIfIS+Yj8RAGiIFGIKEwUIYoSxYjiRAmiJFGKKE2UIcoS5YjyRAWiIlGJqExUIaoS1YjqRA2iJlGLqE3UIeoS9Yj6RAOiIdGIaEw0IZoSzYjmRAuiJdGKaE20IdoS7Yj2RAeiI9GJ6Ex0IboS3YjuRA+iJ9GL6E30IfoS/Yj+xABiIDGISCAGE0OIocQwYjgxghhJjCJGE2OIscQ4YjwxgZhITCImE1OIqcQ0Yjoxg5hJzCJmE3OIucQ8Yj6xgFhILCIWE0uIpcQyYjmxglhJrCJWE2uItcQ6Yj2xgdhIbCI2E1uIrcQ2Yjuxg9hJ7CJ2E3uIvcQ+Yj9xgDhIHCIOE0eIo8Qx4jhxgjhJnCJOE2eIs8Q54jxxgbhIXCIuE1eIq8Q14jpxg7hJ3CJuE3eIu8Q94j7xgHhIPCIeE0+Ip8Qz4jnxgnhJvCJeE2+It8Q74j3xgfhIfCI+
E1+Ir8Q34jvxg8AInCAIkqAImmAIluAInhAIkZAImVAIldAInTAIk7AIm3AIl/AInwBEQIREREACETHiJ/GL+E38If4S/4hEIo5MQiYlk5HJyRRkSjIVmZqMJ9OQacl0ZHoyA5mRzERmJrOQWclsZHYyB5mTzEXmJvOQecl8ZH6yAFmQLEQWJouQRcliZHGyBFmSLEWWJsuQZclyZHmyAlmRrERWJquQVclqZHWyBlmTrEXWJuuQdcl6ZH2yAdmQbEQ2JpuQTclmZHOyBdmSbEW2JtuQbcl2ZHuyA9mR7ER2JruQXcluZHeyB9mT7EX2JvuQfcl+ZH9yADmQHEQmkIPJIeRQchg5nBxBjiRHkaPJMeRYchw5npxATiQnkZPJKeRUcho5nZxBziRnkbPJOeRcch45n1xALiQXkYvJJeRSchm5nFxBriRXkavJNeRach25ntxAbiQ3kZvJLeRWchsZT+4gd5K7yN3kHnIvuY/cTx4gD5KHyMPkEfIoeYw8Tp4gT5KnyNPkGfIseY48T14gL5KXyMvkFfIqeY28Tt4gb5K3yNvkHfIueY+8Tz4gH5KPyMfkE/Ip+Yx8Tr4gX5KvyNfkG/It+Y58T34gP5KfyM/kF/Ir+Y38Tv4gMRInCZIkKZImGZIlOZInBVIkJVImFVIlNVInDdIkLdImHdIlPdInARmQIRmRkERkjPxJ/iJ/k3/Iv+Q/MpGMo5JQSalkVHIqBZWSSkWlpuKpNFRaKh2VnspAZaQyUZmpLFRWKhuVncpB5aRyUbmpPFReKh+VnypAFaQKUYWpIlRRqhhVnCpBlaRKUaWpMlRZqhxVnqpAVaQqUZWpKlRVqhpVnapB1aRqUbWpOlRdqh5Vn2pANaQaUY2pJlRTqhnVnGpBtaRaUa2pNlRbqh3VnupAdaQ6UZ2pLlRXqhvVnepB9aR6Ub2pPlRfqh/VnxpADaQGUQnUYGoINZQaRg2nRlAjqVHUaGoMNZYaR42nJlATqUnUZGoKNZWaRk2nZlAzqVnUbGoONZeaR82nFlALqUXUYmoJtZRaRi2nVlArqVXUamoNtZZaR62nNlAbqU3UZmoLtZXaRm2ndlA7qV3UbmoPtZfaR+2nDlAHqUPUYeoIdZQ6Rh2nTlAnqVPUaeoMdZY6R52nLlAXqUvUZeoKdZW6Rl2nblA3qVvUbeoOdZe6R92nHlAPqUfUY+oJ9ZR6Rj2nXlAvqVfUa+oN9ZZ6R72nPlAfqU/UZ+oL9ZX6Rn2nflAYhVMERVIURVMMxVIcxVMCJVISJVMKpVIapVMGZVIWZVMO5VIe5VOACqiQiihIISpG/aR+Ub+pP9Rf6h+VSMXRSeikdDI6OZ2CTkmnolPT8XQaOi2djk5PZ6Az0pnozHQWOiudjc5O56Bz0rno3HQeOi+dj85PF6AL0oXownQRuihdjC5Ol6BL0qXo0nQZuixdji5PV6Ar0pXoynQVuipdja5O16Br0rXo2nQdui5dj65PN6Ab0o3oxnQTuindjG5Ot6Bb0q3o1nQbui3djm5Pd6A70p3oznQXuivdje5O96B70r3o3nQfui/dj+5PD6AH0oPoBHowPYQeSg+jh9Mj6JH0KHo0PYYeS4+jx9MT6In0JHoyPYWeSk+jp9Mz6Jn0LHo2PYeeS8+j59ML6IX0InoxvYReSi+jl9Mr6JX0Kno1vYZeS6+j19Mb6I30JnozvYXeSm+jt9M76J30Lno3vYfeS++j99MH6IP0IfowfYQ+Sh+jj9Mn6JP0Kfo0fYY+S5+jz9MX6Iv0JfoyfYW+Sl+jr9M36Jv0Lfo2fYe+S9+j79MP6If0I/ox/YR+Sj+jn9Mv6Jf0K/o1/YZ+S7+j39Mf6I/0J/oz/YX+Sn+jv9M/aIzGaYImaYqmaYZmaY7maYEWaYmWaYVWaY3WaYM2aYu2aYd2aY/2aUAHdEhHNKQRHaN/0r/o3/Qf+i/9j06k45gkTFImGZOcScGkZFIxqZl4Jg2TlknHpGcy
MBmZTExmJguTlcnGZGdyMDmZXExuJg+Tl8nH5GcKMAWZQkxhpghTlCnGFGdKMCWZUkxppgxTlinHlGcqMBWZSkxlpgpTlanGVGdqMDWZWkxtpg5Tl6nH1GcaMA2ZRkxjpgnTlGnGNGdaMC2ZVkxrpg3TlmnHtGc6MB2ZTkxnpgvTlenGdGd6MD2ZXkxvpg/Tl+nH9GcGMAOZQUwCM5gZwgxlhjHDmRHMSGYUM5oZw4xlxjHjmQnMRGYSM5mZwkxlpjHTmcSMM5lZzGxmDjOXmcfMZxYwC5lFzGJmCbOUWcYsZ1YwK5lVzGpmDbOWWcesZzYwG5lNzGZmC7OV2cZsZ3YwO5ldzG5mD7OX2cfsZw4wB5lDzGHmCHOUOcYcZ04wJ5lTzGnmDHOWOcecZy4wF5lLzGXmCnOVucZcZ24wN5lbzG3mDnOXucfcZx4wD5lHzGPmCfOUecY8Z14wL5lXzGvmDfOWece8Zz4wH5lPzGfmC/OV+cZ8Z34wGIMzBEOCxP9jOIZnBEZkJEZmFEZlNEZnDMZkLMZmHMZlPMZnABMwIRMxkEFMjPnJ/GJ+M3+Yv8w/JpGJY5OwSdlkbHI2BZuSTcWmZuPZNGxaNh2bns3AZmQzsZnZLGxWNhubnc3B5mRzsbnZPGxeNh+bny3AFmQLsYXZImxRthhbnC3BlmRLsaXZMmxZthxbnq3AVmQrsZXZKmxVthpbna3B1mRrsbXZOmxdth5bn23ANmQbsY3ZJmxTthnbnG3BtmRbsa3ZNmxbth3bnu3AdmQ7sZ3ZLmxXthvbne3B9mR7sb3ZPmxfth/bnx3ADmQHsQnsYHYIO5Qdxg5nR7Aj2VHsaHYMO5Ydx45nJ7AT2UnsZHYKO5Wdxk5nZ7Az2VnsbHYOO5edx85nF7AL2UXsYnYJu5Rdxi5nV7Ar2VXsanYNu5Zdx65nN7Ab2U3sZnYLu5Xdxm5nd7A72V3sbnYPu5fdx+5nD7AH2UPsYfYIe5Q9xh5nT7An2VPsafYMe5Y9x55nL7AX2UvsZfYKe5W9xl5nb7A32VvsbfYOe5e9x95nH7AP2UfsY/YJ+5R9xj5nX7Av2Vfsa/YN+5Z9x75nP7Af2U/sZ/YL+5X9xn5nf7AYi7MES7IUS7MMy7Icy7MCK7ISK7MKq7Iaq7MGa8ZZrM06rMt6rM8CNmBDNmIhi9gY+5P9xf5m/7B/2X9sIhvHJeGScsm45FwKLiWXikvNxXNpuLRcOi49l4HLyGXiMnNZuKxcNi47l4PLyeXicnN5uLxcPi4/V4AryBXiCnNFuKJcMa44V4IryZXiSnNluLJcOa48V4GryFXiKnNVuKpcNa46V4OrydXianN1uLpcPa4+14BryDXiGnNNuKZcM64514JrybXiWnNtuLZcO64914HryHXiOnNduK5cN64714PryfXienN9uL5cP64/N4AbyA3iErjB3BBuKDeMG86N4EZyo7jR3BhuLDeOG89N4CZyk7jJ3BRuKjeNm87N4GZys7jZ3BxuLjePm88t4BZyi7jF3BJuKbeMW86t4FZyq7jV3BpuLbeOW89t4DZym7jN3BZuK7eN287t4HZyu7jd3B5uL7eP288d4A5yh7jD3BHuKHeMO86d4E5yp7jT3BnuLHeOO89d4C5yl7jL3BXuKneNu87d4G5yt7jb3B3uLnePu8894B5yj7jH3BPuKfeMe8694F5yr7jX3BvuLfeOe8994D5yn7jP3BfuK/eN+8794DAO5wiO5CiO5hiO5TiO5wRO5CRO5hRO5TRO5wzO5CzO5hzO5TzO5wAXcCEXcZBDXIz7yf3ifnN/uL/cPy6Ri+OT8En5ZHxyPgWfkk/Fp+bj+TR8Wj4dn57PwGfkM/GZ+Sx8Vj4bn53Pwefkc/G5+Tx8Xj4fn58vwBfkC/GF+SJ8Ub4YX5wvwZfkS/Gl+TJ8Wb4cX56vwFfkK/GV+Sp8Vb4aX52vwdfka/G1+Tp8Xb4eX59vwDfkG/GN+SZ8U74Z
35xvwbfkW/Gt+TZ8W74d357vwHfkO/Gd+S58V74b353vwffke/G9+T58X74f358fwA/kB/EJ/GB+CD+UH8YP50fwI/lR/Gh+DD+WH8eP5yfwE/lJ/GR+Cj+Vn8ZP52fwM/lZ/Gx+Dj+Xn8fP5xfwC/lF/GJ+Cb+UX8Yv51fwK/lV/Gp+Db+WX8ev5zfwG/lN/GZ+C7+V38Zv53fwO/ld/G5+D7+X38fv5w/wB/lD/GH+CH+UP8Yf50/wJ/lT/Gn+DH+WP8ef5y/wF/lL/GX+Cn+Vv8Zf52/wN/lb/G3+Dn+Xv8ff5x/wD/lH/GP+Cf+Uf8Y/51/wL/lX/Gv+Df+Wf8e/5z/wH/lP/Gf+C/+V/8Z/53/wGI/zBE/yFE/zDM/yHM/zAi/yEi/zCq/yGq/zBm/yFm/zDu/yHu/zgA/4kI94yCM+xv/kf/G/+T/8X/4fn8jHCUmEpEIyIbmQQkgppBJSC/FCGiGtkE5IL2QQMgqZhMxCFiGrkE3ILuQQcgq5hNxCHiGvkE/ILxQQCgqFhMJCEaGoUEwoLpQQSgqlhNJCGaGsUE4oL1QQKgqVhMpCFaGqUE2oLtQQagq1hNpCHaGuUE+oLzQQGgqNhMZCE6Gp0ExoLrQQWgqthNZCG6Gt0E5oL3QQOgqdhM5CF6Gr0E3oLvQQegq9hN5CH6Gv0E/oLwwQBgqDhARhsDBEGCoME4YLI4SRwihhtDBGGCuME8YLE4SJwiRhsjBFmCpME6YLM4SZwixhtjBHmCvME+YLC4SFwiJhsbBEWCosE5YLK4SVwiphtbBGWCusE9YLG4SNwiZhs7BF2CpsE7YLO4Sdwi5ht7BH2CvsE/YLB4SDwiHhsHBEOCocE44LJ4STwinhtHBGOCucE84LF4SLwiXhsnBFuCpcE64LN4Sbwi3htnBHuCvcE+4LD4SHwiPhsfBEeCo8E54LL4SXwivhtfBGeCu8E94LH4SPwifhs/BF+Cp8E74LPwRMwAVCIAVKoAVGYAVO4AVBEAVJkAVFUAVN0AVDMAVLsAVHcAVP8AUgBEIoRAIUkBATfgq/hN/CH+Gv8E9IFOLEJGJSMZmYXEwhphRTianFeDGNmFZMJ6YXM4gZxUxiZjGLmFXMJmYXc4g5xVxibjGPmFfMJ+YXC4gFxUJiYbGIWFQsJhYXS4glxVJiabGMWFYsJ5YXK4gVxUpiZbGKWFWsJlYXa4g1xVpibbGOWFesJ9YXG4gNxUZiY7GJ+N/1H9dSbCW2FtuIbcV2Ynuxg9hR7CR2FruIXcVuYnexh9hT7CX2FvuIfcV+Yn9xgDhQHCQmiIPFIeJQcZg4XBwhjhRHiaPFMeJYcZw4XpwgThQniZPFKeJUcZo4XZwhzhRnibPFOeJccZ44X1wgLhQXiYvFJeJScZm4XFwhrhRXiavFNeJacZ24XtwgbhQ3iZvFLeJWcZu4Xdwh7hR3ibvFPeJecZ+4XzwgHhQPiYfFI+JR8Zh4XDwhnhRPiafFM+JZ8Zx4XrwgXhQviZfFK+JV8Zp4Xbwh3hRvibfFO+Jd8Z54X3wgPhQfiY/FJ+JT8Zn4XHwhvhRfia/FN+Jb8Z34XvwgfhQ/iZ/FL+JX8Zv4XfwhYiIuEiIpUiItMiIrciIvCqIoSqIsKqIqaqIuGqIpWqItOqIreqIvAjEQQzESoYjEmPhT/CX+Fv+If8V/YqIYJyWRkkrJpORSCimllEpKLcVLaaS0UjopvZRByihlkjJLWaSsUjYpu5RDyinlknJLeaS8Uj4pv1RAKigVkgpLRaSiUjGpuFRCKimVkkpLZaSyUjmpvFRBqihVkipLVaSqUjWpulRDqinVkmpLdaS6Uj2pvtRAaig1khpLTaSmUjOpudRCaim1klpLbaS2UjupvdRB6ih1kjpLXaSuUjepu9RD6in1knpLfaS+Uj+pvzRAGigNkhKkwdIQaag0TBoujZBGSqOk0dIYaaw0ThovTZAmSpOkydIUaao0TZouzZBmSrOk2dIc
aa40T5ovLZAWSoukxdISaam0TFourZBWSquk1dIaaa20TlovbZA2SpukzdIWaau0Tdou7ZB2Sruk3dIeaa+0T9ovHZAOSoekw9IR6ah0TDounZBOSqek09IZ6ax0TjovXZAuSpeky9IV6ap0Tbou3ZBuSrek29Id6a50T7ovPZAeSo+kx9IT6an0THouvZBeSq+k19Ib6a30TnovfZA+Sp+kz9IX6av0Tfou/ZAwCZcIiZQoiZYYiZU4iZcESZQkSZYUSZU0SZcMyZQsyZYcyZU8yZeAFEihFElQQlJM+in9kn5Lf6S/0j8pUYqTk8hJ5WRycjmFnFJOJaeW4+U0clo5nZxeziBnlDPJmeUsclY5m5xdziHnlHPJueU8cl45n5xfLiAXlAvJheUiclG5mFxcLiGXlEvJpeUyclm5nFxeriBXlCvJleUqclW5mlxdriHXlGvJteU6cl25nlxfbiA3lBvJjeUmclO5mdxcbiG3lFvJreU2clu5ndxe7iB3lDvJneUucle5m9xd7iH3lHvJveU+cl+5n9xfHiAPlAfJCfJgeYg8VB4mD5dHyCPlUfJoeYw8Vh4nj5cnyBPlSfJkeYo8VZ4mT5dnyDPlWfJseY48V54nz5cXyAvlRfJieYm8VF4mL5dXyCvlVfJqeY28Vl4nr5c3yBvlTfJmeYu8Vd4mb5d3yDvlXfJueY+8V94n75cPyAflQ/Jh+Yh8VD4mH5dPyCflU/Jp+Yx8Vj4nn5cvyBflS/Jl+Yp8Vb4mX5dvyDflW/Jt+Y58V74n35cfyA/lR/Jj+Yn8VH4mP5dfyC/lV/Jr+Y38Vn4nv5c/yB/lT/Jn+Yv8Vf4mf5d/yJiMy4RMypRMy4zMypzMy4IsypIsy4qsypqsy4ZsypZsy47syp7sy0AO5FCOZCgjOSb/lH/Jv+U/8l/5n5woxylJlKRKMiW5kkJJqaRSUivxSholrZJOSa9kUDIqmZTMShYlq5JNya7kUHIquZTcSh4lr5JPya8UUAoqhZTCShGlqFJMKa6UUEoqpZTSShmlrFJOKa9UUCoqlZTKShWlqlJNqa7UUGoqtZTaSh2lrlJPqa80UBoqjZTGShOlqdJMaa60UFoqrZTWShulrdJOaa90UDoqnZTOShelq9JN6a70UHoqvZTeSh+lr9JP6a8MUAYqg5QEZbAyRBmqDFOGKyOUkcooZbQyRhmrjFPGKxOUicokZbIyRZmqTFOmKzOUmcosZbYyR5mrzFPmKwuUhcoiZbGyRFmqLFOWKyuUlcoqZbWyRlmrrFPWKxuUjcomZbOyRdmqbFO2KzuUncouZbeyR9mr7FP2KweUg8oh5bByRDmqHFOOKyeUk8op5bRyRjmrnFPOKxeUi8ol5bJyRbmqXFOuKzeUm8ot5bZyR7mr3FPuKw+Uh8oj5bHyRHmqPFOeKy+Ul8or5bXyRnmrvFPeKx+Uj8on5bPyRfmqfFO+Kz8UTMEVQiEVSqEVRmEVTuEVQREVSZEVRVEVTdEVQzEVS7EVR3EVT/EVoARKqEQKVJASU34qv5Tfyh/lr/JPSVTi1CRqUjWZmlxNoaZUU6mp1Xg1jZpWTaemVzOoGdVMamY1i5pVzaZmV3OoOdVcam41j5pXzafmVwuoBdVCamG1iFpULaYWV0uoJdVSamm1jFpWLaeWVyuoFdVKamW1ilpVraZWV2uoNdVaam21jlpXrafWVxuoDdVGamO1idpUbaY2V1uoLdVWamu1jdpWbae2VzuoHdVOame1i9pV7aZ2V3uoPdVeam+1j9pX7af2VweoA9VBaoI6WB2iDlWHqcPVEepIdZQ6Wh2jjlXHqePVCepEdZI6WZ2iTlWnqdPVGepMdZY6W52jzlXnqfPVBepCdZG6WF2iLlWXqcvVFepKdZW6Wl2jrlXXqevVDepGdZO6Wd2iblW3qdvVHepOdZe6W92j7lX3qfvVA+pB9ZB6WD2iHlWPqcfVE+pJ9ZR6Wj2jnlXPqefVC+pF
9ZJ6Wb2iXlWvqdfVG+pN9ZZ6W72j3lXvqffVB+pD9ZH6WH2iPlWfqc/VF+pL9ZX6Wn2jvlXfqe/VD+pH9ZP6Wf2iflW/qd/VHyqm4iqhkiql0iqjsiqn8qqgiqqkyqqiqqqm6qqhmqql2qqjuqqn+ipQAzVUIxWqSI2pP9Vf6m/1j/pX/acmqnFaEi2plkxLrqXQUmqptNRavJZGS6ul09JrGbSMWiYts5ZFy6pl07JrObScWi4tt5ZHy6vl0/JrBbSCWiGtsFZEK6oV04prJbSSWimttFZGK6uV08prFbSKWiWtslZFq6pV06prNbSaWi2ttlZHq6vV0+prDbSGWiOtsdZEa6o105prLbSWWiuttdZGa6u109prHbSOWiets9ZF66p107prPbSeWi+tt9ZH66v10/prA7SB2iAtQRusDdGGasO04doIbaQ2ShutjdHGauO08doEbaI2SZusTdGmatO06doMbaY2S5utzdHmavO0+doCbaG2SFusLdGWasu05doKbaW2SlutrdHWauu09doGbaO2SdusbdG2atu07doObae2S9ut7dH2avu0/doB7aB2SDusHdGOase049oJ7aR2SjutndHOaue089oF7aJ2SbusXdGuate069oN7aZ2S7ut3dHuave0+9oD7aH2SHusPdGeas+059oL7aX2SnutvdHeau+099oH7aP2SfusfdG+at+079oPDdNwjdBIjdJojdFYjdN4TdBETdJkTdFUTdN0zdBMzdJszdFczdN8DWiBFmqRBjWkxbSf2i/tt/ZH+6v90xK1OD2JnlRPpifXU+gp9VR6aj1eT6On1dPp6fUMekY9k55Zz6Jn1bPp2fUcek49l55bz6Pn1fPp+fUCekG9kF5YL6IX1YvpxfUSekm9lF5aL6OX1cvp5fUKekW9kl5Zr6JX1avp1fUaek29ll5br6PX1evp9fUGekO9kd5Yb6I31ZvpzfUWeku9ld5ab6O31dvp7fUOeke9k95Z76J31bvp3fUeek+9l95b76P31fvp/fUB+kB9kJ6gD9aH6EP1YfpwfYQ+Uh+lj9bH6GP1cfp4fYI+UZ+kT9an6FP1afp0fYY+U5+lz9bn6HP1efp8fYG+UF+kL9aX6Ev1ZfpyfYW+Ul+lr9bX6Gv1dfp6fYO+Ud+kb9a36Fv1bfp2fYe+U9+l79b36Hv1ffp+/YB+UD+kH9aP6Ef1Y/px/YR+Uj+ln9bP6Gf1c/p5/YJ+Ub+kX9av6Ff1a/p1/YZ+U7+l39bv6Hf1e/p9/YH+UH+kxyV9oj/Vn+nP9Rf6S/2V/lp/o7/V3+nv9Q/6R/2T/ln/on/Vv+nf9R86puM6oZM6pdM6o7M6p/O6oIu6pMu6oqu6puu6oZu6pdu6o7u6p/s60AM91CMd6kiP6T/1X/pv/Y/+V/+nJ+pxRhIjqZHMSG6kMFIaqYzURryRxkhrpDPSGxmMjEYmI7ORxchqZDOyGzmMnEYuI7eRx8hr5DPyGwWMgkYho7BRxChqFDOKGyWMkkYpo7RRxihrlDPKGxWMikYlo7JRxahqVDOqGzWMmkYto7ZRx6hr1DPqGw2MhkYjo7HRxGhqNDOaGy2MlkYro7XRxmhrtDPaGx2MjkYno7PRxehqdDO6Gz2MnkYvo7fRx+hr9DP6GwOMgcYgI8EYbAwxhhrDjOHGCGOkMcoYbYwxxhrjjPHGBGOiMcmYbEwxphrTjOnGDGOmMcuYbcwx5hrzjPnGAmOhschYbCwxlhrLjOXGCmOlscpYbawx1hrrjPXGBmOjscnYbGwxthrbjO3GDmOnscvYbewx9hr7jP3GAeOgccg4bBwxjhrHjOPGCeOkcco4bZwxzhrnjPPGBeOiccm4bFwxrhrXjOvGDeOmccu4bdwx7hr3jPvGA+Oh8ch4bDwxnhrPjOfGC+Ol8cp4bbwx3hrvjPfGB+Oj8cn4bHwxvhrfjO/GDwMzcIMwSIMyaIMxWIMzeEMwREMyZEMxVEMz
dMMwTMMybMMxXMMzfAMYgREakQENZMSMn8Yv47fxx/hr/DMSjTgziZnUTGYmN1OYKc1UZmoz3kxjpjXTmenNDGZGM5OZ2cxiZjWzmdnNHGZOM5eZ28xj5jXzmfnNAmZBs5BZ2CxiFjWLmcXNEmZJs5RZ2ixjljXLmeXNCmZFs5JZ2axiVjWrmdXNGmZNs5ZZ26xj1jXrmfXNBmZDs5HZ2GxiNjWbmc3NFmZLs5XZ2mxjtjXbme3NDmZHs5PZ2exidjW7md3NHmZPs5fZ2+xj9jX7mf3NAeZAc5CZYA42h5hDzWHmcHOEOdIcZY42x5hjzXHmeHOCOdGcZE42p5hTzWnmdHOGOdOcZc4255hzzXnmfHOBudBcZC42l5hLzWXmcnOFudJcZa4215hrzXXmenODudHcZG42t5hbzW3mdnOHudPcZe4295h7zX3mfvOAedA8ZB42j5hHzWPmcfOEedI8ZZ42z5hnzXPmefOCedG8ZF42r5hXzWvmdfOGedO8Zd4275h3zXvmffOB+dB8ZD42n5hPzWfmc/OF+dJ8Zb4235hvzXfme/OD+dH8ZH42v5hfzW/md/OHiZm4SZikSZm0yZisyZm8KZiiKZmyqZiqqZm6aZimaZm26Ziu6Zm+CczADM3IhCYyY+ZP85f52/xj/jX/mYlmnJXESmols5JbKayUViortRVvpbHSWums9FYGK6OVycpsZbGyWtms7FYOK6eVy8pt5bHyWvms/FYBq6BVyCpsFbGKWsWs4lYJq6RVyiptlbHKWuWs8lYFq6JVyapsVbGqWtWs6lYNq6ZVy6pt1bHqWvWs+lYDq6HVyGpsNbGaWs2s5lYLq6XVympttbHaWu2s9lYHq6PVyepsdbG6Wt2s7lYPq6fVy+pt9bH6Wv2s/tYAa6A1yEqwBltDrKHWMGu4NcIaaY2yRltjrLHWOGu8NcGaaE2yJltTrKnWNGu6NcOaac2yZltzrLnWPGu+tcBaaC2yFltLrKXWMmu5tcJaaa2yVltrrLXWOmu9tcHaaG2yNltbrK3WNmu7tcPaae2ydlt7rL3WPmu/dcA6aB2yDltHrKPWMeu4dcI6aZ2yTltnrLPWOeu8dcG6aF2yLltXrKvWNeu6dcO6ad2yblt3rLvWPeu+9cB6aD2yHltPrKfWM+u59cJ6ab2yXltvrLfWO+u99cH6aH2yPltfrK/WN+u79cPCLNwiLNKiLNpiLNbiLN4SLNGSLNlSLNXSLN0yLNOyLNtyLNfyLN8CVmCFVmRBC1kx66f1y/pt/bH+Wv+sRCvOTmIntZPZye0Udko7lZ3ajrfT2GntdHZ6O4Od0c5kZ7az2FntbHZ2O4ed085l57bz2HntfHZ+u4Bd0C5kF7aL2EXtYnZxu4Rd0i5ll7bL2GXtcnZ5u4Jd0a5kV7ar2FXtanZ1u4Zd065l17br2HXtenb9hAZ2Q7uR3dhuYje1m9nN7RZ2S7uV3dpuY7e129nt7Q52R7uT3dnuYne1u9nd7R52T7uX3dvuY/e1+9n97QH2QHuQnWAPtofYQ+1h9nB7hD3SHmWPtsfYY+1x9nh7gj3RnmRPtqfYU+1p9nR7hj3TnmXPtufYc+159nx7gb3QXmQvtpfYS+1l9nJ7hb3SXmWvttfYa+119np7g73R3mRvtrfYW+1t9nZ7h73T3mXvtvfYe+199n77gH3QPmQfto/YR+1j9nH7hH3SPmWfts/YZ+1z9nn7gn3RvmRftq/YV+1r9nX7hn3TvmXftu/Yd+179n37gf3QfmQ/tp/YT+1n9nP7hf3SfmW/tt/Yb+139nv7g/3R/mR/tr/YX+1v9nf7h43ZuE3YpE3ZtM3YrM3ZvC3Yoi3Zsq3Yqq3Zum3Ypm3Ztu3Yru3Zvg3swA7tyIY2smP2T/uX/dv+Y/+1/9mJdpyTxEnqJHOSOymclE4qJ7UT76Rx0jrpnPROBiejk8nJ7GRxsjrZnOxODienk8vJ7eRx8jr5nPxOAaegU8gp
7BRxijrFnOJOCaekU8op7ZRxyjrlnPJOBaeiU8mp7FRxqjrVnOpODaemU8up7dRx6jr1nPpOA6eh08hp7DRxmjrNnOZOC6el08pp7bRx2jrtnPZOB6ej08np7HRxujrdnO5OD6en08vp7fRx+jr9nP7OAGegM8hJcAY7Q5yhzjBnuDPCGemMckY7Y5yxzjhnvDPBmehMciY7U5ypzjRnujPDmenMcmY7c5y5zjxnvrPAWegschY7S5ylzjJnubPCWemsclY7a5y1zjpnvbPB2ehscjY7W5ytzjZnu7PD2enscnY7e5y9zj5nv3PAOegccg47R5yjzjHnuHPCOemcck47Z5yzzjnnvHPBuehcci47V5yrzjXnunPDuenccm47d5y7zj3nvvPAeeg8ch47T5ynzjPnufPCeem8cl47b5y3zjvnvfPB+eh8cj47X5yvzjfnu/PDwRzcIRzSoRzaYRzW4RzeERzRkRzZURzV0RzdMRzTsRzbcRzX8RzfAU7ghE7kQAc5Meen88v57fxx/jr/nEQnzk3iJnWTucndFG5KN5Wb2o1307hp3XRuejeDm9HN5GZ2s7hZ3WxudjeHm9PN5eZ287h53XxufreAW9At5BZ2i7hF3WJucbeEW9It5ZZ2y7hl3XJuebeCW9Gt5FZ2q7hV3WpudbeGW9Ot5dZ267h13XpufbeB29Bt5DZ2m7hN3WZuc7eF29Jt5bZ227ht3XZue7eD29Ht5HZ2u7hd3W5ud7eH29Pt5fZ2+7h93X5uf3eAO9Ad5Ca4g90h7lB3mDvcHeGOdEe5o90x7lh3nDveneBOdCe5k90p7lR3mjvdneHOdGe5s9057lx3njvfXeAudBe5i90l7lJ3mbvcXeGudFe5q9017lp3nbve3eBudDe5m90t7lZ3m7vd3eHudHe5u9097l53n7vfPeAedA+5h90j7lH3mHvcPeGedE+5p90z7ln3nHveveBedC+5l90r7lX3mnvdveHedG+5t9077l33nnvffeA+dB+5j90n7lP3mfvcfeG+dF+5r9037lv3nfve/eB+dD+5n90v7lf3m/vd/eFiLu4SLulSLu0yLutyLu8KruhKruwqrupqru4arularu06rut6ru8CN3BDN3Khi9yY+9P95f52/7h/3X9uohvnJfGSesm85F4KL6WXykvtxXtpvLReOi+9l8HL6GXyMntZvKxeNi+7l8PL6eXycnt5vLxePi+/V8Ar6BXyCntFvKJeMa+4V8Ir6ZXySntlvLJeOa+8V8Gr6FXyKntVvKpeNa+6V8Or6dXyant1vLpePa++18Br6DXyGntNvKZeM6+518Jr6bXyWnttvLZeO6+918Hr6HXyOntdvK5eN6+718Pr6fXyent9vL5eP6+/N8Ab6A3yErzB3hBvqDfMG+6N8EZ6o7zR3hhvrDfOG+9N8CZ6k7zJ3hRvqjfNm+7N8GZ6s7zZ3hxvrjfPm+8t8BZ6i7zF3hJvqbfMW+6t8FZ6q7zV3hpvrbfOW+9t8DZ6m7zN3hZvq7fN2+7t8HZ6u7zd3h5vr7fP2+8d8A56h7zD3hHvqHfMO+6d8E56p7zT3hnvrHfOO+9d8C56l7zL3hXvqnfNu+7d8G56t7zb3h3vrnfPu+898B56j7zH3hPvqffMe+698F56r7zX3hvvrffOe+998D56n7zP3hfvq/fN++798DAP9wiP9CiP9hiP9TiP9wRP9CRP9hRP9TRP9wzP9CzP9hzP9TzP94AXeKEXedBDXsz76f3yfnt/vL/ePy/Ri/OT+En9ZH5yP4Wf0k/lp/bj/TR+Wj+dn97P4Gf0M/mZ/Sx+Vj+bn93P4ef0c/m5/Tx+Xj+fn98v4Bf0C/mF/SJ+Ub+YX9wv4Zf0S/ml/TJ+Wb+cX96v4Ff0K/mV/Sp+Vb+aX92v4df0a/m1/Tp+Xb+eX99v4Df0G/mN/SZ+U7+Z39xv4bf0W/mt/TZ+W7+d397v4Hf0O/md/S5+V7+b393v
4ff0e/m9/T5+X7+f398f4A/0B/kJ/mB/iD/UH+YP90f4I/1R/mh/jD/WH+eP9yf4E/1J/mR/ij/Vn+ZP92f4M/1Z/mx/jj/Xn+fP9xf4C/1F/mJ/ib/UX+Yv91f4K/1V/mp/jb/WX+ev9zf4G/1N/mZ/i7/V3+Zv93f4O/1d/m5/j7/X3+fv9w/4B/1D/mH/iH/UP+Yf90/4J/1T/mn/jH/WP+ef9y/4F/1L/mX/in/Vv+Zf92/4N/1b/m3/jn/Xv+ff9x/4D/1H/mP/if/Uf+Y/91/4L/1X/mv/jf/Wf+e/9z/4H/1P/mf/i//V/+Z/93/4mI/7hE/6lE/7jM/6nM/7gi/6ki/7iq/6mq/7hm/6lm/7ju/6nu/7wA/80I986CM/5v/0f/m//T/+X/+fn+jHgSQgKUgGkoMUICVIBVKDeJAGpAXpQHqQAWQEmUBmkAVkBdlAdpAD5AS5QG6QB+QF+UB+UAAUBIVAYVAEFAXFQHFQApQEpUBpUAaUBeVAeVABVASVQGVQBVQF1UB1UAPUBLVAbVAH1AX1QH3QADQEjUBj0AQ0Bc1Ac9ACtAStQGvQBrQF7UB70AF0BJ1AZ9AFdAXdQHfQA/QEvUBv0Af0Bf1AfzAADASDQAIYDIaAoWAYGA5GgJFgFBgNxoCxYBwYDyaAiWASmAymgKlgGpgOZoCZYBaYDeaAuWAemA8WgIVgEVgMloClYBlYDlaAlWAVWA3WgLVgHVgPNoCNYBPYDLaArWAb2A52gJ1gF9gN9oC9YB/YDw6Ag+AQOAyOgKPgGDgOToCT4BQ4Dc6As+AcOA8ugIvgErgMroCr4Bq4Dm6Am+AWuA3ugLvgHrgPHoCH4BF4DJ6Ap+AZeA5egJfgFXgN3oC34B14Dz6Aj+AT+Ay+gK/gG/gOfgAM4IAAJKAADRjAAg7wQAAikIAMFKACDejAACawgA0c4AIP+ACAAIQgAhAgEAM/wS/wG/wBf8E/kAjigiRB0iBZkDxIEaQMUgWpg/ggTZA2SBekDzIEGYNMQeYgS5A1yBZkD3IEOYNcQe4gT5A3yBfkDwoEBYNCQeGgSFA0KBYUD0oEJYNSQemgTFA2KBeUDyoEFePj4uKCKkHVoFpQPagR1AxqBbWDOkHdoF5QP2gQNAwaBY2DJkHToFnQPGgRtAxaBa2DNkHboF3QPugQdAw6BZ2DLkHXoFvQPegR9Ax6Bb2DPkHfoF/QPxgQDAwGBQnB4GBIMDQYFgwPRgQjg1HB6GBMMDYYF4wPJgQTg0nB5GBKMDWYFkwPZgQzg1nB7GBOMDeYF8wPFgQLg0XB4mBJsDRYFiwPVgQrg1XB6mBNsDZYF6wPNgQbg03B5mBLsDXYFmwPdgQ7g13B7mBPsDfYF+wPDgQHg0PB4eBIcDQ4FhwPTgQng1PB6eBMcDY4F5wPLgQXg0vB5eBKcDW4FlwPbgQ3g1vB7eBOcDe4F9wPHgQPg0fB4+BJ8DR4FjwPXgQvg1fB6+BN8DZ4F7wPPgQfg0/B5+BL8DX4FnwPfgRYgAdEQAZUQAdMwAZcwAdCIAZSIAdKoAZaoAdGYAZWYAdO4AZe4AcgCIIwiAIYoCAW/Ax+Bb+DP8Hf4F+QGMSFScKkYbIweZgiTBmmClOH8WGaMG2YLkwfZggzhpnCzGGWMGuYLcwe5ghzhrnC3GGeMG+YL8wfFggLhoXCwmGRsGhYLCwelghLhqXC0mGZsGxYLiwfVggrhpXCymGVsGpYLawe1ghrhrXC2mGdsG5YL6wfNggbho3CxmGTsGnYLGwetghbhq3C1mGbsG3YLmwfdgg7hp3CzmGXsGvYLewe9gh7hr3C3mGfsG/YL+wfDggHhoPChHBwOCQcGg4Lh4cjwpHhqHB0OCYcG44Lx4cTwonhpHByOCWcGk4Lp4czwpnhrHB2OCecG84L54cLwoXhonBxuCRcGi4Ll4crwpXhqnB1uCZcG64L14cbwo3hpnBzuCXcGm4Lt4c7wp3hrnB3uCfc
G+4L94cHwoPhofBweCQ8Gh4Lj4cnwpPhqfB0eCY8G54Lz4cXwovhpfByeCW8Gl4Lr4c3wpvhrfB2eCe8G94L74cPwofho/Bx+CR8Gj4Ln4cvwpfhq/B1+CZ8G74L34cfwo/hp/Bz+CX8Gn4Lv4c/QizEQyIkQyqkQyZkQy7kQyEUQymUQyVUQy3UQyM0Qyu0Qyd0Qy/0QxAGYRhGIQxRGAt/hr/C3+Gf8G/4L0wM46IkUdIoWZQ8ShGljFJFqaP4KE2UNkoXpY8yRBmjTFHmKEuUNcoWZY9yRDmjXFHuKE+UN8oX5Y8KRAWjQlHhqEhUNCoWFY9KRCWjUlHpqExUNioXlY8qRBWjSlHlqEpUNaoWVY9qRDWjWlHtqE5UN6oX1Y8aRA2jRlHjqEnUNGoWNY9aRC2jVlHrqE3UNmoXtY86RB2jTlHnqEvUNeoWdY96RD2jXlHvqE/UN+oX9Y8GRAOjQVFCNDgaEg2NhkXDoxHRyGhUNDoaE42NxkXjownRxGhSNDmaEk2NpkXToxnRzGhWNDuaE82N5kXzowXRwmhRtDhaEi2NlkXLoxXRymhVtDpaE62N1kXrow3RxmhTtDnaEm2NtkXbox3RzmhXtDvaE+2N9kX7owPRwehQdDg6Eh2NjkXHoxPRyehUdDo6E52NzkXnowvRxehSdDm6El2NrkXXoxvRzehWdDu6E92N7kX3owfRw+hR9Dh6Ej2NnkXPoxfRy+hV9Dp6E72N3kXvow/Rx+hT9Dn6En2NvkXfox8RFuEREZERFdERE7ERF/GREImRFMmREqmRFumREZmRFdmRE7mRF/kRiIIojKIIRiiKRT+jX9Hv6E/0N/oXJUZxMAlMCpPB5DAFTAlTwdQwHqaBaWE6mB5mgBlhJpgZZoFZYTaYHeaAOWEumBvmgXlhPpgfFoAFYSFYGBaBRWExWByWgCVhKVgaloFlYTlYHlaAFWElWBlWgVVhNVgd1oA1YS1YG9aBdWE9WB82gA1hI9gYNoFNYTPYHLaALWEr2Bq2gW1hO9gedoAdYSfYGXaBXWE32B32gD1hL9gb9oF9YT/YHw6AA+EgmAAHwyFwKBwGh8MRcCQcBUfDMXAsHAfHwwlwIpwEJ8MpcCqcBqfDGXAmnAVnwzlwLpwH58MFcCFcBBfDJXApXAaXwxVwJVwFV8M1cC1cB9fDDXAj3AQ3wy1wK9wGt8MdcCfcBXfDPXAv3Af3wwPwIDwED8Mj8Cg8Bo/DE/AkPAVPwzPwLDwHz8ML8CK8BC/DK/AqvAavwxvwJrwFb8M78C68B+/DB/AhfAQfwyfwKXwGn8MX8CV8BV/DN/AtfAffww/wI/wEP8Mv8Cv8Br/DHxCDOCQgCSlIQwaykIM8FKAIJShDBapQgzo0oAktaEMHutCDPgQwgCGMIIQIxuBP+Av+hn/gX/gPJsI4lAQlRclQcpQCpUSpUGoUj9KgtCgdSo8yoIwoE8qMsqCsKBvKjnKgnCgXyo3yoLwoH8qPCqCCqBAqjIqgoqgYKo5KoJKoFCqNyqCyqBwqjyqgiqgSqoyqoKqoGqqOaqCaqBaqjeqguqgeqo8aoIaoEWqMmqCmqBlqjlqglqgVao3aoLaoHWqPOqCOqBPqjLqgrqgb6o56oJ6oF+qN+qC+qB/qjwaggWgQSkCD0RA0FA1Dw9EINBKNQqPRGDQWjUPj0QQ0EU1Ck9EUNBVNQ9PRDDQTzUKz0Rw0F81D89ECtBAtQovRErQULUPL0Qq0Eq1Cq9EatBatQ+vRBrQRbUKb0Ra0FW1D29EOtBPtQrvRHrQX7UP70QF0EB1Ch9ERdBQdQ8fRCXQSnUKn0Rl0Fp1D59EFdBFdQpfRFXQVXUPX0Q10E91Ct9EddBfdQ/fRA/QQPUKP0RP0FD1Dz9EL9BK9Qq/RG/QWvUPv0Qf0EX1Cn9EX9BV9Q9/RD4QhHBGIRBSiEYNYxCEeCUhEEpKRglSkIR0ZyEQWspGDXOQhHwEUoBBFCCKEYugn
+oV+oz/oL/qHElFcLEksaSxZLHksRSxlLFUsdSw+liaWNpYulj6WIZYxlimWOZYlljWWLZY9liOWM5YrljuWJ5Y3li+WP1YgVjBWKFY4ViRWNFYsVjxWIlYyVipWOlYmVjZWLlY+ViFWMVYpVjlWJVY1Vi1WPVYjVjNWK1Y7VidWN1YvVj/WINYw1ijWONYk1jTWLNY81iLWMtYq1jrWJtY21i7WPtbhf+zcBfQe1b03+ieCS4FSCjX+UNpSo0Bpi0MFChR3hwABAoFAEtyCu7u7u7u7u7u722y/69DAW9py723PWe3pm89nrWTvZ8/M3vPsb5KV36xJusW6xbsluiW7pbqlu2W6ZbvluuW7FboVu5W6lbtVulW71brVu0HdGt2a3Vrd4G7tbp1u3W5It163fje026DbsBvWbdRt3A3vRnQju026TbvNus27Lbotu626rbttum277bpR3fbdDt2O3U7dzt0u3a7dbt3u3R7dnt1e3d7dPt2+3X7d/t0B3YHdQd3B3SHdod1h3eHdEd2R3VHd0d0x3bHdcd3x3Qndid1J3cndKd2p3Wnd6d0Z3ZndWd3Z3Tndud153fndBd2F3UXdxd0l3aXdZd3l3RXdld1V3dXdNd213XXd9d0N3Y3dTd3N3S3drd1t3e3dHd2d3V3d3d093b3dfd393QPdg91D3cPdI92j3WPd490T3ZPdU93T3TPds91z3fPdC92L3Uvdy90r3avda93r3Rvdm91b3dvdO9273Xvd+90H3YfdR93H3Sfdp13XhS52qctd6WrXul7oF/qHAWFgGCuMHcYJ44bxwvhhgjBhmChMHL4WJgmThsnC18Pk4RthivDNMGWYKnwrfDt8J3w3fC9MHfrCNGHa8P0wXfhB+GH4UZg+/Dj8JPw0/Cz8PMwQfhFmDDOFmcMvwyzhV+HX4Tdh1jBbmD3MEeYMc4W5wzxh3vDb8Lvw+/CHMF+YP/wxLBAWDAuFP4WFwyJh0bBYWDwsEZYMS4WlwzJh2bBcWD6sEFYMK4WVwyph1bBaWD0MCmuENcNaYXBYO6wT1g1Dwnph/TA0bBA2DMPCRmHjMDyMCCPDJmHTsFnYPGwRtgxbha3DNmHbsF0YFbYPO4Qdw05h57BL2DXsFnYPe4Q9w15h77BP2DfsF/YPB4QDw0Hh4HBIODQcFg4PR4Qjw1Hh6HBMODYcF44PJ4QTw0nh5HBKODWcFk4PZ4Qzw1nh7HBOODecF84PF4QLw0Xh4nBJuDRcFi4PV4Qrw1Xh6nBNuDZcF64PN4Qbw03h5nBLuDXcFm4Pd4Q7w13h7nBPuDfcF+4PD4QHw0Ph4fBIeDQ8Fh4PT4Qnw1Ph6fBMeDY8F54PL4QXw0vh5fBKeDW8Fl4Pb4Q3w1vh7fBOeDe8F94PH4QPw0fh4/BJ+DR0IYQYUsihhBpa6MV+sX8cEAfGseLYcZw4bhwvjh8niBPGieLE8WtxkjhpnCx+PU4evxGniN+MU8ap4rfit+N34nfj9+LUsS9OE6eN34/TxR/EH8Yfxenjj+NP4k/jz+LP4wzxF3HGOFOcOf4yzhJ/FX8dfxNnjbPF2eMccc44V5w7zhPnjb+Nv4u/j3+I88X54x/jAnHBuFD8U1w4LhIXjYvFxeMSccm4VFw6LhOXjcvF5eMKccW4Ulw5rhJXjavF1eOguEZcM64VB8e14zpx3TgkrhfXj0PjBnHDOCxuFDeOw+OIODJuEjeNm8XN4xZxy7hV3DpuE7eN28VRcfu4Q9wx7hR3jrvEXeNucfe4R9wz7hX3jvvEfeN+cf94QDwwHhQPjofEQ+Nh8fB4RDwyHhWPjsfEY+Nx8fh4QjwxnhRPjqfEU+Np8fR4RjwznhXPjufEc+N58fx4QbwwXhQvjpfES+Nl8fJ4RbwyXhWvjtfEa+N18fp4Q7wx3hRvjrfEW+Nt8fZ4R7wz3hXvjvfEe+N98f74QHwwPhQfjo/ER+Nj8fH4RHwyPhWfjs/EZ+Nz8fn4
QnwxvhRfjq/EV+Nr8fX4RnwzvhXfju/Ed+N78f34QfwwfhQ/jp/ET2MXQ4wxxRxLrLHFXuqX+qcBaWAaK42dxknjpvHS+GmCNGGaKE2cvpYmSZOmydLX0+TpG2mK9M00ZZoqfSt9O30nfTd9L02d+tI0adr0/TRd+kH6YfpRmj79OP0k/TT9LP08zZB+kWZMM6WZ0y/TLOlX6dfpN2nWNFuaPc2R5kxzpbnTPGne9Nv0u/T79Ic0X5o//TEtkBZMC6U/pYXTImnRtFhaPC2RlkxLpaXTMmnZtFxaPq2QVkwrpZXTKmnVtFpaPQ1Ka6Q101ppcFo7rZPWTUPSemn9NDRtkDZMw9JGaeM0PI1II9Mm/TdNm6XN03hpy7RV2jptk7ZN26VRafu0Q9ox7ZR2TrukXdNuafe0R9oz7ZX2TvukfdN+af90QDowHZQOToekQ9Nh6fB0RDoyHZWOTsekY9Nx6fh0QjoxnZROTqekU9Np6fR0RjoznZXOTuekc9N56fx0QbowXZQuTpekS9Nl6fJ0RboyXZWuTteka9N16fp0Q7ox3ZRuTrekW9Nt6fZ0R7oz3ZXuTveke9N96f70QHowPZQeTo+kR9Nj6fH0RHoyPZWeTs+kZ9Nz6fn0QnoxvZReTq+kV9Nr6fX0RnozvZXeTu+kd9N76f30QfowfZQ+Tp+kT1OXQooppZxKqqmlXu6X++cBeWAeK4+dx8nj5vHy+HmCPGGeKE+cv5YnyZPmyfLX8+T5G3mK/M08ZZ4qfyt/O38nfzd/L0+d+/I0edr8/Txd/kH+Yf5Rnj7/OP8k/zT/LP88z5B/kWfMM+WZ8y/zLPlX+df5N3nWPFuePc+R58xz5bnzPHne/Nv8u/z7/Ic8X54//zEvkBfMC+U/5YXzInnRvFhePC+Rl8xL5aXzMnnZvFxePq+QV8wr5ZXzKnnVvFpePQ/Ka+Q181p5cF47r5PXzUPyenn9PDRvkDfMw/JGeeM8PI/II/MmedO8Wd48b5G3zFvlrfM2edu8XR6Vt8875B3zTnnnvEveNe+Wd8975D3zXnnvvE/eN++X988H5APzQfngfEg+NB+WD89H5CPzUfnofEw+Nh+Xj88n5BPzSfnkfEo+NZ+WT89n5DPzWfnsfE4+N5+Xz88X5AvzRfnifEm+NF+WL89X5CvzVfnqfE2+Nl+Xr8835BvzTfnmfEu+Nd+Wb8935DvzXfnufE++N9+X788P5AfzQ/nh/Eh+ND+WH89P5CfzU/np/Ex+Nj+Xn88v5BfzS/nl/Ep+Nb+WX89v5DfzW/nt/E5+N7+X388f5A/zR/nj/En+NHc55JhTzrnkmlvulX6lfxlQBpaxythlnDJuGa+MXyYoE5aJysTla2WSMmmZrHy9TF6+UaYo3yxTlqnKt8q3y3fKd8v3ytSlr0xTpi3fL9OVH5Qflh+V6cuPy0/KT8vPys/LDOUXZcYyU5m5/LLMUn5Vfl1+U2Yts5XZyxxlzjJXmbvMU+Ytvy2/K78vfyjzlfnLH8sCZcGyUPlTWbgsUhYti5XFyxJlybJUWbosU5Yty5XlywplxbJSWbmsUlYtq5XVy6CyRlmzrFUGl7XLOmXdMqSsV9YvQ8sGZcMyrGxUNi7Dy4gysmxSNi2blc3LFmXLslXZumxTti3blVFl+7JD2bHsVHYuu5Rdy25l97JH2bPsVfYu+5R9y35l/3JAObAcVA4uh5RDy2Hl8HJEObIcVY4ux5Rjy3Hl+HJCObGcVE4up5RTy2nl9HJGObOcVc4u55Rzy3nl/HJBubBcVC4ul5RLy2Xl8nJFubJcVa4u15Rry3Xl+nJDubHcVG4ut5Rby23l9nJHubPcVe4u95R7y33l/vJAebA8VB4uj5RHy2Pl8fJEebI8VZ4uz5Rny3Pl+fJCebG8VF4ur5RXy2vl9fJGebO8Vd4u75R3y3vl/fJB+bB8VD4un5RPS1dCiSWVXEqppZVe7Vf71wF1YB2rjl3HqePW8er4
dYI6YZ2oTly/Viepk9bJ6tfr5PUbdYr6zTplnap+q367fqd+t36vTl376jR12vr9Ol39Qf1h/VGdvv64/qT+tP6s/rzOUH9RZ6wz1ZnrL+ss9Vf11/U3ddY6W529zlHnrHPVues8dd762/q7+vv6hzpfnb/+sS5QF6wL1T/VhesiddG6WF28LlGXrEvVpesyddm6XF2+rlBXrCvVlesqddW6Wl29Dqpr1DXrWnVwXbuuU9etQ+p6df06tG5QN6zD6kZ14zq8jqgj6yZ107pZ3bxuUbesW9Wt6zZ127pdHVW3rzvUHetOdee6S9217lZ3r3vUPetede+6T9237lf3rwfUA+tB9eB6SD20HlYPr0fUI+tR9eh6TD22HlePryfUE+tJ9eR6Sj21nlZPr2fUM+tZ9ex6Tj23nlfPrxfUC+tF9eJ6Sb20XlYvr1fUK+tV9ep6Tb22XlevrzfUG+tN9eZ6S7213lZvr3fUO+td9e56T7233lfvrw/UB+tD9eH6SH20PlYfr0/UJ+tT9en6TH22Plefry/UF+tL9eX6Sn21vlZfr2/UN+tb9e36Tn23vlffrx/UD+tH9eP6Sf20djXUWFPNtdRaW+21fq1/G9AGtrHa2G2cNm4br43fJmgTtonaxO1rbZI2aZusfb1N3r7RpmjfbFO2qdq32rfbd9p32/fa1K2vTdOmbd9v07UftB+2H7Xp24/bT9pP28/az9sM7RdtxjZTm7n9ss3SftV+3X7TZm2ztdnbHG3ONlebu83T5v2nrl+prdxWaau21drqbVBbo63Z1mqD29ptnbZuG9LWa+u3oW2DtmEb1jZqG7fhbUQb2TZpm7bN2uZti7Zl26pt3bZp27bt2qi2fduh7dh2aju3Xdqubbe2e9uj7dn2anu3fdq+bb+2fzugHdgOage3Q9qh7bB2eDuiHdmOake3Y9qx7bh2fDuhndhOaie3U9qp7bR2ejujndnOame3c9q57bx2frugXdguahe3S9ql7bJ2ebuiXdmuale3a9q17bp2fbuh3dhuaje3W9qt7bZ2e7uj3dnuane3e9q97b52f3ugPdgeag+3R9qj7bH2eHuiPdmeak+3Z9qz7bn2fHuhvdheai+3V9qr7bX2enujvdneam+3d9q77b32fvugfdg+ah+3T9qnrWuhxZZabqXV1loPAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAf8r8Cy4834y98b743K/Xr7dAr1/vtH69Xm+S/3PewNHtgK+Y579OXf2z3pSf/bzA572P1p9paHys31e1Y/+9yfr97dCMvV5v3S/m799b6LNev95Yn42N9f+5Tu+80ZOObj9fd+CA6XqL9ObvLT3686jR996vN++XbmTu0e28nw+8N+DvtpPO8+edOu9L8/ztrs37lxvX6/XG7X25nbRf/8/a1lr7e1v0P+Or0mTMIP8xm/zHbPIfs8l/zCb/MZv8/9P8z9aVA76y/l/3r+r/AaOr4f5fcV//ivp/oy/m799b8h+t/+f98gZ9vu64X9T/i/SG9Ib3ho8e/6rnAAP+ep/n/fvtNAPbX+zz
/1ZflSZjBvmP2eQ/ZpP/mE3+Yzb5j9nkP2br/zf1f///l/q//39y/f/FGwx/bv+y/l+4N6y3Tm/+3pDe0N7g0eNfVf/PM7r9ov7/q3k/b6eZd8BnF6n/+d9L/v9XOeAfvUD+Yzb5j9nkP2aT/5hN/mO2v63/B4yu/9/7q/p/7NHPAAZ+xUyTjK6XP6//Z/wn6/8vv+ffr7f4P1znf9nn848zYLresr1hvaG9TXob9AZ/Nu+oL9bp31vrixUHjvqv7/H5vweY/LOjs4z+nTJ579h+k/X6/XmVsSYbff1nY38+Yay+Xq/X17/3pXP++lhv9LOSGb9Yf2BvstG9Eb0telv21u8N6g397GnE5+8jbNTr9ab/4vyxehN+sdOjcxv9zUd9MT7lF28rTPmVzyG+Kk3GDPIfs8l/zCb/MZv8x2zyH7PJf8w28LO6EAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADgL/UAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/kEjtthy/UFDhw4e/k93xvr/dXL/3n9jCR2df7zTf/Qv8f8t9/Mf1vk3/8EEAAD8j/s/f+n/d98JAAAAAAAAAAAAAAAAAAAAjLn+Ff+d2L/7OwIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/He8ueR+iy49/1J9fX19vUn6jR7svnzOqN6oXht9bODosTa67Tf6x+arnTHHoxPfs+5/jU089TUbb33WtNeNnGDZiya+cpze3ZOs+uZ7M7909xR3T/VmXXrdISP6hozo23DYyNkH9a0x
bNjIQWsMHdy31pAR68/Qt/jQwYNGDO4bsuGIwcNH9v3F4bWHDttooy36Bm241kTjbzR88IgRfYM23KJv/cFb9I0c1jdy+BZ9g9YZNGTDvhlmmKFvovH/VXv3n2+ZM/7ddwAAAMC/xv8TAAD//+fnPuk=")
chdir(&(0x7f00000001c0)='./file0\x00')
r0 = openat$dir(0xffffffffffffff9c, &(0x7f00000001c0)='./file0\x00', 0x2a442, 0x0)
fallocate(r0, 0x0, 0x0, 0x40002)
r1 = openat$dir(0xffffffffffffff9c, &(0x7f00000003c0)='./file0\x00', 0x0, 0x0)
chdir(0x0)
rename(&(0x7f0000000000)='./file1\x00', &(0x7f0000000040)='./file0\x00')
fsync(0xffffffffffffffff)
fdatasync(r1)


r0 = syz_mount_image$btrfs(&(0x7f0000000000), &(0x7f00000015c0)='./file0\x00', 0x0, &(0x7f0000001600), 0x0, 0x559e, &(0x7f0000005680)="$eJzs3X9sVeX9B/BzWwoN+C39jhUYfxAgBoMkyJYtjqB4MQa24eKlgsKcCEQlBivYRDcYqUWSZcaghU4EF5GQaDJjscM/FMywy7CMZfzY5hZjs4JSaZZsAzVrHDG69N77XO49l9tembNOXy/SnvPcz3me+9yT88d9X/qcGwEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAURUcSc9+d0f3i0ZE1X77/Hz+e+OjGn4zfvX/roVvu23T/gjMjbto5a1nf+mlN8zdsbDjS/PS+ObdGUSLdL5Htf9u136q/88bbvlsdBly+MLOtrS31lJmuJzON4QUP9vcr/FkRRVFVbIDK7PbV7E5FwQC53cbiAQf0Tuui6O7J8ya1dT01bklyYU/xS6df9VBPYKhkr6ue89dSMv27InZErp136SUKLtFM//gF96m8CADgY5mZSm9yb0ezb3Fz7eZ4PdZOxtotsXZ4h9CS37gYmXGHl5rnpHh9iOaZzESFESXnGatnz3+unYr3j7VjUeNjzLPw0GykqS41z7Wx+lDNEwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOCzZOzx
o2tWtD2y575fdtQceff9OVc+8KWOw22LT4y8eunKHWum/HTWsr7105rmb9jYcKT56X1zbo2i2nS/RKZ74kTL5b9NjZ3fvXfcG427n6vpq8yOG7bD8g6OXg87s0ZH0cq8Sk8Y9q81UZQqLKSb0Y7iwl3pnW+HAgAAAJ8nX0n/rsi1M3GwqqCdSKfJRPpfkAmL77Quiu6ePG9SW9dT45YkF/Zc/HipEuMlLzherl17/ieRF4xD/I2Pd74eDm0sGmdg8RHjef7SMWPefmty/eSvT5v7xA3PjOru+r8nZ2xJ/bGu5oUrru+tf/a6ovxfO3D+D2dO/gcAAOA/If/HxxnYYPn/jqVTt7z+i2Grft3a8MTB+h1/bv3OMzsXneq54Ud9L09N3v7o1UX5f1LBUxbl/zDjkP8roovL/wAAAPBZ9t/O/8micQY2WP5vONM3+wcHX6vr+PucxXt+9dAVi8+e/tv8U7t3DV9zR8v6uoeuLMr/M8vL/8Pypx0e/F2Y8OrRUTSz/JMKAAAAFAj/737+o4WQ1zOfHMTz+rX/vKp5380ffPMbD97zpzff/s2xA7MnrdteN/PgyzfVf1j5ve3dRfk/WV7+r/p0Xi4AAABQhuePrpw773jPucfPvtB18vDu3pMznjyzrqnvdOslLatXbTr2WlH+T5WX/0cMzcsBAAAALuDeO59bsfnVl/oe2H/X2Ck9FVc1XpK4ZduOqU0TPuq8tPfy7VuL8v/y8vL/yOw2u/Ih06kz/BVC6+goqu7fWZspHIparskVAAAAgE9IyOlbP1ixbOzOsb3jj59+rObQG4dn/2Vt55yN13RXdW/uXNZ4WdH9AkJiL3X//3Cng7D+v+D+f0Xr//MKmbv+zXZjAAAAAL6Iitfzh9vjZ765oNT375e7/n9J3cQTiba33lv11XMHzo1ZsP/7129aV9/be8+El37/wz9M/6i6KP83l5f/K/O3n+T3/wEAAMBF+F/7/r+lReMMbLD7/zdV9DWsWrd3+uota7csTCw7UH3qwdV731+w5l9Tb36+qea6A0X5v6W8/B+2o/JfXkc4P5tGR9H4/p3s3QR/Hqa7OlZor8orZE58rMeNoUe20D4ir5C2Ntbja6OjaHL/TnOs8P+h0BIrnK3JFnbFCsdCIXs95Ap7YoWOcKVtq8lON154MRSyCyzawwqKUbklEbEe75Xq0V+4YI+u3JMDAAB8oYTwnM2yVYXNKB5l2xODHTBysAMqBjugcrADhsUOiB9Y6vFoeWEhPH575yMbNjVMSb7y8NzHfvbms40T9j1+WV3v5g9f2XbvxJ3TW6YW5f9d5eX/cCqGZzal1v9HYf1/9nsNc+v/l4dCbazQHgqp+B0DUuE5MmH34fActalsj7PjcwUAAAD4XAufC1QO8TwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/s3evcdJVd0JAj/d9INumqaNE9GMk3TUgGakaWwNw+AoaoxGRZpZddxkNBBoEGmE8FgFURtQZxziZ3ztrJnoCAoiu+qHGFeDwUhcxIw6iWLiA/Cxjq7r+h6VGM2E/XTfOkXVrS67EFDa+X7/6DpVv/O89eg69946FwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACA/xjuPfjlk4YunP0PHzace8nqqqmL/kfH6Mv+cNW3vvjUPy5b9G9h/i9GnLll3kEXHjd/wbR/6Vi++ogzQmjtKleWFC977oqvPtS613HP3jFw48wbb63fUpWpNxMP/Tr/lGfuXBxbfbF/CHeXhVCRDgypSwKVmft1sb5960LYI2wLZEu01SYl0g2HB2pCWBK2BbJVra4JoS4ncMqG+++7vDNxTU0IXwkhVKfbeKY6aaMmHRhUlQRq04HpFUngt1sT2cBPypMA7LD4Zsi+6Fe15mdo6L5ckddf5U7r2KcrPbw+MdFQPN/rR+3iTuWoSj/QukNPW0F17BIFb4+13m294N1WsJ2v8LTlfpHKfEPZui1UHcontk0aP6d9dnykPDQ19SlW0y56np9+e/6E7Un3mtdh7EDDTnkdXvrYiun9lo2+9OrNvxqz4ayaA3a0m0/lbNLc9K5WHTKvuV7zPEajfJ70grdfwbekRl+6Qghbzz17xtfnTDz7iD63PLnu1QcfrNty9pwFvzhz4nmLLj55w7/Pf6lg/t/w0fP/+HKOt+V5uWOrH9Ync/P4SF1MvFmfzM0BAACg1+gNe01Xnv/6X73+/bWtMxed/u23Dj73w71afz3i/gFVB7yxrqn1/I2ff6Vg/t9Y2vH/eMi/Lne0a0MY1ZVYNCCEvbseTwIrY3e+OyCEL3elWvMDR6UCa0PYpytxULaqVIm+sURjKvByfSYwKhVYHwOtqcDyGLgiFbg4BlalAhNiYG0qcHQMhCn54/hqfWYcJQdqYmBcshFXxbMQ3qmPraW21aZsVQAAADtJZnZYmX8351yHHc0Qp5eranrKEM/ALpqhOlVDegabnVYVraGipxrKe6ohO+6Ojx5+Qc1lPdVccBpGWX6GG9f85X2LXjzsC2P3mvj5xUMvmPKz8eGst++uerx5yYtv7XvEzesK5v/NHz3/r+6mI2UFx/9DGNv1N+Yuz0Tas/FxrXkZAAAAgB1w0R//xR61Lw85oGHT+2X3zl/7xKMrfrl5j1NOf3/c8a//8PCaxnsL5v+jSjv/P+4T6ZOTOTwSd0NMHRBCc34g
qXZkYSA56t0vEwAAAIDeIHs8PnssfErmNjlFOz2fLszfup3544H/Ud3m//09/7P2jq3/+mLZBd89d0TNgKX/9GrHhBNOPvqW47/1zj4VB/yyvGD+31ra+f+1+bdJJ9bHXlw9IIS+OYEHYy87A10aY+D5I/MDmfGvjxtgcawqc2JCtqrFscS4GGhOBZYUK/FotsTe+YHMk5VtfFF2HFMyJXICAAAA8ImLuwPicfl4/n/LGSNO++vvzfrbha88eN7qCy75q+Ed80eedP/THzbMvXJp2PTmEQXz/3Hbd/5/1zy44PT+9n4hDK0IoU/6hwGP1CYLA8ZAXVkmcW9tUlefdFULa0MY2TmwdFUvZNb/r0ivMfh4TVJVDOy93y1vD+pMLKsJYWhu4IlvLz2sMzEnFcg2flpNCF/qHG268bv6Jo1Xphu/tm8IX8wJZKua0DeEzsaq0lX9r+rMdQzSVa2qDmHPnEC2quHVIcwNAPRW8X/pxNwHZ82dN3V8e3vbzF2YiDvxa8KkKe1tTROmt0+sLtKniak+561jtKBwTKVe+mZTZo2ixSsnV5aSzv5QsDm3rcyO/IIzBzP345ehyq5xHlKZd7clPeQD9y9sIuR8lSo25PJdPOTa3Eq2PYkF9cf8VaFf6DtnVtvMpvPGz549c1jyt9TshyR/43GmZFsNS2+r2u76VsLLo+hyWSkfd1sNyq1k6OxpM4bOmjtvyJRp4ye3TW47p+XQP2sZMXz410YM7RxUc/K3h5EO6q7m1Ei3Li1xWDtxpF+oyKnkk/jQkJCQ6G2J/f7L5odH77n+nOt/9tqPz+/3zdPu3fvImT889KqpD1Xve/ji24ccWDD/n/HR8//4qRM/+DPrMxQ7/t8QD/Mnj287zD8uBpaUevy/odjR/OyJAY2pQEcMdDjMDwAAwGdD3B0Z92bGndKbb1m/buOSlrk/aHin5dY17Utvuum+U39y58ATvjQ47LXhuhM+VzD/7yjt9/87af3/7NL1JxRb5v+gWKK52Pr/6WX+s+v/dxRb/z+9zH92/f8ln8L6/3OygdQmecf6/wAAwGfBJ7f+f4/L+6cvEFCQocfl/dMXCCjI0OMy/qVeIGC71/9f8+Bff6Wq35g7/qTlN/WXvPZ39xzWeuS6zTP/5Etb10+877qxt6wpmP9fUdr838L9AAAAsPv4z5ddU3H02Xff0bJu6sZxbw5+98m3lgzq80HF0Q+3j3xh4Bu3nlcw/19S2vz/k1//LxQ7/7+xWKC12MKA1v8DAACglyq2/t89Q1sa/zCm/x+eHvab5Q/ePPqnj/z898v3+/mJPyvfZ8Gxz8+8bFLB/H9VafP/eNpFeV7u2JsP65M17UJ6Tbs367M/GQAAAIDeoTw0NVWWmDdvYdSjPn6bT2eWAv2odK7vvXLt2ZtfmH7c46ev+7uaEwbvOWHaBasa/2b4gXd+ftQley7ddGrB/H9tafP/vN9lXPrYiun9lo2+9MOrN/9qzIazag7YdvwfAAAA2HVK3S8BAAAAAAAAAAAAAAB8+s7tWHzhI8uOfe+bt//F/kcseXXwbXcd+Lsh/V664qoHJq1648zJXy/4/X8Y21Wu2O//43X/4u8L/igvd2y15/X/MvdPOfH2uV1LFj5SH8L+uYGpC6fuETLX5h+cG7jvjIMGdiYWpkusefbolzoT30kHjh/yuS2dicNTgXFxkcR90oF4VcUt/VOBuLzi4+lA3B6r0oGqTOCy/sk4ytLb6pW6ZFuVpbfVxroQBuQEstvq7rqkjbL0AK9JBbID/F46EAd4ciZQnu7V7f2SXsVAXSx6Q7+kVwAA7Lbit8DKMGlKe1tz/Aofb79QkX8b5S1ZtqCw2rISm9+UWZps8crJlaWk+6S/i2671nhlqO4cwrCCr6u5Wcq6Rrlzaulh0/1RkSH3tNpbeZFyadu76aqKj6gmGVHThOntEyt7HHhLz1kOqegxy7CCyU5ulvKuTVpCLSX0pYQRlbhtSuhyvF8e
mpr6pHL9eQw2hDw9vSJK/b1+7jp/xV4FuXluO/TKt758zE+f++CfP/9E/2+cVnP7rO+/e+KvX7//wEOOuG5C05otBfP/htLm/9W549qSuRhAR7yy3sgBIYwrcUQAAADw2XfbRbfecfr09a9MWlvx5GOPTS0fc3rl1vl3zp93ycZ7Fx9/2cErdjR+2Fm//f5vBu//b89e9dJPR+7zwA03/58nD3v8z3//8I8eeqduZZ+x7xXM/xtLm//HPViZQ8HJ3o618fr/iwaE0HVp/YYksDIO97sDQvhyV6o1lkguqH9CLNGcBFbGHSYHxRLjWvOr6hsDq1KBl+szgbWpwPoYyOyluCVkduVcWR/CYV2psfklZsQSDanAmBhoTAWaYqA5FegfA6NSgdf6ZwKtqcDDMRCm5G+rH/fPbCsAAIDtkZlnVebfDel53qqKnjKU9ZShtqcM5T1lqO4pQ7FRxPt3xAyVqZNXynIyVaZrrUnVUpAhXgx/u/tVkCE8mp8zXbCg6Xj+QfZ8g7L8DFf+4NlT1w+e/tDqzcd8beBt/zhkz4Obp9e9t+CGp3475pzrnv/TQQXz/+bS5v+1+bdJ6+vj/H/b9f+SwIOxe1fHU8cbY+D5I/MDmR0D6+Nkd3G2qtZMicykfXEsMSoGGlOBGTEwKhUYNzYTWDIwP5CZaWcbX5RtfEqmRE4AAAAAPnFxB0HcTRPn/zce9YOr3x8wccuyeTPvH9vyxMmjv3H1XT+6d/9ld767YvCAce99p2D+P6q0+X9sr19uYxfH3rzYP4S7y7b1JhsYUpcE4n6Muvjz+H3rQtgjZwdHtkRbbVKiKtVweKAm+YV6Vbqq1TXJGgPx/ikb7r/v8s7ENTUhfCVn70u2jWeqkzZq0oFBVUmgNh2YXpEE4p6fbOAn5UkAdlh2r2B8QWVOdclq6L5ckdffZ+WaoOnhFewD7SZfd7+52lWq0w9k9qlmbd/TVlAdu0TB22Otd1tvfLc1eLflfpHKfEPZui1UHcontk0aP6d9dnwk95esBXbR85z7K9VS0jvhddjx8Xvbs+p0B5pTHx/N3Zfr/nVYFqu79LEV0/stG33p1Zt/NWbDWTUHlNyNIuIPhX+05X9XPpWzeXe16pB5zfW6z5NWnye98d9Ao6cthHDZ9cfsu+TdX+/33A3Pnbqu7Maxr/7lrHs2Lf+bysNHrXv/yaGjLy+Y/7eWNv+vSN12+V3cmLMGhHBgzsZ9JG7+YwYkn4M5geRTcs/CQHLI/V/ri35yAgAAwM6W3d2R3V8wJXObnBCenicX5m/dzvxxf8WobvOX2u9j121cedLQN6474G8vOPGNv7/28Kceuv6ysnXL//vYD1avuXzxe08UzP/HffT8v2+qm47/O/7PLuL4f7d2913RfdMPdOzQruiC6tglHP/v1u7+bnP8v1uO/zv+3x3H/3vg+H+3dvenreBb0gxfujonwdff+fPfTbzpg7mN+x180lPPHDrxun+6quXuu0555b+de9601761uWD+P6O0+b/1/7pftC+7/t+4Yuv/zSi2/l+H9f8AAIBdqshCc+l5XsHqfQUZ0qv3FWTocYHAHpcYtP7fdq//t3Dkv1904Q+fb7n2nTvHXb5m07Fnvvr0utXPzFpx3Lnnv9V6112tBfP/jtLm//Hl0C+39d6y/l/j2CJVXREDMywMCAAAwO6o2A4CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPl2HnvbO+5d8/R/aBv1ixc1/f+v/+7/P1q594JvfuGn4L6f86RllazZcM+LMLfMOuvC4+Qum/UvH8tVHnBHClK5yZUnxsueu+OpDrXsd9+wdAzfOvPHW+i3VmXorM7d/nJc7tvphfQhLch6pi4k36zvvbAuccuLtcys6E4/Uh7B/bmDqwql7dCaW14cwODdw3xkHDexMLEyXWPPs0S91Jr6TDhw/5HNbOhOHZwJl6e5e1z/pblm6u5f3D2FATiDb3bP751eVbeO4TKA83caKuqSNGKiLRa+tS9qIgfZYYkrfEIZWhNAnXdU/VydV9UlX
dU91UlWfdFUXVYcwMoRQka7quaqkqor0yB+tSqqKgb33u+XtQZ2JpVUhDM0NPPHtpYd1JmamAtnG/1NVCF/qfMmkG/9xZdJ4Zbrx/1oZwhdDCFXpEu9VJCWq0iVeqAhhz5zAto1YEcLcwGdD/PSZmPvgrLnzpo5vb2+buQsTVZm2asKkKe1tTROmt0+sTvWpmLKc9NYFH3/sm96eP6HzdvHKyZWlpCsy5Sq7unxIZd7dlt2997FftbmVbHs+CuqP+atCv9B3zqy2mU3njZ89e+aw5G+p2Q9J/vbJRJNtNay3bKtBuZUMnT1txtBZc+cNmTJt/OS2yW3ntBz6Zy0jhg//2oihnYNqTv7ujJEu/eRH+oWKnEo+ife/hIREb0uU5326Ne/un+MFX/S3dbQyVHd9QBdMK3KzlHWNcmcM+qiPOeKP8zWlxxENK5g4FGQ5pOcsLQWTiW1ZapIsXV/rCiaHuTWVd23SeL88NDX1KbYdGvLv5m7e13dg8z6d2XSlpgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA+P/swIEAAAAAAJD/ayNUVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVYQcOBAAAAACA/F8boaqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqgo7cCwAAAAAIMzfOoyeDQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC4FAAA//8fSxmR")
ioctl$AUTOFS_DEV_IOCTL_REQUESTER(0xffffffffffffffff, 0xc018937b, 0x0)
fchownat(r0, &(0x7f0000000340)='./file0\x00', 0x0, 0x0, 0x0)


r0 = open_tree(0xffffffffffffff9c, &(0x7f0000000640)='\x00', 0x89901)
fchdir(r0)
getcwd(&(0x7f0000000080)=""/140, 0x8c)


syz_mount_image$ext4(&(0x7f00000002c0)='ext4\x00', &(0x7f0000000300)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x10, &(0x7f0000000680), 0xfe, 0x244, &(0x7f0000000400)="$eJzs3T9oJFUcB/DvzO565m6RUxtB/AMiooFwdoJNbBQCEoKIoEJExEZJhJhgl1jZWGitksomiJ3RUtIEG0WwipoiNoIGC4OFFiu7k0hMVqNu3Dkynw9MZibz3vzesPN9u83sBmisq0mmk7SSTCbpJCmON7i7Wq4e7q5PbM8nvd4TPxWDdtV+5ajflSRrSR5KslUWeamdrGw+s/fLzmP3vbncuff9zacnxnqRh/b3dh8/eG/2jY9mHlz54qsfZotMp/un6zp/xZD/tYvklv+j2HWiaNc9Av6Judc+/Lqf+1uT3DPIfydlqhfvraUbtjp54N2/6vv2j1/ePs6xAuev1+v03wPXekDjlEm6KcqpJNV2WU5NVZ/hv2ldLl9eXHp18sXF5YUX6p6pgPPSTXYf/eTSx1dO5P/7VpV/4OLq5//JuY1v+9sHrbpHA4zFHdWqn//J51bvj/xD48g/NJf8Q3PJPzSX/ENzyT80l/zDBdb5+8PyD80l/9Bc8g/NdTz/AECz9C7V/QQyUJe65x8AAAAAAAAAAAAAAAAAAOC09Ynt+aNlXDU/eyfZfyRJe1j91uD3iJMbB38v/1z0m/2hqLqN5Nm7RjzBiD6o+enrm76rt/7nd9Zbf3UhWXs9ybV2+/T9Vxzef//dzWcc7zw/YoF/qTix//BT461/0m8b9daf2Uk+7c8/14bNP2VuG6yHzz/ds79i+Uyv/DriCQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABib3wMAAP//+kBtTA==")
r0 = creat(&(0x7f0000000040)='./file0\x00', 0x0)
mknodat$loop(0xffffffffffffff9c, &(0x7f0000000a40)='./file2\x00', 0xc000, 0x1)
r1 = openat$cgroup_pressure(0xffffffffffffffff, &(0x7f0000000080)='cpu.pressure\x00', 0x2, 0x0)
r2 = openat$cgroup_ro(0xffffffffffffffff, &(0x7f0000000100)='blkio.bfq.io_merged_recursive\x00', 0x0, 0x0)
sendfile(r1, r2, &(0x7f00000001c0)=0x8, 0x7)
chdir(&(0x7f0000000240)='./file0\x00')
r3 = openat$smackfs_cipsonum(0xffffffffffffff9c, &(0x7f0000000200)='/sys/fs/smackfs/direct\x00', 0x2, 0x0)
pwrite64(r3, &(0x7f0000000280)="7098130cc6f2e946c77711d659b33f3afde372", 0x13, 0xe7)
open(&(0x7f0000000000)='./file2\x00', 0x0, 0x0)
r4 = creat(&(0x7f0000002440)='./file0\x00', 0x0)
write$cgroup_type(r4, &(0x7f0000000240), 0xfb3f)
fallocate(r0, 0x8, 0x0, 0x8000)
read$smackfs_cipsonum(0xffffffffffffffff, &(0x7f0000000080), 0x14)
readv(0xffffffffffffffff, &(0x7f00000003c0)=[{&(0x7f0000000040)=""/24, 0x18}, {0x0}], 0x2)
fcntl$setflags(r4, 0x2, 0x1)
pwritev(r0, &(0x7f0000000040)=[{&(0x7f0000000180)="80", 0x1}], 0x1, 0x0, 0x0)


syz_open_procfs$namespace(0x0, 0xfffffffffffffffe)


r0 = syz_mount_image$fuse(&(0x7f0000000080), &(0x7f0000000000)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
mkdir(&(0x7f0000000300)='./file0\x00', 0x0)
r1 = syz_mount_image$nilfs2(&(0x7f0000000180), &(0x7f0000000a80)='./file1\x00', 0x0, &(0x7f00000001c0)=ANY=[], 0x1, 0xa95, &(0x7f0000000ac0)="$eJzs3V2IXNUBAOBzZ3c22WRtJjapq0ZNa4v2x103Jq021UQMFCmEQB8K2pcQYxqypqURqSI0+lD6IgpinvoQiy99UdpS1BcJPhQpKKW0CH2y0tcUhD5IIU7JzDmzM2dnvLOb3Z2Zne+DO2fOPXfuOXfmzp07956fAIytSuNx//6tIYRX3n75yC2vvvfi1TmHWkvUGo+TbbFqCKGI8clsfR9NNMMrnzx7oltYhH2NxxQPRy+3Xrs9hHA+7A2XQi288OiFyQ/fPPrB6+HMU8eOvP/W+mz9kmK9MwAAgCFw7NLB/bv/9debd/7vjVsPhy2t+en8vBbjM/G8/3A8UU7ny5XQGS/apnZT2XKTcapky010Wa49n2q23GSP/Key9VZ7LLelJP+JtnndthtGWdqPa6GozHXEK5W5ueZ/8tD4Xz9VzJ09vfj4uQEVFFhz/70thLDXZDKN41TfMegjEEBTfr9wmfNre6eutbbJ/vK//GCl++vbVNe0hIyFrc1go/f/nPyHO//XnlNTgbWzCfemv1/Xtl3pezQT4/l9hLz+0kq//2l9+f2Ifs8Bet1HGJX7C73KObHB5VitXuXP94vN6v4YpvfhgSw9fg8at9Pyz3RUPmOgu09d/zeZ1mS6sRh8GVY61Qd9AAKGVl5vrh6l9LxeX56+pSR9a0n6dEn6tpL07SXpMM7++OSL4aVi6XpX/p9+pdfDZrJVfGGF5cmvR640/7ze7+fqcgHjWvPP6xPDUNv+5wv3PvbI35r1/4vW/v9Z3N/3xngtfrcuxQXS9cL8unqr7n+tM5tKj+Wuz4pzXZflG893dS5X7FpaT2g7ziwrx2x6RfOK7o5ey+3pXH8tW246Tluz8ubnJ9uy16Xzj3SomekozdL2VrPtmMrKkY4rO2OYlwNWI+2PnfX/r+59jfr/rdtYs6FaPH568eTdMZ720/cmqluuzl/Y6IID16zf9j+zobP9z0xrfrXSflzYsTS/aD8u1LL5+3rMvyfG0+/cTyamG/PnTvx08bG13ngYc+eefubM8cXFkz/3ZNiehPMhDEExPBmnJ+m4cHxxsMclYP3NP/nEz+bPPf3MXaefOH7q5KmTZxcOLNx734GFhW/fN984r59vP7vfjNWFYXwt/foPuiQAAAAAAAAAAABAv059PH/xH+9+55/N9v9L7f9S+/9U8ze1//911v4/a+bfqiCc2gGmdnzL2ttnHaxOZctV4/TFbP27snx2Z6/7Ugxb4/jF9v8pu7xf11SeG7L5ef+9abnZbH7ehchU1gdJEX61rb18X4nh8zH8bYABKqa7z643e5wo69867eupfwr9Uoym9LmlvSH1Y5Laf6f23qm/ktT/Qzr+79yAMrL2NqJd4aC3EejuP5u5/+/0J2HQ5RidaXoIymDawKleN4oHMBwGPf5nuu6ZwrPvPLy1/s7Drcsalx/sPF7m/Zf25YpOyOlu2MeflP9A89/y2nPFmg4v3Br/7nOOf2c7jn/ZiHn5DaA+HXz+B/9uyzbc1G/++finqR/oXSvL/3sx/7Q1d4T+8q+/muWf3xDq0/1Z/tv6zH/Z9u9ZXf4PxPzT23bn7f3m3yxxUeksR37dON3/S9eNf5Tlfyjb/tS354q3f5UDNR6O+cM4G5VxZldqVMb/7SWvh3EgxtOBMNVzyMc7WWn5U/2K9DuwO1t/UfL7tgnH/63/sl7/zbiM4/LdGJZ9H2biZ5r2x1qXeKUtXu3y3m7WYw2Mqo828/2/YZvCEJRh1KbpISjDtU+Tyz77ynDsD/V6faB9+upQeLAG/f4P+n/CoPMf9PtfJh//t9c5fK/0fPzfH2bp+fi/+evz8fXy9Hz83/z9zMf/zdNvy
Nabjw88W5J+Y0n6TSXpN5ek7ylJv6Uk/csl6beWpN9Wkn59SfrtJelfLUn/Wu/0T9sX7fX6O0vW//WS9M0utUcZ1+2HcZa3z/P9h/GR7v/0+v7vKkkHRteFNxYeeuQPP6412/9Pta6HpPt4h2O8Gv87/yLG8/veoS1+Ne3dGP84Sx/26x0wTvL+M+r10PH7fseydL//sFmkel6+3zCGiu499uT323r1W9XrPJ/R8o0YfjOG34rhXTGci+F8DBdiuG+Dysf6eOj3fzr4UrH0f39Hlt5vffK8PVDeT9Q9fZYnvz7Qd332NWoVt+r8o1U2BwMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABiYSuNx//7ZIoRX3n75yMW/fP93V+ccai1RazxOtsWqrdeFcHcMJ2J4MT658smzJ9rDz2JYhH2hCEVrfjh6uZXT9hDC+bA3XAq18MKjFyY/fPPoB6+HM08dO/L+W+v3DjQV650BAAAADND/AwAA//83IBcJ")
open$dir(&(0x7f0000000240)='./file2\x00', 0x0, 0x0)
rmdir(&(0x7f00000002c0)='./file0\x00')
r2 = syz_mount_image$btrfs(&(0x7f0000000000), &(0x7f00000015c0)='./file0\x00', 0x0, &(0x7f0000001600), 0x0, 0x559e, &(0x7f0000005680)="$eJzs3X9sVeX9B/BzWwoN+C39jhUYfxAgBoMkyJYtjqB4MQa24eKlgsKcCEQlBivYRDcYqUWSZcaghU4EF5GQaDJjscM/FMywy7CMZfzY5hZjs4JSaZZsAzVrHDG69N77XO49l9tembNOXy/SnvPcz3me+9yT88d9X/qcGwEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAURUcSc9+d0f3i0ZE1X77/Hz+e+OjGn4zfvX/roVvu23T/gjMjbto5a1nf+mlN8zdsbDjS/PS+ObdGUSLdL5Htf9u136q/88bbvlsdBly+MLOtrS31lJmuJzON4QUP9vcr/FkRRVFVbIDK7PbV7E5FwQC53cbiAQf0Tuui6O7J8ya1dT01bklyYU/xS6df9VBPYKhkr6ue89dSMv27InZErp136SUKLtFM//gF96m8CADgY5mZSm9yb0ezb3Fz7eZ4PdZOxtotsXZ4h9CS37gYmXGHl5rnpHh9iOaZzESFESXnGatnz3+unYr3j7VjUeNjzLPw0GykqS41z7Wx+lDNEwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOCzZOzx
o2tWtD2y575fdtQceff9OVc+8KWOw22LT4y8eunKHWum/HTWsr7105rmb9jYcKT56X1zbo2i2nS/RKZ74kTL5b9NjZ3fvXfcG427n6vpq8yOG7bD8g6OXg87s0ZH0cq8Sk8Y9q81UZQqLKSb0Y7iwl3pnW+HAgAAAJ8nX0n/rsi1M3GwqqCdSKfJRPpfkAmL77Quiu6ePG9SW9dT45YkF/Zc/HipEuMlLzherl17/ieRF4xD/I2Pd74eDm0sGmdg8RHjef7SMWPefmty/eSvT5v7xA3PjOru+r8nZ2xJ/bGu5oUrru+tf/a6ovxfO3D+D2dO/gcAAOA/If/HxxnYYPn/jqVTt7z+i2Grft3a8MTB+h1/bv3OMzsXneq54Ud9L09N3v7o1UX5f1LBUxbl/zDjkP8roovL/wAAAPBZ9t/O/8micQY2WP5vONM3+wcHX6vr+PucxXt+9dAVi8+e/tv8U7t3DV9zR8v6uoeuLMr/M8vL/8Pypx0e/F2Y8OrRUTSz/JMKAAAAFAj/737+o4WQ1zOfHMTz+rX/vKp5380ffPMbD97zpzff/s2xA7MnrdteN/PgyzfVf1j5ve3dRfk/WV7+r/p0Xi4AAABQhuePrpw773jPucfPvtB18vDu3pMznjyzrqnvdOslLatXbTr2WlH+T5WX/0cMzcsBAAAALuDeO59bsfnVl/oe2H/X2Ck9FVc1XpK4ZduOqU0TPuq8tPfy7VuL8v/y8vL/yOw2u/Ih06kz/BVC6+goqu7fWZspHIparskVAAAAgE9IyOlbP1ixbOzOsb3jj59+rObQG4dn/2Vt55yN13RXdW/uXNZ4WdH9AkJiL3X//3Cng7D+v+D+f0Xr//MKmbv+zXZjAAAAAL6Iitfzh9vjZ765oNT375e7/n9J3cQTiba33lv11XMHzo1ZsP/7129aV9/be8+El37/wz9M/6i6KP83l5f/K/O3n+T3/wEAAMBF+F/7/r+lReMMbLD7/zdV9DWsWrd3+uota7csTCw7UH3qwdV731+w5l9Tb36+qea6A0X5v6W8/B+2o/JfXkc4P5tGR9H4/p3s3QR/Hqa7OlZor8orZE58rMeNoUe20D4ir5C2Ntbja6OjaHL/TnOs8P+h0BIrnK3JFnbFCsdCIXs95Ap7YoWOcKVtq8lON154MRSyCyzawwqKUbklEbEe75Xq0V+4YI+u3JMDAAB8oYTwnM2yVYXNKB5l2xODHTBysAMqBjugcrADhsUOiB9Y6vFoeWEhPH575yMbNjVMSb7y8NzHfvbms40T9j1+WV3v5g9f2XbvxJ3TW6YW5f9d5eX/cCqGZzal1v9HYf1/9nsNc+v/l4dCbazQHgqp+B0DUuE5MmH34fActalsj7PjcwUAAAD4XAufC1QO8TwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/s3evcdJVd0JAj/d9INumqaNE9GMk3TUgGakaWwNw+AoaoxGRZpZddxkNBBoEGmE8FgFURtQZxziZ3ztrJnoCAoiu+qHGFeDwUhcxIw6iWLiA/Cxjq7r+h6VGM2E/XTfOkXVrS67EFDa+X7/6DpVv/O89eg69946FwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACA/xjuPfjlk4YunP0PHzace8nqqqmL/kfH6Mv+cNW3vvjUPy5b9G9h/i9GnLll3kEXHjd/wbR/6Vi++ogzQmjtKleWFC977oqvPtS613HP3jFw48wbb63fUpWpNxMP/Tr/lGfuXBxbfbF/CHeXhVCRDgypSwKVmft1sb5960LYI2wLZEu01SYl0g2HB2pCWBK2BbJVra4JoS4ncMqG+++7vDNxTU0IXwkhVKfbeKY6aaMmHRhUlQRq04HpFUngt1sT2cBPypMA7LD4Zsi+6Fe15mdo6L5ckddf5U7r2KcrPbw+MdFQPN/rR+3iTuWoSj/QukNPW0F17BIFb4+13m294N1WsJ2v8LTlfpHKfEPZui1UHcontk0aP6d9dnykPDQ19SlW0y56np9+e/6E7Un3mtdh7EDDTnkdXvrYiun9lo2+9OrNvxqz4ayaA3a0m0/lbNLc9K5WHTKvuV7zPEajfJ70grdfwbekRl+6Qghbzz17xtfnTDz7iD63PLnu1QcfrNty9pwFvzhz4nmLLj55w7/Pf6lg/t/w0fP/+HKOt+V5uWOrH9Ync/P4SF1MvFmfzM0BAACg1+gNe01Xnv/6X73+/bWtMxed/u23Dj73w71afz3i/gFVB7yxrqn1/I2ff6Vg/t9Y2vH/eMi/Lne0a0MY1ZVYNCCEvbseTwIrY3e+OyCEL3elWvMDR6UCa0PYpytxULaqVIm+sURjKvByfSYwKhVYHwOtqcDyGLgiFbg4BlalAhNiYG0qcHQMhCn54/hqfWYcJQdqYmBcshFXxbMQ3qmPraW21aZsVQAAADtJZnZYmX8351yHHc0Qp5eranrKEM/ALpqhOlVDegabnVYVraGipxrKe6ohO+6Ojx5+Qc1lPdVccBpGWX6GG9f85X2LXjzsC2P3mvj5xUMvmPKz8eGst++uerx5yYtv7XvEzesK5v/NHz3/r+6mI2UFx/9DGNv1N+Yuz0Tas/FxrXkZAAAAgB1w0R//xR61Lw85oGHT+2X3zl/7xKMrfrl5j1NOf3/c8a//8PCaxnsL5v+jSjv/P+4T6ZOTOTwSd0NMHRBCc34g
qXZkYSA56t0vEwAAAIDeIHs8PnssfErmNjlFOz2fLszfup3544H/Ud3m//09/7P2jq3/+mLZBd89d0TNgKX/9GrHhBNOPvqW47/1zj4VB/yyvGD+31ra+f+1+bdJJ9bHXlw9IIS+OYEHYy87A10aY+D5I/MDmfGvjxtgcawqc2JCtqrFscS4GGhOBZYUK/FotsTe+YHMk5VtfFF2HFMyJXICAAAA8ImLuwPicfl4/n/LGSNO++vvzfrbha88eN7qCy75q+Ed80eedP/THzbMvXJp2PTmEQXz/3Hbd/5/1zy44PT+9n4hDK0IoU/6hwGP1CYLA8ZAXVkmcW9tUlefdFULa0MY2TmwdFUvZNb/r0ivMfh4TVJVDOy93y1vD+pMLKsJYWhu4IlvLz2sMzEnFcg2flpNCF/qHG268bv6Jo1Xphu/tm8IX8wJZKua0DeEzsaq0lX9r+rMdQzSVa2qDmHPnEC2quHVIcwNAPRW8X/pxNwHZ82dN3V8e3vbzF2YiDvxa8KkKe1tTROmt0+sLtKniak+561jtKBwTKVe+mZTZo2ixSsnV5aSzv5QsDm3rcyO/IIzBzP345ehyq5xHlKZd7clPeQD9y9sIuR8lSo25PJdPOTa3Eq2PYkF9cf8VaFf6DtnVtvMpvPGz549c1jyt9TshyR/43GmZFsNS2+r2u76VsLLo+hyWSkfd1sNyq1k6OxpM4bOmjtvyJRp4ye3TW47p+XQP2sZMXz410YM7RxUc/K3h5EO6q7m1Ei3Li1xWDtxpF+oyKnkk/jQkJCQ6G2J/f7L5odH77n+nOt/9tqPz+/3zdPu3fvImT889KqpD1Xve/ji24ccWDD/n/HR8//4qRM/+DPrMxQ7/t8QD/Mnj287zD8uBpaUevy/odjR/OyJAY2pQEcMdDjMDwAAwGdD3B0Z92bGndKbb1m/buOSlrk/aHin5dY17Utvuum+U39y58ATvjQ47LXhuhM+VzD/7yjt9/87af3/7NL1JxRb5v+gWKK52Pr/6WX+s+v/dxRb/z+9zH92/f8ln8L6/3OygdQmecf6/wAAwGfBJ7f+f4/L+6cvEFCQocfl/dMXCCjI0OMy/qVeIGC71/9f8+Bff6Wq35g7/qTlN/WXvPZ39xzWeuS6zTP/5Etb10+877qxt6wpmP9fUdr838L9AAAAsPv4z5ddU3H02Xff0bJu6sZxbw5+98m3lgzq80HF0Q+3j3xh4Bu3nlcw/19S2vz/k1//LxQ7/7+xWKC12MKA1v8DAACglyq2/t89Q1sa/zCm/x+eHvab5Q/ePPqnj/z898v3+/mJPyvfZ8Gxz8+8bFLB/H9VafP/eNpFeV7u2JsP65M17UJ6Tbs367M/GQAAAIDeoTw0NVWWmDdvYdSjPn6bT2eWAv2odK7vvXLt2ZtfmH7c46ev+7uaEwbvOWHaBasa/2b4gXd+ftQley7ddGrB/H9tafP/vN9lXPrYiun9lo2+9MOrN/9qzIazag7YdvwfAAAA2HVK3S8BAAAAAAAAAAAAAAB8+s7tWHzhI8uOfe+bt//F/kcseXXwbXcd+Lsh/V664qoHJq1648zJXy/4/X8Y21Wu2O//43X/4u8L/igvd2y15/X/MvdPOfH2uV1LFj5SH8L+uYGpC6fuETLX5h+cG7jvjIMGdiYWpkusefbolzoT30kHjh/yuS2dicNTgXFxkcR90oF4VcUt/VOBuLzi4+lA3B6r0oGqTOCy/sk4ytLb6pW6ZFuVpbfVxroQBuQEstvq7rqkjbL0AK9JBbID/F46EAd4ciZQnu7V7f2SXsVAXSx6Q7+kVwAA7Lbit8DKMGlKe1tz/Aofb79QkX8b5S1ZtqCw2rISm9+UWZps8crJlaWk+6S/i2671nhlqO4cwrCCr6u5Wcq6Rrlzaulh0/1RkSH3tNpbeZFyadu76aqKj6gmGVHThOntEyt7HHhLz1kOqegxy7CCyU5ulvKuTVpCLSX0pYQRlbhtSuhyvF8e
mpr6pHL9eQw2hDw9vSJK/b1+7jp/xV4FuXluO/TKt758zE+f++CfP/9E/2+cVnP7rO+/e+KvX7//wEOOuG5C05otBfP/htLm/9W549qSuRhAR7yy3sgBIYwrcUQAAADw2XfbRbfecfr09a9MWlvx5GOPTS0fc3rl1vl3zp93ycZ7Fx9/2cErdjR+2Fm//f5vBu//b89e9dJPR+7zwA03/58nD3v8z3//8I8eeqduZZ+x7xXM/xtLm//HPViZQ8HJ3o618fr/iwaE0HVp/YYksDIO97sDQvhyV6o1lkguqH9CLNGcBFbGHSYHxRLjWvOr6hsDq1KBl+szgbWpwPoYyOyluCVkduVcWR/CYV2psfklZsQSDanAmBhoTAWaYqA5FegfA6NSgdf6ZwKtqcDDMRCm5G+rH/fPbCsAAIDtkZlnVebfDel53qqKnjKU9ZShtqcM5T1lqO4pQ7FRxPt3xAyVqZNXynIyVaZrrUnVUpAhXgx/u/tVkCE8mp8zXbCg6Xj+QfZ8g7L8DFf+4NlT1w+e/tDqzcd8beBt/zhkz4Obp9e9t+CGp3475pzrnv/TQQXz/+bS5v+1+bdJ6+vj/H/b9f+SwIOxe1fHU8cbY+D5I/MDmR0D6+Nkd3G2qtZMicykfXEsMSoGGlOBGTEwKhUYNzYTWDIwP5CZaWcbX5RtfEqmRE4AAAAAPnFxB0HcTRPn/zce9YOr3x8wccuyeTPvH9vyxMmjv3H1XT+6d/9ld767YvCAce99p2D+P6q0+X9sr19uYxfH3rzYP4S7y7b1JhsYUpcE4n6Muvjz+H3rQtgjZwdHtkRbbVKiKtVweKAm+YV6Vbqq1TXJGgPx/ikb7r/v8s7ENTUhfCVn70u2jWeqkzZq0oFBVUmgNh2YXpEE4p6fbOAn5UkAdlh2r2B8QWVOdclq6L5ckdffZ+WaoOnhFewD7SZfd7+52lWq0w9k9qlmbd/TVlAdu0TB22Otd1tvfLc1eLflfpHKfEPZui1UHcontk0aP6d9dnwk95esBXbR85z7K9VS0jvhddjx8Xvbs+p0B5pTHx/N3Zfr/nVYFqu79LEV0/stG33p1Zt/NWbDWTUHlNyNIuIPhX+05X9XPpWzeXe16pB5zfW6z5NWnye98d9Ao6cthHDZ9cfsu+TdX+/33A3Pnbqu7Maxr/7lrHs2Lf+bysNHrXv/yaGjLy+Y/7eWNv+vSN12+V3cmLMGhHBgzsZ9JG7+YwYkn4M5geRTcs/CQHLI/V/ri35yAgAAwM6W3d2R3V8wJXObnBCenicX5m/dzvxxf8WobvOX2u9j121cedLQN6474G8vOPGNv7/28Kceuv6ysnXL//vYD1avuXzxe08UzP/HffT8v2+qm47/O/7PLuL4f7d2913RfdMPdOzQruiC6tglHP/v1u7+bnP8v1uO/zv+3x3H/3vg+H+3dvenreBb0gxfujonwdff+fPfTbzpg7mN+x180lPPHDrxun+6quXuu0555b+de9601761uWD+P6O0+b/1/7pftC+7/t+4Yuv/zSi2/l+H9f8AAIBdqshCc+l5XsHqfQUZ0qv3FWTocYHAHpcYtP7fdq//t3Dkv1904Q+fb7n2nTvHXb5m07Fnvvr0utXPzFpx3Lnnv9V6112tBfP/jtLm//Hl0C+39d6y/l/j2CJVXREDMywMCAAAwO6o2A4CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPl2HnvbO+5d8/R/aBv1ixc1/f+v/+7/P1q594JvfuGn4L6f86RllazZcM+LMLfMOuvC4+Qum/UvH8tVHnBHClK5yZUnxsueu+OpDrXsd9+wdAzfOvPHW+i3VmXorM7d/nJc7tvphfQhLch6pi4k36zvvbAuccuLtcys6E4/Uh7B/bmDqwql7dCaW14cwODdw3xkHDexMLEyXWPPs0S91Jr6TDhw/5HNbOhOHZwJl6e5e1z/pblm6u5f3D2FATiDb3bP751eVbeO4TKA83caKuqSNGKiLRa+tS9qIgfZYYkrfEIZWhNAnXdU/VydV9UlX
dU91UlWfdFUXVYcwMoRQka7quaqkqor0yB+tSqqKgb33u+XtQZ2JpVUhDM0NPPHtpYd1JmamAtnG/1NVCF/qfMmkG/9xZdJ4Zbrx/1oZwhdDCFXpEu9VJCWq0iVeqAhhz5zAto1YEcLcwGdD/PSZmPvgrLnzpo5vb2+buQsTVZm2asKkKe1tTROmt0+sTvWpmLKc9NYFH3/sm96eP6HzdvHKyZWlpCsy5Sq7unxIZd7dlt2997FftbmVbHs+CuqP+atCv9B3zqy2mU3njZ89e+aw5G+p2Q9J/vbJRJNtNay3bKtBuZUMnT1txtBZc+cNmTJt/OS2yW3ntBz6Zy0jhg//2oihnYNqTv7ujJEu/eRH+oWKnEo+ife/hIREb0uU5326Ne/un+MFX/S3dbQyVHd9QBdMK3KzlHWNcmcM+qiPOeKP8zWlxxENK5g4FGQ5pOcsLQWTiW1ZapIsXV/rCiaHuTWVd23SeL88NDX1KbYdGvLv5m7e13dg8z6d2XSlpgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA+P/swIEAAAAAAJD/ayNUVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVYQcOBAAAAACA/F8boaqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqgo7cCwAAAAAIMzfOoyeDQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC4FAAA//8fSxmR")
ioctl$BTRFS_IOC_QUOTA_CTL(r2, 0xc0109428, &(0x7f00000000c0)={0x4})
chdir(&(0x7f0000000140)='./file0\x00')
openat$vmci(0xffffffffffffff9c, 0x0, 0x2, 0x0)
syz_mount_image$ntfs3(&(0x7f0000000000), &(0x7f0000001580)='./file1\x00', 0x200c040, &(0x7f0000000600)=ANY=[@ANYRESHEX=r0, @ANYRESHEX=r2, @ANYRES64=r1, @ANYBLOB="f5cc32767f784e4e9a23ecead1f2142dda63d2ebece4effd51c66ffd05fe76585be0f9271cedad29b748ac1d9c6fd100ccd383aed4fd1663abfa0c772a48187499dc15d9fd6656c1d88b37ebde396b328170389921672eceaa17bd3c7763451c2fc19ee3606d20d89d7adc6cc5b05cb4cb097e822b1809854d03feec9f6e7023c7cffbaab6f838b00831cbd754a1959b84c49b25d576aebadd43b4b6c425ae3aa355dccedde940c76c15b2a19cf51a016150f08f", @ANYRESDEC=r2, @ANYRESHEX=r2], 0x1, 0x1f795, &(0x7f000000ac40)="$eJzs3Qm8DfXfB/Df7Ps+R3aXbpKQfUmyr9m3kGzZ92whcUm2ZCkh2ZJsSUIlSSRRkl2yJUmSpF0Sz8vcc2/3Xvxr2n494/Pu1XfOmXvOzPcsn5k548ycLxtOqtu4WqOEhIQEwrIk2TmSThJJIpes5Mt2fNwlLnnIEEJ4QsjExIKta/aor10eZ+Zad/+QpXnW99OarjJfl8g2u9WXZ4se35ZpW5YvLzbu3KVvQpe+CT179Utom9CuV69+bdt175DQvkvfboUS6nfv0LZvh4QuPft26JPuzx279+rde1BC257tDbV3nw59+ya07TkooVuHQQn9eiX06zMooW2ntl16JhQqVCjBUAn8QU2W0O4AAAAAAAAAAAAAAAAAAAD+HpcukUsM7SYAAAAAAAAAAAAAAAAAAADgL6lWs3bVwoRLvc4QhpQmDJmUcmB/XMrFlFvahJA2waXMQa0Rv5S02CzQ/fx+5lpD8WpNXOULCIUJIZ1Tp8+SWsElhgjBOOF350O6xycUH6bMl+cSSR1SjTSOX0+K986QCukauRgfVkgZcYG76tC9lHxKhOXppsNd8XgqZHgiE0j6ocPIwfDSpUuXrvYU/TOu7BP+28K+z2SSfpj+fcZdM//NMuSfi6cj5RQh/0b+e6dOnyUNw+bfTtNomvzLqfmvQ7qQPqRPfPy1lgNcxufVvvowN598RpR/N79/FUu7AaCKvSL/7P/IP/v/Kf+pzScP0+a/NulFOpFqpAvpTjrEx18r/ylhTs1/hummDPMQhuORf/h/5cr8c/H8H82QfzG+DEgZZcfzkpL/wn8y/+m38xlSP3TO00uZvsQlkqakF+lO+pMepEMw3aTU+bCkfeoc+aTLjyPl84Af/LV4PBk+mcO4hEmei+DG7x+MS76BcHn7PYEl6W6T8W8kvqwsnDp/nrjxS33JIDKYdCNtSfdgaZSyPXJ52Zcv9fYC0VOf6fjrFH/kSanjM6durWT+w8sh/vdvAhHGX5F/Pp7/9Vf5/M9nWP//Xfn/S+t5PT6h+Fn/0q7nK5J+pB/pQ6qQDqRjfHz65QD3h5cDY8mVy4FgXMjlwLW2M1Ke19TtjAyPK2WYm8sc3Onv2c7A+v/6duX6XwjyzyW/NTKs/4UM+//+V/73273SDVPG5/3tk3Pq/X77HMGQKn99/R9Mn+EKBdfrp8vbb+/3lIeWmjc/eVDATj/MLVjBPCYRQpoFd0xebpSO77tLJDVJMVI4eBxM/Inh489Pvvj/aZvMnGGvzdx4jwxfIf4MJ0u5TQ2S5gm/yjBl/rl5P5jN8pTtBj55O0PO0GdKLyn7f+B6duX+PzG+/q9/lc//4r/8+T/t/v/Qnwv49NNO+7mgEulC+pEepC3pfc31cYqM+1NTbxIf5may/Y3r438b1v/XtyvX/1
I8/6uvsv0v/UPb/2lzXjtszjP8Q1rK9IUg571IL9IvuP5f3e6X4sPU5UzGfxiMD534P+D8vcsZ5P/6dmX+5Xj+B14l/zK1/f/cn9j/n17a/QKVSFvSnlQO9g32Jcl5Sr8fjU29lJRu+zzN9wPiwU3dPr+QK/0wLmVu7KXkO/y3thOQ/+vblflXgvwr6Xf2xz+ZKn/P938SMnbB/O72/9/y7wLBfC9v/zciHch9pD/pQzpkyHeaPMSnkJpvPvnJ6M2kH16eWhXSiORhmOxMfD/BUfLbfoL1qZ+/L98ueWouSb+fYHn8/+RH68cff/IKv0HiyuD5SRkmxW83hBBSk9S84vZzDx+pxDK/DVMeR/ur3p4jZ13u3OXbpAxJhumTNK83G398W8hv+xcWpXl8Kbe3Mzy+6fH/SbBFlTz/fClvrmvMO+PtrvUcZLzdtR57xsfx31oO03Ll5381vv6ffpX1v0rt8z/3J/Kf0mnyMO3n/yakN6lM2pK+8fxfbbt8WHz427/7s1cd5kkiOdg02/dESN5+SIjvh0skNUlP0pH0it8rZeGnd8nW7YGhhw5kfNz/7vsS6//r25Xrfy2+/z+JvXL/n/b3rP/tjF38S+v/YL6X81+VDCT9SAfSk7QP1mfJn3WSc9uM+f39+jWY5P8vy5L8CEgbUjf+/CXXsP3ljS8aRC6R1CPtSFdSM3Ufxd89/Qakf7BfpC3hSNb49DsHn7D++vQvf75qSDqQ3qQt6RMsYdPD+ve/5Mr1v05IsP5PuMr+f/0f2P9H+3s56T/3M6mXro/3J9b/17cr1//G/8i/gfxHDPIfOWm+F/r7rsy/+T/ybyL//z2/9xUe4X/9Efm/vl2Zf+t/5N9C/iMG+b++XZl/O8g/IWNI+vwzqff47XZ/R/6vj5z9VyH/17cr8+8g//8/yX/mTsj/9e3K/LvI/3UE+b++XZl/D/m/jiD/17cr8+8j/9cR5P/6dmX+Y8j/dQT5v75dmf9MyP91BPm/vl2Z/xuQ/+sI8n99uzL/yef/Nkhv9srj/zL/Dcf/5f2Tx//WjR//8/ce/5J8POvA4KlIPs6wWerxrPVS7+uQDMf/pDkuOCH+JOWLH9Z0xXeMmPQPOO1xtGdT58uSHanzbZDutmnnuz7+Pwn20xSOP+/x42Pj0037wFOei0sZpH39/fgRjs3SHI8bdjpX+15VxseL5fx/0bXzf7Xzf2eJSP7THl83Kf4dxJQcFr5K/j2SPocJ5MoH8f/zfY71//Xt2vlPukr+s/4Nx/+Gyf9v5/9hUs//808dnzopZX7sb+cbTF4ONEydRsblQIU05wfIHh8mxE/cqxMyN0f8GU3rv7WcwPk/r2/Jx//mz5D/y8uDMWzG8/+cCIa/5T5ZSu6nc92D3JUe1yPdMGV8wlWO+2f/53o/uZuM5wP8vflknN7l+RYM7tkx/nsfydvNl/Nupznu/+wfOJ/v0ZRz+6TmnSFtgg4JyRa/nrGvjMOU7+llTfM7YFfrL8efnX5GfMbpF0nzxxOpy6Mdqds0yZ+D5Pily/e9I/64m6X+nQ/GZU45bbLIk8TazZrUrJI81TTjql9lXJ16VchXFeMdhFoOXuX9An/FiWvk/8rz/wvks2B4rfz/3vvyipV+yPz/3flKusZ5wVNaSj3vT4bppQxzs2ywK/Pfzs3weFvXzE3Tq469hs/C3Bj+AXTfP59dM/9trjj//1/L/9V+auYfyX9G11j/Xet8n+vjr0kyjqyOj7+8Pb+cEFI3CFjyuXhaBmcR701axn/FpyDpEpxVvBPpEPzfkxQnRUkJUiz4rxQpTEqQoqRlmiVR+svlSXnS8m+eavr3F5/h/fVZhveX8FffX5eeSff+gv+2a+f/3H9s/Z+y/f9HlzMpP5tzeb7Nr5H/q63/U2Q873+Gn+EhuRn+f67/L8+zX3z5PSn173ww7q8uv4PPHppI+vft0KfQwLb9+vUpQuKDq/ytKIkPAulzifU/bf+99T8b5D
/fVdb/zar/8fyn/O7Pn89/8vro9/KfcT4kMXk6+W5MHl65/V8sw3zIH1rOhJ9PcuCutZxJOVwvdTmTYXopw9yKkLKciX914u98n3xG/vTnDPhb/Pfyn7z+3xJy/Z/x974yDsPkv3eabhpnyOXvzSel55RzPqfkUknNZSFyX/C7oMm3CLsdkDLdlGFuxvxv7gf4Q7D+p+2/ln8Sz2BpkvHnYf+Lv18Bf41EuwGgSgqR/+QtVuQ/Sv7USUMgMuQQ+VeCivxHiUK7AaBKCZH/5D3fyH+UqLQbAKrUEPlP/uYt8h8lGu0G4N+WNe1Ody1E/pO/1478R4lOuwGgSg+RfyOoyH+UGLQbAKqMEPk3g4r8R4lJuwGgygyRfyuoyH+UWLQbAKqsEPlPPnwX+Y+SKw7JhuuKHSL/wVmwkP9IcWg3AFQ5IfKf/CubyH+UuLQbAKrcEPkPzn6F/EeKR7sBoMoLkf/gSCLkP1J82g0AVX6I/MeCivxHSYx2A0BVLET+MwUV+Y+STLQbAKoyhcj/DUFF/qPkBtoNAFU34Pjf61pm2g0AVZlD5D9LUJH/KMlCuwGgKkuI/GcNKvIfJVlpNwBUZQ2R/+Qz8CP/UZKNdgNAVbYQ+U/+xSvkP0qy024AqMoeIv/Jv0iH/EdJDtoNAFU5QuQ/Z1CR/yjJSbsBoCpniPznCiryHyW5aDcAVOUKkf/kXylB/qPkar/KBNePhBD5zx1U5D9KctNuAKjKHSL/eYKK/EdJHtoNAC3BT7/lCZH/5B+kRf6j5EbaDQBVN4bIf/IPVCP/UZJIuwGgKjFE/m8KKvIfJTfRbgCouilE/vMGFfmPkry0GwCq8obI/81BRf6j5GbaDQBVN4fIf76gIv9Rko92A0BVvhD5vyWoyH+U3EK7AaDqlhD5zx9U5D9K8tNuAKjKHyL/twYV+Y+SW2k3AP+cP5DTW0Pkv0DyRJH/CClAuwGgqkCI/BcMKvIfJQVpNwBUFQyR/0JBRf6jpBDtBoCqQiHyf1tQkf8ouY12A0DVbSHyXzioyH+UFKbdAFBVOET+iwQV+Y+SIrQbAKqKhMh/0aAi/1Eh0G4AqCsaIv/Fgor8R0kx2g0AVcX+eP7F5AHyHyXFaTcAVBUPsf4vEVTkP0pK0G4AqCoRIv8lg4r8R0lJ2g0AVSVD5L9UUJH/KClFuwGgqlSI/AdjkP9IKU27AaCqdIj8lwkq8h8lZWg3AFSVCZH/24OK/EfJ7bQbAKpuD5H/skFF/qOkLO0GgKqyIfJ/R1CR/yi5g3YDQNUdIfJfLqjIf5SUo90AUFUuRP7vDCryHyV30m4AqLozRP7LBxX5j5LytBsAqsqHyH+FoCL/UVKBdgNAVYUQ+a8YVOQ/SirSbgCoqhgi/5WCivxHSSXaDQBVlULkv3JQkf8oqUy7AaCqcoj8Vwkq8h8lVWg3AFRVCZH/qkFF/qOkKu0GgKqqIfJfLajIf5RUo90AUFUtRP6rBxX5j5LqtBsAqqqHyH+NoCL/UVKDdgNAVY0Q+a8ZVOQ/SmrSbgCoqhki/7WCivxHSS3aDQBVtULk/66gIv9RchftBoCqu0Lkv3ZQ0+Vf/pfahH9IbdoNAFW1Q+S/TlCx/o+SOrQbAKrqhMh/3aAi/1FSl3YDQFXdEPmvF1TkP0rq0W4AqKoXIv/1g4r8R0l92g0AVfVD5L9BUJH/KGlAuwGgqkGI/DcMKvIfJQ1pNwBUNQyR/0ZBRf6jpBHtBoCqRiHy3zioyH+UNKbdAFDVOET+mwQV+Y+SJrQbAKqahMh/06Ai/1HSlHYDQFXTEPm/O6jIf5TcTbsBoOruEPlvFlTkP0qa0W4AqGoWIv/Ng4r8R0lz2g0AVc1D5L9FUJH/KGlBuwGgqkWI/N8TVOQ/Su6h3QBQdU+I/LcMKvIfJS1pNwBUtQyR/3uDivxHyb20GwCq7v0t/2myfvX8twoq8h8lrWg3AH+rsL/n3irE+r91UJH/KGlNuwGgqnWI/LcJKv
IfJW1oNwBUtQmR/7ZBRf6jpC3tBoCqtiHy3y6oyH+UtKPdAFDVLkT+7wsq8h8l99FuAKi6L0T+2wcV+Y+S9rQbAKrah8h/h6Ai/1HSgXYDQFWHEPnvGFTkP0o60m4AqOoYIv+dgor8R0kn2g0AVZ1C5L9zUJH/KOlMuwGgqnOI/HcJKvIfJV1oNwBUdQmR/65BRf6jpCvtBoCqriHy3y2oyH+UdKPdAFDVLUT+uwcV+Y+S7rQbAKq6h8h/j6Ai/1HSg3YDQFWPEPnvGVTkP0p60m4AqOoZIv+9gor8R0kv2g0AVb2ukn/lGvnvHVTkP0p6024AqOodYv1/f1CR/yi5n3YDQNX9IfLfJ6jIf5T0od0AUNUnRP77BhX5j5K+tBsAqvqGyH+/oCL/UdKPdgNAVb8Q+e8fVOQ/SvrTbgCo6h8i/wOCivxHBz79X+8GhMj/A0FF/qPkAdoNAFUPhMj/wKAi/1EykHYDQNXAEPkfFFTkP0oG0W4AqBoUIv+Dg4r8R8lg2g0AVYND5P/BoCL/UfIg7QaAqgdD5H9IUJH/KBlCuwGgakiI/D8UVOQ/Sh6i3QBQ9VCI/A8NKvIfJUNpNwBUDQ2R/2FBRf6jZBjtBoCqYSHynxRU5D9Kkmg3AFQlhcj/8KAi/1EynHYDQNXwEPkfEVTkP0pG0G4AqBoRIv8PBxX5j5KHaTcA/wLmmn95OET+RwYV+Y+SkbQbAKpGhsj/I0FF/qPkEdoNAFWPhMj/qKAi/1EyinYDQNWoEPkfHVTkP0pG024AqBodIv9jgor8R8kY2g0AVWNC5H9sUJH/KBlLuwGgamyI/I8LKvIfJeNoNwBUjQuR/0eDivxHyaO0GwCqHg2R//FBRf6jZDztBoCq8SHy/1hQkf8oeYx2A0DVYyHyPyGoyH+UTKDdAFA1IUT+JwYV+Y+SibQbAKomhsj/pKAi/1EyiXYDQNWkEPmfHFTkP0om024AqJocIv+PBxX5j5LHaTcAVD3+x/MvJg+Q/yh5gnYDQNUTIdb/U4KK/EfJFNoNAC0uCV7/P57/J4OK/EfJk7QbAKqeDJH/qUFF/qNkKu0GgKqpIfI/LajIf5RMo90AUDUtRP6nBxX5j5LptBsAqqaHyP9TQUX+o+Qp2g0AVU+FyP+MoCL/UTKDdgNA1YwQ+X86qMh/lDxNuwGg6ukQ+Z8ZVOQ/SmbSbgComhki/7OCivxHySzaDQBVs0Lkf3ZQkf8omU27AaBqdoj8zwkq8h8lc2g3AFTNCZH/uUFF/qNkLu0GgKq5IfL/TFCR/yh5hnYDQNUzIfI/L6jIf5TMo90AUDUvRP6fDSryHxki7QaAtmdD5H9+UJH/KJlPuwGgan6I/D8XVOQ/Sp6j3QBQ9VyI/C8IKvIfJQtoNwBULQiR/4VBRf6jZCHtBoCqhSHyvyioyH+ULKLdAFC1KET+FwcV+Y+SxbQbAKoWh8j/kqAi/1GyhHYDQNWSEPl/PqjIf5Q8T7sBoOr5EPlfGlTkP0qW0m4AqFoaIv8vBBX5j5IXaDcAVL0QIv/Lgor8R8ky2g0AVctC5P/FoCL/UfIi7QaAqhdD5H95UJH/KFlOuwGganmI/L8UVOQ/Sl6i3QBQ9VKI/K8IKvIfJStoNwBUrQiR/5VBRf6jZCXtBoCqlSHyvyqoyH+UrKLdAFC1KkT+Xw4q8h8lL9NuAKh6OUT+Xwkq8h8lr9BuAKh6JUT+Xw0q8h8lr9JuAKh6NUT+VwcV+Y+S1bQbAKpWh8j/a0FF/qPkNdoNAFWvhcj/mqAi/1GyhnYDQNWaEPl/PajIf5S8TrsBoOr1EPlfG1TkP0rW0m4AqFobIv9vBBX5j5I3aDcAVL0RIv/rgor8R8k62g0AVetC5P/NoCL/UfIm7QaAqjdD5H99UJH/KFlPuwGgan2I/G8IKvIfJRtoNwBUbQiR/7eCivxHyVu0GwCq3gqR/41BRf6jZCPtBoCqjSHy/3ZQkf8oeZt2A0DV2yHyvymoyH
+UbKLdAFC1KUT+3wkq8h8l79BuAKh6J0T+NwcV+Y+SzbQbAKo2h8j/lqAi/1GyhXYDQNWWEPl/N6jIf5S8S7sBoOrdEPl/L6jIf5S8R7sBoOq9EPnfGlTkP0q20m4AqNoaIv/vBxX5j5L3aTcAVL0fIv/bgor8R8k22g0AVdtC5P+DoCL/UfIB7QaAqg9C5H97UJH/KNlOuwGganuI/O8IKvIfJTtoNwBU7QiR/51BRf6jZCftBoCqnSHyvyuoyH+U7KLdAFC1K0T+dwcV+Y+S3bQbAKp2h8j/nqAi/1Gyh3YDQNWeEPnfG1TkP0r20m4AqNobIv/7gor8R8k+2g0AVftC5P/DoCL/UfIh7QaAqg9D5H9/UJH/KNlPuwGgan+I/H8UVOQ/Sj6i3QBQ9VGI/B8IKvIfJQdoNwBUHQiR/4NBRf6j5CDtBoCqgyHyfyioyH+UHKLdAFB1KET+DwcV+Y+Sw7QbAKoOh8j/kaAi/1FyhHYDQNWREPn/OKjIf5R8TLsB+BeJV4z5OET+jwYV+Y+So7QbAKqOhsj/J0FF/qPkE9oNAFWfhMj/saAi/1FyjHYDQNWxEPn/NKjIf5R8SrsBoOrTEPk/HlTkP0qO024AqDpOGMIn+TYhCXbyGJ8QwhJiy8G1zOQSk4swbHBFSCCEJFy+rGW+fN2+Yjzxk2/PJI93L1fNT75t2nHEJ28yXurttNTbMUfTjSOZyWLippt/7t/mP/eK8QAAAAAAAADwh6X9rJ7+czsAREnNulWa5SPKFePzEUJWi4SctJKvG2Q/c7X78/HamTQMLgnxmrTYLND9/H7mWkOixyegJg9Svpsoc4mkIulH+pE+pArpQDoGYxmSvD9SDj+fDNLOpxJpS9qTyqQ76U/6BmOl+N9F0obUDzeflH8rYdLPRwrm04X0Iz1IW9KbSPH5tCG1w00/IT7BhPTTF4Lp9yK9SD/ShXQnyftNtT/Yv0H+x/Nkp/RflQwk/UgH0pO0J8m7Wdk/8Trw6d4xaV+H2qQX6USqBf13SPskEoa0IbXCzad7/K7d0z9PPJdI6pBqpHG6l4kJ/zji+8lThmkfR/L065AupA/pk7wfPZ4sJfz76crXIyHl9WhEOpD7SH/Sh3SIx0f9E9Nng1ST+Cua9v3ahPQmlUlb0pd0iP/LIPd39J86/aakV5C5HqRD6vSakSp/dfrBK8JwhYLrCamvb+N0091v9wruf61hytu0fXy6l6dTMHgFO8bfn0mEBP0n3yJbfD4Z8zyd6x5Mr/S4HpeHl6caXE8Zn7H/BELsgsE9U+ZTOBif8xrTj0+XudYw5XGoaR5H8/jdk6dfJHj9c/3B6ac8P6nTT0yebr4b0z9Pv/VfNBj/z02/WLrnz059J8MfZ9BuAKjC6399w+t/fcPrf30zyOVPE41JRVKXVCEVSUNShbQmNUldUo3UIw1JHVKRNCY1ST1S9x/qwE5zuUL8u4eX1YgPE4MOGpOGpCapRJqQxqQqaU1qk5pB3/+8hDSXk9JcvhSXSKqRmqR20FVdUpHUIVX/ha5+UzjN5UqExD9FEVIpvjGcSOqRSqQWqUoqk8bBa1vlX+2vwjUup3zEv/z+u9xbk+AVbkyak9akCqlKGpHKwZj6pHHwTvyn1E9z+eqvb1NSj9QmTYJX9t9/jdukuVwh7Wec1OcvfX//ZnYv652hv5TdeinDxGC5cnkJQ0fSNcb/9vrWDJZ9VUkz0po0JPVIvX9luZJiUprLFX63v4qkNqlN6pHK/8pre9ncNJevno9KQW4vv9vq/8O9XM3y3+2vIalK6gfrtkZBQuqTesFz+u+8yuuv0V/Ki51IqpKKFHKbYkeGluT45ZRhcn/0XPsI9eQFYGKQh+qkOqkabLs0CZ672qnrkkbBtkPVYKn9j0izJy/pWn8A+I+6xFz6u+UKMfuk35uY9DsT+L2//978/9rdAf5/O+ty59KuqoYEay4+6fLn72bxccnHBBWPf/bwyVjiZvheUO
bkcRmOB/q965c1SFwZTCspxPznMFfOPxj3J+Y/9/CRSizz2+q6fXz+hdNsjyfPP3vyTbh0xzgF9Y/MBwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA4P+Tsy53jjC/XR9CCGEIn1SDENIsPs4nhLCkOGHj18YSlzDJ9xHcYJA5eVzyDYQEQkgCS373+mUNElcG00oKMf85zJXzD8b9ifnPPXykEssEMw20j8+/MCGkQrr5Z0++CeeTS0yulPkH9Y/MB+C/iSEs4QhPBCISichEISrRiE4MYhKL2MQhLvGIT2IkE7mBZCZZSFaSjWQnOUhOkoskkNwkD7mRJJKbSF5yM8lHbiH5ya2kAClICpHbSGFShBQlxUhxUoKUJKVIaVKG3E7KkjtIOXInKU8qkIqkEqlMqpCqpBqpTmqQmqQWuYvUJnVIXVKP1CcNSEPSiDQmTUhTcjdpRpqTFuQe0pLcS1qR1qRN2PuXv3z/B8kQ8hAZSoaRJDKcjCAPk5HkETKKjCZjyFgyjjxKxpPHyAQykUwik8nj5AkyhTxJppJpZDp5iswgT5OZZBaZTeaQueQZMo88S+aT58gCspAsIovJEvI8WUpeIMvIi2Q5eYmsICvJKvIyeYW8SlaT18ga8jpZS94g68ibZD3ZQN4iG8nbZBN5h2wmW8i75D2ylbxPtpEPyHayg+wku8husofsJfvIh2Q/+YgcIAfJIXKYHCEfh7z/TxnuP5AhDGFYhmV4hmdERmRkRmZURmV0RmdMxmRsxmZcxmV8xmcyMZmYzExmJiuTlcnOZGdyMjmZBCaBycPkYRKZRCYvk5fJx+Rj8jP5mQJMAaYQU4gpzBRhijJFmeJMcaYkU5IpzZRhyjBlmbJMOaYcU54pz1RkKjKVmcpMVaYqU52pztRkajG1mNpMbaYuU5epz9RnGjINmcZMY6Yp05RpxjRjWjAtmJZMS6YV04ppw7Rh2jHtmPZMe6Yj05HpzHRmujJdme5Md6Yn05PpzfRm+jB9mH5MP2YAM4AZyAxiBjEPMg8yDzEPMcOYyuxwZgQzghnJjGRGMaOZ0cxYZhzzKPMo8xgzgZnITGImM5OZJ5gpzI/MVGYaM52ZzpRjn2ZmMrOYBHYOM5eZy8xj5jHzmfnMAmYhs5BZzCxhnmeWMkuZZcyLzIvMS8wKZiWzknmZeZl5lVnNrGbWMK8za5m1zDrmJ2Y9s4F5i9nIvM1sYt5mNjNbmM3Me8xW5j1mG7ON2c5sZ3YyO5ndzG5mL7OX+ZD5kPmI+Yg5yBxkJjBHmCPMUeYoc4w5xhxnjjMnmBPMSeYkc4o5xZxmTjNnmDPMWeYb5lvmG+Z75nvmR+Yn5hxzjjnPnG
cuMBeYi8zFy+FnL+NZnhVZkZVZmVVZldVZnTVZk7VZm3VZl/VZn83EZmIzs5nZrGxWNjubnc3J5mQT2NxsHjYPm8gmsnnZvGw+Nh+bn83PFmALsIXYQmxhtjBblC3KFmdLsCXZUmxptgx7O1uWLcuWY+9ky7MV2IpsRbYyW4WtylZjq7E12BpsLbYWW5utzdZl67K9CvQo0JAdzoximrCXX5lm7ESmBTuJacney7ZiW7NPMG3ZduwUpj3bge3IdmKnMVOZrmy7At3ZHmxPdibTm+1ZYBbTl+3HzmEGsA+wA9lB7GD2QXYI277AUHYYu4AZzo5gFzMj2UfYUexodhlThb38ilVlH2MnsBPZSexk9lXmCXYK+yQ7lZ3GTmefYmewT7Mz2VnsbHYOO5d9hp3HPsvOZ59jF7AL2UXsYnYJ+zy7lH2BXca+yC5nX2JXsCvZVezL7Cvsq+xq9jV2Dfs6u5Z9g13HvsmuZzewb7Eb2bfZTew77GZ2C/su+x67lX2f3cZ+wG5nd7A72V3sbnYPu5fdx37I7mc/Yg+wB9lD7GH2CPsxe5T9hD3GfsoeZz9jT7CfsyfZL9hT7JfsafYr9gz7NXuW/Yb9lv2O/Z79gf2R/Yk9x/7Mnmd/YS+wv7IX2Uss4RiO5TiO5wRO5CRO5hRO5TRO5wzO5CzO5hzO5TzO52JcJu4GLjOXhcvKZeOyczm4nFwuLoHLzeXhbuQSuZu4vNzNXD7uFi4/dytXgCvIFeJu4wpzRbiiXDGuOFeCK8mV4kpzZbjbubLcHVw57k6uPFeBq8hV4ipzVbiqXDWuOleDq8nV4u7ianN1uLpcPa4+14BryDXiGnNNuKbc3VwzrjnXgruHa8ndy7XiWnNtuLZcO+4+rj3XgevIdeI6c124rlw3rjvXg+vJ9eJ6c725Plwfrh/XnxvADeAGcoO4wdyv3EXuEjeUG8YlccO5EdzD3EjuEW4UN5obw43lxnGPcuO5x7gJ3ERuEjeZe5x7gpvCPclN5aZx07mnuBnc09xMbhY3m5vDzeWe4eZxz3Lzuee4BdxCbhG3mFvCPc+Nik9p+R+4/1tXuf/4YO7buR3cTm4Xt5vbw+3l9nHbuf3cfu4Ad4A7xB3ijnBHuKPcUe4Yd4w7zh3nTnAnuJPcSe4Ud4o7zZ3mznBnuLPcN9zP3Hfc99wP3I/cT9xP3M/cee48dyH+HBCe4Vme43le4EVe4mVe4VVe43Xe4E3e4m3e4V3e430+xmfib+Az81n4rHw2Pjufg8/J5+IT+Nx8Hv5GPpG/ic/L38zn42/h8/O38gX4gnwh/ra/fP/f668N34Zvx7fj2/Pt+Y58R74z35nvynflu/Pd+Z58T74335vvw/fh+/H9+AH8AH4gP5AfzA/mh/BD+KH8UD6JT+JH8A/zI/lH+FH8aH4MP5Yfx4/jx/Pj+Qn8BH4SP4l/nH+cn8JP4afyU/np/HR+Bj+Dn8nP5Gfzs/m5/Fx+Hj+Pn8/P5xfwC/hF/CJ+Cb+EX8ov5Zfxy/jl/HJ+Bb+CX8Wv4l/hX+FX86v5Nfwafi2/ll/Hr+PX8xv4DfxGfiO/id/Eb+Y38+/y7/Jb+a38Nn4bv57fwe/gd/G7+D38Hn4fv4/fz+/nD/AH+EP8If4If4Q/yh/lj/HH+OP8cf4Ef4I/yZ/kT/Gn+NP8af4Mf4Y/y5/lv+W/5b/nv+d/5H/kz/Hn+PP8ef4Cf4G/yF+8vNknsAIr8AIviIIoyIIsqIIq6IIumIIp2IItuIIr+IIvZBIyCZmFzEJWIauQXcgu5BRyCglCgpBHyCMkCjcJeYWbhXzCLUJ+4VahgFBQKCTcJhQWighFhWJCcaGEUFIoJZQWygi3C2WFO4Rywp1CeaGCUFGoJFQWqghVhWpCdaGGUFOoJdwl1BbqCHWFekJ9oYHQUGgkNBaaCE2Fu4VmQnOhhXCP0FK4V2gltBba/K3THy2MEcYK44RHhfHCY8IEYaIwSZgsPC48IUwRnh
SmCtOE6cJTwgzhaWGmMEuYLcwR5grPCPOEZ4X5wnPCAmGhsEhYLCwRnheWCi8Iy4QXheXCS8IKYaWwSnhZeEV4VVgtvCasEV4X1gpvCOuEN4X1wgbhLWGj8LawSXhH2CxsEd4V3hO2Cu8L24QPhO3CDmGnsEvYLewR9gr7hA+F/cJHwgHhoHBIOCwcET4WjgqfCMeET4XjwmfCCeFz4aTwhXBK+FI4LXwlnBG+Fs4K3wjfCt8J3ws/CD8KPwnnhJ+F88IvwgXhV+GicEkgIiOyIifyoiCKoiTKoiKqoibqoiGaoiXaoiO6oif6YkzMJN4gZhaziFnFbGJ2MYeYU8wlJoi5xTzijWKieJOYV7xZzCfeIuYXbxULiAXFQuJtYmGxiFhULCYWF0uIJcVSYmmxjHi7WFa8Qywn3imWFyuIFcVKYmWxilhVrCZWF2uINcVa4l1ibbGOWFesJ9YXG4gNxUZiY7GJ2FS8W2wmNhdbiPeILcV7xVZia7GN2FZsJ94nthc7iB3FTmJnsYvYVewmdhd7iD3FXmJv8X6xj9hX7Cf2FweID4gDxUHiYPFBcYj4kDhUHCYmicPFEeLD4kjxEXGUOFocI44Vx4mPiuPFx8QJ4kRxkjhZfFx8QpwiPilOFaeJ08WnxBni0+JMcZY4W5wjzhWfEeeJz4rzxefEBeJCcZG4WFwiPi8uFV8Ql4kvisvFl8QV4kpxlfiy+Ir4qrhafE1cI74urhXfENeJb4rrxQ3iW+JG8W1xk/iOuFncIr4rviduFd8Xt4kfiNvFHeJOcZe4W9wj7hX3iR+K+8WPxAPiQfGQeFg8In4sHhU/EY+Jn4rHxc/EE+Ln4knxC/GU+KV4WvxKPCN+LZ4VvxG/Fb8Tvxd/EH8UfxLPiT+L58VfxAvir+JF8ZJIJEZiJU7iJUESJUmSJUVSJU3SJUMyJUuyJUdyJU/ypZiUSbpByixlkbJK2aTsUg4pp5RLSpByS3mkG6VE6SYpr3SzlE+6Rcov3SoVkApKhaTbpMJSEamoVEwqLpWQSkqlpNJSGel2qax0h1ROulMqL1WQKkqVpMpSFamqVE2qLtWQakq1pLuk2lIdqa5UT6ovNZAaSo2kxlITqal0t9RMai61kO6RWkr3Sq2k1lIbqa3UTrpPai91kDpKnaTOUhepq9RN6i71kHpKvaTe0v1SH6mv1E/qLw2QHpAGSoOkwdKD0hDpIWmoNExKkoZLI6SHpZHSI9IoabQ0RhorjZMelcZLj0kTpInSJGmy9Lj0hDRFelKaKk2TpktPSTOkp6WZ0ixptjRHmis9I82TnpXmS89JC6SF0iJpsbREel5aKr0gLZNelJZLL0krpJXSKull6RXpVWm19Jq0RnpdWiu9Ia2T3pTWSxukt6SN0tvSJukdabO0RXpXek/aKr0vbZM+kLZLO6Sd0i5pt7RH2ivtkz6U9ksfSQekg9Ih6bB0RPpYOip9Ih2TPpWOS59JJ6TPpZPSF9Ip6UvptPSVdEb6WjorfSN9K30nfS/9IP0o/SSdk36Wzku/SBekX6WL0iWJyIzMypzMy4IsypIsy4qsypqsy4ZsypZsy47syp7syzE5k3yDnFnOImeVs8nZ5RxyTjmXnCDnlvPIN8qJ8k1yXvlmOZ98i5xfvlUuIBeUC8m3yYXlInJRuZhcXC4hl5RLyaXlMvLtcln5DrmcfKdcXq4gV5QryZXlKnJVuZpcXa4h15RryXfJteU6cl25nlxfbiA3lBvJjeUmclP5brmZ3FxuId8jt5TvlVvJreU2clu5nXyf3F7uIHeUO8md5S5yV7mb3F3uIfeUe8m95fvlPnJfuZ/cXx4gPyAPlAfJg+UH5SHyQ/JQeZicJA+XR8gPyyPlR+RR8mh5jDxWHic/Ko+XH5MnyBPlSfJk+XH5CXmK/KQ8VZ4mT5efkmfIT8sz5VnybHmOPFd+Rp4nPyvPl5+TF8gL5UXyYnmJ/Ly8VH5BXia/KC+XX5JXyCvlVf
LL8ivyq/Jq+TV5jfy6vFZ+Q14nvymvlzfIb8kb5bflTfI78mZ5i/yu/J68VX5f3iZ/IG+Xd8g75V3ybnmPvFfeJ38o75c/kg/IB+VD8mH5iPyxfFT+RD4mfyoflz+TT8ifyyflL+RT8pfyafkr+Yz8tXxW/kb+Vv5O/l7+Qf5R/kk+J/8sn5d/kS/Iv8oX5UsyURiFVTiFVwRFVCRFVhRFVTRFVwzFVCzFVhzFVTzFV2JKJuUGJbOSRcmqZFOyKzmUnEouJUHJreRRblQSlZuUvMrNSj7lFiW/cqtSQCmoFFJuUworRZSiSjGluFJCKamUUkorZZTblbLKHUo55U6lvFJBqahUUiorVZSqSjWlulJDqanUUu5Sait1lLpKPaW+0kBpqDRSGitNlKbK3UozpbnSQrlHaancq7RSWittlLZKO+U+pb3SQemodFI6K12Urko3pbvSQ+mp9FJ6K/crfZS+Sj+lvzJAeUAZqAxSBisPKkOUh5ShyjAlSRmujFAeVkYqjyijlNHKGGWsMk55VBmvPKZMUCYqk5TJyuPKE8oU5UllqjJNma48pcxQnlZmKrOU2cocZa7yjDJPeVaZrzynLFAWKouUxcoS5XllqfKCskx5UVmuvKSsUFYqq5SXlVeUV5XVymvKGuV1Za3yhrJOeVNZr2xQ3lI2Km8rm5R3lM3KFuVd5T1lq/K+sk35QNmu7FB2KruU3coeZa+yT/lQ2a98pBxQDiqHlMPKEeVj5ajyiXJM+VQ5rnymnFA+V04qXyinlC+V08pXyhnla+Ws8o3yrfKd8r3yg/Kj8pNyTvlZOa/8olxQflUuKpcUojIqq3IqrwqqqEqqrCqqqmqqrhqqqVqqrTqqq3qqr8bUTOoNamY1i5pVzaZmV3OoOdVcaoKaW82j3qgmqjepedWb1XzqLWp+9Va1gFpQLaTephZWi6hF1WJqcbWEWlItpZZWy6i3q2XVO9Ry6p1qebWCWlGtpFZWq6hV1WpqdbWGWlOtpd6l1lbrqHXVemp9tYHaUG2kNlabqE3Vu9VmanO1hXqP2lK9V22ltlbbqG3Vdup9anu1g9pR7aR2VruoXdVuane1h9pT7aX2Vu9X+6h91X5qf3WA+oA6UB2kDlYfVIeoD6lD1WFqkjpcHaE+rI5UH1FHqaPVMepYdZz6qDpefUydoE5UJ6mT1cfVJ9Qp6pPqVHWaOl19Sp2hPq3OVGeps9U56lz1GXWe+qw6X31OXaAuVBepi9Ul6vPqUvUFdZn6orpcfUldoa5UV6kvq6+or6qr1dfUNerr6lr1DXWd+qa6Xt2gvqVuVN9WN6nvqJvVLeq76nvqVvV9dZv6gbpd3aHuVHepu9U96l51n/qhul/9SD2gHlQPqYfVI+rH6lH1E/WY+ql6XP1MPaF+rp5Uv1BPqV+qp9Wv1DPq1+pZ9Rv1W/U79Xv1B/VH9Sf1nPqzel79Rb2g/qpeVC+pRGM0VuM0XhM0UZM0WVM0VdM0XTM0U7M0W3M0V/M0X4tpmbQbtMxaFi2rlk3LruXQcmq5tAQtt5ZHu1FL1G7S8mo3a/m0W7T82q1aAa2gVki7TSusFdGKasW04loJraRWSiutldFu18pqd2jltDu18loFraJWSausVdGqatW06loNraZWS7tLq63V0epq9bT6WgOtodZIa6w10Zpqd2vNtOZaC+0eraV2r9ZKa6210dpq7bT7tPZaB62j1knrrHXRumrdtO5aD62n1kvrrd2v9dH6av20/toA7QFtoDZIG6w9qA3RHtKGasO0JG24NkJ7WBupPaKN0kZrY7Sx2jjtUW289pg2QZuoTdIma49rT2hTtCe1qdo0bbr2lDZDe1qbqc3SZmtztLnaM9o87VltvvactkBbqC3SFmtLtOe1pdoL2jLtRW259pK2QluprdJe1l7RXtVWa69pa7TXtbXaG9o67U1tvbZBe0vbqL2tbdLe0TZrW7R3tfe0rdr72jbtA227tkPbqe3Sdm
t7tL3aPu1Dbb/2kXZAO6gd0g5rR7SPtaPaJ9ox7VPtuPaZdkL7XDupfaGd0r7UTmtfaWe0r7Wz2jfat9p32vfaD9qP2k/aOe1n7bz2i3ZB+1W7qF3SiM7orM7pvC7ooi7psq7oqq7pum7opm7ptu7oru7pvh7TM+k36Jn1LHpWPZueXc+h59Rz6Ql6bj2PfqOeqN+k59Vv1vPpt+j59Vv1AnpBvZB+m15YL6IX1YvpxfUSekm9lF5aL6PfrpfV79DL6Xfq5fUKekW9kl5Zr6JX1avp1fUaek29ln6XXluvo9fV6+n19QZ6Q72R3lhvojfV79ab6c31Fvo9ekv9Xr2V3lpvo7fV2+n36e31DnpHvZPeWe+id9W76d31HnpPvZfeW79f76P31fvp/fUB+gP6QH2QPlh/UB+iP6QP1YfpSfpwfYT+sD5Sf0QfpY/Wx+hj9XH6o/p4/TF9gj5Rn6RP1h/Xn9Cn6E/qU/Vp+nT9KX2G/rQ+U5+lz9bn6HP1Z/R5+rP6fP05fYG+UF+kL9aX6M/rS/UX9GX6i/py/SV9hb5SX6W/rL+iv6qv1l/T1+iv62v1N/R1+pv6en2D/pa+UX9b36S/o2/Wt+jv6u/pW/X39W36B/p2fYe+U9+l79b36Hv1ffqH+n79I/2AflA/pB/Wj+gf60f1T/Rj+qf6cf0z/YT+uX5S/0I/pX+pn9a/0s/oX+tn9W/0b/Xv9O/1H/Qf9Z/0c/rP+nn9F/2C/qt+Ub+kE4MxWIMzeEMwREMyZEMxVEMzdMMwTMMybMMxXMMzfCNmZDJuMDIbWYysRjYju5HDyGnkMhKM3EYe40Yj0bjJyGvcbOQzbjHyG7caBYyCRiHjNqOwUcQoahQzihsljJJGKaO0Uca43Shr3GGUM+40yhsVjIpGJaOyUcWoalQzqhs1jJpGLeMuo7ZRx6hr1DPqGw2MhkYjo7HRxGhq3G00M5obLYx7jJbGvUYro7XRxmhrtDPuM9obHYyORiejs9HF6Gp0M7obPYyeRi+jt3G/0cfoa/Qz+hsDjAeMgcYgY7DxoDHEeMgYagwzkozhxgjjYWOk8YgxyhhtjDHGGuOMR43xxmPGBGOiMcmYbDxuPGFMMZ40phrTjOnGU8YM42ljpjHLmG3MMeYazxjzjGeN+cZzxgJjobHIWGwsMZ43lhovGMuMF43lxkvGCmOlscp42XjFeNVYbbxmrDFeN9YabxjrjDeN9cYG4y1jo/G2scl4x9hsbDHeNd4zthrvG9uMD4ztxg5jp7HL2G3sMfYa+4wPjf3GR8YB46BxyDhsHDE+No4anxjHjE+N48Znxgnjc+Ok8YVxyvjSOG18ZZwxvjbOGt8Y3xrfGd8bPxg/Gj8Z54yfjfPGL8YF41fjonHJICZjsiZn8qZgiqZkyqZiqqZm6qZhmqZl2qZjuqZn+mbMzGTeYGY2s5hZzWxmdjOHmdPMZSaYuc085o1monmTmde82cxn3mLmN281C5gFzULmbWZhs4hZ1CxmFjdLmCXNUmZps4x5u1nWvMMsZ95pljcrmBXNSmZls4pZ1axmVjdrmDXNWuZdZm2zjlnXrGfWNxuYDc1GZmOzidnUvNtsZjY3W5j3mC3Ne81WZmuzjdnWbGfeZ7Y3O5gdzU5mZ7OL2dXsZnY3e5g9zV5mb/N+s4/Z1+xn9jcHmA+YA81B5mDzQXOI+ZA51BxmJpnDzRHmw+ZI8xFzlDnaHGOONceZj5rjzcfMCeZEc5I52XzcfMKcYj5pTjWnmdPNp8wZ5tPmTHOWOducY841nzHnmc+a883nzAXmQnORudhcYj5vDjdfMJeZL5rLzZfMFeZKc5X5svmK+aq52nzNXGO+bq413zDXmW+a680N5lvmRvNtc5P5jrnZ3GK+a75nbjXfN7eZH5jbzR3mTnOXudvcY+4195kfmvvNj8wD5kHzkHnYPGJ+bB41PzGPmZ+ax83PzBPm5+ZJ8wvzlPmledr8yjxjfm2eNb8xvz
W/M783fzB/NH8yz5k/m+fNX8wL5q/mRfOSSSzGYi3O4i3BEi3Jki3FUi3N0i3DMi3Lsi3Hci3P8q2Ylcm6wcpsZbGyWtms7FYOK6eVy0qwclt5rButROsmK691s5XPusXKb91qFbAKWoWs26zCVhGrqFXMKm6VsEpapazSVhnrdqusdYdVzrrTKm9VsCpalazKVhWrqlXNqm7VsGpatay7rNpWHauuVc+qbzWwGlqNrMZWE6updbfVzGputbDusVpa91qtrNZWG6ut1c66z2pvdbA6Wp2szlYXq6vVzepu9bB6Wr2s3tb9Vh+rr9XP6m8NsB6wBlqDrMHWg9YQ6yFrqDXMSrKGWyOsh62R1iPWKGu0NcYaa42zHrXGW49ZTNJEa5I12XrcesKaYj1pTbWmWdOtp6wZ1tPWTGuWNduaY821nrHmWc9a863nrAXWQmuRtdhaYj1vLbVesJZZL1rLrZesFdZKa5X1svWK9aq12nrNWmO9bq213rDWWW9a660N1lvWRutta5P1jrXZ2mK9a71nbbXet7ZZH1jbrR3WTmuXtdvaY+219lkfWvutj6wD1kHrkHXYOmJ9bB21PrGOWZ9ax63PrBPW59ZJ6wvrlPWlddr6yjpjfW2dtb6xvrW+s763frB+tH6yzlk/W+etX6wL1q/WReuSRWzGZm3O5m3BFm3Jlm3FVm3N1m3DNm3Ltm3Hdm3P9u2Yncm+wc5sZ7Gz2tns7HYOO6edy06wc9t57BvtRPsmO699s53PvsXOb99qF7AL2oXs2+zCdhG7qF3MLm6XsEvapezSdhn7drusfYddzr7TLm9XsCvalezKdhW7ql3Nrm7XsGvatey77Np2HbuuXc+ubzewG9qN7MZ2E7upfbfdzG5ut7DvsVva99qt7NZ2G7ut3c6+z25vd7A72p3sznYXu6vdze5u97B72r3s3vb9dh+7r93P7m8PsB+wB9qD7MH2g/YQ+yF7qD3MTrKH2yPsh+2R9iP2KHu0PcYea4+zH7XH24/ZE+yJ9iR7sv24/YQ9xX7SnmpPs6fbT9kz7KftmfYse7Y9x55rP2PPs5+159vP2QvshfYie7G9xH7eXmq/YC+zX7SX2y/ZK+yV9ir7ZfsV+1V7tf2avcZ+3V5rv2Gvs9+019sb7Lfsjfbb9ib7HXuzvcV+137P3mq/b2+zP7C32zvsnfYue7e9x95r77M/tPfbH9kH7IP2IfuwfcT+2D5qf2Ifsz+1j9uf2Sfsz+2T9hf2KftL+7T9lX3G/to+a39jf2t/Z39v/2D/aP9kn7N/ts/bv9gX7F/ti/YlmziMwzqcwzuCIzqSIzuKozqaozuGYzqWYzuO4zqe4zsxJ5Nzg5PZyeJkdbI52Z0cTk4nl5Pg5HbyODc6ic5NTl7nZiefc4uT37nVKeAUdAo5tzmFnSJOUaeYU9wp4ZR0SjmlnTLO7U5Z5w6nnHOnU96p4FR0KjmVnSpOVaeaU92p4dR0ajl3ObWdOk5dp55T32ngNHQaOY2dJk5T526nmdPcaeHc47R07nVaOa2dNk5bp51zn9Pe6eB0dDo5nZ0uTlenm9Pd6eH0dHo5vZ37nT5OX6ef098Z4DzgDHQGOYOdB50hzkPOUGeYk+QMd0Y4DzsjnUecUc5oZ4wz1hnnPOqMdx5zJjgTnUnOZOdx5wlnivOkM9WZ5kx3nnJmOE87M51ZzmxnjjPXecaZ5zzrzHeecxY4C51FzmJnifO8s9R5wVnmvOgsd15yVjgrnVXOy84rzqvOauc1Z43zurPWecNZ57zprHc2OG85G523nU3OO85mZ4vzrvOes9V539nmfOBsd3Y4O51dzm5nj7PX2ed86Ox3PnIOOAedQ85h54jzsXPU+cQ55nzqHHc+c044nzsnnS+cU86XzmnnK+eM87Vz1vnG+db5zvne+cH50fnJOef87Jx3fnEuLP7VuehccojLuKzLubwruKIrubKruKqrubpruKZrubbruK7rub4bcz
O5N7iZ3SxuVjebm93N4eZ0c7kJbm43j3ujm+je5OZ1b3bzube4+d1b3QJuQbeQe5tb2C3iFnWLucXdEm5Jt5Rb2i3j3u6Wde9wy7l3uuXdCm5Ft5Jb2a3iVnWrudXdGm5Nt5Z7l1vbrePWdeu59d0GbkO3kdvYbeI2de92m7nN3RbuPW5L9163ldvabeO2ddu597nt3Q5uR7eT29nt4nZ1u7nd3R5uT7eX29u93+3j9nX7uf3dAe4D7kB3kDvYfdAd4j7kDnWHuUnucHeE+7A70n3EHeWOdse4Y91x7qPuePcxd4I70Z3kTnYfd59wp7hPulPdae509yl3hvu0O9Od5c5257hz3Wfcee6z7nz3OXeBu9Bd5C52l7jPu0vdF9xl7ovucvcld4W70l3lvuy+4r7qrnZfc9e4r7tr3Tfcde6b7np3g/uWu9F9293kvuNudre477rvuVvd991t7gfudneHu9Pd5e5297h73X3uh+5+9yP3gHvQPeQedo+4H7tH3U/cY+6n7nH3M/eE+7l70v3CPeV+6Z52v3LPuF+7Z91v3G/d79zv3R/cH92f3HPuz+559xf3gvure9G95BKP8ViP83hP8ERP8mRP8VRP83TP8EzP8mzP8VzP83wv5mXybvAye1m8rF42L7uXw8vp5fISvNxeHu9GL9G7ycvr3ezl827x8nu3egW8gl4h7zavsFfEK+oV84p7JbySXimvtFfGu90r693hlfPu9Mp7FbyKXiWvslfFq+pV86p7NbyaXi3vLq+2V8er69Xz6nsNvIZeI6+x18Rr6t3tNfOaey28e7yW3r1eK6+118Zr67Xz7vPaex28jl4nr7PXxevqdfO6ez28nl4vr7d3v9fH6+v18/p7A7wHvIHeIG+w96A3xHvIG+oN85K84d4I72FvpPeIN8ob7Y3xxnrjvEe98d5j3gRvojfJm+w97j3hTfGe9KZ607zp3lPeDO9pb6Y3y5vtzfHmes9487xnvfnec94Cb6G3yFvsLfGe95Z6L3jLvBe95d5L3gpvpbfKe9l7xXvVW+295q3xXvfWem9467w3vfXeBu8tb6P3trfJe8fb7G3x3vXe87Z673vbvA+87d4Ob6e3y9vt7fH2evu8D7393kfeAe+gd8g77B3xPvaOep94x7xPvePeZ94J73PvpPeFd8r70jvtfeWd8b72znrfeN9633nfez94P3o/eee8n73z3i/eBe9X76J3ySM+47M+5/O+4Iu+5Mu+4qu+5uu+4Zu+5du+47u+5/t+zM/k3+Bn9rP4Wf1sfnY/h5/Tz+Un+Ln9PP6NfqJ/k5/Xv9nP59/i5/dv9Qv4Bf1C/m1+Yb+IX9Qv5hf3S/gl/VJ+ab+Mf7tf1r/DL+ff6Zf3K/gV/Up+Zb+KX9Wv5lf3a/g1/Vr+XX5tv45f16/n1/cb+A39Rn5jv4nf1L/bb+Y391v49/gt/Xv9Vn5rv43f1m/n3+e39zv4Hf1Ofme/i9/V7+Z393v4Pf1efm//fr+P39fv5/f3B/gP+AP9Qf5g/0F/iP+QP9Qf5if5w/0R/sP+SP8Rf5Q/2h/jj/XH+Y/64/3H/An+RH+SP9l/3H/Cn+I/6U/1p/nT/af8Gf7T/kx/lj/bn+PP9Z/x5/nP+vP95/wF/kJ/kb/YX+I/7y/1X/CX+S/6y/2X/BX+Sn+V/7L/iv+qv9p/zV/jv+6v9d/w1/lv+uv9Df5b/kb/bX+T/46/2d/iv+u/52/13/e3+R/42/0d/k5/l7/b3+Pv9ff5H/r7/Y/8A/5B/5B/2D/if+wf9T/xj/mf+sf9z/wT/uf+Sf8L/5T/pX/a/8o/43/tn/W/8b/1v/O/93/wf/R/8s/5P/vn/V/8C/6v/kX/kk9iTIyNcTE+JsTEmBSTY0pMjWkxPWbEzJgVs2NOzI15MT8Wi2WK3RDLHMsSyxrLFsseyxHLGcsVS4jljuWJ3RhLjN0Uyxu7OZYvdkssf+zWWIFYwVih2G2xwrEisa
KxYrHisRKxkrFSsdKxMrHbY2Vjd8TKxe6MlY9ViFWMVYpVjlWJVY1Vi1WP1YjVjNWK3RWrHasTqxurF6sfaxBrGGsUaxxrEmsauzvWLNY81iJ2T6xl7N5Yq1jrWJtY21i72H2x9rEOsY6xTrHOsS6xrrFuse6xHrGesV6x3rH7Y31ifWP9Yv1jA2IPxAbGBsUGxx6MDYk9FBsaG/Z/tN1jrydB38D5M7anbXdXd1f7P7Zt27Zt27Zt2zpzxrZ1xp45M7O57mQ3ufd6sMlmt559UqnK7/sCKoUMRoYgQ5FhyHBkBDISGYWMRsYgY5FxyHhkAjIRmYRMRqYgU5FpyHRkBjITmYXMRuYgc5F5yHxkAbIQWYQsRpYgS5FlyHJkBbISWYWsRtYga5F1yHpkA7IR2YRsRrYgW5FtyHZkB7IT2YXsRvYge5F9yH7kAHIQOYQcRo4gR5FjyHHkBHISOYWcRs4gZ5FzSDxyHklALiAXkUvIZeQKchW5hlxHbiA3kVvIbeQOche5h9xHHiAPkUfIY+QJ8hR5hjxHXiAvkVfIa+QNkoi8Rd4h75EPyEfkE/IZ+YJ8Rb4h35EfyE/kF/IbSUL+IH+Rf0gcmgxNjqZAU6Kp0NRoGjQtmg5Nj2ZAM6KZ0MxoFjQrmg3NjuZAc6IIiqIYiqMESqIUSqMMyqIcyqMCKqISKqMKqqIaqqMGClATtVAbhaiDuqiH+miAhmiExtBcaG40D5oXzYfmRwugBdFCaGG0CFoULYYWR0ugJdFSaGm0DFoWLYeWRyugFdFKaGW0CloVrYZWR2ugNdFaaG20DloXrYfWRxugDdFGaGO0CdoUbYY2R1ugLdFWaGu0DdoWbYe2RzugHdFOaGe0C9oV7YZ2R3ugPdFeaG+0D9oX7Yf2RwegA9FB6GB0CDoUHYYOR0egI9FR6Gh0DDoWHYeORyegE9FJ6GR0CjoVnYZOR2egM9FZ6Gx0DjoXnYfORxegC9FF6GJ0CboUXYYuR1egK9FV6Gp0DboWXYeuRzegG9FN6GZ0C7oV3YZuR3egO9Fd6G50D7oX3YfuRw+gB9FD6GH0CHoUPYYeR0+gJ9FT6Gn0DHoWPYfGo+fRBPQCehG9hF5Gr6BX0WvodfQGehO9hd5G76B30XvoffQB+hB9hD5Gn6BP0Wfoc/QF+hJ9hb5G36CJ6Fv0Hfoe/YB+RD+hn9Ev6Ff0G/od/YH+RH+hv9Ek9A/6F/2HxmHJsORYCiwllgpLjaXB0mLpsPRYBiwjlgnLjGXBsmLZsOxYDiwnhmAohmE4RmAkRmE0xmAsxmE8JmAiJmEypmAqpmE6ZmAAMzELszGIOZiLeZiPBViIRVgMy4XlxvJgebF8WH6sAFYQK4QVxopgRbFiWHGsBFYSK4WVxspgZbFyWHmsAlYRq4RVxqpgVbFqWHWsBlYTq4XVxupgdbF6WH2sAdYQa4Q1xppgTbFmWHOsBdYSa4W1xtpgbbF2WHusA9YR64R1xrpgXbFuWHesB9YT64X1xvpgfbF+WH9sADYQG4QNxoZgQ7Fh2HBsBDYSG4WNxsZgY7Fx2HhsAjYRm4RNxqZgU7Fp2HRsBjYTm4XNxuZgc7F52HxsAbYQW4QtxpZgS7Fl2HJsBbYSW4WtxtZga7F12HpsA7YR24RtxrZgW7Ft2HZsB7YT24XtxvZge7F92H7sAHYQO4Qdxo5gR7Fj2HHsBHYSO4Wdxs5gZ7FzWDx2HkvALmAXsUvYZewKdhW7hl3HbmA3sVvYbewOdhe7h93HHmAPsUfYY+wJ9hR7hj3HXmAvsVfYa+wNloi9xd5h77EP2EfsE/YZ+4J9xb5h37Ef2E/sF/YbS8L+YH+xf1gcngxPjqfAU+Kp8NR4Gjwtng5Pj2fAM+KZ8Mx4Fjwrng3PjufAc+IIjuIYjuMETuIUTuMMzuIczuMCLuISLuMKruIaruMGDnATt3Abh7iDu7iH+3iAh3iEx/BceG48D54Xz4fnxwvgBfFCeGG8CF
4UL4YXx0vgJfFSeGm8DF4WL4eXxyvgFfFKeGW8Cl4Vr4ZXx2vgNfFaeG28Dl4Xr4fXxxvgDfFGeGO8Cd4Ub4Y3x1vgLfFWeGu8Dd4Wb4e3xzvgHfFOeGe8C94V74Z3x3vgPfFeeG+8D94X74f3xwfgA/FB+GB8CD4UH4YPx0fgI/FR+Gh8DD4WH4ePxyfgE/FJ+GR8Cj4Vn4ZPx2fgM/FZ+Gx8Dj4Xn4fPxxfgC/FF+GJ8Cb4UX4Yvx1fgK/FV+Gp8Db4WX4evxzfgG/FN+GZ8C74V34Zvx3fgO/Fd+G58D74X34fvxw/gB/FD+GH8CH4UP4Yfx0/gJ/FT+Gn8DH4WP4fH4+fxBPwCfhG/hF/Gr+BX8Wv4dfwGfhO/hd/G7+B38Xv4ffwB/hB/hD/Gn+BP8Wf4c/wF/hJ/hb/G3+CJ+Fv8Hf4e/4B/xD/hn/Ev+Ff8G/4d/4H/xH/hv/Ek/A/+F/+HxxHJiORECiIlkYpITaQh0hLpiPREBiIjkYnITGQhshLZiOxEDiIngRAogRE4QRAkQRE0wRAswRE8IRAiIREyoRAqoRE6YRCAMAmLsAlIOIRLeIRPBERIRESMyEXkJvIQeYl8RH6iAFGQKEQUJooQRYliRHGiBFGSKEWUJsoQZYlyRHmiAlGRqERUJqoQVYlqRHWiBlGTqEXUJuoQdYl6RH2iAdGQaEQ0JpoQTYlmRHOiBdGSaJUyLi6OaEu0I9oTHYiORCeiM9GF6Ep0I7oTPYieRC+iN9GH6Ev0I/oTA4iBxCBiMDGEGEoMI4YTI4iRxChiNDGGGEuMI8YTE4iJxCRiMjGFmEpMI6YTM4iZxCxiNjGHmEvMI+YTC4iFxCJiMbGEWEosI5YTK4iVxCpiNbGGWEusI9YTG4iNxCZiM7GF2EpsI7YTO4idxC5iN7GH2EvsI/YTB4iDxCHiMHGEOEocI44TJ4iTxCniNHGGOEucI+KJ80QCcYG4SFwiLhNXiKvENeI6cYO4SdwibhN3iLvEPeI+8YB4SDwiHhNPiKfEM+I58YJ4SbwiXhNviETiLcER74kPxEfiE/GZ+EJ8Jb4R34kfxE/iF/GbSCL+EH+Jf0QcmYxMTqYgU5KpyNRkGjItmY5MT2YgM5KZyMxkFjIrmY3MTuYgc5IIiZIYiZMESZIUSZMMyZIcyZMCKZISKZMKqZIaqZMGCUiTtEibhKRDuqRH+mRAhmRExshcZG4yD5mXzEfmJwuQBclCZGGyCFmULEYWJ0uQJclSZGmyDFmWLEeWJyuQFclKZOW0VciqZDWyOlmDrEnWImuTdci6ZD2yPtmAbEg2IhuTTcimZDOyOdmCbEm2IluTbci2ZDuyPdmB7Eh2IjuTXciuZDeyO9mD7En2InuTfci+ZD+yPzmAHEgOIgeTQ8ih5DByODmCHEmOIkeTY8ix5DhyPDmBnEhOIieTU8ip5DRyOjmDnEnOImeTc8i55DxyPrmAXEguIheTS8il5DJyObmCXEmuIleTa8i15DpyPbmB3EhuIjeTW8it5DZyO7mD3EnuIneTe8i95D5yP3mAPEgeIg+TR8ij5DHyOHmCPEmeIk+TZ8iz5DkynjxPJpAXyIvkJfIyeYW8Sl4jr5M3yJvkLfI2eYe8S94j75MPyIfkI/Ix+YR8Sj4jn5MvyJfkK/I1+YZMJN+S78j35AfyI/mJ/Ex+Ib+S38jv5A/yJ/mL/E0mkX/Iv+Q/Mo5KRiWnUlApqVRUaioNlZZKR6WnMlAZqUxUZioLlZXKRmWnclA5KYRCKYzCKYIiKYqiKYZiKY7iKYESKYmSKYVSKY3SKYMClElZlE1ByqFcyqN8KqBCKqJiVC4qN5WHykvlo/JTBaiCVCGqMFWEKkoVo4pTJaiSVCmqNFWGKkuVo8pTFaiKVCWqMlWFqkpVo6pTNaiaVC2qNlWHqkvVo+pTDaiGVCOqMdWEako1o5pTLaiWVCuqNdWGaku1o9pTHaiOVCeqM9WF6kp1o7pTPaieVC
+qN9WH6kv1o/pTA6iB1CBqMDWEGkoNo4ZTI6iR1ChqNDWGGkuNo8ZTE6iJ1CRqMjWFmkpNo6ZTM6iZ1CxqNjWHmkvNo+ZTC6iF1CJqMbWEWkoto5ZTK6iV1CpqNbWGWkuto9ZTG6iN1CZqM7WF2kpto7ZTO6id1C5qN7WH2kvto/ZTB6iD1CHqMHWEOkodo45TJ6iT1CnqNHWGOkudo+Kp81QCdYG6SF2iLlNXqKvUNeo6dYO6Sd2iblN3qLvUPeo+9YB6SD2iHlNPqKfUM+o59YJ6Sb2iXlNvqETqLfWOek99oD5Sn6jP1BfqK/WN+k79oH5Sv6jfVBL1h/pL/aPi6GR0cjoFnZJORaem09Bp6XR0ejoDnZHORGems9BZ6Wx0djoHnZNGaJTGaJwmaJKmaJpmaJbmaJ4WaJGWaJlWaJXWaJ02aECbtEXbNKQd2qU92qcDOqQjOkbnonPTeei8dD46P12ALkgXogvTReiidDG6OF2CLkmXokvTZeiydDm6PF2BrkhXoivTVeiqdDW6Ol2DrknXomvTdei6dD26Pt2Abkg3ohvTTeimdDO6Od2Cbkm3olvTbei2dDu6Pd2B7kh3ojvTXeiudDe6O92D7kn3onvTfei+dD+6Pz2AHkgPogfTQ+ih9DB6OD2CHkmPokfTY+ix9Dh6PD2BnkhPoifTU+ip9DR6Oj2DnknPomfTc+i59Dx6Pr2AXkgvohfTS+il9DJ6Ob2CXkmvolfTa+i19Dp6Pb2B3khvojfTW+it9DZ6O72D3knvonfTe+i99D56P32APkgfog/TR+ij9DH6OH2CPkmfok/TZ+iz9Dk6nj5PJ9AX6Iv0JfoyfYW+Sl+jr9M36Jv0Lfo2fYe+S9+j79MP6If0I/ox/YR+Sj+jn9Mv6Jf0K/o1/YZOpN/S7+j39Af6I/2J/kx/ob/S3+jv9A/6J/2L/k0n0X/ov/Q/Oo5JxiRnUjApmVRMaiYNk5ZJx6RnMjAZmUxMZiYLk5XJxmRncjA5GYRBGYzBGYIhGYqhGYZhGY7hGYERGYmRGYVRGY3RGYMBjMlYjM1AxmFcxmN8JmBCJmJiTC4mN5OHycvkY/IzBZiCTCGmMFOEKcoUY4ozJZiSTCmmNFOGKcuUY8ozFZiKTCWmMlOFqcpUY6ozNZiaTC2mNlOHqcvUY+ozDZiGTCOmMdOEaco0Y5ozLZiWTCumNdOGacu0Y9ozHZiOTCemM9OF6cp0Y7ozPZieTC+mNzNgVV+mH9OfaWYNZAYxg5khzFBmGDOcGcGMZEYxo5kxzFhmHDOemcBMZCYxk5kpzFRmGjOdmcHMZGYxs5k5zFxmHjOfWcAsZBYxi5klzFJmGbOcWcGsZFYxq5k1zFpmHbOe2cBsZDYxm5ktzFZmG7Od2cHsZHYxu5k9zF5mH7OfOcAcZA4xh5kjzFHmGHOcOcGcZE4xp5kzzFnmHBPPnGcSmAvMReYSc5m5wlxlrjHXmRvMTeYWc5u5w9xl7jH3mQfMQ+YR85h5wjxlnjHPmRfMS+YV85p5wyQyb5l3zHvmA/OR+cR8Zr4wX5lvzHfmB/OT+cX8ZpKYP8xf5h8Tx8axydnkbEo2JZuaTc2mZdOy6dn0bEY2I5uZzcxmZbOy2dnsbE42J4uyKIuzOEuyJEuzNPufxbM8K7IiK7Myq7Iqq7M6C1jAWqzFQhayLuuyPuuzIRuyMTbG5mZzs3nZvGx+Nj9bkC3IFmYLs0XZomxxtjhbki3JlmZLs2XZsmx5tjxbka3IVmYrs1XZqmx1tjpbk63J1mZrs3XZumx9tj7bkG3INmYbs03Zpmxztjnbkm3JtmZbs23Ztmx7tj3bke3IdmY7s13Zrmx3tjvbk+3J9mZ7s33Zvmx/tj87kB3IDmYHs0PZoexwdjg7kh3JjmZHs2PZsex4djw7kZ3ITmYns1PZqex0djo7k53FzmbnsHPZeex8dgG7kF3ELmYXs0vZpexydjm7kl3JrmZXs2vZtex6dj
27kd3IbmY3s1vZrex2dju7k93J7mZ3s3vZvex+dj97kD3IHmYPs0fZo+xx9jh7kj3JnmZPs2fZs2w8G88msAnsRfYie5m9zF5lr7LX2evsTfYme5u9zd5l77L32fvsQ/Yh+5h9zD5ln7LP2efsS/Yl+5p9zSayiew79h37gf3AfmI/sV/YL+w39hv7g/3B/mJ/sUlsEvuX/cum5dJx6bkMXEYuE5eZy8L9341zBEdyFEdzDIdy2P8yy3Gczhkc4EzO4mwOcs5/OTeXh8vL5ePycwW4glyh/3JprgxXlivHlecqcCW5Uv/LFblKXGWuDleVq8dV5xpwNblGXG2uDleXq8fV5xpwDblGXGuuDdeWa8e15zpwHblO/+W93D7uNHeGO8ud425zd7gf3E/uNfeG+8X95gZyg7jR3BhuLDeOG89N4CZyk/7Ls7k53FxuHjefW8At5Bb9l1dza7i13DpuPbeB28ht+i/v5vZwW7n93HZuB7eT2/U//s9M+7kD3EHuEHeYO8Id5eK549wJ7iR36v+aNZ47zyVwF7ib3C3uMneFu8pd465zN/7H/+m4y93j7nMPuJfcK+4x94R7yiVyz7kX/+P/9CVyb7l33HvuA/eR+8QlcV+4r9w37vv/9P+nPYn7w/3l/nFxfDI+OZ+CT8mn4lPzafi0fDo+PZ+Bz8hn4jPzWfisfDY+O5+Dz8kjPMpjPM4TPMlTPM0zPMtzPM8LvMhLvMwrvMprvM4bPOBN3uJtHvIO7/Ie7/MBH/IRH+Nz8bn5PHxePh+fny/AF+QL8YX5InxRvhhfnC/Bl+RL8aX5MnxZvhxfnq/AV+Qr8ZX5KnxVvhpfna/B1+Rr8bX5Onxdvh5fn2/AN+Qb8Y35JnxTvhnfnG/Bt+Rb8a35Nnxbvh3fnu/Ad+Q78Z35LnxXvhvfne/B9+R78b35Pnxfvh/fnx/AD+QH8YP5IfxQfhg/nB/Bj+RH8aP5MfxYfhw/np/AT+Qn8ZP5KfxUfho/nZ/Bz+Rn8bP5Ofxcfh4/n1/AL+QX8Yv5JfxSfhm/nF/Br+RX8av5Nfxafh2/nt/Ab+Q38Zv5LfxWfhu/nd/B7+R38bv5Pfxefh+/nz/AH+QP8Yf5I/xR/hh/nD/Bn+RP8af5M/xZ/hwfz5/nE/gL/EX+En+Zv8Jf5a/x1/kb/E3+Fn+bv8Pf5e/x9/kH/EP+Ef+Yf8I/5Z/xz/kX/Ev+Ff+af8Mn8m/5d/x7/gP/kf/Ef+a/8F/5b/x3/gf/k//F/+aT+D/8X/4fHyckE5ILKYSUQiohtZBGSCukE9ILGYSMQiYhs5BFyCpkE7ILOYScAiKgAibgAiGQAiXQAiOwAifwgiCIgiTIgiKogibogiEAwRQswRag4Aiu4Am+EAihEAkxIZeQW8gj5BXyCfmFAkJBoZBQWCgiFBWKCcWFEkJJoZRQWigjlBXKCeWFCoIvVBIqC1WEqkI1obpQQ6gp1BJqC3WEukI9ob7QQGgoNBIaC02EpkIzobnQQmgptBJaC22EtkI7ob3QQegodPp/3B8sDBGGCsOEYcIIYaQwShgtjBHGCuOE8cIEYaIwSZgsTBGmCtOE6cIMYaYwS5gtzBHmCvOE+cICYaGwSFgsLBGWCsuE5cIKYaWwSlgtrBHWCuuE9cIGYaOwSdgsbBG2CtuE7cIOYaewS9gt7BH2CvuE/cIB4aBwSDgsHBGOCseE48IJ4aRwSjgtnBHOCueEeOG8kCBcEC4Kl4TLwhXhqnBNuC7cEG4Kt4Tbwh3hrnBPuC88EB4Kj4THwhPhqfBMeC68EF4Kr4TXwhshUXgrvBPeCx+Ej8In4bPwRfgqfBO+Cz+En8Iv4beQJPwR/gr/hDgxmZhcTCGmFFOJqcU0YloxnZhezCBmFDOJmcUsYlYxm5hdzCHmFBERFTERFwmRFCmRFhmRFTmRFwVRFCVRFhVRFTVRFw0RiKZoibYIRUd0RU/0xUAMxUiMibnE3GIeMa+YT8wvFhALio
XEwmIRsahYTCwulhBLiqXE0mIZsaxYTiwvVhAripXEymIVsapYTawu1hBrirXE2mIdsa5YT6wvNhAbio3ExmITsanYTGwuthBbiq3E1mIbsa3YTmwvdhA7ip3EzmIXsavYTewu9hB7ir3E3mIfsa/YT+wvDhAHioPEweIQcag4TBwujhBHiqPE0eIYcaw4ThwvThAnipPEyeIUcao4TZwuzhBnirPE2eIcca44T5wvLhAXiovExeIScam4TFwurhBXiqvE1eIaca24TlwvbhA3ipvEzeIWcau4Tdwu7hB3irvE3eIeca+4T9wvHhAPiofEw+IR8ah4TDwunhBPiqfE0+IZ8ax4TowXz4sJ4gXxonhJvCxeEa+K18Tr4g3xpnhLvC3eEe+K98T74gPxofhIfCw+EZ+Kz8Tn4gvxpfhKfC2+ERPFt+I78b34QfwofhI/i1/Er+I38bv4Q/wp/hJ/i0niH/Gv+E+Mk5JJyaUUUkoplZRaSiOlldJJ6aUMUkYpk5RZyiJllbJJ2aUcUk4JkVAJk3CJkEiJkmiJkViJk3hJkERJkmRJkVRJk3TJkIBkSpZkS1ByJFfyJF8KpFCKpJiUS8ot5ZHySvmk/FIBqaBUSCosFZGKSsWk4lIJqaRUSiotlZHKSuWk8lIFqaJUSaosVZGqStWk6lINqaZUS6ot1ZHqSvWk+lIDqaHUSGosNZGaSs2k5lILqaXUSmottZHaSu2k9lIHqaPUSeosdZG6St2k7lIPqafUS+ot9ZH6Sv2k/tIAaaA0SBosDZGGSsOk4dIIaaQ0ShotjZHGSuOk8dIEaaI0SZosTZGmStOk6dIMaaY0S5otzZHmSvOk+dICaaG0SFosLZGWSsuk5dIKaaW0SlotrZHWSuuk9dIGaaO0SdosbZG2Stuk7dIOaae0S9ot7ZH2Svuk/dIB6aB0SDosHZGOSsek49IJ6aR0SjotnZHOSuekeOm8lCBdkC5Kl6TL0hXpqnRNui7dkG5Kt6Tb0h3prnRPui89kB5Kj6TH0hPpqfRMei69kF5Kr6TX0hspUXorvZPeSx+kj9In6bP0RfoqfZO+Sz+kn9Iv6beUJP2R/kr/pDg5mZxcTiGnlFPJqeU0clo5nZxeziBnlDPJmeUsclY5m5xdziHnlBEZlTEZlwmZlCmZlhmZlTmZlwVZlCVZlhVZlTVZlw0ZyKZsybYMZUd2ZU/25UAO5UiOybnk3HIeOa+cT84vF5ALyoXkwnIRuahcTC4ul5BLyqXk0nIZuaxcTi4vV5ArypXkynIVuapcTa4u15BryrXk2nIdua5cT64vN5Abyo3kxnITuancTG4ut5Bbyq3k1nIbua3cTm4vd5A7yp3kznIXuavcTe4u95B7yr3k3nIfua/cT+4vD5AHyoPkwfIQeag8TB4uj5BHyqPk0fIYeaw8Th4vT5AnypPkyfIUeao8TZ4uz5BnyrPk2fIcea48T54vL5AXyovkxfISeam8TF4ur5BXyqvk1fIaea28Tl4vb5A3ypvkzfIWeau8Td4u75B3yrvk3fIeea+8T94vH5APyofkw/IR+ah8TD4un5BPyqfk0/IZ+ax8To6Xz8sJ8oX/V2dey2/kRPmt/E5+L3+QP8qf5M/yF/mr/E3+Lv+Qf8q/5N9ykvxH/iv/k+OUZEpyJYWSUkmlpFbSKGmVdEp6JYOSUcmkZFayKFnjkinZlRxKTgVRUAVTcIVQSIVSaIVRWIVTeEVQREVSZEVRVEVTdMVQgGIqlmIrUHEUV/EUXwmUUImUmJJLya3kUfIq+ZT8SgGloFJIKawUUYoqxZTiSgmlpFJKKa2UUcoq5ZTySgWlolJJqaxUUaoq1ZTqSg2lplJLqa3UUeoq9ZT6SgOlodJIaaw0UZoqzZTmSgulpdJKaa20Udoq7ZT2Sgelo9JJ6ax0Uboq3ZTuSg+lp9JL6a30Ufoq/ZT+ygBloDJIGawMUYYqw5ThyghlpDJKGa2MUcYq45Txyg
RlojJJmaxMUaYq05TpygxlpjJLma3MUeYq85T5ygJlobJIWawsUZYqy5TlygplpbJKWa2sUdYq65T1ygZlo7JJ2axsUbYq25Ttyg5lp7JL2a3sUfYq+5T9ygHloHJIOawcUY4qx5TjygnlpHJKOa2cUc4q55R45bySoFxQLiqXlMvKFeWqck25rtxQbiq3lNvKHeWuck+5rzxQHiqPlMfKE+Wp8kx5rrxQXiqvlNfKGyVReau8U94rH5SPyifls/JF+ap8U74rP5Sfyi/lt5Kk/FH+Kv+UODWZmlxNoaZUU6mp1TRqWjWdml7NoGZUM6mZ1SxqVjWbml3NoeZUERVVMRVXCZVUKZVWGZVVOZVXBVVUJVVWFVVVNVVXDRWopmqptgpVR3VVT/XVQA3VSI2pudTcah41r5pPza8WUAuqhdTCahG1qFpMLa6WUEuqpdTSahm1rFpOLa9WUCuqldTKahW1qlpNra7WUGuqtdTaah21rlpPra82UBuqjdTGahO1qdpMba62UFuqrdTWahu1rdpOba92UDuqndTOahe1q9pN7a72UHuqvdTeah+1r9pP7a8OUAeqg9TB6hB1qDpMHa6OUEeqo9TR6hh1rDpOHa9OUCeqk9TJ6hR1qjpNna7OUGeqs9TZ6hx1rjpPna8uUBeqi9TF6hJ1qbpMXa6uUFeqq9TV6hp1rbpOXa9uUDeqm9TN6hZ1q7pN3a7uUHequ9Td6h51r7pP3a8eUA+qh9TD6hH1qHpMPa6eUE+qp9TT6hn1rHpOjVfPqwnqBfWiekm9rF5Rr6rX1OvqDfWmeku9rd5R76r31PvqA/Wh+kh9rD5Rn6rP1OfqC/Wl+kp9rb5RE9W36jv1vfpB/ah+Uj+rX9Sv6jf1u/pD/an+Un+rSeof9a/6T43TkmnJtRRaSi2VllpLo6XV0mnptQxaRi2TllnLomXVsmnZtRxaTg3RUA3TcI3QSI3SaI3RWI3TeE3QRE3SZE3RVE3TdM3QgGZqlmZrUHM0V/M0Xwu0UIu0mJZLy63l0fJq+bT8WgGtoFZIK6wV0YpqxbTiWgmtpFZKK62V0cpq5bTyWgWtolZJq6xV0apq1bTqWg2tplZLq63V0epq9bT6WgOtodZIa6w10ZpqzbTmWgutpdZKa6210dpq7bT2Wgeto9ZJ66x10bpq3bTuWg+tp9ZL66310fpq/bT+2gBtoDZIG6wN0YZqw7Th2ghtpDZKG62N0cZq47Tx2gRtojZJm6xN0aZq07Tp2gxtpjZLm63N0eZq87T52gJtobZIW6wt0ZZqy7Tl2gptpbZKW62t0dZq67T12gZto7ZJ26xt0bZq27Tt2g5tp7ZL263t0fZq+7T92gHtoHZIO6wd0Y5qx7Tj2gntpHZKO62d0c5q57R47byWoF3QLmqXtMvaFe2qdk27rt3Qbmq3tNvaHe2udk+7rz3QHmqPtMfaE+2p9kx7rr3QXmqvtNfaGy1Re6u9095rH7SP2ifts/ZF+6p9075rP7Sf2i/tt5ak/dH+av+0OD2ZnlxPoafUU+mp9TR6Wj2dnl7PoGfUM+mZ9Sx6Vj2bnl3PoefUER3VMR3XCZ3UKZ3WGZ3VOZ3XBV3UJV3WFV3VNV3XDR3opm7ptg51R3d1T/f1QA/1SI/pufTceh49r55Pz68X0AvqhfTCehG9qF5ML66X0EvqpfTSehm9rF5OL69X0CvqlfTKehW9ql5Nr67X0GvqtfTaeh29rl5Pr6830BvqjfTGehO9qd5Mb6630FvqrfTWehu9rd5Ob6930DvqnfTOehe9q95N76730HvqvfTeeh+9r95P768P0Afqg/TB+hB9qD5MH66P0Efqo/TR+hh9rD5OH69P0Cfqk/TJ+hR9qj5Nn67P0Gfqs/TZyZLpc/V5+nx9gb5QX6Qv1pfoS/Vl+nJ9hb5SX6Wv1tfoa/V1+np9g75R36Rv1rfoW/Vt+nZ9h75T36Xv1vfoe/V9+n79gH5QP6Qf1o/oR/
Vj+nH9hH5SP6Wf1s/oZ/Vzerx+Xk/QL+gX9Uv6Zf2KflW/pl/Xb+g39Vv6bf2Ofle/p9/XH+gP9Uf6Y/2J/lR/pj/XX+gv9Vf6a/2Nnqi/1d/p7/UP+kf9k/5Z/6J/1b/p3/Uf+k/9l/5bT9L/6H/1f3qckcxIbqQwUhqpjNRGGiOtkc5Ib2QwMhqZjMxGFiOrkc3IbuQwchqIgRqYgRuEQRqUQRuMwRqcwRuCIRqSIRuKoRqaoRuGAQzTsAzbgIZjuIZn+EZghEZkxIxcRm4jj5HXyGfkNwoYBY1CRmGjiFHUKGYUN0oYJY1SRmmjjFHWKGeUNyoYFY1KRmWjilHVqGZUN2oYNY1aRm2jjlHXqGfUNxoYDY1GRmOjidHUaGY0N1oYLY1WRmujjdHWaGe0NzoYHY1ORmeji9HV6GZ0N3oYPY1eRm+jj9HX6Gf0NwYYA41BxmBjiDHUGGYMN0YYI41RxmhjjDHWGGeMNyYYE41JxmRjijHVmGZMN2YYM41ZxmxjjjHXmGfMNxYYC41FxmJjibHUWGYsN1YYK41VxmpjjbHWWGesNzYYG41NxmZji7HV2GZsN3YYO41dxm5jj7HX2GfsNw4YB41DxmHjiHHUOGYcN04YJ41TxmnjjHHWOGfEG+eNBOOCcdG4ZFw2rhhXjWvGdeOGcdO4Zdw27hh3jXvGfeOB8dB4ZDw2nhhPjWfGc+OF8dJ4Zbw23hiJxlvjnfHe+GB8ND4Zn40vxlfjm/Hd+GH8NH4Zv40k44/x1/hnxIFkIDlIAVKCVCA1SAPSgnQgPcgAMoJMIDPIArKCbCA7yAFyAgSgAAM4IAAJKEADBrCAAzwQgAgkIAMFqEADOjAAACawgA0gcIALPOCDAIQgAjGQC+QGeUBekA/kBwVAQVAIFAZFQFFQDBQHJUBJUAqUBmVAWVAOlAcVQEVQCVQGVUBVUA1UBzVATVAL1AZ1QF1QD9QHDUBD0Ag0Bk1AU9AMNActQEvQCrQGbUBb0A60Bx1AR9AJdAZdQFfQDXQHPUBP0Av0Bn1AX9AP9AcDwEAwCAwGQ8BQMAwMByPASDAKjAZjwFgwDowHE8BEMAlMBlPAVDANTAczwEwwC8wGc8BcMA/MBwvAQrAILAZLwFKwDCwHK8BKsAqsBmvAWrAOrAcbwEawCWwGW8BWsA1sBzvATrAL7AZ7wF6wD+wHB8BBcAgcBkfAUXAMHAcnwElwCpwGZ8BZcA7Eg/MgAVwAF8ElcBlcAVfBNXAd3AA3wS1wG9wBd8E9cB88AA/BI/AYPAFPwTPwHLwAL8Er8Bq8AYngLXgH3oMP4CP4BD6DL+Ar+Aa+gx/gJ/gFfoMk8Af8Bf9AnJnMTG6mMFOaqczUZhozrZnOTG9mMDOamczMZhYzq5nNzG7mMHOaiImamImbhEmalEmbjMmanMmbgimakimbiqmamqmbhglM07RM24SmY7qmZ/pmYIZmZMbMXGZuM4+Z18xn5jcLmAXNQmZhs4hZ1CxmFjdLmCXNUmZps4xZ1ixnljcrmBXNSmZls4pZ1axmVjdrmDXNWmZts45Z16xn1jcbmA3NRmZjs4nZ1GxmNjdbmC3NVmZrs43Z1mxntjc7mB3NTmZns4vZ1exmdjd7mD3NXmZvs4/Z1+xn9jcHmAPNQeZgc4g51BxmDjdHmCPNUeZoc4w51hxnjjcnmBPNSeZkc4o51ZxmTjdnmDPNWeZsc44515xnzjcXmAvNReZic4m51FxmLjdXmCvNVeZqc4251lxnrjc3mBvNTeZmc4u51dxmbjd3mDvNXeZuc4+519xn7jcPmAfNQ+Zh84h51DxmHjdPmCfNU+Zp84x51jxnxpvnzQTzgnnRvGReNq+YV81r5nXzhnnTvGXeNu+Yd8175n3zgfnQfGQ+Np+YT81n5nPzhfnSfGW+Nt+YieZb85353vxgfjQ/mZ/NL+ZX85v53fxh/jR/mb/NJPOP+df8Z8ZZyazkVgorpZXKSm2lsdJa6a
z0VgYro5XJymxlsbJa2azsVg4rp4VYqIVZuEVYpEVZtMVYrMVZvCVYoiVZsqVYqqVZumVYwDIty7ItaDnW/7/3N7IaW42tplYzq7mVN3nB5K2sVlYbq43VzmpndbA6Wp2szlYXq6vV1epu9bB6WL2s3lYfq6/Vz+pvDbAGWoOswdZga6g11BpuDbdGWiOt0dZoa6w11hpvjbcmWhOtydZka6o11ZpuTbdmWjOt2dZsa64115pvzbcWWgutxdZia6m11FpuLbdWWiut1dZqa6211lpvrbc2WhutzdZma6u11dpubbd2Wjut3dZua6+110oRt986aB20DluHraPWUeu4ddw6aZ20TlunrbPWWSveircSrATronXRumxdtq5aV63r1nXrpnXTum3dtu5ad6371n3rofXQemw9tp5az6zn1gvrpfXKem29sRKtt9Y76731wfpofbI+W1+sr9Y367v1w/pp/bJ+W0nWH+uv9c/6//L9omprtm4bNrBN27JtG9qO7dqe7duBHdqRHbNz2bntPHZeO5+d3y5gF7QL2YXtInZRu5hd3C5hl7RL2aXtMnZZu5xd3q5gV7Qr2ZXtKnZVu5pd3a5h17Rr2bXtOnZdu55d325gN7Qb2Y3tJnZTu5nd3G5ht7Rb2a3tNnZbu53d3u5gd7Q72Z3tLnZXu5vd3e5h97R72b3tPnZfu5/d3x5gD7QH2YPtIfZQe5g93B5hj7RH2aPtMfZYe5w93p5gT7Qn2ZPtKfZUe5o93Z5hz7Rn2bPtOfZce549315gL7QX2YvtJfZSe5m93F5hr7RX2avtNfZae5293t5gb7Q32ZvtLfZWe5u93d5h77R32bvtPfZee5+93z5gH7QP2YftI/ZR+5h93D5hn7RP2aftM/ZZ+5wdb5+3E+wL9kX7kn3ZvmJfta/Z1+0b9k37ln3bvmPfte/Z9+0H9kP7kf3YfmI/tZ/Zz+0X9kv7lf3afmMn2m/td/Z7+4P90f5kf7a/2F/tb/Z3+4f90/5l/7aT7D/2X/ufHQeTweQwBUwJU8HUMA1MC9PB9DADzAgzwcwwC8wKs8HsMAfMCRGIQgzikIAkpCANGchCDvJQgCKUoAwVqEIN6tCAAJrQgjaE0IEu9KAPAxjCCMZgLpgb5oF5YT6YHxaABWEhWBgWgUVhMVgcloAlYSlYGpaBZWE5WB5WgBVhJVgZVoFVYTVYHdaANWEtWBvWgXVhPVgfNoANYSPYGDaBTWEz2By2gC1hK9gatoFtYTvYHnaAHWEn2Bl2gV1hN9gd9oA9YS/YG/aBfWE/2B8OgAPhIDgYDoFD4TA4HI6AI+EoOBqOgWPhODgeToAT4SQ4GU6BU+E0OB3OgDPhLDgbzoFz4Tw4Hy6AC+EiuBgugUvhMrgcroAr4Sq4Gq6Ba+E6uB5ugBvhJrgZboFb4Ta4He6AO+EuuBvugXvhPrgfHoAH4SF4GB6BR+ExeByegCfhKXganoFn4TkYD8/DBHgBXoSX4GV4BV6F1+B1eAPehLfgbXgH3oX34H34AD6Ej+Bj+AQ+hc/gc/gCvoSv4Gv4BibCt/AdfA8/wI/wE/wMv8Cv8Bv8Dn/An/AX/A2T4B/4F/6DcU4yJ7mTwknppHJSO2mctE46J72TwcnoZHIyO1mcrE42J7uTw8npIA7qYA7uEA7pUA7tMA7rcA7vCI7oSI7sKI7qaI7uGA5wTMdybAc6juM6nuM7gRM6kRNzcjm5nTxOXiefk98p4BR0CjmFnSJOUaeYU9wp4ZR0SjmlnTJOWaecU96p4FR0KjmVnSpOVaeaU92p4dR0ajm1nTpOXaeeU99p4DR0GjmNnSZOU6eZ09xp4bR0WjmtnTZOW6ed097p4HR0OjmdnS5OV6eb093p4fR0ejm9nT5OX6ef098Z4Ax0BjmDnSHOUGeYM9wZ4Yx0RjmjnTHOWGecM96Z4Ex0JjmT/8//G5wZzkxnljPbmePMdeY5850FzkJnkbPYWeIsdZY5y5
0VzkpnlbPaWeOsddY5650NzkZnk7PZ2eJsdbY5250dzk5nl7Pb2ePsdfY5+50DzkHnkHPYOeIcdY45x50TzknnlHPaOeOcdc458c55J8G54Fx0LjmXnSvOVeeac9254dx0bjm3nTvOXeeec9954Dx0HjmPnSfOU+eZ89x54bx0XjmvnTdOovPWeee8dz44H51Pzmfni/PV+eZ8d344P51fzm8nyfnj/HX+OXFuMje5m8JN6aZyU7tp3LRuOje9m8HN6GZyM7tZ3KxuNje7m8PN6SIu6mIu7hIu6VIu7TIu63Iu7wqu6Equ7Cqu6mqu7houcE3Xcm0Xuo7rup7ru4EbupEbc3O5ud08bl43n5vfLeAWdAu5hd0iblG3mFvcLeGWdEu5pd0yblm3nFvereBWdCu5ld0qblW3mlvdreHWdGu5td06bl23nlvfbeA2dBu5jd0mblO3mdvcbeG2dFu5rd02blu3ndve7eB2dDu5nd0uble3m9vd7eH2dHu5vd0+bl+3n9vfHeAOdAe5g90h7lB3mDvcHeGOdEe5o90x7lh3nDveneBOdCe5k90p7lR3mjvdneHOdGe5s9057lx3njvfXeAudBe5i90l7lJ3mbvcXeGudFe5q9017lp3nbve3eBudDe5m90t7lZ3m7vd3eHudHe5u9097l53n7vfPeAedA+5h90j7lH3mHvcPeGedE+5p90z7ln3nBvvnncT3AvuRfeSe9m94l51r7nX3RvuTfeWe9u9495177n33QfuQ/eR+9h94j51n7nP3RfuS/eV+9p94ya6b9137nv3g/vR/eR+dr+4X91v7nf3h/vT/eX+dpPcP+5f958b5yXzknspvJReKi+1l8ZL66Xz0nsZvIxeJi+zl8XL6mXzsns5vJwe4qEe5uEe4ZEe5dEe47Ee5/Ge4Ime5Mme4qme5ume4QHP9CzP9qDneK7neb4XeKEXeTEvl5fby+Pl9fJ5+b0CXkGvkFfYK+IV9Yp5xb0SXkmvlFfaK+OV9cp55b0KXkWvklfZq+JV9ap51b0aXk2vllfbq+PV9ep59b0GXkOvkdfYa+I19Zp5zb0WXkuvldfaa+O19dp57b0OXkevk9fZ6+J19bp53b0eXk+vl9fb6+P19fp5/b0B3kBvkDfYG+IN9YZ5w70R3khvlDfaG+ON9cZ5470J3kRvkjfZm+JN9aZ5070Z3kxvljfbm+PN9eZ5870F3kJvkbfYW+It9ZZ5y70V3kpvlbfaW+Ot9dZ5670N3kZvk7fZ2+Jt9bZ5270d3k5vl7fb2+Pt9fZ5+70D3kHvkHfYO+Id9Y55x70T3knvlHfaO+Od9c558d55L8G74F30LnmXvSveVe+ad9274d30bnm3vTveXe+ed9974D30HnmPvSfeU++Z99x74b30XnmvvTdeovfWe+e99z54H71P3mfvi/fV++Z99354P71f3m8vyfvj/fX+eXF+Mj+5n8JP6afyU/tp/LR+Oj+9n8HP6GfyM/tZ/Kx+Nj+7n8PP6SM+6mM+7hM+6VM+7TM+63M+7wu+6Eu+7Cu+6mu+7hs+8E3f8m0f+o7v+p7v+4Ef+pEf83P5uf08fl4/n5/fL+AX9Av5hf0iflG/mF/cL+GX9Ev5pf0yflm/nF/er+BX9Cv5lf0qflW/ml/dr+HX9Gv5tf06fl2/nl/fb+A39Bv5jf0mflO/md/cb+G39Fv5rf02flu/nd/e7+B39Dv5nf0ufle/m9/d7+H39Hv5vf0+fl+/n9/fH+AP9Af5g/0h/lB/mD/cH+GP9Ef5o/0x/lh/nD/en+BP9Cf5k/0p/lR/mj/dn+HP9Gf5s/05/lx/nj/fX+Av9Bf5i/0l/lJ/mb/cX+Gv9Ff5q/01/lp/nb/e3+Bv9Df5m/0t/lZ/m7/d3+Hv9Hf5u/09/l5/n7/fP+Af9A/5h/0j/lH/mH/cP+Gf9E/5p/0z/ln/nB/vn/cT/Av+Rf+Sf9m/4l/1r/nX/Rv+Tf+Wf9
u/49/17/n3/Qf+Q/+R/9h/4j/1n/nP/Rf+S/+V/9p/4yf6b/13/nv/g//R/+R/9r/4X/1v/nf/h//T/+X/9pP8P/5f/58fFyQLkgcpgpRBqiB1kCZIG6QL0gcZgoxBpiBzkCXIGmQLsgc5gpwBEqABFuABEZABFdABE7ABF/CBEIiBFMiBEqiBFuiBEYDADKzADmDgBG7gBX4QBGEQBbEgV5A7yBPkDfIF+YMCQcGgUFA4KBIUDYoFxYMSQcmgVFA6KBOUDcoF5YMKQcWgUlA5qBJUDaoF1YMaQc2gVlA7qBPUDeoF9YMGQcOgUdA4aBI0DZoFzYMWQcugVdA6aBO0DdoF7YMOQcegU9A56BJ0DboF3YMeQc+gV9A76BP0DfoF/YMBwcBgUDA4GBIMDYYFw4MRwchgVDA6GBOMDcYF44MJwcRgUjA5mBJMDaYF04MZwcxgVjA7mBPMDeYF84MFwcJgUbA4WBIsDZYFy4MVwcpgVbA6WBOsDdYF64MNwcZgU7A52BJsDbYF24Mdwc5gV7A72BPsDfYF+4MDwcHgUHA4OBIcDY4Fx4MTwcngVHA6OBOcDc4F8cH5ICG4EFwMLgWXgyvB1eBacD24EdwMbgW3gzvB3eBecD94EDwMHgWPgyfB0+BZ8Dx4EbwMXgWvgzdBYvA2eBe8Dz4EH4NPwefgS/A1+BZ8D34EP4Nfwe8gKfgT/A3+BXFhsjB5mCJMGaYKU4dpwrRhujB9mCHMGGYKM4dZwqxhtjB7mCPMGSIhGmIhHhIhGVIhHTIhG3IhHwqhGEqhHCqhGmqhHhohCM3QCu0Qhk7ohl7oh0EYhlEYC3OFucM8Yd4wX5g/LBAWDAuFhcMiYdGwWFg8LBGWDEuFpcMyYdmwXFg+rBBWDCuFlcMqYdWwWlg9rBHWDGuFtcM6Yd2wXlg/bBA2DBuFjcMmYdOwWdg8bBG2DFuFrcM2YduwXdg+7BB2DDuFncMuYdewW9g97BH2DHuFvcM+Yd+wX9g/HBAODAeFg8Mh4dBwWDg8HBGODEeFo8Mx4dhwXDg+nBBODCeFk8Mp4dRwWjg9nBHODGeFs8M54dxwXjg/XBAuDBeFi8Ml4dJwWbg8XBGuDFeFq8M14dpwXbg+3BBuDDeFm8Mt4dZwW7g93BHuDHeFu8M94d5wX7g/PBAeDA+Fh8Mj4dHwWHg8PBGeDE+Fp8Mz4dnwXBgfng8TwgvhxfBSeDm8El4Nr4XXwxvhzfBWeDu8E94N74X3wwfhw/BR+Dh8Ej4Nn4XPwxfhy/BV+Dp8EyaGb8N34fvwQ/gx/BR+Dr+EX8Nv4ffwR/gz/BX+DpPCP+Hf8F8YFyWLkkcpopRRqih1lCZKG6WL0kcZooxRpihzlCXKGmWLskc5opwREqERFuEREZERFdERE7ERF/GREImRFMmREqmRFumREYHIjKzIjmDkRG7kRX4URGEURbEoV5Q7yhPljfJF+aMCUcGoUFQ4KhIVjYpFxaMSUcmoVFQ6KhOVjcpF5aMKUcWoUlQ5qhJVjapF1aMaUc2oVlQ7qhPVjepF9aMGUcOoUdQ4ahI1jZpFzaMWUcuoVdQ6ahO1jdpF7aMOUceoU9Q56hJ1jbpF3aMeUc+oV9Q76hP1jfpF/aMB0cBoUDQ4GhINjYZFw6MR0choVDQ6GhONjcZF46MJ0cRoUjQ5mhJNjaZF06MZ0cxoVjQ7mhPNjeZF86MF0cJoUbQ4WhItjZZFy6MV0cpoVbQ6WhOtjdZF66MN0cZoU7Q52hJtjbZF26Md0c5oV7Q72hPtjfZF+6MD0cHoUHQ4OhIdjY5Fx6MT0cnoVHQ6OhOdjc5F8dH5KCG6EF2MLkWXoyvR1ehadD26Ed2MbkW3ozvR3ehedD96ED2MHkWPoyfR0+hZ9Dx6Eb2MXkWvozdRYvQ2ehe9jz5EH6NP0efoS/Q1+hZ9j35EP6Nf0e8oKfoT/Y3+RXGxZLHksRSxlLFUsdSxNLG0sXSx9LEMsYyxTLHMsS
yxrLFsseyxHLGcMSSGxrAYHiNiZIyK0TEmxsa4GB8TYmJMiskxJabGtJgeM2IgZsasmB2DMSfmxryYHwtiYSyKxWK5YrljeWJ5Y/li+WMFYgVjhWKFY0ViRWPFYsVjJWIlY6VipWNlYmVj5WLlYxViFWOVYpVjVWJVY9Vi1WM1YjVjtWL/BwHwAJj1EgAAfHHZtm38bVt392Xbtm3btutl27Zbto3t/WpjdbC6WD2sPtYAa4g1whpjTbCmWDOsOdYCa4m1wlpjbbC2WDusPdYB64h1wjpjXbCuWDesO9YD64n1wnpjfbC+WD+sPzYAG4gNwgZjQ7Ch2DBsODYCG4mNwkZjY7Cx2DhsPDYBm4hNwiZjU7Cp2DRsOjYDm4nNwmZjc7C52DxsPrYAW4gtwhZjS7Cl2DJsObYCW4mtwlZja7C12DpsPbYB24htwjZjW7Ct2DbsP2w7tgPbie3CdmN7sL3YPmw/dgA7iB3CDmNHsKPYMew4dgI7iZ3CTmNnsLPYOew8dgG7iF3CLmNXsKvYNew6dgO7id3CbmN3sLvYPew+9gB7iD3CErDH2BPsKfYMe469wF5ir7DX2BvsLfYOe499wD5in7DP2BfsK/YN+479wH5iv7Df2B/sL/YPS8SSsDg8GZ4cT4GnxFPhqfF4PA2eFk+Hp8cz4BnxTHhmPAueFc+GZ8dz4DnxXHhuPA+eF8+H58cL4AXxQnhhvAheFC+GF8dL4CXxUnhpvAxeFi+Hl8cr4BXxSnhlvApeFa+GV8dr4DVxDMdxAidxCqdxBmdxDudxARdxCZdxBVdxDddxAzdxC7dxB3dxD/fxAA/xCAc4xBEew2vhtfE6eF28Hl4fb4A3xBvhjfEmeFO8Gd4cb4G3xFvhrfE2eFu8Hd4e74B3xDvhnfEueFe8G94d74H3xHvhvfE+eF+8H94fH4APxAfhg/Eh+FB8GD4cH4GPxEfho/Ex+Fh8HD4en4BPxCfhk/Ep+FR8Gj4dn4HPxGfhs/E5+Fx8Hj4fX4AvxBfhi/El+FJ8Gb4cX4GvxFfhq/E1+Fp8Hb4e34BvxDfhm/Et+FZ8G/4fvh3fge/Ed+G78T34Xnwfvh8/gB/ED+GH8SP4UfwYfhw/gZ/ET+Gn8TP4Wfwcfh6/gF/EL+GX8Sv4Vfwafh2/gd/Eb+G38Tv4Xfwefh9/gD/EH+EJ+GP8Cf4Uf4Y/x1/gL/FX+Gv8Df4Wf4e/xz/gH/FP+Gf8C/4V/4Z/x3/gP/Ff+G/8D/4X/4cn4kl4HJGMSE6kIFISqYjURDyRhkhLpCPSExmIjEQmIjORhchKZCOyEzmInEQuIjeRh8hL5CPyEwWIgkQhojBRhChKFCOKEyWIkkQpojRRhihLlCPKExWIikQlojJRhahKVCOqEzWImgRG4ARBkARF0ARDsARH8IRAiIREyIRCqIRG6IRBmIRF2IRDuIRH+ERAhEREAAISiIgRtYjaRB2iLlGPqE80IBoSjYjGRBOiKdGMaE60IFoSrYjWRBuiLdGOaE90IDoSnYjORBeiK9GN6E70IHoSvYjeRB+iL9GP6E8MIAYSg4jBxBBiKDGMGE6MIEYSo4jRxBhiLDGOGE9MICYSk4jJxBRiKjGNmE7MIGYSs4jZxBxiLjGPmE8sIBYSi4jFxBJiKbGMWE6sIFYSq4jVxBpiLbGOWE9sIDYSm4jNxBZiK7GN+I/YTuwgdhK7iN3EHmIvsY/YTxwgDhKHiMPEEeIocYw4TpwgThKniNPEGeIscY44T1wgLhKXiMvEFeIqcY24TtwgbhK3iNvEHeIucY+4TzwgHhKPiATiMfGEeEo8I54TL4iXxCviNfGGeEu8I94TH4iPxCfiM/GF+Ep8I74TP4ifxC/iN/GH+Ev8IxKJJCKOTEYmJ1OQKclUZGoynkxDpiXTkenJDGRGMhOZmcxCZiWzkdnJHGROMheZm8xD5iXzkfnJAmRBshBZmCxCFiWLkcXJEmRJshRZmixDliXLkeXJCm
RFshJZmaxCViWrkdXJGmRNEiNxkiBJkiJpkiFZkiN5UiBFUiJlUiFVUiN10iBN0iJt0iFd0iN9MiBDMiIBCUlExshaZG2yDlmXrEfWJxuQDclGZGOyCdmUbEY2J1uQLclWZGuyDdmWbEe2JzuQHclOZGeyC9mV7EZ2J3uQPcleZG+yD9mX7Ef2JweQA8lB5GByCDmUHEYOJ0eQI8lR5GhyDDmWHEeOJyeQE8lJ5GRyCjmVnEZOJ2eQM8lZ5GxyDjmXnEfOJxeQC8lF5GJyCbmUXEYuJ1eQK8lV5GpyDbmWXEeuJzeQG8lN5GZyC7mV3Eb+R24nd5A7yV3kbnIPuZfcR+4nD5AHyUPkYfIIeZQ8Rh4nT5AnyVPkafIMeZY8R54nL5AXyUvkZfIKeZW8Rl4nb5A3yVvkbfIOeZe8R94nH5APyUdkAvmYfEI+JZ+Rz8kX5EvyFfmafEO+Jd+R78kP5EfyE/mZ/EJ+Jb+R38kf5E/yF/mb/EP+Jf+RiWQSGUclo5JTKaiUVCoqNRVPpaHSUumo9FQGKiOVicpMZaGyUtmo7FQOKieVi8pN5aHyUvmo/FQBqiBViCpMFaGKUsWo4lQJqiRViipNlaHKUuWo8lQFqiJViapMVaGqUtWo6lQNqiaFUThFUCRFUTTFUCzFUTwlUCIlUTKlUCqlUTplUCZlUTblUC7lUT4VUCEVUYCCFKJiVC2qNlWHqkvVo+pTDaiGVCOqMdWEako1o5pTLaiWVCuqNdWGaku1o9pTHaiOVCeqM9WF6kp1o7pTPaieVC+qN9WH6kv1o/pTA6iB1CBqMDWEGkoNo4ZTI6iR1ChqNDWGGkuNo8ZTE6iJ1CRqMjWFmkpNo6ZTM6iZ1CxqNjWHmkvNo+ZTC6iF1CJqMbWEWkoto5ZTK6iV1CpqNbWGWkuto9ZTG6iN1CZqM7WF2kpto/6jtlM7qJ3ULmo3tYfaS+2j9lMHqIPUIeowdYQ6Sh2jjlMnqJPUKeo0dYY6S52jzlMXqIvUJeoydYW6Sl2jrlM3qJvULeo2dYe6S92j7lMPqIfUIyqBekw9oZ5Sz6jn1AvqJfWKek29od5S76j31AfqI/WJ+kx9ob5S36jv1A/qJ/WL+k39of5S/6hEKomKo5PRyekUdEo6FZ2ajqfT0GnpdHR6OgOdkc5EZ6az0FnpbHR2Ogedk85F56bz0HnpfHR+ugBdkC5EF6aL0EXpYnRxugRdki5Fl6bL0GXpcnR5ugJdka5EV6ar0FXpanR1ugZdk8ZonCZokqZommZoluZonhZokZZomVZoldZonTZok7Zom3Zol/Zonw7okI5oQEMa0TG6Fl2brkPXpevR9ekGdEO6Ed2YbkI3pZvRzekWdEu6Fd2abkO3pdvR7ekOdEe6E92Z7kJ3pbvR3ekedE+6F92b7kP3pfvR/ekB9EB6ED2YHkIPpYfRw+kR9Eh6FD2aHkOPpcfR4+kJ9ER6Ej2ZnkJPpafR0+kZ9Ex6Fj2bnkPPpefR8+kF9EJ6Eb2YXkIvpZfRy+kV9Ep6Fb2aXkOvpdfR6+kN9EZ6E72Z3kJvpbfR/9Hb6R30TnoXvZveQ++l99H76QP0QfoQfZg+Qh+lj9HH6RP0SfoUfZo+Q5+lz9Hn6Qv0RfoSfZm+Ql+lr9HX6Rv0TfoWfZu+Q9+l79H36Qf0Q/oRnUA/pp/QT+ln9HP6Bf2SfkW/pt/Qb+l39Hv6A/2R/kR/pr/QX+lv9Hf6B/2T/kX/pv/Qf+l/dCKdRMcxyZjkTAomJZOKSc3EM2mYtEw6Jj2TgcnIZGIyM1mYrEw2JjuTg8nJ5GJyM3mYvEw+Jj9TgCnIFGIKM0WYokwxpjhTginJlGJKM2WYskw5pjxTganIVGIqM1WYqkw1pjpTg6nJYAzOEAzJUAzNMAzLcAzPCIzISIzMKIzKaIzOGIzJWIzNOIzLeIzPBEzIRAxgIIOYGFOLqc3UYeoy9Zj6TAOmIdOIacw0YZoyzZjmTAumJdOKac
20Ydoy7Zj2TAemI9OJ6cx0Yboy3ZjuTA+mJ9OL6c30Yfoy/Zj+zABmIDOIGcwMYYYyw5jhzAhmJDOKGc2MYcYy45jxzARmIjOJmcxMYaYy05jpzAxmJjOLmc3MYeYy85j5zAJmIbOIWcwsYZYyy5jlzApmJbOKWc2sYdYy65j1zAZmI7OJ2cxsYbYy25j/mO3MDmYns4vZzexh9jL7mP3MAeYgc4g5zBxhjjLHmOPMCeYkc4o5zZxhzjLnmPPMBeYic4m5zFxhrjLXmOvMDeYmc4u5zdxh7jL3mPvMA+Yh84hJYB4zT5inzDPmOfOCecm8Yl4zb5i3zDvmPfOB+ch8Yj4zX5ivzDfmO/OD+cn8Yn4zf5i/zD8mkUli4thkbHI2BZuSTcWmZuPZNGxaNh2bns3AZmQzsZnZLGxWNhubnc3B5mRzsbnZPGxeNh+bny3AFmQLsYXZImxRthhbnC3BlmRLsaXZMmxZthxbnq3AVmQrsZXZKmxVthpbna3B1mQxFmcJlmQplmYZlmU5lmcFVmQlVmYVVmU1VmcN1mQt1mYd1mU91mcDNmQjFrCQRWyMrcXWZuuwddl6bH22AduQbcQ2ZpuwTdlmbHO2BduSbcW2Ztuwbdl2bHu2A9uR7cR2ZruwXdlubHe2B9uT7cX2Zvuwfdl+bH92ADuQHcQOZoewQ9lh7HB2BDuSHcWOZsewY9lx7Hh2AjuRncROZqewU9lp7HR2BjuTncXOZuewc9l57Hx2AbuQXcQuZpewS9ll7HJ2BbuSXcWuZtewa9l17Hp2A7uR3cRuZrewW9lt7H/sdnYHu5Pdxe5m97B72X3sfvYAe5A9xB5mj7BH2WPscfYEe5I9xZ5mz7Bn2XPsefYCe5G9xF5mr7BX2WvsdfYGe5O9xd5m77B32XvsffYB+5B9xCawj9kn7FP2GfucfcG+ZF+xr9k37Fv2Hfue/cB+ZD+xn9kv7Ff2G/ud/cH+ZH+xv9k/7F/2H5vIJrFxXDIuOZeCS8ml4lJz8VwaLi2XjkvPZeAycpm4zFwWLiuXjcvO5eBycrm43FweLi+Xj8vPFeAKcoW4wlwRrihXjCvOleBKcqW40lwZrixXjivPVeAqcpW4ylwVripXjavO1eBqchiHcwRHchRHcwzHchzHcwInchIncwqnchqncwZnchZncw7nch7ncwEXchEHOMghLsbV4mpzdbi6XD2uPteAa8g14hpzTbimXDOuOdeCa8m14lpzbbi2XDuuPdeB68h14jpzXbiuXDeuO9eD68n14npzfbi+XD+uPzeAG8gN4gZzQ7ih3DBuODeCG8mN4kZzY7ix3DhuPDeBm8hN4iZzU7ip3DRuOjeDm8nN4mZzc7i53DxuPreAW8gt4hZzS7il3DJuObeCW8mt4lZza7i13DpuPbeB28ht4jZzW7it3DbuP247t4Pbye3idnN7uL3cPm4/d4A7yB3iDnNHuKPcMe44d4I7yZ3iTnNnuLPcOe48d4G7yF3iLnNXuKvcNe46d4O7yd3ibnN3uLvcPe4+94B7yD3iErjH3BPuKfeMe8694F5yr7jX3BvuLfeOe8994D5yn7jP3BfuK/eN+8794H5yv7jf3B/uL/ePS+SSuDg+GZ+cT8Gn5FPxqfl4Pg2flk/Hp+cz8Bn5THxmPguflc/GZ+dz8Dn5XHxuPg+fl8/H5+cL8AX5QnxhvghflC/GF+dL8CX5Unxpvgxfli/Hl+cr8BX5Snxlvgpfla/GV+dr8DV5jMd5gid5iqd5hmd5jud5gRd5iZd5hVd5jdd5gzd5i7d5h3d5j/f5gA/5iAc85BEf42vxtfk6fF2+Hl+fb8A35BvxjfkmfFO+Gd+cb8G35Fvxrfk2fFu+Hd+e78B35DvxnfkufFe+G9+d78H35Hvxvfk+fF++H9+fH8AP5Afxg/kh/FB+GD+cH8GP5Efxo/kx/Fh+HD+en8BP5Cfxk/kp/FR+Gj+dn8HP5Gfxs/k5/Fx+Hj+fX8
Av5Bfxi/kl/FJ+Gb+cX8Gv5Ffxq/k1/Fp+Hb+e38Bv5Dfxm/kt/FZ+G58Qt53fwe/kd/G7+T38Xn4fv58/wB/kD/GH+SP8Uf4Yf5w/wZ/kT/Gn+TP8Wf4cf56/wF/kL/GX+Sv8Vf4af52/wd/kb/G3+Tv8Xf4ef59/wD/kH/EJ/GP+Cf+Uf8Y/51/wL/lX/Gv+Df+Wf8e/5z/wH/lP/Gf+C/+V/8Z/53/wP/lf/G/+D/+X/8cn8kl8nJBMSC6kEFIKqYTUQryQRkgrpBPSCxmEjEImIbOQRcgqZBOyCzmEnEIuIbeQR8gr5BPyCwWEgkIhobBQRCgqFBOKCyWEkkIpobRQRigrlBPKCxWEikIlobJQRagqVBOqCzWEmgIm4AIhkAIl0AIjsAIn8IIgiIIkyIIiqIIm6IIhmIIl2IIjuIIn+EIghEIkAAEKSIgJtYTaQh2hrlBPqC80EBoKjYTGQhOhqdBMaC60EFoKrYTWQhuhrdBOaC90EDoKnYTOQhehq9BN6C70EHoKvYTeQh+hr9BP6C8MEAYKg4TBwhBhqDBMGC6MEEYKo4TRwhhhrDBOGC9MECYKk4TJwhRhqjBNmC7MEGYKs4TZwhxhrjBPmC8sEBYKi4TFwhJhqbBMWC6sEFYKq4TVwhphrbBOWC9sEDYKm4TNwhZhq7BN+E/YLuwQdgq7hN3CHmGvsE/YLxwQDgqHhMPCEeGocEw4LpwQTgqnhNPCGeGscE44L1wQLgqXhMvCFeGqcE24LtwQbgq3hNvCHeGucE+4LzwQHgqPhAThsfBEeCo8E54LL4SXwivhtfBGeCu8E94LH4SPwifhs/BF+Cp8E74LP4Sfwi/ht/BH+Cv8ExKFJCFOTCYmF1OIKcVUYmoxXkwjphXTienFDGJGMZOYWcwiZhWzidnFHGJOMZeYW8wj5hXzifnFAmJBsZBYWCwiFhWLicXFEmJJsZRYWiwjlhXLieXFCmJFsZJYWawiVhWridXFGmJNERNxkRBJkRJpkRFZkRN5URBFURJlURFVURN10RBN0RJt0RFd0RN9MRBDMRKBCEUkxsRaYm2xjlhXrCfWFxuIDcVGYmOxidhUbCY2F1uILcVWYmuxjdhWbCe2FzuIHcVOYmexi9hV7CZ2F3uIPcVeYm+xj9hX7Cf2FweIA8VB4mBxiDhUHCYOF0eII8VR4mhxjDhWHCeOFyeIE8VJ4mRxijhVnCZOF2eIM8VZ4mxxjjhXnCfOFxeIC8VF4mJxibhUXCYuF1eIK8VV4mpxjbhWXCeuFzeIG8VN4mZxi7hV3Cb+J24Xd4g7xV3ibnGPuFfcJ+4XD4gHxUPiYfGIeFQ8Jh4XT4gnxVPiafGMeFY8J54XL4hJ4iXxsnhFvCpeE6+LN8Sb4i3xtnhHvCveE++LD8SH4iMxQXwsPhGfis/E5+IL8aX4SnwtvhHfiu/E9+IH8aP4SfwsfhG/it/E7+IP8af4S/wt/hH/iv/ERDFJjJOSScmlFFJKKZWUWoqX0khppXRSeimDlFHKJGWWskhZpWxSdimHlFPKJeWW8kh5pXxSfqmAVFAqJBWWikhFpWJScamEVFIqJZWWykhlpXJSeamCVFGqJFWWqkhVpWpSdamGVFPCJFwiJFKiJFpiJFbiJF4SJFGSJFlSJFXSJF0yJFOyJFtyJFfyJF8KpFCKJCBBCUkxqZZUW6oj1ZXqSfWlBlJDqZHUWGoiNZWaSc2lFlJLqZXUWmojtZXaSe2lDlJHqZPUWeoidZW6Sd2lHlJPqZfUW+oj9ZX6Sf2lAdJAaZA0WBoiDZWGScOlEdJIaZQ0WhojjZXGSeOlCdJEaZI0WZoiTZWmSdOlGdJMaZY0W5ojzZXmSfOlBdJCaZG0WFoiLZWWSculFdJKaZW0WlojrZXWSeulDdJGaZO0WdoibZW2Sf9J26Ud0k5pl7Rb2iPtlfZJ+6UD0kHpkHRYOiIdlY5Jx6UT0knplHRaOiOdlc5J56UL0kXpknRZui
Jdla5J16Ub0k3plnRbuiPdle5J96UH0kPpkZQgPZaeSE+lZ9Jz6YX0UnolvZbeSG+ld9J76YP0UfokfZa+SF+lb9J36Yf0U/ol/Zb+SH+lf1KilCTFycnk5HIKOaWcSk4tx8tp5LRyOjm9nEHOKGeSM8tZ5KxyNjm7nEPOKeeSc8t55LxyPjm/XEAuKBeSC8tF5KJyMbm4XEIuKZeSS8tl5LJyObm8XEGuKFeSK8tV5KpyNbm6XEOuKWMyLhMyKVMyLTMyK3MyLwuyKEuyLCuyKmuyLhuyKVuyLTuyK3uyLwdyKEcykKGM5JhcS64t15HryvXk+nIDuaHcSG4sN5Gbys3k5nILuaXcSm4tt5Hbyu3k9nIHuaPcSe4sd5G7yt3k7nIPuafcS+4t95H7yv3k/vIAeaA8SB4sD5GHysPk4fIIeaQ8Sh4tj5HHyuPk8fIEeaI8SZ4sT5GnytPk6fIMeaY8S54tz5HnyvPk+fICeaG8SF4sL5GXysvk5fIKeaW8Sl4tr5HXyuvk9fIGeaO8Sd4sb5G3ytvk/+Tt8g55p7xL3i3vkffK++T98gH5oHxIPiwfkY/Kx+Tj8gn5pHxKPi2fkc/K5+Tz8gX5onxJvixfka/K1+Tr8g35pnxLvi3fke/K9+T78gP5ofxITpAfy0/kp/Iz+bn8Qn4pv5Jfy2/kt/I7+b38Qf4of5I/y1/kr/I3+bv8Q/4p/5J/y3/kv/I/OVFOkuOUZEpyJYWSUkmlpFbilTRKWiWdkl7JoGRUMimZlSxKViWbkl3JoeRUcim5lTxKXiWfkl8poBRUCimFlSJKUaWYUlwpoZRUSimllTJKWaWcUl6poFRUKimVlSpKVaWaUl2podRUMAVXCIVUKIVWGIVVOIVXBEVUJEVWFEVVNEVX4uPi4izFVhzFVTzFVwIlVCIFKFBBSkyppdRW6ih1lXpKfaWB0lBppDRWmihNlWZKc6WF0lJppbRW2ihtlXZKe6WD0lHppHRWuihdlW5Kd6WH0lPppfRW+ih9lX5Kf2WAMlAZpAxWhihDlWHKcGWEMlIZpYxWxihjlXHKeGWCMlGZpExWpihTlWnKdGWGMlOZpcxW5ihzlXnKfGWBslBZpCxWlihLlWXKcmWFslJZpaxW1ihrlXXKemWDslHZpGxWtihblW3Kf8p2ZYeyU9ml7Fb2KHuVfcp+5YByUDmkHFaOKEeVY8px5YRyUjmlnFbOKGeVc8p55YJyUbmkXFauKFeVa8p15YZyU7ml3FbuKHeVe8p95YHyUHmkJCiPlSfKU+WZ8lx5obxUXimvlTfKW+Wd8l75oHxUPimflS/KV+Wb8l35ofxUfim/lT/KX+WfkqgkKXFqMjW5mkJNqaZSU6vxaho1rZpOTa9mUDOqmdTMahY1q5pNza7mUHOqudTcah41r5pPza8WUAuqhdTCahG1qFpMLa6WUEuqpdTSahm1rFpOLa9WUCuqldTKahW1qlpNra7WUGuqmIqrhEqqlEqrjMqqnMqrgiqqkiqriqqqmqqrhmqqlmqrjuqqnuqrgRqqkQpUqCI1ptZSa6t11LpqPbW+2kBtqDZSG6tN1KZqM7W52kJtqbZSW6tt1LZqO7W92kHtqHZSO6td1K5qN7W72kPtqfZSe6t91L5qP7W/OkAdqA5SB6tD1KHqMHW4OkIdqY5SR6tj1LHqOHW8OkGdqE5SJ6tT1KnqNHW6OkOdqc5SZ6tz1LnqPHW+ukBdqC5SF6tL1KXqMnW5ukJdqa5SV6tr1LXqOnW9ukHdqG5SN6tb1K3qNvU/dbu6Q92p7lJ3q3vUveo+db96QD2oHlIPq0fUo+ox9bh6Qj2pnlJPq2fUs+o59bx6Qb2oXlIvq1fUq+o19bp6Q72p3lJvq3fUu+o99b76QH2oPlIT1MfqE/Wp+kx9rr5QX6qv1NfqG/Wt+k59r35QP6qf1M/qF/Wr+k39rv5Qf6q/1N/qH/Wv+k9NVJPUOC2ZllxLoaXUUmmptX
gtjZZWS6el1zJoGbVMWmYti5ZVy6Zl13JoObVcWm4tj5ZXy6fl1wpoBbVCWmGtiFZUK6YV10poJbVSWmmtjFZWK6eV1ypoFbVKWmWtilZVq6ZV12poNTVMwzVCIzVKozVGYzVO4zVBEzVJkzVFUzVN0zVDMzVLszVHczVP87VAC7VIAxrUkBbTamm1tTpaXa2eVl9roDXUGmmNtSZaU62Z1lxrobXUWmmttTZaW62d1l7roHXUOmmdtS5aV62b1l3rofXUemm9tT5aX62f1l8boA3UBmmDtSHaUG2YNlwboY3URmmjtTHaWG2cNl6boE3UJmmTtSnaVG2aNl2boc3UZmmztTnaXG2eNl9boC3UFmmLtSXaUm2Ztlxboa3UVmmrtTXaWm2dtl7boG3UNmmbtS3aVm2b9p+2Xduh7dR2abu1PdpebZ+2XzugHdQOaYe1I9pR7Zh2XDuhndROaae1M9pZ7Zx2XrugXdQuaZe1K9pV7Zp2Xbuh3dRuabe1O9pd7Z52X3ugPdQeaQnaY+2J9lR7pj3XXmgvtVfaa+2N9lZ7p73XPmgftU/aZ+2L9lX7pn3Xfmg/tV/ab+2P9lf7pyVqSVqcnkxPrqfQU+qp9NR6vJ5GT6un09PrGfSMeiY9s55Fz6pn07PrOfScei49t55Hz6vn0/PrBfSCeiG9sF5EL6oX04vrJfSSeim9tF5GL6uX08vrFfSKeiW9sl5Fr6pX06vrNfSaOqbjOqGTOqXTOqOzOqfzuqCLuqTLuqKruqbruqGbuqXbuqO7uqf7eqCHeqQDHepIj+m19Np6Hb2uXk+vrzfQG+qN9MZ6E72p3kxvrrfQW+qt9NZ6G72t3k5vr3fQO+qd9M56F72r3k3vrvfQe+q99N56H72v3k/vrw/QB+qD9MH6EH2oPkwfro/QR+qj9NH6GH2sPk4fr0/QJ+qT9Mn6FH2qPk2frs/QZ+qz9Nn6HH2uPk+fry/QF+qL9MX6En2pvkxfrq/QV+qr9NX6Gn2tvk5fr2/QN+qb9M36Fn2rvk3/T9+u79B36rv03foefa++T9+vH9AP6of0w/oR/ah+TD+un9BP6qf00/oZ/ax+Tj+vX9Av6pf0y/oV/ap+Tb+u39Bv6rf02/od/a5+T7+vP9Af6o/0BP2x/kR/qj/Tn+sv9Jf6K/21/kZ/q7/T3+sf9I/6J/2z/kX/qn/Tv+s/9J/6L/23/kf/q//TE/UkPc5IZiQ3UhgpjVRGaiPeSGOkNdIZ6Y0MRkYjk5HZyGJkNbIZ2Y0cRk4jl5HbyGPkNfIZ+Y0CRkGjkFHYKGIUNYoZxY0SRkmjlFHaKGOUNcoZ5Y0KRkWjklHZqGJUNaoZ1Y0aRk0DM3CDMEiDMmiDMViDM3hDMERDMmRDMVRDM3TDMEzDMmzDMVzDM3wjMEIjMoABDWTEjFpGbaOOUdeoZ9Q3GhgNjUZGY6OJ0dRoZjQ3WhgtjVZGa6ON0dZoZ7Q3OhgdjU5GZ6OL0dXoZnQ3ehg9jV5Gb6OP0dfoZ/Q3BhgDjUHGYGOIMdQYZgw3RhgjjVHGaGOMMdYYZ4w3JhgTjUnGZGOKMdWYZkw3ZhgzjVnGbGOOMdeYZ8w3FhgLjUXGYmOJsdRYZiw3VhgrjVXGamONsdZYZ6w3NhgbjU3GZmOLsdXYZvxnbDd2GDuNXcZuY4+x19hn7DcOGAeNQ8Zh44hx1DhmHDdOGCeNU8Zp44xx1jhnnDcuGBeNS8Zl44px1bhmXDduGDeNW8Zt445x17hn3DceGA+NR0aC8dh4Yjw1nhnPjRfGS+OV8dp4Y7w13hnvjQ/GR+OT8dn4Ynw1vhnfjR/GT+OX8dv4Y/w1/hmJRpIRZyYzk5spzJRmKjO1GW+mMdOa6cz0ZgYzo5nJzGxmMbOa2czsZg4zp5nLzG3mMfOa+cz8ZgGzoFnILGwWMYuaxcziZgmzpFnKLG2WMcua5czyZgWzolnJrGxWMaua1czqZg2zpomZuEmYpEmZtMmYrM
mZvCmYoimZsqmYqqmZummYpmmZtumYrumZvhmYoRmZwIQmMmNmLbO2Wcesa9Yz65sNzIZmI7Ox2cRsajYzm5stzJZmK7O12cZsa7Yz25sdzI5mJ7Oz2cXsanYzu5s9zJ5mL7O32cfsa/Yz+5sDzIHmIHOwOcQcag4zh5sjzJHmKHO0OcYca44zx5sTzInmJHOyOcWcak4zp5szzJnmLHO2Oceca84z55sLzIXmInOxucRcai4zl5srzJXmKnO1ucZca64z15sbzI3mJnOzucXcam4z/zO3mzvMneYuc7e5x9xr7jP3mwfMg+Yh87B5xDxqHjOPmyfMk+Yp87R5xjxrnjPPmxfMi+Yl87J5xbxqXjOvmzfMm+Yt87Z5x7xr3jPvmw/Mh+YjM8F8bD4xn5rPzOfmC/Ol+cp8bb4x35rvzPfmB/Oj+cn8bH4xv5rfzO/mD/On+cv8bf4x/5r/zEQzyYyzklnJrRRWSiuVldqKt9JYaa10Vnorg5XRymRltrJYWa1sVnYrh5XTymXltvJYea18Vn6rgFXQKmQVtopYRa1iVnGrhFXSKmWVtspYZa1yVnmrglXRqmRVtqpYVa1qVnWrhlXTwizcIizSoizaYizW4izeEizRkizZUizV0izdMizTsizbcizX8izfCqzQiixgQQtZMauWVduqY9W16ln1rQZWQ6uR1dhqYjW1mlnNrRZWS6uV1dpqY7W12lntrQ5WR6uT1dnqYnW1ulndrR5WT6uX1dvqY/W1+ln9rQHWQGuQNdgaYg21hlnDrRHWSGuUNdoaY421xlnjrQnWRGuSNdmaYk21plnTrRnWTGuWNduaY8215lnzrQXWQmuRtdhaYi21llnLrRXWSmuVtdpaY6211lnrrQ3WRmuTtdnaYm21tln/WdutHdZOa5e129pj7bX2WfutA9ZB65B12DpiHbWOWcetE9ZJ65R12jpjnbXOWeetC9ZF65J12bpiXbWuWdetG9ZN65Z127pj3bXuWfetB9ZD65GVYD22nlhPrWfWc+uF9dJ6Zb223lhvrXfWe+uD9dH6ZH22vlhfrW/Wd+uH9dP6Zf22/lh/rX9WopVkxdnJ7OR2CjulncpObcfbaey0djo7vZ3BzmhnsjPbWeysdjY7u53DzmnnsnPbeey8dj47v13ALmgXsgvbReyidjG7uF3CLmmXskvbZeyydjm7vF3BrmhXsivbVeyqdjW7ul3DrmljNm4TNmlTNm0zNmtzNm8LtmhLtmwrtmprtm4btmlbtm07tmt7tm8HdmhHNrChjeyYXcuubdex69r17Pp2A7uh3chubDexm9rN7OZ2C7ul3cpubbex29rt7PZ2B7uj3cnubHexu9rd7O52D7un3cvubfex+9r97P72AHugPcgebA+xh9rD7OH2CHukPcoebY+xx9rj7PH2BHuiPcmebE+xp9rT7On2DHumPcuebc+x59rz7Pn2AnuhvchebC+xl9rL7OX2Cnulvcpeba+x19rr7PX2BnujvcnebG+xt9rb7P/s7fYOe6e9y95t77H32vvs/fYB+6B9yD5sH7GP2sfs4/YJ+6R9yj5tn7HP2ufs8/YF+6J9yb5sX7Gv2tfs6/YN+6Z9y75t37Hv2vfs+/YD+6H9yE6wH9tP7Kf2M/u5/cJ+ab+yX9tv7Lf2O/u9/cH+aH+yP9tf7K/2N/u7/cP+af+yf9t/7L/2PzvRTrLjnGROcieFk9JJ5aR24p00TlonnZPeyeBkdDI5mZ0sTlYnm5PdyeHkdHI5uZ08Tl4nn5PfKeAUdAo5hZ0iTlGnmFPcKeGUdEo5pZ0yTlmnnFPeqeBUdCo5lZ0qTlWnmlPdqeHUdDAHdwiHdCiHdhiHdTiHdwRHdCRHdhRHdTRHdwzHdCzHdhzHdTzHdwIndCIHONBBTsyp5dR26jh1nXpOfaeB09Bp5DR2mjhNnWZOc6eF09Jp5bR22jhtnXZOe6eD09Hp5HR2ujhdnW5Od6eH09Pp5f
R2+jh9nX5Of2eAM9AZ5Ax2hjhDnWHOcGeEM9IZ5Yx2xjhjnXHOeGeCM9GZ5Ex2pjhTnWnOdGeGM9OZ5cx25jhznXnOfGeBs9BZ5Cx2ljhLnWXOcmeFs9JZ5ax21jhrnXXOemeDs9HZ5Gx2tjhbnW3Of852Z4ez09nl7Hb2OHudfc5+54Bz0DnkHHaOOEedY85x54Rz0jnlnHbOOGedc85554Jz0bnkXHauOFeda85154Zz07nl3HbuOHede85954Hz0HnkJDiPnSfOU+eZ89x54bx0XjmvnTfOW+ed89754Hx0PjmfnS/OV+eb89354fx0fjm/nT/OX+efk+gkOXFuMje5m8JN6aZyU7vxbho3rZvOTe9mcDO6mdzMbhY3q5vNze7mcHO6udzcbh43r5vPze8WcAu6hdzCbhG3qFvMLe6WcEu6pdzSbhm3rFvOLe9WcCu6ldzKbhW3qlvNre7WcGu6mIu7hEu6lEu7jMu6nMu7giu6kiu7iqu6mqu7hmu6lmu7juu6nuu7gRu6kQtc6CI35tZya7t13LpuPbe+28Bt6DZyG7tN3KZuM7e528Jt6bZyW7tt3LZuO7e928Ht6HZyO7td3K5uN7e728Pt6fZye7t93L5uP7e/O8Ad6A5yB7tD3KHuMHe4O8Id6Y5yR7tj3LHuOHe8O8Gd6E5yJ7tT3KnuNHe6O8Od6c5yZ7tz3LnuPHe+u8Bd6C5yF7tL3KXuMne5u8Jd6a5yV7tr3LXuOne9u8Hd6G5yN7tb3K3uNvc/d7u7w93p7nJ3u3vcve4+d797wD3oHnIPu0fco+4x97h7wj3pnnJPu2fcs+4597x7wb3oXnIvu1fcq+4197p7w73p3nJvu3fcu+499777wH3oPnIT3MfuE/ep+8x97r5wX7qv3NfuG/et+859735wP7qf3M/uF/er+8397v5wf7q/3N/uH/ev+89NdJPcOC+Zl9xL4aX0UnmpvXgvjZfWS+el9zJ4Gb1MXmYvi5fVy+Zl93J4Ob1cXm4vj5fXy+fl9wp4Bb1CXmGviFfUK+YV90p4Jb1SXmmvjFfWK+eV9yp4Fb1KXmWvilfVq+ZV92p4NT3Mwz3CIz3Koz3GYz3O4z3BEz3Jkz3FUz3N0z3DMz3Lsz3Hcz3P873AC73IAx70kBfzanm1vTpeXa+eV99r4DX0GnmNvSZeU6+Z19xr4bX0WnmtvTZeW6+d197r4HX0OnmdvS5eV6+b193r4fX0enm9vT5eX6+f198b4A30BnmDvSHeUG+YN9wb4Y30RnmjvTHeWG+cN96b4E30JnmTvSneVG+aN92b4c30ZnmzvTneXG+eN99b4C30FnmLvSXeUm+Zt9xb4a30VnmrvTXeWm+dt97b4G30NnmbvS3eVm+b95+33dvh7fR2ebu9Pd5eb5+33zvgHfQOeYe9I95R75h33DvhnfROeae9M95Z75x33rvgXfQueZe9K95V75p33bvh3fRuebe9O95d755333vgPfQeeQneY++J99R75j33XngvvVfea++N99Z75733PngfvU/eZ++L99X75n33fng/vV/eb++P99f75yV6SV6cn8xP7qfwU/qp/NR+vJ/GT+un89P7GfyMfiY/s5/Fz+pn87P7Ofycfi4/t5/Hz+vn8/P7BfyCfiG/sF/EL+oX84v7JfySfim/tF/GL+uX88v7FfyKfiW/sl/Fr+pX86v7NfyaPubjPuGTPuXTPuOzPufzvuCLvuTLvuKrvubrvuGbvuXbvuO7vuf7fuCHfuQDH/rIj/m1/Np+Hb+uX8+v7zfwG/qN/MZ+E7+p38xv7rfwW/qt/NZ+G7+t385v73fwO/qd/M5+F7+r383v7vfwe/q9/N5+H7+v38/v7w/wB/qD/MH+EH+oP8wf7o/wR/qj/NH+GH+sP84f70/wJ/qT/Mn+FH+qP82f7s/wZ/qz/Nn+HH+uP8+f7y/wF/qL/MX+En+pv8xf7q/wV/qr/NX+Gn+tv85f72
/wN/qb/M3+Fn+rv83/z9/u7/B3+rv83f4ef6+/z9/vH/AP+of8w/4R/6h/zD/un/BP+qf80/4Z/6x/zj/vX/Av+pf8y/4V/6p/zb/u3/Bv+rf82/4d/65/z7/vP/Af+o/8BP+x/8R/6j/zn/sv/Jf+K/+1/8Z/67/z3/sf/I/+J/+z/8X/6n/zv/s//J/+L/+3/8f/6//zE/0kPy5IFiQPUgQpg1RB6iA+SBOkDdIF6YMMQcYgU5A5yBJkDbIF2YMcQc4gV5A7yBPkDfIF+YMCQcGgUFA4KBIUDYoFxYMSQcmgVFA6KBOUDcoF5YMKQcWgUlA5qBJUDaoF1YMaQc0AC/CACMiACuiACdiAC/hACMRACuRACdRAC/TACMzACuzACdzAC/wgCMIgCkAAAxTEglpB7aBOUDeoF9QPGgQNg0ZB46BJ0DRoFjQPWgQtg1ZB66BN0DZoF7QPOgQdg05B56BL0DXoFnQPegQ9g15B76BP0DfoF/QPBgQDg0HB4GBIMDQYFgwPRgQjg1HB6GBMMDYYF4wPJgQTg0nB5GBKMDWYFkwPZgQzg1nB7GBOMDeYF8wPFgQLg0XB4mBJsDRYFiwPVgQrg1XB6mBNsDZYF6wPNgQbg03B5mBLsDXYFvwXbA92BDuDXcHuYE+wN9gX7A8OBAeDQ8Hh4EhwNDgWHA9OBCeDU8Hp4ExwNjgXnA8uBBeDS8Hl4EpwNbgWXA9uBDeDW8Ht4E5wN7gX3A8eBA+DR0FC8Dh4EjwNngXPgxfBy+BV8Dp4E7wN3gXvgw/Bx+BT8Dn4EnwNvgXfgx/Bz+BX8Dv4E/wN/gWJQVIQFyYLk4cpwpRhqjB1GB+mCdOG6cL0YYYwY5gpzBxmCbOG2cLsYY4wZ5grzB3mCfOG+cL8YYGwYFgoLBwWCYuGxcLiYYmwZFgqLB2WCcuG5cLyYYWwYlgprBxWCauG1cLqYY2wZoiFeEiEZEiFdMiEbMiFfCiEYiiFcqiEaqiFemiEZmiFduiEbuiFfhiEYRiFIIQhCmNhrbB2WCesG9YL64cNwoZho7Bx2CRsGjYLm4ctwpZhq7B12CZsG7YL24cdwo5hp7Bz2CXsGnYLu4c9wp5hr7B32CfsG/YL+4cDwoHhoHBwOCQcGg4Lh4cjwpHhqHB0OCYcG44Lx4cTwonhpHByOCWcGk4Lp4czwpnhrHB2OCecG84L54cLwoXhonBxuCRcGi4Ll4crwpXhqnB1uCZcG64L14cbwo3hpnBzuCXcGm4L/wu3hzvCneGucHe4J9wb7gv3hwfCg+Gh8HB4JDwaHguPhyfCk+Gp8HR4JjwbngvPhxfCi+Gl8HJ4JbwaXguvhzfCm+Gt8HZ4J7wb3gvvhw/Ch+GjMCF8HD4Jn4bPwufhi/Bl+Cp8Hb4J34bvwvfhh/Bj+Cn8HH4Jv4bfwu/hj/Bn+Cv8Hf4J/4b/wsQwKYyLkkXJoxRRyihVlDqKj9JEaaN0UfooQ5QxyhRljrJEWaNsUfYoR5QzyhXljvJEeaN8Uf6oQFQwKhQVjopERaNiUfGoRFQyKhWVjspEZaNyUfmoQlQxqhRVjqpEVaNqUfWoRlQzwiI8IiIyoiI6YiI24iI+EiIxkiI5UiI10iI9MiIzsiI7ciI38iI/CqIwiiIQwQhFsahWVDuqE9WN6kX1owZRw6hR1DhqEjWNmkXNoxZRy6hV1DpqE7WN2kXtow5Rx6hT1DnqEnWNukXdox5Rz6hX1DvqE/WN+kX9owHRwGhQNDgaEg2NhkXDoxHRyGhUNDoaE42NxkXjownRxGhSNDmaEk2NpkXToxnRzGhWNDuaE82N5kXzowXRwmhRtDhaEi2NlkXLoxXRymhVtDpaE62N1kXrow3RxmhTtDnaEm2NtkX/RdujHdHOaFe0O9oT7Y32RfujA9HB6FB0ODoSHY2ORcejE9HJ6FR0OjoTnY3OReejC9HF6FJ0OboSXY2uRdejG9HN6FZ0O7oT3Y3uRfejB9HD6FGUED
2OnkRPo2fR8+hF9DJ6Fb2O3kRvo3fR++hD9DH6FH2OvkRfo2/R9+hH9DP6Ff2O/kR/o39RYpQUxYFkIDlIAVKCVCA1iAdpQFqQDqQHGUBGkAlkBllAVpANZAc5QE6QC+QGeUBekA/kBwVAQVAIFAZFQFFQDBQHJUBJUAqUBmVAWVAOlAcVQEVQCVQGVUBVUA1UBzVATYABHBCABBSgAQNYwAEeCEAEEpCBAlSgAR0YwAQWsIEDXOABHwQgBBEAAAIEYqAWqA3qgLqgHqgPGoCGoBFoDJqApqAZaA5agJagFWgN2oC2oB1oDzqAjqAT6Ay6gK6gG+gOeoCeoBfoDfqAvqAf6A8GgIFgEBgMhoChYBgYDkaAkWAUGA3GgLFgHBgPJoCJYBKYDKaAqWAamA5mgJlgFpgN5oC5YB6YDxaAhWARWAyWgKVgGVgOVoCVYBVYDdaAtWAdWA82gI1gE9gMtoCtYBv4D2wHO8BOsAvsBnvAXrAP7AcHwEFwCBwGR8BRcAwcByfASXAKnAZnwFlwDpwHF8BFcAlcBlfAVXANXAc3wE1wC9wGd8BdcA/cBw/AQ/AIJIDH4Al4Cp6B5+AFeAlegdfgDXgL3oH34AP4CD6Bz+AL+Aq+ge/gB/gJfoHf4A/4C/6BRJAE4mAymBymgClhKpgaxsM0MC1MB9PDDDAjzAQzwywwK8wGs8McMCfMBXPDPDAvzAfzwwKwICwEC8MisCgsBovDErAkLAVLwzKwLCwHy8MKsCKsBCvDKrAqrAarwxqwJsQgDglIQgrSkIEs5CAPBShCCcpQgSrUoA4NaEIL2tCBLvSgDwMYwggCCCGCMVgL1oZ1YF1YD9aHDWBD2Ag2hk1gU9gMNoctYEvYCraGbWBb2A62hx1gR9gJdoZdYFfYDXaHPWBP2Av2hn1gX9gP9ocD4EA4CA6GQ+BQOAwOhyPgSDgKjoZj4Fg4Do6HE+BEOAlOhlPgVDgNTocz4Ew4C86Gc+BcOA/OhwvgQrgILoZL4FK4DC6HK+BKuAquhmvgWrgOrocb4Ea4CW6GW+BWuA3+B7fDHXAn3AV3wz1wL9wH98MD8CA8BA/DI/AoPAaPwxPwJDwFT8Mz8Cw8B8/DC/AivAQvwyvwKrwGr8Mb8Ca8BW/DO/AuvAfvwwfwIXwEE+Bj+AQ+hc/gc/gCvoSv4Gv4Br6F7+B7+AF+hJ/gZ/gFfoXf4Hf4A/6Ev+Bv+Af+hf9gIkyCcSgZSo5SoJQoFUqN4lEalBalQ+lRBpQRZUKZURaUFWVD2VEOlBPlQrlRHpQX5UP5UQFUEBVChVERVBQVQ8VRCVQSlUKlURlUFpVD5VEFVBFVQpVRFVQVVUPVUQ1UE2EIRwQiEYVoxCAWcYhHAhKRhGSkIBVpSEcGMpGFbOQgF3nIRwEKUYQAggihGKqFaqM6qC6qh+qjBqghaoQaoyaoKWqGmqMWqCVqhVqjNqgtaofaow6oI+qEOqMuqCvqhrqjHqgn6oV6oz6oL+qH+qMBaCAahAajIWgoGoaGoxFoJBqFRqMxaCwah8ajCWgimoQmoyloKpqGpqMZaCaahWajOWgumofmowVoIVqEFqMlaClahpajFWglWoVWozVoLVqH1qMNaCPahDajLWgr2ob+Q9vRDrQT7UK70R60F+1D+9EBdBAdQofREXQUHUPH0Ql0Ep1Cp9EZdBadQ+fRBXQRXUKX0RV0FV1D19ENdBPdQrfRHXQX3UP30QP0ED1CCegxeoKeomfoOXqBXqJX6DV6g96id+g9+oA+ok/oM/qCvqJv6Dv6gX6iX+g3+oP+on8oESWhuFiyWPJYiljKWKpY6lh8LE0sbSxdLH0sQyxjLFMscyxLLGssWyx7LEcsZyxXLHcsTyxvLF8sf6xArGCsUKxwrEisaKxYrHisRKxkrFSsdKxMrGysXKx8rEKsYqxSrHKsSqxqrFqseqxGrGYMi+ExIkbGqBgdY2JsjIvxMSEmxqSYHFNiakyL6TEjZs
asmB1zYm7Mi/mxIBbGohiIwRiKxWK1Yv9zAA9QXnQLAMA3bjY32/Zm2zVz79j2zP3n2mzbX7Zt27ZtW++c9wMYxAiMxCiMxhiMxTiMxwRMxCRMxhRMxTRMxwzMxCzMxhzMxTzMxwIsxCIMYTGsB9YT64X1xvpgfbF+WH9sAJaEDcQGYYOxIdhQbBg2HBuBjcRGYaOxMdhYbBw2HpuATcQmYZOxKdhUbBo2HZuBzcRmYbOxOdhcbB42H/sPW4AtxBZhi7El2FJsGbYcW4GtxFZhq7E12FpsHbYe24BtxDZhm7Et2FZsG7Yd24HtxHZhu7E92F5sH7YfO4AdxA5hh7Ej2FHsGHYcO4GdxE5hp7Ez2FnsHHYeu4BdxC5hl7Er2FXsGnYdu4HdxG5ht7E72F3sHnYfe4A9xB5hj7En2FPsGfYce4G9xF5hr7E32FvsHfYe+4B9xD5hn7Ev2FfsG/Yd+4H9xH5hv7E/2F/sHxaHJ8OT4ynwlHgqPDUej6fB0+Lp8PR4BjwjngnPjGfBs+LZ8Ox4DjwnngvPjefB8+IJeD48P14AL4gXwgvjRfCieDG8OF4CL4mXwkvjZfCyeDm8PF4Br4hXwivjVfCqeDW8Ol4Dr4nXwmvjdfBEvC5eD6+PN8Ab4o3wxngTvCneDG+Ot8Bb4q3w1ngbvC3eDm+Pd8A74p3wzngXvCveDe+OYziOAxziBE7iFE7jDM7iHM7jAi7iEi7jCq7iGq7jBm7iFm7jDu7iHu7jAR7iEY7wGN4D74n3wnvjffC+eD+8Pz4AT8IH4oPwwfgQfCg+DB+Oj8BH4qPw0fgYfCw+Dh+PT8An4pPwyfgUfCo+DZ+Oz8Bn4rPw2fgcfC4+D5+P/4cvwBfii/DF+BJ8Kb4MX46vwFfiq/DV+Bp8Lb4OX49vwDfim/DN+BZ8K74N347vwHfiu/Dd+B58L74P348fwA/ih/DD+BH8KH4MP46fwE/ip/DT+Bn8LH4OP49fwC/il/DL+BX8Kn4Nv47fwG/it/Db+B38Ln4Pv48/wB/ij/DH+BP8Kf4Mf46/wF/ir/DX+Bv8Lf4Of49/wD/in/DP+Bf8K/4N/47/wH/iv/Df+B/8L/4PjwPJQHKQAqQEqUBqEA/SgLQgHUgPMoCMIBPIDLKArCAbyA5ygJwgF8gN8oC8IAHkA/lBAVAQFAKFQRFQFBQDxUEJUBKUAqVBGVAWlAPlQQVQEVQClUEVUBVUA9VBDVAT1AK1QR2QCOqCeqA+aAAagkagMWgCmoJmoDloAVqCVqA1aAPagnagPegAOoJOoDPoArqCbqA7wAAOAICAACSgAA0YwAIO8EAAIpCADBSgAg3owAAmsIANHOACD/ggACGIAAIx0AP0BL1Ab9AH9AX9QH8wACSBgWAQGAyGgKFgGBgORoCRYBQYDcaAsWAcGA8mgIlgEpgMpoCpYBqYDmaAmWAWmA3mgLlgHpgP/gMLwEKwCCwGS8BSsAwsByvASrAKrAZrwFqwDqwHG8BGsAlsBlvAVrANbAc7wE6wC+wGe8BesA/sBwfAQXAIHAZHwFFwDBwHJ8BJcAqcBmfAWXAOnAcXwEVwCVwGV8BVcA1cBzfATXAL3AZ3wF1wD9wHD8BD8Ag8Bk/AU/AMPAcvwEvwCrwGb8Bb8A68Bx/AR/AJfAZfwFfwDXwHP8BP8Av8Bn/AX/APxMFkMDlMAVPCVDA1jIdpYFqYDqaHGWBGmAlmhllgVpgNZoc5YE6YC+aGeWBemADzwfywACwIC8HCsAgsCovB4rAELAlLwdKwDCwLy8HysAKsCCvByrAKrAqrweqwBqwJa8HasA5MhHVhPVgfNoANYSPYGDaBTWEz2By2gC1hK9gatoFtYTvYHnaAHWEn2Bl2gV1hN9gdYhCHAEJIQBJSkIYMZCEHeShAEUpQhgpUoQZ1aEATWtCGDnShB30YwBBGEMEY7AF7wl6wN+wD+8J+sD8cAJPgQDgIDoZD4FA4DA6HI+BIOAqOhm
PgWDgOjocT4EQ4CU6GU+BUOA1OhzPgTDgLzoZz4Fw4D86H/8EFcCFcBBfDJXApXAaXwxVwJVwFV8M1cC1cB9fDDXAj3AQ3wy1wK9wGt8MdcCfcBXfDPXAv3Af3wwPwIDwED8Mj8Cg8Bo/DE/AkPAVPwzPwLDwHz8ML8CK8BC/DK/AqvAavwxvwJrwFb8M78C68B+/DB/AhfAQfwyfwKXwGn8MX8CV8BV/DN/AtfAffww/wI/wEP8Mv8Cv8Br/DH/An/AV/wz/wL/wH44hkRHIiBZGSSEWkJuKJNERaIh2RnshAZCQyEZmJLERWIhuRnchB5CRyEbmJPEReIoHIR+QnChAFiUJEYaIIUZQoRhQnShAliVJEaaIMUZYoR5QnKhAViUpEZaIKUZWoRlQnahA1iVpEbaIOkUjUJeoR9YkGREOiEdGYaEI0JZoRzYkWREuiFdGaaEO0JdoR7YkOREeiE9GZ6EJ0JboR3QmMwAlAQIIgSIIiaIIhWIIjeEIgREIiZEIhVEIjdMIgTMIibMIhXMIjfCIgQiIiEBEjehA9iV5Eb6IP0ZfoR/QnBhBJxEBiEDGYGEIMJYYRw4kRxEhiFDGaGEOMJcYR44kJxERiEjGZmEJMJaYR04kZxExiFjGbmEPMJeYR84n/iAXEQmIRsZhYQiwllhHLiRXESmIVsZpYQ6wl1hHriQ3ERmITsZnYQmwlthHbiR3ETmIXsZvYQ+wl9hH7iQPEQeIQcZg4QhwljhHHiRPESeIUcZo4Q5wlzhHniQvEReIScZm4QlwlrhHXiRvETeIWcZu4Q9wl7hH3iQfEQ+IR8Zh4QjwlnhHPiRfES+IV8Zp4Q7wl3hHviQ/ER+IT8Zn4QnwlvhHfiR/ET+IX8Zv4Q/wl/hFxZDIyOZmCTEmmIlOT8WQaMi2ZjkxPZiAzkpnIzGQWMiuZjcxO5iBzkrnI3GQeMi+ZQOYj85MFyIJkIbIwWYQsShYji5MlyJJkKbI0WYYsS5Yjy5MVyIpkJbIyWYWsSlYjq5M1yJpkLbI2WYdMJOuS9cj6ZAOyIdmIbEw2IZuSzcjmZAuyJdmKbE22IduS7cj2ZAeyI9mJ7Ex2IbuS3cjuJEbiJCAhSZAkSZE0yZAsyZE8KZAiKZEyqZAqqZE6aZAmaZE26ZAu6ZE+GZAhGZGIjJE9yJ5kL7I32YfsS/Yj+5MDyCRyIDmIHEwOIYeSw8jh5AhyJDmKHE2OIceS48jx5ARyIjmJnExOIaeS08jp5AxyJjmLnE3OIeeS88j55H/kAnIhuYhcTC4hl5LLyOXkCnIluYpcTa4h15LryPXkBnIjuYncTG4ht5LbyO3kDnInuYvcTe4h95L7yP3kAfIgeYg8TB4hj5LHyOPkCfIkeYo8TZ4hz5LnyPPkBfIieYm8TF4hr5LXyOvkDfImeYu8Td4h75L3yPvkA/Ih+Yh8TD4hn5LPyOfkC/Il+Yp8Tb4h35LvyPfkB/Ij+Yn8TH4hv5LfyO/kD/In+Yv8Tf4h/5L/yDgqGZWcSkGlpFJRqal4Kg2VlkpHpacyUBmpTFRmKguVlcpGZadyUDmpXFRuKg+Vl0qg8lH5qQJUQaoQVZgqQhWlilHFqRJUSaoUVZoqQ5WlylHlqQpURaoSVZmqQlWlqlHVqRpUTaoWVZuqQyVSdal6VH2qAdWQakQ1pppQTalmVHOqBdWSakW1ptpQbal2VHuqA9WR6kR1prpQXaluVHcKo3AKUJAiKJKiKJpiKJbiKJ4SKJGSKJlSKJXSKJ0yKJOyKJtyKJfyKJ8KqJCKKETFqB5UT6oX1ZvqQ/Wl+lH9qQFUEjWQGkQNpoZQQ6lh1HBqBDWSGkWNpsZQY6lx1HhqAjWRmkRNpqZQU6lp1HRqBjWTmkXNpuZQc6l51HzqP2oBtZBaRC2mllBLqWXUcmoFtZJaRa2m1lBrqXXUemoDtZHaRG2mtlBbqW3UdmoHtZPaRe2m9lB7qX3UfuoAdZA6RB2mjlBHqWPUce
oEdZI6RZ2mzlBnqXPUeeoCdZG6RF2mrlBXqWvUdeoGdZO6Rd2m7lB3qXvUfeoB9ZB6RD2mnlBPqWfUc+oF9ZJ6Rb2m3lBvqXfUe+oD9ZH6RH2mvlBfqW/Ud+oH9ZP6Rf2m/lB/qX9UHJ2MTk6noFPSqejUdDydhk5Lp6PT0xnojHQmOjOdhc5KZ6Oz0znonHQuOjedh85LJ9D56Px0AbogXYguTBehi9LF6OJ0CbokXYouTZehy9Ll6PJ0BboiXYmuTFehq9LV6Op0DbomXYuuTdehE+m6dD26Pt2Abkg3ohvTTeimdDO6Od2Cbkm3olvTbei2dDu6Pd2B7kh3ojvTXeiudDe6O43ROA1oSBM0SVM0TTM0S3M0Twu0SEu0TCu0Smu0Thu0SVu0TTu0S3u0Twd0SEc0omN0D7on3YvuTfeh+9L96P70ADqJHkgPogfTQ+ih9DB6OD2CHkmPokfTY+ix9Dh6PD2BnkhPoifTU+ip9DR6Oj2DnknPomfTc+i59Dx6Pv0fvYBeSC+iF9NL6KX0Mno5vYJeSa+iV9Nr6LX0Ono9vYHeSG+iN9Nb6K30Nno7vYPeSe+id9N76L30Pno/fYA+SB+iD9NH6KP0Mfo4fYI+SZ+iT9Nn6LP0Ofo8fYG+SF+iL9NX6Kv0Nfo6fYO+Sd+ib9N36Lv0Pfo+/YB+SD+iH9NP6Kf0M/o5/YJ+Sb+iX9Nv6Lf0O/o9/YH+SH+iP9Nf6K/0N/o7/YP+Sf+if9N/6L/0PzqOScYkZ1IwKZlUTGomnknDpGXSMemZDExGJhOTmcnCZGWyMdmZHExOJheTm8nD5GUSmHxMfqYAU5ApxBRmijBFmWJMcaYEU5IpxZRmyjBlmXJMeaYCU5GpxFRmqjBVmWpMdaYGU5OpxdRm6jCJTF2mHlOfacA0ZBoxjZkmTFOmGdOcacG0ZFoxrZk2TFumHdOe6cB0ZDoxnZkuTFemG9OdwRicAQxkCIZkKIZmGIZlOIZnBEZkJEZmFEZlNEZnDMZkLMZmHMZlPMZnAiZkIgYxMaYH05PpxfRm+jB9mX5Mf2YAk8QMZAYxg5khzFBmGDOcGcGMZEYxo5kxzFhmHDOemcBMZCYxk5kpzFRmGjOdmcHMZGYxs5k5zFxmHjOf+Y9ZwCxkFjGLmSXMUmYZs5xZwaxkVjGrmTXMWmYds57ZwGxkNjGbmS3MVmYbs53ZwexkdjG7mT3MXmYfs585wBxkDjGHmSPMUeYYc5w5wZxkTjGnmTPMWeYcc565wFxkLjGXmSvMVeYac525wdxkbjG3mTvMXeYec595wDxkHjGPmSfMU+YZ85x5wbxkXjGvmTfMW+Yd8575wHxkPjGfmS/MV+Yb8535wfxkfjG/mT/MX+YfE8cmY5OzKdiUbCo2NRvPpmHTsunY9GwGNiObic3MZmGzstnY7GwONiebi83N5mHzsglsPjY/W4AtyBZiC7NF2KJsMbY4W4ItyZZiS7Nl2LJsObY8W4GtyFZiK7NV2KpsNbY6W4OtydZia7N12ES2LluPrc82YBuyjdjGbBO2KduMbc62YFuyrdjWbBu2LduObc92YDuyndjObBe2K9uN7c5iLM4CFrIES7IUS7MMy7Icy7MCK7ISK7MKq7Iaq7MGa7JWos06rMt6rM8GbMhGLGJjbA+2J9uL7c32Yfuy/dj+7AA2iR3IDmIHs0PYoewwdjg7gh3JjmJHs2PYsew4djw7gZ3ITmIns1PYqew0djo7g53JzmJns3PYuew8dj77H7uAXcguYhezS9il7DJ2ObuCXcmuYleza9i17Dp2PbuB3chuYjezW9it7DZ2O7uD3cnuYneze9i97D52P3uAPcgeYg+zR9ij7DH2OHuCPcmeYk+zZ9iz7Dn2PHuBvcheYi+zV9ir7DX2OnuDvcneYm+zd9i77D32PvuAfcg+Yh+zT9in7DP2OfuCfcm+Yl+zb9i37Dv2PfuB/ch+Yj+zX9iv7Df2O/uD/cn+Yn+zf9
i/7D82jkvGJedScCm5VFxqLp5Lw6Xl0nHpuQxcRi4Tl5nLwmXlsnHZuRxcTi4Xl5vLw+XlErh8XH6uAFeQK8QV5opwRbliXHGuBFeSK8WV5spwZblyXHmuAleRq8RV5qpwVblqXHWuBleTq8XV5upwiVxdrh5Xn2vANeQacY25JlxTrhnXnGvBteRaca25Nlxbrh3XnuvAdeQ6cZ25LlxXrhvXncM4nAMc5AiO5CiO5hiO5TiO5wRO5CRO5hRO5TRO5wzO5CzO5hzO5TzO5wIu5CIOcTGuB9eT68X15vpwfbl+XH9uAJfEDeQGcYO5IdxQbhg3nBvBjeRGcaO5MdxYbhw3npvATeQmcZO5KdxUbho3nZvBzeRmcbO5Odxcbh43n/uPW8At5BZxi7kl3FJuGbecW8Gt5FZxq7k13FpuHbee28Bt5DZxm7kt3FZuG7ed28Ht5HZxu7k93F5uH7efO8Ad5A5xh7kj3FHuGHecO8Gd5E5xp7kz3FnuHHeeu8Bd5C5xl7kr3FXuGnedu8Hd5G5xt7k73F3uHnefe8A95B5xj7kn3FPuGfece8G95F5xr7k33FvuHfee+8B95D5xn7kv3FfuG/ed+8H95H5xv7k/3F/uHxfHJ+OT8yn4lHwqPjUfz6fh0/Lp+PR8Bj4jn4nPzGfhs/LZ+Ox8Dj4nn4vPzefh8/IJfD4+P1+AL8gX4gvzRfiifDG+OF+CL8mX4kvzZfiyfDm+PF+Br8hX4ivzVfiqfDW+Ol+Dr8nX4mvzdfhEvi5fj6/PN+Ab8o34xnwTvinfjG/Ot+Bb8q341nwbvi3fjm/Pd+A78p34znwXvivfje/OYzzOAx7yBE/yFE/zDM/yHM/zAi/yEi/zCq/yGq/zBm/yFm/zDu/yHu/zAR/yEY/4GN+D78n34nvzffi+fD++Pz+AT+IH8oP4wfwQfig/jB/Oj+BH8qP40fwYfiw/jh/PT+An8pP4yfwUfio/jZ/Oz+Bn8rP42fwcfi4/j5/P/8cv4Bfyi/jF/BJ+Kb+MX86v4Ffyq/jV/Bp+Lb+OX89v4Dfym/jN/BZ+K7+N387v4Hfyu/jd/B5+L7+P388f4A/yh/jD/BH+KH+MP86f4E/yp/jT/Bn+LH+OP89f4C/yl/jL/BX+Kn+Nv87f4G/yt/jb/B3+Ln+Pv88/4B/yj/jH/BP+Kf+Mf86/4F/yr/jX/Bv+Lf+Of89/4D/yn/jP/Bf+K/+N/87/4H/yv/jf/B/+L/+PjxOSCcmFFEJKIZWQWogX0ghphXRCeiGDkFHIJGQWsghZhWxCdiGHkFPIJeQW8gh5hQQhn5BfKCAUFAoJhYUiQlGhmFBcKCGUFEoJpYUyQlmhnFBeqCBUFCoJlYUqQlWhmlBdqCHUFGoJtYU6QqJQV6gn1BcaCA2FRkJjoYnQVGgmNBdaCC2FVkJroY3QVmgntBc6CB2FTkJnoYvQVegmdBcwAReAAAVCIAVKoAVGYAVO4AVBEAVJkAVFUAVN0AVDMAVLsAVHcAVP8IVACIVIQEJM6CH0FHoJvYU+Ql+hn9BfGCAkCQOFQcJgYYgwVBgmDBdGCCOFUcJoYYwwVhgnjBcmCBOFScJkYYowVZgmTBdmCDOFWcJsYY4wV5gnzBf+ExYIC4VFwmJhibBUWCYsF1YIK4VVwmphjbBWWCesFzYIG4VNwmZhi7BV2CZsF3YIO4Vdwm5hj7BX2CfsFw4IB4VDwmHhiHBUOCYcF04IJ4VTwmnhjHBWOCecFy4IF4VLwmXhinBVuCZcF24IN4Vbwm3hjnBXuCfcFx4ID4VHwmPhifBUeCY8F14IL4VXwmvhjfBWeCe8Fz4IH4VPwmfhi/BV+CZ8F34IP4Vfwm/hj/BX+CfEicnE5GIKMaWYSkwtxotpxLRiOjG9mEHMKGYSM4tZxKxiNjG7mEPMKeYSc4t5xLxigphPzC8WEAuKhcTCYhGxqFhMLC6WEEuKpcTSYhmxrFhOLC9WECuKlcTKYhWxql
hNrC7WEGuKtcTaYh0xUawr1hPriw3EhmIjsbHYRGwqNhObiy3ElmIrsbXYRmwrthPbix3EjmInsbPYRewqdhO7i5iIi0CEIiGSIiXSIiOyIifyoiCKoiTKoiKqoibqoiGaoiXaoiO6oif6YiCGYiQiMSb2EHuKvcTeYh+xr9hP7C8OEJPEgeIgcbA4RBwqDhOHiyPEkeIocbQ4RhwrjhPHixPEieIkcbI4RZwqThOnizPEmeIscbY4R5wrzhPni/+JC8SF4iJxsbhEXCouE5eLK8SV4ipxtbhGXCuuE9eLG8SN4iZxs7hF3CpuE7eLO8Sd4i5xt7hH3CvuE/eLB8SD4iHxsHhEPCoeE4+LJ8ST4inxtHhGPCueE8+LF8SL4iXxsnhFvCpeE6+LN8Sb4i3xtnhHvCveE++LD8SH4iPxsfhEfCo+E5+LL8SX4ivxtfhGfCu+E9+LH8SP4ifxs/hF/Cp+E7+LP8Sf4i/xt/hH/Cv+E+OkZFJyKYWUUkolpZbipTRSWimdlF7KIGWUMkmZpSxSVimblF3KIeWUckm5pTxSXilByifllwpIBaVCUmGpiFRUKiYVl0pIJaVSUmmpjFRWKieVlypIFaVKUmWpilRVqiZVl2pINaVaUm2pjpQo1ZXqSfWlBlJDqZHUWGoiNZWaSc2lFlJLqZXUWmojtZXaSe2lDlJHqZPUWeoidZW6Sd0lTMIlIEGJkEiJkmiJkViJk3hJkERJkmRJkVRJk3TJkEzJkmzJkVzJk3wpkEIpkpAUk3pIPaVeUm+pj9RX6if1lwZISdJAaZA0WBoiDZWGScOlEdJIaZQ0WhojjZXGSeOlCdJEaZI0WZoiTZWmSdOlGdJMaZY0W5ojzZXmSfOl/6QF0kJpkbRYWiItlZZJy6UV0kpplbRaWiOtldZJ66UN0kZpk7RZ2iJtlbZJ26Ud0k5pl7Rb2iPtlfZJ+6UD0kHpkHRYOiIdlY5Jx6UT0knplHRaOiOdlc5J56UL0kXpknRZuiJdla5J16Ub0k3plnRbuiPdle5J96UH0kPpkfRYeiI9lZ5Jz6UX0kvplfRaeiO9ld5J76UP0kfpk/RZ+iJ9lb5J36Uf0k/pl/Rb+iP9lf5JcXIyObmcQk4pp5JTy/FyGjmtnE5OL2eQM8qZ5MxyFjmrnE3OLueQc8q55NxyHjmvnCDnk/PLBeSCciG5sFxELioXk4vLJeSScim5tFxGLiuXk8vLFeSKciW5slxFripXk6vLNeSaci25tlxHTpTryvXk+nIDuaHcSG4sN5Gbys3k5nILuaXcSm4tt5Hbyu3k9nIHuaPcSe4sd5G7yt3k7jIm4zKQoUzIpEzJtMzIrMzJvCzIoizJsqzIqqzJumzIpmzJtuzIruzJvhzIoRzJSI7JPeSeci+5t9xH7iv3k/vLA+QkeaA8SB4sD5GHysPk4fIIeaQ8Sh4tj5HHyuPk8fIEeaI8SZ4sT5GnytPk6fIMeaY8S54tz5HnyvPk+fJ/8gJ5obxIXiwvkZfKy+Tl8gp5pbxKXi2vkdfK6+T18gZ5o7xJ3ixvkbfK2+Tt8g55p7xL3i3vkffK++T98gH5oHxIPiwfkY/Kx+Tj8gn5pHxKPi2fkc/K5+Tz8gX5onxJvixfka/K1+Tr8g35pnxLvi3fke/K9+T78gP5ofxIfiw/kZ/Kz+Tn8gv5pfxKfi2/kd/K7+T38gf5o/xJ/ix/kb/K3+Tv8g/5p/xL/i3/kf/K/+Q4JZmSXEmhpFRSKamVeCWNklZJp6RXMigZlUxKZiWLklXJpmRXcig5lVxKbiWPkldJUPIp+ZUCSkGlkFJYKaIUVYopxZUSSkmllFJaKaOUVcop5ZUKSkWlklJZqaJUVaop1ZUaSk2lllJbqaMkKnWVekp9pYHSUGmkNFaaKE2VZkpzpYXSUmmltFbaKG2Vdkp7pYPSUemkdFa6KF2Vbkp3BVNwBShQIRRSoRRaYRRW4RReERRRkRRZURRV0RRdMRRTsR
RbcRRX8RRfCZRQiRSkxJQeSk+ll9Jb6aP0Vfop/ZUBSpIyUBmkDFaGKEOVYcpwZYQyUhmljFbGKGOVccp4ZYIyUZmkTFamKFOVacp0ZYYyU5mlzFbmKHOVecp85T9lgbJQWaQsVpYoS5VlynJlhbJSWaWsVtYoa5V1ynplg7JR2aRsVrYoW5VtynZlh7JT2aXsVvYoe5V9yn7lgHJQOaQcVo4oR5VjynHlhHJSOaWcVs4oZ5VzynnlgnJRuaRcVq4oV5VrynXlhnJTuaXcVu4od5V7yn3lgfJQeaQ8Vp4oT5VnynPlhfJSeaW8Vt4ob5V3ynvlg/JR+aR8Vr4oX5Vvynflh/JT+aX8Vv4of5V/SpyaTE2uplBTqqnU1Gq8mkZNq6ZT06sZ1IxqJjWzmkXNqmZTs6s51JxqLjW3mkfNqyao+dT8agG1oFpILawWUYuqxdTiagm1pFpKLa2WUcuq5dTyagW1olpJraxWUauq1dTqag21plpLra3WURPVumo9tb7aQG2oNlIbq03UpmoztbnaQm2ptlJbq23Utmo7tb3aQe2odlI7q13Urmo3tbuKqbgKVKgSKqlSKq0yKqtyKq8KqqhKqqwqqqpqqq4aqqlaqq06qqt6qq8GaqhGKlJjag+1p9pL7a32Ufuq/dT+6gA1SR2oDlIHq0PUoeowdbg6Qh2pjlJHq2PUseo4dbw6QZ2oTlInq1PUqeo0dbo6Q52pzlJnq3PUueo8db76n7pAXaguUherS9Sl6jJ1ubpCXamuUlera9S16jp1vbpB3ahuStisblG3qtvU7eoOdae6S92t7lH3qvvU/eoB9aB6SD2sHlGPqsfU4+oJ9aR6Sj2tnlHPqufU8+oF9aJ6Sb2sXlGvqtfU6+oN9aZ6S72t3lHvqvfU++oD9aH6SH2sPlGfqs/U5+oL9aX6Sn2tvlHfqu/U9+oH9aP6Sf2sflG/qt/U7+oP9af6S/2t/lH/qv/UOC2ZllxLoaXUUmmptXgtjZZWS6el1zJoGbVMWmYti5ZVy6Zl13JoObVcWm4tj5ZXS9Dyafm1AlpBrZBWWCuiFdWKacW1ElpJrZRWWiujldXKaeW1ClpFrZJWWauiVdWqadW1GlpNrZZWW6ujJWp1tXpafa2B1lBrpDXWmmhNtWZac62F1lJrpbXW2mhttXZae62D1lHrpHXWumhdtW5adw3TcA1oUCM0UqM0WmM0VuM0XhM0UZM0WVM0VdM0XTM0U7M0W3M0V/M0Xwu0UIs0pMW0HlpPrZfWW+uj9dX6af21AVqSNlAbpA3WhmhDtWHacG2ENlIbpY3WxmhjtXHaeG2CNlGbpE3WpmhTtWnadG2GNlObpc3W5mhztXnafO0/bYG2MH6Rtlhboi3VlmnLtRXaSm2Vtlpbo63V1mnrtQ3aRm2Ttlnbom3VtmnbtR3aTm2Xtlvbo+3V9mn7tQPaQe2Qdlg7oh3VjmnHtRPaSe2Udlo7o53VzmnntQvaRe2Sdlm7ol3VrmnXtRvaTe2Wdlu7o93V7mn3tQfaQ+2R9lh7oj3VnmnPtRfaS+2V9lp7o73V3mnvtQ/aR+2T9ln7on3VvmnftR/aT+2X9lv7o/3V/mlxejI9uZ5CT6mn0lPr8XoaPa2eTk+vZ9Az6pn0zHoWPaueTc+u59Bz6rn03HoePa+eoOfT8+sF9IJ6Ib2wXkQvqhfTi+sl9JJ6Kb20XkYvq5fTy+sV9Ip6Jb2yXkWvqlfTq+s19Jp6Lb22XkdP1Ovq9fT6egO9od5Ib6w30ZvqzfTmegu9pd5Kb6230dvq7fT2ege9o95J76x30bvq3fTuOqbjOtChTuikTum0zuiszum8LuiiLumyruiqrum6buimbum27uiu7um+HuihHulIj+k99J56L7233kfvq/fT++sD9CR9oD5IH6wP0Yfqw/Th+gh9pD5KH62P0cfq4/Tx+gR9oj5Jn6xP0afq0/Tp+gx9pj5Ln63P0efq8/T5+n/6An2hvk
hfrC/Rl+rL9OX6Cn2lvkpfra/R1+rr9PX6Bn2jvknfrG/Rt+rb9O36Dn2nvkvfre/R9+r79P36Af2gfkg/rB/Rj+rH9OP6Cf2kfko/rZ/Rz+rn9PP6Bf2ifkm/rF/Rr+rX9Ov6Df2mfku/rd/R7+r39Pv6A/2h/kh/rD/Rn+rP9Of6C/2l/kp/rb/R3+rv9Pf6B/2j/kn/rH/Rv+rf9O/6D/2n/kv/rf/R/+r/9DgjmZHcSGGkNFIZqY14I42R1khnpDcyGBmNTEZmI4uR1chmZDdyGDmNXEZuI4+R10gw8hn5jQJGQaOQUdgoYhQ1ihnFjRJGSaOUUdooY5Q1yhnljQpGRaOSUdmoYlQ1qhnVjRpGTaOWUduoYyQadY16Rn2jgdHQaGQ0NpoYTY1mRnOjhdHSaGW0NtoYbY12Rnujg9HR6GR0NroYXY1uRncDM3ADGNAgDNKgDNpgDNbgDN4QDNGQDNlQDNXQDN0wDNOwDNtwDNfwDN8IjNCIDGTEjB5GT6OX0dvoY/Q1+hn9jQFGkjHQGGQMNoYYQ41hxnBjhDHSGGWMNsYYY41xxnhjgjHRmGRMNqYYU41pxnRjhjHTmGXMNuYYc415xnzjP2OBsdBYZCw2lhhLjWXGcmOFsdJYZaw21hhrjXXGemODsdHYZGw2thhbjW3GdmOHsdPYZew29hh7jX3GfuOAcdA4ZBw2jhhHjWPGceOEcdI4ZZw2zhhnjXPGeeOCcdG4ZFw2rhhXjWvGdeOGcdO4Zdw27hh3jXvGfeOB8dB4ZDw2nhhPjWfGc+OF8dJ4Zbw23hhvjXfGe+OD8dH4ZHw2vhhfjW/Gd+OH8dP4Zfw2/hh/jX9GnJnMTG6mMFOaqczUZryZxkxrpjPTmxnMjGYmM7OZxcxqZjOzmznMnGYuM7eZx8xrJpj5zPxmAbOgWcgsbBYxi5rFzOJmCbOkWcosbZYxy5rlzPJmBbOiWcmsbFYxq5rVzOpmDbOmWcusbdYxE826Zj2zvtnAbGg2MhubTcymZjOzudnCbGm2Mlubbcy2ZjuzvdnB7Gh2MjubXcyuZjezu4mZuAlMaBImaVImbTIma3ImbwqmaEqmbCqmamqmbhqmaVqmbTqma3qmbwZmaEYmMmNmD7On2cvsbfYx+5r9zP7mADPJHGgOMgebQ8yh5jBzuDnCHGmOMkebY8yx5jhzvDnBnGhOMiebU8yp5jRzujnDnGnOMmebc8y55jxzvvmfucBcaC4yF5tLzKXmMnO5ucJcaa4yV5trzLXmOnO9ucHcaG4yN5tbzK3mNnO7ucPcae4yd5t7zL3mPnO/ecA8aB4yD5tHzKPmMfO4ecI8aZ4yT5tnzLPmOfO8ecG8aF4yL5tXzKvmNfO6ecO8ad4yb5t3zLvmPfO++cB8aD4yH5tPzKfmM/O5+cJ8ab4yX5tvzLfmO/O9+cH8aH4yP5tfzK/mN/O7+cP8af4yf5t/zL/mPzPOSmYlt1JYKa1UVmor3kpjpbXSWemtDFZGK5OV2cpiZbWyWdmtHFZOK5eV28pj5bUSrHxWfquAVdAqZBW2ilhFrWJWcauEVdIqZZW2ylhlrXJWeauCVdGqZFW2qlhVrWpWdauGVdOqZdW26liJVl2rnlXfamA1tBpZja0mVlOrmdXcamG1tFpZra02VlurndXe6mB1tDpZna0uVlerm9XdwizcAha0CIu0KIu2GIu1OIu3BEu0JEu2FEu1NEu3DMu0LMu2HMu1PMu3Aiu0IgtZMauH1dPqZfW2+lh9rX5Wf2uAlWQNtAZZg60h1lBrmDXcGmGNtEZZo60x1lhrnDXemmBNtCZZk60p1lRrmjXdmmHNtGZZs6051lxrnjXf+s9aYC20FlmLrSXWUmuZtdxaYa20VlmrrTXWWmudtd7aYG20NlmbrS3WVmubtd3aYe20dlm7rT3WXmuftd86YB20DlmHrSPWUeuYddw6YZ20TlmnrTPWWeucdd66YF20LlmXrSvWVeuadd
26Yd20blm3rTvWXeuedd96YD20HlmPrSfWU+uZ9dx6Yb20XlmvrTfWW+ud9d76YH20PlmfrS/WV+ub9d36Yf20flm/rT/WX+ufFWcns5PbKeyUdio7tR1vp7HT2uns9HYGO6Odyc5sZ7Gz2tns7HYOO6edy85t57Hz2gl2Pju/XcAuaBeyC9tF7KJ2Mbu4XcIuaZeyS9tl7LJ2Obu8XcGuaFeyK9tV7Kp2Nbu6XcOuadeya9t17ES7rl3Prm83sBvajezGdhO7qd3Mbm63sFvarezWdhu7rd3Obm93sDvanezOdhe7q93N7m5jNm4DG9qETdqUTduMzdqczduCLdqSLduKrdqarduGbdqWbduO7dqe7duBHdqRjeyY3cPuafeye9t97L52P7u/PcBOsgfag+zB9hB7qD3MHm6PsEfao+zR9hh7rD3OHm9PsCfak+zJ9hR7qj3Nnm7PsGfas+zZ9hx7rj3Pnh8XFxdnL7QX2YvtJfZSe5m93F5hr7RX2avtNfZae5293t5gb7Q32ZvtLfZWe5u93d5h77R32bvtPfZee5+93z5gH7QP2YftI/ZR+5h93D5hn7RP2aftM/ZZ+5x93r5gX7Qv2ZftK/ZV+5p93b5h37Rv2bftO/Zd+559335gP7Qf2Y/tJ/ZT+5n93H5hv7Rf2a/tN/Zb+5393v5gf7Q/2Z/tL/ZX+5v93f5h/7R/2b/tP/Zf+58d5yRzkjspnJROKie1E++kcdI66Zz0TgYno5PJyexkcbI62ZzsTg4np5PLye3kcfI6CU4+J79TwCnoFHIKO0Wcok4xp7hTwinplHJKO2Wcsk45p7xTwanoVHIqO1Wcqk41p7pTw6np1HJqO3WcRKeuU8+p7zRwGjqNnMZOE6ep08xp7rRwWjqtnNZOG6et085p73RwOjqdnM5OF6er083p7mAO7gAHOoRDOpRDO4zDOpzDO4IjOpIjO4qjOpqjO4ZjOpZjO47jOp7jO4ETOpGDnJjTw+np9HJ6O32cvk4/p78zwElyBjqDnMHOEGeoM8wZ7oxwRjqjnNHOGGesM84Z70xwJjqTnMnOFGeqM82Z7sxwZjqznNnOHGeuM8+Z7/znLHAWOoucxc4SZ6mzzFnurHBWOquc1c4aZ62zzlnvbHA2Opuczc4WZ6uzzdnu7HB2Oruc3c4eZ6+zz9nvHHAOOoecw84R56hzzDnunHBOOqec084Z56xzzjnvXHAuOpecy84V56pzzbnu3HBuOrec284d565zz7nvPHAeOo+cx84T56nzzHnuvHBeOq+c184b563zznnvfHA+Op+cz84X56vzzfnu/HB+Or+c384f56/zz4lzk7nJ3RRuSjeVm9qNd9O4ad10bno3g5vRzeRmdrO4Wd1sbnY3h5vTzeXmdvO4ed0EN5+b3y3gFnQLuYXdIm5Rt5hb3C3hlnRLuaXdMm5Zt5xb3q3gVnQruZXdKm5Vt5pb3a3h1nRrubXdOm6iW9et59Z3G7gN3UZuY7eJ29Rt5jZ3W7gt3VZua7eN29Zt57Z3O7gd3U5uZ7eL29Xt5nZ3MRd3gQtdwiVdyqVdxmVdzuVdwRVdyZVdxVVdzdVdwzVdy7Vdx3Vdz/XdwA3dyEVuzO3h9nR7ub3dPm5ft5/b3x3gJrkD3UHuYHeIO9Qd5g53R7gj3VHuaHeMO9Yd5453J7gT3UnuZHeKO9Wd5k53Z7gz3VnubHeOO9ed5853/3MXuAvdRe5id4m71F3mLndXuCvdVe5qd4271l3nrnc3uBvdTe5md4u71d3mbnd3uDvdXe5ud4+7193n7ncPuAfdQ+5h94h71D3mHndPuCfdU+5p94x71j3nnncvuBfdS+5l94p71b3mXndvuDfdW+5t9457173n3ncfuA/dR+5j94n71H3mPndfuC/dV+5r94371n3nvnc/uB/dT+5n94v71f3mfnd/uD/dX+5v94/71/3nxnnJvOReCi+ll8pL7cV7aby0XjovvZfBy+
hl8jJ7WbysXjYvu5fDy+nl8nJ7eby8XoKXz8vvFfAKeoW8wl4Rr6hXzCvulfBKeqW80l4Zr6xXzivvVfAqepW8yl4Vr6pXzavu1fBqerW82l4dL9Gr69Xz6nsNvIZeI6+x18Rr6jXzmnstvJZeK6+118Zr67Xz2nsdvI5eJ6+z18Xr6nXzunuYh3vAgx7hkR7l0R7jsR7n8Z7giZ7kyZ7iqZ7m6Z7hmZ7l2Z7juZ7n+V7ghV7kIS/m9fB6er283l4fr6/Xz+vvDfCSvIHeIG+wN8Qb6g3zhnsjvJHeKG+0N8Yb643zxnsTvIneJG+yN8Wb6k3zpnszvJneLG+2N8eb683z5nv/eQu8hd4ib7G3xFvqLfOWeyu8ld4qb7W3xlvrrfPWexu8jd4mb7O3xdvqbfO2ezu8nd4ub7e3x9vr7fP2ewe8g94h77B3xDvqHfOOeye8k94p77R3xjvrnfPOexe8i94l77J3xbvqXfOueze8m94t77Z3x7vr3fPuew+8h94j77H3xHvqPfOeey+8l94r77X3xnvrvfPeex+8j94n77P3xfvqffO+ez+8n94v77f3x/vr/fPi/GR+cj+Fn9JP5af24/00flo/nZ/ez+Bn9DP5mf0sflY/m5/dz+Hn9HP5uf08fl4/wc/n5/cL+AX9Qn5hv4hf1C/mF/dL+CX9Un5pv4xf1i/nl/cr+BX9Sn5lv4pf1a/mV/dr+DX9Wn5tv46f6Nf16/n1/QZ+Q7+R39hv4jf1m/nN/RZ+S7+V39pv47f12/nt/Q5+R7+T39nv4nf1u/ndfczHfeBDn/BJn/Jpn/FZn/N5X/BFX/JlX/FVX/N13/BN3/Jt3/Fd3/N9P/BDP/KRH/N7+D39Xn5vv4/f1+/n9/cH+En+QH+QP9gf4g/1h/nD/RH+SH+UP9of44/1x/nj/Qn+RH+SP9mf4k/1p/nT/Rn+TH+WP9uf48/15/nz/f/8Bf5Cf5G/2F/iL/WX+cni/i9ptb/GX+uv89f7G/yN/iZ/s7/F3+pv87f7O/yd/i5/t7/H3+vv8/f7B/yD/iH/sH/EP+of84/7J/yT/in/tH/GP+uf88/7F/yL/iX/sn/Fv+pf86/7N/yb/i3/tn/Hv+vf8+/7D/yH/iP/sf/Ef+o/85/7L/yX/iv/tf/Gf+u/89/7H/yP/if/s//F/+p/87/7P/yf/i//t//H/+v/8+OCZEHyIEWQMkgVpA7igzRB2iBdkD7IEGQMMgWZgyxB1iBbkD3IEeQMcgW5gzxB3iAhyBfkDwoEBYNCQeGgSFA0KBYUD0oEJYNSQemgTFA2KBeUDyoEFYNKQeWgSlA1qBZUD2oENYNaQe2gTpAY1A3qBfWDBkHDoFHQOGgSNA2aBc2DFkHLoFXQOmgTtA3aBe2DDkHHoFPQOegSdA26Bd0DLMADEMCACMiACuiACdiAC/hACMRACuRACdRAC/TACMzACuzACdzAC/wgCMIgClAQC3oEPYNeQe+gT9A36Bf0DwYEScHAYFAwOBgSDA2GBcODEcHIYFQwOhgTjA3GBeODCcHEYFIwOZgSTA2mBdODGcHMYFYwO5gTzA3mBfOD/4IFwcJgUbA4WBIsDZYFy4MVwcpgVbA6WBOsDdYF64MNwcZgU7A52BJsDbYF24Mdwc5gV7A72BPsDfYF+4MDwcHgUHA4OBIcDY4Fx4MTwcngVHA6OBOcDc4F54MLwcXgUnA5uBJcDa4F14Mbwc3gVnA7uBPcDe4F94MHwcPgUfA4eBI8DZ4Fz4MXwcvgVfA6eBO8Dd4F74MPwcfgU/A5+BJ8Db4F34Mfwc/gV/A7+BP8Df4FcWGyMHmYIkwZpgpTh/FhmjBtmC5MH2YIM4aZwsxhljBrmC3MHuYIc4a5wtxhnjBvmBDmC/OHBcKCYaGwcFgkLBoWC4uHJcKSYamwdFgmLBuWC8uHFcKKYaWwclglrBpWC6uHNcKaYa2wdlgnTAzrhvXC+mGDsGHYKGwcNgmbhs
3C5mGLsGXYKmwdtgnbhu3C9mGHsGPYKewcdgm7ht3C7iEW4iEIYUiEZEiFdMiEbMiFfCiEYiiFcqiEaqiFemiEZmiFduiEbuiFfhiEYRiFKIyFPcKeYa+wd9gn7Bv2C/uHA8KkcGA4KBwcDgmHhsPC4eGIcGQ4KhwdjgnHhuPC8eGEcGI4KZwcTgmnhtPC6eGMcGY4K5wdzgnnhvPC+eF/4YJwYbgoXBwuCZeGy8Ll4YpwZbgqXB2uCdeG68L14YZwY7gp3BxuCbeG28Lt4Y5wZ7gr3B3uCfeG+8L94YHwYHgoPBweCY+Gx8Lj4YnwZHgqPB2eCc+G58Lz4YXwYngpvBxeCa+G18Lr4Y3wZngrvB3eCe+G98L74YPwYfgofBw+CZ+Gz8Ln4YvwZfgqfB2+Cd+G78L34YfwY/gp/Bx+Cb+G38Lv4Y/wZ/gr/B3+Cf+G/8K4KFmUPEoRpYxSRamj+ChNlDZKF6WPMkQZo0xR5ihLlDXKFmWPckQ5o1xR7ihPlDdKiPJF+aMCUcGoUFQ4KhIVjYpFxaMSUcmoVFQ6KhOVjcpF5aMKUcWoUlQ5qhJVjapF1aMaUc2oVlQ7qhMlRnWjelH9qEHUMGoUNY6aRE2jZlHzqEXUMmoVtY7aRG2jdlH7qEPUMeoUdY66RF2jblH3CIvwCEQwIiIyoiI6YiI24iI+EiIxkiI5UiI10iI9MiIzsiI7ciI38iI/CqIwiiIUxaIeUc+oV9Q76hP1jfpF/aMBUVI0MBoUDY6GREOjYdHwaEQ0MhoVjY7GRGOjcdH4aEI0MZoUTY6mRFOjadH0aEY0M5oVzY7mRHOjedH86L9oQbQwWhQtjpZES6Nl0fJoRbQyWhWtjtZEa6N10fpoQ7Qx2hRtjrZEW6Nt0fZoR7Qz2hXtjvZEe6N90f7oQHQwOhQdjo5ER6Nj0fHoRHQyOhWdjs5EZ6Nz0fnoQnQxuhRdjq5EV6Nr0fXoRnQzuhXdju5Ed6N70f3oQfQwehQ9jp5ET6Nn0fPoRfQyehW9jt5Eb6N30fvoQ/Qx+hR9jr5EX6Nv0ffoR/Qz+hX9jv5Ef6N/URxKhpKjFCglSoVSo3iUBqVF6VB6lAFlRJlQZpQFZUXZUHaUA+VEuVBulAflRQkoH8qPCqCCqBAqjIqgoqgYKo5KoJKoFCqNyqCyqBwqjyqgiqgSqoyqoKqoGqqOaqCaqBaqjeqgRFQX1UP1UQPUEDVCjVET1BQ1Q81RC9QStUKtURvUFrVD7VEH1BF1Qp1RF9QVdUPdEYZwBBBEBCIRhWjEIBZxiEcCEpGEZKQgFWlIRwYykYVs5CAXechHAQpRhBCKoR6oJ+qFeqM+qC/qh/qjASgJDUSD0GA0BA1Fw9BwNAKNRKPQaDQGjUXj0Hg0AU1Ek9BkNAVNRdPQdDQDzUSz0Gw0B81F89B89B9agBaiRWgxWoKWomVoOVqBVqJVaDVag9aidWg92oA2ok1oM9qCtqJtaDvagXaiXWg32oP2on1oPzqADqJD6DA6go6iY+g4OoFOolPoNDqDzqJz6Dy6gC6iS+gyuoKuomvoOrqBbqJb6Da6g+6ie+g+eoAeokfoMXqCnqJn6Dl6gV6iV+g1eoPeonfoPfqAPqJP6DP6gr6ib+g7+oF+ol/oN/qD/qJ/KC6WLJY8liKWMpYqljoWH0sTSxtLF0sfyxDLGMsUyxzLEssayxbLHssRyxnLFcsdyxPLG0uI5YvljxWIFYwVihWOFYkVjRWLFY+ViJWMlYqVjpWJlY2Vi5WPVYhVjFWKVY5ViVWNVYtVj9WI1YzVitWO1YklxurG6sXqxxrEGsYaxRrHmsSaxprFmsdaxFrGWsVax9rE2sbaxdrHOsQ6xjrFOse6xLrGusW6x7AYHgMxGPsfyfbYoGWzAAB4s73Ztm3brt0Nb7a7bVsz82Tbtm3btu06H84PuRKwRCwJ64f1xwZgA7H/sEHYYGwINhQbhg3HRmAjsVHYaGwMNhYbh43HJm
ATsUnYZGwKNhXDMBwjMBKjMBpjMBbjMB4TMBGTMBlTMBXTMB0zMBOzMBtzMBfzMB8LsBCLMIBBDGExbBo2HZuBzcRmYbOxOdhcbB42H1uALcQWYYuxJdhSbBm2HFuBrcRWYauxNdhabB22HtuAbcQ2YZuxLdhWbBu2HduB7cR2YbuxPdhebB+2HzuAHcQOYYexI9hR7Bh2HDuBncROYaexM9hZ7Bx2HruAXcQuYZexK9hV7Bp2HbuB3cRuYbexO9hd7B52H3uAPcQeYY+xJ9hT7Bn2HHuBvcReYa+xN9hb7B32HvuAfcQ+YZ+xL9hX7Bv2HfuB/cR+Yb+xP9hf7B8WhyfDk+Mp8JR4Kjw1ngZPi6fD0+MZ8Ix4JjwzngXPimfDs+M58Hg8J54Lz43nwfPi+fD8eAG8IF4IL4wXwYvixfDieAm8JF4KL42Xwcvi5fDyeAW8Il4Jr4xXwavi1fDqeA28Jl4Lr43Xwevi9fD6eAO8Id4Ib4w3wZvizfDmeAu8Jd4Kb423wdvi7fD2eAe8I94J74x3wbvi3fDueA+8J94L7433wfviCXginoT3w/vjA/CB+H/4IHwwPgQfig/Dh+Mj8JH4KHw0PgYfi4/Dx+MT8In4JHwyPgWfimM4jhM4iVM4jTM4i3M4jwu4iEu4jCu4imu4jhu4iVu4jTu4i3u4jwd4iEc4wCGO8Bg+DZ+Oz8Bn4rPw2fgcfC4+D5+PL8AX4ovwxfgSfCm+DF+Or8BX4qvw1fgafC2+Dl+Pb8A34pvwzfgWfCu+Dd+O78B34rvw3fgefC++D9+PH8AP4ofww/gR/Ch+DD+On8BP4qfw0/gZ/Cx+Dj+PX8Av4pfwy/gV/Cp+Db+O38Bv4rfw2/gd/C5+D7+PP8Af4o/wx/gT/Cn+DH+Ov8Bf4q/w1/gb/C3+Dn+Pf8A/4p/wz/gX/Cv+Df+O/8B/4r/w3/gf/C/+D48jkhHJiRRESiIVkZpIQ6Ql0hHpiQxERiITkZnIQmQlshHZiRxEPJGTyEXkJvIQeYl8RH6iAFGQKEQUJooQRYliRHGiBFGSKEWUJsoQZYlyRHmiAlGRqERUJqoQVYlqRHWiBlGTqEXUJuoQdYl6RH2iAdGQaEQ0JpoQTYlmRHOiBdGSaEW0JtoQbYl2RHuiA9GR6ER0JroQXYluRHeiB9GT6EX0JvoQfYkEIpFIIvoR/YkBxEDiP2IQMZgYQgwlhhHDiRHESGIUMZoYQ4wlxhHjiQnERGISMZmYQkwlMAInCIIkKIImGIIlOIInBEIkJEImFEIlNEInDMIkLMImHMIlPMInAiIkIgIQkEBEjJhGTCdmEDOJWcRsYg4xl5hHzCcWEAuJRcRiYgmxlFhGLCdWECuJVcRqYg2xllhHrCc2EBuJTcRmYguxldhGbCd2EDuJXcRuYg+xl9hH7CcOEAeJQ8Rh4ghxlDhGHCdOECeJU8Rp4gxxljhHnCcuEBeJS8Rl4gpxlbhGXCduEDeJW8Rt4g5xl7hH3CceEA+JR8Rj4gnxlHhGPCdeEC+JV8Rr4g3xlnhHvCc+EB+JT8Rn4gvxlfhGfCd+ED+JX8Rv4g/xl/hHxJHJyORkCjIlmYpMTaYh05LpyPRkBjIjmYnMTGYhs5LZyOxkDjKezEnmInOTeci8ZD4yP1mALEgWIguTRciiZDGyOFmCLEmWIkuTZciyZDmyPFmBrEhWIiuTVciqZDWyOlmDrEnWImuTdci6ZD2yPtmAbEg2IhuTTcimZDOyOdmCbEm2IluTbci2ZDuyPdmB7Eh2IjuTXciuZDeyO9mD7En2InuTfci+ZAKZSCaR/cj+5AByIPkfOYgcTA4hh5LDyOHkCHIkOYocTY4hx5LjyPHkBHIiOYmcTE4hp5IYiZMESZIUSZMMyZIcyZMCKZISKZMKqZIaqZMGaZIWaZMO6ZIe6ZMBGZIRCUhIIjJGTiOnkzPImeQscjY5h5xLziPnkwvIheQicjG5hFxKLiOXkyvIleQqcj
W5hlxLriPXkxvIjeQmcjO5hdxKbiO3kzvIneQucje5h9xL7iP3kwfIg+Qh8jB5hDxKHiOPkyfIk+Qp8jR5hjxLniPPkxfIi+Ql8jJ5hbxKXiOvkzfIm+Qt8jZ5h7xL3iPvkw/Ih+Qj8jH5hHxKPiOfky/Il+Qr8jX5hnxLviPfkx/Ij+Qn8jP5hfxKfiO/kz/In+Qv8jf5h/xL/iPjqGRUcioFlZJKRaWm0lBpqXRUeioDlZHKRGWmslBZqWxUdioHFU/lpHJRuak8VF4qH5WfKkAVpApRhakiVFGqGFWcKkGVpEpRpakyVFmqHFWeqkBVpCpRlakqVFWqGlWdqkHVpGpRtak6VF2qHlWfakA1pBpRjakmVFOqGdWcakG1pFpRrak2VFuqHdWe6kB1pDpRnakuVFeqG9Wd6kH1pHpRvak+VF8qgUqkkqh+VH9qADWQ+o8aRA2mhlBDqWHUcGoENZIaRY2mxlBjqXHUeGoCNZGaRE2mplBTKYzCKYIiKYqiKYZiKY7iKYESKYmSKYVSKY3SKYMyKYuyKYdyKY/yqYAKqYgCFKQQFaOmUdOpGdRMahY1m5pDzaXmUfOpBdRCahG1mFpCLaWWUcupFdRKahW1mlpDraXWUeupDdRGahO1mdpCbaW2UdupHdROahe1m9pD7aX2UfupA9RB6hB1mDpCHaWOUcepE9RJ6hR1mjpDnaXOUeepC9RF6hJ1mbpCXaWuUdepG9RN6hZ1m7pD3aXuUfepB9RD6hH1mHpCPaWeUc+pF9RL6hX1mnpDvaXeUe+pD9RH6hP1mfpCfaW+Ud+pH9RP6hf1m/pD/aX+UXF0Mjo5nYJOSaeiU9Np6LR0Ojo9nYHOSGeiM9NZ6Kx0Njo7nYOOp3PSuejcdB46L52Pzk8XoAvShejCdBG6KF2MLk6XoEvSpejSdBm6LF2OLk9XoCvSlejKdBW6Kl2Nrk7XoGvStejadB26Ll2Prk83oBvSjejGdBO6Kd2Mbk63oFvSrejWdBu6Ld2Obk93oDvSnejOdBe6K92N7k73oHvSvejedB+6L51AJ9JJdD+6Pz2AHkj/Rw+iB9ND6KH0MHo4PYIeSY+iR9Nj6LH0OHo8PYGeSE+iJ9NT6Kk0RuM0QZM0RdM0Q7M0R/O0QIu0RMu0Qqu0Ruu0QZu0Rdu0Q7u0R/t0QId0RAMa0oiO0dPo6fQMeiY9i55Nz6Hn0vPo+fQCeiG9iF5ML6GX0svo5fQKeiW9il5Nr6HX0uvo9fQGeiO9id5Mb6G30tvo7fQOeie9i95N76H30vvo/fQB+iB9iD5MH6GP0sfo4/QJ+iR9ij5Nn6HP0ufo8/QF+iJ9ib5MX6Gv0tfo6/QN+iZ9i75N36Hv0vfo+/QD+iH9iH5MP6Gf0s/o5/QL+iX9in5Nv6Hf0u/o9/QH+iP9if5Mf6G/0t/o7/QP+if9i/5N/6H/0v/oOCYZk5xJwaRkUjGpmTRMWiYdk57JwGRkMjGZmSxMViYbk53JwcQzOZlcTG4mD5OXycfkZwowBZlCTGGmCFOUKcYUZ0owJZlSTGmmDFOWKceUZyowFZlKTGWmClOVqcZUZ2owNZlaTG2mDlOXqcfUZxowDZlGTGOmCdOUacY0Z1owLZlWTGumDdOWace0ZzowHZlOTGemC9OV6cZ0Z3owPZleTG+mD9OXSWASmSSmH9OfGcAMZP5jBjGDmSHMUGYYM5wZwYxkRjGjmTHMWGYcM56ZwExkJjGTmSnMVAZjcIZgSIZiaIZhWIZjeEZgREZiZEZhVEZjdMZgTMZibMZhXMZjfCZgQiZiAAMZxMSYacx0ZgYzk5nFzGbmMHOZecx8ZgGzkFnELGaWMEuZZcxyZgWzklnFrGbWMGuZdcx6ZgOzkdnEbGa2MFuZbcx2Zgezk9nF7Gb2MHuZfcx+5gBzkDnEHGaOMEeZY8xx5gRzkjnFnGbOMGeZc8x55gJzkbnEXGauMFeZa8x15gZzk7nF3GbuMHeZe8x95g
HzkHnEPGaeME+ZZ8xz5gXzknnFvGbeMG+Zd8x75gPzkfnEfGa+MF+Zb8x35gfzk/nF/Gb+MH+Zf0wcm4xNzqZgU7Kp2NRsGjYtm45Nz2ZgM7KZ2MxsFjYrm43NzuZg49mcbC42N5uHzcvmY/OzBdiCbCG2MFuELcoWY4uzJdiSbCm2NFuGLcuWY8uzFdiKbCW2MluFrcpWY6uzNdiabC22NluHrcvWY+uzDdiGbCO2MduEbco2Y5uzLdiWbCu2NduGbcu2Y9uzHdiObCe2M9uF7cp2Y7uzPdiebC+2N9uH7csmsIlsEtuP7c8OYAey/7GD2MHsEHYoO4wdzo5gR7Kj2NHsGHYsO44dz05gJ7KT2MnsFHYqi7E4S7AkS7E0y7Asy7E8K7AiK7Eyq7Aqq7E6a7Ama7E267Au67E+G7AhG7GAhSxiY+w0djo7g53JzmJns3PYuew8dj67gF3ILmIXs0vYpewydjm7gl3J8nGr2TXsWnYdu57dwG5kN7Gb2S3sVnYbu53dwe5kd7G72T3sXnYfu589wB5kD7GH2SPsUfYYe5w9wZ5kT7Gn2TPsWfYce569wF5kL7GX2SvsVfYae529wd5kb7G32TvsXfYee599wD5kH7GP2SfsU/YZ+5x9wb5kX7Gv2TfsW/Yd+579wH5kP7Gf2S/sV/Yb+539wf5kf7G/2T/sX/YfG8cl45JzKbiUXCouNZeGS8ul49JzGbiMXCYuM5eFy8pl47JzObh4LieXi8vN5eHycvm4/FwBriBXiCvMFeGKcsW44lwJriRXiivNleHKcuW48lwFriJXiavMVeGqctW46lwNriZXi6vN1eHqcvW4+lwDriHXiGvMNeGacs245lwLriXXimvNteHacu249lwHriPXievMdeG6ct247lwPrifXi+vN9eH6cglcIpfE9eP6cwO4gdx/3CBuMDeEG8oN44ZzI7iR3ChuNDeGG8uN48ZzE7iJ3CRuMjeFm8phHM4RHMlRHM0xHMtxHM8JnMhJnMwpnMppnM4ZnMlZnM05nMt5nM8FXMhFHOAgh7gYN42bzs3gZnKzuNncHG4uN4+bzy3gFnKLuMXcEm4pt4xbzq3gVnKruNXcGm4tt45bz23gNnKbuM3cFm4rt43bzu3gdnK7uN3cHm4vt4/bzx3gDnKHuMPcEe4od4w7zp3gTnKnuNPcGe4sd447z13gLnKXuMvcFe4qd427zt3gbnK3uNvcHe4ud4+7zz3gHnKPuMfcE+4p94x7zr3gXnKvuNfcG+4t9457z33gPnKfuM/cF+4r9437zv3gfnK/uN/cH+4v94+L45PxyfkUfEo+FZ+aT8On5dPx6fkMfEY+E5+Zz8Jn5bPx2fkcfDyfk8/F5+bz8Hn5fHx+vgBfkC/EF+aL8EX5YnxxvgRfki/Fl+bL8GX5cnx5vgJfka/EV+ar8FX5anx1vgZfk6/F1+br8HX5enx9vgHfkG/EN+ab8E35ZnxzvgXfkm/Ft+bb8G35dnx7vgPfke/Ed+a78F35bnx3vgffk+/F9+b78H35BD6RT+L78f35AfxA/j9+ED+YH8IP5Yfxw/kR/Eh+FD+aH8OP5cfx4/kJ/ER+Ej+Zn8JP5TEe5wme5Cme5hme5Tme5wVe5CVe5hVe5TVe5w3e5C3e5h3e5T3e5wM+5CMe8JBHfIyfxk/nZ/Az+Vn8bH4OP5efx8/nF/AL+UX8Yn4Jv5Rfxi/nV/Ar+VX8an4Nv5Zfx6/nN/Ab+U38Zn4Lv5Xfxm/nd/A7+V38bn4Pv5ffx+/nD/AH+UP8Yf4If5Q/xh/nT/An+VP8af4Mf5Y/x5/nL/AX+Uv8Zf4Kf5W/xl/nb/A3+Vv8bf4Of5e/x9/nH/AP+Uf8Y/4J/5R/xj/nX/Av+Vf8a/4N/5Z/x7/nP/Af+U/8Z/4L/5X/xn/nf/A/+V/8b/4P/5f/x8cJyYTkQgohpZBKSC2kEdIK6YT0QgYho5BJyCxkEbIK2YTsQg4hXsgp5B
JyC3mEvEI+Ib9QQCgoFBIKC0WEokIxobhQQigplBJKC2WEskI5obxQQagoVBIqC1WEqkI1obpQQ6gp1BJqC3WEukI9ob7QQGgoNBIaC02EpkIzobnQQmgptBJaC22EtkI7ob3QQegodBI6C12ErkI3obvQQ+gp9BJ6C32EvkKCkCgkCf2E/sIAYaDwnzBIGCwMEYYKw4ThwghhpDBKGC2MEcYK44TxwgRhojBJmCxMEaYKmIALhEAKlEALjMAKnMALgiAKkiALiqAKmqALhmAKlmALjuAKnuALgRAKkQAEKCAhJkwTpgszhJnCLGG2MEeYK8wT5gsLhIXCImGxsERYKiwTlgsrhJXCKmG1sEZYK6wT1gsbhI3CJmGzsEXYKmwTtgs7hJ3CLmG3sEfYK+wT9gsHhIPCIeGwcEQ4KhwTjgsnhJPCKeG0cEY4K5wTzgsXhIvCJeGycEW4KlwTrgs3hJvCLeG2cEe4K9wT7gsPhIfCI+Gx8ER4KjwTngsvhJfCK+G18EZ4K7wT3gsfhI/CJ+Gz8EX4KnwTvgs/hJ/CL+G38Ef4K/wT4sRkYnIxhZhSTCWmFtOIacV0Ynoxg5hRzCRmFrOIWcVsYnYxhxgv5hRzibnFPGJeMZ+YXywgFhQLiYXFImJRsZhYXCwhlhRLiaXFMmJZsZxYXqwgVhQriZXFKmJVsZpYXawh1hRribXFOmJdsZ5YX2wgNhQbiY3FJmJTsZnYXGwhthRbia3FNmJbsZ3YXuwgdhQ7iZ3FLmJXsZvYXewh9hR7ib3FPmJfMUFMFJPEfmJ/cYA4MFVcXJw4WBwiDhWHicPFEeJIcZQ4WhwjjhXHiePFCeJEcZI4WZwiThUxERcJkRQpkRYZkRU5kRcFURQlURYVURU1URcN0RQt0RYd0RU90RcDMRQjEYhQRGJMnCZOF2eIM8VZ4mxxjjhXnCfOFxeIC8VF4mJxibhUXCYuF1eIK8VV4mpxjbhWXCeuFzeIG8VN4mZxi7hV3CZuF3eIO8Vd4m5xj7hX3CfuFw+IB8VD4mHxiHhUPCYeF0+IJ8VT4mnxjHhWPCeeFy+IF8VL4mXxinhVvCZeF2+IN8Vb4m3xjnhXvCfeFx+ID8VH4mPxifhUfCY+F1+IL8VX4mvxjfhWfCe+Fz+IH8VP4mfxi/hV/CZ+F3+IP8Vf4m/xj/hX/CfGScmk5FIKKaWUSkotpZHSSumk9FIGKaOUScosZZGyStmk7FIOKV7KKeWSckt5pLxSPim/VEAqKBWSCktFpKJSMam4VEIqKZWSSktlpLJSOam8VEGqKFWSKktVpKpSNam6VEOqKdWSakt1pLpSPam+1EBqKDWSGktNpKZSM6m51EJqKbWSWkttpLZSO6m91EHqKHWSOktdpK5SN6m71EPqKfWSekt9pL5SgpQoJUn9pP7SAGmg9J80SBosDZGGSsOk4dIIaaQ0ShotjZHGSuOk8dIEaaI0SZosTZGmSpiES4RESpRES4zESpzES4IkSpIkS4qkSpqkS4ZkSpZkS47kSp7kS4EUSpEEJCghKSZNk6ZLM6SZ0ixptjRHmivNk+ZLC6SF0iJpsbREWiotk5ZLK6SV0ipptbRGWiutk9ZLG6SN0iZps7RF2iptk7ZLO6Sd0i5pt7RH2ivtk/ZLB6SD0iHpsHREOiodk45LJ6ST0inptHRGOiudk85LF6SL0iXpsnRFuipdk65LN6Sb0i3ptnRHuivdk+5LD6SH0iPpsfREeio9k55LL6SX0ivptfRGeiu9k95LH6SP0ifps/RF+ip9k75LP6Sf0i/pt/RH+iv9k+LkZHJyOYWcUk4lp5bTyGnldHJ6OYOcUc4kZ5azyFnlbHJ2OYccL+eUc8m55TxyXjmfnF8uIBeUC8mF5SJyUbmYXFwuIZeUS8ml5TJyWbmcXF6uIFeUK8mV5SpyVbmaXF2uIdeUa8m15TpyXbmeXF9uIDeUG8mN5SZyU7mZ3FxuIbeUW8mt5TZyW7md3F
7uIHeUO8md5S5yV7mb3F3uIfeUe8m95T5yXzlBTpST5H5yf3mAPFD+Tx4kD5aHyEPlYfJweYQ8Uh4lj5bHyGPlcfJ4eYI8UZ4kT5anyFNlTMZlQiZlSqZlRmZlTuZlQRZlSZZlRVZlTdZlQzZlS7ZlR3ZlT/blQA7lSAYylJEck6fJ0+UZ8kx5ljxbniPPlefJ8+UF8kJ5kbxYXiIvlZfJy+UV8kp5lbxaXiOvldfJ6+UN8kZ5k7xZ3iJvlbfJ2+Ud8k55l7xb3iPvlffJ++UD8kH5kHxYPiIflY/Jx+UT8kn5lHxaPiOflc/J5+UL8kX5knxZviJfla/J1+Ub8k35lnxbviPfle/J9+UH8kP5kfxYfiI/lZ/Jz+UX8kv5lfxafiO/ld/J7+UP8kf5k/xZ/iJ/lb/J3+Uf8k/5l/xb/iP/lf/JcUoyJbmSQkmppFJSK2mUtEo6Jb2SQcmoZFIyK1mUrEo2JbuSQ4lXciq5lNxKHiWvkk/JrxRQCiqFlMJKEaWoUkwprpRQSiqllNJKGaWsUk4pr1RQKiqVlMpKFaWqUk2prtRQaiq1lNpKHaWuUk+przRQGiqNlMZKE6Wp0kxprrRQWiqtlNZKG6Wt0k5pr3RQOiqdlM5KF6Wr0k3prvRQeiq9lN5KH6WvkqAkKklKP6W/MkAZqPynDFIGK0OUocowZbgyQhmpjFJGK2OUsco4ZbwyQZmoTFImK1OUqQqm4AqhkAql0AqjsAqn8IqgiIqkyIqiqIqm6IqhmIql2IqjuIqn+EqghEqkAAUqSIkp05TpygxlpjJLma3MUeYq85T5ygJlobJIWawsUZYqy5TlygplpbJKWa2sUdYq65T1ygZlo7JJ2axsUbYq25Ttyg5lp7JL2a3sUfYq+5T9ygHloHJIOawcUY4qx5TjygnlpHJKOa2cUc4q55TzygXlonJJuaxcUa4q15Tryg3lpnJLua3cUe4q95T7ygPlofJIeaw8UZ4qz5TnygvlpfJKea28Ud4q75T3ygflo/JJ+ax8Ub4q35Tvyg/lp/JL+a38Uf4q/5Q4NZmaXE2hplRTqanVNGpaNZ2aXs2gZlQzqZnVLGpWNZuaXc2hxqs51VxqbjWPmlfNp+ZXC6gF1UJqYbWIWlQtphZXS6gl1VJqabWMWlYtp5ZXK6gV1UpqZbWKWlWtplZXa6g11VpqbbWOWletp9ZXG6gN1UZqY7WJ2lRtpjZXW6gt1VZqa7WN2lZtp7ZXO6gd1U5qZ7WL2lXtpnZXe6g91V5qb7WP2ldNUBPVJLWf2l8doA5U/1MHqYPVIepQdZg6XB2hjlRHqaPVMepYdZw6Xp2gTlQnqZPVKepUFVNxlVBJlVJplVFZlVN5VVBFVVJlVVFVVVN11VBN1VJt1VFd1VN9NVBDNVKBClWkxtRp6nR1hjpTnaXOVueoc9V56nx1gbpQXaQuVpeoS9Vl6nJ1hbpSXaWuVteoa9V16np1g7pR3aRuVreoW9Vt6nZ1h7pT3aXuVveoe9V96n71gHpQPaQeVo+oR9Vj6nH1hHpSPaWeVs+oZ9Vz6nn1gnpRvaReVq+oV9Vr6nX1hnpTvaXeVu+od9V76n31gfpQfaQ+Vp+oT9Vn6nP1hfpSfaW+Vt+ob9V36nv1g/pR/aR+Vr+oX9Vv6nf1h/pT/aX+Vv+of9V/apyWTEuupdBSaqm01FoaLa2WTkuvZdAyapm0zFoWLauWTcuu5dDitZxaLi23lkfLq+XT8msFtIJaIa2wVkQrqhXTimsltJJaKa20VkYrq5XTymsVtIpaJa2yVkWrqlXTqms1tJpaLa22Vkerq9XT6msNtIZaI62x1kRrqjXTmmsttJZaK6211kZrq7XT2msdtI5aJ62z1kXrqnXTums9tJ5aL6231kfrqyVoiVqS1k/rrw3QBmr/aYO0wdoQbag2TBuujdBGaqO00doYbaw2ThuvTdAmapO0ydoUbaqGabhGaKRGabTGaKzGabwmaKImab
KmaKqmabpmaKZmabbmaK7mab4WaKEWaUCDGtJi2jRtujZDm6nN0mZrc7S52jxtvrZAW6gt0hZrS7Sl2jJtubZCW6mt0lZra7S12jptvbZB26ht0jZrW7St2jZtu7ZD26nt0nZre7S92j5tv3ZAO6gd0g5rR7Sj2jHtuHZCO6md0k5rZ7Sz2jntvHZBu6hd0i5rV7Sr2jXtunZDu6nd0m5rd7S72j3tvvZAe6g90h5rT7Sn2jPtufZCe6m90l5rb7S32jvtvfZB+6h90j5rX7Sv2jftu/ZD+6n90n5rf7S/2j8tTk+mJ9dT6Cn1VHpqPY2eVk+np9cz6Bn1THpmPYueVc+mZ9dz6PF6Tj2XnlvPo+fV8+n59QJ6Qb2QXlgvohfVi+nF9RJ6Sb2UXlovo5fVy+nl9Qp6Rb2SXlmvolfVq+nV9Rp6Tb2WXluvo9fV6+n19QZ6Q72R3lhvojfVm+nN9RZ6S72V3lpvo7fV2+nt9Q56R72T3lnvonfVu+nd9R56T72X3lvvo/fVE/REPUnvp/fXB+gD9f/0QfpgfYg+VB+mD9dH6CP1UfpofYw+Vh+nj9cn6BP1SfpkfYo+Vcd0XCd0Uqd0Wmd0Vud0Xhd0UZd0WVd0Vdd0XTd0U7d0W3d0V/d0Xw/0UI90oEMd6TF9mj5dn6HP1Gfps/U5+lx9nj5fX6Av1Bfpi/Ul+lJ9mb5cX6Gv1Ffpq/U1+lp9nb5e36Bv1Dfpm/Ut+lZ9m75d36Hv1Hfpu/U9+l59n75fP6Af1A/ph/Uj+lH9mH5cP6Gf1E/pp/Uz+ln9nH5ev6Bf1C/pl/Ur+lX9mn5dv6Hf1G/pt/U7+l39nn5ff6A/1B/pj/Un+lP9mf5cf6G/1F/pr/U3+lv9nf5e/6B/1D/pn/Uv+lf9m/5d/6H/1H/pv/U/+l/9nx5nJDOSGymMlEYqI7WRxkhrpDPSGxmMjEYmI7ORxchqZDOyGzmMeCOnkcvIbeQx8hr5jPxGAaOgUcgobBQxihrFjOJGCaOkUcoobZQxyhrljPJGBaOiUcmobFQxqhrVjOpGDaOmUcuobdQx6hr1jPpGA6Oh0chobDQxmhrNjOZGC6Ol0cpobbQx2hrtjPZGB6Oj0cnobHQxuhrdjO5GD6On0cvobfQx+hoJRqKRZPQz+hsDjIHGf8YgY7AxxBhqDDOGGyOMkcYoY7QxxhhrjDPGGxOMicYkY7IxxZhqYAZuEAZpUAZtMAZrcAZvCIZoSIZsKIZqaIZuGIZpWIZtOIZreIZvBEZoRAYwoIGMmDHNmG7MMGYas4zZxhxjrjHPmG8sMBYai4zFxhJjqbHMWG6sMFYaq4zVxhpjrbHOWG9sMDYam4zNxhZjq7HN2G7sMHYau4zdxh5jr7HP2G8cMA4ah4zDxhHjqHHMOG6cME4ap4zTxhnjrHHOOG9cMC4al4zLxhXjqnHNuG7cMG4at4zbxh3jrnHPuG88MB4aj4zHxhPjqfHMeG68MF4ar4zXxhvjrfHOeG98MD4an4zPxhfjq/HN+G78MH4av4zfxh/jr/HPiDOTmcnNFGZKM5WZ2kxjpjXTmenNDGZGM5OZ2cxiZjWzmdnNHGa8mdPMZeY285h5zXxmfrOAWdAsZBY2i5hFzWJmcbOEWdIsZZY2y5hlzXJmebOCWdGsZFY2q5hVzWpmdbOGWdOsZdY265h1zXpmfbOB2dBsZDY2m5hNzWZmc7OF2dJsZbY225htzXZme7OD2dHsZHY2u5hdzW5md7OH2dPsZfY2+5h9zQQz0Uwy+5n9zQHmQPM/c5A52BxiDjWHmcPNEeZIc5Q52hxjjjXHmePNCeZEc5I52ZxiTjUxEzcJkzQpkzYZkzU5kzcFUzQlUzYVUzU1UzcN0zQt0zYd0zU90zcDMzQjE5jQRGbMnGZON2eYM81Z5mxzjjnXnGfONxeYC81F5mJzibnUXGYuN1eYK81V5mpzjbnWXGeuNzeYG81N5mZzi7nV3GZuN3eYO81d5m
5zj7nX3GfuNw+YB81D5mHziHnUPGYeN0+YJ81T5mnzjHnWPGeeNy+YF81L5mXzinnVvGZeN2+YN81b5m3zjnnXvGfeNx+YD81H5mPzifnUfGY+N1+YL81X5mvzjfnWfGe+Nz+YH81P5mfzi/nV/GZ+N3+YP81f5m/zj/nX/GfGWcms5FYKK6WVykptpbHSWums9FYGK6OVycpsZbGyWtms7FYOK97KaeWyclt5rLxWPiu/VcAqaBWyCltFrKJWMau4VcIqaZWySltlrLJWOau8VcGqaFWyKltVrKpWNau6VcOqadWyalt1rLpWPau+1cBqaDWyGltNrKZWM6u51cJqabWyWlttrLZWO6u91cHqaHWyOltdrK5WN6u71cPqafWyelt9rL5WgpVoJVn9rP7WAGug9Z81yBpsDbGGWsOs4dYIa6Q1yhptjbHGWuOs8dYEa6I1yZpsTbGmWpiFW4RFWpRFW4zFWpzFW4IlWpIlW4qlWpqlW4ZlWpZlW47lWp7lW4EVWpEFLGghK2ZNs6ZbM6yZ1ixrtjXHmmvNs+ZbC6yF1iJrsbXEWmots5ZbK6yV1iprtbXGWmuts9ZbG6yN1iZrs7XF2mpts7ZbO6yd1i5rt7XH2mvts/ZbB6yD1iHrsHXEOmods45bJ6yT1inrtHXGOmuds85bF6yL1iXrsnXFumpds65bN6yb1i3rtnXHumvds+5bD6yH1iPrsfXEemo9s55bL6yX1ivrtfXGemu9s95bH6yP1ifrs/XF+mp9s75bP6yf1i/rt/XH+mv9s+LsZHZyO4Wd0k5lp7bT2GntdHZ6O4Od0c5kZ7az2FntbHZ2O4cdb+e0c9m57Tx2Xjufnd8uYBe0C9mF7SJ2UbuYXdwuYZe0S9ml7TJ2WbucXd6uYFe0K9mV7Sp2VbuaXd2uYde0a9m17Tp2XbueXd9uYDe0G9mN7SZ2U7uZ3dxuYbe0W9mt7TZ2W7ud3d7uYHe0O9md7S52V7ub3d3uYfe0e9m97T52XzvBTrST7H52f3uAPdD+zx5kD7aH2EPtYfZwe4Q90h5lj7bH2GPtcfZ4e4I90Z5kT7an2FNtzMZtwiZtyqZtxmZtzuZtwRZtyZZtxVZtzdZtwzZty7Ztx3Ztz/btwA7tyAY2tJEds6fZ0+0Z9kx7lj3bnmPPtefZ8+0F9kJ7kb3YXmIvtZfZy+0V9kp7lb3aXmOvtdfZ6+0N9kZ7k73Z3mJvtbfZ2+0d9k57l73b3mPvtffZ++0D9kH7kH3YPmIftY/Zx+0T9kn7lH3aPmOftc/Z5+0L9kX7kn3ZvmJfta/Z1+0b9k37ln3bvmPfte/Z9+0H9kP7kf3YfmI/tZ/Zz+0X9kv7lf3afmO/td/Z7+0P9kf7k/3Z/mJ/tb/Z3+0f9k/7l/3b/mP/tf/ZcU4yJ7mTwknppHJSO2mctE46J72TwcnoZHIyO1mcrE42J7uTw4l3cjq5nNxOHievk8/J7xRwCjqFnMJOEaeoU8wp7pRwSjqlnNJOGaesU84p71RwKjqVnMpOFaeqU82p7tRwajq1nNpOHaeuU8+p7zRwGjqNnMZOE6ep08xp7rRwWjqtnNZOG6et085p73RwOjqdnM5OF6er083p7vRwejq9nN5OH6evk+AkOklOP6e/M8AZ6PznDHIGO0Ococ4wZ7gzwhnpjHJGO2Ocsc44Z7wzwZnoTHImO1OcqQ7m4A7hkA7l0A7jxP9fasWJjuTIjuKojubojuGYjuXYjuO4juf4TuCETuQABzrIiTnTnOnODGemM8uZ7cxx5jrznPnOAmehs8hZ7CxxljrLnOXOCmels8pZ7axx1jrrnPXOBmejs8nZ7GxxtjrbnO3ODmens8vZ7exx9jr7nP3OAeegc8g57BxxjjrHnOPOCeekc8o57ZxxzjrnnPPOBeeic8m57FxxrjrXnOvODeemc8u57dxx7jr3nPvOA+eh88h57DxxnjrPnOfOC+el88p57bxx3jrvnPfOB+
ej88n57HxxvjrfnO/OD+en88v57fxx/jr/nDg3mZvcTeGmdFO5qd00blo3nZvezeBmdDO5md0sblY3m5vdzeHGuzndXG5uN4+b183n5ncLuAXdQm5ht4hb1C3mFndLuCXdUm5pt4xb1i3nlncruBXdSm5lt4pb1a3mVndruDXdWm5tt45b163n1ncbuA3dRm5jt4nb1G3mNndbuC3dVm5rt43b1m3ntnc7uB3dTm5nt4vb1e3mdnd7uD3dXm5vt4/b101wE90kt5/b3x3gDnT/cwe5g90h7lB3mDvcHeGOdEe5o90x7lh3nDveneBOdCe5k90p7lQXc3GXcEmXcmmXcVmXc3lXcEVXcmVXcVVXc3XXcE3Xcm3XcV3Xc303cEM3coELXeTG3GnudHeGO9Od5c5257hz3XnufHeBu9Bd5C52l7hL3WXucneFu9Jd5a5217hr3XXueneDu9Hd5G52t7hb3W3udneHu9Pd5e5297h73X3ufveAe9A95B52j7hH3WPucfeEe9I95Z52z7hn3XPuefeCe9G95F52r7hX3WvudfeGe9O95d5277h33XvuffeB+9B95D52n7hP3Wfuc/eF+9J95b5237hv3Xfue/eD+9H95H52v7hf3W/ud/eH+9P95f52/7h/3X9unJfMS+6l8FJ6qbzUXhovrZfOS+9l8DJ6mbzMXhYvq5fNy+7l8OK9nF4uL7eXx8vr5fPyewW8gl4hr7BXxCvqFfOKeyW8kl4pr7RXxivrlfPKexW8il4lr7JXxavqVfOqezW8ml4tr7ZXx6vr1fPqew28hl4jr7HXxGvqNfOaey28ll4rr7XXxmvrtfPaex28jl4nr7PXxevqdfO6ez28nl4vr7fXx+vrJXiJXpLXz+vvDfAGev95g7zB3hBvqDfMG+6N8EZ6o7zR3hhvrDfOG+9N8CZ6k7zJ3hRvqod5uEd4pEd5tMd4rMd5vCd4oid5sqd4qqd5umd4pmd5tud4rud5vhd4oRd5wIMe8mLeNG+6N8Ob6c3yZntzvLnePG++t8Bb6C3yFntLvKXeMm+5t8Jb6a3yVntrvLXeOm+9t8Hb6G3yNntbvK3eNm+7t8Pb6e3ydnt7vL3ePm+/d8A76B3yDntHvKPeMe+4d8I76Z3yTntnvLPeOe+8d8G76F3yLntXvKveNe+6d8O76d3ybnt3vLvePe++98B76D3yHntPvKfeM++598J76b3yXntvvLfeO++998H76H3yPntfvK/eN++798P76f3yfnt/vL/ePy/OT+Yn91P4Kf1Ufmo/jZ/WT+en9zP4Gf1MfmY/i5/Vz+Zn93P48X5OP5ef28/j5/Xz+fn9An5Bv5Bf2C/iF/WL+cX9En5Jv5Rf2i/jl/XL+eX9Cn5Fv5Jf2a/iV/Wr+dX9Gn5Nv5Zf26/j1/Xr+fX9Bn5Dv5Hf2G/iN/Wb+c39Fn5Lv5Xf2m/jt/Xb+e39Dn5Hv5Pf2e/id/W7+d39Hn5Pv5ff2+/j9/UT/EQ/ye/n9/cH+AP9//xB/mB/iD/UH+YP90f4I/1R/mh/jD/WH+eP9yf4E/1J/mR/ij/Vx3zcJ3zSp3zaZ3zW53zeF3zRl3zZV3zV13zdN3zTt3zbd3zX93zfD/zQj3zgQx/5MX+aP92f4c/0Z/mz/Tn+XH+eP99f4C/0F/mL/SX+Un+Zv9xf4a/0V/mr/TX+Wn+dv97f4G/0N/mb/S3+Vn+bv93f4e/0d/m7/T3+Xn+fv98/4B/0D/mH/SP+Uf+Yf9w/4Z/0T/mn/TP+Wf+cf96/4F/0L/mX/Sv+Vf+af92/4d/0b/m3/Tv+Xf+ef99/4D/0H/mP/Sf+U/+Z/9x/4b/0X/mv/Tf+W/+d/97/4H/0P/mf/S/+V/+b/93/4f/0f/m//T/+X/+fHxckC5IHKYKUQaogdZAmSBukC9IHGYKMQaYgc5AlyBpkC7IHOYL4IGeQK8gd5AnyBvmC/EGBoGBQKCgcFAmKBsWC4kGJoGRQKi
gdlAnKBuWC8kGFoGJQKagcVAmqBtWC6kGNoGZQK6gd1AnqBvWC+kGDoGHQKGgcNAmaBs2C5kGLoGXQKmgdtAnaBu2C9kGHoGPQKegcdAm6Bt2C7kGPoGfQK+gd9An6BglBYpAU9Av6BwOCgcF/waBgcDAkGBoMC4YHI4KRwahgdDAmGBuMC8YHE4KJwaRgcjAlmBpgAR4QARlQAR0wARtwAR8IgRhIgRwogRpogR4YgRlYgR04gRt4gR8EQRhEAQhggIJYMC2YHswIZgazgtnBnGBuMC+YHywIFgaLgsXBkmBpsCxYHqwIVgargtXBmmBtsC5YH2wINgabgs3BlmBrsC3YHuwIdga7gt3BnmBvsC/YHxwIDgaHgsPBkeBocCw4HpwITgangtPBmeBscC44H1wILgaXgsvBleBqcC24HtwIbga3gtvBneBucC+4HzwIHgaPgsfBk+Bp8Cx4HrwIXgavgtfBm+Bt8C54H3wIPgafgs/Bl+Br8C34HvwIfga/gt/Bn+Bv8C+IC5OFycMUYcowVZg6TBOmDdOF6cMMYcYwU5g5zBJmDbOF2cMcYXyYM8wV5g7zhHnDfGH+sEBYMCwUFg6LhEXDYmHxsERYMiwVlg7LhGXDcmH5sEJYMawUVg6rhFXDamH1sEZYM6wV1g7rhHXDemH9sEHYMGwUNg6bhE3DZmHzsEXYMmwVtg7bhG3DdmH7sEPYMewUdg67hF3DbmH3sEfYM+wV9g77hH3DhDAxTAr7hf3DAeHA8L9wUDg4HBIODYeFw8MR4chwVDg6HBOODceF48MJ4cRwUjg5nBJODbEQD4mQDKmQDpmQDbmQD4VQDKVQDpVQDbVQD43QDK3QDp3QDb3QD4MwDKMQhDBEYSycFk4PZ4Qzw1nh7HBOODecF84PF4QLw0Xh4nBJuDRcFi4PV4Qrw1Xh6nBNuDZcF64PN4Qbw03h5nBLuDXcFm4Pd4Q7w13h7nBPuDfcF+4PD4QHw0Ph4fBIeDQ8Fh4PT4Qnw1Ph6fBMeDY8F54PL4QXw0vh5fBKeDW8Fl4Pb4Q3w1vh7fBOeDe8F94PH4QPw0fh4/BJ+DR8Fj4PX4Qvw1fh6/BN+DZ8F74PP4Qfw0/h5/BL+DX8Fn4Pf4Q/w1/h7/BP+Df8F8ZFyaLkUYooZZQqSh2lidJG6aL0UYYoY5QpyhxlibJG2aLsUY4oPsoZ5YpyR3mivFG+KH9UICoYFYoKR0WiolGxqHhUIioZlYpKR2WislG5qHxUIaoYVYoqR1WiqlG1qHpUI6oZ1YpqR3WiulG9qH7UIGoYNYoaR02iplGzqHnUImoZtYpaR22itlG7qH3UIeoYdYo6R12irlG3qHvUI+oZ9Yp6R32ivlFClBglRf2i/tGAaGD0XzQoGhwNiYZGw6Lh0YhoZDQqGh2NicZG46Lx0YRoYjQpmhxNiaZGWIRHRERGVERHTMRGXMRHQiRGUiRHSqRGWqRHRmRGVmRHTuRGXuRHQRRGUQQiGKEoFk2LpkczopnRrGh2NCeaG82L5kcLooXRomhxtCRaGi2LlkcropXRqmh1tCZaG62L1kcboo3RpmhztCXaGm2Ltkc7op3Rrmh3tCfaG+2L9kcHooPRoehwdCQ6Gh2LjkcnopPRqeh0dCY6G52LzkcXoovRpehydCW6Gl2Lrkc3opvRreh2dCe6G92L7kcPoofRo+hx9CR6Gj2LnkcvopfRq+h19CZ6G72L3kcfoo/Rp+hz9CX6Gn2Lvkc/op/Rr+h39Cf6G/2L4kAykBykAClBKpAapAFpQTqQHmQAGUEmkBlkAVlBNpAd5ADxICfIBXKDPCAvyAfygwKgICgECoMioCgoBoqDEqAkKAVKgzKgLCgHyoMKoCKoBCqDKqAqqAaqgxqgJqgFaoM6oC6oB+qDBqAhaAQagyagKWgGmoMWoCVoBVqDNqAtaAfagw6gI+gEOoMuoCvoBrqDHqAn6AV6gz6gL0gAiSAJ9AP9wQAwEPwHBo
HBYAgYCoaB4WAEGAlGgdFgDBgLxoHxYAKYCCaByWAKmAowgAMCkIACNGAACzjAAwGIQAIyUIAKNKADA5jAAjZwgAs84IMAhCACAECAQAxMA9PBDDATzAKzwRwwF8wD88ECsBAsAovBErAULAPLwQqwEqwCq8EasBasA+vBBrARbAKbwRawFWwD28EOsBPsArvBHrAX7AP7wQFwEBwCh8ERcBQcA8fBCXASnAKnwRlwFpwD58EFcBFcApfBFXAVXAPXwQ1wE9wCt8EdcBfcA/fBA/AQPAKPwRPwFDwDz8EL8BK8Aq/BG/AWvAPvwQfwEXwCn8EX8BV8A9/BD/AT/AK/wR/wF/wDcTAZTA5TwJQwFUwN08C0MB1MDzPAjDATzAyzwKwwG8wOc8B4mBPmgrlhHpgX5oP5YQFYEBaChWERWBQWg8VhCVgSloKlYRlYFpaD5WEFWBFWgpVhFVgVVoPVYQ1YE9aCtWEdWBfWg/VhA9gQNoKNYRPYFDaDzWEL2BK2gq1hG9gWtoPtYQfYEXaCnWEX2BV2g91hD9gT9oK9YR/YFybARJgE+8H+cAAcCP+Dg+BgOAQOhcPgcDgCjoSj4Gg4Bo6F4+B4OAFOhJPgZDgFToUYxCEBSUhBGjKQhRzkoQBFKEEZKlCFGtShAU1oQRs60IUe9GEAQxhBACFEMAanwelwBpwJZ8HZcA6cC+fB+XABXAgXwcVwCVwKl8HlcAVcCVfB1XANXAvXwfVwA9wIN8HNcAvcCrfB7XAH3Al3wd1wD9wL98H98AA8CA/Bw/AIPAqPwePwBDwJT8HT8Aw8C8/B8/ACvAgvwcvwCrwKr8Hr8Aa8CW/B2/AOvAvvwfvwAXwIH8HH8Al8Cp/B5/AFfAlfwdfwDXwL38H38AP8CD/Bz/AL/Aq/we/wB/wJf8Hf8A/8C//BOJQMJUcpUEqUCqVGaVBalA6lRxlQRpQJZUZZUFaUDWVHOVA8yolyodwoD8qL8qH8qAAqiAqhwqgIKoqKoeKoBCqJSqHSqAwqi8qh8qgCqogqocqoCqqKqqHqqAaqiWqh2qgOqovqofqoAWqIGqHGqAlqipqh5qgFaolaodaoDWqL2qH2qAPqiDqhzqgL6oq6oe6oB+qJeqHeqA/qixJQIkpC/VB/NAANRP+hQWgwGoKGomFoOBqBRqJRaDQag8aicWg8moAmokloMpqCpiIM4YhAJKIQjRjEIg7xSEAikpCMFKQiDenIQCaykI0c5CIP+ShAIYoQQBAhFEPT0HQ0A81Es9BsNAfNRfPQfLQALUSL0GK0BC1Fy9BytAKtRKvQarQGrUXr0Hq0AW1Em9BmtAVtRdvQdrQD7US70G60B+1F+9B+dAAdRIfQYXQEHUXH0HF0Ap1Ep9BpdAadRefQeXQBXUSX0GV0BV1F19B1dAPdRLfQbXQH3UX30H30AD1Ej9Bj9AQ9Rc/Qc/QCvUSv0Gv0Br1F79B79AF9RJ/QZ/QFfUXf0Hf0A/1Ev9Bv9Af9Rf9QXCxZLHksRSxlLFUsdSxNLG0sXSx9LEMsYyxTLHMsSyxrLFsseyxHLD6WM5YrljuWJ5Y3li+WP1YgVjBWKFY4ViRWNFYsVjxWIlYyVipWOlYmVjZWLlY+ViFWMVYpVjlWJVY1Vi1WPVYjVjNWK1Y7VidWN1YvVj/WINYw1ijWONYk1jTWLNY81iLWMtYq1jrWJtY21i7WPtYh1jHWKdY51iXWNdYt1j3WI9Yz1ivWO9Yn1jeWEEuMJf2PRXtczKNZAABc27ZtW19tu2lqp7ad2u16d9b2zry1bdu2zZw/5z4ebCAWjw3CBmNDsKHYMGw4NgIbiY3CRmNjsLHYOCwBG49NwCZik7DJ2BRsKjYNm47NwGZis7DZ2BxsLjYPm48twBZii7DF2BJsKbYMW44lYiuwldgqbDW2BluLrcPWYxuwjdgmbDO2BduKbcMwDMcIjMQojMYYjMU4DGA8JmAiJmEypmAqpmE6ZmAmZmE25m
Au5mE+FmAhFmEQQ1gM247twHZiu7Dd2B5sL7YP248dwA5ih7DD2BHsKHYMO46dwE5ip7DT2BnsLHYOO49dwC5il7DL2BXsKnYNu47dwG5it7Db2B3sLnYPu489wB5ij7DH2BPsKfYMe469wF5ir7DX2BvsLfYOe499wD5in7DP2BfsK/YN+479wH5iv7Df2B/sL/YPS8KS4cnxFHhKPBWeGk+Dp8XT4enxDHhGPBOeGc+CZ8Wz4dnxHHhOPBeeG8+D58Xz4fnxAnhBvBBeGC+CF8WL4cXxEnhJvBReGi+Dl8XL4eXxCnhFvBJeGa+CV8Wr4dXxGnhNvBZeG6+D18Xr4fXxBnhDvBHeGG+CN8Wb4f/hzfEWeEu8Fd4ab4O3xdvh7fEOeEe8E94Z74J3xbvh3fEeeE+8F94b74P3xfvh/fEBeBw+EI/HB+GD8SH4UHwYPhwfgY/ER+Gj8TH4WHwcnoCPxyfgE/FJ+GR8Cj4Vn4ZPx2fgM/FZ+Gx8Dj4Xn4fPxxfgC/FF+GJ8Cb4UX4YvxxPxFfhKfBW+Gl+Dr8XX4evxDfhGfBO+Gd+Cb8W34RiO4wRO4hRO4wzO4hwOcB4XcBGXcBlXcBXXcB03cBO3cBt3cBf3cB8P8BCPcIgjPIZvx3fgO/Fd+G58D74X34fvxw/gB/FD+GH8CH4UP4Yfx0/gJ/FT+Gn8DH4WP4efxy/gF/FL+GX8Cn4Vv4Zfx2/gN/Fb+G38Dn4Xv4ffxx/gD/FH+GP8Cf4Uf4Y/x1/gL/FX+Gv8Df4Wf4e/xz/gH/FP+Gf8C/4V/4Z/x3/gP/Ff+G/8D/4X/4cn4cmI5EQKIiWRikhNpCHSEumI9EQGIiORichMZCGyEtmI7EQOIieRi8hN5CHyEvmI/EQBoiBRiChMFCGKEsWI4kQJoiRRiihNlCHKEuWI8kQFoiJRiahMVCGqEtWI6kQNoiZRi6hN1CHqEvWI+kQDoiHRiGhMNCGaEs2I/4jmRAuiJdGKaE20IdoS7Yj2RAeiI9GJ6Ex0IboS3YjuRA+iJ9GL6E30IfoS/Yj+xAAijhhIxBODiMHEEGIoMYwYTowgRhKjiNHEGGIsMY5IIMYTE4iJxCRiMjGFmEpMI6YTM4iZxCxiNjGHmEvMI+YTC4iFxCJiMbGEWEosI5YTicQKYiWxilhNrCHWEuuI9cQGYiOxidhMbCG2EtsIjMAJgiAJiqAJhmAJjgAETwiESEiETCiESmiEThiESViETTiES3iETwRESEQEJBARI7YTO4idxC5iN7GH2EvsI/YTB4iDxCHiMHGEOEocI44TJ4iTxCniNHGGOEucI84TF4iLxCXiMnGFuEpcI64TN4ibxC3iNnGHuEvcI+4TD4iHxCPiMfGEeEo8I54TL4iXxCviNfGGeEu8I94TH4iPxCfiM/GF+Ep8I74TP4ifxC/iN/GH+Ev8I5KIZGRyMgWZkkxFpibTkGnJdGR6MgOZkcxEZiazkFnJbGR2MgeZk8xF5ibzkHnJfGR+sgBZkCxEFiaLkEXJYmRxsgRZkixFlibLkGXJcmR5sgJZkaxEViarkFXJamR1sgZZk6xF1ibrkHXJemR9sgHZkGxENiabkE3JZuR/ZHOyBdmSbEW2JtuQbcl2ZHuyA9mR7ER2JruQXcluZHeyB9mT7EX2JvuQfcl+ZH9yABlHDiTjyUHkYHIIOZQcRg4nR5AjyVHkaHIMOZYcRyaQ48kJ5ERyEjmZnEJOJaeR08kZ5ExyFjmbnEPOJeeR88kF5EJyEbmYXEIuJZeRy8lEcgW5klxFribXkGvJdeR6cgO5kdxEbia3kFvJbSRG4iRBkiRF0iRDsiRHApInBVIkJVImFVIlNVInDdIkLdImHdIlPdInAzIkIxKSiIyR28kd5E5yF7mb3EPuJfeR+8kD5EHyEHmYPEIeJY+Rx8kT5EnyFHmaPEOeJc+R58kL5EXyEnmZvEJeJa+R18kb5E3yFnmbvEPeJe+R98kH5EPyEfmYfE
I+JZ+Rz8kX5EvyFfmafEO+Jd+R78kP5EfyE/mZ/EJ+Jb+R38kf5E/yF/mb/EP+Jf+RSWQyKjmVgkpJpaJSU2motFQ6Kj2VgcpIZaIyU1morFQ2KjuVg8pJ5aJyU3movFQ+Kj9VgCpIFaIKU0WoolQxqjhVgipJlaJKU2WoslQ5qjxVgapIVaIqU1WoqlQ1qjpVg6pJ1aJqU3WoulQ9qj7VgGpINaIaU02oplQz6j+qOdWCakm1olpTbai2VDuqPdWB6kh1ojpTXaiuVDeqO9WD6kn1onpTfai+VD+qPzWAiqMGUvHUIGowNYQaSg2jhlMjqJHUKGo0NYYaS42jEqjx1ARqIjWJmkxNoaZS06jp1AxqJjWLmk3NoeZS86j51AJqIbWIWkwtoZZSy6jlVCK1glpJraJWU2uotdQ6aj21gdpIbaI2U1uordQ2CqNwiqBIiqJoiqFYiqMAxVMCJVISJVMKpVIapVMGZVIWZVMO5VIe5VMBFVIRBSlExajt1A5qJ7WL2k3tof4lJSXtpw5QB6lD1GHqCHWUOkYdp05QJ6lT1GnqDHWWOkedpy5QF6lL1GXqCnWVukZdp25QN6lb1G3qDnWXukfdpx5QD6lH1GPqCfWUekY9p15QL6lX1GvqDfWWeke9pz5QH6lP1GfqC/WV+kZ9p35QP6lf1G/qD/WX+kclUcno5HQKOiWdik5Np6HT0uno9HQGOiOdic5MZ6Gz0tno7HQOOiedi85N56Hz0vno/HQBuiBdiC5MF6GL0sXo4nQJuiRdii5Nl6HL0uXo8nQFuiJdia5MV6Gr0tXo6nQNuiZdi65N16Hr0vXo+nQDuiHdiG5MN6Gb0s3o/+jmdAu6Jd2Kbk23odvS7ej2dAe6I92J7kx3obvS3ejudA+6J92L7k33ofvS/ej+9AA6jh5Ix9OD6MH0EHooPYweTo+gR9Kj6NH0GHosPY5OoMfTE+iJ9CR6Mj2FnkpPo6fTM+iZ9Cx6Nj2HnkvPo+fTC+iF9CJ6Mb2EXkovo5fTifQKeiW9il5Nr6HX0uvo9fQGeiO9id5Mb6G30ttojMZpgiZpiqZphmZpjgY0Twu0SEu0TCu0Smu0Thu0SVu0TTu0S3u0Twd0SEc0pBEdo7fTO+id9C56N72H3kvvo/fTB+iD9CH6MH2EPkofo4/TJ+iT9Cn6NH2GPkufo8/TF+iL9CX6Mn2Fvkpfo6/TN+ib9C36Nn2Hvkvfo+/TD+iH9CP6Mf2Efko/o5/TL+iX9Cv6Nf2Gfku/o9/TH+iP9Cf6M/2F/kp/o7/TP+if9C/6N/2H/kv/o5PoZExyJgWTkknFpGbSMGmZdEx6JgOTkcnEZGayMFmZbEx2JgeTk8nF5GbyMHmZfEx+pgBTsGkhpjBThCnKFGOKMyWYkkwppjRThinLlGPKMxWYikwlpjJThanKVGOqMzWYmkwtpjZTh6nL1GPqMw2YhkwjpjHThGnKNGP+Y5ozLZiWTCumNdOGacu0Y9ozHZiOTCemM9OF6cp0Y7ozPZieTC+mN9OH6cv0Y/ozA5g4ZiATzwxiBjNDmKHMMGY4M4IZyYxiRjNjmLHMOCaBGc9MYCYyk5jJzBRmKjONmc7MYGYys5jZzBxmLjOPmc8sYBYyi5jFzBJmKbOMWc4kMiuYlcwqZjWzhlnLrGPWMxuYjcwmZjOzhdnKbGMwBmcIhmQohmYYhmU4BjA8IzAiIzEyozAqozE6YzAmYzE24zAu4zE+EzAhEzGQQUyM2c7sYHYyu5jdzB5mL7OP2c8cYA4yh5jDzBHmKHOMOc6cYE4yp5jTzBnmLHOOOc9cYC4yl5jLzBXmKnONuc7cYG4yt5jbzB3mLnOPuc88YB4yj5jHzBPmKfOMec68YF4yr5jXzBvmLfOOec98YD4yn5jPzBfmK/ON+c78YH4yv5jfzB/mL/OPSWKSscnZFGxKNhWbmk3DpmXTsenZDGxGNhObmc3CZmWzsdnZHGxONhebm83D5mXzsf
nZAmxBthBbmC3CFmWLscXZEmxJthRbmi3DlmXLseXZCmxFthJbma3CVmWrsdXZGmxNthZbm63D1mXrsfXZBmxDthHbmG3CNmWbsf+xzdkWbEu2FduabcO2Zdux7dkObEe2E9uZ7cJ2Zbux3dkebE+2F9ub7cP2Zfux/dkBbBw7kI1nB7GD2SHsUHYYO5wdwY5kR7Gj2THsWHYcm8COZyewE9lJ7GR2CjuVncZOZ2ewM9lZ7Gx2DjuXncfOZxewC9lF7GJ2CbuUXcYuZxPZFexKdhW7ml3DrmXXsevZDexGdhO7md3CbmW3sRiLswRLshRLswzLshwLWJ4VWJGVWJlVWJXVWJ01WJO1WJt1WJf1WJ8N2JCNWMgiNsZuZ3ewO9ld7G52D7uX3cfuZw+wB9lD7GH2CHuUPcYeZ0+wJ9lT7Gn2DHuWPceeZy+wF9lL7GX2CnuVvcZeZ2+wN9lb7G32DnuXvcfeZx+wD9lH7GP2CfuUfcY+Z1+wL9lX7Gv2DfuWfce+Zz+wH9lP7Gf2C/uV/cZ+Z3+wP9lf7G/2D/uX/ccmscm45FwKLiWXikvNpeHScum49FwGLiOXicvMZeGyctm47FwOLieXi8vN5eHycvm4/FwBriBXiCvMFeGKcsW44lwJriRXiivNleHKcuW48lwFriJXiavMVeGqctW46lwNriZXi6vN1eHqcvW4+lwDriHXiGvMNeGacs24/7jmXAuuJdeKa8214dpy7bj2XAeuI9eJ68x14bpy3bjuXA+uJ9eL68314fpy/bj+3AAujhvIxXODuMHcEG4oN4wbzo3gRnKjuNHcGG4sN45L4MZzE7iJ3CRuMjeFm8pN46ZzM7iZ3CxuNjeHm8vN4+ZzC7iF3CJuMbeEW8ot45ZzidwKbiW3ilvNreHWcuu49dwGbiO3idvMbeG2cts4jMM5giM5iqM5hmM5jgMczwmcyEmczCmcymmczhmcyVmczTmcy3mczwVcyEUc5BAX47ZzO7id3C5uN7eH28vt4/ZzB7iD3CHuMHeEO8od445zJ7iT3CnuNHeGO8ud485zF7iL3CXuMneFu8pd465zN7ib3C3uNneHu8vd4+5zD7iH3CPuMfeEe8o9455zL7iX3CvuNfeGe8u9495zH7iP3CfuM/eF+8p9475zP7if3C/uN/eH+8v945K4ZCA5SAFSglQgNUgD0oJ0ID3IADKCTCAzyAKygmwgO8gBcoJcIDfIA/KCfCA/KAAKgkKgMCgCioJioDgoAUqCUqA0KAPKgnKgPKgAKoJKoDKoAqqCaqA6qAFqglqgNqgD6oJ6oD5oABqCRqAxaAKagmbgP9ActAAtQSvQGrQBbUE70B50AB1BJ9AZdAFdQTfQHfQAPUEv0Bv0AX1BP9AfDABxYCCIB4PAYDAEDAXDwHAwAowEo8BoMAaMBeNAAhgPJoCJYBKYDKaAqWAamA5mgJlgFpgN5oC5YB6YDxaAhWARWAyWgKVgGVgOEsEKsBKsAqvBGrAWrAPrwQawEWwCm8EWsBVsAxjAAQFIQAEaMIAFHACABwIQgQRkoAAVaEAHBjCBBWzgABd4wAcBCEEEIEAgBraDHWAn2AV2gz1gL9gH9oMD4CA4BA6DI+AoOAaOgxPgJDgFToMz4Cw4B86DC+AiuAQugyvgKrgGroMb4Ca4BW6DO+AuuAfugwfgIXgEHoMn4Cl4Bp6DF+AleAVegzfgLXgH3oMP4CP4BD6DL+Ar+Aa+gx/gJ/gFfoM/4C/4B5JAMj45n4JPyafiU/Np+LR8Oj49n4HPyGfiM/NZ+Kx8Nj47n4PPyefic/N5+Lx8Pj4/X4AvyBfiC/NF+KJ8Mb44X4IvyZfiS/Nl+LJ8Ob48X4GvyFfiK/NV+Kp8Nb46X4Ovydfia/N1+Lp8Pb4+34BvyDfiG/NN+KZ8M/4/vjnfgm/Jt+Jb8234tnw7vj3fge/Id+I78134rnw3vjvfg+/J9+J78334vnw/vj8/gI/jB/Lx/C
B+MD+EH8oP44fzI/iR/Ch+ND+GH8uP4xP48fwEfiI/iZ/MT+Gn8tP46fwMfiY/i5/Nz+Hn8vP4+fwCfiG/iF/ML+GX8sv45Xwiv4Jfya/iV/Nr+LX8On49v4HfyG/iN/Nb+K38Nh7jcZ7gSZ7iaZ7hWZ7jAc/zAi/yEi/zCq/yGq/zBm/yFm/zDu/yHu/zAR/yEQ95xMf47fwOfie/i9/N7+H38vv4/fwB/iB/iD/MH+GP8sf44/wJ/iR/ij/Nn+HP8uf48/wF/iJ/ib/MX+Gv8tf46/wN/iZ/i7/N3+Hv8vf4+/wD/iH/iH/MP+Gf8s/45/wL/iX/in/Nv+Hf8u/49/wH/iP/if/Mf+G/8t/47/wP/if/i//N/+H/8v/4JD6ZkFxIIaQUUgmphTRCWiGdkF7IIGQUMgmZhSxCViGbkF3IIeQUcgm5hTxCXiGfkF8oIBQUCgmFhSJCUaGYUFwoIZQUSgmlhTJCWaGcUF6oIFQUKgmVhSpCVaGaUF2oIdQUagm1hTpCXaGeUF9oIDQUGgmNhSZCU6GZ8J/QXGghtBRaCa2FNkJboZ3QXuggdBQ6CZ2FLkJXoZvQXegh9BR6Cb2FPkJfoZ/QXxggxAkDhXhhkDBYGCIMFYYJw4URwkhhlDBaGCOMFcYJCcJ4YYIwUZgkTBamCFOFacJ0YYYwU5glzBbmCHOFecJ8YYGwUFgkLBaWCEuFZcJyIVFYIawUVgmrhTXCWmGdsF7YIGwUNgmbhS3CVmGbgAm4QAikQAm0wAiswAlA4AVBEAVJkAVFUAVN0AVDMAVLsAVHcAVP8IVACIVIgAISYsJ2YYewU9gl7Bb2CHuFfcJ+4YBwUDgkHBaOCEeFY8Jx4YRwUjglnBbOCGeFc8J54YJwUbgkXBauCFeFa8J14YZwU7gl3BbuCHeFe8J94YHwUHgkPBaeCE+FZ8Jz4YXwUnglvBbeCG+Fd8J74YPwUfgkfBa+CF+Fb8J34YfwU/gl/Bb+CH+Ff0KSkExMLqYQU4qpxNRiGjGtmE5ML2YQM4qZxMxiFjGrmE3MLuYQc4q5xNxiHjGvmE/MLxYQC4qFxMJiEbGoWEwsLpYQS4qlxNJiGbGsWE4sL1YQK4qVxMpiFbGqWE2sLtYQa4q1xNpiHbGuWE+sLzYQG4qNxMZiE7Gp2Ez8T2wuthBbiq3E1mIbsa3YTmwvdhA7ip3EzmIXsavYTewu9hB7ir3E3mIfsa/YT+wvDhDjxIFivDhIHCwOEYeKw8Th4ghxpDhKHC2OEceK48QEcbw4QZwoThIni1PEqeI0cbo4Q5wpzhJni3PEueI8cb64QFwoLhIXi0vEpeIycbmYKK4QV4qrxNXiGnGtuE5cL24QN4qbxM3iFnGruE3ERFwkRFKkRFpkRFbkRCDyoiCKoiTKoiKqoibqoiGaoiXaoiO6oif6YiCGYiRCEYkxcbu4Q9wp7hJ3i3vEveI+cb94QDwoHhIPi0fEo+Ix8bh4QjwpnhJPi2fEs+I58bx4QbwoXhIvi1fEq+I18bp4Q7wp3hJvi3fEu+I98b74QHwoPhIfi0/Ep//3zMnEV+Jr8Y34Vnwnvhc/iB/FT+Jn8Yv4Vfwmfhd/iD/FX+Jv8Y/4V/wnJonJpORSCimllEpKLaWR0krppPRSBimjlEnKLGWRskrZpOxSDimnlEvKLeWR8kr5pPxSAamgVEgqLBWRikrFpOJSCamkVEoqLZWRykrlpPJSBamiVEmqLFWRqkrVpOpSDammVEuqLdWR6kr1pPpSA6mh1EhqLDWRmkrNpP+k5lILqaXUSmottZHaSu2k9lIHqaPUSeosdZG6St2k7lIPqafUS+ot9ZH6Sv2k/tIAKU4aKMVLg6TB0hBpqDRMGi6NkEZKo6TR0hhprDROSpDGSxOkidIkabI0RZoqTZOmSzOkmdIsabY0R5orzZPmSwukhdIiabG0RFoqLZOWS4nSCmmltEpaLa2R1krrpPXSBmmjtEnaLG2RtkrbJE
zCJUIiJUqiJUZiJU4CEi8JkihJkiwpkippki4ZkilZki05kit5ki8FUihFEpSQFJO2SzukndIuabe0R9or7ZP2Swekg9Ih6bB0RDoqHZOOSyekk9Ip6bR0RjornZPOSxeki9Il6bJ0RboqXZOuSzekm9It6bZ0R7or3ZPuSw+kh9Ij6bH0RHoqPZOeSy+kl9Ir6bX0RnorvZPeSx+kj9In6bP0RfoqfZO+Sz+kn9Iv6bf0R/or/ZOSpGRycjmFnFJOJaeW08hp5XRyejmDnFHOJGeWs8hZ5WxydjmHnFPOJeeW88h55XxyfrmAXFAuJBeWi8hF5WJycbmEXFIuJZeWy8hl5XJyebmCXFGuJFeWq8hV5WpydbmGXFOuJdeW68h15XpyfbmB3FBuJDeWm8hN5Wbyf3JzuYXcUm4lt5bbyG3ldnJ7uYPcUe4kd5a7yF3lbnJ3uYfcU+4l95b7yH3lfnJ/eYAcJw+U4+VB8mB5iDxUHiYPl0fII+VR8mh5jDxWHicnyOPlCfJEeZI8WZ4iT5WnydPlGfJMeZY8W54jz5XnyfPlBfJCeZG8WF4iL5WXycvlRHmFvFJeJa+W18hr5XXyenmDvFHeJG+Wt8hb5W0yJuMyIZMyJdMyI7MyJwOZlwVZlCVZlhVZlTVZlw3ZlC3Zlh3ZlT3ZlwM5lCMZykiOydvlHfJOeZe8W94j75X3yfvlA/JB+ZB8WD4iH5WPycflE/JJ+ZR8Wj4jn5XPyeflC/JF+ZJ8Wb4iX5WvydflG/JN+ZZ8W74j35XvyfflB/JD+ZH8WH4iP5Wfyc/lF/JL+ZX8Wn4jv5Xfye/lD/JH+ZP8Wf4if5W/yd/lH/JP+Zf8W/4j/5X/yUlyMiW5kkJJqaRSUitplLRKOiW9kkHJqGRSMitZlKxKNiW7kkPJqeRScit5lLxKPiW/UkApqBRSCitFlKJKMaW4UkIpqZRSSitllLJKOaW8UkGpqFRSKitVlKpKNaW6UkOpqdRSait1lLpKPaW+0kBpqDRSGitNlKZKM+U/pbnSQmmptFJaK22Utko7pb3SQemodFI6K12Urko3pbvSQ+mp9FJ6K32Uvko/pb8yQIlTBirxyiBlsDJEGaoMU4YrI5SRyihltDJGGauMUxKU8coEZaIySZmsTFGmKtOU6coMZaYyS5mtzFHmKvOU+coCZaGySFmsLFGWKsuU5UqiskJZqaxSVitrlLXKOmW9skHZqGxSNitblK3KNgVTcIVQSIVSaIVRWIVTgMIrgiIqkiIriqIqmqIrhmIqlmIrjuIqnuIrgRIqkQIVpMSU7coOZaeyS9mt7FH2KvuU/coB5aBySDmsHFGOKseU48oJ5aRySjmtnFHOKueU88oF5aJySbmsXFGuKteU68oN5aZyS7mt3FHuKveU+8oD5aHySHmsPFGeKs+U58oL5aXySnmtvFHeKu+U98oH5aPySfmsfFG+Kt+U78oP5afyS/mt/FH+Kv+UJCWZmlxNoaZUU6mp1TRqWjWdml7NoGZUM6mZ1SxqVjWbml3NoeZUc6m51TxqXjWfml8toBZUC6mF1SJqUbWYWlwtoZZUS6ml1TJqWbWcWl6toFZUK6mV1SpqVbWaWl2todZUa6m11TpqXbWeWl9toDZUG6mN1SZqU7WZ2kxtrjZXW6ot1dZqa7Wt2lZtr7ZXO6od1c5qZ7Wr2lXtrnZXe6o91d5qb7Wv2lftr/ZX49Q4NV6NVwerQ9Sh6lB1uDpcHamOVEero9Wx6lg1QU1QJ6gT1UnqZHWKOlWdpk5XZ6gz1VnqbHWOOledp85XF6gL1UXqInWJukRdpi5TE9VEdaW6Ul2trlbXqmvV9ep6daO6Ud2sbla3qltVTMVUQiVVSqVVRmVVTgUqrwqqqEqqrCqqqmqqrhqqqVqqrTqqq3qqrwZqqEYqVJEaU7erO9Sd6i51t7pH3avuU/erB9SD6iH1sHpEPaoeU4+rJ9ST6in1tHpGPaueU8+rF9SL6i
X1snpFvapeU6+rN9Sb6i31tnpHvaveU++rD9SH6iP1sfpEfao+U5+rL9SX6iv1tfpGfau+U9+rH9SP6if1s/pF/ap+U7+rP9Sf6i/1t/pH/av+U5PUZFpyLYWWUkulpdbSaGm1dFp6LYOWUcukZdayaFm1bFp2LYeWU8ul5dbyaHm1fFp+rYBWUCukFdaKaEW1YlpxrYRWQiulldLKaGW0clo5rYJWQaukVdKqaFW0alp1rYZWQ6ul1dLqaHW0elo9rYHWQGukNdKaaE20ZlozrbnWXGuptdRaa621tlpbrb3WXuuoddQ6a521rlpXrbvWXeup9dR6a721vlpfrb/WX4vT4rR4LV4brA3WhmpDteHacG2kNlIbrY3WxmpjtQQtQZugTdAmaZO1KdpUbZo2XZuhzdRmabO12dpcba42X1ugLdAWaYu0JdoSbZm2TEvUErWV2ipttbZGW6Ot09ZrG7SN2iZts7ZF26pt0zAN1wiN1CiN1hiN1TgNaLwmaKImabKmaKqmabpmaKZmabbmaK7mab4WaKEWaVBDWkzbru3Qdmq7tN3aHm2vtk/brx3QDmqHtMPaEe2odkw7rp3QTmqntNPaGe2sdk47r13QLmqXtMvaFe2qdk27rt3Qbmq3tNvaHe2udk+7rz3QHmqPtMfaE+2p9kx7rr3QXmqvtNfaG+2t9k57r33QPmqftM/aF+2r9k37rv3Qfmq/tN/aH+2v9k9L0pLpyfUUeko9lZ5aT6On1dPp6fUMekY9k55Zz6Jn1bPp2fUcek49l55bz6Pn1fPp+fUCekG9kF5YL6IX1YvpxfUSekm9lF5aL6OX1cvp5fUKekW9kl5Zr6JX1avp1fUaek29ll5br6PX1evp9fUGekO9kd5Yb6I31Zvp/+nN9RZ6S72V3lpvo7fV2+nt9Q56R72T3lnvonfVu+nd9R56T72X3lvvo/fV++n99QF6nD5Qj9cH6YP1IfpQfZg+XB+hj9RH6aP1MfpYfZyeoI/XJ+gT9Un6ZH2KPlWfpk/XZ+gz9Vn6bH2OPlefp8/XF+gL9UX6Yn2JvlRfpi/XE/UV+kp9lb5aX6Ov1dfp6/UN+kZ9k75Z36Jv1bfpmI7rhE7qlE7rjM7qnA50Xhd0UZd0WVd0Vdd0XTd0U7d0W3d0V/d0Xw/0UI90qCM9pm/Xd+g79V36bn2Pvlffp+/XD+gH9UP6Yf2IflQ/ph/XT+gn9VP6af2MflY/p5/XL+gX9Uv6Zf2KflW/pl/Xb+g39Vv6bf2Ofle/p9/XH+gP9Uf6Y/2J/lR/pj/XX+gv9Vf6a/2N/lZ/p7/XP+gf9U/6Z/2L/lX/pn/Xf+g/9V/6b/2P/lf/pyfpyYzkRgojpZHKSG2kMdIa6Yz0RgYjo5HJyGxkMbIa2YzsRg4jp5HLyG3kMfIa+YwEo4BR0ChkFDaKGEWNYkZxo4RR0ihllDbKGGWNckZ5o4JR0ahkVDaqGFWNakZ1o4ZR06hl1DbqGHWNekZ9o4HR0GhkNDaaGE2NZsZ/RnOjhdHSaGW0NtoYbY12Rnujg9HR6GR0NroYXY1uRnejh9HT6GX0NvoYfY1+Rn9jgBFnDDTijUHGYGOIMdQYZgw3RhgjjVHGaGOMMdYYZyQY440JxkRjkjHZmGJMNaYZ040ZxkxjljHbmGPMNeYZ840FxkJjkbHYWGIsNZYZy41EY4Wx0lhlrDbWGGuNdcZ6Y4Ox0dhkbDa2GFuNbQZm4AZhkAZl0AZjsAZnAIM3BEM0JEM2FEM1NEM3DMM0LMM2HMM1PMM3AiM0IgMayIgZ240dxk5jl7Hb2GPsNfYZ+40DxkHjkHHYOGIcNY4Zx40TxknjlHHaOGOcNc4Z540LxkXjknHZuGJcNa4Z140bxk3jlnHbuGPcNe4Z940HxkPjkfHYeGI8NZ4Zz40XxkvjlfHaeGO8Nd4Z740Pxkfjk/HZ+GJ8Nb4Z340fxk/jl/Hb+GP8Nf4ZSUYyM7mZwkxppjJTm2nMtG
Y6M72ZwcxoZjIzm1nMrGY2M7uZw8xp5jJzm3nMvGY+M79ZwCxoFjILm0XMomYxs7hZwixpljJLm2XMsmY5s7xZwaxoVjIrm1XMqmY1s7pZw6xp1jJrm3XMumY9s77ZwGxoNjIbm03MpmYz8z+zudnCbGm2Mlubbcy2ZjuzvdnB7Gh2MjubXcyuZjezu9nD7Gn2Mnubfcy+Zj+zvznAjDMHmvHmIHOwOcQcag4zh5sjzJHmKHO0OcYca44zE8zx5gRzojnJnGxOMaea08zp5gxzpjnLnG3OMeea88z55gJzobnIXGwuMZeay8zlZqK5wlxprjJXm2vMteY6c725wdxobjI3m1vMreY2EzNxkzBJkzJpkzFZkzOByZuCKZqSKZuKqZqaqZuGaZqWaZuO6Zqe6ZuBGZqRCU1kxszt5g5zp7nL3G3uMfea+8z95gHzoHnIPGweMY+ax8zj5gnzpHnKPG2eMc+a58zz5gXzonnJvGxeMa+a18zr5g3zpnnLvG3eMe+a98z75gPzofnIfGw+MZ+az8zn5gvzpfnKfG2+Md+a78z35gfzo/nJ/Gx+Mb+a38zv5g/zp/nL/G3+Mf+a/8wkM5mV3EphpbRSWamtNFZaK52V3spgZbQyWZmtLFZWK5uV3cph5bRyWbmtPFZeK5+V3ypgFbQKWYWtIlZRq5hV3CphlbRKWaWtMlZZq5xV3qpgVbQqWZWtKlZVq5pV3aph1bRqWbWtOlZdq55V32pgNbQaWY2tJlZTq5n1n9XcamG1tFpZra02VlurndXe6mB1tDpZna0uVlerm9Xd6mH1tHpZva0+Vl+rn9XfGmDFWQOteGuQNdgaYg21hlnDrRHWSGuUNdoaY421xlkJ1nhrgjXRmmRNtqZYU61p1nRrhjXTmmXNtuZYc6151nxrgbXQWmQttpZYS61l1nIr0VphrbRWWautNdbaOGCttzZYG61N1mZri7XV2mZhFm4RFmlRFm0xFmtxFrB4S7BES7JkS7FUS7N0y7BMy7Jsy7Fcy7N8K7BCK7KghayYtd3aYe20dlm7rT3WXmuftd86YB20DlmHrSPWUeuYddw6YZ20TlmnrTPWWeucdd66YF20LlmXrSvWVeuadd26Yd20blm3rTvWXeuedd96YD20HlmPrSfWU+uZ9dx6Yb20XlmvrTfWW+ud9d76YH20PlmfrS/WV+ub9d36Yf20flm/rT/WX+uflWQls5PbKeyUdio7tZ3GTmuns9PbGeyMdiY7s53Fzmpns7PbOeycdi47t53Hzmvns/PbBeyCdiG7sF3ELmoXs4vbJeySdim7tF3GLmuXs8vbFeyKdiW7sl3FrmpXs6vbNeyadi27tl3HrmvXs+vbDeyGdiO7sd3Ebmo3s/+zm9st7JZ2K7u13cZua7ez29sd7I52J7uz3cXuanezu9s97J52L7u33cfua/ez+9sD7Dh7oB1vD7IH20PsofYwe7g9wh5pj7JH22PssfY4O8Eeb0+wJ9qT7Mn2FHuqPc2ebs+wZ9qz7Nn2HHuuPc+eby+wF9qL7MX2EnupvcxebifaK+yV9ip7dapk9lp7nb3e3mBvtDfZm+0t9lZ7m43ZuE3YpE3ZtM3YrM3ZwOZtwRZtyZZtxVZtzdZtwzZty7Ztx3Ztz/btwA7tyIY2smP2dnuHvdPeZe+299h77X32fvuAfdA+ZB+2j9hH7WP2cfuEfdI+ZZ+2z9hn7XP2efuCfdG+ZF+2r9hX7Wv2dfuGfdO+Zd+279h37Xv2ffuB/dB+ZD+2n9hP7Wf2c/uF/dJ+Zb+239hv7Xf2e/uD/dH+ZH+2v9hf7W/2d/uH/dP+Zf+2/9h/7X92kp3MSe6kcFI6qZzUThonrZPOSe9kcDI6mZzMThYnq5PNye7kcHI6uZzcTh4nr5PPye8UcAo6hZzCThGnqFPMKe6UcEo6pZzSThmnrFPOKe9UcCo6lZzKThWnqlPNqe7UcGo6tZzaTh2nrlPPqe80cBo6jZ
zGThOnqdPM+c9p7rRwWjqtnNZOG6et085p73RwOjqdnM5OF6er083p7vRwejq9nN5OH6ev08/p7wxw4pyBTrwzyBnsDHGGOsOc4c4IZ6QzyhntjHHGOuOcBGe8M8GZ6ExyJjtTnKnONGe6M8OZ6cxyZjtznLnOPGe+s8BZ6CxyFjtLnKXOMme5k+iscFY6q5zVzhpnrbPOWe9scDY6m5zNzhZnq7PNwRzcIRzSoRzaYRzW4Rzg8I7giI7kyI7iqI7m6I7hmI7l2I7juI7n+E7ghE7kQAc5MWe7s8PZ6exydjt7nL3OPme/c8A56BxyDjtHnKPOMee4c8I56ZxyTjtnnLPOOee8c8G56FxyLjtXnKvONee6c8O56dxybjt3nLvOPee+88B56DxyHjtPnKfOM+e588J56bxyXjtvnLfOO+e988H56HxyPjtfnK/ON+e788P56fxyfjt/nL/OPyfJSeYmd1O4Kd1Ubmo3jZvWTeemdzO4Gd1MbmY3i5vVzeZmd3O4Od1cbm43j5vXzefmdwu4Bd1CbmG3iFvULeYWd0u4Jd1Sbmm3jFvWLeeWdyu4Fd1KbmW3ilvVreZWd2u4Nd1abm23jlvXrefWdxu4Dd1GbmO3idvUbeb+5zZ3W7gt3VZua7eN29Zt57Z3O7gd3U5uZ7eL29Xt5nZ3e7g93V5ub7eP29ft5/Z3B7hx7kA33h3kDnaHuEPdYe5wd4Q70h3ljnbHuGPdcW6CO96d4E50J7mT3SnuVHeaO92d4c50Z7mz3TnuXHeeO99d4C50F7mL3SXuUneZu9xNdFe4K91V7mp3jbvWXeeudze4G91N7mZ3i7vV3eZiLu4SLulSLu0yLutyLnB5V3BFV3JlV3FVV3N113BN13Jt13Fd13N9N3BDN3Khi9yYu93d4e50d7m73T3uXnefu9894B50D7mH3SPuUfeYe9w94Z50T7mn3TPuWfece9694F50L7mX3SvuVfeae9294d50b7m33TvuXfeee9994D50H7mP3SfuU/eZ+9x94b50X7mv3TfuW/ed+9794H50P7mf3S/uV/eb+9394f50f7m/3T/uX/efm+Qm85J7KbyUXiovtZfGS+ul89J7GbyMXiYvs5fFy+pl87J7ObycXi4vt5fHy+vl8/J7BbyCXiGvsFfEK+oV84p7JbySXimvtFfGK+uV88p7FbyKXiWvslfFq+pV86p7NbyaXi2vtlfHq+vV8+p7DbyGXiOvsdfEa+o18/7zmnstvJZeK6+118Zr67Xz2nsdvI5eJ6+z18Xr6nXzuns9vJ5eL6+318fr6/Xz+nsDvDhvoBfvDfIGe0O8od4wb7g3whvpjfJGe2O8sd44L8Eb703wJnqTvMneFG+qN82b7s3wZnqzvNneHG+uN8+b7y3wFnqLvMXeEm+pt8xb7iV6K7yV3ipvtbfGW+ut89Z7G7yN3iZvs7fF2+pt8zAP9wiP9CiP9hiP9TgPeLwneKInebKneKqnebpneKZnebbneK7neb4XeKEXedBDXszb7u3wdnq7vN3eHm+vt8/b7x3wDnqHvMPeEe+od8w77p3wTnqnvNPeGe+sd847713wLnqXvMveFe+qd8277t3wbnq3vNveHe+ud8+77z3wHnqPvMfeE++p98x77r3wXnqvvNfeG++t9857733wPnqfvM/eF++r98377v3wfnq/vN/eH++v989L8pL5yf0Ufko/lZ/aT+On9dP56f0MfkY/k5/Zz+Jn9bP52f0cfk4/l5/bz+Pn9fP5+f0CfkG/kF/YL+IX9Yv5xf0Sfkm/lF/aL+OX9cv55f0KfkW/kl/Zr+JX9av51f0afk2/ll/br+PX9ev59f0GfkO/kd/Yb+I39Zv5//nN/RZ+S7+V39pv47f12/nt/Q5+R7+T39nv4nf1u/nd/R5+T7+X39vv4/f1+/n9/QF+nD/Qj/cH+YP9If5Qf5g/3B/hj/RH+aP9Mf5Yf5yf4I/3J/gT/Un+ZH
+KP9Wf5k/3Z/gz/Vn+bH+OP9ef58/3F/gL/UX+Yn+Jv9Rf5i/3E/0V/kp/lb/aX+Ov9df56/0N/kZ/k7/Z3+Jv9bf5mI/7hE/6lE/7jM/6nA983hd80Zd82Vd81dd83Td807d823d81/d83w/80I986CM/5m/3d/g7/V3+bn+Pv9ff5+/3D/gH/UP+Yf+If9Q/5h/3T/gn/VP+af+Mf9Y/55/3L/gX/Uv+Zf+Kf9W/5l/3b/g3/Vv+bf+Of9e/59/3H/gP/Uf+Y/+J/9R/5j/3X/gv/Vf+a/+N/9Z/57/3P/gf/U/+Z/+L/9X/5n/3f/g//V/+b/+P/9f/5yf5yYLkQYogZZAqSB2kCdIG6YL0QYYgY5ApyBxkCbIG2YLsQY4gZ5AryB3kCfIG+YL8QYGgYFAoKBwUCYoGxYLiQYmgZFAqKB2UCcoG5YLyQYWgYlApqBxUCaoG1YLqQY2gZlArqB3UCeoG9YL6QYOgYdAoaBw0CZoGzYL/guZBi6Bl0CpoHbQJ2gbtgvZBh6Bj0CnoHHQJugbdgu5Bj6Bn0CvoHfQJ+gb9gv7BgCAuGBjEB4OCwcGQYGgwLBgejAhGBqOC0cGYYGwwLkgIxgcTgonBpGByMCWYGkwLpgczgpnBrGB2MCeYG8wL5gcLgoXBomBxsCRYGiwLlgeJwYpgZbAqWB2sCdYG64L1wYZgY7Ap2BxsCbYG2wIswAMiIAMqoAMmYAMuAAEfCIEYSIEcKIEaaIEeGIEZWIEdOIEbeIEfBEEYRAEMUBALtgc7gp3BrmB3sCfYG+wL9gcHgoPBoeBwcCQ4GhwLjgcngpPBqeB0cCY4G5wLzgcXgovBpeBycCW4GlwLrgc3gpvBreB2cCe4G9wL7gcPgofBo+Bx8CR4GjwLngcvgpfBq+B18CZ4G7wL3gcfgo/Bp+Bz8CX4GnwLvgc/gp/Br+B38Cf4G/wLkoJkYfIwRZgyTBWmDtOEacN0YfowQ5gxzBRmDrOEWcNsYfYwR5gzzBXmDvOEecN8Yf6wQFgwLBQWDouERcNiYfGwRFgyLBWWDsuEZcNyYfmwQlgxrBRWDquEVcNqYfWwRlgzrBXWDuuEdcN6Yf2wQdgwbBQ2DpuETcNm4X9h87BF2DJsFbYO24Rtw3Zh+7BD2DHsFHYOu4Rdw25h97BH2DPsFfYO+4R9w35h/3BAGBcODOPDQeHgcEg4NBwWDg9HhCPDUeHocEw4NhwXJoTjwwnhxHBSODmcEk4Np4XTwxnhzHBWODucE84N54XzwwXhwnBRuDhcEi4Nl4XLw8RwRbgyXBWuDteEa8N14fpwQ7gx3BRuDreEW8NtIRbiIRGSIRXSIROyIReCkA+FUAylUA6VUA21UA+N0Ayt0A6d0A290A+DMAyjEIYojIXbwx3hznBXuDvcE+4N94X7wwPhwfBQeDg8Eh4Nj4XHwxPhyfBUeDo8E54Nz4XnwwvhxfBSeDm8El4Nr4XXwxvhzfBWeDu8E94N74X3wwfhw/BR+Dh8Ej4Nn4XPwxfhy/BV+Dp8E74N34Xvww/hx/BT+Dn8En4Nv4Xfwx/hz/BX+Dv8E/4N/4VJYbIoeZQiShmlilJHaaK0UboofZQhyhhlijJHWaKsUbYoe5QjyhnlinJHeaK8Ub4of1QgKhgVigpHRaKiUbGoeFQiKhmVikpHZaKyUbmofFQhqhhViipHVaKqUbWoelQjqhnVimpHdaK6Ub2oftQgahg1ihpHTaKmUbPov6h51CJqGbWKWkdtorZRu6h91CHqGHWKOkddoq5Rt6h71CPqGfWKekd9or5Rv6h/NCCKiwZG8dGgaHA0JBoaDYuGRyOikdGoaHQ0JhobjYsSovHRhGhiNCmaHE2JpkbTounRjGhmNCuaHc2J5kbzovnRgmhhtChaHC2JlkbLouVRYrQiWhmtilZHa6K10bpofbQh2hhtijZHW6Kt0bYIi/CIiMiIiuiIidiIi0DER0IkRlIkR0qkRlqkR0ZkRl
ZkR07kRl7kR0EURlEEIxTFou3RjmhntCvaHe2J9kb7ov3RgehgdCg6HB2JjkbHouPRiehkdCo6HZ2JzkbnovPRhehidCm6HF2JrkbXouvRjehmdCu6Hd2J7kb3ovvRg+hh9Ch6HD2JnkbPoufRi+hl9Cp6Hb2J3kbvovfRh+hj9Cn6HH2Jvkbfou/Rj+hn9Cv6Hf2J/kb/oqQoGUwOU8CUMBVMDdPAtDAdTA8zwIwwE8wMs8CsMBvMDnPAnDAXzA3zwLwwH8wPC8CCsBAsDIvAorAYLA5LwJKwFCwNy8CysBwsDyvAirASrAyrwKqwGqwOa8CasBasDevAurAerA8bwIawEWwMm8CmsBn8DzaHLWBL2Aq2hm1gW9gOtocdYEfYCXaGXWBX2A12hz1gT9gL9oZ9YF/YD/aHA2AcHAjj4SA4GA6BQ+EwOByOgCPhKDgajoFj4TiYAMfDCXAinAQnwylwKpwGp8MZcCacBWfDOXAunAfnwwVwIVwEF8MlcClcBpfDRLgCroSr4Gq4Bq6F6+B6uAFuhJvgZrgFboXbIAZxSEASUpCGDGQhBwHkoQBFKEEZKlCFGtShAU1oQRs60IUe9GEAQxhBCBGMwe1wB9wJd8HdcA/cC/fB/fAAPAgPwcPwCDwKj8Hj8AQ8CU/B0/AMPAvPwfPwArwIL8HL8Aq8Cq/B6/AGvAlvwdvwDrwL78H78AF8CB/Bx/AJfAqfwefwBXwJX8HX8A18C9/B9/AD/Ag/wc/wC/wKv8Hv8Af8CX/B3/AP/Av/wSSYDCVHKVBKlAqlRmlQWpQOpUcZUEaUCWVGWVBWlA1lRzlQTpQL5UZ5UF6UD+VHBVBBVAgVRkVQUVQMFUclUElUCpVGZVBZVA6VRxVQRVQJVUZVUFVUDVVHNVBNVAvVRnVQXVQP1UcNUEPUCDVGTVBT1Az9h5qjFqglaoVaozaoLWqH2qMOqCPqhDqjLqgr6oa6ox6oJ+qFeqM+qC/qh/qjASgODUTxaBAajIagoWgYGo5GoJFoFBqNxqCxaBxKQOPRBDQRTUKT0RQ0FU1D09EMNBPNQrPRHDQXzUPz0QK0EC1Ci9EStBQtQ8tRIlqBVqJVaDVag9aidWg92oA2ok1oM9qCtqJtCEM4IhCJKEQjBrGIQwDxSEAikpCMFKQiDenIQCaykI0c5CIP+ShAIYoQRAjF0Ha0A+1Eu9ButAftRfvQfnQAHUSH0GF0BB1Fx9BxdAKdRKfQaXQGnUXn0Hl0AV1El9BldAVdRdfQdXQD3US30G10B91F99B99AA9RI/QY/QEPUXP0HP0Ar1Er9Br9Aa9Re/Qe/QBfUSf0Gf0BX1F39B39AP9RL/Qb/QH/UX/UBJKFkseSxFLGUsVSx1LE0sbSxdLH8sQyxjLFMscyxLLGssWyx7LEcsZyxXLHcsTyxvLF8sfKxArGCsUKxwrEisaKxYrHisRKxkrFSsdKxMrGysXKx+rEKsYqxSrHKsSqxqrFqseqxGrGasVqx2rE6sbqxerH2sQaxhrFGscaxJrGmsW+y/WPNYi1jLWKtY61ibWNtYu1j7WIdYx1inWOdYl1jXWLdY91iPWM9Yr1jvWJ9Y31i/WPzYgFhcbGIuPDfofyfbY4FWzAAB8qw2bjSfbtm3btZtt27Ztm8e2MTPnn23bui/uB/lhiVgSNggbjA3BhmLDsOHYCGwkNgobjY3BxmLjsPHYBGwiNgmbjE3BpmLTsOnYDGwmNgubjc3B5mLzsPnYAmwhtghbjC3BlmLLsOXYCmwltgpbja3B1mLrsPXYBmwjtgnbjG3BtmLbsO3YDmwntgvbje3B9mL7sP3YAewgdgg7jB3BjmLHsOPYCewkdgo7jZ3BzmLnsPPYBewidgm7jF3BrmIYhmMERmIURmMMxmIcxmMCJmISJmMKpmIapmMGZmIWZmMO5mIe5mMBFmIAgxjCIiyGXcOuYzewm9gt7DZ2B7uL3cPuYw+wh9gj7DH2BHuKPcOeYy
+wl9gr7DX2BnuLvcPeYx+wj9gn7DP2BfuKfcO+Yz+wn9gv7Df2B/uL/cPi8GR4cjwFHo+nxFPhqfE0eAKeFk+Hp8cz4BnxTHhmPAueFc+GZ8dz4DnxXHhuPA/+H54Xz4fnxwvgBfFCeGG8CF4UL4YXx0vgJfFSeGm8DF4WL4eXxyvgFfFKeGW8Cl4Vr4ZXx2vgNfFaeG28Dl4Xr4fXxxvgDfFGeGO8Cd4Ub4Y3x1vgLfFWeGu8Dd4Wb4e3xzvgHfFOeGe8C94V74Z3x3vgPfFeeG+8D94X74f3xwfgA/FEPAkfhA/Gh+BD8WH4cHwEPhIfhY/Gx+Bj8XH4eHwCPhGfhE/Gp+BT8Wn4dHwGPhOfhc/G5+Bz8Xn4fHwBvhBfhC/Gl+BL8WX4cnwFvhJfha/G1+Br8XX4enwDvhHfhG/Gt+Bb8W34dnwHvhPfhe/G9+B78X34fvwAfhA/hB/Gj+BH8WP4cfwEfhI/hZ/Gz+Bn8XP4efwCfhG/hF/Gr+BXcQzHcQIncQqncQZncQ7ncQEXcQmXcQVXcQ3XcQM3cQu3cQd3cQ/38QAPcYBDHOERHsOv4dfxG/hN/BZ+G7+D38Xv4ffxB/hD/BH+GH+CP8Wf4c/xF/hL/BX+Gn+Dv8Xf4e/xD/hH/BP+Gf+Cf8W/4d/xH/hP/Bf+G/+D/8X/4XFEMiI5kYKIJ1ISqYjURBoigUhLpCPSExmIjEQmIjORhchKZCOyEzmInEQuIjeRh/iPyEvkI/ITBYiCRCGiMFGEKEoUI4oTJYiSRCmiNFGGKEuUI8oTFYiKRCWiMlGFqEpUI6oTNYiaRC2iNlGHqEvUI+oTDYiGRCOiMdGEaEo0I5oTLYiWRCuiNdGGaEu0I9oTHYiORCeiM9GF6Ep0I7oTPYieRC+iN9GH6Ev0I/oTA4iBRCKRRAwiBhNDiKHEMGI4MYIYSYwiRhNjiLHEOGI8MYGYSEwiJhNTiKnENGI6MYOYScwiZhNziLnEPGI+sYBYSCwiFhNLiKXEMmI5sYJYSawiVhNriLXEOmI9sYHYSGwiNhNbiK3ENmI7sYPYSewidhN7iL3EPmI/cYA4SBwiDhNHiKPEMeI4cYI4SZwiThNniLPEOeI8cYG4SFwiLhNXiKsERuAEQZAERdAEQ7AER/CEQIiERMiEQqiERuiEQZiERdiEQ7iER/hEQIQEICCBiIiIEdeI68QN4iZxi7hN3CHuEveI+8QD4iHxiHhMPCGeEs+I58QL4iXxinhNvCHeEu+I98QH4iPxifhMfCG+Et+I78QP4ifxi/hN/CH+Ev+IODIZmZxMQcaTKclUZGoyDZlApiXTkenJDGRGMhOZmcxCZiWzkdnJHGROMheZm8xD/kfmJfOR+ckCZEGyEFmYLEIWJYuRxckSZEmyFFmaLEOWJcuR5ckKZEWyElmZrEJWJauR1ckaZE2yFlmbrEPWJeuR9ckGZEOyEdmYbEI2JZuRzckWZEuyFdmabEO2JduR7ckOZEeyE9mZ7EJ2JbuR3ckeZE+yF9mb7EP2JfuR/ckB5EAykUwiB5GDySHkUHIYOZwcQY4kR5GjyTHkWHIcOZ6cQE4kJ5GTySnkVHIaOZ2cQc4kZ5GzyTnkXHIeOZ9cQC4kF5GLySXkUnIZuZxcQa4kV5GryTXkWnIduZ7cQG4kN5GbyS3kVnIbuZ3cQe4kd5G7yT3kXnIfuZ88QB4kD5GHySPkUfIYeZw8QZ4kT5GnyTPkWfIceZ68QF4kL5GXySvkVRIjcZIgSZIiaZIhWZIjeVIgRVIiZVIhVVIjddIgTdIibdIhXdIjfTIgQxKQkERkRMbIa+R18gZ5k7xF3ibvkHfJe+R98gH5kHxEPiafkE/JZ+Rz8gX5knxFvibfkG/Jd+R78gP5kfxEfia/kF/Jb+R38gf5k/xF/ib/kH/Jf2QclYxKTqWg4qmUVCoqNZWGSqDSUumo9FQGKiOVicpMZaGyUtmo7FQOKieVi8pN5aH+o/JS+aj8VA
GqIFWIKkwVoYpSxajiVAmqJFWKKk2VocpS5ajyVAWqIlWJqkxVoapS1ajqVA2qJlWLqk3VoepS9aj6VAOqIdWIakw1oZpSzajmVAuqJdWKak21odpS7aj2VAeqI9WJ6kx1obpS3ajuVA+qJ9WL6k31ofpS/aj+1ABqIJVIJVGDqMHUEGooNYwaTo2gRlKjqNHUGGosNY4aT02gJlKTqMnUFGoqNY2aTs2gZlKzqNnUHGouNY+aTy2gFlKLqMXUEmoptYxaTq2gVlKrqNXUGmottY5aT22gNlKbqM3UFmortY3aTu2gdlK7qN3UHmovtY/aTx2gDlKHqMPUEeoodYw6Tp2gTlKnqNPUGeosdY46T12gLlKXqMvUFeoqhVE4RVAkRVE0xVAsxVE8JVAiJVEypVAqpVE6ZVAmZVE25VAu5VE+FVAhBShIISqiYtQ16jp1g7pJ3aJuU3eou9Q96j71gHpIPaIeU0+op9Qz6jn1gnpJvaJeU2+ot9Q76j31gfpIfaI+U1+or9Q36jv1g/pJ/aJ+U3+ov9Q/Ko5ORienU9DxdEo6FZ2aTkMn0GnpdHR6OgOdkc5EZ6az0FnpbHR2Ogedk85F56bz0P/Reel8dH66AF2QLkQXpovQRelidHG6BF2SLkWXpsvQZelydHm6Al2RrkRXpqvQVelqdHW6Bl2TrkXXpuvQdel6dH26Ad2QbkQ3ppvQTelmdHO6Bd2SbkW3ptvQbel2dHu6A92R7kR3prvQXeludHe6B92T7kX3pvvQfel+dH96AD2QTqST6EH0YHoIPZQeRg+nR9Aj6VH0aHoMPZYeR4+nJ9AT6Un0ZHoKPZWeRk+nZ9Az6Vn0bHoOPZeeR8+nF9AL6UX0YnoJvZReRi+nV9Ar6VX0anoNvZZeR6+nN9Ab6U30ZnoLvZXeRm+nd9A76V30bnoPvZfeR++nD9AH6UP0YfoIfZQ+Rh+nT9An6VP0afoMfZY+R5+nL9AX6Uv0ZfoKfZXGaJwmaJKmaJpmaJbmaJ4WaJGWaJlWaJXWaJ02aJO2aJt2aJf2aJ8O6JAGNKQRHdEx+hp9nb5B36Rv0bfpO/Rd+h59n35AP6Qf0Y/pJ/RT+hn9nH5Bv6Rf0a/pN/Rb+h39nv5Af6Q/0Z/pL/RX+hv9nf5B/6R/0b/pP/Rf+h8dxyRjkjMpmHgmJZOKSc2kYRKYtEw6Jj2TgcnIZGIyM1mYrEw2JjuTg8nJ5GJyM3mY/5i8TD4mP1OAKcgUYgozRZiiTDGmOFOCKcmUYkozZZiyTDmmPFOBqchUYiozVZiqTDWmOlODqcnUYmozdZi6TD2mPtOAacg0YhozTZimTDOmOdOCacm0YlozbZi2TDumPdOB6ch0YjozXZiuTDemO9OD6cn0YnozfZi+TD+mPzOAGcgkMknMIGYwM4QZygxjhjMjmJHMKGY0M4YZy4xjxjMTmInMJGYyM4WZykxjpjMzmJnMLGY2M4eZy8xj5jMLmIXMImYxs4RZyixjljMrmJXMKmY1s4ZZy6xj1jMbmI3MJmYzs4XZymxjtjM7mJ3MLmY3s4fZy+xj9jMHmIPMIeYwc4Q5yhxjjjMnmJPMKeY0c4Y5y5xjzjMXmIvMJeYyc4W5ymAMzhAMyVAMzTAMy3AMzwiMyEiMzCiMymiMzhiMyViMzTiMy3iMzwRMyAAGMoiJmBhzjbnO3GBuMreY28wd5i5zj7nPPGAeMo+Yx8wT5inzjHnOvGBeMq+Y18wb5i3zjnnPfGA+Mp+Yz8wX5ivzjfnO/GB+Mr+Y38wf5i/zj4ljk7HJ2RRsPJuSTcWmZtOwCWxaNh2bns3AZmQzsZnZLGxWNhubnc3B5mRzsbnZPOx/bF42H5ufLcAWZAuxhdkibFG2GFucLcGWZEuxpdkybFm2HFuercBWZCuxldkqbFW2GludrcHWZGuxtdk6bF22HlufbcA2ZBuxjdkmbFO2GducbcG2ZFuxrdk2bFu2Hdue7cB2ZDuxndkubF
e2G9ud7cH2ZHuxvdk+bF+2H9ufHcAOZBPZJHYQO5gdwg5lhyWL/7+AYkezY9ix7Dh2PDuBnchOYiezU9ip7DR2OjuDncnOYmezc9i57Dx2PruAXcguYhezS9il7DJ2ObuCXcmuYleza9i17Dp2PbuB3chuYjezW9it7DZ2O7uD3cnuYneze9i97D52P3uAPcgeYg+zR9ij7DH2OHuCPcmeYk+zZ9iz7Dn2PHuBvcheYi+zV9irLMbiLMGSLMXSLMOyLMfyrMCKrMTKrMKqrMbqrMGarMXarMO6rMf6bMCGLGAhi9iIjbHX2OvsDfYme4u9zd5h77L32PvsA/Yh+4h9zD5hn7LP2OfsC/Yl+4p9zb5h37Lv2PfsB/Yj+4n9zH5hv7Lf2O/sD/Yn+4v9zf5h/7L/2DguGZecS8HFcym5VFxqLg2XwKXl0nHpuQxcRi4Tl5nLwmXlsnHZuRxcTi4Xl5vLw/3H5eXycfm5AlxBrhBXmCvCFeWKccW5ElxJrhRXmivDleXKceW5ClxFrhJXmavCVeWqcdW5GlxNrhZXm6vD1eXqcfW5BlxDrhHXmGvCNeWacc25FlxLrhXXmmvDteXace25DlxHrhPXmevCdeW6cd25HlxPrhfXm+vD9eX6cf25AdxALpFL4gZxg7kh3FBuGDecG8GN5EZxo7kx3FhuHDeem8BN5CZxk7kp3FRuGjedm8HN5GZxs7k53FxuHjefW8At5BZxi7kl3FJuGbecW8Gt5FZxq7k13FpuHbee28Bt5DZxm7kt3FZuG7ed28Ht5HZxu7k93F5uH7efO8Ad5A5xh7kj3FHuGHecO8Gd5E5xp7kz3FnuHHeeu8Bd5C5xl7kr3FUO43CO4EiO4miO4ViO43hO4ERO4mRO4VRO43TO4EzO4mzO4VzO43wu4EIOcJBDXMTFuGvcde4Gd5O7xd3m7nB3uXvcfe4B95B7xD3mnnBPuWfcc+4F95J7xb3m3nBvuXfce+4D95H7xH3mvnBfuW/cd+4H95P7xf3m/nB/uX9cHJ+MT86n4OP5lHwqPjWfhk/g0/Lp+PR8Bj4jn4nPzGfhs/LZ+Ox8Dj4nn4vPzefh/+Pz8vn4/HwBviBfiC/MF+GL8sX44nwJviRfii/Nl+HL8uX48nwFviJfia/MV+Gr8tX46nwNviZfi6/N1+Hr8vX4+nwDviHfiG/MN+Gb8s345nwLviXfim/Nt+Hb8u349nwHviPfie/Md+G78t347nwPviffi+/N9+H78v34/vwAfiCfyCfxg/jB/BB+KD+MH86P4Efyo/jR/Bh+LD+OH89P4Cfyk/jJ/BR+Kj+Nn87P4Gfys/jZ/Bx+Lj+Pn88v4Bfyi/jF/BJ+Kb+MX86v4Ffyq/jV/Bp+Lb+OX89v4Dfym/jN/BZ+K7+N387v4Hfyu/jd/B5+L7+P388f4A/yh/jD/BH+KH+MP86f4E/yp/jT/Bn+LH+OP89f4C/yl/jL/BX+Ko/xOE/wJE/xNM/wLM/xPC/wIi/xMq/wKq/xOm/wJm/xNu/wLu/xPh/wIQ94yCM+4mP8Nf46f4O/yd/ib/N3+Lv8Pf4+/4B/yD/iH/NP+Kf8M/45/4J/yb/iX/Nv+Lf8O/49/4H/yH/iP/Nf+K/8N/47/4P/yf/if/N/+L/8Pz5OSCYkF1II8UJKIZWQWkgjJAhphXRCeiGDkFHIJGQWsghZhWxCdiGHkFPIJeQW8gj/CXmFfEJ+oYBQUCgkFBaKCEWFYkJxoYRQUigllBbKCGWFckJ5oYJQUagkVBaqCFWFakJ1oYZQU6gl1BbqCHWFekJ9oYHQUGgkNBaaCE2FZkJzoYXQUmgltBbaCG2FdkJ7oYPQUegkdBa6CF2FbkJ3oYfQU+gl9Bb6CH2FfkJ/YYAwUEgUkoRBwmBhiDBUGCYMF0YII4VRwmhhjDBWGCeMFyYIE4VJwmRhijBVmCZMF2YIM4VZwmxhjjBXmCfMFxYIC4VFwmJhibBUWCYsF1YIK4
VVwmphjbBWWCesFzYIG4VNwmZhi7BV2CZsF3YIO4Vdwm5hj7BX2CfsFw4IB4VDwmHhiHBUOCYcF04IJ4VTwmnhjHBWOCecFy4IF4VLwmXhinBVwARcIARSoARaYARW4AReEARRkARZUARV0ARdMARTsARbcARX8ARfCIRQAAIUkBAJMeGacF24IdwUbgm3hTvCXeGecF94IDwUHgmPhSfCU+GZ8Fx4IbwUXgmvhTfCW+Gd8F74IHwUPgmfhS/CV+Gb8F34IfwUfgm/hT/CX+GfECcmE5OLKcR4MaWYSkwtphETxLRiOjG9mEHMKGYSM4tZxKxiNjG7mEPMKeYSc4t5xP/EvGI+Mb9YQCwoFhILi0XEomIxsbhYQiwplhJLi2XEsmI5sbxYQawoVhIri1XEqmI1sbpYQ6wp1hJri3XEumI9sb7YQGwoNhIbi03EpmIzsbnYQmwpthJbi23EtmI7sb3YQewodhI7i13ErmI3sbvYQ+wp9hJ7i33EvmI/sb84QBwoJopJ4iBxsDhEHCoOE4eLI8SR4ihxtDhGHCuOE8eLE8SJ4iRxsjhFnCpOE6eLM8SZ4ixxtjhHnCvOE+eLC8SF4iJxsbhEXCouE5eLK8SV4ipxtbhGXCuuE9eLG8SN4iZxs7hF3CpuE7eLO8Sd4i5xt7hH3CvuE/eLB8SD4iHxsHhEPCoeE4+LJ8ST4inxtHhGPCueE8+LF8SL4iXxsnhFvCpiIi4SIilSIi0yIityIi8KoihKoiwqoipqoi4aoilaoi06oit6oi8GYigCEYpIjMSYeE28Lt4Qb4q3xNviHfGueE+8Lz4QH4qPxMfiE/Gp+Ex8Lr4QX4qvxNfiG/Gt+E58L34QP4qfxM/iF/Gr+E38Lv4Qf4q/xN/iH/Gv+E+Mk5JJyaUUUryUUkolpZbSSAlSWimdlF7KIGWUMkmZpSxSVimblF3KIeWUckm5pTzSf1JeKZ+UXyogFZQKSYWlIlJRqZhUXCohlZRKSaWlMlJZqZxUXqogVZQqSZWlKlJVqZpUXaoh1ZRqSbWlOlJdqZ5UX2ogNZQaSY2lJlJTqZnUXGohtZRaSa2lNlJbqZ3UXuogdZQ6SZ2lLlJXqZvUXeoh9ZR6Sb2lPlJfqZ/UXxogDZQSpSRpkDRYGiINlYZJw6UR0khplDRaGiONlcZJ46UJ0kRpkjRZmiJNlaZJ06UZ0kxpljRbmiPNleZJ86UF0kJpkbRYWiItlZZJy6UV0kpplbRaWiOtldZJ66UN0kZpk7RZ2iJtlbZJ26Ud0k5pl7Rb2iPtlfZJ+6UD0kHpkHRYOiIdlY5Jx6UT0knplHRaOiOdlc5J56UL0kXpknRZuiJdlTAJlwiJlCiJlhiJlTiJlwRJlCRJlhRJlTRJlwzJlCzJlhzJlTzJlwIplIAEJSRFUky6Jl2Xbkg3pVvSbemOdFe6J92XHkgPpUfSY+mJ9FR6Jj2XXkgvpVfSa+mN9FZ6J72XPkgfpU/SZ+mL9FX6Jn2Xfkg/pV/Sb+mP9Ff6J8XJyeTkcgo5Xk4pp5JTy2nkBDmtnE5OL2eQM8qZ5MxyFjmrnE3OLueQc8q55NxynoS4uDg5n5xfLiAXlAvJheUiclG5mFxcLiGXlEvJpeUyclm5nFxeriBXlCvJleUqclW5mlxdriHXlGvJteU6cl25nlxfbiA3lBvJjeUmclO5mdxcbiG3lFvJreU2clu5ndxe7iB3lDvJneUucle5m9xd7iH3lHvJveU+cl+5n9xfHiAPlBPlJHmQPFgeIg+Vh8nD5RHySHmUPFoeI4+Vx8nj5QnyRHmSPFmeIk+Vp8nT5RnyTHmWPFueI8+V58nz5QXyQnmRvFheIi+Vl8nL5RXySnmVvFpeI6+V18nr5Q3yRnmTvFneIm+Vt8nb5R3yTnmXvFveI++V98n75QPyQfmQfFg+Ih+Vj8nH5RPySfmUfFo+I5+Vz8nn5QvyRfmSfFm+Il+VMRmXCZmUKZmWGZmVOZmXBV
mUJVmWFVmVNVmXDdmULdmWHdmVPdmXAzmUgQxlJEdyTL4mX5dvyDflW/Jt+Y58V74n35cfyA/lR/Jj+Yn8VH4mP5dfyC/lV/Jr+Y38Vn4nv5c/yB/lT/Jn+Yv8Vf4mf5d/yD/lX/Jv+Y/8V/4nxynJlORKCiVeSamkUlIraZQEJa2STkmvZFAyKpmUzEoWJauSTcmu5FByKrmU3Eoe5T8lr5JPya8UUAoqhZTCShGlqFJMKa6UUEoqpZTSShmlrFJOKa9UUCoqlZTKShWlqlJNqa7UUGoqtZTaSh2lrlJPqa80UBoqjZTGShOlqdJMaa60UFoqrZTWShulrdJOaa90UDoqnZTOShelq9JN6a70UHoqvZTeSh+lr9JP6a8MUAYqiUqSMkgZrAxRhirDlOHKCGWkMkoZrYxRxirjlPHKBGWiMkmZrExRpirTlOnKDGWmMkuZrcxR5irzlPnKAmWhskhZrCxRlirLlOXKCmWlskpZraxR1irrlPXKBmWjsknZrGxRtirblO3KDmWnskvZrexR9ir7lP3KAeWgckg5rBxRjirHlOPKCeWkcko5rZxRzirnlPPKBeWickm5rFxRriqYgiuEQiqUQiuMwiqcwiuCIiqSIiuKoiqaoiuGYiqWYiuO4iqe4iuBEipAgQpSIiWmXFOuKzeUm8ot5bZyR7mr3FPuKw+Uh8oj5bHyRHmqPFOeKy+Ul8or5bXyRnmrvFPeKx+Uj8on5bPyRfmqfFO+Kz+Un8ov5bfyR/mr/FPi1GRqcjWFGq+mVFOpqdU0aoKaVk2nplczqBnVTGpmNYuaVc2mZldzqDnVXGpuNY/6n5pXzafmVwuoBdVCamG1iFpULaYWV0uoJdVSamm1jFpWLaeWVyuoFdVKamW1ilpVraZWV2uoNdVaam21jlpXrafWVxuoDdVGamO1idpUbaY2V1uoLdVWamu1jdpWbae2VzuoHdVOame1i9pV7aZ2V3uoPdVeam+1j9pX7af2VweoA9VENUkdpA5Wh6hD1WHqcHWEOlIdpY5Wx6hj1XHqeHWCOlGdpE5Wp6hT1WnqdHWGOlOdpc5W56hz1XnqfHWBulBdpC5Wl6hL1WXqcnWFulJdpa5W16hr1XXqenWDulHdpG5Wt6hb1W3qdnWHulPdpe5W96h71X3qfvWAelA9pB5Wj6hH1WPqcfWEelI9pZ5Wz6hn1XPqefWCelG9pF5Wr6hXVUzFVUIlVUqlVUZlVU7lVUEVVUmVVUVVVU3VVUM1VUu1VUd1VU/11UANVaBCFamRGlOvqdfVG+pN9ZZ6W72j3lXvqffVB+pD9ZH6WH2iPlWfqc/VF+pL9ZX6Wn2jvlXfqe/VD+pH9ZP6Wf2iflW/qd/VH+pP9Zf6W/2j/lX/qXFaMi25lkKL11JqqbTUWhotQUurpdPSaxm0jFomLbOWRcuqZdOyazm0nFouLbeWR/tPy6vl0/JrBbSCWiGtsFZEK6oV04prJbSSWimttFZGK6uV08prFbSKWiWtslZFq6pV06prNbSaWi2ttlZHq6vV0+prDbSGWiOtsdZEa6o105prLbSWWiuttdZGa6u109prHbSOWiets9ZF66p107prPbSeWi+tt9ZH66v10/prA7SBWqKWpA3SBmtDtKHaMG24NkIbqY3SRmtjtLHaOG28NkGbqE3SJmtTtKnaNG26NkObqc3SZmtztLnaPG2+tkBbqC3SFmtLtKXaMm25tkJbqa3SVmtrtLXaOm29tkHbqG3SNmtbtK3aNm27tkPbqe3Sdmt7tL3aPm2/dkA7qB3SDmtHtKPaMe24dkI7qZ3STmtntLPaOe28dkG7qF3SLmtXtKsapuEaoZEapdEao7Eap/GaoImapMmaoqmapumaoZmapdmao7map/laoIUa0KCGtEiLade069oN7aZ2S7ut3dHuave0+9oD7aH2SHusPdGeas+059oL7aX2SnutvdHeau+099oH7aP2SfusfdG+at+079oP7a
f2S/ut/dH+av+0OD2ZnlxPocfrKfVUemo9jZ6gp9XT6en1DHpGPZOeWc+iZ9Wz6dn1HHpOPZeeW8+j/6fn1fPp+fUCekG9kF5YL6IX1YvpxfUSekm9lF5aL6OX1cvp5fUKekW9kl5Zr6JX1avp1fUaek29ll5br6PX1evp9fUGekO9kd5Yb6I31ZvpzfUWeku9ld5ab6O31dvp7fUOeke9k95Z76J31bvp3fUeek+9l95b76P31fvp/fUB+kA9UU/SB+mD9SH6UH2YPlwfoY/UR+mj9TH6WH2cPl6foE/UJ+mT9Sn6VH2aPl2foc/UZ+mz9Tn6XH2ePl9foC/UF+mL9SX6Un2Zvlxfoa/UV+mr9TX6Wn2dvl7foG/UN+mb9S36Vn2bvl3foe/Ud+m79T36Xn2fvl8/oB/UD+mH9SP6Uf2Yflw/oZ/UT+mn9TP6Wf2cfl6/oF/UL+mX9Sv6VR3TcZ3QSZ3SaZ3RWZ3TeV3QRV3SZV3RVV3Tdd3QTd3Sbd3RXd3TfT3QQx3oUEd6pMf0a/p1/YZ+U7+l39bv6Hf1e/p9/YH+UH+kP9af6E/1Z/pz/YX+Un+lv9bf6G/1d/p7/YP+Uf+kf9a/6F/1b/p3/Yf+U/+l/9b/6H/1f3qckcxIbqQw4o2URiojtZHGSDDSGumM9EYGI6ORychsZDGyGtmM7EYOI6eRy8ht5DH+M/Ia+Yz8RgGjoFHIKGwUMYoaxYziRgmjpFHKKG2UMcoa5YzyRgWjolHJqGxUMaoa1YzqRg2jplHLqG3UMeoa9Yz6RgOjodHIaGw0MZoazYzmRgujpdHKaG20Mdoa7Yz2Rgejo9HJ6Gx0Mboa3YzuRg+jp9HL6G30Mfoa/Yz+xgBjoJFoJBmDjMHGEGOoMcwYbowwRhqjjNHGGGOsMc4Yb0wwJhqTjMnGFGOqMc2YbswwZhqzjNnGHGOuMc+YbywwFhqLjMXGEmOpscxYbqwwVhqrjNXGGmOtsc5Yb2wwNhqbjM3GFmOrsc3Ybuwwdhq7jN3GHmOvsc/YbxwwDhqHjMPGEeOoccw4bpwwThqnjNPGGeOscc44b1wwLhqXjMvGFeOqgRm4QRikQRm0wRiswRm8IRiiIRmyoRiqoRm6YRimYRm24Riu4Rm+ERihAQxoICMyYsY147pxw7hp3DJuG3eMu8Y9477xwHhoPDIeG0+Mp8Yz47nxwnhpvDJeG2+Mt8Y7473xwfhofDI+G1+Mr8Y347vxw/hp/DJ+G3+Mv8Y/I85MZiY3U5jxZkozlZnaTGMmmGnNdGZ6M4OZ0cxkZjazmFnNbGZ2M4eZ08xl5jbzmP+Zec18Zn6zgFnQLGQWNouYRc1iZnGzhFnSLGWWNsuYZc1yZnmzglnRrGRWNquYVc1qZnWzhlnTrGXWNuuYdc16Zn2zgdnQbGQ2NpuYTc1mZnOzhdnSbGW2NtuYbc12Znuzg9nR7GR2NruYXc1uZnezh9nT7GX2NvuYfc1+Zn9zgDnQTDSTzEHmYHOIOdQcZg43R5gjzVHmaHOMOdYcZ443J5gTzUnmZHOKOdWcZk43Z5gzzVnmbHOOOdecZ843F5gLzUXmYnOJudRcZi43V5grzVXmanONudZcZ643N5gbzU3mZnOLudXcZm43d5g7zV3mbnOPudfcZ+43D5gHzUPmYfOIedQ8Zh43T5gnzVPmafOMedY8Z543L5gXzUvmZfOKedXETNwkTNKkTNpkTNbkTN4UTNGUTNlUTNXUTN00TNO0TNt0TNf0TN8MzNAEJjSRGZkx85p53bxh3jRvmbfNO+Zd855533xgPjQfmY/NJ+ZT85n53HxhvjRfma/NN+Zb85353vxgfjQ/mZ/NL+ZX85v53fxh/jR/mb/NP+Zf858ZZyWzklsprHgrpZXKSm2lsRKstFY6K72VwcpoZbIyW1msrFY2K7uVw8pp5bJyW3ms/6y8Vj4rv1XAKmgVsgpbRayiVjGruFXCKmmVskpbZayyVjmrvFXBqm
hVsipbVayqVjWrulXDqmnVsmpbday6Vj2rvtXAamg1shpbTaymVjOrudXCamm1slpbbay2VjurvdXB6mh1sjpbXayuVjeru9XD6mn1snpbfay+Vj+rvzXAGmglWknWIGuwNcQaag2zhlsjrJHWKGu0NcYaa42zxlsTrInWJGuyNcWaak2zplszrJnWLGu2Nceaa82z5lsLrIXWImuxtcRaai2zllsrrJXWKmu1tcZaa62z1lsbrI3WJmuztcXaam2ztls7rJ3WLmu3tcfaa+2z9lsHrIPWIeuwdcQ6ah2zjlsnrJPWKeu0dcY6a52zzlsXrIvWJeuydcW6amEWbhEWaVEWbTEWa3EWbwmWaEmWbCmWammWbhmWaVmWbTmWa3mWbwVWaAELWsiKrJh1zbpu3bBuWres29Yd6651z7pvPbAeWo+sx9YT66n1zHpuvbBeWq+s19Yb6631znpvfbA+Wp+sz9YX66v1zfpu/bB+Wr+s39Yf66/1z4qzk9nJ7RR2vJ3STmWnttPYCXZaO52d3s5gZ7Qz2ZntLHZWO5ud3c5h57Rz2bntPPZ/dl47n53fLmAXtAvZhe0idlG7mF3cLmGXtEvZpe0ydlm7nF3ermBXtCvZle0qdlW7ml3drmHXtGvZte06dl27nl3fbmA3tBvZje0mdlO7md3cbmG3tFvZre02dlu7nd3e7mB3tDvZne0udle7m93d7mH3tHvZve0+dl+7n93fHmAPtBPtJHuQPdgeYg+1h9nD7RH2SHuUPdoeY4+1x9nj7Qn2RHuSPdmeYk+1p9nT7Rn2THuWPdueY8+159nz7QX2QnuRvdheYi+1l9nL7RX2SnuVvdpeY6+119nr7Q32RnuTvdneYm+1t9nb7R32TnuXvdveY++199n77QP2QfuQfdg+Yh+1j9nH7RP2SfuUfdo+Y5+1z9nn7Qv2RfuSfdm+Yl+1MRu3CZu0KZu2GZu1OZu3BVu0JVu2FVu1NVu3Ddu0Ldu2Hdu1Pdu3Azu0gQ1tZEd2zL5mX7dv2DftW/Zt+459175n37cf2A/tR/Zj+4n91H5mP7df2C/tV/Zr+4391n5nv7c/2B/tT/Zn+4v91f5mf7d/2D/tX/Zv+4/91/5nxznJnOROCifeSemkclI7aZwEJ62TzknvZHAyOpmczE4WJ6uTzcnu5HByOrmc3E4e5z8nr5PPye8UcAo6hZzCThGnqFPMKe6UcEo6pZzSThmnrFPOKe9UcCo6lZzKThWnqlPNqe7UcGo6tZzaTh2nrlPPqe80cBo6jZzGThOnqdPMae60cFo6rZzWThunrdPOae90cDo6nZzOThenq9PN6e70cHo6vZzeTh+nr9PP6e8McAY6iU6SM8gZ7AxxhjrDnOHOCGekM8oZ7YxxxjrjnPHOBGeiM8mZ7ExxpjrTnOnODGemM8uZ7cxx5jrznPnOAmehs8hZ7CxxljrLnOXOCmels8pZ7axx1jrrnPXOBmejs8nZ7GxxtjrbnO3ODmens8vZ7exx9jr7nP3OAeegc8g57BxxjjrHnOPOCeekc8o57ZxxzjrnnPPOBeeic8m57FxxrjqYgzuEQzqUQzuMwzqcwzuCIzqSIzuKozqaozuGYzqWYzuO4zqe4zuBEzrAgQ5yIifmXHOuOzecm84t57Zzx7nr3HPuOw+ch84j57HzxHnqPHOeOy+cl84r57XzxnnrvHPeOx+cj84n57PzxfnqfHO+Oz+cn84v57fzx/nr/HPi3GRucjeFG++mdFO5qd00boKb1k3npnczuBndTG5mN4ub1c3mZndzuDndXG5uN4/7n5vXzefmdwu4Bd1CbmG3iFvULeYWd0u4Jd1Sbmm3jFvWLeeWdyu4Fd1KbmW3ilvVreZWd2u4Nd1abm23jlvXrefWdxu4Dd1GbmO3idvUbeY2d1u4Ld1Wbmu3jdvWbee2dzu4Hd1Obme3i9vV7eZ2d3u4Pd1ebm+3j9vX7ef2dwe4A91EN8kd5A52h7
hD3WHucHeEO9Id5Y52x7hj3XHueHeCO9Gd5E52p7hT3WnudHeGO9Od5c5257hz3XnufHeBu9Bd5C52l7hL3WXucneFu9Jd5a5217hr3XXueneDu9Hd5G52t7hb3W3udneHu9Pd5e5297h73X3ufveAe9A95B52j7hH3WPucfeEe9I95Z52z7hn3XPuefeCe9G95F52r7hXXczFXcIlXcqlXcZlXc7lXcEVXcmVXcVVXc3VXcM1Xcu1Xcd1Xc/13cANXeBCF7mRG3OvudfdG+5N95Z7273j3nXvuffdB+5D95H72H3iPnWfuc/dF+5L95X72n3jvnXfue/dD+5H95P72f3ifnW/ud/dH+5P95f72/3j/nX/uXFeMi+5l8KL91J6qbzUXhovwUvrpfPSexm8jF4mL7OXxcvqZfOyezm8nF4uL7eXx/vPy+vl8/J7BbyCXiGvsFfEK+oV84p7JbySXimvtFfGK+uV88p7FbyKXiWvslfFq+pV86p7NbyaXi2vtlfHq+vV8+p7DbyGXiOvsdfEa+o185p7LbyWXiuvtdfGa+u189p7HbyOXievs9fF6+p187p7PbyeXi+vt9fH6+v18/p7A7yBXqKX5A3yBntDvKHeMG+4N8Ib6Y3yRntjvLHeOG+8N8Gb6E3yJntTvKneNG+6N8Ob6c3yZntzvLnePG++t8Bb6C3yFntLvKXeMm+5t8Jb6a3yVntrvLXeOm+9t8Hb6G3yNntbvK3eNm+7t8Pb6e3ydnt7vL3ePm+/d8A76B3yDntHvKPeMe+4d8I76Z3yTntnvLPeOe+8d8G76F3yLntXvKse5uEe4ZEe5dEe47Ee5/Ge4Ime5Mme4qme5ume4Zme5dme47me5/le4IUe8KCHvMiLede8694N76Z3y7vt3fHueve8+94D76H3yHvsPfGees+8594L76X3ynvtvfHeeu+8994H76P3yfvsffG+et+8794P76f3y/vt/fH+ev+8OD+Zn9xP4cf7Kf1Ufmo/jZ/gp/XT+en9DH5GP5Of2c/iZ/Wz+dn9HH5OP5ef28/j/+fn9fP5+f0CfkG/kF/YL+IX9Yv5xf0Sfkm/lF/aL+OX9cv55f0KfkW/kl/Zr+JX9av51f0afk2/ll/br+PX9ev59f0GfkO/kd/Yb+I39Zv5zf0Wfku/ld/ab+O39dv57f0Ofke/k9/Z7+J39bv53f0efk+/l9/b7+P39fv5/f0B/kA/0U/yB/mD/SH+UH+YP9wf4Y/0R/mj/TH+WH+cP96f4E/0J/mT/Sn+VH+aP92f4c/0Z/mz/Tn+XH+eP99f4C/0F/mL/SX+Un+Zv9xf4a/0V/mr/TX+Wn+dv97f4G/0N/mb/S3+Vn+bv93f4e/0d/m7/T3+Xn+fv98/4B/0D/mH/SP+Uf+Yf9w/4Z/0T/mn/TP+Wf+cf96/4F/0L/mX/Sv+VR/zcZ/wSZ/yaZ/xWZ/zeV/wRV/yZV/xVV/zdd/wTd/ybd/xXd/zfT/wQx/40Ed+5Mf8a/51/4Z/07/l3/bv+Hf9e/59/4H/0H/kP/af+E/9Z/5z/4X/0n/lv/bf+G/9d/57/4P/0f/kf/a/+F/9b/53/4f/0//l//b/+H/9f35ckCxIHqQI4oOUQaogdZAmSAjSBumC9EGGIGOQKcgcZAmyBtmC7EGOIGeQK8gd5An+C/IG+YL8QYGgYFAoKBwUCYoGxYLiQYmgZFAqKB2UCcoG5YLyQYWgYlApqBxUCaoG1YLqQY2gZlArqB3UCeoG9YL6QYOgYdAoaBw0CZoGzYLmQYugZdAqaB20CdoG7YL2QYegY9Ap6Bx0CboG3YLuQY+gZ9Ar6B30CfoG/YL+wYBgYJAYJAWDgsHBkGBoMCwYHowIRgajgtHBmGBsMC4YH0wIJgaTgsnBlGBqMC2YHswIZgazgtnBnGBuMC+YHywIFgaLgsXBkmBpsCxYHqwIVgargtXBmmBtsC5YH2wINgabgs3BlmBrsC3YHuwIdg
a7gt3BnmBvsC/YHxwIDgaHgsPBkeBocCw4HpwITgangtPBmeBscC44H1wILgaXgsvBleBqgAV4QARkQAV0wARswAV8IARiIAVyoARqoAV6YARmYAV24ARu4AV+EARhAAIYoCAKYsG14HpwI7gZ3ApuB3eCu8G94H7wIHgYPAoeB0+Cp8Gz4HnwIngZvApeB2+Ct8G74H3wIfgYfAo+B1+Cr8G34HvwI/gZ/Ap+B3+Cv8G/IC5MFiYPU4TxYcowVZg6TBMmhGnDdGH6MEOYMcwUZg6zhFnDbGH2MEeYM8wV5g7zhP+FecN8Yf6wQFgwLBQWDouERcNiYfGwRFgyLBWWDsuEZcNyYfmwQlgxrBRWDquEVcNqYfWwRlgzrBXWDuuEdcN6Yf2wQdgwbBQ2DpuETcNmYfOwRdgybBW2DtuEbcN2YfuwQ9gx7BR2DruEXcNuYfewR9gz7BX2DvuEfcN+Yf9wQDgwTAyTwkHh4HBIODQcFg4PR4Qjw1Hh6HBMODYcF44PJ4QTw0nh5HBKODWcFk4PZ4Qzw1nh7HBOODecF84PF4QLw0Xh4nBJuDRcFi4PV4Qrw1Xh6nBNuDZcF64PN4Qbw03h5nBLuDXcFm4Pd4Q7w13h7nBPuDfcF+4PD4QHw0Ph4fBIeDQ8Fh4PT4Qnw1Ph6fBMeDY8F54PL4QXw0vh5fBKeDXEQjwkQjKkQjpkQjbkQj4UQjGUQjlUQjXUQj00QjO0Qjt0Qjf0Qj8MwjAEIQxRGIWx8Fp4PbwR3gxvhbfDO+Hd8F54P3wQPgwfhY/DJ+HT8Fn4PHwRvgxfha/DN+Hb8F34PvwQfgw/hZ/DL+HX8Fv4PfwR/gx/hb/DP+Hf8F8YB5KB5CAFiAcpQSqQGqQBCSAtSAfSgwwgI8gEMoMsICvIBrKDHCAnyAVygzzgP5AX5AP5QQFQEBQChUERUBQUA8VBCVASlAKlQRlQFpQD5UEFUBFUApVBFVAVVAPVQQ1QE9QCtUEdUBfUA/VBA9AQNAKNQRPQFDQDzUEL0BK0Aq1BG9AWtAPtQQfQEXQCnUEX0BV0A91BD9AT9AK9QR/QF/QD/cEAMBAkgiQwCAwGQ8BQMAwMByPASDAKjAZjwFgwDowHE8BEMAlMBlPAVDANTAczwEwwC8wGc8BcMA/MBwvAQrAILAZLwFKwDCwHK8BKsAqsBmvAWrAOrAcbwEawCWwGW8BWsA1sBzvATrAL7AZ7wF6wD+wHB8BBcAgcBkfAUXAMHAcnwElwCpwGZ8BZcA6cBxfARXAJXAZXwFWAARwQgAQUoAEDWMABHghABBKQgQJUoAEdGMAEFrCBA1zgAR8EIAQAQIBABGLgGrgOboCb4Ba4De6Au+AeuA8egIfgEXgMnoCn4Bl4Dl6Al+AVeA3egLfgHXgPPoCP4BP4DL6Ar+Ab+A5+gJ/gF/gN/oC/4B+Ig8lgcpgCxsOUMBVMDdPABJgWpoPpYQaYEWaCmWEWmBVmg9lhDpgT5oK5YR74H8wL88H8sAAsCAvBwrAILAqLweKwBCwJS8HSsAwsC8vB8rACrAgrwcqwCqwKq8HqsAasCWvB2rAOrAvrwfqwAWwIG8HGsAlsCpvB5rAFbAlbwdawDWwL28H2sAPsCDvBzrAL7Aq7we6wB+wJe8HesA/sC/vB/nAAHAgTYRIcBAfDIXAoHAaHwxFwJBwFR8MxcCwcB8fDCXAinAQnwylwKpwGp8MZcCacBWfDOXAunAfnwwVwIVwEF8MlcClcBpfDFXAlXAVXwzVwLVwH18MNcCPcBDfDLXAr3Aa3wx1wJ9wFd8M9cC/cB/fDA/AgPAQPwyPwKDwGj8MT8CQ8BU/DM/AsPAfPwwvwIrwEL8Mr8CrEIA4JSEIK0pCBLOQgDwUoQgnKUIEq1KAODWhCC9rQgS70oA8DGEIAIUQwgjF4DV6HN+BNeAvehnfgXXgP3ocP4EP4CD6GT+BT+Aw+hy/gS/gKvoZv4Fv4Dr6HH+BH+Al+hl/gV/
gNfoc/4E/4C/6Gf+Bf+A/GoWQoOUqB4lFKlAqlRmlQAkqL0qH0KAPKiDKhzCgLyoqyoewoB8qJcqHcKA/6D+VF+VB+VAAVRIVQYVQEFUXFUHFUApVEpVBpVAaVReVQeVQBVUSVUGVUBVVF1VB1VAPVRLVQbVQH1UX1UH3UADVEjVBj1AQ1Rc1Qc9QCtUStUGvUBrVF7VB71AF1RJ1QZ9QFdUXdUHfUA/VEvVBv1Af1Rf1QfzQADUSJKAkNQoPREDQUDUPD0Qg0Eo1Co9EYNBaNQ+PRBDQRTUKT0RQ0FU1D09EMNBPNQrPRHDQXzUPz0QK0EC1Ci9EStBQtQ8vRCrQSrUKr0Rq0Fq1D69EGtBFtQpvRFrQVbUPb0Q60E+1Cu9EetBftQ/vRAXQQHUKH0RF0FB1Dx9EJdBKdQqfRGXQWnUPn0QV0EV1Cl9EVdBVhCEcEIhGFaMQgFnGIRwISkYRkpCAVaUhHBjKRhWzkIBd5yEcBChFAECEUoRi6hq6jG+gmuoVuozvoLrqH7qMH6CF6hB6jJ+gpeoaeoxfoJXqFXqM36C16h96jD+gj+oQ+oy/oK/qGvqMf6Cf6hX6jP+gv+ofiomRR8ihFFB+ljFJFqaM0UUKUNkoXpY8yRBmjTFHmKEuUNcoWZY9yRDmjXFHuKE/0X5Q3yhfljwpEBaNCUeGoSFQ0KhYVj0pEJaNSUemoTFQ2KheVjypEFaNKUeWoSlQ1qhZVj2pENaNaUe2oTlQ3qhfVjxpEDaNGUeOoSdQ0ahY1j1pELaNWUeuoTdQ2ahe1jzpEHaNOUeeoS9Q16hZ1j3pEPaNeUe+oT9Q36hf1jwZEA6PEKCkaFA2OhkRDo2HR8GhENDIaFY2OxkRjo3HR+GhCNDGaFE2OpkRTo2nR9GhGNDOaFc2O5kRzo3nR/GhBtDBaFC2OlkRLo2XR8mhFtDJaFa2O1kRro3XR+mhDtDHaFG2OtkRbo23R9mhHtDPaFe1O2BPtjfZF+6MD0cHoUHQ4OhIdjY5Fx6MT0cnoVHQ6OhOdjc5F56ML0cXoUnQ5uhJdjbAIj4iIjKiIjpiIjbiIj4RIjKRIjpRIjbRIj4zIjKzIjpzIjbzIj4IojEAEIxRFUSy6Fl2PbkQ3o1vR7ehOdDe6F92PHkQPo0fR4+hJ9DR6Fj2PXkQvo1fR6+hN9DZ6F72PPkQfo0/R5+hL9DX6Fn2PfkQ/o1/R7+hP9Df6F8XFksWSx1LE4mMpY6liqWNpYgmxtLF0sfSxDLGMsUyxzLEssayxbLHssRyxnLFcsdyxPLH/Ynlj+WL5YwViBWOFYoVjRWJFY8VixWMlYiVjpWKlY2ViZWPlYuVjFWIVY5VilWNVYlVj1WLVYzViNWO1YrVjdWJ1Y/Vi9WMNYg1jjWKNY01iTWPNYs1jLWItY61irWNtYm1j7WLtYx1iHWOdYp1jXWJdY91i3WM9Yj1jvWK9Y31ifWP9Yv1jA2IDY4mxpNj/CIAHQCGSAACg1c+2bdu+bNu2bdu2bdu2sbatmdlc9wZhg7Eh2FBsGDYcG4GNxEZho7Ex2FhsHDYem4BNxCZhk7Ep2FRsGjYdm4HNxGZhs7E52FxsHjYfW4AtxBZhi7El2FJsGbYcW4GtxFZhq7E12FpsHbYe24BtxDZhm7Et2FZsG7Yd24HtxHZhu7E92F5sH7YfO4AdxA5hh7Ej2FHsGHYcO4GdxE5hp7Ez2FnsHHYeu4BdxC5hl7Er2FXsGnYdu4HdxG5ht7E72F3sHnYfe4A9xB5hj7En2FPsGfYce4G9xF5hr7E32FvsHfYe+4B9xD5hn7Ev2FcMw3CMwEiMwmiMwViMw3hMwERMwmRMwVRMw3TMwEzMwmzMwVzMw3wswEIswgAGMYTF2DfsO/YD+4n9wn5jf7C/2D8sEZ4YT4In4EnxZHhyPAWeEk+Fp8bT4GnxdHh6PAOeEc+EZ8az4FnxbHh2PAeeE8+F58bz4HnxfHh+vABeEC+EF8aL4EXxYn
hxvAReEi+Fl8bL4GXxcnh5vAJeEa+EV8ar4FXxanh1vAZeE6+F18br4HXxenh9vAHeEP8Pb4Q3xpvgTfFmeHO8Bd4Sb4W3xtvgbfF2eHu8A94R74R3xrvgXfFueHe8B94T74X3xvvgffF+eH98AD4QH4QPxofgQ/Fh+HB8BD4SH4WPxsfgY/Fx+Hh8Aj4Rn4RPxqfgU/Fp+HR8Bj4Tn4XPxufgc/F5+Hx8Ab4QX4QvxpfgS/Fl+HJ8Bb4SX4Wvxtfga/F1+Hp8A74R34RvxrfgW/Ft+HZ8B74T34Xvxvfge/F9+H78AH4QP4Qfxo/gR/Fj+HH8BH4SP4Wfxs/gZ/Fz+Hn8An4Rv4Rfxq/gV/Fr+HX8Bn4Tv4Xfxu/gd/F7+H38Af4Qf4Q/xp/gT/Fn+HP8Bf4Sf4W/xt/gb/F3+Hv8A/4R/4R/xr/gX3EMx3ECJ3EKp3EGZ3EO53EBF3EJl3EFV3EN13EDN3ELt3EHd3EP9/EAD/EIBzjEER7j3/Dv+A/8J/4L/43/wf/i//BERGIiCZFAJCWSEcmJFERKIhWRmkhDpCXSEemJDERGIhORmchCZCWyEdmJHEROIheRm8hD5CXyEfmJAkRBohBRmChCFCWKEcWJEkRJohRRmihDlCXKEeWJCkRFohJRmahCVCWqEdWJGkRNohZRm6hD1CXqEfWJBkRD4j+iEdGYaEI0JZoRzYkWREuiFdGaaEO0JdoR7YkOREeiE9GZ6EJ0JboR3YkeRE+iF9Gb6EP0JfoR/YkBxEBiEDGYGEIMJYYRw4kRxEhiFDGaGEOMJcYR44kJxERiEjGZmEJMJaYR04kZxExiFjGbmEPMJeYR84kFxEJiEbGYWEIsJZYRy4kVxEpiFbGaWEOsJdYR64kNxEZiE7GZ2EJsJbYR24kdxE5iF7Gb2EPsJfYR+4kDxEHiEHGYOEIcJY4Rx4kTxEniFHGaOEOcJc4R54kLxEXiEnGZuEJcJa4R14kbxE3iFnGbuEPcJe4R94kHxEPiEfGYeEI8JZ4Rz4kXxEviFfGaeEO8Jd4R74kPxEfiE/GZ+EJ8JTACJwiCJCiCJhiCJTiCJwRCJCRCJhRCJTRCJwzCJCzCJhzCJTzCJwIiJCICEJBAREx8I74TP4ifxC/iN/GH+Ev8IxKRickkZAKZlExGJidTkCnJVGRqMg2ZlkxHpiczkBnJTGRmMguZlcxGZidzkDnJXGRuMg+Zl8xH5icLkAXJQmRhsghZlCxGFidLkCXJUmRpsgxZlixHlicrkBXJSmRlsgpZlaxGVidrkDXJWmRtsg5Zl6xH1icbkA3J/8hGZGOyCdmUbEY2J1uQLclWZGuyDdmWbEe2JzuQHclOZGeyC9mV7EZ2J3uQPcleZG+yD9mX7Ef2JweQA8lB5GByCDmUHEYOJ0eQI8lR5GhyDDmWHEeOJyeQE8lJ5GRyCjmVnEZOJ2eQM8lZ5GxyDjmXnEfOJxeQC8lF5GJyCbmUXEYuJ1eQK8lV5GpyDbmWXEeuJzeQG8lN5GZyC7mV3EZuJ3eQO8ld5G5yD7mX3EfuJw+QB8lD5GHyCHmUPEYeJ0+QJ8lT5GnyDHmWPEeeJy+QF8lL5GXyCnmVvEZeJ2+QN8lb5G3yDnmXvEfeJx+QD8lH5GPyCfmUfEY+J1+QL8lX5GvyDfmWfEe+Jz+QH8lP5GfyC/mVxEicJEiSpEiaZEiW5EieFEiRlEiZVEiV1EidNEiTtEibdEiX9EifDMiQjEhAQhKRMfmN/E7+IH+Sv8jf5B/yL/mPTEQlppJQCVRSKhmVnEpBpaRSUampNFRaKh2VnspAZaQyUZmpLFRWKhuVncpB5aRyUbmpPFReKh+VnypAFaQKUYWpIlRRqhhVnCpBlaRKUaWpMlRZqhxVnqpAVaQqUZWpKlRVqhpVnapB1aRqUbWpOlRdqh5Vn2pANaT+oxpRjakmVFOqGdWcakG1pFpRrak2VFuqHdWe6kB1pDpRnakuVFeqG9Wd6kH1pH
pRvak+VF+qH9WfGkANpAZRg6kh1FBqGDWcGkGNpEZRo6kx1FhqHDWemkBNpCZRk6kp1FRqGjWdmkHNpGZRs6k51FxqHjWfWkAtpBZRi6kl1FJqGbWcWkGtpFZRq6k11FpqHbWe2kBtpDZRm6kt1FZqG7Wd2kHtpHZRu6k91F5qH7WfOkAdpA5Rh6kj1FHqGHWcOkGdpE5Rp6kz1FnqHHWeukBdpC5Rl6kr1FXqGnWdukHdpG5Rt6k71F3qHnWfekA9pB5Rj6kn1FPqGfWcekG9pF5Rr6k31FvqHfWe+kB9pD5Rn6kv1FcKo3CKoEiKomiKoViKo3hKoERKomRKoVRKo3TKoEzKomzKoVzKo3wqoEIqogAFKUTF1DfqO/WD+kn9on5Tf6i/1D8qEZ2YTkIn0EnpZHRyOgWdkk5Fp6bT0GnpdHR6OgOdkc5EZ6az0FnpbHR2Ogedk85F56bz0HnpfHR+ugBdkC5EF6aL0EXpYnRxugRdki5Fl6bL0GXpcnR5ugJdka5EV6ar0FXpanR1ugZdk65F16br0HXpenR9ugHdkP6PbkQ3ppvQTelmdHO6Bd2SbkW3ptvQbel2dHu6A92R7kR3prvQXeludHe6B92T7kX3pvvQfel+dH96AD2QHkQPpofQQ+lh9HB6BD2SHkWPpsfQY+lx9Hh6Aj2RnkRPpqfQU+lp9HR6Bj2TnkXPpufQc+l59Hx6Ab2QXkQvppfQS+ll9HJ6Bb2SXkWvptfQa+l19Hp6A72R3kRvprfQW+lt9HZ6B72T3kXvpvfQe+l99H76AH2QPkQfpo/QR+lj9HH6BH2SPkWfps/QZ+lz9Hn6An2RvkRfpq/QV+lr9HX6Bn2TvkXfpu/Qd+l79H36Af2QfkQ/pp/QT+ln9HP6Bf2SfkW/pt/Qb+l39Hv6A/2R/kR/pr/QX2mMxmmCJmmKpmmGZmmO5mmBFmmJlmmFVmmN1mmDNmmLtmmHdmmP9umADumIBjSkER3T3+jv9A/6J/2L/k3/of/S/+hETGImCZPAJGWSMcmZFExKJhWTmknDpGXSMemZDExGJhOTmcnCZGWyMdmZHExOJheTm8nD5GXyMfmZAkxBphBTmCnCFGWKMcWZEkxJphRTminDlGXKMeWZCkxFphJTmanCVGWqMdWZGkxNphZTm6nD1GXqMfWZBkxD5j+mEdOYacI0ZZoxzZkWTEumFdOaacO0Zdox7ZkOTEemE9OZ6cJ0Zbox3ZkeTE+mF9Ob6cP0Zfox/ZkBzEBmEDOYGcIMZYYxw5kRzEhmFDOaGcOMZcYx45kJzERmEjOZmcJMZaYx05kZzExmFjObmcPMZeYx85kFzEJmEbOYWcIsZZYxy5kVzEpmFbOaWcOsZdYx65kNzEZmE7OZ2cJsZbYx25kdzE5mF7Ob2cPsZfYx+5kDzEHmEHOYOcIcZY4xx5kTzEnmFHOaOcOcZc4x55kLzEXmEnOZucJcZa4x15kbzE3mFnObucPcZe4x95kHzEPmEfOYecI8ZZ4xz5kXzEvmFfOaecO8Zd4x75kPzEfmE/OZ+cJ8ZTAGZwiGZCiGZhiGZTiGZwRGZCRGZhRGZTRGZwzGZCzGZhzGZTzGZwImZCIGMJBBTMx8Y74zP5ifzC/mN/OH+cv8YxKxidkkbAKblE3GJmdTsCnZVGxqNg2blk3HpmczsBnZTGxmNgublc3GZmdzsDnZXGxuNg+bl83H5mcLsAXZQmxhtghblC3GFmdLsCXZUmxptgxbli3HlmcrsBXZSmxltgpbla3GVmdrsDXZWmxttg5bl63H1mcbsA3Z/9hGbGO2CduUbcY2Z1uwLdlWbGu2DduWbce2ZzuwHdlObGe2C9uV7cZ2Z3uwPdlebG+2D9uX7cf2ZwewA9lB7GB2CDuUHcYOZ0ewI9lR7Gh2DDuWHceOZyewE9lJ7GR2CjuVncZOZ2ewM9lZ7Gx2DjuXncfOZxewC9lF7GJ2CbuUXcYuZ1ewK9lV7Gp2DbuWXc
euZzewG9lN7GZ2C7uV3cZuZ3ewO9ld7G52D7uX3cfuZw+wB9lD7GH2CHuUPcYeZ0+wJ9lT7Gn2DHuWPceeZy+wF9lL7GX2CnuVvcZeZ2+wN9lb7G32DnuXvcfeZx+wD9lH7GP2CfuUfcY+Z1+wL9lX7Gv2DfuWfce+Zz+wH9lP7Gf2C/uVxVicJViSpViaZViW5VieFViRlViZVViV1VidNViTtVibdViX9VifDdiQjVjAQhaxMfuN/c7+YH+yv9jf7B/2L/uPTcQl5pJwCVxSLhmXnEvBpeRScam5NFxaLh2XnsvAZeQycZm5LFxWLhuXncvB5eRycbm5PFxeLh+XnyvAFeQKcYW5IlxRrhhXnCvBleRKcaW5MlxZrhxXnqvAVeQqcZW5KlxVrhpXnavB1eRqcbW5Olxdrh5Xn2vANeT+4xpxjbkmXFOuGdeca8G15Fpxrbk2XFuuHdee68B15DpxnbkuXFeuG9ed68H15Hpxvbk+XF+uH9efG8AN5AZxg7kh3FBuGDecG8GN5EZxo7kx3FhuHDeem8BN5CZxk7kp3FRuGjedm8HN5GZxs7k53FxuHjefW8At5BZxi7kl3FJuGbecW8Gt5FZxq7k13FpuHbee28Bt5DZxm7kt3FZuG7ed28Ht5HZxu7k93F5uH7efO8Ad5A5xh7kj3FHuGHecO8Gd5E5xp7kz3FnuHHeeu8Bd5C5xl7kr3FXuGnedu8Hd5G5xt7k73F3uHnefe8A95B5xj7kn3FPuGfece8G95F5xr7k33FvuHfee+8B95D5xn7kv3FcO43CO4EiO4miO4ViO43hO4ERO4mRO4VRO43TO4EzO4mzO4VzO43wu4EIu4gAHOcTF3DfuO/eD+8n94n5zf7i/3D8uEZ+YT8In8En5ZHxyPgWfkk/Fp+bT8Gn5dHx6PgOfkc/EZ+az8Fn5bHx2Pgefk8/F5+bz8Hn5fHx+vgBfkC/EF+aL8EX5YnxxvgRfki/Fl+bL8GX5cnx5vgJfka/EV+ar8FX5anx1vgZfk6/F1+br8HX5enx9vgHfkP+Pb8Q35pvwTflmfHO+Bd+Sb8W35tvwbfl2fHu+A9+R78R35rvwXflufHe+B9+T78X35vvwffl+fH9+AD+QH8QP5ofwQ/lh/HB+BD+SH8WP5sfwY/lx/Hh+Aj+Rn8RP5qfwU/lp/HR+Bj+Tn8XP5ufwc/l5/Hx+Ab+QX8Qv5pfwS/ll/HJ+Bb+SX8Wv5tfwa/l1/Hp+A7+R38Rv5rfwW/lt/HZ+B7+T38Xv5vfwe/l9/H7+AH+QP8Qf5o/wR/lj/HH+BH+SP8Wf5s/wZ/lz/Hn+An+Rv8Rf5q/wV/lr/HX+Bn+Tv8Xf5u/wd/l7/H3+Af+Qf8Q/5p/wT/ln/HP+Bf+Sf8W/5t/wb/l3/Hv+A/+R/8R/5r/wX3mMx3mCJ3mKp3mGZ3mO53mBF3mJl3mFV3mN13mDN3mLt3mHd3mP9/mAD/mIBzzkER/z3/jv/A/+J/+L/83/4f/y//hEQmIhiZAgJBWSCcmFFEJKIZWQWkgjpBXSCemFDEJGIZOQWcgiZBWyCdmFHEJOIZeQW8gj5BXyCfmFAkJBoZBQWCgiFBWKCcWFEkJJoZRQWigjlBXKCeWFCkJFoZJQWagiVBWqCdWFGkJNoZZQW6gj1BXqCfWFBkJD4T+hkdBYaCI0FZoJzYUWQkuhldBaaCO0FdoJ7YUOQkehk9BZ6CJ0FboJ3YUeQk+hl9Bb6CP0FfoJ/YUBwkBhkDBYGCIMFYYJw4URwkhhlDBaGCOMFcYJ44UJwkRhkjBZmCJMFaYJ04UZwkxhljBbmCPMFeYJ84UFwkJhkbBYWCIsFZYJy4UVwkphlbBaWCOsFdYJ64UNwkZhk7BZ2CJsFbYJ24Udwk5hl7Bb2CPsFfYJ+4UDwkHhkHBYOCIcFY4Jx4UTwknhlHBaOCOcFc4J54ULwkXhknBZuCJcFa4J14Ubwk3hlnBbuCPcFe4J94UHwkPhkf
BYeCI8FZ4Jz4UXwkvhlfBaeCO8Fd4J74UPwkfhk/BZ+CJ8FTABFwiBFCiBFhiBFTiBFwRBFCRBFhRBFTRBFwzBFCzBFhzBFTzBFwIhFCIBCFBAQix8E74LP4Sfwi/ht/BH+Cv8ExKJicUkYoKYVEwmJhdTiCnFVGJqMY2YVkwnphcziBnFTGJmMYuYVcwmZhdziDnFXGJuMY+YV8wn5hcLiAXFQmJhsYhYVCwmFhdLiCXFUmJpsYxYViwnlhcriBXFSmJlsYpYVawmVhdriDXFWmJtsY5YV6wn1hcbiA3F/8RGYmOxidhUbCY2F1uILcVWYmuxjdhWbCe2FzuIHcVOYmexi9hV7CZ2F3uIPcVeYm+xj9hX7Cf2FweIA8VB4mBxiDhUHCYOF0eII8VR4mhxjDhWHCeOFyeIE8VJ4mRxijhVnCZOF2eIM8VZ4mxxjjhXnCfOFxeIC8VF4mJxibhUXCYuF1eIK8VV4mpxjbhWXCeuFzeIG8VN4mZxi7hV3CZuF3eIO8Vd4m5xj7hX3CfuFw+IB8VD4mHxiHhUPCYeF0+IJ8VT4mnxjHhWPCeeFy+IF8VL4mXxinhVvCZeF2+IN8Vb4m3xjnhXvCfeFx+ID8VH4mPxifhUfCY+F1+IL8VX4mvxjfhWfCe+Fz+IH8VP4mfxi/hVxERcJERSpERaZERW5EReFERRlERZVERV1ERdNERTtERbdERX9ERfDMRQjEQgQhGJsfhN/C7+EH+Kv8Tf4h/xr/hPTCQllpJICVJSKZmUXEohpZRSSamlNFJaKZ2UXsogZZQySZmlLFJWKZuUXcoh5ZRySbmlPFJeKZ+UXyogFZQKSYWlIlJRqZhUXCohlZRKSaWlMlJZqZxUXqogVZQqSZWlKlJVqZpUXaoh1ZRqSbWlOlJdqZ5UX2ogNZT+kxpJjaUmUlOpmdRcaiG1lFpJraU2UlupndRe6iB1lDpJnaUuUlepm9Rd6iH1lHpJvaU+Ul+pn9RfGiANlAZJg6Uh0lBpmDRcGiGNlEZJo6Ux0lhpnDRemiBNlCZJk6Up0lRpmjRdmiHNlGZJs6U50lxpnjRfWiAtlBZJi6Ul0lJpmbRcWiGtlFZJq6U10lppnbRe2iBtlDZJm6Ut0lZpm7Rd2iHtlHZJu6U90l5pn7RfOiAdlA5Jh6Uj0lHpmHRcOiGdlE5Jp6Uz0lnpnHReuiBdlC5Jl6Ur0lXpmnRduiHdlG5Jt6U70l3pnnRfeiA9lB5Jj6Un0lPpmfRceiG9lF5Jr6U30lvpnfRe+iB9lD5Jn6Uv0lcJk3CJkEiJkmiJkViJk3hJkERJkmRJkVRJk3TJkEzJkmzJkVzJk3wpkEIpkoAEJSTF0jfpu/RD+in9kn5Lf6S/0j8pkZxYTiInyEnlZHJyOYWcUk4lp5bTyGnldHJ6OYOcUc4kZ5azyFnlbHJ2OYecU84l55bzyHnlfHJ+uYBcUC4kF5aLyEXlYnJxuYRcUi4ll5bLyGXlcnJ5uYJcUa4kV5aryFXlanJ1uYZcU64l15bryHXlenJ9uYHcUP5PbiQ3lpvITeVmcnO5hdxSbiW3ltvIbeV2cnu5g9xR7iR3lrvIXeVucne5h9xT7iX3lvvIfeV+cn95gDxQHiQPlofIQ+Vh8nB5hDxSHiWPlsfIY+Vx8nh5gjxRniRPlqfIU+Vp8nR5hjxTniXPlufIc+V58nx5gbxQXiQvlpfIS+Vl8nJ5hbxSXiWvltfIa+V18np5g7xR3iRvlrfIW+Vt8nZ5h7xT3iXvlvfIe+V98n75gHxQPiQflo/IR+Vj8nH5hHxSPiWfls/IZ+Vz8nn5gnxRviRflq/IV+Vr8nX5hnxTviXflu/Id+V78n35gfxQfiQ/lp/IT+Vn8nP5hfxSfiW/lt/Ib+V38nv5g/xR/iR/lr/IX2VMxmVCJmVKpmVGZmVO5mVBFmVJlmVFVmVN1mVDNmVLtmVHdmVP9uVADuVIBjKUkRzL3+Tv8g/5p/xL/i3/kf
/K/+RESmIliZKgJFWSKcmVFEpKJZWSWkmjpFXSKemVDEpGJZOSWcmiZFWyKdmVHEpOJZeSW8mj5FXyKfmVAkpBpZBSWCmiFFWKKcWVEkpJpZRSWimjlFXKKeWVCkpFpZJSWamiVFWqKdWVGkpNpZZSW6mj1FXqKfWVBkpD5T+lkdJYaaI0VZopzZUWSkulldJaaaO0Vdop7ZUOSkelk9JZ6aJ0Vbop3ZUeSk+ll9Jb6aP0Vfop/ZUBykBlkDJYGaIMVYYpw5URykhllDJaGaOMVcYp45UJykRlkjJZmaJMVaYp05UZykxlljJbmaPMVeYp85UFykJlkbJYWaIsVZYpy5UVykpllbJaWaOsVdYp65UNykZlk7JZ2aJsVbYp25Udyk5ll7Jb2aPsVfYp+5UDykHlkHJYOaIcVY4px5UTyknllHJaOaOcVc4p55ULykXlknJZuaJcVa4p15Ubyk3llnJbuaPcVe4p95UHykPlkfJYeaI8VZ4pz5UXykvllfJaeaO8Vd4p75UPykflk/JZ+aJ8VTAFVwiFVCiFVhiFVTiFVwRFVCRFVhRFVTRFVwzFVCzFVhzFVTzFVwIlVCIFKFBBSqx8U74rP5Sfyi/lt/JH+av8UxKpidUkaoKaVE2mJldTqCnVVGpqNY2aVk2nplczqBnVTGpmNYuaVc2mZldzqDnVXGpuNY+aV82n5lcLqAXVQmphtYhaVC2mFldLqCXVUmpptYxaVi2nllcrqBXVSmpltYpaVa2mVldrqDXVWmpttY5aV62n1lcbqA3V/9RGamO1idpUbaY2V1uoLdVWamu1jdpWbae2VzuoHdVOame1i9pV7aZ2V3uoPdVeam+1j9pX7af2VweoA9VB6mB1iDpUHaYOV0eoI9VR6mh1jDpWHaeOVyeoE9VJ6mR1ijpVnaZOV2eoM9VZ6mx1jjpXnafOVxeoC9VF6mJ1ibpUXaYuV1eoK9VV6mp1jbpWXaeuVzeoG9VN6mZ1i7pV3aZuV3eoO9Vd6m51j7pX3afuVw+oB9VD6mH1iHpUPaYeV0+oJ9VT6mn1jHpWPaeeVy+oF9VL6mX1inpVvaZeV2+oN9Vb6m31jnpXvafeVx+oD9VH6mP1ifpUfaY+V1+oL9VX6mv1jfpWfae+Vz+oH9VP6mf1i/pVxVRcJVRSpVRaZVRW5VReFVRRlVRZVVRV1VRdNVRTtVRbdVRX9VRfDdRQjVSgQhWpsfpN/a7+UH+qv9Tf6h/1r/pPTaQl1pJoCVpSLZmWXEuhpdRSaam1NFpaLZ2WXsugZdQyaZm1LFpWLZuWXcuh5dRyabm1PFpeLZ+WXyugFdQKaYW1IlpRrZhWXCuhldRKaaW1MlpZrZxWXqugVdQqaZW1KlpVrZpWXauh1dRqabW1OlpdrZ5WX2ugNdT+0xppjbUmWlOtmdZca6G11FpprbU2WlutndZe66B11DppnbUuWletm9Zd66H11HppvbU+Wl+tn9ZfG6AN1AZpg7Uh2lBtmDZcG6GN1EZpo7Ux2lhtnDZem6BN1CZpk7Up2lRtmjZdm6HN1GZps7U52lxtnjZfW6At1BZpi7Ul2lJtmbZcW6Gt1FZpq7U12lptnbZe26Bt1DZpm7Ut2lZtm7Zd26Ht1HZpu7U92l5tn7ZfO6Ad1A5ph7Uj2lHtmHZcO6Gd1E5pp7Uz2lntnHZeu6Bd1C5pl7Ur2lXtmnZdu6Hd1G5pt7U72l3tnnZfe6A91B5pj7Un2lPtmfZce6G91F5pr7U32lvtnfZe+6B91D5pn7Uv2lcN03CN0EiN0miN0ViN03hN0ERN0mRN0VRN03TN0EzN0mzN0VzN03wt0EIt0oAGNaTF2jftu/ZD+6n90n5rf7S/2j8tkZ5YT6In6En1ZHpyPYWeUk+lp9bT6Gn1dHp6PYOeUc+kZ9az6Fn1bHp2PYeeU8+l59bz6Hn1fHp+vYBeUC+kF9aL6EX1YnpxvYReUi+ll9bL6GX1cnp5vYJeUa+kV9
ar6FX1anp1vYZeU6+l19br6HX1enp9vYHeUP9Pb6Q31pvoTfVmenO9hd5Sb6W31tvobfV2enu9g95R76R31rvoXfVuene9h95T76X31vvoffV+en99gD5QH6QP1ofoQ/Vh+nB9hD5SH6WP1sfoY/Vx+nh9gj5Rn6RP1qfoU/Vp+nR9hj5Tn6XP1ufoc/V5+nx9gb5QX6Qv1pfoS/Vl+nJ9hb5SX6Wv1tfoa/V1+np9g75R36Rv1rfoW/Vt+nZ9h75T36Xv1vfoe/V9+n79gH5QP6Qf1o/oR/Vj+nH9hH5SP6Wf1s/oZ/Vz+nn9gn5Rv6Rf1q/oV/Vr+nX9hn5Tv6Xf1u/od/V7+n39gf5Qf6Q/1p/oT/Vn+nP9hf5Sf6W/1t/ob/V3+nv9g/5R/6R/1r/oX3VMx3VCJ3VKp3VGZ3VO53VBF3VJl3VFV3VN13VDN3VLt3VHd3VP9/VAD/VIBzrUkR7r3/Tv+g/9p/5L/63/0f/q//RERmIjiZFgJDWSGcmNFEZKI5WR2khjpDXSGemNDEZGI5OR2chiZDWyGdmNHEZOI5eR28hj5DXyGfmNAkZBo5BR2ChiFDWKGcWNEkZJo5RR2ihjlDXKGeWNCkZFo5JR2ahiVDWqGdWNGkZNo5ZR26hj1DXqGfWNBkZD4z+jkdHYaGI0NZoZzY0WRkujldHaaGO0NdoZ7Y0ORkejk9HZ6GJ0NboZ3Y0eRk+jl9Hb6GP0NfoZ/Y0BxkBjkDHYGGIMNYYZw40RxkhjlDHaGGOMNcYZ440JxkRjkjHZmGJMNaYZ040ZxkxjljHbmGPMNeYZ840FxkJjkbHYWGIsNZYZy40VxkpjlbHaWGOsNdYZ640NxkZjk7HZ2GJsNbYZ240dxk5jl7Hb2GPsNfYZ+40DxkHjkHHYOGIcNY4Zx40TxknjlHHaOGOcNc4Z540LxkXjknHZuGJcNa4Z140bxk3jlnHbuGPcNe4Z940HxkPjkfHYeGI8NZ4Zz40XxkvjlfHaeGO8Nd4Z740Pxkfjk/HZ+GJ8NTADNwiDNCiDNhiDNTiDNwRDNCRDNhRDNTRDNwzDNCzDNhzDNTzDNwIjNCIDGNBARmx8M74bP4yfxi/jt/HH+Gv8MxKZic0kZoKZ1ExmJjdTmCnNVGZqM42Z1kxnpjczmBnNTGZmM4uZ1cxmZjdzmDnNXGZuM4+Z18xn5jcLmAXNQmZhs4hZ1CxmFjdLmCXNUmZps4xZ1ixnljcrmBXNSmZls4pZ1axmVjdrmDXNWmZts45Z16xn1jcbmA3N/8xGZmOzidnUbGY2N1uYLc1WZmuzjdnWbGe2NzuYHc1OZmezi9nV7GZ2N3uYPc1eZm+zj9nX7Gf2NweYA81B5mBziDnUHGYON0eYI81R5mhzjDnWHGeONyeYE81J5mRzijnVnGZON2eYM81Z5mxzjjnXnGfONxeYC81F5mJzibnUXGYuN1eYK81V5mpzjbnWXGeuNzeYG81N5mZzi7nV3GZuN3eYO81d5m5zj7nX3GfuNw+YB81D5mHziHnUPGYeN0+YJ81T5mnzjHnWPGeeNy+YF81L5mXzinnVvGZeN2+YN81b5m3zjnnXvGfeNx+YD81H5mPzifnUfGY+N1+YL81X5mvzjfnWfGe+Nz+YH81P5mfzi/nVxEzcJEzSpEzaZEzW5EzeFEzRlEzZVEzV1EzdNEzTtEzbdEzX9EzfDMzQjExgQhOZsfnN/G7+MH+av8zf5h/zr/nPTGQltpJYCVZSK5mV3EphpbRSWamtNFZaK52V3spgZbQyWZmtLFZWK5uV3cph5bRyWbmtPFZeK5+V3ypgFbQKWYWtIlZRq5hV3CphlbRKWaWtMlZZq5xV3qpgVbQqWZWtKlZVq5pV3aph1bRqWbWtOlZdq55V32pgNbT+sxpZja0mVlOrmdXcamG1tFpZra02VlurndXe6mB1tDpZna0uVlerm9Xd6mH1tHpZva0+Vl+rn9XfGmANtAZZg60h1lBrmD
XcGmGNtEZZo60x1lhrnDXemmBNtCZZk60p1lRrmjXdmmHNtGZZs6051lxrnjXfWmAttBZZi60l1lJrmbXcWmGttFZZq6011lprnbXe2mBttDZZm60t1lZrm7Xd2mHttHZZu6091l5rn7XfOmAdtA5Zh60j1lHrmHXcOmGdtE5Zp60z1lnrnHXeumBdtC5Zl60r1lXrmnXdumHdtG5Zt6071l3rnnXfemA9tB5Zj60n1lPrmfXcemG9tF5Zr6031lvrnfXe+mB9tD5Zn60v1lcLs3CLsEiLsmiLsViLs3hLsERLsmRLsVRLs3TLsEzLsmzLsVzLs3wrsEIrsoAFLWTF1jfru/XD+mn9sn5bf6y/1j8rkZ3YTmIn2EntZHZyO4Wd0k5lp7bT2GntdHZ6O4Od0c5kZ7az2FntbHZ2O4ed085l57bz2HntfHZ+u4Bd0C5kF7aL2EXtYnZxu4Rd0i5ll7bL2GXtcnZ5u4Jd0a5kV7ar2FXtanZ1u4Zd065l17br2HXtenZ9u4Hd0P7PbmQ3tpvYTe1mdnO7hd3SbmW3ttvYbe12dnu7g93R7mR3trvYXe1udne7h93T7mX3tvvYfe1+dn97gD3QHmQPtofYQ+1h9nB7hD3SHmWPtsfYY+1x9nh7gj3RnmRPtqfYU+1p9nR7hj3TnmXPtufYc+159nx7gb3QXmQvtpfYS+1l9nJ7hb3SXmWvttfYa+119np7g73R3mRvtrfYW+1t9nZ7h73T3mXvtvfYe+199n77gH3QPmQfto/YR+1j9nH7hH3SPmWfts/YZ+1z9nn7gn3RvmRftq/YV+1r9nX7hn3TvmXftu/Yd+179n37gf3QfmQ/tp/YT+1n9nP7hf3SfmW/tt/Yb+139nv7g/3R/mR/tr/YX23Mxm3CJm3Kpm3GZm3O5m3BFm3Jlm3FVm3N1m3DNm3Ltm3Hdm3P9u3ADu3IBja0kR3b3+zv9g/7p/3L/m3/sf/a/+xETmIniZPgJHWSOcmdFE5KJ5WT2knjpHXSOemdDE5GJ5OT2cniZHWyOdmdHE5OJ5eT28nj5HXyOfmdAk5Bp5BT2CniFHWKOcWdEk5Jp5RT2injlHXKOeWdCk5Fp5JT2aniVHWqOdWdGk5Np5ZT26nj1HXqOfWdBk5D5z+nkdPYaeI0dZo5zZ0WTkunldPaaeO0ddo57Z0OTkenk9PZ6eJ0dbo53Z0eTk+nl9Pb6eP0dfo5/Z0BzkBnkDPYGeIMdYY5w50RzkhnlDPaGeOMdcY5450JzkRnkjPZmeJMdaY5050ZzkxnljPbmePMdeY5850FzkJnkbPYWeIsdZY5y50VzkpnlbPaWeOsddY5650NzkZnk7PZ2eJsdbY5250dzk5nl7Pb2ePsdfY5+50DzkHnkHPYOeIcdY45x50TzknnlHPaOeOcdc45550LzkXnknPZueJcda45150bzk3nlnPbuePcde45950HzkPnkfPYeeI8dZ45z50XzkvnlfPaeeO8dd45750Pzkfnk/PZ+eJ8dTAHdwiHdCiHdhiHdTiHdwRHdCRHdhRHdTRHdwzHdCzHdhzHdTzHdwIndCIHONBBTux8c747P5yfzi/nt/PH+ev8cxK5id0kboKb1E3mJndTuCndVG5qN42b1k3npnczuBndTG5mN4ub1c3mZndzuDndXG5uN4+b183n5ncLuAXdQm5ht4hb1C3mFndLuCXdUm5pt4xb1i3nlncruBXdSm5lt4pb1a3mVndruDXdWm5tt45b163n1ncbuA3d/9xGbmO3idvUbeY2d1u4Ld1Wbmu3jdvWbee2dzu4Hd1Obme3i9vV7eZ2d3u4Pd1ebm+3j9vX7ef2dwe4A91B7mB3iDvUHeYOd0e4I91R7mh3jDvWHeeOdye4E91J7mR3ijvVneZOd2e4M91Z7mx3jjvXnefOdxe4C91F7mJ3ibvUXeYud1e4K91V7mp3jbvWXeeudze4G91N7mZ3i7vV3eZud3e4O91d7m
53j7vX3efudw+4B91D7mH3iHvUPeYed0+4J91T7mn3jHvWPeeedy+4F91L7mX3invVveZed2+4N91b7m33jnvXvefedx+4D91H7mP3ifvUfeY+d1+4L91X7mv3jfvWfee+dz+4H91P7mf3i/vVxVzcJVzSpVzaZVzW5VzeFVzRlVzZVVzV1VzdNVzTtVzbdVzX9VzfDdzQjVzgQhe5sfvN/e7+cH+6v9zf7h/3r/vPTeQl9pJ4CV5SL5mX3EvhpfRSeam9NF5aL52X3svgZfQyeZm9LF5WL5uX3cvh5fRyebm9PF5eL5+X3yvgFfQKeYW9Il5Rr5hX3CvhlfRKeaW9Ml5Zr5xX3qvgVfQqeZW9Kl5Vr5pX3avh1fRqebW9Ol5dr55X32vgNfT+8xp5jb0mXlOvmdfca+G19Fp5rb02Xluvndfe6+B19Dp5nb0uXlevm9fd6+H19Hp5vb0+Xl+vn9ffG+AN9AZ5g70h3lBvmDfcG+GN9EZ5o70x3lhvnDfem+BN9CZ5k70p3lRvmjfdm+HN9GZ5s7053lxvnjffW+At9BZ5i70l3lJvmbfcW+Gt9FZ5q7013lpvnbfe2+Bt9DZ5m70t3lZvm7fd2+Ht9HZ5u7093l5vn7ffO+Ad9A55h70j3lHvmHfcO+Gd9E55p70z3lnvnHfeu+Bd9C55l70r3lXvmnfdu+Hd9G55t7073l3vnnffe+A99B55j70n3lPvmffce+G99F55r7033lvvnffe++B99D55n70v3lcP83CP8EiP8miP8ViP83hP8ERP8mRP8VRP83TP8EzP8mzP8VzP83wv8EIv8oAHPeTF3jfvu/fD++n98n57f7y/3j8vkZ/YT+In+En9ZH5yP4Wf0k/lp/bT+Gn9dH56P4Of0c/kZ/az+Fn9bH52P4ef08/l5/bz+Hn9fH5+v4Bf0C/kF/aL+EX9Yn5xv4Rf0i/ll/bL+GX9cn55v4Jf0a/kV/ar+FX9an51v4Zf06/l1/br+HX9en59v4Hf0P/Pb+Q39pv4Tf1mfnO/hd/Sb+W39tv4bf12fnu/g9/R7+R39rv4Xf1ufne/h9/T7+X39vv4ff1+fn9/gD/QH+QP9of4Q/1h/nB/hD/SH+WP9sf4Y/1x/nh/gj/Rn+RP9qf4U/1p/nR/hj/Tn+XP9uf4c/15/nx/gb/QX+Qv9pf4S/1l/nJ/hb/SX+Wv9tf4a/11/np/g7/R3+Rv9rf4W/1t/nZ/h7/T3+Xv9vf4e/19/n7/gH/QP+Qf9o/4R/1j/nH/hH/SP+Wf9s/4Z/1z/nn/gn/Rv+Rf9q/4V/1r/nX/hn/Tv+Xf9u/4d/17/n3/gf/Qf+Q/9p/4T/1n/nP/hf/Sf+W/9t/4b/13/nv/g//R/+R/9r/4X33Mx33CJ33Kp33GZ33O533BF33Jl33FV33N133DN33Lt33Hd33P9/3AD/3IBz70kR/73/zv/g//p//L/+3/8f/6//xEQeIgSZAQJA2SBcmDFEHKIFWQOkgTpA3SBemDDEHGIFOQOcgSZA2yBdmDHEHOIFeQO8gT5A3yBfmDAkHBoFBQOCgSFA2KBcWDEkHJoFRQOigTlA3KBeWDCkHFoFJQOagSVA2qBdWDGkHNoFZQO6gT1A3qBfWDBkHDhERBo6Bx0CRoGjQLmgctgpZBq6B10CZoG7QL2gcdgo5Bp6Bz0CXoGnQLugc9gp5Br6B30CfoG/QL+gcDgoHBoGBwMCQYGgwLhgcjgpHBqGB0MCYYG4wLxgcTgonBpGByMCWYGkwLpgczgpnBrGB2MCeYG8wL5gcLgoXBomBxsCRYGiwLlgcrgpXBqmB1sCZYG6wL1gcbgo3BpmBzsCXYGmwLtgc7gp3BrmB3sCfYG+wL9gcHgoPBoeBwcCQ4GhwLjgcngpPBqeB0cCY4G5wLzgcXgovBpeBycCW4GlwLrgc3gpvBreB2cCe4G9wL7gcPgofBo+Bx8CR4GjwLngcvgpfBq+B18CZ4G7wL3g
cfgo/Bp+Bz8CX4GmABHhABGVABHTABG3ABHwiBGEiBHCiBGmiBHhiBGViBHTiBG3iBHwRBGEQBCGCAgjj4FnwPfgQ/g1/B7+BP8Df4FyQKE4dJwoQwaZgsTB6mCFOGqcLUYZowbZguTB9mCDOGmcLMYZYwa5gtzB7mCHOGucLcYZ4wb5gvzB8WCAuGhcLCYZGwaFgsLB6WCEuGpcLSYZmwbFguLB9WCCuGlcLKYZWwalgtrB7WCGuGtcLaYZ2wblgvrB82CBuG/4WNwsZhk7Bp2CxsHrYIW4atwtZhm7Bt2C5sH3YIO4adws5hl7Br2C3sHvYIe4a9wt5hn7Bv2C/sHw4IB4aDwsHhkHBoOCwcHo4IR4ajwtHhmHBsOC4cH04IJ4aTwsnhlHBqOC2cHs4IZ4azwtnhnHBuOC+cHy4IF4aLwsXhknBpuCxcHq4IV4arwtXhmnBtuC5cH24IN4abws3hlnBruC3cHu4Id4a7wt3hnnBvuC/cHx4ID4aHwsPhkfBoeCw8Hp4IT4anwtPhmfBseC48H14IL4aXwsvhlfBqeC28Ht4Ib4a3wtvhnfBueC+8Hz4IH4aPwsfhk/Bp+Cx8Hr4IX4avwtfhm/Bt+C58H34IP4afws/hl/BriIV4SIRkSIV0yIRsyIV8KIRiKIVyqIRqqIV6aIRmaIV26IRu6IV+GIRhGIUghCEK4/Bb+D38Ef4Mf4W/wz/h3/BfmChKHCWJEqKkUbIoeZQiShmlilJHaaK0UboofZQhyhhlijJHWaKsUbYoe5QjyhnlinJHeaK8Ub4of1QgKhgVigpHRaKiUbGoeFQiKhmVikpHZaKyUbmofFQhqhhViipHVaKqUbWoelQjqhnVimpHdaK6Ub2oftQgahj9FzWKGkdNoqZRs6h51CJqGbWKWkdtorZRu6h91CHqGHWKOkddoq5Rt6h71CPqGfWKekd9or5Rv6h/NCAaGA2KBkdDoqHRsGh4NCIaGY2KRkdjorHRuGh8NCGaGE2KJkdToqnRtGh6NCOaGc2KZkdzornRvGh+tCBaGC2KFkdLoqXRsmh5tCJaGa2KVkdrorXRumh9tCHaGG2KNkdboq3Rtmh7tCPaGe2Kdkd7or3Rvmh/dCA6GB2KDkdHoqPRseh4dCI6GZ2KTkdnorPRueh8dCG6GF2KLkdXoqvRteh6dCO6Gd2Kbkd3orvRveh+9CB6GD2KHkdPoqfRs+h59CJ6Gb2KXkdvorfRu+h99CH6GH2KPkdfoq8RFuEREZERFdERE7ERF/GREImRFMmREqmRFumREZmRFdmRE7mRF/lREIVRFIEIRiiKo2/R9+hH9DP6Ff2O/kR/o39RIpAYJAEJIClIBpKDFCAlSAVSgzQgLUgH0oMMICPIBDKDLCAryAaygxwgJ8gFcoM8IC/IB/KDAqAgKAQKgyKgKCgGioMSoCQoBUqDMqAsKAfKgwqgIqgEKoMqoCqoBqqDGqAmqAVqgzqgLqgH6oMGoCH4DzQCjUET0BQ0A81BC9AStAKtQRvQFrQD7UEH0BF0Ap1BF9AVdAPdQQ/QE/QCvUEf0Bf0A/3BADAQDAKDwRAwFAwDw8EIMBKMAqPBGDAWjAPjwQQwEUwCk8EUMBVMA9PBDDATzAKzwRwwF8wD88ECsBAsAovBErAULAPLwQqwEqwCq8EasBasA+vBBrARbAKbwRawFWwD28EOsBPsArvBHrAX7AP7wQFwEBwCh8ERcBQcA8fBCXASnAKnwRlwFpwD58EFcBFcApfBFXAVXAPXwQ1wE9wCt8EdcBfcA/fBA/AQPAKPwRPwFDwDz8EL8BK8Aq/BG/AWvAPvwQfwEXwCn8EX8BVgAAcEIAEFaMAAFnCABwIQgQRkoAAVaEAHBjCBBWzgABd4wAcBCEEEAIAAgRh8A9/BD/AT/AK/wR/wF/wDiWBimAQmwKQwGUwOU8CUMBVMDdPAtDAdTA8zwIwwE8wMs8CsMBvMDnPAnDAXzA3zwLwwH8
wPC8CCsBAsDIvAorAYLA5LwJKwFCwNy8CysBwsDyvAirASrAyrwKqwGqwOa8CasBasDevAurAerA8bwIbwP9gINoZNYFPYDDaHLWBL2Aq2hm1gW9gOtocdYEfYCXaGXWBX2A12hz1gT9gL9oZ9YF/YD/aHA+BAOAgOhkPgUDgMDocj4Eg4Co6GY+BYOA6OhxPgRDgJToZT4FQ4DU6HM+BMOAvOhnPgXDgPzocL4EK4CC6GS+BSuAwuhyvgSrgKroZr4Fq4Dq6HG+BGuAluhlvgVrgNboc74E64C+6Ge+BeuA/uhwfgQXgIHoZH4FF4DB6HJ+BJeAqehmfgWXgOnocX4EV4CV6GV+BVeA1ehzfgTXgL3oZ34F14D96HD+BD+Ag+hk/gU/gMPocv4Ev4Cr6Gb+Bb+A6+hx/gR/gJfoZf4FeIQRwSkIQUpCEDWchBHgpQhBKUoQJVqEEdGtCEFrShA13oQR8GMIQRBBBCBGP4DX6HP+BP+Av+hn/gX/gPJkKJURKUgJKiZCg5SoFSolQoNUqD0qJ0KD3KgDKiTCgzyoKyomwoO8qBcqJcKDfKg/KifCg/KoAKokKoMCqCiqJiqDgqgUqiUqg0KoPKonKoPKqAKqJKqDKqgqqiaqg6qoFqolqoNqqD6qJ6qD5qgBqi/1Aj1Bg1QU1RM9QctUAtUSvUGrVBbVE71B51QB1RJ9QZdUFdUTfUHfVAPVEv1Bv1QX1RP9QfDUAD0SA0GA1BQ9EwNByNQCPRKDQajUFj0Tg0Hk1AE9EkNBlNQVPRNDQdzUAz0Sw0G81Bc9E8NB8tQAvRIrQYLUFL0TK0HK1AK9EqtBqtQWvROrQebUAb0Sa0GW1BW9E2tB3tQDvRLrQb7UF70T60Hx1AB9EhdBgdQUfRMXQcnUAn0Sl0Gp1BZ9E5dB5dQBfRJXQZXUFX0TV0Hd1AN9EtdBvdQXfRPXQfPUAP0SP0GD1BT9Ez9By9QC/RK/QavUFv0Tv0Hn1AH9En9Bl9QV8RhnBEIBJRiEYMYhGHeCQgEUlIRgpSkYZ0ZCATWchGDnKRh3wUoBBFCCCIEIrRN/Qd/UA/0S/0G/1Bf9E/lChOHCeJE+KkcbI4eZwiThmnilPHaeK0cbo4fZwhzhhnijPHWeKscbY4e5wjzhnninPHeeK8cb44f1wgLhgXigvHReKicbG4eFwiLhmXikvHZeKycbm4fFwhrhhXiivHVeKqcbW4elwjrhnXimvHdeK6cb24ftwgbhj/FzeKG8dN4qZxs7h53CJuGbeKW8dt4rZxu7h93CHuGHeKO8dd4q5xt7h73CPuGfeKe8d94r5xv7h/PCAeGP/P3l0Aa1Yc+qL/RnB3h8Hd3YlAQgIJwULwAQYYGBiYwQYd3N3d3d3d3d3dHZa0rVchA/eQnLx7z311T15u/X5Ve3d//fXX3av3+netXbWr9ibVptVm1aBq82qLastqcLVVtXU1pNqm2rYaWm1XbV8Nq4ZXO1Q7VjtVO1e7VCOqXavdqt2rPao9q72qkdXe1T7VvtV+1f7VAdWB1UHVwdUh1aHVYdXh1RHVkdVR1dHVMdWx1XHV8dUJ1YnVSdXJ1SnVqdVp1enVGdWZ1VnV2dU51bnVedX51QXVhdVF1cXVJdWl1WXV5dUV1ZXVVdXV1TXVtdV11fXVDdWN1U3VzdUt1a3VbdXt1R3VndVd1d3VPdW91X3V/dUD1YPVQ9XD1SPVo9Vj1ePVE9WT1VPV09Uz1bPVc9Xz1QvVi9VL1cvVK9Wr1WvV69Ub1ZvVW9Xb1TvVu9V71fvVB9WH1UfVx9Un1afVZ9Xn1RfVl9VX1dfVN9W31XfV91VV1VVTtVWoYpWqXJWqq3p1n7pv3a/uX49Wj16PUY9Zj1WPXY9Tj1uPV49fT1BPWE9UT1xPUk9aT1ZPXk9RT1lPVU9dT1NPW09XT1/PUA+oZ6xnqmeuZ6lnrWerZ6/nqOes56rnruep563nq+evF6gXrBeqF64XqRetF6
sXr5eol6yXqpeul6mXrZerl69XqH9R/7L+Vf3resV6pXqsv/3/zt7v61XqVes/1H+sV6v/VK9er1GvWa9Vr13/uV6n/ku9br1evX69Qb1hvVG9cT2w3qTetN6sHlRvXm9Rb1kPrreqt66H1NvU29ZD6+3q7eth9fB6h3rHeqd653qXekS9a71bvXu9R71nvVc9st673qfet96v3r8+oD6wPqg+uD6kPrQ+rD68PqI+sj6qPro+pj62Pq4+vj6hPrE+qT65PqU+tT6tPr0+oz6zPqs+uz6nPrc+rz6/vqC+sL6ovri+pL60vqy+vL6ivrK+qr66vqa+tr6uvr6+ob6xvqm+ub6lvrW+rb69vqO+s76rvru+p763vq++v36gfrB+qH64fqR+tH6sfrx+on6yfqp+un6mfrZ+rn6+fqF+sX6pfrl+pX61fq1+vX6jfrN+q367fqd+t36vfr/+oP6w/qj+uP6k/rT+rP68/qL+sv6q/rr+pv62/q7+vq7qum7qtg51rFOd61J3da/p0/Rt+jX9m9Ga0ZsxmjGbsZqxm3GacZvxmvGbCZoJm4maiZtJmkmbyZrJmymaKZupmqmbaZppm+ma6ZsZmgHNjM1MzczNLM2szWzN7M0czZzNXM3czTzNvM18zfzNAs2CzULNws0izaLNYs3izRLNks1SzdLNMs2yzXLN8s0KzS+aXza/an7drNis1Pym+W2zcvO7Pr1e769fzR+b1Zo/Nas3azRrNms1azd/btZp/tKs26zXrN9s0GzYbNRs3AxsNmk2bTZrBjWbN1s0WzaDm62arZshzTbNts3QZrtm+2ZYM7zZodmx2anZudmlGdHs2uzW7N7s0Yw26m7bp9m32a/ZvzmgObA5qDm4OaQ5tDmsObw5ojmyOao5ujmmObY5rjm+OaE5sTmpObk5pTm1Oa05vTmjObM5qzm7Oac5tzmvOb+5oLmwuai5uLmkubS5rLm8uaK5srmqubq5prm2ua65vrmhubG5qbm5uaW5tbmtub25o7mzuau5u7mnube5r7m/eaB5sHmoebh5pHm0eax5vHmiebJ5qnm6eaZ5tnmueb55oXmxeal5uXmlebV5rXm9eaN5s3mrebt5p3m3ea95v/mg+bD5qPm4+aT5tPms+bz5ovmy+ar5uvmm+bb5rvm+qZq6aZq2CU1sUpOb0nRNr+3T9m37tf3b0drR2zHaMdux2rHbcdpx2/Ha8dsJ2gnbidqJ20naSdvJ2snbKdop26naqdtp2mnb6drp2xnaAe2M7UztzO0s7aztbO3s7RztnO1c7dztPO287Xzt/O0C7YLtQu3C7SLtou1i7eLtEu2S7VLt0u0y7bLtcu3y7QrtL9pftr9qf92u2K7U/qb9bbty+7v29+0q7artH9o/tqu1f2pXb9do12zXatdu/9yu0/6lXbddr12/3aDdsN2o3bgd2G7Sbtpu1g5qN2+3aLdsB7dbtVu3Q9pt2m3boe127fbtsHZ4u0O7Y7tTu3O7Szui3bXdrd293aPds92rHdnu3e7T7tvu1+7fHtAe2B7UHtwe0h7aHtYe3h7RHtke1R7dHtMe2x7XHt+e0J7YntSe3J7Sntqe1p7entGe2Z7Vnt2e057bntee317QXthe1F7cXtJe2l7WXt5e0V7ZXtVe3V7TXtte117f3tDe2N7U3tze0t7a3tbe3t7R3tne1d7d3tPe297X3t8+0D7YPtQ+3D7SPto+1j7ePtE+2T7VPt0+0z7bPtc+377Qvti+1L7cvtK+2r7Wvt6+0b7ZvtW+3b7Tvtu+177fftB+2H7Uftx+0n7aftZ+3n7Rftl+1X7dftN+237Xft9Wbd02bduGNrapzW1pu7YX+oS+oV/oH0YLo4cxwphhrDB2GCeMG8YL44cJwoRhojBxmCRMGiYLk4cpwpRhqjB1mCZMG6YL04cZwoAwY5gpzBxmCbOG2cLsYY4wZ5grzB3mCfOG+cL8YYGwYFgoLBwWCY
uGxcLiYYmwZFgqLB2WCcuG5cLyYYXwi/DL8Kvw67BiWCn8Jvw2rBx+F34fVgmrhj+EP4bVwp/C6mGNsGZYK6wd/hzWCX8J64b1wvphg7Bh2ChsHAaGTcKmYbMwKGwetghbhsFhq7B1GBK2CduGoWG7sH0YFoaHHcKOYaewc9gljAi7ht3C7mGPsGfYK4wMe4d9wr5hv7B/OCAcGA4KB4dDwqHhsHB4OCIcGY4KR4djwrHhuHB8OCGcGE4KJ4dTwqnhtHB6OCOcGc4KZ4dzwrnhvHB+uCBcGC4KF4dLwqXhsnB5uCJcGa4KV4drwrXhunB9uCHcGG4KN4dbwq3htnB7uCPcGe4Kd4d7wr3hvnB/eCA8GB4KD4dHwqPhsfB4eCI8GZ4KT4dnwrPhufB8eCG8GF4KL4dXwqvhtfB6eCO8Gd4Kb4d3wrvhvfB++CB8GD4KH4dPwqfhs/B5+CJ8Gb4KX4dvwrfhu/B9qEIdmtCGEGJIIYcSutCLfWLf2C/2j6PF0eMYccw4Vhw7jhPHjePF8eMEccI4UZw4ThInjZPFyeMUcco4VZw6ThOnjdPF6eMMcUCcMc4UZ46zxFnjbHH2OEecM84V547zxHnjfHH+uEBcMC4UF46LxEXjYnHxuERcMi4Vl47LxGXjcnH5uEL8Rfxl/FX8dVwxrhR/E38bV46/i7+Pq8RV4x/iH+Nq8U9x9bhGXDOuFdeOf47rxL/EdeN6cf24QdwwbhQ3jgPjJnHTuFkcFDePW8Qt4+C4Vdw6DonbxG3j0Lhd3D4Oi8PjDnHHuFPcOe4SR8Rd425x97hH3DPuFUfGveM+cd+4X9w/HhAPjAfFg+Mh8dB4WDw8HhGPjEfFo+Mx8dh4XDw+nhBPjCfFk+Mp8dR4Wjw9nhHPjGfFs+M58dx4Xjw/XhAvjBfFi+Ml8dJ4Wbw8XhGvjFfFq+M18dp4Xbw+3hBvjDfFm+Mt8dZ4W7w93hHvjHfFu+M98d54X7w/PhAfjA/Fh+Mj8dH4WHw8PhGfjE/Fp+Mz8dn4XHw+vhBfjC/Fl+Mr8dX4Wnw9vhHfjG/Ft+M78d34Xnw/fhA/jB/Fj+Mn8dP4Wfw8fhG/jF/Fr+M38dv4Xfw+VrGOTWxjiDGmmGOJXeylPqlv6pf6p9HS6GmMNGYaK42dxknjpvHS+GmCNGGaKE2cJkmTpsnS5GmKNGWaKk2dpknTpunS9GmGNCDNmGZKM6dZ0qxptjR7miPNmeZKc6d50rxpvjR/WiAtmBZKC6dF0qJpsbR4WiItmZZKS6dl0rJpubR8WiH9Iv0y/Sr9Oq2YVkq/Sb9NK6ffpd+nVdKq6Q/pj2m19Ke0elojrZnWSmunP6d10l/Summ9tH7aIG2YNkobp4Fpk7Rp2iwNSpunLdKWaXDaKm2dhqRt0rZpaNoubZ+GpeFph7Rj2intnHZJI9Kuabe0e9oj7Zn2SiPT3mmftG/aL+2fDkgHpoPSwemQdGg6LB2ejkhHpqPS0emYdGw6Lh2fTkgnppPSyemUdGo6LZ2ezkhnprPS2emcdG46L52fLkgXpovSxemSdGm6LF2erkhXpqvS1emadG26Ll2fbkg3ppvSzemWdGu6Ld2e7kh3prvS3emedG+6L92fHkgPpofSw+mR9Gh6LD2enkhPpqfS0+mZ9Gx6Lj2fXkgvppfSy+mV9Gp6Lb2e3khvprfS2+md9G56L72fPkgfpo/Sx+mT9Gn6LH2evkhfpq/S1+mb9G36Ln2fqlSnJrUppJhSyqmkLvVyn9w398v982h59DxGHjOPlcfO4+Rx83h5/DxBnjBPlCfOk+RJ82R58jxFnjJPlafO0+Rp83R5+jxDHpBnzDPlmfMsedY8W549z5HnzHPlufM8ed48X54/L5AXzAvlhfMiedG8WF48L5GXzEvlpfMyedm8XF4+r5B/kX+Zf5V/nVfMK+Xf5N/mlfPv8u/zKnnV/If8x7xa/lNePa+R18xr5bXzn/M6+S953bxeXj9vkD
fMG+WN88C8Sd40b5YH5c3zFnnLPDhvlbfOQ/I2eds8NG+Xt8/D8vC8Q94x75R3zrvkEXnXvFvePe+R98x75ZF577xP3jfvl/fPB+QD80H54HxIPjQflg/PR+Qj81H56HxMPjYfl4/PJ+QT80n55HxKPjWflk/PZ+Qz81n57HxOPjefl8/PF+QL80X54nxJvjRfli/PV+Qr81X56nxNvjZfl6/PN+Qb80355nxLvjXflm/Pd+Q781357nxPvjffl+/PD+QH80P54fxIfjQ/lh/PT+Qn81P56fxMfjY/l5/PL+QX80v55fxKfjW/ll/Pb+Q381v57fxOfje/l9/PH+QP80f54/xJ/jR/lj/PX+Qv81f56/xN/jZ/l7/PVa5zk9sccswp51xyl3ulT+lb+pX+ZbQyehmjjFnGKmOXccq4ZbwyfpmgTFgmKhOXScqkZbIyeZmiTFmmKlOXacq0ZboyfZmhDCgzlpnKzGWWMmuZrcxe5ihzlrnK3GWeMm+Zr8xfFigLloXKwmWRsmhZrCxelihLlqXK0mWZsmxZrixfVii/KL8svyq/LiuWlcpvym/LyuV35fdllbJq+UP5Y1mt/KmsXtYoa5a1ytrlz2Wd8peyblmvrF82KBuWjcrGZWDZpGxaNiuDyuZli7JlGVy2KluXIWWbsm0ZWrYr25dhZXjZoexYdio7l13KiLJr2a3sXvYoe5a9ysiyd9mn7Fv2K/uXA8qB5aBycDmkHFoOK4eXI8qR5ahydDmmHFuOK8eXE8qJ5aRycjmlnFpOK6eXM8qZ5axydjmnnFvOK+eXC8qF5aJycbmkXFouK5eXK8qV5apydbmmXFuuK9eXG8qN5aZyc7ml3FpuK7eXO8qd5a5yd7mn3FvuK/eXB8qD5aHycHmkPFoeK4+XJ8qT5anydHmmPFueK8+XF8qL5aXycnmlvFpeK6+XN8qb5a3ydnmnvFveK++XD8qH5aPycfmkfFo+K5+XL8qX5avydfmmfFu+K9+XqtSlKW0JJZZUcimlK72uT9e369f170brRu/G6MbsxurG7sbpxu3G68bvJugm7CbqJu4m6SbtJusm76bopuym6qbupumm7abrpu9m6AZ0M3YzdTN3s3SzdrN1s3dzdHN2c3Vzd/N083bzdfN3C3QLdgt1C3eLdIt2i3WLd0t0S3ZLdUt3y3TLdst1y3cr/G99fr1u/W6DbsNuo27jbmC3Sbdpt1k3qNu826LbshvcbdVt3Q3ptum27YZ223Xbd8O64d0O3Y7dTt3O3S7diG7Xbrdu926Pbs9ur25kt3e3T7dvt1+3f3dAd2B3UHdwd0h3aHdYd3h3RHdkd1R3dHdMd2x3XHd8d0J3YndSd3J3Sndqd1p3endGd2Z3Vnd2d053bnded353QXdhd1F3cXdJd2l3WXd5d0V3ZXdVd3V3TXdtd113fXdDd2N3U3dzd0t3a3dbd3t3R3dnd1d3d3dPd293X3d/90D3YPdQ93D3SPdo91j3ePdE92T3VPd090z3bPdc93z3Qvdi91L3cvdK92r3Wvd690b3ZvdW93b3Tvdu9173fvdB92H3Ufdx90n3afdZ93n3Rfdl91X3dfdN9233Xfd9V3V113RtF7rYpS53peu6HgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD9YaeVVVlyg1++n1316fXpL9Pr0TurT6/X6/49+P1Z/7Dlhr9fb+IfalD98/+2o2siLx59nSHipzz8rR//PFtHnH5sW6PV6W/40ft/e736o9emN9kPbaP/TeXpDRg00qvxx3v79Zumt2lupt+ao1yNHrb1Pb4WfLaSMKlf4sSH1+0/LibsftqJ31c/G6fcP17PC323kgN7Py4n6jPlD2XVd959t0f8Z/7hO/v/tv3qfjdn7efnz+6zfP83/On+X/36j0tF31Ov/jvxv99P4fXur/1fzP+F/WOh/yP+YP+V/1d7g3rDesFHt/+wc6Pf3+zrhf17O2L/r1/tvz+//V33/1QvgX6rvP+S/7/9L/vv+O+X/p8X/rfyP+V+lN7S3RW+l3uDekN6gUe3/LP8/hvmn/P/duD+WM/X69Osv//xb+cf89xuV/7f+Lv+jjzoDfmyacFRefsz/Av+b+f/5c36f3mr/5Zz/3I/jj9Fvlt7avaG9Ib0de9v0Bv0w7sif5unb2+ynGfuP/Ot1/Pj7wKQ/vLvIqGRM2juzz8S9Pn+bZbSJR33+h7a/dRjtr8/vA/r2ftbn79/rjTorF/hp/v69iUfVhvdG9Hbtbd0b2Bvyw2n04/PIX8++OX7qP1pv3J92etTPadSVj/ypfcqfnlam/F8+h/r/z7vwf7H+P9wnAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/UQ8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADgnxg+YtetBw4ZMmiYioqKyk+Vf/XJBPyf9j9C/69eCQAAAAAAAAAAAPC/6r/jz4n/1dcIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAwL+fT1c/6g9rrrTGgAEDBvT69h3V2Py8z8jeyF43wd/qE45q6/r9rezT6/X693q9I2eZd6OVt1ltnL+2jT/DHdvvftlMd+0wztrXjX/rGL3HJ9zw068Weu/xyR+f6tOy5paDhw8YPHzAtkN3GDBwwCZDh+4wcJMhgwZsNnj41vMNWG3IoIHDBw0YvO3wQcN+9vbmQ4Zut92IAQO33Wy8sbcbNmj48AEDtx0xYOtBIwbsMHTADsNGDBi4xcDB2w6Yb775Bow39n/b5v3bW+uS/ycAAP//emnTcA==")
open(&(0x7f00000005c0)='./bus\x00', 0x64842, 0x0)
syz_mount_image$ext4(&(0x7f0000000180)='ext4\x00', &(0x7f00000000c0)='./file0\x00', 0x800714, &(0x7f00000003c0)={[{@dioread_nolock}, {@journal_dev={'journal_dev', 0x3d, 0x7}}, {@quota}, {@noinit_itable}, {@errors_continue}, {@errors_continue}, {@errors_remount}, {@delalloc}, {@auto_da_alloc}, {@norecovery}, {@errors_continue}, {@journal_ioprio={'journal_ioprio', 0x3d, 0x1}}]}, 0xee, 0x442, &(0x7f0000000d00)="$eJzs281vG0UbAPBn10n6vv1KKKW0oYVAQUR8JE1aoAcuIJA4gIQEh3IMSVqFug1qgkSrCAJC5YgqcUcckfgLOMEFASckrnBHlSqIkFo4Ba29m9punObDrkP9+0nbznjHnnk8O97xTBxA1xrK/kkidkfErxHRX83WFxiq/ndjaWHy76WFySSWl9/4I6mUu760MFkULZ63K88MpxHpJ0leSb25i5fOTpTL0xfy/Oj8uXdH5y5eenrm3MSZ6TPT58dPnjxxfOy5Z8efaUmcWVzXBz+YPXzolbeuvDZ56srbP36dtffAker52jhaZSgL/M/lisZzj7W6sg7bU5NOejrYEDakFBFZd/VWxn9/lOJm5/XHyx93tHFAW2X3ph3NTy8uA3exJDrdAqAziht99v23OO7Q1GNbuPZC9QtQFveN/Kie6Yk0L9PbxvqHIuLU4j9fZEe0aR0CAKDWt9n856nV5n9pHKgptzffQxmIiHsiYl9E3BsR+yPivohK2fsj4uAG62/cGrp1/pNe3VRg65TN/57P97bq53/F7C8GSnluTyX+3uT0THn6WP6eDEfvjiw/tkYd3730y2fNztXO/7Ijq7+YC+btuNrTsEA3NTE/0apJ6bWPIgZ7Vos/WdkJSCLiUEQMbuyl9xaJmSe+Otys0O3jX0ML9pmWv4x4vNr/i9EQfyFZe39y9H9Rnj42WlwVt/rp58uvN6t/S/G3QNb/O+uv/4YS/X8ltfu1cxuv4/Jvnzb9TrPZ678vebOyZ92XP/b+xPz8hbGIvuTVSr7u8fGbzy3yRfks/uGjq4//fflzsvgfiIjsIj4SEQ9GxEN52x+OiEci4uga8f/w4qPvbD7+9srin1r182/l+h+o7/+NJ0pnv/+mWf3r6/8TldRw/kjl8+821tvArbx3AAAA8F+RRsTuSNKRlXSajoxU/4Z/f+xMy7Nz80+enn3v/FT1NwID0ZsWK139NeuhY8li/orV/Hi+VlycP56vG39e+n8lPzI5W57qcOzQ7XY1Gf+Z30udbh3Qdn6vBd2rcfynHWoHcOe5/0P3Mv6hexn/0L1WG/8fNuTtBcDdyf0fupfxD93L+IfuZfxDV9rK7/olujkR6bZoRrsScXBbNKNziU5/MgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALTGvwEAAP//KivtwQ==")
syz_mount_image$hfs(&(0x7f0000000140), &(0x7f0000000000)='./bus\x00', 0xc0d4, &(0x7f0000002700)=ANY=[], 0x1, 0x270, &(0x7f00000008c0)="$eJzs3UFrE0EYxvFnNrFGW+raVgTxINWCJ2nrRbwI0m/gxZOoTYRiqKAV1IvVs/gBvPsV/BCexLPgzZPee6vMuxOzSbPZNCZOEv8/SNhk5915Jzvp7JtQIgD/rdtb3z9e/+FvTqqoIummlEiqSVVJ53S+9nx3b2ev2aj3O1DFIvzNKYt0R9ps7zZ6hfo4iwhS/6iqhfxzGI/at9gZYBLYuz/zK/98Ip0M707bX4uS3ejtx04gMnegA73QYuw8AABxhfU/Cev8Qrh+TxJpLSz7nev/lC+gB7ETiCy3/luVdej8+T1ju9r1npVwfn/SqhKH6WtO2cyqdCRQVlVaLsmpRzvNxrXtJ816ore6FeSardh9PZu6Lfls3xw99GqP2rSP4cc+b2M44cewWZD/8mh7LOc+uy/unkv1QfU/13/VQ+dPk52ptOtMZfmvFx/RRplmrQpGedY6uRB6CPqOsqKuNPLmwjE7PiBIy/K0qKWuqGx0GyVRyz2jNkuiVrqj2rO5OHLc3Ht3163qpz5pK3f9n/hXe02DvDN9G2sZZkbf8VStZWrribtkT+1f7NkyGXpIOL53eqgbWnz28tXjB81m4ykbQ2y8Di/mpOQz6EZrEkxKPjO74V/kKL1X/3pmRvmrhH+sfdKPGch3M7PCX3e5rP7L1SvrViL5u7TPdfph2cFzR9woqA2W7P50cQXXwdlHD/NyRV9FDFpzXb4qXRmkx0wa8pwRbktfdZ/P/wEAAAAAAAAAAAAAAAAAAKbN6P7loKaiXbHHCAAAAAAAAAAAAAAAAAAAAADAtJu43/+9o+wRv/8LjN3vAAAA//9nXXUi")
openat$cgroup_ro(0xffffffffffffff9c, &(0x7f00000000c0)='memory.events\x00', 0x26e1, 0x0)
openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000100)='memory.events\x00', 0x100002, 0x0)
openat$fuse(0xffffffffffffff9c, &(0x7f0000000180), 0x2, 0x0)
renameat2(0xffffffffffffff9c, &(0x7f0000000080)='./file1\x00', 0xffffffffffffffff, 0x0, 0x0)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file1\x00', 0x42, 0x0)
pwrite64(r3, &(0x7f0000000140)='2', 0x1, 0x8000c61)
syz_mount_image$ext4(&(0x7f0000000040)='ext4\x00', &(0x7f0000000000)='./bus\x00', 0x400e, &(0x7f00000001c0)={[{@i_version}, {@nombcache}, {@debug_want_extra_isize={'debug_want_extra_isize', 0x3d, 0x68}}, {@lazytime}, {@block_validity}, {@quota}]}, 0x1, 0x42f, &(0x7f0000000940)="$eJzs289rHFUcAPDvzCat/WViqT+aVo1WMfgjadJae/CiKHhQEPRQjzFJS+y2kSaCLUGjSD1Kwbt4FPwLPOlF1JPgVe9SKJJLq6eV2Z1Jdje7aZJustX9fGCS92be8t53Z97ue/N2AuhZw9mfJGJ/RPweEQO1bGOB4dq/W8uLU38vL04lUam89VdSLXdzeXGqKFq8bl+R6YtIP0viSIt65y9fOT9ZLs9cyvNjCxfeH5u/fOW52QuT52bOzVycOH365InxF05NPN+ROLO4bg59NHf08GvvXHtj6sy1d3/+Ninib4qjQ4bXO/hkpdLh6rrrQF066etiQ9iUUq2bRn+1/w9EKVZP3kC8+mlXGwdsq0qlUnmg/eGlCvA/lkS3WwB0R/FFn81/i22Hhh53hRsv1SZAWdy38q12pC/SvEx/0/y2k4Yj4szSP19lW2zPfQgAgAbfZ+OfZ1uN/9Kovy90b76GMhgR90XEwYg4FRGHIuL+iGrZByPioU3W37xIsnb8k17fUmAblI3/XszXthrHf8XoLwZLee5ANf7+5OxseeZ4/p6MRP/uLD++Th0/vPLbF+2O1Y//si2rvxgL5u243re78TXTkwuTdxJzvRufRAz1tYo/WVkJSCLicEQMbbGO2ae/Odru2O3jX0cH1pkqX0c8VTv/S9EUfyFZf31y7J4ozxwfK66KtX759eqb7eq/o/g7IDv/e1te/yvxDyb167Xzm6/j6h+ft53TbPX635W83bDvw8mFhUvjEbuS12uNrt8/0VRuYrV8Fv/Isdb9/2CsvhNHIiK7iB+OiEci4tG87Y9FxOMRcWyd+H96+Yn3th7/9srin97U+V9N7IrmPa0TpfM/ftdQ6eBm4s/O/8lqaiTfs5HPv420a2tXMwAAAPz3pBGxP5J0dCWdpqOjtd/wH4q9aXlufuGZs3MfXJyuPSMwGP1pcadroO5+6Hg+rS/yE035E/l94y9Le6r50am58nS3g4cet69N/8/8Wep264Bt53kt6F36P/Qu/R96l/4PvatF/9/TjXYAO6/V9//HXWgHsPOa+r9lP+gh5v/Qu/R/6F36P/Sk+T1x+4fkJSTWJCK9K5ohsU2Jbn8yAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAdMa/AQAA//9QOObV")
syz_io_uring_setup(0x0, 0xfffffffffffffffc, 0x0, 0x0)
syz_mount_image$msdos(&(0x7f00000000c0), &(0x7f0000000080)='./file1\x00', 0x200000, &(0x7f0000000500)=ANY=[], 0xfc, 0x1c7, &(0x7f0000001100)="$eJzs3bFu00AYB/DPKSQpQ9UZMViwMFXAExShIiEiIYEywAQSTC0LLIGpI6/AW6JO3Q4lZxMcB4kAjgn8fks++e873/ksZ8rl5Y23p5+GsTAeFzE4juO4LOIwBnHzIB8/j5b99iEAYFdcphRfUtb3WACA7fD9DwD/n2fPXzy+P5mcPC3LccTFeVEdn03z58NHk5M75cLhstXFbDbdi/rcuzmfNvOrca1qfy/nZTMfxu1bOZ9nD55MVvJRvO5y4gAAAAAAAAAAAAAAAAAAAAAA0KOj8pvW/j6L/OhHea6+2x9oZf+ez3H9ytamAQAAAAAAAAAAAAAAAAAAADvt/bguPnw8fXV29ubdjhcppTSfzyatRtU9+J2rH/wFc28VEWuieuk36bBu091Q094f7LCcF0VE9LwEv/hEDZurM6hufndDLdY8Est3xKjztxAAAAAAAAAAAAAAAAAAABCNn/33PRIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA6M/y//83LNIod/ATJ6+77v6W5wkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMC/7WsAAAD//wc2Fgs=")
syz_mount_image$udf(&(0x7f0000000280), &(0x7f0000000180)='./file1\x00', 0x0, &(0x7f0000001000)=ANY=[@ANYRESHEX=r3], 0x1, 0xc2d, &(0x7f0000002580)="$eJzs3U9sHNd9B/DfGy3FldxWTOwoThoXm7ZIZcVy9S+mYhXuqqbZBpBlIhRzC8AVSakLUyRBUo1spAXTSw89BCiKHnIi0BoFUjQwmiLokWldILn4UOTUE9HCRlD0wBYBcgoYzOxbcUmRNi2KEmV9Pjb13Z19b+a9eesZWdCbFwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAxB+8cun0mfSwWwEAPEhXRr96+qz7PwA8Vq76/38AAAAAAAAAAAAAADjoUhTxZKSYu7KWxqv3HfXL7b5bt8eGhrevdiRVNQ9V5cuf+pmz585/6YXBC9283J75gPr322fjtdGrlxovz96cm59aWJiabIzNtCdmJ6d2vYe91t/qZHUCGjdfvzV5/fpC4+zz5zZ9fHvg/f4njg9cHHz21DPdsmNDw8OjG0XqveVr99yQjp1meByOIk5Fiue+99PUiogi9n4u6g927Lc6UnXiZNWJsaHhqiPT7dbMYvnhSPdEFBGNnkrN7jnafiyi1vdA+7CzZsRS2fyywSfL7o3OteZb16anGiOt+cX2Ynt2ZiR1Wlv2pxFFXEgRyxGx2n/37vqiiFqk+M6xtXQtIg51z8MXq4nBO7ej2Mc+7kLZzkZfxHLxCIzZAdYfRbwaKX72zomYyNeZ6lrzhYhXy/xBxFtlvhSRyi/G+Yj3tvke8WiqRRF/WY7/xbU0WV0PuteVy19rfGXm+mxP2e515SPeH+66Ujyk+8ORLflgHPBrUz2KaFVX/LV077/ZAQAAAAAAAAAAAAAAAOB+OxJFfCZSvPIff1LNK45qXvqxi4N/OPCrvXPGn/6Q/ZRln4+IpWJ3c3IP54mBI2kkpYc8l/hxVo8i/jTP//vWw24MAAAAAAAAAAAAAAAAAADAY62In0SKF989kZajd03x9syNxtXWtenOqrDdtX+7a6avr6+vN1InmznHcy7lXM65knM1ZxS5fs5mzvGcSzmXc67kXM0Zh3L9nM2c4zmXci7nXMm5mjNquX7OZs7xnEs5l3Ou5FzNGQdk7V4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgI+TIor4RaT49jfWUqSIaEaMRydX+h926wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAUn8q4vuRovFH
zTvbahGRqn87TpS/nI/m4TI/Gc3BMl+K5qWcrSprzW89hPazN32piB9Hiv7623cGPI9/X+fdna9BvPXNjXefrXXyUPfDgff7nzh+7OLg8G88vdPrtF0DTl5uz9y63RgbGh4e7dlcy0f/ZM+2gXzc4v50nYhYeOPN11vT01Pz9/6i/Arsofoj9CLVHpeeelG9iNqBaMbD6TuPgfL+/16k+N13/7N7w+/c/+vxK513d+7w8fM/27j/v7h1R7u8/9e21sv3//Kevt39/8mebS/m34301SLqizfn+o5H1BfeePNU+2brxtSNqZnzp09/eXDwy+dO9x2OqF9vT0/1vLovpwsAAAAAAAAAAAAAAADgwUlF/H6kaP14LTUi4nY1X2vg4uCzp545FIeq+Vab5m2/Nnr1UuPl2Ztz81MLC1OTjbGZ9sTs5NRuD1evpnuNDQ3vS2c+1JF9bv+R+suzc2/Mt2/88eK2nx+tX7q2sDjfmtj+4zgSRUSzd8vJqsFjQ8NVo6fbrZmq6si2k+k/ur5UxH9FionzjfT5vC3P/986w3/T/P+lrTvap/n/n+jZVh4zpSJ+Hil+56+ejs9X7Twad52zXO7vIsXJC5/L5eJwWa7bhs5zBTozA8uy/xcp/ukXm8t250M+uVH2zK5P7COiHP9jkeL7f/Hd+M28bfPzH7Yf/6Nbd7RP4/9Uz7ajm55XsOeuk8f/VKR46cm347fytg96/kf32RsncuE7z+fYp/H/VM+2gXzc374/XQcAAAAAAAAAAHik9aUi/j5S/HC4ll7I23bz9/8mt+5on/7+16d7tk3en/WKPvTFnk8qAAAAABwQfamIn0SKG4tv35lDvXn+d8/8z9/bmP85lLZ8Wv05369Vzw24n3/+12sgH3d8790GAAAAAAAAAAAAAAAAAACAAyWlIl7I66mPV/P5J3dcT30lUrzyP8/lcul4Wa67DvxA9Wv9yuzMqUvT07MTrcXWtempxuhca2KqrPtUpFj728/lukW1vnp3vfnOGu8ba7HPR4rhf+iW7azF3l2b/KmNsmfKsp+IFP/9j5vLdtex/tRG2bNl2b+JFF//l+3LHt8oe64s+91I8aOvN7plj5Zlu89H/fRG2ecnZot9GBUAAAAAAAAAAAAAAAAAAAAeN32piD+PFP97c/nOXP68/n9fz9vKW9/sWe9/i9vVOv8D1fr/O72+l/X/q+cKLO10VAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA+HhKUcSbkWLuylpa6S/fd9Qvt2du3R4bGt6+2pFU1TxUlS9/6mfOnjv/pRcGL3Tzg+vfb5+J10avXmq8PHtzbn5qYWFqsjE2056YnZza9R72Wn+rk9UJaNx8/dbk9esLjbPPn9v08e2B9/ufOD5wcfDZU890y44NDQ+P9pSp9d3z0e+Sdth+OIr460jx3Pd+mn7YH1HE3s/Fh3x39tuRqhMnq06MDQ1XHZlut2YWyw9HuieiiGj0VGp2z9EDGIs9aUYslc0vG3yy7N7oXGu+dW16qjHSml9sL7ZnZ0ZSp7VlfxpRxIUUsRwRq/13764ving9Unzn2Fr61/6IQ93z8MUro189fXbndhT72MddKNvZ6ItYLh6BMTvA+qOIf44UP3vnRPxbf0QtOj/xhYhXy/xBxFvRGe9UfjHOR7y3
zfeIR1Mtivj/cvwvrqV3+svrQfe6cvlrja/MXJ/tKdu9rjzy94cH6YBfm+pRxI+qK/5a+nf/XQMAAAAAAAAAAAAAAAAcIEX8eqR48d0TqZoffGdOcXvmRuNq69p0Z1pfd+5fd870+vr6eiN1splzPOdSzuWcKzlXc0aR6+dslllfXx/P75dyLudcybmaMw7l+jmbOcdzLuVczrmSczVn1HL9nM2c4zmXci7nXMm5mjMOyNw9AAAAAAAAAAAAAAAAAADg46Wo/knx7W+spfX+zvrS49HJFeuBfuz9MgAA//8hX/ir")
r4 = openat(0xffffffffffffff9c, &(0x7f0000002540)='./file1\x00', 0x42, 0x0)
ftruncate(r4, 0xf2d)


r0 = openat$random(0xffffffffffffff9c, &(0x7f0000000000), 0x284800, 0x0)
ioctl$BTRFS_IOC_START_SYNC(r0, 0x80089418, 0x0)
syz_mount_image$jfs(&(0x7f0000000040), &(0x7f0000000240)='./file7\x00', 0xc03, &(0x7f0000001a40)=ANY=[@ANYRES8=0x0, @ANYRESOCT=0x0, @ANYRES16, @ANYRES16=0x0, @ANYRESDEC, @ANYRESOCT=0x0, @ANYRES64], 0x2, 0x5f51, &(0x7f0000007240)="$eJzs3VtvHGf9B/DfHrw+9N/U6kWVf8QhTTm0lCaNkzYtp6ZC4gIEVKpyn8i4VUQKKAkVrSziKhL3SFyj8iK4BqHeIBWJl8AbiBT3hgpEB439PMl4vM7aJN6x/Xw+kjP722fG+0y+Hs+u5/AEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABDf/96bZ3sRcfnd9MRixP/FIKIfMV/XJ+OvEXExzz+MiOOx0RxPRcSx2Yh6+Y1/nog4HxEfH4u4u766XD+9tMt+vPTGnb/98M0frf32i3/5+78/+v2f2u1vfPTdH/zxVsTi8d/87j+3Hs26AwAAQCmqqqp6Gx/zI06kz/f9rjsFAExF3v9XSX5erX6Yehhbdd0ftVqtVm9XjXerWUTEWnOZ+j2Dw/EAcMisxaddd4EOyb9ow4h4rOtOAAdar+sOsC/urq8u91K+veb+4ORmez4XZEv+a71713fsNJ2kfY7JtH6+bscgntyhP/NT6sNBkvPvt/O/vNk+SvPtd/7TslP+o81Ln4qT8x+08285Ovn3x+Zfqpz/cE/5D+QPAAAAAAAHWP77/2LHx39nH35VduVBx39PTqkPAAAAAAAAAPCo7WL8v/rBjuP/3WP8PwAAADiw6s/qtQ+P3X9up3ux1c9f6kU83pofKEy6WGah634AAAAAAAAAAAAAQEmGm+fwXupFzETE4wsLVVXVX03teq8edvnDrvT1h5J1/UseAAA2fXysdS1/L2IuIi6le/3NLCwsVNXc/EK1UM3P5vezo9m5ar7xuTZP6+dmR7t4QzwcVfU3m2ss1zTp8/Kk9vb3q19rVA120bHp6DBwAIiIzb3RXXukI6aqnoiu3+VwONj+jx7bP7vR9c8pAAAAsP+qqqp66XbeJ9Ix/37XnQIApiLv/9vHBdRqtVqtVh+9uqka71aziIi15jL1ewbD8QPAIbMWn3bdBTok/6INI+J4150ADrRe1x1gX9xdX13upXx7zf1BGt89nwuyJf+13sZyeflx00na55hM6+frdgziyR3689SU+nCQ5Pz77fwvb7aP0nz7nf+07JR/vZ6LHfSnazn/QTv/lqOTf39s/qXK+Q/3lP9A/gAAAAAAcIDlv/8vOv6bVxkAAAAAAAAADp2766vL+brXfPz/c2Pmc/3n0ZTz78m/SDn/fjv/1gk5g8bjO6/fz/+T9dXlD///whfy9MDnPzMY1a890+sPhpvn/Pwz39p0JV7cNn89TzXzVlyNa7ESZ7e1z2xpX5rQfm5b+6hun8/tp2M5fh7X4if32mcnnBg1N6G9mtCe8x/Y/ouU8x82vur8F1J7rzWt3fmgv227b07Hvc7FH392YfvWNX23Y3Bv3Zrq9TvVQX82/k8eG8Uvb6xcP/2rKzdvXj8babLl2aVIk0cs5z+TvnL+zz6z2Z5/7ze31zsfjPac/0FxO4Y75v9M43G9vs9NuW9dyPmP0lfOP++Bxm//hzn/nbf/5zvoDwAAAAAAAAAAAAAAADxIVVUbl4hejIiX0/U/XV2bCQBMV97/V/lmGIlarVar1eqjVzdV473WLCLiz81l6vcMvx73zQCAg+yziPhH152gM/IvWL7fXz39UtedAabqxnvv//TKtWsr12/8L0tXg0ffIwAAAAAAAABgr/L4nycb4z9vnAfUGjd6y/ivr8fJT9ZXl99d/NfnD934n/3RYGOs87RCT0dzfO7tIxSfigeP/z2c8HozE9pHE9pnJ7TPTWgfe6FHQ87/6ZRxzv9EWrEHj
f+a829PJ7xkpx40/uuzHfSnazn/U2ms55z/V1vzNfOv/nCYx//tb8n/zM13fnHmxnvvv3D1nStvr7y98rNz55dePbt04fwrL5556+q1lfRvhz3eXzn/PPa180DLkvPPmcu/LDn/L6da/mXJ+X8l1fIvS84/v9+Tf1ly/vmzj/zLkvN/LtXyL0vO/2upln9Zcv7Pp1r+Zcn5fz3V8i9Lzv+FVMu/LDn/06mWf1ly/mdSLf+y5PzzES75lyXnn89skH9Zcv5LqZZ/WXL+51It/7Lk/M+nWv5lyfm/lGr5lyXn/3Kq95C/e38dATn/C6m2/Zcl5/9KquVflpz/q6mWf1ly/t9ItfzLkvP/ZqrlX5ac/7dSLf+y5Py/nWr5lyXn/524fzGp/MuR838t1bb/sty//78HHnjgQX7Q9W8mAAAAAAAAAAAAAKBtGqcTd72OAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/2UHDgQAAAAAgPxfG6Gqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqoKO3AgAAAAAADk/9oIVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVdi7uxi5yvt+4Gff7LUh9iYxhBcDa2PAwOJdv+CX/78OBkJKoW0ICfSN1Lj22jjxW71rAgiJjaApEkjlggtaKSkgVOUiVVCbqEGiEZUqtelNe9XeVGmlRhWKQuVEvWlUcHXmPM/jmdnZmV3vrj1zzueD8M87c2bm2TNnZvc71ncGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACg3oZ7J/+wL8uy/P/aHyNZdnn+91XZvvzLmZ2XeoUAAADAYn1Y+/Pba9MJ++Zxobpt/u76f/zuuXPnzmXZn6z71Kv5DEazbM3KLCvOC4Z/cqphm+D5bLivv+7r/g43P9Dh/MEO5w91OH9Fh/NXdjh/uMP5s3bALKuK12NqV7ap9teRYpdm67Kh2nmbWlzq+b6V/f3xtZyavtplzg0dzo5mx7LJbGLWZfpq/2XZuxvy27o/i7fVX3db67MsO/uzZw/GNfSFfbwpa7ixmvr77oO7s9Gf/+zZg0+O/OK6VrPjbpi10izbvDFf5wtZdv7lqqwvW5n2SVxnf90617dY50DDOvtql8v/3rzOs/NcZ/y+h8M6/6nNOteH0566McuymWzObZo9n/Vnq5tuNe3v4eKIyK8jvys/kQ0u6DjZMI/jJL/Mj29sPE6aj8m4/zeEfTI4xxrq744PvrZi1n6/0OMk/6674VjNr/uh/EaHh+tfWm04VvNtnr1p7mOg5X3X4hhIx3LdMbCx0zHQv2Kgdgz0n1/zxoZjYOusy/RnfbXbev+m9sfA+PTxU+NTTz9zx9HjB45MHpk8sX3Htj1bt+3asXti/PDRY5Phz4Xt0h6yOutPx+DG8FwTj8FbmratPyTPvVE8Dl6/etf1reZC1jC8RI+DxawhC8fLF2/OF3R5fzbHMZ5v88LmxT8O0s/9usfBYN3joOVzaovHweA8Hgf5Nmc3z+9n5mDd/63W0Oq5cCmOgZG6Y2AxPw/r13AhPw/z23zs1rmfC9eHdb1420J/Hg7MOgbit9UXHnv5Ken3veHdYb/MPi6uyc+4bEV2Zmry9JanDkxPn96ahXFRfLLuvmo+XlbXfU/ZrOOlf8HHy77PfbTrmhanj4R9NXx7+/sq32bHWPv7qvbs3np/Npy6LQtjiV3s/dnqp1m+P1OWaLM/821euGPxvwumXFL3/DfU6flvYGiweP4bSHtjqOH5b/ZdM1BbWZadvWN+z39D4f+L/fy3rkue//J99diW9sdAvs2L4ws9BgbbPv/dGGZfWM+tITEM1+X+j2rnzxSHad192fG4GRwcCsfNYLzFxuNm+6zL5NeW3/bmiQs7bjbf2HhfNfzeUsLjJt9Xr060P27ybd7buvjnjlXxr3XPHSs6HQNDAyvy9Q6lg6B4vju3Kh4DW7KD2cnsWHYoXSa/l/PbGts2v2NgRfj/Yj93XNUlx0C+r17b1v4YyLf52+1L+7vT5nBK2qbud
6fm1xfmyvzXDJ6/vubdttSZP1/nZ3a0f20o3+anOxaaM9rvp9vDKZe12E/Nj5+5julDWef9tFTHdL7OY3e2f20q32bdznkeT/uyLHvnpbeK17uK13f/4sw/f7fhdd9Wrym/89Jbn7/6hz9cyPoBALhwH9X+nFlR/K5Z9y/W8/n3fwAAAKAnxNzfH2Yi/wMAAEBpxNw/EGYi/wMAAEBpxNw/GGayr9O745XDE31ff/XD57L0boDngnh+fBnkoZXFdrHjPRO+Hj13Xn76PW8NPfi95+Z32/1Zlv3vA9e23P6JlXFdhVNxnQONp89y1Q3zuv3HHzm/Xf37J5ztL64/fj/zfRkodpXf/bd7atc7emsx33sgq82HZ158vnb9e4qv4/bv/0ex3TfCm5bsO9zXcPnNYT2bwhwN7ynz0Krz+yGf8XLfeffI33/60fO3Fy/Xt3FN7dt8bUtxvfE9ol75y2L7+H3Ptf6/fulb38m3f+qm1ut/rr/1+t8P1/vjMP/ng+L0+n3+vbr1/0FYf7y9eLktb/6g5frf/qti+7fDcfF6mM3rv/uPrvuw1f0Vb2ffYHG5ePsTf3Zf7XLx+uL1N69/ePyehv3RfP3vvVlcz94n/3ugfvt4eryd6PHBxuO7L9y/DT3yLMu+9fWsYT9nQ8Xl3mlaf7y+U4Ot13970zpPvfFE7fLN30/0zUfubfn9xvXs+/ORhu/nlTVh//Wv+of8et+/NhyP4fxfzBTX1/xepm+vaXy+idu/PlI8buP1jTet/5Wm9c/ckO+7zuu//+fF+t++a2XD+vetDcfTx4vZaf1H/nRtw+Xf+GyxntNfHTtxcurM0fgeByNNj+OVw6tWX3b5x9asDc+lzV/vPzn9xOTp0YnRiSwb7cG3DFzu9b8Z5n8VY2bpb6HwL4PFcffyg8XPrVuGiq9fCac/Hu7P+PPxm3881HC8Nt/vM8PFXOz6bwvrmK/1m36ye14b/ueOt1/914e/1Px7Qfx+Tl0xXPv+XttwZe28vveK85ufrzr59ysaH9c/WlfM74f9ei68M/PGK4vba77++N4kL3+hePzG3+Ti5bOm9xMZGWj8Pha7/h+F32N+cFXj8188Pr7/XNO7OY9kffkSZsLzQzZTnB+3ivv75bNXtry9+D482czVC1nmnKaenho/dvTEmafGpyenpsennn5m//GTZ05M76+9d+n+L3e6/PnH9+ra4/vQ5M4dWe3RfrIYy+xSr//UIwcP7Zq4+dDk4QNnDk8/cmry9JGDU1MHJw9N3Xzg8OHJr3a6/NFDe7du27N917axI0cP7d29Z8/2PWNHT5zMl1EsqoOdE18ZO3F6f+0iU3t37Nl65507JsaOnzw0uXfXxMTYmU6Xr/1sGssv/eTY6cljB6aPHp8cmzr6zOTerXt27tzW8d0fj586PDU6fvrMifEzU5Onx4vvZXS6dnL+s6/T5amGqbXh+a5JX/jt/L7bd6b3x8299bU5r6rYZKTxxJ+G94L6xvD23fP5Oub+oTAT//4PAAAApRFzf/h8ivOvu8v/AAAAUBrhA//CZ0b6938AAAAoo5j7h8NMKpL/9f/1/y+g/5/q2vr/+v+Z/r/+fwf6//r/7ej/6//38vr1//X/6azb+v8x96/KskrmfwAAAKiCmPtXh5nI/wAAAFAaMfdfFmYi/wMAAEBpxNx/eZhJRfK//r/+v8//L1//vy/LZvT/9f+7hf6//n87+v/6/728fv1//X8667b+f8z9HwszqUj+BwAAgCqIuX9NmIn8DwAAAKURc//aMBP5HwAAAEoj5v6RMJMy5f+75j5L/1//X/+/fP1/n/9f0P/vDvr/+v/t6P/r//fy+vX/9f/prNv6/zH3fzzMpEz5HwAAACou5v5PhJnI/wAAAFAaMfd/MsxE/gcAAIDSiLl/XZhJRfK//r/+v/6//r/+v/7/ctL/L0f/f1U4Wf+/kf6//r/+v/4/7XVb/z/m/ivCTCqS/wEAAKAKYu6/M
sxE/gcAAIDSiLn/U2Em8j8AAACURsz9V4WZVCT/6//r/+v/6//r/+v/Lyf9/3L0/+PJ+v+N9P/1//X/9f9pr9v6/zH3Xx1mUpH8DwAAAFUQc/81YSbyPwAAAJRGzP3XhpnI/wAAAFAaMfevDzOpSP7X/9f/1//X/9f/1/9fTvr/+v/t6P/r//fy+vX/9f/prNv6/zH3XxdmUpH8DwAAAFUQc//1YSbyPwAAAJRGzP03hJnI/wAAAFAaMfePhplUJP/r/+v/6//r/+v/6/8vJ/1//f929P/1/3t5/fr/+v901m39/5j7N4SZVCT/AwAAQBXE3L8xzET+BwAAgNKIuf/GMBP5HwAAAEoj5v5NYSYVyf/6//r/+v9l7f8P6P/r/3cF/X/9/3b0//X/e3n9+v/6/3TWbf3/mPtvCjOpSP4HAACAKoi5/+YwE/kfAAAASiPm/lvCTOR/AAAAKI2Y+zeHmVQk/+v/6//r/5e1/+/z//X/u4P+v/5/O/r/+v+9vH79f/1/Ouu2/n/M/beGmVQk/wMAAEAVxNx/W5iJ/A8AAAClEXP/7WEm8j8AAACURsz9Y2EmFcn/+v/6//r/+v/6//r/y0n/X/+/Hf1//f9eXr/+v/4/nXVb/z/m/jvCTCqS/wEAAKAKYu7fEmYi/wMAAEBpxNw/HmYi/wMAAEBpxNw/EWZSkfyv/6//r/+v/6//r/+/nPT/9f/b0f/X/+/l9ev/6//TWbf1/2Pu3xpmUpH8DwAAAD3quoVsHHP/tjAT+R8AAABKI+b+7WEm8j8AAACURsz9O8JMKpL/9f/1//X/9f/1//X/l5P+v/5/O/X9//yS+v9V6f/P9ZOmV9Zf0P/X/6ezbuv/x9x/Z5hJRfI/AAAAVEHM/TvDTOR/AAAAKI2Y+3eFmczO/39z8VYFAAAALKWY+3eHmfT8v//Pr1dVqf7/gfvTX/X/C/r/+v+Z/r/+/zLT/9f/b8fn/1e1/780LvX69f/1/+ms2/r/MffvCTPp+fwPAAAARDH3/78wE/kfAAAASiPm/v8fZiL/AwAAQGnE3P9LYSYVyf+V6v/X0f8v6P/r/2f6//r/y0z/X/+/Hf1//f9eXn/39v+/vTrL9P/pDt3W/4+5f2+YSUXyPwAAAFRBzP2fDjOR/wEAAKA0Yu6/K8xE/gcAAIDSiLl/X5hJRfK//r/+v/6//r/+v/7/ctL/1/9vZ3H9/1H9/0W61P35Xl9/9/b/ff4/3aPb+v8x998dZlKR/A8AAABVEHP/PWEm8j8AAACURsz994aZyP8AAABQGjH3fybMpCL5/6L1/1sUivX/9f8z/X/9f/1//f9F0v+vcv/f5/8v1qXuz/f6+vX/9f/prNv6/zH33xdmUpH8DwAAAFUQc/9nw0zkfwAAACiNmPt/OcxE/gcAAIDSiLn//jCTiuR/n/+v/6//r/+v/6//v5z0//X/29H/1//v5fXr/+v/01m39f9j7v+VMJOK5H8AAACogpj7Hwgzkf8BAACgNGLufzDMRP4HAACA0oi5/1fDTCqS//X/9f/1//X/9f/1/5eT/r/+fzv6//r/vbx+/X/9fzrrtv5/zP2/FmZSkfwPAAAAVRBz/6+Hmcj/AAAAUBox938uzET+BwAAgNKIuf+hMJOK5H/9f/1//X/9f/1//f/lpP+v/9+O/r/+fy+vX/9f/5/Ouq3/H3P/58NMKpL/AQAAoApi7n84zET+BwAAgNKIuf8LYSbyPwAAAPSAs/PaKub+L4aZVCT/6//r/+v/6/8vYf9/Rab/n+j/r6r9qf+v/9+O/r/+fy+vX/9f/5/Ouq3/H3P/I2EmFcn/AAAAUAUx9z8aZiL/AwAAQGnE3P8bYSbyPwAAAJRGzP2/GWZSkfyv/6//r/+v/+/z//X/l5P+v/5/O/r/+v+9vH79f/1/Ouu2/n/M/b8VZlKR/A8AAABVEHP/b4eZyP8AAABQGjH3/06YifwPAAAApRFz/2NhJhXJ/0X//9GD+
v8F/X/9f/1//f9I/39p6P/r/7ej/6//38vr1//X/6ezbuv/x9z/pTCTiuR/AAAAqIKY+383zET+BwAAgNKIuX9/mIn8DwAAAKURc//jYSYVyf8+/1//X/9f/38h/f9VLU7X/y/o/7em/6//347+f5n7/yuWZI2Xbv1zPWENpr/p/+v/01m39f9j7j8QZlKR/A8AAABVEHP/74WZyP8AAABQGjH3Hwwzkf8BAACgNGLuPxRmUpH8r/+v/6//r//fI5//P5QtR/9/Rv9/uZWk//+e/n9B/7+R/r/P/9f/1/+nvW7r/8fcPxlmUpH8DwAAAL1uPu86GnP/4TAT+R8AAABKI+b+I2Em8j8AAACURsz9T4SZVCT/d2P//wb9f/1//f90Pfr/Pv9f/789n/+v/5/p/1+wS92f7/X16//r/9NZt/X/Y+4/GmZSkfwPAAAAVRBz/5fDTOR/AAAAKI2Y+78SZiL/AwAAQGnE3H8szKQi+b8b+/+Z/r/+v/5/uh79f/1//f/29P/1/zP9/wt2qfvzvb5+/X/9fzrrtv5/zP3Hw0wqkv8BAACgCmLuPxFmIv8DAABAacTcfzLMRP4HAACA0oi5/1SYSU/m/745e7tz0f/X/++2/n9987LU/f+V+v/6//r/S0H/X/8/0/+/YJe6P9/r69f/1/+ns27r/8fc//thJj2Z/wEAAIBWYu4/HWYi/wMAAEBpxNw/FWYi/wMAAEBpxNw/HWbSPv/3L++qLh79f/3/buv/V/bz/wf1/yP9f/3/hdD/1///P/bua0evs4rj8OeIGDgAzhH3AuI+OOYK6JCY3kKvoYUaeu+9hQ6h9957b6FIQZpZazmD7b09I3+ed7/reU5WnHhmdmTH0V+jn/ZO/39ip93Pb/359f/6f9aN1v/n7n9A3OL7/wAAADCN3P0PjFvsfwAAAJhG7v4HxS32PwAAAEwjd/+D45Ym+//q9v/3PfIj/b/+f3ei/v/O9bFT9f/e/3/+11X/r/8/hq79f/5JqP8/pP8/mdPu57f+/Pp//T/rRuv/c/c/JG5psv8BAACgg9z9D41b7H8AAACYRu7+h8Ut9j8AAABMI3f/w+OWJvvf+//1/9vr/yd9/7/+v+j/9f/H0bX/T/r/Q/r/kzntfn7rz6//1/+zbrT+P3f/I+KWJvsfAAAAOsjdf13cYv8DAADANHL3Xx+32P8AAAAwjdz95+KWJvtf/6//1//r//X/+v990v/r/5fo/8ft/2/X/69+ff2//p91o/X/527YHez+wy/Tb/8DAABAB7n7HxW32P8AAAAwjdz9j45b7H8AAACYRu7+x8QtTfa//l//r//X/+v/9f/7pP/X/y/R/4/b/3v/v/5/7eP1/1yO0fr/3P2PjVua7H8AAADoIHf/4+IW+x8AAACmkbv/8XGL/Q8AAADTyN3/hLilyf7X/+v/9f/6f/2//n+f9P/6/yUb6f/jU1z4y6P/n7r/v/+9Vj7+kv3/mZ3+X/9PGK3/z93/xLilyf4HAACADnL3Pylusf8BAABgGrn7nxy32P8AAAAwjdz9N8QtTfa//l//r//X/+v/9f/7pP/X/y/ZSP9/Sfr/qfv/1a/v/f/6f9aN1v/n7n9K3NJk/wMAAEAHufufGrfY/wAAADCN3P1Pi1vsfwAAAJhG7v6nxy1N9r/+f73/v+5u659P/3/x59f/6//1//p//f8G+v+bL/IT9f+XRf/fqP+/64Ufr//X/7NutP4/d/8z4pYm+x8AAAA6yN3/zLjF/gcAAIBp5O5/Vtxi/wMAAMA0cvc/O265z253mRn7pun/vf9f/6//1//r//dJ/7+B/v9i9P+XRf/fqP+/CP2//p91o/X/ufufE7f4/j8AAABMI3f/c+MW+x8AAACmkbv/eXGL/Q8AAADTyN3//Lilyf7X/+v/9f9Xtf+/5y033U//H/9c/6//vxL0//r/nf7/xE67n9/68+v/9f+sG63/z91/Y9zSZP8DAABAB7n7X
xC32P8AAAAwjdz9L4xb7H8AAACYRu7+F8UtTfa//l//r//3/n/9/+HzPzJ+Pyb9/5Wh/9f/L7kC/f+Nd9f/n9hp9/Nbf379v/6fdaP1/7n7Xxy3NNn/AAAA0EHu/pfELfY/AAAATCN3/01xi/0PAAAA08jd/9K4pcn+1//r//X/+n/9v/f/75P+X/+/xPv/9f9bfn79v/6fdaP1/7n7Xxa3NNn/AAAA0EHu/pfHLfY/AAAATCN3/yviFvsfAAAAppG7/5VxS5P9r//X/+v/L9X/36T//z/6f/3/Sej/9f9L9P/6/y0/v/5f/8+60fr/3P2vilua7H8AAADoIHf/zXGL/Q8AAADTyN3/6rjF/gcAAIBp5O5/TdzSZP/r//X/+n/v/9f/6//3Sf+v/1+i/9f/b/n59f/6f9aN1v/n7n9t3NJk/wMAAEAHuftfF7fY/wAAADCN3P2vj1vsfwAAAJhG7v43xC1N9r/+X/+v/9f/6//1//uk/9f/L9H/6/+3/Pz6f/0/60br/3P3vzFuabL/AQAAoIPc/W+KW+x/AAAAmEbu/jfHLfY/AAAATCN3/1vilib7X/+v/9f/6//1//r/fdL/6/+X6P/1/1t+fv2//p91o/X/ufvfGrc02f8AAADQQe7+t8Ut9j8AAABMI3f/2+MW+x8AAACmkbv/HXFLk/2v/9f/6//1//p//f8+6f/1/0u21/9fe+RH+n/9v/5f/8+y0fr/3P3vjFua7H8AAADoIHf/u+IW+x8AAACmkbv/3XGL/Q8AAADTyN3/nrilyf7X/3fu/8+c2+30/zv9v/5f/79X+n/9/5Lt9f9H6f/1//p//T/LRuv/c/e/N25psv8BAACgg9z974tb7H8AAACYRu7+98ct9j8AAABMI3f/B+KWJvtf/9+5//f+f/3/0efU/+v/90H/r/9fov/X/2/5+Ufu/6/R/zOI0fr/3P0fjFua7H8AAADoIHf/h+IW+x8AAACmkbv/w3GL/Q8AAADTyN3/kbilyf7X/+v/9f/6/zv2/2cv/lv8gP5f/38S+n/9/xL9v/5/y88/cv/v/f+MYrT+P3f/R+OWJvsfAAAAOsjd/7G4xf4HAACAaeTu/3jcYv8DAADANHL33xK3NNn/+n/9v/5f/+/9//r/fdL/6/+X6P831//f6Y4/0P/r//X/rBmt/8/d/4m4pcn+BwAAgA5y938ybrH/AQAAYBq5+z8Vt9j/AAAAMI3c/Z+OW5rsf/2//l//r//X/+v/90n/r/9fov/fXP9/hP5f/6//Z81o/X/u/s/ELU32PwAAAHSQu/+zcYv9DwAAANPI3f+5uMX+BwAAgGnk7v983NJk/+v/9f/6/2P1/9fr//X/+v/j0f/r/5ec7//vvevR/18bf6H/n+H59f/6f9aN1v/n7v9C3NJk/wMAAEAHuftvjVvsfwAAAJhG7v4vxi32PwAAAEwjd/+X4pYm+1//r//X/3v/v/5f/79P+v/j9v9nj/Vc8/T/3v+/0/9v7vn1//p/1o3W/+fu/3Lc0mT/AwAAQAe5+78St9j/AAAAMI3c/V+NW+x/AAAAmEbu/q/FLU32v/5f/6//1//r//X/+6T/9/7/Jfp//f+Wn1//r/9n3Wj9f+7+r8ctTfY/AAAAdJC7/xtxi/0PAAAA08jd/824xf4HAACAaeTu/1bc0mT/6//1//p//b/+f5D+/8w5/f8J6P/1/zv9/4mddj+/9efX/+v/WTda/5+7/9txS5P9DwAAAB3k7v9O3GL/AwAAwDRy9383brH/AQAAYBq5+78XtzTZ/yv9fzVw+v9l+v/dwX8/+v+jn1//r//3/n/9v/5/mf5f/7/l59f/6/9ZN1r/n7v/+3HL+eF39vj/lgAAAMBIcvf/IG5p8v1/AAAA6CB3/w/jFvsfAAAAppG7/0dxS5P97/3/+n/v/9f/6//1//uk/9f/L9H/6/+3/Pz6f/0/60br/3P3/zhuabL/AQAAoIPc/T+JW+x/A
AAAmEbu/p/GLfY/AAAATCN3/8/ilib7X/+v/9f/6//1//r/fdL/6/+X6P/1/1t+fv2//p91o/X/uft/Hrc02f8AAACwfdes/ozc/b+IW+x/AAAAmEbu/l/GLfY/AAAATCN3/6/ilib7X/+v/9f/6//1//r/fdL/6/+X6P/1/1t+fv2//p91o/X/uft/Hbc02f8AAADQQe7+38Qt9j8AAABMI3f/b+MW+x8AAACmkbv/d3FLk/2v/99H/3+r/l//f0D/r//X/2+//z8Tf+Do/w/p/4/S/+v/9f/6f5aN1v/n7v993NJk/wMAAEAHufv/ELfY/wAAADCN3P1/jFvsfwAAAJhG7v4/xS1N9v88/X886RD9v/f/6/8P6f/1//r/7ff/Sf9/SP9/lP5f/6//1/+zbLT+P3f/n+OWJvsfAAAAOsjd/5e4xf4HAACAaeTu/2vcYv8DAADANHL3/y1uabL/5+n/g/5f/6//1//H39f/j0H/r/9fov/X/2/5+fX/+n/Wjdb/5+7/e9zSZP8DAABAB7n7/xG32P8AAAAwjdz9/4xb7H8AAACYRu7+2+KWC/b/2av4VFeP/l//v7/+//Z77Hb6f/2//l//r//X/1+a/l//v+Xn1//r/1k3Wv9/28H/a++y+9fBR/v+PwAAAMwod/+/4xb7HwAAAKaRu/8/cYv9DwAAANPI3f/fuKXJ/tf/6/+9/1//r//X/++T/l//v0T/r//f8vPr//X/rBut/8/d/78AAAD//7KvnKc=")
r1 = fspick(0xffffffffffffff9c, &(0x7f0000000000)='.\x00', 0x0)
fsconfig$FSCONFIG_CMD_RECONFIGURE(0xffffffffffffffff, 0x7, 0x0, 0x0, 0x0)
fsconfig$FSCONFIG_CMD_RECONFIGURE(r1, 0x7, 0x0, 0x0, 0x0)
syz_mount_image$msdos(&(0x7f0000000140), &(0x7f0000000100)='.\x00', 0x1a4243c, &(0x7f0000000dc0)=ANY=[@ANYRES16, @ANYRES16, @ANYRESOCT=0x0, @ANYBLOB="f1bc6305ed588d63a576cc3afd51baf29cde2281a84392f4e66ff7ef22aa9af727ceae8a8ec95fc1b73083de2de825a0cb2b0be774fdb33650d7dace27c16bc23b2f7c7fb72585548939698f280d138aa9255a8a924008f8477e82ba11cdb11efd5ca2f1ab049ce2ccc415d2daf8dac725533a558d561654faf5e0924f1376174f374d664fad4a6ab24ec0e822e7f9426e8e5de1fe58085a0ae8c7c4d5b038f6afd61834d46208b9fb4cb1a1fa962a8b0000dc2e319379ea1e5a07aeb3f9cd4e648df4dd18e6253e7b2310a78d63a232a2a40758027a472e7d263ef567a84166f26ee56e701c63a8863787889bf1c94576664da963fec5dfb990190fccf31954a940c8b584ca89a512ad36f28edec086b1c0823c40224101575a854600705670ac028840eeaf3f5d8769023c01218614f427ac6f9c6193bc21a2b833e5c9c703c4cfa063dd34c245706bde3d7ac373ab04b62b4111b59eabd436dd97e788a36ef25bad99be2aa924949558c8", @ANYRESHEX, @ANYRES8, @ANYRES64, @ANYBLOB="1eaca1ce5a49018bb44244f8b2e2683b4b329aa2188af018f727984b8dbe8a941919cc373d20ba827d3ad4574273aa70a80231414c72c615cfec7380d93e938dcffc8174dda0429832d29ca72a4cb0580f3783c38ce824d6c4274a6e99667d3b8eccc80c794989574d4152a015937f5e3add863990e9c4ebff9a09734e2cec8ca721e4a3e6c9f8dc88892a89d9194c1dace384041fd04f2ca2b62c536aeffc9bc00a390a879f703dfae3af2235fcccb0fc26802a99d736b0d0b7668e482293bda4592de7b890051e901e05ff42cf566ec59a6bc3b35ace82855630fa5237dd17dc6cbb45903df8cd2ae1b7090615a49a", @ANYRES32, @ANYRES32], 0x0, 0x0, &(0x7f0000000000))
syz_mount_image$iso9660(&(0x7f0000000dc0), &(0x7f0000002380)='./file0/file0\x00', 0x3a0cc12, &(0x7f0000000e00)=ANY=[], 0x1, 0x9ca, &(0x7f0000001500)="$eJzs3c1vXNXdB/Dv9UtiDAoB8vDkQUAm4QkYSB3bKaERizaxx46pXyrbkYiqitAmqaJYpYJWAtRFKlVdFbWLqgu6Y9kVEhvYVNm1f0EXlSr+BdRV1EWnunfGsR17PHbqN8LnY13Pffndc37H9849muuZOeGrrNFoVNN9Ll/6024my/5zYeyLjz/5sJx+eSsH0p1Xis+SviS1pCfJ0aR3dGxudrpDQTeTK0luJ0WSg2k+bsqVFL/NI8vLt1P8say3cuk+G8amNPha2+vzDwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA9qNidGxoaLjI1OTMpTdq7VVDgLfb2LVc3ufVqN/F5x3rTYpySl/f0lDfR48sb36y/HUiTzeXnq4GJE9fPnj4ycOvPdHTtbT/Bglvxb9bgyFvecd33//g5luLi9fe2aZEvmom6jOT87OT0+cn6rXJ+dnaubNnh05fHJ+vjU9O1ecvzy/Up2ujc/XzC7NztYHRF2vD586dqdUHL89empkYG5yqL6189RsjQ0Nna68Pfq9+fm5+dub064Pzoxcnp6YmZyaqmHJzGfNqeSJ+d3KhtlA/P12rXb+xeO3MqozWOSXKoOFOLSmDRjoFjQyNjAwPj4wMn33l3CuvDg31LK/oTrVi6B5Zs8vdk7Y8o7fyt2Zf+6hrs5HbewGH/0JXq//PVCYzk0t5I7V1f0YzlrnMZrrN9pal/v/k6fqG9a7s/5d6+aPLm59K1f8/21x6tl3/3yaX3ft5N+/ng9zMW1nMYq7lnVVbD95HiY3G3rdq8z8TqWcmk5nPbCYznfPVmlprTS3ncjZnM5Q3czHjmU8t45nMVOqZz+XMZyH16owazVzqOZ+FnvJA1zKQ0byYWoZzLudyJrXUM5jLmc2lzGQiYzlflXI9N6q/+5kNcrwbNLyZoJENgjbo/1srttD/8zW1A1dxuD+NVv9/oHPowOhuJAQAAABsu//7aw4defwv/0h680x1jx0AAAB40FRv13u6fOgt555JMT45VR/a67QAAACAbVRUn7ErkvTnWHNu6ZNQbgIAAADAA6L6//+z5UN/OXcshdf/AAAA8KDp/B37HSOKU6nlVrmpdrUZebUV0fqe3/7xyan64Ojs1GvDeb76loHqkwZrSuvO4aT6+MFLOd6MOt7ffOxfXWJfGTU8+Npw+nKi1ZCB58qH5wbWiRwpI1/KC83IF5Yi+7Im8kwZCQAPuhMb9Meb7f9fyqlmxKmnys702z1PrdMHD+lZAWC/6DzGTseI4ptLw/+0ef3/eK4fa76lYDDpeTuLuZpT1acNqncctErN7SLLpfaveBvCqQ53A/pXjPByaul+wLFH1r0f0L9ioJdTa+4ItIs9swtHAgB2z4k1/fAG/X+j0Zxbc/9/1ev/3Pv6v99bCgFgX7k7gv0Ozux1GwGA1e6rl+7ewYQAAAAAAAAAAAAAAAAAAAAAAAAAAADgAbAb3/+/L2duJdkHaZjZ2Zm+3TnDf3Ag2S9N3qaZvb4yAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAsBuKpHu99V3JwaRnKMnp3c9q59za6wR2QFFsIfZO7uS9HNrJfAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAvo5a3//flebjw81V6elKTia5kuT7e53jdrqz1wnssRXf/18e8zSK9DQPe4re0bG52elqMIGD5fYvPv7kw3K6n3rKAsoaVg0u0aqh/V6PVXv1j1179+bP3v5pbexCleSFhfGpsemJue8sBz5ZfJrU0pyWLOX7i6J1Fq9u+adlSzdX73hV79jaev93vb3v1nvyz79r37Zly2ncWLw2Uta0UH9j4ec/ufHeiqDHczx5biAZWF3Tj8upTU3H07tRvcWXxa+LQ/lDrlTHv0yjaBTlIXq0av9D128sXhv80duLV9vkdDjHklxN+jaf07G1R2JJddZ19Za1DlVB5a8jHcrb0MPdhxqNZonDbdrwWHXK9G+pDbX2bah0+Lu32nimTUZP5PktH+nnO9TY8q9GU3Op+LL4e3Exf8uvVoz/0VUe/5PZzLOzjKkiV5wp7dt8crnlIys3vHlvZNtnJTvgN/lhvnX3+HetuP63jtVOX48eqn6vqHHHnhdFsxdqqeaP3NMjta4+65TUUuV5pBnVJs//yctr9+uQ58sdrijb9Pxfrfiy+KgYyD9zy/g/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADA/lck3eut70pOJjmc5NFyuZY0tqO+rv5iO4rZpANr1tzaxdr3j667c8Wd3Ml7ObSn6QAAAAAAAACwbS6MffHxJx+WU/X/+O78f/FZ0tf8T39PksPF73tHx+ZmpzsU1JtcSXK7nO/bWg7lfnlkefl2uXR0620BADbnPwEAAP//sa925Q==")


syz_mount_image$btrfs(&(0x7f0000005100), &(0x7f0000000000)='./file1\x00', 0x810, &(0x7f0000000140)={[{@nossd_spread}, {@nodatacow}, {@enospc_debug}, {@nossd}, {@nodatasum}, {@autodefrag}, {@user_subvol_rm}, {@max_inline={'max_inline', 0x3d, [0x6d, 0x33, 0x78, 0x39, 0x65, 0x36]}}]}, 0x1, 0x50f3, &(0x7f000000a2c0)="$eJzs3U+IVWUfB/Dnzp1x5lVw7isEtsoikGrh4CYioqtMUFF0y8VgBE4tgnThJEi0EMQW/Vt4S4paSK6kFsksjKA2LqQwArehYS7cKAaSi3Yac8957pz7HO+5d0ZtTD8fmTnnOb/zPOe5l7O43+uccwIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACEEF74/bNDVfVT16bPnJtp7jywZebyvul1p0OodbbX8vqOrc++8ua2HS9OxA6zL2fLRqPfkFnX81ljVc/GhX69P6+HEMaSAer58pk1pVGLq3vKA1a6fnH30U17mxuPH27Xr146e7L80lkwsdITWCn5eXVh8Vxqdn6PJHt024VTr9Zzimb90xPuX3kRAMCSTLU6i+7H0fwjbre9P60n7WbSbift+AmhXWwsRzbuqn7z3JDWV2iezSwqjPedZ1LP3/9uu5X2T9pJ1FjCPHt3zSPNRL95ziX1lZonAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAwJ3kkbdHH6qqn7o2febcTHPngS0zl/dNrzsdQqOzvZaVa6vfP9z869utxw78uPmr4xeef6ye94vL0cLO4be48sRkCG8UKhfisBfXhtDqLXSa4cty4a3OynOxAAAAwN3k/s7vkW47i4NjPe1aJ03WOv+iLCxev7j76Ka9zY3HD7frVy+dPbn88Vp9xmvecLxuu7H4UysE4xh/0/EW63HXPaVxqqUjpnn+8fNTf1f1L+X/RnX+j++c/A8AAMDNkP/TcaoNyv/fvfbHJ1X9S/l/Q88hS/k/zjjm/5GwvPwPAAAAd7Lbnf+bpXGqDcr/4y+NfV3Vv5T/p4bL/6PFaceNv8YJ75oMYWrQ1AEAAIA+4v+7L361EPN69s1BmtefevTguarxSvm/OVz+H7ulrwoAAAC4GUe+2P5wVb2U/1vD5f/x2zprAAAAYCne+XDig6p6Kf/PDpf/V+fL/MqHrNNP8a8QDk2GMLGwMpcVfg7tp7sFAAAA4BaJOf3PT3f+ULVfKf/PVd//P97pIF7/33P/v9L1/4VCdte/J90YAAAAgHtR+Xr+eHv87MkF/Z6/P+z1/w/87+CrVccv5f/9w+X/enF5K5//BwAAAMvwX3v+3/bSONUG3f//vo/e/aWqfyn/t4fL/3G5pvjyTtRq2fvz3mQI6xdW8rsJfhMPtyspzI8VCh2tpMe22CMvzI8XCh1zSY/NkyE8uLCyPyn8PxbaSeHK2rxwJCmcjoX8fOgWjiWFE/FM+3xtPt208H0s5BdYzMcrKNZ0L4lIelzt12OhcMMeZ7sHBwAAuKfE8Jxn2bHeZkij7Hxt0A6rB+0wMmiH+qAdRpMd0h37bQ+zvYW4vX1m49Ke/39kuPwf34pV2aLf9f8hXv+fP9ewe/3/bCw0ksJ8LLTSOwa04jGysPtxPEajlfe4sr5bAAAAgLta/F6gvsLzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP5h715j5KruA4CffY734fVCUoXQKNkkNY6beL22gUQtVdaUqhEpzbqhoCqi2NhrsnjBjm1KjEJkbCIaIShtkJIPRRhFUc0HqBWISAoIFymOUHlEVEUBBAqtIQoipSQRaYIUqtl7z+ydc3cefqzx0t9P8s6Z+Z/nnYfn3HvnXAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA+P/h4Feu+dtm8Ud/e96zL1w8fsWetRe/et15pz4ZwsTM4x1ZuKP/xtvHf373uffseWD1HfcdPv+jvXm5PB4Gqn868zs3xFoPLw7h/o4QutPAisEs0JPfH4z1vW8whFPCbKBWYrI/K5E2HL7fF8K+MBuoVfW9vhAGC4ELn3rk4Zuridv6QlgaQqikbTxfydroSwNn9GaB/jSwtTsL/OqtTC3w3c4sAMcsvhlqL/oDE/UZhucu1+D113PcOvb2SofXFRPDjfP9bO08d6qgN31g4pietlJ1zIvS2+Ogd9sCeLeVtvOtnrbiF6n8G8pbs6FK6Nw0uXnD1dM74yOdYXS0q1FN8/Q8P/P6lzYeSXrBvA5jB4aPy+vwlieW3t21/ILH71ux9OX9H9v7yrF280eFTVpMz7dKyF9zC+Z5jMZ9niyAt1/pW9KIL10hhM2f/73PNIuX5v/Dzef/8eUcbzvrcsda3xzK5ubxkcGYeG0om5sDAADAgrEQ9ppuHX3oE83qK83/R9o7/h8P+eeT+Wy0B0MYn0nsXRLCaTOPZ4G7YnOXLQnhgzOpifrA2iRwMIT3ziSW16pKSiyKJUaSwE+G8sB4EjgUAxNJ4FsxcGsSuCEGDiSBjTFwMAmcGwNhqn4cvz+Uj6PtQF8MrM824oF4FsIvhmJrybZ6rlYVAADAcZLPDnvq7xbOdTjWDHF6eaCvVYZ4BnbDDJWkhnQGW5tWNayhu1UNna1qqI17d/Phl2ruaFVz6TSMjvoMt//ybz4bmijN/8eaz/8rc3Sko3T8P4R1M39j7s48Ml2Lr5+oywAAAAAcg4H/ffGbzeKl+f94e+f/x30iXYXM4bG4G2LLkhDG6gNZtX9YDmRHvQfyAAAAACwEtePxtWPhU/ltdop2Op8u5584wvzxwP/4nPl7Dz64vll/S/P/ifbO/++vv806cSj24mtLQlhUCPwg9rIamDESAz/+ZH0gH/+huAFuilXlJybUqropllgfA2NJYF+jEj+slTitPpA/WbXG99bGMZWXKAQAAADghIu7A+Jx+Xj+/4d+s/qaZuVK8//1R3b+/8w8uHR6//RACCu7Q+hKfxjwWH+2MGAMDHbkiYf6s7q60qqu7w/hnOrA0qpezNf/707XGHyqL6sqBk770P7Xz6gmvtkXwspi4OnP3XlWNbEzCdQa/8u+ED5QHW3a+HcWZY33pI1/fVEI7y8EalVdtiiEamO9aVWPVPLrGKRV/XMlhHcVArWqzq6EsCsAsEDF/0o3FR/csevaLRumpye3z2Mi7sPvC5unpidHN26d3lRp0KdNSZ/rljG6vjymdq9881y+RNFF964bbCdd+53gWLGtfD9+6cTB/H78LtQzM87VPXV316RD/siHy02EwjepRkPunOch9xcrmX0SS/XH/L1hICy6esfk9tEv
bti5c/uq7G+72Vdnf+NhpmxbrUq3Vf9cfWvj5dFwtazE0W6rZcVKVu68ctvKHbuuXTF15YbLJy+fvGrV2avHzhxbM/bxM1dWRzWW/W0x1GVzVZ0M9a072xzXcRzq6d2FSk7Ep4aEhMRCS2wdWNb0/+TS/H9b8/l//NSJn/z5+gyNjv8Px8P82eOzh/nXx8C+do//Dzc6ml87MWAkCeyOgd0O8wMAAPDOECf5cW9m3Cv90+XfeblZudL8f3d7v/8/Tuv/15auP7/RMv/LY4mxRuv/p8v819b/391o/f90mf/a+v/73ob1/6+uBZJN8gvr/wMAAO8EJ279/5bL+6cXCChlaLm8f3qBgFKGlsv4t3uBgCNe///5//yr/w5NlOb/t7Y3/7dwPwAAAJw8vvxn1/xOs3hp/r+vvfn/iV//LzQ6/3+kUWCi0cKA1v8DAABggWq0/t/wjf2XNitXmv8faG/+H0+76KzLHWt9cyhb0y6ka9q9NlT7yQAAAAAsDJ1hdLSnzbx1K6OuPfo2n8mXAm2WLnrxTw4f2fn/B9ub/9f9LuOWJ5be3bX8gsffvG/F0pf3f2zvK7PH/wEAAID50+5+CQAAAAAAAAAAAAAA4O334n/sWdMsXvr9f1g383ij3//H6/7F3xe8uy53rLX1+n/5/Qs/fc+umSULHxsK4cPFwJY9W04J+bX5lxUDD1+y/D3VxJ60xIMvnPtSNXFpGvjUilPfqCbOSQLr4yKJ700D8aqKbyxOAnF5xX9PA3F7HEgDvXngq4uzcXSk2+qng9m26ki31bODISwpBGrb6v7BrI2OdIC3JYHaAL+QBuIA/zwPdKa9umcg61UMDMaidwxkvQIA4KQVvwX2hM1T05Nj8St8vD29u/42qluy7PpytR1tNv9cvjTZRfeuG2wn3ZV+F5291nhPqFSHsKr0dbWYpWNmlMenlhab7t0NhtxqtbfOBuVSR7rpehuPqC8b0ejGrdObeloOfE3rLKu7W2ZZVZrsFLN0zmzSNmppoy9tjKjNbdNGl+P9zjA62pXk+oMYHA51Wr0i2v29fnGdv0avgmKeqw7v/VWz+krz/+H25v+V4rjeyC8GsDteWe/vlljmHwAAAObXV9f++hvx32dvfPTpZnlL8/+R9ub/cQ9Wfig429txMF7/f++SEGYurT+cBe6KzV22JIQPzqQmYonsgvrnxxJjWeCuuMNkeSyxfqK+qkUxcCAJ/GQoDxxMAodiIN9LsT/ku3L+fiiEs2ZS6+pLbIslhpPAZ2JgJAmMxsBYElgcA+NJ4NXFeWAiCfxbDISp+m117+J8WwEAAByJfJ7VU383pPO8A92tMnS0ytDfKkNnqwyVVhkajSLe/3bM0JOcvNJRyNST1tqX1FLKEC+Gf8T9KmUIP6zPmRYsNR3PP6idb9BRn+GBT3RXQhOl+f9Ye/P//vrbrPVDcf4/e/2/LPCD2L2vxVPHR2Lgx5+sD+Q7Bg7Fye5Ntaom8hL5pP2mWGI8BkaSwLYYGE8C69flgX3vqQ/kM+1a43trjU/lJQoBAAAAOOHiDoK4mybO/+/Y8ZWBZuVK8//x9ub/sb2BYmM3xFoPLw7h/o7Z3tQCKwazQNyPMRh/Hv++wRBOKezgqJWY7M9K9CYNh+/3Zb9Q702r+l5f9uODeP/Cpx55+OZq4ra+EJYW9r7U2ni+krXRlwbO6M0C/Wlga3cWiHt+aoHvdmYBOGa1vYLxBZWf6lIzPHe5Bq+/d8o1QdPhlfaBzpFvrt9czZfSDtd8n2rNkT1tTfffctyU3h4HvdsW4rtt2Lut+EUq/4by1myoEjo3TW7ecPX0zvhI8ZesJfP0PBd/pdpO+ji8DncffW9bq6QdGEs+PsbmLjf367AjVnfLE0vv7lp+weP3rVj68v6P7X2l7W40EH8o/Mh1/zr4o8LmnW+VkL/mFtznyYTPk4X438CIpy2EsO7Vr9/ULF6a/0+0N//vTm5n/DpuzB1LQvhIYeM+Fjf/
Hy/JPgcLgexT8l3lQHbI/b+GGn5yAgAAwPFW291R218wld9mJ4Sn8+Ry/okjzB/3V4zPmb/dfvf/9SVLm8VL8//1zef/i5JuOv7v+D/zxPH/OZ3su6IXpQ/sPqZd0aXqmBeO/8/pZH+3Of4/J8f/Hf+fi+P/LTj+P6eT/WkrfUva5ktXCOHlP3ro2Wbx0vx/W3vzf+v/zb1oX239v/WN1v/b1mj9v93W/wMAAOZVg4Xm0nleafW+UoZ09b5ShpYLBLZcYtD6f0e8/t9Lpz//m9BEaf6/u735f3w5DBRbXyjr/42sa1DVrTGwzcKAAAAAnIwa7SAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADg7fXAP/zPpmbxR3973rMvXDx+xZ61F7963XmnPhnC1MzjHVm4o//G28d/fve59+x5YPUd9x0+/6OVvFxPfvu7dbljrW8OhbCv8MhgTLw2VL0zG7jw0/fs6q4mHhsK4cPFwJY9W06pJr41FMKyYuDhS5a/p5rYk5Z48IVzX6omLk0Dn1px6hvVxDl5oCPt7j8uzrrbkXb35sUhLCkEat29YnF9VbU2/jQPdKZt/NNg1kYMDMai3xjM2oiB6VhialEIK7tD6EqrerSSVdWVVvUvlayqrrSqL1dCOCeE0J1W9UJvVlV3OvIne7OqYuC0D+1//YxqYl9vCCuLgac/d+dZ1cQXkkCt8b/oDeED1ZdM2vi3e7LGe9LGb+sJ4f0hhN60xC+7sxK9aYkXu0N4VyFQa/zz3SHsCrwjxA+fuk+0Hbuu3bJhenpy+zwmevO2+sLmqenJ0Y1bpzdVkj410lFIv3X90Y/9ude/tLF6e9G96wbbSXfn5Xpmury6p+7umpO997Ff/cVKZp+PUv0xf28YCIuu3jG5ffSLG3bu3L4q+9tu9tXZ3648mm2rVQtlWy0rVrJy55XbVu7Yde2KqSs3XD55+eRVq85ePXbm2Jqxj5+5sjqqsezv8RjqnSd+qKd3Fyo5ER8AEhISCy3RWffpNnayf5CXvujPdrQnVGY+oEvTimKWjplRHo9Brz3KER/N95SWI1pVmjiUsqyeI8v19VnWlCYTs7X0ZVlmvteVJofFxjpnNmm83xlGR7sabYfh+rvFzfuzY9i8z+Sbrt00AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP/HDhwIAAAAAAD5vzZCVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVFXbgQAAAAAAAyP+1Eaqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqwA8cCAAAAAML8rcPo2QAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgEsBAAD//+erI4o=")
r0 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f00000002c0)='cpuset.effective_mems\x00', 0x275a, 0x0)
write$binfmt_elf64(r0, &(0x7f0000000140)=ANY=[], 0xfe6f)
ioctl$FS_IOC_RESVSP(r0, 0x40305829, &(0x7f0000000300)={0x0, 0x0, 0x0, 0x30000007, 0xe0})
write$FUSE_BMAP(r0, &(0x7f0000000180)={0x18}, 0x18)
r1 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000440)='blkio.bfq.avg_queue_size\x00', 0x275a, 0x0)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0x2000002, 0x28011, r1, 0x0)
ftruncate(r1, 0xc17a)
syz_mount_image$exfat(&(0x7f0000000140), &(0x7f00000002c0)='./file1\x00', 0x12087, &(0x7f0000000300)=ANY=[@ANYBLOB="7001000000000000a3ca68871bfd18cfc7"], 0x1, 0x1510, &(0x7f0000002d80)="$eJzs3Au4TeX2MPAx3ved2ya0klx25njHZKVNL0kSSZJLkiQhyS0hSZIkJLfckpCE3JNcNiG5xU7b/X7JPUmOJElCQsj7PVvO55xTvr7vf+pznrPH73nmY4611hhrzDlY82Lt/V37QZXqVi5fi5nh34K//tENABIBoC8AXA0AAQCUyFEiR/rzmTV2+/feRPy5aqdc6Q7ElSTzz9hk/hmbzD9jk/lnbDL/jE3mn7HJ/DM2mb8QGdmmqXmvkSXjLnL/PyOT4/9/CfU/S5P5Z2wy/4yj1+88JvPP2GT+GRnL/DM4mX/GJvPP2GT+QmRkf8p95EwXi/0H3M/+C5a/76p/rw781X0G/6O8K/c3TwghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCFERnLaX2IA4MKKvtJdCSGEEEIIIYQQ4s/kM13pDoQQQgghhBBCCPHXQwCjwUAACZAJEiEzZIGrICtkg+xwNcTgGsgB10JOuA5yQW7IA3khCa6HfBACgQWGCPJDAYjDDVAQboRkKASF4SZwUASKws1QDG6B4nArlIDboCTcDqWgNNwBZeBOKAt3QTm4G8rDPVABKkIlqAz3QhW4D6rC/VANHoDq8CDUgIegJjwMtaA21IFHoC48CvXgMagPDaAhNILGv5+v/8/5L0FneBm6QNcL33ToAa9AT+gFvaEP9IVXoR+8Bv3hdRgAA2EQvAGD4U0YAm/BUBgGw+FtGAEjYRSMhjEwFsbBOzAe3oUJ8B5MhEkwGaZACkyFafA+TIcZMBM+gFnwIcyGOTAX5sF8+AgWwEJIhY9hEXwCabAYlsBSWAbLYQWshFWwGtbAWlgH62EDbIRN8Clshi2wFbbBdtgBO+Ez2AWfw274AvbAl/+P+af+Jb8DAgIqVGjQYAImYCImYhbMglkxK2bH7BjDGObAHJjz798JwTyYhEmYD/MhISEjY37Mj3GMY0EsiMmYjIWxMDp0WBSLYjG8BYtjcSyBJbAklsRSWBpLYxksg2WxLJbDclj+9jkAWAErYSW8F+/F+7AqVsVqWA2rY3WsgTWwJtbEWlgL62AdrIt1sR7Ww/pYHxtiQ2yMjbEJNsGm2BSbY3NsgS2wJbbEVtgKW2NrbINtsC22xXbYDttje+yAHbEjvoQv4cv4MnbFCqo79sAe2BN7Ym/sg33wVeyHr+Fr+DoOwIE4CN/AN/BNHIIncSgOw+E4HMuqkTgKRyOrsTgOx+F4HI8TcAJOxEk4CadgCk7FaTgNp+MMnIEf4Cz8ED/EOTgH5+F8nI8LcCGmYiouwlOYhotxCS7FZbgcl+FKXIUrcY36+77eiBvxU/wUt+AW3IbbcAfuwM/wM/wcP8cBuAf34F7ci/twH+7H/XgAD+BBPIiH8BAexsN4BI/gUTyGx/EYnsATeBJP4Wk8jWfwDJ7DF5K+qbOj0OoBoNIZZVSCSlCJKlFlUVlUVpVVZVfZVUzFVA6VQ+VUOVUulUvlUXlUkkpS+VQ+RYoUq0jlV/lVXMVVQVVQJatkVVgVVk45VVQVVcVUMVVcFVcl1G2qpLpdlVKlVTNXRpVRZVVzV07drcqr8qqCqqgqqcqqsqqiqqiqqqqqpqqp6qq6qqEeUjVVd+yNtVX6ZOqqgVhPDcL6qoFqqBqpN/Fx1UQNwaaqmWqunlTDcCi2VE1cK/WMaq1GY
Rv1nBqNz6t2aiy2Vy+qDqqj6qReUp1VU9dFdVUTsbvqoaZgT9VL9VZ91HSsqNInVkm9rgaogWqQekPNwzfVEPWWGqqGqeHqbTVCjVSj1Gg1Ro1V49Q7arx6V01Q76mJapKarKaoFDVVTVPvq+lqhpqpPlCz1Idqtpqj5qp5ar76SC1QC1Wq+lgtUp+oNLVYLVFL1TK1XK1QK9UqtVqtUWvVOrVebVAb1Sb1qdqstqitapvarnaoneoztUt9rnarL9Qe9aXaq/6m9qmv1H71tTqgvlEH1bfqkPpOHVbfqyOqqzqqjqnj6kd1Qv2kTqpT6rT6WZ1RZ9U59Ys6r7wCjVpprY0OdILOpBN1Zp1FX6Wz6mw6u75ax/Q1Ooe+VufU1+lcOrfOo/PqJH29zqdDTdpq1pHOrwvouL5BF9Q36mRdSBfWN2mni+ii+mZdTN+ii+tbdQl9my6pb9eldGl9hy6j79Rl9V26nL5bl9f36Aq6oq6kK+t7dRV9n66q79fV9AO6un5Q19AP6Zr6YV1L19Z19CO6rn5U19OP6fq6gW6oG+nG+nHdRD+hm+pmurl+UrfQT+mW+mndSj+jW+tndRv9nG6rn9ft9Au6vX5Rd9AddSf9iz6vve6iu+puurvuoV/RPXUv3Vv30X31q7qffk3316/rAXqgHqTf0IP1m3qIfksP1cP0cP22HqFH6lF6tB6jx+px+h09Xr+rJ+j39EQ9SU/WU3SKnqp7X6w08/8i/93fye9/4d036k36U71Zb9Fb9Ta9Xe/QO/VOvUvv0rv1br1H79F79V69T+/T+/V+fUAf0Af1QX1IH9KH9WF9RB/RR/Ux/bP+UZ/QP+mT+pQ+pX/WZ/QZfe7iPgCDRhltjAlMgslkEk1mk8VcZbKabCa7udrEzDUmh7nW5DTXmVwmt8lj8pokc73JZ0JDxho2kclvCpi4ucEUNDeaZFPIFDY3GWeKmKLm5svlBxc/4f4w/zL9LZ18Mb+xaWyamCamqWlqmpvmpoVpYVqalqaVaWVam9amjWlj2pq2pp1pZ9qb9qaD6WA6mU6ms+lsuiCYbqab6WFeMT1NL9Pb9DF9zaumn+ln+pv+ZoAZYAaZQWawGWyGmCFmqBlqhpvhZoQZYUaZUWaMGWPGmXFmvBlvJpgJZqKZaCabySbFpJj0A2u6mWammWVmmdlmtplr5pr5Zr5ZYBaYVJNqFplFJs0sNovNUrPULDfLzUqz0qw2q81as9asN+tNmtlkNpnNZrPZaraa7Wa7QQOwy+wyu81us8fsMXvNXrPP7DP7zX5zwBwwB81Bc8gcMofNYXPEHDFHzVFz3Bw3J8wJc9KcNKfNaXPGnDHnzDlz3pxPP+0LVKACE5ggIUgIEoPEIEuQJcgaZA2yB9mDWBALcgQ5gpzBdUGuIHeQJ8gbJAXXB/mCMKDABhxEQf6gQBAPbggKBjcGyUGhoHBwU+CCIkHR4OagWHBLUDy4NSgR3BaUDG4PSgWlgzuCMsGdQdngrqBccHdQPrgnqBBUDCoFlYN7gyrBfUHV4P6gWvBAUD14MKgRPBTUDB4OagW1gzrBI0Hd4NGgXvBYUD9oEDQMGgWN/9T63p/M/YTrEnYNM0H3sEf4Stgz7BX2DvuEfcNXw37ha2H/8PVwQDgwHBS+EQ4O3wyHhG+FQ8Nh4fDw7XBEODIcFY4Ox4Rjw3HhO+H48N1wQvheODGcFE4Op4Qp4dRwWvh+OD2cEc4MPwhnhR+Gs8MvEgH44j+HhWFq+HG4KPwkTAsXh0vCpeGycHniinBluCpcHa4J14brwvXhhnBjuCn8NNwcbgm3htvC7eGOcGf4Wbgr/DzcHX4R7gm/DPeGfwv3hV+F+8OvwwPhN+HB8NvwUPhdeDj8PjwS/hAeDY+Fx8MfwxPhT+HJ8FR4Ovw5PBOeDc+Fv4TnQ59+cp9+eCdDhhIogRIpkbJQFspKWSk7ZacYxSgH5aCclJNyUS7KQ3koiZIoH+WjdExM+Sk/x
SlOBakgJVMyFabC5MhRUSpKxagYFafiVIJK0D983ZjupDvpLrqL7qa76R66hypSRapMlakKVaGqVJWqUTWqTtWpBtWgmlSTalEtqkN1qC7VpXpUj+pTfWpIDakxNaYm1ISaUlNqTs2pBbWgltSSWlErak2tqQ21obbUltpRO2pP7akDdaBO1Ik6U2fqQl2oG3WjHtSDelJP6k29qS/1pX7Uj/pTfxpAA2gQDaLBNJiG0BAaSsNoOL1NI2gkjaLRNIbG0jgaR+NpPE2gCTSRJtJkmkwplELTaBpNp+k0k2bSLJpFs2k2zaW5NJ/m0wJaQKmUSotoEaVRGi2hJbSMltEKWkGraBWtoTW0jtbRBtpAm2gTbabNtJW20nbaTjtpJ+2iXbSbdtMe2kN7aS/to320n/bTATpAB+kgHaJDdJgO0xE6QkfpKB2n43SCTtBJOkmn6TSdobN0jhLgPHlKtJltFnuVzWqz2ez2avuvcR6b1ybZ620+G9pcNvc/xWStTbaFbGF7k3W2iC1qb/5NXMqWtnfYMvZOW9beZcv9Jq5i77NV7f22mn3AVrb3/lNc3T5oa9hHbU37mK1lG9g6tpGtax+19exjtr5tYBvaRraFfcq2tE/bVvYZ29o++5t4gV1oV9nVdo1da3fZz+1p+7M9ZL+zZ+xZ28V2tX3tq7affc32t6/bAXbgb+Lh9m07wo60o2xgx9ix/xCPvhBPtlNsip1qp9n37XQ74zfxfPuRnWVT7Ww7x8618y7E6T2l2o/tIvuJTbOL7RK71C6zy+0Ku/J/97rUrrcb7Ea7035mN9stdqvdZrfbHRfi9O3Ybb+we+yX9qD91u6zX9n99rA9YL+5EKdv32H7vT1if7BH7TF73P5oT9if7El76sL2p2/7j/YXe956C4ysWLPhgBM4EydyZs7CV3FWzsbZ+WqO8TWcg6/lnHwd5+LcnIfzchJfz/k4ZGLLzBHn5wIc5xu4IN/IyVyIC/NN7LgIF+WbuRjfwsX5Vi7Bt3FJvp1LcWm+g8vwnVyW7+JyfDeX53u4AlfkSlyZ7+UqfB9X5fu5Gj/A1flBrsEPcU1+mGtxba7Dj3BdfpTr8WNcnxtwQ27EjflxbsJPcFNuxs35SW7BT3FLfppb8TPcmp/lNvwct+XnuR2/wO35Re7AHbkTv8Sd+WXuwl25G3fnHvwK9+Re3Jv7cF9+lfvxa9yfX+cBPJAH8Rs8mN/kIfwWD+VhPJzf5hE8kkfxaB7DY3kcv8Pj+V2ewO/xRJ7Ek3kKp/BUnsbv83SewTP5A57FH/JsnsNzeR7P5494AS/kVP6YF/EnnMaLeQkv5WW8nFfwSl7Fq3kNr+V1vJ438EbexJ/yZt7CyNt4O+/gnfwZ7+LPeTd/wXv4S97Lf+N9/BXv56/5AH/DB/lbPsTf8WH+no/wD3yUj/Fx/pFP8E98kk/xaf6Zz/BZPse/8Hn2DBFGKtKRiYIoIcoUJUaZoyzRVVHWKFuUPbo6ikXXRDmia6Oc0XVRrih3lCfKGyVF10f5ojCiyEYcRVH+qEAUj26ICkY3RslRofSP+chFRaKi0c1RseiWqHh0a1Qiui0qGd0elYpKR48+UCa6Myob3RWVi+6Oykf3RBWiilGlqHJ0b1Qlui+qGt0fVYseiIpHD0Y1ooeimtHDUa2odlQneiSqGz0a1Ysei+pHDaKGUaOocfR41CR6ImoaNYuaR09GLaKnopbR01Gr6JmodfTsHz7fLeoe9YheiV6JvL9fz43Pi8+PfxRfEF8YT41/HF8U/ySeFl8cXxJfGl8WXx5fEV8ZXxVfHV8TXxtfF18f3xDfGPe+ciZw6JTTzrjAJbhMLtFldlncVS6ry+ayu6tdzF3jcrhrXU53ncvlcrs8Lq9LSj8OutCRs45d5PK7Ai7ubnAF3Y0u2RVyhd1Nzrkirqhr5Bq7xq6Je8I1dc1cc/eke9I95Z5yT7un3TOutXvWtXHPubbuedfOveBecC+6Dq6j6
+Recp3dy66L6+q6uW6uh+vherqerrfr7fq6vq6f6+f6u/5ugBvgBrlBbrAb7Ia4IW6oG+qGu+FuhBvhRrlRbowb48a5cW68G+8muAluYuJEN9lNdikuxU1z09x0N93NdDPdrORZbrab7ea6uW6+m+8WuAUu1aW6RW6RS3Npbolb4pa5ZW6FW+FWuVVujVvj1rl1boPb4Da5TS7x4qnBdrfd7XQ73S63y+12u90e573vuve0d/vcfve1O+C+cQfdt+6Q+84ddt+7I+4Hd9Qdc8fdj+6E+8mddKfcafezO+POunPuF3feeTcu9k5sfOzd2ITYe7GJsUmxybEpsZTY1Ni02Pux6bEZsZmxD2KzYh/GZsfmxObG5sXmxz6KLYgtjKXGPo4tin0SS4stji2JLY0tiy2PeX/95sjn9wV83N/gC/obfbIv5Av7m7zzRXxRf7Mv5m/xxf2tvoS/zZf0t/tSvrS/wz/m6/sGvqFv5Bv7x30T/4Rv6pv55v5J38I/5Vv6p30r/4xv7Z/1bfxzvq1/3rfzL/j2/kXfwXf0nfxLvrN/2XfxXX0339338K/4nr6XP+v7+L7+Vd/Pv+b7+9f9AD/QD/Jv+MH+TT/Ev+WH+mF+uH/bj/Aj/Sg/2o/xY/04/44f79/1E/x7fqKf5Cf7KT7FT/XT/Pt+up/hZ/oP/Cz/oZ/t5/i5fp6f7z/yC/xCn+o/9ov8Jz7NL/ZL/FIPicv9Cr/Sr/Kr/Rq/1q/z6/0Gv9Fv8p/6zX6L3+q3+e1+h9/pP/O7/Od+t//C7/Ff+r3+b36f/8rv91/7A/4bf9B/6w/57/xh/70/4n/wR/0xf9yjP+F/8if9KX/a/+zP+LP+nP/Fn/fe/+W3yoUQQggh/gv80Y/6d/+dx9TFBS783zlAti15D/xrzXW5fl3vpZJaxADgma7ta/99qVChW7duF1+bpiEoMAcAYpfyE+BSvBiaw1PQCppBsd/tr5fqeIYvV99fqP8b6VcUWeBf699ymfojZ12u/omL/cfnACQXuJSTGS7Fl+oXv0z93E0u2/+v9TN/NQ6g6T/kZIVL8aX6ReEJeBZa/dMrhRBCCCGEEEKIX/VSd7T9g+vPC9fnSeZSTia4FP/R9bkQQgghhBBCCCGuvOc7dnr68VatmrWVlSuw0j7br1P4T+nnMisJ/xlt/HkrePHu1X9KP1dgJXPtX3fBZV9z5T6ThBBCCCGEEH+NSyf9V7oTIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBAi4/r/8SvHrvQ2CiGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEEIIIYQQQgghhBBCCCGEEFfa/woAAP//A5oN8Q==")
ioctl$FS_IOC_GETFSMAP(0xffffffffffffffff, 0xc0c0583b, 0x0)
sync_file_range(r1, 0x0, 0x0, 0x3)


syz_mount_image$hfs(&(0x7f0000000080), &(0x7f0000000240)='./file0\x00', 0x200000, &(0x7f0000000100)={[{@codepage={'codepage', 0x3d, 'cp857'}}, {@type={'type', 0x3d, "56a167c1"}}, {@gid}, {@type={'type', 0x3d, "9d4b9598"}}, {@umask={'umask', 0x3d, 0x8}}, {}, {@iocharset={'iocharset', 0x3d, 'cp864'}}, {@file_umask={'file_umask', 0x3d, 0x3}}]}, 0x4, 0x322, &(0x7f0000000600)="$eJzs3U1r1E4cB/DvZDfbzb+l/9hWCh6rBU+lrQdFBEWKF9+ABynWNoXSWMFWsIK4ehbxJggevXkWfQt6Ed+AnnoonvRSPBiZmTxuZ9Ld7UNa9vuB7m428/CbJJPM7LINiKhvXZ/7/u7ClvwTLoAagMuAA6AJ1AGcxnjz4drGykYYLJUVVFM55J+Azil2pVlcC0xZZT6VI+bLpTqG8u/R4YiiKPqxZ6pfRxILVUfovr+LAwzEvVOtbx55ZIejpdvVX3J7WOxgB48wXGU4RERUvfj678RXiaF4/O44wGQ8Dj/p1//C+GanujiOhfT67+jlSMjt879alc331BRO7n0nmSWayjIeE1G2uRvQR1ZhB4i9ZpUqFsdbXgmDqZYq4DmuxnLJxtTjEpKGKLZoG/ppwjA3LVHW9nKDqg2ubINriX+06xo/beO1ubr5Lx3EJD6Lr2Je+HiDpXT8V4+E3Dhq+/iAl8+g45+2l6ha6etUhVZm8/dTqpIzyR74+D5rpWfbrk3UZCwmshTRPn73kzhfNey5MILixwq6dTP21qlco8Zcs+nyH2OusfZc3rIbBlOL98PSj1IOjHFGJ16KW2ICP/EBc7nxvyNTT8LeMwu9XKiU8ZFR2p66SmnZjwWqA9/rqmeScsO4o8u9wF1cwvD65uPVhTAMHlT/IukqxyQefSDGh6N8Rz7n0qApX7gADqzSv1EUGVfVcZANtMXsqqZefJs2WZ1mW3pxf5WipbdhbtU1e+LstCnfSc4IvdT+NM0lrxbdZP8d176+aT4gRS7CxuEekElVhVU1DHTUU7weKr35ZHUh7P6UQidPttMxfnv7StXh0NGT4y6h53+5+cq0OuvIB79kNhLtVXiuxBnLDGhEPf7X2QwuLdY6ThxMXoTBbNmc6+x54FxbjQ6SGp+1F+vHceI4fitp/CrDK8sh5vANd/j5PxERERERERERERERERERERERERHRSdPtrxF6+TlBscatPvzHG0RERERERERERERERERERERERERERERE+5O7/y9QU3eMaVR+/99aB/f/Te5LQUQ9+xcAAP//GmNcVw==")
chown(&(0x7f0000000040)='./file0\x00', 0x0, 0xffffffffffffffff)


r0 = syz_io_uring_setup(0xf00, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, &(0x7f0000000100), &(0x7f0000000140))
io_uring_enter(r0, 0x1, 0x0, 0x0, 0x0, 0x0)


syz_mount_image$minix(&(0x7f0000000100), &(0x7f00000066c0)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x1810002, &(0x7f000000b100)=ANY=[@ANYBLOB="002ecbc55fe6d6100837adda58fa7d10ab54aee93b992510be054d731ab7da7e75676e729a84f3b6a90100db5e477dbfee9ea3db9e2cdf0af3e9f7718732aaccc2158ad1dc498162eb5e87e3ec955164b6a97fb2a48d7a569258274a727cb0c7227e8f51529264e568b34e6f7ee018b3562d8fdd26e1b83ab2b09862ea8ac241fca01893c02becc286b2b17bd8c515b3dd02562333f6a7273bc91c9841bf3cb673bc8942336c5cebffbb08f82ba108af50c8dabb9628fc8e59c207395f370146898f1f3400f50f5e0566363558fe2c744cbebda08fe49b2155b62fcbb938b0d78d5e36b5e6b7d1c01f8b6423066333a94bb51f311c1d70dc272c6528d8057273e9bffbc8747d7c8a65b368828d39c69fc42125281702192328142ebb5b396e66db522ca6f2ae2ca64ab0d9d3f0eeb890d6b5a376ab004afb2ccc83293222ce378ef0e8d88e873ee168d615985aaabc293ce789dd163747e965405c11730f23faf8053fb37e93d5a54cdce54c1ce09598258ec5892938c5a16cf0c548695c973b45a0bc95feff28efd824744057c5da581fe9215d1a9e358a9da84b4ceb0f586c2ddbabbe2347dd728b8e05ecf90b4c7c9861ce1af7709c9babcdc2bd2175a8496a48942dc2755a5ed6296745ec0810e42050e657b2c0965d423077136da0140277053c8ce91d0000000000000000000000f81a60f8321a29d95e555edc5822e904e5b3821224bb704efb9aea0f736fa06f46b1023fc644c7879a6315e96f6695f65fef95d6dbf22d80c068a20fc98bad02dbeb3c9f478063d2f2f8fd5e8af7a5d5937e5626c71efa3369e99787e78597c01acfa3b273102993abd03263ae4115a65254d32c517eddeb58bbc458d025fdc566906ac145a9db74f46d10805e6c7560f6740cf29445f6aec713655cdd27032c6413f342d8e76782bdc2d96870cf7e84d15838c48aa6af77086acec169846791fbb50b0f648adbc6f4058870827efcf4da44b43c62f3", 
@ANYBLOB="86e65fc95cec3737ffc083e9420e0fda32ad9e60822bfec34b914022040a90f8741ca4d99221eeaef6d92e8595aacab6491e0f137f25a1eb6ce98ef49a462828f95b2b068d0d761d5b739c59f7553d6f322aa98313062ead06b1ad3b6575bd1804ab3d04d07d4692003ea4284df89a58c389ecd40e7168cf59c6200e16312d9aaa8722fb6a41995420c131b395b247f1e00fa27a838f52cecfdbf295c4f590e11c0121e67f19efbfe3041bbaa77cef1561d494e645805ba55eb8463cf1b56c08e44d71830924371064c7dd3fe5219c8af052190d7376aa62bf5edc726143be953f1494b0f2ac4591e4676317241150e315770ce42000ae7d8d7a416575697ce9989263cf1ac17daa0de31d3defb3c85aba7ba589d4bba6fb05b84fed5d7a3e6fc3692ee0e37c7515caaf9815bad099d93d420ec27e0da0eb2566233ce799632051375cd5b545428d3209708dd8bc9aed1af2b2dd96d8d7617fac7f5f006e165cd1add85134b939cf15175850d2f58d89ab6d4f0aa93ae8ad8f9d4f428b007af33a69188e661ef662cacd512ef14955045acfd4ab4ab699881d2365236c7dd59f7a28f5579b480600f3070bfe97f59e609da493e6750153bca6cad17312baa4879e0fde2f81ae15ce236f64fb898ad4ba6071403b0cafd642e2648d4e3218949dd9e906c76609ac9efe8c6176ad5e9a2f139d7b6e7e248dadbdf1cc74c05ce3ad93359b3928ec022e6ed2512c66bc64087f22a9bc10961477f5ad8acca7d2fbb242d52703ec70225d76713dfd7dc1ce24ab3aede0f39f2740181efdbaf4dd912701fd8c568d8d0daea80a4c9a93a9e28ba48f1dbafaf2e279c0d83c327e045e20d37abc0ecacd0fd6220295c16e0f4849096f0d5a7d337c8460553a5d5b63ed01a55047e22c98e9df86bd22e4ec55b7c311497d97b30709634c58c2eb751a724f6086d562b9c1d3f5fbba7d5c78a9ed08c924ab085ecaf50cb2a0437174550ea20bdba86364bc9453fb4af21f6cac1268118be3eca0ea7d6f94447e7b2cb6247f42836caa7da504f60314182201790974ac7b648a57b8d87b7af022266b3e3fb6303583fa876de471b1d8ac73f0af724581271be1ba839261e7b50f42e1d3c44856f0a5b3e148a7783ed2ed70ee435cec055be29b7997d2e2bb52e39e413b570d383febe08c38310ca77b3622cc45c1a79f0de240f23c0a9a28ea6fc2030f927a1e07fe15ae8e70e170a7152ce5475e199b780390c971c9148f72eff33cac7f0cb23850f1d7f2494733f86bf52df43c2a44b412c5f0cd830e45a9fa097d4f7508fa6e1f33ddf6953b6dbce69aa70f078c4112636c3a902252a17b197ecb8f05beb5a276e0c297990b240bf0156601deb9105be5529e4c6923f14a8c7e1fe6a04cd22919
4a0b974d473ce6726fc481bb7f4e172eb661162d221a6837849fd618849da2d300d25db9cb8386efba1c099edd5ac31bf9fbda2a8a3b824cbe2e953710555f579d8fcf51b6d14bb098cdb68d25b9962cb163b78b89ddcb201f8882e036d60c2348a52536d06f1dc891fcdb6a6baab4ad69051ae5eefd73f0f67eb0f254dd3b460a2cf56a796d7a16d67e6818119cc8c40facc305657bfa74bdfee0faa0eec37cf000ecce3f86cc6f6cb40ad12a46a0184391fe32616531b35fd3bf1c3d28d00aa052fd7c2c1891448db260db1a9717281a25bf705514aaee649e0630da06fe202e3949674554373d878e13d90f2dc3fcf3143f1cf9605f694b7854038409573d12eb872befe1132f79d4d7a9b1964fc1cd5214fb99a0ffd1e88936d01b052479e777340b29dc97c5b02e8ddee2a848ab31cac8696b6cc6f7630f2aaca9b8032780b68f2e26d42ea1b86b40fd499c394cf8880d3ee6fa4fc6dd4ff236d0a82641a95767ae8f9ef5ed219891f78db23a977a017da0b0c1914334cf3c919ba02eae35d7e98b96c6a9e7e0a6f0f6747cf91c008ebe56522e7dbdf7a942b16e01427ba658543b4438686a62015bcdfffcd05ef587d3ce8a916a6ab0a80c17096a3180a57bc20bdadd9b5380a59567bb4a0b10fcdc5c98a4168753489c45dbf922962dd7a8968de0fc2ad6d4abfd1fcc530a8eea9f025ef6b7442a7a340d1f8b477db5d4e65d864ff2cc559640b47f47381c1d2de06e957199db1dadc94ae0a320b6829ec3e7506856869979f791c2122e6991b3789a87ada5f5169040344b94841e296dc963d38e775cdc5e65eee3bd059e55c5cc1fb31f7061acc19e0b8dcf72b583578fd5557bcc6dbf8b5e0b7185e7d7dca2c08589c8f8d57a8df8b015942f853a69d7ba492337b04c5f3e6e19ec5fbc28aa100174c71912681576dd014bd4d496abcfdf75289dde158bae37686b042df4de5de3dd19105887f3e4e291a30c1f19061bbea43ea75afb678b1377c2bd189b86bf60b1084dcd91d23197e944f691153df147d5051f2d5ed1b0b55677c1e7a6d99aaefc08db70e7d4b5a59e0d8fcffa8fd76d42e4c326b1df840538662c60c046d219beeb81a62c28565cf2a71742bc702a5c9bad03dd69d00da46478bb48d0a85e333c651245ea387f286cc9114f97cbfcdfe0dce4e37da4a41943785781123dace55d34ede0a855b36972ed1442b1404df80f9532a0db809948bd241f3383d56c8a6ef57a9e5549c12f2677a960bf54d7cbaf6560bd86a91ad793bedddbb704324126b5ea2f145b8aeb17d1ef863cde3a3e2749f5bbcbd45673269d8421cc1eb90ec66f7764bb657de407119146d74b460c727c0ffe7a1f85db6f43dea39a1b81bc6f65cfeb8ecba31dae17f16c4d726f4bf778b235ae
38c3270c663d1391a4fc48990383dd240714612358ba3d8dfe2dd6f3347146d3e93082a49bdeeec58263a6f7d34e2ed759eb9b0fa298ac063c09ddf39c86d426d1bdbd173cb3283e99cdc301b13557250615e89e870ee820783eec14398fe91996359ff4bfd39f3a36cd5bd697200f5db1b1c4a58bf60d55f99b323450004277e90a32b0ab09327091afe269a871504da4ca79ebdddaa2d246bcbdf3c57c13184cda844255e03b1437ef5e79545b764ec6dd666b17dd0648806805d09e12c68d794a6da81d70454b09fded5d98aaa17d69e0e343995307785cf40e8993dc72d5f2b7ee69efe86ee897d1d0d46a51898c73d09a709fc9180c71d6cae0a6161b4bcd39bc090dd319165d73e312108a1b4b2515c572e03bd4be6184717037e29868d33f7ccc2885af1beae10bef038df839b7cbc126b1baf8f2e14007e49f2d4a95626c625ecdba3333c69cbae6416a18a75769a7020ecdc96630e0a3c920de9329d6bb6b605f6eb5c2ba273bf5efa977bece3bd4ad848fcd2f11a4103f9abed1eee1b7ea2ad1ebb75327125a1eb5f62c6bcd45fcdac8edc6a29aed4a906d86da215b492ff1a726aa4a317af822060a674b9774464d53a477061117cf3a5d38d91bf3df21dfc398ef04f565c53abd7ebb1b41e2bc5bbc5f218598010c829039852216b19d0762cdd7dce5e7620403313180b04a97651350a7c2bf6380363528d733a74a64594d7a801d3fb3fc80cad52a77d5b8518ef89310f8509bef1933ef303562380bb1e381d49b2c08cc56efa1983c63f60107f378704ae1e89b9dfee5b7478e5e6e5dfb62f75387fe010818be933b0504545513739db06db38abc16387f1b7a4f3908a6a2119432997718506a7e750e40215f6d63787aa74cfe039b3d38dec31b9dfbb3ef901a6f66357d60ee51e06f8b72555092b79be38ec8952bda7dc78dc37ef1f35d0506f6a58a1dfe288b5d388df12c5f6f5eeefcc6c4ace03643e7b600904623891d8e31b2ef23fd8137690621f3deaed46cee675b6d968bccb784add5d0ede8e0289a163b00f5a42573936864efcfb2357b6bb5f4e1872aec8f9d7be2059f1d98a3da288016497e5b358dec464e64dfb21edcd591ac769e100db57b3654c3008644839a929063a3c142606efe3edf19f63ef934d7a949bfcf85dcd6b6fd7a64e786f9ddbd1f8bb86ed4203676cc57f4cc32ccbed9888d7be95903dbcf1d60c6e93e7d5683cfd58950f589284e6950651a8624a5bdb4c23d662ff0554914e0c1f1d5e9b189e566d6daba050749cab9e694fb801efd5f6f342cb077912a846536909a9b3b11d95df8983ddc9448f7b1b89017ec4860bd317467d08e6c0656585d4013270fdb5aed829ba2a44725b552014ac4127edb29adf23956e4ed96ef23f052e4ac
88252d5c7842b7e255988d2415fb3a46500c686bc3ea214e5c552b916aa0800cd83bce02e2731c8145a28c4fd19891c5e032e762e1f3a183e091bd5475f682593a0d6ac7697d32ec78142699b9664b7f41194daca4867e9d1dce20c3781b03c18a6d5dcc5f716d8d3fa38ed9604b4a8afc0fefa03b02063a33861d8f17574c6801e6b0bb34c54db351554f3b9f7ae6be472cad47aa41a28382bc5e61a001b5b32c7113a363d6a392e757510858313cb6d14d27e34c8e22486376a6c51bdde8827f2005772ec868b67be57f97d9d341074fc5cdef9678b3b72bcd509d53c9244a85bf790e9cf3dcdece8d66c3bbd5baba730ddff52bbb7a66bfe421684192653479b3588a9b4319c57060eb3fb6a7eec73a51bb5836f50944a5d38a81811b0130ecda2966944f475adb4d828a01c88f7dd29de8ae38265bd382d5e9afd9ee27c378cf56e5e4195ca8a4d46f6c28e1e5e9b4a7a3f6942102dfc249020e79e10061ba420ff2d1181ae6020c9ed2f9f386b8b56bf487b9ef0fe86f4443ac720325720b13596efb718ff28082eafd42353321faad5d4ecc427ac9e9a51950c1e4fe8151d8905108eda21082c09b7858dffbea8ad6bad372e8b37949e4d1b5efde2a1974f1d8e05220ae58bf30cd9b914da9a65b08ad806187cc4385994646811851978a1f9a573f8b01db05941db6b7daedca083981b775cce43228e36e27b591be6c435974b6d0e6c082b08a9d07fd3c138431d8238db64cbaabeff4c522d0504329d913afb4f173d19d8c9c84421199074a91aa0c9ece65ddb46032a6dfcb15f09f582c8b6b4adbcaa2f83727eafa9e01a59fa17d481e3afd4a3e94f860be0c8e6d0fe1031c384b0357784f6d170ab328506e22add3dcf367763c2d9e09737aff5ff4836dd7fbd571548bfb94144403c906a5f2763b619df470843ed660f3c48e6d2f6574c6ccff2ef3dd103ed2b60d643f9a9f8435d8878bbfdfa28414935ba08557e8a21a44df75f21dd7be31013a692729649114e2585be2dfddbfe5f2deb6ab8d0cc4e245ae3b1d99d4d4e6c167d798c438664599e10e839864f6beb76d8d193b1cdde48e185fcb897aebc659f14890056b3cc044f32f24f8564fb8e5e6bf92a4365051561e0236916f8c9ff5115793eea703452ca42bfd6bdc3ad80c36e38d090bb8a992ab44ac41c7f661b586cf64ad3822b626c5418a8619a0697c5fd27a59f82b188c38006daa29d3ae07acd4c8bfcb3a3a11428fa92b03d1ee330ce4142b97894222244d43c4bfb8a0d202f7d2fddd32a6fd56a8f32974aa6d036bc641d6fbd85b54be459845f4aaff42d8ae4315d35b08e7cedff13cdf1eb8b3b76e1e82aa91564ef26d8ecb7d757d081a1455f14c6f5b0a53c8e4376083303427ad743b83d28d85724bdb
65cada011b2500b058ea5b9124c07824c2b22595cbbfe9287155896f323b0f914334dc4e9db35717f0f6ef50cd4192a9b05b2fdba5840bbc81914b8fe1f34bef54c6f2e1068f8d1a9b51b7b1d9226287cfd7b50eb746a9e2b289fe43d02d8031928992f1b22409221e4557285d440dc2a20b677e8f6160eeb81fc0beca14482c7e6d62105df6b5e9ef5d8607d759513bdd3dd309014811cfc5dc184d990e7f3c1461aa1d1d16e8dc9973d967de1ad69ea605b777213ee511f44396eafe9c090f4997b5c6", @ANYRES64=0x0, @ANYBLOB="375e3992c06be11221202d161d77ad3db665444c4d5065ac49c96b4a4d3f4fb2af3a75bf1326dcf5df52724468db6dadc81ba536783e5e15b2f4538c9bb51509447f507be622f59b61d4ac3aced88579ba33c69b322d642e51e73bcf744c3074fe82902c5999f553bc907eb64c2d4cefc3a5a2037ce6c7c55793a3af1d654e05173054758efca95f059b0fc4354d17b034cd63eb970d3ed859ee0922dd611b714ace6ffb58ac7ed9c705b8671c98ccde8143398357aabf14aa26edeb60c33a292bb76505423992c0600d321a5d321a1ca26de2aa0fb21a0459e6a84fea7cfc57b9356d4cfd40a8c46c818be05358c2c710c9f6f1721c243dbc425acffdc35f27ab3ee614cf027be7204c7dac2baa857317b75008550205397f49c4d01ba70872c0965eb16e26a2852b2380c68eea67f7484a8d3ae82002feccb16721c9f54151ec9f5fa212b3e1f745f329cfc365e3aa6d3d829e81ba158faf5fe0d7dd60f17f410f6593f0fdb6f78547486ceca5acc45a51d37212bffda73a660cda91165d0030ed92f8e11eb413b04e8743f591deb42efad7e7cc72e0d4cc62adeba6e9ad38d339d413b55c6094f612f711a6e1039742dee793189650038cecbed1f24d1c589864d33da4fbc7b9c54abcedabf0c1588bb9ee335afd2c208ef0962bef09e4d1733f4de9ec4699364f246266b25412bd3fb058697e9cd594ed9f077e28d1c14eedb623fe149fb6d67ab845c2c84fc58df1c0af7553dc0cfc8c1a9a2e16973e9c5c7a063e2b3c97aa0dbf0af7dee3d84ad4cbb6eed21a4afc86b0a659e9696e4068d720d16be2dcbd6b8e44143d4e30a7cc895f70222dd11ef989b87bceba801a09a7d0420962cc37afdd76fd4e8a9da8e5865bb3e355aa00c1f09295ac3b575cfecf10e699efb4dcfaacaa75b5ffc3261bd5240a364034660703da23f80a9b23337f75bd0ac20e41a8789d7bc42b63676b99d40741198ab27b31e26c5ed655021ad20b412d0ee90c940ba30afe3e3ddffd3a674f246cc01e4d68fad953faf5d888e02f7c2f045c32bb9eb4cb7806ba199ed62ae4f72b1ff22458473fc8cec55d8baeabb015fd769c9aa6062317f5b04292273a7aa9d4
b460fd67a4a5860278f04556c6f2aee59d6c0abf34f76031b4e0093f2415f04541f191b50ff473db3a927f4217b42e7adeb67e1190f9b2ab317e0a5fc993ed8f1f99ffc65d42d79c75dbe6b39e2e729d14f26b7735e325532e98d259af26450f9b4b8073ad91a557fd5e3f4438cada86dbd52341b6218014339fa68c0af9069bee818e61ce736af650afc57b7736bd18387a5a31d39e89dbb50961b51ef7771f51bea188507adcb6aba577d12dc404f71110fac39dce110a9aaf53ed451d002ab52372a1e534205c3f6dcc9033496ed87f716dd5acef6b31af24d360c8fcef10d8299569895ab5263e8e5273eadf4057bf15041c6aed21cc66083b7342c346650102895877f871344d48a76565de75708e79abb9555095475b04a9338921f0db062c5ca0ea23014502271f5628115c897c9015f31cc24986987942d7ab9650040fcc558590b9cb73fa1f352d5edd33a3669a52448826919c2a669c9628f10a7f9aa10c05a54c7b64ef8b13d997db63ddb9a90b8b6b8c757da1d229f6330be424836285c99ba08aee559c77c65e76dce358590fd2760cc29a9e1c3824eb9a638f37320313ada1590af6beb131f828594357019b17869f05795e912cd385101aaef8d2008c594a096966997f64d4de0cb407afd01d169b34e4825b42f09a698aa33f877548f9a4f90655e2a4481dd2befe5180a232e54e86188eaa28277b2bf9cb0a3b46e788c8c476ce80da46b46fd13072950d2c637637ae134a830fb5894a368fdbe2a593a84b8b1fe3e4d9b0b8254c0dc1ca116c632abb5fa1fe4a3b1f3d7b7ba1e95d33e4900ee32427ed88c094c7a0aa725ba9a8ae7bb68c466fa808b2e7046402f6e90e28098c6fbc6388d6d6a8eba46041fd067d2f45f8ca38ee8820a4ce4a5e1a80df67af0f211441aa6186c3db017416c24d4cbc6d5ffb415a4b05fb7c581f6d144da8377684cfd6525e377882a2a939f6d2f588a2c91860573e37f58fd77549a92bf602d89dd5659e5f97f6841b14b7aeda5e3d090c268a5eda7afa3595119e908192849347d6ba275881c31d99b848e6827faf10b698c24095f210bdf308f2acc10432be067c55ec7f230bdb04fee4b59ad25f26226d2064b49a8754e436f36184d2761be0dab6592461caa7fcb08e94fedafab0f244acd5a0c9fd76a22b546c218a6849b214e8297b77fc9080bd534b54b1f6161e976ab1cf2596c92a02e9541bdcfcc45ef2ab4e1e784061af1d57477f56840d36a3468478b1ddd448454b65fe4c8057654ff2494a119d35e43e7bdcc965ff2852f9bfd8fb65968f308604d08ec146dba28d5a1f8def6c96ce9ca893a5e9d39e131af7c75dddd2198fa84f857237e413b6a2af3cbb1bf19e9a62e0d51ea998d40193087082831a580ae4b05739c8b3
d981d384003abdd354456887aa521b30a1cb6cc1548be8b8f521ac65ea408c756ec1e791eb99d94911580b8e91b13b62c3ead90166072e1bdedb91582ebf0339f9a58641ff7168a13624024d8e956855d15ed1751fe4c7a0511d44a9328e052fdd543b617f949e1b7e19d410a2f23a181371d98f1323468137d768461b0fca7f76f1654bd98300972bd75666608cc9f046476da7359854081b4355b1ff920b7568f20f53feaac5bf9e9e5ab1af8fb9ab2155af8a1286c68d3e1e4272606cd84824503f827f6dc20640028c30ad7f0a77d322123562d174f4d098c486b7bcaf76435b76a3ce9e06c9767ad0a55503a27cddddee90385d5fe93cfdb4e2c03ccdb9b9e8433e6af6465e1bcd20a7fda3f53613d5c22dcf06363a2b74b27cd4538a92b570a2bf208c036a5b9e5aa338dfc82f3f86223fcf228f682187a2b4ba88b41300994f7860fdfeb336f3933f338dfc0c461f30e51c796c9e3748c393ce201afb9da818f8cea1a7e43a0a662b2cb23d16e575177cf772a45b0e870c81d2a625f91f8fc7c60f372e228536bc1de1ee6ff1a1e411c9ff740a547c62cab8ea3302e5d121fdc7cb9763fc05ccd2e9afd98cbb68ac9fe5f4c8a73c61caf3c6d339f8035d363ed9058a83560efc728dc61163ef67bbbaf051a5038b3143ebe14b8a7bced5c5ce6053db12d6a4eeef0c050d40bcade6a1fa1c807bdb12bbc635a4a284f9e12cd037db49fa95ce7a7990ac253cf60bb2a81c39d08cd68fa2d2f035beead1d9a02fe1bbbf73cc08c71d4622be2728a5d0686a1dfd5e1dfa03eddf156fc4dbe5e1365efaea142d03836877d36320881594a2bffbe41856bd1ba3ea78fc40d68d1aa3301fda4dc1e49fcf6ee5ea51bbc7096a6c84b65a9c05e76f506bb828acd7cd070215d0bbb6b419159e330d09e9e3f9902dce3e530dfe178feadde42768972826103a681541797eaf594e5288d443a2961dab9337b5194a760577b509690fa40e1e7a92e3265536db95b90e520b7537c652fc4c0bbb2e76428d9d9191dae0cc079474513c2f6388a94c6d157bc115436d03f9a8e42d19f99fee9e1083a8afbd3aa0a4e23f1a7a6c6d75c45e4ea54f90b6e3bea1d738b410ecd9fd846e60464b342789fbd7cdd81cd37f704036bdf3ebd591b330b34364cd4ed503d1356a831c74370923c6d498caa943e3b7eca1b9d3716ebb5ad4502bb76628b4a11cda7159d20db641423324ddc2a08e6e3254ab5047733755147835971ca4c2707c072c69c88af88721b63b5f8ba83034e76732fa0dd725a538dd3e57ced026e72109823371f12accf189009156c278daf16d87c70d438aa7074c893c117e79684e267d5c9b64bf527772bf7f75a2b7b5cbefdf272105901eeef1ad62b7ad3bde07719a3cfed6892b6f91
f1ea94247b0c8233bff29507097ea6b9912d40947f70fa4ce2130a399a5ca4e9faebe24fc82bccec544ea22e684c1ea1e46d3241aebc61abdc8d69bc6d41f8e534269eb3eb6073d51c013c86cc8d870810c4fd1e0aeb0e0c83c14b296dc506573253d953e3438ee251251691b884fed8905eff4de667b1cdcd8a0b4db8b5f0a3cba448124df097288c19750bdd4318d2d6f6cef85f783f0bf4da320332aa211d4053dd3e62e99822d895d4e58757a51bba293fa118f410cc48695808c239d33b60a6cea2334770369d25f36cd54223aea5a37ccfbeb9b07e872bf72c1ed868ebeb59aaf768cb24152fe7c81f1b19ef7d5a2fd0e15ae061a3bde72936d54cd5a30bc300eabb44c7eb6a2b69cc14dc386da881d57108a375195be503d0701ec01f3841868ee708453eef7930b6ccd211be3370f05a05d294db91636c1cc14b7374d7bf02eb5000c8d6e3a2df660979fc275b77b43b457c3b53a12396b0d809bcea90ba106a3385ad244ff0c544d151f31f50d242776466ec3d9f8cc5bb73f89f7845fed3c4ab340c005d846ec9ef8feda4b3b87e4596bb51de73c141fe07b410f0eea9753f38963633f6ca6ce533e6ed85ea8e28e01098541c3e4dcddd094e2c3de5636c22580495caa0bad814639689fef3c7ce83435021e2907032a9ad1876478c67440a3df10d5ccecf9e2e8232509e8207fd6d68e5d3507ff871c1a73ccf2adde6d800164247949ecabe4edc0e2a13fada9326d2dc4e4a1edb1c9498066638feff434d4bf910fde169288e77776a495b4abf00f7352ec29a8bddef084445d13ea913a810d1c17403b268fd3bb03d20d78f6d3eca57726312e1796991780aee11f09b3048c8b0fa46372b9f2f7ec34d18d04e90db2ed290375a295fa1753aed0b33210ba23053418c42bb2d08929984e5784082810de0e98ec740dc0be77522cb526b6aa609916de0af480e8b8fbd20b4ecc8802a47a2b9f542f3daab393e20d9c3e20b249f7aa64155de85620727bd9be5b1528a0ad16355ab0790ca533a54a33bd91b73db78adfc057c10edc5ea4d3d5b60a4837dfc013601d2d1d9a34f91f1604660560107ace654def9194c588a531d87f0f5260e90fdf7d38087af498ab9c97cc76d4213ec6f1247f2d6bbb50bdc7acf0207fc789828d8964666df6256a2064227cc1b4b5ea963aaf909f8c19f000e8b6d8871757e5b2ce93d4a115449e3d77928839143dc8e7cae8e46cae5b370e761a3d3bde20bdf49079bbeef9dfb8f9e5f3699206dc1096810f632b2346e5efc8e27dd370ce5a4f95f7d787b077144e864908249e4c38c6e2d4955080ed1997f8db0e7e15d3291775f2018487a4ddbc557fc06d86cf2aaba86c52b9e588ff21b3a477210dd1a320710527e01806dbc6faa5d87960b2f
e7c374f620b8f8ee0c228070d7c4baec6f3d0483c5fdb6d9f6b39a983ab9b0f3cca3e605b8c62939201a4057f8ab1d45f9b331e748458a3eedc040e0f48bf0ae4ad50b361710ae3d52884cc0188f24a35ca2bcf50a4d02427320850df314584768c7071879b77ee55ea074aa0c9f6ccb571b255235d9229f86d80fa1b0d0061349197ac54b74254f2377fe4c04d1db62b8ab104694ac02e3b3b335ad9de4bac37408d77f122939f9454e9f566bc7e2074ce758193a7784721871a59ef89eab0f970cf69b2db69bde00b1272ee8f4acf6a505d8bec25ff720c134afd67997dba469bc178e9fdb95921e2a56322bd868a44cd41a273b668c64c21327dd78c652695ded3419e4178566e5fcf12c6b678099f6289f80de95dac340788c277a68329a9bb84c9013821e6bc5bf33c5e0dc73a535bf", @ANYRESHEX, @ANYRES16, @ANYRES16, @ANYBLOB="2da97369bd5bd2a022e4fea628166430fb7a26dae38cd827ad7f8cff5d2246bdd2cc0e8101b9631aa9db6c88c4ea13a8fbc6a23601da47409ecba43e29d90521e4a37f2f57fa7ce2366b5b89b5b9529791fb53b47e83c2014cd5779926a7dd8a0de70a50b2baf658b32d6d108efa8d3b6101762c8308a5b3351fd14516c9c33e6c6bd15e956f84604a27325b8ebb315aff3e39aa98ba22dffb1b6a7c1acafedad4ef237de4595f77f679e98e9dcbf01dc5bcbd5c199b9e95c24b", @ANYRES32, @ANYBLOB="0805c250a09347cb0bca3b9f8a8755b3945d3396e6eb14eb64426210d11830f13cb571b8967902d058a39cf86287f14c080fcb8d529bfcda2a3722d8dc8ebe2c29476750ec92bf56619454329e748b3549ba625812d4422958b334db652bd2b9492784354250a06d9b3f22846434889dbea72d8f13aa590030f83b62254e87e4230846ce6bd18bc79e73a1a1fa67571c3ba78979edf79dfc3410b95c51ce90bcca297c2a6995b236c39c5ec957cf8717fa28a560525b50a3a689d2bc34038bc078356614f0c584b2ae572c025c4d8414161f1100073f6fee746c008cefce574d1c1e0333b07febbd41add7375c604f3c34b6606013a8172cea655aa6580601b3668ac91df25f684745c94ad9ffc15548d32a8608c5acb60bc437052b2dd51eea8957d673499f6f685feefb2332976ce89829d1cd967d7dd29336387ff12a", @ANYRES32], 0x1, 0x1cd, 
&(0x7f0000002180)="$eJzs282O0lAYxvGnBdqZ8ftr48rEhW6kypDo7JwL8AbcTWbqhFjUiBuIibLxPrwMd96JNwCJXkFNSwkUaSltaEH+v2Sm75z06TmTzNs5JSAAe+tm+N2QoUZY+b7/5YGk168k1VOjVikLBLAxvjFp+iTW8uGD9BSA3VD7vXzcTmzwH9PC5C4A7LLxaS3cB/w0pF9/Pp+Poq9Gxv3D+NSUatEPc3lLOsiUHxrh8X5dGs3lbcUvYHxN2L98n+QfKZ4/zLr+aP6jhfzRqmDwyBQYmuHh8cN4/oqkq5KuSbou6Ub0rHVL0u0l818szH8v4/qBIoK/vmZsJFPbxvPNlBPs9HzQPW86nvt0rVlnGlH+Wc68FeVbBfPHOfN2lG+ev/cuEs96mfPqQDrzn/5fz8r+TzR7+bBI/9cL9j+wz3r9wdszz3M/rl+YuVIU+1JMN5LBSHCbXp36VvmayysOt2MZyUXFNyYAG+d86n5wev3Bk0737NK9dN+12icv2set9vMTJ9yXO0V25wC22eyffrbz098SBAAAAAAAAAAAAAAAqnBH0t2qFwEAAACgFGV8nKjq3xEAAAAAAAAAAAAAAAAAAAD4X/wNAAD//7bVPAk=")
rmdir(&(0x7f0000000000)='./file0\x00')


r0 = openat$nullb(0xffffffffffffff9c, &(0x7f0000000080), 0x4000000004002, 0x0)
r1 = dup(r0)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0x3000007, 0x38011, r1, 0x0)
r2 = openat$nullb(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$FS_IOC_SETFLAGS(r2, 0x125f, 0x0)
r3 = openat$capi20(0xffffffffffffff9c, &(0x7f0000003bc0), 0x0, 0x0)
r4 = epoll_create(0x1)
epoll_pwait(r4, &(0x7f0000008040)=[{}], 0x1, 0x401, 0x0, 0x0)
epoll_ctl$EPOLL_CTL_ADD(r4, 0x1, r3, &(0x7f00000000c0))


r0 = open(&(0x7f0000000300)='./bus\x00', 0x14937e, 0x0)
mmap(&(0x7f0000000000/0x600000)=nil, 0x600000, 0x0, 0x4002011, r0, 0x0)
mremap(&(0x7f0000001000/0x3000)=nil, 0x3000, 0x4000, 0x3, &(0x7f0000005000/0x4000)=nil)


r0 = openat$random(0xffffffffffffff9c, &(0x7f00000010c0), 0x0, 0x0)
r1 = openat$fuse(0xffffffffffffff9c, &(0x7f0000001100), 0x2, 0x0)
poll(&(0x7f0000001180)=[{r0}, {r1}], 0x2, 0xfffffffc)


openat(0xffffffffffffff9c, &(0x7f0000000340)='./file0\x00', 0x42, 0x0)
fchmodat(0xffffffffffffff9c, &(0x7f0000000440)='./file0\x00', 0x0)


openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000280)='cgroup.controllers\x00', 0x275a, 0x0)
openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000180)='cgroup.controllers\x00', 0x275a, 0x0)
mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
r0 = openat$dir(0xffffffffffffff9c, &(0x7f0000000180)='./file0\x00', 0x0, 0x0)
mkdirat(r0, &(0x7f00000000c0)='./file0\x00', 0x0)
openat(r0, &(0x7f0000000080)='./file0\x00', 0x0, 0x0)
openat$vhost_vsock(0xffffffffffffff9c, &(0x7f0000000000), 0x2, 0x0)
syz_open_procfs(0x0, &(0x7f0000000e00)='smaps_rollup\x00')
openat$userio(0xffffffffffffff9c, &(0x7f00000002c0), 0x0, 0x0)
pselect6(0x40, &(0x7f0000000140)={0x9}, 0x0, &(0x7f0000000080)={0x3ff}, 0x0, 0x0)


r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='.\x00', 0x0, 0x0)
preadv2(r0, 0x0, 0x0, 0x0, 0x0, 0x0)


r0 = syz_mount_image$btrfs(&(0x7f00000055c0), &(0x7f0000000000)='./bus\x00', 0x300000a, &(0x7f0000000680)={[{@nodatacow}, {@thread_pool={'thread_pool', 0x3d, 0x3}}, {@nodiscard}, {@datacow}, {@ref_verify}, {@clear_cache}, {@nobarrier}, {@thread_pool={'thread_pool', 0x3d, 0x8}}, {@nodiscard}, {@enospc_debug}, {@ssd_spread}, {@nossd}]}, 0x3, 0x55a3, &(0x7f000000e0c0)="$eJzs3X9snHUdB/DnruvaFdeWMOuArGwDJFtEOjdNCCR2bNNpYTnphE3I+gNH0DmtY8NVCCtinIERijWMwQoLbn9MEYqucyiJBewqul8IJtNFBbPFNWOkOBExYTG9u+d299zaHhMpwuu1tM/zvc/z/d73njx/3PvW73MBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABEHwx+N3TL/t3roJ26+ru++8a85e+2D3kuMX3rq1avND20v2dTz31aNVq1qPLF1w0/2JpkfW93d3BkEs2S+W7t9w2fwrr69vuKI0HLDxc6ltZeVQT5nq+mKqMTbnwcF+uT9NQRAURwYoSm/npXfiOQNkdlfkDzisayf1tE4dP69x28qujc8uu3xL/ktnUOloT2C0pK+rgyeupdrk73jkiEw769KL5Vyiqf7RC+4deREAwFtSk0huMm9H029xM+22aD3Sro202yPt8B1Ce3bjVKTGHTvUPCdH66M0z9pUVCgZcp6Revr8Z9qJaP9IOxI13sI8cw9NR5rSoebZEqmP1jwBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA3k0uuH5m/d49D7/8ldbf/u7h17/16sePrGq8ZaC7/qJ1ix/v2PG9vx2tWtV6ZOmCm+5PND2yvr+7Mwgqk/1iqe6xZ6ri8ZkDddseu6e3puFDC9cUpccNt2OyDg72hzsXVwRBc1blYDhsf3kQJHILyWawIb/wpeTOZ8ICAAAA7yVnJn/HM+1UHCzOaceSaTKW/BdKhcVrJ/W0Th0/r3Hbyq6Nzy67fMupj5cYYrzak46XaVee+IllBeMw/kbHO1EPD12RN87woiNG8/zpx/qnNdfdUHrl7gsWzphdv+XS4CfTD3csX3TfhBfHL9nXXpOX/yuHz//hmZP/AQAA+G/I/9FxhjdS/m+uqZh0cOp3ix67rur44fkP/Lyz7/kn4w8VD3Q//dLYcbf9cnVe/p+c85R5+T+ccZj/48Gp5X8AAAB4N/tf5//avHGGN1L+/8X+zZ//98pvTDk84187Xnj69xdvnVI+/7WyGTe8+cSCVxp2tf0pL//XFJb/x2RPO3xwVzjhZRVBUFP4SQUAAAByhP/vfuKjhTCvpz45iOb1y+4qe3LXG+tvjJ/V8o8zFvfPqv7i7tVf37ApNrChc92O5XNX5OX/2sLyf/E783IBAACAAvxm+y13V395ydYtew7N2XFnYvPYS+a+uuennVf1vXwsUfT8zX15+T9RWP4vGZ2XAwAAAJzEU+MmPnfo0UNfm7177YS9q9rmPD5t3+qFD/xz9t+veOnPxzddWJ6X/xsLy/9l6W165UOq087wrxA6KoKgdHCnJVXoC9o/mSkAAAAAb5Mwpzc1revduX7MrNfOPvzDNSuW/2rvpd++a2P1zQd+XXX7ucf2996Yl/9bhr//f3ing3D9f879//LW/2cVUnf9u8SNAQAAAHg/yl/PH94eP/XNBUN9/36h6/8/euaBko7m8ysnx7dVz3rig31Xra1+fVHHRZ/YfusbH46V//VTefm/rbD8X5S9fTu//w8AAABOwf/b9/8tzhtneCPd/79v3DPnrPnsPT+o/WbZU+e+eXfzd9oPTj9v87QzPlJ0fvecmX/4fl7+by8s/4fb07JfXk94fm6vCIKJgzvpuwluDae7LFLoKs4qpE58pEd92CNd6CrJKiS1RHp8rCIIpgzutEUKp4eF9khhoDxd2BQp7A0L6eshU3g0UugJr7R7y9PTjRZ+FhbSCyy6whUUp2WWRER6HBuqx2DhpD0OZJ4cAADgfSUMz+ksW5zbDKJRtis20gFlIx0QH+mAopEOGBM5IHrgUI8HjbmF8PEfz+1e+so1D9b1Xt1w9KzZe5bc0faBnkW9O7/wo55z/nL1Cws/nZf/NxWW/8NTMTa1GWr9fxCu/09/r2Fm/X9jWKiMFLrCQiJ6x4BE+BypsHtn+ByViXSPgYmZAgAAALynhZ8LFI3yPAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAID/sHfvcVJVd4LATzf9oJum6YgBY4ygRkR3aZomGEQcUXRXo4tNJKtjhtAIjXZoAwq4YsyKr3GV6GLUmBjZwY+jJg6r+CDqRIXoiElGJfE5Kz4HnciqS9BR45gs++m+dYqqW112IaC0+/3+0XWqfud569F17r11LgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/P/hsiX3Nrw58H9965ffW/f697409TdTD9m8y19uqHt3yDlPbT5ocN0tbw1asPCNtklnXtsyffk1G1YuDaGlq1xZUrzsoUHl5aP/cMxdt13xcNO0wVPOrcrUm4mHfp1/yjN3Loytvto/hLvLQqhIB0bUJYHKzP26WN8edSF8LmwJZEu01SYl0g2Hh2tCWBa2BLJV3VsTQl1OYMoTD666rDNxVU0I+4YQqtNtvFCdtFGTDgyrSgK16cCciiTw/uZENnBPeRKAbRbfDNkX/YqW/AwN3Zcr8vqr3G4d+3Slh9cnJhqK53vrsB3cqRxV6QdatulpK6iOHaLg7bHau60XvNsKtvMST1vuF6nMN5TNW0LVoXxm26zpCzrmx0fKQ2Njn2I17aDned2mc2ZsTbrXvA5jBxq2y+vwgdpJ9Ze9PfHglSf/8fR956+duq3dfC5nk+amd7TqkHnN9ZrnMZrg86QXvP0KviUN9aUrhHD8nz5f9sycl3bf+MGrJ068/YWLr5628JopE58d9Iux/3jtLndPu7xg/t/w0fP/+HKOt+V5uWOrH9Ync/P4SF1MbKxP5uYAAADQa/SGvaZfHf3qS6c+dPeiF5cfV/Hdcb86abf6irO/33H8rivHf/HSK9sf36Vg/j+0tOP/8ZB/Xe5oV4cwoStxwYAQdut6PAn8LHbn5AEh7NWVaskPHJYKrA7hC12J/bNVpUr0jSWGpgK/r88EJqQCa2KgJRW4MQaWpAIXxsCKVGBGDKxOBQ6PgdCeP44D6jPjKDlQEwOtyUZcEc9CeKc+tpbaVuuyVQEAAGwnmdlhZf7dnHMdtjVDnF6uqOkpQz
wDu2iG6lQN6RlsdlpVtIaKnmoo76mG7LgXffTwC2ou66nmgtMwyvIzfDjkO+UDJu79o7tuHHFT84sTv/vu2OO/8uc33129/z/993vOmX/dAQXz/6aPnv9Xd9ORsoLj/yFM7vobc5dnIh3ZeGtLXgYAAABgG1z12NInbzjgqP9z38v33fmla28oX3311//vKxsv2HvUccPL+v7dt1cUzP8nlHb+f9wn0icnc3g07oaYPSCEpvxAUu3BhYHkqHe/TAAAAAB6g+zx+Oyx8PbMbXKKdno+XZi/ZSvzxwP/E7rNf/mmv372y9c+eeLCYftsuOK/nflB2efH/m6XY9eOfPytPYf9Q0PfwvP/W0o7/782/zbpxJrYiysHhNA3J/BI7GVnoMvQGHj50PxAZvxr4gZYHKvKnJiQrWpxLNEaA02pwLJiJX6bLbFbfiDzZGUbvyA7jvZMiZwAAAAAfOLi7oB4XD6e/3/P5AO+tP+gl8a8uOe9C1+bsPSEU2t/uM8tu74+oGPSmAMnHHLEMwXz/9atO/+/ax5ccHp/R78QRlaE0Cf9w4BHa5OFAWOgriyTuL82qatPuqrzakMY3zmwdFWvZNb/r0ivMfhETVJVDOy29083DetM3FATwsjcwDPfvH5MZ2J+KpBt/Bs1IQzpHG268ZV9k8Yr041f0zeEPXMC2apO7htCZ2NV6aoerM5cxyBd1W3VIQzMCWSrOrA6hIUBgF4q/iudmfvgvIVnz57e0dF2xg5MxH34NWFWe0db44w5HTOri/RpZqrPecsYnVc4plKvfPN8ZomiqUNuH15KOvs7wabctjL78QtOHMzcj9+FKrvG2VyZd3d0esjD9ylsIuR8kyo25PIdPOTa3Eq2PIkF9cf8VaFf6LtgXtsZjWdNnz//jFHJ31KzNyd/42GmZFuNSm+r2u76VsLLo+hqWSkfd1vtl1vJyPmnzR05b+HZI9pPm35K2ylt32keO6q5ecxXx45pHtk5qqbkbw9D3a+7qlND3Xx9iePajkPdvSKnkk/iU0NCQqK3JaYvKTt/wrRf3/+tPdacdtZJe/z9HjNHnPRXl/9m7omNh0z+1fV/ubZg/j/3o+f/8VMnfvJn1mcodvy/IR7mTx7fcpi/NQaWlXr8v6HY0fzsiQFDU4FFMbDIYX4AAAA+G+LuyLg3M+6Vvq7un+4+cuaMQ97/5QlTrv7bseNOPWv9vg0XX33skv+w/p0lq454u2D+v6i03/9vp/X/s0vXf63YMv/7xxJNxdb/Ty/zn13/f1Gx9f/Ty/xn1/9f9ims/78gG0htknes/w8AAHwWfHLr//e4vH/6AgEFGXpc3j99gYCCDD0u41/qBQK2ev3/OR1/UTvo8jnjDh0x98ePrNp7ycDbvvT8xF/vs/SgEfeuvOW9UbcWzP+XlDb/t3A/AAAA7Dwe+mXfb1/87rD7n3rk/SPLLv3txpuO/6u2Aw75w8DmUyYfXfP9m/6tYP6/rLT5/ye//l8odv7/0GKBlmILA1r/DwAAgF6q2Pp/Nw98eejq+SNufOznb97yUusvZo5/7d8t+cFXpg9runnNut80zFhfMP9fUdr8P552UZ6XO/bmw/pkTbuQXtNuY332JwMAAADQO5SHxsbKEvPmrYx62Mdvc11mKdCPSud6+r5BqxaUP3RVWfXGH1wy7ZDGc489c86RF63/fu2TP6md2lh9RsH8f3Vp8/+832U8UDup/rK3Jx784cqT/3j6vvPXTt1y/B8AAADYcUrdLwEAAAAAAAAAAAAAAHz6nmpdetAHo45+Y+Zeo/70jWNf+MHiL37zkb+59s9n/vzw+/Zq3zxsSsHv/8PkrnLFfv8fr/sXf1+wa17u2GrP6/9l7k855taFXUsWPlofwj65gdnnz/5cyFybf7/cwKqp+w/uTJyfLnHfi4e/1pmYlg4cNWKX9zoT41OB1rhI4hfSgXhVxff6pwJxecUn04G4PVakA1WZwCX9k3GUpbfVhrpkW5Wlt9
VzdSEMyAlkt9XddUkbZekBXpUKZAd4ejoQBzgpEyhP9+rWfkmvYqAuFv2bfkmvAADYacVvgZVhVntHW1P8Ch9vd6/Iv43yliw7r7DashKbfz6zNNnUIbcPLyXdJ/1ddMu1xitDdecQRhV8Xc3NUtY1yu1TSw+bbtciQ+5ptbfyIuXStnbTVRUfUU0yosYZczpmVvY48NE9Z2mu6DHLqILJTm6W8q5NWkItJfSlhBGVuG1K6HK8Xx4aG/ukco2LwYaQp6dXRKm/189d56/YqyA3z9/WXHtpn8F93v+38Rc99OCAyo5TJ7ddtPtj/zxw1Mwf//DB1mt+XzD/byht/l+dO673MhcDWBSvrHfwgBBaSxwRAAAAfPb9z3OX33HinDUbZq2uePZ3v5tdftyJlZvPueucsy967v7FR13y72/e1viKsqc2nfjGprP++o2ffOW6h8966fAZZ901ad0h69uqb/zuXyw/dUjB/H9oafP/uAcrcyg42duxOl7//4IBIXRdWr8hCfwsDvfkASHs1ZVqiSWSC+p/LZZoSgI/iztM9o8lWlvyq+obAytSgd/XZwKrU4E1MZDZS/HTkNmVc0V9CGO6UpPzS8yNJRpSgeNiYGgq0BgDTalA/xiYkAq82T8TaEkF/jEGQnv+trqzf2ZbAQAAbI3MPKsy/25Iz/NWVPSUoaynDLU9ZSjvKUN1TxmKjSLevyNmqEydvFKWk6kyXWtNqpaCDPFi+Fvdr4IM4bf5OdMFC5qO5x9kzzcoy88w7od3tB70tXk/3nTxjx4/8sALj1xy5duXHt1v8JXP/u/2c/v131RbMP9vKm3+X5t/m7S+Js7/t1z/Lwk8Ert3ZTx1fGgMvHxofiCzY2BNnOwuzlbVkimRmbQvjiUmxMDQVGBuDExIBVonZwLLBucHMjPtbOMXZBtvz5TICQAAAMAnLu4giLtp4vx/5bjwzh5Hvt+8+5UD5457/JHzjphes2t1zT+PX7t0/KXVD+3Xt2D+P6G0+X9sr19uYxfG3rzaP4S7y7b0JhsYUZcE4n6Muvjz+D3qQvhczg6ObIm22qREVarh8HBN8gv1qnRV99YkawzE+1OeeHDVZZ2Jq2pC2Ddn70u2jReqkzZq0oFhVUmgNh2YU5EE4p6fbOCe8iQA2yy7VzC+oDKnumQ1dF+uyOvvs3JN0PTwCvaBdpOvu99c7SjV6Qcy+1Sztu5pK6iOHaLg7bHau603vtsavNtyv0hlvqFs3hKqDuUz22ZNX9AxPz6S+0vWAjvoec79lWop6e3wOlz08Xvbs+p0B5pSHx9N3Zfr/nVYFqt7oHZS/WVvTzx45cl/PH3f+WunltyNIuIPhQ++de4Bz+Vs3h2tOmRec73u86TF50lv/Dcw1NMWQlh+wawnn/iX95+vWN/8Xw4cu/y2Nx9b/pODHpg14gsbLvnyxrfePapg/t9S2vy/InXb5YO4MecNCGF4zsZ9NG7+iQOSz8GcQPIpObAwkBxyX19f9JMTAAAAtrfs7o7s/oL2zG1yQnh6nlyYv2Ur88f9FRO6zV9qvweO+YfvHXrV69/4+vrdL3906VPr/tObrxwx7dAHNj29YuXrzcd+/umC+X/rR8//+6a66fi/4//sII7/d2tn3xXdN/3Aom3aFV1QHTuE4//d2tnfbY7/d8vxf8f/u+P4fw8c/+/Wzv60FXxLmutLVwihdcANt/+idvrwflec860Za3/+9DtN416oO/foO//H4YvDNeet+nPB/H9uafN/6/91v2hfdv2/1mLr/80ttv7fIuv/AQAAO1SRhebS87yC1fsKMqRX7yvI0OMCgT0uMWj9v61e/6/2pLNPeqX+rb2umXj7f75z+oXPn3Tis/v2ef6E20+4aeTVw1/68oaC+f+i0ub/8eXQL7f13rL+39DJRapaEgNzLQwIAADAzqjYDgIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA+XSseXPzFzYv3OeimZz9/0+H/umzNrL1/dcDm0WNObhy+eGDZlX/3L28NWrDwjbZJZ17bMn35NRtWLg
2hvatcWVK87KFB5eWj/3DMXbdd8XDTtMFTzq3O1FuZuf1iXu7Y6of1ISzLeaQuJjbWd97ZEphyzK0LKzoTj9aHsE9uYPb5sz/XmbixPoT9cgOrpu4/uDNxfrrEfS8e/lpnYlo6cNSIXd7rTIzPBMrS3b2uf9LdsnR3L+sfwoCcQLa73+6fX1W2jf+YCZSn27i5LmkjBupi0R/VJW3EQEcs0d43hJEVIfRJV/Xr6qSqPumq/r46qapPuqr/Wh3C+BBCRbqqF6uSqirSI19blVQVA7vt/dNNwzoTy6pCGJkbeOab14/pTJyeCmQb/3pVCEM6XzLpxu+oTBqvTDd+VWUIe4YQqtIl/rUiKVGVLvFKRQgDcwLZxk+tCGFh4DMhfvjMzH1w3sKzZ0/v6Gg7YwcmqjJt1YRZ7R1tjTPmdMysTvWpmLKc9ObzPv7Yn990zozO26lDbh9eSroiU66yq8vNlXl3R+/svY/9qs2tZMvzUVB/zF8V+oW+C+a1ndF41vT5888YlfwtNXtz8rdPJppsq1G9ZVvtl1vJyPmnzR05b+HZI9pPm35K2ylt32keO6q5ecxXx45pHtk5qqbk7/YY6vWf/FB3r8ip5JP4AJCQkOhtifK8T7emnf2DvOCL/paOVobqrg/ogmlFbpayrlFuj0Ef9jFH/HG+p/Q4olEFE4eCLM09ZxldMJnYkqUmydL1va5gcphbU3nXJo33y0NjY59i26Eh/27u5n1rGzbvusymKzUNAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/D924EAAAAAAAMj/tRGqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqsAMHAgAAAABA/q+NUFVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVYUdOBYAAAAAEOZvHUbPBgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAwKUAAAD//5twzl8=")
r1 = openat(0xffffffffffffff9c, &(0x7f0000000040)='.\x00', 0x0, 0x0)
ioctl$BTRFS_IOC_SNAP_CREATE_V2(r1, 0x50009417, &(0x7f0000002480)={{r1}, 0x0, 0x0, @unused, @subvolid=0x3})
r2 = openat$smackfs_cipsonum(0xffffffffffffff9c, &(0x7f0000000080)='/sys/fs/smackfs/mapped\x00', 0x2, 0x0)
r3 = creat(&(0x7f0000000100)='./bus\x00', 0x0)
ioctl$FS_IOC_SETFLAGS(r3, 0x40086602, &(0x7f0000000040))
r4 = creat(&(0x7f0000000380)='./bus\x00', 0x0)
lseek(r4, 0x7ffffb, 0x0)
fchdir(r1)
r5 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
getdents64(r5, &(0x7f00000001c0)=""/202, 0xff4)
ioctl$BTRFS_IOC_START_SYNC(0xffffffffffffffff, 0x80089418, &(0x7f0000000180)=<r6=>0x0)
ioctl$BTRFS_IOC_DEV_INFO(0xffffffffffffffff, 0xd000941e, &(0x7f0000002580)={<r7=>0x0, "f3ec3f2426c98f5f73e7b27f60ff1d2e"})
ioctl$BTRFS_IOC_SUBVOL_CREATE_V2(r5, 0x50009418, &(0x7f0000006300)={{}, r6, 0x1c, @unused=[0x7, 0x0, 0x12a0, 0x1ff], @devid=r7})
ioctl$BTRFS_IOC_START_SYNC(r0, 0x80089418, &(0x7f0000000340)=<r8=>0x0)
ioctl$BTRFS_IOC_SET_RECEIVED_SUBVOL(r1, 0xc0c89425, &(0x7f0000000880)={"4466f95322d0f6a730af2168b487d6a0", r6, r8, {0xa000000000000000, 0xebe6}, {0x0, 0x7fffffff}, 0x8, [0x7e7e, 0x8, 0x9, 0x8, 0x6, 0x7, 0x8, 0xccba, 0x1, 0x100, 0x7f, 0x2, 0x80000000000, 0xa040000000000, 0x0, 0xc646]})
fallocate(r4, 0x100000003, 0x80bf13, 0x28120001)
fallocate(r3, 0x100000003, 0x780100, 0x2811fffe)
syz_mount_image$hfs(&(0x7f0000000000), &(0x7f0000000100)='./bus\x00', 0x12a0001, &(0x7f00000003c0)=ANY=[@ANYBLOB="676928fc", @ANYRESHEX=0x0, @ANYBLOB=',file_umask=00000000000000000000050,codepage=cp869,\x00'], 0x4, 0x32d, &(0x7f0000000540)="$eJzs3U1P1E4cB/DvtLtL9w/hXwFD4smgJJ4I4EHjRWKIF9+AB0NEWBJCxUQxURIjejbGm4mJR2+ejb4FvRjfgJ44GE96IR6smel0d1pnug8sWwjfT+JS2nn4TafTzkCwIKJj68ri1zfnd+U/UQXgA7gEeAACoALgJCaD+5tbbQvym1sCSU7xT5qVzYYtawCdQwvldxWMmPvoYMRxHH9rm+rnQGKh8ghzBBs8YEiPTnU8GHhkB2MnadfxYvSw2MMeHmC0zHCIiKh8+vnv6afEiJ6/ex4wrefhR/35n5nf7JUXx6HQfP57yfexkOfnf3VIrvfWt6LGarKEk73vpatEW1nWayJune4akivLHzamXEYtdioWr762HjVmdlQBT3FZM5JNqM9VpA1RXNHWki9TlrVpgaK2FxtWbajKNsw74h8vqtG6AP7wHS/t1S196iAm8VF8FksixCusNud/lVjIk6POT5gbKkn8s+4SVSvDJFWmla3wT6hKTqU98P5tq5V113kN4MtYbGQpIj9/D9M4X9TcuTCG7I8VktbNuVunco0DFaFWDWau+Wai39ZcE/m66mvVqDGzcidyXfT9ZV3RiefiupjCD7zDojH/92TqabhHZmaUC5VSXxmF7amolI5+zFAD+HZXI5OUq9aOLvYMt3ARo/cebm8sR1Hjbvkb6VDpMfvpPseTXIj6cpR75FcjDQK5UQXQt0r/xHFsPVTBILqgqpp64XWrydsby0Lf8/ZXhbxz5g4tuBMDWACg96R3hF5qf9zMNdQqsKPsv2Rvqz32CzKNagADJK0qc8jHUEcjpd5DpdcebSxHPd2J6IhpdTomb5QdDJVBzrtEsv4z1iuz6q4jP8KC9U/crnCjxDnHCmhMff7X2QquWaxznjicbrRZc505B5zN1eghrfFJvthQx4nD+FvJ7n+VIRbxBTf5838iIiIiIiIiIiIiIiIiIiIiIiIioqOm279G6OXPCbI17h7D/3iDiIiIiIiIiIiIiIiIiIiIiIiIiIiIiGh/jPf/Ar56Y0zN9v7fojc1KX7yhpigH+//9Tt4/6/Y6aKVRGTzNwAA///ltF7V")
syz_mount_image$vfat(&(0x7f0000000440), &(0x7f0000000080)='./file0\x00', 0x800090, &(0x7f0000008740)={[{@shortname_winnt}, {@uni_xlate}, {@shortname_win95}, {@numtail}, {@shortname_lower}, {@utf8no}, {@uni_xlate}, {@fat=@uid}, {@fat=@check_strict}, {@uni_xlate}, {@shortname_win95}, {@iocharset={'iocharset', 0x3d, 'cp855'}}, {@fat=@codepage={'codepage', 0x3d, '855'}}, {@utf8no}, {@utf8}, {@fat=@dmask={'dmask', 0x3d, 0x40}}, {@uni_xlateno}, {@uni_xlateno}, {@iocharset={'iocharset', 0x3d, 'iso8859-6'}}, {@shortname_lower}, {@nonumtail}, {@shortname_win95}, {@rodir}, {@utf8no}, {@nonumtail}, {@shortname_lower}, {@shortname_mixed}, {@utf8}], [{@appraise}, {@pcr={'pcr', 0x3d, 0x10}}]}, 0x6, 0x2d3, &(0x7f0000000a40)="$eJzs3b1rJGUYAPBnNrMfarFbWInggBZWx+Vamw1yB2Iqjy1OCw3eHUh2ES4Q8QPXq8TOxtK/QBD8Q2zsLAVbwc4IgZGZncl+ZNhsJBvx8vsVyZuZ55n3ed+ZJNPkyQcvTw4fZvH46Re/Ra+XRGvYjThJYhCtqH0VS4bfBgDwf3aS5/FnPtNw+tdv1uT2tlgXALA9F/z+r6TlxwdFxE/XVxsAsB33H7z79t7+/t13sqwX9yZfH4+SiCg+z87vPY6PYhyP4nb04zSifFFoR/m2UAzv5Xk+TbPCIF6bTI9HRebk/Z+r6+/9EVHm70Y/BuWhs7eNMv+t/bu72cxC/rSo4/lq/mGRfyf68eJZ8lL+nYb8GHXi9VcX6r8V/fjlw/g4xvGwLGKe/+Vulr2Zf/fX5+8V5RX5yfR41C3j5vKdevLpNd8jAAAAAAAAAAAAAAAAAAAAAACePbeq3jndKPv3FIeq/js7p8UX7chqg+X+PLP8pL7QvD9QtPI8n+bxfd1f53aWZXkVOM9P46W0aiwIAAAAAAAAAAAAAAAAAAAAN9zRp58dHozHj55cyaDuBpBGxN/3I/7tdYYLR16J9cHdas6D8bhVDZdj0sUjsVPHJBFryygWcUXbctHguXM1V4MffmzMKlZ0lEbTqd7Fk7ab57rk4JP2bB8bY+qn6/Agad7D7lnxveLGxeqN60Tz7O1YOdKp7+dqcP0obracTuOp/qW3pfNCOZiuiYlk3ffFG78vLSeJleBO2XGjMb1dDZp2Y/ZsbPQ8R2+Wfv5nRaJbBwAAAAAAAAAAAAAAAAAAbNX8r38bTj5dm9rKu1srCwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACu1fz//28ySJeTN8jqxJOj/2ptAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA3Bz/BAAA//8a6VGq")
mknod$loop(&(0x7f0000000000)='./file0\x00', 0x0, 0x1)
syz_mount_image$ntfs3(&(0x7f0000000080), &(0x7f0000000240)='./bus\x00', 0x1200010, &(0x7f0000000480)={[{@prealloc}, {@fmask={'fmask', 0x3d, 0x3ff}}, {@showmeta}, {@acl}, {@nohidden}, {@nohidden}, {@sparse}, {@discard}, {@nohidden}, {@acl}, {@iocharset={'iocharset', 0x3d, 'maccenteuro'}}, {@discard}, {@prealloc}, {@acl}]}, 0x21, 0x1f1c1, &(0x7f000003e980)="$eJzs3QmYTeUfB/D37Pu+XLvBWEO2RJJ9zb5FKUvITrYoFRJpERGSLcmWJFSSRBIl2ZeEJEmSKCGJ/zN37kyzXP+adr3fz/OYc++Ze3/vued7z5jfOeee+brZuEYtajdPSEhIIDZDkp0n6Qwjw0iN2Pf42LzLsSkT+ze43cJKe8yPuibNM/OuvnvoovxrBmitlptvSmSzfcfXp8oc2Rxuzv71pRZdu/VP6NY/oXefAQkdEjr26TOgQ8eenRM6devfo2RCk56dO/TvnNCtd//O/dJ9u0vPPn37Dkno0LuTofbt17l//4QOvYck9Og8JGFAn4QB/YYkdLirQ7feCSVLlkwwVAK/UcuF//QSAAAAAAAAAAAAAAAAAADAn+PyZXI5yT+9HAAAAAAAAAAAAAAAAAAAAPD71a7XoFYpoqTeZwhD6hKGzGcIIfYvj0v53D93hTpJD20fvZUt+rVuyq0zPUr3vLCXudJUjFeMyTyrFCGka2p9ltSP3mKIEJ0n/Oo4ZEmsaGyaMi7PJZKGpDZpEbs/LLbsDKmabkFuik2rpsw4xcWdOlWS19SSdHUyr7WqaVccIUQm6acOw0anf+25GVdKE+iA/OmG/OmG/OmG/OmG/OmG/K82f25fyV2x/++aof/nYt0we4Xl+jv6/76p9VnSLKv9f9X0KyhlXDm1/29IupF+pF9s/pX2A3AZ13PV+NN8/OU06/nf6kppAh2QP92QP92QP92QP92QP92QP93YTP0/+3/6f/Zq7v9Tz2BInqbt/xuQPuQuUpt0Iz1J59j8K/X/VWLT1P4/Q92Uab6qXPRJ6P/hn3HoN7zzkD/dkD/dkD/dkD/dkD/dkD/dMvf/XKz/P5Wh/xdj+wD4K1SyY/1ySv9f6nf2/+nP82dIkyz3+eml1Je4RNKK9CE9yUDSi3SO1h2WOg5LOqWOyA9Leh0pnwfwo98tF9tSfDKTcQmTPIrgxp4fnZf8ACGBEJLAknSPyfg9EttXUip1fJ64sVv9yRByL+lBOpCe0b0RKecj9CWEFEl9vED01DUdyy32yoelzs+WerZCtivuh7hSmkAH5E835E835E835E+r5N+nkT/dkD/d+Ez9Px/r/2dpmT//z6fuMWqRqdKf2f//oeP8KX/9PzZNe5y/GhlABpB+pCbpTLrE5qffD8D95v0AY0jm/QDReVncDxDt11WSOmrScifdSiTNSQtSjTQiNUk10ozUJO1IPdKI1CaNSTPSkFQjLUg90pg0ykrcmVx5/1+a6KPnOpSK3a4bmyZGl6AFaUbqkeqkJWlBapF2pAGpF13uv15CmtvD0ty+HJNIapN6pEF0qRqRaqQhqfU3LNUvSqW5XZ0QUjPldmyVJ5LGpDqpT2qRGqRFNNuaf+vyJZ+/wqa5HcOkLF/z6LK1jCbcgrQh7UhNUos0JzWic5qQFtF34l+lSZrb8fNtRRqTBqRlNNm/P+P2aW5XTbslpa6/9Mv352+7/1/fDMunx26nTBOjP1dYUu0vXIb/Z9gV5v+Sb73oz75apDVpR5qRxqTx3/JzJcW4NLer/uryVSMNSAPSmNT4W7JNMivN7fjbR/Xodpv0bmtyxSp/3fGfJb+6fM1ILdIk+n9b8+gW0oQ0jq7TvyflNVdYvpSwE
0ktUu0f2G5TbM2wSBk/x5m8fH/U78//0BW/k/wDMDG6PdQhdUit6O8uLaPrrkHq/yXNo7871Ir+1P5LpDkSNOxK3/gX+3vOG8XxX7ohf7plPv4vRPt/jths5uP/QrRDLhW30q/1/4XPFiuTdpoyv1BCulYz+rxfPkfARLuiP3j8P1qf4UpG7zdJd17/L+//lCem/q4XazCL2+mnDlsjOk36/bB10g0+eb9BhdhTkn4vLEtKRV8HE1sxKadSFIn9S7uQ2TJcxWFWbBkZvmrqXtq0Uvrvy5cv501d8WmmKeM7THIvtCTlvAE++TwDOd1ycqnL8u/+nAL8NXD9F7ohf7ohf7ohf7ohf7ohf7plvv6fGDv+3z7O5//Ff/jz/2mv/5/lzwXEXkvV1NeZTOISSXXSjQwgvUgH0veKn/tPkfH6ihkv/+cwtaPTK/TTJ6+w+v4h2P9HN+RPN+RPN+RPN+RPN+RPN+RPt8zH/6VY/78xzt//k36l///zP//PkgZZ7fNTTgyPTVPqC9E+vw/pQwZE7/8bzvuPt5/hSuchZJw6sTp/7Lg9tn+6IX+6IX+6IX+6IX+6IX+6IX+6Ze7/5Vj//0ic/l/+11z/n/sd1/9PL+11AaqTDqQTqRG9NmB/ktxPp7+OHpt6a1i68/N/KXspdjPl/HxyKm/6aUzKaOzl5Cf8s+fdY/unG/KnG/KnG/KnG/KnG/KnG/KnW+b+X4n2/wqxufT9vxD9XlLnXS9upT/Q/ydkrMX86vn/f8rfBYiOK3GJpDnpTO4kA0k/0jlDf//L9pFyPD61v7eTr53Zl0k/TapWkzQnTrU60fvjUq6VE7tOwJrUz98nPS65mkvSXydgSZrrJ2WLnoGQ9PqTr/zTNHFZdP2kTFOubTOUEFKP1Mv0+FkHDlZnmV+mKbssOkUfz2V6/CmXO5/0mJQpyVA/bd5s7PVtJL9cX2B+mteX8ng7w+ubEvtHomdUJI9fJOVapFcYO+PjrrQOMj7uSq894+ug8/oH+PwX3ZA/3ZA/3ZA/3ZA/3ZA/3TJ//l+NHf+fH+f4v/qv+fw/9zv6/5QlT56m/fx/S9KX1CAdSP9Y/x/vvPyUo/i/XAuajTvNl/BIdJpShwjJ5w8kxK7Dl0jqkd6kC+kTe1bKzg+9W84e9zywf1/G1/3X9qXY/0c35E835E835E835E835E835E+7YSTW/w9Kvq/Frv8/js18/T/t/+wx+gP9v52x1t90/D86blL/X4sMJgNIZ9KbdIoezx7MpBzPZklr5tev61+XSf6XJHvyKyDtY38lSEv5msXlKxTbNSByiaQx6Ui6k3qp1yj48+pz0fpNycDodRE6EEJyxOp3HUb+lOWXuUTSjHQmfUkH0i+6hyXz+4be4+//NOz/pRvypxvypxvypxvypxvyp1vm4/96tHdnSKk41//X/4br/6W/Lh/7m6/LN5PJfF2+6Lzf8/f4U8dnUm/9N/tT7P+jG/KnG/KnG/KnG/KnG/KnG/KnW+bP/xv/p/830P//x2D7pxvypxvypxvypxvypxvypxvyp1vm/t/8P/2/if7/PwbbP92QP92QP92QP92QP92QP92QP90y9//W/+n/LfT//zHY/umG/OmG/P9zsvTfFPKnG/KnG/KnG/KnW+b+P6WPn0LS9/9M6jPi+7P6fzlrv8DAH4Ltn27In27In27In27I/z/o9G9PFfnTDfnTLXP/7/zD/f9/8zj7vxW2f7ohf7ohf7ohf7ohf7ohf7ohf7pl7v9d9P8UwfZPN+RPN+RPN+RPN+RPN+RPN+RPt8z9v4f+nyLY/umG/OmG/OmG/OmG/OmG/OmG/OmWuf/30f9TBNs/3ZA/3ZA/3ZA/3ZA/3ZA/3ZA/3TL3/wH6f4pg+6cb8qcb8qcb8qcb8qcb8qcb8qdb5v4/RP9PEWz/dEP+dEP+dEP+dEP+dMv42zfQBds/3TL3/5G/o/+/nOxB9P//MGz/dEP+dEP+dEP+dEP+dEP+d
EP+dMvc/zOEIXWJQYax6ft/PtbPs1f4+/xJD20fvZXc/9f9jcf/CxGSkLEWw2SuX4oQ0jW1PkMaRW8xRIvO0351nIz1UsYVuUTSlAwkfcgA0oEQMo4QMji6apLG4UhrkvzARNI49blO7AUzafZ71I19LyG6pggpElt5TPImJiTVSGCTd6QkZFhvbGzcU6njsmRr6rhN0z027bhrYv9I9LoNpWLrnY/lmFw33QuPrYvLqdhM+1t8UjU6bR2rk1Tg1+skS80v42uO83qxn+ffAD//6Yb86Yb86Yb86Yb86Yb86Xbl/r8rk77/5wgh2X/l+P/V2v83Jh1Jd1KPdIr14U3IL314qTj9v0fS9+EJJPOLuDr6XGz/dEP+dEP+dEP+dEP+dEP+dEP+dLty/z8uTv+fIzaN5+/o//um1mdIsz+p/5e5RNKMdCZ9SQfSj/QnnWPfH5cyXmw/QJPU/QDNUmtk3A9QNfYvSa7YNIHo0alOyKzcsTWccb39c/sJrpQm0AH50w350w350w350w350w35042L9v/FMvT/VQlHpmQ4/18ghUjPOBV+6fuTpfT9o/sULZPUd19pmpCQtnoy9v8e909eugaxvl+ILdWvjZOxXtK4JaLP7EK6kZ6kc+y8+aR+3056NJ/c759K7ffrkbKk1D1Smhec0u8fiv0jqf0+Q9pHl5CQnLH7v7p8sbWQI3a6fdKY8ZYv9++tnxGftn4h0pmUzpRnUvRbU89pSP4chBy7lfTcG2Ovu3Xq9/novGyxvx9JRJ4kNmjdsl7N5Jpp5tWJM69h45rkm2qx8f/d50v81xT6pxcA/lHIn27In27In27In27In26FrtD/M2Qrk7H/Lxi3wu/u/+PUykr//2f318Niy85Ej+BnPi+matoXnKZeyjQfMyK69H933zw8tli/r2+OnynQAvnTDfnTDfnTDfnTDfnTDfn/0/7ZfrHgFfv/vhmO/4ukYMoV4dL5vf1/vGJ/Sf+fEZ+x/08+/j0s9ZML2VL7/2yx6+ttTX0yR1bE5uuEzFpCCGlEWkW/JmlLBpBepC9pS/qTIeReUoJ0I71IB3IX6Rz915uUJaVIRVKRXEfKkNKkIrmeVCRt0+yJSH+7CqlC2v7JVdO/3wr+yvtN+KPvt8vPpXu/wb8Lfv7TDfnTDfnTDfnTDfnTDfnT7cr9v5zp/P/fd/y/8Nli6aZ/1vH/jOf/X2mclNegxiokjdvmCv1/Vo7/p9RLmeZjHvq/x/+TxhwQ66/HpX6fj877o/tzop890EQysH/nfiUHdxgwoF9pEpvE+V4ZEptE80dfTjP8/Kcb8qcb8qcb8v+PU/7/t5E/3ZD/P+3fd/yfjfb/FeIc/4/nSv1/xn484zTr/X/y8eiM/f+vjRM7vZ8UKZA8zXz+f9kM45C4+xn++DjRhvuK+xlSblVN+8LT1EuZ5uNG/omfM0jO9K//nAH8O+HnP92QP92QP92QP92QP92Q/z/t39f/Jx//3/sHP///Z/b/fdMsXYus9uWx19ApViGlL1dS+/KS5E7Sh/SMPSKr5wGk1E2Z5mNGX0XXAcD2TzfkTzfkTzfkTzfkTzfkTzfk/0/7t/X/JNbjJvXA6a/QL8VdfhwPvirF9rTEzxRogfzphvzphvzphvzphvzphvzpJmWh/5fjVkD/fzWLnynQAvnTDfnTDfnTDfnTDfnTDfnTTc5C/x//UiLo/69mv3J5GPiPQ/50Q/50Q/50Q/50Q/50Q/50U7LQ/6txK6D/v5rFzxRogfzphvzphvzphvz/s9r9lmO7yJ9uyJ9uahb6fy1uBfT/V7P4mQItkD/dkD/dkD/dkD/dkD/dkD/dtCz0/3rcCuj/r2bxMwVaIH+6IX+6IX+6IX+6IX+6IX+66Vno/424FdD/X83iZwq0QP50Q/50Q/50Q/50Q/7/OVn6SD/yp5uRhf7fjFsB/f/VLH6m8J8zMf5s5E835E835E835E835E835E83Mwv9vxW3Avr/q
1n8TIEWyJ9uyJ9uyJ9uyJ9uyJ9uyJ9uVhb6fztuBfT/V7P4mQItkD/dkD/dkD/dkD/dkD/dkD/d7Cz0/07cCuj/r2bxMwVaIH+6IX+6IX/6pP1dD/nTDfnTDfnTzclC/+/GrYD+/2oWP1OgBfKnG/KnG/KnG/KnG/KnG/Knm5uF/t+LWwH9/9UsfqZAC+RPN+RPN+RPN+RPN+RPN+RPNy8L/b8ft0L8/l/4cxcT/iLxMwVaIH+6IX/KZPiPGfnTDfnTDfnTDfnTzc9C/x/ErYDj/1ez+JkCLZA/3ZA/3ZA/3ZA/3ZA/3ZA/3YIs9P9h3Aro/69m8TMFWiB/uiF/uiF/uiF/uiF/uiF/uoVZ6P8jcSug/7+axc8UaIH86Yb86Yb86Yb86Yb86Yb86RbJQv+fLW4F9P9Xs/iZAi2QP92QP92QP92QP92QP92QP92yZaH/zx63Avr/q1n8TIEWyJ9uyJ9uyJ9uyJ9uyJ9uyJ9u2bPQ/+eIWwH9/9UsfqZAC+RPN+RPN+RPN+RPN+RPN+RPtxxZ6P9zxq2A/v9qFj9ToAXypxvypxvypxvypxvypxvyp1vOLPT/ueJWQP9/NYufKdAC+dMN+dMN+dMN+dMN+dMN+dMtVxb6/9xxK6D/v5rFzxRogfzphvzphvzphvzphvzphvzpljsL/X+euBXQ/1/N4mcKtED+dEP+dEP+dEP+dEP+dEP+dMuThf4/b9wK6P+vZvEzBVogf7ohf7ohf7ohf7ohf7ohf7rlzUL/nxC3Avr/q1n8TIEWyJ9uyJ9uyJ9uyJ9uyJ9uyJ9uCVno//PFrYD+/2oWP1P4z6uaPEH+dEP+dEP+dEP+dEP+dEP+dMuXhf4/f9wK6P+vZvEzBVogf7ohf7oh//+q4b/pUcifbsifbsifbvmz0P8XiFsB/f/VLH6mQAvkTzfkTzfkTzfkTzfkTzfkT7cCWej/E+NWQP9/NYufKdAC+dMN+dMN+dMN+dMN+dMN+dMtkTCEH+bbhCTEen2fEMISYsvRe9nIZSYvYdjoHSGBEJKQdFvLlnTfzjSf+MmPZ5Lnu0lfNT/5sWnnEZ+8zXipj9NSH8ccSjePZCMLiJtu/Hy/jD8r03wAAAAAAAAAuBKGzzgjTa+evm8HAAAAAAAAgKtRvUY1WxfJcP5/kiKEkBUiIces5PsG2cvEez4f+9qVNIveElK+nulRuueFvcwVp2qsQGwqxu7KXCKpRgaQAaQfqUk6ky7RuQxJPh9Bzvo4GaQdpzrpQDqRGqQnGUj6R+dKse+LpD1pkrVxYudPVM0wjhQdpxsZQHqRDqQvkWLjtCcNslY/5U91JKSvL0Tr9yF9yADSjfQkyedNaL9x+Q3yf9aTnbL8tchgMoB0Jr1JJ5J8mgX7O3Lg071j0ubQgPQhd5Ha0eXvHMubxKbtSf2sjbMk9tzYNGUcnkskDUlt0iK1cvLXLL+OlICrZn4/JddvSLqRfqRf8nk0sS1Lyfr7KXMeCSl5NCedyZ1kIOlHOsc2H/V31GejWzWJJZr2/dqS9CU1SAfSn3QmXHQu92csf2r9VqRPdJvrRTqn1mtNav7R+tH3K8OVjOaakJpvi3R1C58tVibp+VeapmzHnWJ1k+qUiCbYJfb+LEnuTFr+5A2Z5Ex9n6bfnkf3KRqtd6VpxuVPSCB2iegzU8YpFZ2f/VfqZ1z+1Pqx16GmeR1t0tUvHc0/7xXq/+p6Sl48UqRA+vX0S/0y0fl/Xf2y6dafnfpOht/O+KcXAP5RyJ9uyJ9uyJ9uBjl3OY5fHjAsZVbs111yMX2BYX/r4gIAAAAAAADA73LK5c6TNEejhkaP1vDD6hJCWsfmJV8ToFzs+IpPxhA3w+cCsiXPy3A9gF+7n6Rp4rJorWFZGH8mk3n86LzfMf6sAwers8wvh5w7xcYvleZYfvL4uZIfwqW7xkH0628ZBwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA4GpyyuXOE+aX+0MJIQzhh9UlhLSOzfMJISwpR9jYvTHEJUzycwQ3OsmWPC/5AUICISSBJb96P0nTxGXRWsOyMP5MJvP40Xm/Y/xZBw5WZ5nooFGdYuOXIoRUTTd+ruSHcD65zORNGT/69beMAwAAAAAAAPDPYghLOMITgYhEIjJRiEo0ohODmMQiNnGISzzik4CEJEKykewkB8lJcpHcJA/JSxJIPpKfFCCJpCApRAqTIqQoKUauIcVJCVKSXEtKkdKkDClLypHrSHlyPalAKpIbSCVyI6lMbiJVSFVSjVQnNUhNUovUJnVIXVKP1Cc3kwakIWlEGpMmpClpRpqTFqQlaUVuIa1JG3IruY20JbeTO0g70v53Pf8+MpTcTx4gD5JhZDgZQR4iI8nDZBQZTR4hY8ij5DHyOHmCjCVPknFkPHmKTCATydNkEplMppBnyFTyLJlGppMZZCaZRZ4js8nzZA55gcwl88h8soAsJC+SReQlspi8TJaQV8hSsowsJ6+S18jrZAV5g6wkb5JV5C2ymrxN1pC15B2yjrxL1pP3yAaykbxPPiCbyIdkM/mIbCFbyTaynewgO8kuspvsIXvJx2Qf+YTsJwfIQfJpFp9/LsPzBzOEIQzLsAzP8IzIiIzMyIzKqIzO6IzJmIzN2IzLuIzP+EzIhEw2JhuTg8nB5GJyMXmYPEwCk8DkZ/IziUwiU4gpxBRhijDFmGJMcaY4U5IpyZRiSjNlmDJMOaYcU54pz1RgKjIVmUpMJaYyU5mpwlRhqjHVmBpMDaYWU4upw9Rh6jH1mfpMA6YB04hpxDRhmjDNmGZMC6YF04ppxbRmWjO3Mpdj7mDaM+2ZjkxHphPTienCdGG6Ml2Z7kx3pifTk+nN9Gb6Mn2Zfkw/ZgAzgBnEDGIGM0OYIcx9zH3M/cz9zINMDXY4M4IZwYxkRjKjmNHMaGYM8yjzGPMY8wQzlnmSGceMZ8YzE5iJzFlmEjOZmcJMYSqzzzLTmOlMAjuTmcXMYmYzs5k5zBxmLjOPmccsYBYyLzKLmEXMYuZl5mXmFWYps4xZxrzKvMq8zqxgVjArmTeZVcwqZjVzjlnDrGXeYdYx7zLrmXeZDcxGZgPzAbOJ+YDZzGxmtjBbmG3MNmYHs4PZxexi9jB7mI+Zj5lPmE+YscxB5iBziDnEHGYOM0eYI8xR5ihzjDnGHGeOMyeYE8xJ5iRzijnNfMecZs4wZ5izzDnmPHOeucBcYC4yF5lLzKWkjZ9NwrM8K7IiK7Myq7Iqq7M6a7Ima7M267Iu67M+G7Ihm43NxuZgc7C52FxsHjYPmxDdeZafTWQT2UJsIbYIW4QtxhZji7PF2ZJsSbYUW4otw5Zhy7HXseXZ69kKbEX2B
rYSW4mtzN7EVmGrstXYamwNtiZbi63N1mbrsnXZ+mx9tgHbgG3ENmL7FO9VvBk7nBnFtGSTkmnNPsncyo5j2rK3s3ew7dgJTAe2IzuR6cR2Zruwd7GTmUlMd7Zj8Z5sL7Y3O43py/YuPp3pzw5gZzKD2HvYwewQ9l72PnYo26n4A+yD7FxmODuCXcCMZB9mR7Gj2cVMTTYpsVrsE+xY9kl2HDuefZ2ZwE5kn2YnsZPZKewz7FT2WXYaO52dwc5kZ7HPsbPZ59k57AvsXHYeO59dwC5kX2QXsS+xi9mX2SXsK+xSdhm7nH2VfY19nV3BvsGuZN9kV7FvsavZt9k17Fr2HXYd+y67nn2P3cBuZN9nP2A3sR+ym9mP2C3sVnYbu53dwe5kd7G72T3sXvZjdh/7CbufPcAeZD9lD7GfsYfZz9kj7BfsUfZL9hj7FXuc/Zo9wX7DnmS/ZU+xp9nv2O/ZM+wP7Fn2HHue/ZG9wP7EXmR/Zi+xl1nCMRzLcRzPCZzISZzMKZzKaZzOGZzJWZzNOZzLeZzPBVzIRbhsXHYuB5eTy8Xl5vJwebkELh+XnyvAJXIFuUJcYa4IV5Qrxl3DFedKcCW5a7lSXGmuDFeWK8ddx5XnrucqcBW5G7hK3I1cZe4mrgpXlavGVedqcDW5Wlxtrg5Xl6vH1edu5hpwDblGXGOuCdeUa8Y151pwLblW3C1ca64Ndyt3G9eWu527g2vHtec6cB25O7lOXGeuC3cX15XrxnXnenA9uV5cb64P15fry/Xj+nEDuIHcIG4QN5gbwt3L/cxd4i5zD3APcsO44dwI7iFuJPcwN4obzT3CjeEe5R7jHuee4MZyT3LjuPHcU9wEbiL3NDeJm8xN4Z7hpnLPctO46dwMbiY3i3uOm809z83hXuDmcvO4+dwCbiH3IjcqVmnJb3j+O3Ge/3h09C3cVm4bt53bwe3kdnG7uS3cXm4vt4/bx+3n9nMHuYPcIe4Qd5g7zB3hjnBHuaPcMe4Yd5w7zp3gTnAnuZPcKe409yP3PXeG+4E7y53jznE/che4C9zF2DogPMOzPMfzvMCLvMTLvMKrvMbrvMGbvMXbvMO7vMf7fMCHfITPxmfnc/A5+Vx8bj4Pn5dP4PPx+fkCfCJfkC/EF+aL8EX5Yvw1fHG+BF+Sv/YPP//Xlq89357vyHfkO/Gd+C58F74r35Xvznfne/I9+d58b74v35fvx/fjB/AD+EH8IH4wP5i/l7+XH8oP5R/gH+CH8cP4EfxD/Ej+YX4UP5p/hB/DP8o/yj/OP86P5cfy4/hx/FP8U/xEfiI/iZ/ET+Gn8FP5qfw0fho/g5/Bz+Jn8bP52fwcfg4/l5/Lz+fn8wv5hfwifhG/mF/ML+GX8Ev5pfxyfjn/Gv8av4Jfwa/kV/Kr+FX8an41v4Zfy6/l1/Hr+PX8en4Dv4F/n3+f38Rv4jfzm/k1/FZ+K7+d387v5Hfyu/nd/F5+L7+P38fv5/fzB/mD/CH+EH+YP8wf4Y/wR/mj/DH+GH+cP86f4E/wJ/mT/Cn+FP8d/x1/hj/Dn+XP8uf58/wF/gJ/kb/IX+IvJf3aJ7ACK/ACL4iCKMiCLKiCKuiCLpiCKdiCLbiCK/iCL4RCKGQTsgk5hBxCLiGXkEfIIyQICUJ+Ib+QKBQUCgmFhSJCUaGYcI1QXCghlBSuFUoJpYUyQlmhnHCdUF64XqggVBRuECoJNwqVhZuEKkJVoZpQXagh1BRqCbWFOkJdoZ5QX7hZaCA0FBoJjYUmQlOhmdBcaCG0FFoJtwithTbCrcJtQlvhduEOoZ3Q/k+tP1p4RBgjPCo8JjwuPCGMFZ4UxgnjhaeECcJE4WlhkjBZmCI8I0wVnhWmCdOFGcJMYZbwnDBbeF6YI7wgzBXmCfOFBcJC4UVhkfCSsFh4WVgivCIsFZYJy4VXhdeE14UVwhvCSuFNYZXwlrBaeFtYI6wV3hHWCe8K64X3hA3CRuF94QNhk/ChsFn4S
NgibBW2CduFHcJOYZewW9gj7BU+FvYJnwj7hQPCQeFT4ZDwmXBY+Fw4InwhHBW+FI4JXwnHha+FE8I3wknhW+GUcFr4TvheOCP8IJwVzgnnhR+FC8JPwkXhZ+GScFkgIiOyIifyoiCKoiTKoiKqoibqoiGaoiXaoiO6oif6YiCGYkTMJmYXc4g5xVxibjGPmFdMEPOJ+cUCYqJYUCwkFhaLiEXFYuI1YnGxhFhSvFYsJZYWy4hlxXLidWJ58XqxglhRvEGsJN4oVhZvEquIVcVqYnWxhlhTrCXWFuuIdcV6Yn3xZrGB2FBsJDYWm4hNxWZic7GF2FJsJd4ithbbiLeKt4ltxdvFO8R2Ynuxg9hRvFPsJHYWu4h3iV3FbmJ3sYfYU+wl9hb7iH3Fu8V+Yn9xgDhQHCTeIw4Wh4j3iveJQ8X7xQfEB8Vh4nBxhPiQOFJ8WBwljhYfEceIj4qPiY+LT4hjxSfFceJ48SlxgjhRfFqcJE4Wp4jPiFPFZ8Vp4nRxhjhTnCU+J84WnxfniC+Ic8V54nxxgbhQfFFcJL4kLhZfFpeIr4hLxWXicvFV8TXxdXGF+Ia4UnxTXCW+Ja4W3xbXiGvFd8R14rvievE9cYO4UXxf/EDcJH4obhY/EreIW8Vt4nZxh7hT3CXuFveIe8WPxX3iJ+J+8YB4UPxUPCR+Jh4WPxePiF+IR8UvxWPiV+Jx8WvxhPiNeFL8Vjwlnha/E78Xz4g/iGfFc+J58UfxgviTeFH8WbwkXhaJxEisxEm8JEiiJEmypEiqpEm6ZEimZEm25Eiu5Em+FEihFJGySdmlHFJOKZeUW8oj5ZUSpHxSfqmAlCgVlApJhaUiUlGpmHSNVFwqIZWUrpVKSaWlMlJZqZx0nVReul6qIFWUbpAqSTdKlaWbpCpSVamaVF2qIdWUakm1pTpSXameVF+6WWogNZQaSY2lJlJTqZnUXGohtZRaSbdIraU20q3SbVJb6XbpDqmd1F7qIHWU7pQ6SZ2lLtJdUlepm9Rd6iH1lHpJvaU+Ul/pbqmf1F8aIA2UBkn3SIOlIdK90n3SUOl+6QHpQWmYNFwaIT0kjZQelkZJo6VHpDHSo9Jj0uPSE9JY6UlpnDReekqaIE2UnpYmSZOlKdIz0lTpWWmaNF2aIRFplvScNFt6XpojvSDNleZJ86UF0kLpRWmR9JK0WHpZWiK9Ii2VlknLpVel16TXpRXSG9JK6U1plfSWtFp6W1ojrZXekdZJ70rrpfekDdJG6X3pA2mT9KG0WfpI2iJtlbZJ26Ud0k5pl7Rb2iPtlT6W9kmfSPulA9JB6VPpkPSZdFj6XDoifSEdlb6UjklfScelr6UT0jfSSelb6ZR0WvpO+l46I/0gnZXOSeelH6UL0k/SReln6ZJ0WSIyI7MyJ/OyIIuyJMuyIquyJuuyIZuyJduyI7uyJ/tyIIdyRM4mZ5dzyDnlXHJuOY+cV06Q88n55QJyolxQLiQXlovIReVi8jVycbmEXFK+Vi4ll5bLyGXlcvJ1cnn5ermCXFG+Qa4k3yhXlm+Sq8hV5WpydbmGXFOuJdeW68h15XpyfflmuYHcUCaEDGsiN5Wbyc3lFnJLuZV8i9xabiPfKt8mt5Vvl++Q28nt5Q5yR/lOuZPcWe4i3yV3lbvJ3eUeck+5l9xb7iP3le+W+8n95QHyQHmQfI88WB4i3yvfJw+V75cfkB+Uh8nD5RHyQ/JI+WF5lDxafkQeIz8qPyY/Lj8hj5WflMfJ4+Wn5AnyRPlpeZI8WZ4iPyNPlZ+Vp8nT5RnyTHmW/Jw8W35eniO/IM+V58nz5QXyQvlFeZH8krxYflleIr8iL5WXycvlV+XX5NflFfIb8kr5TXmV/Ja8Wn5bXiOvld+R18nvyuvl9+QN8kb5ffkDeZP8obxZ/kjeIm+Vt8nb5R3yTnmXvFveI++VP5b3yZ/I++UD8kH5U/mQ/Jl8WP5cPiJ/IR+Vv5SPyV/Jx2WGT9638618S
j4tfyd/L5+Rf5DPyufk8/KP8gX5J/mi/LN8Sb4sE4VRWIVTeEVQREVSZEVRVEVTdMVQTMVSbMVRXMVTfCVQQiWiZFOyKzmUnEouJbeSR8mrJCj5lPxKASVRKagUUgorRZSiSjHlGqW4UkIpqVyrlFJKK2WUsko55TqlvHK9UkGpqNygVFJuVCorNylVlKpKNaW6UkOpqdRSait1lLpKPaW+crPSQGmoNFIaK02UpkozpbnSQmmptFJuUVorbZRblduUtsrtyh1KO6W90kHpqNypdFI6K12Uu5SuSjelu9JD6an0UnorfZS+yt1KP6W/MkAZqAxS7lEGK0OUe5X7lKHK/coDyoPKMGW4MkJ5SBmpPKyMUkYrjyhjlEeVx5THlSeUscqTyjhlvPKUMkGZqDytTFImK1OUZ5SpyrPKNGW6MkOZqcxSnlNmK88rc5QXlLnKPGW+skBZqLyoLFJeUhYrLytLlFeUpcoyZbnyqvKa8rqyQnlDWam8qaxS3lJWK28ra5S1yjvKOuVdZb3ynrJB2ai8r3ygbFI+VDYrHylblK3KNmW7skPZqexSdit7lL3Kx8o+5RNlv3JAOah8qhxSPlMOK58rR5QvlKPKl8ox5SvluPK1ckL5RjmpfKucUk4r3ynfK2eUH5SzyjnlvPKjckH5Sbmo/KxcUi4rRGVUVuVUXhVUUZVUWVVUVdVUXTVUU7VUW3VUV/VUXw3UUI2o2dTsag41p5pLza3mUfOqCWo+Nb9aQE1UC6qF1MJqEbWoWky9Ri2ullBLqteqpdTSahm1rFpOvU4tr16vVlArqjeoldQb1crqTWoVtapaTa2u1lBrqrXU2modta5aT62v3qw2UBuqjdTGahO1qdpMba62UFuqrdRb1NZqG/VW9Ta1rXq7eofaTm2vdlA7qneqndTOahf1LrWr2k3trvZQe6q91N5qH7WverfaT+2vDlAHqoPUe9TB6hD1XvU+dah6v/qA+qA6TB2ujlAfUkeqD6uj1NHqI+oY9VH1MfVx9Ql1rPqkOk4drz6lTlAnqk+rk9TJ6hT1GXWq+qw6TZ2uzlBnqrPU59TZ6vPqHPUFda46T52vLlAXqi+qi9SX1MXqy+oS9RV1qbpMXa6+qr6mvq6uUN9QV6pvqqvUt9TV6tvqGnWt+o66Tn1XXa++p25QN6rvqx+om9QP1c3qR+oWdau6Td2u7lB3qrvU3eoeda/6sbpP/UTdrx5QD6qfqofUz9TD6ufqEfUL9aj6pXpM/Uo9rn6tnlC/UU+q36qn1NPqd+r36hn1B/Wsek49r/6oXlB/Ui+qP6uX1Msq0RiN1TiN1wRN1CRN1hRN1TRN1wzN1CzN1hzN1TzN1wIt1CJaNi27lkPLqeXScmt5tLxagpZPy68V0BK1glohrbBWRCuqFdOu0YprJbSS2rVaKa20VkYrq5XTrtPKa9drFbSK2g1aJe1GrbJ2k1ZFq6pV06prNbSaWi2ttlZHq6vV0+prN2sNtIZaI62x1kRrqjXTmmsttJZaK+0WrbXWRrtVu01rq92u3aG109prHbSO2p1aJ62z1kW7S+uqddO6az20nlovrbfWR+ur3a310/prA7SB2iDtHm2wNkS7V7tPG6rdrz2gPagN04ZrI7SHtJHaw9oobbT2iDZGe1R7THtce0Ibqz2pjdPGa09pE7SJ2tPaJG2yNkV7RpuqPatN06ZrM7SZ2iztOW229rw2R3tBm6vN0+ZrC7SF2ovaIu0lbbH2srZEe0Vbqi3Tlmuvaq9pr2srtDe0ldqb2irtLW219ra2RlurvaOt097V1mvvaRu0jdr72gfaJu1DbbP2kbZF26pt07ZrO7Sd2i5tt7ZH26t9rO3TPtH2awe0g9qn2iHtM+2w9rl2RPtCO6p9qR3TvtKOa19rJ7RvtJPat9op7bT2nfa9dkb7QTurndPOaz9qF7SftIvaz9ol7bJGdEZndU7ndUEXdUmXdUVXdU3XdUM3dUu3dUd3dU/39
UAP9YieTc+u59Bz6rn03HoePa+eoOfT8+sF9ES9oF5IL6wX0YvqxfRr9OJ6Cb2kfq1eSi+tl9HL6uX06/Ty+vV6Bb2ifoNeSb9Rr6zfpFfRq+rV9Op6Db2mXkuvrdfR6+r19Pr6zXoDvaHeSG+sN9Gb6s305noLvaXeSr9Fb6230W/Vb9Pb6rfrd+jt9PZ6B72jfqfeSe+sd9Hv0rvq3fTueg+9p95L76330fvqd+v99P76AH2gPki/Rx+sD9Hv1e/Th+r36w/oD+rD9OH6CP0hfaT+sD5KH60/oo/RH9Uf0x/Xn9DH6k/q4/Tx+lP6BH2i/rQ+SZ+sT9Gf0afqz+rT9On6DH2mPkt/Tp+tP6/P0V/Q5+rz9Pn6An2h/qK+SH9JX6y/rC/RX9GX6sv05fqr+mv66/oK/Q19pf6mvkp/S1+tv62v0dfq7+jr9Hf19fp7+gZ9o/6+/oG+Sf9Q36x/pG/Rt+rb9O36Dn2nvkvfre/R9+of6/v0T/T9+gH9oP6pfkj/TD+sf64f0b/Qj+pf6sf0r/Tj+tf6Cf0b/aT+rX5KP61/p3+vn9F/0M/q5/Tz+o/6Bf0n/aL+s35Jv6wTgzFYgzN4QzBEQzJkQzFUQzN0wzBMwzJswzFcwzN8IzBCI2JkM7IbOYycRi4jt5HHyGskGPmM/EYBI9EoaBQyChtFjKJGMeMao7hRwihpXGuUMkobZYyyRjnjOqO8cb1Rwaho3GBUMm40Khs3GVWMqkY1o7pRw6hp1DJqG3WMukY9o75xs9HAaGg0MhobTYymRjOjudHCaGm0Mm4xWhttjFuN24y2xu3GHUY7o73Rweho3Gl0MjobXYy7jK5GN6O70cPoafQyeht9jL7G3UY/o78xwBhoDDLuMQYbQ4x7jfuMocb9xgPGg8YwY7gxwnjIGGk8bIwyRhuPGGOMR43HjMeNJ4yxxpPGOGO88ZQxwZhoPG1MMiYbU4xnjKnGs8Y0Y7oxw5hpzDKeM2YbzxtzjBeMucY8Y76xwFhovGgsMl4yFhsvG0uMV4ylxjJjufGq8ZrxurHCeMNYabxprDLeMlYbbxtrjLXGO8Y6411jvfGescHYaLxvfGBsMj40NhsfGVuMrcY2Y7uxw9hp7DJ2G3uMvcbHxj7jE2O/ccA4aHxqHDI+Mw4bnxtHjC+Mo8aXxjHjK+O48bVxwvjGOGl8a5wyThvfGd8bZ4wfjLPGOeO88aNxwfjJuGj8bFwyLhvEZEzW5EzeFEzRlEzZVEzV1EzdNEzTtEzbdEzX9EzfDMzQjJjZzOxmDjOnmcvMbeYx85oJZj4zv1nATDQLmoXMwmYRs6hZzLzGLG6WMEua15qlzNJmGbOsWc68zixvXm9WMCuaN5iVzBvNyuZNZhWzqlnNrG7WMGuatczaZh2zrlnPrG/ebDYwG5qNzMZmE7Op2cxsbrYwW5qtzFvM1mYb81bzNrOtebt5h9nObG92MDuad5qdzM5mF/Mus6vZzexu9jB7mr3M3mYfs695t9nP7G8OMAeag8x7zMHmEPNe8z5zqHm/+YD5oDnMHG6OMB8yR5oPm6PM0eYj5hjzUfMx83HzCXOs+aQ5zhxvPmVOMCeaT5uTzMnmFPMZc6r5rDnNnG7OMGeas8znzNnm8+Yc8wVzrjnPnG8uMBeaL5qLzJfMxebL5hLzFXOpucxcbr5qvma+bq4w3zBXmm+aq8y3zNXm2+Yac635jrnOfNdcb75nbjA3mu+bH5ibzA/NzeZH5hZzq7nN3G7uMHeau8zd5h5zr/mxuc/8xNxvHjAPmp+ah8zPzMPm5+YR8wvzqPmlecz8yjxufm2eML8xT5rfmqfM0+Z35vfmGfMH86x5zjxv/mheMH8yL5o/m5fMyyaxGIu1OIu3BEu0JEu2FEu1NEu3DMu0LMu2HMu1PMu3Aiu0IlY2K7uVw8pp5bJyW3msvFaClc/KbxWwEq2CViGrsFXEKmoVs66xilslrJLWtVYpq7RVxiprlbOus
8pb11sVrIrWDVYl60arsnWTVcWqalWzqls1rJpWLau2Vceqa9Wz6ls3Ww2shlYjq7HVxGpqNbOaWy2sllYr6xartdXGutW6zWpr3W7dYbWz2lsdrI7WnVYnq7PVxbrL6mp1s7pbPayeVi+rt9XH6mvdbfWz+lsDrIHWIOsea7A1xLrXus8aat1vPWA9aA2zhlsjrIeskdbD1ihrtPWINcZ61HrMetx6whprPWmNs8ZbT1kTrInW09Yka7I1xXrGmmo9a02zplszrJnWLOs5a7b1vDXHesGaa82z5lsLrIXWi9Yi6yVrsfWytcR6xVpqLbOWW69ar1mvWyusN6yV1pvWKusta7X1trXGWmu9Y62z3rXWW+9ZG6yN1vvWB9Ym60Nrs/WRtcXaam2ztls7rJ3WLmu3tcfaa31s7bM+sfZbB6yD1qfWIesz67D1uXXE+sI6an1pHbO+so5bX1snrG+sk9a31inrtPWd9b11xvrBOmuds85bP1oXrJ+si9bP1iXrskVsxmZtzuZtwRZtyZZtxVZtzdZtwzZty7Ztx3Ztz/btwA7tiJ3Nzm7nsHPauezcdh47r51g57Pz2wXsRLugXcgubBexi9rF7Gvs4nYJu6R9rV3KLm2Xscva5ezr7PL29XYFu6J9g13JvtGubN9kV7Gr2tXs6nYNu6Zdy65t17Hr2vXs+vbNdgO7od3Ibmw3sZvazezmdgu7pd3KvsVubbexb7Vvs9vat9t32O3s9nYHu6N9p93J7mx3se+yu9rd7O52D7un3cvubfex+9p32/3s/vYAe6A9yL7HHmwPse+177OH2vfbD9gP2sPs4fYI+yF7pP2wPcoebT9ij7EftR+zH7efsMfaT9rj7PH2U/YEe6L9tD3JnmxPsZ+xp9rP2tPs6fYMe6Y9y37Onm0/b8+xX7Dn2vPs+fYCe6H9or3IfslebL9sL7FfsZfay+zl9qv2a/br9gr7DXul/aa9yn7LXm2/ba+x19rv2Ovsd+319nv2Bnuj/b79gb3J/tDebH9kb7G32tvs7fYOe6e9y95t77H32h/b++xP7P32Afug/al9yP7MPmx/bh+xv7CP2l/ax+yv7OP21/YJ+xv7pP2tfco+bX9nf2+fsX+wz9rn7PP2j/YF+yf7ov2zfcm+bBOHcViHc3hHcERHcmRHcVRHc3THcEyHc2zHcVzHc3wncEIn4mRzsjs5nJxOLie3k8fJ6yQ4+Zz8TgEn0SnoFHIKO0Wcok4x5xqnuFPCKelc65RySjtlnLJOOec6p7xzvVPBqejc4FRybnQqOzc5VZyqTjWnulPDqenUcmo7dZy6Tj2nvnOz08Bp6DRyGjtNnKaXGae508Jp6bRybnFaO22cW53bnLbO7c4dTjunvdPB6ejc6XRyOjtdnLucrk43p7vTw+np9HJ6O32cvs7dTj+nvzPAGegMcu5xBjtDnHud+5yhzv3OA86DzjBnuDPCecgZ6TzsjHJGO484Y5xHncecx50nnLHOk844Z7zzlDPBmeg87UxyJjtTnGecqc6zzjRnujPDmenMcp5zZjvPO3OcF5y5zjxnvrPAWei86CxyXnIWOy87S5xXnKXOMme586rzmvO6s8J5w1npvOmsct5yVjtvO2uctc47zjrnXWe9856zwdnovO984GxyPnQ2Ox85W5ytzjZnu7PD2enscnY7e5y9zsfOPucTZ79zwDnofOoccj5zDjufO0ecL5yjzpfOMecr57jztXPC+cY56XzrnHJOO9853ztnnB+cs84557zzo3PB+cm56PzsXHIuO8RlXNblXN4VXNGVXNlVXNXVXN01XNO1XNt1XNf1XN8N3NCNuNnc7G4ON6eby83t5nHzugluPje/W8BNdAu6hdzCbhG3qFvMvcYt7pZwS7rXuqXc0m4Zt6xbzr3OLe9e71ZwK7o3uJXcG93K7k1uFbeqW82t7tZwa7q13NpuHbeuW8+t797sNnAbuo3cxm4Tt6nbzG3utnBbu
q3cW9zWbhv3Vvc2t617u3uH285t73ZwO7p3up3czm4X9y63q9vN7e72cHu6vdzebh+3r3u328/t7w5wB7qD3Hvcwe4Q9173Pneoe7/7gPugO8wd7o5wH3JHug+7o9zR7iPuGPdR9zH3cfcJd6z7pDvOHe8+5U5wJ7pPu5Pcye4U9xl3qvusO82d7s5wZ7qz3Ofc2e7z7hz3BXeuO8+d7y5wF7ovuovcl9zF7svuEvcVd6m7zF3uvuq+5r7urnDfcFe6b7qr3Lfc1e7b7hp3rfuOu859113vvuducDe677sfuJvcD93N7kfuFneru83d7u5wd7q73N3uHnev+7G7z/3E3e8ecA+6n7qH3M/cw+7n7hH3C/eo+6V7zP3KPe5+7Z5wv3FPut+6p9zT7nfu9+4Z9wf3rHvOPe/+6F5wf3Ivuj+7l9zLLvEYj/U4j/cET/QkT/YUT/U0T/cMz/Qsz/Ycz/U8z/cCL/QiXjYvu5fDy+nl8nJ7eby8XoKXz8vvFfASvYJeIa+wV8Qr6hXzrvGKeyW8kt61XimvtFfGK+uV867zynvXexW8it4NXiXvRq+yd5NXxavqVfOqezW8ml4tr7ZXx6vr1fPqezd7DbyGXiOvsdfEa+o185p7LbyWXivvFq+118a71bvNa+vd7t3htfPaex28jt6dXievs9fFu8vr6nXzuns9vJ5eL6+318fr693t9fP6ewO8gd4g7x5vsDfEu9e7zxvq3e894D3oDfOGeyO8h7yR3sPeKG+094g3xnvUe8x73HvCG+s96Y3zxntPeRO8id7T3iRvsjfFe8ab6j3rTfOmezO8md4s7zlvtve8N8d7wZvrzfPmewu8hd6L3iLvJW+x97K3xHvFW+ot85Z7r3qvea97K7w3vJXem94q7y1vtfe2t8Zb673jrfPe9dZ773kbvI3e+94H3ibvQ2+z95G3xdvqbfO2ezu8nd4ub7e3x9vrfezt8z7x9nsHvIPep94h7zPvsPe5d8T7wjvqfekd877yjntfeye8b7yT3rfeKe+09533vXfG+8E7653zzns/ehe8n7yL3s/eJe+yR3zGZ33O533BF33Jl33FV33N133DN33Lt33Hd33P9/3AD/2In83P7ufwc/q5/Nx+Hj+vn+Dn8/P7BfxEv6BfyC/sF/GL+sX8a/zifgm/pH+tX8ov7Zfxy/rl/Ov88v71fgW/on+DX8m/0a/s3+RX8av61fzqfg2/pl/Lr+3X8ev69fz6/s1+A7+h38hv7Dfxm/rN/OZ+C7+l38q/xW/tt/Fv9W/z2/q3+3f47fz2fge/o3+n38nv7Hfx7/K7+t387n4Pv6ffy+/t9/H7+nf7/fz+/gB/oD/Iv8cf7A/x7/Xv84f69/sP+A/6w/zh/gj/IX+k/7A/yh/tP+KP8R/1H/Mf95/wx/pP+uP88f5T/gR/ov+0P8mf7E/xn/Gn+s/60/zp/gx/pj/Lf86f7T/vz/Ff8Of68/z5/gJ/of+iv8h/yV/sv+wv8V/xl/rL/OX+q/5r/uv+Cv8Nf6X/pr/Kf8tf7b/tr/HX+u/46/x3/fX+e/4Gf6P/vv+Bv8n/0N/sf+Rv8bf62/zt/g5/p7/L3+3v8ff6H/v7/E/8/f4B/6D/qX/I/8w/7H/uH/G/8I/6X/rH/K/84/7X/gn/G/+k/61/yj/tf+d/75/xf/DP+uf88/6P/gX/J/+i/7N/yb/sk4AJ2IAL+EAIxEAK5EAJ1EAL9MAIzMAK7MAJ3MAL/CAIwiASZAuyBzmCnEGuIHeQJ8gbJAT5gvxBgSAxKBgUCgoHRYKiQbHgmqB4UCIoGVwblApKB2WCskG54LqgfHB9UCGoGNwQVApuDCoHNwVVgqpBtaB6UCOoGdQKagd1grpBvaB+cHPQIGgYNAoaB02CpkGzoHnQImgZtApuCVoHbYJbg9uCtsHtwR1Bu6B90CHoGNwZdAo6B12Cu4KuQbege9Aj6Bn0CnoHfYK+wd1Bv6B/MCAYG
AwK7gkGB0OCe4P7gqHB/cEDwYPBsGB4MCJ4KBgZPByMCkYHjwRjgkeDx4LHgyeCscGTwbhgfPBUMCGYGDwdTAomB1OCZ4KpwbPBtGB6MCOYGcwKngtmB88Hc4IXgrnBvGB+sCBYGLwYLApeChYHLwdLgleCpcGyYHnwavBa8HqwIngjWBm8GawK3gpWB28Ha4K1wTvBuuDdYH3wXrAh2Bi8H3wQbAo+DDYHHwVbgq3BtmB7sCPYGewKdgd7gr3Bx8G+4JNgf3AgOBh8GhwKPgsOB58HR4IvgqPBl8Gx4KvgePB1cCL4JjgZfBucCk4H3wXfB2eCH4KzwbngfPBjcCH4KbgY/BxcCi4HJGRCNuRCPhRCMZRCOVRCNdRCPTRCM7RCO3RCN/RCPwzCMIyE2cLsYY4wZ5grzB3mCfOGCWG+MH9YIEwMC4aFwsJhkbBoWCy8JiwelghLhteGpcLSYZmwbFguvC4sH14fVggrhjeElcIbw8rhTWGVsGpYLawe1ghrhrXC2mGdsG5YL6wf3hw2CBuGjcLGYZOwadgsbB62CFuGrcJbwtZhm/DW8LawbXh7eEfYLmwfdgg7hneGncLOYZfwrrBr2C3sHvYIe4a9wt5hn7BveHfYL+wfDggHhoPCe8LB4ZDw3vC+cGh4f/hA+GA4LBwejggfCkeGD4ejwtHhI+GY8NHwsfDx8IlwbPhkOC4cHz4VTggnhk+Hk8LJ4ZTwmXBq+Gw4LZwezghnhrPC58LZ4fPhnPCFcG44L5wfLggXhi+Gi8KXwsXhy+GS8JVwabgsXB6+Gr4Wvh6uCN8IV4ZvhqvCt8LV4dvhmnBt+E64Lnw3XB++F24IN4bvhx+Em8IPw83hR+GWcGu4Ldwe7gh3hrvC3eGecG/4cbgv/CTcHx4ID4afhofCz8LD4efhkfCL8Gj4ZXgs/Co8Hn4dngi/CU+G34anwtPhd+H34Znwh/BseC48H/4YXgh/Ci+GP4eXwsshiTARNsJF+IgQESNSRI4oETWiRfSIETEjVsSOOBE34kX8SBAJI5FItkj2SI5IzkiuSO5InkjeSEIkXyR/pEAkMVIwUihSOFIkUjRSLHJNpHikRKRk5NpIqUjpSJlI2Ui5yHWR8pHrIxUiFSM3RCpFboxUjtwUqRKpGqkWqR6pEakZqRWpHakTqRupF6kfuTnSINIw0ijSONIk0jTSLNI80iLSMtIqckukdaRN5Nb/0XZXMX7Ebr/Yw8w0zOyZ8fDML8zMzMzMzMzMzMxMG+Zks2Fmpg1zskmq9+05rU7/F5WqHt99ZNl6vpJ9aT9IfaQB0hBphDRGmiBNkWZIc6QF0hJphbRG2iBtkXZIe6QD0hHphHRGuiBdkW5Id6QH0hPphfRG+iB9kX5If2QAMhAZhAxGhiBDkWHIcGQEMhIZhYxGxiBjkXHIeGQCMhGZhExGpiBTkWnIdGQGMhOZhcxG5iBzkXnIfGQBshBZhCxGliBLkWXIcmQFshJZhaxG1iBrkXXIemQDshHZhGxGtiBbkW3IdmQHshPZhexG9iB7kX3IfiQOOYAcRA4hh5EjyFHkGHIcOYGcRE4hp5EzyFnkHBKPnEcSkAvIReQSchm5glxFriHXkRvITeQWchu5g9xF7iH3kQfIQ+QR8hh5gjxFniHPkRfIS+QV8hp5gyQib5F3yHvkA/IR+YR8Rr4gX5FvyHfkB/IT+YX8RpKQP8hf5B+SDE2OpkBToqnQ1GgaNC2aDk2PZkAzopnQzGgWNCuaDc2O5kBzornQ3GgeFEFRFENxlEBJlEJplEFZlEN5VEBFVEJlVEFVVEMBqqMGaqIQtVAbdVAX9VAfDdAQjdAYmhfNh+ZHC6AF0UJoYbQIWhQthhZHS6Al0VJoabQMWhYth5ZHK6AV0UpoZbQKWhWthlZHa6A10VpobbQOWheth9ZHG6AN0UZoY7QJ2hRthjZHW6At0VZoa7QN2hZth7ZHO6Ad0U5oZ7QL2hXthnZHe6A90
V5ob7QP2hfth/ZHB6AD0UHoYHQIOhQdhg5HR6Aj0VHoaHQMOhYdh45HJ6AT0UnoZHQKOhWdhk5HZ6Az0VnobHQOOhedh85HF6AL0UXoYnQJuhRdhi5HV6Ar0VXoanQNuhZdh65HN6Ab0U3oZnQLuhXdhm5Hd6A70V3obnQPuhfdh+5H49AD6EH0EHoYPYIeRY+hx9ET6En0FHoaPYOeRc+h8eh5NAG9gF5EL6GX0SvoVfQaeh29gd5Eb6G30TvoXfQeeh99gD5EH6GP0SfoU/QZ+hx9gb5EX6Gv0TdoIvoWfYe+Rz+gH9FP6Gf0C/oV/YZ+R3+gP9Ff6G80Cf2D/kX/ocmw5FgKLCWWCkuNpcHSYumw9FgGLCOWCcuMZcGyYtmw7FgOLCeWC8uN5cEQDMUwDMcIjMQojMYYjMU4jMcETMQkTMYUTMU0DGA6ZmAmBjELszEHczEP87EAC7EIi2F5sXxYfqwAVhArhBXGimBFsWJYcawEVhIrhZXGymBlsXJYeawCVhGrhFXGqmBVsWpYdawGVhOrhdXG6mB1sXpYfawB1hBrhDXGmmBNsWZYc6wF1hJrhbXG2mBtsXZYe6wD1hHrhHXGumBdsW5Yd6wH1hPrhfXG+mB9sX5Yf2wANhAbhA3GhmBDsWHYcGwENhIbhY3GxmBjsXHYeGwCNhGbhE3GpmBTsWnYdGwGNhObhc3G5mBzsXnYfGwBthBbhC3GlmBLsWXYcmwFthJbha3G1mBrsXXYemwDthHbhG3GtmBbsW3YdmwHthPbhe3G9mB7sX3YfiwOO4AdxA5hh7Ej2FHsGHYcO4GdxE5hp7Ez2FnsHBaPnccSsAvYRewSdhm7gl3FrmHXsRvYTewWdhu7g93F7mH3sQfYQ+wR9hh7gj3FnmHPsRfYS+wV9hp7gyVib7F32HvsA/YR+4R9xr5gX7Fv2HfsB/YT+4X9xpKwP9hf7B+WDE+Op8BT4qnw1HgaPC2eDk+PZ8Az4pnwzHgWPCueDc+O58Bz4rnw3HgeHMFRHMNxnMBJnMJpnMFZnMN5XMBFXMJlXMFVXMMBruMGbuIQt3Abd3AX93AfD/AQj/AYnhfPh+fHC+AF8UJ4YbwIXhQvhhfHS+Al8VJ4abwMXhYvh5fHK+AV8Up4ZbwKXhWvhlfHa+A18Vp4bbwOXhevh9fHG+AN8UZ4Y7wJ3hRvhjfHW+At8VZ4a7wN3hZvh7fHO+Ad8U54Z7wL3hXvhnfHe+A98V54b7wP3hfvh/fHB+AD8UH4YHwIPhQfhg/HR+Aj8VH4aHwMPhYfh4/HJ+AT8Un4ZHwKPhWfhk/HZ+Az8Vn4bHwOPhefh8/HF+AL8UX4YnwJvhRfhi/HV+Ar8VX4anwNvhZfh6/HN+Ab8U34ZnwLvhXfhm/Hd+A78V34bnwPvhffh+/H4/AD+EH8EH4YP4IfxY/hx/ET+En8FH4aP4Ofxc/h8fh5PAG/gF/EL+GX8Sv4Vfwafh2/gd/Eb+G38Tv4Xfwefh9/gD/EH+GP8Sf4U/wZ/hx/gb/EX+Gv8Td4Iv4Wf4e/xz/gH/FP+Gf8C/4V/4Z/x3/gP/Ff+G88Cf+D/8X/4cmI5EQKIiWRikhNpCHSEumI9EQGIiORichMZCGyEtmI7EQOIieRi8hN5CEQAiUwAicIgiQogiYYgiU4gicEQiQkQiYUQiU0AhA6YRAmAQmLsAmHcAmP8ImACImIiBF5iXxEfqIAUZAoRBQmihBFiWJEcaIEUZIoRZQmyhBliXJEeaICUZGoRFQmqhBViWpEdaIGUZOoRdQm6hB1iXpEfaIB0ZBoRDQmmhBNiWZEc6IF0ZJoRbQm2hBtiXZEe6ID0ZHoRHQmuhBdiW5Ed6IH0ZPoRfQm+hB9iX5Ef2IAMZAYRAwmhhBDiWHEcGIEMZIYRYwmxhBjiXHEeGICMZGYREwmphBTiWnEdGIGMZOYRcwm5hBziXnEfGIBsZBYRCwmlhBLiWXEcmIFsZJYRawm1hBriXXEe
mIDsZHYRGwmthBbiW3EdmIHsZPYRewm9hB7iX3EfiKOOEAcJA4Rh4kjxFHiGHGcOEGcJE4Rp4kzxFniHBFPnCcSiAvEReIScZm4QlwlrhHXiRvETeIWcZu4Q9wl7hH3iQfEQ+IR8Zh4QjwlnhHPiRfES+IV8Zp4QyQSb4l3xHviA/GR+ER8Jr4QX4lvxHfiB/GT+EX8JpKIP8Rf4h+RjExOpiBTkqnI1GQaMi2ZjkxPZiAzkpnIzGQWMiuZjcxO5iBzkrnI3GQeEiFREiNxkiBJkiJpkiFZkiN5UiBFUiJlUiFVUiMBqZMGaZKQtEibdEiX9EifDMiQjMgYmZfMR+YnC5AFyUJkYbIIWZQsRhYnS5AlyVJkabIMWZYsR5YnK5AVyUpkZbIKWZWsRlYna5A1yVpkbbIOWZesR9YnG5ANyUZkY7IJ2ZRsRjYnW5AtyVZka7IN2ZZsR7YnO5AdyU5kZ7IL2ZXsRnYne5A9yV5kb7IP2ZfsR/YnB5ADyUHkYHIIOZQcRg4nR5AjyVHkaHIMOZYcR44nJ5ATyUnkZHIKOZWcRk4nZ5AzyVnkbHIOOZecR84nF5ALyUXkYnIJuZRcRi4nV5AryVXkanINuZZcR64nN5AbyU3kZnILuZXcRm4nd5A7yV3kbnIPuZfcR+4n48gD5EHyEHmYPEIeJY+Rx8kT5EnyFHmaPEOeJc+R8eR5MoG8QF4kL5GXySvkVfIaeZ28Qd4kb5G3yTvkXfIeeZ98QD4kH5GPySfkU/IZ+Zx8Qb4kX5GvyTdkIvmWfEe+Jz+QH8lP5GfyC/mV/EZ+J3+QP8lf5G8yifxD/iX/kcmo5FQKKiWVikpNpaHSUumo9FQGKiOVicpMZaGyUtmo7FQOKieVi8pN5aEQCqUwCqcIiqQoiqYYiqU4iqcESqQkSqYUSqU0ClA6ZVAmBSmLsimHcimP8qmACqmIilF5qXxUfqoAVZAqRBWmilBFqWJUcaoEVZIqRZWmylBlqXJUeaoCVZGqRFWmqlBVqWpUdaoGVZOqRdWm6lB1qXpUfaoB1ZBqRDWmmlBNqWZUc6oF1ZJqRbWm2lBtqXZUe6oD1ZHqRHWmulBdqW5Ud6oH1ZPqRfWm+lADV/ej+lMDqObGIGowNYQaSg2jhlMjqJHUKGo0NYYaS42jxlMTqInUJGoyNYWaSk2jplMzqJnULGo2NYeaS82j5lMLqIXUImoxtYRaSi2jllMrqJXUKmo1tYZaS62j1lMbqI3UJmoztYXaSm2jtlM7qJ3ULmo3tYfaS+2j9lNx1AHqIHWIOkwdoY5Sx6jj1AnqJHWKOk2doc5S56h46jyVQF2gLlKXqMvUFeoqdY26Tt2gblK3qNvUHeoudY+6Tz2gHlKPqMfUE+op9Yx6Tr2gXlKvqNfUGyqReku9o95TH6iP1CfqM/WF+kp9o75TP6if1C/qN5VE/aH+Uv+oZHQyOgWdgk5Fp6LT0GnodHQ6OgOdgc5EZ6Kz0FnobHQ2Ogedg85F56Lz0HlolEZpnMZpkibp/xoszdI8zdMiLdIyLdMqrdKABrRBGzSkIW3TNu3SLu3TPh3SIR2jY3Q+Oh9dgC5AF6IL0UXoInQxuhhdgi5Bl6JL0WXoMnQ5uhxdga5AV6Ir0VXoKnQ1uhpdg65B16Jr0XXoOnQ9uh7dgG5AN6Ib0U3oJnQzuhndgm5Bt6Jb0W3oNnQ7uh3dge5Ad6I70V3oLnQ3uhvdg+5B96J70X3oPnQ/uh89gB5AD6IH0UPoIfQwehg9gh5Bj6JH0WPoMfQ4ehw9gZ5AT6In0VPoKfQ0eho9g55Bz6Jn03PoufQ8ej69gF5IL6IX00voJfQyehm9gl5Br6JX0WvoNfQ6eh29gd5Ab6I30VvoLfQ2ehu9g95B76J30XvoPfQ+eh8dR8fRB+mD9GH6MH2UPkofp4/TJ+mT9Gn6NH2WPkvH0/F0Ap1AX6Qv0pfpy/RV+ip9nb5O36Rv0rfp2/Rd+i59n75PP6Qf0
o/px/RT+in9nH5Ov6Rf0q/p13QinUi/o9/RH+gP9Cf6E/2F/kJ/o7/RP+gf9C/6F51EJ9F/6b90OiY9k4HJyGRiMjNZmKzM/9MogzE4QzAkQzF5GOR/Mc0wjMpoDGB0xmBMBjLWfzjG5GXyMfmZAkxBphBT+D9chinLlGPKMxWYikwppvT/4kpMZaYKU5epxtRnajANmVpMY6YOU5epx9RnGjANmUZMY6YN05Zpx7RnOjAdmU5M5//wPmY/c5o5w5xlzjG3mTvMD+Yn85p5w/xifjODmMHMGGYsMy7ZeGYCM5GZxEz+P838357DzGXmMfOZBcxCZhGz+D+8hlnLrGPWMxuYjcwmZvN/eA+zl9nGxDE7mJ3MLmb3f/u/aopjDjAHmUPMYeYIc5SJZ44zJ5iTzKn/q9Z45jyTwFxgbjK3mMvMFeYqc425ztz4b/9XjrvMPeY+84B5ybxiHjNPmKdMIvOcefHf/q98icxb5h3znvnAfGQ+MUnMF+Yr8435/t/5/yt7EvOH+cv8Y5KxydkUbEo2FZuaTcOmZdOx6dkMbEY2E5uZzcJmZbOx2dkcbE42F5ubzcMiLMpiLM4SLMlSLM0yLMtyLM8KrMhKrMwqrMpqLGB11mBNFrIWa7MO67Ie67MBG7IRG2PzsvnY/GwBtiBbiC3MFmGLssXY4mwJtiRbii3NlmHLsuXY8mwFtiJbia3MVmGrstXY6mwNtiZbi63N1mHrsvXY+mwDtiHbiG3MNmGbss3Y5mwLtiXbim3NtmHbsu3Y9mwHtiPbie3MdmG7st3Y7mwPtifbi+3N9mH7sv3Y/uwAdiA7iB3MDmGHssPY4ewIdiQ7ih3NjmHHsuPY8ewEdiI7iZ3MTmGnstPY6ewMdiY7i53NzmHnsvPY+ewCdiG7iF3MLmGXssvY5ewKdiW7il3NrmHXsuvY9ewGdiO7id3MbmG3stvY7ewOdie7i93N7mH3svvY/Wwce4A9yB5iD7NH2KPsMfY4e4I9yZ5iT7Nn2LPsOTaePc8msBfYi+wl9jJ7hb3KXmOvszfYm+wt9jZ7h73L3mPvsw/Yh+wj9jH7hH3KPmOfsy/Yl+wr9jX7hk1k37Lv2PfsB/Yj+4n9zH5hv7Lf2O/sD/Yn+4v9zSaxf9i/7D82GZecS8Gl5FJxqbk0XFouHZeey8Bl5DJxmbksXFYuG5edy8Hl5HJxubk8HMKhHMbhHMGRHMXRHMOxHMfxnMCJnMTJnMKpnMYBTucMzuQgZ3E253Au53E+F3AhF3ExLi+Xj8vPFeAKcoW4wlwRrihXjCvOleBKcqW40lwZrixXjivPVeAqci5XmavCVeWqcdW5GlxNrhZXm6vD1eXqcfW5BlxDrhHXmGvCNeWacc25FlxLrhXXmmvDteXace25DlxHrhPX+f91fgg3lBvGDeeGcyO5Udxobgw3lhvHjecmcBO5Sdxkbgo3lZvGTedmcDO5Wdxsbg43l5vHzecWcAu5Rdxibgm3lFvGLedWcCu5Vdxqbg23llvHrec2cBu5Tdxmbgu3ldvGbed2cDu5Xdxubg+3l9vH7efiuAPcQe4Qd5g7wh3ljnHHuRPcSe4Ud5o7w53lznHx3HkugbvAXeQucZe5K9xV7hp3nbvB3eRucbe5O9xd7h53n3vAPeQecY+5J9xT7hn3nHvBveReca+5N1wi95Z7x73nPnAfuU/cZ+4L95X7xn3nfnA/uV/cby6J+8P95f5xyfjkfAo+JZ+KT82n4dPy6fj0fAY+I5+Jz8xn4bPy2fjsfA4+J5+Lz83n4REe5TEe5wme5Cme5hme5Tme5wVe5CVe5hVe5TUe8Dpv8CYPeYu3eYd3eY/3+YAP+YiP8Xn5fHx+vgBfkC/EF+aL8EX5YnxxvgRfki/Fl+bL8GX5cnx5vgJfka/EV+ar8FX5anx1vgZfk6/F1+br8HX5enx9vgHfkG/EN+ab8E35ZnxzvgXfkm/Ft+bb8G35dnx7vgPfke/Ed+a78
F35bnx3vgffk+/F9+b78H35fnx/fgA/kB/ED+aH8EP5YfxwfgQ/kh/Fj+bH8GP5cfx4fgI/kZ/ET+an8FP5afx0fgY/k5/Fz+bn8HP5efx8fgG/kF/EL+aX8Ev5ZfxyfgW/kl/Fr+bX8Gv5dfx6fgO/kd/Eb+a38Fv5bfx2fge/k9/F7+b38Hv5ffx+Po4/wB/kD/GH+SP8Uf4Yf5w/wZ/kT/Gn+TP8Wf4cH8+f5xP4C/xF/hJ/mb/CX+Wv8df5G/xN/hZ/m7/D3+Xv8ff5B/xD/hH/mH/CP+Wf8c/5F/xL/hX/mn/DJ/Jv+Xf8e/4D/5H/xH/mv/Bf+W/8d/4H/5P/xf/mk/g//F/+H59MSC6kEFIKqYTUQhohrZBOSC9kEDIKmYTMQhYhq5BNyC7kEHIKuYTcQh4BEVABE3CBEEiBEmiBEViBE3hBEERBEmRBEVRBE4CgC4ZgClCwBFtwBFfwBF8IhFCIhJiQV8gn5BcKCAWFQkJhoYhQVCgmFBdKCCWFUkJpoYxQVignlBcqCBWFSkJloYpQVagmVBdqCDWFWkJtoY5QV6gn1BcaCA2FRkJjoYnQVGgmNBdaCC2FVkJroY3QVmgntBc6CB2FTkJnoYvQVegmdBd6CD2FXkJvoY/QV+gn9BcGCAOFQcJgYYgwVBgmDBdGCCOFUcJoYYwwVhgnjBcmCBOFScJkYYowVZgmTBdmCDOFWcJsYY4wV5gnzBcWCAuFRcJiYYmwVFgmLBdWCCuFVcJqYY2wVlgnrBc2CBuFTcJmYYuwVdgmbBd2CDuFXcJuYY+wV9gn7BfihAPCQeGQcFg4IhwVjgnHhRPCSeGUcFo4I5wVzgnxwnkhQbggXBQuCZeFK8JV4ZpwXbgh3BRuCbeFO8Jd4Z5wX3ggPBQeCY+FJ8JT4ZnwXHghvBReCa+FN0Ki8FZ4J7wXPggfhU/CZ+GL8FX4JnwXfgg/hV/CbyFJ+CP8Ff4JycTkYgoxpZhKTC2mEdOK6cT0YgYxo5hJzCxmEbOK2cTsYg4xp5hLzC3mERERFTERFwmRFCmRFhmRFTmRFwVRFCVRFhVRFTURiLpoiKYIRUu0RUd0RU/0xUAMxUiMiXnFfGJ+sYBYUCwkFhaLiEXFYmJxsYRYUiwllhbLiGXFcmJ5sYJYUawkVhariFXFamJ1sYZYU6wl1hbriHXFemJ9sYHYUGwkNhabiE3FZmJzsYXYUmwlthbbiG3FdmJ7sYPYUewkdha7iF3FbmJ3sYfYU+wl9hb7iH3FfmJ/cYA4UBwkDhaHiEPFYeJwcYQ4UhwljhbHiGPFceJ4cYI4UZwkThaniFPFaeJ0cYY4U5wlzhbniHPFeeJ8cYG4UFwkLhaXiEvFZeJycYW4UlwlrhbXiGvFdeJ6cYO4Udwkbha3iFvFbeJ2cYe4U9wl7hb3iHvFfeJ+MU48IB4UD4mHxSPiUfGYeFw8IZ4UT4mnxTPiWfGcGC+eFxPEC/+f1rwW34iJ4lvxnfhe/CB+FD+Jn8Uv4lfxm/hd/CH+FH+Jv8Uk8Y/4V/wnJpOSSymklFIqKbWURkorpZPSSxmkjFImKbOURcoqZZOySzmknFIuKbeUR0IkVMIkXCIkUqIkWmIkVuIkXhIkUZIkWVIkVdIkIOmSIZkSlCzJlhzJlTzJlwIplCIpJuWV8kn5pQJSQamQVFgqIhWViknFpRJSSamUVFoqI5WVyknlpQpSRamSVFmqIlWVqknVpRpSTamWVFuqI9WV6kn1pQZSQ6mR1FhqIjWVmknNpRZSS6mV1FpqI7WV2kntpQ5SR6mT1FnqInWVukndpR5ST6mX1FvqI/WV+kn9pQHSQGmQNFgaIg2VhknDpRHSSGmUNFoaI42VxknjpQnSRGmSNFmaIk2VpknTpRnSTGmWNFuaI82V5knzpQXSQmmRtFhaIi2VlknLpRXSSmmVtFpaI62V1knrpQ3SRmmTtFn6n19r75B2Sruk3dIeaa+0T9ovxUkHpIPSIemwd
EQ6Kh2TjksnpJPSKem0dEY6K52T4qXzUoJ0QbooXZIuS1ekq9I16bp0Q7op3ZJuS3eku9I96b70QHooPZIeS0+kp9Iz6bn0QnopvZJeS2+kROmt9E56L32QPkqfpM/SF+mr9E36Lv2Qfkq/pN9SkvRH+iv9k5LJyeUUcko5lZxaTiOnldPJ6eUMckY5k5xZziJnlbPJ2eUcck45l5xbziMjMipjMi4TMilTMi0zMitzMi8LsihLsiwrsiprMpB12ZBNGcqWbMuO7Mqe7MuBHMqRHJPzyvnk/HIBuaBcSC4sF5GLysXk4nIJuaRcSi4tl5HLyuXk8nIFuaJcSa4sV5GrytXk6nINuaZcS64t15HryvXk+nIDuaHcSG4sN5Gbys3k5nILuaXcSm4tt5Hbyu3k9nIHuaPcSe4sd5G7yt3k7nIPuafcS+4t95H7yv3k/vIAeaA8SB4sD5GHysPk4fIIeaQ8Sh4tj5HHyuPk8fIEeaI8SZ4sT5GnytPk6fIMeaY8S54tz5HnyvPk+fICeaG8SF4sL5GXysvk5fIKeaW8Sl4tr5HXyuvk9fIGeaO8Sd4sb5G3ytvk7fIOeae8S94t75H3yvvk/XKcfEA+KB+SD8tH5KPyMfm4fEI+KZ+ST8tn5LPyOTlePi8nyBfki/Il+bJ8Rb4qX5Ovyzfkm/It+bZ8R74r35Pvyw/kh/Ij+bH8RH4qP5Ofyy/kl/Ir+bX8Rk6U38rv5PfyB/mj/En+LH+Rv8rf5O/yD/mn/Ev+LSfJf+S/8j85mZJcSaGkVFIpqZU0SlolnZJeyaBkVDIpmZUsSlYlm5JdyaHkVHIpuZU8CqKgCqbgCqGQCqXQCqOwCqfwiqCIiqTIiqKoiqYARVcMxVSgYim24iiu4im+EiihEikxJa+ST8mvFFAKKoWUwkoRpahSTCmulFBKKqWU0koZpaxSTimvVFAqKpWUykoVpapSTamu1FBqKrWU2kodpa5ST6mvNFAaKo2UxkoTpanSTGmutFBaKq2U1kobpa3STmmvdFA6Kp2UzkoXpavSTemu9FB6Kr2U3kofpa/ST+mvDFAGKoOUwcoQZagyTBmujFBGKqOU0coYZawyThmvTFAmKpOUycmTKVOVacp0ZYYyU5mlzFbmKHOVecp8ZYGyUFmkLFaWKEuVZcpyZYWyUlmlrFbWKGuVdcp6ZYOyUdmkbFa2KFuVbcp2ZYeyU9ml7Fb2KHuVfcp+JU45oBxUDimHlSPKUeWYclw5oZxUTimnlTPKWeWcEq+cVxKUC8pF5ZJyWbmiXFWuKdeVG8pN5ZZyW7mj3FXuKfeVB8pD5ZHyWHmiPFWeKc+VF8pL5ZXyWnmjJCpvlXfKe+WD8lH5pHxWvihflW/Kd+WH8lP5pfxWkpQ/yl/ln5JMTa6mUFOqqdTUaho1rZpOTa9mUDOqmdTMahY1q5pNza7mUHOqudTcah4VUVEVU3GVUEmVUmmVUVmVU3lVUEVVUmVVUVVVU4Gqq4ZqqlC1VFt1VFf1VF8N1FCN1JiaV82n5lcLqAXVQmphtYhaVC2mFldLqCXVUmpptYxaVi2nllcrqBXVSmpltYpaVa2mVldrqDXVWmpttY5aV62n1lcbqA3VRmpjtYnaVG2mNldbqC3VVmprtY3aVm2ntlc7qB3VTmpntYvaVe2mdld7qD3VXmpvtY/aV+2n9lcHqAPVQepgdYg6VB2mDldHqCPVUepodYw6Vh2njlcnqBPVSepkdYo6VZ2mTldnqDPVWepsdY46V52nzlcXqAvVRepidYm6VF2mLldXqCvVVepqdY26Vl2nrlc3qBvVTepmdYu6Vd2mbld3qDvVXepudY+6V92n7lfj1APqQfWQelg9oh5Vj6nH1RPqSfWUelo9o55Vz6nx6nk1Qb2gXlQvqZfVK+pV9Zp6Xb2h3lRvqbfVO+pd9Z56X32gPlQfqY/VJ+pT9Zn6XH2hvlRfqa/VN2qi+lZ9p75XP6gf1U/qZ/WL+lX9p
n5Xf6g/1V/qbzVJ/aP+Vf+pybTkWgotpZZKS62l0dJq6bT0WgYto5ZJy6xl0bJq2bTsWg4tp5ZLy63l0RAN1TAN1wiN1CiN1hiN1TiN1wRN1CRN1hRN1TQNaLpmaKYGNUuzNUdzNU/ztUALtUiLaXm1fFp+rYBWUCukFdaKaEW1YlpxrYRWUiulldbKaGW1clp5rYJWUaukVdaqaFW1alp1rYZWU6ul1dbqaHW1elp9rYHWUGukNdaaaE21ZlpzrYXWUmultdbaaG21dlp7rYPWUeukdda6aF21blp3rYfWU+ul9db6aH21flp/bYA2UBukDdaGaEO1YdpwbYQ2UhuljdbGaGO1cdp4bYI2UZukTdamaFO1adp0bYY2U5ulzdbmaHO1edp8bYG2UFukLdaWaEu1ZdpybYW2UlulrdbWaGu1ddp6bYO2Udukbda2aFu1bdp2bYe2U9ul7db2aHu1fdp+LU47oB3UDmmHtSPaUe2Ydlw7oZ3UTmmntTPaWe2cFq+d1xK0C9pF7ZJ2WbuiXdWuade1G9pN7ZZ2W7uj3dXuafe1B9pD7ZH2WHuiPdWeac+1F9pL7ZX2WnujJWpvtXfae+2D9lH7pH3WvmhftW/ad+2H9lP7pf3WkrQ/2l/tn5YMJAcpQEqQCqQGaUBakA6kBxlARpAJZAZZQFaQDWQHOUBOkAvkBnkAAlCAARwQgAQUoAEDWMABHghABBKQgQJUoAEAdGAAE0BgARs4wAUe8EEAQhCBGMgL8oH8oAAoCAqBwqAIKAqKgeKgBCgJSoHSoAwoC8qB8qACqAgqgcqgCqgKqoHqoAaoCWqB2qAOqAvqgfqgAWgIGoHGoAloCpqB5qAFaAlagdagDWgL2oH2oAPoCDqBzqAL6Aq6ge6gB+gJeoHeoA/oC/qB/mAAGAgGgcFgCBgKhoHhYAQYCUaB0WAMGAvGgfFgApgIJoHJYAqYCqaB6WAGmAlmgdlgDpgL5oH5YAFYCBaBxWAJWAqWgeVgBVgJVoHVYA1YC9aB9WAD2Ag2gc1gC9gKtoHtYAfYCXaB3WAP2Av2gf0gDhwAB8EhcBgcAUfBMXAcnAAnwSlwGpwBZ8E5EA/OgwRwAVwEl8BlcAVcBdfAdXAD3AS3wG1wB9wF98B98AA8BI/AY/AEPAXPwHPwArwEr8Br8AYkgrfgHXgPPoCP4BP4DL6Ar+Ab+A5+gJ/gF/gNksAf8Bf8A8n05HoKPaWeSk+tp9HT6un09HoGPaOeSc+sZ9Gz6tn07HoOPaeeS8+t59ERHdUxHdcJndQpndYZndU5ndcFXdQlXdYVXdU1Hei6buimDnVLt3VHd3VP9/VAD/VIj+l59Xx6fr2AXlAvpBfWi+hF9WJ6cb2EXlIvpZfWy+hl9XJ6eb2CXlGvpFfWq+hV9Wp6db2GXlOvpdfW6+h19Xp6fb2B3lBvpDfWm+hN9WZ6c72F3lJvpbfW2+ht9XZ6e72D3lHvpHfWu+hd9W56d72H3lPvpffW++h99X56f32APlAfpA/Wh+hD9WH6cH2EPlIfpY/Wx+hj9XH6eH2CPlGfpE/Wp+hT9Wn6dH2GPlOfpc/W5+hz9Xn6fH2BvlBfpC/Wl+hL9WX6cn2FvlJfpa/W1+hr9XX6en2DvlHfpG/Wt+hb9W36dn2HvlPfpe/W9+h79X36fj1OP6Af1A/ph/Uj+lH9mH5cP6Gf1E/pp/Uz+ln9nB6vn9cT9Av6Rf2Sflm/ol/Vr+nX9Rv6Tf2Wflu/o9/V7+n39Qf6Q/2R/lh/oj/Vn+nP9Rf6S/2V/lp/oyfqb/V3+nv9g/5R/6R/1r/oX/Vv+nf9h/5T/6X/1pP0P/pf/Z+ezEhupDBSGqmM1EYaI62RzkhvZDAyGpmMzEYWI6uRzchu5DByGrmM3EYeAzFQAzNwgzBIgzJogzFYgzN4QzBEQzJkQzFUQzOAoRuGYRrQsIz/vfs3NpoYTYxmRnOjhZEvRaEUrY3WRlujrdHeaG90NDoZn
Y0uRlejm9HN6GH0NHoavY0+Rl+jn9HfGGAMNAYZg40hxhBjmDHMGGGMMEYZo4wxxhhjnDHOmGBMMCYZk4wpxhRjmjHNmGHMMGYZs4w5xhxjnjHPWGAsMBYZi4wlxhJjmbHMWGGsMFYZq4w1xhpjnbHO2GBsMDYZm4wtxhZjm7HN2GHsMHYZu4w9xh5jn7HPiDPijIPGQeOwcdg4ahw1jhvHjZPGSeO0cdo4a5w14o14I8FIMC4aF43LxmXjqnHVuG5cN24aN43bxm3jrnHXuG/cNx4aD43HxmPjqfHMeG68MF4ar4zXxhsj0XhrvDPeGx+Mj8Yn47PxxfhqfDO+Gz+Mn8Yv47eRZPwx/hr/jP8/3y/KpmKqpmYCUzcN0zShaZm26Ziu6Zm+GZihGZkxM6+Zz8xvFjALmoXMwmYRs6hZzCxuljBLmqXM0mYZs6xZzixvVjArmpXMymYVs6pZzaxu1jBrmrXM2mYds65Zz6xvNjAbmo3MxmYTs6nZzGxutjBbmq3M1mYbs63ZzmxvdjA7mp3MzmYXs6vZzexu9jB7mr3M3mYfs6/Zz+xvDjAHmoPMweYQc6g5zBxujjBHmqPM0eYYc6w5zhxvTjAnmpPMyeYUc6o5zZxuzjBnmrPM2eYcc645z5xvLjAXmovMxeYSc6m5zFxurjBXmqvM1eYac625zlxvbjA3mpvMzeYWc6u5zdxu7jB3mrvM3eYec6+5z9xvxpkHzIPmIfOwecQ8ah4zj5snzJPmKfO0ecY8a54z483zZoJ5wbxoXjIvm1fMq+Y187p5w7xp3jJvm3fMu+Y98775wHxoPjIfm0/Mp+Yz87n5wnxpvjJfm2/MRPOt+c58b34wP5qfzM/mF/Or+c38bv4wf5q/zN9mkvnH/Gv+M5PB5DAFTAlTwdQwDUwL08H0MAPMCDPBzDALzAqzwewwB8wJc8HcMA9EIAoxiEMCkpCCNGQgCznIQwGKUIIyVKAKNQigDg1oQggtaEMHutCDPgxgCCMYg3lhPpgfFoAFYSFYGBaBRWExWByWgCVhKVgaloFlYTlYHlaAFWElWBlWgVVhNVgd1oA1YS1YG9aBdWE9WB82gA1hI9gYNoFNYTPYHLaALWEr2Bq2gW1hO9gedoAdYSfYGXaBXWE32B32gD1hL9gb9oF9YT/YHw6AA+EgOBgOgUPhMDgcjoAj4Sg4Go6BY+E4OB5OgBPhJDgZToFT4TQ4Hc6AM+EsOBvOgXPhPDgfLoAL4SK4GC6BS+EyuByugCvhKrgaroFr4Tq4Hm6AG+EmuBlugVvhNrgd7oA74S64G+6Be+E+uB/GwQPwIDwED8Mj8Cg8Bo/DE/AkPAVPwzPwLDwH4+F5mAAvwIvwErwMr8Cr8Bq8Dm/Am/AWvA3vwLvwHrwPH8CH8BF8DJ/Ap/AZfA5fwJfwFXwN38BE+Ba+g+/hB/gRfoKf4Rf4FX6D3+EP+BP+gr9hEvwD/8J/MJmV3EphpbRSWamtNFZaK52V3spgZbQyWZmtLFZWK5uV3cph5bRyWbmtPBZioRZm4RZhkRZl0RZjsRZn8ZZgiZZkyZZiqZZmAUu3DMu0oGVZtuVYruVZvhVYoRVZMSuvlc/KbxWwClqFrMJWEauoVcwqbpWwSlqlrNJWGausVc4qb1WwKlqVrMpWFauqVc2qbtWwalq1rNpWHauuVc+qbzWwGlqNrMZWE6up1cxqbrWwWlqtrNZWG6ut1c5qb3WwOlqdrM5WF6ur1c3qbvWwelq9rN5WH6uv1c/qbw2wBlqDrMHWEGuoNcwabo2wRlqjrNHWGGusNc4ab02wJlqTrMnWFGuqNc2abs2wZlqzrNnWHGuuNc+aby2wFlqLrMXWEmuptcxabq2wVlqrrNXWGmuttc5ab22wNlqbrM3WFmurtc3abu2wdlq7rN3WHmuvtc/ab8VZB6yD1iHrsHXEOmods45bJ6yT1inrtHXGOmuds+Kt81aCdcG6aF2yLltXrKvWNeu6d
cO6ad2yblt3rLvWPeu+9cB6aD2yHltPrKfWM+u59cJ6ab2yXltvrETrrfXOem99sD5an6zP1hfrq/XN+m79sH5av6zfVpL1x/pr/bOS2cntFHZKO5Wd2k5jp7XT2entDHZGO5Od2c5iZ7Wz2dntHHZOO5ed285jIzZqYzZuEzZpUzZtMzZrczZvC7ZoS7ZsK7ZqazawdduwTRvalm3bju3anu3bgR3akR2z89r57Px2AbugXcgubBexi9rF7OJ2CbukXcoubZexy9rl7PJ2BbuiXcmubFexq9rV7Op2DbumXcuubdex69r17Pp2A7uh3chubDexm9rN7OZ2C7ul3cpubbex29rt7PZ2B7uj3cnubHexu9rd7O52D7un3cvubfex+9r97P72AHugPcgebA+xh9rD7OH2CHukPcoebY+xx9rj7PH2BHuiPcmebE+xp9rT7On2DHumPcuebc+x59rz7Pn2AnuhvchebC+xl9rL7OX2Cnulvcpeba+x19rr7PX2BnujvcnebG+xt9rb7O32Dnunvcvebe+x99r77P12nH3APmgfsg/bR+yj9jH7uH3CPmmfsk/bZ+yz9jk73j5vJ9gX7Iv2JfuyfcW+al+zr9s37Jv2Lfu2fce+a9+z79sP7If2I/ux/cR+aj+zn9sv7Jf2K/u1/cZOtN/a7+z39gf7o/3J/mx/sb/a3+zv9g/7p/3L/m0n2X/sv/Y/O5mT3EnhpHRSOamdNE5aJ52T3sngZHQyOZmdLE5WJ5uT3cnh5HRyObmdPA7ioA7m4A7hkA7l0A7jsA7n8I7giI7kyI7iqI7mAEd3DMd0oGM5tuM4ruM5vhM4oRM5MSevk8/J7xRwCjqFnMJOEaeoU8wp7pRwSjqlnNJOGaesU84p71RwKjqVnMpOFaeqU82p7tRwajq1nNpOHaeuU8+p7zRwGjqNnMZOE6ep08xp7rRwWjqtnNZOG6et085p73RwOjqdnM5OF6er083p7vRwejq9nN5OH6ev08/p7wxwBjqDnMHOEGeoM8wZ7oxwRjqjnNHOGGesM84Z70xwJjqTnMnOFGeqM82Z7sxwZjqznNnOHGeuM8+Z7yxwFjqLnMXOEmeps8xZ7qxwVjqrnNXOGmets85Z72xwNjqbnM3OFmers83Z7uxw/me/jb3OPme/E+cccA46h5zDzhHnqHPMOe6ccE46p5zTzhnnrHPOiXfOOwnOBeeic8m57FxxrjrXnOvODeemc8u57dxx7jr3nPvOA+eh88h57DxxnjrPnOfOC+el88p57bxxEp23zjvnvfPB+eh8cj47X5yvzjfnu/PD+en8cn47Sc4f56/zz0nmJndTuCndVG5qN42b1k3npnczuBndTG5mN4ub1c3mZndzuDndXG5uN4+LuKiLubhLuKRLubTLuKzLubwruKIrubKruKqrucDVXcM1Xeharu06rut6ru8GbuhGbszN6+Zz87sF3IJuIbewW8Qt6hZzi7sl3JJuKbe0W8Yt65Zzy7sV3IpuJbeyW8Wt6lZzq7s13JpuLbe2W8et69Zz67sN3IZuI7ex28Rt6jZzm7st3JZuK7e128Zt67Zz27sd3I5uJ7ez28Xt6nZzu7s93J5uL7e328ft6/Zz+7sD3IHuIHewO8Qd6g5zh7sj3JHuKHe0O8Yd645zx7sT3InuJHeyO8Wd6k5zp7sz3JnuLHe2O8ed685z57sL3IXuInexu8Rd6i5zl7sr3JXuKne1u8Zd665z17sb3I3uJnezu8Xd6m5zt7s73J3uLne3u8fd6+5z97tx7gH3oHvIPewecY+6x9zj7gn3pHvKPe2ecc+659x497yb4F5wL7qX3MvuFfeqe8297t5wb7q33NvuHfeue8+97z5wH7qP3MfuE/ep+8x97r5wX7qv3NfuGzfRfeu+c9+7H9yP7if3s/vF/ep+c7+7P9yf7i/3t5vk/nH/uv/cZF5yL4WX0kvlpfbSeGm9dF56L4OX0cvkZfaye
Fm9bF52L4eX08vl5fbyeIiHepiHe4RHepRHe4zHepzHe4InepIne4qnepoHPN0zPNODnuXZnuO5nuf5Xsr/cf7zevm8/F4Br6BXyCvsFfGKesW84l4Jr6RXyivtlfHKeuW88l4Fr6JXyavsVfGqetW86l4Nr6ZXy6vt1fHqevW8+l4Dr6HXyGvsNfGaes285l4Lr6XXymvttfHaeu289l4Hr6PXyevsdfG6et287l4Pr6fXy+vt9fH6ev28/t4Ab6A3yBvsDfGGesO84d4Ib6Q3yhvtjfHGeuO88d4Eb6I3yZvsTfGmetO86d4Mb6Y3y5vtzfHmevO8+d4Cb6G3yFvsLfGWesu85d4Kb6W3ylvtrfHWeuu89d4Gb6O3ydvsbfG2etu87d4xb6e3y9vt7fH2evu8/V6cd8A76B3yDntHvKPeMe+4d8I76Z3yTntnvLPeOS/eO+8leBe8i94l77J3xbvqXfOueze8m94t77Z3x7vr3fPuew+8h94j77H3xHvqPfOeey+8l94r77X3xkv03nrvvPfeB++j98n77H3xvnrfvO/eD++n98v77SV5f7y/3j8vmZ/cT+Gn9FP5qf00flo/nZ/ez+Bn9DP5mf0sflY/m5/dz+Hn9HP5uf08PuKjPubjPuGTPuXTPuOzPufzvuCLvuTLvuKrvuYDX/cN3/Shb/m27/iu7/m+H/ihH/kxP6+fz8/vF/AL+oX8wn4Rv6hfzC/ul/BL+qX80n4Zv6xfzi/vV/Ar+pX8yn4Vv6pfza/u1/Br+rX82n4dv65fz6/vN/Ab+o38xn4Tv6nfzG/ut/Bb+q381n4bv63fzm/vd/A7+p38zn4Xv6vfze/u9/B7+r383n4fv6/fz+/vD/AH+oP8wf4Qf6g/zB/uj/BH+qP80f4Yf6w/zh/vT/An+pP8yf4Uf6o/zZ/uz/Bn+rP82f4cf64/z5/vL/AX+ov8xf4Sf6m/zF/ur/BX+qv81f4af62/zl/vb/A3+pv8zf4Wf6u/zd/u7/B3+rv83f4ef6+/z9/vx/kH/IP+If+wf8Q/6h/zj/sn/JP+Kf+0f8Y/65/z4/3zfoJ/wb/oX/Iv+1f8q/41/7p/w7/p3/Jv+3f8u/49/77/wH/oP/If+0/8p/4z/7n/wn/pv/Jf+2/8RP+t/85/73/wP/qf/M/+F/+r/83/7v/wf/q//N9+kv/H/+v/85MFyYMUQcogVZA6SBOkDdIF6YMMQcYgU5A5yBJkDbIF2YMcQc4gV5A7yBMgARpgAR4QARlQAR0wARtwAR8IgRhIgRwogRpoAQj0wAjMAAZWYKf/H9c+CIIwiIJYkDfIF+QPCgQFg0JB4aBIUDQoFhQPSgQlg1JB6aBMUDYoF5QPKgQVg0pB5aBKUDWoFlQPagQ1g1pB7aBOUDeoF9QPGgQNg0ZB46BJ0DRoFjQPWgQtg1ZB66BN0DZoF7QPOgQdg05B56BL0DXoFnQPegQ9g15B76BP0DfoF/QPBgQDg0HB4GBIMDQYFgwPRgQjg1HB6GBMMDYYF4wPJgQTg0nB5GBKMDWYFkwPZgQzg1nB7GBOMDeYF8wPFgQLg0XB4mBJsDRYFiwPVgQrg1XB6mBNsDZYF6wPNgQbg03B5mBLsDXYFmwPdgQ7g13B7mBPsDfYF+wP4oIDwcHgUHA4OBIcDY4Fx4MTwcngVHA6OBOcDc4F8cH5ICG4EFwMLgWXgyvB1eBacD24EdwMbgW3gzvB3eBecD94EDwMHgWPgyfB0+BZ8Dx4EbwMXgWvgzdBYvA2eBe8Dz4EH4NPwefgS/A1+BZ8D34EP4Nfwe8gKfgT/A3+BcnC5GGKMGWYKkwdpgnThunC9GGGMGOYKcwcZgmzhtnC7GGOMGeYK8wd5gmREA2xEA+JkAypkA6ZkA25kA+FUAylUA6VUA21EIR6aIRmCEMrtEMndEMv9MMgDMMojIV5w3xh/rBAWDAsFBYOi4RFw2Jh8bBEWDIsFZYOy4Rlw3Jh+bBCW
DGsFFYOq4RVw2ph9bBGWDOsFdYO64R1w3ph/bBB2DBsFDYOm4RNw2Zh87BF2DJsFbYO24Rtw3Zh+7BD2DHsFHYOu4Rdw25h97BH2DPsFfYO+4R9w35h/3BAOHDooHBwOCQcGg4Lh4cjwpHhqHB0OCYcG44Lx4cTwonhpHByOCWcGk4Lp4czwpnhrHB2OCecG84L54cLwoXhonBxuCRcGi4Ll4crwpXhqnB1uCZcG64L14cbwo3hpnBzuCXcGm4Lt4c7wp3hrnB3uCfcG+4L94dx4YHwYHgoPBweCY+Gx8Lj4YnwZHgqPB2eCc+G58L48HyYEF4IL4aXwsvhlfBqeC28Ht4Ib4a3wtvhnfBueC+8Hz4IH4aPwsfhk/Bp+Cx8Hr4IX4avwtfhmzAxfBu+C9+HH8KP4afwc/gl/Bp+C7+HP8Kf4a/wd5gU/gn/hv/CZFHyKEWUMkoVpY7SRGmjdFH6KEOUMcoUZY6yRFmjbFH2KEeUM8oV5Y7yREiERliER0RERlRER0zERlzER0IkRlIkR0qkRloEIj0yIjOCkRXZkRO5kRf5URCFURTForxRvih/VCAqGBWKCkdFoqJRsah4VCIqGZWKSkdlorJRuah8VCGqGFWKKkdVoqpRtah6VCOqGdWKakd1orpRvah+1CBqGDWKGkdNoqZRs6h51CJqGbWKWkdtorZRu6h91CHqGHWKOkddoq5Rt6h71CPqGfWKekd9or5Rv6h/NCAaGA2KBkdDoqHRsGh4NCIaGY2KRkdjorHRuGh8NCGaGE2KJkdToqnRtGh6NCOaGc2KZkdzornRvGh+tCBaGC2KFkdLoqXRsmh5tCJaGa2KVkdrorXRumh9tCHaGG2KNkdboq3Rtmh7tCPaGe2Kdkd7or3Rvmh/FBcdiA5Gh6LD0ZHoaHQsOh6diE5Gp6LT0ZnobHQuio/ORwnRhehidCm6HF2JrkbXouvRjehmdCu6Hd2J7kb3ovvRg+hh9Ch6HD2JnkbPoufRi+hl9Cp6Hb2JEqO30bvoffQh+hh9ij5HX6Kv0bfoe/Qj+hn9in5HSdGf6G/0L0oWSx5LEUsZSxVLHUsTSxtLF0sfyxDLGMsUyxzLEssayxbLHssRyxnLFcsdyxNDYmgMi+ExIkbGqBgdY2JsjIvxMSEmxqSYHFNiakyLgZgeM2JmDMasmB1zYm7Mi/mxIBbGolgsljeWL5b//+DBHtc0ZxIAgHaPbds241RVbCfv2LZt27Zte3ps27bt3j/7fJdxDkZhNMZgLAYwiCGMw3hMwERMwmRMwVRMw3TMwEzMwmzMwVzMw3wswEIswmJYLaw2Vgeri9XD6mMNsIZYI6wx1gRrijXDmmMtsJZYK6w11gZri7XD2mMdsI5YJ6wz1gXrinXDumM9sJ5YL6w31gfri/XD+mMDsIHYIGwwNgQbig3DhmMjsJHYKGw0NgYbi43DxmMTsInYJGwyNgWbik3DpmMzsJnYLGw2Ngebi83D5mMLsIXYImwxtgRbii3DlmMrsJXYKmw1tgZbi63D1mMbsI3YJmwztgXbim3DtmM7sJ3YLmw3tgfbi+3D9mMHsINYAnYIO4wdwY5ix7Dj2AnsJHYKO42dwc5i57Dz2AXsInYJu4xdwa5i17Dr2A3sJnYLu43dwe5i97D72APsIfYIe4w9wZ5iz+KfYy+wl9gr7DX2BnuLvcPeYx+wj9gn7DP2BfuKfcO+Yz+wn9gv7Df2B/uL/cMSsTg8Hk+CJ8WT4cnxFHhKPBWeGk+Dp8XT4enxDHhGPBOeGc+CZ8Wz4dnxHHhOPBeeG8+D58Xz4fnxAnhBvBBeGC+CF8WL4cXxEnhJvBReGi+Dl8XL4eXxCnhFvBJeGa+CV8Wr4dXxGnhNHMNxnMBJnMJpnMFZHOAQRziH87iAi7iEy7iCq7iG67iBm7iF27iDu7iH+3iAh3iEx/BaeG28Dl4Xr4fXxxvgDfFGeGO8Cd4Ub4Y3x1vgLfFWeGu8Dd4Wb4e3x
zvgHfFOeGe8C94V74Z3x3vgPfFeeG+8D94X74f3xwfgA/FB+GB8CD4UH4YPx0fgI/FR+Gh8DD4WH4ePxyfgE/FJ+GR8Cj4Vn4ZPx2fgM/FZ+Gx8Dj4Xn4fPxxfgC/FF+GJ8Cb4UX4Yvx1fgK/FV+Gp8Db4WX4evxzfgG/FN+GZ8C74V34Zvx3fgO/Fd+G58D74X34fvxw/gB/EE/BB+GD+CH8WP4cfxE/hJ/BR+Gj+Dn8XP4efxC/hF/BJ+Gb+CX8Wv4dfxG/hN/BZ+G7+D38Xv4ffxB/hD/BH+GH+CP8Wf4c/xF/hL/BX+Gn+Dv8Xf4e/xD/hH/BP+Gf+Cf8W/4d/xH/hP/Bf+G/+D/8X/4Yl4HBFPJCGSEsmI5EQKIiWRikhNpCHSEumI9EQGIiORichMZCGyEtmI7EQOIieRi8hN5CHyEvmI/EQBoiBRiChMFCGKEsWI4kQJoiRRiihNlCHKEuWI8kQFoiJRiahMVCGqEtWI6kQNoiaBEThBECRBETTBECwBCEgggiN4QiBEQiJkQiFUQiN0wiBMwiJswiFcwiN8IiBCIiJiRC2iNlGHqEvUI+oTDYiGRCOiMdGEaEo0I5oTLYiWRCuiNdGGaEu0I9oTHYiORCeiM9GF6Ep0I7oTPYieRC+iN9GH6Ev0I/oTA4iBxCBiMDGEGEoMI4YTI4iRxChiNDGGGEuMI8YTE4iJxCRiMjGFmEpMI6YTM4iZxCxiNjGHmEvMI+YTC4iFxCJiMbGEWEosI5YTK4iVxCpiNbGGWEusI9YTG4iNxCZiM7GF2EpsI7YTO4idxC5iN7GH2EvsI/YTB4iDRAJxiDhMHCGOEseI48QJ4iRxijhNnCHOEueI88QF4iJxibhMXCGuEteI68QN4iZxi7hN3CHuEveI+8QD4iHxiHhMPCGeEs+I58QL4iXxinhNvCHeEu+I98QH4iPxifhMfCG+Et+I78QP4ifxi/hN/CH+Ev+IRCKOjCeTkEnJZGRyMgWZkkxFpibTkGnJdGR6MgOZkcxEZiazkFnJbGR2MgeZk8xF5ibzkHnJfGR+sgBZkCxEFiaLkEXJYmRxsgRZkixFlibLkGXJcmR5sgJZkaxEViarkFXJamR1sgZZk8RInCRIkqRImmRIlgQkJBHJkTwpkCIpkTKpkCqpkTppkCZpkTbpkC7pkT4ZkCEZkTGyFlmbrEPWJeuR9ckGZEOyEdmYbEI2JZuRzckWZEuyFdmabEO2JduR7ckOZEeyE9mZ7EJ2JbuR3ckeZE+yF9mb7EP2JfuR/ckB5EByEDmYHEIOJYeRw8kR5EhyFDmaHEOOJceR48kJ5ERyEjmZnEJOJaeR08kZ5ExyFjmbnEPOJeeR88kF5EJyEbmYXEIuJZeRy8kV5EpyFbmaXEOuJdeR68kN5EZyE7mZ3EJuJbeR28kd5E5yF7mb3EPuJfeR+8kD5EEygTxEHiaPkEfJY+Rx8gR5kjxFnibPkGfJc+R58gJ5kbxEXiavkFfJa+R18gZ5k7xF3ibvkHfJe+R98gH5kHxEPiafkE/JZ+Rz8gX5knxFvibfkG/Jd+R78gP5kfxEfia/kF/Jb+R38gf5k/xF/ib/kH/Jf2QiGUfFU0mopFQyKjmVgkpJpaJSU2motFQ6Kj2VgcpIZaIyU1morFQ2KjuVg8pJ5aJyU3movFQ+Kj9VgCpIFaIKU0WoolQxqjhVgipJlaJKU2WoslQ5qjxVgapIVaIqU1WoqlQ1qjpVg6pJYRROERRJURRNMRRLAQpSiOIonhIokZIomVIoldIonTIok7Iom3Iol/IonwqokIqoGFWLqk3VoepS9aj6VAOqIdWIakw1oZpSzajmVAuqJdWKak21odpS7aj2VAeqI9WJ6kx1obpS3ajuVA+qJ9WL6k31ofpS/aj+1ABqIDWIGkwNoYZSw6jh1AhqJDWKGk2NocZS46jx1ARqIjWJmkxNoaZS06jp1AxqJjWLmk3NoeZS86j51AJqIbWIWkwto
ZZSy6jl1ApqJbWKWk2todZS66j11AZqI7WJ2kxtobZS26jt1A5qJ7WL2k3tofZS+6j91AHqIJVAHaIOU0eoo9Qx6jh1gjpJnaJOU2eos9Q56jx1gbpIXaIuU1eoq9Q16jp1g7pJ3aJuU3eou9Q96j71gHpIPaIeU0+op9Qz6jn1gnpJvaJeU2+ot9Q76j31gfpIfaI+U1+or9Q36jv1g/pJ/aJ+U3+ov9Q/KpGKo+PpJHRSOhmdnE5Bp6RT0anpNHRaOh2dns5AZ6Qz0ZnpLHRWOhudnc5B56Rz0bnpPHReOh+dny5AF6QL0YXpInRRuhhdnC5Bl6RL0aXpMnRZuhxdnq5AV6T/8z9dna5B16QxGqcJmqQpmqYZmqUBDWlEczRPC7RIS7RMK7RKa7ROG7RJW7RNO7RLe7RPB3RIR3SMrkXXpuvQdel6dH26Ad2QbkQ3ppvQTelmdHO6Bd2SbkW3ptvQbel2dHu6A92R7kR3prvQXeludHe6B92T7kX3pvvQfel+dP/ERHogPYgeTA+hh9LD6OH0CHokPYoeTY+hx9Lj6PH0BHoiPYmeTE+hp9LT6On0DHomPYueTc+h59Lz6Pn0AnohvYheTC+hl9LL6OX0CnolvYpeTa+h19Lr6PX0BnojvYneTG+ht9Lb6O30DnonvYveTe+h99L76P30AfognUAfog/TR+ij9DH6OH2CPkmfok/TZ+iz9Dn6PH2Bvkhfoi/TV+ir9DX6On2Dvknfom/Td+i79D36Pv2Afkg/oh/TT+in9DP6Of2Cfkm/ol/Tb+i39Dv6Pf2B/kh/oj/TX+iv9Df6O/2D/kn/on/Tf+i/9D86kY5j4pkkTFImGZOcScGkZFIxqZk0TFomHZOeycBkZDIxmZksTFYmG5OdycHkZHIxuZk8TF4mH5OfKcAUZAoxhZkiTFGmGFOcKcGUZEoxpZkyTFmmHFOeqcBUZCoxlZkqTFWmGlOdqcHUZDAGZwiGZCiGZhiGZQADGcRwDM8IjMhIjMwojMpojM4YjMlYjM04jMt4jM8ETMhETIypxdRm6jB1mXpMfaYB05BpxDRmmjBNmWZMc6YF05JpxbRm2jBtmXZMe6YD05HpxHRmujBdmW5Md6YH05PpxfRm+jB9mX5Mf2YAM5AZxAxmhjBDmWHMcGYEM5IZxYxmxjBjmXHMeGYCM5GZxExmpjBTmWnMdGYGM5OZxcxm5jBzmXnMfGYBs5BZxCxmljBLmWXMcmYFs5JZxaxm1jBrmXXMemYDs5HZxGxmtjBbmW3MdmYHs5PZxexm9jB7mX3MfuYAc5BJYA4xh5kjzFHmGHOcOcGcZE4xp5kzzFnmHHOeucBcZC4xl5krzFXmGnOducHcZG4xt5k7zF3mHnOfecA8ZB4xj5knzFPmGfOcecG8ZF4xr5k3zFvmHfOe+cB8ZD4xn5kvzFfmG/Od+cH8ZH4xv5k/zF/mH5PIxLHxbBI2KZuMTc6mYFOyqdjUbBo2LZuOTc9mYDOymdjMbBY2K5uNzc7mYHOyudjcbB42L5uPzc8WYAuyhdjCbBG2KFuMLc6WYEuypdjSbBm2LFuOLc9WYCuyldjKbBW2KluNrc4mjavJYizOEizJUizNMizLAhayiOVYnhVYkZVYmVVYldVYnTVYk7VYm3VYl/VYnw3YkI3YGFuLrc3WYeuy9dj6bAO2IduIbcw2YZuyzdjmbAu2JduKbc22Yduy7dj2bAe2I9uJ7cx2Ybuy3djubA+2J9uL7c32Yfuy/dj+7AB2IDuIHcwOYYeyw9jh7Ah2JDuKHc2OYcey49jx7AR2IjuJncxOYaey09jp7Ax2JjuLnc3OYeey89j57AJ2IbuIXcwuYZeyy9jl7Ap2JbuKXc2uYdey69j17AZ2I7uJ3cxuYbey29jt7A52J7uL3c3uYfey+9j97AH2IJvAHmIPs0fYo+wx9jh7gj3JnmJPs2fYs+w59jx7gb3IXmIvs1fYq+w19jp7g73J3mJvs
3fYu+w99j77gH3IPmIfs0/Yp+wz9jn7gn3JvmJfs2/Yt+w79j37gf3IfmI/s1/Yr+w39jv7g/3J/mJ/s3/Yv+w/NpGNA/EgCUgKkoHkIAVICVKB1CANSAvSgfQgA8gIMoHMIAvICrKB7CAHyAlygdwgD8gL8oH8oAAoCAqBwqAIKAqKgeKgBCgJSoHSoAwoC8qB8qACqAgqgcqgCqgKqoHqoAaoCTCAAwKQgAI0YAALAIAAAQ7wQAAikIAMFKACDejAACawgA0c4AIP+CAAIYhADNQCtUEdUBfUA/VBA9AQNAKNQRPQFDQDzUEL0BK0Aq1BG9AWtAPtQQfQEXQCnUEX0BV0A91BD9AT9AK9QR/QF/QD/cEAMBAMAoPBEDAUDAPDwQgwEowCo8EYMBaMA+PBBDARTAKTwRQwFUwD08EMMBPMArPBHDAXzAPzwQKwECwCi8ESsBQsA8vBCrASrAKrwRqwFqwD68EGsBFsApvBFrAVbAPbwQ6wE+wCu8EesBfsA/vBAXAQJIBD4DA4Ao6CY+A4OAFOglPgNDgDzoJz4Dy4AC6CS+AyuAKugmvgOrgBboJb4Da4A+6Ce+A+eAAegkfgMXgCnoJn4Dl4AV6CV+A1eAPegnfgPfgAPoJP4DP4Ar6Cb+A7+AF+gl/gN/gD/oJ/IBHEwXiYBCaFyWBymAKmhKlgapgGpoXpYHqYAWaEmWBmmAVmhdlgdpgD5oS5YG6YB+aF+WB+WAAWhIVgYVgEFoXFYHFYApaEpWBpWAaWheVgeVgBVoSVYGVYBVaF1WB1WAPWhBjEIQFJSEEaMpCFAEKIIAd5KEARSlCGClShBnVoQBNa0IYOdKEHfRjAEEYwBmvB2rAOrAvrwfqwAWwIG8HGsAlsCpvB5rAFbAlbwdawDWwL28H2sAPsCDvBzrAL7Aq7we6wB+wJe8HesA/sC/vB/nAAHAgHwcFwCBwKh8HhcAQcCUfB0XAMHAvHwfFwApwIJ8HJcAqcCqfB6XAGnAlnwdlwDpwL58H5cAFcCBfBxXAJXAqXweVwBVwJV8HVcA1cC9fB9XAD3Ag3wc1wC9wKt8HtcAfcCXfB3XAP3Av3wf3wADwIE+AheBgegUfhMXgcnoAn4Sl4Gp6BZ+E5eB5egBfhJXgZXoFX4TV4Hd6AN+EteBvegXfhPXgfPoAP4SP4GD6BT+Ez+By+gC/hK/gavoFv4Tv4Hn6AH+En+Bl+gV/hN/gd/oA/4S/4G/6Bf+E/mAjjUDxKgpKiZCg5SoFSolQoNUqD0qJ0KD3KgDKiTCgzyoKyomwoO8qBcqJcKDfKg/KifCg/KoAKokKoMCqCiqJiqDgqgUqiUqg0KoPKonKoPKqAKqJKqDKqgqqiaqg6qoFqIgzhiEAkohCNGMQigCBCiEM8EpCIJCQjBalIQzoykIksZCMHuchDPgpQiCIUQ7VQbVQH1UX1UH3UADVEjVBj1AQ1Rc1Qc9QCtUStUGvUBrVF7VB71AF1RJ1QZ9QFdUXdUHfUA/VEvVBv1Af1Rf1QfzQADUSD0GA0BA1Fw9BwNAKNRKPQaDQGjUXj0Hg0AU1Ek9BkNAVNRdPQdDQDzUSz0Gw0B81F89B8tAAtRIvQYrQELUXL0HK0Aq1Eq9BqtAatRevQerQBbUSb0Ga0BW1F29B2tAPtRLvQbrQH7UX70H50AB1ECegQOoyOoKPoGDqOTqCT6BQ6jc6gs+gcOo8uoIvoErqMrqCr6Bq6jm6gm+gWuo3uoLvoHrqPHqCH6BF6jJ6gp+gZeo5eoJfoFXqN3qC36B16jz6gj+gT+oy+oK/oG/qOfqCf6Bf6jf6gv+gfSkRxXDyXhEvKJeOScym4lFwqLjWXhkvLpePScxm4jFwmLjOXhcvKZeOyczm4nFwuLjeXh8vL5ePycwW4glwhrjBXhCvKFeOKcyW4klwprjRXhivLlePKcxW4ilwlrjJXhavKVeOqczW4mhzG4RzBkRzF0RzDsRzgIIc4juM5gRM5i
ZM5hVM5jdM5gzM5i7M5h3M5j/O5gAu5iItxtbjaXB2uLlePq8814BpyjbjGXBOuKdeMa8614FpyrbjWXBuuLdeOa8914DpynbjOXBeuK9eN68714HpyvbjeXB+uL9eP688N4AZyg7jB3BBuKDeMG86N4EZyo7jR3BhuLDeOG89N4CZyk7jJ3BRuKjeNm87N4GZys7jZ3BxuLjePm88t4BZyi7jF3BJuKbeMW86t4FZyq7jV3BpuLbeOW89t4DZym7jN3BZuK7eN287t4HZyu7jd3B5uL7eP288d4A5yCdwh7jB3hDvKHeOOcye4k9wp7jR3hjvLnePOcxe4i9wl7jJ3hbvKXeOucze4m9wt7jZ3h7vL3ePucw+4h9wj7jH3hHvKPeOecy+4l9wr7jX3hnvLvePecx+4j9wn7jP3hfvKfeO+cz+4n9wv7jf3h/vL/eMSuTg+nk/CJ+WT8cn5FHxKPhWfmk/Dp+XT8en5DHxGPhOfmc/CZ+Wz8dn5HHxOPhefm8/D5+Xz8fn5AnxBvhBfmC/CF+WL8cX5EnxJvhRfmi/Dl+XL8eX5CnxFvhJfma/CV+Wr8dX5GnxNHuNxnuBJnuJpnuFZHvCQRzzH87zAi7zEy7zCq7zG67zBm7zF27zDu7zH+3zAh3zEx/hafG2+Dl+Xr8fX5xvwDflGfGO+Cd+Ub8Y351vwLflWfGu+Dd+Wb8e35zvwHflOfGe+C9+V78Z353vwPflefG++D9+X78f35wfwA/lB/GB+CD+UH8YP50fwI/lR/Gh+DD+WH8eP5yfwE/lJ/GR+Cj+Vn8ZP52fwM/lZ/Gx+Dj+Xn8fP5xfwC/lF/GJ+Cb+UX8Yv51fwK/lV/Gp+Db+WX8ev5zfwG/lN/GZ+C7+V38Zv53fwO/m4uPi4Pfxefh+/nz/AH+QT+EP8Yf4If5Q/xh/nT/An+VP8af4Mf5Y/x5/nL/AX+Uv8Zf4Kf5W/xl/nb/A3+Vv8bf4Of5e/x9/nH/AP+Uf8Y/4J/5R/xj/nX/Av+Vf8a/4N/5Z/x7/nP/Af+U/8Z/4L/5X/xn/nf/A/+V/8b/4P/5f/xyfycUK8kERIKiQTkgsphJRCKiG1kEZIK6QT0gsZhIxCJiGzkEXIKmQTsgs5hJxCLiG3kEfIK+QT8gsFhIJCIaGwUEQoKhQTigslhJJCKaG0UEYoK5QTygsVhIpCJaGyUEWoKlQTqgs1hJoCJuACIZACJdACI7ACEKCABE7gBUEQBUmQBUVQBU3QBUMwBUuwBUdwBU/whUAIhUiICbWE2kIdoa5QT6gvNBAaCo2ExkIToanQTGgutBBaCq2E1kIboa3QTmgvdBA6Cp2EzkIXoavQTegu9BB6Cr2E3kIfoa/QT+gvDBAGCoOEwcIQYagwTBgujBBGCqOE0cIYYawwThgvTBAmCpOEycIUYaowTZguzBBmCrOE2cIcYa4wT5gvLBAWCouExcISYamwTFgurBBWCquE1cIaYa2wTlgvbBA2CpuEzcIWYauwTdgu7BB2CruE3cIeYa+wT9gvHBAOCgnCIeGwcEQ4KhwTjgsnhJPCKeG0cEY4K5wTzgsXhIvCJeGycEW4KlwTrgs3hJvCLeG2cEe4K9wT7gsPhIfCI+Gx8ER4KjwTngsvhJfCK+G18EZ4K7wT3gsfhI/CJ+Gz8EX4KnwTvgs/hJ/CL+G38Ef4K/wTEoU4MV5MIiYVk4nJxRRiSjGVmFpMI6YV04npxQxiRjGTmFnMImYVs4nZxRxiTjGXmFvMI+YV84n5xQJiQbGQWFgsIhYVi4nFxRJiSbGUWFosI5YVy4nlxQpiRbGSWFmsIlYVq4nVxRpiTRETcZEQSZESaZERWRGIUEQiJ/KiIIqiJMqiIqqiJuqiIZqiJdqiI7qiJ/piIIZiJMbEWmJtsY5YV6wn1hcbiA3FRmJjsYnYVGwmNhdbiC3FVmJrsY3YVmwnthc7iB3FTmJnsYvYVewmdhd7iD3FXmJvsY/YV+wn9hcHi
APFQeJgcYg4VBwmDhdHiCPFUeJocYw4VhwnjhcniBPFSeJkcYo4VZwmThdniDPFWeJscY44V5wnzhcXiAvFReJicUlC2rg4cbm4QlwprhJXi2vEteI6cb24QdwobhI3i1vEreI2cbu4Q9wp7hJ3i3vEveI+cb94QDwoJoiHxMPiEfGoeEw8Lp4QT4qnxNPiGfGseE48L14QL4qXxMviFfGqeE28Lt4Qb4q3xNviHfGueE+8Lz4QH4qPxMfiE/Gp+Ex8Lr4QX4qvxNfiG/Gt+E58L34QP4qfxM/iF/Gr+E38Lv4Qf4q/xN/iH/Gv+E9MFOOkeCmJlFRKJiWXUkgppVRSaimNlPb/aTdQyiRllrJIWaVsUnYph5RTyiXllvJIeaV8Un6pgFRQKiQVlopIRaViUnGphFRSKiWVlspIZaVyUnmpglRRqiRVlqpIVaVqUnWphlRTwiRcIiRSoiRaYiRWAhKUkMRJvCRIoiRJsqRIqqRJumRIpmRJtuRIruRJvhRIoRRJMamWVFuqI9WV6kn1pQZSQ6mR1FhqIjWVmknNpRZSS6mV1FpqI7WV2kntpQ5SR6mT1FnqInWVukndpR5ST6mX1FvqI/WV+kn9pQHSQGmQNFgaIg2VhknDpRHSSGmUNFoaI42VxknjpQnSRGmSNFmaIk2VpknTpRnSTGmWNFuaI82V5knzpQXSQmmRtFhaIi2VlknLpRXSSmmVtFpaI62V1knrpQ3SRmmTtFnaIm2VtknbpR3STmmXtFvaI+2V9kn7pQPSQSlBOiQdlo5IR6Vj0nHphHRSOiWdls5IZ6Vz0nnpgnRRuiRdlq5IV6Vr0nXphnRTuiXdlu5Id6V70n3pgfRQeiQ9lp5IT6Vn0nPphfRSeiW9lt5Ib6V30nvpg/RR+iR9lr5IX6Vv0nfph/RT+iX9lv5If6V/UqIUJ8fLSeSkcjI5uZxCTimnklPLaeS0cjo5vZxBzihnkjPLWeSscjY5u5xDzinnknPLeeS8cj45v1xALigXkgvLReSicjG5uFxCLimXkkvLZeSycjm5vFxBrihXkivLVeSqcjW5ulxDriljMi4TMilTMi0zMisDGcpI5mReFmRRlmRZVmRV1mRdNmRTtmRbdmRX9mRfDuRQjuSYXEuuLdeR68r15PpyA7mh3EhuLDeRm8rN5OZyC7ml3EpuLbeR28rt5PZyB7mj3EnuLHeRu8rd5O5yD7mn3EvuLfeR+8r95P7yAHmgPEgeLA+Rh8rD5OHyCHmkPEoeLY+Rx8rj5PHyBHmiPEmeLE+Rp8rT5OnyDHmmPEueLc+R58rz5PnyAnmhvEheLC+Rl8rL5OXyCnmlvEpeLa+R18rr5PXyBnmjvEneLG+Rt8rb5O3yDnmnvEveLe+R98r75P3yAfmgnCAfkg/LR+Sj8jH5uHxCPimfkk/LZ+Sz8jn5vHxBvihfki/LV+Sr8jX5unxDvinfkm/Ld+S78j35vvxAfig/kh/LT+Sn8jP5ufxCfim/kl/Lb+S38jv5vfxB/ih/kj/LX+Sv8jf5u/xD/in/kn/Lf+S/8j85UY5T4pUkSlIlmZJcSaGkVFIpqZU0SlolnZJeyaBkVDIpmZUsSlYlm5JdyaHkVHIpuZU8Sl4ln5JfKaAUVAophZUiSlGlmFJcKaGUVEoppZUySlmlnFJeqaBUVCoplZUqSlWlmlJdqaHUVDAFVwiFVCiFVhiFVYACFaRwCq8IiqhIiqwoiqpoiq4YiqlYiq04iqt4iq8ESqhESkyppdRW6ih1lXpKfaWB0lBppDRWmihNlWZKc6WF0lJppbRW2ihtlXZKe6WD0lHppHRWuihdlW5Kd6WH0lPppfRW+ih9lX5Kf2WAMlAZpAxWhihDlWHKcGWEMlIZpYxWxihjlXHKeGWCMlGZpExWpihTlWnKdGWGMlOZpcxW5ihzlXnKfGWBslBZpCxWlihLlWXKcmWFslJZpaxW1ihrlXXKemWDslHZpGxWtihblW3Kd
mWHslPZpexW9ih7lX3KfuWAclBJUA4ph5UjylHlmHJcOaGcVE4pp5UzylnlnHJeuaBcVC4pl5UrylXlmnJduaHcVG4pt5U7yl3lnnJfeaA8VB4pj5UnylPlmfJceaG8VF4pr5U3ylvlnfJe+aB8VD4pn5Uvylflm/Jd+aH8VH4pv5U/yl/ln5KoxKnxahI1qZpMTa6mUFOqqdTUaho1rZpOTa9mUDOqmdTMahY1q5pNza7mUHOqudTcah41r5pPza8WUAuqhdTCahG1qFpMLa6WUEuqpdTSahm1rFpOLa9WUCuqldTKahW1qlpNra7WUGuqmIqrhEqqlEqrjMqqQIUqUjmVVwVVVCVVVhVVVTVVVw3VVC3VVh3VVT3VVwM1VCM1ptZSa6t11LpqPbW+2kBtqDZSG6tN1KZqM7W52kJtqbZSW6tt1LZqO7W92kHtqHZSO6td1K5qN7W72kPtqfZSe6t91L5qP7W/OkAdqA5SB6tD1KHqMHW4OkIdqY5SR6tj1LHqOHW8OkGdqE5SJ6tT1KnqNHW6OkOdqc5SZ6tz1LnqPHW+ukBdqC5SF6tL1KXqMnW5ukJdqa5SV6tr1LXqOnW9ukHdqG5SN6tb1K3qNnW7ukPdqe5Sd6t71L3qPnW/ekA9qCaoh9TD6hH1qHpMPa6eUE+qp9TT6hn1rHpOPa9eUC+ql9TL6hX1qnpNva7eUG+qt9Tb6h31rnpPva8+UB+qj9TH6hP1qfpMfa6+UF+qr9TX6hv1rfpOfa9+UD+qn9TP6hf1q/pN/a7+UH+qv9Tf6h/1r/pPTVTjtHgtiZZUS6Yl11JoKbVUWmotjZZWS6el1zJoGbVMWmYti5ZVy6Zl13JoObVcWm4tj5ZXy6fl1wpoBbVCWmGtiFZUK6YV10poJbVSWmmtjFZWK6eV1ypoFbVKWmWtilZVq6ZV12poNTVMwzVCIzVKozVGYzWgQQ1pnMZrgiZqkiZriqZqmqZrhmZqlmZrjuZqnuZrgRZqkRbTamm1tTpaXa2eVl9roDXUGmmNtSZaU62Z1lxrobXUWmmttTZaW62d1l7roHXUOmmdtS5aV62b1l3rofXUemm9tT5aX62f1l8boA3UBmmDtSHaUG2YNlwboY3URmmjtTHaWG2cNl6boE3UJmmTtSnaVG2aNl2boc3UZmmztTnaXG2eNl9boC3UFmmLtSXaUm2Ztlxboa3UVmmrtTXaWm2dtl7boG3UNmmbtS3aVm2btl3boe3Udmm7tT3aXm2ftl87oB3UErRD2mHtiHZUO6Yd105oJ7VT2mntjHZWO6ed1y5oF7VL2mXtinZVu6Zd125oN7Vb2m3tjnZXu6fd1x5oD7VH2mPtifZUe6Y9115oL7VX2mvtjfZWe6e91z5oH7VP2mfti/ZV+6Z9135oP7Vf2m/tj/ZX+6clanF6vJ5ET6on05PrKfSUeio9tZ5GT6un09PrGfSMeiY9s55Fz6pn07PrOfScei49t55Hz6vn0/PrBfSCeiG9sF5EL6oX04vrJfSSeim9tF5GL6uX08vrFfSKeiW9sl5Fr6pX06vrNfSaOqbjOqGTOqXTOqOzOtChjnRO53VBF3VJl3VFV3VN13VDN3VLt3VHd3VP9/VAD/VIj+m19Np6Hb2uXk+vrzfQG+qN9MZ6E72p3kxvrrfQW+qt9NZ6G72t3k5vr3fQO+qd9M56F72r3k3vrvfQe+q99N56H72v3k/vrw/QB+qD9MH6EH2oPkwfro/QR+qj9NH6GH2sPk4fr0/QJ+qT9Mn6FH2qPk2frs/QZ+qz9Nn6HH2uPk+fry/QF+qL9MX6En2pvkxfrq/QV+qr9NX6Gn2tvk5fr2/QN+qb9M36Fn2rvk3fru/Qd+q79N36Hn2vvk/frx/QD+oJ+iH9sH5EP6of04/rJ/ST+in9tH5GP6uf08/rF/SL+iX9sn5Fv6pf06/rN/Sb+i39tn5Hv6vf0+/rD/SH+iP9sf5Ef6o/05/rL/SX+iv9t
f5Gf6u/09/rH/SP+if9s/5F/6p/07/rP/Sf+i/9t/5H/6v/0xP1OCPeSGIkNZIZyY0URkojlZHaSGOkNdIZ6Y0MRkYjk5HZyGJkNbIZ2Y0cRk4jl5HbyGPkNfIZ+Y0CRkGjkFHYKGIUNYoZxY0SRkmjlFHaKGOUNcoZ5Y0KRkWjklHZqGJUNaoZ1Y0aRk0DM3CDMEiDMmiDMVgDGNBABmfwhmCIhmTIhmKohmbohmGYhmXYhmO4hmf4RmCERmTEjFpGbaOOUdeoZ9Q3GhgNjUZGY6OJ0dRoZjQ3WhgtjVZGa6ON0dZoZ7Q3OhgdjU5GZ6OL0dXoZnQ3ehg9jV5Gb6OP0dfoZ/Q3BhgDjUHGYGOIMdQYZgw3RhgjjVHGaGOMMdYYZ4w3JhgTjUnGZGOKMdWYZkw3ZhgzjVnGbGOOMdeYZ8w3FhgLjUXGYmOJsdRYZiw3VhgrjVXGamONsdZYZ6w3NhgbjU3GZmOLsdXYZmw3dhg7jV3GbmOPsdfYZ+w3DhgHjQTjkHHYOGIcNY4Zx40TxknjlHHaOGOcNc4Z540LxkXjknHZuGJcNa4Z140bxk3jlnHbuGPcNe4Z940HxkPjkfHYeGI8NZ4Zz40XxkvjlfHaeGO8Nd4Z740Pxkfjk/HZ+GJ8Nb4Z340fxk/jl/Hb+GP8Nf4ZiUacGW8mMZOayczkZgozpZnKTG2mMdOa6cz0ZgYzo5nJzGxmMbOa2czsZg4zp5nLzG3mMfOa+cz8ZgGzoFnILGwWMYuaxcziZgmzpFnKLG2WMcua5czyZgWzolnJrGxWMaua1czqZg2zpomZuEmYpEmZtMmYrAlMaCKTM3lTMEVTMmVTMVVTM3XTME3TMm3TMV3TM30zMEMzMmNmLbO2Wcesa9Yz65sNzIZmI7Ox2cRsajYzm5stzJZmK7O12cZsa7Yz25sdzI5mJ7Oz2cXsanYzu5s9zJ5mL7O32cfsa/Yz+5sDzIHmIHOwOcQcag4zh5sjzJHmKHO0OcYca44zx5sTzInmJHOyOcWcak4zp5szzJnmLHO2Oceca84z55sLzIXmInOxucRcai4zl5srzJXmKnO1ucZca64z15sbzI3mJnOzucXcam4zt5s7zJ3mLnO3ucfca+4z95sHzINmgnnIPGweMY+ax8zj5gnzpHnKPG2eMc+a58zz5gXzonnJvGxeMa+a18zr5g3zpnnLvG3eMe+a98z75gPzofnIfGw+MZ+az8zn5gvzpfnKfG2+Md+a78z35gfzo/nJ/Gx+Mb+a38zv5g/zp/nL/G3+Mf+a/8xEM86Kt5JYSa1kVnIrhZXSSmWlttJYaa10Vnorg5XRymRltrJYWa1sVnYrh5XTymXltvJYea18Vn6rgFXQKmQVtopYRa1iVnGrhFXSKmWVtspYZa1yVnmrglXRqmRVtqpYVa1qVnWrhlXTwizcIizSoizaYizWAha0kMVZvCVYoiVZsqVYqqVZumVYpmVZtuVYruVZvhVYoRVZMauWVduqY9W16ln1rQZWQ6uR1dhqYjW1mlnNrRZWS6uV1dpqY7W12lntrQ5WR6uT1dnqYnW1ulndrR5WT6uX1dvqY/W1+ln9rQHWQGuQNdgaYg21hlnDrRHWSGuUNdoaY421xlnjrQnWRGuSNdmaYk21plnTrRnWTGuWNduaY8215lnzrQXWQmuRtdhaYi21llnLrRXWSmuVtdpaY6211lnrrQ3WRmuTtdnaYm21tlnbrR3WTmuXtdvaY+219ln7rQPWQSvBOmQdto5YR61j1nHrhHXSOmWdts5YZ61z1nnrgnXRumRdtq5YV61r1nXrhnXTumXdtu5Yd6171n3rgfXQemQ9tp5YT61n1nPrhfXSemW9tt5Yb6131nvrg/XR+mR9tr5YX61v1nfrh/XT+mX9tv5Yf61/VqIVZ8fbSeykdjI7uZ3CTmmnslPbaey0djo7vZ3BzmhnsjPbWeysdjY7u53DzmnnsnPbeey8dj47v13ALmgXs
gvbReyidjG7uF3CLmmXskvbZeyydjm7vF3BrmhXsivbVeyqdjW7ul3DrmljNm4TNmlTNm0zNmsDG9rI5mzeFmzRlmzZVmzV1mzdNmzTtmzbdmzX9mzfDuzQjuyYXcuubdex69r17Pp2A7uh3chubDexm9rN7OZ2C7ul3cpubbex29rt7PZ2B7uj3cnubHexu9rd7O52D7un3cvubfex+9r97P72AHugPcgebA+xh9rD7OH2CHukPcoebY+xx9rj7PH2BHuiPcmebE+xp9rT7On2DHumPcuebc+x59rz7Pn2AnuhvchebC+xl9rL7OX2Cnulvcpeba+x19rr7PX2BnujvcnebG+xt9rb7O32Dnunvcvebe+x99r77P32AfugnWAfsg/bR+yj9jH7uH3CPmmfsk/bZ+yz9jn7vH3Bvmhfsi/bV+yr9jX7un3Dvmnfsm/bd+y79j37vv3Afmg/sh/bT+yn9jP7uf3Cfmm/sl/bb+y39jv7vf3B/mh/sj/bX+yv9jf7u/3D/mn/sn/bf+y/9j870Y5z4p0kTlInmZPcSeGkdFI5qZ00TlonnZPeyeBkdDI5mZ0sTlYnm5PdyeHkdHI5uZ08Tl4nn5PfKeAUdAo5hZ0iTlGnmFPcKeGUdEo5pZ0yTlmnnFPeqeBUdCo5lZ0qTlWnmlPdqeHUdDAHdwiHdCiHdhiHdYADHeRwDu8IjuhIjuwojupoju4YjulYju04jut4ju8ETuhETsyp5dR26jh1nXpOfaeB09Bp5DR2mjhNnWZOc6eF09Jp5bR22jhtnXZOe6eD09Hp5HR2ujhdnW5Od6eH09Pp5fR2+jh9nX5Of2eAM9AZ5Ax2hjhDnWHOcGeEM9IZ5Yx2xjhjnXHOeGeCM9GZ5Ex2pjhTnWnOdGeGM9OZ5cx25jhznXnOfGeBs9BZ5Cx2ljhLnWXOcmeFs9JZ5ax21jhrnXXOemeDs9HZ5Gx2tjhbnW3OdmeHs9PZ5ex29jh7nX3OfueAc9BJcA45h50jzlHnmHPcOeGcdE45p50zzlnnnHPeueBcdC45l50rzlXnmnPdueHcdG45t507zl3nnnPfeeA8dB45j50nzlPnmfPceeG8dF45r503zlvnnfPe+eB8dD45n50vzlfnm/Pd+eH8dH45v50/zl/nn5PoxLnxbhI3qZvMTe6mcFO6qdzUbho3rZvOTe9mcDO6mdzMbhY3q5vNze7mcHO6udzcbh43r5vPze8WcAu6hdzCbhG3qFvMLe6WcEu6pdzSbhm3rFvOLe9WcCu6ldzKbhW3qlvNre7WcGu6mIu7hEu6lEu7jMu6wIUucjmXdwVXdCVXdhVXdTVXdw3XdC3Xdh3XdT3XdwM3dCM35tZya7t13LpuPbe+28Bt6DZyG7tN3KZuM7e528Jt6bZyW7tt3LZuO7e928Ht6HZyO7td3K5uN7e728Pt6fZye7t93L5uP7e/O8Ad6A5yB7tD3KHuMHe4O8Id6Y5yR7tj3LHuOHe8O8Gd6E5yJ7tT3KnuNHe6O8Od6c5yZ7tz3LnuPHe+u8Bd6C5yF7tL3KXuMne5u8Jd6a5yV7tr3LXuOne9u8Hd6G5yN7tb3K3uNne7u8Pd6e5yd7t73L3uPne/e8A96Ca4h9zD7hH3qHvMPe6ecE+6p9zT7hn3rHvOPe9ecC+6l9zL7hX3qnvNve7ecG+6t9zb7h33rnvPve8+cB+6j9zH7hP3qfvMfe6+cF+6r9zX7hv3rfvOfe9+cD+6n9zP7hf3q/vN/e7+cH+6v9zf7h/3r/vPTXTjvHgviZfUS+Yl91J4Kb1UXmovjZfWS+el9zJ4Gb1MXmYvi5fVy+Zl93J4Ob1cXm4vj5fXy+fl9wp4Bb1CXmGviFfUK+YV90p4Jb1SXmmvjFfWK+eV9yp4Fb1KXmWvilfVq+ZV92p4NT3Mwz3CIz3Koz3GYz3gQQ95nMd7gid6kid7iqd6mqd7hmd6lmd7jud6nud7gRd6kRfzanm1vTpeX
a+eV99r4DX0GnmNvSZeU6+Z19xr4bX0WnmtvTZeW6+d197r4HX0OnmdvS5eV6+b193r4fX0enm9vT5eX6+f198b4A30BnmDvSHeUG+YN9wb4Y30RnmjvTHeWG+cN96b4E30JnmTvSneVG+aN92b4c30ZnmzvTneXG+eN99b4C30FnmLvSXeUm+Zt9xb4a30VnmrvTXeWm+dt97b4G30NnmbvS3eVm+bt93b4e30dnm7vT3eXm+ft9874B30ErxD3mHviHfUO+Yd9054J71T3mnvjHfWO+ed9y54F71L3mXvinfVu+Zd9254N71b3m3vjnfXu+fd9x54D71H3mPviffUe+Y99154L71X3mvvjffWe+e99z54H71P3mfvi/fV++Z99354P71f3m/vj/fX++clenF+vJ/ET+on85P7KfyUfio/tZ/GT+un89P7GfyMfiY/s5/Fz+pn87P7Ofycfi4/t5/Hz+vn8/P7BfyCfiG/sF/EL+oX84v7JfySfim/tF/GL+uX88v7FfyKfiW/sl/Fr+pX86v7NfyaPubjPuGTPuXTPuOzPvChj3zO533BF33Jl33FV33N133DN33Lt33Hd33P9/3AD/3Ij/m1/Np+Hb+uX8+v7zfwG/qN/MZ+E7+p38xv7rfwW/qt/NZ+G7+t385v73fwO/qd/M5+F7+r383v7vfwe/q9/N5+H7+v38/v7w/wB/qD/MH+EH+oP8wf7o/wR/qj/NH+GH+sP84f70/wJ/qT/Mn+FH+qP82f7s/wZ/qz/Nn+HH+uP8+f7y/wF/qL/MX+En+pv8xf7q/wV/qr/NX+Gn+tv85f72/wN/qb/M3+Fn+rv83f7u/wd/q7/N3+Hn+vv8/f7x/wD/oJ/iH/sH/EP+of84/7J/yT/in/tH/GP+uf88/7F/yL/iX/sn/Fv+pf86/7N/yb/i3/tn/Hv+vf8+/7D/yH/iP/sf/Ef+o/85/7L/yX/iv/tf/Gf+u/89/7H/yP/if/s//F/+p/87/7P/yf/i//t//H/+v/8xP9uCA+SBIkDZIFyYMUQcogVZA6SBOkDdIF6YMMQcYgU5A5yBJkDbIF2YMcQc4gV5A7yBPkDfIF+YMCQcGgUFA4KBIUDYoFxYMSQcmgVFA6KBOUDcoF5YMKQcWgUlA5qBJUDaoF1YMaQc0AC/CACMiACuiACdgABDBAARfwgRCIgRTIgRKogRbogRGYgRXYgRO4gRf4QRCEQRTEglpB7aBOUDeoF9QPGgQNg0ZB46BJ0DRoFjQPWgQtg1ZB66BN0DZoF7QPOgQdg05B56BL0DXoFnQPegQ9g15B76BP0DfoF/QPBgQDg0HB4GBIMDQYFgwPRgQjg1HB6GBMMDYYF4wPJgQTg0nB5GBKMDWYFkwPZgQzg1nB7GBOMDeYF8wPFgQLg0XB4mBJsDRYFiwPVgQrg1XB6mBNsDZYF6wPNgQbg03B5mBLsDXYFmwPdgQ7g13B7mBPsDfYF+wPDgQHg4TgUHA4OBIcDY4Fx4MTwcngVHA6OBOcDc4F54MLwcXgUnA5uBJcDa4F14Mbwc3gVnA7uBPcDe4F94MHwcPgUfA4eBI8DZ4Fz4MXwcvgVfA6eBO8Dd4F74MPwcfgU/A5+BJ8Db4F34Mfwc/gV/A7+BP8Df4FiUFcGB8mCZOGycLkYYowZZgqTB2mCdOG6cL0YYYwY5gpzBxmCbOG2cLsYY4wZ5grzB3mCfOG+cL8YYGwYFgoLBwWCYuGxcLiYYmwZFgqLB2WCcuG5cLyYYWwYlgprBxWCauG1cLqYY2wZoiFeEiEZEiFdMiEbAhCGKKQC/lQCMVQCuVQCdVQC/XQCM3QCu3QCd3QC/0wCMMwCmNhrbB2WCesG9YL64cNwoZho7Bx2CRsGjYLm4ctwpZhq7B12CZsG7YL24cdwo5hp7Bz2CXsGnYLu4c9wp5hr7B32CfsG/YL+4cDwoHhoHBwOCQcGg4Lh4cjwpHhqHB0OCYcG44Lx4cTwonhp
HByOCWcGk4Lp4czwpnhrHB2OCecG84Li4YLwoXhonBxuCRcGi4Ll4crwpXhqnB1uCZcG64L14cbwo3hpnBzuCXcGm4Lt4c7wp3hrnB3uCfcG+4L94cHwoNhQngoPBweCY+Gx8Lj4YnwZHgqPB2eCc+G58Lz4YXwYngpvBxeCa+G18Lr4Y3wZngrvB3eCe+G98L74YPwYfgofBw+CZ+Gz8Ln4YvwZfgqfB2+Cd+G78L34YfwY/gp/Bx+Cb+G38Lv4Y/wZ/gr/B3+Cf+G/8LEMC6Kj5JESaNkUfIoRZQyShWljtJEaaN0UfooQ5QxyhRljrJEWaNsUfYoR5QzyhXljvJEeaN8Uf6oQFQwKhQVjopERaNiUfGoRFQyKhWVjspEZaNyUfmoQlQxqhRVjqpEVaNqUfWoRlQzwiI8IiIyoiI6YiI2AhGMUMRFfCREYiRFcqREaqRFemREZmRFduREbuRFfhREYRRFsahWVDuqE9WN6kX1owZRw6hR1DhqEjWNmkXNoxZRy6hV1DpqE7WN2kXtow5Rx6hT1DnqEnWNukXdox5Rz6hX1DvqE/WN+kX9owHRwGhQNDgaEg2NhkXDoxHRyGhUNDoaE42NxkXjownRxGhSNDmaEk2NpkXToxnRzGhWNDuaE82N5kXzowXRwmhRtDhaEi2NlkXLoxXRymhVtDpaE62N1kXrow3RxmhTtDnaEm2NtkXbox3RzmhXtDvaE+2N9kX7owPRwSghOhQdjo5ER6Nj0fHoRHQyOhWdjs5EZ6Nz0fnoQnQxuhRdjq5EV6Nr0fXoRnQzuhXdju5Ed6N70f3oQfQwehQ9jp5ET6Nn0fPoRfQyehW9jt5Eb6N30fvoQ/Qx+hR9jr5EX6Nv0ffoR/Qz+hX9jv5Ef6N/UWIUF4uPJYkljSWLJY+liKWMpYqljqWJpY2li6WPZYhljGWKZY5liWWNZYtlj+WI5YzliuWO5YnljeWL5Y8ViBWMFYoVjhWJFY0VixWPlYiVjJWKlY6ViZWNlYuVj1WIVYxVilWOVYlVjVWLVY/ViNWMYTE8RsTIGBWjY0yMjYEYjKH/sWyP7XklAQBAU9u2bdtIbVzbxszct7Zt27ZtW6lt281+2d9xnoM1w5pjLbCWWDzWCmuNtcHaYu2w9lgHrCPWCeuMdcG6Yt2w7lgPrCeGYThGYCRGYTTGYCzGYTwmYCImYTKmYCqmYTpmYCZmYTbmYC7mYT4WYCEGMIghLMJiWC+sN9YH64v1w/pjA7CB2CBsMDYEG4oNw4ZjI7CR2ChsNDYGG4uNw8ZjE7CJ2CRsMjYFm4pNw6ZjM7CZ2CxsNjYHm4vNw+ZjC7CF2CJsMbYEW4otw5ZjK7CV2CpsNbYGW4utw9ZjG7CN2CZsM7YF24ptw7ZjO7Cd2C5sN7YH24vtw/ZjB7CD2CHsMHYEO4odw45jJ7CT2CnsNHYGO4udw85jF7CL2CXsMnYFu4pdw65jN7AE7CZ2C7uN3cHuYvew+9gD7CH2CHuMPcGeYs+w59gL7CX2CnuNvcHeYu+w99gH7CP2CfuMfcG+Yt+w79gP7Cf2C/uN/cH+Yv+wRCwOT4InxZPhyfEUeEo8FZ4aT4OnxdPh6fEMeEY8E54Zz4JnxbPh2fEceE48F54bz4PnxfPh+fECeEG8EF4YL4IXxYvhxfESeEm8FF4aL4OXxcvh5fEKeEW8El4Zr4JXxavh1fEaeE28Fl4br4PXxevh9fEGeEO8Ed4Yb4I3xZvhzfEWeEs8Hm+Ft8bb4G3xdnh7vAPeEe+Ed8a74F3xbnh3vAfeE8dwHCdwEqdwGmdwFudwHhdwEZdwGVdwFddwHTdwE7dwG3dwF/dwHw/wEAc4xBEe4TG8F94b74P3xfvh/fEB+EB8ED4YH4IPxYfhw/ER+Eh8FD4aH4OPxcfh4/EJ+ER8Ej4Zn4JPxafh0/EZ+Ex8Fj4bn4PPxefh8/EF+EJ8Eb4YX4IvxZfhy/EV+Ep8Fb4aX4Ovxdfh6/EN+
EZ8E74Z34Jvxbfh2/Ed+E58F74b34Pvxffh+/ED+EH8EH4YP4IfxY/hx/ET+En8FH4aP4Ofxc/h5/EL+EX8En4Zv4Jfxa/h1/EbeAJ+E7+F38bv4Hfxe/h9/AH+EH+EP8af4E/xZ/hz/AX+En+Fv8bf4G/xd/h7/AP+Ef+Ef8a/4F/xb/h3/Af+E/+F/8b/4H/xf3giHkckIZISyYjkRAoiJZGKSE2kIdIS6Yj0RAYiI5GJyExkIbIS2YjsRA4iJ5GLyE3kIfIS+Yj8RAGiIFGIKEwUIYoSxYjiRAmiJFGKKE2UIcoS5YjyRAWiIlGJqExUIaoS1YjqRA2iJlGLqE3UIeoS9Yj6RAOiIdGIaEw0IZoSzYjmRAuiJRFPtCJaE22ItkQ7oj3RgehIdCI6E12IrkQ3ojvRg+hJYAROEARJUARNMARLcARPCIRISIRMKIRKaIROGIRJWIRNOIRLeIRPBERIAAISiIiIGNGL6E30IfoS/Yj+xABiIDGIGEwMIYYSw4jhxAhiJDGKGE2MIcYS44jxxARiIjGJmExMIaYS04jpxAxiJjGLmE3MIeYS84j5xAJiIbGIWEwsIZYSy4jlxApiJbGKWE2sIdYS64j1xAZiI7GJ2ExsIbYS24jtxA5iJ7GL2E3sIfYS+4j9xAHiIHGIOEwcIY4Sx4jjxAniJHGKOE2cIc4S54jzxAXiInGJuExcIa4S14jrxA0igbhJ3CJuE3eIu8Q94j7xgHhIPCIeE0+Ip8Qz4jnxgnhJvCJeE2+It8Q74j3xgfhIfCI+E1+Ir8Q34jvxg/hJ/CJ+E3+Iv8Q/IpGII5OQSclkZHIyBZmSTEWmJtOQacl0ZHoyA5mRzERmJrOQWclsZHYyB5mTzEXmJvOQecl8ZH6yAFmQLEQWJouQRcliZHGyBFmSLEWWJsuQZclyZHmyAlmRrERWJquQVclqZHWyBlmTrEXWJuuQdcl6ZH2yAdmQbEQ2JpuQTclmZHOyBdmSjCdbka3JNmRbsh3ZnuxAdiQ7kZ3JLmRXshvZnexB9iQxEicJkiQpkiYZkiU5kicFUiQlUiYVUiU1UicN0iQt0iYd0iU90icDMiQBCUlERmSM7EX2JvuQfcl+ZH9yADmQHEQOJoeQQ8lh5HByBDmSHEWOJseQY8lx5HhyAjmRnEROJqeQU8lp5HRyBjmTnEXOJueQc8l55HxyAbmQXEQuJpeQS8ll5HJyBbmSXEWuJteQa8l15HpyA7mR3ERuJreQW8lt5HZyB7mT3EXuJveQe8l95H7yAHmQPEQeJo+QR8lj5HHyBHmSPEWeJs+QZ8lz5HnyAnmRvEReJq+QV8lr5HXyBplA3iRvkbfJO+Rd8h55n3xAPiQfkY/JJ+RT8hn5nHxBviRfka/JN+Rb8h35nvxAfiQ/kZ/JL+RX8hv5nfxB/iR/kb/JP+Rf8h+ZSMYlj4tLSiWjklMpqJRUKio1lYZKS6Wj0lMZqIxUJiozlYXKSmWjslM5qJxULio3lYfKS+Wj8lMFqIJUIaowVYQqShWjilMlqJJUKao0VYYqS5WjylMVqIpUJaoyVYWqSlWjqlM1qJpULao2VYeqS9Wj6lMNqIZUI6ox1YRqSjWjmlMtqJZUPNWKak21odpS7aj2VAeqI9WJ6kx1obpS3ajuVA+qJ4VROEVQJEVRNMVQLMVRPCVQIiVRMqVQKqVROmVQJmVRNuVQLuVRPhVQIQUoSCEqomJUL6o31YfqS/Wj+lMDqIHUIGowNYQaSg2jhlMjqJHUKGo0NYYaS42jxlMTqInUJGoyNYWaSk2jplMzqJnULGo2NYeaS82j5lMLqIXUImoxtYRaSi2jllMrqJXUKmo1tYZaS62j1lMbqI3UJmoztYXaSm2jtlM7qJ3ULmo3tYfaS+2j9lMHqIPUIeowdYQ6Sh2jjlMnqJPUKeo0dYY6S52jzlMXqIvUJeoydYW6Sl2jrlM3qATqJnWLuk3doe5S96j71APqIfWIekw9o
Z5Sz6jn1AvqJfWKek29od5S76j31AfqI/WJ+kx9ob5S36jv1A/qJ/WL+k39of5S/6hEKo5OQielk9HJ6RR0SjoVnZpOQ6el09Hp6Qx0RjoTnZnOQmels9HZ6Rx0TjoXnZvOQ+el89H56QJ0QboQXZguQheli9HF6RJ0SboUXZouQ5ely9Hl6Qp0RboSXZmuQlelq9HV6Rp0TboWXZuuQ9el69H16QZ0Q7oR3ZhuQjelm9HN6RZ0SzqebkW3ptvQbel2dHu6A92R7kR3prvQXeludHe6B92TxmicJmiSpmiaZmiW5mieFmiRlmiZVmiV1midNmiTtmibdmiX9mifDuiQBjSkER3RMboX3ZvuQ/el+9H96QH0QHoQPZgeQg+lh9HD6RH0SHoUPZoeQ4+lx9Hj6Qn0RHoSPZmeQk+lp9HT6Rn0THoWPZueQ8+l59Hz6QX0QnoRvZheQi+ll9HL6RX0SnoVvZpeQ6+l19Hr6Q30RnoTvZneQm+lt9Hb6R30TnoXvZveQ++l99H76QP0QfoQfZg+Qh+lj9HH6RP0SfoUfZo+Q5+lz9Hn6Qv0RfoSfZm+Ql+lr9HX6Rt0An2TvkXfpu/Qd+l79H36Af2QfkQ/pp/QT+ln9HP6Bf2SfkW/pt/Qb+l39Hv6A/2R/kR/pr/QX+lv9Hf6B/2T/kX/pv/Qf+l/dCIdxyRhkjLJmORMCiYlk4pJzaRh0jLpmPRMBiYjk4nJzGRhsjLZmOxMDiYnk4vJzeRh8jL5mPxMAaYgU4gpzBRhijLFmOJMCaYkU4opzZRhyjLlmPJMBaYiU4mpzFRhqjLVmOpMDaYmU4upzdRh6jL1mPpMA6Yh04hpzDRhmjLNmOZMC6YlE8+0YlozbZi2TDumPdOB6ch0YjozXZiuTDemO9OD6clgDM4QDMlQDM0wDMtwDM8IjMhIjMwojMpojM4YjMlYjM04jMt4jM8ETMgABjKIiZgY04vpzfRh+jL9mP7MAGYgM4gZzAxhhjLDmOHMCGYkM4oZzYxhxjLjmPHMBGYiM4mZzExhpjLTmOnMDGYmM4uZzcxh5jLzmPnMAmYhs4hZzCxhljLLmOXMCmYls4pZzaxh1jLrmPXMBmYjs4nZzGxhtjLbmO3MDmYns4vZzexh9jL7mP3MAeYgc4g5zBxhjjLHmOPMCeYkc4o5zZxhzjLnmPPMBeYic4m5zFxhrjLXmOvMDSaBucncYm4zd5i7zD3mPvOAecg8Yh4zT5inzDPmOfOCecm8Yl4zb5i3zDvmPfOB+ch8Yj4zX5ivzDfmO/OD+cn8Yn4zf5i/zD8mkYljk7BJ2WRscjYFm5JNxaZm07Bp2XRsejYDm5HNxGZms7BZ2WxsdjYHm5PNxeZm87B52XxsfrYAW5AtxBZmi7BF2WJscbYEW5ItxZZmy7Bl2XJsebYCW5GtxFZmq7BV2WpsdbYGW5OtxdZm67B12XpsfbYB25BtxDZmm7BN2WZsc7YF25KNZ1uxrdk2bFu2Hdue7cB2ZDuxndkubFe2G9ud7cH2ZDEWZwmWZCmWZhmWZTmWZwVWZCVWZhVWZTVWZw3WZC3WZh3WZT3WZwM2ZAELWcRGbIztxfZm+7B92X5sf3YAO5AdxA5mh7BD2WHscHYEO5IdxY5mx7Bj2XHseHYCO5GdxE5mp7BT2WnsdHYGO5Odxc5m57Bz2XnsfHYBu5BdxC5ml7BL2WXscnYFu5Jdxa5m17Br2XXsenYDu5HdxG5mt7Bb2W3sdnYHu5Pdxe5m97B72X3sfvYAe5A9xB5mj7BH2WPscfYEe5I9xZ5mz7Bn2XPsefYCe5G9xF5mr7BX2WvsdfYGm8DeZG+xt9k77F32HnuffcA+ZB+xj9kn7FP2GfucfcG+ZF+xr9k37Fv2Hfue/cB+ZD+xn9kv7Ff2G/ud/cH+ZH+xv9k/7F/2H5vIxnFJuKRcMi45l4JLyaXiUnNpuLRcOi49l4HLyGXiMnNZuKxcNi47l4PLyeXicnN5u
LxcPi4/V4AryBXiCnNFuKJcMa44V4IryZXiSnNluLJcOa48V4GryFXiKnNVuKpcNa46V4OrydXianN1uLpcPa4+14BryDXiGnNNuKZcM64514JrycVzrbjWXBuuLdeOa8914DpynbjOXBeuK9eN68714HpyGIdzBEdyFEdzDMdyHMdzAidyEidzCqdyGqdzBmdyFmdzDudyHudzARdygIMc4iIuxvXienN9uL5cP64/N4AbyA3iBnNDuKHcMG44N4IbyY3iRnNjuLHcOG48N4GbyE3iJnNTuKncNG46N4Obyc3iZnNzuLncPG4+t4BbyC3iFnNLuKXcMm45t4Jbya3iVnNruLXcOm49t4HbyG3iNnNbuK3cNm47t4Pbye3idnN7uL3cPm4/d4A7yB3iDnNHuKPcMe44d4I7yZ3iTnNnuLPcOe48d4G7yF3iLnNXuKvcNe46d4NL4G5yt7jb3B3uLnePu8894B5yj7jH3BPuKfeMe8694F5yr7jX3BvuLfeOe8994D5yn7jP3BfuK/eN+8794H5yv7jf3B/uL/ePS+Ti+CR8Uj4Zn5xPwafkU/Gp+TR8Wj4dn57PwGfkM/GZ+Sx8Vj4bn53Pwefkc/G5+Tx8Xj4fn58vwBfkC/GF+SJ8Ub4YX5wvwZfkS/Gl+TJ8Wb4cX56vwFfkK/GV+Sp8Vb4aX52vwdfka/G1+Tp8Xb4eX59vwDfkG/GN+SZ8U74Z35xvwbfk4/lWfGu+Dd+Wb8e35zvwHflOfGe+C9+V78Z353vwPXmMx3mCJ3mKp3mGZ3mO53mBF3mJl3mFV3mN13mDN3mLt3mHd3mP9/mAD3nAQx7xER/je/G9+T58X74f358fwA/kB/GD+SH8UH4YP5wfwY/kR/Gj+TH8WH4cP56fwE/kJ/GT+Sn8VH4aP52fwc/kZ/Gz+Tn8XH4eP59fwC/kF/GL+SX8Un4Zv5xfwa/kVyX9P0/x6/kN/EZ+E7+Z38Jv5bfx2/kd/E5+F7+b38Pv5ffx+/kD/EH+EH+YP8If5Y/xx/kT/En+FH+aP8Of5c/x5/kL/EX+En+Zv8Jf5a/x1/kbfAJ/k7/F3+bv8Hf5e/x9/gH/kH/EP+af8E/5Z/xz/gX/kn/Fv+bf8G/5d/x7/gP/kf/Ef+a/8F/5b/x3/gf/k//F/+b/8H/5f3wiHyckEZIKyYTkQgohpZBKSC2kEdIK6YT0QgYho5BJyCxkEbIK2YTsQg4hp5BLyC3kEfIK+YT8QgGhoFBIKCwUEYoKxYTiQgmhpFBKKC2UEcoK5YTyQgWholBJqCxUEaoK1YTqQg2hplBLqC3UEeoK9YT6QgOhodBIaCw0EZoKzYTmQguhpRAvtBJaC22EtkI7ob3QQegodBI6C12ErkI3obvQQ+gpYAIuEAIpUAItMAIrcAIvCIIoSIIsKIIqaIIuGIIpWIItOIIreIIvBEIoAAEKSIiEmNBL6C30EfoK/YT+wgBhoDBIGCwMEYYKw4ThwghhpDBKGC2MEcYK44TxwgRhojBJmCxMEaYK04TpwgxhpjBLmC3MEeYK84T5wgJhobBIWCwsEZYKy4TlwgphpbBKWC2sEdYK64T1wgZho7BJ2CxsEbYK24Ttwg5hp7BL2C3sEfYK+4T9wgHhoHBIOCwcEY4Kx4TjwgnhpHBKOC2cEc4K54TzwgXhonBJuCxcEa4K14Trwg0hQbgp3BJuC3eEu8I94b7wQHgoPBIeC0+Ep8Iz4bnwQngpvBJeC2+Et8I74b3wQfgofBI+C1+Er8I34bvwQ/gp/BJ+C3+Ev8I/IVGIE5OIScVkYnIxhZhSTCWmFtOIacV0Ynoxg5hRzCRmFrOIWcVsYnYxh5hTzCXmFvOIecV8Yn6xgFhQLCQWFouIRcViYnGxhFhSLCWWFsuIZcVyYnmxglhRrCRWFquIVcVqYnWxhlhTrCXWFuuIdcV6Yn2xgdhQbCQ2FpuITcVmYnOxhdhSjBdbia3FNmJbsZ3YXuwgdhQ7i
Z3FLmJXsZvYXewh9hQxERcJkRQpkRYZkRU5kRcFURQlURYVURU1URcN0RQt0RYd0RU90RcDMRSBCEUkRmJM7CX2FvuIfcV+Yn9xgDhQHCQOFoeIQ8Vh4nBxhDhSHCWOFseIY8Vx4nhxgjhRnCROFqeIU8Vp4nRxhjhTnCXOFueIc8V54nxxgbhQXCQuFpeIS8Vl4nJxhbhSXCWuFteIa8V14npxg7hR3CRuFreIW8Vt4nZxh7hT3CXuFveIe8V94n7xgHhQPCQeFo+IR8Vj4nHxhHhSPCWeFs+IZ8Vz4nnxgnhRvCReFq+IV8Vr4nXxhpgg3hRvibfFO+Jd8Z54X3wgPhQfiY/FJ+JT8Zn4XHwhvhRfia/FN+Jb8Z34XvwgfhQ/iZ/FL+JX8Zv4Xfwh/hR/ib/FP+Jf8Z+YKMZJSaSkUjIpuZRCSimlklJLaaS0UjopvZRByihlkjJLWaSsUjYpu5RDyinlknJLeaS8Uj4pv1RAKigVkgpLRaSiUjGpuFRCKimVkkpLZaSyUjmpvFRBqihVkipLVaSqUjWpulRDqinVkmpLdaS6Uj2pvtRAaig1khpLTaSmUjOpudRCainFS62k1lIbqa3UTmovdZA6Sp2kzlIXqavUTeou9ZB6SpiES4RESpRES4zESpzES4IkSpIkS4qkSpqkS4ZkSpZkS47kSp7kS4EUSkCCEpIiKSb1knpLfaS+Uj+pvzRAGigNkgZLQ6Sh0jBpuDRCGimNkkZLY6Sx0jhpvDRBmihNkiZLU6Sp0jRpujRDminNkmZLc6S50jxpvrRAWigtkhZLS6Sl0jJpubRCWimtklZLa6S10jppvbRB2ihtkjZLW6St0jZpu7RD2intknZLe6S90j5pv3RAOigdkg5LR6Sj0jHpuHRCOimdkk5LZ6Sz0jnpvHRBuihdki5LV6Sr0jXpunRDSpBuSrek29Id6a50T7ovPZAeSo+kx9IT6an0THouvZBeSq+k19Ib6a30TnovfZA+Sp+kz9IX6av0Tfou/ZB+Sr+k39If6a/0T0qU4uQkclI5mZxcTiGnlFPJqeU0clo5nZxeziBnlDPJmeUsclY5m5xdziHnlHPJueU8cl45n5xfLiAXlAvJheUiclG5mFxcLiGXlEvJpeUyclm5nFxeriBXlCvJleUqclW5mlxdriHXlGvJteU6cl25nlxfbiA3lBvJjeUmclO5mdxcbiG3lOMTEhMT5TZyW7md3F7uIHeUO8md5S5yV7mb3F3uIfeUMRmXCZmUKZmWGZmVOZmXBVmUJVmWFVmVNVmXDdmULdmWHdmVPdmXAzmUgQxlJEdyTO4l95b7yH3lfnJ/eYA8UB4kD5aHyEPlYfJweYQ8Uh4lj5bHyGPlcfJ4eYI8UZ4kT5anyFPlafJ0eYY8U54lz5bnyHPlefJ8eYG8UF4kL5aXyEvlZfJyeYW8Ul4lr5bXyGvldfJ6eYO8Ud4kb5a3yFvlbfJ2eYe8U94l75b3yHvlffJ++YB8UD4kH5aPyEflY/Jx+YR8Uj4ln5bPyGflc/J5+YJ8Ub4kX5avyFfla/J1+YacIN+Ub8m35TvyXfmefF9+ID+UH8mP5SfyU/mZ/Fx+Ib+UX8mv5TfyW/md/F7+IH+UP8mf5S/yV/mb/F3+If+Uf8m/5T/yX/mfnCjHKUmUpEoyJbmSQkmppFJSK2mUtEo6Jb2SQcmoZFIyK1mUrEo2JbuSQ8mp5FJyK3mUvEo+Jb9SQCmoFFIKK0WUokoxpbhSQimplFJKK2WUsko5pbxSQamoVFIqK1WUqko1pbpSQ6mp1FJqK3WUuko9pb7SQGmoNFIaK02UpkozpbnSQmmpxCutlNZKG6Wt0k5pr3RQOiqdlM5KF6Wr0k3prvRQeiqYgiuEQiqUQiuMwiqcwiuCIiqSIiuKoiqaoiuGYiqWYiuO4iqe4iuBEipAgQpSIiWm9FJ6K32Uvko/pb8yQBmoDFIGK0OUocowZbgyQhmpjFJGK2OUs
co4ZbwyQZmoTFImK1OUqco0ZboyQ5mpzFJmK3OUuco8Zb6yQFmoLFIWK0uUpcoyZbmyQlmprFJWK2uUtco6Zb2yQdmobFI2K1uUrco2ZbuyQ9mp7FJ2K3uUvco+Zb9yQDmoHFIOK0eUo8ox5bhyQjmpnFJOK2eUs8o55bxyQbmoXFIuK1eUq8o15bpyQ0lQbiq3lNvKHeWuck+5rzxQHiqPlMfKE+Wp8kx5rrxQXiqvlNfKG+Wt8k55r3xQPiqflM/KF+Wr8k35rvxQfiq/lN/KH+Wv8k9JVOLUJGpSNZmaXE2hplRTqanVNGpaNZ2aXs2gZlQzqZnVLGpWNZuaXc2h5lRzqbnVPGpeNZ+aXy2gFlQLqYXVImpRtZhaXC2hllRLqaXVMmpZtZxaXq2gVlQrqZXVKmpVtZpaXa2h1lRrqbXVOmpdtZ5aX22gNlQbqY3VJmpTtZnaXG2htlTj1VZqa7WN2lZtp7ZXO6gd1U5qZ7WL2lXtpnZXe6g9VUzFVUIlVUqlVUZlVU7lVUEVVUmVVUVVVU3VVUM1VUu1VUd1VU/11UANVaBCFamRGlN7qb3VPmpftZ/aXx2gDlQHqYPVIepQdZg6XB2hjlRHqaPVMepYdZw6Xp2gTlQnqZPVKepUdZo6XZ2hzlRnqbPVOepcdZ46X12gLlQXqYvVJepSdZm6XF2hrlRXqavVNepadZ26Xt2gblQ3qZvVLepWdZu6Xd2h7lR3qbvVPepedZ+6Xz2gHlQPqYfVI+pR9Zh6XD2hnlRPqafVM+pZ9Zx6Xr2gXlQvqZfVK+pV9Zp6Xb2hJqg31VvqbfWOele9p95XH6gP1UfqY/WJ+lR9pj5XX6gv1Vfqa/WN+lZ9p75XP6gf1U/qZ/WL+lX9pn5Xf6g/1V/qb/WP+lf9pyaqcVoSLamWTEuupdBSaqm01FoaLa2WTkuvZdAyapm0zFoWLauWTcuu5dByarm03FoeLa+WT8uvFdAKaoW0wloRrahWTCuuldBKaqW00loZraxWTiuvVdAqapW0yloVrapWTauu1dBqarW02lodra5WT6uvNdAaao20xloTranWTGuutdBaavFaK6211kZrq7XT2msdtI5aJ62z1kXrqnXTums9tJ4apuEaoZEapdEao7Eap/GaoImapMmaoqmapumaoZmapdmao7map/laoIUa0KCGtEiLab203lofra/WT+uvDdAGaoO0wdoQbag2TBuujdBGaqO00doYbaw2ThuvTdAmapO0ydoUbao2TZuuzdBmarO02docba42T5uvLdAWaou0xdoSbam2TFuurdBWaqu01doaba22TluvbdA2apu0zdoWbau2Tduu7dB2aru03doeba+2T9uvHdAOaoe0w9oR7ah2TDuundBOaqe009oZ7ax2TjuvXdAuape0y9oV7ap2Tbuu3dAStJvaLe22dke7q93T7msPtIfaI+2x9kR7qj3TnmsvtJfaK+219kZ7q73T3msftI/aJ+2z9kX7qn3Tvms/tJ/aL+239kf7q/3TErU4PYmeVE+mJ9dT6Cn1VHpqPY2eVk+np9cz6Bn1THpmPYueVc+mZ9dz6Dn1XHpuPY+eV8+n59cL6AX1QnphvYheVC+mF9dL6CX1UnppvYxeVi+nl9cr6BX1SnplvYpeVa+mV9dr6DX1WnptvY5eV6+n19cb6A31RnpjvYneVG+mN9db6C31eL2V3lpvo7fV2+nt9Q56R72T3lnvonfVu+nd9R56Tx3TcZ3QSZ3SaZ3RWZ3TeV3QRV3SZV3RVV3Tdd3QTd3Sbd3RXd3TfT3QQx3oUEd6pMf0XnpvvY/eV++n99cH6AP1QfpgfYg+VB+mD9dH6CP1UfpofYw+Vh+nj9cn6BP1SfpkfYo+VZ+mT9dn6DP1WfpsfY4+V5+nz9cX6Av1RfpifYm+VF+mL9dX6Cv1VfpqfY2+Vl+nr9c36Bv1TfpmfYu+Vd+mb9d36Dv1XfpufY++V9+n79cP6Af1Q
/ph/Yh+VD+mH9dP6Cf1U/pp/Yx+Vj+nn9cv6Bf1S/pl/Yp+Vb+mX9dv6An6Tf2Wflu/o9/V7+n39Qf6Q/2R/lh/oj/Vn+nP9Rf6S/2V/lp/o7/V3+nv9Q/6R/2T/ln/on/Vv+nf9R/6T/2X/lv/o//V/+mJepyRxEhqJDOSGymMlEYqI7WRxkhrpDPSGxmMjEYmI7ORxchqZDOyGzmMnEYuI7eRx8hr5DPyGwWMgkYho7BRxChqFDOKGyWMkkYpo7RRxihrlDPKGxWMikYlo7JRxahqVDOqGzWMmkYto7ZRx6hr1DPqGw2MhkYjo7HRxGhqNDOaGy2Mlka80cpobbQx2hrtjPZGB6Oj0cnobHQxuhrdjO5GD6OngRm4QRikQRm0wRiswRm8IRiiIRmyoRiqoRm6YRimYRm24Riu4Rm+ERihAQxoICMyYkYvo7fRx+hr9DP6GwOMgcYgY7AxxBhqDDOGGyOMkcYoY7QxxhhrjDPGGxOMicYkY7IxxZhqTDOmGzOMmcYsY7Yxx5hrzDPmGwuMhcYiY7GxxFhqLDOWGyuMlcYqY7WxxlhrrDPWGxuMjcYmY7OxxdhqbDO2GzuMncYuY7exx9hr7DP2GweMg8Yh47BxxDhqHDOOGyeMk8Yp47RxxjhrnDPOGxeMi8Yl47JxxbhqXDOuGzeMBOOmccu4bdwx7hr3jPvGA+Oh8ch4bDwxnhrPjOfGC+Ol8cp4bbwx3hrvjPfGB+Oj8cn4bHwxvhrfjO/GD+On8cv4bfwx/hr/jEQjzkxiJjWTmcnNFGZKM5WZ2kxjpjXTmenNDGZGM5OZ2cxiZjWzmdnNHGZOM5eZ28xj5jXzmfnNAmZBs5BZ2CxiFjWLmcXNEmZJs5RZ2ixjljXLmeXNCmZFs5JZ2axiVjWrmdXNGmZNs5ZZ26xj1jXrmfXNBmZDs5HZ2GxiNjWbmc3NFmZLM95sZbY225htzXZme7OD2dHsZHY2u5hdzW5md7OH2dPETNwkTNKkTNpkTNbkTN4UTNGUTNlUTNXUTN00TNO0TNt0TNf0TN8MzNAEJjSRGZkxs5fZ2+xj9jX7mf3NAeZAc5A52BxiDjWHmcPNEeZIc5Q52hxjjjXHmePNCeZEc5I52ZxiTjWnmdPNGeZMc5Y525xjzjXnmfPNBeZCc5G52FxiLjWXmcvNFeZKc5W52lxjrjXXmevNDeZGc5O52dxibjW3mdvNHeZOc5e529xj7jX3mfvNA+ZB85B52DxiHjWPmcfNE+ZJ85R52jxjnjXPmefNC+ZF85J52bxiXjWvmdfNG2aCedO8Zd4275h3zXvmffOB+dB8ZD42n5hPzWfmc/OF+dJ8Zb4235hvzXfme/OD+dH8ZH42v5hfzW/md/OH+dP8Zf42/5h/zX9mohlnJbGSWsms5FYKK6WVykptpbHSWums9FYGK6OVycpsZbGyWtms7FYOK6eVy8pt5bHyWvms/FYBq6BVyCpsFbGKWsWs4lYJq6RVyiptlbHKWuWs8lYFq6JVyapsVbGqWtWs6lYNq6ZVy6pt1bHqWvWs+lYDq6HVyGpsNbGaWs2s5lYLq6UVb7WyWlttrLZWO6u91cHqaHWyOltdrK5WN6u71cPqaWEWbhEWaVEWbTEWa3EWbwmWaEmWbCmWammWbhmWaVmWbTmWa3mWbwVWaAELWsiKrJjVy+pt9bH6Wv2s/tYAa6A1yBpsDbGGWsOs4dYIa6Q1yhptjbHGWuOs8dYEa6I1yZpsTbGmWnFJ4uJmWDOtWdZsa44115pnzbcWWAutRdZia4m11FpmLbdWWCutVdZqa4211lpnrbc2WButTdZma4u11dpmbbd2WDutXdZua4+119pn7bcOWAetQ9Zh64h11DpmHbdOWCetU9Zp64x11jpnnbcuWBetS9Zl64p11bpmXbduWAnWTeuWddu6Y9217ln3rQfWQ+uR9dh6Yj21nlnPrRfWS+uV9dp6Y7213lnvrQ/WR+uT9dn6Yn21v
lnfrR/WT+uX9dv6Y/21/lmJVpydxE5qJ7OT2ynslHYqO7Wdxk5rp7PT2xnsjHYmO7Odxc5qZ7Oz2znsnHYuO7edx85r57Pz2wXsgnYhu7BdxC5qF7OL2yXsknYpu7Rdxi5rl7PL2xXsinYlu7Jdxa5qV7Or2zXsmnYtu7Zdx65r17Pr2w3shnYju7HdxG5qN7Ob2y3slna83cpubbex29rt7PZ2B7uj3cnubHexu9rd7O52D7unjdm4TdikTdm0zdiszdm8LdiiLdmyrdiqrdm6bdimbdm27diu7dm+HdihDWxoIzuyY3Yvu7fdx+5r97P72wPsgfYge7A9xB5qD7OH2yPskfYoe7Q9xh5rj7PH2xPsifYke7I9xZ5qT7On2zPsmfYse7Y9x55rz7Pn2wvshfYie7G9xF5qL7OX2yvslfYqe7W9xl5rr7PX2xvsjfYme7O9xd5qb7O32zvsnfYue7e9x95r77P32wfsg/Yh+7B9xD5qH7OP2yfsk/Yp+7R9xj5rn7PP2xfsi/Yl+7J9xb5qX7Ov2zfsBPumfcu+bd+x79r37Pv2A/uh/ch+bD+xn9rP7Of2C/ul/cp+bb+x39rv7Pf2B/uj/cn+bH+xv9rf7O/2D/un/cv+bf+x/9r/7EQ7zkniJHWSOcmdFE5KJ5WT2knjpHXSOemdDE5GJ5OT2cniZHWyOdmdHE5OJ5eT28nj5HXyOfmdAk5Bp5BT2CniFHWKOcWdEk5Jp5RT2injlHXKOeWdCk5Fp5JT2aniVHWqOdWdGk5Np5ZT26nj1HXqOfWdBk5Dp5HT2GniNHWaOc2dFk5LJ95p5bR22jhtnXZOe6eD09Hp5HR2ujhdnW5Od6eH09PBHNwhHNKhHNphHNbhHN4RHNGRHNlRHNXRHN0xHNOxHNtxHNfxHN8JnNABDnSQEzkxp5fT2+nj9HX6Of2dAc5AZ5Az2BniDHWGOcOdEc5IZ5Qz2hnjjHXGOeOdCc5EZ5Iz2ZniTHWmOdOdGc5MZ5Yz25njzHXmOfOdBc5CZ5Gz2FniLHWWOcudFc5KZ5Wz2lnjrHXWOeudDc5GZ5OT6GxxtjrbnO3ODmens8vZ7exx9jr7nP3OAeegc8g57BxxjjrHnOPOCeekc8o57ZxxzjrnnPPOBeeic8m57FxxrjrXnOvODSfBuenccm47d5y7zj3nvvPAeeg8ch47T5ynzjPnufPCeem8cl47b5y3zjvnvfPB+eh8cj47X5yvzjfnu/PD+en8cn47f5y/zj8n0Ylzk7hJ3WRucjeFm9JN5aZ207hp3XRuejeDm9HN5GZ2s7hZ3WxudjeHm9PN5eZ287h53XxufreAW9At5BZ2i7hF3WJucbeEW9It5ZZ2y7hl3XJuebeCW9Gt5FZ2q7hV3WpudbeGW9Ot5dZ267h13XpufbeB29Bt5DZ2m7hN3WZuc7eF29KNd1u5rdv972duB7ej28nt7HZxu7rd3O5uD7eni7m4S7ikS7m0y7isy7m8K7iiK7myq7iqq7m6a7ima7m267iu67m+G7ihC1zoIjdyY24vt7fbx+3r9nP7uwPcge4gd7A7xB3qDnOHuyPcke4od7Q7xh3rjnPHuxPcie4kd7I7xZ3qTnOnuzPcme4sd7Y7x53rznPnuwvche4id7G7xF3qLnOXuyvcle4qd7W7xl3rrnPXuxvcje4md7O7xd3qbnO3uzvcne4ud7e7x93r7nP3uwfcg+4h97B7xD3qHnOPuyfck+4p97R7xj3rnnPPuxfci+4l97J7xb3qXnOvuzfcBPeme8u97d5x77r33PvuA/eh+8h97D5xn7rP3OfuC/el+8p97b5x37rv3PfuB/ej+8n97H5xv7rf3O/uD/en+8v97f5x/7r/3EQ3zkviJfWSecm9FF5KL5WX2kvjpfXSeem9DF5GL5OX2cviZfWyedm9HF5OL5eX28vj5fXyefm9Al5Br5BX2CviFfWKecW9El5Jr5RX2ivjlfXKeeW9C
l5Fr5JX2aviVfWqedW9Gl5Nr5ZX26vj1fXqefW9Bl5Dr5HX2GviNfWaec29Fl5LL95r5bX22nhtvXZee6+D19Hr5HX2unhdvW5ed6+H19PDPNwjPNKjPNpjPNbjPN4TPNGTPNlTPNXTPN0zPNOzPNtzPNfzPN8LvNADHvSQF3kxr5fX2+vj9fX6ef29Ad5Ab5A32BviDfWGecO9Ed5Ib5Q32hvjjfXGeeO9Cd5Eb5I32ZviTfWmedO9Gd5Mb5Y325vjzfXmefO9Bd5Cb5G32FviLfWWecu9Fd5Kb5W32lvjrfXWeeu9Dd5Gb5O32dvibfW2edu9Hd5Ob5e329vj7fX2efu9A95B75B32DviHfWOece9E95J75R32jvjnfXOeee9C95F75J32bviXfWuede9G16Cd9O75d327nh3vXvefe+B99B75D32nnhPvWfec++F99J75b323nhvvXfee++D99H75H32vnhfvW/ed++H99P75f32/nh/vX9eohfnJ/GT+sn85H4KP6Wfyk/tp/HT+un89H4GP6Ofyc/sZ/Gz+tn87H4OP6efy8/t5/Hz+vn8/H4Bv6BfyC/sF/GL+sX84n4Jv6Rfyi/tl/HL+uX88n4Fv6Jfya/sV/Gr+tX86n4Nv6Zfy6/t1/Hr+vX8+n4Dv6HfyG/sN/Gb+s385n4Lv6Uf77fyW/tt/LZ+O7+938Hv6HfyO/td/K5+N7+738Pv6WM+7hM+6VM+7TM+63M+7wu+6Eu+7Cu+6mu+7hu+6Vu+7Tu+63u+7wd+6AMf+siP/Jjfy+/t9/H7+v38/v4Af6A/yB/sD/GH+sP84f4If6Q/yh/tj/HH+uP88f4Ef6I/yZ/sT/Gn+tP86f4Mf6Y/y5/tz/Hn+vP8+f4Cf6G/yF/sL/GX+sv85f4Kf6W/yl/tr/HX+uv89f4Gf6O/yd/sb/G3+tv87f4Of6e/y9/t7/H3+vv8/f4B/6B/yD/sH/GP+sf84/4J/6R/yj/tn/HP+uf88/4F/6J/yb/sX/Gv+tf86/4NP8G/6d/yb/t3/Lv+Pf++/8B/6D/yH/tP/Kf+M/+5/8J/6b/yX/tv/Lf+O/+9/8H/6H/yP/tf/K/+N/+7/8P/6f/yf/t//L/+Pz/RjwuSBEmDZEHyIEWQMkgVpA7SBGmDdEH6IEOQMcgUZA6yBFmDbEH2IEeQM8gV5A7yBHmDfEH+oEBQMCgUFA6KBEWDYkHxoERQMigVlA7KBGWDckH5oEJQMagUVA6qBFWDakH1oEZQM6gV1A7qBHWDekH9oEHQMGgUNA6aBE2DZkHzoEXQMogPWgWtgzZB26Bd0D7oEHQMOgWdgy5B16Bb0D3oEfQMsAAPiIAMqIAOmIANuIAPhEAMpEAOlEANtEAPjMAMrMAOnMANvMAPgiAMQAADFERBLOgV9A76BH2DfkH/YEAwMBgUDA6GBEODYcHwYEQwMhgVjA7GBGODccH4YEIwMZgUTA6mBFODacH0YEYwM5gVzA7mBHODecH8YEGwMFgULA6WBEuDZcHyYEWwMlgVrA7WBGuDdcH6YEOwMdgUbA62BFuDbcH2YEewM9gV7A72BHuDfcH+4EBwMDgUHA6OBEeDY8Hx4ERwMjgVnA7OBGeDc8H54EJwMbgUXA6uBFeDa8H14EaQENwMbgW3gzvB3eBecD94EDwMHgWPgyfB0+BZ8Dx4EbwMXgWvgzfB2+Bd8D74EHwMPgWfgy/B1+Bb8D34EfwMfgW/gz/B3+BfkBjEhUnCpGGyMHmYIkwZpgpTh2nCtGG6MH2YIcwYZgozh1nCrGG2MHuYI8wZ5gpzh3nCvGG+MH9YICwYFgoLh0XComGxsHhYIiwZlgpLh2XCsmG5sHxYIawYVgorh1XCqmG1sHpYI6wZ1gprh3XCumG9sH7YIGwYNgobh03CpmGzsHnYImwZxoetwtZhm7Bt2C5sH3YIO4adws5hl7Br2C3sHvYIe4ZYiIdESIZUSIdMyIZcyIdCKIZSK
IdKqIZaqIdGaIZWaIdO6IZe6IdBGIYghCEKozAW9gp7h33CvmG/sH84IBwYDgoHh0PCoeGwcHg4IhwZjgpHh2PCseG4cHw4IZwYTgonh1PCqeG0cHo4I5wZzgpnh3PCueG8cH64IFwYLgoXh0vCpeGycHm4IlwZrgpXh2vCteG6cH24IdwYbgo3h1vCreG2cHu4I9wZ7gp3h3vCveG+cH94IDwYHgoPh0fCo+Gx8Hh4IjwZngpPh2fCs+G58Hx4IbwYXgovh1fCq+G18Hp4I0wIb4a3wtvhnfBueC+8Hz4IH4aPwsfhk/Bp+Cx8Hr4IX4avwtfhm/Bt+C58H34IP4afws/hl/Br+C38Hv4If4a/wt/hn/Bv+C9MDONAEpAUJAPJQQqQEqQCqUEakBakA+lBBpARZAKZQRaQFWQD2UEOkBPkArlBHpAX5AP5QQFQEBQChUERUBQUA8VBCVASlAKlQRlQFpQD5UEFUBFUApVBFVAVVAPVQQ1QE9QCtUEdUBfUA/VBA9AQNAKNQRPQFDQDzUEL0BLEg1agNWgD2oJ2oD3oADqCTqAz6AK6gm6gO+gBegIM4IAAJKAADRjAAg7wQAAikIAMFKACDejAACawgA0c4AIP+CAAIQAAAgQiEAO9QG/QB/QF/UB/MAAMBIPAYDAEDAXDwHAwAowEo8BoMAaMBePAeDABTASTwGQwBUwF08B0MAPMBLPAbDAHzAXzwHywACwEi8BisAQsBcvAcrACrASrwGqwBqwF68B6sAFsBJvAZrAFbAXbwHawA+wEu8BusAfsBfvAfnAAHASHwGFwBBwFx8BxcAKcBKfAaXAGnAXnwHlwAVwEl8BlcAVcBdfAdXADJICb4Ba4De6Au+AeuA8egIfgEXgMnoCn4Bl4DlKAl+AVeA3egLfgHXgPPoCP4BP4DL6Ar+Ab+A5+gJ/gF/gN/oC/4B9IBHEwCUwKk8HkMAVMCVPB1DANTAvTwfQwA8wIM8HMMAvMCrPB7DAHzAlzwdwwD8wL88H8sAAsCAvBwrAILAqLweKwBCwJS8HSsAwsC8vB8rACrAgrwcqwCqwKq8HqsAasCWvB2rAOrAvrwfqwAWwIG8HGsAlsCpvB5rAFbAnjYSvYGraBbWE72B52gB1hJ9gZdoFdYTfYHfaAPSEGcUhAElKQhgxkIQd5KEARSlCGClShBnVoQBNa0IYOdKEHfRjAEAIIIYIRjMFesDfsA/vCfrA/HAAHwkFwMBwCh8JhcDgcAUfCUXA0HAPHwnFwPJwAJ8JJcDKcAqfCaXA6nAFnwllwNpwD58J5cD5cABfCRXAxXAKXwmVwOVwBV8JVcDVcA9fCdXA93AA3wk1wM9wCt8JtcDvcAXfCXXA33AP3wn1wPzwAD8JD8DA8Ao/CY/A4PAFPwlPwNDwDz8Jz8Dy8AC/CS/AyvAKvwmvwOrwBE+BNeAvehnfgXXgP3ocP4EP4CD6GT+BT+Aw+hy/gS/gKvoZv4Fv4Dr6HH+BH+Al+hl/gV/gNfoc/4E/4C/6Gf+Bf+A8mwjiUBCVFyVBylAKlRKlQapQGpUXpUHqUAWVEmVBmlAVlRdlQdpQD5US5UG6UB+VF+VB+VAAVRIVQYVQEFUXFUHFUApVEpVBpVAaVReVQeVQBVUSVUGVUBVVF1VB1VAPVRLVQbVQH1UX1UH3UADVEjVBj1AQ1Rc1Qc9QCtUTxqBVqjdqgtqgdao86oI6oE+qMuqCuqBvqjnqgnghDOCIQiShEIwaxiEM8EpCIJCQjBalIQzoykIksZCMHuchDPgpQiACCCKEIxVAv1Bv1QX1RP9QfDUAD0SA0GA1BQ9EwNByNQCPRKDQajUFj0Tg0Hk1AE9EkNBlNQVPRNDQdzUAz0Sw0G81Bc9E8NB8tQAvRIrQYLUFL0TK0HK1AK9EqtBqtQWvROrQebUAb0Sa0GW1BW9E2tB3tQDvRLrQb7UF70T60Hx1AB9EhdBgdQUfRMXQcnUAn0Sl0Gp1BZ9E5d
B5dQBfRJXQZXUFX0TV0Hd1ACegmuoVuozvoLrqH7qMH6CF6hB6jJ+gpeoaeoxfoJXqFXqM36C16h96jD+gj+oQ+oy/oK/qGvqMf6Cf6hX6jP+gv+ocSUVyUJEoaJYuSRymilFGqKHWUJkobpYvSRxmijFGmKHOUJcoaZYuyRzminFGuKHeUJ8ob5YvyRwWiglGhVHFRkahoVCwqHpWISkalotJRmahsVC4qH1WIKkaVospRlahqVC2qHtWIaka1otpRnahuVC+qHzWIGkaNosZRk6hp1CxqHrWIWkbxUauoddQmahu1i9pHHaKOUaeoc9Ql6hp1i7pHPaKeERbhERGRERXREROxERfxkRCJkRTJkRKpkRbpkRGZkRXZkRO5kRf5URCFEYhghKIoikW9ot5Rn6hv1C/qHw2IBkaDosHRkGhoNCwaHo2IRkajotHRmGhsNC4aH02IJkaTosnRlGhqNC2aHs2IZkazotnRnGhuNC+aHy2IFkaLosXRkmhptCxaHq2IVkarotXRmmhttC5aH22INkabos3RlmhrtC3aHu2Idka7ot3RnmhvtC/aHx2IDkaHosPRkehodCw6Hp2ITkanotPRmehsdC46H12ILkaXosvRlehqdC26Ht2IEqKb0a3odnQnuhvdi+5HD6KH0aPocfQkeho9i55HL6KX0avodfQmehu9i95HH6KP0afoc/Ql+hp9i75HP6Kf0a/od/Qn+hv9ixKjuFiSWNJYsljyWIpYyliqWOpYmljaWLpY+liGWMZYpljmWJZY1li2WPZYjljOWK5Y7lieWN5Yvlj+WIFYwVihWOFYkVjRWLFY8ViJWMlYqVjpWJlY2Vi5WPlYhVjFWKVY5ViVWNVYtVj1WI1YzVitWO1YnVjdWL1Y/ViDWMNYo1jjWJNY01izWPNYi1jLWHys1X8s2+NiHs0CAOA0qW3btm2ltm2761nbu7Pz1rZt27Zt46tz/pwLebA2WFssEWuHtcc6YB2xTlhnrAvWFeuGdcd6YD2xXlhvrA/WF+uH9ccGYAOxQdhgbAg2FBuGDcdGYCOxUdhobAw2FhuHjccmYBOxSdhkbAo2FZuGTcdmYDMxDMMxAiMxCqMxBgMYi3EYjwmYiEmYjCmYimmYjhmYiVmYjTmYi3mYjwVYiEEswhAWw2Zhs7E52FxsHjYfW4AtxBZhi7El2FJsGbYcW4GtxFZhq7E12FpsHbYe24BtxDZhm7Et2FZsG7Yd24HtxHZhu7E92F5sH7YfO4AdxA5hh7Ej2FHsGHYcO4GdxE5hp7Ez2FnsHHYeu4BdxC5hl7Er2FXsGnYdu4HdxG5ht7E72F3sHnYfe4A9xB5hj7En2FPsGfYce4G9xF5hr7E32FvsHfYe+4B9xD5hn7Ev2FfsG/Yd+w/7gf3EfmG/sT/YX+wfloTF4cnweDwBT46nwFPiqfDUeBo8LZ4OT49nwDPimfDMeBY8K54Nz47nwHPiufDceB48L54Pz48XwAvihfDCeBG8KF4ML46XwEvipfDSeBm8LF4OL49XwCvilfDKeBW8Kl4Nr47XwGvitfDaeB28Ll4Pr483wBvijfDGeBO8Kd4Mb463wFvirfDWeBu8LZ6It8Pb4x3wjngnvDPeBe+Kd8O74z3wnngvvDfeB++L98P74wPwgfggfDA+BB+KD8OH4yPwkfgofDQ+Bh+Lj8PH4xPwifgkfDI+BZ+KT8On4zPwmTiG4ziBkziF0ziDA5zFOZzHBVzEJVzGFVzFNVzHDdzELdzGHdzFPdzHAzzEIR7hCI/hs/DZ+Bx8Lj4Pn48vwBfii/DF+BJ8Kb4MX46vwFfiq/DV+Bp8Lb4OX49vwDfim/DN+BZ8K74N347vwHfiu/Dd+B58L74P348fwA/ih/DD+BH8KH4MP46fwE/ip/DT+Bn8LH4OP49fwC/il/DL+BX8Kn4Nv47fwG/it/Db+B38Ln4Pv48/wB/ij/DH+BP8Kf4Mf46/wF/ir/DX+Bv8L
f4Of49/wD/in/DP+Bf8K/4N/47/h//Af+K/8N/4H/wv/g9PwuOIZEQ8kUAkJ1IQKYlURGoiDZGWSEekJzIQGYlMRGYiC5GVyEZkJ3IQOYlcRG4iD5GXyEfkJwoQBYlCRGGiCFGUKEYUJ0oQJYlSRGmiDFGWKEeUJyoQFYlKRGWiClGVqEZUJ2oQNYlaRG2iDlGXqEfUJxoQDYlGRGOiCdGUaEY0J1oQLYlWRGuiDdGWSCTaEe2JDkRHohPRmehCdCW6Ed2JHkRPohfRm+hD9CX6Ef2JAcRAYhAxmBhCDCWGEcOJEcRIYhQxmhhDjCXGEeOJCcREYhIxmZhCTCWmEdOJGcRMAiNwgiBIgiJogiEAwRIcwRMCIRISIRMKoRIaoRMGYRIWYRMO4RIe4RMBERKQiAhExIhZxGxiDjGXmEfMJxYQC4lFxGJiCbGUWEYsJ1YQK4lVxGpiDbGWWEesJzYQG4lNxGZiC7GV2EZsJ3YQO4ldxG5iD7GX2EfsJw4QB4lDxGHiCHGUOEYcJ04QJ4lTxGniDHGWOEecJy4QF4lLxGXiCnGVuEZcJ24QN4lbxG3iDnGXuEfcJx4QD4lHxGPiCfGUeEY8J14QL4lXxGviDfGWeEe8Jz4QH4lPxGfiC/GV+EZ8J/4jfhA/iV/Eb+IP8Zf4RyQRcWQyMp5MIJOTKciUZCoyNZmGTEumI9OTGciMZCYyM5mFzEpmI7OTOcicZC4yN5mHzEvmI/OTBciCZCGyMFmELEoWI4uTJciSZCmyNFmGLEuWI8uTFciKZCWyMlmFrEpWI6uTNciaZC2yNlmHrEvWI+uTDciGZCOyMdmEbEo2I5uTLciWZCuyNdmGbEsmku3I9mQHsiPZiexMdiG7kt3I7mQPsifZi+xN9iH7kv3I/uQAciA5iBxMDiGHksPI4eQIciQ5ihxNjiHHkuPI8eQEciI5iZxMTiGnktPI6eQMciaJkThJkCRJkTTJkIBkSY7kSYEUSYmUSYVUSY3USYM0SYu0SYd0SY/0yYAMSUhGJCJj5CxyNjmHnEvOI+eTC8iF5CJyMbmEXEouI5eTK8iV5CpyNbmGXEuuI9eTG8iN5CZyM7mF3EpuI7eTO8id5C5yN7mH3EvuI/eTB8iD5CHyMHmEPEoeI4+TJ8iT5CnyNHmGPEueI8+TF8iL5CXyMnmFvEpeI6+TN8ib5C3yNnmHvEveI++TD8iH5CPyMfmEfEo+I5+TL8iX5CvyNfmGfEu+I9+TH8iP5CfyM/mF/Ep+I7+T/5E/yJ/kL/I3+Yf8S/4jk8g4KhkVTyVQyakUVEoqFZWaSkOlpdJR6akMVEYqE5WZykJlpbJR2akcVE4qF5WbykPlpfJR+akCVEGqEFWYKkIVpYpRxakSVEmqFFWaKkOVpcpR5akKVEWqElWZqkJVpapR1akaVE2qFlWbqkPVpepR9akGVEOqEdWYakI1pZpRzakWVEuqFdWaakO1pRKpdlR7qgPVkepEdaa6UF2pblR3qgfVk+pF9ab6UH2pflR/agA1kBpEDaaGUEOpYdRwagQ1khpFjabGUGOpcdR4agI1kZpETaamUFOpadR0agY1k8IonCIokqIommIoQLEUR/GUQImURMmUQqmURumUQZmURdmUQ7mUR/lUQIUUpCIKUTFqFjWbmkPNpeZR86kF1EJqEbWYWkItpZZRy6kV1EpqFbWaWkOtpdZR66kN1EZqE7WZ2kJtpbZR26kd1E5qF7Wb2kPtpfZR+6kD1EHqEHWYOkIdpY5Rx6kT1EnqFHWaOkOdpc5R56kL1EXqEnWZukJdpa5R16kb1E3qFnWbukPdpe5R96kH1EPqEfWYekI9pZ5Rz6kX1EvqFfWaekO9pd5R76kP1EfqE/WZ+kJ9pb5R3xvHxcVRP6lf1G/qD/WX+kclUXF0MjqeTqCT0ynolHQqOjWdhk5Lp6PT0xnojHQmOnMSoLPS2ejsdA46J52Lzk3nofPS+ej8dAG6IF2ILkwXo
YvSxejidAm6JF2KLk2XocvS5ejydAW6Il2JrkxXoavS1ejqdA26Jl2Lrk3XoevS9ej6dAO6Id2Ibkw3oZvSzejmdAu6Jd2Kbk23odvSiXQ7uj3dge5Id6I7013ornQ3ujvdg+5J96J7033ovnQ/uj89gB5ID6IH00PoofQwejg9gh5Jj6JH02PosfQ4ejw9gZ5IT6In01PoqfQ0ejo9g55JYzROEzRJUzRNMzSgWZqjeVqgRVqiZVqhVVqjddqgTdqibdqhXdqjfTqgQxrSEY3oGD2Lnk3PoefS8+j59AJ6Ib2IXkwvoZfSy+jl9Ap6Jb2KXk2vodfS6+j19AZ6I72J3kxvobfS2+jt9A56J72L3k3voffS++j99AH6IH2IPkwfoY/Sx+jj9An6JH2KPk2foc/S5+jz9AX6In2Jvkxfoa/S1+jr9A36Jn2Lvk3foe/S9+j79AP6If2Ifkw/oZ/Sz+jn9Av6Jf2Kfk2/od/S7+j39Af6I/2J/kx/ob/S3+jv9H/0D/on/Yv+Tf+h/9L/6CQ6jknGxDMJTHImBZOSScWkZtIwaZl0THomA5ORycRkZrIwWZlsTHYmB5OTycXkZvIweZl8TH6mAFOQKcQUZoowRZliTHGmBFOSKcWUZsowZZlyTHmmAlORqcRUZqowVZlqTHWmBlOTqcXUZuowdZl6TH2mAdOQacQ0ZpowTZlmTHOmBdOSacW0ZtowbZlEph3TnunAdGQ6MZ2ZLkxXphvTnenB9GR6Mb2ZPkxfph/TnxnADGQGMYOZIcxQZhgznBnBjGRGMaOZMcxYZhwznpnATGQmMZOZKcxUZhoznZnBzGQwBmcIhmQohmYYBjAswzE8IzAiIzEyozAqozE6YzAmYzE24zAu4zE+EzAhA5mIQUyMmcXMZuYwc5l5zHxmAbOQWcQsZpYwS5llzHJmBbOSWcWsZtYwa5l1zHpmA7OR2cRsZrYwW5ltzHZmB7OT2cXsZvYwe5l9zH7mAHOQOcQcZo4wR5ljzHHmBHOSOcWcZs4wZ5lzzHnmAnORucRcZq4wV5lrzHXmBnOTucXcZu4wd5l7zH3mAfOQecQ8Zp4wT5lnzHPmBfOSecW8Zt4wb5l3zHvmA/OR+cR8Zr4wX5lvzHfmP+YH85P5xfxm/jB/mX9MEhMHkoF4kACSgxQgJUgFUoM0IC1IB9KDDCAjyAQygywgK8gGsoMcICfIBXKDPCAvyAfygwKgICgECoMioCgoBoqDEqAkKAVKgzKgLCgHyoMKoCKoBCqDKqAqqAaqgxqgJqgFaoM6oC6oB+qDBqAhaAQagyagKWgGmoMWoCVoBVqDNqAtSATtQHvQAXQEnUBn0AV0Bd1Ad9AD9AS9QG/QB/QF/UB/MAAMBIPAYDAEDAXDwHAwAowEo8BoMAaMBePAeDABTASTwGQwBUwF08B0MAPMBBjAAQFIQAEaMAAAFnCABwIQgQRkoAAVaEAHBjCBBWzgABd4wAcBCAEEEUAgBmaB2WAOmAvmgflgAVgIFoHFYAlYCpaB5WAFWAlWgdVgDVgL1oH1YAPYCDaBzWAL2Aq2ge1gB9gJdoHdYA/YC/aB/eAAOAgOgcPgCDgKjoHj4AQ4CU6B0+AMOAvOgfPgArgILoHL4Aq4Cq6B6+AGuAlugdvgDrgL7oH74AF4CB6Bx+AJeAqegefgBXgJXoHX4A14C96B9+AD+Ag+gc/gC/gKvoHv4D/wA/wEv8Bv8Af8Bf9AEohjk7HxbAKbnE3BpmRTsanZNGxaNh2bns3AZmQzsZnZLGxWNhubnc3B5mRzsbnZPGxeNh+bny3AFmQLsYXZImxRthhbnC3BlmRLsaXZMmxZthxbnq3AVmQrsZXZKmxVthpbna3B1mRrsbXZOmxdth5bn23ANmQbsY3ZJmxTthnbnG3BtmRbsa3ZNmxbNpFtx7ZnO7Ad2U7xndkubFe2G9ud7cH2ZHuxvdk+bF+2H9ufHcAOZAexg9kh7FB2GDucHcGOZ
Eexo9kx7Fh2HDuencBOZCexk9kp7FR2GjudncHOZDEWZwmWZCmWZhkWsCzLsTwrsCIrsTKrsCqrsTprsCZrsTbrsC6bBOLiAjZkIRuxiI2xs9jZ7Bx2LjuPnc8uYBeyi9jF7BJ2KbuMXc6uYFeyq9jV7Bp2LbuOXc9uYDeym9jN7BZ2K7uN3c7uYHeyu9jd7B52L7uP3c8eYA+yh9jD7BH2KHuMPc6eYE+yp9jT7Bn2LHuOPc9eYC+yl9jL7BX2KnuNvc7eYG+yt9jb7B32LnuPvc8+YB+yj9jH7BP2KfuMfc6+YF+yr9jX7Bv2LfuOfc9+YD+yn9jP7Bf2K/uN/c7+x/5gf7K/2N/sH/Yv+49NYuO4ZFw8l8Al51JwKblUXGouDZeWS8el5zJwGblMXGYuC5eVy8Zl53JwOblcXG4uD5eXy8fl5wpwBblCXGGuCFeUK8YV50pwJblSXGmuDFeWK8eV5ypwFblKXGWuCleVq8ZV52pwNblaXG2uDleXq8fV5xpwDblGXGOuCdeUa8Y151pwLblWXGuuDdeWS+Tace25DlxHrhPXmevCdeW6cd25HlxPrhfXm+vD9eX6cf25AdxAbhA3mBvCDeWGccO5EdxIbhQ3mhvDjeXGceO5CdxEbhI3mZvCTeWmcdO5GdxMDuNwjuBIjuJojuEAx3Icx3MCJ3ISJ3MKp3Iap3MGZ3IWZ3MO53Ie53MBF3KQizjExbhZ3GxuDjeXm8fN5xZwC7lF3GJuCbeUW8Yt51ZwK7lV3GpuDbeWW8et5zZwG7lN3GZuC7eV28Zt53ZwO7ld3G5uD7eX28ft5w5wB7lD3GHuCHeUO8Yd505wJ7lT3GnuDHeWO8ed5y5wF7lL3GXuCneVu8Zd525wN7lb3G3uDneXu8fd5x5wD7lH3GPuCfeUe8Y9515wL7lX3GvuDfeWe8e95z5wH7lP3GfuC/eV+8Z95/7jfnA/uV/cb+4P95f7xyVxcXwyPp5P4JPzKfiUfCo+NZ+GT8un49PzGfiMfCY+M5+Fz8pn47PzOficfC4+N5+Hz8vn4/PzBfiCfCG+MF+EL8oX44vzJfiSfCm+NF+GL8uX48vzFfiKfCW+Ml+Fr8pX46vzNfiafC2+Nl+Hr8vX4+vzDfiGfCO+Md+Eb8o345vzLfiWfCu+Nd+Gb8sn8u349nwHviPfie/Md+G78t347nwPviffi+/N9+H78v34/vwAfiA/iB/MD+GH8sP44fwIfiQ/ih/Nj+HH8uP48fwEfiI/iZ/MT+Gn8tP46fwMfiaP8ThP8CRP8TTP8IBneY7neYEXeYmXeYVXeY3XeYM3eYu3eYd3eY/3+YAPechHPOJj/Cx+Nj+Hn8vP4+fzC/iF/CJ+Mb+EX8ov45fzK/iV/Cp+Nb+GX8uv49fzG/iN/CZ+M7+F38pv47fzO/id/C5+N7+H38vv4/fzB/iD/CH+MH+EP8of44/zJ/iT/Cn+NH+GP8uf48/zF/iL/CX+Mn+Fv8pf46/zN/ib/C3+Nn+Hv8vf4+/zD/iH/CP+Mf+Ef8o/45/zL/iX/Cv+Nf+Gf8u/49/zH/iP/Cf+M/+F/8p/47/z//E/+J/8L/43/4f/y//jk/g4IZkQLyQIyYUUQkohlZBaSCOkFdIJ6YUMQkYhk5BZyCJkFbIJ2YUcQk4hl5BbyCPkFfIJ+YUCQkGhkFBYKCIUFYoJxYUSQkmhlFBaKCOUFcoJ5YUKQkWhklBZqCJUFaoJ1YUaQk2hllBbqCPUFeoJ9YUGQkOhkdBYaCI0FZoJzYUWQkuhldBaaCO0FRKFdkJ7oYPQUegkdBa6CF2FbkJ3oYfQU+gl9Bb6CH2FfkJ/YYAwUBgkDBaGCEOFYcJwYYQwUhgljBbGCGOFccJ4YYIwUZgkTBamCFOFacJ0YYYwU8AEXCAEUqAEWmAEILACJ/CCIIiCJMiCIqiCJuiCIZiCJdiCI7iCJ/hCIIQCFCIBCTFhljBbmCPMFeYJ84UFwkJhkbBYW
CIsFZYJy4UVwkphlbBaWCOsFdYJ64UNwkZhk7BZ2CJsFbYJ24Udwk5hl7Bb2CPsFfYJ+4UDwkHhkHBYOCIcFY4Jx4UTwknhlHBaOCOcFc4J54ULwkXhknBZuCJcFa4J14Ubwk3hlnBbuCPcFe4J94UHwkPhkfBYeCI8FZ4Jz4UXwkvhlfBaeCO8Fd4J74UPwkfhk/BZ+CJ8Fb4J34X/hB/CT+GX8Fv4I/wV/glJQpyYTIwXE8TkYgoxpZhKTC2mEdOK6cT0YgYxo5hJzCxmEbOK2cTsYg4xp5hLzC3mEfOK+cT8YgGxoFhILCwWEYuKxcTiYgmxpFhKLC2WEcuK5cTyYgWxolhJrCxWEauK1cTqYg2xplhLrC3WEeuK9cT6YgOxodhIbCw2EZuKzcTmYguxpdhKbC22EduKiWI7sb3YQewodhI7i13ErmI3sbvYQ+wp9hJ7i33EvmI/sb84QBwoDhIHi0PEoeIwcbg4QhwpjhJHi2PEseI4cbw4QZwoThIni1PEqeI0cbo4Q5wpYiIuEiIpUiItMiIQWZETeVEQRVESZVERVVETddEQTdESbdERXdETfTEQQxGKkYjEmDhLnC3OEeeK88T54gJxobhIXCwuEZeKy8Tl4gpxpbhKXC2uEdeK68T14gZxo7hJ3CxuEbeK28Tt4g5xp7hL3C3uEfeK+8T94gHxoHhIPCweEY+Kx8Tj4gnxpHhKPC2eEc+K58Tz4gXxonhJvCxeEa+K18Tr4g3xpnhLvC3eEe+K98T74gPxofhIfCw+EZ+Kz8Tn4gvxpfhKfC2+Ed+K78T34gfxo/hJ/Cx+Eb+K38Tv4n/iD/Gn+Ev8Lf4R/4r/xCQxTkomxUsJUnIphZRSSiWlltJIaaV0Unopg5RRyiRllrJIWaVsUnYph5RTyiXllvJIeaV8Un6pgFRQKiQVlopIRaViUnGphFRSKiWVlspIZaVyUnmpglRRqiRVlqpIVaVqUnWphlRTqiXVlupIdaV6Un2pgdRQaiQ1lppITaVmUnOphdRSaiW1ltpIbaVEqZ3UXuogdZQ6SZ2lLlJXqZvUXeoh9ZR6Sb2lPlJfqZ/UXxogDZQGSYOlIdJQaZg0XBohjZRGSaOlMdJYaZw0XpogTZQmSZOlKdJUaZo0XZohzZQwCZcIiZQoiZYYCUisxEm8JEiiJEmypEiqpEm6ZEimZEm25Eiu5Em+FEihBKVIQlJMmiXNluZIc6V50nxpgbRQWiQtlpZIS6Vl0nJphbRSWiWtltZIa6V10nppg7RR2iRtlrZIW6Vt0nZph7RT2iXtlvZIe6V90n7pgHRQOiQdlo5IR6Vj0nHphHRSOiWdls5IZ6Vz0nnpgnRRuiRdlq5IV6Vr0nXphnRTuiXdlu5Id6V70n3pgfRQeiQ9lp5IT6Vn0nPphfRSeiW9lt5Ib6V30nvpg/RR+iR9lr5IX6Vv0nfpP+mH9FP6Jf2W/kh/pX9SkhQnJ5Pj5QQ5uZxCTimnklPLaeS0cjo5vZxBzihnkjPLWeSscjY5u5xDzinnknPLeeS8cj45v1xALigXkgvLReSicjG5uFxCLimXkkvLZeSycjm5vFxBrihXkivLVeSqcjW5ulxDrinXkmvLdeS6cj25vtxAbig3khvLTeSmcjO5udxCbim3klvLbeS2cqLcTm4vd5A7yp3kznIXuavcTe4u95B7yr3k3nIfua/cT+4vD5AHyoPkwfIQeag8TB4uj5BHyqPk0fIYeaw8Th4vT5AnypPkyfIUeao8TZ4uz5BnypiMy4RMypRMy4wMZFbmZF4WZFGWZFlWZFXWZF02ZFO2ZFt2ZFf2ZF8O5FCGciQjOSbPkmfLc+S58jx5vrxAXigvkhfLS+Sl8jJ5ubxCXimvklfLa+S18jp5vbxB3ihvkjfLW+St8jZ5u7xD3invknfLe+S98j55v3xAPigfkg/LR+Sj8jH5uHxCPimfkk/LZ+Sz8jn5vHxBvihfki/LV+Sr8jX5unxDv
infkm/Ld+S78j35vvxAfig/kh/LT+Sn8jP5ufxCfim/kl/Lb+S38jv5vfxB/ih/kj/LX+Sv8jf5u/yf/EP+Kf+Sf8t/5L/yPzlJjlOSKfFKgpJcSaGkVFIpqZU0SlolnZJeyaBkVDIpmZUsSlYlm5JdyaHkVHIpuZU8Sl4ln5JfKaAUVAophZUiSlGlmFJcKaGUVEoppZUySlmlnFJeqaBUVCoplZUqSlWlmlJdqaHUVGoptZU6Sl2lnlJfaaA0VBopjZUmSlOlmdJcaaG0VFoprZU2SlslUWmntFc6KB2VTkpnpYvSVemmdFd6KD2VXkpvpY/SV+mn9FcGKAOVQcpgZYgyVBmmDFdGKCOVUcpoZYwyVhmnjFcmKBOVScpkZYoyVZmmTFdmKDMVTMEVQiEVSqEVRgEKq3AKrwiKqEiKrCiKqmiKrhiKqViKrTiKq3iKrwRKqEAlUpASU2Yps5U5ylxlnjJfWaAsVBYpi5UlylJlmbJcWaGsVFYpq5U1ylplnbJe2aBsVDYpm5UtylZlm7Jd2aHsVHYpu5U9yl5ln7JfOaAcVA4ph5UjylHlmHJcOaGcVE4pp5UzylnlnHJeuaBcVC4pl5UrylXlmnJduaHcVG4pt5U7yl3lnnJfeaA8VB4pj5UnylPlmfJceaG8VF4pr5U3ylvlnfJe+aB8VD4pn5Uvylflm/Jd+U/5ofxUfim/lT/KX+WfkqTEqcnUeDVBTa6mUFOqqdTUaho1rZpOTa9mUDOqmdTMahY1q5pNza7mUHOqudTcah41r5pPza8WUAuqhdTCahG1qFpMLa6WUEuqpdTSahm1rFpOLa9WUCuqldTKahW1qlpNra7WUGuqtdTaah21rlpPra82UBuqjdTGahO1qdpMba62UFuqrdTWahu1rZqotlPbqx3UjmontbPaRe2qdlO7qz3UnmovtbfaR+2r9lP7qwPUgeogdbA6RB2qDlOHqyPUkeoodbQ6Rh2rjlPHqxPUieokdbI6RZ2qTlOnqzPUmSqm4iqhkiql0iqjApVVOZVXBVVUJVVWFVVVNVVXDdVULdVWHdVVPdVXAzVUoRqpSI2ps9TZ6hx1rjpPna8uUBeqi9TF6hJ1qbpMXa6uUFeqq9TV6hp1rbpOXa9uUDeqm9TN6hZ1q7pN3a7uUHequ9Td6h51r7pP3a8eUA+qh9TD6hH1qHpMPa6eUE+qp9TT6hn1rHpOPa9eUC+ql9TL6hX1qnpNva7eUG+qt9Tb6h31rnpPva8+UB+qj9TH6hP1qfpMfa6+UF+qr9TX6hv1rfpOfa9+UD+qn9TP6hf1q/pN/a7+p/5Qf6q/1N/qH/Wv+k9NUuO0ZFq8lqAl11JoKbVUWmotjZZWS6el1zJoGbVMWmYti5ZVy6Zl13JoObVcWm4tj5ZXy6fl1wpoBbVCWmGtiFZUK6YV10poJbVSWmmtjFZWK6eV1ypoFbVKWmWtilZVq6ZV12poNbVaWm2tjlZXq6fV1xpoDbVGWmOtidZUa6Y111poLbVWWmutjdZWS9Taae21DlpHrZPWWeuiddW6ad21HlpPrZfWW+uj9dX6af21AdpAbZA2WBuiDdWGacO1EdpIbZQ2WhujjdXGaeO1CdpEbZI2WZuiTdWmadO1GdpMDdNwjdBIjdJojdGAxmqcxmuCJmqSJmuKpmqapmuGZmqWZmuO5mqe5muBFmpQizSkxbRZ2mxtjjZXm6fN1xZoC7VF2mJtibZUW6Yt11ZoK7VV2mptjbZWW6et1zZoG7VN2mZti7ZV26Zt13ZoO7Vd2m5tj7ZX26ft1w5oB7VD2mHtiHZUO6Yd105oJ7VT2mntjHZWO6ed1y5oF7VL2mXtinZVu6Zd125oN7Vb2m3tjnZXu6fd1x5oD7VH2mPtifZUe6Y9115oL7VX2mvtjfZWe6e91z5oH7VP2mfti/ZV+6Z91/7Tfmg/tV/ab+2P9lf7pyVpcXoyPV5P0JPrKfSUeio9tZ5GT6un09PrG
fSMeiY9s55Fz6pn07PrOfScei49t55Hz6vn0/PrBfSCeiG9sF5EL6oX04vrJfSSeim9tF5GL6uX08vrFfSKeiW9sl5Fr6pX06vrNfSaei29tl5Hr6vX0+vrDfSGeiO9sd5Eb6o305vrLfSWeiu9td5Gb6sn6u309noHvaPeSe+sd9G76t307noPvafeS++t99H76v30/voAfaA+SB+sD9GH6sP04foIfaQ+Sh+tj9HH6uP08fqEhP9LMn2qPk2frs/QZ+qYjuuETuqUTuuMDnRW53ReF3RRl3RZV3RV13RdN3RTt3Rbd3RX93RfD/RQh3qkIz2mz9Jn63P0ufo8fb6+QF+oL9IX60v0pfoyfbm+Ql+pr9JX62v0tfo6fb2+Qd+ob9I361v0rfo2fbu+Q9+p79J363v0vfo+fb9+QD+oH9IP60f0o/ox/bh+Qj+pn9JP62f0s/o5/bx+Qb+oX9Iv61f0q/o1/bp+Q7+p39Jv63f0u/o9/b7+QH+oP9If60/0p/oz/bn+Qn+pv9Jf62/0t/o7/b3+Qf+of9I/61/0r/o3/bv+n/5D/6n/0n/rf/S/+j89SY8zkhnxRoKR3EhhpDRSGamNNEZaI52R3shgZDQyGZmNLEZWI5uR3chh5DRyGbmNPEZeI5+R3yhgFDQKGYWNIkZRo5hR3ChhlDRKGaWNMkZZo5xR3qhgVDQqGZWNKkZVo5pR3ahh1DRqGbWNOkZdo55R32hgNDQaGY2NJkZTo5nR3GhhtDRaGa2NNkZbI9FoZ7Q3OhgdjU5GZ6OL0dXoZnQ3ehg9jV5Gb6OP0dfoZ/Q3BhgDjUHGYGOIMdQYZgw3RhgjjVHGaGOMMdYYZ4w3JhgTjUnGZGOKMdWYZkw3ZhgzDczADcIgDcqgDcYABmtwBm8IhmhIhmwohmpohm4YhmlYhm04hmt4hm8ERmhAIzKQETNmGbONOcZcY54x31hgLDQWGYuNJcZSY5mx3FhhrDRWGauNNcZaY52x3thgbDQ2GZuNLcZWY5ux3dhh7DR2GbuNPcZeY5+x3zhgHDQOGYeNI8ZR45hx3DhhnDROGaeNM8ZZ45xx3rhgXDQuGZeNK8ZV45px3bhh3DRuGbeNO8Zd455x33hgPDQeGY+NJ8ZT45nx3HhhvDReGa+NN8Zb453x3vhgfDQ+GZ+NL8ZX45vx3fjP+GH8NH4Zv40/xl/jn5FkxJnJzHgzwUxupjBTmqnM1GYaM62ZzkxvZjAzmpnMzGYWM6uZzcxu5jBzmrnM3GYeM6+Zz8xvFjALmoXMwmYRs6hZzCxuljBLmqXM0mYZs6xZzixvVjArmpXMymYVs6pZzaxu1jBrmrXM2mYds65Zz6xvNjAbmo3MxmYTs6nZzGxutjBbmq3M1mYbs62ZaLYz25sdzI5mJ7Oz2cXsanYzu5s9zJ5mL7O32cfsa/Yz+5sDzIHmIHOwOcQcag4zh5sjzJHmKHO0OcYca44zx5sTzInmJHOyOcWcak4zp5szzJkmZuImYZImZdImYwKTNTmTNwVTNCVTNhVTNTVTNw3TNC3TNh3TNT3TNwMzNKEZmciMmbPM2eYcc645z5xvLjAXmovMxeYSc6m5zFxurjBXmqvM1eYac625zlxvbjA3mpvMzeYWc6u5zdxu7jB3mrvM3eYec6+5z9xvHjAPmofMw+YR86h5zDxunjBPmqfM0+YZ86x5zjxvXjAvmpfMy+YV86p5zbxu3jBvmrfM2+Yd8655z7xvPjAfmo/Mx+YT86n5zHxuvjBfmq/M1+Yb8635znxvfjA/mp/Mz+YX86v5zfxu/mf+MH+av8zf5h/zr/nPTDLjrGRWvJVgJbdSWCmtVFZqK42V1kpnpbcyWBmtTFZmK4uV1cpmZbdyWDmtXFZuK4+V18pn5bcKWAWtQlZhq4hV1CpmFbdKWCWtUlZpq4xV1ipnlbcqWBWtSlZlq4pV1apmVbdqWDWtWlZtq45V16pn1bcaWA2tRlZjq4nV1GpmN
bdaWC2tVlZrq43V1kq02lntrQ5WR6uT1dnqYnW1ulndrR5WT6uX1dvqY/W1+ln9rQHWQGuQNdgaYg21hlnDrRHWSGuUNdoaY421xlnjrQnWRGuSNdmaYk21plnTrRnWTAuzcIuwSIuyaIuxgMVanMVbgiVakiVbiqVamqVbhmValmVbjuVanuVbgRVa0IosZMWsWdZsa44115pnzbcWWAutRdZia4m11FpmLbdWWCutVdZqa4211lpnrbc2WButTdZma4u11dpmbbd2WDutXdZua4+119pn7bcOWAetQ9Zh64h11DpmHbdOWCetU9Zp64x11jpnnbcuWBetS9Zl64p11bpmXbduWDetW9Zt645117pn3bceWA+tR9Zj64n11HpmPbdeWC+tV9Zr64311npnvbc+WB+tT9Zn64v11fpmfbf+s35YP61f1m/rj/XX+mclWXF2MjveTrCT2ynslHYqO7Wdxk5rp7PT2xnsjHYmO7Odxc5qZ7Oz2znsnHYuO7edx85r57Pz2wXsgnYhu7BdxC5qF7OL2yXsknYpu7Rdxi5rl7PL2xXsinYlu7Jdxa5qV7Or2zXsmnYtu7Zdx65r17Pr2w3shnYju7HdxG5qN7Ob2y3slnYru7Xdxm5rJ9rt7PZ2B7uj3cnubHexu9rd7O52D7un3cvubfex+9r97P72AHugPcgebA+xh9rD7OH2CHukPcoebY+xx9rj7PH2BHuiPcmebE+xp9rT7On2DHumjdm4TdikTdm0zdjAZm3O5m3BFm3Jlm3FVm3N1m3DNm3Ltm3Hdm3P9u3ADm1oRzayY/Yse7Y9x55rz7Pn2wvshfYie7G9xF5qL7OX2yvslfYqe7W9xl5rr7PX2xvsjfYme7O9xd5qb7O32zvsnfYue7e9x95r77P32wfsg/Yh+7B9xD5qH7OP2yfsk/Yp+7R9xj5rn7PP2xfsi/Yl+7J9xb5qX7Ov2zfsm/Yt+7Z9x75r37Pv2w/sh/Yj+7H9xH5qP7Of2y/sl/Yr+7X9xn5rv7Pf2x/sj/Yn+7P9xf5qf7O/2//ZP+yf9i/7t/3H/mv/s5PsOCeZE+8kOMmdFE5KJ5WT2knjpHXSOemdDE5GJ5OT2cniZHWyOdmdHE5OJ5eT28nj5HXyOfmdAk5Bp5BT2CniFHWKOcWdEk5Jp5RT2injlHXKOeWdCk5Fp5JT2aniVHWqOdWdGk5Np5ZT26nj1HXqOfWdBk5Dp5HT2GniNHWaOc2dFk5Lp5XT2mnjtHUSnXZOe6eD09Hp5HR2ujhdnW5Od6eH09Pp5fR2+jh9nX5Of2eAM9AZ5Ax2hjhDnWHOcGeEM9IZ5Yx2xjhjnXHOeGeCM9GZ5Ex2pjhTnWnOdGeGM9PBHNwhHNKhHNphHOCwDufwjuCIjuTIjuKojubojuGYjuXYjuO4juf4TuCEDnQiBzkxZ5Yz25njzHXmOfOdBc5CZ5Gz2FniLHWWOcudFc5KZ5Wz2lnjrHXWOeudDc5GZ5Oz2dnibHW2OdudHc5OZ5ez29nj7HX2OfudA85B55Bz2DniHHWOOcedE85J55Rz2jnjnHXOOeedC85F55Jz2bniXHWuOdedG85N55Zz27nj3HXuOfedB85D55Hz2HniPHWeOc+dF85L55Xz2nnjvHXeOe+dD85H55Pz2fnifHW+Od+d/5wfzk/nl/Pb+eP8df45SU6cm8yNdxPc5G4KN6Wbyk3tpnHTuunc9G4GN6Obyc3sZnGzutnc7G4ON6eby83t5nHzuvnc/G4Bt6BbyC3sFnGLusXc4m4Jt6Rbyi3tlnHLuuXc8m4Ft6Jbya3sVnGrutXc6m4Nt6Zby63t1nHruvXc+m4Dt6HbyG3sNnGbus3c5m4Lt6Xbym3ttnHbuoluO7e928Ht6HZyO7td3K5uN7e728Pt6fZye7t93L5uP7e/O8Ad6A5yB7tD3KHuMHe4O8Id6Y5yR7tj3LHuOHe8O8Gd6E5yJ7tT3KnuNHe6O8Od6WIu7
hIu6VIu7TIucFmXc3lXcEVXcmVXcVVXc3XXcE3Xcm3XcV3Xc303cEMXupGL3Jg7y53tznHnuvPc+e4Cd6G7yF3sLnGXusvc5e4Kd6W7yl3trnHXuuvc9e4Gd6O7yd3sbnG3utvc7e4Od6e7y93t7nH3uvvc/e4B96B7yD3sHnGPusfc4+4J96R7yj3tnnHPuufc8+4F96J7yb3sXnGvutfc6+4N96Z7y73t3nHvuvfc++4D96H7yH3sPnGfus/c5+4L96X7yn3tvnHfuu/c9+4H96P7yf3sfnG/ut/c7+5/7g/3p/vL/e3+cf+6/9wkN85L5sV7CV5yL4WX0kvlpfbSeGm9dF56L4OX0cvkZfayeFm9bF52L4eX08vl5fbyeHm9fF5+r4BX0CvkFfaKeEW9Yl5xr4RX0ivllfbKeGW9cl55r4JX0avkVfaqeFW9al51r4ZX06vl1fbqeHW9el59r4HX0GvkNfaaeE29Zl5zr4XX0mvltfbaeG29RK+d197r4HX0OnmdvS5eV6+b193r4fX0enm9vT5eX6+f198b4A30BnmDvSHeUG+YN9wb4Y30RnmjvTHeWG+cN96b4E30JnmTvSneVG+aN92b4c30MA/3CI/0KI/2GA94rMd5vCd4oid5sqd4qqd5umd4pmd5tud4rud5vhd4oQe9yENezJvlzfbmeHO9ed58b4G30FvkLfaWeEu9Zd5yb4W30lvlrfbWeGu9dd56b4O30dvkbfa2eFu9bd52b4e309vl7fb2eHu9fd5+74B30DvkHfaOeEe9Y95x74R30jvlnfbOeGe9c95574J30bvkXfaueFe9a95174Z307vl3fbueHe9e95974H30HvkPfaeeE+9Z95z74X30nvlvfbeeG+9d95774P30fvkffa+eF+9b9537z/vh/fT++X99v54f71/XpIX5yfz4/0EP7mfwk/pp/JT+2n8tH46P72fwc/oZ/Iz+1n8rH42P7ufw8/p5/Jz+3n8vH4+P79fwC/oF/IL+0X8on4xv7hfwi/pl/JL+2X8sn45v7xfwa/oV/Ir+1X8qn41v7pfw6/p1/Jr+3X8un49v77fwG/oN/Ib+038pn4zv7nfwm/pt/Jb+238tn6i385v73fwO/qd/M5+F7+r383v7vfwe/q9/N5+H7+v38/v7w/wB/qD/MH+EH+oP8wf7o/wR/qj/NH+GH+sP84f70/wJ/qT/Mn+FH+qP82f7s/wZ/qYj/uET/qUT/uMD3zW53zeF3zRl3zZV3zV13zdN3zTt3zbd3zX93zfD/zQh37kIz/mz/Jn+3P8uf48f76/wF/oL/IX+0v8pf4yf7m/wl/pr/JX+2v8tf46f72/wd/ob/I3+1v8rf42f7u/w9/p7/J3+3v8vf4+f79/wD/oH/IP+0f8o/4x/7h/wj/pn/JP+2f8s/45/7x/wb/oX/Iv+1f8q/41/7p/w7/p3/Jv+3f8u/49/77/wH/oP/If+0/8p/4z/7n/IiEuLs5/7b/x3/rv/Pf+B/+j/8n/7H/xv/rf/JRxcXE//J/+L/+3/8f/6//zk/y4IFkQHyQEyYMUQcogVZA6SBOkDdIF6YMMQcYgU5A5yBJkDbIF2YMcQc4gV5A7yBPkDfIF+YMCQcGgUFA4KBIUDYoFxYMSQcmgVFA6KBOUDcoF5YMKQcWgUlA5qBJUDaoF1YMaQc2gVlA7qBPUDeoF9YMGQcOgUdA4aBI0DZoFzYMWQcugVdA6aBO0DRKDdkH7oEPQMegUdA66BF2DbkH3oEfQM+gV9A76BH2DfkH/YEAwMBgUDA6GBEODYcHwYEQwMhgVjA7GBGODccH4YEIwMZgUTA6mBFODacH0YEYwM8ACPCACMqACOmACELABF/CBEIiBFMiBEqiBFuiBEZiBFdiBE7iBF/hBEIQBDKIABbFgVjA7mBPMDeYF84MFwcJgUbA4WBIsDZYFy4MVwcpgVbA6WBOsDdYF64MNwcZgU7A52BJsD
bYF24Mdwc5gV7A72BPsDfYF+4MDwcHgUHA4OBIcDY4Fx4MTwcngVHA6OBOcDc4F54MLwcXgUnA5uBJcDa4F14Mbwc3gVnA7uBPcDe4F94MHwcPgUfA4eBI8DZ4Fz4MXwcvgVfA6eBO8Dd4F74MPwcfgU/A5+BJ8Db4F34P/gh/BzyA+7nfwJ/gb/AuSgrgwWRgfJoTJwxRhyjBVmDpME6YN04XpwwxhxjBTmDnMEmYNs4XZwxxhzjBXmDvME+YN84X5wwJhwbBQWDgsEhYNi4XFwxJhybBUWDosE5YNy4XlwwphxbBSWDmsElYNq4XVwxphzbBWWDusE9YN64X1wwZhw7BR2DhsEjYNm4XNwxZhy7BV2DpsE7YNE8N2YfuwQ9gx7BR2DruEXcNuYfewR9gz7BX2DvuEfcN+Yf9wQDgwHBQODoeEQ8Nh4fBwRDgyHBWODseEY8Nx4fhwQjgxnBRODqeEU8Np4fRwRjgzxEI8JEIypEI6ZEIQsiEX8qEQiqEUyqESqqEW6qERmqEV2qETuqEX+mEQhiEMoxCFsXBWODucE84N54XzwwXhwnBRuDhcEi4Nl4XLwxXhynBVuDpcE64N14Xrww3hxnBTuDncEm4Nt4Xbwx3hznBXuDvcE+4N94X7wwPhwfBQeDg8Eh4Nj4XHwxPhyfBUeDo8E54Nz4XnwwvhxfBSeDm8El4Nr4XXwxvhzfBWeDu8E94N74X3wwfhw/BR+Dh8Ej4Nn4XPwxfhy/BV+Dp8E74N34Xvww/hx/BT+Dn8En4Nv4Xfw//CH+HP8Ff4O/wT/g3/hUlhHEwG42ECTA5TwJQwFUwN08C0MB1MDzPAjDATzAyzwKwwG8wOc8CcMBfMDfPAvDAfzA8LwIKwECwMi8CisBgsDkvAkrAULA3LwLKwHCwPK8CKsBKsDKvAqrAarA5rwJqwFqwN68C6sB6sDxvAhrARbAybwKawGWwOW8CWsBVsDdvAtjARtoPtYQfYEXaCnWEX2BV2g91hD9gT9oK9YR/YF/aD/eEAOBAOgoPhEDgUDoPD4Qg4Eo6Co+EYOBaOg+PhBDgRToKT4RQ4FU6D0+EMOBNiEIcEJCEFachAAFnIQR4KUIQSlKECVahBHRrQhBa0oQNd6EEfBjCEEEYQwRicBWfDOXAunAfnwwVwIVwEF8MlcClcBpfDFXAlXAVXwzVwLVwH18MNcCPcBDfDLXAr3Aa3wx1wJ9wFd8M9cC/cB/fDA/AgPAQPwyPwKDwGj8MT8CQ8BU/DM/AsPAfPwwvwIrwEL8Mr8Cq8Bq/DG/AmvAVvwzvwLrwH78MH8CF8BB/DJ/ApfAafwxfwJXwFX8M38C18B9/DD/Aj/AQ/wy/wK/wGv8P/4A/4E/6Cv+Ef+Bf+g0kwLkoWxUcJUfIoRZQyShWljtJEaaN0UfooQ5QxyhRljrJEWaNsUfYoR5QzyhXljvJEeaN8Uf6oQFQwKhQVjopERaNiUfGoRFQyKhWVjspEZaNyUfmoQlQxShZXOaoSVY2qRdWjGlHNqFZUO6oT1Y3qRfWjBlHDqFHUOGoSNY2aRc2jFlHLqFXUOmoTtY0So3ZR+6hD1DHqFHWOukRdo25R96hH1DPqFfWO+kR9o35R/2hANDAaFA2OhkRDo2HR8GhENDIaFY2OxkRjo3HR+GhCNDGaFE2OpkRTo2nR9GhGNDPCIjwiIjKiIjpiIhCxERfxkRCJkRTJkRKpkRbpkRGZkRXZkRO5kRf5URCFEYyiCEWxaFY0O5oTzY3mRfOjBdHCaFG0OFoSLY2WRcujFdHKaFW0OloTrY3WReujDdHGaFO0OdoSbY22RdujHdHOaFe0O9oT7Y32RfujA9HB6FB0ODoSHY2ORcejE9HJ6FR0OjoTnY3OReejC9HF6FJ0OboSXY2uRdejG9HN6FZ0O7oT3Y3uRfejB9HD6FH0OHoSPY2eRc+jF9HL6FX0OnoTvY3eRe+jD9HH6FP0OfoSfY2+Rd+j/6If0c/oV
/Q7+hP9jf5FSVEcSobiUQJKjlKglCgVSo3SoLQoHUqPMqCMKBPKjLKgrCgbyo5yoJwoF8qN8qC8KB/KjwqggqgQKoyKoKKoGCqOSqCSqBQqjcqgsqgcKo8qoIqoEqqMqqCqqBqqjmqgmqgWqo3qoLqoHqqPGqCGqBFqjJqgpqgZao5aoJaoFWqN2qC2KBG1Q+1RB9QRdUKdURfUFXVD3VEP1BP1Qr1RH9QX9UP90QA0EA1Cg9EQNBQNQ8PRCDQSjUKj0Rg0Fo1D49EENBFNQpPRFDQVTUPT0Qw0E2EIRwQiEYVoxCCAWMQhHglIRBKSkYJUpCEdGchEFrKRg1zkIR8FKEQQRQihGJqFZqM5aC6ah+ajBWghWoQWoyVoKVqGlqMVaCVahVajNWgtWofWow1oI9qENqMtaCvahrajHWgn2oV2oz1oL9qH9qMD6CA6hA6jI+goOoaOoxPoJDqFTqMz6Cw6h86jC+giuoQuoyvoKrqGrqMb6Ca6hW6jO+guuofuowfoIXqEHqMn6Cl6hp6jF+gleoVeozfoLXqH3qMP6CP6hD6jL+gr+oa+o//QD/QT/UK/0R/0F/1DSSguliwWH0uIJY+liKWMpYqljqWJpY2li6WPZYhljGWKZY5liWWNZYtlj+WI5YzliuWO5YnljeWL5Y8ViBWMFYoVjhWJFY0VixWPlYiVjJWKlY6ViZWNlYuVj1WIVYxVilWOVYlVjVWLVY/ViNWM1YrVjtWJ1Y3Vi9WPNYg1jDWKNY41iTWNNYs1j7WItYy1irWOtYm1jSX+j2V7bOzjaRQAmtq2bdvGv26apqlt27atte3d2dn51bZt27bui/t8j3OwDlhHrBPWGYvHumAJWFcsEeuGJWHdsR5YT6wX1hvrg/XF+mH9sQHYQGwQNhgbgg3FhmHDsRHYSGwUNhobg43FxmHjsQnYRGwSNhmbgk3FpmHTsRnYTGwWNhubg83F5mHzsQXYQmwRthhbgi3FlmHLsRXYSmwVthpbg63F1mHrsQ3YRmwTthnbgm3FtmHbMQzDMQIjMQqjMQZjMQ7jMQETMQmTMQVTMQ3TMQMzMQuzMQdzMQ/zsQADWIhBLMIQFsN2YDuxXdhubA+2F9uH7ccOYAexQ9hh7Ah2FDuGHcdOYCexU9hp7Ax2FjuHnccuYBexS9hl7Ap2FbuGXcduYDexW9ht7A52F7uH3cceYA+xR9hj7An2FHuGPcdeYC+xV9hr7A32FnuHvcc+YB+xT9hn7Av2FfuGfcd+YD+xX9hv7A/2F/uHxeHJ8OR4CjwlngpPjafB0+Lp8PR4BjwjngnPjGfBs+LZ8Ox4DjwnngvPjefB8+L58Px4AbwgXggvjBfBi+LF8OJ4CbwkXgovjZfBy+Ll8PJ4BbwiXgmvjFfBq+LV8Op4DbwmXguvjdfB6+L18Pp4A7wh3ghvjDfBm+LN8OZ4C7wl3gpvjf+Ht8Hb4u3w9ngHvCPeCe+Mx+Nd8AS8K56Id8OT8O54D7wn3gvvjffB++L98P74AHwgPggfjA/Bh+LD8OH4CHwkPgofjY/Bx+Lj8PH4BHwiPgmfjE/Bp+LT8On4DHwmPgufjc/B5+Lz8Pn4AnwhvghfjC/Bl+LL8OX4Cnwlvgpfja/B1+Lr8PX4BnwjvgnfjG/Bt+Lb8O04huM4gZM4hdM4g7M4h/O4gIu4hMu4gqu4huu4gZu4hdu4g7u4h/t4gAM8xCEe4QiP4TvwnfgufDe+B9+L78P34wfwg/gh/DB+BD+KH8OP4yfwk/gp/DR+Bj+Ln8PP4xfwi/gl/DJ+Bb+KX8Ov4zfwm/gt/DZ+B7+L38Pv4w/wh/gj/DH+BH+KP8Of4y/wl/gr/DX+Bn+Lv8Pf4x/wj/gn/DP+Bf+Kf8O/4z/wn/gv/Df+B/+L/8PjiGREciIFkZJIRaQm0hBpiXREeiIDkZHIRGQmshBZiWxEdiIHkZPIReQm8hB5iXxEfqIAUZAoRBQmihBFiWJEcaIEUZIoR
ZQmyhBliXJEeaICUZGoRFQmqhBViWpEdaIGUZOoRdQm6hB1iXpEfaIB0ZBoRDQmmhBNiWZEc6IF0ZJoRbQm/iPaEG2JdkR7ogPRkehEdCbiiS5EAtGVSCS6EUlEd6IH0ZPoRfQm+hB9iX5Ef2IAMZAYRAwmhhBDiWHEcGIEMZIYRYwmxhBjiXHEeGICMZGYREwmphBTiWnEdGIGMZOYRcwm5hBziXnEfGIBsZBYRCwmlhBLiWXEcmIFsZJYRawm1hBriXXEemIDsZHYRGwmthBbiW3EdgIjcOJfsv9nBQzBEhzBEwIhEhIhEwqhEhqhEwZhEhZhEw7hEh7hEwEBiJCAREQgIkbsIHYSu4jdxB5iL7GP2E8cIA4Sh4jDxBHiKHGMOE6cIE4Sp4jTxBniLHGOOE9cIC4Sl4jLxBXiKnGNuE7cIG4St4jbxB3iLnGPuE88IB4Sj4jHxBPiKfGMeE68IF4Sr4jXxBviLfGOeE98ID4Sn4jPxBfiK/GN+E78IH4Sv4jfxB/iL/GPiCOTkcnJFGRKMhWZmkxDpiXTkenJDGRGMhOZmcxCZiWzkdnJHGROMheZm8xD5iXzkfnJAmRBshBZmCxCFiWLkcXJEmRJshRZmixDliXLkeXJCmRFshJZmaxCViWrkdXJGmRNshZZm6xD1iXrkfXJBmRDshHZmGxCNiWbkc3JFmRLshXZOl1cXBzZlmxHtic7kB3JTmRnMp7sQiaQXclEshuZRHYne5A9yV5kb7IP2ZfsR/YnB5ADyUHkYHIIOZQcRg4nR5AjyVHkaHIMOZYcR44nJ5ATyUnkZHIKOZWcRk4nZ5AzyVnkbHIOOZecR84nF5ALyUXkYnIJuZRcRi4nV5AryVXkanINuZZcR64nN5AbyU3kZnILuZXcRm4nMRInCZIkKZImGZIlOZInBVIkJVImFVIlNVInDdIkLdImHdIlPdInAxKQIQnJiERkjNxB7iR3kbvJPeRech+5nzxAHiQPkYfJI+RR8hh5nDxBniRPkafJM+RZ8hx5nrxAXiQvkZfJK+RV8hp5nbxB3iRvkbfJO+Rd8h55n3xAPiQfkY/JJ+RT8hn5nHxBviRfka/JN+Rb8h35nvxAfiQ/kZ/JL+RX8hv5nfxB/iR/kb/JP+Rf8h8ZRyWjklMpqJRUKio1lYZKS6Wj0lMZqIxUJiozlYXKSmWjslM5qJxULio3lYfKS+Wj8lMFqIJUIaowVYQqShWjilMlqJJUKao0VYYqS5WjylMVqIpUJaoyVYWqSlWjqlM1qJpULao2VYeqS9Wj6lMNqIZUI6ox1YRqSjWjmlMtqJZUK6o19R/VhmpLtaPaUx2ojlQnqjMVT3WhEqiuVCLVjUqiulM9qJ5UL6o31YfqS/Wj+lMDqIHUIGowNYQaSg2jhlMjqJHUKGo0NYYaS42jxlMTqInUJGoyNYWaSk2jplMzqJnULGo2NYeaS82j5lMLqIXUImoxtYRaSi2jllMrqJXUKmo1tYZaS62j1lMbqI3UJmoztYXaSm2jtlMYhVMERVIURVMMxVIcxVMCJVISJVMKpVIapVMGZVIWZVMO5VIe5VMBBaiQglREISpG7aB2Uruo3dQeai+1j9pPHaAOUoeow9QR6ih1jDpOnaBOUqeo09QZ6ix1jjpPXaAuUpeoy9QV6ip1jbpO3aBuUreo29Qd6i51j7pPPaAeUo+ox9QT6in1jHpOvaBeUq+o19Qb6i31jnpPfaA+Up+oz9QX6iv1jfpO/aB+Ur+o39Qf6i/1j4qjk9HJ6RR0SjoVnRr+z0vRGeiMdCY6M52Fzkpno7PTOeicdC46N52Hzkvno/PTBeiCdCG6MF2ELkoXo4vTJeiSdCm6NF2GLkuXo8vTFeiKdCW6Ml2FrkpXo6vTNeiadC26Nl2HrkvXo+vTDeiGdCO6Md2Ebko3o5vTLeiWdCu6Nf0f3YZuS7ej29Md6I50J7ozHU93oRPornQi3Y1OorvTPeiedC+6N92H7
kv3o/vTA+iB9CB6MD2EHkoPo4fTI+iR9Ch6ND2GHkuPo8fTE+iJ9CR6Mj2FnkpPo6fTM+iZ9Cx6Nj2HnkvPo+fTC+iF9CJ6Mb2EXkovo5fTK+iV9Cp6Nb2GXkuvo9fTG+iN9CZ6M72F3kpvo7fTGI3TBE3SFE3TDM3SHM3TAi3SEi3TCq3SGq3TBm3SFm3TDu3SHu3TAQ3okIZ0RCM6Ru+gd9K76N30HnovvY/eTx+gD9KH6MP0EfoofYw+Tp+gT9Kn6NP0GfosfY4+T1+gL9KX6Mv0FfoqfY2+Tt+gb9K36Nv0HfoufY++Tz+gH9KP6Mf0E/op/Yx+Tr+gX9Kv6Nf0G/ot/Y5+T3+gP9Kf6M/0F/or/Y3+Tv+gf9K/6N/0H/ov/Y+OY5IxyZkUTEomFZOaScOkZdIx6ZkMTEYmE5OZycJkZbIx2ZkcTE4mF5ObycPkZfIx+ZkCTEGmEFOYKcIUZYoxxZkSTEmmFFOaKcOUZcox5ZkKTEWmElOZqcJUZaox1ZkaTE2mFlObqcPUZeox9ZkGTEOmEdOYacI0ZZoxzZkWTEumFdOa+Y9pw7Rl2jHtmQ5MR6YT05mJZ7owCUxXJpHpxiQx3ZkeTE+mF9Ob6cP0Zfox/ZkBzEBmEDOYGcIMZYYxw5kRzEhmFDOaGcOMZcYx45kJzERmEjOZmcJMZaYx05kZzExmFjObmcPMZeYx85kFzEJmEbOYWcIsZZYxy5kVzEpmFbOaWcOsZdYx65kNzEZmE7OZ2cJsZbYx2xmMwRmCIRmKoRmGYRmO4RmBERmJkRmFURmN0RmDMRmLsRmHcRmP8ZmAAUzIQCZiEBNjdjA7mV3MbmYPs5fZx+xnDjAHmUPMYeYIc5Q5xhxnTjAnmVPMaeYMc5Y5x5xnLjAXmUvMZeYKc5W5xlxnbjA3mVvMbeYOc5e5x9xnHjAPmUfMY+YJ85R5xjxnXjAvmVfMa+YN85Z5x7xnPjAfmU/MZ+YL85X5xnxnfjA/mV/Mb+YP85f5x8SxydjkbAo2JZuKTc2mYdOy6dj0bAY2I5uJzcxmYbOy2djsbA42J5uLzc3mYfOy+dj8bAG2IFuILcwWYYuyxdjibAm2JFuKLc2WYcuy5djybAW2IluJrcxWYauy1djqbA22JluLrc3WYeuy9dj6bAO2IduIbcw2YZuyzdjmbAu2JduKbc3+x7Zh27Lt2PZsB7Yj24ntzMazXdgEtiubyHZjk9jubA+2J9uL7c32Yfuy/dj+7AB2IDuIHcwOYYeyw9jh7Ah2JDuKHc2OYcey49jx7AR2IjuJncxOYaey09jp7Ax2JjuLnc3OYeey89j57AJ2IbuIXcwuYZeyy9jl7Ap2JbuKXc2uYdey69j17AZ2I7uJ3cxuYbey29jtLMbiLMGSLMXSLMOyLMfyrMCKrMTKrMKqrMbqrMGarMXarMO6rMf6bMACNmQhG7GIjbE72J3sLnY3u4fdy+5j97MH2IPsIfYwe4Q9yh5jj7Mn2JPsKfY0e4Y9y55jz7MX2IvsJfYye4W9yl5jr7M32JvsLfY2e4e9y95j77MP2IfsI/Yx+4R9yj5jn7Mv2JfsK/Y1+4Z9y75j37Mf2I/sJ/Yz+4X9yn5jv7M/2J/sL/Y3+4f9y/5j47hkXHIuBZeSS8Wl5tJwabl0XHouA5eRy8Rl5rJwWblsXHYuB5eTy8Xl5vJwebl8XH6uAFeQK8QV5opwRbliXHGuBFeSK8WV5spwZblyXHmuAleRq8RV5qpwVblqXHWuBleTq8XV5upwdbl6XH2uAdeQa8Q15ppwTblmXHOuBdeSa8W15v7j2nBtuXZce64D15HrxHXm4rkuXALXlUvkunFJXHeuB9eT68X15vpwfbl+XH9uADeQG8QN5oZwQ7lh3HBuBDeSG8WN5sZwY7lx3HhuAjeRm8RN5qZwU7lp3HRuBjeTm8XN5uZwc7l53HxuAbeQW8Qt5pZwS7ll3HJuBbeSW8Wt5tZwa7l13HpuA7eR2
8Rt5rZwW7lt3HYO43CO4EiO4miO4ViO43hO4ERO4mRO4VRO43TO4EzO4mzO4VzO43wu4AAXcpCLOMTFuB3cTm4Xt5vbw+3l9nH7uQPcQe4Qd5g7wh3ljnHHuRPcSe4Ud5o7w53lznHnuQvcRe4Sd5m7wl3lrnHXuRvcTe4Wd5u7w93l7nH3uQfcQ+4R95h7wj3lnnHPuRfcS+4V95p7w73l3nHvuQ/cR+4T95n7wn3lvnHfuR/cT+4X95v7w/3l/nFxfDI+OZ+CT8mn4lPzafi0fDo+PZ+Bz8hn4jPzWfisfDY+O5+Dz8nn4nPzefi8fD4+P1+AL8gX4gvzRfiifDG+OF+CL8mX4kvzZfiyfDm+PF+Br8hX4ivzVfiqfDW+Ol+Dr8nX4mvzdfi6fD2+Pt+Ab8g34hvzTfimfDO+Od+Cb8m34lvz//Ft+LZ8O74934HvyHfiO/PxfBc+ge/KJ/Ld+CS+O9+D78n34nvzffi+fD++Pz+AH8gP4gfzQ/ih/DB+OD+CH8mP4kfzY/ix/Dh+PD+Bn8hP4ifzU/ip/DR+Oj+Dn8nP4mfzc/i5/Dx+Pr+AX8gv4hfzS/il/DJ+Ob+CX8mv4lfza/i1/Dp+Pb+B38hv4jfzW/it/DZ+O4/xOE/wJE/xNM/wLM/xPC/wIi/xMq/wKq/xOm/wJm/xNu/wLu/xPh/wgA95yEc84mP8Dn4nv4vfze/h9/L7+P38Af4gf4g/zB/hj/LH+OP8Cf4kf4o/zZ/hz/Ln+PP8Bf4if4m/zF/hr/LX+Ov8Df4mf4u/zd/h7/L3+Pv8A/4h/4h/zD/hn/LP+Of8C/4l/4p/zb/h3/Lv+Pf8B/4j/4n/zH/hv/Lf+O/8D/4n/4v/zf/h//L/+DghmZBcSCGkFFIJqYU0QlohnZBeyCBkFDIJmYUsQlYhm5BdyCHkFHIJuYU8Ql4hn5BfKCAUFAoJhYUiQlGhmFBcKCGUFEoJpYUyQlmhnFBeqCBUFCoJlYUqQlWhmlBdqCHUFGoJtYU6Ql2hnlBfaCA0FBoJjYUmQlOhmdBcaCG0FFoJrYX/hDZCW6Gd0F7oIHQUOgmdhXihi5AgdBUShW5CktBd6CH0FHoJvYU+Ql+hn9BfGCAMFAYJg4UhwlBhmDBcGCGMFEYJo4UxwlhhnDBemCBMFCYJk4UpwlRhmjBdmCHMFGYJs4U5wlxhnjBfWCAsFBYJi4UlwlJhmbBcWCGsFFYJq4U1wlphnbBe2CBsFDYJm4UtwlZhm7BdwARcIARSoARaYARW4AReEARRkARZUARV0ARdMARTsARbcARX8ARfCAQghAIUIgEJMWGHsFPYJewW9gh7hX3CfuGAcFA4JBwWjghHhWPCceGEcFI4JZwWzghnhXPCeeGCcFG4JFwWrghXhWvCdeGGcFO4JdwW7gh3hXvCfeGB8FB4JDwWnghPhWfCc+GF8FJ4JbwW3ghvhXfCe+GD8FH4JHwWvghfhW/Cd+GH8FP4JfwW/gh/hX9CnJhMTC6mEFOKqcTUYhoxrZhOTC9mEDOKmcTMYhYxq5hNzC7mEHOKucTcYh4xr5hPzC8WEAuKhcTCYhGxqFhMLC6WEEuKpcTSYhmxrFhOLC9WECuKlcTKYhWxqlhNrC7WEGuKtcTaYh2xrlhPrC82EBuKjcTGYhOxqdhMbC62EFuKrcTW4n9iG7Gt2E5sL3YQO4qdxM5ivNhFTBC7ioliNzFJ7C72EHuKvcTeYh+xr9hP7C8OEAeKg8TB4hBxqDhMHC6OEEeKo8TR4hhxrDhOHC9OECeKk8TJ4hRxqjhNnC7OEGeKs8TZ4hxxrjhPnC8uEBeKi8TF4hJxqbhMXC6uEFeKq8TV4hpxrbhOXC9uEDeKm8TN4hZxq7hN3C5iIi4SIilSIi0yIityIi8KoihKoiwqoipqoi4aoilaoi06oit6oi8GIhBDEYqRiMSYuEPcKe4Sd4t7xL3iPnG/eEA8KB4SD4tHxKPiMfG4eEI8KZ4ST4tnx
LPiOfG8eEG8KF4SL4tXxKviNfG6eEO8Kd4Sb4t3xLviPfG++EB8KD4SH4tPxKfiM/G5+EJ8Kb4SX4tvxLfiO/G9+EH8KH4SP4tfxK/iN/G7+EP8Kf4Sf4t/xL/iPzFOSiYll1JIKaVUUmopjZRWSiellzJIGaVMUmYpi5RVyiZll3JIOaVcUm4pj5RXyifllwpIBaVCUmGpiFRUKiYVl0pIJaVSUmmpjFRWKieVlypIFaVKUmWpilRVqiZVl2pINaVaUm2pjlRXqifVlxpIDaVGUmOpidRUaiY1l1pILaVWUmvpP6mN1FZqJ7WXOkgdpU5SZyle6iIlSF2lRKmblCR1l3pIPaVeUm+pj9RX6if1lwZIA6VB0mBpiDRUGiYNl0ZII6VR0mhpjDRWGieNlyZIE6VJ0mRpijRVmiZNl2ZIM6VZ0mxpjjRXmifNlxZIC6VF0mJpibRUWiYtl1ZIK6VV0mppjbRWWietlzZIG6VN0mZpi7RV2iZtlzAJlwiJlCiJlhiJlTiJlwRJlCRJlhRJlTRJlwzJlCzJlhzJlTzJlwIJSKEEpUhCUkzaIe2Udkm7pT3SXmmftF86IB2UDkmHpSPSUemYdFw6IZ2UTkmnpTPSWemcdF66IF2ULkmXpSvSVemadF26Id2Ubkm3pTvSXemedF96ID2UHkmPpSfSU+mZ9Fx6Ib2UXkmvpTfSW+md9F76IH2UPkmfpS/SV+mb9F36If2Ufkm/pT/SX+mfFCcnk5PLKeSUcio5tZxGTiunk9PLGeSMciY5s5xFzipnk7PLOeScci45t5xHzivnk/PLBeSCciG5sFxELioXk4vLJeSScim5tFxGLiuXk8vLFeSKciW5slxFripXk6vLNeSaci25tlxHrivXk+vLDeSGciO5sdxEbio3k5vLLeSWciu5tfyf3EZuK7eT28sd5I5yJ7mzHC93kRPkrnKi3E1OkrvLPeSeci+5t9xH7iv3k/vLA+SB8iB5sDxEHioPk4fLI+SR8ih5tDxGHiuPk8fLE+SJ8iR5sjxFnipPk6fLM+SZ8ix5tjxHnivPk+fLC+SF8iJ5sbxEXiovk5fLK+SV8ip5tbxGXiuvk9fLG+SN8iZ5s7xF3ipvk7fLmIzLhEzKlEzLjMzKnMzLgizKkizLiqzKmqzLhmzKlmzLjuzKnuzLgQzkUIZyJCM5Ju+Qd8q75N3yHnmvvE/eLx+QD8qH5MPyEfmofEw+Lp+QT8qn5NPyGfmsfE4+L1+QL8qX5MvyFfmqfE2+Lt+Qb8q35NvyHfmufE++Lz+QH8qP5MfyE/mp/Ex+Lr+QX8qv5NfyG/mt/E5+L3+QP8qf5M/yF/mr/E3+Lv+Qf8q/5N/yH/mv/E+OU5IpyZUUSkollZJaSaOkVdIp6ZUMSkYlk5JZyaJkVbIp2ZUcSk4ll5JbyaPkVfIp+ZUCSkGlkFJYKaIUVYopxZUSSkmllJJMKaOUVcop5ZUKSkWlklJZqaJUVaop1ZUaSk2lllJbqaPUVeop9ZUGSkOlkdJYaaI0VZoqzZXmSkulpdJaaa20Udoo7ZR2Sgelg9JJ6aTEK/FKgpKgJCqJSpKSpPRQeii9lF5KH6WP0k/ppwxQBiqDlEHKEGWIMkwZpoxQRiijlFHKGGWMMk4Zr0xQJiqTlMnKFGWqMk2ZrsxQZiqzlNnKHGWuMk+ZryxQFiiLlEXKEmWJskxZpqxQViirlFXKGmWNsk5Zp2xQNiiblE3KFmWLsk3ZpmAKrhAKqVAKrTAKq3AKrwiKqEiKrCiKqmiKrhiKqViKrTiKq3iKrwQKUEIFKpGClJiyQ9mp7FJ2K3uUvco+Zb9yQDmoHFIOK0eUo8ox5bhyQjmpnFJOK2eUs8o55bxyQbmoXFIuK1eUq8o15bpyQ7mp3FJuK3eUu8o95b7yQHmoPFIeK0+Up8oz5bnyQnmpvFJeK2+Ut8o75b3yQfmofFI+K1+Ur8o35bvyQ/mp/FJ+K3+Uv8o/JU5NpiZXU
6gp1VRqajWNmlZNp6ZXM6gZ1UxqZjWLmlXNpmZXc6g51VxqbjWPmlfNp+ZXC6gF1UJqYbWIWlQtphZXi6sl1ZJqabW0WlYtq5ZXy6sV1YpqZbWyWlWtplZXq6s11ZpqbbW2Wletq9ZX66sN1YZqY7Wx2lRtqjZXm6st1ZZqa7W12kZto7ZT26kd1A5qJ7WTGq/GqwlqgpqoJqpJapLaQ+2h9lJ7qX3UPmo/tZ86QB2gDlIHqUPUIeowdZg6Qh2hjlJHqWPUMeo4dZw6QZ2oTlInq1PUqeo0dbo6Q52pzlRnq7PVueo8dZ66QF2gLlIXqUvUJeoydZm6Ql2prlJXq6vVteo6db26Qd2oblI3q1vUreo2dbuKqbhKqKRKqbTKqKzKqbwqqKIqqbKqqKqqqbpqqKZqqbbqqK7qqb4aqEANVahGKlJj6g51p7pL3a3uUfeq+9T96gH1oHpIPaweUY+qx9Tj6gn1pHpKPa2eUc+q59Tz6gX1onpJvaxeUa+q19Tr6g31pnpLva3eUe+q99T76gP1ofpIfaw+UZ+qz9Tn6gv1pfpKfa2+Ud+q79T3aePUj+on9bP6Rf2qflO/qz/Un+ov9bf6R/2r/lPjtGRaci2FllJLpaXW0mhptXRaei2DllHLpGXWsmhZtWxadi2HllPLpeXW8mh5tXxafq2AVlArpBXWimhFtWJaca2EVlIrpZXWymhltXJaea2CVlGrpFXWqmhVtWpada2GVlOrpdXW6mh1tXpafa2B1lBrpDXWmmhNtWZac62F1lJrpbXW/tPaaG21dlp7rYPWUeukddbitS5agtZVS9S6aUlad62H1lPrpfXW+mh9tX5af22ANlAbpA3WhmhDtWHacG2ENlIbpY3WxmhjtXHaeG2CNlGbpE3WpmhTtWnadG2GNlObpc3W5mhztXnafG2BtlBbpC3WlmhLtWXacm2FtlJbpa3W1mhrtXXaem2DtlHbpG3WtmhbtW3adg3TcI3QSI3SaI3RWI3TeE3QRE3SZE3RVE3TdM3QTM3SbM3RXM3TfC3QgBZqUIs0pMW0HdpObZe2W9uj7dX2afu1A9pB7ZB2WDuiHdWOace1E9pJ7ZR2WjujndXOaee1C9pF7ZJ2WbuiXdWuade1G9pN7ZZ2W7uj3dXuafe1B9pD7ZH2WHuiPdWeac+1F9pL7ZX2WnujvdXeae+1D9pH7ZP2WfuifdW+ad+1H9pP7Zf2W/uj/dX+aXF6Mj25nkJPqafSU+tp9LR6Oj29nkHPqGfSM+tZ9Kx6Nj27nkPPqefSc+t59Lx6Pj2/XkAvqBfSC+tF9KJ6Mb24XkIvqZfSS+tl9LJ6Ob28XkGvqFfSK+tV9Kp6Nb26XkOvqdfSa+t19Lp6Pb2+3kBvqDfSG+tN9KZ6M7253kJvqbfSW+v/6W30tno7vb3eQe+od9I76/F6Fz1B76on6t30JL273kPvqffSe+t99L56P72/PkAfqA/SB+tD9KH6MH24PkIfqY/SR+tj9LH6OH28PkGfqE/SJ+tT9Kn6NH26PkOfqc/SZ+tz9Ln6PH2+vkBfqC/SF+tL9KX6Mn25vkJfqa/SV+tr9LX6On29vkHfqG/SN+tb9K36Nn27jum4TuikTum0zuiszum8LuiiLumyruiqrum6buimbum27uiu7um+HuhAD3WoRzrSY/oOfae+S9+t79H36vv0/foB/aB+SD+sH9GP6sf04/oJ/aR+Sj+tn9HP6uf08/oF/aJ+Sb+sX9Gv6tf06/oN/aZ+S7+t39Hv6vf0+/oD/aH+SH+sP9Gf6s/05/oL/aX+Sn+tv9Hf6u/09/oH/aP+Sf+sf9G/6t/07/oP/af+S/+t/9H/6v/0OCOZkdxIYaQ0UhmpjTRGWiOdkd7IYGQ0MhmZjSxGViObkd3IYeQ0chm5jTxGXiOfkd8oYBQ0ChmFjSJGUaOYUdwoYZQ0ShmljTJGWaOcUd6oYFQ0KhmVjSpGVaOaUd2oYdQ0ahm1jTpGX
aOeUd9oYDQ0GhmNjSZGU6OZ0dxoYbQ0Whmtjf+MNkZbo53R3uhgdDQ6GZ2NeKOLkWB0NRKNbkaS0d3oYfQ0ehm9jT5GX6Of0d8YYAw0BhmDjSHGUGOYMdwYYYw0RhmjjTHGWGOcMd6YYEw0JhmTjSnGVGOaMd2YYcw0ZhmzjTnGXGOeMd9YYCw0FhmLjSXGUmOZsdxYYaw0VhmrjTXGWmOdsd7YYGw0NhmbjS3GVmObsd3ADNwgDNKgDNpgDNbgDN4QDNGQDNlQDNXQDN0wDNOwDNtwDNfwDN8IDGCEBjQiAxkxY4ex09hl7Db2GHuNfcZ+44Bx0DhkHDaOGEeNY8Zx44Rx0jhlnDbOGGeNc8Z544Jx0bhkXDauGFeNa8Z144Zx07hl3DbuGHeNe8Z944Hx0HhkPDaeGE+NZ8Zz44Xx0nhlvDbeGG+Nd8Z744Px0fhkfDa+GF+Nb8Z344fx0/hl/Db+GH+Nf0acmcxMbqYwU5qpzNRmGjOtmc5Mb2YwM5qZzMxmFjOrmc3MbuYwc5q5zNxmHjOvmc/MbxYwC5qFzMJmEbOoWcwsbpYwS5qlzNJmGbOsWc4sb1YwK5qVzMpmFbOqWc2sbtYwa5q1zNpmHbOuWc+sbzYwG5qNzMZmE7Op2cxsbrYwW5qtzNbmf2Ybs63ZzmxvdjA7mp3Mzma82cVMMLuaiWY3M8nsbvYwe5q9zN5mH7Ov2c/sbw4wB5qDzMHmEHOoOcwcbo4wR5qjzNHmGHOsOc4cb04wJ5qTzMnmFHOqOc2cbs4wZ5qzzNnmHHOuOc+cby4wF5qLzMXmEnOpucxcbq4wV5qrzNXmGnOtuc5cb24wN5qbzM3mFnOruc3cbmImbhImaVImbTIma3ImbwqmaEqmbCqmamqmbhqmaVqmbTqma3qmbwYmMEMTmpGJzJi5w9xp7jJ3m3vMveY+c795wDxoHjIPm0fMo+Yx87h5wjxpnjJPm2fMs+Y587x5wbxoXjIvm1fMq+Y187p5w7xp3jJvm3fMu+Y98775wHxoPjIfm0/Mp+Yz87n5wnxpvjJfm2/Mt+Y78735wfxofjI/m1/Mr+Y387v5w/xp/jJ/m3/Mv+Y/M85KZiW3UlgprVRWaiuNldZKZ6W3MlgZrUxWZiuLldXKZmW3clg5rVxWbiuPldfKZ+W3ClgFrUJWYauIVdQqZhW3SlglrVJWaauMVdYqZ5W3KlgVrUpWZauKVdWqZlW3alg1rVpWbauOVdeqZ9W3GlgNrUZWY6uJ1dRqZjW3WlgtrVZWa+s/q43V1mpntbc6WB2tTlZnK97qYiVYXa1Eq5uVZHW3elg9rV5Wb6uP1dfqZ/W3BlgDrUHWYGuINdQaZg23RlgjrVHWaGuMNdYaZ423JlgTrUnWZGuKNdWaZk23ZlgzrVnWbGuONdeaZ823FlgLrUXWYmuJtdRaZi23VlgrrVXWamuNtdZaZ623NlgbrU3WZmuLtdXaZm23MAu3CIu0KIu2GIu1OIu3BEu0JEu2FEu1NEu3DMu0LMu2HMu1PMu3AgtYoQWtyEJWzNph7bR2WbutPdZea5+13zpgHbQOWYetI9ZR65h13DphnbROWaetM9ZZ65x13rpgXbQuWZetK9ZV65p13bph3bRuWbetO9Zd655133pgPbQeWY+tJ9bTFHFxcdYL66X1ynptvbHeWu+s99YH66P1yfpsfbG+Wt+s79YP66f1y/pt/bH+Wv+sODuZndxOYae0U9mp7TR2Wjudnd7OYGe0M9mZ7Sx2Vjubnd3OYee0c9m57Tx2Xjufnd8uYBe0C9mF7SJ2UbuYXdwuYZe0S9ml7TJ2WbucXd6uYFe0K9mV7Sp2VbuaXd2uYde0a9m17Tp2XbueXd9uYDe0G9mN7SZ2U7uZ3dxuYbe0W9mt7f/sNnZbu53d3u5gd7Q72Z3teLuLnWB3tRPtbnaS3d3uYfe0e9m97T52X7uf3d8eYA+0B9mD7SH2UHuYPdweYY+0R9mj7TH2W
HucPd6eYE+0J9mT7Sn2VHuaPd2eYc+0Z9mz7Tn2XHuePd9eYC+0F9mL7SX2UnuZvdxeYa+0V9mr7TX2Wnudvd7eYG+0N9mb7S32Vnubvf1/QCLOpmzaZmzW5mzeFmzRlmzZVmzV1mzdNmzTtmzbdmzX9mzfDmxghza0IxvZMXuHvdPeZe+299h77X32fvuAfdA+ZB+2j9hH7WP2cfuEfdI+ZZ+2z9hn7XP2efuCfdG+ZF+2r9hX7Wv2dfuGfdO+Zd+279h37Xv2ffuB/dB+ZD+2n9hP7Wf2c/uF/dJ+Zb+239hv7Xf2e/uD/dH+ZH+2v9hf7W/2d/uH/dP+Zf+2/9h/7X92nJPMSe6kcFI6qZzUThonrZPOSe9kcDI6mZzMThYnq5PNye7kcHI6uZzcTh4nr5PPye8UcAo6hZzCThGnqFPMKe6UcEo6pZzSThmnrFPOKe9UcCo6lZzKThWnqlPNqe7UcGo6tZzaTh2nrlPPqe80cBo6jZzGThOnqdPMae60cFo6rZzWzn9OG6et085p73RwOjqdnM5OvNPFSXC6OolONyfJ6e70cHo6vZzeTh+nr9PP6e8McAY6g5zBzhBnqDPMGe6McEY6o5zRzhhnrDPOGe9McCY6k5zJzhRnqjPNme7McGY6s5zZzhxnrjPPme8scBY6i5zFzhJnqbPMWe6scFY6q5zVzhpnrbPOWe9scDY6m5zNzhZnq7PN2e5gDu4QDulQDu0wDutwDu8IjuhIjuwojupoju4YjulYju04jut4ju8EDnBCBzqRg5yYs8PZ6exydjt7nL3OPme/c8A56BxyDjtHnKPOMee4c8I56ZxyTjtnnLPOOee8c8G56FxyLjtXnKvONee6c8O56dxybjt3nLvOPee+88B56DxyHjtPnKfOM+e588J56bxyXjtvnLfOO+e988H56HxyPjtfnK/ON+e788P56fxyfjt/nL/OPyfOTeYmd1O4Kd1Ubmo3jZvWTeemdzO4Gd1MbmY3i5vVzeZmd3O4Od1cbm43j5vXzefmdwu4Bd1CbmG3iFvULeYWd0u4Jd1Sbmm3jFvWLeeWdyu4Fd1KbmW3ilvVreZWd2u4Nd1abm23jlvXrefWdxu4Dd1GbmO3idvUbeY2d1u4Ld1Wbmv3P7eN29Zt57Z3O7gd3U5uZzfe7eImuF3dRLebm+R2d3u4Pd1ebm+3j9vX7ef2dwe4A91B7mB3iDvUHeYOd0e4I91R7mh3jDvWHeeOdye4E91J7mR3ijvVneZOd2e4M91Z7mx3jjvXnefOdxe4C91F7mJ3ibvUXeYud1e4K91V7mp3jbvWXeeudze4G91N7mZ3i7vV3eZudzEXdwmXdCmXdhmXdTmXdwVXdCVXdhVXdTVXdw3XdC3Xdh3XdT3XdwMXuKEL3chFbszd4e50d7m73T3uXnefu9894B50D7mH3SPuUfeYe9w94Z50T7mn3TPuWfece9694F50L7mX3SvuVfeae9294d50b7m33TvuXfeee9994D50H7mP3SfuU/eZ+9x94b50X7mv3TfuW/ed+9794H50P7mf3S/uV/eb+9394f50f7m/3T/uX/efG+cl85J7KbyUXiovtZfGS+ul89J7GbyMXiYvs5fFy+pl87J7ObycXi4vt5fHy+vl8/J7BbyCXiGvsFfEK+oV84p7JbySXimvtFfGK+uV88p7FbyKXiWvslfFq+pV86p7NbyaXi2vtlfHq+vV8+p7DbyGXiOvsdfEa+o185p7LbyWXiuvtfef18Zr67Xz2nsdvI5eJ6+zF+918RK8rl6i181L8rp7PbyeXi+vt9fH6+v18/p7A7yB3iBvsDfEG+oN84Z7I7yR3ihvtDfGG+uN88Z7E7yJ3iRvsjfFm+pN86Z7M7yZ3ixvtjfHm+vN8+Z7C7yF3iJvsbfEW+ot85Z7K7yV3ipvtbfGW+ut89Z7G7yN3iZvs7fF2+pt87Z7mId7hEd6lEd7jMd6nMd7gid6kid7i
qd6mqd7hmd6lmd7jud6nud7gQe80INe5CEv5u3wdnq7vN3eHm+vt8/b7x3wDnqHvMPeEe+od8w77p3wTnqnvNPeGe+sd847713wLnqXvMveFe+qd8277t3wbnq3vNveHe+ud8+77z3wHnqPvMfeE++p98x77r3w0sW98l57b7y33jvvvffB++h98j57X7yv3jfvu/fD++n98n57f7y/3j8vzk/mJ/dT+Cn9VH5qP42f1k/np/cz+Bn9TH5mP4uf1c/mZ/dz+Dn9XH5uP4+f18/n5/cL+AX9Qn5hv4hf1C/mF/dL+CX9Un5pv4xf1i/nl/cr+BX9Sn5lv4pf1a/mV/dr+DX9Wn5tv45f16/n1/cb+A39Rn5jv4nf1G/mN/db+C39Vn5r/z+/jd/Wb+e39zv4Hf1Ofmc/3u/iJ/hd/US/m5/kd/d7+D39Xn5vv4/f1+/n9/cH+AP9Qf5gf4g/1B/mD/dH+CP9Uf5of4w/1h/nj/cn+BP9Sf5kf4o/1Z/mT/dn+DP9Wf5sf44/15/nz/cX+Av9Rf5if4m/1F/mL/dX+Cv9Vf5qf42/1l/nr/c3+Bv9Tf5mf0varf42f7uP+bhP+KRP+bTP+KzP+bwv+KIv+bKv+Kqv+bpv+KZv+bbv+K7v+b4f+MAPfehHPvJj/g5/p7/L3+3v8ff6+/z9/gH/oH/IP+wf8Y/6x/zj/gn/pH/KP+2f8c/65/zz/gX/on/Jv+xf8a/61/zr/g3/pn/Lv+3f8e/69/z7/gP/of/If+w/8Z/6z/zn/gv/pf/Kf+2/8d/67/z3/gf/o//J/+x/8b/63/zv/g//p//L/+3/8f/6//y4IFmQPEgRpAxSBamDNEHaIF2QPsgQZAwyBZmDLEHWIFuQPcgR5AxyBbmDPEHeIF+QPygQFAwKBYWDIkHRoFhQPCgRlAxKBaWDMkHZoFxQPqgQVAwqBZWDKkHVoFpQPagR1AxqBbWDOkHdoF5QP2gQNAwaBcmCJkHToFnQPGgRtAxaBa2D/4I2QdugXdA+6BB0DDoFnYP4oEuQEHQNEoNuQVLQPegR9Ax6Bb2DPkHfoF/QPxgQDAwGBYODIcHQYFgwPBgRjAxGBaODMcHYYFwwPpgQTAwmBZODKcHUYFowPZgRzAxmBbODOcHcYF4wP1gQLAwWBYuDJcHSYFmwPFgRrAxWBauDNcHaYF2wPtgQbAw2BZuDLcHWYFuwPcACPCACMqACOmACNuACPhACMZACOVACNdACPTACM7ACO3ACN/ACPwgCEIQBDKIABbFgR7Az2BXsDvYEe4N9wf7gQHAwOBQcDo4ER4NjwfHgRHAyOBWcDs4EZ4NzwfngQnAxuBRcDq4EV4NrwfXgRnAzuBXcDu4Ed4N7wf3gQfAweBQ8Dp4ET4NnwfPgRfAyeBW8Dt4Eb4N3wfvgQ/Ax+BR8Dr4EX4NvwffgR/Az+BX8Dv4Ef4N/QRxIBpKDFCAlSAVSgzQgLUgH0oMMICPIBDKDLCAryAaygxwgJ8gFcoM8IC/IB/KDAqAgKAQKgyKgKCgGioMSoCQoBUqDMqAsKAfKgwqgIqgEKoMqoCqoBqqDGqAmqAVqgzqgLqgH6oMGoCFoBBqDJqApaAaagxagJWgFWoP/QBvQFrQD7UEH0BF0Ap1BPOgCEkBXkAi6gSTQHfQAPUEv0Bv0AX1BP9AfDAADwSAwGAwBQ8EwMByMACPBKDAajAFjwTgwHkwAE8EkMBlMAVPBNDAdzAAzwSwwG8wBc8E8MB8sAAvBIrAYLAFLwTKwHKwAK8EqsBqsAWvBOrAebAAbwSawGWwBW8E2sB1gAAcEIAEFaMAAFnCABwIQgQRkoAAVaEAHBjCBBWzgABd4wAcBACAEEEQAgRjYAXaCXWA32AP2gn1gPzgADoJD4DA4Ao6CY+A4OAFOglPgNDgDzoJz4Dy4AC6CS+AyuAKugmvgOrgBboJb4Da4A+6Ce+A+eAAegkfgMXgCnoJn4Dl4AV6CV+A1eAPeg
nfgPfgAPoJP4DP4Ar6Cb+A7+AF+gl/gN/gD/oJ/IC5MFiYPU4Qpw1Rh6jBNmDZMF6YPM4QZw0xh5jBLmDXMFmYfcD7MGeYKc4d5wrxhvjB/WCAsGBYKC4dFwqJhsbB4WCIsGZYKS4dlwrJhubB8WCGsGFYKK4dVwqphtbB6WCOsGdYKa4d1wrphvbB+2CBsGDYKG4dNwqZhs7B52CJsGbYKW4f/hW3CtmG7sH3YIewYdgo7h/FhlzAh7Bomht3CpLB72CPsGfYKe4d9wr5hv7B/OCAcGA4KB4dDwqHhsHB4OCIcGY4KR4djwrHhuHB8OCGcGE4KJ4dTwqnhtHB6OCOcGc4KZ4dzwrnhvHB+uCBcGC4KF4dLwqXhsnB5uCJcGa4KV4drwrXhunB9uCHcGG4KN4dbwq3htnB7iIV4SIRkSIV0yIRsyIV8KIRiKIVyqIRqqIV6aIRmaIV26IRu6IV+GIQgDEMYRiEKY+GOcGe4K9wd7gn3hvvC/eGB8GB4KDwcHgmPhsfC4+GJ8GR4KjwdngnPhufC8+GF8GJ4KbwcXgmvhtfC6+GN8GZ4K7wd3gnvhvfC++GD8GH4KHwcPgmfhs/C5+GL8GX4Knwdvgnfhu/C9+GH8GP4Kfwcfgm/ht/C7+GP8Gf4K/wd/gn/hv/COJgMJocpYEqYCqaGaWBamA6mhxlgRpgJZoZZYFaYDWaHOWBOmAvmhnlgXpgP5ocFYEFYCBaGRWBRWAwWhyVgSVgKloZlYFlYDpaHFWBFWAlWhlVgVVgNVoc1YE1YC9aGdWBdWA/Whw1gQ9gINoZNYFPYDDaHLWBL2Aq2hv/BNrAtbAfbww6wI+wEO8N42AUmwK4wEXaDSbA77AF7wl6wN+wD+8J+sD8cAAfCQXAwHAKHwmFwOBwBR8JRcDQcA8fCcXA8nAAnwklwMpwCp8JpcDqcAWfCWXA2nAPnwnlwPlwAF8JFcDFcApfCZXA5XAFXwlVwNVwD18J1cD3cADfCTXAz3AK3wm1wO8QgDglIQgrSkIEs5CAPBShCCcpQgSrUoA4NaEIL2tCBLvSgDwMIYAghjCCCMbgD7oS74G64B+6F++B+eAAehIfgYXgEHoXH4HF4Ap6Ep+BpeAaehefgeXgBXoSX4GV4BV6F1+B1eAPehLfgbXgH3oX34H34AD6Ej+Bj+AQ+hc/gc/gCvoSv4Gv4Br6F7+B7+AF+hJ/gZ/gFfoXf4Hf4A/6Ev+Bv+Af+hf9gXJQsSh6liFJGqaLUUZoobZQuSh9liDJGmaLMUZYoa5Qtyh7liHJGuaLcUZ4ob5Qvyh8ViApGhaLCUZGoaFQsKh6ViEpGpaLSUZmobFQuKh9ViCpGlaLKUZWoalQtqh7ViGpGtaLaUZ2oblQvqh81iBpGjaLGUZOoadQsah61iFpGraLW0X9Rm6ht1C5qH3WIOkados5RfNQlSoi6RolRtygp6h71iHpGvaLeUZ+ob9Qv6h8NiAZGg6LB0ZBoaDQsGh6NiEZGo6LR0ZhobDQuGh9NiCZGk6LJ0ZRoajQtmh7NiGZGs6LZ0ZxobjQvmh8tiBZGi6LF0ZJoabQsWh6tiFZGq6LV0ZpobbQuWh9tiDZGm6LN0ZZoa7Qt2h5hER4RERlRER0xERtxER8JkRhJkRwpkRppkf6/HRgXOZEbeZEfBRGIwghGUYSiWLQj2hntinZHe6K90b5of3QgOhgdig5HR6Kj0bHoeHQiOhmdik5HZ6Kz0bnofHQhuhhdii5HV6Kr0bXoenQjuhndim5Hd6K70b3ofvQgehg9ih5HT6Kn0bPoefQiehm9il5Hb6K30bvoffQh+hh9ij5HX6Kv0bfoe/Qj+hn9in5Hf6K/0b8oDiVDyVEKlBKlQqlRGpQWpUPpUQaUEWVCmVEWlBVlQ9lRDpQT5UK5UR6UF+VD+VEBVBAVQoVREVQUFUPFUQlUEpVCpVEZVBaVQ+VRBVQRVUKVURVUFVVD1VENVBPVQrVRHVQX1UP1U
QPUEDVCjVET1BQ1Q81RC9QStUKt0X+oDWqL2qH2qAPqiDqhzigedUEJqCtKRN1QEuqOeqCeqBfqjfqgvqgf6o8GoIFoEBqMhqChaBgajkagkWgUGo3GoLFoHBqPJqCJaBKajKagqWgamo5moJloFpqN5qC5aB6ajxaghWgRWoyWoKVoGVqOVqCVaBVajdagtWgdWo82oI1oE9qMtqCtaBvajjCEIwKRiEI0YhCLOMQjAYlIQjJSkIo0pCMDmchCNnKQizzkowABFCKIIoRQDO1AO9EutBvtQXvRPrQfHUAH0SF0GB1BR9ExdBydQCfRKXQanUFn0Tl0Hl1AF9EldBldQVfRNXQd3UA30S10G91Bd9E9dB89QA/RI/QYPUFP0TP0HL1AL9Er9Bq9QW/RO/QefUAf0Sf0GX1BX9E39B39QD/RL/Qb/UF/0T8UF0sWSx5LEUsZSxVLHUsTSxtLF0sfyxDLGMsUyxzLEssayxbLHssRyxnLFcsdyxPLG8sXyx8rECsYKxQrHCsSKxorFiseKxErGSsVKx0rEysbKxcrH6sQqxirFKscqxKrGqsWqx6rEasZqxWrHasTqxurF6sfaxBrGGsUaxxrEmsaaxZrHmsRaxlrFWsd+y/WJtY21i7WPtYh1jHWKdY5Fh/rEkuIdY0lxrrFkmL/RwI8BWjZLAAA3uqvNtu2bdu2bdvabNu2/dq2ZubLts1tz8V5emG9sT5YX6wf1h8bgA3EBmGDsSHYUGwYNhwbgY3ERmGjsTHYWGwcNh6bgE3EJmGTsSnYVGwaNh2bgc3EZmGzsTnYXGweNh9bgC3EFmEJ2GJsCbYUW4Ytx1ZgK7FV2GpsDbYWW4etxzZgG7FN2GZsC7YV24Ztx3ZgO7Fd2G5sD7YX24ftxw5gB7FD2GHsCHYUO4Ydx05gJ7FT2GnsDHYWO4edxy5gF7FL2GXsCnYVu4Zdx25gNzEMwzECIzEKozEGYzEO4zEBEzEJkzEFUzEN0zEDMzELszEHczEP87EAC7EIAxjEEBbDbmG3sTvYXewedh97gD3EHmGPsSfYU+wZ9hx7gb3EXmGvsTfYW+wd9h77gH3EPmGfsS/YV+wb9h37gf3EfmG/sT/YXywR+4clYXF4Mjw5ngL/D0+Jx8elxuPxNHhaPB2eHs+AZ8Qz4ZnxLHhWPBueHc+B58Rz4bnxPHhePB+eHy+AF8QL4YXxInhRvBheHC+Bl8RL4aXxMnhZvBxeHq+AV8Qr4ZXxKnhVvBpeHa+B18Rr4bXxOnhdvB5eH2+AN8Qb4Y3xJnhTvBneHG+Bt8Rb4a3xNnhbvB3eHu+Ad8Q74Z3xLnhXvBveHe+B98R74b3xPnhfvB/eHx+AD8QH4YPxIfhQfBg+HB+Bj8RH4aPxMfhYfBw+Hp+AT8Qn4ZPxKfhUfBo+HZ+Bz8Rn4bPxOfhcfB4+H1+AL8QX4Qn4YnwJvhRfhi/HV+Ar8VX4anwNvhZfh6/HN+Ab8U34ZnwLvhXfhm/Hd+A78V34bnwPvhffh+/HD+AH8UP4YfwIfhQ/hh/HT+An8VP4afwMfhY/h5/HL+AX8Uv4ZfwKfhW/hl/Hb+A3cQzHcQIncQqncQZncQ7ncQEXcQmXcQVXcQ3XcQM3cQu3cQd3cQ/38QAP8QgHOMQRHsNv4bfxO/hd/B5+H3+AP8Qf4Y/xJ/hT/Bn+HH+Bv8Rf4a/xN/hb/B3+Hv+Af8Q/4Z/xL/hX/Bv+Hf+B/8R/4b/xP/hfPBH/hyfhcUQyIjmRgviPSEmkIlIT8UQaIi2RjkhPZCAyEpmIzEQWIiuRjchO5CByErmI3EQeIi+Rj8hPFCAKEoWIwkQRoihRjChOlCBKEqWI0kQZoixRjihPVCAqEpWIykQVoipRjahO1CBqErWI2kQdoi5Rj6hPNCAaEo2IxkQToinRjGhOtCBaEq2I1kQboi3RjmhPdCA6Ep2IzkQXoivRjehO9CB6Er2I3kQfoi/Rj+hPDCAGEoOIwcQQYigxj
BhOjCBGEqOI0cQYYiwxjhhPTCAmEpOIycQUYioxjZhOzCBmErOI2cQcYi4xj5hPLCAWEouIBGIxsYRYSiwjlhMriJXEKmI1sYZYS6wj1hMbiI3EJmIzsYXYSmwjthM7iJ3ELmI3sYfYS+wj9hMHiIPEIeIwcYQ4ShwjjhMniJPEKeI0cYY4S5wjzhMXiIvEJeIycYW4SlwjrhM3iJsERuAEQZAERdAEQ7AER/CEQIiERMiEQqiERuiEQZiERdiEQ7iElyouLo4IiYgABCQQESNuEbeJO8Rd4h5xn3hAPCQeEY+JJ8RT4hnxnHhBvCReEa+JN8Rb4h3xnvhAfCQ+EZ+JL8RX4hvxnfhB/CR+Eb+JP8RfIpH4RyQRcWQyMjmZgvyPTEmmIlOT8WQaMi2ZjkxPZiAzkpnIzGQWMiuZjcxO5iBzkrnI3GQeMi+Zj8xPFiALkoXIwmQRsihZjCxOliBLkqXI0mQZsixZjixPViArkpXIymQVsipZjaxO1iBrkrXI2mQdsi5Zj6xPNiAbko3IxmQTsinZjGxOtiBbkq3I1mQbsi3ZjmxPdiA7kp3IzmQXsivZjexO9iB7kr3I3mQfsi/Zj+xPDiAHkoPIweQQcig5jBxOjiBHkqPI0eQYciw5jhxPTiAnkpPIyeQUcio5jZxOziBnkrPI2eQcci45j5xPLiAXkovIBHIxuYRcSi4jl5MryJXkKnI1uYZcS64j15MbyI3kJnIzuYXcSm4jt5M7yJ3kLnI3uYfcS+4j95MHyIPkIfIweYQ8Sh4jj5MnyJPkKfI0eYY8S54jz5MXyIvkJfIyeYW8Sl4jr5M3yJskRuIkQZIkRdIkQ7IkR/KkQIqkRMqkQqqkRuqkQZqkRdqkQ7qkR/pkQIZkRAISkoiMkbfI2+Qd8i55j7xPPiAfko/Ix+QT8in5jHxOviBfkq/I1+Qb8i35jnxPfiA/kp/Iz+QX8iv5jfxO/iB/kr/I3+Qf8i+ZSP4jk8g4KhmVnEpB/UelpFJRqal4Kg2VlkpHpacyUBmpTFRmKguVlcpGZadyUDmpXFRuKg+Vl8pH5acKUAWpQlRhqghVlCpGFadKUCWpUlRpqgxVlipHlacqUBWpSlRlqgpVlapGVadqUDWpWlRtqg5Vl6pH1acaUA2pRlRjqgnVlGpGNadaUC2pVlRrqg3VlmpHtac6UB2pTlRnqgvVlepGdad6UD2pXlRvqg/Vl+pH9acGUAOpQdRgagg1lBpGDadGUCOpUdRoagw1lhpHjacmUBOpSdRkago1lZpGTadmUDOpWdRsag41l5pHzacWUAupRVQCtZhaQi2lllHLqRXUSmoVtZpaQ62l1lHrqQ3URmoTtZnaQm2ltlHbqR3UTmoXtZvaQ+2l9lH7qQPUQeoQdZg6Qh2ljlHHqRPUSeoUdZo6Q52lzlHnqQvUReoSdZm6Ql2lrlHXqRvUTQqjcIqgSIqiaIqhWIqjeEqgREqiZEqhVEqjdMqgTMqibMqhXMqjfCqgQiqiAAUpRMWoW9Rt6g51l7pH3aceUA+pR9Rj6gn1lHpGPadeUC+pV9Rr6g31lnpHvac+UB+pT9Rn6gv1lfpGfad+UD+pX9Rv6g/1l0qk/lFJVBydjE5Op6D/o1PSqejUdDydhk5Lp6PT0xnojHQmOjOdhc5KZ6Oz0znonHQuOjedh85L56Pz0wXognQhujBdhC5KF6OL0yXoknQpujRdhi5Ll6PL0xXoinQlujJdha5KV6Or0zXomnQtujZdh65L16Pr0w3ohnQjujHdhG5KN6Ob0y3olnQrujXdhm5Lt6Pb0x3ojnQnujPdhe5Kd6O70z3onnQvujfdh+5L96P70wPogfQgejA9hB5KD6OH0yPokfQoejQ9hh5Lj6PH0xPoifQkejI9hZ5KT6On0zPomfQsejY9h55Lz6Pn0wvohfQiOoFeTC+hl9LL6OX0CnolvYpeTa+h19Lr6PX0BnojvYneTG+ht9Lb6O30Dnonv
YveTe+h99L76P30AfogfYg+TB+hj9LH6OP0CfokfYo+TZ+hz9Ln6PP0BfoifYm+TF+hr9LX6Ov0DfomjdE4TdAkTdE0zdAszdE8LdAiLdEyrdAqrdE6bdAmbdE27dAu7dE+HdAhHdGAhjSiY/Qt+jZ9h75L36Pv0w/oh/Qj+jH9hH5KP6Of0y/ol/Qr+jX9hn5Lv6Pf0x/oj/Qn+jP9hf5Kf6O/0z/on/Qv+jf9h/5LJ9L/6CQ6jknGJGdSMP8xKZlUTGomnknDpGXSMemZDExGJhOTmcnCZGWyMdmZHExOJheTm8nD5GXyMfmZAkxBphBTmCnCFGWKMcWZEkxJphRTminDlGXKMeWZCkxFphJTmanCVGWqMdWZGkxNphZTm6nD1GXqMfWZBkxDphHTmGnCNGWaMc2ZFkxLphXTmmnDtGXaMe2ZDkxHphPTmenCdGW6Md2ZHkxPphfTm+nD9GX6Mf2ZAcxAZhAzmBnCDGWGMcOZEcxIZhQzmhnDjGXGMeOZCcxEZhIzmZnCTGWmMdOZGcxMZhYzm5nDzGXmMfOZBcxCZhGTwCxmljBLmWXMcmYFs5JZxaxm1jBrmXXMemYDs5HZxGxmtjBbmW3MdmYHs5PZxexm9jB7mX3MfuYAc5A5xBxmjjBHmWPMceYEc5I5xZxmzjBnmXPMeeYCc5G5xFxmrjBXmWvMdeYGc5PBGJwhGJKhGJphGJbhGJ4RGJGRGJlRGJXRGJ0xGJOxGJtxGJfxGJ8JmJCJGMBABjEx5hZzm7nD3GXuMfeZB8xD5hHzmHnCPGWeMc+ZF8xL5hXzmnnDvGXeMe+ZD8xH5hPzmfnCfGW+Md+ZH8xP5hfzm/nD/GUSmX9MEhPHJmOTsynY/9iUbCo2NRvPpmHTsunY9GwGNiObic3MZmGzstnY7GwONiebi83N5mHzsvnY/GwBtiBbiC3MFmGLssXY4mwJtiRbii3NlmHLsuXY8mwFtiJbia3MVmGrstXY6mwNtiZbi60dX4ety9Zj67MN2IZsI7Yx24RtyjZjm7Mt2JZsK7Y124Zty7Zj27Md2I5sJ7Yz24XtynZju7M92J5sL7Y324fty/Zj+7MD2IHsIHYwO4Qdyg5jh7Mj2JHsKHY0O4Ydy45jx7MT2InsJHYyO4Wdyk5jp7Mz2JnsLHY2O4edy85j57ML2IXsIjaBXcwuYZeyy9jl7Ap2JbuKXc2uYdey69j17AZ2I7uJ3cxuYbey29jt7A52J7uL3c3uYfey+9j97AH2IHuIPcweYY+yx9jj7An2JHuKPc2eYc+y59jz7AX2InuJvcxeYa+y19jr7A32JouxOEuwJEuxNMuwLMuxPCuwIiuxMquwKquxOmuwJmuxNuuwLuuxPhuwIRuxgIUsYmPsLfY2e4e9y95j77MP2IfsI/Yx+4R9yj5jn7Mv2JfsK/Y1+4Z9y75j37Mf2I/sJ/Yz+4X9yn5jv7M/2J/sL/Y3+4f9yyay/9gkNo5Llizp/7hUXGounkvDpeXScem5DFxGLhOXmcvCZeWycdm5HFxOLheXm8vD5eXycfm5AlxBrhBXmCvCFeWKccW5ElxJrhRXmivDleXKceW5ClxFrhJXmavCVeWqcdW5GlxNrhZXm6vD1eXqcfW5BlxDrhHXmGvCNeWacc25FlxLrhXXmmvDteXace25DlxHrhPXmevCdeW6cd25HlxPrhfXm+vD9eX6cf25AdxAbhA3mBvCDeWGccO5EdxIbhQ3mhvDjeXGceO5CdxEbhI3mZvCTeWmcdO5GdxMbhY3m5vDzeXmcfO5BdxCbhGXwC3mlnBLuWXccm4Ft5Jbxa3m1nBruXXcem4Dt5HbxG3mtnBbuW3cdm4Ht5Pbxe3m9nB7uX3cfu4Ad5A7xB3mjnBHuWPcce4Ed5I7xZ3mznBnuXPcee4Cd5G7xF3mrnBXuWvcde4Gd5PDOJwjOJKjOJpjOJbjOJ4TOJGTOJlTOJXTOJ0zOJOzOJtzOJfzOJ8LuJCLO
MBBDnEx7hZ3m7vD3eXucfe5B9xD7hH3mHvCPeWecc+5F9xL7hX3mnvDveXece+5D9xH7hP3mfvCfeW+cd+5H9xP7hf3m/vD/eUSuX9cEhfHJ+OT8yn4//iUfCo+NR/Pp+HT8un49HwGPiOfic/MZ+Gz8tn47HwOPiefi8/N5+Hz8vn4/HwBviBfiC/MF+GL8sX44nwJviRfii/Nl+HL8uX48nwFviJfia/MV+Gr8tX46nwNviZfi6/N1+Hr8vX4+nwDviHfiG/MN+Gb8s345nwLviXfim/Nt+Hb8u349nwHviPfie/Md+G78t347nwPviffi+/N9+H78v34/vwAfiA/iB/MD+GH8sP44fwIfiQ/ih/Nj+HH8uP48fwEfiI/iZ/MT+Gn8tP46fwMfiY/i5/Nz+Hn8vP4+fwCfiG/iE/gF/NL+KX8Mn45v4Jfya/iV/Nr+LX8On49v4HfyG/iN/Nb+K38Nn47v4Pfye/id/N7+L38Pn4/f4A/yB/iD/NH+KP8Mf44f4I/yZ/iT/Nn+LP8Of48f4G/yF/iL/NX+Kv8Nf46f4O/yWM8zhM8yVM8zTM8y3M8zwu8yEu8zCu8ymu8zhu8yVu8zTu8y3u8zwd8yEc84CGP+Bh/i7/N3+Hv8vf4+/wD/iH/iH/MP+Gf8s/45/wL/iX/in/Nv+Hf8u/49/wH/iP/if/Mf+G/8t/47/wP/if/i//N/+H/8on8Pz6JjxOSCcmFFMJ/QkohlZBaiBfSCGmFdEJ6IYOQUcgkZBayCFmFbEJ2IYeQU8gl5BbyCHmFfEJ+oYBQUCgkFBaKCEWFYkJxoYRQUigllBbKCGWFckJ5oYJQUagkVBaqCFWFakJ1oYZQU6gl1BbqCHWFekJ9oYHQUGgkNBaaCE2FZkJzoYXQUmgltBbaCG2FdkJ7oYPQUegkdBa6CF2FbkJ3oYfQU+gl9Bb6CH2FfkJ/YYAwUBgkDBaGCEOFYcJwYYQwUhgljBbGCGOFccJ4YYIwUZgkTBamCFOFacJ0YYYwU5glzBbmCHOFecJ8YYGwUFgkJAiLhSXCUmGZsFxYIawUVgmrhTXCWmGdsF7YIGwUNgmbhS3CVmGbsF3YIewUdgm7hT3CXmGfsF84IBwUDgmHhSPCUeGYcFw4IZwUTgmnhTPCWeGccF64IFwULgmXhSvCVeGacF24IdwUMAEXCIEUKIEWGIEVOIEXBEEUJEEWFEEVNEEXDMEULMEWHMEVPMEXAiEUIgEIUEBCTLgl3BbuCHeFe8J94YHwUHgkPBaeCE+FZ8Jz4YXwUnglvBbeCG+Fd8J74YPwUfgkfBa+CF+Fb8J34YfwU/gl/Bb+CH+FROGfkCTEicnE5GIK8T8xpZhKTC3Gi2nEtGI6Mb2YQcwoZhIzi1nErGI2MbuYQ8wp5hJzi3nEvGI+Mb9YQCwoFhILi0XEomIxsbhYQiwplhJLi2XEsmI5sbxYQawoVhIri1XEqmI1sbpYQ6wp1hJri3XEumI9sb7YQGwoNhIbi03EpmIzsbnYQmwpthJbi23EtmI7sb3YQewodhI7i13ErmI3sbvYQ+wp9hJ7i33EvmI/sb84QBwoDhIHi0PEoeIwcbg4QhwpjhJHi2PEseI4cbw4QZwoThIni1PEqeI0cbo4Q5wpzhJni3PEueI8cb64QFwoLhITxMXiEnGpuExcLq4QV4qrxNXiGnGtuE5cL24QN4qbxM3iFnGruE3cLu4Qd4q7xN3iHnGvuE/cLx4QD4qHxMPiEfGoeEw8Lp4QT4qnxNPiGfGseE48L14QL4qXxMviFfGqeE28Lt4Qb4qYiIuESIqUSIuMyIqcyIuCKIqSKIuKqIqaqIuGaIqWaIuO6Iqe6IuBGIqRCEQoIjEm3hJvi3fEu+I98b74QHwoPhIfi0/Ep+Iz8bn4QnwpvhJfi2/Et+I78b34QfwofhI/i1/Er+I38bv4Q/wp/hJ/i3/Ev2Ki+E9MEuOkZFJyKYUUl5BSSiWlluKlNFJaK
Z2UXsogZZQySZmlLFJWKZuUXcoh5ZRySbmlPFJeKZ+UXyogFZQKSYWlIlJRqZhUXCohlZRKSaWlMlJZqZxUXqogVZQqSZWlKlJVqZpUXaoh1ZRqSbWlOlJdqZ5UX2ogNZQaSY2lJlJTqZnUXGohtZRaSa2lNlJbqZ3UXuogdZQ6SZ2lLlJXqZvUXeoh9ZR6Sb2lPlJfqZ/UXxogDZQGSYOlIdJQaZg0XBohjZRGSaOlMdJYaZw0XpogTZQmSZOlKdJUaZo0XZohzZRmSbOlOdJcaZ40X1ogLZQWSQnSYmmJtFRaJi2XVkgrpVXSammNtFZaJ62XNkgbpU3SZmmLtFXaJm2Xdkg7pV3SbmmPtFfaJ+2XDkgHpUPSYemIdFQ6Jh2XTkgnpVPSaemMdFY6J52XLkgXpUvSZemKdFW6Jl2Xbkg3JUzCJUIiJUqiJUZiJU7iJUESJUmSJUVSJU3SJUMyJUuyJUdyJU/ypUAKpUgCEpSQFJNuSbelO9Jd6Z50X3ogPZQeSY+lJ9JT6Zn0XHohvZReSa+lN9Jb6Z30XvogfZQ+SZ+lL9JX6Zv0Xfoh/ZR+Sb+lP9JfKVH6JyVJcXIyObmcQv5PTimnklPL8XIaOa2cTk4vZ5AzypnkzHIWOaucTc4u55Bzyrnk3HIeOa+cT84vF5ALyoXkwnIRuahcTC4ul5BLyqXk0nIZuaxcTi4vV5ArypXkynIVuapcTa4u15BryrXk2nIdua5cT64vN5Abyo3kxnITuancTG4ut5Bbyq3k1nIbua3cTm4vd5A7yp3kznIXuavcTe4u95B7yr3k3nIfua/cT+4vD5AHyoPkwfIQeag8TB4uj5BHyqPk0fIYeaw8Th4vT5AnypPkyfIUeao8TZ4uz5BnyrPk2fIcea48T54vL5AXyovkBHlxYlxcnLxMXi6vkFfKq+TV8hp5rbxOXi9vkDfKm+TN8hZ5q7xN3i7vkHfKu+Td8h55r7xP3i8fkA/Kh+TD8hH5qHxMPi6fkE/Kp+TT8hn5rHxOPi9fkC/Kl+TL8hX5qnxNvi7fkG/KmIzLhEzKlEzLjMzKnMzLgizKkizLiqzKmqzLhmzKlmzLjuzKnuzLgRzKkQxkKCM5Jt+Sb8t35LvyPfm+/EB+KD+SH8tP5KfyM/m5/EJ+Kb+SX8tv5LfyO/m9/EH+KH+SP8tf5K/yN/m7/EP+Kf+Sf8t/5L9yovxPTpLjlGRKciWF8p+SUkmlpFbilTRKWiWdkl7JoGRUMimZlSxKViWbkl3JoeRUcim5lTxKXiWfkl8poBRUCimFlSJKUaWYUlwpoZRUSimllTJKWaWcUl6poFRUKimVlSpKVaWaUl2podRUaim1lTpKXaWeUl9poDRUGimNlSZKU6WZ0lxpobRUWimtlTZKW6Wd0l7poHRUOimdlS5KV6Wb0l3pofRUeim9lT5KX6Wf0l8ZoAxUBimDlSHKUGWYMlwZoYxURimjlTHKWGWcMl6ZoExUJimTlSnKVGWaMl2ZocxUZimzlTnKXGWeMl9ZoCxUFikJymJlibJUWaYsV1YoK5VVympljbJWWaesVzYoG5VNymZli7JV2aZsV3YoO5Vdym5lj7JX2afsVw4oB5VDymHliHJUOaYcV04oJ5VTymnljHJWOaecVy4oF5VLymXlinJVuaZcV24oNxVMwRVCIRVKoRVGYRVO4RVBERVJkRVFURVN0RVDMRVLsRVHcRVP8ZVACZVIAQpUkBJTbim3lTvKXeWecl95oDxUHimPlSfKU+WZ8lx5obxUXimvlTfKW+Wd8l75oHxUPimflS/KV+Wb8l35ofxUfim/lT/KXyVR+ackKXFqMjW5mkL9T02pplJTq/FqGjWtmk5Nr2ZQM6qZ1MxqFjWrmk3NruZQc6q51NxqHjWvmk/NrxZQC6qF1MJqEbWoWkwtrpZQS6ql1NJqGbWsWk4tr1ZQK6qV1MpqFbWqWk2trtZQa6q11NpqHbWuWk+trzZQG6qN1
MZqE7Wp2kxtrrZQW6qt1NZqG7Wt2k5tr3ZQO6qd1M5qF7Wr2k3trvZQe6q91N5qH7Wv2k/trw5QB6qD1MHqEHWoOkwdro5QR6qj1NHqGHWsOk4dr05QJ6qT1MnqFHWqOk2drs5QZ6qz1NnqHHWuOk+dry5QF6qL1AR1sbpEXaouU5erK9SV6ip1tbpGXauuU9erG9SN6iZ1s7pF3apuU7erO9Sd6i51t7pH3avuU/erB9SD6iH1sHpEPaoeU4+rJ9ST6in1tHpGPaueU8+rF9SL6iX1snpFvapeU6+rN9SbKqbiKqGSKqXSKqOyKqfyqqCKqqTKqqKqqqbqqqGaqqXaqqO6qqf6aqCGaqQCFapIjam31NvqHfWuek+9rz5QH6qP1MfqE/Wp+kx9rr5QX6qv1NfqG/Wt+k59r35QP6qf1M/qF/Wr+k39rv5Qf6q/1N/qH/Wvmqj+U5PUOC2ZllxLof2npdRSaam1eC2NllZLp6XXMmgZtUxaZi2LllXLpmXXcmg5tVxabi2PllfLp+XXCmgFtUJaYa2IVlQrlpQUF6eV1EpppbUyWlmtnFZeq6BV1CpplbUqWlWtmlZdq6HV1GpptbU6Wl2tnlZfa6A11BppjbUmWlOtmdZca6G11FpprbU2WlutndZe66B11DppnbUuWletm9Zd66H11HppvbU+Wl+tn9ZfG6AN1AZpg7Uh2lBtmDZcG6GN1EZpo7Ux2lhtnDZem6BN1CZpk7Up2lRtmjZdm6HN1GZps7U52lxtnjZfW6At1BZpCdpibYm2VFumLddWaCu1VdpqbY22Vlunrdc2aBu1TdpmbYu2Vdumbdd2aDu1XdpubY+2V9un7dcOaAe1Q9ph7Yh2VDumHddOaCe1U9pp7Yx2VjunndcuaBe1S9pl7Yp2VbumXdduaDc1TMM1QiM1SqM1RmM1TuM1QRM1SZM1RVM1TdM1QzM1S7M1R3M1T/O1QAu1SAMa1JAW025pt7U72l3tnnZfe6A91B5pjxs/0Z5qz7Tn2gvtpfZKe6290d5q77T32gfto/ZJ+6x90b5q37Tv2g/tp/ZL+6390f5qido/LUmL05PpyfUU+n96Sj2VnlqP19PoafV0eno9g55Rz6Rn1rPoWfVsenY9h55Tz6Xn1vPoefV8en69gF5QL6QX1ovoRfVienG9hF5SL6WX1svoZfVyenm9gl5Rr6RX1qvoVfVqenW9hl5Tr6XX1uvodfV6en29gd5Qb6Q31pvoTfVmenO9hd5Sb6W31tvobfV2enu9g95R76R31rvoXfVuene9h95T76X31vvoffV+en99gD5QH6QP1ofoQ/Vh+nB9hD5SH6WP1sfoY/Vx+nh9gj5Rn6RP1qfoU/Vp+nR9hj5Tn6XP1ufoc/V5+nx9gb5QX6Qn6Iv1JfpSfZm+XF+hr9RX6av1NfpafZ2+Xt+gb9Q36Zv1LfpWfZu+Xd+h79R36bv1PfpePU3cfv2AflA/pB/Wj+hH9WP6cf2EflI/pZ/Wz+hn9XP6ef2CflG/pF/Wr+hX9Wv6df2GflPHdFwndFKndFpndFbndF4XdFGXdFlXdFXXdF03dFO3dFt3dFf3dF8P9FCPdKBDHekx/ZZ+W7+j39Xv6ff1B/pD/ZH+WH+iP9Wf6c/1F/pL/ZX+Wn+jv9Xf6e/1D/pH/ZP+Wf+if9W/6d/1H/pP/Zf+W/+j/9UT9X96kh5nJDOSGymM/4yURiojtRFvpDHSGumM9EYGI6ORychsZDGyGtmM7EYOI6eRy8ht5DHyGvmM/EYBo6BRyChsFDGKGsWM4kYJo6RRyihtlDHKGuWM8kYFo6JRyahsVDGqGtWM6kYNo6ZRy6ht1DHqGvWM+kYDo6HRyGhsNDGaGs2M5kYLo6XRymhttDHaGu2M9kYHo6PRyehsdDG6Gt2M7kYPo6fRy+ht9DH6Gv2M/sYAY6AxyBhsDDGGGsOM4cYIY6QxyhhtjDHGGuOM8cYEY6IxyZhsTDGmGtOM6
cYMY6Yxy5htzDHmGvOM+cYCY6GxyEgwFhtLjKXGMmO5scJYaawyVhtrjLXGOmO9scHYaGwyNhtbjK3GNmO7scPYaewydht7jL3GPmO/ccA4aBwyDhtHjKPGMeO4ccI4aZwyThtnjLPGOeO8ccG4aFwyLhtXjKvGNeO6ccO4aWAGbhAGaVAGbTAGa3AGbwiGaEiGbCiGamiGbhiGaViGbTiGa3iGbwRGaEQGMKCBjJhxy7ht3DHuGveM+8YD46HxyHhsPDGeGs+M58YL46XxynhtvDHeGu+M98YH46PxyfhsfDG+Gt+M78YP46fxy/ht/DH+GonGPyPJiDOTmcnNFOZ/ZkozlZnajDfTmGnNdGZ6M4OZ0cxkZjazmFnNbGZ2M4eZ08xl5jbzmHnNfGZ+s4BZ0CxkFjaLmEXNYmZxs4RZ0ixlljbLmGXNcmZ5s4JZ0axkVjarmFXNamZ1s4ZZ06xl1jbrmHXNemZ9s4HZ0GxkNjabmE3NZmZzs4XZ0mxltjbbmG3NdmZ7s4PZ0exkdja7mF3NbmZ3s4fZ0+xl9jb7mH3NfmZ/c4A50BxkDjaHmEPNYeZwc4Q50hxljjbHmGPNceZ4c4I50ZxkTjanmFPNaeZ0c4Y505xlzjbnmHPNeeZ8c4G50FxkJpiLzSXmUnOZudxcYa40V1VZba4x15rrzPXmBnOjucncbG4xt5rbzO3mDnOnucvcbe4x95r7zP3mAfOgecg8bB4xj5rHzOPmCfOkeco8bZ4xz5rnzPPmBfOiecm8bF4xr5rXzOvmDfOmiZm4SZikSZm0yZisyZm8KZiiKZmyqZiqqZm6aZimaZm26Ziu6Zm+GZihGZnAhCYyY+Yt87Z5x7xr3jPvmw/Mh+Yj87H5xHxqPjOfmy/Ml+Yr87X5xnxrvjPfmx/Mj+Yn87P5xfxqfjO/mz/Mn+Yv87f5x/xrJpr/zCQzzkpmJbdSWP9ZKa1UVmor3kpjpbXSWemtDFZGK5OV2cpiZbWyWdmtHFZOK5eV28pj5bXyWfmtAlZBq5BV2CpiFbWKWcWtElZJq5RV2ipjlbXKWeWtClZFq5JV2apiVbWqWdWtGlZNq5ZV26pj1bXqWfWtBlZDq5HV2GpiNbWaWc2tFlZLq5XV2mpjtbXaWe2tDlZHq5PV2epidbW6Wd2tHlZPq5fV2+pj9bX6Wf2tAdZAa5A12BpiDbWGWcOtEdZIa5Q12hpjjbXGWeOtCdZEa5I12ZpiTbWmWdOtGdZMa5Y125pjzbXmWfOtBdZCa5GVYC22llhLrWXWcmuFtdJaZa221lhrrXXWemuDtdHaZG22tlhbrW3WdmuHtdPaZe229lh7rX3WfuuAddA6ZB22jlhHrWPWceuEddI6ZZ22zlhnrXPWeeuCddG6ZF22rlhXrWvWdeuGddPCLNwiLNKiLNpiLNbiLN4SLNGSLNlSLNXSLN0yLNOyLNtyLNfyLN8KrNCKLGBBC1kx65Z127pj3bXuWfetB9ZD65H12HpiPbWeWc+tF9ZL65X12npjvbXeWe+tD9ZH65P12fpifbW+Wd+tH9ZP65f12/pj/bUSrX9WkhVnJ7OT2yns/+yUdio7tR1vp7HT2uns9HYGO6Odyc5sZ7Gz2tns7HYOO6edy85t57Hz2vns/HYBu6BdyC5sF7GL2sXs4nYJu6Rdyi5tl7HL2uXs8nYFu6Jdya5sV7Gr2tXs6nYNu6Zdy65t17Hr2vXs+nYDu6HdyG5sN7Gb2s3s5nYLu6Xdym5tt7Hb2u3s9nYHu6Pdye5sd7G72t3s7nYPu6fdy+5t97H72v3s/vYAe6A9yB5sD7GH2sPs4fYIe6Q9yh5tj7HH2uPs8fYEe6I9yZ5sT7Gn2tPs6fYMe6Y9y55tz7Hn2vPs+fYCe6G9yE6wF9tL7KX2Mnu5vcJeaa+yV9tr7LX2Onu9vcHeaG+yN9tb7K32Nnu7vcPeae+yd9t77L32Pnu/fcA+aB+yD9tH7KP2Mfu4fcI+aZ+yT
9tn7LP2Ofu8fcG+aF+yL9tX7Kv2Nfu6fcO+aWM2bhM2aVM2bTM2a3M2bwu2aEu2bCu2amu2bhu2aVu2bTu2a3u2bwd2aEc2sKGN7Jh9y75t37Hv2vfs+/YD+6H9yH5sP7Gf2s/s5/YL+6X9yn5tv7Hf2u/s9/YH+6P9yf5sf7G/2t/s7/YP+6f9y/5t/7H/2on2PzvJjnOSOcmdFM5/TkonlZPaiXfSOGmddE56J4OT0cnkZHayOFmdbE52J4eT08nl5HbyOHmdfE5+p4BT0CnkFHaKOEWdYk5xp4RT0inllHbKOGWdck55p4JT0ankVHaqOFWdak51p4ZT06nl1HbqOHWdek59p4HT0GnkNHaaOE2dZk5zp4XT0mnltHbaOG2ddk57p4PT0enkdHa6OF2dbk53p4fT0+nl9Hb6OH2dfk5/Z4Az0BnkDHaGOEOdYc5wZ4Qz0hnljHbGOGOdcc54Z4Iz0ZnkTHamOFOdac50Z4Yz05nlzHbmOHOdec58Z4Gz0FnkJDiLnSXOUmeZs9xZ4ax0VjmrnTXOWmeds97Z4Gx0NjmbnS3OVmebs93Z4ex0djm7nT3OXmefs9854Bx0DjmHnSPOUeeYc9w54Zx0TjmnnTPOWeecc9654Fx0LjmXnSvOVeeac9254dx0MAd3CId0KId2GId1OId3BEd0JEd2FEd1NEd3DMd0LMd2HMd1PMd3Aid0Igc40EFOzLnl3HbuOHede85954Hz0HnkPHaeOE+dZ85z54Xz0nnlvHbeOG+dd85754Pz0fnkfHa+OF+db85354fz0/nl/Hb+OH+dROefk+TEucnc5G4K9z83pZvKTe3Gu2nctG46N72bwc3oZnIzu1ncrG42N7ubw83p5nJzu3ncvG4+N79bwC3oFnILu0Xcom4xt7hbwi3plnJLu2Xcsm45t7xbwa3oVnIru1Xcqm41t7pbw63p1nJru3Xcum49t77bwG3oNnIbu03cpm4zt7nbwm3ptnJbu23ctm47t73bwe3odnI7u13crm43t7vbw+3p9nJ7u33cvm4/t787wB3oDnIHu0Pcoe4wd7g7wh3pjnJHu2Pcse44d7w7wZ3oTnInu1Pcqe40d7o7w53pznJnu3Pcue48d767wF3oLnIT3MXuEnepu8xd7q5wV7qr3NXuGnetu85d725wN7qb3M3uFneru83d7u5wd7q73N3uHnevu8/d7x5wD7qH3MPuEfeoe8w97p5wT7qn3NPuGfese849715wL7qX3MvuFfeqe8297t5wb7qYi7uES7qUS7uMy7qcy7uCK7qSK7uKq7qaq7uGa7qWa7uO67qe67uBG7qRC1zoIjfm3nJvu3fcu+499777wH3oPnIfu0/cp+4z97n7wn3pvnJfu2/ct+479737wf3ofnI/u1/cr+4397v7w/3p/nJ/u3/cv26i+89NcuO8ZF5yL4X3n5fSS+Wl9uK9NF5aL52X3svgZfQyeZm9LF5WL5uX3cvh5fRyebm9PF5eL5+X3yvgFfQKeYW9Il5Rr5hX3CvhlfRKeaW9Ml5Zr5xX3qvgVfQqeZW9Kl5Vr5pX3avh1fRqebW9Ol5dr55X32vgNfQaeY29Jl5Tr5nX3GvhtfRaea29Nl5br53X3uvgdfQ6eZ29Ll5Xr5vX3evh9fR6eb29Pl5fr5/X3xvgDfQGeYO9Id5Qb5g33BvhjfRGeaO9Md5Yb5w33pvgTfQmeZO9Kd5Ub5o33ZvhzfRmebO9Od5cb54331vgLfQWeQneYm+Jt9Rb5i33VngrvVXeam+Nt9Zb5633NngbvU3eZm+Lt9Xb5m33dng7vV3ebm+Pt9fb5+33DngHvUPeYe+Id9Q75h33TngnvVPeae+Md9Y75533LngXvUveZe+Kd9W75l33bng3PczDPcIjPcqjPcZjPc7jPcETPcmTPcVTPc3TPcMzPcuzPcdzPc/zvcALvcgDHvSQF/Nuebe9O95d755333vgPfQeeY+9J95T7
5n33HvhvfReea+9N95b75333vvgffQ+eZ+9L95X75v33fvh/fR+eb+9P95fL9H75yV5cX4yP7mfwv/PT+mn8lP78X4aP62fzk/vZ/Az+pn8zH4WP6ufzc/u5/Bz+rn83H4eP6+fz8/vF/AL+oX8wn4Rv6hfzC/ul/BL+qX80n4Zv6xfzi/vV/Ar+pX8yn4Vv6pfza/u1/Br+rX82n4dv65fz6/vN/Ab+o38xn4Tv6nfzG/ut/Bb+q381n4bv63fzm/vd/A7+p38zn4Xv6vfze/u9/B7+r383n4fv6/fz+/vD/AH+oP8wf4Qf6g/zB/uj/BH+qP80f4Yf6w/zh/vT/An+pP8yf4Uf6o/zZ/uz/Bn+rP82f4cf64/z5/vL/AX+ov8BH+xv8Rf6i/zl/sr/JX+Kn+1v8Zf66/z1/sb/I3+Jn+zv8Xf6m/zt/s7/J3+Ln+3v8ff6+/z9/sH/IP+If+wf8Q/6h/zj/sn/JP+Kf+0f8Y/65/zz/sX/Iv+Jf+yf8W/6l/zr/s3/Js+5uM+4ZM+5dM+47M+5/O+4Iu+5Mu+4qu+5uu+4Zu+5du+47u+5/t+4Id+5AMf+siP+bf82/4d/65/z7/vP/Af+o/8x/4T/6n/zH/uv/Bf+q/81/4b/63/zn/vf/A/+p/8z/4X/6v/zf/u//B/+r/83/4f/6+f6P/zk/y4IFmQPEgR/BekDFIFqYP4IE2QNkgXpA8yBBmDTEHmIEuQNcgWZA9yBDmDXEHuIE+QN8gX5A8KBAWDQkHhoEhQNCgWFA9KBCWDUkHpoExQNigXlA8qBBWDSkHloEpQNagWVA9qBDWDWkHtoE5QN6gX1A8aBA2DRkHjoEnQNGgWNA9aBC2DVkHroE3QNmgXtA86BB2DTkHnoEvQNegWdA96BD2DXkHvoE/QN+gX9A8GBAODQcHgYEgwNBgWDA9GBCODUcHoYEwwNhgXjA8mBBODScHkYEowNZgWTA9mBDODWcHsYE4wN5gXzA8WBAuDRUFCsDhYEiwNlgXLgxXBymBVsDpYE6wN1gXrgw3BxmBTsDnYEmwNtgXbgx3BzmBXsDvYE+wN9gX7gwPBweBQcDg4EhwNjgXHgxPByeBUcDo4E5wNzgXngwvBxeBScDm4ElwNrgXXgxvBzQAL8IAIyIAK6IAJ2IAL+EAIxEAK5EAJ1EAL9MAIzMAK7MAJ3MAL/CAIwiAKQAADFMSCW8Ht4E5wN7gX3A8eBA+DR8Hj4EnwNHgWPA9eBC+DV8Hr4E3wNngXvA8+BB+DT8Hn4EvwNfgWfA9+BD+DX8Hv4E/wN0gM/gVJQVyYLEwepgj/C1OGqcLUYXyYJkwbpgvThxnCjGGmMHOYJcwaZguzhznCnGGuMHeYJ8wb5gvzhwXCgmGhsHBYJCwaFguLhyXCkmGpsHRYJiwblgvLhxXCimGlsHJYJawaVgurhzXCmmGtsHZYJ6wb1gvrhw3ChmGjsHHYJGwaNgubhy3ClmGrsHXYJmwbtgvbhx3CjmGnsHPYJewadgu7hz3CnmGvsHfYJ+wb9gv7hwPCgeGgcHA4JBwaDguHhyPCkeGocHQ4JhwbjgvHhxPCieGkcHI4JZwaTgunhzPCmeGscHY4J5wbzgvnhwvCheGiMCFcHC4Jl4bLwuXhinBluCpcHa4J14brwvXhhnBjuCncHG4Jt4bbwu3hjnBnuCvcHe4J94b7wv3hgfBgeCg8HB4Jj4bHwuPhifBkeCo8HZ4Jz4bnwvPhhfBieCm8HF4Jr4bXwuvhjfBmiIV4SIRkSIV0yIRsyIV8KIRiKIVyqIRqqIV6aIRmaIV26IRu6IV+GIRhGIUghCEKY+Gt8HZ4J7wb3gvvhw/Ch+Gj8HH4JHwaPgufhy/Cl+Gr8HX4Jnwbvgvfhx/Cj+Gn8HP4Jfwafgu/hz/Cn+Gv8Hf4J/wbJob/wqQwLkoWJY9SRP9FKaNUUeooPkoTpY3SRemjDFHGKFOUOcoSZY2yRdmjHFHOKFeUO8oT5
Y3yRfmjAlHBqFBUOCoSFY2KRcWjElHJqFRUOioTlY3KReWjClHFqFJUOaoSVY2qRdWjGlHNqFZUO6oT1Y3qRfWjBlHDqFHUOGoSNY2aRc2jFlHLqFXUOmoTtY3aRe2jDlHHqFPUOeoSdY26Rd2jHlHPqFfUO+oT9Y36Rf2jAdHAaFA0OBoSDY2GRcOjEdHIaFQ0OhoTjY3GReOjCdHEaFI0OZoSTY2mRdOjGdHMaFY0O5oTzY3mRfOjBdHCaFGUEC2OlkRLo2XR8mhFtDJaFa2O1kRro3XR+mhDtDHaFG2OtkRbo23R9mhHtDPaFe2O9kR7o33R/uhAdDA6FB2OjkRHo2PR8ehEdDI6FZ2OzkRno3PR+ehCdDG6FF2OrkRXo2vR9ehGdDPCIjwiIjKiIjpiIjbiIj4SIjGSIjlSIjXSIj0yIjOyIjtyIjfyIj8KojCKIhDBCEWx6FZ0O7oT3Y3uRfejB9HD6FH0OHoSPY2eRc+jF9HL6FX0OnoTvY3eRe+jD9HH6FP0OfoSfY2+Rd+jH9HP6Ff0O/oT/Y0So39RUhQHkoHkIAX4D6QEqUBqEA/SgLQgHUgPMoCMIBPIDLKArCAbyA5ygJwgF8gN8oC8IB/IDwqAgqAQKAyKgKKgGCgOSoCSoBQoDcqAsqAcKA8qgIqgEqgMqoCqoBqoDmqAmqAWqA3qgLqgHqgPGoCGoBFoDJqApqAZaA5agJagFWgN2oC2oB1oDzqAjqAT6Ay6gK6gG+gOeoCeoBfoDfqAvqAf6A8GgIFgEBgMhoChYBgYDkaAkWAUGA3GgLFgHBgPJoCJYBKYDKaAqWAamA5mgJlgFpgN5oC5YB6YDxaAhWARSACLwRKwFCwDy8EKsBKsAqvBGrAWrAPrwQawEWwCm8EWsBVsA9vBDrAT7AK7wR6wF+wD+8EBcBAcAofBEXAUHAPHwQlwEpwCp8EZcBacA+fBBXARXAKXwRVwFVwD18ENcBNgAAcEIAEFaMAAFnCABwIQgQRkoAAVaEAHBjCBBWzgABd4wAcBCEEEAIAAgRi4BW6DO+AuuAfugwfgIXgEHoMn4Cl4Bp6DF+AleAVegzfgLXgH3oMP4CP4BD6DL+Ar+Aa+gx/gJ/gFfoM/4C9IBP9AEoiDyWBymAL+B1PCVDA1jIdpYFqYDqaHGWBGmAlmhllgVpgNZoc5YE6YC+aGeWBemA/mhwVgQVgIFoZFYFFYDBaHJWBJWAqWhmVgWVgOlocVYEVYCVaGVWBVWA1WhzVgTVgL1oZ1YF1YD9aHDWBD2Ag2hk1gU9gMNoctYEvYCraGbWBb2A62hx1gR9gJdoZdYFfYDXaHPWBP2Av2hn1gX9gP9ocD4EA4CA6GQ+BQOAwOhyPgSDgKjoZj4Fg4Do6HE+BEOAlOhlPgVDgNTocz4Ew4C86Gc+BcOA/OhwvgQrgIJsDFcAlcCpfB5XAFXAlXwdVwDVwL18H1cAPcCDfBzXAL3Aq3we1wB9wJd8HdcA/cC/fB/fAAPAgPwcPwCDwKj8Hj8AQ8CU/B0/AMPAvPwfPwArwIL8HL8Aq8Cq/B6/AGvAkxiEMCkpCCNGQgCznIQwGKUIIyVKAKNahDA5rQgjZ0oAs96MMAhjCCAEKIYAzegrfhHXgX3oP34QP4ED6Cj+ET+BQ+g8/hC/gSvoKv4Rv4Fr6D7+EH+BF+gp/hF/gVfoPf4Q/4E/6Cv+Ef+Bcmwn8wCcahZCg5SoH+QylRKpQaxaM0KC1Kh9KjDCgjyoQyoywoK8qGsqMcKCfKhXKjPCgvyofyowKoICqECqMiqCgqhoqjEqgkKoVKozKoLCqHyqMKqCKqhCqjKqgqqoaqoxqoJqqFaqM6qC6qh+qjBqghaoQaoyaoKWqGmqMWqCVqhVqjNqgtaofaow6oI+qEOqMuqCvqhrqjHqgn6oV6oz6oL+qH+qMBaCAahAajIWgoGoaGoxFoJBqFRqMxaCwah8ajCWgimoQmoyloKpqGpqMZaCaahWajO
WgumofmowVoIVqEEtBitAQtRcvQcrQCrUSr0Gq0Bq1F69B6tAFtRJvQZrQFbUXb0Ha0A+1Eu9ButAftRfvQfnQAHUSH0GF0BB1Fx9BxdAKdRKfQaXQGnUXn0Hl0AV1El9BldAVdRdfQdXQD3UQYwhGBSEQhGjGIRRzikYBEJCEZKUhFGtKRgUxkIRs5yEUe8lGAQhQhgCBCKIZuodvoDrqL7qH76AF6iB6hx+gJeoqeoefoBXqJXqHX6A16i96h9+gD+og+oc/oC/qKvqHv6Af6iX6h3+gP+osS0T+UhOJiyWLJYyli/8VSxlLFUsfiY2liaWPpYuljGWIZY5limWNZYllj2WLZYzliOWO5YrljeWJ5Y/li+WMFYgVjhWKFY0ViRWPFYsVjJWIlY6VipWNlYmVj5WLlYxViFWOVYpVjVWJVY9Vi1WM1YjVjtWK1Y3VidWP1YvVjDWINY41ijWNNYk1jzWLNYy1iLWOtYq1jbWJtY+1i7WMdYh1jnWKdY11iXWPdYt1jPWI9Y71ivWN9/kfTPTXm1TQKAE3a1LZt27ZtM7XtprZtu29t2+a2bczseepzc76/sW4W1hfrh/XHBmADsUHYYGwINhQbhg3HErER2EhsFDYaG4ONxcZh47EJ2ERsEjYZm4JNxaZh07EZ2ExsFjYbm4PNxeZh87EF2EIsCVuELcaWYEuxZdhybAW2EluFrcbWYGuxddh6bAO2EduEbca2YFuxbdh2bAe2E9uF7cb2YHuxfdh+7AB2EDuEHcaOYEexY9hx7AR2EjuFncbOYGex/7Bz2HnsAnYRu4Rdxq5gV7Fr2HXsBnYTu4Xdxu5gd7F72H3sAfYQe4Q9xp5gT7Fn2HPsBfYSe4W9xt5gb7F32HvsA/YR+4R9xr5gX7Fv2HcMw3CMwEiMwmiMwViMw3hMwERMwmRMwVRMw3TMwEzMwmzMwVzMw3wswEIMYBCLMITFsB/YT+wX9hv7g/3F/mFxeDyeDE+OJ+Ap8JR4Kjw1ngZPi6fD0+MZ8Ix4JjwzngXPimfDs+M58Jx4Ljw3ngfPi+fD8+MF8IJ4IbwwXgQvihfDi+Ml8JJ4Kbw0XgYvi5fDy+MV8Ip4JbwyXgWvilfDq+M18Jp4Lbw2Xgevi9fD6+MN8IZ4I7wx3gRvijfDm+Mt8JZ4K7w13gZvi7fD2+Md8I54J7wz3gXvinfDu+M98J54L7w33gfvi/fD++MD8IH4IHwwPgQfig/Dh+OJ+Ah8JD4KH42Pwcfi4/Dx+AR8Ij4Jn4xPwafi0/Dp+Ax8Jj4Ln43Pwefi8/D5+AJ8IZ6EL8IX40vwpfgyfDm+Al+Jr8JX42vwtfg6fD2+Ad+Ib8I341vwrfg2fDu+A9+J78J343vwvfg+fD9+AD+IH8IP40fwo/gx/Dh+Ao9Ldgo/jZ/Bz+L/4efw8/gF/CJ+Cb+MX8Gv4tfw6/gN/CZ+C7+N38Hv4vfw+/gD/CH+CH+MP8Gf4s/w5/gL/CX+Cn+Nv8Hf4u/w9/gH/CP+Cf+Mf8G/4t/w7ziG4ziBkziFx+MMzuIczuMCLuISLuMKruIaruMGbuIWbuMO7uIe7uMBHuIAh3iEIzyG/8B/4r/w3/gf/C/+D48j4olkRHIigUhBpCRSEamJNERaIh2RnshAZCQyEZmJLERWIhuRnchB5CRyEbmJPEReIh+RnyhAFCQKEYWJIkRRohhRnChBlCRKEaWJMkRZohxRnqhAVCQqEZWJKkRVohpRnahB1CRqEbWJOkRdoh5Rn2hANCQaEY2JJkRTohnRnGhBtCRaEa2JNkRboh3RnuhAdCQ6EZ2JLkRXohvRnehB9CR6Eb2JPkRfoh/RnxhADCQGEYOJIcRQYhgxnEgkRhAjiVHEaGIMMZYYR4wnJhATiUnEZGIKMZWYRkwnZhAziVnEbGIOMZeYR8wnFhALiSRiEbGYWEIsJZYRy4kVxEpiFbGaWEOsJdYR64kNxEZiE7GZ2EJsJbYR24kdxE5iF7Gb2
EPsJfYR+4kDxEHiEHGYOEIcJY4Rx4kTxEniFHGaOEOcJf4jzhHniQvEReIScZm4QlwlrhHXiRvETeIWcZu4Q9wl7hH3iQfEQ+IR8Zh4QjwlnhHPiRfES+IV8Zp4Q7wl3hHviQ/ER+IT8Zn4QnwlvhHfCYzACYIgCYqgCYZgCY7gCYEQCYmQCYVQCY3QCYMwCYuwCYdwCY/wiYAICUBAIiIQESN+ED+JX8Rv4g/xl/hHxJHxZDIyOZlApiBTkqnI1GQaMi2ZjkxPZiAzkpnIzGQWMiuZjcxO5iBzkrnI3GQeMi+Zj8xPFiALkoXIwmQRsihZjCxOliBLkqXI0mQZsixZjixPViArkpXIymQVsipZjaxO1iBrkrXI2mQdsi5Zj6xPNiAbko3IxmQTsinZjGxOtiBbkq3I1mQbsi3ZjmxPdiA7kp3IzmQXsivZjexO9iB7kr3I3mQfsi/Zj+xPDiAHkoPIweQQcig5jBxOJpIjyJHkKHI0OYYcS44jx5MTyInkJHIyOYWcSk4jp5MzyJnkLHI2OYecS84j55MLyIVkErmIXEwuIZeSy8jl5ApyJbmKXE2uIdeS68j15AZyI7mJ3ExuIbeS28jt5A5yJ7mL3E3uIfeS+8j95AHyIHmIPEweIY+Sx8jj5AnyJHmKPE2eIc+S/5HnyPPkBfIieYm8TF4hr5LXyOvkDfImeYu8Td4h75L3yPvkA/Ih+Yh8TD4hn5LPyOfkC/Il+Yp8Tb4h35LvyPfkB/Ij+Yn8TH4hv5LfyO8kRuIkQZIkRdIkQ7IkR/KkQIqkRMqkQqqkRuqkQZqkRdqkQ7qkR/pkQIYkICEZkYiMkT/In+Qv8jf5h/xL/iPjqHgqGZWcSqBSUCmpVFRqKg2VlkpHpacyUBmpTFRmKguVlcpGZadyUDmpXFRuKg+Vl8pH5acKUAWpQlRhqghVlCpGFadKUCWpUlRpqgxVlipHlacqUBWpSlRlqgpVlapGVadqUDWpWlRtqg5Vl6pH1acaUA2pRlRjqgnVlGpGNadaUC2pVlRrqg3VlmpHtac6UB2pTlRnqgvVlepGdad6UD2pXlRvqg/Vl+pH9acGUAOpQdRgagg1lBpGDacSqRHUSGoUNZoaQ42lxlHjqQnURGoSNZmaQk2lplHTqRnUTGoWNZuaQ82l5lHzqQXUQiqJWkQtppZQS6ll1HJqBbWSWkWtptZQa6l11HpqA7WR2kRtprZQW6lt1HZqB7WT2kXtpvZQe6l91H7qAHWQOkQdpo5QR6lj1HHqBHWSOkWdps5QZ6n/qHPUeeoCdZG6RF2mrlBXqWvUdeoGdZO6Rd2m7lB3qXvUfeoB9ZB6RD2mnlBPqWfUc+oF9ZJ6Rb2m3lBvqXfUe+oD9ZH6RH2mvlBfqW/UdwqjcIqgSIqiaIqhWIqjeEqgREqiZEqhVEqjdMqgTMqibMqhXMqjfCqgQgpQkIooRMWoH9RP6hf1m/pD/aX+UXF0PJ2MTk4n0CnolHQqOjWdhk5Lp6PT0xnojHQmOjOdhc5KZ6Oz0znonHQuOjedh85L56Pz0wXognQhujBdhC5KF6OL0yXoknQpujRdhi5Ll6PL0xXoinQlujJdha5KV6Or0zXomnQtujZdh65L16Pr0w3ohnQjujHdhG5KN6Ob0y3olnQrujXdhm5Lt6Pb0x3ojnQnujPdhe5Kd6O70z3onnQvujfdh+5L96P70wPogfQgejA9hB5KD6OH04n0CHokPYoeTY+hx9Lj6PH0BHoiPYmeTE+hp9LT6On0DHomPYueTc+h59Lz6Pn0AnohnUQvohfTS+il9DJ6Ob2CXkmvolfTa+i19Dp6Pb2B3khvojfTW+it9DZ6O72D3knvonfTe+i99D56P32APkgfog/TR+ij9DH6OH2CPkmfok/TZ+iz9H/0Ofo8fYG+SF+iL9NX6Kv0Nfo6fYO+Sd+ib9N36Lv0Pfo+/YB+SD+iH9NP6Kf0M/o5/YJ+Sb+iX9Nv6Lf0O/o9/
YH+SH+iP9Nf6K/0N/o7jdE4TdAkTdE0zdAszdE8LdAiLdEyrdAqrdE6bdAmbdE27dAu7dE+HdAhDWhIRzSiY/QP+if9i/5N/6H/0v/oOCaeScYkZxKYFExKJhWTmknDpGXSMemZDExGJhOTmcnCZGWyMdmZHExOJheTm8nD5GXyMfmZAkxBphBTmCnCFGWKMcWZEkxJphRTminDlGXKMeWZCkxFphJTmanCVGWqMdWZGkxNphZTm6nD1GXqMfWZBkxDphHTmGnCNGWaMc2ZFkxLphXTmmnDtGXaMe2ZDkxHphPTmenCdGW6Md2ZHkxPphfTm+nD9GX6Mf2ZAcxAZhAzmBnCDGWGMcOZRGYEM5IZxYxmxjBjmXHMeGYCM5GZxExmpjBTmWnMdGYGM5OZxcxm5jBzmXnMfGYBs5BJYhYxi5klzFJmGbOcWcGsZFYxq5k1zFpmHbOe2cBsZDYxm5ktzFZmG7Od2cHsZHYxu5k9zF5mH7OfOcAcZA4xh5kjzFHmGHOcOcGcZE4xp5kzzFnmP+Ycc565wFxkLjGXmSvMVeYac525wdxkbjG3mTvMXeYec595wDxkHjGPmSfMU+YZ85x5wbxkXjGvmTfMWyaOec98YD4yn5jPzBfmK/ON+c5gDM4QDMlQDM0wDMtwDM8IjMhIjMwojMpojM4YjMlYjM04jMt4jM8ETMgABjIRg5gY84P5yfxifjN/mL/MPyaOjWeTscnZBDYFm5JNxaZm07Bp2XRsejYDm5HNxGZms7BZ2WxsdjYHm5PNxeZm87B52XxsfrYAW5AtxBZmi7BF2WJscbYEW5ItxZZmy7Bl2XJsebYCW5GtxFZmq7BV2WpsdbYGW5OtxdZm67B12XpsfbYB25BtxDZmm7BN2WZsc7YF25JtxbZm27Bt2XZse7YD25HtxHZmu7Bd2W5sd7YH25PtxfZm+7B92X5sf3YAO5AdxA5mh7BD2WHscDaRHcGOZEexo9kx7Fh2HDuencBOZCexk9kp7FR2GjudncHOZGexs9k57Fx2HjufXcAuZJPYRexidgm7lF3GLmdXsCvZVexqdg27ll3Hrmc3sBvZTexmdgu7ld3Gbmd3sDvZXexudg+7l93H7mcPsAfZQ+xh9gh7lD3GHmdPsCfZU+xp9gx7lv2PPceeZy+wF9lL7GX2CnuVvcZeZ2+wN9lb7G32DnuXvcfeZx+wD9lH7GP2CfuUfcY+Z1+wL9lX7Gv2DfuWfce+Zz+wH9lP7Gf2C/uV/cZ+ZzEWZwmWZCmWZhmWZTmWZwVWZCVWZhVWZTVWZw3WZC3WZh3WZT3WZwM2ZAEL2YhFbIz9wf5kf7G/2T/sX/YfG8fFc8m45FwCl4JLyaXiUnNpuLRcOi49l4HLyGXiMnNZuKxcNi47l4PLyeXicnN5uLxcPi4/V4AryBXiCnNFuKJcMa44V4IryZXiSnNluLJcOa48V4GryFXiKnNVuKpcNa46V4OrydXianN1uLpcPa4+14BryDXiGnNNuKZcM64514JrybXiWnNtuLZcO64914HryHXiOnNduK5cN64714PryfXienN9uL5cP64/N4AbyA3iBnNDuKHcMG44l8iN4EZyo7jR3BhuLDeOG89N4CZyk7jJ3BRuKjeNm87N4GZys7jZ3BxuLjePm88t4BZySdwibjG3hFvKLeOWcyu4ldwqbjW3hlvLrePWcxu4jdwmbjO3hdvKbeO2czu4ndwubje3h9vL7eP2cwe4g9wh7jB3hDvKHeOOcye4k9wp7jR3hjvL/ced485zF7iL3CXuMneFu8pd465zN7ib3C3uNneHu8vd4+5zD7iH3CPuMfeEe8o9455zL7iX3CvuNfeGe8u9495zH7iP3CfuM/eF+8p9475zGIdzBEdyFEdzDMdyHMdzAidyEidzCqdyGqdzBmdyFmdzDudyHudzARdygINcxCEuxv3gfnK/uN/cH+4v94+L4+P5ZHxyPoFPwafkU/Gp+TR8Wj4dn
57PwGfkM/GZ+Sx8Vj4bn53Pwefkc/G5+Tx8Xj4fn58vwBfkC/GF+SJ8Ub4YX5wvwZfkS/Gl+TJ8Wb4cX56vwFfkK/GV+Sp8Vb4aX52vwdfka/G1+Tp8Xb4eX59vwDfkG/GN+SZ8U74Z35xvwbfkW/Gt+TZ8W74d357vwHfkO/Gd+S58V74b353vwffke/G9+T58X74f358fwA/kB/GD+SH8UH4YP5xP5EfwI/lR/Gh+DD+WH8eP5yfwE/lJ/GR+Cj+Vn8ZP52fwM/lZ/Gx+Dj+Xn8fP5xfwC/kkfhG/mF/CL+WX8cv5FfxKfhW/ml/Dr+XX8ev5DfxGfhO/md/Cb+W38dv5HfxOfhe/m9/D7+X38fv5A/xB/hB/mD/CH+WP8cf5E/xJ/hR/mj/Dn+X/48/x5/kL/EX+En+Zv8Jf5a/x1/kb/E3+Fn+bv8Pf5e/x9/kH/EP+Ef+Yf8I/5Z/xz/kX/Ev+Ff+af8O/5d/x7/kP/Ef+E/+Z/8J/5b/x33mMx3mCJ3mKp3mGZ3mO53mBF3mJl3mFV3mN13mDN3mLt3mHd3mP9/mAD3nAQz7iER/jf/A/+V/8b/4P/5f/x8cJ8UIyIbmQIKQQUgqphNRCGiGtkE5IL2QQMgqZhMxCFiGrkE3ILuQQcgq5hNxCHiGvkE/ILxQQCgqFhMJCEaGoUEwoLpQQSgqlhNJCGaGsUE4oL1QQKgqVhMpCFaGqUE2oLtQQagq1hNpCHaGuUE+oLzQQGgqNhMZCE6Gp0ExoLrQQWgqthNZCG6Gt0E5oL3QQOgqdhM5CF6Gr0E3oLvQQegq9hN5CH6Gv0E/oLwwQBgqDhMHCEGGoMEwYLiQKI4SRwihhtDBGGCuME8YLE4SJwiRhsjBFmCpME6YLM4SZwixhtjBHmCvME+YLC4SFQpKwSFgsLBGWCsuE5cIKYaWwSlgtrBHWCuuE9cIGYaOwSdgsbBG2CtuE7cIOYaewS9gt7BH2CvuE/cIB4aBwSDgsHBGOCseE48IJ4aRwSjgtnBHOCv8J54TzwgXhonBJuCxcEa4K14Trwg3hpnBLuC3cEe4K94T7wgPhofBIeCw8EZ4Kz4TnwgvhpfBKeC28Ed4K74T3wgfho/BJ+Cx8Eb4K34TvAibgAiGQAiXQAiOwAifwgiCIgiTIgiKogibogiGYgiXYgiO4gif4QiCEAhCgEAlIiAk/hJ/CL+G38Ef4K/wT4sR4MZmYXEwQU4gpxVRiajGNmFZMJ6YXM4gZxUxiZjGLmFXMJmYXc4g5xVxibjGPmFfMJ+YXC4gFxUJiYbGIWFQsJhYXS4glxVJiabGMWFYsJ5YXK4gVxUpiZbGKWFWsJlYXa4g1xVpibbGOWFesJ9YXG4gNxUZiY7GJ2FRsJjYXW4gtxVZia7GN2FZsJ7YXO4gdxU5iZ7GL2FXsJnYXe4g9xV5ib7GP2FfsJ/YXB4gDxUHiYHGIOFQcJg4XE8UR4khxlDhaHCOOFceJ48UJ4kRxkjhZnCJOFaeJ08UZ4kxxljhbnCPOFeeJ88UF4kIxSVwkLhaXiEvFZeJycYW4UlwlrhbXiGvFdeJ6cYO4Udwkbha3iFvFbeJ2cYe4U9wl7hb3iHvFfeJ+8YB4UDwkHhaPiEfFY+Jx8YR4UjwlnhbPiGfF/8Rz4nnxgnhRvCReFq+IV8Vr4nXxhnhTvCXeFu+Id8V74n3xgfhQfCQ+Fp+IT8Vn4nPxhfhSfCW+Ft+Ib8V34nvxg/hR/CR+Fr+IX8Vv4ncRE3GREEmREmmREVmRE3lREEVREmVREVVRE3XREE3REm3REV3RE30xEEMRiFCMRCTGxB/iTzExw2/xj/hX/CfGSfFSMim5lCClkFJKqaTUUhoprZROSi9lkDJKmaTMUhYpq5RNyi7lkHJKuaTcUh4pr5RPyi8VkApKhaTCUhGpqFRMKi6VkEpKpaTSUhmprFROKi9VkCpKlaTKUhWpqlRNqi7VkGpKtaTaUh2prlRPqi81kBpKjaTGU
hOpqdRMai61kFpKraTWUhuprdROai91kDpKnaTOUhepq9RN6i71kHpKvaTeUh+pr9RP6i8NkAZKg6TB0hBpqDRMGi4lSiOkkdIoabQ0RhorjZPGSxOkidIkabI0RZoqTZOmSzOkmdIsabY0R5orzZPmSwukhVKStEhaLC2RlkrLpOXSCmmltEpaLa2R1krrpPXSBmmjtEnaLG2RtkrbpO3SDmmntEvaLe2R9kr7pP3SAemgdEg6LB2RjkrHpOPSCemkdEo6LZ2Rzkr/Seek89IF6aJ0SbosXZGuStek69IN6aZ0S7ot3ZHuSvek+9ID6aH0SHosPZGeSs+k59IL6aX0SnotvZHeSu+k99IH6aP0SfosfZG+St+k7xIm4RIhkRIl0RIjsRIn8ZIgiZIkyZIiqZIm6ZIhmZIl2ZIjuZIn+VIghRKQoBRJSIpJP6Sf0i/pt/RH+iv9k+LkeDmZnFxOkFPIKeVUcmo5jZxWTienlzPIGeVMcmY5i5xVziZnl3PIOeVccm45j5xXzifnlwvIBeVCcmG5iFxULiYXl0vIJeVScmm5jFxWLieXlyvIFeVKcmW5ilxVriZXl2vINeVacm25jlxXrifXlxvIDeVGcmO5idxUbiY3l1vILeVWcmu5jdxWbie3lzvIHeVOcme5i9xV7iZ3l3vIPeVecm+5j9xX7if3lwfIA+VB8mB5iDxUHiYPlxPlEfJIeZQ8Wh4jj5XHyePlCfJEeZI8WZ4iT5WnydPlGfJMeZY8W54jz5XnyfPlBfJCOUleJC+Wl8hL5WXycnmFvFJeJa+W18hr5XXyenmDvFHeJG+Wt8hb5W3ydnmHvFPeJe+W98h75X3yfvmAfFA+JB+Wj8hH5WPycfmEfFI+JZ+Wz8hn5f/kc/J5+YJ8Ub4kX5avyFfla/J1+YZ8U74l35bvyHfle/J9+YH8UH4kP5afyE/lZ/Jz+YX8Un4lv5bfyG/ld/J7+YP8Uf4kf5a/yF/lb/J3GZNxmZBJmZJpmZFZmZN5WZBFWZJlWZFVWZN12ZBN2ZJt2ZFd2ZN9OZBDGchQjmQkx+Qf8k/5l/xb/iP/lf/JcUq8kkxJriQoKZSUSioltZJGSaukU9IrGZSMSiYls5JFyapkU7IrOZScSi4lt5JHyavkU/IrBZSCSiGlsFJEKaoUU4orJZSSSimltFJGKauUU8orFZSKSiWlslJFqapUU6orNZSaSi2ltlJHqavUU+orDZSGSiOlsdJEaao0U5orLZSWSiultdJGaau0U9orHZSOSiels9JF6ap0U7orPZSeSi+lt9JH6av0U/orA5SByiBlsDJEGaoMU4YricoIZaQyShmtjFHGKuOU8coEZaIySZmsTFGmKtOU6coMZaYyS5mtzFHmKvOU+coCZaGSpCxSFitLlKXKMmW5skJZqaxSVitrlLXKOmW9skHZqGxSNitblK3KNmW7skPZqexSdit7lL3KPmW/ckA5qBxSDitHlKPKMeW4ckI5qZxSTitnlLPKf8o55bxyQbmoXFIuK1eUq8o15bpyQ7mp3FJuK3eUu8o95b7yQHmoPFIeK0+Up8oz5bnyQnmpvFJeK2+Ut8o75b3yQfmofFI+K1+Ur8o35buCKbhCKKRCKbTCKKzCKbwiKKIiKbKiKKqiKbpiKKZiKbbiKK7iKb4SKKECFKhEClJiyg/lp/JL+a38Uf4q/5Q4NV5NpiZXE9QUako1lZpaTaOmVdOp6dUMakY1k5pZzaJmVbOp2dUcak41l5pbzaPmVfOp+dUCakG1kFpYLaIWVYupxdUSakm1lFpaLaOWVcup5dUKakW1klpZraJWVaup1dUaak21llpbraPWVeup9dUGakO1kdpYbaI2VZupzdUWaku1ldpabaO2Vdup7dUOake1k9pZ7aJ2Vbup3dUeak+1l9pb7aP2Vfup/dUB6kB1kDpYHaIOVYepw9VEdYQ6Uh2ljlbHqGPVcep4dYI6UZ2kTlanqFPVaep0d
YY6U52lzlbnqHPVeep8dYG6UE1SF6mL1SXqUnWZulxdoa5UV6mr1TXqWnWdul7doG5UN6mb1S3qVnWbul3doe5Ud6m71T3qXnWful89oB5UD6mH1SPqUfWYelw9oZ5UT6mn1TPqWfU/9Zx6Xr2gXlQvqZfVK+pV9Zp6Xb2h3lRvqbfVO+pd9Z56X32gPlQfqY/VJ+pT9Zn6XH2hvlRfqa/VN+pb9Z36Xv2gflQ/qZ/VL+pX9Zv6XcVUXCVUUqVUWmVUVuVUXhVUUZVUWVVUVdVUXTVUU7VUW3VUV/VUXw3UUAUqVCMVqTH1h/pT/aX+Vv+of9V/apwWryXTkmsJWgotpZZKS62l0dJq6bT0WgYto5ZJy6xl0bJq2bTsWg4tp5ZLy63l0fJq+bT8WgGtoFZIK6wV0YpqxbTiWgmtpFZKK62V0cpq5bTyWgWtolZJq6xV0apq1bTqWg2tplZLq63V0epq9bT6WgOtodZIa6w10ZpqzbTmWgutpdZKa6210dpq7bT2Wgeto9ZJ66x10bpq3bTuWg+tp9ZL66310fpq/bT+2gBtoDZIG6wN0YZqw7ThWqI2QhupjdJGa2O0sdo4bbw2QZuoTdIma1O0qdo0bbo2Q5upzdJma3O0udo8bb62QFuoJWmLtMXaEm2ptkxbrq3QVmqrtNXaGm2ttk5br23QNmqbtM3aFm2rtk3bru3Qdmq7tN3aHm2vtk/brx3QDmqHtMPaEe2odkw7rp3QTmqntNPaGe2s9p92TjuvXdAuape0y9oV7ap2Tbuu3dBuare029od7a52T7uvPdAeao+0x9oT7an2THuuvdBeaq+019ob7a32TnuvfdA+ap+0z9oX7av2TfuuYRquERqpURqtMRqrcRqvCZqoSZqsKZqqaZquGZqpWZqtOZqreZqvBVqoAQ1qkYa0mPZD+6n90n5rf7S/2j8tTo/Xk+nJ9QQ9hZ5ST6Wn1tPoafV0eno9g55Rz6Rn1rPoWfVsenY9h55Tz6Xn1vPoefV8en69gF5QL6QX1ovoRfVienG9hF5SL6WX1svoZfVyenm9gl5Rr6RX1qvoVfVqenW9hl5Tr6XX1uvodfV6en29gd5Qb6Q31pvoTfVmenO9hd5Sb6W31tvobfV2enu9g95R76R31rvoXfVuene9h95T76X31vvoffV+en99gD5QH6QP1ofoQ/Vh+nA9UR+hj9RH6aP1MfpYfZw+Xp+gT9Qn6ZP1KfpUfZo+XZ+hz9Rn6bP1OfpcfZ4+X1+gL9ST9EX6Yn2JvlRfpi/XV+gr9VX6an2NvlZfp6/XN+gb9U36Zn2LvlXfpm/Xd+g79V36bn2Pvlffp+/XD+gH9UP6Yf2IflQ/ph/XT+gn9VP6af2MflZPExcXd16/oF/UL+mX9Sv6Vf2afl2/od/Ub+m39Tv6Xf2efl9/oD/UH+mP/3fG6C/0l/or/bX+Rn+rv9Pf6x/0j/on/bP+Rf+qf9O/65iO64RO6pRO64zO6pzO64Iu6pIu64qu6pqu64Zu6pZu647u6p7u64Ee6kCHeqQjPab/0H/qv/Tf+h/9r/5PjzPijWRGciPBSGGkNFIZqY00RlojnZHeyGBkNDIZmY0sRlYjm5HdyGHkNHIZuY08Rl4jn5HfKGAUNAoZhY0iRlGjmFHcKGGUNEoZpY0yRlmjnFHeqGBUNCoZlY0qRlWjmlHdqGHUNGoZtY06Rl2jnlHfaGA0NBoZjY0mRlOjmdHcaGG0NFoZrY02RlujndHe6GB0NDoZnY0uRlejm9Hd6GH0NHoZvY0+Rl+jn9HfGGAMNAYZg40hxlBjmDHcSDRGGCONUcZoY4wx1hhnjDcmGBONScZkY4ox1ZhmTDdmGDONWcZsY44x15hnzDcWGAuNJGORsdhYYiw1lhnLjRXGSmOVsdpYY6w11hnrjQ3GRmOTsdnYYmw1thnbjR3GTmOXsdvYY+w19hn7jQPGQeOQcdg4Yhw1jhnHjRPGSeOUcdo4Y5w1/
jPOGeeNC8ZF45Jx2bhiXDWuGdeNG8ZN45Zx27hj3DXuGfeNB8ZD45Hx2HhiPDWeGc+NF8ZL45Xx2nhjvDXeGe+ND8ZH45Px2fhifDW+Gd8NzMANwiANyqANxmANzuANwRANyZANxVANzdANwzANy7ANx3ANz/CNwAgNYEAjMpARM34YP41fxm/jj/HX+GfEmfFmMjO5mWCmMFOaqczUZhozrZnOTG9mMDOamczMZhYzq5nNzG7mMHOauczcZh4zr5nPzG8WMAuahczCZhGzqFnMLG6WMEuapczSZhmzrFnOLG9WMCualczKZhWzqlnNrG7WMGuatczaZh2zrlnPrG82MBuajczGZhOzqdnMbG62MFuarczWZhuzrdnObG92MDuanczOZhezq9nN7G72MHuavczeZh+zr9nP7G8OMAeag8zB5hBzqDnMHG4mmiPMkeYoc7Q5xhxrjjPHmxPMieYkc7I5xZxqxsVNN2eYM81Z5mxzjjnXnGfONxeYC80kc5G52FxiLjWXmcvNFeZKc5W52lxjrjXXmevNDeZGc5O52dxibjW3mdvNHeZOc5e529xj7jX3mfvNA+ZB85B52DxiHjWPmcfNE+ZJ85R52jxjnjX/M8+Z580L5kXzknnZvGJeNa+Z180b5k3zlnnbvGPeNe+Z980H5kPzkfnYfGI+NZ+Zz80X5kvzlfnafGO+Nd+Z780P5kfzk/nZ/GJ+Nb+Z303MxE3CJE3KpE3GZE3O5E3BFE3JlE3FVE3N1E3DNE3LtE3HdE3P9M3ADE1gQjMykRkzf5g/zV/mb/OP+df8Z8ZZ8VYyK7mVYKWwUlqprNRWGiutlc5Kb2WwMlqZrMxWFiurlc3KbuWwclq5rNxWHiuvlc/KbxWwClqFrMJWEauoVcwqbpWwSlqlrNJWGausVc4qb1WwKlqVrMpWFauqVc2qbtWwalq1rNpWHauuVc+qbzWwGlqNrMZWE6up1cxqbrWwWlqtrNZWG6ut1c5qb3WwOlqdrM5WF6ur1c3qbvWwelq9rN5WH6uv1c/qbw2wBlqDrMHWEGuoNcwabiVaI6yR1ihrtDXGGmuNs8ZbE6yJ1iRrsjXFmmpNs6ZbM6yZ1ixrtjXHmmvNs+ZbC6yFVpK1yFpsLbGWWsus5dYKa6W1ylptrbHWWuus9dYGa6O1ydpsbbG2Wtus7dYOa6e1y9pt7bH2Wvus/dYB66B1yDpsHbGOWses49YJ66R1yjptnbHOWv9Z56zz1gXronXJumxdsa5a16zr1g3rpnXLum3dse5a96z71gProfXIemw9sZ5az6zn1gvrpfXKem29sd5a76z31gfro/XJ+mx9sb5a36zvFmbhFmGRFmXRFmOxFmfxlmCJlmTJlmKplmbplmGZlmXZlmO5lmf5VmCFFrCgFVnIilk/rJ/WL+u39cf6a/2z4ux4O5md3E6wU9gp7VR2ajuNndZOZ6e3M9gZ7Ux2ZjuLndXOZme3c9g57Vx2bjuPndfOZ+e3C9gF7UJ2YbuIXdQuZhe3S9gl7VJ2abuMXdYuZ5e3K9gV7Up2ZbuKXdWuZle3a9g17Vp2bbuOXdeuZ9e3G9gN7UZ2Y7uJ3dRuZje3W9gt7VZ2a7uN3dZuZ7e3O9gd7U52Z7uL3dXuZne3e9g97V52b7uP3dfuZ/e3B9gD7UH2YHuIPdQeZg+3E+0R9kh7lD3aHmOPtcfZ4+0J9kR7kj3ZnmJPtafZ0+0Z9kx7lj3bnmPPtefZ8+0F9kI7yV5kL7aX2EvtZfZye4W90l5lr7bX2GvtdfZ6e4O90d5kb7a32FvtbfZ2e4e9095l77b32HvtffZ++4B90D5kH7aP2EftY/Zx+4R90j5ln7bP2Gft/+xz9nn7gn3RvmRftq/YV+1r9nX7hn3TvmXftu/Yd+179n37gf3QfmQ/tp/YT+1n9nP7hf3SfmW/tt/Yb+139nv7g/3R/mR/tr/YX+1v9ncbs3GbsEmbsmmbsVmbs3lbsEVbs
mVbsVVbs3XbsE3bsm3bsV3bs307sEMb2NCObGTH7B/2T/uX/dv+Y/+1/9lxTryTzEnuJDgpnJROKie1k8ZJ66Rz0jsZnIxOJiezk8XJ6mRzsjs5nJxOLie3k8fJ6+Rz8jsFnIJOIaewU8Qp6hRzijslnJJOKae0U8Yp65RzyjsVnIpOJaeyU8Wp6lRzqjs1nJpOLae2U8ep69Rz6jsNnIZOI6ex08Rp6jRzmjstnJZOK6e108Zp67Rz2jsdnI5OJ6ez08Xp6nRzujs9nJ5OL6e308fp6/Rz+jsDnIHOIGewM8QZ6gxzhjuJzghnpDPKGe2MccY645zxzgRnojPJmexMcaY605zpzgxnpjPLme3MceY685z5zgJnoZPkLHIWO0ucpc4yZ7mzwlnprHJWO2uctc46Z72zwdnobHI2O1ucrc42Z7uzw9np7HJ2O3ucvc4+Z79zwDnoHHIOO0eco84x57hzwjnpnHJOO2ecs85/zjnnvHPBuehcci47V5yrzjXnunPDuenccm47d5y7zj3nvvPAeeg8ch47T5ynzjPnufPCeem8cl47b5y3zjvnvfPB+eh8cj47X5yvzjfnu4M5uEM4pEM5tMM4rMM5vCM4oiM5sqM4qqM5umM4pmM5tuM4ruM5vhM4oQMc6EQOcmLOD+en88v57fxx/jr/nDg33k3mJncT3BRuSjeVm9pN46Z107np3QxuRjeTm9nN4mZ1s7nZ3RxuTjeXm9vN4+Z187n53QJuQbeQW9gt4hZ1i7nF3RJuSbeUW9ot45Z1y7nl3QpuRbeSW9mt4lZ1q7nV3RpuTbeWW9ut49Z167n13QZuQ7eR29ht4jZ1m7nN3RZuS7eV29pt47Z127nt3Q5uR7eT29nt4nZ1u7nd3R5uT7eX29vt4/Z1+7n93QHuQHeQO9gd4g51h7nD3UR3hDvSHeWOdse4Y91x7nh3gjvRneROdqe4U91p7nR3hjvTneXOdue4c9157nx3gbvQTXIXuYvdJe5Sd5m73F3hrnRXuavdNe5ad5273t3gbnQ3uZvdLe5Wd5u73d3h7nR3ubvdPe5ed5+73z3gHnQPuYfdI+5R95h73D3hnnRPuafdM+5Z9z/3nHveveBedC+5l90r7lX3mnvdveHedG+5t9077l33nnvffeA+dB+5j90n7lP3mfvcfeG+dF+5r9037lv3nfve/eB+dD+5n90v7lf3m/vdxVzcJVzSpVzaZVzW5VzeFVzRlVzZVVzV1VzdNVzTtVzbdVzX9VzfDdzQBS50Ixe5MfeH+9P95f52/7h/3X9unBfvJfOSewleCi+ll8pL7aXx0nrpvPReBi+jl8nL7GXxsnrZvOxeDi+nl8vL7eXx8nr5vPxeAa+gV8gr7BXxinrFvOJeCa+kV8or7ZXxynrlvPJeBa+iV8mr7FXxqnrVvOpeDa+mV8ur7dXx6nr1vPpeA6+h18hr7DXxmnrNvOZeC6+l18pr7bXx2nrtvPZeB6+j18nr7HXxunrdvO5eD6+n18vr7fXx+nr9vP7eAG+gN8gb7A3xhnrDvOFeojfCG+mN8kZ7Y7yx3jhvvDfBm+hN8iZ7U7yp3jRvujfDm+nN8mZ7c7y53jxvvrfAW+gleYu8xd4Sb6m3zFvurfBWequ81d4ab623zlvvbfA2epu8zd4Wb6u3zdvu7fB2eru83d4eb6+3z9vvHfAOeoe8w94R76h3zDvunfBOeqe8094Z76z3n3fOO+9d8C56l7zL3hXvqnfNu+7d8G56t7zb3h3vrnfPu+898B56j7zH3hPvqffMe+698F56r7zX3hvvrffOe+998D56n7zP3hfvq/fN++5hHu4RHulRHu0xHutxHu8JnuhJnuwpnuppnu4ZnulZnu05nut5nu8FXugBD3qRh7yY98P76f3yfnt/vL/ePy/Oj/eT+cn9BD+Fn9JP5af20/hp/XR+ej+Dn9HP5Gf2s/hZ/Wx+dj+Hn9PP5ef28/h5/Xx+fr+AX
9Av5Bf2i/hF/WJ+cb+EX9Iv5Zf2y/hl/XJ+eb+CX9Gv5Ff2q/hV/Wp+db+GX9Ov5df26/h1/Xp+fb+B39Bv5Df2m/hN/WZ+c7+F39Jv5bf22/ht/XZ+e7+D39Hv5Hf2u/hd/W5+d7+H39Pv5ff2+/h9/X5+f3+AP9Af5A/2h/hD/WH+cD/RH+GP9Ef5o/0x/lh/nD/en+BP9Cf5k/0p/lR/mj/dn+HP9Gf5s/05/lx/nj/fX+Av9JP8Rf5if4m/1F/mL/dX+Cv9Vf5qf42/1l/nr/c3+Bv9Tf5mf4u/1d/mb/d3+Dv9Xf5uf4+/19/n7/cP+Af9Q/5h/4h/1D/mH/dP+Cf9U/5p/4x/1v/PP+ef9y/4F/1L/mX/in/Vv+Zf92/4N/1b/m3/jn/Xv+ff9x/4D/1H/mP/if/Uf+Y/91/4L/1X/mv/jf/Wf+e/9z/4H/1P/mf/i//V/+Z/9zEf9wmf9Cmf9hmf9Tmf9wVf9CVf9hVf9TVf9w3f9C3f9h3f9T3f9wM/9IEP/chHfsz/4f/0f/m//T/+X/+fHxfEB8mC5EFCkCJIGaQKUgdpgrRBuiB9kCHIGGQKMgdZgqxBtiB7kCPIGeQKcgd5grxBviB/UCAoGBQKCgdFgqJBsaB4UCIoGZQKSgdlgrJBuaB8UCGoGFQKKgdVgqpBtaB6UCOoGdQKagd1grpBvaB+0CBoGDQKGgdNgqZBs6B50CJoGbQKWgdtgrZBu6B90CHoGHQKOgddgq5Bt6B70CPoGfQKegd9gr5Bv6B/MCAYGAwKBgdDgqHBsGB4kBiMCEYGo4LRwZhgbDAuGB9MCCYGk4LJwZRgajAtmB7MCGYGs4LZwZxgbjAvmB8sCBYGScGiYHGwJFgaLAuWByuClcGqYHWwJlgbrAvWBxuCjcGmYHOwJdgabAu2BzuCncGuYHewJ9gb7Av2BweCg8Gh4HBwJDgaHAuOByeCk8Gp4HRwJjgb/BecC84HF4KLwaXgcnAluBpcC64HN4Kbwa3gdnAnuBvcC+4HD4KHwaPgcfAkeBo8C54HL4KXwavgdfAmeBu8C94HH4KPwafgc/Al+Bp8C74HWIAHREAGVEAHTMAGXMAHQiAGUiAHSqAGWqAHRmAGVmAHTuAGXuAHQRAGIIBBFKAgFvwIfga/gt/Bn+Bv8C+IC+PDZGHyMCFMEaYMU4WpwzRh2jBdmD7MEGYMM4WZwyxh1jBbmD3MEeYMc4W5wzxh3jBfmD8sEBYMC4WFwyJh0bBYWDwsEZYMS4WlwzJh2bBcWD6sEFYMK4WVwyph1bBaWD2sEdYMa4W1wzph3bBeWD9sEDYMG4WNwyZh07BZ2DxsEbYMW4WtwzZh27Bd2D7sEHYMO4Wdwy5h17Bb2D3sEfYMe4W9wz5h37Bf2D8cEA4MB4WDwyHh0HBYODxMDEeEI8NR4ehwTDg2HBeODyeEE8NJ4eRwSjg1nBZOD2eEM8NZ4exwTjg3nBfODxeEC8OkcFG4OFwSLg2XhcvDFeHKcFW4OlwTrg3XhevDDeHGcFO4OdwSbg23hdvDHeHOcFe4O9wT7g33hfvDA+HB8FB4ODwSHg2PhcfDE+HJ8FR4Okz4f847H14IL4aXwsvhlfBqeC28Ht4Ib4a3wtvhnfBueC+8Hz4IH4aPwsfhk/Bp+Cx8Hr4IX4avwtfhm/Bt+C58H34IP4afws/hl/Br+C38HmIhHhIhGVIhHTIhG3IhHwqhGEqhHCqhGmqhHhqhGVqhHTqhG3qhHwZhGIIQhlGIwlj4I/wZ/gp/h3/Cv+G/MA7Eg2QgOUgAKUBKkAqkBmlAWpAOpAcZQEaQCWQGWUBWkA1kBzlATpAL5AZ5QF6QD+QHBUBBUAgUBkVAUVAMFAclQElQCpQGZUBZUA6UBxVARVAJVAZVQFVQDVQHNUBNUAvUBnVAXVAP1AcNQEPQCDQGTUBT0Aw0By1AS9AKtAZtQFvQDrQHHUBH0Al0Bl1AV9ANdAc9QE/QC/QGfUBf0
A/0BwPAQDAIDAZDwFAwDAwHiWAEGAlGgdFgDBgLxoHxYAKYCCaByWAKmAqmgelgBpgJZoHZYA6YC+aB+WABWAiSwCKwGCwBS8EysBysACvBKrAarAFrwTqwHmwAG8EmsBlsAVvBNrAd7AA7wS6wG+wBe8E+sB8cAAfBIXAYHAFHwTFwHJwAJ8EpcBqcAWfBf+AcOA8ugIvgErgMroCr4Bq4Dm6Am+AWuA3ugLvgHrgPHoCH4BF4DJ6Ap+AZeA5egJfgFXgN3oC34B14Dz6Aj+AT+Ay+gK/gG/gOMIADApCAAjRgAAs4wAMBiEACMlCACjSgAwOYwAI2cIALPOCDAIQAAAgigEAM/AA/wS/wG/wBf8E/EAfjYTKYHCbAFDAlTAVTwzQwLUwH08MMMCPMBDPDLDArzAazwxwwJ8wFc8M8MC/MB/PDArAgLAQLwyKwKCwGi8MSsCQsBUvDMrAsLAfLwwqwIqwEK8MqsCqsBqvDGrAmrAVrwzqwLqwH68MGsCFsBBvDJrApbAabwxawJWwFW8M2sC1sB9vDDrAj7AQ7wy6wK+wGu8MesCfsBXvDPrAv7Af7wwFwIBwEB8MhcCgcBofDRDgCjoSj4Gg4Bo6F4+B4OAFOhJPgZDgFToXT4HQ4A86Es+BsOAfOhfPgfLgALoRJcBFcDJfApXAZXA5XwJVwFVwN18C1cB1cDzfAjXAT3Ay3wK1wG9wOd8CdcBfcDffAvXAf3A8PwIPwEDwMj8Cj8Bg8Dk/Ak/AUPA3PwLPwP3gOnocX4EV4CV6GV+BVeA1ehzfgTXgL3oZ34F14D96HD+BD+Ag+hk/gU/gMPocv4Ev4Cr6Gb+Bb+A6+hx/gR/gJfoZf4Ff4DX6HGMQhAUlIQRoykIUc5KEARShBGSpQhRrUoQFNaEEbOtCFHvRhAEMIIIQRRDAGf8Cf8Bf8Df/Av/AfjIvio2RR8ighShGljFJFqaM0UdooXZQ+yhBljDJFmaMsUdYoW5Q9yhHljHJFuaM8Ud4oX5Q/KhAVjApFhaMiUdGoWFQ8KhGVjEpFpaMyUdmoXFQ+qhBVjCpFlaMqUdWoWlQ9qhHVjGpFtaM6Ud2oXlQ/ahA1jBpFjaMmUdOoWdQ8ahG1jFpFraM2UdsoPi4uqUPUMeoUdY66RF2jblH3qEfUM+oV9Y76RH2jflH/aEA0MBoUDY6GREOjYdHwKDEaEY2MRkWjozHR2GhcND6aEE2MJkWToynR1GhaND2aEc2MZkWzoznR3GheND9aEC2MkqJF0eJoSbQ0WhYtj1ZEK6NV0epoTbQ2WhetjzZEG6NN0eZoS7Q12hZtj3ZEO6Nd0e5oT7Q32hftjw5EB6ND0eHoSHQ0OhYdj05EJ6NT0enoTHQ2+i86F52PLkQXo0vR5ehKdDW6Fl2PbkQ3o1vR7ehOdDe6F92PHkQPo0fR4+hJ9DR6Fj2PXkQvo1fR6+hN9DZ6F72PPkQfo0/R5+hL9DX6Fn2PsAiPiIiMqIiOmIiNuIiPhEiMpEiOlEiNtEiPjMiMrMiOnMiNvMiPgiiMQASjKEJRLPoR/Yx+Rb+jP9Hf6F8Uh+JRMpQcJaAUKCVKhVKjNCgtSofSowwoI8qEMqMsKCvKhrKjHCgnyoVyozwoL8qH8qMCqCAqhAqjIqgoKoaKoxKoJCqFSqMyqCwqh8qjCqgiqoQqoyqoKqqGqqMaqCaqhWqjOqguqofqowaoIWqEGqMmqClqhpqjFqglaoVaozaoLWqH2qMOqCPqhDqjLqgr6oa6ox6oJ+qFeqM+qC/qh/qjAWggGoQGoyFoKBqGhqNENAKNRKPQaDQGjUXj0Hg0AU1Ek9BkNAVNRdPQdDQDzUSz0Gw0B81F89B8tAAtREloEVqMlqClaBlajlaglWgVWo3WoLVoHVqPNqCNaBPajLagrWgb2o52oJ1oF9qN9qC9aB/ajw6gg+gQOoyOoKPoGDqOTqCT6BQ6jc6gs+g/dA6dRxfQRXQJXUZX0FV0DV1HN
9BNdAvdRnfQXXQP3UcP0EP0CD1GT9BT9Aw9Ry/QS/QKvUZv0Fv0Dr1HH9BH9Al9Rl/QV/QNfUcYwhGBSEQhGjGIRRzikYBEJCEZKUhFGtKRgUxkIRs5yEUe8lGAQgQQRBFCKIZ+oJ/oF/qN/qC/6B+Ki8XHksWSxxJiKWIpY6liqWNpYmlj6WLpYxliGeMTYpljWWJZY9li2WM5YjljuWK5Y3lieWP5YvljBWIFY4VihWNFYkVjxWLFG5eIlYyVipWOlYmVjZWLlY9ViFWMVYpVjlWJVY1Vi1WP1YjVjNWK1Y7VidWN1YvVjzWINYw1ijWONYk1jTWLNY+1iLWMtYq1jrWJtY21i7WPdYh1jHWKdY51iXWNdYt1j/WI9Yz1ivWO9Yn1jfWL9Y8NiA2MDYr9Hzt3AaxXde+N/4ngUqCUQo0DpS013L0CBQqUYhU0QIBAIJBgQYO7u7u7u7u7u7vbXv6fS0Nuacv/nd73Tnvvm89nJlnrWXs/a+1nfeecOb89+5w1ukHdmt1a3drd4G6dbt1uvW5It363QTe027DbqBvWbdxt0g3vRnSbdpt1m3dbdFt2I7utuq27bbptu+267btR3Q7djt1O3c7dLt2u3W7d7t0e3Z7dXt3e3T7dvt1+3f7dAd2B3UHdwd0h3aHdYd3h3RHdkd1R3dHdMd2x3XHd8d0J3YndSd3J3Sndqd1p3endGd2Z3Vnd2d053bnded353QXdhd1F3cXdJd2l3WXd5d0V3ZVtQHd1d013bXddd313Q3djd1N3c3dLd2t3W3d7d0d3Z3dXd3d3T3dvd193f/dA92D3UPdw90j3aPdY93j3RPdk91T3dPdM92z3XPd890L3YvdS93L3Svdq91r3evdG92b3Vvd29073bvde9373Qfdh91H3cfdJ92n3Wdd1oYtd6nJXutq1rhf6hf5hQBgYxgnjhvHC+GGCMGGYKEwcJgmThq+FycLkYYrw9TBl+EaYKnwzTB2mCd8K3w7fCd8N3wvThr4wXZg+fD/MEH4Qfhh+FGYMPw4/CT8NPws/DzOFmcMsYdYwW5g9zBHmDHOFucM8Yd4wX5g/LBAWDAuFhcMi4Rfhl+FX4ddh0bBY+E1YPCwRlgy/DUuFpcMy4Xdh2fD7sFxYPqwQVgwrhT+EP4Y/hT+HlcMqYdWwWlg9rBEGhTXDWmHtMDisE9YN64UhYf2wQRgaNgwbhWFh47BJGB5GhE3DZmHzsEXYMowMW4WtwzZh27Bd2D6MCjuEHcNOYeewS9g17BZ2D3uEPcNeYe+wT9g37Bf2DweEA8NB4eBwSDg0HBYOD0eEI8NR4ehwTDg2HBeODyeEE8NJ4eRwSjg1nBZOD2eEM8NZ4exwTjg3nBfODxeEC8NF4eJwSbg0XBYuD1eEK8NV4epwTbg2XBeuDzeEG8NN4eZwS7g13BZuD3eEO8Nd4e5wT7g33BfuDw+EB8ND4eHwSHg0PBYeD0+EJ8NT4enwTHg2PBeeDy+EF8NL4eXwSng1vBZeD2+EN8Nb4e3wTng3vBfeDx+ED8NH4ePwSfg0fBa6EEIMKeRQQg0t9GK/2D8OiAPjOHHcOF4cP04QJ4wTxYnjJHHS+LU4WZw8ThG/HqeM34hTxW/GqeM08Vvx2/E78bvxe3Ha2Beni9PH78cZ4g/iD+OP4ozxx/En8afxZ/HncaY4c5wlzhpni7PHOeKcca44d5wnzhvni/PHBeKCcaG4cFwk/iL+Mv4q/jouGheLv4mLxyXikvG3cam4dFwm/i4uG38fl4vLxxXiinGl+If4x/in+Oe4clwlrhpXi6vHNeKguGZcK64dB8d14rpxvTgkrh83iEPjhnGjOCxuHDeJw+OIuGncLG4et4hbxpFxq7h13CZuG7eL28dRcYe4Y9wp7hx3ibvG3eLucY+4Z9wr7h33ifvG/eL+8YB4YDwoHhwPiYfGw+Lh8Yh4ZDwqHh2PicfG4+Lx8YR4YjwpnhxPiafG0+Lp8Yx4Z
jwrnh3PiefG8+L58YJ4YbwoXhwviZfGy+Ll8Yp4ZbwqXh2vidfG6+L18YZ4Y7wp3hxvibfG2+Lt8Y54Z7wr3h3viffG++L98YH4YHwoPhwfiY/Gx+Lj8Yn4ZHwqPh2fic/G5+Lz8YX4Ynwpvhxfia/G1+Lr8Y34Znwrvh3fie/G9+L78YP4Yfwofhw/iZ/Gz2IXQ4wxxRxLrLHFXuqX+qcBaWAaJ42bxkvjpwnShGmiNHGaJE2avpYmS5OnKdLX05TpG2mq9M00dZomfSt9O30nfTd9L02b+tJ0afr0/TRD+kH6YfpRmjH9OP0k/TT9LP08zZRmTrOkWdNsafY0R5ozzZXmTvOkedN8af60QFowLZQWToukX6Rfpl+lX6dF02LpN2nxtERaMv02LZWWTsuk36Vl0+/Tcmn5tEJaMa2U/pD+mP6U/pxWTqukVdNqafW0RhqU1kxrpbXT4LROWjetl4ak9dMGaWjaMG2UhqWN0yZpeBqRNk2bpc3TFmnLNDJtlbZO26Rt03Zp+zQq7ZB2TDulndMuade0W9o97ZH2THulvdM+ad+0X9o/HZAOTAelg9Mh6dB0WDo8HZGOTEelo9Mx6dh0XDo+nZBOTCelk9Mp6dR0Wjo9nZHOTGels9M56dx0Xjo/XZAuTBeli9Ml6dJ0Wbo8XZGuTFelq9M16dp0Xbo+3ZBuTDelm9Mt6dZ0W7o93ZHuTHelu9M96d50X7o/PZAeTA+lh9Mj6dH0WHo8PZGeTE+lp9Mz6dn0XHo+vZBeTC+ll9Mr6dX0Wno9vZHeTG+lt9M76d30Xno/fZA+TB+lj9Mn6dP0WepSSDGllFNJNbXUy/1y/zwgD8zj5HHzeHn8PEGeME+UJ86T5Enz1/JkefI8Rf56njJ/I0+Vv5mnztPkb+Vv5+/k7+bv5WlzX54uT5+/n2fIP8g/zD/KM+Yf55/kn+af5Z/nmfLMeZY8a54tz57nyHPmufLceZ48b54vz58XyAvmhfLCeZH8i/zL/Kv867xoXiz/Ji+el8hL5t/mpfLSeZn8u7xs/n1eLi+fV8gr5pXyH/If85/yn/PKeZW8al4tr57XyIPymnmtvHYenNfJ6+b18pC8ft4gD80b5o3ysLxx3iQPzyPypnmzvHneIm+ZR+at8tZ5m7xt3i5vn0flHfKOeae8c94l75p3y7vnPfKeea+8d94n75v3y/vnA/KB+aB8cD4kH5oPy4fnI/KR+ah8dD4mH5uPy8fnE/KJ+aR8cj4ln5pPy6fnM/KZ+ax8dj4nn5vPy+fnC/KF+aJ8cb4kX5ovy5fnK/KV+ap8db4mX5uvy9fnG/KN+aZ8c74l35pvy7fnO/Kd+a58d74n35vvy/fnB/KD+aH8cH4kP5ofy4/nJ/KT+an8dH4mP5ufy8/nF/KL+aX8cn4lv5pfy6/nN/Kb+a38dn4nv5vfy+/nD/KH+aP8cf4kf5o/y10OOeaUcy655pZ7pV/pXwaUgWWcMm4Zr4xfJigTlonKxGWSMmn5WpmsTF6mKF8vU5ZvlKnKN8vUZZryrfLt8p3y3fK9Mm3pK9OV6cv3ywzlB+WH5UdlxvLj8pPy0/Kz8vMyU5m5zFJmLbOV2cscZc4yV5m7zFPmLfOV+csCZcGyUFm4LFJ+UX5ZflV+XRYti5XflMXLEmXJ8tuyVFm6LFN+V5Ytvy/LleXLCmXFslL5Q/lj+VP5c1m5rFJWLauV1csaZVBZs6xV1i6Dyzpl3bJeGVLWLxuUoWXDslEZVjYum5ThZUTZtGw2YPOyRdmyjCxbla3LNmXbsl3ZvowqO5Qdy05l57JL2bXsVnYve5Q9y15l77JP2bfsV/YvB5QDy0Hl4HJIObQcVg4vR5Qjy1Hl6HJMObYcV44vJ5QTy0nl5HJKObWcVk4vZ5Qzy1nl7HJOObecV84vF5QLy0Xl4nJJubRcVi4vV5Qry1Xl6nJNubZcV64vN5Qby03l5nJLubXcVm4vd5Q7y13l7nJPubfcV
+4vD5QHy0Pl4fJIebQ8Vh4vT5Qny1Pl6fJMebY8V54vL5QXy0vl5fJKebW8Vl4vb5Q3y1vl7fJOebe8V94vH5QPy0fl4/JJ+bR8VroSSiyp5FJKLa30ar/avw6oA+s4ddw6Xh2/TlAnrBPVieskddL6tTpZnbxOUb9ep6zfqFPVb9ap6zT1W/Xb9Tv1u/V7ddraV6er09fv1xnqD+oP64/qjPXH9Sf1p/Vn9ed1pjpznaXOWmers9c56px1rjp3nafOW+er89cF6oJ1obpwXaT+ov6y/qr+ui5aF6u/qYvXJeqS9bd1qbp0Xab+ri5bf1+Xq8vXFeqKdaX6h/rH+qf657pyXaWuWlerq9c16qC6Zl2rrl0H13XqunW9OqSuXzeoQ+uGdaM6rG5cN6nD64i6ad2sbl63qFvWkXWrunXdpm5bt6vb11F1h7pj3anuXHepu9bd6u51j7pn3avuXfep+9b96v71gHpgPageXA+ph9bD6uH1iHpkPaoeXY+px9bj6vH1hHpiPameXE+pp9bT6un1jHpmPaueXc+p59bz6vn1gnphvaheXC+pl9bL6uX1inplvapeXa+p19br6vX1hnpjvaneXG+pt9bb6u31jnpnvaveXe+p99b76v31gfpgfag+XB+pj9bH6uP1ifpkfao+XZ+pz9bn6vP1hfpifam+XF+pr9bX6uv1jfpmfau+Xd+p79b36vv1g/ph/ah+XD+pn9bPaldDjTXVXEuttdVe69f6twFtYBunjdvGa+O3CdqEbaI2cZukTdq+1iZrk7cp2tfblO0bbar2zTZ1m6Z9q327fad9t32vTdv62nRt+vb9NkP7Qfth+1Gbsf24/aT9tP2s/bzN1GZus7RZ22xt9jZHm7PN1eZu87R523xt/rZAW7At1BZui/yX3r9yW6Wt2lZrq7c12qC2Zlurrd0Gt3Xaum29NqSt3zZoQ9uGbaM2rG3cNmnD24i2adusbd62aFu2kW2rtnXbpm3btmvbt1Fth7Zj26nt3HZpu7bd2u5tj7Zn26vt3fZp+7b92v7tgHZgO6gd3A5ph7bD2uHtiHZkO6od3Y5px7bj2vHthHZiO6md3E5pp7bT2untjHZmO6ud3c5p57bz2vntgnZhu6hd3C5pl7bL2uXtinZlu6pd3a5p17br2vXthnZju6nd3G5pt7beqF7vjnZnu6vd3e5p97b72v3tgfZge6g93B5pj7bH2uPtifZke6o93Z5pz7bn2vPthfZie6m93F5pr7bX2uvtjfZme6u93d5p77b32vvtg/Zh+6h93D5pn7bPWtdCiy213EqrrbUeAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/yWLLbHUorP0Jhjzul+vX2/xXr/e6f16vd5k/3newNHtgK+Y5z9OXePz3tSf/7/4F72PN5h1aHy831e14/6jyfr9/dAsvV5vvTHz9+8t+XmvX2+cz8fG+T+u0zt/9KSj2y/WHThght7SvcV6K4x+PWr0tffrLfKlC1lodLvIFwPvD/iH7eQL/2Wnzv/SPH+/a4v89cb1er3xe19uJ+/X//O2t
db+0Rb9c8b7ivGvSpOxg/zHbvIfu8l/7Cb/sZv8x27y/9/mv7euHPCV9f96f1P/DxhdDff/iuv6V9T/G4+Zv39vuX+2/l/kyxv0xbrjj6n/l+4N6Q3vDR89/lX3AQb87T4v8o/b6Qa2v9rn/6m+Kk3GDvIfu8l/7Cb/sZv8x27yH7vJf+zW/+/q//7/P/V////N9f+YJxj+0v51/b9Ub1hv3d5ivSG9ob3Bo8e/qv5feHQ7pv7/m3m/aKdbZMDnb1L/8z+X/Mdu8h+7yX/sJv+xm/zHbvIfu/19/T9gdP3//t/U/+OOvgcw8Ctmmmx0vfxF/T/Lf7H+//Jz/v16y/7Tdf6XfTH/eANm6K3UG9Yb2tust2Fv8OfzjhqzTv/e2mNWHDjqPz7HF78PMOXnR+cY/ZUyZe+4flP0+v1llXGmGP3+z8f+csI4fb1er69/70vn/O2x3uh7JbOMWX9gb4rRvRG9kb2tehv0BvWGfn434ovnETbu9Xozjjl/nN7EY3Z6dG6jP/moMeNTj3laYeqvvA/xVWkydpD/2E3+Yzf5j93kP3aT//+D/omiXv5jt4Gf14UAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAH+tBwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPBPGjFyqw0GDR06eLiOjo7OmM6/+zsTAADw3+0/f+j/d18JAAAAAAAAAAAAAAAAAAAAjL3+FX9O7N/9GQEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/m+8tdz+y6yw2PJ9fX19vcn6jR7sv
nzOqN6oXht9bODosTa67Tf635arnzn/Y5Peu95/jE067bWbbHP29NdvOtFKF0961Xi9eyZb7a33Z3v5nqnumeatusJ6Q0b0DRnRt9GwTfsG9a05bNimg9YcOrhv7SEjNpipb9mhgweNGNw3ZKMRg4d/6fA6Q4dtvPHIvkEbrT3JhBsPHzxiRN+gjUb2bTB4ZN+mw/o2HT6yb9C6g4Zs1DfTTDP1TTLhv2rv/vdb8cx/9xUAAADwr/H/BQAA//+Z0zrN")
r9 = openat(0xffffffffffffff9c, &(0x7f0000000040)='.\x00', 0x0, 0x0)
fchown(r9, 0xffffffffffffffff, 0x0)
statx(r1, &(0x7f0000000140)='./bus\x00', 0x100, 0x40, &(0x7f0000000180))
r10 = open(&(0x7f00000000c0)='.\x00', 0x0, 0x0)
getdents(r10, &(0x7f0000000040)=""/104, 0x4d)
writev(r2, &(0x7f0000002680)=[{&(0x7f00000025c0)='8', 0x1}], 0x1)
syz_mount_image$exfat(0x0, &(0x7f0000000100)='./bus\x00', 0x4800, 0x0, 0x0, 0x0, &(0x7f0000000000))
openat$cgroup_freezer_state(r1, &(0x7f0000000080), 0x2, 0x0)


r0 = fsopen(&(0x7f0000000040)='ramfs\x00', 0x0)
fsconfig$FSCONFIG_CMD_CREATE(r0, 0x6, 0x0, 0x0, 0x0)
r1 = fsmount(r0, 0x0, 0x0)
mknodat(r1, &(0x7f0000000300)='./file0\x00', 0x1004, 0x0)
faccessat(r1, &(0x7f0000000000)='./file0\x00', 0x2)


mkdirat(0xffffffffffffff9c, &(0x7f0000000080)='./file1\x00', 0x0)
unlinkat(0xffffffffffffff9c, &(0x7f0000000380)='./file1\x00', 0x200)


r0 = open(&(0x7f0000000040)='./file0\x00', 0xa0840, 0x0)
fcntl$setlease(r0, 0x400, 0x0)
utime(&(0x7f0000000080)='./file0\x00', 0x0)


mkdirat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000000080)='./file1\x00', 0x0)
mknodat(0xffffffffffffff9c, &(0x7f00000000c0)='./file1/file2\x00', 0x0, 0x0)
linkat(0xffffffffffffff9c, &(0x7f0000000380)='./file1/file2\x00', 0xffffffffffffff9c, &(0x7f00000003c0)='./file0/file2\x00', 0x0)


r0 = open(&(0x7f00000000c0)='.\x00', 0x0, 0x0)
r1 = openat$cgroup_ro(r0, &(0x7f0000000180)='memory.events.local\x00', 0x275a, 0x0)
pwrite64(r1, &(0x7f0000000080)="9a", 0x1, 0x0)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x2000001, 0x12, r1, 0x0)
r2 = syz_open_procfs(0x0, &(0x7f0000000280)='fdinfo\x00')
lseek(r2, 0x7, 0x0)
getdents64(r2, 0xffffffffffffffff, 0x43)


mkdir(&(0x7f0000000100)='./file1\x00', 0x0)
newfstatat(0xffffffffffffff9c, &(0x7f0000000280)='./file1\x00', &(0x7f00000002c0)={0x0, 0x0, 0x0, 0x0, 0x0, <r0=>0x0}, 0x0)
mount$tmpfs(0x0, 0x0, 0x0, 0x0, &(0x7f0000000400)=ANY=[@ANYBLOB='gid=', @ANYRESHEX=r0, @ANYBLOB=',mpol=local=static:70483,'])
mount(0x0, &(0x7f0000000380)='./file1\x00', &(0x7f0000000180)='smb3\x00', 0x0, &(0x7f0000000400))


mkdir(&(0x7f0000000440)='./file1\x00', 0x0)
mount(0x0, &(0x7f00000004c0)='./file1\x00', &(0x7f00000002c0)='tmpfs\x00', 0x0, 0x0)
chdir(&(0x7f0000000080)='./file1\x00')
mknod(&(0x7f0000000140)='./file0\x00', 0x1000, 0x0)
openat(0xffffffffffffff9c, &(0x7f000000c380)='./file0\x00', 0x0, 0x0)
r0 = open$dir(&(0x7f0000000140)='./file0\x00', 0x1, 0x0)
r1 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000280)='blkio.bfq.io_merged_recursive\x00', 0x275a, 0x0)
ftruncate(r1, 0x2000009)
sendfile(r0, r1, 0x0, 0x6)
r2 = open$dir(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
vmsplice(r2, &(0x7f0000000900)=[{&(0x7f0000000840)="05e04eeb14f4", 0x6}], 0x1, 0x0)


r0 = openat$6lowpan_control(0xffffffffffffff9c, &(0x7f0000000140), 0x2, 0x0)
preadv(r0, &(0x7f0000000100)=[{&(0x7f0000002100)=""/112, 0x70}], 0x1, 0x0, 0x0)


mknodat$loop(0xffffffffffffff9c, &(0x7f0000000140)='./file1\x00', 0x0, 0x1)
mount(0x0, &(0x7f0000000180)='./file1\x00', &(0x7f0000000080)='cifs\x00', 0x0, &(0x7f00000001c0)='_\xa1LI\xa5\x91\xbb \x1e!BE\x81J\x82F\x02\xd3A\xe6i_^\xd5\x7fw\x88\x7fk\xaat\xc9c\xbb\xca\xd9\x8f\xe5E|\x98,,\xde2\xad\x00\x10\x00\x00\x8c\n\xa9Y-\x83[\x7f\xb5\xba\x05|\x8d\xbe =jk\xb5`D=\x89\xee\x84\xbbuR\x87{\x90\xab\xbd\xbb@\xd6\xa7\xd0\x84sx\x10\x1cg\x00\xb0\'\xdf\x0eC\ak=e\x96\xf7O\xb1\xfc\x81\xff\xff\xffT\x02\x96\xe7\x9d(2\x7f^r\x01\xf3\x15\x18\xc2\xbb5\x98\xd5\xca\xb4\'R\xd1<\x13\xf7\xa3|7s,\x03K8\xf0\x02\x00\x00\x00-\x85\x00\x00\x00\x00\xb4\xf8\xebW\xd6b\x029TJpj\xbe\x8flea\xb3\xcd\xb8\xea\xca\xb7G\xe0\xcb/\x13\xa1w\xb9M\x9d\xf2\xaa\xf52\x96\x8aL\xad\"~\xff\xbbg<\xcf\xe9\xff5\x826\x14-\xa0~\xc2\x91\xe6')
syz_mount_image$udf(&(0x7f0000000040), &(0x7f0000000000)='./file0\x00', 0x2000040, &(0x7f0000000380)=ANY=[@ANYBLOB='gid=forget,uid=forget,gid=', @ANYRESDEC=0x0, @ANYBLOB=',lastblock=00000000000000000250,gid=ignore,iocharset=cp950,gid=', @ANYRESDEC=0x0, @ANYBLOB="2c5d64696e6963622c6d6f64653d30303030303030303030303030303030303030303030332c6e6f7374726963742c005bb69349e9f45c94183a2d6c5a1808ff0b520494fd53a5385e2b915d674c5505a5cb6af3ba454574cc5d00fd543268ae8c2224a3af4b91cac32042bcffd9718bb1dbcf95cdd32ab811a5b275baeabee3b07a1416d9ebe5d90d8e9e2b4d"], 0xfe, 0xc2d, &(0x7f0000000f40)="$eJzs3U9sHNd9B/DfGy3FldxWTOwoThoXm7ZIZcVy9S+mYhXuqqbZBpBlIhRzC8AVSakLUyRBUo1spAXTSw89BCiKHnIi0BoFUjQwmiLokWldILn4UOTUE9HCRlD0wBYBcgoYzOxbcUmRNi2KEmV9Pjb13Z19b+a9eesZWdCbFwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAxB+8cun0mfSwWwEAPEhXRr96+qz7PwA8Vq76/38AAAAAAAAAAAAAADjoUhTxZKSYu7KWxqv3HfXL7b5bt8eGhrevdiRVNQ9V5cuf+pmz585/6YXBC9283J75gPr322fjtdGrlxovz96cm59aWJiabIzNtCdmJ6d2vYe91t/qZHUCGjdfvzV5/fpC4+zz5zZ9fHvg/f4njg9cHHz21DPdsmNDw8OjG0XqveVr99yQjp1meByOIk5Fiue+99PUiogi9n4u6g927Lc6UnXiZNWJsaHhqiPT7dbMYvnhSPdEFBGNnkrN7jnafiyi1vdA+7CzZsRS2fyywSfL7o3OteZb16anGiOt+cX2Ynt2ZiR1Wlv2pxFFXEgRyxGx2n/37vqiiFqk+M6xtXQtIg51z8MXq4nBO7ej2Mc+7kLZzkZfxHLxCIzZAdYfRbwaKX72zomYyNeZ6lrzhYhXy/xBxFtlvhSRyi/G+Yj3tvke8WiqRRF/WY7/xbU0WV0PuteVy19rfGXm+mxP2e515SPeH+66Ujyk+8ORLflgHPBrUz2KaFVX/LV077/ZAQAAAAAAAAAAAAAAAOB+OxJFfCZSvPIff1LNK45qXvqxi4N/OPCrvXPGn/6Q/ZRln4+IpWJ3c3IP54mBI2kkpYc8l/hxVo8i/jTP//vWw24MAAAAAAAAAAAAAAAAAADAY62In0SKF989kZajd03x9syNxtXWtenOqrDdtX+7a6avr6+vN1InmznHcy7lXM65knM1ZxS5fs5mzvGcSzmXc67kXM0Zh3L9nM2c4zmXci7nXMm5mjNquX7OZs7xnEs5l3Ou5FzNGQdk7V4AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgI+TIor4RaT49jfWUqSIaEaMRydX+h926wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAUn8q4vuRovFHzTvbahGRqn87TpS/nI/m4TI/Gc3BMl+K5qWcrSprzW89hPazN32piB9Hiv7623cGPI9/X+fdna9BvPXNjXefrXXyUPfDgff7nzh+7OLg8G88vdPrtF0DTl5uz9y63RgbGh4e7dlcy0f/ZM+2gXzc4v50nYhYeOPN11vT01Pz9/6i/Arsofoj9CLVHpeeelG9iNqBaMbD6TuPgfL+/16k+N13/7N7w+/c/+vxK513d+7w8fM/27j/v7h1R7u8/9e21sv3//Kevt39/8mebS/m34301SLqizfn+o5H1BfeePNU+2brxtSNqZnzp09/eXDwy+dO9x2OqF9vT0/1vLovpwsAAAAAAAAAAAAAAADgwUlF/H6kaP14LTUi4nY1X2vg4uCzp545FIeq+Vab5m2/Nnr1UuPl2Ztz81MLC1OTjbGZ9sTs5NRuD1evpnuNDQ3vS2c+1JF9bv+R+suzc2/Mt2/88eK2nx+tX7q2sDjfmtj+4zgSRUSzd8vJqsFjQ8NVo6fbrZmq6si2k+k/ur5UxH9FionzjfT5vC3P/986w3/T/P+lrTvap/n/n+jZVh4zpSJ+Hil+56+ejs9X7Twad52zXO7vIsXJC5/L5eJwWa7bhs5zBTozA8uy/xcp/ukXm8t250M+uVH2zK5P7COiHP9jkeL7f/Hd+M28bfPzH7Yf/6Nbd7RP4/9Uz7ajm55XsOeuk8f/VKR46cm347fytg96/kf32RsncuE7z+fYp/H/VM+2gXzc374/XQcAAAAAAAAAAHik9aUi/j5S/HC4ll7I23bz9/8mt+5on/7+16d7tk3en/WKPvTFnk8qAAAAABwQfamIn0SKG4tv35lDvXn+d8/8z9/bmP85lLZ8Wv05369Vzw24n3/+12sgH3d8790GAAAAAAAAAAAAAAAAAACAAyWlIl7I66mPV/P5J3dcT30lUrzyP8/lcul4Wa67DvxA9Wv9yuzMqUvT07MTrcXWtempxuhca2KqrPtUpFj728/lukW1vnp3vfnOGu8ba7HPR4rhf+iW7azF3l2b/KmNsmfKsp+IFP/9j5vLdtex/tRG2bNl2b+JFF//l+3LHt8oe64s+91I8aOvN7plj5Zlu89H/fRG2ecnZot9GBUAAAAAAAAAAAAAAAAAAAAeN32piD+PFP97c/nOXP68/n9fz9vKW9/sWe9/i9vVOv8D1fr/O72+l/X/q+cKLO10VAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAA+HhKUcSbkWLuylpa6S/fd9Qvt2du3R4bGt6+2pFU1TxUlS9/6mfOnjv/pRcGL3Tzg+vfb5+J10avXmq8PHtzbn5qYWFqsjE2056YnZza9R72Wn+rk9UJaNx8/dbk9esLjbPPn9v08e2B9/ufOD5wcfDZU890y44NDQ+P9pSp9d3z0e+Sdth+OIr460jx3Pd+mn7YH1HE3s/Fh3x39tuRqhMnq06MDQ1XHZlut2YWyw9HuieiiGj0VGp2z9EDGIs9aUYslc0vG3yy7N7oXGu+dW16qjHSml9sL7ZnZ0ZSp7VlfxpRxIUUsRwRq/13764ving9Unzn2Fr61/6IQ93z8MUro189fXbndhT72MddKNvZ6ItYLh6BMTvA+qOIf44UP3vnRPxbf0QtOj/xhYhXy/xBxFvRGe9UfjHOR7y3zfeIR1Mtivj/cvwvrqV3+svrQfe6cvlrja/MXJ/tKdu9rjzy94cH6YBfm+pRxI+qK/5a+nf/XQMAAAAAAAAAAAAAAAAcIEX8eqR48d0TqZoffGdOcXvmRuNq69p0Z1pfd+5fd870+vr6eiN1splzPOdSzuWcKzlXc0aR6+dslllfXx/P75dyLudcybmaMw7l+jmbOcdzLuVczrmSczVn1HL9nM2c4zmXci7nXMm5mjMOyNw9AAAAAAAAAAAAAAAAAADg46Wo/knx7W+spfX+zvrS49HJFeuBfuz9MgAA//8hX/ir")
mkdir(&(0x7f00000004c0)='./file1\x00', 0x4)
mkdir(&(0x7f00000001c0)='./file0\x00', 0x0)
mkdir(&(0x7f0000000300)='./bus\x00', 0x0)
mount$overlay(0x0, &(0x7f00000000c0)='./bus\x00', &(0x7f0000000080), 0x0, &(0x7f0000000240)={[{@workdir={'workdir', 0x3d, './bus'}}, {@lowerdir={'lowerdir', 0x3d, './file0'}}, {@upperdir={'upperdir', 0x3d, './file1'}}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='.\x00', 0x0, 0x0)
fchmod(r0, 0x0)
syz_mount_image$ext4(&(0x7f0000000040)='ext4\x00', &(0x7f0000000200)='./file1\x00', 0x200010, &(0x7f00000000c0)={[{@nolazytime}, {@nombcache}, {@block_validity}, {@norecovery}, {@barrier_val={'barrier', 0x3d, 0x4c}}, {@lazytime}, {@init_itable_val={'init_itable', 0x3d, 0x5}}, {@usrquota}, {@errors_continue}]}, 0xfe, 0x55d, &(0x7f0000000980)="$eJzs3d9rW1UcAPDvTX/sp66DMdQHKezByVy6tv6Y4MN8FB0O9H2G9q6Mpsto0rHWgduDe9mLDEHEgfgH+O7j8B/wrxjoYMgo+uBL5aY3XbYmbZZlSzSfD9ztnPuj55yce07OyUm4AQytyeyfQsSrEfFNEnGo6dho5AcnN89bf3htLtuS2Nj47M8kknxf4/wk//9AHnklIn79OuJEYXu61dW1xVK5nC7n8ana0uWp6urayYtLpYV0Ib00Mzt7+p3Zmfffe7dnZX3z3N/ff3r3o9O3jq1/9/P9w7eTOBMH82PN5XgG15sjkzGZvyZjceaJE6d7kNggSfqdAboykrfzscj6gEMxkrd64P/vq4jYAIZUov3DkGqMAxpz+x7Ng/8zHny4OQHaXv7Rzc9GYm99brR/PXlsZpTNdyd6kH6Wxi9/3LmdbdG7zyEAdnX9RkScGh3d3v8lef/XvVMdnPNkGvo/eHHuZuOft1qNfwpb459oMf450KLtdmP39l+43+KypFefUmfjvw9ajn+3Fq0mRvLYS/Ux31hy4WI5zfq2lyPieIztyeI7reecXr+30e5Y8/gv27L0G2PBPB/3R/c8fs18qVZ6ljI3e3Aj4rWW499kq/6TFvWfvR7nOkzjaHrn9XbHdi//87XxU8QbLev/0YpWsvP65FT9fphq3BXb/XXz6G/t0u93+bP6379z+SeS5vXa6tOn8ePef9J2x7q9/8eTz+vh8Xzf1VKttjwdMZ58sn3/zKNrG/HG+Vn5jx/buf9rdf/vi4gvOiz/zSM32546CPU//1T1//SBex9/+UO79Dur/7froeP5nk76v04z+CyvHQAAAAAAAAyaQkQcjKRQ3AoXCsXi5vc7jsT+QrlSrZ24UFm5NB/138pOxFihsdJ9qOn7ENP592Eb8Zkn4rMRcTgivh3ZV48X5yrl+X4XHgAAAAAAAAAAAAAAAAAAAAbEgTa//8/8PtLykvEXm0PgufLIbxheu7b/XjzpCRhI3v9heHXV/vf1Ph/Ai+f9H4bUWL8zAPST938YXto/DC/tH4aX9g8AAAAAAAAAAAAAAAAAAAAAAAAAAAA9de7s2WzbWH94bS6Lz19ZXVmsXDk5n1YXi0src8W5yvLl4kKlslBOi3OVpd3+XrlSuTw9EytXp2pptTZVXV07v1RZuVQ7f3GptJCeTz1tCAAAAAAAAAAAAAAAAAAAALarrq4tlsrldFlAoKvA6GBkQ6ApcKsHrbvPHRMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANPk3AAD//0unNek=")


syz_mount_image$ntfs3(&(0x7f0000000040), &(0x7f0000000080)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0, &(0x7f0000001340)={[{}, {@gid}, {@prealloc}, {@sparse}, {@force}, {@umask={'umask', 0x3d, 0x1}}, {@force}, {@iocharset={'iocharset', 0x3d, 'macroman'}}, {@iocharset={'iocharset', 0x3d, 'cp936'}}, {@prealloc}, {@showmeta}]}, 0xff, 0x1f77d, &(0x7f000001fa80)="$eJzs3QmcTfXfB/Df2ff9XNkNTZKQfUmyr9m3kGzZ92whMSRbskRItiRbklBJEkmUZJeEJEmStEvieTlzZ5oZ/Ou0/XqOz7tX33PnzL3nfO/yOZs5537ZcFLdxtUaJSQkJBCWJcnOkXSSSBK5ZCXftuPjLnHJQ4YQwhNCJiYWbF2zR33t8jgz1/r7hyzLs6Gf1nS1+ZpEttutvjxb9Pj2TNuzfHmxcecufRO69E3o2atfQtuEdr169WvbrnuHhPZd+nYrlFC/e4e2fTskdOnZt0OfdL/u2L1X796DEtr2bG+ovft06Ns3oW3PQQndOgxK6NcroV+fQQltO7Xt0jOhUKFCCYZK4A9qspR2BwAAAAAAAAAAAAAAAAAA8Pe4dIlcYmg3AQAAAAAAAAAAAAAAAAAAAH9JtZq1qxYmXOrPDGFIacKQGSkn9sel3IxfBiC4DkCb4FbmoNaI30paYhbofv4Ac62heLUmrvIHCIUJIZ1Tp8+SWsEthgjBOOF350O6xycUH6bMl+cSSR1SjTSO/5wU750hFdI1cjE+rJAy4gJ31aF7KfmSCCvSTYe74vlUyPBCJpD0Q4eRg+GlS5cuXe0l+mdc2Sf8t4X9nMkk/TD954y7Zv6bZcg/F09HyiVC/o38906dPksahs2/nabRNPmXU/Nfh3QhfUif+PhrLQe4jK+rffVhbj75iij/bn7/KpZ2A0AVe0X+2f+Rf/b/U/5Tm08eps1/bdKLdCLVSBfSnXSIj79W/lPCnJr/DNNNGeYhDMcj//D/ypX55+L5P5oh/2J8GZAyyo7nJSX/hf9k/tNv5zOkfuicp5cyfYlLJE1JL9Kd9Cc9SIdgukmp82FJ+9Q58kmXn0fK/oAf/LZ4PBk+mcu4hEmei+DGHx+MS76DcHn7PYEl6e6T8XckvqwsnDp/nrjxW33JIDKYdCNtSfdgaZSyPXJ52Zcv9f4C0VNf6fj7FH/mSanjM6durWT+w8sh/vfvAhHGX5F/Pp7/DVfZ/+czrP//rvz/pfW8Hp9Q/Kp/adfzFUk/0o/0IVVIB9IxPj79coD7w8uBseTK5UAwLuRy4FrbGSmva+p2RobnlTLMzWUOHvT3bGdg/X99u3L9LwT555I/GhnW/0Ka/eHfy/8Bu1e6Ycr4vL/tOac+7rf9CIZU+evr/2D6DFco+Ll+urz99nlPeWqpefOTBwXs9MPcghXMYxIhpFnwwOTlRun4sbtEUpMUI4WD58HEXxg+/vrki/+ftsnMGY7azIv3yPAV4q9wspT71CBpXvCrDFPmn5v3g9msSNlu4JO3M+QMfab0knL8B65nVx7/E+Pr//pX2f8X/+X9/7TH/0PvF/Dpp512v6AS6UL6kR6kLel9zfVxiozHU1PvEh/mZrL9jevjfxvW/9e3K9f/Ujz/a66y/S/9Q9v/aXNeO2zOM/xDWsr0hSDnvUgv0i/4
+b+63S/Fh6nLmYz/MBgfOvF/wPl7lzPI//XtyvzL8fwPvEr+ZWrH/7k/cfw/vbTHBSqRtqQ9qRwcG+xLkvOU/jgam3orKd32eZq/D4gHN3X7/EKu9MO4lLmxl5If8N/aTkD+r29X5l8J8q+kP9gf3zNVMuz//8n8J2Tsgvnd7f+/5d8Fgvle3v5vRDqQ+0h/0od0yJDvNHmITyE133zyi9GbST+8PLUqpBHJwzDZmfhxgqPkt+MEG1L3vy/fL3lqLkl/nGBF/P/kZ+vHn3/yCr9B4qrg9UkZJsXvN4QQUpPUvOL+8w4fqcQyvw1Tnkf7q96fI2dd7tzl+6QMSYbpkzTvNxt/flvJb8cXFqd5fin3tzM8vxnx/0mwRZU8/3wpH65rzDvj/a71GmS837Wee8bn8d9aDtNy5f6/+j/+/k+ltv/P/Yn8p3SaPEy7/9+E9CaVSVvSN57/q22XD4sPf/t3f/aqwzxJJAebZvueCMnbDwnx43CJpCbpSTqSXvFHpSz89C7Zuj0w9NDBjM/73/1cYv1/fbty/a/Fj/8nsVce/9P+nvW/nbGLP7n+L+/8iX8XuJz/qmQg6Uc6kJ6kfbA+S97XSc5tM+b3j+vXYJL/vyxL8jMgbUjd+OuXXMNun+SNLxpELpHUI+1IV1Iz9RjF3z39BqR/cFykLeFI1vj0Owd7WH99+pf3rxqSDqQ3aUv6BEvY9LD+/S+5cv2vExKs/xOucvxf/weO/9H+u5z0+/1M6q3r4/OJ9f/17cr1v/E/8m8g/xGD/F/frsy/+T/ybyL/EYP8X9+uzL/1P/JvIf8Rg/xf367Mvx3kn5AxJH3+mdRH/Ha/vyP/10fO/quQ/+vblfl3kP//n+Q/8yDk//p2Zf5d5P86gvxf367Mv4f8X0eQ/+vblfn3kf/rCPJ/fbsy/zHk/zqC/F/frsx/JuT/OoL8X9+uzP8NyP91BPm/vl2Z/+TrfxukN3vl+X+Z/4bz//L+yfN/68bP/0l7fgr3P+aXcXpXP/8l+XzWgcFLkXyeYbPU81nrpT7WIRnO/0lzXnBC/EXKFz+t6Yq/MWLSP+G059GeTZ0vS3amzrdBuvumne+G+P8kOE5TOP66x8+PjU837RNPeS0uZZD2/ffjZzg2S3M+btjpXO3vqjI+Xyzn/4uunf+rXf87y38s/3/H+XWT4n+DmJLDwlfJv0fS5zCBXPkk/n9+zrH+v75dO/9JV8l/1r/h/N8w+f/t+j9M6vV//qnzUyelzI/97XqDycuBhqnTyLgcqJDm+gDZ48OE+IV7dULm5Yi/omn9t5YTuP7n9S35/N/8GfJ/eXkwhs14/Z8TwfC33CdLyf0MrnuQu9LjeqQbpoxPuMp5/+z/XO8nd5PxeoC/N5+M07s834LBIzvGv+8jebv5ct7tNOf9n/0D1/M9mnJtn9S8M6RN0CEh2eI/Z+wr4zDl7/SypvkesKv1l+PPTj8jPuP0i6T55YnU5dHO1G2a5P0gOX7r8mPviD/vZqm/54NxmVMumyzyJLF2syY1qyRPNc246lcZV6deFfJVxXgH/43l4PXqxDXyf+X1/wXyWTC8Vv5/73N5xUo/ZP7/7nwlXeO64CktpV73J8P0Uoa5WTY4lPlv52Z4vK2/Jzef/fVJwF9C9/Pz2TXz3+aK6///tfxf7atm/pH8Z3SN9d+1rve5If6eJOPImvj4y9vzKwghdUnToF7WMriKeG/SMv4tPgVJl+Cq4p1Ih+D/nqQ4KUpKkGLBf6VIYVKCFCUt0yyJ0t8uT8qTln/zVNN/vvgMn6/PMny+hL/6+br0dLrPF/y3XTv/5/5j6/+U7f8/upxJ+dqcy/Ntfo38X239nyLjdf8zfA0Pyc3w/3P9f3me/eLL70mpv+eDcX91+R3se2gi6d+3Q59CA9v269enCIkPrvK7oiQ+CKTPJdb/tP331v9skP98V1n/N6v+x/Of8r0/fz7/yeuj38t/xvmQxOTp5LsxeXjl9n+xDPMh
f2g5E34+yYG71nIm5XS91OVMhumlDHMrQspyJv6nE3/n5+Qz8u/sZ8C1/Pfyn7z+3xpy/Z/x+74yDsPkv3eabhpnyOXvzSel55RrPqfkUknNZSFyX/C9oMn3CLsdkDLdlGFuxvx/fBwA63/a/mv5J/EMliYZvx72v/j9FfDXSLQbAKqkEPlP3mJF/qPkT100BCJDDpF/JajIf5QotBsAqpQQ+U8+8o38R4lKuwGgSg2R/+S/vEX+o0Sj3QD827KmPeiuhch/8t+1I/9RotNuAKjSQ+TfCCryHyUG7QaAKiNE/s2gIv9RYtJuAKgyQ+TfCiryHyUW7QaAKitE/pNP30X+o+SKU7LhumKHyH9wFSzkP1Ic2g0AVU6I/Cd/yybyHyUu7QaAKjdE/oOrXyH/keLRbgCo8kLkPziTCPmPFJ92A0CVHyL/saAi/1ESo90AUBULkf9MQUX+oyQT7QaAqkwh8n9DUJH/KLmBdgNA1Q04//e6lpl2A0BV5hD5zxJU5D9KstBuAKjKEiL/WYOK/EdJVtoNAFVZQ+Q/+Qr8yH+UZKPdAFCVLUT+k7/xCvmPkuy0GwCqsofIf/I30iH/UZKDdgNAVY4Q+c8ZVOQ/SnLSbgCoyhki/7mCivxHSS7aDQBVuULkP/lbSpD/KLnatzLB9SMhRP5zBxX5j5LctBsAqnKHyH+eoCL/UZKHdgNAVZ4Q+U/+QlrkP0pupN0AUHVjiPwnf0E18h8libQbAKoSQ+T/pqAi/1FyE+0GgKqbQuQ/b1CR/yjJS7sBoCpviPzfHFTkP0pupt0AUHVziPznCyryHyX5aDcAVOULkf9bgor8R8kttBsAqm4Jkf/8QUX+oyQ/7QaAqvwh8n9rUJH/KLmVdgNA1a0h8l8gqMh/lBSg3QBQVSBE/gsGFfmPkoK0GwCqCobIf6GgIv9RUoh2A0BVoRD5vy2oyH+U3Ea7AaDqthD5LxxU5D9KCtNuAKgqHCL/RYKK/EdJEdoNAFVFQuS/aFCR/ygpSrsBoKpoiPwXCyryHyXFaDcAVBX74/kXkwfIf5QUp90AUFU8xPq/RFCR/ygpQbsBoKpEiPyXDCryHyUlaTcAVJUMkf9SQUX+o6QU7QaAqlIh8h+MQf4jpTTtBoCq0iHyXyaoyH+UlKHdAFBVJkT+bw8q8h8lt9NuAKi6PUT+ywYV+Y+SsrQbAKrKhsj/HUFF/qPkDtoNAFV3hMh/uaAi/1FSjnYDQFW5EPm/M6jIf5TcSbsBoOrOEPkvH1TkP0rK024AqCofIv8Vgor8R0kF2g0AVRVC5L9iUJH/KKlIuwGgqmKI/FcKKvIfJZVoNwBUVQqR/8pBRf6jpDLtBoCqyiHyXyWoyH+UVKHdAFBVJUT+qwYV+Y+SqrQbAKqqhsh/taAi/1FSjXYDQFW1EPmvHlTkP0qq024AqKoeIv81gor8R0kN2g0AVTVC5L9mUJH/KKlJuwGgqmaI/NcKKvIfJbVoNwBU1QqR/7uCivxHyV20GwCq7gqR/9pBTZd/+V9qE/4htWk3AFTVDpH/OkHF+j9K6tBuAKiqEyL/dYOK/EdJXdoNAFV1Q+S/XlCR/yipR7sBoKpeiPzXDyryHyX1aTcAVNUPkf8GQUX+o6QB7QaAqgYh8t8wqMh/lDSk3QBQ1TBE/hsFFfmPkka0GwCqGoXIf+OgIv9R0ph2A0BV4xD5bxJU5D9KmtBuAKhqEiL/TYOK/EdJU9oNAFVNQ+T/7qAi/1FyN+0GgKq7Q+S/WVCR/yhpRrsBoKpZiPw3DyryHyXNaTcAVDUPkf8WQUX+o6QF7QaAqhYh8n9PUJH/KLmHdgNA1T0h8t8yqMh/lLSk3QBQ1TJE/u8NKvIfJffSbgCouve3/KfJ+tXz3yqoyH+UtKLdAFDVKsT6v3VQkf8oaU27AaCqdYj8twkq8h8lbWg3AFS1CZH/tkFF/qOkLe0GgKq2IfLfLqjIf5S0o90AUNUuRP7vCyryHyX30W4AqLov
RP7bBxX5j5L2tBsAqtqHyH+HoCL/UdKBdgNAVYcQ+e8YVOQ/SjrSbgD+VmzI+3cMkf9OQUX+o6QT7QaAqk4h8t85qMh/lHSm3QBQ1TlE/rsEFfmPki60GwCquoTIf9egIv9R0pV2A0BV1xD57xZU5D9KutFuAKjqFiL/3YOK/EdJd9oNAFXdQ+S/R1CR/yjpQbsBoKpHiPz3DCryHyU9aTcAVPUMkf9eQUX+o6QX7QaAql5Xyb9yjfz3DiryHyW9aTcAVPUOsf6/P6jIf5TcT7sBoOr+EPnvE1TkP0r60G4AqOoTIv99g4r8R0lf2g0AVX1D5L9fUJH/KOlHuwGgql+I/PcPKvIfJf1pNwBU9Q+R/wFBRf6jA3v/17sBIfL/QFCR/yh5gHYDQNUDIfI/MKjIf5QMpN0AUDUwRP4HBRX5j5JBtBsAqgaFyP/goCL/UTKYdgNA1eAQ+X8wqMh/lDxIuwGg6sEQ+R8SVOQ/SobQbgCoGhIi/w8FFfmPkodoNwBUPRQi/0ODivxHyVDaDQBVQ0Pkf1hQkf8oGUa7AaBqWIj8JwUV+Y+SJNoNAFVJIfI/PKjIf5QMp90AUDU8RP5HBBX5j5IRtBsAqkaEyP/DQUX+o+Rh2g0AVQ+HyP/IoCL/UTKSdgNA1cgQ+X8kqMh/lDxCuwGg6pEQ+R8VVOQ/SkbRbgCoGhUi/6ODivxHyWjaDQBVo0Pkf0xQkf8oGUO7AaBqTIj8jw0q8h8lY2k3AFSNDZH/cUFF/qNkHO0GgKpxIfL/aFCR/yh5lHYDQNWjIfI/PqjIf5SMp90AUDU+RP4fCyryHyWP0W4AqHosRP4nBBX5j5IJtBsAqiaEyP/EoCL/UTKRdgNA1cQQ+Z8UVOQ/SibRbgComhQi/5ODivxHyWTaDQBVk0Pk//GgIv9R8jjtBoCqx0Pkf0pQkf8omUK7AaBqSoj8Tw0q8h8lU2k3ALS4JHj//3j+nwgq8h8lT9BuAKh6IkT+pwUV+Y+SabQbAKqmhcj/9KAi/1EynXYDQNX0EPmfEVTkP0pm0G4AqJoRIv9PBhX5j5InaTcAVD0ZIv8zg4r8R8lM2g0AVTND5P+poCL/UfIU7QaAqqdC5H9WUJH/KJlFuwGgalaI/M8OKvIfJbNpNwBUzQ6R/zlBRf6jZA7tBoCqOSHyPzeoyH+UzKXdAPx10p9/6NwQ+Z8XVOQ/SubRbgComhci/08HFfmPkqdpNwBUPR0i//ODivxHyXzaDQBV80Pk/5mgIv9R8gztBoCqZ0LkfwHDJN9A/iNjAe0GgKoFIfL/bFCx/o+SZ2k3AFQ9GyL/C4OK/EfJQtoNAFULQ+R/UVCR/yhZRLsBoGpRiPwvDiryHyWLaTcAVC0Okf8lQUX+o2QJ7QaAqiUh8r80qMh/lCyl3QBQtTRE/p8LKvIfJc/RbgCoei5E/pcFFfmPkmW0GwCqloXI//NBRf6j5HnaDQBVz4fI//KgIv9Rspx2A0DV8hD5fyGoyH+UvEC7AaDqhRD5XxFU5D9KVtBuAKhaESL/LwYV+Y+SF2k3AFS9GCL/K4OK/EfJStoNAFUr0+T/STNl7NXzvyqoyH+UrKLdAFC1KsT6f3VQkf8oWU27AaBqdYj8vxRU5D9KXqLdAFD1Uoj8vxxU5D9KXqbdAFD1coj8vxJU5D9KXqHdAFD1Soj8rwkq8h8la2g3AFStCZH/V4OK/EfJq7QbAKpeDZH/tUFF/qNkLe0GgKq1IfL/WlCR/yh5jXYDQNVrIfK/LqjIf5Sso90AULUuRP5fDyryHyWv024AqHo9RP7XBxX5j5L1tBsAqtaHyP8bQUX+o+QN2g0AVW+EyP+GoCL/UbKBdgNA1YYQ+d8YVOQ/SjbSbgCo2hgi/28GFfmPkjdpNwBUvRki/5uCivxHySbaDQBVm0Lk/62gIv9R8hbtBoCqt0Lkf3NQkf8o2Uy7AaBqc4j8vx1U5D9K3qbdAFD1doj8bwkq8h8lW2g3AFRtCZH/
rUFF/qNkK+0GgKqtIfL/TlCR/yh5h3YDQNU7IfL/blCR/yh5l3YDQNW7IfK/LajIf5Rso90AULUtRP7fCyryHyXv0W4AqHovRP63BxX5j5LttBsAqraHyP/7QUX+o+R92g0AVe+HyP+OoCL/UbKDdgNA1Y4Q+d8ZVOQ/SnbSbgCo2hki/7uCivxHyS7aDQBVu0Lkf3dQkf8o2U27AaBqd4j87wkq8h8le2g3AFTtCZH/vUFF/qNkL+0GgKq9IfK/L6jIf5Tso90AULUvRP73BxX5j5L9tBsAqvaHyP8HQUX+o+QD2g0AVR+EyP+BoCL/UXKAdgNA1YEQ+f8wqMh/lHxIuwGg6sMQ+T8YVOQ/Sg7SbgCoOhgi/x8FFfmPko9oNwBUfRQi/4eCivxHySHaDQBVh0Lk/3BQkf8oOUy7AaDqcIj8Hwkq8h8lR2g3AFQdCZH/j4OK/EfJx7QbAKo+DpH/o0FF/qPkKO0GgKqjIfL/SVCR/yj5hHYDQNUnIfJ/LKjIf5Qco90AUHUsRP4/DSryHyWf0m4AqPo0RP6PBxX5j5LjtBsAqo4ThvBJvk1Igp08xieEsITYcvBTZnKJyUUYNvhBSCCEJFy+rWW+/LN9xXjiJ9+fSR7vXq6an3zftOOIT95gvNT7aan3Y46mG0cykyXETTf/3L/Nf94V4wEAAAAAAADgD0u7r55+vx0AoqRm3SrN8hHlivH5CCFrREJOWsk/G+QAc7XH8/HamTQMbgnxmrTELND9/AHmWkOixyegJg/E+I8yl0gqkn6kH+lDqpAOpGMwliHJxyPl8PPJIO18KpG2pD2pTLqT/qRvMFaK/14kbUj9cPNJ+bcSJv18pGA+XUg/0oO0Jb2JFJ9PG1I73PQT4hNMSD99IZh+L9KL9CNdSHeSfNxU+4P9G+R/vE52Sv9VyUDSj3QgPUl7knyYlf0T7wOf7hOT9n2oTXqRTqRa0H+HtC8iYUgbUivcfLrHH9o9/evEc4mkDqlGGqd7m5jwzyN+nDxlmPZ5JE+/DulC+pA+ycfR48lSwn+ernw/ElLej0akA7mP9Cd9SId4fNQ/MX02SDWJv6NpP69NSG9SmbQlfUmH+L8Mcn9H/6nTb0p6BZnrQTqkTq8ZqfJXpx+8IwxXKPg5IfX9bZxuugfsXsHjrzVM+Zi2j0/38nQKBu9gx/jnM4mQoP/ke2SLzydjnmdw3YPplR7X4/Lw8lSDn1PGZ+w/gRC7YPDIlPkUDsbnvMb049NlrjVMeR5qmufRPP7w5OkXCd7/XH9w+imvT+r0E5Onm+/G9K/Tb/0XDcb/c9Mvlu71s1M/yfDHGbQbAKrw/l/f8P5f3/D+X98McnlvojGpSOqSKqQiaUiqkNakJqlLqpF6pCGpQyqSxqQmqUfq/kMd2GluV4j/7eFlNeLDxKCDxqQhqUkqkSakMalKWpPapGbQ9z8vIc3tpDS3L8UlkmqkJqkddFWXVCR1SNV/oavfFE5zuxIh8b0oQirFN4YTST1SidQiVUll0jh4b6v8q/1VuMbtlF38y5+/y701Cd7hxqQ5aU2qkKqkEakcjKlPGgefxH9K/TS3r/7+NiX1SG3SJHhn//33uE2a2xXS7uOkvn7p+/s3s3tZ7wz9pRzWSxkmBsuVy0sYOpKuMf6397dmsOyrSpqR1qQhqUfq/SvLlRRymtsVfre/iqQ2qU3qkcr/ynt72bw0t6+ej0pBbi9/2ur/w71czYrf7a8hqUrqB+u2RkFC6pN6wWv677zLG67RX8qbnUiqkooUcptiZ4aWUj6PKcPk/ui59hnqyQvAxCAP1Ul1UjXYdmkSvHa1U9cljYJth6rBUvsfkeZIXtK1fgHwH3WJufR3yxVi9km/NzHpdyZwYdhfevrX2jwAuC6cdblzaVdVQ4I1F590ef+7WXxc8jlBxeP7Hj4ZS9wMfxeUOXlchvOBfu/nyxokrgqmlRRi/nOZK+cfjPsT8593+Egllvltdd0+Pv/CabbH
k+efPfkuXLpznIL6R+YDAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADw/8lZlztHmN9+HkIIYQifVIMQ0iw+zieEsKQ4YeM/jSUuYZIfI7jBIHPyuOQ7CAmEkASW/O7PlzVIXBVMKynE/OcyV84/GPcn5j/v8JFKLBPMNNA+Pv/ChJAK6eafPfkunE8uMblS5h/UPzIfgP8mhrCEIzwRiEgkIhOFqEQjOjGISSxiE4e4xCM+iZFM5AaSmWQhWUk2kp3kIDlJLpJAcpM85EaSSG4iecnNJB+5heQnt5ICpCApRG4jhUkRUpQUI8VJCVKSlCKlSRlyOylL7iDlyJ2kPKlAKpJKpDKpQqqSaqQ6qUFqklrkLlKb1CF1ST1SnzQgDUkj0pg0IU3J3aQZaU5akHtIS3IvaUVakzZ/6vEPkiHkITKUDCNJZDgZQR4mI8kjZBQZTcaQsWQceZSMJ4+RCWQimUQmk8fJFDKVPEGmkelkBnmSzCRPkVlkNplD5pJ55GkynzxDFpBnyUKyiCwmS8hS8hxZRp4nMiFkBXmRrCSryGryEnmZvELWkFfJWvIaWUdeJ+vJG2QD2UjeJJvIW2QzeZtsIVvJO+Rdso28R7aT98kOspPsIrvJHrKX7CP7yQfkAPmQHCQfkUPkMDlCPg75+J8yPH4gQxjCsAzL8AzPiIzIyIzMqIzK6IzOmIzJ2IzNuIzL+IzPZGIyMZmZzExWJiuTncnO5GRyMglMApOHycMkMolMXiYvk4/Jx+Rn8jMFmAJMIaYQU5gpwhRlijLFmeJMSaYkU5opw5RhyjJlmXJMOaY8U56pyFRkKjOVmapMVaY6U52pydRiajG1mdpMXaYuU5+pzzRkGjKNmcZMU6Yp04xpxrRgWjAtmZZMK6YV04Zpw7Rj2jHtmfZMR6Yj05npzHRlujLdme5MT6Yn05vpzfRh+jD9mH7MAGYAM5AZxAxiHmQeZB5iHmKGMZXZ4cwIZgQzkhnJjGJGM6OZscw45lHmUeYxZgIzkZnETGYmM1OYqcyPzDRmOjODmcGUY59iZjGzmQR2LjOPmcfMZ+YzC5gFzEJmEbOIWcIsZZ5jljHLmOXMC8wLzIvMSmYVs4p5iXmJeYVZw6xh1jKvMeuYdcx65idmA7OReZPZxLzFbGbeYrYwW5ktzLvMNuZdZjuzndnB7GB2MbuYPcweZh+zj/mA+YD5kPmQ+Yj5iJnAHGGOMEeZo8wx5hhznDnOnGBOMCeZk8wp5hRzmjnNnGHOMGeZb5hvmW+Y75nvmR+Zn5hzzDnmPHOeucBcYC4yFy+Hn72MZ3lWZEVWZmVWZVVWZ3XWZE3WZm3WZV3WZ302E5uJzcxmZrOy
WdnsbHY2J5uTTWBzs3nYPGwim8jmZfOy+dh8bH42P1uALcAWYguxhdnCbFG2KFucLcGWZEuxpdky7O1sWbYsW469ky3PVmArshXZymwVtipbja3G1mBrsLXYWmxttjZbl63L9irQo0BDdjgzimnCXn5nmrETmRbsJKYley/bim3NTmHasu3YqUx7tgPbke3ETmemMV3ZdgW6sz3Ynuwspjfbs8Bspi/bj53LDGAfYAeyg9jB7IPsELZ9gaHsMHYhM5wdwS5hRrKPsKPY0exypgp7+R2ryj7GTmAnspPYyewrzBR2KvsEO42dzs5gn2Rnsk+xs9jZ7Bx2LjuPfZqdzz7DLmCfZReyi9jF7BJ2Kfscu4x9nl3OvsCuYF9kV7Kr2NXsS+zL7CvsGvZVdi37GruOfZ1dz77BbmA3sm+ym9i32M3s2+wWdiv7Dvsuu419j93Ovs/uYHeyu9jd7B52L7uP3c9+wB5gP2QPsh+xh9jD7BH2Y/Yo+wl7jP2UPc5+xp5gP2dPsl+wp9gv2dPsV+wZ9mv2LPsN+y37Hfs9+wP7I/sTe479mT3P/sJeYH9lL7KXWMIxHMtxHM8JnMhJnMwpnMppnM4ZnMlZnM05nMt5nM/FuEzcDVxmLguXlcvGZedycDm5XFwCl5vLw93IJXI3cXm5m7l83C1cfu5WrgBXkCvE3cYV5opwRbliXHGuBFeSK8WV5spwt3NluTu4ctydXHmuAleRq8RV5qpwVblqXHWuBleTq8XdxdXm6nB1uXpcfa4B15BrxDXmmnBNubu5ZlxzrgV3D9eSu5drxbXm2nBtuXbcfVx7rgPXkevEdea6cF25blx3rgfXk+vF9eZ6c324Plw/rj83gBvADeQGcYO5X7mL3CVuKDeMS+KGcyO4h7mR3CPcKG40N4Yby43jHuXGc49xE7iJ3CRuMvc4N4Wbyj3BTeOmczO4J7mZ3FPcLG42N4eby83jnubmc89wC7hnuYXcIm4xt4Rbyj3HjYpPacUfePybV3n8+GDuO7id3C5uN7eH28vt4/ZzO7gD3AHuIHeQO8Qd4o5wR7ij3FHuGHeMO84d505wJ7iT3EnuFHeKO82d5s5wZ7iz3Dfcz9x33PfcD9yP3E/cT9zP3HnuPHch/hoQnuFZnuN5XuBFXuJlXuFVXuN13uBN3uJt3uFd3uN9PsZn4m/gM/NZ+Kx8Nj47n4PPyefiE/jcfB7+Rj6Rv4nPy9/M5+Nv4fPzt/IF+IJ8If62v/z43+uvDd+Gb8e349vz7fmOfEe+M9+Z78p35bvz3fmefE++N9+b78P34fvx/fgB/AB+ID+QH8wP5ofwQ/ih/FA+iU/iR/AP8yP5R/hR/Gh+DD+WH8eP48fz4/kJ/AR+Ej+Jf5x/nJ/KT+Wn8dP4GfwMfiY/k5/Fz+Ln8HP4efw8fj4/n1/AL+AX8gv5xfxifim/lF/GL+OX88v5FfwKfiW/kl/Nr+Zf5l/m1/Br+LX8Wn4dv45fz6/nN/Ab+Y38Jn4Tv5nfzG/ht/Dv8O/w2/ht/HZ+O7+B38nv5Hfzu/m9/F5+P7+fP8Af4A/yB/lD/CH+CH+EP8of5Y/xx/jj/HH+BH+CP8mf5E/xp/jT/Gn+DH+GP8uf5b/lv+W/57/nf+R/5M/x5/jz/Hn+An+Bv8hfvLzZJ7ACK/ACL4iCKMiCLKiCKuiCLpiCKdiCLbiCK/iCL2QSMgmZhcxCViGrkF3ILuQUcgoJQoKQR8gjJAo3CXmFm4V8wi1CfuFWoYBQUCgk3CYUFooIRYViQnGhhFBSKCWUFsoItwtlhTuEcsKdQnmhglBRqCRUFqoIVYVqQnWhhlBTqCXcJdQW6gh1hXpCfaGB0FBoJDQWmghNhbuFZkJzoYVwj9BSuFdoJbQW2vyt0x8tjBHGCuOER4XxwmPCBGGiMEmYLDwuTBGmCk8I04TpwgzhSWGm8JQwS5gtzBHmCvOEp4X5wjPCAuFZYaGwSFgsLBGWCs8Jy4TnheXC
C8IK4UVhpbBKWC28JLwsvCKsEV4V1gqvCeuE14X1whvCBmGj8KawSXhL2Cy8LWwRtgrvCO8K24T3hO3C+8IOYaewS9gt7BH2CvuE/cIHwgHhQ+Gg8JFwSDgsHBE+Fo4KnwjHhE+F48Jnwgnhc+Gk8IVwSvhSOC18JZwRvhbOCt8I3wrfCd8LPwg/Cj8J54SfhfPCL8IF4VfhonBJICIjsiIn8qIgiqIkyqIiqqIm6qIhmqIl2qIjuqIn+mJMzCTeIGYWs4hZxWxidjGHmFPMJSaIucU84o1ioniTmFe8Wcwn3iLmF28VC4gFxULibWJhsYhYVCwmFhdLiCXFUmJpsYx4u1hWvEMsJ94plhcriBXFSmJlsYpYVawmVhdriDXFWuJdYm2xjlhXrCfWFxuIDcVGYmOxidhUvFtsJjYXW4j3iC3Fe8VWYmuxjdhWbCfeJ7YXO4gdxU5iZ7GL2FXsJnYXe4g9xV5ib/F+sY/YV+wn9hcHiA+IA8VB4mCRiEPEh8Sh4jAxSRwujhAfFkeKj4ijxNHiGHGsOE58VBwvPiZOECeKk8TJ4uPiFHGq+IQ4TZwuzhCfFGeKT4mzxNniHHGuOE98WpwvPiMuEJ8VF4qLxMXiEnGp+Jy4THxeXC6+IK4QXxRXiqvE1eJL4sviK+Ia8VVxrfiauE58XVwvviFuEDeKb4qbxLfEzeLb4hZxq/iO+K64TXxP3C6+L+4Qd4q7xN3iHnGvuE/cL34gHhA/FA+KH4mHxMPiEfFj8aj4iXhM/FQ8Ln4mnhA/F0+KX4inxC/F0+JX4hnxa/Gs+I34rfid+L34g/ij+JN4TvxZPC/+Il4QfxUvipdEIjESK3ESLwmSKEmSLCmSKmmSLhmSyTCESI7kSp7kSzEpk3SDlFnKImWVsknZpRxSTimXlCDllvJIN0qJ0k1SXulmKZ90i5RfulUqIBWUCkm3SYWlIlJRqZhUXCohlZRKSaWlMtLtUlnpDqmcdKdUXqogVZQqSZWlKlJVqZpUXaoh1ZRqSXdJtaU6Ul2pnlRfaiA1lBpJjaUmUlPpbqmZ1FxqId0jtZTulVpJraU2UlupnXSf1F7qIHWUOkmdpS5SV6mb1F3qIfWUekm9pfulPlJfqZ/UXxogPSANlAZJg6UHpSHSQ9JQaZiUJA2XRkgPSyOlR6RR0mhpjDRWGic9Ko2XHpMmSBOlSdJk6XFpijRVekKaJk2XZkhPSjOlp6RZ0mxpjjRXmic9Lc2XnpEWSM9KC6VF0mJpibRUek5aJj0vLZdekFZIL0orpVXSaukl6WXpFWmN9Kq0VnpNWie9Lq2X3pA2SBulN6VN0lvSZultaYu0VXpHelfaJr0nbZfel3ZIO6Vd0m5pj7RX2iftlz6QDkgfSgelj6RD0mHpiPSxdFT6RDomfSodlz6TTkifSyelL6RT0pfSaekr6Yz0tXRW+kb6VvpO+l76QfpR+kk6J/0snZd+kS5Iv0oXpUsSkRmZlTmZlwVZlCVZlhVZlTVZlw3ZlC3Zlh3ZlT3Zl2NyJvkGObOcRc4qZ5OzyznknHIuOUHOLeeRb5QT5ZvkvPLNcj75Fjm/fKtcQC4oF5JvkwvLReSicjG5uFxCLimXkkvLZeTb5bLyHXI5+U65vFxBrihXkivLVeSqcjW5ulxDrinXku+Sa8t15LpyPbm+3EBuKDeSG8tN5Kby3XIzubncQr5HbinfK7eSW8tt5LZyO/k+ub3cQe4od5I7y13krnI3ubvcQ+4p95J7y/fLfeS+cj+5vzxAfkAeKA+SB8sPykPkh+Sh8jA5SR4uj5AflkfKj8ij5NHyGHmsPE5+VB4vPyZPkCfKk+TJ8uPyFHmq/IQ8TZ4uz5CflGfKT8mz5NnyHHmuPE9+Wp4vPyMvkJ+VF8qL5MXyEnmp/Jy8TH5eXi6/IK+QX5RXyqvk1fJL8svyK/Ia+VV5rfyavE5+XV4vvyFvkDfKb8qb5LfkzfLb8hZ5q/yO/K68TX5P3i6/
L++Qd8q75N3yHnmvvE/eL38gH5A/lA/KH8mH5MPyEflj+aj8iXxM/lQ+Ln8mn5A/l0/KX8in5C/l0/JX8hn5a/ms/I38rfyd/L38g/yj/JN8Tv5ZPi//Il+Qf5UvypdkojAKq3AKrwiKqEiKrCiKqmiKrhiKqViKrTiKq3iKr8SUTMoNSmYli5JVyaZkV3IoOZVcSoKSW8mj3KgkKjcpeZWblXzKLUp+5ValgFJQKaTcphRWiihFlWJKcaWEUlIppZRWyii3K2WVO5Ryyp1KeaWCUlGppFRWqihVlWpKdaWGUlOppdyl1FbqKHWVekp9pYHSUGmkNFaaKE2Vu5VmSnOlhXKP0lK5V2mltFbaKG2Vdsp9Snulg9JR6aR0VrooXZVuSnelh9JT6aX0Vu5X+ih9lX5Kf2WA8oAyUBmkDFYeVIYoDylDlWFKkjJcGaE8rIxUHlFGKaOVMcpYZZzyqDJeeUyZoExUJimTlceVKcpU5QllmjJdmaE8qcxUnlJmKbOVOcpcZZ7ytDJfeUZZoDyrLFQWKYuVJcpS5TllmfK8slx5QVmhvKisVFYpq5WXlJeVV5Q1yqvKWuU1ZZ3yurJeeUPZoGxU3lQ2KW8pm5W3lS3KVuUd5V1lm/Kesl15X9mh7FR2KbuVPcpeZZ+yX/lAOaB8qBxUPlIOKYeVI8rHylHlE+WY8qlyXPlMOaF8rpxUvlBOKV8qp5WvlDPK18pZ5RvlW+U75XvlB+VH5SflnPKzcl75Rbmg/KpcVC4pRGVUVuVUXhVUUZVUWVVUVdVUXTVUU7VUW3VUV/VUX42pmdQb1MxqFjWrmk3NruZQc6q51AQ1t5pHvVFNVG9S86o3q/nUW9T86q1qAbWgWki9TS2sFlGLqsXU4moJtaRaSi2tllFvV8uqd6jl1DvV8moFtaJaSa2sVlGrqtXU6moNtaZaS71Lra3WUeuq9dT6agO1odpIbaw2UZuqd6vN1OZqC/UetaV6r9pKba22Uduq7dT71PZqB7Wj2kntrHZRu6rd1O5qD7Wn2kvtrd6v9lH7qv3U/uoA9QF1oDpIHaw+qA5RH1KHqsPUJHW4OkJ9WB2pPqKOUkerY9Sx6jj1UXW8+pg6QZ2oTlInq4+rU9Sp6hPqNHW6OkN9Up2pPqXOUmerc9S56jz1aXW++oy6QH1WXaguUherS9Sl6nPqMvV5dbn6grpCfVFdqfLqavUl9WX1FXWN+qq6Vn1NXae+rq5X31A3qBvVN9VN6lvqZvVtdYu6VX1HfVfdpr6nblffV3eoO9Vd6m51j7pX3afuVz9QD6gfqgfVj9RD6mH1iPqxelT9RD2mfqoeVz9TT6ifqyfVL9RT6pfqafUr9Yz6tXpW/Ub9Vv1O/V79Qf1R/Uk9p/6snld/US+ov6oX1Usq0RiN1TiN1wRN1CRN1hRN1TRN1wzN1CzN1hzN1TzN12JaJu0GLbOWRcuqZdOyazm0nFouLUHLreXRbtQStZu0vNrNWj7tFi2/dqtWQCuoFdJu0wprRbSiWjGtuFZCK6mV0kprZbTbtbLaHVo57U6tvFZBq6hV0iprVbSqWjWtulZDq6nV0u7Samt1tLpaPa2+1kBrqDXSGmtNtKba3VozrbnWQrtHa6ndq7XSWmtttLZaO+0+rb3WQeuoddI6a120rlo3rbvWQ+up9dJ6a/drfbS+Wj+tvzZAe0AbqA3SBmsPakO0h7Sh2jAtSRuujdAe1kZqj2ijtNHaGG2sNk57VBuvPaZN0CZqk7TJ2uPaFG2q9oQ2TZuuzdCe1GZqT2mztNnaHG2uNk97WpuvPaMt0J7VFmqLtMXaEm2p9py2THteW669oK3QXtRWaqu01dpL2svaK9oa7VVtrfaatk57XVuvvaFt0DZqb2qbtLe0zdrb2hZtq/aO9q62TXtP2669r+3Qdmq7tN3aHm2vtk/br32gHdA+1A5qH2mHtMPaEe1j7aj2iXZM+1Q7rn2mndA+105qX2intC+109pX
2hnta+2s9o32rfad9r32g/aj9pN2TvtZO6/9ol3QftUuapc0ojM6q3M6rwu6qEu6rCu6qmu6rhu6qVu6rTu6q3u6r8f0TPoNemY9i55Vz6Zn13PoOfVceoKeW8+j36gn6jfpefWb9Xz6LXp+/Va9gF5QL6TfphfWi+hF9WJ6cb2EXlIvpZfWy+i362X1O/Ry+p16eb2CXlGvpFfWq+hV9Wp6db2GXlOvpd+l19br6HX1enp9vYHeUG+kN9ab6E31u/VmenO9hX6P3lK/V2+lt9bb6G31dvp9enu9g95R76R31rvoXfVuene9h95T76X31u/X++h99X56f32A/oA+UB+kD9Yf1IfoD+lD9WF6kj5cH6E/rI/UH9FH6aP1MfpYfZz+qD5ef0yfoE/UJ+mT9cf1KfpU/Ql9mj5dn6E/qc/Un9Jn6bP1OfpcfZ7+tD5ff0ZfoD+rL9QX6Yv1JfpS/Tl9mf68vlx/QV+hv6iv1Ffpq/WX9Jf1V/Q1+qv6Wv01fZ3+ur5ef0PfoG/U39Q36W/pm/W39S36Vv0d/V19m/6evl1/X9+h79R36bv1PfpefZ++X/9AP6B/qB/UP9IP6Yf1I/rH+lH9E/2Y/ql+XP9MP6F/rp/Uv9BP6V/qp/Wv9DP61/pZ/Rv9W/07/Xv9B/1H/Sf9nP6zfl7/Rb+g/6pf1C/pxGAM1uAM3hAM0ZAM2VAM1dAM3TAM07AM23AM1/AM34gZmYwbjMxGFiOrkc3IbuQwchq5jAQjt5HHuNFING4y8ho3G/mMW4z8xq1GAaOgUci4zShsFDGKGsWM4kYJo6RRyihtlDFuN8oadxjljDuN8kYFo6JRyahsVDGqGtWM6kYNo6ZRy7jLqG3UMeoa9Yz6RgOjodHIaGw0MZoadxvNjOZGC+Meo6Vxr9HKaG20Mdoa7Yz7jPZGB6Oj0cnobHQxuhrdjO5GD6On0cvobdxv9DH6Gv2M/sYA4wFjoDHIGGw8aAwxHjKGGsOMJGO4McJ42BhpPGKMMkYbY4yxxjjjUWO88ZgxwZhoTDImG48bU4ypxhPGNGO6McN40phpPGXMMmYbc4y5xjzjaWO+8YyxwHjWWGgsMhYbS4ylxnPGMuN5Y7nxgrHCeNFYaawyVhsvGS8brxhrjFeNtcZrxjrjdWO98YaxwdhovGlsMt4yNhtvG1uMrcY7xrvGNuM9Y7vxvrHD2GnsMnYbe4y9xj5jv/GBccD40DhofGQcMg4bR4yPjaPGJ8Yx41PjuPGZccL43DhpfGGcMr40ThtfGWeMr42zxjfGt8Z3xvfGD8aPxk/GOeNn47zxi3HB+NW4aFwyiMmYrMmZvCmYoimZsqmYqqmZummYpmmZtumYrumZvhkzM5k3mJnNLGZWM5uZ3cxh5jRzmQlmbjOPeaOZaN5k5jVvNvOZt5j5zVvNAmZBs5B5m1nYLGIWNYuZxc0SZkmzlFnaLGPebpY17zDLmXea5c0KZkWzklnZrGJWNauZ1c0aZk2zlnmXWdusY9Y165n1zQZmQ7OR2dhsYjY17zabmc3NFuY9ZkvzXrOV2dpsY7Y125n3me3NDmZHs5PZ2exidjW7md3NHmZPs5fZ27zf7GP2NfuZ/c0B5gPmQHOQOdh80BxiPmQONYeZSeZwc4T5sDnSfMQcZY42x5hjzXHmo+Z48zFzgjnRnGRONh83p5hTzSfMaeZ0c4b5pDnTfMqcZc4255hzzXnm0+Z88xmTkCSy0FxkLjaXmEvN58zh5vPmcvMFc4X5ornSXGWuNl8yXzZfMdeYr5przdfMdebr5nrzDXODudF809xkvmVuNt82t5hbzXfMd81t5nvmdvN9c4e509xl7jb3mHvNfeZ+8wPzgPmhedD8yDxkHjaPmB+bR81PzGPmp+Zx8zPzhPm5edL8wjxlfmmeNr8yz5hfm2fNb8xvze/M780fzB/Nn8xz5s/mefMX84L5q3nRvGQSi7FYi7N4S7BES7JkS7FUS7N0y7BMy7Js
y7Fcy7N8K2Zlsm6wMltZrKxWNiu7lcPKaeWyEqzcVh7rRivRusnKa91s5bNusfJbt1oFrIJWIes2q7BVxCpqFbOKWyWsklYpq7RVxrrdKmvdYZWz7rTKWxWsilYlq7JVxapqVbOqWzWsmlYt6y6rtlXHqmvVs+pbDayGViOrsdXEamrdbTWzmlstrHuslta9ViurtdXGamu1s+6z2lsdrI5WJ6uz1cXqanWzuls9rJ5WL6u3db/Vx+pr9bP6WwOsB6yB1iBrsPWgNcR6yBpqDbOSrOHWCOtha6T1iDXKGm2NscZa46xHrfHWYxaTNNGaZE22HremWFOtJ6xp1nRrhvWkNdN6ypplzbbmWHOtedbT1nzrGWuB9ay10FpkLbaWWEut56xl1vPWcusFa4X1orXSWmWttl6yXrZesdZYr1prrdesddbr1nrrDWuDtdF609pkvWVttt62tlhbrXesd61t1nvWdut9a4e109pl7bb2WHutfdZ+6wPrgPWhddD6yDpkHbaOWB9bR61PrGPWp9Zx6zPrhPW5ddL6wjplfWmdtr6yzlhfW2etb6xvre+s760frB+tn6xz1s/WeesX64L1q3XRumQRm7FZm7N5W7BFW7JlW7FVW7N127BN27Jt27Fd27N9O2Znsm+wM9tZ7Kx2Nju7ncPOaeeyE+zcdh77RjvRvsnOa99s57NvsfPbt9oF7IJ2Ifs2u7BdxC5qF7OL2yXsknYpu7Rdxr7dLmvfYZez77TL2xXsinYlu7Jdxa5qV7Or2zXsmnYt+y67tl3HrmvXs+vbDeyGdiO7sd3EbmrfbTezm9st7Hvslva9diu7td3Gbmu3s++z29sd7I52J7uz3cXuanezu9s97J52L7u3fb/dx+5r97P72wPsB+yB9iB7sP2gPcR+yB5qD7OT7OH2CPthe6T9iD3KHm2Pscfa4+xH7fH2Y/YEe6I9yZ5sP25PsafaT9jT7On2DPtJe6b9lD3Lnm3Psefa8+yn7fn2M/YC+1l7ob3IXmwvsZfaz9nL7Oft5fYL9gr7RXulvcpebb9kv2y/Yq+xX7XX2q/Z6+zX7fX2G/YGe6P9pr3JfsvebL9tb7G32u/Y79rb7Pfs7fb79g57p73L3m3vsffa++z99gf2AftD+6D9kX3IPmwfsT+2j9qf2MfsT+3j9mf2Cftz+6T9hX3K/tI+bX9ln7G/ts/a39jf2t/Z39s/2D/aP9nn7J/t8/Yv9gX7V/uifckmDuOwDufwjuCIjuTIjuKojubojuGYjuXYjuO4juf4TszJ5NzgZHayOFmdbE52J4eT08nlJDi5nTzOjU6ic5OT17nZyefc4uR3bnUKOAWdQs5tTmGniFPUKeYUd0o4JZ1STmmnjHO7U9a5wynn3OmUdyo4FZ1KTmWnilPVqeZUd2o4NZ1azl1ObaeOU9ep59R3GjgNnUZOY6eJ09S522nmNHdaOPc4LZ17nVZOa6eN09Zp59zntHc6OB2dTk5np4vT1enmdHd6OD2dXk5v536nj9PX6ef0dwY4DzgDnUHOYOdBZ4jzkDPUGeYkOcOdEc7DzkjnEWeUM9oZ44x1xjmPOuOdx5wJzkRnkjPZedyZ4kx1nnCmOdOdGc6TzkznKWeWM9uZ48x15jlPO/OdZ5wFzrPOQmeRs9hZ4ix1nnOWOc87y50XnBXOi85KZ5Wz2nnJedl5xVnjvOqsdV5z1jmvO+udN5wNzkbnTWeT85az2Xnb2eJsdd5x3nW2Oe852533nR3OTmeXs9vZ4+x19jn7nQ+cA86HzkHnI+eQc9g54nzsHHU+cY45nzrHnc+cE87nzknnC+eU86Vz2vnKOeN87Zx1vnG+db5zvnd+cH50fnLOOT87551fnAtLfnUuOpcc4jIu63Iu7wqu6Equ7Cqu6mqu7hqu6Vqu7Tqu63qu78bcTO4NbmY3i5vVzeZmd3O4Od1cboKb283j3ugmuje5ed2b3XzuLW5+91a3gFvQLeTe5hZ2
i7hF3WJucbeEW9It5ZZ2y7i3u2XdO9xy7p1uebeCW9Gt5FZ2q7hV3WpudbeGW9Ot5d7l1nbruHXdem59t4Hb0G3kNnabuE3du91mbnO3hXuP29K9123ltnbbuG3ddu59bnu3g9vR7eR2dru4Xd1ubne3h9vT7eX2du93+7h93X5uf3eA+4A70B3kDnYfdIe4D7lD3WFukjvcHeE+7I50H3FHuaPdMe5Yd5z7qDvefcyd4E50J7mT3cfdKe5U9wl3mjvdneE+6c50n3JnubPdOe5cd577tDvffcZd4D7rLnQXuYvdJe5S9zl3mfu8u9x9wV3hvuiudFe5q92X3JfdV9w17qvuWvc1d537urvefcPd4G5033Q3uW+5m9233S3uVvcd9113m/ueu919393h7nR3ubvdPe5ed5+73/3APeB+6B50P3IPuYfdI+7H7lH3E/eY+6l73P3MPeF+7p50v3BPuV+6p92v3DPu1+5Z9xv3W/c793v3B/dH9yf3nPuze979xb3g/upedC+5xGM81uM83hM80ZM82VM81dM83TM807M823M81/M834t5mbwbvMxeFi+rl83L7uXwcnq5vAQvt5fHu9FL9G7y8no3e/m8W7z83q1eAa+gV8i7zSvsFfGKesW84l4Jr6RXyivtlfFu98p6d3jlvDu98l4Fr6JXyavsVfGqetW86l4Nr6ZXy7vLq+3V8ep69bz6XgOvodfIa+w18Zp6d3vNvOZeC+8er6V3r9fKa+218dp67bz7vPZeB6+j18nr7HXxunrdvO5eD6+n18vr7d3v9fH6ev28/t4A7wFvoDfIG+w96A3xHvKGesO8JG+4N8J72BvpPeKN8kZ7Y7yx3jjvUW+895g3wZvoTfIme497U7yp3hPeNG+6N8N70pvpPeXN8mZ7c7y53jzvaW++94y3wHvWW+gt8hZ7S7yl3nPeMu95b7n3grfCe9Fb6a3yVnsveS97r3hrvFe9td5r3jrvdW+994a3wdvovelt8t7yNntve1u8rd473rveNu89b7v3vrfD2+nt8nZ7e7y93j5vv/eBd8D70DvofeQd8g57R7yPvaPeJ94x71PvuPeZd8L73DvpfeGd8r70TntfeWe8r72z3jfet9533vfeD96P3k/eOe9n77z3i3fB+9W76F3yiM/4rM/5vC/4oi/5sq/4qq/5um/4pm/5tu/4ru/5vh/zM/k3+Jn9LH5WP5uf3c/h5/Rz+Ql+bj+Pf6Of6N/k5/Vv9vP5t/j5/Vv9An5Bv5B/m1/YL+IX9Yv5xf0Sfkm/lF/aL+Pf7pf17/DL+Xf65f0KfkW/kl/Zr+JX9av51f0afk2/ln+XX9uv49f16/n1/QZ+Q7+R39hv4jf17/ab+c39Fv49fkv/Xr+V39pv47f12/n3+e39Dn5Hv5Pf2e/id/W7+d39Hn5Pv5ff27/f7+P39fv5/f0B/gP+QH+QP9h/0B/iP+QP9Xk/yR/uj/Af9kf6j/ij/NH+GH+sP85/1B/vP+ZP8Cf6k/zJ/uP+FH+q/4Q/zZ/uz/Cf9Gf6T/mz/Nn+HH+uP89/2p/vP+Mv8J/1F/qL/MX+En+p/5y/zH/eX+6/4K/wX/RX+qv81f5L/sv+K/4a/1V/rf+av85/3V/vv+Fv8Df6b/qb/Lf8zf7b/hZ/q/+O/66/zX/P3+6/7+/wd/q7/N3+Hn+vv8/f73/gH/A/9A/6H/mH/MP+Ef9j/6j/iX/M/9Q/7n/mn/A/90/6X/in/C/90/5X/hn/a/+s/43/rf+d/73/g/+j/5N/zv/ZP+//4l/wf/Uv+pd8EmNibIyL8TEhJsakmBxTYmpMi+kxI2bGrJgdc2JuzIv5sVgsU+yGWOZYlljWWLZY9liOWM5YrlhCLHcsT+zGWGLsplje2M2xfLFbYvljt8YKxArGCsVuixWOFYkVjRWLFY+ViJWMlYqVjpWJ3R4rG7sjVi52Z6x8rEKsYqxSrHKsSqxqrFqseqxGrGasVuyu
WO1YnVjdWL1Y/ViDWMNYo1jjWJNY09jdsWax5rEWsXtiLWP3xlrFWsfaxNrG2sXui7WPdYh1jHWKdY51iXWNdYt1j/WI9Yz1ivWO3R/rE+sb6xfrHxsQeyA2MDYoNjj2YGxI7KHY0NiwWFJseOz/aLfHZz+CtoHzsZ2M7ZmemR7/Ytu2bdu2bdv2iZ2cnNi2TuzkJNl67hdb++z9Yqu2dvvdp7q66/r+AdcwZDgyAhmJjEJGI2OQscg4ZDwyAZmITEImI1OQqcg0ZDoyA5mJzEJmI3OQucg8ZD6yAFmILEIWI0uQpcgyZDmyAlmJrEJWI2uQtcg6ZD2yAdmIbEI2I1uQrcg2ZDuyA9mJ7EJ2I3uQvcg+ZD8ShxxADiKHkMPIEeQocgw5jpxATiKnkNPIGeQscg6JR84jCcgF5CJyCbmMXEGuIteQ68gN5CZyC7mN3EHuIveQ+8gD5CHyCHmMPEGeIs+Q58gL5CXyCnmNvEESkbfIO+Q98gH5iHxCPiNfkK/IN+Q78gP5ifxCfiNJyB/kL/IPSYYmR1OgKdFUaGo0DZoWTYemRzOgGdFMaGY0C5oVzYZmR3OgOdFcKIKiKIbiKIGSKIXSKIOyKIfyqICKqITKqIKqqIbqqIEC1EQt1EYh6qAu6qE+GqAhGqExNDeaB82L5kPzowXQgmghtDBaBC2KFkOLoyXQkmgptDRaBi2LlkPLoxXQimgltDJaBa2KVkOrozXQmmgttDZaB62L1kProw3QhmgjtDHaBG2KNkOboy3QlmgrtDXaBm2LtkPbox3QjmgntDPaBe2KdkO7oz3QnmgvtDfaB+2L9kP7owPQgeggdDA6BB2KDkOHoyPQkegodDQ6Bh2LjkPHoxPQiegkdDI6BZ2KTkOnozPQmegsdDY6B52LzkPnowvQhegidDG6BF2KLkOXoyvQlegqdDW6Bl2LrkPXoxvQjegmdDO6Bd2KbkO3ozvQnegudDe6B92L7kP3o3HoAfQgegg9jB5Bj6LH0OPoCfQkego9jZ5Bz6Ln0Hj0PJqAXkAvopfQy+gV9Cp6Db2O3kBvorfQ2+gd9C56D72PPkAfoo/Qx+gT9Cn6DH2OvkBfoq/Q1+gbNBF9i75D36Mf0I/oJ/Qz+gX9in5Dv6M/0J/oL/Q3moT+Qf+i/9BkWHIsBZYSS4WlxtJgabF0WHosA5YRy4RlxrJgWbFsWHYsB5YTy4UhGIphGI4RGIlRGI0xGItxGI8JmIhJmIwpmIppmI4ZGMBMzMJsDGIO5mIe5mMBFmIRFsNyY3mwvFg+LD9WACuIFcIKY0WwolgxrDhWAiuJlcJKY2Wwslg5rDxWAauIVcIqY1Wwqlg1rDpWA6uJ1cJqY3Wwulg9rD7WAGuINcIaY02wplgzrDnWAmuJtcJaY22wtlg7rD3WAeuIdcI6Y12wrlg3rDvWA+uJ9cJ6Y32wvlg/rD82ABuIDcIGY0OwodgwbDg2AhuJjcJGY2Owsdg4bDw2AZuITcImY1Owqdg0bDo2A5uJzcJmY3Owudg8bD62AFuILcIWY0uwpdgybDm2AluJrcJWY2uwtdg6bD22AduIbcI2Y1uwrdg2bDu2A9uJ7cJ2Y3uwvdg+bD8Whx3ADmKHsMPYEewodgw7jp3ATmKnsNPYGewsdg6Lx85jCdgF7CJ2CbuMXcGuYtew69gN7CZ2C7uN3cHuYvew+9gD7CH2CHuMPcGeYs+w59gL7CX2CnuNvcESsbfYO+w99gH7iH3CPmNfsK/YN+w79gP7if3CfmNJ2B/sL/YPS4Ynx1PgKfFUeGo8DZ4WT4enxzPgGfFMeGY8C54Vz4Znx3PgOfFcOIKjOIbjOIGTOIXTOIOzOIfzuICLuITLuIKruIbruIED3MQt3MYh7uAu7uE+HuAhHuExPDeeB8+L58Pz4wXwgnghvDBeBC+KF8OL4yXwkngpvDReBi+Ll8PL4xXwinglvDJeBa+KV8Or4zXwmngtvDZeB6+L18Pr4w3w
hngjvDHeBG+KN8Ob4y3wlngrvDXeBm+Lt8Pb4x3wjngnvDPeBe+Kd8O74z3wnngvvDfeB++L98P74wPwgfggfDA+BB+KD8OH4yPwkfgofDQ+Bh+Lj8PH4xPwifgkfDI+BZ+KT8On4zPwmfgsfDY+B5+Lz8Pn4wvwhfgifDG+BF+KL8OX4yvwlfgqfDW+Bl+Lr8PX4xvwjfgmfDO+Bd+Kb8O34zvwnfgufDe+B9+L78P343H4Afwgfgg/jB/Bj+LH8OP4Cfwkfgo/jZ/Bz+Ln8Hj8PJ6AX8Av4pfwy/gV/Cp+Db+O38Bv4rfw2/gd/C5+D7+PP8Af4o/wx/gT/Cn+DH+Ov8Bf4q/w1/gbPBF/i7/D3+Mf8I/4J/wz/gX/in/Dv+M/8J/4L/w3noT/wf/i//BkRHIiBZGSSEWkJtIQaYl0RHoiA5GRyERkJrIQWYlsRHYiB5GTyEUgBEpgBE4QBElQBE0wBEtwBE8IhEhIhEwohEpohE4YBCBMwiJsAhIO4RIe4RMBERIRESNyE3mIvEQ+Ij9RgChIFCIKE0WIokQxojhRgihJlCJKE2WIskQ5ojxRgahIVCIqE1WIqkQ1ojpRg6hJ1CJqE3WIukQ9oj7RgGhINCIaE02IpkQzojnRgmhJtCJaE22ItkQ7oj3RgehIdCI6E12IrkQ3ojvRg+hJ9CJ6E32IvkQ/oj8xgBhIDCIGE0OIocQwYjgxghhJjCJGE2OIscQ4YjwxgZhITCImE1OIqcQ0Yjoxg5hJzCJmE3OIucQ8Yj6xgFhILCIWE0uIpcQyYjmxglhJrCJWE2uItcQ6Yj2xgdhIbCI2E1uIrcQ2Yjuxg9hJ7CJ2E3uIvcQ+Yj8RRxwgDhKHiMPEEeIocYw4TpwgThKniNPEGeIscY6IJ84TCcQF4iJxibhMXCGuEteI68QN4iZxi7hN3CHuEveI+8QD4iHxiHhMPCGeEs+I58QL4iXxinhNvCESibfEO+I98YH4SHwiPhNfiK/EN+I78YP4SfwifhNJxB/iL/GPSEYmJ1OQKclUZGoyDZmWTEemJzOQGclMZGYyC5mVzEZmJ3OQOclcJEKiJEbiJEGSJEXSJEOyJEfypECKpETKpEKqpEbqpEEC0iQt0iYh6ZAu6ZE+GZAhGZExMjeZh8xL5iPzkwXIgmQhsjBZhCxKFiOLkyXIkmQpsjRZhixLliPLkxXIimQlsjJZJV1VshpZnaxB1iRrkbXJOmRdsh5Zn2xANiQbkY3JJmRTshnZnGxBtiRbka3JNmRbsh3ZnuxAdiQ7kZ3JLmRXshvZnexB9iR7kb3JPmRfsh/ZnxxADiQHkYPJIeRQchg5nBxBjiRHkaPJMeRYchw5npxATiQnkZPJKeRUcho5nZxBziRnkbPJOeRcch45n1xALiQXkYvJJeRSchm5nFxBriRXkavJNeRach25ntxAbiQ3kZvJLeRWchu5ndxB7iR3kbvJPeRech+5n4wjD5AHyUPkYfIIeZQ8Rh4nT5AnyVPkafIMeZY8R8aT58kE8gJ5kbxEXiavkFfJa+R18gZ5k7xF3ibvkHfJe+R98gH5kHxEPiafkE/JZ+Rz8gX5knxFvibfkInkW/Id+Z78QH4kP5GfyS/kV/Ib+Z38Qf4kf5G/ySTyD/mX/Ecmo5JTKaiUVCoqNZWGSkulo9JTGaiMVCYqM5WFykplo7JTOaicVC4KoVAKo3CKoEiKomiKoViKo3hKoERKomRKoVRKo3TKoABlUhZlU5ByKJfyKJ8KqJCKqBiVm8pD5aXyUfmpAlRBqhBVmCpCFaWKUcWpElRJqhRVmipDlaXKUeWpClRFqhJVmapCVaWqUdWpGlRNqhZVm6pD1aXqUfWpBlRDqhHVmGpCNaWaUc2pFlRLqhXVmmpDtaXaUe2pDlRHqhPVmepCdaW6Ud2pHlRPqhfVm+pD9aX6Uf2pAdRAahA1mBpCDaWGUcOpEdRIahQ1mhpDjaXGUeOpCdREahI1mZpCTaWm
UdOpGdRMahY1m5pDzaXmUfOpBdRCahG1mFpCLaWWUcupFdRKahW1mlpDraXWUeupDdRGahO1mdpCbaW2UdupHdROahe1m9pD7aX2UfupOOoAdZA6RB2mjlBHqWPUceoEdZI6RZ2mzlBnqXNUPHWeSqAuUBepS9Rl6gp1lbpGXaduUDepW9Rt6g51l7pH3aceUA+pR9Rj6gn1lHpGPadeUC+pV9Rr6g2VSL2l3lHvqQ/UR+oT9Zn6Qn2lvlHfqR/UT+oX9ZtKov5Qf6l/VDI6OZ2CTkmnolPTaei0dDo6PZ2BzkhnojPTWeisdDY6O52DzknnohEapTEapwmapCmaphmapTmapwVapCVaphVapTVapw0a0CZt0TYNaYd2aY/26YAO6YiO0bnpPHReOh+dny5AF6QL0YXpInRRuhhdnC5Bl6RL0aXpMnRZuhxdnq5AV6Qr0ZXpKnRVuhpdna5B16Rr0bXpOnRduh5dn25AN6Qb0Y3pJnRTuhndnG5Bt6Rb0a3pNnRbuh3dnu5Ad6Q70Z3pLnRXuhvdne5B96R70b3pPnRfuh/dnx5AD6QH0YPpIfRQehg9nB5Bj6RH0aPpMfRYehw9np5AT6Qn0ZPpKfRUeho9nZ5Bz6Rn0bPpOfRceh49n15AL6QX0YvpJfRSehm9nF5Br6RX0avpNfRaeh29nt5Ab6Q30ZvpLfRWehu9nd5B76R30bvpPfReeh+9n46jD9AH6UP0YfoIfZQ+Rh+nT9An6VP0afoMfZY+R8fT5+kE+gJ9kb5EX6av0Ffpa/R1+gZ9k75F36bv0Hfpe/R9+gH9kH5EP6af0E/pZ/Rz+gX9kn5Fv6bf0In0W/od/Z7+QH+kP9Gf6S/0V/ob/Z3+Qf+kf9G/6ST6D/2X/kcnY5IzKZiUTComNZOGScukY9IzGZiMTCYmM5OFycpkY7IzOZicTC4GYVAGY3CGYEiGYmiGYViGY3hGYERGYmRGYVRGY3TGYABjMhZjM5BxGJfxGJ8JmJCJmBiTm8nD5GXyMfmZAkxBphBTmCnCFGWKMcWZEkxJphRTminDlGXKMeWZCkxFphJTmanCVGWqMdWZGkxNphZTm6nD1GXqMfWZBkxDphHTmGnCNGWaMc2ZFkxLphXTmmnDtGXaMe2ZDkxHphPTmenCdGW6Md2ZHkxPphfTm+nDDFzdj+nPDGCaW4OYwcwQZigzjBnOjGBGMqOY0cwYZiwzjhnPTGAmMpOYycwUZiozjZnOzGBmMrOY2cwcZi4zj5nPLGAWMouYxcwSZimzjFnOrGBWMquY1cwaZi2zjlnPbGA2MpuYzcwWZiuzjdnO7GB2MruY3cweZi+zj9nPxDEHmIPMIeYwc4Q5yhxjjjMnmJPMKeY0c4Y5y5xj4pnzTAJzgbnIXGIuM1eYq8w15jpzg7nJ3GJuM3eYu8w95j7zgHnIPGIeM0+Yp8wz5jnzgnnJvGJeM2+YROYt8455z3xgPjKfmM/MF+Yr8435zvxgfjK/mN9MEvOH+cv8Y5KxydgUbAo2FZuKTcOmYdOx6dgMbAY2E5uJzcJmYbOx2dgcbA42F5uLRVmUxVmcJVmSpVma/Z/DszwrsiIrszKrsiqrszoLWMBarMVCFrIu67I+67MhG7IxNsbmYfOw+dh8bAG2AFuILcQWYYuwxdhibAm2BFuKLcWWYcuw5dhybAW2AluJrcRWYauw1dhqbA22BluLrcXWYeuw9dh6bAO2AduIbcQ2YZuwzdhmbAu2BduKbcW2Yduw7dh2bAe2A9uJ7cR2Ybuw3dhubA+2B9uL7cX2Yfuw/dh+7AB2ADuIHcQOYYeww9hh7Ah2BDuKHcWOYcew49hx7AR2AjuJncROYaew09hp7Ax2BjuLnc3OYeey89j57AJ2IbuIXcwuYZewy9hl7Ap2BbuKXcWuYdew69h17AZ2A7uJ3cRuYbew29ht7A52B7uL3cXuYfew+9h9bBwbxx5kD7KH2cPsUfYoe5w9zp5kT7Kn
2dPsWfYsG8/GswlsAnuRvcheZi+zV9mr7HX2OnuTvcneZm+zd9m77H32PvuQfcg+Zh+zT9mn7HP2OfuSfcm+Zl+ziWwi+459x35gP7Cf2E/sF/YL+439xv5gf7C/2F9sEpvE/mX/sum49FwGLiOXicvMZeGycv934xzBkRzF0RzDoRz2v8xyHKdzBgc4k7M4m4Oc81/Ow+Xl8nH5uQJcQa4QV/i/XIYry5XjynMVuIpcKa70/3IlrjJXhavLVePqczW4hlwtrjFXh6vL1ePqcw24hlwjrjHXhmvLtePacx24jlwnrvP/xf/+433cfu40d4Y7y53jbnN3uB/cT+4194b7xf3mBnGDuTHcWG4cN56bwE3kJnGT/8tzuLncPG4+t4BbyC3iFv+X13BruXXcem4Dt5HbxG3+L+/h9nLbuDhuB7eT28Xt/o//Z6Y47gB3kDvEHeaOcEe5eO44d4I7yZ36P2eN585zCdwF7iZ3i7vMXeGucte469yN//h/Ou5y97j73APuJfeKe8w94Z5yidxz7sV//D99idxb7h33nvvAfeQ+cUncF+4r9437/p/+/2lP4v5wf7l/XDI+OZ+CT8mn4lPzafi0fDo+PZ+Bz8hn4jPzWfisfDY+O5+Dz8nn4hEe5TEe5wme5Cme5hme5Tme5wVe5CVe5hVe5TVe5w0e8CZv8TYPeYd3eY/3+YAP+YiP8bn5PHxePh+fny/AF+QL8YX5InxRvhhfnC/Bl+RL8aX5MnxZvhxfnq/AV+Qr8ZX5KnxVvhpfna/B1+Rr8bX5Onxdvh5fn2/AN+Qb8Y35JnxTvhnfnG/Bt+Rb8a35Nnxbvh3fnu/Ad+Q78Z35LnxXvhvfne/B9+R78b35Pnxfvh/fnx/AD+QH8YP5IfxQfhg/nB/Bj+RH8aP5MfxYfhw/np/AT+Qn8ZP5KfxUfho/nZ/Bz+Rn8bP5Ofxcfh4/n1/AL+QX8Yv5JfxSfhm/nF/Br+RX8av5Nfxafh2/nt/Ab+Q38Zv5LfxWfhu/nd/B7+R38bv5Pfxefh+/n4/jD/AH+UP8Yf4If5Q/xh/nT/An+VP8af4Mf5Y/x8fz5/kE/gJ/kb/EX+av8Ff5a/x1/gZ/k7/F3+bv8Hf5e/x9/gH/kH/EP+af8E/5Z/xz/gX/kn/Fv+bf8In8W/4d/57/wH/kP/Gf+S/8V/4b/53/wf/kf/G/+ST+D/+X/8cnE5ILKYSUQiohtZBGSCukE9ILGYSMQiYhs5BFyCpkE7ILOYScQi4BEVABE3CBEEiBEmiBEViBE3hBEERBEmRBEVRBE3TBEIBgCpZgC1BwBFfwBF8IhFCIhJiQW8gj5BXyCfmFAkJBoZBQWCgiFBWKCcWFEkJJoZRQWigjlBXKCeWFCkJFwRcqC1WEqkI1obpQQ6gp1BJqC3WEukI9ob7QQGgoNBIaC02EpkIzobnQQmgptBJaC22EtkI7ob3QQegodBI6/z/eDxGGCsOE4cJwYaQwShgtjBHGCuOE8cIEYaIwSZgsTBGmCtOE6cIMYaYwS5gtzBHmCvOE+cICYaGwSFgsLBGWCsuE5cIKYaWwSlgtrBHWCuuE9cIGYaOwSdgsbBG2CtuE7cIOYaewS9gt7BH2CvuE/UKccEA4KBwSDgtHhKPCMeG4cEI4KZwSTgtnhLPCOSFeOC8kCBeEi8Il4bJwRbgqXBOuCzeEm8It4bZwR7gr3BPuCw+Eh8Ij4bHwRHgqPBOeCy+El8Ir4bXwRkgU3grvhPfCB+Gj8En4LHwRvgrfhO/CD+Gn8Ev4LSQJf4S/wj8hmZhcTCGmFFOJqcU0YloxnZhezCBmFDOJmcUsYlYxm5hdzCHmFHOJiIiKmIiLhEiKlEiLjMiKnMiLgiiKkiiLiqiKmqiLhghEU7REW4SiI7qiJ/piIIZiJMbE3GIeMa+YT8wvFhALioXEwmIRsahYTCwulhBLiqXE0mIZsaxYTiwvVhAripXEymIVsapYTawu1hBrirXE2mId
sa5YT6wvNhAbio3ExmITsanYTGwuthBbiq3E1mIbsa3YTmwvdhA7ip3EzmIXsavYTewu9hB7ir3E3mIfsa/YT+wvDhAHioPEweIQcag4TBwujhBHiqPE0eIYcaw4ThwvThAnipPEyeIUcao4TZwuzhBnirPE2eIcca44T5wvLhAXiovExeIScam4TFwurhBXiqvE1eIaca24TlwvbhA3ipvEzeIWcau4Tdwu7hB3irvE3eIeca+4T9wvxokHxIPiIfGweEQ8Kh4Tj4snxJPiKfG0eEY8K54T48XzYoJ4QbwoXhIvi1fEq+I18bp4Q7wp3hJvi3fEu+I98b74QHwoPhIfi0/Ep+Iz8bn4QnwpvhJfi2/ERPGt+E58L34QP4qfxM/iF/Gr+E38Lv4Qf4q/xN9ikvhH/Cv+E5NJyaUUUkoplZRaSiOlldJJ6aUMUkYpk5RZyiJllbJJ2aUcUk4pl4RIqIRJuERIpERJtMRIrMRJvCRIoiRJsqRIqqRJumRIQDIlS7IlKDmSK3mSLwVSKEVSTMot5ZHySvmk/FIBqaBUSCosFZGKSsWk4lIJqaRUSiotlZHKSuWk8lIFqaJUSaosVZGqStWk6lINqaZUS6ot1ZHqSvWk+lIDqaHUSGosNZGaSs2k5lILqaXUSmottZHaSu2k9lIHqaPUSeosdZG6St2k7lIPqafUS+ot9ZH6Sv2k/tIAaaA0SBosDZGGSsOk4dIIaaQ0ShotjZHGSuOk8dIEaaI0SZosTZGmStOk6dIMaaY0S5otzZHmSvOk+dICaaG0SFosLZGWSsuk5dIKaaW0SlotrZHWSuuk9dIGaaO0SdosbZG2Stuk7dIOaae0S9ot7ZH2Svuk/VKcdEA6KB2SDktHpKPSMem4dEI6KZ2STktnpLPSOSleOi8lSBeki9Il6bJ0RboqXZOuSzekm9It6bZ0R7or3ZPuSw+kh9Ij6bH0RHoqPZOeSy+kl9Ir6bX0RkqU3krvpPfSB+mj9En6LH2RvkrfpO/SD+mn9Ev6LSVJf6S/0j8pmZxcTiGnlFPJqeU0clo5nZxeziBnlDPJmeUsclY5m5xdziHnlHPJiIzKmIzLhEzKlEzLjMzKnMzLgizKkizLiqzKmqzLhgxkU7ZkW4ayI7uyJ/tyIIdyJMfk3HIeOa+cT84vF5ALyoXkwnIRuahcTC4ul5BLyqXk0nIZuaxcTi4vV5ArypXkynIVuapcTa4u15BryrXk2nIdua5cT64vN5Abyo3kxnITuancTG4ut5Bbyq3k1nIbua3cTm4vd5A7yp3kznIXuavcTe4u95B7yr3k3nIfua/cT+4vD5AHyoPkwfIQeag8TB4uj5BHyqPk0fIYeaw8Th4vT5AnypPkyfIUeao8TZ4uz5BnyrPk2fIcea48T54vL5AXyovkxfISeam8TF4ur5BXyqvk1fIaea28Tl4vb5A3ypvkzfIWeau8Td4u75B3yrvk3fIeea+8T94vx8kH5IPyIfmwfEQ+Kh+Tj8sn5JPyKfm0fEY+K5+T4+XzcoJ84f/Vm9fyGzlRfiu/k9/LH+SP8if5s/xF/ip/k7/LP+Sf8i/5t5wk/5H/yv/kZEpyJYWSUkmlpFbSKGmVdEp6JYOSUcmkZFayKFmVbEp2JYeSU8mlIAqqYAquEAqpUAqtMAqrcAqvCIqoSIqsKIqqaIquGApQTMVSbAUqjuIqnuIrgRIqkRJTcit5lLxKPiW/UkApqBRSCitFlKJKMaW4UkIpqZRSSitllLJKOaW8UkGpqFRSKitVlKpKNaW6UkOpqdRSait1lLpKPaW+0kBpqDRSGitNlKZKM6W50kJpqbRSWittlLZKO6W90kHpqHRSOitdlK5KN6W70kPpqfRSeit9lL5KP6W/MkAZqAxSBitDlKHKMGW4MkIZqYxSRitjlLHKOGW8MkGZqExSJitTlKnKNGW6MkOZqcxSZitzlLnKPGW+skBZqCxSFitLlKXKMmW5skJZqaxS
VitrlLXKOmW9skHZqGxSNitblK3KNmW7skPZqexSdit7lL3KPmW/EqccUA4qh5TDyhHlqHJMOa6cUE4qp5TTyhnlrHJOiVfOKwnKBeWickm5rFxRrirXlOvKDeWmcku5rdxR7ir3lPvKA+Wh8kh5rDxRnirPlOfKC+Wl8kp5rbxREpW3yjvlvfJB+ah8Uj4rX5Svyjflu/JD+an8Un4rScof5a/yT0mmJldTqCnVVGpqNY2aVk2nplczqBnVTGpmNYuaVc2mZldzqDnVXCqioiqm4iqhkiql0iqjsiqn8qqgiqqkyqqiqqqm6qqhAtVULdVWoeqoruqpvhqooRqpMTW3mkfNq+ZT86sF1IJqIbWwWkQtqhZTi6sl1JJqKbW0WkYtq5ZTy6sV1IpqJbWyWkWtqlZTq6s11JpqLbW2Wketq9ZT66sN1IZqI7Wx2kRtqjZTm6st1JZqK7W12kZtq7ZT26sd1I5qJ7Wz2kXtqnZTu6s91J5qL7W32kftq/ZT+6sD1IHqIHWwOkQdqg5Th6sj1JHqKHW0OkYdq45Tx6sT1InqJHWyOkWdqk5Tp6sz1JnqLHW2Okedq85T56sL1IXqInWxukRdqi5Tl6sr1JXqKnW1ukZdq65T16sb1I3qJnWzukXdqm5Tt6s71J3qLnW3ukfdq+5T96tx6gH1oHpIPaweUY+qx9Tj6gn1pHpKPa2eUc+q59R49byaoF5QL6qX1MvqFfWqek29rt5Qb6q31NvqHfWuek+9rz5QH6qP1MfqE/Wp+kx9rr5QX6qv1NfqGzVRfau+U9+rH9SP6if1s/pF/ap+U7+rP9Sf6i/1t5qk/lH/qv/UZFpyLYWWUkulpdbSaGm1dFp6LYOWUcukZdayaFm1bFp2LYeWU8ulIRqqYRquERqpURqtMRqrcRqvCZqoSZqsKZqqaZquGRrQTM3SbA1qjuZqnuZrgRZqkRbTcmt5tLxaPi2/VkArqBXSCmtFtKJaMa24VkIrqZXSSmtltLJaOa28VkGrqFXSKmtVtKpaNa26VkOrqdXSamt1tLpaPa2+1kBrqDXSGmtNtKZaM6251kJrqbXSWmtttLZaO6291kHrqHXSOmtdtK5aN6271kPrqfXSemt9tL5aP62/NkAbqA3SBmtDtKHaMG24NkIbqY3SRmtjtLHaOG28NkGbqE3SJmtTtKnaNG26NkObqc3SZmtztLnaPG2+tkBbqC3SFmtLtKXaMm25tkJbqa3SVmtrtLXaOm29tkHbqG3SNmtbtK3aNm27tkPbqe3Sdmt7tL3aPm2/Fqcd0A5qh7TD2hHtqHZMO66d0E5qp7TT2hntrHZOi9fOawnaBe2idkm7rF3RrmrXtOvaDe2mdku7rd3R7mr3tPvaA+2h9kh7rD3RnmrPtOfaC+2l9kp7rb3RErW32jvtvfZB+6h90j5rX7Sv2jftu/ZD+6n90n5rSdof7a/2T0umJ9dT6Cn1VHpqPY2eVk+np9cz6Bn1THpmPYueVc+mZ9dz6Dn1XDqiozqm4zqhkzql0zqjszqn87qgi7qky7qiq7qm67qhA93ULd3Woe7oru7pvh7ooR7pMT23nkfPq+fT8+sF9IJ6Ib2wXkQvqhfTi+sl9JJ6Kb20XkYvq5fTy+sV9Ip6Jb2yXkWvqlfTq+s19Jp6Lb22Xkevq9fT6+sN9IZ6I72x3kRvqjfTm+st9JZ6K7213kZvq7fT2+sd9I56J72z3kXvqnfTu+s99J56L7233kfvq/fT++sD9IH6IH2wPkQfqg/Th+sj9JH6KH20PkYfq4/Tx+sT9In6JH2yPkWfqk/Tp+sz9Jn6LH22Pkefq8/T5+sL9IX6In2xvkRfqi/Tl+sr9JX6Kn21vkZfq6/T1+sb9I36Jn2zvkXfqm/Tt+s79J36Ln23vkffq+/T9+tx+gH9oH5IP6wf0Y/qx/Tj+gn9pH5KP62f0c/q5/R4/byeoF/QL+qX9Mv6Ff2qfk2/rt/Qb+q39Nv6Hf2u
fk+/rz/QH+qP9Mf6E/2p/kx/rr/QX+qv9Nf6Gz1Rf6u/09/rH/SP+if9s/5F/6p/07/rP/Sf+i/9t56k/9H/6v/0ZEZyI4WR0khlpDbSGGmNdEZ6I4OR0chkZDayGFmNbEZ2I4eR08hlIAZqYAZuEAZpUAZtMAZrcAZvCIZoSIZsKIZqaIZuGAYwTMMybAMajuEanuEbgREakREzcht5jLxGPiO/UcAoaBQyChtFjKJGMaO4UcIoaZQyShtljLJGOaO8UcGoaFQyKhtVjKpGNaO6UcOoadQyaht1jLpGPaO+0cBoaDQyGhtNjKZGM6O50cJoabQyWhttjLZGO6O90cHoaHQyOhtdjK5GN6O70cPoafQyeht9jL5GP6O/McAYaAwyBhtDjKHGMGO4McIYaYwyRhtjjLHGOGO8McGYaEwyJhtTjKnGNGO6McOYacwyZhtzjLnGPGO+scBYaCwyFhtLjKXGMmO5scJYaawyVhtrjLXGOmO9scHYaGwyNhtbjK3GNmO7scPYaewydht7jL3GPmO/EWccMA4ah4zDxhHjqHHMOG6cME4ap4zTxhnjrHHOiDfOGwnGBeOiccm4bFwxrhrXjOvGDeOmccu4bdwx7hr3jPvGA+Oh8ch4bDwxnhrPjOfGC+Ol8cp4bbwxEo23xjvjvfHB+Gh8Mj4bX4yvxjfju/HD+Gn8Mn4bScYf46/xz0gGkoMUICVIBVKDNCAtSAfSgwwgI8gEMoMsICvIBrKDHCAnyAUQgAIM4IAAJKAADRjAAg7wQAAikIAMFKACDejAAACYwAI2gMABLvCADwIQggjEQG6QB+QF+UB+UAAUBIVAYVAEFAXFQHFQApQEpUBpUAaUBeVAeVABVASVQGVQBVQF1UB1UAPUBLVAbVAH1AX1QH3QADQEjUBj0AQ0Bc1Ac9ACtAStQGvQBrQF7UB70AF0BJ1AZ9AFdAXdQHfQA/QEvUBv0Af0Bf1AfzAADASDwGAwBAwFw8BwMAKMBKPAaDAGjAXjwHgwAUwEk8BkMAVMBdPAdDADzASzwGwwB8wF88B8sAAsBIvAYrAELAXLwHKwAqwEq8BqsAasBevAerABbASbwGawBWwF28B2sAPsBLvAbrAH7AX7wH4QBw6Ag+AQOAyOgKPgGDgOToCT4BQ4Dc6As+AciAfnQQK4AC6CS+AyuAKugmvgOrgBboJb4Da4A+6Ce+A+eAAegkfgMXgCnoJn4Dl4AV6CV+A1eAMSwVvwDrwHH8BH8Al8Bl/AV/ANfAc/wE/wC/wGSeAP+Av+gWRmcjOFmdJMZaY205hpzXRmejODmdHMZGY2s5hZzWxmdjOHmdPMZSImamImbhImaVImbTIma3ImbwqmaEqmbCqmamqmbhomME3TMm0Tmo7pmp7pm4EZmpEZM3Obecy8Zj4zv1nALGgWMgubRcyiZjGzuFnCLGmWMkubZcyyZjmzvFnBrGhWMiubVcyqZjWzulnDrGnWMmubdcy6Zj2zvtnAbGg2MhubTcymZjOzudnCbGm2Mlubbcy2ZjuzvdnB7Gh2MjubXcyuZjezu9nD7Gn2Mnubfcy+Zj+zvznAHGgOMgebQ8yh5jBzuDnCHGmOMkebY8yx5jhzvDnBnGhOMiebU8yp5jRzujnDnGnOMmebc8y55jxzvrnAXGguMhebS8yl5jJzubnCXGmuMleba8y15jpzvbnB3GhuMjebW8yt5jZzu7nD3GnuMnebe8y95j5zvxlnHjAPmofMw+YR86h5zDxunjBPmqfM0+YZ86x5zow3z5sJ5gXzonnJvGxeMa+a18zr5g3zpnnLvG3eMe+a98z75gPzofnIfGw+MZ+az8zn5gvzpfnKfG2+MRPNt+Y78735wfxofjI/m1/Mr+Y387v5w/xp/jJ/m0nmH/Ov+c9MZiW3UlgprVRWaiuNldZKZ6W3MlgZrUxWZiuLldXKZmW3clg5rVwWYqEWZuEWYZEWZdEWY7EWZ/GWYImWZMmWYqmW
ZumWYQHLtCzLtqDlWP///t/YamI1sZpZza0WVr4UhVK0tlpbba22VnurvdXR6mR1trpYXa1uVjerh9XT6mn1tvpYfa1+Vn9rgDXQGmQNtoZYQ6xh1jBrhDXCGmWNssZYY6xx1jhrgjXBmmRNsqZYU6xp1jRrhjXDmmXNsuZYc6x51jxrgbXAWmQtspZYS6xl1jJrhbXCWmWtstZYa6x11jprg7XB2mRtsrZYW6xt1jZrh7XD2mXtsvZYe6x91j4rzoqzDloHrcPWYeuoddQ6bh23TlonrdPWaeusddaKt+KtBCvBumhdtC5bl62r1lXrunXdumndtG5bt6271l3rvnXfemg9tB5bj62n1jPrufXCemm9sl5bb6xE6631znpvfbA+Wp+sz9YX66v1zfpu/bB+Wr+s31aS9cf6a/2z/r/cX1RtzdZtwwa2aVu2bUPbsV3bs307sEM7smN2bjuPndfOZ+e3C9gF7UJ2YbuIXdQuZhe3S9gl7VJ2abuMXdYuZ5e3K9gV7Up2ZbuKXdWuZle3a9g17Vp2bbuOXdeuZ9e3G9gN7UZ2Y7uJ3dRuZje3W9gt7VZ2a7uN3dZuZ7e3O9gd7U52Z7uL3dXuZne3e9g97V52b7uP3dfuZ/e3B9gD7UH2YHuIPdQeZg+3R9gj7VH2aHuMPdYeZ4+3J9gT7Un2ZHuKPdWeZk+3Z9gz7Vn2bHuOPdeeZ8+3F9gL7UX2YnuJvdReZi+3V9gr7VX2anuNvdZeZ6+3N9gb7U32ZnuLvdXeZm+3d9g77V32bnuPvdfeZ++34+wD9kH7kH3YPmIftY/Zx+0T9kn7lH3aPmOftc/Z8fZ5O8G+YF+0L9mX7Sv2Vfuafd2+Yd+0b9m37Tv2Xfuefd9+YD+0H9mP7Sf2U/uZ/dx+Yb+0X9mv7Td2ov3Wfme/tz/YH+1P9mf7i/3V/mZ/t3/YP+1f9m87yf5j/7X/2clgcpgCpoSpYGqYBqaF6WB6mAFmhJlgZpgFZoXZYHaYA+aEuSACUYhBHBKQhBSkIQNZyEEeClCEEpShAlWoQR0aEEATWtCGEDrQhR70YQBDGMEYzA3zwLwwH8wPC8CCsBAsDIvAorAYLA5LwJKwFCwNy8CysBwsDyvAirASrAyrwKqwGqwOa8CasBasDevAurAerA8bwIawEWwMm8CmsBlsDlvAlrAVbA3bwLawHWwPO8COsBPsDLvArrAb7A57wJ6wF+wN+8C+sB/sDwfAgXAQHAyHwKFwGBwOR8CRcBQcDcfAsXAcHA8nwIlwEpwMp8CpcBqcDmfAmXAWnA3nwLlwHpwPF8CFcBFcDJfApXAZXA5XwJVwFVwN18C1cB1cDzfAjXAT3Ay3wK1wG9wOd8CdcBfcDffAvXAf3A/j4AF4EB6Ch+EReBQeg8fhCXgSnoKn4Rl4Fp6D8fA8TIAX4EV4CV6GV+BVeA1ehzfgTXgL3oZ34F14D96HD+BD+Ag+hk/gU/gMPocv4Ev4Cr6Gb2AifAvfwffwA/wIP8HP8Av8Cr/B7/AH/Al/wd8wCf6Bf+E/mMxJ7qRwUjqpnNROGietk85J72RwMjqZnMxOFierk83J7uRwcjq5HMRBHczBHcIhHcqhHcZhHc7hHcERHcmRHcVRHc3RHcMBjulYju1Ax3Fcx3N8J3BCJ3JiTm4nj5PXyefkdwo4BZ1CTmGniFPUKeYUd0o4JZ1STmmnjFPWKeeUdyo4FZ1KTmWnilPVqeZUd2o4NZ1aTm2njlPXqefUdxo4DZ1GTmOnidPUaeY0d1o4LZ1WTmunjdPWaee0dzo4HZ1OTmeni9PV6eZ0d3o4PZ1eTm+nj9PX6ef0dwY4A51BzmBniDPUGeYMd0Y4I51RzmhnjDPWGeeMdyY4E51JzmRnijPVmeZMd2Y4M51ZzmxnjjPXmefMdxY4C51FzmJnibPUWeYsd1Y4K51VzmpnjbPWWeesdzY4G51NzmZni7PV2eZsd3Y4O51dzm5nj7PX2efsd+Kc
A85B55Bz2DniHHWOOcedE85J55Rz2jnjnHXOOfHOeSfBueBcdC45l50rzlXnmnPdueHcdG45t507zl3nnnPfeeA8dB45j50nzlPnmfPceeG8dF45r503TqLz1nnnvHc+OB+dT85n54vz1fnmfHd+OD+dX85vJ8n54/x1/jnJ3ORuCjelm8pN7aZx07rp3PRuBjejm8nN7GZxs7rZ3OxuDjenm8tFXNTFXNwlXNKlXNplXNblXN4VXNGVXNlVXNXVXN01XOCaruXaLnQd13U913cDN3QjN+bmdvO4ed18bn63gFvQLeQWdou4Rd1ibnG3hFvSLeWWdsu4Zd1ybnm3glvRreRWdqu4Vd1qbnW3hlvTreXWduu4dd16bn23gdvQbeQ2dpu4Td1mbnO3hdvSbeW2dtu4bd12bnu3g9vR7eR2dru4Xd1ubne3h9vT7eX2dvu4fd1+bn93gDvQHeQOdoe4Q91h7nB3hDvSHeWOdse4Y91x7nh3gjvRneROdqe4U91p7nR3hjvTneXOdue4c9157nx3gbvQXeQudpe4S91l7nJ3hbvSXeWudte4a9117np3g7vR3eRudre4W91t7nZ3h7vT3eXudve4e9197n43zj3gHnQPuYfdI+5R95h73D3hnnRPuafdM+5Z95wb7553E9wL7kX3knvZveJeda+5190b7k33lnvbvePede+5990H7kP3kfvYfeI+dZ+5z90X7kv3lfvafeMmum/dd+5794P70f3kfna/uF/db+5394f70/3l/naT3D/uX/efm8xL7qXwUnqpvNReGi+tl85L72XwMnqZvMxeFi+rl83L7uXwcnq5PMRDPczDPcIjPcqjPcZjPc7jPcETPcmTPcVTPc3TPcMDnulZnu1Bz/Fcz/N8L/BCL/JiXm4vj5fXy+fl9wp4Bb1CXmGviFfUK+YV90p4Jb1SXmmvjFfWK+eV9yp4Fb1KXmWvilfVq+ZV92p4Nb1aXm2vjlfXq+fV9xp4Db1GXmOvidfUa+Y191p4Lb1WXmuvjdfWa+e19zp4Hb1OXmevi9fV6+Z193p4Pb1eXm+vj9fX6+f19wZ4A71B3mBviDfUG+YN90Z4I71R3mhvjDfWG+eN9yZ4E71J3mRvijfVm+ZN92Z4M71Z3mxvjjfXm+fN9xZ4C71F3mJvibfUW+Yt91Z4K71V3mpvjbfWW+et9zZ4G71N3mZvi7fV2+Zt93Z4O71d3m5vj7fX2+ft9+K8A95B75B32DviHfWOece9E95J75R32jvjnfXOefHeeS/Bu+Bd9C55l70r3lXvmnfdu+Hd9G55t7073l3vnnffe+A99B55j70n3lPvmffce+G99F55r703XqL31nvnvfc+eB+9T95n74v31fvmffd+eD+9X95vL8n74/31/nnJ/OR+Cj+ln8pP7afx0/rp/PR+Bj+jn8nP7Gfxs/rZ/Ox+Dj+nn8tHfNTHfNwnfNKnfNpnfNbnfN4XfNGXfNlXfNXXfN03fOCbvuXbPvQd3/U93/cDP/QjP+bn9vP4ef18fn6/gF/QL+QX9ov4Rf1ifnG/hF/SL+WX9sv4Zf1yfnm/gl/Rr+RX9qv4Vf1qfnW/hl/Tr+XX9uv4df16fn2/gd/Qb+Q39pv4Tf1mfnO/hd/Sb+W39tv4bf12fnu/g9/R7+R39rv4Xf1ufne/h9/T7+X39vv4ff1+fn9/gD/QH+QP9of4Q/1h/nB/hD/SH+WP9sf4Y/1x/nh/gj/Rn+RP9qf4U/1p/nR/hj/Tn+XP9uf4c/15/nx/gb/QX+Qv9pf4S/1l/nJ/hb/SX+Wv9tf4a/11/np/g7/R3+Rv9rf4W/1t/nZ/h7/T3+Xv9vf4e/19/n4/zj/gH/QP+Yf9I/5R/5h/3D/hn/RP+af9M/5Z/5wf75/3E/wL/kX/kn/Zv+Jf9a/51/0b/k3/ln/bv+Pf9e/59/0H/kP/kf/Yf+I/9Z/5z/0X/kv/lf/af+Mn+m/9d/57/4P/0f/k
f/a/+F/9b/53/4f/0//l//aT/D/+X/+fnyxIHqQIUgapgtRBmiBtkC5IH2QIMgaZgsxBliBrkC3IHuQIcga5AiRAAyzAAyIgAyqgAyZgAy7gAyEQAymQAyVQAy3QAyMAgRlYgR3AwAncwAv8IAjCIApiQe4gT5A3yBfkDwoEBYNCQeGgSFA0KBYUD0oEJYNSQemgTFA2KBeUDyoEFYNKQeWgSlA1qBZUD2oENYNaQe2gTlA3qBfUDxoEDYNGQeOgSdA0aBY0D1oELYNWQeugTdA2aBe0DzoEHYNOQeegS9A16BZ0D3oEPYNeQe+gT9A36Bf0DwYEA4NBweBgSDA0GBYMD0YEI4NRwehgTDA2GBeMDyYEE4NJweRgSjA1mBZMD2YEM4NZwexgTjA3mBfMDxYEC4NFweJgSbA0WBYsD1YEK4NVwepgTbA2WBesDzYEG4NNweZgS7A12BZsD3YEO4Ndwe5gT7A32BfsD+KCA8HB4FBwODgSHA2OBceDE8HJ4FRwOjgTnA3OBfHB+SAhuBBcDC4Fl4MrwdXgWnA9uBHcDG4Ft4M7wd3gXnA/eBA8DB4Fj4MnwdPgWfA8eBG8DF4Fr4M3QWLwNngXvA8+BB+DT8Hn4EvwNfgWfA9+BD+DX8HvICn4E/wN/gXJwuRhijBlmCpMHaYJ04bpwvRhhjBjmCnMHGYJs4bZwuxhjjBnmCtEQjTEQjwkQjKkQjpkQjbkQj4UQjGUQjlUQjXUQj00QhCaoRXaIQyd0A290A+DMAyjMBbmDvOEecN8Yf6wQFgwLBQWDouERcNiYfGwRFgyLBWWDsuEZcNyYfmwQlgxrBRWDquEVcNqYfWwRlgzrBXWDuuEdcN6Yf2wQdgwbBQ2DpuETcNmYfOwRdgybBW2DtuEbcN2YfuwQ9gx7BR2DruEXcNuYfewR9gz7BX2DvuEfcN+Yf9wQDgwHBQODoeEQ8Nh4fBwRDgyHBWODseEY8Nx4fhwQjgxnBRODqeEU8Np4fRwRjgznBXODueEc8N54fxwQbgwXBQuDpeES8Nl4fJwRbgyXBWuDteEa8N14fpwQ7gx3BRuDreEW8Nt4fZwR7gz3BXuDveEe8N94f4wLjwQHgwPhYfDI+HR8Fh4PDwRngxPhafDM+HZ8FwYH54PE8IL4cXwUng5vBJeDa+F18Mb4c3wVng7vBPeDe+F98MH4cPwUfg4fBI+DZ+Fz8MX4cvwVfg6fBMmhm/Dd+H78EP4MfwUfg6/hF/Db+H38Ef4M/wV/g6Twj/h3/BfmCxKHqWIUkapotRRmihtlC5KH2WIMkaZosxRlihrlC3KHuWIcka5IiRCIyzCIyIiIyqiIyZiIy7iIyESIymSIyVSIy3SIyMCkRlZkR3ByIncyIv8KIjCKIpiUe4oT5Q3yhfljwpEBaNCUeGoSFQ0KhYVj0pEJaNSUemoTFQ2KheVjypEFaNKUeWoSlQ1qhZVj2pENaNaUe2oTlQ3qhfVjxpEDaNGUeOoSdQ0ahY1j1pELaNWUeuoTdQ2ahe1jzpEHaNOUeeoS9Q16hZ1j3pEPaNeUe+oT9Q36hf1jwZEA6NB0eBoSDQ0GhYNj0ZEI6NR0ehoTDQ2GheNjyZEE6NJ0eRoSjQ1mhZNj2ZEM6NZ0exoTjQ3mhfNjxZEC6NF0eJoSbQ0WhYtj1ZEK6NV0epoTbQ2WhetjzZEG6NN0eZoS7Q12hZtj3ZEO6Nd0e5oT7Q32hftj+KiA9HB6FB0ODoSHY2ORcejE9HJ6FR0OjoTnY3ORfHR+SghuhBdjC5Fl6Mr0dXoWnQ9uhHdjG5Ft6M70d3oXnQ/ehA9jB5Fj6Mn0dPoWfQ8ehG9jF5Fr6M3UWL0NnoXvY8+RB+jT9Hn6Ev0NfoWfY9+RD+jX9HvKCn6E/2N/kXJYsljKWIpY6liqWNpYmlj6WLpYxliGWOZYpljWWJZY9li2WM5YjljuWJIDI1hMTxGxMgYFaNjTIyNcTE+JsTEmBSTY0pMjWkxPWbE
QMyMWTE7BmNOzI15MT8WxMJYFIvFcsfyxPLG8sXyxwrECsYKxQrHisSKxorFisdKxErGSsVKx8rEysbKxcrHKsQqxirFKseqxKrGqsWqx2rEasb+DwLgATDrJQAA+LKXbdv1t23d3Zdt27Zt266Xbdu1sGxbe78YVgurjdXB6mL1sPpYA6wh1ghrjDXBmmLNsOZYC6wl1gprjbXB2mLtsPZYB6wj1gnrjHXBumLdsO5YD6wn1gvrjfXB+mL9sP7YAGwgNggbjA3BhmLDsOHYCGwkNgobjY3BxmLjsPHYBGwiNgmbjE3BpmLTsOnYDGwmNgubjc3B5mLzsPnYAmwhtghbjC3BlmLLsOXYCmwltgpbja3B1mLrsPXYBmwjtgnbjG3BtmLbsP+w7dgObCe2C9uN7cH2Yvuw/dgB7CB2CDuMHcGOYsew49gJ7CR2CjuNncHOYuew89gF7CJ2CbuMXcGuYtew69gN7CZ2C7uN3cHuYgnYPew+9gB7iCVij7DH2BPsKfYMe469wF5ir7DX2BvsLfYOe499wD5in7DP2BfsK/YN+479wH5iv7Df2B/sL/YPS8Li8GR4cjwFnhJPhafG0+Bp8XR4ejwDnhHPhMfjmfEseFY8G54dz4HnxHPhufE8eF48H54fL4AXxAvhhfEieFG8GF4cL4GXxEvhpfEyeFm8HF4er4BXxCvhlfEqeFW8Gl4dr4HXxDEcxwmcxCmcxhmcxTmcxwVcxCVcxhVcxTVcxw3cxC3cxh3cxT3cxwM8xCMc4BBHeAyvhdfG6+B18Xp4fbwB3hBvhDfGm+BN8WZ4c7wF3hJvhbfG2+Bt8XZ4e7wD3hHvhHfGu+Bd8W54d7wH3hPvhffG++B98X54f3wAPhAfhA/Gh+BD8WH4cHwEPhIfhY/Gx+Bj8XH4eHwCPhGfhE/Gp+BT8Wn4dHwGPhOfhc/G5+Bz8Xn4fHwBvhBfhC/Gl+BL8WX4cnwFvhJfha/G1+Br8XX4enwDvhHfhG/Gt+Bb8W34f/h2fAe+E9+F78b34Hvxffh+/AB+ED+EH8aP4EfxY/hx/AR+Ej+Fn8bP4Gfxc/h5/AJ+Eb+EX8av4Ffxa/h1/AZ+E7+F38bv4HfxBPwefh9/gD/EE/FH+GP8Cf4Uf4Y/x1/gL/FX+Gv8Df4Wf4e/xz/gH/FP+Gf8C/4V/4Z/x3/gP/Ff+G/8D/4X/4cn4XFEMiI5kYJISaQiUhNpiLREOiI9kYHISGQi4onMRBYiK5GNyE7kIHISuYjcRB4iL5GPyE8UIAoShYjCRBGiKFGMKE6UIEoSpYjSRBmiLFGOKE9UICoSlYjKRBWiKlGNqE7UIGoSGIETBEESFEETDMESHMETAiESEiETCqESGqETBmESFmETDuESHuETARESEQEISCAiRtQiahN1iLpEPaI+0YBoSDQiGhNNiKZEM6I50YJoSbQiWhNtiLZEO6I90YHoSHQiOhNdiK5EN6I70YPoSfQiehN9iL5EP6I/MYAYSAwiBhNDiKHEMGI4MYIYSYwiRhNjiLHEOGI8MYGYSEwiJhNTiKnENGI6MYOYScwiZhNziLnEPGI+sYBYSCwiFhNLiKXEMmI5sYJYSawiVhNriLXEOmI9sYHYSGwiNhNbiK3ENuI/Yjuxg9hJ7CJ2E3uIvcQ+Yj9xgDhIHCIOE0eIo8Qx4jhxgjhJnCJOE2eIs8Q54jxxgbhIXCIuE1eIq8Q14jpxg7hJ3CJuE3eIu0QCcY+4TzwgHhKJxCPiMfGEeEo8I54TL4iXxCviNfGGeEu8I94TH4iPxCfiM/GF+Ep8I74TP4ifxC/iN/GH+Ev8I5KIODIZmZxMQaYkU5GpyTRkWjIdmZ7MQGYkM5HxZGYyC5mVzEZmJ3OQOclcZG4yD5mXzEfmJwuQBclCZGGyCFmULEYWJ0uQJclSZGmyDFmWLEeWJyuQFclKZGWyClmVrEZWJ2uQNUmMxEmCJEmKpEmGZEmO5EmBFEmJlEmFVEmN1EmD
NEmLtEmHdEmP9MmADMmIBCQkERkja5G1yTpkXbIeWZ9sQDYkG5GNySZkU7IZ2ZxsQbYkW5GtyTZkW7Id2Z7sQHYkO5GdyS5kV7Ib2Z3sQfYke5G9yT5kX7If2Z8cQA4kB5GDySHkUHIYOZwcQY4kR5GjyTHkWHIcOZ6cQE4kJ5GTySnkVHIaOZ2cQc4kZ5GzyTnkXHIeOZ9cQC4kF5GLySXkUnIZuZxcQa4kV5GryTXkWnIduZ7cQG4kN5GbyS3kVnIb+R+5ndxB7iR3kbvJPeRech+5nzxAHiQPkYfJI+RR8hh5nDxBniRPkafJM+RZ8hx5nrxAXiQvkZfJK+RV8hp5nbxB3iRvkbfJO+RdMoG8R94nH5APyUTyEfmYfEI+JZ+Rz8kX5EvyFfmafEO+Jd+R78kP5EfyE/mZ/EJ+Jb+R38kf5E/yF/mb/EP+Jf+RSWQclYxKTqWgUlKpqNRUGiotlY5KT2WgMlKZqHgqM5WFykplo7JTOaicVC4qN5WHykvlo/JTBaiCVCGqMFWEKkoVo4pTJaiSVCmqNFWGKkuVo8pTFaiKVCWqMlWFqkpVo6pTNaiaFEbhFEGRFEXRFEOxFEfxlECJlETJlEKplEbplEGZlEXZlEO5lEf5VECFVEQBClKIilG1qNpUHaouVY+qTzWgGlKNqMZUE6op1YxqTrWgWlKtqNZUG6ot1Y5qT3WgOlKdqM5UF6or1Y3qTvWgelK9qN5UH6ov1Y/qTw2gBlKDqMHUEGooNYwaTo2gRlKjqNHUGGosNY4aT02gJlKTqMnUFGoqNY2aTs2gZlKzqNnUHGouNY+aTy2gFlKLqMXUEmoptYxaTq2gVlKrqNXUGmottY5aT22gNlKbqM3UFmortY36j9pO7aB2Uruo3dQeai+1j9pPHaAOUoeow9QR6ih1jDpOnaBOUqeo09QZ6ix1jkqIv0BdpC5Rl6kr1FXqGnWdukHdpG5Rt6k71F0qgbpH3aceUA+pROoR9Zh6Qj2lnlHPqRfUS+oV9Zp6Q72l3lHvqQ/UR+oT9Zn6Qn2lvlHfqR/UT+oX9Zv6Q/2l/lFJVBydjE5Op6BT0qno1HQaOi2djk5PZ6Az0pnoeDoznYXOSmejs9M56Jx0Ljo3nYfOS+ej89MF6IJ0IbowXYQuSheji9Ml6JJ0Kbo0XYYuS5ejy9MV6Ip0JboyXYWuSlejq9M16Jo0RuM0QZM0RdM0Q7M0R/O0QIu0RMu0Qqu0Ruu0QZu0Rdu0Q7u0R/t0QId0RAMa0oiO0bXo2nQdui5dj65PN6Ab0o3oxnQTuindjG5Ot6Bb0q3o1nQbui3djm5Pd6A70p3oznQXuivdje5O96B70r3o3nQfui/dj+5PD6AH0oPowfQQeig9jB5Oj6BH0qPo0fQYeiw9jh5PT6An0pPoyfQUeio9jZ5Oz6Bn0rPo2fQcei49j55PL6AX0ovoxfQSeim9jF5Or6BX0qvo1fQaei29jl5Pb6A30pvozfQWeiu9jf6P3k7voHfSu+jd9B56L72P3k8foA/Sh+jD9BH6KH2MPk6foE/Sp+jT9Bn6LH2OPk9foC/Sl+jL9BX6Kn2Nvk7foG/St+jb9B36Lp1A36Pv0w/oh3Qi/Yh+TD+hn9LP6Of0C/ol/Yp+Tb+h39Lv6Pf0B/oj/Yn+TH+hv9Lf6O/0D/on/Yv+Tf+h/9L/6CQ6jknGJGdSMCmZVExqJg2TlknHpGcyMBmZTEw8k5nJwmRlsjHZmRxMTiYXk5vJw+Rl8jH5mQJMQaYQU5gpwhRlijHFmRJMSaYUU5opw5RlyjHlmQpMRaYSU5mpwlRlqjHVmRpMTQZjcIZgSIZiaIZhWIZjeEZgREZiZEZhVEZjdMZgTMZibMZhXMZjfCZgQiZiAAMZxMSYWkxtpg5Tl6nH1GcaMA2ZRkxjpgnTlGnGNGdaMC2ZVkxrpg3TlmnHtGc6MB2ZTkxnpgvTlenGdGd6MD2ZXkxvpg/Tl+nH9GcGMAOZQcxg
ZggzlBnGDGdGMCOZUcxoZgwzlhnHjGcmMBOZScxkZgozlZnGTGdmMDOZWcxsZg4zl5nHzGcWMAuZRcxiZgmzlFnGLGdWMCuZVcxqZg2zllnHrGc2MBuZTcxmZguzldnG/MdsZ3YwO5ldzG5mD7OX2cfsZw4wB5lDzGHmCHOUOcYcZ04wJ5lTzGnmDHOWOcecZy4wF5lLzGXmCnOVucZcZ24wN5lbzG3mDnOXSWDuMfeZB8xDJpF5xDxmnjBPmWfMc+YF85J5xbxm3jBvmXfMe+YD85H5xHxmvjBfmW/Md+YH85P5xfxm/jB/mX9MEhPHJmOTsynYlGwqNjWbhk3LpmPTsxnYjGwmNp7NzGZhs7LZ2OxsDjYnm4vNzeZh87L52PxsAbYgW4gtzBZhi7LF2OJsCbYkW4otzZZhy7Ll2PJsBbYiW4mtzFZhq7LV2OpsDbYmi7E4S7AkS7E0y7Asy7E8K7AiK7Eyq7Aqq7E6a7Ama7E267Au67E+G7AhG7GAhSxiY2wttjZbh63L1mPrsw3YhmwjtjHbhG3KNmObsy3YlmwrtjXbhm3LtmPbsx3YjmwntjPbhe3KdmO7sz3Ynmwvtjfbh+3L9mP7swPYgewgdjA7hB3KDmOHsyPYkewodjQ7hh3LjmPHsxPYiewkdjI7hZ3KTmOnszPYmewsdjY7h53LzmPnswvYhewidjG7hF3KLmOXsyvYlewqdjW7hl3LrmPXsxvYjewmdjO7hd3KbmP/Y7ezO9id7C52N7uH3cvuY/ezB9iD7CH2MHuEPcoeY4+zJ9iT7Cn2NHuGPcueY8+zF9iL7CX2MnuFvcpeY6+zN9ib7C32NnuHvcsmsPfY++wD9iGbyD5iH7NP2KfsM/Y5+4J9yb5iX7Nv2LfsO/Y9+4H9yH5iP7Nf2K/sN/Y7+4P9yf5if7N/2L/sPzaJjeOSccm5FFxKLhWXmkvDpeXScem5DFxGLhMXz2XmsnBZuWxcdi4Hl5PLxeXm8nB5uXxcfq4AV5ArxBXminBFuWJcca4EV5IrxZXmynBluXJcea4CV5GrxFXmqnBVuWpcda4GV5PDOJwjOJKjOJpjOJbjOJ4TOJGTOJlTOJXTOJ0zOJOzOJtzOJfzOJ8LuJCLOMBBDnExrhZXm6vD1eXqcfW5BlxDrhHXmGvCNeWacc25FlxLrhXXmmvDteXace25DlxHrhPXmevCdeW6cd25HlxPrhfXm+vD9eX6cf25AdxAbhA3mBvCDeWGccO5EdxIbhQ3mhvDjeXGceO5CdxEbhI3mZvCTeWmcdO5GdxMbhY3m5vDzeXmcfO5BdxCbhG3mFvCLeWWccu5FdxKbhW3mlvDreXWceu5DdxGbhO3mdvCbeW2cf9x27kd3E5uF7eb28Pt5fZx+7kD3EHuEHeYO8Id5Y5xx7kT3EnuFHeaO8Od5c5x57kL3EXuEneZu8Jd5a5x17kb3E3uFnebu8Pd5RK4e9x97gH3kEvkHnGPuSfcU+4Z95x7wb3kXnGvuTfcW+4d9577wH3kPnGfuS/cV+4b9537wf3kfnG/uT/cX+4fl8TF8cn45HwKPiWfik/Np+HT8un49HwGPiOfiY/nM/NZ+Kx8Nj47n4PPyefic/N5+Lx8Pj4/X4AvyBfiC/NF+KJ8Mb44X4IvyZfiS/Nl+LJ8Ob48X4GvyFfiK/NV+Kp8Nb46X4OvyWM8zhM8yVM8zTM8y3M8zwu8yEu8zCu8ymu8zhu8yVu8zTu8y3u8zwd8yEc84CGP+Bhfi6/N1+Hr8vX4+nwDviHfiG/MN+Gb8s345nwLviXfim/Nt+Hb8u349nwHviPfie/Md+G78t347nwPviffi+/N9+H78v34/vwAfiA/iB/MD+GH8sP44fwIfiQ/ih/Nj+HH8uP48fwEfiI/iZ/MT+Gn8tP46fwMfiY/i5/Nz+Hn8vP4+fwCfiG/iF/ML+GX8sv45fwKfiW/il/Nr+HX8uv49fwGfiO/id/Mb+G38tv4xLjt
/A5+J7+L383v4ffy+/j9/AH+IH+IP8wf4Y/yx/jj/An+JH+KP82f4c/y5/jz/AX+In+Jv8xf4a/y1/jr/A3+Jn+Lv83f4e/yCfw9/j7/gH/IJ/KP+Mf8E/4p/4x/zr/gX/Kv+Nf8G/4t/45/z3/gP/Kf+M/8F/4r/43/zv/gf/K/+N/8H/4v/49P4uOEZEJyIYWQUkglpBbSCGmFdEJ6IYOQUcgkxAuZhSxCViGbkF3IIeQUcgm5hTxCXiGfkF8oIBQUCgmFhSJCUaGYUFwoIZQUSgmlhTJCWaGcUF6oIFQUKgmVhSpCVaGaUF2oIdQUMAEXCIEUKIEWGIEVOIEXBEEUJEEWFEEVNEEXDMEULMEWHMEVPMEXAiEUIgEIUEBCTKgl1BbqCHWFekJ9oYHQUGgkNBaaCE2FZkJzoYXQUmgltBbaCG2FdkJ7oYPQUegkdBa6CF2FbkJ3oYfQU+gl9Bb6CH2FfkJ/YYAwUBgkDBaGCEOFYcJwYYQwUhgljBbGCGOFccJ4YYIwUZgkTBamCFOFacJ0YYYwU5glzBbmCHOFecJ8YYGwUFgkLBaWCEuFZcJyYYWwUlglrBbWCGuFdcJ6YYOwUdgkbBa2CFuFbcJ/wnZhh7BT2CXsFvYIe4V9wn7hgHBQOCQcFo4IR4VjwnHhhHBSOCWcFs4IZ4VzwnnhgnBRuCRcFq4IV4VrwnXhhnBTuCXcFu4Id4UE4Z5wX3ggPBQShUfCY+GJ8FR4JjwXXggvhVfCa+GN8FZ4J7wXPggfhU/CZ+GL8FX4JnwXfgg/hV/Cb+GP8Ff4JyQJcWIyMbmYQkwpphJTi2nEtGI6Mb2YQcwoZhLjxcxiFjGrmE3MLuYQc4q5xNxiHjGvmE/MLxYQC4qFxMJiEbGoWEwsLpYQS4qlxNJiGbGsWE4sL1YQK4qVxMpiFbGqWE2sLtYQa4qYiIuESIqUSIuMyIqcyIuCKIqSKIuKqIqaqIuGaIqWaIuO6Iqe6IuBGIqRCEQoIjEm1hJri3XEumI9sb7YQGwoNhIbi03EpmIzsbnYQmwpthJbi23EtmI7sb3YQewodhI7i13ErmI3sbvYQ+wp9hJ7i33EvmI/sb84QBwoDhIHi0PEoeIwcbg4QhwpjhJHi2PEseI4cbw4QZwoThIni1PEqeI0cbo4Q5wpzhJni3PEueI8cb64QFwoLhIXi0vEpeIycbm4QlwprhJXi2vEteI6cb24QdwobhI3i1vEreI28T9xu7hD3CnuEneLe8S94j5xv3hAPCgeEg+LR8Sj4jHxuHhCPCmeEk+LZ8Sz4jnxvHhBTBIviZfFK+JV8Zp4Xbwh3hRvibfFO+JdMUG8J94XH4gPxUTxkfhYfCI+FZ+Jz8UX4kvxlfhafCO+Fd+J78UP4kfxk/hZ/CJ+Fb+J38Uf4k/xl/hb/CP+Ff+JSWKclExKLqWQUkqppNRSGimtlE5KL2WQMkqZpHgps5RFyiplk7JLOaScUi4pt5RHyivlk/JLBaSCUiGpsFREKioVk4pLJaSSUimptFRGKiuVk8pLFaSKUiWpslRFqipVk6pLNaSaEibhEiGREiXREiOxEifxkiCJkiTJkiKpkibpkiGZkiXZkiO5kif5UiCFUiQBCUpIikm1pNpSHamuVE+qLzWQGkqNpMZSE6mp1ExqLrWQWkqtpNZSG6mt1E5qL3WQOkqdpM5SF6mr1E3qLvWQekq9pN5SH6mv1E/qLw2QBkqDpMHSEGmoNEwaLo2QRkqjpNHSGGmsNE4aL02QJkqTpMnSFGmqNE2aLs2QZkqzpNnSHGmuNE+aLy2QFkqLpMXSEmmptExaLq2QVkqrpNXSGmmttE5aL22QNkqbpM3SFmmrtE36T9ou7ZB2Sruk3dIeaa+0T9ovHZAOSoekw9IR6ah0TDounZBOSqek09IZ6ax0TjovXZAuSpeky9IV6ap0Tbou3ZBuSrek29Id6a6UIN2T7ksPpIdSovRIeiw9kZ5Kz6Tn0gvppfRK
ei29kd5K76T30gfpo/RJ+ix9kb5K36Tv0g/pp/RL+i39kf5K/6QkKU5OJieXU8gp5VRyajmNnFZOJ6eXM8gZ5UxyvJxZziJnlbPJ2eUcck45l5xbziPnlfPJ+eUCckG5kFxYLiIXlYvJxeUSckm5lFxaLiOXlcvJ5eUKckW5klxZriJXlavJ1eUack0Zk3GZkEmZkmmZkVmZk3lZkEVZkmVZkVVZk3XZkE3Zkm3ZkV3Zk305kEM5koEMZSTH5FpybbmOXFeuJ9eXG8gN5UZyY7mJ3FRuJjeXW8gt5VZya7mN3FZuJ7eXO8gd5U5yZ7mL3FXuJneXe8g95V5yb7mP3FfuJ/eXB8gD5UHyYHmIPFQeJg+XR8gj5VHyaHmMPFYeJ4+XJ8gT5UnyZHmKPFWeJk+XZ8gz5VnybHmOPFeeJ8+XF8gL5UXyYnmJvFReJi+XV8gr5VXyanmNvFZeJ6+XN8gb5U3yZnmLvFXeJv8nb5d3yDvlXfJueY+8V94n75cPyAflQ/Jh+Yh8VD4mH5dPyCflU/Jp+Yx8Vj4nn5cvyBflS/Jl+Yp8Vb4mX5dvyDflW/Jt+Y58V06Q78n35QfyQzlRfiQ/lp/IT+Vn8nP5hfxSfiW/lt/Ib+V38nv5g/xR/iR/lr/IX+Vv8nf5h/xT/iX/lv/If+V/cpIcpyRTkisplJRKKiW1kkZJq6RT0isZlIxKJiVeyaxkUbIq2ZTsSg4lp5JLya3kUfIq+ZT8SgGloFJIKawUUYoqxZTiSgmlpFJKKa2UUcoq5ZTySgWlolJJqaxUUaoq1ZTqSg2lpoIpuEIopEIptMIorMIpvCIooiIpsqIoqqIpupImLi7OUmzFUVzFU3wlUEIlUoACFaTElFpKbaWOUlepp9RXGigNlUZKY6WJ0lRppjRXWigtlVZKa6WN0lZpp7RXOigdlU5KZ6WL0lXppnRXeig9lV5Kb6WP0lfpp/RXBigDlUHKYGWIMlQZpgxXRigjlVHKaGWMMlYZp4xXJigTlUnKZGWKMlWZpkxXZigzlVnKbGWOMleZp8xXFigLlUXKYmWJslRZpixXVigrlVXKamWNslZZp6xXNigblU3KZmWLslXZpvynbFd2KDuVXcpuZY+yV9mn7FcOKAeVQ8ph5YhyVDmmHFdOKCeVU8pp5YxyVjmnnFcuKBeVS8pl5YpyVbmmXFduKDeVW8pt5Y5yV0lQ7in3lQfKQyVReaQ8Vp4oT5VnynPlhfJSeaW8Vt4ob5V3ynvlg/JR+aR8Vr4oX5Vvynflh/JT+aX8Vv4of5V/SpISpyZTk6sp1JRqKjW1mkZNq6ZT06sZ1IxqJjVezaxmUbOq2dTsag41p5pLza3mUfOq+dT8agG1oFpILawWUYuqxdTiagm1pFpKLa2WUcuq5dTyagW1olpJraxWUauq1dTqag21poqpuEqopEqptMqorMqpvCqooiqpsqqoqqqpumqopmqptuqoruqpvhqooRqpQIUqUmNqLbW2Wketq9ZT66sN1IZqI7Wx2kRtqjZTm6st1JZqK7W12kZtq7ZT26sd1I5qJ7Wz2kXtqnZTu6s91J5qL7W32kftq/ZT+6sD1IHqIHWwOkQdqg5Th6sj1JHqKHW0OkYdq45Tx6sT1InqJHWyOkWdqk5Tp6sz1JnqLHW2Okedq85T56sL1IXqInWxukRdqi5Tl6sr1JXqKnW1ukZdq65T16sb1I3qJnWzukXdqm5T/1O3qzvUneoudbe6R92r7lP3qwfUg+oh9bB6RD2qHlOPqyfUk+op9bR6Rj2rnlPPqxfUi+ol9bJ6Rb2qXlOvqzfUm+ot9bZ6R72rJqj31PvqA/Whmqg+Uh+rT9Sn6jP1ufpCfam+Ul+rb9S36jv1vfpB/ah+Uj+rX9Sv6jf1u/pD/an+Un+rf9S/6j81SY3TkmnJtRRaSi2VllpLo6XV0mnptQxaRi2TFq9l1rJoWbVsWnYth5ZTy6Xl1vJoebV8Wn6tgFZQK6QV
1opoRbViWnGthFZSK6WV1spoZbVyWnmtglZRq6RV1qpoVbVqWnWthlZTwzRcIzRSozRaYzRW4zReEzRRkzRZUzRV0zRdMzRTszRbczRX8zRfC7RQizSgQQ1pMa2WVluro9XV6mn1tQZaQ62R1lhrojXVmmnNtRZaS62V1lpro7XV2mnttQ5aR62T1lnronXVumndtR5aT62X1lvro/XV+mn9tQHaQG2QNlgbog3VhmnDtRHaSG2UNlobo43VxmnjtQnaRG2SNlmbok3VpmnTtRnaTG2WNlubo83V5mnztQXaQm2Rtlhboi3VlmnLtRXaSm2Vtlpbo63V1mnrtQ3aRm2Ttlnbom3Vtmn/adu1HdpObZe2W9uj7dX2afu1A9pB7ZB2WDuiHdWOace1E9pJ7ZR2WjujndXOaee1C9pF7ZJ2WbuiXdWuade1G9pN7ZZ2W7uj3dUStHvafe2B9lBL1B5pj7Un2lPtmfZce6G91F5pr7U32lvtnfZe+6B91D5pn7Uv2lftm/Zd+6H91H5pv7U/2l/tn5akxenJ9OR6Cj2lnkpPrafR0+rp9PR6Bj2jnkmP1zPrWfSsejY9u55Dz6nn0nPrefS8ej49v15AL6gX0gvrRfSiejG9uF5CL6mX0kvrZfSyejm9vF5Br6hX0ivrVfSqejW9ul5Dr6ljOq4TOqlTOq0zOqtzOq8LuqhLuqwruqpruq4buqlbuq07uqt7uq8HeqhHOtChjvSYXkuvrdfR6+r19Pp6A72h3khvrDfRm+rN9OZ6C72l3kpvrbfR2+rt9PZ6B72j3knvrHfRu+rd9O56D72n3kvvrffR++r99P76AH2gPkgfrA/Rh+rD9OH6CH2kPkofrY/Rx+rj9PH6BH2iPkmfrE/Rp+rT9On6DH2mPkufrc/R5+rz9Pn6An2hvkhfrC/Rl+rL9OX6Cn2lvkpfra/R1+rr9PX6Bn2jvknfrG/Rt+rb9P/07foOfae+S9+t79H36vv0/foB/aB+SD+sH9GP6sf04/oJ/aR+Sj+tn9HP6uf08/oF/aJ+Sb+sX9Gv6tf06/oN/aZ+S7+t39Hv6gn6Pf2+/kB/qCfqj/TH+hP9qf5Mf66/0F/qr/TX+hv9rf5Of69/0D/qn/TP+hf9q/5N/67/0H/qv/Tf+h/9r/5PT9LjjGRGciOFkdJIZaQ20hhpjXRGeiODkdHIZMQbmY0sRlYjm5HdyGHkNHIZuY08Rl4jn5HfKGAUNAoZhY0iRlGjmFHcKGGUNEoZpY0yRlmjnFHeqGBUNCoZlY0qRlWjmlHdqGHUNDADNwiDNCiDNhiDNTiDNwRDNCRDNhRDNTRDNwzDNCzDNhzDNTzDNwIjNCIDGNBARsyoZdQ26hh1jXpGfaOB0dBoZDQ2mhhNjWZGc6OF0dJoZbQ22hhtjXZGe6OD0dHoZHQ2uhhdjW5Gd6OH0dPoZfQ2+hh9jX5Gf2OAMdAYZAw2hhhDjWHGcGOEMdIYZYw2xhhjjXHGeGOCMdGYZEw2phhTjWnGdGOGMdOYZcw25hhzjXnGfGOBsdBYZCw2lhhLjWXGcmOFsdJYZaw21hhrjXXGemODsdHYZGw2thhbjW3Gf8Z2Y4ex09hl7Db2GHuNfcZ+44Bx0DhkHDaOGEeNY8Zx44Rx0jhlnDbOGGeNc8Z544Jx0bhkXDauGFeNa8Z144Zx07hl3DbuGHeNBOOecd94YDw0Eo1HxmPjifHUeGY8N14YL41XxmvjjfHWeGe8Nz4YH41Pxmfji/HV+GZ8N34YP41fxm/jj/HX+GckGXFmMjO5mcJMaaYyU5tpzLRmOjO9mcHMaGYy483MZhYzq5nNzG7mMHOauczcZh4zr5nPzG8WMAuahczCZhGzqFnMLG6WMEuapczSZhmzrFnOLG9WMCualczKZhWzqlnNrG7WMGuamImbhEmalEmbjMmanMmbgimakimbiqmamqmbhmmalmmbjumanumbgRmakQlMaCIzZtYya5t1zLpm
PbO+2cBsaDYyG5tNzKZmM7O52cJsabYyW5ttzLZmO7O92cHsaHYyO5tdzK5mN7O72cPsafYye5t9zL5mP7O/OcAcaA4yB5tDzKHmMHO4OcIcaY4yR5tjzLHmOHO8OcGcaE4yJ5tTzKnmNHO6OcOcac4yZ5tzzLnmPHO+ucBcaC4yF5tLzKXmMnO5ucJcaa4yV5trzLXmOnO9ucHcaG4yN5tbzK3mNvM/c7u5w9xp7jJ3m3vMveY+c795wDxoHjIPm0fMo+Yx87h5wjxpnjJPm2fMs+Y587x5wbxoXjIvm1fMq+Y187p5w7xp3jJvm3fMu2aCec+8bz4wH5qJ5iPzsfnEfGo+M5+bL8yX5ivztfnGfGu+M9+bH8yP5ifzs/nF/Gp+M7+bP8yf5i/zt/nH/Gv+M5PMOCuZldxKYaW0UlmprTRWWiudld7KYGW0MlnxVmYri5XVymZlt3JYOa1cVm4rj5XXymfltwpYBa1CVmGriFXUKmYVt0pYJa1SVmmrjFXWKmeVtypYFa1KVmWrilXVqmZVt2pYNS3Mwi3CIi3Koi3GYi3O4i3BEi3Jki3FUi3N0i3DMi3Lsi3Hci3P8q3ACq3IAha0kBWzalm1rTpWXaueVd9qYDW0GlmNrSZWU6uZ1dxqYbW0WlmtrTZWW6ud1d7qYHW0OlmdrS5WV6ub1d3qYfW0elm9rT5WX6uf1d8aYA20BlmDrSHWUGuYNdwaYY20RlmjrTHWWGucNd6aYE20JlmTrSnWVGuaNd2aYc20ZlmzrTnWXGueNd9aYC20FlmLrSXWUmuZtdxaYa20VlmrrTXWWmudtd7aYG20NlmbrS3WVmub9Z+13dph7bR2WbutPdZea5+13zpgHbQOWYetI9ZR65h13DphnbROWaetM9ZZ65x13rpgXbQuWZetK9ZV65p13bph3bRuWbetO9ZdK8G6Z923HlgPrUTrkfXYemI9tZ5Zz60X1kvrlfXaemO9td5Z760P1kfrk/XZ+mJ9tb5Z360f1k/rl/Xb+mP9tf5ZSVacncxObqewU9qp7NR2Gjutnc5Ob2ewM9qZ7Hg7s53Fzmpns7PbOeycdi47t53Hzmvns/PbBeyCdiG7sF3ELmoXs4vbJeySdim7tF3GLmuXs8vbFeyKdiW7sl3FrmpXs6vbNeyaNmbjNmGTNmXTNmOzNmfztmCLtmTLtmKrtmbrtmGbtmXbtmO7tmf7dmCHdmQDG9rIjtm17Np2HbuuXc+ubzewG9qN7MZ2E7up3cxubrewW9qt7NZ2G7ut3c5ub3ewO9qd7M52F7ur3c3ubvewe9q97N52H7uv3c/ubw+wB9qD7MH2EHuoPcwebo+wR9qj7NH2GHusPc4eb0+wJ9qT7Mn2FHuqPc2ebs+wZ9qz7Nn2HHuuPc+eby+wF9qL7MX2Enupvcxebq+wV9qr7NX2Gnutvc5eb2+wN9qb7M32Fnurvc3+z95u77B32rvs3fYee6+9z95vH7AP2ofsw/YR+6h9zD5un7BP2qfs0/YZ+6x9zj5vX7Av2pfsy/YV+6p9zb5u37Bv2rfs2/Yd+66dYN+z79sP7Id2ov3Ifmw/sZ/az+zn9gv7pf3Kfm2/sd/a7+z39gf7o/3J/mx/sb/a3+zv9g/7p/3L/m3/sf/a/+wkO85J5iR3UjgpnVROaieNk9ZJ56R3MjgZnUxOvJPZyeJkdbI52Z0cTk4nl5PbyePkdfI5+Z0CTkGnkFPYKeIUdYo5xZ0STkmnlFPaKeOUdco55Z0KTkWnklPZqeJUdao51Z0aTk0Hc3CHcEiHcmiHcViHc3hHcERHcmRHcVRHc3THcEzHcmzHcVzHc3wncEIncoADHeTEnFpObaeOU9ep59R3GjgNnUZOY6eJ09Rp5jR3WjgtnVZOa6eN09Zp57R3OjgdnU5OZ6eL09Xp5nR3ejg9nV5Ob6eP09fp5/R3BjgDnUHOYGeIM9QZ5gx3RjgjnVHOaGeMM9YZ54x3JjgTnUnO
ZGeKM9WZ5kx3ZjgznVnObGeOM9eZ58x3FjgLnUXOYmeJs9RZ5ix3VjgrnVXOameNs9ZZ56x3NjgbnU3OZmeLs9XZ5vznbHd2ODudXc5uZ4+z19nn7HcOOAedQ85h54hz1DnmHHdOOCedU85p54xz1jnnnHcuOBedS85l54pz1bnmXHduODedW85t545z10lw7jn3nQfOQyfReeQ8dp44T51nznPnhfPSeeW8dt44b513znvng/PR+eR8dr44X51vznfnh/PT+eX8dv44f51/TpIT5yZzk7sp3JRuKje1m8ZN66Zz07sZ3IxuJjfezexmcbO62dzsbg43p5vLze3mcfO6+dz8bgG3oFvILewWcYu6xdzibgm3pFvKLe2Wccu65dzybgW3olvJrexWcau61dzqbg23pou5uEu4pEu5tMu4rMu5vCu4oiu5squ4qqu5umu4pmu5tuu4ruu5vhu4oRu5wIUucmNuLbe2W8et69Zz67sN3IZuI7ex28Rt6jZzm7st3JZuK7e128Zt67Zz27sd3I5uJ7ez28Xt6nZzu7s93J5uL7e328ft6/Zz+7sD3IHuIHewO8Qd6g5zh7sj3JHuKHe0O8Yd645zx7sT3InuJHeyO8Wd6k5zp7sz3JnuLHe2O8ed685z57sL3IXuInexu8Rd6i5zl7sr3JXuKne1u8Zd665z17sb3I3uJnezu8Xd6m5z/3O3uzvcne4ud7e7x93r7nP3uwfcg+4h97B7xD3qHnOPuyfck+4p97R7xj3rnnPPuxfci+4l97J7xb3qXnOvuzfcm+4t97Z7x73rJrj33PvuA/ehm+g+ch+7T9yn7jP3ufvCfem+cl+7b9y37jv3vfvB/eh+cj+7X9yv7jf3u/vD/en+cn+7f9y/7j83yY3zknnJvRReSi+Vl9pL46X10nnpvQxeRi+TF+9l9rJ4Wb1sXnYvh5fTy+Xl9vJ4eb18Xn6vgFfQK+QV9op4Rb1iXnGvhFfSK+WV9sp4Zb1yXnmvglfRq+RV9qp4Vb1qXnWvhlfTwzzcIzzSozzaYzzW4zzeEzzRkzzZUzzV0zzdMzzTszzbczzX8zzfC7zQizzgQQ95Ma+WV9ur49X16nn1vQZeQ6+R19hr4jX1mnnNvRZeS6+V19pr47X12nntvQ5eR6+T19nr4nX1unndvR5eT6+X19vr4/X1+nn9vQHeQG+QN9gb4g31hnnDvRHeSG+UN9obk26sN84b703wJnqTvMneFG+qN82b7s3wZnqzvNneHG+uN8+b7y3wFnqLvMXeEm+pt8xb7q3wVnqrvNXeGm+tt85b723wNnqbvM3eFm+rt837z9vu7fB2eru83d4eb6+3z9vvHfAOeoe8w94R76h3zDvunfBOeqe8094Z76x3zjvvXfAuepe8y94V76p3zbvu3fBuere8294d766X4N3z7nsPvIdeovfIe+w98Z56z7zn3gvvpffKe+298d5677z33gfvo/fJ++x98b5637zv3g/vp/fL++398f56/7wkL85P5if3U/gp/VR+aj+Nn9ZP56f3M/gZ/Ux+vJ/Zz+Jn9bP52f0cfk4/l5/bz+Pn9fP5+f0CfkG/kF/YL+IX9Yv5xf0Sfkm/lF/aL+OX9cv55f0KfkW/kl/Zr+JX9av51f0afk0f83Gf8Emf8mmf8Vmf83lf8EVf8mVf8VVf83Xf8E3f8m3f8V3f830/8EM/8oEPfeTH/Fp+bb+OX9ev59f3G/gN/UZ+Y7+J39Rv5jf3W/gt/VZ+a7+N39Zv57f3O/gd/U5+Z7+L39Xv5nf3e/g9/V5+b7+P39fv5/f3B/gD/UH+YH+IP9Qf5g/3R/gj/VH+aH+MP9Yf54/3J/gT/Un+ZH+KP9Wf5k/3Z/gz/Vn+bH+OP9ef58/3F/gL/UX+Yn+Jv9Rf5i/3V/gr/VX+an+Nv9Zf56/3N/gb/U3+Zn+Lv9Xf5v/nb/d3+Dv9Xf5uf4+/19/n7/cP+Af9Q/5h/4h/1D/m
H/dP+HFxcXGn/TP+Wf+cf96/4F/0L/mX/Sv+Vf+af92/4d/0b/m3/Tv+XT/Bv+ff9x/4D/1E/5H/2H/iP/Wf+c/9F/5L/5X/2n/jv/Xf+e/9D/5H/5P/2f/if/W/+d/9H/5P/5f/2//j//X/+Ul+XJAsSB6kCFIGqYLUQZogbZAuSB9kCDIGmYL4IHOQJcgaZAuyBzmCnEGuIHeQJ8gb5AvyBwWCgkGhoHBQJCgaFAuKByWCkkGpoHRQJigblAvKBxWCikGloHJQJagaVAuqBzWCmgEW4AERkAEV0AETsAEX8IEQiIEUyIESqIEW6IERmIEV2IETuIEX+EEQhEEUgAAGKIgFtYLaQZ2gblAvqB80CBoGjYLGQZOgadAsaB60CFoGrYLWQZugbdAuaB90CDoGnYLOQZega9At6B70CHoGvYLeQZ+gb9Av6B8MCAYGg4LBwZBgaDAsGB6MCEYGo4LRwZhgbDAuGB9MCCYGk4LJwZRgajAtmB7MCGYGs4LZwZxgbjAvmB8sCBYGi4LFwZJgabAsWB6sCFYGq4LVwZpgbbAuWB9sCDYGm4LNwZZga7At+C/YHuwIdga7gt3BnmBvsC/YHxwIDgaHgsPBkeBocCw4HpwITgangtPBmeBscC44H1wILgaXgsvBleBqcC24HtwIbga3gtvBneBukBDcC+4HD4KHQWLwKHgcPAmeBs+C58GL4GXwKngdvAneBu+C98GH4GPwKfgcfAm+Bt+C78GP4GfwK/gd/An+Bv+CpCAuTBYmD1OEKcNUYeowTZg2TBemDzOEGcNMYXyYOcwSZg2zhdnDHGHOMFeYO8wT5g3zhfnDAmHBsFBYOCwSFg2LhcXDEmHJsFRYOiwTlg3LheXDCmHFsFJYOawSVg2rhdXDGmHNEAvxkAjJkArpkAnZkAv5UAjFUArlUAnVUAv10AjN0Art0And0Av9MAjDMApBCEMUxsJaYe2wTlg3rBfWDxuEDcNGYeOwSdg0bBY2D1uELcNWYeuwTdg2bBe2DzuEHcNOYeewS9g17BZ2D3uEPcNeYe+wT9g37Bf2DweEA8NB4eBwSDg0HBYOD0eEI8NR4ehwTDg2HBeODyeEE8NJ4eRwSjg1nBZOD2eEM8NZ4exwTjg3nBfODxeEC8NF4eJwSbg0XBYuD1eEK8NV4epwTbg2XBeuDzeEG8NN4eZwS7g13Bb+F24Pd4Q7w13h7nBPuDfcF+4PD4QHw0Ph4fBIeDQ8Fh4PT4Qnw1Ph6fBMeDY8F54PL4QXw0vh5fBKeDW8Fl4Pb4Q3w1vh7fBOeDdMCO+F98MH4cMwMXwUPg6fhE/DZ+Hz8EX4MnwVvg7fhG/Dd+H78EP4MfwUfg6/hF/Db+H38Ef4M/wV/g7/hH/Df2FSGBcli5JHKaKUUaoodZQmShuli9JHGaKMUaYoPsocZYmyRtmi7FGOKGeUK8od5YnyRvmi/FGBqGBUKCocFYmKRsWi4lGJqGRUKiodlYnKRuWi8lGFqGJUKaocVYmqRtWi6lGNqGaERXhERGRERXTERGzERXwkRGIkRXKkRGqkRXpkRGZkRXbkRG7kRX4URGEURSCCEYpiUa2odlQnqhvVi+pHDaKGUaOocdQkaho1i5pHLaKWUauoddQmahu1i9pHHaKOUaeoc9Ql6hp1i7pHPaKeUa+od9Qn6hv1i/pHA6KB0aBocDQkGhoNi4ZHI6KR0ahodDQmGhuNi8ZHE6KJ0aRocjQlmhpNi6ZHM6KZ0axodjQnmhvNi+ZHC6KF0aJocbQkWhoti5ZHK6KV0apodbQmWhuti9ZHG6KN0aZoc7Ql2hpti/6Ltkc7op3Rrmh3tCfaG+2L9kcHooPRoehwdCQ6Gh2LjkcnopPRqeh0dCY6G52LzkcXoovRpehydCW6Gl2Lrkc3opvRreh2dCe6GyVE96L70YPoYZQYPYoeR0+ip9Gz6Hn0InoZvYpeR2+it9G76H30IfoYfYo+R1+ir9G36Hv0I/oZ
/Yp+R3+iv9G/KCmKA8lAcpACpASpQGqQBqQF6UB6kAFkBJlAPMgMsoCsIBvIDnKAnCAXyA3ygLwgH8gPCoCCoBAoDIqAoqAYKA5KgJKgFCgNyoCyoBwoDyqAiqASqAyqgKqgGqgOaoCaAAM4IAAJKEADBrCAAzwQgAgkIAMFqEADOjCACSxgAwe4wAM+CEAIIgAABAjEQC1QG9QBdUE9UB80AA1BI9AYNAFNQTPQHLQALUEr0Bq0AW1BO9AedAAdQSfQGXQBXUE30B30AD1BL9Ab9AF9QT/QHwwAA8EgMBgMAUPBMDAcjAAjwSgwGowBY8E4MB5MABPBJDAZTAFTwTQwHcwAM8EsMBvMAXPBPDAfLAALwSKwGCwBS8EysBysACvBKrAarAFrwTqwHmwAG8EmsBlsAVvBNvAf2A52gJ1gF9gN9oC9YB/YDw6Ag+AQOAyOgKPgGDgOToCT4BQ4Dc6As+AcOA8ugIvgErgMroCr4Bq4Dm6Am+AWuA3ugLsgAdwD98ED8BAkgkfgMXgCnoJn4Dl4AV6CV+A1eAPegnfgPfgAPoJP4DP4Ar6Cb+A7+AF+gl/gN/gD/oJ/IAnEwWQwOUwBU8JUMDVMA9PCdDA9zAAzwkwwHmaGWWBWmA1mhzlgTpgL5oZ5YF6YD+aHBWBBWAgWhkVgUVgMFoclYElYCpaGZWBZWA6WhxVgRVgJVoZVYFVYDVaHNWBNiEEcEpCEFKQhA1nIQR4KUIQSlKECVahBHRrQhBa0oQNd6EEfBjCEEQQQQgRjsBasDevAurAerA8bwIawEWwMm8CmsBlsDlvAlrAVbA3bwLawHWwPO8COsBPsDLvArrAb7A57wJ6wF+wN+8C+sB/sDwfAgXAQHAyHwKFwGBwOR8CRcBQcDcfAsXAcHA8nwIlwEpwMp8CpcBqcDmfAmXAWnA3nwLlwHpwPF8CFcBFcDJfApXAZXA5XwJVwFVwN18C1cB1cDzfAjXAT3Ay3wK1wG/wPboc74E64C+6Ge+BeuA/uhwfgQXgIHoZH4FF4DB6HJ+BJeAqehmfgWXgOnocX4EV4CV6GV+BVeA1ehzfgTXgL3oZ34F2YAO/B+/ABfAgT4SP4GD6BT+Ez+By+gC/hK/gavoFv4Tv4Hn6AH+En+Bl+gV/hN/gd/oA/4S/4G/6Bf+E/mATjUDKUHKVAKVEqlBqlQWlROpQeZUAZUSYUjzKjLCgryoayoxwoJ8qFcqM8KC/Kh/KjAqggKoQKoyKoKCqGiqMSqCQqhUqjMqgsKofKowqoIqqEKqMqqCqqhqqjGqgmwhCOCEQiCtGIQSziEI8EJCIJyUhBKtKQjgxkIgvZyEEu8pCPAhSiCAEEEUIxVAvVRnVQXVQP1UcNUEPUCDVGTVBT1Aw1Ry1QS9QKtUZtUFvUDrVHHVBH1Al1Rl1QV9QNdUc9UE/UC/VGfVBf1A/1RwPQQDQIDUZD0FA0DA1HI9BINAqNRmPQWDQOjUcT0EQ0CU1GU9BUNA1NRzPQTDQLzUZz0Fw0D81HC9BCtAgtRkvQUrQMLUcr0Eq0Cq1Ga9BatA6tRxvQRrQJbUZb0Fa0Df2HtqMdaCfahXajPWgv2of2owPoIDqEDqMj6Cg6ho6jE+gkOoVOozPoLDqHzqML6CK6hC6jK+gquoauoxvoJrqFbqM76C5KQPfQffQAPUSJ6BF6jJ6gp+gZeo5eoJfoFXqN3qC36B16jz6gj+gT+oy+oK/oG/qOfqCf6Bf6jf6gv+gfSkJxsWSx5LEUsZSxVLHUsTSxtLF0sfSxDLGMsUyx+FjmWJZY1li2WPZYjljOWK5Y7lieWN5Yvlj+WIFYwVihWOFYkVjRWLFY8ViJWMlYqVjpWJlY2Vi5WPlYhVjFWKVY5ViVWNVYtVj1WI1YzRgWw2NEjIxRMTrGxNgYF+NjQkyMSTE5psTUmBbTY0bMjFkxO+bE3JgX82NBLIxFMRCDMRT7nwR4DMybWwAA3Hmdjc62bXvf2CUnPnFO
zHe2bdu2bdu2baP3x30wDMcARmAkRmE0xmAsxmEQ4zEBEzEJkzEFUzENQ5iOGZiJWZiNOZiLeZiPBViIRVgM6471wHpivbDeWB+sL9YP648NwAZig7DB2BBsKDYMG46NwEZio7DR2BhsLDYOG49NwCZik7DJ2BRsKjYNm47NwGZis7DZ2BxsLjYPm48twBZii7DF2BJsKbYMW46twFZiq7DV2BpsLbYOW49twDZim7DN2BZsK7YN247twHZiu7Dd2B5sL7YP248dwA5ih7DD2BHsKHYMO46dwE5ip7DT2BnsLHYOO49dwC5il7DL2BXsKnYNu47dwG5it7Db2B3sLnYPu489wB5ij7DH2BPsKfYMe469wF5ir7DX2BvsLfYOe499wD5in7DP2BfsK/YN+479wH5iv7Df2B/sL/YPS8Li8GR4cjwFnhJPhafG0+Bp8Xg8HZ4ez4BnxDPhmfEseFY8G54dz4HnxHPhufE8eF48Ac+H58cL4AXxQnhhvAheFC+GF8dL4CXxUnhpvAxeFi+Hl8cr4BXxSnhlvApeFa+GV8dr4DXxWnhtvA5eF6+H18cb4A3xRnhjvAneFG+GN8db4C3xVnhrvA3eFm+Ht8f/wzvgHfFOeGe8C94VT8S74RiO4wAncBKncBpncBbncIjzuICLuITLuIKruIYjXMcN3MQt3MYd3MU93McDPMQjPIZ3x3vgPfFeeG+8D94X74f3xwfgA/FB+GB8CD4UH4YPx0fgI/FR+Gh8DD4WH4ePxyfgE/FJ+GR8Cj4Vn4ZPx2fgM/FZ+Gx8Dj4Xn4fPxxfgC/FF+GJ8Cb4UX4Yvx1fgK/FV+Gp8Db4WX4evxzfgG/FN+GZ8C74V34Zvx3fgO/Fd+G58D74X34fvxw/gB/FD+GH8CH4UP4Yfx0/gJ/FT+Gn8DH4WP4efxy/gF/FL+GX8Cn4Vv4Zfx2/gN/Fb+G38Dn4Xv4ffxx/gD/FH+GP8Cf4Uf4Y/x1/gL/FX+Gv8Df4Wf4e/xz/gH/FP+Gf8C/4V/4Z/x3/gP/Ff+G/8D/4X/4cn4XEgGUgOUoCUIBVIDdKAtCAepAPpQQaQEWQCmUEWkBVkA9lBDpAT5AK5QR6QFySAfCA/KAAKgkKgMCgCioJioDgoAUqCUqA0KAPKgnKgPKgAKoJKoDKoAqqCaqA6qAFqglqgNqgD6oJ6oD5oABqCRqAxaAKagmagOWgBWoJWoDVoA9qCdqA9+A90AB1BJ9AZdAFdQSLoBjCAAwAIQAIK0IABLOAABDwQgAgkIAMFqEADCOjAACawgA0c4AIP+CAAIYhADHQHPUBP0Av0Bn1AX9AP9AcDwEAwCAwGQ8BQMAwMByPASDAKjAZjwFgwDowHE8BEMAlMBlPAVDANTAczwEwwC8wGc8BcMA/MBwvAQrAILAZLwFKwDCwHK8BKsAqsBmvAWrAOrAcbwEawCWwGW8BWsA1sBzvATrAL7AZ7wF6wD+wHB8BBcAgcBkfAUXAMHAcnwElwCpwGZ8BZcA6cBxfARXAJXAZXwFVwDVwHN8BNcAvcBnfAXXAP3AcPwEPwCDwGT8BT8Aw8By/AS/AKvAZvwFvwDrwHH8BH8Al8Bl/AV/ANfAc/wE/wC/wGf8Bf8A8kgTgiGZGcSEGkJFIRqYk0RFoinkhHpCcyEBmJTERmIguRlchGZCdyEDmJXERuIg+Rl0gg8hH5iQJEQaIQUZgoQhQlihHFiRJESaIUUZooQ5QlyhHliQpERaISUZmoQlQlqhHViRpETaIWUZuoQ9Ql6hH1iQZEQ6IR0ZhoQjQlmhHNiRZES6IV0ZpoQ7Ql2hHtif+IDkRHohPRmehCdCUSiW4ERuAEIAiCJCiCJhiCJTgCEjwhECIhETKhECqhEYjQCYMwCYuwCYdwCY/wiYAIiYiIEd2JHkRPohfRm+hD9CX6Ef2JAcRAYhAxmBhCDCWGEcOJEcRIYhQxmhhDjCXGEeOJCcREYhIxmZhCTCWmEdOJGcRMYhYx
m5hDzCXmEfOJBcRCYhGxmFhCLCWWEcuJFcRKYhWxmlhDrCXWEeuJDcRGYhOxmdhCbCW2EduJHcROYhexm9hD7CX2EfuJA8RB4hBxmDhCHCWOEceJE8RJ4hRxmjhDnCXOEeeJC8RF4hJxmbhCXCWuEdeJG8RN4hZxm7hD3CXuEfeJB8RD4hHxmHhCPCWeEc+JF8RL4hXxmnhDvCXeEe+JD8RH4hPxmfhCfCW+Ed+JH8RP4hfxm/hD/CX+EUlEHJmMTE6mIFOSqcjUZBoyLRlPpiPTkxnIjGQmMjOZhcxKZiOzkznInGQuMjeZh8xLJpD5yPxkAbIgWYgsTBYhi5LFyOJkCbIkWYosTZYhy5LlyPJkBbIiWYmsTFYhq5LVyOpkDbImWYusTdYh65L1yPpkA7Ih2YhsTDYhm5LNyOZkC7Il2YpsTbYh25LtSJSYlNSB7Eh2IjuTXciuZCLZjcRInAQkQZIkRdIkQ7IkR0KSJwVSJCVSJhVSJTUSkTppkCZpkTbpkC7pkT4ZkCEZkTGyO9mD7En2InuTfci+ZD+yPzmAHEgOIgeTQ8ih5DByODmCHEmOIkeTY8ix5DhyPDmBnEhOIieTU8ip5DRyOjmDnEnOImeTc8i55DxyPrmAXEguIheTS8il5DJyObmCXEmuIleTa8i15DpyPbmB3EhuIjeTW8it5DZyO7mD3EnuIneTe8i95D5yP3mAPEgeIg+TR8ij5DHyOHmCPEmeIk+TZ8iz5DnyPHmBvEheIi+TV8ir5DXyOnmDvEneIm+Td8i75D3yPvmAfEg+Ih+TT8in5DPyOfmCfEm+Il+Tb8i35DvyPfmB/Eh+Ij+TX8iv5DfyO/mD/En+In+Tf8i/5D8yiYyjklHJqRRUSioVlZpKQ6Wl4ql0VHoqA5WRykRlprJQWalsVHYqB5WTykXlpvJQeakEKh+VnypAFaQKUYWpIlRRqhhVnCpBlaRKUaWpMlRZqhxVnqpAVaQqUZWpKlRVqhpVnapB1aRqUbWpOlRdqh5Vn2pANaQaUY2pJlRTqhnVnGpBtaRaUa2pNlRbqh3VnvqP6kB1pDpRnakuVFcqkepGYRROAYqgSIqiaIqhWIqjIMVTAiVSEiVTCqVSGoUonTIok7Iom3Iol/IonwqokIqoGNWd6kH1pHpRvak+VF+qH9WfGkANpAZRg6kh1FBqGDWcGkGNpEZRo6kx1FhqHDWemkBNpCZRk6kp1FRqGjWdmkHNpGZRs6k51FxqHjWfWkAtpBZRi6kl1FJqGbWcWkGtpFZRq6k11FpqHbWe2kBtpDZRm6kt1FZqG7Wd2kHtpHZRu6k91F5qH7WfOkAdpA5Rh6kj1FHqGHWcOkGdpE5Rp6kz1FnqHHWeukBdpC5Rl6kr1FXqGnWdukHdpG5Rt6k71F3qHnWfekA9pB5Rj6kn1FPqGfWcekG9pF5Rr6k31FvqHfWe+kB9pD5Rn6kv1FfqG/Wd+kH9pH5Rv6k/1F/qH5VExdHJ6OR0CjolnYpOTaeh09LxdDo6PZ2BzkhnojPTWeisdDY6O52DzknnonPTeei8dAKdj85PF6AL0oXownQRuihdjC5Ol6BL0qXo0nQZuixdji5PV6Ar0pXoynQVuipdja5O16Br0rXo2nQdui5dj65PN6Ab0o3oxnQTuindjG5Ot6Bb0q3o1nQbui3djm5P/0d3oDvSnejOdBe6K51Id6MxGqcBTdAkTdE0zdAszdGQ5mmBFmmJlmmFVmmNRrROG7RJW7RNO7RLe7RPB3RIR3SM7k73oHvSvejedB+6L92P7k8PoAfSg+jB9BB6KD2MHk6PoEfSo+jR9Bh6LD2OHk9PoCfSk+jJ9BR6Kj2Nnk7PoGfSs+jZ9Bx6Lj2Pnk8voBfSi+jF9BJ6Kb2MXk6voFfSq+jV9Bp6Lb2OXk9voDfSm+jN9BZ6K72N3k7voHfSu+jd9B56L72P3k8foA/Sh+jD9BH6KH2MPk6foE/Sp+jT9Bn6LH2OPk9f
oC/Sl+jL9BX6Kn2Nvk7foG/St+jb9B36Ln2Pvk8/oB/Sj+jH9BP6Kf2Mfk6/oF/Sr+jX9Bv6Lf2Ofk9/oD/Sn+jP9Bf6K/2N/k7/oH/Sv+jf9B/6L/2PTqLjmGRMciYFk5JJxaRm0jBpmXgmHZOeycBkZDIxmZksTFYmG5OdycHkZHIxuZk8TF4mgcnH5GcKMAWZQkxhpghTlCnGFGdKMCWZUkxppgxTlinHlGcqMBWZSklJTBWmKlONqc7UYGoytZjaTB2mLlOPqc80YBoyjZjGTBOmKdOMac60YFoyrZjWTBumLdOOac/8x3RgOjKdmM5MF6Yrk8h0YzAGZwBDMCRDMTTDMCzDMZDhGYERGYmRGYVRGY1BjM4YjMlYjM04jMt4jM8ETMhETIzpzvRgejK9mN5MH6Yv04/pzwxgBjKDmMHMEGYoM4wZzoxgRjKjmNHMGGYsM44Zz0xgJjKTmMnMFGYqM42ZzsxgZjKzmNnMHGYuM4+ZzyxgFjKLmMXMEmYps4xZzqxgVjKrmNXMGmYts45Zz2xgNjKbmM3MFmYrs43ZzuxgdjK7mN3MHmYvs4/ZzxxgDjKHmMPMEeYoc4w5zpxgTjKnmNPMGeYsc445z1xgLjKXmMvMFeYqc425ztxgbjK3mNvMHeYuc4+5zzxgHjKPmMfME+Yp84x5zrxgXjKvmNfMG+Yt8455z3xgPjKfmM/MF+Yr8435zvxgfjK/mN/MH+Yv849JYuLYZGxyNgWbkk3FpmbTsGnZeDYdm57NwGZkM7GZ2SxsVjYbm53NweZkc7G52TxsXjaBzcfmZwuwBdlCbGG2CFuULcYWZ0uwJdlSbGm2DFuWLceWZyuwFdlKbGW2CluVrcZWZ2uwNdlabG22DluXrcfWZxuwDdlGbGO2CduUbcY2Z1uwLdlWbGu2DduWbce2Z/9jO7Ad2U5sZ7YL25VNZLuxGIuzgCVYkqVYmmVYluVYyPKswIqsxMqswqqsxiJWZw3WZC3WZh3WZT3WZwM2ZCM2xnZne7A92V5sb7YP25ftx/ZnB7AD2UHsYHYIO5Qdxg5nR7Aj2VHsaHYMO5Ydx45nJ7AT2UnsZHYKO5Wdxk5nZ7Az2VnsbHYOO5edx85nF7AL2UXsYnYJu5Rdxi5nV7Ar2VXsanYNu5Zdx65nN7Ab2U3sZnYLu5Xdxm5nd7A72V3sbnYPu5fdx+5nD7AH2UPsYfYIe5Q9xh5nT7An2VPsafYMe5Y9x55nL7AX2UvsZfYKe5W9xl5nb7A32VvsbfYOe5e9x95nH7AP2UfsY/YJ+5R9xj5nX7Av2Vfsa/YN+5Z9x75nP7Af2U/sZ/YL+5X9xn5nf7A/2V/sb/YP+5f9xyaxcVwyLjmXgkvJpeJSc2m4tFw8l45Lz2XgMnKZuMxcFi4rl43LzuXgcnK5uNxcHi4vl8Dl4/JzBbiCXCGuMFeEK8oV44pzJbiSXCmuNFeGK8uV48pzFbiKXCWuMleFq8pV46pzNbiaXC2uNleHq8vV4+pzDbiGXCOuMdeEa8o145pzLbiWXCuuNdeGa8u149pz/3EduI5cJ64z14XryiVy3TiMwznAERzJURzNMRzLcRzkeE7gRE7iZE7hVE7jEKdzBmdyFmdzDudyHudzARdyERfjunM9uJ5cL64314fry/Xj+nMDuIHcIG4wN4Qbyg3jhnMjuJHcKG40N4Yby43jxnMTuIncJG4yN4Wbyk3jpnMzuJncLG42N4eby83j5nMLuIXcIm4xt4Rbyi3jlnMruJXcKm41t4Zby63j1nMbuI3cJm4zt4Xbym3jtnM7uJ3cLm43t4fby+3j9nMHuIPcIe4wd4Q7yh3jjnMnuJPcKe40d4Y7y53jznMXuIvcJe4yd4W7yl3jrnM3uJvcLe42d4e7y93j7nMPuIfcI+4x94R7yj3jnnMvuJfcK+4194Z7y73j3nMfuI/cJ+4z94X7yn3jvnM/uJ/cL+4394f7y/3jkrg4mAwmhylgSpgK
poZpYFoYD9PB9DADzAgzwcwwC8wKs8HsMAfMCXPB3DAPzAsTYD6YHxaABWEhWBgWgUVhMVgcloAlYSlYGpaBZWE5WB5WgBVhJVgZVoFVYTVYHdaANWEtWBvWgXVhPVgfNoANYSPYGDaBTWEz2By2gC1hK9gatoFtYTvYHv4HO8COsBPsDLvArjARdoMYxCGABCQhBWnIQBZyEEIeClCEEpShAlWoQQR1aEATWtCGDnShB30YwBBGMAa7wx6wJ+wFe8M+sC/sB/vDAXAgHAQHwyFwKBwGh8MRcCQcBUfDMXAsHAfHwwlwIpwEJ8MpcCqcBqfDGXAmnAVnwzlwLpwH58MFcCFcBBfDJXApXAaXwxVwJVwFV8M1cC1cB9fDDXAj3AQ3wy1wK9wGt8MdcCfcBXfDPXAv3Af3wwPwIDwED8Mj8Cg8Bo/DE/AkPAVPwzPwLDwHz8ML8CK8BC/DK/AqvAavwxvwJrwFb8M78C68B+/DB/AhfAQfwyfwKXwGn8MX8CV8BV/DN/AtfAffww/wI/wEP8Mv8Cv8Br/DH/An/AV/wz/wL/wHk2Acn4xPzqfgU/Kp+NR8Gj4tH8+n49PzGfiMfCY+M5+Fz8pn47PzOficfC4+N5+Hz8sn8Pn4/HwBviBfiC/MF+GL8sX44nwJviRfii/Nl+HL8uX48nwFviJfia/MV+Gr8tX46nwNviZfi6/N1+Hr8vX4+nwDviHfiG/MN+Gb8s345nwLviXfim/Nt+Hb8u349vx/fAe+I9+J78x34bvyiXw3HuNxHvAET/IUT/MMz/IcD3meF3iRl3iZV3iV13jE67zBm7zF27zDu7zH+3zAh3zEx/jufA++J9+L78334fvy/fj+/AB+ID+IH8wP4Yfyw/jh/Ah+JD+KH82P4cfy4/jx/AR+Ij+Jn8xP4afy0/jp/Ax+Jj+Ln83P4efy8/j5/AJ+Ib+IX8wv4Zfyy/jl/Ap+Jb+KX82v4dfy6/j1/AZ+I7+J38xv4bfy2/jt/A5+J7+L383v4ffy+/j9/AH+IH+IP8wf4Y/yx/jj/An+JH+KP82f4c/y5/jz/AX+In+Jv8xf4a/y1/jr/A3+Jn+Lv83f4e/y9/j7/AP+If+If8w/4Z/yz/jn/Av+Jf+Kf82/4d/y7/j3/Af+I/+J/8x/4b/y3/jv/A/+J/+L/83/4f/y//gkPk5IJiQXUggphVRCaiGNkFaIF9IJ6YUMQkYhk5BZyCJkFbIJ2YUcQk4hl5BbyCPkFRKEfEJ+oYBQUCgkFBaKCEWFYkJxoYRQUigllBbKCGWFckJ5oYJQUagkVBaqCFWFakJ1oYZQU6gl1BbqCHWFekJ9oYHQUGgkNBaaCE2FZkJzoYXQUmgltBbaCG2FdkJ74T+hg9BR6CR0FroIXYVEoZuACbgABEIgBUqgBUZgBU6AAi8IgihIgiwogipoAhJ0wRBMwRJswRFcwRN8IRBCIRJiQnehh9BT6CX0FvoIfYV+Qn9hgDBQGCQMFoYIQ4VhwnBhhDBSGCWMFsYIY4VxwnhhgjBRmCRMFqYIU4VpwnRhhjBTmCXMFuYIc4V5wnxhgbBQWCQsFpYIS4VlwnJhhbBSWCWsFtYIa4V1wnphg7BR2CRsFrYIW4VtwnZhh7BT2CXsFvYIe4V9wn7hgHBQOCQcFo4IR4VjwnHhhHBSOCWcFs4IZ4VzwnnhgnBRuCRcFq4IV4VrwnXhhnBTuCXcFu4Id4V7wn3hgfBQeCQ8Fp4IT4VnwnPhhfBSeCW8Ft4Ib4V3wnvhg/BR+CR8Fr4IX4Vvwnfhh/BT+CX8Fv4If4V/QpIQJyYTk4spxJRiKjG1mEZMK8aL6cT0YgYxo5hJzCxmEbOK2cTsYg4xp5hLzC3mEfOKCWI+Mb9YQCwoFhILi0XEomIxsbhYQiwplhJLi2XEsmI5sbxYQawoVhIri1XEqmI1sbpYQ6wp1hJri3XEumI9sb7YQGwoNhIbi03EpmIzsbnYQmwpthJbi23EtmI7
sb34n9hB7Ch2EjuLXcSuYqLYTcREXAQiIZIiJdIiI7IiJ0KRFwVRFCVRFhVRFTURibpoiKZoibboiK7oib4YiKEYiTGxu9hD7Cn2EnuLfcS+Yj+xvzhAHCgOEgeLQ8Sh4jBxuDhCHCmOEkeLY8Sx4jhxvDhBnChOEieLU8Sp4jRxujhDnCnOEmeLc8S54jxxvrhAXCguEheLS8Sl4jJxubhCXCmuEleLa8S14jpxvbhB3ChuEjeLW8St4jZxu7hD3CnuEneLe8S94j5xv3hAPCgeEg+LR8Sj4jHxuHhCPCmeEk+LZ8Sz4jnxvHhBvCheEi+LV8Sr4jXxunhDvCneEm+Ld8S74j3xvvhAfCg+Eh+LT8Sn4jPxufhCfCm+El+Lb8S34jvxvfhB/Ch+Ej+LX8Sv4jfxu/hD/Cn+En+Lf8S/4j8xSYyTkknJpRRSSimVlFpKI6WV4qV0Unopg5RRyiRllrJIWaVsUnYph5RTyiXllvJIeaUEKZ+UXyogFZQKSYWlIlJRqZhUXCohlZRKSaWlMlJZqZxUXqogVZQqSZWlKlJVqZpUXaoh1ZRqSbWlOlJdqZ5UX2ogNZQaSY2lJlJTqZnUXGohtZRaSa2lNlJbqZ3UXvpP6iB1lDpJnaUuUlcpUeomYRIuAYmQSImSaImRWImToMRLgiRKkiRLiqRKmoQkXTIkU7IkW3IkV/IkXwqkUIqkmNRd6iH1lHpJvaU+Ul+pn9RfGiANlAZJg6Uh0lBpmDRcGiGNlEZJo6Ux0lhpnDRemiBNlCZJk6Up0lRpmjRdmiHNlGZJs6U50lxpnjRfWiAtlBZJi6Ul0lJpmbRcWiGtlFZJq6U10lppnbRe2iBtlDZJm6Ut0lZpm7Rd2iHtlHZJu6U90l5pn7RfOiAdlA5Jh6Uj0lHpmHRcOiGdlE5Jp6Uz0lnpnHReuiBdlC5Jl6Ur0lXpmnRduiHdlG5Jt6U70l3pnnRfeiA9lB5Jj6Un0lPpmfRceiG9lF5Jr6U30lvpnfRe+iB9lD5Jn6Uv0lfpm/Rd+iH9lH5Jv6U/0l/pn5QkxcnJ5ORyCjmlnEpOLaeR08rxcjo5vZxBzihnkjPLWeSscjY5u5xDzinnknPLeeS8coKcT84vF5ALyoXkwnIRuahcTC4ul5BLyqXk0nIZuaxcTi4vV5ArypXkynIVuapcTa4u15BryrXk2nIdua5cT64vN5Abyo3kxnITuancTG4ut5Bbyq3k1nIbua3cTm4v/yd3kDvKneTOche5q5wod5MxGZeBTMikTMm0zMiszMlQ5mVBFmVJlmVFVmVNRrIuG7IpW7ItOzKM+79QjuSY3F3uIfeUe8m95T5yX7mf3F8eIA+UB8mD5SHyUHmYPFweIY+UR8mj5THyWHmcPF6eIE+UJ8mT5SnyVHmaPF2eIc+UZ8mz5TnyXHmePF9eIC+UF8mL5SXyUnmZvFxeIa+UV8mr5TXyWnmdvF7eIG+UN8mb5S3yVnmbvF3eIe+Ud8m75T3yXnmfvF8+IB+UD8mH5SPyUfmYfFw+IZ+UT8mn5TPyWfmcfF6+IF+UL8mX5SvyVfmafF2+Id+Ub8m35TvyXfmefF9+ID+UH8mP5SfyU/mZ/Fx+Ib+UX8mv5TfyW/md/F7+IH+UP8mf5S/yV/mb/F3+If+Uf8m/5T/yX/mfnCTHKcmU5EoKJaWSSkmtpFHSKvFKOiW9kkHJqGRSMitZlKxKNiW7kkPJqeRScit5lLxKgpJPya8UUAoqhZTCShGlqFJMKa6UUEoqpZTSShmlrFJOKa9UUCoqlZTKShWlqlJNqa7UUGoqtZTaSh2lrlJPqa80UBoqjZTGShOlqdJMaa60UFoqrZTWShulrdJOaa/8p3RQOiqdlM5KF6Wrkqh0UzAFV4BCKKRCKbTCKKzCKVDhFUERFUmRFUVRFU1Biq4YiqlYiq04iqt4iq8ESqhESkzprvRQeiq9lN5KH6Wv0k/prwxQBiqDlMHKEGWoMkwZroxQ
RiqjlNHKGGWsMk4Zr0xQJiqTlMnKFGWqMk2ZrsxQZiqzlNnKHGWuMk+ZryxQFiqLlMXKEmWpskxZrqxQViqrlNXKGmWtsk5Zr2xQNiqblM3KFmWrsk3ZruxQdiq7lN3KHmWvsk/ZrxxQDiqHlMPKEeWockw5rpxQTiqnlNPKGeWsck45r1xQLiqXlMvKFeWqck25rtxQbiq3lNvKHeWuck+5rzxQHiqPlMfKE+Wp8kx5rrxQXiqvlNfKG+Wt8k55r3xQPiqflM/KF+Wr8k35rvxQfiq/lN/KH+Wv8k9JUuLUZGpyNYWaUk2lplbTqGnVeDWdml7NoGZUM6mZ1SxqVjWbml3NoeZUc6m51TxqXjVBzafmVwuoBdVCamG1iFpULaYWV0uoJdVSamm1jFpWLaeWVyuoFdVKamW1ilpVraZWV2uoNdVaam21jlpXrafWVxuoDdVGamO1idpUbaY2V1uoLdVWamu1jdpWbae2V/9TO6gd1U5qZ7WL2lVNVLupmIqrQCVUUqVUWmVUVuVUqPKqoIqqpMqqoqqqpiJVVw3VVC3VVh3VVT3VVwM1VCM1pnZXe6g91V5qb7WP2lftp/ZXB6gD1UHqYHWIOlQdpg5XR6gj1VHqaHWMOlYdp45XJ6gT1UnqZHWKOlWdpk5XZ6gz1VnqbHWOOledp85XF6gL1UXqYnWJulRdpi5XV6gr1VXqanWNulZdp65XN6gb1U3q5oQt6lZ1m7pd3aHuVHepu9U96l51n7pfPaAeVA+ph9Uj6lH1mHpcPaGeVE+pp9Uz6ln1nHpevaBeVC+pl9Ur6lX1mnpdvaHeVG+pt9U76l31nnpffaA+VB+pj9Un6lP1mfpcfaG+VF+pr9U36lv1nfpe/aB+VD+pn9Uv6lf1m/pd/aH+VH+pv9U/6l/1n5qkxmnJtORaCi2llkpLraXR0mrxWjotvZZBy6hl0jJrWbSsWjYtu5ZDy6nl0nJrebS8WoKWT8uvFdAKaoW0wloRrahWTCuuldBKaqW00loZraxWTiuvVdAqapW0yloVrapWTauu1dBqarW02lodra5WT6uvNdAaao20xloTranWTGuutdBaaq201lobra3WTmuv/ad10DpqnbTOWhetq5aoddMwDdeARmikRmm0xmisxmlQ4zVBEzVJkzVFUzVNQ5quGZqpWZqtOZqreZqvBVqoRVpM66710HpqvbTeWh+tr9ZP668N0AZqg7TB2hBtqDZMG66N0EZqo7TR8WO0sdo4bbw2QZuoTdIma1O0qdo0bbo2Q5upzdJma3O0udo8bb62QFuoLdIWa0u0pdoybbm2QluprdJWa2u0tdo6bb22QduobdI2a1u0rdo2bbu2Q9up7dJ2a3u0vdo+bb92QDuoHdIOa0e0o9ox7bh2QjupndJOa2e0s9o57bx2QbuoXdIua1e0q9o17bp2Q7up3dJua3e0u9o97b72QHuoPdIea0+0p9oz7bn2QnupvdJea2+0t9o77b32QfuofdI+a1+0r9o37bv2Q/up/dJ+a3+0v9o/LUmLQ8lQcpQCpUSpUGqUBqVF8SgdSo8yoIwoE8qMsqCsKBvKjnKgnCgXyo3yoLwoAeVD+VEBVBAVQoVREVQUFUPFUQlUEpVCpVEZVBaVQ+VRBVQRVUKVURVUFVVD1VENVBPVQrVRHVQX1UP1UQPUEDVCjVET1BQ1Q81RC9QStUKtURvUFrVD7dF/qAPqiDqhzqgL6ooSUTeEIRwBRCASUYhGDGIRhyDikYBEJCEZKUhFGkJIRwYykYVs5CAXechHAQpRhGKoO+qBeqJeqDfqg/qifqg/GoAGokFoMBqChqJhaDgagUaiUWg0GoPGonFoPJqAJqJJaDKagqaiaWg6moFmolloNpqD5qJ5aD5agBaiRWgxWoKWomVoOVqBVqJVaDVag9aidWg92oA2ok1oM9qCtqJtaDvagXaiXWg32oP2on1oPzqADqJD6DA6go6iY+g4OoFOolPo
NDqDzqJz6Dy6gC6iS+gyuoKuomvoOrqBbqJb6Da6g+6ie+g+eoAeokfoMXqCnqJn6Dl6gV6iV+g1eoPeonfoPfqAPqJP6DP6gr6ib+g7+oF+ol/oN/qD/qJ/KAnF6cn05HoKPaWeSk+tp9HT6vF6Oj29nkHPqGfSM+tZ9Kx6Nj27nkPPqefSc+t59Lx6gp5Pz68X0AvqhfTCehG9qF5ML66X0EvqpfTSehm9rF5OL69X0CvqlfTKehW9ql5Nr67X0GvqtfTaeh29rl5Pr6830BvqjfTGehO9qd5Mb6630FvqrfTWehu9rd5Ob6//p3fQO+qd9M56F72rnqh30zEd14FO6KRO6bTO6KzO6VDndUEXdUmXdUVXdU1Huq4buqlbuq07uqt7uq8HeqhHekzvrvfQe+q99N56H72v3k/vrw/QB+qD9MH6EH2oPkwfro/QR+qj9NH6GH2sPk4fr0/QJ+qT9Mn6FH2qPk2frs/QZ+qz9Nn6HH2uPk+fry/QF+qL9MX6En2pvkxfrq/QV+qr9NX6Gn2tvk5fr2/QN+qb9M36Fn2rvk3fru/Qd+q79N36Hn2vvk/frx/QD+qH9MP6Ef2ofkw/rp/QT+qn9NP6Gf2sfk4/r1/QL+qX9Mv6Ff2qfk2/rt/Qb+q39Nv6Hf2ufk+/rz/QH+qP9Mf6E/2p/kx/rr/QX+qv9Nf6G/2t/k5/r3/QP+qf9M/6F/2r/k3/rv/Qf+q/9N/6H/2v/k9P0uOMZEZyI4WR0khlpDbSGGmNeCOdkd7IYGQ0MhmZjSxGViObkd3IYeQ0chm5jTxGXiPByGfkNwoYBY1CRmGjiFHUKGYUN0oYJY1SRmmjjFHWKGeUNyoYFY1KRmWjilHVqGZUN2oYNY1aRm2jjlHXqGfUNxoYDY1GRmOjidHUaGY0N1oYLY1WRmujjdHWaGe0N1LHdTA6Gp2MzkYXo6uRaHQzMAM3gEEYpEEZtMEYrMEZ0OANwRANyZANxVANzUCGbhiGaViGbTiGa3iGbwRGaERGzOhu9DB6Gr2M3kYfo6/Rz+hvDDAGGoOMwcYQY6gxzBhujDBGGqOM0cYYY6wxzhhvTDAmGpOMycYUY6oxzZhuzDBmGrOM2cYcY64xz5hvLDAWGouMxcYSY6mxzFhurDBWGquM1cYaY62xzlhvbDA2GpuMzcYWY6uxzdhu7DB2GruM3cYeY6+xz9hvHDAOGoeMw8YR46hxzDhunDBOGqeM08YZ46xxzjhvXDAuGpeMy8YV46pxzbhu3DBuGreM28Yd465xz7hvPDAeGo+Mx8YT46nxzHhuvDBeGq+M18Yb463xznhvfDA+Gp+Mz8YX46vxzfhu/DB+Gr+M38Yf46/xz0gy4sxkZnIzhZnSTGWmNtOYac14M52Z3sxgZjQzmZnNLGZWM5uZ3cxh5jRzmbnNPGZeM8HMZ+Y3C5gFzUJmYbOIWdQsZhY3S5glzVJmabOMWdYsZ5Y3K5gVzUpmZbOKWdWsZlY3a5g1zVpmbbOOWdesZ9Y3G5gNzUZmY7OJ2dRsZjY3W5gtzVZma7ON2dZsZ7Y3/zM7mB3NTmZns4vZ1Uw0u5mYiZvAJEzSpEzaZEzW5Exo8qZgiqZkyqZiqqZmIlM3DdM0LdM2HdM1PdM3AzM0IzNmdjd7mD3NXmZvs4/Z1+xn9jcHmAPNQeZgc4g51BxmDjdHmCPNUeZoc4w51hxnjjcnmBPNSeZkc4o51ZxmTjdnmDPNWeZsc44515xnzjcXmAvNReZic4m51FxmLjdXmCvNVeZqc4251lxnrjc3mBvNTeZmc4u51dxmbjd3mDvNXeZuc4+519xn7jcPmAfNQ+Zh84h51DxmHjdPmCfNU+Zp84x51jxnnjcvmBfNS+Zl84p51bxmXjdvmDfNW+Zt845517xn3jcfmA/NR+Zj84n51HxmPjdfmC/NV+Zr84351nxnvjc/mB/NT+Zn84v51fxmfjd/mD/NX+Zv84/51/xnJplxVjIruZXC
SmmlslJbaay0VryVzkpvZbAyWpmszFYWK6uVzcpu5bByWrms3FYeK6+VYOWz8lsFrIJWIauwVcQqahWzilslrJJWKau0VcYqa5WzylsVrIpWJauyVcWqalWzqls1rJpWLau2Vceqa9Wz6lsNrIZWI6ux1cRqajWzmlstrJZWK6u11cZqa7Wz2lv/WR2sjlYnq7PVxepqJVrdLMzCLWARFmlRFm0xFmtxFrR4S7BES7JkS7FUS7OQpVuGZVqWZVuO5Vqe5VuBFVqRFbO6Wz2snlYvq7fVx+pr9bP6WwOsgdYga7A1xBpqDbOGWyOskdYoa7Q1xhprjbPGWxOsidYka7I1xZpqTbOmWzOsmdYsa7Y1x5przbPmWwushdYia7G1xFpqLbOWWyusldYqa7W1xlprrbPWWxusjdYma7O1xdpqbbO2WzusndYua7e1x9pr7bP2Wwesg9Yh67B1xDpqHbOOWyesk9Yp67R1xjprnbPOWxesi9Yl67J1xbpqXbOuWzesm9Yt67Z1x7pr3bPuWw+sh9Yj67H1xHpqPbOeWy+sl9Yr67X1xnprvbPeWx+sj9Yn67P1xfpqfbO+Wz+sn9Yv67f1x/pr/bOSrDg7mZ3cTmGntFPZqe00dlo73k5np7cz2BntTHZmO4ud1c5mZ7dz2DntXHZuO4+d106w89n57QJ2QbuQXdguYhe1i9nF7RJ2SbuUXdouY5e1y9nl7Qp2RbuSXdmuYle1q9nV7Rp2TbuWXduuY9e169n17QZ2Q7uR3dhuYje1m9nN7RZ2S7uV3dpuY7e129nt7f/sDnZHu5Pd2e5id7UT7W42ZuM2sAmbtCmbthmbtTkb2rwt2KIt2bKt2Kqt2cjWbcM2bcu2bcd2bc/27cAO7ciO2d3tHnZPu5fd2+5j97X72f3tAfZAe5A92B5iD7WH2cPtEfZIe5Q92h5jj7XH2ePtCfZEe5I92Z5iT7Wn2dPtGfZMe5Y9255jz7Xn2fPtBfZCe5G92F5iL7WX2cvtFfZKe5W92l5jr7XX2evtDfZGe5O92d5ib7W32dvtHfZOe5e9295j77X32fvtA/ZB+5B92D5iH7WP2cftE/ZJ+5R92j5jn7XP2eftC/ZF+5J92b5iX7Wv2dftG/ZN+5Z9275j37Xv2fftB/ZD+5H92H5iP7Wf2c/tF/ZL+5X92n5jv7Xf2e/tD/ZH+5P92f5if7W/2d/tH/ZP+5f92/5j/7X/2Ul2nJPMSe6kcFI6qZzUThonrRPvpHPSOxmcjE4mJ7OTxcnqZHOyOzmcnE4uJ7eTx8nrJDj5nPxOAaegU8gp7BRxijrFnOJOCaekU8op7ZRxyjrlnPJOBaeiU8mp7FRxqjrVnOpODaemU8up7dRx6jr1nPpOA6eh08hp7DRxmjrNnOZOC6el08pp7bRx2jrtnPbOf04Hp6PTyensdHG6OolONwdzcAc4hEM6lEM7jMM6nAMd3hEc0ZEc2VEc1dEc5OiO4ZiO5diO47iO5/hO4IRO5MSc7k4Pp6fTy+nt9HH6Ov2c/s4AZ6AzyBnsDHGGOsOc4c4IZ6QzyhntjHHGOuOc8c4EZ6IzyZnsTHGmOtOc6c4MZ6Yzy5ntzHHmOvOc+c4CZ6GzyFnsLHGWOsuc5c4KZ6WzylntrHHWOuuc9c4GZ6OzydnsbHG2Otuc7c4OZ6ezy9nt7HH2Ovuc/c4B56BzyDnsHHGOOsec484J56RzyjntnHHOOuec884F56JzybnsXHGuOtec684N56Zzy7nt3HHuOvec+84D56HzyHnsPHGeOs+c584L56XzynntvHHeOu+c984H56PzyfnsfHG+Ot+c784P56fzy/nt/HH+Ov+cJCfOTeYmd1O4Kd1Ubmo3jZvWjXfTuendDG5GN5Ob2c3iZnWzudndHG5ON5eb283j5nUT3HxufreAW9At5BZ2i7hF3WJucbeEW9It5ZZ2y7hl3XJuebeCW9Gt5FZ2q7hV3WpudbeGW9Ot5dZ2
67h13XpufbeB29Bt5DZ2m7hN3WZuc7eF29Jt5bZ227ht3XZue/c/t4Pb0e3kdna7uF3dRLebi7m4C1zCJV3KpV3GZV3OhS7vCq7oSq7sKq7qai5ydddwTddybddxXddzfTdwQzdyY253t4fb0+3l9nb7uH3dfm5/d4A70B3kDnaHuEPdYe5wd4Q70h3ljnbHuGPdce54d4I70Z3kTnanuFPdae50d4Y7053lznbnuHPdee58d4G70F3kLnaXuEvdZe5yd4W70l3lrnbXuGvdde56d4O70d3kbna3uFvdbe52d4e7093l7nb3uHvdfe5+94B70D3kHnaPuEfdY+5x94R70j3lnnbPuGfdc+5594J70b3kXnavuFfda+5194Z7073l3nbvuHfde+5994H70H3kPnafuE/dZ+5z94X70n3lvnbfuG/dd+5794P70f3kfna/uF/db+5394f70/3l/nb/uH/df26SG+cl85J7KbyUXiovtZfGS+vFe+m89F4GL6OXycvsZfGyetm87F4OL6eXy8vt5fHyeglePi+/V8Ar6BXyCntFvKJeMa+4V8Ir6ZXySntlvLJeOa+8V8Gr6FXyKntVvKpeNa+6V8Or6dXyant1vLpePa++18Br6DXyGntNvKZeM6+518Jr6bXyWnttvLZeO6+995/XwevodfI6e128rl6i183DPNwDHuGRHuXRHuOxHudBj/cET/QkT/YUT/U0D3m6Z3imZ3m253iu53m+F3ihF3kxr7vXw+vp9fJ6e328vl4/r783wBvoDfIGe0O8od4wb7g3whvpjfJGe2O8sd44b7w3wZvoTfIme1O8qd40b7o3w5vpzfJme3O8ud48b763wFvoLfIWe0u8pd4yb7m3wlvprfJWe2u8td46b723wdvobfI2e1u8rd42b7u3w9vp7fJ2e3u8vd4+b793wDvoHfIOe0e8o94x77h3wjvpnfJOe2e8s94577x3wbvoXfIue1e8q94177p3w7vp3fJue3e8u9497773wHvoPfIee0+8p94z77n3wnvpvfJee2+8t9477733wfvoffI+e1+8r94377v3w/vp/fJ+e3+8v94/L8mL85P5yf0Ufko/lZ/aT+On9eP9dH56P4Of0c/kZ/az+Fn9bH52P4ef08/l5/bz+Hn9BD+fn98v4Bf0C/mF/SJ+Ub+YX9wv4Zf0S/ml/TJ+Wb+cX96v4Ff0K/mV/Sp+Vb+aX92v4df0a/m1/Tp+Xb+eX99v4Df0G/mN/SZ+U7+Z39xv4bf0W/mt/TZ+W7+d397/z+/gd/Q7+Z39Ln5XP9Hv5mM+7gOf8Emf8mmf8Vmf86HP+4Iv+pIv+4qv+pqPfN03fNO3fNt3fNf3fN8P/NCP/Jjf3e/h9/R7+b39Pn5fv5/f3x/gD/QH+YP9If5Qf5g/3B/hj/RH+aP9Mf5Yf5w/3p/gT/Qn+ZP9Kf5Uf5o/3Z/hz/Rn+bP9Of5cf54/31/gL/QX+Yv9Jf5Sf5m/3F/hr/RX+av9Nf5af52/3t/gb/Q3+Zv9Lf5Wf5u/3d/h7/R3+bv9Pf5ef5+/3z/gH/QP+Yf9I/5R/5h/3D/hn/RP+af9M/5Z/5x/3r/gX/Qv+Zf9K/5V/5p/3b/h3/Rv+bf9O/5d/55/33/gP/Qf+Y/9J/5T/5n/3H/hv/Rf+a/9N/5b/53/3v/gf/Q/+Z/9L/5X/5v/3f/h//R/+b/9P/5f/5+f5McFyYLkQYogZZAqSB2kCdIG8UG6IH2QIcgYZAoyB1mCrEG2IHuQI8gZ5ApyB3mCvEFCkC/IHxQICgaFgsJBkaBoUCwoHpQISgalgtJBmaBsUC4oH1QIKgaVgspBlaBqUC2oHtQIaga1gtpBnaBuUC+oHzQIGgaNgsZBk6Bp0CxoHrQIWgatgtZBm6Bt0C5oH/wXdAg6Bp2CzkGXoGuQGHQLsAAPQEAEZEAFdMAEbMAFMOADIRADKZADJVADLUCBHhiBGViBHTiBG3iBHwRB
GERBLOge9Ah6Br2C3kGfoG/QL+gfDAgGBoOCwcGQYGgwLBgejAhGBqOC0cGYYGwwLhgfTAgmBpOCycGUYGowLZgezAhmBrOC2cGcYG4wL5gfLAgWBouCxcGSYGmwLFgerAhWBquC1cGaYG2wLlgfbAg2BpuCzcGWYGuwLdge7Ah2BruC3cGeYG+wL9gfHAgOBoeCw8GR4GhwLDgenAhOBqeC08GZ4GxwLjgfXAguBpeCy8GV4GpwLbge3AhuBreC28Gd4G5wL7gfPAgeBo+Cx8GT4GnwLHgevAheBq+C18Gb4G3wLngffAg+Bp+Cz8GX4GvwLfge/Ah+Br+C38Gf4G/wL0gK4sJkYfIwRZgyTBWmDtOEacP4MF2YPswQZgwzhZnDLGHWMFuYPcwR5gxzhbnDPGHeMCHMF+YPC4QFw0Jh4bBIWDQsFhYPS4Qlw1Jh6bBMWDYsF5YPK4QVw0ph5bBKWDWsFlYPa4Q1w1ph7bBOWDesF9YPG4QNw0Zh47BJ2DRsFjYPW4Qtw1Zh67BN2DZsF7YP/ws7hB3DTmHnsEvYNUwMu4VYiIcgJEIypEI6ZEI25EIY8qEQiqEUyqESqqEWolAPjdAMrdAOndANvdAPgzAMozAWdg97hD3DXmHvsE/YN+wX9g8HhAPDQeHgcEg4NBwWDg9HhCPDUeHocEw4NhwXjg8nhBPDSeHkcEo4NZwWTg9nhDPDWeHscE44N5wXzg8XhAvDReHicEm4NFwWLg9XhCvDVeHqcE24NlwXrg83hBvDTeHmcEu4NdwWbg93hDvDXeHucE+4N9wX7g8PhAfDQ+Hh8Eh4NDwWHg9PhCfDU+Hp8Ex4NjwXng8vhBfDS+Hl8Ep4NbwWXg9vhDfDW+Ht8E54N7wX3g8fhA/DR+Hj8En4NHwWPg9fhC/DV+Hr8E34NnwXvg8/hB/DT+Hn8Ev4NfwWfg9/hD/DX+Hv8E/4N/wXJoVxUbIoeZQiShmlilJHaaK0UXyULkofZYgyRpmizFGWKGuULcoe5YhyRrmi3FGeKG+UEOWL8kcFooJRoahwVCQqGhWLikclopJRqah0VCYqG5WLykcVoopRpahyVCWqGlWLqkc1oppRrah2VCeqG9WL6kcNooZRo6hx1CRqGjWLmkctopZRq6h11CZqG7WL2kf/RR2ijlGnqHPUJeoaJUbdIizCIxARERlRER0xERtxEYz4SIjESIrkSInUSItQpEdGZEZWZEdO5EZe5EdBFEZRFIu6Rz2inlGvqHfUJ+ob9Yv6RwOigdGgaHA0JBoaDYuGRyOikdGoaHQ0JhobjYvGRxOiidGkaHI0JZoaTYumRzOimdGsaHY0J5obzYvmRwuihdGiaHG0JFoaLYuWRyuildGqaHW0JlobrYvWRxuijdGmaHO0JdoabYu2RzuindGuaHe0J9ob7Yv2Rweig9Gh6HB0JDoaHYuORyeik9Gp6HR0JjobnYvORxeii9Gl6HJ0JboaXYuuRzeim9Gt6HZ0J7ob3YvuRw+ih9Gj6HH0JHoaPYueRy+il9Gr6HX0JnobvYveRx+ij9Gn6HP0JfoafYu+Rz+in9Gv6Hf0J/ob/YuSorhYsljyWIpYyliqWOpYmljaWHwsXSx9LEMsYyxTLHMsSyxrLFsseyxHLGcsVyx3LE8sbywhli+WP1YgVjBWKFY4ViRWNFYsVjxWIlYyVipWOlYmVjZWLlY+ViFWMVYpVjlWJVY1Vi1WPVYjVjNWK1Y7VidWN1YvVj/WINYw1ijWONYk1jTWLNY81iLWMtYq1jrWJtY21i7WPvZfrEOsY6xTrHOsS6xrLDHWLYbF8Bj4Hw33uKA3swAAeLe2bdu2bXu3thXbTiYzeWvbtm3btu32/DnfjTxYb6wP1hfrhyVgiVh/bAA2EBuEDcaGYEOxYdhwbAQ2EhuFjcbGYGOxcdh4bAI2EZuETcamYFMxDMMxAiMxCqMxBmMxDuMxARMxCZMxBVMxDdMxAzMxC7MxB3Mx
D/OxAANYiEEMYREWw6Zh07EZ2ExsFjYbm4PNxeZh87EF2EJsEbYYW4ItxZZhy7EV2EpsFbYaW4OtxdZh67EN2EZsE7YZ24JtxbZh27Ed2E5sF7Yb24PtxfZh+7ED2EHsEHYYO4IdxY5hx7ET2EnsFHYaO4Odxc5h57EL2EXsEnYZu4Jdxa5h17Eb2E3sFnYbu4Pdxe5h97EH2EPsEfYYe4I9xZ5hz7EX2EvsFfYae4O9xd5h77EP2EfsE/YZ+4J9xb5h37Ef2E/sF/Yb+4P9xf5hcXg8ngRPiifDk+Mp8JR4Kjw1ngZPi6fD0+MZ8Ix4JjwzngXPimfDs+M58Jx4Ljw3ngfPi+fD8+MF8IJ4IbwwXgQvihfDi+Ml8JJ4Kbw0XgYvi5fDy+MV8Ip4JbwyXgWvilfDq+M18Jp4Lbw2Xgevi9fD6+MN8IZ4I7wx3gRvijfDm+Mt8JZ4K7w13gZvi7fD2+Md8I54J7wz3gXvinfDu+M98J54L7w33gfvi/fDE/BEvD8+AB+ID8IH40PwofgwfDg+Ah+Jj8JH42Pwsfg4fDw+AZ+IT8In41PwqTiG4ziBkziF0ziDsziH87iAi7iEy7iCq7iG67iBm7iF27iDuynicB8PcICHOMQRHuExfBo+HZ+Bz8Rn4bPxOfhcfB4+H1+AL8QX4YvxJfhSfBm+HF+Br8RX4avxNfhafB2+Ht+Ab8Q34ZvxLfhWfBu+Hd+B78R34bvxPfhefB++Hz+AH8QP4YfxI/hR/Bh+HD+Bn8RP4afxM/hZ/Bx+Hr+AX8Qv4ZfxK/hV/Bp+Hb+B38Rv4bfxO/hd/B5+H3+AP8Qf4Y/xJ/hT/Bn+HH+Bv8Rf4a/xN/hb/B3+Hv+Af8Q/4Z/xL/hX/Bv+Hf+B/8R/4b/xP/hf/B8eR8QTSYikRDIiOZGCSEmkIlITaYi0RDoiPZGByEhkIjITWYisRDYiO5GDyEnkInITeYi8RD4iP1GAKEgUIgoTRYiiRDGiOFGCKEmUIkoTZYiyRDmiPFGBqEhUIioTVYiqRDWiOlGDqEnUImoTdYi6RD2iPtGAaEg0IhoTTYimRDOiOdGCaEm0IloTbYi2RDuiPdGB6Eh0IjoTXYiuRDeiO9GD6En0InoTfYi+RD8igUgk+hMDiIHEIGIwMYQYSgwjhhMjiJHEKGI0MYYYS4wjxhMTiInEJGIyMYWYSmAEThAESVAETTAES3AETwiESEiETCiESmiEThiESViETTiES3iETwQEIEICEoiIiBgxjZhOzCBmErOI2cQcYi4xj5hPLCAWEouIxcQSYimxjFhOrCBWEquI1cQaYi2xjlhPbCA2EpuIzcQWYiuxjdhO7CB2EruI3cQeYi+xj9hPHCAOEoeIw8QR4ihxjDhOnCBOEqeI08QZ4ixxjjhPXCAuEpeIy8QV4ipxjbhO3CBuEreI28Qd4i5xj7hPPCAeEo+Ix8QT4inxjHhOvCBeEq+I18Qb4i3xjnhPfCA+Ep+Iz8QX4ivxjfhO/CB+Er+I38Qf4i/xj4gj48kkZFIyGZmcTEGmJFORqck0ZFoyHZmezEBmJDORmcksZFYyG5mdzEHmJHORuck8ZF4yH5mfLEAWJAuRhckiZFGyGFmcLEGWJEuRpckyZFmyHFmerEBWJCuRlckqZFWyGlmdrEHWJGuRtck6ZF2yHlmfbEA2JBuRjckmZFOyGdmcbEG2JFuRrck2ZFuyHdme7EB2JDuRnckuZFeyG9md7EH2JHuRvck+ZF+yH5lAJpL9yQHkQHIQOZgcQg4lh5HDyRHkSHIUOZocQ44lx5HjyQnkRHISOZmcQk4lMRInCZIkKZImGZIlOZInBVIkJVImFVIlNVInDdIkLdImHdIlPdInAxKQIQlJREZkjJxGTidnkDPJWeRscg45l5xHzicXkAvJReRicgm5lFxGLidXkCvJVeRqcg25llxHric3kBvJTeRmcgu5ldxGbid3kDvJXeRucg+5l9xH7icPkAfJQ+Rh
8gh5lDxGHidPkCfJU+Rp8gx5ljxHnicvkBfJS+Rl8gp5lbxGXidvkDfJW+Rt8g55l7xH3icfkA/JR+Rj8gn5lHxGPidfkC/JV+Rr8g35lnxHvic/kB/JT+Rn8gv5lfxGfid/kD/JX+Rv8g/5l/xHxlHxVBIqKZWMSk6loFJSqajUVBoqLZWOSk9loDJSmajMVBYqK5WNyk7loHJSuajcVB4qL5WPyk8VoApShajCVBGqKFWMKk6VoEpSpajSVBmqLFWOKk9VoCpSlajKVBWqKlWNqk7VoGpStajaVB2qLlWPqk81oBpSjajGVBOqKdWMak61oFpSrajWVBuqLdWOak91oDpSnajOVBeqK9WN6k71oHpSvajeVB+qL9WPSqASqf7UAGogNYgaTA2hhlLDqOHUCGokNYoaTY2hxlLjqPHUBGoiNYmaTE2hplIYhVMERVIURVMMxVIcxVMCJVISJVMKpVIapVMGZVIWZVMO5VIe5VMBBaiQghSiIipGTaOmUzOomdQsajY1h5pLzaPmUwuohdQiajG1hFpKLaOWUyuoldQqajW1hlpLraPWUxuojdQmajO1hdpKbaO2UzuondQuaje1h9pL7aP2Uweog9Qh6jB1hDpKHaOOUyeok9Qp6jR1hjpLnaPOUxeoi9Ql6jJ1hbpKXaOuUzeom9Qt6jZ1h7pL3aPuUw+oh9Qj6jH1hHpKPaOeUy+ol9Qr6jX1hnpLvaPeUx+oj9Qn6jP1hfpKfaO+Uz+on9Qv6jf1h/pL/aPi6Hg6CZ2UTkYnp1PQKelUdGo6DZ2WTkenpzPQGelMdGY6C52VzkZnp3PQOelcdG46D52XzkfnpwvQBelCdGG6CF2ULkYXp0vQJelSdGm6DF2WLkeXpyvQFelKdGW6Cl2VrkZXp2vQNeladG26Dl2XrkfXpxvQDelGdGO6Cd2UbkY3p1vQLelWdGu6Dd2Wbke3pzvQHelOdGe6C92V7kZ3p3vQPeledG+6D92X7kcn0Il0f3oAPZAeRA+mh9BD6WH0cHoEPZIeRY+mx9Bj6XH0eHoCPZGeRE+mp9BTaYzGaYImaYqmaYZmaY7maYEWaYmWaYVWaY3WaYM2aYu2aYd2aY/26YAGdEhDGtERHaOn0dPpGfRMehY9m55Dz6Xn0fPpBfRCehG9mF5CL6WX0cvpFfRKehW9ml5Dr6XX0evpDfRGehO9md5Cb6W30dvpHfROehe9m95D76X30fvpA/RB+hB9mD5CH6WP0cfpE/RJ+hR9mj5Dn6XP0efpC/RF+hJ9mb5CX6Wv0dfpG/RN+hZ9m75D36Xv0ffpB/RD+hH9mH5CP6Wf0c/pF/RL+hX9mn5Dv6Xf0e/pD/RH+hP9mf5Cf6W/0d/pH/RP+hf9m/5D/6X/0XFMPJOEScokY5IzKZiUTComNZOGScukY9IzGZiMTCYmM5OFycpkY7IzOZicTC4mN5OHycvkY/IzBZiCTCGmMFOEKcoUY4ozJZiSTCmmNFOGKcuUY8ozFZiKTCWmMlOFqcpUY6ozNZiaTC2mNlOHqcvUY+ozDZiGTCOmMdOEaco0Y5ozLZiWTCumNdOGacu0Y9ozHZiOTCemM9OF6cp0Y7ozPZieTC+mN9OH6cv0YxKYRKY/M4AZyAxiBjNDmKHMMGY4M4IZyYxiRjNjmLHMOGY8M4GZyExiJjNTmKkMxuAMwZAMxdAMw7AMx/CMwIiMxMiMwqiMxuiMwZiMxdiMw7iMx/hMwAAmZCCDmIiJMdOY6cwMZiYzi5nNzGHmMvOY+cwCZiGziFnMLGGWMsuY5cwKZiWzilnNrGHWMuuY9cwGZiOzidnMbGG2MtuY7cwOZiezi9nN7GH2MvuY/cwB5iBziDnMHGGOMseY48wJ5iRzijnNnGHOMueY88wF5iJzibnMXGGuMteY68wN5iZzi7nN3GHuMveY+8wD5iHziHnMPGGeMs+Y58wL5iXzinnNvGHeMu+Y98wH5iPzifnMfGG+Mt+Y78wP
5ifzi/nN/GH+Mv+YODaeTcImZZOxydkUbEo2FZuaTcOmZdOx6dkMbEY2E5uZzcJmZbOx2dkcbE42F5ubzcPmZfOx+dkCbEG2EFuYLcIWZYuxxdkSbEm2FFuaLcOWZcux5dkKbEW2EluZrcJWZaux1dkabE22FlubrcPWZeux9dkGbEO2EduYbcI2ZZuxzdkWbEu2FduabcO2Zdux7dkObEe2E9uZ7cJ2Zbux3dkebE+2F9ub7cP2ZfuxCWwi258dwA5kB7GD2SHsUHYYO5wdwY5kR7Gj2THsWHYcO56dwE5kJ7GT2SnsVBZjcZZgSZZiaZZhWZZjeVZgRVZiZVZhVVZjddZgTdZibdZhXdZjfTZgARuykEVsxMbYaex0dgY7k53FzmbnsHPZeex8dgG7kF3ELmaXsEvZZexydgW7kuXjVrNr2LXsOnY9u4HdyG5iN7Nb2K3sNnY7u4Pdye5id7N72L3sPnY/e4A9yB5iD7NH2KPsMfY4e4I9yZ5iT7Nn2LPsOfY8e4G9yF5iL7NX2KvsNfY6e4O9yd5ib7N32LvsPfY++4B9yD5iH7NP2KfsM/Y5+4J9yb5iX7Nv2LfsO/Y9+4H9yH5iP7Nf2K/sN/Y7+4P9yf5if7N/2L/sPzaOi+eScEm5ZFxyLgWXkkvFpebScGm5dFx6LgOXkcvEZeaycFm5bFx2LgeXk8vF5ebycHm5fFx+rgBXkCvEFeaKcEW5YlxxrgRXkivFlebKcGW5clx5rgJXkavEVeaqcFW5alx1rgZXk6vF1ebqcHW5elx9rgHXkGvENeaacE25ZlxzrgXXkmvFtebacG25dlx7rgPXkevEdea6cF25blx3rgfXk+vF9eb6cH25flwCl8j15wZwA7lB3GBuCDeUG8YN50ZwI7lR3GhuDDeWG8eN5yZwE7lJ3GRuCjeVwzicIziSoziaYziW4zieEziRkziZUziV0zidMziTszibcziX8zifCzjAhRzkEBdxMW4aN52bwc3kZnGzuTncXG4eN59bwC3kFnGLuSXcUm4Zt5xbwa3kVnGruTXcWm4dt57bwG3kNnGbuS3cVm4bt53bwe3kdnG7uT3cXm4ft587wB3kDnGHuSPcUe4Yd5w7wZ3kTnGnuTPcWe4cd567wF3kLnGXuSvcVe4ad527wd3kbnG3uTvcXe4ed597wD3kHnGPuSfcU+4Z95x7wb3kXnGvuTfcW+4d9577wH3kPnGfuS/cV+4b9537wf3kfnG/uT/cX+4fF8fH80n4pHwyPjmfgk/Jp+JT82n4tHw6Pj2fgc/IZ+Iz81n4rHw2Pjufg8/J5+Jz83n4vHw+Pj9fgC/IF+IL80X4onwxvjhfgi/Jl+JL82X4snw5vjxfga/IV+Ir81X4qnw1vjpfg6/J1+Jr83X4unw9vj7fgG/IN+Ib8034pnwzvjnfgm/Jt+Jb8234tnw7vj3fge/Id+I78134rnw3vjvfg+/J9+J78334vnw/PoFP5PvzA/iB/CB+MD+EH8oP44fzI/iR/Ch+ND+GH8uP48fzE/iJ/CR+Mj+Fn8pjPM4TPMlTPM0zPMtzPM8LvMhLvMwrvMprvM4bvMlbvM07vMt7vM8HPOBDHvKIj/gYP42fzs/gZ/Kz+Nn8HH4uP4+fzy/gF/KL+MX8En4pv4xfzq/gV/Kr+NX8Gn4tv45fz2/gN/Kb+M38Fn4rv43fzu/gd/K7+N38Hn4vv4/fzx/gD/KH+MP8Ef4of4w/zp/gT/Kn+NP8Gf4sf44/z1/gL/KX+Mv8Ff4qf42/zt/gb/K3+Nv8Hf4uf4+/zz/gH/KP+Mf8E/4p/4x/zr/gX/Kv+Nf8G/4t/45/z3/gP/Kf+M/8F/4r/43/zv/gf/K/+N/8H/4v/4+PE+KFJEJSIZmQXEghpBRSCamFNEJaIZ2QXsggZBQyCZmFLEJWIZuQXcgh5BRyCbmFPEJeIZ+QXyggFBQKCYWFIkJRoZhQXCghlBRKCaWFMkJZoZxQXqgg
VBQqCZWFKkJVoZpQXagh1BRqCbWFOkJdoZ5QX2ggNBQaCY2FJkJToZnQXGghtBRaCa2FNkJboZ3QXuggdBQ6CZ2FLkJXoZvQXegh9BR6Cb2FPkJfoZ+QICQK/YUBwkBhkDBYGCIMFYYJw4URwkhhlDBaGCOMFcYJ44UJwkRhkjBZmCJMFTABFwiBFCiBFhiBFTiBFwRBFCRBFhRBFTRBFwzBFCzBFhzBFTzBFwIBCKEABSREQkyYJkwXZggzhVnCbGGOMFeYJ8wXFggLhUXCYmGJsFRYJiwXVggrhVXCamGNsFZYJ6wXNggbhU3CZmGLsFXYJmwXdgg7hV3CbmGPsFfYJ+wXDggHhUPCYeGIcFQ4JhwXTggnhVPCaeGMcFY4J5wXLggXhUvCZeGKcFW4JlwXbgg3hVvCbeGOcFe4J9wXHggPhUfCY+GJ8FR4JjwXXggvhVfCa+GN8FZ4J7wXPggfhU/CZ+GL8FX4JnwXfgg/hV/Cb+GP8Ff4J8SJ8WISMamYTEwuphBTiqnE1GIaMa2YTkwvZhAzipnEzGIWMauYTcwu5hBzirnE3GIeMa+YT8wvFhALioXEwmIRsahYTCwulhBLiqXE0mIZsaxYTiwvVhAripXEymIVsapYTawu1hBrirXE2mIdsa5YT6wvNhAbio3ExmITsanYTGwuthBbiq3E1mIbsa3YTmwvdhA7ip3EzmIXsavYTewu9hB7ir3E3mIfsa/YT0wQE8X+4gBxoDhIHCwOEYeKw8Th4ghxpDhKHC2OEceK48Tx4gRxojhJnCxOEaeKmIiLhEiKlEiLjMiKnMiLgiiKkiiLiqiKmqiLhmiKlmiLjuiKnuiLgQjEUIQiEiMxJk4Tp4szxJniLHG2OEecK84T54sLxIXiInGxuERcKi4Tl4srxJXiKnG1uEZcK64T14sbxI3iJnGzuEXcKm4Tt4s7xJ3iLnG3uEfcK+4T94sHxIPiIfGweEQ8Kh4Tj4snxJPiKfG0eEY8K54Tz4sXxIviJfGyeEW8Kl4Tr4s3xJviLfG2eEe8K94T74sPxIfiI/Gx+ER8Kj4Tn4svxJfiK/G1+EZ8K74T34sfxI/iJ/Gz+EX8Kn4Tv4s/xJ/iL/G3+Ef8K/4T46R4KYmUVEomJZdSSCmlVFJqKY2UVkonpZcySBmlTFJmKYuUVcomZZdySDmlXFJuKY+UV8on5ZcKSAWlQlJhqYhUVComFZdKSCWlUlJpqYxUVionlZcqSBWlSlJlqYpUVaomVZdqSDWlWlJtqY5UV6on1ZcaSA2lRlJjqYnUVGomNZdaSC2lVlJrqY3UVmontZc6SB2lTlJnqYvUVeomdZd6SD2lXlJvqY/UV+onJUiJUn9pgDRQGiQNloZIQ6Vh0nBphDRSGiWNlsZIY6Vx0nhpgjRRmiRNlqZIUyVMwiVCIiVKoiVGYiVO4iVBEiVJkiVFUiVN0iVDMiVLsiVHciVP8qVAAlIoQQlJkRSTpknTpRnSTGmWNFuaI82V5knzpQXSQmmRtFhaIi2VlknLpRXSSmmVtFpaI62V1knrpQ3SRmmTtFnaIm2VtknbpR3STmmXtFvaI+2V9kn7pQPSQemQdFg6Ih2VjknHpRPSSemUdFo6I52VzknnpQvSRemSdFm6Il2VrknXpRvSTemWdFu6I92V7kn3pQfSQ+mR9Fh6Ij2VnknPpRfSS+mV9Fp6I72V3knvpQ/SR+mT9Fn6In2VvknfpR/ST+mX9Fv6I/2V/klxcrycRE4qJ5OTyynklHIqObWcRk4rp5PTyxnkjHImObOcRc4qZ5OzyznknHIuObecR84r55PzywXkgnIhubBcRC4qF5OLyyXkknIpubRcRi4rl5PLyxXkinIlubJcRa4qV5OryzXkmnItubZcR64r15Pryw3khnIjubHcRG4qN5Obyy3klnIrubXcRm4rt5Pbyx3kjnInubPcRe4qd5O7yz3knnIvubfcR+4r95MT5ES5vzxAHigPkgfL
Q+Sh8jB5uDxCHimPkkfLY+Sx8jh5vDxBnihPkifLU+SpMibjMiGTMiXTMiOzMifzsiCLsiTLsiKrsibrsiGbsiXbsiO7sif7ciADOZShjORIjsnT5OnyDHmmPEueLc+R58rz5PnyAnmhvEheLC+Rl8rL5OXyCnmlvEpeLa+R18rr5PXyBnmjvEneLG+Rt8rb5O3yDnmnvEveLe+R98r75P3yAfmgfEg+LB+Rj8rH5OPyCfmkfEo+LZ+Rz8rn5PPyBfmifEm+LF+Rr8rX5OvyDfmmfEu+Ld+R78r35PvyA/mh/Eh+LD+Rn8rP5OfyC/ml/Ep+Lb+R38rv5PfyB/mj/En+LH+Rv8rf5O/yD/mn/Ev+Lf+R/8r/5DglXkmiJFWSKcmVFEpKJZWSWkmjpFXSKemVDEpGJZOSWcmiZFWyKdmVHEpOJZeSW8mj5FXyKfmVAkpBpZBSWCmiFFWKKcWVEkpJpZRSWimjlFXKKeWVCkpFpZJSWamiVFWqKdWVGkpNpZZSW6mj1FXqKfWVBkpDpZHSWGmiNFWaKc2VFkpLpZXSWmmjtFXaKe2VDkpHpZPSWemidFW6Kd2VHkpPpZfSW+mj9FX6KQlKotJfGaAMVAYpg5UhylBlmDJcGaGMVEYpo5UxylhlnDJemaBMVCYpk5UpylQFU3CFUEiFUmiFUViFU3hFUERFUmRFUVRFU3TFUEzFUmzFUVzFU3wlUIASKlBBSqTElGnKdGWGMlOZpcxW5ihzlXnKfGWBslBZpCxWlihLlWXKcmWFslJZpaxW1ihrlXXKemWDslHZpGxWtihblW3KdmWHslPZpexW9ih7lX3KfuWAclA5pBxWjihHlWPKceWEclI5pZxWzihnlXPKeeWCclG5pFxWrihXlWvKdeWGclO5pdxW7ih3lXvKfeWB8lB5pDxWnihPlWfKc+WF8lJ5pbxW3ihvlXfKe+WD8lH5pHxWvihflW/Kd+WH8lP5pfxW/ih/lX9KnBqvJlGTqsnU5GoKNaWaSk2tplHTqunU9GoGNaOaSc2sZlGzqtnU7GoONaeaS82t5lHzqvnU/GoBtaBaSC2sFlGLqsXU4moJtaRaSi2tllHLquXU8moFtaJaSa2sVlGrqtXU6moNtaZaS62t1lHrqvXU+moDtaHaSG2sNlGbqs3U5moLtaXaSm2ttlHbqu3U9moHtaPaSe2sdlG7qt3U7moPtafaS+2t9lH7qv3UBDVR7a8OUAeqg9TB6hB1qDpMHa6OUEeqo9TR6hh1rDpOHa9OUCeqk9TJ6hR1qoqpuEqopEqptMqorMqpvCqooiqpsqqoqqqpumqopmqptuqoruqpvhqoQA1VqCI1UmPqNHW6OkOdqc5SZ6tz1LnqPHW+ukBdqC5SF6tL1KXqMnW5ukJdqa5SV6tr1LXqOnW9ukHdqG5SN6tb1K3qNnW7ukPdqe5Sd6t71L3qPnW/ekA9qB5SD6tH1KPqMfW4ekI9qZ5ST6tn1LPqOfW8ekG9qF5SL6tX1KvqNfW6ekO9qd5Sb6t31LvqPfW++kB9qD5SH6tP1KfqM/W5+kJ9qb5SX6tv1LfqO/W9+kH9qH5SP6tf1K/qN/W7+kP9qf5Sf6t/1L/qPzVOi9eSaEm1ZFpyLYWWUkulpdbSaGm1dFp6LYOWUcukZdayaFm1bFp2LYeWU8ul5dbyaHm1fFp+rYBWUCukFdaKaEW1YlpxrYRWUiulldbKaGW1clp5rYJWUaukVdaqaFW1alp1rYZWU6ul1dbqaHW1elp9rYHWUGukNdaaaE21ZlpzrYXWUmultdbaaG21dlp7rYPWUeukdda6aF21blp3rYfWU+ul9db6aH21flqClqj11wZoA7VB2mBtiDZUG6YN10ZoI7VR2mhtjDZWG6eN1yZoE7VJ2mRtijZVwzRcIzRSozRaYzRW4zReEzRRkzRZUzRV0zRdMzRTszRbczRX8zRfCzSghRrUkBZpMW2aNl2boc3UZmmztTnaXG2e
Nl9boC3UFmmLtSXaUm2Ztlxboa3UVmmrtTXaWm2dtl7boG3UNmmbtS3aVm2btl3boe3Udmm7tT3aXm2ftl87oB3UDmmHtSPaUe2Ydlw7oZ3UTmmntTPaWe2cdl67oF3ULmmXtSvaVe2adl27od3Ubmm3tTvaXe2edl97oD3UHmmPtSfaU+2Z9lx7ob3UXmmvtTfaW+2d9l77oH3UPmmftS/aV+2b9l37of3Ufmm/tT/aX+2fFqfH60n0pHoyPbmeQk+pp9JT62n0tHo6Pb2eQc+oZ9Iz61n0rHo2PbueQ8+p59Jz63n0vHo+Pb9eQC+oF9IL60X0onoxvbheQi+pl9JL62X0sno5vbxeQa+oV9Ir61X0qno1vbpeQ6+p19Jr63X0uno9vb7eQG+oN9Ib6030pnozvbneQm+pt9Jb6230tno7vb3eQe+od9I76130rno3vbveQ++p99J76330vno/PUFP1PvrA/SB+iB9sD5EH6oP04frI/SR+ih9tD5GH6uP08frE/SJ+iR9sj5Fn6pjOq4TOqlTOq0zOqtzOq8LuqhLuqwruqpruq4buqlbuq07uqt7uq8HOtBDHepIj/SYPk2frs/QZ+qz9Nn6HH2uPk+fry/QF+qL9MX6En2pvkxfrq/QV+qr9NX6Gn2tvk5fr2/QN+qb9M36Fn2rvk3fru/Qd+q79N36Hn2vvk/frx/QD+qH9MP6Ef2ofkw/rp/QT+qn9NP6Gf2sfk4/r1/QL+qX9Mv6Ff2qfk2/rt/Qb+q39Nv6Hf2ufk+/rz/QH+qP9Mf6E/2p/kx/rr/QX+qv9Nf6G/2t/k5/r3/QP+qf9M/6F/2r/k3/rv/Qf+q/9N/6H/2v/k+PM+KNJEZSI5mR3EhhpDRSGamNNEZaI52R3shgZDQyGZmNLEZWI5uR3chh5DRyGbmNPEZeI5+R3yhgFDQKGYWNIkZRo5hR3ChhlDRKGaWNMkZZo5xR3qhgVDQqGZWNKkZVo5pR3ahh1DRqGbWNOkZdo55R32hgNDQaGY2NJkZTo5nR3GhhtDRaGa2NNkZbo53R3uhgdDQ6GZ2NLkZXo5vR3ehh9DR6Gb2NPkZfo5+RYCQa/Y0BxkBjkDHYGGIMNYYZw40RxkhjlDHaGGOMNcYZ440JxkRjkjHZmGJMNTADNwiDNCiDNhiDNTiDNwRDNCRDNhRDNTRDNwzDNCzDNhzDNTzDNwIDGKEBDWRERsyYZkw3ZhgzjVnGbGOOMdeYZ8w3FhgLjUXGYmOJsdRYZiw3VhgrjVXGamONsdZYZ6w3NhgbjU3GZmOLsdXYZmw3dhg7jV3GbmOPsdfYZ+w3DhgHjUPGYeOIcdQ4Zhw3ThgnjVPGaeOMcdY4Z5w3LhgXjUvGZeOKcdW4Zlw3bhg3jVvGbeOOcde4Z9w3HhgPjUfGY+OJ8dR4Zjw3XhgvjVfGa+ON8dZ4Z7w3PhgfjU/GZ+OL8dX4Znw3fhg/jV/Gb+OP8df4Z8SZ8WYSM6mZzExupjBTmqnM1GYaM62ZzkxvZjAzmpnMzGYWM6uZzcxu5jBzmrnM3GYeM6+Zz8xvFjALmoXMwmYRs6hZzCxuljBLmqXM0mYZs6xZzixvVjArmpXMymYVs6pZzaxu1jBrmrXM2mYds65Zz6xvNjAbmo3MxmYTs6nZzGxutjBbmq3M1mYbs63ZzmxvdjA7mp3MzmYXs6vZzexu9jB7mr3M3mYfs6/Zz0wwE83+5gBzoDnIHGwOMYeaw8zh5ghzpDnKHG2OMcea48zx5gRzojnJnGxOMaeamImbhEmalEmbjMmanMmbgimakimbiqmamqmbhmmalmmbjumanumbgQnM0IQmMiMzZk4zp5szzJnmLHO2Oceca84z55sLzIXmInOxucRcai4zl5srzJXmKnO1ucZca64z15sbzI3mJnOzucXcam4zt5s7zJ3mLnO3ucfca+4z95sHzIPmIfOwecQ8ah4zj5snzJPmKfO0ecY8a54zz5sXzIvmJfOy
ecW8al4zr5s3zJvmLfO2ece8a94z75sPzIfmI/Ox+cR8aj4zn5svzJfmK/O1+cZ8a74z35sfzI/mJ/Oz+cX8an4zv5s/zJ/mL/O3+cf8a/4z46x4K4mV1EpmJbdSWCmtVFZqK42V1kpnpbcyWBmtTFZmK4uV1cpmZbdyWDmtXFZuK4+V18pn5bcKWAWtQlZhq4hV1CpmFbdKWCWtUlZpq4xV1ipnlbcqWBWtSlZlq4pV1apmVbdqWDWtWlZtq45V16pn1bcaWA2tRlZjq4nV1GpmNbdaWC2tVlZrq43V1mpntbc6WB2tTlZnq4vV1epmdbd6WD2tXlZvq4/V1+pnJViJVn9rgDXQGmQNtoZYQ61h1nBrhDXSGmWNtsZYY61x1nhrgjXRmmRNtqZYUy3Mwi3CIi3Koi3GYi3O4i3BEi3Jki3FUi3N0i3DMi3Lsi3Hci3P8q3AAlZoQQtZkRWzplnTrRnWTGuWNduaY8215lnzrQXWQmuRtdhaYi21llnLrRXWSmuVtdpaY6211lnrrQ3WRmuTtdnaYm21tlnbrR3WTmuXtdvaY+219ln7rQPWQeuQddg6Yh21jlnHrRPWSeuUddo6Y521zlnnrQvWReuSddm6Yl21rlnXrRvWTeuWddu6Y9217ln3rQfWQ+uR9dh6Yj21nlnPrRfWS+uV9dp6Y7213lnvrQ/WR+uT9dn6Yn21vlnfrR/WT+uX9dv6Y/21/llxdrydxE5qJ7OT2ynslHYqO7Wdxk5rp7PT2xnsjHYmO7Odxc5qZ7Oz2znsnHYuO7edx85r57Pz2wXsgnYhu7BdxC5qF7OL2yXsknYpu7Rdxi5rl7PL2xXsinYlu7Jdxa5qV7Or2zXsmnYtu7Zdx65r17Pr2w3shnYju7HdxG5qN7Ob2y3slnYru7Xdxm5rt7Pb2x3sjnYnu7Pdxe5qd7O72z3snnYvu7fdx+5r97MT7ES7vz3AHmgPsgfbQ+yh9jB7uD3CHmmPskfbY+yx9jh7vD3BnmhPsifbU+ypNmbjNmGTNmXTNmOzNmfztmCLtmTLtmKrtmbrtmGbtmXbtmO7tmf7dmADO7ShjezIjtnT7On2DHumPcuebc+x59rz7Pn2AnuhvchebC+xl9rL7OX2Cnulvcpeba+x19rr7PX2BnujvcnebG+xt9rb7O32Dnunvcvebe+x99r77P32Afugfcg+bB+xj9rH7OP2Cfukfco+bZ+xz9rn7PP2Bfuifcm+bF+xr9rX7Ov2Dfumfcu+bd+x79r37Pv2A/uh/ch+bD+xn9rP7Of2C/ul/cp+bb+x39rv7Pf2B/uj/cn+bH+xv9rf7O/2D/un/cv+bf+x/9r/7Dgn3kniJHWSOcmdFE5KJ5WT2knjpHXSOemdDE5GJ5OT2cniZHWyOdmdHE5OJ5eT28nj5HXyOfmdAk5Bp5BT2CniFHWKOcWdEk5Jp5RT2injlHXKOeWdCk5Fp5JT2aniVHWqOdWdGk5Np5ZT26nj1HXqOfWdBk5Dp5HT2GniNHWaOc2dFk5Lp5XT2mnjtHXaOe2dDk5Hp5PT2enidHW6Od2dHk5Pp5fT2+nj9HX6OQlOotPfGeAMdAY5g50hzlBnmDPcGeGMdEY5o50xzlhnnDPemeBMdCY5k50pzlQHc3CHcEiHcmiHcbL9X4YTHcmRHcVRHc3RHcMxHcuxHcdxHc/xncABTuhABzmRE3OmOdOdGc5MZ5Yz25njzHXmOfOdBc5CZ5Gz2FniLHWWOcudFc5KZ5Wz2lnjrHXWOeudDc5GZ5Oz2dnibHW2OdudHc5OZ5ez29nj7HX2OfudA85B55Bz2DniHHWOOcedE85J55Rz2jnjnHXOOeedC85F55Jz2bniXHWuOdedG85N55Zz27nj3HXuOfedB85D55Hz2HniPHWeOc+dF85L55Xz2nnjvHXeOe+dD85H55Pz2fnifHW+Od+dH85P55fz2/nj/HX+OXFuvJvETeomc5O7KdyUbio3tZvG
Teumc9O7GdyMbiY3s5vFzepmc7O7Odycbi43t5vHzevmc/O7BdyCbiG3sFvELeoWc4u7JdySbim3tFvGLeuWc8u7FdyKbiW3slvFrepWc6u7Ndyabi23tlvHrevWc+u7DdyGbiO3sdvEbeo2c5u7LdyWbiu3tdvGbeu2c9u7HdyObie3s9vF7ep2c7u7Pdyebi+3t9vH7ev2cxPcRLe/O8Ad6A5yB7tD3KHuMHe4O8Id6Y5yR7tj3LHuOHe8O8Gd6E5yJ7tT3Kku5uIu4ZIu5dIu47Iu5/Ku4Iqu5Mqu4qqu5uqu4Zqu5dqu47qu5/pu4AI3dKGL3MiNudPc6e4Md6Y7y53tznHnuvPc+e4Cd6G7yF3sLnGXusvc5e4Kd6W7yl3trnHXuuvc9e4Gd6O7yd3sbnG3utvc7e4Od6e7y93t7nH3uvvc/e4B96B7yD3sHnGPusfc4+4J96R7yj3tnnHPuufc8+4F96J7yb3sXnGvutfc6+4N96Z7y73t3nHvuvfc++4D96H7yH3sPnGfus/c5+4L96X7yn3tvnHfuu/c9+4H96P7yf3sfnG/ut/c7+4P96f7y/3t/nH/uv/cOC/eS+Il9ZJ5yb0UXkovlZfaS+Ol9dJ56b0MXkYvk5fZy+Jl9bJ52b0cXk4vl5fby+Pl9fJ5+b0CXkGvkFfYK+IV9Yp5xb0SXkmvlFfaK+OV9cp55b0KXkWvklfZq+JV9ap51b0aXk2vllfbq+PV9ep59b0GXkOvkdfYa+I19Zp5zb0WXkuvldfaa+O19dp57b0OXkevk9fZ6+J19bp53b0eXk+vl9fb6+P19fp5CV6i198b4A30BnmDvSHeUG+YN9wb4Y30RnmjvTHeWG+cN96L+89/9KZ6mId7hEd6lEd7jMd6nMd7gid6kid7iqd6mqd7hmd6lmd7jud6nud7gQe80IMe8iIv5k3zpnszvJneLG+2N8eb683z5nsLvIXeIm+xt8Rb6i3zlnsrvJXeKm+1t8Zb663z1nsbvI3eJm+zt8Xb6m3ztns7vJ3eLm+3t8fb6+3z9nsHvIPeIe+wd8Q76h3zjnsnvJPeKe+0d8Y7653zznsXvIveJe+yd8W76l3zrns3vJveLe+2d8e7693z7nsPvIfeI++x98R76j3znnsvvJfeK++198Z7673z3nsfvI/eJ++z98X76n3zvns/vJ/eL++398f76/3z4vx4P4mf1E/mJ/dT+Cn9VH5qP42f1k/np/cz+Bn9TH5mP4uf1c/mZ/dz+Dn9XH5uP4+f18/n5/cL+AX9Qn5hv4hf1C/mF/dL+CX9Un5pv4xf1i/nl/cr+BX9Sn5lv4pf1a/mV/dr+DX9Wn5tv45f16/n1/cb+A39Rn5jv4nf1G/mN/db+C39Vn5rv43f1m/nt/c7+B39Tn5nv4vf1e/md/d7+D39Xn5vv4/f1+/nJ/iJfn9/gD/QH+QP9of4Q/1h/nB/hD/SH+WP9sf4Y/1x/nh/gj/Rn+RP9qf4U33Mx33CJ33Kp33GZ33O533BF33Jl33FV33N133DN33Lt33Hd33P9/3AB37oQx/5kR/zp/nT/Rn+TH+WP9uf48/15/nz/QX+Qn+Rv9hf4i/1l/nL/RX+Sn+Vv9pf46/11/nr/Q3+Rn+Tv9nf4m/1t/nb/R3+Tn+Xv9vf4+/19/n7/QP+Qf+Qf9g/4h/1j/nH/RP+Sf+Uf9o/45/1z/nn/Qv+Rf+Sf9m/4l/1r/nX/Rv+Tf+Wf9u/49/17/n3/Qf+Q/+R/9h/4j/1n/nP/Rf+S/+V/9p/47/13/nv/Q/+R/+T/9n/4n/1v/nf/R/+T/+X/9v/4//1//lxQXyQJEgaJAuSBymClEGqIHWQJkgbpAvSBxmCjEGmIHOQJcgaZAuyBzmCnEGuIHeQJ8gb5AvyBwWCgkGhoHBQJCgaFAuKByWCkkGpoHRQJigblAvKBxWCikGloHJQJagaVAuqBzWCmkGtoHZQJ6gb1AvqBw2ChkGjoHHQ
JGgaNAuaBy2ClkGroHXQJmgbtAvaBx2CjkGnoHPQJegadAu6Bz2CnkGvoHfQJ+gb9AsSgsSgfzAgGBgMCgYHQ4KhwbBgeDAiGBmMCkYHY4KxwbhgfDAhmBhMCiYHU4KpARbgARGQARXQAROwARfwgRCIgRTIgRKogRbogRGYgRXYgRO4gRf4QRCAIAxggIIoiAXTgunBjGBmMCuYHcwJ5gbzgvnBgmBhsChYHCwJlgbLguXBimBlsCpYHawJ1gbrgvXBhmBjsCnYHGwJtgbbgu3BjmBnsCvYHewJ9gb7gv3BgeBgcCg4HBwJjgbHguPBieBkcCo4HZwJzgbngvPBheBicCm4HFwJrgbXguvBjeBmcCu4HdwJ7gb3gvvBg+Bh8Ch4HDwJngbPgufBi+Bl8Cp4HbwJ3gbvgvfBh+Bj8Cn4HHwJvgbfgu/Bj+Bn8Cv4HfwJ/gb/gjgQD5KApCAZSA5SgJQgFUgN0oC0IB1IDzKAjCATyAyygKwgG8gOcoCcIBfIDfKAvCAfyA8KgIKgECgMioCioBgoDkqAkqAUKA3KgLKgHCgPKoCKoBKoDKqAqqAaqA5qgJqgFqgN6oC6oB6oDxqAhqARaAyagKagGWgOWoCWoBVoDdqAtqAdaA86gI6gE+gMuoCuoBvoDnqAnqAX6A36gL6gH0gAiaA/GAAGgkFgMBgChoJhYDgYAUaCUWA0GAPGgnFgPJgAJoJJYDKYAqYCDOCAACSgAA0YwAIO8EAAIpCADBSgAg3owAAmsIANHOACD/ggAACEAAIEIhAD08B0MAPMBLPAbDAHzAXzwHywACwEi8BisAQsBcvAcrACrASrwGqwBqwF68B6sAFsBJvAZrAFbAXbwHawA+wEu8BusAfsBfvAfnAAHASHwGFwBBwFx8BxcAKcBKfAaXAGnAXnwHlwAVwEl8BlcAVcBdfAdXAD3AS3wG1wB9wF98B98AA8BI/AY/AEPAXPwHPwArwEr8Br8Aa8Be/Ae/ABfASfwGfwBXwF38B38AP8BL/Ab/AH/AX/QFwYHyYJk4bJwuRhijBlmCpMHaYJ04bpwvRhhjBjmCnMHGYJs4bZwuxhjjBnmCvMHeYJ84b5wvxhgbBgWCgsHBYJi4bFwuJhibBkWCosHZYJy4blwvJhhbBiWCmsHFYJq4bVwuphjbBmWCusHdYJ64b1wvphg7Bh2ChsHDYJm4bNwuZhi7Bl2CpsHbYJ24btwvZhh7Bj2CnsHHYJu4bdwu5hj7Bn2CvsHfYJ+4b9woQwMewfDggHhoPCweGQcGg4LBwejghHhqPC0eGYcGw4LhwfTggnhpPCyeGUcGqIhXhIhGRIhXTIhGzIhXwohGIohXKohGqohXpohGZohXbohG7ohX4YhCAMQxiiMApj4bRwejgjnBnOCmeHc8K54bxwfrggXBguCheHS8Kl4bIwPi4ubmW4KlwdrgnXhuvC9eGGcGO4Kdwcbgm3htvC7eGOcGe4K9wd7gn3hvvC/eGB8GB4KDwcHgmPhsfC4+GJ8GR4KjwdngnPhufC8+GF8GJ4KbwcXgmvhtfC6+GN8GZ4K7wd3gnvhvfC++GD8GH4KHwcPgmfhs/C5+GL8GX4Knwdvgnfhu/C9+GH8GP4Kfwcfgm/ht/C7+GP8Gf4K/wd/gn/hv/COBgPk8CkMBlMDlPAlDAVTA3TwLQwHUwPM8CMMBPMDLPArDAbzA5zwJwwF8wN88C8MB/MDwvAgrAQLAyLwKKwGCwOS8CSsBQsDcvAsrAcLA8rwIqwEqwMq8CqsBqsDmvAmrAWrA3rwLqwHqwPG8CGsBFsDJvAprAZbA5bwJawFWwN28C2sB1sDzvAjrAT7Ay7wK6wG+wOe8CesBfsDfvAvrAfTICJsD8cAAfCQXAwHAKHwmFwOBwBR8JRcDQcA8fCcXA8nAAnwklwMpwCp0IM4pCAJKQgDRnIQg7yUIAilKAMFahCDerQgCa0oA0d6EIP+jCAAIYQQgQjGIPT4HQ4
A86Es+BsOAfOhfPgfLgALoSL4GK4BC6Fy+ByuAKuhKvgargGroXr4Hq4AW6Em+BmuAVuhdvgdrgD7oS74G64B+6F++B+eAAehIfgYXgEHoXH4HF4Ap6Ep+BpeAaehefgeXgBXoSX4GV4BV6F1+B1eAPehLfgbXgH3oX34H34AD6Ej+Bj+AQ+hc/gc/gCvoSv4Gv4Br6F7+B7+AF+hJ/gZ/gFfoXf4Hf4A/6Ev+Bv+Af+hf9gHIpHSVBSlAwlRylQSpQKpUZpUFqUDqVHGVBGlAllRllQVpQNZUc5UE6UC+VGeVBelA/lRwVQQVQIFUZFUFFUDBVHJVBJVAqVRmVQWVQOlUcVUEVUCVVGVVBVVA1VRzVQTVQL1UZ1UF1UD9VHDVBD1Ag1Rk1QU9QMNUctUEvUCrVGbVBb1A61Rx1QR9QJdUZdUFfUDXVHPVBP1Av1Rn1QX9QPJaBE1B8NQAPRIDQYDUFD0TA0HI1AI9EoNBqNQWPRODQeTUAT0SQ0GU1BUxGGcEQgElGIRgxiEYd4JCARSUhGClKRhnRkIBNZyEYOcpGHfBQggEIEEUIRiqFpaDqagWaiWWg2moPmonloPlqAFqJFaDFagpaiZWg5WoFWolVoNVqD1qJ1aD3agDaiTWgz2oK2om1oO9qBdqJdaDfag/aifWg/OoAOokPoMDqCjqJj6Dg6gU6iU+g0OoPOonPoPLqALqJL6DK6gq6ia+g6uoFuolvoNrqD7qJ76D56gB6iR+gxeoKeomfoOXqBXqJX6DV6g96id+g9+oA+ok/oM/qCvqJv6Dv6gX6iX+g3+oP+on8oLoqPkkRJo2RR8ihFlDJKFaWO0kRpo3RR+ihDlDHKFGWOskRZo2xR9ihHlDPKFeWO8kR5o3xR/qhAVDAqFBWOikRFo2JR8ahEVDIqFZWOykRlo3JR+ahCVDGqFFWOqkRVo2pR9ahGVDOqFdWO6kR1o3pR/ahB1DBqFDWOmkRNo2ZR86hF1DJqFbWO2kRto3ZR+6hD1DHqFHWOukRdo25R96hH1DPqFfWO+kR9o35RQpQY9Y8GRAOjQdHgaEg0NBoWDY9GRCOjUdHoaEw0NhoXjY8mRBOjSdHkaEo0NcIiPCIiMqIiOmIiNuIiPhIiMV6K5EiJ1EiL9MiIzMiK7MiJ3MiL/CiIQBRGMEJRFMWiadH0aEY0M5oVzY7mRHOjedH8aEG0MFoULY6WREujZdHyaEW0MloVrY7WRGujddH6aEO0MdoUbY62RFujbdH2aEe0M9oV7Y72RHujfdH+6EB0MDoUHY6OREejY9Hx6ER0MjoVnY7ORGejc9H56EJ0MboUXY6uRFeja9H16EZ0M7oV3Y7uRHeje9H96EH0MHoUPY6eRE+jZ9Hz6EX0MnoVvY7eRG+jd9H76EP0MfoUfY6+RF+jb9H36Ef0M/oV/Y7+RH+jf1FcLD6WJJY0liyWPJYiljKWKpY6liaWNpYulj6WIZYxlimWOZYlljWWLZY9liOWM5YrljuWJ5Y3li+WP1YgVjBWKFY4ViRWNFYsVjxWIlYyVipWOlYmVjZWLlY+ViFWMVYpVjlWJVY1Vi1WPVYjVjNWK1Y7VidWN1YvVj/WINYw1ijWONYk1jTWLNY81iLWMtYq1jrWJtY21i7WPtYh1jHWKdY51iXWNdYt1j3WI9Yz1ivWO9Yn1jfWL5YQS4z9j4V7XMyrSQAAnNq2bdv4ats2U9tKU7s9to2ZOW9t27ZtM/tnb+QZjA3BhmLDsOHYCGwkNgobjY3BxmLjsPFYPDYBm4hNwiZjU7Cp2DRsOjYDm4nNwmZjc7C52DxsPrYAW4gtwhZjS7Cl2DIsAVuOJWIrsJXYKmw1tgZbi63D1mMbsI3YJmwztgXbim3DMAzHCIzEKIzGGIzFOIzHBEzEJEzGFEzFNEzHDMzELMzGHMzFPMzHAizEAAYxhEVYDNuO7cB2Yruw3dgebC+2D9uPHcAOYoeww9gR7Ch2DDuOncBOYqew
09gZ7Cx2DjuPXcAuYpewy9gV7Cp2DbuO3cBuYrew29gd7C52D7uPPcAeYo+wx9gT7Cn2DHuOvcBeYq+w19gb7C32DnuPfcA+Yp+wz9gX7Cv2DfuO/cB+Yr+w39gf7C/2D0vC4vBkeHI8BZ4ST4WnxtPgafF0eHo8A54Rz4RnxrPgWfFseHY8B54Tz4XnxvPgefF8eH68AF4QL4QXxovgRfFieHG8BF4SL4WXxsvgZfFyeHm8Al4Rr4RXxqvgVfFqeHW8Bl4Tr4XXxuvgdfF6eH28Ad4Qb4Q3xpvgTfH/8GZ4c7wF3hJvhbfG2+Bt8XZ4e7wD3hHvhHfGu+Bd8W54d7wH3hPvhffG++B98X54f3wAPhAfhA/Gh+BD8WH4cHwEPhIfhY/Gx+Bj8XH4eDwen4BPxCfhk/Ep+FR8Gj4dn4HPxGfhs/E5+Fx8Hj4fX4AvxBfhi/El+FJ8GZ6AL8cT8RX4SnwVvhpfg6/F1+Hr8Q34RnwTvhnfgm/Ft+EYjuMETuIUTuMMzuIczuMCLuISLuMKruIaruMGbuIWbuMO7uIe7uMBHuIAhzjCIzyGb8d34DvxXfhufA++F9+H78cP4AfxQ/hh/Ah+FD+GH8dP4CfxU/hp/Ax+Fj+Hn8cv4BfxS/hl/Ap+Fb+GX8dv4DfxW/ht/A5+F7+H38cf4A/xR/hj/An+FH+GP8df4C/xV/hr/A3+Fn+Hv8c/4B/xT/hn/Av+Ff+Gf8d/4D/xX/hv/A/+F/+HJ+FxRDIiOZGCSEmkIlITaYi0RDoiPZGByEhkIjITWYisRDYiO5GDyEnkInITeYi8RD4iP1GAKEgUIgoTRYiiRDGiOFGCKEmUIkoTZYiyRDmiPFGBqEhUIioTVYiqRDWiOlGDqEnUImoTdYi6RD2iPtGAaEg0IhoTTYimxH9EM6I50YJoSbQiWhNtiLZEO6I90YHoSHQiOhNdiK5EN6I70YPoSfQiehN9iL5EP6I/MYAYSAwiBhNDiKHEMGI4MYIYSYwiRhNjiLHEOGI8EU9MICYSk4jJxBRiKjGNmE7MIGYSs4jZRFLSXGIeMZ9YQCwkFhGLiSXEUmIZkUAsJxKJFcRKYhWxmlhDrCXWEeuJDcRGYhOxmdhCbCW2ERiBEwRBEhRBEwzBEhzBEwIhEhIhEwqhEhqhEwZhEhZhEw7hEh7hEwEREoCABCIiIkZsJ3YQO4ldxG5iD7GX2EfsJw4QB4lDxGHiCHGUOEYcJ04QJ4lTxGniDHGWOEecJy4QF4lLxGXiCnGVuEZcJ24QN4lbxG3iDnGXuEfcJx4QD4lHxGPiCfGUeEY8J14QL4lXxGviDfGWeEe8Jz4QH4lPxGfiC/GV+EZ8J34QP4lfxG/iD/GX+EckEXFkMjI5mYJMSaYiU5NpyLRkOjI9mYHMSGYiM5NZyKxkNjI7mYPMSeYic5N5yLxkPjI/WYAsSBYiC5NFyKJkMbI4WYIsSZYiS5NlyLJkObI8WYGsSFYiK5NVyKpkNbI6WYOsSdYia5N1yLpkPbI+2YBsSDYiG5NNyKbkf2QzsjnZgmxJtiJbk23ItmQ7sj3ZgexIdiI7k13IrmQ3sjvZg+xJ9iJ7k33IvmQ/sj85gBxIDiIHk0PIoeQwcjg5ghxJjiJHk2PIseQ4cjwZT04gJ5KTyMnkFHIqOY2cTs4gZ5KzyNnkHHIuOY+cTy4gF5KLyMXkEnIpuYxMIJeTieQKciW5ilxNriHXkuvI9eQGciO5idxMbiG3kttIjMRJgiRJiqRJhmRJjuRJgRRJiZRJhVRJjdRJgzRJi7RJh3RJj/TJgAxJQEISkREZI7eTO8id5C5yN7mH3EvuI/eTB8iD5CHyMHmEPEoeI4+TJ8iT5CnyNHmGPEueI8+TF8iL5CXyMnmFvEpeI6+TN8ib5C3yNnmHvEveI++TD8iH5CPyMfmEfEo+I5+TL8iX5CvyNfmGfEu+I9+TH8iP5CfyM/mF/Ep+I7+TP8if5C/yN/mH/Ev+I5PI
OCoZlZxKQaWkUlGpqTRUWiodlZ7KQGWkMlGZqSxUVioblZ3KQeWkclG5qTxUXioflZ8qQBWkClGFqSJUUaoYVZwqQZWkSlGlqTJUWaocVZ6qQFWkKlGVqSpUVaoaVZ2qQdWkalG1qTpUXaoeVZ9qQDWkGlGNqSZUU+o/qhnVnGpBtaRaUa2pNlRbqh3VnupAdaQ6UZ2pLlRXqhvVnepB9aR6Ub1T9aH6Uv2o/tQAaiA1iBpMDaGGUsOo4dQIaiQ1ihpNjaHGUuOo8VQ8NYGaSE2iJlNTqKnUNGo6NYOaSc2iZlNzqLnUPGo+tYBaSC2iFlNLqKXUMiqBWk4lUiuoldQqajW1hlpLraPWUxuojdQmajO1hdpKbaMwCqcIiqQoiqYYiqU4iqcESqQkSqYUSqU0SqcMyqQsyqYcyqU8yqcCKqQABSlERVSM2k7toHZSu6jd1B7qX1JS0n7qAHWQOkQdpo5QR6lj1HHqBHWSOkWdps5QZ6lz1HnqAnWRukRdpq5QV6lr1HXqBnWTukXdpu5Qd6l71H3qAfWQekQ9pp5QT6ln1HPqBfWSekW9pt5Qb6l31HvqA/WR+kR9pr5QX6lv1HfqB/WT+kX9pv5Qf6l/VBIVRyejk9Mp6JR0Kjo1nYZOS6ej09MZ6Ix0JjoznYXOSmejs9M56Jx0Ljo3nYfOS+ej89MF6IJ0IbowXYQuSheji9Ml6JJ0Kbo0XYYuS5ejy9MV6Ip0JboyXYWuSlejq9M16Jp0Lbo2XYeuS9ej69MN6IZ0I7ox3YRuSv9HN6Ob0y3olnQrujXdhm5Lt6Pb0x3ojnQnujPdhe5Kd6O70z3onnQvujfdh+5L96P70wPogfQgejA9hB5KD6OH0yPokfQoejQ9hh5Lj6PH0/H0BHoiPYmeTE+hp9LT6On0DHomPYueTc+h59Lz6Pn0AnohvYheTC+hl9LL6AR6OZ1Ir6BX0qvo1fQaei29jl5Pb6A30pvozfQWeiu9jcZonCZokqZommZoluZonhZokZZomVZoldZonTZok7Zom3Zol/Zonw7okAY0pBEd0TF6O72D3knvonfTe+i99D56P32APkgfog/TR+ij9DH6OH2CPkmfok/TZ+iz9Dn6PH2Bvkhfoi/TV+ir9DX6On2Dvknfom/Td+i79D36Pv2Afkg/oh/TT+in9DP6Of2Cfkm/ol/Tb+i39Dv6Pf2B/kh/oj/TX+iv9Df6O/2D/kn/on/Tf+i/9D86iY5jkjHJmRRMSiYVk5pJw6Rl0jHpmQxMRiYTk5nJwmRlsjHZmRxMTiYXk5vJw+Rl8jH5mQJMQaYQU5gpwhRlijHFmRJMSaYUU5opw5RlyjHlmQpMRaYSU5mpwlRlqjHVmRpMTaYWU5upw9Rl6jH1mQZMQ6YR05hpwjRl/mOaMc2ZFkxLphXTmmnDtGXaMe2ZDkxHphPTmenCdGW6Md2ZHkxPphfTm+nD9GX6Mf2ZAcxAZhAzmBnCDGWGMcOZEcxIZhQzmhnDjGXGMeOZeGYCM5GZxExmpjBTmWnMdGYGM5OZxcxm5jBzmXnMfGYBs5BZxCxmljBLmWVMArOcSWRWMCuZVcxqZg2zllnHrGc2MBuZTcxmZguzldnGYAzOEAzJUAzNMAzLcAzPCIzISIzMKIzKaIzOGIzJWIzNOIzLeIzPBEzIAAYyiImYGLOd2cHsZHYxu5k9zF5mH7OfOcAcZA4xh5kjzFHmGHOcOcGcZE4xp5kzzFnmHHOeucBcZC4xl5krzFXmGnOducHcZG4xt5k7zF3mHnOfecA8ZB4xj5knzFPmGfOcecG8ZF4xr5k3zFvmHfOe+cB8ZD4xn5kvzFfmG/Od+cH8ZH4xv5k/zF/mH5PExLHJ2ORsCjYlm4pNzaZh07Lp2PRsBjYjm4nNzGZhs7LZ2OxsDjYnm4vNzeZh87L52PxsAbYgW4gtzBZhi7LF2OJsCbYkW4otzZZhy7Ll2PJsBbYiW4mtzFZhq7LV2OpsDbYm
W4utzdZh67L12PpsA7Yh24htzDZhm7L/sc3Y5mwLtiXbim3NtmHbsu3Y9mwHtiPbie3MdmG7st3Y7mwPtifbi+3N9mH7sv3Y/uwAdiA7iB3MDmGHssPY4ewIdiQ7ih3NjmHHsuPY8Ww8O4GdyE5iJ7NT2KnsNHY6O4Odyc5iZ7Nz2LnsPHY+u4BdyC5iF7NL2KXsMjaBXc4msivYlewqdjW7hl3LrmPXsxvYjewmdjO7hd3KbmMxFmcJlmQplmYZlmU5lmcFVmQlVmYVVmU1VmcN1mQt1mYd1mU91mcDNmQBC1nERmyM3c7uYHeyu9jd7B52L7uP3c8eYA+yh9jD7BH2KHuMPc6eYE+yp9jT7Bn2LHuOPc9eYC+yl9jL7BX2KnuNvc7eYG+yt9jb7B32LnuPvc8+YB+yj9jH7BP2KfuMfc6+YF+yr9jX7Bv2LfuOfc9+YD+yn9jP7Bf2K/uN/c7+YH+yv9jf7B/2L/uPTWLjuGRcci4Fl5JLxaXm0nBpuXRcei4Dl5HLxGXmsnBZuWxcdi4Hl5PLxeXm8nB5uXxcfq4AV5ArxBXminBFuWJcca4EV5IrxZXmynBluXJcea4CV5GrxFXmqnBVuWpcda4GV5OrxdXm6nB1uXpcfa4B15BrxDXmmnBNuf+4ZlxzrgXXkmvFtebacG25dlx7rgPXkevEdea6cF25blx3rgfXk+vF9eb6cH25flx/bgA3kBvEDeaGcEO5YdxwbgQ3khvFjebGcGO5cdx4Lp6bwE3kJnGTuSncVG4aN52bwc3kZnGzuTncXG4eN59bwC3kFnGLuSXcUm4Zl8At5xK5FdxKbhW3mlvDreXWceu5DdxGbhO3mdvCbeW2cRiHcwRHchRHcwzHchzHcwInchIncwqnchqncwZnchZncw7nch7ncwEXcoCDHOIiLsZt53ZwO7ld3G5uD7eX28ft5w5wB7lD3GHuCHeUO8Yd505wJ7lT3GnuDHeWO8ed5y5wF7lL3GXuCneVu8Zd525wN7lb3G3uDneXu8fd5x5wD7lH3GPuCfeUe8Y9515wL7lX3GvuDfeWe8e95z5wH7lP3GfuC/eV+8Z9535wP7lf3G/uD/eX+8clcXF8Mj45n4JPyafiU/Np+LR8Oj49n4HPyGfiM/NZ+Kx8Nj47n4PPyefic/N5+Lx8Pj4/X4AvyBfiC/NF+KJ8Mb44X4IvyZfiS/Nl+LJ8Ob48X4GvyFfiK/NV+Kp8Nb46X4Ovydfia/N1+Lp8Pb4+34BvyDfiG/NN+Kb8f3wzvjnfgm/Jt+Jb8234tnw7vj3fge/Id+I78134rnw3vjvfg+/J9+J78334vnw/vj8/gB/ID+IH80P4ofwwfjg/gh/Jj+JH82P4sfw4fjwfz0/gJ/KT+Mn8FH4qP42fzs/gZ/Kz+Nn8HH4uP4+fzy/gF/KL+MX8En4pv4xP4JfzifwKfiW/il/Nr+HX8uv49fwGfiO/id/Mb+G38tt4jMd5gid5iqd5hmd5jud5gRd5iZd5hVd5jdd5gzd5i7d5h3d5j/f5gA95wEMe8REf47fzO/id/C5+N7+H38vv4/fzB/iD/CH+MH+EP8of44/zJ/iT/Cn+NH+GP8uf48/zF/iL/CX+Mn+Fv8pf46/zN/ib/C3+Nn+Hv8vf4+/zD/iH/CP+Mf+Ef8o/45/zL/iX/Cv+Nf+Gf8u/49/zH/iP/Cf+M/+F/8p/47/zP/if/C/+N/+H/8v/45P4OCGZkFxIIaQUUgmphTRCWiGdkF7IIGQUMgmZhSxCViGbkF3IIeQUcgm5hTxCXiGfkF8oIBQUCgmFhSJCUaGYUFwoIZQUSgmlhTJCWaGcUF6oIFQUKgmVhSpCVaGaUF2oIdQUagm1hTpCXaGeUF9oIDQUGgmNhSZCU+E/oZnQXGghtBRaCa2FNkJboZ3QXuggdBQ6CZ2FLkJXoZvQXegh9BR6Cb2FPkJfoZ/QXxggDBQGCYOFIcJQYZgwXBghjBRGCaOFMcJY
YZwwXogXJggThUnCZGGKMFWYJkwXZggzhVnCbGGOMFeYJ8wXFggLhUXCYmGJsFRYJiQIy4VEYYWwUlglrBbWCGuFdcJ6YYOwUdgkbBa2CFuFbQIm4AIhkAIl0AIjsAIn8IIgiIIkyIIiqIIm6IIhmIIl2IIjuIIn+EIghAIQoICESIgJ24Udwk5hl7Bb2CPsFfYJ+4UDwkHhkHBYOCIcFY4Jx4UTwknhlHBaOCOcFc4J54ULwkXhknBZuCJcFa4J14Ubwk3hlnBbuCPcFe4J94UHwkPhkfBYeCI8FZ4Jz4UXwkvhlfBaeCO8Fd4J74UPwkfhk/BZ+CJ8Fb4J34Ufwk/hl/Bb+CP8Ff4JSUKcmExMLqYQU4qpxNRiGjGtmE5ML2YQM4qZxMxiFjGrmE3MLuYQc4q5xNxiHjGvmE/MLxYQC4qFxMJiEbGoWEwsLpYQS4qlxNJiGbGsWE4sL1YQK4qVxMpiFbGqWE2sLtYQa4q1xNpiHbGuWE+sLzYQG4qNxMZiE7Gp+J/YTGwuthBbiq3E1mIbsa3YTmwvdhA7ip3EzmIXsavYTewu9hB7ir3E3mIfsa/YT+wvDhAHioPEweIQcag4TBwujhBHiqPE0eIYcaw4ThwvxosTxIniJHGyOEWcKk4Tp4szxJniLHG2OEecK84T54sLxIXiInGxuERcKi4TE8TlYqK4QlwprhJXi2vEteI6cb24QdwobhI3i1vEreI2ERNxkRBJkRJpkRFZkRN5URBFURJlURFVURN10RBN0RJt0RFd0RN9MRBDEYhQRGIkxsTt4g5xp7hL3C3uEfeK+8T94gHxoHhIPCweEY+Kx8Tj4gnxpHhKPC2eEc+K58Tz4gXxonhJvCxeEa+K18Tr4g3xpnhLvC3eEe+K98T74gPxofhIfCw+EZ+Kz8Tn4gvxpfhKfC2+Ed+K78T34gfxo/hJ/Cx+Eb+K38Tv4g/xp/hL/C3+Ef+K/8QkMU5KJiWXUkgppVRSaimNlFZKJ6WXMkgZpUxSZimLlFXKJmWXckg5pVxSbimPlFfKJ+WXCkgFpUJSYamIVFQqJhWXSkglpVJSaamMVFYqJ5WXKkgVpUpSZamKVFWqJlWXakg1pVpSbamOVFeqJ9WXGkgNpUZSY6mJ1FT6T2omNZdaSC2lVlJrqY3UVmontZc6SB2lTlJnqYvUVeomdZd6SD2lXlJvqY/UV+on9ZcGSAOlQdJgaYg0VBomDZdGSCOlUdJoaYw0VhonjZfipQnSRGmSNFmaIk2VpknTpRnSTGmWNFuaI82V5knzpQXSQmmRtFhaIi2VlkkJ0nIpUVohrZRWSaulNdJaaZ20XtogbZQ2SZulLdJWaZuESbhESKRESbTESKzESbwkSKIkSbKkSKqkSbpkSKZkSbbkSK7kSb4USKEEJCghKZJi0nZph7RT2iXtlvZIe6V90n7pgHRQOiQdlo5IR6Vj0nHphHRSOiWdls5IZ6Vz0nnpgnRRuiRdlq5IV6Vr0nXphnRTuiXdlu5Id6V70n3pgfRQeiQ9lp5IT6Vn0nPphfRSeiW9lt5Ib6V30nvpg/RR+iR9lr5IX6Vv0nfph/RT+iX9lv5If6V/UpIUJyeTk8sp5JRyKjm1nEZOK6eT08sZ5IxyJjmznEXOKmeTs8s55JxyLjm3nEfOK+eT88sF5IJyIbmwXEQuKheTi8sl5JJyKbm0XEYuK5eTy8sV5IpyJbmyXEWuKleTq8s15JpyLbm2XEeuK9eT68sN5IZyI7mx3ERuKv8nN5Obyy3klnIrubXcRm4rt5Pbyx3kjnInubPcRe4qd5O7yz3knnIvubfcR+4r95P7ywPkgfIgebA8RB4qD5OHyyPkkfIoebQ8Rh4rj5PHy/HyBHmiPEmeLE+Rp8rT5OnyDHmmPEueLc+R58rz5PnyAnmhvEheLC+Rl8rL5AR5uZwor5BXyqvk1fIaea28Tl4vb5A3ypvkzfIWeau8TcZkXCZkUqZkWmZkVuZkXhZk
UZZkWVZkVdZkXTZkU7ZkW3ZkV/ZkXw7kUAYylJEcyTF5u7xD3invknfLe+S98j55v3xAPigfkg/LR+Sj8jH5uHxCPimfkk/LZ+Sz8jn5vHxBvihfki/LV+Sr8jX5unxDvinfkm/Ld+S78j35vvxAfig/kh/LT+Sn8jP5ufxCfim/kl/Lb+S38jv5vfxB/ih/kj/LX+Sv8jf5u/xD/in/kn/Lf+S/8j85SY5TkinJlRRKSiWVklpJo6RV0inplQxKRiWTklnJomRVsinZlRxKTiWXklvJo+RV8in5lQJKQaWQUlgpohRViinFlRJKSaWUUlopo5RVyinllQpKRaWSUlmpolRVqinVlRpKTaWWUlupo9RV6in1lQZKQ6WR0lhpojRV/lOaKc2VFkpLpZXSWmmjtFXaKe2VDkpHpZPSWemidFW6Kd2VHkpPpZfSW+mj9FX6Kf2VAcpAZZAyWBmiDFWGKcOVEcpIZZQyWhmjjFXGKeOVeGWCMlGZpExWpihTlWnKdGWGMlOZpcxW5ihzlXnKfGWBslBZpCxWlihLlWVKgrJcSVRWKCuVVcpqZY2yVlmnrFc2KBuVTcpmZYuyVdmmYAquEAqpUAqtMAqrcAqvCIqoSIqsKIqqaIquGIqpWIqtOIqreIqvBEqoAAUqSImUmLJd2aHsVHYpu5U9yl5ln7JfOaAcVA4ph5UjylHlmHJcOaGcVE4pp5UzylnlnHJeuaBcVC4pl5UrylXlmnJduaHcVG4pt5U7yl3lnnJfeaA8VB4pj5UnylPlmfJceaG8VF4pr5U3ylvlnfJe+aB8VD4pn5Uvylflm/Jd+aH8VH4pv5U/yl/ln5KkxKnJ1ORqCjWlmkpNraZR06rp1PRqBjWjmknNrGZRs6rZ1OxqDjWnmkvNreZR86r51PxqAbWgWkgtrBZRi6rF1OJqCbWkWkotrZZRy6rl1PJqBbWiWkmtrFZRq6rV1OpqDbWmWkutrdZR66r11PpqA7Wh2khtrDZRm6r/qc3U5moLtaXaSm2ttlHbqu3U9moHtaPaSe2sdlG7qt3U7moPtafaS+2t9lH7qv3U/uoAdaA6SB2sDlGHqsPU4eoIdaQ6Sh2tjlHHquPU8Wq8OkGdqE5SJ6tT1KnqNHW6OkOdqc5SZ6tz1LnqPHW+ukBdqC5SF6tL1KXqMjVBXa4mqivUleoqdbW6Rl2rrlPXqxvUjeomdbO6Rd2qblMxFVcJlVQplVYZlVU5lVcFVVQlVVYVVVU1VVcN1VQt1VYd1VU91VcDNVSBClWkRmpM3a7uUHequ9Td6h51r7pP3a8eUA+qh9TD6hH1qHpMPa6eUE+qp9TT6hn1rHpOPa9eUC+ql9TL6hX1qnpNva7eUG+qt9Tb6h31rnpPva8+UB+qj9TH6hP1qfpMfa6+UF+qr9TX6hv1rfpOfa9+UD+qn9TP6hf1q/pN/a7+UH+qv9Tf6h/1r/pPTVLjtGRaci2FllJLpaXW0mhptXRaei2DllHLpGXWsmhZtWxadi2HllPLpeXW8mh5tXxafq2AVlArpBXWimhFtWJaca2EVlIrpZXWymhltXJaea2CVlGrpFXWqmhVtWpada2GVlOrpdXW6mh1tXpafa2B1lBrpDXWmmhNtaZaM62Z1kJrobXSWmlttDZaO62d1kHroHXSOmldtC5aN62b1kProfXSeml9tD5aP62fNkAboA3SBmlDtKHaMG2YNkIboY3SRmljtDHaOG2cFq/FaxO1SdpkbYo2VZumTddmaDO1WdpsbY42V5unzdcWaAu1RdpibbG2VFuqJWgJWqKWqK3UVmqrtdXaWm2ttl5br23UNmqbtc3aVm2rhmmYRmikRmm0xmisxmm8JmiiJmmypmiqpmm6ZmimZmm25miu5mm+FmihBjSoIS3SYtp2bYe2U9ul7db2aHu1fdp+7YB2UDukHdaOaEe1Y9px7YR2UjulndbOaGe1c9p57YJ2UbukXdauaFe1a9p17YZ2U7ul
3dbuaHe1e9p97YH2UHukPdaeaE+1Z9pz7YX2UnulvdbeaG+1d9p77YP2Ufukfda+aF+1b9p37Yf2U/ul/db+aH+1f1qSFqcn05PrKfSUeio9tZ5GT6un09PrGfSMeiY9s55Fz6pn07PrOfScei49t55Hz6vn0/PrBfSCeiG9sF5EL6oX04vrxfWSekm9tF5aL6uX1cvr5fWKekW9sl5Zr6pX06vr1fWaek29tl5br6vX1evr9fWGekO9sd5Yb6o31ZvpzfQWegu9ld5Kb6O30dvp7fQOege9k95J76J30bvp3fQeeg+9l95L76P30fvp/fQB+gB9kD5IH6IP0Yfpw/QR+gh9lD5KH6OP0cfp4/R4PV6fqE/UJ+tT9Kn6NH26PkOfqc/SZ+tz9Dn6PH2evkBfqC/UF+uL9aX6Uj1BT9AT9UR9pb5KX62v0dfo6/T1+gZ9o75J36xv0bfq23RMx3VCJ3VKp3VGZ3VO53VBF3VJl3VFV3VN13VDN3VLt3VHd3VP9/VAD3WgQx3pkR7Tt+s79J36Ln23vkffq+/T9+sH9IP6If2wfkQ/qh/Tj+sn9JP6Kf20fkY/q5/Tz+sX9Iv6Jf2yfkW/ql/Tr+s39Jv6Lf22fke/q9/T7+sP9If6I/2x/kR/qj/Tn+sv9HP6K/21/kZ/q7/T3+sf9I/6J/2z/kX/qn/Tv+s/9J/6L/23/kf/q//Tk/Q4I5mR3EhhpDRSGamNNEZaI52R3shgZDQyGZmNLEZWI5uR3chh5DRyGbmNPEZeI5+R3yhgFDQKGYWNIkZRo5hR3ChhlDRKGaWNMkZZo5xR3qhgVDQqGZWNKkZVo5pR3ahh1DRqGbWNOkZdo55R32hgNDQaGY2NJkZT4z+jmdHcaGG0NFoZrY02RlujndHe6GB0NDoZnY0uRlejm9Hd6GH0NHoZvY0+Rl+jn9HfGGAMNAYZg40hxlBjmDHcGGGMNEYZo40xxlhjnDHeiDcmGBONScZkY4ox1ZhmTDdmGDONWcZsY44x15hnzDcWGAuNRcZiY4mx1FhmJBjLjURjhbHSWGWsNtYYa411xnpjg7HR2GRsNrYYW41tBmbgBmGQBmXQBmOwBmfwhmCIhmTIhmKohmbohmGYhmXYhmO4hmf4RmCEBjCggYzIiBnbjR3GTmOXsdvYY+w19hn7jQPGQeOQcdg4Yhw1jhnHjRPGSeOUcdo4Y5w1zhnnjQvGReOScdm4Ylw1rhnXjRvGTeOWcdu4Y9w17hn3jQfGQ+OR8dh4Yjw1nhnPjRfGS+OV8dp4Y7w13hnvjQ/GR+OT8dn4Ynw1vhnfjR/GT+OX8dv4Y/w1/hlJRpyZzExupjBTmqnM1GYaM62ZzkxvZjAzmpnMzGYWM6uZzcxu5jBzmrnM3GYeM68Zb+Y3C5gFzUJmYbOIWdQsZhY3S5glzVJmabOMWdYsZ5Y3K5gVzUpmZbOKWdWsZlY3a5g1zVpmbbOOWdesZ9Y3G5gNzUZmY7OJ2dT8z2xmNjdbmC3NVmZrs43Z1mxntjc7mB3NTmZns4vZ1exmdjd7mD3NXmZvs4/Z1+xn9jcHmAPNQeZgc4g51BxmDjdHmCPNUeZoc4w51hxnjjfjzQnmRHOSOdmcYk41p5nTzRnmTHOWOducY84155nzzQXmQnORudhcYi41l5kJ5nIz0VxhrjRXmavNNeZac5253txgbjQ3mZvNLeZWc5uJmbhJmKRJmbTJmKzJmbwpmKIpmbKpmKqpmbppmKZpmbbpmK7pmb4ZmKEJTGgiMzJj5nZzh7nT3GXuNveYe8195n7zgHnQPGQeNo+YR81j5nHzhHnSPGWeNs+YZ81z5nnzgnnRvGReNq+YV81r5nXzhnnTvGXeNu+Yd8175n3zgfnQfGQ+Np+YT81n5nPzhfnSfGW+Nt+Yb8135nvzg/nR/GR+Nr+YX81v5nfzh/nT/GX+Nv+Yf81/ZpIZZyWzklsprJRWKiu1lcZKa6Wz0lsZrIxWJiuzlcXK
amWzsls5rJxWLiu3lcfKa+Wz8lsFrIJWIauwVcQqahWzilslrJJWKau0VcYqa5WzylsVrIpWJauyVcWqalWzqls1rJpWLau2Vceqa9Wz6lsNrIZWI6ux1cRqav1nNbOaWy2sllYrq7XVxmprtbPaWx2sjlYnq7PVxepqdbO6Wz2snlYvq7fVx+pr9bP6WwOsgdYga7A1xBpqDbOGWyOskdYoa7Q1xhprjbPGW/HWBGuiNcmabE2xplrTrOnWDGumNcuabc2x5lrzrPnWAmuhtchabC2xllrLrARruZVorbBWWqus1dYaa621zlpvbbA2WpuszdYWa6u1zcIs3CIs0qIs2mIs1uIs3hIs0ZIs2VIs1dIs3TIs07Is23Is1/Is3wqs0AIWtJAVWTFru7XD2mntsnZbe6y91j5rv3XAOmgdsg5bR6yj1jHruHXCOmmdsk5bZ6yz1jnrvHXBumhdsi5bV6yr1jXrunXDumndsm5bd6y71j3rvvXAemg9sh5bT6yn1jPrufXCemm9sl5bb6y31jvrvfXB+mh9sj5bX6yv1jfru/XD+mn9sn5bf6y/1j8ryYqzk9nJ7RR2SjuVndpOY6e109np7Qx2RjuTndnOYme1s9nZ7Rx2TjuXndvOY+e189n57QJ2QbuQXdguYhe1i9nF7RJ2SbuUXdouY5e1y9nl7Qp2RbuSXdmuYle1q9nV7Rp2TbuWXduuY9e169n17QZ2Q7uR3dhuYje1/7Ob2c3tFnZLu5Xd2m5jt7Xb2e3tDnZHu5Pd2e5id7W72d3tHnZPu5fd2+5j97X72f3tAfZAe5A92B5iD7WH2cPtEfZIe5Q92h5jj7XH2ePteHuCPdGeZE+2p9hT7Wn2dHuGPdOeZc+259hz7Xn2fHuBvdBeZC+2l9hL7WV2gr3cTrRX2CvtVfZqe4291l5nr7c32BvtTfZme4u91d5mYzZuEzZpUzZtMzZrczZvC7ZoS7ZsK7Zqa7ZuG7ZpW7ZtO7Zre7ZvB3ZoAxvayI7smL3d3mHvtHfZu+099l57n73fPmAftA/Zh+0j9lH7mH3cPmGftE/Zp+0z9ln7nH3evmBftC/Zl+0r9lX7mn3dvmHftG/Zt+079l37nn3ffmA/tB/Zj+0n9lP7mf3cfmG/tF/Zr+039lv7nf3e/mB/tD/Zn+0v9lf7m/3d/mH/tH/Zv+0/9l/7n51kxznJnOROCielk8pJ7aRx0jrpnPROBiejk8nJ7GRxsjrZnOxODienk8vJ7eRx8jr5nPxOAaegU8gp7BRxijrFnOJOCaekU8op7ZRxyjrlnPJOBaeiU8mp7FRxqjrVnOpODaemU8up7dRx6jr1nPpOA6eh08hp7DRxmjr/Oc2c5k4Lp6XTymnttHHaOu2c9k4Hp6PTyensdHG6Ot2c7k4Pp6fTy+nt9HH6Ov2c/s4AZ6AzyBnsDHGGOsOc4c4IZ6QzyhntjHHGOuOc8U68M8GZ6ExyJjtTnKnONGe6M8OZ6cxyZjtznLnOPGe+s8BZ6CxyFjtLnKXOMifBWe4kOiuclc4qZ3XKOGets85Z72xwNjqbnM3OFmers83BHNwhHNKhHNphHNbhHN4RHNGRHNlRHNXRHN0xHNOxHNtxHNfxHN8JnNABDnSQEzkxZ7uzw9np7HJ2O3ucvc4+Z79zwDnoHHIOO0eco84x57hzwjnpnHJOO2ecs84557xzwbnoXHIuO1ecq84157pzw7np3HJuO3ecu849577zwHnoPHIeO0+cp84z57nzwnnpvHJeO2+ct847573zwfnofHI+O1+cr84357vzw/np/HJ+O3+cv84/J8mJc5O5yd0Ubko3lZvaTeOmddO56d0MbkY3k5vZzeJmdbO52d0cbk43l5vbzePmdfO5+d0CbkG3kFvYLeIWdYu5xd0Sbkm3lFvaLeOWdcu55d0KbkW3klvZreJWdau51d0abk23llvbrePWdeu59d0GbkO3kdvYbeI2df9zm7nN3RZu
S7eV29pt47Z127nt3Q5uR7eT29nt4nZ1u7nd3R5uT7eX29vt4/Z1+7n93QHuQHeQO9gd4g51h7nD3RHuSHeUO9od4451x7nj3Xh3gjvRneROdqe4U91p7nR3hjvTneXOdue4c9157nx3gbvQXeQudpe4S91lboK73E10V7gr3VXuaneNu9Zd5653N7gb3U3uZneLu9Xd5mIu7hIu6VIu7TIu63Iu7wqu6Equ7Cqu6mqu7hqu6Vqu7Tqu63qu7wZu6AIXusiN3Ji73d3h7nR3ubvdPe5ed5+73z3gHnQPuYfdI+5R95h73D3hnnRPuafdM+5Z95x73r3gXnQvuZfdK+5V95p73b3h3nRvubfdO+5d9557333gPnQfuY/dJ+5T95n73H3hvnRfua/dN+5b95373v3gfnQ/uZ/dL+5X95v73f3h/nR/ub/dP+5f95+b5MZ5ybzkXgovpZfKS+2l8dJ66bz0XgYvo5fJy+xl8bJ62bzsXg4vp5fLy+3l8fJ6+bz8XgGvoFfIK+wV8Yp6xbziXgmvpFfKK+2V8cp65bzyXgWvolfJq+xV8ap61bzqXg2vplfLq+3V8ep69bz6XgOvodfIa+w18Zp6/3nNvOZeC6+l18pr7bXx2nrtvPZeB6+j18nr7HXxunrdvO5eD6+n18vr7fXx+nr9vP7eAG+gN8gb7A3xhnrDvOHeCG+kN8ob7Y3xxnrjvPFevDfBm+hN8iZ7U7yp3jRvujfDm+nN8mZ7c7y53jxvvrfAW+gt8hZ7S7yl3jIvwVvuJXorvJXeKm+1t8Zb663z1nsbvI3eJm+zt8Xb6m3zMA/3CI/0KI/2GI/1OI/3BE/0JE/2FE/1NE/3DM/0LM/2HM/1PM/3Ai/0gAc95EVezNvu7fB2eru83d4eb6+3z9vvHfAOeoe8w94R76h3zDvunfBOeqe8094Z76x3zjvvXfAuepe8y94V76p3zbvu3fBuere8294d7653z7vvPfAeeo+8x94T76n3zHvuvfBeeq+8194b7633znvvffA+ep+8z94X76v3zfvu/fB+er+8394f76/3z0vy4vxkfnI/hZ/ST+Wn9tP4af10fno/g5/Rz+Rn9rP4Wf1sfnY/h5/Tz+Xn9vP4ef18fn6/gF/QL+QX9ov4Rf1ifnG/hF/SL+WX9sv4Zf1yfnm/gl/Rr+RX9qv4Vf1qfnW/hl/Tr+XX9uv4df16fn2/gd/Qb+Q39pv4aePi4pr5zf0Wfku/ld/ab+O39dv57f0Ofke/k9/Z7+J39bv53f0efk+/l9/b7+P39fv5/f0B/kB/kD/YH+IP9Yf5w/0R/kh/lD/aH+OP9cf54/14f4I/0Z/kT/an+FP9af50f4Y/05/lz/bn+HP9ef58f4G/0F/kL/aX+Ev9ZX6Cv9xP9Ff4K/1V/mp/jb/WX+ev9zf4G/1N/mZ/i7/V3+ZjPu4TPulTPu0zPutzPu8LvuhLvuwrvuprvu4bvulbvu07vut7vu8HfugDH/rIj/yYv93f4e/0d/m7/T3+Xn+fv98/4B/0D/mH/SP+Uf+Yf9w/4Z/0T/mn/TP+Wf+cf96/4F/0L/mX/Sv+Vf+af92/4d/0b/m3/Tv+Xf+ef99/4D/0H/mP/Sf+U/+Z/9x/4b/0X/mv/Tf+W/+d/97/4H/0P/mf/S/+V/+b/93/4f/0f/m//T/+X/+fn+THBcmC5EGKIGWQKkgdpAnSBumC9EGGIGOQKcgcZAmyBtmC7EGOIGeQK8gd5AnyBvmC/EGBoGBQKCgcFAmKBsWC4kGJoGRQKigdlAnKBuWC8kGFoGJQKagcVAmqBtWC6kGNoGZQK6gd1AnqBvWC+kGDoGHQKGgcNAmaBv8FzYLmQYugZdAqaB20CdoG7YL2QYegY9Ap6Bx0CboG3YLuQY+gZ9Ar6B30CfoG/YL+wYBgYDAoGBwMCYYGw4LhwYhgZDAqGB2MCcYG44LxQXwwIZgYTAomB1OCqcG0YHowI5gZzApmB3OC
ucG8YH6wIFgYLAoWB0uCpcGyICFYHiQGK4KVwapgdbAmWBusC9YHG4KNwaZgc7Al2BpsC7AAD4iADKiADpiADbiAD4RADKRADpRADbRAD4zADKzADpzADbzAD4IgDEAAAxREQSzYHuwIdga7gt3BnmBvsC/YHxwIDgaHgsPBkeBocCw4HpwITgangtPBmeBscC44H1wILgaXgsvBleBqcC24HtwIbga3gtvBneBucC+4HzwIHgaPgsfBk+Bp8Cx4HrwIXgavgtfBm+Bt8C54H3wIPgafgs/Bl+Br8C34HvwIfga/gt/Bn+Bv8C9ICuLCZGHyMEWYMkwVpg7ThGnDdGH6MEOYMcwUZg6zhFnDbGH2MEeYM8wV5g7zhHnDfGH+sEBYMCwUFg6LhEXDYmHxsERYMiwVlg7LhGXDcmH5sEJYMawUVg6rhFXDamH1sEZYM6wV1g7rhHXDemH9sEHYMGwUNg6bhE3D/8JmYfOwRdgybBW2DtuEbcN2YfuwQ9gx7BR2DruEXcNuYfewR9gz7BX2DvuEfcN+Yf9wQDgwHBQODoeEQ8Nh4fBwRDgyHBWODseEY8Nx4fgwPpwQTgwnhZPDKeHUcFo4PZwRzgxnhbPDOeHccF44P1wQLgwXhYvDJeHScFmYEC4PE8MV4cpwVbg6XBOuDdeF68MN4cZwU7g53BJuDbeFWIiHREiGVEiHTMiGXMiHQiiGUiiHSqiGWqiHRmiGVmiHTuiGXuiHQRiGIIQhCqMwFm4Pd4Q7w13h7nBPuDfcF+4PD4QHw0Ph4fBIeDQ8Fh4PT4Qnw1Ph6fBMeDY8F54PL4QXw0vh5fBKeDW8Fl4Pb4Q3w1vh7fBOeDe8F94PH4QPw0fh4/BJ+DR8Fj4PX4Qvw1fh6/BN+DZ8F74PP4Qfw0/h5/BL+DX8Fn4Pf4Q/w1/h7/BP+Df8FyaFcSAZSA5SgJQgFUgN0oC0IB1IDzKAjCATyAyygKwgG8gOcoCcIBfIDfKAvCAfyA8KgIKgECgMioCioBgoDkqAkqAUKA3KgLKgHCgPKoCKoBKoDKqAqqAaqA5qgJqgFqgN6oC6oB6oDxqAhqARaAyagKbgP9AMNActQEvQCrQGbUBb0A60Bx1AR9AJdAZdQFfQDXQHPUBP0Av0Bn1AX9AP9AcDwEAwCAwGQ8BQMAwMByPASDAKjAZjwFgwDowH8WACmAgmgclgCpgKpoHpYAaYCWaB2WAOmAvmgflgAVgIFoHFYAlYCpaBBLAcJIIVYCVYBVaDNWAtWAfWgw1gI9gENoMtYCvYBjCAAwKQgAI0YAALOMADAYhAAjJQgAo0oAMDmMACNnCACzzggwCEAAAIEIhADGwHO8BOsAvsBnvAXrAP7AcHwEFwCBwGR8BRcAwcByfASXAKnAZnwFlwDpwHF8BFcAlcBlfAVXANXAc3wE1wC9wGd8BdcA/cBw/AQ/AIPAZPwFPwDDwHL8BL8Aq8Bm/AW/AOvAcfwEfwCXwGX8BX8A18Bz/AT/AL/AZ/wF/wDySBOJgMJocpYEqYCqaGaWBamA6mhxlgRpgJZoZZYFaYDWaHOWBOmAvmhnlgXpgP5ocFYEFYCBaGRWBRWAwWhyVgSVgKloZlYFlYDpaHFWBFWAlWhlVgVVgNVoc1YE1YC9aGdWBdWA/Whw1gQ9gINoZNYFP4H2wGm8MWsCVsBVvDNrAtbAfbww6wI+wEO8MusCvsBrvDHrAn7AV7wz6wL+wH+8MBcCAcBAfDIXAoHAaHwxFwJBwFR8MxcCwcB8fDeDgBToST4GQ4BU6F0+B0OAPOhLPgbDgHzoXz4Hy4AC6Ei+BiuAQuhctgAlwOE+EKuBKugqvhGrgWroPr4Qa4EW6Cm+EWuBVugxjEIQFJSEEaMpCFHOShAEUoQRkqUIUa1KEBTWhBGzrQhR70YQBDCCCECEYwBrfDHXAn3AV3wz1wL9wH98MD8CA8BA/DI/AoPAaPwxPwJDwFT8Mz8Cw8B8/DC/AivAQvwyvw
KrwGr8Mb8Ca8BW/DO/AuvAfvwwfwIXwEH8Mn8Cl8Bp/DF/AlfAVfwzfwLXwH38MP8CP8BD/DL/Ar/Aa/wx/wJ/wFf8M/8C/8B5NgHEqGkqMUKCVKhVKjNCgtSofSowwoI8qEMqMsKCvKhrKjHCgnyoVyozwoL8qH8qMCqCAqhAqjIqgoKoaKoxKoJCqFSqMyqCwqh8qjCqgiqoQqoyqoKqqGqqMaqCaqhWqjOqguqofqowaoIWqEGqMmqCn6DzVDzVEL1BK1Qq1RG9QWtUPtUQfUEXVCnVEX1BV1Q91RD9QT9UK9UR/UF/VD/dEANBANQoPREDQUDUPD0Qg0Eo1Co9EYNBaNQ+NRPJqAJqJJaDKagqaiaWg6moFmolloNpqD5qJ5aD5agBaiRWgxWoKWomUoAS1HiWgFWolWodVoDVqL1qH1aAPaiDahzWgL2oq2IQzhiEAkohCNGMQiDvFIQCKSkIwUpCIN6chAJrKQjRzkIg/5KEAhAggihCIUQ9vRDrQT7UK70R60F+1D+9EBdBAdQofREXQUHUPH0Ql0Ep1Cp9EZdBadQ+fRBXQRXUKX0RV0FV1D19ENdBPdQrfRHXQX3UP30QP0ED1Cj9ET9BQ9Q8/RC/QSvUKv0Rv0Fr1D79EH9BF9Qp/RF/QVfUPf0Q/0E/1Cv9Ef9Bf9Q0koLkoWJY9SRCmjVCn+T6VF6aMMUcYoU5Q5yhJljbJF2aMcUc4oV5Q7yhPljfJF+aMCUcGoUFQ4KhIVjYpFxaMSUcmoVFQ6KhOVjcpF5aMKUcWoUlQ5qhJVjapF1aMaUc2oVlQ7qhPVjepF9aMGUcOoUdQ4ahI1jf6LmkXNoxZRy6hV1DpqE7WN2kXtow5Rx6hT1DnqEnWNukXdox5Rz6hX1DvqE/WN+kX9owHRwGhQNDgaEg2NhkXDoxHRyGhUNDoaE42NxkXjo/hoQjQxmhRNjqZEU6Np0fRoRjQzmhXNjuZEc6N50fxoQbQwWhQtjpZES6NlUUK0PEqMVkQro1XR6mhNtDZaF62PNkQbo03R5mhLtDXaFmERHhERGVERHTERG3ERHwmRGEmRHCmRGmmRHhmRGVmRHTmRG3mRHwVRGIEIRiiKoli0PdoR7Yx2RbujPdHeaF+0PzoQHYwORYejI9HR6Fh0PDoRnYxORaejM9HZ6Fx0ProQXYwuRZejK9HV6Fp0PboR3YxuRbejO9Hd6F50P3oQPYweRY+jJ9HT6Fn0PHoRvYxeRa+jN9Hb6F30PvoQfYw+RZ+jL9HX6Fv0PfoR/Yx+Rb+jP9Hf6F+UFMXFksWSx1LEUsZSxVLH0sTSxtLF0scyxDLGMsUyx7LEssayxbLHcsRyxnLFcsfyxPLG8sXyxwrECsYKxQrHisSKxorFisdKxErGSsVKx8rEysbKxcrHKsQqxirFKseqxKrGqsWqx2rEasZqxWrH6sTqxurF6scaxBrGGsUax5rEmsb+izWLNY+1iLWMtYq1jrWJtY21i7WPdYh1jHWKdY51iXWNdYt1j/WI9Yz1ivWO9Yn1jfWL9Y8NiA2MDYoNjg2JDY39j0V7WhucaRAA+I5tf2Pbtm3btm3btm3FtpPudDK2bWsP9r+NemoQNhgbgg3FhmHDsRHYSGwUNhobg43FxmHjsQnYRGwSNhmbgk3FpmHTsRnYTGwWNhubg83F5mHzsQXYQmwRthhbgi3FlmHLsRXYSmwVthpbg63F1mHrsQ3YRmwTthnbgm3FtmHbsR3YTmwXthvbg+3F9mH7sQPYQewQdhg7gh3FjmHHsRPYSewUdho7g53FzmHnsQvYRewSdhm7gl3FMAzHCIzEKIzGGIzFOIzHBEzEJEzGFEzFNEzHDMzELMzGHMzFPMzHAgxgEAsxhEVYjF3DrmM3sJvYLew2dge7i93D7mMPsIfYI+wx9gR7ij3DnmMvsJfYK+w19gZ7i73D3mMfsI/YJ+wz9gX7in3DvmM/sJ/YL+w39gf7i/3DEvBEeGI8CZ4U
T4Ynx1PgKfFUeGo8DZ4WT4enxzPgGfFMeGY8C54Vz4Znx3PgOfFc+H94bjwPnhfPh+fHC+AF8UJ4YbwIXhQvhhfHS+Al8VJ4abwMXhYvh5fHK+AV8Up4ZbwKXhWvhlfHa+A18Vp4bbwOXhevh9fHG+AN8UZ4Y7wJ3hRvhjfHW+At8VZ4a7wN3hZvh7fHO+Ad8U54Z7wL3hXvhnfHe+A98V54b7wP3hfvh/fHB+AD8UH4YHwIPhQfhg/HR+Aj8VH4aHwMPhYfh4/HJ+AT8Un4ZHwKPhWfhk/HZ+Az8Vn4bHwOPhefh8/HF+AL8UX4YnwJvhRfhi/HV+Ar8VX4anwNvhZfh6/HN+Ab8U34ZnwLvhXfhm/Hd+A78V34bnwPvhffh+/HD+AH8UP4YfwIfhQ/hh/HT+An8VP4afwMfhY/h5/HL+AX8Uv4ZfwKfhXHcBwncBKncBpncBbncB4XcBGXcBlXcBXXcB03cBO3cBt3cBf3cB8PcIBDPMQRHuExfg2/jt/Ab+K38Nv4Hfwufg+/jz/AH+KP8Mf4E/wp/gx/jr/AX+Kv8Nf4G/wt/g5/j3/AP+Kf8M/4F/wr/g3/jv/Af+K/8N/4H/wv/g9PIBIRiYkkRFIiGZGcSEGkJFIRqYk0RFoiHZGeyEBkJDIRmYksRFYiG5GdyEHkJHIR/xG5iTxEXiIfkZ8oQBQkChGFiSJEUaIYUZwoQZQkShGliTJEWaIcUZ6oQFQkKhGViSpEVaIaUZ2oQdQkahG1iTpEXaIeUZ9oQDQkGhGNiSZEU6IZ0ZxoQbQkWhGtiTZEW6Id0Z7oQHQkOhGdiS5EV6Ib0Z3oQfQkehG9iT5EX6If0Z8YQAwkBhGDiSHEUGIYMZwYQYwkRhGjiTHEWGIcMZ6YQEwkJhGTiSnEVGIaMZ2YQcwkZhGziTnEXGIeMZ9YQCwkFhGLiSXEUmIZsZxYQawkVhGriTXEWmIdsZ7YQGwkNhGbiS3EVmIbsZ3YQewkdhG7iT3EXmIfsZ84QBwkDhGHiSPEUeIYcZw4QZwkThGniTPEWeIccZ64QFwkLhGXiSvEVQIjcIIgSIIiaIIhWIIjeEIgREIiZEIhVEIjdMIgTMIibMIhXMIjfCIgAAGJkEBERMTENeI6cYO4SdwibhN3iLvEPeI+8YB4SDwiHhNPiKfEM+I58YJ4SbwiXhNviLfEO+I98YH4SHwiPhNfiK/EN+I78YP4SfwifhN/iL/EPyKBTEQmJpOQSclkZHIyBZmSTEWmJtOQacl0ZHoyA5mRzERmJrOQWclsZHYyB5mTzEX+R+Ym85B5yXxkfrIAWZAsRBYmi5BFyWJkcbIEWZIsRZYmy5BlyXJkebICWZGsRFYmq5BVyWpkdbIGWZOsRdYm65B1yXpkfbIB2ZBsRDYmm5BNyWZkc7IF2ZJsRbYm25BtyXZke7ID2ZHsRHYmu5BdyW5kd7IH2ZPsRfYm+5B9yX5kf3IAOZAcRA4mh5BDyWHkcHIEOZIcRY4mx5BjyXHkeHICOZGcRE4mp5BTyWnkdHIGOZOcRc4m55BzyXnkfHIBuZBcRC4ml5BLyWXkcnIFuZJcRa4m15BryXXkenIDuZHcRG4mt5BbyW3kdnIHuZPcRe4m95B7yX3kfvIAeZA8RB4mj5BHyWPkcfIEeZI8RZ4mz5BnyXPkefICeZG8RF4mr5BXSYzESYIkSYqkSYZkSY7kSYEUSYmUSYVUSY3USYM0SYu0SYd0SY/0yYAEJCRDEpERGZPXyOvkDfImeYu8Td4h75L3yPvkA/Ih+Yh8TD4hn5LPyOfkC/Il+Yp8Tb4h35LvyPfkB/Ij+Yn8TH4hv5LfyO/kD/In+Yv8Tf4h/5L/yAQqEZWYSkIlpZJRyakUVEoqFZWaSkOlpdJR6akMVEYqE5WZykJlpbJR2akcVE4qF/UflZvKQ+Wl8lH5qQJUQaoQVZgqQhWlilHFqRJUSaoUVZoqQ5WlylHlqQpURaoSVZmqQlWlqlHVqRpU
TaoWVZuqQ9Wl6lH1qQZUQ6oR1ZhqQjWlmlHNqRZUS6oV1ZpqQ7Wl2lHtqQ5UR6oT1ZnqQnWlulHdqR5UT6oX1ZvqQ/Wl+lH9qQHUQGoQNZgaQg2lhlHDqRHUSGoUNZoaQ42lxlHjqQnURGoSNZmaQk2lplHTqRnUTGoWNZuaQ82l5lHzqQXUQmoRtZhaQi2lllHLqRXUSmoVtZpaQ62l1lHrqQ3URmoTtZnaQm2ltlHbqR3UTmoXtZvaQ+2l9lH7qQPUQeoQdZg6Qh2ljlHHqRPUSeoUdZo6Q52lzlHnqQvUReoSdZm6Ql2lMAqnCIqkKIqmGIqlOIqnBEqkJEqmFEqlNEqnDMqkLMqmHMqlPMqnAgpQkAopREVUTF2jrlM3qJvULeo2dYe6S92j7lMPqIfUI+ox9YR6Sj2jnlMvqJfUK+o19YZ6S72j3lMfqI/UJ+oz9YX6Sn2jvlM/qJ/UL+o39Yf6S/2jEuhEdGI6CZ2UTkYnp1PQKelUdGo6DZ2WTkenpzPQGelMdGY6C52VzkZnp3PQOelc9H90bjoPnZfOR+enC9AF6UJ0YboIXZQuRhenS9Al6VJ0aboMXZYuR5enK9AV6Up0ZboKXZWuRlena9A16Vp0bboOXZeuR9enG9AN6UZ0Y7oJ3ZRuRjenW9At6VZ0a7oN3ZZuR7enO9Ad6U50Z7oL3ZXuRnene9A96V50b7oP3ZfuR/enB9AD6UH0YHoIPZQeRg+nR9Aj6VH0aHoMPZYeR4+nJ9AT6Un0ZHoKPZWeRk+nZ9Az6Vn0bHoOPZeeR8+nF9AL6UX0YnoJvZReRi+nV9Ar6VX0anoNvZZeR6+nN9Ab6U30ZnoLvZXeRm+nd9A76V30bnoPvZfeR++nD9AH6UP0YfoIfZQ+Rh+nT9An6VP0afoMfZY+R5+nL9AX6Uv0ZfoKfZXGaJwmaJKmaJpmaJbmaJ4WaJGWaJlWaJXWaJ02aJO2aJt2aJf2aJ8OaEBDOqQRHdExfY2+Tt+gb9K36Nv0HfoufY++Tz+gH9KP6Mf0E/op/Yx+Tr+gX9Kv6Nf0G/ot/Y5+T3+gP9Kf6M/0F/or/Y3+Tv+gf9K/6N/0H/ov/Y9OYBIxiZkkTFImGZOcScGkZFIxqZk0TFomHZOeycBkZDIxmZksTFYmG5OdycHkZHIx/zG5mTxMXiYfk58pwBRkCjGFmSJMUaYYU5wpwZRkSjGlmTJMWaYcU56pwFRkKjGVmSpMVaYaU52pwdRkajG1mTpMXaYeU59pwDRkGjGNmSZMU6YZ05xpwbRkWjGtmTZMW6Yd057pwHRkOjGdmS5MV6Yb053pwfRkejG9mT5MX6Yf058ZwAxkBjGDmSHMUGYYM5wZwYxkRjGjmTHMWGYcM56ZwExkJjGTmSnMVGYaM52ZwcxkZjGzmTnMXGYeM59ZwCxkFjGLmSXMUmYZs5xZwaxkVjGrmTXMWmYds57ZwGxkNjGbmS3MVmYbs53ZwexkdjG7mT3MXmYfs585wBxkDjGHmSPMUeYYc5w5wZxkTjGnmTPMWeYcc565wFxkLjGXmSvMVQZjcIZgSIZiaIZhWIZjeEZgREZiZEZhVEZjdMZgTMZibMZhXMZjfCZgAAOZkEFMxMTMNeY6c4O5ydxibjN3mLvMPeY+84B5yDxiHjNPmKfMM+Y584J5ybxiXjNvmLfMO+Y984H5yHxiPjNfmK/MN+Y784P5yfxifjN/mL/MPyaBTcQmZpOwSdlkbHI2BZuSTcWmZtOwadl0bHo2A5uRzcRmZrOwWdlsbHY2B5uTzcX+x+Zm87B52XxsfrYAW5AtxBZmi7BF2WJscbYEW5ItxZZmy7Bl2XJsebYCW5GtxFZmq7BV2WpsdbYGW5OtxdZm67B12XpsfbYB25BtxDZmm7BN2WZsc7YF25JtxbZm27Bt2XZse7YD25HtxHZmu7Bd2W5sd7YH25PtxfZm+7B92X5sf3YAO5AdxA5mh7BD2WGJkv6/YLCj2THsWHYc
O56dwE5kJ7GT2SnsVHYaO52dwc5kZ7Gz2TnsXHYeO59dwC5kF7GL2SXsUnYZu5xdwa5kV7Gr2TXsWnYdu57dwG5kN7Gb2S3sVnYbu53dwe5kd7G72T3sXnYfu589wB5kD7GH2SPsUfYYe5w9wZ5kT7Gn2TPsWfYce569wF5kL7GX2SvsVRZjcZZgSZZiaZZhWZZjeVZgRVZiZVZhVVZjddZgTdZibdZhXdZjfTZgAQvZkEVsxMbsNfY6e4O9yd5ib7N32LvsPfY++4B9yD5iH7NP2KfsM/Y5+4J9yb5iX7Nv2LfsO/Y9+4H9yH5iP7Nf2K/sN/Y7+4P9yf5if7N/2L/sPzaBS8Ql5pJwSblkXHIuBZeSS8Wl5tJwabl0XHouA5eRy8Rl5rJwWblsXHYuB5eTy8X9x+Xm8nB5uXxcfq4AV5ArxBXminBFuWJcca4EV5IrxZXmynBluXJcea4CV5GrxFXmqnBVuWpcda4GV5OrxdXm6nB1uXpcfa4B15BrxDXmmnBNuWZcc64F15JrxbXm2nBtuXZce64D15HrxHXmunBduW5cd64H15PrxfXm+nB9uX5cf24AN5AbxA3mhnBDuWHccG4EN5IbxY3mxnBjuXHceG4CN5GbxE3mpnBTuWncdG4GN5Obxc3m5nBzuXncfG4Bt5BbxC3mlnBLuWXccm4Ft5Jbxa3m1nBruXXcem4Dt5HbxG3mtnBbuW3cdm4Ht5Pbxe3m9nB7uX3cfu4Ad5A7xB3mjnBHuWPcce4Ed5I7xZ3mznBnuXPcee4Cd5G7xF3mrnBXOYzDOYIjOYqjOYZjOY7jOYETOYmTOYVTOY3TOYMzOYuzOYdzOY/zuYADHORCDnERF3PXuOvcDe4md4u7zd3h7nL3uPvcA+4h94h7zD3hnnLPuOfcC+4l94p7zb3h3nLvuPfcB+4j94n7zH3hvnLfuO/cD+4n94v7zf3h/nL/uAQ+EZ+YT8In5ZPxyfkUfEo+FZ+aT8On5dPx6fkMfEY+E5+Zz8Jn5bPx2fkcfE4+F/8fn5vPw+fl8/H5+QJ8Qb4QX5gvwhfli/HF+RJ8Sb4UX5ovw5fly/Hl+Qp8Rb4SX5mvwlflq/HV+Rp8Tb4WX5uvw9fl6/H1+QZ8Q74R35hvwjflm/HN+RZ8S74V35pvw7fl2/Ht+Q58R74T35nvwnflu/Hd+R58T74X35vvw/fl+/H9+QH8QH4QP5gfwg/lh/HD+RH8SH4UP5ofw4/lx/Hj+Qn8RH4SP5mfwk/lp/HT+Rn8TH4WP5ufw8/l5/Hz+QX8Qn4Rv5hfwi/ll/HL+RX8Sn4Vv5pfw6/l1/Hr+Q38Rn4Tv5nfwm/lt/Hb+R38Tn4Xv5vfw+/l9/H7+QP8Qf4Qf5g/wh/lj/HH+RP8Sf4Uf5o/w5/lz/Hn+Qv8Rf4Sf5m/wl/lMR7nCZ7kKZ7mGZ7lOZ7nBV7kJV7mFV7lNV7nDd7kLd7mHd7lPd7nAx7wkA95xEd8zF/jr/M3+Jv8Lf42f4e/y9/j7/MP+If8I/4x/4R/yj/jn/Mv+Jf8K/41/4Z/y7/j3/Mf+I/8J/4z/4X/yn/jv/M/+J/8L/43/4f/y//jE4REQmIhiZBUSCYkF1IIKYVUQmohjZBWSCekFzIIGYVMQmYhi5BVyCZkF3IIOYVcwn9CbiGPkFfIJ+QXCggFhUJCYaGIUFQoJhQXSgglhVJCaaGMUFYoJ5QXKggVhUpCZaGKUFWoJlQXagg1hVpCbaGOUFeoJ9QXGggNhUZCY6GJ0FRoJjQXWggthVZCa6GN0FZoJ7QXOggdhU5CZ6GL0FXoJnQXegg9hV5Cb6GP0FfoJ/QXBggDhUHCYGGIMFQYJgwXRggjhVHCaGGMMFYYJ4wXJggThUnCZGGKMFWYJkwXZggzhVnCbGGOMFeYJ8wXFggLhUXCYmGJsFRYJiwXVggrhVXCamGNsFZYJ6wXNggbhU3CZmGLsFXYJmwXdgg7hV3CbmGPsFfYJ+wXDggHhUPC
YeGIcFQ4JhwXTggnhVPCaeGMcFY4J5wXLggXhUvCZeGKcFXABFwgBFKgBFpgBFbgBF4QBFGQBFlQBFXQBF0wBFOwBFtwBFfwBF8IBCBAIRSQEAmxcE24LtwQbgq3hNvCHeGucE+4LzwQHgqPhMfCE+Gp8Ex4LrwQXgqvhNfCG+Gt8E54L3wQPgqfhM/CF+Gr8E34LvwQfgq/hN/CH+Gv8E9IEBOJicUkYlIxmZhcTCGmFFOJqcU0YloxnZhezCBmFDOJmcUsYlYxm5hdzCHmFHOJ/4m5xTxiXjGfmF8sIBYUC4mFxSJiUbGYWFwsIZYUS4mlxTJiWbGcWF6sIFYUK4mVxSpiVbGaWF2sIdYUa4m1xTpiXbGeWF9sIDYUG4mNxSZiU7GZ2FxsIbYUW4mtxTZiW7Gd2F7sIHYUO4mdxS5iV7Gb2F3sIfYUe4m9xT5iX7Gf2F8cIA4UB4mDxSHiUHGYOFwcIY4UR4mjxTHiWHGcOF6cIE4UJ4mTxSniVHGaOF2cIc4UZ4mzxTniXHGeOF9cIC4UF4mLxSXiUnGZuFxcIa4UV4mrxTXiWnGduF7cIG4UN4mbxS3iVnGbuF3cIe4Ud4m7xT3iXnGfuF88IB4UD4mHxSPiUfGYeFw8IZ4UT4mnxTPiWfGceF68IF4UL4mXxSviVRETcZEQSZESaZERWZETeVEQRVESZVERVVETddEQTdESbdERXdETfTEQgQjFUERiJMbiNfG6eEO8Kd4Sb4t3xLviPfG++EB8KD4SH4tPxKfiM/G5+EJ8Kb4SX4tvxLfiO/G9+EH8KH4SP4tfxK/iN/G7+EP8Kf4Sf4t/xL/iPzFBSiQllpJISaVkUnIphZRSSiWlltJIaaV0Unopg5RRyiRllrJIWaVsUnYph5RTyiX9J+WW8kh5pXxSfqmAVFAqJBWWikhFpWJScamEVFIqJZWWykhlpXJSeamCVFGqJFWWqkhVpWpSdamGVFOqJdWW6kh1pXpSfamB1FBqJDWWmkhNpWZSc6mF1FJqJbWW2khtpXZSe6mD1FHqJHWWukhdpW5Sd6mH1FPqJfWW+kh9pX5Sf2mANFAaJA2WhkhDpWHScGmENFIaJY2WxkhjpXHSeGmCNFGaJE2WpkhTpWnSdGmGNFOaJc2W5khzpXnSfGmBtFBaJC2WlkhLpWXScmmFtFJaJa2W1khrpXXSemmDtFHaJG2WtkhbpW3SdmmHtFPaJe2W9kh7pX3SfumAdFA6JB2WjkhHpWPScemEdFI6JZ2WzkhnpXPSeemCdFG6JF2WrkhXJUzCJUIiJUqiJUZiJU7iJUESJUmSJUVSJU3SJUMyJUuyJUdyJU/ypUACEpRCCUmRFEvXpOvSDemmdEu6Ld2R7kr3pPvSA+mh9Eh6LD2RnkrPpOfSC+ml9Ep6Lb2R3krvpPfSB+mj9En6LH2RvkrfpO/SD+mn9Ev6Lf2R/kr/pAQ5kZxYTiInlZPJyeUUcko5lZxaTiOnldPJ6eUMckY5k5xZziJnlbPJ2eUcck45V6qEhAQ5j5xXzifnlwvIBeVCcmG5iFxULiYXl0vIJeVScmm5jFxWLieXlyvIFeVKcmW5ilxVriZXl2vINeVacm25jlxXrifXlxvIDeVGcmO5idxUbiY3l1vILeVWcmu5jdxWbie3lzvIHeVOcme5i9xV7iZ3l3vIPeVecm+5j9xX7if3lwfIA+VB8mB5iDxUHiYPl0fII+VR8mh5jDxWHiePlyfIE+VJ8mR5ijxVniZPl2fIM+VZ8mx5jjxXnifPlxfIC+VF8mJ5ibxUXiYvl1fIK+VV8mp5jbxWXievlzfIG+VN8mZ5i7xV3iZvl3fIO+Vd8m55j7xX3ifvlw/IB+VD8mH5iHxUPiYfl0/IJ+VT8mn5jHxWPiefly/IF+VL8mX5inxVxmRcJmRSpmRaZmRW5mReFmRRlmRZVmRV1mRdNmRTtmRbdmRX9mRfDmQgQzmUkRzJsXxNvi7fkG/Kt+Tb8h35
rnxPvi8/kB/Kj+TH8hP5qfxMfi6/kF/Kr+TX8hv5rfxOfi9/kD/Kn+TP8hf5q/xN/i7/kH/Kv+Tf8h/5r/xPTlASKYmVJEpSJZmSXEmhpFRSKamVNEpaJZ2SXsmgZFQyKZmVLEpWJZuSXcmh5FRyKf8puZU8Sl4ln5JfKaAUVAophZUiSlGlmFJcKaGUVEoppZUySlmlnFJeqaBUVCoplZUqSlWlmlJdqaHUVGoptZU6Sl2lnlJfaaA0VBopjZUmSlOlmdJcaaG0VFoprZU2SlulndJe6aB0VDopnZUuSlelm9Jd6aH0VHopvZU+Sl+ln9JfGaAMVAYpg5UhylBlmDJcGaGMVEYpo5UxylhlnDJemaBMVCYpk5UpylRlmjJdmaHMVGYps5U5ylxlnjJfWaAsVBYpi5UlylJlmbJcWaGsVFYpq5U1ylplnbJe2aBsVDYpm5UtylZlm7Jd2aHsVHYpu5U9yl5ln7JfOaAcVA4ph5UjylHlmHJcOaGcVE4pp5UzylnlnHJeuaBcVC4pl5UrylUFU3CFUEiFUmiFUViFU3hFUERFUmRFUVRFU3TFUEzFUmzFUVzFU3wlUIAClVBBSqTEyjXlunJDuancUm4rd5S7yj3lvvJAeag8Uh4rT5SnyjPlufJCeam8Ul4rb5S3yjvlvfJB+ah8Uj4rX5Svyjflu/JD+an8Un4rf5S/yj8lQU2kJlaTqEnVZGpyNYWaUk2lplbTqGnVdGp6NYOaUc2kZlazqFnVbGp2NYeaU82l/qfmVvOoedV8an61gFpQLaQWVouoRdVianG1hFpSLaWWVsuoZdVyanm1glpRraRWVquoVdVqanW1hlpTraXWVuuoddV6an21gdpQbaQ2VpuoTdVmanO1hdpSbaW2VtuobdV2anu1g9pR7aR2VruoXdVuane1h9pT7aX2VvuofdV+an91gDpQHaQOVoeoQ9Vh6nB1hDpSHaWOVseoY9Vx6nh1gjpRnaROVqeoU9Vp6nR1hjpTnaXOVueoc9V56nx1gbpQXaQuVpeoS9Vl6nJ1hbpSXaWuVteoa9V16np1g7pR3aRuVreoW9Vt6nZ1h7pT3aXuVveoe9V96n71gHpQPaQeVo+oR9Vj6nH1hHpSPaWeVs+oZ9Vz6nn1gnpRvaReVq+oV1VMxVVCJVVKpVVGZVVO5VVBFVVJlVVFVVVN1VVDNVVLtVVHdVVP9dVABSpUQxWpkRqr19Tr6g31pnpLva3eUe+q99T76gP1ofpIfaw+UZ+qz9Tn6gv1pfpKfa2+Ud+q79T36gf1o/pJ/ax+Ub+q39Tv6g/1p/pL/a3+Uf+q/9QELZGWWEuiJdWSacm1FFpKLZWWWkujpdXSaem1DFpGLZOWWcuiZdWyadm1HFpOLZf2n5Zby6Pl1fJp+bUCWkGtkFZYK6IV1YppxbUSWkmtlFZaK6OV1cpp5bUKWkWtklZZq6JV1app1bUaWk2tllZbq6PV1epp9bUGWkOtkdZYa6I11ZppzbUWWkutldZaa6O11dpp7bUOWketk9ZZ66J11bpp3bUeWk+tl9Zb66P11fpp/bUB2kBtkDZYG6IN1YZpw7UR2khtlDZaG6ON1cZp47UJ2kRtkjZZm6JN1aZp07UZ2kxtljZbm6PN1eZp87UF2kJtkbZYW6It1ZZpy7UV2kptlbZaW6Ot1dZp67UN2kZtk7ZZ26Jt1bZp27Ud2k5tl7Zb26Pt1fZp+7UD2kHtkHZYO6Id1Y5px7UT2kntlHZaO6Od1c5p57UL2kXtknZZu6Jd1TAN1wiN1CiN1hiN1TiN1wRN1CRN1hRN1TRN1wzN1CzN1hzN1TzN1wINaFALNaRFWqxd065rN7Sb2i3ttnZHu6vd0+5rD7SH2iPtsfZEe6o9055rL7SX2ivttfZGe6u9095rH7SP2ifts/ZF+6p9075rP7Sf2i/tt/ZH+6v90xL0RHpiPYmeVE+mJ9dT6Cn1VHpqPY2eVk+np9cz6Bn1THpm
PYueVc+mZ9dz6Dn1XPp/em49j55Xz6fn1wvoBfVCemG9iF5UL6YX10voJfVSemm9jF5WL6eX1yvoFfVKemW9il5Vr6ZX12voNfVaem29jl5Xr6fX1xvoDfVGemO9id5Ub6Y311voLfVWemu9jd5Wb6e31zvoHfVOeme9i95V76Z313voPfVeem+9j95X76f31wfoA/VB+mB9iD5UH6YP10foI/VR+mh9jD5WH6eP1yfoE/VJ+mR9ij5Vn6ZP12foM/VZ+mx9jj5Xn6fP1xfoC/VF+mJ9ib5UX6Yv11foK/VV+mp9jb5WX6ev1zfoG/VN+mZ9i75V36Zv13foO/Vd+m59j75X36fv1w/oB/VD+mH9iH5UP6Yf10/oJ/VT+mn9jH5WP6ef1y/oF/VL+mX9in5Vx3RcJ3RSp3RaZ3RW53ReF3RRl3RZV3RV13RdN3RTt3Rbd3RX93RfD3SgQz3UkR7psX5Nv67f0G/qt/Tb+h39rn5Pv68/0B/qj/TH+hP9qf5Mf66/0F/qr/TX+hv9rf5Of69/0D/qn/TP+hf9q/5N/67/0H/qv/Tf+h/9r/5PTzASGYmNJEZSI5mR3EhhpDRSGamNNEZaI52R3shgZDQyGZmNLEZWI5uR3chh5DRyGf8ZuY08Rl4jn5HfKGAUNAoZhY0iRlGjmFHcKGGUNEoZpY0yRlmjnFHeqGBUNCoZlY0qRlWjmlHdqGHUNGoZtY06Rl2jnlHfaGA0NBoZjY0mRlOjmdHcaGG0NFoZrY02RlujndHe6GB0NDoZnY0uRlejm9Hd6GH0NHoZvY0+Rl+jn9HfGGAMNAYZg40hxlBjmDHcGGGMNEYZo40xxlhjnDHemGBMNCYZk40pxlRjmjHdmGHMNGYZs405xlxjnjHfWGAsNBYZi40lxlJjmbHcWGGsNFYZq401xlpjnbHe2GBsNDYZm40txlZjm7Hd2GHsNHYZu409xl5jn7HfOGAcNA4Zh40jxlHjmHHcOGGcNE4Zp40zxlnjnHHeuGBcNC4Zl40rxlUDM3CDMEiDMmiDMViDM3hDMERDMmRDMVRDM3TDMEzDMmzDMVzDM3wjMIABjdBARmTExjXjunHDuGncMm4bd4y7xj3jvvHAeGg8Mh4bT4ynxjPjufHCeGm8Ml4bb4y3xjvjvfHB+Gh8Mj4bX4yvxjfju/HD+Gn8Mn4bf4y/xj8jwUxkJjaTmEnNZGZyM4WZ0kxlpjbTmGnNdGZ6M4OZ0cxkZjazmFnNbGZ2M4eZ08xl/mfmNvOYec18Zn6zgFnQLGQWNouYRc1iZnGzhFnSLGWWNsuYZc1yZnmzglnRrGRWNquYVc1qZnWzhlnTrGXWNuuYdc16Zn2zgdnQbGQ2NpuYTc1mZnOzhdnSbGW2NtuYbc12Znuzg9nR7GR2NruYXc1uZnezh9nT7GX2NvuYfc1+Zn9zgDnQHGQONoeYQ81h5nBzhDnSHGWONseYY81x5nhzgjnRnGRONqeYU81p5nRzhjnTnGXONueYc8155nxzgbnQXGQuNpeYS81l5nJzhbnSXGWuNteYa8115npzg7nR3GRuNreYW81t5nZzh7nT3GXuNveYe8195n7zgHnQPGQeNo+YR81j5nHzhHnSPGWeNs+YZ81z5nnzgnnRvGReNq+YV03MxE3CJE3KpE3GZE3O5E3BFE3JlE3FVE3N1E3DNE3LtE3HdE3P9M3ABCY0QxOZkRmb18zr5g3zpnnLvG3eMe+a98z75gPzofnIfGw+MZ+az8zn5gvzpfnKfG2+Md+a78z35gfzo/nJ/Gx+Mb+a38zv5g/zp/nL/G3+Mf+a/8wEK5GV2EpiJbWSWcmtFFZKK5WV2kpjpbXSWemtDFZGK5OV2cpiZbWyWdmtHFZOK5f1n5XbymPltfJZ+a0CVkGrkFXYKmIVtYpZxa0SVkmrlFXaKmOVtcpZ5a0KVkWrklXZqmJVtapZ1a0aVk2rllXbqmPVtepZ9a0GVkOrkdXYamI1tZpZ
za0WVkurldXaamO1tdpZ7a0OVkerk9XZ6mJ1tbpZ3a0eVk+rl9Xb6mP1tfpZ/a0B1kBrkDXYGmINtYZZw60R1khrlDXaGmONtcZZ460J1kRrkjXZmmJNtaZZ060Z1kxrljXbmmPNteZZ860F1kJrkbXYWmIttZZZy60V1kprlbXaWmOttdZZ660N1kZrk7XZ2mJttbZZ260d1k5rl7Xb2mPttfZZ+60D1kHrkHXYOmIdtY5Zx60T1knrlHXaOmOdtc5Z560L1kXrknXZumJdtTALtwiLtCiLthiLtTiLtwRLtCRLthRLtTRLtwzLtCzLthzLtTzLtwILWNAKLWRFVmxds65bN6yb1i3rtnXHumvds+5bD6yH1iPrsfXEemo9s55bL6yX1ivrtfXGemu9s95bH6yP1ifrs/XF+mp9s75bP6yf1i/rt/XH+mv9sxLsRHZiO4md1E5mJ7dT2CntVHZqO42d1k5np7cz2BntTHZmO4ud1c5mZ7dz2DntXPZ/dm47j53XzmfntwvYBe1CdmG7iF3ULmYXt0vYJe1Sdmm7jF3WLmeXtyvYFe1KdmW7il3VrmZXt2vYNe1adm27jl3XrmfXtxvYDe1GdmO7id3UbmY3t1vYLe1Wdmu7jd3Wbme3tzvYHe1Odme7i93V7mZ3t3vYPe1edm+7j93X7mf3twfYA+1B9mB7iD3UHmYPt0fYI+1R9mh7jD3WHmePtyfYE+1J9mR7ij3VnmZPt2fYM+1Z9mx7jj3XnmfPtxfYC+1F9mJ7ib3UXmYvt1fYK+1V9mp7jb3WXmevtzfYG+1N9mZ7i73V3mZvt3fYO+1d9m57j73X3mfvtw/YB+1D9mH7iH3UPmYft0/YJ+1T9mn7jH3WPmefty/YF+1L9mX7in3VxmzcJmzSpmzaZmzW5mzeFmzRlmzZVmzV1mzdNmzTtmzbdmzX9mzfDmxgQzu0kR3ZsX3Nvm7fsG/at+zb9h37rn3Pvm8/sB/aj+zH9hP7qf3Mfm6/sF/ar+zX9hv7rf3Ofm9/sD/an+zP9hf7q/3N/m7/sH/av+zf9h/7r/3PTnASOYmdJE5SJ5mT3EnhpHRSOamdNE5aJ52T3sngZHQyOZmdLE5WJ5uT3cnh5HRyOf85uZ08Tl4nn5PfKeAUdAo5hZ0iTlGnmFPcKeGUdEo5pZ0yTlmnnFPeqeBUdCo5lZ0qTlWnmlPdqeHUdGo5tZ06Tl2nnlPfaeA0dBo5jZ0mTlOnmdPcaeG0dFo5rZ02TlunndPe6eB0dDo5nZ0uTlenm9Pd6eH0dHo5vZ0+Tl+nn9PfGeAMdAY5g50hzlBnmDPcGeGMdEY5o50xzlhnnDPemeBMdCY5k50pzlRnmjPdmeHMdGY5s505zlxnnjPfWeAsdBY5i50lzlJnmbPcWeGsdFY5q501zlpnnbPe2eBsdDY5m50tzlZnm7Pd2eHsdHY5u509zl5nn7PfOeAcdA45h50jzlHnmHPcOeGcdE45p50zzlnnnHPeueBcdC45l50rzlUHc3CHcEiHcmiHcViHc3hHcERHcmRHcVRHc3THcEzHcmzHcVzHc3wncIADndBBTuTEzjXnunPDuenccm47d5y7zj3nvvPAeeg8ch47T5ynzjPnufPCeem8cl47b5y3zjvnvfPB+eh8cj47X5yvzjfnu/PD+en8cn47f5y/zj8nwU3kJnaTuEndZG5yN4Wb0k3lpnbTuGnddG56N4Ob0c3kZnazuFndbG52N4eb083l/ufmdvO4ed18bn63gFvQLeQWdou4Rd1ibnG3hFvSLeWWdsu4Zd1ybnm3glvRreRWdqu4Vd1qbnW3hlvTreXWduu4dd16bn23gdvQbeQ2dpu4Td1mbnO3hdvSbeW2dtu4bd12bnu3g9vR7eR2dru4Xd1ubne3h9vT7eX2dvu4fd1+bn93gDvQHeQOdoe4Q91h7nB3hDvSHeWOdse4Y91x7nh3gjvRneROdqe4U91p7nR3hjvTneXO
due4c9157nx3gbvQXeQudpe4S91l7nJ3hbvSXeWudte4a9117np3g7vR3eRudre4W91t7nZ3h7vT3eXudve4e9197n73gHvQPeQedo+4R91jbvKEhIST7in3tHvGPeuec8+7F9yL7iX3snvFvepiLu4SLulSLu0yLutyLu8KruhKruwqrupqru4arularu06rut6ru8GLnChG7rIjdzYveZed2+4N91b7m33jnvXvefedx+4D91H7mP3ifvUfeY+d1+4L91X7mv3jfvWfee+dz+4H91P7mf3i/vV/eZ+d3+4P91f7m/3j/vX/ecmeIm8xF4SL6mXzEvupfBSeqm81F4aL62XzkvvZfAyepm8zF4WL6uXzcvu5fByerm8/7zcXh4vr5fPy+8V8Ap6hbzCXhGvqFfMK+6V8Ep6pbzSXhmvrFfOK+9V8Cp6lbzKXhWvqlfNq+7V8Gp6tbzaXh2vrlfPq+818Bp6jbzGXhOvqdfMa+618Fp6rbzWXhuvrdfOa+918Dp6nbzOXhevq9fN6+718Hp6vbzeXh+vr9fP6+8N8AZ6g7zB3hBvqDfMG+6N8EZ6o7zR3hhvrDfOG+9N8CZ6k7zJ3hRvqjfNm+7N8GZ6s7zZ3hxvrjfPm+8t8BZ6i7zF3hJvqbfMW+6t8FZ6q7zV3hpvrbfOW+9t8DZ6m7zN3hZvq7fN2+7t8HZ6u7zd3h5vr7fP2+8d8A56h7zD3hHvqHfMO+6d8E56p7zT3hnvrHfOO+9d8C56l7zL3hXvqod5uEd4pEd5tMd4rMd5vCd4oid5sqd4qqd5umd4pmd5tud4rud5vhd4wINe6CEv8mLvmnfdu+Hd9G55t7073l3vnnffe+A99B55j70n3lPvmffce+G99F55r7033lvvnffe++B99D55n70v3lfvm/fd++H99H55v70/3l/vn5fgJ/IT+0n8pH4yP7mfwk/pp/JT+2n8tH46P72fwc/oZ/Iz+1n8rH42P7ufw8/p5/L/83P7efy8fj4/v1/AL+gX8gv7RfyifjG/uF/CL+mX8kv7Zfyyfjm/vF/Br+hX8iv7VfyqfjW/ul/Dr+nX8mv7dfy6fj2/vt/Ab+g38hv7TfymfjO/ud/Cb+m38lv7bfy2fju/vd/B7+h38jv7Xfyufje/u9/D7+n38nv7ffy+fj+/vz/AH+gP8gf7Q/yh/jB/uD/CH+mP8kf7Y/yx/jh/vD/Bn+hP8if7U/yp/jR/uj/Dn+nP8mf7c/y5/jx/vr/AX+gv8hf7S/yl/jJ/ub/CX+mv8lf7a/y1/jp/vb/B3+hv8jf7W/yt/jZ/u7/D3+nv8nf7e/y9/j5/v3/AP+gf8g/7R/yj/jH/uH/CP+mf8k/7Z/yz/jn/vH/Bv+hf8i/7V/yrPubjPuGTPuXTPuOzPufzvuCLvuTLvuKrvubrvuGbvuXbvuO7vuf7fuADH/qhj/zIj/1r/nX/hn/Tv+Xf9u/4d/17/n3/gf/Qf+Q/9p/4T/1n/nP/hf/Sf+W/9t/4b/13/nv/g//R/+R/9r/4X/1v/nf/h//T/+X/9v/4f/1/fkKQKEgcJAmSBsmC5EGKIGWQKkgdpAnSBumC9EGGIGOQKcgcZAmyBtmC7EGOIGeQK/gvyB3kCfIG+YL8QYGgYFAoKBwUCYoGxYLiQYmgZFAqKB2UCcoG5YLyQYWgYlApqBxUCaoG1YLqQY2gZlArqB3UCeoG9YL6QYOgYdAoaBw0CZoGzYLmQYugZdAqaB20CdoG7YL2QYegY9Ap6Bx0CboG3YLuQY+gZ9Ar6B30CfoG/YL+wYBgYDAoGBwMCYYGw4LhwYhgZDAqGB2MCcYG44LxwYRgYjApmBxMCaYG04LpwYxgZjArmB3MCeYG84L5wYJgYbAoWBwsCZYGy4LlwYpgZbAqWB2sCdYG64L1wYZgY7Ap2BxsCbYG24LtwY5gZ7Ar2B3sCfYG+4L9wYHgYHAoOBwcCY4Gx4LjwYngZHAqOB2cCc4G54Lz
wYXgYnApuBxcCa4GWIAHREAGVEAHTMAGXMAHQiAGUiAHSqAGWqAHRmAGVmAHTuAGXuAHQQACGIQBCqIgDq4F14Mbwc3gVnA7uBPcDe4F94MHwcPgUfA4eBI8DZ4Fz4MXwcvgVfA6eBO8Dd4F74MPwcfgU/A5+BJ8Db4F34Mfwc/gV/A7+BP8Df4FCSARSAySgKQgGUgOUoCUIBVIDdKAtCAdSA8ygIwgE8gMsoCsIBvIDnKAnCAX+A/kBnlAXpAP5AcFQEFQCBQGRUBRUAwUByVASVAKlAZlQFlQDpQHFUBFUAlUBlVAVVANVAc1QE1QC9QGdUBdUA/UBw1AQ9AINAZNQFPQDDQHLUBL0Aq0Bm1AW9AOtAcdQEfQCXQGXUBX0A10Bz1AT9AL9AZ9QF/QD/QHA8BAMAgMBkPAUDAMDAcjwEgwCowGY8BYMA6MBxPARDAJTAZTwFQwDUwHM8BMMAvMBnPAXDAPzAcLwEKwCCwGS8BSsAwsByvASrAKrAZrwFqwDqwHG8BGsAlsBlvAVrANbAc7wE6wC+wGe8BesA/sBwfAQXAIHAZHwFFwDBwHJ8BJcAqcBmfAWXAOnAcXwEVwCVwGV8BVgAEcEIAEFKABA1jAAR4IQAQSkIECVKABHRjABBawgQNc4AEfBAAACEKAQARicA1cBzfATXAL3AZ3wF1wD9wHD8BD8Ag8Bk/AU/AMPAcvwEvwCrwGb8Bb8A68Bx/AR/AJfAZfwFfwDXwHP8BP8Av8Bn/AX/APJMBEMDFMApPCZDA5TAFTwlQwNUwD08J0MD3MADPCTDAzzAKzwmwwO8wBc8Jc8D+YG+aBeWE+mB8WgAVhIVgYFoFFYTFYHJaAJWEpWBqWgWVhOVgeVoAVYSVYGVaBVWE1WB3WgDVhLVgb1oF1YT1YHzaADWEj2Bg2gU1hM9gctoAtYSvYGraBbWE72B52gB1hJ9gZdoFdYTfYHfaAPWEv2Bv2gX1hP9gfDoAD4SA4GA6BQ+EwOByOgCPhKDgajoFj4Tg4Hk6AE+EkOBlOgVPhNDgdzoAz4Sw4G86Bc+E8OB8ugAvhIrgYLoFL4TK4HK6AK+EquBqugWvhOrgeboAb4Sa4GW6BW+E2uB3ugDvhLrgb7oF74T64Hx6AB+EheBgegUfhMXgcnoAn4Sl4Gp6BZ+E5eB5egBfhJXgZXoFXIQZxSEASUpCGDGQhB3koQBFKUIYKVKEGdWhAE1rQhg50oQd9GEAAIQwhghGM4TV4Hd6AN+EteBvegXfhPXgfPoAP4SP4GD6BT+Ez+By+gC/hK/gavoFv4Tv4Hn6AH+En+Bl+gV/hN/gd/oA/4S/4G/6Bf+E/mBAmChOHScKkYbIweZgiTBmmClOHacK0YbowfZghzBhmCjOHWcKsYbYwe5gjzBnmCv8Lc4d5wrxhvjB/WCAsGBYKC4dFwqJhsbB4WCIsGZYKS4dlwrJhubB8WCGsGFYKK4dVwqphtbB6WCOsGdYKa4d1wrphvbB+2CBsGDYKG4dNwqZhs7B52CJsGbYKW4dtwrZhu7B92CHsGHYKO4ddwq5ht7B72CPsGfYKe4d9wr5hv7B/OCAcGA4KB4dDwqHhsHB4OCIcGY4KR4djwrHhuHB8OCGcGE4KJ4dTwqnhtHB6OCOcGc4KZ4dzwrnhvHB+uCBcGC4KF4dLwqXhsnB5uCJcGa4KVyf8bxKHG8KN4aZwc7gl3BpuC7eHO8Kd4a5wd7gn3BvuC/eHB8KD4aHwcHgkPBoeC4+HJ8KT4anwdHgmPBueC8+HF8KL4aXwcnglvBpiIR4SIRlSIR0yIRtyIR8KoRhKoRwqoRpqoR4aoRlaoR06oRt6oR8GIQhhGIYojMI4vBZeD2+EN8Nb4e3wTng3vBfeDx+ED8NH4ePwSfg0fBY+D1+EL8NX4evwTfg2fBe+Dz+EH8NP4efwS/g1/BZ+D3+EP8Nf4e/wT/g3/BcmoEQoMUqCkqJkKDlKgVKiVCg1SoPSonQoPcqA
MqJMKDPKgrKibCg7yoFyolzoP5Qb5UF5UT6UHxVABVEhVBgVQUVRMVQclUAlUSlUGpVBZVE5VB5VQBVRJVQZVUFVUTVUHdVANVEtVBvVQXVRPVQfNUANUSPUGDVBTVEz1By1QC1RK9QatUFtUTvUHnVAHVEn1Bl1QV1RN9Qd9UA9US/UG/VBfVE/1B8NQAPRIDQYDUFD0TA0HI1AI9EoNBqNQWPRODQeTUAT0SQ0GU1BU9E0NB3NQDPRLDQbzUFz0Tw0Hy1AC9EitBgtQUvRMrQcrUAr0Sq0Gq1Ba9E6tB5tQBvRJrQZbUFb0Ta0He1AO9EutBvtQXvRPrQfHUAH0SF0GB1BR9ExdBydQCfRKXQanUFn0Tl0Hl1AF9EldBldQVcRhnBEIBJRiEYMYhGHeCQgEUlIRgpSkYZ0ZCATWchGDnKRh3wUIIAgChFCEYrRNXQd3UA30S10G91Bd9E9dB89QA/RI/QYPUFP0TP0HL1AL9Er9Bq9QW/RO/QefUAf0Sf0GX1BX9E39B39QD/RL/Qb/UF/0T+UECWKEkdJoqRRsih5lCJKGaWKUkdporRRuih9lCHKGGWKMkdZoqxRtih7lCPKGeWK/otyR3mivFG+KH9UICoYFYoKR0WiolGxqHhUIioZlYpKR2WislG5qHxUIaoYVYoqR1WiqlG1qHpUI6oZ1YpqR3WiulG9qH7UIGoYNYoaR02iplGzqHnUImoZtYpaR22itlG7qH3UIeoYdYo6R12irlG3qHvUI+oZ9Yp6R32ivlG/qH80IBoYDYoGR0OiodGwaHg0IhoZjYpGR2OisdG4aHw0IZoYTYomR1OiqdG0aHo0I5oZzYpmR3OiudG8aH60IFoYLYoWR0uipdGyaHm0IloZrYpWR2uitdG6aH20IdoYbYo2R1uirdG2aHu0I9oZ7Yp2p9oT7Y32RfujA9HB6FB0ODoSHY2ORcejE9HJ6FR0OjoTnY3OReejC9HF6FJ0OboSXY2wCI+IiIyoiI6YiI24iI+ESIykSI6USI20SI+MyIysyI6cyI28yI+CCEQwCiMURVEcXYuuRzeim9Gt6HZ0J7ob3YvuRw+ih9Gj6HH0JHoaPYueRy+il9Gr6HX0JnobvYveRx+ij9Gn6HP0JfoafYu+Rz+in9Gv6Hf0J/ob/YsS4kRx4jhJnDROFiePU8Qp41Rx6jhNnDZOF6ePM8QZ40xx5jhLnDXOFmePc8Q541zxf3HuOE+cN84X548LxAXjQnHhuEhcNC4WF49LxCXjUnHpuExcNi4Xl48rxBXjSnHluEpcNa4WV49rxDXjWnHtuE5cN64X148bxA3jRnHjuEncNG4WN49bxC3jVnHruE3cNm4Xt487xB3jTnHnuEvcNe4Wd497xD3jXnHvuE/cN+4X948HxAPjQfHgeEg8NB4WD/8/ku1pYYwdAQDwX9u2bRuntm3btm3btm3bGNtWkqnbvdgH+bAR2EhsFDYaG4ONxcZh47EJ2ERsEjYZm4JNxaZh07EZ2ExsFjYbm4PNxeZh87EF2EJsEbYYW4ItxZZhy7EV2EpsFbYaW4OtxdZh67EN2EZsE7YZ24JtxbZh27Ed2E5sF7Yb24PtxfZh+7ED2EHsEHYYO4IdxY5hx7ET2EnsFHYaO4Odxc5h57EL2EXsEnYZu4Jdxa5h17Eb2E3sFnYbu4Pdxe5h97EH2EPsEfYYe4I9xZ5hz7EX2EvsFfYae4O9xd5h77EP2EfsE/YZ+4J9xTAMxwiMxCiMxhiMxTiMxwRMxCRMxhRMxTRMxwzMxCzMxhzMxTzMxwIsxCIMYBBDWIx9w75jP7Cf2C/sN/YH+4v9wxLwRHhiPAmeFE+GJ8dT4CnxVHhqPA2eFk+Hp8cz4BnxTHhmPAueFc+GZ8dz4DnxXHhuPA+eF8+H58cL4AXxQnhhvAheFC+GF8dL4CXxUnhpvAxeFi+Hl8cr4BXxSnhlvApeFa+GV8dr4DXxWnhtvA5eF6+H18cb4A3x
//BGeGO8Cd4Ub4Y3x1vgLfFWeGu8Dd4Wb4e3xzvgHfFOeGe8C94V74Z3x3vgPfFeeG+8D94X74f3xwfgA/FB+GB8CD4UH4YPx0fgI/FR+Gh8DD4WH4ePxyfgE/FJ+GR8Cj4Vn4ZPx2fgM/FZ+Gx8Dj4Xn4fPxxfgC/FF+GJ8Cb4UX4Yvx1fgK/FV+Gp8Db4WX4evxzfgG/FN+GZ8C74V34Zvx3fgO/Fd+G58D74X34fvxw/gB/FD+GH8CH4UP4Yfx0/gJ/FT+Gn8DH4WP4efxy/gF/FL+GX8Cn4Vv4Zfx2/gN/Fb+G38Dn4Xv4ffxx/gD/FH+GP8Cf4Uf4Y/x1/gL/FX+Gv8Df4Wf4e/xz/gH/FP+Gf8C/4Vx3AcJ3ASp3AaZ3AW53AeF3ARl3AZV3AV13AdN3ATt3Abd3AX93AfD/AQj3CAQxzhMf4N/47/wH/iv/Df+B/8L/4PTyASEYmJJERSIhmRnEhBpCRSEamJNERaIh2RnshAZCQyEZmJLERWIhuRnchB5CRyEbmJPEReIh+RnyhAFCQKEYWJIkRRohhRnChBlCRKEaWJMkRZohxRnqhAVCQqEZWJKkRVohpRnahB1CRqEbWJOkRdoh5Rn2hANCT+IxoRjYkmRFOiGdGcaEG0JFoRrYk2RFuiHdGe6EB0JDoRnYkuRFeiG9Gd6EH0JHoRvYk+RF+iH9GfGEAMJAYRg4khxFBiGDGcGEGMJEYRo4kxxFhiHDGemEBMJCYRk4kpxFRiGjGdmEHMJGYRs4k5xFxiHjGfWEAsJBYRi4klxFJiGbGcWEGsJFYRq4k1xFpiHbGe2EBsJDYRm4ktxFZiG7Gd2EHsJHYRu4k9xF5iH7GfOEAcJA4Rh4kjxFHiGHGcOEGcJE4Rp4kzxFniHHGeuEBcJC4Rl4krxFXiGnGduEHcJG4Rt4k7xF3iHnGfeEA8JB4Rj4knxFPiGfGceEG8JF4Rr4k3xFviHfGe+EAkEJ+Iz8QX4iuBEThBECRBETTBECzBETwhECIhETKhECqhETphECZhETbhEC7hET4RECEREYCABCJi4hvxnfhB/CR+Eb+JP8Rf4h+RQCYiE5NJyKRkMjI5mYJMSaYiU5NpyLRkOjI9mYHMSGYiM5NZyKxkNjI7mYPMSeYic5N5yLxkPjI/WYAsSBYiC5NFyKJkMbI4WYIsSZYiS5NlyLJkObI8WYGsSFYiK5NVyKpkNbI6WYOsSdYia5N1yLpkPbI+2YBsSP5HNiIbk03IpmQzsjnZgmxJtiJbk23ItmQ7sj3ZgexIdiI7k13IrmQ3sjvZg+xJ9iJ7k33IvmQ/sj85gBxIDiIHk0PIoeQwcjg5ghxJjiJHk2PIseQ4cjw5gZxITiInk1PIqeQ0cjo5g5xJziJnk3PIueQ8cj65gFxILiIXk0vIpeQycjm5glxJriJXk2vIteQ6cj25gdxIbiI3k1vIreQ2cju5g9xJ7iJ3k3vIveQ+cj95gDxIHiIPk0fIo+Qx8jh5gjxJniJPk2fIs+Q58jx5gbxIXiIvk1fIq+Q18jp5g7xJ3iJvk3fIu+Q98j75gHxIPiIfk0/Ip+Qz8jn5gnxJviJfk2/It+Q78j35gfxIfiI/k1/IryRG4iRBkiRF0iRDsiRH8qRAiqREyqRCqqRG6qRBmqRF2qRDuqRH+mRAhmREAhKSiIzJb+R38gf5k/xF/ib/kH/Jf2QClYhKTCWhklLJqORUCiollYpKTaWh0lLpqPRUBiojlYnKTGWhslLZqOxUDionlYvKTeWh8lL5qPxUAaogVYgqTBWhilLFqOJUCaokVYoqTZWhylLlqPJUBaoiVYmqTFWhqlLVqOpUDaomVYuqTdWh6lL1qPpUA6oh9R/ViGpMNaGaUs2o5lQLqiXVimpNtaHaUu2o9lQHqiPViepMdaG6Ut2o7lQPqifVi+pN9aH6Uv2o/tQAaiA1iBpMDaGGUsOo4dQIaiQ1ihpNjaHGUuOo8dQEaiI1iZpMTaGm
UtOo6dQMaiY1i5pNzaHmUvOo+dQCaiG1iFpMLaGWUsuo5dQKaiW1ilpNraHWUuuo9dQGaiO1idpMbaG2Utuo7dQOaie1i9pN7aH2Uvuo/dQB6iB1iDpMHaGOUseo49QJ6iR1ijpNnaHOUueo89QF6iJ1ibpMXaGuUteo69QN6iZ1i7pN3aHuUveo+9QD6iH1iHpMPaGeUs+o59QL6iX1inpNvaHeUu+o99QH6iP1ifpMfaG+UhiFUwRFUhRFUwzFUhzFUwIlUhIlUwqlUhqlUwZlUhZlUw7lUh7lUwEVUhEFKEghKqa+Ud+pH9RP6hf1m/pD/aX+UQl0IjoxnYROSiejk9Mp6JR0Kjo1nYZOS6ej09MZ6Ix0JjoznYXOSmejs9M56Jx0Ljo3nYfOS+ej89MF6IJ0IbowXYQuSheji9Ml6JJ0Kbo0XYYuS5ejy9MV6Ip0JboyXYWuSlejq9M16Jp0Lbo2XYeuS9ej69MN6Ib0f3QjujHdhG5KN6Ob0y3olnQrujXdhm5Lt6Pb0x3ojnQnujPdhe5Kd6O70z3onnQvujfdh+5L96P70wPogfQgejA9hB5KD6OH0yPokfQoejQ9hh5Lj6PH0xPoifQkejI9hZ5KT6On0zPomfQsejY9h55Lz6Pn0wvohfQiejG9hF5KL6OX0yvolfQqejW9hl5Lr6PX0xvojfQmejO9hd5Kb6O30zvonfQueje9h95L76P30wfog/Qh+jB9hD5KH6OP0yfok/Qp+jR9hj5Ln6PP0xfoi/Ql+jJ9hb5KX6Ov0zfom/Qt+jZ9h75L36Pv0w/oh/Qj+jH9hH5KP6Of0y/ol/Qr+jX9hn5Lv6Pf0x/oj/Qn+jP9hf5KYzROEzRJUzRNMzRLczRPC7RIS7RMK7RKa7ROG7RJW7RNO7RLe7RPB3RIRzSgIY3omP5Gf6d/0D/pX/Rv+g/9l/5HJzCJmMRMEiYpk4xJzqRgUjKpmNRMGiYtk45Jz2RgMjKZmMxMFiYrk43JzuRgcjK5mNxMHiYvk4/JzxRgCjKFmMJMEaYoU4wpzpRgSjKlmNJMGaYsU44pz1RgKjKVmMpMFaYqU42pztRgajK1mNpMHaYuU4+pzzRgGjL/MY2YxkwTpinTjGnOtGBaMq2Y1kwbpi3TjmnPdGA6Mp2YzkwXpivTjenO9GB6Mr2Y3kwfpi/Tj+nPDGAGMoOYwcwQZigzjBnOjGBGMqOY0cwYZiwzjhnPTGAmMpOYycwUZiozjZnOzGBmMrOY2cwcZi4zj5nPLGAWMouYxcwSZimzjFnOrGBWMquY1cwaZi2zjlnPbGA2MpuYzcwWZiuzjdnO7GB2MruY3cweZi+zj9nPHGAOMoeYw8wR5ihzjDnOnGBOMqeY08wZ5ixzjjnPXGAuMpeYy8wV5ipzjbnO3GBuMreY28wd5i5zj7nPPGAeMo+Yx8wT5inzjHnOvGBeMq+Y18wb5i3zjnnPfGA+Mp+Yz8wX5iuDMThDMCRDMTTDMCzDMTwjMCIjMTKjMCqjMTpjMCZjMTbjMC7jMT4TMCETMYCBDGJi5hvznfnB/GR+Mb+ZP8xf5h+TwCZiE7NJ2KRsMjY5m4JNyaZiU7Np2LRsOjY9m4HNyGZiM7NZ2KxsNjY7m4PNyeZic7N52LxsPjY/W4AtyBZiC7NF2KJsMbY4W4ItyZZiS7Nl2LJsObY8W4GtyFZiK7NV2KpsNbY6W4OtydZia7N12LpsPbY+24BtyP7HNmIbs03Ypmwztjnbgm3JtmJbs23Ytmw7tj3bge3IdmI7s13Yrmw3tjvbg+3J9mJ7s33Yvmw/tj87gB3IDmIHs0PYoewwdjg7gh3JjmJHs2PYsew4djw7gZ3ITmIns1PYqew0djo7g53JzmJns3PYuew8dj67gF3ILmIXs0vYpewydjm7gl3JrmJXs2vYtew6dj27gd3IbmI3s1vYrew2dju7g93J7mJ3s3vYvew+dj97gD3IHmIPs0fYo+wx9jh7gj3J
nmJPs2fYs+w59jx7gb3IXmIvs1fYq+w19jp7g73J3mJvs3fYu+w99j77gH3IPmIfs0/Yp+wz9jn7gn3JvmJfs2/Yt+w79j37gf3IfmI/s1/YryzG4izBkizF0izDsizH8qzAiqzEyqzCqqzG6qzBmqzF2qzDuqzH+mzAhmzEAhayiI3Zb+x39gf7k/3F/mb/sH/Zf2wCl4hLzCXhknLJuORcCi4ll4pLzaXh0nLpuPRcBi4jl4nLzGXhsnLZuOxcDi4nl4vLzeXh8nL5uPxcAa4gV4grzBXhinLFuOJcCa4kV4orzZXhynLluPJcBa4iV4mrzFXhqnLVuOpcDa4mV4urzdXh6nL1uPpcA64h9x/XiGvMNeGacs245lwLriXXimvNteHacu249lwHriPXievMdeG6ct247lwPrifXi+vN9eH6cv24/twAbiA3iBvMDeGGcsO44dwIbiQ3ihvNjeHGcuO48dwEbiI3iZvMTeGmctO46dwMbiY3i5vNzeHmcvO4+dwCbiG3iFvMLeGWcsu45dwKbiW3ilvNreHWcuu49dwGbiO3idvMbeG2ctu47dwObie3i9vN7eH2cvu4/dwB7iB3iDvMHeGOcse449wJ7iR3ijvNneHOcue489wF7iJ3ibvMXeGucte469wN7iZ3i7vN3eHucve4+9wD7iH3iHvMPeGecs+459wL7iX3invNveHecu+499wH7iP3ifvMfeG+chiHcwRHchRHcwzHchzHcwInchIncwqnchqncwZnchZncw7nch7ncwEXchEHOMghLua+cd+5H9xP7hf3m/vD/eX+cQl8Ij4xn4RPyifjk/Mp+JR8Kj41n4ZPy6fj0/MZ+Ix8Jj4zn4XPymfjs/M5+Jx8Lj43n4fPy+fj8/MF+IJ8Ib4wX4Qvyhfji/Ml+JJ8Kb40X4Yvy5fjy/MV+Ip8Jb4yX4Wvylfjq/M1+Jp8Lb42X4evy9fj6/MN+Ib8f3wjvjHfhG/KN+Ob8y34lnwrvjXfhm/Lt+Pb8x34jnwnvjPfhe/Kd+O78z34nnwvvjffh+/L9+P78wP4gfwgfjA/hB/KD+OH8yP4kfwofjQ/hh/Lj+PH8xP4ifwkfjI/hZ/KT+On8zP4mfwsfjY/h5/Lz+Pn8wv4hfwifjG/hF/KL+OX8yv4lfwqfjW/hl/Lr+PX8xv4jfwmfjO/hd/Kb+O38zv4nfwufje/h9/L7+P38wf4g/wh/jB/hD/KH+OP8yf4k/wp/jR/hj/Ln+PP8xf4i/wl/jJ/hb/KX+Ov8zf4m/wt/jZ/h7/L3+Pv8w/4h/wj/jH/hH/KP+Of8y/4l/wr/jX/hn/Lv+Pf8x/4j/wn/jP/hf/KYzzOEzzJUzzNMzzLczzPC7zIS7zMK7zKa7zOG7zJW7zNO7zLe7zPB3zIRzzgIY/4mP/Gf+d/8D/5X/xv/g//l//HJwiJhMRCEiGpkExILqQQUgqphNRCGiGtkE5IL2QQMgqZhMxCFiGrkE3ILuQQcgq5hNxCHiGvkE/ILxQQCgqFhMJCEaGoUEwoLpQQSgqlhNJCGaGsUE4oL1QQKgqVhMpCFaGqUE2oLtQQagq1hNpCHaGuUE+oLzQQGgr/CY2ExkIToanQTGgutBBaCq2E1kIboa3QTmgvdBA6Cp2EzkIXoavQTegu9BB6Cr2E3kIfoa/QT+gvDBAGCoOEwcIQYagwTBgujBBGCqOE0cIYYawwThgvTBAmCpOEycIUYaowTZguzBBmCrOE2cIcYa4wT5gvLBAWCouExcISYamwTFgurBBWCquE1cIaYa2wTlgvbBA2CpuEzcIWYauwTdgu7BB2CruE3cIeYa+wT9gvHBAOCoeEw8IR4ahwTDgunBBOCqeE08IZ4axwTjgvXBAuCpeEy8IV4apwTbgu3BBuCreE28Id4a5wT7gvPBAeCo+Ex8IT4anwTHguvBBeCq+E18Ib4a3wTngvfBA+Cp+Ez8IX4auACbhACKRACbTACKzACbwg
CKIgCbKgCKqgCbpgCKZgCbbgCK7gCb4QCKEQCUCAAhJi4ZvwXfgh/BR+Cb+FP8Jf4Z+QICYSE4tJxKRiMjG5mEJMKaYSU4tpxLRiOjG9mEHMKGYSM4tZxKxiNjG7mEPMKeYSc4t5xLxiPjG/WEAsKBYSC4tFxKJiMbG4WEIsKZYSS4tlxLJiObG8WEGsKFYSK4tVxKpiNbG6WEOsKdYSa4t1xLpiPbG+2EBsKP4nNhIbi03EpmIzsbnYQmwpthJbi23EtmI7sb3YQewodhI7i13ErmI3sbvYQ+wp9hJ7i33EvmI/sb84QBwoDhIHi0PEoeIwcbg4QhwpjhJHi2PEseI4cbw4QZwoThIni1PEqeI0cbo4Q5wpzhJni3PEueI8cb64QFwoLhIXi0vEpeIycbm4QlwprhJXi2vEteI6cb24QdwobhI3i1vEreI2cbu4Q9wp7hJ3i3vEveI+cb94QDwoHhIPi0fEo+Ix8bh4QjwpnhJPi2fEs+I58bx4QbwoXhIvi1fEq+I18bp4Q7wp3hJvi3fEu+I98b74QHwoPhIfi0/Ep+Iz8bn4QnwpvhJfi2/Et+I78b34QfwofhI/i1/EryIm4iIhkiIl0iIjsiIn8qIgiqIkyqIiqqIm6qIhmqIl2qIjuqIn+mIghmIkAhGKSIzFb+J38Yf4U/wl/hb/iH/Ff2KClEhKLCWRkkrJpORSCimllEpKLaWR0krppPRSBimjlEnKLGWRskrZpOxSDimnlEvKLeWR8kr5pPxSAamgVEgqLBWRikrFpOJSCamkVEoqLZWRykrlpPJSBamiVEmqLFWRqkrVpOpSDammVEuqLdWR6kr1pPpSA6mh9J/USGosNZGaSs2k5lILqaXUSmottZHaSu2k9lIHqaPUSeosdZG6St2k7lIPqafUS+ot9ZH6Sv2k/tIAaaA0SBosDZGGSsOk4dIIaaQ0ShotjZHGSuOk8dIEaaI0SZosTZGmStOk6dIMaaY0S5otzZHmSvOk+dICaaG0SFosLZGWSsuk5dIKaaW0SlotrZHWSuuk9dIGaaO0SdosbZG2Stuk7dIOaae0S9ot7ZH2Svuk/dIB6aB0SDosHZGOSsek49IJ6aR0SjotnZHOSuek89IF6aJ0SbosXZGuStek69IN6aZ0S7ot3ZHuSvek+9ID6aH0SHosPZGeSs+k59IL6aX0SnotvZHeSu+k99IH6aP0SfosfZG+SpiES4RESpRES4zESpzES4IkSpIkS4qkSpqkS4ZkSpZkS47kSp7kS4EUSpEEJCghKZa+Sd+lH9JP6Zf0W/oj/ZX+SQlyIjmxnEROKieTk8sp5JRyKjm1nEZOK6eT08sZ5IxyJjmznEXOKmeTs8s55JxyLjm3nEfOK+eT88sF5IJyIbmwXEQuKheTi8sl5JJyKbm0XEYuK5eTy8sV5IpyJbmyXEWuKleTq8s15JpyLbm2XEeuK9eT68sN5Ibyf3IjubHcRG4qN5Obyy3klnIrubXcRm4rt5Pbyx3kjnInubPcRe4qd5O7yz3knnIvubfcR+4r95P7ywPkgfIgebA8RB4qD5OHyyPkkfIoebQ8Rh4rj5PHyxPkifIkebI8RZ4qT5OnyzPkmfIsebY8R54rz5PnywvkhfIiebG8RF4qL5OXyyvklfIqebW8Rl4rr5PXyxvkjfImebO8Rd4qb5O3yzvknfIuebe8R94r75P3ywfkg/Ih+bB8RD4qH5OPyyfkk/Ip+bR8Rj4rn5PPyxfki/Il+bJ8Rb4qX5Ovyzfkm/It+bZ8R74r35Pvyw/kh/Ij+bH8RH4qP5Ofyy/kl/Ir+bX8Rn4rv5Pfyx/kj/In+bP8Rf4qYzIuEzIpUzItMzIrczIvC7IoS7IsK7Iqa7IuG7IpW7ItO7Ire7IvB3IoRzKQoYzkWP4mf5d/yD/lX/Jv+Y/8V/4nJyiJlMRKEiWpkkxJrqRQUiqplNRKGiWtkk5Jr2RQMiqZlMxKFiWrkk3JruRQciq5
lNxKHiWvkk/JrxRQCiqFlMJKEaWoUkwprpRQSiqllNJKGaWsUk4pr1RQKiqVlMpKFaWqUk2prtRQaiq1lNpKHaWuUk+przRQGir/KY2UxkpKpanSTGmutFBaKq2U1kobpa3STmmvdFA6Kp2UzkoXpavSTemu9FB6Kr2U3kofpa/ST+mvDFAGKoOUwcoQZagyTBmujFBGKqOU0coYZawyThmvTFAmKpOUycoUZaoyTZmuzFBmKrOU2cocZa4yT5mvLFAWKouUxcoSZamyTFmurFBWKquU1coaZa2yTlmvbFA2KpuUzcoWZauyTdmu7FB2KruU3coeZa+yT9mvHFAOKoeUw8oR5ahyTDmunFBOKqeU08oZ5axyTjmvXFAuKpeUy8oV5apyTbmu3FBuKreU28od5a5yT7mvPFAeKo+Ux8oT5anyTHmuvFBeKq+U18ob5a3yTnmvfFA+Kp+Uz8oX5auCKbhCKKRCKbTCKKzCKbwiKKIiKbKiKKqiKbpiKKZiKbbiKK7iKb4SKKESKUCBClJi5ZvyXfmh/FR+Kb+VP8pf5Z+SoCZSE6tJ1KRqMjW5mkJNqaZSU6tp1LRqOjW9mkHNqGZSM6tZ1KxqNjW7mkPNqeZSc6t51LxqPjW/WkAtqBZSC6tF1KJqMbW4WkItqZZSS6tl1LJqObW8WkGtqFZSK6tV1KpqNbW6WkOtqdZSa6t11LpqPbW+2kBtqP6nNlIbq03UpmoztbnaQm2ptlJbq23Utmo7tb3aQe2odlI7q13Urmo3tbvaQ+2p9lJ7q33Uvmo/tb86QB2oDlIHq0PUoeowdbg6Qh2pjlJHq2PUseo4dbw6QZ2oTlInq1PUqeo0dbo6Q52pzlJnq3PUueo8db66QF2oLlIXq0vUpeoydbm6Ql2prlJXq2vUteo6db26Qd2oblI3q1vUreo2dbu6Q92p7lJ3q3vUveo+db96QD2oHlIPq0fUo+ox9bh6Qj2pnlJPq2fUs+o59bx6Qb2oXlIvq1fUq+o19bp6Q72p3lJvq3fUu+o99b76QH2oPlIfq0/Up+oz9bn6Qn2pvlJfq2/Ut+o79b36Qf2oflI/q1/Uryqm4iqhkiql0iqjsiqn8qqgiqqkyqqiqqqm6qqhmqql2qqjuqqn+mqghmqkAhWqSI3Vb+p39Yf6U/2l/lb/qH/Vf2qClkhLrCXRkmrJtORaCi2llkpLraXR0mrptPRaBi2jlknLrGXRsmrZtOxaDi2nlkvLreXR8mr5tPxaAa2gVkgrrBXRimrFtOJaCa2kVkorrZXRymrltPJaBa2iVkmrrFXRqmrVtOpaDa2mVkurrdXR6mr1tPpaA62h9p/WSGusNdGaas205loLraXWSmuttdHaau209loHraPWSeusddG6at207loPrafWS+ut9dH6av20/toAbaA2SBusDdGGasO04doIbaQ2ShutjdHGauO08doEbaI2SZusTdGmatO06doMbaY2S5utzdHmavO0+doCbaG2SFusLdGWasu05doKbaW2SlutrdHWauu09doGbaO2SdusbdG2atu07doObae2S9ut7dH2avu0/doB7aB2SDusHdGOase049oJ7aR2SjutndHOaue089oF7aJ2SbusXdGuate069oN7aZ2S7ut3dHuave0+9oD7aH2SHusPdGeas+059oL7aX2SnutvdHeau+099oH7aP2SfusfdG+apiGa4RGapRGa4zGapzGa4ImapIma4qmapqma4ZmapZma47map7ma4EWapEGNKghLda+ad+1H9pP7Zf2W/uj/dX+aQl6Ij2xnkRPqifTk+sp9JR6Kj21nkZPq6fT0+sZ9Ix6Jj2znkXPqmfTs+s59Jx6Lj23nkfPq+fT8+sF9IJ6Ib2wXkQvqhfTi+sl9JJ6Kb20XkYvq5fTy+sV9Ip6Jb2yXkWvqlfTq+s19Jp6Lb22Xkevq9fT6+sN9Ib6f3ojvbHeRG+qN9Ob6y30lnorvbXeRm+r
t9Pb6x30jnonvbPeRe+qd9O76z30nnovvbfeR09I2k/vrw/QB+qD9MH6EH2oPkwfro/QR+qj9NH6GH2sPk4fr0/QJ+qT9Mn6FH2qPk2frs/QZ+qz9Nn6HH2uPk+fry/QF+qL9MX6En2pvkxfrq/QV+qr9NX6Gn2tvk5fr2/QN+qb9M36Fn2rvk3fru/Qd+q79N36Hn2vvk/frx/QD+qH9MP6Ef2ofkw/rp/QT+qn9NP6Gf2sfk4/r1/QL+qX9Mv6Ff2qfk2/rt/Qb+q39Nv6Hf2ufk+/rz/QH+qP9Mf6E/2p/kx/rr/QX+qv9Nf6G/2t/k5/r3/QP+qf9M/6F/2rjum4TuikTum0zuiszum8LuiiLumyruiqrum6buimbum27uiu7um+HuihHulAhzrSY/2b/l3/of/Uf+m/9T/6X/2fnmAkMhIbSYykRjIjuZHCSGmkMlIbaYy0RjojvZHByGhkMjIbWYysRjYju5HDyGnkMnIbeYy8Rj4jv1HAKGgUMgobRYyiRjGjuFHCKGmUMkobZYyyRjmjvFHBqGhUMiobVYyqRjWjulHDqGnUMmobdYy6Rj2jvtHAaGj8ZzQyGhtNjKZGM6O50cJoabQyWhttjLZGO6O90cHoaHQyOhtdjK5GN6O70cPoafQyeht9jL5GP6O/McAYaAwyBhtDjKHGMGO4McIYaYwyRhtjjLHGOGO8McGYaEwyJhtTjKnGNGO6McOYacwyZhtzjLnGPGO+scBYaCwyFhtLjKXGMmO5scJYaawyVhtrjLXGOmO9scHYaGwyNhtbjK3GNmO7scPYaewydht7jL3GPmO/ccA4aBwyDhtHjKPGMeO4ccI4aZwyThtnjLPGOeO8ccG4aFwyLhtXjKvGNeO6ccO4adwybht3jLvGPeO+8cB4aDwyHhtPjKfGM+O58cJ4abwyXhtvjLfGO+O98cH4aHwyPhtfjK8GZuAGYZAGZdAGY7AGZ/CGYIiGZMiGYqiGZuiGYZiGZdiGY7iGZ/hGYIRGZAADGsiIjW/Gd+OH8dP4Zfw2/hh/jX9GgpnITGwmMZOayczkZgozpZnKTG2mMdOa6cz0ZgYzo5nJzGxmMbOa2czsZg4zp5nLzG3mMfOa+cz8ZgGzoFnILGwWMYuaxcziZgmzpFnKLG2WMcua5czyZgWzolnJrGxWMaua1czqZg2zplnLrG3WMeua9cz6ZgOzofmf2chsbDYxm5rNzOZmC7Ol2cpsbbYx25rtzPZmB7Oj2cnsbHYxu5rdzO5mD7On2cvsbfYx+5r9zP7mAHOgOcgcbA4xh5rDzOHmCHOkOcocbY4xx5rjzPHmBHOiOcmcbE4xp5rTzOnmDHOmOcucbc4x55rzzPnmAnOhuchcbC4xl5rLzOXmCnOlucpcba4x15rrzPXmBnOjucncbG4xt5rbzO3mDnOnucvcbe4x95r7zP3mAfOgecg8bB4xj5rHzOPmCfOkeco8bZ4xz5rnzPPmBfOiecm8bF4xr5rXzOvmDfOmecu8bd4x75r3zPvmA/Oh+ch8bD4xn5rPzOfmC/Ol+cp8bb4x35rvzPfmB/Oj+cn8bH4xv5qYiZuESZqUSZuMyZqcyZuCKZqSKZuKqZqaqZuGaZqWaZuO6Zqe6ZuBGZqRCUxoIjM2v5nfzR/mT/OX+dv8Y/41/5kJViIrsZXESmols5JbKayUViortZXGSmuls9JbGayMViYrs5XFympls7JbOaycVi4rt5XHymvls/JbBayCViGrsFXEKmoVs4pbJaySVimrtFXGKmuVs8pbFayKViWrslXFqmpVs6pbNayaVi2rtlXHqmvVs+pbDayG1n9WI6ux1cRqajWzmlstrJZWK6u11cZqa7Wz2lsdrI5WJ6uz1cXqanWzuls9rJ5WL6u31cfqa/Wz+lsDrIHWIGuwNcQaag2zhlsjrJHWKGu0NcYaa42zxlsTrInWJGuyNcWaak2zplszrJnWLGu2Nceaa82z5lsLrIXW
ImuxtcRaai2zllsrrJXWKmu1tcZaa62z1lsbrI3WJmuztcXaam2ztls7rJ3WLmu3tcfaa+2z9lsHrIPWIeuwdcQ6ah2zjlsnrJPWKeu0dcY6a52zzlsXrIvWJeuydcW6al2zrls3rJvWLeu2dce6a92z7lsPrIfWI+ux9cR6aj2znlsvrJfWK+u19cZ6a72z3lsfrI/WJ+uz9cX6amEWbhEWaVEWbTEWa3EWbwmWaEmWbCmWammWbhmWaVmWbTmWa3mWbwVWaEUWsKCFrNj6Zn23flg/rV/Wb+uP9df6ZyXYiezEdhI7qZ3MTm6nsFPaqezUdho7rZ3OTm9nsDPamezMdhY7q53Nzm7nsHPauezcdh47r53Pzm8XsAvahezCdhG7qF3MLm6XsEvapezSdhm7rF3OLm9XsCvalezKdhW7ql3Nrm7XsGvatezadh27rl3Prm83sBva/9mN7MZ2E7up3cxubrewW9qt7NZ2G7ut3c5ub3ewO9qd7M52F7ur3c3ubvewe9q97N52H7uv3c/ubw+wB9qD7MH2EHuoPcwebo+wR9qj7NH2GHusPc4eb0+wJ9qT7Mn2FHuqPc2ebs+wZ9qz7Nn2HHuuPc+eby+wF9qL7MX2Enupvcxebq+wV9qr7NX2Gnutvc5eb2+wN9qb7M32Fnurvc3ebu+wd9q77N32Hnuvvc/ebx+wD9qH7MP2Efuofcw+bp+wT9qn7NP2Gfusfc4+b1+wL9qX7Mv2Ffuqfc2+bt+wb9q37Nv2Hfuufc++bz+wH9qP7Mf2E/up/cx+br+wX9qv7Nf2G/ut/c5+b3+wP9qf7M/2F/urjdm4TdikTdm0zdiszdm8LdiiLdmyrdiqrdm6bdimbdm27diu7dm+HdihHdnAhjayY/ub/d3+Yf+0f9m/7T/2X/ufneAkdhI7SZykTjInuZPCSemkclI7aZy0TjonvZPByehkcjI7WZysTjYnu5PDyenkcnI7eZy8Tj4nv1PAKegUcgo7RZyiTjGnuFPCKemUcko7ZZyyTjmnvFPBqehUcio7VZyqTjWnulPDqenUcmo7dZy6Tj2nvtPAaej85zRyGjtNnKZOM6e508Jp6bRyWjttnLZOO6e908Hp6HRyOjtdnK5ON6e708Pp6fRyejt9nL5OP6e/M8AZ6AxyBjtDnKHOMGe4M8IZ6YxyRjtjnLHOOGe8M8GZ6ExyJjtTnKnONGe6M8OZ6cxyZjtznLnOPGe+s8BZ6CxyFjtLnKXOMme5s8JZ6axyVjtrnLXOOme9s8HZ6GxyNjtbnK3ONme7s8PZ6exydjt7nL3OPme/c8A56BxyDjtHnKPOMee4c8I56ZxyTjtnnLPOOee8c8G56FxyLjtXnKvONee6c8O56dxybjt3nLvOPee+88B56DxyHjtPnKfOM+e588J56bxyXjtvnLfOO+e988H56HxyPjtfnK8O5uAO4ZAO5dAO47AO5/CO4IiO5MiO4qiO5uiO4ZiO5diO47iO5/hO4IRO5AAHOsiJnW/Od+eH89P55fx2/jh/nX9OgpvITewmcZO6ydzkbgo3pZvKTe2mcdO66dz0bgY3o5vJzexmcbO62dzsbg43p5vLze3mcfO6+dz8bgG3oFvILewWcYu6xdzibgm3pFvKLe2Wccu65dzybgW3olvJrexWcau61dzqbg23plvLre3Wceu69dz6bgO3ofuf28ht7DZxm7rN3OZuC7el28pt7bZx27rt3PZuB7ej28nt7HZxu7rd3O5uD7en28vt7fZx+7r93P7uAHegO8gd7A5xh7rD3OHuCHekO8od7Y5xx7rj3PHuBHeiO8md7E5xp7rT3OnuDHemO8ud7SZJmOvOc+e7C9yF7iJ3sbvEXeouc5e7K9yV7ip3tbvGXeuuc9e7G9yN7iZ3s7vF3epuc7e7O9yd7i53t7vH3evuc/e7B9yD7iH3sHvEPeoec4+7J9yT7in3tHvGPeuec8+7F9yL7iX3snvFvepe
c6+7N9yb7i33tnvHvevec++7D9yH7iP3sfvEfeo+c5+7L9yX7iv3tfvGfeu+c9+7H9yP7if3s/vF/epiLu4SLulSLu0yLutyLu8KruhKruwqrupqru4arularu06rut6ru8GbuhGLnChi9zY/eZ+d3+4P91f7m/3j/vX/ecmeIm8xF4SL6mXzEvupfBSeqm81F4aL62XzkvvZfAyepm8zF4WL6uXzcvu5fByerm83F4eL6+Xz8vvFfAKeoW8wl4Rr6hXzCvulfBKeqW80l4Zr6xXzivvVfAqepW8yl4Vr6pXzavu1fBqerW82l4dr65Xz6vvNfAaev95jbzGXhOvqdfMa+618Fp6rbzWXhuvrdfOa+918Dp6nbzOXhevq9fN6+718Hp6vbzeXh+vr9fP6+8N8AZ6g7zB3hBvqDfMG+6N8EZ6o7zR3hhvrDfOG+9N8CZ6k7zJ3hRvqjfNm+7N8GZ6s7zZ3hxvrjfPm+8t8BZ6i7zF3hJvqbfMW+6t8FZ6q7zV3hpvrbfOW+9t8DZ6m7zN3hZvq7fN2+7t8HZ6u7zd3h5vr7fP2+8d8A56h7zD3hHvqHfMO+6d8E56p7zT3hnvrHfOO+9d8C56l7zL3hXvqnfNu+7d8G56t7zb3h3vrnfPu+898B56j7zH3hPvqffMe+698F56r7zX3hvvrffOe+998D56n7zP3hfvq4d5uEd4pEd5tMd4rMd5vCd4oid5sqd4qqd5umd4pmd5tud4rud5vhd4oRd5wIMe8mLvm/fd++H99H55v70/3l/vn5fgJ/IT+0n8pH4yP7mfwk/pp/JT+2n8tH46P72fwc/oZ/Iz+1n8rH42P7ufw8/p5/Jz+3n8vH4+P79fwC/oF/IL+0X8on4xv7hfwi/pl/JL+2X8sn45v7xfwa/oV/Ir+1X8qn41v7pfw6/p1/Jr+3X8un49v77fwG/o/+c38hv7TfymfjO/ud/Cb+m38lv7bfy2fju/vd/B7+h38jv7Xfyufje/u9/D7+n38nv7ffy+fj+/vz/AH+gP8gf7Q/yh/jB/uD/CH+mP8kf7Y/yx/jh/vD/Bn+hP8if7U/xEiab50/0Z/kx/lj/bn+PP9ef58/0F/kJ/kb/YX+Iv9Zf5y/0V/kp/lb/aX+Ov9df56/0N/kZ/k7/Z3+Jv9bf52/0d/k5/l7/b3+Pv9ff5+/0D/kH/kH/YP+If9Y/5x/0T/kn/lH/aP+Of9c/55/0L/kX/kn/Zv+Jf9a/51/0b/k3/ln/bv+Pf9e/59/0H/kP/kf/Yf+I/9Z/5z/0X/kv/lf/af+O/9d/57/0P/kf/k//Z/+J/9TEf9wmf9Cmf9hmf9Tmf9wVf9CVf9hVf9TVf9w3f9C3f9h3f9T3f9wM/9CMf+NBHfux/87/7P/yf/i//t//H/+v/8xOCREHiIEmQNEgWJA9SBCmDVEHqIE2QNkgXpA8yBBmDTEHmIEuQNcgWZA9yBDmDXEHuIE+QN8gX5A8KBAWDQkHhoEhQNCgWFA9KBCWDUkHpoExQNigXlA8qBBWDSkHloEpQNagWVA9qBDWDWkHtoE5QN6gX1A8aBA2TJASNgsZBk6Bp0CxoHrQIWgatgtZBm6Bt0C5oH3QIOgadgs5Bl6Br0C3oHvQIega9gt5Bn6Bv0C/oHwwIBgaDgsHBkGBoMCwYHowIRgajgtHBmGBsMC4YH0wIJgaTgsnBlGBqMC2YHswIZgazgtnBnGBuMC+YHywIFgaLgsXBkmBpsCxYHqwIVgargtXBmmBtsC5YH2wINgabgs3BlmBrsC3YHuwIdga7gt3BnmBvsC/YHxwIDgaHgsPBkeBocCw4HpwITgangtPBmeBscC44H1wILgaXgsvBleBqcC24HtwIbga3gtvBneBucC+4HzwIHgaPgsfBk+Bp8Cx4HrwIXgavgtfBm+Bt8C54H3wIPgafgs/Bl+BrgAV4QARkQAV0wARswAV8IARiIAVyoARqoAV6YARmYAV24ARu4AV+
EARhEAUggAEK4uBb8D34EfwMfgW/gz/B3+BfkBAmChOHScKkYbIweZgiTBmmClOHacK0YbowfZghzBhmCjOHWcKsYbYwe5gjzBnmCnOHecK8Yb4wf1ggLBgWCguHRcKiYbGweFgiLBmWCkuHZcKyYbmwfFghrBhWCiuHVcKqYbWwelgjrBnWCmuHdcK6Yb2wftggbBj+FzYKG4dNwqZhs7B52CJsGbYKW4dtwrZhu7B92CHsGHYKO4ddwq5ht7B72CPsGfYKe4d9wr5hv7B/OCAcGA4KB4dDwqHhsHB4OCIcGY4KR4djwrHhuHB8OCGcGE4KJ4dTwqnhtHB6OCOcGc4KZ4dzwrnhvHB+uCBcGC4KF4dLwqXhsnB5uCJcGa4KV4drwrXhunB9uCHcGG4KN4dbwq3htnB7uCPcGe4Kd4d7wr3hvnB/eCA8GB4KD4dHwqPhsfB4eCI8GZ4KT4dnwrPhufB8eCG8GF4KL4dXwqvhtfB6eCO8Gd4Kb4d3wrvhvfB++CB8GD4KH4dPwqfhs/B5+CJ8Gb4KX4dvwrfhu/B9+CH8GH4KP4dfwq8hFuIhEZIhFdIhE7IhF/KhEIqhFMqhEqqhFuqhEZqhFdqhE7qhF/phEIZhFIIQhiiMw2/h9/BH+DP8Ff4O/4R/w39hQpQoShwliZJGyaLkUYooZZQqSh2lidJG6aL0UYYoY5QpyhxlibJG2aLsUY4oZ5Qryh3lifJG+aL8UYGoYFQoKhwViYpGxaLiUYmoZFQqKh2VicpG5aLyUYWoYlQpqhxViapG1aLqUY2oZlQrqh3ViepG9aL6UYOoYfRf1ChqHDWJmkbNouZRi6hl1CpqHbWJ2kbtovZRh6hj1CnqHHWJukbdou5Rj6hn1CvqHfWJ+kb9ov7RgGhgNCgaHA2JhkbDouHRiGhkNCoaHY2JxkbjovHRhGhiNCmaHE2JpkbTounRjGhmNCuaHc2J5kbzovnRgmhhlJDx/9h7WbQ8WhGtjFZFq6M10dpoXbQ+2hBtjDZFm6Mt0dZoW7Q92hHtjHZFu6M90d5oX7Q/OhAdjA5Fh6Mj0dHoWHQ8OhGdjE5Fp6Mz0dnoXHQ+uhBdjC5Fl6Mr0dXoWnQ9uhHdjG5Ft6M70d3oXnQ/ehA9jB5Fj6Mn0dPoWfQ8ehG9jF5Fr6M30dvoXfQ++hB9jD5Fn6Mv0dcIi/CIiMiIiuiIidiIi/hIiMRIiuRIidRIi/TIiMzIiuzIidzIi/woiMIoikAEIxTF0bfoe/Qj+hn9in5Hf6K/0b8oASQCiUESkBQkA8lBCpASpAKpQRqQFqQD6UEGkBFkAplBFpAVZAPZQQ6QE+QCuUEekBfkA/lBAVAQFAKFQRFQFBQDxUEJUBKUAqVBGVAWlAPlQQVQEVQClUEVUBVUA9VBDVAT1AK1QR1QF9QD9UED0BD8BxqBxqAJaAqageagBWgJWoHWoA1oC9qB9qAD6Ag6gc6gC+gKuoHuoAfoCXqB3qAP6Av6gf5gABgIBoHBYAgYCoaB4WAEGAlGgdFgDBgLxoHxYAKYCCaByWAKmAqmgelgBpgJZoHZYA6YC+aB+WABWAgWgcVgCVgKloHlYAVYCVaB1WANWAvWgfVgA9gINoHNYAvYCraB7WAH2Al2gd1gD9gL9oH94AA4CA6Bw+AIOAqOgePgBDgJToHT4Aw4C86B8+ACuAgugcvgCrgKroHr4Aa4CW6B2+AOuAvugfvgAXgIHoHH4Al4Cp6B5+AFeAlegdfgDXgL3oH34AP4CD6Bz+AL+AowgAMCkIACNGAACzjAAwGIQAIyUIAKNKADA5jAAjZwgAs84IMAhCACAECAQAy+ge/gB/gJfoHf4A/4C/6BBJgIJoZJYFKYDCaHKWBKmAqmhmlgWpgOpocZYEaYCWaGWWBWmA1mhzlgTpgL5oZ5YF6YD+aHBWBBWAgWhkVgUVgMFoclYElYCpaGZWBZWA6WhxVgRVgJVoZVYFVYDVaHNWBNWAvWhnVg
XVgP1ocNYEP4H2wEG8MmsClsBpvDFrAlbAVbwzawLWwH28MOsCPsBDvDLrAr7Aa7wx6wJ+wFe8M+sC/sB/vDAXAgHAQHwyFwKBwGh8MRcCQcBUfDMXAsHAfHwwlwIpwEJ8MpcCqcBqfDGXAmnAVnwzlwLpwH58MFcCFcBBfDJXApXAaXwxVwJVwFV8M1cC1cB9fDDXAj3AQ3wy1wK9wGt8MdcCfcBXfDPXAv3Af3wwPwIDwED8Mj8Cg8Bo/DE/AkPAVPwzPwLDwHz8ML8CK8BC/DK/AqvAavwxvwJrwFb8M78C68B+/DB/AhfAQfwyfwKXwGn8MX8CV8BV/DN/AtfAffww/wI/wEP8Mv8CvEIA4JSEIK0pCBLOQgDwUoQgnKUIEq1KAODWhCC9rQgS70oA8DGMIIAgghgjH8Br/DH/An/AV/wz/wL/wHE1AilBglQUlRMpQcpUApUSqUGqVBaVE6lB5lQBlRJpQZZUFZUTaUHeVAOVEulBvlQXlRPpQfFUAFUSFUGBVBRVExVByVQCVRKVQalUFlUTlUHlVAFVElVBlVQVVRNVQd1UA1US1UG9VBdVE9VB81QA3Rf6gRaoyaoKaoGWqOWqCWqBVqjdqgtqgdao86oI6oE+qMuqCuqBvqjnqgnqgX6o36oL6oH+qPBqCBaBAajIagoWgYGo5GoJFoFBqNxqCxaBwajyagiWgSmoymoKloGpqOZqCZaBaajeaguWgemo8WoIVoEVqMlqClaBlajlaglWgVWo3WoLVoHVqPNqCNaBPajLagrWgb2o52oJ1oF9qN9qC9aB/ajw6gg+gQOoyOoKPoGDqOTqCT6BQ6jc6gs+gcOo8uoIvoErqMrqCr6Bq6jm6gm+gWuo3uoLvoHrqPHqCH6BF6jJ6gp+gZeo5eoJfoFXqN3qC36B16jz6gj+gT+oy+oK8IQzgiEIkoRCMGsYhDPBKQiCQkIwWpSEM6MpCJLGQjB7nIQz4KUIgiBBBECMXoG/qOfqCf6Bf6jf6gv+gfSogTxYnjJHHSOFmcPE4Rp4xTxanjNHHaOF2cPs4QZ4wzxZnjLHHWOFucPc4R54xzxbnjPHHeOF+cPy4QF4wLxYXjInHRuFhcPC4Rl4xLxaXjMnHZuFxcPq4QV4wrxZXjKnHVuFpcPa4R14xrxbXjOnHduF5cP24QN4z/ixvFjeMmcdO4Wdw8bhG3jFvFreM2cdu4Xdw+7hB3jDvFneMucde4W9w97hH3jHvFveM+cd+4X9w/HhAPjAfFg+Mh8dB4WDw8HhGPjP/Hrl0Aa1Yc+qL/RnB3h8Hd3YlAQgIJwULwAQYYGBiYwQYd3N3d3d3d3d3dHZa0rVchA/eQnLx7z311T15u/X5VM91fr/66e/Ve/661q/ZW1dbVkGqbattqaLVdtX01rBpe7VDtWO1U7VztUo2odq12q3av9qj2rPaqRlZ7V/tU+1b7VftXB1QHVgdVB1eHVIdWh1WHV0dUR1ZHVUdXx1THVsdVx1cnVCdWJ1UnV6dUp1anVadXZ1RnVmdVZ1fnVOdW51XnVxdUF1YXVRdXl1SXVpdVl1dXVFdWV1VXV9dU11bXVddXN1Q3VjdVN1e3VLdWt1W3V3dUd1Z3VXdX91T3VvdV91cPVA9WD1UPV49Uj1aPVY9XT1RPVk9VT1fPVM9Wz1XPVy9UL1YvVS9Xr1SvVq9Vr1dvVG9Wb1VvV+9U71bvVe9XH1QfVh9VH1efVJ9Wn1WfV19UX1ZfVV9X31TfVt9V31dVVVdN1VahilWqclWqrurVfeq+db+6fz1aPXo9Rj1mPVY9dj1OPW49Xj1+PUE9YT1RPXE9ST1pPVk9eT1FPWU9VT11PU09bT1dPX09Qz2gnrGeqZ65nqWetZ6tnr2eo56znqueu56nnreer56/XqBesF6oXrhepF60XqxevF6iXrJeql66XqZetl6uXr5eof5F/cv6V/Wv6xXrleqx/vYi1vt9vUq9av2H+o/1avWf6tXr
Neo167Xqtes/1+vUf6nXrder1683qDesN6o3rgfWm9Sb1pvVg+rN6y3qLevB9Vb11vWQept623povV29fT2sHl7vUO9Y71TvXO9Sj6h3rXerd6/3qPes96pH1nvX+9T71vvV+9cH1AfWB9UH14fUh9aH1YfXR9RH1kfVR9fH1MfWx9XH1yfUJ9Yn1SfXp9Sn1qfVp9dn1GfWZ9Vn1+fU59bn1efXF9QX1hfVF9eX1JfWl9WX11fUV9ZX1VfX19TX1tfV19c31DfWN9U317fUt9a31bfXd9R31nfVd9f31PfW99X31w/UD9YP1Q/Xj9SP1o/Vj9dP1E/WT9VP18/Uz9bP1c/XL9Qv1i/VL9ev1K/Wr9Wv12/Ub9Zv1W/X79Tv1u/V79cf1B/WH9Uf15/Un9af1Z/XX9Rf1l/VX9ff1N/W39Xf11Vd103d1qGOdapzXequ7jV9mr5Nv6Z/M1ozejNGM2YzVjN2M04zbjNeM34zQTNhM1EzcTNJM2kzWTN5M0UzZTNVM3UzTTNtM10zfTNDM6CZsZmpmbmZpZm1ma2ZvZmjmbOZq5m7maeZt5mvmb9ZoFmwWahZuFmkWbRZrFm8WaJZslmqWbpZplm2Wa5Zvlmh+UXzy+ZXza+bFZuVmt80v21Wbn7Xp9fr/fVf88dmteZPzerNGs2azVrN2s2fm3WavzTrNus16zcbNBs2GzUbNwObTZpNm82aQc3mzRbNls3gZqtm62ZIs02zbTO02a7ZvhnWDG92aHZsdmp2bnZpRjS7Nrs1uzd7NKONetr2afZt9mv2bw5oDmwOag5uDmkObQ5rDm+OaI5sjmqObo5pjm2Oa45vTmhObE5qTm5OaU5tTmtOb85ozmzOas5uzmnObc5rzm8uaC5sLmoubi5pLm0uay5vrmiubK5qrm6uaa5trmuub25obmxuam5ubmlubW5rbm/uaO5s7mrubu5p7m3ua+5vHmgebB5qHm4eaR5tHmseb55onmyeap5unmmebZ5rnm9eaF5sXmpebl5pXm1ea15v3mjebN5q3m7ead5t3mvebz5oPmw+aj5uPmk+bT5rPm++aL5svmq+br5pvm2+a75vqqZumqZtQhOb1OSmNF3Ta/u0fdt+bf92tHb0dox2zHasdux2nHbcdrx2/HaCdsJ2onbidpJ20naydvJ2inbKdqp26naadtp2unb6doZ2QDtjO1M7cztLO2s7Wzt7O0c7ZztXO3c7TztvO187f7tAu2C7ULtwu0i7aLtYu3i7RLtku1S7dLtMu2y7XLt8u0L7i/aX7a/aX7crtiu1v2l/267c/q79fbtKu2r7h/aP7Wrtn9rV2zXaNdu12rXbP7frtH9p123Xa9dvN2g3bDdqN24Htpu0m7abtYPazdst2i3bwe1W7dbtkHabdtt2aLtdu307rB3e7tDu2O7U7tzu0o5od213a3dv92j3bPdqR7Z7t/u0+7b7tfu3B7QHtge1B7eHtIe2h7WHt0e0R7ZHtUe3x7THtse1x7cntCe2J7Unt6e0p7antae3Z7Rntme1Z7fntOe257Xntxe0F7YXtRe3l7SXtpe1l7dXtFe2V7VXt9e017bXtde3N7Q3tje1N7e3tLe2t7W3t3e0d7Z3tXe397T3tve197cPtA+2D7UPt4+0j7aPtY+3T7RPtk+1T7fPtM+2z7XPty+0L7YvtS+3r7Svtq+1r7dvtG+2b7Vvt++077bvte+3H7Qfth+1H7eftJ+2n7Wft1+0X7ZftV+337Tftt+137dVW7dN27ahjW1qc1varu2FPqFv6Bf6h9HC6GGMMGYYK4wdxgnjhvHC+GGCMGGYKEwcJgmThsnC5GGKMGWYKkwdpgnThunC9GGGMCDMGGYKM4dZwqxhtjB7mCPMGeYKc4d5wrxhvjB/WCAsGBYKC4dFwqJhsbB4WCIsGZYKS4dlwrJhubB8WCH8Ivwy/Cr8OqwYVgq/Cb8NK4ffhd+HVcKq4Q/hj2G18Kew
elgjrBnWCmuHP4d1wl/CumG9sH7YIGwYNgobh4Fhk7Bp2CwMCpuHLcKWYXDYKmwdhoRtwrZhaNgubB+GheFhh7Bj2CnsHHYJI8KuYbewe9gj7Bn2CiPD3mGfsG/YL+wfDggHhoPCweGQcGg4LBwejghHhqPC0eGYcGw4LhwfTggnhpPCyeGUcGo4LZwezghnhrPC2eGccG44L5wfLggXhovCxeGScGm4LFwerghXhqvC1eGacG24Llwfbgg3hpvCzeGWcGu4Ldwe7gh3hrvC3eGecG+4L9wfHggPhofCw+GR8Gh4LDwenghPhqfC0+GZ8Gx4LjwfXggvhpfCy+GV8Gp4Lbwe3ghvhrfC2+Gd8G54L7wfPggfho/Cx+GT8Gn4LHwevghfhq/C1+Gb8G34LnwfqlCHJrQhhBhSyKGELvRin9g39ov942hx9DhGHDOOFceO48Rx43hx/DhBnDBOFCeOk8RJ42Rx8jhFnDJOFaeO08Rp43Rx+jhDHBBnjDPFmeMscdY4W5w9zhHnjHPFueM8cd44X5w/LhAXjAvFheMicdG4WFw8LhGXjEvFpeMycdm4XFw+rhB/EX8ZfxV/HVeMK8XfxN/GlePv4u/jKnHV+If4x7ha/FNcPa4R14xrxbXjn+M68S9x3bheXD9uEDeMG8WN48C4Sdw0bhYHxc3jFnHLODhuFbeOQ+I2cds4NG4Xt4/D4vC4Q9wx7hR3jrvEEXHXuFvcPe4R94x7xZFx77hP3DfuF/ePB8QD40Hx4HhIPDQeFg+PR8Qj41Hx6HhMPDYeF4+PJ8QT40nx5HhKPDWeFk+PZ8Qz41nx7HhOPDeeF8+PF8QL40Xx4nhJvDReFi+PV8Qr41Xx6nhNvDZeF6+PN8Qb403x5nhLvDXeFm+Pd8Q7413x7nhPvDfeF++PD8QH40Px4fhIfDQ+Fh+PT8Qn41Px6fhMfDY+F5+PL8QX40vx5fhKfDW+Fl+Pb8Q341vx7fhOfDe+F9+PH8QP40fx4/hJ/DR+Fj+PX8Qv41fx6/hN/DZ+F7+PVaxjE9sYYowp5lhiF3upT+qb+qX+abQ0ehojjZnGSmOncdK4abw0fpogTZgmShOnSdKkabI0eZoiTZmmSlOnadK0abo0fZohDUgzppnSzGmWNGuaLc2e5khzprnS3GmeNG+aL82fFkgLpoXSwmmRtGhaLC2elkhLpqXS0mmZtGxaLi2fVki/SL9Mv0q/TiumldJv0m/Tyul36fdplbRq+kP6Y1ot/SmtntZIa6a10trpz2md9Je0blovrZ82SBumjdLGaWDaJG2aNkuD0uZpi7RlGpy2SlunIWmbtG0amrZL26dhaXjaIe2Ydko7p13SiLRr2i3tnvZIe6a90si0d9on7Zv2S/unA9KB6aB0cDokHZoOS4enI9KR6ah0dDomHZuOS8enE9KJ6aR0cjolnZpOS6enM9KZ6ax0djonnZvOS+enC9KF6aJ0cbokXZouS5enK9KV6ap0dbomXZuuS9enG9KN6aZ0c7ol3ZpuS7enO9Kd6a50d7on3ZvuS/enB9KD6aH0cHokPZoeS4+nJ9KT6an0dHomPZueS8+nF9KL6aX0cnolvZpeS6+nN9Kb6a30dnonvZveS++nD9KH6aP0cfokfZo+S5+nL9KX6av0dfomfZu+S9+nKtWpSW0KKaaUciqpS73cJ/fN/XL/PFoePY+Rx8xj5bHzOHncPF4eP0+QJ8wT5YnzJHnSPFmePE+Rp8xT5anzNHnaPF2ePs+QB+QZ80x55jxLnjXPlmfPc+Q581x57jxPnjfPl+fPC+QF80J54bxIXjQvlhfPS+Ql81J56bxMXjYvl5fPK+Rf5F/mX+Vf5xXzSvk3+bd55fy7/Pu8Sl41/yH/Ma+W/5RXz2vkNfNaee3857xO/kteN6+X188b5A3zRnnjPDBvkjfNm+VBefO8Rd4yD85b5a3zkLxN3jYPzdvl7fOwPDzvkHfMO+Wd8y55RN4175Z3
z3vkPfNeeWTeO++T98375f3zAfnAfFA+OB+SD82H5cPzEfnIfFQ+Oh+Tj83H5ePzCfnEfFI+OZ+ST82n5dPzGfnMfFY+O5+Tz83n5fPzBfnCfFG+OF+SL82X5cvzFfnKfFW+Ol+Tr83X5evzDfnGfFO+Od+Sb8235dvzHfnOfFe+O9+T78335fvzA/nB/FB+OD+SH82P5cfzE/nJ/FR+Oj+Tn83P5efzC/nF/FJ+Ob+SX82v5dfzG/nN/FZ+O7+T383v5ffzB/nD/FH+OH+SP82f5c/zF/nL/FX+On+Tv83f5e9zlevc5DaHHHPKOZfc5V7pU/qWfqV/Ga2MXsYoY5axythlnDJuGa+MXyYoE5aJysRlkjJpmaxMXqYoU5apytRlmjJtma5MX2YoA8qMZaYyc5mlzFpmK7OXOcqcZa4yd5mnzFvmK/OXBcqCZaGycFmkLFoWK4uXJcqSZamydFmmLFuWK8uXFcovyi/Lr8qvy4plpfKb8tuycvld+X1Zpaxa/lD+WFYrfyqrlzXKmmWtsnb5c1mn/KWsW9Yr65cNyoZlo7JxGVg2KZuWzcqgsnnZomxZBpetytZlSNmmbFuGlu3K9mVYGV52KDuWncrOZZcyouxadiu7lz3KnmWvMrLsXfYp+5b9yv7lgHJgOagcXA4ph5bDyuHliHJkOaocXY4px5bjyvHlhHJiOamcXE4pp5bTyunljHJmOaucXc4p55bzyvnlgnJhuahcXC4pl5bLyuXlinJluapcXa4p15bryvXlhnJjuancXG4pt5bbyu3ljnJnuavcXe4p95b7yv3lgfJgeag8XB4pj5bHyuPlifJkeao8XZ4pz5bnyvPlhfJieam8XF4pr5bXyuvljfJmeau8Xd4p75b3yvvlg/Jh+ah8XD4pn5bPyufli/Jl+ap8Xb4p35bvyvelKnVpSltCiSWVXErpSq/r0/Xt+nX9u9G60bsxujG7sbqxu3G6cbvxuvG7CboJu4m6ibtJukm7ybrJuym6Kbupuqm7abppu+m66bsZugHdjN1M3czdLN2s3Wzd7N0c3ZzdXN3c3TzdvN183fzdAt2C3ULdwt0i3aLdYt3i3RLdkt1S3dLdMt2y3XLd8t0K/1vfX69bv9ug27DbqNu4G9ht0m3abdYN6jbvtui27AZ3W3Vbd0O6bbptu6Hddt323bBueLdDt2O3U7dzt0s3otu1263bvduj27PbqxvZ7d3t0+3b7dft3x3QHdgd1B3cHdId2h3WHd4d0R3ZHdUd3R3THdsd1x3fndCd2J3Undyd0p3andad3p3Rndmd1Z3dndOd253Xnd9d0F3YXdRd3F3SXdpd1l3eXdFd2V3VXd1d013bXddd393Q3djd1N3c3dLd2t3W3d7d0d3Z3dXd3d3T3dvd193fPdA92D3UPdw90j3aPdY93j3RPdk91T3dPdM92z3XPd+90L3YvdS93L3Svdq91r3evdG92b3Vvd29073bvde9333Qfdh91H3cfdJ92n3Wfd590X3ZfdV93X3Tfdt9133fVV3dNV3bhS52qctd6bquBwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAwA9WWnmVFRfo9fvpc59en94SvT69k/r0er3+/6Pfj9Ufe07Y6/U2/qE25Q///3ZUbeTF488zJLzU55+Vo/9ni+jzj00L9Hq9LX8av2/vdz/U+vRG+6FttP/pPL0howYaVf44b/9+s/RW7a3UW3PU55Gj1t6nt8LPFlJGlSv82JD6/aflxN0PW9G76mfj9PuH+1nh7zZyQO/n5UR9xvyh7Lqu+8+26P+Mf1wn///2X33Oxuz9vPz5c9bvn+Z/nb/Lf79R6eg76vN/R/63+2n8vr3V/6v5n/A/LPQ/5H/Mn/K/am9wb1hv2Kj2f3YO9Pv7fZ3wPy9n7N/16/235/f/q77/6gXwL9X3H/Lf9/8l/33/nfL/0+L/Vv7H/K/SG9rbordSb3BvSG/QqPZ/lv8fw/xT/v9u3B/LmXp9+vWXf/6t/GP++43K/1t/l//RR50BPzZNOCovP+Z/gf/N/P/8Pb9Pb7X/cs5/7sfxx+g3S2/t3tDekN6OvW16g34Yd+RP8/TtbfbTjP1H/vU+fvx9YNIfri4yKhmT9s7sM3Gvz99mGW3iUd//oe1vHUb76/v7gL69n/X5+2u9UWflAj/N37838aja8N6I3q69rXsDe0N+OI1+fB/569k3x0/9R+uN+9NOj/o5jbrzkT+1T/nT28qU/8vnUP//eRf+L9b/h+cEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOA/6gEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD8E8NH7Lr1wCFDBg1TUVFR+anyrz6ZgP/T/kfo/9UrAQAAAAAAAAAAAP5X/Xf8OfG/+h4BAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA+Pfz6epH/WHNldYYMGDAgF7fvqMam5/3Gdkb2esm+Ft9wlFtXb+/lX16vV7/Xq935CzzbrTyNquN89e28We4Y/vdL5vprh3GWfu68W8do/f4hBt++tVC7z0++eNTfVrW3HLw8AGDhw/YdugOAwYO2GTo0B0GbjJk0IDNBg/fer4Bqw0ZNHD4oAGDtx0+aNjPLm8+ZOh2240YMHDbzcYbe7thg4YPHzBw2xEDth40YsAOQwfsMGzEgIFbDBy87YD55ptvwHhj/7dt3r+9tS75fwIAAP//yxG9Rg==")
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='.\x00', 0x0, 0x0)
open_by_handle_at(r0, &(0x7f0000000240)=@reiserfs_2={0x4b, 0x2, {0xb, 0xb}}, 0x0)


syz_mount_image$udf(&(0x7f0000000040), &(0x7f0000000500)='./file0\x00', 0x18008, &(0x7f0000000000)=ANY=[@ANYRES32=0x0, @ANYRESDEC], 0xfe, 0x4b1, &(0x7f0000001d00)="$eJzs201sVNUbx/HfM3c6TIf+/5YXCxgCTTSxgkBfsEBqYnix0YQXLVQj8SWVTrHSdkinKCUgLNWdC5Yu3bpwZdwaEpfGhcEYFibIxs2sxB3m3LlvM5TOjG1nKP1+CJx7zzx3OOc8c+ecM5kRAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACQjrx6uLfPWt0KAADQTCdPj/QOMP8DALCmnGH/DwAAsJaYPP0u054LJTvhn5dlj0/OXLo8emx44cvaTaaUPD/e/c329Q/sf2nwwMGwXPz65bZNp06fOdx9tDB9cTZfLObHu0dnJs8VxvN1P8NSr6+2yx+A7ukLl8YnJord/XsHKh6+3Hlv3fquzqHB945mwtjRY8PDpxMx6bb//L8/5FEr/Iw8vSDTx99/ayclpbT0sajx2llp7X4ndvmdGD027HdkanJsZs49aKkgKlU5JplwjJqQiyVJSa5dllmePVubPP0g05F9JTslyQvHYbf/wXBd7WmFtNu6SurRKsjZY2ydPH0g0619nXojGFc//xnpaqsbhxWXDu7/gpXsTf/9wN1P7m3z+Fvdr89MFBKxlgruqNU+PzTTY/7elJWnU/4dX7IR7Wx1c9Bk7fI0LVPmq0/8dYX8delTQwd27DyUXGFsqfE8LnZvcHPVMye3BUsHS7k/y98v1Cdrnv6U6f5vWf+8J5wDpBsPFrvwj6Y0DyvNPE3J9M+1klnVvtRL7O8jq33uX9n2t2ePFi7Oz06e/2huwcdz2cMfFudmx84t/HB57+ola2rtY6ulGtuS5ay84/v801J0XbAH+F/5LG7NN1fj10JPVRlKvn7qOa57F9vAOsq1yczTXZkm3t9anmeUa3hs1gKX/2GZiqWfLcx0kP90+SyR/5fj8ctaZRnxc/v/8uda4Vpi29nNj6pfify7Nrn8vyPTkb+3Bp9plPPvVcW6uC6Z3r25PYhLZVxcOuxO+RknJqfyvS72gUwbfwpj5cfmgthNcWyfiy3K9MWtytj1QezmOLbfxd6W6c6vC8c+HccOuNh5l6873WFszsXuCGK74ti95wpT47WG1eW/X6a3r79mYZ8fmf/E/X+jqow8lPPFj5cr/52JuhtBXs8G+U/XyP+XMs3/tT3stz/24ctqg/9vnH+3Vv7uZmVsuKHcGMf21dutVnP53yDTvVduR30O+hacxhlK5v+ZdGUZjWuL8r8hUdcZtCvT4FisRcX5KxfGpqbysxxwwAEH0UGr35nQDG7+H3Gz+qBn4TommP87ymfxiun+Z/H8P1RVRlo0/29M1A0Fq5a2tJSdm77YtkXKFuev7JmcHjufP5+fGdg/2Nt/aH/vwMG2TLi4i4/qHrsngcv/bpmu/fhLtI+pXP8tvP7PVZWRFuV/U7JPFeuauodiTXL575Bp8O7taL+52Po/3P/3PFtZRvdfi/K/OVHXGbSro8GxAAAAAAAAAAAAAAAAAIDVJGeenpPp8siLFv6GqJ7v/41XlZHl//5X+YfJNb7/1ZWoG2/S7xoaGmgAAAAAAAAAAIAmScnT1zI9r5JddxUd0olkiSfavwEAAP//G6xIAA==")
sync()
unlink(&(0x7f0000000140)='./file1\x00')


syz_mount_image$ext4(&(0x7f0000000780)='ext4\x00', &(0x7f0000000240)='./file0\x00', 0x2000480, &(0x7f0000000000)={[{@jqfmt_vfsv0}, {@errors_remount}]}, 0x1, 0x784, &(0x7f00000007c0)="$eJzs3c1rHOUfAPDvbJKmSfv7JYKg9RQQNFC6MTW2Ch4qHkSwUNCz7bLZhppNtmQ3pQkBW0TwIqh4EPTSsy/15tWXq/4XHqSlalqseJDI7Eu7bXbTTZvdDeTzgck+z8xsnue7z8wzz+wMOwHsWRPpn0zEoYj4KIkYq89PImKomhqMOFFb7/b6Wj6dktjYePOPpLrOrfW1fDS9J3WgnnkyIn58P+JwZnO55ZXV+VyxWFiq56cqC+enyiurR84t5OYKc4XFY9MzM0ePv3D82M7F+tcvqwevf/zas9+c+Oe9J65++FMSJ+JgfVlzHDtlIibqn8lQ+hHe49WdLqzPkn5XgIeS7poDtb08DsVYDFRTbYz0smYAQLe8GxEbAMAekzj+A8Ae0/ge4Nb6Wr4x9fcbid668UpE7K/F37i+WVsyWL9mt796HXT0VnLPlZEkIsZ3oPyJiPjiu7e/Sqfo0nVIgFYuXY6IM+MTm/v/ZNM9C9v13FYLN4arLxP3zdb/Qe98n45/Xmw1/svcGf9Ei/HPcIt992E8eP/PXNuBYtpKx38vN93bdrsp/rrxgXruf9Ux31By9lyxkPZt/4+IyRgaTvPT1VVb3wU1efPfm+3Kbx7//fnJO1+m5aevd9fIXBscvvc9s7lK7lHjbrhxOeKpwVbxJ3faP2kz/j3VYRmvv/TB5+2WpfGn8TamzfF318aViGdatv/dtky2vD9xqro5TDU2iha+/fWz0XblN7d/OqXlN84FeiFt/9Gt4x9Pmu/XLG+/jJ+vjP3QbtmD42+9/e9L3qqm99XnXcxVKkvTEfuSNzbPP3r3vY18Y/00/smnW+//W23/6TnhmQ7jH7z++9cPH393pfHPbqv9t5+4ent+oF35nbX/TDU1WZ/TSf/XaQUf5bMDAAAAAAAAAAAAAAAAAAAAAAAAgE5lIuJgJJnsnXQmk83WnuH9eIxmiqVy5fDZ0vLibFSflT0eQ5nGT12ONf0e6nT99/Ab+aP35Z+PiMci4tPhkWo+my8VZ/sdPAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADUHWjz/P/Ub8P9rh0A0DX7+10BAKDnHP8BYO/Z3vF/pGv1AAB6x/k/AOw9HR//z3S3HgBA7zj/BwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAoMtOnTyZTht/r6/l0/zshZXl+dKFI7OF8nx2YTmfzZeWzmfnSqW5YiGbLy20/UeXai/FUun8TCwuX5yqFMqVqfLK6umF0vJi5fS5hdxc4XRhqGeRAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEDnyiur87lisbAksWViZHdUY9ckBmNXVEOia4nmXmKkfx0UAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAwC73XwAAAP//+Lkq2Q==")
r0 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f00000001c0)='cpuacct.usage_sys\x00', 0x275a, 0x0)
write$binfmt_script(r0, &(0x7f0000000080)={'#! ', './file0'}, 0xb)
write$FUSE_ATTR(r0, &(0x7f0000000280)={0x78}, 0x78)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0x2, 0x28011, r0, 0x0)
clock_gettime(0x0, &(0x7f0000000340))
r1 = openat$adsp1(0xffffffffffffff9c, &(0x7f0000000380), 0x200000, 0x0)
fcntl$addseals(r1, 0x409, 0x0)
syz_mount_image$ext4(&(0x7f0000000000)='ext4\x00', &(0x7f0000000040)='./bus\x00', 0x1810714, &(0x7f0000000100)={[{@jqfmt_vfsold}, {@noblock_validity}, {@usrquota}, {@prjquota}, {@debug_want_extra_isize={'debug_want_extra_isize', 0x3d, 0x6}}, {@jqfmt_vfsold}, {@usrjquota, 0x5}, {@min_batch_time={'min_batch_time', 0x3d, 0xa9f}}, {@nodiscard}]}, 0xff, 0x467, &(0x7f0000000780)="$eJzs289vFFUcAPDvzG7LL6EVEQVBqmhs/NHSgsrBi0YTD5qY6AGPtS0EWaihNRFCtBqDR0Pi3Xg08S/w5MmoJxOvejckRIkJ6IU1szvTdpfd0h9blnQ/n2S67+28nXnfvnkzb+btBtCzhrI/ScR9EfF7RAzUs40FhuovN69fmvz3+qXJJKrVt//qr5W7cf3SZFG0+NyOPDOcRqSfJ/lOGs1euHhmolKZPp/nR+fOfjA6e+Hic6fPTpyaPjV9bvz48WNHx158Yfz5jsSZxXdj/8czB/a9/u6VNydPXHnv5++y+u49WF+/NI5OGcoC/7ta07zuyU7vrMtuVRfjTMrdrg0rVYqIrLn6av1/IEqx2HgD8dpnXa0csKGyc/aWFm/nr/NVYBNLots1ALqjuOBn97/FcheHH1137eX6DVAW9818qa8pR5qX6dvA/Q9FxIn5/77Olmh6DrHQPv0bWAEAoOf8kI1/nm01/ktj75Jyu/K5ocGIuD8idkfEAxGxJyIejKiVfSgiHl7l/punhm4ff6ZX1xTYCmXjv5fyua3G8V8x+ovBUp7bWYu/Lzl5ujJ9JP+fDEffliw/1mrjxSZe/e3LdvtfOv7Llmz/xVgw38jVctMDuqmJuYlODUqvfRqxv9wq/mRhJiCJiH0RsX91m95VJE4//e2BdoXuHP8yOjDPVP0m4ql6+89HU/yFZPn5ydGtUZk+MlocFbf75dfLb7Xbf8v4b+1cf2ArlLX/9sbjf2FdqfZ34J9k6XztbKz6huTyH1+0vacsr/H470/eqc3pFjX5aGJu7vxYRH/yRkTz++OLny3yRfks/uHDrfv/7vwzWfyPRER2EB+MiEcj4lDedo9FxOMRcXiZ+H965Yn3261b1/EfsW2F5drK4p9qef5bOP4HG9t/9YnSmR+/X3v8Wfsfq6WG83dq5787aF+drXmJxaMZAAAANru09t34JB1ZSKfpyEj9O/x7YntamZmde+bkzIfnpurfoR+MvrR40jWw5HnoWDKfb7GeH8+fFRfrj+bPjb8qbavlRyZnKlNdjh163Y7G/n+o6P+ZP0vdrh2w4fxeC3pXc/9Pu1QP4O5z/Yfepf9D79L/oXe16v+fNOXNBcDm5PoPvUv/h96l/0Pv0v+hJ63nd/0blSgv8+t9iXslEek9UQ2JFonsgr5lnb2722cmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAzvg/AAD//7K/8i4=")
linkat(0xffffffffffffffff, &(0x7f00000001c0)='./file0\x00', 0xffffffffffffffff, 0x0, 0x1000)
mkdir(&(0x7f0000000480)='./bus\x00', 0x0)
mount$tmpfs(0x0, &(0x7f00000002c0)='./file0\x00', &(0x7f0000000300), 0x0, &(0x7f00000003c0)=ANY=[@ANYBLOB='nr_inodes'])
mkdir(&(0x7f0000000400)='./file1\x00', 0x0)
mkdir(0x0, 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000000140)='./file1/file4\x00', 0x0)
symlink(&(0x7f0000000080)='./bus\x00', &(0x7f00000000c0)='./bus\x00')
mount(&(0x7f00000000c0)=@sg0, 0x0, &(0x7f0000000080)='cifs\x00', 0x0, 0x0)


mmap(&(0x7f0000001000/0xc00000)=nil, 0xc00000, 0x0, 0x3032, 0xffffffffffffffff, 0x0)
mknod(&(0x7f0000000140)='./file0\x00', 0x1000, 0x0)
r0 = open(&(0x7f0000000040)='./file0\x00', 0x42082, 0x0)
write$FUSE_IOCTL(r0, &(0x7f0000000100)={0x20}, 0xfdef)
mknod(&(0x7f0000000080)='./bus\x00', 0x1000, 0x0)
open(&(0x7f00000002c0)='./bus\x00', 0x0, 0x0)
r1 = open$dir(&(0x7f0000000300)='./file0\x00', 0x0, 0x0)
r2 = creat(&(0x7f0000000000)='./bus\x00', 0x0)
tee(r1, r2, 0x3, 0x0)


syz_mount_image$vfat(&(0x7f0000000000), &(0x7f0000000100)='./file2\x00', 0x0, 0x0, 0x0, 0x0, &(0x7f0000000000))
syz_mount_image$hfs(&(0x7f0000000040), &(0x7f0000000100)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x30008c0, &(0x7f0000000180)={[{}, {@codepage={'codepage', 0x3d, 'iso8859-6'}}, {@iocharset={'iocharset', 0x3d, 'maccyrillic'}}, {}, {@part={'part', 0x3d, 0x5}}]}, 0x11, 0x2b6, &(0x7f0000000200)="$eJzs3U9rE0EYx/HfbNI22lK3tiJ4rBb0Ilov4iUieRGeRG0iFENFrfjnVMWTiN69+xZ8EV4U34CePPkC6mllZifZJLvZTUOTber3Aw2b7D47z2T/zDyBsgLw37rV+Pn52m/7Z6SKKtK7G1IgqSZVJZ3R2dqznd3t3Xarmbejiouwf0ZxpElts7XTygq1cS7CC+27qpZ6P8NkRFF081fZSaB07urPEEgL/up062tTzyzf6zHj9g45j1lj9rWvF1ouOw8AQLn8+B/4cX7Jz9+DQNrww/6RHP/HtV92AhMX5a7tGf9dlRUZe3xPuVVJvedKOLs+6FSJo7Q8N/B+XvGZ1TfBNEVVpcslOPFgu926vPWo3Qz0RnWvZ7M199qMT92OgmzXM2rTHCP03WTPKBddH+ZsHzbj/J9L6st/dcwWx2a+mu/mjgn1Sc3u/K8aGXuY3JEKB45UnP+V4Xt0vQztVvK3jXq9HvRtsuIaOedb8Ap6WcuuSNQ5o1bU/wNBWJSnizo9EBX37mpB1Gpm1Gbn3ZCotb4o25vu2Ty8vUkzH8xts64/+qJGz/w/sPltKPfKTK4asxEPBe4bj/szn91c1e0zTI0c6cul+y0uDEv9b/49DQfwXvd1XctPX756WGm3W0/swr2MhcdL3U/m3kqZ25S8oL3kkwVFTmrjzqA0zcQuHeoO7f2jcGN7lR2Jg3KsFxrfpnsilbFQ8v0JU5Ec9LIzQUnsvMvE9V9Sr1TjyZ59CTPn6SP+EOD3GNk5dreCS2KjeEYu6eSBKrjF4RVcuuZK1Yyu5jp/Uboweouhz/OYMA390F1+/wcAAAAAAAAAAAAAAAAAAJg10/h3grL7CAAAAAAAAAAAAAAAAAAAAADArOs+/1ed5/9qtOf/Dj6K5TCf//txRzz/F5i8fwEAAP//FZd8vg==")
mkdir(&(0x7f0000000400)='./file1\x00', 0x0)
chmod(&(0x7f00000001c0)='./file3\x00', 0x0)
unlink(&(0x7f0000000180)='./file1\x00')


r0 = fsopen(&(0x7f0000000000)='tmpfs\x00', 0x0)
fsconfig$FSCONFIG_CMD_CREATE(r0, 0x6, 0x0, 0x0, 0x0)
r1 = fsmount(r0, 0x0, 0x0)
symlinkat(&(0x7f0000000300)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', r1, &(0x7f0000000100)='./file0\x00')
openat(r1, &(0x7f0000000140)='./file0\x00', 0x2441, 0x0)
openat(r1, &(0x7f0000000040)='./file0\x00', 0xa1d6a2368f64ac07, 0x0)


syz_mount_image$vfat(&(0x7f0000000580), &(0x7f00000005c0)='./file0\x00', 0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="00faffffff"], 0x1, 0x576, &(0x7f0000000640)="$eJzs2sFqE10YBuDvb/u3xc1kYTfiYsCNq5L2ChykhWJAqGShKwebQsmEwgQCycK6c+XC2/ByvACvo4tCJJkQE41ubBk1zwPhvHDyhm82mbM4rx/2umeX/fOPH77E7pM0NiJi4zqiMU2V/2brxjRvx6J3AQD8bU5P86zuGbhbZZnlkzPczg877U+1DAQAAAAAAAAAAMBvc/8fANaP+///vrLM8u3Z+W2Z+/8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAfa7H42T8i0/d8wEAt8/7HwDWz4uXr55lrdbRaZruRhTvB+1Bu1qr/ew8LqKITjQjiZuI8VyVj09aR810qhF7vauqP1k3l/sHkURjdf+g6qfL/f/j3mL/MJK4v7p/uLK/HY8fLfT3I4nPb+IyijiLSfdb/+1Bmj593vquvzP9HgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKyT/XSuEXu9q0F70I6YrJvV/v7P9qt+dh4XUUQnmpHETcR4rsrHJ62j5uwHlvtb8WCr3mcHAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlvWHo25eFJ1SEARhHur+ZwIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOrSH466eVF0yn7dkwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAf4r+cNTNi6JT3mGo+xkBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAuH1fAwAA//9JmLyV")
syz_mount_image$ext4(&(0x7f0000000180)='ext4\x00', &(0x7f00000000c0)='./file0\x00', 0x800714, &(0x7f00000003c0)={[{@dioread_nolock}, {@journal_dev={'journal_dev', 0x3d, 0x7}}, {@quota}, {@noinit_itable}, {@errors_continue}, {@errors_continue}, {@errors_remount}, {@delalloc}, {@auto_da_alloc}, {@norecovery}, {@errors_continue}, {@journal_ioprio={'journal_ioprio', 0x3d, 0x1}}]}, 0xee, 0x442, &(0x7f0000000d00)="$eJzs281vG0UbAPBn10n6vv1KKKW0oYVAQUR8JE1aoAcuIJA4gIQEh3IMSVqFug1qgkSrCAJC5YgqcUcckfgLOMEFASckrnBHlSqIkFo4Ba29m9punObDrkP9+0nbznjHnnk8O97xTBxA1xrK/kkidkfErxHRX83WFxiq/ndjaWHy76WFySSWl9/4I6mUu760MFkULZ63K88MpxHpJ0leSb25i5fOTpTL0xfy/Oj8uXdH5y5eenrm3MSZ6TPT58dPnjxxfOy5Z8efaUmcWVzXBz+YPXzolbeuvDZ56srbP36dtffAker52jhaZSgL/M/lisZzj7W6sg7bU5NOejrYEDakFBFZd/VWxn9/lOJm5/XHyx93tHFAW2X3ph3NTy8uA3exJDrdAqAziht99v23OO7Q1GNbuPZC9QtQFveN/Kie6Yk0L9PbxvqHIuLU4j9fZEe0aR0CAKDWt9n856nV5n9pHKgptzffQxmIiHsiYl9E3BsR+yPivohK2fsj4uAG62/cGrp1/pNe3VRg65TN/57P97bq53/F7C8GSnluTyX+3uT0THn6WP6eDEfvjiw/tkYd3730y2fNztXO/7Ijq7+YC+btuNrTsEA3NTE/0apJ6bWPIgZ7Vos/WdkJSCLiUEQMbuyl9xaJmSe+Otys0O3jX0ML9pmWv4x4vNr/i9EQfyFZe39y9H9Rnj42WlwVt/rp58uvN6t/S/G3QNb/O+uv/4YS/X8ltfu1cxuv4/Jvnzb9TrPZ678vebOyZ92XP/b+xPz8hbGIvuTVSr7u8fGbzy3yRfks/uGjq4//fflzsvgfiIjsIj4SEQ9GxEN52x+OiEci4uga8f/w4qPvbD7+9srin1r182/l+h+o7/+NJ0pnv/+mWf3r6/8TldRw/kjl8+821tvArbx3AAAA8F+RRsTuSNKRlXSajoxU/4Z/f+xMy7Nz80+enn3v/FT1NwID0ZsWK139NeuhY8li/orV/Hi+VlycP56vG39e+n8lPzI5W57qcOzQ7XY1Gf+Z30udbh3Qdn6vBd2rcfynHWoHcOe5/0P3Mv6hexn/0L1WG/8fNuTtBcDdyf0fupfxD93L+IfuZfxDV9rK7/olujkR6bZoRrsScXBbNKNziU5/MgEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALTGvwEAAP//KivtwQ==")
syz_open_procfs$namespace(0x0, &(0x7f0000000180)='ns/cgroup\x00')
mkdirat(0xffffffffffffff9c, &(0x7f0000000280)='./file0\x00', 0x0)
mount$bind(&(0x7f00000002c0)='.\x00', &(0x7f0000000200)='./file0/../file0\x00', 0x0, 0x101091, 0x0)
mount$bind(&(0x7f0000000880)='./file0\x00', &(0x7f00000007c0)='./file0/file0\x00', 0x0, 0x81105a, 0x0)
chroot(&(0x7f0000000300)='./file0/../file0/../file0/../file0\x00')
chdir(&(0x7f0000000400)='./file0\x00')
pivot_root(&(0x7f0000000000)='./file0/../file0\x00', &(0x7f00000001c0)='./file0/../file0/../file0\x00')
syz_open_procfs$namespace(0x0, &(0x7f0000000040)='ns/cgroup\x00')
openat$cgroup_ro(0xffffffffffffff9c, &(0x7f00000000c0)='memory.events\x00', 0x275a, 0x0)
r0 = epoll_create1(0x0)
syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='attr/current\x00')
epoll_create1(0x0)
r1 = dup(r0)
fcntl$dupfd(r1, 0x0, r1)
epoll_create1(0x0)
mount$afs(&(0x7f0000000140)=@cell={0x23, 'syz1:', 'syz1'}, &(0x7f0000000240)='./file0\x00', &(0x7f0000000340), 0x8, &(0x7f00000004c0)={[{@flock_strict}, {@flock_write}, {@flock_write}, {}, {}, {@flock_write}, {@flock_strict}], [{@smackfsdef={'smackfsdef', 0x3d, 'memory.events\x00'}}, {@subj_role={'subj_role', 0x3d, '-+'}}, {@appraise}, {@pcr={'pcr', 0x3d, 0x25}}, {@smackfsfloor={'smackfsfloor', 0x3d, '/&:.'}}]})
openat$cgroup_ro(0xffffffffffffff9c, &(0x7f00000001c0)='pids.current\x00', 0x275a, 0x0)
openat$6lowpan_enable(0xffffffffffffff9c, &(0x7f0000000080), 0x2, 0x0)
r2 = openat$rdma_cm(0xffffffffffffff9c, &(0x7f0000000000), 0x2, 0x0)
ioctl$F2FS_IOC_MOVE_RANGE(0xffffffffffffffff, 0xc020f509, &(0x7f0000000080)={r2})
writev(r2, &(0x7f00000000c0)=[{&(0x7f0000000080), 0xfffffebe}], 0x1)
mkdirat$binderfs(0xffffffffffffff9c, &(0x7f0000000100)='./binderfs2\x00', 0x1ff)


quotactl$Q_SYNC(0xffffffff80000101, 0x0, 0x0, 0x0)


syz_mount_image$f2fs(&(0x7f00000004c0), &(0x7f0000000080)='./file0\x00', 0x450, &(0x7f0000005f80)=ANY=[@ANYBLOB="66617374626f6f742c71756f7461000000000000003b814e50a959736d65720f73ecea54b5e5be45ace9a88f723cb005aeff24212c651baef614d442ae89412ad3dcd0b7586d02002a6d6d65cacd4fc5002207ce994dda65c4b1d23a9bd5ba0f4ce5e0b5a5718c6aa918080002223d2753a5cac974110144cd0a1e368652324a41b31e1eb3b32dccbdf8f68bd96a45a75427a5f789d267fd92f6a5540200b81d5b9fa9b40fe4d7fbd50a6afc3a989c6d60045663c59cbdc4c700000000bc7f6b22df0191acf5912afdcc1c061835177068c40f757dd123d2600b1c544f1525aa8d00000000000000000000002e8b5c733d362417c17f527c0bfebec112d57fc69fabb9b31ef97b2147931ff60cdf666c25244218b1f1a6010000000100000020563b835d0e8e9a09070ef1691fcb2f37bda5d4e3d9d7a2d0ac82b45a53001057f321acc45d5e065a461de90100000077d200000000000040b78f0dd3836f5ab2f6a1a5b798bb7752f192c6b48e568973a59cd9c74bd9a14721856c5499cd8f93f8beaa9cf76718ce7244c84268030000000000000208886b313bd01a22d576e414011a4f0a897515329f86d4585fa0ea17068f8af349696da4a2b3e24310ca52ec51bc23b57897cb55a2d513e6a00765ee3f58b471c54dd57f0af584afe4a21f92b515d7f2fa6fbb273ca0f751e684584320534667aea39ad7222c8ef531f514939177a47395e94c1723abb3fd44fd64fde4b45cc2f55f4ae05ff48648a4c998257856bcdcf2fa02010000001f54fb936570450e91c8d55abad76a7b7a000016f81ec9da9ccc1191c211632266d907e4d9b23496ae19bac24dc23c43f514f1b4af19988bbe61ee29a368a999435d6872d01b79c7821e875859dfbf3c57e4f1fb0be46cb5f7a0fa13516c0926d19dd2d5862085e1e4cb8279be17cba17ee4d06ad97b4ca282e73ea142b01b4a742fa11c0927ba811dd60903d575db449d775021b542db617086b3ed42e6e60fe043cff79b0c067c584bbf82657974c3736912b4b522052b9467d0da116ccc1652d861a420f09aaf67d3e9f6160100000001000000ae6335ad9896abd3cc00413638cb9bc62ab8054325d72e9144cf4f88702f586507e3147198e0bc4060a7c8f4dce73b653177ecf8228e6e6fae02510000000000000000000000000000f43739fdd2d24e50e0233acfe1c8639070fe00f40b0d01f8a0a35fcfe3ea10faf9c24b8488ed4ed83fb06a9a7c57442ede9e1fc2853b8f4d2241cff61d0125b7750e3fdae6a4ab9c776a191ed8098a780ea2bbaa64978cd3a6458fcc6b949bcbca0dceb7361
f66e46731eba4f3aed335e7c8c541e82453218a19d39489e1525466ac93759787e767f601931d94c9c426489b741a6bc8abf475e4bf859e1ce7f7227069e9f51e25fa3d1b18dc565180a1af464a1dd697db85e2b27b90f6bd7cf1b6bc0bcd8ba552ced3d3cfbf9c9bc04f65b6f83cb40173b4bdc393d47e5da95b63a40ac18daf11e8d0706b47795fbe2b56d0ea7ffc5a59ede88621a08b25ca6ebe041317b62373a60951af33eb7954a9731aaa125add0913ed2435a207439e9122512d77096747a4b404459cebc8faff8f7a31758e630c75a1ff90402754d339dc21cf6b8e04e1aedf14df0b4aaf0e03194df3eb41ba066bc343b323a3162d7e7ba687633c2faa8f28b42364b72e3a457476fd6b2a54e670ba798172c44c4390f73fdab743a4cac88b2bd0545b8483f2e2f9846b138a4d8a7332978da70e9050417087c5ae034a735e8b448dd9701404", @ANYRES32], 0x2, 0x5558, &(0x7f0000000a00)="$eJzs3EtvG9UXAPBju2n/ff4jxIJdR6qQEqm26vQh2BVoxUO0qngsWIFju5Zb2xPFrhOyQoIlYsE3QSCxQmLDZ2DBmh1iAWKHBPLMmBLCo5XdOGl/P2l85t65PnPvyEp0ZiwH8MRaTn75qRRn4nhEVCLiVCmy/VKxZa7m4ZmIOBsR5T9tpaL/j46jEXEiIs5Mkkd8PczHTA59en587vKPr/381bfHjpz87MvvFrdqYNGejYj+Rr6/1c9j2snjnaK/Me5msX9pXMT8QP9u0U7zuNVezzJsNabjGlm82MnHpxv3hpN4u9doTmKnezvr3xjkJxyOO9M82RvuNDazdqu9nsXuMM1iZyef1/ZO/vdyZzjK87SKfO9n6WM0msa8v73dztezcTeLzcGo6M/zpq329iSOi1icLpppr5XNY32WK32wvd4d3NtOxu3NYTcdJJdr9edq9SvV+mbaao/al6qNfuvKpWSl05sMq47ajf7VTpp2eu1aM+2vJiudZrNarycr19rr3cYgqddrF2sXqpdXi73zycs33056rWRlEl/sDu4d7faGye10M8nfsZqs1S4+v5qcqydv3riV3Hrj+vUbt95699o7N1+48epLxaA900pW1i6srVXrF6pr9dUDsP7J/90HXP9olvV/VEz6IdZfmu3ywL/zAQN4aHvq/5hv/V8J9T+w12Gv/2Oe9f+kpFL//3f9W569/p+p/j2o9f8hXj/MRP0PAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPDE+n7p81eyneW8fbLoP110PVW0SxFRjojf/kYlju7KWSnyLP3D+KW/zOGbUmQZJuc4VmwnIuJqsf36/0d9FQAAAODx9cUHZz/Jq/X8ZXnRE2I/5Tdtyqfem1O+UkQsLf8wp2zlycvTc0qWfb6PxPacsmU3sP43p2T5Lbcj88r2QCrT8OHp+53Zgkp5KO/rdAAAgH1R2RX2twoBAABgP3286AmwGKWYPsqcPgvOvnl//9Hm8V3HAAAAgEOotOgJAAAAAI9cVv/7/T8AAAB4vOW//wcAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPzOzv3kpg1EcQB+NhjoPxVV3fcq3cExeoQuuywcoJfgCPQKuQBnILvss4kgwh4hOQIpCuNYoO+TbDM2+s0MsHljYQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOjSfbWe///749+lObv9ZfLMBgAAADhlW63n9Ytp0/6Uzn9Jp76ldhERZUScqt0HMWplDlJOdeb91Ysx3EXUCYc+xmn7GBE/0/b0tetPAQAAAG7XZrmaNdV6s5v2PSDeU7NoU37+lSmviIhq+pAprTzsvmcKq3/fw/iTKa1ewJpkCmuW3Ianr41yddI2aB3STCaL+kusW2U3/QIAAH1qVwJnqhAAAABuwO++B0A/iuPueJ9x3BzSDcEPrRYAAABwhYq+BwAAAAB0rq7/r+H5f4/+lgAAAABv1jz/DwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgC5tq/V8s1zNzl1fvDJnt79MvhkBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA8sz/3KBACYQxAs4u/ncz9DysRLa1t3oOBkDDFBwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA8OZ3v/yfcDVHkqltw9x6JFk6NaydGrbODXs/jK+vAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABO9uclBUIgCKJgzvjfSd//sJKgZxAhAhoeVdSiAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+6He//J+YGmeSudPG0vFIsnbV2Lpq7D1oHD0Yb/8GAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAudu6fN24yDAD4c77z9Q8gQkAZAqhIDLDQ5FpaOsIAihj4CEhReimBK4U2A60iUBaYUOYuCEaEkEBh63fo3EhdytYhQ5CYQfbZV7e90qM09tH8ftJ773O2877P67OiPLETAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAKO2+Ey8lRdzOXmaGcbntxt7GStbv3NNnrm3dnM9aFrceNtE3bz/+5Kfby9U3x+Yqb76qPxkAAAAOhnZZ30fErXR7KeuTmbz+T8tjspr/+2eGcVnP31v37+xtHC52zZf1/2+/3n5hNNHMcJ5s0NW1QX/x/lQ6+7TEqffsQ4/o5Gc+/91LO/9Akvc3n99N8/PZ+vb69Xe7eXiojmwBgEdxvOyLoPx5KOt7TSYGwIHRqRTeZf3fnmk2JwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIA67G7GU2Xcioj5zp04s7O3sTKuv7Z1c75sp69e3aqOmQ2RRsTq2qCf1riWaXfp8pVPlgeD/sWxQcQDd/334FhE7M/IDwhizK4PJ/jyiH8+prg8o55V/LugNR1pNBokxedTbOnWe9Xtb1Bee49/5Ia+IQEA8MRKi5bV9bfS7aVsW2s24q8f7q7/X6vEMWH9f/uj0zeqc1Xr/15tK5x+C+vnP1u4dPnKG2vnl8/1z/U/ffNE763eyTOnTp1ZyM7V4sJqJP3FptMEAADgf6xbtGr9n8zef///aCWOCev/z7/rfVmdq63+H+vOTb+mMwEAADiIuqPouVf+/KM15ohWtxtfLK+vX+wNX0fvTwxfa033ER0qWrX+b882nRUAAABQh93N1l33/89W4pjw/v/TP774c3XMdkQcibgQEf3jKxcGZ+tbzlSr4w+V84m6Ta8UAACAphwpWvX+f5o//5+MHnlIIuL1V4dx+b+uJqn/2+99/VN1rurz/yfrW+JUSuaG5yPv5yI6c01nBAAAwJPscNGyYv/3dHvp41+OftD1/D8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABA3f4OAAD//1AjNPw=")
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='./file1\x00', 0x105042, 0x0)
openat$sysfs(0xffffffffffffff9c, 0x0, 0x0, 0x0)
openat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
getdents64(0xffffffffffffffff, 0xfffffffffffffffe, 0x29)
mmap$IORING_OFF_SQ_RING(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x0, 0x11, r0, 0x0)
msync(&(0x7f0000000000/0x1000)=nil, 0x1000, 0x4)


open(&(0x7f0000000100)='./file0\x00', 0x101bff, 0x0)
r0 = open$dir(&(0x7f0000000240)='.\x00', 0x0, 0x0)
renameat(r0, &(0x7f0000000400)='./file0\x00', r0, &(0x7f0000000440)='./file1\x00')


r0 = openat$audio(0xffffffffffffff9c, &(0x7f0000000000), 0x40b41, 0x0)
readahead(r0, 0x0, 0x0)
fcntl$getownex(0xffffffffffffffff, 0x10, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000080)='comm\x00')
syncfs(r1)
r2 = openat$cuse(0xffffffffffffff9c, &(0x7f00000000c0), 0x2, 0x0)
read$FUSE(r1, &(0x7f0000000100)={0x2020, 0x0, <r3=>0x0}, 0x2020)
write$FUSE_LK(r2, &(0x7f0000002140)={0x28, 0xffffffffffffffda, r3, {{0x8, 0x5, 0x2}}}, 0x28)
r4 = fsmount(r1, 0x0, 0xc)
splice(r4, &(0x7f0000002180)=0x4, 0xffffffffffffffff, &(0x7f00000021c0)=0x1f, 0x7fffffff, 0x4)
close(r4)
write$FUSE_INIT(r1, &(0x7f0000002200)={0x50, 0x0, r3, {0x7, 0x28, 0xff, 0x100d008, 0x101, 0x3, 0x7fffffff, 0xe259}}, 0x50)
pread64(r1, &(0x7f0000002280)=""/156, 0x9c, 0x5)
write$FUSE_LSEEK(r2, &(0x7f0000002340)={0x18, 0x429fd2b2c8c4586c, r3, {0x4}}, 0x18)
ioctl$FS_IOC_GETFSLABEL(r2, 0x81009431, &(0x7f0000002380))
r5 = openat$rdma_cm(0xffffffffffffff9c, &(0x7f0000002480), 0x2, 0x0)
ioctl$BTRFS_IOC_SYNC(r4, 0x9408, 0x0)
write$6lowpan_enable(r4, &(0x7f00000024c0)='1', 0x1)
r6 = openat$btrfs_control(0xffffffffffffff9c, &(0x7f0000002500), 0x0, 0x0)
preadv(0xffffffffffffffff, 0x0, 0x0, 0x80000000, 0x4a9)
splice(r0, &(0x7f0000003780)=0x4, r1, 0x0, 0x0, 0x0)
r7 = openat$dir(0xffffffffffffff9c, 0x0, 0x400, 0x80)
write$FUSE_INIT(r6, &(0x7f0000003840)={0x50, 0x0, r3, {0x7, 0x28, 0x8000, 0x1206f40, 0xfffd, 0x6, 0x1, 0x1}}, 0x50)
getdents(r7, &(0x7f00000038c0)=""/85, 0x55)
ioctl$BTRFS_IOC_BALANCE_PROGRESS(r5, 0x84009422, &(0x7f0000003940)={0x0, 0x0, {0x0, @usage, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @struct}, {}, {0x0, @usage, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @struct}})
ioctl$FS_IOC_ENABLE_VERITY(r7, 0x40806685, &(0x7f0000003f00)={0x1, 0x0, 0x1000, 0xf7, &(0x7f0000003d40)="6e999826adc5b54572051a5529303e5ba8e875cc6b9a3f4aee183f90cc6ac4fa2b1068463db92cab90074cef4e876cc398234423398e11a927fb6dffa8c710e430a558fa08ad601ddd61526184987d088cc71f2bbfb0a247abd457a901a2fdf65ef11109a2030fd3245ca1dfc7121f5e1b0e4ac3632a635c179f80de88621bb68f15c213af68fb35316c872adc6da6d16b6989df81d874885206a713c79a7aef9242c3d4274434a1859349e1b921dc59ff6fb8a8c770ff24a142b1cdec927e5535b80913829c0232e1c0b70b1dfc60dd7d11e0714291087af1450a41d50ec8a636d10579b1a15534f81e93596e1d16fb4f239628414030", 0x9b, 0x0, &(0x7f0000003e40)="1635c5f4ecdcd31e8f6ed59c1d525b1ef0e0cda094ec88086f4c881450e14ea1a2cd8eccc043d7084787b0233f31bb92cf6691c956ecb97f03c5b1592a8ee102d73a310933dacaf116a4c54cdeec04a1968bc6ab0b6f79d9c0f2109e2fb1b3cb9a07b337ce6ce6b9d16f80fa0d413d7e1ae048534bb0e7308f98fb93176f05c7f092b3219e9720baeb0418469a5f7a4cc7a7aace49d0dfa55548c8"})
ioctl$BTRFS_IOC_START_SYNC(r5, 0x80089418, &(0x7f0000003f80))
ioctl$FS_IOC_READ_VERITY_METADATA(r7, 0xc0286687, &(0x7f0000004040)={0x3, 0xdd34, 0x41, &(0x7f0000003fc0)=""/65})
write$binfmt_aout(r6, &(0x7f0000004080)={{0x0, 0x2, 0xbc, 0x110, 0x192, 0x81, 0x96cd, 0xfffffffb}, "75be70cc9f1b6cac76eae7235419cce7e06c0ec8af45910b2ee8d727a86458d86cb516edb6ebddea70327822b73f0b15fa96aca3790811006db7c6e4a2b7ce20c337f19376545c863d0ac045fb5a7d0f", ['\x00']}, 0x170)
open(&(0x7f0000004200)='./file0\x00', 0x40000, 0x116)


mknod(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
name_to_handle_at(0xffffffffffffff9c, &(0x7f00000000c0)='./file0\x00', &(0x7f00000001c0)=@reiserfs_6={0x18}, &(0x7f0000000200), 0x0)


r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x2c41, 0x0)
flock(r0, 0x5)
r1 = openat(0xffffffffffffff9c, &(0x7f00000001c0)='./file0\x00', 0x0, 0x0)
flock(r1, 0x2)
r2 = openat(0xffffffffffffff9c, &(0x7f00000001c0)='./file0\x00', 0x0, 0x0)
flock(r2, 0x2)


r0 = epoll_create(0x100)
r1 = openat$dsp(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
epoll_ctl$EPOLL_CTL_ADD(r0, 0x1, r1, &(0x7f0000000240))
syz_mount_image$xfs(&(0x7f0000009800), &(0x7f0000009840)='./file0\x00', 0x300c0c5, &(0x7f0000000340), 0x9, 0x980a, &(0x7f0000009880)="$eJzs3QXYZHXBsPFnl6UbCwNZGotuDEJBBAEJKSWkJQWUMAARBEQBSUERUKREkO7u7u7u7vyuZXcV1xte/b73+nhf7/u+rueZmTNnznPm/ztznmf2zM4sv9DSCwwMTDQwvJGnf+/glz995OYvL7DD0dcf9tLChy565IjJYww/GTJ0xOkUI06nHBgYGDRiOYOGTxs85PgTBg+MNTAw+J3LHW/scQeNNzCwxoiL8404nXP4yfjXjJzvrVHiFR22GoN2Hv41/IcODAyMPezMMwMzLvfO5YxYr5n+6Y5KW37+hRf6u9Xf3IaN35AR59/5Ncbwr/EvGxgY/9IB3j4Gvb/36O2fP9GeF6yw4vu8Hv8rWn7+hRcZxX/YY3G0EdPmHPYYH/UxaGzU7XzonHeM3AcNwh3n/5KWn3+hxQbefT8/sPKbB2z61tv7zcHjDAwMHndgYPB4AwODxx8YGDzBwMDgCd9vl/p/633d+Kqqqup9af4FZl1g2PO9Uf4eGGvk37X0d+FKbyxxx8DAwOjD5xk8x8jnglVVVVVVVVX1P7P5F5h1QXj+P9F7Pf8/7tRXp+/5f1VVVVVVVdX/nhaZf4FZhz2PH+X5/6Tv9fx/mwdeXH/Ea//nHH6rN9/fO1FVVVVVVVVV79lSX3/7+f+4ozz/H/pez/8fnnenB0fMN/LfDd54xyIHveP1BK+9Y/po75j/1XdMH/KO6e+cf4yBgcFDRkx//e+TB48z7Lp/nn/wRMPWe8T0V94xfaa////mIVO/Y/rM75g+7TumzzJiXYdNn+4d02d/x/zT/ytjW1VVVVVVVVVVVVVVVVVV9f+rN5849cy/v+f7Jwfe8f7tf3sf9xGvCxh0zNlXX/2+rej/jAb98+shtnm/1+n/tWHOYx0xdGBgw2Xf71Wp96H3+/Ma6v0tf3f5u8vfXf7u8neXv7v83eXvLn93+bvL313+7vIX9y7H///2/v+THbrGyP8zP+fBu04+wTtvu+47zo/1/3Wt37f+U4//D6w7aGBghO9Ew1wXn3+pZaYfGBiYYPJdD55t4G/XzTXsunknGW3EGyQMfxuEaYbwgrcZ8ano24yYdcQyjnl7+Yu8ddBog0ZZiXc08YmHHLLO8i/NNurptO9+PwaPPPOhS7ZYcOR7WQweZaZ3205HLn/kfRnVecS6Tz9s3WfcbIONZ9x0y60+t+4Gq6295tprbjj3nHPONfucc8w9y4xrrbv+mjMN//5uYzb07e9T/StjNumoY/bE/O8cs1Hv27uN2dD3HrO3lzjmHqusN3LMRl21/2rMpnrvMRu67ogfNGTo6AOrvj02gwYGhkw9+sAWwy7MPObAwJBpRsw76bB5Pz/J4IGB3f5+R4edG/Nv2+CgbYbNs/xCSy/w94/d/+eP33+Xz7MfY/jJkBFDMmSKEadTDv8xEw38fVMcPOT4EwYPu+//MMzjjT3uoPEGBtYYcXG+Eadzj7j2ypHzvcvnrI+yom+/zcrOw7/+5jH2sDNHLTnhTe9czrsQ/Hf0f/X7/5+85hr0t4EaNOJrxDzDveZfeJG//6y3h2G+ke8JM+x3yzCT/4ePsf8v+6f1HTrW229y+27r+x7vizv8/sP2Ne1j5w3973pfXN53LPb296H/yr5j4L33HaP9/ezfp655xWSj7jsWffdV/IfHxcgxGnOUmd5t3zH9vvNv/fa+6b33HYutO+KDhv6+7xg8MDBkqpH7jmE7kmlHH9ht2IVZhl2YbvSBw4ZdmPXtC2MPnD3swgyrb7T+GoPeftueEcudadhy55tk0Nt3/uoZ5z9ytD3femvqEesy0yg71hHbx9B3/n6cf5LhwzbytrTcuzcbft20I5Y787+x3JG3peXOt
cnw66YbsdxZRlnuaO+x3JG3Hfl4GDbr24+H6Qf9wws/4fG70CiP30Ejd+vvuMnIrzGGf41/2cjPhoLHy7+136HH70Tvsb7v8blWuL0Nm7bd4lec+t/1uVa0vmO99/q+2+dwv+v6rjf2Fkv8N6zvoHes7z9sZxvPM3xbmX7Edjb7v7H9jrztqPux0d++djjD9P/Kfmy+f9qPbTva4FFW/h29299Aa8D8w89P+relzX34aBOOHPvRR1nuf/U30PTvvR+baN1RbrfQgQODaMwffvyQdcb8L8Z89IF//Ft95JiPvO17jfl0/zzmg/5pzOd67zH/V//unH6q4deP9h5jPnjHTRcfOeZj/JtjPt2/O+bzDYxGY77bfsPH7b32p+825iNv+49jPmTgKwMDA1OPGPNp/5XtfKb/nu18HJh/+PkH/jbpksnO2OUd+5hB/86YT/vvjvnQv23nU7993ZSDB8YYY2CL1TbbbJOZh38feXGW4d95X/T0/cPH+b1+l76b0cjbvtfjYup/xWjof4/Rx4b88/wjn3SMvLzs2UdN/H+7L5r63zMatNAA74tOvmj4uL3X30XvNuYjb0u/B4e+4/ajPq97j/ejxPs0bNo+++51+8hFjrhZ70f5r/cPm+3If+Nbd9SdfP2n1r//u8vfXf7u8neXv7h3Of7/t/f/f3iipR8b8SR59N3nWXv193t93+f+o4//j/D9h+P/q689z+7DngqOuO49j88On+d/5PHZOYefjH/NyPlGPT7IK/rux2en22OGpQb+/xyf/b9q5GP1X3ge3/7fXf7u8neXv7v83eXvLn93+bvL313+7vJ3l7+7/N3lL+5djv+PfI3y0P3fuH+bEQdCR997r9XfeL/X933uP/r4/wjffzj+/8bqe+09eOBv173n8f/h8ziO/y+x/54nDPwPPv4/8rHa8f/6L8rfXf7u8neXv7v83eXvLn93+bvL313+7vJ3l7+7/MW9y/H/+Ua+DmCOVdbaeeTrATac+Uvbv9/r+z73n3r8v8//99b+313+7vJ3l7+7/N3l7y5/d/m7y99d/u7yd5e/u/zd5S9uxPH/gVE+1nHJtgsMjv//7+5d/L+RP2bxXyp/zOK/dP6YxX+Z/DGL/7L5Yxb/b+aPWfyXyx+z+C+fP2bxXyF/zOK/Yv6YxX+l/DGL/7fyxyz+384fs/ivnD9m8V8lf8ziv2r+mMV/tfwxi/938scs/qvnj1n818gfs/ivmT9m8V8rf8ziv3b+mMV/nfwxi/+6+WMW/+/mj1n818sfs/ivnz9m8d8gf8ziv2H+mMV/o/wxi//G+WMW/+/lj1n8N8kfs/hvmj9m8d8sf8zi//38MYv/D/LHLP6b549Z/LfIH7P4b5k/ZvHfKn/M4v/D/DGL/4/yxyz+P84fs/j/JH/M4r91/pjFf5v8MYv/tvljFv+f5o9Z/LfLH7P4/yx/zOK/ff6YxX+H/DGL/8/zxyz+O+aPWfx3yh+z+O+cP2bx/0X+mMV/l/wxi/8v88cs/r/KH7P475o/ZvHfLX/M4r97/pjF/9f5Yxb/PfLHLP575o9Z/PfKH7P4750/ZvHfJ3/M4r9v/pjF/zf5Yxb//fLHLP77549Z/H+bP2bx/13+mMX/gPwxi//v88cs/gfmj1n8D8ofs/gfnD9m8f9D/pjF/4/5Yxb/Q/LHLP5/yh+z+B+aP2bxPyx/zOJ/eP6Yxf+I/DGL/5H5Yxb/P+ePWfyPyh+z+P8lf8zif3T+mMX/mPwxi/9f88cs/sfmj1n8j8sfs/gfnz9m8T8hf8zif2L+mMX/pPwxi//J+WMW/1Pyxyz+p+aPWfxPyx+z+J+eP2bxPyN/zOJ/Zv6Yxf+s/DGL/9n5Yxb/c/LHLP7n5o9Z/M/LH7P4n58/ZvG/IH/M4n9h/pjF/6L8MYv/xfljFv9L8scs/pfmj1n8L8sfs/hfnj9m8b8if8zif2X+mMX/qvwxi//V+WMW/2vyxyz+1+aPWfyvyx+z+
F+fP2bxvyF/zOJ/Y/6Yxf+m/DGL/835Yxb/W/LHLP635o9Z/G/LH7P4354/ZvG/I3/M4n9n/pjF/678MYv/3fljFv978scs/vfmj1n878sfs/jfnz9m8X8gf8zi/2D+mMX/ofwxi//D+WMW/0fyxyz+j+aPWfwfyx+z+D+eP2bxfyJ/zOL/ZP6Yxf+p/DGL/9P5Yxb/Z/LHLP7P5o9Z/J/LH7P4P58/ZvF/IX/M4v9i/pjF/6X8MYv/y/ljFv9X8scs/q/mj1n8X8sfs/i/nj9m8X8jf8zi/2b+mMX/rfwxif+ggfwxi/+g/DGL/+D8MYv/aPljFv8h+WMW/9Hzxyz+Y+SPWfzHzB+z+I+VP2bxHzt/zOI/Tv6YxX/c/DGL/3j5Yxb/8fPHLP4T5I9Z/CfMH7P4T5Q/ZvGfOH/M4j9J/pjF/wP5Yxb/D+aPWfw/lD9m8f9w/pjF/yP5Yxb/SfPHLP4fzR+z+H8sf8zi//H8MYv/J/LHLP6T5Y9Z/D+ZP2bxnzx/zOI/NH/M4j9F/pjFf8r8MYv/VPljFv+p88cs/tPkj1n8p80fs/hPlz9m8Z8+f8zi/6n8MYv/p/PHLP6fyR+z+H82f8zi/7n8MYv/DPljFv8Z88cs/jPlj1n8Z84fs/jPkj9m8Z81f8ziP1v+mMV/9vwxi/8c+WMW/znzxyz+c+WPWfznzh+z+M+TP2bxnzd/zOL/+fwxi/8X8scs/l/MH7P4fyl/zOI/X/6YxX/+/DGL/wL5Yxb/BfPHLP5fzh+z+H8lf8ziv1D+mMV/4fwxi/9X88cs/ovkj1n8v5Y/ZvFfNH/M4r9Y/pjF/+v5Yxb/xfPHLP5L5I9Z/JfMH7P4fyN/zOK/VP6YxX/p/DGL/zL5Yxb/ZfPHLP7fzB+z+C+XP2bxXz5/zOK/Qv6YxX/F/DGL/0r5Yxb/b+WPWfy/nT9m8V85f8ziv0r+mMV/1fwxi/9q+WMW/+/kj1n8V88fs/ivkT9m8V8zf8ziv1b+mMV/7fwxi/86+WMW/3Xzxyz+380fs/ivlz9m8V8/f8ziv0H+mMV/w/wxi/9G+WMW/43zxyz+38sfs/hvkj9m8d80f8ziv1n+mMX/+/ljFv8f5I9Z/DfPH7P4b5E/ZvHfMn/M4r9V/pjF/4f5Yxb/H+WPWfx/nD9m8f9J/pjFf+v8MYv/NvljFv9t88cs/j/NH7P4b5c/ZvH/Wf6YxX/7/DGL/w75Yxb/n+ePWfx3zB+z+O+UP2bx3zl/zOL/i/wxi/8u+WMW/1/mj1n8f5U/ZvHfNX/M4r9b/pjFf/f8MYv/r/PHLP575I9Z/PfMH7P475U/ZvHfO3/M4r9P/pjFf9/8MYv/b/LHLP775Y9Z/PfPH7P4/zZ/zOL/u/wxi/8B+WMW/9/nj1n8D8wfs/gflD9m8T84f8zi/4f8MYv/H/PHLP6H5I9Z/P+UP2bxPzR/zOJ/WP6Yxf/w/DGL/xH5Yxb/I/PHLP5/zh+z+B+VP2bx/0v+mMX/6Pwxi/8x+WMW/7/mj1n8j80fs/gflz9m8T8+f8zif0L+mMX/xPwxi/9J+WMW/5Pzxyz+p+SPWfxPzR+z+J+WP2bxPz1/zOJ/Rv6Yxf/M/DGL/1n5Yxb/s/PHLP7n5I9Z/M/NH7P4n5c/ZvE/P3/M4n9B/pjF/8L8MYv/RfljFv+L88cs/pfkj1n8L80fs/hflj9m8b88f8zif0X+mMX/yvwxi/9V+WMW/6vzxyz+1+SPWfyvzR+z+F+XP2bxvz5/zOJ/Q/6Yxf/G/DGL/035Yxb/m/PHLP635I9Z/G/NH7P435Y/ZvG/PX/M4n9H/pjF/878MYv/XfljFv+788cs/vfkj1n8780fs/jflz9m8b8/f8zi/0D+mMX/wfwxi/9D+WMW/4fzxyz+j+SPWfwfzR+z+D+WP2bxfzx/zOL/RP6Yxf/J/DGL/1P5Yxb/p/PHLP7P5I9Z/J/NH7P4P5c/ZvF/Pn/M4v9C/pjF/8X8MYv/S
/ljFv+X88cs/q/kj1n8X80fs/i/lj9m8X89f8zi/0b+mMX/zfwxi/9b+WMS/7fP5v/PWfwH5Y9Z/Afnj1n8R8sfs/gPyR+z+I+eP2bxHyN/zOI/Zv6YxX+s/DGL/9j5Yxb/cfLHLP7j5o9Z/MfLH7P4j58/ZvGfIH/M4j9h/pjFf6L8MYv/xPljFv9J8scs/h/IH7P4fzB/zOL/ofwxi/+H88cs/h/JH7P4T5o/ZvH/aP6Yxf9j+WMW/4/nj1n8P5E/ZvGfLH/M4v/J/DGL/+T5Yxb/ofljFv8p8scs/lPmj1n8p8ofs/hPnT9m8Z8mf8ziP23+mMV/uvwxi//0+WMW/0/lj1n8P50/ZvH/TP6Yxf+z+WMW/8/lj1n8Z8gfs/jPmD9m8Z8pf8ziP3P+mMV/lvwxi/+s+WMW/9nyxyz+s+ePWfznyB+z+M+ZP2bxnyt/zOI/d/6YxX+e/DGL/7z5Yxb/z+ePWfy/kD9m8f9i/pjF/0v5Yxb/+fLHLP7z549Z/BfIH7P4L5g/ZvH/cv6Yxf8r+WMW/4Xyxyz+C+ePWfy/mj9m8V8kf8zi/7X8MYv/ovljFv/F8scs/l/PH7P4L54/ZvFfIn/M4r9k/pjF/xv5Yxb/pfLHLP5L549Z/JfJH7P4L5s/ZvH/Zv6YxX+5/DGL//L5Yxb/FfLHLP4r5o9Z/FfKH7P4fyt/zOL/7fwxi//K+WMW/1Xyxyz+q+aPWfxXyx+z+H8nf8ziv3r+mMV/jfwxi/+a+WMW/7Xyxyz+a+ePWfzXyR+z+K+bP2bx/27+mMV/vfwxi//6+WMW/w3yxyz+G+aPWfw3yh+z+G+cP2bx/17+mMV/k/wxi/+m+WMW/83yxyz+388fs/j/IH/M4r95/pjFf4v8MYv/lvljFv+t8scs/j/MH7P4/yh/zOL/4/wxi/9P8scs/lvnj1n8t8kfs/hvmz9m8f9p/pjFf7v8MYv/z/LHLP7b549Z/HfIH7P4/zx/zOK/Y/6YxX+n/DGL/875Yxb/X+SPWfx3yR+z+P8yf8zi/6v8MYv/rvljFv/d8scs/rvnj1n8f50/ZvHfI3/M4r9n/pjFf6/8MYv/3vljFv998scs/vvmj1n8f5M/ZvHfL3/M4r9//pjF/7f5Yxb/3+WPWfwPyB+z+P8+f8zif2D+mMX/oPwxi//B+WMW/z/kj1n8/5g/ZvE/JH/M4v+n/DGL/6H5Yxb/w/LHLP6H549Z/I/IH7P4H5k/ZvH/c/6Yxf+o/DGL/1/yxyz+R+ePWfyPyR+z+P81f8zif2z+mMX/uPwxi//x+WMW/xPyxyz+J+aPWfxPyh+z+J+cP2bxPyV/zOJ/av6Yxf+0/DGL/+n5Yxb/M/LHLP5n5o9Z/M/KH7P4n50/ZvE/J3/M4n9u/pjF/7z8MYv/+fljFv8L8scs/hfmj1n8L8ofs/hfnD9m8b8kf8zif2n+mMX/svwxi//l+WMW/yvyxyz+V+aPWfyvyh+z+F+dP2bxvyZ/zOJ/bf6Yxf+6/DGL//X5Yxb/G/LHLP435o9Z/G/KH7P435w/ZvG/JX/M4n9r/pjF/7b8MYv/7fljFv878scs/nfmj1n878ofs/jfnT9m8b8nf8zif2/+mMX/vvwxi//9+WMW/wfyxyz+D+aPWfwfyh+z+D+cP2bxfyR/zOL/aP6Yxf+x/DGL/+P5Yxb/J/LHLP5P5o9Z/J/KH7P4P50/ZvF/Jn/M4v9s/pjF/7n8MYv/8/ljFv8X8scs/i/mj1n8X8ofs/i/nD9m8X8lf8zi/2r+mMX/tfwxi//r+WMW/zfyxyz+b+aPWfzfyh+T+I82kD9m8R+UP2bxH5w/ZvEfLX/M4j8kf8ziP3r+mMV/jPwxi/+Y+WMW/7Hyxyz+Y+ePWfzHyR+z+I+bP2bxHy9/zOI/fv6YxX+C/DGL/4T5Yxb/ifLHLP4T549Z/CfJH7P4fyB/zOL/wfwxi/+H8scs/h/OH7P4fyR/zOI/af6Yxf+j+WMW/
4/lj1n8P54/ZvH/RP6YxX+y/DGL/yfzxyz+k+ePWfyH5o9Z/KfIH7P4T5k/ZvGfKn/M4j91/pjFf5r8MYv/tPljFv/p8scs/tPnj1n8P5U/ZvH/dP6Yxf8z+WMW/8/mj1n8P5c/ZvGfIX/M4j9j/pjFf6b8MYv/zPljFv9Z8scs/rPmj1n8Z8sfs/jPnj9m8Z8jf8ziP2f+mMV/rvwxi//c+WMW/3nyxyz+8+aPWfw/nz9m8f9C/pjF/4v5Yxb/L+WPWfznyx+z+M+fP2bxXyB/zOK/YP6Yxf/L+WMW/6/kj1n8F8ofs/gvnD9m8f9q/pjFf5H8MYv/1/LHLP6L5o9Z/BfLH7P4fz1/zOK/eP6YxX+J/DGL/5L5Yxb/b+SPWfyXyh+z+C+dP2bxXyZ/zOK/bP6Yxf+b+WMW/+Xyxyz+y+ePWfxXyB+z+K+YP2bxXyl/zOL/rfwxi/+388cs/ivnj1n8V8kfs/ivmj9m8V8tf8zi/538MYv/6vljFv818scs/mvmj1n818ofs/ivnT9m8V8nf8ziv27+mMX/u/ljFv/18scs/uvnj1n8N8gfs/hvmD9m8d8of8ziv3H+mMX/e/ljFv9N8scs/pvmj1n8N8sfs/h/P3/M4v+D/DGL/+b5Yxb/LfLHLP5b5o9Z/LfKH7P4/zB/zOL/o/wxi/+P88cs/j/JH7P4b50/ZvHfJn/M4r9t/pjF/6f5Yxb/7fLHLP4/yx+z+G+fP2bx3yF/zOL/8/wxi/+O+WMW/53yxyz+O+ePWfx/kT9m8d8lf8zi/8v8MYv/r/LHLP675o9Z/HfLH7P4754/ZvH/df6YxX+P/DGL/575Yxb/vfLHLP57549Z/PfJH7P475s/ZvH/Tf6YxX+//DGL//75Yxb/3+aPWfx/lz9m8T8gf8zi//v8MYv/gfljFv+D8scs/gfnj1n8/5A/ZvH/Y/6Yxf+Q/DGL/5/yxyz+h+aPWfwPyx+z+B+eP2bxPyJ/zOJ/ZP6Yxf/P+WMW/6Pyxyz+f8kfs/gfnT9m8T8mf8zi/9f8MYv/sfljFv/j8scs/sfnj1n8T8gfs/ifmD9m8T8pf8zif3L+mMX/lPwxi/+p+WMW/9Pyxyz+p+ePWfzPyB+z+J+ZP2bxPyt/zOJ/dv6Yxf+c/DGL/7n5Yxb/8/LHLP7n549Z/C/IH7P4X5g/ZvG/KH/M4n9x/pjF/5L8MYv/pfljFv/L8scs/pfnj1n8r8gfs/hfmT9m8b8qf8zif3X+mMX/mvwxi/+1+WMW/+vyxyz+1+ePWfxvyB+z+N+YP2bxvyl/zOJ/c/6Yxf+W/DGL/635Yxb/2/LHLP63549Z/O/IH7P435k/ZvG/K3/M4n93/pjF/578MYv/vfljFv/78scs/vfnj1n8H8gfs/g/mD9m8X8of8zi/3D+mMX/kfwxi/+j+WMW/8fyxyz+j+ePWfyfyB+z+D+ZP2bxfyp/zOL/dP6Yxf+Z/DGL/7P5Yxb/5/LHLP7P549Z/F/IH7P4v5g/ZvF/KX/M4v9y/pjF/5X8MYv/q/ljFv/X8scs/q/nj1n838gfs/i/mT9m8X8rf0ziP2Qgf8ziPyh/zOI/OH/M4j9a/pjFf0j+mMV/9Pwxi/8Y+WMW/zHzxyz+Y+WPWfzHzh+z+I+TP2bxHzd/zOI/Xv6YxX/8/DGL/wT5Yxb/CfPHLP4T5Y9Z/CfOH7P4T5I/ZvH/QP6Yxf+D+WMW/w/lj1n8P5w/ZvH/SP6YxX/S/DGL/0fzxyz+H8sfs/h/PH/M4v+J/DGL/2T5Yxb/T+aPWfwnzx+z+A/NH7P4T5E/ZvGfMn/M4j9V/pjFf+r8MYv/NPljFv9p88cs/tPlj1n8p88fs/h/Kn/M4v/p/DGL/2fyxyz+n80fs/h/Ln/M4j9D/pjFf8b8MYv/TPljFv+Z88cs/rPkj1n8Z80fs/jPlj9m8Z89f8ziP0f+mMV/zvwxi/9c+WMW/7nzxyz+8+SPWfznzR+z+H8+f8zi/
4X8MYv/F/PHLP5fyh+z+M+XP2bxnz9/zOK/QP6YxX/B/DGL/5fzxyz+X8kfs/gvlD9m8V84f8zi/9X8MYv/IvljFv+v5Y9Z/BfNH7P4L5Y/ZvH/ev6YxX/x/DGL/xL5Yxb/JfPHLP7fyB+z+C+VP2bxXzp/zOK/TP6YxX/Z/DGL/zfzxyz+y+WPWfyXzx+z+K+QP2bxXzF/zOK/Uv6Yxf9b+WMW/2/nj1n8V84fs/ivkj9m8V81f8ziv1r+mMX/O/ljFv/V88cs/mvkj1n818wfs/ivlT9m8V87f8ziv07+mMV/3fwxi/9388cs/uvlj1n8188fs/hvkD9m8d8wf8ziv1H+mMV/4/wxi//38scs/pvkj1n8N80fs/hvlj9m8f9+/pjF/wf5Yxb/zfPHLP5b5I9Z/LfMH7P4b5U/ZvH/Yf6Yxf9H+WMW/x/nj1n8f5I/ZvHfOn/M4r9N/pjFf9v8MYv/T/PHLP7b5Y9Z/H+WP2bx3z5/zOK/Q/6Yxf/n+WMW/x3zxyz+O+WPWfx3zh+z+P8if8ziv0v+mMX/l/ljFv9f5Y9Z/HfNH7P475Y/ZvHfPX/M4v/r/DGL/x75Yxb/PfPHLP575Y9Z/PfOH7P475M/ZvHfN3/M4v+b/DGL/375Yxb//fPHLP6/zR+z+P8uf8zif0D+mMX/9/ljFv8D88cs/gflj1n8D84fs/j/IX/M4v/H/DGL/yH5Yxb/P+WPWfwPzR+z+B+WP2bxPzx/zOJ/RP6Yxf/I/DGL/5/zxyz+R+WPWfz/kj9m8T86f8zif0z+mMX/r/ljFv9j88cs/sflj1n8j88fs/ifkD9m8T8xf8zif1L+mMX/5Pwxi/8p+WMW/1Pzxyz+p+WPWfxPzx+z+J+RP2bxPzN/zOJ/Vv6Yxf/s/DGL/zn5Yxb/c/PHLP7n5Y9Z/M/PH7P4X5A/ZvG/MH/M4n9R/pjF/+L8MYv/JfljFv9L88cs/pflj1n8L88fs/hfkT9m8b8yf8zif1X+mMX/6vwxi/81+WMW/2vzxyz+1+WPWfyvzx+z+N+QP2bxvzF/zOJ/U/6Yxf/m/DGL/y35Yxb/W/PHLP635Y9Z/G/PH7P435E/ZvG/M3/M4n9X/pjF/+78MYv/PfljFv9788cs/vflj1n8788fs/g/kD9m8X8wf8zi/1D+mMX/4fwxi/8j+WMW/0fzxyz+j+WPWfwfzx+z+D+RP2bxfzJ/zOL/VP6Yxf/p/DGL/zP5Yxb/Z/PHLP7P5Y9Z/J/PH7P4v5A/ZvF/MX/M4v9S/pjF/+X8MYv/K/ljFv9X88cs/q/lj1n8X88fs/i/kT9m8X8zf8zi/1b+mMR/9IH8MYv/oPwxi//g/DGL/2j5Yxb/IfljFv/R88cs/mPkj1n8x8wfs/iPlT9m8R87f8ziP07+mMV/3Pwxi/94+WMW//Hzxyz+E+SPWfwnzB+z+E+UP2bxnzh/zOI/Sf6Yxf8D+WMW/w/mj1n8P5Q/ZvH/cP6Yxf8j+WMW/0nzxyz+H80fs/h/LH/M4v/x/DGL/yfyxyz+k+WPWfw/mT9m8Z88f8ziPzR/zOI/Rf6YxX/K/DGL/1T5Yxb/qfPHLP7T5I9Z/KfNH7P4T5c/ZvGfPn/M4v+p/DGL/6fzxyz+n8kfs/h/Nn/M4v+5/DGL/wz5Yxb/GfPHLP4z5Y9Z/GfOH7P4z5I/ZvGfNX/M4j9b/pjFf/b8MYv/HPljFv8588cs/nPlj1n8584fs/jPkz9m8Z83f8zi//n8MYv/F/LHLP5fzB+z+H8pf8ziP1/+mMV//vwxi/8C+WMW/wXzxyz+X84fs/h/JX/M4r9Q/pjFf+H8MYv/V/PHLP6L5I9Z/L+WP2bxXzR/zOK/WP6Yxf/r+WMW/8Xzxyz+S+SPWfyXzB+z+H8jf8ziv1T+mMV/6fwxi/8y+WMW/2Xzxyz+38wfs/gvlz9m8V8+f8ziv0L+mMV/xfwxi/9K+WMW/2/lj1n8v50/ZvFfOX/M4r9K/pjFf
9X8MYv/avljFv/v5I9Z/FfPH7P4r5E/ZvFfM3/M4r9W/pjFf+38MYv/OvljFv9188cs/t/NH7P4r5c/ZvFfP3/M4r9B/pjFf8P8MYv/RvljFv+N88cs/t/LH7P4b5I/ZvHfNH/M4r9Z/pjF//v5Yxb/H+SPWfw3zx+z+G+RP2bx3zJ/zOK/Vf6Yxf+H+WMW/x/lj1n8f5w/ZvH/Sf6YxX/r/DGL/zb5Yxb/bfPHLP4/zR+z+G+XP2bx/1n+mMV/+/wxi/8O+WMW/5/nj1n8d8wfs/jvlD9m8d85f8zi/4v8MYv/LvljFv9f5o9Z/H+VP2bx3zV/zOK/W/6YxX/3/DGL/6/zxyz+e+SPWfz3zB+z+O+VP2bx3zt/zOK/T/6YxX/f/DGL/2/yxyz+++WPWfz3zx+z+P82f8zi/7v8MYv/AfljFv/f549Z/A/MH7P4H5Q/ZvE/OH/M4v+H/DGL/x/zxyz+h+SPWfz/lD9m8T80f8zif1j+mMX/8Pwxi/8R+WMW/yPzxyz+f84fs/gflT9m8f9L/pjF/+j8MYv/MfljFv+/5o9Z/I/NH7P4H5c/ZvE/Pn/M4n9C/pjF/8T8MYv/SfljFv+T88cs/qfkj1n8T80fs/iflj9m8T89f8zif0b+mMX/zPwxi/9Z+WMW/7Pzxyz+5+SPWfzPzR+z+J+XP2bxPz9/zOJ/Qf6Yxf/C/DGL/0X5Yxb/i/PHLP6X5I9Z/C/NH7P4X5Y/ZvG/PH/M4n9F/pjF/8r8MYv/VfljFv+r88cs/tfkj1n8r80fs/hflz9m8b8+f8zif0P+mMX/xvwxi/9N+WMW/5vzxyz+t+SPWfxvzR+z+N+WP2bxvz1/zOJ/R/6Yxf/O/DGL/135Yxb/u/PHLP735I9Z/O/NH7P435c/ZvG/P3/M4v9A/pjF/8H8MYv/Q/ljFv+H88cs/o/kj1n8H80fs/g/lj9m8X88f8zi/0T+mMX/yfwxi/9T+WMW/6fzxyz+z+SPWfyfzR+z+D+XP2bxfz5/zOL/Qv6Yxf/F/DGL/0v5Yxb/l/PHLP6v5I9Z/F/NH7P4v5Y/ZvF/PX/M4v9G/pjF/838MYv/W/ljEv8xBvLHLP6D8scs/oPzxyz+o+WPWfyH5I9Z/EfPH7P4j5E/ZvEfM3/M4j9W/pjFf+z8MYv/OPljFv9x88cs/uPlj1n8x88fs/hPkD9m8Z8wf8ziP1H+mMV/4vwxi/8k+WMW/w/kj1n8P5g/ZvH/UP6Yxf/D+WMW/4/kj1n8J80fs/h/NH/M4v+x/DGL/8fzxyz+n8gfs/hPlj9m8f9k/pjFf/L8MYv/0Pwxi/8U+WMW/ynzxyz+U+WPWfynzh+z+E+TP2bxnzZ/zOI/Xf6YxX/6/DGL/6fyxyz+n84fs/h/Jn/M4v/Z/DGL/+fyxyz+M+SPWfxnzB+z+M+UP2bxnzl/zOI/S/6YxX/W/DGL/2z5Yxb/2fPHLP5z5I9Z/OfMH7P4z5U/ZvGfO3/M4j9P/pjFf978MYv/5/PHLP5fyB+z+H8xf8zi/6X8MYv/fPljFv/588cs/gvkj1n8F8wfs/h/OX/M4v+V/DGL/0L5Yxb/hfPHLP5fzR+z+C+SP2bx/1r+mMV/0fwxi/9i+WMW/6/nj1n8F88fs/gvkT9m8V8yf8zi/438MYv/UvljFv+l88cs/svkj1n8l80fs/h/M3/M4r9c/pjFf/n8MYv/CvljFv8V88cs/ivlj1n8v5U/ZvH/dv6YxX/l/DGL/yr5Yxb/VfPHLP6r5Y9Z/L+TP2bxXz1/zOK/Rv6YxX/N/DGL/1r5Yxb/tfPHLP7r5I9Z/NfNH7P4fzd/zOK/Xv6YxX/9/DGL/wb5Yxb/DfPHLP4b5Y9Z/DfOH7P4fy9/zOK/Sf6YxX/T/DGL/2b5Yxb/7+ePWfx/kD9m8d88f8ziv0X+mMV/y/wxi/9W+WMW/x/mj1n8f5Q/ZvH/cf6Yxf8n+WMW/63zxyz+2+SPWfy3zR+z+P80f8ziv
13+mMX/Z/ljFv/t88cs/jvkj1n8f54/ZvHfMX/M4r9T/pjFf+f8MYv/L/LHLP675I9Z/H+ZP2bx/1X+mMV/1/wxi/9u+WMW/93zxyz+v84fs/jvkT9m8d8zf8ziv1f+mMV/7/wxi/8++WMW/33zxyz+v8kfs/jvlz9m8d8/f8zi/9v8MYv/7/LHLP4H5I9Z/H+fP2bxPzB/zOJ/UP6Yxf/g/DGL/x/yxyz+f8wfs/gfkj9m8f9T/pjF/9D8MYv/YfljFv/D88cs/kfkj1n8j8wfs/j/OX/M4n9U/pjF/y/5Yxb/o/PHLP7H5I9Z/P+aP2bxPzZ/zOJ/XP6Yxf/4/DGL/wn5Yxb/E/PHLP4n5Y9Z/E/OH7P4n5I/ZvE/NX/M4n9a/pjF//T8MYv/GfljFv8z88cs/mflj1n8z84fs/ifkz9m8T83f8zif17+mMX//Pwxi/8F+WMW/wvzxyz+F+WPWfwvzh+z+F+SP2bxvzR/zOJ/Wf6Yxf/y/DGL/xX5Yxb/K/PHLP5X5Y9Z/K/OH7P4X5M/ZvG/Nn/M4n9d/pjF//r8MYv/DfljFv8b88cs/jflj1n8b84fs/jfkj9m8b81f8zif1v+mMX/9vwxi/8d+WMW/zvzxyz+d+WPWfzvzh+z+N+TP2bxvzd/zOJ/X/6Yxf/+/DGL/wP5Yxb/B/PHLP4P5Y9Z/B/OH7P4P5I/ZvF/NH/M4v9Y/pjF//H8MYv/E/ljFv8n88cs/k/lj1n8n84fs/g/kz9m8X82f8zi/1z+mMX/+fwxi/8L+WMW/xfzxyz+L+WPWfxfzh+z+L+SP2bxfzV/zOL/Wv6Yxf/1/DGL/xv5Yxb/N/PHLP5v5Y9J/MccyB+z+A/KH7P4D84fs/iPlj9m8R+SP2bxHz1/zOI/Rv6YxX/M/DGL/1j5Yxb/sfPHLP7j5I9Z/MfNH7P4j5c/ZvEfP3/M4j9B/pjFf8L8MYv/RPljFv+J88cs/pPkj1n8P5A/ZvH/YP6Yxf9D+WMW/w/nj1n8P5I/ZvGfNH/M4v/R/DGL/8fyxyz+H88fs/h/In/M4j9Z/pjF/5P5Yxb/yfPHLP5D88cs/lPkj1n8p8wfs/hPlT9m8Z86f8ziP03+mMV/2vwxi/90+WMW/+nzxyz+n8ofs/h/On/M4v+Z/DGL/2fzxyz+n8sfs/jPkD9m8Z8xf8ziP1P+mMV/5vwxi/8s+WMW/1nzxyz+s+WPWfxnzx+z+M+RP2bxnzN/zOI/V/6YxX/u/DGL/zz5Yxb/efPHLP6fzx+z+H8hf8zi/8X8MYv/l/LHLP7z5Y9Z/OfPH7P4L5A/ZvFfMH/M4v/l/DGL/1fyxyz+C+WPWfwXzh+z+H81f8ziv0j+mMX/a/ljFv9F88cs/ovlj1n8v54/ZvFfPH/M4r9E/pjFf8n8MYv/N/LHLP5L5Y9Z/JfOH7P4L5M/ZvFfNn/M4v/N/DGL/3L5Yxb/5fPHLP4r5I9Z/FfMH7P4r5Q/ZvH/Vv6Yxf/b+WMW/5Xzxyz+q+SPWfxXzR+z+K+WP2bx/07+mMV/9fwxi/8a+WMW/zXzxyz+a+WPWfzXzh+z+K+TP2bxXzd/zOL/3fwxi/96+WMW//Xzxyz+G+SPWfw3zB+z+G+UP2bx3zh/zOL/vfwxi/8m+WMW/03zxyz+m+WPWfy/nz9m8f9B/pjFf/P8MYv/FvljFv8t88cs/lvlj1n8f5g/ZvH/Uf6Yxf/H+WMW/5/kj1n8t84fs/hvkz9m8d82f8zi/9P8MYv/dvljFv+f5Y9Z/LfPH7P475A/ZvH/ef6YxX/H/DGL/075Yxb/nfPHLP6/yB+z+O+SP2bx/2X+mMX/V/ljFv9d88cs/rvlj1n8d88fs/j/On/M4r9H/pjFf8/8MYv/XvljFv+988cs/vvkj1n8980fs/j/Jn/M4r9f/pjFf//8MYv/b/PHLP6/yx+z+B+QP2bx/33+mMX/wPwxi/9B+WMW/4Pzxyz+f8gfs/j/MX/M4n9I/pjF/
0/5Yxb/Q/PHLP6H5Y9Z/A/PH7P4H5E/ZvE/Mn/M4v/n/DGL/1H5Yxb/v+SPWfyPzh+z+B+TP2bx/2v+mMX/2Pwxi/9x+WMW/+Pzxyz+J+SPWfxPzB+z+J+UP2bxPzl/zOJ/Sv6Yxf/U/DGL/2n5Yxb/0/PHLP5n5I9Z/M/MH7P4n5U/ZvE/O3/M4n9O/pjF/9z8MYv/efljFv/z88cs/hfkj1n8L8wfs/hflD9m8b84f8zif0n+mMX/0vwxi/9l+WMW/8vzxyz+V+SPWfyvzB+z+F+VP2bxvzp/zOJ/Tf6Yxf/a/DGL/3X5Yxb/6/PHLP435I9Z/G/MH7P435Q/ZvG/OX/M4n9L/pjF/9b8MYv/bfljFv/b88cs/nfkj1n878wfs/jflT9m8b87f8zif0/+mMX/3vwxi/99+WMW//vzxyz+D+SPWfwfzB+z+D+UP2bxfzh/zOL/SP6Yxf/R/DGL/2P5Yxb/x/PHLP5P5I9Z/J/MH7P4P5U/ZvF/On/M4v9M/pjF/9n8MYv/c/ljFv/n88cs/i/kj1n8X8wfs/i/lD9m8X85f8zi/0r+mMX/1fwxi/9r+WMW/9fzxyz+b+SPWfzfzB+z+L+VPybxH2sgf8ziPyh/zOI/OH/M4j9a/pjFf0j+mMV/9Pwxi/8Y+WMW/zHzxyz+Y+WPWfzHzh+z+I+TP2bxHzd/zOI/Xv6YxX/8/DGL/wT5Yxb/CfPHLP4T5Y9Z/CfOH7P4T5I/ZvH/QP6Yxf+D+WMW/w/lj1n8P5w/ZvH/SP6YxX/S/DGL/0fzxyz+H8sfs/h/PH/M4v+J/DGL/2T5Yxb/T+aPWfwnzx+z+A/NH7P4T5E/ZvGfMn/M4j9V/pjFf+r8MYv/NPljFv9p88cs/tPlj1n8p88fs/h/Kn/M4v/p/DGL/2fyxyz+n80fs/h/Ln/M4j9D/pjFf8b8MYv/TPljFv+Z88cs/rPkj1n8Z80fs/jPlj9m8Z89f8ziP0f+mMV/zvwxi/9c+WMW/7nzxyz+8+SPWfznzR+z+H8+f8zi/4X8MYv/F/PHLP5fyh+z+M+XP2bxnz9/zOK/QP6YxX/B/DGL/5fzxyz+X8kfs/gvlD9m8V84f8zi/9X8MYv/IvljFv+v5Y9Z/BfNH7P4L5Y/ZvH/ev6YxX/x/DGL/xL5Yxb/JfPHLP7fyB+z+C+VP2bxXzp/zOK/TP6YxX/Z/DGL/zfzxyz+y+WPWfyXzx+z+K+QP2bxXzF/zOK/Uv6Yxf9b+WMW/2/nj1n8V84fs/ivkj9m8V81f8ziv1r+mMX/O/ljFv/V88cs/mvkj1n818wfs/ivlT9m8V87f8ziv07+mMV/3fwxi/9388cs/uvlj1n8188fs/hvkD9m8d8wf8ziv1H+mMV/4/wxi//38scs/pvkj1n8N80fs/hvlj9m8f9+/pjF/wf5Yxb/zfPHLP5b5I9Z/LfMH7P4b5U/ZvH/Yf6Yxf9H+WMW/x/nj1n8f5I/ZvHfOn/M4r9N/pjFf9v8MYv/T/PHLP7b5Y9Z/H+WP2bx3z5/zOK/Q/6Yxf/n+WMW/x3zxyz+O+WPWfx3zh+z+P8if8ziv0v+mMX/l/ljFv9f5Y9Z/HfNH7P475Y/ZvHfPX/M4v/r/DGL/x75Yxb/PfPHLP575Y9Z/PfOH7P475M/ZvHfN3/M4v+b/DGL/375Yxb//fPHLP6/zR+z+P8uf8zif0D+mMX/9/ljFv8D88cs/gflj1n8D84fs/j/IX/M4v/H/DGL/yH5Yxb/P+WPWfwPzR+z+B+WP2bxPzx/zOJ/RP6Yxf/I/DGL/5/zxyz+R+WPWfz/kj9m8T86f8zif0z+mMX/r/ljFv9j88cs/sflj1n8j88fs/ifkD9m8T8xf8zif1L+mMX/5Pwxi/8p+WMW/1Pzxyz+p+WPWfxPzx+z+J+RP2bxPzN/zOJ/Vv6Yxf/s/DGL/zn5Yxb/c/PHLP7n5Y9Z/M/PH7P4X5A/ZvG/MH/M4n9R/pjF/+L8MYv/J
fljFv9L88cs/pflj1n8L88fs/hfkT9m8b8yf8zif1X+mMX/6vwxi/81+WMW/2vzxyz+1+WPWfyvzx+z+N+QP2bxvzF/zOJ/U/6Yxf/m/DGL/y35Yxb/W/PHLP635Y9Z/G/PH7P435E/ZvG/M3/M4n9X/pjF/+78MYv/PfljFv9788cs/vflj1n8788fs/g/kD9m8X8wf8zi/1D+mMX/4fwxi/8j+WMW/0fzxyz+j+WPWfwfzx+z+D+RP2bxfzJ/zOL/VP6Yxf/p/DGL/zP5Yxb/Z/PHLP7P5Y9Z/J/PH7P4v5A/ZvF/MX/M4v9S/pjF/+X8MYv/K/ljFv9X88cs/q/lj1n8X88fs/i/kT9m8X8zf8zi/1b+mMR/7IH8MYv/oPwxi//g/DGL/2j5Yxb/IfljFv/R88cs/mPkj1n8x8wfs/iPlT9m8R87f8ziP07+mMV/3Pwxi/94+WMW//Hzxyz+E+SPWfwnzB+z+E+UP2bxnzh/zOI/Sf6Yxf8D+WMW/w/mj1n8P5Q/ZvH/cP6Yxf8j+WMW/0nzxyz+H80fs/h/LH/M4v/x/DGL/yfyxyz+k+WPWfw/mT9m8Z88f8ziPzR/zOI/Rf6YxX/K/DGL/1T5Yxb/qfPHLP7T5I9Z/KfNH7P4T5c/ZvGfPn/M4v+p/DGL/6fzxyz+n8kfs/h/Nn/M4v+5/DGL/wz5Yxb/GfPHLP4z5Y9Z/GfOH7P4z5I/ZvGfNX/M4j9b/pjFf/b8MYv/HPljFv8588cs/nPlj1n8584fs/jPkz9m8Z83f8zi//n8MYv/F/LHLP5fzB+z+H8pf8ziP1/+mMV//vwxi/8C+WMW/wXzxyz+X84fs/h/JX/M4r9Q/pjFf+H8MYv/V/PHLP6L5I9Z/L+WP2bxXzR/zOK/WP6Yxf/r+WMW/8Xzxyz+S+SPWfyXzB+z+H8jf8ziv1T+mMV/6fwxi/8y+WMW/2Xzxyz+38wfs/gvlz9m8V8+f8ziv0L+mMV/xfwxi/9K+WMW/2/lj1n8v50/ZvFfOX/M4r9K/pjFf9X8MYv/avljFv/v5I9Z/FfPH7P4r5E/ZvFfM3/M4r9W/pjFf+38MYv/OvljFv9188cs/t/NH7P4r5c/ZvFfP3/M4r9B/pjFf8P8MYv/RvljFv+N88cs/t/LH7P4b5I/ZvHfNH/M4r9Z/pjF//v5Yxb/H+SPWfw3zx+z+G+RP2bx3zJ/zOK/Vf6Yxf+H+WMW/x/lj1n8f5w/ZvH/Sf6YxX/r/DGL/zb5Yxb/bfPHLP4/zR+z+G+XP2bx/1n+mMV/+/wxi/8O+WMW/5/nj1n8d8wfs/jvlD9m8d85f8zi/4v8MYv/LvljFv9f5o9Z/H+VP2bx3zV/zOK/W/6YxX/3/DGL/6/zxyz+e+SPWfz3zB+z+O+VP2bx3zt/zOK/T/6YxX/f/DGL/2/yxyz+++WPWfz3zx+z+P82f8zi/7v8MYv/AfljFv/f549Z/A/MH7P4H5Q/ZvE/OH/M4v+H/DGL/x/zxyz+h+SPWfz/lD9m8T80f8zif1j+mMX/8Pwxi/8R+WMW/yPzxyz+f84fs/gflT9m8f9L/pjF/+j8MYv/MfljFv+/5o9Z/I/NH7P4H5c/ZvE/Pn/M4n9C/pjF/8T8MYv/SfljFv+T88cs/qfkj1n8T80fs/iflj9m8T89f8zif0b+mMX/zPwxi/9Z+WMW/7Pzxyz+5+SPWfzPzR+z+J+XP2bxPz9/zOJ/Qf6Yxf/C/DGL/0X5Yxb/i/PHLP6X5I9Z/C/NH7P4X5Y/ZvG/PH/M4n9F/pjF/8r8MYv/VfljFv+r88cs/tfkj1n8r80fs/hflz9m8b8+f8zif0P+mMX/xvwxi/9N+WMW/5vzxyz+t+SPWfxvzR+z+N+WP2bxvz1/zOJ/R/6Yxf/O/DGL/135Yxb/u/PHLP735I9Z/O/NH7P435c/ZvG/P3/M4v9A/pjF/8H8MYv/Q/ljFv+H88cs/o/kj1n8H80fs/g/lj9m8X88f
8zi/0T+mMX/yfwxi/9T+WMW/6fzxyz+z+SPWfyfzR+z+D+XP2bxfz5/zOL/Qv6Yxf/F/DGL/0v5Yxb/l/PHLP6v5I9Z/F/NH7P4v5Y/ZvF/PX/M4v9G/pjF/838MYv/W/ljEv9xBvLHLP6D8scs/oPzxyz+o+WPWfyH5I9Z/EfPH7P4j5E/ZvEfM3/M4j9W/pjFf+z8MYv/OPljFv9x88cs/uPlj1n8x88fs/hPkD9m8Z8wf8ziP1H+mMV/4vwxi/8k+WMW/w/kj1n8P5g/ZvH/UP6Yxf/D+WMW/4/kj1n8J80fs/h/NH/M4v+x/DGL/8fzxyz+n8gfs/hPlj9m8f9k/pjFf/L8MYv/0Pwxi/8U+WMW/ynzxyz+U+WPWfynzh+z+E+TP2bxnzZ/zOI/Xf6YxX/6/DGL/6fyxyz+n84fs/h/Jn/M4v/Z/DGL/+fyxyz+M+SPWfxnzB+z+M+UP2bxnzl/zOI/S/6YxX/W/DGL/2z5Yxb/2fPHLP5z5I9Z/OfMH7P4z5U/ZvGfO3/M4j9P/pjFf978MYv/5/PHLP5fyB+z+H8xf8zi/6X8MYv/fPljFv/588cs/gvkj1n8F8wfs/h/OX/M4v+V/DGL/0L5Yxb/hfPHLP5fzR+z+C+SP2bx/1r+mMV/0fwxi/9i+WMW/6/nj1n8F88fs/gvkT9m8V8yf8zi/438MYv/UvljFv+l88cs/svkj1n8l80fs/h/M3/M4r9c/pjFf/n8MYv/CvljFv8V88cs/ivlj1n8v5U/ZvH/dv6YxX/l/DGL/yr5Yxb/VfPHLP6r5Y9Z/L+TP2bxXz1/zOK/Rv6YxX/N/DGL/1r5Yxb/tfPHLP7r5I9Z/NfNH7P4fzd/zOK/Xv6YxX/9/DGL/wb5Yxb/DfPHLP4b5Y9Z/DfOH7P4fy9/zOK/Sf6YxX/T/DGL/2b5Yxb/7+ePWfx/kD9m8d88f8ziv0X+mMV/y/wxi/9W+WMW/x/mj1n8f5Q/ZvH/cf6Yxf8n+WMW/63zxyz+2+SPWfy3zR+z+P80f8ziv13+mMX/Z/ljFv/t88cs/jvkj1n8f54/ZvHfMX/M4r9T/pjFf+f8MYv/L/LHLP675I9Z/H+ZP2bx/1X+mMV/1/wxi/9u+WMW/93zxyz+v84fs/jvkT9m8d8zf8ziv1f+mMV/7/wxi/8++WMW/33zxyz+v8kfs/jvlz9m8d8/f8zi/9v8MYv/7/LHLP4H5I9Z/H+fP2bxPzB/zOJ/UP6Yxf/g/DGL/x/yxyz+f8wfs/gfkj9m8f9T/pjF/9D8MYv/YfljFv/D88cs/kfkj1n8j8wfs/j/OX/M4n9U/pjF/y/5Yxb/o/PHLP7H5I9Z/P+aP2bxPzZ/zOJ/XP6Yxf/4/DGL/wn5Yxb/E/PHLP4n5Y9Z/E/OH7P4n5I/ZvE/NX/M4n9a/pjF//T8MYv/GfljFv8z88cs/mflj1n8z84fs/ifkz9m8T83f8zif17+mMX//Pwxi/8F+WMW/wvzxyz+F+WPWfwvzh+z+F+SP2bxvzR/zOJ/Wf6Yxf/y/DGL/xX5Yxb/K/PHLP5X5Y9Z/K/OH7P4X5M/ZvG/Nn/M4n9d/pjF//r8MYv/DfljFv8b88cs/jflj1n8b84fs/jfkj9m8b81f8zif1v+mMX/9vwxi/8d+WMW/zvzxyz+d+WPWfzvzh+z+N+TP2bxvzd/zOJ/X/6Yxf/+/DGL/wP5Yxb/B/PHLP4P5Y9Z/B/OH7P4P5I/ZvF/NH/M4v9Y/pjF//H8MYv/E/ljFv8n88cs/k/lj1n8n84fs/g/kz9m8X82f8zi/1z+mMX/+fwxi/8L+WMW/xfzxyz+L+WPWfxfzh+z+L+SP2bxfzV/zOL/Wv6Yxf/1/DGL/xv5Yxb/N/PHLP5v5Y9J/McdyB+z+A/KH7P4D84fs/iPlj9m8R+SP2bxHz1/zOI/Rv6YxX/M/DGL/1j5Yxb/sfPHLP7j5I9Z/MfNH7P4j5c/ZvEfP3/M4j9B/pjFf8L8M
Yv/RPljFv+J88cs/pPkj1n8P5A/ZvH/YP6Yxf9D+WMW/w/nj1n8P5I/ZvGfNH/M4v/R/DGL/8fyxyz+H88fs/h/In/M4j9Z/pjF/5P5Yxb/yfPHLP5D88cs/lPkj1n8p8wfs/hPlT9m8Z86f8ziP03+mMV/2vwxi/90+WMW/+nzxyz+n8ofs/h/On/M4v+Z/DGL/2fzxyz+n8sfs/jPkD9m8Z8xf8ziP1P+mMV/5vwxi/8s+WMW/1nzxyz+s+WPWfxnzx+z+M+RP2bxnzN/zOI/V/6YxX/u/DGL/zz5Yxb/efPHLP6fzx+z+H8hf8zi/8X8MYv/l/LHLP7z5Y9Z/OfPH7P4L5A/ZvFfMH/M4v/l/DGL/1fyxyz+C+WPWfwXzh+z+H81f8ziv0j+mMX/a/ljFv9F88cs/ovlj1n8v54/ZvFfPH/M4r9E/pjFf8n8MYv/N/LHLP5L5Y9Z/JfOH7P4L5M/ZvFfNn/M4v/N/DGL/3L5Yxb/5fPHLP4r5I9Z/FfMH7P4r5Q/ZvH/Vv6Yxf/b+WMW/5Xzxyz+q+SPWfxXzR+z+K+WP2bx/07+mMV/9fwxi/8a+WMW/zXzxyz+a+WPWfzXzh+z+K+TP2bxXzd/zOL/3fwxi/96+WMW//Xzxyz+G+SPWfw3zB+z+G+UP2bx3zh/zOL/vfwxi/8m+WMW/03zxyz+m+WPWfy/nz9m8f9B/pjFf/P8MYv/FvljFv8t88cs/lvlj1n8f5g/ZvH/Uf6Yxf/H+WMW/5/kj1n8t84fs/hvkz9m8d82f8zi/9P8MYv/dvljFv+f5Y9Z/LfPH7P475A/ZvH/ef6YxX/H/DGL/075Yxb/nfPHLP6/yB+z+O+SP2bx/2X+mMX/V/ljFv9d88cs/rvlj1n8d88fs/j/On/M4r9H/pjFf8/8MYv/XvljFv+988cs/vvkj1n8980fs/j/Jn/M4r9f/pjFf//8MYv/b/PHLP6/yx+z+B+QP2bx/33+mMX/wPwxi/9B+WMW/4Pzxyz+f8gfs/j/MX/M4n9I/pjF/0/5Yxb/Q/PHLP6H5Y9Z/A/PH7P4H5E/ZvE/Mn/M4v/n/DGL/1H5Yxb/v+SPWfyPzh+z+B+TP2bx/2v+mMX/2Pwxi/9x+WMW/+Pzxyz+J+SPWfxPzB+z+J+UP2bxPzl/zOJ/Sv6Yxf/U/DGL/2n5Yxb/0/PHLP5n5I9Z/M/MH7P4n5U/ZvE/O3/M4n9O/pjF/9z8MYv/efljFv/z88cs/hfkj1n8L8wfs/hflD9m8b84f8zif0n+mMX/0vwxi/9l+WMW/8vzxyz+V+SPWfyvzB+z+F+VP2bxvzp/zOJ/Tf6Yxf/a/DGL/3X5Yxb/6/PHLP435I9Z/G/MH7P435Q/ZvG/OX/M4n9L/pjF/9b8MYv/bfljFv/b88cs/nfkj1n878wfs/jflT9m8b87f8zif0/+mMX/3vwxi/99+WMW//vzxyz+D+SPWfwfzB+z+D+UP2bxfzh/zOL/SP6Yxf/R/DGL/2P5Yxb/x/PHLP5P5I9Z/J/MH7P4P5U/ZvF/On/M4v9M/pjF/9n8MYv/c/ljFv/n88cs/i/kj1n8X8wfs/i/lD9m8X85f8zi/0r+mMX/1fwxi/9r+WMW/9fzxyz+b+SPWfzfzB+z+L+VPybxH28gf8ziPyh/zOI/OH/M4j9a/pjFf0j+mMV/9Pwxi/8Y+WMW/zHzxyz+Y+WPWfzHzh+z+I+TP2bxHzd/zOI/Xv6YxX/8/DGL/wT5Yxb/CfPHLP4T5Y9Z/CfOH7P4T5I/ZvH/QP6Yxf+D+WMW/w/lj1n8P5w/ZvH/SP6YxX/S/DGL/0fzxyz+H8sfs/h/PH/M4v+J/DGL/2T5Yxb/T+aPWfwnzx+z+A/NH7P4T5E/ZvGfMn/M4j9V/pjFf+r8MYv/NPljFv9p88cs/tPlj1n8p88fs/h/Kn/M4v/p/DGL/2fyxyz+n80fs/h/Ln/M4j9D/pjFf8b8MYv/TPljFv+Z88cs/rPkj
1n8Z80fs/jPlj9m8Z89f8ziP0f+mMV/zvwxi/9c+WMW/7nzxyz+8+SPWfznzR+z+H8+f8zi/4X8MYv/F/PHLP5fyh+z+M+XP2bxnz9/zOK/QP6YxX/B/DGL/5fzxyz+X8kfs/gvlD9m8V84f8zi/9X8MYv/IvljFv+v5Y9Z/BfNH7P4L5Y/ZvH/ev6YxX/x/DGL/xL5Yxb/JfPHLP7fyB+z+C+VP2bxXzp/zOK/TP6YxX/Z/DGL/zfzxyz+y+WPWfyXzx+z+K+QP2bxXzF/zOK/Uv6Yxf9b+WMW/2/nj1n8V84fs/ivkj9m8V81f8ziv1r+mMX/O/ljFv/V88cs/mvkj1n818wfs/ivlT9m8V87f8ziv07+mMV/3fwxi/9388cs/uvlj1n8188fs/hvkD9m8d8wf8ziv1H+mMV/4/wxi//38scs/pvkj1n8N80fs/hvlj9m8f9+/pjF/wf5Yxb/zfPHLP5b5I9Z/LfMH7P4b5U/ZvH/Yf6Yxf9H+WMW/x/nj1n8f5I/ZvHfOn/M4r9N/pjFf9v8MYv/T/PHLP7b5Y9Z/H+WP2bx3z5/zOK/Q/6Yxf/n+WMW/x3zxyz+O+WPWfx3zh+z+P8if8ziv0v+mMX/l/ljFv9f5Y9Z/HfNH7P475Y/ZvHfPX/M4v/r/DGL/x75Yxb/PfPHLP575Y9Z/PfOH7P475M/ZvHfN3/M4v+b/DGL/375Yxb//fPHLP6/zR+z+P8uf8zif0D+mMX/9/ljFv8D88cs/gflj1n8D84fs/j/IX/M4v/H/DGL/yH5Yxb/P+WPWfwPzR+z+B+WP2bxPzx/zOJ/RP6Yxf/I/DGL/5/zxyz+R+WPWfz/kj9m8T86f8zif0z+mMX/r/ljFv9j88cs/sflj1n8j88fs/ifkD9m8T8xf8zif1L+mMX/5Pwxi/8p+WMW/1Pzxyz+p+WPWfxPzx+z+J+RP2bxPzN/zOJ/Vv6Yxf/s/DGL/zn5Yxb/c/PHLP7n5Y9Z/M/PH7P4X5A/ZvG/MH/M4n9R/pjF/+L8MYv/JfljFv9L88cs/pflj1n8L88fs/hfkT9m8b8yf8zif1X+mMX/6vwxi/81+WMW/2vzxyz+1+WPWfyvzx+z+N+QP2bxvzF/zOJ/U/6Yxf/m/DGL/y35Yxb/W/PHLP635Y9Z/G/PH7P435E/ZvG/M3/M4n9X/pjF/+78MYv/PfljFv9788cs/vflj1n8788fs/g/kD9m8X8wf8zi/1D+mMX/4fwxi/8j+WMW/0fzxyz+j+WPWfwfzx+z+D+RP2bxfzJ/zOL/VP6Yxf/p/DGL/zP5Yxb/Z/PHLP7P5Y9Z/J/PH7P4v5A/ZvF/MX/M4v9S/pjF/+X8MYv/K/ljFv9X88cs/q/lj1n8X88fs/i/kT9m8X8zf8zi/1b+mMR//IH8MYv/oPwxi//g/DGL/2j5Yxb/IfljFv/R88cs/mPkj1n8x8wfs/iPlT9m8R87f8ziP07+mMV/3Pwxi/94+WMW//Hzxyz+E+SPWfwnzB+z+E+UP2bxnzh/zOI/Sf6Yxf8D+WMW/w/mj1n8P5Q/ZvH/cP6Yxf8j+WMW/0nzxyz+H80fs/h/LH/M4v/x/DGL/yfyxyz+k+WPWfw/mT9m8Z88f8ziPzR/zOI/Rf6YxX/K/DGL/1T5Yxb/qfPHLP7T5I9Z/KfNH7P4T5c/ZvGfPn/M4v+p/DGL/6fzxyz+n8kfs/h/Nn/M4v+5/DGL/wz5Yxb/GfPHLP4z5Y9Z/GfOH7P4z5I/ZvGfNX/M4j9b/pjFf/b8MYv/HPljFv8588cs/nPlj1n8584fs/jPkz9m8Z83f8zi//n8MYv/F/LHLP5fzB+z+H8pf8ziP1/+mMV//vwxi/8C+WMW/wXzxyz+X84fs/h/JX/M4r9Q/pjFf+H8MYv/V/PHLP6L5I9Z/L+WP2bxXzR/zOK/WP6Yxf/r+WMW/8Xzxyz+S+SPWfyXzB+z+H8jf8ziv1T+mMV/6fwxi/8y+WMW/2Xzx
yz+38wfs/gvlz9m8V8+f8ziv0L+mMV/xfwxi/9K+WMW/2/lj1n8v50/ZvFfOX/M4r9K/pjFf9X8MYv/avljFv/v5I9Z/FfPH7P4r5E/ZvFfM3/M4r9W/pjFf+38MYv/OvljFv9188cs/t/NH7P4r5c/ZvFfP3/M4r9B/pjFf8P8MYv/RvljFv+N88cs/t/LH7P4b5I/ZvHfNH/M4r9Z/pjF//v5Yxb/H+SPWfw3zx+z+G+RP2bx3zJ/zOK/Vf6Yxf+H+WMW/x/lj1n8f5w/ZvH/Sf6YxX/r/DGL/zb5Yxb/bfPHLP4/zR+z+G+XP2bx/1n+mMV/+/wxi/8O+WMW/5/nj1n8d8wfs/jvlD9m8d85f8zi/4v8MYv/LvljFv9f5o9Z/H+VP2bx3zV/zOK/W/6YxX/3/DGL/6/zxyz+e+SPWfz3zB+z+O+VP2bx3zt/zOK/T/6YxX/f/DGL/2/yxyz+++WPWfz3zx+z+P82f8zi/7v8MYv/AfljFv/f549Z/A/MH/tP9B80MLDNwCj+B+WP/Sf60+P/4Pwxi/8f8scs/n/MH7P4H5I/ZvH/U/6Yxf/Q/DGL/2H5Yxb/w/PHLP5H5I9Z/I/MH7P4/zl/zOJ/VP6Yxf8v+WMW/6Pzxyz+x+SPWfz/mj9m8T82f8zif1z+mMX/+Pwxi/8J+WMW/xPzxyz+J+WPWfxPzh+z+J+SP2bxPzV/zOJ/Wv6Yxf/0/DGL/xn5Yxb/M/PHLP5n5Y9Z/M/OH7P4n5M/ZvE/N3/M4n9e/pjF//z8MYv/BfljFv8L88cs/hflj1n8L84fs/hfkj9m8b80f8zif1n+mMX/8vwxi/8V+WMW/yvzxyz+V+WPWfyvzh+z+F+TP2bxvzZ/zOJ/Xf6Yxf/6/DGL/w35Yxb/G/PHLP435Y9Z/G/OH7P435I/ZvG/NX/M4n9b/pjF//b8MYv/HfljFv8788cs/nflj1n8784fs/jfkz9m8b83f8zif1/+mMX//vwxi/8D+WMW/wfzxyz+D+WPWfwfzh+z+D+SP2bxfzR/zOL/WP6Yxf/x/DGL/xP5Yxb/J/PHLP5P5Y9Z/J/OH7P4P5M/ZvF/Nn/M4v9c/pjF//n8MYv/C/ljFv8X88cs/i/lj1n8X84fs/i/kj9m8X81f8zi/1r+mMX/9fwxi/8b+WMW/zfzxyz+b+WPSfwnGMgfs/gPyh+z+A/OH7P4j5Y/ZvEfkj9m8R89f8ziP0b+mMV/zPwxi/9Y+WMW/7Hzxyz+4+SPWfzHzR+z+I+XP2bxHz9/zOI/Qf6YxX/C/DGL/0T5Yxb/ifPHLP6T5I9Z/D+QP2bx/2D+mMX/Q/ljFv8P549Z/D+SP2bxnzR/zOL/0fwxi//H8scs/h/PH7P4fyJ/zOI/Wf6Yxf+T+WMW/8nzxyz+Q/PHLP5T5I9Z/KfMH7P4T5U/ZvGfOn/M4j9N/pjFf9r8MYv/dPljFv/p88cs/p/KH7P4fzp/zOL/mfwxi/9n88cs/p/LH7P4z5A/ZvGfMX/M4j9T/pjFf+b8MYv/LPljFv9Z88cs/rPlj1n8Z88fs/jPkT9m8Z8zf8ziP1f+mMV/7vwxi/88+WMW/3nzxyz+n88fs/h/IX/M4v/F/DGL/5fyxyz+8+WPWfznzx+z+C+QP2bxXzB/zOL/5fwxi/9X8scs/gvlj1n8F84fs/h/NX/M4r9I/pjF/2v5Yxb/RfPHLP6L5Y9Z/L+eP2bxXzx/zOK/RP6YxX/J/DGL/zfyxyz+S+WPWfyXzh+z+C+TP2bxXzZ/zOL/zfwxi/9y+WMW/+Xzxyz+K+SPWfxXzB+z+K+UP2bx/1b+mMX/2/ljFv+V88cs/qvkj1n8V80fs/ivlj9m8f9O/pjFf/X8MYv/GvljFv8188cs/mvlj1n8184fs/ivkz9m8V83f8zi/938MYv/evljFv/188cs/hvkj1n8N8wfs/hvlD9m8d84f8zi/738MYv/JvljFv9N88cs/pvlj1n8v58/ZvH/Q
f6YxX/z/DGL/xb5Yxb/LfPHLP5b5Y9Z/H+YP2bx/1H+mMX/x/ljFv+f5I9Z/LfOH7P4b5M/ZvHfNn/M4v/T/DGL/3b5Yxb/n+WPWfy3zx+z+O+QP2bx/3n+mMV/x/wxi/9O+WMW/53zxyz+v8gfs/jvkj9m8f9l/pjF/1f5Yxb/XfPHLP675Y9Z/HfPH7P4/zp/zOK/R/6YxX/P/DGL/175Yxb/vfPHLP775I9Z/PfNH7P4/yZ/zOK/X/6YxX///DGL/2/zxyz+v8sfs/gfkD9m8f99/pjF/8D8MYv/QfljFv+D88cs/n/IH7P4/zF/zOJ/SP6Yxf9P+WMW/0Pzxyz+h+WPjeJ/7H+q/+H5Y5bH/xH5Yxb/I/PHLP5/zh+z+B+VP2bx/0v+mMX/6Pwxi/8x+WMW/7/mj1n8j80fs/gflz9m8T8+f8zif0L+mMX/xPwxi/9J+WMW/5Pzxyz+p+SPWfxPzR+z+J+WP2bxPz1/zOJ/Rv6Yxf/M/DGL/1n5Yxb/s/PHLP7n5I9Z/M/NH7P4n5c/ZvE/P3/M4n9B/pjF/8L8MYv/RfljFv+L88cs/pfkj1n8L80fs/hflj9m8b88f8zif0X+mMX/yvwxi/9V+WMW/6vzxyz+1+SPWfyvzR+z+F+XP2bxvz5/zOJ/Q/6Yxf/G/DGL/035Yxb/m/PHLP635I9Z/G/NH7P435Y/ZvG/PX/M4n9H/pjF/878MYv/XfljFv+788cs/vfkj1n8780fs/jflz9m8b8/f8zi/0D+mMX/wfwxi/9D+WMW/4fzxyz+j+SPWfwfzR+z+D+WP2bxfzx/zOL/RP6Yxf/J/DGL/1P5Yxb/p/PHLP7P5I9Z/J/NH7P4P5c/ZvF/Pn/M4v9C/pjF/8X8MYv/S/ljFv+X88cs/q/kj1n8X80fs/i/lj9m8X89f8zi/0b+mMX/zfwxi/9b+WMS/wkH8scs/oPyxyz+g/PHLP6j5Y9Z/Ifkj1n8R88fs/iPkT9m8R8zf8ziP1b+mMV/7Pwxi/84+WMW/3Hzxyz+4+WPWfzHzx+z+E+QP2bxnzB/zOL/f9i5+3Cv6/qO41/gg5KZN3O10DACUTQVBBXJvEFLPQJHu7GUBCXFmwnKlAJdKdmNtZWzWMXaSvImdaQWzpprOMo5p4tmRbXNxtjMsVa4LCdulew6cA5xTu/O5fny3dcr34/HH5zz+53ev5TndfXid7hOu+kfytJ/d/1DWfr/hv6hLP330D+Upf9v6h/K0v/F+oey9H+J/qEs/X9L/1CW/i/VP5Sl/3D9Q1n676l/KEv/vfQPZen/Mv1DWfqP0D+Upf/e+oey9H+5/qEs/UfqH8rS/xX6h7L0H6V/KEv/0fqHsvTfR/9Qlv5j9A9l6b+v/qEs/ffTP5Sl/1j9Q1n6769/KEv/A/QPZen/Sv1DWfofqH8oS/+D9A9l6X+w/qEs/cfpH8rSf7z+oSz9D9E/lKX/BP1DWfpP1D+Upf+h+oey9D9M/1CW/ofrH8rSf5L+oSz9j9A/lKX/ZP1DWfq/Sv9Qlv5H6h/K0v/V+oey9D9K/1CW/kfrH8rS/xj9Q1n6H6t/KEv/KfqHsvQ/Tv9Qlv7H6x/K0v81+oey9H+t/qEs/U/QP5Sl/4n6h7L0P0n/UJb+HfqHsvQ/Wf9Qlv5T9Q9l6T9N/1CW/tP1D2Xp36l/KEv/U/QPZel/qv6hLP1fp38oS//X6x/K0v8N+oey9H+j/qEs/U/TP5Sl/5v0D2Xp/2b9Q1n6n65/KEv/M/QPZek/Q/9Qlv5v0T+Upf+Z+oey9J+pfyhL/1n6h7L0P0v/UJb+Z+sfytJ/tv6hLP3fqn8oS/9z9A9l6X+u/qEs/efoH8rS/zz9Q1n6n69/KEv/C/QPZel/of6hLP1/W/9Qlv4X6R/K0n+u/qEs/efpH8rS/2L9Q1n6X6J/KEv/+fqHsvT/Hf1DWfpfqn8oS//L9A9l6b9A/1CW/m/TP5Sl/9v1D2Xpv1D/UJb+i/QPZ
el/uf6hLP2v0D+Upf/v6h/K0v8d+oey9H+n/qEs/a/UP5Sl/1X6h7L0X6x/KEv/d+kfytL/av1DWfq/W/9Qlv7v0T+Upf979Q9l6f8+/UNZ+l+jfyhL//frH8rS/wP6h7L0/z39Q1n6/77+oSz9P6h/KEv/D+kfytL/Wv1DWfr/gf6hLP2v0z+Upf+H9Q9l6f8R/UNZ+i/RP5Sl/x/qH8rS/6P6h7L0/5j+oSz9P65/KEv/pfqHsvT/I/1DWfp/Qv9Qlv5/rH8oS/8/0T+Upf8n9Q9l6f8p/UNZ+l+vfyhL/2X6h7L0/7T+oSz9b9A/lKX/jfqHsvS/Sf9Qlv436x/K0v8z+oey9L9F/1CW/rfqH8rS/zb9Q1n6/6n+oSz9l+sfytL/s/qHsvS/Xf9Qlv536B/K0v9O/UNZ+n9O/1CW/p/XP5Sl/wr9Q1n636V/KEv/P9M/lKX/3fqHsvT/gv6hLP2/qH8oS/8/1z+Upf89+oey9P8L/UNZ+n9J/1CW/n+pfyhL/5X6h7L0v1f/UJb+f6V/KEv/VfqHsvT/sv6hLP2/on8oS//79A9l6f/X+oey9L9f/1CW/n+jfyhL/wf0D2Xp/7f6h7L0f1D/UJb+D+kfytL/7/QPZen/Vf1DWfqv1j+Upf/X9A9l6f/3+oey9H9Y/1CW/l/XP5Sl/zf0D2Xp/039Q1n6r9E/lKX/t/QPZen/bf1DWfp/R/9Qlv7/oH8oS/9/1D+Upf8/6R/K0v8R/UNZ+n9X/1CW/v+sfyhL/7X6h7L0/xf9Q1n6r9M/lKX/v+ofytL/3/QPZen/qP6hLP2/p38oS//H9A9l6f/v+oey9F+vfyhL///QP5Sl//f1D2Xp/5/6h7L0/4H+oSz9f6h/KEv/DfqHsvR/XP9Qlv7/pX8oS/8f6R/K0v8J/UNZ+v9Y/1CW/j/RP5Sl/5P6h7L0/2/9Q1n6P6V/KEv/jfqHsvR/Wv9Qlv7/o38oS///1T+Upf9P9Q9l6f8z/UNZ+v9c/1CW/s/oH8rSf5P+oST9d6v0D2XpP0j/UJb+g/UPZek/RP9Qlv5F/1CW/kP1D2Xpv4P+oSz9d9Q/lKX/MP1DWfq/QP9Qlv476R/K0v+F+oey9N9Z/1CW/i/SP5Sl/y76h7L031X/0POuPwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABQVR2dUzYMGdTrqSHbPhixes7mj4duPP2Cm2/+wu49H7u/PDV4ycHbPti0adOmwU9WX+1+uGNVVV3/bbt1Px7W97jr9a+asnTslkdl5A1PH7B84dPHve/ONbduPPGWqcuHbn52aHX2eRfOnTN+cFWV0UOrRV0PDhlUVWXfodV1XQ8mdD3Yb2h1a9eDiZsfvKBa1fXg4HMumXtu1xNja/+eAcCvu47OxdWQXotd9frTwLb7f9WUuxb3fOznJXterVTd+3/Pij3e3udrPX7F/ve8fnlF3/0f8L8gAPBLBrb/943s+djPS/7S+//pF39sTfS1X73/Pa9fRtl/AGhe8P3/Xhvd9/v+fb7/H/05YOv9RWct2bFr/2/ZeNqC7qfKs/n+/y9ev4zuu/+De33/f1BVlX16vv+/Y1WVMdv52wEAKXR0vmtDf+//+9//smefm0Hb7v/ut686rWv/1+5050u6nxo6wP3fp7/3/yf0+WcFAJ6djs5Pb+rz/n8A+1/tG7zk1v0/ftGDL+7a/6+v/d7Ibb42kP0f03f/xy2YN3/cZZdfcdCF82afP+f8ORdPnjTpiMMmHT55wrjN3xHY8ut2/qYAwPPc9r3/r3bqczOoqh7bev/BlSMe7Nr/CZ846YLup4YNcP/37ff9/0jv/wGgl1GDqx12qBbNXrDg0kO2/NrzcMKWX7f8x4L9H8Df/4/u+SG6np8ZHFRVL91633nZ+wd37f+No/dc2f3UDgPc//363f9je/+sIgDw7Gzn+/9z+9z02v9dh9w2uWv/Tx6z/zXdTw307//H9rv/y
7z/B4A6Ojqr/9c30V37f/XOP7++3nXZ38//AUDz2tj/x0e8e1S963KA/QeA5rWx/3OOedHd9a7LK+0/ADSvjf1/oGPJzHrX5UD7DwDNa2P/p1004qF61+Ug+w8AzWtj/0decc7CetflYPsPAM1rY/+Xn75hXb3rMs7+A0Dz2tj/M4etfaLedRlv/wGgeW3s/3cOOPW8etflEPsPAM1rY/+/dOTBj9S7LhPsPwA0r439Hz982bR612Wi/QeA5rWx/x+55Oib6l2XQ+0/ADSvjf0fPv1zE+tdl8PsPwA0r439f9uZX7u23nU53P4DQPPa2P9nrjx2eL3rMsn+A0Dz2tj/VQ/sPaPedTnC/gNA89rY/6mPXbe63nWZbP8BoHlt7P9+Tz4+v951eZX9B4DmtbH/n/322evrXZcj7T8ANK+N/d9l2TO71rsur7b/ANC8Nvb/vVfPX1rvuhxl/wGgeW3s//eX7DKm3nU52v4DQPPa2P+3fn7xinrX5Rj7DwDNa2P/v/HoxKn1rsux9h8AmtfG/p/x5ZXfrXddpth/AGheG/t/2MP3za53XY6z/wDQvDb2/94fT3+q3nU53v4DQPPa2P+Xveebe9e7Lq+x/wDQvDb2/+OfmfmBetfltfYfAJrXxv7/9IujD613XU6w/wDQvDb2f/5Hb7+x3nU50f4DQPPa2P/OA3+yrt51Ocn+A0Dz2tj/+3d558J616XD/gNA89rY/9tePuShetflZPsPAM1rY/9HHfWhmfWuy1T7DwDNa2P/F0/d4+5612Wa/QeA5rWx/y+88JOj6l2X6fYfAJrXxv6fv+jR6+tdl077DwDNa2P/fzhj3rB61+UU+w8AzWtj//faadbwetflVPsPAM1rY/+vG/vwtfWuy+vsPwA0r439/9nkOybWuy6vt/8A0Lw29n/hXiNvqndd3mD/AaB5bez/t+bdO63edXmj/QeA5rWx/7NOGf9Ivetymv0HgOa1sf8TZnWeV++6vMn+A0Dz2tj/e96x6ol61+XN9h8AmtfG/l9z/6dW1Lsup9t/AGheG/u/8/rdxtS7LmfYfwBoXhv7f/ZTFy+td11m2H8AaF4b+/+DNet2rXdd3mL/AaB5bex/xw1Xrq93Xc60/wDQvDb2/yuLfzS/3nWZaf8BoHlt7P8dH752db3rMsv+A0Dz2tj/MXdVM+pdl7PsPwA077LLr7ho9ty5cy71iU984pOtnzzX/8sEAAA07Rd/6H+u/0kAAAAAAAAAAAAAAAAAAAAgrzb+78Se639HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP6PHTgQAAAAAADyf22Eqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqKuzAgQAAAAAAkP9rI1RVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVhBw4EAAAAAAT5W68wQAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAB8BQAA//+3E9IO")
ppoll(&(0x7f0000000080)=[{r0}], 0x1, &(0x7f0000000100), 0x0, 0x0)


syz_mount_image$ext4(&(0x7f00000004c0)='ext4\x00', &(0x7f0000000500)='./file0\x00', 0x2000000, &(0x7f0000000140)={[{@errors_remount}, {@nobh}, {@max_batch_time={'max_batch_time', 0x3d, 0x3ff}}, {@nojournal_checksum}, {@usrquota}, {@dioread_nolock}, {@nodiscard}]}, 0x21, 0x4b0, &(0x7f0000000540)="$eJzs3cFPG9kZAPBvBgiEkEDaHNqqbdI0bVpFscFJUJRTemlVRZGqRj31kFBwEMLGCJs00BzI/1CpkXpq/4QeKvVQKafee9u97SV7WCm7G+0qrLQHr2ZsCCEY2A3BK/z7SU8zb57x9x7WvGc+wC+AnnUuItYi4lhE3IuI0fb1pF3iZqtkj3v54tH0+otH00k0m3c+SfL27Fps+ZrMifZzDkXE738T8afkzbj1ldX5qUqlvNSuFxvVxWJ9ZfXyXHVqtjxbXiiVJicmx69fuVY6sLGerf7r+a/nbv3hv//50bP/r/3yL1m3RtptW8dxkFpDH9iMk+mPiFvvIlgX9LXHc6zbHeEbSSPiOxFxPr//R6MvfzUBgKOs2RyN5ujW+iv92+oAwNGQ5jmwJC20cwEjkaaFQiuHdyaG00qt3rh0v7a8MNPKlY3FQHp/rlIeb+cKx2IgyeoT+fmremlb/UpEnI6Ivw4ez+uF6VplpptvfACgh53Ytv5/Ptha/wGAI26o2x0AAA6d9R8Aeo/1HwB6j/UfAHqP9R8Aeo/1HwB6j/UfAHrK727fzkpzvf351zMPVpbnaw8uz5Tr84Xq8nRhura0WJit1Wbzz+yp7vV8lVptceJqLD8sNsr1RrG+snq3WlteaNzNP9f7bnngUEYFAOzm9Nmn7ycRsXbjeF5iy14O1mo42tJudwDomr5udwDoGrt9Qe/yMz6wwxa9r+n4J0JPDr4vwOG4+H35f+hV8v/Qu+T/oXfJ/0PvajYTe/wDQI+R4wf8/h8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+vpG8JGmhvRf4SKRpoRBxMiLGYiC5P1cpj0fEqYh4b3BgMKtPdLvTAMBbSj9K2vt/XRy9MLK99VjyxWB+jIg///3O3x5ONRpLE9n1TzevN560r5e60X8AYC8b6/TGOr7h5YtH0xvlMPvz/FetzUWzuOvt0mrpj/78OBQDETH8WdKut2TvV/oOIP7a44j43k7jT/LcyFh759Pt8bPYJw81fvpa/DRvax2z78V3D6Av0GueZvPPzZ3uvzTO5ced7/+hfIZ6exvz3/ob81+6Of/1dZj/zu03xtX//bZj2+OIH/TvFD/ZjJ90iH9hn/E/+OGPz3dqa/4j4mLsHH9rrGKjulisr6xenqtOzZZnywul0uTE5Pj1K9dKxTxHXdzIVL/p4xuXTu02/uEO8Yf2GP/P9jn+f355748/2SX+L3668+t/Zpf42Zr4833Gnxr+d8ftu7P4Mx3Gv9frf2mf8Z99uDqzz4cCAIegvrI6P1WplJecdOmk79vRDSdOXjvp9swEvGuvbvpu9wQAAAAAAAAAAAAAAOjkMP6dqNtjBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA4Oj6KgAA//+r59Ta")
r0 = open_tree(0xffffffffffffff9c, &(0x7f0000000000)='\x00', 0x9801)
fchdir(r0)
r1 = open(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
dup3(r1, r0, 0x0)
faccessat2(0xffffffffffffff9c, &(0x7f00000029c0)='./file0\x00', 0x2, 0x0)


quotactl$Q_SETQUOTA(0x0, 0x0, 0x0, 0x0)


mkdir(&(0x7f0000000100)='./file1\x00', 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000000100)='./file0\x00', 0x0)
mkdir(&(0x7f0000000300)='./bus\x00', 0x0)
mount$overlay(0x0, &(0x7f00000000c0)='./bus\x00', &(0x7f0000000080), 0x0, &(0x7f0000000400)={[{@workdir={'workdir', 0x3d, './bus'}}, {@lowerdir={'lowerdir', 0x3d, './file0'}}, {@upperdir={'upperdir', 0x3d, './file1'}}]})
chdir(&(0x7f0000001180)='./bus\x00')
mkdir(&(0x7f0000000000)='./control\x00', 0x0)
r0 = open(&(0x7f0000022ff6)='./control\x00', 0x0, 0x0)
getdents64(r0, 0x0, 0x0)
lseek(r0, 0x0, 0x0)
syz_mount_image$fuse(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)


syz_read_part_table(0x107d, &(0x7f0000001080)="$eJzs0L1Nw0AABeDnc+IfiYINGCUDMAEtLUPg9KzBMHSMQElDR4fg0DlBYQFI832Nn5/eSacLZ7UpSWrdJmnpKkuOcfW+T/oMU/r1d3k51F/dr02m7NtnTHLfwnaXXK7FR623JUuW1Lvusz7/nKhvT2MeX2/6OZtcp6SfW72bHy6SDIfReByXpJtOVTecbl/+7F0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA4D99BwAA//9wWRao")
syz_mount_image$fuse(0x0, &(0x7f0000000140)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
mount(0x0, &(0x7f0000000080)='./file0\x00', &(0x7f0000000340)='devpts\x00', 0x0, 0x0)
mount(0x0, &(0x7f0000000180)='./file0\x00', 0x0, 0x104000, 0x0)
r0 = creat(&(0x7f0000000100)='./bus\x00', 0x0)
ioctl$FS_IOC_SETFLAGS(r0, 0x40086602, &(0x7f0000000040))
r1 = creat(&(0x7f00000007c0)='./bus\x00', 0x0)
ioctl$EXT4_IOC_CLEAR_ES_CACHE(r1, 0x6612)
r2 = epoll_create1(0x0)
epoll_ctl$EPOLL_CTL_ADD(r2, 0x1, 0xffffffffffffffff, &(0x7f0000002740))
epoll_pwait2(r2, &(0x7f0000000680)=[{}, {}], 0x2, &(0x7f00000006c0)={0x0, 0x989680}, &(0x7f0000000700)={[0xf4f]}, 0x8)
r3 = syz_open_procfs(0x0, &(0x7f0000000000)='mountinfo\x00')
r4 = epoll_create1(0x0)
epoll_ctl$EPOLL_CTL_ADD(r4, 0x1, r3, &(0x7f00000002c0))
syz_mount_image$tmpfs(0x0, &(0x7f00000000c0)='./file0\x00', 0x0, 0x0, 0x0, 0x0, &(0x7f0000000000))
r5 = open_tree(0xffffffffffffff9c, &(0x7f00000001c0)='./file0\x00', 0x81901)
move_mount(r5, &(0x7f0000000140)='.\x00', 0xffffffffffffff9c, &(0x7f0000000180)='./file0\x00', 0x0)
mount_setattr(0xffffffffffffff9c, &(0x7f0000000140)='./file0\x00', 0x0, &(0x7f0000000180)={0x0, 0x0, 0x100000}, 0x20)
openat$ttyS3(0xffffffffffffff9c, &(0x7f0000000740), 0xeccd4822749cf91f, 0x0)
mount$bind(&(0x7f0000000100)='./file0\x00', &(0x7f00000000c0)='./file0\x00', 0x0, 0x1a49002, 0x0)
mount$bind(&(0x7f0000000200)='./file0\x00', &(0x7f0000000300)='./file0\x00', 0x0, 0x39a72d8, 0x0)
pwritev2(0xffffffffffffffff, &(0x7f0000000600)=[{&(0x7f0000000040)="d8ce5c5a", 0x4}, {&(0x7f0000000380)="cecb4558fabe9d60cd8be663e9e7aac08f322adfdc583f06373dc110209fd49a208cebc342d1856b238c25e094c4c00bfd631c51554a050bd7963a1fee493f01f57d322b4d91fc101cab375a90f43c62f263d0b2f05e6ecef602c6161a3adbca1dd2a30009c7b303a80dbfa5d083b5c9aa0d3e1d45bfbd1787a45924d91294bf283f6354bd67d159214256976a282c344a881037e4a0c51a43da62c549f5a214553775d5", 0xa4}, {&(0x7f0000000440)="a126befed80cc2233fd8c17407", 0xd}, {&(0x7f0000000480)="748dc16fdcbc2e638aa052999c8ee4f01ba63679a1a3866e6495ca2a30a251315d7470c29596588a160450b604e0a70632f65dfbe7a032c3821c3358e575266ca3e481db541cffe0a02eb38bcaa804203f328bb7111c500b657276d9b79b9dcdd8abcbfd41011fe7880e7576ecd3f22572dcc94750fe6dd61a3bbb88ee243a3f1a926dd04ebf71de041c5a702c2b27420a7ea6781dcf5bc8e7988ac2800eb115c01b7dffc24b4bf046061eeaff2816e2", 0xb0}, {0x0}], 0x5, 0x200, 0x0, 0x15)
mount$bind(&(0x7f0000000240)='.\x00', &(0x7f0000000280)='./file0\x00', 0x0, 0x1005848, 0x0)


mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x0, 0x0)
renameat2(0xffffffffffffff9c, &(0x7f0000000440)='./file0\x00', 0xffffffffffffff9c, &(0x7f0000000480)='./file7\x00', 0x0)


syz_io_uring_setup(0x0, 0xfffffffffffffffd, 0x0, 0x0)


syz_mount_image$jfs(&(0x7f0000000040), &(0x7f0000000240)='./file7\x00', 0xc03, &(0x7f0000001a40)=ANY=[@ANYRES8=0x0, @ANYRESOCT=0x0, @ANYRES16, @ANYRES16=0x0, @ANYRESDEC, @ANYRESOCT=0x0, @ANYRES64], 0x2, 0x5f51, &(0x7f0000007240)="$eJzs3VtvHGf9B/DfHrw+9N/U6kWVf8QhTTm0lCaNkzYtp6ZC4gIEVKpyn8i4VUQKKAkVrSziKhL3SFyj8iK4BqHeIBWJl8AbiBT3hgpEB439PMl4vM7aJN6x/Xw+kjP722fG+0y+Hs+u5/AEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABDf/96bZ3sRcfnd9MRixP/FIKIfMV/XJ+OvEXExzz+MiOOx0RxPRcSx2Yh6+Y1/nog4HxEfH4u4u766XD+9tMt+vPTGnb/98M0frf32i3/5+78/+v2f2u1vfPTdH/zxVsTi8d/87j+3Hs26AwAAQCmqqqp6Gx/zI06kz/f9rjsFAExF3v9XSX5erX6Yehhbdd0ftVqtVm9XjXerWUTEWnOZ+j2Dw/EAcMisxaddd4EOyb9ow4h4rOtOAAdar+sOsC/urq8u91K+veb+4ORmez4XZEv+a71713fsNJ2kfY7JtH6+bscgntyhP/NT6sNBkvPvt/O/vNk+SvPtd/7TslP+o81Ln4qT8x+08285Ovn3x+Zfqpz/cE/5D+QPAAAAAAAHWP77/2LHx39nH35VduVBx39PTqkPAAAAAAAAAPCo7WL8v/rBjuP/3WP8PwAAADiw6s/qtQ+P3X9up3ux1c9f6kU83pofKEy6WGah634AAAAAAAAAAAAAQEmGm+fwXupFzETE4wsLVVXVX03teq8edvnDrvT1h5J1/UseAAA2fXysdS1/L2IuIi6le/3NLCwsVNXc/EK1UM3P5vezo9m5ar7xuTZP6+dmR7t4QzwcVfU3m2ss1zTp8/Kk9vb3q19rVA120bHp6DBwAIiIzb3RXXukI6aqnoiu3+VwONj+jx7bP7vR9c8pAAAAsP+qqqp66XbeJ9Ix/37XnQIApiLv/9vHBdRqtVqtVh+9uqka71aziIi15jL1ewbD8QPAIbMWn3bdBTok/6INI+J4150ADrRe1x1gX9xdX13upXx7zf1BGt89nwuyJf+13sZyeflx00na55hM6+frdgziyR3689SU+nCQ5Pz77fwvb7aP0nz7nf+07JR/vZ6LHfSnazn/QTv/lqOTf39s/qXK+Q/3lP9A/gAAAAAAcIDlv/8vOv6bVxkAAAAAAAAADp2766vL+brXfPz/c2Pmc/3n0ZTz78m/SDn/fjv/1gk5g8bjO6/fz/+T9dXlD///whfy9MDnPzMY1a890+sPhpvn/Pwz39p0JV7cNn89TzXzVlyNa7ESZ7e1z2xpX5rQfm5b+6hun8/tp2M5fh7X4if32mcnnBg1N6G9mtCe8x/Y/ouU8x82vur8F1J7rzWt3fmgv227b07Hvc7FH392YfvWNX23Y3Bv3Zrq9TvVQX82/k8eG8Uvb6xcP/2rKzdvXj8babLl2aVIk0cs5z+TvnL+zz6z2Z5/7ze31zsfjPac/0FxO4Y75v9M43G9vs9NuW9dyPmP0lfOP++Bxm//hzn/nbf/5zvoDwAAAAAAAAAAAAAAADxIVVUbl4hejIiX0/U/XV2bCQBMV97/V/lmGIlarVar1eqjVzdV473WLCLiz81l6vcMvx73zQCAg+yziPhH152gM/IvWL7fXz39UtedAabqxnvv//TKtWsr12/8L0tXg0ffIwAAAAAAAABgr/L4nycb4z9vnAfUGjd6y/ivr8fJT9ZXl99d/NfnD934n/3RYGOs87RCT0dzfO7tIxSfigeP/z2c8HozE9pHE9pnJ7TPTWgfe6FHQ87/6ZRxzv9EWrEHj
f+a829PJ7xkpx40/uuzHfSnazn/U2ms55z/V1vzNfOv/nCYx//tb8n/zM13fnHmxnvvv3D1nStvr7y98rNz55dePbt04fwrL5556+q1lfRvhz3eXzn/PPa180DLkvPPmcu/LDn/L6da/mXJ+X8l1fIvS84/v9+Tf1ly/vmzj/zLkvN/LtXyL0vO/2upln9Zcv7Pp1r+Zcn5fz3V8i9Lzv+FVMu/LDn/06mWf1ly/mdSLf+y5PzzES75lyXnn89skH9Zcv5LqZZ/WXL+51It/7Lk/M+nWv5lyfm/lGr5lyXn/3Kq95C/e38dATn/C6m2/Zcl5/9KquVflpz/q6mWf1ly/t9ItfzLkvP/ZqrlX5ac/7dSLf+y5Py/nWr5lyXn/524fzGp/MuR838t1bb/sty//78HHnjgQX7Q9W8mAAAAAAAAAAAAAKBtGqcTd72OAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/2UHDgQAAAAAgPxfG6Gqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq
qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqoKO3AgAAAAAADk/9oIVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVdi7uxi5yvt+4Gff7LUh9iYxhBcDa2PAwOJdv+CX/78OBkJKoW0ICfSN1Lj22jjxW71rAgiJjaApEkjlggtaKSkgVOUiVVCbqEGiEZUqtelNe9XeVGmlRhWKQuVEvWlUcHXmPM/jmdnZmV3vrj1zzueD8M87c2bm2TNnZvc71ncGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACg3oZ7J/+wL8uy/P/aHyNZdnn+91XZvvzLmZ2XeoUAAADAYn1Y+/Pba9MJ++Zxobpt/u76f/zuuXPnzmXZn6z71Kv5DEazbM3KLCvOC4Z/cqphm+D5bLivv+7r/g43P9Dh/MEO5w91OH9Fh/NXdjh/uMP5s3bALKuK12NqV7ap9teRYpdm67Kh2nmbWlzq+b6V/f3xtZyavtplzg0dzo5mx7LJbGLWZfpq/2XZuxvy27o/i7fVX3db67MsO/uzZw/GNfSFfbwpa7ixmvr77oO7s9Gf/+zZg0+O/OK6VrPjbpi10izbvDFf5wtZdv7lqqwvW5n2SVxnf90617dY50DDOvtql8v/3rzOs/NcZ/y+h8M6/6nNOteH0566McuymWzObZo9n/Vnq5tuNe3v4eKIyK8jvys/kQ0u6DjZMI/jJL/Mj29sPE6aj8m4/zeEfTI4xxrq744PvrZi1n6/0OMk/6674VjNr/uh/EaHh+tfWm04VvNtnr1p7mOg5X3X4hhIx3LdMbCx0zHQv2Kgdgz0n1/zxoZjYOusy/RnfbXbev+m9sfA+PTxU+NTTz9zx9HjB45MHpk8sX3Htj1bt+3asXti/PDRY5Phz4Xt0h6yOutPx+DG8FwTj8FbmratPyTPvVE8Dl6/etf1reZC1jC8RI+DxawhC8fLF2/OF3R5fzbHMZ5v88LmxT8O0s/9usfBYN3joOVzaovHweA8Hgf5Nmc3z+9n5mDd/63W0Oq5cCmOgZG6Y2AxPw/r13AhPw/z23zs1rmfC9eHdb1420J/Hg7MOgbit9UXHnv5Ken3veHdYb/MPi6uyc+4bEV2Zmry9JanDkxPn96ahXFRfLLuvmo+XlbXfU/ZrOOlf8HHy77PfbTrmhanj4R9NXx7+/sq32bHWPv7qvbs3np/Npy6LQtjiV3s/dnqp1m+P1OWaLM/821euGPxvwumXFL3/DfU6flvYGiweP4bSHtjqOH5b/ZdM1BbWZadvWN+z39D4f+L/fy3rkue//J99diW9sdAvs2L4ws9BgbbPv/dGGZfWM+tITEM1+X+j2rnzxSHad192fG4GRwcCsfNYLzFxuNm+6zL5NeW3/bmiQs7bjbf2HhfNfzeUsLjJt9Xr060P27ybd7buvjnjlXxr3XPHSs6HQNDAyvy9Q6lg6B4vju3Kh4DW7KD2cnsWHYoXSa/l/PbGts2v2NgRfj/Yj93XNUlx0C+r17b1v4YyLf52+1L+7vT5nBK2qbud
6fm1xfmyvzXDJ6/vubdttSZP1/nZ3a0f20o3+anOxaaM9rvp9vDKZe12E/Nj5+5julDWef9tFTHdL7OY3e2f20q32bdznkeT/uyLHvnpbeK17uK13f/4sw/f7fhdd9Wrym/89Jbn7/6hz9cyPoBALhwH9X+nFlR/K5Z9y/W8/n3fwAAAKAnxNzfH2Yi/wMAAEBpxNw/EGYi/wMAAEBpxNw/GGayr9O745XDE31ff/XD57L0boDngnh+fBnkoZXFdrHjPRO+Hj13Xn76PW8NPfi95+Z32/1Zlv3vA9e23P6JlXFdhVNxnQONp89y1Q3zuv3HHzm/Xf37J5ztL64/fj/zfRkodpXf/bd7atc7emsx33sgq82HZ158vnb9e4qv4/bv/0ex3TfCm5bsO9zXcPnNYT2bwhwN7ynz0Krz+yGf8XLfeffI33/60fO3Fy/Xt3FN7dt8bUtxvfE9ol75y2L7+H3Ptf6/fulb38m3f+qm1ut/rr/1+t8P1/vjMP/ng+L0+n3+vbr1/0FYf7y9eLktb/6g5frf/qti+7fDcfF6mM3rv/uPrvuw1f0Vb2ffYHG5ePsTf3Zf7XLx+uL1N69/ePyehv3RfP3vvVlcz94n/3ugfvt4eryd6PHBxuO7L9y/DT3yLMu+9fWsYT9nQ8Xl3mlaf7y+U4Ot13970zpPvfFE7fLN30/0zUfubfn9xvXs+/ORhu/nlTVh//Wv+of8et+/NhyP4fxfzBTX1/xepm+vaXy+idu/PlI8buP1jTet/5Wm9c/ckO+7zuu//+fF+t++a2XD+vetDcfTx4vZaf1H/nRtw+Xf+GyxntNfHTtxcurM0fgeByNNj+OVw6tWX3b5x9asDc+lzV/vPzn9xOTp0YnRiSwb7cG3DFzu9b8Z5n8VY2bpb6HwL4PFcffyg8XPrVuGiq9fCac/Hu7P+PPxm3881HC8Nt/vM8PFXOz6bwvrmK/1m36ye14b/ueOt1/914e/1Px7Qfx+Tl0xXPv+XttwZe28vveK85ufrzr59ysaH9c/WlfM74f9ei68M/PGK4vba77++N4kL3+hePzG3+Ti5bOm9xMZGWj8Pha7/h+F32N+cFXj8188Pr7/XNO7OY9kffkSZsLzQzZTnB+3ivv75bNXtry9+D482czVC1nmnKaenho/dvTEmafGpyenpsennn5m//GTZ05M76+9d+n+L3e6/PnH9+ra4/vQ5M4dWe3RfrIYy+xSr//UIwcP7Zq4+dDk4QNnDk8/cmry9JGDU1MHJw9N3Xzg8OHJr3a6/NFDe7du27N917axI0cP7d29Z8/2PWNHT5zMl1EsqoOdE18ZO3F6f+0iU3t37Nl65507JsaOnzw0uXfXxMTYmU6Xr/1sGssv/eTY6cljB6aPHp8cmzr6zOTerXt27tzW8d0fj586PDU6fvrMifEzU5Onx4vvZXS6dnL+s6/T5amGqbXh+a5JX/jt/L7bd6b3x8299bU5r6rYZKTxxJ+G94L6xvD23fP5Oub+oTAT//4PAAAApRFzf/h8ivOvu8v/AAAAUBrhA//CZ0b6938AAAAoo5j7h8NMKpL/9f/1/y+g/5/q2vr/+v+Z/r/+fwf6//r/7ej/6//38vr1//X/6azb+v8x96/KskrmfwAAAKiCmPtXh5nI/wAAAFAaMfdfFmYi/wMAAEBpxNx/eZhJRfK//r/+v8//L1//vy/LZvT/9f+7hf6//n87+v/6/728fv1//X8667b+f8z9HwszqUj+BwAAgCqIuX9NmIn8DwAAAKURc//aMBP5HwAAAEoj5v6RMJMy5f+75j5L/1//X/+/fP1/n/9f0P/vDvr/+v/t6P/r//fy+vX/9f/prNv6/zH3fzzMpEz5HwAAACou5v5PhJnI/wAAAFAaMfd/MsxE/gcAAIDSiLl/XZhJRfK//r/+v/6//r/+v/7/ctL/L0f/f1U4Wf+/kf6//r/+v/4/7XVb/z/m/ivCTCqS/wEAAKAKYu6/M
sxE/gcAAIDSiLn/U2Em8j8AAACURsz9V4WZVCT/6//r/+v/6//r/+v/Lyf9/3L0/+PJ+v+N9P/1//X/9f9pr9v6/zH3Xx1mUpH8DwAAAFUQc/81YSbyPwAAAJRGzP3XhpnI/wAAAFAaMfevDzOpSP7X/9f/1//X/9f/1/9fTvr/+v/t6P/r//fy+vX/9f/prNv6/zH3XxdmUpH8DwAAAFUQc//1YSbyPwAAAJRGzP03hJnI/wAAAFAaMfePhplUJP/r/+v/6//r/+v/6/8vJ/1//f929P/1/3t5/fr/+v901m39/5j7N4SZVCT/AwAAQBXE3L8xzET+BwAAgNKIuf/GMBP5HwAAAEoj5v5NYSYVyf/6//r/+v9l7f8P6P/r/3cF/X/9/3b0//X/e3n9+v/6/3TWbf3/mPtvCjOpSP4HAACAKoi5/+YwE/kfAAAASiPm/lvCTOR/AAAAKI2Y+zeHmVQk/+v/6//r/5e1/+/z//X/u4P+v/5/O/r/+v+9vH79f/1/Ouu2/n/M/beGmVQk/wMAAEAVxNx/W5iJ/A8AAAClEXP/7WEm8j8AAACURsz9Y2EmFcn/+v/6//r/+v/6//r/y0n/X/+/Hf1//f9eXr/+v/4/nXVb/z/m/jvCTCqS/wEAAKAKYu7fEmYi/wMAAEBpxNw/HmYi/wMAAEBpxNw/EWZSkfyv/6//r/+v/6//r/+/nPT/9f/b0f/X/+/l9ev/6//TWbf1/2Pu3xpmUpH8DwAAAD3quoVsHHP/tjAT+R8AAABKI+b+7WEm8j8AAACURsz9O8JMKpL/9f/1//X/9f/1//X/l5P+v/5/O/X9//yS+v9V6f/P9ZOmV9Zf0P/X/6ezbuv/x9x/Z5hJRfI/AAAAVEHM/TvDTOR/AAAAKI2Y+3eFmczO/39z8VYFAAAALKWY+3eHmfT8v//Pr1dVqf7/gfvTX/X/C/r/+v+Z/r/+/zLT/9f/b8fn/1e1/780LvX69f/1/+ms2/r/MffvCTPp+fwPAAAARDH3/78wE/kfAAAASiPm/v8fZiL/AwAAQGnE3P9LYSYVyf+V6v/X0f8v6P/r/2f6//r/y0z/X/+/Hf1//f9eXn/39v+/vTrL9P/pDt3W/4+5f2+YSUXyPwAAAFRBzP2fDjOR/wEAAKA0Yu6/K8xE/gcAAIDSiLl/X5hJRfK//r/+v/6//r/+v/7/ctL/1/9vZ3H9/1H9/0W61P35Xl9/9/b/ff4/3aPb+v8x998dZlKR/A8AAABVEHP/PWEm8j8AAACURsz994aZyP8AAABQGjH3fybMpCL5/6L1/1sUivX/9f8z/X/9f/1//f9F0v+vcv/f5/8v1qXuz/f6+vX/9f/prNv6/zH33xdmUpH8DwAAAFUQc/9nw0zkfwAAACiNmPt/OcxE/gcAAIDSiLn//jCTiuR/n/+v/6//r/+v/6//v5z0//X/29H/1//v5fXr/+v/01m39f9j7v+VMJOK5H8AAACogpj7Hwgzkf8BAACgNGLufzDMRP4HAACA0oi5/1fDTCqS//X/9f/1//X/9f/1/5eT/r/+fzv6//r/vbx+/X/9fzrrtv5/zP2/FmZSkfwPAAAAVRBz/6+Hmcj/AAAAUBox938uzET+BwAAgNKIuf+hMJOK5H/9f/1//X/9f/1//f/lpP+v/9+O/r/+fy+vX/9f/5/Ouq3/H3P/58NMKpL/AQAAoApi7n84zET+BwAAgNKIuf8LYSbyPwAAAPSAs/PaKub+L4aZVCT/6//r/+v/6/8vYf9/Rab/n+j/r6r9qf+v/9+O/r/+fy+vX/9f/5/Ouq3/H3P/I2EmFcn/AAAAUAUx9z8aZiL/AwAAQGnE3P8bYSbyPwAAAJRGzP2/GWZSkfyv/6//r/+v/+/z//X/l5P+v/5/O/r/+v+9vH79f/1/Ouu2/n/M/b8VZlKR/A8AAABVEHP/b4eZyP8AAABQGjH3/06YifwPAAAApRFz/2NhJhXJ/0X//9GD+
v8F/X/9f/1//f9I/39p6P/r/7ej/6//38vr1//X/6ezbuv/x9z/pTCTiuR/AAAAqIKY+383zET+BwAAgNKIuX9/mIn8DwAAAKURc//jYSYVyf8+/1//X/9f/38h/f9VLU7X/y/o/7em/6//347+f5n7/yuWZI2Xbv1zPWENpr/p/+v/01m39f9j7j8QZlKR/A8AAABVEHP/74WZyP8AAABQGjH3Hwwzkf8BAACgNGLuPxRmUpH8r/+v/6//r//fI5//P5QtR/9/Rv9/uZWk//+e/n9B/7+R/r/P/9f/1/+nvW7r/8fcPxlmUpH8DwAAAL1uPu86GnP/4TAT+R8AAABKI+b+I2Em8j8AAACURsz9T4SZVCT/d2P//wb9f/1//f90Pfr/Pv9f/789n/+v/5/p/1+wS92f7/X16//r/9NZt/X/Y+4/GmZSkfwPAAAAVRBz/5fDTOR/AAAAKI2Y+78SZiL/AwAAQGnE3H8szKQi+b8b+/+Z/r/+v/5/uh79f/1//f/29P/1/zP9/wt2qfvzvb5+/X/9fzrrtv5/zP3Hw0wqkv8BAACgCmLuPxFmIv8DAABAacTcfzLMRP4HAACA0oi5/1SYSU/m/745e7tz0f/X/++2/n9987LU/f+V+v/6//r/S0H/X/8/0/+/YJe6P9/r69f/1/+ns27r/8fc//thJj2Z/wEAAIBWYu4/HWYi/wMAAEBpxNw/FWYi/wMAAEBpxNw/HWbSPv/3L++qLh79f/3/buv/V/bz/wf1/yP9f/3/hdD/1///P/bua0evs4rj8OeIGDgAzhH3AuI+OOYK6JCY3kKvoYUaeu+9hQ6h9957b6FIQZpZazmD7b09I3+ed7/reU5WnHhmdmTH0V+jn/ZO/39ip93Pb/359f/6f9aN1v/n7n9A3OL7/wAAADCN3P0PjFvsfwAAAJhG7v4HxS32PwAAAEwjd/+D45Ym+//q9v/3PfIj/b/+f3ei/v/O9bFT9f/e/3/+11X/r/8/hq79f/5JqP8/pP8/mdPu57f+/Pp//T/rRuv/c/c/JG5psv8BAACgg9z9D41b7H8AAACYRu7+h8Ut9j8AAABMI3f/w+OWJvvf+//1/9vr/yd9/7/+v+j/9f/H0bX/T/r/Q/r/kzntfn7rz6//1/+zbrT+P3f/I+KWJvsfAAAAOsjdf13cYv8DAADANHL3Xx+32P8AAAAwjdz95+KWJvtf/6//1//r//X/+v990v/r/5fo/8ft/2/X/69+ff2//p91o/X/527YHez+wy/Tb/8DAABAB7n7HxW32P8AAAAwjdz9j45b7H8AAACYRu7+x8QtTfa//l//r//X/+v/9f/7pP/X/y/R/4/b/3v/v/5/7eP1/1yO0fr/3P2PjVua7H8AAADoIHf/4+IW+x8AAACmkbv/8XGL/Q8AAADTyN3/hLilyf7X/+v/9f/6f/2//n+f9P/6/yUb6f/jU1z4y6P/n7r/v/+9Vj7+kv3/mZ3+X/9PGK3/z93/xLilyf4HAACADnL3Pylusf8BAABgGrn7nxy32P8AAAAwjdz9N8QtTfa//l//r//X/+v/9f/7pP/X/y/ZSP9/Sfr/qfv/1a/v/f/6f9aN1v/n7n9K3NJk/wMAAEAHufufGrfY/wAAADCN3P1Pi1vsfwAAAJhG7v6nxy1N9r/+f73/v+5u659P/3/x59f/6//1//p//f8G+v+bL/IT9f+XRf/fqP+/64Ufr//X/7NutP4/d/8z4pYm+x8AAAA6yN3/zLjF/gcAAIBp5O5/Vtxi/wMAAMA0cvc/O265z253mRn7pun/vf9f/6//1//r//dJ/7+B/v9i9P+XRf/fqP+/CP2//p91o/X/ufufE7f4/j8AAABMI3f/c+MW+x8AAACmkbv/eXGL/Q8AAADTyN3//Lilyf7X/+v/9f9Xtf+/5y033U//H/9c/6//vxL0//r/nf7/xE67n9/68+v/9f+sG63/z91/Y9zSZP8DAABAB7n7X
xC32P8AAAAwjdz9L4xb7H8AAACYRu7+F8UtTfa//l//r//3/n/9/+HzPzJ+Pyb9/5Wh/9f/L7kC/f+Nd9f/n9hp9/Nbf379v/6fdaP1/7n7Xxy3NNn/AAAA0EHu/pfELfY/AAAATCN3/01xi/0PAAAA08jd/9K4pcn+1//r//X/+n/9v/f/75P+X/+/xPv/9f9bfn79v/6fdaP1/7n7Xxa3NNn/AAAA0EHu/pfHLfY/AAAATCN3/yviFvsfAAAAppG7/5VxS5P9r//X/+v/L9X/36T//z/6f/3/Sej/9f9L9P/6/y0/v/5f/8+60fr/3P2vilua7H8AAADoIHf/zXGL/Q8AAADTyN3/6rjF/gcAAIBp5O5/TdzSZP/r//X/+n/v/9f/6//3Sf+v/1+i/9f/b/n59f/6f9aN1v/n7n9t3NJk/wMAAEAHuftfF7fY/wAAADCN3P2vj1vsfwAAAJhG7v43xC1N9r/+X/+v/9f/6//1//uk/9f/L9H/6/+3/Pz6f/0/60br/3P3vzFuabL/AQAAoIPc/W+KW+x/AAAAmEbu/jfHLfY/AAAATCN3/1vilib7X/+v/9f/6//1//r/fdL/6/+X6P/1/1t+fv2//p91o/X/ufvfGrc02f8AAADQQe7+t8Ut9j8AAABMI3f/2+MW+x8AAACmkbv/HXFLk/2v/9f/6//1//p//f8+6f/1/0u21/9fe+RH+n/9v/5f/8+y0fr/3P3vjFua7H8AAADoIHf/u+IW+x8AAACmkbv/3XGL/Q8AAADTyN3/nrilyf7X/3fu/8+c2+30/zv9v/5f/79X+n/9/5Lt9f9H6f/1//p//T/LRuv/c/e/N25psv8BAACgg9z974tb7H8AAACYRu7+98ct9j8AAABMI3f/B+KWJvtf/9+5//f+f/3/0efU/+v/90H/r/9fov/X/2/5+Ufu/6/R/zOI0fr/3P0fjFua7H8AAADoIHf/h+IW+x8AAACmkbv/w3GL/Q8AAADTyN3/kbilyf7X/+v/9f/6/zv2/2cv/lv8gP5f/38S+n/9/xL9v/5/y88/cv/v/f+MYrT+P3f/R+OWJvsfAAAAOsjd/7G4xf4HAACAaeTu/3jcYv8DAADANHL33xK3NNn/+n/9v/5f/+/9//r/fdL/6/+X6P831//f6Y4/0P/r//X/rBmt/8/d/4m4pcn+BwAAgA5y938ybrH/AQAAYBq5+z8Vt9j/AAAAMI3c/Z+OW5rsf/2//l//r//X/+v/90n/r/9fov/fXP9/hP5f/6//Z81o/X/u/s/ELU32PwAAAHSQu/+zcYv9DwAAANPI3f+5uMX+BwAAgGnk7v983NJk/+v/9f/6/2P1/9fr//X/+v/j0f/r/5ec7//vvevR/18bf6H/n+H59f/6f9aN1v/n7v9C3NJk/wMAAEAHuftvjVvsfwAAAJhG7v4vxi32PwAAAEwjd/+X4pYm+1//r//X/3v/v/5f/79P+v/j9v9nj/Vc8/T/3v+/0/9v7vn1//p/1o3W/+fu/3Lc0mT/AwAAQAe5+78St9j/AAAAMI3c/V+NW+x/AAAAmEbu/q/FLU32v/5f/6//1//r//X/+6T/9/7/Jfp//f+Wn1//r/9n3Wj9f+7+r8ctTfY/AAAAdJC7/xtxi/0PAAAA08jd/824xf4HAACAaeTu/1bc0mT/6//1//p//b/+f5D+/8w5/f8J6P/1/zv9/4mddj+/9efX/+v/WTda/5+7/9txS5P9DwAAAB3k7v9O3GL/AwAAwDRy9383brH/AQAAYBq5+78XtzTZ/yv9fzVw+v9l+v/dwX8/+v+jn1//r//3/n/9v/5/mf5f/7/l59f/6/9ZN1r/n7v/+3HL+eF39vj/lgAAAMBIcvf/IG5p8v1/AAAA6CB3/w/jFvsfAAAAppG7/0dxS5P97/3/+n/v/9f/6//1//uk/9f/L9H/6/+3/Pz6f/0/60br/3P3/zhuabL/AQAAoIPc/T+JW+x/A
AAAmEbu/p/GLfY/AAAATCN3/8/ilib7X/+v/9f/6//1//r/fdL/6/+X6P/1/1t+fv2//p91o/X/uft/Hrc02f8AAACwfdes/ozc/b+IW+x/AAAAmEbu/l/GLfY/AAAATCN3/6/ilib7X/+v/9f/6//1//r/fdL/6/+X6P/1/1t+fv2//p91o/X/uft/Hbc02f8AAADQQe7+38Qt9j8AAABMI3f/b+MW+x8AAACmkbv/d3FLk/2v/99H/3+r/l//f0D/r//X/2+//z8Tf+Do/w/p/4/S/+v/9f/6f5aN1v/n7v993NJk/wMAAEAHufv/ELfY/wAAADCN3P1/jFvsfwAAAJhG7v4/xS1N9v88/X886RD9v/f/6/8P6f/1//r/7ff/Sf9/SP9/lP5f/6//1/+zbLT+P3f/n+OWJvsfAAAAOsjd/5e4xf4HAACAaeTu/2vcYv8DAADANHL3/y1uabL/5+n/g/5f/6//1//H39f/j0H/r/9fov/X/2/5+fX/+n/Wjdb/5+7/e9zSZP8DAABAB7n7/xG32P8AAAAwjdz9/4xb7H8AAACYRu7+2+KWC/b/2av4VFeP/l//v7/+//Z77Hb6f/2//l//r//X/1+a/l//v+Xn1//r/1k3Wv9/28H/a++y+9fBR/v+PwAAAMwod/+/4xb7HwAAAKaRu/8/cYv9DwAAANPI3f/fuKXJ/tf/6/+9/1//r//X/++T/l//v0T/r//f8vPr//X/rBut/8/d/78AAAD//7KvnKc=")
r0 = openat(0xffffffffffffff9c, &(0x7f0000000100)='./file1\x00', 0x0, 0x0)
fsync(r0)


mkdirat(0xffffffffffffff9c, &(0x7f0000002040)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000000080)='./file0\x00', &(0x7f00000000c0)='ramfs\x00', 0x2000000, 0x0)
chdir(&(0x7f0000000240)='./file0\x00')
r0 = open(&(0x7f0000000080)='./bus\x00', 0x141042, 0x0)
mknod(&(0x7f0000000040)='./file0\x00', 0x8001420, 0x0)
r1 = open$dir(&(0x7f0000000100)='./file0\x00', 0x2, 0x0)
r2 = open(&(0x7f0000000080)='./bus\x00', 0x0, 0x0)
splice(r1, 0x0, r0, 0x0, 0x1001, 0x0)
r3 = open$dir(&(0x7f0000000180)='./file0\x00', 0x7e, 0x0)
r4 = dup2(r3, r2)
write$FUSE_IOCTL(r4, &(0x7f0000000000)={0x20}, 0x20)
r5 = openat(0xffffffffffffff9c, &(0x7f0000000200)='./bus\x00', 0x141842, 0x0)
r6 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000180)='blkio.bfq.io_service_time_recursive\x00', 0x275a, 0x0)
write$binfmt_script(r6, &(0x7f0000000200), 0xfea7)
copy_file_range(r6, &(0x7f00000001c0), r5, 0x0, 0xffffffffa003e45b, 0x700000000000000)


r0 = epoll_create1(0x0)
epoll_wait(r0, &(0x7f0000000000)=[{}], 0x1, 0x0)


mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000000080)='./file0\x00', &(0x7f0000000940)='tmpfs\x00', 0x0, 0x0)
chdir(&(0x7f0000000380)='./file0\x00')
mkdir(&(0x7f0000000580)='./bus\x00', 0x0)
mkdir(&(0x7f00000002c0)='./file1\x00', 0x47)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000140)='./bus\x00', 0x0, 0x0)
ioctl$FS_IOC_FSSETXATTR(r0, 0x401c5820, &(0x7f00000000c0)={0x8})
rename(&(0x7f0000000180)='./file1\x00', &(0x7f00000001c0)='./bus\x00')
r1 = openat$smackfs_cipsonum(0xffffffffffffff9c, &(0x7f0000000080)='/sys/fs/smackfs/mapped\x00', 0x2, 0x0)
sendfile(r1, r0, &(0x7f0000000680)=0x1, 0x8000000000000000)
stat(&(0x7f0000000200)='./file2\x00', &(0x7f0000000240))
r2 = open(&(0x7f00000000c0)='./file0\x00', 0x81ff, 0x0)
fcntl$setlease(r2, 0x400, 0x0)
preadv(r2, &(0x7f0000000600)=[{&(0x7f0000000300)=""/88, 0x58}, {&(0x7f0000000980)=""/4096, 0x1000}, {&(0x7f00000003c0)=""/158, 0x9e}, {&(0x7f0000000480)=""/141, 0x8d}, {&(0x7f0000000540)=""/51, 0x33}, {&(0x7f00000005c0)=""/49, 0x31}], 0x6, 0x4, 0xffb)
r3 = open(&(0x7f0000000040)='./file0\x00', 0x113600, 0x60)
r4 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000240)='blkio.throttle.io_serviced\x00', 0x275a, 0x0)
fcntl$lock(r4, 0x7, &(0x7f0000000000))
fcntl$lock(r4, 0x25, &(0x7f0000000140)={0x0, 0x0, 0x800000000000})
fcntl$lock(r4, 0x26, &(0x7f0000000100)={0x2, 0x0, 0x0, 0x17f})
fcntl$setlease(r3, 0x400, 0x1)
writev(r1, &(0x7f0000002680)=[{&(0x7f00000025c0)='8', 0x1}, {&(0x7f0000002600)='h', 0x1}], 0x2)
ioctl$EXT4_IOC_PRECACHE_EXTENTS(r2, 0x6612)


syz_mount_image$vfat(&(0x7f0000000580), &(0x7f00000005c0)='./file0\x00', 0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="00faffffff"], 0x1, 0x576, &(0x7f0000000640)="$eJzs2sFqE10YBuDvb/u3xc1kYTfiYsCNq5L2ChykhWJAqGShKwebQsmEwgQCycK6c+XC2/ByvACvo4tCJJkQE41ubBk1zwPhvHDyhm82mbM4rx/2umeX/fOPH77E7pM0NiJi4zqiMU2V/2brxjRvx6J3AQD8bU5P86zuGbhbZZnlkzPczg877U+1DAQAAAAAAAAAAMBvc/8fANaP+///vrLM8u3Z+W2Z+/8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAfa7H42T8i0/d8wEAt8/7HwDWz4uXr55lrdbRaZruRhTvB+1Bu1qr/ew8LqKITjQjiZuI8VyVj09aR810qhF7vauqP1k3l/sHkURjdf+g6qfL/f/j3mL/MJK4v7p/uLK/HY8fLfT3I4nPb+IyijiLSfdb/+1Bmj593vquvzP9HgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKyT/XSuEXu9q0F70I6YrJvV/v7P9qt+dh4XUUQnmpHETcR4rsrHJ62j5uwHlvtb8WCr3mcHAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlvWHo25eFJ1SEARhHur+ZwIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOrSH466eVF0yn7dkwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAf4r+cNTNi6JT3mGo+xkBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAuH1fAwAA//9JmLyV")
lstat(&(0x7f0000000380)='./file0/../file0/../file0/../file0/../file0\x00', &(0x7f0000000bc0))


syz_mount_image$udf(&(0x7f00000000c0), &(0x7f0000000440)='./file0\x00', 0x1000a05, &(0x7f0000000100)=ANY=[@ANYBLOB="66696c657365743d3030303030303030303030313032352c726f6f746469723d30303030303030303030303030303030303030332c706172746974696f6e3d30303030303030303030303030303030303030302c6769643d69676e6f7265006e6f6164696e6963622cde6661736b3d30303030303030303030303030303030303030343732302c62733d30303230303030303030303230303030303030362c6769643d69676e6f72652c12e7090021b0d36853f83b3237931f7bf54d593a4b0e2a94b2208a0318c4095a29b12750ca47c7a60cd523accdee2710fd9dc6171d154f9426ce2b8e251b"], 0x0, 0xc3c, &(0x7f0000000ec0)="$eJzs3UFsHNd9B+D/G5FaUm4qJk4Vu43bdVukMmO5sqSYilW4q5pmG0CWiVDMLQBXJKUuTJEESTWykbZMLz30EKAoesiJQGsUSNHAqIugR7Z1geTiQ5FTT0QLG0HRA1sECFDA2GJm30pLirIYUZQo+/ts6rc7897sezPDGYrQmxcAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQMTvvHL+5PPpYbcCAHiQLk589eQp938A+ES55O//AAAAAAAAAAAAAABw0KUo4vFIsXhxM01V7zsGLrTmr9+YHB3budpgqmoeqsqXXwPPnzp95ksvjJzt5kfXv9+ejNcmLp2vv7xwbXFpdnl5dqY+Od+aXpgpdr2FnevP3nOLhssdUIuI6zNXrizXTz13esvqG0Mf1B47NnRu5JkTT3XK3qhPjo6NTfSU6eu/50+/zZ1GeByOIk5Eime/9+PUjIhqh+1xX9zl3Nlvg1UnhqtOTI6OVR2ZazXnV8qV490dUUTUeyo1uvvoARyLPWlErJbNLxs8XHZvYrG51Lw8N1sfby6ttFZaC/PjqdPasj/1KOJsiliLiI3a7ZvrjyL6IsV3jm6myxFxqLsfvlgNDL5zO3b/nbUvynbW+yPWikfgmB1gtSji1Ujxk3ePx3S5z/JXfCHi1TK/H/FWmS9FpPLEOBPx/g7nEY+mvijiz8rjf24zzVTXg+515cLX6l+Zv7LQU7Z7Xdnb/aGdz54Hfn8Y3JYPxgG/Ng1EEc3qir+Z7v2HHQAAAAAAAAAAAAAAAADut8Eo4slI8cq//UE1rjiqcelHz4387tCnbo4ZTxFP3GU7Kf4vnouI1WJ3Y3IP5yHE42k8pYc8lviTbCCK+MM8/u9bD7sxAAAAAAAAAAAAAAAAAAAAn2hF/ChSvPje8bQWvXOKt+av1i81L891ZoXtzv3bnTO93W6366mTjZxTOVdzruVcz7mRM4pcP2cj51TO1ZxrRUQtItbz+42ccSjXz9nIOZVzNedazvWcGzmjL9fP2cg5lXM151rO9ZwbOeOAzN0LAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPBxUkQRH0aKb39jM0WKiEbEVHRyvfawWwcAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlGqpiHciRf33GjeX9UVEqv7vOF7+cSYah8v8TDRGynwpGudzNqvsa3zrIbSfvelPRfwwUtQG3r55wD/VOf79nXc3T4N465u33v1iXycPdVcOfVB77NjRcyNjv/zEnV6nnRowfKE1f/1GfXJ0bGyiZ3Ff/vTP9Cwbyp9b3J+uExHLb7z5enNubnbp3l+Up8Aeqnux6xeH7ed7e9HOZ/vPWj36DkDjH9KLh3xh4oEo7//vR4rffO/fuzf87s9/P9d5d/MOHz/9o1v3/xe3b2if7v+P9yx7Mf800t8XMbBybbH/WMTA8htvnmhda16dvTo7f+bkyS+PjHz59Mn+wxEDV1q/EhGdV3OzJ/e8qwAAAAAAAAAAAAAAAAAerFTEb0eK5g83Uz0iblTjtYbOjTxz4qlDcagab7Vl3NZrE5fO119euLa4NLu8PDtTn5xvTS/MzO724waq4V6To2P70pm7Gtzn9g8OvLyw+MZS6+rvr+y4/sjA+cvLK0vN6Z1Xx2AUEY3eJcNVgydHx6pGz7Wa81XV8R0H0/3s+lMR/xEpps/U09N5WR7/t32E/5bx/6vbN7RP4/8+3bOs/MyUivhppPiNP38inq7aeSRu22e53F9HiuGzn8/l4nBZrtuGznMFOiMDy7L/Eyn+/sOtZbvjIR+/Vfb5Xe/YR0R5/I9Ginf+9Lvxq3lZ35bnP+x8/I9s39Auj3/7j7fVu8vx/2zPsiNbnldwf/r/SVce/xOR4qXH345fy8v6PuL5H91nbxzPhW8+n2Ofvv/bPZWG8uf++n3qOwAAAAAAAAAAwKOsPxXxN5HiqbG+9EJetpt//zezfUP79O+/PtezbOb+zFd01xdhiikAAAAAPib6UxE/ihRXV96+OYZ66/jvnvGfv3XrF2Ojadva6vd8P189N+B+/v6v11D+3Km9dxsAAAAAAAAAAAAAAAAAAAAOlJSKeCHPpz61w3zqgz3l1iPFK//1bC6XjpXluvPAD1V/DlxcmD9xfm5uYbq50rw8N1ufWGxOz5Z1PxspNv/q87luUc2v/vS2tnTnYl+KFGN/2y3bmYu9Ozd5Zz7wzlzsZdlPR4r//LutZbvzWP/CrbKnyrJ/GSm+/o87lz12q+zpsux3I8UPvl7vlj1Slu0+H/Vzt8o+N73gUaEAAAAAAAAAAAAAAAAAAADsXX8q4k8ixX9fW7s5lj/P/9/f87by1jd75vvf5kY1z/9QNf//nV7fy/z/Q7fX+LDdEdE+eo+9BgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAgy1FEW9GisWLm2m9Vr7vGLjQmr9+Y3J0bOdqg6mqeagqX34NPH/q9JkvvTBytpsfXf9+ezJem7h0vv7ywrXFpdnl5dmZ+uR8a3phZnbXW9hr/e2Gqx1Qv/b69ZkrV5brp547vWX1jaEPao8dGzo38syJp7plJ0fHxiZ6yvT13/On3ybdYfnhKOIvIsWz3/tx+qdaRBF73xd3OXf222DVieGqE5OjY1VH5lrN+ZVy5Xh3RxQR9Z5Kje4+egDHYk8aEatl88sGD5fdm1hsLjUvz83Wx5tLK62V1sL8eOq0tuxPPYo4myLWImKjdvvm+qOI1yPFd45upn+uRRzq7ocvXpz46slTd25HsY993IWynfX+iLXiEThmB1gtiviHSPGTd4/Hv9Qi+qLzFV+IeLXM70e8FZ3jncoT40zE+zucRzya+qKI/y2P/7nN9G6tvB50rysXvlb/yso7vWW715VH/v7wIB3wa9NAFPGD6oq/mf7V9zUAAAAAAAAAAAAAAADAAVLEL0WKF987nqrxwd1B0Uut+av1S83Lc51hfd2xf93V7Xa7XU+dbOScyrmacy3nes6NnFHk+jkbZQ6021P5/WrOtZzrOTdyxqFcP2cj51TO1ZxrOddzbuSMvlw/ZyPnVM7VnGs513Nu5IwDMnYPAAAAAAAAAAAAAAAAAAD4eCmq/1J8+xubqV3rzC89FZ1cNx/ox97/BwAA//8Mp/j3")
chdir(&(0x7f0000000380)='./file0\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000100)='.\x00', 0x0, 0x0)
fstatfs(r0, &(0x7f0000000280)=""/16)


r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
getdents64(r0, &(0x7f00000001c0)=""/202, 0xff4)


r0 = syz_io_uring_setup(0x5, &(0x7f0000000140)={0x0, 0x0, 0x3e40}, &(0x7f0000ffb000), &(0x7f0000fff000))
io_uring_register$IORING_REGISTER_RESTRICTIONS(r0, 0xb, &(0x7f0000000440)=[@ioring_restriction_sqe_flags_allowed, @ioring_restriction_sqe_flags_allowed], 0x2)


r0 = syz_open_procfs(0x0, &(0x7f0000000040)='task\x00')
fstat(r0, &(0x7f0000000180))


syz_mount_image$fuse(0x0, &(0x7f00000000c0)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
lchown(&(0x7f0000000680)='./file0\x00', 0x0, 0xee00)


syz_mount_image$vfat(&(0x7f0000003880), &(0x7f0000000e80)='./file0aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\x00', 0x0, &(0x7f00000001c0)={[{@utf8}, {@utf8}, {@uni_xlate}, {@fat=@codepage={'codepage', 0x3d, '936'}}, {@iocharset={'iocharset', 0x3d, 'maccenteuro'}}, {@uni_xlateno}, {@shortname_win95}, {@shortname_win95}, {@iocharset={'iocharset', 0x3d, 'macgaelic'}}, {@numtail}, {@iocharset={'iocharset', 0x3d, 'iso8859-4'}}, {@uni_xlate}, {@utf8no}, {@shortname_mixed}, {@shortname_lower}, {@shortname_lower}]}, 0x1, 0x2ad, &(0x7f0000000580)="$eJzs3c9qY1UYAPDvpmly1UWycCWKF3ThapjOE6RIBwazUrJQFyrODMgkCDNQ8A/GWQmu3Lj0CQRhdr6EG9/ABxDc2UXhyE3ubZKapL3QtP75/TY9Ped893znntuWLu6XD1+ePLpfxMOnX/4WeZ5FaxCDOMmiH62ofR0rBt8FAPBvdpJS/JHmmsRlEZHvLi0AYIca//1/tvOUAIAde+fd9946HA6P3i6KPO5Ovjkelf/Zl1/n44cP45MYx4O4Hb04jUhn5u27KaVpuyj14/XJ9HhURk4++KW6/uHvEbP4g+hFf9a1Gn9veHRQzC3FT8s8nq/WH5Txd6IXL65Z/97w6M6a+Bh14o3XlvK/Fb349eP4NMZxf5bEIv6rg6J4M33/5xfvl+mV8dn0eNSdzVtIe9d8NAAAAAAAAAAAAAAAAAAAAAAA/IfdqmrndGNWv6fsqurv7J2W3+xHUeuv1ueZx2f1hc7VB5qm+KGur3O7KIpUTVzEt+OldrRvZtcAAAAAAAAAAAAAAAAAAADwz/Lks88ffTQeP3h8JY26GkD9Wn/z6zybRQ2Wel6N7VHdxVqtqrllidir52QRW/MpN3FFt+WixnObcv7xp6YXzC+es1+ulW+aUx5hw0Xr417qibonW38Pu2eT8/oh+bnuSSmlTlxy9c6modTo8eusHeo1PsrOC7PGdMucyDYl9u0ri5/Laig7v4vOyn1eaexXjdh0gnmj5/nvvysy1ToAAAAAAAAAAAAAAAAAAGCnFi/9rhl8ujW0lbo7SwsAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAArtXi8/8bNKZV8CUmd+LxkxveIgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP8DfwUAAP//o0VbPQ==")
r0 = openat(0xffffffffffffff9c, &(0x7f0000004400)='./bus\x00', 0x6b142, 0x0)
pwritev2(r0, &(0x7f0000000100)=[{&(0x7f0000000040)='\x00', 0x1}], 0x1, 0x8000000, 0x0, 0x0)
pwritev2(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
truncate(&(0x7f0000000400)='./bus\x00', 0x0)
mkdir(&(0x7f0000000100)='./file0\x00', 0x0)
ioctl$EXT4_IOC_MOVE_EXT(0xffffffffffffffff, 0x40305829, 0x0)
statfs(&(0x7f0000000000)='./file0\x00', &(0x7f0000001f80)=""/234)


syz_io_uring_setup(0x0, 0xfffffffffffffffe, 0x0, 0x0)


syz_mount_image$udf(&(0x7f0000000040), &(0x7f0000000500)='./file0\x00', 0x18008, &(0x7f0000000000)=ANY=[@ANYRES32=0x0, @ANYRESDEC], 0xfe, 0x4b1, &(0x7f0000001d00)="$eJzs201sVNUbx/HfM3c6TIf+/5YXCxgCTTSxgkBfsEBqYnix0YQXLVQj8SWVTrHSdkinKCUgLNWdC5Yu3bpwZdwaEpfGhcEYFibIxs2sxB3m3LlvM5TOjG1nKP1+CJx7zzx3OOc8c+ecM5kRAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACQjrx6uLfPWt0KAADQTCdPj/QOMP8DALCmnGH/DwAAsJaYPP0u054LJTvhn5dlj0/OXLo8emx44cvaTaaUPD/e/c329Q/sf2nwwMGwXPz65bZNp06fOdx9tDB9cTZfLObHu0dnJs8VxvN1P8NSr6+2yx+A7ukLl8YnJord/XsHKh6+3Hlv3fquzqHB945mwtjRY8PDpxMx6bb//L8/5FEr/Iw8vSDTx99/ayclpbT0sajx2llp7X4ndvmdGD027HdkanJsZs49aKkgKlU5JplwjJqQiyVJSa5dllmePVubPP0g05F9JTslyQvHYbf/wXBd7WmFtNu6SurRKsjZY2ydPH0g0619nXojGFc//xnpaqsbhxWXDu7/gpXsTf/9wN1P7m3z+Fvdr89MFBKxlgruqNU+PzTTY/7elJWnU/4dX7IR7Wx1c9Bk7fI0LVPmq0/8dYX8delTQwd27DyUXGFsqfE8LnZvcHPVMye3BUsHS7k/y98v1Cdrnv6U6f5vWf+8J5wDpBsPFrvwj6Y0DyvNPE3J9M+1klnVvtRL7O8jq33uX9n2t2ePFi7Oz06e/2huwcdz2cMfFudmx84t/HB57+ola2rtY6ulGtuS5ay84/v801J0XbAH+F/5LG7NN1fj10JPVRlKvn7qOa57F9vAOsq1yczTXZkm3t9anmeUa3hs1gKX/2GZiqWfLcx0kP90+SyR/5fj8ctaZRnxc/v/8uda4Vpi29nNj6pfify7Nrn8vyPTkb+3Bp9plPPvVcW6uC6Z3r25PYhLZVxcOuxO+RknJqfyvS72gUwbfwpj5cfmgthNcWyfiy3K9MWtytj1QezmOLbfxd6W6c6vC8c+HccOuNh5l6873WFszsXuCGK74ti95wpT47WG1eW/X6a3r79mYZ8fmf/E/X+jqow8lPPFj5cr/52JuhtBXs8G+U/XyP+XMs3/tT3stz/24ctqg/9vnH+3Vv7uZmVsuKHcGMf21dutVnP53yDTvVduR30O+hacxhlK5v+ZdGUZjWuL8r8hUdcZtCvT4FisRcX5KxfGpqbysxxwwAEH0UGr35nQDG7+H3Gz+qBn4TommP87ymfxiun+Z/H8P1RVRlo0/29M1A0Fq5a2tJSdm77YtkXKFuev7JmcHjufP5+fGdg/2Nt/aH/vwMG2TLi4i4/qHrsngcv/bpmu/fhLtI+pXP8tvP7PVZWRFuV/U7JPFeuauodiTXL575Bp8O7taL+52Po/3P/3PFtZRvdfi/K/OVHXGbSro8GxAAAAAAAAAAAAAAAAAIDVJGeenpPp8siLFv6GqJ7v/41XlZHl//5X+YfJNb7/1ZWoG2/S7xoaGmgAAAAAAAAAAIAmScnT1zI9r5JddxUd0olkiSfavwEAAP//G6xIAA==")
unlink(&(0x7f0000000140)='./file1\x00')

__percpu_counter_compare25%of 8
__percpu_counter_init_many34%of 9
__percpu_counter_limited_add18%of 35
__percpu_counter_sum60%of 5
compute_batch_value---of 1
percpu_counter_add_batch91%of 11
percpu_counter_cpu_dead---of 6
percpu_counter_destroy_many---of 17
percpu_counter_fixup_free---of 3
percpu_counter_set---of 5
percpu_counter_sync---of 1
-----------
SUMMARY36%of 68

ovl_cache_entry_new---of 15
ovl_cache_free---of 4
ovl_cache_put---of 9
ovl_cache_update---of 33
ovl_check_d_type---of 4
ovl_check_d_type_supported---of 3
ovl_check_empty_dir---of 14
ovl_cleanup_whiteouts---of 9
ovl_dir_cache_free---of 5
ovl_dir_fsync---of 6
ovl_dir_llseek18%of 17
ovl_dir_open---of 4
ovl_dir_read---of 13
ovl_dir_read_merged---of 9
ovl_dir_real_file---of 7
ovl_dir_release---of 5
ovl_fill_merge---of 25
ovl_fill_plain---of 4
ovl_fill_real15%of 20
ovl_indexdir_cleanup---of 25
ovl_iterate12%of 88
ovl_workdir_cleanup---of 27
shared_ovl_iterate100%of 1
-----------
SUMMARY14%of 126

__register_nls---of 6
char2uni---of 1
find_nls---of 8
load_nls---of 3
load_nls_default---of 3
uni2char---of 5
unload_nls---of 3
unregister_nls---of 6
utf16s_to_utf8s---of 33
utf32_to_utf820%of 21
utf8_to_utf32---of 20
utf8s_to_utf16s---of 16
-----------
SUMMARY20%of 21

-----------
SUMMARY---of 0

__sysfs_match_string---of 15
devm_kasprintf_strarray---of 10
devm_kfree_strarray---of 5
kasprintf_strarray---of 9
kfree_strarray---of 5
kstrdup_and_replace---of 7
kstrdup_quotable---of 26
kstrdup_quotable_cmdline---of 14
kstrdup_quotable_file---of 5
match_string---of 6
memcpy_and_pad67%of 3
parse_int_array_user---of 6
skip_spaces67%of 3
strim---of 7
string_escape_mem---of 63
string_get_size---of 19
string_unescape---of 31
strreplace---of 6
sysfs_streq---of 11
-----------
SUMMARY67%of 6

__ia32_compat_sys_ftruncate---of 4
__ia32_compat_sys_open---of 5
__ia32_compat_sys_openat---of 5
__ia32_compat_sys_truncate---of 1
__ia32_sys_access---of 1
__ia32_sys_chdir---of 1
__ia32_sys_chmod---of 5
__ia32_sys_chown---of 1
__ia32_sys_chroot---of 1
__ia32_sys_close---of 7
__ia32_sys_close_range---of 1
__ia32_sys_creat---of 1
__ia32_sys_faccessat---of 1
__ia32_sys_faccessat2---of 1
__ia32_sys_fallocate---of 4
__ia32_sys_fchdir---of 1
__ia32_sys_fchmod---of 7
__ia32_sys_fchmodat---of 5
__ia32_sys_fchmodat2---of 1
__ia32_sys_fchown---of 1
__ia32_sys_fchownat---of 1
__ia32_sys_ftruncate---of 4
__ia32_sys_lchown---of 1
__ia32_sys_open---of 5
__ia32_sys_openat---of 5
__ia32_sys_openat2---of 1
__ia32_sys_truncate---of 1
__se_sys_chdir40%of 5
__se_sys_chroot25%of 8
__se_sys_fchdir43%of 7
__se_sys_openat2---of 11
__x64_compat_sys_ftruncate---of 4
__x64_compat_sys_open---of 5
__x64_compat_sys_openat---of 5
__x64_compat_sys_truncate---of 1
__x64_sys_access---of 1
__x64_sys_chdir100%of 1
__x64_sys_chmod40%of 5
__x64_sys_chown100%of 1
__x64_sys_chroot100%of 1
__x64_sys_close43%of 7
__x64_sys_close_range---of 1
__x64_sys_creat100%of 1
__x64_sys_faccessat100%of 1
__x64_sys_faccessat2100%of 1
__x64_sys_fallocate50%of 4
__x64_sys_fchdir100%of 1
__x64_sys_fchmod43%of 7
__x64_sys_fchmodat40%of 5
__x64_sys_fchmodat2---of 1
__x64_sys_fchown100%of 1
__x64_sys_fchownat100%of 1
__x64_sys_ftruncate40%of 5
__x64_sys_lchown100%of 1
__x64_sys_open60%of 5
__x64_sys_openat80%of 5
__x64_sys_openat2---of 1
__x64_sys_truncate100%of 1
__x64_sys_vhangup---of 3
break_lease40%of 5
build_open_flags45%of 20
build_open_how---of 5
chmod_common28%of 11
chown_common45%of 18
dentry_create---of 5
dentry_open40%of 5
do_dentry_open43%of 70
do_faccessat21%of 34
do_fchmodat---of 6
do_fchownat50%of 8
do_ftruncate25%of 12
do_sys_ftruncate---of 5
do_sys_open---of 5
do_sys_openat250%of 6
do_sys_truncate43%of 7
do_truncate43%of 7
file_open_name---of 7
file_open_root58%of 7
file_path---of 1
filp_close100%of 1
filp_flush67%of 6
filp_open---of 8
finish_no_open---of 1
finish_open---of 3
fsnotify_file_area_perm40%of 5
fsnotify_modify23%of 9
generic_file_open75%of 4
kernel_file_open---of 4
ksys_fallocate---of 4
ksys_fchown38%of 8
nonseekable_open---of 1
sb_end_write40%of 10
sb_start_write40%of 10
stream_open---of 1
vfs_fallocate24%of 26
vfs_fchmod---of 4
vfs_fchown---of 5
vfs_open100%of 1
vfs_truncate17%of 12
-----------
SUMMARY41%of 376

-----------
SUMMARY---of 0

__btrfs_ioctl_snap_create---of 16
_btrfs_ioctl_send---of 6
_btrfs_ioctl_set_received_subvol---of 36
btrfs_check_ioctl_vol_args_path---of 1
btrfs_compat_ioctl---of 1
btrfs_dev_name---of 7
btrfs_exclop_balance---of 6
btrfs_exclop_finish---of 1
btrfs_exclop_start---of 3
btrfs_exclop_start_try_lock---of 4
btrfs_exclop_start_unlock---of 1
btrfs_fileattr_get---of 1
btrfs_fileattr_set---of 39
btrfs_ioctl---of 65
btrfs_ioctl_add_dev---of 15
btrfs_ioctl_balance---of 26
btrfs_ioctl_balance_ctl---of 5
btrfs_ioctl_balance_progress---of 5
btrfs_ioctl_default_subvol---of 12
btrfs_ioctl_defrag---of 14
btrfs_ioctl_dev_info---of 39
btrfs_ioctl_dev_replace---of 12
btrfs_ioctl_encoded_read---of 22
btrfs_ioctl_encoded_write---of 23
btrfs_ioctl_fitrim---of 25
btrfs_ioctl_fs_info---of 23
btrfs_ioctl_get_dev_stats---of 7
btrfs_ioctl_get_features---of 1
btrfs_ioctl_get_fslabel---of 4
btrfs_ioctl_get_subvol_info---of 13
btrfs_ioctl_get_subvol_rootref---of 18
btrfs_ioctl_get_supported_features---of 1
btrfs_ioctl_ino_lookup---of 8
btrfs_ioctl_ino_lookup_user---of 28
btrfs_ioctl_ino_to_path---of 9
btrfs_ioctl_logical_to_ino---of 11
btrfs_ioctl_qgroup_assign---of 9
btrfs_ioctl_qgroup_create---of 11
btrfs_ioctl_qgroup_limit---of 7
btrfs_ioctl_quota_ctl---of 8
btrfs_ioctl_quota_rescan---of 6
btrfs_ioctl_quota_rescan_status---of 4
btrfs_ioctl_quota_rescan_wait---of 3
btrfs_ioctl_resize---of 38
btrfs_ioctl_rm_dev---of 13
btrfs_ioctl_rm_dev_v2---of 16
btrfs_ioctl_scrub---of 10
btrfs_ioctl_scrub_cancel---of 3
btrfs_ioctl_scrub_progress---of 5
btrfs_ioctl_set_features---of 13
btrfs_ioctl_set_fslabel---of 7
btrfs_ioctl_set_received_subvol---of 4
btrfs_ioctl_set_received_subvol_32---of 5
btrfs_ioctl_snap_create---of 5
btrfs_ioctl_snap_create_v2---of 10
btrfs_ioctl_snap_destroy---of 35
btrfs_ioctl_space_info---of 26
btrfs_ioctl_start_sync---of 7
btrfs_ioctl_subvol_getflags---of 4
btrfs_ioctl_subvol_setflags---of 15
btrfs_ioctl_tree_search---of 5
btrfs_ioctl_tree_search_v2---of 6
btrfs_ioctl_wait_sync---of 4
btrfs_is_empty_uuid---of 17
btrfs_may_delete---of 16
btrfs_mksnapshot---of 3
btrfs_mksubvol---of 16
btrfs_search_path_in_tree---of 10
btrfs_sync_inode_flags_to_i_flags67%of 3
btrfs_update_ioctl_balance_args---of 7
check_feature_bits---of 8
copy_to_sk---of 21
create_snapshot---of 26
create_subvol---of 32
d_delete_notify---of 4
exclop_start_or_cancel_reloc---of 9
file_end_write---of 11
file_start_write---of 11
fsnotify_access---of 9
fsnotify_mkdir---of 7
fsnotify_modify---of 9
init_sync_kiocb---of 8
key_in_sk---of 3
rcu_read_unlock---of 6
search_ioctl---of 16
-----------
SUMMARY67%of 3

btrfs_alloc_subpage---of 5
btrfs_attach_subpage---of 31
btrfs_detach_subpage---of 32
btrfs_folio_assert_not_dirty13%of 24
btrfs_folio_clamp_clear_checked---of 14
btrfs_folio_clamp_clear_dirty---of 10
btrfs_folio_clamp_clear_ordered---of 14
btrfs_folio_clamp_clear_uptodate---of 14
btrfs_folio_clamp_clear_writeback---of 8
btrfs_folio_clamp_set_checked---of 14
btrfs_folio_clamp_set_dirty---of 8
btrfs_folio_clamp_set_ordered22%of 14
btrfs_folio_clamp_set_uptodate---of 14
btrfs_folio_clamp_set_writeback---of 8
btrfs_folio_clamp_test_checked---of 14
btrfs_folio_clamp_test_dirty---of 14
btrfs_folio_clamp_test_ordered---of 14
btrfs_folio_clamp_test_uptodate---of 16
btrfs_folio_clamp_test_writeback---of 14
btrfs_folio_clear_checked---of 14
btrfs_folio_clear_dirty20%of 10
btrfs_folio_clear_ordered---of 14
btrfs_folio_clear_uptodate---of 14
btrfs_folio_clear_writeback---of 8
btrfs_folio_dec_eb_refs---of 20
btrfs_folio_end_writer_lock12%of 17
btrfs_folio_inc_eb_refs12%of 18
btrfs_folio_set_checked---of 14
btrfs_folio_set_dirty25%of 8
btrfs_folio_set_ordered---of 14
btrfs_folio_set_uptodate---of 14
btrfs_folio_set_writeback25%of 8
btrfs_folio_start_writer_lock9%of 24
btrfs_folio_test_checked---of 14
btrfs_folio_test_dirty22%of 14
btrfs_folio_test_ordered---of 14
btrfs_folio_test_uptodate25%of 16
btrfs_folio_test_writeback---of 14
btrfs_folio_unlock_writer---of 24
btrfs_free_subpage---of 1
btrfs_init_subpage_info---of 3
btrfs_is_subpage34%of 6
btrfs_subpage_assert---of 23
btrfs_subpage_clear_and_test_dirty---of 5
btrfs_subpage_clear_checked---of 11
btrfs_subpage_clear_dirty---of 3
btrfs_subpage_clear_ordered---of 12
btrfs_subpage_clear_uptodate---of 11
btrfs_subpage_clear_writeback---of 14
btrfs_subpage_dump_bitmap---of 11
btrfs_subpage_end_reader---of 9
btrfs_subpage_set_checked---of 12
btrfs_subpage_set_dirty---of 5
btrfs_subpage_set_ordered---of 11
btrfs_subpage_set_uptodate---of 12
btrfs_subpage_set_writeback---of 13
btrfs_subpage_start_reader---of 7
btrfs_subpage_test_checked---of 5
btrfs_subpage_test_dirty---of 5
btrfs_subpage_test_ordered---of 5
btrfs_subpage_test_uptodate---of 5
btrfs_subpage_test_writeback---of 5
folio_lock34%of 9
-----------
SUMMARY18%of 168

alloc_shrinker_info---of 15
do_shrink_slab---of 76
free_shrinker_info---of 12
reparent_shrinker_deferred---of 23
set_shrinker_bit25%of 20
shrink_slab---of 97
shrinker_alloc6%of 37
shrinker_free---of 14
shrinker_free_rcu_cb---of 1
shrinker_register50%of 4
-----------
SUMMARY15%of 61

-----------
SUMMARY---of 0

native_steal_clock---of 1
native_tlb_remove_table67%of 3
paravirt_disable_iospace---of 1
paravirt_set_sched_clock---of 1
-----------
SUMMARY67%of 3

-----------
SUMMARY---of 0

address_val---of 8
bdev_name16%of 13
bitmap_list_string---of 14
bitmap_string---of 13
bprintf---of 1
bstr_printf---of 53
clock---of 8
date_str---of 5
default_pointer---of 53
dentry_name---of 41
device_node_string---of 66
err_ptr---of 8
escaped_string---of 16
file_dentry_name---of 8
fill_ptr_key---of 1
flags_string---of 66
format_decode44%of 50
fourcc_string---of 33
fwnode_full_name_string---of 9
fwnode_string---of 27
hex_string---of 21
ip4_addr_string---of 7
ip4_addr_string_sa---of 17
ip4_string---of 40
ip6_addr_string---of 10
ip6_addr_string_sa---of 27
ip6_compressed_string---of 45
ip6_string---of 15
ip_addr_string---of 41
mac_address_string---of 26
netdev_bits---of 26
num_to_str---of 17
number31%of 89
pointer4%of 85
pointer_string---of 1
ptr_to_hashval---of 3
put_dec75%of 4
put_dec_full858%of 7
put_dec_trunc860%of 10
resource_string---of 111
restricted_pointer---of 31
rtc_str---of 17
scnprintf---of 4
simple_strntoll67%of 3
simple_strntoull60%of 5
simple_strtol---of 3
simple_strtoll---of 1
simple_strtoul100%of 1
simple_strtoull100%of 1
skip_atoi100%of 3
snprintf100%of 1
special_hex_number---of 1
sprintf100%of 1
sscanf100%of 1
string39%of 13
string_nocheck---of 7
symbol_string---of 15
time64_str---of 1
time_and_date---of 21
time_str---of 5
uuid_string---of 23
vbin_printf---of 78
vscnprintf50%of 4
vsnprintf30%of 67
vsprintf---of 1
vsscanf12%of 116
widen_string14%of 22
-----------
SUMMARY25%of 496

__dquot_alloc_space6%of 69
__dquot_drop---of 22
__dquot_free_space11%of 39
__dquot_initialize5%of 46
__dquot_transfer---of 106
__quota_error---of 3
add_dquot_ref---of 15
do_proc_dqstats---of 4
dqcache_shrink_count---of 1
dqcache_shrink_scan---of 15
dqget---of 37
dqput---of 14
dquot_acquire---of 19
dquot_add_inodes---of 27
dquot_add_space---of 33
dquot_alloc---of 1
dquot_alloc_inode4%of 52
dquot_claim_space_nodirty17%of 18
dquot_commit---of 9
dquot_commit_info---of 3
dquot_destroy---of 1
dquot_disable---of 84
dquot_drop34%of 6
dquot_file_open50%of 4
dquot_free_inode7%of 32
dquot_get_dqblk---of 3
dquot_get_next_dqblk---of 5
dquot_get_next_id---of 11
dquot_get_state---of 13
dquot_initialize100%of 1
dquot_initialize_needed---of 12
dquot_load_quota_inode---of 21
dquot_load_quota_sb---of 44
dquot_mark_dquot_dirty---of 9
dquot_quota_disable---of 23
dquot_quota_enable---of 19
dquot_quota_off---of 1
dquot_quota_on---of 4
dquot_quota_on_mount---of 4
dquot_quota_sync29%of 21
dquot_reclaim_space_nodirty---of 18
dquot_release---of 13
dquot_resume---of 11
dquot_scan_active---of 11
dquot_set_dqblk---of 52
dquot_set_dqinfo---of 21
dquot_transfer16%of 13
dquot_write_dquot---of 6
dquot_writeback_dquots39%of 34
mark_all_dquot_dirty---of 25
mark_info_dirty---of 3
prepare_warning---of 8
quota_release_workfn---of 17
register_quota_format---of 1
srcu_read_lock_held---of 3
unregister_quota_format---of 4
-----------
SUMMARY13%of 335

-----------
SUMMARY---of 0

cmp_ex_search100%of 1
cmp_ex_sort---of 1
search_extable100%of 1
sort_extable---of 1
swap_ex---of 1
trim_init_extable---of 14
-----------
SUMMARY100%of 2

__fuse_get_request---of 4
copy_out_args---of 15
flush_bg_queue---of 16
folio_get---of 3
folio_mapped---of 16
folio_put---of 4
fuse_abort_conn---of 37
fuse_copy_args---of 16
fuse_copy_fill---of 23
fuse_copy_finish---of 6
fuse_copy_one---of 9
fuse_copy_page---of 116
fuse_dequeue_forget---of 8
fuse_dev_cleanup---of 1
fuse_dev_do_read---of 42
fuse_dev_do_write---of 202
fuse_dev_fasync---of 3
fuse_dev_ioctl---of 13
fuse_dev_open---of 1
fuse_dev_poll23%of 9
fuse_dev_read---of 4
fuse_dev_release---of 15
fuse_dev_splice_read---of 14
fuse_dev_splice_write---of 21
fuse_dev_wake_and_unlock---of 1
fuse_dev_write---of 4
fuse_get_req---of 28
fuse_get_unique---of 1
fuse_len_args---of 8
fuse_put_request---of 10
fuse_queue_forget---of 3
fuse_read_forget---of 65
fuse_read_interrupt---of 22
fuse_request_end---of 16
fuse_retrieve_end---of 1
fuse_set_initialized---of 1
fuse_simple_background---of 23
fuse_simple_request---of 72
fuse_wait_aborted---of 5
list_move---of 5
list_move_tail---of 5
put_page---of 14
queue_interrupt---of 8
-----------
SUMMARY23%of 9

__kthread_cancel_work_sync---of 11
__kthread_create_on_node---of 8
__kthread_create_worker---of 7
__kthread_init_worker---of 1
__kthread_parkme---of 4
__kthread_queue_delayed_work---of 13
free_kthread_struct---of 6
get_kthread_comm---of 7
kthread---of 8
kthread_associate_blkcg---of 32
kthread_bind---of 3
kthread_bind_mask---of 3
kthread_blkcg50%of 4
kthread_cancel_delayed_work_sync---of 1
kthread_cancel_work_sync---of 1
kthread_complete_and_exit---of 3
kthread_create_on_cpu---of 6
kthread_create_on_node---of 1
kthread_create_worker---of 1
kthread_create_worker_on_cpu---of 1
kthread_data---of 3
kthread_delayed_work_timer_fn---of 10
kthread_destroy_worker---of 6
kthread_flush_work---of 7
kthread_flush_work_fn---of 1
kthread_flush_worker---of 7
kthread_freezable_should_stop---of 8
kthread_func---of 4
kthread_insert_work34%of 27
kthread_is_per_cpu75%of 4
kthread_mod_delayed_work---of 8
kthread_park---of 8
kthread_parkme---of 3
kthread_probe_data---of 4
kthread_queue_delayed_work---of 7
kthread_queue_work43%of 7
kthread_set_per_cpu---of 8
kthread_should_park---of 3
kthread_should_stop---of 3
kthread_should_stop_or_park---of 4
kthread_stop---of 37
kthread_stop_put---of 4
kthread_unpark---of 6
kthread_unuse_mm---of 9
kthread_use_mm---of 9
kthread_worker_fn---of 48
kthreadd---of 22
set_kthread_struct---of 6
to_kthread---of 3
tsk_fork_get_node---of 3
-----------
SUMMARY41%of 42

__ia32_sys_mremap---of 1
__se_sys_mremap22%of 71
__x64_sys_mremap100%of 1
move_page_tables9%of 110
move_pgt_entry---of 31
move_vma24%of 59
vm_flags_clear---of 6
vma_to_resize27%of 15
-----------
SUMMARY17%of 256

-----------
SUMMARY---of 0

__import_iovec32%of 25
__iov_iter_get_pages_alloc---of 29
_copy_from_iter5%of 81
_copy_from_iter_flushcache---of 74
_copy_from_iter_nocache---of 74
_copy_mc_to_iter---of 79
_copy_to_iter5%of 81
bvec_npages---of 5
copy_compat_iovec_from_user---of 8
copy_page_from_iter75%of 4
copy_page_from_iter_atomic7%of 82
copy_page_to_iter40%of 5
copy_page_to_iter_nofault---of 80
dup_iter---of 5
fault_in_iov_iter_readable34%of 9
fault_in_iov_iter_writeable---of 9
find_subpage---of 16
get_page---of 9
import_iovec67%of 3
import_ubuf---of 4
iov_iter_advance30%of 10
iov_iter_aligned_iovec---of 7
iov_iter_alignment25%of 8
iov_iter_alignment_bvec---of 4
iov_iter_alignment_iovec---of 10
iov_iter_bvec67%of 3
iov_iter_bvec_advance---of 6
iov_iter_discard---of 3
iov_iter_extract_bvec_pages---of 34
iov_iter_extract_kvec_pages---of 31
iov_iter_extract_pages19%of 27
iov_iter_extract_xarray_pages---of 34
iov_iter_gap_alignment---of 9
iov_iter_get_pages2---of 4
iov_iter_get_pages_alloc2---of 3
iov_iter_init67%of 3
iov_iter_iovec_advance50%of 10
iov_iter_is_aligned16%of 13
iov_iter_kvec---of 3
iov_iter_npages38%of 8
iov_iter_restore---of 7
iov_iter_revert19%of 11
iov_iter_single_seg_count---of 6
iov_iter_xarray---of 3
iov_iter_zero---of 76
iov_npages---of 9
iovec_from_user36%of 17
iter_xarray_get_pages---of 34
page_copy_sane15%of 14
want_pages_array---of 6
xas_next---of 12
xas_reload---of 24
-----------
SUMMARY16%of 414

-----------
SUMMARY---of 0

__get_user_pages21%of 81
__gup_longterm_locked6%of 173
__mm_populate39%of 18
_compound_head---of 7
check_vma_flags18%of 29
fault_in_readable50%of 12
fault_in_safe_writeable---of 11
fault_in_subpage_writeable---of 12
fault_in_writeable---of 12
faultin_page_range---of 46
fixup_user_fault---of 37
folio_add_pin---of 12
folio_lock---of 9
folio_put_refs---of 3
follow_page---of 5
follow_page_mask2%of 161
follow_page_pte16%of 133
follow_pfn_pte---of 7
get_dump_page---of 12
get_user_pages---of 44
get_user_pages_fast---of 3
get_user_pages_fast_only---of 3
get_user_pages_remote---of 50
get_user_pages_unlocked---of 44
gup_fast_devmap_leaf---of 11
gup_fast_fallback15%of 182
gup_fast_folio_allowed20%of 20
gup_fast_undo_dev_pagemap---of 16
gup_must_unshare10%of 33
gup_put_folio---of 18
is_valid_gup_args30%of 10
pin_user_pages---of 3
pin_user_pages_fast67%of 3
pin_user_pages_remote---of 3
pin_user_pages_unlocked---of 3
populate_vma_page_range37%of 11
put_dev_pagemap---of 15
sanity_check_pinned_pages12%of 85
try_get_folio20%of 20
try_grab_folio17%of 30
try_grab_page34%of 21
unpin_user_page---of 7
unpin_user_page_range_dirty_lock---of 34
unpin_user_pages---of 20
unpin_user_pages_dirty_lock---of 28
-----------
SUMMARY14%of 1022

char2uni100%of 1
uni2char---of 5
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__sysvec_irq_work---of 29
arch_irq_work_raise67%of 3
fred_sysvec_irq_work---of 4
-----------
SUMMARY67%of 3

-----------
SUMMARY---of 0

init_once100%of 1
minix_alloc_inode100%of 1
minix_bmap---of 1
minix_evict_inode40%of 10
minix_fill_super15%of 50
minix_free_in_core_inode---of 1
minix_get_block67%of 3
minix_get_tree100%of 1
minix_getattr---of 3
minix_iget15%of 21
minix_init_fs_context100%of 1
minix_prepare_chunk100%of 1
minix_put_super---of 16
minix_read_folio100%of 1
minix_reconfigure---of 11
minix_set_inode---of 5
minix_statfs---of 3
minix_truncate---of 7
minix_write_begin---of 3
minix_write_failed---of 8
minix_write_inode22%of 23
minix_writepages100%of 1
-----------
SUMMARY25%of 114

__blk_mq_alloc_disk---of 7
__blk_mq_alloc_driver_tag20%of 20
__blk_mq_alloc_requests28%of 59
__blk_mq_complete_request_remote---of 1
__blk_mq_end_request32%of 22
__blk_mq_free_request43%of 14
__blk_mq_requeue_request---of 24
__blk_mq_unfreeze_queue---of 7
__sbitmap_for_each_set---of 17
blk_account_io_completion38%of 8
blk_account_io_done31%of 26
blk_account_io_start35%of 23
blk_add_rq_to_plug79%of 14
blk_done_softirq---of 6
blk_dump_rq_flags---of 7
blk_end_sync_rq---of 1
blk_execute_rq---of 12
blk_execute_rq_nowait---of 7
blk_freeze_queue---of 4
blk_freeze_queue_start---of 4
blk_hctx_poll---of 13
blk_insert_cloned_request---of 31
blk_mq_alloc_and_init_hctx---of 28
blk_mq_alloc_disk_for_queue---of 4
blk_mq_alloc_map_and_rqs---of 33
blk_mq_alloc_queue---of 4
blk_mq_alloc_request---of 20
blk_mq_alloc_request_hctx---of 32
blk_mq_alloc_set_map_and_rqs---of 23
blk_mq_alloc_sq_tag_set---of 1
blk_mq_alloc_tag_set---of 40
blk_mq_cancel_work_sync---of 4
blk_mq_check_expired---of 6
blk_mq_check_inflight---of 9
blk_mq_complete_request67%of 3
blk_mq_complete_request_remote15%of 27
blk_mq_delay_kick_requeue_list---of 1
blk_mq_delay_run_hw_queue9%of 23
blk_mq_delay_run_hw_queues---of 13
blk_mq_dequeue_from_ctx---of 4
blk_mq_destroy_queue---of 8
blk_mq_dispatch_rq_list14%of 74
blk_mq_dispatch_wake---of 4
blk_mq_end_request67%of 3
blk_mq_end_request_batch---of 60
blk_mq_exit_hctx---of 20
blk_mq_exit_queue---of 10
blk_mq_flush_busy_ctxs---of 1
blk_mq_flush_plug_list34%of 78
blk_mq_free_map_and_rqs---of 3
blk_mq_free_plug_rqs---of 5
blk_mq_free_request47%of 15
blk_mq_free_rq_map---of 1
blk_mq_free_rqs---of 28
blk_mq_free_tag_set---of 18
blk_mq_freeze_queue---of 1
blk_mq_freeze_queue_wait---of 5
blk_mq_freeze_queue_wait_timeout---of 5
blk_mq_get_hctx_node---of 11
blk_mq_handle_expired---of 10
blk_mq_has_request---of 3
blk_mq_hctx_mark_pending50%of 12
blk_mq_hctx_notify_dead---of 20
blk_mq_hctx_notify_offline---of 15
blk_mq_hctx_notify_online---of 5
blk_mq_in_flight---of 1
blk_mq_in_flight_rw---of 1
blk_mq_inc_active_requests---of 4
blk_mq_init_allocated_queue---of 42
blk_mq_insert_request---of 19
blk_mq_kick_requeue_list100%of 1
blk_mq_map_swqueue---of 60
blk_mq_plug_issue_direct35%of 20
blk_mq_poll---of 1
blk_mq_put_rq_ref---of 7
blk_mq_queue_inflight---of 1
blk_mq_quiesce_queue---of 6
blk_mq_quiesce_queue_nowait---of 3
blk_mq_quiesce_tagset---of 9
blk_mq_realloc_hw_ctxs---of 15
blk_mq_release---of 11
blk_mq_request_issue_directly28%of 18
blk_mq_requeue_request---of 8
blk_mq_requeue_work---of 19
blk_mq_rq_cpu---of 1
blk_mq_rq_ctx_init63%of 8
blk_mq_rq_inflight---of 7
blk_mq_run_hw_queue25%of 52
blk_mq_run_hw_queues---of 12
blk_mq_run_work_fn---of 13
blk_mq_start_hw_queue---of 1
blk_mq_start_hw_queues---of 4
blk_mq_start_request30%of 30
blk_mq_start_stopped_hw_queue---of 3
blk_mq_start_stopped_hw_queues---of 8
blk_mq_stop_hw_queue---of 1
blk_mq_stop_hw_queues---of 4
blk_mq_submit_bio28%of 119
blk_mq_timeout_work---of 13
blk_mq_try_issue_directly---of 25
blk_mq_try_issue_list_directly---of 18
blk_mq_unfreeze_queue---of 5
blk_mq_unquiesce_queue---of 4
blk_mq_unquiesce_tagset---of 6
blk_mq_update_nr_hw_queues---of 75
blk_mq_update_nr_requests---of 25
blk_mq_update_queue_map---of 15
blk_mq_update_tag_set_shared---of 17
blk_mq_wait_quiesce_done---of 3
blk_mq_wake_waiters---of 7
blk_print_req_error---of 3
blk_rq_init40%of 5
blk_rq_is_poll---of 3
blk_rq_poll---of 5
blk_rq_prep_clone---of 19
blk_rq_unprep_clone---of 4
blk_softirq_cpu_dead---of 6
blk_steal_bios---of 3
blk_update_request23%of 49
dispatch_rq_from_ctx---of 14
flush_busy_ctx---of 8
percpu_ref_get_many31%of 13
percpu_ref_put_many29%of 14
percpu_ref_tryget---of 16
rcu_read_unlock---of 6
srcu_read_lock---of 1
srcu_read_unlock---of 3
trace_block_plug27%of 15
trace_block_rq_complete27%of 15
trace_block_rq_error---of 15
trace_block_rq_insert27%of 15
trace_block_unplug27%of 15
-----------
SUMMARY29%of 810

__bpf_trace_file_check_and_advance_wb_err---of 1
__bpf_trace_filemap_set_wb_err---of 1
__bpf_trace_mm_filemap_op_page_cache---of 1
__filemap_add_folio19%of 101
__filemap_fdatawait_range50%of 16
__filemap_fdatawrite_range100%of 1
__filemap_get_folio32%of 60
__filemap_remove_folio33%of 31
__filemap_set_wb_err---of 15
__folio_lock100%of 1
__folio_lock_killable---of 1
__folio_lock_or_retry---of 21
__generic_file_write_iter25%of 8
__ia32_sys_cachestat---of 1
__probestub_file_check_and_advance_wb_err---of 1
__probestub_filemap_set_wb_err---of 1
__probestub_mm_filemap_add_to_page_cache---of 1
__probestub_mm_filemap_delete_from_page_cache---of 1
__se_sys_cachestat---of 48
__traceiter_file_check_and_advance_wb_err---of 4
__traceiter_filemap_set_wb_err---of 4
__traceiter_mm_filemap_add_to_page_cache---of 4
__traceiter_mm_filemap_delete_from_page_cache---of 4
__x64_sys_cachestat---of 1
count_memcg_event_mm30%of 24
delete_from_page_cache_batch43%of 47
do_read_cache_folio16%of 38
do_read_cache_page30%of 10
do_sync_mmap_readahead12%of 17
file_check_and_advance_wb_err13%of 16
file_fdatawait_range100%of 1
file_write_and_wait_range50%of 6
filemap_add_folio25%of 29
filemap_alloc_folio_noprof10%of 20
filemap_check_errors60%of 5
filemap_fault21%of 83
filemap_fault_recheck_pte_none34%of 12
filemap_fdatawait_keep_errors67%of 3
filemap_fdatawait_range100%of 1
filemap_fdatawait_range_keep_errors---of 3
filemap_fdatawrite100%of 1
filemap_fdatawrite_range---of 1
filemap_fdatawrite_wbc60%of 5
filemap_flush100%of 1
filemap_free_folio25%of 20
filemap_get_entry25%of 20
filemap_get_folios100%of 1
filemap_get_folios_contig23%of 44
filemap_get_folios_tag29%of 35
filemap_get_pages8%of 103
filemap_get_read_batch20%of 50
filemap_invalidate_inode---of 6
filemap_invalidate_lock_two---of 5
filemap_invalidate_unlock_two---of 5
filemap_map_pages15%of 92
filemap_page_mkwrite34%of 27
filemap_range_has_page---of 15
filemap_range_has_writeback---of 40
filemap_read---of 43
filemap_read_folio27%of 34
filemap_release_folio28%of 22
filemap_remove_folio37%of 11
filemap_splice_read25%of 28
filemap_unaccount_folio20%of 65
filemap_write_and_wait_range50%of 6
find_get_entries30%of 37
find_lock_entries27%of 68
folio_add_wait_queue---of 9
folio_contains30%of 24
folio_end_private_2---of 17
folio_end_read22%of 23
folio_end_writeback24%of 30
folio_lock34%of 9
folio_put---of 4
folio_size30%of 10
folio_unlock32%of 16
folio_wait_bit100%of 1
folio_wait_bit_common33%of 58
folio_wait_bit_killable---of 1
folio_wait_private_2---of 10
folio_wait_private_2_killable---of 10
folio_wake_bit40%of 10
generic_file_direct_write---of 8
generic_file_mmap---of 4
generic_file_read_iter---of 13
generic_file_readonly_mmap---of 5
generic_file_write_iter34%of 9
generic_perform_write47%of 15
inode_to_wb---of 6
kiocb_invalidate_pages30%of 10
kiocb_invalidate_post_direct_write40%of 5
kiocb_write_and_wait---of 9
mapping_read_folio_gfp---of 1
mapping_seek_hole_data---of 63
maybe_unlock_mmap_for_io---of 6
migration_entry_wait_on_locked---of 53
next_uptodate_folio16%of 52
page_cache_next_miss---of 5
page_cache_prev_miss---of 16
perf_trace_file_check_and_advance_wb_err---of 8
perf_trace_filemap_set_wb_err---of 8
perf_trace_mm_filemap_op_page_cache---of 15
read_cache_folio---of 1
read_cache_page100%of 1
read_cache_page_gfp---of 1
release_fault_lock16%of 13
replace_page_cache_folio---of 70
splice_folio_into_pipe34%of 6
trace_event_raw_event_file_check_and_advance_wb_err---of 7
trace_event_raw_event_filemap_set_wb_err---of 7
trace_event_raw_event_mm_filemap_op_page_cache---of 14
trace_mm_filemap_delete_from_page_cache27%of 15
trace_raw_output_file_check_and_advance_wb_err---of 3
trace_raw_output_filemap_set_wb_err---of 3
trace_raw_output_mm_filemap_op_page_cache---of 3
wake_page_function40%of 10
xas_next34%of 12
xas_next_entry42%of 17
xas_reload21%of 24
-----------
SUMMARY26%of 1570

ntfs_cmp_names---of 17
ntfs_cmp_names_cpu42%of 17
ntfs_names_hash---of 7
-----------
SUMMARY42%of 17

__bpf_trace_ma_op---of 1
__bpf_trace_ma_read---of 1
__bpf_trace_ma_write---of 1
__mt_destroy25%of 12
__mt_dup---of 4
__probestub_ma_op---of 1
__probestub_ma_read---of 1
__probestub_ma_write---of 1
__traceiter_ma_op---of 4
__traceiter_ma_read---of 4
__traceiter_ma_write---of 4
mab_calc_split---of 30
mab_mas_cp---of 27
mab_no_null_split---of 9
mas_adopt_children---of 18
mas_alloc_cyclic37%of 11
mas_alloc_nodes21%of 29
mas_ascend14%of 30
mas_bulk_rebalance---of 7
mas_commit_b_node---of 53
mas_descend27%of 23
mas_destroy4%of 86
mas_dump---of 12
mas_dup_build---of 48
mas_dup_free---of 37
mas_empty_area25%of 79
mas_empty_area_rev34%of 109
mas_erase---of 16
mas_expected_entries---of 7
mas_find32%of 48
mas_find_child---of 20
mas_find_range---of 48
mas_find_range_rev---of 3
mas_find_rev---of 3
mas_find_rev_setup---of 35
mas_get_slot34%of 15
mas_insert28%of 11
mas_leaf_max_gap48%of 25
mas_leaf_set_meta---of 5
mas_mab_cp---of 37
mas_max_gap50%of 4
mas_new_root---of 21
mas_next67%of 3
mas_next_node21%of 62
mas_next_range67%of 3
mas_next_setup10%of 32
mas_next_sibling---of 16
mas_next_slot34%of 48
mas_nomem25%of 8
mas_pause---of 1
mas_pop_node32%of 16
mas_preallocate29%of 53
mas_prev67%of 3
mas_prev_node19%of 64
mas_prev_range67%of 3
mas_prev_setup4%of 50
mas_prev_slot28%of 47
mas_push_data---of 43
mas_replace_node20%of 15
mas_root_expand34%of 18
mas_set_height67%of 3
mas_set_parent---of 5
mas_skip_node---of 27
mas_spanning_rebalance---of 93
mas_split_final_node---of 5
mas_start67%of 15
mas_store---of 13
mas_store_b_node---of 45
mas_store_gfp31%of 13
mas_store_prealloc25%of 12
mas_update_gap15%of 28
mas_walk23%of 9
mas_wmb_replace---of 99
mas_wr_dump---of 1
mas_wr_end_piv34%of 18
mas_wr_modify31%of 88
mas_wr_spanning_store---of 46
mas_wr_store_entry34%of 9
mas_wr_walk45%of 18
mas_wr_walk_descend64%of 19
mas_wr_walk_index---of 11
mast_ascend---of 25
mast_fill_bnode---of 27
mast_spanning_rebalance---of 45
mast_split_data---of 21
mt_cache_shrink---of 1
mt_destroy_walk---of 39
mt_dump---of 13
mt_dump_entry---of 10
mt_dump_node---of 186
mt_find32%of 32
mt_find_after---of 3
mt_free_rcu---of 1
mt_free_walk---of 25
mt_next---of 13
mt_prev---of 13
mt_validate40%of 244
mte_dead_leaves---of 15
mte_dead_walk---of 24
mte_destroy_descend---of 25
mtree_alloc_cyclic50%of 4
mtree_alloc_range---of 7
mtree_alloc_rrange---of 7
mtree_destroy---of 1
mtree_dup---of 4
mtree_erase---of 1
mtree_insert---of 1
mtree_insert_range---of 6
mtree_load32%of 41
mtree_range_walk54%of 30
mtree_store---of 1
mtree_store_range---of 6
perf_trace_ma_op---of 8
perf_trace_ma_read---of 8
perf_trace_ma_write---of 8
trace_event_raw_event_ma_op---of 7
trace_event_raw_event_ma_read---of 7
trace_event_raw_event_ma_write---of 7
trace_ma_op---of 15
trace_ma_read27%of 15
trace_ma_write27%of 15
trace_raw_output_ma_op---of 3
trace_raw_output_ma_read---of 3
trace_raw_output_ma_write---of 3
-----------
SUMMARY30%of 1520

-----------
SUMMARY---of 0

fixup_vdso_exception25%of 8
-----------
SUMMARY25%of 8

-----------
SUMMARY---of 0

bad_file_open---of 1
bad_inode_atomic_open---of 1
bad_inode_create---of 1
bad_inode_fiemap---of 1
bad_inode_get_acl---of 1
bad_inode_get_link---of 1
bad_inode_getattr---of 1
bad_inode_link---of 1
bad_inode_listxattr---of 1
bad_inode_lookup---of 1
bad_inode_mkdir---of 1
bad_inode_mknod---of 1
bad_inode_permission---of 1
bad_inode_readlink---of 1
bad_inode_rename2---of 1
bad_inode_rmdir---of 1
bad_inode_set_acl---of 1
bad_inode_setattr---of 1
bad_inode_symlink---of 1
bad_inode_tmpfile---of 1
bad_inode_unlink---of 1
bad_inode_update_time---of 1
iget_failed---of 3
is_bad_inode100%of 1
make_bad_inode---of 3
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

fsnotify_change59%of 12
inode_newsize_ok---of 7
may_setattr---of 9
notify_change48%of 48
setattr_copy100%of 13
setattr_prepare40%of 38
setattr_should_drop_sgid---of 4
setattr_should_drop_suidgid45%of 9
try_break_deleg58%of 7
-----------
SUMMARY52%of 127

__ia32_compat_sys_lseek---of 9
__ia32_compat_sys_preadv---of 9
__ia32_compat_sys_preadv2---of 1
__ia32_compat_sys_preadv64---of 8
__ia32_compat_sys_preadv64v2---of 1
__ia32_compat_sys_pwritev---of 9
__ia32_compat_sys_pwritev2---of 1
__ia32_compat_sys_pwritev64---of 8
__ia32_compat_sys_pwritev64v2---of 1
__ia32_compat_sys_sendfile---of 1
__ia32_compat_sys_sendfile64---of 1
__ia32_sys_copy_file_range---of 1
__ia32_sys_llseek---of 1
__ia32_sys_lseek---of 9
__ia32_sys_pread64---of 6
__ia32_sys_preadv---of 8
__ia32_sys_preadv2---of 1
__ia32_sys_pwrite64---of 6
__ia32_sys_pwritev---of 8
__ia32_sys_pwritev2---of 1
__ia32_sys_read---of 1
__ia32_sys_readv---of 1
__ia32_sys_sendfile---of 1
__ia32_sys_sendfile64---of 1
__ia32_sys_write---of 1
__ia32_sys_writev---of 1
__kernel_read---of 25
__kernel_write---of 1
__kernel_write_iter---of 24
__se_compat_sys_preadv2---of 10
__se_compat_sys_preadv64v2---of 10
__se_compat_sys_pwritev2---of 10
__se_compat_sys_pwritev64v2---of 10
__se_compat_sys_sendfile---of 4
__se_compat_sys_sendfile64---of 4
__se_sys_copy_file_range37%of 19
__se_sys_llseek---of 10
__se_sys_preadv240%of 10
__se_sys_pwritev250%of 10
__se_sys_sendfile---of 4
__se_sys_sendfile6475%of 4
__x64_compat_sys_lseek---of 9
__x64_compat_sys_preadv---of 9
__x64_compat_sys_preadv2---of 1
__x64_compat_sys_preadv64---of 9
__x64_compat_sys_preadv64v2---of 1
__x64_compat_sys_pwritev---of 9
__x64_compat_sys_pwritev2---of 1
__x64_compat_sys_pwritev64---of 9
__x64_compat_sys_pwritev64v2---of 1
__x64_compat_sys_sendfile---of 1
__x64_compat_sys_sendfile64---of 1
__x64_sys_copy_file_range100%of 1
__x64_sys_llseek---of 1
__x64_sys_lseek56%of 9
__x64_sys_pread6443%of 7
__x64_sys_preadv45%of 9
__x64_sys_preadv2100%of 1
__x64_sys_pwrite6443%of 7
__x64_sys_pwritev45%of 9
__x64_sys_pwritev2100%of 1
__x64_sys_read100%of 1
__x64_sys_readv100%of 1
__x64_sys_sendfile---of 1
__x64_sys_sendfile64100%of 1
__x64_sys_write100%of 1
__x64_sys_writev100%of 1
default_llseek---of 14
do_iter_readv_writev40%of 23
do_readv59%of 12
do_sendfile29%of 38
do_writev50%of 12
fixed_size_llseek---of 3
generic_file_llseek---of 1
generic_file_llseek_size23%of 18
generic_file_rw_checks---of 7
generic_write_check_limits---of 6
generic_write_checks50%of 4
generic_write_checks_count47%of 13
kernel_read---of 3
kernel_write---of 6
ksys_pread64---of 7
ksys_pwrite64---of 7
ksys_read60%of 10
ksys_write70%of 10
no_seek_end_llseek---of 3
no_seek_end_llseek_size---of 3
noop_llseek---of 1
rw_verify_area36%of 25
sb_end_write40%of 10
sb_start_write40%of 10
vfs_copy_file_range23%of 53
vfs_iocb_iter_read---of 16
vfs_iocb_iter_write---of 16
vfs_iter_read---of 16
vfs_iter_write---of 19
vfs_llseek67%of 3
vfs_read29%of 28
vfs_readv36%of 28
vfs_setpos---of 7
vfs_write47%of 32
vfs_writev36%of 31
warn_unsupported---of 3
-----------
SUMMARY40%of 452

__pm_relax50%of 4
__pm_stay_awake---of 3
device_set_wakeup_capable---of 7
device_set_wakeup_enable---of 5
device_wakeup_arm_wake_irqs---of 6
device_wakeup_attach_irq---of 4
device_wakeup_detach_irq---of 3
device_wakeup_disable---of 4
device_wakeup_disarm_wake_irqs---of 6
device_wakeup_enable---of 13
pm_get_wakeup_count---of 7
pm_print_active_wakeup_sources---of 16
pm_relax---of 5
pm_save_wakeup_count---of 3
pm_stay_awake---of 4
pm_system_cancel_wakeup---of 5
pm_system_irq_wakeup---of 9
pm_system_wakeup---of 1
pm_wakeup_clear---of 4
pm_wakeup_dev_event---of 3
pm_wakeup_irq---of 1
pm_wakeup_pending---of 6
pm_wakeup_timer_fn---of 5
pm_wakeup_ws_event---of 5
print_wakeup_source_stats---of 4
wakeup_source_add---of 4
wakeup_source_create---of 5
wakeup_source_deactivate---of 21
wakeup_source_destroy---of 6
wakeup_source_register---of 9
wakeup_source_remove---of 4
wakeup_source_report_event---of 24
wakeup_source_unregister---of 6
wakeup_sources_read_lock---of 1
wakeup_sources_read_unlock---of 3
wakeup_sources_stats_open---of 1
wakeup_sources_stats_seq_next---of 3
wakeup_sources_stats_seq_show---of 1
wakeup_sources_stats_seq_start---of 7
wakeup_sources_stats_seq_stop---of 3
wakeup_sources_walk_next---of 3
wakeup_sources_walk_start---of 1
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

btrfs_getxattr29%of 7
btrfs_initxattrs---of 8
btrfs_listxattr---of 17
btrfs_setxattr---of 30
btrfs_setxattr_trans---of 12
btrfs_xattr_handler_get---of 1
btrfs_xattr_handler_get_security50%of 6
btrfs_xattr_handler_set---of 3
btrfs_xattr_handler_set_prop---of 9
btrfs_xattr_handler_set_security---of 4
btrfs_xattr_security_init---of 1
-----------
SUMMARY39%of 13

-----------
SUMMARY---of 0

__ia32_compat_sys_timer_create---of 4
__ia32_sys_clock_adjtime---of 10
__ia32_sys_clock_adjtime32---of 10
__ia32_sys_clock_getres---of 8
__ia32_sys_clock_getres_time32---of 8
__ia32_sys_clock_gettime---of 7
__ia32_sys_clock_gettime32---of 7
__ia32_sys_clock_nanosleep---of 1
__ia32_sys_clock_nanosleep_time32---of 1
__ia32_sys_clock_settime---of 8
__ia32_sys_clock_settime32---of 8
__ia32_sys_timer_create---of 4
__ia32_sys_timer_delete---of 1
__ia32_sys_timer_getoverrun---of 3
__ia32_sys_timer_gettime---of 1
__ia32_sys_timer_gettime32---of 1
__ia32_sys_timer_settime---of 5
__ia32_sys_timer_settime32---of 5
__lock_timer---of 23
__se_sys_clock_nanosleep---of 10
__se_sys_clock_nanosleep_time32---of 10
__se_sys_timer_delete---of 11
__se_sys_timer_gettime---of 5
__se_sys_timer_gettime32---of 5
__x64_compat_sys_timer_create---of 4
__x64_sys_clock_adjtime---of 10
__x64_sys_clock_adjtime32---of 10
__x64_sys_clock_getres---of 8
__x64_sys_clock_getres_time32---of 8
__x64_sys_clock_gettime43%of 7
__x64_sys_clock_gettime32---of 7
__x64_sys_clock_nanosleep---of 1
__x64_sys_clock_nanosleep_time32---of 1
__x64_sys_clock_settime---of 8
__x64_sys_clock_settime32---of 8
__x64_sys_timer_create---of 4
__x64_sys_timer_delete---of 1
__x64_sys_timer_getoverrun---of 3
__x64_sys_timer_gettime---of 1
__x64_sys_timer_gettime32---of 1
__x64_sys_timer_settime---of 5
__x64_sys_timer_settime32---of 5
common_hrtimer_arm---of 7
common_hrtimer_forward---of 1
common_hrtimer_rearm---of 1
common_hrtimer_remaining---of 1
common_hrtimer_try_to_cancel---of 1
common_nsleep---of 1
common_nsleep_timens---of 4
common_timer_create---of 1
common_timer_del---of 3
common_timer_get---of 8
common_timer_set---of 10
common_timer_wait_running---of 1
do_clock_adjtime---of 7
do_timer_create---of 53
do_timer_settime---of 13
exit_itimers---of 14
k_itimer_rcu_free---of 1
posix_clock_realtime_adj---of 1
posix_clock_realtime_set---of 1
posix_get_boottime_ktime---of 1
posix_get_boottime_timespec---of 1
posix_get_coarse_res---of 1
posix_get_hrtimer_res---of 1
posix_get_monotonic_coarse---of 1
posix_get_monotonic_ktime---of 1
posix_get_monotonic_raw---of 1
posix_get_monotonic_timespec---of 1
posix_get_realtime_coarse---of 1
posix_get_realtime_ktime---of 1
posix_get_realtime_timespec100%of 1
posix_get_tai_ktime---of 1
posix_get_tai_timespec---of 1
posix_timer_event---of 1
posix_timer_fn---of 6
posixtimer_rearm---of 5
timer_wait_running---of 13
-----------
SUMMARY50%of 8

-----------
SUMMARY---of 0

__shmem_file_setup28%of 11
cond_resched_rcu---of 11
folio_flags---of 10
folio_lock34%of 9
folio_put50%of 4
folio_size30%of 10
folio_swap---of 19
put_swap_device---of 14
shmem_add_to_page_cache23%of 94
shmem_alloc_and_add_folio18%of 50
shmem_alloc_inode100%of 1
shmem_charge---of 4
shmem_create100%of 1
shmem_destroy_inode60%of 5
shmem_enabled_show---of 1
shmem_enabled_store---of 14
shmem_encode_fh---of 5
shmem_error_remove_folio---of 1
shmem_evict_inode28%of 22
shmem_falloc_wait---of 23
shmem_fallocate---of 47
shmem_fault---of 16
shmem_fh_to_dentry---of 5
shmem_file_llseek---of 5
shmem_file_open100%of 1
shmem_file_read_iter---of 32
shmem_file_setup100%of 1
shmem_file_setup_with_mnt---of 1
shmem_file_splice_read27%of 26
shmem_file_write_iter40%of 5
shmem_fileattr_get100%of 1
shmem_fileattr_set34%of 9
shmem_fill_super29%of 25
shmem_free_fc50%of 4
shmem_free_in_core_inode---of 3
shmem_get_dquots100%of 1
shmem_get_folio---of 1
shmem_get_folio_gfp15%of 121
shmem_get_inode47%of 32
shmem_get_link14%of 15
shmem_get_offset_ctx---of 1
shmem_get_parent---of 1
shmem_get_partial_folio29%of 7
shmem_get_policy---of 1
shmem_get_tree100%of 1
shmem_get_unmapped_area---of 23
shmem_getattr---of 19
shmem_init_fs_context67%of 3
shmem_init_inode100%of 1
shmem_initxattrs20%of 15
shmem_inode_acct_blocks40%of 15
shmem_inode_unacct_blocks58%of 7
shmem_is_huge---of 15
shmem_kernel_file_setup---of 1
shmem_link---of 11
shmem_listxattr---of 1
shmem_lock---of 6
shmem_mapping100%of 1
shmem_match---of 3
shmem_mfill_atomic_pte---of 62
shmem_mkdir---of 3
shmem_mknod34%of 9
shmem_mmap---of 16
shmem_parse_one---of 46
shmem_parse_options16%of 13
shmem_partial_swap_usage---of 19
shmem_put_link50%of 4
shmem_put_super---of 3
shmem_read_folio_gfp---of 3
shmem_read_mapping_page_gfp---of 12
shmem_recalc_inode---of 3
shmem_reconfigure---of 50
shmem_rename2---of 13
shmem_replace_folio---of 50
shmem_rmdir---of 3
shmem_set_policy---of 1
shmem_setattr23%of 45
shmem_should_replace_folio---of 3
shmem_show_options---of 36
shmem_statfs---of 5
shmem_swap_usage---of 5
shmem_swapin_folio---of 74
shmem_symlink23%of 18
shmem_tmpfile---of 8
shmem_truncate_range---of 1
shmem_uncharge---of 3
shmem_undo_range13%of 116
shmem_unlink---of 5
shmem_unlock_mapping---of 7
shmem_unuse---of 48
shmem_unused_huge_count---of 1
shmem_unused_huge_scan---of 3
shmem_unused_huge_shrink---of 56
shmem_write_begin27%of 15
shmem_write_end26%of 27
shmem_writepage---of 100
shmem_xattr_handler_get100%of 1
shmem_xattr_handler_set---of 12
shmem_zero_setup---of 4
synchronous_wake_function---of 3
vma_is_anon_shmem---of 1
vma_is_shmem---of 1
xas_next_entry---of 17
zero_pipe_buf_get---of 1
zero_pipe_buf_release100%of 1
zero_pipe_buf_try_steal---of 1
zero_user_segments---of 17
-----------
SUMMARY25%of 747

ext4_init_security100%of 1
ext4_initxattrs40%of 5
ext4_xattr_security_get100%of 1
ext4_xattr_security_set---of 1
-----------
SUMMARY58%of 7

-----------
SUMMARY---of 0

__bpf_trace_vector_activate---of 1
__bpf_trace_vector_alloc---of 1
__bpf_trace_vector_alloc_managed---of 1
__bpf_trace_vector_config---of 1
__bpf_trace_vector_free_moved---of 1
__bpf_trace_vector_mod---of 1
__bpf_trace_vector_reserve---of 1
__bpf_trace_vector_setup---of 1
__bpf_trace_vector_teardown---of 1
__bpf_trace_x86_irq_vector---of 1
__common_interrupt---of 13
__probestub_call_function_entry---of 1
__probestub_call_function_exit---of 1
__probestub_call_function_single_entry---of 1
__probestub_call_function_single_exit---of 1
__probestub_deferred_error_apic_entry---of 1
__probestub_deferred_error_apic_exit---of 1
__probestub_error_apic_entry---of 1
__probestub_error_apic_exit---of 1
__probestub_irq_work_entry---of 1
__probestub_irq_work_exit---of 1
__probestub_local_timer_entry---of 1
__probestub_local_timer_exit---of 1
__probestub_reschedule_entry---of 1
__probestub_reschedule_exit---of 1
__probestub_spurious_apic_entry---of 1
__probestub_spurious_apic_exit---of 1
__probestub_thermal_apic_entry---of 1
__probestub_thermal_apic_exit---of 1
__probestub_threshold_apic_entry---of 1
__probestub_threshold_apic_exit---of 1
__probestub_vector_activate---of 1
__probestub_vector_alloc---of 1
__probestub_vector_alloc_managed---of 1
__probestub_vector_clear---of 1
__probestub_vector_config---of 1
__probestub_vector_deactivate---of 1
__probestub_vector_free_moved---of 1
__probestub_vector_reserve---of 1
__probestub_vector_reserve_managed---of 1
__probestub_vector_setup---of 1
__probestub_vector_teardown---of 1
__probestub_vector_update---of 1
__probestub_x86_platform_ipi_entry---of 1
__probestub_x86_platform_ipi_exit---of 1
__sysvec_kvm_posted_intr_wakeup_ipi---of 1
__sysvec_thermal---of 31
__sysvec_x86_platform_ipi---of 31
__traceiter_call_function_entry---of 4
__traceiter_call_function_exit---of 4
__traceiter_call_function_single_entry---of 4
__traceiter_call_function_single_exit---of 4
__traceiter_deferred_error_apic_entry---of 4
__traceiter_deferred_error_apic_exit---of 4
__traceiter_error_apic_entry---of 4
__traceiter_error_apic_exit---of 4
__traceiter_irq_work_entry---of 4
__traceiter_irq_work_exit---of 4
__traceiter_local_timer_entry---of 4
__traceiter_local_timer_exit---of 4
__traceiter_reschedule_entry---of 4
__traceiter_reschedule_exit---of 4
__traceiter_spurious_apic_entry---of 4
__traceiter_spurious_apic_exit---of 4
__traceiter_thermal_apic_entry---of 4
__traceiter_thermal_apic_exit---of 4
__traceiter_threshold_apic_entry---of 4
__traceiter_threshold_apic_exit---of 4
__traceiter_vector_activate---of 4
__traceiter_vector_alloc---of 4
__traceiter_vector_alloc_managed---of 4
__traceiter_vector_clear---of 4
__traceiter_vector_config---of 4
__traceiter_vector_deactivate---of 4
__traceiter_vector_free_moved---of 4
__traceiter_vector_reserve---of 4
__traceiter_vector_reserve_managed---of 4
__traceiter_vector_setup---of 4
__traceiter_vector_teardown---of 4
__traceiter_vector_update---of 4
__traceiter_x86_platform_ipi_entry---of 4
__traceiter_x86_platform_ipi_exit---of 4
ack_bad_irq---of 3
arch_irq_stat---of 1
arch_irq_stat_cpu---of 5
arch_show_interrupts---of 79
dummy_handler---of 1
fixup_irqs---of 9
fred_sysvec_kvm_posted_intr_ipi---of 3
fred_sysvec_kvm_posted_intr_nested_ipi---of 3
fred_sysvec_kvm_posted_intr_wakeup_ipi---of 4
fred_sysvec_thermal---of 4
fred_sysvec_x86_platform_ipi---of 4
kvm_set_cpu_l1tf_flush_l1d100%of 1
kvm_set_posted_intr_wakeup_handler---of 3
perf_perm_irq_work_exit---of 1
perf_trace_vector_activate---of 8
perf_trace_vector_alloc---of 8
perf_trace_vector_alloc_managed---of 8
perf_trace_vector_config---of 8
perf_trace_vector_free_moved---of 8
perf_trace_vector_mod---of 8
perf_trace_vector_reserve---of 8
perf_trace_vector_setup---of 8
perf_trace_vector_teardown---of 8
perf_trace_x86_irq_vector---of 8
trace_event_raw_event_vector_activate---of 7
trace_event_raw_event_vector_alloc---of 7
trace_event_raw_event_vector_alloc_managed---of 7
trace_event_raw_event_vector_config---of 7
trace_event_raw_event_vector_free_moved---of 7
trace_event_raw_event_vector_mod---of 7
trace_event_raw_event_vector_reserve---of 7
trace_event_raw_event_vector_setup---of 7
trace_event_raw_event_vector_teardown---of 7
trace_event_raw_event_x86_irq_vector---of 7
trace_raw_output_vector_activate---of 3
trace_raw_output_vector_alloc---of 3
trace_raw_output_vector_alloc_managed---of 3
trace_raw_output_vector_config---of 3
trace_raw_output_vector_free_moved---of 3
trace_raw_output_vector_mod---of 3
trace_raw_output_vector_reserve---of 3
trace_raw_output_vector_setup---of 3
trace_raw_output_vector_teardown---of 3
trace_raw_output_x86_irq_vector---of 3
-----------
SUMMARY100%of 1

ovl_acceptable---of 4
ovl_check_fb_len---of 7
ovl_check_origin---of 5
ovl_check_origin_fh---of 17
ovl_check_redirect---of 6
ovl_decode_real_fh---of 10
ovl_get_fh---of 25
ovl_get_index_fh---of 6
ovl_get_index_name---of 4
ovl_get_index_name_fh---of 3
ovl_index_upper---of 8
ovl_lookup---of 130
ovl_lookup_index---of 23
ovl_lookup_layer---of 9
ovl_lookup_single---of 33
ovl_lower_positive---of 14
ovl_path_next---of 18
ovl_verify_fh---of 6
ovl_verify_index---of 25
ovl_verify_lowerdata12%of 36
ovl_verify_origin_xattr---of 7
ovl_verify_set_fh---of 9
-----------
SUMMARY12%of 36

__ima_inode_hash---of 16
ima_bprm_check---of 3
ima_file_check100%of 1
ima_file_free31%of 13
ima_file_hash---of 3
ima_file_mmap58%of 7
ima_file_mprotect---of 7
ima_get_current_hash_algo---of 1
ima_inode_hash---of 3
ima_kernel_module_request---of 1
ima_kexec_cmdline---of 5
ima_load_data---of 9
ima_measure_critical_data---of 3
ima_post_create_tmpfile---of 6
ima_post_load_data---of 6
ima_post_path_mknod34%of 6
ima_post_read_file---of 5
ima_read_file---of 3
integrity_inode_attrs_changed---of 4
mmap_violation_check---of 5
process_buffer_measurement---of 26
process_measurement10%of 101
-----------
SUMMARY17%of 128

___perf_sw_event---of 34
__ia32_sys_perf_event_open---of 1
__perf_cgroup_move---of 3
__perf_event_account_interrupt---of 8
__perf_event_disable---of 24
__perf_event_enable---of 27
__perf_event_exit_context---of 6
__perf_event_header__init_id---of 14
__perf_event_output_stop---of 11
__perf_event_overflow---of 38
__perf_event_period---of 14
__perf_event_read---of 36
__perf_event_read_value---of 4
__perf_event_stop---of 5
__perf_event_task_sched_in---of 84
__perf_event_task_sched_out---of 139
__perf_install_in_context---of 20
__perf_pmu_install_event---of 6
__perf_pmu_output_stop---of 15
__perf_pmu_remove---of 36
__perf_read_group_add---of 27
__perf_remove_from_context---of 53
__perf_sw_event---of 8
__pmu_ctx_sched_out---of 24
__se_sys_perf_event_open---of 214
__update_context_time---of 6
__x64_sys_perf_event_open---of 1
_free_event---of 59
_perf_event_disable---of 3
_perf_event_enable---of 6
_perf_event_refresh---of 8
_perf_event_reset---of 1
account_event---of 38
add_event_to_ctx---of 40
alloc_perf_context---of 6
calc_timer_values---of 10
cpu_clock_event_add---of 5
cpu_clock_event_del---of 3
cpu_clock_event_init---of 7
cpu_clock_event_read---of 1
cpu_clock_event_start---of 4
cpu_clock_event_stop---of 3
cpu_clock_event_update---of 4
ctx_event_to_rotate---of 57
ctx_groups_sched_in---of 7
ctx_resched---of 41
ctx_sched_in---of 17
ctx_sched_out---of 24
event_function---of 18
event_function_call---of 14
event_sched_in---of 37
event_sched_out---of 38
exclusive_event_destroy---of 4
find_get_context---of 21
find_get_pmu_context---of 22
free_ctx---of 1
free_epc_rcu---of 1
free_event---of 3
free_event_rcu---of 3
get_uid---of 4
group_sched_out---of 15
inherit_event---of 14
inherit_task_group---of 37
ktime_get_boottime_ns---of 1
ktime_get_clocktai_ns---of 1
ktime_get_real_ns---of 1
list_del_event---of 21
nr_addr_filters_show---of 1
perf_addr_filters_splice---of 13
perf_adjust_freq_unthr_context---of 18
perf_adjust_freq_unthr_events---of 24
perf_adjust_period---of 23
perf_allow_kernel---of 5
perf_bp_event---of 11
perf_callchain---of 5
perf_cgroup_attach---of 7
perf_cgroup_css_alloc---of 4
perf_cgroup_css_free---of 1
perf_cgroup_css_online---of 13
perf_cgroup_from_task---of 11
perf_cgroup_set_timestamp---of 8
perf_cgroup_switch---of 31
perf_compat_ioctl---of 8
perf_copy_attr---of 34
perf_cpu_task_ctx---of 7
perf_cpu_time_max_percent_handler---of 5
perf_detach_cgroup---of 15
perf_duration_warn---of 3
perf_event__output_id_sample---of 14
perf_event_account_interrupt---of 1
perf_event_addr_filters_apply---of 28
perf_event_addr_filters_sync---of 4
perf_event_alloc---of 117
perf_event_attrs---of 1
perf_event_aux_event---of 5
perf_event_bpf_event---of 8
perf_event_bpf_output---of 6
perf_event_cgroup_output---of 9
perf_event_comm---of 3
perf_event_comm_output---of 13
perf_event_create_kernel_counter---of 23
perf_event_ctx_lock_nested---of 24
perf_event_delayed_put---of 3
perf_event_disable---of 3
perf_event_disable_inatomic---of 1
perf_event_disable_local---of 20
perf_event_enable---of 6
perf_event_exec---of 56
perf_event_exit_cpu---of 5
perf_event_exit_event---of 8
perf_event_exit_task---of 33
perf_event_fork---of 5
perf_event_free_bpf_prog---of 4
perf_event_free_task---of 17
perf_event_get---of 4
perf_event_groups_insert---of 20
perf_event_header__init_id---of 3
perf_event_idx_default---of 1
perf_event_init_cpu---of 16
perf_event_init_task---of 20
perf_event_init_userpage---of 18
perf_event_itrace_started---of 1
perf_event_ksymbol---of 9
perf_event_ksymbol_output---of 9
perf_event_max_sample_rate_handler---of 6
perf_event_mmap4%of 62
perf_event_mmap_output---of 31
perf_event_modify_breakpoint---of 9
perf_event_mux_interval_ms_show---of 1
perf_event_mux_interval_ms_store---of 8
perf_event_namespaces---of 16
perf_event_namespaces_output---of 9
perf_event_nop_int---of 1
perf_event_output---of 15
perf_event_output_backward---of 15
perf_event_output_forward---of 15
perf_event_overflow---of 1
perf_event_pause---of 7
perf_event_period---of 9
perf_event_read---of 31
perf_event_read_local---of 42
perf_event_read_value---of 1
perf_event_refresh---of 1
perf_event_release_kernel---of 29
perf_event_set_bpf_prog---of 22
perf_event_set_output---of 22
perf_event_set_state---of 10
perf_event_switch_output---of 11
perf_event_sysfs_show---of 3
perf_event_task_disable---of 13
perf_event_task_enable---of 19
perf_event_task_output---of 17
perf_event_task_tick---of 24
perf_event_text_poke---of 3
perf_event_text_poke_output---of 16
perf_event_update_sibling_time---of 16
perf_event_update_time---of 7
perf_event_update_userpage---of 24
perf_event_wakeup---of 22
perf_fasync---of 1
perf_get_event---of 3
perf_get_page_size---of 21
perf_group_detach---of 43
perf_install_in_context---of 32
perf_ioctl---of 148
perf_iterate_ctx---of 17
perf_iterate_sb---of 39
perf_lock_task_context---of 52
perf_log_lost_samples---of 5
perf_log_throttle---of 7
perf_mmap---of 56
perf_mmap_close---of 61
perf_mmap_fault---of 30
perf_mmap_open---of 5
perf_mux_hrtimer_handler---of 44
perf_mux_hrtimer_restart_ipi---of 3
perf_output_read---of 56
perf_output_sample---of 104
perf_pending_irq---of 18
perf_pending_task---of 16
perf_pmu_cancel_txn---of 6
perf_pmu_commit_txn---of 6
perf_pmu_disable---of 5
perf_pmu_enable---of 5
perf_pmu_migrate_context---of 17
perf_pmu_nop_int---of 1
perf_pmu_nop_txn---of 1
perf_pmu_nop_void---of 1
perf_pmu_register---of 30
perf_pmu_resched---of 5
perf_pmu_sched_task---of 25
perf_pmu_start_txn---of 6
perf_pmu_unregister---of 8
perf_poll---of 9
perf_prepare_header---of 3
perf_prepare_sample---of 83
perf_read---of 22
perf_reboot---of 5
perf_register_guest_info_callbacks---of 4
perf_release---of 1
perf_remove_from_context---of 7
perf_remove_from_owner---of 25
perf_report_aux_output_id---of 5
perf_sample_event_took---of 7
perf_sched_cb_dec---of 6
perf_sched_cb_inc---of 8
perf_sched_delayed---of 3
perf_swevent_add---of 16
perf_swevent_del---of 3
perf_swevent_event---of 20
perf_swevent_get_recursion_context---of 5
perf_swevent_hrtimer---of 13
perf_swevent_init---of 9
perf_swevent_put_recursion_context---of 3
perf_swevent_read---of 1
perf_swevent_set_period---of 5
perf_swevent_start---of 1
perf_swevent_stop---of 1
perf_tp_event---of 124
perf_tp_event_init---of 5
perf_trace_run_bpf_submit---of 7
perf_try_init_event---of 20
perf_unpin_context---of 1
perf_unregister_guest_info_callbacks---of 3
perf_uprobe_event_init---of 8
pmu_dev_alloc---of 7
pmu_dev_is_visible---of 4
pmu_dev_release---of 1
put_ctx---of 10
put_pmu_ctx---of 12
rb_free_rcu---of 1
ref_ctr_offset_show---of 1
remote_function---of 5
retprobe_show---of 1
ring_buffer_attach---of 22
ring_buffer_get---of 23
ring_buffer_put---of 5
sw_perf_event_destroy---of 9
swevent_hlist_get---of 17
swevent_hlist_put_cpu---of 9
task_clock_event_add---of 5
task_clock_event_del---of 3
task_clock_event_init---of 7
task_clock_event_read---of 1
task_clock_event_start---of 4
task_clock_event_stop---of 3
task_clock_event_update---of 4
tp_perf_event_destroy---of 1
type_show---of 1
unclone_ctx---of 6
update_cgrp_time_from_cpuctx---of 8
update_context_time---of 4
visit_groups_merge---of 169
-----------
SUMMARY4%of 62

-----------
SUMMARY---of 0

__btrfs_balance---of 93
__btrfs_free_extra_devids---of 21
add_missing_dev---of 7
balance_kthread---of 3
btrfs_add_chunk_map---of 13
btrfs_add_dev_item---of 5
btrfs_alloc_device---of 19
btrfs_assign_next_active_device---of 15
btrfs_balance---of 63
btrfs_bg_flags_to_raid_index67%of 3
btrfs_bg_type_to_factor50%of 4
btrfs_bg_type_to_raid_name---of 5
btrfs_calc_stripe_length---of 4
btrfs_cancel_balance---of 10
btrfs_check_rw_degradable---of 22
btrfs_chunk_alloc_add_chunk_item---of 30
btrfs_chunk_max_errors---of 4
btrfs_chunk_writeable---of 16
btrfs_clear_sb_rdonly---of 1
btrfs_close_devices---of 12
btrfs_cmp_device_info---of 5
btrfs_commit_device_sizes---of 8
btrfs_create_chunk---of 124
btrfs_create_uuid_tree---of 8
btrfs_describe_block_groups---of 22
btrfs_destroy_dev_replace_tgtdev---of 12
btrfs_dev_name---of 8
btrfs_dev_stat_inc_and_print---of 22
btrfs_device_init_dev_stats---of 29
btrfs_find_chunk_map---of 1
btrfs_find_chunk_map_nolock12%of 18
btrfs_find_device---of 30
btrfs_find_device_by_devspec---of 4
btrfs_finish_sprout---of 15
btrfs_forget_devices---of 1
btrfs_free_chunk_map---of 6
btrfs_free_device---of 5
btrfs_free_extra_devids---of 4
btrfs_free_stale_devices---of 24
btrfs_full_stripe_len---of 12
btrfs_get_bdev_and_sb---of 9
btrfs_get_bioc---of 6
btrfs_get_chunk_map40%of 5
btrfs_get_dev_args_from_path---of 10
btrfs_get_dev_stats---of 15
btrfs_get_fs_uuids---of 1
btrfs_grow_device---of 7
btrfs_init_dev_stats---of 9
btrfs_init_devices_late---of 11
btrfs_init_new_device---of 74
btrfs_init_sprout---of 10
btrfs_is_parity_mirror---of 8
btrfs_map_block13%of 93
btrfs_map_discard---of 24
btrfs_map_repair_block---of 20
btrfs_mapping_tree_free---of 12
btrfs_may_alloc_data_chunk---of 6
btrfs_nr_parity_stripes---of 4
btrfs_num_copies24%of 13
btrfs_open_devices---of 6
btrfs_pause_balance---of 8
btrfs_pinned_by_swapfile---of 7
btrfs_put_bioc---of 5
btrfs_put_dev_args_from_path---of 1
btrfs_read_chunk_tree---of 70
btrfs_read_sys_array---of 13
btrfs_recover_balance---of 9
btrfs_release_disk_super---of 14
btrfs_relocate_chunk---of 13
btrfs_relocate_sys_chunks---of 16
btrfs_remove_chunk---of 71
btrfs_remove_chunk_map---of 11
btrfs_repair_one_zone---of 7
btrfs_resume_balance_async---of 6
btrfs_rm_dev_item---of 4
btrfs_rm_dev_replace_free_srcdev---of 12
btrfs_rm_dev_replace_remove_srcdev---of 14
btrfs_rm_device---of 59
btrfs_run_dev_stats7%of 64
btrfs_sb_fsid_ptr---of 1
btrfs_scan_one_device---of 34
btrfs_scratch_superblocks---of 22
btrfs_set_sb_rdonly---of 1
btrfs_setup_sprout---of 14
btrfs_shrink_device---of 43
btrfs_update_device---of 6
btrfs_uuid_scan_kthread---of 34
btrfs_verify_dev_extents---of 41
clone_fs_devices---of 20
close_fs_devices---of 33
contains_pending_extent---of 8
describe_balance_args---of 40
describe_balance_start_or_resume---of 14
dev_extent_hole_check---of 15
device_list_add---of 57
devid_cmp---of 1
find_fsid---of 9
find_live_mirror---of 15
find_next_devid---of 6
free_fs_devices---of 12
handle_ops_on_dev_replace---of 14
init_first_rw_device---of 4
insert_balance_item---of 5
map_blocks_raid56_read---of 7
open_fs_devices---of 35
rcu_read_unlock---of 6
read_one_chunk---of 38
read_seqbegin---of 10
relocating_repair_kthread---of 6
remove_chunk_item---of 13
reset_balance_state---of 9
sb_end_write---of 10
sb_start_write---of 10
update_balance_args---of 13
update_dev_time---of 3
validate_convert_profile---of 9
-----------
SUMMARY14%of 200

_copy_from_user50%of 6
_copy_to_user50%of 4
check_zeroed_user---of 11
-----------
SUMMARY50%of 10

__fsnotify_inode_delete100%of 1
__fsnotify_parent24%of 17
__fsnotify_update_child_dentry_flags---of 10
__fsnotify_vfsmount_delete---of 1
fsnotify11%of 143
fsnotify_event_needs_parent---of 3
fsnotify_first_mark---of 14
fsnotify_handle_inode_event---of 14
fsnotify_sb_delete---of 20
fsnotify_sb_free---of 1
-----------
SUMMARY13%of 161

-----------
SUMMARY---of 0

cgroupns_get34%of 6
cgroupns_install---of 12
cgroupns_owner---of 1
cgroupns_put40%of 5
copy_cgroup_ns---of 29
free_cgroup_ns---of 7
-----------
SUMMARY37%of 11

-----------
SUMMARY---of 0

__ia32_sys_io_uring_register---of 1
__io_register_iowq_aff---of 3
__se_sys_io_uring_register9%of 101
__x64_sys_io_uring_register100%of 1
io_eventfd_register---of 11
io_eventfd_unregister---of 12
io_probe---of 10
io_register_iowq_aff---of 6
io_register_iowq_max_workers---of 27
io_register_restrictions27%of 15
io_unregister_personality---of 4
-----------
SUMMARY12%of 117

__irq_work_queue_local24%of 26
irq_work_needs_cpu---of 14
irq_work_queue58%of 7
irq_work_queue_on---of 14
irq_work_run---of 15
irq_work_single---of 5
irq_work_sync---of 11
irq_work_tick---of 17
-----------
SUMMARY31%of 33

-----------
SUMMARY---of 0

__blk_should_fake_timeout---of 1
blk_abort_request---of 1
blk_add_timer63%of 8
blk_rq_timeout---of 1
part_timeout_show---of 1
part_timeout_store---of 4
-----------
SUMMARY63%of 8

-----------
SUMMARY---of 0

proc_self_get_link50%of 4
proc_setup_self---of 4
-----------
SUMMARY50%of 4

add_system_zone---of 16
ext4_check_blockref---of 10
ext4_destroy_system_zone---of 4
ext4_exit_system_zone---of 1
ext4_inode_block_valid100%of 1
ext4_release_system_zone---of 8
ext4_sb_block_valid35%of 29
ext4_setup_system_zone---of 51
-----------
SUMMARY37%of 30

-----------
SUMMARY---of 0

__delete_from_swap_cache---of 51
__read_swap_cache_async---of 33
add_to_swap---of 18
add_to_swap_cache---of 106
clear_shadow_from_swap_cache---of 26
delete_from_swap_cache---of 11
exit_swap_address_space---of 9
filemap_get_incore_folio---of 7
free_page_and_swap_cache---of 11
free_pages_and_swap_cache43%of 21
free_swap_cache13%of 41
get_shadow_from_swap_cache---of 3
init_swap_address_space---of 7
put_swap_device---of 14
read_swap_cache_async---of 6
show_swap_cache_info---of 1
swap_cache_get_folio---of 23
swap_cluster_readahead---of 27
swapin_readahead---of 57
vma_ra_enabled_show---of 1
vma_ra_enabled_store---of 1
xas_next---of 12
-----------
SUMMARY23%of 62

is_acl_valid---of 12
is_sd_valid---of 23
mark_as_free_ex---of 20
ntfs_bad_inode---of 1
ntfs_bio_fill_1---of 38
ntfs_bio_pages---of 24
ntfs_bread---of 6
ntfs_bread_run---of 12
ntfs_check_for_free_space---of 7
ntfs_clear_mft_tail---of 11
ntfs_extend_init---of 13
ntfs_extend_mft---of 9
ntfs_fix_post_read---of 14
ntfs_fix_pre_write---of 12
ntfs_get_bh---of 33
ntfs_get_security_by_id---of 12
ntfs_insert_reparse---of 3
ntfs_insert_security---of 37
ntfs_loadlog_and_replay---of 11
ntfs_look_for_free_space---of 15
ntfs_look_free_mft---of 48
ntfs_mark_rec_free---of 14
ntfs_new_inode---of 5
ntfs_objid_init---of 6
ntfs_objid_remove---of 3
ntfs_read_bh54%of 13
ntfs_read_run_nb25%of 37
ntfs_refresh_zone---of 9
ntfs_remove_reparse---of 8
ntfs_reparse_init---of 6
ntfs_sb_read---of 7
ntfs_sb_write---of 23
ntfs_sb_write_run---of 16
ntfs_security_init---of 28
ntfs_set_label---of 7
ntfs_set_state---of 16
ntfs_update_mftmirr---of 23
ntfs_vbo_to_lbo---of 9
ntfs_write_bh43%of 21
run_deallocate---of 6
valid_windows_name---of 32
-----------
SUMMARY36%of 71

__ia32_compat_sys_newfstat---of 6
__ia32_compat_sys_newfstatat---of 3
__ia32_compat_sys_newlstat---of 3
__ia32_compat_sys_newstat---of 3
__ia32_sys_fstat---of 6
__ia32_sys_lstat---of 3
__ia32_sys_newfstat---of 6
__ia32_sys_newfstatat---of 3
__ia32_sys_newlstat---of 3
__ia32_sys_newstat---of 3
__ia32_sys_readlink---of 1
__ia32_sys_readlinkat---of 1
__ia32_sys_stat---of 3
__ia32_sys_statx---of 4
__inode_add_bytes67%of 3
__inode_sub_bytes---of 3
__x64_compat_sys_newfstat---of 6
__x64_compat_sys_newfstatat---of 3
__x64_compat_sys_newlstat---of 3
__x64_compat_sys_newstat---of 3
__x64_sys_fstat---of 6
__x64_sys_lstat---of 3
__x64_sys_newfstat50%of 6
__x64_sys_newfstatat67%of 3
__x64_sys_newlstat67%of 3
__x64_sys_newstat100%of 3
__x64_sys_readlink100%of 1
__x64_sys_readlinkat100%of 1
__x64_sys_stat---of 3
__x64_sys_statx50%of 4
cp_compat_stat---of 9
cp_new_stat100%of 1
cp_old_stat---of 9
cp_statx100%of 1
do_readlinkat40%of 10
do_statx---of 4
generic_fill_statx_attr60%of 5
generic_fillattr50%of 6
getname_statx_lookup_flags---of 1
inode_add_bytes67%of 3
inode_get_bytes100%of 1
inode_set_bytes100%of 1
inode_sub_bytes67%of 3
vfs_fstat---of 4
vfs_fstatat29%of 7
vfs_getattr45%of 9
vfs_getattr_nosec58%of 7
vfs_statx47%of 13
-----------
SUMMARY55%of 91

close_pdeo---of 6
init_once---of 1
proc_alloc_inode67%of 3
proc_entry_rundown---of 6
proc_evict_inode---of 5
proc_free_inode---of 5
proc_get_inode---of 16
proc_get_link---of 5
proc_invalidate_siblings_dcache---of 47
proc_put_link---of 3
proc_reg_compat_ioctl---of 10
proc_reg_get_unmapped_area---of 10
proc_reg_llseek---of 7
proc_reg_mmap---of 10
proc_reg_open---of 18
proc_reg_poll---of 10
proc_reg_read---of 10
proc_reg_read_iter---of 7
proc_reg_release---of 6
proc_reg_unlocked_ioctl---of 10
proc_reg_write---of 10
proc_show_options---of 11
-----------
SUMMARY67%of 3

-----------
SUMMARY---of 0

__fat_fs_error---of 7
_fat_msg---of 1
fat_chain_add---of 31
fat_clusters_flush---of 9
fat_sync_bhs---of 10
fat_time_fat2unix80%of 5
fat_time_unix2fat---of 9
fat_truncate_atime---of 3
fat_truncate_mtime---of 1
fat_truncate_time45%of 9
fat_update_time---of 8
-----------
SUMMARY58%of 14

-----------
SUMMARY---of 0

copy_fpstate_to_sigframe21%of 48
fpu__alloc_mathframe50%of 4
fpu__restore_sig---of 73
os_xrstor_safe---of 8
os_xsave---of 5
-----------
SUMMARY24%of 52

iomap_iter28%of 69
-----------
SUMMARY28%of 69

__bpf_trace_module_free---of 1
__bpf_trace_module_load---of 1
__bpf_trace_module_refcnt---of 1
__bpf_trace_module_request---of 1
__ia32_sys_delete_module---of 1
__ia32_sys_finit_module---of 1
__ia32_sys_init_module---of 1
__is_module_percpu_address18%of 17
__layout_sections---of 13
__module_address13%of 16
__module_get67%of 3
__module_text_address---of 4
__probestub_module_free---of 1
__probestub_module_get---of 1
__probestub_module_load---of 1
__probestub_module_put---of 1
__probestub_module_request---of 1
__se_sys_delete_module---of 25
__se_sys_finit_module---of 26
__se_sys_init_module---of 12
__symbol_get---of 15
__symbol_put---of 4
__traceiter_module_free---of 4
__traceiter_module_get---of 4
__traceiter_module_load---of 4
__traceiter_module_put---of 4
__traceiter_module_request---of 4
__x64_sys_delete_module---of 1
__x64_sys_finit_module---of 1
__x64_sys_init_module---of 1
apply_relocations---of 12
arch_mod_section_prepend---of 1
cmp_name---of 1
complete_formation---of 13
do_free_init---of 4
do_init_module---of 14
find_module---of 1
find_module_all---of 14
find_module_sections---of 130
find_symbol---of 27
flush_module_init_free_work---of 1
free_mod_mem---of 13
free_modinfo---of 6
free_modinfo_srcversion---of 1
free_modinfo_version---of 1
free_module---of 17
is_module_address---of 3
is_module_percpu_address---of 1
is_module_text_address50%of 6
load_module---of 205
modinfo_srcversion_exists---of 1
modinfo_version_exists---of 1
module_arch_freeing_init---of 1
module_augment_kernel_taints---of 79
module_elf_check_arch---of 1
module_exit_section---of 1
module_flags---of 12
module_flags_taint---of 6
module_frob_arch_sections---of 1
module_get_offset_and_type---of 3
module_init_layout_section---of 1
module_init_section---of 1
module_next_tag_pair---of 6
module_patient_check_exists---of 17
module_put10%of 20
module_refcount---of 1
module_unload_free---of 10
module_unload_init---of 1
percpu_modalloc---of 6
perf_trace_module_free---of 8
perf_trace_module_load---of 8
perf_trace_module_refcnt---of 8
perf_trace_module_request---of 8
post_relocation---of 5
print_modules---of 10
register_module_notifier---of 1
resolve_symbol---of 56
search_module_extables---of 6
setup_modinfo---of 15
setup_modinfo_srcversion---of 1
setup_modinfo_version---of 1
show_coresize---of 1
show_initsize---of 1
show_initstate---of 5
show_modinfo_srcversion---of 1
show_modinfo_version---of 1
show_refcnt---of 1
show_taint---of 6
simplify_symbols---of 29
store_uevent---of 1
symbol_put_addr---of 8
trace_event_raw_event_module_free---of 7
trace_event_raw_event_module_load---of 7
trace_event_raw_event_module_refcnt---of 7
trace_event_raw_event_module_request---of 7
trace_module_get---of 15
trace_module_load---of 15
trace_raw_output_module_free---of 3
trace_raw_output_module_load---of 3
trace_raw_output_module_refcnt---of 3
trace_raw_output_module_request---of 3
try_module_get29%of 7
try_to_force_load---of 1
unknown_module_param_cb---of 5
unregister_module_notifier---of 1
-----------
SUMMARY21%of 69

-----------
SUMMARY---of 0

__btrfs_abort_transaction---of 3
__btrfs_end_transaction42%of 24
__btrfs_wait_marked_extents---of 7
add_pending_snapshot25%of 8
btrfs_add_dead_root---of 13
btrfs_add_dropped_root---of 3
btrfs_attach_transaction---of 1
btrfs_attach_transaction_barrier50%of 4
btrfs_clean_one_deleted_snapshot---of 6
btrfs_commit_transaction10%of 112
btrfs_commit_transaction_async---of 10
btrfs_end_transaction100%of 1
btrfs_end_transaction_throttle---of 1
btrfs_join_transaction100%of 1
btrfs_join_transaction_nostart---of 1
btrfs_join_transaction_spacecache---of 1
btrfs_maybe_wake_unfinished_drop---of 4
btrfs_put_transaction16%of 19
btrfs_record_root_in_trans80%of 5
btrfs_should_end_transaction---of 5
btrfs_start_transaction---of 1
btrfs_start_transaction_fallback_global_rsv---of 1
btrfs_throttle---of 1
btrfs_trans_release_chunk_metadata67%of 3
btrfs_trans_release_metadata23%of 9
btrfs_transaction_blocked---of 4
btrfs_transaction_exit---of 1
btrfs_wait_for_commit---of 19
btrfs_wait_tree_log_extents---of 6
btrfs_write_and_wait_transaction---of 1
btrfs_write_marked_extents---of 9
commit_cowonly_roots25%of 24
commit_fs_roots12%of 18
create_pending_snapshot---of 71
create_pending_snapshots34%of 6
join_transaction34%of 21
qgroup_account_snapshot---of 12
record_root_in_trans34%of 9
refcount_inc50%of 4
sb_end_intwrite40%of 10
start_transaction15%of 80
switch_commit_roots---of 14
trace_btrfs_space_reservation---of 15
trace_btrfs_transaction_commit---of 15
wait_current_trans---of 13
wait_for_commit---of 15
-----------
SUMMARY22%of 358

get_stack_info31%of 13
stack_type_name---of 7
-----------
SUMMARY31%of 13

-----------
SUMMARY---of 0

jfs_fsync40%of 5
jfs_open---of 10
jfs_release---of 4
jfs_setattr---of 25
-----------
SUMMARY40%of 5

__bpf_trace_alloc_extent_state---of 1
__bpf_trace_btrfs__block_group---of 1
__bpf_trace_btrfs__chunk---of 1
__bpf_trace_btrfs__file_extent_item_inline---of 1
__bpf_trace_btrfs__file_extent_item_regular---of 1
__bpf_trace_btrfs__inode---of 1
__bpf_trace_btrfs__ordered_extent---of 1
__bpf_trace_btrfs__prelim_ref---of 1
__bpf_trace_btrfs__qgroup_rsv_data---of 1
__bpf_trace_btrfs__reserve_extent---of 1
__bpf_trace_btrfs__reserved_extent---of 1
__bpf_trace_btrfs__space_info_update---of 1
__bpf_trace_btrfs__work---of 1
__bpf_trace_btrfs__work__done---of 1
__bpf_trace_btrfs__writepage---of 1
__bpf_trace_btrfs_add_block_group---of 1
__bpf_trace_btrfs_clear_extent_bit---of 1
__bpf_trace_btrfs_convert_extent_bit---of 1
__bpf_trace_btrfs_cow_block---of 1
__bpf_trace_btrfs_delayed_data_ref---of 1
__bpf_trace_btrfs_delayed_ref_head---of 1
__bpf_trace_btrfs_delayed_tree_ref---of 1
__bpf_trace_btrfs_dump_space_info---of 1
__bpf_trace_btrfs_extent_map_shrinker_count---of 1
__bpf_trace_btrfs_extent_map_shrinker_remove_em---of 1
__bpf_trace_btrfs_extent_map_shrinker_scan_enter---of 1
__bpf_trace_btrfs_extent_map_shrinker_scan_exit---of 1
__bpf_trace_btrfs_failed_cluster_setup---of 1
__bpf_trace_btrfs_find_cluster---of 1
__bpf_trace_btrfs_finish_ordered_extent---of 1
__bpf_trace_btrfs_flush_space---of 1
__bpf_trace_btrfs_get_extent---of 1
__bpf_trace_btrfs_get_raid_extent_offset---of 1
__bpf_trace_btrfs_handle_em_exist---of 1
__bpf_trace_btrfs_inode_mod_outstanding_extents---of 1
__bpf_trace_btrfs_insert_one_raid_extent---of 1
__bpf_trace_btrfs_locking_events---of 1
__bpf_trace_btrfs_qgroup_account_extent---of 1
__bpf_trace_btrfs_qgroup_extent---of 1
__bpf_trace_btrfs_raid56_bio---of 1
__bpf_trace_btrfs_raid_extent_delete---of 1
__bpf_trace_btrfs_reserve_ticket---of 1
__bpf_trace_btrfs_set_extent_bit---of 1
__bpf_trace_btrfs_setup_cluster---of 1
__bpf_trace_btrfs_sleep_tree_lock---of 1
__bpf_trace_btrfs_space_reservation---of 1
__bpf_trace_btrfs_sync_file---of 1
__bpf_trace_btrfs_sync_fs---of 1
__bpf_trace_btrfs_transaction_commit---of 1
__bpf_trace_btrfs_trigger_flush---of 1
__bpf_trace_btrfs_workqueue---of 1
__bpf_trace_btrfs_workqueue_done---of 1
__bpf_trace_btrfs_writepage_end_io_hook---of 1
__bpf_trace_find_free_extent---of 1
__bpf_trace_find_free_extent_have_block_group---of 1
__bpf_trace_find_free_extent_search_loop---of 1
__bpf_trace_free_extent_state---of 1
__bpf_trace_qgroup_meta_convert---of 1
__bpf_trace_qgroup_meta_free_all_pertrans---of 1
__bpf_trace_qgroup_meta_reserve---of 1
__bpf_trace_qgroup_num_dirty_extents---of 1
__bpf_trace_qgroup_update_counters---of 1
__bpf_trace_qgroup_update_reserve---of 1
__probestub___extent_writepage---of 1
__probestub_add_delayed_data_ref---of 1
__probestub_add_delayed_ref_head---of 1
__probestub_add_delayed_tree_ref---of 1
__probestub_alloc_extent_state---of 1
__probestub_btrfs_add_block_group---of 1
__probestub_btrfs_add_reclaim_block_group---of 1
__probestub_btrfs_add_unused_block_group---of 1
__probestub_btrfs_all_work_done---of 1
__probestub_btrfs_chunk_alloc---of 1
__probestub_btrfs_chunk_free---of 1
__probestub_btrfs_clear_extent_bit---of 1
__probestub_btrfs_convert_extent_bit---of 1
__probestub_btrfs_cow_block---of 1
__probestub_btrfs_done_preemptive_reclaim---of 1
__probestub_btrfs_extent_map_shrinker_count---of 1
__probestub_btrfs_extent_map_shrinker_remove_em---of 1
__probestub_btrfs_extent_map_shrinker_scan_enter---of 1
__probestub_btrfs_extent_map_shrinker_scan_exit---of 1
__probestub_btrfs_fail_all_tickets---of 1
__probestub_btrfs_failed_cluster_setup---of 1
__probestub_btrfs_find_cluster---of 1
__probestub_btrfs_finish_ordered_extent---of 1
__probestub_btrfs_flush_space---of 1
__probestub_btrfs_get_extent---of 1
__probestub_btrfs_get_extent_show_fi_inline---of 1
__probestub_btrfs_get_extent_show_fi_regular---of 1
__probestub_btrfs_get_raid_extent_offset---of 1
__probestub_btrfs_handle_em_exist---of 1
__probestub_btrfs_inode_evict---of 1
__probestub_btrfs_inode_mod_outstanding_extents---of 1
__probestub_btrfs_inode_new---of 1
__probestub_btrfs_inode_request---of 1
__probestub_btrfs_insert_one_raid_extent---of 1
__probestub_btrfs_ordered_extent_add---of 1
__probestub_btrfs_ordered_extent_dec_test_pending---of 1
__probestub_btrfs_ordered_extent_lookup---of 1
__probestub_btrfs_ordered_extent_lookup_first---of 1
__probestub_btrfs_ordered_extent_lookup_first_range---of 1
__probestub_btrfs_ordered_extent_lookup_for_logging---of 1
__probestub_btrfs_ordered_extent_lookup_range---of 1
__probestub_btrfs_ordered_extent_mark_finished---of 1
__probestub_btrfs_ordered_extent_put---of 1
__probestub_btrfs_ordered_extent_remove---of 1
__probestub_btrfs_ordered_extent_split---of 1
__probestub_btrfs_ordered_extent_start---of 1
__probestub_btrfs_ordered_sched---of 1
__probestub_btrfs_prelim_ref_insert---of 1
__probestub_btrfs_prelim_ref_merge---of 1
__probestub_btrfs_qgroup_account_extent---of 1
__probestub_btrfs_qgroup_account_extents---of 1
__probestub_btrfs_qgroup_release_data---of 1
__probestub_btrfs_qgroup_reserve_data---of 1
__probestub_btrfs_qgroup_trace_extent---of 1
__probestub_btrfs_raid_extent_delete---of 1
__probestub_btrfs_reclaim_block_group---of 1
__probestub_btrfs_remove_block_group---of 1
__probestub_btrfs_reserve_extent---of 1
__probestub_btrfs_reserve_extent_cluster---of 1
__probestub_btrfs_reserve_ticket---of 1
__probestub_btrfs_reserved_extent_alloc---of 1
__probestub_btrfs_reserved_extent_free---of 1
__probestub_btrfs_set_extent_bit---of 1
__probestub_btrfs_set_lock_blocking_read---of 1
__probestub_btrfs_set_lock_blocking_write---of 1
__probestub_btrfs_setup_cluster---of 1
__probestub_btrfs_skip_unused_block_group---of 1
__probestub_btrfs_space_reservation---of 1
__probestub_btrfs_sync_file---of 1
__probestub_btrfs_sync_fs---of 1
__probestub_btrfs_transaction_commit---of 1
__probestub_btrfs_tree_lock---of 1
__probestub_btrfs_tree_read_lock---of 1
__probestub_btrfs_tree_read_lock_atomic---of 1
__probestub_btrfs_tree_read_unlock---of 1
__probestub_btrfs_tree_read_unlock_blocking---of 1
__probestub_btrfs_tree_unlock---of 1
__probestub_btrfs_trigger_flush---of 1
__probestub_btrfs_truncate_show_fi_inline---of 1
__probestub_btrfs_truncate_show_fi_regular---of 1
__probestub_btrfs_try_tree_read_lock---of 1
__probestub_btrfs_try_tree_write_lock---of 1
__probestub_btrfs_work_queued---of 1
__probestub_btrfs_work_sched---of 1
__probestub_btrfs_workqueue_alloc---of 1
__probestub_btrfs_workqueue_destroy---of 1
__probestub_btrfs_writepage_end_io_hook---of 1
__probestub_find_free_extent---of 1
__probestub_find_free_extent_have_block_group---of 1
__probestub_find_free_extent_search_loop---of 1
__probestub_free_extent_state---of 1
__probestub_qgroup_meta_convert---of 1
__probestub_qgroup_meta_free_all_pertrans---of 1
__probestub_qgroup_meta_reserve---of 1
__probestub_qgroup_num_dirty_extents---of 1
__probestub_qgroup_update_counters---of 1
__probestub_qgroup_update_reserve---of 1
__probestub_raid56_read---of 1
__probestub_raid56_write---of 1
__probestub_run_delayed_data_ref---of 1
__probestub_run_delayed_ref_head---of 1
__probestub_run_delayed_tree_ref---of 1
__probestub_update_bytes_may_use---of 1
__probestub_update_bytes_pinned---of 1
__traceiter___extent_writepage---of 4
__traceiter_add_delayed_data_ref---of 4
__traceiter_add_delayed_ref_head---of 4
__traceiter_add_delayed_tree_ref---of 4
__traceiter_alloc_extent_state---of 4
__traceiter_btrfs_add_block_group---of 4
__traceiter_btrfs_add_reclaim_block_group---of 4
__traceiter_btrfs_add_unused_block_group---of 4
__traceiter_btrfs_all_work_done---of 4
__traceiter_btrfs_chunk_alloc---of 4
__traceiter_btrfs_chunk_free---of 4
__traceiter_btrfs_clear_extent_bit---of 4
__traceiter_btrfs_convert_extent_bit---of 4
__traceiter_btrfs_cow_block---of 4
__traceiter_btrfs_done_preemptive_reclaim---of 4
__traceiter_btrfs_extent_map_shrinker_count---of 4
__traceiter_btrfs_extent_map_shrinker_remove_em---of 4
__traceiter_btrfs_extent_map_shrinker_scan_enter---of 4
__traceiter_btrfs_extent_map_shrinker_scan_exit---of 4
__traceiter_btrfs_fail_all_tickets---of 4
__traceiter_btrfs_failed_cluster_setup---of 4
__traceiter_btrfs_find_cluster---of 4
__traceiter_btrfs_finish_ordered_extent---of 4
__traceiter_btrfs_flush_space---of 4
__traceiter_btrfs_get_extent---of 4
__traceiter_btrfs_get_extent_show_fi_inline---of 4
__traceiter_btrfs_get_extent_show_fi_regular---of 4
__traceiter_btrfs_get_raid_extent_offset---of 4
__traceiter_btrfs_handle_em_exist---of 4
__traceiter_btrfs_inode_evict---of 4
__traceiter_btrfs_inode_mod_outstanding_extents---of 4
__traceiter_btrfs_inode_new---of 4
__traceiter_btrfs_inode_request---of 4
__traceiter_btrfs_insert_one_raid_extent---of 4
__traceiter_btrfs_ordered_extent_add---of 4
__traceiter_btrfs_ordered_extent_dec_test_pending---of 4
__traceiter_btrfs_ordered_extent_lookup---of 4
__traceiter_btrfs_ordered_extent_lookup_first---of 4
__traceiter_btrfs_ordered_extent_lookup_first_range---of 4
__traceiter_btrfs_ordered_extent_lookup_for_logging---of 4
__traceiter_btrfs_ordered_extent_lookup_range---of 4
__traceiter_btrfs_ordered_extent_mark_finished---of 4
__traceiter_btrfs_ordered_extent_put---of 4
__traceiter_btrfs_ordered_extent_remove---of 4
__traceiter_btrfs_ordered_extent_split---of 4
__traceiter_btrfs_ordered_extent_start---of 4
__traceiter_btrfs_ordered_sched---of 4
__traceiter_btrfs_prelim_ref_insert---of 4
__traceiter_btrfs_prelim_ref_merge---of 4
__traceiter_btrfs_qgroup_account_extent---of 4
__traceiter_btrfs_qgroup_account_extents---of 4
__traceiter_btrfs_qgroup_release_data---of 4
__traceiter_btrfs_qgroup_reserve_data---of 4
__traceiter_btrfs_qgroup_trace_extent---of 4
__traceiter_btrfs_raid_extent_delete---of 4
__traceiter_btrfs_reclaim_block_group---of 4
__traceiter_btrfs_remove_block_group---of 4
__traceiter_btrfs_reserve_extent---of 4
__traceiter_btrfs_reserve_extent_cluster---of 4
__traceiter_btrfs_reserve_ticket---of 4
__traceiter_btrfs_reserved_extent_alloc---of 4
__traceiter_btrfs_reserved_extent_free---of 4
__traceiter_btrfs_set_extent_bit---of 4
__traceiter_btrfs_set_lock_blocking_read---of 4
__traceiter_btrfs_set_lock_blocking_write---of 4
__traceiter_btrfs_setup_cluster---of 4
__traceiter_btrfs_skip_unused_block_group---of 4
__traceiter_btrfs_space_reservation---of 4
__traceiter_btrfs_sync_file---of 4
__traceiter_btrfs_sync_fs---of 4
__traceiter_btrfs_transaction_commit---of 4
__traceiter_btrfs_tree_lock---of 4
__traceiter_btrfs_tree_read_lock---of 4
__traceiter_btrfs_tree_read_lock_atomic---of 4
__traceiter_btrfs_tree_read_unlock---of 4
__traceiter_btrfs_tree_read_unlock_blocking---of 4
__traceiter_btrfs_tree_unlock---of 4
__traceiter_btrfs_trigger_flush---of 4
__traceiter_btrfs_truncate_show_fi_inline---of 4
__traceiter_btrfs_truncate_show_fi_regular---of 4
__traceiter_btrfs_try_tree_read_lock---of 4
__traceiter_btrfs_try_tree_write_lock---of 4
__traceiter_btrfs_work_queued---of 4
__traceiter_btrfs_work_sched---of 4
__traceiter_btrfs_workqueue_alloc---of 4
__traceiter_btrfs_workqueue_destroy---of 4
__traceiter_btrfs_writepage_end_io_hook---of 4
__traceiter_find_free_extent---of 4
__traceiter_find_free_extent_have_block_group---of 4
__traceiter_find_free_extent_search_loop---of 4
__traceiter_free_extent_state---of 4
__traceiter_qgroup_meta_convert---of 4
__traceiter_qgroup_meta_free_all_pertrans---of 4
__traceiter_qgroup_meta_reserve---of 4
__traceiter_qgroup_num_dirty_extents---of 4
__traceiter_qgroup_update_counters---of 4
__traceiter_qgroup_update_reserve---of 4
__traceiter_raid56_read---of 4
__traceiter_raid56_write---of 4
__traceiter_run_delayed_data_ref---of 4
__traceiter_run_delayed_ref_head---of 4
__traceiter_run_delayed_tree_ref---of 4
__traceiter_update_bytes_may_use---of 4
__traceiter_update_bytes_pinned---of 4
btrfs_check_options---of 17
btrfs_cmp_device_free_bytes---of 1
btrfs_control_ioctl---of 12
btrfs_control_open---of 1
btrfs_dup_fs_context---of 4
btrfs_fc_test_super---of 1
btrfs_free_cached_objects---of 1
btrfs_free_fs_context---of 7
btrfs_freeze---of 3
btrfs_get_subvol_name_from_objectid---of 17
btrfs_get_tree---of 52
btrfs_init_fs_context---of 4
btrfs_interface_exit---of 1
btrfs_kill_super---of 1
btrfs_nr_cached_objects---of 15
btrfs_parse_param---of 73
btrfs_put_super---of 1
btrfs_reconfigure---of 155
btrfs_run_sanity_tests---of 1
btrfs_set_free_space_cache_settings---of 12
btrfs_show_devname---of 18
btrfs_show_options---of 64
btrfs_statfs---of 76
btrfs_sync_fs15%of 40
btrfs_unfreeze---of 12
perf_trace_alloc_extent_state---of 8
perf_trace_btrfs__block_group---of 9
perf_trace_btrfs__chunk---of 9
perf_trace_btrfs__file_extent_item_inline---of 9
perf_trace_btrfs__file_extent_item_regular---of 9
perf_trace_btrfs__inode---of 9
perf_trace_btrfs__ordered_extent---of 9
perf_trace_btrfs__prelim_ref---of 11
perf_trace_btrfs__qgroup_rsv_data---of 9
perf_trace_btrfs__reserve_extent---of 9
perf_trace_btrfs__reserved_extent---of 9
perf_trace_btrfs__space_info_update---of 9
perf_trace_btrfs__work---of 9
perf_trace_btrfs__work__done---of 9
perf_trace_btrfs__writepage---of 9
perf_trace_btrfs_add_block_group---of 9
perf_trace_btrfs_clear_extent_bit---of 11
perf_trace_btrfs_convert_extent_bit---of 11
perf_trace_btrfs_cow_block---of 9
perf_trace_btrfs_delayed_data_ref---of 9
perf_trace_btrfs_delayed_ref_head---of 9
perf_trace_btrfs_delayed_tree_ref---of 9
perf_trace_btrfs_dump_space_info---of 9
perf_trace_btrfs_extent_map_shrinker_count---of 9
perf_trace_btrfs_extent_map_shrinker_remove_em---of 9
perf_trace_btrfs_extent_map_shrinker_scan_enter---of 9
perf_trace_btrfs_extent_map_shrinker_scan_exit---of 9
perf_trace_btrfs_failed_cluster_setup---of 9
perf_trace_btrfs_find_cluster---of 9
perf_trace_btrfs_finish_ordered_extent---of 9
perf_trace_btrfs_flush_space---of 9
perf_trace_btrfs_get_extent---of 9
perf_trace_btrfs_get_raid_extent_offset---of 9
perf_trace_btrfs_handle_em_exist---of 9
perf_trace_btrfs_inode_mod_outstanding_extents---of 9
perf_trace_btrfs_insert_one_raid_extent---of 9
perf_trace_btrfs_locking_events---of 9
perf_trace_btrfs_qgroup_account_extent---of 9
perf_trace_btrfs_qgroup_extent---of 9
perf_trace_btrfs_raid56_bio---of 9
perf_trace_btrfs_raid_extent_delete---of 9
perf_trace_btrfs_reserve_ticket---of 9
perf_trace_btrfs_set_extent_bit---of 11
perf_trace_btrfs_setup_cluster---of 9
perf_trace_btrfs_sleep_tree_lock---of 9
perf_trace_btrfs_space_reservation---of 9
perf_trace_btrfs_sync_file---of 9
perf_trace_btrfs_sync_fs---of 9
perf_trace_btrfs_transaction_commit---of 9
perf_trace_btrfs_trigger_flush---of 9
perf_trace_btrfs_workqueue---of 9
perf_trace_btrfs_workqueue_done---of 9
perf_trace_btrfs_writepage_end_io_hook---of 9
perf_trace_find_free_extent---of 9
perf_trace_find_free_extent_have_block_group---of 9
perf_trace_find_free_extent_search_loop---of 9
perf_trace_free_extent_state---of 8
perf_trace_qgroup_meta_convert---of 9
perf_trace_qgroup_meta_free_all_pertrans---of 9
perf_trace_qgroup_meta_reserve---of 9
perf_trace_qgroup_num_dirty_extents---of 9
perf_trace_qgroup_update_counters---of 9
perf_trace_qgroup_update_reserve---of 11
register_btrfs---of 1
trace_event_raw_event_alloc_extent_state---of 7
trace_event_raw_event_btrfs__block_group---of 8
trace_event_raw_event_btrfs__chunk---of 8
trace_event_raw_event_btrfs__file_extent_item_inline---of 8
trace_event_raw_event_btrfs__file_extent_item_regular---of 8
trace_event_raw_event_btrfs__inode---of 8
trace_event_raw_event_btrfs__ordered_extent---of 8
trace_event_raw_event_btrfs__prelim_ref---of 10
trace_event_raw_event_btrfs__qgroup_rsv_data---of 8
trace_event_raw_event_btrfs__reserve_extent---of 8
trace_event_raw_event_btrfs__reserved_extent---of 8
trace_event_raw_event_btrfs__space_info_update---of 8
trace_event_raw_event_btrfs__work---of 8
trace_event_raw_event_btrfs__work__done---of 8
trace_event_raw_event_btrfs__writepage---of 8
trace_event_raw_event_btrfs_add_block_group---of 8
trace_event_raw_event_btrfs_clear_extent_bit---of 10
trace_event_raw_event_btrfs_convert_extent_bit---of 10
trace_event_raw_event_btrfs_cow_block---of 8
trace_event_raw_event_btrfs_delayed_data_ref---of 8
trace_event_raw_event_btrfs_delayed_ref_head---of 8
trace_event_raw_event_btrfs_delayed_tree_ref---of 8
trace_event_raw_event_btrfs_dump_space_info---of 8
trace_event_raw_event_btrfs_extent_map_shrinker_count---of 8
trace_event_raw_event_btrfs_extent_map_shrinker_remove_em---of 8
trace_event_raw_event_btrfs_extent_map_shrinker_scan_enter---of 8
trace_event_raw_event_btrfs_extent_map_shrinker_scan_exit---of 8
trace_event_raw_event_btrfs_failed_cluster_setup---of 8
trace_event_raw_event_btrfs_find_cluster---of 8
trace_event_raw_event_btrfs_finish_ordered_extent---of 8
trace_event_raw_event_btrfs_flush_space---of 8
trace_event_raw_event_btrfs_get_extent---of 8
trace_event_raw_event_btrfs_get_raid_extent_offset---of 8
trace_event_raw_event_btrfs_handle_em_exist---of 8
trace_event_raw_event_btrfs_inode_mod_outstanding_extents---of 8
trace_event_raw_event_btrfs_insert_one_raid_extent---of 8
trace_event_raw_event_btrfs_locking_events---of 8
trace_event_raw_event_btrfs_qgroup_account_extent---of 8
trace_event_raw_event_btrfs_qgroup_extent---of 8
trace_event_raw_event_btrfs_raid56_bio---of 8
trace_event_raw_event_btrfs_raid_extent_delete---of 8
trace_event_raw_event_btrfs_reserve_ticket---of 8
trace_event_raw_event_btrfs_set_extent_bit---of 10
trace_event_raw_event_btrfs_setup_cluster---of 8
trace_event_raw_event_btrfs_sleep_tree_lock---of 8
trace_event_raw_event_btrfs_space_reservation---of 8
trace_event_raw_event_btrfs_sync_file---of 8
trace_event_raw_event_btrfs_sync_fs---of 8
trace_event_raw_event_btrfs_transaction_commit---of 8
trace_event_raw_event_btrfs_trigger_flush---of 8
trace_event_raw_event_btrfs_workqueue---of 8
trace_event_raw_event_btrfs_workqueue_done---of 8
trace_event_raw_event_btrfs_writepage_end_io_hook---of 8
trace_event_raw_event_find_free_extent---of 8
trace_event_raw_event_find_free_extent_have_block_group---of 8
trace_event_raw_event_find_free_extent_search_loop---of 8
trace_event_raw_event_free_extent_state---of 7
trace_event_raw_event_qgroup_meta_convert---of 8
trace_event_raw_event_qgroup_meta_free_all_pertrans---of 8
trace_event_raw_event_qgroup_meta_reserve---of 8
trace_event_raw_event_qgroup_num_dirty_extents---of 8
trace_event_raw_event_qgroup_update_counters---of 8
trace_event_raw_event_qgroup_update_reserve---of 10
trace_raw_output_alloc_extent_state---of 4
trace_raw_output_btrfs__block_group---of 3
trace_raw_output_btrfs__chunk---of 4
trace_raw_output_btrfs__file_extent_item_inline---of 4
trace_raw_output_btrfs__file_extent_item_regular---of 4
trace_raw_output_btrfs__inode---of 4
trace_raw_output_btrfs__ordered_extent---of 4
trace_raw_output_btrfs__prelim_ref---of 3
trace_raw_output_btrfs__qgroup_rsv_data---of 3
trace_raw_output_btrfs__reserve_extent---of 3
trace_raw_output_btrfs__reserved_extent---of 3
trace_raw_output_btrfs__space_info_update---of 3
trace_raw_output_btrfs__work---of 3
trace_raw_output_btrfs__work__done---of 3
trace_raw_output_btrfs__writepage---of 4
trace_raw_output_btrfs_add_block_group---of 3
trace_raw_output_btrfs_clear_extent_bit---of 3
trace_raw_output_btrfs_convert_extent_bit---of 3
trace_raw_output_btrfs_cow_block---of 4
trace_raw_output_btrfs_delayed_data_ref---of 6
trace_raw_output_btrfs_delayed_ref_head---of 3
trace_raw_output_btrfs_delayed_tree_ref---of 6
trace_raw_output_btrfs_dump_space_info---of 3
trace_raw_output_btrfs_extent_map_shrinker_count---of 3
trace_raw_output_btrfs_extent_map_shrinker_remove_em---of 6
trace_raw_output_btrfs_extent_map_shrinker_scan_enter---of 4
trace_raw_output_btrfs_extent_map_shrinker_scan_exit---of 4
trace_raw_output_btrfs_failed_cluster_setup---of 3
trace_raw_output_btrfs_find_cluster---of 3
trace_raw_output_btrfs_finish_ordered_extent---of 4
trace_raw_output_btrfs_flush_space---of 3
trace_raw_output_btrfs_get_extent---of 6
trace_raw_output_btrfs_get_raid_extent_offset---of 3
trace_raw_output_btrfs_handle_em_exist---of 3
trace_raw_output_btrfs_inode_mod_outstanding_extents---of 4
trace_raw_output_btrfs_insert_one_raid_extent---of 3
trace_raw_output_btrfs_locking_events---of 3
trace_raw_output_btrfs_qgroup_account_extent---of 3
trace_raw_output_btrfs_qgroup_extent---of 3
trace_raw_output_btrfs_raid56_bio---of 5
trace_raw_output_btrfs_raid_extent_delete---of 3
trace_raw_output_btrfs_reserve_ticket---of 3
trace_raw_output_btrfs_set_extent_bit---of 3
trace_raw_output_btrfs_setup_cluster---of 3
trace_raw_output_btrfs_sleep_tree_lock---of 3
trace_raw_output_btrfs_space_reservation---of 3
trace_raw_output_btrfs_sync_file---of 4
trace_raw_output_btrfs_sync_fs---of 3
trace_raw_output_btrfs_transaction_commit---of 4
trace_raw_output_btrfs_trigger_flush---of 3
trace_raw_output_btrfs_workqueue---of 3
trace_raw_output_btrfs_workqueue_done---of 3
trace_raw_output_btrfs_writepage_end_io_hook---of 4
trace_raw_output_find_free_extent---of 4
trace_raw_output_find_free_extent_have_block_group---of 4
trace_raw_output_find_free_extent_search_loop---of 4
trace_raw_output_free_extent_state---of 3
trace_raw_output_qgroup_meta_convert---of 4
trace_raw_output_qgroup_meta_free_all_pertrans---of 4
trace_raw_output_qgroup_meta_reserve---of 4
trace_raw_output_qgroup_num_dirty_extents---of 3
trace_raw_output_qgroup_update_counters---of 3
trace_raw_output_qgroup_update_reserve---of 3
unregister_btrfs---of 1
-----------
SUMMARY15%of 40

-----------
SUMMARY---of 0

_setid_policy_lookup---of 14
safesetid_security_capable29%of 7
safesetid_task_fix_setgid---of 22
safesetid_task_fix_setgroups---of 18
safesetid_task_fix_setuid---of 22
setid_policy_lookup---of 35
-----------
SUMMARY29%of 7

__bpf_trace_dax_insert_mapping---of 1
__bpf_trace_dax_pmd_fault_class---of 1
__bpf_trace_dax_pmd_insert_mapping_class---of 1
__bpf_trace_dax_pmd_load_hole_class---of 1
__bpf_trace_dax_pte_fault_class---of 1
__bpf_trace_dax_writeback_one---of 1
__bpf_trace_dax_writeback_range_class---of 1
__dax_invalidate_entry---of 11
__probestub_dax_insert_mapping---of 1
__probestub_dax_insert_pfn_mkwrite---of 1
__probestub_dax_insert_pfn_mkwrite_no_entry---of 1
__probestub_dax_load_hole---of 1
__probestub_dax_pmd_fault---of 1
__probestub_dax_pmd_fault_done---of 1
__probestub_dax_pmd_insert_mapping---of 1
__probestub_dax_pmd_load_hole---of 1
__probestub_dax_pmd_load_hole_fallback---of 1
__probestub_dax_pte_fault---of 1
__probestub_dax_pte_fault_done---of 1
__probestub_dax_writeback_one---of 1
__probestub_dax_writeback_range---of 1
__probestub_dax_writeback_range_done---of 1
__traceiter_dax_insert_mapping---of 4
__traceiter_dax_insert_pfn_mkwrite---of 4
__traceiter_dax_insert_pfn_mkwrite_no_entry---of 4
__traceiter_dax_load_hole---of 4
__traceiter_dax_pmd_fault---of 4
__traceiter_dax_pmd_fault_done---of 4
__traceiter_dax_pmd_insert_mapping---of 4
__traceiter_dax_pmd_load_hole---of 4
__traceiter_dax_pmd_load_hole_fallback---of 4
__traceiter_dax_pte_fault---of 4
__traceiter_dax_pte_fault_done---of 4
__traceiter_dax_writeback_one---of 4
__traceiter_dax_writeback_range---of 4
__traceiter_dax_writeback_range_done---of 4
dax_dedupe_file_range_compare---of 18
dax_delete_mapping_entry---of 3
dax_disassociate_entry---of 11
dax_fault_iter---of 94
dax_file_unshare---of 17
dax_finish_sync_fault---of 42
dax_insert_entry---of 39
dax_invalidate_mapping_entry_sync---of 1
dax_iomap_copy_around---of 26
dax_iomap_fault---of 124
dax_iomap_rw---of 53
dax_layout_busy_page100%of 1
dax_layout_busy_page_range10%of 22
dax_lock_folio---of 29
dax_lock_mapping_entry---of 27
dax_remap_file_range_prep---of 1
dax_truncate_page---of 5
dax_unlock_entry---of 5
dax_unlock_folio---of 3
dax_unlock_mapping_entry---of 3
dax_writeback_mapping_range---of 83
dax_zero_range---of 18
get_unlocked_entry---of 7
grab_mapping_entry---of 24
perf_trace_dax_insert_mapping---of 8
perf_trace_dax_pmd_fault_class---of 8
perf_trace_dax_pmd_insert_mapping_class---of 8
perf_trace_dax_pmd_load_hole_class---of 8
perf_trace_dax_pte_fault_class---of 8
perf_trace_dax_writeback_one---of 8
perf_trace_dax_writeback_range_class---of 8
put_unlocked_entry---of 5
trace_event_raw_event_dax_insert_mapping---of 7
trace_event_raw_event_dax_pmd_fault_class---of 7
trace_event_raw_event_dax_pmd_insert_mapping_class---of 7
trace_event_raw_event_dax_pmd_load_hole_class---of 7
trace_event_raw_event_dax_pte_fault_class---of 7
trace_event_raw_event_dax_writeback_one---of 7
trace_event_raw_event_dax_writeback_range_class---of 7
trace_raw_output_dax_insert_mapping---of 3
trace_raw_output_dax_pmd_fault_class---of 3
trace_raw_output_dax_pmd_insert_mapping_class---of 3
trace_raw_output_dax_pmd_load_hole_class---of 3
trace_raw_output_dax_pte_fault_class---of 3
trace_raw_output_dax_writeback_one---of 3
trace_raw_output_dax_writeback_range_class---of 3
wake_exceptional_entry_func---of 4
xas_next_entry---of 17
-----------
SUMMARY14%of 23

-----------
SUMMARY---of 0

__scrub_blocked_if_needed---of 8
btrfs_dev_name---of 8
btrfs_scrub_cancel---of 9
btrfs_scrub_cancel_dev---of 9
btrfs_scrub_continue---of 1
btrfs_scrub_dev---of 39
btrfs_scrub_pause25%of 8
btrfs_scrub_progress---of 4
calc_sector_number---of 15
find_first_extent_item---of 16
finish_extent_writes_for_zoned---of 4
flush_scrub_stripes---of 34
get_extent_info---of 4
init_scrub_stripe---of 14
raid56_scrub_wait_endio---of 1
rcu_read_unlock---of 6
scrub_chunk---of 15
scrub_enumerate_chunks---of 49
scrub_find_fill_first_stripe---of 39
scrub_free_ctx---of 9
scrub_pause_off---of 1
scrub_print_common_warning---of 25
scrub_print_warning_inode---of 42
scrub_put_ctx---of 4
scrub_read_endio---of 19
scrub_repair_read_endio---of 21
scrub_setup_ctx---of 7
scrub_simple_mirror---of 26
scrub_stripe---of 83
scrub_stripe_read_repair_worker---of 99
scrub_stripe_submit_repair_read---of 26
scrub_submit_initial_read---of 49
scrub_submit_write_bio---of 18
scrub_supers---of 24
scrub_verify_one_stripe---of 53
scrub_workers_get---of 14
scrub_workers_put---of 4
scrub_write_endio---of 17
scrub_write_sectors---of 18
submit_initial_group_read---of 16
wait_scrub_stripe_io---of 5
-----------
SUMMARY25%of 8

__add_preferred_console---of 42
__bpf_trace_console---of 1
__console_rewind_all---of 11
__down_trylock_console_sem55%of 11
__ia32_sys_syslog---of 1
__pr_flush---of 24
__printk_cpu_sync_put---of 3
__printk_cpu_sync_try_get---of 4
__printk_cpu_sync_wait---of 3
__printk_ratelimit---of 1
__probestub_console---of 1
__traceiter_console---of 4
__wake_up_klogd58%of 7
__x64_sys_syslog---of 1
_printk100%of 1
_printk_deferred---of 1
add_preferred_console---of 1
console_conditional_schedule---of 3
console_cpu_notify---of 7
console_device---of 9
console_flush_all47%of 49
console_flush_on_panic---of 3
console_force_preferred_locked---of 15
console_init_seq---of 16
console_list_lock---of 5
console_list_unlock---of 1
console_lock---of 5
console_opt_add_preferred_console---of 1
console_prepend_dropped---of 4
console_replay_all---of 6
console_srcu_read_lock---of 1
console_srcu_read_lock_is_held---of 3
console_srcu_read_unlock---of 3
console_srcu_write_flags---of 4
console_start---of 8
console_stop---of 8
console_trylock---of 6
console_unblank---of 30
console_unlock47%of 15
console_verbose---of 4
defer_console_output---of 1
devkmsg_emit---of 1
devkmsg_llseek---of 7
devkmsg_open---of 10
devkmsg_poll---of 6
devkmsg_read---of 14
devkmsg_release---of 4
devkmsg_sysctl_set_loglvl---of 10
devkmsg_write---of 12
do_syslog---of 34
early_printk---of 3
find_first_fitting_seq---of 11
info_print_prefix60%of 5
is_console_locked---of 1
kmsg_dump---of 18
kmsg_dump_get_buffer---of 25
kmsg_dump_get_line---of 19
kmsg_dump_reason_str---of 6
kmsg_dump_register---of 5
kmsg_dump_rewind---of 3
kmsg_dump_unregister---of 4
lockdep_assert_console_list_lock_held---of 4
log_buf_addr_get---of 1
log_buf_len_get---of 1
log_buf_vmcoreinfo_setup---of 1
msg_add_dict_text---of 18
other_cpu_in_panic---of 3
perf_trace_console---of 9
printk_get_next_message27%of 26
printk_parse_prefix---of 28
printk_percpu_data_ready---of 1
printk_sprint36%of 34
printk_timed_ratelimit---of 4
printk_trigger_flush---of 1
register_console---of 52
resume_console---of 15
suspend_console---of 15
syslog_print---of 32
syslog_print_all---of 22
this_cpu_in_panic100%of 1
trace_event_raw_event_console---of 8
trace_raw_output_console---of 3
try_enable_preferred_console---of 20
unregister_console---of 5
unregister_console_locked---of 23
vprintk_default100%of 1
vprintk_deferred---of 1
vprintk_emit20%of 36
vprintk_store40%of 53
wake_up_klogd---of 1
wake_up_klogd_work_func---of 12
-----------
SUMMARY39%of 239

-----------
SUMMARY---of 0

vhost_transport_cancel_pkt---of 20
vhost_transport_do_send_pkt---of 44
vhost_transport_get_local_cid---of 1
vhost_transport_msgzerocopy_allow---of 1
vhost_transport_send_pkt---of 24
vhost_transport_send_pkt_work---of 1
vhost_transport_seqpacket_allow---of 18
vhost_vsock_chr_poll100%of 1
vhost_vsock_chr_read_iter---of 1
vhost_vsock_chr_write_iter---of 1
vhost_vsock_dev_ioctl---of 50
vhost_vsock_dev_open---of 4
vhost_vsock_dev_release---of 8
vhost_vsock_handle_rx_kick---of 1
vhost_vsock_handle_tx_kick---of 44
vhost_vsock_reset_orphans---of 10
-----------
SUMMARY100%of 1

elevator_alloc---of 3
elevator_disable---of 21
elevator_exit---of 1
elevator_find_get---of 10
elevator_init_mq---of 17
elevator_release---of 1
elevator_switch---of 27
elv_attempt_insert_merge29%of 14
elv_attr_show---of 4
elv_attr_store---of 4
elv_bio_merge_ok---of 4
elv_former_request---of 3
elv_iosched_show---of 10
elv_iosched_store---of 18
elv_latter_request---of 3
elv_merge25%of 20
elv_merge_requests---of 8
elv_merged_request---of 9
elv_rb_add100%of 4
elv_rb_del67%of 3
elv_rb_find86%of 7
elv_rb_former_request---of 1
elv_rb_latter_request---of 1
elv_register---of 20
elv_register_queue---of 12
elv_rqhash_add75%of 4
elv_rqhash_del60%of 5
elv_rqhash_find27%of 15
elv_rqhash_reposition---of 6
elv_unregister---of 5
elv_unregister_queue---of 7
-----------
SUMMARY44%of 72

_prb_commit55%of 11
_prb_read_valid29%of 38
data_alloc35%of 20
data_push_tail14%of 22
desc_read67%of 15
desc_update_last_finalized29%of 7
get_data20%of 25
prb_commit34%of 6
prb_final_commit100%of 1
prb_first_seq50%of 4
prb_first_valid_seq---of 3
prb_init---of 7
prb_next_reserve_seq---of 14
prb_next_seq---of 4
prb_read_valid100%of 1
prb_read_valid_info---of 1
prb_record_text_space---of 1
prb_reserve26%of 55
prb_reserve_in_last24%of 73
space_used27%of 15
-----------
SUMMARY30%of 293

-----------
SUMMARY---of 0

__bpf_trace_hugepage_set---of 1
__bpf_trace_hugepage_update---of 1
__bpf_trace_migration_pmd---of 1
__folio_rmap_sanity_checks---of 28
__pmd_trans_huge_lock---of 3
__probestub_hugepage_set_pmd---of 1
__probestub_hugepage_set_pud---of 1
__probestub_hugepage_update_pmd---of 1
__probestub_hugepage_update_pud---of 1
__probestub_remove_migration_pmd---of 1
__probestub_set_migration_pmd---of 1
__pud_trans_huge_lock---of 3
__split_huge_page---of 298
__split_huge_pmd3%of 204
__split_huge_pud---of 12
__thp_vma_allowable_orders9%of 36
__traceiter_hugepage_set_pmd---of 4
__traceiter_hugepage_set_pud---of 4
__traceiter_hugepage_update_pmd---of 4
__traceiter_hugepage_update_pud---of 4
__traceiter_remove_migration_pmd---of 4
__traceiter_set_migration_pmd---of 4
_compound_head---of 7
add_mm_counter---of 1
anon_fault_alloc_show---of 7
anon_fault_fallback_charge_show---of 7
anon_fault_fallback_show---of 7
can_change_pmd_writable---of 36
can_split_folio---of 33
change_huge_pmd---of 48
copy_huge_pmd---of 91
copy_huge_pud---of 6
current_gfp_context---of 5
deferred_split_count---of 1
deferred_split_folio---of 43
deferred_split_scan---of 32
defrag_show---of 5
defrag_store---of 7
do_huge_pmd_anonymous_page---of 85
do_huge_pmd_numa_page---of 44
do_huge_pmd_wp_page---of 146
enabled_show---of 3
enabled_store---of 7
file_thp_enabled---of 4
filemap_nr_thps_dec---of 6
folio_flags---of 10
folio_large_mapcount---of 9
folio_lock---of 9
folio_mapcount---of 9
folio_maybe_dma_pinned---of 9
folio_memcg---of 12
folio_nr_pages---of 9
folio_order---of 9
folio_put---of 4
folio_test_pmd_mappable---of 9
folio_try_share_anon_rmap_pmd---of 89
folio_undo_large_rmappable---of 16
follow_devmap_pmd---of 17
hpage_pmd_size_show---of 1
huge_pmd_set_accessed---of 4
huge_pud_set_accessed---of 4
madvise_free_huge_pmd---of 73
maybe_pmd_mkwrite---of 3
mm_get_huge_zero_folio---of 14
mm_put_huge_zero_folio---of 4
move_huge_pmd---of 23
move_pages_huge_pmd---of 120
perf_trace_hugepage_set---of 8
perf_trace_hugepage_update---of 8
perf_trace_migration_pmd---of 8
pfn_swap_entry_folio---of 17
pfn_swap_entry_to_page---of 18
pte_free---of 18
put_anon_vma---of 3
remap_page---of 21
remove_migration_pmd---of 72
set_huge_zero_folio---of 8
set_pmd_migration_entry---of 68
shrink_huge_zero_page_count---of 1
shrink_huge_zero_page_scan---of 6
single_hugepage_flag_show---of 1
single_hugepage_flag_store---of 5
split_huge_page_to_list_to_order---of 113
split_huge_pages_all---of 65
split_huge_pages_write---of 147
split_huge_pmd_address---of 3
swpout_fallback_show---of 7
swpout_show---of 7
thp_get_unmapped_area100%of 1
thp_get_unmapped_area_vmflags46%of 11
thpsize_enabled_show---of 4
thpsize_enabled_store---of 6
thpsize_release---of 1
touch_pmd---of 3
touch_pud---of 3
trace_event_raw_event_hugepage_set---of 7
trace_event_raw_event_hugepage_update---of 7
trace_event_raw_event_migration_pmd---of 7
trace_raw_output_hugepage_set---of 3
trace_raw_output_hugepage_update---of 3
trace_raw_output_migration_pmd---of 3
unmap_folio---of 18
use_zero_page_show---of 1
use_zero_page_store---of 5
vma_adjust_trans_huge43%of 19
vma_thp_gfp_mask---of 7
vmf_insert_pfn_pmd---of 23
vmf_insert_pfn_pud---of 21
zap_huge_pmd---of 38
zap_huge_pud---of 9
-----------
SUMMARY9%of 271

-----------
SUMMARY---of 0

hfs_asc2mac35%of 20
hfs_mac2asc---of 19
-----------
SUMMARY35%of 20

_snd_pcm_hw_param_min---of 31
_snd_pcm_hw_param_set---of 17
snd_pcm_hw_param_max---of 45
snd_pcm_hw_param_near---of 31
snd_pcm_oss_change_params_locked---of 179
snd_pcm_oss_disconnect_minor---of 10
snd_pcm_oss_format_from---of 19
snd_pcm_oss_format_to---of 19
snd_pcm_oss_get_active_substream---of 14
snd_pcm_oss_get_caps---of 7
snd_pcm_oss_get_formats---of 29
snd_pcm_oss_get_odelay---of 9
snd_pcm_oss_get_ptr---of 29
snd_pcm_oss_get_space---of 22
snd_pcm_oss_get_trigger---of 7
snd_pcm_oss_ioctl---of 63
snd_pcm_oss_ioctl_compat---of 1
snd_pcm_oss_make_ready---of 10
snd_pcm_oss_mmap---of 21
snd_pcm_oss_nonblock---of 1
snd_pcm_oss_open---of 59
snd_pcm_oss_poll22%of 23
snd_pcm_oss_post---of 4
snd_pcm_oss_proc_read---of 6
snd_pcm_oss_proc_write---of 46
snd_pcm_oss_read---of 26
snd_pcm_oss_read2---of 11
snd_pcm_oss_read3---of 17
snd_pcm_oss_readv3---of 12
snd_pcm_oss_register_minor---of 23
snd_pcm_oss_release---of 11
snd_pcm_oss_release_substream---of 4
snd_pcm_oss_reset---of 5
snd_pcm_oss_set_channels---of 14
snd_pcm_oss_set_format---of 15
snd_pcm_oss_set_fragment---of 19
snd_pcm_oss_set_rate---of 13
snd_pcm_oss_set_subdivide---of 28
snd_pcm_oss_set_trigger---of 24
snd_pcm_oss_sync---of 46
snd_pcm_oss_sync1---of 18
snd_pcm_oss_unregister_minor---of 7
snd_pcm_oss_write---of 47
snd_pcm_oss_write3---of 12
snd_pcm_oss_writev3---of 12
snd_pcm_plugin_append---of 1
-----------
SUMMARY22%of 23

-----------
SUMMARY---of 0

__io_napi_add---of 32
__io_napi_adjust_timeout---of 10
__io_napi_busy_loop---of 32
io_napi_busy_loop_should_end---of 9
io_napi_free---of 9
io_napi_init100%of 1
io_napi_sqpoll_busy_poll---of 30
io_register_napi---of 8
io_unregister_napi---of 4
-----------
SUMMARY100%of 1

should_fail_usercopy100%of 1
-----------
SUMMARY100%of 1

arch_uprobe_abort_xol---of 5
arch_uprobe_analyze_insn---of 100
arch_uprobe_exception_notify43%of 7
arch_uprobe_post_xol---of 10
arch_uprobe_pre_xol---of 6
arch_uprobe_skip_sstep---of 5
arch_uprobe_xol_was_trapped---of 1
arch_uretprobe_hijack_return_addr---of 6
arch_uretprobe_is_alive---of 1
branch_emulate_op---of 38
branch_post_xol_op---of 3
default_abort_op---of 5
default_post_xol_op---of 11
default_pre_xol_op---of 5
push_emulate_op---of 3
-----------
SUMMARY43%of 7

clear_ti_thread_flag---of 1
do_error_trap34%of 12
do_int3---of 1
do_int3_user---of 7
do_trap37%of 11
get_si_code---of 1
handle_invalid_op---of 1
is_sysenter_singlestep---of 1
is_valid_bugaddr---of 3
math_error---of 12
native_read_msr---of 3
notify_debug---of 1
test_ti_thread_flag---of 1
wrmsrl---of 3
-----------
SUMMARY35%of 23

add_inode_ref---of 75
backref_in_log---of 6
btrfs_del_dir_entries_in_log---of 13
btrfs_del_inode_ref_in_log---of 9
btrfs_end_log_trans---of 4
btrfs_free_log---of 3
btrfs_free_log_root_tree67%of 3
btrfs_init_log_ctx---of 1
btrfs_init_log_ctx_scratch_eb---of 4
btrfs_log_all_parents---of 24
btrfs_log_all_xattrs---of 17
btrfs_log_changed_extents---of 74
btrfs_log_dentry_safe---of 1
btrfs_log_holes---of 17
btrfs_log_inode---of 184
btrfs_log_inode_parent---of 51
btrfs_log_new_name---of 32
btrfs_log_prealloc_extents---of 27
btrfs_pin_log_trans---of 1
btrfs_record_snapshot_destroy---of 1
btrfs_record_unlink_dir---of 5
btrfs_recover_log_trees---of 49
btrfs_release_log_ctx_extents---of 7
btrfs_remove_all_log_ctxs---of 8
btrfs_sync_log---of 75
check_item_in_log---of 11
clean_log_buffer---of 5
copy_items---of 39
delete_conflicting_dir_entry---of 8
drop_inode_items---of 11
drop_one_dir_item---of 6
extent_cmp---of 1
fill_inode_item---of 3
find_dir_range---of 16
fixup_inode_link_count---of 32
fixup_inode_link_counts---of 11
free_log_tree---of 7
inode_in_dir---of 7
inode_logged---of 11
insert_delayed_items_batch---of 5
insert_dir_log_key---of 6
insert_one_name---of 4
join_running_log_trans---of 9
link_to_fixup_dir---of 5
log_all_new_ancestors---of 53
log_csums---of 5
log_delayed_deletion_items---of 29
log_delayed_insertion_items---of 16
log_dir_items---of 54
log_directory_changes---of 16
log_inode_item---of 9
log_new_dir_dentries---of 41
overwrite_item---of 30
process_one_buffer---of 9
read_one_inode---of 1
replay_dir_deletes---of 21
replay_one_buffer---of 58
replay_one_dir_item---of 6
replay_one_extent---of 40
replay_one_name---of 30
wait_log_commit---of 9
walk_down_log_tree---of 39
walk_log_tree---of 19
walk_up_log_tree---of 24
-----------
SUMMARY67%of 3

crc_itu_t58%of 7
-----------
SUMMARY58%of 7

char2uni100%of 1
uni2char---of 5
-----------
SUMMARY100%of 1

smack_from_secid---of 17
smack_log10%of 20
smack_log_callback---of 3
smack_populate_secattr---of 5
smack_privileged67%of 3
smack_privileged_cred14%of 23
smk_access12%of 25
smk_access_entry29%of 7
smk_curacc100%of 1
smk_find_entry---of 5
smk_import_entry---of 24
smk_insert_entry---of 5
smk_netlbl_mls---of 31
smk_parse_smack---of 13
smk_tskacc22%of 14
-----------
SUMMARY18%of 93

ulist_add---of 1
ulist_add_merge---of 21
ulist_alloc---of 6
ulist_del---of 12
ulist_free---of 5
ulist_init100%of 1
ulist_next---of 5
ulist_reinit---of 4
ulist_release---of 4
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__get_acl12%of 18
__posix_acl_chmod---of 22
__posix_acl_create---of 11
do_get_acl---of 14
do_set_acl---of 7
forget_all_cached_acls---of 9
forget_cached_acl---of 8
get_cached_acl22%of 32
get_cached_acl_rcu---of 11
get_inode_acl100%of 1
posix_acl_alloc---of 3
posix_acl_chmod20%of 10
posix_acl_clone---of 4
posix_acl_create20%of 15
posix_acl_create_masq---of 13
posix_acl_equiv_mode---of 14
posix_acl_from_mode---of 6
posix_acl_from_xattr---of 21
posix_acl_init---of 1
posix_acl_listxattr---of 8
posix_acl_permission---of 21
posix_acl_release---of 5
posix_acl_to_xattr---of 9
posix_acl_update_mode---of 15
posix_acl_valid---of 22
posix_acl_xattr_list---of 1
set_cached_acl42%of 12
set_posix_acl---of 10
simple_acl_create30%of 10
simple_set_acl---of 6
vfs_get_acl---of 8
vfs_remove_acl---of 35
vfs_set_acl---of 46
-----------
SUMMARY24%of 98

__fat_write_inode---of 19
_fat_bmap---of 1
delayed_free---of 3
fat_add_cluster---of 4
fat_alloc_inode67%of 3
fat_attach38%of 8
fat_block_truncate_page100%of 1
fat_build_inode34%of 15
fat_detach---of 9
fat_direct_IO---of 8
fat_evict_inode---of 5
fat_fill_inode21%of 39
fat_fill_super---of 164
fat_flush_inodes20%of 10
fat_free_inode---of 1
fat_get_block---of 27
fat_get_block_bmap---of 10
fat_iget---of 8
fat_put_super---of 1
fat_read_folio---of 1
fat_read_root---of 6
fat_readahead---of 1
fat_remount---of 4
fat_set_state---of 9
fat_show_options---of 54
fat_statfs34%of 6
fat_sync_inode---of 1
fat_write_begin---of 4
fat_write_end---of 7
fat_write_inode---of 3
fat_writepages---of 1
init_once100%of 1
-----------
SUMMARY29%of 83

__bpf_trace_alloc_vmap_area---of 1
__bpf_trace_free_vmap_area_noflush---of 1
__bpf_trace_purge_vmap_area_lazy---of 1
__get_vm_area_caller---of 1
__get_vm_area_node---of 9
__probestub_alloc_vmap_area---of 1
__probestub_free_vmap_area_noflush---of 1
__probestub_purge_vmap_area_lazy---of 1
__purge_vmap_area_lazy---of 37
__traceiter_alloc_vmap_area---of 4
__traceiter_free_vmap_area_noflush---of 4
__traceiter_purge_vmap_area_lazy---of 4
__vmalloc_node_noprof---of 1
__vmalloc_node_range_noprof---of 64
__vmalloc_noprof---of 1
__vmap_pages_range_noflush---of 43
__vunmap_range_noflush---of 23
_vm_unmap_aliases---of 30
aligned_vread_iter---of 8
alloc_vmap_area---of 127
check_sparse_vm_area---of 7
decay_va_pool_node---of 42
delayed_vfree_work---of 4
drain_vmap_area_work---of 1
find_unlink_vmap_area---of 12
find_vm_area---of 11
find_vmap_area---of 11
find_vmap_area_exceed_addr_lock---of 23
free_unmap_vmap_area---of 1
free_vm_area---of 3
free_vmap_area---of 66
free_vmap_area_noflush---of 24
free_vmap_area_rb_augment_cb_rotate---of 5
free_vmap_block---of 9
get_vm_area---of 1
get_vm_area_caller---of 1
insert_vmap_area---of 14
insert_vmap_area_augment---of 25
ioremap_page_range---of 19
is_vmalloc_addr100%of 1
is_vmalloc_or_module_addr---of 1
mod_memcg_page_state---of 36
pcpu_free_vm_areas---of 5
pcpu_get_vm_areas---of 260
perf_trace_alloc_vmap_area---of 8
perf_trace_free_vmap_area_noflush---of 8
perf_trace_purge_vmap_area_lazy---of 8
pfn_valid---of 29
purge_fragmented_block---of 8
purge_vmap_node---of 19
reclaim_and_purge_vmap_areas---of 25
reclaim_list_global---of 67
register_vmap_purge_notifier---of 1
remap_vmalloc_range---of 1
remap_vmalloc_range_partial---of 19
remove_vm_area---of 5
trace_event_raw_event_alloc_vmap_area---of 7
trace_event_raw_event_free_vmap_area_noflush---of 7
trace_event_raw_event_purge_vmap_area_lazy---of 7
trace_raw_output_alloc_vmap_area---of 3
trace_raw_output_free_vmap_area_noflush---of 3
trace_raw_output_purge_vmap_area_lazy---of 3
unregister_vmap_purge_notifier---of 1
vfree---of 11
vfree_atomic---of 5
vm_area_map_pages---of 3
vm_area_unmap_pages---of 3
vm_flags_set---of 6
vm_map_ram---of 44
vm_reset_perms---of 18
vm_unmap_aliases---of 1
vm_unmap_ram---of 16
vmalloc_32_noprof---of 1
vmalloc_32_user_noprof---of 1
vmalloc_dump_obj---of 10
vmalloc_huge_noprof---of 1
vmalloc_info_show---of 50
vmalloc_node_noprof---of 1
vmalloc_noprof---of 1
vmalloc_nr_pages---of 1
vmalloc_to_page---of 12
vmalloc_to_pfn---of 1
vmalloc_user_noprof---of 1
vmap---of 9
vmap_node_shrink_count---of 6
vmap_node_shrink_scan---of 4
vmap_page_range---of 1
vmap_pages_range_noflush---of 1
vmap_pfn---of 5
vmap_pfn_apply---of 8
vmap_range_noflush---of 44
vread_iter---of 50
vunmap---of 5
vunmap_range---of 1
vunmap_range_noflush---of 1
vzalloc_node_noprof---of 1
vzalloc_noprof---of 1
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__ia32_sys_splice---of 1
__ia32_sys_tee---of 9
__ia32_sys_vmsplice---of 1
__se_sys_splice30%of 27
__se_sys_vmsplice8%of 53
__splice_from_pipe30%of 20
__x64_sys_splice100%of 1
__x64_sys_tee34%of 9
__x64_sys_vmsplice100%of 1
add_to_pipe---of 4
copy_splice_read---of 16
direct_file_splice_eof---of 3
direct_splice_actor40%of 5
do_splice9%of 60
do_splice_direct40%of 5
do_tee15%of 35
file_end_write37%of 11
file_start_write37%of 11
folio_lock---of 9
ipipe_prep25%of 8
iter_file_splice_write37%of 33
opipe_prep23%of 9
page_cache_pipe_buf_confirm18%of 23
page_cache_pipe_buf_release100%of 1
page_cache_pipe_buf_try_steal---of 19
pipe_clear_nowait40%of 5
pipe_to_user100%of 1
put_page29%of 14
splice_direct_to_actor26%of 31
splice_file_range---of 9
splice_file_range_actor---of 5
splice_file_to_pipe27%of 15
splice_from_pipe---of 1
splice_from_pipe_next38%of 16
splice_grow_spd---of 4
splice_shrink_spd---of 3
splice_to_pipe---of 9
splice_to_socket---of 43
user_page_pipe_buf_try_steal---of 3
vfs_splice_read---of 10
wait_for_space29%of 7
warn_unsupported---of 4
-----------
SUMMARY24%of 401

__bpf_trace_io_uring_complete---of 1
__bpf_trace_io_uring_cqe_overflow---of 1
__bpf_trace_io_uring_cqring_wait---of 1
__bpf_trace_io_uring_create---of 1
__bpf_trace_io_uring_defer---of 1
__bpf_trace_io_uring_fail_link---of 1
__bpf_trace_io_uring_file_get---of 1
__bpf_trace_io_uring_link---of 1
__bpf_trace_io_uring_local_work_run---of 1
__bpf_trace_io_uring_poll_arm---of 1
__bpf_trace_io_uring_queue_async_work---of 1
__bpf_trace_io_uring_register---of 1
__bpf_trace_io_uring_req_failed---of 1
__bpf_trace_io_uring_short_write---of 1
__bpf_trace_io_uring_submit_req---of 1
__bpf_trace_io_uring_task_add---of 1
__bpf_trace_io_uring_task_work_run---of 1
__ia32_sys_io_uring_enter---of 1
__ia32_sys_io_uring_setup---of 1
__io_alloc_req_refill---of 8
__io_arm_ltimeout---of 6
__io_commit_cqring_flush---of 11
__io_cqring_overflow_flush---of 21
__io_prep_linked_timeout---of 6
__io_req_find_next_prep---of 1
__io_req_task_work_add---of 22
__io_run_local_work---of 35
__io_submit_flush_completions---of 49
__io_uring_cancel---of 1
__probestub_io_uring_complete---of 1
__probestub_io_uring_cqe_overflow---of 1
__probestub_io_uring_cqring_wait---of 1
__probestub_io_uring_create---of 1
__probestub_io_uring_defer---of 1
__probestub_io_uring_fail_link---of 1
__probestub_io_uring_file_get---of 1
__probestub_io_uring_link---of 1
__probestub_io_uring_local_work_run---of 1
__probestub_io_uring_poll_arm---of 1
__probestub_io_uring_queue_async_work---of 1
__probestub_io_uring_register---of 1
__probestub_io_uring_req_failed---of 1
__probestub_io_uring_short_write---of 1
__probestub_io_uring_submit_req---of 1
__probestub_io_uring_task_add---of 1
__probestub_io_uring_task_work_run---of 1
__se_sys_io_uring_enter5%of 135
__se_sys_io_uring_setup24%of 13
__traceiter_io_uring_complete---of 4
__traceiter_io_uring_cqe_overflow---of 4
__traceiter_io_uring_cqring_wait---of 4
__traceiter_io_uring_create---of 4
__traceiter_io_uring_defer---of 4
__traceiter_io_uring_fail_link---of 4
__traceiter_io_uring_file_get---of 4
__traceiter_io_uring_link---of 4
__traceiter_io_uring_local_work_run---of 4
__traceiter_io_uring_poll_arm---of 4
__traceiter_io_uring_queue_async_work---of 4
__traceiter_io_uring_register---of 4
__traceiter_io_uring_req_failed---of 4
__traceiter_io_uring_short_write---of 4
__traceiter_io_uring_submit_req---of 4
__traceiter_io_uring_task_add---of 4
__traceiter_io_uring_task_work_run---of 4
__x64_sys_io_uring_enter100%of 1
__x64_sys_io_uring_setup100%of 1
ctx_flush_and_put---of 7
get_task_struct---of 4
get_uid---of 4
io_activate_pollwq---of 7
io_activate_pollwq_cb---of 1
io_alloc_async_data---of 5
io_alloc_cache_put---of 4
io_alloc_hash_table67%of 3
io_allocate_scq_urings46%of 11
io_cancel_ctx_cb---of 1
io_cancel_defer_files---of 14
io_cancel_task_cb---of 13
io_check_restriction---of 4
io_clean_op---of 25
io_cqe_cache_refill---of 6
io_cqring_event_overflow---of 25
io_drain_req---of 38
io_eventfd_ops---of 5
io_eventfd_signal---of 29
io_fallback_req_func---of 7
io_fallback_tw---of 11
io_file_get_fixed---of 16
io_file_get_flags---of 4
io_file_get_normal---of 19
io_free_req---of 1
io_get_cqe_overflow---of 22
io_handle_tw_list---of 22
io_iopoll_try_reap_events---of 8
io_is_uring_fops100%of 1
io_issue_sqe---of 58
io_match_task_safe---of 13
io_move_task_work_from_local---of 4
io_poll_issue---of 4
io_post_aux_cqe---of 8
io_prep_async_work---of 21
io_put_task_remote---of 6
io_queue_async---of 18
io_queue_deferred---of 9
io_queue_iowq---of 29
io_queue_next---of 5
io_queue_sqe_fallback---of 4
io_req_caches_free---of 5
io_req_cqe_overflow---of 1
io_req_defer_failed---of 16
io_req_normal_work_add---of 8
io_req_post_cqe---of 22
io_req_task_cancel---of 4
io_req_task_complete---of 6
io_req_task_queue---of 1
io_req_task_queue_fail---of 1
io_req_task_submit---of 8
io_ring_ctx_alloc15%of 41
io_ring_ctx_free---of 53
io_ring_ctx_ref_free---of 1
io_ring_ctx_wait_and_kill---of 4
io_ring_exit_work---of 31
io_rings_free---of 3
io_run_local_work---of 1
io_run_task_work---of 15
io_run_task_work_sig---of 6
io_submit_fail_init---of 21
io_submit_sqes3%of 84
io_task_refs_refill---of 4
io_tctx_exit_cb---of 4
io_uring_cancel_generic---of 30
io_uring_create31%of 56
io_uring_drop_tctx_refs---of 5
io_uring_install_fd67%of 3
io_uring_poll---of 10
io_uring_release---of 1
io_uring_try_cancel_iowq---of 7
io_uring_try_cancel_requests---of 22
io_wake_function---of 6
io_wq_free_work---of 9
io_wq_submit_work---of 43
percpu_ref_get_many---of 13
percpu_ref_put_many---of 14
perf_trace_io_uring_complete---of 8
perf_trace_io_uring_cqe_overflow---of 8
perf_trace_io_uring_cqring_wait---of 8
perf_trace_io_uring_create---of 8
perf_trace_io_uring_defer---of 8
perf_trace_io_uring_fail_link---of 8
perf_trace_io_uring_file_get---of 8
perf_trace_io_uring_link---of 8
perf_trace_io_uring_local_work_run---of 8
perf_trace_io_uring_poll_arm---of 8
perf_trace_io_uring_queue_async_work---of 8
perf_trace_io_uring_register---of 8
perf_trace_io_uring_req_failed---of 8
perf_trace_io_uring_short_write---of 8
perf_trace_io_uring_submit_req---of 8
perf_trace_io_uring_task_add---of 8
perf_trace_io_uring_task_work_run---of 8
req_ref_put_and_test---of 4
tctx_task_work---of 3
tctx_task_work_run---of 20
trace_event_raw_event_io_uring_complete---of 7
trace_event_raw_event_io_uring_cqe_overflow---of 7
trace_event_raw_event_io_uring_cqring_wait---of 7
trace_event_raw_event_io_uring_create---of 7
trace_event_raw_event_io_uring_defer---of 7
trace_event_raw_event_io_uring_fail_link---of 7
trace_event_raw_event_io_uring_file_get---of 7
trace_event_raw_event_io_uring_link---of 7
trace_event_raw_event_io_uring_local_work_run---of 7
trace_event_raw_event_io_uring_poll_arm---of 7
trace_event_raw_event_io_uring_queue_async_work---of 7
trace_event_raw_event_io_uring_register---of 7
trace_event_raw_event_io_uring_req_failed---of 7
trace_event_raw_event_io_uring_short_write---of 7
trace_event_raw_event_io_uring_submit_req---of 7
trace_event_raw_event_io_uring_task_add---of 7
trace_event_raw_event_io_uring_task_work_run---of 7
trace_io_uring_complete---of 15
trace_io_uring_create27%of 15
trace_io_uring_link---of 15
trace_raw_output_io_uring_complete---of 3
trace_raw_output_io_uring_cqe_overflow---of 3
trace_raw_output_io_uring_cqring_wait---of 3
trace_raw_output_io_uring_create---of 3
trace_raw_output_io_uring_defer---of 3
trace_raw_output_io_uring_fail_link---of 3
trace_raw_output_io_uring_file_get---of 3
trace_raw_output_io_uring_link---of 3
trace_raw_output_io_uring_local_work_run---of 3
trace_raw_output_io_uring_poll_arm---of 3
trace_raw_output_io_uring_queue_async_work---of 3
trace_raw_output_io_uring_register---of 3
trace_raw_output_io_uring_req_failed---of 3
trace_raw_output_io_uring_short_write---of 3
trace_raw_output_io_uring_submit_req---of 3
trace_raw_output_io_uring_task_add---of 3
trace_raw_output_io_uring_task_work_run---of 3
-----------
SUMMARY14%of 364

__dispose_buffer---of 5
__jbd2_journal_file_buffer45%of 29
__jbd2_journal_refile_buffer---of 16
__jbd2_journal_temp_unlink_buffer40%of 23
do_get_write_access19%of 49
folio_size---of 10
jbd2__journal_restart---of 21
jbd2__journal_start46%of 11
jbd2_buffer_abort_trigger---of 4
jbd2_buffer_frozen_trigger---of 4
jbd2_journal_begin_ordered_truncate40%of 5
jbd2_journal_destroy_transaction_cache---of 1
jbd2_journal_dirty_metadata22%of 41
jbd2_journal_extend---of 19
jbd2_journal_file_buffer---of 1
jbd2_journal_file_inode40%of 15
jbd2_journal_forget15%of 21
jbd2_journal_free_reserved---of 5
jbd2_journal_free_transaction---of 3
jbd2_journal_get_create_access24%of 13
jbd2_journal_get_undo_access---of 13
jbd2_journal_get_write_access43%of 7
jbd2_journal_inode_ranged_wait---of 1
jbd2_journal_inode_ranged_write100%of 1
jbd2_journal_invalidate_folio---of 39
jbd2_journal_lock_updates---of 6
jbd2_journal_refile_buffer---of 3
jbd2_journal_restart---of 1
jbd2_journal_set_triggers---of 3
jbd2_journal_start---of 1
jbd2_journal_start_reserved---of 5
jbd2_journal_stop29%of 35
jbd2_journal_try_to_free_buffers24%of 17
jbd2_journal_unfile_buffer---of 4
jbd2_journal_unlock_updates---of 3
jbd2_journal_wait_updates---of 5
jbd2_write_access_granted35%of 20
start_this_handle17%of 61
stop_this_handle47%of 13
trace_jbd2_handle_start27%of 15
wait_transaction_locked---of 3
-----------
SUMMARY28%of 376

mi_enum_attr28%of 37
mi_find_attr47%of 13
mi_format_new---of 13
mi_get---of 9
mi_init100%of 1
mi_insert_attr---of 14
mi_pack_runs---of 3
mi_put---of 6
mi_read19%of 22
mi_remove_attr---of 8
mi_resize_attr---of 9
mi_write40%of 5
-----------
SUMMARY30%of 78

-----------
SUMMARY---of 0

cpu_online---of 3
io_put_sq_data---of 5
io_sq_offload_create7%of 45
io_sq_thread---of 92
io_sq_thread_finish---of 7
io_sq_thread_park---of 5
io_sq_thread_stop---of 7
io_sq_thread_unpark---of 5
io_sqpoll_wait_sq---of 6
io_sqpoll_wq_cpu_affinity---of 4
-----------
SUMMARY7%of 45

chacha_block_generic100%of 1
chacha_permute58%of 7
hchacha_block_generic---of 1
-----------
SUMMARY63%of 8

-----------
SUMMARY---of 0

memb_group_features_show---of 1
null_add_dev---of 64
null_alloc_dev---of 4
null_cmd_timer_expired---of 1
null_complete_rq100%of 1
null_del_dev---of 10
null_destroy_dev---of 3
null_free_device_storage---of 13
null_free_sector---of 11
null_handle_discard---of 6
null_init_hctx---of 3
null_insert_page---of 36
null_make_cache_space---of 40
null_map_queues---of 14
null_poll---of 21
null_process_cmd6%of 59
null_queue_rq24%of 34
null_queue_rqs50%of 8
null_set_irqmode---of 4
null_set_queue_mode---of 4
null_timeout_rq---of 8
nullb_bwtimer_fn---of 3
nullb_device_badblocks_show---of 1
nullb_device_badblocks_store---of 11
nullb_device_blocking_show---of 1
nullb_device_blocking_store---of 6
nullb_device_blocksize_show---of 1
nullb_device_blocksize_store---of 6
nullb_device_cache_size_show---of 1
nullb_device_cache_size_store---of 6
nullb_device_completion_nsec_show---of 1
nullb_device_completion_nsec_store---of 6
nullb_device_discard_show---of 1
nullb_device_discard_store---of 6
nullb_device_fua_show---of 1
nullb_device_fua_store---of 6
nullb_device_home_node_show---of 1
nullb_device_home_node_store---of 6
nullb_device_hw_queue_depth_show---of 1
nullb_device_hw_queue_depth_store---of 6
nullb_device_index_show---of 1
nullb_device_index_store---of 6
nullb_device_irqmode_show---of 1
nullb_device_irqmode_store---of 6
nullb_device_max_sectors_show---of 1
nullb_device_max_sectors_store---of 6
nullb_device_mbps_show---of 1
nullb_device_mbps_store---of 6
nullb_device_memory_backed_show---of 1
nullb_device_memory_backed_store---of 6
nullb_device_no_sched_show---of 1
nullb_device_no_sched_store---of 6
nullb_device_poll_queues_show---of 1
nullb_device_poll_queues_store---of 10
nullb_device_power_show---of 1
nullb_device_power_store---of 10
nullb_device_queue_mode_show---of 1
nullb_device_queue_mode_store---of 6
nullb_device_release---of 3
nullb_device_shared_tag_bitmap_show---of 1
nullb_device_shared_tag_bitmap_store---of 6
nullb_device_shared_tags_show---of 1
nullb_device_shared_tags_store---of 6
nullb_device_size_show---of 1
nullb_device_size_store---of 6
nullb_device_submit_queues_show---of 1
nullb_device_submit_queues_store---of 10
nullb_device_use_per_node_hctx_show---of 1
nullb_device_use_per_node_hctx_store---of 6
nullb_device_virt_boundary_show---of 1
nullb_device_virt_boundary_store---of 6
nullb_device_zone_append_max_sectors_show---of 1
nullb_device_zone_append_max_sectors_store---of 6
nullb_device_zone_capacity_show---of 1
nullb_device_zone_capacity_store---of 6
nullb_device_zone_max_active_show---of 1
nullb_device_zone_max_active_store---of 6
nullb_device_zone_max_open_show---of 1
nullb_device_zone_max_open_store---of 6
nullb_device_zone_nr_conv_show---of 1
nullb_device_zone_nr_conv_store---of 6
nullb_device_zone_offline_store---of 1
nullb_device_zone_readonly_store---of 1
nullb_device_zone_size_show---of 1
nullb_device_zone_size_store---of 6
nullb_device_zoned_show---of 1
nullb_device_zoned_store---of 6
nullb_group_drop_item---of 3
nullb_group_make_group---of 12
nullb_setup_bwtimer---of 1
-----------
SUMMARY16%of 102

__ia32_sys_memfd_secret---of 1
__se_sys_memfd_secret---of 11
__x64_sys_memfd_secret---of 1
folio_put---of 4
secretmem_active---of 1
secretmem_fault---of 14
secretmem_free_folio---of 23
secretmem_init_fs_context---of 1
secretmem_migrate_folio---of 1
secretmem_mmap---of 8
secretmem_release---of 1
secretmem_setattr---of 4
vma_is_secretmem100%of 1
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__anon_vma_prepare40%of 15
__bpf_trace_migration_pte---of 1
__bpf_trace_mm_migrate_pages---of 1
__bpf_trace_mm_migrate_pages_start---of 1
__bpf_trace_tlb_flush---of 1
__folio_rmap_sanity_checks24%of 34
__folio_set_anon---of 4
__probestub_mm_migrate_pages---of 1
__probestub_mm_migrate_pages_start---of 1
__probestub_remove_migration_pte---of 1
__probestub_set_migration_pte---of 1
__probestub_tlb_flush---of 1
__put_anon_vma34%of 9
__traceiter_mm_migrate_pages---of 4
__traceiter_mm_migrate_pages_start---of 4
__traceiter_remove_migration_pte---of 4
__traceiter_set_migration_pte---of 4
__traceiter_tlb_flush---of 4
anon_vma_clone18%of 23
anon_vma_ctor---of 1
anon_vma_fork---of 9
flush_tlb_batched_pending67%of 3
folio_add_anon_rmap_pmd---of 115
folio_add_anon_rmap_ptes---of 118
folio_add_file_rmap_pmd---of 34
folio_add_file_rmap_ptes31%of 26
folio_add_new_anon_rmap10%of 118
folio_entire_mapcount---of 9
folio_get_anon_vma---of 24
folio_large_mapcount---of 9
folio_lock_anon_vma_read---of 31
folio_mapcount34%of 9
folio_maybe_dma_pinned---of 9
folio_mkclean31%of 13
folio_move_anon_rmap---of 10
folio_not_mapped---of 1
folio_referenced---of 20
folio_referenced_one---of 77
folio_remove_rmap_pmd---of 28
folio_remove_rmap_ptes25%of 20
folio_size---of 10
folio_test_large---of 7
folio_test_pmd_mappable---of 9
folio_try_share_anon_rmap_pte---of 89
hugetlb_add_anon_rmap---of 60
hugetlb_add_new_anon_rmap---of 36
hugetlb_remove_rmap---of 3
invalid_folio_referenced_vma---of 28
invalid_migration_vma---of 1
invalid_mkclean_vma100%of 1
make_device_exclusive_range---of 35
mm_find_pmd75%of 4
page_address_in_vma---of 17
page_make_device_exclusive_one---of 45
page_mkclean_one34%of 9
page_vma_mapped_walk_done---of 10
page_vma_mkclean_one30%of 30
perf_trace_migration_pte---of 8
perf_trace_mm_migrate_pages---of 8
perf_trace_mm_migrate_pages_start---of 8
perf_trace_tlb_flush---of 8
pfn_mkclean_range---of 9
put_anon_vma---of 3
rcu_read_unlock---of 6
rmap_walk---of 4
rmap_walk_anon---of 43
rmap_walk_file26%of 47
rmap_walk_locked---of 4
set_tlb_ubc_flush_pending---of 13
should_defer_flush---of 10
trace_event_raw_event_migration_pte---of 7
trace_event_raw_event_mm_migrate_pages---of 7
trace_event_raw_event_mm_migrate_pages_start---of 7
trace_event_raw_event_tlb_flush---of 7
trace_raw_output_migration_pte---of 3
trace_raw_output_mm_migrate_pages---of 3
trace_raw_output_mm_migrate_pages_start---of 3
trace_raw_output_tlb_flush---of 3
trace_set_migration_pte---of 15
try_to_migrate---of 12
try_to_migrate_one---of 239
try_to_unmap---of 7
try_to_unmap_flush---of 3
try_to_unmap_flush_dirty---of 4
try_to_unmap_one---of 184
unlink_anon_vmas60%of 25
-----------
SUMMARY26%of 386

-----------
SUMMARY---of 0

hfs_bnode_split---of 16
hfs_brec_insert---of 21
hfs_brec_keylen37%of 11
hfs_brec_lenoff100%of 1
hfs_brec_remove---of 11
hfs_brec_update_parent---of 25
hfs_btree_inc_height---of 13
-----------
SUMMARY42%of 12

virtio_pci_freeze---of 3
virtio_pci_probe---of 15
virtio_pci_release_dev---of 1
virtio_pci_remove---of 5
virtio_pci_restore---of 3
virtio_pci_resume---of 7
virtio_pci_sriov_configure---of 6
virtio_pci_suspend---of 7
virtio_pci_vf_get_pf_dev---of 1
vp_bus_name---of 3
vp_config_changed---of 1
vp_del_vqs---of 20
vp_find_vqs---of 16
vp_find_vqs_msix---of 44
vp_get_vq_affinity---of 4
vp_interrupt---of 7
vp_notify100%of 1
vp_set_vq_affinity---of 5
vp_setup_vq---of 6
vp_synchronize_vectors---of 6
vp_vring_interrupt---of 4
-----------
SUMMARY100%of 1

__ext4fs_dirhash28%of 33
ext4fs_dirhash34%of 9
str2hashbuf_signed53%of 19
str2hashbuf_unsigned69%of 19
-----------
SUMMARY44%of 80

-----------
SUMMARY---of 0

FUA_show---of 1
allow_restart_show---of 1
allow_restart_store---of 7
app_tag_own_show---of 1
cache_type_show---of 1
cache_type_store---of 17
manage_runtime_start_stop_show---of 1
manage_runtime_start_stop_store---of 4
manage_shutdown_show---of 1
manage_shutdown_store---of 4
manage_start_stop_show---of 1
manage_system_start_stop_show---of 1
manage_system_start_stop_store---of 4
max_medium_access_timeouts_show---of 1
max_medium_access_timeouts_store---of 3
max_retries_show---of 1
max_retries_store---of 4
max_write_same_blocks_show---of 1
max_write_same_blocks_store---of 9
protection_mode_show---of 6
protection_type_show---of 1
protection_type_store---of 5
provisioning_mode_show---of 3
provisioning_mode_store---of 7
read_capacity_10---of 13
read_capacity_16---of 36
read_capacity_error---of 9
scsi_disk_free_disk---of 1
scsi_disk_release---of 1
sd_check_events---of 27
sd_completed_bytes---of 9
sd_config_discard---of 8
sd_config_write_same---of 14
sd_default_probe---of 1
sd_done---of 41
sd_eh_action---of 30
sd_eh_reset---of 1
sd_get_unique_id---of 27
sd_getgeo---of 5
sd_init_command20%of 67
sd_ioctl---of 8
sd_open---of 23
sd_pr_clear---of 1
sd_pr_in_command---of 15
sd_pr_out_command---of 15
sd_pr_preempt---of 1
sd_pr_read_keys---of 9
sd_pr_read_reservation---of 5
sd_pr_register---of 3
sd_pr_release---of 1
sd_pr_reserve---of 3
sd_print_result---of 5
sd_print_sense_hdr---of 1
sd_probe---of 43
sd_release---of 7
sd_remove---of 3
sd_rescan---of 1
sd_resume---of 1
sd_resume_runtime---of 9
sd_resume_system---of 11
sd_revalidate_disk---of 413
sd_setup_protect_cmnd---of 19
sd_setup_rw10_cmnd67%of 3
sd_setup_rw16_cmnd---of 3
sd_setup_rw32_cmnd---of 3
sd_setup_rw6_cmnd---of 4
sd_setup_write_same10_cmnd---of 9
sd_setup_write_same16_cmnd---of 9
sd_shutdown---of 13
sd_start_stop_device---of 13
sd_suspend_common---of 16
sd_suspend_runtime---of 1
sd_suspend_system---of 4
sd_sync_cache---of 22
sd_uninit_command---of 3
sd_unlock_native_capacity---of 3
thin_provisioning_show---of 1
zeroing_mode_show---of 3
zeroing_mode_store---of 4
zoned_cap_show---of 5
-----------
SUMMARY22%of 70

fat12_ent_blocknr50%of 6
fat12_ent_bread29%of 14
fat12_ent_get67%of 3
fat12_ent_next---of 13
fat12_ent_put60%of 5
fat12_ent_set_ptr40%of 5
fat16_ent_get---of 3
fat16_ent_next---of 1
fat16_ent_put---of 1
fat16_ent_set_ptr---of 3
fat32_ent_get67%of 3
fat32_ent_next---of 1
fat32_ent_put---of 3
fat32_ent_set_ptr67%of 3
fat_alloc_clusters---of 49
fat_collect_bhs54%of 15
fat_count_free_clusters---of 30
fat_ent_access_init---of 5
fat_ent_blocknr50%of 8
fat_ent_bread50%of 6
fat_ent_read12%of 36
fat_ent_read_block---of 12
fat_ent_reada---of 6
fat_ent_write---of 4
fat_free_clusters25%of 49
fat_mirror_bhs40%of 15
fat_trim_fs---of 64
-----------
SUMMARY33%of 168

fat_cont_expand---of 7
fat_fallocate---of 11
fat_file_fsync---of 4
fat_file_release---of 4
fat_generic_ioctl---of 46
fat_getattr60%of 5
fat_setattr27%of 38
fat_truncate_blocks19%of 44
-----------
SUMMARY25%of 87

__ia32_compat_sys_fstatfs---of 1
__ia32_compat_sys_fstatfs64---of 1
__ia32_compat_sys_statfs---of 1
__ia32_compat_sys_statfs64---of 1
__ia32_compat_sys_ustat---of 1
__ia32_sys_fstatfs---of 3
__ia32_sys_fstatfs64---of 4
__ia32_sys_statfs---of 3
__ia32_sys_statfs64---of 4
__ia32_sys_ustat---of 1
__se_compat_sys_fstatfs---of 6
__se_compat_sys_statfs---of 6
__se_compat_sys_ustat---of 7
__se_sys_ustat---of 7
__x64_compat_sys_fstatfs---of 1
__x64_compat_sys_fstatfs64---of 1
__x64_compat_sys_statfs---of 1
__x64_compat_sys_statfs64---of 1
__x64_compat_sys_ustat---of 1
__x64_sys_fstatfs67%of 3
__x64_sys_fstatfs64---of 4
__x64_sys_statfs67%of 3
__x64_sys_statfs64---of 4
__x64_sys_ustat---of 1
fd_statfs34%of 9
kcompat_sys_fstatfs64---of 5
kcompat_sys_statfs64---of 5
user_statfs23%of 9
vfs_get_fsid---of 5
vfs_statfs---of 6
-----------
SUMMARY38%of 24

__lookup_extent_mapping43%of 19
add_extent_mapping52%of 29
alloc_extent_map67%of 3
btrfs_add_extent_mapping---of 27
btrfs_drop_extent_map_range33%of 46
btrfs_free_extent_maps---of 75
btrfs_replace_extent_map_range50%of 4
clear_em_logging---of 6
extent_map_exit---of 1
extent_map_tree_init100%of 1
free_extent_map63%of 8
lookup_extent_mapping100%of 1
merge_extent_mapping---of 12
remove_extent_mapping---of 13
replace_extent_mapping43%of 19
search_extent_mapping100%of 1
split_extent_map---of 13
try_merge_map---of 31
unpin_extent_cache---of 4
-----------
SUMMARY45%of 131

fs_ftype_to_dtype67%of 3
fs_umode_to_dtype---of 1
fs_umode_to_ftype100%of 1
-----------
SUMMARY75%of 4

-----------
SUMMARY---of 0

__push_leaf_left---of 38
__push_leaf_right---of 31
abort_should_print_stack---of 5
add_root_to_dirty_list34%of 12
balance_level---of 74
balance_node_right---of 15
btrfs_alloc_path100%of 1
btrfs_bin_search65%of 17
btrfs_block_can_be_shared---of 8
btrfs_comp_cpu_keys58%of 7
btrfs_copy_root---of 22
btrfs_cow_block24%of 26
btrfs_csum_type_size---of 3
btrfs_ctree_exit---of 1
btrfs_del_items37%of 22
btrfs_del_leaf---of 5
btrfs_del_ptr---of 23
btrfs_duplicate_item---of 3
btrfs_extend_item---of 11
btrfs_find_item---of 10
btrfs_find_next_key---of 25
btrfs_force_cow_block26%of 54
btrfs_free_path100%of 3
btrfs_get_next_valid_item50%of 4
btrfs_get_num_csums---of 1
btrfs_insert_empty_items40%of 5
btrfs_insert_item---of 6
btrfs_leaf_free_space---of 3
btrfs_next_old_item---of 3
btrfs_next_old_leaf11%of 57
btrfs_prev_leaf---of 17
btrfs_previous_extent_item---of 11
btrfs_previous_item---of 11
btrfs_read_node_slot---of 7
btrfs_release_path88%of 8
btrfs_root_node17%of 24
btrfs_search_backwards---of 14
btrfs_search_forward---of 33
btrfs_search_old_slot---of 31
btrfs_search_slot37%of 141
btrfs_search_slot_for_read---of 14
btrfs_set_item_key_safe---of 17
btrfs_setup_item_for_insert---of 1
btrfs_split_item---of 3
btrfs_super_csum_driver---of 5
btrfs_super_csum_name---of 3
btrfs_super_csum_size---of 3
btrfs_tree_unlock_rw---of 4
btrfs_truncate_item---of 15
check_sibling_keys---of 10
copy_for_split---of 14
finish_need_commit_sem_search---of 11
fixup_low_keys---of 12
get_eb_offset_in_folio30%of 10
insert_new_root---of 15
insert_ptr---of 21
leaf_space_used50%of 4
push_for_double_split---of 12
push_leaf_left17%of 12
push_leaf_right16%of 13
push_node_left---of 19
push_nodes_for_insert---of 38
read_block_for_search18%of 23
reada_for_balance---of 10
reada_for_search---of 29
root_sub_used_bytes---of 1
setup_items_for_insert45%of 18
setup_leaf_for_split---of 19
split_item---of 9
split_leaf---of 51
split_node---of 29
unlock_up49%of 29
update_ref_for_cow14%of 38
-----------
SUMMARY31%of 528

-----------
SUMMARY---of 0

__radix_tree_delete30%of 10
__radix_tree_lookup---of 18
__radix_tree_preload15%of 34
__radix_tree_replace47%of 13
delete_node13%of 31
idr_destroy---of 12
idr_get_free33%of 37
idr_preload---of 9
node_tag_clear38%of 16
radix_tree_cpu_dead---of 6
radix_tree_delete100%of 1
radix_tree_delete_item32%of 22
radix_tree_extend14%of 22
radix_tree_gang_lookup---of 12
radix_tree_gang_lookup_tag15%of 14
radix_tree_gang_lookup_tag_slot---of 11
radix_tree_insert45%of 27
radix_tree_iter_delete---of 3
radix_tree_iter_replace100%of 1
radix_tree_iter_resume---of 1
radix_tree_iter_tag_clear100%of 1
radix_tree_lookup54%of 13
radix_tree_lookup_slot---of 13
radix_tree_maybe_preload---of 9
radix_tree_next_chunk6%of 59
radix_tree_node_alloc28%of 11
radix_tree_node_ctor100%of 1
radix_tree_node_rcu_free---of 1
radix_tree_preload67%of 3
radix_tree_replace_slot---of 7
radix_tree_tag_clear---of 13
radix_tree_tag_get---of 16
radix_tree_tag_set39%of 21
radix_tree_tagged---of 3
-----------
SUMMARY26%of 337

-----------
SUMMARY---of 0

dont_mount---of 1
hfs_bmap---of 1
hfs_delete_inode---of 8
hfs_direct_IO---of 4
hfs_evict_inode---of 4
hfs_file_fsync---of 3
hfs_file_lookup---of 8
hfs_file_open---of 3
hfs_file_release---of 5
hfs_iget58%of 7
hfs_inode_read_fork---of 7
hfs_inode_setattr39%of 18
hfs_inode_write_fork---of 5
hfs_new_inode---of 7
hfs_read_folio---of 1
hfs_read_inode43%of 7
hfs_release_folio---of 20
hfs_test_inode50%of 4
hfs_write_begin---of 3
hfs_write_failed---of 3
hfs_write_inode---of 25
hfs_writepages---of 1
-----------
SUMMARY45%of 36

-----------
SUMMARY---of 0

__ia32_compat_sys_getdents---of 1
__ia32_compat_sys_old_readdir---of 6
__ia32_sys_getdents---of 1
__ia32_sys_getdents64---of 1
__ia32_sys_old_readdir---of 6
__se_compat_sys_getdents---of 11
__se_sys_getdents55%of 11
__se_sys_getdents6464%of 11
__x64_compat_sys_getdents---of 1
__x64_compat_sys_old_readdir---of 6
__x64_sys_getdents100%of 1
__x64_sys_getdents64100%of 1
__x64_sys_old_readdir---of 6
compat_filldir---of 30
compat_fillonedir---of 25
filldir35%of 29
filldir6442%of 29
fillonedir---of 24
fsnotify_access34%of 9
iterate_dir27%of 23
wrap_directory_iterator67%of 3
-----------
SUMMARY42%of 117

__put_user_ns---of 1
cmp_extents_forward---of 1
cmp_extents_reverse---of 1
cmp_map_id---of 1
create_user_ns---of 55
current_in_userns---of 4
enforced_nproc_rlimit---of 4
free_user_ns---of 10
from_kgid13%of 24
from_kgid_munged12%of 25
from_kprojid13%of 24
from_kprojid_munged---of 25
from_kuid13%of 24
from_kuid_munged12%of 25
gid_m_show---of 26
gid_m_start---of 5
in_userns---of 4
m_next---of 1
m_stop---of 1
make_kgid27%of 15
make_kprojid20%of 15
make_kuid27%of 15
map_id_down---of 15
map_id_up---of 24
map_write---of 61
new_idmap_permitted---of 55
ns_get_owner---of 7
proc_gid_map_write---of 4
proc_projid_map_write---of 4
proc_setgroups_show---of 1
proc_setgroups_write---of 11
proc_uid_map_write---of 4
projid_m_show---of 26
projid_m_start---of 5
sort_idmaps---of 4
uid_m_show---of 26
uid_m_start---of 5
unshare_userns---of 6
userns_get---of 20
userns_install---of 16
userns_may_setgroups---of 3
userns_owner---of 1
userns_put---of 5
-----------
SUMMARY16%of 167

__bio_integrity_endio---of 8
bio_integrity_add_page---of 11
bio_integrity_advance---of 13
bio_integrity_alloc---of 11
bio_integrity_clone---of 5
bio_integrity_free---of 10
bio_integrity_map_user---of 56
bio_integrity_prep8%of 25
bio_integrity_process---of 13
bio_integrity_trim---of 5
bio_integrity_unmap_free_user---of 26
bio_integrity_verify_fn---of 1
bioset_integrity_create---of 5
bioset_integrity_free---of 1
blk_flush_integrity---of 1
-----------
SUMMARY8%of 25

__loop_clr_fd---of 16
__loop_update_dio---of 19
css_put---of 15
lo_compat_ioctl---of 16
lo_complete_rq---of 11
lo_free_disk---of 3
lo_ioctl---of 70
lo_release---of 5
lo_rw_aio---of 33
lo_rw_aio_complete---of 5
loop_add---of 16
loop_attr_do_show_autoclear---of 1
loop_attr_do_show_backing_file---of 5
loop_attr_do_show_dio---of 1
loop_attr_do_show_offset---of 1
loop_attr_do_show_partscan---of 1
loop_attr_do_show_sizelimit---of 1
loop_configure---of 49
loop_control_ioctl---of 19
loop_free_idle_workers---of 11
loop_free_idle_workers_timer---of 1
loop_get_status---of 5
loop_info64_from_compat---of 3
loop_info64_to_compat---of 3
loop_probe---of 6
loop_process_work---of 74
loop_queue_rq32%of 35
loop_reconfigure_limits---of 9
loop_reread_partitions---of 3
loop_rootcg_workfn---of 1
loop_set_hw_queue_depth---of 4
loop_set_size---of 3
loop_set_status---of 20
loop_set_status_from_info---of 8
loop_update_rotational---of 4
loop_validate_file---of 11
loop_workfn---of 1
max_loop_param_set_int---of 3
-----------
SUMMARY32%of 35

udf_dstrCS0toChar---of 5
udf_get_filename50%of 4
udf_name_conv_char24%of 25
udf_name_from_CS029%of 50
udf_put_filename---of 20
-----------
SUMMARY28%of 79

-----------
SUMMARY---of 0

empty_inline_dir---of 19
ext4_add_dirent_to_inline---of 7
ext4_convert_inline_data18%of 17
ext4_convert_inline_data_nolock---of 29
ext4_create_inline_data---of 10
ext4_da_write_inline_data_begin---of 52
ext4_delete_inline_entry---of 18
ext4_destroy_inline_data67%of 3
ext4_destroy_inline_data_nolock24%of 13
ext4_find_inline_data_nolock34%of 9
ext4_find_inline_entry---of 12
ext4_finish_convert_inline_dir---of 19
ext4_get_first_inline_block---of 3
ext4_get_max_inline_size---of 5
ext4_inline_data_iomap---of 10
ext4_inline_data_truncate---of 33
ext4_inlinedir_to_tree---of 33
ext4_prepare_inline_data---of 7
ext4_read_inline_dir---of 34
ext4_read_inline_folio---of 28
ext4_read_inline_link---of 11
ext4_readpage_inline---of 29
ext4_try_add_inline_entry---of 25
ext4_try_create_inline_dir---of 6
ext4_try_to_write_inline_data---of 68
ext4_update_inline_data---of 12
ext4_write_inline_data_end---of 49
folio_put---of 4
folio_size---of 10
folio_test_uptodate---of 9
get_max_inline_xattr_value_size---of 13
lock_buffer---of 3
-----------
SUMMARY27%of 42

alloc_ucounts15%of 20
dec_rlimit_put_ucounts56%of 9
dec_rlimit_ucounts---of 8
dec_ucount40%of 10
get_ucounts---of 3
inc_rlimit_get_ucounts27%of 23
inc_rlimit_ucounts---of 8
inc_ucount25%of 12
is_rlimit_overlimit---of 9
put_ucounts23%of 9
retire_userns_sysctls---of 1
set_is_seen---of 1
set_lookup---of 1
set_permissions---of 1
setup_userns_sysctls---of 5
-----------
SUMMARY28%of 83

-----------
SUMMARY---of 0

udf_add_extendedattr---of 19
udf_get_extendedattr---of 13
udf_new_tag---of 1
udf_read_ptagged100%of 1
udf_read_tagged19%of 11
udf_tag_checksum100%of 1
udf_update_tag---of 1
-----------
SUMMARY31%of 13

-----------
SUMMARY---of 0

bsearch60%of 5
-----------
SUMMARY60%of 5

__read_once_word_nocheck100%of 1
__unwind_start30%of 20
deref_stack_reg25%of 8
orc_sort_cmp---of 4
orc_sort_swap---of 1
unwind_dump---of 9
unwind_get_return_address75%of 4
unwind_get_return_address_ptr---of 5
unwind_module_init---of 3
unwind_next_frame12%of 154
-----------
SUMMARY17%of 187

-----------
SUMMARY---of 0

generate_random_guid---of 1
generate_random_uuid---of 1
guid_gen---of 1
guid_parse---of 18
uuid_gen100%of 1
uuid_is_valid---of 15
uuid_parse---of 18
-----------
SUMMARY100%of 1

ramfs_create---of 1
ramfs_fill_super100%of 1
ramfs_free_fc100%of 1
ramfs_get_inode34%of 6
ramfs_get_tree100%of 1
ramfs_init_fs_context67%of 3
ramfs_kill_sb---of 1
ramfs_mkdir---of 3
ramfs_mknod---of 5
ramfs_parse_param---of 5
ramfs_show_options---of 3
ramfs_symlink---of 6
ramfs_tmpfile---of 6
-----------
SUMMARY59%of 12

__vhost_add_used_n---of 18
__vhost_get_user---of 15
__vhost_vq_attach_worker---of 15
__vhost_worker_flush---of 5
iotlb_access_ok---of 12
log_used---of 14
log_write---of 10
memory_access_ok---of 17
translate_desc---of 12
vhost_add_used---of 1
vhost_add_used_and_signal---of 1
vhost_add_used_and_signal_n---of 1
vhost_add_used_n---of 15
vhost_chr_poll50%of 4
vhost_chr_read_iter---of 22
vhost_chr_write_iter---of 37
vhost_clear_msg---of 11
vhost_dequeue_msg---of 4
vhost_dev_check_owner---of 1
vhost_dev_cleanup---of 27
vhost_dev_flush---of 4
vhost_dev_has_owner---of 1
vhost_dev_init---of 6
vhost_dev_ioctl---of 35
vhost_dev_reset_owner---of 4
vhost_dev_reset_owner_prepare---of 1
vhost_dev_set_owner---of 27
vhost_dev_stop---of 11
vhost_disable_notify---of 8
vhost_discard_vq_desc---of 1
vhost_enable_notify---of 28
vhost_enqueue_msg---of 3
vhost_exceeds_weight---of 4
vhost_flush_work---of 1
vhost_free_worker---of 7
vhost_get_vq_desc---of 118
vhost_init_device_iotlb---of 5
vhost_iotlb_miss---of 6
vhost_iotlb_notify_vq---of 9
vhost_log_access_ok---of 1
vhost_log_write---of 21
vhost_new_msg---of 3
vhost_poll_func---of 1
vhost_poll_init---of 1
vhost_poll_queue---of 1
vhost_poll_start---of 11
vhost_poll_stop---of 3
vhost_poll_wakeup---of 4
vhost_run_work_list---of 5
vhost_set_backend_features---of 4
vhost_signal---of 26
vhost_update_used_flags---of 8
vhost_vq_access_ok---of 6
vhost_vq_attach_worker---of 5
vhost_vq_avail_empty---of 8
vhost_vq_has_work---of 18
vhost_vq_init_access---of 15
vhost_vq_is_setup---of 9
vhost_vq_work_queue---of 19
vhost_vring_ioctl---of 80
vhost_work_init---of 1
vhost_worker_create---of 5
vhost_worker_ioctl---of 26
vhost_worker_killed---of 13
vq_log_access_ok---of 14
vq_log_used_access_ok---of 4
vq_meta_prefetch---of 5
-----------
SUMMARY50%of 4

__bpf_trace_mm_lru_activate---of 1
__bpf_trace_mm_lru_insertion---of 1
__folio_batch_release100%of 3
__folio_put30%of 24
__lru_add_drain_all---of 20
__page_cache_release25%of 119
__probestub_mm_lru_activate---of 1
__probestub_mm_lru_insertion---of 1
__traceiter_mm_lru_activate---of 4
__traceiter_mm_lru_insertion---of 4
const_folio_flags---of 10
deactivate_file_folio---of 27
folio_activate25%of 41
folio_activate_fn23%of 148
folio_add_lru27%of 42
folio_add_lru_vma30%of 10
folio_batch_add_and_move39%of 13
folio_batch_move_lru42%of 29
folio_batch_remove_exceptionals56%of 9
folio_deactivate---of 41
folio_mapped---of 16
folio_mark_accessed26%of 81
folio_mark_lazyfree---of 57
folio_memcg25%of 12
folio_nr_pages---of 9
folio_rotate_reclaimable---of 54
folios_put_refs37%of 36
lru_add_drain42%of 17
lru_add_drain_all---of 1
lru_add_drain_cpu30%of 40
lru_add_drain_cpu_zone---of 17
lru_add_drain_per_cpu---of 17
lru_add_fn22%of 123
lru_cache_disable---of 1
lru_deactivate_file_fn---of 212
lru_deactivate_fn---of 140
lru_lazyfree_fn---of 162
lru_move_tail_fn---of 124
lru_note_cost---of 12
lru_note_cost_refault---of 25
perf_trace_mm_lru_activate---of 8
perf_trace_mm_lru_insertion---of 72
put_pages_list---of 13
release_pages---of 21
trace_event_raw_event_mm_lru_activate---of 7
trace_event_raw_event_mm_lru_insertion---of 71
trace_raw_output_mm_lru_activate---of 3
trace_raw_output_mm_lru_insertion---of 3
zone_stat_mod_folio---of 4
-----------
SUMMARY28%of 747

__btrfs_free_extent9%of 124
__btrfs_inc_extent_ref---of 22
__btrfs_mod_ref---of 18
__btrfs_run_delayed_refs22%of 178
alloc_reserved_extent24%of 17
alloc_reserved_file_extent---of 11
btrfs_alloc_logged_file_extent---of 10
btrfs_alloc_reserved_file_extent---of 5
btrfs_alloc_tree_block30%of 20
btrfs_cleanup_ref_head_accounting40%of 10
btrfs_clear_treelog_bg---of 4
btrfs_cross_ref_exist---of 8
btrfs_dec_ref---of 1
btrfs_discard_extent---of 29
btrfs_drop_snapshot---of 77
btrfs_drop_subtree---of 15
btrfs_error_unpin_extent_range---of 1
btrfs_exclude_logged_extents---of 12
btrfs_finish_extent_commit---of 19
btrfs_free_extent---of 7
btrfs_free_reserved_extent---of 3
btrfs_free_tree_block27%of 15
btrfs_get_extent_inline_ref_type14%of 22
btrfs_get_extent_owner_root---of 5
btrfs_inc_extent_ref---of 6
btrfs_inc_ref---of 1
btrfs_issue_discard---of 29
btrfs_lookup_data_extent---of 3
btrfs_lookup_extent_info---of 49
btrfs_pin_extent---of 3
btrfs_pin_extent_for_log_replay---of 4
btrfs_pin_reserved_extent---of 3
btrfs_reserve_extent47%of 13
btrfs_run_delayed_refs28%of 11
btrfs_set_disk_extent_flags---of 4
btrfs_space_info_update_bytes_pinned---of 21
btrfs_tree_unlock_rw---of 4
btrfs_trim_fs---of 49
check_committed_ref---of 16
check_delayed_ref---of 18
check_ref_cleanup19%of 11
do_walk_down---of 73
extent_data_ref_count---of 7
find_free_extent13%of 277
find_next_key---of 11
hash_extent_data_ref---of 1
insert_extent_data_ref---of 26
insert_inline_extent_backref---of 5
insert_tree_block_ref---of 3
lookup_extent_data_ref---of 16
lookup_inline_extent_backref13%of 66
lookup_tree_block_ref---of 1
pin_down_extent---of 3
reada_walk_down---of 32
remove_extent_data_ref---of 11
setup_inline_extent_backref---of 18
trace_btrfs_reserve_extent_cluster---of 15
trace_btrfs_reserved_extent_free---of 15
trace_btrfs_space_reservation---of 15
unpin_extent_range---of 46
update_inline_extent_backref---of 26
walk_down_proc---of 55
walk_down_tree---of 14
walk_up_proc---of 63
walk_up_tree---of 22
-----------
SUMMARY17%of 764

-----------
SUMMARY---of 0

ns_dname---of 1
ns_get_name50%of 4
ns_get_path---of 3
ns_get_path_cb---of 3
ns_ioctl---of 12
ns_match---of 3
nsfs_evict---of 1
nsfs_init_fs_context---of 3
nsfs_init_inode---of 1
nsfs_put_data---of 1
nsfs_show_path---of 1
open_related_ns---of 6
proc_ns_file---of 1
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

time64_to_tm60%of 5
-----------
SUMMARY60%of 5

__blk_mq_sched_dispatch_requests23%of 72
__blk_mq_sched_restart---of 1
blk_mq_exit_sched---of 17
blk_mq_init_sched---of 25
blk_mq_sched_bio_merge43%of 14
blk_mq_sched_dispatch_requests34%of 6
blk_mq_sched_free_rqs---of 7
blk_mq_sched_mark_restart_hctx---of 3
blk_mq_sched_try_insert_merge38%of 8
sched_rq_cmp---of 1
-----------
SUMMARY27%of 100

cap_bprm_creds_from_file---of 62
cap_capable38%of 8
cap_capget---of 16
cap_capset---of 15
cap_convert_nscap---of 16
cap_inode_getsecurity---of 25
cap_inode_killpriv---of 1
cap_inode_need_killpriv100%of 1
cap_inode_removexattr---of 5
cap_inode_setxattr---of 4
cap_mmap_addr23%of 9
cap_mmap_file100%of 1
cap_ptrace_access_check---of 19
cap_ptrace_traceme---of 19
cap_safe_nice---of 22
cap_settime---of 1
cap_task_fix_setuid---of 28
cap_task_prctl---of 45
cap_task_setioprio---of 1
cap_task_setnice---of 1
cap_task_setscheduler---of 1
cap_vm_enough_memory38%of 8
get_vfs_caps_from_disk---of 17
rootid_owns_currentns---of 6
-----------
SUMMARY38%of 27

-----------
SUMMARY---of 0

tomoyo_check_mount_acl50%of 6
tomoyo_mount_permission33%of 34
-----------
SUMMARY35%of 40

udf_add_fid_counter40%of 5
udf_add_nondir---of 3
udf_create---of 3
udf_encode_fh---of 5
udf_fh_to_dentry---of 7
udf_fh_to_parent---of 7
udf_fiiter_add_entry---of 58
udf_fiiter_delete_entry---of 3
udf_fiiter_find_entry39%of 21
udf_get_parent---of 3
udf_link---of 4
udf_lookup40%of 5
udf_mkdir---of 5
udf_mknod---of 4
udf_rename---of 36
udf_rmdir---of 14
udf_symlink---of 49
udf_tmpfile---of 3
udf_unlink38%of 8
-----------
SUMMARY39%of 39

__ia32_sys_readahead---of 10
__x64_sys_readahead30%of 10
do_page_cache_ra---of 4
file_ra_state_init100%of 1
force_page_cache_ra40%of 10
ksys_readahead---of 10
ondemand_readahead---of 47
page_cache_async_ra---of 17
page_cache_ra_order9%of 48
page_cache_ra_unbounded30%of 24
page_cache_sync_ra---of 9
read_pages22%of 28
readahead_expand---of 37
readahead_folio10%of 22
try_context_readahead---of 13
-----------
SUMMARY19%of 143

jfsIOWait---of 13
jfs_flush_journal5%of 42
jfs_syncpt---of 3
lbmAllocate---of 7
lbmDirectWrite---of 5
lbmFree---of 3
lbmIODone---of 51
lbmIOWait---of 12
lbmLogShutdown---of 6
lbmRead---of 12
lbmStartIO---of 10
lbmWrite---of 12
lmGroupCommit---of 24
lmLog---of 24
lmLogClose---of 14
lmLogFileSystem---of 17
lmLogFormat---of 13
lmLogInit---of 44
lmLogOpen---of 32
lmLogShutdown---of 16
lmLogSync---of 18
lmNextPage---of 13
lmWriteRecord---of 38
-----------
SUMMARY5%of 42

__ia32_sys_map_shadow_stack---of 9
__x64_sys_map_shadow_stack---of 9
alloc_shstk---of 14
reset_thread_features---of 1
restore_signal_shadow_stack---of 27
setup_signal_shadow_stack15%of 14
shstk_alloc_thread_stack---of 8
shstk_disable---of 9
shstk_free---of 12
shstk_prctl---of 13
shstk_setup---of 11
wrss_control---of 13
-----------
SUMMARY15%of 14

__do_sys_munlockall---of 11
__ia32_sys_mlock---of 1
__ia32_sys_mlock2---of 3
__ia32_sys_mlockall---of 1
__ia32_sys_munlock---of 1
__se_sys_mlockall---of 22
__se_sys_munlock---of 11
__x64_sys_mlock---of 1
__x64_sys_mlock2---of 3
__x64_sys_mlockall---of 1
__x64_sys_munlock---of 1
apply_mlockall_flags---of 9
apply_vma_lock_flags---of 11
can_do_mlock67%of 3
do_mlock---of 28
folio_evictable---of 20
folio_lruvec_relock_irq---of 19
folio_nr_pages---of 9
mlock_drain_local39%of 21
mlock_drain_remote---of 9
mlock_fixup---of 10
mlock_folio29%of 50
mlock_folio_batch---of 533
mlock_new_folio---of 49
mlock_pte_range---of 80
mlock_vma_pages_range---of 16
munlock_folio---of 32
need_mlock_drain---of 3
user_shm_lock---of 6
user_shm_unlock---of 1
vm_flags_reset---of 6
vma_start_write---of 6
-----------
SUMMARY33%of 74

task_work_add16%of 13
task_work_cancel---of 8
task_work_cancel_match---of 8
task_work_run78%of 9
-----------
SUMMARY41%of 22

-----------
SUMMARY---of 0

__bpf_trace_ext4__bitmap_load---of 1
__bpf_trace_ext4__es_extent---of 1
__bpf_trace_ext4__es_shrink_enter---of 1
__bpf_trace_ext4__fallocate_mode---of 1
__bpf_trace_ext4__folio_op---of 1
__bpf_trace_ext4__map_blocks_enter---of 1
__bpf_trace_ext4__map_blocks_exit---of 1
__bpf_trace_ext4__mb_new_pa---of 1
__bpf_trace_ext4__mballoc---of 1
__bpf_trace_ext4__trim---of 1
__bpf_trace_ext4__truncate---of 1
__bpf_trace_ext4__write_begin---of 1
__bpf_trace_ext4__write_end---of 1
__bpf_trace_ext4_alloc_da_blocks---of 1
__bpf_trace_ext4_allocate_blocks---of 1
__bpf_trace_ext4_allocate_inode---of 1
__bpf_trace_ext4_begin_ordered_truncate---of 1
__bpf_trace_ext4_collapse_range---of 1
__bpf_trace_ext4_da_release_space---of 1
__bpf_trace_ext4_da_reserve_space---of 1
__bpf_trace_ext4_da_update_reserve_space---of 1
__bpf_trace_ext4_da_write_pages---of 1
__bpf_trace_ext4_da_write_pages_extent---of 1
__bpf_trace_ext4_discard_blocks---of 1
__bpf_trace_ext4_discard_preallocations---of 1
__bpf_trace_ext4_drop_inode---of 1
__bpf_trace_ext4_error---of 1
__bpf_trace_ext4_es_find_extent_range_enter---of 1
__bpf_trace_ext4_es_find_extent_range_exit---of 1
__bpf_trace_ext4_es_insert_delayed_block---of 1
__bpf_trace_ext4_es_lookup_extent_enter---of 1
__bpf_trace_ext4_es_lookup_extent_exit---of 1
__bpf_trace_ext4_es_remove_extent---of 1
__bpf_trace_ext4_es_shrink---of 1
__bpf_trace_ext4_es_shrink_scan_exit---of 1
__bpf_trace_ext4_evict_inode---of 1
__bpf_trace_ext4_ext_convert_to_initialized_enter---of 1
__bpf_trace_ext4_ext_convert_to_initialized_fastpath---of 1
__bpf_trace_ext4_ext_handle_unwritten_extents---of 1
__bpf_trace_ext4_ext_load_extent---of 1
__bpf_trace_ext4_ext_remove_space---of 1
__bpf_trace_ext4_ext_remove_space_done---of 1
__bpf_trace_ext4_ext_rm_idx---of 1
__bpf_trace_ext4_ext_rm_leaf---of 1
__bpf_trace_ext4_ext_show_extent---of 1
__bpf_trace_ext4_fallocate_exit---of 1
__bpf_trace_ext4_fc_cleanup---of 1
__bpf_trace_ext4_fc_commit_start---of 1
__bpf_trace_ext4_fc_commit_stop---of 1
__bpf_trace_ext4_fc_replay---of 1
__bpf_trace_ext4_fc_replay_scan---of 1
__bpf_trace_ext4_fc_stats---of 1
__bpf_trace_ext4_fc_track_dentry---of 1
__bpf_trace_ext4_fc_track_inode---of 1
__bpf_trace_ext4_fc_track_range---of 1
__bpf_trace_ext4_forget---of 1
__bpf_trace_ext4_free_blocks---of 1
__bpf_trace_ext4_free_inode---of 1
__bpf_trace_ext4_fsmap_class---of 1
__bpf_trace_ext4_get_implied_cluster_alloc_exit---of 1
__bpf_trace_ext4_getfsmap_class---of 1
__bpf_trace_ext4_insert_range---of 1
__bpf_trace_ext4_invalidate_folio_op---of 1
__bpf_trace_ext4_journal_start_inode---of 1
__bpf_trace_ext4_journal_start_reserved---of 1
__bpf_trace_ext4_journal_start_sb---of 1
__bpf_trace_ext4_lazy_itable_init---of 1
__bpf_trace_ext4_load_inode---of 1
__bpf_trace_ext4_mark_inode_dirty---of 1
__bpf_trace_ext4_mb_discard_preallocations---of 1
__bpf_trace_ext4_mb_release_group_pa---of 1
__bpf_trace_ext4_mb_release_inode_pa---of 1
__bpf_trace_ext4_mballoc_alloc---of 1
__bpf_trace_ext4_mballoc_prealloc---of 1
__bpf_trace_ext4_nfs_commit_metadata---of 1
__bpf_trace_ext4_other_inode_update_time---of 1
__bpf_trace_ext4_prefetch_bitmaps---of 1
__bpf_trace_ext4_read_block_bitmap_load---of 1
__bpf_trace_ext4_remove_blocks---of 1
__bpf_trace_ext4_request_blocks---of 1
__bpf_trace_ext4_request_inode---of 1
__bpf_trace_ext4_shutdown---of 1
__bpf_trace_ext4_sync_file_enter---of 1
__bpf_trace_ext4_sync_file_exit---of 1
__bpf_trace_ext4_sync_fs---of 1
__bpf_trace_ext4_unlink_enter---of 1
__bpf_trace_ext4_unlink_exit---of 1
__bpf_trace_ext4_update_sb---of 1
__bpf_trace_ext4_writepages---of 1
__bpf_trace_ext4_writepages_result---of 1
__ext4_error---of 4
__ext4_error_file---of 5
__ext4_error_inode---of 5
__ext4_grp_locked_error19%of 22
__ext4_msg50%of 4
__ext4_sb_bread_gfp30%of 10
__ext4_std_error---of 16
__ext4_warning---of 3
__ext4_warning_inode67%of 3
__probestub_ext4_alloc_da_blocks---of 1
__probestub_ext4_allocate_blocks---of 1
__probestub_ext4_allocate_inode---of 1
__probestub_ext4_begin_ordered_truncate---of 1
__probestub_ext4_collapse_range---of 1
__probestub_ext4_da_release_space---of 1
__probestub_ext4_da_reserve_space---of 1
__probestub_ext4_da_update_reserve_space---of 1
__probestub_ext4_da_write_begin---of 1
__probestub_ext4_da_write_end---of 1
__probestub_ext4_da_write_pages---of 1
__probestub_ext4_da_write_pages_extent---of 1
__probestub_ext4_discard_blocks---of 1
__probestub_ext4_discard_preallocations---of 1
__probestub_ext4_drop_inode---of 1
__probestub_ext4_error---of 1
__probestub_ext4_es_cache_extent---of 1
__probestub_ext4_es_find_extent_range_enter---of 1
__probestub_ext4_es_find_extent_range_exit---of 1
__probestub_ext4_es_insert_delayed_block---of 1
__probestub_ext4_es_insert_extent---of 1
__probestub_ext4_es_lookup_extent_enter---of 1
__probestub_ext4_es_lookup_extent_exit---of 1
__probestub_ext4_es_remove_extent---of 1
__probestub_ext4_es_shrink---of 1
__probestub_ext4_es_shrink_count---of 1
__probestub_ext4_es_shrink_scan_enter---of 1
__probestub_ext4_es_shrink_scan_exit---of 1
__probestub_ext4_evict_inode---of 1
__probestub_ext4_ext_convert_to_initialized_enter---of 1
__probestub_ext4_ext_convert_to_initialized_fastpath---of 1
__probestub_ext4_ext_handle_unwritten_extents---of 1
__probestub_ext4_ext_load_extent---of 1
__probestub_ext4_ext_map_blocks_enter---of 1
__probestub_ext4_ext_map_blocks_exit---of 1
__probestub_ext4_ext_remove_space---of 1
__probestub_ext4_ext_remove_space_done---of 1
__probestub_ext4_ext_rm_idx---of 1
__probestub_ext4_ext_rm_leaf---of 1
__probestub_ext4_ext_show_extent---of 1
__probestub_ext4_fallocate_enter---of 1
__probestub_ext4_fallocate_exit---of 1
__probestub_ext4_fc_cleanup---of 1
__probestub_ext4_fc_commit_start---of 1
__probestub_ext4_fc_commit_stop---of 1
__probestub_ext4_fc_replay---of 1
__probestub_ext4_fc_replay_scan---of 1
__probestub_ext4_fc_stats---of 1
__probestub_ext4_fc_track_create---of 1
__probestub_ext4_fc_track_inode---of 1
__probestub_ext4_fc_track_link---of 1
__probestub_ext4_fc_track_range---of 1
__probestub_ext4_fc_track_unlink---of 1
__probestub_ext4_forget---of 1
__probestub_ext4_free_blocks---of 1
__probestub_ext4_free_inode---of 1
__probestub_ext4_fsmap_high_key---of 1
__probestub_ext4_fsmap_low_key---of 1
__probestub_ext4_fsmap_mapping---of 1
__probestub_ext4_get_implied_cluster_alloc_exit---of 1
__probestub_ext4_getfsmap_high_key---of 1
__probestub_ext4_getfsmap_low_key---of 1
__probestub_ext4_getfsmap_mapping---of 1
__probestub_ext4_ind_map_blocks_enter---of 1
__probestub_ext4_ind_map_blocks_exit---of 1
__probestub_ext4_insert_range---of 1
__probestub_ext4_invalidate_folio---of 1
__probestub_ext4_journal_start_inode---of 1
__probestub_ext4_journal_start_reserved---of 1
__probestub_ext4_journal_start_sb---of 1
__probestub_ext4_journalled_invalidate_folio---of 1
__probestub_ext4_journalled_write_end---of 1
__probestub_ext4_lazy_itable_init---of 1
__probestub_ext4_load_inode---of 1
__probestub_ext4_load_inode_bitmap---of 1
__probestub_ext4_mark_inode_dirty---of 1
__probestub_ext4_mb_bitmap_load---of 1
__probestub_ext4_mb_buddy_bitmap_load---of 1
__probestub_ext4_mb_discard_preallocations---of 1
__probestub_ext4_mb_new_group_pa---of 1
__probestub_ext4_mb_new_inode_pa---of 1
__probestub_ext4_mb_release_group_pa---of 1
__probestub_ext4_mb_release_inode_pa---of 1
__probestub_ext4_mballoc_alloc---of 1
__probestub_ext4_mballoc_discard---of 1
__probestub_ext4_mballoc_free---of 1
__probestub_ext4_mballoc_prealloc---of 1
__probestub_ext4_nfs_commit_metadata---of 1
__probestub_ext4_other_inode_update_time---of 1
__probestub_ext4_prefetch_bitmaps---of 1
__probestub_ext4_punch_hole---of 1
__probestub_ext4_read_block_bitmap_load---of 1
__probestub_ext4_read_folio---of 1
__probestub_ext4_release_folio---of 1
__probestub_ext4_remove_blocks---of 1
__probestub_ext4_request_blocks---of 1
__probestub_ext4_request_inode---of 1
__probestub_ext4_shutdown---of 1
__probestub_ext4_sync_file_enter---of 1
__probestub_ext4_sync_file_exit---of 1
__probestub_ext4_sync_fs---of 1
__probestub_ext4_trim_all_free---of 1
__probestub_ext4_trim_extent---of 1
__probestub_ext4_truncate_enter---of 1
__probestub_ext4_truncate_exit---of 1
__probestub_ext4_unlink_enter---of 1
__probestub_ext4_unlink_exit---of 1
__probestub_ext4_update_sb---of 1
__probestub_ext4_write_begin---of 1
__probestub_ext4_write_end---of 1
__probestub_ext4_writepages---of 1
__probestub_ext4_writepages_result---of 1
__probestub_ext4_zero_range---of 1
__traceiter_ext4_alloc_da_blocks---of 4
__traceiter_ext4_allocate_blocks---of 4
__traceiter_ext4_allocate_inode---of 4
__traceiter_ext4_begin_ordered_truncate---of 4
__traceiter_ext4_collapse_range---of 4
__traceiter_ext4_da_release_space---of 4
__traceiter_ext4_da_reserve_space---of 4
__traceiter_ext4_da_update_reserve_space---of 4
__traceiter_ext4_da_write_begin---of 4
__traceiter_ext4_da_write_end---of 4
__traceiter_ext4_da_write_pages---of 4
__traceiter_ext4_da_write_pages_extent---of 4
__traceiter_ext4_discard_blocks---of 4
__traceiter_ext4_discard_preallocations---of 4
__traceiter_ext4_drop_inode---of 4
__traceiter_ext4_error---of 4
__traceiter_ext4_es_cache_extent---of 4
__traceiter_ext4_es_find_extent_range_enter---of 4
__traceiter_ext4_es_find_extent_range_exit---of 4
__traceiter_ext4_es_insert_delayed_block---of 4
__traceiter_ext4_es_insert_extent---of 4
__traceiter_ext4_es_lookup_extent_enter---of 4
__traceiter_ext4_es_lookup_extent_exit---of 4
__traceiter_ext4_es_remove_extent---of 4
__traceiter_ext4_es_shrink---of 4
__traceiter_ext4_es_shrink_count---of 4
__traceiter_ext4_es_shrink_scan_enter---of 4
__traceiter_ext4_es_shrink_scan_exit---of 4
__traceiter_ext4_evict_inode---of 4
__traceiter_ext4_ext_convert_to_initialized_enter---of 4
__traceiter_ext4_ext_convert_to_initialized_fastpath---of 4
__traceiter_ext4_ext_handle_unwritten_extents---of 4
__traceiter_ext4_ext_load_extent---of 4
__traceiter_ext4_ext_map_blocks_enter---of 4
__traceiter_ext4_ext_map_blocks_exit---of 4
__traceiter_ext4_ext_remove_space---of 4
__traceiter_ext4_ext_remove_space_done---of 4
__traceiter_ext4_ext_rm_idx---of 4
__traceiter_ext4_ext_rm_leaf---of 4
__traceiter_ext4_ext_show_extent---of 4
__traceiter_ext4_fallocate_enter---of 4
__traceiter_ext4_fallocate_exit---of 4
__traceiter_ext4_fc_cleanup---of 4
__traceiter_ext4_fc_commit_start---of 4
__traceiter_ext4_fc_commit_stop---of 4
__traceiter_ext4_fc_replay---of 4
__traceiter_ext4_fc_replay_scan---of 4
__traceiter_ext4_fc_stats---of 4
__traceiter_ext4_fc_track_create---of 4
__traceiter_ext4_fc_track_inode---of 4
__traceiter_ext4_fc_track_link---of 4
__traceiter_ext4_fc_track_range---of 4
__traceiter_ext4_fc_track_unlink---of 4
__traceiter_ext4_forget---of 4
__traceiter_ext4_free_blocks---of 4
__traceiter_ext4_free_inode---of 4
__traceiter_ext4_fsmap_high_key---of 4
__traceiter_ext4_fsmap_low_key---of 4
__traceiter_ext4_fsmap_mapping---of 4
__traceiter_ext4_get_implied_cluster_alloc_exit---of 4
__traceiter_ext4_getfsmap_high_key---of 4
__traceiter_ext4_getfsmap_low_key---of 4
__traceiter_ext4_getfsmap_mapping---of 4
__traceiter_ext4_ind_map_blocks_enter---of 4
__traceiter_ext4_ind_map_blocks_exit---of 4
__traceiter_ext4_insert_range---of 4
__traceiter_ext4_invalidate_folio---of 4
__traceiter_ext4_journal_start_inode---of 4
__traceiter_ext4_journal_start_reserved---of 4
__traceiter_ext4_journal_start_sb---of 4
__traceiter_ext4_journalled_invalidate_folio---of 4
__traceiter_ext4_journalled_write_end---of 4
__traceiter_ext4_lazy_itable_init---of 4
__traceiter_ext4_load_inode---of 4
__traceiter_ext4_load_inode_bitmap---of 4
__traceiter_ext4_mark_inode_dirty---of 4
__traceiter_ext4_mb_bitmap_load---of 4
__traceiter_ext4_mb_buddy_bitmap_load---of 4
__traceiter_ext4_mb_discard_preallocations---of 4
__traceiter_ext4_mb_new_group_pa---of 4
__traceiter_ext4_mb_new_inode_pa---of 4
__traceiter_ext4_mb_release_group_pa---of 4
__traceiter_ext4_mb_release_inode_pa---of 4
__traceiter_ext4_mballoc_alloc---of 4
__traceiter_ext4_mballoc_discard---of 4
__traceiter_ext4_mballoc_free---of 4
__traceiter_ext4_mballoc_prealloc---of 4
__traceiter_ext4_nfs_commit_metadata---of 4
__traceiter_ext4_other_inode_update_time---of 4
__traceiter_ext4_prefetch_bitmaps---of 4
__traceiter_ext4_punch_hole---of 4
__traceiter_ext4_read_block_bitmap_load---of 4
__traceiter_ext4_read_folio---of 4
__traceiter_ext4_release_folio---of 4
__traceiter_ext4_remove_blocks---of 4
__traceiter_ext4_request_blocks---of 4
__traceiter_ext4_request_inode---of 4
__traceiter_ext4_shutdown---of 4
__traceiter_ext4_sync_file_enter---of 4
__traceiter_ext4_sync_file_exit---of 4
__traceiter_ext4_sync_fs---of 4
__traceiter_ext4_trim_all_free---of 4
__traceiter_ext4_trim_extent---of 4
__traceiter_ext4_truncate_enter---of 4
__traceiter_ext4_truncate_exit---of 4
__traceiter_ext4_unlink_enter---of 4
__traceiter_ext4_unlink_exit---of 4
__traceiter_ext4_update_sb---of 4
__traceiter_ext4_write_begin---of 4
__traceiter_ext4_write_end---of 4
__traceiter_ext4_writepages---of 4
__traceiter_ext4_writepages_result---of 4
__traceiter_ext4_zero_range---of 4
_ext4_show_options---of 107
ext4_acquire_dquot---of 11
ext4_alloc_flex_bg_array---of 34
ext4_alloc_inode67%of 3
ext4_apply_options---of 58
ext4_block_bitmap100%of 3
ext4_block_bitmap_set---of 3
ext4_block_group_meta_init---of 26
ext4_calculate_overhead---of 54
ext4_check_feature_compatibility---of 38
ext4_check_geometry---of 26
ext4_check_journal_data_mode---of 10
ext4_check_opt_consistency---of 109
ext4_clear_inode60%of 5
ext4_clear_journal_err---of 14
ext4_commit_super34%of 15
ext4_decode_error---of 11
ext4_destroy_inode60%of 5
ext4_drop_inode28%of 18
ext4_enable_quotas---of 21
ext4_fast_commit_init---of 1
ext4_fc_free---of 3
ext4_feature_set_ok---of 7
ext4_fh_to_dentry100%of 1
ext4_fh_to_parent---of 1
ext4_fill_flex_info---of 29
ext4_fill_super---of 253
ext4_flex_groups_free---of 20
ext4_force_commit---of 3
ext4_free_group_clusters100%of 3
ext4_free_group_clusters_set100%of 3
ext4_free_in_core_inode---of 3
ext4_free_inodes_count100%of 3
ext4_free_inodes_set100%of 3
ext4_freeze---of 5
ext4_get_dquots100%of 1
ext4_get_journal_inode---of 6
ext4_get_stripe_size---of 12
ext4_get_tree---of 1
ext4_group_desc_csum15%of 20
ext4_group_desc_csum_set63%of 8
ext4_group_desc_csum_verify---of 10
ext4_group_desc_free---of 21
ext4_group_desc_init---of 94
ext4_handle_clustersize---of 12
ext4_handle_error39%of 21
ext4_hash_info_init---of 6
ext4_init_fs_context---of 3
ext4_inode_bitmap100%of 3
ext4_inode_bitmap_set---of 3
ext4_inode_table100%of 3
ext4_inode_table_set---of 3
ext4_itable_unused_count67%of 3
ext4_itable_unused_set67%of 3
ext4_journal_bmap---of 4
ext4_journal_commit_callback---of 17
ext4_journal_finish_inode_data_buffers---of 3
ext4_journal_submit_inode_data_buffers---of 3
ext4_journalled_writepage_callback---of 8
ext4_kill_sb---of 4
ext4_lazyinit_thread---of 88
ext4_load_and_init_journal---of 87
ext4_mark_dquot_dirty---of 5
ext4_mark_group_bitmap_corrupted34%of 9
ext4_mark_recovery_complete---of 10
ext4_nfs_commit_metadata---of 15
ext4_nfs_get_inode50%of 4
ext4_parse_param---of 68
ext4_parse_test_dummy_encryption---of 4
ext4_percpu_param_destroy---of 1
ext4_percpu_param_init---of 10
ext4_put_super---of 36
ext4_quota_off---of 12
ext4_quota_on---of 13
ext4_quota_read---of 12
ext4_quota_write---of 17
ext4_read_bh---of 8
ext4_read_bh_lock60%of 5
ext4_read_bh_nowait43%of 7
ext4_reconfigure---of 144
ext4_register_li_request---of 25
ext4_release_dquot---of 11
ext4_sb_bread100%of 1
ext4_sb_bread_unmovable---of 1
ext4_sb_breadahead_unmovable---of 4
ext4_seq_options_show---of 1
ext4_set_resv_clusters---of 6
ext4_setup_super---of 23
ext4_show_options---of 1
ext4_shutdown---of 1
ext4_statfs33%of 31
ext4_superblock_csum---of 4
ext4_superblock_csum_set56%of 9
ext4_sync_fs36%of 25
ext4_unfreeze---of 8
ext4_unregister_li_request---of 5
ext4_update_dynamic_rev---of 3
ext4_update_super19%of 53
ext4_used_dirs_count67%of 3
ext4_used_dirs_set67%of 3
ext4_write_dquot---of 8
ext4_write_info---of 3
init_once100%of 1
note_qf_name---of 7
perf_trace_ext4__bitmap_load---of 8
perf_trace_ext4__es_extent---of 8
perf_trace_ext4__es_shrink_enter---of 8
perf_trace_ext4__fallocate_mode---of 8
perf_trace_ext4__folio_op---of 8
perf_trace_ext4__map_blocks_enter---of 8
perf_trace_ext4__map_blocks_exit---of 8
perf_trace_ext4__mb_new_pa---of 8
perf_trace_ext4__mballoc---of 9
perf_trace_ext4__trim---of 8
perf_trace_ext4__truncate---of 8
perf_trace_ext4__write_begin---of 8
perf_trace_ext4__write_end---of 8
perf_trace_ext4_alloc_da_blocks---of 8
perf_trace_ext4_allocate_blocks---of 8
perf_trace_ext4_allocate_inode---of 8
perf_trace_ext4_begin_ordered_truncate---of 8
perf_trace_ext4_collapse_range---of 8
perf_trace_ext4_da_release_space---of 8
perf_trace_ext4_da_reserve_space---of 8
perf_trace_ext4_da_update_reserve_space---of 8
perf_trace_ext4_da_write_pages---of 8
perf_trace_ext4_da_write_pages_extent---of 8
perf_trace_ext4_discard_blocks---of 8
perf_trace_ext4_discard_preallocations---of 8
perf_trace_ext4_drop_inode---of 8
perf_trace_ext4_error---of 8
perf_trace_ext4_es_find_extent_range_enter---of 8
perf_trace_ext4_es_find_extent_range_exit---of 8
perf_trace_ext4_es_insert_delayed_block---of 8
perf_trace_ext4_es_lookup_extent_enter---of 8
perf_trace_ext4_es_lookup_extent_exit---of 8
perf_trace_ext4_es_remove_extent---of 8
perf_trace_ext4_es_shrink---of 8
perf_trace_ext4_es_shrink_scan_exit---of 8
perf_trace_ext4_evict_inode---of 8
perf_trace_ext4_ext_convert_to_initialized_enter---of 8
perf_trace_ext4_ext_convert_to_initialized_fastpath---of 8
perf_trace_ext4_ext_handle_unwritten_extents---of 8
perf_trace_ext4_ext_load_extent---of 8
perf_trace_ext4_ext_remove_space---of 8
perf_trace_ext4_ext_remove_space_done---of 8
perf_trace_ext4_ext_rm_idx---of 8
perf_trace_ext4_ext_rm_leaf---of 8
perf_trace_ext4_ext_show_extent---of 8
perf_trace_ext4_fallocate_exit---of 8
perf_trace_ext4_fc_cleanup---of 8
perf_trace_ext4_fc_commit_start---of 8
perf_trace_ext4_fc_commit_stop---of 8
perf_trace_ext4_fc_replay---of 8
perf_trace_ext4_fc_replay_scan---of 8
perf_trace_ext4_fc_stats---of 8
perf_trace_ext4_fc_track_dentry---of 8
perf_trace_ext4_fc_track_inode---of 8
perf_trace_ext4_fc_track_range---of 8
perf_trace_ext4_forget---of 8
perf_trace_ext4_free_blocks---of 8
perf_trace_ext4_free_inode---of 8
perf_trace_ext4_fsmap_class---of 8
perf_trace_ext4_get_implied_cluster_alloc_exit---of 8
perf_trace_ext4_getfsmap_class---of 8
perf_trace_ext4_insert_range---of 8
perf_trace_ext4_invalidate_folio_op---of 8
perf_trace_ext4_journal_start_inode---of 8
perf_trace_ext4_journal_start_reserved---of 8
perf_trace_ext4_journal_start_sb---of 8
perf_trace_ext4_lazy_itable_init---of 8
perf_trace_ext4_load_inode---of 8
perf_trace_ext4_mark_inode_dirty---of 8
perf_trace_ext4_mb_discard_preallocations---of 8
perf_trace_ext4_mb_release_group_pa---of 8
perf_trace_ext4_mb_release_inode_pa---of 8
perf_trace_ext4_mballoc_alloc---of 8
perf_trace_ext4_mballoc_prealloc---of 8
perf_trace_ext4_nfs_commit_metadata---of 8
perf_trace_ext4_other_inode_update_time---of 8
perf_trace_ext4_prefetch_bitmaps---of 8
perf_trace_ext4_read_block_bitmap_load---of 8
perf_trace_ext4_remove_blocks---of 8
perf_trace_ext4_request_blocks---of 8
perf_trace_ext4_request_inode---of 8
perf_trace_ext4_shutdown---of 8
perf_trace_ext4_sync_file_enter---of 8
perf_trace_ext4_sync_file_exit---of 8
perf_trace_ext4_sync_fs---of 8
perf_trace_ext4_unlink_enter---of 8
perf_trace_ext4_unlink_exit---of 8
perf_trace_ext4_update_sb---of 8
perf_trace_ext4_writepages---of 8
perf_trace_ext4_writepages_result---of 8
print_daily_error_info---of 13
register_as_ext2---of 3
register_as_ext3---of 3
trace_event_raw_event_ext4__bitmap_load---of 7
trace_event_raw_event_ext4__es_extent---of 7
trace_event_raw_event_ext4__es_shrink_enter---of 7
trace_event_raw_event_ext4__fallocate_mode---of 7
trace_event_raw_event_ext4__folio_op---of 7
trace_event_raw_event_ext4__map_blocks_enter---of 7
trace_event_raw_event_ext4__map_blocks_exit---of 7
trace_event_raw_event_ext4__mb_new_pa---of 7
trace_event_raw_event_ext4__mballoc---of 8
trace_event_raw_event_ext4__trim---of 7
trace_event_raw_event_ext4__truncate---of 7
trace_event_raw_event_ext4__write_begin---of 7
trace_event_raw_event_ext4__write_end---of 7
trace_event_raw_event_ext4_alloc_da_blocks---of 7
trace_event_raw_event_ext4_allocate_blocks---of 7
trace_event_raw_event_ext4_allocate_inode---of 7
trace_event_raw_event_ext4_begin_ordered_truncate---of 7
trace_event_raw_event_ext4_collapse_range---of 7
trace_event_raw_event_ext4_da_release_space---of 7
trace_event_raw_event_ext4_da_reserve_space---of 7
trace_event_raw_event_ext4_da_update_reserve_space---of 7
trace_event_raw_event_ext4_da_write_pages---of 7
trace_event_raw_event_ext4_da_write_pages_extent---of 7
trace_event_raw_event_ext4_discard_blocks---of 7
trace_event_raw_event_ext4_discard_preallocations---of 7
trace_event_raw_event_ext4_drop_inode---of 7
trace_event_raw_event_ext4_error---of 7
trace_event_raw_event_ext4_es_find_extent_range_enter---of 7
trace_event_raw_event_ext4_es_find_extent_range_exit---of 7
trace_event_raw_event_ext4_es_insert_delayed_block---of 7
trace_event_raw_event_ext4_es_lookup_extent_enter---of 7
trace_event_raw_event_ext4_es_lookup_extent_exit---of 7
trace_event_raw_event_ext4_es_remove_extent---of 7
trace_event_raw_event_ext4_es_shrink---of 7
trace_event_raw_event_ext4_es_shrink_scan_exit---of 7
trace_event_raw_event_ext4_evict_inode---of 7
trace_event_raw_event_ext4_ext_convert_to_initialized_enter---of 7
trace_event_raw_event_ext4_ext_convert_to_initialized_fastpath---of 7
trace_event_raw_event_ext4_ext_handle_unwritten_extents---of 7
trace_event_raw_event_ext4_ext_load_extent---of 7
trace_event_raw_event_ext4_ext_remove_space---of 7
trace_event_raw_event_ext4_ext_remove_space_done---of 7
trace_event_raw_event_ext4_ext_rm_idx---of 7
trace_event_raw_event_ext4_ext_rm_leaf---of 7
trace_event_raw_event_ext4_ext_show_extent---of 7
trace_event_raw_event_ext4_fallocate_exit---of 7
trace_event_raw_event_ext4_fc_cleanup---of 7
trace_event_raw_event_ext4_fc_commit_start---of 7
trace_event_raw_event_ext4_fc_commit_stop---of 7
trace_event_raw_event_ext4_fc_replay---of 7
trace_event_raw_event_ext4_fc_replay_scan---of 7
trace_event_raw_event_ext4_fc_stats---of 7
trace_event_raw_event_ext4_fc_track_dentry---of 7
trace_event_raw_event_ext4_fc_track_inode---of 7
trace_event_raw_event_ext4_fc_track_range---of 7
trace_event_raw_event_ext4_forget---of 7
trace_event_raw_event_ext4_free_blocks---of 7
trace_event_raw_event_ext4_free_inode---of 7
trace_event_raw_event_ext4_fsmap_class---of 7
trace_event_raw_event_ext4_get_implied_cluster_alloc_exit---of 7
trace_event_raw_event_ext4_getfsmap_class---of 7
trace_event_raw_event_ext4_insert_range---of 7
trace_event_raw_event_ext4_invalidate_folio_op---of 7
trace_event_raw_event_ext4_journal_start_inode---of 7
trace_event_raw_event_ext4_journal_start_reserved---of 7
trace_event_raw_event_ext4_journal_start_sb---of 7
trace_event_raw_event_ext4_lazy_itable_init---of 7
trace_event_raw_event_ext4_load_inode---of 7
trace_event_raw_event_ext4_mark_inode_dirty---of 7
trace_event_raw_event_ext4_mb_discard_preallocations---of 7
trace_event_raw_event_ext4_mb_release_group_pa---of 7
trace_event_raw_event_ext4_mb_release_inode_pa---of 7
trace_event_raw_event_ext4_mballoc_alloc---of 7
trace_event_raw_event_ext4_mballoc_prealloc---of 7
trace_event_raw_event_ext4_nfs_commit_metadata---of 7
trace_event_raw_event_ext4_other_inode_update_time---of 7
trace_event_raw_event_ext4_prefetch_bitmaps---of 7
trace_event_raw_event_ext4_read_block_bitmap_load---of 7
trace_event_raw_event_ext4_remove_blocks---of 7
trace_event_raw_event_ext4_request_blocks---of 7
trace_event_raw_event_ext4_request_inode---of 7
trace_event_raw_event_ext4_shutdown---of 7
trace_event_raw_event_ext4_sync_file_enter---of 7
trace_event_raw_event_ext4_sync_file_exit---of 7
trace_event_raw_event_ext4_sync_fs---of 7
trace_event_raw_event_ext4_unlink_enter---of 7
trace_event_raw_event_ext4_unlink_exit---of 7
trace_event_raw_event_ext4_update_sb---of 7
trace_event_raw_event_ext4_writepages---of 7
trace_event_raw_event_ext4_writepages_result---of 7
trace_ext4_error27%of 15
trace_raw_output_ext4__bitmap_load---of 3
trace_raw_output_ext4__es_extent---of 3
trace_raw_output_ext4__es_shrink_enter---of 3
trace_raw_output_ext4__fallocate_mode---of 3
trace_raw_output_ext4__folio_op---of 3
trace_raw_output_ext4__map_blocks_enter---of 3
trace_raw_output_ext4__map_blocks_exit---of 3
trace_raw_output_ext4__mb_new_pa---of 3
trace_raw_output_ext4__mballoc---of 3
trace_raw_output_ext4__trim---of 3
trace_raw_output_ext4__truncate---of 3
trace_raw_output_ext4__write_begin---of 3
trace_raw_output_ext4__write_end---of 3
trace_raw_output_ext4_alloc_da_blocks---of 3
trace_raw_output_ext4_allocate_blocks---of 3
trace_raw_output_ext4_allocate_inode---of 3
trace_raw_output_ext4_begin_ordered_truncate---of 3
trace_raw_output_ext4_collapse_range---of 3
trace_raw_output_ext4_da_release_space---of 3
trace_raw_output_ext4_da_reserve_space---of 3
trace_raw_output_ext4_da_update_reserve_space---of 3
trace_raw_output_ext4_da_write_pages---of 3
trace_raw_output_ext4_da_write_pages_extent---of 3
trace_raw_output_ext4_discard_blocks---of 3
trace_raw_output_ext4_discard_preallocations---of 3
trace_raw_output_ext4_drop_inode---of 3
trace_raw_output_ext4_error---of 3
trace_raw_output_ext4_es_find_extent_range_enter---of 3
trace_raw_output_ext4_es_find_extent_range_exit---of 3
trace_raw_output_ext4_es_insert_delayed_block---of 3
trace_raw_output_ext4_es_lookup_extent_enter---of 3
trace_raw_output_ext4_es_lookup_extent_exit---of 4
trace_raw_output_ext4_es_remove_extent---of 3
trace_raw_output_ext4_es_shrink---of 3
trace_raw_output_ext4_es_shrink_scan_exit---of 3
trace_raw_output_ext4_evict_inode---of 3
trace_raw_output_ext4_ext_convert_to_initialized_enter---of 3
trace_raw_output_ext4_ext_convert_to_initialized_fastpath---of 3
trace_raw_output_ext4_ext_handle_unwritten_extents---of 3
trace_raw_output_ext4_ext_load_extent---of 3
trace_raw_output_ext4_ext_remove_space---of 3
trace_raw_output_ext4_ext_remove_space_done---of 3
trace_raw_output_ext4_ext_rm_idx---of 3
trace_raw_output_ext4_ext_rm_leaf---of 3
trace_raw_output_ext4_ext_show_extent---of 3
trace_raw_output_ext4_fallocate_exit---of 3
trace_raw_output_ext4_fc_cleanup---of 3
trace_raw_output_ext4_fc_commit_start---of 3
trace_raw_output_ext4_fc_commit_stop---of 3
trace_raw_output_ext4_fc_replay---of 3
trace_raw_output_ext4_fc_replay_scan---of 3
trace_raw_output_ext4_fc_stats---of 3
trace_raw_output_ext4_fc_track_dentry---of 3
trace_raw_output_ext4_fc_track_inode---of 3
trace_raw_output_ext4_fc_track_range---of 3
trace_raw_output_ext4_forget---of 3
trace_raw_output_ext4_free_blocks---of 3
trace_raw_output_ext4_free_inode---of 3
trace_raw_output_ext4_fsmap_class---of 3
trace_raw_output_ext4_get_implied_cluster_alloc_exit---of 3
trace_raw_output_ext4_getfsmap_class---of 3
trace_raw_output_ext4_insert_range---of 3
trace_raw_output_ext4_invalidate_folio_op---of 3
trace_raw_output_ext4_journal_start_inode---of 3
trace_raw_output_ext4_journal_start_reserved---of 3
trace_raw_output_ext4_journal_start_sb---of 3
trace_raw_output_ext4_lazy_itable_init---of 3
trace_raw_output_ext4_load_inode---of 3
trace_raw_output_ext4_mark_inode_dirty---of 3
trace_raw_output_ext4_mb_discard_preallocations---of 3
trace_raw_output_ext4_mb_release_group_pa---of 3
trace_raw_output_ext4_mb_release_inode_pa---of 3
trace_raw_output_ext4_mballoc_alloc---of 5
trace_raw_output_ext4_mballoc_prealloc---of 3
trace_raw_output_ext4_nfs_commit_metadata---of 3
trace_raw_output_ext4_other_inode_update_time---of 3
trace_raw_output_ext4_prefetch_bitmaps---of 3
trace_raw_output_ext4_read_block_bitmap_load---of 3
trace_raw_output_ext4_remove_blocks---of 3
trace_raw_output_ext4_request_blocks---of 3
trace_raw_output_ext4_request_inode---of 3
trace_raw_output_ext4_shutdown---of 3
trace_raw_output_ext4_sync_file_enter---of 3
trace_raw_output_ext4_sync_file_exit---of 3
trace_raw_output_ext4_sync_fs---of 3
trace_raw_output_ext4_unlink_enter---of 3
trace_raw_output_ext4_unlink_exit---of 3
trace_raw_output_ext4_update_sb---of 3
trace_raw_output_ext4_writepages---of 3
trace_raw_output_ext4_writepages_result---of 3
update_super_work---of 12
-----------
SUMMARY39%of 329

-----------
SUMMARY---of 0

__find_nth_and_andnot_bit---of 13
__find_nth_and_bit---of 13
__find_nth_andnot_bit---of 13
__find_nth_bit---of 13
_find_first_and_and_bit---of 6
_find_first_and_bit---of 6
_find_first_bit---of 6
_find_first_zero_bit50%of 6
_find_last_bit40%of 5
_find_next_and_bit---of 6
_find_next_andnot_bit---of 6
_find_next_bit100%of 6
_find_next_or_bit---of 6
_find_next_zero_bit100%of 6
find_next_clump8---of 8
-----------
SUMMARY74%of 23

-----------
SUMMARY---of 0

tomoyo_check_mkdev_acl---of 22
tomoyo_check_open_permission60%of 10
tomoyo_check_path2_acl---of 11
tomoyo_check_path_acl72%of 7
tomoyo_check_path_number_acl42%of 12
tomoyo_compare_name_union50%of 4
tomoyo_compare_number_union75%of 4
tomoyo_execute_permission---of 7
tomoyo_merge_mkdev_acl---of 1
tomoyo_merge_path2_acl---of 1
tomoyo_merge_path_acl---of 1
tomoyo_merge_path_number_acl---of 1
tomoyo_mkdev_perm---of 9
tomoyo_path2_perm55%of 22
tomoyo_path_number_perm56%of 20
tomoyo_path_perm57%of 16
tomoyo_path_permission50%of 8
tomoyo_put_name_union---of 5
tomoyo_put_number_union---of 3
tomoyo_same_mkdev_acl---of 18
tomoyo_same_mount_acl---of 12
tomoyo_same_path2_acl---of 5
tomoyo_same_path_acl---of 3
tomoyo_same_path_number_acl---of 8
tomoyo_update_mount_acl---of 20
tomoyo_write_file---of 44
-----------
SUMMARY56%of 103

-----------
SUMMARY---of 0

__bpf_address_lookup---of 23
__bpf_call_base---of 1
__bpf_free_used_btfs---of 6
__bpf_free_used_maps---of 8
__bpf_prog_array_free_sleepable_cb---of 1
__bpf_prog_free---of 3
__bpf_prog_ret0_warn---of 1
__bpf_prog_ret1---of 1
__bpf_trace_bpf_xdp_link_attach_failed---of 1
__bpf_trace_mem_connect---of 1
__bpf_trace_mem_disconnect---of 1
__bpf_trace_mem_return_failed---of 1
__bpf_trace_xdp_bulk_tx---of 1
__bpf_trace_xdp_cpumap_enqueue---of 1
__bpf_trace_xdp_cpumap_kthread---of 1
__bpf_trace_xdp_devmap_xmit---of 1
__bpf_trace_xdp_exception---of 1
__bpf_trace_xdp_redirect_template---of 1
__probestub_bpf_xdp_link_attach_failed---of 1
__probestub_mem_connect---of 1
__probestub_mem_disconnect---of 1
__probestub_mem_return_failed---of 1
__probestub_xdp_bulk_tx---of 1
__probestub_xdp_cpumap_enqueue---of 1
__probestub_xdp_cpumap_kthread---of 1
__probestub_xdp_devmap_xmit---of 1
__probestub_xdp_exception---of 1
__probestub_xdp_redirect---of 1
__probestub_xdp_redirect_err---of 1
__probestub_xdp_redirect_map---of 1
__probestub_xdp_redirect_map_err---of 1
__traceiter_bpf_xdp_link_attach_failed---of 4
__traceiter_mem_connect---of 4
__traceiter_mem_disconnect---of 4
__traceiter_mem_return_failed---of 4
__traceiter_xdp_bulk_tx---of 4
__traceiter_xdp_cpumap_enqueue---of 4
__traceiter_xdp_cpumap_kthread---of 4
__traceiter_xdp_devmap_xmit---of 4
__traceiter_xdp_exception---of 4
__traceiter_xdp_redirect---of 4
__traceiter_xdp_redirect_err---of 4
__traceiter_xdp_redirect_map---of 4
__traceiter_xdp_redirect_map_err---of 4
bpf_adj_branches---of 35
bpf_get_kallsym---of 17
bpf_get_raw_cpu_id---of 1
bpf_internal_load_pointer_neg_helper---of 8
bpf_jit_add_poke_descriptor---of 11
bpf_jit_alloc_exec---of 1
bpf_jit_alloc_exec_limit---of 1
bpf_jit_binary_alloc---of 10
bpf_jit_binary_free---of 1
bpf_jit_binary_pack_alloc---of 11
bpf_jit_binary_pack_finalize---of 3
bpf_jit_binary_pack_free---of 1
bpf_jit_binary_pack_hdr---of 1
bpf_jit_blind_constants---of 66
bpf_jit_charge_modmem---of 5
bpf_jit_compile---of 1
bpf_jit_fill_hole_with_zero---of 1
bpf_jit_free_exec---of 1
bpf_jit_get_func_addr---of 9
bpf_jit_inlines_helper_call---of 1
bpf_jit_needs_zext---of 1
bpf_jit_prog_release_other---of 1
bpf_jit_supports_far_kfunc_call---of 1
bpf_jit_uncharge_modmem---of 1
bpf_ksym_add---of 11
bpf_ksym_del---of 4
bpf_opcode_in_insntable---of 1
bpf_patch_insn_single---of 22
bpf_prog_alloc---of 9
bpf_prog_alloc_jited_linfo---of 5
bpf_prog_alloc_no_stats---of 18
bpf_prog_array_alloc---of 3
bpf_prog_array_copy---of 21
bpf_prog_array_copy_info---of 11
bpf_prog_array_copy_to_user---of 10
bpf_prog_array_delete_safe---of 5
bpf_prog_array_delete_safe_at---of 8
bpf_prog_array_free---of 3
bpf_prog_array_free_sleepable---of 3
bpf_prog_array_is_empty---of 3
bpf_prog_array_length---of 4
bpf_prog_array_update_at---of 8
bpf_prog_calc_tag---of 15
bpf_prog_fill_jited_linfo---of 7
bpf_prog_free---of 3
bpf_prog_free_deferred---of 25
bpf_prog_jit_attempt_done---of 5
bpf_prog_kallsyms_add---of 11
bpf_prog_kallsyms_del---of 4
bpf_prog_kallsyms_del_all---of 10
bpf_prog_ksym_find---of 11
bpf_prog_map_compatible---of 9
bpf_prog_pack_alloc---of 15
bpf_prog_pack_free---of 16
bpf_prog_realloc---of 6
bpf_prog_select_runtime---of 32
bpf_remove_insns---of 3
bpf_user_rnd_init_once---of 4
bpf_user_rnd_u32---of 5
is_bpf_text_address23%of 18
perf_trace_bpf_xdp_link_attach_failed---of 8
perf_trace_mem_connect---of 8
perf_trace_mem_disconnect---of 8
perf_trace_mem_return_failed---of 8
perf_trace_xdp_bulk_tx---of 8
perf_trace_xdp_cpumap_enqueue---of 8
perf_trace_xdp_cpumap_kthread---of 8
perf_trace_xdp_devmap_xmit---of 8
perf_trace_xdp_exception---of 8
perf_trace_xdp_redirect_template---of 13
search_bpf_extables---of 23
trace_event_raw_event_bpf_xdp_link_attach_failed---of 7
trace_event_raw_event_mem_connect---of 7
trace_event_raw_event_mem_disconnect---of 7
trace_event_raw_event_mem_return_failed---of 7
trace_event_raw_event_xdp_bulk_tx---of 7
trace_event_raw_event_xdp_cpumap_enqueue---of 7
trace_event_raw_event_xdp_cpumap_kthread---of 7
trace_event_raw_event_xdp_devmap_xmit---of 7
trace_event_raw_event_xdp_exception---of 7
trace_event_raw_event_xdp_redirect_template---of 12
trace_raw_output_bpf_xdp_link_attach_failed---of 3
trace_raw_output_mem_connect---of 3
trace_raw_output_mem_disconnect---of 3
trace_raw_output_mem_return_failed---of 3
trace_raw_output_xdp_bulk_tx---of 3
trace_raw_output_xdp_cpumap_enqueue---of 3
trace_raw_output_xdp_cpumap_kthread---of 3
trace_raw_output_xdp_devmap_xmit---of 3
trace_raw_output_xdp_exception---of 3
trace_raw_output_xdp_redirect_template---of 3
-----------
SUMMARY23%of 18

-----------
SUMMARY---of 0

timerqueue_add58%of 7
timerqueue_del60%of 5
timerqueue_iterate_next---of 3
-----------
SUMMARY59%of 12

__io_futex_cancel---of 9
io_futex_cache_free---of 6
io_futex_cache_init67%of 3
io_futex_cancel---of 20
io_futex_complete---of 10
io_futex_prep---of 8
io_futex_remove_all---of 11
io_futex_wait---of 27
io_futex_wake---of 3
io_futex_wake_fn---of 3
io_futex_wakev_fn---of 5
io_futexv_complete---of 10
io_futexv_prep---of 11
io_futexv_wait---of 16
io_ring_submit_unlock---of 6
-----------
SUMMARY67%of 3

__ia32_compat_sys_open_by_handle_at---of 1
__ia32_sys_name_to_handle_at---of 1
__ia32_sys_open_by_handle_at---of 1
__se_sys_name_to_handle_at25%of 20
__x64_compat_sys_open_by_handle_at---of 1
__x64_sys_name_to_handle_at100%of 1
__x64_sys_open_by_handle_at100%of 1
do_handle_open22%of 19
vfs_dentry_acceptable100%of 1
-----------
SUMMARY29%of 42

-----------
SUMMARY---of 0

alloc_tree_mod_elem---of 4
btrfs_get_old_root---of 42
btrfs_get_tree_mod_seq---of 4
btrfs_old_root_level---of 4
btrfs_put_tree_mod_seq---of 12
btrfs_tree_mod_log_eb_copy---of 49
btrfs_tree_mod_log_free_eb7%of 29
btrfs_tree_mod_log_insert_key16%of 13
btrfs_tree_mod_log_insert_move---of 33
btrfs_tree_mod_log_insert_root8%of 28
btrfs_tree_mod_log_lowest_seq67%of 3
btrfs_tree_mod_log_rewind---of 19
tree_mod_log_insert---of 12
tree_mod_log_oldest_root---of 17
tree_mod_log_rewind---of 17
-----------
SUMMARY11%of 73

btrfs_check_dir_item_collision---of 12
btrfs_delete_one_dir_name---of 3
btrfs_insert_dir_item---of 9
btrfs_insert_xattr_item---of 4
btrfs_lookup_dir_index_item---of 4
btrfs_lookup_dir_item50%of 4
btrfs_lookup_xattr50%of 4
btrfs_match_dir_item_name29%of 7
btrfs_search_dir_index_item---of 8
insert_with_overflow---of 8
-----------
SUMMARY40%of 15

dir_is_empty---of 26
dir_search_u36%of 14
ntfs_nls_to_utf1610%of 30
ntfs_read_hdr26%of 31
ntfs_readdir31%of 42
ntfs_utf16_to_nls45%of 9
-----------
SUMMARY27%of 126

btrfs_bio_counter_dec---of 3
btrfs_bio_counter_inc_blocked40%of 5
btrfs_bio_counter_sub---of 3
btrfs_dev_name---of 8
btrfs_dev_replace_by_ioctl---of 85
btrfs_dev_replace_cancel---of 22
btrfs_dev_replace_finishing---of 34
btrfs_dev_replace_is_ongoing34%of 6
btrfs_dev_replace_kthread---of 32
btrfs_dev_replace_status---of 5
btrfs_dev_replace_suspend_for_unmount---of 3
btrfs_dev_replace_update_device_in_mapping_tree---of 13
btrfs_finish_block_group_to_copy---of 22
btrfs_init_dev_replace---of 27
btrfs_resume_dev_replace_async---of 11
btrfs_rm_dev_replace_blocked---of 5
btrfs_rm_dev_replace_unblocked---of 1
btrfs_run_dev_replace15%of 14
rcu_read_unlock---of 6
-----------
SUMMARY24%of 25

cipso2_seq_show---of 5
cipso_seq_next---of 6
cipso_seq_show---of 6
cipso_seq_start---of 1
load2_seq_next---of 6
load2_seq_show---of 4
load2_seq_start---of 1
load_self2_seq_next---of 6
load_self2_seq_show---of 1
load_self2_seq_start---of 1
load_self_seq_next---of 6
load_self_seq_show---of 1
load_self_seq_start---of 1
load_seq_show---of 4
net4addr_seq_next---of 6
net4addr_seq_show---of 3
net4addr_seq_start---of 1
net6addr_seq_next---of 6
net6addr_seq_show---of 3
net6addr_seq_start---of 1
onlycap_seq_next---of 6
onlycap_seq_show---of 1
onlycap_seq_start---of 1
relabel_self_seq_next---of 6
relabel_self_seq_show---of 1
relabel_self_seq_start---of 1
smk_cipso_doi---of 6
smk_destroy_label_list---of 4
smk_fill_rule---of 41
smk_fill_super---of 3
smk_get_tree---of 1
smk_init_fs_context---of 1
smk_net4addr_insert---of 11
smk_net6addr_insert---of 11
smk_open_cipso---of 1
smk_open_cipso2---of 1
smk_open_load---of 1
smk_open_load2---of 1
smk_open_load_self---of 1
smk_open_load_self2---of 1
smk_open_net4addr---of 1
smk_open_net6addr---of 1
smk_open_onlycap---of 1
smk_open_relabel_self---of 1
smk_parse_label_list---of 9
smk_parse_long_rule---of 22
smk_read_ambient---of 4
smk_read_direct---of 3
smk_read_doi---of 3
smk_read_logging---of 3
smk_read_mapped---of 3
smk_read_ptrace---of 3
smk_read_syslog---of 4
smk_rule_show---of 18
smk_seq_start---of 20
smk_seq_stop---of 6
smk_set_access---of 10
smk_set_cipso---of 35
smk_unlbl_ambient---of 8
smk_write_access---of 6
smk_write_access2---of 5
smk_write_ambient---of 6
smk_write_change_rule---of 3
smk_write_cipso---of 1
smk_write_cipso2---of 1
smk_write_direct19%of 11
smk_write_doi---of 6
smk_write_load---of 8
smk_write_load2---of 3
smk_write_load_self---of 1
smk_write_load_self2---of 1
smk_write_logging---of 7
smk_write_mapped55%of 11
smk_write_net4addr---of 33
smk_write_net6addr---of 52
smk_write_onlycap---of 14
smk_write_ptrace---of 7
smk_write_relabel_self---of 17
smk_write_revoke_subj---of 10
smk_write_rules_list---of 20
smk_write_syslog---of 6
-----------
SUMMARY37%of 22

___pmd_free_tlb---of 5
___pte_free_tlb50%of 4
___pud_free_tlb---of 4
__native_set_fixmap---of 3
arch_check_zapped_pmd---of 7
arch_check_zapped_pte29%of 7
lruvec_stat_sub_folio40%of 15
native_set_fixmap---of 7
pgd_alloc---of 7
pgd_free---of 3
pgd_page_get_mm---of 1
pmd_clear_huge---of 3
pmd_free_pte_page---of 1
pmd_mkwrite---of 3
pmd_set_huge---of 8
pmdp_clear_flush_young---of 6
pmdp_invalidate_ad---of 5
pmdp_set_access_flags---of 4
pmdp_test_and_clear_young---of 3
pte_alloc_one30%of 20
pte_mkwrite67%of 3
ptep_clear_flush_young---of 3
ptep_set_access_flags67%of 3
ptep_test_and_clear_young---of 3
pud_clear_huge---of 3
pud_free_pmd_page---of 13
pud_set_huge---of 7
pudp_set_access_flags---of 4
pudp_test_and_clear_young---of 3
-----------
SUMMARY39%of 52

__printk_safe_enter100%of 1
__printk_safe_exit100%of 1
vprintk50%of 4
-----------
SUMMARY67%of 6

__bio_split_to_limits26%of 35
__blk_rq_map_sg37%of 41
attempt_merge9%of 34
bio_attempt_back_merge29%of 32
bio_attempt_discard_merge---of 17
bio_attempt_front_merge---of 61
bio_split_rw60%of 25
bio_split_to_limits---of 8
bio_will_gap8%of 25
blk_account_io_merge_bio43%of 7
blk_account_io_merge_request---of 9
blk_attempt_bio_merge32%of 16
blk_attempt_plug_merge50%of 8
blk_attempt_req_merge100%of 1
blk_bio_list_merge---of 5
blk_mq_sched_try_merge11%of 19
blk_recalc_rq_segments---of 21
blk_rq_merge_ok25%of 16
blk_rq_set_mixed_merge---of 7
blk_try_merge---of 6
ll_back_merge_fn16%of 32
ll_merge_requests_fn---of 19
req_attempt_discard_merge---of 16
trace_block_rq_merge---of 15
-----------
SUMMARY27%of 291

__fsnotify_recalc_mask---of 19
fsnotify_add_mark---of 5
fsnotify_add_mark_locked---of 50
fsnotify_clear_marks_by_group---of 28
fsnotify_compare_groups---of 7
fsnotify_conn_mask---of 6
fsnotify_connector_destroy_workfn---of 4
fsnotify_destroy_mark---of 8
fsnotify_destroy_marks9%of 23
fsnotify_detach_mark---of 13
fsnotify_find_mark---of 17
fsnotify_finish_user_wait---of 21
fsnotify_free_mark---of 4
fsnotify_get_mark---of 6
fsnotify_grab_connector31%of 13
fsnotify_init_mark---of 1
fsnotify_mark_destroy_workfn---of 8
fsnotify_prepare_user_wait---of 21
fsnotify_put_mark---of 28
fsnotify_recalc_mask---of 4
fsnotify_update_sb_watchers---of 16
fsnotify_wait_marks_destroyed---of 1
-----------
SUMMARY17%of 36

-----------
SUMMARY---of 0

dev_exception_add---of 15
dev_exception_rm---of 15
dev_exceptions_copy---of 14
devcgroup_access_write---of 167
devcgroup_check_permission13%of 49
devcgroup_css_alloc---of 3
devcgroup_css_free---of 8
devcgroup_offline---of 1
devcgroup_online---of 4
devcgroup_seq_show---of 29
parent_has_perm---of 35
-----------
SUMMARY13%of 49

-----------
SUMMARY---of 0

__read_end_io---of 15
bio_first_folio---of 18
bio_post_read_processing---of 7
decrypt_work---of 3
ext4_exit_post_read_processing---of 1
ext4_mpage_readpages14%of 117
folio_size30%of 10
folio_zero_segment50%of 14
mpage_end_io---of 4
verity_work---of 1
-----------
SUMMARY19%of 141

-----------
SUMMARY---of 0

alloc_mnt_idmap---of 16
from_vfsgid40%of 5
from_vfsuid40%of 5
make_vfsgid34%of 6
make_vfsuid34%of 6
mnt_idmap_get40%of 5
mnt_idmap_put---of 8
vfsgid_in_group_p100%of 1
-----------
SUMMARY40%of 28

__cpu_to_node50%of 4
__node_distance---of 3
cpumask_of_node---of 4
debug_cpumask_set_cpu---of 8
early_cpu_to_node---of 7
memory_add_physaddr_to_nid---of 12
numa_clear_node---of 1
numa_cpu_node---of 7
numa_set_node---of 8
phys_to_target_node---of 21
-----------
SUMMARY50%of 4

__check_object_size17%of 43
check_stack_object75%of 4
-----------
SUMMARY22%of 47

____fput100%of 1
__fput49%of 29
__fput_sync67%of 3
alloc_empty_backing_file---of 4
alloc_empty_file34%of 9
alloc_empty_file_noaccount---of 4
alloc_file_clone---of 3
alloc_file_pseudo50%of 4
alloc_file_pseudo_noaccount43%of 7
backing_file_user_path---of 1
delayed_fput---of 4
file_free50%of 8
file_init_path70%of 13
flush_delayed_fput---of 4
fput45%of 9
get_max_files---of 1
init_file60%of 5
proc_nr_files---of 1
put_cred---of 4
-----------
SUMMARY52%of 88

-----------
SUMMARY---of 0

__ext4_xattr_set_credits28%of 18
check_xattrs25%of 40
ext4_evict_ea_inode---of 8
ext4_expand_extra_isize_ea---of 80
ext4_get_inode_usage---of 20
ext4_listxattr---of 43
ext4_xattr_block_cache_insert---of 3
ext4_xattr_block_csum---of 10
ext4_xattr_block_csum_set---of 7
ext4_xattr_block_find17%of 12
ext4_xattr_block_set---of 141
ext4_xattr_create_cache---of 1
ext4_xattr_delete_inode21%of 34
ext4_xattr_destroy_cache---of 3
ext4_xattr_get24%of 26
ext4_xattr_ibody_find54%of 13
ext4_xattr_ibody_get25%of 20
ext4_xattr_ibody_set24%of 17
ext4_xattr_inode_array_free40%of 5
ext4_xattr_inode_dec_ref_all---of 40
ext4_xattr_inode_free_quota---of 6
ext4_xattr_inode_get---of 8
ext4_xattr_inode_iget---of 7
ext4_xattr_inode_inc_ref_all---of 15
ext4_xattr_inode_lookup_create---of 72
ext4_xattr_inode_read---of 20
ext4_xattr_inode_set_class---of 1
ext4_xattr_inode_update_ref---of 16
ext4_xattr_inode_verify_hashes---of 24
ext4_xattr_release_block---of 32
ext4_xattr_set---of 8
ext4_xattr_set_credits---of 11
ext4_xattr_set_entry28%of 76
ext4_xattr_set_handle25%of 69
ext4_xattr_update_super_block40%of 5
ext4_xattr_value_same---of 4
lock_buffer---of 3
mb_cache_entry_put---of 4
-----------
SUMMARY27%of 335

char2uni---of 1
uni2char40%of 5
-----------
SUMMARY40%of 5

capi20_proc_show---of 4
capi20ncci_proc_show---of 7
capi_compat_ioctl---of 5
capi_open---of 4
capi_poll40%of 5
capi_read---of 14
capi_recv_message---of 20
capi_release---of 5
capi_unlocked_ioctl---of 46
capi_write---of 16
capiminor_destroy---of 6
capinc_tty_chars_in_buffer---of 3
capinc_tty_cleanup---of 1
capinc_tty_close---of 1
capinc_tty_flush_chars---of 3
capinc_tty_hangup---of 1
capinc_tty_install---of 10
capinc_tty_open---of 3
capinc_tty_put_char---of 11
capinc_tty_send_xchar---of 3
capinc_tty_start---of 1
capinc_tty_stop---of 1
capinc_tty_throttle---of 1
capinc_tty_unthrottle---of 1
capinc_tty_write---of 7
capinc_tty_write_room---of 3
capincci_alloc---of 13
capincci_free---of 12
handle_minor_recv---of 18
handle_minor_send---of 18
-----------
SUMMARY40%of 5

__bpf_trace_csd_function---of 1
__bpf_trace_csd_queue_cpu---of 1
__flush_smp_call_function_queue---of 130
__probestub_csd_function_entry---of 1
__probestub_csd_function_exit---of 1
__probestub_csd_queue_cpu---of 1
__smp_call_single_queue28%of 22
__traceiter_csd_function_entry---of 4
__traceiter_csd_function_exit---of 4
__traceiter_csd_queue_cpu---of 4
do_nothing---of 1
flush_smp_call_function_queue---of 14
generic_exec_single---of 51
generic_smp_call_function_single_interrupt---of 1
kick_all_cpus_sync---of 3
on_each_cpu_cond_mask67%of 3
perf_trace_csd_function---of 8
perf_trace_csd_queue_cpu---of 8
smp_call_function---of 3
smp_call_function_any---of 20
smp_call_function_many---of 1
smp_call_function_many_cond15%of 189
smp_call_function_single---of 83
smp_call_function_single_async---of 5
smp_call_on_cpu---of 6
smp_call_on_cpu_callback---of 5
smpcfd_dead_cpu---of 3
smpcfd_dying_cpu---of 1
smpcfd_prepare_cpu---of 3
trace_csd_queue_cpu---of 15
trace_event_raw_event_csd_function---of 7
trace_event_raw_event_csd_queue_cpu---of 7
trace_raw_output_csd_function---of 3
trace_raw_output_csd_queue_cpu---of 3
wake_up_all_idle_cpus---of 10
-----------
SUMMARY17%of 214

__folio_throttle_swaprate23%of 9
__ia32_sys_swapoff---of 1
__ia32_sys_swapon---of 1
__page_file_index---of 7
__se_sys_swapoff---of 239
__se_sys_swapon---of 204
__swap_count---of 3
__swap_duplicate---of 16
__swap_entry_free---of 12
__try_to_reclaim_swap---of 52
__x64_sys_swapoff---of 1
__x64_sys_swapon---of 1
_enable_swap_info---of 9
add_swap_count_continuation---of 18
add_swap_extent---of 8
count_swap_pages---of 7
del_from_avail_list---of 8
find_first_swap---of 7
folio_free_swap---of 55
folio_lock---of 9
folio_order---of 9
free_cluster---of 6
free_swap_and_cache_nr---of 13
generic_max_swapfile_size---of 1
get_swap_device---of 25
get_swap_page_of_type---of 8
get_swap_pages---of 25
has_usable_swap---of 1
percpu_ref_put---of 14
put_swap_folio---of 43
scan_swap_map_slots---of 110
scan_swap_map_try_ssd_cluster---of 28
si_swapinfo---of 8
swap_count_continued---of 20
swap_discard_work---of 1
swap_do_scheduled_discard---of 20
swap_duplicate---of 4
swap_folio_sector---of 9
swap_free---of 8
swap_next---of 12
swap_page_trans_huge_swapped---of 8
swap_range_free---of 19
swap_shmem_alloc---of 1
swap_show---of 3
swap_start---of 9
swap_stop---of 1
swap_swapcount---of 4
swap_type_of---of 11
swap_users_ref_free---of 1
swapcache_clear---of 9
swapcache_free_entries---of 32
swapcache_mapping---of 3
swapcache_prepare---of 1
swapdev_block---of 12
swaps_open---of 3
swaps_poll---of 6
swp_entry_cmp---of 1
swp_swap_info---of 3
swp_swapcount---of 16
-----------
SUMMARY23%of 9

debugfs_atomic_t_get---of 1
debugfs_atomic_t_set---of 1
debugfs_attr_read---of 5
debugfs_attr_write---of 5
debugfs_attr_write_signed---of 5
debugfs_create_atomic_t---of 1
debugfs_create_blob---of 1
debugfs_create_bool---of 1
debugfs_create_devm_seqfile---of 4
debugfs_create_regset32---of 1
debugfs_create_size_t---of 1
debugfs_create_str---of 1
debugfs_create_u16---of 1
debugfs_create_u32---of 1
debugfs_create_u32_array---of 1
debugfs_create_u64---of 1
debugfs_create_u8---of 1
debugfs_create_ulong---of 1
debugfs_create_x16---of 1
debugfs_create_x32---of 1
debugfs_create_x64---of 1
debugfs_create_x8---of 1
debugfs_devm_entry_open---of 1
debugfs_enter_cancellation---of 9
debugfs_file_get34%of 15
debugfs_file_put---of 4
debugfs_leave_cancellation---of 6
debugfs_print_regs32---of 6
debugfs_read_file_bool---of 5
debugfs_read_file_str---of 10
debugfs_real_fops---of 3
debugfs_regset32_open---of 1
debugfs_regset32_show---of 8
debugfs_size_t_get---of 1
debugfs_size_t_set---of 1
debugfs_u16_get---of 1
debugfs_u16_set---of 1
debugfs_u32_get---of 1
debugfs_u32_set---of 1
debugfs_u64_get---of 1
debugfs_u64_set---of 1
debugfs_u8_get---of 1
debugfs_u8_set---of 1
debugfs_ulong_get---of 1
debugfs_ulong_set---of 1
debugfs_write_file_bool---of 6
debugfs_write_file_str---of 17
default_read_file---of 1
default_write_file---of 1
fops_atomic_t_open---of 1
fops_atomic_t_ro_open---of 1
fops_atomic_t_wo_open---of 1
fops_size_t_open---of 1
fops_size_t_ro_open---of 1
fops_size_t_wo_open---of 1
fops_u16_open---of 1
fops_u16_ro_open---of 1
fops_u16_wo_open---of 1
fops_u32_open---of 1
fops_u32_ro_open---of 1
fops_u32_wo_open---of 1
fops_u64_open---of 1
fops_u64_ro_open---of 1
fops_u64_wo_open---of 1
fops_u8_open---of 1
fops_u8_ro_open---of 1
fops_u8_wo_open---of 1
fops_ulong_open---of 1
fops_ulong_ro_open---of 1
fops_ulong_wo_open---of 1
fops_x16_open---of 1
fops_x16_ro_open---of 1
fops_x16_wo_open---of 1
fops_x32_open---of 1
fops_x32_ro_open---of 1
fops_x32_wo_open---of 1
fops_x64_open---of 1
fops_x64_ro_open---of 1
fops_x64_wo_open---of 1
fops_x8_open---of 1
fops_x8_ro_open---of 1
fops_x8_wo_open---of 1
full_proxy_llseek---of 7
full_proxy_open---of 41
full_proxy_poll---of 7
full_proxy_read43%of 7
full_proxy_release---of 10
full_proxy_unlocked_ioctl---of 7
full_proxy_write---of 7
open_proxy_open---of 25
read_file_blob---of 5
u32_array_open---of 5
u32_array_read---of 1
u32_array_release---of 1
write_file_blob---of 5
-----------
SUMMARY37%of 22

-----------
SUMMARY---of 0

__add_to_discard_list---of 19
__btrfs_discard_schedule_work---of 22
add_to_discard_unused_list---of 9
btrfs_discard_calc_delay---of 9
btrfs_discard_cancel_work---of 9
btrfs_discard_check_filter---of 18
btrfs_discard_cleanup---of 22
btrfs_discard_init---of 1
btrfs_discard_punt_unused_bgs_list---of 6
btrfs_discard_queue_work---of 8
btrfs_discard_resume---of 7
btrfs_discard_schedule_work---of 1
btrfs_discard_stop---of 1
btrfs_discard_update_discardable19%of 11
btrfs_discard_workfn---of 44
-----------
SUMMARY19%of 11

-----------
SUMMARY---of 0

__change_pid---of 14
__ia32_sys_pidfd_getfd---of 1
__ia32_sys_pidfd_open---of 1
__se_sys_pidfd_getfd---of 16
__se_sys_pidfd_open---of 8
__task_pid_nr_ns31%of 26
__x64_sys_pidfd_getfd---of 1
__x64_sys_pidfd_open---of 1
alloc_pid---of 40
attach_pid---of 6
change_pid---of 6
delayed_put_pid---of 5
detach_pid---of 1
disable_pid_allocation---of 1
exchange_tids---of 5
find_ge_pid---of 1
find_get_pid---of 17
find_get_task_by_vpid---of 17
find_pid_ns---of 1
find_task_by_pid_ns---of 13
find_task_by_vpid---of 3
find_vpid---of 3
free_pid---of 8
get_pid_task27%of 15
get_task_pid27%of 23
pid_nr_ns---of 5
pid_task30%of 10
pid_vnr---of 7
pidfd_get_pid---of 9
pidfd_get_task---of 8
put_pid40%of 5
task_active_pid_ns67%of 3
transfer_pid---of 6
-----------
SUMMARY31%of 82

-----------
SUMMARY---of 0

__copy_xstate_to_uabi_buf---of 38
__raw_xsave_addr---of 11
__xfd_enable_feature---of 42
arch_set_user_pkey_access---of 20
copy_sigframe_from_user_to_xstate---of 1
copy_uabi_from_kernel_to_xstate---of 1
copy_uabi_to_xstate---of 40
copy_xstate_to_uabi_buf---of 1
cpu_has_xfeatures---of 4
fpstate_clear_xstate_component---of 4
fpstate_free---of 3
fpu__init_cpu_xstate---of 20
fpu__resume_cpu---of 15
fpu_xstate_prctl---of 35
get_xsave_addr---of 8
proc_pid_arch_status---of 6
xfd_enable_feature---of 1
xfd_validate_state29%of 7
xfeature_get_offset---of 16
xfeature_size---of 4
xrstors---of 10
xsaves---of 10
xstate_calculate_size---of 8
xstate_get_guest_group_perm---of 1
-----------
SUMMARY29%of 7

__phys_addr50%of 8
__phys_addr_symbol67%of 3
__virt_addr_valid26%of 35
-----------
SUMMARY33%of 46

-----------
SUMMARY---of 0

ima_iint_find75%of 4
ima_iint_init_once---of 1
ima_inode_free40%of 5
ima_inode_get---of 10
-----------
SUMMARY56%of 9

__ia32_sys_futimesat---of 6
__ia32_sys_futimesat_time32---of 1
__ia32_sys_utime---of 1
__ia32_sys_utime32---of 1
__ia32_sys_utimensat---of 7
__ia32_sys_utimensat_time32---of 7
__ia32_sys_utimes---of 1
__ia32_sys_utimes_time32---of 1
__se_sys_utime34%of 9
__se_sys_utime32---of 9
__se_sys_utimes---of 10
__x64_sys_futimesat---of 6
__x64_sys_futimesat_time32---of 1
__x64_sys_utime100%of 1
__x64_sys_utime32---of 1
__x64_sys_utimensat---of 7
__x64_sys_utimensat_time32---of 7
__x64_sys_utimes---of 1
__x64_sys_utimes_time32---of 1
do_compat_futimesat---of 8
do_utimes---of 10
vfs_utimes20%of 21
-----------
SUMMARY26%of 31

-----------
SUMMARY---of 0

__dev_printk---of 8
__device_link_del---of 13
__device_links_no_driver---of 12
__device_links_queue_sync_state---of 18
__fw_devlink_link_to_consumers---of 31
__fw_devlink_link_to_suppliers---of 14
__fw_devlink_pickup_dangling_consumers---of 16
__fw_devlink_relax_cycles---of 51
__fwnode_link_add---of 10
__fwnode_link_cycle---of 3
__root_device_register---of 9
_dev_alert---of 1
_dev_crit---of 1
_dev_emerg---of 1
_dev_err---of 1
_dev_info---of 1
_dev_notice---of 1
_dev_printk---of 1
_dev_warn---of 1
auto_remove_on_show---of 1
class_dir_child_ns_type---of 1
class_dir_release---of 1
cleanup_glue_dir---of 11
dev_attr_show---of 4
dev_attr_store---of 3
dev_driver_string---of 5
dev_err_probe---of 4
dev_printk_emit---of 1
dev_set_name---of 1
dev_show---of 1
dev_uevent---of 32
dev_uevent_filter---of 5
dev_uevent_name---of 4
dev_vprintk_emit---of 8
device_add---of 52
device_add_attrs---of 32
device_add_class_symlinks---of 15
device_add_groups---of 1
device_change_owner---of 22
device_check_offline---of 10
device_create---of 6
device_create_bin_file---of 3
device_create_file---of 8
device_create_release---of 4
device_create_sys_dev_entry---of 3
device_create_with_groups---of 6
device_del---of 37
device_destroy---of 5
device_find_any_child---of 7
device_find_child---of 9
device_find_child_by_name---of 11
device_for_each_child---of 6
device_for_each_child_reverse---of 6
device_get_devnode---of 16
device_get_ownership---of 4
device_initialize---of 3
device_is_dependent---of 16
device_link_add---of 65
device_link_del---of 1
device_link_init_status---of 8
device_link_put_kref---of 6
device_link_release_fn---of 10
device_link_remove---of 5
device_link_wait_removal---of 1
device_links_busy---of 7
device_links_check_suppliers---of 35
device_links_driver_bound---of 59
device_links_driver_cleanup---of 16
device_links_flush_sync_list---of 16
device_links_force_bind---of 9
device_links_no_driver---of 7
device_links_read_lock---of 1
device_links_read_lock_held---of 3
device_links_read_unlock---of 3
device_links_supplier_sync_state_pause---of 1
device_links_supplier_sync_state_resume---of 8
device_links_unbind_consumers---of 11
device_match_acpi_dev---of 1
device_match_acpi_handle---of 3
device_match_any---of 1
device_match_devt---of 1
device_match_fwnode---of 1
device_match_name---of 3
device_match_of_node---of 1
device_move---of 36
device_namespace---of 4
device_offline---of 13
device_online---of 7
device_pm_move_to_tail---of 3
device_register---of 1
device_release---of 10
device_remove_attrs---of 9
device_remove_bin_file---of 3
device_remove_class_symlinks---of 10
device_remove_file---of 3
device_remove_file_self---of 3
device_remove_groups---of 1
device_rename---of 12
device_reorder_to_tail---of 15
device_set_node---of 3
device_set_of_node_from_dev---of 1
device_show_bool---of 1
device_show_int---of 1
device_show_string---of 1
device_show_ulong---of 1
device_shutdown---of 30
device_store_bool---of 1
device_store_int---of 4
device_store_ulong---of 3
device_unregister---of 6
devices_kset_move_after---of 11
devices_kset_move_before---of 11
devices_kset_move_last---of 9
devlink_add_symlinks---of 31
devlink_dev_release---of 1
devlink_remove_symlinks---of 23
devm_attr_group_remove---of 3
devm_device_add_group---of 4
fw_devlink_create_devlink---of 59
fw_devlink_dev_sync_state---of 15
fw_devlink_drivers_done---of 1
fw_devlink_is_strict---of 3
fw_devlink_link_device---of 3
fw_devlink_no_driver---of 6
fw_devlink_parse_fwtree---of 9
fw_devlink_probing_done---of 1
fw_devlink_purge_absent_suppliers---of 5
fw_devlink_unblock_consumers---of 10
fwnode_link_add---of 1
fwnode_links_purge---of 1
fwnode_links_purge_consumers---of 10
fwnode_links_purge_suppliers---of 10
get_device---of 3
get_device_parent---of 17
kill_device---of 6
klist_children_get---of 3
klist_children_put---of 3
kref_get---of 4
lock_device_hotplug---of 1
lock_device_hotplug_sysfs---of 3
online_show---of 1
online_store---of 10
pm_runtime_put_noidle---of 5
put_device67%of 3
refcount_inc---of 4
removable_show---of 1
root_device_release---of 1
root_device_unregister---of 8
runtime_pm_show---of 1
set_primary_fwnode---of 12
set_secondary_fwnode---of 6
status_show---of 8
sync_state_only_show---of 1
sync_state_resume_initcall---of 1
uevent_show---of 15
uevent_store---of 3
unlock_device_hotplug---of 1
virtual_device_parent---of 3
waiting_for_supplier_show---of 6
-----------
SUMMARY67%of 3

__dd_dispatch_request55%of 37
dd_async_depth_show---of 1
dd_bio_merge67%of 3
dd_depth_updated---of 1
dd_dispatch_request26%of 31
dd_exit_sched---of 13
dd_finish_request---of 5
dd_has_work16%of 19
dd_init_hctx---of 1
dd_init_sched---of 4
dd_insert_requests34%of 45
dd_limit_depth67%of 3
dd_merged_requests---of 19
dd_owned_by_driver_show---of 10
dd_prepare_request100%of 1
dd_queued_show---of 10
dd_request_merge34%of 9
dd_request_merged---of 5
deadline_async_depth_show---of 1
deadline_async_depth_store---of 4
deadline_batching_show---of 1
deadline_dispatch0_next---of 1
deadline_dispatch0_start---of 1
deadline_dispatch0_stop---of 1
deadline_dispatch1_next---of 1
deadline_dispatch1_start---of 1
deadline_dispatch1_stop---of 1
deadline_dispatch2_next---of 1
deadline_dispatch2_start---of 1
deadline_dispatch2_stop---of 1
deadline_fifo_batch_show---of 1
deadline_fifo_batch_store---of 4
deadline_front_merges_show---of 1
deadline_front_merges_store---of 5
deadline_prio_aging_expire_show---of 1
deadline_prio_aging_expire_store---of 4
deadline_read0_fifo_next---of 1
deadline_read0_fifo_start---of 1
deadline_read0_fifo_stop---of 1
deadline_read0_next_rq_show---of 5
deadline_read1_fifo_next---of 1
deadline_read1_fifo_start---of 1
deadline_read1_fifo_stop---of 1
deadline_read1_next_rq_show---of 5
deadline_read2_fifo_next---of 1
deadline_read2_fifo_start---of 1
deadline_read2_fifo_stop---of 1
deadline_read2_next_rq_show---of 5
deadline_read_expire_show---of 1
deadline_read_expire_store---of 4
deadline_starved_show---of 1
deadline_write0_fifo_next---of 1
deadline_write0_fifo_start---of 1
deadline_write0_fifo_stop---of 1
deadline_write0_next_rq_show---of 5
deadline_write1_fifo_next---of 1
deadline_write1_fifo_start---of 1
deadline_write1_fifo_stop---of 1
deadline_write1_next_rq_show---of 5
deadline_write2_fifo_next---of 1
deadline_write2_fifo_start---of 1
deadline_write2_fifo_stop---of 1
deadline_write2_next_rq_show---of 5
deadline_write_expire_show---of 1
deadline_write_expire_store---of 4
deadline_writes_starved_show---of 1
deadline_writes_starved_store---of 3
-----------
SUMMARY37%of 148

__ia32_sys_fgetxattr---of 1
__ia32_sys_flistxattr---of 7
__ia32_sys_fremovexattr---of 1
__ia32_sys_fsetxattr---of 1
__ia32_sys_getxattr---of 1
__ia32_sys_lgetxattr---of 1
__ia32_sys_listxattr---of 5
__ia32_sys_llistxattr---of 5
__ia32_sys_lremovexattr---of 1
__ia32_sys_lsetxattr---of 1
__ia32_sys_removexattr---of 1
__ia32_sys_setxattr---of 1
__se_sys_fgetxattr---of 11
__se_sys_fremovexattr---of 9
__se_sys_fsetxattr---of 9
__vfs_getxattr50%of 20
__vfs_removexattr---of 20
__vfs_removexattr_locked---of 18
__vfs_setxattr---of 20
__vfs_setxattr_locked---of 10
__vfs_setxattr_noperm---of 22
__x64_sys_fgetxattr---of 1
__x64_sys_flistxattr---of 7
__x64_sys_fremovexattr---of 1
__x64_sys_fsetxattr---of 1
__x64_sys_getxattr---of 1
__x64_sys_lgetxattr---of 1
__x64_sys_listxattr---of 5
__x64_sys_llistxattr---of 5
__x64_sys_lremovexattr---of 1
__x64_sys_lsetxattr---of 1
__x64_sys_removexattr---of 1
__x64_sys_setxattr---of 1
do_getxattr---of 14
do_setxattr---of 4
generic_listxattr---of 12
listxattr---of 16
may_write_xattr---of 5
path_getxattr---of 9
path_removexattr---of 7
path_setxattr---of 7
removexattr---of 5
setxattr---of 12
setxattr_copy---of 8
simple_xattr_add50%of 4
simple_xattr_alloc50%of 4
simple_xattr_free---of 3
simple_xattr_get45%of 9
simple_xattr_list---of 11
simple_xattr_set---of 19
simple_xattr_space100%of 1
simple_xattrs_free38%of 8
simple_xattrs_init100%of 1
vfs_getxattr---of 10
vfs_getxattr_alloc---of 22
vfs_listxattr---of 5
vfs_removexattr---of 8
vfs_setxattr---of 14
xattr_full_name67%of 3
xattr_list_one---of 4
xattr_permission---of 16
xattr_supports_user_prefix---of 8
-----------
SUMMARY50%of 50

bt_dev_init---of 4
bt_xmit---of 131
chan_alloc_skb_cb---of 3
chan_close_cb---of 26
chan_get_sndtimeo_cb---of 1
chan_new_conn_cb---of 4
chan_ready_cb---of 19
chan_recv_cb---of 26
chan_resume_cb---of 7
chan_state_change_cb---of 12
chan_suspend_cb---of 7
delete_netdev---of 1
device_event---of 8
do_enable_set---of 38
do_notify_peers---of 1
get_l2cap_conn---of 29
give_skb_to_upper---of 3
header_create---of 1
l2cap_chan_no_defer---of 1
l2cap_chan_no_set_shutdown---of 1
l2cap_chan_no_teardown---of 1
lookup_dev---of 14
lookup_peer---of 17
lowpan_control_open---of 1
lowpan_control_show29%of 7
lowpan_control_write---of 24
lowpan_enable_fops_open---of 1
lowpan_enable_get---of 1
lowpan_enable_set---of 3
netdev_setup---of 1
-----------
SUMMARY29%of 7

page_counter_cancel50%of 4
page_counter_charge84%of 6
page_counter_memparse---of 4
page_counter_set_low---of 4
page_counter_set_max---of 5
page_counter_set_min---of 4
page_counter_try_charge56%of 9
page_counter_uncharge75%of 4
propagate_protected_usage50%of 8
-----------
SUMMARY62%of 31

-----------
SUMMARY---of 0

mempool_alloc_noprof42%of 12
mempool_alloc_pages---of 1
mempool_alloc_preallocated---of 3
mempool_alloc_slab67%of 3
mempool_create_node_noprof---of 7
mempool_destroy---of 5
mempool_exit---of 4
mempool_free23%of 9
mempool_free_pages---of 1
mempool_free_slab100%of 1
mempool_init_node---of 13
mempool_init_noprof---of 1
mempool_kfree---of 1
mempool_kmalloc---of 1
mempool_kvfree---of 1
mempool_kvmalloc---of 1
mempool_resize---of 17
remove_element---of 6
-----------
SUMMARY40%of 25

lock_page27%of 15
minix_add_link---of 24
minix_delete_entry50%of 8
minix_dotdot---of 5
minix_empty_dir12%of 17
minix_find_entry38%of 16
minix_inode_by_name50%of 4
minix_make_empty---of 22
minix_readdir---of 14
minix_set_link---of 8
unmap_and_put_page30%of 10
-----------
SUMMARY30%of 70

__ia32_sys_getgroups---of 1
__ia32_sys_setgroups---of 1
__se_sys_getgroups---of 8
__se_sys_setgroups---of 19
__x64_sys_getgroups---of 1
__x64_sys_setgroups---of 1
gid_cmp---of 1
groups_alloc---of 3
groups_free---of 1
groups_search---of 8
groups_sort---of 1
in_egroup_p---of 9
in_group_p45%of 9
may_setgroups---of 3
set_current_groups---of 4
set_groups---of 7
-----------
SUMMARY45%of 9

-----------
SUMMARY---of 0

__ia32_sys_madvise---of 1
__ia32_sys_process_madvise---of 1
__se_sys_process_madvise---of 22
__x64_sys_madvise---of 1
__x64_sys_process_madvise---of 1
anon_vma_name50%of 4
anon_vma_name_alloc---of 3
anon_vma_name_free---of 1
do_madvise---of 258
folio_get---of 3
folio_large_mapcount---of 9
folio_likely_mapped_shared---of 28
folio_lock---of 9
folio_mapcount---of 9
folio_put---of 4
madvise_cold_or_pageout_pte_range---of 191
madvise_dontneed_free_valid_vma---of 12
madvise_folio_pte_batch---of 40
madvise_free_pte_range---of 138
madvise_set_anon_name---of 21
madvise_update_vma---of 42
swapin_walk_pmd_entry---of 25
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

__ia32_sys_msync---of 1
__se_sys_msync34%of 36
__x64_sys_msync100%of 1
-----------
SUMMARY36%of 37

bio_first_folio---of 18
btree_csum_one_bio---of 14
btree_invalidate_folio---of 18
btree_migrate_folio---of 11
btree_release_folio---of 16
btree_writepages40%of 5
btrfs_add_log_tree---of 8
btrfs_alloc_log_tree_node---of 3
btrfs_alloc_root---of 6
btrfs_block_group_root---of 3
btrfs_btree_balance_dirty---of 4
btrfs_btree_balance_dirty_nodelay---of 4
btrfs_buffer_uptodate20%of 10
btrfs_check_features---of 17
btrfs_check_leaked_roots---of 1
btrfs_check_super_csum---of 1
btrfs_check_uuid_tree---of 3
btrfs_cleanup_bg_io---of 5
btrfs_cleanup_dirty_bgs---of 14
btrfs_cleanup_one_transaction---of 72
btrfs_cleanup_transaction---of 78
btrfs_commit_super---of 3
btrfs_create_tree---of 7
btrfs_csum_root---of 12
btrfs_drop_and_free_fs_root---of 7
btrfs_end_empty_barrier---of 1
btrfs_end_super_write---of 37
btrfs_extent_root42%of 12
btrfs_find_create_tree_block100%of 1
btrfs_free_fs_info---of 9
btrfs_free_fs_roots---of 16
btrfs_get_free_objectid---of 3
btrfs_get_fs_root---of 1
btrfs_get_fs_root_commit_root---of 5
btrfs_get_global_root---of 84
btrfs_get_new_fs_root---of 1
btrfs_get_num_tolerated_disk_barrier_failures---of 20
btrfs_get_root_ref---of 42
btrfs_global_root58%of 7
btrfs_global_root_delete---of 1
btrfs_global_root_insert---of 7
btrfs_init_csum_hash---of 6
btrfs_init_fs_info---of 1
btrfs_init_log_root_tree---of 9
btrfs_init_root_free_objectid---of 6
btrfs_init_workqueues---of 14
btrfs_insert_fs_root---of 19
btrfs_lookup_fs_root---of 8
btrfs_mark_buffer_dirty34%of 6
btrfs_put_root---of 10
btrfs_read_dev_one_super---of 14
btrfs_read_dev_super---of 1
btrfs_read_extent_buffer---of 22
btrfs_read_tree_root---of 3
btrfs_replay_log---of 13
btrfs_start_pre_rw_mount---of 63
btrfs_stop_all_workers---of 11
btrfs_uuid_rescan_kthread---of 4
btrfs_validate_extent_buffer---of 31
btrfs_validate_super---of 60
cleaner_kthread---of 13
close_ctree---of 29
csum_tree_block---of 10
folio_size---of 10
free_root_pointers---of 23
init_tree_roots---of 61
load_global_roots_objectid---of 21
load_super_root---of 8
open_ctree---of 82
read_tree_block---of 6
read_tree_root_path---of 13
trace_btrfs_transaction_commit---of 15
transaction_kthread---of 16
write_all_supers---of 104
-----------
SUMMARY40%of 41

strncpy_from_user20%of 15
-----------
SUMMARY20%of 15

__ia32_sys_mmap---of 3
__x64_sys_mmap67%of 3
arch_get_unmapped_area---of 1
arch_get_unmapped_area_topdown100%of 1
arch_get_unmapped_area_topdown_vmflags27%of 30
arch_get_unmapped_area_vmflags---of 26
-----------
SUMMARY33%of 34

-----------
SUMMARY---of 0

PageUptodate---of 15
__btrfs_add_free_space---of 35
__btrfs_add_free_space_zoned---of 15
__btrfs_remove_free_space_cache---of 13
__btrfs_return_cluster_to_free_space---of 25
add_bytes_to_bitmap---of 12
bitmap_clear_bits---of 13
btrfs_add_free_space---of 3
btrfs_add_free_space_async_trimmed---of 3
btrfs_add_free_space_unused---of 3
btrfs_alloc_from_cluster---of 31
btrfs_dump_free_space---of 7
btrfs_find_space_cluster---of 43
btrfs_find_space_for_alloc30%of 48
btrfs_free_space_cache_v1_active---of 1
btrfs_free_space_exit---of 1
btrfs_init_free_cluster---of 1
btrfs_init_free_space_ctl---of 1
btrfs_is_free_space_trimmed---of 5
btrfs_remove_free_space---of 30
btrfs_remove_free_space_cache---of 6
btrfs_remove_free_space_inode---of 10
btrfs_return_cluster_to_free_space---of 5
btrfs_set_free_space_cache_v1_active---of 9
btrfs_trim_block_group---of 8
btrfs_trim_block_group_bitmaps---of 3
btrfs_trim_block_group_extents---of 3
btrfs_truncate_free_space_cache---of 15
btrfs_wait_cache_io---of 12
btrfs_write_out_cache---of 35
cleanup_bitmap_list---of 6
cleanup_write_cache_enospc---of 1
create_free_space_inode---of 5
do_trimming---of 14
free_bitmap---of 12
io_ctl_add_entry---of 9
io_ctl_check_crc---of 9
io_ctl_check_generation---of 6
io_ctl_drop_pages---of 13
io_ctl_prepare_pages---of 33
io_ctl_read_bitmap---of 4
io_ctl_read_entry---of 6
io_ctl_zero_remaining_pages---of 14
link_free_space39%of 21
load_free_space_cache---of 54
lookup_free_space_inode---of 13
put_page---of 14
recalculate_thresholds---of 5
relink_bitmap_entry---of 18
remove_from_bitmap---of 12
search_bitmap---of 11
setup_cluster_bitmap---of 33
setup_cluster_no_bitmap---of 28
steal_from_bitmap---of 25
trace_btrfs_setup_cluster---of 15
tree_insert_offset36%of 17
tree_search_offset14%of 38
trim_bitmaps---of 38
trim_no_bitmap---of 42
try_merge_free_space---of 32
update_cache_item---of 7
use_bitmap---of 5
write_bitmap_entries---of 20
write_cache_extent_entries---of 23
write_pinned_extent_entries---of 7
-----------
SUMMARY27%of 124

-----------
SUMMARY---of 0

__create_xol_area---of 24
__update_ref_ctr---of 7
__uprobe_register---of 27
__uprobe_unregister---of 9
arch_uprobe_copy_ixol---of 3
arch_uprobe_ignore---of 1
dup_xol_work---of 6
find_uprobe---of 11
install_breakpoint---of 21
is_swbp_insn---of 1
is_trap_insn---of 1
put_page---of 14
put_uprobe---of 11
register_for_each_vma---of 53
set_orig_insn---of 1
set_swbp---of 1
update_ref_ctr---of 37
uprobe_apply---of 5
uprobe_clear_state---of 11
uprobe_copy_process---of 15
uprobe_deny_signal23%of 9
uprobe_dup_mmap---of 3
uprobe_end_dup_mmap---of 10
uprobe_free_utask---of 7
uprobe_get_swbp_addr---of 1
uprobe_get_trap_addr---of 4
uprobe_mmap4%of 59
uprobe_munmap14%of 15
uprobe_notify_resume---of 126
uprobe_post_sstep_notifier---of 4
uprobe_pre_sstep_notifier---of 6
uprobe_register---of 1
uprobe_register_refctr---of 1
uprobe_start_dup_mmap---of 10
uprobe_unregister---of 3
uprobe_write_opcode---of 160
xol_free_insn_slot---of 9
-----------
SUMMARY8%of 83

char2uni---of 1
uni2char60%of 5
-----------
SUMMARY60%of 5

__kvm_cpuid_base---of 6
__send_ipi_mask---of 25
__sysvec_kvm_asyncpf_interrupt---of 4
apf_task_wake_all---of 12
arch_haltpoll_disable---of 5
arch_haltpoll_enable---of 8
fred_sysvec_kvm_asyncpf_interrupt---of 4
kvm_arch_para_features---of 3
kvm_arch_para_hints---of 3
kvm_async_pf_task_wait_schedule---of 15
kvm_async_pf_task_wake---of 12
kvm_cpu_down_prepare---of 7
kvm_cpu_online---of 7
kvm_crash_shutdown---of 1
kvm_disable_host_haltpoll---of 3
kvm_enable_host_haltpoll---of 3
kvm_flush_tlb_multi---of 10
kvm_guest_apic_eoi_write---of 5
kvm_guest_cpu_init---of 32
kvm_guest_cpu_offline---of 21
kvm_io_delay---of 1
kvm_kick_cpu---of 3
kvm_para_available---of 3
kvm_pv_guest_cpu_reboot---of 1
kvm_pv_reboot_notify---of 3
kvm_resume---of 13
kvm_send_ipi_mask---of 1
kvm_send_ipi_mask_allbutself---of 5
kvm_smp_send_call_func_ipi---of 9
kvm_steal_clock---of 6
kvm_suspend---of 6
kvm_wait20%of 10
pv_ipi_supported---of 5
pv_tlb_flush_supported---of 12
-----------
SUMMARY20%of 10

tomoyo_init_log---of 75
tomoyo_poll_log---of 5
tomoyo_read_log---of 8
tomoyo_write_log---of 1
tomoyo_write_log211%of 28
-----------
SUMMARY11%of 28

bmp_buf_get---of 21
bmp_buf_put---of 8
cmp_fnames58%of 7
cmp_sdh---of 8
cmp_uint---of 4
cmp_uints---of 11
fnd_clear58%of 14
fnd_pop---of 3
hdr_delete_de---of 4
hdr_find_e50%of 18
hdr_insert_de---of 8
hdr_insert_head---of 5
ib_is_empty---of 5
indx_add_allocate---of 26
indx_clear---of 1
indx_delete_entry---of 116
indx_find22%of 33
indx_find_buffer---of 20
indx_find_raw---of 53
indx_find_sort---of 45
indx_free_children---of 23
indx_get_entry_to_replace---of 31
indx_get_root34%of 12
indx_init25%of 20
indx_insert_entry---of 23
indx_insert_into_buffer---of 44
indx_insert_into_root---of 67
indx_mark_free---of 3
indx_mark_used---of 3
indx_new---of 10
indx_read23%of 45
indx_shrink---of 18
indx_update_dup---of 18
indx_used_bit38%of 8
put_indx_node---of 7
scan_for_free---of 3
scan_for_used---of 3
scan_nres_bitmap---of 31
-----------
SUMMARY32%of 157

-----------
SUMMARY---of 0

__ext4_forget17%of 36
__ext4_handle_dirty_metadata45%of 20
__ext4_journal_ensure_credits34%of 12
__ext4_journal_get_create_access19%of 16
__ext4_journal_get_write_access22%of 19
__ext4_journal_start_reserved---of 20
__ext4_journal_start_sb29%of 35
__ext4_journal_stop50%of 6
ext4_inode_journal_mode43%of 14
ext4_journal_abort_handle---of 8
ext4_journal_check_start50%of 8
-----------
SUMMARY30%of 166

__sb_end_write40%of 10
ext4_buffered_write_iter40%of 10
ext4_dax_fault---of 1
ext4_dax_huge_fault---of 22
ext4_dio_write_end_io38%of 8
ext4_file_mmap---of 14
ext4_file_open28%of 18
ext4_file_read_iter---of 19
ext4_file_splice_read67%of 3
ext4_file_write_iter17%of 78
ext4_handle_inode_extension39%of 18
ext4_inode_extension_cleanup30%of 10
ext4_llseek---of 8
ext4_release_file---of 11
ext4_write_checks34%of 9
lock_buffer67%of 3
sb_start_intwrite_trylock42%of 12
-----------
SUMMARY29%of 179

-----------
SUMMARY---of 0

add_delayed_ref16%of 77
add_delayed_ref_head41%of 32
btrfs_add_delayed_data_ref---of 4
btrfs_add_delayed_extent_op---of 3
btrfs_add_delayed_tree_ref50%of 4
btrfs_check_delayed_seq---of 4
btrfs_check_space_for_delayed_refs---of 1
btrfs_dec_delayed_refs_rsv_bg_inserts---of 3
btrfs_dec_delayed_refs_rsv_bg_updates---of 3
btrfs_delayed_ref_exit---of 1
btrfs_delayed_ref_lock22%of 14
btrfs_delayed_refs_rsv_refill---of 9
btrfs_delayed_refs_rsv_release67%of 3
btrfs_delete_ref_head46%of 11
btrfs_find_delayed_ref_head60%of 10
btrfs_inc_delayed_refs_rsv_bg_inserts---of 1
btrfs_inc_delayed_refs_rsv_bg_updates100%of 1
btrfs_init_data_ref---of 8
btrfs_init_tree_ref50%of 8
btrfs_merge_delayed_refs16%of 38
btrfs_migrate_to_delayed_refs_rsv---of 10
btrfs_put_delayed_ref40%of 5
btrfs_select_ref_head44%of 25
btrfs_space_info_free_bytes_may_use---of 21
btrfs_update_delayed_refs_rsv67%of 3
drop_delayed_ref---of 15
init_delayed_ref_head45%of 9
trace_btrfs_space_reservation---of 15
update_existing_head_ref---of 21
-----------
SUMMARY31%of 240

__ext4_find_entry31%of 92
__ext4_link37%of 19
__ext4_read_dirblock36%of 28
__ext4_unlink---of 31
add_dirent_to_buf59%of 17
do_split32%of 75
dx_insert_block---of 4
dx_node_limit---of 9
dx_probe16%of 53
ext4_add_entry14%of 91
ext4_add_nondir50%of 6
ext4_append29%of 14
ext4_ci_compare---of 11
ext4_create40%of 15
ext4_delete_entry30%of 17
ext4_dirblock_csum_verify24%of 13
ext4_dx_csum---of 8
ext4_dx_csum_verify---of 15
ext4_empty_dir18%of 35
ext4_find_delete_entry43%of 7
ext4_find_dest_de55%of 11
ext4_find_entry50%of 4
ext4_fname_setup_ci_filename19%of 11
ext4_generic_delete_entry38%of 8
ext4_get_parent---of 10
ext4_handle_dirty_dirblock39%of 13
ext4_handle_dirty_dx_node20%of 15
ext4_has_metadata_csum---of 6
ext4_htree_fill_tree8%of 40
ext4_inc_count---of 5
ext4_init_dot_dotdot---of 9
ext4_init_new_dir42%of 24
ext4_initialize_dirent_tail---of 3
ext4_insert_dentry46%of 11
ext4_link40%of 10
ext4_lookup17%of 24
ext4_match28%of 11
ext4_mkdir24%of 34
ext4_mknod---of 15
ext4_rename216%of 173
ext4_rename_delete63%of 8
ext4_rename_dir_finish---of 8
ext4_rename_dir_prepare---of 20
ext4_resetent---of 7
ext4_rmdir27%of 30
ext4_search_dir---of 9
ext4_setent---of 7
ext4_symlink23%of 31
ext4_tmpfile---of 14
ext4_unlink---of 34
ext4_update_dir_count---of 9
ext4_update_dx_flag40%of 5
ext4_whiteout_for_rename---of 10
htree_dirblock_to_tree37%of 38
make_indexed_dir29%of 50
-----------
SUMMARY26%of 1033

-----------
SUMMARY---of 0

__ia32_compat_sys_x32_rt_sigreturn---of 8
__ia32_sys_rt_sigreturn---of 8
copy_siginfo_to_user32---of 4
restore_sigcontext---of 6
sigaction_compat_abi---of 6
x32_copy_siginfo_to_user---of 3
x32_setup_rt_frame---of 43
x64_setup_rt_frame9%of 47
-----------
SUMMARY9%of 47

-----------
SUMMARY---of 0

block_group_cache_tree_search22%of 14
btrfs_add_block_group_cache---of 9
btrfs_add_new_free_space---of 14
btrfs_add_reserved_bytes32%of 32
btrfs_block_group_should_use_size_class67%of 3
btrfs_cache_block_group---of 23
btrfs_caching_ctl_wait_done---of 5
btrfs_calc_block_group_size_class100%of 1
btrfs_chunk_alloc---of 50
btrfs_create_block_group_cache---of 4
btrfs_create_pending_block_groups5%of 49
btrfs_dec_block_group_reservations50%of 4
btrfs_dec_block_group_ro---of 7
btrfs_dec_block_group_swap_extents---of 4
btrfs_dec_nocow_writers---of 3
btrfs_delete_unused_bgs---of 44
btrfs_force_chunk_alloc---of 1
btrfs_free_block_groups---of 60
btrfs_free_reserved_bytes---of 5
btrfs_freeze_block_group---of 1
btrfs_get_alloc_profile35%of 49
btrfs_get_block_group50%of 4
btrfs_get_caching_control---of 5
btrfs_inc_block_group_ro---of 23
btrfs_inc_block_group_swap_extents---of 3
btrfs_inc_nocow_writers---of 4
btrfs_lookup_block_group100%of 1
btrfs_lookup_first_block_group---of 1
btrfs_make_block_group---of 21
btrfs_mark_bg_to_reclaim---of 21
btrfs_mark_bg_unused---of 11
btrfs_need_cleaner_sleep---of 4
btrfs_next_block_group---of 6
btrfs_put_block_group12%of 17
btrfs_put_block_group_cache---of 7
btrfs_read_block_groups---of 104
btrfs_reclaim_bgs---of 3
btrfs_reclaim_bgs_work---of 41
btrfs_remove_block_group---of 97
btrfs_reserve_chunk_metadata---of 1
btrfs_rmap_block---of 21
btrfs_setup_space_cache23%of 9
btrfs_space_info_update_bytes_pinned29%of 21
btrfs_start_dirty_block_groups---of 49
btrfs_start_trans_remove_block_group---of 7
btrfs_unfreeze_block_group---of 8
btrfs_update_block_group50%of 24
btrfs_use_block_group_size_class---of 5
btrfs_wait_block_group_cache_done---of 8
btrfs_wait_block_group_cache_progress---of 18
btrfs_wait_block_group_reservations---of 7
btrfs_wait_nocow_writers---of 5
btrfs_write_dirty_block_groups---of 34
cache_save_setup---of 32
caching_thread---of 78
check_system_chunk---of 5
clean_pinned_extents---of 10
exclude_super_stripes---of 12
fill_dummy_bgs---of 8
inc_block_group_ro---of 15
list_move---of 5
reclaim_bgs_cmp---of 1
reserve_chunk_space---of 14
sb_end_write---of 10
set_avail_alloc_bits---of 7
trace_btrfs_add_block_group---of 15
trace_btrfs_add_unused_block_group---of 15
trace_btrfs_reclaim_block_group---of 15
trace_btrfs_skip_unused_block_group---of 15
trace_btrfs_space_reservation27%of 15
update_block_group_item---of 6
-----------
SUMMARY28%of 243

I_BDEV100%of 1
bd_abort_claiming---of 6
bd_finish_claiming34%of 15
bd_init_fs_context---of 3
bd_may_claim---of 10
bd_prepare_to_claim31%of 13
bd_yield_claim---of 18
bdev_add---of 3
bdev_alloc---of 7
bdev_alloc_inode---of 3
bdev_drop---of 1
bdev_evict_inode---of 1
bdev_file_open_by_dev24%of 13
bdev_file_open_by_path---of 11
bdev_fput---of 8
bdev_free_inode---of 7
bdev_freeze---of 11
bdev_mark_dead---of 8
bdev_open29%of 42
bdev_permission40%of 5
bdev_release---of 21
bdev_set_nr_sectors---of 1
bdev_statx_dioalign---of 6
bdev_thaw---of 12
bdev_unhash---of 3
blkdev_flush_mapping---of 13
blkdev_get_no_open40%of 5
blkdev_get_whole39%of 18
blkdev_put_no_open100%of 1
block_size67%of 3
disk_live100%of 1
file_bdev100%of 1
init_once---of 1
invalidate_bdev67%of 3
lookup_bdev29%of 7
nr_blockdev_pages---of 4
sb_min_blocksize---of 6
sb_set_blocksize50%of 4
set_blocksize47%of 15
sync_bdevs67%of 9
sync_blockdev100%of 3
sync_blockdev_nowait67%of 3
sync_blockdev_range---of 1
truncate_bdev_range---of 4
-----------
SUMMARY41%of 162

-----------
SUMMARY---of 0

ima_init_template_list---of 20
ima_restore_measurement_list---of 32
ima_template_desc_buf---of 3
ima_template_desc_current67%of 3
ima_template_has_modsig---of 7
lookup_template_desc---of 15
template_desc_init_fields---of 42
-----------
SUMMARY67%of 3

bpf_fd_inode_storage_delete_elem---of 6
bpf_fd_inode_storage_lookup_elem---of 4
bpf_fd_inode_storage_update_elem---of 8
bpf_inode_storage_delete---of 8
bpf_inode_storage_free18%of 23
bpf_inode_storage_get---of 11
inode_storage_lookup---of 30
inode_storage_map_alloc---of 1
inode_storage_map_free---of 1
inode_storage_ptr---of 3
notsupp_get_next_key---of 1
-----------
SUMMARY18%of 23

-----------
SUMMARY---of 0

__sbitmap_queue_get100%of 1
__sbitmap_queue_get_batch---of 36
sbitmap_add_wait_queue---of 3
sbitmap_any_bit_set50%of 6
sbitmap_bitmap_show---of 31
sbitmap_del_wait_queue---of 5
sbitmap_find_bit54%of 15
sbitmap_finish_wait---of 3
sbitmap_get43%of 14
sbitmap_get_shallow34%of 12
sbitmap_init_node---of 20
sbitmap_prepare_to_wait---of 3
sbitmap_queue_clear38%of 8
sbitmap_queue_clear_batch---of 17
sbitmap_queue_get_shallow67%of 3
sbitmap_queue_init_node---of 7
sbitmap_queue_min_shallow_depth---of 3
sbitmap_queue_recalculate_wake_batch---of 1
sbitmap_queue_resize---of 3
sbitmap_queue_show---of 7
sbitmap_queue_wake_all---of 17
sbitmap_queue_wake_up7%of 31
sbitmap_resize---of 8
sbitmap_show---of 10
sbitmap_weight---of 14
sbq_calc_wake_batch---of 6
-----------
SUMMARY33%of 90

__bio_add_page50%of 6
__bio_advance34%of 15
__bio_clone34%of 12
__bio_release_pages---of 9
bio_add_folio67%of 3
bio_add_folio_nofail---of 10
bio_add_hw_page---of 19
bio_add_page45%of 18
bio_add_pc_page---of 1
bio_add_zone_append_page---of 4
bio_alloc_bioset27%of 45
bio_alloc_clone---of 4
bio_alloc_irq_cache_splice---of 10
bio_alloc_rescue---of 6
bio_await_chain---of 4
bio_chain50%of 4
bio_chain_and_submit---of 5
bio_chain_endio---of 4
bio_check_pages_dirty---of 18
bio_copy_data---of 1
bio_copy_data_iter---of 15
bio_cpu_dead---of 11
bio_dirty_fn---of 6
bio_endio20%of 36
bio_first_folio---of 18
bio_free58%of 7
bio_free_pages---of 6
bio_init67%of 3
bio_init_clone---of 5
bio_iov_bvec_set---of 6
bio_iov_iter_get_pages25%of 52
bio_kmalloc---of 3
bio_next_folio---of 10
bio_put16%of 19
bio_reset---of 3
bio_set_pages_dirty---of 4
bio_split31%of 13
bio_trim---of 9
bio_truncate---of 24
bio_uninit35%of 20
bio_wait_end_io---of 1
bioset_exit---of 23
bioset_init---of 17
biovec_init_pool---of 1
biovec_slab6%of 127
blk_next_bio---of 5
bvec_alloc---of 6
bvec_free---of 5
bvec_try_merge_hw_page---of 11
folio_lock---of 9
folio_size---of 10
guard_bio_eod40%of 5
punt_bios_to_rescuer---of 14
submit_bio_wait50%of 4
submit_bio_wait_endio---of 1
zero_fill_bio_iter---of 9
-----------
SUMMARY23%of 389

ima_appraise_measurement---of 80
ima_check_blacklist---of 4
ima_get_cache_status---of 8
ima_get_hash_algo---of 14
ima_inode_post_setattr23%of 9
ima_inode_remove_acl---of 6
ima_inode_removexattr---of 10
ima_inode_set_acl---of 6
ima_inode_setxattr---of 25
ima_must_appraise67%of 3
ima_read_xattr---of 1
ima_update_xattr---of 10
is_ima_appraise_enabled---of 1
-----------
SUMMARY34%of 12

-----------
SUMMARY---of 0

__ia32_compat_sys_gettimeofday---of 1
__ia32_compat_sys_settimeofday---of 1
__ia32_sys_adjtimex---of 3
__ia32_sys_adjtimex_time32---of 1
__ia32_sys_gettimeofday---of 1
__ia32_sys_settimeofday---of 1
__ia32_sys_stime---of 4
__ia32_sys_stime32---of 4
__ia32_sys_time---of 4
__ia32_sys_time32---of 4
__msecs_to_jiffies67%of 3
__se_compat_sys_gettimeofday---of 8
__se_compat_sys_settimeofday---of 21
__se_sys_adjtimex_time32---of 3
__se_sys_gettimeofday---of 8
__se_sys_settimeofday---of 21
__usecs_to_jiffies---of 3
__x64_compat_sys_gettimeofday---of 1
__x64_compat_sys_settimeofday---of 1
__x64_sys_adjtimex---of 3
__x64_sys_adjtimex_time32---of 1
__x64_sys_gettimeofday---of 1
__x64_sys_settimeofday---of 1
__x64_sys_stime---of 4
__x64_sys_stime32---of 4
__x64_sys_time---of 4
__x64_sys_time32---of 4
clock_t_to_jiffies---of 1
do_sys_settimeofday64---of 13
get_itimerspec64---of 9
get_old_itimerspec32---of 4
get_old_timespec32---of 4
get_old_timex32---of 3
get_timespec6440%of 5
jiffies64_to_msecs---of 1
jiffies64_to_nsecs---of 1
jiffies_64_to_clock_t---of 1
jiffies_to_clock_t---of 1
jiffies_to_msecs---of 1
jiffies_to_timespec64---of 1
jiffies_to_usecs---of 1
mktime64100%of 1
ns_to_kernel_old_timeval---of 4
ns_to_timespec64---of 4
nsec_to_clock_t---of 1
nsecs_to_jiffies100%of 1
nsecs_to_jiffies64---of 1
put_itimerspec64---of 3
put_old_itimerspec32---of 3
put_old_timespec32---of 3
put_old_timex32---of 1
put_timespec64100%of 1
set_normalized_timespec6443%of 7
timespec64_add_safe45%of 9
timespec64_to_jiffies---of 1
-----------
SUMMARY52%of 27

clear_shadow_entry---of 6
folio_contains21%of 24
folio_invalidate---of 3
folio_lock45%of 9
folio_mapped19%of 16
folio_size30%of 10
generic_error_remove_folio---of 5
invalidate_inode_pages2---of 1
invalidate_inode_pages2_range25%of 57
invalidate_mapping_pages---of 1
mapping_evict_folio---of 27
mapping_try_invalidate---of 20
pagecache_isize_extended31%of 13
truncate_cleanup_folio40%of 20
truncate_folio_batch_exceptionals20%of 26
truncate_inode_folio67%of 3
truncate_inode_pages100%of 1
truncate_inode_pages_final67%of 3
truncate_inode_pages_range38%of 59
truncate_inode_partial_folio---of 33
truncate_pagecache100%of 1
truncate_pagecache_range---of 3
truncate_setsize67%of 3
-----------
SUMMARY32%of 245

__io_uaddr_map---of 6
io_pages_free---of 3
io_pages_map17%of 24
io_pages_unmap---of 12
io_pin_pages---of 8
io_uring_get_unmapped_area39%of 13
io_uring_mmap43%of 7
io_uring_mmap_pages50%of 6
io_uring_validate_mmap_request40%of 10
put_page---of 14
-----------
SUMMARY32%of 60

__list_lru_init40%of 15
__list_lru_walk_one---of 22
list_lru_add56%of 9
list_lru_add_obj50%of 4
list_lru_count_node---of 1
list_lru_count_one---of 19
list_lru_del---of 7
list_lru_del_obj---of 4
list_lru_destroy---of 25
list_lru_isolate---of 3
list_lru_isolate_move---of 5
list_lru_walk_node---of 7
list_lru_walk_one---of 3
list_lru_walk_one_irq---of 3
memcg_list_lru_alloc25%of 37
memcg_reparent_list_lrus---of 35
-----------
SUMMARY34%of 65

__free_page_ext---of 10
alloc_page_ext---of 3
init_section_page_ext---of 8
offline_page_ext---of 12
online_page_ext---of 11
page_ext_callback---of 6
page_ext_get25%of 20
page_ext_put29%of 7
pgdat_page_ext_init---of 1
-----------
SUMMARY26%of 27

__hfs_brec_find58%of 7
hfs_brec_find37%of 11
hfs_brec_goto---of 11
hfs_brec_read50%of 4
hfs_find_exit100%of 1
hfs_find_init34%of 6
-----------
SUMMARY45%of 29

ex_get_fixup_type---of 3
fixup_exception5%of 85
-----------
SUMMARY5%of 85

-----------
SUMMARY---of 0

__bpf_trace_scsi_cmd_done_timeout_template---of 1
__bpf_trace_scsi_dispatch_cmd_error---of 1
__bpf_trace_scsi_dispatch_cmd_start---of 1
__bpf_trace_scsi_eh_wakeup---of 1
__probestub_scsi_dispatch_cmd_done---of 1
__probestub_scsi_dispatch_cmd_error---of 1
__probestub_scsi_dispatch_cmd_start---of 1
__probestub_scsi_dispatch_cmd_timeout---of 1
__probestub_scsi_eh_wakeup---of 1
__scsi_device_lookup---of 9
__scsi_device_lookup_by_target---of 7
__scsi_iterate_devices---of 10
__starget_for_each_device---of 10
__traceiter_scsi_dispatch_cmd_done---of 4
__traceiter_scsi_dispatch_cmd_error---of 4
__traceiter_scsi_dispatch_cmd_start---of 4
__traceiter_scsi_dispatch_cmd_timeout---of 4
__traceiter_scsi_eh_wakeup---of 4
perf_trace_scsi_cmd_done_timeout_template---of 13
perf_trace_scsi_dispatch_cmd_error---of 9
perf_trace_scsi_dispatch_cmd_start---of 9
perf_trace_scsi_eh_wakeup---of 8
scsi_attach_vpd---of 24
scsi_cdl_check---of 19
scsi_cdl_enable---of 26
scsi_change_queue_depth---of 5
scsi_device_get---of 5
scsi_device_lookup---of 13
scsi_device_lookup_by_target---of 14
scsi_device_max_queue_depth---of 1
scsi_device_put---of 1
scsi_finish_command---of 13
scsi_get_vpd_buf---of 10
scsi_get_vpd_page---of 12
scsi_get_vpd_size---of 15
scsi_log_completion---of 8
scsi_log_send50%of 4
scsi_report_opcode---of 16
scsi_track_queue_full---of 9
scsi_update_vpd_page---of 9
starget_for_each_device---of 10
trace_event_raw_event_scsi_cmd_done_timeout_template---of 12
trace_event_raw_event_scsi_dispatch_cmd_error---of 8
trace_event_raw_event_scsi_dispatch_cmd_start---of 8
trace_event_raw_event_scsi_eh_wakeup---of 7
trace_raw_output_scsi_cmd_done_timeout_template---of 3
trace_raw_output_scsi_dispatch_cmd_error---of 3
trace_raw_output_scsi_dispatch_cmd_start---of 3
trace_raw_output_scsi_eh_wakeup---of 3
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

__blk_mq_get_tag30%of 17
__blk_mq_tag_busy---of 9
__blk_mq_tag_idle---of 7
blk_mq_all_tag_iter---of 6
blk_mq_free_tags---of 1
blk_mq_get_tag20%of 21
blk_mq_get_tags---of 5
blk_mq_init_bitmaps---of 4
blk_mq_init_tags---of 6
blk_mq_put_tag50%of 4
blk_mq_put_tags---of 1
blk_mq_queue_tag_busy_iter---of 25
blk_mq_tag_resize_shared_tags---of 1
blk_mq_tag_update_depth---of 7
blk_mq_tag_update_sched_shared_tags---of 1
blk_mq_tag_wakeup_all---of 3
blk_mq_tagset_busy_iter---of 13
blk_mq_tagset_count_completed_rqs---of 3
blk_mq_tagset_wait_completed_request---of 15
blk_mq_unique_tag100%of 1
bt_iter---of 12
bt_tags_iter---of 16
sbitmap_for_each_set---of 17
-----------
SUMMARY28%of 43

-----------
SUMMARY---of 0

btrfs_advance_sb_log---of 22
btrfs_calc_zone_unusable---of 4
btrfs_can_activate_zone---of 14
btrfs_check_active_zone_reservation---of 14
btrfs_check_meta_write_pointer6%of 39
btrfs_check_mountopts_zoned---of 6
btrfs_check_zoned_mode---of 25
btrfs_clear_data_reloc_bg---of 3
btrfs_clone_dev_zone_info---of 6
btrfs_destroy_dev_zone_info---of 3
btrfs_ensure_empty_zones---of 44
btrfs_find_allocatable_zones---of 32
btrfs_finish_ordered_zoned---of 22
btrfs_free_zone_cache---of 7
btrfs_get_dev_zone---of 4
btrfs_get_dev_zone_info---of 138
btrfs_get_dev_zone_info_all_devices---of 8
btrfs_get_dev_zones---of 29
btrfs_load_block_group_dup---of 11
btrfs_load_block_group_raid0---of 12
btrfs_load_block_group_raid1---of 18
btrfs_load_block_group_raid10---of 14
btrfs_load_block_group_single---of 4
btrfs_load_block_group_zone_info---of 93
btrfs_record_physical_zoned---of 1
btrfs_reset_device_zone---of 13
btrfs_reset_sb_log_zones---of 19
btrfs_sb_log_location---of 15
btrfs_sb_log_location_bdev---of 25
btrfs_schedule_zone_finish_bg---of 5
btrfs_sync_zone_write_pointer---of 24
btrfs_use_zone_append20%of 10
btrfs_zone_activate---of 25
btrfs_zone_finish---of 3
btrfs_zone_finish_endio---of 5
btrfs_zone_finish_endio_workfn---of 1
btrfs_zone_finish_one_bg---of 13
btrfs_zoned_activate_one_bg---of 17
btrfs_zoned_issue_zeroout---of 8
btrfs_zoned_release_data_reloc_bg---of 7
btrfs_zoned_should_reclaim---of 8
copy_zone_info_cb---of 1
do_zone_finish---of 38
rcu_read_unlock---of 6
sb_log_location---of 19
sb_write_pointer---of 18
wait_eb_writebacks---of 38
-----------
SUMMARY9%of 49

-----------
SUMMARY---of 0

__folio_cancel_dirty28%of 29
__folio_end_writeback27%of 46
__folio_mark_dirty29%of 60
__folio_start_writeback27%of 65
__wb_calc_thresh---of 6
__wb_update_bandwidth22%of 19
__wb_writeout_add50%of 6
balance_dirty_pages---of 73
balance_dirty_pages_ratelimited100%of 1
balance_dirty_pages_ratelimited_flags19%of 61
bdi_get_max_bytes---of 1
bdi_get_min_bytes---of 1
bdi_set_max_bytes---of 5
bdi_set_max_ratio---of 4
bdi_set_max_ratio_no_scale---of 4
bdi_set_min_bytes---of 7
bdi_set_min_ratio---of 6
bdi_set_min_ratio_no_scale---of 6
bdi_set_strict_limit---of 3
cgwb_calc_thresh---of 6
dirty_background_bytes_handler---of 3
dirty_background_ratio_handler---of 3
dirty_bytes_handler---of 4
dirty_ratio_handler---of 4
dirty_writeback_centisecs_handler---of 4
do_writepages40%of 23
domain_dirty_limits---of 24
filemap_dirty_folio32%of 16
folio_account_cleaned39%of 18
folio_clear_dirty_for_io29%of 56
folio_index19%of 16
folio_mark_dirty20%of 15
folio_redirty_for_writepage---of 26
folio_wait_stable67%of 3
folio_wait_writeback40%of 10
folio_wait_writeback_killable---of 10
global_dirty_limits---of 1
laptop_io_completion---of 1
laptop_mode_timer_fn---of 1
laptop_sync_completion---of 14
node_dirty_ok54%of 13
noop_dirty_folio22%of 14
page_writeback_cpu_online---of 1
percpu_ref_put_many---of 14
percpu_ref_tryget---of 16
set_page_dirty_lock---of 15
tag_pages_for_writeback57%of 23
task_get_css---of 23
trace_balance_dirty_pages---of 15
trace_folio_wait_writeback27%of 15
wb_calc_thresh---of 6
wb_dirty_limits---of 5
wb_domain_exit---of 1
wb_domain_init---of 1
wb_over_bg_thresh---of 16
wb_position_ratio---of 13
wb_update_bandwidth---of 1
wb_update_dirty_ratelimit---of 22
wb_writeout_inc---of 7
write_cache_pages67%of 6
writeback_iter28%of 96
writeback_set_ratelimit---of 1
writeout_period---of 3
-----------
SUMMARY30%of 611

_compound_head---of 7
decompress_lzx_xpress---of 8
lock_page---of 15
ni_add_name---of 24
ni_add_subrecord---of 9
ni_clear---of 16
ni_create_attr_list---of 31
ni_decompress_file---of 49
ni_delete_all---of 20
ni_enum_attr_ex9%of 24
ni_expand_list---of 34
ni_fiemap---of 40
ni_find_attr10%of 41
ni_fname_name---of 17
ni_fname_type---of 11
ni_ins_attr_ext---of 47
ni_ins_new_attr---of 14
ni_insert_attr---of 30
ni_insert_nonresident---of 13
ni_insert_resident---of 6
ni_is_dirty---of 8
ni_load_all_mi---of 21
ni_load_attr---of 31
ni_load_mi---of 4
ni_load_mi_ex---of 18
ni_new_attr_flags---of 10
ni_parse_reparse---of 25
ni_read_frame---of 88
ni_readpage_cmpr---of 44
ni_remove_attr---of 37
ni_remove_attr_le---of 3
ni_remove_mi---of 1
ni_remove_name---of 17
ni_remove_name_undo---of 19
ni_rename---of 5
ni_std---of 6
ni_std5---of 6
ni_try_remove_attr_list---of 37
ni_update_parent---of 26
ni_write_frame---of 34
ni_write_inode11%of 47
put_page---of 14
-----------
SUMMARY10%of 112

__do_pipe_flags---of 8
__ia32_sys_pipe---of 1
__ia32_sys_pipe2---of 1
__x64_sys_pipe---of 1
__x64_sys_pipe2---of 1
_compound_head---of 7
account_pipe_buffers---of 1
alloc_pipe_info34%of 18
anon_pipe_buf_release30%of 10
anon_pipe_buf_try_steal---of 29
create_pipe_files---of 10
do_pipe2---of 4
do_pipe_flags---of 3
do_proc_dopipe_max_size_conv---of 8
fifo_open---of 29
free_pipe_info---of 12
generic_pipe_buf_get34%of 9
generic_pipe_buf_release---of 1
generic_pipe_buf_try_steal---of 22
get_pipe_info60%of 5
pipe_double_lock50%of 6
pipe_fasync---of 7
pipe_fcntl---of 24
pipe_ioctl---of 11
pipe_is_unprivileged_user---of 3
pipe_lock100%of 3
pipe_lock_cmp_fn100%of 1
pipe_poll36%of 14
pipe_read24%of 39
pipe_release---of 9
pipe_resize_ring---of 11
pipe_unlock100%of 3
pipe_wait_readable40%of 10
pipe_wait_writable---of 12
pipe_write11%of 64
pipefs_dname---of 1
pipefs_init_fs_context---of 3
proc_dopipe_max_size---of 1
put_page---of 14
round_pipe_size---of 5
too_many_pipe_buffers_hard---of 1
too_many_pipe_buffers_soft---of 1
wait_for_partner---of 9
-----------
SUMMARY28%of 182

tomoyo_encode65%of 14
tomoyo_encode2---of 14
tomoyo_get_local_path40%of 20
tomoyo_realpath_from_path60%of 20
tomoyo_realpath_nofollow---of 4
-----------
SUMMARY54%of 54

-----------
SUMMARY---of 0

__page_table_check_pmd_clear---of 3
__page_table_check_pmd_set---of 20
__page_table_check_pte_clear67%of 3
__page_table_check_pte_clear_range---of 12
__page_table_check_ptes_set27%of 23
__page_table_check_pud_clear---of 3
__page_table_check_pud_set---of 6
__page_table_check_zero32%of 16
page_table_check_clear24%of 26
page_table_check_set24%of 26
pfn_valid21%of 29
-----------
SUMMARY26%of 123

__should_fail_alloc_page50%of 6
-----------
SUMMARY50%of 6

flush_descriptor---of 6
insert_revoke_hash50%of 6
jbd2_clear_buffer_revoked_flags---of 9
jbd2_journal_cancel_revoke43%of 14
jbd2_journal_clear_revoke---of 9
jbd2_journal_destroy_revoke---of 11
jbd2_journal_destroy_revoke_record_cache---of 1
jbd2_journal_destroy_revoke_table_cache---of 1
jbd2_journal_init_revoke---of 9
jbd2_journal_init_revoke_table---of 13
jbd2_journal_revoke34%of 15
jbd2_journal_set_revoke---of 8
jbd2_journal_switch_revoke_table---of 6
jbd2_journal_test_revoke---of 7
jbd2_journal_write_revoke_records---of 28
-----------
SUMMARY40%of 35

__btrfs_submit_bio19%of 11
btrfs_bio_alloc100%of 1
btrfs_bio_end_io---of 7
btrfs_bio_init---of 1
btrfs_bioset_exit---of 1
btrfs_check_read_bio---of 60
btrfs_clone_write_end_io---of 16
btrfs_dev_name---of 8
btrfs_end_bio_work---of 4
btrfs_orig_bbio_end_io---of 18
btrfs_orig_write_end_io---of 17
btrfs_raid56_end_io---of 5
btrfs_repair_io_failure---of 21
btrfs_simple_end_io---of 18
btrfs_submit_bio19%of 75
btrfs_submit_dev_bio25%of 28
btrfs_submit_repair_write---of 19
rcu_read_unlock---of 6
run_one_async_done---of 4
run_one_async_start---of 5
-----------
SUMMARY21%of 115

-----------
SUMMARY---of 0

can_stop_idle_tick---of 13
cpu_active---of 3
get_cpu_idle_time_us---of 13
get_cpu_iowait_time_us---of 13
get_jiffies_update---of 6
seqcount_lockdep_reader_access---of 7
tick_check_oneshot_change---of 11
tick_clock_notify---of 5
tick_do_update_jiffies64---of 8
tick_get_tick_sched---of 3
tick_irq_enter---of 13
tick_nohz_get_idle_calls---of 3
tick_nohz_get_idle_calls_cpu---of 3
tick_nohz_get_next_hrtimer---of 1
tick_nohz_get_sleep_length---of 8
tick_nohz_handler---of 15
tick_nohz_idle_enter---of 15
tick_nohz_idle_exit---of 23
tick_nohz_idle_got_tick---of 5
tick_nohz_idle_restart_tick---of 9
tick_nohz_idle_retain_tick---of 3
tick_nohz_idle_stop_tick---of 48
tick_nohz_irq_exit---of 5
tick_nohz_lowres_handler---of 5
tick_nohz_next_event---of 18
tick_nohz_restart_sched_tick---of 7
tick_nohz_start_idle---of 10
tick_nohz_stop_idle---of 11
tick_nohz_tick_stopped67%of 3
tick_nohz_tick_stopped_cpu---of 3
tick_oneshot_notify---of 3
tick_sched_timer_cancel---of 4
tick_sched_timer_dying---of 8
tick_setup_sched_timer---of 25
-----------
SUMMARY67%of 3

-----------
SUMMARY---of 0

__bpf_trace_signal_deliver---of 1
__bpf_trace_signal_generate---of 1
__compat_save_altstack---of 1
__copy_siginfo_from_user---of 19
__copy_siginfo_to_user32---of 1
__dequeue_signal---of 21
__do_sys_pause---of 4
__flush_itimer_signals---of 10
__ia32_compat_sys_rt_sigaction---of 1
__ia32_compat_sys_rt_sigpending---of 3
__ia32_compat_sys_rt_sigprocmask---of 1
__ia32_compat_sys_rt_sigqueueinfo---of 5
__ia32_compat_sys_rt_sigsuspend---of 4
__ia32_compat_sys_rt_sigtimedwait_time32---of 1
__ia32_compat_sys_rt_sigtimedwait_time64---of 1
__ia32_compat_sys_rt_tgsigqueueinfo---of 1
__ia32_compat_sys_sigaction---of 1
__ia32_compat_sys_sigaltstack---of 1
__ia32_compat_sys_sigpending---of 1
__ia32_sys_kill---of 1
__ia32_sys_pidfd_send_signal---of 1
__ia32_sys_rt_sigaction---of 9
__ia32_sys_rt_sigpending---of 3
__ia32_sys_rt_sigprocmask---of 1
__ia32_sys_rt_sigqueueinfo---of 5
__ia32_sys_rt_sigsuspend---of 4
__ia32_sys_rt_sigtimedwait---of 1
__ia32_sys_rt_sigtimedwait_time32---of 1
__ia32_sys_rt_tgsigqueueinfo---of 6
__ia32_sys_sigaltstack---of 6
__ia32_sys_signal---of 3
__ia32_sys_sigpending---of 1
__ia32_sys_sigprocmask---of 1
__ia32_sys_sigsuspend---of 1
__ia32_sys_ssetmask---of 1
__ia32_sys_tgkill---of 1
__ia32_sys_tkill---of 1
__kill_pgrp_info---of 5
__lock_task_sighand---of 19
__probestub_signal_deliver---of 1
__probestub_signal_generate---of 1
__save_altstack---of 1
__se_compat_sys_rt_sigaction---of 7
__se_compat_sys_rt_sigprocmask---of 10
__se_compat_sys_rt_sigtimedwait_time32---of 8
__se_compat_sys_rt_sigtimedwait_time64---of 8
__se_compat_sys_rt_tgsigqueueinfo---of 6
__se_compat_sys_sigaction---of 15
__se_sys_kill---of 16
__se_sys_pidfd_send_signal---of 33
__se_sys_rt_sigprocmask---of 11
__se_sys_rt_sigtimedwait---of 10
__se_sys_rt_sigtimedwait_time32---of 10
__se_sys_sigprocmask---of 10
__se_sys_tgkill---of 3
__se_sys_tkill---of 3
__send_signal_locked20%of 50
__set_current_blocked50%of 6
__sigqueue_alloc25%of 32
__sigqueue_free---of 4
__traceiter_signal_deliver---of 4
__traceiter_signal_generate---of 4
__x64_compat_sys_rt_sigaction---of 1
__x64_compat_sys_rt_sigpending---of 3
__x64_compat_sys_rt_sigprocmask---of 1
__x64_compat_sys_rt_sigqueueinfo---of 5
__x64_compat_sys_rt_sigsuspend---of 4
__x64_compat_sys_rt_sigtimedwait_time32---of 1
__x64_compat_sys_rt_sigtimedwait_time64---of 1
__x64_compat_sys_rt_tgsigqueueinfo---of 1
__x64_compat_sys_sigaction---of 1
__x64_compat_sys_sigaltstack---of 1
__x64_compat_sys_sigpending---of 1
__x64_sys_kill---of 1
__x64_sys_pidfd_send_signal---of 1
__x64_sys_restart_syscall---of 1
__x64_sys_rt_sigaction---of 9
__x64_sys_rt_sigpending---of 3
__x64_sys_rt_sigprocmask---of 1
__x64_sys_rt_sigqueueinfo---of 5
__x64_sys_rt_sigsuspend---of 4
__x64_sys_rt_sigtimedwait---of 1
__x64_sys_rt_sigtimedwait_time32---of 1
__x64_sys_rt_tgsigqueueinfo---of 6
__x64_sys_sgetmask---of 1
__x64_sys_sigaltstack---of 6
__x64_sys_signal---of 3
__x64_sys_sigpending---of 1
__x64_sys_sigprocmask---of 1
__x64_sys_sigsuspend---of 1
__x64_sys_ssetmask---of 1
__x64_sys_tgkill---of 1
__x64_sys_tkill---of 1
calculate_sigpending---of 1
cgroup_threadgroup_change_end---of 10
check_kill_permission---of 21
compat_restore_altstack---of 6
complete_signal15%of 57
copy_siginfo_from_user---of 19
copy_siginfo_from_user32---of 3
copy_siginfo_to_external32---of 26
copy_siginfo_to_user67%of 3
dequeue_signal---of 12
do_compat_sigaltstack---of 7
do_freezer_trap---of 3
do_jobctl_trap---of 8
do_no_restart_syscall---of 1
do_notify_parent---of 63
do_notify_parent_cldstop---of 50
do_notify_pidfd---of 3
do_send_sig_info---of 3
do_send_specific---of 20
do_sigaction---of 21
do_sigaltstack---of 18
do_signal_stop---of 34
do_sigtimedwait---of 14
exit_signals---of 18
flush_itimer_signals---of 1
flush_signal_handlers---of 9
flush_signals---of 17
flush_sigqueue---of 9
flush_sigqueue_mask---of 13
force_exit_sig---of 1
force_fatal_sig---of 1
force_sig100%of 1
force_sig_bnderr---of 1
force_sig_fault100%of 1
force_sig_fault_to_task---of 1
force_sig_fault_trapno---of 1
force_sig_info---of 1
force_sig_info_to_task27%of 19
force_sig_mceerr---of 3
force_sig_pkuerr---of 1
force_sig_ptrace_errno_trap---of 1
force_sig_seccomp---of 1
force_sigsegv---of 3
get_signal15%of 87
group_send_sig_info---of 15
ignore_signals---of 3
kernel_sigaction---of 5
kill_pgrp---of 5
kill_pid---of 1
kill_pid_info---of 1
kill_pid_info_type---of 15
kill_pid_usb_asyncio---of 26
kill_proc_info---of 11
lockdep_assert_task_sighand_held---of 20
next_signal---of 3
perf_trace_signal_deliver---of 10
perf_trace_signal_generate---of 10
post_copy_siginfo_from_user32---of 27
prepare_signal15%of 40
print_dropped_signal---of 4
ptrace_notify---of 4
ptrace_signal---of 34
ptrace_stop---of 25
ptrace_trap_notify---of 13
rcu_read_unlock---of 6
recalc_sigpending25%of 8
restore_altstack---of 3
retarget_shared_pending---of 16
send_sig---of 4
send_sig_fault---of 4
send_sig_fault_trapno---of 4
send_sig_info---of 4
send_sig_mceerr---of 5
send_sig_perf---of 3
send_signal_locked22%of 46
send_sigqueue---of 27
set_compat_user_sigmask---of 5
set_current_blocked---of 1
set_user_sigmask60%of 5
siginfo_layout---of 10
signal_setup_done40%of 10
signal_wake_up_state50%of 6
sigprocmask---of 7
sigqueue_alloc---of 1
sigqueue_free---of 6
sigsuspend---of 4
task_clear_jobctl_pending---of 4
task_clear_jobctl_trapping---of 3
task_join_group_stop---of 10
task_participate_group_stop---of 12
task_set_jobctl_pending---of 9
trace_event_raw_event_signal_deliver---of 9
trace_event_raw_event_signal_generate---of 9
trace_raw_output_signal_deliver---of 3
trace_raw_output_signal_generate---of 3
trace_signal_deliver27%of 15
trace_signal_generate27%of 15
unhandled_signal---of 8
zap_other_threads---of 10
-----------
SUMMARY22%of 401

btrfs_add_root_ref---of 9
btrfs_check_and_init_root_item---of 3
btrfs_del_root---of 5
btrfs_del_root_ref---of 11
btrfs_find_orphan_roots---of 19
btrfs_find_root---of 16
btrfs_insert_root---of 1
btrfs_set_root_node100%of 1
btrfs_subvolume_reserve_metadata---of 10
btrfs_update_root12%of 17
btrfs_update_root_times100%of 1
-----------
SUMMARY22%of 19

__ia32_sys_userfaultfd---of 5
__x64_sys_userfaultfd---of 5
assert_fault_locked---of 10
dup_userfaultfd---of 18
dup_userfaultfd_complete---of 6
handle_userfault---of 80
init_once_userfaultfd_ctx---of 1
mmget_not_zero---of 5
mremap_userfaultfd_complete50%of 4
mremap_userfaultfd_prep20%of 10
new_userfaultfd---of 7
userfaultfd_ctx_put---of 13
userfaultfd_dev_ioctl---of 3
userfaultfd_event_wait_completion---of 39
userfaultfd_ioctl---of 361
userfaultfd_poll---of 8
userfaultfd_read_iter---of 61
userfaultfd_release---of 25
userfaultfd_remove---of 8
userfaultfd_set_vm_flags---of 8
userfaultfd_show_fdinfo---of 6
userfaultfd_unmap_complete34%of 6
userfaultfd_unmap_prep13%of 16
userfaultfd_wake_function---of 7
userfaultfd_wp_async---of 3
userfaultfd_wp_unpopulated---of 3
vma_iter_set---of 1
-----------
SUMMARY23%of 36

-----------
SUMMARY---of 0

__bpf_trace_x86_fpu---of 1
__probestub_x86_fpu_after_restore---of 1
__probestub_x86_fpu_after_save---of 1
__probestub_x86_fpu_before_restore---of 1
__probestub_x86_fpu_before_save---of 1
__probestub_x86_fpu_copy_dst---of 1
__probestub_x86_fpu_copy_src---of 1
__probestub_x86_fpu_dropped---of 1
__probestub_x86_fpu_init_state---of 1
__probestub_x86_fpu_regs_activated---of 1
__probestub_x86_fpu_regs_deactivated---of 1
__probestub_x86_fpu_xstate_check_failed---of 1
__traceiter_x86_fpu_after_restore---of 4
__traceiter_x86_fpu_after_save---of 4
__traceiter_x86_fpu_before_restore---of 4
__traceiter_x86_fpu_before_save---of 4
__traceiter_x86_fpu_copy_dst---of 4
__traceiter_x86_fpu_copy_src---of 4
__traceiter_x86_fpu_dropped---of 4
__traceiter_x86_fpu_init_state---of 4
__traceiter_x86_fpu_regs_activated---of 4
__traceiter_x86_fpu_regs_deactivated---of 4
__traceiter_x86_fpu_xstate_check_failed---of 4
fpregs_activate27%of 15
fpregs_assert_state_consistent60%of 5
fpregs_lock_and_load---of 16
fpregs_mark_activate---of 1
fpstate_init_user---of 4
fpstate_reset---of 1
fpu__clear_user_states39%of 13
fpu__drop---of 32
fpu__exception_code---of 8
fpu_alloc_guest_fpstate---of 7
fpu_clone---of 46
fpu_copy_guest_fpstate_to_uabi---of 4
fpu_copy_uabi_to_guest_fpstate---of 8
fpu_enable_guest_xfd_features---of 8
fpu_flush_thread---of 4
fpu_free_guest_fpstate---of 4
fpu_reset_from_exception_fixup---of 1
fpu_swap_kvm_fpstate---of 11
fpu_sync_fpstate---of 33
fpu_sync_guest_vmexit_xfd_state---of 8
fpu_thread_struct_whitelist---of 1
fpu_update_guest_xfd---of 6
irq_fpu_usable40%of 5
kernel_fpu_begin_mask50%of 14
kernel_fpu_end60%of 5
perf_trace_x86_fpu---of 9
restore_fpregs_from_fpstate37%of 11
save_fpregs_to_fpstate50%of 10
switch_fpu_return60%of 5
trace_event_raw_event_x86_fpu---of 8
trace_raw_output_x86_fpu---of 3
-----------
SUMMARY44%of 83

-----------
SUMMARY---of 0

__fscrypt_fname_encrypted_size---of 6
fscrypt_base64url_decode---of 9
fscrypt_d_revalidate---of 5
fscrypt_fname_alloc_buffer---of 3
fscrypt_fname_disk_to_usr---of 23
fscrypt_fname_encrypt---of 8
fscrypt_fname_encrypted_size---of 6
fscrypt_fname_free_buffer67%of 3
fscrypt_fname_siphash---of 3
fscrypt_match_name50%of 6
fscrypt_setup_filename9%of 23
-----------
SUMMARY22%of 32

__account_locked_vm---of 11
__vcalloc_noprof---of 3
__vm_enough_memory15%of 14
__vmalloc_array_noprof---of 3
account_locked_vm---of 11
folio_anon_vma---of 1
folio_copy---of 12
folio_mapping30%of 17
get_cmdline---of 7
kfree_const---of 3
kmemdup_array---of 3
kmemdup_noprof---of 3
kmemdup_nul50%of 4
kstrdup50%of 4
kstrdup_const100%of 3
kstrndup---of 4
kvfree---of 3
kvfree_sensitive---of 4
kvmalloc_node_noprof38%of 8
kvmemdup---of 3
kvrealloc_noprof---of 5
mem_dump_obj---of 6
memcmp_pages---of 1
memdup_user40%of 5
memdup_user_nul---of 5
overcommit_kbytes_handler---of 3
overcommit_policy_handler---of 6
overcommit_ratio_handler---of 3
page_offline_begin---of 1
page_offline_end---of 1
page_offline_freeze---of 1
page_offline_thaw---of 1
randomize_page---of 3
randomize_stack_top---of 3
strndup_user40%of 5
sync_overcommit_as---of 1
vcalloc_noprof---of 3
vm_commit_limit---of 3
vm_memory_committed---of 1
vm_mmap---of 3
vm_mmap_pgoff50%of 14
vma_is_stack_for_current---of 3
vma_set_file---of 4
vmalloc_array_noprof---of 3
vmemdup_user---of 10
-----------
SUMMARY38%of 74

ovl_alloc_entry---of 3
ovl_already_copied_up---of 11
ovl_can_decode_fh---of 5
ovl_check_metacopy_xattr---of 21
ovl_check_protattr---of 20
ovl_check_setxattr---of 6
ovl_copy_up_end---of 3
ovl_copy_up_start---of 12
ovl_copyattr---of 9
ovl_dentry_clear_flag---of 1
ovl_dentry_get_redirect---of 1
ovl_dentry_has_upper_alias---of 1
ovl_dentry_has_xwhiteouts---of 1
ovl_dentry_init_flags---of 11
ovl_dentry_init_reval---of 11
ovl_dentry_is_opaque---of 1
ovl_dentry_is_whiteout---of 3
ovl_dentry_lower---of 5
ovl_dentry_lowerdata---of 6
ovl_dentry_needs_data_copy_up---of 9
ovl_dentry_needs_data_copy_up_locked---of 4
ovl_dentry_real---of 6
ovl_dentry_remote---of 1
ovl_dentry_set_flag---of 1
ovl_dentry_set_lowerdata---of 9
ovl_dentry_set_opaque---of 1
ovl_dentry_set_redirect---of 1
ovl_dentry_set_upper_alias---of 1
ovl_dentry_set_xwhiteouts---of 1
ovl_dentry_test_flag---of 1
ovl_dentry_update_reval---of 3
ovl_dentry_upper---of 1
ovl_dentry_weird---of 1
ovl_dir_cache---of 4
ovl_dir_modified---of 7
ovl_drop_write---of 3
ovl_end_write---of 12
ovl_ensure_verity_loaded---of 5
ovl_free_entry---of 6
ovl_get_dir_xattr_val---of 8
ovl_get_redirect_xattr---of 25
ovl_get_verity_digest---of 15
ovl_get_write_access---of 3
ovl_has_upperdata---of 7
ovl_i_dentry_upper---of 1
ovl_i_path_real---of 10
ovl_index_all---of 5
ovl_indexdir---of 5
ovl_init_uuid_xattr---of 20
ovl_inode_lower---of 5
ovl_inode_lowerdata---of 9
ovl_inode_real---of 8
ovl_inode_realdata---of 18
ovl_inode_update---of 5
ovl_inode_upper---of 3
ovl_inode_version_get---of 3
ovl_inuse_trylock---of 3
ovl_inuse_unlock---of 4
ovl_is_inuse---of 1
ovl_is_metacopy_dentry25%of 12
ovl_is_whiteout---of 4
ovl_layer_lower40%of 5
ovl_layer_set_xwhiteouts---of 3
ovl_lock_rename_workdir---of 5
ovl_lowerdata_redirect50%of 4
ovl_need_index---of 17
ovl_nlink_end---of 36
ovl_nlink_start---of 16
ovl_override_creds67%of 3
ovl_path_check_origin_xattr---of 5
ovl_path_check_xwhiteout_xattr---of 7
ovl_path_is_whiteout---of 5
ovl_path_lower---of 5
ovl_path_lowerdata---of 6
ovl_path_open---of 7
ovl_path_real29%of 7
ovl_path_realdata30%of 10
ovl_path_type22%of 14
ovl_path_upper---of 3
ovl_put_write_access---of 3
ovl_set_dir_cache---of 1
ovl_set_impure---of 10
ovl_set_metacopy_xattr---of 9
ovl_set_protattr---of 19
ovl_set_upperdata---of 1
ovl_stack_alloc---of 1
ovl_stack_cpy---of 6
ovl_stack_free---of 4
ovl_stack_put---of 4
ovl_start_write---of 12
ovl_sync_status---of 4
ovl_validate_verity---of 23
ovl_verify_lower---of 5
ovl_want_write---of 3
ovl_workdir---of 3
-----------
SUMMARY31%of 55

__iterate_supers---of 8
__put_super60%of 10
alloc_super25%of 16
bdev_read_only---of 4
bdev_super_lock---of 16
compare_single---of 1
deactivate_locked_super---of 3
deactivate_super40%of 5
destroy_super_rcu---of 1
destroy_super_work---of 5
do_emergency_remount---of 1
do_emergency_remount_callback---of 8
do_thaw_all---of 1
do_thaw_all_callback---of 7
drop_super---of 1
drop_super_exclusive---of 1
emergency_remount---of 3
emergency_thaw_all---of 3
free_anon_bdev---of 1
freeze_inc---of 9
freeze_super---of 58
fs_bdev_freeze---of 16
fs_bdev_mark_dead---of 6
fs_bdev_sync---of 3
fs_bdev_thaw---of 15
generic_shutdown_super---of 11
get_anon_bdev---of 3
get_tree_bdev25%of 12
get_tree_keyed---of 6
get_tree_nodev34%of 6
get_tree_single---of 6
grab_super---of 10
iterate_supers70%of 10
iterate_supers_type---of 9
kill_anon_super---of 1
kill_block_super---of 3
kill_litter_super---of 3
kill_super_notify---of 8
lockdep_sb_freeze_release---of 1
mount_bdev---of 10
mount_capable100%of 3
mount_nodev---of 5
mount_single---of 11
put_super---of 1
reconfigure_single---of 4
reconfigure_super---of 27
retire_super---of 5
sb_freeze_unlock---of 6
sb_init_dio_done_wq---of 4
set_anon_super---of 3
set_anon_super_fc67%of 3
set_bdev_super---of 1
setup_bdev_super27%of 15
sget---of 17
sget_dev---of 1
sget_fc35%of 23
super_cache_count---of 7
super_cache_scan---of 9
super_lock47%of 15
super_s_dev_set100%of 1
super_s_dev_test---of 3
super_setup_bdi---of 1
super_setup_bdi_name---of 5
super_trylock_shared---of 5
test_bdev_super---of 3
test_keyed_super---of 1
test_single_super---of 1
thaw_super---of 3
thaw_super_locked---of 17
user_get_super43%of 7
vfs_get_tree34%of 9
-----------
SUMMARY41%of 135

-----------
SUMMARY---of 0

attach_dn---of 7
dnotify_flush10%of 20
dnotify_free_mark---of 3
dnotify_handle_event---of 13
dnotify_recalc_inode_mask---of 7
fcntl_dirnotify---of 31
fsnotify_group_unlock---of 3
-----------
SUMMARY10%of 20

_compound_head---of 7
bh_read---of 3
folio_zero_segment---of 14
inode_write_data---of 58
ntfs3_write_inode100%of 1
ntfs_bmap---of 1
ntfs_create_inode---of 73
ntfs_create_reparse_buffer---of 15
ntfs_direct_IO---of 10
ntfs_evict_inode---of 1
ntfs_flush_inodes---of 9
ntfs_get_block---of 3
ntfs_get_block_bmap---of 3
ntfs_get_block_direct_IO_R---of 3
ntfs_get_block_direct_IO_W---of 3
ntfs_get_block_vbo---of 41
ntfs_get_block_write_begin---of 3
ntfs_get_link---of 5
ntfs_iget515%of 134
ntfs_link_inode---of 4
ntfs_read_folio---of 7
ntfs_readahead---of 6
ntfs_readlink_hlp---of 48
ntfs_resident_writepage---of 9
ntfs_set_inode100%of 1
ntfs_set_size---of 7
ntfs_sync_inode---of 1
ntfs_test_inode100%of 1
ntfs_translate_junction---of 15
ntfs_unlink_inode---of 18
ntfs_write_begin---of 7
ntfs_write_end---of 23
ntfs_writepages---of 4
put_page---of 14
reset_log_file---of 8
-----------
SUMMARY17%of 137

-----------
SUMMARY---of 0

tomoyo_commit_condition---of 19
tomoyo_condition1%of 207
tomoyo_get_attributes---of 13
tomoyo_get_condition---of 106
-----------
SUMMARY1%of 207

do_ovl_get_acl---of 7
ovl_fiemap---of 4
ovl_fileattr_get---of 5
ovl_fileattr_set---of 5
ovl_fill_inode---of 7
ovl_get_acl---of 4
ovl_get_acl_path---of 16
ovl_get_inode---of 69
ovl_get_inode_acl---of 1
ovl_get_link---of 3
ovl_get_nlink---of 21
ovl_get_trap_inode---of 5
ovl_getattr20%of 47
ovl_inode_init---of 19
ovl_inode_set---of 1
ovl_inode_test---of 1
ovl_lookup_inode---of 8
ovl_lookup_trap_inode---of 5
ovl_new_inode---of 3
ovl_permission---of 11
ovl_real_fileattr_get---of 5
ovl_real_fileattr_set---of 5
ovl_set_acl---of 27
ovl_set_nlink_common---of 6
ovl_set_nlink_lower---of 1
ovl_set_nlink_upper---of 1
ovl_setattr---of 19
ovl_update_time50%of 6
-----------
SUMMARY23%of 53

-----------
SUMMARY---of 0

hfs_compare_dentry---of 8
hfs_hash_dentry58%of 7
hfs_strcmp100%of 4
-----------
SUMMARY73%of 11

__register_chrdev---of 14
__register_chrdev_region---of 44
__unregister_chrdev---of 9
alloc_chrdev_region---of 3
base_probe---of 3
cd_forget---of 3
cdev_add---of 4
cdev_alloc---of 3
cdev_default_release---of 6
cdev_del---of 1
cdev_device_add---of 10
cdev_device_del---of 3
cdev_dynamic_release---of 6
cdev_init---of 1
cdev_put67%of 3
cdev_set_parent---of 3
chrdev_open---of 20
chrdev_show---of 6
exact_lock---of 4
exact_match---of 1
register_chrdev_region---of 13
unregister_chrdev_region---of 11
-----------
SUMMARY67%of 3

-----------
SUMMARY---of 0

alloc_fs_context30%of 24
fc_drop_locked---of 1
finish_clean_context34%of 6
fs_context_for_mount100%of 1
fs_context_for_reconfigure100%of 1
fs_context_for_submount---of 4
generic_parse_monolithic25%of 12
legacy_fs_context_dup---of 5
legacy_fs_context_free---of 4
legacy_get_tree---of 4
legacy_init_fs_context67%of 3
legacy_parse_monolithic---of 5
legacy_parse_param---of 20
legacy_reconfigure---of 4
logfc---of 9
parse_monolithic_mount_data100%of 1
put_fs_context23%of 36
vfs_clean_context40%of 5
vfs_dup_fs_context---of 19
vfs_parse_fs_param25%of 12
vfs_parse_fs_param_source---of 5
vfs_parse_fs_string50%of 4
vfs_parse_monolithic_sep---of 12
-----------
SUMMARY31%of 105

__bio_queue_enter---of 18
__blk_flush_plug22%of 14
__bpf_trace_block_bio---of 1
__bpf_trace_block_bio_complete---of 1
__bpf_trace_block_bio_remap---of 1
__bpf_trace_block_buffer---of 1
__bpf_trace_block_plug---of 1
__bpf_trace_block_rq---of 1
__bpf_trace_block_rq_completion---of 1
__bpf_trace_block_rq_remap---of 1
__bpf_trace_block_rq_requeue---of 1
__bpf_trace_block_split---of 1
__bpf_trace_block_unplug---of 1
__probestub_block_bio_backmerge---of 1
__probestub_block_bio_bounce---of 1
__probestub_block_bio_complete---of 1
__probestub_block_bio_frontmerge---of 1
__probestub_block_bio_queue---of 1
__probestub_block_bio_remap---of 1
__probestub_block_dirty_buffer---of 1
__probestub_block_getrq---of 1
__probestub_block_io_done---of 1
__probestub_block_io_start---of 1
__probestub_block_plug---of 1
__probestub_block_rq_complete---of 1
__probestub_block_rq_error---of 1
__probestub_block_rq_insert---of 1
__probestub_block_rq_issue---of 1
__probestub_block_rq_merge---of 1
__probestub_block_rq_remap---of 1
__probestub_block_rq_requeue---of 1
__probestub_block_split---of 1
__probestub_block_touch_buffer---of 1
__probestub_block_unplug---of 1
__submit_bio59%of 12
__traceiter_block_bio_backmerge---of 4
__traceiter_block_bio_bounce---of 4
__traceiter_block_bio_complete---of 4
__traceiter_block_bio_frontmerge---of 4
__traceiter_block_bio_queue---of 4
__traceiter_block_bio_remap---of 4
__traceiter_block_dirty_buffer---of 4
__traceiter_block_getrq---of 4
__traceiter_block_io_done---of 4
__traceiter_block_io_start---of 4
__traceiter_block_plug---of 4
__traceiter_block_rq_complete---of 4
__traceiter_block_rq_error---of 4
__traceiter_block_rq_insert---of 4
__traceiter_block_rq_issue---of 4
__traceiter_block_rq_merge---of 4
__traceiter_block_rq_remap---of 4
__traceiter_block_rq_requeue---of 4
__traceiter_block_split---of 4
__traceiter_block_touch_buffer---of 4
__traceiter_block_unplug---of 4
bdev_end_io_acct---of 11
bdev_start_io_acct---of 5
bio_end_io_acct_remapped---of 1
bio_poll---of 26
bio_start_io_acct---of 1
blk_alloc_queue---of 7
blk_check_plugged---of 12
blk_clear_pm_only---of 4
blk_finish_plug100%of 3
blk_free_queue_rcu---of 1
blk_get_queue40%of 5
blk_io_schedule67%of 3
blk_lld_busy---of 4
blk_op_str---of 4
blk_put_queue---of 5
blk_queue_enter---of 21
blk_queue_exit29%of 14
blk_queue_flag_clear---of 1
blk_queue_flag_set---of 1
blk_queue_flag_test_and_set---of 1
blk_queue_start_drain---of 3
blk_queue_usage_counter_release---of 1
blk_rq_timed_out_timer---of 1
blk_set_pm_only---of 1
blk_start_plug100%of 3
blk_start_plug_nr_ios---of 3
blk_status_to_errno67%of 3
blk_status_to_str---of 3
blk_sync_queue---of 1
blk_timeout_work---of 1
blk_try_enter_queue---of 28
errno_to_blk_status---of 20
iocb_bio_iopoll---of 13
kblockd_mod_delayed_work_on100%of 1
kblockd_schedule_work---of 1
perf_trace_block_bio---of 8
perf_trace_block_bio_complete---of 9
perf_trace_block_bio_remap---of 8
perf_trace_block_buffer---of 8
perf_trace_block_plug---of 8
perf_trace_block_rq---of 13
perf_trace_block_rq_completion---of 11
perf_trace_block_rq_remap---of 10
perf_trace_block_rq_requeue---of 13
perf_trace_block_split---of 8
perf_trace_block_unplug---of 8
should_fail_bio67%of 3
should_fail_request---of 3
submit_bio34%of 12
submit_bio_noacct23%of 88
submit_bio_noacct_nocheck30%of 40
trace_event_raw_event_block_bio---of 7
trace_event_raw_event_block_bio_complete---of 8
trace_event_raw_event_block_bio_remap---of 7
trace_event_raw_event_block_buffer---of 7
trace_event_raw_event_block_plug---of 7
trace_event_raw_event_block_rq---of 12
trace_event_raw_event_block_rq_completion---of 10
trace_event_raw_event_block_rq_remap---of 9
trace_event_raw_event_block_rq_requeue---of 12
trace_event_raw_event_block_split---of 7
trace_event_raw_event_block_unplug---of 7
trace_raw_output_block_bio---of 3
trace_raw_output_block_bio_complete---of 3
trace_raw_output_block_bio_remap---of 3
trace_raw_output_block_buffer---of 3
trace_raw_output_block_plug---of 3
trace_raw_output_block_rq---of 3
trace_raw_output_block_rq_completion---of 3
trace_raw_output_block_rq_remap---of 3
trace_raw_output_block_rq_requeue---of 3
trace_raw_output_block_split---of 3
trace_raw_output_block_unplug---of 3
update_io_ticks88%of 8
-----------
SUMMARY35%of 209

copy_from_kernel_nofault_allowed40%of 5
-----------
SUMMARY40%of 5

-----------
SUMMARY---of 0

__entry_find---of 22
__mb_cache_entry_free---of 9
hlist_bl_lock29%of 7
mb_cache_count---of 1
mb_cache_create---of 14
mb_cache_destroy---of 11
mb_cache_entry_create22%of 28
mb_cache_entry_delete_or_get---of 6
mb_cache_entry_find_first---of 1
mb_cache_entry_find_next---of 1
mb_cache_entry_get---of 15
mb_cache_entry_touch---of 1
mb_cache_entry_wait_unused---of 5
mb_cache_scan---of 1
mb_cache_shrink---of 13
mb_cache_shrink_worker---of 1
-----------
SUMMARY23%of 35

ext4_init_orphan_info---of 35
ext4_orphan_add26%of 43
ext4_orphan_cleanup---of 65
ext4_orphan_del45%of 27
ext4_orphan_file_block_trigger---of 6
ext4_orphan_file_empty---of 6
ext4_process_orphan---of 7
ext4_release_orphan_info---of 7
lock_buffer67%of 3
-----------
SUMMARY35%of 73

__io_account_mem---of 6
__io_register_rsrc_update---of 54
__io_sqe_buffers_unregister---of 8
__io_sqe_files_unregister---of 11
io_alloc_page_table---of 6
io_buffer_unmap---of 10
io_copy_iov---of 4
io_file_bitmap_set---of 3
io_files_update---of 23
io_files_update_prep---of 6
io_free_page_table---of 4
io_import_fixed---of 8
io_queue_rsrc_removal---of 14
io_register_files_update---of 6
io_register_rsrc---of 13
io_register_rsrc_update---of 7
io_rsrc_data_alloc---of 7
io_rsrc_data_free---of 5
io_rsrc_node_alloc50%of 6
io_rsrc_node_destroy---of 4
io_rsrc_node_ref_zero---of 19
io_rsrc_ref_quiesce---of 19
io_sqe_buffer_register---of 84
io_sqe_buffers_register---of 23
io_sqe_buffers_unregister---of 4
io_sqe_files_register---of 17
io_sqe_files_unregister---of 4
-----------
SUMMARY50%of 6

__add_relation_rb---of 6
__btrfs_qgroup_free_meta---of 7
__btrfs_qgroup_release_data7%of 32
__btrfs_qgroup_reserve_meta---of 4
__del_qgroup_rb---of 17
__del_qgroup_relation---of 44
add_qgroup_item---of 8
add_qgroup_rb---of 7
add_qgroup_relation_item---of 3
add_relation_rb---of 16
btrfs_add_qgroup_relation---of 22
btrfs_check_quota_leak---of 11
btrfs_create_qgroup---of 18
btrfs_del_qgroup_relation---of 1
btrfs_free_qgroup_config---of 4
btrfs_free_squota_rsv---of 5
btrfs_limit_qgroup---of 24
btrfs_qgroup_account_extent---of 72
btrfs_qgroup_account_extents14%of 43
btrfs_qgroup_add_swapped_blocks---of 27
btrfs_qgroup_check_inherit---of 17
btrfs_qgroup_check_reserved_leak---of 7
btrfs_qgroup_clean_swapped_blocks---of 26
btrfs_qgroup_convert_reserved_meta---of 49
btrfs_qgroup_destroy_extent_records---of 4
btrfs_qgroup_enabled---of 1
btrfs_qgroup_free_data---of 1
btrfs_qgroup_free_meta_all_pertrans---of 17
btrfs_qgroup_free_refroot---of 34
btrfs_qgroup_full_accounting67%of 3
btrfs_qgroup_inherit---of 147
btrfs_qgroup_init_swapped_blocks---of 1
btrfs_qgroup_mode---of 3
btrfs_qgroup_release_data100%of 1
btrfs_qgroup_rescan---of 13
btrfs_qgroup_rescan_resume---of 3
btrfs_qgroup_rescan_worker---of 60
btrfs_qgroup_reserve_data---of 4
btrfs_qgroup_reserve_meta34%of 6
btrfs_qgroup_trace_extent---of 6
btrfs_qgroup_trace_extent_nolock---of 28
btrfs_qgroup_trace_extent_post---of 7
btrfs_qgroup_trace_leaf_items---of 11
btrfs_qgroup_trace_subtree---of 44
btrfs_qgroup_trace_subtree_after_cow5%of 45
btrfs_qgroup_wait_for_completion---of 4
btrfs_quota_disable---of 32
btrfs_quota_enable---of 76
btrfs_read_qgroup_config---of 63
btrfs_record_squota_delta7%of 31
btrfs_remove_qgroup---of 39
btrfs_run_qgroups12%of 25
del_qgroup_relation_item---of 5
qgroup_dirty---of 4
qgroup_mark_inconsistent---of 4
qgroup_rescan_init---of 14
qgroup_rescan_zero_tracking---of 7
qgroup_reserve---of 40
qgroup_reserve_data---of 37
qgroup_trace_new_subtree_blocks---of 38
qgroup_update_refcnt---of 38
quick_update_accounting---of 38
trace_qgroup_meta_reserve---of 15
trace_qgroup_update_reserve---of 15
try_flush_qgroup---of 8
update_qgroup_limit_item---of 4
update_qgroup_status_item---of 4
-----------
SUMMARY11%of 186

call_blocking_lsm_notifier---of 1
inode_free_by_rcu---of 1
lsm_append---of 8
lsm_fill_user_ctx---of 6
lsm_inode_alloc---of 3
register_blocking_lsm_notifier---of 1
security_audit_rule_free---of 4
security_audit_rule_init---of 4
security_audit_rule_known---of 4
security_audit_rule_match---of 4
security_binder_set_context_mgr---of 4
security_binder_transaction---of 4
security_binder_transfer_binder---of 4
security_binder_transfer_file---of 4
security_bpf---of 4
security_bpf_map---of 4
security_bpf_map_create---of 4
security_bpf_map_free---of 4
security_bpf_prog---of 4
security_bpf_prog_free---of 4
security_bpf_prog_load---of 4
security_bpf_token_capable---of 4
security_bpf_token_cmd---of 4
security_bpf_token_create---of 4
security_bpf_token_free---of 4
security_bprm_check---of 4
security_bprm_committed_creds---of 4
security_bprm_committing_creds---of 4
security_bprm_creds_for_exec---of 4
security_bprm_creds_from_file---of 4
security_capable75%of 4
security_capget---of 4
security_capset---of 4
security_create_user_ns---of 4
security_cred_alloc_blank---of 7
security_cred_free---of 5
security_cred_getsecid---of 4
security_current_getsecid_subj75%of 4
security_d_instantiate72%of 7
security_dentry_create_files_as---of 4
security_dentry_init_security---of 4
security_file_alloc58%of 7
security_file_fcntl75%of 4
security_file_free50%of 6
security_file_ioctl75%of 4
security_file_ioctl_compat---of 4
security_file_lock75%of 4
security_file_mprotect---of 4
security_file_open20%of 26
security_file_permission75%of 4
security_file_post_open75%of 4
security_file_receive---of 4
security_file_release75%of 4
security_file_send_sigiotask---of 4
security_file_set_fowner75%of 4
security_file_truncate75%of 4
security_free_mnt_opts40%of 5
security_fs_context_dup---of 4
security_fs_context_parse_param58%of 7
security_fs_context_submount---of 4
security_getprocattr---of 5
security_getselfattr---of 23
security_ib_alloc_security---of 4
security_ib_endport_manage_subnet---of 4
security_ib_free_security---of 4
security_ib_pkey_access---of 4
security_inet_conn_established---of 4
security_inet_conn_request---of 4
security_inet_csk_clone---of 4
security_inode_alloc58%of 7
security_inode_copy_up---of 4
security_inode_copy_up_xattr---of 4
security_inode_create60%of 5
security_inode_follow_link60%of 5
security_inode_free67%of 6
security_inode_get_acl---of 5
security_inode_getattr60%of 5
security_inode_getsecctx---of 4
security_inode_getsecid---of 4
security_inode_getsecurity---of 5
security_inode_getxattr---of 5
security_inode_init_security45%of 18
security_inode_init_security_anon75%of 4
security_inode_invalidate_secctx---of 4
security_inode_killpriv---of 4
security_inode_link60%of 5
security_inode_listsecurity---of 5
security_inode_listxattr---of 5
security_inode_mkdir60%of 5
security_inode_mknod---of 5
security_inode_need_killpriv75%of 4
security_inode_notifysecctx---of 4
security_inode_permission80%of 5
security_inode_post_create_tmpfile---of 5
security_inode_post_remove_acl---of 5
security_inode_post_removexattr---of 5
security_inode_post_set_acl---of 5
security_inode_post_setattr60%of 5
security_inode_post_setxattr---of 5
security_inode_readlink60%of 5
security_inode_remove_acl---of 5
security_inode_removexattr---of 6
security_inode_rename42%of 12
security_inode_rmdir60%of 5
security_inode_set_acl---of 5
security_inode_setattr60%of 5
security_inode_setsecctx---of 4
security_inode_setsecurity---of 5
security_inode_setxattr---of 6
security_inode_symlink60%of 5
security_inode_unlink60%of 5
security_ipc_getsecid---of 4
security_ipc_permission---of 4
security_ismaclabel---of 4
security_kernel_act_as---of 4
security_kernel_create_files_as---of 4
security_kernel_load_data---of 4
security_kernel_module_request---of 4
security_kernel_post_load_data---of 4
security_kernel_post_read_file---of 4
security_kernel_read_file---of 4
security_kernfs_init_security---of 4
security_key_alloc---of 4
security_key_free---of 4
security_key_getsecurity---of 4
security_key_permission---of 4
security_key_post_create_or_update---of 4
security_locked_down---of 4
security_mmap_addr75%of 4
security_mmap_file72%of 7
security_move_mount75%of 4
security_mptcp_add_subflow---of 4
security_msg_msg_alloc---of 7
security_msg_msg_free---of 4
security_msg_queue_alloc---of 7
security_msg_queue_associate---of 4
security_msg_queue_free---of 4
security_msg_queue_msgctl---of 4
security_msg_queue_msgrcv---of 4
security_msg_queue_msgsnd---of 4
security_netlink_send---of 4
security_path_chmod60%of 5
security_path_chown60%of 5
security_path_chroot75%of 4
security_path_link60%of 5
security_path_mkdir60%of 5
security_path_mknod60%of 5
security_path_notify---of 4
security_path_post_mknod60%of 5
security_path_rename63%of 8
security_path_rmdir60%of 5
security_path_symlink60%of 5
security_path_truncate60%of 5
security_path_unlink60%of 5
security_perf_event_alloc---of 4
security_perf_event_free---of 4
security_perf_event_open---of 4
security_perf_event_read---of 4
security_perf_event_write---of 4
security_post_notification---of 4
security_prepare_creds---of 7
security_ptrace_access_check---of 4
security_ptrace_traceme---of 4
security_quota_on---of 4
security_quotactl75%of 4
security_release_secctx---of 4
security_req_classify_flow---of 4
security_sb_alloc58%of 7
security_sb_clone_mnt_opts---of 4
security_sb_delete---of 4
security_sb_eat_lsm_opts75%of 4
security_sb_free---of 4
security_sb_kern_mount75%of 4
security_sb_mnt_opts_compat---of 4
security_sb_mount75%of 4
security_sb_pivotroot75%of 4
security_sb_remount---of 4
security_sb_set_mnt_opts60%of 5
security_sb_show_options---of 4
security_sb_statfs75%of 4
security_sb_umount---of 4
security_sctp_assoc_established---of 4
security_sctp_assoc_request---of 4
security_sctp_bind_connect---of 4
security_sctp_sk_clone---of 4
security_secctx_to_secid---of 4
security_secid_to_secctx---of 4
security_secmark_refcount_dec---of 4
security_secmark_refcount_inc---of 4
security_secmark_relabel_packet---of 4
security_sem_alloc---of 7
security_sem_associate---of 4
security_sem_free---of 4
security_sem_semctl---of 4
security_sem_semop---of 4
security_setprocattr---of 5
security_setselfattr---of 10
security_settime64---of 4
security_shm_alloc---of 7
security_shm_associate---of 4
security_shm_free---of 4
security_shm_shmat---of 4
security_shm_shmctl---of 4
security_sk_alloc---of 4
security_sk_classify_flow---of 4
security_sk_clone---of 4
security_sk_free---of 4
security_skb_classify_flow---of 4
security_sock_graft---of 4
security_sock_rcv_skb---of 4
security_socket_accept---of 4
security_socket_bind---of 4
security_socket_connect---of 4
security_socket_create---of 4
security_socket_getpeername---of 4
security_socket_getpeersec_dgram---of 4
security_socket_getpeersec_stream---of 4
security_socket_getsockname---of 4
security_socket_getsockopt---of 4
security_socket_listen---of 4
security_socket_post_create---of 4
security_socket_recvmsg---of 4
security_socket_sendmsg---of 4
security_socket_setsockopt---of 4
security_socket_shutdown---of 4
security_socket_socketpair---of 4
security_syslog---of 4
security_task_alloc---of 7
security_task_fix_setgid---of 4
security_task_fix_setgroups---of 4
security_task_fix_setuid---of 4
security_task_free---of 4
security_task_getioprio---of 4
security_task_getpgid---of 4
security_task_getscheduler---of 4
security_task_getsecid_obj---of 4
security_task_getsid---of 4
security_task_kill---of 4
security_task_movememory---of 4
security_task_prctl---of 7
security_task_prlimit---of 4
security_task_setioprio---of 4
security_task_setnice---of 4
security_task_setpgid---of 4
security_task_setrlimit---of 4
security_task_setscheduler---of 4
security_task_to_inode75%of 4
security_transfer_creds---of 4
security_tun_dev_alloc_security---of 4
security_tun_dev_attach---of 4
security_tun_dev_attach_queue---of 4
security_tun_dev_create---of 4
security_tun_dev_free_security---of 4
security_tun_dev_open---of 4
security_unix_may_send---of 4
security_unix_stream_connect---of 4
security_uring_cmd---of 4
security_uring_override_creds---of 4
security_uring_sqpoll---of 4
security_vm_enough_memory_mm75%of 4
security_watch_key---of 4
security_xfrm_decode_session---of 4
security_xfrm_policy_alloc---of 4
security_xfrm_policy_clone---of 4
security_xfrm_policy_delete---of 4
security_xfrm_policy_free---of 4
security_xfrm_policy_lookup---of 4
security_xfrm_state_alloc---of 4
security_xfrm_state_alloc_acquire---of 4
security_xfrm_state_delete---of 4
security_xfrm_state_free---of 4
security_xfrm_state_pol_flow_match---of 3
unregister_blocking_lsm_notifier---of 1
-----------
SUMMARY60%of 330

char2uni---of 1
uni2char40%of 5
-----------
SUMMARY40%of 5

-----------
SUMMARY---of 0

ext4_block_bitmap_csum_set50%of 10
ext4_block_bitmap_csum_verify30%of 10
ext4_count_free---of 1
ext4_inode_bitmap_csum_set30%of 10
ext4_inode_bitmap_csum_verify---of 10
-----------
SUMMARY37%of 30

hfs_create---of 4
hfs_dir_release---of 4
hfs_lookup40%of 5
hfs_mkdir---of 4
hfs_readdir---of 30
hfs_remove---of 6
hfs_rename---of 10
-----------
SUMMARY40%of 5

btrfs_alloc_block_rsv---of 3
btrfs_block_rsv_add50%of 4
btrfs_block_rsv_add_bytes---of 4
btrfs_block_rsv_check---of 1
btrfs_block_rsv_migrate---of 7
btrfs_block_rsv_refill---of 6
btrfs_block_rsv_release35%of 23
btrfs_block_rsv_use_bytes---of 4
btrfs_check_trunc_cache_free_space---of 1
btrfs_free_block_rsv---of 3
btrfs_init_block_rsv---of 1
btrfs_init_global_block_rsv---of 1
btrfs_init_metadata_block_rsv100%of 1
btrfs_init_root_block_rsv---of 11
btrfs_release_global_block_rsv---of 17
btrfs_space_info_update_bytes_may_use---of 35
btrfs_update_global_block_rsv---of 17
btrfs_use_block_rsv15%of 27
-----------
SUMMARY28%of 55

__kfence_alloc42%of 12
__kfence_free50%of 8
alloc_covered_contains67%of 3
check_canary22%of 38
get_alloc_stack_hash30%of 17
kfence_check_canary_callback---of 5
kfence_debugfs_init---of 3
kfence_guarded_alloc37%of 41
kfence_guarded_free40%of 15
kfence_handle_page_fault---of 18
kfence_init_enable---of 5
kfence_init_pool---of 22
kfence_ksize---of 4
kfence_object_start50%of 4
kfence_protect40%of 5
kfence_shutdown_cache---of 15
kfence_unprotect40%of 5
metadata_update_state67%of 6
next_object---of 1
objects_open---of 4
param_get_sample_interval---of 3
param_set_sample_interval---of 21
rcu_guarded_free---of 1
show_object---of 1
start_object---of 1
stats_open---of 1
stats_show---of 1
stop_object---of 1
toggle_allocation_gate---of 6
wake_up_kfence_timer---of 1
-----------
SUMMARY36%of 154

__ext4_new_inode31%of 200
ext4_chksum50%of 4
ext4_count_dirs---of 6
ext4_count_free_inodes---of 6
ext4_end_bitmap_read---of 4
ext4_free_inode28%of 55
ext4_has_group_desc_csum72%of 7
ext4_has_metadata_csum84%of 6
ext4_init_inode_table---of 19
ext4_lock_group23%of 9
ext4_mark_bitmap_end---of 7
ext4_mark_inode_used---of 25
ext4_orphan_get---of 21
ext4_read_inode_bitmap10%of 55
ext4_xattr_credits_for_new_inode38%of 8
find_group_orlov16%of 46
find_inode_bit25%of 12
get_orlov_stats23%of 18
rcu_read_unlock34%of 6
trace_ext4_allocate_inode27%of 15
trace_ext4_load_inode_bitmap---of 15
-----------
SUMMARY27%of 441

-----------
SUMMARY---of 0

sg_alloc_table_chained50%of 6
sg_free_table_chained---of 3
sg_pool_alloc60%of 5
sg_pool_free---of 5
-----------
SUMMARY55%of 11

virtscsi_abort---of 3
virtscsi_add_cmd44%of 16
virtscsi_change_queue_depth---of 1
virtscsi_commit_rqs---of 3
virtscsi_complete_cmd---of 20
virtscsi_ctrl_done---of 8
virtscsi_device_alloc---of 1
virtscsi_device_reset---of 3
virtscsi_eh_timed_out---of 1
virtscsi_event_done---of 8
virtscsi_freeze---of 1
virtscsi_handle_event---of 23
virtscsi_init---of 22
virtscsi_kick_event---of 3
virtscsi_map_queues---of 10
virtscsi_mq_poll---of 4
virtscsi_probe---of 13
virtscsi_queuecommand23%of 18
virtscsi_remove---of 3
virtscsi_req_done---of 1
virtscsi_restore---of 6
virtscsi_tmf---of 5
virtscsi_vq_done---of 6
-----------
SUMMARY33%of 34

__bpf_trace_exit_mmap---of 1
__bpf_trace_vm_unmapped_area---of 1
__bpf_trace_vma_mas_szero---of 1
__bpf_trace_vma_store---of 1
__get_unmapped_area43%of 14
__ia32_sys_brk---of 1
__ia32_sys_mmap_pgoff---of 1
__ia32_sys_munmap---of 1
__ia32_sys_remap_file_pages---of 1
__install_special_mapping---of 7
__mas_set_range38%of 8
__probestub_exit_mmap---of 1
__probestub_vm_unmapped_area---of 1
__probestub_vma_mas_szero---of 1
__probestub_vma_store---of 1
__se_sys_brk---of 43
__se_sys_remap_file_pages---of 30
__split_vma58%of 35
__traceiter_exit_mmap---of 4
__traceiter_vm_unmapped_area---of 4
__traceiter_vma_mas_szero---of 4
__traceiter_vma_store---of 4
__vm_munmap34%of 12
__x64_sys_brk---of 1
__x64_sys_mmap_pgoff---of 1
__x64_sys_munmap100%of 1
__x64_sys_remap_file_pages---of 1
_install_special_mapping---of 1
can_vma_merge_after36%of 17
can_vma_merge_before35%of 20
copy_vma25%of 29
do_brk_flags---of 39
do_mmap33%of 62
do_munmap100%of 1
do_vma_munmap---of 3
do_vmi_align_munmap44%of 79
do_vmi_munmap39%of 13
dup_anon_vma---of 9
exit_mmap---of 59
expand_downwards---of 42
expand_stack---of 24
expand_stack_locked---of 1
file_mmap_ok43%of 7
find_extend_vma_locked---of 9
find_mergeable_anon_vma16%of 26
find_vma50%of 4
find_vma_intersection50%of 4
find_vma_prev---of 3
generic_get_unmapped_area---of 18
generic_get_unmapped_area_topdown---of 22
get_file50%of 4
init_admin_reserve---of 1
init_reserve_notifier---of 3
init_user_reserve---of 1
insert_vm_struct---of 14
install_special_mapping---of 1
ksys_mmap_pgoff19%of 22
mapping_map_writable40%of 5
may_expand_vm19%of 11
mlock_future_ok50%of 4
mm_drop_all_locks---of 20
mm_get_unmapped_area67%of 3
mm_get_unmapped_area_vmflags67%of 3
mm_take_all_locks---of 48
mmap_region28%of 104
mmap_write_unlock---of 6
perf_trace_exit_mmap---of 8
perf_trace_vm_unmapped_area---of 8
perf_trace_vma_mas_szero---of 8
perf_trace_vma_store---of 8
reserve_mem_notifier---of 10
special_mapping_close---of 1
special_mapping_fault---of 17
special_mapping_mremap---of 4
special_mapping_name---of 1
special_mapping_split---of 1
trace_event_raw_event_exit_mmap---of 7
trace_event_raw_event_vm_unmapped_area---of 7
trace_event_raw_event_vma_mas_szero---of 7
trace_event_raw_event_vma_store---of 7
trace_raw_output_exit_mmap---of 3
trace_raw_output_vm_unmapped_area---of 4
trace_raw_output_vma_mas_szero---of 3
trace_raw_output_vma_store---of 3
unlink_file_vma75%of 4
unmap_region72%of 7
validate_mm43%of 19
vm_brk_flags---of 22
vm_flags_clear---of 6
vm_flags_set---of 6
vm_munmap---of 1
vm_stat_account40%of 5
vm_unmapped_area15%of 41
vma_complete25%of 44
vma_expand---of 32
vma_is_special_mapping---of 3
vma_iter_store31%of 13
vma_link40%of 10
vma_link_file75%of 4
vma_merge11%of 118
vma_merge_extend---of 1
vma_merge_new_vma100%of 1
vma_modify---of 10
vma_needs_dirty_tracking---of 10
vma_prepare37%of 19
vma_set_page_prot31%of 13
vma_shrink---of 14
vma_start_write67%of 6
vma_wants_writenotify---of 12
-----------
SUMMARY31%of 792

_kstrtol---of 3
_kstrtoul---of 3
_parse_integer---of 1
_parse_integer_fixup_radix23%of 9
_parse_integer_limit50%of 10
kstrtobool---of 20
kstrtobool_from_user---of 3
kstrtoint---of 4
kstrtoint_from_user---of 3
kstrtol_from_user---of 3
kstrtoll---of 9
kstrtoll_from_user---of 3
kstrtos16---of 4
kstrtos16_from_user---of 3
kstrtos8---of 4
kstrtos8_from_user---of 3
kstrtou16---of 4
kstrtou16_from_user---of 3
kstrtou8---of 4
kstrtou8_from_user---of 3
kstrtouint---of 4
kstrtouint_from_user---of 3
kstrtoul_from_user---of 3
kstrtoull---of 5
kstrtoull_from_user---of 3
-----------
SUMMARY37%of 19

-----------
SUMMARY---of 0

__f_setown100%of 1
__ia32_compat_sys_fcntl---of 8
__ia32_compat_sys_fcntl64---of 1
__ia32_sys_fcntl---of 1
__se_sys_fcntl29%of 14
__x64_compat_sys_fcntl---of 8
__x64_compat_sys_fcntl64---of 1
__x64_sys_fcntl100%of 1
check_fcntl_cmd---of 8
do_compat_fcntl64---of 35
do_fcntl3%of 85
f_delown---of 1
f_getown---of 13
f_modown38%of 8
f_setown---of 17
fasync_alloc100%of 1
fasync_free---of 1
fasync_helper---of 5
fasync_insert_entry40%of 5
fasync_remove_entry---of 6
kill_fasync7%of 30
put_compat_flock---of 1
put_compat_flock64---of 1
send_sigio---of 21
send_sigio_to_task---of 9
send_sigurg---of 24
sigio_perm---of 21
-----------
SUMMARY12%of 145

do_setattr---of 14
smack_add_opt---of 17
smack_audit_rule_init---of 7
smack_audit_rule_known---of 6
smack_audit_rule_match---of 9
smack_bprm_creds_for_exec---of 30
smack_cred_alloc_blank---of 1
smack_cred_free---of 6
smack_cred_getsecid---of 11
smack_cred_prepare---of 15
smack_cred_transfer---of 1
smack_current_getsecid_subj100%of 1
smack_d_instantiate35%of 23
smack_dentry_create_files_as---of 14
smack_file_alloc_security100%of 1
smack_file_fcntl29%of 7
smack_file_ioctl43%of 7
smack_file_lock67%of 3
smack_file_open100%of 1
smack_file_receive---of 5
smack_file_send_sigiotask---of 18
smack_file_set_fowner100%of 1
smack_free_mnt_opts---of 1
smack_from_netlbl---of 53
smack_fs_context_dup---of 4
smack_fs_context_parse_param50%of 4
smack_fs_context_submount---of 15
smack_getprocattr---of 4
smack_getselfattr---of 3
smack_inet_conn_request---of 25
smack_inet_csk_clone---of 3
smack_inode_alloc_security100%of 1
smack_inode_copy_up---of 4
smack_inode_copy_up_xattr---of 1
smack_inode_get_acl---of 1
smack_inode_getattr100%of 1
smack_inode_getsecctx---of 1
smack_inode_getsecid---of 1
smack_inode_getsecurity---of 11
smack_inode_getxattr---of 1
smack_inode_init_security24%of 25
smack_inode_link50%of 4
smack_inode_listsecurity---of 3
smack_inode_notifysecctx---of 1
smack_inode_permission72%of 7
smack_inode_post_setxattr---of 9
smack_inode_remove_acl---of 1
smack_inode_removexattr---of 17
smack_inode_rename50%of 4
smack_inode_rmdir67%of 3
smack_inode_set_acl---of 1
smack_inode_setattr67%of 3
smack_inode_setsecctx---of 1
smack_inode_setsecurity---of 14
smack_inode_setxattr---of 19
smack_inode_unlink67%of 3
smack_ipc_alloc_security---of 1
smack_ipc_getsecid---of 1
smack_ipc_permission---of 1
smack_ipv6host_label---of 20
smack_ismaclabel---of 1
smack_kernel_act_as---of 1
smack_kernel_create_files_as---of 1
smack_key_alloc---of 1
smack_key_free---of 1
smack_key_getsecurity---of 4
smack_key_permission---of 18
smack_mmap_file12%of 25
smack_msg_msg_alloc_security---of 1
smack_msg_queue_associate---of 1
smack_msg_queue_msgctl---of 9
smack_msg_queue_msgrcv---of 1
smack_msg_queue_msgsnd---of 1
smack_netlbl_add---of 4
smack_post_notification---of 3
smack_ptrace_access_check---of 1
smack_ptrace_traceme---of 1
smack_sb_alloc_security100%of 1
smack_sb_eat_lsm_opts16%of 33
smack_sb_statfs100%of 1
smack_secctx_to_secid---of 3
smack_secid_to_secctx---of 3
smack_sem_associate---of 1
smack_sem_semctl---of 16
smack_sem_semop---of 1
smack_set_mnt_opts8%of 26
smack_setprocattr---of 3
smack_setselfattr---of 1
smack_shm_associate---of 1
smack_shm_shmat---of 1
smack_shm_shmctl---of 11
smack_sk_alloc_security---of 6
smack_sk_clone_security---of 1
smack_sk_free_security---of 1
smack_sock_graft---of 5
smack_socket_connect---of 12
smack_socket_getpeersec_dgram---of 24
smack_socket_getpeersec_stream---of 12
smack_socket_post_create---of 6
smack_socket_sendmsg---of 10
smack_socket_sock_rcv_skb---of 55
smack_socket_socketpair---of 1
smack_syslog---of 3
smack_task_getioprio---of 1
smack_task_getpgid---of 1
smack_task_getscheduler---of 1
smack_task_getsecid_obj---of 1
smack_task_getsid---of 1
smack_task_kill---of 4
smack_task_movememory---of 1
smack_task_setioprio---of 1
smack_task_setnice---of 1
smack_task_setpgid---of 1
smack_task_setscheduler---of 1
smack_task_to_inode100%of 1
smack_unix_may_send---of 3
smack_unix_stream_connect---of 5
smack_uring_cmd---of 3
smack_uring_override_creds---of 3
smack_uring_sqpoll---of 1
smack_watch_key---of 6
smk_fetch34%of 6
smk_ipv4_check---of 18
smk_of_task_struct_obj25%of 16
smk_ptrace_rule_check---of 30
-----------
SUMMARY31%of 208

__kobject_del---of 9
dynamic_kobj_release---of 3
kobj_attr_show---of 3
kobj_attr_store---of 3
kobj_child_ns_ops---of 5
kobj_kset_leave---of 4
kobj_ns_current_may_mount---of 4
kobj_ns_drop---of 5
kobj_ns_grab_current---of 4
kobj_ns_initial---of 4
kobj_ns_netlink---of 4
kobj_ns_ops---of 5
kobj_ns_type_register---of 4
kobj_ns_type_registered---of 3
kobject_add---of 5
kobject_add_internal---of 37
kobject_create_and_add---of 6
kobject_del---of 3
kobject_get---of 7
kobject_get_ownership---of 3
kobject_get_path---of 11
kobject_get_unless_zero38%of 8
kobject_init---of 5
kobject_init_and_add---of 3
kobject_move---of 15
kobject_namespace---of 7
kobject_put13%of 24
kobject_rename---of 14
kobject_set_name---of 1
kobject_set_name_vargs---of 6
kset_create_and_add---of 5
kset_find_obj---of 7
kset_get_ownership---of 4
kset_init---of 3
kset_register---of 5
kset_release---of 3
kset_unregister---of 4
-----------
SUMMARY19%of 32

-----------
SUMMARY---of 0

ext4_block_to_path27%of 15
ext4_clear_blocks---of 12
ext4_find_shared---of 18
ext4_free_branches---of 24
ext4_free_data---of 17
ext4_get_branch20%of 15
ext4_ind_map_blocks19%of 113
ext4_ind_remove_space---of 77
ext4_ind_trans_blocks---of 1
ext4_ind_truncate---of 35
ext4_ind_truncate_ensure_credits---of 29
ext4_splice_branch25%of 16
ext4_update_inode_fsync_trans34%of 6
-----------
SUMMARY21%of 165

count_shadow_nodes---of 8
folio_memcg25%of 12
scan_shadow_nodes---of 1
shadow_lru_isolate---of 12
workingset_activation35%of 46
workingset_age_nonresident---of 10
workingset_eviction---of 53
workingset_refault---of 69
workingset_test_recent---of 63
workingset_update_node50%of 10
-----------
SUMMARY36%of 68

char2uni100%of 1
uni2char---of 5
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__btrfs_add_delayed_item---of 11
__btrfs_commit_inode_delayed_items---of 67
__btrfs_kill_delayed_node---of 15
__btrfs_release_delayed_node25%of 28
__btrfs_run_delayed_items12%of 17
__btrfs_update_delayed_inode---of 19
btrfs_assert_delayed_root_empty29%of 7
btrfs_async_run_delayed_root---of 14
btrfs_balance_delayed_items16%of 13
btrfs_commit_inode_delayed_inode---of 7
btrfs_commit_inode_delayed_items---of 5
btrfs_delayed_delete_inode_ref---of 5
btrfs_delayed_inode_exit---of 1
btrfs_delayed_item_release_metadata---of 3
btrfs_delayed_update_inode31%of 13
btrfs_delete_delayed_dir_index---of 23
btrfs_destroy_delayed_inodes---of 8
btrfs_fill_inode50%of 4
btrfs_get_delayed_node15%of 20
btrfs_get_or_create_delayed_node30%of 10
btrfs_init_delayed_root---of 1
btrfs_inode_delayed_dir_index_count---of 4
btrfs_insert_delayed_dir_index---of 11
btrfs_kill_all_delayed_nodes---of 22
btrfs_kill_delayed_inode_items---of 3
btrfs_log_get_delayed_items---of 25
btrfs_log_put_delayed_items---of 21
btrfs_next_delayed_node---of 8
btrfs_readdir_delayed_dir_index36%of 14
btrfs_readdir_get_delayed_items32%of 22
btrfs_readdir_put_delayed_items18%of 17
btrfs_release_delayed_inode---of 7
btrfs_release_delayed_iref---of 6
btrfs_release_delayed_item---of 14
btrfs_release_dir_index_item_space---of 4
btrfs_remove_delayed_node---of 3
btrfs_run_delayed_items100%of 1
btrfs_run_delayed_items_nr---of 1
btrfs_should_delete_dir_index40%of 5
fill_stack_inode_item100%of 1
trace_btrfs_space_reservation27%of 15
-----------
SUMMARY26%of 187

-----------
SUMMARY---of 0

__ia32_compat_sys_old_select---of 3
__ia32_compat_sys_ppoll_time32---of 1
__ia32_compat_sys_ppoll_time64---of 1
__ia32_compat_sys_pselect6_time32---of 5
__ia32_compat_sys_pselect6_time64---of 5
__ia32_compat_sys_select---of 1
__ia32_sys_poll---of 1
__ia32_sys_ppoll---of 1
__ia32_sys_pselect6---of 1
__ia32_sys_select---of 1
__pollwait28%of 11
__se_compat_sys_ppoll_time32---of 8
__se_compat_sys_ppoll_time64---of 8
__se_sys_poll43%of 7
__se_sys_ppoll38%of 8
__se_sys_pselect631%of 13
__se_sys_select---of 6
__x64_compat_sys_old_select---of 3
__x64_compat_sys_ppoll_time32---of 1
__x64_compat_sys_ppoll_time64---of 1
__x64_compat_sys_pselect6_time32---of 6
__x64_compat_sys_pselect6_time64---of 6
__x64_compat_sys_select---of 1
__x64_sys_poll100%of 1
__x64_sys_ppoll100%of 1
__x64_sys_pselect6100%of 1
__x64_sys_select---of 1
compat_core_sys_select---of 46
core_sys_select26%of 43
do_compat_pselect---of 12
do_compat_select---of 6
do_restart_poll---of 5
do_select34%of 72
do_sys_poll35%of 47
poll_freewait60%of 10
poll_initwait---of 1
poll_select_finish19%of 22
poll_select_set_timeout50%of 4
pollwake---of 4
select_estimate_accuracy40%of 5
set_fd_set75%of 4
signal_pending---of 3
-----------
SUMMARY34%of 249

-----------
SUMMARY---of 0

flush_mdb---of 1
hfs_alloc_inode100%of 1
hfs_fill_super---of 51
hfs_free_inode---of 1
hfs_init_once---of 1
hfs_mark_mdb_dirty---of 4
hfs_mount---of 1
hfs_put_super---of 1
hfs_remount---of 5
hfs_show_options---of 19
hfs_statfs---of 1
hfs_sync_fs---of 1
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

btrfs_get_1637%of 11
btrfs_get_3237%of 11
btrfs_get_6437%of 11
btrfs_get_837%of 11
btrfs_get_token_16---of 20
btrfs_get_token_3215%of 20
btrfs_get_token_64---of 20
btrfs_get_token_8---of 20
btrfs_init_map_token100%of 1
btrfs_node_key100%of 1
btrfs_set_16---of 11
btrfs_set_3237%of 11
btrfs_set_6437%of 11
btrfs_set_837%of 11
btrfs_set_token_16---of 20
btrfs_set_token_3215%of 20
btrfs_set_token_64---of 20
btrfs_set_token_8---of 20
get_eb_offset_in_folio30%of 10
-----------
SUMMARY31%of 129

__bpf_trace_notifier_info---of 1
__probestub_notifier_register---of 1
__probestub_notifier_run---of 1
__probestub_notifier_unregister---of 1
__traceiter_notifier_register---of 4
__traceiter_notifier_run---of 4
__traceiter_notifier_unregister---of 4
atomic_notifier_call_chain28%of 11
atomic_notifier_call_chain_is_empty---of 1
atomic_notifier_chain_register---of 1
atomic_notifier_chain_register_unique_prio---of 1
atomic_notifier_chain_unregister---of 1
blocking_notifier_call_chain---of 3
blocking_notifier_call_chain_robust---of 4
blocking_notifier_chain_register---of 3
blocking_notifier_chain_register_unique_prio---of 3
blocking_notifier_chain_unregister---of 3
notifier_call_chain40%of 23
notifier_chain_register---of 21
notifier_chain_unregister---of 20
notify_die34%of 6
perf_trace_notifier_info---of 8
raw_notifier_call_chain---of 1
raw_notifier_call_chain_robust---of 3
raw_notifier_chain_register---of 1
raw_notifier_chain_unregister---of 1
register_die_notifier---of 1
srcu_init_notifier_head---of 3
srcu_notifier_call_chain67%of 3
srcu_notifier_chain_register---of 3
srcu_notifier_chain_unregister---of 3
trace_event_raw_event_notifier_info---of 7
trace_raw_output_notifier_info---of 3
unregister_die_notifier---of 1
-----------
SUMMARY38%of 43

netconsole_netdev_event---of 25
write_ext_msg---of 28
write_msg24%of 13
-----------
SUMMARY24%of 13

-----------
SUMMARY---of 0

__cgroup_account_cputime60%of 5
__cgroup_account_cputime_field---of 12
__cgroup_rstat_lock---of 32
__cgroup_rstat_unlock---of 15
bpf_rstat_flush---of 1
cgroup_base_stat_cputime_show---of 6
cgroup_rstat_exit---of 7
cgroup_rstat_flush---of 1
cgroup_rstat_flush_hold---of 1
cgroup_rstat_flush_locked---of 99
cgroup_rstat_flush_release---of 1
cgroup_rstat_init---of 8
cgroup_rstat_updated24%of 60
-----------
SUMMARY27%of 65

-----------
SUMMARY---of 0

__copy_overflow---of 1
copy_from_kernel_nofault59%of 17
copy_from_user_nofault---of 4
copy_to_kernel_nofault---of 16
copy_to_user_nofault---of 4
strncpy_from_kernel_nofault---of 6
strncpy_from_user_nofault---of 4
strnlen_user_nofault---of 1
-----------
SUMMARY59%of 17

udf_get_pblock40%of 5
udf_get_pblock_meta25---of 12
udf_get_pblock_spar15---of 12
udf_get_pblock_virt15---of 15
udf_get_pblock_virt20---of 1
udf_relocate_blocks---of 30
udf_try_read_meta---of 8
-----------
SUMMARY40%of 5

load_block_bitmap30%of 17
udf_free_blocks10%of 71
udf_new_block---of 65
udf_prealloc_blocks---of 41
-----------
SUMMARY14%of 88

-----------
SUMMARY---of 0

__udf_add_aext---of 17
__udf_get_block---of 8
__udf_iget18%of 87
folio_size---of 10
inode_bmap---of 19
udf_add_aext---of 8
udf_adinicb_read_folio---of 8
udf_adinicb_writepage---of 10
udf_bmap---of 3
udf_bread---of 11
udf_current_aext---of 16
udf_delete_aext---of 22
udf_direct_IO---of 5
udf_do_extend_file---of 45
udf_evict_inode39%of 13
udf_expand_file_adinicb---of 27
udf_get_block---of 1
udf_get_block_wb---of 1
udf_map_block---of 160
udf_next_aext---of 8
udf_read_folio---of 3
udf_readahead---of 3
udf_setsize11%of 56
udf_setup_indirect_aext---of 15
udf_update_extra_perms---of 3
udf_update_inode30%of 55
udf_write_aext---of 14
udf_write_begin---of 13
udf_write_end---of 13
udf_write_failed---of 6
udf_write_inode---of 1
udf_writepages---of 3
-----------
SUMMARY20%of 211

-----------
SUMMARY---of 0

__ia32_sys_capget---of 1
__ia32_sys_capset---of 1
__se_sys_capget---of 21
__se_sys_capset---of 13
__x64_sys_capget---of 1
__x64_sys_capset---of 1
cap_validate_magic---of 8
capable50%of 4
capable_wrt_inode_uidgid40%of 5
file_ns_capable---of 3
has_capability---of 1
has_capability_noaudit---of 1
has_ns_capability---of 16
has_ns_capability_noaudit---of 16
ns_capable50%of 4
ns_capable_noaudit50%of 4
ns_capable_setid---of 4
privileged_wrt_inode_uidgid---of 3
ptracer_capable---of 18
-----------
SUMMARY48%of 17

-----------
SUMMARY---of 0

__fprop_add_percpu---of 1
__fprop_add_percpu_max40%of 5
fprop_fraction_percpu---of 12
fprop_global_destroy---of 1
fprop_global_init---of 3
fprop_local_destroy_percpu---of 1
fprop_local_init_percpu---of 3
fprop_new_period---of 15
fprop_reflect_period_percpu25%of 8
-----------
SUMMARY31%of 13

crc32c67%of 3
-----------
SUMMARY67%of 3

-----------
SUMMARY---of 0

__cyc2ns_read---of 3
__set_cyc2ns_scale---of 11
calibrate_delay_is_known---of 16
check_tsc_unstable---of 1
convert_art_ns_to_tsc---of 1
convert_art_to_tsc---of 1
cyc2ns_read_begin---of 3
cyc2ns_read_end---of 3
mark_tsc_unstable---of 4
native_calibrate_cpu---of 3
native_calibrate_cpu_early---of 33
native_calibrate_tsc---of 16
native_read_msr---of 3
native_sched_clock_from_tsc---of 6
pit_hpet_ptimer_calibrate_cpu---of 35
read_tsc100%of 1
recalibrate_cpu_khz---of 1
sched_clock67%of 3
set_cyc2ns_scale---of 11
time_cpufreq_notifier---of 15
tsc_clocksource_watchdog_disabled---of 4
tsc_cs_enable---of 1
tsc_cs_mark_unstable---of 4
tsc_cs_tick_stable---of 4
tsc_read_refs---of 20
tsc_refine_calibration_work---of 18
tsc_restore_sched_clock_state---of 16
tsc_resume---of 1
tsc_save_sched_clock_state---of 4
unsynchronized_tsc---of 9
using_native_sched_clock---of 1
-----------
SUMMARY75%of 4

-----------
SUMMARY---of 0

__bpf_trace_mm_collapse_huge_page---of 1
__bpf_trace_mm_collapse_huge_page_isolate---of 1
__bpf_trace_mm_collapse_huge_page_swapin---of 1
__bpf_trace_mm_khugepaged_collapse_file---of 1
__bpf_trace_mm_khugepaged_scan_file---of 1
__bpf_trace_mm_khugepaged_scan_pmd---of 1
__collapse_huge_page_copy_failed---of 1
__collapse_huge_page_isolate---of 109
__khugepaged_enter---of 11
__khugepaged_exit---of 19
__probestub_mm_collapse_huge_page---of 1
__probestub_mm_collapse_huge_page_isolate---of 1
__probestub_mm_collapse_huge_page_swapin---of 1
__probestub_mm_khugepaged_collapse_file---of 1
__probestub_mm_khugepaged_scan_file---of 1
__probestub_mm_khugepaged_scan_pmd---of 1
__traceiter_mm_collapse_huge_page---of 4
__traceiter_mm_collapse_huge_page_isolate---of 4
__traceiter_mm_collapse_huge_page_swapin---of 4
__traceiter_mm_khugepaged_collapse_file---of 4
__traceiter_mm_khugepaged_scan_file---of 4
__traceiter_mm_khugepaged_scan_pmd---of 4
add_mm_counter---of 1
alloc_charge_folio---of 34
alloc_sleep_millisecs_show---of 1
alloc_sleep_millisecs_store---of 3
collapse_pte_mapped_thp---of 58
collect_mm_slot---of 12
current_is_khugepaged---of 1
defrag_show---of 1
defrag_store---of 1
filemap_nr_thps_dec---of 6
find_pmd_or_thp_or_none---of 7
folio_large_mapcount---of 9
folio_likely_mapped_shared---of 28
folio_mapcount---of 9
folio_order---of 9
folio_put---of 4
full_scans_show---of 1
hpage_collapse_scan_file---of 345
hpage_collapse_scan_pmd---of 282
hugepage_madvise---of 4
hugepage_vma_revalidate---of 21
is_refcount_suitable---of 23
khugepaged---of 104
khugepaged_enter_vma40%of 15
khugepaged_min_free_kbytes_update---of 7
madvise_collapse---of 67
max_ptes_none_show---of 1
max_ptes_none_store---of 4
max_ptes_shared_show---of 1
max_ptes_shared_store---of 4
max_ptes_swap_show---of 1
max_ptes_swap_store---of 4
mm_counter_file---of 7
mmu_notifier_invalidate_range_end---of 5
mmu_notifier_invalidate_range_start---of 3
pages_collapsed_show---of 1
pages_to_scan_show---of 1
pages_to_scan_store---of 4
perf_trace_mm_collapse_huge_page---of 8
perf_trace_mm_collapse_huge_page_isolate---of 8
perf_trace_mm_collapse_huge_page_swapin---of 8
perf_trace_mm_khugepaged_collapse_file---of 8
perf_trace_mm_khugepaged_scan_file---of 8
perf_trace_mm_khugepaged_scan_pmd---of 8
pmd_lock---of 1
pte_unmap---of 6
ptep_clear---of 3
release_pte_folio---of 17
release_pte_pages---of 25
scan_sleep_millisecs_show---of 1
scan_sleep_millisecs_store---of 3
set_huge_pmd---of 26
set_recommended_min_free_kbytes---of 13
start_stop_khugepaged---of 12
trace_event_raw_event_mm_collapse_huge_page---of 7
trace_event_raw_event_mm_collapse_huge_page_isolate---of 7
trace_event_raw_event_mm_collapse_huge_page_swapin---of 7
trace_event_raw_event_mm_khugepaged_collapse_file---of 7
trace_event_raw_event_mm_khugepaged_scan_file---of 7
trace_event_raw_event_mm_khugepaged_scan_pmd---of 7
trace_mm_collapse_huge_page_isolate---of 15
trace_raw_output_mm_collapse_huge_page---of 3
trace_raw_output_mm_collapse_huge_page_isolate---of 3
trace_raw_output_mm_collapse_huge_page_swapin---of 3
trace_raw_output_mm_khugepaged_collapse_file---of 3
trace_raw_output_mm_khugepaged_scan_file---of 3
trace_raw_output_mm_khugepaged_scan_pmd---of 3
xas_next---of 12
xas_next_entry---of 17
-----------
SUMMARY40%of 15

-----------
SUMMARY---of 0

exportfs_decode_fh100%of 1
exportfs_decode_fh_raw20%of 21
exportfs_encode_fh37%of 11
exportfs_encode_inode_fh---of 8
exportfs_get_name17%of 12
filldir_one38%of 8
find_acceptable_alias---of 10
reconnect_path28%of 36
-----------
SUMMARY27%of 89

ext4_sync_file22%of 46
-----------
SUMMARY22%of 46

-----------
SUMMARY---of 0

__ext4_ext_check20%of 36
__ext4_ext_dirty58%of 7
__read_extent_tree_block11%of 29
ext4_alloc_file_blocks---of 32
ext4_clu_mapped25%of 24
ext4_convert_unwritten_extents43%of 19
ext4_convert_unwritten_io_end_vec---of 9
ext4_datasem_ensure_credits34%of 6
ext4_es_is_delayed---of 1
ext4_ext_calc_credits_for_single_extent---of 6
ext4_ext_check_inode---of 1
ext4_ext_check_overlap65%of 20
ext4_ext_clear_bb---of 26
ext4_ext_correct_indexes29%of 14
ext4_ext_find_goal60%of 5
ext4_ext_get_access---of 4
ext4_ext_index_trans_blocks50%of 4
ext4_ext_init---of 1
ext4_ext_insert_extent23%of 140
ext4_ext_insert_index---of 13
ext4_ext_map_blocks22%of 251
ext4_ext_next_allocated_block---of 12
ext4_ext_precache---of 17
ext4_ext_release---of 1
ext4_ext_remove_space32%of 209
ext4_ext_replay_set_iblocks---of 57
ext4_ext_replay_shrink_inode---of 15
ext4_ext_replay_update_ex---of 22
ext4_ext_rm_idx---of 29
ext4_ext_search_right44%of 23
ext4_ext_shift_extents26%of 66
ext4_ext_tree_init100%of 1
ext4_ext_truncate43%of 7
ext4_ext_try_to_merge39%of 13
ext4_ext_try_to_merge_right50%of 18
ext4_ext_zeroout---of 1
ext4_extent_block_csum_set34%of 9
ext4_fallocate8%of 96
ext4_fiemap---of 10
ext4_find_extent58%of 38
ext4_free_ext_path---of 6
ext4_get_es_cache---of 28
ext4_iomap_xattr_begin---of 9
ext4_rereserve_cluster---of 5
ext4_split_extent45%of 9
ext4_split_extent_at24%of 30
ext4_swap_extents---of 67
ext4_update_inode_fsync_trans43%of 7
ext4_update_inode_size---of 9
ext4_zero_range---of 57
ext4_zeroout_es---of 3
get_implied_cluster_alloc---of 21
trace_ext4_ext_convert_to_initialized_fastpath---of 15
trace_ext4_fallocate_enter---of 15
trace_ext4_fallocate_exit---of 15
trace_ext4_get_implied_cluster_alloc_exit---of 15
-----------
SUMMARY28%of 1081

__blkcg_rstat_flush---of 24
__blkg_prfill_u64---of 4
__blkg_release---of 11
bio_associate_blkg34%of 15
bio_associate_blkg_from_css22%of 74
bio_blkcg_css50%of 4
bio_clone_blkg_association50%of 4
blk_cgroup_bio_start17%of 12
blk_cgroup_congested34%of 15
blkcg_activate_policy---of 42
blkcg_add_delay---of 3
blkcg_css19%of 11
blkcg_css_alloc---of 41
blkcg_css_free---of 15
blkcg_css_offline---of 1
blkcg_css_online---of 5
blkcg_deactivate_policy---of 23
blkcg_exit---of 3
blkcg_exit_disk---of 1
blkcg_get_cgwb_list---of 1
blkcg_init_disk---of 8
blkcg_maybe_throttle_current4%of 51
blkcg_pin_online---of 4
blkcg_policy_register---of 35
blkcg_policy_unregister---of 21
blkcg_print_blkgs---of 20
blkcg_print_stat---of 48
blkcg_punt_bio_submit---of 3
blkcg_reset_stats---of 26
blkcg_rstat_flush---of 3
blkcg_scale_delay---of 8
blkcg_schedule_throttle---of 10
blkcg_unpin_online---of 11
blkg_alloc41%of 22
blkg_async_bio_workfn---of 7
blkg_conf_exit---of 5
blkg_conf_init---of 1
blkg_conf_open_bdev---of 8
blkg_conf_prep---of 59
blkg_create27%of 89
blkg_destroy---of 37
blkg_destroy_all---of 20
blkg_dev_name---of 3
blkg_free_workfn---of 17
blkg_get31%of 13
blkg_init_queue---of 1
blkg_release---of 1
blkg_tryget25%of 16
percpu_ref_put29%of 14
radix_tree_preload_end---of 10
-----------
SUMMARY24%of 340

__bpf_trace_filelock_lease---of 1
__bpf_trace_filelock_lock---of 1
__bpf_trace_generic_add_lease---of 1
__bpf_trace_leases_conflict---of 1
__bpf_trace_locks_get_lock_context---of 1
__break_lease6%of 96
__ia32_sys_flock---of 1
__locks_delete_block17%of 12
__locks_insert_block---of 12
__locks_wake_up_blocks---of 16
__probestub_break_lease_block---of 1
__probestub_break_lease_noblock---of 1
__probestub_break_lease_unblock---of 1
__probestub_fcntl_setlk---of 1
__probestub_flock_lock_inode---of 1
__probestub_generic_add_lease---of 1
__probestub_generic_delete_lease---of 1
__probestub_leases_conflict---of 1
__probestub_locks_get_lock_context---of 1
__probestub_locks_remove_posix---of 1
__probestub_posix_lock_inode---of 1
__probestub_time_out_leases---of 1
__se_sys_flock34%of 18
__traceiter_break_lease_block---of 4
__traceiter_break_lease_noblock---of 4
__traceiter_break_lease_unblock---of 4
__traceiter_fcntl_setlk---of 4
__traceiter_flock_lock_inode---of 4
__traceiter_generic_add_lease---of 4
__traceiter_generic_delete_lease---of 4
__traceiter_leases_conflict---of 4
__traceiter_locks_get_lock_context---of 4
__traceiter_locks_remove_posix---of 4
__traceiter_posix_lock_inode---of 4
__traceiter_time_out_leases---of 4
__x64_sys_flock100%of 1
do_lock_file_wait---of 14
fcntl_getlease---of 17
fcntl_getlk---of 25
fcntl_setlease30%of 10
fcntl_setlk---of 45
files_lookup_fd_locked---of 6
flock_lock_inode23%of 62
flock_locks_conflict---of 4
generic_setlease17%of 77
kernel_setlease---of 5
lease_break_callback---of 1
lease_get_mtime---of 8
lease_modify---of 13
lease_register_notifier---of 1
lease_setup67%of 3
lease_unregister_notifier---of 1
leases_conflict28%of 22
lock_get_status---of 19
locks_alloc_lease---of 3
locks_alloc_lock---of 3
locks_check_ctx_lists---of 5
locks_copy_conflock---of 4
locks_copy_lock---of 9
locks_delete_block---of 1
locks_dump_ctx_list---of 4
locks_free_lease---of 1
locks_free_lock---of 1
locks_free_lock_context67%of 3
locks_get_lock_context25%of 20
locks_init_lease---of 1
locks_init_lock---of 1
locks_insert_lock_ctx60%of 10
locks_lock_inode_wait12%of 18
locks_next---of 1
locks_owner_has_blockers---of 7
locks_release_private25%of 12
locks_remove_file6%of 36
locks_remove_posix10%of 22
locks_show---of 16
locks_start---of 1
locks_stop---of 1
locks_translate_pid---of 13
locks_unlink_lock_ctx---of 14
percpu_down_read40%of 10
percpu_up_read40%of 10
perf_trace_filelock_lease---of 9
perf_trace_filelock_lock---of 9
perf_trace_generic_add_lease---of 8
perf_trace_leases_conflict---of 8
perf_trace_locks_get_lock_context---of 8
posix_lock_file---of 1
posix_lock_inode---of 137
posix_locks_conflict---of 6
posix_test_lock---of 25
show_fd_locks---of 20
time_out_leases33%of 28
trace_event_raw_event_filelock_lease---of 8
trace_event_raw_event_filelock_lock---of 8
trace_event_raw_event_generic_add_lease---of 7
trace_event_raw_event_leases_conflict---of 7
trace_event_raw_event_locks_get_lock_context---of 7
trace_generic_delete_lease---of 15
trace_raw_output_filelock_lease---of 3
trace_raw_output_filelock_lock---of 3
trace_raw_output_generic_add_lease---of 3
trace_raw_output_leases_conflict---of 3
trace_raw_output_locks_get_lock_context---of 3
vfs_cancel_lock---of 5
vfs_inode_has_locks---of 4
vfs_lock_file---of 5
vfs_setlease40%of 10
vfs_test_lock---of 5
-----------
SUMMARY20%of 480

-----------
SUMMARY---of 0

__d_path---of 6
__dentry_path27%of 30
__ia32_sys_getcwd---of 1
__se_sys_getcwd22%of 32
__x64_sys_getcwd100%of 1
d_absolute_path67%of 6
d_path30%of 27
dentry_path---of 6
dentry_path_raw67%of 3
dynamic_dname---of 3
prepend---of 6
prepend_path34%of 53
seqcount_lockdep_reader_access58%of 7
simple_dname---of 12
-----------
SUMMARY33%of 159

-----------
SUMMARY---of 0

__bpf_trace_task_newtask---of 1
__bpf_trace_task_rename---of 1
__cleanup_sighand---of 4
__delayed_free_task---of 1
__do_sys_vfork---of 1
__ia32_sys_clone---of 1
__ia32_sys_clone3---of 1
__ia32_sys_set_tid_address---of 1
__ia32_sys_unshare---of 1
__mas_set_range---of 8
__mmdrop---of 22
__mmput---of 15
__pidfd_prepare---of 4
__probestub_task_newtask---of 1
__probestub_task_rename---of 1
__put_task_struct---of 14
__put_task_struct_rcu_cb---of 1
__se_sys_clone3---of 11
__traceiter_task_newtask---of 4
__traceiter_task_rename---of 4
__vm_area_free---of 5
__x64_sys_clone---of 1
__x64_sys_clone3---of 1
__x64_sys_fork---of 1
__x64_sys_set_tid_address---of 1
__x64_sys_unshare---of 1
account_kernel_stack---of 15
copy_clone_args_from_user---of 19
copy_files---of 6
copy_fs---of 4
copy_mm---of 101
copy_oom_score_adj---of 3
copy_process---of 147
copy_seccomp---of 6
copy_sighand---of 7
copy_signal---of 4
create_io_thread---of 1
dup_task_struct---of 20
exec_mm_release---of 1
exit_mm_release---of 1
exit_task_stack_account---of 17
fatal_signal_pending---of 3
free_signal_struct---of 6
free_task---of 7
free_vm_stack_cache---of 7
get_mm_exe_file---of 11
get_task_exe_file---of 4
get_task_mm---of 4
idle_dummy---of 1
kernel_clone---of 38
kernel_thread---of 1
ksys_unshare---of 48
lockdep_tasklist_lock_is_held100%of 1
memcg_charge_kernel_stack---of 30
mm_access---of 8
mm_alloc---of 3
mm_init---of 17
mm_release---of 15
mmdrop_async_fn---of 1
mmput---of 3
mmput_async---of 3
mmput_async_fn---of 1
nr_processes---of 5
perf_trace_task_newtask---of 8
perf_trace_task_rename---of 8
pidfd_prepare---of 6
ptrace_event_pid---of 21
ptrace_init_task---of 5
put_cred---of 3
put_task_stack43%of 7
rcu_copy_process---of 1
rcu_read_unlock---of 6
refcount_inc---of 4
replace_mm_exe_file---of 34
set_mm_exe_file---of 10
set_task_stack_end_magic---of 1
sighand_ctor---of 1
syscall_tracepoint_update---of 3
sysctl_max_threads---of 3
thread_stack_free_rcu---of 4
trace_event_raw_event_task_newtask---of 7
trace_event_raw_event_task_rename---of 7
trace_raw_output_task_newtask---of 3
trace_raw_output_task_rename---of 3
trace_task_newtask---of 15
tty_kref_get---of 5
unshare_fd---of 5
unshare_files---of 5
user_mode_thread---of 1
vm_area_alloc50%of 4
vm_area_dup25%of 8
vm_area_free100%of 1
vm_area_free_rcu_cb---of 3
walk_process_tree---of 8
-----------
SUMMARY43%of 21

-----------
SUMMARY---of 0

__cpuset_memory_pressure_bump---of 27
compute_partition_effective_cpumask---of 30
cpuset_attach---of 22
cpuset_attach_task---of 8
cpuset_bind---of 4
cpuset_can_attach---of 33
cpuset_can_fork---of 32
cpuset_cancel_attach---of 6
cpuset_cancel_fork---of 23
cpuset_change_task_nodemask---of 6
cpuset_common_seq_show---of 10
cpuset_cpu_is_isolated67%of 3
cpuset_cpus_allowed---of 25
cpuset_cpus_allowed_fallback---of 25
cpuset_css_alloc---of 5
cpuset_css_free---of 1
cpuset_css_offline---of 8
cpuset_css_online---of 38
cpuset_force_rebuild---of 1
cpuset_fork---of 36
cpuset_handle_hotplug---of 106
cpuset_lock---of 1
cpuset_mem_spread_node---of 17
cpuset_mems_allowed---of 23
cpuset_mems_allowed_intersects---of 1
cpuset_migrate_mm---of 4
cpuset_migrate_mm_workfn---of 1
cpuset_migrate_tasks_workfn---of 5
cpuset_node_allowed7%of 29
cpuset_nodemask_valid_mems_allowed---of 1
cpuset_post_attach---of 1
cpuset_print_current_mems_allowed---of 20
cpuset_read_s64---of 3
cpuset_read_u64---of 17
cpuset_slab_spread_node---of 17
cpuset_task_status_allowed---of 1
cpuset_track_online_nodes---of 1
cpuset_unlock---of 1
cpuset_update_active_cpus---of 1
cpuset_update_task_spread_flags---of 6
cpuset_write_resmask---of 150
cpuset_write_s64---of 11
cpuset_write_u64---of 14
css_get---of 14
css_put---of 15
css_tryget_online---of 20
current_cpuset_is_being_rebound---of 20
dec_dl_tasks_cs---of 10
guarantee_online_cpus---of 23
inc_dl_tasks_cs---of 10
is_cpuset_subset---of 5
partition_is_populated---of 29
partition_xcpus_add---of 10
partition_xcpus_del---of 10
proc_cpuset_show---of 41
rcu_read_unlock---of 6
rebuild_sched_domains---of 1
rebuild_sched_domains_locked---of 148
remote_cpus_update---of 15
remote_partition_check---of 12
remote_partition_disable---of 12
reset_partition_data---of 10
sched_partition_show---of 10
sched_partition_write---of 9
update_cpumasks_hier---of 82
update_domain_attr_tree---of 22
update_flag---of 13
update_parent_effective_cpumask---of 94
update_partition_sd_lb---of 7
update_prstate---of 41
update_sibling_cpumasks---of 36
update_tasks_cpumask---of 7
update_tasks_nodemask---of 10
validate_change---of 63
-----------
SUMMARY13%of 32

udf_disk_stamp_to_time50%of 6
udf_time_to_disk_stamp100%of 1
-----------
SUMMARY58%of 7

-----------
SUMMARY---of 0

___d_drop40%of 10
__d_add69%of 22
__d_alloc55%of 11
__d_drop---of 3
__d_free---of 1
__d_free_external---of 1
__d_instantiate70%of 20
__d_lookup37%of 33
__d_lookup_rcu34%of 21
__d_lookup_rcu_op_compare22%of 19
__d_lookup_unhash40%of 10
__d_lookup_unhash_wake---of 1
__d_move53%of 65
__d_obtain_alias27%of 30
__d_rehash50%of 10
__d_unalias25%of 8
__dentry_kill66%of 29
d_add67%of 3
d_add_ci---of 15
d_alloc75%of 4
d_alloc_anon---of 1
d_alloc_cursor---of 4
d_alloc_name---of 4
d_alloc_parallel12%of 67
d_alloc_pseudo75%of 4
d_ancestor---of 4
d_delete50%of 4
d_drop---of 3
d_exact_alias---of 16
d_exchange---of 9
d_find_alias25%of 8
d_find_alias_rcu---of 8
d_find_any_alias---of 3
d_genocide---of 1
d_genocide_kill---of 6
d_hash_and_lookup---of 7
d_instantiate50%of 4
d_instantiate_new40%of 5
d_invalidate---of 8
d_lookup75%of 4
d_lru_add72%of 7
d_make_root40%of 5
d_mark_dontcache---of 4
d_mark_tmpfile---of 6
d_move100%of 1
d_obtain_alias100%of 1
d_obtain_root---of 1
d_prune_aliases---of 6
d_rehash---of 1
d_same_name---of 6
d_set_d_op75%of 20
d_set_mounted70%of 10
d_splice_alias53%of 17
d_tmpfile---of 4
d_walk17%of 48
dentry_free75%of 8
dentry_lru_isolate---of 11
dentry_lru_isolate_shrink---of 6
dentry_unlink_inode62%of 13
dget_parent15%of 27
do_one_tree---of 3
dput23%of 22
dput_to_list25%of 12
fast_dput59%of 17
find_submount---of 3
hlist_bl_lock29%of 7
hlist_bl_unlock---of 4
is_subdir36%of 17
lock_for_kill34%of 9
path_check_mount---of 5
path_has_submounts---of 1
proc_nr_dentry---of 13
prune_dcache_sb---of 1
rcu_read_unlock34%of 6
read_seqbegin50%of 10
read_word_at_a_time100%of 1
release_dentry_name_snapshot60%of 5
retain_dentry---of 13
select_collect25%of 8
select_collect2---of 11
shrink_dcache_for_umount---of 8
shrink_dcache_parent17%of 12
shrink_dcache_sb---of 15
shrink_dentry_list9%of 24
shrink_kill---of 22
start_dir_add34%of 9
take_dentry_name_snapshot100%of 3
to_shrink_list---of 13
umount_check---of 7
write_seqlock100%of 1
write_sequnlock100%of 1
-----------
SUMMARY40%of 712

__ia32_sys_fadvise64---of 6
__ia32_sys_fadvise64_64---of 6
__x64_sys_fadvise64---of 6
__x64_sys_fadvise64_64---of 6
generic_fadvise12%of 17
ksys_fadvise64_64---of 6
vfs_fadvise100%of 3
-----------
SUMMARY25%of 20

-----------
SUMMARY---of 0

setup---of 1
vfat_add_entry---of 137
vfat_cmp---of 10
vfat_cmpi46%of 11
vfat_create---of 6
vfat_fill_super---of 1
vfat_hash---of 4
vfat_hashi45%of 9
vfat_lookup27%of 15
vfat_mkdir---of 7
vfat_mount---of 1
vfat_rename2---of 108
vfat_revalidate---of 4
vfat_revalidate_ci40%of 5
vfat_rmdir---of 8
vfat_unlink---of 7
-----------
SUMMARY38%of 40

-----------
SUMMARY---of 0

__bitmap_and---of 9
__bitmap_andnot---of 9
__bitmap_clear72%of 7
__bitmap_complement---of 8
__bitmap_equal---of 8
__bitmap_intersects---of 8
__bitmap_or---of 8
__bitmap_or_equal---of 7
__bitmap_replace---of 7
__bitmap_set72%of 7
__bitmap_shift_left---of 14
__bitmap_shift_right---of 8
__bitmap_subset---of 8
__bitmap_weight---of 6
__bitmap_weight_and---of 6
__bitmap_weight_andnot---of 6
__bitmap_xor---of 8
bitmap_alloc---of 1
bitmap_alloc_node---of 1
bitmap_bitremap---of 16
bitmap_cut---of 17
bitmap_find_next_zero_area_off---of 4
bitmap_fold---of 4
bitmap_free---of 1
bitmap_from_arr32---of 8
bitmap_onto---of 5
bitmap_remap---of 17
bitmap_to_arr32---of 8
bitmap_zalloc---of 1
bitmap_zalloc_node---of 1
devm_bitmap_alloc---of 4
devm_bitmap_free---of 1
devm_bitmap_zalloc---of 4
-----------
SUMMARY72%of 14

-----------
SUMMARY---of 0

fscrypt_derive_dirhash_key---of 3
fscrypt_destroy_prepared_key---of 1
fscrypt_drop_inode---of 5
fscrypt_free_inode---of 4
fscrypt_get_encryption_info---of 12
fscrypt_hash_inode_number---of 5
fscrypt_prepare_key---of 10
fscrypt_prepare_new_inode19%of 11
fscrypt_put_encryption_info100%of 1
fscrypt_set_per_file_enc_key---of 1
fscrypt_setup_encryption_info---of 63
put_crypt_info25%of 8
refcount_inc---of 4
setup_per_mode_enc_key---of 9
-----------
SUMMARY25%of 20

-----------
SUMMARY---of 0

__ia32_compat_sys_ptrace---of 1
__ia32_sys_ptrace---of 1
__ptrace_detach---of 9
__ptrace_link---of 6
__ptrace_may_access5%of 40
__ptrace_unlink---of 14
__se_compat_sys_ptrace---of 16
__se_sys_ptrace---of 16
__x64_compat_sys_ptrace---of 1
__x64_sys_ptrace---of 1
compat_ptrace_request---of 34
exit_ptrace---of 9
generic_ptrace_peekdata---of 9
generic_ptrace_pokedata---of 7
ptrace_access_vm---of 7
ptrace_attach---of 35
ptrace_check_attach---of 14
ptrace_may_access100%of 1
ptrace_readdata---of 15
ptrace_regset---of 10
ptrace_request---of 95
ptrace_setsiginfo---of 4
ptrace_traceme---of 9
ptrace_writedata---of 14
-----------
SUMMARY8%of 41

-----------
SUMMARY---of 0

add_block_entry---of 18
add_extent_data_ref---of 22
add_shared_data_ref---of 16
add_tree_block---of 25
btrfs_build_ref_tree---of 54
btrfs_free_ref_cache---of 5
btrfs_free_ref_tree_range---of 21
btrfs_ref_tree_mod3%of 70
dump_block_entry---of 12
dump_ref_action---of 3
free_block_entry---of 12
-----------
SUMMARY3%of 70

-----------
SUMMARY---of 0

__add_reloc_root---of 10
__del_reloc_root---of 13
add_data_references---of 39
add_tree_block---of 18
btrfs_get_reloc_bg_bytenr---of 7
btrfs_grab_root---of 8
btrfs_init_reloc_root17%of 12
btrfs_recover_relocation---of 61
btrfs_reloc_clone_csums---of 7
btrfs_reloc_cow_block---of 29
btrfs_reloc_post_snapshot---of 65
btrfs_reloc_pre_snapshot---of 7
btrfs_relocate_block_group---of 42
btrfs_should_cancel_balance---of 5
btrfs_should_ignore_reloc_root---of 5
btrfs_update_reloc_root---of 26
build_backref_tree---of 42
calcu_metadata_size---of 13
clean_dirty_subvols---of 11
create_reloc_inode---of 22
create_reloc_root---of 14
delete_block_group_cache---of 7
describe_relocation---of 1
do_relocation---of 57
find_next_extent---of 18
find_reloc_root---of 14
free_reloc_roots---of 4
invalidate_extent_cache---of 18
mark_garbage_root---of 3
memcmp_node_keys---of 3
merge_reloc_root---of 60
merge_reloc_roots---of 22
prealloc_file_extent_cluster---of 22
prepare_to_merge---of 35
prepare_to_relocate---of 6
relocate_block_group---of 33
relocate_data_extent---of 14
relocate_file_extent_cluster---of 81
relocate_tree_blocks---of 82
replace_file_extents---of 45
replace_path---of 60
select_one_root---of 13
select_reloc_root---of 38
setup_relocation_extent_mapping---of 3
unset_reloc_control---of 1
update_backref_cache---of 31
walk_down_reloc_tree---of 21
walk_up_reloc_tree---of 23
-----------
SUMMARY17%of 12

__fscrypt_encrypt_symlink---of 11
__fscrypt_prepare_link---of 3
__fscrypt_prepare_lookup---of 8
__fscrypt_prepare_readdir---of 1
__fscrypt_prepare_rename---of 11
__fscrypt_prepare_setattr---of 6
fscrypt_file_open20%of 26
fscrypt_get_symlink---of 12
fscrypt_prepare_lookup_partial---of 7
fscrypt_prepare_setflags25%of 8
fscrypt_prepare_symlink40%of 5
fscrypt_symlink_getattr---of 6
-----------
SUMMARY24%of 39

-----------
SUMMARY---of 0

__access_remote_vm---of 38
__apply_to_page_range---of 70
__do_fault29%of 21
__folio_rmap_sanity_checks---of 21
__get_locked_pte---of 5
__might_fault50%of 4
__pmd_alloc---of 13
__pte_alloc34%of 6
__pte_alloc_kernel---of 5
__pud_alloc---of 11
__vm_insert_mixed---of 8
_compound_head---of 7
access_process_vm---of 3
access_remote_vm---of 1
add_mm_rss_vec56%of 9
apply_to_existing_page_range---of 1
apply_to_page_range---of 1
clear_gigantic_page---of 4
clear_huge_page---of 11
copy_folio_from_user---of 17
copy_page_range---of 41
copy_pmd_range---of 495
copy_present_page---of 20
copy_user_gigantic_page---of 5
copy_user_large_folio---of 22
count_memcg_event_mm30%of 24
do_page_mkwrite34%of 15
do_set_pmd---of 35
do_swap_page---of 260
do_wp_page11%of 311
fault_around_bytes_fops_open---of 1
fault_around_bytes_get---of 1
fault_around_bytes_set---of 4
fault_dirty_shared_page37%of 22
finish_fault20%of 40
finish_mkwrite_fault30%of 10
folio_dup_file_rmap_ptes---of 10
folio_get---of 3
folio_large_mapcount---of 9
folio_lock---of 9
folio_lock_or_retry---of 9
folio_mapcount34%of 9
folio_prealloc34%of 9
folio_pte_batch---of 41
folio_put50%of 4
folio_try_dup_anon_rmap_ptes---of 155
follow_pte---of 18
free_pgd_range50%of 24
free_pgtables27%of 26
generic_access_phys---of 29
get_page---of 9
handle_mm_fault28%of 97
handle_pte_fault10%of 407
handle_pte_marker---of 3
insert_page_into_pte_locked27%of 23
insert_pfn---of 29
lock_mm_and_find_vma29%of 21
lock_vma_under_rcu21%of 24
lruvec_stat_add_folio---of 15
lruvec_stat_sub_folio---of 15
mm_counter---of 8
mm_counter_file---of 7
mm_trace_rss_stat27%of 15
mmap_read_lock_killable---of 5
mmap_read_trylock60%of 5
mmap_read_unlock67%of 3
mmap_write_downgrade---of 6
mmap_write_unlock---of 6
numa_migrate_prep---of 7
numa_rebuild_single_mapping---of 13
pfn_swap_entry_folio---of 17
pfn_swap_entry_to_page---of 18
pfn_valid---of 29
pmd_install---of 3
print_bad_pte---of 16
print_vma_addr---of 11
pte_unmap---of 6
ptlock_alloc67%of 3
ptlock_free100%of 1
put_page---of 14
put_swap_device---of 14
remap_pfn_range---of 4
remap_pfn_range_notrack---of 46
remove_device_exclusive_entry---of 31
restore_exclusive_pte---of 48
set_pte_range39%of 21
try_restore_exclusive_pte---of 15
unmap_mapping_folio---of 18
unmap_mapping_pages---of 3
unmap_mapping_range40%of 5
unmap_mapping_range_tree---of 4
unmap_page_range13%of 204
unmap_single_vma42%of 12
unmap_vmas50%of 14
upgrade_mmap_lock_carefully---of 11
vm_insert_page---of 41
vm_insert_pages22%of 55
vm_iomap_memory---of 7
vm_map_pages---of 6
vm_map_pages_zero---of 6
vm_normal_folio---of 8
vm_normal_folio_pmd---of 8
vm_normal_page45%of 9
vm_normal_page_pmd---of 12
vma_end_read---of 11
vma_pgtable_walk_begin67%of 3
vma_pgtable_walk_end67%of 3
vmf_anon_prepare---of 7
vmf_insert_mixed---of 1
vmf_insert_mixed_mkwrite---of 1
vmf_insert_pfn---of 1
vmf_insert_pfn_prot---of 11
walk_to_pmd34%of 12
wp_huge_pmd---of 9
wp_page_reuse12%of 34
zap_page_range_single---of 13
zap_vma_ptes---of 6
-----------
SUMMARY19%of 1505

__ia32_compat_sys_ioctl---of 1
__ia32_sys_ioctl---of 1
__se_compat_sys_ioctl---of 34
__se_sys_ioctl38%of 8
__x64_compat_sys_ioctl---of 1
__x64_sys_ioctl100%of 1
compat_ptr_ioctl---of 3
copy_fsxattr_to_user---of 1
do_vfs_ioctl8%of 116
fiemap_fill_next_extent---of 5
fiemap_prep---of 8
fileattr_fill_flags54%of 15
fileattr_fill_xflags---of 15
vfs_fileattr_get---of 3
vfs_fileattr_set29%of 35
vfs_ioctl---of 3
-----------
SUMMARY18%of 175

blk_ioprio_exit---of 1
blk_ioprio_init---of 1
blkcg_set_ioprio25%of 16
ioprio_alloc_cpd---of 6
ioprio_alloc_pd50%of 4
ioprio_free_cpd---of 1
ioprio_free_pd---of 1
ioprio_set_prio_policy---of 7
ioprio_show_prio_policy---of 6
-----------
SUMMARY30%of 20

ext4_bg_has_super---of 16
ext4_bg_num_gdb---of 5
ext4_claim_free_clusters100%of 3
ext4_count_free_clusters---of 9
ext4_free_clusters_after_init---of 27
ext4_get_group_desc32%of 22
ext4_get_group_info27%of 19
ext4_get_group_no_and_offset58%of 7
ext4_get_group_number40%of 5
ext4_has_free_clusters63%of 16
ext4_has_group_desc_csum43%of 7
ext4_init_block_bitmap---of 35
ext4_inode_to_goal_block67%of 9
ext4_lock_group23%of 9
ext4_new_meta_blocks38%of 8
ext4_num_base_meta_blocks---of 6
ext4_num_base_meta_clusters---of 8
ext4_read_block_bitmap50%of 4
ext4_read_block_bitmap_nowait55%of 31
ext4_should_retry_alloc34%of 6
ext4_validate_block_bitmap31%of 56
ext4_wait_block_bitmap58%of 7
trace_ext4_read_block_bitmap_load27%of 15
-----------
SUMMARY41%of 224

-----------
SUMMARY---of 0

__bpf_trace_hw_interval_param---of 1
__bpf_trace_hw_mask_param---of 1
__probestub_hw_interval_param---of 1
__probestub_hw_mask_param---of 1
__traceiter_hw_interval_param---of 4
__traceiter_hw_mask_param---of 4
_snd_pcm_stream_lock_irqsave---of 3
_snd_pcm_stream_lock_irqsave_nested---of 3
class_pcm_stream_lock_irq_constructor---of 3
class_pcm_stream_lock_irq_destructor---of 4
fixup_unreferenced_params---of 54
pcm_release_private---of 3
perf_trace_hw_interval_param---of 8
perf_trace_hw_mask_param---of 8
refcount_inc---of 4
relink_to_local---of 9
snd_pcm_action_group---of 26
snd_pcm_action_lock_irq---of 11
snd_pcm_action_nonatomic---of 9
snd_pcm_capture_open---of 4
snd_pcm_channel_info---of 11
snd_pcm_channel_info_user---of 4
snd_pcm_common_ioctl---of 44
snd_pcm_delay---of 20
snd_pcm_do_drain_init---of 25
snd_pcm_do_pause---of 3
snd_pcm_do_prepare---of 10
snd_pcm_do_reset---of 9
snd_pcm_do_resume---of 6
snd_pcm_do_start---of 4
snd_pcm_do_stop---of 6
snd_pcm_do_suspend---of 6
snd_pcm_drain---of 37
snd_pcm_drain_done---of 10
snd_pcm_drop---of 11
snd_pcm_fasync---of 5
snd_pcm_forward---of 26
snd_pcm_forward_ioctl---of 5
snd_pcm_group_assign---of 5
snd_pcm_group_init---of 1
snd_pcm_group_unref---of 9
snd_pcm_hw_free---of 30
snd_pcm_hw_params---of 83
snd_pcm_hw_params_old_user---of 9
snd_pcm_hw_params_user---of 6
snd_pcm_hw_refine---of 62
snd_pcm_hw_refine_old_user---of 10
snd_pcm_hw_refine_user---of 7
snd_pcm_hw_rule_buffer_bytes_max---of 3
snd_pcm_hw_rule_div---of 7
snd_pcm_hw_rule_format---of 11
snd_pcm_hw_rule_mul---of 7
snd_pcm_hw_rule_muldivk---of 7
snd_pcm_hw_rule_mulkdiv---of 7
snd_pcm_hw_rule_rate---of 3
snd_pcm_hw_rule_sample_bits---of 7
snd_pcm_hw_rule_subformats---of 8
snd_pcm_info---of 1
snd_pcm_info_user---of 4
snd_pcm_ioctl---of 3
snd_pcm_ioctl_channel_info_compat---of 10
snd_pcm_ioctl_compat---of 40
snd_pcm_ioctl_delay_compat---of 3
snd_pcm_ioctl_forward_compat---of 3
snd_pcm_ioctl_hw_params_compat---of 16
snd_pcm_ioctl_rewind_compat---of 3
snd_pcm_ioctl_sw_params_compat---of 21
snd_pcm_ioctl_sync_ptr_buggy---of 21
snd_pcm_ioctl_sync_ptr_compat---of 36
snd_pcm_ioctl_sync_ptr_x32---of 33
snd_pcm_ioctl_xferi_compat---of 8
snd_pcm_ioctl_xfern_compat---of 18
snd_pcm_kernel_ioctl---of 12
snd_pcm_lib_default_mmap---of 9
snd_pcm_lib_mmap_iomem---of 3
snd_pcm_link---of 27
snd_pcm_mmap---of 29
snd_pcm_mmap_control_fault---of 10
snd_pcm_mmap_data---of 13
snd_pcm_mmap_data_close---of 1
snd_pcm_mmap_data_fault---of 15
snd_pcm_mmap_data_open---of 1
snd_pcm_mmap_status_fault---of 10
snd_pcm_open---of 15
snd_pcm_open_substream---of 75
snd_pcm_pause---of 6
snd_pcm_playback_open---of 4
snd_pcm_poll---of 20
snd_pcm_post_drain_init---of 1
snd_pcm_post_pause---of 5
snd_pcm_post_prepare---of 8
snd_pcm_post_reset---of 9
snd_pcm_post_resume---of 3
snd_pcm_post_start---of 6
snd_pcm_post_stop---of 4
snd_pcm_post_suspend---of 3
snd_pcm_pre_drain_init---of 5
snd_pcm_pre_pause---of 6
snd_pcm_pre_prepare---of 7
snd_pcm_pre_reset---of 1
snd_pcm_pre_resume---of 3
snd_pcm_pre_start---of 6
snd_pcm_pre_stop---of 3
snd_pcm_pre_suspend---of 6
snd_pcm_prepare---of 9
snd_pcm_read---of 8
snd_pcm_readv---of 23
snd_pcm_release---of 4
snd_pcm_release_substream---of 19
snd_pcm_rewind---of 27
snd_pcm_rewind_ioctl---of 5
snd_pcm_start---of 11
snd_pcm_status64---of 29
snd_pcm_status_user32---of 4
snd_pcm_status_user64---of 4
snd_pcm_status_user_compat64---of 7
snd_pcm_stop---of 11
snd_pcm_stop_xrun---of 11
snd_pcm_stream_group_ref---of 17
snd_pcm_stream_lock---of 3
snd_pcm_stream_lock_irq67%of 3
snd_pcm_stream_lock_nested---of 3
snd_pcm_stream_unlock---of 3
snd_pcm_stream_unlock_irq67%of 3
snd_pcm_stream_unlock_irqrestore---of 3
snd_pcm_suspend_all---of 42
snd_pcm_sw_params---of 32
snd_pcm_sw_params_user---of 3
snd_pcm_sync_ptr---of 21
snd_pcm_sync_stop---of 9
snd_pcm_trigger_tstamp---of 7
snd_pcm_tstamp---of 4
snd_pcm_undo_pause---of 3
snd_pcm_undo_resume---of 6
snd_pcm_undo_start---of 3
snd_pcm_unlink---of 17
snd_pcm_write---of 8
snd_pcm_writev---of 23
snd_pcm_xferi_frames_ioctl---of 6
snd_pcm_xfern_frames_ioctl---of 8
snd_pcm_xrun---of 9
trace_event_raw_event_hw_interval_param---of 7
trace_event_raw_event_hw_mask_param---of 7
trace_hw_interval_param---of 15
trace_hw_mask_param---of 15
trace_raw_output_hw_interval_param---of 3
trace_raw_output_hw_mask_param---of 3
-----------
SUMMARY67%of 6

arch_stack_walk50%of 10
arch_stack_walk_reliable---of 11
arch_stack_walk_user---of 10
-----------
SUMMARY50%of 10

-----------
SUMMARY---of 0

__generic_file_fsync40%of 5
alloc_anon_inode67%of 3
always_delete_dentry100%of 1
dcache_dir_close---of 1
dcache_dir_lseek---of 14
dcache_dir_open---of 1
dcache_readdir---of 16
direct_write_fallback---of 4
empty_dir_getattr---of 1
empty_dir_listxattr---of 1
empty_dir_llseek---of 1
empty_dir_lookup---of 1
empty_dir_readdir---of 7
empty_dir_setattr---of 1
folio_size30%of 10
generic_check_addressable---of 6
generic_ci_d_compare---of 8
generic_ci_d_hash---of 6
generic_encode_ino32_fh40%of 5
generic_fh_to_dentry50%of 4
generic_fh_to_parent---of 5
generic_file_fsync67%of 3
generic_read_dir---of 1
generic_set_sb_d_ops---of 4
init_pseudo---of 3
inode_maybe_inc_iversion50%of 6
inode_query_iversion40%of 5
is_empty_dir_inode---of 3
kfree_link100%of 1
make_empty_dir_inode---of 1
memory_read_from_buffer---of 4
noop_direct_IO---of 1
noop_fsync100%of 1
offset_dir_llseek---of 6
offset_readdir---of 29
path_from_stashed---of 44
pseudo_fs_fill_super---of 4
pseudo_fs_free---of 1
pseudo_fs_get_tree---of 1
scan_positives---of 18
simple_attr_open---of 3
simple_attr_read---of 8
simple_attr_release---of 1
simple_attr_write---of 1
simple_attr_write_signed---of 1
simple_attr_write_xsigned---of 8
simple_empty---of 7
simple_fill_super---of 10
simple_get_link---of 1
simple_getattr---of 1
simple_inode_init_ts100%of 1
simple_link---of 3
simple_lookup50%of 4
simple_nosetlease---of 1
simple_offset_add50%of 4
simple_offset_destroy---of 1
simple_offset_empty---of 9
simple_offset_init100%of 1
simple_offset_remove---of 3
simple_offset_rename---of 8
simple_offset_rename_exchange---of 13
simple_open---of 3
simple_pin_fs---of 5
simple_read_folio---of 7
simple_read_from_buffer---of 6
simple_recursive_removal---of 33
simple_release_fs---of 3
simple_rename---of 12
simple_rename_exchange---of 7
simple_rename_timestamp---of 5
simple_rmdir---of 3
simple_setattr---of 4
simple_statfs---of 1
simple_transaction_get---of 5
simple_transaction_read---of 3
simple_transaction_release---of 1
simple_transaction_set---of 3
simple_unlink---of 1
simple_write_begin28%of 11
simple_write_end26%of 27
simple_write_to_buffer---of 6
stashed_dentry_prune---of 4
zero_user_segments48%of 17
-----------
SUMMARY42%of 109

__ia32_sys_mseal---of 1
__se_sys_mseal---of 29
__x64_sys_mseal---of 1
can_modify_mm75%of 4
can_modify_mm_madv---of 17
-----------
SUMMARY75%of 4

-----------
SUMMARY---of 0

__bpf_trace_cgroup---of 1
__bpf_trace_cgroup_event---of 1
__bpf_trace_cgroup_migrate---of 1
__bpf_trace_cgroup_root---of 1
__bpf_trace_cgroup_rstat---of 1
__cgroup_procs_start---of 12
__cgroup_procs_write---of 9
__cgroup_task_count---of 7
__probestub_cgroup_attach_task---of 1
__probestub_cgroup_destroy_root---of 1
__probestub_cgroup_freeze---of 1
__probestub_cgroup_mkdir---of 1
__probestub_cgroup_notify_frozen---of 1
__probestub_cgroup_notify_populated---of 1
__probestub_cgroup_release---of 1
__probestub_cgroup_remount---of 1
__probestub_cgroup_rename---of 1
__probestub_cgroup_rmdir---of 1
__probestub_cgroup_rstat_cpu_lock_contended---of 1
__probestub_cgroup_rstat_cpu_lock_contended_fastpath---of 1
__probestub_cgroup_rstat_cpu_locked---of 1
__probestub_cgroup_rstat_cpu_locked_fastpath---of 1
__probestub_cgroup_rstat_cpu_unlock---of 1
__probestub_cgroup_rstat_cpu_unlock_fastpath---of 1
__probestub_cgroup_rstat_lock_contended---of 1
__probestub_cgroup_rstat_locked---of 1
__probestub_cgroup_rstat_unlock---of 1
__probestub_cgroup_setup_root---of 1
__probestub_cgroup_transfer_tasks---of 1
__probestub_cgroup_unfreeze---of 1
__traceiter_cgroup_attach_task---of 4
__traceiter_cgroup_destroy_root---of 4
__traceiter_cgroup_freeze---of 4
__traceiter_cgroup_mkdir---of 4
__traceiter_cgroup_notify_frozen---of 4
__traceiter_cgroup_notify_populated---of 4
__traceiter_cgroup_release---of 4
__traceiter_cgroup_remount---of 4
__traceiter_cgroup_rename---of 4
__traceiter_cgroup_rmdir---of 4
__traceiter_cgroup_rstat_cpu_lock_contended---of 4
__traceiter_cgroup_rstat_cpu_lock_contended_fastpath---of 4
__traceiter_cgroup_rstat_cpu_locked---of 4
__traceiter_cgroup_rstat_cpu_locked_fastpath---of 4
__traceiter_cgroup_rstat_cpu_unlock---of 4
__traceiter_cgroup_rstat_cpu_unlock_fastpath---of 4
__traceiter_cgroup_rstat_lock_contended---of 4
__traceiter_cgroup_rstat_locked---of 4
__traceiter_cgroup_rstat_unlock---of 4
__traceiter_cgroup_setup_root---of 4
__traceiter_cgroup_transfer_tasks---of 4
__traceiter_cgroup_unfreeze---of 4
allocate_cgrp_cset_links---of 11
cgroup2_parse_param---of 8
cgroup_add_cftypes---of 10
cgroup_add_dfl_cftypes---of 5
cgroup_add_legacy_cftypes---of 5
cgroup_addrm_files---of 50
cgroup_apply_cftypes---of 18
cgroup_apply_control---of 32
cgroup_apply_control_enable---of 44
cgroup_attach_lock---of 3
cgroup_attach_permissions---of 36
cgroup_attach_task---of 49
cgroup_attach_unlock---of 3
cgroup_can_fork---of 63
cgroup_cancel_fork---of 5
cgroup_control---of 5
cgroup_controllers_show---of 8
cgroup_cpu_pressure_show---of 6
cgroup_cpu_pressure_write---of 1
cgroup_css30%of 10
cgroup_css_set_put_fork---of 8
cgroup_destroy_locked---of 25
cgroup_do_get_tree---of 8
cgroup_e_css---of 5
cgroup_events_show---of 4
cgroup_exit---of 36
cgroup_favor_dynmods---of 4
cgroup_file_notify---of 5
cgroup_file_notify_timer---of 5
cgroup_file_open---of 12
cgroup_file_poll---of 3
cgroup_file_release---of 7
cgroup_file_show---of 3
cgroup_file_write---of 23
cgroup_finalize_control---of 33
cgroup_fork---of 1
cgroup_free---of 12
cgroup_free_root---of 3
cgroup_freeze_show---of 4
cgroup_freeze_write---of 8
cgroup_fs_context_free---of 5
cgroup_get_e_css37%of 19
cgroup_get_from_fd---of 10
cgroup_get_from_id---of 30
cgroup_get_from_path---of 24
cgroup_get_live---of 5
cgroup_get_tree---of 10
cgroup_idr_alloc---of 10
cgroup_init_cftypes---of 18
cgroup_init_fs_context---of 16
cgroup_io_pressure_show---of 6
cgroup_io_pressure_write---of 1
cgroup_irq_pressure_show---of 6
cgroup_irq_pressure_write---of 1
cgroup_kill_sb---of 6
cgroup_kill_write---of 36
cgroup_kn_lock_live---of 11
cgroup_kn_unlock---of 5
cgroup_lock_and_drain_offline---of 20
cgroup_max_depth_show---of 6
cgroup_max_depth_write---of 10
cgroup_max_descendants_show---of 6
cgroup_max_descendants_write---of 10
cgroup_memory_pressure_show---of 6
cgroup_memory_pressure_write---of 1
cgroup_migrate---of 7
cgroup_migrate_add_src---of 22
cgroup_migrate_add_task---of 27
cgroup_migrate_execute---of 56
cgroup_migrate_finish---of 14
cgroup_migrate_prepare_dst---of 26
cgroup_migrate_vet_dst---of 15
cgroup_mkdir---of 38
cgroup_on_dfl---of 1
cgroup_parse_float---of 23
cgroup_path_from_kernfs_id---of 3
cgroup_path_ns---of 1
cgroup_path_ns_locked---of 1
cgroup_post_fork---of 24
cgroup_pressure_poll---of 1
cgroup_pressure_release---of 1
cgroup_pressure_show---of 6
cgroup_pressure_write---of 21
cgroup_print_ss_mask---of 11
cgroup_procs_next---of 3
cgroup_procs_release---of 3
cgroup_procs_show---of 1
cgroup_procs_start---of 6
cgroup_procs_write---of 1
cgroup_procs_write_finish---of 36
cgroup_procs_write_start---of 30
cgroup_propagate_control---of 31
cgroup_psi_enabled---of 3
cgroup_reconfigure---of 5
cgroup_release---of 26
cgroup_rm_cftypes---of 5
cgroup_rm_cftypes_locked---of 12
cgroup_rmdir---of 22
cgroup_root_from_kf---of 1
cgroup_save_control---of 15
cgroup_seqfile_next---of 1
cgroup_seqfile_show---of 8
cgroup_seqfile_start---of 1
cgroup_seqfile_stop---of 3
cgroup_setup_root---of 41
cgroup_show_options---of 11
cgroup_show_path---of 28
cgroup_sk_alloc---of 24
cgroup_sk_clone---of 3
cgroup_sk_free---of 3
cgroup_ssid_enabled---of 3
cgroup_stat_show---of 4
cgroup_subtree_control_show---of 4
cgroup_subtree_control_write---of 70
cgroup_task_count---of 7
cgroup_taskset_first---of 6
cgroup_taskset_next---of 6
cgroup_threadgroup_change_end---of 10
cgroup_threads_start---of 1
cgroup_threads_write---of 1
cgroup_tryget---of 17
cgroup_tryget_css---of 13
cgroup_type_show---of 17
cgroup_type_write---of 42
cgroup_update_populated---of 29
cgroup_v1v2_get_from_fd---of 6
cpu_local_stat_show---of 8
cpu_stat_show---of 8
cpuset_init_fs_context---of 3
cset_cgroup_from_root---of 12
css_clear_dir---of 9
css_free_rwork_fn---of 47
css_from_id---of 3
css_has_online_children---of 21
css_killed_ref_fn---of 3
css_killed_work_fn---of 14
css_next_child---of 12
css_next_descendant_post---of 25
css_next_descendant_pre---of 19
css_populate_dir---of 14
css_release---of 1
css_release_work_fn---of 28
css_rightmost_descendant---of 16
css_set_hash---of 1
css_set_move_task---of 46
css_task_iter_advance---of 20
css_task_iter_advance_css_set---of 32
css_task_iter_end---of 11
css_task_iter_next---of 11
css_task_iter_start---of 4
css_tryget_online---of 20
css_tryget_online_from_dir---of 22
css_visible---of 16
delegate_show---of 24
features_show---of 1
find_css_set---of 57
init_and_link_css---of 16
init_cgroup_housekeeping---of 1
init_cgroup_root---of 7
kill_css---of 7
link_css_set---of 15
of_css---of 4
online_css---of 11
percpu_ref_get---of 13
percpu_ref_put---of 14
perf_trace_cgroup---of 8
perf_trace_cgroup_event---of 8
perf_trace_cgroup_migrate---of 8
perf_trace_cgroup_root---of 8
perf_trace_cgroup_rstat---of 8
pressure_write---of 16
proc_cgroup_show---of 34
put_css_set---of 3
put_css_set_locked---of 31
rebind_subsystems---of 88
task_cgroup_from_root---of 10
task_dfl_cgroup---of 10
trace_cgroup_mkdir---of 15
trace_cgroup_setup_root---of 15
trace_event_raw_event_cgroup---of 7
trace_event_raw_event_cgroup_event---of 7
trace_event_raw_event_cgroup_migrate---of 7
trace_event_raw_event_cgroup_root---of 7
trace_event_raw_event_cgroup_rstat---of 7
trace_raw_output_cgroup---of 3
trace_raw_output_cgroup_event---of 3
trace_raw_output_cgroup_migrate---of 3
trace_raw_output_cgroup_root---of 3
trace_raw_output_cgroup_rstat---of 3
-----------
SUMMARY35%of 29

__do_wait---of 33
__ia32_compat_sys_wait4---of 4
__ia32_compat_sys_waitid---of 1
__ia32_sys_wait4---of 4
__ia32_sys_waitid---of 1
__ia32_sys_waitpid---of 1
__se_compat_sys_waitid---of 19
__se_sys_waitid---of 17
__wake_up_parent---of 1
__x64_compat_sys_wait4---of 4
__x64_compat_sys_waitid---of 1
__x64_sys_wait4---of 4
__x64_sys_waitid---of 1
__x64_sys_waitpid---of 1
child_wait_callback---of 10
delayed_put_task_struct---of 18
do_exit---of 122
do_group_exit---of 6
do_wait---of 20
exit_mm---of 13
get_task_struct---of 4
is_current_pgrp_orphaned---of 13
kernel_wait---of 4
kernel_wait4---of 9
kernel_waitid_prepare---of 14
kill_orphaned_pgrp---of 21
make_task_dead---of 12
mm_update_next_owner---of 27
oops_count_show---of 1
pid_child_should_wake---of 10
put_task_struct---of 4
put_task_struct_rcu_user50%of 4
rcu_read_unlock---of 6
rcuwait_wake_up---of 18
refcount_inc---of 4
release_task---of 70
trace_sched_process_exit---of 15
wait_consider_task---of 118
-----------
SUMMARY50%of 4

blk_alloc_flush_queue---of 7
blk_flush_complete_seq30%of 30
blk_free_flush_queue---of 3
blk_insert_flush50%of 10
blk_mq_hctx_set_fq_lock_class---of 1
blkdev_issue_flush100%of 1
flush_end_io---of 34
is_flush_rq---of 1
mq_flush_data_end_io---of 11
-----------
SUMMARY37%of 41

crypto_alloc_shash---of 1
crypto_clone_shash---of 15
crypto_grab_shash---of 1
crypto_has_shash---of 1
crypto_register_shash---of 16
crypto_register_shashes---of 23
crypto_shash_digest---of 3
crypto_shash_exit_tfm---of 1
crypto_shash_export---of 3
crypto_shash_final---of 1
crypto_shash_finup---of 1
crypto_shash_free_instance---of 1
crypto_shash_import---of 4
crypto_shash_init_tfm---of 11
crypto_shash_report---of 1
crypto_shash_setkey---of 5
crypto_shash_show---of 1
crypto_shash_tfm_digest---of 3
crypto_shash_update100%of 1
crypto_unregister_shash---of 1
crypto_unregister_shashes---of 4
hash_prepare_alg---of 4
shash_default_digest---of 3
shash_default_finup---of 3
shash_free_singlespawn_instance---of 1
shash_no_setkey---of 1
shash_register_instance---of 17
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

arch_mmap_rnd---of 4
arch_pick_mmap_layout---of 16
arch_vma_name---of 1
get_mmap_base50%of 4
mmap_address_hint_valid---of 5
pfn_modify_allowed---of 38
task_size_32bit---of 1
task_size_64bit---of 1
valid_mmap_phys_addr_range---of 3
valid_phys_addr_range---of 1
-----------
SUMMARY50%of 4

__tlb_remove_folio_pages100%of 1
__tlb_remove_folio_pages_size43%of 14
__tlb_remove_page_size100%of 1
tlb_finish_mmu67%of 6
tlb_flush_mmu40%of 20
tlb_flush_rmap_batch---of 13
tlb_flush_rmaps---of 4
tlb_gather_mmu67%of 3
tlb_gather_mmu_fullmm---of 1
tlb_remove_table---of 11
tlb_remove_table_rcu---of 4
tlb_remove_table_smp_sync---of 1
tlb_remove_table_sync_one---of 1
tlb_table_flush25%of 8
-----------
SUMMARY46%of 53

__ext4_fc_track_create---of 17
__ext4_fc_track_link---of 17
__ext4_fc_track_unlink---of 17
__track_dentry_update---of 15
ext4_end_buffer_io_sync---of 4
ext4_fc_cleanup---of 57
ext4_fc_commit3%of 68
ext4_fc_del10%of 22
ext4_fc_destroy_dentry_cache---of 1
ext4_fc_info_show---of 3
ext4_fc_init---of 3
ext4_fc_init_inode100%of 1
ext4_fc_mark_ineligible17%of 12
ext4_fc_record_regions---of 9
ext4_fc_replay---of 145
ext4_fc_replay_check_excluded---of 8
ext4_fc_replay_cleanup---of 1
ext4_fc_replay_link_internal---of 13
ext4_fc_reserve_space---of 10
ext4_fc_set_bitmaps_and_counters---of 14
ext4_fc_start_update---of 7
ext4_fc_stop_update---of 5
ext4_fc_submit_bh---of 7
ext4_fc_track_create40%of 5
ext4_fc_track_inode12%of 25
ext4_fc_track_link40%of 5
ext4_fc_track_range12%of 27
ext4_fc_track_unlink40%of 5
ext4_fc_update_stats---of 20
ext4_fc_wait_committing_inode---of 4
ext4_fc_write_inode---of 9
ext4_fc_write_inode_data---of 14
trace_ext4_fc_replay---of 15
-----------
SUMMARY12%of 170

ida_alloc_range22%of 33
ida_destroy---of 22
ida_free---of 10
idr_alloc50%of 4
idr_alloc_cyclic---of 6
idr_alloc_u3260%of 5
idr_find100%of 1
idr_for_each---of 9
idr_get_next---of 13
idr_get_next_ul---of 11
idr_remove---of 1
idr_replace---of 4
-----------
SUMMARY31%of 43

ext4_fname_free_filename100%of 1
ext4_fname_prepare_lookup38%of 8
ext4_fname_setup_filename50%of 4
ext4_get_context---of 1
ext4_get_dummy_policy100%of 1
ext4_has_stable_inodes---of 1
ext4_ioctl_get_encryption_pwsalt---of 24
ext4_set_context---of 18
lock_buffer---of 3
-----------
SUMMARY50%of 14

__ia32_sys_fdatasync---of 6
__ia32_sys_fsync---of 7
__ia32_sys_sync_file_range---of 5
__ia32_sys_sync_file_range2---of 5
__ia32_sys_syncfs---of 1
__se_sys_syncfs50%of 4
__x64_sys_fdatasync50%of 6
__x64_sys_fsync58%of 7
__x64_sys_sync100%of 1
__x64_sys_sync_file_range60%of 5
__x64_sys_sync_file_range2---of 5
__x64_sys_syncfs100%of 1
do_sync_work---of 1
emergency_sync---of 3
ksys_sync67%of 3
ksys_sync_file_range---of 5
sync_file_range32%of 16
sync_filesystem37%of 11
sync_fs_one_sb80%of 5
sync_inodes_one_sb100%of 3
vfs_fsync---of 5
vfs_fsync_range60%of 5
-----------
SUMMARY53%of 67

evm_file_release72%of 7
evm_inode_alloc_security67%of 3
evm_inode_copy_up_xattr---of 4
evm_inode_init_security19%of 11
evm_inode_post_remove_acl---of 1
evm_inode_post_removexattr---of 13
evm_inode_post_set_acl---of 1
evm_inode_post_setattr20%of 10
evm_inode_post_setxattr---of 15
evm_inode_remove_acl---of 1
evm_inode_removexattr---of 3
evm_inode_set_acl---of 20
evm_inode_setattr11%of 28
evm_inode_setxattr---of 7
evm_metadata_changed---of 8
evm_post_path_mknod50%of 4
evm_protect_xattr---of 44
evm_protected_xattr---of 1
evm_protected_xattr_common---of 12
evm_protected_xattr_if_enabled---of 1
evm_read_protected_xattrs---of 15
evm_revalidate_status---of 7
evm_verify_hmac---of 44
evm_verifyxattr---of 4
is_unsupported_hmac_fs---of 4
-----------
SUMMARY26%of 63

ioport_map---of 1
ioport_unmap---of 1
ioread16---of 5
ioread16_rep---of 7
ioread16be---of 5
ioread32---of 5
ioread32_rep---of 7
ioread32be---of 5
ioread64_hi_lo---of 5
ioread64_lo_hi---of 5
ioread64be_hi_lo---of 5
ioread64be_lo_hi---of 5
ioread8---of 5
ioread8_rep---of 7
iowrite1640%of 5
iowrite16_rep---of 7
iowrite16be---of 5
iowrite32---of 5
iowrite32_rep---of 7
iowrite32be---of 5
iowrite64_hi_lo---of 5
iowrite64_lo_hi---of 5
iowrite64be_hi_lo---of 5
iowrite64be_lo_hi---of 5
iowrite8---of 5
iowrite8_rep---of 7
pci_iounmap---of 5
-----------
SUMMARY40%of 5

__hrtimer_get_remaining---of 4
__hrtimer_init50%of 8
__hrtimer_next_event_base---of 14
__hrtimer_run_queues---of 55
__ia32_sys_nanosleep---of 5
__ia32_sys_nanosleep_time32---of 5
__remove_hrtimer22%of 14
__x64_sys_nanosleep---of 5
__x64_sys_nanosleep_time32---of 5
clock_was_set---of 27
clock_was_set_delayed---of 1
clock_was_set_work---of 1
debug_deactivate27%of 15
debug_init---of 15
destroy_hrtimer_on_stack---of 1
do_nanosleep---of 18
enqueue_hrtimer32%of 19
hrtimer_active---of 9
hrtimer_cancel---of 4
hrtimer_debug_hint---of 1
hrtimer_fixup_activate---of 3
hrtimer_fixup_free---of 5
hrtimer_fixup_init---of 5
hrtimer_forward---of 6
hrtimer_get_next_event---of 6
hrtimer_init---of 1
hrtimer_init_on_stack---of 1
hrtimer_init_sleeper---of 1
hrtimer_init_sleeper_on_stack---of 1
hrtimer_interrupt---of 16
hrtimer_nanosleep---of 4
hrtimer_nanosleep_restart---of 1
hrtimer_next_event_without---of 6
hrtimer_reprogram---of 15
hrtimer_run_queues---of 10
hrtimer_run_softirq---of 5
hrtimer_sleeper_start_expires---of 1
hrtimer_start_range_ns28%of 36
hrtimer_try_to_cancel34%of 15
hrtimer_wakeup---of 3
hrtimers_cpu_dying---of 18
hrtimers_prepare_cpu---of 5
hrtimers_resume_local---of 5
ktime_add_safe---of 1
ktime_get_boottime---of 1
ktime_get_clocktai---of 1
ktime_get_real---of 1
nanosleep_copyout---of 6
retrigger_next_event---of 16
schedule_hrtimeout---of 1
schedule_hrtimeout_range100%of 1
schedule_hrtimeout_range_clock38%of 8
-----------
SUMMARY32%of 116

-----------
SUMMARY---of 0

__disk_unblock_events---of 9
disk_add_events---of 4
disk_alloc_events---of 5
disk_block_events50%of 4
disk_check_events---of 13
disk_check_media_change---of 8
disk_del_events---of 6
disk_events_async_show---of 1
disk_events_poll_msecs_show---of 3
disk_events_poll_msecs_store---of 7
disk_events_set_dfl_poll_msecs---of 8
disk_events_show---of 8
disk_events_workfn---of 1
disk_flush_events---of 4
disk_force_media_change---of 3
disk_release_events---of 4
disk_unblock_events67%of 3
-----------
SUMMARY58%of 7

__lockup_detector_reconfigure---of 21
arch_touch_nmi_watchdog100%of 1
lockup_detector_cleanup---of 4
lockup_detector_offline_cpu---of 5
lockup_detector_online_cpu---of 5
lockup_detector_reconfigure---of 1
lockup_detector_soft_poweroff---of 1
proc_nmi_watchdog---of 3
proc_soft_watchdog---of 1
proc_watchdog---of 1
proc_watchdog_common---of 5
proc_watchdog_cpumask---of 3
proc_watchdog_thresh---of 4
softlockup_fn---of 3
softlockup_start_fn---of 1
softlockup_stop_fn---of 1
touch_all_softlockup_watchdogs---of 5
touch_softlockup_watchdog100%of 1
touch_softlockup_watchdog_sched---of 1
touch_softlockup_watchdog_sync---of 1
update_report_ts---of 1
watchdog_disable---of 7
watchdog_enable---of 9
watchdog_hardlockup_check---of 42
watchdog_hardlockup_start---of 1
watchdog_hardlockup_stop---of 1
watchdog_hardlockup_touch_cpu---of 3
watchdog_timer_fn---of 43
-----------
SUMMARY100%of 2

__io_async_cancel---of 23
init_hash_table75%of 4
io_async_cancel---of 8
io_async_cancel_prep---of 11
io_cancel_cb---of 17
io_cancel_req_match---of 17
io_sync_cancel---of 34
io_try_cancel---of 16
-----------
SUMMARY75%of 4

udf_copy_fi24%of 13
udf_fiiter_advance25%of 12
udf_fiiter_advance_blk---of 9
udf_fiiter_append_blk---of 18
udf_fiiter_bread_blk---of 24
udf_fiiter_init22%of 14
udf_fiiter_load_bhs---of 20
udf_fiiter_release60%of 5
udf_fiiter_update_elen---of 5
udf_fiiter_write_fi25%of 41
udf_get_filelongad---of 6
udf_get_fileshortad---of 6
udf_verify_fi34%of 9
-----------
SUMMARY27%of 94

__crc32c_pcl_intel_finup---of 10
crc32c_intel_cra_init---of 1
crc32c_intel_digest---of 7
crc32c_intel_final---of 1
crc32c_intel_finup---of 7
crc32c_intel_init---of 1
crc32c_intel_setkey---of 3
crc32c_intel_update---of 7
crc32c_pcl_intel_digest---of 1
crc32c_pcl_intel_finup---of 1
crc32c_pcl_intel_update90%of 10
-----------
SUMMARY90%of 10

-----------
SUMMARY---of 0

__bpf_trace_ksm_advisor---of 1
__bpf_trace_ksm_enter_exit_template---of 1
__bpf_trace_ksm_merge_one_page---of 1
__bpf_trace_ksm_merge_with_ksm_page---of 1
__bpf_trace_ksm_remove_ksm_page---of 1
__bpf_trace_ksm_remove_rmap_item---of 1
__bpf_trace_ksm_scan_template---of 1
__ksm_add_vma---of 10
__ksm_enter---of 25
__ksm_exit---of 38
__probestub_ksm_advisor---of 1
__probestub_ksm_enter---of 1
__probestub_ksm_exit---of 1
__probestub_ksm_merge_one_page---of 1
__probestub_ksm_merge_with_ksm_page---of 1
__probestub_ksm_remove_ksm_page---of 1
__probestub_ksm_remove_rmap_item---of 1
__probestub_ksm_start_scan---of 1
__probestub_ksm_stop_scan---of 1
__stable_node_chain---of 38
__traceiter_ksm_advisor---of 4
__traceiter_ksm_enter---of 4
__traceiter_ksm_exit---of 4
__traceiter_ksm_merge_one_page---of 4
__traceiter_ksm_merge_with_ksm_page---of 4
__traceiter_ksm_remove_ksm_page---of 4
__traceiter_ksm_remove_rmap_item---of 4
__traceiter_ksm_start_scan---of 4
__traceiter_ksm_stop_scan---of 4
_compound_head---of 7
advisor_max_cpu_show---of 1
advisor_max_cpu_store---of 3
advisor_max_pages_to_scan_show---of 1
advisor_max_pages_to_scan_store---of 3
advisor_min_pages_to_scan_show---of 1
advisor_min_pages_to_scan_store---of 3
advisor_mode_show---of 1
advisor_mode_store---of 6
advisor_target_scan_time_show---of 1
advisor_target_scan_time_store---of 4
alloc_stable_node_chain---of 7
break_cow---of 14
break_ksm---of 5
break_ksm_pmd_entry---of 36
dec_mm_counter---of 1
folio_get---of 3
folio_large_mapcount---of 9
folio_mapcount---of 9
folio_migrate_ksm---of 19
folio_set_stable_node---of 25
full_scans_show---of 1
general_profit_show---of 1
ksm_add_vma67%of 3
ksm_del_vmas---of 13
ksm_disable---of 7
ksm_disable_merge_any---of 6
ksm_do_scan---of 419
ksm_enable_merge_any---of 8
ksm_get_folio---of 55
ksm_madvise16%of 13
ksm_map_zero_page---of 1
ksm_memory_callback---of 28
ksm_might_need_to_copy---of 49
ksm_process_profit---of 1
ksm_scan_thread---of 24
ksm_zero_pages_show---of 1
lock_page---of 15
max_page_sharing_show---of 1
max_page_sharing_store---of 8
merge_across_nodes_show---of 1
merge_across_nodes_store---of 11
pages_scanned_show---of 1
pages_shared_show---of 1
pages_sharing_show---of 1
pages_skipped_show---of 1
pages_to_scan_show---of 1
pages_to_scan_store---of 4
pages_unshared_show---of 1
pages_volatile_show---of 1
perf_trace_ksm_advisor---of 8
perf_trace_ksm_enter_exit_template---of 8
perf_trace_ksm_merge_one_page---of 8
perf_trace_ksm_merge_with_ksm_page---of 8
perf_trace_ksm_remove_ksm_page---of 8
perf_trace_ksm_remove_rmap_item---of 8
perf_trace_ksm_scan_template---of 8
pfn_pte---of 5
pte_unmap---of 6
put_page---of 14
remove_all_stable_nodes---of 22
remove_node_from_stable_tree---of 46
remove_rmap_item_from_tree---of 14
remove_stable_node---of 7
replace_page---of 85
rmap_walk_ksm---of 31
run_show---of 1
run_store---of 36
sleep_millisecs_show---of 1
sleep_millisecs_store---of 3
smart_scan_show---of 1
smart_scan_store---of 3
stable_node_chains_prune_millisecs_show---of 1
stable_node_chains_prune_millisecs_store---of 3
stable_node_chains_show---of 1
stable_node_dups_show---of 1
trace_event_raw_event_ksm_advisor---of 7
trace_event_raw_event_ksm_enter_exit_template---of 7
trace_event_raw_event_ksm_merge_one_page---of 7
trace_event_raw_event_ksm_merge_with_ksm_page---of 7
trace_event_raw_event_ksm_remove_ksm_page---of 7
trace_event_raw_event_ksm_remove_rmap_item---of 7
trace_event_raw_event_ksm_scan_template---of 7
trace_raw_output_ksm_advisor---of 3
trace_raw_output_ksm_enter_exit_template---of 3
trace_raw_output_ksm_merge_one_page---of 3
trace_raw_output_ksm_merge_with_ksm_page---of 3
trace_raw_output_ksm_remove_ksm_page---of 3
trace_raw_output_ksm_remove_rmap_item---of 3
trace_raw_output_ksm_scan_template---of 3
try_to_merge_one_page---of 242
try_to_merge_with_ksm_page---of 27
trylock_page---of 13
unmerge_ksm_pages---of 7
use_zero_pages_show---of 1
use_zero_pages_store---of 3
wait_while_offlining---of 6
-----------
SUMMARY25%of 16

-----------
SUMMARY---of 0

__hfs_bnode_create22%of 14
hfs_bnode_clear---of 3
hfs_bnode_copy---of 4
hfs_bnode_create---of 12
hfs_bnode_dump---of 20
hfs_bnode_find28%of 40
hfs_bnode_findhash---of 6
hfs_bnode_free---of 18
hfs_bnode_get---of 3
hfs_bnode_move---of 3
hfs_bnode_put31%of 13
hfs_bnode_read34%of 6
hfs_bnode_read_key---of 11
hfs_bnode_read_u1640%of 5
hfs_bnode_read_u867%of 3
hfs_bnode_unhash---of 4
hfs_bnode_unlink---of 16
hfs_bnode_write---of 3
hfs_bnode_write_u16---of 3
hfs_bnode_write_u8---of 3
-----------
SUMMARY30%of 81

tomoyo_convert_time---of 1
tomoyo_correct_domain---of 15
tomoyo_correct_path---of 4
tomoyo_correct_word---of 1
tomoyo_correct_word2---of 31
tomoyo_domain_def---of 6
tomoyo_domain_quota_is_ok12%of 18
tomoyo_file_matches_pattern2---of 62
tomoyo_fill_path_info54%of 15
tomoyo_find_domain---of 8
tomoyo_get_domainname---of 11
tomoyo_get_exe---of 4
tomoyo_get_mode45%of 9
tomoyo_init_request_info55%of 11
tomoyo_normalize_line---of 11
tomoyo_parse_name_union---of 6
tomoyo_parse_number_union---of 19
tomoyo_parse_ulong---of 7
tomoyo_path_matches_pattern50%of 6
tomoyo_path_matches_pattern2---of 41
tomoyo_permstr---of 4
tomoyo_print_ulong80%of 5
tomoyo_read_token---of 3
tomoyo_str_starts---of 3
-----------
SUMMARY43%of 64

__cleanup_mnt---of 1
__detach_mounts---of 19
__do_loopback29%of 14
__ia32_sys_fsmount---of 1
__ia32_sys_listmount---of 1
__ia32_sys_mount---of 1
__ia32_sys_mount_setattr---of 1
__ia32_sys_move_mount---of 1
__ia32_sys_oldumount---of 3
__ia32_sys_open_tree---of 1
__ia32_sys_pivot_root---of 1
__ia32_sys_statmount---of 1
__ia32_sys_umount---of 4
__is_local_mountpoint---of 4
__legitimize_mnt43%of 7
__lookup_mnt---of 9
__mnt_is_readonly67%of 3
__put_mountpoint60%of 5
__se_sys_fsmount29%of 32
__se_sys_listmount---of 42
__se_sys_mount45%of 18
__se_sys_mount_setattr16%of 125
__se_sys_move_mount7%of 32
__se_sys_open_tree28%of 29
__se_sys_pivot_root26%of 50
__se_sys_statmount---of 63
__x64_sys_fsmount100%of 1
__x64_sys_listmount---of 1
__x64_sys_mount100%of 1
__x64_sys_mount_setattr100%of 1
__x64_sys_move_mount100%of 1
__x64_sys_oldumount---of 3
__x64_sys_open_tree100%of 1
__x64_sys_pivot_root100%of 1
__x64_sys_statmount---of 1
__x64_sys_umount---of 4
alloc_mnt_ns30%of 10
alloc_vfsmnt38%of 8
attach_mnt55%of 11
attach_recursive_mnt18%of 88
can_move_mount_beneath---of 11
check_for_nsfs_mounts25%of 12
cleanup_mnt---of 17
clone_mnt31%of 33
clone_private_mount---of 11
collect_mounts---of 3
commit_tree50%of 22
copy_mnt_id_req---of 9
copy_mnt_ns---of 44
copy_tree11%of 39
count_mounts---of 12
current_chrooted---of 6
delayed_free_vfsmnt---of 1
delayed_mntput---of 4
dissolve_on_fput50%of 6
do_change_type---of 14
do_lock_mount30%of 17
do_loopback---of 13
do_mount---of 3
do_move_mount30%of 31
do_move_mount_old---of 5
do_new_mount25%of 36
drop_collected_mounts---of 1
fc_mount---of 3
finish_automount---of 34
free_mnt_ns43%of 7
from_mnt_ns---of 1
get_mountpoint43%of 21
graft_tree---of 4
invent_group_ids19%of 22
is_path_reachable---of 5
iterate_mounts---of 5
kern_mount---of 3
kern_unmount---of 5
kern_unmount_array---of 24
lock_mnt_tree---of 8
lock_mount_hash100%of 1
lookup_mnt22%of 28
m_next---of 3
m_show---of 1
m_start---of 7
m_stop---of 1
mark_mounts_for_expiry---of 16
may_mount100%of 1
may_umount---of 1
may_umount_tree---of 13
mnt_add_count100%of 1
mnt_add_to_ns---of 6
mnt_change_mountpoint---of 17
mnt_clone_internal---of 3
mnt_drop_write67%of 3
mnt_drop_write_file50%of 4
mnt_get_count60%of 5
mnt_get_write_access40%of 10
mnt_get_write_access_file80%of 5
mnt_make_shortterm---of 3
mnt_may_suid---of 4
mnt_put_write_access67%of 3
mnt_put_write_access_file75%of 4
mnt_release_group_id---of 1
mnt_set_expiry---of 3
mnt_set_mountpoint---of 3
mnt_want_write67%of 3
mnt_want_write_file34%of 6
mnt_warn_timestamp_expiry29%of 7
mntget67%of 3
mntns_get---of 5
mntns_install---of 11
mntns_owner---of 1
mntns_put---of 1
mntput75%of 4
mntput_no_expire15%of 27
mount_subtree---of 13
mount_too_revealing10%of 20
namespace_unlock58%of 14
our_mnt---of 1
path_is_mountpoint---of 21
path_is_under---of 5
path_mount12%of 52
path_overmounted---of 19
path_umount---of 50
put_mnt_ns---of 4
rcu_read_unlock34%of 6
read_seqbegin50%of 10
sb_end_write40%of 10
sb_prepare_remount_readonly---of 19
sb_start_write40%of 10
set_mount_attributes---of 3
show_path---of 3
tree_contains_unbindable23%of 9
umount_tree28%of 51
unhash_mnt---of 9
unlock_mount_hash100%of 1
vfs_create_mount45%of 9
vfs_kern_mount---of 9
vfs_submount---of 3
-----------
SUMMARY28%of 991

-----------
SUMMARY---of 0

__bpf_trace_hrtimer_class---of 1
__bpf_trace_hrtimer_expire_entry---of 1
__bpf_trace_hrtimer_init---of 1
__bpf_trace_hrtimer_start---of 1
__bpf_trace_itimer_expire---of 1
__bpf_trace_itimer_state---of 1
__bpf_trace_tick_stop---of 1
__bpf_trace_timer_base_idle---of 1
__bpf_trace_timer_class---of 1
__bpf_trace_timer_expire_entry---of 1
__bpf_trace_timer_start---of 1
__get_next_timer_interrupt---of 31
__mod_timer24%of 43
__probestub_hrtimer_cancel---of 1
__probestub_hrtimer_expire_entry---of 1
__probestub_hrtimer_expire_exit---of 1
__probestub_hrtimer_init---of 1
__probestub_hrtimer_start---of 1
__probestub_itimer_expire---of 1
__probestub_itimer_state---of 1
__probestub_tick_stop---of 1
__probestub_timer_base_idle---of 1
__probestub_timer_cancel---of 1
__probestub_timer_expire_entry---of 1
__probestub_timer_expire_exit---of 1
__probestub_timer_init---of 1
__probestub_timer_start---of 1
__round_jiffies---of 1
__round_jiffies_relative---of 1
__round_jiffies_up---of 1
__round_jiffies_up_relative---of 1
__run_timer_base---of 31
__timer_delete60%of 10
__timer_delete_sync47%of 13
__traceiter_hrtimer_cancel---of 4
__traceiter_hrtimer_expire_entry---of 4
__traceiter_hrtimer_expire_exit---of 4
__traceiter_hrtimer_init---of 4
__traceiter_hrtimer_start---of 4
__traceiter_itimer_expire---of 4
__traceiter_itimer_state---of 4
__traceiter_tick_stop---of 4
__traceiter_timer_base_idle---of 4
__traceiter_timer_cancel---of 4
__traceiter_timer_expire_entry---of 4
__traceiter_timer_expire_exit---of 4
__traceiter_timer_init---of 4
__traceiter_timer_start---of 4
__try_to_del_timer_sync40%of 10
add_timer---of 3
add_timer_global67%of 3
add_timer_local---of 3
add_timer_on---of 13
calc_wheel_index60%of 10
call_timer_fn---of 35
destroy_timer_on_stack---of 1
detach_timer32%of 19
enqueue_timer31%of 23
fetch_next_timer_interrupt---of 15
fetch_next_timer_interrupt_remote---of 9
get_next_timer_interrupt---of 1
init_timer_key30%of 17
init_timer_on_stack_key---of 3
lock_timer_base38%of 8
mod_timer100%of 1
mod_timer_pending---of 1
msleep---of 4
msleep_interruptible---of 6
next_expiry_recalc---of 9
perf_trace_hrtimer_class---of 8
perf_trace_hrtimer_expire_entry---of 8
perf_trace_hrtimer_init---of 8
perf_trace_hrtimer_start---of 8
perf_trace_itimer_expire---of 9
perf_trace_itimer_state---of 8
perf_trace_tick_stop---of 8
perf_trace_timer_base_idle---of 8
perf_trace_timer_class---of 8
perf_trace_timer_expire_entry---of 8
perf_trace_timer_start---of 8
process_timeout---of 1
round_jiffies---of 1
round_jiffies_relative---of 1
round_jiffies_up---of 1
round_jiffies_up_relative---of 1
run_timer_softirq---of 9
schedule_timeout40%of 5
schedule_timeout_idle---of 1
schedule_timeout_interruptible---of 1
schedule_timeout_killable---of 1
schedule_timeout_uninterruptible---of 1
stub_timer---of 1
timer_base_is_idle---of 1
timer_base_try_to_set_idle---of 3
timer_clear_idle---of 1
timer_debug_hint---of 4
timer_delete100%of 1
timer_delete_sync---of 1
timer_expire_remote---of 3
timer_fixup_activate---of 4
timer_fixup_assert_init---of 3
timer_fixup_free---of 3
timer_fixup_init---of 3
timer_is_static_object---of 3
timer_lock_remote_bases---of 7
timer_migration_handler---of 5
timer_reduce---of 1
timer_shutdown---of 1
timer_shutdown_sync---of 1
timer_unlock_remote_bases---of 3
timer_update_keys---of 4
timers_dead_cpu---of 19
timers_prepare_cpu---of 7
timers_update_nohz---of 1
trace_event_raw_event_hrtimer_class---of 7
trace_event_raw_event_hrtimer_expire_entry---of 7
trace_event_raw_event_hrtimer_init---of 7
trace_event_raw_event_hrtimer_start---of 7
trace_event_raw_event_itimer_expire---of 8
trace_event_raw_event_itimer_state---of 7
trace_event_raw_event_tick_stop---of 7
trace_event_raw_event_timer_base_idle---of 7
trace_event_raw_event_timer_class---of 7
trace_event_raw_event_timer_expire_entry---of 7
trace_event_raw_event_timer_start---of 7
trace_raw_output_hrtimer_class---of 3
trace_raw_output_hrtimer_expire_entry---of 3
trace_raw_output_hrtimer_init---of 3
trace_raw_output_hrtimer_start---of 3
trace_raw_output_itimer_expire---of 3
trace_raw_output_itimer_state---of 3
trace_raw_output_tick_stop---of 3
trace_raw_output_timer_base_idle---of 3
trace_raw_output_timer_class---of 3
trace_raw_output_timer_expire_entry---of 3
trace_raw_output_timer_start---of 3
trace_timer_base_idle---of 15
try_to_del_timer_sync---of 1
update_process_times---of 10
usleep_range_state---of 5
-----------
SUMMARY37%of 163

__bpf_trace_wbt_lat---of 1
__bpf_trace_wbt_stat---of 1
__bpf_trace_wbt_step---of 1
__bpf_trace_wbt_timer---of 1
__probestub_wbt_lat---of 1
__probestub_wbt_stat---of 1
__probestub_wbt_step---of 1
__probestub_wbt_timer---of 1
__traceiter_wbt_lat---of 4
__traceiter_wbt_stat---of 4
__traceiter_wbt_step---of 4
__traceiter_wbt_timer---of 4
perf_trace_wbt_lat---of 8
perf_trace_wbt_stat---of 8
perf_trace_wbt_step---of 8
perf_trace_wbt_timer---of 8
rwb_trace_step---of 15
scale_down---of 5
scale_up---of 11
trace_event_raw_event_wbt_lat---of 7
trace_event_raw_event_wbt_stat---of 7
trace_event_raw_event_wbt_step---of 7
trace_event_raw_event_wbt_timer---of 7
trace_raw_output_wbt_lat---of 3
trace_raw_output_wbt_stat---of 3
trace_raw_output_wbt_step---of 3
trace_raw_output_wbt_timer---of 3
trace_wbt_lat---of 15
trace_wbt_stat---of 15
wb_timer_fn---of 49
wbt_background_show---of 1
wbt_cleanup---of 8
wbt_cleanup_cb---of 1
wbt_curr_win_nsec_show---of 1
wbt_data_dir100%of 1
wbt_default_latency_nsec---of 1
wbt_disable_default---of 6
wbt_disabled---of 6
wbt_done34%of 12
wbt_enable_default---of 12
wbt_enabled_show---of 1
wbt_exit---of 1
wbt_get_min_lat---of 5
wbt_id_show---of 1
wbt_inflight_cb34%of 9
wbt_inflight_show---of 1
wbt_init---of 7
wbt_issue67%of 6
wbt_min_lat_nsec_show---of 1
wbt_normal_show---of 1
wbt_queue_depth_changed---of 3
wbt_requeue---of 5
wbt_rqw_done---of 10
wbt_set_min_lat---of 5
wbt_track50%of 8
wbt_unknown_cnt_show---of 1
wbt_update_limits---of 10
wbt_wait54%of 13
-----------
SUMMARY47%of 49

__blkdev_direct_IO---of 30
blkdev_bio_end_io---of 12
blkdev_bio_end_io_async---of 6
blkdev_direct_IO---of 44
blkdev_direct_write---of 6
blkdev_fallocate---of 20
blkdev_fsync---of 3
blkdev_get_block67%of 3
blkdev_iomap_begin---of 5
blkdev_llseek---of 1
blkdev_mmap---of 5
blkdev_open45%of 9
blkdev_read_folio---of 1
blkdev_read_iter---of 16
blkdev_readahead100%of 1
blkdev_release---of 1
blkdev_write_begin---of 1
blkdev_write_end---of 14
blkdev_write_iter---of 19
blkdev_writepages100%of 1
file_to_blk_mode---of 3
generic_write_sync---of 7
-----------
SUMMARY58%of 14

userio_char_open---of 4
userio_char_poll50%of 4
userio_char_read---of 17
userio_char_release---of 3
userio_char_write---of 12
userio_device_write---of 5
-----------
SUMMARY50%of 4

hfs_cat_build_key67%of 3
hfs_cat_build_record---of 3
hfs_cat_create---of 13
hfs_cat_delete---of 17
hfs_cat_find_brec---of 5
hfs_cat_keycmp100%of 3
hfs_cat_move---of 22
-----------
SUMMARY84%of 6

-----------
SUMMARY---of 0

__change_page_attr_set_clr---of 105
__cpa_flush_all---of 4
__cpa_flush_tlb---of 7
__cpa_process_fault---of 41
__set_memory_prot---of 1
__unmap_pmd_range---of 22
_set_memory_uc---of 1
_set_memory_wb---of 1
_set_memory_wc---of 3
_set_memory_wt---of 1
_set_pages_array---of 12
arch_invalidate_pmem---of 4
arch_report_meminfo---of 3
change_page_attr_set_clr---of 40
clear_mce_nospec---of 1
clflush_cache_range---of 4
cpu_cache_has_invalidate_memregion---of 4
cpu_cache_invalidate_memregion---of 4
kernel_page_present---of 1
lookup_address100%of 1
lookup_address_in_pgd---of 1
lookup_address_in_pgd_attr25%of 8
lookup_pmd_address---of 4
populate_pmd---of 22
populate_pte---of 8
set_direct_map_default_noflush---of 1
set_direct_map_invalid_noflush---of 1
set_mce_nospec---of 4
set_memory_4k---of 1
set_memory_decrypted---of 1
set_memory_encrypted---of 1
set_memory_global---of 1
set_memory_nonglobal---of 1
set_memory_np---of 1
set_memory_np_noalias---of 1
set_memory_nx---of 3
set_memory_p---of 1
set_memory_ro---of 1
set_memory_rox---of 1
set_memory_rw---of 1
set_memory_uc---of 4
set_memory_wb---of 3
set_memory_wc---of 5
set_memory_x---of 3
set_pages_array_uc---of 1
set_pages_array_wb---of 5
set_pages_array_wc---of 1
set_pages_ro---of 1
set_pages_rw---of 1
set_pages_uc---of 1
set_pages_wb---of 3
slow_virt_to_phys---of 5
static_protections---of 21
unmap_pmd_range---of 19
update_page_count---of 3
-----------
SUMMARY34%of 9

btrfs_del_inode_ref---of 16
btrfs_find_name_in_backref---of 7
btrfs_find_name_in_ext_backref---of 8
btrfs_insert_empty_inode---of 1
btrfs_insert_inode_ref---of 15
btrfs_lookup_inode25%of 8
btrfs_lookup_inode_extref---of 4
btrfs_truncate_inode_items---of 96
-----------
SUMMARY25%of 8

-----------
SUMMARY---of 0

___ratelimit32%of 19
-----------
SUMMARY32%of 19

chroot_fs_refs69%of 22
copy_fs_struct---of 3
current_umask100%of 1
exit_fs---of 4
free_fs_struct---of 1
set_fs_pwd50%of 6
set_fs_root50%of 6
unshare_fs_struct---of 4
-----------
SUMMARY63%of 35

blk_alloc_queue_stats---of 3
blk_free_queue_stats---of 4
blk_rq_stat_add---of 1
blk_rq_stat_init---of 1
blk_rq_stat_sum---of 3
blk_stat_add30%of 20
blk_stat_add_callback---of 10
blk_stat_alloc_callback---of 5
blk_stat_disable_accounting---of 4
blk_stat_enable_accounting---of 4
blk_stat_free_callback---of 3
blk_stat_free_callback_rcu---of 1
blk_stat_remove_callback---of 6
blk_stat_timer_fn---of 13
-----------
SUMMARY30%of 20

__percpu_ref_switch_mode---of 20
percpu_ref_exit---of 7
percpu_ref_get---of 13
percpu_ref_init40%of 10
percpu_ref_is_zero---of 4
percpu_ref_kill_and_confirm---of 4
percpu_ref_noop_confirm_switch---of 1
percpu_ref_put---of 14
percpu_ref_reinit---of 6
percpu_ref_resurrect---of 5
percpu_ref_switch_to_atomic---of 1
percpu_ref_switch_to_atomic_rcu---of 17
percpu_ref_switch_to_atomic_sync---of 5
percpu_ref_switch_to_percpu---of 1
-----------
SUMMARY40%of 10

udf_free_inode100%of 1
udf_new_inode---of 15
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__bpf_trace_emulate_vsyscall---of 1
__probestub_emulate_vsyscall---of 1
__traceiter_emulate_vsyscall---of 4
emulate_vsyscall4%of 50
gate_vma_name---of 1
get_gate_vma50%of 4
in_gate_area---of 6
in_gate_area_no_mm100%of 1
perf_trace_emulate_vsyscall---of 8
secure_computing---of 3
trace_event_raw_event_emulate_vsyscall---of 7
trace_raw_output_emulate_vsyscall---of 3
warn_bad_vsyscall---of 4
write_ok_or_segv---of 3
-----------
SUMMARY10%of 55

-----------
SUMMARY---of 0

__get_task_comm67%of 3
__ia32_compat_sys_execve---of 1
__ia32_compat_sys_execveat---of 1
__ia32_sys_execve---of 1
__ia32_sys_execveat---of 1
__register_binfmt---of 5
__set_task_comm---of 17
__x64_compat_sys_execve---of 1
__x64_compat_sys_execveat---of 1
__x64_sys_execve---of 1
__x64_sys_execveat---of 1
acct_arg_size---of 3
alloc_bprm---of 35
begin_new_exec---of 67
bprm_change_interp---of 3
bprm_execve---of 87
cgroup_threadgroup_change_begin---of 10
cgroup_threadgroup_change_end---of 10
copy_string_kernel---of 8
copy_strings---of 23
count---of 12
do_execveat_common---of 18
do_open_execat---of 14
exec_mmap---of 22
finalize_exec---of 1
free_bprm---of 12
get_arg_page---of 23
kernel_execve---of 38
mmap_write_unlock---of 6
open_exec---of 3
path_noexec67%of 3
proc_dointvec_minmax_coredump---of 3
put_arg_page---of 14
remove_arg_zero---of 7
set_binfmt---of 5
set_dumpable---of 4
setup_arg_pages---of 38
setup_new_exec---of 3
unregister_binfmt---of 3
unshare_sighand---of 4
would_dump---of 16
-----------
SUMMARY67%of 6

-----------
SUMMARY---of 0

init_once---of 1
is_legacy_ntfs100%of 1
lock_buffer---of 3
ntfs3_kill_sb---of 3
ntfs3_label_open---of 1
ntfs3_label_show---of 1
ntfs3_label_write---of 10
ntfs3_put_sbi---of 11
ntfs3_volinfo---of 1
ntfs3_volinfo_open---of 1
ntfs_alloc_inode67%of 3
ntfs_discard---of 11
ntfs_export_get_inode50%of 4
ntfs_fh_to_dentry100%of 1
ntfs_fh_to_parent---of 1
ntfs_fill_super---of 156
ntfs_free_inode---of 1
ntfs_fs_free---of 5
ntfs_fs_get_tree---of 1
ntfs_fs_parse_param---of 26
ntfs_fs_reconfigure---of 14
ntfs_init_fs_context---of 6
ntfs_inode_printk---of 30
ntfs_map_page---of 3
ntfs_nfs_commit_metadata---of 1
ntfs_printk---of 23
ntfs_put_shared---of 25
ntfs_put_super---of 3
ntfs_set_shared---of 37
ntfs_show_options---of 29
ntfs_shutdown---of 1
ntfs_statfs---of 1
ntfs_sync_fs---of 10
ntfs_unmap_meta---of 8
ntfs_unmap_page---of 14
sb_getblk---of 1
-----------
SUMMARY67%of 9

-----------
SUMMARY---of 0

bdi_alloc---of 4
bdi_debug_stats_open---of 1
bdi_debug_stats_show---of 18
bdi_dev_name---of 3
bdi_get_by_id---of 13
bdi_init---of 3
bdi_put---of 9
bdi_register---of 1
bdi_register_va---of 32
bdi_set_owner---of 3
bdi_unregister---of 19
cgwb_debug_stats_open---of 1
cgwb_debug_stats_show---of 29
cgwb_free_rcu---of 1
cgwb_kill---of 12
cgwb_release---of 1
cgwb_release_workfn---of 11
cleanup_offline_cgwbs_workfn---of 19
collect_wb_stats---of 15
css_get---of 14
inode_to_bdi75%of 4
max_bytes_show---of 1
max_bytes_store---of 3
max_ratio_fine_show---of 1
max_ratio_fine_store---of 3
max_ratio_show---of 1
max_ratio_store---of 3
min_bytes_show---of 1
min_bytes_store---of 3
min_ratio_fine_show---of 1
min_ratio_fine_store---of 3
min_ratio_show---of 1
min_ratio_store---of 3
percpu_ref_put_many---of 14
read_ahead_kb_show---of 1
read_ahead_kb_store---of 3
stable_pages_required_show---of 3
strict_limit_show---of 1
strict_limit_store---of 3
wb_blkcg_offline---of 4
wb_get_create---of 32
wb_get_lookup---of 18
wb_init---of 4
wb_memcg_offline---of 4
wb_shutdown---of 6
wb_tryget---of 17
wb_update_bandwidth_workfn---of 1
-----------
SUMMARY75%of 4

-----------
SUMMARY---of 0

__ia32_sys_quotactl---of 1
__ia32_sys_quotactl_fd---of 1
__se_sys_quotactl11%of 56
__se_sys_quotactl_fd---of 34
__x64_sys_quotactl100%of 1
__x64_sys_quotactl_fd---of 1
compat_copy_fs_qfilestat---of 3
copy_qcinfo_from_xfs_dqblk---of 13
do_quotactl---of 40
qtype_enforce_flag---of 5
quota_disable---of 4
quota_enable---of 4
quota_getfmt---of 3
quota_getinfo---of 8
quota_getnextquota---of 9
quota_getnextxquota---of 16
quota_getquota---of 13
quota_getxquota---of 16
quota_getxstate---of 23
quota_getxstatev---of 14
quota_quotaoff---of 9
quota_quotaon---of 10
quota_rmxquota---of 4
quota_setinfo---of 12
quota_setquota---of 20
quota_setxquota---of 39
quota_sync_one58%of 7
-----------
SUMMARY18%of 64

__dump_page_owner---of 12
__folio_copy_owner---of 13
__reset_page_owner39%of 13
__set_page_owner43%of 14
__set_page_owner_migrate_reason---of 3
__split_page_owner---of 6
lseek_page_owner---of 4
page_owner_stack_open---of 1
page_owner_threshold_get---of 1
page_owner_threshold_set---of 1
pagetypeinfo_showmixedcount_print---of 24
pfn_valid---of 29
proc_page_owner_threshold_open---of 1
read_page_owner---of 66
register_dummy_stack---of 1
register_early_stack---of 1
register_failure_stack---of 1
save_stack75%of 4
stack_next---of 3
stack_print---of 7
stack_start---of 4
stack_stop---of 1
-----------
SUMMARY46%of 31

char2uni50%of 6
uni2char20%of 10
-----------
SUMMARY32%of 16

-----------
SUMMARY---of 0

__ia32_sys_memfd_create---of 1
__se_sys_memfd_create15%of 41
__x64_sys_memfd_create100%of 1
memfd_fcntl---of 81
memfd_folio_has_extra_refs---of 24
-----------
SUMMARY17%of 42

__btrfs_check_leaf---of 227
__btrfs_check_node---of 10
block_group_err---of 1
btrfs_check_chunk_valid---of 42
btrfs_check_eb_owner---of 8
btrfs_check_leaf---of 1
btrfs_check_node---of 1
btrfs_verify_level_key45%of 9
check_inode_key---of 13
check_prev_ino---of 11
check_root_key---of 12
chunk_err---of 6
dev_item_err---of 1
dir_item_err---of 1
extent_err---of 5
file_extent_err---of 1
generic_err---of 1
-----------
SUMMARY45%of 9

-----------
SUMMARY---of 0

__anon_vma_interval_tree_augment_rotate---of 5
anon_vma_interval_tree_insert29%of 7
anon_vma_interval_tree_iter_first---of 12
anon_vma_interval_tree_iter_next---of 16
anon_vma_interval_tree_remove15%of 42
anon_vma_interval_tree_verify60%of 5
vma_interval_tree_augment_rotate60%of 5
vma_interval_tree_insert86%of 7
vma_interval_tree_insert_after---of 10
vma_interval_tree_iter_first25%of 12
vma_interval_tree_iter_next19%of 16
vma_interval_tree_remove46%of 42
-----------
SUMMARY34%of 136

-----------
SUMMARY---of 0

HAS_UNMAPPED_ID67%of 3
__check_sticky---of 4
__filename_parentat50%of 22
__ia32_sys_link---of 1
__ia32_sys_linkat---of 1
__ia32_sys_mkdir---of 1
__ia32_sys_mkdirat---of 1
__ia32_sys_mknod---of 1
__ia32_sys_mknodat---of 1
__ia32_sys_rename---of 1
__ia32_sys_renameat---of 1
__ia32_sys_renameat2---of 1
__ia32_sys_rmdir---of 1
__ia32_sys_symlink---of 1
__ia32_sys_symlinkat---of 1
__ia32_sys_unlink---of 1
__ia32_sys_unlinkat---of 4
__kern_path_locked---of 5
__lookup_slow34%of 12
__traverse_mounts39%of 26
__x64_sys_link100%of 1
__x64_sys_linkat100%of 1
__x64_sys_mkdir100%of 1
__x64_sys_mkdirat100%of 1
__x64_sys_mknod100%of 1
__x64_sys_mknodat100%of 1
__x64_sys_rename100%of 1
__x64_sys_renameat100%of 1
__x64_sys_renameat2100%of 1
__x64_sys_rmdir100%of 1
__x64_sys_symlink100%of 1
__x64_sys_symlinkat100%of 1
__x64_sys_unlink100%of 1
__x64_sys_unlinkat50%of 4
check_acl---of 9
choose_mountpoint21%of 34
choose_mountpoint_rcu---of 9
complete_walk39%of 13
d_delete_notify50%of 4
do_file_open_root48%of 17
do_filp_open55%of 11
do_linkat22%of 19
do_mkdirat45%of 9
do_mknodat24%of 25
do_o_path---of 5
do_renameat224%of 43
do_rmdir34%of 12
do_symlinkat58%of 7
do_tmpfile---of 7
do_unlinkat23%of 22
done_path_create---of 1
dont_mount100%of 1
drop_links---of 6
filename_create34%of 9
filename_lookup50%of 18
follow_down---of 5
follow_down_one---of 4
follow_up---of 4
fsnotify_create---of 7
fsnotify_link60%of 10
fsnotify_link_count50%of 4
fsnotify_move56%of 20
full_name_hash80%of 5
generic_permission37%of 22
getname100%of 1
getname_flags35%of 20
getname_kernel34%of 9
getname_uflags---of 1
handle_dots24%of 43
handle_lookup_down---of 4
hashlen_string---of 4
inode_permission50%of 20
kern_path100%of 1
kern_path_create---of 1
kern_path_locked---of 1
kernel_tmpfile_open---of 4
leave_rcu34%of 6
legitimize_links23%of 9
legitimize_path---of 5
link_path_walk61%of 38
lock_rename---of 3
lock_rename_child---of 6
lock_two_directories---of 7
lookup_fast32%of 16
lookup_one---of 9
lookup_one_common32%of 16
lookup_one_len---of 9
lookup_one_len_unlocked---of 1
lookup_one_positive_unlocked---of 4
lookup_one_qstr_excl40%of 10
lookup_one_unlocked29%of 7
lookup_positive_unlocked---of 4
lookup_slow100%of 1
may_create28%of 11
may_delete32%of 25
may_linkat23%of 9
may_open31%of 26
may_open_dev67%of 3
nd_alloc_stack---of 3
nd_jump_link---of 6
nd_jump_root31%of 13
page_get_link---of 20
page_put_link---of 1
page_readlink---of 3
page_symlink---of 7
path_get67%of 3
path_init37%of 57
path_lookupat53%of 17
path_openat37%of 158
path_parentat---of 4
path_pts---of 9
path_put100%of 1
pick_link18%of 45
put_link---of 5
put_page---of 14
putname34%of 6
rcu_read_unlock---of 6
readlink_copy50%of 4
seqcount_lockdep_reader_access58%of 7
set_root42%of 12
step_into32%of 44
terminate_walk75%of 12
try_break_deleg29%of 7
try_lookup_one_len---of 9
try_to_unlazy25%of 16
try_to_unlazy_next---of 15
unlock_rename---of 3
user_path_at_empty100%of 1
user_path_create---of 1
user_path_locked_at---of 1
vfs_create36%of 14
vfs_get_link---of 4
vfs_link17%of 18
vfs_mkdir30%of 17
vfs_mknod---of 18
vfs_mkobj---of 10
vfs_path_lookup---of 1
vfs_path_parent_lookup---of 1
vfs_readlink50%of 10
vfs_rename29%of 64
vfs_rmdir36%of 17
vfs_symlink37%of 11
vfs_tmpfile---of 11
vfs_unlink22%of 23
walk_component36%of 14
-----------
SUMMARY37%of 1247

-----------
SUMMARY---of 0

__pte_offset_map38%of 16
__pte_offset_map_lock40%of 5
pgd_clear_bad---of 1
pgtable_trans_huge_deposit---of 5
pgtable_trans_huge_withdraw---of 5
pmd_clear_bad---of 1
pmdp_collapse_flush---of 7
pmdp_huge_clear_flush---of 7
pmdp_invalidate---of 7
pte_free_defer---of 1
pte_free_now---of 18
pte_offset_map_nolock67%of 3
pte_unmap---of 6
ptep_clear_flush43%of 7
pud_clear_bad---of 1
pudp_huge_clear_flush---of 7
rcu_read_unlock---of 6
-----------
SUMMARY42%of 31

__btrfs_prealloc_file_range---of 27
__btrfs_unlink_inode---of 23
__cow_file_range_inline---of 34
_compound_head---of 7
acls_after_inode_item19%of 16
add_async_extent---of 4
btrfs_add_delayed_iput20%of 10
btrfs_add_link---of 26
btrfs_add_swapfile_pin---of 13
btrfs_alloc_inode40%of 10
btrfs_assert_inode_range_clean---of 3
btrfs_check_sector_csum---of 3
btrfs_cleanup_ordered_extents---of 18
btrfs_clear_delalloc_extent45%of 20
btrfs_cont_expand---of 24
btrfs_create---of 3
btrfs_create_common---of 7
btrfs_create_dio_extent---of 8
btrfs_create_new_inode---of 95
btrfs_data_csum_ok---of 14
btrfs_del_delalloc_inode34%of 12
btrfs_delete_subvolume---of 44
btrfs_dentry_delete38%of 8
btrfs_destroy_cachep---of 1
btrfs_destroy_inode---of 31
btrfs_dio_end_io---of 7
btrfs_dio_iomap_begin---of 44
btrfs_dio_iomap_end---of 8
btrfs_dio_read---of 1
btrfs_dio_submit_io---of 12
btrfs_dio_write---of 1
btrfs_dir_llseek---of 5
btrfs_dirty_inode30%of 10
btrfs_do_encoded_write---of 55
btrfs_drop_inode---of 5
btrfs_encoded_io_compression_from_extent---of 7
btrfs_encoded_read---of 43
btrfs_encoded_read_endio---of 5
btrfs_encoded_read_regular---of 15
btrfs_encoded_read_regular_fill_pages---of 9
btrfs_evict_inode---of 46
btrfs_extent_readonly---of 3
btrfs_fiemap---of 8
btrfs_find_actor---of 3
btrfs_find_first_inode---of 14
btrfs_finish_one_ordered---of 69
btrfs_finish_ordered_io---of 5
btrfs_free_inode---of 1
btrfs_get_blocks_direct_write---of 31
btrfs_get_extent9%of 73
btrfs_getattr64%of 11
btrfs_iget---of 1
btrfs_iget_path32%of 29
btrfs_init_locked_inode34%of 12
btrfs_inode_lock43%of 7
btrfs_inode_unlock60%of 5
btrfs_invalidate_folio---of 33
btrfs_link---of 19
btrfs_lookup100%of 1
btrfs_lookup_dentry9%of 36
btrfs_merge_delalloc_extent---of 8
btrfs_migrate_folio---of 21
btrfs_mkdir---of 3
btrfs_mknod---of 3
btrfs_mod_outstanding_extents27%of 19
btrfs_new_inode_args_destroy---of 9
btrfs_new_inode_prepare---of 11
btrfs_new_subvol_inode---of 3
btrfs_opendir---of 8
btrfs_orphan_add---of 6
btrfs_orphan_cleanup---of 36
btrfs_permission23%of 9
btrfs_prealloc_file_range---of 1
btrfs_prealloc_file_range_trans---of 1
btrfs_print_data_csum_error---of 22
btrfs_real_readdir33%of 34
btrfs_release_delalloc_bytes---of 3
btrfs_release_folio---of 17
btrfs_rename2---of 142
btrfs_rmdir---of 11
btrfs_run_delalloc_range7%of 45
btrfs_run_delalloc_work---of 3
btrfs_run_delayed_iputs---of 6
btrfs_set_delalloc_extent27%of 19
btrfs_set_extent_delalloc---of 13
btrfs_set_inode_index---of 5
btrfs_set_inode_index_count---of 8
btrfs_set_range_writeback23%of 27
btrfs_setattr7%of 49
btrfs_split_delalloc_extent---of 8
btrfs_start_delalloc_roots---of 26
btrfs_start_delalloc_snapshot---of 3
btrfs_swap_activate---of 54
btrfs_swap_deactivate---of 7
btrfs_symlink---of 15
btrfs_tmpfile---of 8
btrfs_truncate_block---of 58
btrfs_unlink---of 7
btrfs_unlink_inode---of 3
btrfs_unlink_subvol---of 31
btrfs_update_inode34%of 6
btrfs_update_inode_bytes---of 6
btrfs_update_inode_fallback---of 3
btrfs_update_inode_item---of 4
btrfs_update_time50%of 4
btrfs_wait_on_delayed_iputs---of 7
btrfs_writepage_cow_fixup10%of 20
btrfs_writepage_fixup_worker---of 62
can_cow_file_range_inline---of 6
can_nocow_extent---of 21
can_nocow_file_extent---of 21
compress_file_range---of 53
cow_file_range22%of 41
cow_file_range_inline---of 8
create_io_em20%of 21
data_reloc_print_warning_inode---of 8
evict_refill_and_join---of 6
fallback_to_cow8%of 40
fill_inode_item---of 1
folio_size---of 10
folio_zero_range---of 14
get_extent_allocation_hint40%of 5
init_once---of 1
inode_tree_add34%of 9
insert_ordered_extent_file_extent---of 6
insert_reserved_file_extent---of 19
may_destroy_subvol---of 10
put_page29%of 14
read_inline_extent---of 19
run_delalloc_cow---of 5
run_delalloc_nocow10%of 51
run_delayed_iput_locked---of 5
start_delalloc_inodes---of 30
submit_compressed_extents---of 49
submit_uncompressed_range---of 9
trace_btrfs_get_extent_show_fi_inline---of 15
uncompress_inline---of 4
wait_subpage_spinlock---of 17
-----------
SUMMARY21%of 673

tomoyo_assign_domain---of 26
tomoyo_assign_namespace---of 27
tomoyo_check_acl62%of 13
tomoyo_dump_page---of 27
tomoyo_find_next_domain---of 73
tomoyo_update_domain---of 23
tomoyo_update_policy---of 12
tomoyo_write_aggregator---of 21
tomoyo_write_transition_control---of 34
-----------
SUMMARY62%of 13

__ext4_expand_extra_isize---of 8
__ext4_get_inode_loc13%of 39
__ext4_iget4%of 120
__ext4_journalled_invalidate_folio---of 23
__ext4_mark_inode_dirty21%of 29
_ext4_get_block47%of 13
check_igot_inode---of 7
do_journal_get_write_access---of 7
ext4_alloc_da_blocks---of 17
ext4_begin_ordered_truncate30%of 17
ext4_block_write_begin30%of 89
ext4_block_zero_page_range---of 42
ext4_blocks_for_truncate---of 6
ext4_bmap---of 8
ext4_bread38%of 8
ext4_bread_batch18%of 29
ext4_break_layouts23%of 9
ext4_buffer_uptodate---of 4
ext4_can_truncate---of 5
ext4_change_inode_journal_flag---of 23
ext4_chksum---of 4
ext4_chunk_trans_blocks50%of 6
ext4_da_get_block_prep35%of 52
ext4_da_release_space37%of 22
ext4_da_reserve_space25%of 24
ext4_da_update_reserve_space29%of 28
ext4_da_write_begin23%of 35
ext4_da_write_end27%of 56
ext4_dax_writepages---of 3
ext4_dio_alignment30%of 10
ext4_dirty_folio23%of 18
ext4_dirty_inode67%of 3
ext4_do_writepages38%of 155
ext4_es_is_delayed100%of 1
ext4_es_is_delonly100%of 1
ext4_es_is_mapped100%of 1
ext4_evict_inode28%of 59
ext4_expand_extra_isize---of 13
ext4_file_getattr---of 8
ext4_fill_raw_inode58%of 49
ext4_get_block---of 1
ext4_get_block_unwritten80%of 5
ext4_get_fc_inode_loc---of 1
ext4_get_inode_loc67%of 3
ext4_get_projid---of 3
ext4_get_reserved_space100%of 1
ext4_getattr29%of 32
ext4_getblk40%of 20
ext4_has_group_desc_csum---of 7
ext4_has_metadata_csum---of 6
ext4_iget_extra_inode---of 8
ext4_inode_attach_jinode29%of 7
ext4_inode_blocks---of 5
ext4_inode_csum24%of 17
ext4_inode_csum_set---of 10
ext4_inode_csum_verify---of 10
ext4_inode_is_fast_symlink37%of 11
ext4_inode_set_iversion_queried---of 3
ext4_invalidate_folio28%of 18
ext4_iomap_begin42%of 34
ext4_iomap_begin_report---of 16
ext4_iomap_end100%of 1
ext4_iomap_overwrite_begin---of 4
ext4_iomap_swap_activate---of 1
ext4_issue_zeroout---of 8
ext4_journal_folio_buffers---of 13
ext4_journalled_dirty_folio---of 19
ext4_journalled_invalidate_folio---of 3
ext4_journalled_write_end---of 65
ext4_journalled_zero_new_buffers---of 12
ext4_map_blocks57%of 71
ext4_mark_iloc_dirty24%of 76
ext4_normal_submit_inode_data_buffers---of 1
ext4_page_mkwrite18%of 41
ext4_print_free_blocks---of 7
ext4_punch_hole---of 48
ext4_read_folio---of 19
ext4_readahead50%of 4
ext4_release_folio30%of 24
ext4_reserve_inode_write29%of 7
ext4_set_aops63%of 8
ext4_set_inode_flags32%of 16
ext4_set_iomap38%of 29
ext4_setattr40%of 80
ext4_should_dioread_nolock---of 6
ext4_truncate33%of 71
ext4_update_disksize_before_punch---of 11
ext4_update_inode_fsync_trans---of 7
ext4_wait_for_tail_page_commit---of 15
ext4_walk_page_buffers---of 8
ext4_write_begin24%of 59
ext4_write_end34%of 50
ext4_write_inode24%of 17
ext4_writepage_trans_blocks63%of 8
ext4_writepages60%of 5
ext4_zero_partial_blocks---of 12
folio_lock45%of 9
folio_size30%of 10
folio_test_uptodate---of 9
i_gid_needs_update100%of 3
lock_buffer---of 3
mpage_folio_done34%of 9
mpage_prepare_extent_to_map33%of 101
mpage_process_page_bufs66%of 29
mpage_release_unused_pages28%of 65
mpage_submit_folio58%of 7
percpu_down_read40%of 10
percpu_up_read40%of 10
trace_ext4_load_inode---of 15
trace_ext4_writepages27%of 15
trace_ext4_writepages_result27%of 15
wait_on_buffer---of 3
write_end_fn---of 5
zero_user_segments48%of 17
-----------
SUMMARY32%of 1888

V2_minix_blocks---of 7
V2_minix_get_block100%of 1
V2_minix_truncate19%of 49
block_to_path7%of 31
free_branches---of 12
get_block8%of 68
-----------
SUMMARY12%of 149

-----------
SUMMARY---of 0

__mpage_writepage19%of 96
bio_first_folio34%of 18
bio_next_folio20%of 10
do_mpage_readpage27%of 96
folio_size30%of 10
folio_zero_segment50%of 14
mpage_read_end_io25%of 16
mpage_read_folio---of 3
mpage_readahead31%of 26
mpage_write_end_io---of 15
mpage_writepages67%of 3
-----------
SUMMARY26%of 289

-----------
SUMMARY---of 0

arch_bp_generic_fields---of 11
arch_check_bp_in_kernelspace---of 6
arch_install_hw_breakpoint---of 23
arch_uninstall_hw_breakpoint---of 21
decode_dr7---of 5
encode_dr7---of 5
flush_ptrace_hw_breakpoint---of 1
hw_breakpoint_arch_parse---of 27
hw_breakpoint_exceptions_notify9%of 23
hw_breakpoint_pmu_read---of 1
hw_breakpoint_restore---of 1
-----------
SUMMARY9%of 23

-----------
SUMMARY---of 0

__bpf_trace_workqueue_activate_work---of 1
__bpf_trace_workqueue_execute_end---of 1
__bpf_trace_workqueue_execute_start---of 1
__bpf_trace_workqueue_queue_work---of 1
__cancel_work---of 15
__cancel_work_sync---of 8
__flush_work---of 52
__flush_workqueue10%of 61
__init_work67%of 3
__probestub_workqueue_activate_work---of 1
__probestub_workqueue_execute_end---of 1
__probestub_workqueue_execute_start---of 1
__probestub_workqueue_queue_work---of 1
__pwq_activate_work---of 11
__queue_delayed_work50%of 14
__queue_work25%of 81
__traceiter_workqueue_activate_work---of 4
__traceiter_workqueue_execute_end---of 4
__traceiter_workqueue_execute_start---of 4
__traceiter_workqueue_queue_work---of 4
__warn_flushing_systemwide_wq---of 1
alloc_unbound_pwq---of 38
alloc_workqueue---of 81
alloc_workqueue_attrs---of 3
apply_workqueue_attrs---of 1
apply_workqueue_attrs_locked---of 10
apply_wqattrs_commit---of 7
apply_wqattrs_prepare---of 32
assign_work---of 17
bh_pool_irq_work---of 3
bh_pool_kick_highpri---of 1
bh_pool_kick_normal---of 1
bh_worker---of 22
cancel_delayed_work---of 1
cancel_delayed_work_sync---of 1
cancel_work---of 1
cancel_work_sync---of 1
check_flush_dependency---of 14
cpumask_isolated_show---of 1
cpumask_requested_show---of 1
cpumask_show---of 1
cpumask_store---of 5
create_worker---of 14
current_is_workqueue_rescuer---of 5
current_work---of 5
delayed_work_timer_fn---of 1
destroy_delayed_work_on_stack---of 1
destroy_work_on_stack---of 1
destroy_workqueue---of 45
disable_delayed_work---of 1
disable_delayed_work_sync---of 6
disable_work---of 1
disable_work_sync---of 6
drain_dead_softirq_workfn---of 5
drain_workqueue---of 11
enable_delayed_work---of 1
enable_work---of 14
execute_in_process_context---of 3
flush_delayed_work---of 5
flush_rcu_work---of 3
flush_work---of 1
flush_workqueue_prep_pwqs45%of 20
format_worker_id---of 5
free_workqueue_attrs---of 3
freeze_workqueues_begin---of 6
freeze_workqueues_busy---of 25
get_pwq---of 6
get_work_pool50%of 10
idle_cull_fn---of 11
idle_worker_timeout---of 6
init_rescuer---of 7
init_worker_pool---of 4
insert_work50%of 10
install_unbound_pwq---of 15
jhash---of 17
kick_pool36%of 17
max_active_show---of 1
max_active_store---of 4
mod_delayed_work_on78%of 9
move_linked_works---of 10
parse_affn_scope---of 7
per_cpu_show---of 1
perf_trace_workqueue_activate_work---of 8
perf_trace_workqueue_execute_end---of 8
perf_trace_workqueue_execute_start---of 8
perf_trace_workqueue_queue_work---of 8
pool_mayday_timeout---of 19
pr_cont_pool_info---of 5
pr_cont_work---of 16
print_worker_info---of 8
process_scheduled_works---of 77
put_pwq_unlocked---of 7
put_unbound_pool---of 32
pwq_dec_nr_in_flight20%of 45
pwq_release_workfn---of 23
pwq_tryinc_nr_active28%of 22
queue_delayed_work_on59%of 12
queue_rcu_work---of 4
queue_work_node---of 18
queue_work_on84%of 12
rcu_free_pool---of 3
rcu_free_pwq---of 1
rcu_free_wq---of 12
rcu_work_rcufn---of 3
rescuer_thread---of 36
schedule_on_each_cpu---of 10
set_work_pool_and_clear_pending---of 3
set_worker_desc---of 5
set_worker_dying---of 18
show_all_workqueues---of 36
show_freezable_workqueues---of 16
show_one_workqueue---of 11
show_pwq---of 41
thaw_workqueues---of 5
trace_event_raw_event_workqueue_activate_work---of 7
trace_event_raw_event_workqueue_execute_end---of 7
trace_event_raw_event_workqueue_execute_start---of 7
trace_event_raw_event_workqueue_queue_work---of 7
trace_raw_output_workqueue_activate_work---of 3
trace_raw_output_workqueue_execute_end---of 3
trace_raw_output_workqueue_execute_start---of 3
trace_raw_output_workqueue_queue_work---of 3
trace_workqueue_activate_work27%of 15
unbind_worker---of 8
unbound_pwq---of 10
work_busy23%of 18
work_debug_hint---of 1
work_fixup_free---of 3
work_fixup_init---of 3
work_for_cpu_fn---of 1
work_grab_pending28%of 44
work_is_static_object---of 1
work_offqd_unpack---of 3
work_on_cpu_key---of 1
work_on_cpu_safe_key---of 5
worker_attach_to_pool---of 10
worker_detach_from_pool---of 10
worker_enter_idle---of 15
worker_leave_idle---of 7
worker_thread---of 41
workqueue_apply_unbound_cpumask---of 25
workqueue_congested---of 17
workqueue_offline_cpu---of 26
workqueue_online_cpu---of 56
workqueue_prepare_cpu---of 9
workqueue_set_max_active---of 7
workqueue_set_min_active---of 3
workqueue_softirq_action---of 6
workqueue_softirq_dead---of 11
workqueue_sysfs_register---of 9
workqueue_unbound_exclude_cpumask---of 3
wq_adjust_max_active---of 20
wq_affinity_strict_show---of 1
wq_affinity_strict_store---of 9
wq_affn_dfl_get---of 3
wq_affn_dfl_set---of 10
wq_affn_scope_show---of 5
wq_affn_scope_store---of 9
wq_barrier_func---of 1
wq_cpumask_show---of 1
wq_cpumask_store---of 9
wq_device_release---of 1
wq_nice_show---of 1
wq_nice_store---of 10
wq_update_node_max_active---of 30
wq_update_pod---of 23
wq_watchdog_param_set_thresh---of 8
wq_watchdog_timer_fn---of 63
wq_watchdog_touch50%of 4
wq_worker_comm---of 5
wq_worker_last_func---of 1
wq_worker_running---of 6
wq_worker_sleeping---of 6
wq_worker_tick---of 13
wqattrs_pod_type---of 13
-----------
SUMMARY31%of 397

-----------
SUMMARY---of 0

arch_do_signal_or_restart24%of 30
get_sigframe25%of 20
get_sigframe_size---of 1
sigaltstack_size_valid---of 10
signal_fault---of 4
-----------
SUMMARY24%of 50

fscrypt_context_for_new_inode---of 5
fscrypt_dummy_policies_equal---of 4
fscrypt_get_dummy_policy---of 3
fscrypt_get_policy---of 9
fscrypt_has_permitted_context---of 27
fscrypt_ioctl_get_nonce---of 7
fscrypt_ioctl_get_policy---of 11
fscrypt_ioctl_get_policy_ex---of 12
fscrypt_ioctl_set_policy---of 14
fscrypt_parse_test_dummy_encryption---of 11
fscrypt_policies_equal---of 3
fscrypt_policy_from_context---of 6
fscrypt_policy_to_inherit34%of 6
fscrypt_policy_to_key_spec---of 4
fscrypt_set_context---of 8
fscrypt_show_test_dummy_encryption---of 4
fscrypt_supported_policy---of 40
set_encryption_policy---of 12
-----------
SUMMARY34%of 6

__seq_open_private---of 6
__seq_puts---of 3
mangle_path---of 8
seq_bprintf---of 4
seq_dentry---of 15
seq_escape_mem---of 7
seq_file_path---of 1
seq_hex_dump---of 16
seq_hlist_next---of 1
seq_hlist_next_percpu---of 20
seq_hlist_next_rcu---of 11
seq_hlist_start---of 3
seq_hlist_start_head---of 4
seq_hlist_start_head_rcu---of 3
seq_hlist_start_percpu---of 9
seq_hlist_start_rcu---of 15
seq_list_next---of 1
seq_list_next_rcu---of 1
seq_list_start---of 5
seq_list_start_head---of 6
seq_list_start_head_rcu---of 3
seq_list_start_rcu---of 15
seq_lseek---of 9
seq_open---of 5
seq_open_private---of 1
seq_pad---of 7
seq_path---of 15
seq_path_root---of 18
seq_printf50%of 4
seq_put_decimal_ll---of 14
seq_put_decimal_ull---of 1
seq_put_decimal_ull_width---of 11
seq_put_hex_ll---of 16
seq_putc67%of 3
seq_read25%of 8
seq_read_iter32%of 38
seq_release---of 1
seq_release_private---of 1
seq_vprintf---of 4
seq_write---of 3
single_next100%of 1
single_open---of 6
single_open_size---of 5
single_release---of 1
single_start100%of 1
single_stop100%of 1
traverse34%of 15
-----------
SUMMARY37%of 71

run_add_entry---of 39
run_clone---of 4
run_collapse_range---of 24
run_consolidate---of 15
run_get_entry---of 9
run_get_highest_vcn---of 16
run_insert_range---of 18
run_is_mapped_full---of 16
run_lookup_entry22%of 19
run_pack---of 66
run_truncate---of 14
run_truncate_around---of 3
run_truncate_head---of 19
run_unpack---of 47
run_unpack_ex---of 15
-----------
SUMMARY22%of 19

-----------
SUMMARY---of 0

_compound_head---of 7
ntfs3_setattr---of 27
ntfs_compat_ioctl---of 1
ntfs_compress_write---of 92
ntfs_extend---of 17
ntfs_extend_initialized_size---of 18
ntfs_fallocate---of 69
ntfs_fiemap---of 3
ntfs_file_mmap---of 18
ntfs_file_open28%of 11
ntfs_file_read_iter---of 10
ntfs_file_release---of 5
ntfs_file_splice_read---of 5
ntfs_file_write_iter---of 26
ntfs_get_frame_pages---of 21
ntfs_getattr50%of 6
ntfs_ioctl---of 7
ntfs_zero_range---of 34
put_page---of 14
zero_user_segments---of 14
-----------
SUMMARY36%of 17

fsnotify_destroy_event---of 6
fsnotify_flush_notify---of 10
fsnotify_get_cookie100%of 1
fsnotify_insert_event---of 15
fsnotify_peek_first_event---of 4
fsnotify_remove_first_event---of 9
fsnotify_remove_queued_event---of 4
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__destroy_inode23%of 31
__iget100%of 1
__insert_inode_hash60%of 5
__remove_inode_hash---of 4
address_space_init_once---of 1
atime_needs_update45%of 20
bmap---of 3
clear_inode29%of 7
clear_nlink67%of 3
current_time72%of 7
dentry_needs_remove_privs75%of 4
destroy_inode---of 5
discard_new_inode---of 6
drop_nlink80%of 5
dump_mapping---of 11
evict44%of 23
evict_inodes---of 23
file_modified100%of 1
file_modified_flags50%of 18
file_remove_privs100%of 1
file_remove_privs_flags40%of 15
file_update_time80%of 15
find_inode30%of 10
find_inode_by_ino_rcu40%of 15
find_inode_fast30%of 10
find_inode_nowait---of 10
find_inode_rcu---of 15
free_inode_nonrcu---of 1
generic_delete_inode100%of 1
generic_update_time---of 3
get_next_ino56%of 9
get_nr_dirty_inodes56%of 9
i_callback---of 3
iget5_locked34%of 12
iget_locked25%of 29
igrab67%of 3
ihold67%of 3
ilookup45%of 9
ilookup563%of 8
ilookup5_nowait---of 3
in_group_or_capable67%of 3
inc_nlink50%of 4
init_once---of 1
init_special_inode---of 6
inode_add_lru34%of 9
inode_dio_wait34%of 6
inode_init_always60%of 5
inode_init_once100%of 1
inode_init_owner75%of 4
inode_insert538%of 16
inode_lru_isolate---of 17
inode_needs_sync34%of 6
inode_nohighmem100%of 1
inode_owner_or_capable40%of 5
inode_sb_list_add---of 3
inode_set_ctime_current100%of 1
inode_set_flags60%of 5
inode_update_time---of 4
inode_update_timestamps70%of 13
insert_inode_locked47%of 15
insert_inode_locked4---of 3
invalidate_inodes---of 23
iput21%of 43
iunique29%of 21
kiocb_modified---of 1
lock_two_nondirectories46%of 11
lockdep_annotate_inode_mutex_key75%of 4
mode_strip_sgid29%of 7
new_inode50%of 4
new_inode_pseudo40%of 10
no_open---of 1
proc_nr_inodes---of 9
prune_icache_sb---of 6
set_nlink40%of 5
timestamp_truncate---of 7
touch_atime43%of 26
unlock_new_inode67%of 6
unlock_two_nondirectories---of 7
wait_on_inode---of 3
-----------
SUMMARY43%of 505

proc_ns_dir_lookup47%of 13
proc_ns_dir_readdir---of 16
proc_ns_get_link---of 9
proc_ns_instantiate---of 3
proc_ns_readlink38%of 8
-----------
SUMMARY43%of 21

-----------
SUMMARY---of 0

strnlen_user30%of 10
-----------
SUMMARY30%of 10

__get_vma_policy---of 4
__ia32_sys_get_mempolicy---of 1
__ia32_sys_mbind---of 1
__ia32_sys_migrate_pages---of 1
__ia32_sys_set_mempolicy---of 8
__ia32_sys_set_mempolicy_home_node---of 1
__mpol_dup---of 11
__mpol_equal---of 15
__mpol_put---of 3
__se_sys_get_mempolicy---of 90
__se_sys_mbind---of 94
__se_sys_migrate_pages---of 41
__se_sys_set_mempolicy_home_node---of 33
__x64_sys_get_mempolicy---of 1
__x64_sys_mbind---of 1
__x64_sys_migrate_pages---of 1
__x64_sys_set_mempolicy---of 8
__x64_sys_set_mempolicy_home_node---of 1
alloc_migration_target_by_mpol---of 29
alloc_pages_bulk_array_mempolicy_noprof---of 88
alloc_pages_mpol_noprof16%of 19
alloc_pages_noprof34%of 6
apply_policy_zone---of 3
change_prot_numa---of 3
do_migrate_pages---of 31
do_set_mempolicy---of 12
folio_alloc_noprof34%of 6
folio_large_mapcount---of 9
folio_likely_mapped_shared---of 36
get_il_weight---of 18
get_nodes---of 17
get_task_policy40%of 5
get_vma_policy25%of 16
huge_node---of 1
init_nodemask_of_mempolicy---of 10
mbind_range---of 37
mempolicy_in_oom_domain---of 5
mempolicy_kobj_release---of 6
mempolicy_slab_node8%of 26
migrate_folio_add---of 23
mpol_free_shared_policy25%of 8
mpol_misplaced---of 44
mpol_new---of 13
mpol_new_nodemask---of 3
mpol_new_preferred---of 4
mpol_parse_str---of 51
mpol_put_task_policy---of 4
mpol_rebind_default---of 1
mpol_rebind_mm---of 25
mpol_rebind_nodemask---of 6
mpol_rebind_preferred---of 1
mpol_rebind_task---of 8
mpol_set_nodemask---of 8
mpol_set_shared_policy---of 47
mpol_shared_policy_init10%of 20
mpol_shared_policy_lookup17%of 12
mpol_to_str---of 17
node_show---of 1
node_store---of 13
numa_default_policy---of 1
numa_nearest_node---of 10
page_rmappable_folio18%of 17
page_to_nid---of 3
page_zone---of 4
policy_nodemask6%of 36
queue_folios_hugetlb---of 43
queue_folios_pte_range---of 81
queue_pages_test_walk---of 28
read_mems_allowed_begin---of 11
sysfs_wi_release---of 6
vma_alloc_folio_noprof40%of 5
vma_dup_policy50%of 4
vma_migratable---of 14
vma_policy_mof---of 12
weighted_interleave_nid---of 40
weighted_interleave_nodes---of 15
-----------
SUMMARY17%of 180

__register_virtio_driver---of 4
device_show---of 1
features_show---of 3
is_virtio_device---of 1
modalias_show---of 1
register_virtio_device---of 15
status_show---of 1
unregister_virtio_device---of 1
unregister_virtio_driver---of 1
vendor_show---of 1
virtio_add_status---of 1
virtio_check_driver_offered_feature30%of 10
virtio_config_changed---of 5
virtio_config_enable---of 5
virtio_dev_match---of 8
virtio_dev_probe---of 54
virtio_dev_remove---of 5
virtio_device_freeze---of 10
virtio_device_ready---of 3
virtio_device_restore---of 19
virtio_features_ok---of 8
virtio_init---of 3
virtio_reset_device---of 1
virtio_uevent---of 1
-----------
SUMMARY30%of 10

add_to_page_cache_lru---of 7
clear_page_dirty_for_io---of 7
end_page_writeback---of 7
grab_cache_page_write_begin---of 1
isolate_lru_page---of 9
mark_page_accessed29%of 7
page_mapping---of 7
pagecache_get_page30%of 10
putback_lru_page---of 7
redirty_page_for_writepage---of 7
set_page_dirty---of 7
set_page_writeback---of 7
unlock_page29%of 7
wait_for_stable_page---of 7
wait_on_page_writeback---of 7
-----------
SUMMARY30%of 24

__anon_inode_getfile39%of 13
anon_inode_create_getfd---of 4
anon_inode_create_getfile100%of 1
anon_inode_getfd40%of 10
anon_inode_getfile43%of 7
anon_inode_getfile_fmode---of 9
anon_inodefs_dname---of 1
anon_inodefs_init_fs_context---of 3
-----------
SUMMARY42%of 31

btrfs_drew_lock_init---of 1
btrfs_drew_read_lock---of 5
btrfs_drew_read_unlock---of 3
btrfs_drew_try_write_lock---of 5
btrfs_drew_write_lock---of 8
btrfs_drew_write_unlock---of 3
btrfs_lock_root_node50%of 4
btrfs_maybe_reset_lockdep_class40%of 5
btrfs_read_lock_root_node50%of 4
btrfs_set_buffer_lockdep_class75%of 4
btrfs_tree_lock_nested30%of 17
btrfs_tree_read_lock_nested30%of 17
btrfs_tree_read_unlock27%of 15
btrfs_tree_unlock27%of 15
btrfs_tree_unlock_rw---of 4
btrfs_try_read_lock_root_node---of 5
btrfs_try_tree_read_lock---of 16
btrfs_try_tree_write_lock---of 16
btrfs_unlock_up_safe34%of 12
-----------
SUMMARY34%of 93

__bpf_trace_cpuhp_enter---of 1
__bpf_trace_cpuhp_exit---of 1
__bpf_trace_cpuhp_multi_enter---of 1
__cpu_down_maps_locked---of 1
__cpuhp_remove_state---of 1
__cpuhp_remove_state_cpuslocked---of 16
__cpuhp_setup_state---of 1
__cpuhp_setup_state_cpuslocked---of 31
__cpuhp_state_add_instance100%of 1
__cpuhp_state_add_instance_cpuslocked18%of 23
__cpuhp_state_remove_instance---of 13
__probestub_cpuhp_enter---of 1
__probestub_cpuhp_exit---of 1
__probestub_cpuhp_multi_enter---of 1
__traceiter_cpuhp_enter---of 4
__traceiter_cpuhp_exit---of 4
__traceiter_cpuhp_multi_enter---of 4
_cpu_down---of 26
_cpu_up---of 22
active_show---of 3
add_cpu---of 1
bringup_hibernate_cpu---of 6
clear_tasks_mm_cpumask---of 21
control_show---of 6
control_store---of 16
cpu_device_down---of 1
cpu_device_up---of 1
cpu_down---of 7
cpu_hotplug_disable---of 1
cpu_hotplug_enable---of 4
cpu_hotplug_pm_callback---of 9
cpu_maps_update_begin---of 1
cpu_maps_update_done---of 1
cpu_mitigations_auto_nosmt---of 1
cpu_mitigations_off---of 1
cpu_smt_possible---of 1
cpu_up---of 16
cpuhp_ap_report_dead---of 3
cpuhp_ap_sync_alive---of 8
cpuhp_bringup_ap---of 21
cpuhp_complete_idle_dead---of 1
cpuhp_invoke_callback---of 26
cpuhp_issue_call---of 14
cpuhp_kick_ap---of 8
cpuhp_kick_ap_alive---of 9
cpuhp_kick_ap_work---of 3
cpuhp_online_idle---of 6
cpuhp_report_idle_dead---of 6
cpuhp_reset_state---of 11
cpuhp_set_state---of 8
cpuhp_should_run---of 3
cpuhp_smt_disable---of 13
cpuhp_smt_enable---of 9
cpuhp_thread_fun---of 23
cpuhp_wait_for_sync_state---of 8
cpus_read_lock40%of 10
cpus_read_trylock---of 12
cpus_read_unlock40%of 10
cpus_write_lock---of 1
cpus_write_unlock---of 1
fail_show---of 3
fail_store---of 12
finish_cpu---of 5
freeze_secondary_cpus---of 21
init_cpu_online---of 1
init_cpu_possible---of 1
init_cpu_present---of 1
lockdep_assert_cpus_held---of 5
lockdep_is_cpus_held100%of 1
notify_cpu_starting---of 9
perf_trace_cpuhp_enter---of 8
perf_trace_cpuhp_exit---of 8
perf_trace_cpuhp_multi_enter---of 8
remove_cpu---of 1
set_cpu_online---of 9
smp_shutdown_nonboot_cpus---of 20
state_show---of 3
states_show---of 5
take_cpu_down---of 10
takedown_cpu---of 18
target_show---of 3
target_store---of 14
thaw_secondary_cpus---of 11
trace_cpuhp_enter---of 15
trace_cpuhp_exit---of 15
trace_cpuhp_multi_enter---of 15
trace_event_raw_event_cpuhp_enter---of 7
trace_event_raw_event_cpuhp_exit---of 7
trace_event_raw_event_cpuhp_multi_enter---of 7
trace_raw_output_cpuhp_enter---of 3
trace_raw_output_cpuhp_exit---of 3
trace_raw_output_cpuhp_multi_enter---of 3
trace_suspend_resume---of 15
-----------
SUMMARY32%of 45

__start_tx---of 14
__stop_tx---of 18
default_serial_dl_read---of 1
default_serial_dl_write---of 1
hub6_serial_in---of 3
hub6_serial_out---of 3
io_serial_in67%of 3
io_serial_out67%of 3
mem16_serial_in---of 3
mem16_serial_out---of 3
mem32_serial_in---of 3
mem32_serial_out---of 3
mem32be_serial_in---of 3
mem32be_serial_out---of 3
mem_serial_in---of 3
mem_serial_out---of 3
rx_trig_bytes_show---of 7
rx_trig_bytes_store---of 13
serial8250_break_ctl---of 5
serial8250_clear_and_reinit_fifos---of 3
serial8250_config_port---of 134
serial8250_console_exit---of 3
serial8250_console_putchar---of 1
serial8250_console_setup---of 10
serial8250_console_write25%of 56
serial8250_default_handle_irq---of 5
serial8250_do_get_mctrl---of 7
serial8250_do_pm---of 10
serial8250_do_set_divisor---of 3
serial8250_do_set_ldisc---of 8
serial8250_do_set_mctrl---of 3
serial8250_do_set_termios---of 52
serial8250_do_shutdown---of 20
serial8250_do_startup---of 94
serial8250_em485_config---of 11
serial8250_em485_destroy---of 3
serial8250_em485_handle_start_tx---of 3
serial8250_em485_handle_stop_tx---of 7
serial8250_em485_start_tx---of 7
serial8250_em485_stop_tx---of 11
serial8250_enable_ms---of 9
serial8250_get_mctrl---of 3
serial8250_handle_irq---of 27
serial8250_init_port---of 1
serial8250_modem_status---of 12
serial8250_pm---of 3
serial8250_read_char---of 14
serial8250_release_port---of 16
serial8250_request_port---of 1
serial8250_request_std_resource---of 18
serial8250_rpm_get---of 3
serial8250_rpm_get_tx---of 4
serial8250_rpm_put---of 3
serial8250_rpm_put_tx---of 4
serial8250_rx_chars---of 4
serial8250_set_defaults---of 22
serial8250_set_ldisc---of 3
serial8250_set_mctrl---of 5
serial8250_set_termios---of 3
serial8250_shutdown---of 3
serial8250_start_tx---of 17
serial8250_startup---of 3
serial8250_stop_rx---of 8
serial8250_stop_tx---of 7
serial8250_throttle---of 1
serial8250_tx_chars---of 24
serial8250_tx_empty---of 8
serial8250_tx_threshold_handle_irq---of 3
serial8250_type---of 1
serial8250_unthrottle---of 1
serial8250_update_uartclk---of 5
serial8250_verify_port---of 4
size_fifo---of 11
uart_handle_break---of 5
wait_for_xmitr45%of 9
-----------
SUMMARY31%of 71

__fs_parse27%of 23
fs_lookup_param---of 11
fs_param_is_blob---of 3
fs_param_is_blockdev---of 1
fs_param_is_bool---of 14
fs_param_is_enum---of 10
fs_param_is_fd---of 10
fs_param_is_path---of 1
fs_param_is_s32---of 7
fs_param_is_string---of 5
fs_param_is_u32---of 7
fs_param_is_u64---of 7
fs_validate_description---of 10
lookup_constant50%of 6
validate_constant_table---of 18
-----------
SUMMARY32%of 29

alloc_ordered_extent43%of 7
btrfs_add_ordered_sum---of 3
btrfs_alloc_ordered_extent35%of 32
btrfs_dec_test_ordered_pending---of 45
btrfs_finish_ordered_extent---of 20
btrfs_get_ordered_extents_for_logging---of 26
btrfs_lock_and_flush_ordered_range---of 7
btrfs_lookup_first_ordered_extent---of 35
btrfs_lookup_first_ordered_range---of 35
btrfs_lookup_ordered_extent28%of 36
btrfs_lookup_ordered_range---of 55
btrfs_mark_ordered_extent_error---of 4
btrfs_mark_ordered_io_finished---of 41
btrfs_mod_outstanding_extents27%of 19
btrfs_put_ordered_extent19%of 27
btrfs_remove_ordered_extent---of 35
btrfs_run_ordered_extent_work---of 1
btrfs_split_ordered_extent---of 55
btrfs_start_ordered_extent---of 22
btrfs_try_lock_ordered_range---of 4
btrfs_wait_ordered_extents---of 32
btrfs_wait_ordered_range---of 8
btrfs_wait_ordered_roots20%of 20
can_finish_ordered_extent---of 45
finish_ordered_fn---of 1
ordered_data_exit---of 1
-----------
SUMMARY27%of 141

-----------
SUMMARY---of 0

__ia32_sys_fsconfig---of 1
__ia32_sys_fsopen---of 1
__ia32_sys_fspick---of 1
__se_sys_fsconfig14%of 61
__se_sys_fsopen23%of 9
__se_sys_fspick23%of 9
__x64_sys_fsconfig100%of 1
__x64_sys_fsopen100%of 1
__x64_sys_fspick100%of 1
fscontext_read---of 8
fscontext_release---of 3
vfs_cmd_create34%of 9
-----------
SUMMARY20%of 91

-----------
SUMMARY---of 0

__es_find_extent_range52%of 25
__es_insert_extent43%of 59
__es_remove_extent41%of 82
count_rsvd35%of 23
es_do_reclaim_extents---of 18
ext4_clear_inode_es---of 6
ext4_es_cache_extent27%of 30
ext4_es_count---of 15
ext4_es_delayed_clu36%of 25
ext4_es_find_extent_range24%of 30
ext4_es_free_extent56%of 9
ext4_es_init_tree100%of 1
ext4_es_insert_delayed_block25%of 45
ext4_es_insert_extent8%of 214
ext4_es_lookup_extent43%of 45
ext4_es_register_shrinker---of 7
ext4_es_remove_extent29%of 25
ext4_es_scan---of 69
ext4_es_scan_clu40%of 5
ext4_es_scan_range40%of 5
ext4_es_unregister_shrinker---of 1
ext4_exit_es---of 1
ext4_exit_pending---of 1
ext4_init_pending_tree100%of 1
ext4_is_pending---of 10
ext4_remove_pending---of 10
ext4_seq_es_shrinker_info_show---of 10
-----------
SUMMARY27%of 624

ima_add_violation---of 6
ima_alloc_init_template---of 13
ima_audit_measurement---of 10
ima_collect_measurement---of 26
ima_d_path---of 5
ima_free_template_entry---of 4
ima_get_action100%of 1
ima_store_measurement---of 17
ima_store_template---of 4
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

fat_bmap27%of 19
fat_cache_add8%of 27
fat_cache_destroy---of 1
fat_cache_inval_inode43%of 7
fat_get_cluster17%of 42
fat_get_mapped_cluster34%of 9
init_once---of 1
-----------
SUMMARY20%of 104

__btrfs_dump_space_info---of 11
__reserve_bytes8%of 83
btrfs_account_ro_block_groups_free_space---of 7
btrfs_add_bg_to_space_info---of 10
btrfs_async_reclaim_data_space---of 16
btrfs_async_reclaim_metadata_space---of 12
btrfs_calc_reclaim_metadata_size---of 12
btrfs_can_overcommit---of 9
btrfs_clear_space_info_full---of 4
btrfs_dump_space_info---of 7
btrfs_dump_space_info_for_trans_abort---of 4
btrfs_find_space_info50%of 4
btrfs_init_async_reclaim_work---of 1
btrfs_init_space_info---of 6
btrfs_preempt_reclaim_metadata_space---of 29
btrfs_reserve_data_bytes---of 9
btrfs_reserve_metadata_bytes50%of 4
btrfs_space_info_update_bytes_may_use29%of 21
btrfs_space_info_used---of 4
btrfs_try_granting_tickets---of 18
btrfs_update_space_info_chunk_size---of 1
create_space_info---of 12
dump_global_block_rsv---of 1
flush_space---of 53
maybe_fail_all_tickets---of 34
need_preemptive_reclaim15%of 20
priority_reclaim_metadata_space---of 15
steal_from_global_rsv---of 11
trace_btrfs_space_reservation27%of 15
trace_btrfs_trigger_flush---of 15
-----------
SUMMARY16%of 147

-----------
SUMMARY---of 0

__bforget---of 6
__bh_read---of 6
__bh_read_batch---of 12
__block_commit_write65%of 14
__block_write_begin29%of 7
__block_write_begin_int19%of 99
__block_write_full_folio23%of 67
__bread_gfp62%of 13
__breadahead34%of 9
__brelse67%of 3
__find_get_block34%of 62
__lock_buffer67%of 3
__sync_dirty_buffer47%of 13
__wait_on_buffer67%of 3
alloc_buffer_head34%of 9
alloc_page_buffers---of 7
bdev_getblk48%of 23
bh_read---of 3
bh_uptodate_or_lock29%of 7
block_commit_write---of 7
block_dirty_folio65%of 14
block_invalidate_folio29%of 21
block_is_partially_uptodate---of 11
block_page_mkwrite28%of 11
block_read_full_folio33%of 56
block_truncate_page9%of 35
block_write_begin---of 10
block_write_end34%of 9
block_write_full_folio50%of 4
buffer_check_dirty_writeback---of 23
buffer_exit_cpu_dead---of 9
clean_bdev_aliases14%of 22
cont_write_begin---of 25
create_empty_buffers30%of 37
decrypt_bh---of 8
drop_buffers29%of 25
end_bio_bh_io_sync---of 3
end_buffer_async_read---of 17
end_buffer_async_read_io---of 10
end_buffer_async_write---of 18
end_buffer_read_sync---of 4
end_buffer_write_sync---of 6
folio_alloc_buffers31%of 23
folio_attach_private25%of 8
folio_create_buffers37%of 11
folio_init_buffers41%of 22
folio_lock34%of 9
folio_set_bh---of 3
folio_size30%of 10
folio_test_uptodate---of 9
folio_zero_new_buffers24%of 21
free_buffer_head34%of 9
generic_block_bmap---of 3
generic_buffers_fsync---of 7
generic_buffers_fsync_noflush60%of 5
generic_cont_expand_simple---of 5
generic_write_end---of 7
has_bh_in_lru---of 19
inode_has_buffers100%of 1
invalidate_bh_lru---of 9
invalidate_bh_lrus---of 1
invalidate_bh_lrus_cpu---of 10
invalidate_inode_buffers23%of 9
mark_buffer_async_write---of 3
mark_buffer_dirty34%of 30
mark_buffer_dirty_inode56%of 9
mark_buffer_write_io_error---of 10
put_page---of 14
remove_inode_buffers---of 10
submit_bh100%of 1
submit_bh_wbc43%of 14
sync_dirty_buffer100%of 1
sync_mapping_buffers21%of 43
touch_buffer27%of 15
try_to_free_buffers29%of 28
unlock_buffer100%of 1
verify_bh---of 4
write_boundary_block---of 4
write_dirty_buffer---of 6
zero_user_segments48%of 17
-----------
SUMMARY31%of 853

__add_to_free_space_tree26%of 35
add_block_group_free_space---of 10
add_new_free_space_info---of 5
add_to_free_space_tree23%of 9
btrfs_create_free_space_tree---of 14
btrfs_delete_free_space_tree---of 10
btrfs_rebuild_free_space_tree---of 11
clear_free_space_tree---of 7
free_space_next_bitmap---of 6
free_space_test_bit---of 5
load_free_space_tree---of 41
modify_free_space_bitmap---of 29
populate_free_space_tree---of 23
remove_block_group_free_space---of 27
remove_from_free_space_tree21%of 29
search_free_space_info50%of 6
set_free_space_tree_thresholds---of 3
update_free_space_extent_count4%of 79
-----------
SUMMARY15%of 158

-----------
SUMMARY---of 0

__find_next_entry---of 23
__ftrace_trace_stack---of 15
__remove_instance---of 24
__trace_array_puts---of 11
__trace_array_vprintk---of 20
__trace_bputs---of 9
__trace_event_discard_commit---of 4
__trace_puts---of 1
__trace_stack---of 3
__tracing_resize_ring_buffer---of 9
add_tracer_options---of 31
allocate_trace_buffers---of 7
buffer_percent_read---of 1
buffer_percent_write---of 4
buffer_pipe_buf_get---of 5
buffer_pipe_buf_release---of 4
buffer_spd_release---of 4
buffer_subbuf_size_read---of 3
buffer_subbuf_size_write---of 6
call_filter_check_discard---of 4
close_pipe_on_cpu---of 9
disable_trace_buffered_event---of 1
disable_trace_on_warning---of 4
dummy_set_flag---of 1
enable_trace_buffered_event---of 1
err_pos---of 4
ftrace_dump---of 25
ftrace_dump_on_oops_enabled---of 1
ftrace_dump_one---of 50
ftrace_exports---of 8
ftrace_now---of 3
get_total_entries_cpu---of 5
init_trace_flags_index---of 1
init_tracer_tracefs---of 87
instance_mkdir---of 8
instance_rmdir---of 8
is_tracing_stopped---of 1
ns2usecs---of 1
nsecs_to_usecs---of 1
peek_next_entry---of 7
print_event_info---of 5
print_hex_fmt---of 10
print_raw_fmt---of 8
print_trace_fmt---of 22
print_trace_header---of 12
print_trace_line---of 23
queued_spin_lock_slowpath---of 3
rb_simple_read---of 3
rb_simple_write---of 14
register_ftrace_export---of 8
s_next---of 19
s_show---of 12
s_start---of 21
s_stop---of 4
seq_buf_str---of 3
set_tracer_flag---of 22
show_traces_open---of 10
show_traces_release---of 4
t_next---of 7
t_show---of 3
t_start---of 14
t_stop---of 1
test_can_verify---of 3
test_can_verify_check---of 3
trace_array_create_dir---of 6
trace_array_create_systems---of 14
trace_array_destroy---of 5
trace_array_find---of 7
trace_array_find_get---of 8
trace_array_get---of 4
trace_array_get_by_name---of 11
trace_array_init_printk---of 7
trace_array_printk---of 5
trace_array_printk_buf---of 3
trace_array_put---of 4
trace_array_vprintk---of 1
trace_automount---of 4
trace_buffer_lock_reserve---of 3
trace_buffer_unlock_commit_nostack---of 4
trace_buffer_unlock_commit_regs---of 18
trace_buffered_event_disable---of 9
trace_buffered_event_enable---of 17
trace_check_vprintf---of 54
trace_clock_in_ns---of 3
trace_create_file---of 3
trace_default_header---of 8
trace_die_panic_handler50%of 4
trace_dump_stack---of 3
trace_empty---of 17
trace_event_buffer_commit---of 27
trace_event_buffer_lock_reserve---of 12
trace_event_format---of 15
trace_filter_add_remove_task---of 5
trace_find_filtered_pid---of 1
trace_find_next_entry---of 9
trace_find_next_entry_inc---of 5
trace_function---of 10
trace_get_user---of 21
trace_handle_return---of 3
trace_ignore_this_task---of 6
trace_init_global_iter---of 10
trace_iter_expand_format---of 5
trace_keep_overwrite---of 1
trace_last_func_repeats---of 5
trace_latency_header---of 6
trace_min_max_read---of 3
trace_min_max_write---of 14
trace_module_notify---of 6
trace_options_core_read---of 3
trace_options_core_write---of 7
trace_options_read---of 1
trace_options_write---of 7
trace_parse_run_command---of 15
trace_parser_get_init---of 3
trace_parser_put---of 1
trace_pid_next---of 3
trace_pid_show---of 1
trace_pid_start---of 6
trace_pid_write---of 17
trace_printk_init_buffers---of 10
trace_printk_seq---of 7
trace_printk_start_comm---of 3
trace_set_options---of 11
trace_set_ring_buffer_expanded---of 1
trace_total_entries---of 5
trace_total_entries_cpu---of 1
trace_vbprintk---of 21
trace_vprintk---of 1
tracepoint_printk_sysctl---of 6
tracer_init---of 1
tracer_tracing_is_on---of 3
tracer_tracing_off---of 3
tracer_tracing_on---of 3
tracing_alloc_snapshot---of 3
tracing_buffers_flush---of 1
tracing_buffers_ioctl---of 6
tracing_buffers_mmap---of 1
tracing_buffers_mmap_close---of 3
tracing_buffers_open---of 13
tracing_buffers_poll---of 6
tracing_buffers_read---of 27
tracing_buffers_release---of 5
tracing_buffers_splice_read---of 32
tracing_check_open_get_tr---of 7
tracing_clock_open---of 10
tracing_clock_show---of 1
tracing_clock_write---of 5
tracing_cond_snapshot_data---of 1
tracing_cpumask_read---of 4
tracing_cpumask_write---of 4
tracing_entries_read---of 12
tracing_entries_write---of 5
tracing_err_log_open---of 19
tracing_err_log_release---of 6
tracing_err_log_seq_next---of 1
tracing_err_log_seq_show---of 5
tracing_err_log_seq_start---of 1
tracing_err_log_seq_stop---of 1
tracing_err_log_write---of 1
tracing_event_time_stamp---of 3
tracing_free_buffer_release---of 7
tracing_free_buffer_write---of 1
tracing_gen_ctx_irq_test---of 1
tracing_init_dentry---of 5
tracing_is_disabled---of 1
tracing_is_enabled---of 1
tracing_is_on---of 3
tracing_iter_reset---of 11
tracing_log_err---of 12
tracing_lseek---of 3
tracing_mark_open---of 7
tracing_mark_raw_write---of 10
tracing_mark_write---of 21
tracing_off---of 3
tracing_on---of 3
tracing_open---of 44
tracing_open_file_tr---of 11
tracing_open_generic---of 4
tracing_open_generic_tr---of 7
tracing_open_options---of 7
tracing_open_pipe---of 25
tracing_poll_pipe---of 6
tracing_read_pipe---of 30
tracing_readme_read---of 1
tracing_release---of 20
tracing_release_file_tr---of 4
tracing_release_generic_tr---of 4
tracing_release_options---of 4
tracing_release_pipe---of 8
tracing_reset_all_online_cpus---of 9
tracing_reset_all_online_cpus_unlocked---of 9
tracing_reset_online_cpus---of 4
tracing_resize_ring_buffer---of 6
tracing_set_clock---of 12
tracing_set_cpumask---of 18
tracing_set_filter_buffering---of 4
tracing_set_trace_read---of 1
tracing_set_trace_write---of 4
tracing_set_tracer---of 20
tracing_single_release_file_tr---of 4
tracing_single_release_tr---of 4
tracing_snapshot---of 3
tracing_snapshot_alloc---of 3
tracing_snapshot_cond---of 3
tracing_snapshot_cond_disable---of 1
tracing_snapshot_cond_enable---of 1
tracing_spd_release_pipe---of 1
tracing_splice_read_pipe---of 41
tracing_start---of 1
tracing_start_tr---of 10
tracing_stats_read---of 6
tracing_stop---of 1
tracing_stop_tr---of 8
tracing_thresh_read---of 1
tracing_thresh_write---of 7
tracing_time_stamp_mode_open---of 10
tracing_time_stamp_mode_show---of 1
tracing_total_entries_read---of 9
tracing_trace_options_open---of 10
tracing_trace_options_show---of 10
tracing_trace_options_write---of 5
tracing_update_buffers---of 3
tracing_wait_pipe---of 14
tracing_write_stub---of 1
unregister_ftrace_export---of 10
wait_pipe_cond---of 3
within_module_core---of 5
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

btrfs_alloc_ordered_workqueue---of 4
btrfs_alloc_workqueue---of 4
btrfs_destroy_workqueue---of 16
btrfs_flush_workqueue---of 1
btrfs_init_work100%of 1
btrfs_queue_work30%of 20
btrfs_work_helper---of 45
btrfs_work_owner---of 1
btrfs_workqueue_normal_congested---of 3
btrfs_workqueue_owner---of 1
btrfs_workqueue_set_max---of 3
trace_btrfs_all_work_done---of 15
trace_btrfs_workqueue_alloc---of 15
-----------
SUMMARY34%of 21

__ia32_sys_sysfs---of 1
__se_sys_sysfs---of 17
__x64_sys_sysfs---of 1
filesystems_proc_show---of 4
get_filesystem100%of 1
get_fs_type24%of 26
put_filesystem100%of 1
register_filesystem---of 13
unregister_filesystem---of 6
-----------
SUMMARY29%of 28

-----------
SUMMARY---of 0

collect_domain_accesses---of 10
current_check_refer_path7%of 49
find_rule---of 17
hook_file_alloc_security100%of 1
hook_file_ioctl12%of 18
hook_file_ioctl_compat---of 18
hook_file_open12%of 18
hook_file_truncate100%of 1
hook_inode_free_security67%of 3
hook_move_mount19%of 11
hook_path_link100%of 1
hook_path_mkdir19%of 11
hook_path_mknod11%of 19
hook_path_rename100%of 1
hook_path_rmdir19%of 11
hook_path_symlink19%of 11
hook_path_truncate19%of 11
hook_path_unlink19%of 11
hook_sb_delete---of 39
hook_sb_mount19%of 11
hook_sb_pivotroot19%of 11
hook_sb_remount---of 11
hook_sb_umount---of 11
is_access_to_paths_allowed---of 36
landlock_append_fs_rule---of 39
release_inode---of 4
scope_to_request---of 6
-----------
SUMMARY16%of 199

-----------
SUMMARY---of 0

__virtio_unbreak_device---of 4
__virtqueue_break---of 1
__virtqueue_unbreak---of 1
__vring_new_virtqueue---of 19
detach_buf_packed---of 14
detach_buf_split---of 22
virtio_break_device---of 4
virtio_max_dma_size---of 3
virtqueue_add12%of 141
virtqueue_add_inbuf---of 1
virtqueue_add_inbuf_ctx---of 1
virtqueue_add_outbuf---of 1
virtqueue_add_sgs72%of 7
virtqueue_detach_unused_buf---of 13
virtqueue_disable_cb---of 8
virtqueue_dma_dev---of 3
virtqueue_dma_map_single_attrs---of 6
virtqueue_dma_mapping_error---of 1
virtqueue_dma_need_sync---of 4
virtqueue_dma_sync_single_range_for_cpu---of 4
virtqueue_dma_sync_single_range_for_device---of 4
virtqueue_dma_unmap_single_attrs---of 3
virtqueue_enable_cb---of 15
virtqueue_enable_cb_delayed---of 14
virtqueue_enable_cb_prepare---of 10
virtqueue_get_avail_addr---of 4
virtqueue_get_buf---of 1
virtqueue_get_buf_ctx---of 19
virtqueue_get_desc_addr---of 3
virtqueue_get_used_addr---of 4
virtqueue_get_vring---of 1
virtqueue_get_vring_size---of 1
virtqueue_is_broken---of 1
virtqueue_kick---of 18
virtqueue_kick_prepare30%of 10
virtqueue_notify50%of 4
virtqueue_poll---of 6
virtqueue_reset---of 13
virtqueue_resize---of 43
virtqueue_set_dma_premapped---of 3
vring_alloc_queue_packed---of 13
vring_alloc_queue_split---of 13
vring_create_virtqueue---of 3
vring_create_virtqueue_dma---of 3
vring_create_virtqueue_packed---of 18
vring_create_virtqueue_split---of 5
vring_del_virtqueue---of 3
vring_free---of 12
vring_free_packed---of 10
vring_interrupt---of 14
vring_map_single---of 6
vring_new_virtqueue---of 3
vring_notification_data---of 3
vring_transport_features---of 1
vring_unmap_extra_packed---of 5
-----------
SUMMARY17%of 162

__ext4_set_acl---of 23
ext4_get_acl---of 35
ext4_init_acl13%of 16
ext4_set_acl---of 17
-----------
SUMMARY13%of 16

btrfs_alloc_dummy_sum---of 3
btrfs_csum_file_blocks---of 53
btrfs_csum_one_bio---of 17
btrfs_del_csums---of 34
btrfs_extent_item_to_extent_map---of 16
btrfs_file_extent_end---of 4
btrfs_inode_clear_file_extent_range---of 5
btrfs_inode_safe_disk_i_size_write---of 7
btrfs_inode_set_file_extent_range40%of 5
btrfs_insert_hole_extent---of 4
btrfs_lookup_bio_sums---of 37
btrfs_lookup_csum---of 10
btrfs_lookup_csums_bitmap---of 51
btrfs_lookup_csums_list---of 52
btrfs_lookup_file_extent100%of 1
truncate_one_csum---of 8
-----------
SUMMARY50%of 6

bio_first_folio---of 18
ext4_alloc_io_end_vec50%of 4
ext4_bio_write_folio28%of 106
ext4_end_bio---of 13
ext4_end_io_rsv_work---of 15
ext4_exit_pageio---of 1
ext4_finish_bio---of 37
ext4_get_io_end---of 4
ext4_init_io_end67%of 3
ext4_io_submit75%of 4
ext4_io_submit_init100%of 1
ext4_last_io_end_vec67%of 3
ext4_put_io_end43%of 7
ext4_put_io_end_defer25%of 12
ext4_release_io_end47%of 13
folio_size30%of 10
-----------
SUMMARY34%of 163

-----------
SUMMARY---of 0

__xa_alloc---of 7
__xa_alloc_cyclic---of 9
__xa_clear_mark67%of 3
__xa_cmpxchg38%of 8
__xa_erase---of 3
__xa_insert---of 7
__xa_set_mark67%of 3
__xa_store58%of 7
__xas_next11%of 28
__xas_nomem20%of 10
__xas_prev---of 28
node_set_marks---of 16
xa_clear_mark---of 3
xa_delete_node---of 8
xa_destroy---of 12
xa_erase---of 3
xa_extract---of 68
xa_find30%of 17
xa_find_after---of 26
xa_get_mark---of 48
xa_get_order---of 13
xa_load24%of 13
xa_parent43%of 7
xa_set_mark---of 3
xa_store100%of 1
xa_store_range---of 24
xas_clear_mark50%of 22
xas_create44%of 99
xas_create_range---of 22
xas_destroy---of 4
xas_find59%of 31
xas_find_conflict14%of 58
xas_find_marked57%of 64
xas_free_nodes---of 24
xas_get_mark---of 6
xas_get_order---of 13
xas_init_marks75%of 4
xas_load31%of 26
xas_nomem34%of 6
xas_pause---of 16
xas_set_mark48%of 19
xas_split---of 22
xas_split_alloc---of 14
xas_start30%of 37
xas_store47%of 86
-----------
SUMMARY40%of 549

__fat_readdir---of 58
__fat_remove_entries---of 14
fat_add_entries---of 132
fat_alloc_new_dir---of 11
fat_checksum100%of 1
fat_compat_dir_ioctl---of 8
fat_compat_ioctl_filldir---of 17
fat_dir_empty---of 14
fat_dir_ioctl---of 8
fat_get_dotdot_entry---of 9
fat_get_entry58%of 28
fat_ioctl_filldir---of 17
fat_parse_long20%of 21
fat_parse_short40%of 56
fat_readdir---of 1
fat_remove_entries---of 17
fat_scan---of 10
fat_scan_logstart---of 12
fat_search_long21%of 39
fat_shortname2uni14%of 15
fat_subdirs64%of 11
fat_zeroed_cluster---of 47
lock_buffer---of 3
-----------
SUMMARY36%of 171

check_pte16%of 13
huge_pte_lock---of 7
hugetlb_walk---of 5
not_found---of 10
page_vma_mapped_walk12%of 94
swp_offset_pfn---of 3
-----------
SUMMARY13%of 107

-----------
SUMMARY---of 0

__io_wq_cpu_online---of 11
create_io_worker---of 17
create_worker_cb---of 9
create_worker_cont---of 20
io_acct_cancel_pending_work---of 19
io_init_new_worker---of 5
io_queue_worker_create---of 16
io_run_task_work---of 15
io_task_work_match---of 3
io_task_worker_match---of 1
io_worker_cancel_cb---of 6
io_worker_get---of 7
io_worker_handle_work---of 47
io_worker_ref_put---of 3
io_workqueue_create---of 3
io_wq_activate_free_worker---of 11
io_wq_cancel_cb---of 21
io_wq_cancel_tw_create---of 6
io_wq_cpu_affinity---of 13
io_wq_cpu_offline---of 1
io_wq_cpu_online---of 1
io_wq_create27%of 15
io_wq_dec_running---of 6
io_wq_enqueue---of 34
io_wq_exit_start---of 1
io_wq_for_each_worker---of 12
io_wq_hash_wake---of 17
io_wq_hash_work---of 1
io_wq_max_workers---of 19
io_wq_put_and_exit---of 34
io_wq_work_match_all---of 1
io_wq_work_match_item---of 1
io_wq_worker---of 40
io_wq_worker_affinity---of 5
io_wq_worker_cancel---of 7
io_wq_worker_running---of 5
io_wq_worker_sleeping---of 5
io_wq_worker_stopped---of 4
io_wq_worker_wake---of 3
-----------
SUMMARY27%of 15

__alloc_disk_node---of 8
__blk_alloc_disk---of 4
__blk_mark_disk_dead---of 4
__register_blkdev---of 12
blk_alloc_ext_minor---of 1
blk_free_ext_minor---of 1
blk_mark_disk_dead---of 1
blk_report_disk_dead---of 28
blk_request_module---of 9
blkdev_show---of 6
block_devnode---of 3
block_uevent---of 1
del_gendisk---of 27
device_add_disk---of 38
disk_alignment_offset_show---of 1
disk_badblocks_show---of 3
disk_badblocks_store---of 3
disk_capability_show---of 3
disk_discard_alignment_show---of 1
disk_ext_range_show---of 1
disk_has_partscan---of 3
disk_hidden_show---of 1
disk_range_show---of 1
disk_release---of 10
disk_removable_show---of 1
disk_ro_show---of 3
disk_scan_partitions---of 11
disk_seqf_next---of 3
disk_seqf_start---of 5
disk_seqf_stop---of 3
disk_uevent---of 28
disk_visible---of 4
diskseq_show---of 1
diskstats_show---of 24
inc_diskseq---of 1
invalidate_disk---of 1
part_devt---of 13
part_fail_show---of 1
part_fail_store---of 5
part_in_flight60%of 5
part_inflight_show---of 10
part_size_show---of 1
part_stat_read_all---of 5
part_stat_show---of 8
partscan_show---of 3
put_disk---of 3
set_capacity---of 1
set_capacity_and_notify---of 6
set_disk_ro---of 5
show_partition---of 18
show_partition_start---of 7
unregister_blkdev---of 5
-----------
SUMMARY60%of 5

__put_cred---of 6
abort_creds---of 5
commit_creds---of 42
copy_creds---of 47
cred_alloc_blank---of 6
cred_fscmp---of 16
exit_creds---of 10
get_task_cred---of 22
override_creds100%of 1
prepare_creds---of 34
prepare_exec_creds---of 3
prepare_kernel_cred---of 23
put_cred_rcu---of 12
revert_creds50%of 4
set_create_files_as---of 4
set_cred_ucounts---of 6
set_security_override---of 1
set_security_override_from_ctx---of 3
-----------
SUMMARY60%of 5

add_partition---of 24
bdev_add_partition---of 5
bdev_del_partition---of 4
bdev_disk_changed10%of 54
bdev_resize_partition---of 5
disk_unlock_native_capacity---of 4
drop_partition---of 4
part_alignment_offset_show---of 1
part_discard_alignment_show---of 1
part_partition_show---of 1
part_release---of 1
part_ro_show---of 4
part_start_show---of 1
part_uevent---of 4
partition_overlaps---of 17
read_part_sector---of 12
whole_disk_show---of 1
xa_insert---of 1
-----------
SUMMARY10%of 54

__bpf_trace_jbd2_checkpoint---of 1
__bpf_trace_jbd2_checkpoint_stats---of 1
__bpf_trace_jbd2_commit---of 1
__bpf_trace_jbd2_end_commit---of 1
__bpf_trace_jbd2_handle_extend---of 1
__bpf_trace_jbd2_handle_start_class---of 1
__bpf_trace_jbd2_handle_stats---of 1
__bpf_trace_jbd2_journal_shrink---of 1
__bpf_trace_jbd2_lock_buffer_stall---of 1
__bpf_trace_jbd2_run_stats---of 1
__bpf_trace_jbd2_shrink_checkpoint_list---of 1
__bpf_trace_jbd2_shrink_scan_exit---of 1
__bpf_trace_jbd2_submit_inode_data---of 1
__bpf_trace_jbd2_update_log_tail---of 1
__bpf_trace_jbd2_write_superblock---of 1
__jbd2_journal_force_commit---of 7
__jbd2_log_start_commit34%of 9
__jbd2_update_log_tail---of 19
__probestub_jbd2_checkpoint---of 1
__probestub_jbd2_checkpoint_stats---of 1
__probestub_jbd2_commit_flushing---of 1
__probestub_jbd2_commit_locking---of 1
__probestub_jbd2_commit_logging---of 1
__probestub_jbd2_drop_transaction---of 1
__probestub_jbd2_end_commit---of 1
__probestub_jbd2_handle_extend---of 1
__probestub_jbd2_handle_restart---of 1
__probestub_jbd2_handle_start---of 1
__probestub_jbd2_handle_stats---of 1
__probestub_jbd2_lock_buffer_stall---of 1
__probestub_jbd2_run_stats---of 1
__probestub_jbd2_shrink_checkpoint_list---of 1
__probestub_jbd2_shrink_count---of 1
__probestub_jbd2_shrink_scan_enter---of 1
__probestub_jbd2_shrink_scan_exit---of 1
__probestub_jbd2_start_commit---of 1
__probestub_jbd2_submit_inode_data---of 1
__probestub_jbd2_update_log_tail---of 1
__probestub_jbd2_write_superblock---of 1
__traceiter_jbd2_checkpoint---of 4
__traceiter_jbd2_checkpoint_stats---of 4
__traceiter_jbd2_commit_flushing---of 4
__traceiter_jbd2_commit_locking---of 4
__traceiter_jbd2_commit_logging---of 4
__traceiter_jbd2_drop_transaction---of 4
__traceiter_jbd2_end_commit---of 4
__traceiter_jbd2_handle_extend---of 4
__traceiter_jbd2_handle_restart---of 4
__traceiter_jbd2_handle_start---of 4
__traceiter_jbd2_handle_stats---of 4
__traceiter_jbd2_lock_buffer_stall---of 4
__traceiter_jbd2_run_stats---of 4
__traceiter_jbd2_shrink_checkpoint_list---of 4
__traceiter_jbd2_shrink_count---of 4
__traceiter_jbd2_shrink_scan_enter---of 4
__traceiter_jbd2_shrink_scan_exit---of 4
__traceiter_jbd2_start_commit---of 4
__traceiter_jbd2_submit_inode_data---of 4
__traceiter_jbd2_update_log_tail---of 4
__traceiter_jbd2_write_superblock---of 4
commit_timeout---of 1
folio_size---of 10
jbd2_alloc---of 11
jbd2_complete_transaction50%of 8
jbd2_descriptor_block_csum_set---of 9
jbd2_fc_begin_commit---of 6
jbd2_fc_end_commit---of 3
jbd2_fc_end_commit_fallback---of 5
jbd2_fc_get_buf---of 10
jbd2_fc_release_bufs---of 5
jbd2_fc_wait_bufs---of 7
jbd2_free---of 8
jbd2_journal_abort---of 8
jbd2_journal_ack_err---of 3
jbd2_journal_add_journal_head37%of 22
jbd2_journal_blocks_per_page67%of 3
jbd2_journal_bmap---of 7
jbd2_journal_check_available_features---of 4
jbd2_journal_check_used_features---of 7
jbd2_journal_clear_err---of 3
jbd2_journal_clear_features---of 5
jbd2_journal_destroy---of 33
jbd2_journal_destroy_caches---of 1
jbd2_journal_errno---of 3
jbd2_journal_flush---of 40
jbd2_journal_force_commit---of 3
jbd2_journal_force_commit_nested---of 1
jbd2_journal_get_descriptor_buffer---of 7
jbd2_journal_get_log_tail---of 5
jbd2_journal_grab_journal_head67%of 6
jbd2_journal_init_dev---of 4
jbd2_journal_init_inode---of 8
jbd2_journal_init_jbd_inode---of 1
jbd2_journal_load---of 24
jbd2_journal_next_log_block---of 10
jbd2_journal_put_journal_head28%of 18
jbd2_journal_release_jbd_inode---of 8
jbd2_journal_set_features6%of 34
jbd2_journal_shrink_count---of 15
jbd2_journal_shrink_scan---of 29
jbd2_journal_start_commit50%of 6
jbd2_journal_update_sb_errno---of 3
jbd2_journal_update_sb_log_tail---of 9
jbd2_journal_wipe---of 5
jbd2_journal_write_metadata_buffer---of 28
jbd2_log_start_commit---of 1
jbd2_log_wait_commit58%of 14
jbd2_mark_journal_empty---of 8
jbd2_seq_info_next---of 1
jbd2_seq_info_open---of 5
jbd2_seq_info_release---of 1
jbd2_seq_info_show---of 5
jbd2_seq_info_start---of 1
jbd2_seq_info_stop---of 1
jbd2_trans_will_send_data_barrier40%of 10
jbd2_transaction_committed60%of 5
jbd2_update_log_tail---of 3
jbd2_write_superblock---of 34
jbd_lock_bh_journal_head29%of 7
journal_init_common---of 58
journal_tag_bytes---of 4
kjournald2---of 18
perf_trace_jbd2_checkpoint---of 8
perf_trace_jbd2_checkpoint_stats---of 8
perf_trace_jbd2_commit---of 8
perf_trace_jbd2_end_commit---of 8
perf_trace_jbd2_handle_extend---of 8
perf_trace_jbd2_handle_start_class---of 8
perf_trace_jbd2_handle_stats---of 8
perf_trace_jbd2_journal_shrink---of 8
perf_trace_jbd2_lock_buffer_stall---of 8
perf_trace_jbd2_run_stats---of 8
perf_trace_jbd2_shrink_checkpoint_list---of 8
perf_trace_jbd2_shrink_scan_exit---of 8
perf_trace_jbd2_submit_inode_data---of 8
perf_trace_jbd2_update_log_tail---of 8
perf_trace_jbd2_write_superblock---of 8
trace_event_raw_event_jbd2_checkpoint---of 7
trace_event_raw_event_jbd2_checkpoint_stats---of 7
trace_event_raw_event_jbd2_commit---of 7
trace_event_raw_event_jbd2_end_commit---of 7
trace_event_raw_event_jbd2_handle_extend---of 7
trace_event_raw_event_jbd2_handle_start_class---of 7
trace_event_raw_event_jbd2_handle_stats---of 7
trace_event_raw_event_jbd2_journal_shrink---of 7
trace_event_raw_event_jbd2_lock_buffer_stall---of 7
trace_event_raw_event_jbd2_run_stats---of 7
trace_event_raw_event_jbd2_shrink_checkpoint_list---of 7
trace_event_raw_event_jbd2_shrink_scan_exit---of 7
trace_event_raw_event_jbd2_submit_inode_data---of 7
trace_event_raw_event_jbd2_update_log_tail---of 7
trace_event_raw_event_jbd2_write_superblock---of 7
trace_raw_output_jbd2_checkpoint---of 3
trace_raw_output_jbd2_checkpoint_stats---of 3
trace_raw_output_jbd2_commit---of 3
trace_raw_output_jbd2_end_commit---of 3
trace_raw_output_jbd2_handle_extend---of 3
trace_raw_output_jbd2_handle_start_class---of 3
trace_raw_output_jbd2_handle_stats---of 3
trace_raw_output_jbd2_journal_shrink---of 3
trace_raw_output_jbd2_lock_buffer_stall---of 3
trace_raw_output_jbd2_run_stats---of 3
trace_raw_output_jbd2_shrink_checkpoint_list---of 3
trace_raw_output_jbd2_shrink_scan_exit---of 3
trace_raw_output_jbd2_submit_inode_data---of 3
trace_raw_output_jbd2_update_log_tail---of 3
trace_raw_output_jbd2_write_superblock---of 3
-----------
SUMMARY34%of 142

children_seq_next---of 1
children_seq_open---of 1
children_seq_show---of 1
children_seq_start---of 1
children_seq_stop---of 1
do_task_stat---of 81
get_children_pid---of 15
get_pid---of 5
proc_pid_statm---of 3
proc_pid_status---of 126
proc_task_name50%of 6
proc_tgid_stat---of 1
proc_tid_stat---of 1
render_sigset_t---of 3
-----------
SUMMARY50%of 6

errseq_check67%of 3
errseq_check_and_advance50%of 4
errseq_sample100%of 1
errseq_set---of 5
-----------
SUMMARY63%of 8

-----------
SUMMARY---of 0

__iomap_dio_rw20%of 93
iomap_dio_bio_end_io---of 12
iomap_dio_bio_iter46%of 70
iomap_dio_complete28%of 36
iomap_dio_complete_work---of 1
iomap_dio_deferred_complete---of 1
iomap_dio_rw67%of 3
iomap_dio_zero39%of 13
trace_iomap_dio_rw_queued---of 15
-----------
SUMMARY32%of 215

-----------
SUMMARY---of 0

__close_range---of 45
__f_unlock_pos100%of 1
__fdget60%of 5
__fdget_pos50%of 10
__fdget_raw75%of 4
__fget_files24%of 21
__free_fdtable---of 1
__get_unused_fd_flags---of 1
__ia32_sys_dup---of 4
__ia32_sys_dup2---of 1
__ia32_sys_dup3---of 1
__put_unused_fd34%of 9
__se_sys_dup29%of 23
__x64_sys_dup50%of 4
__x64_sys_dup2100%of 1
__x64_sys_dup3100%of 1
alloc_fd41%of 22
alloc_fdtable---of 9
close_fd---of 3
do_close_on_exec---of 14
do_dup235%of 20
dup_fd---of 34
exit_files---of 3
expand_files12%of 27
f_dupfd---of 6
fd_install18%of 28
fget100%of 1
fget_raw---of 1
fget_task---of 3
file_close_fd100%of 1
file_close_fd_locked39%of 13
free_fdtable_rcu---of 1
get_close_on_exec---of 17
get_file_active---of 17
get_file_rcu---of 9
get_unused_fd_flags100%of 1
iterate_fd---of 20
ksys_dup325%of 12
lookup_fdget_rcu---of 10
put_files_struct---of 13
put_unused_fd100%of 1
rcu_read_unlock_sched---of 8
receive_fd---of 9
receive_fd_replace---of 4
replace_fd---of 6
set_close_on_exec---of 10
task_lookup_fdget_rcu---of 11
task_lookup_next_fdget_rcu---of 19
-----------
SUMMARY31%of 205

lockref_get100%of 1
lockref_get_not_dead67%of 3
lockref_get_not_zero67%of 3
lockref_mark_dead67%of 3
lockref_put_not_zero---of 3
lockref_put_or_lock---of 3
lockref_put_return100%of 1
-----------
SUMMARY73%of 11

-----------
SUMMARY---of 0

__ext4_check_dir_entry58%of 26
call_filldir34%of 12
ext4_check_all_de---of 5
ext4_dir_llseek32%of 16
ext4_htree_free_dir_info---of 5
ext4_htree_store_dirent42%of 12
ext4_readdir11%of 138
ext4_release_dir---of 6
-----------
SUMMARY22%of 204

-----------
SUMMARY---of 0

__io_uring_add_tctx_node46%of 11
__io_uring_add_tctx_node_from_submit50%of 6
__io_uring_free---of 7
io_ring_add_registered_file---of 8
io_ringfd_register---of 24
io_ringfd_unregister---of 12
io_uring_alloc_task_context38%of 8
io_uring_clean_tctx---of 6
io_uring_del_tctx_node---of 11
io_uring_unreg_ringfd---of 33
-----------
SUMMARY44%of 25

-----------
SUMMARY---of 0

minix_V1_raw_inode---of 5
minix_V2_raw_inode40%of 5
minix_count_free_blocks---of 9
minix_count_free_inodes---of 7
minix_free_block23%of 9
minix_free_inode22%of 19
minix_new_block---of 7
minix_new_inode---of 11
-----------
SUMMARY25%of 33

btrfs_add_inode_defrag30%of 17
btrfs_auto_defrag_exit---of 1
btrfs_cleanup_defrag_inodes---of 4
btrfs_defrag_file---of 134
btrfs_defrag_root---of 59
btrfs_run_defrag_inodes---of 46
defrag_collect_targets---of 44
defrag_lookup_extent---of 38
folio_lock---of 9
-----------
SUMMARY30%of 17

minix_create---of 1
minix_link---of 3
minix_lookup50%of 4
minix_mkdir---of 5
minix_mknod---of 5
minix_rename---of 34
minix_rmdir40%of 5
minix_symlink---of 6
minix_tmpfile---of 4
minix_unlink---of 4
unmap_and_put_page30%of 10
-----------
SUMMARY37%of 19

add_rules---of 13
gid_eq---of 1
gid_gt---of 1
gid_lt---of 1
ima_alloc_rule_opt_list---of 9
ima_appraise_signature---of 27
ima_check_policy---of 1
ima_delete_rules---of 6
ima_free_rule---of 5
ima_lsm_policy_change---of 18
ima_match_policy27%of 123
ima_parse_add_rule---of 322
ima_parse_appraise_algos---of 10
ima_policy_next---of 11
ima_policy_show---of 116
ima_policy_start---of 24
ima_policy_stop---of 1
ima_rule_contains_lsm_cond---of 7
ima_update_policy---of 5
ima_update_policy_flags---of 21
uid_eq---of 1
uid_gt---of 1
uid_lt---of 1
vfsgid_eq_kgid---of 1
vfsgid_gt_kgid---of 1
vfsgid_lt_kgid---of 1
vfsuid_eq_kuid---of 1
vfsuid_gt_kuid---of 1
vfsuid_lt_kuid---of 1
-----------
SUMMARY27%of 123

-----------
SUMMARY---of 0

__sg_alloc_table70%of 10
__sg_free_table---of 12
__sg_page_iter_dma_next---of 9
__sg_page_iter_next---of 9
__sg_page_iter_start---of 1
extract_iter_to_sg---of 102
sg_alloc_append_table_from_pages---of 54
sg_alloc_table---of 10
sg_alloc_table_from_pages_segment---of 4
sg_copy_buffer---of 20
sg_copy_from_buffer---of 5
sg_copy_to_buffer---of 5
sg_free_append_table---of 8
sg_free_table---of 8
sg_init_one50%of 4
sg_init_table100%of 1
sg_last---of 13
sg_miter_get_next_page---of 11
sg_miter_next---of 5
sg_miter_skip---of 5
sg_miter_start---of 3
sg_miter_stop---of 9
sg_nents---of 7
sg_nents_for_len---of 9
sg_next100%of 4
sg_pcopy_from_buffer---of 1
sg_pcopy_to_buffer---of 1
sg_zero_buffer---of 18
sgl_alloc---of 1
sgl_alloc_order---of 31
sgl_free---of 10
sgl_free_n_order---of 10
sgl_free_order---of 10
-----------
SUMMARY74%of 19

__bpf_trace_percpu_alloc_percpu---of 1
__bpf_trace_percpu_alloc_percpu_fail---of 1
__bpf_trace_percpu_create_chunk---of 1
__bpf_trace_percpu_destroy_chunk---of 1
__bpf_trace_percpu_free_percpu---of 1
__is_kernel_percpu_address38%of 8
__probestub_percpu_alloc_percpu---of 1
__probestub_percpu_alloc_percpu_fail---of 1
__probestub_percpu_create_chunk---of 1
__probestub_percpu_destroy_chunk---of 1
__probestub_percpu_free_percpu---of 1
__traceiter_percpu_alloc_percpu---of 4
__traceiter_percpu_alloc_percpu_fail---of 4
__traceiter_percpu_create_chunk---of 4
__traceiter_percpu_destroy_chunk---of 4
__traceiter_percpu_free_percpu---of 4
free_percpu---of 65
is_kernel_percpu_address---of 6
pcpu_alloc_area46%of 22
pcpu_alloc_noprof22%of 103
pcpu_alloc_size---of 10
pcpu_balance_free---of 39
pcpu_balance_workfn---of 53
pcpu_block_refresh_hint100%of 6
pcpu_block_update46%of 24
pcpu_block_update_hint_alloc54%of 26
pcpu_chunk_depopulated---of 7
pcpu_chunk_populated---of 7
pcpu_chunk_refresh_hint---of 23
pcpu_chunk_relocate20%of 15
pcpu_create_chunk---of 28
pcpu_depopulate_chunk---of 22
pcpu_dump_alloc_info---of 25
pcpu_find_block_fit56%of 9
pcpu_free_area---of 25
pcpu_get_pages---of 8
pcpu_memcg_post_alloc_hook7%of 32
pcpu_next_fit_region67%of 15
pcpu_nr_pages---of 1
pcpu_obj_full_size---of 1
pcpu_populate_chunk---of 44
per_cpu_ptr_to_phys---of 10
perf_trace_percpu_alloc_percpu---of 8
perf_trace_percpu_alloc_percpu_fail---of 8
perf_trace_percpu_create_chunk---of 8
perf_trace_percpu_destroy_chunk---of 8
perf_trace_percpu_free_percpu---of 8
trace_event_raw_event_percpu_alloc_percpu---of 7
trace_event_raw_event_percpu_alloc_percpu_fail---of 7
trace_event_raw_event_percpu_create_chunk---of 7
trace_event_raw_event_percpu_destroy_chunk---of 7
trace_event_raw_event_percpu_free_percpu---of 7
trace_percpu_create_chunk---of 15
trace_percpu_free_percpu---of 15
trace_raw_output_percpu_alloc_percpu---of 4
trace_raw_output_percpu_alloc_percpu_fail---of 3
trace_raw_output_percpu_create_chunk---of 3
trace_raw_output_percpu_destroy_chunk---of 3
trace_raw_output_percpu_free_percpu---of 3
-----------
SUMMARY34%of 260

__set_oom_adj---of 58
auxv_open---of 3
auxv_read---of 6
comm_open---of 1
comm_show40%of 5
comm_write---of 8
do_io_accounting---of 30
environ_open---of 3
environ_read---of 12
map_files_d_revalidate---of 30
map_files_get_link---of 29
mem_lseek---of 4
mem_open---of 3
mem_read---of 1
mem_release---of 4
mem_rw---of 16
mem_write---of 1
mmap_read_unlock---of 3
next_tgid---of 21
oom_adj_read---of 7
oom_adj_write---of 9
oom_score_adj_read---of 5
oom_score_adj_write---of 7
pid_delete_dentry100%of 1
pid_getattr---of 22
pid_revalidate29%of 14
pid_update_inode100%of 1
proc_attr_dir_lookup---of 1
proc_attr_dir_readdir---of 1
proc_coredump_filter_read---of 7
proc_coredump_filter_write---of 13
proc_cwd_link---of 7
proc_exe_link---of 7
proc_fail_nth_read---of 5
proc_fail_nth_write---of 6
proc_fault_inject_read---of 5
proc_fault_inject_write---of 9
proc_fd_access_allowed---of 5
proc_fill_cache---of 9
proc_flush_pid---of 1
proc_gid_map_open---of 1
proc_id_map_open---of 39
proc_id_map_release---of 5
proc_loginuid_read---of 5
proc_loginuid_write---of 22
proc_map_files_get_link---of 8
proc_map_files_instantiate---of 3
proc_map_files_lookup---of 29
proc_map_files_readdir---of 28
proc_mem_open---of 7
proc_oom_score---of 3
proc_pid_attr_open---of 3
proc_pid_attr_read---of 7
proc_pid_attr_write---of 31
proc_pid_cmdline_read---of 28
proc_pid_evict_inode---of 5
proc_pid_get_link---of 5
proc_pid_instantiate---of 4
proc_pid_ksm_merging_pages---of 3
proc_pid_ksm_stat---of 3
proc_pid_limits---of 10
proc_pid_lookup---of 26
proc_pid_make_inode50%of 4
proc_pid_permission37%of 11
proc_pid_personality---of 4
proc_pid_readdir---of 21
proc_pid_readlink---of 7
proc_pid_schedstat---of 1
proc_pid_stack---of 8
proc_pid_syscall---of 6
proc_pid_wchan---of 5
proc_pident_instantiate---of 8
proc_pident_lookup---of 11
proc_pident_readdir---of 16
proc_projid_map_open---of 1
proc_root_link---of 7
proc_sessionid_read---of 5
proc_setattr---of 4
proc_setgroups_open---of 43
proc_setgroups_release---of 5
proc_single_open---of 1
proc_single_show---of 5
proc_smack_attr_dir_iterate---of 1
proc_smack_attr_dir_lookup---of 1
proc_task_getattr40%of 5
proc_task_instantiate---of 4
proc_task_lookup---of 29
proc_task_readdir---of 58
proc_tgid_base_lookup---of 1
proc_tgid_base_readdir---of 1
proc_tgid_io_accounting---of 1
proc_tid_base_lookup---of 1
proc_tid_base_readdir---of 1
proc_tid_comm_permission---of 7
proc_tid_io_accounting---of 4
proc_timers_open---of 3
proc_uid_map_open---of 1
show_timer---of 3
task_dump_owner29%of 21
tgid_pidfd_to_pid---of 3
timens_offsets_open---of 1
timens_offsets_show---of 5
timens_offsets_write---of 27
timers_next---of 1
timers_start---of 4
timers_stop---of 7
timerslack_ns_open---of 1
timerslack_ns_show---of 28
timerslack_ns_write---of 31
-----------
SUMMARY36%of 62

s8250_options---of 1
serial8250_backup_timeout---of 13
serial8250_get_port---of 3
serial8250_interrupt---of 10
serial8250_probe---of 7
serial8250_register_8250_port---of 93
serial8250_remove---of 8
serial8250_resume---of 6
serial8250_resume_port---of 6
serial8250_set_isa_configurator---of 1
serial8250_setup_port---of 6
serial8250_suspend---of 6
serial8250_suspend_port---of 9
serial8250_timeout---of 1
serial8250_unregister_port---of 7
serial_8250_overrun_backoff_work---of 1
serial_do_unlink---of 10
univ8250_config_port---of 26
univ8250_console_exit---of 3
univ8250_console_match---of 18
univ8250_console_setup---of 16
univ8250_console_write67%of 3
univ8250_release_irq---of 7
univ8250_release_port---of 7
univ8250_request_port---of 9
univ8250_setup_irq---of 11
univ8250_setup_timer---of 6
-----------
SUMMARY67%of 3

ext4_discard_allocated_blocks---of 14
ext4_discard_preallocations30%of 51
ext4_discard_work---of 22
ext4_exit_mballoc---of 1
ext4_free_blocks29%of 107
ext4_group_add_blocks---of 23
ext4_grp_offs_to_block---of 3
ext4_has_group_desc_csum43%of 7
ext4_issue_discard---of 23
ext4_mb_add_groupinfo---of 65
ext4_mb_alloc_groupinfo---of 27
ext4_mb_complex_scan_group56%of 34
ext4_mb_discard_group_preallocations5%of 40
ext4_mb_discard_lg_preallocations---of 31
ext4_mb_discard_preallocations_should_retry30%of 37
ext4_mb_find_by_goal29%of 32
ext4_mb_find_good_group_avg_frag_lists---of 9
ext4_mb_free_metadata58%of 21
ext4_mb_generate_buddy50%of 10
ext4_mb_generate_from_pa17%of 12
ext4_mb_good_group53%of 23
ext4_mb_init---of 85
ext4_mb_init_cache25%of 94
ext4_mb_init_group20%of 30
ext4_mb_initialize_context71%of 17
ext4_mb_load_buddy_gfp12%of 76
ext4_mb_mark_bb---of 10
ext4_mb_mark_context40%of 69
ext4_mb_mark_diskspace_used43%of 14
ext4_mb_mark_free_simple55%of 11
ext4_mb_new_blocks26%of 194
ext4_mb_new_group_pa---of 25
ext4_mb_new_inode_pa34%of 45
ext4_mb_normalize_request37%of 91
ext4_mb_pa_callback---of 5
ext4_mb_pa_put_free43%of 7
ext4_mb_prefetch55%of 11
ext4_mb_prefetch_fini55%of 11
ext4_mb_regular_allocator31%of 165
ext4_mb_release---of 47
ext4_mb_release_group_pa---of 19
ext4_mb_release_inode_pa32%of 29
ext4_mb_scan_aligned---of 13
ext4_mb_seq_groups_next---of 4
ext4_mb_seq_groups_show---of 22
ext4_mb_seq_groups_start---of 4
ext4_mb_seq_groups_stop---of 1
ext4_mb_seq_structs_summary_next---of 4
ext4_mb_seq_structs_summary_show---of 9
ext4_mb_seq_structs_summary_start---of 4
ext4_mb_seq_structs_summary_stop---of 1
ext4_mb_simple_scan_group32%of 19
ext4_mb_try_best_found22%of 14
ext4_mb_unload_buddy34%of 9
ext4_mb_use_best_found50%of 10
ext4_mb_use_inode_pa39%of 13
ext4_mb_use_preallocated30%of 54
ext4_mballoc_query_range---of 23
ext4_process_freed_data---of 29
ext4_seq_mb_stats_show---of 3
ext4_trim_fs---of 68
ext4_try_merge_freed_extent25%of 8
ext4_try_to_trim_range---of 55
folio_test_uptodate---of 9
mb_find_extent44%of 30
mb_find_order_for_block55%of 11
mb_free_blocks48%of 48
mb_mark_used49%of 49
mb_regenerate_buddy---of 14
mb_set_bit---of 1
mb_set_bits---of 6
mb_set_largest_free_order31%of 13
mb_update_avg_fragment_size17%of 12
trace_ext4_mballoc_discard27%of 15
-----------
SUMMARY33%of 1543

-----------
SUMMARY---of 0

fill_name_de---of 4
ntfs3_get_parent38%of 8
ntfs_create---of 1
ntfs_d_compare---of 13
ntfs_d_hash---of 8
ntfs_link---of 9
ntfs_lookup43%of 7
ntfs_mkdir---of 1
ntfs_mknod---of 1
ntfs_rename---of 35
ntfs_rmdir---of 3
ntfs_symlink---of 3
ntfs_unlink---of 3
-----------
SUMMARY40%of 15

-----------
SUMMARY---of 0

__get_random_u32_below---of 6
__ia32_sys_getrandom---of 9
__x64_sys_getrandom---of 9
_credit_init_bits---of 11
_get_random_bytes25%of 8
add_device_randomness---of 1
add_disk_randomness---of 4
add_hwgenerator_randomness---of 12
add_input_randomness---of 3
add_interrupt_randomness---of 11
add_timer_randomness---of 9
crng_fast_key_erasure---of 1
crng_make_state46%of 31
crng_reseed---of 7
crng_set_ready---of 1
entropy_timer---of 5
execute_with_initialized_rng---of 4
extract_entropy---of 45
get_random_bytes100%of 1
get_random_bytes_user50%of 10
get_random_u16---of 31
get_random_u3239%of 31
get_random_u64---of 31
get_random_u836%of 31
mix_interrupt_randomness---of 9
mix_pool_bytes---of 1
proc_do_rointvec---of 3
proc_do_uuid---of 5
rand_initialize_disk---of 3
random_fasync---of 1
random_ioctl---of 25
random_online_cpu---of 3
random_pm_notification---of 6
random_poll50%of 6
random_prepare_cpu---of 3
random_read_iter---of 8
random_write_iter---of 1
rng_is_initialized---of 3
try_to_generate_entropy---of 23
urandom_read_iter30%of 10
wait_for_random_bytes---of 12
write_pool_user---of 7
-----------
SUMMARY40%of 128

-----------
SUMMARY---of 0

__bad_area_nosemaphore34%of 15
__bpf_trace_x86_exceptions---of 1
__probestub_page_fault_kernel---of 1
__probestub_page_fault_user---of 1
__traceiter_page_fault_kernel---of 4
__traceiter_page_fault_user---of 4
bad_area_access_error15%of 21
bad_area_nosemaphore100%of 1
do_kern_addr_fault60%of 5
do_sigbus40%of 5
dump_pagetable---of 10
fatal_signal_pending67%of 3
fault_in_kernel_space---of 1
is_errata93---of 8
is_prefetch8%of 25
kernelmode_fixup_or_oops50%of 6
page_fault_oops---of 33
perf_trace_x86_exceptions---of 8
pgtable_bad---of 1
show_ldttss---of 5
spurious_kernel_fault6%of 38
spurious_kernel_fault_check---of 11
trace_event_raw_event_x86_exceptions---of 7
trace_raw_output_x86_exceptions---of 3
vma_end_read28%of 11
-----------
SUMMARY20%of 130

bpf_bprm_opts_set---of 3
bpf_get_attach_cookie---of 1
bpf_ima_file_hash---of 1
bpf_ima_inode_hash---of 1
bpf_ima_inode_hash_allowed---of 1
bpf_lsm_audit_rule_free---of 1
bpf_lsm_audit_rule_init---of 1
bpf_lsm_audit_rule_known---of 1
bpf_lsm_audit_rule_match---of 1
bpf_lsm_binder_set_context_mgr---of 1
bpf_lsm_binder_transaction---of 1
bpf_lsm_binder_transfer_binder---of 1
bpf_lsm_binder_transfer_file---of 1
bpf_lsm_bpf---of 1
bpf_lsm_bpf_map---of 1
bpf_lsm_bpf_map_create---of 1
bpf_lsm_bpf_map_free---of 1
bpf_lsm_bpf_prog---of 1
bpf_lsm_bpf_prog_free---of 1
bpf_lsm_bpf_prog_load---of 1
bpf_lsm_bpf_token_capable---of 1
bpf_lsm_bpf_token_cmd---of 1
bpf_lsm_bpf_token_create---of 1
bpf_lsm_bpf_token_free---of 1
bpf_lsm_bprm_check_security---of 1
bpf_lsm_bprm_committed_creds---of 1
bpf_lsm_bprm_committing_creds---of 1
bpf_lsm_bprm_creds_for_exec---of 1
bpf_lsm_bprm_creds_from_file---of 1
bpf_lsm_capable100%of 1
bpf_lsm_capget---of 1
bpf_lsm_capset---of 1
bpf_lsm_cred_alloc_blank---of 1
bpf_lsm_cred_free---of 1
bpf_lsm_cred_getsecid---of 1
bpf_lsm_cred_prepare---of 1
bpf_lsm_cred_transfer---of 1
bpf_lsm_current_getsecid_subj100%of 1
bpf_lsm_d_instantiate100%of 1
bpf_lsm_dentry_create_files_as---of 1
bpf_lsm_dentry_init_security---of 1
bpf_lsm_file_alloc_security100%of 1
bpf_lsm_file_fcntl100%of 1
bpf_lsm_file_free_security100%of 1
bpf_lsm_file_ioctl100%of 1
bpf_lsm_file_ioctl_compat---of 1
bpf_lsm_file_lock100%of 1
bpf_lsm_file_mprotect---of 1
bpf_lsm_file_open100%of 1
bpf_lsm_file_permission100%of 1
bpf_lsm_file_post_open100%of 1
bpf_lsm_file_receive---of 1
bpf_lsm_file_release100%of 1
bpf_lsm_file_send_sigiotask---of 1
bpf_lsm_file_set_fowner100%of 1
bpf_lsm_file_truncate100%of 1
bpf_lsm_find_cgroup_shim---of 5
bpf_lsm_fs_context_dup---of 1
bpf_lsm_fs_context_parse_param100%of 1
bpf_lsm_fs_context_submount---of 1
bpf_lsm_func_proto---of 21
bpf_lsm_getprocattr---of 1
bpf_lsm_getselfattr---of 1
bpf_lsm_ib_alloc_security---of 1
bpf_lsm_ib_endport_manage_subnet---of 1
bpf_lsm_ib_free_security---of 1
bpf_lsm_ib_pkey_access---of 1
bpf_lsm_inet_conn_established---of 1
bpf_lsm_inet_conn_request---of 1
bpf_lsm_inet_csk_clone---of 1
bpf_lsm_inode_alloc_security100%of 1
bpf_lsm_inode_copy_up---of 1
bpf_lsm_inode_copy_up_xattr---of 1
bpf_lsm_inode_create100%of 1
bpf_lsm_inode_follow_link100%of 1
bpf_lsm_inode_free_security100%of 1
bpf_lsm_inode_get_acl---of 1
bpf_lsm_inode_getattr100%of 1
bpf_lsm_inode_getsecctx---of 1
bpf_lsm_inode_getsecid---of 1
bpf_lsm_inode_getsecurity---of 1
bpf_lsm_inode_getxattr---of 1
bpf_lsm_inode_init_security100%of 1
bpf_lsm_inode_init_security_anon100%of 1
bpf_lsm_inode_invalidate_secctx---of 1
bpf_lsm_inode_killpriv---of 1
bpf_lsm_inode_link100%of 1
bpf_lsm_inode_listsecurity---of 1
bpf_lsm_inode_listxattr---of 1
bpf_lsm_inode_mkdir100%of 1
bpf_lsm_inode_mknod---of 1
bpf_lsm_inode_need_killpriv100%of 1
bpf_lsm_inode_notifysecctx---of 1
bpf_lsm_inode_permission100%of 1
bpf_lsm_inode_post_create_tmpfile---of 1
bpf_lsm_inode_post_remove_acl---of 1
bpf_lsm_inode_post_removexattr---of 1
bpf_lsm_inode_post_set_acl---of 1
bpf_lsm_inode_post_setattr100%of 1
bpf_lsm_inode_post_setxattr---of 1
bpf_lsm_inode_readlink100%of 1
bpf_lsm_inode_remove_acl---of 1
bpf_lsm_inode_removexattr---of 1
bpf_lsm_inode_rename100%of 1
bpf_lsm_inode_rmdir100%of 1
bpf_lsm_inode_set_acl---of 1
bpf_lsm_inode_setattr100%of 1
bpf_lsm_inode_setsecctx---of 1
bpf_lsm_inode_setsecurity---of 1
bpf_lsm_inode_setxattr---of 1
bpf_lsm_inode_symlink100%of 1
bpf_lsm_inode_unlink100%of 1
bpf_lsm_ipc_getsecid---of 1
bpf_lsm_ipc_permission---of 1
bpf_lsm_is_sleepable_hook---of 1
bpf_lsm_is_trusted---of 1
bpf_lsm_ismaclabel---of 1
bpf_lsm_kernel_act_as---of 1
bpf_lsm_kernel_create_files_as---of 1
bpf_lsm_kernel_load_data---of 1
bpf_lsm_kernel_module_request---of 1
bpf_lsm_kernel_post_load_data---of 1
bpf_lsm_kernel_post_read_file---of 1
bpf_lsm_kernel_read_file---of 1
bpf_lsm_kernfs_init_security---of 1
bpf_lsm_key_alloc---of 1
bpf_lsm_key_free---of 1
bpf_lsm_key_getsecurity---of 1
bpf_lsm_key_permission---of 1
bpf_lsm_key_post_create_or_update---of 1
bpf_lsm_locked_down---of 1
bpf_lsm_mmap_addr100%of 1
bpf_lsm_mmap_file100%of 1
bpf_lsm_move_mount100%of 1
bpf_lsm_mptcp_add_subflow---of 1
bpf_lsm_msg_msg_alloc_security---of 1
bpf_lsm_msg_msg_free_security---of 1
bpf_lsm_msg_queue_alloc_security---of 1
bpf_lsm_msg_queue_associate---of 1
bpf_lsm_msg_queue_free_security---of 1
bpf_lsm_msg_queue_msgctl---of 1
bpf_lsm_msg_queue_msgrcv---of 1
bpf_lsm_msg_queue_msgsnd---of 1
bpf_lsm_netlink_send---of 1
bpf_lsm_path_chmod100%of 1
bpf_lsm_path_chown100%of 1
bpf_lsm_path_chroot100%of 1
bpf_lsm_path_link100%of 1
bpf_lsm_path_mkdir100%of 1
bpf_lsm_path_mknod100%of 1
bpf_lsm_path_notify---of 1
bpf_lsm_path_post_mknod100%of 1
bpf_lsm_path_rename100%of 1
bpf_lsm_path_rmdir100%of 1
bpf_lsm_path_symlink100%of 1
bpf_lsm_path_truncate100%of 1
bpf_lsm_path_unlink100%of 1
bpf_lsm_perf_event_alloc---of 1
bpf_lsm_perf_event_free---of 1
bpf_lsm_perf_event_open---of 1
bpf_lsm_perf_event_read---of 1
bpf_lsm_perf_event_write---of 1
bpf_lsm_post_notification---of 1
bpf_lsm_ptrace_access_check---of 1
bpf_lsm_ptrace_traceme---of 1
bpf_lsm_quota_on---of 1
bpf_lsm_quotactl100%of 1
bpf_lsm_release_secctx---of 1
bpf_lsm_req_classify_flow---of 1
bpf_lsm_sb_alloc_security100%of 1
bpf_lsm_sb_clone_mnt_opts---of 1
bpf_lsm_sb_delete---of 1
bpf_lsm_sb_eat_lsm_opts100%of 1
bpf_lsm_sb_free_mnt_opts---of 1
bpf_lsm_sb_free_security---of 1
bpf_lsm_sb_kern_mount100%of 1
bpf_lsm_sb_mnt_opts_compat---of 1
bpf_lsm_sb_mount100%of 1
bpf_lsm_sb_pivotroot100%of 1
bpf_lsm_sb_remount---of 1
bpf_lsm_sb_set_mnt_opts100%of 1
bpf_lsm_sb_show_options---of 1
bpf_lsm_sb_statfs100%of 1
bpf_lsm_sb_umount---of 1
bpf_lsm_sctp_assoc_established---of 1
bpf_lsm_sctp_assoc_request---of 1
bpf_lsm_sctp_bind_connect---of 1
bpf_lsm_sctp_sk_clone---of 1
bpf_lsm_secctx_to_secid---of 1
bpf_lsm_secid_to_secctx---of 1
bpf_lsm_secmark_refcount_dec---of 1
bpf_lsm_secmark_refcount_inc---of 1
bpf_lsm_secmark_relabel_packet---of 1
bpf_lsm_sem_alloc_security---of 1
bpf_lsm_sem_associate---of 1
bpf_lsm_sem_free_security---of 1
bpf_lsm_sem_semctl---of 1
bpf_lsm_sem_semop---of 1
bpf_lsm_setprocattr---of 1
bpf_lsm_setselfattr---of 1
bpf_lsm_settime---of 1
bpf_lsm_shm_alloc_security---of 1
bpf_lsm_shm_associate---of 1
bpf_lsm_shm_free_security---of 1
bpf_lsm_shm_shmat---of 1
bpf_lsm_shm_shmctl---of 1
bpf_lsm_sk_alloc_security---of 1
bpf_lsm_sk_clone_security---of 1
bpf_lsm_sk_free_security---of 1
bpf_lsm_sk_getsecid---of 1
bpf_lsm_sock_graft---of 1
bpf_lsm_socket_accept---of 1
bpf_lsm_socket_bind---of 1
bpf_lsm_socket_connect---of 1
bpf_lsm_socket_create---of 1
bpf_lsm_socket_getpeername---of 1
bpf_lsm_socket_getpeersec_dgram---of 1
bpf_lsm_socket_getpeersec_stream---of 1
bpf_lsm_socket_getsockname---of 1
bpf_lsm_socket_getsockopt---of 1
bpf_lsm_socket_listen---of 1
bpf_lsm_socket_post_create---of 1
bpf_lsm_socket_recvmsg---of 1
bpf_lsm_socket_sendmsg---of 1
bpf_lsm_socket_setsockopt---of 1
bpf_lsm_socket_shutdown---of 1
bpf_lsm_socket_sock_rcv_skb---of 1
bpf_lsm_socket_socketpair---of 1
bpf_lsm_syslog---of 1
bpf_lsm_task_alloc---of 1
bpf_lsm_task_fix_setgid---of 1
bpf_lsm_task_fix_setgroups---of 1
bpf_lsm_task_fix_setuid---of 1
bpf_lsm_task_free---of 1
bpf_lsm_task_getioprio---of 1
bpf_lsm_task_getpgid---of 1
bpf_lsm_task_getscheduler---of 1
bpf_lsm_task_getsecid_obj---of 1
bpf_lsm_task_getsid---of 1
bpf_lsm_task_kill---of 1
bpf_lsm_task_movememory---of 1
bpf_lsm_task_prctl---of 1
bpf_lsm_task_prlimit---of 1
bpf_lsm_task_setioprio---of 1
bpf_lsm_task_setnice---of 1
bpf_lsm_task_setpgid---of 1
bpf_lsm_task_setrlimit---of 1
bpf_lsm_task_setscheduler---of 1
bpf_lsm_task_to_inode100%of 1
bpf_lsm_tun_dev_alloc_security---of 1
bpf_lsm_tun_dev_attach---of 1
bpf_lsm_tun_dev_attach_queue---of 1
bpf_lsm_tun_dev_create---of 1
bpf_lsm_tun_dev_free_security---of 1
bpf_lsm_tun_dev_open---of 1
bpf_lsm_unix_may_send---of 1
bpf_lsm_unix_stream_connect---of 1
bpf_lsm_uring_cmd---of 1
bpf_lsm_uring_override_creds---of 1
bpf_lsm_uring_sqpoll---of 1
bpf_lsm_userns_create---of 1
bpf_lsm_verify_prog---of 4
bpf_lsm_vm_enough_memory100%of 1
bpf_lsm_watch_key---of 1
bpf_lsm_xfrm_decode_session---of 1
bpf_lsm_xfrm_policy_alloc_security---of 1
bpf_lsm_xfrm_policy_clone_security---of 1
bpf_lsm_xfrm_policy_delete_security---of 1
bpf_lsm_xfrm_policy_free_security---of 1
bpf_lsm_xfrm_policy_lookup---of 1
bpf_lsm_xfrm_state_alloc---of 1
bpf_lsm_xfrm_state_alloc_acquire---of 1
bpf_lsm_xfrm_state_delete_security---of 1
bpf_lsm_xfrm_state_free_security---of 1
bpf_lsm_xfrm_state_pol_flow_match---of 1
btf_id_cmp_func---of 1
-----------
SUMMARY100%of 58

-----------
SUMMARY---of 0

blk_integrity_compare---of 13
blk_integrity_merge_bio20%of 10
blk_integrity_merge_rq---of 16
blk_integrity_nop_complete---of 1
blk_integrity_nop_fn---of 1
blk_integrity_nop_prepare---of 1
blk_integrity_register---of 6
blk_integrity_unregister---of 3
blk_rq_count_integrity_sg---of 17
blk_rq_map_integrity_sg---of 18
device_is_integrity_capable_show---of 1
format_show---of 4
protection_interval_bytes_show---of 4
read_verify_show---of 1
read_verify_store---of 1
tag_size_show---of 1
write_generate_show---of 1
write_generate_store---of 1
-----------
SUMMARY20%of 10

_atomic_dec_and_lock58%of 7
_atomic_dec_and_lock_irqsave29%of 7
_atomic_dec_and_raw_lock---of 7
_atomic_dec_and_raw_lock_irqsave---of 7
-----------
SUMMARY43%of 14

-----------
SUMMARY---of 0

__ep_eventpoll_poll37%of 30
__ep_remove---of 31
__ia32_compat_sys_epoll_pwait---of 4
__ia32_compat_sys_epoll_pwait2---of 5
__ia32_sys_epoll_create---of 3
__ia32_sys_epoll_create1---of 1
__ia32_sys_epoll_ctl---of 4
__ia32_sys_epoll_pwait---of 4
__ia32_sys_epoll_pwait2---of 5
__ia32_sys_epoll_wait---of 4
__x64_compat_sys_epoll_pwait---of 4
__x64_compat_sys_epoll_pwait2---of 5
__x64_sys_epoll_create67%of 3
__x64_sys_epoll_create1100%of 1
__x64_sys_epoll_ctl50%of 4
__x64_sys_epoll_pwait50%of 4
__x64_sys_epoll_pwait240%of 5
__x64_sys_epoll_wait50%of 4
do_compat_epoll_pwait---of 7
do_epoll_create34%of 9
do_epoll_ctl15%of 74
do_epoll_pwait43%of 7
do_epoll_wait27%of 101
ep_autoremove_wake_function67%of 3
ep_busy_loop_end---of 7
ep_clear_and_put---of 14
ep_destroy_wakeup_source---of 7
ep_done_scan25%of 20
ep_eventpoll_ioctl---of 14
ep_eventpoll_poll100%of 1
ep_eventpoll_release---of 3
ep_insert24%of 96
ep_loop_check_proc---of 10
ep_modify---of 36
ep_pm_stay_awake_rcu28%of 18
ep_poll_callback24%of 30
ep_ptable_queue_proc---of 5
ep_remove_safe---of 3
ep_show_fdinfo---of 5
ep_unregister_pollwait---of 16
epoll_mutex_lock---of 3
eventpoll_release_file---of 7
get_epoll_tfile_raw_ptr---of 8
reverse_path_check_proc---of 10
-----------
SUMMARY27%of 410

__bpf_trace_balance_dirty_pages---of 1
__bpf_trace_bdi_dirty_ratelimit---of 1
__bpf_trace_flush_foreign---of 1
__bpf_trace_global_dirty_state---of 1
__bpf_trace_inode_foreign_history---of 1
__bpf_trace_inode_switch_wbs---of 1
__bpf_trace_track_foreign_dirty---of 1
__bpf_trace_wbc_class---of 1
__bpf_trace_writeback_bdi_register---of 1
__bpf_trace_writeback_class---of 1
__bpf_trace_writeback_dirty_inode_template---of 1
__bpf_trace_writeback_folio_template---of 1
__bpf_trace_writeback_inode_template---of 1
__bpf_trace_writeback_pages_written---of 1
__bpf_trace_writeback_queue_io---of 1
__bpf_trace_writeback_sb_inodes_requeue---of 1
__bpf_trace_writeback_single_inode_template---of 1
__bpf_trace_writeback_work_class---of 1
__bpf_trace_writeback_write_inode_template---of 1
__inode_attach_wb10%of 32
__mark_inode_dirty40%of 82
__probestub_balance_dirty_pages---of 1
__probestub_bdi_dirty_ratelimit---of 1
__probestub_flush_foreign---of 1
__probestub_folio_wait_writeback---of 1
__probestub_global_dirty_state---of 1
__probestub_inode_foreign_history---of 1
__probestub_inode_switch_wbs---of 1
__probestub_sb_clear_inode_writeback---of 1
__probestub_sb_mark_inode_writeback---of 1
__probestub_track_foreign_dirty---of 1
__probestub_wbc_writepage---of 1
__probestub_writeback_bdi_register---of 1
__probestub_writeback_dirty_folio---of 1
__probestub_writeback_dirty_inode---of 1
__probestub_writeback_dirty_inode_enqueue---of 1
__probestub_writeback_dirty_inode_start---of 1
__probestub_writeback_exec---of 1
__probestub_writeback_lazytime---of 1
__probestub_writeback_lazytime_iput---of 1
__probestub_writeback_mark_inode_dirty---of 1
__probestub_writeback_pages_written---of 1
__probestub_writeback_queue---of 1
__probestub_writeback_queue_io---of 1
__probestub_writeback_sb_inodes_requeue---of 1
__probestub_writeback_single_inode---of 1
__probestub_writeback_single_inode_start---of 1
__probestub_writeback_start---of 1
__probestub_writeback_wait---of 1
__probestub_writeback_wake_background---of 1
__probestub_writeback_write_inode---of 1
__probestub_writeback_write_inode_start---of 1
__probestub_writeback_written---of 1
__traceiter_balance_dirty_pages---of 4
__traceiter_bdi_dirty_ratelimit---of 4
__traceiter_flush_foreign---of 4
__traceiter_folio_wait_writeback---of 4
__traceiter_global_dirty_state---of 4
__traceiter_inode_foreign_history---of 4
__traceiter_inode_switch_wbs---of 4
__traceiter_sb_clear_inode_writeback---of 4
__traceiter_sb_mark_inode_writeback---of 4
__traceiter_track_foreign_dirty---of 4
__traceiter_wbc_writepage---of 4
__traceiter_writeback_bdi_register---of 4
__traceiter_writeback_dirty_folio---of 4
__traceiter_writeback_dirty_inode---of 4
__traceiter_writeback_dirty_inode_enqueue---of 4
__traceiter_writeback_dirty_inode_start---of 4
__traceiter_writeback_exec---of 4
__traceiter_writeback_lazytime---of 4
__traceiter_writeback_lazytime_iput---of 4
__traceiter_writeback_mark_inode_dirty---of 4
__traceiter_writeback_pages_written---of 4
__traceiter_writeback_queue---of 4
__traceiter_writeback_queue_io---of 4
__traceiter_writeback_sb_inodes_requeue---of 4
__traceiter_writeback_single_inode---of 4
__traceiter_writeback_single_inode_start---of 4
__traceiter_writeback_start---of 4
__traceiter_writeback_wait---of 4
__traceiter_writeback_wake_background---of 4
__traceiter_writeback_write_inode---of 4
__traceiter_writeback_write_inode_start---of 4
__traceiter_writeback_written---of 4
__wakeup_flusher_threads_bdi40%of 10
__writeback_inodes_sb_nr50%of 4
__writeback_inodes_wb---of 9
__writeback_single_inode20%of 87
bdi_split_work_to_wbs27%of 41
cgroup_writeback_by_id---of 25
cgroup_writeback_umount---of 3
cleanup_offline_cgwb---of 22
dirtytime_interval_handler---of 3
folio_memcg---of 12
inode_cgwb_move_to_attached28%of 11
inode_io_list_del67%of 3
inode_io_list_move_locked47%of 13
inode_prepare_wbs_switch---of 11
inode_switch_wbs---of 28
inode_switch_wbs_work_fn---of 77
inode_wait_for_writeback50%of 4
locked_inode_to_wb_and_lock_list30%of 10
move_expired_inodes---of 22
percpu_ref_put_many---of 14
percpu_ref_tryget---of 16
perf_trace_balance_dirty_pages---of 11
perf_trace_bdi_dirty_ratelimit---of 9
perf_trace_flush_foreign---of 9
perf_trace_global_dirty_state---of 8
perf_trace_inode_foreign_history---of 11
perf_trace_inode_switch_wbs---of 9
perf_trace_track_foreign_dirty---of 13
perf_trace_wbc_class---of 11
perf_trace_writeback_bdi_register---of 9
perf_trace_writeback_class---of 9
perf_trace_writeback_dirty_inode_template---of 9
perf_trace_writeback_folio_template---of 14
perf_trace_writeback_inode_template---of 8
perf_trace_writeback_pages_written---of 8
perf_trace_writeback_queue_io---of 9
perf_trace_writeback_sb_inodes_requeue---of 14
perf_trace_writeback_single_inode_template---of 11
perf_trace_writeback_work_class---of 11
perf_trace_writeback_write_inode_template---of 11
queue_io---of 24
redirty_tail_locked29%of 7
sb_clear_inode_writeback---of 19
sb_mark_inode_writeback27%of 19
sync_inode_metadata100%of 1
sync_inodes_sb41%of 37
trace_event_raw_event_balance_dirty_pages---of 10
trace_event_raw_event_bdi_dirty_ratelimit---of 8
trace_event_raw_event_flush_foreign---of 8
trace_event_raw_event_global_dirty_state---of 7
trace_event_raw_event_inode_foreign_history---of 10
trace_event_raw_event_inode_switch_wbs---of 8
trace_event_raw_event_track_foreign_dirty---of 12
trace_event_raw_event_wbc_class---of 10
trace_event_raw_event_writeback_bdi_register---of 8
trace_event_raw_event_writeback_class---of 8
trace_event_raw_event_writeback_dirty_inode_template---of 8
trace_event_raw_event_writeback_folio_template---of 13
trace_event_raw_event_writeback_inode_template---of 7
trace_event_raw_event_writeback_pages_written---of 7
trace_event_raw_event_writeback_queue_io---of 8
trace_event_raw_event_writeback_sb_inodes_requeue---of 13
trace_event_raw_event_writeback_single_inode_template---of 10
trace_event_raw_event_writeback_work_class---of 10
trace_event_raw_event_writeback_write_inode_template---of 10
trace_raw_output_balance_dirty_pages---of 3
trace_raw_output_bdi_dirty_ratelimit---of 3
trace_raw_output_flush_foreign---of 3
trace_raw_output_global_dirty_state---of 3
trace_raw_output_inode_foreign_history---of 3
trace_raw_output_inode_switch_wbs---of 3
trace_raw_output_track_foreign_dirty---of 3
trace_raw_output_wbc_class---of 3
trace_raw_output_writeback_bdi_register---of 3
trace_raw_output_writeback_class---of 3
trace_raw_output_writeback_dirty_inode_template---of 3
trace_raw_output_writeback_folio_template---of 3
trace_raw_output_writeback_inode_template---of 3
trace_raw_output_writeback_pages_written---of 3
trace_raw_output_writeback_queue_io---of 3
trace_raw_output_writeback_sb_inodes_requeue---of 3
trace_raw_output_writeback_single_inode_template---of 3
trace_raw_output_writeback_work_class---of 3
trace_raw_output_writeback_write_inode_template---of 3
trace_writeback_pages_written---of 15
try_to_writeback_inodes_sb---of 3
wakeup_dirtytime_writeback---of 20
wakeup_flusher_threads38%of 16
wakeup_flusher_threads_bdi---of 11
wb_get15%of 14
wb_io_lists_depopulated29%of 7
wb_put---of 4
wb_queue_work25%of 24
wb_start_background_writeback---of 17
wb_wait_for_completion60%of 5
wb_wakeup_delayed67%of 3
wb_workfn---of 48
wb_writeback---of 62
wbc_account_cgroup_owner14%of 15
wbc_attach_and_unlock_inode15%of 14
wbc_detach_inode8%of 28
write_inode_now---of 3
writeback_inodes_sb100%of 1
writeback_inodes_sb_nr---of 1
writeback_inodes_wb---of 3
writeback_sb_inodes---of 52
writeback_single_inode32%of 22
xas_next_marked---of 16
-----------
SUMMARY29%of 510

llist_add_batch50%of 4
llist_del_first---of 5
llist_del_first_this---of 5
llist_reverse_order---of 4
-----------
SUMMARY50%of 4

_udf_err---of 1
_udf_warn---of 1
identify_vsd---of 18
init_once---of 1
lvid_get_unique_id---of 5
udf_alloc_inode67%of 3
udf_check_anchor_block---of 10
udf_close_lvid---of 13
udf_compute_nr_groups---of 1
udf_fill_partdesc_info---of 32
udf_fill_super---of 64
udf_find_metadata_inode_efe---of 4
udf_free_fc---of 1
udf_free_in_core_inode---of 1
udf_get_tree---of 1
udf_init_fs_context---of 4
udf_load_logicalvolint---of 19
udf_load_vat---of 23
udf_load_vrs---of 52
udf_open_lvid---of 5
udf_parse_param---of 44
udf_process_sequence---of 154
udf_put_super---of 5
udf_reconfigure---of 7
udf_sb_free_partitions---of 23
udf_sb_lvidiu67%of 3
udf_show_options---of 35
udf_statfs22%of 32
udf_sync_fs100%of 3
-----------
SUMMARY35%of 41

-----------
SUMMARY---of 0

tomoyo_addprintf---of 1
tomoyo_check_profile---of 9
tomoyo_close_control---of 4
tomoyo_flush---of 19
tomoyo_init_policy_namespace---of 4
tomoyo_io_printf---of 5
tomoyo_open_control---of 24
tomoyo_parse_policy---of 10
tomoyo_poll_control---of 3
tomoyo_poll_query---of 5
tomoyo_print_name_union---of 9
tomoyo_print_number_union---of 3
tomoyo_print_number_union_nospace---of 7
tomoyo_profile100%of 1
tomoyo_read_control---of 20
tomoyo_read_domain---of 50
tomoyo_read_domain2---of 200
tomoyo_read_exception---of 125
tomoyo_read_manager---of 24
tomoyo_read_pid---of 22
tomoyo_read_profile---of 51
tomoyo_read_query---of 15
tomoyo_read_stat---of 19
tomoyo_read_version---of 3
tomoyo_same_manager---of 1
tomoyo_same_task_acl---of 1
tomoyo_set_group---of 9
tomoyo_set_slash---of 3
tomoyo_set_space---of 3
tomoyo_set_string---of 3
tomoyo_supervisor10%of 54
tomoyo_update_stat---of 5
tomoyo_write_answer---of 10
tomoyo_write_control---of 70
tomoyo_write_domain---of 28
tomoyo_write_domain2---of 7
tomoyo_write_exception---of 15
tomoyo_write_manager---of 8
tomoyo_write_pid---of 1
tomoyo_write_profile---of 46
tomoyo_write_stat---of 8
tomoyo_write_task---of 5
-----------
SUMMARY11%of 55

change_mnt_propagation20%of 30
get_dominating_id---of 10
propagate_mnt18%of 39
propagate_mount_busy---of 32
propagate_mount_unlock---of 23
propagate_one12%of 18
propagate_umount---of 78
propagation_would_overmount---of 8
umount_one---of 9
-----------
SUMMARY18%of 87

-----------
SUMMARY---of 0

__rq_qos_cleanup---of 5
__rq_qos_done60%of 5
__rq_qos_done_bio60%of 5
__rq_qos_issue60%of 5
__rq_qos_merge60%of 5
__rq_qos_queue_depth_changed---of 5
__rq_qos_requeue---of 5
__rq_qos_throttle60%of 5
__rq_qos_track60%of 5
rq_depth_calc_max_depth---of 6
rq_depth_scale_down---of 7
rq_depth_scale_up---of 7
rq_qos_add---of 10
rq_qos_del---of 9
rq_qos_exit---of 4
rq_qos_wait19%of 11
rq_qos_wake_function---of 4
rq_wait_inc_below40%of 5
-----------
SUMMARY48%of 46

-----------
SUMMARY---of 0

tomoyo_bprm_check_security---of 4
tomoyo_bprm_committed_creds---of 1
tomoyo_cred_prepare---of 4
tomoyo_domain50%of 4
tomoyo_file_fcntl34%of 6
tomoyo_file_ioctl100%of 1
tomoyo_file_open40%of 5
tomoyo_file_truncate100%of 1
tomoyo_inode_getattr100%of 1
tomoyo_path_chmod100%of 1
tomoyo_path_chown84%of 6
tomoyo_path_chroot100%of 1
tomoyo_path_link100%of 1
tomoyo_path_mkdir100%of 1
tomoyo_path_mknod50%of 4
tomoyo_path_rename50%of 4
tomoyo_path_rmdir100%of 1
tomoyo_path_symlink100%of 1
tomoyo_path_truncate100%of 1
tomoyo_path_unlink100%of 1
tomoyo_sb_mount100%of 1
tomoyo_sb_pivotroot100%of 1
tomoyo_sb_umount---of 1
tomoyo_socket_bind---of 1
tomoyo_socket_connect---of 1
tomoyo_socket_listen---of 1
tomoyo_socket_sendmsg---of 1
tomoyo_task_alloc---of 1
tomoyo_task_free---of 5
-----------
SUMMARY67%of 42

__alloc_dummy_extent_buffer---of 23
__alloc_extent_buffer67%of 3
__extent_writepage_io19%of 49
__process_pages_contig47%of 26
__unlock_for_delalloc---of 4
__write_extent_buffer14%of 23
_compound_head---of 7
alloc_dummy_extent_buffer---of 1
alloc_eb_folio_array28%of 18
alloc_extent_buffer17%of 145
assert_eb_folio_uptodate---of 16
attach_extent_buffer_folio18%of 23
bio_first_folio---of 18
bio_next_folio---of 10
btree_clear_folio_dirty---of 23
btree_write_cache_pages19%of 82
btrfs_alloc_folio_array---of 12
btrfs_alloc_page_array---of 7
btrfs_clear_buffer_dirty14%of 36
btrfs_clone_extent_buffer---of 26
btrfs_do_readpage---of 66
btrfs_read_folio---of 1
btrfs_readahead---of 68
btrfs_readahead_node_child---of 1
btrfs_readahead_tree_block---of 5
btrfs_release_extent_buffer_pages56%of 9
btrfs_release_extent_buffer_rcu---of 1
btrfs_writepages24%of 146
check_buffer_tree_ref67%of 6
clear_extent_buffer_reading---of 1
clear_extent_buffer_uptodate---of 23
clear_page_extent_mapped---of 17
copy_extent_buffer---of 17
copy_extent_buffer_full45%of 9
detach_extent_buffer_folio16%of 51
emit_fiemap_extent---of 14
end_bbio_data_read---of 75
end_bbio_data_write---of 22
end_bbio_meta_read---of 11
end_bbio_meta_write---of 11
end_page_read---of 35
extent_buffer_bitmap_clear---of 4
extent_buffer_bitmap_set---of 4
extent_buffer_free_cachep---of 1
extent_buffer_get_byte---of 7
extent_buffer_test_bit---of 7
extent_clear_unlock_delalloc100%of 1
extent_fiemap---of 77
extent_invalidate_folio---of 4
extent_range_clear_dirty_for_io---of 5
extent_write_locked_range---of 57
fiemap_process_hole---of 26
find_extent_buffer43%of 19
find_extent_buffer_nolock28%of 22
find_lock_delalloc_range34%of 18
folio_attach_private25%of 8
folio_detach_private24%of 17
folio_index---of 16
folio_lock34%of 9
folio_put---of 4
folio_size30%of 10
free_extent_buffer44%of 16
free_extent_buffer_stale43%of 7
lock_delalloc_pages31%of 33
lock_extent_buffer_for_io34%of 6
memcmp_extent_buffer25%of 12
memcpy_extent_buffer15%of 14
memmove_extent_buffer22%of 14
memset_extent_buffer23%of 9
memzero_extent_buffer50%of 4
put_page---of 14
read_extent_buffer22%of 14
read_extent_buffer_pages---of 28
read_extent_buffer_to_user_nofault---of 14
release_extent_buffer50%of 10
set_extent_buffer_dirty29%of 39
set_extent_buffer_uptodate28%of 22
set_folio_extent_mapped28%of 11
set_page_extent_mapped---of 7
submit_extent_page42%of 31
submit_one_bio23%of 9
try_release_extent_buffer---of 62
try_release_extent_mapping---of 16
wait_on_extent_buffer_writeback---of 3
write_extent_buffer100%of 1
write_one_eb30%of 40
writepage_delalloc46%of 11
-----------
SUMMARY26%of 1033

-----------
SUMMARY---of 0

accumulate_nsecs_to_secs---of 9
change_clocksource---of 13
delta_to_ns_safe---of 3
do_adjtimex---of 41
do_settimeofday64---of 19
do_timer---of 1
dummy_clock_read---of 3
get_device_system_crosststamp---of 33
getboottime64---of 1
ktime_get50%of 8
ktime_get_boot_fast_ns---of 1
ktime_get_coarse_real_ts6450%of 8
ktime_get_coarse_ts64---of 8
ktime_get_coarse_with_offset---of 12
ktime_get_fast_timestamps---of 9
ktime_get_mono_fast_ns---of 8
ktime_get_raw---of 6
ktime_get_raw_fast_ns---of 8
ktime_get_raw_ts64---of 8
ktime_get_real_fast_ns---of 8
ktime_get_real_seconds100%of 1
ktime_get_real_ts6440%of 10
ktime_get_resolution_ns---of 10
ktime_get_seconds67%of 3
ktime_get_snapshot---of 18
ktime_get_tai_fast_ns---of 1
ktime_get_ts6450%of 10
ktime_get_update_offsets_now---of 10
ktime_get_with_offset---of 10
ktime_mono_to_any---of 8
pvclock_gtod_register_notifier---of 1
pvclock_gtod_unregister_notifier---of 1
random_get_entropy_fallback---of 3
seqcount_lockdep_reader_access58%of 7
timekeeping_advance---of 53
timekeeping_forward_now---of 12
timekeeping_get_ns36%of 14
timekeeping_inject_offset---of 14
timekeeping_max_deferment---of 6
timekeeping_notify---of 3
timekeeping_resume---of 12
timekeeping_suspend---of 12
timekeeping_update---of 11
timekeeping_valid_for_hres---of 6
timekeeping_warp_clock---of 3
tk_set_wall_to_mono---of 3
tk_setup_internals---of 10
tk_xtime_add---of 12
update_wall_time---of 3
-----------
SUMMARY48%of 61

-----------
SUMMARY---of 0

__bio_crypt_advance---of 8
__bio_crypt_clone---of 3
__bio_crypt_free_ctx---of 1
__blk_crypto_bio_prep---of 17
__blk_crypto_free_request---of 3
__blk_crypto_rq_bio_prep---of 4
__blk_crypto_rq_get_keyslot---of 1
__blk_crypto_rq_put_keyslot---of 1
bio_crypt_ctx_mergeable20%of 10
bio_crypt_dun_increment---of 6
bio_crypt_dun_is_contiguous---of 8
bio_crypt_rq_ctx_compatible67%of 3
bio_crypt_set_ctx---of 3
blk_crypto_config_supported---of 1
blk_crypto_config_supported_natively---of 1
blk_crypto_evict_key---of 6
blk_crypto_init_key---of 5
blk_crypto_start_using_key---of 3
-----------
SUMMARY31%of 13

-----------
SUMMARY---of 0

ovl_change_flags---of 10
ovl_copy_file_range---of 1
ovl_copyfile---of 27
ovl_fadvise50%of 10
ovl_fallocate---of 11
ovl_file_accessed---of 6
ovl_file_modified---of 1
ovl_flush---of 12
ovl_fsync---of 9
ovl_llseek---of 14
ovl_mmap---of 3
ovl_open---of 8
ovl_open_realfile---of 8
ovl_read_iter---of 13
ovl_real_fdget_meta40%of 10
ovl_release---of 1
ovl_remap_file_range---of 6
ovl_splice_read---of 12
ovl_splice_write---of 12
ovl_write_iter---of 15
-----------
SUMMARY45%of 20

-----------
SUMMARY---of 0

__clear_extent_bit26%of 66
__set_extent_bit21%of 125
alloc_extent_state25%of 16
btrfs_find_delalloc_range41%of 22
cache_state---of 7
clear_record_extent_bits67%of 3
clear_state_bit42%of 12
convert_extent_bit---of 97
count_range_bits32%of 38
extent_io_tree_init100%of 1
extent_io_tree_panic---of 3
extent_io_tree_release---of 8
extent_io_tree_to_fs_info---of 3
extent_io_tree_to_inode---of 3
extent_io_tree_to_inode_const---of 3
extent_state_free_cachep---of 1
find_contiguous_extent_bit---of 21
find_first_clear_extent_bit---of 27
find_first_extent_bit---of 28
free_extent_state30%of 20
insert_state38%of 29
insert_state_fast---of 7
lock_extent7%of 31
merge_state50%of 12
set_extent_bit100%of 1
set_record_extent_bits---of 3
set_state_bits---of 7
split_state---of 9
test_range_bit14%of 23
test_range_bit_exists---of 17
try_lock_extent---of 4
-----------
SUMMARY27%of 399

__scsi_internal_device_block_nowait---of 12
__scsi_queue_insert---of 12
device_quiesce_fn---of 1
device_resume_fn---of 5
device_unblock---of 1
scsi_alloc_request---of 3
scsi_alloc_sgtables24%of 26
scsi_block_requests---of 1
scsi_block_targets---of 3
scsi_build_sense---of 1
scsi_cleanup_rq---of 10
scsi_commit_rqs---of 1
scsi_complete---of 16
scsi_dec_host_busy---of 17
scsi_device_block---of 6
scsi_device_from_queue---of 5
scsi_device_quiesce---of 11
scsi_device_resume---of 5
scsi_device_set_state---of 45
scsi_device_state_check---of 15
scsi_device_unbusy---of 10
scsi_done---of 1
scsi_done_direct---of 1
scsi_done_internal---of 23
scsi_end_request---of 40
scsi_evt_thread---of 33
scsi_execute_cmd---of 50
scsi_exit_queue---of 1
scsi_failures_reset_retries---of 4
scsi_free_sgtables---of 6
scsi_host_block---of 7
scsi_host_unblock---of 5
scsi_init_command50%of 4
scsi_init_hctx---of 1
scsi_init_limits---of 11
scsi_init_sense_cache---of 3
scsi_internal_device_block_nowait---of 4
scsi_internal_device_unblock---of 12
scsi_internal_device_unblock_nowait---of 12
scsi_io_completion---of 17
scsi_io_completion_action---of 86
scsi_io_completion_nz_result---of 53
scsi_kick_sdev_queue---of 3
scsi_kmap_atomic_sg---of 11
scsi_kunmap_atomic_sg---of 3
scsi_map_queues---of 3
scsi_mode_select---of 10
scsi_mode_sense---of 25
scsi_mq_exit_request---of 3
scsi_mq_free_tags---of 1
scsi_mq_get_budget12%of 17
scsi_mq_get_rq_budget_token---of 1
scsi_mq_init_request---of 7
scsi_mq_lld_busy---of 6
scsi_mq_poll---of 3
scsi_mq_put_budget---of 8
scsi_mq_requeue_cmd---of 13
scsi_mq_set_rq_budget_token100%of 1
scsi_mq_setup_tags---of 5
scsi_queue_insert---of 1
scsi_queue_rq17%of 130
scsi_requeue_run_queue---of 1
scsi_run_host_queues---of 4
scsi_run_queue---of 26
scsi_run_queue_async---of 8
scsi_start_queue---of 3
scsi_target_quiesce---of 1
scsi_target_resume---of 1
scsi_target_unblock---of 3
scsi_test_unit_ready---of 10
scsi_unblock_requests---of 4
scsi_vpd_lun_id---of 58
scsi_vpd_tpg_id---of 29
sdev_disable_disk_events---of 1
sdev_enable_disk_events---of 3
sdev_evt_alloc---of 6
sdev_evt_send---of 3
sdev_evt_send_simple---of 7
target_block---of 3
target_unblock---of 3
-----------
SUMMARY18%of 178